Merge tag 'driver-core-3.11-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/driver-core Pull driver core updates from Greg KH: "Here's the big driver core merge for 3.11-rc1 Lots of little things, and larger firmware subsystem updates, all described in the shortlog. Nice thing here is that we finally get rid of CONFIG_HOTPLUG, after 10+ years, thanks to Stephen Rothwell (it had been always on for a number of kernel releases, now it's just removed)" * tag 'driver-core-3.11-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/driver-core: (27 commits) driver core: device.h: fix doc compilation warnings firmware loader: fix another compile warning with PM_SLEEP unset build some drivers only when compile-testing firmware loader: fix compile warning with PM_SLEEP set kobject: sanitize argument for format string sysfs_notify is only possible on file attributes firmware loader: simplify holding module for request_firmware firmware loader: don't export cache_firmware and uncache_firmware drivers/base: Use attribute groups to create sysfs memory files firmware loader: fix compile warning firmware loader: fix build failure with !CONFIG_FW_LOADER_USER_HELPER Documentation: Updated broken link in HOWTO Finally eradicate CONFIG_HOTPLUG driver core: firmware loader: kill FW_ACTION_NOHOTPLUG requests before suspend driver core: firmware loader: don't cache FW_ACTION_NOHOTPLUG firmware Documentation: Tidy up some drivers/base/core.c kerneldoc content. platform_device: use a macro instead of platform_driver_register firmware: move EXPORT_SYMBOL annotations firmware: Avoid deadlock of usermodehelper lock at shutdown dell_rbu: Select CONFIG_FW_LOADER_USER_HELPER explicitly ...

commit: fc76a258d41eea7953bb763397c3d1e589d3bb98 [log] [tgz]
author: Linus Torvalds <torvalds@linux-foundation.org> Tue Jul 02 11:44:19 2013 -0700
committer: Linus Torvalds <torvalds@linux-foundation.org> Tue Jul 02 11:44:19 2013 -0700
tree: ef8c4b828f3011adf9fc9ef976b9d07c35628e81
parent: fe3c22bd5cadd8e36977b218b27fbea821381ec8 [diff]
parent: bfd63cd24df69120585c22e09fda78723772ee2a [diff]
diff --git a/Documentation/00-INDEX b/Documentation/00-INDEX
index 45b3df9..0c4cc688 100644
--- a/Documentation/00-INDEX
+++ b/Documentation/00-INDEX

@@ -187,6 +187,8 @@
 	- request_firmware() hotplug interface info.
 flexible-arrays.txt
 	- how to make use of flexible sized arrays in linux
+fmc/
+	- information about the FMC bus abstraction
 frv/
 	- Fujitsu FR-V Linux documentation.
 futex-requeue-pi.txt

diff --git a/Documentation/ABI/testing/configfs-usb-gadget b/Documentation/ABI/testing/configfs-usb-gadget
new file mode 100644
index 0000000..01e769d
--- /dev/null
+++ b/Documentation/ABI/testing/configfs-usb-gadget

@@ -0,0 +1,81 @@
+What:		/config/usb-gadget
+Date:		Jun 2013
+KernelVersion:	3.11
+Description:
+		This group contains sub-groups corresponding to created
+		USB gadgets.
+
+What:		/config/usb-gadget/gadget
+Date:		Jun 2013
+KernelVersion:	3.11
+Description:
+
+		The attributes of a gadget:
+
+		UDC		- bind a gadget to UDC/unbind a gadget;
+				write UDC's name found in /sys/class/udc/*
+				to bind a gadget, empty string "" to unbind.
+
+		bDeviceClass	- USB device class code
+		bDeviceSubClass	- USB device subclass code
+		bDeviceProtocol	- USB device protocol code
+		bMaxPacketSize0	- maximum endpoint 0 packet size
+		bcdDevice	- bcd device release number
+		bcdUSB		- bcd USB specification version number
+		idProduct	- product ID
+		idVendor	- vendor ID
+
+What:		/config/usb-gadget/gadget/configs
+Date:		Jun 2013
+KernelVersion:	3.11
+Description:
+		This group contains a USB gadget's configurations
+
+What:		/config/usb-gadget/gadget/configs/config
+Date:		Jun 2013
+KernelVersion:	3.11
+Description:
+		The attributes of a configuration:
+
+		bmAttributes	- configuration characteristics
+		MaxPower	- maximum power consumption from the bus
+
+What:		/config/usb-gadget/gadget/configs/config/strings
+Date:		Jun 2013
+KernelVersion:	3.11
+Description:
+		This group contains subdirectories for language-specific
+		strings for this configuration.
+
+What:		/config/usb-gadget/gadget/configs/config/strings/language
+Date:		Jun 2013
+KernelVersion:	3.11
+Description:
+		The attributes:
+
+		configuration	- configuration description
+
+
+What:		/config/usb-gadget/gadget/functions
+Date:		Jun 2013
+KernelVersion:	3.11
+Description:
+		This group contains functions available to this USB gadget.
+
+What:		/config/usb-gadget/gadget/strings
+Date:		Jun 2013
+KernelVersion:	3.11
+Description:
+		This group contains subdirectories for language-specific
+		strings for this gadget.
+
+What:		/config/usb-gadget/gadget/strings/language
+Date:		Jun 2013
+KernelVersion:	3.11
+Description:
+		The attributes:
+
+		serialnumber	- gadget's serial number (string)
+		product		- gadget's product description
+		manufacturer	- gadget's manufacturer description
+

diff --git a/Documentation/ABI/testing/configfs-usb-gadget-acm b/Documentation/ABI/testing/configfs-usb-gadget-acm
new file mode 100644
index 0000000..5708a56
--- /dev/null
+++ b/Documentation/ABI/testing/configfs-usb-gadget-acm

@@ -0,0 +1,8 @@
+What:		/config/usb-gadget/gadget/functions/acm.name
+Date:		Jun 2013
+KernelVersion:	3.11
+Description:
+
+		This item contains just one readonly attribute: port_num.
+		It contains the port number of the /dev/ttyGS<n> device
+		associated with acm function's instance "name".

diff --git a/Documentation/ABI/testing/configfs-usb-gadget-ecm b/Documentation/ABI/testing/configfs-usb-gadget-ecm
new file mode 100644
index 0000000..6b9a582
--- /dev/null
+++ b/Documentation/ABI/testing/configfs-usb-gadget-ecm

@@ -0,0 +1,16 @@
+What:		/config/usb-gadget/gadget/functions/ecm.name
+Date:		Jun 2013
+KernelVersion:	3.11
+Description:
+		The attributes:
+
+		ifname		- network device interface name associated with
+				this function instance
+		qmult		- queue length multiplier for high and
+				super speed
+		host_addr	- MAC address of host's end of this
+				Ethernet over USB link
+		dev_addr	- MAC address of device's end of this
+				Ethernet over USB link
+
+

diff --git a/Documentation/ABI/testing/configfs-usb-gadget-eem b/Documentation/ABI/testing/configfs-usb-gadget-eem
new file mode 100644
index 0000000..dbddf36
--- /dev/null
+++ b/Documentation/ABI/testing/configfs-usb-gadget-eem

@@ -0,0 +1,14 @@
+What:		/config/usb-gadget/gadget/functions/eem.name
+Date:		Jun 2013
+KernelVersion:	3.11
+Description:
+		The attributes:
+
+		ifname		- network device interface name associated with
+				this function instance
+		qmult		- queue length multiplier for high and
+				super speed
+		host_addr	- MAC address of host's end of this
+				Ethernet over USB link
+		dev_addr	- MAC address of device's end of this
+				Ethernet over USB link

diff --git a/Documentation/ABI/testing/configfs-usb-gadget-ncm b/Documentation/ABI/testing/configfs-usb-gadget-ncm
new file mode 100644
index 0000000..bc309f423
--- /dev/null
+++ b/Documentation/ABI/testing/configfs-usb-gadget-ncm

@@ -0,0 +1,15 @@
+What:		/config/usb-gadget/gadget/functions/ncm.name
+Date:		Jun 2013
+KernelVersion:	3.11
+Description:
+		The attributes:
+
+		ifname		- network device interface name associated with
+				this function instance
+		qmult		- queue length multiplier for high and
+				super speed
+		host_addr	- MAC address of host's end of this
+				Ethernet over USB link
+		dev_addr	- MAC address of device's end of this
+				Ethernet over USB link
+

diff --git a/Documentation/ABI/testing/configfs-usb-gadget-obex b/Documentation/ABI/testing/configfs-usb-gadget-obex
new file mode 100644
index 0000000..aaa5c96
--- /dev/null
+++ b/Documentation/ABI/testing/configfs-usb-gadget-obex

@@ -0,0 +1,9 @@
+What:		/config/usb-gadget/gadget/functions/obex.name
+Date:		Jun 2013
+KernelVersion:	3.11
+Description:
+
+		This item contains just one readonly attribute: port_num.
+		It contains the port number of the /dev/ttyGS<n> device
+		associated with obex function's instance "name".
+

diff --git a/Documentation/ABI/testing/configfs-usb-gadget-phonet b/Documentation/ABI/testing/configfs-usb-gadget-phonet
new file mode 100644
index 0000000..3e3b742
--- /dev/null
+++ b/Documentation/ABI/testing/configfs-usb-gadget-phonet

@@ -0,0 +1,8 @@
+What:		/config/usb-gadget/gadget/functions/phonet.name
+Date:		Jun 2013
+KernelVersion:	3.11
+Description:
+
+		This item contains just one readonly attribute: ifname.
+		It contains the network interface name assigned during
+		network device registration.

diff --git a/Documentation/ABI/testing/configfs-usb-gadget-rndis b/Documentation/ABI/testing/configfs-usb-gadget-rndis
new file mode 100644
index 0000000..822e6da
--- /dev/null
+++ b/Documentation/ABI/testing/configfs-usb-gadget-rndis

@@ -0,0 +1,14 @@
+What:		/config/usb-gadget/gadget/functions/rndis.name
+Date:		Jun 2013
+KernelVersion:	3.11
+Description:
+		The attributes:
+
+		ifname		- network device interface name associated with
+				this function instance
+		qmult		- queue length multiplier for high and
+				super speed
+		host_addr	- MAC address of host's end of this
+				Ethernet over USB link
+		dev_addr	- MAC address of device's end of this
+				Ethernet over USB link

diff --git a/Documentation/ABI/testing/configfs-usb-gadget-serial b/Documentation/ABI/testing/configfs-usb-gadget-serial
new file mode 100644
index 0000000..16f130c1
--- /dev/null
+++ b/Documentation/ABI/testing/configfs-usb-gadget-serial

@@ -0,0 +1,9 @@
+What:		/config/usb-gadget/gadget/functions/gser.name
+Date:		Jun 2013
+KernelVersion:	3.11
+Description:
+
+		This item contains just one readonly attribute: port_num.
+		It contains the port number of the /dev/ttyGS<n> device
+		associated with gser function's instance "name".
+

diff --git a/Documentation/ABI/testing/configfs-usb-gadget-subset b/Documentation/ABI/testing/configfs-usb-gadget-subset
new file mode 100644
index 0000000..154ae59
--- /dev/null
+++ b/Documentation/ABI/testing/configfs-usb-gadget-subset

@@ -0,0 +1,14 @@
+What:		/config/usb-gadget/gadget/functions/geth.name
+Date:		Jun 2013
+KernelVersion:	3.11
+Description:
+		The attributes:
+
+		ifname		- network device interface name associated with
+				this function instance
+		qmult		- queue length multiplier for high and
+				super speed
+		host_addr	- MAC address of host's end of this
+				Ethernet over USB link
+		dev_addr	- MAC address of device's end of this
+				Ethernet over USB link

diff --git a/Documentation/ABI/testing/sysfs-bus-iio b/Documentation/ABI/testing/sysfs-bus-iio
index 2e33dc6..dda81ff 100644
--- a/Documentation/ABI/testing/sysfs-bus-iio
+++ b/Documentation/ABI/testing/sysfs-bus-iio

@@ -690,45 +690,45 @@
 		Actually start the buffer capture up.  Will start trigger
 		if first device and appropriate.
 
-What:		/sys/bus/iio/devices/iio:deviceX/buffer/scan_elements
+What:		/sys/bus/iio/devices/iio:deviceX/scan_elements
 KernelVersion:	2.6.37
 Contact:	linux-iio@vger.kernel.org
 Description:
 		Directory containing interfaces for elements that will be
 		captured for a single triggered sample set in the buffer.
 
-What:		/sys/.../buffer/scan_elements/in_accel_x_en
-What:		/sys/.../buffer/scan_elements/in_accel_y_en
-What:		/sys/.../buffer/scan_elements/in_accel_z_en
-What:		/sys/.../buffer/scan_elements/in_anglvel_x_en
-What:		/sys/.../buffer/scan_elements/in_anglvel_y_en
-What:		/sys/.../buffer/scan_elements/in_anglvel_z_en
-What:		/sys/.../buffer/scan_elements/in_magn_x_en
-What:		/sys/.../buffer/scan_elements/in_magn_y_en
-What:		/sys/.../buffer/scan_elements/in_magn_z_en
-What:		/sys/.../buffer/scan_elements/in_timestamp_en
-What:		/sys/.../buffer/scan_elements/in_voltageY_supply_en
-What:		/sys/.../buffer/scan_elements/in_voltageY_en
-What:		/sys/.../buffer/scan_elements/in_voltageY-voltageZ_en
-What:		/sys/.../buffer/scan_elements/in_incli_x_en
-What:		/sys/.../buffer/scan_elements/in_incli_y_en
-What:		/sys/.../buffer/scan_elements/in_pressureY_en
-What:		/sys/.../buffer/scan_elements/in_pressure_en
+What:		/sys/.../iio:deviceX/scan_elements/in_accel_x_en
+What:		/sys/.../iio:deviceX/scan_elements/in_accel_y_en
+What:		/sys/.../iio:deviceX/scan_elements/in_accel_z_en
+What:		/sys/.../iio:deviceX/scan_elements/in_anglvel_x_en
+What:		/sys/.../iio:deviceX/scan_elements/in_anglvel_y_en
+What:		/sys/.../iio:deviceX/scan_elements/in_anglvel_z_en
+What:		/sys/.../iio:deviceX/scan_elements/in_magn_x_en
+What:		/sys/.../iio:deviceX/scan_elements/in_magn_y_en
+What:		/sys/.../iio:deviceX/scan_elements/in_magn_z_en
+What:		/sys/.../iio:deviceX/scan_elements/in_timestamp_en
+What:		/sys/.../iio:deviceX/scan_elements/in_voltageY_supply_en
+What:		/sys/.../iio:deviceX/scan_elements/in_voltageY_en
+What:		/sys/.../iio:deviceX/scan_elements/in_voltageY-voltageZ_en
+What:		/sys/.../iio:deviceX/scan_elements/in_incli_x_en
+What:		/sys/.../iio:deviceX/scan_elements/in_incli_y_en
+What:		/sys/.../iio:deviceX/scan_elements/in_pressureY_en
+What:		/sys/.../iio:deviceX/scan_elements/in_pressure_en
 KernelVersion:	2.6.37
 Contact:	linux-iio@vger.kernel.org
 Description:
 		Scan element control for triggered data capture.
 
-What:		/sys/.../buffer/scan_elements/in_accel_type
-What:		/sys/.../buffer/scan_elements/in_anglvel_type
-What:		/sys/.../buffer/scan_elements/in_magn_type
-What:		/sys/.../buffer/scan_elements/in_incli_type
-What:		/sys/.../buffer/scan_elements/in_voltageY_type
-What:		/sys/.../buffer/scan_elements/in_voltage_type
-What:		/sys/.../buffer/scan_elements/in_voltageY_supply_type
-What:		/sys/.../buffer/scan_elements/in_timestamp_type
-What:		/sys/.../buffer/scan_elements/in_pressureY_type
-What:		/sys/.../buffer/scan_elements/in_pressure_type
+What:		/sys/.../iio:deviceX/scan_elements/in_accel_type
+What:		/sys/.../iio:deviceX/scan_elements/in_anglvel_type
+What:		/sys/.../iio:deviceX/scan_elements/in_magn_type
+What:		/sys/.../iio:deviceX/scan_elements/in_incli_type
+What:		/sys/.../iio:deviceX/scan_elements/in_voltageY_type
+What:		/sys/.../iio:deviceX/scan_elements/in_voltage_type
+What:		/sys/.../iio:deviceX/scan_elements/in_voltageY_supply_type
+What:		/sys/.../iio:deviceX/scan_elements/in_timestamp_type
+What:		/sys/.../iio:deviceX/scan_elements/in_pressureY_type
+What:		/sys/.../iio:deviceX/scan_elements/in_pressure_type
 KernelVersion:	2.6.37
 Contact:	linux-iio@vger.kernel.org
 Description:
@@ -752,29 +752,29 @@
 		For other storage combinations this attribute will be extended
 		appropriately.
 
-What:		/sys/.../buffer/scan_elements/in_accel_type_available
+What:		/sys/.../iio:deviceX/scan_elements/in_accel_type_available
 KernelVersion:	2.6.37
 Contact:	linux-iio@vger.kernel.org
 Description:
 		If the type parameter can take one of a small set of values,
 		this attribute lists them.
 
-What:		/sys/.../buffer/scan_elements/in_voltageY_index
-What:		/sys/.../buffer/scan_elements/in_voltageY_supply_index
-What:		/sys/.../buffer/scan_elements/in_accel_x_index
-What:		/sys/.../buffer/scan_elements/in_accel_y_index
-What:		/sys/.../buffer/scan_elements/in_accel_z_index
-What:		/sys/.../buffer/scan_elements/in_anglvel_x_index
-What:		/sys/.../buffer/scan_elements/in_anglvel_y_index
-What:		/sys/.../buffer/scan_elements/in_anglvel_z_index
-What:		/sys/.../buffer/scan_elements/in_magn_x_index
-What:		/sys/.../buffer/scan_elements/in_magn_y_index
-What:		/sys/.../buffer/scan_elements/in_magn_z_index
-What:		/sys/.../buffer/scan_elements/in_incli_x_index
-What:		/sys/.../buffer/scan_elements/in_incli_y_index
-What:		/sys/.../buffer/scan_elements/in_timestamp_index
-What:		/sys/.../buffer/scan_elements/in_pressureY_index
-What:		/sys/.../buffer/scan_elements/in_pressure_index
+What:		/sys/.../iio:deviceX/scan_elements/in_voltageY_index
+What:		/sys/.../iio:deviceX/scan_elements/in_voltageY_supply_index
+What:		/sys/.../iio:deviceX/scan_elements/in_accel_x_index
+What:		/sys/.../iio:deviceX/scan_elements/in_accel_y_index
+What:		/sys/.../iio:deviceX/scan_elements/in_accel_z_index
+What:		/sys/.../iio:deviceX/scan_elements/in_anglvel_x_index
+What:		/sys/.../iio:deviceX/scan_elements/in_anglvel_y_index
+What:		/sys/.../iio:deviceX/scan_elements/in_anglvel_z_index
+What:		/sys/.../iio:deviceX/scan_elements/in_magn_x_index
+What:		/sys/.../iio:deviceX/scan_elements/in_magn_y_index
+What:		/sys/.../iio:deviceX/scan_elements/in_magn_z_index
+What:		/sys/.../iio:deviceX/scan_elements/in_incli_x_index
+What:		/sys/.../iio:deviceX/scan_elements/in_incli_y_index
+What:		/sys/.../iio:deviceX/scan_elements/in_timestamp_index
+What:		/sys/.../iio:deviceX/scan_elements/in_pressureY_index
+What:		/sys/.../iio:deviceX/scan_elements/in_pressure_index
 KernelVersion:	2.6.37
 Contact:	linux-iio@vger.kernel.org
 Description:

diff --git a/Documentation/ABI/testing/sysfs-bus-usb b/Documentation/ABI/testing/sysfs-bus-usb
index f093e59..9759b8c 100644
--- a/Documentation/ABI/testing/sysfs-bus-usb
+++ b/Documentation/ABI/testing/sysfs-bus-usb

@@ -236,3 +236,30 @@
 		This attribute is to expose these information to user space.
 		The file will read "hotplug", "wired" and "not used" if the
 		information is available, and "unknown" otherwise.
+
+What:		/sys/bus/usb/devices/.../power/usb2_lpm_l1_timeout
+Date:		May 2013
+Contact:	Mathias Nyman <mathias.nyman@linux.intel.com>
+Description:
+		USB 2.0 devices may support hardware link power management (LPM)
+		L1 sleep state. The usb2_lpm_l1_timeout attribute allows
+		tuning the timeout for the L1 inactivity timer (LPM timer), i.e. the
+		inactivity time needed before the host requests the device to go to L1 sleep.
+		Useful for power management tuning.
+		Supported values are 0 - 65535 microseconds.
+
+What:		/sys/bus/usb/devices/.../power/usb2_lpm_besl
+Date:		May 2013
+Contact:	Mathias Nyman <mathias.nyman@linux.intel.com>
+Description:
+		USB 2.0 devices that support hardware link power management (LPM)
+		L1 sleep state now use a best effort service latency value (BESL) to
+		indicate the best effort to resumption of service to the device after the
+		initiation of the resume event.
+		If the device does not have a preferred besl value then the host can select
+		one instead. This usb2_lpm_besl attribute allows tuning the host-selected besl
+		value in order to tune power saving and service latency.
+
+		Supported values are 0 - 15.
+		More information on how besl values map to microseconds can be found in
+		USB 2.0 ECN Errata for Link Power Management, section 4.10.

diff --git a/Documentation/ABI/testing/sysfs-class-uwb_rc-wusbhc b/Documentation/ABI/testing/sysfs-class-uwb_rc-wusbhc
index 25b1e75..5977e28 100644
--- a/Documentation/ABI/testing/sysfs-class-uwb_rc-wusbhc
+++ b/Documentation/ABI/testing/sysfs-class-uwb_rc-wusbhc

@@ -36,3 +36,22 @@
 
                 Refer to [ECMA-368] section 10.3.1.1 for the value to
                 use.
+
+What:           /sys/class/uwb_rc/uwbN/wusbhc/wusb_dnts
+Date:           June 2013
+KernelVersion:  3.11
+Contact:        Thomas Pugliese <thomas.pugliese@gmail.com>
+Description:
+                The device notification time slot (DNTS) count and interval in
+                milliseconds that the WUSB host should use.  This controls how
+                often the devices will have the opportunity to send
+                notifications to the host.
+
+What:           /sys/class/uwb_rc/uwbN/wusbhc/wusb_retry_count
+Date:           June 2013
+KernelVersion:  3.11
+Contact:        Thomas Pugliese <thomas.pugliese@gmail.com>
+Description:
+                The number of retries that the WUSB host should attempt
+                before reporting an error for a bus transaction.  The range of
+                valid values is [0..15], where 0 indicates infinite retries.

diff --git a/Documentation/console/console.txt b/Documentation/console/console.txt
index 926cf1b..f93810d 100644
--- a/Documentation/console/console.txt
+++ b/Documentation/console/console.txt

@@ -12,20 +12,20 @@
 any time with each driver sharing the console with other drivers including
 the system driver. However, modular drivers cannot take over the console
 that is currently occupied by another modular driver. (Exception: Drivers that
-call take_over_console() will succeed in the takeover regardless of the type
+call do_take_over_console() will succeed in the takeover regardless of the type
 of driver occupying the consoles.) They can only take over the console that is
 occupied by the system driver. By the same token, if the modular driver is
 released by the console, the system driver will take over.
 
 Modular drivers, from the programmer's point of view, have to call:
 
-	 take_over_console() - load and bind driver to console layer
-	 give_up_console() - unbind and unload driver
+	 do_take_over_console() - load and bind driver to console layer
+	 give_up_console() - unload driver, it will only work if driver is fully unbound
 
 In newer kernels, the following are also available:
 
-	 register_con_driver()
-	 unregister_con_driver()
+	 do_register_con_driver()
+	 do_unregister_con_driver()
 
 If sysfs is enabled, the contents of /sys/class/vtconsole can be
 examined. This shows the console backends currently registered by the
@@ -94,12 +94,12 @@
 Notes for developers:
 =====================
 
-take_over_console() is now broken up into:
+do_take_over_console() is now broken up into:
 
-     register_con_driver()
-     bind_con_driver() - private function
+     do_register_con_driver()
+     do_bind_con_driver() - private function
 
-give_up_console() is a wrapper to unregister_con_driver(), and a driver must
+give_up_console() is a wrapper to do_unregister_con_driver(), and a driver must
 be fully unbound for this call to succeed. con_is_bound() will check if the
 driver is bound or not.
 
@@ -109,10 +109,10 @@
 In order for binding to and unbinding from the console to properly work,
 console drivers must follow these guidelines:
 
-1. All drivers, except system drivers, must call either register_con_driver()
-   or take_over_console(). register_con_driver() will just add the driver to
+1. All drivers, except system drivers, must call either do_register_con_driver()
+   or do_take_over_console(). do_register_con_driver() will just add the driver to
    the console's internal list. It won't take over the
-   console. take_over_console(), as it name implies, will also take over (or
+   console. do_take_over_console(), as its name implies, will also take over (or
    bind to) the console.
 
 2. All resources allocated during con->con_init() must be released in
@@ -128,10 +128,10 @@
    rebind the driver to the console arrives.
 
 4. Upon exit of the driver, ensure that the driver is totally unbound. If the
-   condition is satisfied, then the driver must call unregister_con_driver()
+   condition is satisfied, then the driver must call do_unregister_con_driver()
    or give_up_console().
 
-5. unregister_con_driver() can also be called on conditions which make it
+5. do_unregister_con_driver() can also be called on conditions which make it
    impossible for the driver to service console requests.  This can happen
    with the framebuffer console that suddenly lost all of its drivers.
 

diff --git a/Documentation/devicetree/bindings/ata/atmel-at91_cf.txt b/Documentation/devicetree/bindings/ata/atmel-at91_cf.txt
new file mode 100644
index 0000000..c1d22b3
--- /dev/null
+++ b/Documentation/devicetree/bindings/ata/atmel-at91_cf.txt

@@ -0,0 +1,19 @@
+Atmel AT91RM9200 CompactFlash
+
+Required properties:
+- compatible : "atmel,at91rm9200-cf".
+- reg : should specify localbus address and size used.
+- gpios : specifies the gpio pins to control the CF device. Detect
+  and reset gpio's are mandatory while irq and vcc gpio's are
+  optional and may be set to 0 if not present.
+
+Example:
+compact-flash@50000000 {
+	compatible = "atmel,at91rm9200-cf";
+	reg = <0x50000000 0x30000000>;
+	gpios = <&pioC 13 0	/* irq */
+		 &pioC 15 0 	/* detect */
+		 0		/* vcc */
+		 &pioC  5 0	/* reset */
+		>;
+};

diff --git a/Documentation/devicetree/bindings/extcon/extcon-twl.txt b/Documentation/devicetree/bindings/extcon/extcon-twl.txt
new file mode 100644
index 0000000..58f531a
--- /dev/null
+++ b/Documentation/devicetree/bindings/extcon/extcon-twl.txt

@@ -0,0 +1,15 @@
+EXTCON FOR TWL CHIPS
+
+PALMAS USB COMPARATOR
+Required Properties:
+ - compatible : Should be "ti,palmas-usb" or "ti,twl6035-usb"
+ - vbus-supply : phandle to the regulator device tree node.
+
+Optional Properties:
+ - ti,wakeup : To enable the wakeup comparator in probe
+
+palmas-usb {
+       compatible = "ti,twl6035-usb", "ti,palmas-usb";
+       vbus-supply = <&smps10_reg>;
+       ti,wakeup;
+};

diff --git a/Documentation/devicetree/bindings/iio/dac/ad7303.txt b/Documentation/devicetree/bindings/iio/dac/ad7303.txt
new file mode 100644
index 0000000..914610f
--- /dev/null
+++ b/Documentation/devicetree/bindings/iio/dac/ad7303.txt

@@ -0,0 +1,23 @@
+Analog Devices AD7303 DAC device driver
+
+Required properties:
+	- compatible: Must be "adi,ad7303"
+	- reg: SPI chip select number for the device
+	- spi-max-frequency: Max SPI frequency to use (< 30000000)
+	- Vdd-supply: Phandle to the Vdd power supply
+
+Optional properties:
+	- REF-supply: Phandle to the external reference voltage supply. This should
+	  only be set if there is an external reference voltage connected to the REF
+	  pin. If the property is not set Vdd/2 is used as the reference voltage.
+
+Example:
+
+		ad7303@4 {
+			compatible = "adi,ad7303";
+			reg = <4>;
+			spi-max-frequency = <10000000>;
+			Vdd-supply = <&vdd_supply>;
+			adi,use-external-reference;
+			REF-supply = <&vref_supply>;
+		};

diff --git a/Documentation/devicetree/bindings/iio/frequency/adf4350.txt b/Documentation/devicetree/bindings/iio/frequency/adf4350.txt
new file mode 100644
index 0000000..f8c181d
--- /dev/null
+++ b/Documentation/devicetree/bindings/iio/frequency/adf4350.txt

@@ -0,0 +1,86 @@
+Analog Devices ADF4350/ADF4351 device driver
+
+Required properties:
+	- compatible: Should be one of
+		* "adi,adf4350": When using the ADF4350 device
+		* "adi,adf4351": When using the ADF4351 device
+	- reg: SPI chip select number for the device
+	- spi-max-frequency: Max SPI frequency to use (< 20000000)
+	- clocks: From common clock binding. Clock is phandle to clock for
+		ADF435x Reference Clock (CLKIN).
+
+Optional properties:
+	- gpios:	 GPIO Lock detect - If set with a valid phandle and GPIO number,
+			pll lock state is tested upon read.
+	- adi,channel-spacing: Channel spacing in Hz (influences MODULUS).
+	- adi,power-up-frequency:	If set in Hz the PLL tunes to
+			the desired frequency on probe.
+	- adi,reference-div-factor: If set the driver skips dynamic calculation
+			and uses this default value instead.
+	- adi,reference-doubler-enable: Enables reference doubler.
+	- adi,reference-div2-enable: Enables reference divider.
+	- adi,phase-detector-polarity-positive-enable: Enables positive phase
+			detector polarity. Default = negative.
+	- adi,lock-detect-precision-6ns-enable: Enables 6ns lock detect precision.
+			Default = 10ns.
+	- adi,lock-detect-function-integer-n-enable: Enables lock detect
+			for integer-N mode. Default = fractional-N mode.
+	- adi,charge-pump-current: Charge pump current in mA.
+			Default = 2500mA.
+	- adi,muxout-select: On chip multiplexer output selection.
+			Valid values for the multiplexer output are:
+			0: Three-State Output (default)
+			1: DVDD
+			2: DGND
+			3: R-Counter output
+			4: N-Divider output
+			5: Analog lock detect
+			6: Digital lock detect
+	- adi,low-spur-mode-enable: Enables low spur mode.
+			Default = Low noise mode.
+	- adi,cycle-slip-reduction-enable: Enables cycle slip reduction.
+	- adi,charge-cancellation-enable: Enables charge pump
+			charge cancellation for integer-N modes.
+	- adi,anti-backlash-3ns-enable: Enables 3ns antibacklash pulse width
+			 for integer-N modes.
+	- adi,band-select-clock-mode-high-enable: Enables faster band
+			selection logic.
+	- adi,12bit-clk-divider: Clock divider value used when
+			adi,clk-divider-mode != 0
+	- adi,clk-divider-mode:
+			Valid values for the clkdiv mode are:
+			0: Clock divider off (default)
+			1: Fast lock enable
+			2: Phase resync enable
+	- adi,aux-output-enable: Enables auxiliary RF output.
+	- adi,aux-output-fundamental-enable: Selects fundamental VCO output on
+			the auxiliary RF output. Default = Output of RF dividers.
+	- adi,mute-till-lock-enable: Enables Mute-Till-Lock-Detect function.
+	- adi,output-power: Output power selection.
+			Valid values for the power mode are:
+			0: -4dBm (default)
+			1: -1dBm
+			2: +2dBm
+			3: +5dBm
+	- adi,aux-output-power: Auxiliary output power selection.
+			Valid values for the power mode are:
+			0: -4dBm (default)
+			1: -1dBm
+			2: +2dBm
+			3: +5dBm
+
+
+Example:
+		lo_pll0_rx_adf4351: adf4351-rx-lpc@4 {
+			compatible = "adi,adf4351";
+			reg = <4>;
+			spi-max-frequency = <10000000>;
+			clocks = <&clk0_ad9523 9>;
+			clock-names = "clkin";
+			adi,channel-spacing = <10000>;
+			adi,power-up-frequency = <2400000000>;
+			adi,phase-detector-polarity-positive-enable;
+			adi,charge-pump-current = <2500>;
+			adi,output-power = <3>;
+			adi,mute-till-lock-enable;
+		};

diff --git a/Documentation/devicetree/bindings/iio/magnetometer/ak8975.txt b/Documentation/devicetree/bindings/iio/magnetometer/ak8975.txt
new file mode 100644
index 0000000..011679f
--- /dev/null
+++ b/Documentation/devicetree/bindings/iio/magnetometer/ak8975.txt

@@ -0,0 +1,18 @@
+* AsahiKASEI AK8975 magnetometer sensor
+
+Required properties:
+
+  - compatible : should be "asahi-kasei,ak8975"
+  - reg : the I2C address of the magnetometer
+
+Optional properties:
+
+  - gpios : should be device tree identifier of the magnetometer DRDY pin
+
+Example:
+
+ak8975@0c {
+        compatible = "asahi-kasei,ak8975";
+        reg = <0x0c>;
+        gpios = <&gpj0 7 0>;
+};

diff --git a/Documentation/devicetree/bindings/memory-controllers/mvebu-devbus.txt b/Documentation/devicetree/bindings/memory-controllers/mvebu-devbus.txt
new file mode 100644
index 0000000..653c90c
--- /dev/null
+++ b/Documentation/devicetree/bindings/memory-controllers/mvebu-devbus.txt

@@ -0,0 +1,156 @@
+Device tree bindings for MVEBU Device Bus controllers
+
+The Device Bus controller available in some Marvell SoCs allows controlling
+different types of standard memory and I/O devices such as NOR, NAND, and FPGA.
+The actual devices are instantiated from the child nodes of a Device Bus node.
+
+Required properties:
+
+ - compatible:          Currently only Armada 370/XP SoC are supported,
+                        with this compatible string:
+
+                        marvell,mvebu-devbus
+
+ - reg:                 A resource specifier for the register space.
+                        This is the base address of a chip select within
+			the controller's register space.
+                        (see the example below)
+
+ - #address-cells:      Must be set to 1
+ - #size-cells:         Must be set to 1
+ - ranges:              Must be set up to reflect the memory layout with four
+                        integer values for each chip-select line in use:
+                        0 <physical address of mapping> <size>
+
+Mandatory timing properties for child nodes:
+
+Read parameters:
+
+ - devbus,turn-off-ps:  Defines the time during which the controller does not
+                        drive the AD bus after the completion of a device read.
+                        This prevents contentions on the Device Bus after a read
+                        cycle from a slow device.
+
+ - devbus,bus-width:    Defines the bus width (e.g. <16>)
+
+ - devbus,badr-skew-ps: Defines the time delay from A[2:0] toggle,
+                        to read data sample. This parameter is useful for
+                        synchronous pipelined devices, where the address
+                        precedes the read data by one or two cycles.
+
+ - devbus,acc-first-ps: Defines the time delay from the negation of
+                        ALE[0] to the cycle that the first read data is sampled
+                        by the controller.
+
+ - devbus,acc-next-ps:  Defines the time delay between the cycle that
+                        samples data N and the cycle that samples data N+1
+                        (in burst accesses).
+
+ - devbus,rd-setup-ps:  Defines the time delay between DEV_CSn assertion to
+			DEV_OEn assertion. If set to 0 (default),
+                        DEV_OEn and DEV_CSn are asserted at the same cycle.
+                        This parameter has no effect on <acc-first-ps> parameter
+                        (no effect on first data sample). Set <rd-setup-ps>
+                        to a value smaller than <acc-first-ps>.
+
+ - devbus,rd-hold-ps:   Defines the time between the last data sample to the
+			de-assertion of DEV_CSn. If set to 0 (default),
+			DEV_OEn and DEV_CSn are de-asserted at the same cycle
+			(the cycle of the last data sample).
+                        This parameter has no effect on DEV_OEn de-assertion.
+                        DEV_OEn is always de-asserted the next cycle after
+                        last data sampled. Also this parameter has no
+                        effect on <turn-off-ps> parameter.
+                        Set <rd-hold-ps> to a value smaller than <turn-off-ps>.
+
+Write parameters:
+
+ - devbus,ale-wr-ps:    Defines the time delay from the ALE[0] negation cycle
+			to the DEV_WEn assertion.
+
+ - devbus,wr-low-ps:    Defines the time during which DEV_WEn is active.
+                        A[2:0] and Data are kept valid as long as DEV_WEn
+                        is active. This parameter defines the setup time of
+                        address and data to DEV_WEn rise.
+
+ - devbus,wr-high-ps:   Defines the time during which DEV_WEn is kept
+                        inactive (high) between data beats of a burst write.
+                        DEV_A[2:0] and Data are kept valid (do not toggle) for
+                        <wr-high-ps> - <tick> ps.
+			This parameter defines the hold time of address and
+			data after DEV_WEn rise.
+
+ - devbus,sync-enable: Synchronous device enable.
+                       1: True
+                       0: False
+
+An example for an Armada XP GP board, with a 16 MiB NOR device as child
+is shown below. Note that the Device Bus driver is in charge of allocating
+the mbus address decoding window for each of its child devices.
+The window is created using the chip select specified in the child
+device node together with the base address and size specified in the ranges
+property. For instance, in the example below the allocated decoding window
+will start at base address 0xf0000000, with a size 0x1000000 (16 MiB)
+for chip select 0 (a.k.a DEV_BOOTCS).
+
+This address window handling is done in this mvebu-devbus only as a temporary
+solution. It will be removed when the support for mbus device tree binding is
+added.
+
+The reg property implicitly specifies the chip select as this:
+
+  0x10400: DEV_BOOTCS
+  0x10408: DEV_CS0
+  0x10410: DEV_CS1
+  0x10418: DEV_CS2
+  0x10420: DEV_CS3
+
+Example:
+
+	devbus-bootcs@d0010400 {
+		status = "okay";
+		ranges = <0 0xf0000000 0x1000000>; /* @addr 0xf0000000, size 0x1000000 */
+		#address-cells = <1>;
+		#size-cells = <1>;
+
+		/* Device Bus parameters are required */
+
+		/* Read parameters */
+		devbus,bus-width    = <8>;
+		devbus,turn-off-ps  = <60000>;
+		devbus,badr-skew-ps = <0>;
+		devbus,acc-first-ps = <124000>;
+		devbus,acc-next-ps  = <248000>;
+		devbus,rd-setup-ps  = <0>;
+		devbus,rd-hold-ps   = <0>;
+
+		/* Write parameters */
+		devbus,sync-enable = <0>;
+		devbus,wr-high-ps  = <60000>;
+		devbus,wr-low-ps   = <60000>;
+		devbus,ale-wr-ps   = <60000>;
+
+		flash@0 {
+			compatible = "cfi-flash";
+
+			/* 16 MiB */
+			reg = <0 0x1000000>;
+			bank-width = <2>;
+			#address-cells = <1>;
+			#size-cells = <1>;
+
+			/*
+			 * We split the 16 MiB in two partitions,
+			 * just as an example.
+			 */
+			partition@0 {
+				label = "First";
+				reg = <0 0x800000>;
+			};
+
+			partition@800000 {
+				label = "Second";
+				reg = <0x800000 0x800000>;
+			};
+		};
+	};

diff --git a/Documentation/devicetree/bindings/staging/imx-drm/ldb.txt b/Documentation/devicetree/bindings/staging/imx-drm/ldb.txt
new file mode 100644
index 0000000..ed937781
--- /dev/null
+++ b/Documentation/devicetree/bindings/staging/imx-drm/ldb.txt

@@ -0,0 +1,99 @@
+Device-Tree bindings for LVDS Display Bridge (ldb)
+
+LVDS Display Bridge
+===================
+
+The LVDS Display Bridge device tree node contains up to two lvds-channel
+nodes describing each of the two LVDS encoder channels of the bridge.
+
+Required properties:
+ - #address-cells : should be <1>
+ - #size-cells : should be <0>
+ - compatible : should be "fsl,imx53-ldb" or "fsl,imx6q-ldb".
+                Both LDB versions are similar, but i.MX6 has an additional
+                multiplexer in the front to select any of the four IPU display
+                interfaces as input for each LVDS channel.
+ - gpr : should be <&gpr> on i.MX53 and i.MX6q.
+         The phandle points to the iomuxc-gpr region containing the LVDS
+         control register.
+- clocks, clock-names : phandles to the LDB divider and selector clocks and to
+                        the display interface selector clocks, as described in
+                        Documentation/devicetree/bindings/clock/clock-bindings.txt
+        The following clocks are expected on i.MX53:
+                "di0_pll" - LDB LVDS channel 0 mux
+                "di1_pll" - LDB LVDS channel 1 mux
+                "di0" - LDB LVDS channel 0 gate
+                "di1" - LDB LVDS channel 1 gate
+                "di0_sel" - IPU1 DI0 mux
+                "di1_sel" - IPU1 DI1 mux
+        On i.MX6q the following additional clocks are needed:
+                "di2_sel" - IPU2 DI0 mux
+                "di3_sel" - IPU2 DI1 mux
+        The needed clock numbers for each are documented in
+        Documentation/devicetree/bindings/clock/imx5-clock.txt, and in
+        Documentation/devicetree/bindings/clock/imx6q-clock.txt.
+
+Optional properties:
+ - pinctrl-names : should be "default" on i.MX53, not used on i.MX6q
+ - pinctrl-0 : a phandle pointing to LVDS pin settings on i.MX53,
+               not used on i.MX6q
+ - fsl,dual-channel : boolean. if it exists, only LVDS channel 0 should
+   be configured - one input will be distributed on both outputs in dual
+   channel mode
+
+LVDS Channel
+============
+
+Each LVDS Channel has to contain a display-timings node that describes the
+video timings for the connected LVDS display. For detailed information, also
+have a look at Documentation/devicetree/bindings/video/display-timing.txt.
+
+Required properties:
+ - reg : should be <0> or <1>
+ - crtcs : a list of phandles with index pointing to the IPU display interfaces
+           that can be used as video source for this channel.
+ - fsl,data-mapping : should be "spwg" or "jeida"
+                      This describes how the color bits are laid out in the
+                      serialized LVDS signal.
+ - fsl,data-width : should be <18> or <24>
+
+example:
+
+gpr: iomuxc-gpr@53fa8000 {
+	/* ... */
+};
+
+ldb: ldb@53fa8008 {
+	#address-cells = <1>;
+	#size-cells = <0>;
+	compatible = "fsl,imx53-ldb";
+	gpr = <&gpr>;
+	clocks = <&clks 122>, <&clks 120>,
+		 <&clks 115>, <&clks 116>,
+		 <&clks 123>, <&clks 85>;
+	clock-names = "di0_pll", "di1_pll",
+		      "di0_sel", "di1_sel",
+		      "di0", "di1";
+
+	lvds-channel@0 {
+		reg = <0>;
+		crtcs = <&ipu 0>;
+		fsl,data-mapping = "spwg";
+		fsl,data-width = <24>;
+
+		display-timings {
+			/* ... */
+		};
+	};
+
+	lvds-channel@1 {
+		reg = <1>;
+		crtcs = <&ipu 1>;
+		fsl,data-mapping = "spwg";
+		fsl,data-width = <24>;
+
+		display-timings {
+			/* ... */
+		};
+	};
+};

diff --git a/Documentation/devicetree/bindings/tty/serial/fsl-imx-uart.txt b/Documentation/devicetree/bindings/tty/serial/fsl-imx-uart.txt
index b462d0c..c662eb3 100644
--- a/Documentation/devicetree/bindings/tty/serial/fsl-imx-uart.txt
+++ b/Documentation/devicetree/bindings/tty/serial/fsl-imx-uart.txt

@@ -8,6 +8,8 @@
 Optional properties:
 - fsl,uart-has-rtscts : Indicate the uart has rts and cts
 - fsl,irda-mode : Indicate the uart supports irda mode
+- fsl,dte-mode : Indicate the uart works in DTE mode. The uart works
+                  in DCE mode by default.
 
 Example:
 
@@ -16,4 +18,5 @@
 	reg = <0x73fbc000 0x4000>;
 	interrupts = <31>;
 	fsl,uart-has-rtscts;
+	fsl,dte-mode;
 };

diff --git a/Documentation/devicetree/bindings/tty/serial/fsl-lpuart.txt b/Documentation/devicetree/bindings/tty/serial/fsl-lpuart.txt
new file mode 100644
index 0000000..6fd1dd1
--- /dev/null
+++ b/Documentation/devicetree/bindings/tty/serial/fsl-lpuart.txt

@@ -0,0 +1,14 @@
+* Freescale low power universal asynchronous receiver/transmitter (lpuart)
+
+Required properties:
+- compatible : Should be "fsl,<soc>-lpuart"
+- reg : Address and length of the register set for the device
+- interrupts : Should contain uart interrupt
+
+Example:
+
+uart0: serial@40027000 {
+	       compatible = "fsl,vf610-lpuart";
+	       reg = <0x40027000 0x1000>;
+	       interrupts = <0 61 0x00>;
+       };

diff --git a/Documentation/devicetree/bindings/usb/ci13xxx-imx.txt b/Documentation/devicetree/bindings/usb/ci13xxx-imx.txt
index 1c04a4c..b4b5b79 100644
--- a/Documentation/devicetree/bindings/usb/ci13xxx-imx.txt
+++ b/Documentation/devicetree/bindings/usb/ci13xxx-imx.txt

@@ -5,6 +5,12 @@
 - reg: Should contain registers location and length
 - interrupts: Should contain controller interrupt
 
+Recommended properties:
+- phy_type: the type of the phy connected to the core. Should be one
+  of "utmi", "utmi_wide", "ulpi", "serial" or "hsic". Without this
+  property the PORTSC register won't be touched
+- dr_mode: One of "host", "peripheral" or "otg". Defaults to "otg"
+
 Optional properties:
 - fsl,usbphy: phandler of usb phy that connects to the only one port
 - fsl,usbmisc: phandler of non-core register device, with one argument

diff --git a/Documentation/devicetree/bindings/usb/nvidia,tegra20-ehci.txt b/Documentation/devicetree/bindings/usb/nvidia,tegra20-ehci.txt
index 34c9528..df093304 100644
--- a/Documentation/devicetree/bindings/usb/nvidia,tegra20-ehci.txt
+++ b/Documentation/devicetree/bindings/usb/nvidia,tegra20-ehci.txt

@@ -6,27 +6,10 @@
 and additions :
 
 Required properties :
- - compatible : Should be "nvidia,tegra20-ehci" for USB controllers
-   used in host mode.
- - phy_type : Should be one of "ulpi" or "utmi".
- - nvidia,vbus-gpio : If present, specifies a gpio that needs to be
-   activated for the bus to be powered.
- - nvidia,phy : phandle of the PHY instance, the controller is connected to.
-
-Required properties for phy_type == ulpi:
-  - nvidia,phy-reset-gpio : The GPIO used to reset the PHY.
+ - compatible : Should be "nvidia,tegra20-ehci".
+ - nvidia,phy : phandle of the PHY that the controller is connected to.
+ - clocks : Contains a single entry which defines the USB controller's clock.
 
 Optional properties:
-  - dr_mode : dual role mode. Indicates the working mode for
-   nvidia,tegra20-ehci compatible controllers.  Can be "host", "peripheral",
-   or "otg".  Default to "host" if not defined for backward compatibility.
-      host means this is a host controller
-      peripheral means it is device controller
-      otg means it can operate as either ("on the go")
-  - nvidia,has-legacy-mode : boolean indicates whether this controller can
-    operate in legacy mode (as APX 2500 / 2600). In legacy mode some
-    registers are accessed through the APB_MISC base address instead of
-    the USB controller. Since this is a legacy issue it probably does not
-    warrant a compatible string of its own.
-  - nvidia,needs-double-reset : boolean is to be set for some of the Tegra2
-    USB ports, which need reset twice due to hardware issues.
+ - nvidia,needs-double-reset : boolean is to be set for some of the Tegra20
+   USB ports, which need reset twice due to hardware issues.

diff --git a/Documentation/devicetree/bindings/usb/nvidia,tegra20-usb-phy.txt b/Documentation/devicetree/bindings/usb/nvidia,tegra20-usb-phy.txt
index 6bdaba2..c4c9e9e 100644
--- a/Documentation/devicetree/bindings/usb/nvidia,tegra20-usb-phy.txt
+++ b/Documentation/devicetree/bindings/usb/nvidia,tegra20-usb-phy.txt

@@ -4,14 +4,49 @@
 
 Required properties :
  - compatible : Should be "nvidia,tegra20-usb-phy".
- - reg : Address and length of the register set for the USB PHY interface.
- - phy_type : Should be one of "ulpi" or "utmi".
+ - reg : Defines the following set of registers, in the order listed:
+   - The PHY's own register set.
+     Always present.
+   - The register set of the PHY containing the UTMI pad control registers.
+     Present if-and-only-if phy_type == utmi.
+ - phy_type : Should be one of "utmi", "ulpi" or "hsic".
+ - clocks : Defines the clocks listed in the clock-names property.
+ - clock-names : The following clock names must be present:
+   - reg: The clock needed to access the PHY's own registers. This is the
+     associated EHCI controller's clock. Always present.
+   - pll_u: PLL_U. Always present.
+   - timer: The timeout clock (clk_m). Present if phy_type == utmi.
+   - utmi-pads: The clock needed to access the UTMI pad control registers.
+     Present if phy_type == utmi.
+   - ulpi-link: The clock Tegra provides to the ULPI PHY (cdev2).
+     Present if phy_type == ulpi, and ULPI link mode is in use.
 
 Required properties for phy_type == ulpi:
   - nvidia,phy-reset-gpio : The GPIO used to reset the PHY.
 
+Required PHY timing params for utmi phy:
+  - nvidia,hssync-start-delay : Number of 480 MHz clock cycles to wait before
+    start of sync launches RxActive
+  - nvidia,elastic-limit : Variable FIFO Depth of elastic input store
+  - nvidia,idle-wait-delay : Number of 480 MHz clock cycles of idle to wait
+    before declare IDLE.
+  - nvidia,term-range-adj : Range adjustment on terminations
+  - nvidia,xcvr-setup : HS driver output control
+  - nvidia,xcvr-lsfslew : LS falling slew rate control.
+  - nvidia,xcvr-lsrslew :  LS rising slew rate control.
+
 Optional properties:
   - nvidia,has-legacy-mode : boolean indicates whether this controller can
     operate in legacy mode (as APX 2500 / 2600). In legacy mode some
     registers are accessed through the APB_MISC base address instead of
-    the USB controller.
\ No newline at end of file
+    the USB controller.
+  - nvidia,is-wired : boolean. Indicates whether we can do certain kinds of power
+    optimizations for the devices that are always connected. e.g. modem.
+  - dr_mode : dual role mode. Indicates the working mode for the PHY. Can be
+    "host", "peripheral", or "otg". Defaults to "host" if not defined.
+      host means this is a host controller
+      peripheral means it is device controller
+      otg means it can operate as either ("on the go")
+
+Required properties for dr_mode == otg:
+  - vbus-supply: regulator for VBUS

diff --git a/Documentation/devicetree/bindings/usb/usb3503.txt b/Documentation/devicetree/bindings/usb/usb3503.txt
index 6813a71..8c5be48 100644
--- a/Documentation/devicetree/bindings/usb/usb3503.txt
+++ b/Documentation/devicetree/bindings/usb/usb3503.txt

@@ -4,6 +4,10 @@
 - compatible: Should be "smsc,usb3503".
 - reg: Specifies the i2c slave address, it should be 0x08.
 - connect-gpios: Should specify GPIO for connect.
+- disabled-ports: Should specify the ports unused.
+	'1' or '2' or '3' are available for this property to describe the port
+	number. One to three property values may be described.
+	Do not describe this property if all ports have to be enabled.
 - intn-gpios: Should specify GPIO for interrupt.
 - reset-gpios: Should specify GPIO for reset.
 - initial-mode: Should specify initial mode.
@@ -14,6 +18,7 @@
 		compatible = "smsc,usb3503";
 		reg = <0x08>;
 		connect-gpios = <&gpx3 0 1>;
+		disabled-ports = <2 3>;
 		intn-gpios = <&gpx3 4 1>;
 		reset-gpios = <&gpx3 5 1>;
 		initial-mode = <1>;

diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 0706d32a..9858f33 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking

@@ -189,7 +189,7 @@
 				loff_t pos, unsigned len, unsigned copied,
 				struct page *page, void *fsdata);
 	sector_t (*bmap)(struct address_space *, sector_t);
-	int (*invalidatepage) (struct page *, unsigned long);
+	void (*invalidatepage) (struct page *, unsigned int, unsigned int);
 	int (*releasepage) (struct page *, int);
 	void (*freepage)(struct page *);
 	int (*direct_IO)(int, struct kiocb *, const struct iovec *iov,
@@ -310,8 +310,8 @@
 keep it that way and don't breed new callers.
 
 	->invalidatepage() is called when the filesystem must attempt to drop
-some or all of the buffers from the page when it is being truncated.  It
-returns zero on success.  If ->invalidatepage is zero, the kernel uses
+some or all of the buffers from the page when it is being truncated. It
+returns zero on success. If ->invalidatepage is zero, the kernel uses
 block_invalidatepage() instead.
 
 	->releasepage() is called when the kernel is about to try to drop the
@@ -414,7 +414,7 @@
 	ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
 	ssize_t (*aio_read) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
 	ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
-	int (*readdir) (struct file *, void *, filldir_t);
+	int (*iterate) (struct file *, struct dir_context *);
 	unsigned int (*poll) (struct file *, struct poll_table_struct *);
 	long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
 	long (*compat_ioctl) (struct file *, unsigned int, unsigned long);

diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt
index bd3c56c..b91e2f2 100644
--- a/Documentation/filesystems/f2fs.txt
+++ b/Documentation/filesystems/f2fs.txt

@@ -98,8 +98,13 @@
 MOUNT OPTIONS
 ================================================================================
 
-background_gc_off      Turn off cleaning operations, namely garbage collection,
-		       triggered in background when I/O subsystem is idle.
+background_gc=%s       Turn on/off cleaning operations, namely garbage
+                       collection, triggered in background when I/O subsystem is
+                       idle. If background_gc=on, it will turn on the garbage
+                       collection and if background_gc=off, garbage collection
+                       will be turned off.
+                       Default value for this option is on. So garbage
+                       collection is on by default.
 disable_roll_forward   Disable the roll-forward recovery routine
 discard                Issue discard/TRIM commands when a segment is cleaned.
 no_heap                Disable heap-style segment allocation which finds free

diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting
index 4db22f6..206a1bd 100644
--- a/Documentation/filesystems/porting
+++ b/Documentation/filesystems/porting

@@ -445,3 +445,9 @@
 [mandatory]
 	FS_REVAL_DOT is gone; if you used to have it, add ->d_weak_revalidate()
 in your dentry operations instead.
+--
+[mandatory]
+	vfs_readdir() is gone; switch to iterate_dir() instead
+--
+[mandatory]
+	->readdir() is gone now; switch to ->iterate()

diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index bc4b06b..e6bd1ff 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt

@@ -549,7 +549,7 @@
 -------------------------------
 
 This describes how the VFS can manipulate mapping of a file to page cache in
-your filesystem. As of kernel 2.6.22, the following members are defined:
+your filesystem. The following members are defined:
 
 struct address_space_operations {
 	int (*writepage)(struct page *page, struct writeback_control *wbc);
@@ -566,7 +566,7 @@
 				loff_t pos, unsigned len, unsigned copied,
 				struct page *page, void *fsdata);
 	sector_t (*bmap)(struct address_space *, sector_t);
-	int (*invalidatepage) (struct page *, unsigned long);
+	void (*invalidatepage) (struct page *, unsigned int, unsigned int);
 	int (*releasepage) (struct page *, int);
 	void (*freepage)(struct page *);
 	ssize_t (*direct_IO)(int, struct kiocb *, const struct iovec *iov,
@@ -685,14 +685,14 @@
   invalidatepage: If a page has PagePrivate set, then invalidatepage
         will be called when part or all of the page is to be removed
 	from the address space.  This generally corresponds to either a
-	truncation or a complete invalidation of the address space
-	(in the latter case 'offset' will always be 0).
-	Any private data associated with the page should be updated
-	to reflect this truncation.  If offset is 0, then
-	the private data should be released, because the page
-	must be able to be completely discarded.  This may be done by
-        calling the ->releasepage function, but in this case the
-        release MUST succeed.
+	truncation, punch hole  or a complete invalidation of the address
+	space (in the latter case 'offset' will always be 0 and 'length'
+	will be PAGE_CACHE_SIZE). Any private data associated with the page
+	should be updated to reflect this truncation.  If offset is 0 and
+	length is PAGE_CACHE_SIZE, then the private data should be released,
+	because the page must be able to be completely discarded.  This may
+	be done by calling the ->releasepage function, but in this case the
+	release MUST succeed.
 
   releasepage: releasepage is called on PagePrivate pages to indicate
         that the page should be freed if possible.  ->releasepage
@@ -777,7 +777,7 @@
 	ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
 	ssize_t (*aio_read) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
 	ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
-	int (*readdir) (struct file *, void *, filldir_t);
+	int (*iterate) (struct file *, struct dir_context *);
 	unsigned int (*poll) (struct file *, struct poll_table_struct *);
 	long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
 	long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
@@ -815,7 +815,7 @@
 
   aio_write: called by io_submit(2) and other asynchronous I/O operations
 
-  readdir: called when the VFS needs to read the directory contents
+  iterate: called when the VFS needs to read the directory contents
 
   poll: called by the VFS when a process wants to check if there is
 	activity on this file and (optionally) go to sleep until there

diff --git a/Documentation/fmc/00-INDEX b/Documentation/fmc/00-INDEX
new file mode 100644
index 0000000..431c695
--- /dev/null
+++ b/Documentation/fmc/00-INDEX

@@ -0,0 +1,38 @@
+
+Documentation in this directory comes from sections of the manual we
+wrote for the externally-developed fmc-bus package. The complete
+manual as of today (2013-02) is available in PDF format at
+http://www.ohwr.org/projects/fmc-bus/files
+
+00-INDEX
+	- this file.
+
+FMC-and-SDB.txt
+	- What are FMC and SDB, basic concepts for this framework
+
+API.txt
+	- The functions that are exported by the bus driver
+
+parameters.txt
+	- The module parameters
+
+carrier.txt
+	- writing a carrier (a device)
+
+mezzanine.txt
+	- writing code for your mezzanine (a driver)
+
+identifiers.txt
+	- how identification and matching works
+
+fmc-fakedev.txt
+	- about drivers/fmc/fmc-fakedev.ko
+
+fmc-trivial.txt
+	- about drivers/fmc/fmc-trivial.ko
+
+fmc-write-eeprom.txt
+	- about drivers/fmc/fmc-write-eeprom.ko
+
+fmc-chardev.txt
+	- about drivers/fmc/fmc-chardev.ko

diff --git a/Documentation/fmc/API.txt b/Documentation/fmc/API.txt
new file mode 100644
index 0000000..06b06b9
--- /dev/null
+++ b/Documentation/fmc/API.txt

@@ -0,0 +1,47 @@
+Functions Exported by fmc.ko
+****************************
+
+The FMC core exports the usual 4 functions that are needed for a bus to
+work, and a few more:
+
+        int fmc_driver_register(struct fmc_driver *drv);
+        void fmc_driver_unregister(struct fmc_driver *drv);
+        int fmc_device_register(struct fmc_device *fmc);
+        void fmc_device_unregister(struct fmc_device *fmc);
+
+        int fmc_device_register_n(struct fmc_device **fmc, int n);
+        void fmc_device_unregister_n(struct fmc_device **fmc, int n);
+
+        uint32_t fmc_readl(struct fmc_device *fmc, int offset);
+        void fmc_writel(struct fmc_device *fmc, uint32_t val, int off);
+        void *fmc_get_drvdata(struct fmc_device *fmc);
+        void fmc_set_drvdata(struct fmc_device *fmc, void *data);
+
+        int fmc_reprogram(struct fmc_device *f, struct fmc_driver *d, char *gw,
+                          int sdb_entry);
+
+The data structure that describes a device is detailed in *note FMC
+Device::, the one that describes a driver is detailed in *note FMC
+Driver::.  Please note that structures of type fmc_device must be
+allocated by the caller, but must not be released after unregistering.
+The fmc-bus itself takes care of releasing the structure when their use
+count reaches zero - actually, the device model does that in lieu of us.
+
+The functions to register and unregister n devices are meant to be used
+by carriers that host more than one mezzanine. The devices must all be
+registered at the same time because if the FPGA is reprogrammed, all
+devices in the array are affected. Usually, the driver matching the
+first device will reprogram the FPGA, so other devices must know they
+are already driven by a reprogrammed FPGA.
+
+If a carrier hosts slots that are driven by different FPGA devices, it
+should register as a group only mezzanines that are driven by the same
+FPGA, for the reason outlined above.
+
+Finally, the fmc_reprogram function calls the reprogram method (see
+*note The API Offered by Carriers::) and also scans the memory area for
+an SDB tree. You can pass -1 as sdb_entry to disable such scan.
+Otherwise, the function fails if no tree is found at the specified
+entry point.  The function is meant to factorize common code, and by
+the time you read this it is already used by the spec-sw and fine-delay
+modules.

diff --git a/Documentation/fmc/FMC-and-SDB.txt b/Documentation/fmc/FMC-and-SDB.txt
new file mode 100644
index 0000000..fa14e0b
--- /dev/null
+++ b/Documentation/fmc/FMC-and-SDB.txt

@@ -0,0 +1,88 @@
+
+FMC (FPGA Mezzanine Card) is the standard we use for our I/O devices,
+in the context of White Rabbit and related hardware.
+
+In our I/O environments we need to write drivers for each mezzanine
+card, and such drivers must work regardless of the carrier being used.
+To achieve this, we abstract the FMC interface.
+
+We have a carrier for PCI-E called SPEC and one for VME called SVEC,
+but more are planned.  Also, we support stand-alone devices (usually
+plugged on a SPEC card), controlled through Etherbone, developed by GSI.
+
+Code and documentation for the FMC bus was born as part of the spec-sw
+project, but now it lives in its own project. Other projects, i.e.
+software support for the various carriers, should include this as a
+submodule.
+
+The most up to date version of code and documentation is always
+available from the repository you can clone from:
+
+        git://ohwr.org/fmc-projects/fmc-bus.git (read-only)
+        git@ohwr.org:fmc-projects/fmc-bus.git (read-write for developers)
+
+Selected versions of the documentation, as well as complete tar
+archives for selected revisions are placed to the Files section of the
+project: `http://www.ohwr.org/projects/fmc-bus/files'
+
+
+What is FMC
+***********
+
+FMC, as said, stands for "FPGA Mezzanine Card". It is a standard
+developed by the VME consortium called VITA (VMEbus International Trade
+Association) and ratified by ANSI, the American National Standards
+Institute.  The official documentation is called "ANSI-VITA 57.1".
+
+The FMC card is an almost square PCB, around 70x75 millimeters, that is
+called mezzanine in this document.  It usually lives plugged into
+another PCB for power supply and control; such bigger circuit board is
+called carrier from now on, and a single carrier may host more than one
+mezzanine.
+
+In the typical application the mezzanine is mostly analog while the
+carrier is mostly digital, and hosts an FPGA that must be configured to
+match the specific mezzanine and the desired application. Thus, you may
+need to load different FPGA images to drive different instances of the
+same mezzanine.
+
+FMC, as such, is not a bus in the usual meaning of the term, because
+most carriers have only one connector, and carriers with several
+connectors have completely separate electrical connections to them.
+This package, however, implements a bus as a software abstraction.
+
+
+What is SDB
+***********
+
+SDB (Self Describing Bus) is a set of data structures that we use for
+enumerating the internal structure of an FPGA image. We also use it as
+a filesystem inside the FMC EEPROM.
+
+SDB is not mandatory for use of this FMC kernel bus, but if you have SDB
+this package can make good use of it.  SDB itself is developed in the
+fpga-config-space OHWR project. The link to the repository is
+`git://ohwr.org/hdl-core-lib/fpga-config-space.git' and what is used in
+this project lives in the sdbfs subdirectory in there.
+
+SDB support for FMC is described in *note FMC Identification:: and
+*note SDB Support::
+
+
+SDB Support
+***********
+
+The fmc.ko bus driver exports a few functions to help drivers take
+advantage of the SDB information that may be present in your own FPGA
+memory image.
+
+The module exports the following functions, in the special header
+<linux/fmc-sdb.h>. The linux/ prefix in the name is there because we
+plan to submit it upstream in the future, and don't want to force
+changes on our drivers if that happens.
+
+         int fmc_scan_sdb_tree(struct fmc_device *fmc, unsigned long address);
+         void fmc_show_sdb_tree(struct fmc_device *fmc);
+         signed long fmc_find_sdb_device(struct sdb_array *tree, uint64_t vendor,
+                                         uint32_t device, unsigned long *sz);
+         int fmc_free_sdb_tree(struct fmc_device *fmc);

diff --git a/Documentation/fmc/carrier.txt b/Documentation/fmc/carrier.txt
new file mode 100644
index 0000000..173f6d6
--- /dev/null
+++ b/Documentation/fmc/carrier.txt

@@ -0,0 +1,311 @@
+FMC Device
+**********
+
+Within the Linux bus framework, the FMC device is created and
+registered by the carrier driver. For example, the PCI driver for the
+SPEC card fills a data structure for each SPEC that it drives, and
+registers an associated FMC device for each card.  The SVEC driver can
+do exactly the same for the VME carrier (actually, it should do it
+twice, because the SVEC carries two FMC mezzanines).  Similarly, an
+Etherbone driver will be able to register its own FMC devices, offering
+communication primitives through frame exchange.
+
+The contents of the EEPROM within the FMC are used for identification
+purposes, i.e. for matching the device with its own driver. For this
+reason the device structure includes a complete copy of the EEPROM
+(actually, the carrier driver may choose whether or not to return it -
+for example we most likely won't have the whole EEPROM available for
+Etherbone devices).
+
+The following listing shows the current structure defining a device.
+Please note that all the machinery is in place but some details may
+still change in the future.  For this reason, there is a version field
+at the beginning of the structure.  As usual, the minor number will
+change for compatible changes (like a new flag) and the major number
+will increase when an incompatible change happens (for example, a
+change in layout of some fmc data structures).  Device writers should
+just set it to the value FMC_VERSION, and be ready to get back -EINVAL
+at registration time.
+
+     struct fmc_device {
+             unsigned long version;
+             unsigned long flags;
+             struct module *owner;           /* char device must pin it */
+             struct fmc_fru_id id;           /* for EEPROM-based match */
+             struct fmc_operations *op;      /* carrier-provided */
+             int irq;                        /* according to host bus. 0 == none */
+             int eeprom_len;                 /* Usually 8kB, may be less */
+             int eeprom_addr;                /* 0x50, 0x52 etc */
+             uint8_t *eeprom;                /* Full contents or leading part */
+             char *carrier_name;             /* "SPEC" or similar, for special use */
+             void *carrier_data;             /* "struct spec *" or equivalent */
+             __iomem void *fpga_base;        /* May be NULL (Etherbone) */
+             __iomem void *slot_base;        /* Set by the driver */
+             struct fmc_device **devarray;   /* Allocated by the bus */
+             int slot_id;                    /* Index in the slot array */
+             int nr_slots;                   /* Number of slots in this carrier */
+             unsigned long memlen;           /* Used for the char device */
+             struct device dev;              /* For Linux use */
+             struct device *hwdev;           /* The underlying hardware device */
+             unsigned long sdbfs_entry;
+             struct sdb_array *sdb;
+             uint32_t device_id;             /* Filled by the device */
+             char *mezzanine_name;           /* Defaults to ``fmc'' */
+             void *mezzanine_data;
+     };
+
+The meaning of most fields is summarized in the code comment above.
+
+The following fields must be filled by the carrier driver before
+registration:
+
+   * version: must be set to FMC_VERSION.
+
+   * owner: set to THIS_MODULE.
+
+   * op: the operations to act on the device.
+
+   * irq: number for the mezzanine; may be zero.
+
+   * eeprom_len: length of the following array.
+
+   * eeprom_addr: 0x50 for first mezzanine and so on.
+
+   * eeprom: the full content of the I2C EEPROM.
+
+   * carrier_name.
+
+   * carrier_data: a unique pointer for the carrier.
+
+   * fpga_base: the I/O memory address (may be NULL).
+
+   * slot_id: the index of this slot (starting from zero).
+
+   * memlen: if fpga_base is valid, the length of I/O memory.
+
+   * hwdev: to be used in some dev_err() calls.
+
+   * device_id: a slot-specific unique integer number.
+
+
+Please note that the carrier should read its own EEPROM memory before
+registering the device, as well as fill all other fields listed above.
+
+The following fields should not be assigned, because they are filled
+later by either the bus or the device driver:
+
+   * flags.
+
+   * id: the FRU identifier, filled by the bus, parsing the eeprom.
+
+   * slot_base: filled and used by the driver, if useful to it.
+
+   * devarray: an array of all mezzanines driven by a single FPGA.
+
+   * nr_slots: set by the core at registration time.
+
+   * dev: used by Linux.
+
+   * sdb: FPGA contents, scanned according to driver's directions.
+
+   * sdbfs_entry: SDB entry point in EEPROM: autodetected.
+
+   * mezzanine_data: available for the driver.
+
+   * mezzanine_name: filled by fmc-bus during identification.
+
+
+Note: mezzanine_data may be redundant, because Linux offers the drvdata
+approach, so the field may be removed in later versions of this bus
+implementation.
+
+As I write this, the SPEC carrier is already completely functional in
+the fmc-bus environment, and is a good reference to look at.
+
+
+The API Offered by Carriers
+===========================
+
+The carrier provides a number of methods by means of the
+`fmc_operations' structure, which currently is defined like this
+(again, it is a moving target, please refer to the header rather than
+this document):
+
+     struct fmc_operations {
+             uint32_t (*readl)(struct fmc_device *fmc, int offset);
+             void (*writel)(struct fmc_device *fmc, uint32_t value, int offset);
+             int (*reprogram)(struct fmc_device *f, struct fmc_driver *d, char *gw);
+             int (*validate)(struct fmc_device *fmc, struct fmc_driver *drv);
+             int (*irq_request)(struct fmc_device *fmc, irq_handler_t h,
+                                char *name, int flags);
+             void (*irq_ack)(struct fmc_device *fmc);
+             int (*irq_free)(struct fmc_device *fmc);
+             int (*gpio_config)(struct fmc_device *fmc, struct fmc_gpio *gpio,
+                                int ngpio);
+             int (*read_ee)(struct fmc_device *fmc, int pos, void *d, int l);
+             int (*write_ee)(struct fmc_device *fmc, int pos, const void *d, int l);
+     };
+
+The individual methods perform the following tasks:
+
+`readl'
+`writel'
+     These functions access FPGA registers by whatever means the
+     carrier offers. They are not expected to fail, and most of the time
+     they will just make a memory access to the host bus. If the
+     carrier provides a fpga_base pointer, the driver may use direct
+     access through that pointer. For this reason the header offers the
+     inline functions fmc_readl and fmc_writel that access fpga_base if
+     the respective method is NULL. A driver that wants to be portable
+     and efficient should use fmc_readl and fmc_writel.  For Etherbone,
+     or other non-local carriers, error-management is still to be
+     defined.
+
+`validate'
+     Module parameters are used to manage different applications for
+     two or more boards of the same kind. Validation is based on the
+     busid module parameter, if provided, and returns the matching
+     index in the associated array. See *note Module Parameters:: if in
+     doubt. If no match is found, `-ENOENT' is returned; if the user
+     didn't pass `busid=', all devices will pass validation.  The value
+     returned by the validate method can be used as index into other
+     parameters (for example, some drivers use the `lm32=' parameter in
+     this way). Such "generic parameters" are documented in *note
+     Module Parameters::, below. The validate method is used by
+     `fmc-trivial.ko', described in *note fmc-trivial::.
+
+`reprogram'
+     The carrier enumerates FMC devices by loading a standard (or
+     golden) FPGA binary that allows EEPROM access. Each driver, then,
+     will need to reprogram the FPGA by calling this function.  If the
+     name argument is NULL, the carrier should reprogram the golden
+     binary. If the gateware name has been overridden through module
+     parameters (in a carrier-specific way) the file loaded will match
+     the parameters. Per-device gateware names can be specified using
+     the `gateware=' parameter, see *note Module Parameters::.  Note:
+     Clients should call the new helper, fmc_reprogram, which both
+     calls this method and parses the SDB tree of the FPGA.
+
+`irq_request'
+`irq_ack'
+`irq_free'
+     Interrupt management is carrier-specific, so it is abstracted as
+     operations. The interrupt number is listed in the device
+     structure, and for the mezzanine driver the number is only
+     informative.  The handler will receive the fmc pointer as dev_id;
+     the flags argument is passed to the Linux request_irq function,
+     but fmc-specific flags may be added in the future. You'll most
+     likely want to pass the `IRQF_SHARED' flag.
+
+`gpio_config'
+     The method allows configuring a GPIO pin in the carrier, and reading
+     its current value if it is configured as input. See *note The GPIO
+     Abstraction:: for details.
+
+`read_ee'
+`write_ee'
+     Read or write the EEPROM. The functions are expected to be only
+     called before reprogramming and the carrier should refuse them
+     with `ENODEV' after reprogramming.  The offset is expected to be
+     within 8kB (the current size), but addresses up to 1MB are
+     reserved to fit bigger I2C devices in the future. Carriers may
+     offer access to other internal flash memories using these same
+     methods: for example the SPEC driver may define that its carrier
+     I2C memory is seen at offset 1M and the internal SPI flash is seen
+     at offset 16M.  This multiplexing of several flash memories in the
+     same address space is carrier-specific and should only be used
+     by a driver that has verified the `carrier_name' field.
+
+
+
+The GPIO Abstraction
+====================
+
+Support for GPIO pins in the fmc-bus environment is not very
+straightforward and deserves special discussion.
+
+While the general idea of a carrier-independent driver seems to fly,
+configuration of specific signals within the carrier needs at least
+some knowledge of the carrier itself.  For this reason, the specific
+driver can request to configure carrier-specific GPIO pins, numbered
+from 0 to at most 4095.  Configuration is performed by passing a
+pointer to an array of struct fmc_gpio items, as well as the length of
+the array. This is the data structure:
+
+        struct fmc_gpio {
+                char *carrier_name;
+                int gpio;
+                int _gpio;      /* internal use by the carrier */
+                int mode;       /* GPIOF_DIR_OUT etc, from <linux/gpio.h> */
+                int irqmode;    /* IRQF_TRIGGER_LOW and so on */
+        };
+
+By specifying a carrier_name for each pin, the driver may access
+different pins in different carriers.  The gpio_config method is
+expected to return the number of pins successfully configured, ignoring
+requests for other carriers. However, if no pin is configured (because
+no structure at all refers to the current carrier_name), the operation
+returns an error so the caller will know that it is running under a
+yet-unsupported carrier.
+
+So, for example, a driver that has been developed and tested on both
+the SPEC and the SVEC may request configuration of two different GPIO
+pins, and expect one such configuration to succeed - if none succeeds
+it most likely means that the current carrier is a still-unknown one.
+
+If, however, your GPIO pin has a specific known role, you can pass a
+special number in the gpio field, using one of the following macros:
+
+        #define FMC_GPIO_RAW(x)         (x)             /* 4096 of them */
+        #define FMC_GPIO_IRQ(x)         ((x) + 0x1000)  /*  256 of them */
+        #define FMC_GPIO_LED(x)         ((x) + 0x1100)  /*  256 of them */
+        #define FMC_GPIO_KEY(x)         ((x) + 0x1200)  /*  256 of them */
+        #define FMC_GPIO_TP(x)          ((x) + 0x1300)  /*  256 of them */
+        #define FMC_GPIO_USER(x)        ((x) + 0x1400)  /*  256 of them */
+
+Use of virtual GPIO numbers (anything but FMC_GPIO_RAW) is allowed
+provided the carrier_name field in the data structure is left
+unspecified (NULL). Each carrier is responsible for providing a mapping
+between virtual and physical GPIO numbers. The carrier may then use the
+_gpio field to cache the result of this mapping.
+
+All carriers must map their I/O lines to the sets above starting from
+zero.  The SPEC, for example, maps interrupt pins 0 and 1, and test
+points 0 through 3 (even if the test points on the PCB are called
+5,6,7,8).
+
+If, for example, a driver requires a free LED and a test point (for a
+scope probe to be plugged at some point during development) it may ask
+for FMC_GPIO_LED(0) and FMC_GPIO_TP(0). Each carrier will provide
+suitable GPIO pins.  Clearly, the person running the drivers will know
+the order used by the specific carrier driver in assigning leds and
+testpoints, so as to make a carrier-dependent use of the diagnostic tools.
+
+In theory, some form of autodetection should be possible: a driver like
+the wr-nic (which uses IRQ(1) on the SPEC card) should configure
+IRQ(0), make a test with software-generated interrupts and configure
+IRQ(1) if the test fails. This probing step should be used because even
+if the wr-nic gateware is known to use IRQ1 on the SPEC, the driver
+should be carrier-independent and thus use IRQ(0) as a first bet -
+actually, the knowledge that IRQ0 may fail is carrier-dependent
+information, but using it doesn't make the driver unsuitable for other
+carriers.
+
+The return value of gpio_config is defined as follows:
+
+   * If no pin in the array can be used by the carrier, `-ENODEV'.
+
+   * If at least one virtual GPIO number cannot be mapped, `-ENOENT'.
+
+   * On success, 0 or positive. The value returned is the number of
+     high input bits (if no input is configured, the value for success
+     is 0).
+
+While I admit the procedure is not completely straightforward, it
+allows configuration, input and output with a single carrier operation.
+Given the typical use case of FMC devices, GPIO operations are not
+expected to ever be in hot paths, and GPIO access so far has only been
+used to configure the interrupt pin, mode and polarity. Especially
+reading inputs is not expected to be common. If your device has GPIO
+capabilities in the hot path, you should consider using the kernel's
+GPIO mechanisms.

diff --git a/Documentation/fmc/fmc-chardev.txt b/Documentation/fmc/fmc-chardev.txt
new file mode 100644
index 0000000..d9ccb27
--- /dev/null
+++ b/Documentation/fmc/fmc-chardev.txt

@@ -0,0 +1,64 @@
+fmc-chardev
+===========
+
+This is a simple generic driver, that allows user access by means of a
+character device (actually, one for each mezzanine it takes hold of).
+
+The char device is created as a misc device. Its name in /dev (as
+created by udev) is the same name as the underlying FMC device. Thus,
+the name can be a silly fmc-0000 look-alike if the device has no
+identifiers nor bus_id, a more specific fmc-0400 if the device has a
+bus-specific address but no associated name, or something like
+fdelay-0400 if the FMC core can rely on both a mezzanine name and a bus
+address.
+
+Currently the driver only supports read and write: you can lseek to the
+desired address and read or write a register.
+
+The driver assumes all registers are 32-bit in size, and only accepts a
+single read or write per system call. However, as a result of Unix read
+and write semantics, users can simply fread or fwrite bigger areas in
+order to dump or store bigger memory areas.
+
+There is currently no support for mmap, user-space interrupt management
+and DMA buffers. They may be added in later versions, if the need
+arises.
+
+The example below shows raw access to a SPEC card programmed with its
+golden FPGA file, that features an SDB structure at offset 256 - i.e.
+64 words.  The mezzanine's EEPROM in this case is not programmed, so the
+default name is fmc-<bus><devfn>, and there are two cards in the system:
+
+  spusa.root# insmod fmc-chardev.ko
+  [ 1073.339332] spec 0000:02:00.0: Driver has no ID: matches all
+  [ 1073.345051] spec 0000:02:00.0: Created misc device "fmc-0200"
+  [ 1073.350821] spec 0000:04:00.0: Driver has no ID: matches all
+  [ 1073.356525] spec 0000:04:00.0: Created misc device "fmc-0400"
+  spusa.root# ls -l /dev/fmc*
+  crw------- 1 root root 10, 58 Nov 20 19:23 /dev/fmc-0200
+  crw------- 1 root root 10, 57 Nov 20 19:23 /dev/fmc-0400
+  spusa.root# dd bs=4 skip=64 count=1 if=/dev/fmc-0200 2> /dev/null | od -t x1z
+  0000000 2d 42 44 53                                      >-BDS<
+  0000004
+
+The simple program tools/fmc-mem in this package can access an FMC char
+device and read or write a word or a whole area.  Actually, the program
+is not specific to FMC at all, it just uses lseek, read and write.
+
+Its first argument is the device name, the second the offset, the third
+(if any) the value to write and the optional last argument that must
+begin with "+" is the number of bytes to read or write.  In case of
+repeated reading data is written to stdout; repeated writes read from
+stdin and the value argument is ignored.
+
+The following examples show reading the SDB magic number and the first
+SDB record from a SPEC device programmed with its golden image:
+
+     spusa.root# ./fmc-mem /dev/fmc-0200 100
+     5344422d
+     spusa.root# ./fmc-mem /dev/fmc-0200 100 +40 | od -Ax -t x1z
+     000000 2d 42 44 53 00 01 02 00 00 00 00 00 00 00 00 00  >-BDS............<
+     000010 00 00 00 00 ff 01 00 00 00 00 00 00 51 06 00 00  >............Q...<
+     000020 c9 42 a5 e6 02 00 00 00 11 05 12 20 2d 34 42 57  >.B......... -4BW<
+     000030 73 6f 72 43 72 61 62 73 49 53 47 2d 00 20 20 20  >sorCrabsISG-.   <
+     000040

diff --git a/Documentation/fmc/fmc-fakedev.txt b/Documentation/fmc/fmc-fakedev.txt
new file mode 100644
index 0000000..e85b74a
--- /dev/null
+++ b/Documentation/fmc/fmc-fakedev.txt

@@ -0,0 +1,36 @@
+fmc-fakedev
+===========
+
+This package includes a software-only device, called fmc-fakedev, which
+is able to register up to 4 mezzanines (by default it registers one).
+Unlike the SPEC driver, which creates an FMC device for each PCI card
+it manages, this module creates a single instance of its set of
+mezzanines.
+
+It is meant as the simplest possible example of how a driver should be
+written, and it includes a fake EEPROM image (built using the tools
+described in *note FMC Identification::), which by default is
+replicated for each fake mezzanine.
+
+You can also use this device to verify the match algorithms, by asking
+it to test your own EEPROM image. You can provide the image by means of
+the eeprom= module parameter: the new EEPROM image is loaded, as usual,
+by means of the firmware loader.  This example shows the defaults and a
+custom EEPROM image:
+
+     spusa.root# insmod fmc-fakedev.ko
+     [   99.971247]  fake-fmc-carrier: mezzanine 0
+     [   99.975393]       Manufacturer: fake-vendor
+     [   99.979624]       Product name: fake-design-for-testing
+     spusa.root# rmmod fmc-fakedev
+     spusa.root# insmod fmc-fakedev.ko eeprom=fdelay-eeprom.bin
+     [  121.447464]  fake-fmc-carrier: Mezzanine 0: eeprom "fdelay-eeprom.bin"
+     [  121.462725]  fake-fmc-carrier: mezzanine 0
+     [  121.466858]       Manufacturer: CERN
+     [  121.470477]       Product name: FmcDelay1ns4cha
+     spusa.root# rmmod fmc-fakedev
+
+After loading the device, you can use the write_ee method to modify its
+own internal fake EEPROM: whenever the image is overwritten starting at
+offset 0, the module will unregister and register again the FMC device.
+This is shown in fmc-write-eeprom.txt.

diff --git a/Documentation/fmc/fmc-trivial.txt b/Documentation/fmc/fmc-trivial.txt
new file mode 100644
index 0000000..d1910bc
--- /dev/null
+++ b/Documentation/fmc/fmc-trivial.txt

@@ -0,0 +1,17 @@
+fmc-trivial
+===========
+
+The simple module fmc-trivial is just a simple client that registers an
+interrupt handler. I used it to verify the basic mechanism of the FMC
+bus and how interrupts worked.
+
+The module implements the generic FMC parameters, so it can program a
+different gateware file in each card. The whole list of parameters it
+accepts is:
+
+`busid='
+`gateware='
+     Generic parameters. See mezzanine.txt
+
+
+This driver is worth reading, in my opinion.

diff --git a/Documentation/fmc/fmc-write-eeprom.txt b/Documentation/fmc/fmc-write-eeprom.txt
new file mode 100644
index 0000000..44a3bc6
--- /dev/null
+++ b/Documentation/fmc/fmc-write-eeprom.txt

@@ -0,0 +1,125 @@
+fmc-write-eeprom
+================
+
+This module is designed to load a binary file from /lib/firmware and to
+write it to the internal EEPROM of the mezzanine card. This driver uses
+the `busid' generic parameter.
+
+Overwriting the EEPROM is not something you should do daily, and it is
+expected to only happen during manufacturing. For this reason, the
+module makes it unlikely for the random user to change a working EEPROM.
+
+The module takes the following measures:
+
+   * It accepts a `file=' argument (within /lib/firmware) and if no
+     such argument is received, it doesn't write anything to EEPROM
+     (i.e. there is no default file name).
+
+   * If the file name ends with `.bin' it is written verbatim starting
+     at offset 0.
+
+   * If the file name ends with `.tlv' it is interpreted as
+     type-length-value (i.e., it allows writev(2)-like operation).
+
+   * If the file name doesn't match any of the patterns above, it is
+     ignored and no write is performed.
+
+   * Only cards listed with `busid=' are written to. If no busid is
+     specified, no programming is done (and the probe function of the
+     driver will fail).
+
+
+Each TLV tuple is formatted in this way: the header is 5 bytes,
+followed by data. The first byte is `w' for write, the next two bytes
+represent the address, in little-endian byte order, and the next two
+represent the data length, in little-endian order. The length does not
+include the header (it is the actual number of bytes to be written).
+
+This is a real example that writes 5 bytes at position 0x110:
+
+        spusa.root# od -t x1 -Ax /lib/firmware/try.tlv
+        000000 77 10 01 05 00 30 31 32 33 34
+        00000a
+        spusa.root# insmod /tmp/fmc-write-eeprom.ko busid=0x0200 file=try.tlv
+        [19983.391498] spec 0000:03:00.0: write 5 bytes at 0x0110
+        [19983.414615] spec 0000:03:00.0: write_eeprom: success
+
+Please note that you'll most likely want to use SDBFS to build your
+EEPROM image, at least if your mezzanines are being used in the White
+Rabbit environment. For this reason the TLV format is not expected to
+be used much and is not expected to be developed further.
+
+If you want to try reflashing fake EEPROM devices, you can use the
+fmc-fakedev.ko module (see *note fmc-fakedev::).  Whenever you change
+the image starting at offset 0, it will deregister and register again
+after two seconds.  Please note, however, that if fmc-write-eeprom is
+still loaded, the system will associate it to the new device, which
+will be reprogrammed and thus will be unloaded after two seconds.  The
+following example removes the module after it reflashed fakedev the
+first time.
+
+     spusa.root# insmod fmc-fakedev.ko
+        [   72.984733]  fake-fmc: Manufacturer: fake-vendor
+        [   72.989434]  fake-fmc: Product name: fake-design-for-testing
+        spusa.root# insmod fmc-write-eeprom.ko busid=0 file=fdelay-eeprom.bin; \
+            rmmod fmc-write-eeprom
+        [  130.874098]  fake-fmc: Matching a generic driver (no ID)
+        [  130.887845]  fake-fmc: programming 6155 bytes
+        [  130.894567]  fake-fmc: write_eeprom: success
+        [  132.895794]  fake-fmc: Manufacturer: CERN
+        [  132.899872]  fake-fmc: Product name: FmcDelay1ns4cha
+
+
+Writing to the EEPROM
+=====================
+
+Once you have created a binary file for your EEPROM, you can write it
+to the storage medium using the fmc-write-eeprom module (see *note
+fmc-write-eeprom::), while relying on a carrier driver.  The procedure
+shown here uses the SPEC driver
+(`http://www.ohwr.org/projects/spec-sw').
+
+The example assumes no driver is already loaded (actually, I unloaded
+them by hand as everything loads automatically at boot time after you
+installed the modules), and shows kernel messages together with
+commands. Here the prompt is spusa.root# and two SPEC cards are plugged
+in the system.
+
+     spusa.root# insmod fmc.ko
+     spusa.root# insmod spec.ko
+     [13972.382818] spec 0000:02:00.0:  probe for device 0002:0000
+     [13972.392773] spec 0000:02:00.0: got file "fmc/spec-init.bin", 1484404 (0x16a674) bytes
+     [13972.591388] spec 0000:02:00.0: FPGA programming successful
+     [13972.883011] spec 0000:02:00.0: EEPROM has no FRU information
+     [13972.888719] spec 0000:02:00.0: No device_id filled, using index
+     [13972.894676] spec 0000:02:00.0: No mezzanine_name found
+     [13972.899863] /home/rubini/wip/spec-sw/kernel/spec-gpio.c - spec_gpio_init
+     [13972.906578] spec 0000:04:00.0:  probe for device 0004:0000
+     [13972.916509] spec 0000:04:00.0: got file "fmc/spec-init.bin", 1484404 (0x16a674) bytes
+     [13973.115096] spec 0000:04:00.0: FPGA programming successful
+     [13973.401798] spec 0000:04:00.0: EEPROM has no FRU information
+     [13973.407474] spec 0000:04:00.0: No device_id filled, using index
+     [13973.413417] spec 0000:04:00.0: No mezzanine_name found
+     [13973.418600] /home/rubini/wip/spec-sw/kernel/spec-gpio.c - spec_gpio_init
+     spusa.root# ls /sys/bus/fmc/devices
+     fmc-0000  fmc-0001
+     spusa.root# insmod fmc-write-eeprom.ko busid=0x0200 file=fdelay-eeprom.bin
+     [14103.966259] spec 0000:02:00.0: Matching an generic driver (no ID)
+     [14103.975519] spec 0000:02:00.0: programming 6155 bytes
+     [14126.373762] spec 0000:02:00.0: write_eeprom: success
+     [14126.378770] spec 0000:04:00.0: Matching an generic driver (no ID)
+     [14126.384903] spec 0000:04:00.0: fmc_write_eeprom: no filename given: not programming
+     [14126.392600] fmc_write_eeprom: probe of fmc-0001 failed with error -2
+
+Reading back the EEPROM
+=======================
+
+In order to read back the binary content of the EEPROM of your
+mezzanine device, the bus creates a read-only sysfs file called eeprom
+for each mezzanine it knows about:
+
+   spusa.root# cd /sys/bus/fmc/devices; ls -l */eeprom
+   -r--r--r-- 1 root root 8192 Apr  9 16:53 FmcDelay1ns4cha-f001/eeprom
+   -r--r--r-- 1 root root 8192 Apr  9 17:19 fake-design-for-testing-f002/eeprom
+   -r--r--r-- 1 root root 8192 Apr  9 17:19 fake-design-for-testing-f003/eeprom
+   -r--r--r-- 1 root root 8192 Apr  9 17:19 fmc-f004/eeprom

diff --git a/Documentation/fmc/identifiers.txt b/Documentation/fmc/identifiers.txt
new file mode 100644
index 0000000..3bb577f
--- /dev/null
+++ b/Documentation/fmc/identifiers.txt

@@ -0,0 +1,168 @@
+FMC Identification
+******************
+
+The FMC standard requires every compliant mezzanine to carry
+identification information in an I2C EEPROM.  The information must be
+laid out according to the "IPMI Platform Management FRU Information",
+where IPMI is a lie I'd better not expand, and FRU means "Field
+Replaceable Unit".
+
+The FRU information is an intricate unreadable binary blob that must
+live at offset 0 of the EEPROM, and typically extends for a few hundred
+bytes. The standard allows the application to use all the remaining
+storage area of the EEPROM as it wants.
+
+This chapter explains how to create your own EEPROM image and how to
+write it in your mezzanine, as well as how devices and drivers are
+paired at run time.  EEPROM programming uses tools that are part of this
+package and SDB (part of the fpga-config-space package).
+
+The first sections are only interesting for manufacturers who need to
+write the EEPROM. If you are just a software developer writing an FMC
+device or driver, you may jump straight to *note SDB Support::.
+
+
+Building the FRU Structure
+==========================
+
+If you want to know the internals of the FRU structure and despair, you
+can retrieve the document from
+`http://download.intel.com/design/servers/ipmi/FRU1011.pdf' .  The
+standard is awful and difficult without reason, so we only support the
+minimum mandatory subset - we create a simple structure and parse it
+back at run time, but we are not able to either generate or parse more
+arcane features like non-English languages and 6-bit text.  If you need
+more items of the FRU standard for your boards, please submit patches.
+
+This package includes the Python script that Matthieu Cattin wrote to
+generate the FRU binary blob, based on a helper libipmi by Manohar
+Vanga and Matthieu himself.  I changed the test script to receive
+parameters from the command line or from the environment (the command
+line takes precedence).
+
+To make a long story short, in order to build a standard-compliant
+binary file to be burned in your EEPROM, you need the following items:
+
+        Environment    Opt     Official Name          Default
+---------------------------------------------------------------------
+        FRU_VENDOR     -v      "Board Manufacturer"   fmc-example
+        FRU_NAME       -n      "Board Product Name"   mezzanine
+        FRU_SERIAL     -s      "Board Serial Number"  0001
+        FRU_PART       -p      "Board Part Number"    sample-part
+        FRU_OUTPUT     -o      not applicable         /dev/stdout
+
+The "Official Name" above is what you find in the FRU official
+documentation, chapter 11, page 7 ("Board Info Area Format").  The
+output option is used to save the generated binary to a specific file
+name instead of stdout.
+
+You can pass the items to the FRU generator either in the environment
+or on the command line.  This package has currently no support for
+specifying power consumption or such stuff, but I plan to add it as
+soon as I find some time for that.
+
+FIXME: consumption etc for FRU are here or in PTS?
+
+The following example creates a binary image for a specific board:
+
+        ./tools/fru-generator -v CERN -n FmcAdc100m14b4cha \
+               -s HCCFFIA___-CR000003 -p EDA-02063-V5-0 > eeprom.bin
+
+The following example shows a script that builds several binary EEPROM
+images for a series of boards, changing the serial number for each of
+them. The script uses a mix of environment variables and command line
+options, and uses the same string patterns shown above.
+
+        #!/bin/sh
+
+        export FRU_VENDOR="CERN"
+        export FRU_NAME="FmcAdc100m14b4cha"
+        export FRU_PART="EDA-02063-V5-0"
+
+        serial="HCCFFIA___-CR"
+
+        for number in $(seq 1 50); do
+           # build number-string "ns"
+           ns="$(printf %06d $number)"
+           ./fru-generator -s "${serial}${ns}" > eeprom-${ns}.bin
+        done
+
+
+Using SDB-FS in the EEPROM
+==========================
+
+If you want to use SDB as a filesystem in the EEPROM device within the
+mezzanine, you should create one such filesystem using gensdbfs, from
+the fpga-config-space package on OHWR.
+
+By using an SDB filesystem you can cluster several files in a single
+EEPROM, so both the host system and a soft-core running in the FPGA (if
+any) can access extra production-time information.
+
+We chose to use SDB as a storage filesystem because the format is very
+simple, and both the host system and the soft-core will likely already
+include support code for such format. The SDB library offered by the
+fpga-config-space is less than 1kB under LM32, so it proves quite up to
+the task.
+
+The SDB entry point (which acts as a directory listing) cannot live at
+offset zero in the flash device, because the FRU information must live
+there.  To avoid wasting precious storage space while still allowing
+for more-than-minimal FRU structures, the fmc.ko will look for the SDB
+record at address 256, 512 and 1024.
+
+In order to generate the complete EEPROM image you'll need a
+configuration file for gensdbfs: you tell the program where to place
+the sdb entry point, and you must force the FRU data file to be placed
+at the beginning of the storage device. If needed, you can also place
+other files at a special offset (we sometimes do it for backward
+compatibility with drivers we wrote before implementing SDB for flash
+memory).
+
+The directory tools/sdbfs of this package includes a well-commented
+example that you may want to use as a starting point (the comments are
+in the file called --SDB-CONFIG--).  Reading documentation for gensdbfs
+is a suggested first step anyways.
+
+This package (generic FMC bus support) only accesses two files in the
+EEPROM: the FRU information, at offset zero, with a suggested filename
+of IPMI-FRU and the short name for the mezzanine, in a file called
+name. The IPMI-FRU name is not mandatory, but a strongly suggested
+choice; the name filename is mandatory, because this is the preferred
+short name used by the FMC core.  For example, a name of "fdelay" may
+supplement a Product Name like "FmcDelay1ns4cha" - exactly as
+demonstrated in `tools/sdbfs'.
+
+Note: SDB access to flash memory is not yet supported, so the short
+name currently in use is just the "Product Name" FRU string.
+
+The example in tools/sdbfs includes an extra file, that is needed by
+the fine-delay driver, and must live at a known address of 0x1800.  By
+running gensdbfs on that directory you can output your binary EEPROM
+image (here below spusa$ is the shell prompt):
+
+        spusa$ ../fru-generator -v CERN -n FmcDelay1ns4cha -s proto-0 \
+                      -p EDA-02267-V3 > IPMI-FRU
+        spusa$ ls -l
+        total 16
+        -rw-rw-r-- 1 rubini staff 975 Nov 19 18:08 --SDB-CONFIG--
+        -rw-rw-r-- 1 rubini staff 216 Nov 19 18:13 IPMI-FRU
+        -rw-rw-r-- 1 rubini staff  11 Nov 19 18:04 fd-calib
+        -rw-rw-r-- 1 rubini staff   7 Nov 19 18:04 name
+        spusa$ sudo gensdbfs . /lib/firmware/fdelay-eeprom.bin
+        spusa$ sdb-read -l -e 0x100 /lib/firmware/fdelay-eeprom.bin
+        /home/rubini/wip/sdbfs/userspace/sdb-read: listing format is to be defined
+        46696c6544617461:2e202020  00000100-000018ff .
+        46696c6544617461:6e616d65  00000200-00000206 name
+        46696c6544617461:66642d63  00001800-000018ff fd-calib
+        46696c6544617461:49504d49  00000000-000000d7 IPMI-FRU
+        spusa$ ../fru-dump /lib/firmware/fdelay-eeprom.bin
+        /lib/firmware/fdelay-eeprom.bin: manufacturer: CERN
+        /lib/firmware/fdelay-eeprom.bin: product-name: FmcDelay1ns4cha
+        /lib/firmware/fdelay-eeprom.bin: serial-number: proto-0
+        /lib/firmware/fdelay-eeprom.bin: part-number: EDA-02267-V3
+
+As expected, the output file is both a proper sdbfs object and an IPMI
+FRU information blob. The fd-calib file lives at offset 0x1800 and is
+over-allocated to 256 bytes, according to the configuration file for
+gensdbfs.

diff --git a/Documentation/fmc/mezzanine.txt b/Documentation/fmc/mezzanine.txt
new file mode 100644
index 0000000..87910db
--- /dev/null
+++ b/Documentation/fmc/mezzanine.txt

@@ -0,0 +1,123 @@
+FMC Driver
+**********
+
+An FMC driver is concerned with the specific mezzanine and associated
+gateware. As such, it is expected to be independent of the carrier
+being used: it will perform I/O accesses only by means of
+carrier-provided functions.
+
+The matching between device and driver is based on the content of the
+EEPROM (as mandated by the FMC standard) or by the actual cores
+configured in the FPGA; the latter technique is used when the FPGA is
+already programmed when the device is registered to the bus core.
+
+In some special cases it is possible for a driver to directly access
+FPGA registers, by means of the `fpga_base' field of the device
+structure. This may be needed for high-bandwidth peripherals like fast
+ADC cards. If the device module registered a remote device (for example
+by means of Etherbone), the `fpga_base' pointer will be NULL.
+Therefore, drivers must be ready to deal with NULL base pointers, and
+fail gracefully.  Most drivers, however, are not expected to access the
+pointer directly but run fmc_readl and fmc_writel instead, which will
+work in any case.
+
+In even more special cases, the driver may access carrier-specific
+functionality: the `carrier_name' string allows the driver to check
+which is the current carrier and make use of the `carrier_data'
+pointer.  We chose to use carrier names rather than numeric identifiers
+for greater flexibility, but also to avoid a central registry within
+the `fmc.h' file - we hope other users will exploit our framework with
+their own carriers.  An example use of carrier names is in GPIO setup
+(see *note The GPIO Abstraction::), although the name match is not
+expected to be performed by the driver.  If you depend on specific
+carriers, please check the carrier name and fail gracefully if your
+driver finds it is running in a yet-unknown-to-it environment.
+
+
+ID Table
+========
+
+Like most other Linux drivers, an FMC driver must list all the devices
+which it is able to drive.  This is usually done by means of a device
+table, but in FMC we can match hardware based either on the contents of
+their EEPROM or on the actual FPGA cores that can be enumerated.
+Therefore, we have two tables of identifiers.
+
+Matching of FRU information depends on two names, the manufacturer (or
+vendor) and the device (see *note FMC Identification::); for
+flexibility during production (i.e. before writing to the EEPROM) the
+bus supports a catch-all driver that specifies NULL strings. For this
+reason, the table is specified as pointer-and-length, not a
+null-terminated array - the entry with NULL names can be a valid entry.
+
+Matching on FPGA cores depends on two numeric fields: the 64-bit vendor
+number and the 32-bit device number. Support for matching based on
+class is not yet implemented.  Each device is expected to be uniquely
+identified by an array of cores (it matches if all of the cores are
+instantiated), and for consistency the list is passed as
+pointer-and-length.  Several similar devices can be driven by the same
+driver, and thus the driver specifies an array of such arrays.
+
+The complete set of involved data structures is thus the following:
+
+        struct fmc_fru_id { char *manufacturer; char *product_name; };
+        struct fmc_sdb_one_id { uint64_t vendor; uint32_t device; };
+        struct fmc_sdb_id { struct fmc_sdb_one_id *cores; int cores_nr; };
+
+        struct fmc_device_id {
+                struct fmc_fru_id *fru_id; int fru_id_nr;
+                struct fmc_sdb_id *sdb_id; int sdb_id_nr;
+        };
+
+A better reference, with full explanation, is the <linux/fmc.h> header.
+
+
+Module Parameters
+=================
+
+Most of the FMC drivers need the same set of kernel parameters. This
+package includes support to implement common parameters by means of
+fields in the `fmc_driver' structure and simple macro definitions.
+
+The parameters are carrier-specific, in that they rely on the busid
+concept, that varies among carriers. For the SPEC, the identifier is a
+PCI bus and devfn number, 16 bits wide in total; drivers for other
+carriers will most likely offer something similar but not identical,
+and some code duplication is unavoidable.
+
+This is the list of parameters that are common to several modules; to
+see how they are actually used, please look at spec-trivial.c.
+
+`busid='
+     This is an array of integers, listing carrier-specific
+     identification numbers. For PCI, for example, `0x0400' represents
+     bus 4, slot 0.  If any such ID is specified, the driver will only
+     accept to drive cards that appear in the list (even if the FMC ID
+     matches). This is accomplished by the validate carrier method.
+
+`gateware='
+     The argument is an array of strings. If no busid= is specified,
+     the first string of gateware= is used for all cards; otherwise the
+     identifiers and gateware names are paired one by one, in the order
+     specified.
+
+`show_sdb='
+     For modules supporting it, this parameter asks to show the SDB
+     internal structure by means of kernel messages. It is disabled by
+     default because those lines tend to hide more important messages,
+     if you look at the system console while loading the drivers.
+     Note: the parameter is being obsoleted, because fmc.ko itself now
+     supports dump_sdb= that applies to every client driver.
+
+
+For example, if you are using the trivial driver to load two different
+gateware files to two different cards, you can use the following
+parameters to load different binaries to the cards, after looking up
+the PCI identifiers. This has been tested with a SPEC carrier.
+
+        insmod fmc-trivial.ko \
+                              busid=0x0200,0x0400 \
+                              gateware=fmc/fine-delay.bin,fmc/simple-dio.bin
+
+Please note that not all sub-modules support all of those parameters.
+You can use modinfo to check what is supported by each module.

diff --git a/Documentation/fmc/parameters.txt b/Documentation/fmc/parameters.txt
new file mode 100644
index 0000000..59edf08
--- /dev/null
+++ b/Documentation/fmc/parameters.txt

@@ -0,0 +1,56 @@
+Module Parameters in fmc.ko
+***************************
+
+The core driver receives two module parameters, meant to help debugging
+client modules. Both parameters can be modified by writing to
+/sys/module/fmc/parameters/, because they are used when client drivers
+or devices are registered, not when fmc.ko is loaded.
+
+`dump_eeprom='
+     If not zero, the parameter asks the bus controller to dump the
+     EEPROM of any device that is registered, using printk.
+
+`dump_sdb='
+     If not zero, the parameter prints the SDB tree of every FPGA that is
+     loaded by fmc_reprogram(). If greater than one, it asks to dump
+     the binary content of SDB records.  This currently only dumps the
+     top-level SDB array, though.
+
+
+EEPROM dumping avoids repeating lines, since most of the content is
+usually empty and all bits are one or zero. This is an example of the
+output:
+
+        [ 6625.850480] spec 0000:02:00.0: FPGA programming successful
+        [ 6626.139949] spec 0000:02:00.0: Manufacturer: CERN
+        [ 6626.144666] spec 0000:02:00.0: Product name: FmcDelay1ns4cha
+        [ 6626.150370] FMC: mezzanine 0: 0000:02:00.0 on SPEC
+        [ 6626.155179] FMC: dumping eeprom 0x2000 (8192) bytes
+        [ 6626.160087] 0000: 01 00 00 01  00 0b 00 f3  01 0a 00 a5  85 87 c4 43
+        [ 6626.167069] 0010: 45 52 4e cf  46 6d 63 44  65 6c 61 79  31 6e 73 34
+        [ 6626.174019] 0020: 63 68 61 c7  70 72 6f 74  6f 2d 30 cc  45 44 41 2d
+        [ 6626.180975] 0030: 30 32 32 36  37 2d 56 33  da 32 30 31  32 2d 31 31
+        [...]
+        [ 6626.371366] 0200: 66 64 65 6c  61 79 0a 00  00 00 00 00  00 00 00 00
+        [ 6626.378359] 0210: 00 00 00 00  00 00 00 00  00 00 00 00  00 00 00 00
+        [ 6626.385361] [...]
+        [ 6626.387308] 1800: 70 6c 61 63  65 68 6f 6c  64 65 72 ff  ff ff ff ff
+        [ 6626.394259] 1810: ff ff ff ff  ff ff ff ff  ff ff ff ff  ff ff ff ff
+        [ 6626.401250] [...]
+
+The dump of SDB looks like the following; the example shows the simple
+golden gateware for the SPEC card, removing the leading timestamps to
+fit the page:
+
+        spec 0000:02:00.0: SDB: 00000651:e6a542c9 WB4-Crossbar-GSI
+        spec 0000:02:00.0: SDB: 0000ce42:ff07fc47 WR-Periph-Syscon (00000000-000000ff)
+        FMC: mezzanine 0: 0000:02:00.0 on SPEC
+        FMC: poor dump of sdb first level:
+        0000: 53 44 42 2d  00 02 01 00  00 00 00 00  00 00 00 00
+        0010: 00 00 00 00  00 00 01 ff  00 00 00 00  00 00 06 51
+        0020: e6 a5 42 c9  00 00 00 02  20 12 05 11  57 42 34 2d
+        0030: 43 72 6f 73  73 62 61 72  2d 47 53 49  20 20 20 00
+        0040: 00 00 01 01  00 00 00 07  00 00 00 00  00 00 00 00
+        0050: 00 00 00 00  00 00 00 ff  00 00 00 00  00 00 ce 42
+        0060: ff 07 fc 47  00 00 00 01  20 12 03 05  57 52 2d 50
+        0070: 65 72 69 70  68 2d 53 79  73 63 6f 6e  20 20 20 01

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index f98ca63..3458d63 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt

@@ -420,10 +420,10 @@
 	for a passive TCP connection will happen after 63seconds.
 
 tcp_syncookies - BOOLEAN
-	Only valid when the kernel was compiled with CONFIG_SYNCOOKIES
+	Only valid when the kernel was compiled with CONFIG_SYN_COOKIES
 	Send out syncookies when the syn backlog queue of a socket
 	overflows. This is to prevent against the common 'SYN flood attack'
-	Default: FALSE
+	Default: 1
 
 	Note, that syncookies is fallback facility.
 	It MUST NOT be used to help highly loaded servers to stand

diff --git a/Documentation/serial/00-INDEX b/Documentation/serial/00-INDEX
index f7b0c7d..1f1b22f 100644
--- a/Documentation/serial/00-INDEX
+++ b/Documentation/serial/00-INDEX

@@ -16,8 +16,6 @@
 	- info about RS485 structures and support in the kernel.
 specialix.txt
 	- info on hardware/driver for specialix IO8+ multiport serial card.
-stallion.txt
-	- info on using the Stallion multiport serial driver.
 sx.txt
 	- info on the Specialix SX/SI multiport serial driver.
 tty.txt

diff --git a/Documentation/serial/stallion.txt b/Documentation/serial/stallion.txt
deleted file mode 100644
index 4d798c0..0000000
--- a/Documentation/serial/stallion.txt
+++ /dev/null

@@ -1,392 +0,0 @@
-* NOTE - This is an unmaintained driver.  Lantronix, which bought Stallion
-technologies, is not active in driver maintenance, and they have no information
-on when or if they will have a 2.6 driver.
-
-James Nelson <james4765@gmail.com> - 12-12-2004
-
-Stallion Multiport Serial Driver Readme
----------------------------------------
-
-Copyright (C) 1994-1999,  Stallion Technologies.
-
-Version:   5.5.1
-Date:      28MAR99
-
-
-
-1. INTRODUCTION
-
-There are two drivers that work with the different families of Stallion
-multiport serial boards. One is for the Stallion smart boards - that is
-EasyIO, EasyConnection 8/32 and EasyConnection 8/64-PCI, the other for
-the true Stallion intelligent multiport boards - EasyConnection 8/64
-(ISA, EISA), EasyConnection/RA-PCI, ONboard and Brumby.
-
-If you are using any of the Stallion intelligent multiport boards (Brumby,
-ONboard, EasyConnection 8/64 (ISA, EISA), EasyConnection/RA-PCI) with
-Linux you will need to get the driver utility package.  This contains a
-firmware loader and the firmware images necessary to make the devices operate.
-
-The Stallion Technologies ftp site, ftp.stallion.com, will always have
-the latest version of the driver utility package.
-
-ftp://ftp.stallion.com/drivers/ata5/Linux/ata-linux-550.tar.gz
-
-As of the printing of this document the latest version of the driver
-utility package is 5.5.0. If a later version is now available then you
-should use the latest version.
-
-If you are using the EasyIO, EasyConnection 8/32 or EasyConnection 8/64-PCI
-boards then you don't need this package, although it does have a serial stats
-display program.
-
-If you require DIP switch settings, or EISA configuration files, or any
-other information related to Stallion boards then have a look at Stallion's
-web pages at http://www.stallion.com.
-
-
-
-2. INSTALLATION
-
-The drivers can be used as loadable modules or compiled into the kernel.
-You can choose which when doing a "config" on the kernel.
-
-All ISA, and EISA boards that you want to use need to be configured into
-the driver(s). All PCI boards will be automatically detected when you load
-the driver - so they do not need to be entered into the driver(s)
-configuration structure. Note that kernel PCI support is required to use PCI
-boards.
-
-There are two methods of configuring ISA and EISA boards into the drivers.
-If using the driver as a loadable module then the simplest method is to pass
-the driver configuration as module arguments. The other method is to modify
-the driver source to add configuration lines for each board in use.
-
-If you have pre-built Stallion driver modules then the module argument
-configuration method should be used. A lot of Linux distributions come with
-pre-built driver modules in /lib/modules/X.Y.Z/misc for the kernel in use.
-That makes things pretty simple to get going.
-
-
-2.1 MODULE DRIVER CONFIGURATION:
-
-The simplest configuration for modules is to use the module load arguments
-to configure any ISA or EISA boards. PCI boards are automatically
-detected, so do not need any additional configuration at all.
-
-If using EasyIO, EasyConnection 8/32 ISA, or EasyConnection 8/63-PCI
-boards then use the "stallion" driver module, Otherwise if you are using
-an EasyConnection 8/64 ISA or EISA, EasyConnection/RA-PCI, ONboard,
-Brumby or original Stallion board then use the "istallion" driver module.
-
-Typically to load up the smart board driver use:
-
-    modprobe stallion
-
-This will load the EasyIO and EasyConnection 8/32 driver. It will output a
-message to say that it loaded and print the driver version number. It will
-also print out whether it found the configured boards or not. These messages
-may not appear on the console, but typically are always logged to
-/var/adm/messages or /var/log/syslog files - depending on how the klogd and
-syslogd daemons are setup on your system.
-
-To load the intelligent board driver use:
-
-    modprobe istallion
-
-It will output similar messages to the smart board driver.
-
-If not using an auto-detectable board type (that is a PCI board) then you
-will also need to supply command line arguments to the modprobe command
-when loading the driver. The general form of the configuration argument is
-
-    board?=<name>[,<ioaddr>[,<addr>][,<irq>]]
-
-where:
-
-    board?  -- specifies the arbitrary board number of this board,
-               can be in the range 0 to 3.
-
-    name    -- textual name of this board. The board name is the common
-               board name, or any "shortened" version of that. The board
-               type number may also be used here.
-
-    ioaddr  -- specifies the I/O address of this board. This argument is
-               optional, but should generally be specified.
-
-    addr    -- optional second address argument. Some board types require
-               a second I/O address, some require a memory address. The
-               exact meaning of this argument depends on the board type.
-
-    irq     -- optional IRQ line used by this board.
-
-Up to 4 board configuration arguments can be specified on the load line.
-Here is some examples:
-
-    modprobe stallion board0=easyio,0x2a0,5
-
-This configures an EasyIO board as board 0 at I/O address 0x2a0 and IRQ 5.
-
-    modprobe istallion board3=ec8/64,0x2c0,0xcc000
-
-This configures an EasyConnection 8/64 ISA as board 3 at I/O address 0x2c0 at
-memory address 0xcc000.
-
-    modprobe stallion board1=ec8/32-at,0x2a0,0x280,10
-
-This configures an EasyConnection 8/32 ISA board at primary I/O address 0x2a0,
-secondary address 0x280 and IRQ 10.
-
-You will probably want to enter this module load and configuration information
-into your system startup scripts so that the drivers are loaded and configured
-on each system boot. Typically configuration files are put in the
-/etc/modprobe.d/ directory.
-
-
-2.2 STATIC DRIVER CONFIGURATION:
-
-For static driver configuration you need to modify the driver source code.
-Entering ISA and EISA boards into the driver(s) configuration structure
-involves editing the driver(s) source file. It's pretty easy if you follow
-the instructions below. Both drivers can support up to 4 boards. The smart
-card driver (the stallion.c driver) supports any combination of EasyIO and
-EasyConnection 8/32 boards (up to a total of 4). The intelligent driver
-supports any combination of ONboards, Brumbys, Stallions and EasyConnection
-8/64 (ISA and EISA) boards (up to a total of 4).
-
-To set up the driver(s) for the boards that you want to use you need to
-edit the appropriate driver file and add configuration entries.
-
-If using EasyIO or EasyConnection 8/32 ISA boards,
-   In drivers/char/stallion.c:
-      - find the definition of the stl_brdconf array (of structures)
-        near the top of the file
-      - modify this to match the boards you are going to install
-	(the comments before this structure should help)
-      - save and exit
-
-If using ONboard, Brumby, Stallion or EasyConnection 8/64 (ISA or EISA)
-boards,
-   In drivers/char/istallion.c:
-      - find the definition of the stli_brdconf array (of structures)
-        near the top of the file
-      - modify this to match the boards you are going to install
-	(the comments before this structure should help)
-      - save and exit
-
-Once you have set up the board configurations then you are ready to build
-the kernel or modules.
-
-When the new kernel is booted, or the loadable module loaded then the
-driver will emit some kernel trace messages about whether the configured
-boards were detected or not. Depending on how your system logger is set
-up these may come out on the console, or just be logged to
-/var/adm/messages or /var/log/syslog. You should check the messages to
-confirm that all is well.
-
-
-2.3 SHARING INTERRUPTS
-
-It is possible to share interrupts between multiple EasyIO and
-EasyConnection 8/32 boards in an EISA system. To do this you must be using
-static driver configuration, modifying the driver source code to add driver
-configuration. Then a couple of extra things are required:
-
-1. When entering the board resources into the stallion.c file you need to
-   mark the boards as using level triggered interrupts. Do this by replacing
-   the "0" entry at field position 6 (the last field) in the board
-   configuration structure with a "1". (This is the structure that defines
-   the board type, I/O locations, etc. for each board). All boards that are
-   sharing an interrupt must be set this way, and each board should have the
-   same interrupt number specified here as well. Now build the module or
-   kernel as you would normally.
-
-2. When physically installing the boards into the system you must enter
-   the system EISA configuration utility. You will need to install the EISA
-   configuration files for *all* the EasyIO and EasyConnection 8/32 boards
-   that are sharing interrupts. The Stallion EasyIO and EasyConnection 8/32
-   EISA configuration files required are supplied by Stallion Technologies
-   on the EASY Utilities floppy diskette (usually supplied in the box with
-   the board when purchased. If not, you can pick it up from Stallion's FTP
-   site, ftp.stallion.com). You will need to edit the board resources to
-   choose level triggered interrupts, and make sure to set each board's
-   interrupt to the same IRQ number.
-
-You must complete both the above steps for this to work. When you reboot
-or load the driver your EasyIO and EasyConnection 8/32 boards will be
-sharing interrupts.
-
-
-2.4 USING HIGH SHARED MEMORY
-
-The EasyConnection 8/64-EI, ONboard and Stallion boards are capable of
-using shared memory addresses above the usual 640K - 1Mb range. The ONboard
-ISA and the Stallion boards can be programmed to use memory addresses up to
-16Mb (the ISA bus addressing limit), and the EasyConnection 8/64-EI and
-ONboard/E can be programmed for memory addresses up to 4Gb (the EISA bus
-addressing limit).
-
-The higher than 1Mb memory addresses are fully supported by this driver.
-Just enter the address as you normally would for a lower than 1Mb address
-(in the driver's board configuration structure).
-
-
-
-2.5 TROUBLE SHOOTING
-
-If a board is not found by the driver but is actually in the system then the
-most likely problem is that the I/O address is wrong. Change the module load
-argument for the loadable module form. Or change it in the driver stallion.c
-or istallion.c configuration structure and rebuild the kernel or modules, or
-change it on the board.
-
-On EasyIO and EasyConnection 8/32 boards the IRQ is software programmable, so
-if there is a conflict you may need to change the IRQ used for a board. There
-are no interrupts to worry about for ONboard, Brumby or EasyConnection 8/64
-(ISA and EISA) boards. The memory region on EasyConnection 8/64 and
-ONboard boards is software programmable, but not on the Brumby boards.
-
-
-
-3. USING THE DRIVERS
-
-3.1 INTELLIGENT DRIVER OPERATION
-
-The intelligent boards also need to have their "firmware" code downloaded
-to them. This is done via a user level application supplied in the driver
-utility package called "stlload". Compile this program wherever you dropped
-the package files, by typing "make". In its simplest form you can then type
-
-    ./stlload -i cdk.sys
-
-in this directory and that will download board 0 (assuming board 0 is an
-EasyConnection 8/64 or EasyConnection/RA board). To download to an
-ONboard, Brumby or Stallion do:
-
-    ./stlload -i 2681.sys
-
-Normally you would want all boards to be downloaded as part of the standard
-system startup. To achieve this, add one of the lines above into the
-/etc/rc.d/rc.S or /etc/rc.d/rc.serial file. To download each board just add
-the "-b <brd-number>" option to the line. You will need to download code for
-every board. You should probably move the stlload program into a system
-directory, such as /usr/sbin. Also, the default location of the cdk.sys image
-file in the stlload down-loader is /usr/lib/stallion. Create that directory
-and put the cdk.sys and 2681.sys files in it. (It's a convenient place to put
-them anyway). As an example your /etc/rc.d/rc.S file might have the
-following lines added to it (if you had 3 boards):
-
-    /usr/sbin/stlload -b 0 -i /usr/lib/stallion/cdk.sys
-    /usr/sbin/stlload -b 1 -i /usr/lib/stallion/2681.sys
-    /usr/sbin/stlload -b 2 -i /usr/lib/stallion/2681.sys
-
-The image files cdk.sys and 2681.sys are specific to the board types. The
-cdk.sys will only function correctly on an EasyConnection 8/64 board. Similarly
-the 2681.sys image fill only operate on ONboard, Brumby and Stallion boards.
-If you load the wrong image file into a board it will fail to start up, and
-of course the ports will not be operational!
-
-If you are using the modularized version of the driver you might want to put
-the modprobe calls in the startup script as well (before the download lines
-obviously).
-
-
-3.2 USING THE SERIAL PORTS
-
-Once the driver is installed you will need to setup some device nodes to
-access the serial ports. The simplest method is to use the /dev/MAKEDEV program.
-It will automatically create device entries for Stallion boards. This will
-create the normal serial port devices as /dev/ttyE# where# is the port number
-starting from 0. A bank of 64 minor device numbers is allocated to each board,
-so the first port on the second board is port 64,etc. A set of callout type
-devices may also be created. They are created as the devices /dev/cue# where #
-is the same as for the ttyE devices.
-
-For the most part the Stallion driver tries to emulate the standard PC system
-COM ports and the standard Linux serial driver. The idea is that you should
-be able to use Stallion board ports and COM ports interchangeably without
-modifying anything but the device name. Anything that doesn't work like that
-should be considered a bug in this driver!
-
-If you look at the driver code you will notice that it is fairly closely
-based on the Linux serial driver (linux/drivers/char/serial.c). This is
-intentional, obviously this is the easiest way to emulate its behavior!
-
-Since this driver tries to emulate the standard serial ports as much as
-possible, most system utilities should work as they do for the standard
-COM ports. Most importantly "stty" works as expected and "setserial" can
-also be used (excepting the ability to auto-configure the I/O and IRQ
-addresses of boards). Higher baud rates are supported in the usual fashion
-through setserial or using the CBAUDEX extensions. Note that the EasyIO and
-EasyConnection (all types) support at least 57600 and 115200 baud. The newer
-EasyConnection XP modules and new EasyIO boards support 230400 and 460800
-baud as well. The older boards including ONboard and Brumby support a
-maximum baud rate of 38400.
-
-If you are unfamiliar with how to use serial ports, then get the Serial-HOWTO
-by Greg Hankins. It will explain everything you need to know!
-
-
-
-4. NOTES
-
-You can use both drivers at once if you have a mix of board types installed
-in a system. However to do this you will need to change the major numbers
-used by one of the drivers. Currently both drivers use major numbers 24, 25
-and 28 for their devices. Change one driver to use some other major numbers,
-and then modify the mkdevnods script to make device nodes based on those new
-major numbers. For example, you could change the istallion.c driver to use
-major numbers 60, 61 and 62. You will also need to create device nodes with
-different names for the ports, for example ttyF# and cuf#.
-
-The original Stallion board is no longer supported by Stallion Technologies.
-Although it is known to work with the istallion driver.
-
-Finding a free physical memory address range can be a problem. The older
-boards like the Stallion and ONboard need large areas (64K or even 128K), so
-they can be very difficult to get into a system. If you have 16 Mb of RAM
-then you have no choice but to put them somewhere in the 640K -> 1Mb range.
-ONboards require 64K, so typically 0xd0000 is good, or 0xe0000 on some
-systems. If you have an original Stallion board, "V4.0" or Rev.O, then you
-need a 64K memory address space, so again 0xd0000 and 0xe0000 are good.
-Older Stallion boards are a much bigger problem. They need 128K of address
-space and must be on a 128K boundary. If you don't have a VGA card then
-0xc0000 might be usable - there is really no other place you can put them
-below 1Mb.
-
-Both the ONboard and old Stallion boards can use higher memory addresses as
-well, but you must have less than 16Mb of RAM to be able to use them. Usual
-high memory addresses used include 0xec0000 and 0xf00000.
-
-The Brumby boards only require 16Kb of address space, so you can usually
-squeeze them in somewhere. Common addresses are 0xc8000, 0xcc000, or in
-the 0xd0000 range. EasyConnection 8/64 boards are even better, they only
-require 4Kb of address space, again usually 0xc8000, 0xcc000 or 0xd0000
-are good.
-
-If you are using an EasyConnection 8/64-EI or ONboard/E then usually the
-0xd0000 or 0xe0000 ranges are the best options below 1Mb. If neither of
-them can be used then the high memory support to use the really high address
-ranges is the best option. Typically the 2Gb range is convenient for them,
-and gets them well out of the way.
-
-The ports of the EasyIO-8M board do not have DCD or DTR signals. So these
-ports cannot be used as real modem devices. Generally, when using these
-ports you should only use the cueX devices.
-
-The driver utility package contains a couple of very useful programs. One 
-is a serial port statistics collection and display program - very handy
-for solving serial port problems. The other is an extended option setting
-program that works with the intelligent boards.
-
-
-
-5. DISCLAIMER
-
-The information contained in this document is believed to be accurate and
-reliable. However, no responsibility is assumed by Stallion Technologies
-Pty. Ltd. for its use, nor any infringements of patents or other rights
-of third parties resulting from its use. Stallion Technologies reserves
-the right to modify the design of its products and will endeavour to change
-the information in manuals and accompanying documentation accordingly.
-

diff --git a/Documentation/usb/gadget_configfs.txt b/Documentation/usb/gadget_configfs.txt
new file mode 100644
index 0000000..8ec2a67
--- /dev/null
+++ b/Documentation/usb/gadget_configfs.txt

@@ -0,0 +1,384 @@
+
+
+
+
+		Linux USB gadget configured through configfs
+
+
+			     25th April 2013
+
+
+
+
+Overview
+========
+
+A USB Linux Gadget is a device which has a UDC (USB Device Controller) and can
+be connected to a USB Host to extend it with additional functions like a serial
+port or a mass storage capability.
+
+A gadget is seen by its host as a set of configurations, each of which contains
+a number of interfaces which, from the gadget's perspective, are known as
+functions, each function representing e.g. a serial connection or a SCSI disk.
+
+Linux provides a number of functions for gadgets to use.
+
+Creating a gadget means deciding what configurations there will be
+and which functions each configuration will provide.
+
+Configfs (please see Documentation/filesystems/configfs/*) lends itself nicely
+for the purpose of telling the kernel about the above mentioned decision.
+This document is about how to do it.
+
+It also describes how configfs integration into gadget is designed.
+
+
+
+
+Requirements
+============
+
+In order for this to work configfs must be available, so CONFIGFS_FS must be
+'y' or 'm' in .config. As of this writing USB_LIBCOMPOSITE selects CONFIGFS_FS.
+
+
+
+
+Usage
+=====
+
+(The original post describing the first function
+made available through configfs can be seen here:
+http://www.spinics.net/lists/linux-usb/msg76388.html)
+
+$ modprobe libcomposite
+$ mount none $CONFIGFS_HOME -t configfs
+
+where CONFIGFS_HOME is the mount point for configfs
+
+1. Creating the gadgets
+-----------------------
+
+For each gadget to be created its corresponding directory must be created:
+
+$ mkdir $CONFIGFS_HOME/usb_gadget/<gadget name>
+
+e.g.:
+
+$ mkdir $CONFIGFS_HOME/usb_gadget/g1
+
+...
+...
+...
+
+$ cd $CONFIGFS_HOME/usb_gadget/g1
+
+Each gadget needs to have its vendor id <VID> and product id <PID> specified:
+
+$ echo <VID> > idVendor
+$ echo <PID> > idProduct
+
+A gadget also needs its serial number, manufacturer and product strings.
+In order to have a place to store them, a strings subdirectory must be created
+for each language, e.g.:
+
+$ mkdir strings/0x409
+
+Then the strings can be specified:
+
+$ echo <serial number> > strings/0x409/serialnumber
+$ echo <manufacturer> > strings/0x409/manufacturer
+$ echo <product> > strings/0x409/product
+
+2. Creating the configurations
+------------------------------
+
+Each gadget will consist of a number of configurations, their corresponding
+directories must be created:
+
+$ mkdir configs/<name>.<number>
+
+where <name> can be any string which is legal in a filesystem and the
+<number> is the configuration's number, e.g.:
+
+$ mkdir configs/c.1
+
+...
+...
+...
+
+Each configuration also needs its strings, so a subdirectory must be created
+for each language, e.g.:
+
+$ mkdir configs/c.1/strings/0x409
+
+Then the configuration string can be specified:
+
+$ echo <configuration> > configs/c.1/strings/0x409/configuration
+
+Some attributes can also be set for a configuration, e.g.:
+
+$ echo 120 > configs/c.1/MaxPower
+
+3. Creating the functions
+-------------------------
+
+The gadget will provide some functions, for each function its corresponding
+directory must be created:
+
+$ mkdir functions/<name>.<instance name>
+
+where <name> corresponds to one of allowed function names and instance name
+is an arbitrary string allowed in a filesystem, e.g.:
+
+$ mkdir functions/ncm.usb0 # usb_f_ncm.ko gets loaded with request_module()
+
+...
+...
+...
+
+Each function provides its specific set of attributes, with either read-only
+or read-write access. Where applicable they need to be written to as
+appropriate.
+Please refer to Documentation/ABI/*/configfs-usb-gadget* for more information.
+
+4. Associating the functions with their configurations
+------------------------------------------------------
+
+At this moment a number of gadgets is created, each of which has a number of
+configurations specified and a number of functions available. What remains
+is specifying which function is available in which configuration (the same
+function can be used in multiple configurations). This is achieved with
+creating symbolic links:
+
+$ ln -s functions/<name>.<instance name> configs/<name>.<number>
+
+e.g.:
+
+$ ln -s functions/ncm.usb0 configs/c.1
+
+...
+...
+...
+
+5. Enabling the gadget
+----------------------
+
+All the above steps serve the purpose of composing the gadget of
+configurations and functions.
+
+An example directory structure might look like this:
+
+.
+./strings
+./strings/0x409
+./strings/0x409/serialnumber
+./strings/0x409/product
+./strings/0x409/manufacturer
+./configs
+./configs/c.1
+./configs/c.1/ncm.usb0 -> ../../../../usb_gadget/g1/functions/ncm.usb0
+./configs/c.1/strings
+./configs/c.1/strings/0x409
+./configs/c.1/strings/0x409/configuration
+./configs/c.1/bmAttributes
+./configs/c.1/MaxPower
+./functions
+./functions/ncm.usb0
+./functions/ncm.usb0/ifname
+./functions/ncm.usb0/qmult
+./functions/ncm.usb0/host_addr
+./functions/ncm.usb0/dev_addr
+./UDC
+./bcdUSB
+./bcdDevice
+./idProduct
+./idVendor
+./bMaxPacketSize0
+./bDeviceProtocol
+./bDeviceSubClass
+./bDeviceClass
+
+
+Such a gadget must be finally enabled so that the USB host can enumerate it.
+In order to enable the gadget it must be bound to a UDC (USB Device Controller).
+
+$ echo <udc name> > UDC
+
+where <udc name> is one of those found in /sys/class/udc/*
+e.g.:
+
+$ echo s3c-hsotg > UDC
+
+
+6. Disabling the gadget
+-----------------------
+
+$ echo "" > UDC
+
+7. Cleaning up
+--------------
+
+Remove functions from configurations:
+
+$ rm configs/<config name>.<number>/<function>
+
+where <config name>.<number> specify the configuration and <function> is
+a symlink to a function being removed from the configuration, e.g.:
+
+$ rm configs/c.1/ncm.usb0
+
+...
+...
+...
+
+Remove strings directories in configurations
+
+$ rmdir configs/<config name>.<number>/strings/<lang>
+
+e.g.:
+
+$ rmdir configs/c.1/strings/0x409
+
+...
+...
+...
+
+and remove the configurations
+
+$ rmdir configs/<config name>.<number>
+
+e.g.:
+
+$ rmdir configs/c.1
+
+...
+...
+...
+
+Remove functions (function modules are not unloaded, though)
+
+$ rmdir functions/<name>.<instance name>
+
+e.g.:
+
+$ rmdir functions/ncm.usb0
+
+...
+...
+...
+
+Remove strings directories in the gadget
+
+$ rmdir strings/<lang>
+
+e.g.:
+
+$ rmdir strings/0x409
+
+and finally remove the gadget:
+
+$ cd ..
+$ rmdir <gadget name>
+
+e.g.:
+
+$ rmdir g1
+
+
+
+
+Implementation design
+=====================
+
+Below the idea of how configfs works is presented.
+In configfs there are items and groups, both represented as directories.
+The difference between an item and a group is that a group can contain
+other groups. In the picture below only an item is shown.
+Both items and groups can have attributes, which are represented as files.
+The user can create and remove directories, but cannot remove files,
+which can be read-only or read-write, depending on what they represent.
+
+The filesystem part of configfs operates on config_items/groups and
+configfs_attributes which are generic and of the same type for all
+configured elements. However, they are embedded in usage-specific
+larger structures. In the picture below there is a "cs" which contains
+a config_item and an "sa" which contains a configfs_attribute.
+
+The filesystem view would be like this:
+
+./
+./cs        (directory)
+   |
+   +--sa    (file)
+   |
+   .
+   .
+   .
+
+Whenever a user reads/writes the "sa" file, a function is called
+which accepts a struct config_item and a struct configfs_attribute.
+In the said function the "cs" and "sa" are retrieved using the well
+known container_of technique and an appropriate sa's function (show or
+store) is called and passed the "cs" and a character buffer. The "show"
+is for displaying the file's contents (copy data from the cs to the
+buffer), while the "store" is for modifying the file's contents (copy data
+from the buffer to the cs), but it is up to the implementer of the
+two functions to decide what they actually do.
+
+typedef struct configured_structure cs;
+typedef struct specific_attribute sa;
+
+                                       sa
+                       +----------------------------------+
+        cs             |  (*show)(cs *, buffer);          |
++-----------------+    |  (*store)(cs *, buffer, length); |
+|                 |    |                                  |
+| +-------------+ |    |       +------------------+       |
+| | struct      |-|----|------>|struct            |       |
+| | config_item | |    |       |configfs_attribute|       |
+| +-------------+ |    |       +------------------+       |
+|                 |    +----------------------------------+
+| data to be set  |                .
+|                 |                .
++-----------------+                .
+
+The file names are decided by the config item/group designer, while
+the directories in general can be named at will. A group can have
+a number of its default sub-groups created automatically.
+
+For more information on configfs please see
+Documentation/filesystems/configfs/*.
+
+The concepts described above translate to USB gadgets like this:
+
+1. A gadget has its config group, which has some attributes (idVendor,
+idProduct etc) and default sub-groups (configs, functions, strings).
+Writing to the attributes causes the information to be stored in
+appropriate locations. In the configs, functions and strings sub-groups
+a user can create their sub-groups to represent configurations, functions,
+and groups of strings in a given language.
+
+2. The user creates configurations and functions, in the configurations
+creates symbolic links to functions. This information is used when the
+gadget's UDC attribute is written to, which means binding the gadget
+to the UDC. The code in drivers/usb/gadget/configfs.c iterates over
+all configurations, and in each configuration it iterates over all
+functions and binds them. This way the whole gadget is bound.
+
+3. The file drivers/usb/gadget/configfs.c contains code for
+
+	- gadget's config_group
+	- gadget's default groups (configs, functions, strings)
+	- associating functions with configurations (symlinks)
+
+4. Each USB function naturally has its own view of what it wants
+configured, so config_groups for particular functions are defined
+in the functions implementation files drivers/usb/gadget/f_*.c.
+
+5. Function's code is written in such a way that it uses
+
+usb_get_function_instance(), which, in turn, calls request_module.
+So, provided that modprobe works, modules for particular functions
+are loaded automatically. Please note that the converse is not true:
+after a gadget is disabled and torn down, the modules remain loaded.

diff --git a/Documentation/w1/w1.generic b/Documentation/w1/w1.generic
index 212f4ac..a31c5a2 100644
--- a/Documentation/w1/w1.generic
+++ b/Documentation/w1/w1.generic

@@ -25,8 +25,8 @@
  - sysfs entries for that w1 master are created
  - the w1 bus is periodically searched for new slave devices
 
-When a device is found on the bus, w1 core checks if driver for its family is
-loaded. If so, the family driver is attached to the slave.
+When a device is found on the bus, w1 core tries to load the driver for its family
+and check if it is loaded. If so, the family driver is attached to the slave.
 If there is no driver for the family, default one is assigned, which allows to perform
 almost any kind of operations. Each logical operation is a transaction
 in nature, which can contain several (two or one) low-level operations.

diff --git a/MAINTAINERS b/MAINTAINERS
index 5be702c..7fab329 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS

@@ -3220,7 +3220,7 @@
 
 FCOE SUBSYSTEM (libfc, libfcoe, fcoe)
 M:	Robert Love <robert.w.love@intel.com>
-L:	devel@open-fcoe.org
+L:	fcoe-devel@open-fcoe.org
 W:	www.Open-FCoE.org
 S:	Supported
 F:	drivers/scsi/libfc/
@@ -3309,6 +3309,15 @@
 S:	Odd fixes
 F:	drivers/block/floppy.c
 
+FMC SUBSYSTEM
+M:	Alessandro Rubini <rubini@gnudd.com>
+W:	http://www.ohwr.org/projects/fmc-bus
+S:	Supported
+F:	drivers/fmc/
+F:	include/linux/fmc*.h
+F:	include/linux/ipmi-fru.h
+K:	fmc_d.*register
+
 FPU EMULATOR
 M:	Bill Metzenthen <billm@melbpc.org.au>
 W:	http://floatingpoint.sourceforge.net/emulator/index.html
@@ -4577,7 +4586,7 @@
 F:	include/linux/jbd2.h
 
 JSM Neo PCI based serial card
-M:	Lucas Tavares <lucaskt@linux.vnet.ibm.com>
+M:	Thadeu Lima de Souza Cascardo <cascardo@linux.vnet.ibm.com>
 L:	linux-serial@vger.kernel.org
 S:	Maintained
 F:	drivers/tty/serial/jsm/
@@ -7667,6 +7676,7 @@
 M:	Greg Kroah-Hartman <gregkh@linuxfoundation.org>
 L:	stable@vger.kernel.org
 S:	Supported
+F:	Documentation/stable_kernel_rules.txt
 
 STAGING SUBSYSTEM
 M:	Greg Kroah-Hartman <gregkh@linuxfoundation.org>
@@ -7783,7 +7793,7 @@
 STAGING - SPEAKUP CONSOLE SPEECH DRIVER
 M:	William Hubbs <w.d.hubbs@gmail.com>
 M:	Chris Brannon <chris@the-brannons.com>
-M:	Kirk Reiser <kirk@braille.uwo.ca>
+M:	Kirk Reiser <kirk@reisers.ca>
 M:	Samuel Thibault <samuel.thibault@ens-lyon.org>
 L:	speakup@braille.uwo.ca
 W:	http://www.linux-speakup.org/

diff --git a/Makefile b/Makefile
index 0142c93..e5e3ba0 100644
--- a/Makefile
+++ b/Makefile

@@ -1,7 +1,7 @@
 VERSION = 3
 PATCHLEVEL = 10
 SUBLEVEL = 0
-EXTRAVERSION = -rc7
+EXTRAVERSION =
 NAME = Unicycling Gorilla
 
 # *DOCUMENTATION*

diff --git a/arch/alpha/include/asm/pgtable.h b/arch/alpha/include/asm/pgtable.h
index 81a4342..d8f9b7e 100644
--- a/arch/alpha/include/asm/pgtable.h
+++ b/arch/alpha/include/asm/pgtable.h

@@ -354,9 +354,6 @@
 #define kern_addr_valid(addr)	(1)
 #endif
 
-#define io_remap_pfn_range(vma, start, pfn, size, prot)	\
-		remap_pfn_range(vma, start, pfn, size, prot)
-
 #define pte_ERROR(e) \
 	printk("%s:%d: bad pte %016lx.\n", __FILE__, __LINE__, pte_val(e))
 #define pmd_ERROR(e) \

diff --git a/arch/alpha/kernel/console.c b/arch/alpha/kernel/console.c
index da711e3..6a61dee 100644
--- a/arch/alpha/kernel/console.c
+++ b/arch/alpha/kernel/console.c

@@ -61,7 +61,9 @@
 
 	/* Set the VGA hose and init the new console. */
 	pci_vga_hose = hose;
-	take_over_console(&vga_con, 0, MAX_NR_CONSOLES-1, 1);
+	console_lock();
+	do_take_over_console(&vga_con, 0, MAX_NR_CONSOLES-1, 1);
+	console_unlock();
 }
 
 void __init

diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c
index b9e37ad..1402fcc 100644
--- a/arch/alpha/kernel/osf_sys.c
+++ b/arch/alpha/kernel/osf_sys.c

@@ -96,6 +96,7 @@
 };
 
 struct osf_dirent_callback {
+	struct dir_context ctx;
 	struct osf_dirent __user *dirent;
 	long __user *basep;
 	unsigned int count;
@@ -146,17 +147,17 @@
 {
 	int error;
 	struct fd arg = fdget(fd);
-	struct osf_dirent_callback buf;
+	struct osf_dirent_callback buf = {
+		.ctx.actor = osf_filldir,
+		.dirent = dirent,
+		.basep = basep,
+		.count = count
+	};
 
 	if (!arg.file)
 		return -EBADF;
 
-	buf.dirent = dirent;
-	buf.basep = basep;
-	buf.count = count;
-	buf.error = 0;
-
-	error = vfs_readdir(arg.file, osf_filldir, &buf);
+	error = iterate_dir(arg.file, &buf.ctx);
 	if (error >= 0)
 		error = buf.error;
 	if (count != buf.count)

diff --git a/arch/alpha/kernel/pci-sysfs.c b/arch/alpha/kernel/pci-sysfs.c
index b51f7b4..2b183b0 100644
--- a/arch/alpha/kernel/pci-sysfs.c
+++ b/arch/alpha/kernel/pci-sysfs.c

@@ -26,7 +26,6 @@
 		base = sparse ? hose->sparse_io_base : hose->dense_io_base;
 
 	vma->vm_pgoff += base >> PAGE_SHIFT;
-	vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP;
 
 	return io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
 				  vma->vm_end - vma->vm_start,

diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c
index ab80a80..f2360a7 100644
--- a/arch/alpha/kernel/process.c
+++ b/arch/alpha/kernel/process.c

@@ -117,7 +117,9 @@
 		if (in_interrupt())
 			irq_exit();
 		/* This has the effect of resetting the VGA video origin.  */
-		take_over_console(&dummy_con, 0, MAX_NR_CONSOLES-1, 1);
+		console_lock();
+		do_take_over_console(&dummy_con, 0, MAX_NR_CONSOLES-1, 1);
+		console_unlock();
 #endif
 		pci_restore_srm_config();
 		set_hae(srm_hae);

diff --git a/arch/arc/include/asm/pgtable.h b/arch/arc/include/asm/pgtable.h
index 95b1522..c110ac8 100644
--- a/arch/arc/include/asm/pgtable.h
+++ b/arch/arc/include/asm/pgtable.h

@@ -394,9 +394,6 @@
  * remap a physical page `pfn' of size `size' with page protection `prot'
  * into virtual address `from'
  */
-#define io_remap_pfn_range(vma, from, pfn, size, prot) \
-			remap_pfn_range(vma, from, pfn, size, prot)
-
 #include <asm-generic/pgtable.h>
 
 /* to cope with aliasing VIPT cache */

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 0a20413..b519015 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig

@@ -1087,6 +1087,20 @@
 source "arch/arm/Kconfig-nommu"
 endif
 
+config PJ4B_ERRATA_4742
+	bool "PJ4B Errata 4742: IDLE Wake Up Commands can Cause the CPU Core to Cease Operation"
+	depends on CPU_PJ4B && MACH_ARMADA_370
+	default y
+	help
+	  When coming out of either a Wait for Interrupt (WFI) or a Wait for
+	  Event (WFE) IDLE states, a specific timing sensitivity exists between
+	  the retiring WFI/WFE instructions and the newly issued subsequent
+	  instructions.  This sensitivity can result in a CPU hang scenario.
+	  Workaround:
+	  The software must insert either a Data Synchronization Barrier (DSB)
+	  or Data Memory Barrier (DMB) command immediately after the WFI/WFE
+	  instruction
+
 config ARM_ERRATA_326103
 	bool "ARM errata: FSR write bit incorrect on a SWP to read-only memory"
 	depends on CPU_V6

diff --git a/arch/arm/boot/dts/tegra20-colibri-512.dtsi b/arch/arm/boot/dts/tegra20-colibri-512.dtsi
index a573b94..c12af78 100644
--- a/arch/arm/boot/dts/tegra20-colibri-512.dtsi
+++ b/arch/arm/boot/dts/tegra20-colibri-512.dtsi

@@ -449,7 +449,11 @@
 
 	usb@c5004000 {
 		status = "okay";
-		nvidia,phy-reset-gpio = <&gpio 169 0>; /* gpio PV1 */
+		nvidia,phy-reset-gpio = <&gpio 169 1>; /* gpio PV1, active low */
+	};
+
+	usb-phy@c5004000 {
+		nvidia,phy-reset-gpio = <&gpio 169 1>; /* gpio PV1, active low */
 	};
 
 	sdhci@c8000600 {

diff --git a/arch/arm/boot/dts/tegra20-harmony.dts b/arch/arm/boot/dts/tegra20-harmony.dts
index e7d5de4..ec52937 100644
--- a/arch/arm/boot/dts/tegra20-harmony.dts
+++ b/arch/arm/boot/dts/tegra20-harmony.dts

@@ -428,17 +428,26 @@
 		status = "okay";
 	};
 
+	usb-phy@c5000000 {
+		status = "okay";
+	};
+
 	usb@c5004000 {
 		status = "okay";
-		nvidia,phy-reset-gpio = <&gpio 169 0>; /* gpio PV1 */
+		nvidia,phy-reset-gpio = <&gpio 169 1>; /* gpio PV1, active low */
+	};
+
+	usb-phy@c5004000 {
+		status = "okay";
+		nvidia,phy-reset-gpio = <&gpio 169 1>; /* gpio PV1, active low */
 	};
 
 	usb@c5008000 {
 		status = "okay";
 	};
 
-	usb-phy@c5004400 {
-		nvidia,phy-reset-gpio = <&gpio 169 0>; /* gpio PV1 */
+	usb-phy@c5008000 {
+		status = "okay";
 	};
 
 	sdhci@c8000200 {

diff --git a/arch/arm/boot/dts/tegra20-iris-512.dts b/arch/arm/boot/dts/tegra20-iris-512.dts
index 52f1103..9f64f70 100644
--- a/arch/arm/boot/dts/tegra20-iris-512.dts
+++ b/arch/arm/boot/dts/tegra20-iris-512.dts

@@ -38,13 +38,20 @@
 
 	usb@c5000000 {
 		status = "okay";
-		dr_mode = "otg";
+	};
+
+	usb-phy@c5000000 {
+		status = "okay";
 	};
 
 	usb@c5008000 {
 		status = "okay";
 	};
 
+	usb-phy@c5008000 {
+		status = "okay";
+	};
+
 	serial@70006000 {
 		status = "okay";
 	};

diff --git a/arch/arm/boot/dts/tegra20-paz00.dts b/arch/arm/boot/dts/tegra20-paz00.dts
index e3e0c99..1c17ffa 100644
--- a/arch/arm/boot/dts/tegra20-paz00.dts
+++ b/arch/arm/boot/dts/tegra20-paz00.dts

@@ -427,17 +427,26 @@
 		status = "okay";
 	};
 
+	usb-phy@c5000000 {
+		status = "okay";
+	};
+
 	usb@c5004000 {
 		status = "okay";
-		nvidia,phy-reset-gpio = <&gpio 168 0>; /* gpio PV0 */
+		nvidia,phy-reset-gpio = <&gpio 168 1>; /* gpio PV0, active low */
+	};
+
+	usb-phy@c5004000 {
+		status = "okay";
+		nvidia,phy-reset-gpio = <&gpio 168 1>; /* gpio PV0, active low */
 	};
 
 	usb@c5008000 {
 		status = "okay";
 	};
 
-	usb-phy@c5004400 {
-		nvidia,phy-reset-gpio = <&gpio 168 0>; /* gpio PV0 */
+	usb-phy@c5008000 {
+		status = "okay";
 	};
 
 	sdhci@c8000000 {

diff --git a/arch/arm/boot/dts/tegra20-seaboard.dts b/arch/arm/boot/dts/tegra20-seaboard.dts
index cee4c34..009dafe 100644
--- a/arch/arm/boot/dts/tegra20-seaboard.dts
+++ b/arch/arm/boot/dts/tegra20-seaboard.dts

@@ -569,17 +569,28 @@
 		dr_mode = "otg";
 	};
 
+	usb-phy@c5000000 {
+		status = "okay";
+		vbus-supply = <&vbus_reg>;
+		dr_mode = "otg";
+	};
+
 	usb@c5004000 {
 		status = "okay";
-		nvidia,phy-reset-gpio = <&gpio 169 0>; /* gpio PV1 */
+		nvidia,phy-reset-gpio = <&gpio 169 1>; /* gpio PV1, active low */
+	};
+
+	usb-phy@c5004000 {
+		status = "okay";
+		nvidia,phy-reset-gpio = <&gpio 169 1>; /* gpio PV1, active low */
 	};
 
 	usb@c5008000 {
 		status = "okay";
 	};
 
-	usb-phy@c5004400 {
-		nvidia,phy-reset-gpio = <&gpio 169 0>; /* gpio PV1 */
+	usb-phy@c5008000 {
+		status = "okay";
 	};
 
 	sdhci@c8000000 {
@@ -807,6 +818,15 @@
 			gpio = <&pmic 1 0>;
 			enable-active-high;
 		};
+
+		vbus_reg: regulator@3 {
+			compatible = "regulator-fixed";
+			reg = <3>;
+			regulator-name = "vdd_vbus_wup1";
+			regulator-min-microvolt = <5000000>;
+			regulator-max-microvolt = <5000000>;
+			gpio = <&gpio 24 0>; /* PD0 */
+		};
 	};
 
 	sound {

diff --git a/arch/arm/boot/dts/tegra20-tamonten.dtsi b/arch/arm/boot/dts/tegra20-tamonten.dtsi
index 50b3ec1..fc2f7d6 100644
--- a/arch/arm/boot/dts/tegra20-tamonten.dtsi
+++ b/arch/arm/boot/dts/tegra20-tamonten.dtsi

@@ -470,6 +470,10 @@
 		status = "okay";
 	};
 
+	usb-phy@c5008000 {
+		status = "okay";
+	};
+
 	sdhci@c8000600 {
 		cd-gpios = <&gpio 58 1>; /* gpio PH2 */
 		wp-gpios = <&gpio 59 0>; /* gpio PH3 */

diff --git a/arch/arm/boot/dts/tegra20-trimslice.dts b/arch/arm/boot/dts/tegra20-trimslice.dts
index 9cc78a1..0e65c00 100644
--- a/arch/arm/boot/dts/tegra20-trimslice.dts
+++ b/arch/arm/boot/dts/tegra20-trimslice.dts

@@ -314,17 +314,27 @@
 		nvidia,vbus-gpio = <&gpio 170 0>; /* gpio PV2 */
 	};
 
+	usb-phy@c5000000 {
+		status = "okay";
+		vbus-supply = <&vbus_reg>;
+	};
+
 	usb@c5004000 {
 		status = "okay";
-		nvidia,phy-reset-gpio = <&gpio 168 0>; /* gpio PV0 */
+		nvidia,phy-reset-gpio = <&gpio 168 1>; /* gpio PV0, active low */
+	};
+
+	usb-phy@c5004000 {
+		status = "okay";
+		nvidia,phy-reset-gpio = <&gpio 168 1>; /* gpio PV0, active low */
 	};
 
 	usb@c5008000 {
 		status = "okay";
 	};
 
-	usb-phy@c5004400 {
-		nvidia,phy-reset-gpio = <&gpio 168 0>; /* gpio PV0 */
+	usb-phy@c5008000 {
+		status = "okay";
 	};
 
 	sdhci@c8000000 {
@@ -390,6 +400,15 @@
 			regulator-max-microvolt = <1800000>;
 			regulator-always-on;
 		};
+
+		vbus_reg: regulator@2 {
+			compatible = "regulator-fixed";
+			reg = <2>;
+			regulator-name = "usb1_vbus";
+			regulator-min-microvolt = <5000000>;
+			regulator-max-microvolt = <5000000>;
+			gpio = <&gpio 170 0>; /* PV2 */
+		};
 	};
 
 	sound {

diff --git a/arch/arm/boot/dts/tegra20-ventana.dts b/arch/arm/boot/dts/tegra20-ventana.dts
index dd38f1f..e00f89e 100644
--- a/arch/arm/boot/dts/tegra20-ventana.dts
+++ b/arch/arm/boot/dts/tegra20-ventana.dts

@@ -505,17 +505,26 @@
 		status = "okay";
 	};
 
+	usb-phy@c5000000 {
+		status = "okay";
+	};
+
 	usb@c5004000 {
 		status = "okay";
-		nvidia,phy-reset-gpio = <&gpio 169 0>; /* gpio PV1 */
+		nvidia,phy-reset-gpio = <&gpio 169 1>; /* gpio PV1, active low */
+	};
+
+	usb-phy@c5004000 {
+		status = "okay";
+		nvidia,phy-reset-gpio = <&gpio 169 1>; /* gpio PV1, active low */
 	};
 
 	usb@c5008000 {
 		status = "okay";
 	};
 
-	usb-phy@c5004400 {
-		nvidia,phy-reset-gpio = <&gpio 169 0>; /* gpio PV1 */
+	usb-phy@c5008000 {
+		status = "okay";
 	};
 
 	sdhci@c8000000 {

diff --git a/arch/arm/boot/dts/tegra20-whistler.dts b/arch/arm/boot/dts/tegra20-whistler.dts
index d2567f8..3c24c9b 100644
--- a/arch/arm/boot/dts/tegra20-whistler.dts
+++ b/arch/arm/boot/dts/tegra20-whistler.dts

@@ -511,11 +511,21 @@
 		nvidia,vbus-gpio = <&tca6416 0 0>; /* GPIO_PMU0 */
 	};
 
+	usb-phy@c5000000 {
+		status = "okay";
+		vbus-supply = <&vbus1_reg>;
+	};
+
 	usb@c5008000 {
 		status = "okay";
 		nvidia,vbus-gpio = <&tca6416 1 0>; /* GPIO_PMU1 */
 	};
 
+	usb-phy@c5008000 {
+		status = "okay";
+		vbus-supply = <&vbus3_reg>;
+	};
+
 	sdhci@c8000400 {
 		status = "okay";
 		cd-gpios = <&gpio 69 1>; /* gpio PI5 */
@@ -568,6 +578,24 @@
 			regulator-max-microvolt = <5000000>;
 			regulator-always-on;
 		};
+
+		vbus1_reg: regulator@2 {
+			compatible = "regulator-fixed";
+			reg = <2>;
+			regulator-name = "vbus1";
+			regulator-min-microvolt = <5000000>;
+			regulator-max-microvolt = <5000000>;
+			gpio = <&tca6416 0 0>; /* GPIO_PMU0 */
+		};
+
+		vbus3_reg: regulator@3 {
+			compatible = "regulator-fixed";
+			reg = <3>;
+			regulator-name = "vbus3";
+			regulator-min-microvolt = <5000000>;
+			regulator-max-microvolt = <5000000>;
+			gpio = <&tca6416 1 0>; /* GPIO_PMU1 */
+		};
 	};
 
 	sound {

diff --git a/arch/arm/boot/dts/tegra20.dtsi b/arch/arm/boot/dts/tegra20.dtsi
index 56a9110..96d6d8a 100644
--- a/arch/arm/boot/dts/tegra20.dtsi
+++ b/arch/arm/boot/dts/tegra20.dtsi

@@ -455,13 +455,24 @@
 		status = "disabled";
 	};
 
-	phy1: usb-phy@c5000400 {
+	phy1: usb-phy@c5000000 {
 		compatible = "nvidia,tegra20-usb-phy";
-		reg = <0xc5000400 0x3c00>;
+		reg = <0xc5000000 0x4000 0xc5000000 0x4000>;
 		phy_type = "utmi";
+		clocks = <&tegra_car 22>,
+			 <&tegra_car 127>,
+			 <&tegra_car 106>,
+			 <&tegra_car 22>;
+		clock-names = "reg", "pll_u", "timer", "utmi-pads";
 		nvidia,has-legacy-mode;
-		clocks = <&tegra_car 22>, <&tegra_car 127>;
-		clock-names = "phy", "pll_u";
+		hssync_start_delay = <9>;
+		idle_wait_delay = <17>;
+		elastic_limit = <16>;
+		term_range_adj = <6>;
+		xcvr_setup = <9>;
+		xcvr_lsfslew = <1>;
+		xcvr_lsrslew = <1>;
+		status = "disabled";
 	};
 
 	usb@c5004000 {
@@ -474,12 +485,15 @@
 		status = "disabled";
 	};
 
-	phy2: usb-phy@c5004400 {
+	phy2: usb-phy@c5004000 {
 		compatible = "nvidia,tegra20-usb-phy";
-		reg = <0xc5004400 0x3c00>;
+		reg = <0xc5004000 0x4000>;
 		phy_type = "ulpi";
-		clocks = <&tegra_car 93>, <&tegra_car 127>;
-		clock-names = "phy", "pll_u";
+		clocks = <&tegra_car 58>,
+			 <&tegra_car 127>,
+			 <&tegra_car 93>;
+		clock-names = "reg", "pll_u", "ulpi-link";
+		status = "disabled";
 	};
 
 	usb@c5008000 {
@@ -492,12 +506,23 @@
 		status = "disabled";
 	};
 
-	phy3: usb-phy@c5008400 {
+	phy3: usb-phy@c5008000 {
 		compatible = "nvidia,tegra20-usb-phy";
-		reg = <0xc5008400 0x3c00>;
+		reg = <0xc5008000 0x4000 0xc5000000 0x4000>;
 		phy_type = "utmi";
-		clocks = <&tegra_car 22>, <&tegra_car 127>;
-		clock-names = "phy", "pll_u";
+		clocks = <&tegra_car 59>,
+			 <&tegra_car 127>,
+			 <&tegra_car 106>,
+			 <&tegra_car 22>;
+		clock-names = "reg", "pll_u", "timer", "utmi-pads";
+		hssync_start_delay = <9>;
+		idle_wait_delay = <17>;
+		elastic_limit = <16>;
+		term_range_adj = <6>;
+		xcvr_setup = <9>;
+		xcvr_lsfslew = <2>;
+		xcvr_lsrslew = <2>;
+		status = "disabled";
 	};
 
 	sdhci@c8000000 {

diff --git a/arch/arm/include/asm/cputype.h b/arch/arm/include/asm/cputype.h
index 7652712..dba62cb 100644
--- a/arch/arm/include/asm/cputype.h
+++ b/arch/arm/include/asm/cputype.h

@@ -32,6 +32,8 @@
 
 #define MPIDR_HWID_BITMASK 0xFFFFFF
 
+#define MPIDR_INVALID (~MPIDR_HWID_BITMASK)
+
 #define MPIDR_LEVEL_BITS 8
 #define MPIDR_LEVEL_MASK ((1 << MPIDR_LEVEL_BITS) - 1)
 

diff --git a/arch/arm/include/asm/glue-proc.h b/arch/arm/include/asm/glue-proc.h
index ac1dd54..8017e94 100644
--- a/arch/arm/include/asm/glue-proc.h
+++ b/arch/arm/include/asm/glue-proc.h

@@ -230,6 +230,15 @@
 # endif
 #endif
 
+#ifdef CONFIG_CPU_PJ4B
+# ifdef CPU_NAME
+#  undef  MULTI_CPU
+#  define MULTI_CPU
+# else
+#  define CPU_NAME cpu_pj4b
+# endif
+#endif
+
 #ifndef MULTI_CPU
 #define cpu_proc_init			__glue(CPU_NAME,_proc_init)
 #define cpu_proc_fin			__glue(CPU_NAME,_proc_fin)

diff --git a/arch/arm/include/asm/pgtable-nommu.h b/arch/arm/include/asm/pgtable-nommu.h
index 7ec60d60..0642228 100644
--- a/arch/arm/include/asm/pgtable-nommu.h
+++ b/arch/arm/include/asm/pgtable-nommu.h

@@ -79,8 +79,6 @@
  * No page table caches to initialise.
  */
 #define pgtable_cache_init()	do { } while (0)
-#define io_remap_pfn_range	remap_pfn_range
-
 
 /*
  * All 32bit addresses are effectively valid for vmalloc...

diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h
index 9bcd262..229e0dd 100644
--- a/arch/arm/include/asm/pgtable.h
+++ b/arch/arm/include/asm/pgtable.h

@@ -318,13 +318,6 @@
 #define HAVE_ARCH_UNMAPPED_AREA
 #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
 
-/*
- * remap a physical page `pfn' of size `size' with page protection `prot'
- * into virtual address `from'
- */
-#define io_remap_pfn_range(vma,from,pfn,size,prot) \
-		remap_pfn_range(vma, from, pfn, size, prot)
-
 #define pgtable_cache_init() do { } while (0)
 
 #endif /* !__ASSEMBLY__ */

diff --git a/arch/arm/include/asm/smp_plat.h b/arch/arm/include/asm/smp_plat.h
index aaa61b6f..e789832 100644
--- a/arch/arm/include/asm/smp_plat.h
+++ b/arch/arm/include/asm/smp_plat.h

@@ -49,7 +49,7 @@
 /*
  * Logical CPU mapping.
  */
-extern int __cpu_logical_map[];
+extern u32 __cpu_logical_map[];
 #define cpu_logical_map(cpu)	__cpu_logical_map[cpu]
 /*
  * Retrieve logical cpu index corresponding to a given MPIDR[23:0]

diff --git a/arch/arm/kernel/devtree.c b/arch/arm/kernel/devtree.c
index 5af04f6..5859c8b 100644
--- a/arch/arm/kernel/devtree.c
+++ b/arch/arm/kernel/devtree.c

@@ -82,7 +82,7 @@
 	u32 i, j, cpuidx = 1;
 	u32 mpidr = is_smp() ? read_cpuid_mpidr() & MPIDR_HWID_BITMASK : 0;
 
-	u32 tmp_map[NR_CPUS] = { [0 ... NR_CPUS-1] = UINT_MAX };
+	u32 tmp_map[NR_CPUS] = { [0 ... NR_CPUS-1] = MPIDR_INVALID };
 	bool bootcpu_valid = false;
 	cpus = of_find_node_by_path("/cpus");
 
@@ -92,6 +92,9 @@
 	for_each_child_of_node(cpus, cpu) {
 		u32 hwid;
 
+		if (of_node_cmp(cpu->type, "cpu"))
+			continue;
+
 		pr_debug(" * %s...\n", cpu->full_name);
 		/*
 		 * A device tree containing CPU nodes with missing "reg"
@@ -149,9 +152,10 @@
 		tmp_map[i] = hwid;
 	}
 
-	if (WARN(!bootcpu_valid, "DT missing boot CPU MPIDR[23:0], "
-				 "fall back to default cpu_logical_map\n"))
+	if (!bootcpu_valid) {
+		pr_warn("DT missing boot CPU MPIDR[23:0], fall back to default cpu_logical_map\n");
 		return;
+	}
 
 	/*
 	 * Since the boot CPU node contains proper data, and all nodes have

diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
index 1522c7a..b4b1d39 100644
--- a/arch/arm/kernel/setup.c
+++ b/arch/arm/kernel/setup.c

@@ -444,7 +444,7 @@
 	    : "r14");
 }
 
-int __cpu_logical_map[NR_CPUS];
+u32 __cpu_logical_map[NR_CPUS] = { [0 ... NR_CPUS-1] = MPIDR_INVALID };
 
 void __init smp_setup_processor_id(void)
 {

diff --git a/arch/arm/mm/nommu.c b/arch/arm/mm/nommu.c
index d51225f..eb5293a 100644
--- a/arch/arm/mm/nommu.c
+++ b/arch/arm/mm/nommu.c

@@ -57,6 +57,12 @@
 }
 EXPORT_SYMBOL(flush_dcache_page);
 
+void flush_kernel_dcache_page(struct page *page)
+{
+	__cpuc_flush_dcache_area(page_address(page), PAGE_SIZE);
+}
+EXPORT_SYMBOL(flush_kernel_dcache_page);
+
 void copy_to_user_page(struct vm_area_struct *vma, struct page *page,
 		       unsigned long uaddr, void *dst, const void *src,
 		       unsigned long len)

diff --git a/arch/arm/mm/proc-fa526.S b/arch/arm/mm/proc-fa526.S
index d217e97..aaeb6c1 100644
--- a/arch/arm/mm/proc-fa526.S
+++ b/arch/arm/mm/proc-fa526.S

@@ -81,7 +81,6 @@
  */
 	.align	4
 ENTRY(cpu_fa526_do_idle)
-	mcr	p15, 0, r0, c7, c0, 4		@ Wait for interrupt
 	mov	pc, lr
 
 

diff --git a/arch/arm/mm/proc-macros.S b/arch/arm/mm/proc-macros.S
index f9a0aa7..e3c48a3 100644
--- a/arch/arm/mm/proc-macros.S
+++ b/arch/arm/mm/proc-macros.S

@@ -333,3 +333,8 @@
 	.endif
 	.size	\name\()_tlb_fns, . - \name\()_tlb_fns
 .endm
+
+.macro globl_equ x, y
+	.globl	\x
+	.equ	\x, \y
+.endm

diff --git a/arch/arm/mm/proc-v7.S b/arch/arm/mm/proc-v7.S
index 4c8c9c1..e35fec3 100644
--- a/arch/arm/mm/proc-v7.S
+++ b/arch/arm/mm/proc-v7.S

@@ -140,6 +140,29 @@
 ENDPROC(cpu_v7_do_resume)
 #endif
 
+#ifdef CONFIG_CPU_PJ4B
+	globl_equ	cpu_pj4b_switch_mm,     cpu_v7_switch_mm
+	globl_equ	cpu_pj4b_set_pte_ext,	cpu_v7_set_pte_ext
+	globl_equ	cpu_pj4b_proc_init,	cpu_v7_proc_init
+	globl_equ	cpu_pj4b_proc_fin, 	cpu_v7_proc_fin
+	globl_equ	cpu_pj4b_reset,	   	cpu_v7_reset
+#ifdef CONFIG_PJ4B_ERRATA_4742
+ENTRY(cpu_pj4b_do_idle)
+	dsb					@ WFI may enter a low-power mode
+	wfi
+	dsb					@barrier
+	mov	pc, lr
+ENDPROC(cpu_pj4b_do_idle)
+#else
+	globl_equ	cpu_pj4b_do_idle,  	cpu_v7_do_idle
+#endif
+	globl_equ	cpu_pj4b_dcache_clean_area,	cpu_v7_dcache_clean_area
+	globl_equ	cpu_pj4b_do_suspend,	cpu_v7_do_suspend
+	globl_equ	cpu_pj4b_do_resume,	cpu_v7_do_resume
+	globl_equ	cpu_pj4b_suspend_size,	cpu_v7_suspend_size
+
+#endif
+
 	__CPUINIT
 
 /*
@@ -350,6 +373,9 @@
 
 	@ define struct processor (see <asm/proc-fns.h> and proc-macros.S)
 	define_processor_functions v7, dabort=v7_early_abort, pabort=v7_pabort, suspend=1
+#ifdef CONFIG_CPU_PJ4B
+	define_processor_functions pj4b, dabort=v7_early_abort, pabort=v7_pabort, suspend=1
+#endif
 
 	.section ".rodata"
 
@@ -362,7 +388,7 @@
 	/*
 	 * Standard v7 proc info content
 	 */
-.macro __v7_proc initfunc, mm_mmuflags = 0, io_mmuflags = 0, hwcaps = 0
+.macro __v7_proc initfunc, mm_mmuflags = 0, io_mmuflags = 0, hwcaps = 0, proc_fns = v7_processor_functions
 	ALT_SMP(.long	PMD_TYPE_SECT | PMD_SECT_AP_WRITE | PMD_SECT_AP_READ | \
 			PMD_SECT_AF | PMD_FLAGS_SMP | \mm_mmuflags)
 	ALT_UP(.long	PMD_TYPE_SECT | PMD_SECT_AP_WRITE | PMD_SECT_AP_READ | \
@@ -375,7 +401,7 @@
 	.long	HWCAP_SWP | HWCAP_HALF | HWCAP_THUMB | HWCAP_FAST_MULT | \
 		HWCAP_EDSP | HWCAP_TLS | \hwcaps
 	.long	cpu_v7_name
-	.long	v7_processor_functions
+	.long	\proc_fns
 	.long	v7wbi_tlb_fns
 	.long	v6_user_fns
 	.long	v7_cache_fns
@@ -407,12 +433,14 @@
 	/*
 	 * Marvell PJ4B processor.
 	 */
+#ifdef CONFIG_CPU_PJ4B
 	.type   __v7_pj4b_proc_info, #object
 __v7_pj4b_proc_info:
 	.long	0x560f5800
 	.long	0xff0fff00
-	__v7_proc __v7_pj4b_setup
+	__v7_proc __v7_pj4b_setup, proc_fns = pj4b_processor_functions
 	.size	__v7_pj4b_proc_info, . - __v7_pj4b_proc_info
+#endif
 
 	/*
 	 * ARM Ltd. Cortex A7 processor.

diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index e333a24..3a768e9 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h

@@ -320,13 +320,6 @@
 
 #include <asm-generic/pgtable.h>
 
-/*
- * remap a physical page `pfn' of size `size' with page protection `prot'
- * into virtual address `from'
- */
-#define io_remap_pfn_range(vma,from,pfn,size,prot) \
-		remap_pfn_range(vma, from, pfn, size, prot)
-
 #define pgtable_cache_init() do { } while (0)
 
 #endif /* !__ASSEMBLY__ */

diff --git a/arch/avr32/include/asm/pgtable.h b/arch/avr32/include/asm/pgtable.h
index 6fbfea6..4beff97 100644
--- a/arch/avr32/include/asm/pgtable.h
+++ b/arch/avr32/include/asm/pgtable.h

@@ -362,9 +362,6 @@
 
 #define kern_addr_valid(addr)	(1)
 
-#define io_remap_pfn_range(vma, vaddr, pfn, size, prot)	\
-	remap_pfn_range(vma, vaddr, pfn, size, prot)
-
 /* No page table caches to initialize (?) */
 #define pgtable_cache_init()	do { } while(0)
 

diff --git a/arch/blackfin/include/asm/pgtable.h b/arch/blackfin/include/asm/pgtable.h
index b866392..0b04901 100644
--- a/arch/blackfin/include/asm/pgtable.h
+++ b/arch/blackfin/include/asm/pgtable.h

@@ -88,7 +88,6 @@
  * No page table caches to initialise.
  */
 #define pgtable_cache_init()	do { } while (0)
-#define io_remap_pfn_range      remap_pfn_range
 
 /*
  * All 32bit addresses are effectively valid for vmalloc...

diff --git a/arch/c6x/include/asm/pgtable.h b/arch/c6x/include/asm/pgtable.h
index 38a4312..c0eed5b 100644
--- a/arch/c6x/include/asm/pgtable.h
+++ b/arch/c6x/include/asm/pgtable.h

@@ -71,7 +71,6 @@
  * No page table caches to initialise
  */
 #define pgtable_cache_init()   do { } while (0)
-#define io_remap_pfn_range      remap_pfn_range
 
 #include <asm-generic/pgtable.h>
 

diff --git a/arch/cris/include/asm/pgtable.h b/arch/cris/include/asm/pgtable.h
index 7df4301..8b8c867 100644
--- a/arch/cris/include/asm/pgtable.h
+++ b/arch/cris/include/asm/pgtable.h

@@ -258,9 +258,6 @@
 #define pgd_ERROR(e) \
         printk("%s:%d: bad pgd %p(%08lx).\n", __FILE__, __LINE__, &(e), pgd_val(e))
 
-#define io_remap_pfn_range(vma, vaddr, pfn, size, prot)         \
-		remap_pfn_range(vma, vaddr, pfn, size, prot)
-
 
 extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; /* defined in head.S */
 

diff --git a/arch/frv/include/asm/pgtable.h b/arch/frv/include/asm/pgtable.h
index 6bc241e..eb0110a 100644
--- a/arch/frv/include/asm/pgtable.h
+++ b/arch/frv/include/asm/pgtable.h

@@ -488,9 +488,6 @@
 #define PageSkip(page)		(0)
 #define kern_addr_valid(addr)	(1)
 
-#define io_remap_pfn_range(vma, vaddr, pfn, size, prot)		\
-		remap_pfn_range(vma, vaddr, pfn, size, prot)
-
 #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
 #define __HAVE_ARCH_PTEP_GET_AND_CLEAR
 #define __HAVE_ARCH_PTEP_SET_WRPROTECT

diff --git a/arch/h8300/include/asm/pgtable.h b/arch/h8300/include/asm/pgtable.h
index 62ef176..7ca20f8 100644
--- a/arch/h8300/include/asm/pgtable.h
+++ b/arch/h8300/include/asm/pgtable.h

@@ -52,9 +52,6 @@
  */
 #define pgtable_cache_init()   do { } while (0)
 
-#define io_remap_pfn_range(vma, vaddr, pfn, size, prot)		\
-		remap_pfn_range(vma, vaddr, pfn, size, prot)
-
 /*
  * All 32bit addresses are effectively valid for vmalloc...
  * Sort of meaningless for non-VM targets.

diff --git a/arch/hexagon/include/asm/pgtable.h b/arch/hexagon/include/asm/pgtable.h
index 20d55f6..d8bd54f 100644
--- a/arch/hexagon/include/asm/pgtable.h
+++ b/arch/hexagon/include/asm/pgtable.h

@@ -452,10 +452,6 @@
 
 #define __pte_offset(address) (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
 
-/* Nothing special about IO remapping at this point */
-#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \
-	remap_pfn_range(vma, vaddr, pfn, size, prot)
-
 /*  I think this is in case we have page table caches; needed by init/main.c  */
 #define pgtable_cache_init()    do { } while (0)
 

diff --git a/arch/ia64/include/asm/pgtable.h b/arch/ia64/include/asm/pgtable.h
index 815810c..7935115 100644
--- a/arch/ia64/include/asm/pgtable.h
+++ b/arch/ia64/include/asm/pgtable.h

@@ -493,9 +493,6 @@
 #define pte_to_pgoff(pte)		((pte_val(pte) << 1) >> 3)
 #define pgoff_to_pte(off)		((pte_t) { ((off) << 2) | _PAGE_FILE })
 
-#define io_remap_pfn_range(vma, vaddr, pfn, size, prot)		\
-		remap_pfn_range(vma, vaddr, pfn, size, prot)
-
 /*
  * ZERO_PAGE is a global shared page that is always zero: used
  * for zero-mapped memory areas etc..

diff --git a/arch/m32r/include/asm/pgtable.h b/arch/m32r/include/asm/pgtable.h
index 8a28cfe..103ce67 100644
--- a/arch/m32r/include/asm/pgtable.h
+++ b/arch/m32r/include/asm/pgtable.h

@@ -347,9 +347,6 @@
 /* Needs to be defined here and not in linux/mm.h, as it is arch dependent */
 #define kern_addr_valid(addr)	(1)
 
-#define io_remap_pfn_range(vma, vaddr, pfn, size, prot)	\
-		remap_pfn_range(vma, vaddr, pfn, size, prot)
-
 #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
 #define __HAVE_ARCH_PTEP_GET_AND_CLEAR
 #define __HAVE_ARCH_PTEP_SET_WRPROTECT

diff --git a/arch/m68k/include/asm/pgtable_mm.h b/arch/m68k/include/asm/pgtable_mm.h
index dc35e0e..9f5abbd 100644
--- a/arch/m68k/include/asm/pgtable_mm.h
+++ b/arch/m68k/include/asm/pgtable_mm.h

@@ -135,9 +135,6 @@
 
 #define kern_addr_valid(addr)	(1)
 
-#define io_remap_pfn_range(vma, vaddr, pfn, size, prot)		\
-		remap_pfn_range(vma, vaddr, pfn, size, prot)
-
 /* MMU-specific headers */
 
 #ifdef CONFIG_SUN3

diff --git a/arch/m68k/include/asm/pgtable_no.h b/arch/m68k/include/asm/pgtable_no.h
index 037028f..c527fc2 100644
--- a/arch/m68k/include/asm/pgtable_no.h
+++ b/arch/m68k/include/asm/pgtable_no.h

@@ -55,9 +55,6 @@
  */
 #define pgtable_cache_init()	do { } while (0)
 
-#define io_remap_pfn_range(vma, vaddr, pfn, size, prot)		\
-		remap_pfn_range(vma, vaddr, pfn, size, prot)
-
 /*
  * All 32bit addresses are effectively valid for vmalloc...
  * Sort of meaningless for non-VM targets.

diff --git a/arch/metag/include/asm/pgtable.h b/arch/metag/include/asm/pgtable.h
index 1cd13d5..0d9dc54 100644
--- a/arch/metag/include/asm/pgtable.h
+++ b/arch/metag/include/asm/pgtable.h

@@ -333,9 +333,6 @@
 
 #define kern_addr_valid(addr)	(1)
 
-#define io_remap_pfn_range(vma, vaddr, pfn, size, prot)		\
-	remap_pfn_range(vma, vaddr, pfn, size, prot)
-
 /*
  * No page table caches to initialise
  */

diff --git a/arch/microblaze/include/asm/pgtable.h b/arch/microblaze/include/asm/pgtable.h
index a7311cd..95cef0b 100644
--- a/arch/microblaze/include/asm/pgtable.h
+++ b/arch/microblaze/include/asm/pgtable.h

@@ -13,9 +13,6 @@
 
 #include <asm/setup.h>
 
-#define io_remap_pfn_range(vma, vaddr, pfn, size, prot)		\
-		remap_pfn_range(vma, vaddr, pfn, size, prot)
-
 #ifndef __ASSEMBLY__
 extern int mem_init_done;
 #endif

diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h
index 8b8f6b3..008324d 100644
--- a/arch/mips/include/asm/pgtable.h
+++ b/arch/mips/include/asm/pgtable.h

@@ -394,9 +394,7 @@
 	phys_t phys_addr_high = fixup_bigphys_addr(pfn << PAGE_SHIFT, size);
 	return remap_pfn_range(vma, vaddr, phys_addr_high >> PAGE_SHIFT, size, prot);
 }
-#else
-#define io_remap_pfn_range(vma, vaddr, pfn, size, prot)		\
-		remap_pfn_range(vma, vaddr, pfn, size, prot)
+#define io_remap_pfn_range io_remap_pfn_range
 #endif
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE

diff --git a/arch/mips/pci/pci-bcm1480.c b/arch/mips/pci/pci-bcm1480.c
index e2e69e1..44dd5aa 100644
--- a/arch/mips/pci/pci-bcm1480.c
+++ b/arch/mips/pci/pci-bcm1480.c

@@ -257,7 +257,9 @@
 	register_pci_controller(&bcm1480_controller);
 
 #ifdef CONFIG_VGA_CONSOLE
-	take_over_console(&vga_con, 0, MAX_NR_CONSOLES-1, 1);
+	console_lock();
+	do_take_over_console(&vga_con, 0, MAX_NR_CONSOLES-1, 1);
+	console_unlock();
 #endif
 	return 0;
 }

diff --git a/arch/mips/pci/pci-sb1250.c b/arch/mips/pci/pci-sb1250.c
index cdefcc4..fc634ae 100644
--- a/arch/mips/pci/pci-sb1250.c
+++ b/arch/mips/pci/pci-sb1250.c

@@ -283,7 +283,9 @@
 	register_pci_controller(&sb1250_controller);
 
 #ifdef CONFIG_VGA_CONSOLE
-	take_over_console(&vga_con, 0, MAX_NR_CONSOLES - 1, 1);
+	console_lock();
+	do_take_over_console(&vga_con, 0, MAX_NR_CONSOLES - 1, 1);
+	console_unlock();
 #endif
 	return 0;
 }

diff --git a/arch/mn10300/include/asm/pgtable.h b/arch/mn10300/include/asm/pgtable.h
index a1e894b..2ddaa67e 100644
--- a/arch/mn10300/include/asm/pgtable.h
+++ b/arch/mn10300/include/asm/pgtable.h

@@ -486,9 +486,6 @@
 
 #define kern_addr_valid(addr)	(1)
 
-#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \
-	remap_pfn_range((vma), (vaddr), (pfn), (size), (prot))
-
 #define MK_IOSPACE_PFN(space, pfn)	(pfn)
 #define GET_IOSPACE(pfn)		0
 #define GET_PFN(pfn)			(pfn)

diff --git a/arch/mn10300/include/asm/uaccess.h b/arch/mn10300/include/asm/uaccess.h
index 780560b..d7966e0 100644
--- a/arch/mn10300/include/asm/uaccess.h
+++ b/arch/mn10300/include/asm/uaccess.h

@@ -161,7 +161,7 @@
 
 #define __get_user_check(x, ptr, size)					\
 ({									\
-	const __typeof__(ptr) __guc_ptr = (ptr);			\
+	const __typeof__(*(ptr))* __guc_ptr = (ptr);			\
 	int _e;								\
 	if (likely(__access_ok((unsigned long) __guc_ptr, (size))))	\
 		_e = __get_user_nocheck((x), __guc_ptr, (size));	\

diff --git a/arch/mn10300/kernel/setup.c b/arch/mn10300/kernel/setup.c
index 33c3bd1..ebac9c1 100644
--- a/arch/mn10300/kernel/setup.c
+++ b/arch/mn10300/kernel/setup.c

@@ -38,6 +38,7 @@
 /* For PCI or other memory-mapped resources */
 unsigned long pci_mem_start = 0x18000000;
 
+static char __initdata cmd_line[COMMAND_LINE_SIZE];
 char redboot_command_line[COMMAND_LINE_SIZE] =
 	"console=ttyS0,115200 root=/dev/mtdblock3 rw";
 
@@ -74,45 +75,19 @@
 };
 
 /*
- *
+ * Pick out the memory size.  We look for mem=size,
+ * where size is "size[KkMm]"
  */
-static void __init parse_mem_cmdline(char **cmdline_p)
+static int __init early_mem(char *p)
 {
-	char *from, *to, c;
-
-	/* save unparsed command line copy for /proc/cmdline */
-	strcpy(boot_command_line, redboot_command_line);
-
-	/* see if there's an explicit memory size option */
-	from = redboot_command_line;
-	to = redboot_command_line;
-	c = ' ';
-
-	for (;;) {
-		if (c == ' ' && !memcmp(from, "mem=", 4)) {
-			if (to != redboot_command_line)
-				to--;
-			memory_size = memparse(from + 4, &from);
-		}
-
-		c = *(from++);
-		if (!c)
-			break;
-
-		*(to++) = c;
-	}
-
-	*to = '\0';
-	*cmdline_p = redboot_command_line;
+	memory_size = memparse(p, &p);
 
 	if (memory_size == 0)
 		panic("Memory size not known\n");
 
-	memory_end = (unsigned long) CONFIG_KERNEL_RAM_BASE_ADDRESS +
-		memory_size;
-	if (memory_end > phys_memory_end)
-		memory_end = phys_memory_end;
+	return 0;
 }
+early_param("mem", early_mem);
 
 /*
  * architecture specific setup
@@ -125,7 +100,20 @@
 	cpu_init();
 	unit_setup();
 	smp_init_cpus();
-	parse_mem_cmdline(cmdline_p);
+
+	/* save unparsed command line copy for /proc/cmdline */
+	strlcpy(boot_command_line, redboot_command_line, COMMAND_LINE_SIZE);
+
+	/* populate cmd_line too for later use, preserving boot_command_line */
+	strlcpy(cmd_line, boot_command_line, COMMAND_LINE_SIZE);
+	*cmdline_p = cmd_line;
+
+	parse_early_param();
+
+	memory_end = (unsigned long) CONFIG_KERNEL_RAM_BASE_ADDRESS +
+		memory_size;
+	if (memory_end > phys_memory_end)
+		memory_end = phys_memory_end;
 
 	init_mm.start_code = (unsigned long)&_text;
 	init_mm.end_code = (unsigned long) &_etext;

diff --git a/arch/mn10300/unit-asb2305/pci-asb2305.c b/arch/mn10300/unit-asb2305/pci-asb2305.c
index c4e2e79..febb9cd 100644
--- a/arch/mn10300/unit-asb2305/pci-asb2305.c
+++ b/arch/mn10300/unit-asb2305/pci-asb2305.c

@@ -221,7 +221,7 @@
 	/* Leave vm_pgoff as-is, the PCI space address is the physical
 	 * address on this platform.
 	 */
-	vma->vm_flags |= VM_LOCKED | VM_IO;
+	vma->vm_flags |= VM_LOCKED;
 
 	prot = pgprot_val(vma->vm_page_prot);
 	prot &= ~_PAGE_CACHE;

diff --git a/arch/openrisc/include/asm/pgtable.h b/arch/openrisc/include/asm/pgtable.h
index 14c900c..37bf6a3 100644
--- a/arch/openrisc/include/asm/pgtable.h
+++ b/arch/openrisc/include/asm/pgtable.h

@@ -446,9 +446,6 @@
 
 #define kern_addr_valid(addr)           (1)
 
-#define io_remap_pfn_range(vma, vaddr, pfn, size, prot)         \
-	remap_pfn_range(vma, vaddr, pfn, size, prot)
-
 #include <asm-generic/pgtable.h>
 
 /*

diff --git a/arch/parisc/hpux/fs.c b/arch/parisc/hpux/fs.c
index 838b479..88d0962 100644
--- a/arch/parisc/hpux/fs.c
+++ b/arch/parisc/hpux/fs.c

@@ -60,6 +60,7 @@
 };
 
 struct getdents_callback {
+	struct dir_context ctx;
 	struct hpux_dirent __user *current_dir;
 	struct hpux_dirent __user *previous;
 	int count;
@@ -110,24 +111,23 @@
 {
 	struct fd arg;
 	struct hpux_dirent __user * lastdirent;
-	struct getdents_callback buf;
+	struct getdents_callback buf = {
+		.ctx.actor = filldir,
+		.current_dir = dirent,
+		.count = count
+	};
 	int error;
 
 	arg = fdget(fd);
 	if (!arg.file)
 		return -EBADF;
 
-	buf.current_dir = dirent;
-	buf.previous = NULL;
-	buf.count = count;
-	buf.error = 0;
-
-	error = vfs_readdir(arg.file, filldir, &buf);
+	error = iterate_dir(arg.file, &buf.ctx);
 	if (error >= 0)
 		error = buf.error;
 	lastdirent = buf.previous;
 	if (lastdirent) {
-		if (put_user(arg.file->f_pos, &lastdirent->d_off))
+		if (put_user(buf.ctx.pos, &lastdirent->d_off))
 			error = -EFAULT;
 		else
 			error = count - buf.count;

diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h
index 1e40d7f..34899b5 100644
--- a/arch/parisc/include/asm/pgtable.h
+++ b/arch/parisc/include/asm/pgtable.h

@@ -506,9 +506,6 @@
 #endif
 
 
-#define io_remap_pfn_range(vma, vaddr, pfn, size, prot)		\
-		remap_pfn_range(vma, vaddr, pfn, size, prot)
-
 #define pgprot_noncached(prot) __pgprot(pgprot_val(prot) | _PAGE_NO_CACHE)
 
 /* We provide our own get_unmapped_area to provide cache coherency */

diff --git a/arch/parisc/kernel/setup.c b/arch/parisc/kernel/setup.c
index 1e95b20..7349a3f 100644
--- a/arch/parisc/kernel/setup.c
+++ b/arch/parisc/kernel/setup.c

@@ -156,7 +156,7 @@
 #endif
 
 #if defined(CONFIG_VT) && defined(CONFIG_DUMMY_CONSOLE)
-	conswitchp = &dummy_con;	/* we use take_over_console() later ! */
+	conswitchp = &dummy_con;	/* we use do_take_over_console() later ! */
 #endif
 
 }

diff --git a/arch/powerpc/include/asm/mpc52xx_psc.h b/arch/powerpc/include/asm/mpc52xx_psc.h
index 2966df6..d0ece257 100644
--- a/arch/powerpc/include/asm/mpc52xx_psc.h
+++ b/arch/powerpc/include/asm/mpc52xx_psc.h

@@ -299,4 +299,53 @@
 #define rxdata_32 rxdata.rxdata_32
 };
 
+struct mpc5125_psc {	/* field comments give each member's byte offset as "PSC + offset" */
+	u8		mr1;			/* PSC + 0x00 */
+	u8		reserved0[3];
+	u8		mr2;			/* PSC + 0x04 */
+	u8		reserved1[3];
+	struct {
+		u16		status;		/* PSC + 0x08 */
+		u8		reserved2[2];
+		u8		clock_select;	/* PSC + 0x0c */
+		u8		reserved3[3];
+	} sr_csr;
+	u8		command;		/* PSC + 0x10 */
+	u8		reserved4[3];
+	union {					/* PSC + 0x14; one location viewed as u8, u16 or u32 */
+		u8		buffer_8;
+		u16		buffer_16;
+		u32		buffer_32;
+	} buffer;
+	struct {
+		u8		ipcr;		/* PSC + 0x18 */
+		u8		reserved5[3];
+		u8		acr;		/* PSC + 0x1c */
+		u8		reserved6[3];
+	} ipcr_acr;
+	struct {
+		u16		isr;		/* PSC + 0x20 */
+		u8		reserved7[2];
+		u16		imr;		/* PSC + 0x24 */
+		u8		reserved8[2];
+	} isr_imr;
+	u8		ctur;			/* PSC + 0x28 */
+	u8		reserved9[3];
+	u8		ctlr;			/* PSC + 0x2c */
+	u8		reserved10[3];
+	u32		ccr;			/* PSC + 0x30 */
+	u32		ac97slots;		/* PSC + 0x34 */
+	u32		ac97cmd;		/* PSC + 0x38 */
+	u32		ac97data;		/* PSC + 0x3c */
+	u8		reserved11[4];		/* PSC + 0x40 */
+	u8		ip;			/* PSC + 0x44 */
+	u8		reserved12[3];
+	u8		op1;			/* PSC + 0x48 */
+	u8		reserved13[3];
+	u8		op0;			/* PSC + 0x4c */
+	u8		reserved14[3];
+	u32		sicr;			/* PSC + 0x50 */
+	u8		reserved15[4];	/* PSC + 0x54; tail padding intended to make the size equal sizeof(mpc52xx_psc) */
+};
+
 #endif  /* __ASM_MPC52xx_PSC_H__ */

diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index 7aeb955..b6293d2 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h

@@ -198,9 +198,6 @@
  */
 #define kern_addr_valid(addr)	(1)
 
-#define io_remap_pfn_range(vma, vaddr, pfn, size, prot)		\
-		remap_pfn_range(vma, vaddr, pfn, size, prot)
-
 #include <asm-generic/pgtable.h>
 
 

diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
index eabeec9..f46914a 100644
--- a/arch/powerpc/kernel/pci-common.c
+++ b/arch/powerpc/kernel/pci-common.c

@@ -994,7 +994,7 @@
 		ppc_md.pci_dma_bus_setup(bus);
 }
 
-void pcibios_setup_device(struct pci_dev *dev)
+static void pcibios_setup_device(struct pci_dev *dev)
 {
 	/* Fixup NUMA node as it may not be setup yet by the generic
 	 * code and is needed by the DMA init
@@ -1015,6 +1015,17 @@
 		ppc_md.pci_irq_fixup(dev);
 }
 
+int pcibios_add_device(struct pci_dev *dev)	/* always returns 0 */
+{
+	/*
+	 * We can only call pcibios_setup_device() after bus setup is complete,
+	 * since some of the platform specific DMA setup code depends on it.
+	 */
+	if (dev->bus->is_added)	/* run the per-device setup only once the device's bus is flagged as added */
+		pcibios_setup_device(dev);
+	return 0;	/* success is reported whether or not the setup call ran */
+}
+
 void pcibios_setup_bus_devices(struct pci_bus *bus)
 {
 	struct pci_dev *dev;
@@ -1469,10 +1480,6 @@
 		if (ppc_md.pcibios_enable_device_hook(dev))
 			return -EINVAL;
 
-	/* avoid pcie irq fix up impact on cardbus */
-	if (dev->hdr_type != PCI_HEADER_TYPE_CARDBUS)
-		pcibios_setup_device(dev);
-
 	return pci_enable_resources(dev, mask);
 }
 

diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c
index 35f77a4..f390042 100644
--- a/arch/powerpc/platforms/cell/spufs/inode.c
+++ b/arch/powerpc/platforms/cell/spufs/inode.c

@@ -238,7 +238,7 @@
 	.release	= spufs_dir_close,
 	.llseek		= dcache_dir_lseek,
 	.read		= generic_read_dir,
-	.readdir	= dcache_readdir,
+	.iterate	= dcache_readdir,
 	.fsync		= noop_fsync,
 };
 EXPORT_SYMBOL_GPL(spufs_context_fops);

diff --git a/arch/powerpc/platforms/pseries/eeh_cache.c b/arch/powerpc/platforms/pseries/eeh_cache.c
index 5a4c879..5ce3ba7 100644
--- a/arch/powerpc/platforms/pseries/eeh_cache.c
+++ b/arch/powerpc/platforms/pseries/eeh_cache.c

@@ -294,8 +294,6 @@
 	spin_lock_init(&pci_io_addr_cache_root.piar_lock);
 
 	for_each_pci_dev(dev) {
-		eeh_addr_cache_insert_dev(dev);
-
 		dn = pci_device_to_OF_node(dev);
 		if (!dn)
 			continue;
@@ -308,6 +306,8 @@
 		dev->dev.archdata.edev = edev;
 		edev->pdev = dev;
 
+		eeh_addr_cache_insert_dev(dev);
+
 		eeh_sysfs_add_device(dev);
 	}
 

diff --git a/arch/powerpc/platforms/pseries/eeh_pe.c b/arch/powerpc/platforms/pseries/eeh_pe.c
index fe43d1a..9d4a9e8 100644
--- a/arch/powerpc/platforms/pseries/eeh_pe.c
+++ b/arch/powerpc/platforms/pseries/eeh_pe.c

@@ -639,7 +639,8 @@
 
 	if (pe->type & EEH_PE_PHB) {
 		bus = pe->phb->bus;
-	} else if (pe->type & EEH_PE_BUS) {
+	} else if (pe->type & EEH_PE_BUS ||
+		   pe->type & EEH_PE_DEVICE) {
 		edev = list_first_entry(&pe->edevs, struct eeh_dev, list);
 		pdev = eeh_dev_to_pci_dev(edev);
 		if (pdev)

diff --git a/arch/powerpc/sysdev/fsl_pci.c b/arch/powerpc/sysdev/fsl_pci.c
index 028ac1f..46ac1dd 100644
--- a/arch/powerpc/sysdev/fsl_pci.c
+++ b/arch/powerpc/sysdev/fsl_pci.c

@@ -97,22 +97,14 @@
 	return indirect_read_config(bus, devfn, offset, len, val);
 }
 
-static struct pci_ops fsl_indirect_pci_ops =
+#if defined(CONFIG_FSL_SOC_BOOKE) || defined(CONFIG_PPC_86xx)
+
+static struct pci_ops fsl_indirect_pcie_ops =
 {
 	.read = fsl_indirect_read_config,
 	.write = indirect_write_config,
 };
 
-static void __init fsl_setup_indirect_pci(struct pci_controller* hose,
-					  resource_size_t cfg_addr,
-					  resource_size_t cfg_data, u32 flags)
-{
-	setup_indirect_pci(hose, cfg_addr, cfg_data, flags);
-	hose->ops = &fsl_indirect_pci_ops;
-}
-
-#if defined(CONFIG_FSL_SOC_BOOKE) || defined(CONFIG_PPC_86xx)
-
 #define MAX_PHYS_ADDR_BITS	40
 static u64 pci64_dma_offset = 1ull << MAX_PHYS_ADDR_BITS;
 
@@ -504,13 +496,15 @@
 	if (!hose->private_data)
 		goto no_bridge;
 
-	fsl_setup_indirect_pci(hose, rsrc.start, rsrc.start + 0x4,
-			       PPC_INDIRECT_TYPE_BIG_ENDIAN);
+	setup_indirect_pci(hose, rsrc.start, rsrc.start + 0x4,
+			   PPC_INDIRECT_TYPE_BIG_ENDIAN);
 
 	if (in_be32(&pci->block_rev1) < PCIE_IP_REV_3_0)
 		hose->indirect_type |= PPC_INDIRECT_TYPE_FSL_CFG_REG_LINK;
 
 	if (early_find_capability(hose, 0, 0, PCI_CAP_ID_EXP)) {
+		/* use fsl_indirect_read_config for PCIe */
+		hose->ops = &fsl_indirect_pcie_ops;
 		/* For PCIE read HEADER_TYPE to identify controler mode */
 		early_read_config_byte(hose, 0, 0, PCI_HEADER_TYPE, &hdr_type);
 		if ((hdr_type & 0x7f) != PCI_HEADER_TYPE_BRIDGE)
@@ -814,8 +808,8 @@
 		if (ret)
 			goto err0;
 	} else {
-		fsl_setup_indirect_pci(hose, rsrc_cfg.start,
-				       rsrc_cfg.start + 4, 0);
+		setup_indirect_pci(hose, rsrc_cfg.start,
+				   rsrc_cfg.start + 4, 0);
 	}
 
 	printk(KERN_INFO "Found FSL PCI host bridge at 0x%016llx. "

diff --git a/arch/s390/include/asm/dma-mapping.h b/arch/s390/include/asm/dma-mapping.h
index 886ac7d..2f8c1ab 100644
--- a/arch/s390/include/asm/dma-mapping.h
+++ b/arch/s390/include/asm/dma-mapping.h

@@ -50,9 +50,10 @@
 {
 	struct dma_map_ops *dma_ops = get_dma_ops(dev);
 
+	debug_dma_mapping_error(dev, dma_addr);
 	if (dma_ops->mapping_error)
 		return dma_ops->mapping_error(dev, dma_addr);
-	return (dma_addr == 0UL);
+	return (dma_addr == DMA_ERROR_CODE);
 }
 
 static inline void *dma_alloc_coherent(struct device *dev, size_t size,

diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index e8b6e5b..9aefa3c 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h

@@ -58,9 +58,6 @@
 #define __HAVE_COLOR_ZERO_PAGE
 
 /* TODO: s390 cannot support io_remap_pfn_range... */
-#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) 	       \
-	remap_pfn_range(vma, vaddr, pfn, size, prot)
-
 #endif /* !__ASSEMBLY__ */
 
 /*

diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c
index d8a6a38..feb719d 100644
--- a/arch/s390/kernel/ipl.c
+++ b/arch/s390/kernel/ipl.c

@@ -754,9 +754,9 @@
 	.write = reipl_fcp_scpdata_write,
 };
 
-DEFINE_IPL_ATTR_RW(reipl_fcp, wwpn, "0x%016llx\n", "%016llx\n",
+DEFINE_IPL_ATTR_RW(reipl_fcp, wwpn, "0x%016llx\n", "%llx\n",
 		   reipl_block_fcp->ipl_info.fcp.wwpn);
-DEFINE_IPL_ATTR_RW(reipl_fcp, lun, "0x%016llx\n", "%016llx\n",
+DEFINE_IPL_ATTR_RW(reipl_fcp, lun, "0x%016llx\n", "%llx\n",
 		   reipl_block_fcp->ipl_info.fcp.lun);
 DEFINE_IPL_ATTR_RW(reipl_fcp, bootprog, "%lld\n", "%lld\n",
 		   reipl_block_fcp->ipl_info.fcp.bootprog);
@@ -1323,9 +1323,9 @@
 
 /* FCP dump device attributes */
 
-DEFINE_IPL_ATTR_RW(dump_fcp, wwpn, "0x%016llx\n", "%016llx\n",
+DEFINE_IPL_ATTR_RW(dump_fcp, wwpn, "0x%016llx\n", "%llx\n",
 		   dump_block_fcp->ipl_info.fcp.wwpn);
-DEFINE_IPL_ATTR_RW(dump_fcp, lun, "0x%016llx\n", "%016llx\n",
+DEFINE_IPL_ATTR_RW(dump_fcp, lun, "0x%016llx\n", "%llx\n",
 		   dump_block_fcp->ipl_info.fcp.lun);
 DEFINE_IPL_ATTR_RW(dump_fcp, bootprog, "%lld\n", "%lld\n",
 		   dump_block_fcp->ipl_info.fcp.bootprog);

diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c
index 408e866..dd3c199 100644
--- a/arch/s390/kernel/irq.c
+++ b/arch/s390/kernel/irq.c

@@ -312,6 +312,7 @@
 }
 EXPORT_SYMBOL(measurement_alert_subclass_unregister);
 
+#ifdef CONFIG_SMP
 void synchronize_irq(unsigned int irq)
 {
 	/*
@@ -320,6 +321,7 @@
 	 */
 }
 EXPORT_SYMBOL_GPL(synchronize_irq);
+#endif
 
 #ifndef CONFIG_PCI
 

diff --git a/arch/s390/mm/mem_detect.c b/arch/s390/mm/mem_detect.c
index 3cbd3b8..cca3882 100644
--- a/arch/s390/mm/mem_detect.c
+++ b/arch/s390/mm/mem_detect.c

@@ -123,7 +123,8 @@
 			continue;
 		} else if ((addr <= chunk->addr) &&
 			   (addr + size >= chunk->addr + chunk->size)) {
-			memset(chunk, 0 , sizeof(*chunk));
+			memmove(chunk, chunk + 1, (MEMORY_CHUNKS-i-1) * sizeof(*chunk));
+			memset(&mem_chunk[MEMORY_CHUNKS-1], 0, sizeof(*chunk));
 		} else if (addr + size < chunk->addr + chunk->size) {
 			chunk->size =  chunk->addr + chunk->size - addr - size;
 			chunk->addr = addr + size;

diff --git a/arch/score/include/asm/pgtable.h b/arch/score/include/asm/pgtable.h
index 2fd4698..db96ad9 100644
--- a/arch/score/include/asm/pgtable.h
+++ b/arch/score/include/asm/pgtable.h

@@ -113,9 +113,6 @@
 #define pte_clear(mm, addr, xp)		\
 	do { set_pte_at(mm, addr, xp, __pte(0)); } while (0)
 
-#define io_remap_pfn_range(vma, vaddr, pfn, size, prot)		\
-		remap_pfn_range(vma, vaddr, pfn, size, prot)
-
 /*
  * The "pgd_xxx()" functions here are trivial for a folded two-level
  * setup: the pgd is never bad, and a pmd always exists (as it's folded

diff --git a/arch/sh/include/asm/pgtable.h b/arch/sh/include/asm/pgtable.h
index 9210e93..cf434c6 100644
--- a/arch/sh/include/asm/pgtable.h
+++ b/arch/sh/include/asm/pgtable.h

@@ -124,9 +124,6 @@
 
 #define kern_addr_valid(addr)	(1)
 
-#define io_remap_pfn_range(vma, vaddr, pfn, size, prot)		\
-		remap_pfn_range(vma, vaddr, pfn, size, prot)
-
 #define pte_pfn(x)		((unsigned long)(((x).pte_low >> PAGE_SHIFT)))
 
 /*

diff --git a/arch/sparc/include/asm/pgtable_32.h b/arch/sparc/include/asm/pgtable_32.h
index 6fc1348..502f632 100644
--- a/arch/sparc/include/asm/pgtable_32.h
+++ b/arch/sparc/include/asm/pgtable_32.h

@@ -443,6 +443,7 @@
 
 	return remap_pfn_range(vma, from, phys_base >> PAGE_SHIFT, size, prot);
 }
+#define io_remap_pfn_range io_remap_pfn_range 
 
 #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
 #define ptep_set_access_flags(__vma, __address, __ptep, __entry, __dirty) \

diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
index 7619f2f..79c214e 100644
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h

@@ -914,6 +914,7 @@
 
 	return remap_pfn_range(vma, from, phys_base >> PAGE_SHIFT, size, prot);
 }
+#define io_remap_pfn_range io_remap_pfn_range 
 
 #include <asm/tlbflush.h>
 #include <asm-generic/pgtable.h>

diff --git a/arch/sparc/kernel/pci.c b/arch/sparc/kernel/pci.c
index baf4366..2031c65 100644
--- a/arch/sparc/kernel/pci.c
+++ b/arch/sparc/kernel/pci.c

@@ -773,15 +773,6 @@
 	return 0;
 }
 
-/* Set vm_flags of VMA, as appropriate for this architecture, for a pci device
- * mapping.
- */
-static void __pci_mmap_set_flags(struct pci_dev *dev, struct vm_area_struct *vma,
-					    enum pci_mmap_state mmap_state)
-{
-	vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP;
-}
-
 /* Set vm_page_prot of VMA, as appropriate for this architecture, for a pci
  * device mapping.
  */
@@ -809,7 +800,6 @@
 	if (ret < 0)
 		return ret;
 
-	__pci_mmap_set_flags(dev, vma, mmap_state);
 	__pci_mmap_set_pgprot(dev, vma, mmap_state);
 
 	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);

diff --git a/arch/tile/include/asm/pgtable.h b/arch/tile/include/asm/pgtable.h
index 73b1a4c..33587f1 100644
--- a/arch/tile/include/asm/pgtable.h
+++ b/arch/tile/include/asm/pgtable.h

@@ -362,9 +362,6 @@
 #define kern_addr_valid(addr)	(1)
 #endif /* CONFIG_FLATMEM */
 
-#define io_remap_pfn_range(vma, vaddr, pfn, size, prot)		\
-		remap_pfn_range(vma, vaddr, pfn, size, prot)
-
 extern void vmalloc_sync_all(void);
 
 #endif /* !__ASSEMBLY__ */

diff --git a/arch/um/include/asm/pgtable.h b/arch/um/include/asm/pgtable.h
index ae02909..bf974f7 100644
--- a/arch/um/include/asm/pgtable.h
+++ b/arch/um/include/asm/pgtable.h

@@ -69,8 +69,6 @@
 #define PAGE_KERNEL	__pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED)
 #define PAGE_KERNEL_EXEC	__pgprot(__PAGE_KERNEL_EXEC)
 
-#define io_remap_pfn_range	remap_pfn_range
-
 /*
  * The i386 can't do page protection for execute, and considers that the same
  * are read.

diff --git a/arch/unicore32/include/asm/pgtable.h b/arch/unicore32/include/asm/pgtable.h
index 68b2f29..233c258 100644
--- a/arch/unicore32/include/asm/pgtable.h
+++ b/arch/unicore32/include/asm/pgtable.h

@@ -303,13 +303,6 @@
 
 #include <asm-generic/pgtable.h>
 
-/*
- * remap a physical page `pfn' of size `size' with page protection `prot'
- * into virtual address `from'
- */
-#define io_remap_pfn_range(vma, from, pfn, size, prot)	\
-		remap_pfn_range(vma, from, pfn, size, prot)
-
 #define pgtable_cache_init() do { } while (0)
 
 #endif /* !__ASSEMBLY__ */

diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 1e67223..5b0818b 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h

@@ -506,9 +506,6 @@
 	return npg >> (20 - PAGE_SHIFT);
 }
 
-#define io_remap_pfn_range(vma, vaddr, pfn, size, prot)	\
-	remap_pfn_range(vma, vaddr, pfn, size, prot)
-
 #if PAGETABLE_LEVELS > 2
 static inline int pud_none(pud_t pud)
 {

diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 9895a9a..211bce4 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c

@@ -365,10 +365,14 @@
 	return insn.length;
 }
 
-static void __kprobes arch_copy_kprobe(struct kprobe *p)
+static int __kprobes arch_copy_kprobe(struct kprobe *p)
 {
+	int ret;
+
 	/* Copy an instruction with recovering if other optprobe modifies it.*/
-	__copy_instruction(p->ainsn.insn, p->addr);
+	ret = __copy_instruction(p->ainsn.insn, p->addr);
+	if (!ret)
+		return -EINVAL;
 
 	/*
 	 * __copy_instruction can modify the displacement of the instruction,
@@ -384,6 +388,8 @@
 
 	/* Also, displacement change doesn't affect the first byte */
 	p->opcode = p->ainsn.insn[0];
+
+	return 0;
 }
 
 int __kprobes arch_prepare_kprobe(struct kprobe *p)
@@ -397,8 +403,8 @@
 	p->ainsn.insn = get_insn_slot();
 	if (!p->ainsn.insn)
 		return -ENOMEM;
-	arch_copy_kprobe(p);
-	return 0;
+
+	return arch_copy_kprobe(p);
 }
 
 void __kprobes arch_arm_kprobe(struct kprobe *p)

diff --git a/arch/xtensa/include/asm/pgtable.h b/arch/xtensa/include/asm/pgtable.h
index d7546c9..8f017eb 100644
--- a/arch/xtensa/include/asm/pgtable.h
+++ b/arch/xtensa/include/asm/pgtable.h

@@ -393,14 +393,6 @@
 extern  void update_mmu_cache(struct vm_area_struct * vma,
 			      unsigned long address, pte_t *ptep);
 
-/*
- * remap a physical page `pfn' of size `size' with page protection `prot'
- * into virtual address `from'
- */
-
-#define io_remap_pfn_range(vma,from,pfn,size,prot) \
-	remap_pfn_range(vma, from, pfn, size, prot)
-
 typedef pte_t *pte_addr_t;
 
 #endif /* !defined (__ASSEMBLY__) */

diff --git a/crypto/algboss.c b/crypto/algboss.c
index 769219b..76fc0b2 100644
--- a/crypto/algboss.c
+++ b/crypto/algboss.c

@@ -45,10 +45,9 @@
 		} nu32;
 	} attrs[CRYPTO_MAX_ATTRS];
 
-	char larval[CRYPTO_MAX_ALG_NAME];
 	char template[CRYPTO_MAX_ALG_NAME];
 
-	struct completion *completion;
+	struct crypto_larval *larval;
 
 	u32 otype;
 	u32 omask;
@@ -87,7 +86,8 @@
 	crypto_tmpl_put(tmpl);
 
 out:
-	complete_all(param->completion);
+	complete_all(&param->larval->completion);
+	crypto_alg_put(&param->larval->alg);
 	kfree(param);
 	module_put_and_exit(0);
 }
@@ -187,18 +187,19 @@
 	param->otype = larval->alg.cra_flags;
 	param->omask = larval->mask;
 
-	memcpy(param->larval, larval->alg.cra_name, CRYPTO_MAX_ALG_NAME);
-
-	param->completion = &larval->completion;
+	crypto_alg_get(&larval->alg);
+	param->larval = larval;
 
 	thread = kthread_run(cryptomgr_probe, param, "cryptomgr_probe");
 	if (IS_ERR(thread))
-		goto err_free_param;
+		goto err_put_larval;
 
 	wait_for_completion_interruptible(&larval->completion);
 
 	return NOTIFY_STOP;
 
+err_put_larval:
+	crypto_alg_put(&larval->alg);
 err_free_param:
 	kfree(param);
 err_put_module:

diff --git a/crypto/api.c b/crypto/api.c
index 033a714..3b61803 100644
--- a/crypto/api.c
+++ b/crypto/api.c

@@ -34,12 +34,6 @@
 BLOCKING_NOTIFIER_HEAD(crypto_chain);
 EXPORT_SYMBOL_GPL(crypto_chain);
 
-static inline struct crypto_alg *crypto_alg_get(struct crypto_alg *alg)
-{
-	atomic_inc(&alg->cra_refcnt);
-	return alg;
-}
-
 struct crypto_alg *crypto_mod_get(struct crypto_alg *alg)
 {
 	return try_module_get(alg->cra_module) ? crypto_alg_get(alg) : NULL;

diff --git a/crypto/internal.h b/crypto/internal.h
index 9ebedae3f..bd39bfc 100644
--- a/crypto/internal.h
+++ b/crypto/internal.h

@@ -103,6 +103,12 @@
 int crypto_unregister_notifier(struct notifier_block *nb);
 int crypto_probing_notify(unsigned long val, void *v);
 
+static inline struct crypto_alg *crypto_alg_get(struct crypto_alg *alg)
+{
+	atomic_inc(&alg->cra_refcnt);	/* take one reference on @alg */
+	return alg;			/* same pointer, usable in expressions */
+}
+
 static inline void crypto_alg_put(struct crypto_alg *alg)
 {
 	if (atomic_dec_and_test(&alg->cra_refcnt) && alg->cra_destroy)

diff --git a/drivers/Kconfig b/drivers/Kconfig
index 9953a42..ae050b5 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig

@@ -166,4 +166,6 @@
 
 source "drivers/reset/Kconfig"
 
+source "drivers/fmc/Kconfig"
+
 endmenu

diff --git a/drivers/Makefile b/drivers/Makefile
index 130abc1..336b0ad 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile

@@ -152,3 +152,4 @@
 obj-$(CONFIG_VME_BUS)		+= vme/
 obj-$(CONFIG_IPACK_BUS)		+= ipack/
 obj-$(CONFIG_NTB)		+= ntb/
+obj-$(CONFIG_FMC)		+= fmc/

diff --git a/drivers/acpi/dock.c b/drivers/acpi/dock.c
index ec117c6..14de9f4 100644
--- a/drivers/acpi/dock.c
+++ b/drivers/acpi/dock.c

@@ -66,20 +66,21 @@
 	spinlock_t dd_lock;
 	struct mutex hp_lock;
 	struct list_head dependent_devices;
-	struct list_head hotplug_devices;
 
 	struct list_head sibling;
 	struct platform_device *dock_device;
 };
 static LIST_HEAD(dock_stations);
 static int dock_station_count;
+static DEFINE_MUTEX(hotplug_lock);
 
 struct dock_dependent_device {
 	struct list_head list;
-	struct list_head hotplug_list;
 	acpi_handle handle;
-	const struct acpi_dock_ops *ops;
-	void *context;
+	const struct acpi_dock_ops *hp_ops;
+	void *hp_context;
+	unsigned int hp_refcount;
+	void (*hp_release)(void *);
 };
 
 #define DOCK_DOCKING	0x00000001
@@ -111,7 +112,6 @@
 
 	dd->handle = handle;
 	INIT_LIST_HEAD(&dd->list);
-	INIT_LIST_HEAD(&dd->hotplug_list);
 
 	spin_lock(&ds->dd_lock);
 	list_add_tail(&dd->list, &ds->dependent_devices);
@@ -121,35 +121,90 @@
 }
 
 /**
- * dock_add_hotplug_device - associate a hotplug handler with the dock station
- * @ds: The dock station
- * @dd: The dependent device struct
- *
- * Add the dependent device to the dock's hotplug device list
+ * dock_init_hotplug - Initialize a hotplug device on a docking station.
+ * @dd: Dock-dependent device.
+ * @ops: Dock operations to attach to the dependent device.
+ * @context: Data to pass to the @ops callbacks and @release.
+ * @init: Optional initialization routine to run after setting up context.
+ * @release: Optional release routine to run on removal.
  */
-static void
-dock_add_hotplug_device(struct dock_station *ds,
-			struct dock_dependent_device *dd)
+static int dock_init_hotplug(struct dock_dependent_device *dd,
+			     const struct acpi_dock_ops *ops, void *context,
+			     void (*init)(void *), void (*release)(void *))
 {
-	mutex_lock(&ds->hp_lock);
-	list_add_tail(&dd->hotplug_list, &ds->hotplug_devices);
-	mutex_unlock(&ds->hp_lock);
+	int ret = 0;
+
+	mutex_lock(&hotplug_lock);
+
+	if (dd->hp_context) {	/* non-NULL context: @dd already registered */
+		ret = -EEXIST;
+	} else {
+		dd->hp_refcount = 1;	/* dropped via dock_release_hotplug() */
+		dd->hp_ops = ops;
+		dd->hp_context = context;
+		dd->hp_release = release;
+	}
+
+	if (!WARN_ON(ret) && init)	/* warn on -EEXIST and skip @init */
+		init(context);	/* runs with hotplug_lock still held */
+
+	mutex_unlock(&hotplug_lock);
+	return ret;	/* 0 on success, -EEXIST if already registered */
 }
 
 /**
- * dock_del_hotplug_device - remove a hotplug handler from the dock station
- * @ds: The dock station
- * @dd: the dependent device struct
+ * dock_release_hotplug - Decrement hotplug reference counter of dock device.
+ * @dd: Dock-dependent device.
  *
- * Delete the dependent device from the dock's hotplug device list
+ * Decrement the reference counter of @dd and if 0, detach its hotplug
+ * operations from it, reset its context pointer and run the optional release
+ * routine if present.
  */
-static void
-dock_del_hotplug_device(struct dock_station *ds,
-			struct dock_dependent_device *dd)
+static void dock_release_hotplug(struct dock_dependent_device *dd)
 {
-	mutex_lock(&ds->hp_lock);
-	list_del(&dd->hotplug_list);
-	mutex_unlock(&ds->hp_lock);
+	void (*release)(void *) = NULL;
+	void *context = NULL;
+
+	mutex_lock(&hotplug_lock);
+
+	if (dd->hp_context && !--dd->hp_refcount) {	/* last reference gone */
+		dd->hp_ops = NULL;
+		context = dd->hp_context;	/* saved for the release call */
+		dd->hp_context = NULL;	/* marks @dd as unregistered */
+		release = dd->hp_release;
+		dd->hp_release = NULL;
+	}
+
+	if (release && context)
+		release(context);	/* called with hotplug_lock held */
+
+	mutex_unlock(&hotplug_lock);
+}
+
+static void dock_hotplug_event(struct dock_dependent_device *dd, u32 event,
+			       bool uevent)
+{	/* pass @event to @dd's ->uevent (if @uevent) or ->handler callback */
+	acpi_notify_handler cb = NULL;
+	bool run = false;
+
+	mutex_lock(&hotplug_lock);
+
+	if (dd->hp_context) {	/* only devices with registered hotplug ops */
+		run = true;
+		dd->hp_refcount++;	/* keeps hp_context set during the call */
+		if (dd->hp_ops)
+			cb = uevent ? dd->hp_ops->uevent : dd->hp_ops->handler;
+	}
+
+	mutex_unlock(&hotplug_lock);	/* callback runs without the lock */
+
+	if (!run)	/* nothing registered, no reference was taken */
+		return;
+
+	if (cb)
+		cb(dd->handle, event, dd->hp_context);
+
+	dock_release_hotplug(dd);	/* drop the reference taken above */
 }
 
 /**
@@ -360,9 +415,8 @@
 	/*
 	 * First call driver specific hotplug functions
 	 */
-	list_for_each_entry(dd, &ds->hotplug_devices, hotplug_list)
-		if (dd->ops && dd->ops->handler)
-			dd->ops->handler(dd->handle, event, dd->context);
+	list_for_each_entry(dd, &ds->dependent_devices, list)
+		dock_hotplug_event(dd, event, false);
 
 	/*
 	 * Now make sure that an acpi_device is created for each
@@ -398,9 +452,8 @@
 	if (num == DOCK_EVENT)
 		kobject_uevent_env(&dev->kobj, KOBJ_CHANGE, envp);
 
-	list_for_each_entry(dd, &ds->hotplug_devices, hotplug_list)
-		if (dd->ops && dd->ops->uevent)
-			dd->ops->uevent(dd->handle, event, dd->context);
+	list_for_each_entry(dd, &ds->dependent_devices, list)
+		dock_hotplug_event(dd, event, true);
 
 	if (num != DOCK_EVENT)
 		kobject_uevent_env(&dev->kobj, KOBJ_CHANGE, envp);
@@ -570,19 +623,24 @@
  * @handle: the handle of the device
  * @ops: handlers to call after docking
  * @context: device specific data
+ * @init: Optional initialization routine to run after registration
+ * @release: Optional release routine to run on unregistration
  *
  * If a driver would like to perform a hotplug operation after a dock
  * event, they can register an acpi_notifiy_handler to be called by
  * the dock driver after _DCK is executed.
  */
-int
-register_hotplug_dock_device(acpi_handle handle, const struct acpi_dock_ops *ops,
-			     void *context)
+int register_hotplug_dock_device(acpi_handle handle,
+				 const struct acpi_dock_ops *ops, void *context,
+				 void (*init)(void *), void (*release)(void *))
 {
 	struct dock_dependent_device *dd;
 	struct dock_station *dock_station;
 	int ret = -EINVAL;
 
+	if (WARN_ON(!context))
+		return -EINVAL;
+
 	if (!dock_station_count)
 		return -ENODEV;
 
@@ -597,12 +655,8 @@
 		 * ops
 		 */
 		dd = find_dock_dependent_device(dock_station, handle);
-		if (dd) {
-			dd->ops = ops;
-			dd->context = context;
-			dock_add_hotplug_device(dock_station, dd);
+		if (dd && !dock_init_hotplug(dd, ops, context, init, release))
 			ret = 0;
-		}
 	}
 
 	return ret;
@@ -624,7 +678,7 @@
 	list_for_each_entry(dock_station, &dock_stations, sibling) {
 		dd = find_dock_dependent_device(dock_station, handle);
 		if (dd)
-			dock_del_hotplug_device(dock_station, dd);
+			dock_release_hotplug(dd);
 	}
 }
 EXPORT_SYMBOL_GPL(unregister_hotplug_dock_device);
@@ -953,7 +1007,6 @@
 	mutex_init(&dock_station->hp_lock);
 	spin_lock_init(&dock_station->dd_lock);
 	INIT_LIST_HEAD(&dock_station->sibling);
-	INIT_LIST_HEAD(&dock_station->hotplug_devices);
 	ATOMIC_INIT_NOTIFIER_HEAD(&dock_notifier_list);
 	INIT_LIST_HEAD(&dock_station->dependent_devices);
 
@@ -994,30 +1047,6 @@
 }
 
 /**
- * dock_remove - free up resources related to the dock station
- */
-static int dock_remove(struct dock_station *ds)
-{
-	struct dock_dependent_device *dd, *tmp;
-	struct platform_device *dock_device = ds->dock_device;
-
-	if (!dock_station_count)
-		return 0;
-
-	/* remove dependent devices */
-	list_for_each_entry_safe(dd, tmp, &ds->dependent_devices, list)
-		kfree(dd);
-
-	list_del(&ds->sibling);
-
-	/* cleanup sysfs */
-	sysfs_remove_group(&dock_device->dev.kobj, &dock_attribute_group);
-	platform_device_unregister(dock_device);
-
-	return 0;
-}
-
-/**
  * find_dock_and_bay - look for dock stations and bays
  * @handle: acpi handle of a device
  * @lvl: unused
@@ -1035,7 +1064,7 @@
 	return AE_OK;
 }
 
-static int __init dock_init(void)
+int __init acpi_dock_init(void)
 {
 	if (acpi_disabled)
 		return 0;
@@ -1054,19 +1083,3 @@
 		ACPI_DOCK_DRIVER_DESCRIPTION, dock_station_count);
 	return 0;
 }
-
-static void __exit dock_exit(void)
-{
-	struct dock_station *tmp, *dock_station;
-
-	unregister_acpi_bus_notifier(&dock_acpi_notifier);
-	list_for_each_entry_safe(dock_station, tmp, &dock_stations, sibling)
-		dock_remove(dock_station);
-}
-
-/*
- * Must be called before drivers of devices in dock, otherwise we can't know
- * which devices are in a dock
- */
-subsys_initcall(dock_init);
-module_exit(dock_exit);

diff --git a/drivers/acpi/internal.h b/drivers/acpi/internal.h
index 297cbf4..c610a76 100644
--- a/drivers/acpi/internal.h
+++ b/drivers/acpi/internal.h

@@ -40,6 +40,11 @@
 #else
 static inline void acpi_container_init(void) {}
 #endif
+#ifdef CONFIG_ACPI_DOCK
+int acpi_dock_init(void);	/* return type matches the definition in dock.c */
+#else
+static inline int acpi_dock_init(void) { return 0; }	/* dock support off */
+#endif
 #ifdef CONFIG_ACPI_HOTPLUG_MEMORY
 void acpi_memory_hotplug_init(void);
 #else

diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c
index b14ac46..27da630 100644
--- a/drivers/acpi/scan.c
+++ b/drivers/acpi/scan.c

@@ -2042,6 +2042,7 @@
 	acpi_lpss_init();
 	acpi_container_init();
 	acpi_memory_hotplug_init();
+	acpi_dock_init();
 
 	mutex_lock(&acpi_scan_lock);
 	/*

diff --git a/drivers/ata/libata-acpi.c b/drivers/ata/libata-acpi.c
index 87f2f39..cf4e702 100644
--- a/drivers/ata/libata-acpi.c
+++ b/drivers/ata/libata-acpi.c

@@ -156,8 +156,10 @@
 
 	spin_unlock_irqrestore(ap->lock, flags);
 
-	if (wait)
+	if (wait) {
 		ata_port_wait_eh(ap);
+		flush_work(&ap->hotplug_task.work);
+	}
 }
 
 static void ata_acpi_dev_notify_dock(acpi_handle handle, u32 event, void *data)
@@ -214,6 +216,39 @@
 	.uevent = ata_acpi_ap_uevent,
 };
 
+void ata_acpi_hotplug_init(struct ata_host *host)
+{	/* register @host's ports and devices that have ACPI handles with dock */
+	int i;
+
+	for (i = 0; i < host->n_ports; i++) {
+		struct ata_port *ap = host->ports[i];
+		acpi_handle handle;
+		struct ata_device *dev;
+
+		if (!ap)	/* no port in this slot */
+			continue;
+
+		handle = ata_ap_acpi_handle(ap);
+		if (handle) {
+			/* we might be on a docking station */
+			register_hotplug_dock_device(handle,
+						     &ata_acpi_ap_dock_ops, ap,
+						     NULL, NULL);	/* no init/release; result ignored */
+		}
+
+		ata_for_each_dev(dev, &ap->link, ALL) {
+			handle = ata_dev_acpi_handle(dev);
+			if (!handle)	/* device has no ACPI handle */
+				continue;
+
+			/* we might be on a docking station */
+			register_hotplug_dock_device(handle,
+						     &ata_acpi_dev_dock_ops,
+						     dev, NULL, NULL);	/* result ignored */
+		}
+	}
+}
+
 /**
  * ata_acpi_dissociate - dissociate ATA host from ACPI objects
  * @host: target ATA host

diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index f218427..adf002a3 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c

@@ -6148,6 +6148,8 @@
 	if (rc)
 		goto err_tadd;
 
+	ata_acpi_hotplug_init(host);
+
 	/* set cable, sata_spd_limit and report */
 	for (i = 0; i < host->n_ports; i++) {
 		struct ata_port *ap = host->ports[i];

diff --git a/drivers/ata/libata.h b/drivers/ata/libata.h
index c949dd3..577d902 100644
--- a/drivers/ata/libata.h
+++ b/drivers/ata/libata.h

@@ -122,6 +122,7 @@
 extern void ata_acpi_unregister(void);
 extern void ata_acpi_bind(struct ata_device *dev);
 extern void ata_acpi_unbind(struct ata_device *dev);
+extern void ata_acpi_hotplug_init(struct ata_host *host);
 #else
 static inline void ata_acpi_dissociate(struct ata_host *host) { }
 static inline int ata_acpi_on_suspend(struct ata_port *ap) { return 0; }
@@ -134,6 +135,7 @@
 static inline void ata_acpi_unregister(void) { }
 static inline void ata_acpi_bind(struct ata_device *dev) { }
 static inline void ata_acpi_unbind(struct ata_device *dev) { }
+static inline void ata_acpi_hotplug_init(struct ata_host *host) {}
 #endif
 
 /* libata-scsi.c */

diff --git a/drivers/block/cryptoloop.c b/drivers/block/cryptoloop.c
index 8b6bb76..99e773c 100644
--- a/drivers/block/cryptoloop.c
+++ b/drivers/block/cryptoloop.c

@@ -25,9 +25,9 @@
 #include <linux/string.h>
 #include <linux/crypto.h>
 #include <linux/blkdev.h>
-#include <linux/loop.h>
 #include <linux/scatterlist.h>
 #include <asm/uaccess.h>
+#include "loop.h"
 
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("loop blockdevice transferfunction adaptor / CryptoAPI");

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index d92d50f..40e7155 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c

@@ -63,7 +63,6 @@
 #include <linux/init.h>
 #include <linux/swap.h>
 #include <linux/slab.h>
-#include <linux/loop.h>
 #include <linux/compat.h>
 #include <linux/suspend.h>
 #include <linux/freezer.h>
@@ -76,6 +75,7 @@
 #include <linux/sysfs.h>
 #include <linux/miscdevice.h>
 #include <linux/falloc.h>
+#include "loop.h"
 
 #include <asm/uaccess.h>
 

diff --git a/drivers/block/loop.h b/drivers/block/loop.h
new file mode 100644
index 0000000..90df5d6
--- /dev/null
+++ b/drivers/block/loop.h

@@ -0,0 +1,85 @@
+/*
+ * loop.h
+ *
+ * Written by Theodore Ts'o, 3/29/93.
+ *
+ * Copyright 1993 by Theodore Ts'o.  Redistribution of this file is
+ * permitted under the GNU General Public License.
+ */
+#ifndef _LINUX_LOOP_H
+#define _LINUX_LOOP_H
+
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include <uapi/linux/loop.h>
+
+/* Possible states of device */
+enum {
+	Lo_unbound,
+	Lo_bound,
+	Lo_rundown,
+};
+
+struct loop_func_table;
+
+struct loop_device {
+	int		lo_number;
+	int		lo_refcnt;
+	loff_t		lo_offset;
+	loff_t		lo_sizelimit;
+	int		lo_flags;
+	int		(*transfer)(struct loop_device *, int cmd,
+				    struct page *raw_page, unsigned raw_off,
+				    struct page *loop_page, unsigned loop_off,
+				    int size, sector_t real_block);
+	char		lo_file_name[LO_NAME_SIZE];
+	char		lo_crypt_name[LO_NAME_SIZE];
+	char		lo_encrypt_key[LO_KEY_SIZE];
+	int		lo_encrypt_key_size;
+	struct loop_func_table *lo_encryption;
+	__u32           lo_init[2];
+	kuid_t		lo_key_owner;	/* Who set the key */
+	int		(*ioctl)(struct loop_device *, int cmd, 
+				 unsigned long arg); 
+
+	struct file *	lo_backing_file;
+	struct block_device *lo_device;
+	unsigned	lo_blocksize;
+	void		*key_data; 
+
+	gfp_t		old_gfp_mask;
+
+	spinlock_t		lo_lock;
+	struct bio_list		lo_bio_list;
+	unsigned int		lo_bio_count;
+	int			lo_state;
+	struct mutex		lo_ctl_mutex;
+	struct task_struct	*lo_thread;
+	wait_queue_head_t	lo_event;
+	/* wait queue for incoming requests */
+	wait_queue_head_t	lo_req_wait;
+
+	struct request_queue	*lo_queue;
+	struct gendisk		*lo_disk;
+};
+
+/* Support for loadable transfer modules */
+struct loop_func_table {
+	int number;	/* filter type */ 
+	int (*transfer)(struct loop_device *lo, int cmd,
+			struct page *raw_page, unsigned raw_off,
+			struct page *loop_page, unsigned loop_off,
+			int size, sector_t real_block);
+	int (*init)(struct loop_device *, const struct loop_info64 *); 
+	/* release is called from loop_unregister_transfer or clr_fd */
+	int (*release)(struct loop_device *); 
+	int (*ioctl)(struct loop_device *, int cmd, unsigned long arg);
+	struct module *owner;
+}; 
+
+int loop_register_transfer(struct loop_func_table *funcs);
+int loop_unregister_transfer(int number); 
+
+#endif

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 49394e3..aff789d 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c

@@ -2252,13 +2252,17 @@
 					obj_request->pages, length,
 					offset & ~PAGE_MASK, false, false);
 
+		/*
+		 * set obj_request->img_request before formatting
+		 * the osd_request so that it gets the right snapc
+		 */
+		rbd_img_obj_request_add(img_request, obj_request);
 		if (write_request)
 			rbd_osd_req_format_write(obj_request);
 		else
 			rbd_osd_req_format_read(obj_request);
 
 		obj_request->img_offset = img_offset;
-		rbd_img_obj_request_add(img_request, obj_request);
 
 		img_offset += length;
 		resid -= length;
@@ -4243,6 +4247,10 @@
 
 	down_write(&rbd_dev->header_rwsem);
 
+	ret = rbd_dev_v2_image_size(rbd_dev);
+	if (ret)
+		goto out;
+
 	if (first_time) {
 		ret = rbd_dev_v2_header_onetime(rbd_dev);
 		if (ret)
@@ -4276,10 +4284,6 @@
 					"is EXPERIMENTAL!");
 	}
 
-	ret = rbd_dev_v2_image_size(rbd_dev);
-	if (ret)
-		goto out;
-
 	if (rbd_dev->spec->snap_id == CEPH_NOSNAP)
 		if (rbd_dev->mapping.size != rbd_dev->header.image_size)
 			rbd_dev->mapping.size = rbd_dev->header.image_size;

diff --git a/drivers/bluetooth/btmrvl_main.c b/drivers/bluetooth/btmrvl_main.c
index 3a4343b..9a9f518 100644
--- a/drivers/bluetooth/btmrvl_main.c
+++ b/drivers/bluetooth/btmrvl_main.c

@@ -498,6 +498,10 @@
 		add_wait_queue(&thread->wait_q, &wait);
 
 		set_current_state(TASK_INTERRUPTIBLE);
+		if (kthread_should_stop()) {
+			BT_DBG("main_thread: break from main thread");
+			break;
+		}
 
 		if (adapter->wakeup_tries ||
 				((!adapter->int_count) &&
@@ -513,11 +517,6 @@
 
 		BT_DBG("main_thread woke up");
 
-		if (kthread_should_stop()) {
-			BT_DBG("main_thread: break from main thread");
-			break;
-		}
-
 		spin_lock_irqsave(&priv->driver_lock, flags);
 		if (adapter->int_count) {
 			adapter->int_count = 0;

diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig
index 3bb6fa3..1421997 100644
--- a/drivers/char/Kconfig
+++ b/drivers/char/Kconfig

@@ -15,18 +15,6 @@
 	  kind of kernel debugging operations.
 	  When in doubt, say "N".
 
-config STALDRV
-	bool "Stallion multiport serial support"
-	depends on SERIAL_NONSTANDARD
-	help
-	  Stallion cards give you many serial ports.  You would need something
-	  like this to connect more than two modems to your Linux box, for
-	  instance in order to become a dial-in server.  If you say Y here,
-	  you will be asked for your specific card model in the next
-	  questions.  Make sure to read <file:Documentation/serial/stallion.txt>
-	  in this case.  If you have never heard about all this, it's safe to
-	  say N.
-
 config SGI_SNSC
 	bool "SGI Altix system controller communication support"
 	depends on (IA64_SGI_SN2 || IA64_GENERIC)

diff --git a/drivers/char/hpet.c b/drivers/char/hpet.c
index d784650..448ce5e 100644
--- a/drivers/char/hpet.c
+++ b/drivers/char/hpet.c

@@ -725,7 +725,7 @@
 	return 0;
 }
 
-static ctl_table hpet_table[] = {
+static struct ctl_table hpet_table[] = {
 	{
 	 .procname = "max-user-freq",
 	 .data = &hpet_max_freq,
@@ -736,7 +736,7 @@
 	{}
 };
 
-static ctl_table hpet_root[] = {
+static struct ctl_table hpet_root[] = {
 	{
 	 .procname = "hpet",
 	 .maxlen = 0,
@@ -746,7 +746,7 @@
 	{}
 };
 
-static ctl_table dev_root[] = {
+static struct ctl_table dev_root[] = {
 	{
 	 .procname = "dev",
 	 .maxlen = 0,

diff --git a/drivers/char/ipmi/ipmi_poweroff.c b/drivers/char/ipmi/ipmi_poweroff.c
index 2efa176..9f2e3be 100644
--- a/drivers/char/ipmi/ipmi_poweroff.c
+++ b/drivers/char/ipmi/ipmi_poweroff.c

@@ -659,7 +659,7 @@
 #ifdef CONFIG_PROC_FS
 #include <linux/sysctl.h>
 
-static ctl_table ipmi_table[] = {
+static struct ctl_table ipmi_table[] = {
 	{ .procname	= "poweroff_powercycle",
 	  .data		= &poweroff_powercycle,
 	  .maxlen	= sizeof(poweroff_powercycle),
@@ -668,14 +668,14 @@
 	{ }
 };
 
-static ctl_table ipmi_dir_table[] = {
+static struct ctl_table ipmi_dir_table[] = {
 	{ .procname	= "ipmi",
 	  .mode		= 0555,
 	  .child	= ipmi_table },
 	{ }
 };
 
-static ctl_table ipmi_root_table[] = {
+static struct ctl_table ipmi_root_table[] = {
 	{ .procname	= "dev",
 	  .mode		= 0555,
 	  .child	= ipmi_dir_table },

diff --git a/drivers/char/mem.c b/drivers/char/mem.c
index 1ccbe94..2ca6d78 100644
--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c

@@ -745,7 +745,7 @@
 		offset += file->f_pos;
 	case SEEK_SET:
 		/* to avoid userland mistaking f_pos=-9 as -EBADF=-9 */
-		if ((unsigned long long)offset >= ~0xFFFULL) {
+		if (IS_ERR_VALUE((unsigned long long)offset)) {
 			ret = -EOVERFLOW;
 			break;
 		}

diff --git a/drivers/char/mspec.c b/drivers/char/mspec.c
index e1f60f9..f1d7fa4 100644
--- a/drivers/char/mspec.c
+++ b/drivers/char/mspec.c

@@ -267,7 +267,7 @@
 	if ((vma->vm_flags & VM_WRITE) == 0)
 		return -EPERM;
 
-	pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
+	pages = vma_pages(vma);
 	vdata_size = sizeof(struct vma_data) + pages * sizeof(long);
 	if (vdata_size <= PAGE_SIZE)
 		vdata = kzalloc(vdata_size, GFP_KERNEL);

diff --git a/drivers/char/random.c b/drivers/char/random.c
index 35487e8..0d91fe5 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c

@@ -1381,10 +1381,10 @@
  * as an ASCII string in the standard UUID format.  If accesses via the
  * sysctl system call, it is returned as 16 bytes of binary data.
  */
-static int proc_do_uuid(ctl_table *table, int write,
+static int proc_do_uuid(struct ctl_table *table, int write,
 			void __user *buffer, size_t *lenp, loff_t *ppos)
 {
-	ctl_table fake_table;
+	struct ctl_table fake_table;
 	unsigned char buf[64], tmp_uuid[16], *uuid;
 
 	uuid = table->data;
@@ -1409,8 +1409,8 @@
 }
 
 static int sysctl_poolsize = INPUT_POOL_WORDS * 32;
-extern ctl_table random_table[];
-ctl_table random_table[] = {
+extern struct ctl_table random_table[];
+struct ctl_table random_table[] = {
 	{
 		.procname	= "poolsize",
 		.data		= &sysctl_poolsize,

diff --git a/drivers/char/rtc.c b/drivers/char/rtc.c
index 91470fd..c0cbbd4 100644
--- a/drivers/char/rtc.c
+++ b/drivers/char/rtc.c

@@ -280,7 +280,7 @@
 /*
  * sysctl-tuning infrastructure.
  */
-static ctl_table rtc_table[] = {
+static struct ctl_table rtc_table[] = {
 	{
 		.procname	= "max-user-freq",
 		.data		= &rtc_max_user_freq,
@@ -291,7 +291,7 @@
 	{ }
 };
 
-static ctl_table rtc_root[] = {
+static struct ctl_table rtc_root[] = {
 	{
 		.procname	= "rtc",
 		.mode		= 0555,
@@ -300,7 +300,7 @@
 	{ }
 };
 
-static ctl_table dev_root[] = {
+static struct ctl_table dev_root[] = {
 	{
 		.procname	= "dev",
 		.mode		= 0555,

diff --git a/drivers/char/xilinx_hwicap/xilinx_hwicap.h b/drivers/char/xilinx_hwicap/xilinx_hwicap.h
index d31ee23..38b145e 100644
--- a/drivers/char/xilinx_hwicap/xilinx_hwicap.h
+++ b/drivers/char/xilinx_hwicap/xilinx_hwicap.h

@@ -37,7 +37,7 @@
 #include <linux/cdev.h>
 #include <linux/platform_device.h>
 
-#include <asm/io.h>
+#include <linux/io.h>
 
 struct hwicap_drvdata {
 	u32 write_buffer_in_use;  /* Always in [0,3] */
@@ -85,7 +85,13 @@
 	void (*reset)(struct hwicap_drvdata *drvdata);
 };
 
-/* Number of times to poll the done regsiter */
+/* Number of times to poll the done register. This has to be large
+ * enough to allow an entire configuration to complete. If an entire
+ * page (4kb) is configured at once, that could take up to 4k cycles
+ * with a byte-wide icap interface. In most cases, this driver is
+ * used with a much smaller fifo, but this should be sufficient in the
+ * worst case.
+ */
 #define XHI_MAX_RETRIES     5000
 
 /************ Constant Definitions *************/

diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c
index 4b9bb5d..93eb5cb 100644
--- a/drivers/cpufreq/cpufreq_ondemand.c
+++ b/drivers/cpufreq/cpufreq_ondemand.c

@@ -47,6 +47,8 @@
 static struct cpufreq_governor cpufreq_gov_ondemand;
 #endif
 
+static unsigned int default_powersave_bias;
+
 static void ondemand_powersave_bias_init_cpu(int cpu)
 {
 	struct od_cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, cpu);
@@ -543,7 +545,7 @@
 
 	tuners->sampling_down_factor = DEF_SAMPLING_DOWN_FACTOR;
 	tuners->ignore_nice = 0;
-	tuners->powersave_bias = 0;
+	tuners->powersave_bias = default_powersave_bias;
 	tuners->io_is_busy = should_io_be_busy();
 
 	dbs_data->tuners = tuners;
@@ -585,6 +587,7 @@
 	unsigned int cpu;
 	cpumask_t done;
 
+	default_powersave_bias = powersave_bias;
 	cpumask_clear(&done);
 
 	get_online_cpus();
@@ -593,11 +596,17 @@
 			continue;
 
 		policy = per_cpu(od_cpu_dbs_info, cpu).cdbs.cur_policy;
-		dbs_data = policy->governor_data;
-		od_tuners = dbs_data->tuners;
-		od_tuners->powersave_bias = powersave_bias;
+		if (!policy)
+			continue;
 
 		cpumask_or(&done, &done, policy->cpus);
+
+		if (policy->governor != &cpufreq_gov_ondemand)
+			continue;
+
+		dbs_data = policy->governor_data;
+		od_tuners = dbs_data->tuners;
+		od_tuners->powersave_bias = default_powersave_bias;
 	}
 	put_online_cpus();
 }

diff --git a/drivers/extcon/Kconfig b/drivers/extcon/Kconfig
index 3297301..63f454e 100644
--- a/drivers/extcon/Kconfig
+++ b/drivers/extcon/Kconfig

@@ -53,4 +53,11 @@
 	  with Wolfson Arizona devices. These are audio CODECs with
 	  advanced audio accessory detection support.
 
+config EXTCON_PALMAS
+	tristate "Palmas USB EXTCON support"
+	depends on MFD_PALMAS
+	help
+	  Say Y here to enable support for USB peripheral and USB host
+	  detection by palmas usb.
+
 endif # MULTISTATE_SWITCH

diff --git a/drivers/extcon/Makefile b/drivers/extcon/Makefile
index f98a3c4..540e2c3 100644
--- a/drivers/extcon/Makefile
+++ b/drivers/extcon/Makefile

@@ -8,3 +8,4 @@
 obj-$(CONFIG_EXTCON_MAX77693)	+= extcon-max77693.o
 obj-$(CONFIG_EXTCON_MAX8997)	+= extcon-max8997.o
 obj-$(CONFIG_EXTCON_ARIZONA)	+= extcon-arizona.o
+obj-$(CONFIG_EXTCON_PALMAS)	+= extcon-palmas.o

diff --git a/drivers/extcon/extcon-class.c b/drivers/extcon/extcon-class.c
index 60adc04..8c69803 100644
--- a/drivers/extcon/extcon-class.c
+++ b/drivers/extcon/extcon-class.c

@@ -185,26 +185,6 @@
 					       cable->cable_index));
 }
 
-static ssize_t cable_state_store(struct device *dev,
-				 struct device_attribute *attr, const char *buf,
-				 size_t count)
-{
-	struct extcon_cable *cable = container_of(attr, struct extcon_cable,
-						  attr_state);
-	int ret, state;
-
-	ret = sscanf(buf, "%d", &state);
-	if (ret == 0)
-		ret = -EINVAL;
-	else
-		ret = extcon_set_cable_state_(cable->edev, cable->cable_index,
-					      state);
-
-	if (ret < 0)
-		return ret;
-	return count;
-}
-
 /**
  * extcon_update_state() - Update the cable attach states of the extcon device
  *			only for the masked bits.
@@ -501,6 +481,7 @@
 		return -ENODEV;
 	}
 }
+EXPORT_SYMBOL_GPL(extcon_register_interest);
 
 /**
  * extcon_unregister_interest() - Unregister the notifier registered by
@@ -515,6 +496,7 @@
 
 	return raw_notifier_chain_unregister(&obj->edev->nh, &obj->internal_nb);
 }
+EXPORT_SYMBOL_GPL(extcon_unregister_interest);
 
 /**
  * extcon_register_notifier() - Register a notifiee to get notified by
@@ -665,9 +647,8 @@
 
 			sysfs_attr_init(&cable->attr_state.attr);
 			cable->attr_state.attr.name = "state";
-			cable->attr_state.attr.mode = 0644;
+			cable->attr_state.attr.mode = 0444;
 			cable->attr_state.show = cable_state_show;
-			cable->attr_state.store = cable_state_store;
 		}
 	}
 

diff --git a/drivers/extcon/extcon-palmas.c b/drivers/extcon/extcon-palmas.c
new file mode 100644
index 0000000..b752a0a
--- /dev/null
+++ b/drivers/extcon/extcon-palmas.c

@@ -0,0 +1,246 @@
+/*
+ * Palmas USB transceiver driver
+ *
+ * Copyright (C) 2013 Texas Instruments Incorporated - http://www.ti.com
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Author: Graeme Gregory <gg@slimlogic.co.uk>
+ * Author: Kishon Vijay Abraham I <kishon@ti.com>
+ *
+ * Based on twl6030_usb.c
+ *
+ * Author: Hema HK <hemahk@ti.com>
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/platform_device.h>
+#include <linux/err.h>
+#include <linux/mfd/palmas.h>
+#include <linux/of.h>
+#include <linux/of_platform.h>
+
+/*
+ * Cable names registered with extcon (assigned to edev.supported_cable in
+ * probe); the IRQ handlers report state changes using these exact strings.
+ * The list is NULL-terminated.
+ */
+static const char *palmas_extcon_cable[] = {
+	[0] = "USB",
+	[1] = "USB-HOST",
+	NULL,
+};
+
+/*
+ * Assigned to edev.mutually_exclusive in probe.  0x3 has bits 0 and 1 set,
+ * matching the indexes of the two cables above; presumably this declares
+ * "USB" and "USB-HOST" mutually exclusive, with 0x0 ending the list --
+ * confirm against the extcon documentation.
+ */
+static const int mutually_exclusive[] = {0x3, 0x0};
+
+/*
+ * Program the PALMAS_USB_WAKEUP register: write the ID wake-up comparator
+ * bit when @enable is non-zero, write 0 otherwise.
+ */
+static void palmas_usb_wakeup(struct palmas *palmas, int enable)
+{
+	unsigned int wakeup = 0;
+
+	if (enable)
+		wakeup = PALMAS_USB_WAKEUP_ID_WK_UP_COMP;
+
+	palmas_write(palmas, PALMAS_USB_OTG_BASE, PALMAS_USB_WAKEUP, wakeup);
+}
+
+/*
+ * Threaded handler for the VBUS interrupt (also called directly from
+ * palmas_enable_irq() to sample the initial state).
+ *
+ * Reads PALMAS_INT3_LINE_STATE and, when the VBUS bit disagrees with the
+ * cached linkstat, updates linkstat and reports the "USB" cable as attached
+ * or detached through extcon.  Events that do not change the cached state
+ * are only logged at debug level.
+ *
+ * @irq:         interrupt number (unused)
+ * @_palmas_usb: the struct palmas_usb passed at request time
+ *
+ * Always returns IRQ_HANDLED.
+ */
+static irqreturn_t palmas_vbus_irq_handler(int irq, void *_palmas_usb)
+{
+	struct palmas_usb *palmas_usb = _palmas_usb;
+	unsigned int vbus_line_state;
+	int ret;
+
+	ret = palmas_read(palmas_usb->palmas, PALMAS_INTERRUPT_BASE,
+		PALMAS_INT3_LINE_STATE, &vbus_line_state);
+	if (ret < 0) {
+		/* vbus_line_state was not filled in: do not act on garbage */
+		dev_err(palmas_usb->dev,
+			"can't read VBUS line state, err %d\n", ret);
+		return IRQ_HANDLED;
+	}
+
+	if (vbus_line_state & PALMAS_INT3_LINE_STATE_VBUS) {
+		if (palmas_usb->linkstat != PALMAS_USB_STATE_VBUS) {
+			palmas_usb->linkstat = PALMAS_USB_STATE_VBUS;
+			extcon_set_cable_state(&palmas_usb->edev, "USB", true);
+		} else {
+			dev_dbg(palmas_usb->dev,
+				"Spurious connect event detected\n");
+		}
+	} else {
+		if (palmas_usb->linkstat == PALMAS_USB_STATE_VBUS) {
+			palmas_usb->linkstat = PALMAS_USB_STATE_DISCONNECT;
+			extcon_set_cable_state(&palmas_usb->edev, "USB", false);
+		} else {
+			dev_dbg(palmas_usb->dev,
+				"Spurious disconnect event detected\n");
+		}
+	}
+
+	return IRQ_HANDLED;
+}
+
+/*
+ * Threaded handler for the ID-pin interrupt (also called directly from
+ * palmas_enable_irq() to sample the initial state).
+ *
+ * Reads the ID interrupt latch.  On ID_GND it enables the ID_FLOAT
+ * interrupt, disables ID_GND, clears the latch and reports "USB-HOST"
+ * attached; on ID_FLOAT it does the mirror image and reports "USB-HOST"
+ * detached.  Any other latch value is ignored.
+ *
+ * @irq:         interrupt number (unused)
+ * @_palmas_usb: the struct palmas_usb passed at request time
+ *
+ * Always returns IRQ_HANDLED.
+ */
+static irqreturn_t palmas_id_irq_handler(int irq, void *_palmas_usb)
+{
+	unsigned int set;
+	struct palmas_usb *palmas_usb = _palmas_usb;
+	int ret;
+
+	ret = palmas_read(palmas_usb->palmas, PALMAS_USB_OTG_BASE,
+		PALMAS_USB_ID_INT_LATCH_SET, &set);
+	if (ret < 0) {
+		/* "set" was not filled in: do not reprogram the ID logic */
+		dev_err(palmas_usb->dev,
+			"can't read ID interrupt latch, err %d\n", ret);
+		return IRQ_HANDLED;
+	}
+
+	if (set & PALMAS_USB_ID_INT_SRC_ID_GND) {
+		/* ID grounded: wait for float next, drop and ack ID_GND */
+		palmas_write(palmas_usb->palmas, PALMAS_USB_OTG_BASE,
+			PALMAS_USB_ID_INT_EN_HI_SET,
+			PALMAS_USB_ID_INT_EN_HI_SET_ID_FLOAT);
+		palmas_write(palmas_usb->palmas, PALMAS_USB_OTG_BASE,
+			PALMAS_USB_ID_INT_EN_HI_CLR,
+			PALMAS_USB_ID_INT_EN_HI_CLR_ID_GND);
+		palmas_write(palmas_usb->palmas, PALMAS_USB_OTG_BASE,
+			PALMAS_USB_ID_INT_LATCH_CLR,
+			PALMAS_USB_ID_INT_EN_HI_CLR_ID_GND);
+		palmas_usb->linkstat = PALMAS_USB_STATE_ID;
+		extcon_set_cable_state(&palmas_usb->edev, "USB-HOST", true);
+	} else if (set & PALMAS_USB_ID_INT_SRC_ID_FLOAT) {
+		/* ID floating: wait for ground next, drop and ack ID_FLOAT */
+		palmas_write(palmas_usb->palmas, PALMAS_USB_OTG_BASE,
+			PALMAS_USB_ID_INT_EN_HI_SET,
+			PALMAS_USB_ID_INT_EN_HI_SET_ID_GND);
+		palmas_write(palmas_usb->palmas, PALMAS_USB_OTG_BASE,
+			PALMAS_USB_ID_INT_EN_HI_CLR,
+			PALMAS_USB_ID_INT_EN_HI_CLR_ID_FLOAT);
+		palmas_write(palmas_usb->palmas, PALMAS_USB_OTG_BASE,
+			PALMAS_USB_ID_INT_LATCH_CLR,
+			PALMAS_USB_ID_INT_EN_HI_CLR_ID_FLOAT);
+		palmas_usb->linkstat = PALMAS_USB_STATE_DISCONNECT;
+		extcon_set_cable_state(&palmas_usb->edev, "USB-HOST", false);
+	}
+
+	return IRQ_HANDLED;
+}
+
+/*
+ * Arm VBUS/ID detection and pick up the cable state present at probe time.
+ *
+ * Writes the VBUS and ID "ACT_COMP" control bits and enables the ID_GND
+ * interrupt, then calls both IRQ handlers by hand so the current line
+ * state is evaluated once immediately.
+ */
+static void palmas_enable_irq(struct palmas_usb *palmas_usb)
+{
+	palmas_write(palmas_usb->palmas, PALMAS_USB_OTG_BASE,
+		PALMAS_USB_VBUS_CTRL_SET,
+		PALMAS_USB_VBUS_CTRL_SET_VBUS_ACT_COMP);
+
+	palmas_write(palmas_usb->palmas, PALMAS_USB_OTG_BASE,
+		PALMAS_USB_ID_CTRL_SET, PALMAS_USB_ID_CTRL_SET_ID_ACT_COMP);
+
+	palmas_write(palmas_usb->palmas, PALMAS_USB_OTG_BASE,
+		PALMAS_USB_ID_INT_EN_HI_SET,
+		PALMAS_USB_ID_INT_EN_HI_SET_ID_GND);
+
+	/* Evaluate the VBUS line once, outside interrupt delivery */
+	palmas_vbus_irq_handler(palmas_usb->vbus_irq, palmas_usb);
+
+	/* cold plug for host mode needs this delay */
+	/* NOTE(review): msleep() is declared in <linux/delay.h>, which this file does not include directly */
+	msleep(30);
+	palmas_id_irq_handler(palmas_usb->id_irq, palmas_usb);
+}
+
+/*
+ * Probe the Palmas USB cell: build platform data from the device tree when
+ * none was supplied, look up the four USB-related virtual IRQs, register
+ * the extcon device and install the ID and VBUS threaded handlers.
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure, -EINVAL when there
+ * is neither platform data nor a device-tree node, or the error returned
+ * by extcon_dev_register() / devm_request_threaded_irq().
+ */
+static int palmas_usb_probe(struct platform_device *pdev)
+{
+	struct palmas *palmas = dev_get_drvdata(pdev->dev.parent);
+	struct palmas_usb_platform_data	*pdata = pdev->dev.platform_data;
+	struct device_node *node = pdev->dev.of_node;
+	struct palmas_usb *palmas_usb;
+	int status;
+
+	/* No board-file data: synthesize it from the "ti,wakeup" DT property */
+	if (node && !pdata) {
+		pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL);
+
+		if (!pdata)
+			return -ENOMEM;
+
+		pdata->wakeup = of_property_read_bool(node, "ti,wakeup");
+	} else if (!pdata) {
+		return -EINVAL;
+	}
+
+	palmas_usb = devm_kzalloc(&pdev->dev, sizeof(*palmas_usb), GFP_KERNEL);
+	if (!palmas_usb)
+		return -ENOMEM;
+
+	/* Cross-link the parent palmas structure and this USB instance */
+	palmas->usb = palmas_usb;
+	palmas_usb->palmas = palmas;
+
+	palmas_usb->dev	 = &pdev->dev;
+
+	/* Only id_irq and vbus_irq are requested below; the OTG ones are just stored */
+	palmas_usb->id_otg_irq = regmap_irq_get_virq(palmas->irq_data,
+						PALMAS_ID_OTG_IRQ);
+	palmas_usb->id_irq = regmap_irq_get_virq(palmas->irq_data,
+						PALMAS_ID_IRQ);
+	palmas_usb->vbus_otg_irq = regmap_irq_get_virq(palmas->irq_data,
+						PALMAS_VBUS_OTG_IRQ);
+	palmas_usb->vbus_irq = regmap_irq_get_virq(palmas->irq_data,
+						PALMAS_VBUS_IRQ);
+
+	palmas_usb_wakeup(palmas, pdata->wakeup);
+
+	platform_set_drvdata(pdev, palmas_usb);
+
+	palmas_usb->edev.name = "palmas-usb";
+	palmas_usb->edev.supported_cable = palmas_extcon_cable;
+	palmas_usb->edev.mutually_exclusive = mutually_exclusive;
+
+	status = extcon_dev_register(&palmas_usb->edev, palmas_usb->dev);
+	if (status) {
+		dev_err(&pdev->dev, "failed to register extcon device\n");
+		return status;
+	}
+
+	/* No primary handler (NULL): all work happens in the IRQ thread */
+	status = devm_request_threaded_irq(palmas_usb->dev, palmas_usb->id_irq,
+			NULL, palmas_id_irq_handler,
+			IRQF_TRIGGER_FALLING | IRQF_TRIGGER_RISING,
+			"palmas_usb_id", palmas_usb);
+	if (status < 0) {
+		dev_err(&pdev->dev, "can't get IRQ %d, err %d\n",
+					palmas_usb->id_irq, status);
+		goto fail_extcon;
+	}
+
+	status = devm_request_threaded_irq(palmas_usb->dev,
+			palmas_usb->vbus_irq, NULL, palmas_vbus_irq_handler,
+			IRQF_TRIGGER_FALLING | IRQF_TRIGGER_RISING,
+			"palmas_usb_vbus", palmas_usb);
+	if (status < 0) {
+		dev_err(&pdev->dev, "can't get IRQ %d, err %d\n",
+					palmas_usb->vbus_irq, status);
+		goto fail_extcon;
+	}
+
+	/* Enable detection and sample the state already present */
+	palmas_enable_irq(palmas_usb);
+
+	return 0;
+
+fail_extcon:
+	/*
+	 * NOTE(review): an IRQ obtained with devm_ above presumably stays
+	 * installed until after this function returns, so its handler could
+	 * still touch edev after this unregister -- confirm.
+	 */
+	extcon_dev_unregister(&palmas_usb->edev);
+
+	return status;
+}
+
+/*
+ * Undo probe: unregister the extcon device.  Memory and IRQs were obtained
+ * with devm_ helpers in probe and are not released explicitly here.
+ * Always returns 0.
+ */
+static int palmas_usb_remove(struct platform_device *pdev)
+{
+	struct palmas_usb *palmas_usb = platform_get_drvdata(pdev);
+
+	extcon_dev_unregister(&palmas_usb->edev);
+
+	return 0;
+}
+
+/* Device-tree compatible strings bound to this driver (empty-terminated) */
+static struct of_device_id of_palmas_match_tbl[] = {
+	{ .compatible = "ti,palmas-usb", },
+	{ .compatible = "ti,twl6035-usb", },
+	{ /* end */ }
+};
+
+/*
+ * Platform driver glue: matched by the name "palmas-usb" or by the
+ * device-tree table above.
+ */
+static struct platform_driver palmas_usb_driver = {
+	.probe = palmas_usb_probe,
+	.remove = palmas_usb_remove,
+	.driver = {
+		.name = "palmas-usb",
+		.of_match_table = of_palmas_match_tbl,
+		.owner = THIS_MODULE,
+	},
+};
+
+module_platform_driver(palmas_usb_driver);
+
+MODULE_ALIAS("platform:palmas-usb");
+MODULE_AUTHOR("Graeme Gregory <gg@slimlogic.co.uk>");
+MODULE_DESCRIPTION("Palmas USB transceiver driver");
+MODULE_LICENSE("GPL");
+MODULE_DEVICE_TABLE(of, of_palmas_match_tbl);

diff --git a/drivers/fmc/Kconfig b/drivers/fmc/Kconfig
new file mode 100644
index 0000000..c01cf45
--- /dev/null
+++ b/drivers/fmc/Kconfig

@@ -0,0 +1,51 @@
+#
+# FMC (ANSI-VITA 57.1) bus support
+#
+
+menuconfig FMC
+	tristate "FMC support"
+	help
+
+	  FMC (FPGA Mezzanine Carrier) is a mechanical and electrical
+	  standard for mezzanine cards that plug into a carrier board.
+	  This kernel subsystem supports the matching between carrier
+	  and mezzanine based on identifiers stored in the internal I2C
+	  EEPROM, as well as having carrier-independent drivers.
+
+	  The framework was born outside of the kernel and at this time
+	  the off-tree code base is more complete.  Code and documentation
+	  is at git://ohwr.org/fmc-projects/fmc-bus.git .
+
+if FMC
+
+config FMC_FAKEDEV
+	tristate "FMC fake device (software testing)"
+	help
+	  This is a fake carrier, bringing a default EEPROM content
+	  that can be rewritten at run time and used for matching
+	  mezzanines.
+
+config FMC_TRIVIAL
+	tristate "FMC trivial mezzanine driver (software testing)"
+	help
+	  This is a fake mezzanine driver, to show how FMC works and test it.
+	  The driver also handles interrupts (we used it with a real carrier
+	  before the mezzanines were produced)
+
+config FMC_WRITE_EEPROM
+	tristate "FMC mezzanine driver to write I2C EEPROM"
+	help
+	  This driver matches every mezzanine device and can write the
+	  internal EEPROM of the PCB, using the firmware loader to get
+	  its binary and the function carrier->reprogram to actually do it.
+	  It is useful when the mezzanines are produced.
+
+config FMC_CHARDEV
+	tristate "FMC mezzanine driver that registers a char device"
+	help
+	  This driver matches every mezzanine device and allows user
+	  space to read and write registers using a char device. It
+	  can be used to write user-space drivers, or just get
+	  acquainted with a mezzanine before writing its specific driver.
+
+endif # FMC

diff --git a/drivers/fmc/Makefile b/drivers/fmc/Makefile
new file mode 100644
index 0000000..b945291
--- /dev/null
+++ b/drivers/fmc/Makefile

@@ -0,0 +1,13 @@
+
+obj-$(CONFIG_FMC) += fmc.o
+
+fmc-y = fmc-core.o
+fmc-y += fmc-match.o
+fmc-y += fmc-sdb.o
+fmc-y += fru-parse.o
+fmc-y += fmc-dump.o
+
+obj-$(CONFIG_FMC_FAKEDEV) += fmc-fakedev.o
+obj-$(CONFIG_FMC_TRIVIAL) += fmc-trivial.o
+obj-$(CONFIG_FMC_WRITE_EEPROM) += fmc-write-eeprom.o
+obj-$(CONFIG_FMC_CHARDEV) += fmc-chardev.o

diff --git a/drivers/fmc/fmc-chardev.c b/drivers/fmc/fmc-chardev.c
new file mode 100644
index 0000000..cc031db
--- /dev/null
+++ b/drivers/fmc/fmc-chardev.c

@@ -0,0 +1,202 @@
+/*
+ * Copyright (C) 2012 CERN (www.cern.ch)
+ * Author: Alessandro Rubini <rubini@gnudd.com>
+ *
+ * Released according to the GNU GPL, version 2 or any later version.
+ *
+ * This work is part of the White Rabbit project, a research effort led
+ * by CERN, the European Institute for Nuclear Research.
+ */
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/miscdevice.h>
+#include <linux/spinlock.h>
+#include <linux/fmc.h>
+#include <linux/uaccess.h>
+
+static LIST_HEAD(fc_devices);
+static DEFINE_SPINLOCK(fc_lock);
+
+/* One entry per probed mezzanine, linked on fc_devices */
+struct fc_instance {
+	struct list_head list;		/* link in fc_devices */
+	struct fmc_device *fmc;		/* the mezzanine this node gives access to */
+	struct miscdevice misc;		/* the misc char device registered for it */
+};
+
+/*
+ * At open time, we must identify our device: look up the fc_instance whose
+ * misc minor matches the inode, pin the module that owns the fmc device
+ * and stash the fmc pointer in f->private_data for read/write/release.
+ *
+ * Returns 0 on success, -ENODEV when no instance has this minor or the
+ * owner module cannot be pinned.
+ */
+static int fc_open(struct inode *ino, struct file *f)
+{
+	struct fmc_device *fmc = NULL;
+	struct fc_instance *fc;
+	int minor = iminor(ino);
+
+	/*
+	 * Record the match explicitly: after a full traversal the cursor of
+	 * list_for_each_entry() does not point to a real fc_instance and
+	 * must not be dereferenced.
+	 */
+	spin_lock(&fc_lock);
+	list_for_each_entry(fc, &fc_devices, list) {
+		if (fc->misc.minor == minor) {
+			fmc = fc->fmc;
+			break;
+		}
+	}
+	spin_unlock(&fc_lock);
+
+	if (!fmc)
+		return -ENODEV;
+	if (try_module_get(fmc->owner) == 0)
+		return -ENODEV;
+
+	f->private_data = fmc;
+	return 0;
+}
+
+/* Drop the module reference taken by fc_open(); always returns 0 */
+static int fc_release(struct inode *ino, struct file *f)
+{
+	struct fmc_device *fmc = f->private_data;
+	module_put(fmc->owner);
+	return 0;
+}
+
+/*
+ * read and write are simple after the default llseek has been used:
+ * the file position is the register address.  Each call transfers exactly
+ * one 32-bit word, whatever larger count was requested.
+ *
+ * Returns sizeof(uint32_t) on success, -EINVAL if the buffer is shorter
+ * than one word, -ESPIPE if the position is beyond fmc->memlen, -EFAULT
+ * if the user buffer cannot be written.
+ */
+static ssize_t fc_read(struct file *f, char __user *buf, size_t count,
+		       loff_t *offp)
+{
+	struct fmc_device *fmc = f->private_data;
+	unsigned long addr = *offp;
+	uint32_t val;
+
+	if (count < sizeof(val))
+		return -EINVAL;
+	/* NOTE(review): addr == memlen is accepted here -- confirm intended */
+	if (addr > fmc->memlen)
+		return -ESPIPE; /* Illegal seek */
+
+	val = fmc_readl(fmc, addr);
+	if (copy_to_user(buf, &val, sizeof(val)))
+		return -EFAULT;
+
+	*offp += sizeof(val);
+	return sizeof(val);
+}
+
+/*
+ * Write one 32-bit word to the register addressed by the file position.
+ * Only the first word of the user buffer is consumed.
+ *
+ * Returns sizeof(uint32_t) on success, -EINVAL if the buffer is shorter
+ * than one word, -ESPIPE if the position is beyond fmc->memlen, -EFAULT
+ * if the user buffer cannot be read.
+ */
+static ssize_t fc_write(struct file *f, const char __user *buf, size_t count,
+			loff_t *offp)
+{
+	struct fmc_device *fmc = f->private_data;
+	unsigned long addr = *offp;
+	uint32_t val;
+
+	if (count < sizeof(val))
+		return -EINVAL;
+	/* NOTE(review): addr == memlen is accepted here -- confirm intended */
+	if (addr > fmc->memlen)
+		return -ESPIPE; /* Illegal seek */
+
+	if (copy_from_user(&val, buf, sizeof(val)))
+		return -EFAULT;
+	fmc_writel(fmc, val, addr);
+
+	*offp += sizeof(val);
+	return sizeof(val);
+}
+
+/*
+ * File operations of every misc device created by fc_probe(); seeking is
+ * delegated to generic_file_llseek and selects the register address used
+ * by fc_read()/fc_write().
+ */
+static const struct file_operations fc_fops = {
+	.owner = THIS_MODULE,
+	.open = fc_open,
+	.release = fc_release,
+	.llseek = generic_file_llseek,
+	.read = fc_read,
+	.write = fc_write,
+};
+
+
+/* Device part .. */
+static int fc_probe(struct fmc_device *fmc);
+static int fc_remove(struct fmc_device *fmc);
+
+/* The fmc driver registered by fc_init(); named after the module */
+static struct fmc_driver fc_drv = {
+	.version = FMC_VERSION,
+	.driver.name = KBUILD_MODNAME,
+	.probe = fc_probe,
+	.remove = fc_remove,
+	/* no table: we want to match everything */
+};
+
+/* We accept the generic busid parameter */
+FMC_PARAM_BUSID(fc_drv);
+
+/*
+ * probe and remove must allocate and release a misc device.
+ *
+ * After the optional carrier validate() hook accepts the device, allocate
+ * an fc_instance, register a dynamic-minor misc device named after the fmc
+ * device and link the instance on fc_devices.
+ *
+ * Returns 0 on success, -EINVAL if validate() rejects the device, -ENOMEM
+ * on allocation failure, or the error from misc_register().
+ */
+static int fc_probe(struct fmc_device *fmc)
+{
+	int ret;
+	int index = 0;
+
+	struct fc_instance *fc;
+
+	if (fmc->op->validate)
+		index = fmc->op->validate(fmc, &fc_drv);
+	if (index < 0)
+		return -EINVAL; /* not our device: invalid */
+
+	/* Create a char device: we want to create it anew */
+	fc = kzalloc(sizeof(*fc), GFP_KERNEL);
+	if (!fc)
+		return -ENOMEM;
+	fc->fmc = fmc;
+	fc->misc.minor = MISC_DYNAMIC_MINOR;
+	fc->misc.fops = &fc_fops;
+	fc->misc.name = kstrdup(dev_name(&fmc->dev), GFP_KERNEL);
+	if (!fc->misc.name) {
+		ret = -ENOMEM;
+		goto err_free;
+	}
+
+	/*
+	 * misc_register() may sleep, so it must not run under the spinlock;
+	 * the lock only needs to cover the list update.
+	 */
+	ret = misc_register(&fc->misc);
+	if (ret < 0)
+		goto err_free;
+
+	spin_lock(&fc_lock);
+	list_add(&fc->list, &fc_devices);
+	spin_unlock(&fc_lock);
+	dev_info(&fc->fmc->dev, "Created misc device \"%s\"\n",
+		 fc->misc.name);
+	return 0;
+
+err_free:
+	kfree(fc->misc.name);
+	kfree(fc);
+	return ret;
+}
+
+/*
+ * Tear down the misc device created by fc_probe() for @fmc.
+ *
+ * Returns 0 on success, -ENODEV if no instance is bound to @fmc.
+ */
+static int fc_remove(struct fmc_device *fmc)
+{
+	struct fc_instance *fc, *found = NULL;
+
+	/*
+	 * Search and unlink under the lock.  The match is recorded in
+	 * "found" because the loop cursor is not a valid fc_instance once
+	 * the list has been walked to the end.
+	 */
+	spin_lock(&fc_lock);
+	list_for_each_entry(fc, &fc_devices, list) {
+		if (fc->fmc == fmc) {
+			found = fc;
+			list_del(&found->list);
+			break;
+		}
+	}
+	spin_unlock(&fc_lock);
+
+	if (!found) {
+		dev_err(&fmc->dev, "remove called but not found\n");
+		return -ENODEV;
+	}
+
+	/* misc_deregister() may sleep: call it with the spinlock released */
+	misc_deregister(&found->misc);
+	kfree(found->misc.name);
+	kfree(found);
+
+	return 0;
+}
+
+
+/* Module entry point: register the chardev driver on the fmc bus */
+static int fc_init(void)
+{
+	return fmc_driver_register(&fc_drv);
+}
+
+/* Module exit point: unregister the driver registered by fc_init() */
+static void fc_exit(void)
+{
+	fmc_driver_unregister(&fc_drv);
+}
+
+module_init(fc_init);
+module_exit(fc_exit);
+
+MODULE_LICENSE("GPL");

diff --git a/drivers/fmc/fmc-core.c b/drivers/fmc/fmc-core.c
new file mode 100644
index 0000000..24d5249
--- /dev/null
+++ b/drivers/fmc/fmc-core.c

@@ -0,0 +1,296 @@
+/*
+ * Copyright (C) 2012 CERN (www.cern.ch)
+ * Author: Alessandro Rubini <rubini@gnudd.com>
+ *
+ * Released according to the GNU GPL, version 2 or any later version.
+ *
+ * This work is part of the White Rabbit project, a research effort led
+ * by CERN, the European Institute for Nuclear Research.
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/device.h>
+#include <linux/fmc.h>
+
+/*
+ * Compare the FMC version embedded in a client structure with the one the
+ * core was built with.  @name only identifies the client in messages.
+ *
+ * A different major number is an error (-EINVAL); a different minor number
+ * is reported with pr_info but accepted.  Returns 0 when acceptable.
+ */
+static int fmc_check_version(unsigned long version, const char *name)
+{
+	if (__FMC_MAJOR(version) == FMC_MAJOR) {
+		if (__FMC_MINOR(version) != FMC_MINOR)
+			pr_info("%s: \"%s\" has wrong minor (has %li, expected %i)\n",
+			       __func__, name, __FMC_MINOR(version), FMC_MINOR);
+		return 0;
+	}
+
+	pr_err("%s: \"%s\" has wrong major (has %li, expected %i)\n",
+	       __func__, name, __FMC_MAJOR(version), FMC_MAJOR);
+	return -EINVAL;
+}
+
+/*
+ * Bus uevent hook: adds a fixed "MODALIAS=fmc" variable for every device.
+ * The result of add_uevent_var() is not checked; always returns 0.
+ */
+static int fmc_uevent(struct device *dev, struct kobj_uevent_env *env)
+{
+	/* struct fmc_device *fdev = to_fmc_device(dev); */
+
+	/* FIXME: The MODALIAS */
+	add_uevent_var(env, "MODALIAS=%s", "fmc");
+	return 0;
+}
+
+/* Bus probe hook: forward to the probe method of the bound fmc driver */
+static int fmc_probe(struct device *dev)
+{
+	struct fmc_device *mezzanine = to_fmc_device(dev);
+
+	return to_fmc_driver(dev->driver)->probe(mezzanine);
+}
+
+/* Bus remove hook: forward to the remove method of the bound fmc driver */
+static int fmc_remove(struct device *dev)
+{
+	struct fmc_device *mezzanine = to_fmc_device(dev);
+
+	return to_fmc_driver(dev->driver)->remove(mezzanine);
+}
+
+/* Bus shutdown hook: intentionally empty */
+static void fmc_shutdown(struct device *dev)
+{
+	/* not implemented but mandatory */
+}
+
+/*
+ * The "fmc" bus.  fmc_match is not defined in this file; the other hooks
+ * are the static functions above.
+ */
+static struct bus_type fmc_bus_type = {
+	.name = "fmc",
+	.match = fmc_match,
+	.uevent = fmc_uevent,
+	.probe = fmc_probe,
+	.remove = fmc_remove,
+	.shutdown = fmc_shutdown,
+};
+
+/*
+ * Device release callback (installed as fmc->dev.release when a device is
+ * registered): frees the fmc_device that embeds @dev.
+ */
+static void fmc_release(struct device *dev)
+{
+	struct fmc_device *fmc = container_of(dev, struct fmc_device, dev);
+
+	kfree(fmc);
+}
+
+/*
+ * The eeprom is exported in sysfs, through a binary attribute
+ */
+
+/*
+ * sysfs read method of the "eeprom" binary attribute: copy up to @count
+ * bytes of the in-memory eeprom image, starting at @off, into @buf.
+ *
+ * Returns the number of bytes copied (clipped at eeprom_len), 0 when @off
+ * is exactly at the end, -ESPIPE when @off is beyond the end.
+ */
+static ssize_t fmc_read_eeprom(struct file *file, struct kobject *kobj,
+			   struct bin_attribute *bin_attr,
+			   char *buf, loff_t off, size_t count)
+{
+	struct device *dev = container_of(kobj, struct device, kobj);
+	struct fmc_device *fmc = container_of(dev, struct fmc_device, dev);
+	int eelen = fmc->eeprom_len;
+
+	if (off > eelen)
+		return -ESPIPE;
+	if (off == eelen)
+		return 0; /* EOF */
+
+	/* Clip the request so it never runs past the image */
+	if (off + count > eelen)
+		count = eelen - off;
+
+	memcpy(buf, fmc->eeprom + off, count);
+	return count;
+}
+
+/*
+ * Read-only binary attribute created for every registered fmc device.
+ * The advertised size is fixed; fmc_read_eeprom() clips reads to the
+ * device's actual eeprom_len.
+ */
+static struct bin_attribute fmc_eeprom_attr = {
+	.attr = { .name = "eeprom", .mode = S_IRUGO, },
+	.size = 8192, /* more or less standard */
+	.read = fmc_read_eeprom,
+};
+
+/*
+ * Functions for client modules follow
+ */
+
+/*
+ * Register an fmc driver: check its version, attach it to the fmc bus and
+ * hand it to the driver core.
+ *
+ * Returns -EINVAL on a version mismatch, otherwise the result of
+ * driver_register().
+ */
+int fmc_driver_register(struct fmc_driver *drv)
+{
+	if (fmc_check_version(drv->version, drv->driver.name))
+		return -EINVAL;
+	drv->driver.bus = &fmc_bus_type;
+	return driver_register(&drv->driver);
+}
+EXPORT_SYMBOL(fmc_driver_register);
+
+/* Unregister a driver previously passed to fmc_driver_register() */
+void fmc_driver_unregister(struct fmc_driver *drv)
+{
+	driver_unregister(&drv->driver);
+}
+EXPORT_SYMBOL(fmc_driver_unregister);
+
+/*
+ * When a device set is registered, all eeproms must be read
+ * and all FRUs must be parsed
+ *
+ * @devs: array of @n device pointers, one per carrier slot; slots whose
+ *        flags equal FMC_DEVICE_NO_MEZZANINE are reported and skipped
+ * @n:    number of slots; values below 1 are a successful no-op
+ *
+ * All slots are validated first; then each present mezzanine is
+ * initialized, named, added to the fmc bus and given its "eeprom" sysfs
+ * file.  A private copy of @devs is stored in every registered device as
+ * fmc->devarray.  On failure everything registered so far is undone.
+ *
+ * Returns 0 on success, -EINVAL on version or validation errors, -ENOMEM
+ * if the array copy cannot be allocated, or the error from device_add() /
+ * sysfs_create_bin_file().
+ */
+int fmc_device_register_n(struct fmc_device **devs, int n)
+{
+	struct fmc_device *fmc, **devarray;
+	uint32_t device_id;
+	int i, ret = 0;
+	int registered = 0;
+
+	if (n < 1)
+		return 0;
+
+	/* Check the version of the first data structure (function prints) */
+	if (fmc_check_version(devs[0]->version, devs[0]->carrier_name))
+		return -EINVAL;
+
+	devarray = kmemdup(devs, n * sizeof(*devs), GFP_KERNEL);
+	if (!devarray)
+		return -ENOMEM;
+
+	/* Make all other checks before continuing, for all devices */
+	for (i = 0; i < n; i++) {
+		fmc = devarray[i];
+		if (!fmc->hwdev) {
+			pr_err("%s: device nr. %i has no hwdev pointer\n",
+			       __func__, i);
+			ret = -EINVAL;
+			break;
+		}
+		if (fmc->flags == FMC_DEVICE_NO_MEZZANINE) {
+			dev_info(fmc->hwdev, "absent mezzanine in slot %d\n",
+				 fmc->slot_id);
+			continue;
+		}
+		if (!fmc->eeprom) {
+			dev_err(fmc->hwdev, "no eeprom provided for slot %i\n",
+				fmc->slot_id);
+			ret = -EINVAL;
+		}
+		if (!fmc->eeprom_addr) {
+			dev_err(fmc->hwdev, "no eeprom_addr for slot %i\n",
+				fmc->slot_id);
+			ret = -EINVAL;
+		}
+		if (!fmc->carrier_name || !fmc->carrier_data ||
+		    !fmc->device_id) {
+			dev_err(fmc->hwdev,
+				"device nr %i: carrier name, "
+				"data or dev_id not set\n", i);
+			ret = -EINVAL;
+		}
+		if (ret)
+			break;
+
+	}
+	if (ret) {
+		kfree(devarray);
+		return ret;
+	}
+
+	/* Validation is ok. Now init and register the devices */
+	for (i = 0; i < n; i++) {
+		fmc = devarray[i];
+
+		if (fmc->flags == FMC_DEVICE_NO_MEZZANINE)
+			continue; /* dev_info already done above */
+
+		fmc->nr_slots = n; /* each slot must know how many are there */
+		fmc->devarray = devarray;
+
+		device_initialize(&fmc->dev);
+		fmc->dev.release = fmc_release;
+		fmc->dev.parent = fmc->hwdev;
+
+		/* Fill the identification stuff (may fail) */
+		fmc_fill_id_info(fmc);
+
+		fmc->dev.bus = &fmc_bus_type;
+
+		/* Name from mezzanine info or carrier info. Or 0,1,2.. */
+		device_id = fmc->device_id;
+		if (!fmc->mezzanine_name)
+			dev_set_name(&fmc->dev, "fmc-%04x", device_id);
+		else
+			dev_set_name(&fmc->dev, "%s-%04x", fmc->mezzanine_name,
+				     device_id);
+		ret = device_add(&fmc->dev);
+		if (ret < 0) {
+			dev_err(fmc->hwdev, "Slot %i: Failed in registering "
+				"\"%s\"\n", fmc->slot_id, fmc->dev.kobj.name);
+			goto out;
+		}
+		ret = sysfs_create_bin_file(&fmc->dev.kobj, &fmc_eeprom_attr);
+		if (ret < 0) {
+			dev_err(&fmc->dev, "Failed in registering eeprom\n");
+			goto out1;
+		}
+		/* This device went well, give information to the user */
+		fmc_dump_eeprom(fmc);
+		fmc_dump_sdb(fmc);
+		registered++;
+	}
+
+	/*
+	 * devarray is only reachable through the devices registered above:
+	 * if every slot was empty nobody would ever free it.
+	 */
+	if (!registered)
+		kfree(devarray);
+	return 0;
+
+out1:
+	device_del(&fmc->dev);
+out:
+	fmc_free_id_info(fmc);
+	put_device(&fmc->dev);
+
+	kfree(devarray);
+	for (i--; i >= 0; i--) {
+		/* Empty slots were never initialized or added: leave them */
+		if (devs[i]->flags == FMC_DEVICE_NO_MEZZANINE)
+			continue;
+		sysfs_remove_bin_file(&devs[i]->dev.kobj, &fmc_eeprom_attr);
+		device_del(&devs[i]->dev);
+		fmc_free_id_info(devs[i]);
+		put_device(&devs[i]->dev);
+	}
+	return ret;
+
+}
+EXPORT_SYMBOL(fmc_device_register_n);
+
+/* Register a single device: fmc_device_register_n() with one slot */
+int fmc_device_register(struct fmc_device *fmc)
+{
+	return fmc_device_register_n(&fmc, 1);
+}
+EXPORT_SYMBOL(fmc_device_register);
+
+/*
+ * Undo fmc_device_register_n() for the same @devs / @n: free the shared
+ * devarray, then remove the eeprom attribute, delete the device, free its
+ * id info and drop the reference for every slot that has a mezzanine.
+ * Slots flagged FMC_DEVICE_NO_MEZZANINE are skipped.
+ */
+void fmc_device_unregister_n(struct fmc_device **devs, int n)
+{
+	struct fmc_device **devarray = NULL;
+	int i;
+
+	if (n < 1)
+		return;
+
+	/*
+	 * The bus stores devarray only in slots it actually registered, so
+	 * take it from the first populated slot: slot 0 may be empty.
+	 */
+	for (i = 0; i < n; i++) {
+		if (devs[i]->flags != FMC_DEVICE_NO_MEZZANINE) {
+			devarray = devs[i]->devarray;
+			break;
+		}
+	}
+
+	/* Free devarray first, not used by the later loop */
+	kfree(devarray);
+
+	for (i = 0; i < n; i++) {
+		if (devs[i]->flags == FMC_DEVICE_NO_MEZZANINE)
+			continue;
+		sysfs_remove_bin_file(&devs[i]->dev.kobj, &fmc_eeprom_attr);
+		device_del(&devs[i]->dev);
+		fmc_free_id_info(devs[i]);
+		put_device(&devs[i]->dev);
+	}
+}
+EXPORT_SYMBOL(fmc_device_unregister_n);
+
+/* Unregister a single device: fmc_device_unregister_n() with one slot */
+void fmc_device_unregister(struct fmc_device *fmc)
+{
+	fmc_device_unregister_n(&fmc, 1);
+}
+EXPORT_SYMBOL(fmc_device_unregister);
+
+/* Init and exit are trivial: init registers the fmc bus type */
+static int fmc_init(void)
+{
+	return bus_register(&fmc_bus_type);
+}
+
+/* Module exit: unregister the fmc bus type */
+static void fmc_exit(void)
+{
+	bus_unregister(&fmc_bus_type);
+}
+
+module_init(fmc_init);
+module_exit(fmc_exit);
+
+MODULE_LICENSE("GPL");

diff --git a/drivers/fmc/fmc-dump.c b/drivers/fmc/fmc-dump.c
new file mode 100644
index 0000000..c91afd6
--- /dev/null
+++ b/drivers/fmc/fmc-dump.c

@@ -0,0 +1,100 @@
+/*
+ * Copyright (C) 2013 CERN (www.cern.ch)
+ * Author: Alessandro Rubini <rubini@gnudd.com>
+ *
+ * Released according to the GNU GPL, version 2 or any later version.
+ *
+ * This work is part of the White Rabbit project, a research effort led
+ * by CERN, the European Institute for Nuclear Research.
+ */
+#include <linux/kernel.h>
+#include <linux/moduleparam.h>
+#include <linux/device.h>
+#include <linux/fmc.h>
+#include <linux/fmc-sdb.h>
+
+static int fmc_must_dump_eeprom;
+module_param_named(dump_eeprom, fmc_must_dump_eeprom, int, 0644);
+static int fmc_must_dump_sdb;
+module_param_named(dump_sdb, fmc_must_dump_sdb, int, 0644);
+
+#define LINELEN 16
+
+/*
+ * Dumping 8k takes oh so much: avoid duplicate lines
+ *
+ * Print LINELEN bytes at @line as hex, prefixed by @addr, unless they are
+ * identical to the LINELEN bytes at @prev (the last line printed; NULL
+ * for the first call).  The first repeated line of a run, i.e. the one
+ * located right after @prev, prints "[...]" instead; further repeats
+ * print nothing.
+ *
+ * Returns the pointer the caller must pass as @prev next time: @line if
+ * it was printed, @prev otherwise.
+ */
+static const uint8_t *dump_line(int addr, const uint8_t *line,
+				const uint8_t *prev)
+{
+	int i;
+
+	if (!prev || memcmp(line, prev, LINELEN)) {
+		pr_info("%04x: ", addr);
+		for (i = 0; i < LINELEN; ) {
+			printk(KERN_CONT "%02x", line[i]);
+			i++;
+			/* one space between bytes, two every 4 bytes, newline at end */
+			printk(i & 3 ? " " : i & (LINELEN - 1) ? "  " : "\n");
+		}
+		return line;
+	}
+	/* repeated line */
+	if (line == prev + LINELEN)
+		pr_info("[...]\n");
+	return prev;
+}
+
+/*
+ * Hex-dump the eeprom image of @fmc to the kernel log, LINELEN bytes per
+ * line with repeated lines collapsed.  Does nothing unless the
+ * "dump_eeprom" module parameter is non-zero.
+ */
+void fmc_dump_eeprom(const struct fmc_device *fmc)
+{
+	const uint8_t *cur, *last = NULL;
+	int off;
+
+	if (!fmc_must_dump_eeprom)
+		return;
+
+	pr_info("FMC: %s (%s), slot %i, device %s\n", dev_name(fmc->hwdev),
+		fmc->carrier_name, fmc->slot_id, dev_name(&fmc->dev));
+	pr_info("FMC: dumping eeprom 0x%x (%i) bytes\n", fmc->eeprom_len,
+	       fmc->eeprom_len);
+
+	cur = fmc->eeprom;
+	for (off = 0; off < fmc->eeprom_len; off += LINELEN) {
+		last = dump_line(off, cur, last);
+		cur += LINELEN;
+	}
+}
+
+/*
+ * Show the SDB information of @fmc according to the "dump_sdb" module
+ * parameter: nothing when it is 0 (or the device has no sdb), the SDB
+ * tree when it is positive, and additionally a raw hex dump of the
+ * top-level record array for any non-zero value other than 1.
+ */
+void fmc_dump_sdb(const struct fmc_device *fmc)
+{
+	const uint8_t *cur, *last = NULL;
+	int off, nbytes;
+
+	if (!fmc->sdb || !fmc_must_dump_sdb)
+		return;
+
+	/* If the argument is positive, do simple dump (== show) */
+	if (fmc_must_dump_sdb > 0)
+		fmc_show_sdb_tree(fmc);
+
+	/* Exactly 1 asks for the tree only */
+	if (fmc_must_dump_sdb == 1)
+		return;
+
+	/*
+	 * Anything else also gets the serious dump, to help debugging.
+	 *
+	 * Here we should really use libsdbfs (which is designed to
+	 * work in kernel space as well), but it doesn't support
+	 * directories yet, and it requires better integration (it
+	 * should be used instead of fmc-specific code).
+	 *
+	 * So, lazily, just dump the top-level array
+	 */
+	pr_info("FMC: %s (%s), slot %i, device %s\n", dev_name(fmc->hwdev),
+		fmc->carrier_name, fmc->slot_id, dev_name(&fmc->dev));
+	pr_info("FMC: poor dump of sdb first level:\n");
+
+	nbytes = fmc->sdb->len * sizeof(union sdb_record);
+	cur = (void *)fmc->sdb->record;
+	for (off = 0; off < nbytes; off += LINELEN) {
+		last = dump_line(off, cur, last);
+		cur += LINELEN;
+	}
+}

diff --git a/drivers/fmc/fmc-fakedev.c b/drivers/fmc/fmc-fakedev.c
new file mode 100644
index 0000000..941d093
--- /dev/null
+++ b/drivers/fmc/fmc-fakedev.c

@@ -0,0 +1,355 @@
+/*
+ * Copyright (C) 2012 CERN (www.cern.ch)
+ * Author: Alessandro Rubini <rubini@gnudd.com>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * The software is provided "as is"; the copyright holders disclaim
+ * all warranties and liabilities, to the extent permitted by
+ * applicable law.
+ */
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/device.h>
+#include <linux/slab.h>
+#include <linux/firmware.h>
+#include <linux/workqueue.h>
+#include <linux/err.h>
+#include <linux/fmc.h>
+
+#define FF_EEPROM_SIZE		8192	/* The standard eeprom size */
+#define FF_MAX_MEZZANINES	4	/* Fakes a multi-mezzanine carrier */
+
+/* The user can pass up to 4 names of eeprom images to load */
+static char *ff_eeprom[FF_MAX_MEZZANINES];
+static int ff_nr_eeprom;
+module_param_array_named(eeprom, ff_eeprom, charp, &ff_nr_eeprom, 0444);
+
+/* The user can ask for a multi-mezzanine carrier, with the default eeprom */
+static int ff_nr_dev = 1;
+module_param_named(ndev, ff_nr_dev, int, 0444);
+
+
+/* Lazily, don't support the "standard" module parameters */
+
+/*
+ * Eeprom built from these commands:
+
+	../fru-generator -v fake-vendor -n fake-design-for-testing \
+		-s 01234 -p none > IPMI-FRU
+
+	gensdbfs . ../fake-eeprom.bin
+*/
+static char ff_eeimg[FF_MAX_MEZZANINES][FF_EEPROM_SIZE] = {
+	{
+	0x01, 0x00, 0x00, 0x01, 0x00, 0x0c, 0x00, 0xf2, 0x01, 0x0b, 0x00, 0xb2,
+	0x86, 0x87, 0xcb, 0x66, 0x61, 0x6b, 0x65, 0x2d, 0x76, 0x65, 0x6e, 0x64,
+	0x6f, 0x72, 0xd7, 0x66, 0x61, 0x6b, 0x65, 0x2d, 0x64, 0x65, 0x73, 0x69,
+	0x67, 0x6e, 0x2d, 0x66, 0x6f, 0x72, 0x2d, 0x74, 0x65, 0x73, 0x74, 0x69,
+	0x6e, 0x67, 0xc5, 0x30, 0x31, 0x32, 0x33, 0x34, 0xc4, 0x6e, 0x6f, 0x6e,
+	0x65, 0xda, 0x32, 0x30, 0x31, 0x32, 0x2d, 0x31, 0x31, 0x2d, 0x31, 0x39,
+	0x20, 0x32, 0x32, 0x3a, 0x34, 0x32, 0x3a, 0x33, 0x30, 0x2e, 0x30, 0x37,
+	0x34, 0x30, 0x35, 0x35, 0xc1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x87,
+	0x02, 0x02, 0x0d, 0xf7, 0xf8, 0x02, 0xb0, 0x04, 0x74, 0x04, 0xec, 0x04,
+	0x00, 0x00, 0x00, 0x00, 0xe8, 0x03, 0x02, 0x02, 0x0d, 0x5c, 0x93, 0x01,
+	0x4a, 0x01, 0x39, 0x01, 0x5a, 0x01, 0x00, 0x00, 0x00, 0x00, 0xb8, 0x0b,
+	0x02, 0x02, 0x0d, 0x63, 0x8c, 0x00, 0xfa, 0x00, 0xed, 0x00, 0x06, 0x01,
+	0x00, 0x00, 0x00, 0x00, 0xa0, 0x0f, 0x01, 0x02, 0x0d, 0xfb, 0xf5, 0x05,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x01, 0x02, 0x0d, 0xfc, 0xf4, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x0d, 0xfd, 0xf3, 0x03,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0xfa, 0x82, 0x0b, 0xea, 0x8f, 0xa2, 0x12, 0x00, 0x00, 0x1e, 0x44, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x53, 0x44, 0x42, 0x2d, 0x00, 0x03, 0x01, 0x01,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x01, 0xc4, 0x46, 0x69, 0x6c, 0x65, 0x44, 0x61, 0x74, 0x61,
+	0x2e, 0x20, 0x20, 0x20, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00,
+	0x2e, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+	0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0xc0,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0xc4, 0x46, 0x69, 0x6c, 0x65,
+	0x44, 0x61, 0x74, 0x61, 0x6e, 0x61, 0x6d, 0x65, 0x00, 0x00, 0x00, 0x01,
+	0x00, 0x00, 0x00, 0x00, 0x6e, 0x61, 0x6d, 0x65, 0x20, 0x20, 0x20, 0x20,
+	0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x01,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xdf,
+	0x46, 0x69, 0x6c, 0x65, 0x44, 0x61, 0x74, 0x61, 0x49, 0x50, 0x4d, 0x49,
+	0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x49, 0x50, 0x4d, 0x49,
+	0x2d, 0x46, 0x52, 0x55, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+	0x20, 0x20, 0x20, 0x01, 0x66, 0x61, 0x6b, 0x65, 0x0a,
+	},
+};
+
+/* The fake carrier: its own struct device plus one fmc device per slot */
+struct ff_dev {
+	struct fmc_device *fmc[FF_MAX_MEZZANINES];
+	struct device dev;	/* freed through ff_dev_release() */
+};
+
+static struct ff_dev *ff_current_dev; /* We have 1 carrier, 1 slot */
+
+/*
+ * Fake "reprogram" operation.  With @gw == NULL the device is flagged as
+ * running the golden gateware.  Otherwise the named firmware file is
+ * requested (its content is not used) and, if it can be loaded, the device
+ * is flagged as running custom gateware.
+ *
+ * Returns 0 on success or the error from request_firmware().
+ */
+static int ff_reprogram(struct fmc_device *fmc, struct fmc_driver *drv,
+			  char *gw)
+{
+	const struct firmware *fw;
+	int ret;
+
+	if (!gw) {
+		/* program golden: success */
+		fmc->flags &= ~FMC_DEVICE_HAS_CUSTOM;
+		fmc->flags |= FMC_DEVICE_HAS_GOLDEN;
+		return 0;
+	}
+
+	dev_info(&fmc->dev, "reprogramming with %s\n", gw);
+	ret = request_firmware(&fw, gw, &fmc->dev);
+	if (ret < 0) {
+		dev_warn(&fmc->dev, "request firmware \"%s\": error %i\n",
+			 gw, ret);
+	} else {
+		fmc->flags &= ~FMC_DEVICE_HAS_GOLDEN;
+		fmc->flags |= FMC_DEVICE_HAS_CUSTOM;
+	}
+
+	/* Released on both paths, exactly as requested above */
+	release_firmware(fw);
+	return ret;
+}
+
+/* The fake carrier has no interrupts: always fails with -EOPNOTSUPP */
+static int ff_irq_request(struct fmc_device *fmc, irq_handler_t handler,
+			    char *name, int flags)
+{
+	return -EOPNOTSUPP;
+}
+
+/* FIXME: should also have some fake FMC GPIO mapping */
+
+
+/*
+ * This work function is called when we changed the eeprom. It removes the
+ * current fmc device and registers a new one, with different identifiers.
+ */
+static struct ff_dev *ff_dev_create(void); /* defined later */
+
+/*
+ * Delayed-work handler scheduled by ff_eeprom_write(): unregister the
+ * current fmc devices and carrier device, then create and register a new
+ * set.  If re-creation or re-registration fails a warning is printed and
+ * ff_current_dev is left NULL.
+ */
+static void ff_work_fn(struct work_struct *work)
+{
+	struct ff_dev *ff = ff_current_dev;
+	int ret;
+
+	/* Tear down the old set: fmc devices first, then their carrier */
+	fmc_device_unregister_n(ff->fmc, ff_nr_dev);
+	device_unregister(&ff->dev);
+	ff_current_dev = NULL;
+
+	ff = ff_dev_create();
+	if (IS_ERR(ff)) {
+		pr_warning("%s: can't re-create FMC devices\n", __func__);
+		return;
+	}
+	ret = fmc_device_register_n(ff->fmc, ff_nr_dev);
+	if (ret < 0) {
+		dev_warn(&ff->dev, "can't re-register FMC devices\n");
+		device_unregister(&ff->dev);
+		return;
+	}
+
+	ff_current_dev = ff;
+}
+
+static DECLARE_DELAYED_WORK(ff_work, ff_work_fn);
+
+
+/*
+ * low-level i2c: copy @size bytes from the in-memory eeprom image at
+ * @offset into @buf, clipping the length at FF_EEPROM_SIZE.
+ *
+ * Returns the number of bytes copied, or -EINVAL if @offset is beyond
+ * FF_EEPROM_SIZE.
+ */
+static int ff_eeprom_read(struct fmc_device *fmc, uint32_t offset,
+		void *buf, size_t size)
+{
+	if (offset > FF_EEPROM_SIZE)
+		return -EINVAL;
+
+	/* Never read past the end of the image */
+	if (offset + size > FF_EEPROM_SIZE)
+		size = FF_EEPROM_SIZE - offset;
+
+	memcpy(buf, fmc->eeprom + offset, size);
+	return size;
+}
+
+/*
+ * Copy @size bytes from @buf into the in-memory eeprom image at @offset,
+ * clipping the length at FF_EEPROM_SIZE, then schedule ff_work to
+ * re-register the devices two seconds later.
+ *
+ * Returns the number of bytes written, or -EINVAL if @offset is beyond
+ * FF_EEPROM_SIZE.
+ */
+static int ff_eeprom_write(struct fmc_device *fmc, uint32_t offset,
+		    const void *buf, size_t size)
+{
+	if (offset > FF_EEPROM_SIZE)
+		return -EINVAL;
+
+	/* Never write past the end of the image */
+	if (offset + size > FF_EEPROM_SIZE)
+		size = FF_EEPROM_SIZE - offset;
+
+	dev_info(&fmc->dev, "write_eeprom: offset %i, size %zi\n",
+		 (int)offset, size);
+	memcpy(fmc->eeprom + offset, buf, size);
+
+	/* remove, replug, in 2s */
+	schedule_delayed_work(&ff_work, HZ * 2);
+	return size;
+}
+
+/* i2c operations for fmc */
+/* read_ee operation: only allowed while the golden flag is set */
+static int ff_read_ee(struct fmc_device *fmc, int pos, void *data, int len)
+{
+	if (fmc->flags & FMC_DEVICE_HAS_GOLDEN)
+		return ff_eeprom_read(fmc, pos, data, len);
+
+	return -EOPNOTSUPP;
+}
+
+/* write_ee operation: only allowed while the golden flag is set */
+static int ff_write_ee(struct fmc_device *fmc, int pos,
+			 const void *data, int len)
+{
+	if (fmc->flags & FMC_DEVICE_HAS_GOLDEN)
+		return ff_eeprom_write(fmc, pos, data, len);
+
+	return -EOPNOTSUPP;
+}
+
+/*
+ * readl and writel do not do anything. Don't waste RAM with "base".
+ * Every register read simply returns 0.
+ */
+static uint32_t ff_readl(struct fmc_device *fmc, int offset)
+{
+	return 0;
+}
+
+/* Register writes are silently discarded by the fake carrier */
+static void ff_writel(struct fmc_device *fmc, uint32_t value, int offset)
+{
+}
+
+/*
+ * validate is useful so fmc-write-eeprom will not reprogram every 2 seconds.
+ * Returns the position of this device's id among the driver's busid
+ * values, 0 when the driver lists none, -ENOENT when it is not listed.
+ */
+static int ff_validate(struct fmc_device *fmc, struct fmc_driver *drv)
+{
+	int idx = 0;
+
+	if (drv->busid_n == 0)
+		return 0; /* everything is valid */
+
+	while (idx < drv->busid_n) {
+		if (drv->busid_val[idx] == fmc->device_id)
+			return idx;
+		idx++;
+	}
+	return -ENOENT;
+}
+
+
+
+/*
+ * Operations of the fake carrier. Only the members listed here are
+ * provided; any other operation pointer is left NULL.
+ */
+static struct fmc_operations ff_fmc_operations = {
+	.read32 =		ff_readl,
+	.write32 =		ff_writel,
+	.reprogram =		ff_reprogram,
+	.irq_request =		ff_irq_request,
+	.read_ee =		ff_read_ee,
+	.write_ee =		ff_write_ee,
+	.validate =		ff_validate,
+};
+
+/* The carrier device lives inside a kzalloc'ed ff_dev: free the container */
+static void ff_dev_release(struct device *dev)
+{
+	kfree(container_of(dev, struct ff_dev, dev));
+}
+
+/*
+ * Template duplicated for every slot by ff_dev_create(), which fills
+ * the per-slot fields in the copy and then increments device_id here
+ * so that each device gets a different identifier.
+ */
+static struct fmc_device ff_template_fmc = {
+	.version = FMC_VERSION,
+	.owner = THIS_MODULE,
+	.carrier_name = "fake-fmc-carrier",
+	.device_id = 0xf001, /* fool */
+	.eeprom_len = sizeof(ff_eeimg[0]),
+	.memlen = 0x1000, /* 4k, to show something */
+	.op = &ff_fmc_operations,
+	.hwdev = NULL, /* filled at creation time */
+	.flags = FMC_DEVICE_HAS_GOLDEN,
+};
+
+/*
+ * Allocate and register the fake carrier device, then allocate one
+ * fmc_device per slot (ff_nr_dev of them) from the template. The fmc
+ * devices are only prepared here: registering them is up to the caller.
+ * Returns the new ff_dev or an ERR_PTR() value.
+ */
+static struct ff_dev *ff_dev_create(void)
+{
+	struct ff_dev *ff;
+	struct fmc_device *fmc;
+	int i, ret;
+
+	ff = kzalloc(sizeof(*ff), GFP_KERNEL);
+	if (!ff)
+		return ERR_PTR(-ENOMEM);
+	dev_set_name(&ff->dev, "fake-fmc-carrier");
+	ff->dev.release = ff_dev_release;
+
+	ret = device_register(&ff->dev);
+	if (ret < 0) {
+		/* ff is freed by ff_dev_release when the reference drops */
+		put_device(&ff->dev);
+		return ERR_PTR(ret);
+	}
+
+	/* Create fmc structures that refer to this new "hw" device */
+	for (i = 0; i < ff_nr_dev; i++) {
+		fmc = kmemdup(&ff_template_fmc, sizeof(ff_template_fmc),
+			      GFP_KERNEL);
+		if (!fmc)
+			goto err_free_fmc;
+		fmc->hwdev = &ff->dev;
+		fmc->carrier_data = ff;
+		fmc->nr_slots = ff_nr_dev;
+		/* the following fields are different for each slot */
+		fmc->eeprom = ff_eeimg[i];
+		fmc->eeprom_addr = 0x50 + 2 * i;
+		fmc->slot_id = i;
+		ff->fmc[i] = fmc;
+		/* increment the identifier, each must be different */
+		ff_template_fmc.device_id++;
+	}
+	return ff;
+
+err_free_fmc:
+	/*
+	 * The copies made so far were never registered, so they are plain
+	 * allocations. Free them before the carrier: ff->fmc[] lives in ff,
+	 * which the release callback frees.
+	 */
+	while (--i >= 0)
+		kfree(ff->fmc[i]);
+	device_unregister(&ff->dev);
+	return ERR_PTR(-ENOMEM);
+}
+
+/* init and exit */
+/*
+ * Module init: prepare one eeprom image per possible mezzanine, create
+ * the fake carrier, optionally replace the images with files named by
+ * the "eeprom=" parameter, and finally register the fmc devices.
+ */
+static int ff_init(void)
+{
+	struct ff_dev *ff;
+	const struct firmware *fw;
+	int i, len, ret = 0;
+
+	/* Replicate the default eeprom for the max number of mezzanines */
+	for (i = 1; i < FF_MAX_MEZZANINES; i++)
+		memcpy(ff_eeimg[i], ff_eeimg[0], sizeof(ff_eeimg[0]));
+
+	/* Make sure there is a device for every eeprom file name passed */
+	if (ff_nr_eeprom > ff_nr_dev)
+		ff_nr_dev = ff_nr_eeprom;
+
+	ff = ff_dev_create();
+	if (IS_ERR(ff))
+		return PTR_ERR(ff);
+
+	/* If the user passed "eeprom=" as a parameter, fetch them */
+	for (i = 0; i < ff_nr_eeprom; i++) {
+		if (!strlen(ff_eeprom[i]))
+			continue;
+		ret = request_firmware(&fw, ff_eeprom[i], &ff->dev);
+		if (ret < 0) {
+			/* Not fatal: this slot keeps the default image */
+			dev_err(&ff->dev, "Mezzanine %i: can't load \"%s\" "
+				"(error %i)\n", i, ff_eeprom[i], -ret);
+		} else {
+			/* Never copy more than the image can hold */
+			len = min_t(size_t, fw->size, (size_t)FF_EEPROM_SIZE);
+			memcpy(ff_eeimg[i], fw->data, len);
+			release_firmware(fw);
+			dev_info(&ff->dev, "Mezzanine %i: eeprom \"%s\"\n", i,
+				ff_eeprom[i]);
+		}
+	}
+
+	ret = fmc_device_register_n(ff->fmc, ff_nr_dev);
+	if (ret) {
+		device_unregister(&ff->dev);
+		return ret;
+	}
+	/* Remember the carrier for ff_work_fn and ff_exit */
+	ff_current_dev = ff;
+	return ret;
+}
+
+/* Module exit: stop the replug work, then remove devices and carrier */
+static void ff_exit(void)
+{
+	/*
+	 * Stop the work first: a pending or running ff_work_fn would
+	 * otherwise unregister (and re-create) the devices behind our back.
+	 */
+	cancel_delayed_work_sync(&ff_work);
+
+	if (ff_current_dev) {
+		fmc_device_unregister_n(ff_current_dev->fmc, ff_nr_dev);
+		device_unregister(&ff_current_dev->dev);
+		ff_current_dev = NULL;
+	}
+
+	/* Drop anything that got scheduled while we were tearing down */
+	cancel_delayed_work_sync(&ff_work);
+}
+
+module_init(ff_init);
+module_exit(ff_exit);
+
+MODULE_LICENSE("Dual BSD/GPL");

diff --git a/drivers/fmc/fmc-match.c b/drivers/fmc/fmc-match.c
new file mode 100644
index 0000000..104a5ef
--- /dev/null
+++ b/drivers/fmc/fmc-match.c

@@ -0,0 +1,114 @@
+/*
+ * Copyright (C) 2012 CERN (www.cern.ch)
+ * Author: Alessandro Rubini <rubini@gnudd.com>
+ *
+ * Released according to the GNU GPL, version 2 or any later version.
+ *
+ * This work is part of the White Rabbit project, a research effort led
+ * by CERN, the European Institute for Nuclear Research.
+ */
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/fmc.h>
+#include <linux/ipmi-fru.h>
+
+/*
+ * The fru parser is both user and kernel capable: it needs alloc.
+ * In the kernel the memory is zeroed and comes from kzalloc(), so it
+ * must be released with kfree().
+ */
+void *fru_alloc(size_t size)
+{
+	return kzalloc(size, GFP_KERNEL);
+}
+
+/*
+ * The actual match function: returns 1 when the driver accepts the
+ * device and 0 otherwise. A driver without a FRU id table matches
+ * every device; a NULL field in a table entry matches any value.
+ */
+int fmc_match(struct device *dev, struct device_driver *drv)
+{
+	struct fmc_driver *fdrv = to_fmc_driver(drv);
+	struct fmc_device *fdev = to_fmc_device(dev);
+	struct fmc_fru_id *fid;
+	int i;
+
+	/* This currently only matches the EEPROM (FRU id) */
+	fid = fdrv->id_table.fru_id;
+	if (!fid) {
+		dev_warn(&fdev->dev, "Driver has no ID: matches all\n");
+		return 1;
+	}
+
+	if (!fdev->id.manufacturer || !fdev->id.product_name)
+		return 0; /* the device has no FRU information */
+
+	for (i = 0; i < fdrv->id_table.fru_id_nr; i++) {
+		if (fid[i].manufacturer &&
+		    strcmp(fid[i].manufacturer, fdev->id.manufacturer))
+			continue;
+		if (fid[i].product_name &&
+		    strcmp(fid[i].product_name, fdev->id.product_name))
+			continue;
+		return 1;
+	}
+
+	/* FIXME: match SDB contents */
+	return 0;
+}
+
+/*
+ * This function creates ID info for a newly registered device.
+ *
+ * The FRU data in the eeprom is parsed to fill fmc->id and
+ * fmc->mezzanine_name. When the carrier gave an eeprom length but no
+ * eeprom buffer, a temporary buffer is allocated and read through
+ * op->read_ee; it is freed again before returning.
+ *
+ * Returns -ENOMEM when that temporary buffer cannot be allocated and
+ * 0 in every other case, including a missing or invalid FRU.
+ */
+int fmc_fill_id_info(struct fmc_device *fmc)
+{
+	struct fru_common_header *h;
+	struct fru_board_info_area *bia;
+	int ret, allocated = 0;
+
+	/* If we know the eeprom length, try to read it off the device */
+	if (fmc->eeprom_len && !fmc->eeprom) {
+		fmc->eeprom = kzalloc(fmc->eeprom_len, GFP_KERNEL);
+		if (!fmc->eeprom)
+			return -ENOMEM;
+		allocated = 1;
+		ret = fmc->op->read_ee(fmc, 0, fmc->eeprom, fmc->eeprom_len);
+		if (ret < 0)
+			goto out; /* read failure is not reported to the caller */
+	}
+
+	/* If no eeprom, continue with other matches */
+	if (!fmc->eeprom)
+		return 0;
+
+	dev_info(fmc->hwdev, "mezzanine %i\n", fmc->slot_id); /* header */
+
+	/* So we have the eeprom: parse the FRU part (if any) */
+	h = (void *)fmc->eeprom;
+	if (h->format != 1) {
+		pr_info("      EEPROM has no FRU information\n");
+		goto out;
+	}
+	if (!fru_header_cksum_ok(h)) {
+		pr_info("      FRU: wrong header checksum\n");
+		goto out;
+	}
+	bia = fru_get_board_area(h);
+	if (!fru_bia_cksum_ok(bia)) {
+		pr_info("      FRU: wrong board area checksum\n");
+		goto out;
+	}
+	/* Both strings are allocated copies, see fmc_free_id_info() */
+	fmc->id.manufacturer = fru_get_board_manufacturer(h);
+	fmc->id.product_name = fru_get_product_name(h);
+	pr_info("      Manufacturer: %s\n", fmc->id.manufacturer);
+	pr_info("      Product name: %s\n", fmc->id.product_name);
+
+	/* Create the short name (FIXME: look in sdb as well) */
+	fmc->mezzanine_name = kstrdup(fmc->id.product_name, GFP_KERNEL);
+
+out:
+	/* Only drop the eeprom buffer if it was allocated in this function */
+	if (allocated) {
+		kfree(fmc->eeprom);
+		fmc->eeprom = NULL;
+	}
+	return 0; /* no error: let other identification work */
+}
+
+/*
+ * Release the strings set up by fmc_fill_id_info(): the FRU fields
+ * come from fru_alloc() above and the short name from kstrdup().
+ */
+void fmc_free_id_info(struct fmc_device *fmc)
+{
+	kfree(fmc->id.product_name);
+	kfree(fmc->id.manufacturer);
+	kfree(fmc->mezzanine_name);
+}

diff --git a/drivers/fmc/fmc-sdb.c b/drivers/fmc/fmc-sdb.c
new file mode 100644
index 0000000..79adc39
--- /dev/null
+++ b/drivers/fmc/fmc-sdb.c

@@ -0,0 +1,266 @@
+/*
+ * Copyright (C) 2012 CERN (www.cern.ch)
+ * Author: Alessandro Rubini <rubini@gnudd.com>
+ *
+ * Released according to the GNU GPL, version 2 or any later version.
+ *
+ * This work is part of the White Rabbit project, a research effort led
+ * by CERN, the European Institute for Nuclear Research.
+ */
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/fmc.h>
+#include <linux/sdb.h>
+#include <linux/err.h>
+#include <linux/fmc-sdb.h>
+#include <asm/byteorder.h>
+
+/* Read one 32-bit word of SDB data, byte-swapping it when @convert is set */
+static uint32_t __sdb_rd(struct fmc_device *fmc, unsigned long address,
+			int convert)
+{
+	uint32_t raw = fmc_readl(fmc, address);
+
+	return convert ? __be32_to_cpu(raw) : raw;
+}
+
+/*
+ * Read the SDB array found at @sdb_addr into a newly allocated
+ * sdb_array, recursing into every bridge record. @reg_base is added to
+ * the addresses stored in bridge records; @level is the nesting depth.
+ *
+ * Returns the array, ERR_PTR(-ENOENT) when the SDB magic is not found
+ * at @sdb_addr, or ERR_PTR(-ENOMEM). A failure while scanning a child
+ * is not fatal: the error pointer is stored in subtree[] for that slot.
+ */
+static struct sdb_array *__fmc_scan_sdb_tree(struct fmc_device *fmc,
+					     unsigned long sdb_addr,
+					     unsigned long reg_base, int level)
+{
+	uint32_t onew;
+	int i, j, n, convert = 0;
+	struct sdb_array *arr, *sub;
+
+	onew = fmc_readl(fmc, sdb_addr);
+	if (onew == SDB_MAGIC) {
+		/* Uh! If we are little-endian, we must convert */
+		if (SDB_MAGIC != __be32_to_cpu(SDB_MAGIC))
+			convert = 1;
+	} else if (onew == __be32_to_cpu(SDB_MAGIC)) {
+		/* ok, don't convert */
+	} else {
+		return ERR_PTR(-ENOENT);
+	}
+	/* So, the magic was there: get the count from offset 4*/
+	onew = __sdb_rd(fmc, sdb_addr + 4, convert);
+	n = __be16_to_cpu(*(uint16_t *)&onew);
+	arr = kzalloc(sizeof(*arr), GFP_KERNEL);
+	if (!arr)
+		return ERR_PTR(-ENOMEM);
+	/* kcalloc checks the count * size multiplication for overflow */
+	arr->record = kcalloc(n, sizeof(arr->record[0]), GFP_KERNEL);
+	arr->subtree = kcalloc(n, sizeof(arr->subtree[0]), GFP_KERNEL);
+	if (!arr->record || !arr->subtree) {
+		kfree(arr->record);
+		kfree(arr->subtree);
+		kfree(arr);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	arr->len = n;
+	arr->level = level;
+	arr->fmc = fmc;
+	for (i = 0; i < n; i++) {
+		union  sdb_record *r;
+
+		/* Copy the record one 32-bit word at a time; records are 64 bytes apart */
+		for (j = 0; j < sizeof(arr->record[0]); j += 4) {
+			*(uint32_t *)((void *)(arr->record + i) + j) =
+				__sdb_rd(fmc, sdb_addr + (i * 64) + j, convert);
+		}
+		r = &arr->record[i];
+		/* Default for records that are not bridges */
+		arr->subtree[i] = ERR_PTR(-ENODEV);
+		if (r->empty.record_type == sdb_type_bridge) {
+			struct sdb_component *c = &r->bridge.sdb_component;
+			uint64_t subaddr = __be64_to_cpu(r->bridge.sdb_child);
+			uint64_t newbase = __be64_to_cpu(c->addr_first);
+
+			subaddr += reg_base;
+			newbase += reg_base;
+			sub = __fmc_scan_sdb_tree(fmc, subaddr, newbase,
+						  level + 1);
+			arr->subtree[i] = sub; /* may be error */
+			if (IS_ERR(sub))
+				continue;
+			sub->parent = arr;
+			sub->baseaddr = newbase;
+		}
+	}
+	return arr;
+}
+
+/*
+ * Scan the SDB tree rooted at @address and attach it to @fmc.
+ * Fails with -EBUSY when a tree is already attached, otherwise with
+ * the error reported by the scan. Returns 0 on success.
+ */
+int fmc_scan_sdb_tree(struct fmc_device *fmc, unsigned long address)
+{
+	struct sdb_array *tree;
+
+	if (fmc->sdb)
+		return -EBUSY;
+
+	tree = __fmc_scan_sdb_tree(fmc, address, 0 /* regs */, 0);
+	if (IS_ERR(tree))
+		return PTR_ERR(tree);
+
+	fmc->sdb = tree;
+	return 0;
+}
+EXPORT_SYMBOL(fmc_scan_sdb_tree);
+
+/* Free an sdb_array and, recursively, every valid subtree below it */
+static void __fmc_sdb_free(struct sdb_array *arr)
+{
+	int idx;
+
+	if (!arr)
+		return;
+
+	/* Slots holding an error pointer have no subtree to free */
+	for (idx = 0; idx < arr->len; idx++) {
+		if (!IS_ERR(arr->subtree[idx]))
+			__fmc_sdb_free(arr->subtree[idx]);
+	}
+	kfree(arr->record);
+	kfree(arr->subtree);
+	kfree(arr);
+}
+
+/*
+ * Free the SDB tree attached to @fmc, if any, and clear the pointer so
+ * that a new scan is possible. Always returns 0.
+ */
+int fmc_free_sdb_tree(struct fmc_device *fmc)
+{
+	__fmc_sdb_free(fmc->sdb);
+	fmc->sdb = NULL;
+	return 0;
+}
+EXPORT_SYMBOL(fmc_free_sdb_tree);
+
+/*
+ * This helper calls reprogram and initializes sdb as well.
+ * A negative @sdb_entry means no SDB scan is wanted: the result of the
+ * carrier's reprogram operation is then returned unchanged.
+ */
+int fmc_reprogram(struct fmc_device *fmc, struct fmc_driver *d, char *gw,
+			 int sdb_entry)
+{
+	int err = fmc->op->reprogram(fmc, d, gw);
+
+	/* Stop here on failure, or when there is no SDB to look for */
+	if (err < 0 || sdb_entry < 0)
+		return err;
+
+	/* We are required to find SDB at a given offset */
+	if (fmc_scan_sdb_tree(fmc, sdb_entry) < 0) {
+		dev_err(&fmc->dev, "Can't find SDB at address 0x%x\n",
+			sdb_entry);
+		return -ENODEV;
+	}
+	fmc_dump_sdb(fmc);
+	return 0;
+}
+EXPORT_SYMBOL(fmc_reprogram);
+
+/*
+ * Print one line per record of @arr to the kernel log, indented by the
+ * nesting level of the array, and recurse into valid bridge subtrees.
+ */
+static void __fmc_show_sdb_tree(const struct fmc_device *fmc,
+				const struct sdb_array *arr)
+{
+	int i, j, n = arr->len, level = arr->level;
+	const struct sdb_array *ap;
+
+	for (i = 0; i < n; i++) {
+		unsigned long base;
+		union  sdb_record *r;
+		struct sdb_product *p;
+		struct sdb_component *c;
+		r = &arr->record[i];
+		c = &r->dev.sdb_component;
+		p = &c->product;
+		/* Sum the base addresses of this array and all its parents */
+		base = 0;
+		for (ap = arr; ap; ap = ap->parent)
+			base += ap->baseaddr;
+		dev_info(&fmc->dev, "SDB: ");
+
+		/* Three spaces of indentation per nesting level */
+		for (j = 0; j < level; j++)
+			printk(KERN_CONT "   ");
+		switch (r->empty.record_type) {
+		case sdb_type_interconnect:
+			printk(KERN_CONT "%08llx:%08x %.19s\n",
+			       __be64_to_cpu(p->vendor_id),
+			       __be32_to_cpu(p->device_id),
+			       p->name);
+			break;
+		case sdb_type_device:
+			printk(KERN_CONT "%08llx:%08x %.19s (%08llx-%08llx)\n",
+			       __be64_to_cpu(p->vendor_id),
+			       __be32_to_cpu(p->device_id),
+			       p->name,
+			       __be64_to_cpu(c->addr_first) + base,
+			       __be64_to_cpu(c->addr_last) + base);
+			break;
+		case sdb_type_bridge:
+			printk(KERN_CONT "%08llx:%08x %.19s (bridge: %08llx)\n",
+			       __be64_to_cpu(p->vendor_id),
+			       __be32_to_cpu(p->device_id),
+			       p->name,
+			       __be64_to_cpu(c->addr_first) + base);
+			/* The scan stores an error pointer for a failed child */
+			if (IS_ERR(arr->subtree[i])) {
+				printk(KERN_CONT "(bridge error %li)\n",
+				       PTR_ERR(arr->subtree[i]));
+				break;
+			}
+			__fmc_show_sdb_tree(fmc, arr->subtree[i]);
+			break;
+		case sdb_type_integration:
+			printk(KERN_CONT "integration\n");
+			break;
+		case sdb_type_repo_url:
+			printk(KERN_CONT "repo-url\n");
+			break;
+		case sdb_type_synthesis:
+			printk(KERN_CONT "synthesis-info\n");
+			break;
+		case sdb_type_empty:
+			printk(KERN_CONT "empty\n");
+			break;
+		default:
+			printk(KERN_CONT "UNKNOWN TYPE 0x%02x\n",
+			       r->empty.record_type);
+			break;
+		}
+	}
+}
+
+/* Print the SDB tree attached to @fmc; does nothing when there is none */
+void fmc_show_sdb_tree(const struct fmc_device *fmc)
+{
+	if (fmc->sdb)
+		__fmc_show_sdb_tree(fmc, fmc->sdb);
+}
+EXPORT_SYMBOL(fmc_show_sdb_tree);
+
+/*
+ * Look in @tree, and recursively in its valid subtrees, for a device
+ * record whose vendor id is @vid and device id is @did.
+ *
+ * Returns the first address of the device plus the base address of
+ * each array traversed, or -ENODEV when no record matches. When @sz is
+ * not NULL it receives the size of the address range of the device.
+ */
+signed long fmc_find_sdb_device(struct sdb_array *tree,
+				uint64_t vid, uint32_t did, unsigned long *sz)
+{
+	signed long res = -ENODEV;
+	union  sdb_record *r;
+	struct sdb_product *p;
+	struct sdb_component *c;
+	int i, n = tree->len;
+	uint64_t last, first;
+
+	/* FIXME: what if the first interconnect is not at zero? */
+	for (i = 0; i < n; i++) {
+		r = &tree->record[i];
+		c = &r->dev.sdb_component;
+		p = &c->product;
+
+		/* Search below this record first, if it has a subtree */
+		if (!IS_ERR(tree->subtree[i]))
+			res = fmc_find_sdb_device(tree->subtree[i],
+						  vid, did, sz);
+		/* A hit returns at once, so res is negative in later passes */
+		if (res >= 0)
+			return res + tree->baseaddr;
+		if (r->empty.record_type != sdb_type_device)
+			continue;
+		if (__be64_to_cpu(p->vendor_id) != vid)
+			continue;
+		if (__be32_to_cpu(p->device_id) != did)
+			continue;
+		/* found */
+		last = __be64_to_cpu(c->addr_last);
+		first = __be64_to_cpu(c->addr_first);
+		if (sz)
+			*sz = (typeof(*sz))(last + 1 - first);
+		return first + tree->baseaddr;
+	}
+	return res;
+}
+EXPORT_SYMBOL(fmc_find_sdb_device);

diff --git a/drivers/fmc/fmc-trivial.c b/drivers/fmc/fmc-trivial.c
new file mode 100644
index 0000000..6c590f5
--- /dev/null
+++ b/drivers/fmc/fmc-trivial.c

@@ -0,0 +1,107 @@
+/*
+ * Copyright (C) 2012 CERN (www.cern.ch)
+ * Author: Alessandro Rubini <rubini@gnudd.com>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * The software is provided "as is"; the copyright holders disclaim
+ * all warranties and liabilities, to the extent permitted by
+ * applicable law.
+ */
+
+/* A trivial fmc driver that can load a gateware file and reports interrupts */
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/gpio.h>
+#include <linux/fmc.h>
+
+static struct fmc_driver t_drv; /* initialized later */
+
+/*
+ * Interrupt handler: acknowledge the interrupt through the carrier and
+ * log it. @dev_id is the fmc device.
+ */
+static irqreturn_t t_handler(int irq, void *dev_id)
+{
+	struct fmc_device *fmc = dev_id;
+
+	fmc->op->irq_ack(fmc);
+	dev_info(&fmc->dev, "received irq %i\n", irq);
+	return IRQ_HANDLED;
+}
+
+/*
+ * GPIO setup handed to the carrier by t_probe(): IRQ lines 0 and 1,
+ * both as inputs with rising-edge trigger.
+ */
+static struct fmc_gpio t_gpio[] = {
+	{
+		.gpio = FMC_GPIO_IRQ(0),
+		.mode = GPIOF_DIR_IN,
+		.irqmode = IRQF_TRIGGER_RISING,
+	}, {
+		.gpio = FMC_GPIO_IRQ(1),
+		.mode = GPIOF_DIR_IN,
+		.irqmode = IRQF_TRIGGER_RISING,
+	}
+};
+
+/*
+ * Probe: validate the device when the carrier can do that, request the
+ * interrupt, configure the GPIO lines and reprogram the gateware if the
+ * carrier supports it. Returns 0 on success or a negative error code.
+ */
+static int t_probe(struct fmc_device *fmc)
+{
+	int ret;
+	int index = 0;
+
+	if (fmc->op->validate)
+		index = fmc->op->validate(fmc, &t_drv);
+	if (index < 0)
+		return -EINVAL; /* not our device: invalid */
+
+	ret = fmc->op->irq_request(fmc, t_handler, "fmc-trivial", IRQF_SHARED);
+	if (ret < 0)
+		return ret;
+	/*
+	 * ignore error code of call below, we really don't care.
+	 * The operation is optional like validate and reprogram, so a
+	 * carrier that does not provide it is simply skipped.
+	 */
+	if (fmc->op->gpio_config)
+		fmc->op->gpio_config(fmc, t_gpio, ARRAY_SIZE(t_gpio));
+
+	/* Reprogram, if asked to. ESRCH == no filename specified */
+	ret = -ESRCH;
+	if (fmc->op->reprogram)
+		ret = fmc->op->reprogram(fmc, &t_drv, "");
+	if (ret == -ESRCH)
+		ret = 0;
+	if (ret < 0)
+		fmc->op->irq_free(fmc);
+
+	/* FIXME: reprogram LM32 too */
+	return ret;
+}
+
+/* Remove: give back the interrupt requested by t_probe() */
+static int t_remove(struct fmc_device *fmc)
+{
+	fmc->op->irq_free(fmc);
+	return 0;
+}
+
+/* The driver structure, forward-declared at the top of the file */
+static struct fmc_driver t_drv = {
+	.version = FMC_VERSION,
+	.driver.name = KBUILD_MODNAME,
+	.probe = t_probe,
+	.remove = t_remove,
+	/* no table, as the current match just matches everything */
+};
+
+ /* We accept the generic parameters */
+FMC_PARAM_BUSID(t_drv);
+FMC_PARAM_GATEWARE(t_drv);
+
+/* Module init: register the driver with the fmc bus */
+static int t_init(void)
+{
+	return fmc_driver_register(&t_drv);
+}
+
+/* Module exit: unregister the driver from the fmc bus */
+static void t_exit(void)
+{
+	fmc_driver_unregister(&t_drv);
+}
+
+module_init(t_init);
+module_exit(t_exit);
+
+MODULE_LICENSE("Dual BSD/GPL");

diff --git a/drivers/fmc/fmc-write-eeprom.c b/drivers/fmc/fmc-write-eeprom.c
new file mode 100644
index 0000000..2cc680d
--- /dev/null
+++ b/drivers/fmc/fmc-write-eeprom.c

@@ -0,0 +1,176 @@
+/*
+ * Copyright (C) 2012 CERN (www.cern.ch)
+ * Author: Alessandro Rubini <rubini@gnudd.com>
+ *
+ * Released according to the GNU GPL, version 2 or any later version.
+ *
+ * This work is part of the White Rabbit project, a research effort led
+ * by CERN, the European Institute for Nuclear Research.
+ */
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/firmware.h>
+#include <linux/init.h>
+#include <linux/fmc.h>
+#include <asm/unaligned.h>
+
+/*
+ * This module uses the firmware loader to program the whole or part
+ * of the FMC eeprom. The meat is in the _run functions.  However, no
+ * default file name is provided, to avoid accidental mishaps. Also,
+ * you must pass the busid argument
+ */
+static struct fmc_driver fwe_drv;
+
+FMC_PARAM_BUSID(fwe_drv);
+
+/*
+ * The "file=" is like the generic "gateware=" used elsewhere.
+ * Permissions are octal: a decimal 444 would be mode 0674.
+ */
+static char *fwe_file[FMC_MAX_CARDS];
+static int fwe_file_n;
+module_param_array_named(file, fwe_file, charp, &fwe_file_n, 0444);
+
+/*
+ * Walk a ".tlv" image: a sequence of records, each made of the byte
+ * 'w', a 16-bit little-endian address, a 16-bit little-endian length
+ * and that many data bytes.
+ *
+ * With @write == 0 the records are only checked; with @write != 0 each
+ * one is also written to the eeprom. Returns 0 on success, -EINVAL on
+ * a malformed record, or the error returned by write_ee.
+ */
+static int fwe_run_tlv(struct fmc_device *fmc, const struct firmware *fw,
+	int write)
+{
+	const uint8_t *p = fw->data;
+	int len = fw->size;
+	uint16_t thislen, thisaddr;
+	int err;
+
+	/* format is: 'w' addr16 len16 data... */
+	while (len > 5) {
+		thisaddr = get_unaligned_le16(p+1);
+		thislen = get_unaligned_le16(p+3);
+		/* The record must be a 'w' and fit in what is left */
+		if (p[0] != 'w' || thislen + 5 > len) {
+			dev_err(&fmc->dev, "invalid tlv at offset %ti\n",
+				p - fw->data);
+			return -EINVAL;
+		}
+		err = 0;
+		if (write) {
+			dev_info(&fmc->dev, "write %i bytes at 0x%04x\n",
+				 thislen, thisaddr);
+			err = fmc->op->write_ee(fmc, thisaddr, p + 5, thislen);
+		}
+		if (err < 0) {
+			dev_err(&fmc->dev, "write failure @0x%04x\n",
+				thisaddr);
+			return err;
+		}
+		/* Skip the 5-byte header and the data of this record */
+		p += 5 + thislen;
+		len -= 5 + thislen;
+	}
+	if (write)
+		dev_info(&fmc->dev, "write_eeprom: success\n");
+	return 0;
+}
+
+/*
+ * Write a ".bin" image as a whole, starting at eeprom offset 0.
+ * Returns 0 on success or the negative value returned by write_ee.
+ */
+static int fwe_run_bin(struct fmc_device *fmc, const struct firmware *fw)
+{
+	int written;
+
+	dev_info(&fmc->dev, "programming %zi bytes\n", fw->size);
+	written = fmc->op->write_ee(fmc, 0, (void *)fw->data, fw->size);
+	if (written >= 0) {
+		dev_info(&fmc->dev, "write_eeprom: success\n");
+		return 0;
+	}
+	dev_info(&fmc->dev, "write_eeprom: error %i\n", written);
+	return written;
+}
+
+/*
+ * Program the eeprom from @fw, choosing the format from the last four
+ * characters of the file name @s: ".bin" is written as a whole, ".tlv"
+ * is first checked and then written record by record. Any other name,
+ * including one shorter than four characters, is refused with -EINVAL.
+ */
+static int fwe_run(struct fmc_device *fmc, const struct firmware *fw, char *s)
+{
+	size_t slen = strlen(s);
+	/* Never point before the start of a name shorter than the suffix */
+	const char *last4 = slen >= 4 ? s + slen - 4 : "";
+	int err;
+
+	if (!strcmp(last4, ".bin"))
+		return fwe_run_bin(fmc, fw);
+	if (!strcmp(last4, ".tlv")) {
+		/* Dry run first, so a malformed file writes nothing */
+		err = fwe_run_tlv(fmc, fw, 0);
+		if (!err)
+			err = fwe_run_tlv(fmc, fw, 1);
+		return err;
+	}
+	dev_err(&fmc->dev, "invalid file name \"%s\"\n", s);
+	return -EINVAL;
+}
+
+/*
+ * Programming is done at probe time. Moreover, only those cards listed
+ * with busid= are programmed, each with the file= entry selected by
+ * the index that the carrier's validate operation returns.
+ * Unfortunately, it's difficult to know in advance, when probing the
+ * first card, if others are there.
+ */
+/*
+ * Probe: pick the file name matching this card, load it with the
+ * firmware loader and write it to the eeprom.
+ *
+ * Returns -ENODEV when no busid was passed, the card is refused, or no
+ * file name exists for its index; -ENOENT when that file name is NULL;
+ * the request_firmware() error when loading fails; 0 otherwise.
+ */
+int fwe_probe(struct fmc_device *fmc)
+{
+	int err, index = 0;
+	const struct firmware *fw;
+	struct device *dev = &fmc->dev;
+	char *s;
+
+	if (!fwe_drv.busid_n) {
+		dev_err(dev, "%s: no busid passed, refusing all cards\n",
+			KBUILD_MODNAME);
+		return -ENODEV;
+	}
+	/* The index returned by validate selects the file name below */
+	if (fmc->op->validate)
+		index = fmc->op->validate(fmc, &fwe_drv);
+	if (index < 0) {
+		pr_err("%s: refusing device \"%s\"\n", KBUILD_MODNAME,
+		       dev_name(dev));
+		return -ENODEV;
+	}
+	if (index >= fwe_file_n) {
+		pr_err("%s: no filename for device index %i\n",
+			KBUILD_MODNAME, index);
+		return -ENODEV;
+	}
+	s = fwe_file[index];
+	if (!s) {
+		pr_err("%s: no filename for \"%s\" not programming\n",
+		       KBUILD_MODNAME, dev_name(dev));
+		return -ENOENT;
+	}
+	err = request_firmware(&fw, s, dev);
+	if (err < 0) {
+		dev_err(&fmc->dev, "request firmware \"%s\": error %i\n",
+			s, err);
+		return err;
+	}
+	/* NOTE(review): the result of fwe_run() is dropped, so probe succeeds even if programming failed; confirm this is intended */
+	fwe_run(fmc, fw, s);
+	release_firmware(fw);
+	return 0;
+}
+
+/* Remove: nothing to undo, all the work was done at probe time */
+int fwe_remove(struct fmc_device *fmc)
+{
+	return 0;
+}
+
+/* The driver structure, forward-declared at the top of the file */
+static struct fmc_driver fwe_drv = {
+	.version = FMC_VERSION,
+	.driver.name = KBUILD_MODNAME,
+	.probe = fwe_probe,
+	.remove = fwe_remove,
+	/* no table, as the current match just matches everything */
+};
+
+/* Module init: register the driver with the fmc bus */
+static int fwe_init(void)
+{
+	return fmc_driver_register(&fwe_drv);
+}
+
+/* Module exit: unregister the driver from the fmc bus */
+static void fwe_exit(void)
+{
+	fmc_driver_unregister(&fwe_drv);
+}
+
+module_init(fwe_init);
+module_exit(fwe_exit);
+
+MODULE_LICENSE("GPL");

diff --git a/drivers/fmc/fru-parse.c b/drivers/fmc/fru-parse.c
new file mode 100644
index 0000000..cb46263
--- /dev/null
+++ b/drivers/fmc/fru-parse.c

@@ -0,0 +1,82 @@
+/*
+ * Copyright (C) 2012 CERN (www.cern.ch)
+ * Author: Alessandro Rubini <rubini@gnudd.com>
+ *
+ * Released according to the GNU GPL, version 2 or any later version.
+ *
+ * This work is part of the White Rabbit project, a research effort led
+ * by CERN, the European Institute for Nuclear Research.
+ */
+#include <linux/ipmi-fru.h>
+
+/* Some internal helpers */
+/*
+ * Return the type/length field number @nr (counting from 0) of the
+ * board info area, or NULL when the end-of-fields marker comes first.
+ */
+static struct fru_type_length *
+__fru_get_board_tl(struct fru_common_header *header, int nr)
+{
+	struct fru_board_info_area *bia = fru_get_board_area(header);
+	struct fru_type_length *tl;
+
+	for (tl = bia->tl; nr > 0 && !fru_is_eof(tl); nr--)
+		tl = fru_next_tl(tl);
+
+	return fru_is_eof(tl) ? NULL : tl;
+}
+
+/*
+ * Return a newly allocated copy of board field number @nr as a string,
+ * or NULL when the field does not exist or the allocation fails. The
+ * memory comes from fru_alloc() and belongs to the caller.
+ */
+static char *__fru_alloc_get_tl(struct fru_common_header *header, int nr)
+{
+	struct fru_type_length *tl;
+	char *res;
+	int len;
+
+	tl = __fru_get_board_tl(header, nr);
+	if (!tl)
+		return NULL;
+	len = fru_strlen(tl);
+	res = fru_alloc(len + 1); /* one more byte for the terminator */
+	if (!res)
+		return NULL;
+	return fru_strcpy(res, tl);
+}
+
+/* Public checksum verifiers */
+
+/* The header is good when all of its bytes add up to 0, modulo 256 */
+int fru_header_cksum_ok(struct fru_common_header *header)
+{
+	uint8_t *bytes = (void *)header;
+	int pos;
+	int total = 0;
+
+	for (pos = 0; pos < sizeof(*header); pos++)
+		total += bytes[pos];
+	return !(total & 0xff);
+}
+/*
+ * The board info area is good when its bytes add up to 0, modulo 256.
+ * Its length is stored in area_len, in units of 8 bytes.
+ */
+int fru_bia_cksum_ok(struct fru_board_info_area *bia)
+{
+	uint8_t *bytes = (void *)bia;
+	int nbytes = 8 * bia->area_len;
+	int pos;
+	int total = 0;
+
+	for (pos = 0; pos < nbytes; pos++)
+		total += bytes[pos];
+	return !(total & 0xff);
+}
+
+/*
+ * Get various stuff, trivial. Each accessor returns a newly allocated
+ * copy of one board field (0: manufacturer, 1: product name, 2: serial
+ * number, 3: part number), or NULL when the field cannot be returned.
+ */
+char *fru_get_board_manufacturer(struct fru_common_header *header)
+{
+	return __fru_alloc_get_tl(header, 0);
+}
+char *fru_get_product_name(struct fru_common_header *header)
+{
+	return __fru_alloc_get_tl(header, 1);
+}
+char *fru_get_serial_number(struct fru_common_header *header)
+{
+	return __fru_alloc_get_tl(header, 2);
+}
+char *fru_get_part_number(struct fru_common_header *header)
+{
+	return __fru_alloc_get_tl(header, 3);
+}

diff --git a/drivers/gpio/gpio-omap.c b/drivers/gpio/gpio-omap.c
index d3f7d2d..4a43036 100644
--- a/drivers/gpio/gpio-omap.c
+++ b/drivers/gpio/gpio-omap.c

@@ -1094,6 +1094,9 @@
 	const struct omap_gpio_platform_data *pdata;
 	struct resource *res;
 	struct gpio_bank *bank;
+#ifdef CONFIG_ARCH_OMAP1
+	int irq_base;
+#endif
 
 	match = of_match_device(of_match_ptr(omap_gpio_match), dev);
 
@@ -1135,11 +1138,28 @@
 				pdata->get_context_loss_count;
 	}
 
+#ifdef CONFIG_ARCH_OMAP1
+	/*
+	 * REVISIT: Once we have OMAP1 supporting SPARSE_IRQ, we can drop
+	 * irq_alloc_descs() and irq_domain_add_legacy() and just use a
+	 * linear IRQ domain mapping for all OMAP platforms.
+	 */
+	irq_base = irq_alloc_descs(-1, 0, bank->width, 0);
+	if (irq_base < 0) {
+		dev_err(dev, "Couldn't allocate IRQ numbers\n");
+		return -ENODEV;
+	}
 
+	bank->domain = irq_domain_add_legacy(node, bank->width, irq_base,
+					     0, &irq_domain_simple_ops, NULL);
+#else
 	bank->domain = irq_domain_add_linear(node, bank->width,
 					     &irq_domain_simple_ops, NULL);
-	if (!bank->domain)
+#endif
+	if (!bank->domain) {
+		dev_err(dev, "Couldn't register an IRQ domain\n");
 		return -ENODEV;
+	}
 
 	if (bank->regs->set_dataout && bank->regs->clr_dataout)
 		bank->set_dataout = _set_gpio_dataout_reg;

diff --git a/drivers/gpu/drm/drm_vm.c b/drivers/gpu/drm/drm_vm.c
index 1d4f7c9..67969e2 100644
--- a/drivers/gpu/drm/drm_vm.c
+++ b/drivers/gpu/drm/drm_vm.c

@@ -617,7 +617,6 @@
 	case _DRM_FRAME_BUFFER:
 	case _DRM_REGISTERS:
 		offset = drm_core_get_reg_ofs(dev);
-		vma->vm_flags |= VM_IO;	/* not in core dump */
 		vma->vm_page_prot = drm_io_prot(map->type, vma);
 		if (io_remap_pfn_range(vma, vma->vm_start,
 				       (map->offset + offset) >> PAGE_SHIFT,

diff --git a/drivers/gpu/drm/i810/i810_dma.c b/drivers/gpu/drm/i810/i810_dma.c
index 004ecdf..ada49ed 100644
--- a/drivers/gpu/drm/i810/i810_dma.c
+++ b/drivers/gpu/drm/i810/i810_dma.c

@@ -97,7 +97,7 @@
 	buf = dev_priv->mmap_buffer;
 	buf_priv = buf->dev_private;
 
-	vma->vm_flags |= (VM_IO | VM_DONTCOPY);
+	vma->vm_flags |= VM_DONTCOPY;
 
 	buf_priv->currently_mapped = I810_BUF_MAPPED;
 

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index b9d00dc..9669a0b 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h

@@ -1697,6 +1697,8 @@
 struct dma_buf *i915_gem_prime_export(struct drm_device *dev,
 				struct drm_gem_object *gem_obj, int flags);
 
+void i915_gem_restore_fences(struct drm_device *dev);
+
 /* i915_gem_context.c */
 void i915_gem_context_init(struct drm_device *dev);
 void i915_gem_context_fini(struct drm_device *dev);

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 970ad17..9e35daf 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c

@@ -1801,7 +1801,14 @@
 			gfp |= __GFP_NORETRY | __GFP_NOWARN | __GFP_NO_KSWAPD;
 			gfp &= ~(__GFP_IO | __GFP_WAIT);
 		}
-
+#ifdef CONFIG_SWIOTLB
+		if (swiotlb_nr_tbl()) {
+			st->nents++;
+			sg_set_page(sg, page, PAGE_SIZE, 0);
+			sg = sg_next(sg);
+			continue;
+		}
+#endif
 		if (!i || page_to_pfn(page) != last_pfn + 1) {
 			if (i)
 				sg = sg_next(sg);
@@ -1812,8 +1819,10 @@
 		}
 		last_pfn = page_to_pfn(page);
 	}
-
-	sg_mark_end(sg);
+#ifdef CONFIG_SWIOTLB
+	if (!swiotlb_nr_tbl())
+#endif
+		sg_mark_end(sg);
 	obj->pages = st;
 
 	if (i915_gem_object_needs_bit17_swizzle(obj))
@@ -2117,25 +2126,15 @@
 	}
 }
 
-static void i915_gem_reset_fences(struct drm_device *dev)
+void i915_gem_restore_fences(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	int i;
 
 	for (i = 0; i < dev_priv->num_fence_regs; i++) {
 		struct drm_i915_fence_reg *reg = &dev_priv->fence_regs[i];
-
-		if (reg->obj)
-			i915_gem_object_fence_lost(reg->obj);
-
-		i915_gem_write_fence(dev, i, NULL);
-
-		reg->pin_count = 0;
-		reg->obj = NULL;
-		INIT_LIST_HEAD(&reg->lru_list);
+		i915_gem_write_fence(dev, i, reg->obj);
 	}
-
-	INIT_LIST_HEAD(&dev_priv->mm.fence_list);
 }
 
 void i915_gem_reset(struct drm_device *dev)
@@ -2158,8 +2157,7 @@
 		obj->base.read_domains &= ~I915_GEM_GPU_DOMAINS;
 	}
 
-	/* The fence registers are invalidated so clear them out */
-	i915_gem_reset_fences(dev);
+	i915_gem_restore_fences(dev);
 }
 
 /**
@@ -3865,8 +3863,6 @@
 	if (!drm_core_check_feature(dev, DRIVER_MODESET))
 		i915_gem_evict_everything(dev);
 
-	i915_gem_reset_fences(dev);
-
 	/* Hack!  Don't let anybody do execbuf while we don't control the chip.
 	 * We need to replace this with a semaphore, or something.
 	 * And not confound mm.suspended!
@@ -4193,7 +4189,8 @@
 		dev_priv->num_fence_regs = 8;
 
 	/* Initialize fence registers to zero */
-	i915_gem_reset_fences(dev);
+	INIT_LIST_HEAD(&dev_priv->mm.fence_list);
+	i915_gem_restore_fences(dev);
 
 	i915_gem_detect_bit_6_swizzle(dev);
 	init_waitqueue_head(&dev_priv->pending_flip_queue);

diff --git a/drivers/gpu/drm/i915/i915_suspend.c b/drivers/gpu/drm/i915/i915_suspend.c
index 41f0fde..369b3d8 100644
--- a/drivers/gpu/drm/i915/i915_suspend.c
+++ b/drivers/gpu/drm/i915/i915_suspend.c

@@ -384,6 +384,7 @@
 
 	mutex_lock(&dev->struct_mutex);
 
+	i915_gem_restore_fences(dev);
 	i915_restore_display(dev);
 
 	if (!drm_core_check_feature(dev, DRIVER_MODESET)) {

diff --git a/drivers/gpu/drm/qxl/qxl_ioctl.c b/drivers/gpu/drm/qxl/qxl_ioctl.c
index a4b71b2..a30f294 100644
--- a/drivers/gpu/drm/qxl/qxl_ioctl.c
+++ b/drivers/gpu/drm/qxl/qxl_ioctl.c

@@ -171,6 +171,11 @@
 		if (user_cmd.command_size > PAGE_SIZE - sizeof(union qxl_release_info))
 			return -EINVAL;
 
+		if (!access_ok(VERIFY_READ,
+			       (void *)(unsigned long)user_cmd.command,
+			       user_cmd.command_size))
+			return -EFAULT;
+
 		ret = qxl_alloc_release_reserved(qdev,
 						 sizeof(union qxl_release_info) +
 						 user_cmd.command_size,

diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c
index 0b122f8..6de6c98 100644
--- a/drivers/hv/channel.c
+++ b/drivers/hv/channel.c

@@ -116,6 +116,15 @@
 	unsigned long flags;
 	int ret, t, err = 0;
 
+	spin_lock_irqsave(&newchannel->sc_lock, flags);
+	if (newchannel->state == CHANNEL_OPEN_STATE) {
+		newchannel->state = CHANNEL_OPENING_STATE;
+	} else {
+		spin_unlock_irqrestore(&newchannel->sc_lock, flags);
+		return -EINVAL;
+	}
+	spin_unlock_irqrestore(&newchannel->sc_lock, flags);
+
 	newchannel->onchannel_callback = onchannelcallback;
 	newchannel->channel_callback_context = context;
 
@@ -216,6 +225,9 @@
 	list_del(&open_info->msglistentry);
 	spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags);
 
+	if (err == 0)
+		newchannel->state = CHANNEL_OPENED_STATE;
+
 	kfree(open_info);
 	return err;
 
@@ -500,15 +512,14 @@
 }
 EXPORT_SYMBOL_GPL(vmbus_teardown_gpadl);
 
-/*
- * vmbus_close - Close the specified channel
- */
-void vmbus_close(struct vmbus_channel *channel)
+static void vmbus_close_internal(struct vmbus_channel *channel)
 {
 	struct vmbus_channel_close_channel *msg;
 	int ret;
 	unsigned long flags;
 
+	channel->state = CHANNEL_OPEN_STATE;
+	channel->sc_creation_callback = NULL;
 	/* Stop callback and cancel the timer asap */
 	spin_lock_irqsave(&channel->inbound_lock, flags);
 	channel->onchannel_callback = NULL;
@@ -538,6 +549,37 @@
 
 
 }
+
+/*
+ * vmbus_close - Close the specified channel
+ *
+ * Only a primary channel can be closed this way: its opened
+ * sub-channels are closed first, then the primary itself.
+ */
+void vmbus_close(struct vmbus_channel *channel)
+{
+	struct vmbus_channel *sub, *next;
+
+	/* We will only close sub-channels when the primary is closed. */
+	if (channel->primary_channel != NULL)
+		return;
+
+	/* Close all the opened sub-channels first ... */
+	list_for_each_entry_safe(sub, next, &channel->sc_list, sc_list) {
+		if (sub->state == CHANNEL_OPENED_STATE)
+			vmbus_close_internal(sub);
+	}
+
+	/* ... and now close the primary. */
+	vmbus_close_internal(channel);
+}
 EXPORT_SYMBOL_GPL(vmbus_close);
 
 /**

diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
index 21ef689..0df7590 100644
--- a/drivers/hv/channel_mgmt.c
+++ b/drivers/hv/channel_mgmt.c

@@ -115,6 +115,9 @@
 		return NULL;
 
 	spin_lock_init(&channel->inbound_lock);
+	spin_lock_init(&channel->sc_lock);
+
+	INIT_LIST_HEAD(&channel->sc_list);
 
 	channel->controlwq = create_workqueue("hv_vmbus_ctl");
 	if (!channel->controlwq) {
@@ -166,6 +169,7 @@
 						     struct vmbus_channel,
 						     work);
 	unsigned long flags;
+	struct vmbus_channel *primary_channel;
 	struct vmbus_channel_relid_released msg;
 
 	vmbus_device_unregister(channel->device_obj);
@@ -174,9 +178,16 @@
 	msg.header.msgtype = CHANNELMSG_RELID_RELEASED;
 	vmbus_post_msg(&msg, sizeof(struct vmbus_channel_relid_released));
 
-	spin_lock_irqsave(&vmbus_connection.channel_lock, flags);
-	list_del(&channel->listentry);
-	spin_unlock_irqrestore(&vmbus_connection.channel_lock, flags);
+	if (channel->primary_channel == NULL) {
+		spin_lock_irqsave(&vmbus_connection.channel_lock, flags);
+		list_del(&channel->listentry);
+		spin_unlock_irqrestore(&vmbus_connection.channel_lock, flags);
+	} else {
+		primary_channel = channel->primary_channel;
+		spin_lock_irqsave(&primary_channel->sc_lock, flags);
+		list_del(&channel->listentry);
+		spin_unlock_irqrestore(&primary_channel->sc_lock, flags);
+	}
 	free_channel(channel);
 }
 
@@ -228,6 +239,24 @@
 	spin_unlock_irqrestore(&vmbus_connection.channel_lock, flags);
 
 	if (!fnew) {
+		/*
+		 * Check to see if this is a sub-channel.
+		 */
+		if (newchannel->offermsg.offer.sub_channel_index != 0) {
+			/*
+			 * Process the sub-channel.
+			 */
+			newchannel->primary_channel = channel;
+			spin_lock_irqsave(&channel->sc_lock, flags);
+			list_add_tail(&newchannel->sc_list, &channel->sc_list);
+			spin_unlock_irqrestore(&channel->sc_lock, flags);
+			newchannel->state = CHANNEL_OPEN_STATE;
+			if (channel->sc_creation_callback != NULL)
+				channel->sc_creation_callback(newchannel);
+
+			return;
+		}
+
 		free_channel(newchannel);
 		return;
 	}
@@ -685,4 +714,86 @@
 	return ret;
 }
 
-/* eof */
+/*
+ * Retrieve the (sub) channel on which to send an outgoing request.
+ * When a primary channel has multiple sub-channels, we choose a
+ * channel whose VCPU binding is closest to the VCPU on which
+ * this call is being made.
+ *
+ * @primary: primary channel whose sc_list is searched.
+ *
+ * Returns @primary itself when it has no sub-channels or when no opened
+ * sub-channel is at least as close as the primary; otherwise returns the
+ * selected sub-channel. "Closest" is the absolute difference between a
+ * channel's target_vp and the vp_index of the current processor.
+ */
+struct vmbus_channel *vmbus_get_outgoing_channel(struct vmbus_channel *primary)
+{
+	struct list_head *cur, *tmp;
+	int cur_cpu = hv_context.vp_index[smp_processor_id()];
+	struct vmbus_channel *cur_channel;
+	struct vmbus_channel *outgoing_channel = primary;
+	int cpu_distance, new_cpu_distance;
+
+	if (list_empty(&primary->sc_list))
+		return outgoing_channel;
+
+	list_for_each_safe(cur, tmp, &primary->sc_list) {
+		cur_channel = list_entry(cur, struct vmbus_channel, sc_list);
+		/* Only sub-channels that finished opening are candidates. */
+		if (cur_channel->state != CHANNEL_OPENED_STATE)
+			continue;
+
+		/* Exact match: nothing can be closer, stop searching. */
+		if (cur_channel->target_vp == cur_cpu)
+			return cur_channel;
+
+		/* Distance of the best candidate found so far. */
+		cpu_distance = ((outgoing_channel->target_vp > cur_cpu) ?
+				(outgoing_channel->target_vp - cur_cpu) :
+				(cur_cpu - outgoing_channel->target_vp));
+
+		/* Distance of the sub-channel under consideration. */
+		new_cpu_distance = ((cur_channel->target_vp > cur_cpu) ?
+				(cur_channel->target_vp - cur_cpu) :
+				(cur_cpu - cur_channel->target_vp));
+
+		/* Keep the current best only if it is strictly closer. */
+		if (cpu_distance < new_cpu_distance)
+			continue;
+
+		outgoing_channel = cur_channel;
+	}
+
+	return outgoing_channel;
+}
+EXPORT_SYMBOL_GPL(vmbus_get_outgoing_channel);
+
+/*
+ * Invoke the primary channel's sc_creation_callback once for every
+ * sub-channel currently on its sc_list, regardless of the sub-channel's
+ * state. Does nothing when no callback is registered.
+ */
+static void invoke_sc_cb(struct vmbus_channel *primary_channel)
+{
+	struct list_head *cur, *tmp;
+	struct vmbus_channel *cur_channel;
+
+	if (primary_channel->sc_creation_callback == NULL)
+		return;
+
+	list_for_each_safe(cur, tmp, &primary_channel->sc_list) {
+		cur_channel = list_entry(cur, struct vmbus_channel, sc_list);
+
+		primary_channel->sc_creation_callback(cur_channel);
+	}
+}
+
+/*
+ * vmbus_set_sc_create_callback - register a sub-channel creation callback
+ * @primary_channel: primary channel the callback is stored on
+ * @sc_cr_cb: function called with each new sub-channel; NULL clears it
+ *
+ * Only stores the pointer in primary_channel->sc_creation_callback; no
+ * locking is done here.
+ */
+void vmbus_set_sc_create_callback(struct vmbus_channel *primary_channel,
+				void (*sc_cr_cb)(struct vmbus_channel *new_sc))
+{
+	primary_channel->sc_creation_callback = sc_cr_cb;
+}
+EXPORT_SYMBOL_GPL(vmbus_set_sc_create_callback);
+
+/*
+ * vmbus_are_subchannels_present - report whether @primary has sub-channels
+ * @primary: primary channel to inspect
+ *
+ * Returns true when primary->sc_list is non-empty. In that case the
+ * registered sub-channel creation callback (if any) is also invoked for
+ * every sub-channel on the list, so this is not a pure query.
+ */
+bool vmbus_are_subchannels_present(struct vmbus_channel *primary)
+{
+	bool ret;
+
+	ret = !list_empty(&primary->sc_list);
+
+	if (ret) {
+		/*
+		 * Invoke the callback on sub-channel creation.
+		 * This will present a uniform interface to the
+		 * clients.
+		 */
+		invoke_sc_cb(primary);
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(vmbus_are_subchannels_present);

diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c
index 253a74b..ec3b8cd 100644
--- a/drivers/hv/connection.c
+++ b/drivers/hv/connection.c

@@ -246,12 +246,26 @@
 	struct vmbus_channel *channel;
 	struct vmbus_channel *found_channel  = NULL;
 	unsigned long flags;
+	struct list_head *cur, *tmp;
+	struct vmbus_channel *cur_sc;
 
 	spin_lock_irqsave(&vmbus_connection.channel_lock, flags);
 	list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
 		if (channel->offermsg.child_relid == relid) {
 			found_channel = channel;
 			break;
+		} else if (!list_empty(&channel->sc_list)) {
+			/*
+			 * Deal with sub-channels.
+			 */
+			list_for_each_safe(cur, tmp, &channel->sc_list) {
+				cur_sc = list_entry(cur, struct vmbus_channel,
+							sc_list);
+				if (cur_sc->offermsg.child_relid == relid) {
+					found_channel = cur_sc;
+					break;
+				}
+			}
 		}
 	}
 	spin_unlock_irqrestore(&vmbus_connection.channel_lock, flags);

diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c
index ae49237..88f4096 100644
--- a/drivers/hv/hv.c
+++ b/drivers/hv/hv.c

@@ -265,6 +265,59 @@
 	return status;
 }
 
+
+/*
+ * hv_synic_alloc - allocate the per-CPU SynIC resources
+ *
+ * For every online CPU this allocates and initialises the event tasklet
+ * (bound to vmbus_on_event with the CPU number as argument) and one zeroed
+ * page each for the SynIC message page and the SynIC event page, storing
+ * them in hv_context.
+ *
+ * Returns 0 on success or -ENOMEM on the first failed allocation.
+ * NOTE(review): nothing already allocated is released on the error path;
+ * the caller is expected to call hv_synic_free().
+ */
+int hv_synic_alloc(void)
+{
+	size_t size = sizeof(struct tasklet_struct);
+	int cpu;
+
+	for_each_online_cpu(cpu) {
+		hv_context.event_dpc[cpu] = kmalloc(size, GFP_ATOMIC);
+		if (hv_context.event_dpc[cpu] == NULL) {
+			pr_err("Unable to allocate event dpc\n");
+			goto err;
+		}
+		tasklet_init(hv_context.event_dpc[cpu], vmbus_on_event, cpu);
+
+		hv_context.synic_message_page[cpu] =
+			(void *)get_zeroed_page(GFP_ATOMIC);
+
+		if (hv_context.synic_message_page[cpu] == NULL) {
+			pr_err("Unable to allocate SYNIC message page\n");
+			goto err;
+		}
+
+		hv_context.synic_event_page[cpu] =
+			(void *)get_zeroed_page(GFP_ATOMIC);
+
+		if (hv_context.synic_event_page[cpu] == NULL) {
+			pr_err("Unable to allocate SYNIC event page\n");
+			goto err;
+		}
+	}
+
+	return 0;
+err:
+	return -ENOMEM;
+}
+
+/*
+ * hv_synic_free_cpu - release the SynIC resources held for one CPU
+ * @cpu: index into the per-CPU arrays of hv_context
+ *
+ * Frees the event tasklet and, when present, the SynIC event page and
+ * the SynIC message page of @cpu. Each page is guarded by a check of its
+ * own pointer. The hv_context slots are not cleared afterwards.
+ */
+void hv_synic_free_cpu(int cpu)
+{
+	kfree(hv_context.event_dpc[cpu]);
+	if (hv_context.synic_event_page[cpu])
+		free_page((unsigned long)hv_context.synic_event_page[cpu]);
+	if (hv_context.synic_message_page[cpu])
+		free_page((unsigned long)hv_context.synic_message_page[cpu]);
+}
+
+/*
+ * hv_synic_free - release the SynIC resources of every online CPU
+ *
+ * Calls hv_synic_free_cpu() for each CPU reported by for_each_online_cpu().
+ */
+void hv_synic_free(void)
+{
+	int cpu;
+
+	for_each_online_cpu(cpu)
+		hv_synic_free_cpu(cpu);
+}
+
 /*
  * hv_synic_init - Initialize the Synthethic Interrupt Controller.
  *
@@ -289,30 +342,6 @@
 	/* Check the version */
 	rdmsrl(HV_X64_MSR_SVERSION, version);
 
-	hv_context.event_dpc[cpu] = kmalloc(sizeof(struct tasklet_struct),
-					    GFP_ATOMIC);
-	if (hv_context.event_dpc[cpu] == NULL) {
-		pr_err("Unable to allocate event dpc\n");
-		goto cleanup;
-	}
-	tasklet_init(hv_context.event_dpc[cpu], vmbus_on_event, cpu);
-
-	hv_context.synic_message_page[cpu] =
-		(void *)get_zeroed_page(GFP_ATOMIC);
-
-	if (hv_context.synic_message_page[cpu] == NULL) {
-		pr_err("Unable to allocate SYNIC message page\n");
-		goto cleanup;
-	}
-
-	hv_context.synic_event_page[cpu] =
-		(void *)get_zeroed_page(GFP_ATOMIC);
-
-	if (hv_context.synic_event_page[cpu] == NULL) {
-		pr_err("Unable to allocate SYNIC event page\n");
-		goto cleanup;
-	}
-
 	/* Setup the Synic's message page */
 	rdmsrl(HV_X64_MSR_SIMP, simp.as_uint64);
 	simp.simp_enabled = 1;
@@ -355,14 +384,6 @@
 	rdmsrl(HV_X64_MSR_VP_INDEX, vp_index);
 	hv_context.vp_index[cpu] = (u32)vp_index;
 	return;
-
-cleanup:
-	if (hv_context.synic_event_page[cpu])
-		free_page((unsigned long)hv_context.synic_event_page[cpu]);
-
-	if (hv_context.synic_message_page[cpu])
-		free_page((unsigned long)hv_context.synic_message_page[cpu]);
-	return;
 }
 
 /*

diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h
index 12f2f9e..d84918f 100644
--- a/drivers/hv/hyperv_vmbus.h
+++ b/drivers/hv/hyperv_vmbus.h

@@ -527,6 +527,10 @@
 
 extern u16 hv_signal_event(void *con_id);
 
+extern int hv_synic_alloc(void);
+
+extern void hv_synic_free(void);
+
 extern void hv_synic_init(void *irqarg);
 
 extern void hv_synic_cleanup(void *arg);

diff --git a/drivers/hv/ring_buffer.c b/drivers/hv/ring_buffer.c
index d6fbb577..26c93cf 100644
--- a/drivers/hv/ring_buffer.c
+++ b/drivers/hv/ring_buffer.c

@@ -32,7 +32,7 @@
 void hv_begin_read(struct hv_ring_buffer_info *rbi)
 {
 	rbi->ring_buffer->interrupt_mask = 1;
-	smp_mb();
+	mb();
 }
 
 u32 hv_end_read(struct hv_ring_buffer_info *rbi)
@@ -41,7 +41,7 @@
 	u32 write;
 
 	rbi->ring_buffer->interrupt_mask = 0;
-	smp_mb();
+	mb();
 
 	/*
 	 * Now check to see if the ring buffer is still empty.
@@ -71,10 +71,12 @@
 
 static bool hv_need_to_signal(u32 old_write, struct hv_ring_buffer_info *rbi)
 {
-	smp_mb();
+	mb();
 	if (rbi->ring_buffer->interrupt_mask)
 		return false;
 
+	/* check interrupt_mask before read_index */
+	rmb();
 	/*
 	 * This is the only case we need to signal when the
 	 * ring transitions from being empty to non-empty.
@@ -442,7 +444,7 @@
 					     sizeof(u64));
 
 	/* Issue a full memory barrier before updating the write index */
-	smp_mb();
+	mb();
 
 	/* Now, update the write location */
 	hv_set_next_write_location(outring_info, next_write_location);
@@ -549,7 +551,7 @@
 	/* Make sure all reads are done before we update the read index since */
 	/* the writer may start writing to the read area once the read index */
 	/*is updated */
-	smp_mb();
+	mb();
 
 	/* Update the read index */
 	hv_set_next_read_location(inring_info, next_read_location);

diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
index bf421e0..a2464bf0 100644
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c

@@ -434,7 +434,7 @@
 		 * will not deliver any more messages since there is
 		 * no empty slot
 		 */
-		smp_mb();
+		mb();
 
 		if (msg->header.message_flags.msg_pending) {
 			/*
@@ -563,6 +563,9 @@
 	 */
 	hv_register_vmbus_handler(irq, vmbus_isr);
 
+	ret = hv_synic_alloc();
+	if (ret)
+		goto err_alloc;
 	/*
 	 * Initialize the per-cpu interrupt state and
 	 * connect to the host.
@@ -570,13 +573,14 @@
 	on_each_cpu(hv_synic_init, NULL, 1);
 	ret = vmbus_connect();
 	if (ret)
-		goto err_irq;
+		goto err_alloc;
 
 	vmbus_request_offers();
 
 	return 0;
 
-err_irq:
+err_alloc:
+	hv_synic_free();
 	free_irq(irq, hv_acpi_dev);
 
 err_unregister:

diff --git a/drivers/iio/Kconfig b/drivers/iio/Kconfig
index b2f963be..9af763a 100644
--- a/drivers/iio/Kconfig
+++ b/drivers/iio/Kconfig

@@ -70,5 +70,9 @@
 source "drivers/iio/imu/Kconfig"
 source "drivers/iio/light/Kconfig"
 source "drivers/iio/magnetometer/Kconfig"
+if IIO_TRIGGER
+   source "drivers/iio/trigger/Kconfig"
+endif #IIO_TRIGGER
+source "drivers/iio/pressure/Kconfig"
 
 endif # IIO

diff --git a/drivers/iio/Makefile b/drivers/iio/Makefile
index a0e8cdd..7a3866c 100644
--- a/drivers/iio/Makefile
+++ b/drivers/iio/Makefile

@@ -21,3 +21,5 @@
 obj-y += imu/
 obj-y += light/
 obj-y += magnetometer/
+obj-y += trigger/
+obj-y += pressure/

diff --git a/drivers/iio/accel/Kconfig b/drivers/iio/accel/Kconfig
index bb59496..719d83f 100644
--- a/drivers/iio/accel/Kconfig
+++ b/drivers/iio/accel/Kconfig

@@ -28,7 +28,6 @@
 	select IIO_ST_ACCEL_I2C_3AXIS if (I2C)
 	select IIO_ST_ACCEL_SPI_3AXIS if (SPI_MASTER)
 	select IIO_TRIGGERED_BUFFER if (IIO_BUFFER)
-	select IIO_ST_ACCEL_BUFFER if (IIO_TRIGGERED_BUFFER)
 	help
 	  Say yes here to build support for STMicroelectronics accelerometers:
 	  LSM303DLH, LSM303DLHC, LIS3DH, LSM330D, LSM330DL, LSM330DLC,

diff --git a/drivers/iio/accel/st_accel_core.c b/drivers/iio/accel/st_accel_core.c
index e0f5a3c..4aec1212 100644
--- a/drivers/iio/accel/st_accel_core.c
+++ b/drivers/iio/accel/st_accel_core.c

@@ -26,6 +26,8 @@
 #include <linux/iio/common/st_sensors.h>
 #include "st_accel.h"
 
+#define ST_ACCEL_NUMBER_DATA_CHANNELS		3
+
 /* DEFAULT VALUE FOR SENSORS */
 #define ST_ACCEL_DEFAULT_OUT_X_L_ADDR		0x28
 #define ST_ACCEL_DEFAULT_OUT_Y_L_ADDR		0x2a
@@ -125,22 +127,34 @@
 #define ST_ACCEL_3_MULTIREAD_BIT		false
 
 static const struct iio_chan_spec st_accel_12bit_channels[] = {
-	ST_SENSORS_LSM_CHANNELS(IIO_ACCEL, ST_SENSORS_SCAN_X, IIO_MOD_X, IIO_LE,
-		ST_SENSORS_DEFAULT_12_REALBITS, ST_ACCEL_DEFAULT_OUT_X_L_ADDR),
-	ST_SENSORS_LSM_CHANNELS(IIO_ACCEL, ST_SENSORS_SCAN_Y, IIO_MOD_Y, IIO_LE,
-		ST_SENSORS_DEFAULT_12_REALBITS, ST_ACCEL_DEFAULT_OUT_Y_L_ADDR),
-	ST_SENSORS_LSM_CHANNELS(IIO_ACCEL, ST_SENSORS_SCAN_Z, IIO_MOD_Z, IIO_LE,
-		ST_SENSORS_DEFAULT_12_REALBITS, ST_ACCEL_DEFAULT_OUT_Z_L_ADDR),
+	ST_SENSORS_LSM_CHANNELS(IIO_ACCEL,
+			BIT(IIO_CHAN_INFO_RAW) | BIT(IIO_CHAN_INFO_SCALE),
+			ST_SENSORS_SCAN_X, 1, IIO_MOD_X, 's', IIO_LE, 12, 16,
+			ST_ACCEL_DEFAULT_OUT_X_L_ADDR),
+	ST_SENSORS_LSM_CHANNELS(IIO_ACCEL,
+			BIT(IIO_CHAN_INFO_RAW) | BIT(IIO_CHAN_INFO_SCALE),
+			ST_SENSORS_SCAN_Y, 1, IIO_MOD_Y, 's', IIO_LE, 12, 16,
+			ST_ACCEL_DEFAULT_OUT_Y_L_ADDR),
+	ST_SENSORS_LSM_CHANNELS(IIO_ACCEL,
+			BIT(IIO_CHAN_INFO_RAW) | BIT(IIO_CHAN_INFO_SCALE),
+			ST_SENSORS_SCAN_Z, 1, IIO_MOD_Z, 's', IIO_LE, 12, 16,
+			ST_ACCEL_DEFAULT_OUT_Z_L_ADDR),
 	IIO_CHAN_SOFT_TIMESTAMP(3)
 };
 
 static const struct iio_chan_spec st_accel_16bit_channels[] = {
-	ST_SENSORS_LSM_CHANNELS(IIO_ACCEL, ST_SENSORS_SCAN_X, IIO_MOD_X, IIO_LE,
-		ST_SENSORS_DEFAULT_16_REALBITS, ST_ACCEL_DEFAULT_OUT_X_L_ADDR),
-	ST_SENSORS_LSM_CHANNELS(IIO_ACCEL, ST_SENSORS_SCAN_Y, IIO_MOD_Y, IIO_LE,
-		ST_SENSORS_DEFAULT_16_REALBITS, ST_ACCEL_DEFAULT_OUT_Y_L_ADDR),
-	ST_SENSORS_LSM_CHANNELS(IIO_ACCEL, ST_SENSORS_SCAN_Z, IIO_MOD_Z, IIO_LE,
-		ST_SENSORS_DEFAULT_16_REALBITS, ST_ACCEL_DEFAULT_OUT_Z_L_ADDR),
+	ST_SENSORS_LSM_CHANNELS(IIO_ACCEL,
+			BIT(IIO_CHAN_INFO_RAW) | BIT(IIO_CHAN_INFO_SCALE),
+			ST_SENSORS_SCAN_X, 1, IIO_MOD_X, 's', IIO_LE, 16, 16,
+			ST_ACCEL_DEFAULT_OUT_X_L_ADDR),
+	ST_SENSORS_LSM_CHANNELS(IIO_ACCEL,
+			BIT(IIO_CHAN_INFO_RAW) | BIT(IIO_CHAN_INFO_SCALE),
+			ST_SENSORS_SCAN_Y, 1, IIO_MOD_Y, 's', IIO_LE, 16, 16,
+			ST_ACCEL_DEFAULT_OUT_Y_L_ADDR),
+	ST_SENSORS_LSM_CHANNELS(IIO_ACCEL,
+			BIT(IIO_CHAN_INFO_RAW) | BIT(IIO_CHAN_INFO_SCALE),
+			ST_SENSORS_SCAN_Z, 1, IIO_MOD_Z, 's', IIO_LE, 16, 16,
+			ST_ACCEL_DEFAULT_OUT_Z_L_ADDR),
 	IIO_CHAN_SOFT_TIMESTAMP(3)
 };
 
@@ -442,6 +456,7 @@
 	if (err < 0)
 		goto st_accel_common_probe_error;
 
+	adata->num_data_channels = ST_ACCEL_NUMBER_DATA_CHANNELS;
 	adata->multiread_bit = adata->sensor->multi_read_bit;
 	indio_dev->channels = adata->sensor->ch;
 	indio_dev->num_channels = ST_SENSORS_NUMBER_ALL_CHANNELS;

diff --git a/drivers/iio/adc/Kconfig b/drivers/iio/adc/Kconfig
index ab0767e6..93129ec 100644
--- a/drivers/iio/adc/Kconfig
+++ b/drivers/iio/adc/Kconfig

@@ -133,6 +133,16 @@
 	  max11646, max11647) Provides direct access via sysfs and buffered
 	  data via the iio dev interface.
 
+config MCP320X
+	tristate "Microchip Technology MCP3204/08"
+	depends on SPI
+	help
+	  Say yes here to build support for Microchip Technology's MCP3204 or
+	  MCP3208 analog to digital converter.
+
+	  This driver can also be built as a module. If so, the module will be
+	  called mcp320x.
+
 config TI_ADC081C
 	tristate "Texas Instruments ADC081C021/027"
 	depends on I2C

diff --git a/drivers/iio/adc/Makefile b/drivers/iio/adc/Makefile
index 0a825be..8f475d31 100644
--- a/drivers/iio/adc/Makefile
+++ b/drivers/iio/adc/Makefile

@@ -14,6 +14,7 @@
 obj-$(CONFIG_EXYNOS_ADC) += exynos_adc.o
 obj-$(CONFIG_LP8788_ADC) += lp8788_adc.o
 obj-$(CONFIG_MAX1363) += max1363.o
+obj-$(CONFIG_MCP320X) += mcp320x.o
 obj-$(CONFIG_TI_ADC081C) += ti-adc081c.o
 obj-$(CONFIG_TI_AM335X_ADC) += ti_am335x_adc.o
 obj-$(CONFIG_VIPERBOARD_ADC) += viperboard_adc.o

diff --git a/drivers/iio/adc/at91_adc.c b/drivers/iio/adc/at91_adc.c
index e5b88d5..b6db6a0 100644
--- a/drivers/iio/adc/at91_adc.c
+++ b/drivers/iio/adc/at91_adc.c

@@ -774,11 +774,13 @@
 	return 0;
 }
 
+#ifdef CONFIG_OF
 static const struct of_device_id at91_adc_dt_ids[] = {
 	{ .compatible = "atmel,at91sam9260-adc" },
 	{},
 };
 MODULE_DEVICE_TABLE(of, at91_adc_dt_ids);
+#endif
 
 static struct platform_driver at91_adc_driver = {
 	.probe = at91_adc_probe,

diff --git a/drivers/iio/adc/exynos_adc.c b/drivers/iio/adc/exynos_adc.c
index b3d03d3..9809fc9 100644
--- a/drivers/iio/adc/exynos_adc.c
+++ b/drivers/iio/adc/exynos_adc.c

@@ -270,16 +270,16 @@
 	info = iio_priv(indio_dev);
 
 	mem = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	info->regs = devm_request_and_ioremap(&pdev->dev, mem);
-	if (!info->regs) {
-		ret = -ENOMEM;
+	info->regs = devm_ioremap_resource(&pdev->dev, mem);
+	if (IS_ERR(info->regs)) {
+		ret = PTR_ERR(info->regs);
 		goto err_iio;
 	}
 
 	mem = platform_get_resource(pdev, IORESOURCE_MEM, 1);
-	info->enable_reg = devm_request_and_ioremap(&pdev->dev, mem);
-	if (!info->enable_reg) {
-		ret = -ENOMEM;
+	info->enable_reg = devm_ioremap_resource(&pdev->dev, mem);
+	if (IS_ERR(info->enable_reg)) {
+		ret = PTR_ERR(info->enable_reg);
 		goto err_iio;
 	}
 

diff --git a/drivers/iio/adc/max1363.c b/drivers/iio/adc/max1363.c
index 9e6da72..f148d00 100644
--- a/drivers/iio/adc/max1363.c
+++ b/drivers/iio/adc/max1363.c

@@ -660,7 +660,7 @@
 	unsigned long val;
 	bool found = false;
 
-	ret = strict_strtoul(buf, 10, &val);
+	ret = kstrtoul(buf, 10, &val);
 	if (ret)
 		return -EINVAL;
 	for (i = 0; i < ARRAY_SIZE(max1363_monitor_speeds); i++)

diff --git a/drivers/iio/adc/mcp320x.c b/drivers/iio/adc/mcp320x.c
new file mode 100644
index 0000000..ebc0159
--- /dev/null
+++ b/drivers/iio/adc/mcp320x.c

@@ -0,0 +1,257 @@
+/*
+ * Copyright (C) 2013 Oskar Andero <oskar.andero@gmail.com>
+ *
+ * Driver for Microchip Technology's MCP3204 and MCP3208 ADC chips.
+ * Datasheet can be found here:
+ * http://ww1.microchip.com/downloads/en/devicedoc/21298c.pdf
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/err.h>
+#include <linux/spi/spi.h>
+#include <linux/module.h>
+#include <linux/iio/iio.h>
+#include <linux/regulator/consumer.h>
+
+#define MCP_SINGLE_ENDED	(1 << 3)
+#define MCP_START_BIT		(1 << 4)
+
+enum {
+	mcp3204,
+	mcp3208,
+};
+
+/* Per-device driver state, stored as the private data of the iio_dev. */
+struct mcp320x {
+	struct spi_device *spi;		/* SPI device used for conversions */
+	struct spi_message msg;		/* message built once in probe */
+	struct spi_transfer transfer[2];	/* [0] sends tx_buf, [1] fills rx_buf */
+
+	u8 tx_buf;			/* command byte for the next conversion */
+	u8 rx_buf[2];			/* raw bytes read back from the chip */
+
+	struct regulator *reg;		/* "vref" supply, used for the scale */
+	struct mutex lock;		/* serialises mcp320x_read_raw() */
+};
+
+/*
+ * Run one conversion: send command byte @msg and read two bytes back
+ * using the SPI message prepared in probe.
+ *
+ * Returns the negative error from spi_sync() on failure, otherwise the
+ * value assembled from the low six bits of rx_buf[0] (as the upper part)
+ * and the upper six bits of rx_buf[1] (as the lower part).
+ */
+static int mcp320x_adc_conversion(struct mcp320x *adc, u8 msg)
+{
+	int ret;
+
+	adc->tx_buf = msg;
+	ret = spi_sync(adc->spi, &adc->msg);
+	if (ret < 0)
+		return ret;
+
+	return ((adc->rx_buf[0] & 0x3f) << 6)  |
+		(adc->rx_buf[1] >> 2);
+}
+
+/*
+ * IIO read_raw callback.
+ *
+ * IIO_CHAN_INFO_RAW:   performs a conversion on the channel and returns
+ *                      the result in *val (IIO_VAL_INT).
+ * IIO_CHAN_INFO_SCALE: returns the "vref" regulator voltage divided by
+ *                      1000 in *val and 12 in *val2
+ *                      (IIO_VAL_FRACTIONAL_LOG2, i.e. val / 2^12).
+ * Any other mask, or a failing conversion/regulator query, yields a
+ * negative error code (-EINVAL for an unknown mask).
+ *
+ * The whole call runs under adc->lock.
+ */
+static int mcp320x_read_raw(struct iio_dev *indio_dev,
+			    struct iio_chan_spec const *channel, int *val,
+			    int *val2, long mask)
+{
+	struct mcp320x *adc = iio_priv(indio_dev);
+	int ret = -EINVAL;
+
+	mutex_lock(&adc->lock);
+
+	switch (mask) {
+	case IIO_CHAN_INFO_RAW:
+		/* Differential reads omit the single-ended bit. */
+		if (channel->differential)
+			ret = mcp320x_adc_conversion(adc,
+				MCP_START_BIT | channel->address);
+		else
+			ret = mcp320x_adc_conversion(adc,
+				MCP_START_BIT | MCP_SINGLE_ENDED |
+				channel->address);
+		if (ret < 0)
+			goto out;
+
+		*val = ret;
+		ret = IIO_VAL_INT;
+		break;
+
+	case IIO_CHAN_INFO_SCALE:
+		/* Digital output code = (4096 * Vin) / Vref */
+		ret = regulator_get_voltage(adc->reg);
+		if (ret < 0)
+			goto out;
+
+		/* presumably microvolts -> millivolts; confirm regulator API */
+		*val = ret / 1000;
+		*val2 = 12;
+		ret = IIO_VAL_FRACTIONAL_LOG2;
+		break;
+
+	default:
+		break;
+	}
+
+out:
+	mutex_unlock(&adc->lock);
+
+	return ret;
+}
+
+/*
+ * Single-ended voltage channel: channel number and address are both @num;
+ * raw is per channel, scale is shared by all voltage channels.
+ */
+#define MCP320X_VOLTAGE_CHANNEL(num)				\
+	{							\
+		.type = IIO_VOLTAGE,				\
+		.indexed = 1,					\
+		.channel = (num),				\
+		.address = (num),				\
+		.info_mask_separate = BIT(IIO_CHAN_INFO_RAW),	\
+		.info_mask_shared_by_type = BIT(IIO_CHAN_INFO_SCALE) \
+	}
+
+/*
+ * Differential voltage channel for pair @num: inputs 2*num and 2*num+1,
+ * addressed as 2*num. @num is expanded unparenthesised, so pass a plain
+ * constant.
+ */
+#define MCP320X_VOLTAGE_CHANNEL_DIFF(num)			\
+	{							\
+		.type = IIO_VOLTAGE,				\
+		.indexed = 1,					\
+		.channel = (num * 2),				\
+		.channel2 = (num * 2 + 1),			\
+		.address = (num * 2),				\
+		.differential = 1,				\
+		.info_mask_separate = BIT(IIO_CHAN_INFO_RAW),	\
+		.info_mask_shared_by_type = BIT(IIO_CHAN_INFO_SCALE) \
+	}
+
+static const struct iio_chan_spec mcp3204_channels[] = {
+	MCP320X_VOLTAGE_CHANNEL(0),
+	MCP320X_VOLTAGE_CHANNEL(1),
+	MCP320X_VOLTAGE_CHANNEL(2),
+	MCP320X_VOLTAGE_CHANNEL(3),
+	MCP320X_VOLTAGE_CHANNEL_DIFF(0),
+	MCP320X_VOLTAGE_CHANNEL_DIFF(1),
+};
+
+static const struct iio_chan_spec mcp3208_channels[] = {
+	MCP320X_VOLTAGE_CHANNEL(0),
+	MCP320X_VOLTAGE_CHANNEL(1),
+	MCP320X_VOLTAGE_CHANNEL(2),
+	MCP320X_VOLTAGE_CHANNEL(3),
+	MCP320X_VOLTAGE_CHANNEL(4),
+	MCP320X_VOLTAGE_CHANNEL(5),
+	MCP320X_VOLTAGE_CHANNEL(6),
+	MCP320X_VOLTAGE_CHANNEL(7),
+	MCP320X_VOLTAGE_CHANNEL_DIFF(0),
+	MCP320X_VOLTAGE_CHANNEL_DIFF(1),
+	MCP320X_VOLTAGE_CHANNEL_DIFF(2),
+	MCP320X_VOLTAGE_CHANNEL_DIFF(3),
+};
+
+static const struct iio_info mcp320x_info = {
+	.read_raw = mcp320x_read_raw,
+	.driver_module = THIS_MODULE,
+};
+
+struct mcp3208_chip_info {
+	const struct iio_chan_spec *channels;
+	unsigned int num_channels;
+};
+
+static const struct mcp3208_chip_info mcp3208_chip_infos[] = {
+	[mcp3204] = {
+		.channels = mcp3204_channels,
+		.num_channels = ARRAY_SIZE(mcp3204_channels)
+	},
+	[mcp3208] = {
+		.channels = mcp3208_channels,
+		.num_channels = ARRAY_SIZE(mcp3208_channels)
+	},
+};
+
+/*
+ * mcp320x_probe - bind the driver to an SPI device
+ * @spi: SPI device matched through mcp320x_id
+ *
+ * Allocates the iio_dev, selects the channel table from the matched
+ * device id, prepares the two-transfer SPI message (one command byte out,
+ * two bytes in), acquires and enables the "vref" regulator and registers
+ * the IIO device.
+ *
+ * Returns 0 on success or a negative error code; on failure everything
+ * acquired so far is released in reverse order.
+ */
+static int mcp320x_probe(struct spi_device *spi)
+{
+	struct iio_dev *indio_dev;
+	struct mcp320x *adc;
+	const struct mcp3208_chip_info *chip_info;
+	int ret;
+
+	indio_dev = iio_device_alloc(sizeof(*adc));
+	if (!indio_dev)
+		return -ENOMEM;
+
+	adc = iio_priv(indio_dev);
+	adc->spi = spi;
+
+	/* mcp320x_remove() finds the iio_dev through the SPI drvdata. */
+	spi_set_drvdata(spi, indio_dev);
+
+	indio_dev->dev.parent = &spi->dev;
+	indio_dev->name = spi_get_device_id(spi)->name;
+	indio_dev->modes = INDIO_DIRECT_MODE;
+	indio_dev->info = &mcp320x_info;
+
+	chip_info = &mcp3208_chip_infos[spi_get_device_id(spi)->driver_data];
+	indio_dev->channels = chip_info->channels;
+	indio_dev->num_channels = chip_info->num_channels;
+
+	adc->transfer[0].tx_buf = &adc->tx_buf;
+	adc->transfer[0].len = sizeof(adc->tx_buf);
+	adc->transfer[1].rx_buf = adc->rx_buf;
+	adc->transfer[1].len = sizeof(adc->rx_buf);
+
+	spi_message_init_with_transfers(&adc->msg, adc->transfer,
+					ARRAY_SIZE(adc->transfer));
+
+	adc->reg = regulator_get(&spi->dev, "vref");
+	if (IS_ERR(adc->reg)) {
+		ret = PTR_ERR(adc->reg);
+		goto iio_free;
+	}
+
+	ret = regulator_enable(adc->reg);
+	if (ret < 0)
+		goto reg_free;
+
+	/* The lock must be usable before read_raw can be reached. */
+	mutex_init(&adc->lock);
+
+	ret = iio_device_register(indio_dev);
+	if (ret < 0)
+		goto reg_disable;
+
+	return 0;
+
+reg_disable:
+	regulator_disable(adc->reg);
+reg_free:
+	regulator_put(adc->reg);
+iio_free:
+	iio_device_free(indio_dev);
+
+	return ret;
+}
+
+/*
+ * mcp320x_remove - unbind the driver from an SPI device
+ *
+ * Looks the iio_dev up through the SPI device's drvdata, unregisters it,
+ * disables and releases the regulator and frees the iio_dev.
+ * Always returns 0.
+ */
+static int mcp320x_remove(struct spi_device *spi)
+{
+	struct iio_dev *indio_dev = spi_get_drvdata(spi);
+	struct mcp320x *adc = iio_priv(indio_dev);
+
+	iio_device_unregister(indio_dev);
+	regulator_disable(adc->reg);
+	regulator_put(adc->reg);
+	iio_device_free(indio_dev);
+
+	return 0;
+}
+
+static const struct spi_device_id mcp320x_id[] = {
+	{ "mcp3204", mcp3204 },
+	{ "mcp3208", mcp3208 },
+	{ }
+};
+MODULE_DEVICE_TABLE(spi, mcp320x_id);
+
+static struct spi_driver mcp320x_driver = {
+	.driver = {
+		.name = "mcp320x",
+		.owner = THIS_MODULE,
+	},
+	.probe = mcp320x_probe,
+	.remove = mcp320x_remove,
+	.id_table = mcp320x_id,
+};
+module_spi_driver(mcp320x_driver);
+
+MODULE_AUTHOR("Oskar Andero <oskar.andero@gmail.com>");
+MODULE_DESCRIPTION("Microchip Technology MCP3204/08");
+MODULE_LICENSE("GPL v2");

diff --git a/drivers/iio/common/st_sensors/st_sensors_buffer.c b/drivers/iio/common/st_sensors/st_sensors_buffer.c
index 09b236d..71a2c5f 100644
--- a/drivers/iio/common/st_sensors/st_sensors_buffer.c
+++ b/drivers/iio/common/st_sensors/st_sensors_buffer.c

@@ -24,11 +24,20 @@
 
 int st_sensors_get_buffer_element(struct iio_dev *indio_dev, u8 *buf)
 {
+	u8 *addr;
 	int i, n = 0, len;
-	u8 addr[ST_SENSORS_NUMBER_DATA_CHANNELS];
 	struct st_sensor_data *sdata = iio_priv(indio_dev);
+	unsigned int num_data_channels = sdata->num_data_channels;
+	unsigned int byte_for_channel =
+			indio_dev->channels[0].scan_type.storagebits >> 3;
 
-	for (i = 0; i < ST_SENSORS_NUMBER_DATA_CHANNELS; i++) {
+	addr = kmalloc(num_data_channels, GFP_KERNEL);
+	if (!addr) {
+		len = -ENOMEM;
+		goto st_sensors_get_buffer_element_error;
+	}
+
+	for (i = 0; i < num_data_channels; i++) {
 		if (test_bit(i, indio_dev->active_scan_mask)) {
 			addr[n] = indio_dev->channels[i].address;
 			n++;
@@ -37,52 +46,58 @@
 	switch (n) {
 	case 1:
 		len = sdata->tf->read_multiple_byte(&sdata->tb, sdata->dev,
-			addr[0], ST_SENSORS_BYTE_FOR_CHANNEL, buf,
-			sdata->multiread_bit);
+			addr[0], byte_for_channel, buf, sdata->multiread_bit);
 		break;
 	case 2:
-		if ((addr[1] - addr[0]) == ST_SENSORS_BYTE_FOR_CHANNEL) {
+		if ((addr[1] - addr[0]) == byte_for_channel) {
 			len = sdata->tf->read_multiple_byte(&sdata->tb,
-					sdata->dev, addr[0],
-					ST_SENSORS_BYTE_FOR_CHANNEL*n,
-					buf, sdata->multiread_bit);
+				sdata->dev, addr[0], byte_for_channel * n,
+				buf, sdata->multiread_bit);
 		} else {
-			u8 rx_array[ST_SENSORS_BYTE_FOR_CHANNEL*
-				    ST_SENSORS_NUMBER_DATA_CHANNELS];
+			u8 *rx_array;
+			rx_array = kmalloc(byte_for_channel * num_data_channels,
+					   GFP_KERNEL);
+			if (!rx_array) {
+				len = -ENOMEM;
+				goto st_sensors_free_memory;
+			}
+
 			len = sdata->tf->read_multiple_byte(&sdata->tb,
 				sdata->dev, addr[0],
-				ST_SENSORS_BYTE_FOR_CHANNEL*
-				ST_SENSORS_NUMBER_DATA_CHANNELS,
+				byte_for_channel * num_data_channels,
 				rx_array, sdata->multiread_bit);
-			if (len < 0)
-				goto read_data_channels_error;
+			if (len < 0) {
+				kfree(rx_array);
+				goto st_sensors_free_memory;
+			}
 
-			for (i = 0; i < n * ST_SENSORS_NUMBER_DATA_CHANNELS;
-									i++) {
+			for (i = 0; i < n * num_data_channels; i++) {
 				if (i < n)
 					buf[i] = rx_array[i];
 				else
 					buf[i] = rx_array[n + i];
 			}
-			len = ST_SENSORS_BYTE_FOR_CHANNEL*n;
+			kfree(rx_array);
+			len = byte_for_channel * n;
 		}
 		break;
 	case 3:
 		len = sdata->tf->read_multiple_byte(&sdata->tb, sdata->dev,
-			addr[0], ST_SENSORS_BYTE_FOR_CHANNEL*
-			ST_SENSORS_NUMBER_DATA_CHANNELS,
+			addr[0], byte_for_channel * num_data_channels,
 			buf, sdata->multiread_bit);
 		break;
 	default:
 		len = -EINVAL;
-		goto read_data_channels_error;
+		goto st_sensors_free_memory;
 	}
-	if (len != ST_SENSORS_BYTE_FOR_CHANNEL*n) {
+	if (len != byte_for_channel * n) {
 		len = -EIO;
-		goto read_data_channels_error;
+		goto st_sensors_free_memory;
 	}
 
-read_data_channels_error:
+st_sensors_free_memory:
+	kfree(addr);
+st_sensors_get_buffer_element_error:
 	return len;
 }
 EXPORT_SYMBOL(st_sensors_get_buffer_element);

diff --git a/drivers/iio/common/st_sensors/st_sensors_core.c b/drivers/iio/common/st_sensors/st_sensors_core.c
index ed9bc8a..865b178 100644
--- a/drivers/iio/common/st_sensors/st_sensors_core.c
+++ b/drivers/iio/common/st_sensors/st_sensors_core.c

@@ -20,6 +20,11 @@
 
 #define ST_SENSORS_WAI_ADDRESS		0x0f
 
+/*
+ * Read three bytes at @p as a little-endian 24-bit value and sign-extend
+ * it to 32 bits: the value is shifted up by 8 so bit 23 lands in the sign
+ * bit, then shifted back down as an s32. The result is returned as u32;
+ * callers cast it back to s32. Relies on >> of a negative s32 being an
+ * arithmetic shift.
+ */
+static inline u32 st_sensors_get_unaligned_le24(const u8 *p)
+{
+	return ((s32)((p[0] | p[1] << 8 | p[2] << 16) << 8) >> 8);
+}
+
 static int st_sensors_write_data_with_mask(struct iio_dev *indio_dev,
 						u8 reg_addr, u8 mask, u8 data)
 {
@@ -112,7 +117,8 @@
 	return ret;
 }
 
-static int st_sensors_set_fullscale(struct iio_dev *indio_dev, unsigned int fs)
+static int st_sensors_set_fullscale(struct iio_dev *indio_dev,
+							unsigned int fs)
 {
 	int err, i = 0;
 	struct st_sensor_data *sdata = iio_priv(indio_dev);
@@ -273,21 +279,33 @@
 EXPORT_SYMBOL(st_sensors_set_fullscale_by_gain);
 
 static int st_sensors_read_axis_data(struct iio_dev *indio_dev,
-							u8 ch_addr, int *data)
+				struct iio_chan_spec const *ch, int *data)
 {
 	int err;
-	u8 outdata[ST_SENSORS_BYTE_FOR_CHANNEL];
+	u8 *outdata;
 	struct st_sensor_data *sdata = iio_priv(indio_dev);
+	unsigned int byte_for_channel = ch->scan_type.storagebits >> 3;
+
+	outdata = kmalloc(byte_for_channel, GFP_KERNEL);
+	if (!outdata) {
+		err = -EINVAL;
+		goto st_sensors_read_axis_data_error;
+	}
 
 	err = sdata->tf->read_multiple_byte(&sdata->tb, sdata->dev,
-				ch_addr, ST_SENSORS_BYTE_FOR_CHANNEL,
+				ch->address, byte_for_channel,
 				outdata, sdata->multiread_bit);
 	if (err < 0)
-		goto read_error;
+		goto st_sensors_free_memory;
 
-	*data = (s16)get_unaligned_le16(outdata);
+	if (byte_for_channel == 2)
+		*data = (s16)get_unaligned_le16(outdata);
+	else if (byte_for_channel == 3)
+		*data = (s32)st_sensors_get_unaligned_le24(outdata);
 
-read_error:
+st_sensors_free_memory:
+	kfree(outdata);
+st_sensors_read_axis_data_error:
 	return err;
 }
 
@@ -307,7 +325,7 @@
 			goto read_error;
 
 		msleep((sdata->sensor->bootime * 1000) / sdata->odr);
-		err = st_sensors_read_axis_data(indio_dev, ch->address, val);
+		err = st_sensors_read_axis_data(indio_dev, ch, val);
 		if (err < 0)
 			goto read_error;
 

diff --git a/drivers/iio/dac/Kconfig b/drivers/iio/dac/Kconfig
index b61160b..c9c33ce3 100644
--- a/drivers/iio/dac/Kconfig
+++ b/drivers/iio/dac/Kconfig

@@ -130,6 +130,16 @@
 	  To compile this driver as a module, choose M here: the
 	  module will be called ad5686.
 
+config AD7303
+	tristate "Analog Devices AD7303 DAC driver"
+	depends on SPI
+	help
+	  Say yes here to build support for Analog Devices AD7303 Digital to Analog
+	  Converters (DAC).
+
+	  To compile this driver as module choose M here: the module will be called
+	  ad7303.
+
 config MAX517
 	tristate "Maxim MAX517/518/519 DAC driver"
 	depends on I2C

diff --git a/drivers/iio/dac/Makefile b/drivers/iio/dac/Makefile
index 5b528eb..c8d7ab6 100644
--- a/drivers/iio/dac/Makefile
+++ b/drivers/iio/dac/Makefile

@@ -14,5 +14,6 @@
 obj-$(CONFIG_AD5764) += ad5764.o
 obj-$(CONFIG_AD5791) += ad5791.o
 obj-$(CONFIG_AD5686) += ad5686.o
+obj-$(CONFIG_AD7303) += ad7303.o
 obj-$(CONFIG_MAX517) += max517.o
 obj-$(CONFIG_MCP4725) += mcp4725.o

diff --git a/drivers/iio/dac/ad7303.c b/drivers/iio/dac/ad7303.c
new file mode 100644
index 0000000..85aeef6
--- /dev/null
+++ b/drivers/iio/dac/ad7303.c

@@ -0,0 +1,315 @@
+/*
+ * AD7303 Digital to analog converters driver
+ *
+ * Copyright 2013 Analog Devices Inc.
+ *
+ * Licensed under the GPL-2.
+ */
+
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/spi/spi.h>
+#include <linux/slab.h>
+#include <linux/sysfs.h>
+#include <linux/regulator/consumer.h>
+#include <linux/of.h>
+
+#include <linux/iio/iio.h>
+#include <linux/iio/sysfs.h>
+
+#include <linux/platform_data/ad7303.h>
+
+#define AD7303_CFG_EXTERNAL_VREF BIT(15)
+#define AD7303_CFG_POWER_DOWN(ch) BIT(11 + (ch))
+#define AD7303_CFG_ADDR_OFFSET	10
+
+#define AD7303_CMD_UPDATE_DAC	(0x3 << 8)
+
+/**
+ * struct ad7303_state - driver instance specific data
+ * @spi:		the device for this driver instance
+ * @config:		cached config register value
+ * @dac_cache:		current DAC raw value (chip does not support readback)
+ * @vdd_reg:		regulator requested as "Vdd" in probe; half of its
+ *			voltage is used as the reference when the external
+ *			reference is not selected
+ * @vref_reg:		regulator requested as "REF" in probe, only when an
+ *			external reference is configured
+ * @data:		spi transfer buffer
+ */
+
+struct ad7303_state {
+	struct spi_device *spi;
+	uint16_t config;
+	uint8_t dac_cache[2];
+
+	struct regulator *vdd_reg;
+	struct regulator *vref_reg;
+
+	/*
+	 * DMA (thus cache coherency maintenance) requires the
+	 * transfer buffers to live in their own cache lines.
+	 */
+	__be16 data ____cacheline_aligned;
+};
+
+/*
+ * Assemble the command word for @chan (update command, channel address,
+ * cached config bits and the 8-bit value) and send it as one big-endian
+ * 16-bit SPI write.  Returns the spi_write() result.
+ */
+static int ad7303_write(struct ad7303_state *st, unsigned int chan,
+	uint8_t val)
+{
+	unsigned int word = AD7303_CMD_UPDATE_DAC;
+
+	word |= chan << AD7303_CFG_ADDR_OFFSET;
+	word |= st->config;
+	word |= val;
+
+	st->data = cpu_to_be16(word);
+
+	return spi_write(st->spi, &st->data, sizeof(st->data));
+}
+
+/*
+ * Show the cached power-down bit of the channel as "0\n" or "1\n".
+ * Nothing is read from the chip; the value comes from st->config.
+ */
+static ssize_t ad7303_read_dac_powerdown(struct iio_dev *indio_dev,
+	uintptr_t private, const struct iio_chan_spec *chan, char *buf)
+{
+	struct ad7303_state *st = iio_priv(indio_dev);
+	bool powered_down;
+
+	powered_down = st->config & AD7303_CFG_POWER_DOWN(chan->channel);
+
+	return sprintf(buf, "%d\n", powered_down);
+}
+
+/*
+ * Store handler for the per-channel "powerdown" attribute.
+ *
+ * Parses a boolean from @buf, updates the power-down bit of the channel in
+ * the cached config and pushes it to the chip by rewriting the channel's
+ * cached DAC value.
+ *
+ * Returns @len on success, the strtobool() error for unparsable input, or
+ * the error from the SPI write.  The cached config keeps the requested
+ * state even when the write fails, so a later successful write carries it.
+ */
+static ssize_t ad7303_write_dac_powerdown(struct iio_dev *indio_dev,
+	 uintptr_t private, const struct iio_chan_spec *chan, const char *buf,
+	 size_t len)
+{
+	struct ad7303_state *st = iio_priv(indio_dev);
+	bool pwr_down;
+	int ret;
+
+	ret = strtobool(buf, &pwr_down);
+	if (ret)
+		return ret;
+
+	mutex_lock(&indio_dev->mlock);
+
+	if (pwr_down)
+		st->config |= AD7303_CFG_POWER_DOWN(chan->channel);
+	else
+		st->config &= ~AD7303_CFG_POWER_DOWN(chan->channel);
+
+	/* There is no noop cmd which allows us to only update the powerdown
+	 * mode, so just write one of the DAC channels again */
+	ret = ad7303_write(st, chan->channel, st->dac_cache[chan->channel]);
+
+	mutex_unlock(&indio_dev->mlock);
+	return ret ? ret : len;
+}
+
+/*
+ * Return the reference voltage as reported by the regulator framework:
+ * the "REF" regulator when the external reference bit is set in the cached
+ * config, otherwise half of the "Vdd" regulator voltage.  Negative
+ * regulator_get_voltage() results are passed through unchanged.
+ */
+static int ad7303_get_vref(struct ad7303_state *st,
+	struct iio_chan_spec const *chan)
+{
+	int vdd;
+
+	if (!(st->config & AD7303_CFG_EXTERNAL_VREF)) {
+		vdd = regulator_get_voltage(st->vdd_reg);
+		return vdd < 0 ? vdd : vdd / 2;
+	}
+
+	return regulator_get_voltage(st->vref_reg);
+}
+
+/*
+ * IIO read_raw callback.
+ *
+ * RAW returns the cached DAC code of the channel.  SCALE returns
+ * 2 * vref / 1000 as the numerator and the channel's realbits as the log2
+ * denominator.  Any other @info yields -EINVAL.
+ */
+static int ad7303_read_raw(struct iio_dev *indio_dev,
+	struct iio_chan_spec const *chan, int *val, int *val2, long info)
+{
+	struct ad7303_state *st = iio_priv(indio_dev);
+	int vref_uv;
+
+	if (info == IIO_CHAN_INFO_RAW) {
+		*val = st->dac_cache[chan->channel];
+		return IIO_VAL_INT;
+	}
+
+	if (info != IIO_CHAN_INFO_SCALE)
+		return -EINVAL;
+
+	vref_uv = ad7303_get_vref(st, chan);
+	if (vref_uv < 0)
+		return vref_uv;
+
+	*val = 2 * vref_uv / 1000;
+	*val2 = chan->scan_type.realbits;
+
+	return IIO_VAL_FRACTIONAL_LOG2;
+}
+
+/*
+ * IIO write_raw callback.
+ *
+ * Only RAW is supported.  The value must fit in the channel's realbits;
+ * it is written to the chip under mlock and cached only when the SPI
+ * write succeeds.  Returns 0, -EINVAL, or the SPI error.
+ */
+static int ad7303_write_raw(struct iio_dev *indio_dev,
+	struct iio_chan_spec const *chan, int val, int val2, long mask)
+{
+	struct ad7303_state *st = iio_priv(indio_dev);
+	int ret;
+
+	if (mask != IIO_CHAN_INFO_RAW)
+		return -EINVAL;
+
+	if (val >= (1 << chan->scan_type.realbits) || val < 0)
+		return -EINVAL;
+
+	mutex_lock(&indio_dev->mlock);
+	ret = ad7303_write(st, chan->address, val);
+	if (!ret)
+		st->dac_cache[chan->channel] = val;
+	mutex_unlock(&indio_dev->mlock);
+
+	return ret;
+}
+
+/* Callbacks handed to the IIO core for this driver. */
+static const struct iio_info ad7303_info = {
+	.driver_module = THIS_MODULE,
+	.write_raw = ad7303_write_raw,
+	.read_raw = ad7303_read_raw,
+};
+
+/*
+ * Extra per-channel attribute "powerdown"; the trailing empty entry is
+ * presumably the list terminator expected by the IIO core.
+ */
+static const struct iio_chan_spec_ext_info ad7303_ext_info[] = {
+	{
+		.write = ad7303_write_dac_powerdown,
+		.read = ad7303_read_dac_powerdown,
+		.name = "powerdown",
+	},
+	{ },
+};
+
+/*
+ * Describe one indexed voltage output channel.
+ *
+ * realbits, storagebits and shift are bit counts and must be plain
+ * integers: ad7303_write_raw() range-checks against (1 << realbits) and
+ * ad7303_read_raw() reports realbits as the log2 scale denominator.  Only
+ * .sign is a character.
+ */
+#define AD7303_CHANNEL(chan) {					\
+	.type = IIO_VOLTAGE,					\
+	.indexed = 1,						\
+	.output = 1,						\
+	.channel = (chan),					\
+	.info_mask_separate = BIT(IIO_CHAN_INFO_RAW),		\
+	.info_mask_shared_by_type = BIT(IIO_CHAN_INFO_SCALE),	\
+	.address = (chan),					\
+	.scan_type = {						\
+		.sign = 'u',					\
+		.realbits = 8,					\
+		.storagebits = 8,				\
+		.shift = 0,					\
+	},							\
+	.ext_info = ad7303_ext_info,				\
+}
+
+static const struct iio_chan_spec ad7303_channels[] = {
+	AD7303_CHANNEL(0),
+	AD7303_CHANNEL(1),
+};
+
+/*
+ * Probe: allocate the IIO device, get and enable the "Vdd" regulator,
+ * optionally get and enable the external "REF" regulator, then register
+ * the IIO device.  Every failure unwinds what was acquired so far and
+ * returns a negative errno.
+ */
+static int ad7303_probe(struct spi_device *spi)
+{
+	const struct spi_device_id *id = spi_get_device_id(spi);
+	struct iio_dev *indio_dev;
+	struct ad7303_state *st;
+	bool ext_ref;
+	int ret;
+
+	indio_dev = iio_device_alloc(sizeof(*st));
+	if (indio_dev == NULL)
+		return -ENOMEM;
+
+	st = iio_priv(indio_dev);
+	spi_set_drvdata(spi, indio_dev);
+
+	st->spi = spi;
+
+	st->vdd_reg = regulator_get(&spi->dev, "Vdd");
+	if (IS_ERR(st->vdd_reg)) {
+		ret = PTR_ERR(st->vdd_reg);
+		goto err_free;
+	}
+
+	ret = regulator_enable(st->vdd_reg);
+	if (ret)
+		goto err_put_vdd_reg;
+
+	/* With a device tree node, "REF-supply" selects the external ref. */
+	if (spi->dev.of_node) {
+		ext_ref = of_property_read_bool(spi->dev.of_node,
+				"REF-supply");
+	} else {
+		struct ad7303_platform_data *pdata = spi->dev.platform_data;
+
+		ext_ref = pdata && pdata->use_external_ref;
+	}
+
+	if (ext_ref) {
+		st->vref_reg = regulator_get(&spi->dev, "REF");
+		if (IS_ERR(st->vref_reg)) {
+			/*
+			 * ret still holds 0 from the successful
+			 * regulator_enable() above; without this the probe
+			 * would report success after freeing everything.
+			 */
+			ret = PTR_ERR(st->vref_reg);
+			goto err_disable_vdd_reg;
+		}
+
+		ret = regulator_enable(st->vref_reg);
+		if (ret)
+			goto err_put_vref_reg;
+
+		st->config |= AD7303_CFG_EXTERNAL_VREF;
+	}
+
+	indio_dev->dev.parent = &spi->dev;
+	indio_dev->name = id->name;
+	indio_dev->info = &ad7303_info;
+	indio_dev->modes = INDIO_DIRECT_MODE;
+	indio_dev->channels = ad7303_channels;
+	indio_dev->num_channels = ARRAY_SIZE(ad7303_channels);
+
+	ret = iio_device_register(indio_dev);
+	if (ret)
+		goto err_disable_vref_reg;
+
+	return 0;
+
+err_disable_vref_reg:
+	if (st->vref_reg)
+		regulator_disable(st->vref_reg);
+err_put_vref_reg:
+	if (st->vref_reg)
+		regulator_put(st->vref_reg);
+err_disable_vdd_reg:
+	regulator_disable(st->vdd_reg);
+err_put_vdd_reg:
+	regulator_put(st->vdd_reg);
+err_free:
+	iio_device_free(indio_dev);
+
+	return ret;
+}
+
+/*
+ * Remove: unregister the IIO device, then release the regulators (the
+ * optional reference first, then Vdd) and free the IIO device.
+ */
+static int ad7303_remove(struct spi_device *spi)
+{
+	struct iio_dev *indio_dev = spi_get_drvdata(spi);
+	struct ad7303_state *st = iio_priv(indio_dev);
+
+	iio_device_unregister(indio_dev);
+
+	/* vref_reg is only set when an external reference was configured */
+	if (st->vref_reg != NULL) {
+		regulator_disable(st->vref_reg);
+		regulator_put(st->vref_reg);
+	}
+
+	regulator_disable(st->vdd_reg);
+	regulator_put(st->vdd_reg);
+
+	iio_device_free(indio_dev);
+
+	return 0;
+}
+
+/* SPI device names handled by this driver. */
+static const struct spi_device_id ad7303_spi_ids[] = {
+	{ "ad7303", 0 },
+	{}
+};
+MODULE_DEVICE_TABLE(spi, ad7303_spi_ids);
+
+/* SPI driver glue: binds probe/remove to the "ad7303" name and id table. */
+static struct spi_driver ad7303_driver = {
+	.id_table = ad7303_spi_ids,
+	.probe = ad7303_probe,
+	.remove = ad7303_remove,
+	.driver = {
+		.owner = THIS_MODULE,
+		.name = "ad7303",
+	},
+};
+module_spi_driver(ad7303_driver);
+
+MODULE_AUTHOR("Lars-Peter Clausen <lars@metafoo.de>");
+MODULE_DESCRIPTION("Analog Devices AD7303 DAC driver");
+MODULE_LICENSE("GPL v2");

diff --git a/drivers/iio/frequency/adf4350.c b/drivers/iio/frequency/adf4350.c
index e76d4ace..a4157cd 100644
--- a/drivers/iio/frequency/adf4350.c
+++ b/drivers/iio/frequency/adf4350.c

@@ -1,7 +1,7 @@
 /*
  * ADF4350/ADF4351 SPI Wideband Synthesizer driver
  *
- * Copyright 2012 Analog Devices Inc.
+ * Copyright 2012-2013 Analog Devices Inc.
  *
  * Licensed under the GPL-2.
  */
@@ -17,6 +17,9 @@
 #include <linux/gcd.h>
 #include <linux/gpio.h>
 #include <asm/div64.h>
+#include <linux/clk.h>
+#include <linux/of.h>
+#include <linux/of_gpio.h>
 
 #include <linux/iio/iio.h>
 #include <linux/iio/sysfs.h>
@@ -33,6 +36,7 @@
 	struct spi_device		*spi;
 	struct regulator		*reg;
 	struct adf4350_platform_data	*pdata;
+	struct clk			*clk;
 	unsigned long			clkin;
 	unsigned long			chspc; /* Channel Spacing */
 	unsigned long			fpfd; /* Phase Frequency Detector */
@@ -43,7 +47,7 @@
 	unsigned			r4_rf_div_sel;
 	unsigned long			regs[6];
 	unsigned long			regs_hw[6];
-
+	unsigned long long		freq_req;
 	/*
 	 * DMA (thus cache coherency maintenance) requires the
 	 * transfer buffers to live in their own cache lines.
@@ -52,7 +56,6 @@
 };
 
 static struct adf4350_platform_data default_pdata = {
-	.clkin = 122880000,
 	.channel_spacing = 10000,
 	.r2_user_settings = ADF4350_REG2_PD_POLARITY_POS |
 			    ADF4350_REG2_CHARGE_PUMP_CURR_uA(2500),
@@ -235,6 +238,7 @@
 		ADF4350_REG4_MUTE_TILL_LOCK_EN));
 
 	st->regs[ADF4350_REG5] = ADF4350_REG5_LD_PIN_MODE_DIGITAL;
+	st->freq_req = freq;
 
 	return adf4350_sync_config(st);
 }
@@ -246,6 +250,7 @@
 {
 	struct adf4350_state *st = iio_priv(indio_dev);
 	unsigned long long readin;
+	unsigned long tmp;
 	int ret;
 
 	ret = kstrtoull(buf, 10, &readin);
@@ -258,10 +263,23 @@
 		ret = adf4350_set_freq(st, readin);
 		break;
 	case ADF4350_FREQ_REFIN:
-		if (readin > ADF4350_MAX_FREQ_REFIN)
+		if (readin > ADF4350_MAX_FREQ_REFIN) {
 			ret = -EINVAL;
-		else
-			st->clkin = readin;
+			break;
+		}
+
+		if (st->clk) {
+			tmp = clk_round_rate(st->clk, readin);
+			if (tmp != readin) {
+				ret = -EINVAL;
+				break;
+			}
+			ret = clk_set_rate(st->clk, tmp);
+			if (ret < 0)
+				break;
+		}
+		st->clkin = readin;
+		ret = adf4350_set_freq(st, st->freq_req);
 		break;
 	case ADF4350_FREQ_RESOLUTION:
 		if (readin == 0)
@@ -308,6 +326,9 @@
 			}
 		break;
 	case ADF4350_FREQ_REFIN:
+		if (st->clk)
+			st->clkin = clk_get_rate(st->clk);
+
 		val = st->clkin;
 		break;
 	case ADF4350_FREQ_RESOLUTION:
@@ -318,6 +339,7 @@
 		break;
 	default:
 		ret = -EINVAL;
+		val = 0;
 	}
 	mutex_unlock(&indio_dev->mlock);
 
@@ -355,19 +377,153 @@
 	.driver_module = THIS_MODULE,
 };
 
+#ifdef CONFIG_OF
+/*
+ * Build platform data from the device tree node of @dev.
+ *
+ * Returns a devm-allocated structure, or NULL when the allocation fails.
+ * Each integer property is read into a variable pre-loaded with a fallback
+ * value (presumably left untouched by of_property_read_u32() when the
+ * property is absent -- confirm against the OF API).  Each boolean
+ * property ORs its register bits into the r2/r3/r4 user settings.
+ */
+static struct adf4350_platform_data *adf4350_parse_dt(struct device *dev)
+{
+	struct device_node *np = dev->of_node;
+	struct adf4350_platform_data *pdata;
+	unsigned int val;
+	int gpio;
+
+	pdata = devm_kzalloc(dev, sizeof(*pdata), GFP_KERNEL);
+	if (pdata == NULL) {
+		dev_err(dev, "could not allocate memory for platform data\n");
+		return NULL;
+	}
+
+	strncpy(&pdata->name[0], np->name, SPI_NAME_SIZE - 1);
+
+	val = 10000;
+	of_property_read_u32(np, "adi,channel-spacing", &val);
+	pdata->channel_spacing = val;
+
+	val = 0;
+	of_property_read_u32(np, "adi,power-up-frequency", &val);
+	pdata->power_up_frequency = val;
+
+	val = 0;
+	of_property_read_u32(np, "adi,reference-div-factor", &val);
+	pdata->ref_div_factor = val;
+
+	/* Any negative of_get_gpio() result is stored as -1 */
+	gpio = of_get_gpio(np, 0);
+	pdata->gpio_lock_detect = gpio < 0 ? -1 : gpio;
+
+	pdata->ref_doubler_en = of_property_read_bool(np,
+			"adi,reference-doubler-enable");
+	pdata->ref_div2_en = of_property_read_bool(np,
+			"adi,reference-div2-enable");
+
+	/* r2_user_settings */
+	pdata->r2_user_settings = 0;
+	if (of_property_read_bool(np,
+			"adi,phase-detector-polarity-positive-enable"))
+		pdata->r2_user_settings |= ADF4350_REG2_PD_POLARITY_POS;
+	if (of_property_read_bool(np,
+			"adi,lock-detect-precision-6ns-enable"))
+		pdata->r2_user_settings |= ADF4350_REG2_LDP_6ns;
+	if (of_property_read_bool(np,
+			"adi,lock-detect-function-integer-n-enable"))
+		pdata->r2_user_settings |= ADF4350_REG2_LDF_INT_N;
+
+	val = 2500;
+	of_property_read_u32(np, "adi,charge-pump-current", &val);
+	pdata->r2_user_settings |= ADF4350_REG2_CHARGE_PUMP_CURR_uA(val);
+
+	val = 0;
+	of_property_read_u32(np, "adi,muxout-select", &val);
+	pdata->r2_user_settings |= ADF4350_REG2_MUXOUT(val);
+
+	if (of_property_read_bool(np, "adi,low-spur-mode-enable"))
+		pdata->r2_user_settings |= ADF4350_REG2_NOISE_MODE(0x3);
+
+	/* r3_user_settings */
+	pdata->r3_user_settings = 0;
+	if (of_property_read_bool(np, "adi,cycle-slip-reduction-enable"))
+		pdata->r3_user_settings |= ADF4350_REG3_12BIT_CSR_EN;
+	if (of_property_read_bool(np, "adi,charge-cancellation-enable"))
+		pdata->r3_user_settings |=
+			ADF4351_REG3_CHARGE_CANCELLATION_EN;
+	if (of_property_read_bool(np, "adi,anti-backlash-3ns-enable"))
+		pdata->r3_user_settings |= ADF4351_REG3_ANTI_BACKLASH_3ns_EN;
+	if (of_property_read_bool(np,
+			"adi,band-select-clock-mode-high-enable"))
+		pdata->r3_user_settings |=
+			ADF4351_REG3_BAND_SEL_CLOCK_MODE_HIGH;
+
+	val = 0;
+	of_property_read_u32(np, "adi,12bit-clk-divider", &val);
+	pdata->r3_user_settings |= ADF4350_REG3_12BIT_CLKDIV(val);
+
+	val = 0;
+	of_property_read_u32(np, "adi,clk-divider-mode", &val);
+	pdata->r3_user_settings |= ADF4350_REG3_12BIT_CLKDIV_MODE(val);
+
+	/* r4_user_settings */
+	pdata->r4_user_settings = 0;
+	if (of_property_read_bool(np, "adi,aux-output-enable"))
+		pdata->r4_user_settings |= ADF4350_REG4_AUX_OUTPUT_EN;
+	if (of_property_read_bool(np, "adi,aux-output-fundamental-enable"))
+		pdata->r4_user_settings |= ADF4350_REG4_AUX_OUTPUT_FUND;
+	if (of_property_read_bool(np, "adi,mute-till-lock-enable"))
+		pdata->r4_user_settings |= ADF4350_REG4_MUTE_TILL_LOCK_EN;
+
+	val = 0;
+	of_property_read_u32(np, "adi,output-power", &val);
+	pdata->r4_user_settings |= ADF4350_REG4_OUTPUT_PWR(val);
+
+	val = 0;
+	of_property_read_u32(np, "adi,aux-output-power", &val);
+	pdata->r4_user_settings |= ADF4350_REG4_AUX_OUTPUT_PWR(val);
+
+	return pdata;
+}
+#else
+/* Without CONFIG_OF there is no device tree to parse. */
+static
+struct adf4350_platform_data *adf4350_parse_dt(struct device *dev)
+{
+	return NULL;
+}
+#endif
+
 static int adf4350_probe(struct spi_device *spi)
 {
-	struct adf4350_platform_data *pdata = spi->dev.platform_data;
+	struct adf4350_platform_data *pdata;
 	struct iio_dev *indio_dev;
 	struct adf4350_state *st;
+	struct clk *clk = NULL;
 	int ret;
 
+	if (spi->dev.of_node) {
+		pdata = adf4350_parse_dt(&spi->dev);
+		if (pdata == NULL)
+			return -EINVAL;
+	} else {
+		pdata = spi->dev.platform_data;
+	}
+
 	if (!pdata) {
 		dev_warn(&spi->dev, "no platform data? using default\n");
-
 		pdata = &default_pdata;
 	}
 
+	if (!pdata->clkin) {
+		clk = clk_get(&spi->dev, "clkin");
+		if (IS_ERR(clk))
+			return -EPROBE_DEFER;
+
+		ret = clk_prepare_enable(clk);
+		if (ret < 0)
+			return ret;
+	}
+
 	indio_dev = iio_device_alloc(sizeof(*st));
 	if (indio_dev == NULL)
 		return -ENOMEM;
@@ -395,7 +551,12 @@
 	indio_dev->num_channels = 1;
 
 	st->chspc = pdata->channel_spacing;
-	st->clkin = pdata->clkin;
+	if (clk) {
+		st->clk = clk;
+		st->clkin = clk_get_rate(clk);
+	} else {
+		st->clkin = pdata->clkin;
+	}
 
 	st->min_out_freq = spi_get_device_id(spi)->driver_data == 4351 ?
 		ADF4351_MIN_OUT_FREQ : ADF4350_MIN_OUT_FREQ;
@@ -435,6 +596,8 @@
 	if (!IS_ERR(st->reg))
 		regulator_put(st->reg);
 
+	if (clk)
+		clk_disable_unprepare(clk);
 	iio_device_free(indio_dev);
 
 	return ret;
@@ -451,6 +614,9 @@
 
 	iio_device_unregister(indio_dev);
 
+	if (st->clk)
+		clk_disable_unprepare(st->clk);
+
 	if (!IS_ERR(reg)) {
 		regulator_disable(reg);
 		regulator_put(reg);
@@ -481,6 +647,6 @@
 };
 module_spi_driver(adf4350_driver);
 
-MODULE_AUTHOR("Michael Hennerich <hennerich@blackfin.uclinux.org>");
+MODULE_AUTHOR("Michael Hennerich <michael.hennerich@analog.com>");
 MODULE_DESCRIPTION("Analog Devices ADF4350/ADF4351 PLL");
 MODULE_LICENSE("GPL v2");

diff --git a/drivers/iio/gyro/Kconfig b/drivers/iio/gyro/Kconfig
index 6be4628..8498e9d 100644
--- a/drivers/iio/gyro/Kconfig
+++ b/drivers/iio/gyro/Kconfig

@@ -10,6 +10,13 @@
 	  Say yes here to build support for Analog Devices ADIS16080, ADIS16100 Yaw
 	  Rate Gyroscope with SPI.
 
+config ADIS16130
+	tristate "Analog Devices ADIS16130 High Precision Angular Rate Sensor driver"
+	depends on SPI
+	help
+	  Say yes here to build support for Analog Devices ADIS16130 High Precision
+	  Angular Rate Sensor driver.
+
 config ADIS16136
 	tristate "Analog devices ADIS16136 and similar gyroscopes driver"
 	depends on SPI_MASTER
@@ -47,7 +54,6 @@
 	select IIO_ST_GYRO_I2C_3AXIS if (I2C)
 	select IIO_ST_GYRO_SPI_3AXIS if (SPI_MASTER)
 	select IIO_TRIGGERED_BUFFER if (IIO_BUFFER)
-	select IIO_ST_GYRO_BUFFER if (IIO_TRIGGERED_BUFFER)
 	help
 	  Say yes here to build support for STMicroelectronics gyroscopes:
 	  L3G4200D, LSM330DL, L3GD20, L3GD20H, LSM330DLC, L3G4IS, LSM330.

diff --git a/drivers/iio/gyro/Makefile b/drivers/iio/gyro/Makefile
index 225d289..e9dc034 100644
--- a/drivers/iio/gyro/Makefile
+++ b/drivers/iio/gyro/Makefile

@@ -3,6 +3,7 @@
 #
 
 obj-$(CONFIG_ADIS16080) += adis16080.o
+obj-$(CONFIG_ADIS16130) += adis16130.o
 obj-$(CONFIG_ADIS16136) += adis16136.o
 obj-$(CONFIG_ADXRS450) += adxrs450.o
 

diff --git a/drivers/iio/gyro/adis16130.c b/drivers/iio/gyro/adis16130.c
new file mode 100644
index 0000000..129acdf
--- /dev/null
+++ b/drivers/iio/gyro/adis16130.c

@@ -0,0 +1,207 @@
+/*
+ * ADIS16130 Digital Output, High Precision Angular Rate Sensor driver
+ *
+ * Copyright 2010 Analog Devices Inc.
+ *
+ * Licensed under the GPL-2 or later.
+ */
+
+#include <linux/mutex.h>
+#include <linux/kernel.h>
+#include <linux/spi/spi.h>
+#include <linux/module.h>
+
+#include <linux/iio/iio.h>
+
+#define ADIS16130_CON         0x0
+#define ADIS16130_CON_RD      (1 << 6)
+#define ADIS16130_IOP         0x1
+
+/* 1 = data-ready signal low when unread data on all channels; */
+#define ADIS16130_IOP_ALL_RDY (1 << 3)
+#define ADIS16130_IOP_SYNC    (1 << 0) /* 1 = synchronization enabled */
+#define ADIS16130_RATEDATA    0x8 /* Gyroscope output, rate of rotation */
+#define ADIS16130_TEMPDATA    0xA /* Temperature output */
+#define ADIS16130_RATECS      0x28 /* Gyroscope channel setup */
+#define ADIS16130_RATECS_EN   (1 << 3) /* 1 = channel enable; */
+#define ADIS16130_TEMPCS      0x2A /* Temperature channel setup */
+#define ADIS16130_TEMPCS_EN   (1 << 3)
+#define ADIS16130_RATECONV    0x30
+#define ADIS16130_TEMPCONV    0x32
+#define ADIS16130_MODE        0x38
+#define ADIS16130_MODE_24BIT  (1 << 1) /* 1 = 24-bit resolution; */
+
+/**
+ * struct adis16130_state - device instance specific data
+ * @us:			actual spi_device to write data
+ * @buf_lock:		mutex to protect tx and rx
+ * @buf:		unified tx/rx buffer; adis16130_spi_read() puts the
+ *			read command in byte 0 and takes the 24-bit result
+ *			from bytes 1..3
+ **/
+struct adis16130_state {
+	struct spi_device		*us;
+	struct mutex			buf_lock;
+	u8				buf[4] ____cacheline_aligned;
+};
+
+/*
+ * Read one register as a single 4-byte full-duplex transfer.
+ *
+ * Byte 0 sends the read command for @reg_addr; on success bytes 1..3 of
+ * the shared buffer are combined MSB first into *@val.  Returns the
+ * spi_sync() result; *@val is untouched on failure.
+ */
+static int adis16130_spi_read(struct iio_dev *indio_dev, u8 reg_addr, u32 *val)
+{
+	struct adis16130_state *st = iio_priv(indio_dev);
+	struct spi_transfer xfer = {
+		.tx_buf = st->buf,
+		.rx_buf = st->buf,
+		.len = 4,
+	};
+	struct spi_message msg;
+	int ret;
+
+	mutex_lock(&st->buf_lock);
+
+	st->buf[0] = ADIS16130_CON_RD | reg_addr;
+	st->buf[1] = 0;
+	st->buf[2] = 0;
+	st->buf[3] = 0;
+
+	spi_message_init(&msg);
+	spi_message_add_tail(&xfer, &msg);
+
+	ret = spi_sync(st->us, &msg);
+	if (ret)
+		goto out_unlock;
+
+	*val = (st->buf[1] << 16) | (st->buf[2] << 8) | st->buf[3];
+
+out_unlock:
+	mutex_unlock(&st->buf_lock);
+
+	return ret;
+}
+
+/*
+ * IIO read_raw callback.
+ *
+ * RAW reads the channel's data register over SPI under mlock.  SCALE and
+ * OFFSET return fixed per-channel-type constants.  Unsupported masks or
+ * channel types yield -EINVAL.
+ */
+static int adis16130_read_raw(struct iio_dev *indio_dev,
+			      struct iio_chan_spec const *chan,
+			      int *val, int *val2,
+			      long mask)
+{
+	u32 raw;
+	int ret;
+
+	switch (mask) {
+	case IIO_CHAN_INFO_RAW:
+		/* Take the iio_dev status lock */
+		mutex_lock(&indio_dev->mlock);
+		ret = adis16130_spi_read(indio_dev, chan->address, &raw);
+		mutex_unlock(&indio_dev->mlock);
+		if (ret)
+			return ret;
+		*val = raw;
+		return IIO_VAL_INT;
+	case IIO_CHAN_INFO_SCALE:
+		if (chan->type == IIO_ANGL_VEL) {
+			/* 0 degree = 8388608, 250 degree = 14260608 */
+			*val = 250;
+			/* RAD_TO_DEGREE(14260608 - 8388608) */
+			*val2 = 336440817;
+			return IIO_VAL_FRACTIONAL;
+		}
+		if (chan->type == IIO_TEMP) {
+			/* 0C = 8036283, 105C = 9516048 */
+			*val = 105000;
+			*val2 = 9516048 - 8036283;
+			return IIO_VAL_FRACTIONAL;
+		}
+		return -EINVAL;
+	case IIO_CHAN_INFO_OFFSET:
+		if (chan->type == IIO_ANGL_VEL) {
+			*val = -8388608;
+			return IIO_VAL_INT;
+		}
+		if (chan->type == IIO_TEMP) {
+			*val = -8036283;
+			return IIO_VAL_INT;
+		}
+		return -EINVAL;
+	default:
+		break;
+	}
+
+	return -EINVAL;
+}
+
+/*
+ * Two channels: Z-axis angular velocity and temperature.  .address is the
+ * data register handed to adis16130_spi_read().
+ */
+static const struct iio_chan_spec adis16130_channels[] = {
+	{
+		.address = ADIS16130_RATEDATA,
+		.type = IIO_ANGL_VEL,
+		.modified = 1,
+		.channel2 = IIO_MOD_Z,
+		.info_mask_separate = BIT(IIO_CHAN_INFO_RAW) |
+			BIT(IIO_CHAN_INFO_SCALE) |
+			BIT(IIO_CHAN_INFO_OFFSET),
+	}, {
+		.address = ADIS16130_TEMPDATA,
+		.type = IIO_TEMP,
+		.indexed = 1,
+		.channel = 0,
+		.info_mask_separate = BIT(IIO_CHAN_INFO_RAW) |
+			BIT(IIO_CHAN_INFO_SCALE) |
+			BIT(IIO_CHAN_INFO_OFFSET),
+	}
+};
+
+/* The device is read-only from the IIO side: only read_raw is provided. */
+static const struct iio_info adis16130_info = {
+	.driver_module = THIS_MODULE,
+	.read_raw = adis16130_read_raw,
+};
+
+/*
+ * Probe: allocate the IIO device, fill in the driver state and channel
+ * description, and register it.  Returns 0 on success, -ENOMEM when the
+ * allocation fails, or the iio_device_register() error (after freeing the
+ * device).
+ */
+static int adis16130_probe(struct spi_device *spi)
+{
+	struct iio_dev *indio_dev;
+	struct adis16130_state *st;
+	int ret;
+
+	/* setup the industrialio driver allocated elements */
+	indio_dev = iio_device_alloc(sizeof(*st));
+	if (!indio_dev)
+		return -ENOMEM;
+
+	st = iio_priv(indio_dev);
+	/* this is only used for removal purposes */
+	spi_set_drvdata(spi, indio_dev);
+	st->us = spi;
+	mutex_init(&st->buf_lock);
+
+	indio_dev->dev.parent = &spi->dev;
+	indio_dev->name = spi->dev.driver->name;
+	indio_dev->info = &adis16130_info;
+	indio_dev->modes = INDIO_DIRECT_MODE;
+	indio_dev->channels = adis16130_channels;
+	indio_dev->num_channels = ARRAY_SIZE(adis16130_channels);
+
+	ret = iio_device_register(indio_dev);
+	if (ret) {
+		iio_device_free(indio_dev);
+		return ret;
+	}
+
+	return 0;
+}
+
+/* Remove: unregister and free the IIO device stored as SPI driver data. */
+static int adis16130_remove(struct spi_device *spi)
+{
+	struct iio_dev *indio_dev = spi_get_drvdata(spi);
+
+	iio_device_unregister(indio_dev);
+	iio_device_free(indio_dev);
+
+	return 0;
+}
+
+/* SPI driver glue; this driver is matched by name only (no id_table). */
+static struct spi_driver adis16130_driver = {
+	.probe = adis16130_probe,
+	.remove = adis16130_remove,
+	.driver = {
+		.owner = THIS_MODULE,
+		.name = "adis16130",
+	},
+};
+module_spi_driver(adis16130_driver);
+
+MODULE_AUTHOR("Barry Song <21cnbao@gmail.com>");
+MODULE_DESCRIPTION("Analog Devices ADIS16130 High Precision Angular Rate");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS("spi:adis16130");

diff --git a/drivers/iio/gyro/st_gyro_core.c b/drivers/iio/gyro/st_gyro_core.c
index fa9b242..f9ed348 100644
--- a/drivers/iio/gyro/st_gyro_core.c
+++ b/drivers/iio/gyro/st_gyro_core.c

@@ -27,6 +27,8 @@
 #include <linux/iio/common/st_sensors.h>
 #include "st_gyro.h"
 
+#define ST_GYRO_NUMBER_DATA_CHANNELS		3
+
 /* DEFAULT VALUE FOR SENSORS */
 #define ST_GYRO_DEFAULT_OUT_X_L_ADDR		0x28
 #define ST_GYRO_DEFAULT_OUT_Y_L_ADDR		0x2a
@@ -86,15 +88,18 @@
 #define ST_GYRO_2_MULTIREAD_BIT			true
 
 static const struct iio_chan_spec st_gyro_16bit_channels[] = {
-	ST_SENSORS_LSM_CHANNELS(IIO_ANGL_VEL, ST_SENSORS_SCAN_X,
-		IIO_MOD_X, IIO_LE, ST_SENSORS_DEFAULT_16_REALBITS,
-						ST_GYRO_DEFAULT_OUT_X_L_ADDR),
-	ST_SENSORS_LSM_CHANNELS(IIO_ANGL_VEL, ST_SENSORS_SCAN_Y,
-		IIO_MOD_Y, IIO_LE, ST_SENSORS_DEFAULT_16_REALBITS,
-						ST_GYRO_DEFAULT_OUT_Y_L_ADDR),
-	ST_SENSORS_LSM_CHANNELS(IIO_ANGL_VEL, ST_SENSORS_SCAN_Z,
-		IIO_MOD_Z, IIO_LE, ST_SENSORS_DEFAULT_16_REALBITS,
-						ST_GYRO_DEFAULT_OUT_Z_L_ADDR),
+	ST_SENSORS_LSM_CHANNELS(IIO_ANGL_VEL,
+			BIT(IIO_CHAN_INFO_RAW) | BIT(IIO_CHAN_INFO_SCALE),
+			ST_SENSORS_SCAN_X, 1, IIO_MOD_X, 's', IIO_LE, 16, 16,
+			ST_GYRO_DEFAULT_OUT_X_L_ADDR),
+	ST_SENSORS_LSM_CHANNELS(IIO_ANGL_VEL,
+			BIT(IIO_CHAN_INFO_RAW) | BIT(IIO_CHAN_INFO_SCALE),
+			ST_SENSORS_SCAN_Y, 1, IIO_MOD_Y, 's', IIO_LE, 16, 16,
+			ST_GYRO_DEFAULT_OUT_Y_L_ADDR),
+	ST_SENSORS_LSM_CHANNELS(IIO_ANGL_VEL,
+			BIT(IIO_CHAN_INFO_RAW) | BIT(IIO_CHAN_INFO_SCALE),
+			ST_SENSORS_SCAN_Z, 1, IIO_MOD_Z, 's', IIO_LE, 16, 16,
+			ST_GYRO_DEFAULT_OUT_Z_L_ADDR),
 	IIO_CHAN_SOFT_TIMESTAMP(3)
 };
 
@@ -310,6 +315,7 @@
 	if (err < 0)
 		goto st_gyro_common_probe_error;
 
+	gdata->num_data_channels = ST_GYRO_NUMBER_DATA_CHANNELS;
 	gdata->multiread_bit = gdata->sensor->multi_read_bit;
 	indio_dev->channels = gdata->sensor->ch;
 	indio_dev->num_channels = ST_SENSORS_NUMBER_ALL_CHANNELS;

diff --git a/drivers/iio/industrialio-buffer.c b/drivers/iio/industrialio-buffer.c
index aaadd32..e73033f 100644
--- a/drivers/iio/industrialio-buffer.c
+++ b/drivers/iio/industrialio-buffer.c

@@ -542,8 +542,7 @@
 		ret = indio_dev->setup_ops->preenable(indio_dev);
 		if (ret) {
 			printk(KERN_ERR
-			       "Buffer not started:"
-			       "buffer preenable failed\n");
+			       "Buffer not started: buffer preenable failed (%d)\n", ret);
 			goto error_remove_inserted;
 		}
 	}
@@ -556,8 +555,7 @@
 			ret = buffer->access->request_update(buffer);
 			if (ret) {
 				printk(KERN_INFO
-				       "Buffer not started:"
-				       "buffer parameter update failed\n");
+				       "Buffer not started: buffer parameter update failed (%d)\n", ret);
 				goto error_run_postdisable;
 			}
 		}
@@ -566,7 +564,7 @@
 			->update_scan_mode(indio_dev,
 					   indio_dev->active_scan_mask);
 		if (ret < 0) {
-			printk(KERN_INFO "update scan mode failed\n");
+			printk(KERN_INFO "Buffer not started: update scan mode failed (%d)\n", ret);
 			goto error_run_postdisable;
 		}
 	}
@@ -590,7 +588,7 @@
 		ret = indio_dev->setup_ops->postenable(indio_dev);
 		if (ret) {
 			printk(KERN_INFO
-			       "Buffer not started: postenable failed\n");
+			       "Buffer not started: postenable failed (%d)\n", ret);
 			indio_dev->currentmode = INDIO_DIRECT_MODE;
 			if (indio_dev->setup_ops->postdisable)
 				indio_dev->setup_ops->postdisable(indio_dev);

diff --git a/drivers/iio/light/hid-sensor-als.c b/drivers/iio/light/hid-sensor-als.c
index 80d68ff..cdc2cad 100644
--- a/drivers/iio/light/hid-sensor-als.c
+++ b/drivers/iio/light/hid-sensor-als.c

@@ -31,7 +31,7 @@
 #include "../common/hid-sensors/hid-sensor-trigger.h"
 
 /*Format: HID-SENSOR-usage_id_in_hex*/
-/*Usage ID from spec for Accelerometer-3D: 0x200041*/
+/*Usage ID from spec for Ambient-Light: 0x200041*/
 #define DRIVER_NAME "HID-SENSOR-200041"
 
 #define CHANNEL_SCAN_INDEX_ILLUM 0

diff --git a/drivers/iio/magnetometer/Kconfig b/drivers/iio/magnetometer/Kconfig
index bd1cfb6..c332b0a 100644
--- a/drivers/iio/magnetometer/Kconfig
+++ b/drivers/iio/magnetometer/Kconfig

@@ -32,7 +32,6 @@
 	select IIO_ST_MAGN_I2C_3AXIS if (I2C)
 	select IIO_ST_MAGN_SPI_3AXIS if (SPI_MASTER)
 	select IIO_TRIGGERED_BUFFER if (IIO_BUFFER)
-	select IIO_ST_MAGN_BUFFER if (IIO_TRIGGERED_BUFFER)
 	help
 	  Say yes here to build support for STMicroelectronics magnetometers:
 	  LSM303DLHC, LSM303DLM, LIS3MDL.

diff --git a/drivers/iio/magnetometer/ak8975.c b/drivers/iio/magnetometer/ak8975.c
index af6c320..7105f22 100644
--- a/drivers/iio/magnetometer/ak8975.c
+++ b/drivers/iio/magnetometer/ak8975.c

@@ -24,11 +24,13 @@
 #include <linux/kernel.h>
 #include <linux/slab.h>
 #include <linux/i2c.h>
+#include <linux/interrupt.h>
 #include <linux/err.h>
 #include <linux/mutex.h>
 #include <linux/delay.h>
-
+#include <linux/bitops.h>
 #include <linux/gpio.h>
+#include <linux/of_gpio.h>
 
 #include <linux/iio/iio.h>
 #include <linux/iio/sysfs.h>
@@ -82,6 +84,7 @@
  */
 #define AK8975_MAX_CONVERSION_TIMEOUT	500
 #define AK8975_CONVERSION_DONE_POLL_TIME 10
+#define AK8975_DATA_READY_TIMEOUT	((100*HZ)/1000)
 
 /*
  * Per-instance context data for the device.
@@ -94,6 +97,9 @@
 	long			raw_to_gauss[3];
 	u8			reg_cache[AK8975_MAX_REGS];
 	int			eoc_gpio;
+	int			eoc_irq;
+	wait_queue_head_t	data_ready_queue;
+	unsigned long		flags;
 };
 
 static const int ak8975_index_to_reg[] = {
@@ -123,6 +129,51 @@
 }
 
 /*
+ * Handle data ready irq
+ */
+static irqreturn_t ak8975_irq_handler(int irq, void *data)
+{
+	struct ak8975_data *priv = data;
+
+	/* Set the flag before waking the waiter, which tests bit 0. */
+	set_bit(0, &priv->flags);
+	wake_up(&priv->data_ready_queue);
+
+	return IRQ_HANDLED;
+}
+
+/*
+ * Install data ready interrupt handler
+ *
+ * Uses client->irq when it is set, otherwise the interrupt mapped from the
+ * end-of-conversion GPIO.  Returns the request_irq() result; on success
+ * data->eoc_irq records the interrupt number.
+ */
+static int ak8975_setup_irq(struct ak8975_data *data)
+{
+	struct i2c_client *client = data->client;
+	int rc;
+	int irq;
+
+	/*
+	 * The handler uses both the flag and the wait queue and may run as
+	 * soon as request_irq() has installed it, so set them up first.
+	 */
+	init_waitqueue_head(&data->data_ready_queue);
+	clear_bit(0, &data->flags);
+
+	if (client->irq)
+		irq = client->irq;
+	else
+		irq = gpio_to_irq(data->eoc_gpio);
+
+	rc = request_irq(irq, ak8975_irq_handler,
+			 IRQF_TRIGGER_RISING | IRQF_ONESHOT,
+			 dev_name(&client->dev), data);
+	if (rc < 0) {
+		dev_err(&client->dev,
+			"irq %d request failed, (gpio %d): %d\n",
+			irq, data->eoc_gpio, rc);
+		return rc;
+	}
+
+	data->eoc_irq = irq;
+
+	return rc;
+}
+
+
+/*
  * Perform some start-of-day setup, including reading the asa calibration
  * values and caching them.
  */
@@ -170,6 +221,16 @@
 				AK8975_REG_CNTL_MODE_POWER_DOWN,
 				AK8975_REG_CNTL_MODE_MASK,
 				AK8975_REG_CNTL_MODE_SHIFT);
+
+	if (data->eoc_gpio > 0 || client->irq) {
+		ret = ak8975_setup_irq(data);
+		if (ret < 0) {
+			dev_err(&client->dev,
+				"Error setting data ready interrupt\n");
+			return ret;
+		}
+	}
+
 	if (ret < 0) {
 		dev_err(&client->dev, "Error in setting power-down mode\n");
 		return ret;
@@ -266,9 +327,23 @@
 		dev_err(&client->dev, "Conversion timeout happened\n");
 		return -EINVAL;
 	}
+
 	return read_status;
 }
 
+/* Returns 0 if the end of conversion interrupt occurred or -ETIME otherwise */
+static int wait_conversion_complete_interrupt(struct ak8975_data *data)
+{
+	int remaining;
+
+	remaining = wait_event_timeout(data->data_ready_queue,
+				       test_bit(0, &data->flags),
+				       AK8975_DATA_READY_TIMEOUT);
+	/* Re-arm the flag for the next conversion, whether or not it fired. */
+	clear_bit(0, &data->flags);
+
+	if (remaining > 0)
+		return 0;
+
+	return -ETIME;
+}
+
 /*
  * Emits the raw flux value for the x, y, or z axis.
  */
@@ -294,13 +369,16 @@
 	}
 
 	/* Wait for the conversion to complete. */
-	if (gpio_is_valid(data->eoc_gpio))
+	if (data->eoc_irq)
+		ret = wait_conversion_complete_interrupt(data);
+	else if (gpio_is_valid(data->eoc_gpio))
 		ret = wait_conversion_complete_gpio(data);
 	else
 		ret = wait_conversion_complete_polled(data);
 	if (ret < 0)
 		goto exit;
 
+	/* This will be executed only for non-interrupt based waiting case */
 	if (ret & AK8975_REG_ST1_DRDY_MASK) {
 		ret = i2c_smbus_read_byte_data(client, AK8975_REG_ST2);
 		if (ret < 0) {
@@ -384,10 +462,15 @@
 	int err;
 
 	/* Grab and set up the supplied GPIO. */
-	if (client->dev.platform_data == NULL)
-		eoc_gpio = -1;
-	else
+	if (client->dev.platform_data)
 		eoc_gpio = *(int *)(client->dev.platform_data);
+	else if (client->dev.of_node)
+		eoc_gpio = of_get_gpio(client->dev.of_node, 0);
+	else
+		eoc_gpio = -1;
+
+	if (eoc_gpio == -EPROBE_DEFER)
+		return -EPROBE_DEFER;
 
 	/* We may not have a GPIO based IRQ to scan, that is fine, we will
 	   poll if so */
@@ -409,6 +492,11 @@
 	}
 	data = iio_priv(indio_dev);
 	i2c_set_clientdata(client, indio_dev);
+
+	data->client = client;
+	data->eoc_gpio = eoc_gpio;
+	data->eoc_irq = 0;
+
 	/* Perform some basic start-of-day setup of the device. */
 	err = ak8975_setup(client);
 	if (err < 0) {
@@ -433,6 +521,8 @@
 
 exit_free_iio:
 	iio_device_free(indio_dev);
+	if (data->eoc_irq)
+		free_irq(data->eoc_irq, data);
 exit_gpio:
 	if (gpio_is_valid(eoc_gpio))
 		gpio_free(eoc_gpio);
@@ -447,6 +537,9 @@
 
 	iio_device_unregister(indio_dev);
 
+	if (data->eoc_irq)
+		free_irq(data->eoc_irq, data);
+
 	if (gpio_is_valid(data->eoc_gpio))
 		gpio_free(data->eoc_gpio);
 

diff --git a/drivers/iio/magnetometer/st_magn_core.c b/drivers/iio/magnetometer/st_magn_core.c
index 16f0d6d..ebfe8f1 100644
--- a/drivers/iio/magnetometer/st_magn_core.c
+++ b/drivers/iio/magnetometer/st_magn_core.c

@@ -26,6 +26,8 @@
 #include <linux/iio/common/st_sensors.h>
 #include "st_magn.h"
 
+#define ST_MAGN_NUMBER_DATA_CHANNELS		3
+
 /* DEFAULT VALUE FOR SENSORS */
 #define ST_MAGN_DEFAULT_OUT_X_L_ADDR		0X04
 #define ST_MAGN_DEFAULT_OUT_Y_L_ADDR		0X08
@@ -113,22 +115,34 @@
 #define ST_MAGN_2_OUT_Z_L_ADDR			0x2c
 
 static const struct iio_chan_spec st_magn_16bit_channels[] = {
-	ST_SENSORS_LSM_CHANNELS(IIO_MAGN, ST_SENSORS_SCAN_X, IIO_MOD_X, IIO_LE,
-		ST_SENSORS_DEFAULT_16_REALBITS, ST_MAGN_DEFAULT_OUT_X_L_ADDR),
-	ST_SENSORS_LSM_CHANNELS(IIO_MAGN, ST_SENSORS_SCAN_Y, IIO_MOD_Y, IIO_LE,
-		ST_SENSORS_DEFAULT_16_REALBITS, ST_MAGN_DEFAULT_OUT_Y_L_ADDR),
-	ST_SENSORS_LSM_CHANNELS(IIO_MAGN, ST_SENSORS_SCAN_Z, IIO_MOD_Z, IIO_LE,
-		ST_SENSORS_DEFAULT_16_REALBITS, ST_MAGN_DEFAULT_OUT_Z_L_ADDR),
+	ST_SENSORS_LSM_CHANNELS(IIO_MAGN,
+			BIT(IIO_CHAN_INFO_RAW) | BIT(IIO_CHAN_INFO_SCALE),
+			ST_SENSORS_SCAN_X, 1, IIO_MOD_X, 's', IIO_LE, 16, 16,
+			ST_MAGN_DEFAULT_OUT_X_L_ADDR),
+	ST_SENSORS_LSM_CHANNELS(IIO_MAGN,
+			BIT(IIO_CHAN_INFO_RAW) | BIT(IIO_CHAN_INFO_SCALE),
+			ST_SENSORS_SCAN_Y, 1, IIO_MOD_Y, 's', IIO_LE, 16, 16,
+			ST_MAGN_DEFAULT_OUT_Y_L_ADDR),
+	ST_SENSORS_LSM_CHANNELS(IIO_MAGN,
+			BIT(IIO_CHAN_INFO_RAW) | BIT(IIO_CHAN_INFO_SCALE),
+			ST_SENSORS_SCAN_Z, 1, IIO_MOD_Z, 's', IIO_LE, 16, 16,
+			ST_MAGN_DEFAULT_OUT_Z_L_ADDR),
 	IIO_CHAN_SOFT_TIMESTAMP(3)
 };
 
 static const struct iio_chan_spec st_magn_2_16bit_channels[] = {
-	ST_SENSORS_LSM_CHANNELS(IIO_MAGN, ST_SENSORS_SCAN_X, IIO_MOD_X, IIO_LE,
-		ST_SENSORS_DEFAULT_16_REALBITS, ST_MAGN_2_OUT_X_L_ADDR),
-	ST_SENSORS_LSM_CHANNELS(IIO_MAGN, ST_SENSORS_SCAN_Y, IIO_MOD_Y, IIO_LE,
-		ST_SENSORS_DEFAULT_16_REALBITS, ST_MAGN_2_OUT_Y_L_ADDR),
-	ST_SENSORS_LSM_CHANNELS(IIO_MAGN, ST_SENSORS_SCAN_Z, IIO_MOD_Z, IIO_LE,
-		ST_SENSORS_DEFAULT_16_REALBITS, ST_MAGN_2_OUT_Z_L_ADDR),
+	ST_SENSORS_LSM_CHANNELS(IIO_MAGN,
+			BIT(IIO_CHAN_INFO_RAW) | BIT(IIO_CHAN_INFO_SCALE),
+			ST_SENSORS_SCAN_X, 1, IIO_MOD_X, 's', IIO_LE, 16, 16,
+			ST_MAGN_2_OUT_X_L_ADDR),
+	ST_SENSORS_LSM_CHANNELS(IIO_MAGN,
+			BIT(IIO_CHAN_INFO_RAW) | BIT(IIO_CHAN_INFO_SCALE),
+			ST_SENSORS_SCAN_Y, 1, IIO_MOD_Y, 's', IIO_LE, 16, 16,
+			ST_MAGN_2_OUT_Y_L_ADDR),
+	ST_SENSORS_LSM_CHANNELS(IIO_MAGN,
+			BIT(IIO_CHAN_INFO_RAW) | BIT(IIO_CHAN_INFO_SCALE),
+			ST_SENSORS_SCAN_Z, 1, IIO_MOD_Z, 's', IIO_LE, 16, 16,
+			ST_MAGN_2_OUT_Z_L_ADDR),
 	IIO_CHAN_SOFT_TIMESTAMP(3)
 };
 
@@ -344,6 +358,7 @@
 	if (err < 0)
 		goto st_magn_common_probe_error;
 
+	mdata->num_data_channels = ST_MAGN_NUMBER_DATA_CHANNELS;
 	mdata->multiread_bit = mdata->sensor->multi_read_bit;
 	indio_dev->channels = mdata->sensor->ch;
 	indio_dev->num_channels = ST_SENSORS_NUMBER_ALL_CHANNELS;

diff --git a/drivers/iio/pressure/Kconfig b/drivers/iio/pressure/Kconfig
new file mode 100644
index 0000000..9427f01
--- /dev/null
+++ b/drivers/iio/pressure/Kconfig

@@ -0,0 +1,35 @@
+#
+# Pressure drivers
+#
+menu "Pressure Sensors"
+
+config IIO_ST_PRESS
+	tristate "STMicroelectronics pressures Driver"
+	depends on (I2C || SPI_MASTER) && SYSFS
+	select IIO_ST_SENSORS_CORE
+	select IIO_ST_PRESS_I2C if (I2C)
+	select IIO_ST_PRESS_SPI if (SPI_MASTER)
+	select IIO_TRIGGERED_BUFFER if (IIO_BUFFER)
+	help
+	  Say yes here to build support for STMicroelectronics pressures:
+	  LPS331AP.
+
+	  This driver can also be built as a module. If so, these modules
+	  will be created:
+	  - st_pressure (core functions for the driver [it is mandatory]);
+	  - st_pressure_i2c (necessary for the I2C devices [optional*]);
+	  - st_pressure_spi (necessary for the SPI devices [optional*]);
+
+	  (*) one of these is necessary to do something.
+
+config IIO_ST_PRESS_I2C
+	tristate
+	depends on IIO_ST_PRESS
+	depends on IIO_ST_SENSORS_I2C
+
+config IIO_ST_PRESS_SPI
+	tristate
+	depends on IIO_ST_PRESS
+	depends on IIO_ST_SENSORS_SPI
+
+endmenu

diff --git a/drivers/iio/pressure/Makefile b/drivers/iio/pressure/Makefile
new file mode 100644
index 0000000..d4bb33e
--- /dev/null
+++ b/drivers/iio/pressure/Makefile

@@ -0,0 +1,10 @@
+#
+# Makefile for industrial I/O pressure drivers
+#
+
+obj-$(CONFIG_IIO_ST_PRESS) += st_pressure.o
+st_pressure-y := st_pressure_core.o
+st_pressure-$(CONFIG_IIO_BUFFER) += st_pressure_buffer.o
+
+obj-$(CONFIG_IIO_ST_PRESS_I2C) += st_pressure_i2c.o
+obj-$(CONFIG_IIO_ST_PRESS_SPI) += st_pressure_spi.o

diff --git a/drivers/iio/pressure/st_pressure.h b/drivers/iio/pressure/st_pressure.h
new file mode 100644
index 0000000..414e45a
--- /dev/null
+++ b/drivers/iio/pressure/st_pressure.h

@@ -0,0 +1,39 @@
+/*
+ * STMicroelectronics pressures driver
+ *
+ * Copyright 2013 STMicroelectronics Inc.
+ *
+ * Denis Ciocca <denis.ciocca@st.com>
+ * v. 1.0.0
+ * Licensed under the GPL-2.
+ */
+
+#ifndef ST_PRESS_H
+#define ST_PRESS_H
+
+#include <linux/types.h>
+#include <linux/iio/common/st_sensors.h>
+
+#define LPS331AP_PRESS_DEV_NAME		"lps331ap"
+
+int st_press_common_probe(struct iio_dev *indio_dev);
+void st_press_common_remove(struct iio_dev *indio_dev);
+
+#ifdef CONFIG_IIO_BUFFER /* buffer support built: real functions live in st_pressure_buffer.c */
+int st_press_allocate_ring(struct iio_dev *indio_dev);
+void st_press_deallocate_ring(struct iio_dev *indio_dev);
+int st_press_trig_set_state(struct iio_trigger *trig, bool state);
+#define ST_PRESS_TRIGGER_SET_STATE (&st_press_trig_set_state) /* callback used in the trigger ops */
+#else /* CONFIG_IIO_BUFFER */
+static inline int st_press_allocate_ring(struct iio_dev *indio_dev)
+{
+	return 0; /* stub: nothing to allocate, always reports success */
+}
+
+static inline void st_press_deallocate_ring(struct iio_dev *indio_dev)
+{
+} /* stub: intentionally empty */
+#define ST_PRESS_TRIGGER_SET_STATE NULL /* no set-state callback in this configuration */
+#endif /* CONFIG_IIO_BUFFER */
+
+#endif /* ST_PRESS_H */

diff --git a/drivers/iio/pressure/st_pressure_buffer.c b/drivers/iio/pressure/st_pressure_buffer.c
new file mode 100644
index 0000000..f877ef8
--- /dev/null
+++ b/drivers/iio/pressure/st_pressure_buffer.c

@@ -0,0 +1,105 @@
+/*
+ * STMicroelectronics pressures driver
+ *
+ * Copyright 2013 STMicroelectronics Inc.
+ *
+ * Denis Ciocca <denis.ciocca@st.com>
+ *
+ * Licensed under the GPL-2.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/stat.h>
+#include <linux/interrupt.h>
+#include <linux/i2c.h>
+#include <linux/delay.h>
+#include <linux/iio/iio.h>
+#include <linux/iio/buffer.h>
+#include <linux/iio/trigger_consumer.h>
+#include <linux/iio/triggered_buffer.h>
+
+#include <linux/iio/common/st_sensors.h>
+#include "st_pressure.h"
+
+/*
+ * Trigger state callback: hand the enable/disable request to the shared
+ * ST sensors data-ready helper, using the iio device stored as the
+ * trigger's driver data.
+ */
+int st_press_trig_set_state(struct iio_trigger *trig, bool state)
+{
+	return st_sensors_set_dataready_irq(iio_trigger_get_drvdata(trig),
+					    state);
+}
+
+/*
+ * Buffer pre-enable hook: switch the sensor on, then run the generic
+ * software-buffer pre-enable step. A failure to enable the sensor is
+ * returned as is and the generic step is skipped.
+ */
+static int st_press_buffer_preenable(struct iio_dev *indio_dev)
+{
+	int ret = st_sensors_set_enable(indio_dev, true);
+
+	if (ret < 0)
+		return ret;
+
+	return iio_sw_buffer_preenable(indio_dev);
+}
+
+/*
+ * Buffer post-enable hook: allocate a scratch buffer of scan_bytes bytes
+ * and attach the trigger. The scratch buffer is released again if the
+ * triggered-buffer step fails.
+ *
+ * Returns -ENOMEM when the allocation fails, otherwise the result of
+ * iio_triggered_buffer_postenable().
+ */
+static int st_press_buffer_postenable(struct iio_dev *indio_dev)
+{
+	struct st_sensor_data *sdata = iio_priv(indio_dev);
+	int ret;
+
+	sdata->buffer_data = kmalloc(indio_dev->scan_bytes, GFP_KERNEL);
+	if (!sdata->buffer_data)
+		return -ENOMEM;
+
+	ret = iio_triggered_buffer_postenable(indio_dev);
+	if (ret < 0)
+		kfree(sdata->buffer_data);
+
+	return ret;
+}
+
+/*
+ * Buffer pre-disable hook: detach the trigger and, if that worked, switch
+ * the sensor off. The scratch buffer is freed in every case.
+ */
+static int st_press_buffer_predisable(struct iio_dev *indio_dev)
+{
+	struct st_sensor_data *sdata = iio_priv(indio_dev);
+	int ret = iio_triggered_buffer_predisable(indio_dev);
+
+	if (ret >= 0)
+		ret = st_sensors_set_enable(indio_dev, false);
+
+	kfree(sdata->buffer_data);
+	return ret;
+}
+
+static const struct iio_buffer_setup_ops st_press_buffer_setup_ops = { /* buffer hooks passed to iio_triggered_buffer_setup() */
+	.preenable = &st_press_buffer_preenable, /* enables the sensor */
+	.postenable = &st_press_buffer_postenable, /* allocates buffer_data */
+	.predisable = &st_press_buffer_predisable, /* disables the sensor, frees buffer_data */
+};
+
+/*
+ * Set up the triggered buffer for the device, using the timestamp-storing
+ * poll function as top half and the shared ST sensors trigger handler as
+ * thread function. Returns whatever iio_triggered_buffer_setup() returns.
+ */
+int st_press_allocate_ring(struct iio_dev *indio_dev)
+{
+	int ret;
+
+	ret = iio_triggered_buffer_setup(indio_dev,
+					 iio_pollfunc_store_time,
+					 st_sensors_trigger_handler,
+					 &st_press_buffer_setup_ops);
+	return ret;
+}
+
+void st_press_deallocate_ring(struct iio_dev *indio_dev) /* counterpart of st_press_allocate_ring() */
+{
+	iio_triggered_buffer_cleanup(indio_dev); /* undoes iio_triggered_buffer_setup() */
+}
+
+MODULE_AUTHOR("Denis Ciocca <denis.ciocca@st.com>");
+MODULE_DESCRIPTION("STMicroelectronics pressures buffer");
+MODULE_LICENSE("GPL v2");

diff --git a/drivers/iio/pressure/st_pressure_core.c b/drivers/iio/pressure/st_pressure_core.c
new file mode 100644
index 0000000..9c343b4
--- /dev/null
+++ b/drivers/iio/pressure/st_pressure_core.c

@@ -0,0 +1,272 @@
+/*
+ * STMicroelectronics pressures driver
+ *
+ * Copyright 2013 STMicroelectronics Inc.
+ *
+ * Denis Ciocca <denis.ciocca@st.com>
+ *
+ * Licensed under the GPL-2.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/mutex.h>
+#include <linux/interrupt.h>
+#include <linux/i2c.h>
+#include <linux/gpio.h>
+#include <linux/irq.h>
+#include <linux/delay.h>
+#include <linux/iio/iio.h>
+#include <linux/iio/sysfs.h>
+#include <linux/iio/trigger.h>
+#include <linux/iio/buffer.h>
+#include <asm/unaligned.h>
+
+#include <linux/iio/common/st_sensors.h>
+#include "st_pressure.h"
+
+/*
+ * Multiplies its argument by 10 (helper named for the mbar -> kPa gain
+ * conversion). The argument is parenthesized so that expression arguments
+ * such as "a + b" are scaled as a whole.
+ */
+#define ST_PRESS_MBAR_TO_KPASCAL(x)		((x) * 10)
+#define ST_PRESS_NUMBER_DATA_CHANNELS		1
+
+/* DEFAULT VALUE FOR SENSORS */
+#define ST_PRESS_DEFAULT_OUT_XL_ADDR		0x28
+#define ST_TEMP_DEFAULT_OUT_L_ADDR		0x2b
+
+/* FULLSCALE */
+#define ST_PRESS_FS_AVL_1260MB			1260
+
+/* CUSTOM VALUES FOR SENSOR 1 */
+#define ST_PRESS_1_WAI_EXP			0xbb
+#define ST_PRESS_1_ODR_ADDR			0x20
+#define ST_PRESS_1_ODR_MASK			0x70
+#define ST_PRESS_1_ODR_AVL_1HZ_VAL		0x01
+#define ST_PRESS_1_ODR_AVL_7HZ_VAL		0x05
+#define ST_PRESS_1_ODR_AVL_13HZ_VAL		0x06
+#define ST_PRESS_1_ODR_AVL_25HZ_VAL		0x07
+#define ST_PRESS_1_PW_ADDR			0x20
+#define ST_PRESS_1_PW_MASK			0x80
+#define ST_PRESS_1_FS_ADDR			0x23
+#define ST_PRESS_1_FS_MASK			0x30
+#define ST_PRESS_1_FS_AVL_1260_VAL		0x00
+#define ST_PRESS_1_FS_AVL_1260_GAIN		ST_PRESS_MBAR_TO_KPASCAL(244141)
+#define ST_PRESS_1_FS_AVL_TEMP_GAIN		2083000
+#define ST_PRESS_1_BDU_ADDR			0x20
+#define ST_PRESS_1_BDU_MASK			0x04
+#define ST_PRESS_1_DRDY_IRQ_ADDR		0x22
+#define ST_PRESS_1_DRDY_IRQ_MASK		0x04
+#define ST_PRESS_1_MULTIREAD_BIT		true
+#define ST_PRESS_1_TEMP_OFFSET			42500
+
+static const struct iio_chan_spec st_press_channels[] = { /* channel list installed by st_press_common_probe() */
+	ST_SENSORS_LSM_CHANNELS(IIO_PRESSURE, /* pressure: raw + scale attributes */
+			BIT(IIO_CHAN_INFO_RAW) | BIT(IIO_CHAN_INFO_SCALE),
+			ST_SENSORS_SCAN_X, 0, IIO_NO_MOD, 'u', IIO_LE, 24, 24, /* presumably unsigned LE, 24 bits - confirm against the macro */
+			ST_PRESS_DEFAULT_OUT_XL_ADDR),
+	ST_SENSORS_LSM_CHANNELS(IIO_TEMP, /* temperature: raw + scale + offset attributes */
+			BIT(IIO_CHAN_INFO_RAW) | BIT(IIO_CHAN_INFO_SCALE) |
+						BIT(IIO_CHAN_INFO_OFFSET),
+			-1, 0, IIO_NO_MOD, 's', IIO_LE, 16, 16, /* NOTE(review): -1 looks like "no scan index" - confirm */
+			ST_TEMP_DEFAULT_OUT_L_ADDR),
+	IIO_CHAN_SOFT_TIMESTAMP(1)
+};
+
+static const struct st_sensors st_press_sensors[] = { /* supported-device table handed to st_sensors_check_device_support() */
+	{
+		.wai = ST_PRESS_1_WAI_EXP, /* presumably the expected WHO_AM_I value - confirm */
+		.sensors_supported = {
+			[0] = LPS331AP_PRESS_DEV_NAME,
+		},
+		.ch = (struct iio_chan_spec *)st_press_channels, /* cast drops const from the channel table */
+		.odr = { /* output data rate register and selectable rates */
+			.addr = ST_PRESS_1_ODR_ADDR,
+			.mask = ST_PRESS_1_ODR_MASK,
+			.odr_avl = {
+				{ 1, ST_PRESS_1_ODR_AVL_1HZ_VAL, },
+				{ 7, ST_PRESS_1_ODR_AVL_7HZ_VAL, },
+				{ 13, ST_PRESS_1_ODR_AVL_13HZ_VAL, },
+				{ 25, ST_PRESS_1_ODR_AVL_25HZ_VAL, },
+			},
+		},
+		.pw = { /* power control register */
+			.addr = ST_PRESS_1_PW_ADDR,
+			.mask = ST_PRESS_1_PW_MASK,
+			.value_on = ST_SENSORS_DEFAULT_POWER_ON_VALUE,
+			.value_off = ST_SENSORS_DEFAULT_POWER_OFF_VALUE,
+		},
+		.fs = { /* full-scale register; a single entry is defined */
+			.addr = ST_PRESS_1_FS_ADDR,
+			.mask = ST_PRESS_1_FS_MASK,
+			.fs_avl = {
+				[0] = {
+					.num = ST_PRESS_FS_AVL_1260MB,
+					.value = ST_PRESS_1_FS_AVL_1260_VAL,
+					.gain = ST_PRESS_1_FS_AVL_1260_GAIN, /* reported as the pressure scale by st_press_read_raw() */
+					.gain2 = ST_PRESS_1_FS_AVL_TEMP_GAIN, /* reported as the temperature scale by st_press_read_raw() */
+				},
+			},
+		},
+		.bdu = {
+			.addr = ST_PRESS_1_BDU_ADDR,
+			.mask = ST_PRESS_1_BDU_MASK,
+		},
+		.drdy_irq = { /* data-ready interrupt enable bit */
+			.addr = ST_PRESS_1_DRDY_IRQ_ADDR,
+			.mask = ST_PRESS_1_DRDY_IRQ_MASK,
+		},
+		.multi_read_bit = ST_PRESS_1_MULTIREAD_BIT,
+		.bootime = 2, /* NOTE(review): unit not visible here - confirm in st_sensors core */
+	},
+};
+
+/*
+ * read_raw callback.
+ *
+ * IIO_CHAN_INFO_RAW:    one sample via st_sensors_read_info_raw(),
+ *                       returned as IIO_VAL_INT.
+ * IIO_CHAN_INFO_SCALE:  gain (pressure) or gain2 (temperature) of the
+ *                       current full scale, as IIO_VAL_INT_PLUS_NANO with
+ *                       an integer part of 0.
+ * IIO_CHAN_INFO_OFFSET: temperature only, 425/10 as IIO_VAL_FRACTIONAL.
+ *
+ * Any other mask or channel type yields -EINVAL; a failing raw read
+ * returns its negative error code.
+ */
+static int st_press_read_raw(struct iio_dev *indio_dev,
+			struct iio_chan_spec const *ch, int *val,
+							int *val2, long mask)
+{
+	struct st_sensor_data *sdata = iio_priv(indio_dev);
+	int ret;
+
+	if (mask == IIO_CHAN_INFO_RAW) {
+		ret = st_sensors_read_info_raw(indio_dev, ch, val);
+		return ret < 0 ? ret : IIO_VAL_INT;
+	}
+
+	if (mask == IIO_CHAN_INFO_SCALE) {
+		*val = 0;
+
+		if (ch->type == IIO_PRESSURE)
+			*val2 = sdata->current_fullscale->gain;
+		else if (ch->type == IIO_TEMP)
+			*val2 = sdata->current_fullscale->gain2;
+		else
+			return -EINVAL;
+
+		return IIO_VAL_INT_PLUS_NANO;
+	}
+
+	if (mask == IIO_CHAN_INFO_OFFSET) {
+		if (ch->type != IIO_TEMP)
+			return -EINVAL;
+
+		*val = 425;
+		*val2 = 10;
+		return IIO_VAL_FRACTIONAL;
+	}
+
+	return -EINVAL;
+}
+
+static ST_SENSOR_DEV_ATTR_SAMP_FREQ(); /* presumably defines iio_dev_attr_sampling_frequency - confirm in st_sensors.h */
+static ST_SENSORS_DEV_ATTR_SAMP_FREQ_AVAIL(); /* presumably defines iio_dev_attr_sampling_frequency_available */
+
+static struct attribute *st_press_attributes[] = { /* NULL-terminated sysfs attribute list */
+	&iio_dev_attr_sampling_frequency_available.dev_attr.attr,
+	&iio_dev_attr_sampling_frequency.dev_attr.attr,
+	NULL,
+};
+
+static const struct attribute_group st_press_attribute_group = {
+	.attrs = st_press_attributes,
+};
+
+static const struct iio_info press_info = { /* installed as indio_dev->info in st_press_common_probe() */
+	.driver_module = THIS_MODULE,
+	.attrs = &st_press_attribute_group,
+	.read_raw = &st_press_read_raw, /* no write_raw callback is provided */
+};
+
+#ifdef CONFIG_IIO_TRIGGER
+static const struct iio_trigger_ops st_press_trigger_ops = { /* passed to st_sensors_allocate_trigger() */
+	.owner = THIS_MODULE,
+	.set_trigger_state = ST_PRESS_TRIGGER_SET_STATE, /* macro from st_pressure.h; NULL without CONFIG_IIO_BUFFER */
+};
+#define ST_PRESS_TRIGGER_OPS (&st_press_trigger_ops)
+#else
+#define ST_PRESS_TRIGGER_OPS NULL /* no trigger ops in this configuration */
+#endif
+
+/*
+ * Bus-independent part of probing.
+ *
+ * Checks that the device is one of st_press_sensors, fills in channel and
+ * default full-scale/ODR information, initialises the sensor and, when the
+ * bus glue reports a data-ready IRQ (get_irq_data_ready() > 0), sets up
+ * the triggered buffer and the trigger before registering the IIO device.
+ *
+ * Returns 0 on success or a negative error code; on failure everything
+ * set up here is torn down again. The iio device itself is not freed here.
+ */
+int st_press_common_probe(struct iio_dev *indio_dev)
+{
+	struct st_sensor_data *sdata = iio_priv(indio_dev);
+	int ret;
+
+	indio_dev->modes = INDIO_DIRECT_MODE;
+	indio_dev->info = &press_info;
+
+	ret = st_sensors_check_device_support(indio_dev,
+				ARRAY_SIZE(st_press_sensors), st_press_sensors);
+	if (ret < 0)
+		return ret;
+
+	sdata->num_data_channels = ST_PRESS_NUMBER_DATA_CHANNELS;
+	sdata->multiread_bit = sdata->sensor->multi_read_bit;
+	indio_dev->channels = sdata->sensor->ch;
+	indio_dev->num_channels = ARRAY_SIZE(st_press_channels);
+
+	/* Start from the first full-scale entry and the first ODR entry. */
+	sdata->current_fullscale = (struct st_sensor_fullscale_avl *)
+						&sdata->sensor->fs.fs_avl[0];
+	sdata->odr = sdata->sensor->odr.odr_avl[0].hz;
+
+	ret = st_sensors_init_sensor(indio_dev);
+	if (ret < 0)
+		return ret;
+
+	if (sdata->get_irq_data_ready(indio_dev) > 0) {
+		ret = st_press_allocate_ring(indio_dev);
+		if (ret < 0)
+			return ret;
+
+		ret = st_sensors_allocate_trigger(indio_dev,
+							ST_PRESS_TRIGGER_OPS);
+		if (ret < 0)
+			goto err_free_ring;
+	}
+
+	ret = iio_device_register(indio_dev);
+	if (ret)
+		goto err_free_trigger;
+
+	return ret;
+
+err_free_trigger:
+	if (sdata->get_irq_data_ready(indio_dev) > 0)
+		st_sensors_deallocate_trigger(indio_dev);
+err_free_ring:
+	if (sdata->get_irq_data_ready(indio_dev) > 0)
+		st_press_deallocate_ring(indio_dev);
+	return ret;
+}
+EXPORT_SYMBOL(st_press_common_probe);
+
+/*
+ * Bus-independent teardown: unregister the IIO device, release trigger
+ * and buffer if a data-ready IRQ was in use, then free the IIO device.
+ */
+void st_press_common_remove(struct iio_dev *indio_dev)
+{
+	struct st_sensor_data *sdata = iio_priv(indio_dev);
+
+	iio_device_unregister(indio_dev);
+
+	if (sdata->get_irq_data_ready(indio_dev) > 0) {
+		st_sensors_deallocate_trigger(indio_dev);
+		st_press_deallocate_ring(indio_dev);
+	}
+
+	/* sdata lives inside indio_dev and must not be used after this. */
+	iio_device_free(indio_dev);
+}
+EXPORT_SYMBOL(st_press_common_remove);
+
+MODULE_AUTHOR("Denis Ciocca <denis.ciocca@st.com>");
+MODULE_DESCRIPTION("STMicroelectronics pressures driver");
+MODULE_LICENSE("GPL v2");

diff --git a/drivers/iio/pressure/st_pressure_i2c.c b/drivers/iio/pressure/st_pressure_i2c.c
new file mode 100644
index 0000000..7cebcc7
--- /dev/null
+++ b/drivers/iio/pressure/st_pressure_i2c.c

@@ -0,0 +1,77 @@
+/*
+ * STMicroelectronics pressures driver
+ *
+ * Copyright 2013 STMicroelectronics Inc.
+ *
+ * Denis Ciocca <denis.ciocca@st.com>
+ *
+ * Licensed under the GPL-2.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/i2c.h>
+#include <linux/iio/iio.h>
+
+#include <linux/iio/common/st_sensors.h>
+#include <linux/iio/common/st_sensors_i2c.h>
+#include "st_pressure.h"
+
+/*
+ * I2C probe: allocate the IIO device with an st_sensor_data private area,
+ * attach the I2C transport and run the common probe.
+ *
+ * Returns 0 on success, -ENOMEM if the IIO device cannot be allocated, or
+ * the negative error code from st_press_common_probe() (the IIO device is
+ * freed in that case).
+ */
+static int st_press_i2c_probe(struct i2c_client *client,
+						const struct i2c_device_id *id)
+{
+	struct st_sensor_data *sdata;
+	struct iio_dev *indio_dev;
+	int ret;
+
+	indio_dev = iio_device_alloc(sizeof(*sdata));
+	if (!indio_dev)
+		return -ENOMEM;
+
+	sdata = iio_priv(indio_dev);
+	sdata->dev = &client->dev;
+
+	st_sensors_i2c_configure(indio_dev, client, sdata);
+
+	ret = st_press_common_probe(indio_dev);
+	if (ret < 0) {
+		iio_device_free(indio_dev);
+		return ret;
+	}
+
+	return 0;
+}
+
+/* I2C remove: run the common teardown on the client's IIO device. */
+static int st_press_i2c_remove(struct i2c_client *client)
+{
+	struct iio_dev *indio_dev = i2c_get_clientdata(client);
+
+	/* The common teardown also frees the IIO device. */
+	st_press_common_remove(indio_dev);
+	return 0;
+}
+
+static const struct i2c_device_id st_press_id_table[] = { /* I2C device names handled by this driver */
+	{ LPS331AP_PRESS_DEV_NAME },
+	{}, /* empty terminating entry */
+};
+MODULE_DEVICE_TABLE(i2c, st_press_id_table);
+
+static struct i2c_driver st_press_driver = { /* I2C glue around the common pressure core */
+	.driver = {
+		.owner = THIS_MODULE,
+		.name = "st-press-i2c",
+	},
+	.probe = st_press_i2c_probe,
+	.remove = st_press_i2c_remove,
+	.id_table = st_press_id_table,
+};
+module_i2c_driver(st_press_driver);
+
+MODULE_AUTHOR("Denis Ciocca <denis.ciocca@st.com>");
+MODULE_DESCRIPTION("STMicroelectronics pressures i2c driver");
+MODULE_LICENSE("GPL v2");

diff --git a/drivers/iio/pressure/st_pressure_spi.c b/drivers/iio/pressure/st_pressure_spi.c
new file mode 100644
index 0000000..17a1490
--- /dev/null
+++ b/drivers/iio/pressure/st_pressure_spi.c

@@ -0,0 +1,76 @@
+/*
+ * STMicroelectronics pressures driver
+ *
+ * Copyright 2013 STMicroelectronics Inc.
+ *
+ * Denis Ciocca <denis.ciocca@st.com>
+ *
+ * Licensed under the GPL-2.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/spi/spi.h>
+#include <linux/iio/iio.h>
+
+#include <linux/iio/common/st_sensors.h>
+#include <linux/iio/common/st_sensors_spi.h>
+#include "st_pressure.h"
+
+/*
+ * SPI probe: allocate the IIO device with an st_sensor_data private area,
+ * attach the SPI transport and run the common probe.
+ *
+ * Returns 0 on success, -ENOMEM if the IIO device cannot be allocated, or
+ * the negative error code from st_press_common_probe() (the IIO device is
+ * freed in that case).
+ */
+static int st_press_spi_probe(struct spi_device *spi)
+{
+	struct st_sensor_data *sdata;
+	struct iio_dev *indio_dev;
+	int ret;
+
+	indio_dev = iio_device_alloc(sizeof(*sdata));
+	if (!indio_dev)
+		return -ENOMEM;
+
+	sdata = iio_priv(indio_dev);
+	sdata->dev = &spi->dev;
+
+	st_sensors_spi_configure(indio_dev, spi, sdata);
+
+	ret = st_press_common_probe(indio_dev);
+	if (ret < 0) {
+		iio_device_free(indio_dev);
+		return ret;
+	}
+
+	return 0;
+}
+
+/* SPI remove: run the common teardown on the device's IIO device. */
+static int st_press_spi_remove(struct spi_device *spi)
+{
+	struct iio_dev *indio_dev = spi_get_drvdata(spi);
+
+	/* The common teardown also frees the IIO device. */
+	st_press_common_remove(indio_dev);
+	return 0;
+}
+
+static const struct spi_device_id st_press_id_table[] = { /* SPI device names handled by this driver */
+	{ LPS331AP_PRESS_DEV_NAME },
+	{}, /* empty terminating entry */
+};
+MODULE_DEVICE_TABLE(spi, st_press_id_table);
+
+static struct spi_driver st_press_driver = { /* SPI glue around the common pressure core */
+	.driver = {
+		.owner = THIS_MODULE,
+		.name = "st-press-spi",
+	},
+	.probe = st_press_spi_probe,
+	.remove = st_press_spi_remove,
+	.id_table = st_press_id_table,
+};
+module_spi_driver(st_press_driver);
+
+MODULE_AUTHOR("Denis Ciocca <denis.ciocca@st.com>");
+MODULE_DESCRIPTION("STMicroelectronics pressures spi driver");
+MODULE_LICENSE("GPL v2");

diff --git a/drivers/iio/trigger/Kconfig b/drivers/iio/trigger/Kconfig
new file mode 100644
index 0000000..360fd50
--- /dev/null
+++ b/drivers/iio/trigger/Kconfig

@@ -0,0 +1,26 @@
+#
+# Industrial I/O standalone triggers
+#
+menu "Triggers - standalone"
+
+config IIO_INTERRUPT_TRIGGER
+	tristate "Generic interrupt trigger"
+	help
+	  Provides support for using an interrupt of any type as an IIO
+	  trigger.  This may be provided by a gpio driver for example.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called iio-trig-interrupt.
+
+config IIO_SYSFS_TRIGGER
+	tristate "SYSFS trigger"
+	depends on SYSFS
+	select IRQ_WORK
+	help
+	  Provides support for using SYSFS entry as IIO triggers.
+	  If unsure, say N (but it's safe to say "Y").
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called iio-trig-sysfs.
+
+endmenu

diff --git a/drivers/iio/trigger/Makefile b/drivers/iio/trigger/Makefile
new file mode 100644
index 0000000..ce319a5
--- /dev/null
+++ b/drivers/iio/trigger/Makefile

@@ -0,0 +1,6 @@
+#
+# Makefile for triggers not associated with iio-devices
+#
+
+obj-$(CONFIG_IIO_INTERRUPT_TRIGGER) += iio-trig-interrupt.o
+obj-$(CONFIG_IIO_SYSFS_TRIGGER) += iio-trig-sysfs.o

diff --git a/drivers/iio/trigger/iio-trig-interrupt.c b/drivers/iio/trigger/iio-trig-interrupt.c
new file mode 100644
index 0000000..02577ec
--- /dev/null
+++ b/drivers/iio/trigger/iio-trig-interrupt.c

@@ -0,0 +1,121 @@
+/*
+ * Industrial I/O - generic interrupt based trigger support
+ *
+ * Copyright (c) 2008-2013 Jonathan Cameron
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+
+#include <linux/iio/iio.h>
+#include <linux/iio/trigger.h>
+
+
+struct iio_interrupt_trigger_info { /* per-trigger state, stored as the trigger's driver data */
+	unsigned int irq; /* IRQ requested in probe and freed in remove */
+};
+
+/*
+ * Interrupt handler: @private is the iio_trigger passed to request_irq();
+ * fire it and report the interrupt as handled.
+ */
+static irqreturn_t iio_interrupt_trigger_poll(int irq, void *private)
+{
+	struct iio_trigger *trig = private;
+
+	/* No timestamp is supplied for now, hence the 0. */
+	iio_trigger_poll(trig, 0);
+
+	return IRQ_HANDLED;
+}
+
+static const struct iio_trigger_ops iio_interrupt_trigger_ops = { /* only the owner is set; no callbacks */
+	.owner = THIS_MODULE,
+};
+
+/*
+ * Bind an IIO trigger to the first IRQ resource of a platform device.
+ *
+ * Allocates a trigger named "irqtrig<irq>", requests the interrupt as
+ * shared (keeping the trigger-type flags of the resource) with the trigger
+ * as handler cookie, and registers the trigger. The trigger is stored as
+ * platform driver data for iio_interrupt_trigger_remove().
+ *
+ * Returns 0 on success, -ENODEV when the device has no IRQ resource,
+ * -ENOMEM on allocation failure, or the error returned by request_irq()
+ * or iio_trigger_register(). Everything acquired is released on failure.
+ */
+static int iio_interrupt_trigger_probe(struct platform_device *pdev)
+{
+	struct iio_interrupt_trigger_info *trig_info;
+	struct iio_trigger *trig;
+	unsigned long irqflags;
+	struct resource *irq_res;
+	int irq, ret = 0;
+
+	irq_res = platform_get_resource(pdev, IORESOURCE_IRQ, 0);
+
+	if (irq_res == NULL)
+		return -ENODEV;
+
+	irqflags = (irq_res->flags & IRQF_TRIGGER_MASK) | IRQF_SHARED;
+
+	irq = irq_res->start;
+
+	trig = iio_trigger_alloc("irqtrig%d", irq);
+	if (!trig) {
+		ret = -ENOMEM;
+		goto error_ret;
+	}
+
+	trig_info = kzalloc(sizeof(*trig_info), GFP_KERNEL);
+	if (!trig_info) {
+		ret = -ENOMEM;
+		goto error_put_trigger;
+	}
+	iio_trigger_set_drvdata(trig, trig_info);
+	trig_info->irq = irq;
+	trig->ops = &iio_interrupt_trigger_ops;
+	ret = request_irq(irq, iio_interrupt_trigger_poll,
+			  irqflags, trig->name, trig);
+	if (ret) {
+		/* Log messages need a trailing newline to end the record. */
+		dev_err(&pdev->dev,
+			"request IRQ-%d failed\n", irq);
+		goto error_free_trig_info;
+	}
+
+	ret = iio_trigger_register(trig);
+	if (ret)
+		goto error_release_irq;
+	platform_set_drvdata(pdev, trig);
+
+	return 0;
+
+/* First clean up the partly allocated trigger */
+error_release_irq:
+	free_irq(irq, trig);
+error_free_trig_info:
+	kfree(trig_info);
+error_put_trigger:
+	iio_trigger_put(trig);
+error_ret:
+	return ret;
+}
+
+/*
+ * Undo iio_interrupt_trigger_probe(): unregister the trigger, release its
+ * IRQ, free the private info and drop the trigger reference.
+ */
+static int iio_interrupt_trigger_remove(struct platform_device *pdev)
+{
+	struct iio_trigger *trig = platform_get_drvdata(pdev);
+	struct iio_interrupt_trigger_info *info = iio_trigger_get_drvdata(trig);
+
+	iio_trigger_unregister(trig);
+	free_irq(info->irq, trig);
+	kfree(info);
+	iio_trigger_put(trig);
+
+	return 0;
+}
+
+static struct platform_driver iio_interrupt_trigger_driver = { /* binds to platform devices named "iio_interrupt_trigger" */
+	.probe = iio_interrupt_trigger_probe,
+	.remove = iio_interrupt_trigger_remove,
+	.driver = {
+		.name = "iio_interrupt_trigger",
+		.owner = THIS_MODULE,
+	},
+};
+
+module_platform_driver(iio_interrupt_trigger_driver);
+
+MODULE_AUTHOR("Jonathan Cameron <jic23@kernel.org>");
+MODULE_DESCRIPTION("Interrupt trigger for the iio subsystem");
+MODULE_LICENSE("GPL v2");

diff --git a/drivers/iio/trigger/iio-trig-sysfs.c b/drivers/iio/trigger/iio-trig-sysfs.c
new file mode 100644
index 0000000..effcd0a
--- /dev/null
+++ b/drivers/iio/trigger/iio-trig-sysfs.c

@@ -0,0 +1,227 @@
+/*
+ * Copyright 2011 Analog Devices Inc.
+ *
+ * Licensed under the GPL-2.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/irq_work.h>
+
+#include <linux/iio/iio.h>
+#include <linux/iio/trigger.h>
+
+struct iio_sysfs_trig { /* one sysfs-created trigger */
+	struct iio_trigger *trig; /* the IIO trigger, named "sysfstrig<id>" */
+	struct irq_work work; /* queued by trigger_now; its handler polls the trigger */
+	int id; /* user-chosen id, unique within iio_sysfs_trig_list */
+	struct list_head l; /* link in iio_sysfs_trig_list */
+};
+
+static LIST_HEAD(iio_sysfs_trig_list);
+static DEFINE_MUTEX(iio_syfs_trig_list_mut);
+
+static int iio_sysfs_trigger_probe(int id);
+/*
+ * Store handler of the "add_trigger" attribute: parse a decimal id from
+ * @buf and create the sysfs trigger with that id.
+ *
+ * Returns @len on success, the kstrtoul() error for malformed input,
+ * -EINVAL when the id does not fit in an int, or the error returned by
+ * iio_sysfs_trigger_probe().
+ */
+static ssize_t iio_sysfs_trig_add(struct device *dev,
+				  struct device_attribute *attr,
+				  const char *buf,
+				  size_t len)
+{
+	int ret;
+	unsigned long input;
+
+	ret = kstrtoul(buf, 10, &input);
+	if (ret)
+		return ret;
+	/* Ids are kept as int: refuse values that would be silently truncated. */
+	if (input > INT_MAX)
+		return -EINVAL;
+	ret = iio_sysfs_trigger_probe(input);
+	if (ret)
+		return ret;
+	return len;
+}
+static DEVICE_ATTR(add_trigger, S_IWUSR, NULL, &iio_sysfs_trig_add);
+
+static int iio_sysfs_trigger_remove(int id);
+/*
+ * Store handler of the "remove_trigger" attribute: parse a decimal id
+ * from @buf and destroy the sysfs trigger with that id.
+ *
+ * Returns @len on success, the kstrtoul() error for malformed input,
+ * -EINVAL when the id does not fit in an int, or the error returned by
+ * iio_sysfs_trigger_remove().
+ */
+static ssize_t iio_sysfs_trig_remove(struct device *dev,
+				     struct device_attribute *attr,
+				     const char *buf,
+				     size_t len)
+{
+	int ret;
+	unsigned long input;
+
+	ret = kstrtoul(buf, 10, &input);
+	if (ret)
+		return ret;
+	/* Ids are kept as int: a truncated value could match another trigger. */
+	if (input > INT_MAX)
+		return -EINVAL;
+	ret = iio_sysfs_trigger_remove(input);
+	if (ret)
+		return ret;
+	return len;
+}
+
+static DEVICE_ATTR(remove_trigger, S_IWUSR, NULL, &iio_sysfs_trig_remove);
+
+static struct attribute *iio_sysfs_trig_attrs[] = { /* attributes of the management device iio_sysfs_trig_dev */
+	&dev_attr_add_trigger.attr,
+	&dev_attr_remove_trigger.attr,
+	NULL,
+};
+
+static const struct attribute_group iio_sysfs_trig_group = {
+	.attrs = iio_sysfs_trig_attrs,
+};
+
+static const struct attribute_group *iio_sysfs_trig_groups[] = { /* NULL-terminated group list */
+	&iio_sysfs_trig_group,
+	NULL
+};
+
+
+/* Nothing to actually do upon release */
+static void iio_trigger_sysfs_release(struct device *dev)
+{
+} /* empty: iio_sysfs_trig_dev is a static object, there is no memory to free */
+
+static struct device iio_sysfs_trig_dev = { /* management device carrying add_trigger/remove_trigger; parent of every sysfs trigger */
+	.bus = &iio_bus_type,
+	.groups = iio_sysfs_trig_groups,
+	.release = &iio_trigger_sysfs_release,
+};
+
+/* irq_work handler: fire the trigger that embeds @work. */
+static void iio_sysfs_trigger_work(struct irq_work *work)
+{
+	struct iio_sysfs_trig *st;
+
+	st = container_of(work, struct iio_sysfs_trig, work);
+	iio_trigger_poll(st->trig, 0);
+}
+
+/*
+ * Store handler of "trigger_now": queue the trigger's irq_work, whose
+ * handler polls the trigger. The written data is ignored; always returns
+ * @count.
+ */
+static ssize_t iio_sysfs_trigger_poll(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	struct iio_sysfs_trig *st;
+
+	st = iio_trigger_get_drvdata(to_iio_trigger(dev));
+	irq_work_queue(&st->work);
+
+	return count;
+}
+
+static DEVICE_ATTR(trigger_now, S_IWUSR, NULL, iio_sysfs_trigger_poll);
+
+static struct attribute *iio_sysfs_trigger_attrs[] = { /* attributes of each created trigger */
+	&dev_attr_trigger_now.attr,
+	NULL,
+};
+
+static const struct attribute_group iio_sysfs_trigger_attr_group = {
+	.attrs = iio_sysfs_trigger_attrs,
+};
+
+static const struct attribute_group *iio_sysfs_trigger_attr_groups[] = { /* assigned to trig->dev.groups in iio_sysfs_trigger_probe() */
+	&iio_sysfs_trigger_attr_group,
+	NULL
+};
+
+static const struct iio_trigger_ops iio_sysfs_trigger_ops = { /* only the owner is set; no callbacks */
+	.owner = THIS_MODULE,
+};
+
+/*
+ * Create and register the sysfs trigger "sysfstrig<id>".
+ *
+ * Runs under iio_syfs_trig_list_mut. On success the new entry is put on
+ * iio_sysfs_trig_list and a module reference is taken.
+ *
+ * Returns 0 on success, -EINVAL if @id is already in use, -ENOMEM on
+ * allocation failure, or the error from iio_trigger_register().
+ */
+static int iio_sysfs_trigger_probe(int id)
+{
+	struct iio_sysfs_trig *entry;
+	int err;
+
+	mutex_lock(&iio_syfs_trig_list_mut);
+
+	/* An id may be used by at most one trigger. */
+	list_for_each_entry(entry, &iio_sysfs_trig_list, l) {
+		if (entry->id == id) {
+			err = -EINVAL;
+			goto unlock;
+		}
+	}
+
+	entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+	if (!entry) {
+		err = -ENOMEM;
+		goto unlock;
+	}
+
+	entry->id = id;
+	entry->trig = iio_trigger_alloc("sysfstrig%d", id);
+	if (!entry->trig) {
+		err = -ENOMEM;
+		goto free_entry;
+	}
+
+	entry->trig->dev.groups = iio_sysfs_trigger_attr_groups;
+	entry->trig->ops = &iio_sysfs_trigger_ops;
+	entry->trig->dev.parent = &iio_sysfs_trig_dev;
+	iio_trigger_set_drvdata(entry->trig, entry);
+
+	init_irq_work(&entry->work, iio_sysfs_trigger_work);
+
+	err = iio_trigger_register(entry->trig);
+	if (err)
+		goto put_trigger;
+
+	list_add(&entry->l, &iio_sysfs_trig_list);
+	__module_get(THIS_MODULE);
+	mutex_unlock(&iio_syfs_trig_list_mut);
+	return 0;
+
+put_trigger:
+	iio_trigger_put(entry->trig);
+free_entry:
+	kfree(entry);
+unlock:
+	mutex_unlock(&iio_syfs_trig_list_mut);
+	return err;
+}
+
+/*
+ * Destroy the sysfs trigger with the given id.
+ *
+ * Runs under iio_syfs_trig_list_mut. Drops the module reference taken by
+ * iio_sysfs_trigger_probe().
+ *
+ * Returns 0 on success or -EINVAL if no trigger with @id exists.
+ */
+static int iio_sysfs_trigger_remove(int id)
+{
+	bool foundit = false;
+	struct iio_sysfs_trig *t;
+	mutex_lock(&iio_syfs_trig_list_mut);
+	list_for_each_entry(t, &iio_sysfs_trig_list, l)
+		if (id == t->id) {
+			foundit = true;
+			break;
+		}
+	if (!foundit) {
+		mutex_unlock(&iio_syfs_trig_list_mut);
+		return -EINVAL;
+	}
+
+	iio_trigger_unregister(t->trig);
+	/*
+	 * A write to trigger_now may have queued t->work just before the
+	 * trigger was unregistered; wait for it so the handler never runs
+	 * on freed memory.
+	 */
+	irq_work_sync(&t->work);
+	iio_trigger_free(t->trig);
+
+	list_del(&t->l);
+	kfree(t);
+	module_put(THIS_MODULE);
+	mutex_unlock(&iio_syfs_trig_list_mut);
+	return 0;
+}
+
+
+/*
+ * Module init: register the "iio_sysfs_trigger" management device that
+ * carries the add_trigger/remove_trigger attributes.
+ *
+ * Returns 0 on success or the error from device_add().
+ */
+static int __init iio_sysfs_trig_init(void)
+{
+	int ret;
+
+	device_initialize(&iio_sysfs_trig_dev);
+	dev_set_name(&iio_sysfs_trig_dev, "iio_sysfs_trigger");
+
+	ret = device_add(&iio_sysfs_trig_dev);
+	if (ret)
+		/* Drop the initial reference so the device name is freed. */
+		put_device(&iio_sysfs_trig_dev);
+
+	return ret;
+}
+module_init(iio_sysfs_trig_init);
+
+static void __exit iio_sysfs_trig_exit(void) /* module exit: counterpart of iio_sysfs_trig_init() */
+{
+	device_unregister(&iio_sysfs_trig_dev); /* removes the management device and its attributes */
+}
+module_exit(iio_sysfs_trig_exit);
+
+MODULE_AUTHOR("Michael Hennerich <hennerich@blackfin.uclinux.org>");
+MODULE_DESCRIPTION("Sysfs based trigger for the iio subsystem");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS("platform:iio-trig-sysfs");

diff --git a/drivers/input/joystick/xpad.c b/drivers/input/joystick/xpad.c
index d6cbfe9..fa061d4 100644
--- a/drivers/input/joystick/xpad.c
+++ b/drivers/input/joystick/xpad.c

@@ -137,7 +137,7 @@
 	{ 0x0738, 0x4540, "Mad Catz Beat Pad", MAP_DPAD_TO_BUTTONS, XTYPE_XBOX },
 	{ 0x0738, 0x4556, "Mad Catz Lynx Wireless Controller", 0, XTYPE_XBOX },
 	{ 0x0738, 0x4716, "Mad Catz Wired Xbox 360 Controller", 0, XTYPE_XBOX360 },
-	{ 0x0738, 0x4728, "Mad Catz Street Fighter IV FightPad", XTYPE_XBOX360 },
+	{ 0x0738, 0x4728, "Mad Catz Street Fighter IV FightPad", MAP_TRIGGERS_TO_BUTTONS, XTYPE_XBOX360 },
 	{ 0x0738, 0x4738, "Mad Catz Wired Xbox 360 Controller (SFIV)", MAP_TRIGGERS_TO_BUTTONS, XTYPE_XBOX360 },
 	{ 0x0738, 0x6040, "Mad Catz Beat Pad Pro", MAP_DPAD_TO_BUTTONS, XTYPE_XBOX },
 	{ 0x0738, 0xbeef, "Mad Catz JOYTECH NEO SE Advanced GamePad", XTYPE_XBOX360 },

diff --git a/drivers/input/keyboard/Kconfig b/drivers/input/keyboard/Kconfig
index 62a2c0e..7ac9c98 100644
--- a/drivers/input/keyboard/Kconfig
+++ b/drivers/input/keyboard/Kconfig

@@ -431,6 +431,7 @@
 
 config KEYBOARD_OPENCORES
 	tristate "OpenCores Keyboard Controller"
+	depends on HAS_IOMEM
 	help
 	  Say Y here if you want to use the OpenCores Keyboard Controller
 	  http://www.opencores.org/project,keyboardcontroller

diff --git a/drivers/input/serio/Kconfig b/drivers/input/serio/Kconfig
index aebfe3e..1bda828 100644
--- a/drivers/input/serio/Kconfig
+++ b/drivers/input/serio/Kconfig

@@ -205,6 +205,7 @@
 
 config SERIO_ALTERA_PS2
 	tristate "Altera UP PS/2 controller"
+	depends on HAS_IOMEM
 	help
 	  Say Y here if you have Altera University Program PS/2 ports.
 

diff --git a/drivers/input/tablet/wacom_wac.c b/drivers/input/tablet/wacom_wac.c
index 518282d..384fbcd 100644
--- a/drivers/input/tablet/wacom_wac.c
+++ b/drivers/input/tablet/wacom_wac.c

@@ -363,6 +363,7 @@
 		case 0x140802: /* Intuos4/5 13HD/24HD Classic Pen */
 		case 0x160802: /* Cintiq 13HD Pro Pen */
 		case 0x180802: /* DTH2242 Pen */
+		case 0x100802: /* Intuos4/5 13HD/24HD General Pen */
 			wacom->tool[idx] = BTN_TOOL_PEN;
 			break;
 
@@ -401,6 +402,7 @@
 		case 0x10080c: /* Intuos4/5 13HD/24HD Art Pen Eraser */
 		case 0x16080a: /* Cintiq 13HD Pro Pen Eraser */
 		case 0x18080a: /* DTH2242 Eraser */
+		case 0x10080a: /* Intuos4/5 13HD/24HD General Pen Eraser */
 			wacom->tool[idx] = BTN_TOOL_RUBBER;
 			break;
 

diff --git a/drivers/input/touchscreen/cyttsp_core.c b/drivers/input/touchscreen/cyttsp_core.c
index 8e60437..ae89d26 100644
--- a/drivers/input/touchscreen/cyttsp_core.c
+++ b/drivers/input/touchscreen/cyttsp_core.c

@@ -116,6 +116,15 @@
 	return ttsp_write_block_data(ts, CY_REG_BASE, sizeof(cmd), &cmd);
 }
 
+/*
+ * Flow-control handshake helper.
+ *
+ * When the platform data sets use_hndshk, send the hst_mode value held
+ * in ts->xy_data back to the device with CY_HNDSHK_BIT toggled.
+ * Returns 0 when handshaking is not enabled, otherwise whatever
+ * ttsp_send_command() returns.
+ */
+static int cyttsp_handshake(struct cyttsp *ts)
+{
+	if (!ts->pdata->use_hndshk)
+		return 0;
+
+	return ttsp_send_command(ts, ts->xy_data.hst_mode ^ CY_HNDSHK_BIT);
+}
+
 static int cyttsp_load_bl_regs(struct cyttsp *ts)
 {
 	memset(&ts->bl_data, 0, sizeof(ts->bl_data));
@@ -133,7 +142,7 @@
 	memcpy(bl_cmd, bl_command, sizeof(bl_command));
 	if (ts->pdata->bl_keys)
 		memcpy(&bl_cmd[sizeof(bl_command) - CY_NUM_BL_KEYS],
-			ts->pdata->bl_keys, sizeof(bl_command));
+			ts->pdata->bl_keys, CY_NUM_BL_KEYS);
 
 	error = ttsp_write_block_data(ts, CY_REG_BASE,
 				      sizeof(bl_cmd), bl_cmd);
@@ -167,6 +176,10 @@
 	if (error)
 		return error;
 
+	error = cyttsp_handshake(ts);
+	if (error)
+		return error;
+
 	return ts->xy_data.act_dist == CY_ACT_DIST_DFLT ? -EIO : 0;
 }
 
@@ -188,6 +201,10 @@
 	if (error)
 		return error;
 
+	error = cyttsp_handshake(ts);
+	if (error)
+		return error;
+
 	if (!ts->sysinfo_data.tts_verh && !ts->sysinfo_data.tts_verl)
 		return -EIO;
 
@@ -344,12 +361,9 @@
 		goto out;
 
 	/* provide flow control handshake */
-	if (ts->pdata->use_hndshk) {
-		error = ttsp_send_command(ts,
-				ts->xy_data.hst_mode ^ CY_HNDSHK_BIT);
-		if (error)
-			goto out;
-	}
+	error = cyttsp_handshake(ts);
+	if (error)
+		goto out;
 
 	if (unlikely(ts->state == CY_IDLE_STATE))
 		goto out;

diff --git a/drivers/input/touchscreen/cyttsp_core.h b/drivers/input/touchscreen/cyttsp_core.h
index 1aa3c69..f1ebde3 100644
--- a/drivers/input/touchscreen/cyttsp_core.h
+++ b/drivers/input/touchscreen/cyttsp_core.h

@@ -67,8 +67,8 @@
 /* TTSP System Information interface definition */
 struct cyttsp_sysinfo_data {
 	u8 hst_mode;
-	u8 mfg_cmd;
 	u8 mfg_stat;
+	u8 mfg_cmd;
 	u8 cid[3];
 	u8 tt_undef1;
 	u8 uid[8];

diff --git a/drivers/memory/Kconfig b/drivers/memory/Kconfig
index 067f311..29a11db 100644
--- a/drivers/memory/Kconfig
+++ b/drivers/memory/Kconfig

@@ -20,6 +20,16 @@
 	  parameters and other settings during frequency, voltage and
 	  temperature changes
 
+config MVEBU_DEVBUS
+	bool "Marvell EBU Device Bus Controller"
+	default y
+	depends on PLAT_ORION && OF
+	help
+	  This driver is for the Device Bus controller available in some
+	  Marvell EBU SoCs such as Discovery (mv78xx0), Orion (88f5xxx),
+	  Armada 370 and Armada XP. This controller provides access to
+	  external devices such as NOR and NAND flash, SRAM, and FPGAs.
+
 config TEGRA20_MC
 	bool "Tegra20 Memory Controller(MC) driver"
 	default y

diff --git a/drivers/memory/Makefile b/drivers/memory/Makefile
index 9cce5d7..969d923 100644
--- a/drivers/memory/Makefile
+++ b/drivers/memory/Makefile

@@ -6,5 +6,6 @@
 obj-$(CONFIG_OF)		+= of_memory.o
 endif
 obj-$(CONFIG_TI_EMIF)		+= emif.o
+obj-$(CONFIG_MVEBU_DEVBUS)	+= mvebu-devbus.o
 obj-$(CONFIG_TEGRA20_MC)	+= tegra20-mc.o
 obj-$(CONFIG_TEGRA30_MC)	+= tegra30-mc.o

diff --git a/drivers/memory/mvebu-devbus.c b/drivers/memory/mvebu-devbus.c
new file mode 100644
index 0000000..978e8e3
--- /dev/null
+++ b/drivers/memory/mvebu-devbus.c

@@ -0,0 +1,340 @@
+/*
+ * Marvell EBU SoC Device Bus Controller
+ * (memory controller for NOR/NAND/SRAM/FPGA devices)
+ *
+ * Copyright (C) 2013 Marvell
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/err.h>
+#include <linux/io.h>
+#include <linux/clk.h>
+#include <linux/mbus.h>
+#include <linux/of_platform.h>
+#include <linux/of_address.h>
+#include <linux/platform_device.h>
+
+/*
+ * Register definitions.
+ *
+ * The *_BIT values are used as left-shift amounts when
+ * devbus_set_timing_params() assembles the two parameter words; the
+ * *_OFFSET values are byte offsets added to the mapped register base.
+ */
+
+/* Shift amounts for the word written at READ_PARAM_OFFSET */
+#define DEV_WIDTH_BIT		30
+#define BADR_SKEW_BIT		28
+#define RD_HOLD_BIT		23
+#define ACC_NEXT_BIT		17
+#define RD_SETUP_BIT		12
+#define ACC_FIRST_BIT		6
+
+/* Shift amounts for the word written at WRITE_PARAM_OFFSET */
+#define SYNC_ENABLE_BIT		24
+#define WR_HIGH_BIT		16
+#define WR_LOW_BIT		8
+
+/* Offsets of the two parameter registers from the register base */
+#define READ_PARAM_OFFSET	0x0
+#define WRITE_PARAM_OFFSET	0x4
+
+/*
+ * Window names handed to mvebu_mbus_add_window(). The array is indexed
+ * by the 'cs' value that mvebu_devbus_probe() derives from the register
+ * base address: index 0 is the boot window, indices 1..4 are cs0..cs3.
+ */
+static const char * const devbus_wins[] = {
+	"devbus-boot",
+	"devbus-cs0",
+	"devbus-cs1",
+	"devbus-cs2",
+	"devbus-cs3",
+};
+
+/*
+ * Values packed into the read parameters register by
+ * devbus_set_timing_params(). bus_width is read from the
+ * 'devbus,bus-width' property and divided by 8; every other field is a
+ * tick count produced by get_timing_param_ps().
+ */
+struct devbus_read_params {
+	u32 bus_width;
+	u32 badr_skew;
+	u32 turn_off;
+	u32 acc_first;
+	u32 acc_next;
+	u32 rd_setup;
+	u32 rd_hold;
+};
+
+/*
+ * Values packed into the write parameters register by
+ * devbus_set_timing_params(). sync_enable is read verbatim from the
+ * 'devbus,sync-enable' property; the other fields are tick counts
+ * produced by get_timing_param_ps().
+ */
+struct devbus_write_params {
+	u32 sync_enable;
+	u32 wr_high;
+	u32 wr_low;
+	u32 ale_wr;
+};
+
+/*
+ * Per-device driver state.
+ *
+ * dev:     the platform device's struct device, used for logging
+ * base:    register base returned by devm_ioremap_resource()
+ * tick_ps: clock period in picoseconds, computed in probe from the
+ *          clock rate
+ */
+struct devbus {
+	struct device *dev;
+	void __iomem *base;
+	unsigned long tick_ps;
+};
+
+/*
+ * Read a timing property given in picoseconds and convert it to clock
+ * ticks, rounding up.
+ *
+ * @devbus: driver state; tick_ps supplies the clock period
+ * @node:   device tree node that holds the property
+ * @name:   name of the property to read
+ * @ticks:  output, the duration expressed in whole ticks
+ *
+ * Returns 0 on success, or the negative error code from
+ * of_property_read_u32() when the property cannot be read.
+ */
+static int get_timing_param_ps(struct devbus *devbus,
+			       struct device_node *node,
+			       const char *name,
+			       u32 *ticks)
+{
+	u32 time_ps;
+	int err;
+
+	err = of_property_read_u32(node, name, &time_ps);
+	if (err < 0) {
+		/* Node name first, property name second, as the format expects */
+		dev_err(devbus->dev, "%s has no '%s' property\n",
+			node->full_name, name);
+		return err;
+	}
+
+	/* Round up so the programmed time is never shorter than requested */
+	*ticks = (time_ps + devbus->tick_ps - 1) / devbus->tick_ps;
+
+	dev_dbg(devbus->dev, "%s: %u ps -> 0x%x\n",
+		name, time_ps, *ticks);
+	return 0;
+}
+
+/*
+ * Read the bus width and all read/write timing properties from @node,
+ * pack them into two 32-bit words and write those words to the read
+ * and write parameter registers.
+ *
+ * All properties are mandatory: the first one that cannot be read is
+ * reported with dev_err() and its error code is returned before any
+ * register is written. Returns 0 on success.
+ */
+static int devbus_set_timing_params(struct devbus *devbus,
+				    struct device_node *node)
+{
+	struct devbus_read_params r;
+	struct devbus_write_params w;
+	u32 value;
+	int err;
+
+	dev_dbg(devbus->dev, "Setting timing parameter, tick is %lu ps\n",
+		devbus->tick_ps);
+
+	/* Get read timings */
+	err = of_property_read_u32(node, "devbus,bus-width", &r.bus_width);
+	if (err < 0) {
+		dev_err(devbus->dev,
+			"%s has no 'devbus,bus-width' property\n",
+			node->full_name);
+		return err;
+	}
+	/*
+	 * Convert bit width to byte width.
+	 * NOTE(review): the byte count is later shifted to DEV_WIDTH_BIT
+	 * unchanged - confirm this matches the encoding the register
+	 * expects for the device width field.
+	 */
+	r.bus_width /= 8;
+
+	err = get_timing_param_ps(devbus, node, "devbus,badr-skew-ps",
+				 &r.badr_skew);
+	if (err < 0)
+		return err;
+
+	err = get_timing_param_ps(devbus, node, "devbus,turn-off-ps",
+				 &r.turn_off);
+	if (err < 0)
+		return err;
+
+	err = get_timing_param_ps(devbus, node, "devbus,acc-first-ps",
+				 &r.acc_first);
+	if (err < 0)
+		return err;
+
+	err = get_timing_param_ps(devbus, node, "devbus,acc-next-ps",
+				 &r.acc_next);
+	if (err < 0)
+		return err;
+
+	err = get_timing_param_ps(devbus, node, "devbus,rd-setup-ps",
+				 &r.rd_setup);
+	if (err < 0)
+		return err;
+
+	err = get_timing_param_ps(devbus, node, "devbus,rd-hold-ps",
+				 &r.rd_hold);
+	if (err < 0)
+		return err;
+
+	/* Get write timings */
+	err = of_property_read_u32(node, "devbus,sync-enable",
+				  &w.sync_enable);
+	if (err < 0) {
+		dev_err(devbus->dev,
+			"%s has no 'devbus,sync-enable' property\n",
+			node->full_name);
+		return err;
+	}
+
+	err = get_timing_param_ps(devbus, node, "devbus,ale-wr-ps",
+				 &w.ale_wr);
+	if (err < 0)
+		return err;
+
+	err = get_timing_param_ps(devbus, node, "devbus,wr-low-ps",
+				 &w.wr_low);
+	if (err < 0)
+		return err;
+
+	err = get_timing_param_ps(devbus, node, "devbus,wr-high-ps",
+				 &w.wr_high);
+	if (err < 0)
+		return err;
+
+	/*
+	 * Set read timings. turn_off is placed unshifted in the low bits.
+	 * NOTE(review): no field is masked before shifting, so a tick
+	 * count larger than its field would spill into the neighbouring
+	 * bits - TODO confirm whether range checks are needed.
+	 */
+	value = r.bus_width << DEV_WIDTH_BIT |
+		r.badr_skew << BADR_SKEW_BIT |
+		r.rd_hold   << RD_HOLD_BIT   |
+		r.acc_next  << ACC_NEXT_BIT  |
+		r.rd_setup  << RD_SETUP_BIT  |
+		r.acc_first << ACC_FIRST_BIT |
+		r.turn_off;
+
+	dev_dbg(devbus->dev, "read parameters register 0x%p = 0x%x\n",
+		devbus->base + READ_PARAM_OFFSET,
+		value);
+
+	writel(value, devbus->base + READ_PARAM_OFFSET);
+
+	/* Set write timings; ale_wr is placed unshifted in the low bits */
+	value = w.sync_enable  << SYNC_ENABLE_BIT |
+		w.wr_low       << WR_LOW_BIT      |
+		w.wr_high      << WR_HIGH_BIT     |
+		w.ale_wr;
+
+	dev_dbg(devbus->dev, "write parameters register: 0x%p = 0x%x\n",
+		devbus->base + WRITE_PARAM_OFFSET,
+		value);
+
+	writel(value, devbus->base + WRITE_PARAM_OFFSET);
+
+	return 0;
+}
+
+/*
+ * Probe one Device Bus node.
+ *
+ * Maps the parameter registers, enables the clock and derives the tick
+ * period from its rate, programs the timing parameters from the device
+ * tree, creates an mbus address window from the node's 'ranges'
+ * property and finally populates the child devices.
+ *
+ * Returns 0 on success or a negative error code. The clock enabled
+ * here is disabled again on every failure path, and the mbus window is
+ * removed if populating the children fails.
+ */
+static int mvebu_devbus_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct device_node *node = pdev->dev.of_node;
+	struct device_node *parent;
+	struct devbus *devbus;
+	struct resource *res;
+	struct clk *clk;
+	unsigned long rate;
+	const __be32 *ranges;
+	unsigned int cs;
+	int err;
+	int addr_cells, p_addr_cells, size_cells;
+	int ranges_len, tuple_len;
+	u64 addr;
+	u32 base, size;
+
+	devbus = devm_kzalloc(&pdev->dev, sizeof(struct devbus), GFP_KERNEL);
+	if (!devbus)
+		return -ENOMEM;
+
+	devbus->dev = dev;
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	devbus->base = devm_ioremap_resource(&pdev->dev, res);
+	if (IS_ERR(devbus->base))
+		return PTR_ERR(devbus->base);
+
+	clk = devm_clk_get(&pdev->dev, NULL);
+	if (IS_ERR(clk))
+		return PTR_ERR(clk);
+
+	err = clk_prepare_enable(clk);
+	if (err < 0)
+		return err;
+
+	/*
+	 * Obtain clock period in picoseconds,
+	 * we need this in order to convert timing
+	 * parameters from picoseconds to clock ticks.
+	 */
+	rate = clk_get_rate(clk) / 1000;
+	if (!rate) {
+		/* A rate below 1 kHz would make the division below trap */
+		dev_err(dev, "invalid clock rate\n");
+		err = -EINVAL;
+		goto err_clk;
+	}
+	devbus->tick_ps = 1000000000 / rate;
+
+	/* Read the device tree node and set the new timing parameters */
+	err = devbus_set_timing_params(devbus, node);
+	if (err < 0)
+		goto err_clk;
+
+	/*
+	 * Allocate an address window for this device.
+	 * If the device probing fails, then we won't be able to
+	 * remove the allocated address decoding window.
+	 *
+	 * FIXME: This is only a temporary hack! We need to do this here
+	 * because we still don't have device tree bindings for mbus.
+	 * Once that support is added, we will declare these address windows
+	 * statically in the device tree, and remove the window configuration
+	 * from here.
+	 */
+
+	/*
+	 * Get the CS to choose the window string.
+	 * This is a bit hacky, but it will be removed once the
+	 * address windows are declared in the device tree.
+	 */
+	cs = (((unsigned long)devbus->base) % 0x400) / 8;
+	if (cs >= ARRAY_SIZE(devbus_wins)) {
+		/* Never index past the end of the window name table */
+		dev_err(dev, "unexpected register base, no window for cs %u\n",
+			cs);
+		err = -EINVAL;
+		goto err_clk;
+	}
+
+	/*
+	 * Parse 'ranges' property to obtain a (base,size) window tuple.
+	 * This will be removed once the address windows
+	 * are declared in the device tree.
+	 */
+	parent = of_get_parent(node);
+	if (!parent) {
+		err = -EINVAL;
+		goto err_clk;
+	}
+
+	p_addr_cells = of_n_addr_cells(parent);
+	of_node_put(parent);
+
+	addr_cells = of_n_addr_cells(node);
+	size_cells = of_n_size_cells(node);
+	tuple_len = (p_addr_cells + addr_cells + size_cells) * sizeof(__be32);
+
+	ranges = of_get_property(node, "ranges", &ranges_len);
+	if (ranges == NULL || ranges_len != tuple_len) {
+		err = -EINVAL;
+		goto err_clk;
+	}
+
+	/*
+	 * Keep the translated address in 64 bits for the comparison:
+	 * OF_BAD_ADDR does not fit in a u32, so testing the truncated
+	 * value could never detect a failed translation.
+	 */
+	addr = of_translate_address(node, ranges + addr_cells);
+	if (addr == OF_BAD_ADDR) {
+		err = -EINVAL;
+		goto err_clk;
+	}
+	base = addr;
+	size = of_read_number(ranges + addr_cells + p_addr_cells, size_cells);
+
+	/*
+	 * Create an mbus address window.
+	 * FIXME: Remove this, together with the above code, once the
+	 * address windows are declared in the device tree.
+	 */
+	err = mvebu_mbus_add_window(devbus_wins[cs], base, size);
+	if (err < 0)
+		goto err_clk;
+
+	/*
+	 * We need to create a child device explicitly from here to
+	 * guarantee that the child will be probed after the timing
+	 * parameters for the bus are written.
+	 */
+	err = of_platform_populate(node, NULL, NULL, dev);
+	if (err < 0)
+		goto err_window;
+
+	return 0;
+
+err_window:
+	mvebu_mbus_del_window(base, size);
+err_clk:
+	clk_disable_unprepare(clk);
+	return err;
+}
+
+/* Device tree match table: binds to "marvell,mvebu-devbus" nodes */
+static const struct of_device_id mvebu_devbus_of_match[] = {
+	{ .compatible = "marvell,mvebu-devbus" },
+	{},
+};
+MODULE_DEVICE_TABLE(of, mvebu_devbus_of_match);
+
+/*
+ * Platform driver description. Only a probe callback is provided;
+ * no remove callback is set here.
+ */
+static struct platform_driver mvebu_devbus_driver = {
+	.probe		= mvebu_devbus_probe,
+	.driver		= {
+		.name	= "mvebu-devbus",
+		.owner	= THIS_MODULE,
+		.of_match_table = mvebu_devbus_of_match,
+	},
+};
+
+/*
+ * Initcall registered through module_init(): registers the platform
+ * driver and returns the result of platform_driver_register().
+ */
+static int __init mvebu_devbus_init(void)
+{
+	return platform_driver_register(&mvebu_devbus_driver);
+}
+module_init(mvebu_devbus_init);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Ezequiel Garcia <ezequiel.garcia@free-electrons.com>");
+MODULE_DESCRIPTION("Marvell EBU SoC Device Bus controller");

diff --git a/drivers/memory/tegra20-mc.c b/drivers/memory/tegra20-mc.c
index 2ca5f28..0548eea 100644
--- a/drivers/memory/tegra20-mc.c
+++ b/drivers/memory/tegra20-mc.c

@@ -193,8 +193,11 @@
 	mask &= stat;
 	if (!mask)
 		return IRQ_NONE;
-	while ((bit = ffs(mask)) != 0)
+	while ((bit = ffs(mask)) != 0) {
 		tegra20_mc_decode(mc, bit - 1);
+		mask &= ~BIT(bit - 1);
+	}
+
 	mc_writel(mc, stat, MC_INTSTATUS);
 	return IRQ_HANDLED;
 }

diff --git a/drivers/memory/tegra30-mc.c b/drivers/memory/tegra30-mc.c
index f4ae074..58d2979 100644
--- a/drivers/memory/tegra30-mc.c
+++ b/drivers/memory/tegra30-mc.c

@@ -218,7 +218,7 @@
 		return;
 	}
 
-	err = readl(mc + MC_ERR_STATUS);
+	err = mc_readl(mc, MC_ERR_STATUS);
 
 	type = (err & MC_ERR_TYPE_MASK) >> MC_ERR_TYPE_SHIFT;
 	perm = (err & MC_ERR_INVALID_SMMU_PAGE_MASK) >>
@@ -235,7 +235,7 @@
 	if (cid < ARRAY_SIZE(tegra30_mc_client))
 		client = tegra30_mc_client[cid];
 
-	addr = readl(mc + MC_ERR_ADR);
+	addr = mc_readl(mc, MC_ERR_ADR);
 
 	dev_err_ratelimited(mc->dev, "%s (0x%08x): 0x%08x %s (%s %s %s %s)\n",
 			   mc_int_err[idx], err, addr, client,
@@ -313,8 +313,11 @@
 	mask &= stat;
 	if (!mask)
 		return IRQ_NONE;
-	while ((bit = ffs(mask)) != 0)
+	while ((bit = ffs(mask)) != 0) {
 		tegra30_mc_decode(mc, bit - 1);
+		mask &= ~BIT(bit - 1);
+	}
+
 	mc_writel(mc, stat, MC_INTSTATUS);
 	return IRQ_HANDLED;
 }

diff --git a/drivers/mfd/tps6586x.c b/drivers/mfd/tps6586x.c
index 721b918..4b93ed4 100644
--- a/drivers/mfd/tps6586x.c
+++ b/drivers/mfd/tps6586x.c

@@ -107,7 +107,7 @@
 		.name = "tps6586x-gpio",
 	},
 	{
-		.name = "tps6586x-pmic",
+		.name = "tps6586x-regulator",
 	},
 	{
 		.name = "tps6586x-rtc",

diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig
index e23df6c..8dacd4c 100644
--- a/drivers/misc/Kconfig
+++ b/drivers/misc/Kconfig

@@ -480,6 +480,7 @@
 
 config PCH_PHUB
 	tristate "Intel EG20T PCH/LAPIS Semicon IOH(ML7213/ML7223/ML7831) PHUB"
+	select GENERIC_NET_UTILS
 	depends on PCI
 	help
 	  This driver is for PCH(Platform controller Hub) PHUB(Packet Hub) of

diff --git a/drivers/misc/ad525x_dpot.c b/drivers/misc/ad525x_dpot.c
index 8f99e8e..0daadcf 100644
--- a/drivers/misc/ad525x_dpot.c
+++ b/drivers/misc/ad525x_dpot.c

@@ -470,7 +470,7 @@
 		!test_bit(DPOT_RDAC_MASK & reg, data->otp_en_mask))
 		return -EPERM;
 
-	err = strict_strtoul(buf, 10, &value);
+	err = kstrtoul(buf, 10, &value);
 	if (err)
 		return err;
 

diff --git a/drivers/misc/apds9802als.c b/drivers/misc/apds9802als.c
index 5b5fd84..0c6e037 100644
--- a/drivers/misc/apds9802als.c
+++ b/drivers/misc/apds9802als.c

@@ -126,8 +126,9 @@
 	int ret_val;
 	unsigned long val;
 
-	if (strict_strtoul(buf, 10, &val))
-		return -EINVAL;
+	ret_val = kstrtoul(buf, 10, &val);
+	if (ret_val)
+		return ret_val;
 
 	if (val < 4096)
 		val = 1;

diff --git a/drivers/misc/apds990x.c b/drivers/misc/apds990x.c
index 98f9bb2..868a30a 100644
--- a/drivers/misc/apds990x.c
+++ b/drivers/misc/apds990x.c

@@ -696,9 +696,11 @@
 {
 	struct apds990x_chip *chip = dev_get_drvdata(dev);
 	unsigned long value;
+	int ret;
 
-	if (strict_strtoul(buf, 0, &value))
-		return -EINVAL;
+	ret = kstrtoul(buf, 0, &value);
+	if (ret)
+		return ret;
 
 	chip->lux_calib = value;
 
@@ -759,8 +761,9 @@
 	unsigned long value;
 	int ret;
 
-	if (strict_strtoul(buf, 0, &value))
-		return -EINVAL;
+	ret = kstrtoul(buf, 0, &value);
+	if (ret)
+		return ret;
 
 	mutex_lock(&chip->mutex);
 	ret = apds990x_set_arate(chip, value);
@@ -813,9 +816,11 @@
 {
 	struct apds990x_chip *chip =  dev_get_drvdata(dev);
 	unsigned long value;
+	int ret;
 
-	if (strict_strtoul(buf, 0, &value))
-		return -EINVAL;
+	ret = kstrtoul(buf, 0, &value);
+	if (ret)
+		return ret;
 
 	mutex_lock(&chip->mutex);
 
@@ -892,11 +897,12 @@
 static ssize_t apds990x_set_lux_thresh(struct apds990x_chip *chip, u32 *target,
 				const char *buf)
 {
-	int ret = 0;
 	unsigned long thresh;
+	int ret;
 
-	if (strict_strtoul(buf, 0, &thresh))
-		return -EINVAL;
+	ret = kstrtoul(buf, 0, &thresh);
+	if (ret)
+		return ret;
 
 	if (thresh > APDS_RANGE)
 		return -EINVAL;
@@ -957,9 +963,11 @@
 {
 	struct apds990x_chip *chip =  dev_get_drvdata(dev);
 	unsigned long value;
+	int ret;
 
-	if (strict_strtoul(buf, 0, &value))
-		return -EINVAL;
+	ret = kstrtoul(buf, 0, &value);
+	if (ret)
+		return ret;
 
 	if ((value > APDS_RANGE) || (value == 0) ||
 		(value < APDS_PROX_HYSTERESIS))
@@ -990,9 +998,12 @@
 {
 	struct apds990x_chip *chip =  dev_get_drvdata(dev);
 	unsigned long value;
+	int ret;
 
-	if (strict_strtoul(buf, 0, &value))
-		return -EINVAL;
+	ret = kstrtoul(buf, 0, &value);
+	if (ret)
+		return ret;
+
 	if (value) {
 		pm_runtime_get_sync(dev);
 		mutex_lock(&chip->mutex);

diff --git a/drivers/misc/arm-charlcd.c b/drivers/misc/arm-charlcd.c
index 48651ef..1256a4b 100644
--- a/drivers/misc/arm-charlcd.c
+++ b/drivers/misc/arm-charlcd.c

@@ -291,7 +291,7 @@
 	lcd->virtbase = ioremap(lcd->phybase, lcd->physize);
 	if (!lcd->virtbase) {
 		ret = -ENOMEM;
-		goto out_no_remap;
+		goto out_no_memregion;
 	}
 
 	lcd->irq = platform_get_irq(pdev, 0);
@@ -320,8 +320,6 @@
 
 out_no_irq:
 	iounmap(lcd->virtbase);
-out_no_remap:
-	platform_set_drvdata(pdev, NULL);
 out_no_memregion:
 	release_mem_region(lcd->phybase, SZ_4K);
 out_no_resource:
@@ -337,7 +335,6 @@
 		free_irq(lcd->irq, lcd);
 		iounmap(lcd->virtbase);
 		release_mem_region(lcd->phybase, lcd->physize);
-		platform_set_drvdata(pdev, NULL);
 		kfree(lcd);
 	}
 

diff --git a/drivers/misc/bh1770glc.c b/drivers/misc/bh1770glc.c
index f4975f7..99a0468 100644
--- a/drivers/misc/bh1770glc.c
+++ b/drivers/misc/bh1770glc.c

@@ -651,8 +651,9 @@
 	unsigned long value;
 	ssize_t ret;
 
-	if (strict_strtoul(buf, 0, &value))
-		return -EINVAL;
+	ret = kstrtoul(buf, 0, &value);
+	if (ret)
+		return ret;
 
 	mutex_lock(&chip->mutex);
 	if (value) {
@@ -726,9 +727,11 @@
 {
 	struct bh1770_chip *chip =  dev_get_drvdata(dev);
 	unsigned long value;
+	int ret;
 
-	if (strict_strtoul(buf, 0, &value))
-		return -EINVAL;
+	ret = kstrtoul(buf, 0, &value);
+	if (ret)
+		return ret;
 
 	mutex_lock(&chip->mutex);
 	/* Assume no proximity. Sensor will tell real state soon */
@@ -824,9 +827,11 @@
 {
 	struct bh1770_chip *chip =  dev_get_drvdata(dev);
 	unsigned long value;
+	int ret;
 
-	if (strict_strtoul(buf, 0, &value))
-		return -EINVAL;
+	ret = kstrtoul(buf, 0, &value);
+	if (ret)
+		return ret;
 
 	mutex_lock(&chip->mutex);
 	chip->prox_rate_threshold = bh1770_prox_rate_validate(value);
@@ -840,9 +845,11 @@
 {
 	struct bh1770_chip *chip =  dev_get_drvdata(dev);
 	unsigned long value;
+	int ret;
 
-	if (strict_strtoul(buf, 0, &value))
-		return -EINVAL;
+	ret = kstrtoul(buf, 0, &value);
+	if (ret)
+		return ret;
 
 	mutex_lock(&chip->mutex);
 	chip->prox_rate = bh1770_prox_rate_validate(value);
@@ -865,8 +872,10 @@
 	unsigned long value;
 	int ret;
 
-	if (strict_strtoul(buf, 0, &value))
-		return -EINVAL;
+	ret = kstrtoul(buf, 0, &value);
+	if (ret)
+		return ret;
+
 	if (value > BH1770_PROX_RANGE)
 		return -EINVAL;
 
@@ -893,9 +902,11 @@
 {
 	struct bh1770_chip *chip = dev_get_drvdata(dev);
 	unsigned long value;
+	int ret;
 
-	if (strict_strtoul(buf, 0, &value))
-		return -EINVAL;
+	ret = kstrtoul(buf, 0, &value);
+	if (ret)
+		return ret;
 
 	if (value > BH1770_PROX_MAX_PERSISTENCE)
 		return -EINVAL;
@@ -918,9 +929,11 @@
 {
 	struct bh1770_chip *chip = dev_get_drvdata(dev);
 	unsigned long value;
+	int ret;
 
-	if (strict_strtoul(buf, 0, &value))
-		return -EINVAL;
+	ret = kstrtoul(buf, 0, &value);
+	if (ret)
+		return ret;
 
 	if (value > BH1770_PROX_RANGE)
 		return -EINVAL;
@@ -963,9 +976,11 @@
 	unsigned long value;
 	u32 old_calib;
 	u32 new_corr;
+	int ret;
 
-	if (strict_strtoul(buf, 0, &value))
-		return -EINVAL;
+	ret = kstrtoul(buf, 0, &value);
+	if (ret)
+		return ret;
 
 	mutex_lock(&chip->mutex);
 	old_calib = chip->lux_calib;
@@ -1012,8 +1027,9 @@
 	unsigned long rate_hz;
 	int ret, i;
 
-	if (strict_strtoul(buf, 0, &rate_hz))
-		return -EINVAL;
+	ret = kstrtoul(buf, 0, &rate_hz);
+	if (ret)
+		return ret;
 
 	for (i = 0; i < ARRAY_SIZE(lux_rates_hz) - 1; i++)
 		if (rate_hz >= lux_rates_hz[i])
@@ -1047,11 +1063,12 @@
 static ssize_t bh1770_set_lux_thresh(struct bh1770_chip *chip, u16 *target,
 				const char *buf)
 {
-	int ret = 0;
 	unsigned long thresh;
+	int ret;
 
-	if (strict_strtoul(buf, 0, &thresh))
-		return -EINVAL;
+	ret = kstrtoul(buf, 0, &thresh);
+	if (ret)
+		return ret;
 
 	if (thresh > BH1770_LUX_RANGE)
 		return -EINVAL;

diff --git a/drivers/misc/bh1780gli.c b/drivers/misc/bh1780gli.c
index 818f3a0..057580e 100644
--- a/drivers/misc/bh1780gli.c
+++ b/drivers/misc/bh1780gli.c

@@ -107,7 +107,7 @@
 	unsigned long val;
 	int error;
 
-	error = strict_strtoul(buf, 0, &val);
+	error = kstrtoul(buf, 0, &val);
 	if (error)
 		return error;
 

diff --git a/drivers/misc/carma/carma-fpga-program.c b/drivers/misc/carma/carma-fpga-program.c
index 736c771..c6bd7e8 100644
--- a/drivers/misc/carma/carma-fpga-program.c
+++ b/drivers/misc/carma/carma-fpga-program.c

@@ -830,8 +830,9 @@
 	unsigned long val;
 	int ret;
 
-	if (strict_strtoul(buf, 0, &val))
-		return -EINVAL;
+	ret = kstrtoul(buf, 0, &val);
+	if (ret)
+		return ret;
 
 	if (val) {
 		ret = fpga_enable_power_supplies(priv);
@@ -859,8 +860,9 @@
 	unsigned long val;
 	int ret;
 
-	if (strict_strtoul(buf, 0, &val))
-		return -EINVAL;
+	ret = kstrtoul(buf, 0, &val);
+	if (ret)
+		return ret;
 
 	/* We can't have an image writer and be programming simultaneously */
 	if (mutex_lock_interruptible(&priv->lock))
@@ -919,7 +921,7 @@
 
 static int fpga_of_remove(struct platform_device *op)
 {
-	struct fpga_dev *priv = dev_get_drvdata(&op->dev);
+	struct fpga_dev *priv = platform_get_drvdata(op);
 	struct device *this_device = priv->miscdev.this_device;
 
 	sysfs_remove_group(&this_device->kobj, &fpga_attr_group);
@@ -969,7 +971,7 @@
 
 	kref_init(&priv->ref);
 
-	dev_set_drvdata(&op->dev, priv);
+	platform_set_drvdata(op, priv);
 	priv->dev = &op->dev;
 	mutex_init(&priv->lock);
 	init_completion(&priv->completion);

diff --git a/drivers/misc/carma/carma-fpga.c b/drivers/misc/carma/carma-fpga.c
index 7508caf..7b56563f 100644
--- a/drivers/misc/carma/carma-fpga.c
+++ b/drivers/misc/carma/carma-fpga.c

@@ -1002,10 +1002,10 @@
 	unsigned long enable;
 	int ret;
 
-	ret = strict_strtoul(buf, 0, &enable);
+	ret = kstrtoul(buf, 0, &enable);
 	if (ret) {
 		dev_err(priv->dev, "unable to parse enable input\n");
-		return -EINVAL;
+		return ret;
 	}
 
 	/* protect against concurrent enable/disable */
@@ -1296,7 +1296,7 @@
 		goto out_return;
 	}
 
-	dev_set_drvdata(&op->dev, priv);
+	platform_set_drvdata(op, priv);
 	priv->dev = &op->dev;
 	kref_init(&priv->ref);
 	mutex_init(&priv->mutex);
@@ -1400,7 +1400,7 @@
 
 static int data_of_remove(struct platform_device *op)
 {
-	struct fpga_device *priv = dev_get_drvdata(&op->dev);
+	struct fpga_device *priv = platform_get_drvdata(op);
 	struct device *this_device = priv->miscdev.this_device;
 
 	/* remove all sysfs files, now the device cannot be re-enabled */

diff --git a/drivers/misc/eeprom/at24.c b/drivers/misc/eeprom/at24.c
index 2baeec5..5d4fd69 100644
--- a/drivers/misc/eeprom/at24.c
+++ b/drivers/misc/eeprom/at24.c

@@ -492,10 +492,9 @@
 	if (client->dev.platform_data) {
 		chip = *(struct at24_platform_data *)client->dev.platform_data;
 	} else {
-		if (!id->driver_data) {
-			err = -ENODEV;
-			goto err_out;
-		}
+		if (!id->driver_data)
+			return -ENODEV;
+
 		magic = id->driver_data;
 		chip.byte_len = BIT(magic & AT24_BITMASK(AT24_SIZE_BYTELEN));
 		magic >>= AT24_SIZE_BYTELEN;
@@ -519,8 +518,7 @@
 			"byte_len looks suspicious (no power of 2)!\n");
 	if (!chip.page_size) {
 		dev_err(&client->dev, "page_size must not be 0!\n");
-		err = -EINVAL;
-		goto err_out;
+		return -EINVAL;
 	}
 	if (!is_power_of_2(chip.page_size))
 		dev_warn(&client->dev,
@@ -528,10 +526,9 @@
 
 	/* Use I2C operations unless we're stuck with SMBus extensions. */
 	if (!i2c_check_functionality(client->adapter, I2C_FUNC_I2C)) {
-		if (chip.flags & AT24_FLAG_ADDR16) {
-			err = -EPFNOSUPPORT;
-			goto err_out;
-		}
+		if (chip.flags & AT24_FLAG_ADDR16)
+			return -EPFNOSUPPORT;
+
 		if (i2c_check_functionality(client->adapter,
 				I2C_FUNC_SMBUS_READ_I2C_BLOCK)) {
 			use_smbus = I2C_SMBUS_I2C_BLOCK_DATA;
@@ -542,8 +539,7 @@
 				I2C_FUNC_SMBUS_READ_BYTE_DATA)) {
 			use_smbus = I2C_SMBUS_BYTE_DATA;
 		} else {
-			err = -EPFNOSUPPORT;
-			goto err_out;
+			return -EPFNOSUPPORT;
 		}
 	}
 
@@ -553,12 +549,10 @@
 		num_addresses =	DIV_ROUND_UP(chip.byte_len,
 			(chip.flags & AT24_FLAG_ADDR16) ? 65536 : 256);
 
-	at24 = kzalloc(sizeof(struct at24_data) +
+	at24 = devm_kzalloc(&client->dev, sizeof(struct at24_data) +
 		num_addresses * sizeof(struct i2c_client *), GFP_KERNEL);
-	if (!at24) {
-		err = -ENOMEM;
-		goto err_out;
-	}
+	if (!at24)
+		return -ENOMEM;
 
 	mutex_init(&at24->lock);
 	at24->use_smbus = use_smbus;
@@ -596,11 +590,10 @@
 			at24->write_max = write_max;
 
 			/* buffer (data + address at the beginning) */
-			at24->writebuf = kmalloc(write_max + 2, GFP_KERNEL);
-			if (!at24->writebuf) {
-				err = -ENOMEM;
-				goto err_struct;
-			}
+			at24->writebuf = devm_kzalloc(&client->dev,
+				write_max + 2, GFP_KERNEL);
+			if (!at24->writebuf)
+				return -ENOMEM;
 		} else {
 			dev_warn(&client->dev,
 				"cannot write due to controller restrictions.");
@@ -648,11 +641,6 @@
 		if (at24->client[i])
 			i2c_unregister_device(at24->client[i]);
 
-	kfree(at24->writebuf);
-err_struct:
-	kfree(at24);
-err_out:
-	dev_dbg(&client->dev, "probe error %d\n", err);
 	return err;
 }
 
@@ -667,8 +655,6 @@
 	for (i = 1; i < at24->num_addresses; i++)
 		i2c_unregister_device(at24->client[i]);
 
-	kfree(at24->writebuf);
-	kfree(at24);
 	return 0;
 }
 

diff --git a/drivers/misc/eeprom/at25.c b/drivers/misc/eeprom/at25.c
index ad8fd8e..840b359 100644
--- a/drivers/misc/eeprom/at25.c
+++ b/drivers/misc/eeprom/at25.c

@@ -371,11 +371,10 @@
 		if (np) {
 			err = at25_np_to_chip(&spi->dev, np, &chip);
 			if (err)
-				goto fail;
+				return err;
 		} else {
 			dev_err(&spi->dev, "Error: no chip description\n");
-			err = -ENODEV;
-			goto fail;
+			return -ENODEV;
 		}
 	} else
 		chip = *(struct spi_eeprom *)spi->dev.platform_data;
@@ -389,8 +388,7 @@
 		addrlen = 3;
 	else {
 		dev_dbg(&spi->dev, "unsupported address type\n");
-		err = -EINVAL;
-		goto fail;
+		return -EINVAL;
 	}
 
 	/* Ping the chip ... the status register is pretty portable,
@@ -400,14 +398,12 @@
 	sr = spi_w8r8(spi, AT25_RDSR);
 	if (sr < 0 || sr & AT25_SR_nRDY) {
 		dev_dbg(&spi->dev, "rdsr --> %d (%02x)\n", sr, sr);
-		err = -ENXIO;
-		goto fail;
+		return -ENXIO;
 	}
 
-	if (!(at25 = kzalloc(sizeof *at25, GFP_KERNEL))) {
-		err = -ENOMEM;
-		goto fail;
-	}
+	at25 = devm_kzalloc(&spi->dev, sizeof(struct at25_data), GFP_KERNEL);
+	if (!at25)
+		return -ENOMEM;
 
 	mutex_init(&at25->lock);
 	at25->chip = chip;
@@ -439,7 +435,7 @@
 
 	err = sysfs_create_bin_file(&spi->dev.kobj, &at25->bin);
 	if (err)
-		goto fail;
+		return err;
 
 	if (chip.setup)
 		chip.setup(&at25->mem, chip.context);
@@ -453,10 +449,6 @@
 		(chip.flags & EE_READONLY) ? " (readonly)" : "",
 		at25->chip.page_size);
 	return 0;
-fail:
-	dev_dbg(&spi->dev, "probe err %d\n", err);
-	kfree(at25);
-	return err;
 }
 
 static int at25_remove(struct spi_device *spi)
@@ -465,7 +457,6 @@
 
 	at25 = spi_get_drvdata(spi);
 	sysfs_remove_bin_file(&spi->dev.kobj, &at25->bin);
-	kfree(at25);
 	return 0;
 }
 

diff --git a/drivers/misc/ep93xx_pwm.c b/drivers/misc/ep93xx_pwm.c
index 96787ec..cdb67a9 100644
--- a/drivers/misc/ep93xx_pwm.c
+++ b/drivers/misc/ep93xx_pwm.c

@@ -39,63 +39,6 @@
 	u32		duty_percent;
 };
 
-static inline void ep93xx_pwm_writel(struct ep93xx_pwm *pwm,
-		unsigned int val, unsigned int off)
-{
-	__raw_writel(val, pwm->mmio_base + off);
-}
-
-static inline unsigned int ep93xx_pwm_readl(struct ep93xx_pwm *pwm,
-		unsigned int off)
-{
-	return __raw_readl(pwm->mmio_base + off);
-}
-
-static inline void ep93xx_pwm_write_tc(struct ep93xx_pwm *pwm, u16 value)
-{
-	ep93xx_pwm_writel(pwm, value, EP93XX_PWMx_TERM_COUNT);
-}
-
-static inline u16 ep93xx_pwm_read_tc(struct ep93xx_pwm *pwm)
-{
-	return ep93xx_pwm_readl(pwm, EP93XX_PWMx_TERM_COUNT);
-}
-
-static inline void ep93xx_pwm_write_dc(struct ep93xx_pwm *pwm, u16 value)
-{
-	ep93xx_pwm_writel(pwm, value, EP93XX_PWMx_DUTY_CYCLE);
-}
-
-static inline void ep93xx_pwm_enable(struct ep93xx_pwm *pwm)
-{
-	ep93xx_pwm_writel(pwm, 0x1, EP93XX_PWMx_ENABLE);
-}
-
-static inline void ep93xx_pwm_disable(struct ep93xx_pwm *pwm)
-{
-	ep93xx_pwm_writel(pwm, 0x0, EP93XX_PWMx_ENABLE);
-}
-
-static inline int ep93xx_pwm_is_enabled(struct ep93xx_pwm *pwm)
-{
-	return ep93xx_pwm_readl(pwm, EP93XX_PWMx_ENABLE) & 0x1;
-}
-
-static inline void ep93xx_pwm_invert(struct ep93xx_pwm *pwm)
-{
-	ep93xx_pwm_writel(pwm, 0x1, EP93XX_PWMx_INVERT);
-}
-
-static inline void ep93xx_pwm_normal(struct ep93xx_pwm *pwm)
-{
-	ep93xx_pwm_writel(pwm, 0x0, EP93XX_PWMx_INVERT);
-}
-
-static inline int ep93xx_pwm_is_inverted(struct ep93xx_pwm *pwm)
-{
-	return ep93xx_pwm_readl(pwm, EP93XX_PWMx_INVERT) & 0x1;
-}
-
 /*
  * /sys/devices/platform/ep93xx-pwm.N
  *   /min_freq      read-only   minimum pwm output frequency
@@ -131,9 +74,9 @@
 	struct platform_device *pdev = to_platform_device(dev);
 	struct ep93xx_pwm *pwm = platform_get_drvdata(pdev);
 
-	if (ep93xx_pwm_is_enabled(pwm)) {
+	if (readl(pwm->mmio_base + EP93XX_PWMx_ENABLE) & 0x1) {
 		unsigned long rate = clk_get_rate(pwm->clk);
-		u16 term = ep93xx_pwm_read_tc(pwm);
+		u16 term = readl(pwm->mmio_base + EP93XX_PWMx_TERM_COUNT);
 
 		return sprintf(buf, "%ld\n", rate / (term + 1));
 	} else {
@@ -149,12 +92,12 @@
 	long val;
 	int err;
 
-	err = strict_strtol(buf, 10, &val);
+	err = kstrtol(buf, 10, &val);
 	if (err)
 		return -EINVAL;
 
 	if (val == 0) {
-		ep93xx_pwm_disable(pwm);
+		writel(0x0, pwm->mmio_base + EP93XX_PWMx_ENABLE);
 	} else if (val <= (clk_get_rate(pwm->clk) / 2)) {
 		u32 term, duty;
 
@@ -164,20 +107,20 @@
 		if (val < 1)
 			val = 1;
 
-		term = ep93xx_pwm_read_tc(pwm);
+		term = readl(pwm->mmio_base + EP93XX_PWMx_TERM_COUNT);
 		duty = ((val + 1) * pwm->duty_percent / 100) - 1;
 
 		/* If pwm is running, order is important */
 		if (val > term) {
-			ep93xx_pwm_write_tc(pwm, val);
-			ep93xx_pwm_write_dc(pwm, duty);
+			writel(val, pwm->mmio_base + EP93XX_PWMx_TERM_COUNT);
+			writel(duty, pwm->mmio_base + EP93XX_PWMx_DUTY_CYCLE);
 		} else {
-			ep93xx_pwm_write_dc(pwm, duty);
-			ep93xx_pwm_write_tc(pwm, val);
+			writel(duty, pwm->mmio_base + EP93XX_PWMx_DUTY_CYCLE);
+			writel(val, pwm->mmio_base + EP93XX_PWMx_TERM_COUNT);
 		}
 
-		if (!ep93xx_pwm_is_enabled(pwm))
-			ep93xx_pwm_enable(pwm);
+		if (!readl(pwm->mmio_base + EP93XX_PWMx_ENABLE) & 0x1)
+			writel(0x1, pwm->mmio_base + EP93XX_PWMx_ENABLE);
 	} else {
 		return -EINVAL;
 	}
@@ -202,13 +145,15 @@
 	long val;
 	int err;
 
-	err = strict_strtol(buf, 10, &val);
+	err = kstrtol(buf, 10, &val);
 	if (err)
 		return -EINVAL;
 
 	if (val > 0 && val < 100) {
-		u32 term = ep93xx_pwm_read_tc(pwm);
-		ep93xx_pwm_write_dc(pwm, ((term + 1) * val / 100) - 1);
+		u32 term = readl(pwm->mmio_base + EP93XX_PWMx_TERM_COUNT);
+		u32 duty = ((term + 1) * val / 100) - 1;
+
+		writel(duty, pwm->mmio_base + EP93XX_PWMx_DUTY_CYCLE);
 		pwm->duty_percent = val;
 		return count;
 	}
@@ -221,8 +166,9 @@
 {
 	struct platform_device *pdev = to_platform_device(dev);
 	struct ep93xx_pwm *pwm = platform_get_drvdata(pdev);
+	int inverted = readl(pwm->mmio_base + EP93XX_PWMx_INVERT) & 0x1;
 
-	return sprintf(buf, "%d\n", ep93xx_pwm_is_inverted(pwm));
+	return sprintf(buf, "%d\n", inverted);
 }
 
 static ssize_t ep93xx_pwm_set_invert(struct device *dev,
@@ -233,14 +179,14 @@
 	long val;
 	int err;
 
-	err = strict_strtol(buf, 10, &val);
+	err = kstrtol(buf, 10, &val);
 	if (err)
 		return -EINVAL;
 
 	if (val == 0)
-		ep93xx_pwm_normal(pwm);
+		writel(0x0, pwm->mmio_base + EP93XX_PWMx_INVERT);
 	else if (val == 1)
-		ep93xx_pwm_invert(pwm);
+		writel(0x1, pwm->mmio_base + EP93XX_PWMx_INVERT);
 	else
 		return -EINVAL;
 
@@ -269,89 +215,55 @@
 	.attrs	= ep93xx_pwm_attrs,
 };
 
-static int __init ep93xx_pwm_probe(struct platform_device *pdev)
+static int ep93xx_pwm_probe(struct platform_device *pdev)
 {
 	struct ep93xx_pwm *pwm;
 	struct resource *res;
-	int err;
+	int ret;
 
-	err = ep93xx_pwm_acquire_gpio(pdev);
-	if (err)
-		return err;
+	pwm = devm_kzalloc(&pdev->dev, sizeof(*pwm), GFP_KERNEL);
+	if (!pwm)
+		return -ENOMEM;
 
-	pwm = kzalloc(sizeof(struct ep93xx_pwm), GFP_KERNEL);
-	if (!pwm) {
-		err = -ENOMEM;
-		goto fail_no_mem;
-	}
+	pwm->clk = devm_clk_get(&pdev->dev, "pwm_clk");
+	if (IS_ERR(pwm->clk))
+		return PTR_ERR(pwm->clk);
 
 	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	if (res == NULL) {
-		err = -ENXIO;
-		goto fail_no_mem_resource;
-	}
+	pwm->mmio_base = devm_ioremap_resource(&pdev->dev, res);
+	if (IS_ERR(pwm->mmio_base))
+		return PTR_ERR(pwm->mmio_base);
 
-	res = request_mem_region(res->start, resource_size(res), pdev->name);
-	if (res == NULL) {
-		err = -EBUSY;
-		goto fail_no_mem_resource;
-	}
+	ret = ep93xx_pwm_acquire_gpio(pdev);
+	if (ret)
+		return ret;
 
-	pwm->mmio_base = ioremap(res->start, resource_size(res));
-	if (pwm->mmio_base == NULL) {
-		err = -ENXIO;
-		goto fail_no_ioremap;
-	}
-
-	err = sysfs_create_group(&pdev->dev.kobj, &ep93xx_pwm_sysfs_files);
-	if (err)
-		goto fail_no_sysfs;
-
-	pwm->clk = clk_get(&pdev->dev, "pwm_clk");
-	if (IS_ERR(pwm->clk)) {
-		err = PTR_ERR(pwm->clk);
-		goto fail_no_clk;
+	ret = sysfs_create_group(&pdev->dev.kobj, &ep93xx_pwm_sysfs_files);
+	if (ret) {
+		ep93xx_pwm_release_gpio(pdev);
+		return ret;
 	}
 
 	pwm->duty_percent = 50;
 
-	platform_set_drvdata(pdev, pwm);
-
 	/* disable pwm at startup. Avoids zero value. */
-	ep93xx_pwm_disable(pwm);
-	ep93xx_pwm_write_tc(pwm, EP93XX_PWM_MAX_COUNT);
-	ep93xx_pwm_write_dc(pwm, EP93XX_PWM_MAX_COUNT / 2);
+	writel(0x0, pwm->mmio_base + EP93XX_PWMx_ENABLE);
+	writel(EP93XX_PWM_MAX_COUNT, pwm->mmio_base + EP93XX_PWMx_TERM_COUNT);
+	writel(EP93XX_PWM_MAX_COUNT/2, pwm->mmio_base + EP93XX_PWMx_DUTY_CYCLE);
 
 	clk_enable(pwm->clk);
 
+	platform_set_drvdata(pdev, pwm);
 	return 0;
-
-fail_no_clk:
-	sysfs_remove_group(&pdev->dev.kobj, &ep93xx_pwm_sysfs_files);
-fail_no_sysfs:
-	iounmap(pwm->mmio_base);
-fail_no_ioremap:
-	release_mem_region(res->start, resource_size(res));
-fail_no_mem_resource:
-	kfree(pwm);
-fail_no_mem:
-	ep93xx_pwm_release_gpio(pdev);
-	return err;
 }
 
-static int __exit ep93xx_pwm_remove(struct platform_device *pdev)
+static int ep93xx_pwm_remove(struct platform_device *pdev)
 {
 	struct ep93xx_pwm *pwm = platform_get_drvdata(pdev);
-	struct resource *res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
 
-	ep93xx_pwm_disable(pwm);
+	writel(0x0, pwm->mmio_base + EP93XX_PWMx_ENABLE);
 	clk_disable(pwm->clk);
-	clk_put(pwm->clk);
-	platform_set_drvdata(pdev, NULL);
 	sysfs_remove_group(&pdev->dev.kobj, &ep93xx_pwm_sysfs_files);
-	iounmap(pwm->mmio_base);
-	release_mem_region(res->start, resource_size(res));
-	kfree(pwm);
 	ep93xx_pwm_release_gpio(pdev);
 
 	return 0;
@@ -362,10 +274,10 @@
 		.name	= "ep93xx-pwm",
 		.owner	= THIS_MODULE,
 	},
-	.remove		= __exit_p(ep93xx_pwm_remove),
+	.probe		= ep93xx_pwm_probe,
+	.remove		= ep93xx_pwm_remove,
 };
-
-module_platform_driver_probe(ep93xx_pwm_driver, ep93xx_pwm_probe);
+module_platform_driver(ep93xx_pwm_driver);
 
 MODULE_AUTHOR("Matthieu Crapet <mcrapet@gmail.com>, "
 	      "H Hartley Sweeten <hsweeten@visionengravers.com>");

diff --git a/drivers/misc/hmc6352.c b/drivers/misc/hmc6352.c
index 423cd40..170bd3d 100644
--- a/drivers/misc/hmc6352.c
+++ b/drivers/misc/hmc6352.c

@@ -46,8 +46,9 @@
 	int ret;
 	unsigned long val;
 
-	if (strict_strtoul(buf, 10, &val))
-		return -EINVAL;
+	ret = kstrtoul(buf, 10, &val);
+	if (ret)
+		return ret;
 	if (val >= strlen(map))
 		return -EINVAL;
 	mutex_lock(&compass_mutex);

diff --git a/drivers/misc/isl29003.c b/drivers/misc/isl29003.c
index c5145b3..e3183f2 100644
--- a/drivers/misc/isl29003.c
+++ b/drivers/misc/isl29003.c

@@ -208,7 +208,11 @@
 	unsigned long val;
 	int ret;
 
-	if ((strict_strtoul(buf, 10, &val) < 0) || (val > 3))
+	ret = kstrtoul(buf, 10, &val);
+	if (ret)
+		return ret;
+
+	if (val > 3)
 		return -EINVAL;
 
 	ret = isl29003_set_range(client, val);
@@ -239,7 +243,11 @@
 	unsigned long val;
 	int ret;
 
-	if ((strict_strtoul(buf, 10, &val) < 0) || (val > 3))
+	ret = kstrtoul(buf, 10, &val);
+	if (ret)
+		return ret;
+
+	if (val > 3)
 		return -EINVAL;
 
 	ret = isl29003_set_resolution(client, val);
@@ -267,7 +275,11 @@
 	unsigned long val;
 	int ret;
 
-	if ((strict_strtoul(buf, 10, &val) < 0) || (val > 2))
+	ret = kstrtoul(buf, 10, &val);
+	if (ret)
+		return ret;
+
+	if (val > 2)
 		return -EINVAL;
 
 	ret = isl29003_set_mode(client, val);
@@ -298,7 +310,11 @@
 	unsigned long val;
 	int ret;
 
-	if ((strict_strtoul(buf, 10, &val) < 0) || (val > 1))
+	ret = kstrtoul(buf, 10, &val);
+	if (ret)
+		return ret;
+
+	if (val > 1)
 		return -EINVAL;
 
 	ret = isl29003_set_power_state(client, val);

diff --git a/drivers/misc/isl29020.c b/drivers/misc/isl29020.c
index 0aa08c7..b7f84da 100644
--- a/drivers/misc/isl29020.c
+++ b/drivers/misc/isl29020.c

@@ -90,8 +90,10 @@
 	int ret_val;
 	unsigned long val;
 
-	if (strict_strtoul(buf, 10, &val))
-		return -EINVAL;
+	ret_val = kstrtoul(buf, 10, &val);
+	if (ret_val)
+		return ret_val;
+
 	if (val < 1 || val > 64000)
 		return -EINVAL;
 

diff --git a/drivers/misc/lis3lv02d/lis3lv02d.c b/drivers/misc/lis3lv02d/lis3lv02d.c
index 4cd4a3d..036effe 100644
--- a/drivers/misc/lis3lv02d/lis3lv02d.c
+++ b/drivers/misc/lis3lv02d/lis3lv02d.c

@@ -831,9 +831,11 @@
 {
 	struct lis3lv02d *lis3 = dev_get_drvdata(dev);
 	unsigned long rate;
+	int ret;
 
-	if (strict_strtoul(buf, 0, &rate))
-		return -EINVAL;
+	ret = kstrtoul(buf, 0, &rate);
+	if (ret)
+		return ret;
 
 	lis3lv02d_sysfs_poweron(lis3);
 	if (lis3lv02d_set_odr(lis3, rate))

diff --git a/drivers/misc/mei/amthif.c b/drivers/misc/mei/amthif.c
index b3e5098..749452f 100644
--- a/drivers/misc/mei/amthif.c
+++ b/drivers/misc/mei/amthif.c

@@ -443,11 +443,11 @@
  *
  * returns 0, OK; otherwise, error.
  */
-int mei_amthif_irq_write_complete(struct mei_device *dev, s32 *slots,
-			struct mei_cl_cb *cb, struct mei_cl_cb *cmpl_list)
+int mei_amthif_irq_write_complete(struct mei_cl *cl, struct mei_cl_cb *cb,
+				  s32 *slots, struct mei_cl_cb *cmpl_list)
 {
+	struct mei_device *dev = cl->dev;
 	struct mei_msg_hdr mei_hdr;
-	struct mei_cl *cl = cb->cl;
 	size_t len = dev->iamthif_msg_buf_size - dev->iamthif_msg_buf_index;
 	u32 msg_slots = mei_data2slots(len);
 

diff --git a/drivers/misc/mei/client.c b/drivers/misc/mei/client.c
index e310ca6..21d3f5a 100644
--- a/drivers/misc/mei/client.c
+++ b/drivers/misc/mei/client.c

@@ -485,7 +485,6 @@
 {
 	struct mei_device *dev;
 	struct mei_cl_cb *cb;
-	long timeout = mei_secs_to_jiffies(MEI_CL_CONNECT_TIMEOUT);
 	int rets;
 
 	if (WARN_ON(!cl || !cl->dev))
@@ -518,7 +517,7 @@
 	rets = wait_event_timeout(dev->wait_recvd_msg,
 				 (cl->state == MEI_FILE_CONNECTED ||
 				  cl->state == MEI_FILE_DISCONNECTED),
-				 timeout * HZ);
+				 mei_secs_to_jiffies(MEI_CL_CONNECT_TIMEOUT));
 	mutex_lock(&dev->device_lock);
 
 	if (cl->state != MEI_FILE_CONNECTED) {
@@ -682,6 +681,68 @@
 }
 
 /**
+ * mei_cl_irq_write_complete - write a message to device
+ *	from the interrupt thread context
+ *
+ * @cl: client; supplies the device and the host/me addresses of the header.
+ * @cb: callback block; request_buffer is sent starting at buf_idx.
+ * @slots: free slots; reduced by the number of slots consumed.
+ * @cmpl_list: complete list; receives @cb when mei_write_message() fails.
+ *
+ * returns 0 if the message was written or deferred; -ENODEV on failure.
+ */
+int mei_cl_irq_write_complete(struct mei_cl *cl, struct mei_cl_cb *cb,
+				     s32 *slots, struct mei_cl_cb *cmpl_list)
+{
+	struct mei_device *dev = cl->dev;
+	struct mei_msg_hdr mei_hdr;
+	size_t len = cb->request_buffer.size - cb->buf_idx; /* unsent bytes */
+	u32 msg_slots = mei_data2slots(len);
+
+	mei_hdr.host_addr = cl->host_client_id;
+	mei_hdr.me_addr = cl->me_client_id;
+	mei_hdr.reserved = 0;
+
+	if (*slots >= msg_slots) {
+		mei_hdr.length = len;
+		mei_hdr.msg_complete = 1;
+	/* Split the message only if we can write the whole host buffer */
+	} else if (*slots == dev->hbuf_depth) {
+		msg_slots = *slots;
+		len = (*slots * sizeof(u32)) - sizeof(struct mei_msg_hdr);
+		mei_hdr.length = len;
+		mei_hdr.msg_complete = 0;
+	} else {
+		/* wait for next time the host buffer is empty */
+		return 0;
+	}
+
+	dev_dbg(&dev->pdev->dev, "buf: size = %d idx = %lu\n",
+			cb->request_buffer.size, cb->buf_idx);
+	dev_dbg(&dev->pdev->dev, MEI_HDR_FMT, MEI_HDR_PRM(&mei_hdr));
+
+	*slots -=  msg_slots;
+	if (mei_write_message(dev, &mei_hdr,
+			cb->request_buffer.data + cb->buf_idx)) {
+		cl->status = -ENODEV;
+		list_move_tail(&cb->list, &cmpl_list->list);
+		return -ENODEV;
+	}
+
+	cl->status = 0;
+	cl->writing_state = MEI_WRITING;
+	cb->buf_idx += mei_hdr.length;	/* advance past the bytes just sent */
+
+	if (mei_hdr.msg_complete) {
+		if (mei_cl_flow_ctrl_reduce(cl))
+			return -ENODEV;
+		list_move_tail(&cb->list, &dev->write_waiting_list.list);
+	}
+
+	return 0;
+}
+
+/**
  * mei_cl_write - submit a write cb to mei device
 	assumes device_lock is locked
  *
@@ -723,7 +784,6 @@
 		cb->buf_idx = 0;
 		/* unseting complete will enqueue the cb for write */
 		mei_hdr.msg_complete = 0;
-		cl->writing_state = MEI_WRITING;
 		rets = buf->size;
 		goto out;
 	}
@@ -785,6 +845,32 @@
 }
 
 
+/**
+ * mei_cl_complete - processes completed operation for a client
+ *
+ * @cl: client (private data of the file object); its state is advanced.
+ * @cb: callback block; passed to mei_io_cb_free() when it is a write.
+ */
+void mei_cl_complete(struct mei_cl *cl, struct mei_cl_cb *cb)
+{
+	if (cb->fop_type == MEI_FOP_WRITE) {
+		mei_io_cb_free(cb);
+		cb = NULL;	/* cb was handed to mei_io_cb_free() */
+		cl->writing_state = MEI_WRITE_COMPLETE;
+		if (waitqueue_active(&cl->tx_wait))
+			wake_up_interruptible(&cl->tx_wait);
+
+	} else if (cb->fop_type == MEI_FOP_READ &&
+			MEI_READING == cl->reading_state) {
+		cl->reading_state = MEI_READ_COMPLETE;
+		if (waitqueue_active(&cl->rx_wait))
+			wake_up_interruptible(&cl->rx_wait);
+		else	/* no active waiter on rx_wait */
+			mei_cl_bus_rx_event(cl);
+
+	}
+}
+
 
 /**
  * mei_cl_all_disconnect - disconnect forcefully all connected clients

diff --git a/drivers/misc/mei/client.h b/drivers/misc/mei/client.h
index cfdb144..26b157d 100644
--- a/drivers/misc/mei/client.h
+++ b/drivers/misc/mei/client.h

@@ -89,6 +89,10 @@
 int mei_cl_connect(struct mei_cl *cl, struct file *file);
 int mei_cl_read_start(struct mei_cl *cl, size_t length);
 int mei_cl_write(struct mei_cl *cl, struct mei_cl_cb *cb, bool blocking);
+int mei_cl_irq_write_complete(struct mei_cl *cl, struct mei_cl_cb *cb,
+				s32 *slots, struct mei_cl_cb *cmpl_list);
+
+void mei_cl_complete(struct mei_cl *cl, struct mei_cl_cb *cb);
 
 void mei_host_client_init(struct work_struct *work);
 

diff --git a/drivers/misc/mei/hbm.c b/drivers/misc/mei/hbm.c
index 6916045..565027b 100644
--- a/drivers/misc/mei/hbm.c
+++ b/drivers/misc/mei/hbm.c

@@ -536,6 +536,20 @@
 
 
 /**
+ * mei_hbm_version_is_supported - checks whether the driver can
+ *     support the hbm version of the device
+ *
+ * @dev: the device structure; only dev->version is read
+ * returns true if dev->version <= HBM_{MAJOR,MINOR}_VERSION (major first)
+ */
+bool mei_hbm_version_is_supported(struct mei_device *dev)
+{
+	return	(dev->version.major_version < HBM_MAJOR_VERSION) ||
+		(dev->version.major_version == HBM_MAJOR_VERSION &&
+		 dev->version.minor_version <= HBM_MINOR_VERSION);
+}
+
+/**
  * mei_hbm_dispatch - bottom half read routine after ISR to
  * handle the read bus message cmd processing.
  *
@@ -562,9 +576,24 @@
 	switch (mei_msg->hbm_cmd) {
 	case HOST_START_RES_CMD:
 		version_res = (struct hbm_host_version_response *)mei_msg;
-		if (!version_res->host_version_supported) {
-			dev->version = version_res->me_max_version;
-			dev_dbg(&dev->pdev->dev, "version mismatch.\n");
+
+		dev_dbg(&dev->pdev->dev, "HBM VERSION: DRIVER=%02d:%02d DEVICE=%02d:%02d\n",
+				HBM_MAJOR_VERSION, HBM_MINOR_VERSION,
+				version_res->me_max_version.major_version,
+				version_res->me_max_version.minor_version);
+
+		if (version_res->host_version_supported) {
+			dev->version.major_version = HBM_MAJOR_VERSION;
+			dev->version.minor_version = HBM_MINOR_VERSION;
+		} else {
+			dev->version.major_version =
+				version_res->me_max_version.major_version;
+			dev->version.minor_version =
+				version_res->me_max_version.minor_version;
+		}
+
+		if (!mei_hbm_version_is_supported(dev)) {
+			dev_warn(&dev->pdev->dev, "hbm version mismatch: stopping the driver.\n");
 
 			dev->hbm_state = MEI_HBM_STOP;
 			mei_hbm_stop_req_prepare(dev, &dev->wr_msg.hdr,
@@ -575,8 +604,6 @@
 			return;
 		}
 
-		dev->version.major_version = HBM_MAJOR_VERSION;
-		dev->version.minor_version = HBM_MINOR_VERSION;
 		if (dev->dev_state == MEI_DEV_INIT_CLIENTS &&
 		    dev->hbm_state == MEI_HBM_START) {
 			dev->init_clients_timer = 0;

diff --git a/drivers/misc/mei/hbm.h b/drivers/misc/mei/hbm.h
index e80dc24..4ae2e56 100644
--- a/drivers/misc/mei/hbm.h
+++ b/drivers/misc/mei/hbm.h

@@ -54,7 +54,7 @@
 int mei_hbm_cl_flow_control_req(struct mei_device *dev, struct mei_cl *cl);
 int mei_hbm_cl_disconnect_req(struct mei_device *dev, struct mei_cl *cl);
 int mei_hbm_cl_connect_req(struct mei_device *dev, struct mei_cl *cl);
-
+bool mei_hbm_version_is_supported(struct mei_device *dev);
 
 #endif /* _MEI_HBM_H_ */
 

diff --git a/drivers/misc/mei/hw-me.c b/drivers/misc/mei/hw-me.c
index 822170f..e4f8dec 100644
--- a/drivers/misc/mei/hw-me.c
+++ b/drivers/misc/mei/hw-me.c

@@ -171,7 +171,7 @@
  * @dev: the device structure
  * @intr_enable: if interrupt should be enabled after reset.
  */
-static void mei_me_hw_reset(struct mei_device *dev, bool intr_enable)
+static int mei_me_hw_reset(struct mei_device *dev, bool intr_enable)
 {
 	struct mei_me_hw *hw = to_me_hw(dev);
 	u32 hcsr = mei_hcsr_read(hw);
@@ -191,6 +191,7 @@
 		mei_me_hw_reset_release(dev);
 
 	dev_dbg(&dev->pdev->dev, "current HCSR = 0x%08x.\n", mei_hcsr_read(hw));
+	return 0;
 }
 
 /**

diff --git a/drivers/misc/mei/init.c b/drivers/misc/mei/init.c
index f580d30..6fc573c 100644
--- a/drivers/misc/mei/init.c
+++ b/drivers/misc/mei/init.c

@@ -106,8 +106,7 @@
 		goto err;
 	}
 
-	if (dev->version.major_version != HBM_MAJOR_VERSION ||
-	    dev->version.minor_version != HBM_MINOR_VERSION) {
+	if (!mei_hbm_version_is_supported(dev)) {
 		dev_dbg(&dev->pdev->dev, "MEI start failed.\n");
 		goto err;
 	}
@@ -133,13 +132,19 @@
 void mei_reset(struct mei_device *dev, int interrupts_enabled)
 {
 	bool unexpected;
+	int ret;
 
 	unexpected = (dev->dev_state != MEI_DEV_INITIALIZING &&
 			dev->dev_state != MEI_DEV_DISABLED &&
 			dev->dev_state != MEI_DEV_POWER_DOWN &&
 			dev->dev_state != MEI_DEV_POWER_UP);
 
-	mei_hw_reset(dev, interrupts_enabled);
+	ret = mei_hw_reset(dev, interrupts_enabled);
+	if (ret) {
+		dev_err(&dev->pdev->dev, "hw reset failed disabling the device\n");
+		interrupts_enabled = false;
+		dev->dev_state = MEI_DEV_DISABLED;
+	}
 
 	dev->hbm_state = MEI_HBM_IDLE;
 
@@ -176,7 +181,12 @@
 		return;
 	}
 
-	mei_hw_start(dev);
+	ret = mei_hw_start(dev);
+	if (ret) {
+		dev_err(&dev->pdev->dev, "hw_start failed disabling the device\n");
+		dev->dev_state = MEI_DEV_DISABLED;
+		return;
+	}
 
 	dev_dbg(&dev->pdev->dev, "link is established start sending messages.\n");
 	/* link is established * start sending messages.  */

diff --git a/drivers/misc/mei/interrupt.c b/drivers/misc/mei/interrupt.c
index 2ad7369..4b59cb7 100644
--- a/drivers/misc/mei/interrupt.c
+++ b/drivers/misc/mei/interrupt.c

@@ -31,32 +31,6 @@
 
 
 /**
- * mei_cl_complete_handler - processes completed operation for a client
- *
- * @cl: private data of the file object.
- * @cb: callback block.
- */
-static void mei_cl_complete_handler(struct mei_cl *cl, struct mei_cl_cb *cb)
-{
-	if (cb->fop_type == MEI_FOP_WRITE) {
-		mei_io_cb_free(cb);
-		cb = NULL;
-		cl->writing_state = MEI_WRITE_COMPLETE;
-		if (waitqueue_active(&cl->tx_wait))
-			wake_up_interruptible(&cl->tx_wait);
-
-	} else if (cb->fop_type == MEI_FOP_READ &&
-			MEI_READING == cl->reading_state) {
-		cl->reading_state = MEI_READ_COMPLETE;
-		if (waitqueue_active(&cl->rx_wait))
-			wake_up_interruptible(&cl->rx_wait);
-		else
-			mei_cl_bus_rx_event(cl);
-
-	}
-}
-
-/**
  * mei_irq_compl_handler - dispatch complete handelers
  *	for the completed callbacks
  *
@@ -78,7 +52,7 @@
 		if (cl == &dev->iamthif_cl)
 			mei_amthif_complete(dev, cb);
 		else
-			mei_cl_complete_handler(cl, cb);
+			mei_cl_complete(cl, cb);
 	}
 }
 EXPORT_SYMBOL_GPL(mei_irq_compl_handler);
@@ -189,21 +163,21 @@
 }
 
 /**
- * _mei_irq_thread_close - processes close related operation.
+ * mei_cl_irq_close - processes close related operation from
+ *	interrupt thread context - send disconnect request
  *
- * @dev: the device structure.
+ * @cl: client
+ * @cb: callback block.
  * @slots: free slots.
- * @cb_pos: callback block.
- * @cl: private data of the file object.
  * @cmpl_list: complete list.
  *
  * returns 0, OK; otherwise, error.
  */
-static int _mei_irq_thread_close(struct mei_device *dev, s32 *slots,
-				struct mei_cl_cb *cb_pos,
-				struct mei_cl *cl,
-				struct mei_cl_cb *cmpl_list)
+static int mei_cl_irq_close(struct mei_cl *cl, struct mei_cl_cb *cb,
+			s32 *slots, struct mei_cl_cb *cmpl_list)
 {
+	struct mei_device *dev = cl->dev;
+
 	u32 msg_slots =
 		mei_data2slots(sizeof(struct hbm_client_connect_request));
 
@@ -214,15 +188,15 @@
 
 	if (mei_hbm_cl_disconnect_req(dev, cl)) {
 		cl->status = 0;
-		cb_pos->buf_idx = 0;
-		list_move_tail(&cb_pos->list, &cmpl_list->list);
+		cb->buf_idx = 0;
+		list_move_tail(&cb->list, &cmpl_list->list);
 		return -EIO;
 	}
 
 	cl->state = MEI_FILE_DISCONNECTING;
 	cl->status = 0;
-	cb_pos->buf_idx = 0;
-	list_move_tail(&cb_pos->list, &dev->ctrl_rd_list.list);
+	cb->buf_idx = 0;
+	list_move_tail(&cb->list, &dev->ctrl_rd_list.list);
 	cl->timer_count = MEI_CONNECT_TIMEOUT;
 
 	return 0;
@@ -230,26 +204,26 @@
 
 
 /**
- * _mei_irq_thread_read - processes read related operation.
+ * mei_cl_irq_read - processes client read related operation from the
+ *	interrupt thread context - request for flow control credits
  *
- * @dev: the device structure.
+ * @cl: client
+ * @cb: callback block.
  * @slots: free slots.
- * @cb_pos: callback block.
- * @cl: private data of the file object.
  * @cmpl_list: complete list.
  *
  * returns 0, OK; otherwise, error.
  */
-static int _mei_irq_thread_read(struct mei_device *dev,	s32 *slots,
-			struct mei_cl_cb *cb_pos,
-			struct mei_cl *cl,
-			struct mei_cl_cb *cmpl_list)
+static int mei_cl_irq_read(struct mei_cl *cl, struct mei_cl_cb *cb,
+			   s32 *slots, struct mei_cl_cb *cmpl_list)
 {
+	struct mei_device *dev = cl->dev;
+
 	u32 msg_slots = mei_data2slots(sizeof(struct hbm_flow_control));
 
 	if (*slots < msg_slots) {
 		/* return the cancel routine */
-		list_del(&cb_pos->list);
+		list_del(&cb->list);
 		return -EMSGSIZE;
 	}
 
@@ -257,38 +231,38 @@
 
 	if (mei_hbm_cl_flow_control_req(dev, cl)) {
 		cl->status = -ENODEV;
-		cb_pos->buf_idx = 0;
-		list_move_tail(&cb_pos->list, &cmpl_list->list);
+		cb->buf_idx = 0;
+		list_move_tail(&cb->list, &cmpl_list->list);
 		return -ENODEV;
 	}
-	list_move_tail(&cb_pos->list, &dev->read_list.list);
+	list_move_tail(&cb->list, &dev->read_list.list);
 
 	return 0;
 }
 
 
 /**
- * _mei_irq_thread_ioctl - processes ioctl related operation.
+ * mei_cl_irq_ioctl - processes client ioctl related operation from the
+ *	interrupt thread context -   send connection request
  *
- * @dev: the device structure.
+ * @cl: client
+ * @cb: callback block.
  * @slots: free slots.
- * @cb_pos: callback block.
- * @cl: private data of the file object.
  * @cmpl_list: complete list.
  *
  * returns 0, OK; otherwise, error.
  */
-static int _mei_irq_thread_ioctl(struct mei_device *dev, s32 *slots,
-			struct mei_cl_cb *cb_pos,
-			struct mei_cl *cl,
-			struct mei_cl_cb *cmpl_list)
+static int mei_cl_irq_ioctl(struct mei_cl *cl, struct mei_cl_cb *cb,
+			   s32 *slots, struct mei_cl_cb *cmpl_list)
 {
+	struct mei_device *dev = cl->dev;
+
 	u32 msg_slots =
 		mei_data2slots(sizeof(struct hbm_client_connect_request));
 
 	if (*slots < msg_slots) {
 		/* return the cancel routine */
-		list_del(&cb_pos->list);
+		list_del(&cb->list);
 		return -EMSGSIZE;
 	}
 
@@ -298,76 +272,17 @@
 
 	if (mei_hbm_cl_connect_req(dev, cl)) {
 		cl->status = -ENODEV;
-		cb_pos->buf_idx = 0;
-		list_del(&cb_pos->list);
-		return -ENODEV;
-	} else {
-		list_move_tail(&cb_pos->list, &dev->ctrl_rd_list.list);
-		cl->timer_count = MEI_CONNECT_TIMEOUT;
-	}
-	return 0;
-}
-
-/**
- * mei_irq_thread_write_complete - write messages to device.
- *
- * @dev: the device structure.
- * @slots: free slots.
- * @cb: callback block.
- * @cmpl_list: complete list.
- *
- * returns 0, OK; otherwise, error.
- */
-static int mei_irq_thread_write_complete(struct mei_device *dev, s32 *slots,
-			struct mei_cl_cb *cb, struct mei_cl_cb *cmpl_list)
-{
-	struct mei_msg_hdr mei_hdr;
-	struct mei_cl *cl = cb->cl;
-	size_t len = cb->request_buffer.size - cb->buf_idx;
-	u32 msg_slots = mei_data2slots(len);
-
-	mei_hdr.host_addr = cl->host_client_id;
-	mei_hdr.me_addr = cl->me_client_id;
-	mei_hdr.reserved = 0;
-
-	if (*slots >= msg_slots) {
-		mei_hdr.length = len;
-		mei_hdr.msg_complete = 1;
-	/* Split the message only if we can write the whole host buffer */
-	} else if (*slots == dev->hbuf_depth) {
-		msg_slots = *slots;
-		len = (*slots * sizeof(u32)) - sizeof(struct mei_msg_hdr);
-		mei_hdr.length = len;
-		mei_hdr.msg_complete = 0;
-	} else {
-		/* wait for next time the host buffer is empty */
-		return 0;
-	}
-
-	dev_dbg(&dev->pdev->dev, "buf: size = %d idx = %lu\n",
-			cb->request_buffer.size, cb->buf_idx);
-	dev_dbg(&dev->pdev->dev, MEI_HDR_FMT, MEI_HDR_PRM(&mei_hdr));
-
-	*slots -=  msg_slots;
-	if (mei_write_message(dev, &mei_hdr,
-			cb->request_buffer.data + cb->buf_idx)) {
-		cl->status = -ENODEV;
-		list_move_tail(&cb->list, &cmpl_list->list);
+		cb->buf_idx = 0;
+		list_del(&cb->list);
 		return -ENODEV;
 	}
 
-
-	cl->status = 0;
-	cb->buf_idx += mei_hdr.length;
-	if (mei_hdr.msg_complete) {
-		if (mei_cl_flow_ctrl_reduce(cl))
-			return -ENODEV;
-		list_move_tail(&cb->list, &dev->write_waiting_list.list);
-	}
-
+	list_move_tail(&cb->list, &dev->ctrl_rd_list.list);
+	cl->timer_count = MEI_CONNECT_TIMEOUT;
 	return 0;
 }
 
+
 /**
  * mei_irq_read_handler - bottom half read routine after ISR to
  * handle the read processing.
@@ -481,7 +396,7 @@
 {
 
 	struct mei_cl *cl;
-	struct mei_cl_cb *pos = NULL, *next = NULL;
+	struct mei_cl_cb *cb, *next;
 	struct mei_cl_cb *list;
 	s32 slots;
 	int ret;
@@ -498,19 +413,19 @@
 	dev_dbg(&dev->pdev->dev, "complete all waiting for write cb.\n");
 
 	list = &dev->write_waiting_list;
-	list_for_each_entry_safe(pos, next, &list->list, list) {
-		cl = pos->cl;
+	list_for_each_entry_safe(cb, next, &list->list, list) {
+		cl = cb->cl;
 		if (cl == NULL)
 			continue;
 
 		cl->status = 0;
-		list_del(&pos->list);
+		list_del(&cb->list);
 		if (MEI_WRITING == cl->writing_state &&
-		    pos->fop_type == MEI_FOP_WRITE &&
+		    cb->fop_type == MEI_FOP_WRITE &&
 		    cl != &dev->iamthif_cl) {
 			dev_dbg(&dev->pdev->dev, "MEI WRITE COMPLETE\n");
 			cl->writing_state = MEI_WRITE_COMPLETE;
-			list_add_tail(&pos->list, &cmpl_list->list);
+			list_add_tail(&cb->list, &cmpl_list->list);
 		}
 		if (cl == &dev->iamthif_cl) {
 			dev_dbg(&dev->pdev->dev, "check iamthif flow control.\n");
@@ -552,25 +467,23 @@
 
 	/* complete control write list CB */
 	dev_dbg(&dev->pdev->dev, "complete control write list cb.\n");
-	list_for_each_entry_safe(pos, next, &dev->ctrl_wr_list.list, list) {
-		cl = pos->cl;
+	list_for_each_entry_safe(cb, next, &dev->ctrl_wr_list.list, list) {
+		cl = cb->cl;
 		if (!cl) {
-			list_del(&pos->list);
+			list_del(&cb->list);
 			return -ENODEV;
 		}
-		switch (pos->fop_type) {
+		switch (cb->fop_type) {
 		case MEI_FOP_CLOSE:
 			/* send disconnect message */
-			ret = _mei_irq_thread_close(dev, &slots, pos,
-						cl, cmpl_list);
+			ret = mei_cl_irq_close(cl, cb, &slots, cmpl_list);
 			if (ret)
 				return ret;
 
 			break;
 		case MEI_FOP_READ:
 			/* send flow control message */
-			ret = _mei_irq_thread_read(dev, &slots, pos,
-						cl, cmpl_list);
+			ret = mei_cl_irq_read(cl, cb, &slots, cmpl_list);
 			if (ret)
 				return ret;
 
@@ -579,8 +492,7 @@
 			/* connect message */
 			if (mei_cl_is_other_connecting(cl))
 				continue;
-			ret = _mei_irq_thread_ioctl(dev, &slots, pos,
-						cl, cmpl_list);
+			ret = mei_cl_irq_ioctl(cl, cb, &slots, cmpl_list);
 			if (ret)
 				return ret;
 
@@ -593,8 +505,8 @@
 	}
 	/* complete  write list CB */
 	dev_dbg(&dev->pdev->dev, "complete write list cb.\n");
-	list_for_each_entry_safe(pos, next, &dev->write_list.list, list) {
-		cl = pos->cl;
+	list_for_each_entry_safe(cb, next, &dev->write_list.list, list) {
+		cl = cb->cl;
 		if (cl == NULL)
 			continue;
 		if (mei_cl_flow_ctrl_creds(cl) <= 0) {
@@ -605,14 +517,13 @@
 		}
 
 		if (cl == &dev->iamthif_cl)
-			ret = mei_amthif_irq_write_complete(dev, &slots,
-							pos, cmpl_list);
+			ret = mei_amthif_irq_write_complete(cl, cb,
+						&slots, cmpl_list);
 		else
-			ret = mei_irq_thread_write_complete(dev, &slots, pos,
-						cmpl_list);
+			ret = mei_cl_irq_write_complete(cl, cb,
+						&slots, cmpl_list);
 		if (ret)
 			return ret;
-
 	}
 	return 0;
 }

diff --git a/drivers/misc/mei/main.c b/drivers/misc/mei/main.c
index 053139f..5e11b5b 100644
--- a/drivers/misc/mei/main.c
+++ b/drivers/misc/mei/main.c

@@ -194,7 +194,6 @@
 	struct mei_cl_cb *cb_pos = NULL;
 	struct mei_cl_cb *cb = NULL;
 	struct mei_device *dev;
-	int i;
 	int rets;
 	int err;
 
@@ -210,38 +209,26 @@
 		goto out;
 	}
 
-	if ((cl->sm_state & MEI_WD_STATE_INDEPENDENCE_MSG_SENT) == 0) {
-		/* Do not allow to read watchdog client */
-		i = mei_me_cl_by_uuid(dev, &mei_wd_guid);
-		if (i >= 0) {
-			struct mei_me_client *me_client = &dev->me_clients[i];
-			if (cl->me_client_id == me_client->client_id) {
-				rets = -EBADF;
-				goto out;
-			}
-		}
-	} else {
-		cl->sm_state &= ~MEI_WD_STATE_INDEPENDENCE_MSG_SENT;
-	}
-
 	if (cl == &dev->iamthif_cl) {
 		rets = mei_amthif_read(dev, file, ubuf, length, offset);
 		goto out;
 	}
 
-	if (cl->read_cb && cl->read_cb->buf_idx > *offset) {
+	if (cl->read_cb) {
 		cb = cl->read_cb;
-		goto copy_buffer;
-	} else if (cl->read_cb && cl->read_cb->buf_idx > 0 &&
-		   cl->read_cb->buf_idx <= *offset) {
-		cb = cl->read_cb;
-		rets = 0;
-		goto free;
-	} else if ((!cl->read_cb || !cl->read_cb->buf_idx) && *offset > 0) {
-		/*Offset needs to be cleaned for contiguous reads*/
+		/* read what is left */
+		if (cb->buf_idx > *offset)
+			goto copy_buffer;
+		/* offset is at or beyond buf_idx: no more data, return 0 */
+		if (cb->buf_idx > 0 && cb->buf_idx <= *offset) {
+			rets = 0;
+			goto free;
+		}
+		/* Offset needs to be reset for contiguous reads */
+		if (cb->buf_idx == 0 && *offset > 0)
+			*offset = 0;
+	} else if (*offset > 0) {
 		*offset = 0;
-		rets = 0;
-		goto out;
 	}
 
 	err = mei_cl_read_start(cl, length);
@@ -420,16 +407,6 @@
 	if (rets)
 		goto out;
 
-	cl->sm_state = 0;
-	if (length == 4 &&
-	    ((memcmp(mei_wd_state_independence_msg[0],
-				 write_cb->request_buffer.data, 4) == 0) ||
-	     (memcmp(mei_wd_state_independence_msg[1],
-				 write_cb->request_buffer.data, 4) == 0) ||
-	     (memcmp(mei_wd_state_independence_msg[2],
-				 write_cb->request_buffer.data, 4) == 0)))
-		cl->sm_state |= MEI_WD_STATE_INDEPENDENCE_MSG_SENT;
-
 	if (cl == &dev->iamthif_cl) {
 		rets = mei_amthif_write(dev, write_cb);
 

diff --git a/drivers/misc/mei/mei_dev.h b/drivers/misc/mei/mei_dev.h
index 4de5140..7b918b2 100644
--- a/drivers/misc/mei/mei_dev.h
+++ b/drivers/misc/mei/mei_dev.h

@@ -56,11 +56,6 @@
 extern const uuid_le mei_wd_guid;
 
 /*
- * Watchdog independence state message
- */
-extern const u8 mei_wd_state_independence_msg[3][4];
-
-/*
  * Number of Maximum MEI Clients
  */
 #define MEI_CLIENTS_MAX 256
@@ -201,7 +196,6 @@
 	u8 timer_count;
 	enum mei_file_transaction_states reading_state;
 	enum mei_file_transaction_states writing_state;
-	int sm_state;
 	struct mei_cl_cb *read_cb;
 
 	/* MEI CL bus data */
@@ -239,7 +233,7 @@
 	bool (*host_is_ready) (struct mei_device *dev);
 
 	bool (*hw_is_ready) (struct mei_device *dev);
-	void (*hw_reset) (struct mei_device *dev, bool enable);
+	int (*hw_reset) (struct mei_device *dev, bool enable);
 	int  (*hw_start) (struct mei_device *dev);
 	void (*hw_config) (struct mei_device *dev);
 
@@ -502,8 +496,8 @@
 
 void mei_amthif_run_next_cmd(struct mei_device *dev);
 
-int mei_amthif_irq_write_complete(struct mei_device *dev, s32 *slots,
-			struct mei_cl_cb *cb, struct mei_cl_cb *cmpl_list);
+int mei_amthif_irq_write_complete(struct mei_cl *cl, struct mei_cl_cb *cb,
+				  s32 *slots, struct mei_cl_cb *cmpl_list);
 
 void mei_amthif_complete(struct mei_device *dev, struct mei_cl_cb *cb);
 int mei_amthif_irq_read_msg(struct mei_device *dev,
@@ -522,15 +516,6 @@
  */
 extern const uuid_le mei_nfc_guid;
 
-int mei_amthif_irq_write_complete(struct mei_device *dev, s32 *slots,
-			struct mei_cl_cb *cb, struct mei_cl_cb *cmpl_list);
-
-void mei_amthif_complete(struct mei_device *dev, struct mei_cl_cb *cb);
-int mei_amthif_irq_read_message(struct mei_cl_cb *complete_list,
-		struct mei_device *dev, struct mei_msg_hdr *mei_hdr);
-int mei_amthif_irq_read(struct mei_device *dev, s32 *slots);
-
-
 int mei_wd_send(struct mei_device *dev);
 int mei_wd_stop(struct mei_device *dev);
 int mei_wd_host_init(struct mei_device *dev);
@@ -554,14 +539,14 @@
 {
 	dev->ops->hw_config(dev);
 }
-static inline void mei_hw_reset(struct mei_device *dev, bool enable)
+static inline int mei_hw_reset(struct mei_device *dev, bool enable)
 {
-	dev->ops->hw_reset(dev, enable);
+	return dev->ops->hw_reset(dev, enable);
 }
 
-static inline void mei_hw_start(struct mei_device *dev)
+static inline int mei_hw_start(struct mei_device *dev)
 {
-	dev->ops->hw_start(dev);
+	return dev->ops->hw_start(dev);
 }
 
 static inline void mei_clear_interrupts(struct mei_device *dev)

diff --git a/drivers/misc/mei/pci-me.c b/drivers/misc/mei/pci-me.c
index 0f26832..1b3844e8 100644
--- a/drivers/misc/mei/pci-me.c
+++ b/drivers/misc/mei/pci-me.c

@@ -43,9 +43,6 @@
 #include "hw-me.h"
 #include "client.h"
 
-/* AMT device is a singleton on the platform */
-static struct pci_dev *mei_pdev;
-
 /* mei_pci_tbl - PCI Device ID Table */
 static DEFINE_PCI_DEVICE_TABLE(mei_me_pci_tbl) = {
 	{PCI_DEVICE(PCI_VENDOR_ID_INTEL, MEI_DEV_ID_82946GZ)},
@@ -88,8 +85,6 @@
 
 MODULE_DEVICE_TABLE(pci, mei_me_pci_tbl);
 
-static DEFINE_MUTEX(mei_mutex);
-
 /**
  * mei_quirk_probe - probe for devices that doesn't valid ME interface
  *
@@ -126,17 +121,12 @@
 	struct mei_me_hw *hw;
 	int err;
 
-	mutex_lock(&mei_mutex);
 
 	if (!mei_me_quirk_probe(pdev, ent)) {
 		err = -ENODEV;
 		goto end;
 	}
 
-	if (mei_pdev) {
-		err = -EEXIST;
-		goto end;
-	}
 	/* enable pci dev */
 	err = pci_enable_device(pdev);
 	if (err) {
@@ -195,13 +185,10 @@
 	if (err)
 		goto release_irq;
 
-	mei_pdev = pdev;
 	pci_set_drvdata(pdev, dev);
 
 	schedule_delayed_work(&dev->timer_work, HZ);
 
-	mutex_unlock(&mei_mutex);
-
 	pr_debug("initialization successful.\n");
 
 	return 0;
@@ -220,7 +207,6 @@
 disable_device:
 	pci_disable_device(pdev);
 end:
-	mutex_unlock(&mei_mutex);
 	dev_err(&pdev->dev, "initialization failed.\n");
 	return err;
 }
@@ -238,9 +224,6 @@
 	struct mei_device *dev;
 	struct mei_me_hw *hw;
 
-	if (mei_pdev != pdev)
-		return;
-
 	dev = pci_get_drvdata(pdev);
 	if (!dev)
 		return;
@@ -251,8 +234,6 @@
 	dev_err(&pdev->dev, "stop\n");
 	mei_stop(dev);
 
-	mei_pdev = NULL;
-
 	/* disable interrupts */
 	mei_disable_interrupts(dev);
 

diff --git a/drivers/misc/mei/wd.c b/drivers/misc/mei/wd.c
index 6251a4e..b892143 100644
--- a/drivers/misc/mei/wd.c
+++ b/drivers/misc/mei/wd.c

@@ -31,12 +31,6 @@
 static const u8 mei_start_wd_params[] = { 0x02, 0x12, 0x13, 0x10 };
 static const u8 mei_stop_wd_params[] = { 0x02, 0x02, 0x14, 0x10 };
 
-const u8 mei_wd_state_independence_msg[3][4] = {
-	{0x05, 0x02, 0x51, 0x10},
-	{0x05, 0x02, 0x52, 0x10},
-	{0x07, 0x02, 0x01, 0x10}
-};
-
 /*
  * AMT Watchdog Device
  */

diff --git a/drivers/misc/pch_phub.c b/drivers/misc/pch_phub.c
index 931e635..a5925f7f 100644
--- a/drivers/misc/pch_phub.c
+++ b/drivers/misc/pch_phub.c

@@ -633,17 +633,13 @@
 static ssize_t store_pch_mac(struct device *dev, struct device_attribute *attr,
 			     const char *buf, size_t count)
 {
-	u8 mac[6];
+	u8 mac[ETH_ALEN];
 	ssize_t rom_size;
 	struct pch_phub_reg *chip = dev_get_drvdata(dev);
 
-	if (count != 18)
+	if (!mac_pton(buf, mac))
 		return -EINVAL;
 
-	sscanf(buf, "%02x:%02x:%02x:%02x:%02x:%02x",
-		(u32 *)&mac[0], (u32 *)&mac[1], (u32 *)&mac[2], (u32 *)&mac[3],
-		(u32 *)&mac[4], (u32 *)&mac[5]);
-
 	chip->pch_phub_extrom_base_address = pci_map_rom(chip->pdev, &rom_size);
 	if (!chip->pch_phub_extrom_base_address)
 		return -ENOMEM;
@@ -669,8 +665,6 @@
 static int pch_phub_probe(struct pci_dev *pdev,
 				    const struct pci_device_id *id)
 {
-	int retval;
-
 	int ret;
 	struct pch_phub_reg *chip;
 
@@ -713,13 +707,13 @@
 	if (id->driver_data == 1) { /* EG20T PCH */
 		const char *board_name;
 
-		retval = sysfs_create_file(&pdev->dev.kobj,
-					   &dev_attr_pch_mac.attr);
-		if (retval)
+		ret = sysfs_create_file(&pdev->dev.kobj,
+					&dev_attr_pch_mac.attr);
+		if (ret)
 			goto err_sysfs_create;
 
-		retval = sysfs_create_bin_file(&pdev->dev.kobj, &pch_bin_attr);
-		if (retval)
+		ret = sysfs_create_bin_file(&pdev->dev.kobj, &pch_bin_attr);
+		if (ret)
 			goto exit_bin_attr;
 
 		pch_phub_read_modify_write_reg(chip,
@@ -743,8 +737,8 @@
 		chip->pch_opt_rom_start_address = PCH_PHUB_ROM_START_ADDR_EG20T;
 		chip->pch_mac_start_address = PCH_PHUB_MAC_START_ADDR_EG20T;
 	} else if (id->driver_data == 2) { /* ML7213 IOH */
-		retval = sysfs_create_bin_file(&pdev->dev.kobj, &pch_bin_attr);
-		if (retval)
+		ret = sysfs_create_bin_file(&pdev->dev.kobj, &pch_bin_attr);
+		if (ret)
 			goto err_sysfs_create;
 		/* set the prefech value
 		 * Device2(USB OHCI #1/ USB EHCI #1/ USB Device):a
@@ -766,12 +760,12 @@
 						 PCH_PHUB_ROM_START_ADDR_ML7223;
 		chip->pch_mac_start_address = PCH_PHUB_MAC_START_ADDR_ML7223;
 	} else if (id->driver_data == 4) { /* ML7223 IOH Bus-n*/
-		retval = sysfs_create_file(&pdev->dev.kobj,
-					   &dev_attr_pch_mac.attr);
-		if (retval)
+		ret = sysfs_create_file(&pdev->dev.kobj,
+					&dev_attr_pch_mac.attr);
+		if (ret)
 			goto err_sysfs_create;
-		retval = sysfs_create_bin_file(&pdev->dev.kobj, &pch_bin_attr);
-		if (retval)
+		ret = sysfs_create_bin_file(&pdev->dev.kobj, &pch_bin_attr);
+		if (ret)
 			goto exit_bin_attr;
 		/* set the prefech value
 		 * Device2(USB OHCI #0,1,2,3/ USB EHCI #0):a
@@ -783,13 +777,13 @@
 						 PCH_PHUB_ROM_START_ADDR_ML7223;
 		chip->pch_mac_start_address = PCH_PHUB_MAC_START_ADDR_ML7223;
 	} else if (id->driver_data == 5) { /* ML7831 */
-		retval = sysfs_create_file(&pdev->dev.kobj,
-					   &dev_attr_pch_mac.attr);
-		if (retval)
+		ret = sysfs_create_file(&pdev->dev.kobj,
+					&dev_attr_pch_mac.attr);
+		if (ret)
 			goto err_sysfs_create;
 
-		retval = sysfs_create_bin_file(&pdev->dev.kobj, &pch_bin_attr);
-		if (retval)
+		ret = sysfs_create_bin_file(&pdev->dev.kobj, &pch_bin_attr);
+		if (ret)
 			goto exit_bin_attr;
 
 		/* set the prefech value */

diff --git a/drivers/misc/sgi-gru/gruprocfs.c b/drivers/misc/sgi-gru/gruprocfs.c
index 797d796..4f76359 100644
--- a/drivers/misc/sgi-gru/gruprocfs.c
+++ b/drivers/misc/sgi-gru/gruprocfs.c

@@ -160,15 +160,11 @@
 static ssize_t options_write(struct file *file, const char __user *userbuf,
 			     size_t count, loff_t *data)
 {
-	char buf[20];
+	int ret;
 
-	if (count >= sizeof(buf))
-		return -EINVAL;
-	if (copy_from_user(buf, userbuf, count))
-		return -EFAULT;
-	buf[count] = '\0';
-	if (strict_strtoul(buf, 0, &gru_options))
-		return -EINVAL;
+	ret = kstrtoul_from_user(userbuf, count, 0, &gru_options);
+	if (ret)
+		return ret;
 
 	return count;
 }

diff --git a/drivers/misc/spear13xx_pcie_gadget.c b/drivers/misc/spear13xx_pcie_gadget.c
index 7deb25d..2e13614 100644
--- a/drivers/misc/spear13xx_pcie_gadget.c
+++ b/drivers/misc/spear13xx_pcie_gadget.c

@@ -316,8 +316,12 @@
 		struct spear_pcie_gadget_config *config,
 		const char *buf, size_t count)
 {
-	if (strict_strtoul(buf, 0, &config->requested_msi))
-		return -EINVAL;
+	int ret;
+
+	ret = kstrtoul(buf, 0, &config->requested_msi);
+	if (ret)
+		return ret;
+
 	if (config->requested_msi > 32)
 		config->requested_msi = 32;
 
@@ -330,9 +334,11 @@
 {
 	struct pcie_app_reg __iomem *app_reg = config->va_app_base;
 	ulong en;
+	int ret;
 
-	if (strict_strtoul(buf, 0, &en))
-		return -EINVAL;
+	ret = kstrtoul(buf, 0, &en);
+	if (ret)
+		return ret;
 
 	if (en)
 		writel(readl(&app_reg->app_ctrl_0) | (1 << SYS_INT_ID),
@@ -351,9 +357,11 @@
 	struct pcie_app_reg __iomem *app_reg = config->va_app_base;
 	ulong vector;
 	u32 ven_msi;
+	int ret;
 
-	if (strict_strtoul(buf, 0, &vector))
-		return -EINVAL;
+	ret = kstrtoul(buf, 0, &vector);
+	if (ret)
+		return ret;
 
 	if (!config->configured_msi)
 		return -EINVAL;
@@ -395,9 +403,11 @@
 		const char *buf, size_t count)
 {
 	ulong id;
+	int ret;
 
-	if (strict_strtoul(buf, 0, &id))
-		return -EINVAL;
+	ret = kstrtoul(buf, 0, &id);
+	if (ret)
+		return ret;
 
 	spear_dbi_write_reg(config, PCI_VENDOR_ID, 2, id);
 
@@ -420,9 +430,11 @@
 		const char *buf, size_t count)
 {
 	ulong id;
+	int ret;
 
-	if (strict_strtoul(buf, 0, &id))
-		return -EINVAL;
+	ret = kstrtoul(buf, 0, &id);
+	if (ret)
+		return ret;
 
 	spear_dbi_write_reg(config, PCI_DEVICE_ID, 2, id);
 
@@ -443,9 +455,12 @@
 	ulong size;
 	u32 pos, pos1;
 	u32 no_of_bit = 0;
+	int ret;
 
-	if (strict_strtoul(buf, 0, &size))
-		return -EINVAL;
+	ret = kstrtoul(buf, 0, &size);
+	if (ret)
+		return ret;
+
 	/* min bar size is 256 */
 	if (size <= 0x100)
 		size = 0x100;
@@ -490,9 +505,11 @@
 {
 	struct pcie_app_reg __iomem *app_reg = config->va_app_base;
 	ulong address;
+	int ret;
 
-	if (strict_strtoul(buf, 0, &address))
-		return -EINVAL;
+	ret = kstrtoul(buf, 0, &address);
+	if (ret)
+		return ret;
 
 	address &= ~(config->bar0_size - 1);
 	if (config->va_bar0_address)
@@ -518,9 +535,11 @@
 		const char *buf, size_t count)
 {
 	ulong offset;
+	int ret;
 
-	if (strict_strtoul(buf, 0, &offset))
-		return -EINVAL;
+	ret = kstrtoul(buf, 0, &offset);
+	if (ret)
+		return ret;
 
 	if (offset % 4)
 		return -EINVAL;
@@ -549,9 +568,11 @@
 		const char *buf, size_t count)
 {
 	ulong data;
+	int ret;
 
-	if (strict_strtoul(buf, 0, &data))
-		return -EINVAL;
+	ret = kstrtoul(buf, 0, &data);
+	if (ret)
+		return ret;
 
 	if (!config->va_bar0_address)
 		return -ENOMEM;
@@ -776,7 +797,7 @@
 		goto err_iounmap_app;
 	}
 
-	dev_set_drvdata(&pdev->dev, target);
+	platform_set_drvdata(pdev, target);
 
 	irq = platform_get_irq(pdev, 0);
 	if (irq < 0) {
@@ -814,9 +835,11 @@
 		clk = clk_get_sys("pcie1", NULL);
 		if (IS_ERR(clk)) {
 			pr_err("%s:couldn't get clk for pcie1\n", __func__);
+			status = PTR_ERR(clk);
 			goto err_irq;
 		}
-		if (clk_enable(clk)) {
+		status = clk_enable(clk);
+		if (status) {
 			pr_err("%s:couldn't enable clk for pcie1\n", __func__);
 			goto err_irq;
 		}
@@ -828,9 +851,11 @@
 		clk = clk_get_sys("pcie2", NULL);
 		if (IS_ERR(clk)) {
 			pr_err("%s:couldn't get clk for pcie2\n", __func__);
+			status = PTR_ERR(clk);
 			goto err_irq;
 		}
-		if (clk_enable(clk)) {
+		status = clk_enable(clk);
+		if (status) {
 			pr_err("%s:couldn't enable clk for pcie2\n", __func__);
 			goto err_irq;
 		}
@@ -863,7 +888,7 @@
 	res0 = platform_get_resource(pdev, IORESOURCE_MEM, 0);
 	res1 = platform_get_resource(pdev, IORESOURCE_MEM, 1);
 	irq = platform_get_irq(pdev, 0);
-	target = dev_get_drvdata(&pdev->dev);
+	target = platform_get_drvdata(pdev);
 	config = &target->config;
 
 	free_irq(irq, NULL);

diff --git a/drivers/misc/sram.c b/drivers/misc/sram.c
index 437192e..d87cc91 100644
--- a/drivers/misc/sram.c
+++ b/drivers/misc/sram.c

@@ -45,15 +45,12 @@
 	int ret;
 
 	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	if (!res)
-		return -EINVAL;
+	virt_base = devm_ioremap_resource(&pdev->dev, res);
+	if (IS_ERR(virt_base))
+		return PTR_ERR(virt_base);
 
 	size = resource_size(res);
 
-	virt_base = devm_request_and_ioremap(&pdev->dev, res);
-	if (!virt_base)
-		return -EADDRNOTAVAIL;
-
 	sram = devm_kzalloc(&pdev->dev, sizeof(*sram), GFP_KERNEL);
 	if (!sram)
 		return -ENOMEM;

diff --git a/drivers/misc/ti-st/st_kim.c b/drivers/misc/ti-st/st_kim.c
index 83269f1..83907c7 100644
--- a/drivers/misc/ti-st/st_kim.c
+++ b/drivers/misc/ti-st/st_kim.c

@@ -680,7 +680,7 @@
 		*core_data = NULL;
 		return;
 	}
-	kim_gdata = dev_get_drvdata(&pdev->dev);
+	kim_gdata = platform_get_drvdata(pdev);
 	*core_data = kim_gdata->core_data;
 }
 
@@ -735,7 +735,7 @@
 		pr_err("no mem to allocate");
 		return -ENOMEM;
 	}
-	dev_set_drvdata(&pdev->dev, kim_gdata);
+	platform_set_drvdata(pdev, kim_gdata);
 
 	err = st_core_init(&kim_gdata->core_data);
 	if (err != 0) {
@@ -810,7 +810,7 @@
 	struct ti_st_plat_data	*pdata = pdev->dev.platform_data;
 	struct kim_data_s	*kim_gdata;
 
-	kim_gdata = dev_get_drvdata(&pdev->dev);
+	kim_gdata = platform_get_drvdata(pdev);
 
 	/* Free the Bluetooth/FM/GPIO
 	 * nShutdown gpio from the system

diff --git a/drivers/misc/ti_dac7512.c b/drivers/misc/ti_dac7512.c
index 1d86407..9b23722 100644
--- a/drivers/misc/ti_dac7512.c
+++ b/drivers/misc/ti_dac7512.c

@@ -33,9 +33,11 @@
 	struct spi_device *spi = to_spi_device(dev);
 	unsigned char tmp[2];
 	unsigned long val;
+	int ret;
 
-	if (strict_strtoul(buf, 10, &val) < 0)
-		return -EINVAL;
+	ret = kstrtoul(buf, 10, &val);
+	if (ret)
+		return ret;
 
 	tmp[0] = val >> 8;
 	tmp[1] = val & 0xff;

diff --git a/drivers/misc/tsl2550.c b/drivers/misc/tsl2550.c
index 1dfde4d..5bc10fa1 100644
--- a/drivers/misc/tsl2550.c
+++ b/drivers/misc/tsl2550.c

@@ -204,7 +204,7 @@
 	unsigned long val = simple_strtoul(buf, NULL, 10);
 	int ret;
 
-	if (val < 0 || val > 1)
+	if (val > 1)
 		return -EINVAL;
 
 	mutex_lock(&data->update_lock);
@@ -236,7 +236,7 @@
 	unsigned long val = simple_strtoul(buf, NULL, 10);
 	int ret;
 
-	if (val < 0 || val > 1)
+	if (val > 1)
 		return -EINVAL;
 
 	if (data->power_state == 0)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 02d9ae7..f975696 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c

@@ -2413,7 +2413,8 @@
 
 			pr_info("%s: link status definitely up for interface %s, %u Mbps %s duplex.\n",
 				bond->dev->name, slave->dev->name,
-				slave->speed, slave->duplex ? "full" : "half");
+				slave->speed == SPEED_UNKNOWN ? 0 : slave->speed,
+				slave->duplex ? "full" : "half");
 
 			/* notify ad that the link status has changed */
 			if (bond->params.mode == BOND_MODE_8023AD)

diff --git a/drivers/net/can/usb/usb_8dev.c b/drivers/net/can/usb/usb_8dev.c
index 6e15ef0..cbd388e 100644
--- a/drivers/net/can/usb/usb_8dev.c
+++ b/drivers/net/can/usb/usb_8dev.c

@@ -977,7 +977,7 @@
 	err = usb_8dev_cmd_version(priv, &version);
 	if (err) {
 		netdev_err(netdev, "can't get firmware version\n");
-		goto cleanup_cmd_msg_buffer;
+		goto cleanup_unregister_candev;
 	} else {
 		netdev_info(netdev,
 			 "firmware: %d.%d, hardware: %d.%d\n",
@@ -989,6 +989,9 @@
 
 	return 0;
 
+cleanup_unregister_candev:
+	unregister_netdev(priv->netdev);
+
 cleanup_cmd_msg_buffer:
 	kfree(priv->cmd_msg_buffer);
 

diff --git a/drivers/net/ethernet/atheros/Kconfig b/drivers/net/ethernet/atheros/Kconfig
index 36d6abd..ad6aa1e9 100644
--- a/drivers/net/ethernet/atheros/Kconfig
+++ b/drivers/net/ethernet/atheros/Kconfig

@@ -67,4 +67,22 @@
 	  To compile this driver as a module, choose M here.  The module
 	  will be called atl1c.
 
+config ALX
+	tristate "Qualcomm Atheros AR816x/AR817x support"
+	depends on PCI
+	select CRC32
+	select NET_CORE
+	select MDIO
+	help
+	  This driver supports the Qualcomm Atheros L1F ethernet adapter,
+	  i.e. the following chipsets:
+
+	  1969:1091 - AR8161 Gigabit Ethernet
+	  1969:1090 - AR8162 Fast Ethernet
+	  1969:10A1 - AR8171 Gigabit Ethernet
+	  1969:10A0 - AR8172 Fast Ethernet
+
+	  To compile this driver as a module, choose M here.  The module
+	  will be called alx.
+
 endif # NET_VENDOR_ATHEROS

diff --git a/drivers/net/ethernet/atheros/Makefile b/drivers/net/ethernet/atheros/Makefile
index e7e76fb..5cf1c65 100644
--- a/drivers/net/ethernet/atheros/Makefile
+++ b/drivers/net/ethernet/atheros/Makefile

@@ -6,3 +6,4 @@
 obj-$(CONFIG_ATL2) += atlx/
 obj-$(CONFIG_ATL1E) += atl1e/
 obj-$(CONFIG_ATL1C) += atl1c/
+obj-$(CONFIG_ALX) += alx/

diff --git a/drivers/net/ethernet/atheros/alx/Makefile b/drivers/net/ethernet/atheros/alx/Makefile
new file mode 100644
index 0000000..5901fa4
--- /dev/null
+++ b/drivers/net/ethernet/atheros/alx/Makefile

@@ -0,0 +1,3 @@
+obj-$(CONFIG_ALX) += alx.o
+alx-objs := main.o ethtool.o hw.o
+ccflags-y += -D__CHECK_ENDIAN__

diff --git a/drivers/net/ethernet/atheros/alx/alx.h b/drivers/net/ethernet/atheros/alx/alx.h
new file mode 100644
index 0000000..50b3ae2
--- /dev/null
+++ b/drivers/net/ethernet/atheros/alx/alx.h

@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2013 Johannes Berg <johannes@sipsolutions.net>
+ *
+ *  This file is free software: you may copy, redistribute and/or modify it
+ *  under the terms of the GNU General Public License as published by the
+ *  Free Software Foundation, either version 2 of the License, or (at your
+ *  option) any later version.
+ *
+ *  This file is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * This file incorporates work covered by the following copyright and
+ * permission notice:
+ *
+ * Copyright (c) 2012 Qualcomm Atheros, Inc.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef _ALX_H_
+#define _ALX_H_
+
+#include <linux/types.h>
+#include <linux/etherdevice.h>
+#include <linux/dma-mapping.h>
+#include <linux/spinlock.h>
+#include "hw.h"
+
+#define ALX_WATCHDOG_TIME   (5 * HZ)
+
+/*
+ * Bookkeeping for one buffer: the skb plus a DMA address and a length,
+ * both declared through the DMA unmap helper macros.
+ */
+struct alx_buffer {
+	struct sk_buff *skb;
+	DEFINE_DMA_UNMAP_ADDR(dma);
+	DEFINE_DMA_UNMAP_LEN(size);
+};
+
+/*
+ * Receive queue state.
+ * NOTE(review): rrd/rfd presumably are the two RX descriptor rings, each
+ * kept as a CPU pointer plus its DMA address - confirm against hw.h.
+ */
+struct alx_rx_queue {
+	struct alx_rrd *rrd;
+	dma_addr_t rrd_dma;
+
+	struct alx_rfd *rfd;
+	dma_addr_t rfd_dma;
+
+	/* per-buffer bookkeeping (see struct alx_buffer) */
+	struct alx_buffer *bufs;
+
+	/* ring positions; exact producer/consumer roles not visible here */
+	u16 write_idx, read_idx;
+	u16 rrd_read_idx;
+};
+#define ALX_RX_ALLOC_THRESH	32
+
+/*
+ * Transmit queue state: the descriptor array (CPU pointer and DMA
+ * address), per-buffer bookkeeping and the ring positions.
+ */
+struct alx_tx_queue {
+	struct alx_txd *tpd;
+	dma_addr_t tpd_dma;
+	struct alx_buffer *bufs;
+	u16 write_idx, read_idx;
+};
+
+#define ALX_DEFAULT_TX_WORK 128
+
+/* Device quirk flags; each enumerator occupies its own bit. */
+enum alx_device_quirks {
+	ALX_DEV_QUIRK_MSI_INTX_DISABLE_BUG = BIT(0),
+};
+
+/*
+ * Per-device driver state; the ethtool handlers obtain it with
+ * netdev_priv().
+ */
+struct alx_priv {
+	struct net_device *dev;
+
+	/* hardware state, also consulted by the ethtool handlers */
+	struct alx_hw hw;
+
+	/* all descriptor memory */
+	struct {
+		dma_addr_t dma;
+		void *virt;
+		int size;
+	} descmem;
+
+	/* protect int_mask updates */
+	spinlock_t irq_lock;
+	u32 int_mask;
+
+	/* NOTE(review): presumably ring sizes in descriptors - confirm */
+	int tx_ringsz;
+	int rx_ringsz;
+	int rxbuf_size;
+
+	struct napi_struct napi;
+	struct alx_tx_queue txq;
+	struct alx_rx_queue rxq;
+
+	struct work_struct link_check_wk;
+	struct work_struct reset_wk;
+
+	/* value exposed through ethtool get/set_msglevel */
+	u16 msg_enable;
+
+	bool msi;
+};
+
+extern const struct ethtool_ops alx_ethtool_ops;
+extern const char alx_drv_name[];
+
+#endif

diff --git a/drivers/net/ethernet/atheros/alx/ethtool.c b/drivers/net/ethernet/atheros/alx/ethtool.c
new file mode 100644
index 0000000..6fa2aec
--- /dev/null
+++ b/drivers/net/ethernet/atheros/alx/ethtool.c

@@ -0,0 +1,272 @@
+/*
+ * Copyright (c) 2013 Johannes Berg <johannes@sipsolutions.net>
+ *
+ *  This file is free software: you may copy, redistribute and/or modify it
+ *  under the terms of the GNU General Public License as published by the
+ *  Free Software Foundation, either version 2 of the License, or (at your
+ *  option) any later version.
+ *
+ *  This file is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * This file incorporates work covered by the following copyright and
+ * permission notice:
+ *
+ * Copyright (c) 2012 Qualcomm Atheros, Inc.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <linux/pci.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/ethtool.h>
+#include <linux/mdio.h>
+#include <linux/interrupt.h>
+#include <asm/byteorder.h>
+
+#include "alx.h"
+#include "reg.h"
+#include "hw.h"
+
+
+/*
+ * ethtool ->get_settings handler.
+ *
+ * Fills @ecmd from the cached state in alx->hw: supported and advertised
+ * modes, autoneg state, pause advertisement and the current speed/duplex
+ * (both decoded from hw->link_speed).  Always returns 0.
+ */
+static int alx_get_settings(struct net_device *netdev, struct ethtool_cmd *ecmd)
+{
+	struct alx_priv *alx = netdev_priv(netdev);
+	struct alx_hw *hw = &alx->hw;
+	bool aneg = hw->adv_cfg & ADVERTISED_Autoneg;
+	u32 supported, advertising;
+
+	supported = SUPPORTED_10baseT_Half | SUPPORTED_10baseT_Full |
+		    SUPPORTED_100baseT_Half | SUPPORTED_100baseT_Full |
+		    SUPPORTED_Autoneg | SUPPORTED_TP | SUPPORTED_Pause;
+	if (alx_hw_giga(hw))
+		supported |= SUPPORTED_1000baseT_Full;
+
+	advertising = ADVERTISED_TP;
+	if (aneg)
+		advertising |= hw->adv_cfg;
+
+	/* pause bits are reported only when flow control is autonegotiated */
+	if ((hw->flowctrl & ALX_FC_ANEG) && aneg) {
+		if (hw->flowctrl & ALX_FC_RX) {
+			advertising |= ADVERTISED_Pause;
+			if (!(hw->flowctrl & ALX_FC_TX))
+				advertising |= ADVERTISED_Asym_Pause;
+		} else if (hw->flowctrl & ALX_FC_TX) {
+			advertising |= ADVERTISED_Asym_Pause;
+		}
+	}
+
+	ecmd->supported = supported;
+	ecmd->advertising = advertising;
+	ecmd->port = PORT_TP;
+	ecmd->phy_address = 0;
+	ecmd->autoneg = aneg ? AUTONEG_ENABLE : AUTONEG_DISABLE;
+	ecmd->transceiver = XCVR_INTERNAL;
+
+	if (hw->link_speed == SPEED_UNKNOWN) {
+		ethtool_cmd_speed_set(ecmd, SPEED_UNKNOWN);
+		ecmd->duplex = DUPLEX_UNKNOWN;
+	} else {
+		/* link_speed carries the duplex in its last decimal digit */
+		ethtool_cmd_speed_set(ecmd,
+				      hw->link_speed - hw->link_speed % 10);
+		ecmd->duplex = hw->link_speed % 10;
+	}
+
+	return 0;
+}
+
+/*
+ * ethtool ->set_settings handler.
+ *
+ * With autoneg enabled the requested advertising mask is used (1000baseT
+ * half duplex is rejected).  Otherwise exactly one forced 10/100 mode is
+ * selected from speed and duplex.  Returns -EINVAL for unsupported
+ * requests, else the result of alx_setup_speed_duplex().
+ */
+static int alx_set_settings(struct net_device *netdev, struct ethtool_cmd *ecmd)
+{
+	struct alx_priv *alx = netdev_priv(netdev);
+	struct alx_hw *hw = &alx->hw;
+	u32 adv_cfg;
+
+	ASSERT_RTNL();
+
+	if (ecmd->autoneg != AUTONEG_ENABLE) {
+		int forced = ethtool_cmd_speed(ecmd);
+
+		/* speed and duplex are folded into a single key */
+		forced += ecmd->duplex;
+
+		if (forced == SPEED_10 + DUPLEX_HALF)
+			adv_cfg = ADVERTISED_10baseT_Half;
+		else if (forced == SPEED_10 + DUPLEX_FULL)
+			adv_cfg = ADVERTISED_10baseT_Full;
+		else if (forced == SPEED_100 + DUPLEX_HALF)
+			adv_cfg = ADVERTISED_100baseT_Half;
+		else if (forced == SPEED_100 + DUPLEX_FULL)
+			adv_cfg = ADVERTISED_100baseT_Full;
+		else
+			return -EINVAL;
+	} else if (ecmd->advertising & ADVERTISED_1000baseT_Half) {
+		return -EINVAL;
+	} else {
+		adv_cfg = ecmd->advertising | ADVERTISED_Autoneg;
+	}
+
+	hw->adv_cfg = adv_cfg;
+	return alx_setup_speed_duplex(hw, adv_cfg, hw->flowctrl);
+}
+
+/*
+ * ethtool ->get_pauseparam handler: report pause autoneg and the RX/TX
+ * pause state from the cached hw->flowctrl and hw->adv_cfg flags.
+ */
+static void alx_get_pauseparam(struct net_device *netdev,
+			       struct ethtool_pauseparam *pause)
+{
+	struct alx_priv *alx = netdev_priv(netdev);
+	struct alx_hw *hw = &alx->hw;
+	bool fc_aneg = (hw->flowctrl & ALX_FC_ANEG) &&
+		       (hw->adv_cfg & ADVERTISED_Autoneg);
+
+	pause->autoneg = fc_aneg ? AUTONEG_ENABLE : AUTONEG_DISABLE;
+	pause->tx_pause = (hw->flowctrl & ALX_FC_TX) ? 1 : 0;
+	pause->rx_pause = (hw->flowctrl & ALX_FC_RX) ? 1 : 0;
+}
+
+
+/*
+ * ethtool ->set_pauseparam handler.
+ *
+ * Translates @pause into ALX_FC_* flags, reconfigures the PHY when link
+ * autonegotiation is on and the pause settings require it, reprograms
+ * MAC flow control if the RX/TX bits changed, and records the new flags
+ * in hw->flowctrl.
+ *
+ * Returns 0 on success or the error from alx_setup_speed_duplex(); on
+ * error the cached flow control state is left untouched.
+ */
+static int alx_set_pauseparam(struct net_device *netdev,
+			      struct ethtool_pauseparam *pause)
+{
+	struct alx_priv *alx = netdev_priv(netdev);
+	struct alx_hw *hw = &alx->hw;
+	int err;
+	bool reconfig_phy = false;
+	u8 fc = 0;
+
+	if (pause->tx_pause)
+		fc |= ALX_FC_TX;
+	if (pause->rx_pause)
+		fc |= ALX_FC_RX;
+	if (pause->autoneg)
+		fc |= ALX_FC_ANEG;
+
+	ASSERT_RTNL();
+
+	/* restart auto-neg for auto-mode */
+	if (hw->adv_cfg & ADVERTISED_Autoneg) {
+		if (!((fc ^ hw->flowctrl) & ALX_FC_ANEG))
+			reconfig_phy = true;
+		if (fc & hw->flowctrl & ALX_FC_ANEG &&
+		    (fc ^ hw->flowctrl) & (ALX_FC_RX | ALX_FC_TX))
+			reconfig_phy = true;
+	}
+
+	if (reconfig_phy) {
+		err = alx_setup_speed_duplex(hw, hw->adv_cfg, fc);
+		/*
+		 * Only bail out on failure: after a successful PHY
+		 * reconfiguration the MAC and the cached flags below
+		 * must still be brought in line with the request.
+		 */
+		if (err)
+			return err;
+	}
+
+	/* flow control on mac */
+	if ((fc ^ hw->flowctrl) & (ALX_FC_RX | ALX_FC_TX))
+		alx_cfg_mac_flowcontrol(hw, fc);
+
+	hw->flowctrl = fc;
+
+	return 0;
+}
+
+/* ethtool ->get_msglevel handler: return the stored message level. */
+static u32 alx_get_msglevel(struct net_device *netdev)
+{
+	struct alx_priv *priv = netdev_priv(netdev);
+
+	return priv->msg_enable;
+}
+
+/* ethtool ->set_msglevel handler: store @data as the message level. */
+static void alx_set_msglevel(struct net_device *netdev, u32 data)
+{
+	struct alx_priv *priv = netdev_priv(netdev);
+
+	priv->msg_enable = data;
+}
+
+/*
+ * ethtool ->get_wol handler: magic-packet and PHY wake-up are the
+ * supported modes; the enabled ones are read from hw->sleep_ctrl.
+ */
+static void alx_get_wol(struct net_device *netdev, struct ethtool_wolinfo *wol)
+{
+	struct alx_priv *alx = netdev_priv(netdev);
+	struct alx_hw *hw = &alx->hw;
+	u32 enabled = 0;
+
+	if (hw->sleep_ctrl & ALX_SLEEP_WOL_MAGIC)
+		enabled |= WAKE_MAGIC;
+	if (hw->sleep_ctrl & ALX_SLEEP_WOL_PHY)
+		enabled |= WAKE_PHY;
+
+	wol->supported = WAKE_MAGIC | WAKE_PHY;
+	wol->wolopts = enabled;
+}
+
+/*
+ * ethtool ->set_wol handler.
+ *
+ * Rejects ARP, magic-secure, unicast, broadcast and multicast wake-up
+ * with -EOPNOTSUPP.  Otherwise rebuilds hw->sleep_ctrl from the
+ * requested magic/PHY options and updates the device wakeup enable
+ * state accordingly.  Returns 0 on success.
+ */
+static int alx_set_wol(struct net_device *netdev, struct ethtool_wolinfo *wol)
+{
+	const u32 unsupported = WAKE_ARP | WAKE_MAGICSECURE |
+				WAKE_UCAST | WAKE_BCAST | WAKE_MCAST;
+	struct alx_priv *alx = netdev_priv(netdev);
+	struct alx_hw *hw = &alx->hw;
+
+	if (wol->wolopts & unsupported)
+		return -EOPNOTSUPP;
+
+	/* start from a clean slate, then add what was asked for */
+	hw->sleep_ctrl = 0;
+	if (wol->wolopts & WAKE_MAGIC)
+		hw->sleep_ctrl |= ALX_SLEEP_WOL_MAGIC;
+	if (wol->wolopts & WAKE_PHY)
+		hw->sleep_ctrl |= ALX_SLEEP_WOL_PHY;
+
+	device_set_wakeup_enable(&alx->hw.pdev->dev, hw->sleep_ctrl);
+
+	return 0;
+}
+
+/*
+ * ethtool ->get_drvinfo handler: report the driver name and the PCI
+ * bus name of the device.
+ */
+static void alx_get_drvinfo(struct net_device *netdev,
+			    struct ethtool_drvinfo *drvinfo)
+{
+	struct alx_priv *alx = netdev_priv(netdev);
+	const char *bus = pci_name(alx->hw.pdev);
+
+	strlcpy(drvinfo->driver, alx_drv_name, sizeof(drvinfo->driver));
+	strlcpy(drvinfo->bus_info, bus, sizeof(drvinfo->bus_info));
+}
+
+/* ethtool operations exported to the rest of the driver (see alx.h). */
+const struct ethtool_ops alx_ethtool_ops = {
+	/* identification and link state */
+	.get_drvinfo	= alx_get_drvinfo,
+	.get_link	= ethtool_op_get_link,
+
+	/* speed, duplex and flow control */
+	.get_settings	= alx_get_settings,
+	.set_settings	= alx_set_settings,
+	.get_pauseparam	= alx_get_pauseparam,
+	.set_pauseparam	= alx_set_pauseparam,
+
+	/* message level and wake-on-LAN */
+	.get_msglevel	= alx_get_msglevel,
+	.set_msglevel	= alx_set_msglevel,
+	.get_wol	= alx_get_wol,
+	.set_wol	= alx_set_wol,
+};

diff --git a/drivers/net/ethernet/atheros/alx/hw.c b/drivers/net/ethernet/atheros/alx/hw.c
new file mode 100644
index 0000000..220a16a
--- /dev/null
+++ b/drivers/net/ethernet/atheros/alx/hw.c

@@ -0,0 +1,1226 @@
+/*
+ * Copyright (c) 2013 Johannes Berg <johannes@sipsolutions.net>
+ *
+ *  This file is free software: you may copy, redistribute and/or modify it
+ *  under the terms of the GNU General Public License as published by the
+ *  Free Software Foundation, either version 2 of the License, or (at your
+ *  option) any later version.
+ *
+ *  This file is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * This file incorporates work covered by the following copyright and
+ * permission notice:
+ *
+ * Copyright (c) 2012 Qualcomm Atheros, Inc.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+#include <linux/etherdevice.h>
+#include <linux/delay.h>
+#include <linux/pci.h>
+#include <linux/mdio.h>
+#include "reg.h"
+#include "hw.h"
+
+/* True when @rev is one of the two A revisions (A0 or A1). */
+static inline bool alx_is_rev_a(u8 rev)
+{
+	if (rev == ALX_REV_A0)
+		return true;
+
+	return rev == ALX_REV_A1;
+}
+
+/*
+ * Poll the MDIO register until its busy bit clears.
+ *
+ * Makes at most ALX_MDIO_MAX_AC_TO attempts, 10us apart.  Returns 0 once
+ * the bit is clear, -ETIMEDOUT if it never clears.
+ */
+static int alx_wait_mdio_idle(struct alx_hw *hw)
+{
+	int tries = ALX_MDIO_MAX_AC_TO;
+
+	while (tries-- > 0) {
+		u32 mdio = alx_read_mem32(hw, ALX_MDIO);
+
+		if (!(mdio & ALX_MDIO_BUSY))
+			return 0;
+		udelay(10);
+	}
+
+	return -ETIMEDOUT;
+}
+
+/*
+ * Issue one MDIO read and wait for it to finish.
+ *
+ * @ext selects the extended access mode, in which @dev and @reg are
+ * first written to the MDIO extension register; otherwise @reg goes
+ * straight into the command word.  *@phy_data is zeroed up front and
+ * only filled in on success.  Returns 0 or the error from
+ * alx_wait_mdio_idle().
+ */
+static int alx_read_phy_core(struct alx_hw *hw, bool ext, u8 dev,
+			     u16 reg, u16 *phy_data)
+{
+	u32 clk_sel = ALX_MDIO_CLK_SEL_25MD128;
+	u32 cmd, done;
+	int err;
+
+	*phy_data = 0;
+
+	/* the slow clock is kept while in hibernation (no link speed) */
+	if (hw->link_speed != SPEED_UNKNOWN)
+		clk_sel = ALX_MDIO_CLK_SEL_25MD4;
+
+	/* bits common to both access modes */
+	cmd = ALX_MDIO_SPRES_PRMBL | ALX_MDIO_START | ALX_MDIO_OP_READ |
+	      clk_sel << ALX_MDIO_CLK_SEL_SHIFT;
+
+	if (ext) {
+		u32 extn = dev << ALX_MDIO_EXTN_DEVAD_SHIFT |
+			   reg << ALX_MDIO_EXTN_REG_SHIFT;
+
+		alx_write_mem32(hw, ALX_MDIO_EXTN, extn);
+		cmd |= ALX_MDIO_MODE_EXT;
+	} else {
+		cmd |= reg << ALX_MDIO_REG_SHIFT;
+	}
+	alx_write_mem32(hw, ALX_MDIO, cmd);
+
+	err = alx_wait_mdio_idle(hw);
+	if (err)
+		return err;
+
+	done = alx_read_mem32(hw, ALX_MDIO);
+	*phy_data = ALX_GET_FIELD(done, ALX_MDIO_DATA);
+	return 0;
+}
+
+/*
+ * Issue one MDIO write of @phy_data and wait for it to finish.
+ *
+ * @ext selects the extended access mode, in which @dev and @reg are
+ * first written to the MDIO extension register; otherwise @reg goes
+ * straight into the command word.  Returns the result of
+ * alx_wait_mdio_idle().
+ */
+static int alx_write_phy_core(struct alx_hw *hw, bool ext, u8 dev,
+			      u16 reg, u16 phy_data)
+{
+	u32 clk_sel = ALX_MDIO_CLK_SEL_25MD128;
+	u32 cmd;
+
+	/* the slow clock is kept while in hibernation (no link speed) */
+	if (hw->link_speed != SPEED_UNKNOWN)
+		clk_sel = ALX_MDIO_CLK_SEL_25MD4;
+
+	/* bits common to both access modes */
+	cmd = ALX_MDIO_SPRES_PRMBL | ALX_MDIO_START |
+	      clk_sel << ALX_MDIO_CLK_SEL_SHIFT |
+	      phy_data << ALX_MDIO_DATA_SHIFT;
+
+	if (ext) {
+		u32 extn = dev << ALX_MDIO_EXTN_DEVAD_SHIFT |
+			   reg << ALX_MDIO_EXTN_REG_SHIFT;
+
+		alx_write_mem32(hw, ALX_MDIO_EXTN, extn);
+		cmd |= ALX_MDIO_MODE_EXT;
+	} else {
+		cmd |= reg << ALX_MDIO_REG_SHIFT;
+	}
+	alx_write_mem32(hw, ALX_MDIO, cmd);
+
+	return alx_wait_mdio_idle(hw);
+}
+
+/*
+ * Unlocked PHY accessors.  None of these takes hw->mdio_lock; the
+ * non-underscore variants in this file wrap them with that lock held.
+ */
+
+/* Read PHY register @reg (non-extended access). */
+static int __alx_read_phy_reg(struct alx_hw *hw, u16 reg, u16 *phy_data)
+{
+	return alx_read_phy_core(hw, false, 0, reg, phy_data);
+}
+
+/* Write @phy_data to PHY register @reg (non-extended access). */
+static int __alx_write_phy_reg(struct alx_hw *hw, u16 reg, u16 phy_data)
+{
+	return alx_write_phy_core(hw, false, 0, reg, phy_data);
+}
+
+/* Read register @reg of device @dev using extended access. */
+static int __alx_read_phy_ext(struct alx_hw *hw, u8 dev, u16 reg, u16 *pdata)
+{
+	return alx_read_phy_core(hw, true, dev, reg, pdata);
+}
+
+/* Write @data to register @reg of device @dev using extended access. */
+static int __alx_write_phy_ext(struct alx_hw *hw, u8 dev, u16 reg, u16 data)
+{
+	return alx_write_phy_core(hw, true, dev, reg, data);
+}
+
+/*
+ * Read a debug register: select @reg through ALX_MII_DBG_ADDR, then
+ * fetch its value from ALX_MII_DBG_DATA.  Does not take hw->mdio_lock.
+ */
+static int __alx_read_phy_dbg(struct alx_hw *hw, u16 reg, u16 *pdata)
+{
+	int ret = __alx_write_phy_reg(hw, ALX_MII_DBG_ADDR, reg);
+
+	if (!ret)
+		ret = __alx_read_phy_reg(hw, ALX_MII_DBG_DATA, pdata);
+
+	return ret;
+}
+
+/*
+ * Write a debug register: select @reg through ALX_MII_DBG_ADDR, then
+ * store @data via ALX_MII_DBG_DATA.  Does not take hw->mdio_lock.
+ */
+static int __alx_write_phy_dbg(struct alx_hw *hw, u16 reg, u16 data)
+{
+	int ret = __alx_write_phy_reg(hw, ALX_MII_DBG_ADDR, reg);
+
+	if (!ret)
+		ret = __alx_write_phy_reg(hw, ALX_MII_DBG_DATA, data);
+
+	return ret;
+}
+
+/* Read PHY register @reg with hw->mdio_lock held. */
+int alx_read_phy_reg(struct alx_hw *hw, u16 reg, u16 *phy_data)
+{
+	int ret;
+
+	spin_lock(&hw->mdio_lock);
+	ret = __alx_read_phy_reg(hw, reg, phy_data);
+	spin_unlock(&hw->mdio_lock);
+
+	return ret;
+}
+
+/* Write @phy_data to PHY register @reg with hw->mdio_lock held. */
+int alx_write_phy_reg(struct alx_hw *hw, u16 reg, u16 phy_data)
+{
+	int ret;
+
+	spin_lock(&hw->mdio_lock);
+	ret = __alx_write_phy_reg(hw, reg, phy_data);
+	spin_unlock(&hw->mdio_lock);
+
+	return ret;
+}
+
+/* Extended-access read of @dev/@reg with hw->mdio_lock held. */
+int alx_read_phy_ext(struct alx_hw *hw, u8 dev, u16 reg, u16 *pdata)
+{
+	int ret;
+
+	spin_lock(&hw->mdio_lock);
+	ret = __alx_read_phy_ext(hw, dev, reg, pdata);
+	spin_unlock(&hw->mdio_lock);
+
+	return ret;
+}
+
+/* Extended-access write of @data to @dev/@reg with hw->mdio_lock held. */
+int alx_write_phy_ext(struct alx_hw *hw, u8 dev, u16 reg, u16 data)
+{
+	int ret;
+
+	spin_lock(&hw->mdio_lock);
+	ret = __alx_write_phy_ext(hw, dev, reg, data);
+	spin_unlock(&hw->mdio_lock);
+
+	return ret;
+}
+
+/* Debug-register read with hw->mdio_lock held across both MDIO steps. */
+static int alx_read_phy_dbg(struct alx_hw *hw, u16 reg, u16 *pdata)
+{
+	int ret;
+
+	spin_lock(&hw->mdio_lock);
+	ret = __alx_read_phy_dbg(hw, reg, pdata);
+	spin_unlock(&hw->mdio_lock);
+
+	return ret;
+}
+
+/* Debug-register write with hw->mdio_lock held across both MDIO steps. */
+static int alx_write_phy_dbg(struct alx_hw *hw, u16 reg, u16 data)
+{
+	int ret;
+
+	spin_lock(&hw->mdio_lock);
+	ret = __alx_write_phy_dbg(hw, reg, data);
+	spin_unlock(&hw->mdio_lock);
+
+	return ret;
+}
+
+/*
+ * Return the PHY configuration stored in the ALX_DRV register, or
+ * ALX_DRV_PHY_UNKNOWN when it cannot be trusted: the PHY is in reset,
+ * the stored value is itself "unknown", or the debug address register
+ * does not read back ALX_PHY_INITED.
+ */
+static u16 alx_get_phy_config(struct alx_hw *hw)
+{
+	u32 ctrl, cfg;
+	u16 marker;
+
+	ctrl = alx_read_mem32(hw, ALX_PHY_CTRL);
+	/* phy in reset */
+	if (!(ctrl & ALX_PHY_CTRL_DSPRST_OUT))
+		return ALX_DRV_PHY_UNKNOWN;
+
+	cfg = alx_read_mem32(hw, ALX_DRV);
+	cfg = ALX_GET_FIELD(cfg, ALX_DRV_PHY);
+	if (cfg == ALX_DRV_PHY_UNKNOWN)
+		return ALX_DRV_PHY_UNKNOWN;
+
+	/* a failed read leaves marker at 0 (zeroed by the read helper) */
+	alx_read_phy_reg(hw, ALX_MII_DBG_ADDR, &marker);
+	if (marker != ALX_PHY_INITED)
+		return ALX_DRV_PHY_UNKNOWN;
+
+	return cfg;
+}
+
+/*
+ * Poll register @reg until every bit in @wait reads as zero.
+ *
+ * Makes at most ALX_SLD_MAX_TO attempts, 1ms apart.  On success the
+ * last value read is stored in *@val (when @val is non-NULL) and true
+ * is returned; false means the bits never cleared.
+ */
+static bool alx_wait_reg(struct alx_hw *hw, u32 reg, u32 wait, u32 *val)
+{
+	int remaining;
+
+	for (remaining = ALX_SLD_MAX_TO; remaining > 0; remaining--) {
+		u32 cur = alx_read_mem32(hw, reg);
+
+		if (cur & wait) {
+			mdelay(1);
+			continue;
+		}
+
+		if (val)
+			*val = cur;
+		return true;
+	}
+
+	return false;
+}
+
+/*
+ * Read the station address registers into @addr (6 bytes).
+ *
+ * STAD1 holds the first two address bytes in its low 16 bits and STAD0
+ * the remaining four, most significant byte first.  The bytes are
+ * stored one at a time: @addr is a plain byte buffer, so writing it
+ * through __be32/__be16 pointers (at offset 2 in particular) would be
+ * an unaligned, type-punned access.
+ *
+ * Returns true when the resulting address is a valid unicast address.
+ */
+static bool alx_read_macaddr(struct alx_hw *hw, u8 *addr)
+{
+	u32 mac0, mac1;
+
+	mac0 = alx_read_mem32(hw, ALX_STAD0);
+	mac1 = alx_read_mem32(hw, ALX_STAD1);
+
+	/* addr should be big-endian */
+	addr[0] = mac1 >> 8;
+	addr[1] = mac1;
+	addr[2] = mac0 >> 24;
+	addr[3] = mac0 >> 16;
+	addr[4] = mac0 >> 8;
+	addr[5] = mac0;
+
+	return is_valid_ether_addr(addr);
+}
+
+/*
+ * Obtain the permanent MAC address into @addr.
+ *
+ * Three sources are tried in order: the station address registers as
+ * they are, a reload triggered through ALX_SLD (efuse), and - if flash
+ * or EEPROM is reported present - a reload triggered through ALX_EFLD.
+ * Returns 0 as soon as a valid address is read, -EIO if a reload times
+ * out or no source yields a valid address.
+ */
+int alx_get_perm_macaddr(struct alx_hw *hw, u8 *addr)
+{
+	u32 ctl;
+
+	/* try to get it from register first */
+	if (alx_read_macaddr(hw, addr))
+		return 0;
+
+	/* try to load from efuse */
+	if (!alx_wait_reg(hw, ALX_SLD, ALX_SLD_STAT | ALX_SLD_START, &ctl))
+		return -EIO;
+	alx_write_mem32(hw, ALX_SLD, ctl | ALX_SLD_START);
+	if (!alx_wait_reg(hw, ALX_SLD, ALX_SLD_START, NULL))
+		return -EIO;
+	if (alx_read_macaddr(hw, addr))
+		return 0;
+
+	/* try to load from flash/eeprom (if present) */
+	ctl = alx_read_mem32(hw, ALX_EFLD);
+	if (!(ctl & (ALX_EFLD_F_EXIST | ALX_EFLD_E_EXIST)))
+		return -EIO;
+
+	if (!alx_wait_reg(hw, ALX_EFLD,
+			  ALX_EFLD_STAT | ALX_EFLD_START, &ctl))
+		return -EIO;
+	alx_write_mem32(hw, ALX_EFLD, ctl | ALX_EFLD_START);
+	if (!alx_wait_reg(hw, ALX_EFLD, ALX_EFLD_START, NULL))
+		return -EIO;
+
+	return alx_read_macaddr(hw, addr) ? 0 : -EIO;
+}
+
+/* Program the 6-byte station address addr into ALX_STAD0/ALX_STAD1.
+ *
+ * The last four bytes go to ALX_STAD0 and the first two to ALX_STAD1,
+ * most significant byte first.
+ */
+void alx_set_macaddr(struct alx_hw *hw, const u8 *addr)
+{
+	u32 val;
+
+	/* for example: 00-0B-6A-F6-00-DC * STAD0=6AF600DC, STAD1=000B */
+	/* assemble from bytes: addr and addr + 2 are not guaranteed to be
+	 * aligned for 16/32-bit loads
+	 */
+	val = (u32)addr[2] << 24 | (u32)addr[3] << 16 |
+	      (u32)addr[4] << 8 | addr[5];
+	alx_write_mem32(hw, ALX_STAD0, val);
+	val = (u32)addr[0] << 8 | addr[1];
+	alx_write_mem32(hw, ALX_STAD1, val);
+}
+
+/* Toggle ALX_MISC_INTNLOSC_OPEN in ALX_MISC from 0 to 1: the register is
+ * written once with the bit cleared and once with it set, all other bits
+ * keeping the value that was read.
+ */
+static void alx_enable_osc(struct alx_hw *hw)
+{
+	u32 val;
+
+	/* rising edge */
+	val = alx_read_mem32(hw, ALX_MISC);
+	alx_write_mem32(hw, ALX_MISC, val & ~ALX_MISC_INTNLOSC_OPEN);
+	alx_write_mem32(hw, ALX_MISC, val | ALX_MISC_INTNLOSC_OPEN);
+}
+
+/* Reset the internal oscillator settings for chip revision rev.
+ *
+ * ALX_MISC3 is always reprogrammed first.  Revisions >= ALX_REV_B0 then
+ * restore the over-current-protection default, pulse INTNLOSC_OPEN 0->1
+ * and pulse CALB_START 0->1; older revisions pulse INTNLOSC_OPEN 1->0
+ * and, for rev A, clear ALX_MISC_ISO_EN.  Ends with a 20us delay.
+ */
+static void alx_reset_osc(struct alx_hw *hw, u8 rev)
+{
+	u32 val, val2;
+
+	/* clear Internal OSC settings, switching OSC by hw itself */
+	val = alx_read_mem32(hw, ALX_MISC3);
+	alx_write_mem32(hw, ALX_MISC3,
+			(val & ~ALX_MISC3_25M_BY_SW) |
+			ALX_MISC3_25M_NOTO_INTNL);
+
+	/* 25M clk from chipset may be unstable 1s after de-assert of
+	 * PERST, driver need re-calibrate before enter Sleep for WoL
+	 */
+	val = alx_read_mem32(hw, ALX_MISC);
+	if (rev >= ALX_REV_B0) {
+		/* restore over current protection def-val,
+		 * this val could be reset by MAC-RST
+		 */
+		ALX_SET_FIELD(val, ALX_MISC_PSW_OCP, ALX_MISC_PSW_OCP_DEF);
+		/* a 0->1 change will update the internal val of osc */
+		val &= ~ALX_MISC_INTNLOSC_OPEN;
+		alx_write_mem32(hw, ALX_MISC, val);
+		alx_write_mem32(hw, ALX_MISC, val | ALX_MISC_INTNLOSC_OPEN);
+		/* hw will automatically disable OSC after calibration */
+		val2 = alx_read_mem32(hw, ALX_MSIC2);
+		val2 &= ~ALX_MSIC2_CALB_START;
+		alx_write_mem32(hw, ALX_MSIC2, val2);
+		alx_write_mem32(hw, ALX_MSIC2, val2 | ALX_MSIC2_CALB_START);
+	} else {
+		val &= ~ALX_MISC_INTNLOSC_OPEN;
+		/* disable isolate for rev A devices */
+		if (alx_is_rev_a(rev))
+			val &= ~ALX_MISC_ISO_EN;
+
+		/* write the bit set, then cleared (1->0) */
+		alx_write_mem32(hw, ALX_MISC, val | ALX_MISC_INTNLOSC_OPEN);
+		alx_write_mem32(hw, ALX_MISC, val);
+	}
+
+	udelay(20);
+}
+
+/* Disable the RX/TX queues and the MAC receiver/transmitter, then wait
+ * for the MAC to go quiet.
+ *
+ * ALX_MAC_STS is polled up to ALX_DMA_MAC_RST_TO times, 10us apart,
+ * until none of the ALX_MAC_STS_IDLE bits are set.  Returns 0 once that
+ * happens, -ETIMEDOUT otherwise.  hw->rx_ctrl is updated to match what
+ * was written to ALX_MAC_CTRL.
+ */
+static int alx_stop_mac(struct alx_hw *hw)
+{
+	u32 rxq, txq, val;
+	u16 i;
+
+	rxq = alx_read_mem32(hw, ALX_RXQ0);
+	alx_write_mem32(hw, ALX_RXQ0, rxq & ~ALX_RXQ0_EN);
+	txq = alx_read_mem32(hw, ALX_TXQ0);
+	alx_write_mem32(hw, ALX_TXQ0, txq & ~ALX_TXQ0_EN);
+
+	udelay(40);
+
+	hw->rx_ctrl &= ~(ALX_MAC_CTRL_RX_EN | ALX_MAC_CTRL_TX_EN);
+	alx_write_mem32(hw, ALX_MAC_CTRL, hw->rx_ctrl);
+
+	for (i = 0; i < ALX_DMA_MAC_RST_TO; i++) {
+		val = alx_read_mem32(hw, ALX_MAC_STS);
+		/* NOTE(review): despite its name, ALX_MAC_STS_IDLE is used
+		 * here as a mask of bits that must all be clear - confirm
+		 * against the register definition
+		 */
+		if (!(val & ALX_MAC_STS_IDLE))
+			return 0;
+		udelay(10);
+	}
+
+	return -ETIMEDOUT;
+}
+
+/* Reset the MAC/DMA block.
+ *
+ * Masks all interrupts, stops the MAC, triggers ALX_MASTER_DMA_MAC_RST
+ * and waits for the reset to complete, then reprograms the oscillator,
+ * ALX_MAC_CTRL (from hw->rx_ctrl) and the SERDES clock slow-down bits.
+ * On rev A parts with CR, L0s/L1 are disabled around the reset and
+ * restored afterwards.
+ *
+ * Returns 0 on success, the error from alx_stop_mac() if the MAC does
+ * not stop, or -EIO if the reset does not complete within
+ * ALX_DMA_MAC_RST_TO polls.
+ */
+int alx_reset_mac(struct alx_hw *hw)
+{
+	u32 val, pmctrl;
+	int i, ret;
+	u8 rev;
+	bool a_cr;
+
+	pmctrl = 0;
+	rev = alx_hw_revision(hw);
+	a_cr = alx_is_rev_a(rev) && alx_hw_with_cr(hw);
+
+	/* disable all interrupts, RXQ/TXQ */
+	alx_write_mem32(hw, ALX_MSIX_MASK, 0xFFFFFFFF);
+	alx_write_mem32(hw, ALX_IMR, 0);
+	alx_write_mem32(hw, ALX_ISR, ALX_ISR_DIS);
+
+	ret = alx_stop_mac(hw);
+	if (ret)
+		return ret;
+
+	/* mac reset workaround */
+	alx_write_mem32(hw, ALX_RFD_PIDX, 1);
+
+	/* dis l0s/l1 before mac reset */
+	if (a_cr) {
+		pmctrl = alx_read_mem32(hw, ALX_PMCTRL);
+		if (pmctrl & (ALX_PMCTRL_L1_EN | ALX_PMCTRL_L0S_EN))
+			alx_write_mem32(hw, ALX_PMCTRL,
+					pmctrl & ~(ALX_PMCTRL_L1_EN |
+						   ALX_PMCTRL_L0S_EN));
+	}
+
+	/* reset whole mac safely */
+	val = alx_read_mem32(hw, ALX_MASTER);
+	alx_write_mem32(hw, ALX_MASTER,
+			val | ALX_MASTER_DMA_MAC_RST | ALX_MASTER_OOB_DIS);
+
+	/* make sure it's real idle */
+	udelay(10);
+	/* first wait for ALX_RFD_PIDX (set to 1 above) to read back 0 ... */
+	for (i = 0; i < ALX_DMA_MAC_RST_TO; i++) {
+		val = alx_read_mem32(hw, ALX_RFD_PIDX);
+		if (val == 0)
+			break;
+		udelay(10);
+	}
+	/* ... then, within the same poll budget, for the reset bit to clear */
+	for (; i < ALX_DMA_MAC_RST_TO; i++) {
+		val = alx_read_mem32(hw, ALX_MASTER);
+		if ((val & ALX_MASTER_DMA_MAC_RST) == 0)
+			break;
+		udelay(10);
+	}
+	if (i == ALX_DMA_MAC_RST_TO)
+		return -EIO;
+	udelay(10);
+
+	if (a_cr) {
+		alx_write_mem32(hw, ALX_MASTER, val | ALX_MASTER_PCLKSEL_SRDS);
+		/* restore l0s / l1 */
+		if (pmctrl & (ALX_PMCTRL_L1_EN | ALX_PMCTRL_L0S_EN))
+			alx_write_mem32(hw, ALX_PMCTRL, pmctrl);
+	}
+
+	alx_reset_osc(hw, rev);
+
+	/* clear Internal OSC settings, switching OSC by hw itself,
+	 * disable isolate for rev A devices
+	 */
+	/* NOTE(review): this repeats register writes alx_reset_osc() has
+	 * just made - confirm whether the repetition is required
+	 */
+	val = alx_read_mem32(hw, ALX_MISC3);
+	alx_write_mem32(hw, ALX_MISC3,
+			(val & ~ALX_MISC3_25M_BY_SW) |
+			ALX_MISC3_25M_NOTO_INTNL);
+	val = alx_read_mem32(hw, ALX_MISC);
+	val &= ~ALX_MISC_INTNLOSC_OPEN;
+	if (alx_is_rev_a(rev))
+		val &= ~ALX_MISC_ISO_EN;
+	alx_write_mem32(hw, ALX_MISC, val);
+	udelay(20);
+
+	/* driver control speed/duplex, hash-alg */
+	alx_write_mem32(hw, ALX_MAC_CTRL, hw->rx_ctrl);
+
+	val = alx_read_mem32(hw, ALX_SERDES);
+	alx_write_mem32(hw, ALX_SERDES,
+			val | ALX_SERDES_MACCLK_SLWDWN |
+			ALX_SERDES_PHYCLK_SLWDWN);
+
+	return 0;
+}
+
+/* Reset the PHY core and reload the driver's default PHY settings.
+ *
+ * After pulsing ALX_PHY_CTRL_DSPRST_OUT, the debug and extended PHY
+ * registers are rewritten with their *_DEF values, EEE is disabled,
+ * the link patch tweaks are applied when hw->lnk_patch is set, and the
+ * PHY interrupt mask is set to link-up | link-down.
+ *
+ * Return values of the individual PHY accesses are not checked.
+ */
+void alx_reset_phy(struct alx_hw *hw)
+{
+	int i;
+	u32 val;
+	u16 phy_val;
+
+	/* (DSP)reset PHY core */
+	val = alx_read_mem32(hw, ALX_PHY_CTRL);
+	val &= ~(ALX_PHY_CTRL_DSPRST_OUT | ALX_PHY_CTRL_IDDQ |
+		 ALX_PHY_CTRL_GATE_25M | ALX_PHY_CTRL_POWER_DOWN |
+		 ALX_PHY_CTRL_CLS);
+	val |= ALX_PHY_CTRL_RST_ANALOG;
+
+	val |= (ALX_PHY_CTRL_HIB_PULSE | ALX_PHY_CTRL_HIB_EN);
+	alx_write_mem32(hw, ALX_PHY_CTRL, val);
+	udelay(10);
+	alx_write_mem32(hw, ALX_PHY_CTRL, val | ALX_PHY_CTRL_DSPRST_OUT);
+
+	/* busy-wait ALX_PHY_CTRL_DSPRST_TO * 10us after setting DSPRST_OUT */
+	for (i = 0; i < ALX_PHY_CTRL_DSPRST_TO; i++)
+		udelay(10);
+
+	/* phy power saving & hib */
+	alx_write_phy_dbg(hw, ALX_MIIDBG_LEGCYPS, ALX_LEGCYPS_DEF);
+	alx_write_phy_dbg(hw, ALX_MIIDBG_SYSMODCTRL,
+			  ALX_SYSMODCTRL_IECHOADJ_DEF);
+	alx_write_phy_ext(hw, ALX_MIIEXT_PCS, ALX_MIIEXT_VDRVBIAS,
+			  ALX_VDRVBIAS_DEF);
+
+	/* EEE advertisement */
+	/* disabled: LPI is turned off and nothing is advertised */
+	val = alx_read_mem32(hw, ALX_LPI_CTRL);
+	alx_write_mem32(hw, ALX_LPI_CTRL, val & ~ALX_LPI_CTRL_EN);
+	alx_write_phy_ext(hw, ALX_MIIEXT_ANEG, ALX_MIIEXT_LOCAL_EEEADV, 0);
+
+	/* phy power saving */
+	alx_write_phy_dbg(hw, ALX_MIIDBG_TST10BTCFG, ALX_TST10BTCFG_DEF);
+	alx_write_phy_dbg(hw, ALX_MIIDBG_SRDSYSMOD, ALX_SRDSYSMOD_DEF);
+	alx_write_phy_dbg(hw, ALX_MIIDBG_TST100BTCFG, ALX_TST100BTCFG_DEF);
+	alx_write_phy_dbg(hw, ALX_MIIDBG_ANACTRL, ALX_ANACTRL_DEF);
+	alx_read_phy_dbg(hw, ALX_MIIDBG_GREENCFG2, &phy_val);
+	alx_write_phy_dbg(hw, ALX_MIIDBG_GREENCFG2,
+			  phy_val & ~ALX_GREENCFG2_GATE_DFSE_EN);
+	/* rtl8139c, 120m issue */
+	alx_write_phy_ext(hw, ALX_MIIEXT_ANEG, ALX_MIIEXT_NLP78,
+			  ALX_MIIEXT_NLP78_120M_DEF);
+	alx_write_phy_ext(hw, ALX_MIIEXT_ANEG, ALX_MIIEXT_S3DIG10,
+			  ALX_MIIEXT_S3DIG10_DEF);
+
+	if (hw->lnk_patch) {
+		/* Turn off half amplitude */
+		alx_read_phy_ext(hw, ALX_MIIEXT_PCS, ALX_MIIEXT_CLDCTRL3,
+				 &phy_val);
+		alx_write_phy_ext(hw, ALX_MIIEXT_PCS, ALX_MIIEXT_CLDCTRL3,
+				  phy_val | ALX_CLDCTRL3_BP_CABLE1TH_DET_GT);
+		/* Turn off Green feature */
+		alx_read_phy_dbg(hw, ALX_MIIDBG_GREENCFG2, &phy_val);
+		alx_write_phy_dbg(hw, ALX_MIIDBG_GREENCFG2,
+				  phy_val | ALX_GREENCFG2_BP_GREEN);
+		/* Turn off half Bias */
+		alx_read_phy_ext(hw, ALX_MIIEXT_PCS, ALX_MIIEXT_CLDCTRL5,
+				 &phy_val);
+		alx_write_phy_ext(hw, ALX_MIIEXT_PCS, ALX_MIIEXT_CLDCTRL5,
+				  phy_val | ALX_CLDCTRL5_BP_VD_HLFBIAS);
+	}
+
+	/* set phy interrupt mask */
+	alx_write_phy_reg(hw, ALX_MII_IER, ALX_IER_LINK_UP | ALX_IER_LINK_DOWN);
+}
+
+/* PCI command bits the driver wants enabled: bus master, memory and I/O */
+#define ALX_PCI_CMD (PCI_COMMAND_MASTER | PCI_COMMAND_MEMORY | PCI_COMMAND_IO)
+
+/* Bring the PCIe side of the device into the state the driver expects:
+ * fix up the PCI command word, clear WoL settings, clear D3PLLOFF, mask
+ * two PCIe error severity bits, select the wake/pclk source according
+ * to chip revision and enable ASPM L0s and L1.
+ */
+void alx_reset_pcie(struct alx_hw *hw)
+{
+	u8 rev = alx_hw_revision(hw);
+	u32 val;
+	u16 val16;
+
+	/* Workaround for PCI problem when BIOS sets MMRBC incorrectly. */
+	/* NOTE(review): the first test only fires when none of the
+	 * ALX_PCI_CMD bits is set, not when some are missing - confirm
+	 * that is intended
+	 */
+	pci_read_config_word(hw->pdev, PCI_COMMAND, &val16);
+	if (!(val16 & ALX_PCI_CMD) || (val16 & PCI_COMMAND_INTX_DISABLE)) {
+		val16 = (val16 | ALX_PCI_CMD) & ~PCI_COMMAND_INTX_DISABLE;
+		pci_write_config_word(hw->pdev, PCI_COMMAND, val16);
+	}
+
+	/* clear WoL setting/status */
+	/* the value read is discarded; presumably the read itself clears
+	 * the status bits - TODO confirm
+	 */
+	val = alx_read_mem32(hw, ALX_WOL0);
+	alx_write_mem32(hw, ALX_WOL0, 0);
+
+	val = alx_read_mem32(hw, ALX_PDLL_TRNS1);
+	alx_write_mem32(hw, ALX_PDLL_TRNS1, val & ~ALX_PDLL_TRNS1_D3PLLOFF_EN);
+
+	/* mask some pcie error bits */
+	val = alx_read_mem32(hw, ALX_UE_SVRT);
+	val &= ~(ALX_UE_SVRT_DLPROTERR | ALX_UE_SVRT_FCPROTERR);
+	alx_write_mem32(hw, ALX_UE_SVRT, val);
+
+	/* wol 25M & pclk */
+	/* rev A with CR wants PCLKSEL_SRDS set, everything else wants it
+	 * clear; WAKEN_25M is set in both cases.  The register is only
+	 * written when it differs from the wanted state.
+	 */
+	val = alx_read_mem32(hw, ALX_MASTER);
+	if (alx_is_rev_a(rev) && alx_hw_with_cr(hw)) {
+		if ((val & ALX_MASTER_WAKEN_25M) == 0 ||
+		    (val & ALX_MASTER_PCLKSEL_SRDS) == 0)
+			alx_write_mem32(hw, ALX_MASTER,
+					val | ALX_MASTER_PCLKSEL_SRDS |
+					ALX_MASTER_WAKEN_25M);
+	} else {
+		if ((val & ALX_MASTER_WAKEN_25M) == 0 ||
+		    (val & ALX_MASTER_PCLKSEL_SRDS) != 0)
+			alx_write_mem32(hw, ALX_MASTER,
+					(val & ~ALX_MASTER_PCLKSEL_SRDS) |
+					ALX_MASTER_WAKEN_25M);
+	}
+
+	/* ASPM setting */
+	alx_enable_aspm(hw, true, true);
+
+	udelay(10);
+}
+
+/* Enable the RX/TX queues and the MAC, configured for the current link.
+ *
+ * hw->link_speed carries speed and duplex combined (SPEED_x + DUPLEX_y),
+ * so the duplex is recovered with "% 10".  The resulting control word is
+ * cached in hw->rx_ctrl and written to ALX_MAC_CTRL.
+ */
+void alx_start_mac(struct alx_hw *hw)
+{
+	u32 mac, txq, rxq;
+
+	rxq = alx_read_mem32(hw, ALX_RXQ0);
+	alx_write_mem32(hw, ALX_RXQ0, rxq | ALX_RXQ0_EN);
+	txq = alx_read_mem32(hw, ALX_TXQ0);
+	alx_write_mem32(hw, ALX_TXQ0, txq | ALX_TXQ0_EN);
+
+	mac = hw->rx_ctrl;
+	if (hw->link_speed % 10 == DUPLEX_FULL)
+		mac |= ALX_MAC_CTRL_FULLD;
+	else
+		mac &= ~ALX_MAC_CTRL_FULLD;
+	/* only two MAC speed settings exist: 1000 and 10/100 */
+	ALX_SET_FIELD(mac, ALX_MAC_CTRL_SPEED,
+		      hw->link_speed >= SPEED_1000 ? ALX_MAC_CTRL_SPEED_1000 :
+						     ALX_MAC_CTRL_SPEED_10_100);
+	mac |= ALX_MAC_CTRL_TX_EN | ALX_MAC_CTRL_RX_EN;
+	hw->rx_ctrl = mac;
+	alx_write_mem32(hw, ALX_MAC_CTRL, mac);
+}
+
+/* Apply the flow-control selection fc (ALX_FC_RX / ALX_FC_TX bits) to
+ * the MAC: the RX and TX flow-control enables in hw->rx_ctrl are set or
+ * cleared accordingly and the result is written to ALX_MAC_CTRL.
+ */
+void alx_cfg_mac_flowcontrol(struct alx_hw *hw, u8 fc)
+{
+	u32 ctrl = hw->rx_ctrl;
+
+	/* start with both directions off, then enable what was asked for */
+	ctrl &= ~(ALX_MAC_CTRL_RXFC_EN | ALX_MAC_CTRL_TXFC_EN);
+	if (fc & ALX_FC_RX)
+		ctrl |= ALX_MAC_CTRL_RXFC_EN;
+	if (fc & ALX_FC_TX)
+		ctrl |= ALX_MAC_CTRL_TXFC_EN;
+
+	hw->rx_ctrl = ctrl;
+	alx_write_mem32(hw, ALX_MAC_CTRL, ctrl);
+}
+
+/* Program ALX_PMCTRL with the driver's ASPM defaults and enable L0s
+ * and/or L1 as requested.
+ *
+ * All timer fields are set to their default values, a fixed set of
+ * option bits is cleared, rev A parts with CR additionally get the
+ * L1 SERDES bits, and ALX_PMCTRL_ASPM_FCEN is set whenever either
+ * state is enabled.
+ */
+void alx_enable_aspm(struct alx_hw *hw, bool l0s_en, bool l1_en)
+{
+	u32 pmctrl;
+	u8 rev = alx_hw_revision(hw);
+
+	pmctrl = alx_read_mem32(hw, ALX_PMCTRL);
+
+	ALX_SET_FIELD(pmctrl, ALX_PMCTRL_LCKDET_TIMER,
+		      ALX_PMCTRL_LCKDET_TIMER_DEF);
+	pmctrl |= ALX_PMCTRL_RCVR_WT_1US |
+		  ALX_PMCTRL_L1_CLKSW_EN |
+		  ALX_PMCTRL_L1_SRDSRX_PWD;
+	ALX_SET_FIELD(pmctrl, ALX_PMCTRL_L1REQ_TO, ALX_PMCTRL_L1REG_TO_DEF);
+	ALX_SET_FIELD(pmctrl, ALX_PMCTRL_L1_TIMER, ALX_PMCTRL_L1_TIMER_16US);
+	/* L0S_EN/L1_EN are cleared here and only set again on request below */
+	pmctrl &= ~(ALX_PMCTRL_L1_SRDS_EN |
+		    ALX_PMCTRL_L1_SRDSPLL_EN |
+		    ALX_PMCTRL_L1_BUFSRX_EN |
+		    ALX_PMCTRL_SADLY_EN |
+		    ALX_PMCTRL_HOTRST_WTEN|
+		    ALX_PMCTRL_L0S_EN |
+		    ALX_PMCTRL_L1_EN |
+		    ALX_PMCTRL_ASPM_FCEN |
+		    ALX_PMCTRL_TXL1_AFTER_L0S |
+		    ALX_PMCTRL_RXL1_AFTER_L0S);
+	if (alx_is_rev_a(rev) && alx_hw_with_cr(hw))
+		pmctrl |= ALX_PMCTRL_L1_SRDS_EN | ALX_PMCTRL_L1_SRDSPLL_EN;
+
+	if (l0s_en)
+		pmctrl |= (ALX_PMCTRL_L0S_EN | ALX_PMCTRL_ASPM_FCEN);
+	if (l1_en)
+		pmctrl |= (ALX_PMCTRL_L1_EN | ALX_PMCTRL_ASPM_FCEN);
+
+	alx_write_mem32(hw, ALX_PMCTRL, pmctrl);
+}
+
+
+/* Translate an ethtool advertisement mask (ADVERTISED_* bits) into the
+ * ALX_DRV_PHY_* flags the driver records in the ALX_DRV register.
+ *
+ * With ADVERTISED_Autoneg set, every advertised mode contributes its
+ * flags.  Without it, ethadv_cfg must be exactly one forced 10/100
+ * mode; any other value yields 0.  hw is not used.
+ */
+static u32 ethadv_to_hw_cfg(struct alx_hw *hw, u32 ethadv_cfg)
+{
+	u32 cfg = 0;
+
+	if (ethadv_cfg & ADVERTISED_Autoneg) {
+		cfg |= ALX_DRV_PHY_AUTO;
+		if (ethadv_cfg & ADVERTISED_10baseT_Half)
+			cfg |= ALX_DRV_PHY_10;
+		if (ethadv_cfg & ADVERTISED_10baseT_Full)
+			cfg |= ALX_DRV_PHY_10 | ALX_DRV_PHY_DUPLEX;
+		if (ethadv_cfg & ADVERTISED_100baseT_Half)
+			cfg |= ALX_DRV_PHY_100;
+		if (ethadv_cfg & ADVERTISED_100baseT_Full)
+			cfg |= ALX_DRV_PHY_100 | ALX_DRV_PHY_DUPLEX;
+		if (ethadv_cfg & ADVERTISED_1000baseT_Half)
+			cfg |= ALX_DRV_PHY_1000;
+		/* gigabit full duplex must record the 1000 flag, not 100 */
+		if (ethadv_cfg & ADVERTISED_1000baseT_Full)
+			cfg |= ALX_DRV_PHY_1000 | ALX_DRV_PHY_DUPLEX;
+		/* NOTE(review): the pause flags below are MII ADVERTISE_*
+		 * values rather than ALX_DRV_PHY_* ones - confirm they fit
+		 * the ALX_DRV_PHY field
+		 */
+		if (ethadv_cfg & ADVERTISED_Pause)
+			cfg |= ADVERTISE_PAUSE_CAP;
+		if (ethadv_cfg & ADVERTISED_Asym_Pause)
+			cfg |= ADVERTISE_PAUSE_ASYM;
+	} else {
+		switch (ethadv_cfg) {
+		case ADVERTISED_10baseT_Half:
+			cfg |= ALX_DRV_PHY_10;
+			break;
+		case ADVERTISED_100baseT_Half:
+			cfg |= ALX_DRV_PHY_100;
+			break;
+		case ADVERTISED_10baseT_Full:
+			cfg |= ALX_DRV_PHY_10 | ALX_DRV_PHY_DUPLEX;
+			break;
+		case ADVERTISED_100baseT_Full:
+			cfg |= ALX_DRV_PHY_100 | ALX_DRV_PHY_DUPLEX;
+			break;
+		}
+	}
+
+	return cfg;
+}
+
+/* Program the PHY for the link modes in ethadv (ADVERTISED_* bits) and
+ * record the outcome in the ALX_DRV register.
+ *
+ * With ADVERTISED_Autoneg set, the advertisement registers are written
+ * and autonegotiation is restarted; flowctrl (ALX_FC_* bits) selects the
+ * pause capabilities to advertise.  Otherwise ethadv must be a single
+ * 10/100 mode, which is forced through BMCR.
+ *
+ * Returns 0 on success, -EBUSY when one of the autoneg register writes
+ * fails, or the error of the BMCR write in forced mode.  On failure the
+ * PHY field of ALX_DRV is left cleared.
+ */
+int alx_setup_speed_duplex(struct alx_hw *hw, u32 ethadv, u8 flowctrl)
+{
+	u16 adv, giga, cr;
+	u32 val;
+	int err = 0;
+
+	/* mark the PHY as not configured until programming has succeeded */
+	alx_write_phy_reg(hw, ALX_MII_DBG_ADDR, 0);
+	val = alx_read_mem32(hw, ALX_DRV);
+	ALX_SET_FIELD(val, ALX_DRV_PHY, 0);
+
+	if (ethadv & ADVERTISED_Autoneg) {
+		adv = ADVERTISE_CSMA;
+		adv |= ethtool_adv_to_mii_adv_t(ethadv);
+
+		/* adv is the MII advertisement word, so the MII
+		 * ADVERTISE_PAUSE_* bits are needed here; the ethtool
+		 * ADVERTISED_*Pause bits sit at other bit positions
+		 */
+		if (flowctrl & ALX_FC_ANEG) {
+			if (flowctrl & ALX_FC_RX) {
+				adv |= ADVERTISE_PAUSE_CAP;
+				if (!(flowctrl & ALX_FC_TX))
+					adv |= ADVERTISE_PAUSE_ASYM;
+			} else if (flowctrl & ALX_FC_TX) {
+				adv |= ADVERTISE_PAUSE_ASYM;
+			}
+		}
+		giga = 0;
+		if (alx_hw_giga(hw))
+			giga = ethtool_adv_to_mii_ctrl1000_t(ethadv);
+
+		cr = BMCR_RESET | BMCR_ANENABLE | BMCR_ANRESTART;
+
+		if (alx_write_phy_reg(hw, MII_ADVERTISE, adv) ||
+		    alx_write_phy_reg(hw, MII_CTRL1000, giga) ||
+		    alx_write_phy_reg(hw, MII_BMCR, cr))
+			err = -EBUSY;
+	} else {
+		cr = BMCR_RESET;
+		if (ethadv == ADVERTISED_100baseT_Half ||
+		    ethadv == ADVERTISED_100baseT_Full)
+			cr |= BMCR_SPEED100;
+		if (ethadv == ADVERTISED_10baseT_Full ||
+		    ethadv == ADVERTISED_100baseT_Full)
+			cr |= BMCR_FULLDPLX;
+
+		err = alx_write_phy_reg(hw, MII_BMCR, cr);
+	}
+
+	if (!err) {
+		alx_write_phy_reg(hw, ALX_MII_DBG_ADDR, ALX_PHY_INITED);
+		val |= ethadv_to_hw_cfg(hw, ethadv);
+	}
+
+	alx_write_mem32(hw, ALX_DRV, val);
+
+	return err;
+}
+
+
+/* Adjust PHY tuning registers after a link change, based on
+ * hw->link_speed.
+ *
+ * Only acts on rev A and rev B0 chips.  With the link up, the cable
+ * length and AGC readings select between the "long" and default
+ * AZ_ANADECT/AFE settings; with the link down the defaults are restored.
+ * The MSE threshold adjustments apply to rev B0 with hw->lnk_patch set.
+ *
+ * Return values of the individual PHY accesses are not checked.
+ */
+void alx_post_phy_link(struct alx_hw *hw)
+{
+	u16 phy_val, len, agc;
+	u8 revid = alx_hw_revision(hw);
+	bool adj_th = revid == ALX_REV_B0;
+	int speed;
+
+	/* strip the duplex part (last decimal digit) from link_speed */
+	if (hw->link_speed == SPEED_UNKNOWN)
+		speed = SPEED_UNKNOWN;
+	else
+		speed = hw->link_speed - hw->link_speed % 10;
+
+	if (revid != ALX_REV_B0 && !alx_is_rev_a(revid))
+		return;
+
+	/* 1000BT/AZ, wrong cable length */
+	if (speed != SPEED_UNKNOWN) {
+		alx_read_phy_ext(hw, ALX_MIIEXT_PCS, ALX_MIIEXT_CLDCTRL6,
+				 &phy_val);
+		len = ALX_GET_FIELD(phy_val, ALX_CLDCTRL6_CAB_LEN);
+		alx_read_phy_dbg(hw, ALX_MIIDBG_AGC, &phy_val);
+		agc = ALX_GET_FIELD(phy_val, ALX_AGC_2_VGA);
+
+		/* long cable: length above the per-speed threshold, or no
+		 * length reading (0) combined with a high AGC value
+		 */
+		if ((speed == SPEED_1000 &&
+		     (len > ALX_CLDCTRL6_CAB_LEN_SHORT1G ||
+		      (len == 0 && agc > ALX_AGC_LONG1G_LIMT))) ||
+		    (speed == SPEED_100 &&
+		     (len > ALX_CLDCTRL6_CAB_LEN_SHORT100M ||
+		      (len == 0 && agc > ALX_AGC_LONG100M_LIMT)))) {
+			alx_write_phy_dbg(hw, ALX_MIIDBG_AZ_ANADECT,
+					  ALX_AZ_ANADECT_LONG);
+			alx_read_phy_ext(hw, ALX_MIIEXT_ANEG, ALX_MIIEXT_AFE,
+					 &phy_val);
+			alx_write_phy_ext(hw, ALX_MIIEXT_ANEG, ALX_MIIEXT_AFE,
+					  phy_val | ALX_AFE_10BT_100M_TH);
+		} else {
+			alx_write_phy_dbg(hw, ALX_MIIDBG_AZ_ANADECT,
+					  ALX_AZ_ANADECT_DEF);
+			alx_read_phy_ext(hw, ALX_MIIEXT_ANEG,
+					 ALX_MIIEXT_AFE, &phy_val);
+			alx_write_phy_ext(hw, ALX_MIIEXT_ANEG, ALX_MIIEXT_AFE,
+					  phy_val & ~ALX_AFE_10BT_100M_TH);
+		}
+
+		/* threshold adjust */
+		if (adj_th && hw->lnk_patch) {
+			if (speed == SPEED_100) {
+				alx_write_phy_dbg(hw, ALX_MIIDBG_MSE16DB,
+						  ALX_MSE16DB_UP);
+			} else if (speed == SPEED_1000) {
+				/*
+				 * Giga link threshold, raise the tolerance of
+				 * noise 50%
+				 */
+				alx_read_phy_dbg(hw, ALX_MIIDBG_MSE20DB,
+						 &phy_val);
+				ALX_SET_FIELD(phy_val, ALX_MSE20DB_TH,
+					      ALX_MSE20DB_TH_HI);
+				alx_write_phy_dbg(hw, ALX_MIIDBG_MSE20DB,
+						  phy_val);
+			}
+		}
+	} else {
+		/* link down: restore the default AFE and MSE settings */
+		alx_read_phy_ext(hw, ALX_MIIEXT_ANEG, ALX_MIIEXT_AFE,
+				 &phy_val);
+		alx_write_phy_ext(hw, ALX_MIIEXT_ANEG, ALX_MIIEXT_AFE,
+				  phy_val & ~ALX_AFE_10BT_100M_TH);
+
+		if (adj_th && hw->lnk_patch) {
+			alx_write_phy_dbg(hw, ALX_MIIDBG_MSE16DB,
+					  ALX_MSE16DB_DOWN);
+			alx_read_phy_dbg(hw, ALX_MIIDBG_MSE20DB, &phy_val);
+			ALX_SET_FIELD(phy_val, ALX_MSE20DB_TH,
+				      ALX_MSE20DB_TH_DEF);
+			alx_write_phy_dbg(hw, ALX_MIIDBG_MSE20DB, phy_val);
+		}
+	}
+}
+
+
+/* Prepare MAC, PHY and clocking for suspend.
+ *
+ * speed is the combined speed/duplex value (SPEED_x + DUPLEX_y) the link
+ * runs at while suspended.  Depending on hw->sleep_ctrl the PHY is
+ * either powered down completely or kept alive with the MAC receive
+ * (and, for CIFS, transmit) path enabled for wake-up.
+ *
+ * Returns 0 on success or the error of a failed PHY register write, in
+ * which case none of the MAC/PHY control registers have been written.
+ *
+ * NOTE:
+ *    1. phy link must be established before calling this function
+ *    2. wol option (pattern, magic, link, etc.) must be configured
+ *       before calling it.
+ */
+int alx_pre_suspend(struct alx_hw *hw, int speed)
+{
+	u32 master, mac, phy, val;
+	int err = 0;
+
+	master = alx_read_mem32(hw, ALX_MASTER);
+	master &= ~ALX_MASTER_PCLKSEL_SRDS;
+	mac = hw->rx_ctrl;
+	/* 10/100 half */
+	ALX_SET_FIELD(mac, ALX_MAC_CTRL_SPEED,  ALX_MAC_CTRL_SPEED_10_100);
+	mac &= ~(ALX_MAC_CTRL_FULLD | ALX_MAC_CTRL_RX_EN | ALX_MAC_CTRL_TX_EN);
+
+	phy = alx_read_mem32(hw, ALX_PHY_CTRL);
+	phy &= ~(ALX_PHY_CTRL_DSPRST_OUT | ALX_PHY_CTRL_CLS);
+	phy |= ALX_PHY_CTRL_RST_ANALOG | ALX_PHY_CTRL_HIB_PULSE |
+	       ALX_PHY_CTRL_HIB_EN;
+
+	/* without any activity  */
+	if (!(hw->sleep_ctrl & ALX_SLEEP_ACTIVE)) {
+		/* no wake source: mask PHY interrupts and power the PHY down */
+		err = alx_write_phy_reg(hw, ALX_MII_IER, 0);
+		if (err)
+			return err;
+		phy |= ALX_PHY_CTRL_IDDQ | ALX_PHY_CTRL_POWER_DOWN;
+	} else {
+		if (hw->sleep_ctrl & (ALX_SLEEP_WOL_MAGIC | ALX_SLEEP_CIFS))
+			mac |= ALX_MAC_CTRL_RX_EN | ALX_MAC_CTRL_BRD_EN;
+		if (hw->sleep_ctrl & ALX_SLEEP_CIFS)
+			mac |= ALX_MAC_CTRL_TX_EN;
+		if (speed % 10 == DUPLEX_FULL)
+			mac |= ALX_MAC_CTRL_FULLD;
+		if (speed >= SPEED_1000)
+			ALX_SET_FIELD(mac, ALX_MAC_CTRL_SPEED,
+				      ALX_MAC_CTRL_SPEED_1000);
+		phy |= ALX_PHY_CTRL_DSPRST_OUT;
+		err = alx_write_phy_ext(hw, ALX_MIIEXT_ANEG,
+					ALX_MIIEXT_S3DIG10,
+					ALX_MIIEXT_S3DIG10_SL);
+		if (err)
+			return err;
+	}
+
+	alx_enable_osc(hw);
+	hw->rx_ctrl = mac;
+	alx_write_mem32(hw, ALX_MASTER, master);
+	alx_write_mem32(hw, ALX_MAC_CTRL, mac);
+	alx_write_mem32(hw, ALX_PHY_CTRL, phy);
+
+	/* set val of PDLL D3PLLOFF */
+	val = alx_read_mem32(hw, ALX_PDLL_TRNS1);
+	val |= ALX_PDLL_TRNS1_D3PLLOFF_EN;
+	alx_write_mem32(hw, ALX_PDLL_TRNS1, val);
+
+	return 0;
+}
+
+/* Tell whether the PHY configuration recorded in hardware matches what
+ * hw->adv_cfg asks for.  An unknown recorded configuration never
+ * matches.
+ */
+bool alx_phy_configured(struct alx_hw *hw)
+{
+	u32 wanted, recorded;
+
+	wanted = ethadv_to_hw_cfg(hw, hw->adv_cfg);
+	wanted = ALX_GET_FIELD(wanted, ALX_DRV_PHY);
+	recorded = alx_get_phy_config(hw);
+
+	return recorded != ALX_DRV_PHY_UNKNOWN && wanted == recorded;
+}
+
+/* Query the PHY for the current link state.
+ *
+ * On return *speed holds SPEED_UNKNOWN when the link is down, or the
+ * combined value SPEED_x + DUPLEX_y when it is up.
+ *
+ * Returns 1 when the link is up, 0 when it is down, the error of a
+ * failed PHY register read, or -EINVAL when the PHY reports an
+ * unresolved or unrecognised speed.  *speed is not a usable result on
+ * the error paths.
+ */
+int alx_get_phy_link(struct alx_hw *hw, int *speed)
+{
+	struct pci_dev *pdev = hw->pdev;
+	u16 bmsr, giga;
+	int err;
+
+	/* NOTE(review): BMSR is read twice, presumably because the link
+	 * status bit is latched and the first read returns the stale
+	 * value - confirm
+	 */
+	err = alx_read_phy_reg(hw, MII_BMSR, &bmsr);
+	if (err)
+		return err;
+
+	err = alx_read_phy_reg(hw, MII_BMSR, &bmsr);
+	if (err)
+		return err;
+
+	if (!(bmsr & BMSR_LSTATUS)) {
+		*speed = SPEED_UNKNOWN;
+		return 0;
+	}
+
+	/* speed/duplex result is saved in PHY Specific Status Register */
+	err = alx_read_phy_reg(hw, ALX_MII_GIGA_PSSR, &giga);
+	if (err)
+		return err;
+
+	if (!(giga & ALX_GIGA_PSSR_SPD_DPLX_RESOLVED))
+		goto wrong_speed;
+
+	switch (giga & ALX_GIGA_PSSR_SPEED) {
+	case ALX_GIGA_PSSR_1000MBS:
+		*speed = SPEED_1000;
+		break;
+	case ALX_GIGA_PSSR_100MBS:
+		*speed = SPEED_100;
+		break;
+	case ALX_GIGA_PSSR_10MBS:
+		*speed = SPEED_10;
+		break;
+	default:
+		goto wrong_speed;
+	}
+
+	/* fold the duplex into the last decimal digit of *speed */
+	*speed += (giga & ALX_GIGA_PSSR_DPLX) ? DUPLEX_FULL : DUPLEX_HALF;
+	return 1;
+
+wrong_speed:
+	dev_err(&pdev->dev, "invalid PHY speed/duplex: 0x%x\n", giga);
+	return -EINVAL;
+}
+
+/* Acknowledge pending PHY interrupts.  Returns the result of the PHY
+ * register read; the value read is discarded.
+ */
+int alx_clear_phy_intr(struct alx_hw *hw)
+{
+	u16 status;
+	int err;
+
+	/* clear interrupt status by reading it */
+	err = alx_read_phy_reg(hw, ALX_MII_ISR, &status);
+
+	return err;
+}
+
+/* Program the wake-on-LAN sources selected in hw->sleep_ctrl into
+ * ALX_WOL0.  Returns 0, or the error of the PHY interrupt-mask write
+ * made when link wake-up is requested; ALX_WOL0 is written either way.
+ */
+int alx_config_wol(struct alx_hw *hw)
+{
+	u32 wol0 = 0;
+	int ret = 0;
+
+	/* wake on magic packet */
+	if (hw->sleep_ctrl & ALX_SLEEP_WOL_MAGIC)
+		wol0 |= ALX_WOL0_MAGIC_EN | ALX_WOL0_PME_MAGIC_EN;
+
+	/* wake on link up */
+	if (hw->sleep_ctrl & ALX_SLEEP_WOL_PHY) {
+		wol0 |= ALX_WOL0_LINK_EN | ALX_WOL0_PME_LINK;
+		/* restrict PHY interrupts to link up so only that wakes us */
+		ret = alx_write_phy_reg(hw, ALX_MII_IER, ALX_IER_LINK_UP);
+	}
+
+	alx_write_mem32(hw, ALX_WOL0, wol0);
+
+	return ret;
+}
+
+/* Clear ALX_RXQ0_RSS_HASH_EN in ALX_RXQ0, leaving all other bits as
+ * they were read.
+ */
+void alx_disable_rss(struct alx_hw *hw)
+{
+	u32 rxq0;
+
+	rxq0 = alx_read_mem32(hw, ALX_RXQ0);
+	alx_write_mem32(hw, ALX_RXQ0, rxq0 & ~ALX_RXQ0_RSS_HASH_EN);
+}
+
+/* Load the baseline MAC configuration: station address, clock gating,
+ * interrupt moderation timers, MTU-derived limits, TX/RX queue
+ * parameters, flow-control thresholds, DMA settings and TX queue
+ * weights.  Values come from the hw structure (mac_addr, smb_timer, imt,
+ * ith_tpd, mtu, dma_chnl) and from driver default constants.
+ */
+void alx_configure_basic(struct alx_hw *hw)
+{
+	u32 val, raw_mtu, max_payload;
+	u16 val16;
+	u8 chip_rev = alx_hw_revision(hw);
+
+	alx_set_macaddr(hw, hw->mac_addr);
+
+	alx_write_mem32(hw, ALX_CLK_GATE, ALX_CLK_GATE_ALL);
+
+	/* idle timeout to switch clk_125M */
+	if (chip_rev >= ALX_REV_B0)
+		alx_write_mem32(hw, ALX_IDLE_DECISN_TIMER,
+				ALX_IDLE_DECISN_TIMER_DEF);
+
+	alx_write_mem32(hw, ALX_SMB_TIMER, hw->smb_timer * 500UL);
+
+	val = alx_read_mem32(hw, ALX_MASTER);
+	val |= ALX_MASTER_IRQMOD2_EN |
+	       ALX_MASTER_IRQMOD1_EN |
+	       ALX_MASTER_SYSALVTIMER_EN;
+	alx_write_mem32(hw, ALX_MASTER, val);
+	alx_write_mem32(hw, ALX_IRQ_MODU_TIMER,
+			(hw->imt >> 1) << ALX_IRQ_MODU_TIMER1_SHIFT);
+	/* intr re-trig timeout */
+	alx_write_mem32(hw, ALX_INT_RETRIG, ALX_INT_RETRIG_TO);
+	/* tpd threshold to trig int */
+	alx_write_mem32(hw, ALX_TINT_TPD_THRSHLD, hw->ith_tpd);
+	alx_write_mem32(hw, ALX_TINT_TIMER, hw->imt);
+
+	/* NOTE(review): the extra 8 bytes added to raw_mtu below are
+	 * presumably VLAN tag + FCS - confirm
+	 */
+	raw_mtu = hw->mtu + ETH_HLEN;
+	alx_write_mem32(hw, ALX_MTU, raw_mtu + 8);
+	if (raw_mtu > ALX_MTU_JUMBO_TH)
+		hw->rx_ctrl &= ~ALX_MAC_CTRL_FAST_PAUSE;
+
+	/* threshold is programmed in 8-byte units, rounded up, and capped
+	 * at ALX_TXQ1_JUMBO_TSO_TH
+	 */
+	if ((raw_mtu + 8) < ALX_TXQ1_JUMBO_TSO_TH)
+		val = (raw_mtu + 8 + 7) >> 3;
+	else
+		val = ALX_TXQ1_JUMBO_TSO_TH >> 3;
+	alx_write_mem32(hw, ALX_TXQ1, val | ALX_TXQ1_ERRLGPKT_DROP_EN);
+
+	max_payload = pcie_get_readrq(hw->pdev) >> 8;
+	/*
+	 * if BIOS had changed the default dma read max length,
+	 * restore it to default value
+	 */
+	/* NOTE(review): max_payload is not refreshed after the readrq is
+	 * changed, so the old value is what reaches ALX_DMA below - confirm
+	 */
+	if (max_payload < ALX_DEV_CTRL_MAXRRS_MIN)
+		pcie_set_readrq(hw->pdev, 128 << ALX_DEV_CTRL_MAXRRS_MIN);
+
+	val = ALX_TXQ_TPD_BURSTPREF_DEF << ALX_TXQ0_TPD_BURSTPREF_SHIFT |
+	      ALX_TXQ0_MODE_ENHANCE | ALX_TXQ0_LSO_8023_EN |
+	      ALX_TXQ0_SUPT_IPOPT |
+	      ALX_TXQ_TXF_BURST_PREF_DEF << ALX_TXQ0_TXF_BURST_PREF_SHIFT;
+	alx_write_mem32(hw, ALX_TXQ0, val);
+	val = ALX_TXQ_TPD_BURSTPREF_DEF << ALX_HQTPD_Q1_NUMPREF_SHIFT |
+	      ALX_TXQ_TPD_BURSTPREF_DEF << ALX_HQTPD_Q2_NUMPREF_SHIFT |
+	      ALX_TXQ_TPD_BURSTPREF_DEF << ALX_HQTPD_Q3_NUMPREF_SHIFT |
+	      ALX_HQTPD_BURST_EN;
+	alx_write_mem32(hw, ALX_HQTPD, val);
+
+	/* rxq, flow control */
+	/* val16 is the same in both branches; only the second threshold
+	 * depends on the RX FIFO length read from ALX_SRAM5
+	 */
+	val = alx_read_mem32(hw, ALX_SRAM5);
+	val = ALX_GET_FIELD(val, ALX_SRAM_RXF_LEN) << 3;
+	if (val > ALX_SRAM_RXF_LEN_8K) {
+		val16 = ALX_MTU_STD_ALGN >> 3;
+		val = (val - ALX_RXQ2_RXF_FLOW_CTRL_RSVD) >> 3;
+	} else {
+		val16 = ALX_MTU_STD_ALGN >> 3;
+		val = (val - ALX_MTU_STD_ALGN) >> 3;
+	}
+	alx_write_mem32(hw, ALX_RXQ2,
+			val16 << ALX_RXQ2_RXF_XOFF_THRESH_SHIFT |
+			val << ALX_RXQ2_RXF_XON_THRESH_SHIFT);
+	val = ALX_RXQ0_NUM_RFD_PREF_DEF << ALX_RXQ0_NUM_RFD_PREF_SHIFT |
+	      ALX_RXQ0_RSS_MODE_DIS << ALX_RXQ0_RSS_MODE_SHIFT |
+	      ALX_RXQ0_IDT_TBL_SIZE_DEF << ALX_RXQ0_IDT_TBL_SIZE_SHIFT |
+	      ALX_RXQ0_RSS_HSTYP_ALL | ALX_RXQ0_RSS_HASH_EN |
+	      ALX_RXQ0_IPV6_PARSE_EN;
+
+	if (alx_hw_giga(hw))
+		ALX_SET_FIELD(val, ALX_RXQ0_ASPM_THRESH,
+			      ALX_RXQ0_ASPM_THRESH_100M);
+
+	alx_write_mem32(hw, ALX_RXQ0, val);
+
+	/* NOTE(review): the value read from ALX_DMA is overwritten on the
+	 * next statement; the read is kept because it is a register access
+	 * - confirm whether it is needed
+	 */
+	val = alx_read_mem32(hw, ALX_DMA);
+	val = ALX_DMA_RORDER_MODE_OUT << ALX_DMA_RORDER_MODE_SHIFT |
+	      ALX_DMA_RREQ_PRI_DATA |
+	      max_payload << ALX_DMA_RREQ_BLEN_SHIFT |
+	      ALX_DMA_WDLY_CNT_DEF << ALX_DMA_WDLY_CNT_SHIFT |
+	      ALX_DMA_RDLY_CNT_DEF << ALX_DMA_RDLY_CNT_SHIFT |
+	      (hw->dma_chnl - 1) << ALX_DMA_RCHNL_SEL_SHIFT;
+	alx_write_mem32(hw, ALX_DMA, val);
+
+	/* default multi-tx-q weights */
+	val = ALX_WRR_PRI_RESTRICT_NONE << ALX_WRR_PRI_SHIFT |
+	      4 << ALX_WRR_PRI0_SHIFT |
+	      4 << ALX_WRR_PRI1_SHIFT |
+	      4 << ALX_WRR_PRI2_SHIFT |
+	      4 << ALX_WRR_PRI3_SHIFT;
+	alx_write_mem32(hw, ALX_WRR, val);
+}
+
+/* Map a combined speed/duplex value (SPEED_x + DUPLEX_y) to the matching
+ * single ethtool ADVERTISED_* bit.  Returns 0 for combinations that are
+ * not handled, including 1000 half.
+ */
+static inline u32 alx_speed_to_ethadv(int speed)
+{
+	switch (speed) {
+	case SPEED_1000 + DUPLEX_FULL:
+		return ADVERTISED_1000baseT_Full;
+	case SPEED_100 + DUPLEX_FULL:
+		return ADVERTISED_100baseT_Full;
+	case SPEED_100 + DUPLEX_HALF:
+		return ADVERTISED_100baseT_Half;
+	case SPEED_10 + DUPLEX_FULL:
+		return ADVERTISED_10baseT_Full;
+	case SPEED_10 + DUPLEX_HALF:
+		return ADVERTISED_10baseT_Half;
+	default:
+		return 0;
+	}
+}
+
+/* Pick the lowest speed the link partner supports and, if it differs
+ * from the current one, renegotiate the link down to it.
+ *
+ * On success *speed holds the selected combined speed/duplex value, or
+ * SPEED_UNKNOWN when there is no link.  When the partner did not
+ * acknowledge autonegotiation the current speed is kept.
+ *
+ * Returns 0 on success, the error of a failed PHY access or link query,
+ * or -ETIMEDOUT when the link does not come back up within
+ * ALX_MAX_SETUP_LNK_CYCLE polls of 100 ms.
+ */
+int alx_select_powersaving_speed(struct alx_hw *hw, int *speed)
+{
+	int i, err, spd;
+	u16 lpa;
+
+	err = alx_get_phy_link(hw, &spd);
+	if (err < 0)
+		return err;
+
+	/* no link: still give the caller a defined value */
+	if (spd == SPEED_UNKNOWN) {
+		*speed = SPEED_UNKNOWN;
+		return 0;
+	}
+
+	err = alx_read_phy_reg(hw, MII_LPA, &lpa);
+	if (err)
+		return err;
+
+	if (!(lpa & LPA_LPACK)) {
+		*speed = spd;
+		return 0;
+	}
+
+	/* slowest mode first */
+	if (lpa & LPA_10FULL)
+		*speed = SPEED_10 + DUPLEX_FULL;
+	else if (lpa & LPA_10HALF)
+		*speed = SPEED_10 + DUPLEX_HALF;
+	else if (lpa & LPA_100FULL)
+		*speed = SPEED_100 + DUPLEX_FULL;
+	else
+		*speed = SPEED_100 + DUPLEX_HALF;
+
+	if (*speed != spd) {
+		/* mask PHY interrupts while the link is renegotiated */
+		err = alx_write_phy_reg(hw, ALX_MII_IER, 0);
+		if (err)
+			return err;
+		err = alx_setup_speed_duplex(hw,
+					     alx_speed_to_ethadv(*speed) |
+					     ADVERTISED_Autoneg,
+					     ALX_FC_ANEG | ALX_FC_RX |
+					     ALX_FC_TX);
+		if (err)
+			return err;
+
+		/* wait for linkup */
+		for (i = 0; i < ALX_MAX_SETUP_LNK_CYCLE; i++) {
+			int speed2;
+
+			msleep(100);
+
+			err = alx_get_phy_link(hw, &speed2);
+			if (err < 0)
+				return err;
+			if (speed2 != SPEED_UNKNOWN)
+				break;
+		}
+		if (i == ALX_MAX_SETUP_LNK_CYCLE)
+			return -ETIMEDOUT;
+	}
+
+	return 0;
+}
+
+/* Read the PHY identifier into hw->phy_id[] and the MMD presence bitmap
+ * into hw->mdio.mmds.  Returns false if any PHY read fails, true
+ * otherwise.
+ */
+bool alx_get_phy_info(struct alx_hw *hw)
+{
+	u16  devs1, devs2;
+
+	if (alx_read_phy_reg(hw, MII_PHYSID1, &hw->phy_id[0]) ||
+	    alx_read_phy_reg(hw, MII_PHYSID2, &hw->phy_id[1]))
+		return false;
+
+	/* since we haven't PMA/PMD status2 register, we can't
+	 * use mdio45_probe function for prtad and mmds.
+	 * use fixed MMD3 to get mmds.
+	 */
+	if (alx_read_phy_ext(hw, 3, MDIO_DEVS1, &devs1) ||
+	    alx_read_phy_ext(hw, 3, MDIO_DEVS2, &devs2))
+		return false;
+	/* widen before shifting so bit 15 of devs2 cannot be shifted into
+	 * the sign bit of a promoted int
+	 */
+	hw->mdio.mmds = devs1 | (u32)devs2 << 16;
+
+	return true;
+}

diff --git a/drivers/net/ethernet/atheros/alx/hw.h b/drivers/net/ethernet/atheros/alx/hw.h
new file mode 100644
index 0000000..65e723d
--- /dev/null
+++ b/drivers/net/ethernet/atheros/alx/hw.h

@@ -0,0 +1,499 @@
+/*
+ * Copyright (c) 2013 Johannes Berg <johannes@sipsolutions.net>
+ *
+ *  This file is free software: you may copy, redistribute and/or modify it
+ *  under the terms of the GNU General Public License as published by the
+ *  Free Software Foundation, either version 2 of the License, or (at your
+ *  option) any later version.
+ *
+ *  This file is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * This file incorporates work covered by the following copyright and
+ * permission notice:
+ *
+ * Copyright (c) 2012 Qualcomm Atheros, Inc.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef ALX_HW_H_
+#define ALX_HW_H_
+#include <linux/types.h>
+#include <linux/mdio.h>
+#include <linux/pci.h>
+#include "reg.h"
+
+/* Transmit Packet Descriptor, contains 4 32-bit words.
+ *
+ *   31               16               0
+ *   +----------------+----------------+
+ *   |    vlan-tag    |   buf length   |
+ *   +----------------+----------------+
+ *   |              Word 1             |
+ *   +----------------+----------------+
+ *   |      Word 2: buf addr lo        |
+ *   +----------------+----------------+
+ *   |      Word 3: buf addr hi        |
+ *   +----------------+----------------+
+ *
+ * Word 2 and 3 combine to form a 64-bit buffer address
+ *
+ * Word 1 has three forms, depending on the state of bits 8, 12 and 13:
+ * if bit8 == '1', the definition is just for custom checksum offload.
+ * if bit8 == '0' && bit12 == '1' && bit13 == '1', the *FIRST* descriptor
+ *     for the skb is special for LSO V2: Word 2 becomes the total skb
+ *     length and Word 3 is meaningless.
+ * in any other case, the definition is for a general skb or for
+ *     ip/tcp/udp checksum or LSO(TSO) offload.
+ *
+ * Here is the depiction:
+ *
+ *   0-+                                  0-+
+ *   1 |                                  1 |
+ *   2 |                                  2 |
+ *   3 |    Payload offset                3 |    L4 header offset
+ *   4 |        (7:0)                     4 |        (7:0)
+ *   5 |                                  5 |
+ *   6 |                                  6 |
+ *   7-+                                  7-+
+ *   8      Custom csum enable = 1        8      Custom csum enable = 0
+ *   9      General IPv4 checksum         9      General IPv4 checksum
+ *   10     General TCP checksum          10     General TCP checksum
+ *   11     General UDP checksum          11     General UDP checksum
+ *   12     Large Send Segment enable     12     Large Send Segment enable
+ *   13     Large Send Segment type       13     Large Send Segment type
+ *   14     VLAN tagged                   14     VLAN tagged
+ *   15     Insert VLAN tag               15     Insert VLAN tag
+ *   16     IPv4 packet                   16     IPv4 packet
+ *   17     Ethernet frame type           17     Ethernet frame type
+ *   18-+                                 18-+
+ *   19 |                                 19 |
+ *   20 |                                 20 |
+ *   21 |   Custom csum offset            21 |
+ *   22 |       (25:18)                   22 |
+ *   23 |                                 23 |   MSS (30:18)
+ *   24 |                                 24 |
+ *   25-+                                 25 |
+ *   26-+                                 26 |
+ *   27 |                                 27 |
+ *   28 |   Reserved                      28 |
+ *   29 |                                 29 |
+ *   30-+                                 30-+
+ *   31     End of packet                 31     End of packet
+ */
+struct alx_txd {
+	__le16 len;		/* word 0, low half: buf length */
+	__le16 vlan_tag;	/* word 0, high half: vlan-tag */
+	__le32 word1;		/* flags and offsets, see TPD_* below */
+	union {
+		__le64 addr;	/* words 2-3: 64-bit buffer address */
+		struct {
+			/* first descriptor of an LSO V2 skb: word 2 is the
+			 * total skb length, word 3 is meaningless
+			 */
+			__le32 pkt_len;
+			__le32 resvd;
+		} l;
+	} adrl;
+} __packed;
+
+/* tpd word 1: each field has a right-aligned _MASK and a _SHIFT giving
+ * its bit position
+ */
+#define TPD_CXSUMSTART_MASK		0x00FF
+#define TPD_CXSUMSTART_SHIFT		0
+#define TPD_L4HDROFFSET_MASK		0x00FF
+#define TPD_L4HDROFFSET_SHIFT		0
+#define TPD_CXSUM_EN_MASK		0x0001
+#define TPD_CXSUM_EN_SHIFT		8
+#define TPD_IP_XSUM_MASK		0x0001
+#define TPD_IP_XSUM_SHIFT		9
+#define TPD_TCP_XSUM_MASK		0x0001
+#define TPD_TCP_XSUM_SHIFT		10
+#define TPD_UDP_XSUM_MASK		0x0001
+#define TPD_UDP_XSUM_SHIFT		11
+#define TPD_LSO_EN_MASK			0x0001
+#define TPD_LSO_EN_SHIFT		12
+#define TPD_LSO_V2_MASK			0x0001
+#define TPD_LSO_V2_SHIFT		13
+#define TPD_VLTAGGED_MASK		0x0001
+#define TPD_VLTAGGED_SHIFT		14
+#define TPD_INS_VLTAG_MASK		0x0001
+#define TPD_INS_VLTAG_SHIFT		15
+#define TPD_IPV4_MASK			0x0001
+#define TPD_IPV4_SHIFT			16
+#define TPD_ETHTYPE_MASK		0x0001
+#define TPD_ETHTYPE_SHIFT		17
+#define TPD_CXSUMOFFSET_MASK		0x00FF
+#define TPD_CXSUMOFFSET_SHIFT		18
+#define TPD_MSS_MASK			0x1FFF
+#define TPD_MSS_SHIFT			18
+#define TPD_EOP_MASK			0x0001
+#define TPD_EOP_SHIFT			31
+
+/* Extract a descriptor field from _x.  _name is pasted directly onto
+ * SHIFT and MASK, so it must include the trailing underscore
+ * (e.g. TPD_EOP_), unlike ALX_GET_FIELD() which adds it itself.
+ */
+#define DESC_GET(_x, _name) ((_x) >> _name##SHIFT & _name##MASK)
+
+/* Receive Free Descriptor: a single little-endian 64-bit buffer address */
+struct alx_rfd {
+	__le64 addr;		/* data buffer address, length is
+				 * declared in register --- every
+				 * buffer has the same size
+				 */
+} __packed;
+
+/* Receive Return Descriptor, contains 4 32-bit words.
+ *
+ *   31               16               0
+ *   +----------------+----------------+
+ *   |              Word 0             |
+ *   +----------------+----------------+
+ *   |     Word 1: RSS Hash value      |
+ *   +----------------+----------------+
+ *   |              Word 2             |
+ *   +----------------+----------------+
+ *   |              Word 3             |
+ *   +----------------+----------------+
+ *
+ * Word 0 depiction         &            Word 2 depiction:
+ *
+ *   0--+                                 0--+
+ *   1  |                                 1  |
+ *   2  |                                 2  |
+ *   3  |                                 3  |
+ *   4  |                                 4  |
+ *   5  |                                 5  |
+ *   6  |                                 6  |
+ *   7  |    IP payload checksum          7  |     VLAN tag
+ *   8  |         (15:0)                  8  |      (15:0)
+ *   9  |                                 9  |
+ *   10 |                                 10 |
+ *   11 |                                 11 |
+ *   12 |                                 12 |
+ *   13 |                                 13 |
+ *   14 |                                 14 |
+ *   15-+                                 15-+
+ *   16-+                                 16-+
+ *   17 |     Number of RFDs              17 |
+ *   18 |        (19:16)                  18 |
+ *   19-+                                 19 |     Protocol ID
+ *   20-+                                 20 |      (23:16)
+ *   21 |                                 21 |
+ *   22 |                                 22 |
+ *   23 |                                 23-+
+ *   24 |                                 24 |     Reserved
+ *   25 |     Start index of RFD-ring     25-+
+ *   26 |         (31:20)                 26 |     RSS Q-num (27:25)
+ *   27 |                                 27-+
+ *   28 |                                 28-+
+ *   29 |                                 29 |     RSS Hash algorithm
+ *   30 |                                 30 |      (31:28)
+ *   31-+                                 31-+
+ *
+ * Word 3 depiction:
+ *
+ *   0--+
+ *   1  |
+ *   2  |
+ *   3  |
+ *   4  |
+ *   5  |
+ *   6  |
+ *   7  |    Packet length (include FCS)
+ *   8  |         (13:0)
+ *   9  |
+ *   10 |
+ *   11 |
+ *   12 |
+ *   13-+
+ *   14      L4 Header checksum error
+ *   15      IPv4 checksum error
+ *   16      VLAN tagged
+ *   17-+
+ *   18 |    Protocol ID (19:17)
+ *   19-+
+ *   20      Receive error summary
+ *   21      FCS(CRC) error
+ *   22      Frame alignment error
+ *   23      Truncated packet
+ *   24      Runt packet
+ *   25      Incomplete packet due to insufficient rx-desc
+ *   26      Broadcast packet
+ *   27      Multicast packet
+ *   28      Ethernet type (EII or 802.3)
+ *   29      FIFO overflow
+ *   30      Length error (for 802.3, length field mismatch with actual len)
+ *   31      Updated, indicate to driver that this RRD is refreshed.
+ */
+struct alx_rrd {
+	__le32 word0;		/* checksum, RFD count, RFD start index */
+	__le32 rss_hash;	/* word 1: RSS hash value */
+	__le32 word2;		/* VLAN tag, protocol ID, RSS queue/algorithm */
+	__le32 word3;		/* packet length, error and status bits */
+} __packed;
+
+/* rrd word 0: IP payload checksum, number of RFDs, RFD-ring start index */
+#define RRD_XSUM_MASK		0xFFFF
+#define RRD_XSUM_SHIFT		0
+#define RRD_NOR_MASK		0x000F
+#define RRD_NOR_SHIFT		16
+#define RRD_SI_MASK		0x0FFF
+#define RRD_SI_SHIFT		20
+
+/* rrd word 2 */
+#define RRD_VLTAG_MASK		0xFFFF
+#define RRD_VLTAG_SHIFT		0
+#define RRD_PID_MASK		0x00FF
+#define RRD_PID_SHIFT		16
+/* values of the RRD_PID field follow */
+/* non-ip packet */
+#define RRD_PID_NONIP		0
+/* ipv4(only) */
+#define RRD_PID_IPV4		1
+/* tcp/ipv6 */
+#define RRD_PID_IPV6TCP		2
+/* tcp/ipv4 */
+#define RRD_PID_IPV4TCP		3
+/* udp/ipv6 */
+#define RRD_PID_IPV6UDP		4
+/* udp/ipv4 */
+#define RRD_PID_IPV4UDP		5
+/* ipv6(only) */
+#define RRD_PID_IPV6		6
+/* LLDP packet */
+#define RRD_PID_LLDP		7
+/* 1588 packet */
+#define RRD_PID_1588		8
+#define RRD_RSSQ_MASK		0x0007
+#define RRD_RSSQ_SHIFT		25
+#define RRD_RSSALG_MASK		0x000F
+#define RRD_RSSALG_SHIFT	28
+#define RRD_RSSALG_TCPV6	0x1
+#define RRD_RSSALG_IPV6		0x2
+#define RRD_RSSALG_TCPV4	0x4
+#define RRD_RSSALG_IPV4		0x8
+
+/* rrd word 3 */
+#define RRD_PKTLEN_MASK		0x3FFF
+#define RRD_PKTLEN_SHIFT	0
+#define RRD_ERR_L4_MASK		0x0001
+#define RRD_ERR_L4_SHIFT	14
+#define RRD_ERR_IPV4_MASK	0x0001
+#define RRD_ERR_IPV4_SHIFT	15
+#define RRD_VLTAGGED_MASK	0x0001
+#define RRD_VLTAGGED_SHIFT	16
+/* 3-bit protocol ID held in word 3 (bits 19:17) */
+#define RRD_OLD_PID_MASK	0x0007
+#define RRD_OLD_PID_SHIFT	17
+/* receive error summary */
+#define RRD_ERR_RES_MASK	0x0001
+#define RRD_ERR_RES_SHIFT	20
+#define RRD_ERR_FCS_MASK	0x0001
+#define RRD_ERR_FCS_SHIFT	21
+/* frame alignment error */
+#define RRD_ERR_FAE_MASK	0x0001
+#define RRD_ERR_FAE_SHIFT	22
+#define RRD_ERR_TRUNC_MASK	0x0001
+#define RRD_ERR_TRUNC_SHIFT	23
+#define RRD_ERR_RUNT_MASK	0x0001
+#define RRD_ERR_RUNT_SHIFT	24
+/* bit 25: incomplete packet due to insufficient rx-desc */
+#define RRD_ERR_ICMP_MASK	0x0001
+#define RRD_ERR_ICMP_SHIFT	25
+#define RRD_BCAST_MASK		0x0001
+#define RRD_BCAST_SHIFT		26
+#define RRD_MCAST_MASK		0x0001
+#define RRD_MCAST_SHIFT		27
+/* Ethernet type (EII or 802.3) */
+#define RRD_ETHTYPE_MASK	0x0001
+#define RRD_ETHTYPE_SHIFT	28
+/* FIFO overflow */
+#define RRD_ERR_FIFOV_MASK	0x0001
+#define RRD_ERR_FIFOV_SHIFT	29
+#define RRD_ERR_LEN_MASK	0x0001
+#define RRD_ERR_LEN_SHIFT	30
+/* set when this RRD has been refreshed by the hardware */
+#define RRD_UPDATED_MASK	0x0001
+#define RRD_UPDATED_SHIFT	31
+
+
+/* number of 100 ms polls alx_select_powersaving_speed() waits for linkup */
+#define ALX_MAX_SETUP_LNK_CYCLE	50
+
+/* for FlowControl */
+#define ALX_FC_RX		0x01
+#define ALX_FC_TX		0x02
+#define ALX_FC_ANEG		0x04
+
+/* for sleep control */
+#define ALX_SLEEP_WOL_PHY	0x00000001
+#define ALX_SLEEP_WOL_MAGIC	0x00000002
+#define ALX_SLEEP_CIFS		0x00000004
+/* union of all sleep control bits above */
+#define ALX_SLEEP_ACTIVE	(ALX_SLEEP_WOL_PHY | \
+				 ALX_SLEEP_WOL_MAGIC | \
+				 ALX_SLEEP_CIFS)
+
+/* for RSS hash type */
+#define ALX_RSS_HASH_TYPE_IPV4		0x1
+#define ALX_RSS_HASH_TYPE_IPV4_TCP	0x2
+#define ALX_RSS_HASH_TYPE_IPV6		0x4
+#define ALX_RSS_HASH_TYPE_IPV6_TCP	0x8
+#define ALX_RSS_HASH_TYPE_ALL		(ALX_RSS_HASH_TYPE_IPV4 | \
+					 ALX_RSS_HASH_TYPE_IPV4_TCP | \
+					 ALX_RSS_HASH_TYPE_IPV6 | \
+					 ALX_RSS_HASH_TYPE_IPV6_TCP)
+#define ALX_DEF_RXBUF_SIZE	1536
+#define ALX_MAX_JUMBO_PKT_SIZE	(9*1024)
+#define ALX_MAX_TSO_PKT_SIZE	(7*1024)
+#define ALX_MAX_FRAME_SIZE	ALX_MAX_JUMBO_PKT_SIZE
+#define ALX_MIN_FRAME_SIZE	68
+/* MTU plus Ethernet header, FCS and one VLAN header; the argument is
+ * parenthesized so that expressions such as (a ? b : c) expand correctly
+ */
+#define ALX_RAW_MTU(_mtu)	((_mtu) + ETH_HLEN + ETH_FCS_LEN + VLAN_HLEN)
+
+#define ALX_MAX_RX_QUEUES	8
+#define ALX_MAX_TX_QUEUES	4
+#define ALX_MAX_HANDLED_INTRS	5
+
+#define ALX_ISR_MISC		(ALX_ISR_PCIE_LNKDOWN | \
+				 ALX_ISR_DMAW | \
+				 ALX_ISR_DMAR | \
+				 ALX_ISR_SMB | \
+				 ALX_ISR_MANU | \
+				 ALX_ISR_TIMER)
+
+/* interrupt causes that make alx_intr_handle() schedule a reset */
+#define ALX_ISR_FATAL		(ALX_ISR_PCIE_LNKDOWN | \
+				 ALX_ISR_DMAW | ALX_ISR_DMAR)
+
+/* interrupt causes that alx_intr_handle() only logs a warning for */
+#define ALX_ISR_ALERT		(ALX_ISR_RXF_OV | \
+				 ALX_ISR_TXF_UR | \
+				 ALX_ISR_RFD_UR)
+
+/* every tx/rx queue interrupt; masked as a group while NAPI is scheduled */
+#define ALX_ISR_ALL_QUEUES	(ALX_ISR_TX_Q0 | \
+				 ALX_ISR_TX_Q1 | \
+				 ALX_ISR_TX_Q2 | \
+				 ALX_ISR_TX_Q3 | \
+				 ALX_ISR_RX_Q0 | \
+				 ALX_ISR_RX_Q1 | \
+				 ALX_ISR_RX_Q2 | \
+				 ALX_ISR_RX_Q3 | \
+				 ALX_ISR_RX_Q4 | \
+				 ALX_ISR_RX_Q5 | \
+				 ALX_ISR_RX_Q6 | \
+				 ALX_ISR_RX_Q7)
+
+/* maximum interrupt vectors for msix */
+#define ALX_MAX_MSIX_INTRS	16
+
+/* Extract field _field from _data using _field_SHIFT and _field_MASK. */
+#define ALX_GET_FIELD(_data, _field)					\
+	(((_data) >> _field ## _SHIFT) & _field ## _MASK)
+
+/* Replace field _field in the lvalue _data with _value (truncated to the
+ * field's mask).  _data is evaluated twice, so it must be free of side
+ * effects.
+ */
+#define ALX_SET_FIELD(_data, _field, _value)	do {			\
+		(_data) &= ~(_field ## _MASK << _field ## _SHIFT);	\
+		(_data) |= ((_value) & _field ## _MASK) << _field ## _SHIFT;\
+	} while (0)
+
+/* Hardware state of one device, as used by the register and PHY helpers. */
+struct alx_hw {
+	struct pci_dev *pdev;
+	/* base that the alx_{read,write}_mem*() helpers add offsets to */
+	u8 __iomem *hw_addr;
+
+	/* current & permanent mac addr */
+	u8 mac_addr[ETH_ALEN];
+	u8 perm_addr[ETH_ALEN];
+
+	u16 mtu;
+	/* alx_request_irq() derives the MSI retransmit timer from this */
+	u16 imt;
+	u8 dma_chnl;
+	/* set by alx_identify_hw(): 4 from revision B0 on, otherwise 2 */
+	u8 max_dma_chnl;
+	/* tpd threshold to trig INT */
+	u32 ith_tpd;
+	/* value written to ALX_MAC_CTRL by the rx mode code */
+	u32 rx_ctrl;
+	u32 mc_hash[2];
+
+	u32 smb_timer;
+	/* SPEED_* + DUPLEX_*, SPEED_UNKNOWN if link is down */
+	int link_speed;
+
+	/* auto-neg advertisement or force mode config */
+	u32 adv_cfg;
+	/* ALX_FC_* bits */
+	u8 flowctrl;
+
+	/* ALX_SLEEP_* bits */
+	u32 sleep_ctrl;
+
+	spinlock_t mdio_lock;
+	struct mdio_if_info mdio;
+	/* MII_PHYSID1 and MII_PHYSID2, filled in by alx_get_phy_info() */
+	u16 phy_id[2];
+
+	/* PHY link patch flag */
+	bool lnk_patch;
+};
+
+/* PCI revision ID with its low ALX_PCI_REVID_SHIFT bits shifted out. */
+static inline int alx_hw_revision(struct alx_hw *hw)
+{
+	struct pci_dev *pdev = hw->pdev;
+
+	return pdev->revision >> ALX_PCI_REVID_SHIFT;
+}
+
+/* True when bit 0 of the PCI revision ID is set. */
+static inline bool alx_hw_with_cr(struct alx_hw *hw)
+{
+	struct pci_dev *pdev = hw->pdev;
+
+	return (pdev->revision & 1) != 0;
+}
+
+/* True when bit 0 of the PCI device ID is set. */
+static inline bool alx_hw_giga(struct alx_hw *hw)
+{
+	struct pci_dev *pdev = hw->pdev;
+
+	return (pdev->device & 1) != 0;
+}
+
+/* Write the byte @val at offset @reg from hw->hw_addr. */
+static inline void alx_write_mem8(struct alx_hw *hw, u32 reg, u8 val)
+{
+	u8 __iomem *addr = hw->hw_addr + reg;
+
+	writeb(val, addr);
+}
+
+/* Write the 16-bit value @val at offset @reg from hw->hw_addr. */
+static inline void alx_write_mem16(struct alx_hw *hw, u32 reg, u16 val)
+{
+	u8 __iomem *addr = hw->hw_addr + reg;
+
+	writew(val, addr);
+}
+
+/* Read a 16-bit value from offset @reg from hw->hw_addr. */
+static inline u16 alx_read_mem16(struct alx_hw *hw, u32 reg)
+{
+	u8 __iomem *addr = hw->hw_addr + reg;
+
+	return readw(addr);
+}
+
+/* Write the 32-bit value @val at offset @reg from hw->hw_addr. */
+static inline void alx_write_mem32(struct alx_hw *hw, u32 reg, u32 val)
+{
+	u8 __iomem *addr = hw->hw_addr + reg;
+
+	writel(val, addr);
+}
+
+/* Read a 32-bit value from offset @reg from hw->hw_addr. */
+static inline u32 alx_read_mem32(struct alx_hw *hw, u32 reg)
+{
+	u8 __iomem *addr = hw->hw_addr + reg;
+
+	return readl(addr);
+}
+
+/* Issue a 32-bit read at hw->hw_addr and discard the result; going by the
+ * name, this is presumably meant to flush preceding posted writes.
+ */
+static inline void alx_post_write(struct alx_hw *hw)
+{
+	(void)readl(hw->hw_addr);
+}
+
+int alx_get_perm_macaddr(struct alx_hw *hw, u8 *addr);
+void alx_reset_phy(struct alx_hw *hw);
+void alx_reset_pcie(struct alx_hw *hw);
+void alx_enable_aspm(struct alx_hw *hw, bool l0s_en, bool l1_en);
+int alx_setup_speed_duplex(struct alx_hw *hw, u32 ethadv, u8 flowctrl);
+void alx_post_phy_link(struct alx_hw *hw);
+int alx_pre_suspend(struct alx_hw *hw, int speed);
+int alx_read_phy_reg(struct alx_hw *hw, u16 reg, u16 *phy_data);
+int alx_write_phy_reg(struct alx_hw *hw, u16 reg, u16 phy_data);
+int alx_read_phy_ext(struct alx_hw *hw, u8 dev, u16 reg, u16 *pdata);
+int alx_write_phy_ext(struct alx_hw *hw, u8 dev, u16 reg, u16 data);
+int alx_get_phy_link(struct alx_hw *hw, int *speed);
+int alx_clear_phy_intr(struct alx_hw *hw);
+int alx_config_wol(struct alx_hw *hw);
+void alx_cfg_mac_flowcontrol(struct alx_hw *hw, u8 fc);
+void alx_start_mac(struct alx_hw *hw);
+int alx_reset_mac(struct alx_hw *hw);
+void alx_set_macaddr(struct alx_hw *hw, const u8 *addr);
+bool alx_phy_configured(struct alx_hw *hw);
+void alx_configure_basic(struct alx_hw *hw);
+void alx_disable_rss(struct alx_hw *hw);
+int alx_select_powersaving_speed(struct alx_hw *hw, int *speed);
+bool alx_get_phy_info(struct alx_hw *hw);
+
+#endif

diff --git a/drivers/net/ethernet/atheros/alx/main.c b/drivers/net/ethernet/atheros/alx/main.c
new file mode 100644
index 0000000..418de8b
--- /dev/null
+++ b/drivers/net/ethernet/atheros/alx/main.c

@@ -0,0 +1,1625 @@
+/*
+ * Copyright (c) 2013 Johannes Berg <johannes@sipsolutions.net>
+ *
+ *  This file is free software: you may copy, redistribute and/or modify it
+ *  under the terms of the GNU General Public License as published by the
+ *  Free Software Foundation, either version 2 of the License, or (at your
+ *  option) any later version.
+ *
+ *  This file is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * This file incorporates work covered by the following copyright and
+ * permission notice:
+ *
+ * Copyright (c) 2012 Qualcomm Atheros, Inc.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/interrupt.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/if_vlan.h>
+#include <linux/mdio.h>
+#include <linux/aer.h>
+#include <linux/bitops.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <net/ip6_checksum.h>
+#include <linux/crc32.h>
+#include "alx.h"
+#include "hw.h"
+#include "reg.h"
+
+/* driver name string; non-static, so visible to other files of the driver */
+const char alx_drv_name[] = "alx";
+
+
+/* Release transmit buffer slot @entry: undo its DMA mapping if one is
+ * recorded, then free the attached skb if there is one.
+ */
+static void alx_free_txbuf(struct alx_priv *alx, int entry)
+{
+	struct alx_buffer *buf = alx->txq.bufs + entry;
+	struct device *dev = &alx->hw.pdev->dev;
+
+	if (dma_unmap_len(buf, size) != 0) {
+		dma_unmap_single(dev,
+				 dma_unmap_addr(buf, dma),
+				 dma_unmap_len(buf, size),
+				 DMA_TO_DEVICE);
+		/* a zero length marks the slot as unmapped */
+		dma_unmap_len_set(buf, size, 0);
+	}
+
+	if (!buf->skb)
+		return;
+
+	dev_kfree_skb_any(buf->skb);
+	buf->skb = NULL;
+}
+
+/* Attach fresh receive buffers to empty RFD slots, starting at the
+ * software write index and stopping one slot short of the read index.
+ *
+ * Stops early when an skb cannot be allocated or mapped.  If at least one
+ * buffer was added, the new write index is stored and published to the
+ * hardware through ALX_RFD_PIDX.
+ *
+ * Returns the number of buffers added.
+ */
+static int alx_refill_rx_ring(struct alx_priv *alx, gfp_t gfp)
+{
+	struct alx_rx_queue *rxq = &alx->rxq;
+	struct sk_buff *skb;
+	struct alx_buffer *cur_buf;
+	dma_addr_t dma;
+	u16 cur, next, count = 0;
+
+	next = cur = rxq->write_idx;
+	if (++next == alx->rx_ringsz)
+		next = 0;
+	cur_buf = &rxq->bufs[cur];
+
+	while (!cur_buf->skb && next != rxq->read_idx) {
+		struct alx_rfd *rfd = &rxq->rfd[cur];
+
+		skb = __netdev_alloc_skb(alx->dev, alx->rxbuf_size, gfp);
+		if (!skb)
+			break;
+		dma = dma_map_single(&alx->hw.pdev->dev,
+				     skb->data, alx->rxbuf_size,
+				     DMA_FROM_DEVICE);
+		if (dma_mapping_error(&alx->hw.pdev->dev, dma)) {
+			dev_kfree_skb(skb);
+			break;
+		}
+
+		/* Unfortunately, RX descriptor buffers must be 4-byte
+		 * aligned, so we can't use IP alignment.
+		 */
+		if (WARN_ON(dma & 3)) {
+			/* the buffer is already mapped here, so release
+			 * the mapping as well as the skb
+			 */
+			dma_unmap_single(&alx->hw.pdev->dev, dma,
+					 alx->rxbuf_size, DMA_FROM_DEVICE);
+			dev_kfree_skb(skb);
+			break;
+		}
+
+		cur_buf->skb = skb;
+		dma_unmap_len_set(cur_buf, size, alx->rxbuf_size);
+		dma_unmap_addr_set(cur_buf, dma, dma);
+		rfd->addr = cpu_to_le64(dma);
+
+		cur = next;
+		if (++next == alx->rx_ringsz)
+			next = 0;
+		cur_buf = &rxq->bufs[cur];
+		count++;
+	}
+
+	if (count) {
+		/* flush all updates before updating hardware */
+		wmb();
+		rxq->write_idx = cur;
+		alx_write_mem16(&alx->hw, ALX_RFD_PIDX, cur);
+	}
+
+	return count;
+}
+
+/* Number of free transmit descriptors; one slot is always kept unused. */
+static inline int alx_tpd_avail(struct alx_priv *alx)
+{
+	struct alx_tx_queue *q = &alx->txq;
+
+	/* writer is behind the reader in the array: the gap is the space */
+	if (q->write_idx < q->read_idx)
+		return q->read_idx - q->write_idx - 1;
+
+	/* otherwise the free space wraps around the end of the ring */
+	return alx->tx_ringsz + q->read_idx - q->write_idx - 1;
+}
+
+/* Reclaim transmit slots up to the hardware consumer index, bounded by
+ * ALX_DEFAULT_TX_WORK packets, and wake the queue when enough descriptors
+ * are free again.
+ *
+ * Returns true when the software index has caught up with the hardware.
+ */
+static bool alx_clean_tx_irq(struct alx_priv *alx)
+{
+	struct alx_tx_queue *txq = &alx->txq;
+	unsigned int bytes = 0, pkts = 0;
+	int work_left = ALX_DEFAULT_TX_WORK;
+	u16 clean_idx = txq->read_idx;
+	u16 hw_idx = alx_read_mem16(&alx->hw, ALX_TPD_PRI0_CIDX);
+	bool pending = clean_idx != hw_idx;
+
+	while (clean_idx != hw_idx && work_left > 0) {
+		struct sk_buff *skb = txq->bufs[clean_idx].skb;
+
+		/* only slots that carry an skb count as a packet */
+		if (skb) {
+			bytes += skb->len;
+			pkts++;
+			work_left--;
+		}
+
+		alx_free_txbuf(alx, clean_idx);
+
+		clean_idx++;
+		if (clean_idx == alx->tx_ringsz)
+			clean_idx = 0;
+	}
+
+	if (pending) {
+		txq->read_idx = clean_idx;
+		netdev_completed_queue(alx->dev, pkts, bytes);
+	}
+
+	if (netif_queue_stopped(alx->dev) && netif_carrier_ok(alx->dev) &&
+	    alx_tpd_avail(alx) > alx->tx_ringsz/4)
+		netif_wake_queue(alx->dev);
+
+	return clean_idx == hw_idx;
+}
+
+/* Queue the link-check work item. */
+static void alx_schedule_link_check(struct alx_priv *alx)
+{
+	struct work_struct *work = &alx->link_check_wk;
+
+	schedule_work(work);
+}
+
+/* Queue the reset work item. */
+static void alx_schedule_reset(struct alx_priv *alx)
+{
+	struct work_struct *work = &alx->reset_wk;
+
+	schedule_work(work);
+}
+
+/* Process received packets, delivering at most @budget of them.
+ *
+ * Walks the RRD ring from rrd_read_idx while the descriptors carry the
+ * "updated" bit, hands good packets to GRO and refills the RFD ring as
+ * buffers are consumed.  A descriptor that does not match the expected
+ * RFD index, or that spans other than exactly one RFD, schedules a reset.
+ *
+ * Returns true if budget is left over, false if the budget was used up or
+ * a reset was scheduled.
+ */
+static bool alx_clean_rx_irq(struct alx_priv *alx, int budget)
+{
+	struct alx_rx_queue *rxq = &alx->rxq;
+	struct alx_rrd *rrd;
+	struct alx_buffer *rxb;
+	struct sk_buff *skb;
+	u16 length, rfd_cleaned = 0;
+
+	while (budget > 0) {
+		rrd = &rxq->rrd[rxq->rrd_read_idx];
+		/* stop at the first descriptor not yet marked as updated */
+		if (!(rrd->word3 & cpu_to_le32(1 << RRD_UPDATED_SHIFT)))
+			break;
+		rrd->word3 &= ~cpu_to_le32(1 << RRD_UPDATED_SHIFT);
+
+		/* the descriptor must refer to exactly the one RFD that is
+		 * next in software order
+		 */
+		if (ALX_GET_FIELD(le32_to_cpu(rrd->word0),
+				  RRD_SI) != rxq->read_idx ||
+		    ALX_GET_FIELD(le32_to_cpu(rrd->word0),
+				  RRD_NOR) != 1) {
+			alx_schedule_reset(alx);
+			return 0;
+		}
+
+		rxb = &rxq->bufs[rxq->read_idx];
+		dma_unmap_single(&alx->hw.pdev->dev,
+				 dma_unmap_addr(rxb, dma),
+				 dma_unmap_len(rxb, size),
+				 DMA_FROM_DEVICE);
+		dma_unmap_len_set(rxb, size, 0);
+		skb = rxb->skb;
+		rxb->skb = NULL;
+
+		/* drop errored packets; they do not consume budget */
+		if (rrd->word3 & cpu_to_le32(1 << RRD_ERR_RES_SHIFT) ||
+		    rrd->word3 & cpu_to_le32(1 << RRD_ERR_LEN_SHIFT)) {
+			rrd->word3 = 0;
+			dev_kfree_skb_any(skb);
+			goto next_pkt;
+		}
+
+		/* the reported length includes the FCS, which is cut off */
+		length = ALX_GET_FIELD(le32_to_cpu(rrd->word3),
+				       RRD_PKTLEN) - ETH_FCS_LEN;
+		skb_put(skb, length);
+		skb->protocol = eth_type_trans(skb, alx->dev);
+
+		skb_checksum_none_assert(skb);
+		/* trust the hardware checksum only for TCP/UDP packets
+		 * that have neither an L4 nor an IPv4 checksum error
+		 */
+		if (alx->dev->features & NETIF_F_RXCSUM &&
+		    !(rrd->word3 & (cpu_to_le32(1 << RRD_ERR_L4_SHIFT) |
+				    cpu_to_le32(1 << RRD_ERR_IPV4_SHIFT)))) {
+			switch (ALX_GET_FIELD(le32_to_cpu(rrd->word2),
+					      RRD_PID)) {
+			case RRD_PID_IPV6UDP:
+			case RRD_PID_IPV4UDP:
+			case RRD_PID_IPV4TCP:
+			case RRD_PID_IPV6TCP:
+				skb->ip_summed = CHECKSUM_UNNECESSARY;
+				break;
+			}
+		}
+
+		napi_gro_receive(&alx->napi, skb);
+		budget--;
+
+next_pkt:
+		/* the RFD and RRD indices advance in lock step */
+		if (++rxq->read_idx == alx->rx_ringsz)
+			rxq->read_idx = 0;
+		if (++rxq->rrd_read_idx == alx->rx_ringsz)
+			rxq->rrd_read_idx = 0;
+
+		/* refill in batches once enough buffers were consumed */
+		if (++rfd_cleaned > ALX_RX_ALLOC_THRESH)
+			rfd_cleaned -= alx_refill_rx_ring(alx, GFP_ATOMIC);
+	}
+
+	if (rfd_cleaned)
+		alx_refill_rx_ring(alx, GFP_ATOMIC);
+
+	return budget > 0;
+}
+
+/* NAPI poll handler: reclaim transmit slots and process up to @budget
+ * received packets.
+ *
+ * Both rings are serviced on every call, so pending transmit work can
+ * never keep received packets from being processed.  While either ring
+ * still has work, the full @budget is returned without completing NAPI,
+ * which is what the NAPI core expects of a handler that wants to be
+ * polled again.  Otherwise NAPI is completed, the queue interrupts are
+ * unmasked again and 0 is returned.
+ */
+static int alx_poll(struct napi_struct *napi, int budget)
+{
+	struct alx_priv *alx = container_of(napi, struct alx_priv, napi);
+	struct alx_hw *hw = &alx->hw;
+	bool tx_complete, rx_complete;
+	unsigned long flags;
+
+	tx_complete = alx_clean_tx_irq(alx);
+	rx_complete = alx_clean_rx_irq(alx, budget);
+
+	if (!tx_complete || !rx_complete)
+		return budget;
+
+	napi_complete(&alx->napi);
+
+	/* enable interrupt */
+	spin_lock_irqsave(&alx->irq_lock, flags);
+	alx->int_mask |= ALX_ISR_TX_Q0 | ALX_ISR_RX_Q0;
+	alx_write_mem32(hw, ALX_IMR, alx->int_mask);
+	spin_unlock_irqrestore(&alx->irq_lock, flags);
+
+	alx_post_write(hw);
+
+	return 0;
+}
+
+/* Common interrupt body shared by the MSI and legacy handlers.
+ *
+ * @intr is the raw ALX_ISR value read by the caller.  It is acknowledged
+ * (with ALX_ISR_DIS set), filtered by the software interrupt mask and then
+ * dispatched: fatal causes schedule a reset, PHY events schedule a link
+ * check, and queue events schedule NAPI with the queue interrupts masked.
+ *
+ * Always returns IRQ_HANDLED.
+ */
+static irqreturn_t alx_intr_handle(struct alx_priv *alx, u32 intr)
+{
+	struct alx_hw *hw = &alx->hw;
+	bool write_int_mask = false;
+
+	spin_lock(&alx->irq_lock);
+
+	/* ACK interrupt */
+	alx_write_mem32(hw, ALX_ISR, intr | ALX_ISR_DIS);
+	intr &= alx->int_mask;
+
+	/* NOTE: the fatal path skips the final write of 0 to ALX_ISR, so
+	 * ALX_ISR_DIS written above stays in effect
+	 */
+	if (intr & ALX_ISR_FATAL) {
+		netif_warn(alx, hw, alx->dev,
+			   "fatal interrupt 0x%x, resetting\n", intr);
+		alx_schedule_reset(alx);
+		goto out;
+	}
+
+	if (intr & ALX_ISR_ALERT)
+		netdev_warn(alx->dev, "alert interrupt: 0x%x\n", intr);
+
+	if (intr & ALX_ISR_PHY) {
+		/* suppress the PHY interrupt: its source is internal to
+		 * the PHY, and the interrupt status can only be cleared
+		 * once the PHY's internal status has been cleared.
+		 */
+		alx->int_mask &= ~ALX_ISR_PHY;
+		write_int_mask = true;
+		alx_schedule_link_check(alx);
+	}
+
+	if (intr & (ALX_ISR_TX_Q0 | ALX_ISR_RX_Q0)) {
+		napi_schedule(&alx->napi);
+		/* mask rx/tx interrupt, enable them when napi complete */
+		alx->int_mask &= ~ALX_ISR_ALL_QUEUES;
+		write_int_mask = true;
+	}
+
+	if (write_int_mask)
+		alx_write_mem32(hw, ALX_IMR, alx->int_mask);
+
+	/* drop ALX_ISR_DIS again now that the mask is up to date */
+	alx_write_mem32(hw, ALX_ISR, 0);
+
+ out:
+	spin_unlock(&alx->irq_lock);
+	return IRQ_HANDLED;
+}
+
+/* MSI handler: read the interrupt status and hand it to the common body. */
+static irqreturn_t alx_intr_msi(int irq, void *data)
+{
+	struct alx_priv *alx = data;
+	u32 status = alx_read_mem32(&alx->hw, ALX_ISR);
+
+	return alx_intr_handle(alx, status);
+}
+
+/* Legacy (shareable) interrupt handler.
+ *
+ * Returns IRQ_NONE when interrupts are flagged as disabled in the status
+ * or when none of the enabled causes is pending; otherwise the status is
+ * passed on to the common body.
+ */
+static irqreturn_t alx_intr_legacy(int irq, void *data)
+{
+	struct alx_priv *alx = data;
+	u32 status = alx_read_mem32(&alx->hw, ALX_ISR);
+
+	if (!(status & ALX_ISR_DIS) && (status & alx->int_mask))
+		return alx_intr_handle(alx, status);
+
+	return IRQ_NONE;
+}
+
+/* Reset the software ring indices and program the ring base addresses,
+ * ring sizes and receive buffer size into the hardware.
+ */
+static void alx_init_ring_ptrs(struct alx_priv *alx)
+{
+	struct alx_hw *hw = &alx->hw;
+	struct alx_rx_queue *rxq = &alx->rxq;
+	struct alx_tx_queue *txq = &alx->txq;
+	/* all rings share the upper 32 address bits of the descriptor area */
+	u32 desc_hi = ((u64)alx->descmem.dma) >> 32;
+
+	/* receive side */
+	rxq->read_idx = 0;
+	rxq->write_idx = 0;
+	rxq->rrd_read_idx = 0;
+	alx_write_mem32(hw, ALX_RX_BASE_ADDR_HI, desc_hi);
+	alx_write_mem32(hw, ALX_RRD_ADDR_LO, rxq->rrd_dma);
+	alx_write_mem32(hw, ALX_RRD_RING_SZ, alx->rx_ringsz);
+	alx_write_mem32(hw, ALX_RFD_ADDR_LO, rxq->rfd_dma);
+	alx_write_mem32(hw, ALX_RFD_RING_SZ, alx->rx_ringsz);
+	alx_write_mem32(hw, ALX_RFD_BUF_SZ, alx->rxbuf_size);
+
+	/* transmit side */
+	txq->read_idx = 0;
+	txq->write_idx = 0;
+	alx_write_mem32(hw, ALX_TX_BASE_ADDR_HI, desc_hi);
+	alx_write_mem32(hw, ALX_TPD_PRI0_ADDR_LO, txq->tpd_dma);
+	alx_write_mem32(hw, ALX_TPD_RING_SZ, alx->tx_ringsz);
+
+	/* load these pointers into the chip */
+	alx_write_mem32(hw, ALX_SRAM9, ALX_SRAM_LOAD_PTR);
+}
+
+/* Release every transmit buffer, zero the buffer array and the descriptor
+ * ring, and reset the indices and the netdev queue accounting.  Does
+ * nothing if the buffer array is not allocated.
+ */
+static void alx_free_txring_buf(struct alx_priv *alx)
+{
+	struct alx_tx_queue *q = &alx->txq;
+	int slot = 0;
+
+	if (q->bufs == NULL)
+		return;
+
+	while (slot < alx->tx_ringsz) {
+		alx_free_txbuf(alx, slot);
+		slot++;
+	}
+
+	memset(q->bufs, 0, alx->tx_ringsz * sizeof(struct alx_buffer));
+	memset(q->tpd, 0, alx->tx_ringsz * sizeof(struct alx_txd));
+	q->write_idx = 0;
+	q->read_idx = 0;
+
+	netdev_reset_queue(alx->dev);
+}
+
+/* Unmap and free every receive buffer still attached to the ring and
+ * reset the receive indices.  Does nothing if the buffer array is not
+ * allocated, mirroring alx_free_txring_buf().
+ */
+static void alx_free_rxring_buf(struct alx_priv *alx)
+{
+	struct alx_rx_queue *rxq = &alx->rxq;
+	struct alx_buffer *cur_buf;
+	u16 i;
+
+	/* rxq is the address of a member and can never be NULL; what may
+	 * be missing is the buffer array
+	 */
+	if (!rxq->bufs)
+		return;
+
+	for (i = 0; i < alx->rx_ringsz; i++) {
+		cur_buf = rxq->bufs + i;
+		if (cur_buf->skb) {
+			dma_unmap_single(&alx->hw.pdev->dev,
+					 dma_unmap_addr(cur_buf, dma),
+					 dma_unmap_len(cur_buf, size),
+					 DMA_FROM_DEVICE);
+			dev_kfree_skb(cur_buf->skb);
+			cur_buf->skb = NULL;
+			dma_unmap_len_set(cur_buf, size, 0);
+			dma_unmap_addr_set(cur_buf, dma, 0);
+		}
+	}
+
+	rxq->write_idx = 0;
+	rxq->read_idx = 0;
+	rxq->rrd_read_idx = 0;
+}
+
+/* Release all packet buffers of the transmit ring, then the receive ring. */
+static void alx_free_buffers(struct alx_priv *alx)
+{
+	alx_free_txring_buf(alx);
+	alx_free_rxring_buf(alx);
+}
+
+/* Drop all packet buffers, reprogram the ring pointers and repopulate the
+ * receive ring.
+ *
+ * Returns 0 on success, -ENOMEM if not a single receive buffer could be
+ * added.
+ */
+static int alx_reinit_rings(struct alx_priv *alx)
+{
+	int filled;
+
+	alx_free_buffers(alx);
+	alx_init_ring_ptrs(alx);
+
+	filled = alx_refill_rx_ring(alx, GFP_KERNEL);
+
+	return filled ? 0 : -ENOMEM;
+}
+
+/* Set the bit for multicast address @addr in the two-word hash table
+ * @mc_hash: bit 31 of the address CRC picks the word, bits 30:26 pick the
+ * bit within it.  @hw is not used.
+ */
+static void alx_add_mc_addr(struct alx_hw *hw, const u8 *addr, u32 *mc_hash)
+{
+	u32 crc = ether_crc(ETH_ALEN, addr);
+	u32 word = (crc >> 31) & 0x1;
+	u32 pos = (crc >> 26) & 0x1F;
+
+	mc_hash[word] |= BIT(pos);
+}
+
+/* Program the multicast hash table (unless all multicast is accepted
+ * anyway) and the promiscuous/all-multicast bits of the MAC control
+ * register from the interface flags.
+ */
+static void __alx_set_rx_mode(struct net_device *netdev)
+{
+	struct alx_priv *alx = netdev_priv(netdev);
+	struct alx_hw *hw = &alx->hw;
+	u32 hash[2] = {};
+	u32 ctrl;
+
+	if (!(netdev->flags & IFF_ALLMULTI)) {
+		struct netdev_hw_addr *mc;
+
+		netdev_for_each_mc_addr(mc, netdev)
+			alx_add_mc_addr(hw, mc->addr, hash);
+
+		alx_write_mem32(hw, ALX_HASH_TBL0, hash[0]);
+		alx_write_mem32(hw, ALX_HASH_TBL1, hash[1]);
+	}
+
+	/* start from both mode bits cleared, then set what is requested */
+	ctrl = hw->rx_ctrl;
+	ctrl &= ~(ALX_MAC_CTRL_MULTIALL_EN | ALX_MAC_CTRL_PROMISC_EN);
+	if (netdev->flags & IFF_PROMISC)
+		ctrl |= ALX_MAC_CTRL_PROMISC_EN;
+	if (netdev->flags & IFF_ALLMULTI)
+		ctrl |= ALX_MAC_CTRL_MULTIALL_EN;
+	hw->rx_ctrl = ctrl;
+
+	alx_write_mem32(hw, ALX_MAC_CTRL, ctrl);
+}
+
+/* Thin wrapper that applies the receive mode via __alx_set_rx_mode(). */
+static void alx_set_rx_mode(struct net_device *netdev)
+{
+	__alx_set_rx_mode(netdev);
+}
+
+/* Change the MAC address of @netdev to the one in the struct sockaddr
+ * pointed to by @data and program it into the hardware.
+ *
+ * Returns 0 on success, -EADDRNOTAVAIL if the address is not a valid
+ * unicast Ethernet address.
+ */
+static int alx_set_mac_address(struct net_device *netdev, void *data)
+{
+	struct alx_priv *alx = netdev_priv(netdev);
+	struct sockaddr *sa = data;
+	struct alx_hw *hw = &alx->hw;
+
+	if (!is_valid_ether_addr(sa->sa_data))
+		return -EADDRNOTAVAIL;
+
+	/* the address is being set explicitly, so drop the random flag */
+	netdev->addr_assign_type &= ~NET_ADDR_RANDOM;
+
+	memcpy(netdev->dev_addr, sa->sa_data, netdev->addr_len);
+	memcpy(hw->mac_addr, sa->sa_data, netdev->addr_len);
+	alx_set_macaddr(hw, hw->mac_addr);
+
+	return 0;
+}
+
+/* Allocate the software buffer arrays and one coherent DMA area holding
+ * the TPD, RRD and RFD rings back to back, and set up the per-ring
+ * pointers into that area.
+ *
+ * Returns 0 on success or -ENOMEM.  On failure everything allocated here
+ * is freed again and the buffer array pointers are left NULL, so that the
+ * "not allocated" checks in the ring teardown code remain valid.
+ */
+static int alx_alloc_descriptors(struct alx_priv *alx)
+{
+	alx->txq.bufs = kcalloc(alx->tx_ringsz,
+				sizeof(struct alx_buffer),
+				GFP_KERNEL);
+	if (!alx->txq.bufs)
+		return -ENOMEM;
+
+	alx->rxq.bufs = kcalloc(alx->rx_ringsz,
+				sizeof(struct alx_buffer),
+				GFP_KERNEL);
+	if (!alx->rxq.bufs)
+		goto out_free;
+
+	/* physical tx/rx ring descriptors
+	 *
+	 * Allocate them as a single chunk because they must not cross a
+	 * 4G boundary (hardware has a single register for high 32 bits
+	 * of addresses only)
+	 */
+	alx->descmem.size = sizeof(struct alx_txd) * alx->tx_ringsz +
+			    sizeof(struct alx_rrd) * alx->rx_ringsz +
+			    sizeof(struct alx_rfd) * alx->rx_ringsz;
+	alx->descmem.virt = dma_zalloc_coherent(&alx->hw.pdev->dev,
+						alx->descmem.size,
+						&alx->descmem.dma,
+						GFP_KERNEL);
+	if (!alx->descmem.virt)
+		goto out_free;
+
+	/* the TPD ring sits at the start of the area */
+	alx->txq.tpd = (void *)alx->descmem.virt;
+	alx->txq.tpd_dma = alx->descmem.dma;
+
+	/* alignment requirement for next block */
+	BUILD_BUG_ON(sizeof(struct alx_txd) % 8);
+
+	/* the RRD ring follows the TPD ring */
+	alx->rxq.rrd =
+		(void *)((u8 *)alx->descmem.virt +
+			 sizeof(struct alx_txd) * alx->tx_ringsz);
+	alx->rxq.rrd_dma = alx->descmem.dma +
+			   sizeof(struct alx_txd) * alx->tx_ringsz;
+
+	/* alignment requirement for next block */
+	BUILD_BUG_ON(sizeof(struct alx_rrd) % 8);
+
+	/* the RFD ring follows the RRD ring */
+	alx->rxq.rfd =
+		(void *)((u8 *)alx->descmem.virt +
+			 sizeof(struct alx_txd) * alx->tx_ringsz +
+			 sizeof(struct alx_rrd) * alx->rx_ringsz);
+	alx->rxq.rfd_dma = alx->descmem.dma +
+			   sizeof(struct alx_txd) * alx->tx_ringsz +
+			   sizeof(struct alx_rrd) * alx->rx_ringsz;
+
+	return 0;
+out_free:
+	/* do not leave pointers to freed memory behind */
+	kfree(alx->txq.bufs);
+	alx->txq.bufs = NULL;
+	kfree(alx->rxq.bufs);
+	alx->rxq.bufs = NULL;
+	return -ENOMEM;
+}
+
+/* Allocate the descriptor rings, select the queue 0 interrupts, register
+ * the NAPI context and initialise the rings.
+ *
+ * Returns 0 on success or the error from alx_alloc_descriptors().
+ */
+static int alx_alloc_rings(struct alx_priv *alx)
+{
+	int err;
+
+	err = alx_alloc_descriptors(alx);
+	if (err)
+		return err;
+
+	/* only queue 0 is used in each direction */
+	alx->int_mask &= ~ALX_ISR_ALL_QUEUES;
+	alx->int_mask |= ALX_ISR_TX_Q0 | ALX_ISR_RX_Q0;
+
+	netif_napi_add(alx->dev, &alx->napi, alx_poll, 64);
+
+	/* NOTE(review): the result is ignored, so a receive ring that got
+	 * no buffers at all does not fail the allocation - confirm that
+	 * this is intended.
+	 */
+	alx_reinit_rings(alx);
+	return 0;
+}
+
+/* Undo alx_alloc_rings(): remove the NAPI context, release all packet
+ * buffers and free the buffer arrays and the descriptor memory.
+ *
+ * The pointers are cleared after freeing so that the "not allocated"
+ * checks in the ring teardown code do not act on freed memory.
+ */
+static void alx_free_rings(struct alx_priv *alx)
+{
+	netif_napi_del(&alx->napi);
+	alx_free_buffers(alx);
+
+	kfree(alx->txq.bufs);
+	alx->txq.bufs = NULL;
+	kfree(alx->rxq.bufs);
+	alx->rxq.bufs = NULL;
+
+	dma_free_coherent(&alx->hw.pdev->dev,
+			  alx->descmem.size,
+			  alx->descmem.virt,
+			  alx->descmem.dma);
+	alx->descmem.virt = NULL;
+}
+
+/* Write zero to both MSI map tables and to the MSI ID map register. */
+static void alx_config_vector_mapping(struct alx_priv *alx)
+{
+	alx_write_mem32(&alx->hw, ALX_MSI_MAP_TBL1, 0);
+	alx_write_mem32(&alx->hw, ALX_MSI_MAP_TBL2, 0);
+	alx_write_mem32(&alx->hw, ALX_MSI_ID_MAP, 0);
+}
+
+/* Enable interrupts: write 0 to ALX_ISR, program ALX_IMR with the cached
+ * interrupt mask and post the writes.
+ */
+static void alx_irq_enable(struct alx_priv *alx)
+{
+	/* level-1 interrupt switch */
+	alx_write_mem32(&alx->hw, ALX_ISR, 0);
+	alx_write_mem32(&alx->hw, ALX_IMR, alx->int_mask);
+	alx_post_write(&alx->hw);
+}
+
+/* Disable interrupts: write ALX_ISR_DIS to ALX_ISR, clear ALX_IMR, post
+ * the writes and wait for a running handler to finish.
+ */
+static void alx_irq_disable(struct alx_priv *alx)
+{
+	struct pci_dev *pdev = alx->hw.pdev;
+
+	alx_write_mem32(&alx->hw, ALX_ISR, ALX_ISR_DIS);
+	alx_write_mem32(&alx->hw, ALX_IMR, 0);
+	alx_post_write(&alx->hw);
+
+	synchronize_irq(pdev->irq);
+}
+
+/* Request the device interrupt, preferring MSI and falling back to a
+ * shared legacy interrupt when MSI cannot be enabled or its handler
+ * cannot be installed.
+ *
+ * alx->msi records which mode is actually in use; alx_free_irq() and
+ * alx_poll_controller() rely on it.
+ *
+ * Returns 0 on success or the request_irq() error.
+ */
+static int alx_request_irq(struct alx_priv *alx)
+{
+	struct pci_dev *pdev = alx->hw.pdev;
+	struct alx_hw *hw = &alx->hw;
+	int err;
+	u32 msi_ctrl;
+
+	msi_ctrl = (hw->imt >> 1) << ALX_MSI_RETRANS_TM_SHIFT;
+
+	if (!pci_enable_msi(alx->hw.pdev)) {
+		alx->msi = true;
+
+		alx_write_mem32(hw, ALX_MSI_RETRANS_TIMER,
+				msi_ctrl | ALX_MSI_MASK_SEL_LINE);
+		err = request_irq(pdev->irq, alx_intr_msi, 0,
+				  alx->dev->name, alx);
+		if (!err)
+			goto out;
+		/* fall back to legacy interrupt; MSI is no longer in use, so
+		 * the flag must not stay set or later code would treat the
+		 * legacy interrupt as MSI
+		 */
+		pci_disable_msi(alx->hw.pdev);
+		alx->msi = false;
+	}
+
+	alx_write_mem32(hw, ALX_MSI_RETRANS_TIMER, 0);
+	err = request_irq(pdev->irq, alx_intr_legacy, IRQF_SHARED,
+			  alx->dev->name, alx);
+out:
+	if (!err)
+		alx_config_vector_mapping(alx);
+	return err;
+}
+
+/* Release the interrupt and, if MSI was in use, disable it again. */
+static void alx_free_irq(struct alx_priv *alx)
+{
+	struct pci_dev *pdev = alx->hw.pdev;
+
+	free_irq(pdev->irq, alx);
+
+	if (!alx->msi)
+		return;
+
+	pci_disable_msi(pdev);
+	alx->msi = false;
+}
+
+/* Check the chip revision and derive the maximum number of DMA channels.
+ *
+ * Returns -EINVAL for revisions newer than ALX_REV_C0, 0 otherwise.
+ */
+static int alx_identify_hw(struct alx_priv *alx)
+{
+	int rev = alx_hw_revision(&alx->hw);
+
+	if (rev > ALX_REV_C0)
+		return -EINVAL;
+
+	/* revision B0 and later get four channels, older ones two */
+	if (rev >= ALX_REV_B0)
+		alx->hw.max_dma_chnl = 4;
+	else
+		alx->hw.max_dma_chnl = 2;
+
+	return 0;
+}
+
+/* Identify the chip and fill in the software defaults (ring sizes, buffer
+ * size, timers, advertised link modes, flow control and MAC control bits).
+ *
+ * Returns 0 on success or the error from alx_identify_hw().
+ */
+static int alx_init_sw(struct alx_priv *alx)
+{
+	struct alx_hw *hw = &alx->hw;
+	struct pci_dev *pdev = hw->pdev;
+	int err = alx_identify_hw(alx);
+
+	if (err) {
+		dev_err(&pdev->dev, "unrecognized chip, aborting\n");
+		return err;
+	}
+
+	/* flag is set only for this device/subsystem/revision combination */
+	hw->lnk_patch = pdev->device == ALX_DEV_ID_AR8161 &&
+			pdev->subsystem_vendor == PCI_VENDOR_ID_ATTANSIC &&
+			pdev->subsystem_device == 0x0091 &&
+			pdev->revision == 0;
+
+	/* ring geometry, receive buffer size and base interrupt mask */
+	alx->tx_ringsz = 256;
+	alx->rx_ringsz = 512;
+	hw->mtu = alx->dev->mtu;
+	alx->rxbuf_size = ALIGN(ALX_RAW_MTU(hw->mtu), 8);
+	alx->int_mask = ALX_ISR_MISC;
+
+	/* timers and DMA defaults */
+	hw->smb_timer = 400;
+	hw->imt = 200;
+	hw->ith_tpd = alx->tx_ringsz / 3;
+	hw->dma_chnl = hw->max_dma_chnl;
+	hw->sleep_ctrl = ALX_SLEEP_WOL_MAGIC | ALX_SLEEP_WOL_PHY;
+
+	/* link defaults: autonegotiate everything except 1000 half */
+	hw->link_speed = SPEED_UNKNOWN;
+	hw->flowctrl = ALX_FC_ANEG | ALX_FC_RX | ALX_FC_TX;
+	hw->adv_cfg = ADVERTISED_Autoneg |
+		      ADVERTISED_10baseT_Half |
+		      ADVERTISED_10baseT_Full |
+		      ADVERTISED_100baseT_Full |
+		      ADVERTISED_100baseT_Half |
+		      ADVERTISED_1000baseT_Full;
+
+	hw->rx_ctrl = ALX_MAC_CTRL_WOLSPED_SWEN |
+		      ALX_MAC_CTRL_MHASH_ALG_HI5B |
+		      ALX_MAC_CTRL_BRD_EN |
+		      ALX_MAC_CTRL_PCRCE |
+		      ALX_MAC_CTRL_CRCE |
+		      ALX_MAC_CTRL_RXFC_EN |
+		      ALX_MAC_CTRL_TXFC_EN |
+		      7 << ALX_MAC_CTRL_PRMBLEN_SHIFT;
+
+	return 0;
+}
+
+
+/* ndo_fix_features: mask out TSO/TSO6 when the current MTU exceeds
+ * ALX_MAX_TSO_PKT_SIZE; otherwise return the features unchanged.
+ */
+static netdev_features_t alx_fix_features(struct net_device *netdev,
+					  netdev_features_t features)
+{
+	if (netdev->mtu <= ALX_MAX_TSO_PKT_SIZE)
+		return features;
+
+	return features & ~(NETIF_F_TSO | NETIF_F_TSO6);
+}
+
+/* Refresh trans_start and, if the carrier is currently up, take it down
+ * and disable TX and NAPI.
+ */
+static void alx_netif_stop(struct alx_priv *alx)
+{
+	struct net_device *netdev = alx->dev;
+
+	netdev->trans_start = jiffies;
+	if (!netif_carrier_ok(netdev))
+		return;
+
+	netif_carrier_off(netdev);
+	netif_tx_disable(netdev);
+	napi_disable(&alx->napi);
+}
+
+/* Bring the data path down: stop the network interface side, forget the
+ * link speed, reset the MAC, disable ASPM, disable interrupts and free
+ * the buffers.
+ */
+static void alx_halt(struct alx_priv *alx)
+{
+	struct alx_hw *hw = &alx->hw;
+
+	alx_netif_stop(alx);
+	hw->link_speed = SPEED_UNKNOWN;
+
+	/* NOTE(review): the alx_reset_mac() result is ignored here while
+	 * alx_check_link() and alx_probe() check it - confirm intended.
+	 */
+	alx_reset_mac(hw);
+
+	/* disable l0s/l1 */
+	alx_enable_aspm(hw, false, false);
+	alx_irq_disable(alx);
+	alx_free_buffers(alx);
+}
+
+/* Program the hardware: basic configuration, RSS disabled, RX filter mode,
+ * and finally the cached MAC control value (hw->rx_ctrl).
+ */
+static void alx_configure(struct alx_priv *alx)
+{
+	struct alx_hw *hw = &alx->hw;
+
+	alx_configure_basic(hw);
+	alx_disable_rss(hw);
+	__alx_set_rx_mode(alx->dev);
+
+	alx_write_mem32(hw, ALX_MAC_CTRL, hw->rx_ctrl);
+}
+
+/* Counterpart of alx_halt(): rebuild the rings, reprogram the hardware,
+ * acknowledge stale interrupts, re-enable interrupts and schedule a link
+ * check.
+ */
+static void alx_activate(struct alx_priv *alx)
+{
+	/* hardware setting lost, restore it */
+	alx_reinit_rings(alx);
+	alx_configure(alx);
+
+	/* clear old interrupts */
+	alx_write_mem32(&alx->hw, ALX_ISR, ~(u32)ALX_ISR_DIS);
+
+	alx_irq_enable(alx);
+
+	alx_schedule_link_check(alx);
+}
+
+/* Full restart of the data path: alx_halt() followed by alx_activate().
+ * Must be called with the RTNL lock held (asserted below).
+ */
+static void alx_reinit(struct alx_priv *alx)
+{
+	ASSERT_RTNL();
+
+	alx_halt(alx);
+	alx_activate(alx);
+}
+
+/* ndo_change_mtu: validate the resulting frame size, store the new MTU,
+ * recompute the receive buffer size and restart the device if it is up.
+ *
+ * Returns -EINVAL when the frame size is outside
+ * [ALX_MIN_FRAME_SIZE, ALX_MAX_FRAME_SIZE], 0 otherwise.
+ */
+static int alx_change_mtu(struct net_device *netdev, int mtu)
+{
+	struct alx_priv *alx = netdev_priv(netdev);
+	int max_frame = mtu + ETH_HLEN + ETH_FCS_LEN + VLAN_HLEN;
+
+	if (max_frame < ALX_MIN_FRAME_SIZE || max_frame > ALX_MAX_FRAME_SIZE)
+		return -EINVAL;
+
+	/* nothing to do when the MTU does not change */
+	if (netdev->mtu == mtu)
+		return 0;
+
+	netdev->mtu = mtu;
+	alx->hw.mtu = mtu;
+
+	if (mtu > ALX_DEF_RXBUF_SIZE)
+		alx->rxbuf_size = ALIGN(max_frame, 8);
+	else
+		alx->rxbuf_size = ALX_DEF_RXBUF_SIZE;
+
+	netdev_update_features(netdev);
+	if (netif_running(netdev))
+		alx_reinit(alx);
+
+	return 0;
+}
+
+/* Wake all TX queues, enable NAPI and signal carrier on. */
+static void alx_netif_start(struct alx_priv *alx)
+{
+	struct net_device *netdev = alx->dev;
+
+	netif_tx_wake_all_queues(netdev);
+	napi_enable(&alx->napi);
+	netif_carrier_on(netdev);
+}
+
+/* Common open path for ndo_open and resume.
+ *
+ * @resume: true when called from alx_resume(); carrier and TX queue state
+ *	    are then left untouched.
+ *
+ * Returns 0 on success, or the error from ring allocation or IRQ setup.
+ */
+static int __alx_open(struct alx_priv *alx, bool resume)
+{
+	int err;
+
+	if (!resume)
+		netif_carrier_off(alx->dev);
+
+	err = alx_alloc_rings(alx);
+	if (err)
+		return err;
+
+	alx_configure(alx);
+
+	err = alx_request_irq(alx);
+	if (err) {
+		/* no interrupt available: give the rings back */
+		alx_free_rings(alx);
+		return err;
+	}
+
+	/* clear old interrupts */
+	alx_write_mem32(&alx->hw, ALX_ISR, ~(u32)ALX_ISR_DIS);
+
+	alx_irq_enable(alx);
+
+	if (!resume)
+		netif_tx_start_all_queues(alx->dev);
+
+	alx_schedule_link_check(alx);
+	return 0;
+}
+
+/* Common stop path: halt the device, then release the interrupt and the
+ * rings acquired by __alx_open().
+ */
+static void __alx_stop(struct alx_priv *alx)
+{
+	alx_halt(alx);
+	alx_free_irq(alx);
+	alx_free_rings(alx);
+}
+
+/* Map a combined speed value (SPEED_* + DUPLEX_*) to a printable string;
+ * anything unrecognised yields "Unknown speed".
+ */
+static const char *alx_speed_desc(u16 speed)
+{
+	if (speed == SPEED_1000 + DUPLEX_FULL)
+		return "1 Gbps Full";
+	if (speed == SPEED_100 + DUPLEX_FULL)
+		return "100 Mbps Full";
+	if (speed == SPEED_100 + DUPLEX_HALF)
+		return "100 Mbps Half";
+	if (speed == SPEED_10 + DUPLEX_FULL)
+		return "10 Mbps Full";
+	if (speed == SPEED_10 + DUPLEX_HALF)
+		return "10 Mbps Half";
+
+	return "Unknown speed";
+}
+
+/* Re-evaluate the PHY link state and bring the MAC/netif side in line with
+ * it. Any failure along the way schedules a full device reset.
+ */
+static void alx_check_link(struct alx_priv *alx)
+{
+	struct alx_hw *hw = &alx->hw;
+	unsigned long flags;
+	int speed, old_speed;
+	int err;
+
+	/* clear PHY internal interrupt status, otherwise the main
+	 * interrupt status will be asserted forever
+	 */
+	alx_clear_phy_intr(hw);
+
+	err = alx_get_phy_link(hw, &speed);
+	if (err < 0)
+		goto reset;
+
+	/* add the PHY source to the interrupt mask, under the irq lock */
+	spin_lock_irqsave(&alx->irq_lock, flags);
+	alx->int_mask |= ALX_ISR_PHY;
+	alx_write_mem32(hw, ALX_IMR, alx->int_mask);
+	spin_unlock_irqrestore(&alx->irq_lock, flags);
+
+	old_speed = hw->link_speed;
+
+	/* nothing changed since the last check */
+	if (old_speed == speed)
+		return;
+	hw->link_speed = speed;
+
+	if (speed != SPEED_UNKNOWN) {
+		netif_info(alx, link, alx->dev,
+			   "NIC Up: %s\n", alx_speed_desc(speed));
+		alx_post_phy_link(hw);
+		alx_enable_aspm(hw, true, true);
+		alx_start_mac(hw);
+
+		/* only start the netif side on a down -> up transition */
+		if (old_speed == SPEED_UNKNOWN)
+			alx_netif_start(alx);
+	} else {
+		/* link is now down */
+		alx_netif_stop(alx);
+		netif_info(alx, link, alx->dev, "Link Down\n");
+		err = alx_reset_mac(hw);
+		if (err)
+			goto reset;
+		alx_irq_disable(alx);
+
+		/* MAC reset causes all HW settings to be lost, restore all */
+		err = alx_reinit_rings(alx);
+		if (err)
+			goto reset;
+		alx_configure(alx);
+		alx_enable_aspm(hw, false, true);
+		alx_post_phy_link(hw);
+		alx_irq_enable(alx);
+	}
+
+	return;
+
+reset:
+	alx_schedule_reset(alx);
+}
+
+/* ndo_open: regular (non-resume) open. */
+static int alx_open(struct net_device *netdev)
+{
+	struct alx_priv *alx = netdev_priv(netdev);
+
+	return __alx_open(alx, false);
+}
+
+/* ndo_stop: tear the interface down; always reports success. */
+static int alx_stop(struct net_device *netdev)
+{
+	struct alx_priv *alx = netdev_priv(netdev);
+
+	__alx_stop(alx);
+	return 0;
+}
+
+/* Common part of shutdown and suspend: detach and stop the interface,
+ * prepare the PHY/MAC for power saving and wake-on-LAN, and disable the
+ * PCI device.
+ *
+ * @wol_en: set to true when hw->sleep_ctrl has ALX_SLEEP_ACTIVE after WoL
+ *	    configuration, false otherwise. It is only written on the
+ *	    success path, so callers must not read it when an error is
+ *	    returned.
+ *
+ * Returns 0 on success or the first error encountered.
+ */
+static int __alx_shutdown(struct pci_dev *pdev, bool *wol_en)
+{
+	struct alx_priv *alx = pci_get_drvdata(pdev);
+	struct net_device *netdev = alx->dev;
+	struct alx_hw *hw = &alx->hw;
+	int err, speed;
+
+	netif_device_detach(netdev);
+
+	if (netif_running(netdev))
+		__alx_stop(alx);
+
+#ifdef CONFIG_PM_SLEEP
+	err = pci_save_state(pdev);
+	if (err)
+		return err;
+#endif
+
+	err = alx_select_powersaving_speed(hw, &speed);
+	if (err)
+		return err;
+	err = alx_clear_phy_intr(hw);
+	if (err)
+		return err;
+	err = alx_pre_suspend(hw, speed);
+	if (err)
+		return err;
+	err = alx_config_wol(hw);
+	if (err)
+		return err;
+
+	*wol_en = false;
+	if (hw->sleep_ctrl & ALX_SLEEP_ACTIVE) {
+		netif_info(alx, wol, netdev,
+			   "wol: ctrl=%X, speed=%X\n",
+			   hw->sleep_ctrl, speed);
+		device_set_wakeup_enable(&pdev->dev, true);
+		*wol_en = true;
+	}
+
+	/* NOTE(review): alx_resume() does not call pci_enable_device() to
+	 * balance this - confirm the PCI core re-enables the device.
+	 */
+	pci_disable_device(pdev);
+
+	return 0;
+}
+
+/* PCI .shutdown callback: run the common shutdown path and, when it
+ * succeeds, arm wake-up according to wol_en and enter D3hot.
+ */
+static void alx_shutdown(struct pci_dev *pdev)
+{
+	bool wol_en;
+	int err = __alx_shutdown(pdev, &wol_en);
+
+	if (err) {
+		dev_err(&pdev->dev, "shutdown fail %d\n", err);
+		return;
+	}
+
+	pci_wake_from_d3(pdev, wol_en);
+	pci_set_power_state(pdev, PCI_D3hot);
+}
+
+/* Work item handler: run alx_check_link() under the RTNL lock. */
+static void alx_link_check(struct work_struct *work)
+{
+	struct alx_priv *alx = container_of(work, struct alx_priv,
+					    link_check_wk);
+
+	rtnl_lock();
+	alx_check_link(alx);
+	rtnl_unlock();
+}
+
+/* Work item handler: restart the device via alx_reinit() under RTNL. */
+static void alx_reset(struct work_struct *work)
+{
+	struct alx_priv *alx;
+
+	alx = container_of(work, struct alx_priv, reset_wk);
+
+	rtnl_lock();
+	alx_reinit(alx);
+	rtnl_unlock();
+}
+
+/* Fill the checksum-offload fields of the first TX descriptor.
+ *
+ * Does nothing for skbs that are not CHECKSUM_PARTIAL. Offsets are stored
+ * halved, so an odd checksum start offset is rejected.
+ *
+ * Returns 0 on success (or nothing to do), -EINVAL for an odd offset.
+ */
+static int alx_tx_csum(struct sk_buff *skb, struct alx_txd *first)
+{
+	u8 cso, css;
+
+	if (skb->ip_summed != CHECKSUM_PARTIAL)
+		return 0;
+
+	cso = skb_checksum_start_offset(skb);
+	if (cso & 1)
+		return -EINVAL;
+
+	css = cso + skb->csum_offset;
+
+	/* start offset, checksum field offset and the enable bit in one go */
+	first->word1 |= cpu_to_le32(((cso >> 1) << TPD_CXSUMSTART_SHIFT) |
+				    ((css >> 1) << TPD_CXSUMOFFSET_SHIFT) |
+				    (1 << TPD_CXSUM_EN_SHIFT));
+
+	return 0;
+}
+
+/* DMA-map an skb (linear part plus all page fragments) into consecutive
+ * TX descriptors starting at txq->write_idx.
+ *
+ * On success the last descriptor is flagged EOP, the skb is stored with
+ * it and txq->write_idx points past the packet. On a mapping failure
+ * every mapping made so far is released and txq->write_idx is put back
+ * to where it was, so the ring looks exactly as it did before the call.
+ *
+ * Returns 0 on success, -ENOMEM when a DMA mapping fails.
+ */
+static int alx_map_tx_skb(struct alx_priv *alx, struct sk_buff *skb)
+{
+	struct alx_tx_queue *txq = &alx->txq;
+	struct alx_txd *tpd, *first_tpd;
+	dma_addr_t dma;
+	int maplen, f, first_idx = txq->write_idx;
+
+	first_tpd = &txq->tpd[txq->write_idx];
+	tpd = first_tpd;
+
+	/* linear part of the skb goes into the first descriptor */
+	maplen = skb_headlen(skb);
+	dma = dma_map_single(&alx->hw.pdev->dev, skb->data, maplen,
+			     DMA_TO_DEVICE);
+	if (dma_mapping_error(&alx->hw.pdev->dev, dma))
+		goto err_dma;
+
+	dma_unmap_len_set(&txq->bufs[txq->write_idx], size, maplen);
+	dma_unmap_addr_set(&txq->bufs[txq->write_idx], dma, dma);
+
+	tpd->adrl.addr = cpu_to_le64(dma);
+	tpd->len = cpu_to_le16(maplen);
+
+	/* one further descriptor per page fragment */
+	for (f = 0; f < skb_shinfo(skb)->nr_frags; f++) {
+		struct skb_frag_struct *frag;
+
+		frag = &skb_shinfo(skb)->frags[f];
+
+		if (++txq->write_idx == alx->tx_ringsz)
+			txq->write_idx = 0;
+		tpd = &txq->tpd[txq->write_idx];
+
+		/* fragments inherit the flags of the first descriptor */
+		tpd->word1 = first_tpd->word1;
+
+		maplen = skb_frag_size(frag);
+		dma = skb_frag_dma_map(&alx->hw.pdev->dev, frag, 0,
+				       maplen, DMA_TO_DEVICE);
+		if (dma_mapping_error(&alx->hw.pdev->dev, dma))
+			goto err_dma;
+		dma_unmap_len_set(&txq->bufs[txq->write_idx], size, maplen);
+		dma_unmap_addr_set(&txq->bufs[txq->write_idx], dma, dma);
+
+		tpd->adrl.addr = cpu_to_le64(dma);
+		tpd->len = cpu_to_le16(maplen);
+	}
+
+	/* last TPD, set EOP flag and store skb */
+	tpd->word1 |= cpu_to_le32(1 << TPD_EOP_SHIFT);
+	txq->bufs[txq->write_idx].skb = skb;
+
+	if (++txq->write_idx == alx->tx_ringsz)
+		txq->write_idx = 0;
+
+	return 0;
+
+err_dma:
+	/* release everything mapped for this skb ... */
+	f = first_idx;
+	while (f != txq->write_idx) {
+		alx_free_txbuf(alx, f);
+		if (++f == alx->tx_ringsz)
+			f = 0;
+	}
+	/* ... and rewind the producer index, otherwise the next transmit
+	 * would hand the now-unmapped descriptors to the hardware
+	 */
+	txq->write_idx = first_idx;
+	return -ENOMEM;
+}
+
+/* ndo_start_xmit: queue one skb on the TX ring.
+ *
+ * Always returns NETDEV_TX_OK; an skb that cannot be queued (ring full,
+ * odd checksum offset, DMA mapping failure) is freed and dropped.
+ */
+static netdev_tx_t alx_start_xmit(struct sk_buff *skb,
+				  struct net_device *netdev)
+{
+	struct alx_priv *alx = netdev_priv(netdev);
+	struct alx_tx_queue *txq = &alx->txq;
+	struct alx_txd *first;
+	/* one descriptor for the linear part plus one per fragment */
+	int tpdreq = skb_shinfo(skb)->nr_frags + 1;
+
+	if (alx_tpd_avail(alx) < tpdreq) {
+		netif_stop_queue(alx->dev);
+		goto drop;
+	}
+
+	first = &txq->tpd[txq->write_idx];
+	memset(first, 0, sizeof(*first));
+
+	if (alx_tx_csum(skb, first))
+		goto drop;
+
+	if (alx_map_tx_skb(alx, skb) < 0)
+		goto drop;
+
+	netdev_sent_queue(alx->dev, skb->len);
+
+	/* flush updates before updating hardware */
+	wmb();
+	alx_write_mem16(&alx->hw, ALX_TPD_PRI0_PIDX, txq->write_idx);
+
+	/* stop early once fewer than an eighth of the descriptors remain */
+	if (alx_tpd_avail(alx) < alx->tx_ringsz/8)
+		netif_stop_queue(alx->dev);
+
+	return NETDEV_TX_OK;
+
+drop:
+	dev_kfree_skb(skb);
+	return NETDEV_TX_OK;
+}
+
+/* ndo_tx_timeout: recover by scheduling a device reset. */
+static void alx_tx_timeout(struct net_device *dev)
+{
+	alx_schedule_reset(netdev_priv(dev));
+}
+
+/* mdio_read callback: read a PHY register, using the plain register
+ * accessor for MDIO_DEVAD_NONE and the extended accessor otherwise.
+ *
+ * Returns the register value, -EINVAL for a foreign port address, or the
+ * accessor's error code.
+ */
+static int alx_mdio_read(struct net_device *netdev,
+			 int prtad, int devad, u16 addr)
+{
+	struct alx_priv *alx = netdev_priv(netdev);
+	struct alx_hw *hw = &alx->hw;
+	u16 val;
+	int err;
+
+	if (prtad != hw->mdio.prtad)
+		return -EINVAL;
+
+	if (devad != MDIO_DEVAD_NONE)
+		err = alx_read_phy_ext(hw, devad, addr, &val);
+	else
+		err = alx_read_phy_reg(hw, addr, &val);
+
+	return err ? err : val;
+}
+
+/* mdio_write callback: write a PHY register, using the plain register
+ * accessor for MDIO_DEVAD_NONE and the extended accessor otherwise.
+ *
+ * Returns -EINVAL for a foreign port address, else the accessor's result.
+ */
+static int alx_mdio_write(struct net_device *netdev,
+			  int prtad, int devad, u16 addr, u16 val)
+{
+	struct alx_priv *alx = netdev_priv(netdev);
+	struct alx_hw *hw = &alx->hw;
+	int err;
+
+	if (prtad != hw->mdio.prtad)
+		return -EINVAL;
+
+	if (devad != MDIO_DEVAD_NONE)
+		err = alx_write_phy_ext(hw, devad, addr, val);
+	else
+		err = alx_write_phy_reg(hw, addr, val);
+
+	return err;
+}
+
+/* ndo_do_ioctl: forward MII ioctls to the MDIO layer; refused with
+ * -EAGAIN while the interface is not running.
+ */
+static int alx_ioctl(struct net_device *netdev, struct ifreq *ifr, int cmd)
+{
+	struct alx_priv *alx;
+
+	if (!netif_running(netdev))
+		return -EAGAIN;
+
+	alx = netdev_priv(netdev);
+	return mdio_mii_ioctl(&alx->hw.mdio, if_mii(ifr), cmd);
+}
+
+#ifdef CONFIG_NET_POLL_CONTROLLER
+/* ndo_poll_controller: invoke the interrupt handler that matches the
+ * interrupt mode currently in use.
+ */
+static void alx_poll_controller(struct net_device *netdev)
+{
+	struct alx_priv *alx = netdev_priv(netdev);
+
+	if (!alx->msi)
+		alx_intr_legacy(0, alx);
+	else
+		alx_intr_msi(0, alx);
+}
+#endif
+
+/* net_device callbacks; installed on the netdev in alx_probe(). */
+static const struct net_device_ops alx_netdev_ops = {
+	.ndo_open               = alx_open,
+	.ndo_stop               = alx_stop,
+	.ndo_start_xmit         = alx_start_xmit,
+	.ndo_set_rx_mode        = alx_set_rx_mode,
+	.ndo_validate_addr      = eth_validate_addr,
+	.ndo_set_mac_address    = alx_set_mac_address,
+	.ndo_change_mtu         = alx_change_mtu,
+	.ndo_do_ioctl           = alx_ioctl,
+	.ndo_tx_timeout         = alx_tx_timeout,
+	.ndo_fix_features	= alx_fix_features,
+#ifdef CONFIG_NET_POLL_CONTROLLER
+	.ndo_poll_controller    = alx_poll_controller,
+#endif
+};
+
+/* PCI probe: enable the device, configure DMA masks, map the registers,
+ * reset and identify the hardware, then register the network device.
+ *
+ * Returns 0 on success or a negative errno; on failure every resource
+ * acquired up to that point is released again.
+ */
+static int alx_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
+{
+	struct net_device *netdev;
+	struct alx_priv *alx;
+	struct alx_hw *hw;
+	bool phy_configured;
+	int bars, pm_cap, err;
+
+	err = pci_enable_device_mem(pdev);
+	if (err)
+		return err;
+
+	/* The alx chip can DMA to 64-bit addresses, but it uses a single
+	 * shared register for the high 32 bits, so only a single, aligned,
+	 * 4 GB physical address range can be used for descriptors.
+	 */
+	if (!dma_set_mask(&pdev->dev, DMA_BIT_MASK(64)) &&
+	    !dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(64))) {
+		dev_dbg(&pdev->dev, "DMA to 64-BIT addresses\n");
+	} else {
+		/* fall back to 32 bit; both the streaming and the coherent
+		 * mask have to be accepted for the device to be usable
+		 */
+		err = dma_set_mask(&pdev->dev, DMA_BIT_MASK(32));
+		if (!err)
+			err = dma_set_coherent_mask(&pdev->dev,
+						    DMA_BIT_MASK(32));
+		if (err) {
+			dev_err(&pdev->dev,
+				"No usable DMA config, aborting\n");
+			goto out_pci_disable;
+		}
+	}
+
+	bars = pci_select_bars(pdev, IORESOURCE_MEM);
+	err = pci_request_selected_regions(pdev, bars, alx_drv_name);
+	if (err) {
+		dev_err(&pdev->dev,
+			"pci_request_selected_regions failed(bars:%d)\n", bars);
+		goto out_pci_disable;
+	}
+
+	pci_enable_pcie_error_reporting(pdev);
+	pci_set_master(pdev);
+
+	pm_cap = pci_find_capability(pdev, PCI_CAP_ID_PM);
+	if (pm_cap == 0) {
+		dev_err(&pdev->dev,
+			"Can't find power management capability, aborting\n");
+		err = -EIO;
+		goto out_pci_release;
+	}
+
+	err = pci_set_power_state(pdev, PCI_D0);
+	if (err)
+		goto out_pci_release;
+
+	netdev = alloc_etherdev(sizeof(*alx));
+	if (!netdev) {
+		err = -ENOMEM;
+		goto out_pci_release;
+	}
+
+	SET_NETDEV_DEV(netdev, &pdev->dev);
+	alx = netdev_priv(netdev);
+	alx->dev = netdev;
+	alx->hw.pdev = pdev;
+	alx->msg_enable = NETIF_MSG_LINK | NETIF_MSG_HW | NETIF_MSG_IFUP |
+			  NETIF_MSG_TX_ERR | NETIF_MSG_RX_ERR | NETIF_MSG_WOL;
+	hw = &alx->hw;
+	pci_set_drvdata(pdev, alx);
+
+	hw->hw_addr = pci_ioremap_bar(pdev, 0);
+	if (!hw->hw_addr) {
+		dev_err(&pdev->dev, "cannot map device registers\n");
+		err = -EIO;
+		goto out_free_netdev;
+	}
+
+	netdev->netdev_ops = &alx_netdev_ops;
+	SET_ETHTOOL_OPS(netdev, &alx_ethtool_ops);
+	netdev->irq = pdev->irq;
+	netdev->watchdog_timeo = ALX_WATCHDOG_TIME;
+
+	/* per-device quirk taken from the PCI ID table entry */
+	if (ent->driver_data & ALX_DEV_QUIRK_MSI_INTX_DISABLE_BUG)
+		pdev->dev_flags |= PCI_DEV_FLAGS_MSI_INTX_DISABLE_BUG;
+
+	err = alx_init_sw(alx);
+	if (err) {
+		dev_err(&pdev->dev, "net device private data init failed\n");
+		goto out_unmap;
+	}
+
+	alx_reset_pcie(hw);
+
+	phy_configured = alx_phy_configured(hw);
+
+	if (!phy_configured)
+		alx_reset_phy(hw);
+
+	err = alx_reset_mac(hw);
+	if (err) {
+		dev_err(&pdev->dev, "MAC Reset failed, error = %d\n", err);
+		goto out_unmap;
+	}
+
+	/* setup link to put it in a known good starting state */
+	if (!phy_configured) {
+		err = alx_setup_speed_duplex(hw, hw->adv_cfg, hw->flowctrl);
+		if (err) {
+			dev_err(&pdev->dev,
+				"failed to configure PHY speed/duplex (err=%d)\n",
+				err);
+			goto out_unmap;
+		}
+	}
+
+	netdev->hw_features = NETIF_F_SG | NETIF_F_HW_CSUM;
+
+	if (alx_get_perm_macaddr(hw, hw->perm_addr)) {
+		dev_warn(&pdev->dev,
+			 "Invalid permanent address programmed, using random one\n");
+		eth_hw_addr_random(netdev);
+		memcpy(hw->perm_addr, netdev->dev_addr, netdev->addr_len);
+	}
+
+	memcpy(hw->mac_addr, hw->perm_addr, ETH_ALEN);
+	memcpy(netdev->dev_addr, hw->mac_addr, ETH_ALEN);
+	memcpy(netdev->perm_addr, hw->perm_addr, ETH_ALEN);
+
+	hw->mdio.prtad = 0;
+	hw->mdio.mmds = 0;
+	hw->mdio.dev = netdev;
+	hw->mdio.mode_support = MDIO_SUPPORTS_C45 |
+				MDIO_SUPPORTS_C22 |
+				MDIO_EMULATE_C22;
+	hw->mdio.mdio_read = alx_mdio_read;
+	hw->mdio.mdio_write = alx_mdio_write;
+
+	if (!alx_get_phy_info(hw)) {
+		dev_err(&pdev->dev, "failed to identify PHY\n");
+		err = -EIO;
+		goto out_unmap;
+	}
+
+	INIT_WORK(&alx->link_check_wk, alx_link_check);
+	INIT_WORK(&alx->reset_wk, alx_reset);
+	spin_lock_init(&alx->hw.mdio_lock);
+	spin_lock_init(&alx->irq_lock);
+
+	netif_carrier_off(netdev);
+
+	err = register_netdev(netdev);
+	if (err) {
+		dev_err(&pdev->dev, "register netdevice failed\n");
+		goto out_unmap;
+	}
+
+	device_set_wakeup_enable(&pdev->dev, hw->sleep_ctrl);
+
+	netdev_info(netdev,
+		    "Qualcomm Atheros AR816x/AR817x Ethernet [%pM]\n",
+		    netdev->dev_addr);
+
+	return 0;
+
+out_unmap:
+	iounmap(hw->hw_addr);
+out_free_netdev:
+	free_netdev(netdev);
+out_pci_release:
+	/* every jump to this label happens after error reporting was
+	 * enabled, so undo it here just as alx_remove() does
+	 */
+	pci_disable_pcie_error_reporting(pdev);
+	pci_release_selected_regions(pdev, bars);
+out_pci_disable:
+	pci_disable_device(pdev);
+	return err;
+}
+
+/* PCI remove: cancel pending work, restore the permanent MAC address in
+ * hardware, unregister the netdev and release the PCI resources taken in
+ * alx_probe().
+ */
+static void alx_remove(struct pci_dev *pdev)
+{
+	struct alx_priv *alx = pci_get_drvdata(pdev);
+	struct alx_hw *hw = &alx->hw;
+
+	/* NOTE(review): the netdev is still registered at this point, so
+	 * work could presumably be scheduled again before
+	 * unregister_netdev() - confirm.
+	 */
+	cancel_work_sync(&alx->link_check_wk);
+	cancel_work_sync(&alx->reset_wk);
+
+	/* restore permanent mac address */
+	alx_set_macaddr(hw, hw->perm_addr);
+
+	unregister_netdev(alx->dev);
+	iounmap(hw->hw_addr);
+	pci_release_selected_regions(pdev,
+				     pci_select_bars(pdev, IORESOURCE_MEM));
+
+	pci_disable_pcie_error_reporting(pdev);
+	pci_disable_device(pdev);
+	pci_set_drvdata(pdev, NULL);
+
+	/* alx lives inside the netdev's private area, so free it last */
+	free_netdev(alx->dev);
+}
+
+#ifdef CONFIG_PM_SLEEP
+/* PM suspend callback: run the common shutdown path, then either let the
+ * PCI core pick the sleep state (wake-on-LAN armed) or go to D3hot with
+ * wake-up disabled.
+ *
+ * Returns 0 on success or the error from __alx_shutdown().
+ */
+static int alx_suspend(struct device *dev)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+	bool wol_en;
+	int err = __alx_shutdown(pdev, &wol_en);
+
+	if (err) {
+		dev_err(&pdev->dev, "shutdown fail in suspend %d\n", err);
+		return err;
+	}
+
+	if (!wol_en) {
+		pci_wake_from_d3(pdev, false);
+		pci_set_power_state(pdev, PCI_D3hot);
+		return 0;
+	}
+
+	pci_prepare_to_sleep(pdev);
+	return 0;
+}
+
+/* PM resume callback: return to D0, restore PCI state, disable wake-up,
+ * reset PCIe/PHY/MAC, renegotiate the link and reopen the interface if it
+ * was running.
+ *
+ * Returns 0 on success, -EIO when the MAC reset or link setup fails, or
+ * the error from __alx_open().
+ */
+static int alx_resume(struct device *dev)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+	struct alx_priv *alx = pci_get_drvdata(pdev);
+	struct net_device *netdev = alx->dev;
+	struct alx_hw *hw = &alx->hw;
+	int err;
+
+	pci_set_power_state(pdev, PCI_D0);
+	pci_restore_state(pdev);
+	pci_save_state(pdev);
+
+	pci_enable_wake(pdev, PCI_D3hot, 0);
+	pci_enable_wake(pdev, PCI_D3cold, 0);
+
+	/* start from the same software state alx_init_sw() sets up */
+	hw->link_speed = SPEED_UNKNOWN;
+	alx->int_mask = ALX_ISR_MISC;
+
+	alx_reset_pcie(hw);
+	alx_reset_phy(hw);
+
+	err = alx_reset_mac(hw);
+	if (err) {
+		netif_err(alx, hw, alx->dev,
+			  "resume:reset_mac fail %d\n", err);
+		return -EIO;
+	}
+
+	err = alx_setup_speed_duplex(hw, hw->adv_cfg, hw->flowctrl);
+	if (err) {
+		netif_err(alx, hw, alx->dev,
+			  "resume:setup_speed_duplex fail %d\n", err);
+		return -EIO;
+	}
+
+	if (netif_running(netdev)) {
+		err = __alx_open(alx, true);
+		if (err)
+			return err;
+	}
+
+	netif_device_attach(netdev);
+
+	/* err is 0 on every path that reaches this point */
+	return err;
+}
+#endif
+
+/* PCI error handler (.error_detected): detach and halt a running
+ * interface, then either ask for a slot reset or, on permanent failure,
+ * report that the device is gone.
+ */
+static pci_ers_result_t alx_pci_error_detected(struct pci_dev *pdev,
+					       pci_channel_state_t state)
+{
+	struct alx_priv *alx = pci_get_drvdata(pdev);
+	struct net_device *netdev = alx->dev;
+	pci_ers_result_t rc = PCI_ERS_RESULT_NEED_RESET;
+
+	dev_info(&pdev->dev, "pci error detected\n");
+
+	rtnl_lock();
+
+	if (netif_running(netdev)) {
+		netif_device_detach(netdev);
+		alx_halt(alx);
+	}
+
+	/* the device is only disabled when a reset will be attempted */
+	if (state == pci_channel_io_perm_failure)
+		rc = PCI_ERS_RESULT_DISCONNECT;
+	else
+		pci_disable_device(pdev);
+
+	rtnl_unlock();
+
+	return rc;
+}
+
+/* PCI error handler (.slot_reset): re-enable the device after the slot
+ * reset and reset PCIe and MAC.
+ *
+ * Returns PCI_ERS_RESULT_RECOVERED when the MAC reset succeeds,
+ * PCI_ERS_RESULT_DISCONNECT otherwise.
+ */
+static pci_ers_result_t alx_pci_error_slot_reset(struct pci_dev *pdev)
+{
+	struct alx_priv *alx = pci_get_drvdata(pdev);
+	struct alx_hw *hw = &alx->hw;
+	pci_ers_result_t rc = PCI_ERS_RESULT_DISCONNECT;
+
+	dev_info(&pdev->dev, "pci error slot reset\n");
+
+	rtnl_lock();
+
+	if (pci_enable_device(pdev)) {
+		dev_err(&pdev->dev, "Failed to re-enable PCI device after reset\n");
+		goto out;
+	}
+
+	pci_set_master(pdev);
+	pci_enable_wake(pdev, PCI_D3hot, 0);
+	pci_enable_wake(pdev, PCI_D3cold, 0);
+
+	alx_reset_pcie(hw);
+	if (!alx_reset_mac(hw))
+		rc = PCI_ERS_RESULT_RECOVERED;
+out:
+	/* the AER status is cleared on both the success and failure path */
+	pci_cleanup_aer_uncorrect_error_status(pdev);
+
+	rtnl_unlock();
+
+	return rc;
+}
+
+/* PCI error handler (.resume): reactivate and reattach the interface if
+ * it was running when the error hit.
+ */
+static void alx_pci_error_resume(struct pci_dev *pdev)
+{
+	struct alx_priv *alx = pci_get_drvdata(pdev);
+	struct net_device *netdev = alx->dev;
+
+	dev_info(&pdev->dev, "pci error resume\n");
+
+	rtnl_lock();
+
+	if (netif_running(netdev)) {
+		alx_activate(alx);
+		netif_device_attach(netdev);
+	}
+
+	rtnl_unlock();
+}
+
+/* PCI error recovery callbacks, referenced from alx_driver.err_handler. */
+static const struct pci_error_handlers alx_err_handlers = {
+	.error_detected = alx_pci_error_detected,
+	.slot_reset     = alx_pci_error_slot_reset,
+	.resume         = alx_pci_error_resume,
+};
+
+/* ALX_PM_OPS is the dev_pm_ops pointer used by alx_driver: the
+ * suspend/resume pair when CONFIG_PM_SLEEP is set, NULL otherwise.
+ */
+#ifdef CONFIG_PM_SLEEP
+static SIMPLE_DEV_PM_OPS(alx_pm_ops, alx_suspend, alx_resume);
+#define ALX_PM_OPS      (&alx_pm_ops)
+#else
+#define ALX_PM_OPS      NULL
+#endif
+
+/* Supported PCI devices. driver_data carries per-device quirk flags that
+ * alx_probe() tests (ALX_DEV_QUIRK_MSI_INTX_DISABLE_BUG).
+ */
+static DEFINE_PCI_DEVICE_TABLE(alx_pci_tbl) = {
+	{ PCI_VDEVICE(ATTANSIC, ALX_DEV_ID_AR8161),
+	  .driver_data = ALX_DEV_QUIRK_MSI_INTX_DISABLE_BUG },
+	{ PCI_VDEVICE(ATTANSIC, ALX_DEV_ID_E2200),
+	  .driver_data = ALX_DEV_QUIRK_MSI_INTX_DISABLE_BUG },
+	{ PCI_VDEVICE(ATTANSIC, ALX_DEV_ID_AR8162),
+	  .driver_data = ALX_DEV_QUIRK_MSI_INTX_DISABLE_BUG },
+	{ PCI_VDEVICE(ATTANSIC, ALX_DEV_ID_AR8171) },
+	{ PCI_VDEVICE(ATTANSIC, ALX_DEV_ID_AR8172) },
+	{}
+};
+
+/* PCI driver description; registered through module_pci_driver(). */
+static struct pci_driver alx_driver = {
+	.name        = alx_drv_name,
+	.id_table    = alx_pci_tbl,
+	.probe       = alx_probe,
+	.remove      = alx_remove,
+	.shutdown    = alx_shutdown,
+	.err_handler = &alx_err_handlers,
+	.driver.pm   = ALX_PM_OPS,
+};
+
+module_pci_driver(alx_driver);
+MODULE_DEVICE_TABLE(pci, alx_pci_tbl);
+MODULE_AUTHOR("Johannes Berg <johannes@sipsolutions.net>");
+MODULE_AUTHOR("Qualcomm Corporation, <nic-devel@qualcomm.com>");
+MODULE_DESCRIPTION(
+	"Qualcomm Atheros(R) AR816x/AR817x PCI-E Ethernet Network Driver");
+MODULE_LICENSE("GPL");

diff --git a/drivers/net/ethernet/atheros/alx/reg.h b/drivers/net/ethernet/atheros/alx/reg.h
new file mode 100644
index 0000000..e4358c9
--- /dev/null
+++ b/drivers/net/ethernet/atheros/alx/reg.h

@@ -0,0 +1,810 @@
+/*
+ * Copyright (c) 2013 Johannes Berg <johannes@sipsolutions.net>
+ *
+ *  This file is free software: you may copy, redistribute and/or modify it
+ *  under the terms of the GNU General Public License as published by the
+ *  Free Software Foundation, either version 2 of the License, or (at your
+ *  option) any later version.
+ *
+ *  This file is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * This file incorporates work covered by the following copyright and
+ * permission notice:
+ *
+ * Copyright (c) 2012 Qualcomm Atheros, Inc.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef ALX_REG_H
+#define ALX_REG_H
+
+#define ALX_DEV_ID_AR8161				0x1091
+#define ALX_DEV_ID_E2200				0xe091
+#define ALX_DEV_ID_AR8162				0x1090
+#define ALX_DEV_ID_AR8171				0x10A1
+#define ALX_DEV_ID_AR8172				0x10A0
+
+/* rev definition,
+ * bit(0): with xD support
+ * bit(1): with Card Reader function
+ * bit(7:2): real revision
+ */
+#define ALX_PCI_REVID_SHIFT				3
+#define ALX_REV_A0					0
+#define ALX_REV_A1					1
+#define ALX_REV_B0					2
+#define ALX_REV_C0					3
+
+#define ALX_DEV_CTRL					0x0060
+#define ALX_DEV_CTRL_MAXRRS_MIN				2
+
+#define ALX_MSIX_MASK					0x0090
+
+#define ALX_UE_SVRT					0x010C
+#define ALX_UE_SVRT_FCPROTERR				BIT(13)
+#define ALX_UE_SVRT_DLPROTERR				BIT(4)
+
+/* eeprom & flash load register */
+#define ALX_EFLD					0x0204
+#define ALX_EFLD_F_EXIST				BIT(10)
+#define ALX_EFLD_E_EXIST				BIT(9)
+#define ALX_EFLD_STAT					BIT(5)
+#define ALX_EFLD_START					BIT(0)
+
+/* eFuse load register */
+#define ALX_SLD						0x0218
+#define ALX_SLD_STAT					BIT(12)
+#define ALX_SLD_START					BIT(11)
+#define ALX_SLD_MAX_TO					100
+
+#define ALX_PDLL_TRNS1					0x1104
+#define ALX_PDLL_TRNS1_D3PLLOFF_EN			BIT(11)
+
+#define ALX_PMCTRL					0x12F8
+#define ALX_PMCTRL_HOTRST_WTEN				BIT(31)
+/* bit30: L0s/L1 controlled by MAC based on throughput(setting in 15A0) */
+#define ALX_PMCTRL_ASPM_FCEN				BIT(30)
+#define ALX_PMCTRL_SADLY_EN				BIT(29)
+#define ALX_PMCTRL_LCKDET_TIMER_MASK			0xF
+#define ALX_PMCTRL_LCKDET_TIMER_SHIFT			24
+#define ALX_PMCTRL_LCKDET_TIMER_DEF			0xC
+/* bit[23:20] if pm_request_l1 time > @, then enter L0s not L1 */
+#define ALX_PMCTRL_L1REQ_TO_MASK			0xF
+#define ALX_PMCTRL_L1REQ_TO_SHIFT			20
+#define ALX_PMCTRL_L1REG_TO_DEF				0xF
+#define ALX_PMCTRL_TXL1_AFTER_L0S			BIT(19)
+#define ALX_PMCTRL_L1_TIMER_MASK			0x7
+#define ALX_PMCTRL_L1_TIMER_SHIFT			16
+#define ALX_PMCTRL_L1_TIMER_16US			4
+#define ALX_PMCTRL_RCVR_WT_1US				BIT(15)
+/* bit13: enable pcie clk switch in L1 state */
+#define ALX_PMCTRL_L1_CLKSW_EN				BIT(13)
+#define ALX_PMCTRL_L0S_EN				BIT(12)
+#define ALX_PMCTRL_RXL1_AFTER_L0S			BIT(11)
+#define ALX_PMCTRL_L1_BUFSRX_EN				BIT(7)
+/* bit6: power down serdes RX */
+#define ALX_PMCTRL_L1_SRDSRX_PWD			BIT(6)
+#define ALX_PMCTRL_L1_SRDSPLL_EN			BIT(5)
+#define ALX_PMCTRL_L1_SRDS_EN				BIT(4)
+#define ALX_PMCTRL_L1_EN				BIT(3)
+
+/*******************************************************/
+/* following registers are mapped only to memory space */
+/*******************************************************/
+
+#define ALX_MASTER					0x1400
+/* bit12: 1:always select pclk from serdes, not sw to 25M */
+#define ALX_MASTER_PCLKSEL_SRDS				BIT(12)
+/* bit11: irq moderation for rx */
+#define ALX_MASTER_IRQMOD2_EN				BIT(11)
+/* bit10: irq moderation for tx/rx */
+#define ALX_MASTER_IRQMOD1_EN				BIT(10)
+#define ALX_MASTER_SYSALVTIMER_EN			BIT(7)
+#define ALX_MASTER_OOB_DIS				BIT(6)
+/* bit5: wakeup without pcie clk */
+#define ALX_MASTER_WAKEN_25M				BIT(5)
+/* bit0: MAC & DMA reset */
+#define ALX_MASTER_DMA_MAC_RST				BIT(0)
+#define ALX_DMA_MAC_RST_TO				50
+
+#define ALX_IRQ_MODU_TIMER				0x1408
+#define ALX_IRQ_MODU_TIMER1_MASK			0xFFFF
+#define ALX_IRQ_MODU_TIMER1_SHIFT			0
+
+#define ALX_PHY_CTRL					0x140C
+#define ALX_PHY_CTRL_100AB_EN				BIT(17)
+/* bit14: affect MAC & PHY, go to low power sts */
+#define ALX_PHY_CTRL_POWER_DOWN				BIT(14)
+/* bit13: 1:pll always ON, 0:can switch in lpw */
+#define ALX_PHY_CTRL_PLL_ON				BIT(13)
+#define ALX_PHY_CTRL_RST_ANALOG				BIT(12)
+#define ALX_PHY_CTRL_HIB_PULSE				BIT(11)
+#define ALX_PHY_CTRL_HIB_EN				BIT(10)
+#define ALX_PHY_CTRL_IDDQ				BIT(7)
+#define ALX_PHY_CTRL_GATE_25M				BIT(5)
+#define ALX_PHY_CTRL_LED_MODE				BIT(2)
+/* bit0: out of dsp RST state */
+#define ALX_PHY_CTRL_DSPRST_OUT				BIT(0)
+#define ALX_PHY_CTRL_DSPRST_TO				80
+#define ALX_PHY_CTRL_CLS	(ALX_PHY_CTRL_LED_MODE | \
+				 ALX_PHY_CTRL_100AB_EN | \
+				 ALX_PHY_CTRL_PLL_ON)
+
+#define ALX_MAC_STS					0x1410
+#define ALX_MAC_STS_TXQ_BUSY				BIT(3)
+#define ALX_MAC_STS_RXQ_BUSY				BIT(2)
+#define ALX_MAC_STS_TXMAC_BUSY				BIT(1)
+#define ALX_MAC_STS_RXMAC_BUSY				BIT(0)
+#define ALX_MAC_STS_IDLE	(ALX_MAC_STS_TXQ_BUSY | \
+				 ALX_MAC_STS_RXQ_BUSY | \
+				 ALX_MAC_STS_TXMAC_BUSY | \
+				 ALX_MAC_STS_RXMAC_BUSY)
+
+#define ALX_MDIO					0x1414
+#define ALX_MDIO_MODE_EXT				BIT(30)
+#define ALX_MDIO_BUSY					BIT(27)
+#define ALX_MDIO_CLK_SEL_MASK				0x7
+#define ALX_MDIO_CLK_SEL_SHIFT				24
+#define ALX_MDIO_CLK_SEL_25MD4				0
+#define ALX_MDIO_CLK_SEL_25MD128			7
+#define ALX_MDIO_START					BIT(23)
+#define ALX_MDIO_SPRES_PRMBL				BIT(22)
+/* bit21: 1:read,0:write */
+#define ALX_MDIO_OP_READ				BIT(21)
+#define ALX_MDIO_REG_MASK				0x1F
+#define ALX_MDIO_REG_SHIFT				16
+#define ALX_MDIO_DATA_MASK				0xFFFF
+#define ALX_MDIO_DATA_SHIFT				0
+#define ALX_MDIO_MAX_AC_TO				120
+
+#define ALX_MDIO_EXTN					0x1448
+#define ALX_MDIO_EXTN_DEVAD_MASK			0x1F
+#define ALX_MDIO_EXTN_DEVAD_SHIFT			16
+#define ALX_MDIO_EXTN_REG_MASK				0xFFFF
+#define ALX_MDIO_EXTN_REG_SHIFT				0
+
+#define ALX_SERDES					0x1424
+#define ALX_SERDES_PHYCLK_SLWDWN			BIT(18)
+#define ALX_SERDES_MACCLK_SLWDWN			BIT(17)
+
+#define ALX_LPI_CTRL					0x1440
+#define ALX_LPI_CTRL_EN					BIT(0)
+
+/* for B0+, bit[13..] for C0+ */
+#define ALX_HRTBT_EXT_CTRL				0x1AD0
+#define L1F_HRTBT_EXT_CTRL_PERIOD_HIGH_MASK		0x3F
+#define L1F_HRTBT_EXT_CTRL_PERIOD_HIGH_SHIFT		24
+#define L1F_HRTBT_EXT_CTRL_SWOI_STARTUP_PKT_EN		BIT(23)
+#define L1F_HRTBT_EXT_CTRL_IOAC_2_FRAGMENTED		BIT(22)
+#define L1F_HRTBT_EXT_CTRL_IOAC_1_FRAGMENTED		BIT(21)
+#define L1F_HRTBT_EXT_CTRL_IOAC_1_KEEPALIVE_EN		BIT(20)
+#define L1F_HRTBT_EXT_CTRL_IOAC_1_HAS_VLAN		BIT(19)
+#define L1F_HRTBT_EXT_CTRL_IOAC_1_IS_8023		BIT(18)
+#define L1F_HRTBT_EXT_CTRL_IOAC_1_IS_IPV6		BIT(17)
+#define L1F_HRTBT_EXT_CTRL_IOAC_2_KEEPALIVE_EN		BIT(16)
+#define L1F_HRTBT_EXT_CTRL_IOAC_2_HAS_VLAN		BIT(15)
+#define L1F_HRTBT_EXT_CTRL_IOAC_2_IS_8023		BIT(14)
+#define L1F_HRTBT_EXT_CTRL_IOAC_2_IS_IPV6		BIT(13)
+#define ALX_HRTBT_EXT_CTRL_NS_EN			BIT(12)
+#define ALX_HRTBT_EXT_CTRL_FRAG_LEN_MASK		0xFF
+#define ALX_HRTBT_EXT_CTRL_FRAG_LEN_SHIFT		4
+#define ALX_HRTBT_EXT_CTRL_IS_8023			BIT(3)
+#define ALX_HRTBT_EXT_CTRL_IS_IPV6			BIT(2)
+#define ALX_HRTBT_EXT_CTRL_WAKEUP_EN			BIT(1)
+#define ALX_HRTBT_EXT_CTRL_ARP_EN			BIT(0)
+
+#define ALX_HRTBT_REM_IPV4_ADDR				0x1AD4
+#define ALX_HRTBT_HOST_IPV4_ADDR			0x1478
+#define ALX_HRTBT_REM_IPV6_ADDR3			0x1AD8
+#define ALX_HRTBT_REM_IPV6_ADDR2			0x1ADC
+#define ALX_HRTBT_REM_IPV6_ADDR1			0x1AE0
+#define ALX_HRTBT_REM_IPV6_ADDR0			0x1AE4
+
+/* 1B8C ~ 1B94 for C0+ */
+#define ALX_SWOI_ACER_CTRL				0x1B8C
+#define ALX_SWOI_ORIG_ACK_NAK_EN			BIT(20)
+#define ALX_SWOI_ORIG_ACK_NAK_PKT_LEN_MASK		0XFF
+#define ALX_SWOI_ORIG_ACK_NAK_PKT_LEN_SHIFT		12
+#define ALX_SWOI_ORIG_ACK_ADDR_MASK			0XFFF
+#define ALX_SWOI_ORIG_ACK_ADDR_SHIFT			0
+
+#define ALX_SWOI_IOAC_CTRL_2				0x1B90
+#define ALX_SWOI_IOAC_CTRL_2_SWOI_1_FRAG_LEN_MASK	0xFF
+#define ALX_SWOI_IOAC_CTRL_2_SWOI_1_FRAG_LEN_SHIFT	24
+#define ALX_SWOI_IOAC_CTRL_2_SWOI_1_PKT_LEN_MASK	0xFFF
+#define ALX_SWOI_IOAC_CTRL_2_SWOI_1_PKT_LEN_SHIFT	12
+#define ALX_SWOI_IOAC_CTRL_2_SWOI_1_HDR_ADDR_MASK	0xFFF
+#define ALX_SWOI_IOAC_CTRL_2_SWOI_1_HDR_ADDR_SHIFT	0
+
+#define ALX_SWOI_IOAC_CTRL_3				0x1B94
+#define ALX_SWOI_IOAC_CTRL_3_SWOI_2_FRAG_LEN_MASK	0xFF
+#define ALX_SWOI_IOAC_CTRL_3_SWOI_2_FRAG_LEN_SHIFT	24
+#define ALX_SWOI_IOAC_CTRL_3_SWOI_2_PKT_LEN_MASK	0xFFF
+#define ALX_SWOI_IOAC_CTRL_3_SWOI_2_PKT_LEN_SHIFT	12
+#define ALX_SWOI_IOAC_CTRL_3_SWOI_2_HDR_ADDR_MASK	0xFFF
+#define ALX_SWOI_IOAC_CTRL_3_SWOI_2_HDR_ADDR_SHIFT	0
+
+/* for B0 */
+#define ALX_IDLE_DECISN_TIMER				0x1474
+/* 1ms */
+#define ALX_IDLE_DECISN_TIMER_DEF			0x400
+
+#define ALX_MAC_CTRL					0x1480
+#define ALX_MAC_CTRL_FAST_PAUSE				BIT(31)
+#define ALX_MAC_CTRL_WOLSPED_SWEN			BIT(30)
+/* bit29: 1:legacy(hi5b), 0:marvl(lo5b)*/
+#define ALX_MAC_CTRL_MHASH_ALG_HI5B			BIT(29)
+#define ALX_MAC_CTRL_BRD_EN				BIT(26)
+#define ALX_MAC_CTRL_MULTIALL_EN			BIT(25)
+#define ALX_MAC_CTRL_SPEED_MASK				0x3
+#define ALX_MAC_CTRL_SPEED_SHIFT			20
+#define ALX_MAC_CTRL_SPEED_10_100			1
+#define ALX_MAC_CTRL_SPEED_1000				2
+#define ALX_MAC_CTRL_PROMISC_EN				BIT(15)
+#define ALX_MAC_CTRL_VLANSTRIP				BIT(14)
+#define ALX_MAC_CTRL_PRMBLEN_MASK			0xF
+#define ALX_MAC_CTRL_PRMBLEN_SHIFT			10
+#define ALX_MAC_CTRL_PCRCE				BIT(7)
+#define ALX_MAC_CTRL_CRCE				BIT(6)
+#define ALX_MAC_CTRL_FULLD				BIT(5)
+#define ALX_MAC_CTRL_RXFC_EN				BIT(3)
+#define ALX_MAC_CTRL_TXFC_EN				BIT(2)
+#define ALX_MAC_CTRL_RX_EN				BIT(1)
+#define ALX_MAC_CTRL_TX_EN				BIT(0)
+
+#define ALX_STAD0					0x1488
+#define ALX_STAD1					0x148C
+
+#define ALX_HASH_TBL0					0x1490
+#define ALX_HASH_TBL1					0x1494
+
+#define ALX_MTU						0x149C
+#define ALX_MTU_JUMBO_TH				1514
+#define ALX_MTU_STD_ALGN				1536
+
+#define ALX_SRAM5					0x1524
+#define ALX_SRAM_RXF_LEN_MASK				0xFFF
+#define ALX_SRAM_RXF_LEN_SHIFT				0
+#define ALX_SRAM_RXF_LEN_8K				(8*1024)
+
+#define ALX_SRAM9					0x1534
+#define ALX_SRAM_LOAD_PTR				BIT(0)
+
+#define ALX_RX_BASE_ADDR_HI				0x1540
+
+#define ALX_TX_BASE_ADDR_HI				0x1544
+
+#define ALX_RFD_ADDR_LO					0x1550
+#define ALX_RFD_RING_SZ					0x1560
+#define ALX_RFD_BUF_SZ					0x1564
+
+#define ALX_RRD_ADDR_LO					0x1568
+#define ALX_RRD_RING_SZ					0x1578
+
+/* pri3: highest, pri0: lowest */
+#define ALX_TPD_PRI3_ADDR_LO				0x14E4
+#define ALX_TPD_PRI2_ADDR_LO				0x14E0
+#define ALX_TPD_PRI1_ADDR_LO				0x157C
+#define ALX_TPD_PRI0_ADDR_LO				0x1580
+
+/* producer index is 16bit */
+#define ALX_TPD_PRI3_PIDX				0x1618
+#define ALX_TPD_PRI2_PIDX				0x161A
+#define ALX_TPD_PRI1_PIDX				0x15F0
+#define ALX_TPD_PRI0_PIDX				0x15F2
+
+/* consumer index is 16bit */
+#define ALX_TPD_PRI3_CIDX				0x161C
+#define ALX_TPD_PRI2_CIDX				0x161E
+#define ALX_TPD_PRI1_CIDX				0x15F4
+#define ALX_TPD_PRI0_CIDX				0x15F6
+
+#define ALX_TPD_RING_SZ					0x1584
+
+#define ALX_TXQ0					0x1590
+#define ALX_TXQ0_TXF_BURST_PREF_MASK			0xFFFF
+#define ALX_TXQ0_TXF_BURST_PREF_SHIFT			16
+#define ALX_TXQ_TXF_BURST_PREF_DEF			0x200
+#define ALX_TXQ0_LSO_8023_EN				BIT(7)
+#define ALX_TXQ0_MODE_ENHANCE				BIT(6)
+#define ALX_TXQ0_EN					BIT(5)
+#define ALX_TXQ0_SUPT_IPOPT				BIT(4)
+#define ALX_TXQ0_TPD_BURSTPREF_MASK			0xF
+#define ALX_TXQ0_TPD_BURSTPREF_SHIFT			0
+#define ALX_TXQ_TPD_BURSTPREF_DEF			5
+
+#define ALX_TXQ1					0x1594
+/* bit11:  drop large packet, len > (rfd buf) */
+#define ALX_TXQ1_ERRLGPKT_DROP_EN			BIT(11)
+#define ALX_TXQ1_JUMBO_TSO_TH				(7*1024)
+
+#define ALX_RXQ0					0x15A0
+#define ALX_RXQ0_EN					BIT(31)
+#define ALX_RXQ0_RSS_HASH_EN				BIT(29)
+#define ALX_RXQ0_RSS_MODE_MASK				0x3
+#define ALX_RXQ0_RSS_MODE_SHIFT				26
+#define ALX_RXQ0_RSS_MODE_DIS				0
+#define ALX_RXQ0_RSS_MODE_MQMI				3
+#define ALX_RXQ0_NUM_RFD_PREF_MASK			0x3F
+#define ALX_RXQ0_NUM_RFD_PREF_SHIFT			20
+#define ALX_RXQ0_NUM_RFD_PREF_DEF			8
+#define ALX_RXQ0_IDT_TBL_SIZE_MASK			0x1FF
+#define ALX_RXQ0_IDT_TBL_SIZE_SHIFT			8
+#define ALX_RXQ0_IDT_TBL_SIZE_DEF			0x100
+#define ALX_RXQ0_IDT_TBL_SIZE_NORMAL			128
+#define ALX_RXQ0_IPV6_PARSE_EN				BIT(7)
+#define ALX_RXQ0_RSS_HSTYP_MASK				0xF
+#define ALX_RXQ0_RSS_HSTYP_SHIFT			2
+#define ALX_RXQ0_RSS_HSTYP_IPV6_TCP_EN			BIT(5)
+#define ALX_RXQ0_RSS_HSTYP_IPV6_EN			BIT(4)
+#define ALX_RXQ0_RSS_HSTYP_IPV4_TCP_EN			BIT(3)
+#define ALX_RXQ0_RSS_HSTYP_IPV4_EN			BIT(2)
+#define ALX_RXQ0_RSS_HSTYP_ALL		(ALX_RXQ0_RSS_HSTYP_IPV6_TCP_EN | \
+					 ALX_RXQ0_RSS_HSTYP_IPV4_TCP_EN | \
+					 ALX_RXQ0_RSS_HSTYP_IPV6_EN | \
+					 ALX_RXQ0_RSS_HSTYP_IPV4_EN)
+#define ALX_RXQ0_ASPM_THRESH_MASK			0x3
+#define ALX_RXQ0_ASPM_THRESH_SHIFT			0
+#define ALX_RXQ0_ASPM_THRESH_100M			3
+
+#define ALX_RXQ2					0x15A8
+#define ALX_RXQ2_RXF_XOFF_THRESH_MASK			0xFFF
+#define ALX_RXQ2_RXF_XOFF_THRESH_SHIFT			16
+#define ALX_RXQ2_RXF_XON_THRESH_MASK			0xFFF
+#define ALX_RXQ2_RXF_XON_THRESH_SHIFT			0
+/* Size = tx-packet(1522) + IPG(12) + SOF(8) + 64(Pause) + IPG(12) + SOF(8) +
+ *        rx-packet(1522) + delay-of-link(64)
+ *      = 3212.
+ */
+#define ALX_RXQ2_RXF_FLOW_CTRL_RSVD			3212
+
+#define ALX_DMA						0x15C0
+#define ALX_DMA_RCHNL_SEL_MASK				0x3
+#define ALX_DMA_RCHNL_SEL_SHIFT				26
+#define ALX_DMA_WDLY_CNT_MASK				0xF
+#define ALX_DMA_WDLY_CNT_SHIFT				16
+#define ALX_DMA_WDLY_CNT_DEF				4
+#define ALX_DMA_RDLY_CNT_MASK				0x1F
+#define ALX_DMA_RDLY_CNT_SHIFT				11
+#define ALX_DMA_RDLY_CNT_DEF				15
+/* bit10: 0:tpd with pri, 1: data */
+#define ALX_DMA_RREQ_PRI_DATA				BIT(10)
+#define ALX_DMA_RREQ_BLEN_MASK				0x7
+#define ALX_DMA_RREQ_BLEN_SHIFT				4
+#define ALX_DMA_RORDER_MODE_MASK			0x7
+#define ALX_DMA_RORDER_MODE_SHIFT			0
+#define ALX_DMA_RORDER_MODE_OUT				4
+
+#define ALX_WOL0					0x14A0
+#define ALX_WOL0_PME_LINK				BIT(5)
+#define ALX_WOL0_LINK_EN				BIT(4)
+#define ALX_WOL0_PME_MAGIC_EN				BIT(3)
+#define ALX_WOL0_MAGIC_EN				BIT(2)
+
+#define ALX_RFD_PIDX					0x15E0
+
+#define ALX_RFD_CIDX					0x15F8
+
+/* MIB */
+#define ALX_MIB_BASE					0x1700
+#define ALX_MIB_RX_OK					(ALX_MIB_BASE + 0)
+#define ALX_MIB_RX_ERRADDR				(ALX_MIB_BASE + 92)
+#define ALX_MIB_TX_OK					(ALX_MIB_BASE + 96)
+#define ALX_MIB_TX_MCCNT				(ALX_MIB_BASE + 192)
+
+#define ALX_RX_STATS_BIN				ALX_MIB_RX_OK
+#define ALX_RX_STATS_END				ALX_MIB_RX_ERRADDR
+#define ALX_TX_STATS_BIN				ALX_MIB_TX_OK
+#define ALX_TX_STATS_END				ALX_MIB_TX_MCCNT
+
+#define ALX_ISR						0x1600
+#define ALX_ISR_DIS					BIT(31)
+#define ALX_ISR_RX_Q7					BIT(30)
+#define ALX_ISR_RX_Q6					BIT(29)
+#define ALX_ISR_RX_Q5					BIT(28)
+#define ALX_ISR_RX_Q4					BIT(27)
+#define ALX_ISR_PCIE_LNKDOWN				BIT(26)
+#define ALX_ISR_RX_Q3					BIT(19)
+#define ALX_ISR_RX_Q2					BIT(18)
+#define ALX_ISR_RX_Q1					BIT(17)
+#define ALX_ISR_RX_Q0					BIT(16)
+#define ALX_ISR_TX_Q0					BIT(15)
+#define ALX_ISR_PHY					BIT(12)
+#define ALX_ISR_DMAW					BIT(10)
+#define ALX_ISR_DMAR					BIT(9)
+#define ALX_ISR_TXF_UR					BIT(8)
+#define ALX_ISR_TX_Q3					BIT(7)
+#define ALX_ISR_TX_Q2					BIT(6)
+#define ALX_ISR_TX_Q1					BIT(5)
+#define ALX_ISR_RFD_UR					BIT(4)
+#define ALX_ISR_RXF_OV					BIT(3)
+#define ALX_ISR_MANU					BIT(2)
+#define ALX_ISR_TIMER					BIT(1)
+#define ALX_ISR_SMB					BIT(0)
+
+#define ALX_IMR						0x1604
+
+/* re-send assert msg if SW no response */
+#define ALX_INT_RETRIG					0x1608
+/* 40ms */
+#define ALX_INT_RETRIG_TO				20000
+
+#define ALX_SMB_TIMER					0x15C4
+
+#define ALX_TINT_TPD_THRSHLD				0x15C8
+
+#define ALX_TINT_TIMER					0x15CC
+
+#define ALX_CLK_GATE					0x1814
+#define ALX_CLK_GATE_RXMAC				BIT(5)
+#define ALX_CLK_GATE_TXMAC				BIT(4)
+#define ALX_CLK_GATE_RXQ				BIT(3)
+#define ALX_CLK_GATE_TXQ				BIT(2)
+#define ALX_CLK_GATE_DMAR				BIT(1)
+#define ALX_CLK_GATE_DMAW				BIT(0)
+#define ALX_CLK_GATE_ALL		(ALX_CLK_GATE_RXMAC | \
+					 ALX_CLK_GATE_TXMAC | \
+					 ALX_CLK_GATE_RXQ | \
+					 ALX_CLK_GATE_TXQ | \
+					 ALX_CLK_GATE_DMAR | \
+					 ALX_CLK_GATE_DMAW)
+
+/* interop between drivers */
+#define ALX_DRV						0x1804
+#define ALX_DRV_PHY_AUTO				BIT(28)
+#define ALX_DRV_PHY_1000				BIT(27)
+#define ALX_DRV_PHY_100					BIT(26)
+#define ALX_DRV_PHY_10					BIT(25)
+#define ALX_DRV_PHY_DUPLEX				BIT(24)
+/* bit23: adv Pause */
+#define ALX_DRV_PHY_PAUSE				BIT(23)
+/* bit22: adv Asym Pause */
+#define ALX_DRV_PHY_MASK				0xFF
+#define ALX_DRV_PHY_SHIFT				21
+#define ALX_DRV_PHY_UNKNOWN				0
+
+/* flag of phy inited */
+#define ALX_PHY_INITED					0x003F
+
+/* reg 1830 ~ 186C for C0+, 16 bit map patterns and wake packet detection */
+#define ALX_WOL_CTRL2					0x1830
+#define ALX_WOL_CTRL2_DATA_STORE			BIT(3)
+#define ALX_WOL_CTRL2_PTRN_EVT				BIT(2)
+#define ALX_WOL_CTRL2_PME_PTRN_EN			BIT(1)
+#define ALX_WOL_CTRL2_PTRN_EN				BIT(0)
+
+#define ALX_WOL_CTRL3					0x1834
+#define ALX_WOL_CTRL3_PTRN_ADDR_MASK			0xFFFFF
+#define ALX_WOL_CTRL3_PTRN_ADDR_SHIFT			0
+
+#define ALX_WOL_CTRL4					0x1838
+#define ALX_WOL_CTRL4_PT15_MATCH			BIT(31)
+#define ALX_WOL_CTRL4_PT14_MATCH			BIT(30)
+#define ALX_WOL_CTRL4_PT13_MATCH			BIT(29)
+#define ALX_WOL_CTRL4_PT12_MATCH			BIT(28)
+#define ALX_WOL_CTRL4_PT11_MATCH			BIT(27)
+#define ALX_WOL_CTRL4_PT10_MATCH			BIT(26)
+#define ALX_WOL_CTRL4_PT9_MATCH				BIT(25)
+#define ALX_WOL_CTRL4_PT8_MATCH				BIT(24)
+#define ALX_WOL_CTRL4_PT7_MATCH				BIT(23)
+#define ALX_WOL_CTRL4_PT6_MATCH				BIT(22)
+#define ALX_WOL_CTRL4_PT5_MATCH				BIT(21)
+#define ALX_WOL_CTRL4_PT4_MATCH				BIT(20)
+#define ALX_WOL_CTRL4_PT3_MATCH				BIT(19)
+#define ALX_WOL_CTRL4_PT2_MATCH				BIT(18)
+#define ALX_WOL_CTRL4_PT1_MATCH				BIT(17)
+#define ALX_WOL_CTRL4_PT0_MATCH				BIT(16)
+#define ALX_WOL_CTRL4_PT15_EN				BIT(15)
+#define ALX_WOL_CTRL4_PT14_EN				BIT(14)
+#define ALX_WOL_CTRL4_PT13_EN				BIT(13)
+#define ALX_WOL_CTRL4_PT12_EN				BIT(12)
+#define ALX_WOL_CTRL4_PT11_EN				BIT(11)
+#define ALX_WOL_CTRL4_PT10_EN				BIT(10)
+#define ALX_WOL_CTRL4_PT9_EN				BIT(9)
+#define ALX_WOL_CTRL4_PT8_EN				BIT(8)
+#define ALX_WOL_CTRL4_PT7_EN				BIT(7)
+#define ALX_WOL_CTRL4_PT6_EN				BIT(6)
+#define ALX_WOL_CTRL4_PT5_EN				BIT(5)
+#define ALX_WOL_CTRL4_PT4_EN				BIT(4)
+#define ALX_WOL_CTRL4_PT3_EN				BIT(3)
+#define ALX_WOL_CTRL4_PT2_EN				BIT(2)
+#define ALX_WOL_CTRL4_PT1_EN				BIT(1)
+#define ALX_WOL_CTRL4_PT0_EN				BIT(0)
+
+#define ALX_WOL_CTRL5					0x183C
+#define ALX_WOL_CTRL5_PT3_LEN_MASK			0xFF
+#define ALX_WOL_CTRL5_PT3_LEN_SHIFT			24
+#define ALX_WOL_CTRL5_PT2_LEN_MASK			0xFF
+#define ALX_WOL_CTRL5_PT2_LEN_SHIFT			16
+#define ALX_WOL_CTRL5_PT1_LEN_MASK			0xFF
+#define ALX_WOL_CTRL5_PT1_LEN_SHIFT			8
+#define ALX_WOL_CTRL5_PT0_LEN_MASK			0xFF
+#define ALX_WOL_CTRL5_PT0_LEN_SHIFT			0
+
+#define ALX_WOL_CTRL6					0x1840
+#define ALX_WOL_CTRL5_PT7_LEN_MASK			0xFF
+#define ALX_WOL_CTRL5_PT7_LEN_SHIFT			24
+#define ALX_WOL_CTRL5_PT6_LEN_MASK			0xFF
+#define ALX_WOL_CTRL5_PT6_LEN_SHIFT			16
+#define ALX_WOL_CTRL5_PT5_LEN_MASK			0xFF
+#define ALX_WOL_CTRL5_PT5_LEN_SHIFT			8
+#define ALX_WOL_CTRL5_PT4_LEN_MASK			0xFF
+#define ALX_WOL_CTRL5_PT4_LEN_SHIFT			0
+
+#define ALX_WOL_CTRL7					0x1844
+#define ALX_WOL_CTRL5_PT11_LEN_MASK			0xFF
+#define ALX_WOL_CTRL5_PT11_LEN_SHIFT			24
+#define ALX_WOL_CTRL5_PT10_LEN_MASK			0xFF
+#define ALX_WOL_CTRL5_PT10_LEN_SHIFT			16
+#define ALX_WOL_CTRL5_PT9_LEN_MASK			0xFF
+#define ALX_WOL_CTRL5_PT9_LEN_SHIFT			8
+#define ALX_WOL_CTRL5_PT8_LEN_MASK			0xFF
+#define ALX_WOL_CTRL5_PT8_LEN_SHIFT			0
+
+#define ALX_WOL_CTRL8					0x1848
+#define ALX_WOL_CTRL5_PT15_LEN_MASK			0xFF
+#define ALX_WOL_CTRL5_PT15_LEN_SHIFT			24
+#define ALX_WOL_CTRL5_PT14_LEN_MASK			0xFF
+#define ALX_WOL_CTRL5_PT14_LEN_SHIFT			16
+#define ALX_WOL_CTRL5_PT13_LEN_MASK			0xFF
+#define ALX_WOL_CTRL5_PT13_LEN_SHIFT			8
+#define ALX_WOL_CTRL5_PT12_LEN_MASK			0xFF
+#define ALX_WOL_CTRL5_PT12_LEN_SHIFT			0
+
+#define ALX_ACER_FIXED_PTN0				0x1850
+#define ALX_ACER_FIXED_PTN0_MASK			0xFFFFFFFF
+#define ALX_ACER_FIXED_PTN0_SHIFT			0
+
+#define ALX_ACER_FIXED_PTN1				0x1854
+#define ALX_ACER_FIXED_PTN1_MASK			0xFFFF
+#define ALX_ACER_FIXED_PTN1_SHIFT			0
+
+#define ALX_ACER_RANDOM_NUM0				0x1858
+#define ALX_ACER_RANDOM_NUM0_MASK			0xFFFFFFFF
+#define ALX_ACER_RANDOM_NUM0_SHIFT			0
+
+#define ALX_ACER_RANDOM_NUM1				0x185C
+#define ALX_ACER_RANDOM_NUM1_MASK			0xFFFFFFFF
+#define ALX_ACER_RANDOM_NUM1_SHIFT			0
+
+#define ALX_ACER_RANDOM_NUM2				0x1860
+#define ALX_ACER_RANDOM_NUM2_MASK			0xFFFFFFFF
+#define ALX_ACER_RANDOM_NUM2_SHIFT			0
+
+#define ALX_ACER_RANDOM_NUM3				0x1864
+#define ALX_ACER_RANDOM_NUM3_MASK			0xFFFFFFFF
+#define ALX_ACER_RANDOM_NUM3_SHIFT			0
+
+#define ALX_ACER_MAGIC					0x1868
+#define ALX_ACER_MAGIC_EN				BIT(31)
+#define ALX_ACER_MAGIC_PME_EN				BIT(30)
+#define ALX_ACER_MAGIC_MATCH				BIT(29)
+#define ALX_ACER_MAGIC_FF_CHECK				BIT(10)
+#define ALX_ACER_MAGIC_RAN_LEN_MASK			0x1F
+#define ALX_ACER_MAGIC_RAN_LEN_SHIFT			5
+#define ALX_ACER_MAGIC_FIX_LEN_MASK			0x1F
+#define ALX_ACER_MAGIC_FIX_LEN_SHIFT			0
+
+#define ALX_ACER_TIMER					0x186C
+#define ALX_ACER_TIMER_EN				BIT(31)
+#define ALX_ACER_TIMER_PME_EN				BIT(30)
+#define ALX_ACER_TIMER_MATCH				BIT(29)
+#define ALX_ACER_TIMER_THRES_MASK			0x1FFFF
+#define ALX_ACER_TIMER_THRES_SHIFT			0
+#define ALX_ACER_TIMER_THRES_DEF			1
+
+/* RSS definitions */
+#define ALX_RSS_KEY0					0x14B0
+#define ALX_RSS_KEY1					0x14B4
+#define ALX_RSS_KEY2					0x14B8
+#define ALX_RSS_KEY3					0x14BC
+#define ALX_RSS_KEY4					0x14C0
+#define ALX_RSS_KEY5					0x14C4
+#define ALX_RSS_KEY6					0x14C8
+#define ALX_RSS_KEY7					0x14CC
+#define ALX_RSS_KEY8					0x14D0
+#define ALX_RSS_KEY9					0x14D4
+
+#define ALX_RSS_IDT_TBL0				0x1B00
+
+#define ALX_MSI_MAP_TBL1				0x15D0
+#define ALX_MSI_MAP_TBL1_TXQ1_SHIFT			20
+#define ALX_MSI_MAP_TBL1_TXQ0_SHIFT			16
+#define ALX_MSI_MAP_TBL1_RXQ3_SHIFT			12
+#define ALX_MSI_MAP_TBL1_RXQ2_SHIFT			8
+#define ALX_MSI_MAP_TBL1_RXQ1_SHIFT			4
+#define ALX_MSI_MAP_TBL1_RXQ0_SHIFT			0
+
+#define ALX_MSI_MAP_TBL2				0x15D8
+#define ALX_MSI_MAP_TBL2_TXQ3_SHIFT			20
+#define ALX_MSI_MAP_TBL2_TXQ2_SHIFT			16
+#define ALX_MSI_MAP_TBL2_RXQ7_SHIFT			12
+#define ALX_MSI_MAP_TBL2_RXQ6_SHIFT			8
+#define ALX_MSI_MAP_TBL2_RXQ5_SHIFT			4
+#define ALX_MSI_MAP_TBL2_RXQ4_SHIFT			0
+
+#define ALX_MSI_ID_MAP					0x15D4
+
+#define ALX_MSI_RETRANS_TIMER				0x1920
+/* bit16: 1:line,0:standard */
+#define ALX_MSI_MASK_SEL_LINE				BIT(16)
+#define ALX_MSI_RETRANS_TM_MASK				0xFFFF
+#define ALX_MSI_RETRANS_TM_SHIFT			0
+
+/* CR DMA ctrl */
+
+/* TX QoS */
+#define ALX_WRR						0x1938
+#define ALX_WRR_PRI_MASK				0x3
+#define ALX_WRR_PRI_SHIFT				29
+#define ALX_WRR_PRI_RESTRICT_NONE			3
+#define ALX_WRR_PRI3_MASK				0x1F
+#define ALX_WRR_PRI3_SHIFT				24
+#define ALX_WRR_PRI2_MASK				0x1F
+#define ALX_WRR_PRI2_SHIFT				16
+#define ALX_WRR_PRI1_MASK				0x1F
+#define ALX_WRR_PRI1_SHIFT				8
+#define ALX_WRR_PRI0_MASK				0x1F
+#define ALX_WRR_PRI0_SHIFT				0
+
+#define ALX_HQTPD					0x193C
+#define ALX_HQTPD_BURST_EN				BIT(31)
+#define ALX_HQTPD_Q3_NUMPREF_MASK			0xF
+#define ALX_HQTPD_Q3_NUMPREF_SHIFT			8
+#define ALX_HQTPD_Q2_NUMPREF_MASK			0xF
+#define ALX_HQTPD_Q2_NUMPREF_SHIFT			4
+#define ALX_HQTPD_Q1_NUMPREF_MASK			0xF
+#define ALX_HQTPD_Q1_NUMPREF_SHIFT			0
+
+#define ALX_MISC					0x19C0
+#define ALX_MISC_PSW_OCP_MASK				0x7
+#define ALX_MISC_PSW_OCP_SHIFT				21
+#define ALX_MISC_PSW_OCP_DEF				0x7
+#define ALX_MISC_ISO_EN					BIT(12)
+#define ALX_MISC_INTNLOSC_OPEN				BIT(3)
+
+#define ALX_MSIC2					0x19C8
+#define ALX_MSIC2_CALB_START				BIT(0)
+
+#define ALX_MISC3					0x19CC
+/* bit1: 1:Software control 25M */
+#define ALX_MISC3_25M_BY_SW				BIT(1)
+/* bit0: 25M switch to intnl OSC */
+#define ALX_MISC3_25M_NOTO_INTNL			BIT(0)
+
+/* MSIX tbl in memory space */
+#define ALX_MSIX_ENTRY_BASE				0x2000
+
+/********************* PHY regs definition ***************************/
+
+/* PHY Specific Status Register */
+#define ALX_MII_GIGA_PSSR				0x11
+#define ALX_GIGA_PSSR_SPD_DPLX_RESOLVED			0x0800
+#define ALX_GIGA_PSSR_DPLX				0x2000
+#define ALX_GIGA_PSSR_SPEED				0xC000
+#define ALX_GIGA_PSSR_10MBS				0x0000
+#define ALX_GIGA_PSSR_100MBS				0x4000
+#define ALX_GIGA_PSSR_1000MBS				0x8000
+
+/* PHY Interrupt Enable Register */
+#define ALX_MII_IER					0x12
+#define ALX_IER_LINK_UP					0x0400
+#define ALX_IER_LINK_DOWN				0x0800
+
+/* PHY Interrupt Status Register */
+#define ALX_MII_ISR					0x13
+
+#define ALX_MII_DBG_ADDR				0x1D
+#define ALX_MII_DBG_DATA				0x1E
+
+/***************************** debug port *************************************/
+
+#define ALX_MIIDBG_ANACTRL				0x00
+#define ALX_ANACTRL_DEF					0x02EF
+
+#define ALX_MIIDBG_SYSMODCTRL				0x04
+/* en half bias */
+#define ALX_SYSMODCTRL_IECHOADJ_DEF			0xBB8B
+
+#define ALX_MIIDBG_SRDSYSMOD				0x05
+#define ALX_SRDSYSMOD_DEEMP_EN				0x0040
+#define ALX_SRDSYSMOD_DEF				0x2C46
+
+#define ALX_MIIDBG_HIBNEG				0x0B
+#define ALX_HIBNEG_PSHIB_EN				0x8000
+#define ALX_HIBNEG_HIB_PSE				0x1000
+#define ALX_HIBNEG_DEF					0xBC40
+#define ALX_HIBNEG_NOHIB	(ALX_HIBNEG_DEF & \
+				 ~(ALX_HIBNEG_PSHIB_EN | ALX_HIBNEG_HIB_PSE))
+
+#define ALX_MIIDBG_TST10BTCFG				0x12
+#define ALX_TST10BTCFG_DEF				0x4C04
+
+#define ALX_MIIDBG_AZ_ANADECT				0x15
+#define ALX_AZ_ANADECT_DEF				0x3220
+#define ALX_AZ_ANADECT_LONG				0x3210
+
+#define ALX_MIIDBG_MSE16DB				0x18
+#define ALX_MSE16DB_UP					0x05EA
+#define ALX_MSE16DB_DOWN				0x02EA
+
+#define ALX_MIIDBG_MSE20DB				0x1C
+#define ALX_MSE20DB_TH_MASK				0x7F
+#define ALX_MSE20DB_TH_SHIFT				2
+#define ALX_MSE20DB_TH_DEF				0x2E
+#define ALX_MSE20DB_TH_HI				0x54
+
+#define ALX_MIIDBG_AGC					0x23
+#define ALX_AGC_2_VGA_MASK				0x3FU
+#define ALX_AGC_2_VGA_SHIFT				8
+#define ALX_AGC_LONG1G_LIMT				40
+#define ALX_AGC_LONG100M_LIMT				44
+
+#define ALX_MIIDBG_LEGCYPS				0x29
+#define ALX_LEGCYPS_EN					0x8000
+#define ALX_LEGCYPS_DEF					0x129D
+
+#define ALX_MIIDBG_TST100BTCFG				0x36
+#define ALX_TST100BTCFG_DEF				0xE12C
+
+#define ALX_MIIDBG_GREENCFG				0x3B
+#define ALX_GREENCFG_DEF				0x7078
+
+#define ALX_MIIDBG_GREENCFG2				0x3D
+#define ALX_GREENCFG2_BP_GREEN				0x8000
+#define ALX_GREENCFG2_GATE_DFSE_EN			0x0080
+
+/******* dev 3 *********/
+#define ALX_MIIEXT_PCS					3
+
+#define ALX_MIIEXT_CLDCTRL3				0x8003
+#define ALX_CLDCTRL3_BP_CABLE1TH_DET_GT			0x8000
+
+#define ALX_MIIEXT_CLDCTRL5				0x8005
+#define ALX_CLDCTRL5_BP_VD_HLFBIAS			0x4000
+
+#define ALX_MIIEXT_CLDCTRL6				0x8006
+#define ALX_CLDCTRL6_CAB_LEN_MASK			0xFF
+#define ALX_CLDCTRL6_CAB_LEN_SHIFT			0
+#define ALX_CLDCTRL6_CAB_LEN_SHORT1G			116
+#define ALX_CLDCTRL6_CAB_LEN_SHORT100M			152
+
+#define ALX_MIIEXT_VDRVBIAS				0x8062
+#define ALX_VDRVBIAS_DEF				0x3
+
+/********* dev 7 **********/
+#define ALX_MIIEXT_ANEG					7
+
+#define ALX_MIIEXT_LOCAL_EEEADV				0x3C
+#define ALX_LOCAL_EEEADV_1000BT				0x0004
+#define ALX_LOCAL_EEEADV_100BT				0x0002
+
+#define ALX_MIIEXT_AFE					0x801A
+#define ALX_AFE_10BT_100M_TH				0x0040
+
+#define ALX_MIIEXT_S3DIG10				0x8023
+/* bit0: 1:bypass 10BT rx fifo, 0:original 10BT rx */
+#define ALX_MIIEXT_S3DIG10_SL				0x0001
+#define ALX_MIIEXT_S3DIG10_DEF				0
+
+#define ALX_MIIEXT_NLP78				0x8027
+#define ALX_MIIEXT_NLP78_120M_DEF			0x8A05
+
+#endif

diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c
index c777b90..a13463e 100644
--- a/drivers/net/ethernet/broadcom/tg3.c
+++ b/drivers/net/ethernet/broadcom/tg3.c

@@ -744,6 +744,9 @@
 		status = tg3_ape_read32(tp, gnt + off);
 		if (status == bit)
 			break;
+		if (pci_channel_offline(tp->pdev))
+			break;
+
 		udelay(10);
 	}
 
@@ -1635,6 +1638,9 @@
 	for (i = 0; i < delay_cnt; i++) {
 		if (!(tr32(GRC_RX_CPU_EVENT) & GRC_RX_CPU_DRIVER_EVENT))
 			break;
+		if (pci_channel_offline(tp->pdev))
+			break;
+
 		udelay(8);
 	}
 }
@@ -1813,6 +1819,9 @@
 		for (i = 0; i < 200; i++) {
 			if (tr32(VCPU_STATUS) & VCPU_STATUS_INIT_DONE)
 				return 0;
+			if (pci_channel_offline(tp->pdev))
+				return -ENODEV;
+
 			udelay(100);
 		}
 		return -ENODEV;
@@ -1823,6 +1832,15 @@
 		tg3_read_mem(tp, NIC_SRAM_FIRMWARE_MBOX, &val);
 		if (val == ~NIC_SRAM_FIRMWARE_MBOX_MAGIC1)
 			break;
+		if (pci_channel_offline(tp->pdev)) {
+			if (!tg3_flag(tp, NO_FWARE_REPORTED)) {
+				tg3_flag_set(tp, NO_FWARE_REPORTED);
+				netdev_info(tp->dev, "No firmware running\n");
+			}
+
+			break;
+		}
+
 		udelay(10);
 	}
 
@@ -3520,6 +3538,8 @@
 		tw32(cpu_base + CPU_MODE,  CPU_MODE_HALT);
 		if (tr32(cpu_base + CPU_MODE) & CPU_MODE_HALT)
 			break;
+		if (pci_channel_offline(tp->pdev))
+			return -EBUSY;
 	}
 
 	return (i == iters) ? -EBUSY : 0;
@@ -8589,6 +8609,14 @@
 	tw32_f(ofs, val);
 
 	for (i = 0; i < MAX_WAIT_CNT; i++) {
+		if (pci_channel_offline(tp->pdev)) {
+			dev_err(&tp->pdev->dev,
+				"tg3_stop_block device offline, "
+				"ofs=%lx enable_bit=%x\n",
+				ofs, enable_bit);
+			return -ENODEV;
+		}
+
 		udelay(100);
 		val = tr32(ofs);
 		if ((val & enable_bit) == 0)
@@ -8612,6 +8640,13 @@
 
 	tg3_disable_ints(tp);
 
+	if (pci_channel_offline(tp->pdev)) {
+		tp->rx_mode &= ~(RX_MODE_ENABLE | TX_MODE_ENABLE);
+		tp->mac_mode &= ~MAC_MODE_TDE_ENABLE;
+		err = -ENODEV;
+		goto err_no_dev;
+	}
+
 	tp->rx_mode &= ~RX_MODE_ENABLE;
 	tw32_f(MAC_RX_MODE, tp->rx_mode);
 	udelay(10);
@@ -8660,6 +8695,7 @@
 	err |= tg3_stop_block(tp, BUFMGR_MODE, BUFMGR_MODE_ENABLE, silent);
 	err |= tg3_stop_block(tp, MEMARB_MODE, MEMARB_MODE_ENABLE, silent);
 
+err_no_dev:
 	for (i = 0; i < tp->irq_cnt; i++) {
 		struct tg3_napi *tnapi = &tp->napi[i];
 		if (tnapi->hw_status)

diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c
index a667015..d48099f 100644
--- a/drivers/net/ethernet/freescale/fec_main.c
+++ b/drivers/net/ethernet/freescale/fec_main.c

@@ -516,6 +516,7 @@
 	/* Set MII speed */
 	writel(fep->phy_speed, fep->hwp + FEC_MII_SPEED);
 
+#if !defined(CONFIG_M5272)
 	/* set RX checksum */
 	val = readl(fep->hwp + FEC_RACC);
 	if (fep->csum_flags & FLAG_RX_CSUM_ENABLED)
@@ -523,6 +524,7 @@
 	else
 		val &= ~FEC_RACC_OPTIONS;
 	writel(val, fep->hwp + FEC_RACC);
+#endif
 
 	/*
 	 * The phy interface and speed need to get configured
@@ -575,6 +577,7 @@
 #endif
 	}
 
+#if !defined(CONFIG_M5272)
 	/* enable pause frame*/
 	if ((fep->pause_flag & FEC_PAUSE_FLAG_ENABLE) ||
 	    ((fep->pause_flag & FEC_PAUSE_FLAG_AUTONEG) &&
@@ -592,6 +595,7 @@
 	} else {
 		rcntl &= ~FEC_ENET_FCE;
 	}
+#endif /* !defined(CONFIG_M5272) */
 
 	writel(rcntl, fep->hwp + FEC_R_CNTRL);
 
@@ -1205,7 +1209,9 @@
 	/* mask with MAC supported features */
 	if (id_entry->driver_data & FEC_QUIRK_HAS_GBIT) {
 		phy_dev->supported &= PHY_GBIT_FEATURES;
+#if !defined(CONFIG_M5272)
 		phy_dev->supported |= SUPPORTED_Pause;
+#endif
 	}
 	else
 		phy_dev->supported &= PHY_BASIC_FEATURES;
@@ -1390,6 +1396,8 @@
 	}
 }
 
+#if !defined(CONFIG_M5272)
+
 static void fec_enet_get_pauseparam(struct net_device *ndev,
 				    struct ethtool_pauseparam *pause)
 {
@@ -1436,9 +1444,13 @@
 	return 0;
 }
 
+#endif /* !defined(CONFIG_M5272) */
+
 static const struct ethtool_ops fec_enet_ethtool_ops = {
+#if !defined(CONFIG_M5272)
 	.get_pauseparam		= fec_enet_get_pauseparam,
 	.set_pauseparam		= fec_enet_set_pauseparam,
+#endif
 	.get_settings		= fec_enet_get_settings,
 	.set_settings		= fec_enet_set_settings,
 	.get_drvinfo		= fec_enet_get_drvinfo,
@@ -1874,10 +1886,12 @@
 	/* setup board info structure */
 	fep = netdev_priv(ndev);
 
+#if !defined(CONFIG_M5272)
 	/* default enable pause frame auto negotiation */
 	if (pdev->id_entry &&
 	    (pdev->id_entry->driver_data & FEC_QUIRK_HAS_GBIT))
 		fep->pause_flag |= FEC_PAUSE_FLAG_AUTONEG;
+#endif
 
 	fep->hwp = devm_request_and_ioremap(&pdev->dev, r);
 	fep->pdev = pdev;

diff --git a/drivers/net/ethernet/marvell/mv643xx_eth.c b/drivers/net/ethernet/marvell/mv643xx_eth.c
index 2ad1494..d1cbfb1 100644
--- a/drivers/net/ethernet/marvell/mv643xx_eth.c
+++ b/drivers/net/ethernet/marvell/mv643xx_eth.c

@@ -1757,7 +1757,7 @@
 	memset(rxq->rx_desc_area, 0, size);
 
 	rxq->rx_desc_area_size = size;
-	rxq->rx_skb = kmalloc_array(rxq->rx_ring_size, sizeof(*rxq->rx_skb),
+	rxq->rx_skb = kcalloc(rxq->rx_ring_size, sizeof(*rxq->rx_skb),
 				    GFP_KERNEL);
 	if (rxq->rx_skb == NULL)
 		goto out_free;

diff --git a/drivers/net/ethernet/marvell/pxa168_eth.c b/drivers/net/ethernet/marvell/pxa168_eth.c
index 339bb32..1c8af8b 100644
--- a/drivers/net/ethernet/marvell/pxa168_eth.c
+++ b/drivers/net/ethernet/marvell/pxa168_eth.c

@@ -1015,7 +1015,7 @@
 	int rx_desc_num = pep->rx_ring_size;
 
 	/* Allocate RX skb rings */
-	pep->rx_skb = kmalloc(sizeof(*pep->rx_skb) * pep->rx_ring_size,
+	pep->rx_skb = kzalloc(sizeof(*pep->rx_skb) * pep->rx_ring_size,
 			     GFP_KERNEL);
 	if (!pep->rx_skb)
 		return -ENOMEM;
@@ -1076,7 +1076,7 @@
 	int size = 0, i = 0;
 	int tx_desc_num = pep->tx_ring_size;
 
-	pep->tx_skb = kmalloc(sizeof(*pep->tx_skb) * pep->tx_ring_size,
+	pep->tx_skb = kzalloc(sizeof(*pep->tx_skb) * pep->tx_ring_size,
 			     GFP_KERNEL);
 	if (!pep->tx_skb)
 		return -ENOMEM;

diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c
index 2f4a260..8a43499 100644
--- a/drivers/net/ethernet/mellanox/mlx4/main.c
+++ b/drivers/net/ethernet/mellanox/mlx4/main.c

@@ -632,6 +632,9 @@
 		dev->caps.cqe_size   = 32;
 	}
 
+	dev->caps.flags2 &= ~MLX4_DEV_CAP_FLAG2_TS;
+	mlx4_warn(dev, "Timestamping is not supported in slave mode.\n");
+
 	slave_adjust_steering_mode(dev, &dev_cap, &hca_param);
 
 	return 0;

diff --git a/drivers/net/ethernet/octeon/octeon_mgmt.c b/drivers/net/ethernet/octeon/octeon_mgmt.c
index 921729f..91a8a5d 100644
--- a/drivers/net/ethernet/octeon/octeon_mgmt.c
+++ b/drivers/net/ethernet/octeon/octeon_mgmt.c

@@ -46,17 +46,25 @@
 union mgmt_port_ring_entry {
 	u64 d64;
 	struct {
-		u64    reserved_62_63:2;
-		/* Length of the buffer/packet in bytes */
-		u64    len:14;
-		/* For TX, signals that the packet should be timestamped */
-		u64    tstamp:1;
-		/* The RX error code */
-		u64    code:7;
 #define RING_ENTRY_CODE_DONE 0xf
 #define RING_ENTRY_CODE_MORE 0x10
+#ifdef __BIG_ENDIAN_BITFIELD
+		u64 reserved_62_63:2;
+		/* Length of the buffer/packet in bytes */
+		u64 len:14;
+		/* For TX, signals that the packet should be timestamped */
+		u64 tstamp:1;
+		/* The RX error code */
+		u64 code:7;
 		/* Physical address of the buffer */
-		u64    addr:40;
+		u64 addr:40;
+#else
+		u64 addr:40;
+		u64 code:7;
+		u64 tstamp:1;
+		u64 len:14;
+		u64 reserved_62_63:2;
+#endif
 	} s;
 };
 
@@ -1141,10 +1149,13 @@
 		/* For compensation state to lock. */
 		ndelay(1040 * NS_PER_PHY_CLK);
 
-		/* Some Ethernet switches cannot handle standard
-		 * Interframe Gap, increase to 16 bytes.
+		/* Default Interframe Gaps are too small.  Recommended
+		 * workaround is:
+		 *
+		 * AGL_GMX_TX_IFG[IFG1]=14
+		 * AGL_GMX_TX_IFG[IFG2]=10
 		 */
-		cvmx_write_csr(CVMX_AGL_GMX_TX_IFG, 0x88);
+		cvmx_write_csr(CVMX_AGL_GMX_TX_IFG, 0xae);
 	}
 
 	octeon_mgmt_rx_fill_ring(netdev);

diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_ctx.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_ctx.c
index 43562c2..6acf82b 100644
--- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_ctx.c
+++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_ctx.c

@@ -642,7 +642,7 @@
 				qlcnic_83xx_config_intrpt(adapter, 0);
 		}
 		/* Allow dma queues to drain after context reset */
-		msleep(20);
+		mdelay(20);
 	}
 }
 

diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c
index 5e3982f..e29fe8d 100644
--- a/drivers/net/ethernet/renesas/sh_eth.c
+++ b/drivers/net/ethernet/renesas/sh_eth.c

@@ -380,8 +380,9 @@
 	.eesipr_value	= 0x01ff009f,
 
 	.tx_check	= EESR_FTC | EESR_CND | EESR_DLC | EESR_CD | EESR_RTO,
-	.eesr_err_check	= EESR_TWB | EESR_TABT | EESR_RABT | EESR_RDE |
-			  EESR_RFRMER | EESR_TFE | EESR_TDE | EESR_ECI,
+	.eesr_err_check	= EESR_TWB | EESR_TABT | EESR_RABT | EESR_RFE |
+			  EESR_RDE | EESR_RFRMER | EESR_TFE | EESR_TDE |
+			  EESR_ECI,
 	.tx_error_check	= EESR_TWB | EESR_TABT | EESR_TDE | EESR_TFE,
 
 	.apr		= 1,
@@ -427,8 +428,9 @@
 	.eesipr_value	= DMAC_M_RFRMER | DMAC_M_ECI | 0x01ff009f,
 
 	.tx_check	= EESR_FTC | EESR_CND | EESR_DLC | EESR_CD | EESR_RTO,
-	.eesr_err_check	= EESR_TWB | EESR_TABT | EESR_RABT | EESR_RDE |
-			  EESR_RFRMER | EESR_TFE | EESR_TDE | EESR_ECI,
+	.eesr_err_check	= EESR_TWB | EESR_TABT | EESR_RABT | EESR_RFE |
+			  EESR_RDE | EESR_RFRMER | EESR_TFE | EESR_TDE |
+			  EESR_ECI,
 	.tx_error_check	= EESR_TWB | EESR_TABT | EESR_TDE | EESR_TFE,
 
 	.apr		= 1,
@@ -478,8 +480,9 @@
 	.rmcr_value	= 0x00000001,
 
 	.tx_check	= EESR_FTC | EESR_CND | EESR_DLC | EESR_CD | EESR_RTO,
-	.eesr_err_check	= EESR_TWB | EESR_TABT | EESR_RABT | EESR_RDE |
-			  EESR_RFRMER | EESR_TFE | EESR_TDE | EESR_ECI,
+	.eesr_err_check	= EESR_TWB | EESR_TABT | EESR_RABT | EESR_RFE |
+			  EESR_RDE | EESR_RFRMER | EESR_TFE | EESR_TDE |
+			  EESR_ECI,
 	.tx_error_check	= EESR_TWB | EESR_TABT | EESR_TDE | EESR_TFE,
 
 	.apr		= 1,
@@ -592,9 +595,9 @@
 	.eesipr_value	= DMAC_M_RFRMER | DMAC_M_ECI | 0x003fffff,
 
 	.tx_check	= EESR_TC1 | EESR_FTC,
-	.eesr_err_check	= EESR_TWB1 | EESR_TWB | EESR_TABT | EESR_RABT | \
-			  EESR_RDE | EESR_RFRMER | EESR_TFE | EESR_TDE | \
-			  EESR_ECI,
+	.eesr_err_check	= EESR_TWB1 | EESR_TWB | EESR_TABT | EESR_RABT |
+			  EESR_RFE | EESR_RDE | EESR_RFRMER | EESR_TFE |
+			  EESR_TDE | EESR_ECI,
 	.tx_error_check	= EESR_TWB1 | EESR_TWB | EESR_TABT | EESR_TDE | \
 			  EESR_TFE,
 	.fdr_value	= 0x0000072f,
@@ -674,9 +677,9 @@
 	.eesipr_value	= DMAC_M_RFRMER | DMAC_M_ECI | 0x003fffff,
 
 	.tx_check	= EESR_TC1 | EESR_FTC,
-	.eesr_err_check	= EESR_TWB1 | EESR_TWB | EESR_TABT | EESR_RABT | \
-			  EESR_RDE | EESR_RFRMER | EESR_TFE | EESR_TDE | \
-			  EESR_ECI,
+	.eesr_err_check	= EESR_TWB1 | EESR_TWB | EESR_TABT | EESR_RABT |
+			  EESR_RFE | EESR_RDE | EESR_RFRMER | EESR_TFE |
+			  EESR_TDE | EESR_ECI,
 	.tx_error_check	= EESR_TWB1 | EESR_TWB | EESR_TABT | EESR_TDE | \
 			  EESR_TFE,
 
@@ -811,9 +814,9 @@
 	.eesipr_value	= DMAC_M_RFRMER | DMAC_M_ECI | 0x003fffff,
 
 	.tx_check	= EESR_TC1 | EESR_FTC,
-	.eesr_err_check	= EESR_TWB1 | EESR_TWB | EESR_TABT | EESR_RABT | \
-			  EESR_RDE | EESR_RFRMER | EESR_TFE | EESR_TDE | \
-			  EESR_ECI,
+	.eesr_err_check	= EESR_TWB1 | EESR_TWB | EESR_TABT | EESR_RABT |
+			  EESR_RFE | EESR_RDE | EESR_RFRMER | EESR_TFE |
+			  EESR_TDE | EESR_ECI,
 	.tx_error_check	= EESR_TWB1 | EESR_TWB | EESR_TABT | EESR_TDE | \
 			  EESR_TFE,
 
@@ -1549,11 +1552,12 @@
 
 ignore_link:
 	if (intr_status & EESR_TWB) {
-		/* Write buck end. unused write back interrupt */
-		if (intr_status & EESR_TABT)	/* Transmit Abort int */
+		/* Unused write back interrupt */
+		if (intr_status & EESR_TABT) {	/* Transmit Abort int */
 			ndev->stats.tx_aborted_errors++;
 			if (netif_msg_tx_err(mdp))
 				dev_err(&ndev->dev, "Transmit Abort\n");
+		}
 	}
 
 	if (intr_status & EESR_RABT) {

diff --git a/drivers/net/ethernet/renesas/sh_eth.h b/drivers/net/ethernet/renesas/sh_eth.h
index 1ddc9f2..62689a5 100644
--- a/drivers/net/ethernet/renesas/sh_eth.h
+++ b/drivers/net/ethernet/renesas/sh_eth.h

@@ -253,7 +253,7 @@
 
 #define DEFAULT_TX_CHECK	(EESR_FTC | EESR_CND | EESR_DLC | EESR_CD | \
 				 EESR_RTO)
-#define DEFAULT_EESR_ERR_CHECK	(EESR_TWB | EESR_TABT | EESR_RABT | \
+#define DEFAULT_EESR_ERR_CHECK	(EESR_TWB | EESR_TABT | EESR_RABT | EESR_RFE | \
 				 EESR_RDE | EESR_RFRMER | EESR_ADE | \
 				 EESR_TFE | EESR_TDE | EESR_ECI)
 #define DEFAULT_TX_ERROR_CHECK	(EESR_TWB | EESR_TABT | EESR_ADE | EESR_TDE | \

diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c
index 39e4cb3..4a14a94 100644
--- a/drivers/net/ethernet/sfc/efx.c
+++ b/drivers/net/ethernet/sfc/efx.c

@@ -2139,7 +2139,7 @@
 	struct efx_nic *efx = pci_get_drvdata(to_pci_dev(dev));
 	return sprintf(buf, "%d\n", efx->phy_type);
 }
-static DEVICE_ATTR(phy_type, 0644, show_phy_type, NULL);
+static DEVICE_ATTR(phy_type, 0444, show_phy_type, NULL);
 
 static int efx_register_netdev(struct efx_nic *efx)
 {

diff --git a/drivers/net/ethernet/stmicro/stmmac/common.h b/drivers/net/ethernet/stmicro/stmmac/common.h
index 7788fbe..9517697 100644
--- a/drivers/net/ethernet/stmicro/stmmac/common.h
+++ b/drivers/net/ethernet/stmicro/stmmac/common.h

@@ -297,8 +297,8 @@
 #define MAC_RNABLE_RX		0x00000004	/* Receiver Enable */
 
 /* Default LPI timers */
-#define STMMAC_DEFAULT_LIT_LS_TIMER	0x3E8
-#define STMMAC_DEFAULT_TWT_LS_TIMER	0x0
+#define STMMAC_DEFAULT_LIT_LS	0x3E8
+#define STMMAC_DEFAULT_TWT_LS	0x0
 
 #define STMMAC_CHAIN_MODE	0x1
 #define STMMAC_RING_MODE	0x2

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index ee919ca..e9eab29 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c

@@ -130,7 +130,7 @@
 static int eee_timer = STMMAC_DEFAULT_LPI_TIMER;
 module_param(eee_timer, int, S_IRUGO | S_IWUSR);
 MODULE_PARM_DESC(eee_timer, "LPI tx expiration time in msec");
-#define STMMAC_LPI_TIMER(x) (jiffies + msecs_to_jiffies(x))
+#define STMMAC_LPI_T(x) (jiffies + msecs_to_jiffies(x))
 
 /* By default the driver will use the ring mode to manage tx and rx descriptors
  * but passing this value so user can force to use the chain instead of the ring
@@ -288,7 +288,7 @@
 	struct stmmac_priv *priv = (struct stmmac_priv *)arg;
 
 	stmmac_enable_eee_mode(priv);
-	mod_timer(&priv->eee_ctrl_timer, STMMAC_LPI_TIMER(eee_timer));
+	mod_timer(&priv->eee_ctrl_timer, STMMAC_LPI_T(eee_timer));
 }
 
 /**
@@ -304,22 +304,34 @@
 {
 	bool ret = false;
 
+	/* Using PCS we cannot deal with the PHY registers at this stage,
+	 * so we do not support extra features like EEE.
+	 */
+	if ((priv->pcs == STMMAC_PCS_RGMII) || (priv->pcs == STMMAC_PCS_TBI) ||
+	    (priv->pcs == STMMAC_PCS_RTBI))
+		goto out;
+
 	/* MAC core supports the EEE feature. */
 	if (priv->dma_cap.eee) {
 		/* Check if the PHY supports EEE */
 		if (phy_init_eee(priv->phydev, 1))
 			goto out;
 
-		priv->eee_active = 1;
-		init_timer(&priv->eee_ctrl_timer);
-		priv->eee_ctrl_timer.function = stmmac_eee_ctrl_timer;
-		priv->eee_ctrl_timer.data = (unsigned long)priv;
-		priv->eee_ctrl_timer.expires = STMMAC_LPI_TIMER(eee_timer);
-		add_timer(&priv->eee_ctrl_timer);
+		if (!priv->eee_active) {
+			priv->eee_active = 1;
+			init_timer(&priv->eee_ctrl_timer);
+			priv->eee_ctrl_timer.function = stmmac_eee_ctrl_timer;
+			priv->eee_ctrl_timer.data = (unsigned long)priv;
+			priv->eee_ctrl_timer.expires = STMMAC_LPI_T(eee_timer);
+			add_timer(&priv->eee_ctrl_timer);
 
-		priv->hw->mac->set_eee_timer(priv->ioaddr,
-					     STMMAC_DEFAULT_LIT_LS_TIMER,
-					     priv->tx_lpi_timer);
+			priv->hw->mac->set_eee_timer(priv->ioaddr,
+						     STMMAC_DEFAULT_LIT_LS,
+						     priv->tx_lpi_timer);
+		} else
+			/* Set HW EEE according to the speed */
+			priv->hw->mac->set_eee_pls(priv->ioaddr,
+						   priv->phydev->link);
 
 		pr_info("stmmac: Energy-Efficient Ethernet initialized\n");
 
@@ -329,20 +341,6 @@
 	return ret;
 }
 
-/**
- * stmmac_eee_adjust: adjust HW EEE according to the speed
- * @priv: driver private structure
- * Description:
- *	When the EEE has been already initialised we have to
- *	modify the PLS bit in the LPI ctrl & status reg according
- *	to the PHY link status. For this reason.
- */
-static void stmmac_eee_adjust(struct stmmac_priv *priv)
-{
-	if (priv->eee_enabled)
-		priv->hw->mac->set_eee_pls(priv->ioaddr, priv->phydev->link);
-}
-
 /* stmmac_get_tx_hwtstamp: get HW TX timestamps
  * @priv: driver private structure
  * @entry : descriptor index to be used.
@@ -769,7 +767,10 @@
 	if (new_state && netif_msg_link(priv))
 		phy_print_status(phydev);
 
-	stmmac_eee_adjust(priv);
+	/* At this stage, it may be necessary to set up EEE or adjust some
+	 * MAC-related HW registers.
+	 */
+	priv->eee_enabled = stmmac_eee_init(priv);
 
 	spin_unlock_irqrestore(&priv->lock, flags);
 
@@ -1277,7 +1278,7 @@
 
 	if ((priv->eee_enabled) && (!priv->tx_path_in_lpi_mode)) {
 		stmmac_enable_eee_mode(priv);
-		mod_timer(&priv->eee_ctrl_timer, STMMAC_LPI_TIMER(eee_timer));
+		mod_timer(&priv->eee_ctrl_timer, STMMAC_LPI_T(eee_timer));
 	}
 	spin_unlock(&priv->tx_lock);
 }
@@ -1671,14 +1672,9 @@
 	if (priv->phydev)
 		phy_start(priv->phydev);
 
-	priv->tx_lpi_timer = STMMAC_DEFAULT_TWT_LS_TIMER;
+	priv->tx_lpi_timer = STMMAC_DEFAULT_TWT_LS;
 
-	/* Using PCS we cannot dial with the phy registers at this stage
-	 * so we do not support extra feature like EEE.
-	 */
-	if (priv->pcs != STMMAC_PCS_RGMII && priv->pcs != STMMAC_PCS_TBI &&
-	    priv->pcs != STMMAC_PCS_RTBI)
-		priv->eee_enabled = stmmac_eee_init(priv);
+	priv->eee_enabled = stmmac_eee_init(priv);
 
 	stmmac_init_tx_coalesce(priv);
 

diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
index 21a5b29..d1a769f 100644
--- a/drivers/net/ethernet/ti/cpsw.c
+++ b/drivers/net/ethernet/ti/cpsw.c

@@ -1679,7 +1679,7 @@
 	priv->rx_packet_max = max(rx_packet_max, 128);
 	priv->cpts = devm_kzalloc(&pdev->dev, sizeof(struct cpts), GFP_KERNEL);
 	priv->irq_enabled = true;
-	if (!ndev) {
+	if (!priv->cpts) {
 		pr_err("error allocating cpts\n");
 		goto clean_ndev_ret;
 	}
@@ -1973,9 +1973,12 @@
 {
 	struct platform_device	*pdev = to_platform_device(dev);
 	struct net_device	*ndev = platform_get_drvdata(pdev);
+	struct cpsw_priv	*priv = netdev_priv(ndev);
 
 	if (netif_running(ndev))
 		cpsw_ndo_stop(ndev);
+	soft_reset("sliver 0", &priv->slaves[0].sliver->soft_reset);
+	soft_reset("sliver 1", &priv->slaves[1].sliver->soft_reset);
 	pm_runtime_put_sync(&pdev->dev);
 
 	return 0;

diff --git a/drivers/net/ethernet/ti/davinci_cpdma.c b/drivers/net/ethernet/ti/davinci_cpdma.c
index 49dfd592..053c84f 100644
--- a/drivers/net/ethernet/ti/davinci_cpdma.c
+++ b/drivers/net/ethernet/ti/davinci_cpdma.c

@@ -705,6 +705,13 @@
 	}
 
 	buffer = dma_map_single(ctlr->dev, data, len, chan->dir);
+	ret = dma_mapping_error(ctlr->dev, buffer);
+	if (ret) {
+		cpdma_desc_free(ctlr->pool, desc, 1);
+		ret = -EINVAL;
+		goto unlock_ret;
+	}
+
 	mode = CPDMA_DESC_OWNER | CPDMA_DESC_SOP | CPDMA_DESC_EOP;
 	cpdma_desc_to_port(chan, mode, directed);
 

diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index ab2307b..4dccead 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c

@@ -285,7 +285,9 @@
 
 	skb->protocol = eth_type_trans(skb, net);
 	skb->ip_summed = CHECKSUM_NONE;
-	__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), packet->vlan_tci);
+	if (packet->vlan_tci & VLAN_TAG_PRESENT)
+		__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q),
+				       packet->vlan_tci);
 
 	net->stats.rx_packets++;
 	net->stats.rx_bytes += packet->total_data_buflen;

diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index 59e9605..b6dd6a7 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c

@@ -524,8 +524,10 @@
 			return -EMSGSIZE;
 		num_pages = get_user_pages_fast(base, size, 0, &page[i]);
 		if (num_pages != size) {
-			for (i = 0; i < num_pages; i++)
-				put_page(page[i]);
+			int j;
+
+			for (j = 0; j < num_pages; j++)
+				put_page(page[i + j]);
 			return -EFAULT;
 		}
 		truesize = size * PAGE_SIZE;

diff --git a/drivers/net/netconsole.c b/drivers/net/netconsole.c
index 59ac143..4f777ed9 100644
--- a/drivers/net/netconsole.c
+++ b/drivers/net/netconsole.c

@@ -40,6 +40,7 @@
 #include <linux/slab.h>
 #include <linux/console.h>
 #include <linux/moduleparam.h>
+#include <linux/kernel.h>
 #include <linux/string.h>
 #include <linux/netpoll.h>
 #include <linux/inet.h>

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index bfa9bb4..9c61f87 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c

@@ -1010,8 +1010,10 @@
 			return -EMSGSIZE;
 		num_pages = get_user_pages_fast(base, size, 0, &page[i]);
 		if (num_pages != size) {
-			for (i = 0; i < num_pages; i++)
-				put_page(page[i]);
+			int j;
+
+			for (j = 0; j < num_pages; j++)
+				put_page(page[i + j]);
 			return -EFAULT;
 		}
 		truesize = size * PAGE_SIZE;

diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c
index d095d0d..5645921 100644
--- a/drivers/net/usb/qmi_wwan.c
+++ b/drivers/net/usb/qmi_wwan.c

@@ -590,7 +590,13 @@
 	{QMI_GOBI1K_DEVICE(0x03f0, 0x1f1d)},	/* HP un2400 Gobi Modem Device */
 	{QMI_GOBI1K_DEVICE(0x04da, 0x250d)},	/* Panasonic Gobi Modem device */
 	{QMI_GOBI1K_DEVICE(0x413c, 0x8172)},	/* Dell Gobi Modem device */
-	{QMI_GOBI1K_DEVICE(0x1410, 0xa001)},	/* Novatel Gobi Modem device */
+	{QMI_GOBI1K_DEVICE(0x1410, 0xa001)},	/* Novatel/Verizon USB-1000 */
+	{QMI_GOBI1K_DEVICE(0x1410, 0xa002)},	/* Novatel Gobi Modem device */
+	{QMI_GOBI1K_DEVICE(0x1410, 0xa003)},	/* Novatel Gobi Modem device */
+	{QMI_GOBI1K_DEVICE(0x1410, 0xa004)},	/* Novatel Gobi Modem device */
+	{QMI_GOBI1K_DEVICE(0x1410, 0xa005)},	/* Novatel Gobi Modem device */
+	{QMI_GOBI1K_DEVICE(0x1410, 0xa006)},	/* Novatel Gobi Modem device */
+	{QMI_GOBI1K_DEVICE(0x1410, 0xa007)},	/* Novatel Gobi Modem device */
 	{QMI_GOBI1K_DEVICE(0x0b05, 0x1776)},	/* Asus Gobi Modem device */
 	{QMI_GOBI1K_DEVICE(0x19d2, 0xfff3)},	/* ONDA Gobi Modem device */
 	{QMI_GOBI1K_DEVICE(0x05c6, 0x9001)},	/* Generic Gobi Modem device */

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 3b1d2ee..57325f3 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c

@@ -565,18 +565,22 @@
 
 /* Watch incoming packets to learn mapping between Ethernet address
  * and Tunnel endpoint.
+ * Return true if packet is bogus and should be dropped.
  */
-static void vxlan_snoop(struct net_device *dev,
+static bool vxlan_snoop(struct net_device *dev,
 			__be32 src_ip, const u8 *src_mac)
 {
 	struct vxlan_dev *vxlan = netdev_priv(dev);
 	struct vxlan_fdb *f;
-	int err;
 
 	f = vxlan_find_mac(vxlan, src_mac);
 	if (likely(f)) {
 		if (likely(f->remote.remote_ip == src_ip))
-			return;
+			return false;
+
+		/* Don't migrate static entries, drop packets */
+		if (f->state & NUD_NOARP)
+			return true;
 
 		if (net_ratelimit())
 			netdev_info(dev,
@@ -588,14 +592,19 @@
 	} else {
 		/* learned new entry */
 		spin_lock(&vxlan->hash_lock);
-		err = vxlan_fdb_create(vxlan, src_mac, src_ip,
-				       NUD_REACHABLE,
-				       NLM_F_EXCL|NLM_F_CREATE,
-				       vxlan->dst_port,
-				       vxlan->default_dst.remote_vni,
-				       0, NTF_SELF);
+
+		/* close off race between vxlan_flush and incoming packets */
+		if (netif_running(dev))
+			vxlan_fdb_create(vxlan, src_mac, src_ip,
+					 NUD_REACHABLE,
+					 NLM_F_EXCL|NLM_F_CREATE,
+					 vxlan->dst_port,
+					 vxlan->default_dst.remote_vni,
+					 0, NTF_SELF);
 		spin_unlock(&vxlan->hash_lock);
 	}
+
+	return false;
 }
 
 
@@ -727,8 +736,9 @@
 			       vxlan->dev->dev_addr) == 0)
 		goto drop;
 
-	if (vxlan->flags & VXLAN_F_LEARN)
-		vxlan_snoop(skb->dev, oip->saddr, eth_hdr(skb)->h_source);
+	if ((vxlan->flags & VXLAN_F_LEARN) &&
+	    vxlan_snoop(skb->dev, oip->saddr, eth_hdr(skb)->h_source))
+		goto drop;
 
 	__skb_tunnel_rx(skb, vxlan->dev);
 	skb_reset_network_header(skb);
@@ -1151,9 +1161,11 @@
 		struct sk_buff *skb1;
 
 		skb1 = skb_clone(skb, GFP_ATOMIC);
-		rc1 = vxlan_xmit_one(skb1, dev, rdst, did_rsc);
-		if (rc == NETDEV_TX_OK)
-			rc = rc1;
+		if (skb1) {
+			rc1 = vxlan_xmit_one(skb1, dev, rdst, did_rsc);
+			if (rc == NETDEV_TX_OK)
+				rc = rc1;
+		}
 	}
 
 	rc1 = vxlan_xmit_one(skb, dev, rdst0, did_rsc);

diff --git a/drivers/net/wan/dlci.c b/drivers/net/wan/dlci.c
index 147614e..6a8a382 100644
--- a/drivers/net/wan/dlci.c
+++ b/drivers/net/wan/dlci.c

@@ -384,21 +384,37 @@
 	struct frad_local	*flp;
 	struct net_device	*master, *slave;
 	int			err;
+	bool			found = false;
+
+	rtnl_lock();
 
 	/* validate slave device */
 	master = __dev_get_by_name(&init_net, dlci->devname);
-	if (!master)
-		return -ENODEV;
+	if (!master) {
+		err = -ENODEV;
+		goto out;
+	}
+
+	list_for_each_entry(dlp, &dlci_devs, list) {
+		if (dlp->master == master) {
+			found = true;
+			break;
+		}
+	}
+	if (!found) {
+		err = -ENODEV;
+		goto out;
+	}
 
 	if (netif_running(master)) {
-		return -EBUSY;
+		err = -EBUSY;
+		goto out;
 	}
 
 	dlp = netdev_priv(master);
 	slave = dlp->slave;
 	flp = netdev_priv(slave);
 
-	rtnl_lock();
 	err = (*flp->deassoc)(slave, master);
 	if (!err) {
 		list_del(&dlp->list);
@@ -407,8 +423,8 @@
 
 		dev_put(slave);
 	}
+out:
 	rtnl_unlock();
-
 	return err;
 }
 

diff --git a/drivers/net/wireless/ath/ath9k/htc_drv_main.c b/drivers/net/wireless/ath/ath9k/htc_drv_main.c
index 0743a47..62f1b76 100644
--- a/drivers/net/wireless/ath/ath9k/htc_drv_main.c
+++ b/drivers/net/wireless/ath/ath9k/htc_drv_main.c

@@ -1174,7 +1174,7 @@
 		mutex_lock(&priv->htc_pm_lock);
 
 		priv->ps_idle = !!(conf->flags & IEEE80211_CONF_IDLE);
-		if (priv->ps_idle)
+		if (!priv->ps_idle)
 			chip_reset = true;
 
 		mutex_unlock(&priv->htc_pm_lock);

diff --git a/drivers/net/wireless/ath/ath9k/xmit.c b/drivers/net/wireless/ath/ath9k/xmit.c
index 1c9b1ba..83ab6be 100644
--- a/drivers/net/wireless/ath/ath9k/xmit.c
+++ b/drivers/net/wireless/ath/ath9k/xmit.c

@@ -1570,6 +1570,8 @@
 	    txq->axq_ampdu_depth >= ATH_AGGR_MIN_QDEPTH)
 		return;
 
+	rcu_read_lock();
+
 	ac = list_first_entry(&txq->axq_acq, struct ath_atx_ac, list);
 	last_ac = list_entry(txq->axq_acq.prev, struct ath_atx_ac, list);
 
@@ -1608,8 +1610,10 @@
 
 		if (ac == last_ac ||
 		    txq->axq_ampdu_depth >= ATH_AGGR_MIN_QDEPTH)
-			return;
+			break;
 	}
+
+	rcu_read_unlock();
 }
 
 /***********/

diff --git a/drivers/net/wireless/brcm80211/brcmfmac/dhd_linux.c b/drivers/net/wireless/brcm80211/brcmfmac/dhd_linux.c
index b98f223..2c59357 100644
--- a/drivers/net/wireless/brcm80211/brcmfmac/dhd_linux.c
+++ b/drivers/net/wireless/brcm80211/brcmfmac/dhd_linux.c

@@ -930,6 +930,10 @@
 			brcmf_fws_del_interface(ifp);
 			brcmf_fws_deinit(drvr);
 		}
+		if (drvr->iflist[0]) {
+			free_netdev(ifp->ndev);
+			drvr->iflist[0] = NULL;
+		}
 		if (p2p_ifp) {
 			free_netdev(p2p_ifp->ndev);
 			drvr->iflist[1] = NULL;

diff --git a/drivers/net/wireless/brcm80211/brcmsmac/main.c b/drivers/net/wireless/brcm80211/brcmsmac/main.c
index 28e7aee..9fd6f2f 100644
--- a/drivers/net/wireless/brcm80211/brcmsmac/main.c
+++ b/drivers/net/wireless/brcm80211/brcmsmac/main.c

@@ -3074,21 +3074,8 @@
  */
 static bool brcms_c_ps_allowed(struct brcms_c_info *wlc)
 {
-	/* disallow PS when one of the following global conditions meets */
-	if (!wlc->pub->associated)
-		return false;
-
-	/* disallow PS when one of these meets when not scanning */
-	if (wlc->filter_flags & FIF_PROMISC_IN_BSS)
-		return false;
-
-	if (wlc->bsscfg->type == BRCMS_TYPE_AP)
-		return false;
-
-	if (wlc->bsscfg->type == BRCMS_TYPE_ADHOC)
-		return false;
-
-	return true;
+	/* not supporting PS so always return false for now */
+	return false;
 }
 
 static void brcms_c_statsupd(struct brcms_c_info *wlc)

diff --git a/drivers/net/wireless/iwlegacy/3945-rs.c b/drivers/net/wireless/iwlegacy/3945-rs.c
index c9f197d..fe31590 100644
--- a/drivers/net/wireless/iwlegacy/3945-rs.c
+++ b/drivers/net/wireless/iwlegacy/3945-rs.c

@@ -816,6 +816,7 @@
 		rs_sta->last_txrate_idx = idx;
 		info->control.rates[0].idx = rs_sta->last_txrate_idx;
 	}
+	info->control.rates[0].count = 1;
 
 	D_RATE("leave: %d\n", idx);
 }

diff --git a/drivers/net/wireless/iwlegacy/4965-rs.c b/drivers/net/wireless/iwlegacy/4965-rs.c
index 1fc0b227..ed3c42a 100644
--- a/drivers/net/wireless/iwlegacy/4965-rs.c
+++ b/drivers/net/wireless/iwlegacy/4965-rs.c

@@ -2268,7 +2268,7 @@
 		info->control.rates[0].flags = 0;
 	}
 	info->control.rates[0].idx = rate_idx;
-
+	info->control.rates[0].count = 1;
 }
 
 static void *

diff --git a/drivers/net/wireless/iwlwifi/dvm/rs.c b/drivers/net/wireless/iwlwifi/dvm/rs.c
index 907bd6e..10fbb17 100644
--- a/drivers/net/wireless/iwlwifi/dvm/rs.c
+++ b/drivers/net/wireless/iwlwifi/dvm/rs.c

@@ -2799,7 +2799,7 @@
 		info->control.rates[0].flags = 0;
 	}
 	info->control.rates[0].idx = rate_idx;
-
+	info->control.rates[0].count = 1;
 }
 
 static void *rs_alloc_sta(void *priv_rate, struct ieee80211_sta *sta,

diff --git a/drivers/net/wireless/iwlwifi/dvm/rxon.c b/drivers/net/wireless/iwlwifi/dvm/rxon.c
index 707446f..cd1ad001 100644
--- a/drivers/net/wireless/iwlwifi/dvm/rxon.c
+++ b/drivers/net/wireless/iwlwifi/dvm/rxon.c

@@ -1378,7 +1378,7 @@
 	struct iwl_chain_noise_data *data = &priv->chain_noise_data;
 	int ret;
 
-	if (!(priv->calib_disabled & IWL_CHAIN_NOISE_CALIB_DISABLED))
+	if (priv->calib_disabled & IWL_CHAIN_NOISE_CALIB_DISABLED)
 		return;
 
 	if ((data->state == IWL_CHAIN_NOISE_ALIVE) &&

diff --git a/drivers/net/wireless/iwlwifi/iwl-drv.c b/drivers/net/wireless/iwlwifi/iwl-drv.c
index 39aad98..40fed1f 100644
--- a/drivers/net/wireless/iwlwifi/iwl-drv.c
+++ b/drivers/net/wireless/iwlwifi/iwl-drv.c

@@ -1000,10 +1000,12 @@
 	 */
 	if (load_module) {
 		err = request_module("%s", op->name);
+#ifdef CONFIG_IWLWIFI_OPMODE_MODULAR
 		if (err)
 			IWL_ERR(drv,
 				"failed to load module %s (error %d), is dynamic loading enabled?\n",
 				op->name, err);
+#endif
 	}
 	return;
 

diff --git a/drivers/net/wireless/iwlwifi/mvm/rs.c b/drivers/net/wireless/iwlwifi/mvm/rs.c
index 55334d5..b99fe31 100644
--- a/drivers/net/wireless/iwlwifi/mvm/rs.c
+++ b/drivers/net/wireless/iwlwifi/mvm/rs.c

@@ -2546,6 +2546,7 @@
 		info->control.rates[0].flags = 0;
 	}
 	info->control.rates[0].idx = rate_idx;
+	info->control.rates[0].count = 1;
 }
 
 static void *rs_alloc_sta(void *mvm_rate, struct ieee80211_sta *sta,

diff --git a/drivers/net/wireless/iwlwifi/mvm/tx.c b/drivers/net/wireless/iwlwifi/mvm/tx.c
index f212f16..48c1891 100644
--- a/drivers/net/wireless/iwlwifi/mvm/tx.c
+++ b/drivers/net/wireless/iwlwifi/mvm/tx.c

@@ -180,7 +180,8 @@
 		tx_cmd->tx_flags |= cpu_to_le32(TX_CMD_FLG_STA_RATE);
 		return;
 	} else if (ieee80211_is_back_req(fc)) {
-		tx_cmd->tx_flags |= cpu_to_le32(TX_CMD_FLG_STA_RATE);
+		tx_cmd->tx_flags |=
+			cpu_to_le32(TX_CMD_FLG_ACK | TX_CMD_FLG_BAR);
 	}
 
 	/* HT rate doesn't make sense for a non data frame */

diff --git a/drivers/net/wireless/rt2x00/rt2800lib.c b/drivers/net/wireless/rt2x00/rt2800lib.c
index b52d70c..72f32e5 100644
--- a/drivers/net/wireless/rt2x00/rt2800lib.c
+++ b/drivers/net/wireless/rt2x00/rt2800lib.c

@@ -3027,19 +3027,26 @@
 	 * TODO: we do not use +6 dBm option to do not increase power beyond
 	 * regulatory limit, however this could be utilized for devices with
 	 * CAPABILITY_POWER_LIMIT.
+	 *
+	 * TODO: add different temperature compensation code for RT3290 & RT5390
+	 * so that BBP_R1 can be used for those chips.
 	 */
-	rt2800_bbp_read(rt2x00dev, 1, &r1);
-	if (delta <= -12) {
-		power_ctrl = 2;
-		delta += 12;
-	} else if (delta <= -6) {
-		power_ctrl = 1;
-		delta += 6;
-	} else {
-		power_ctrl = 0;
+	if (!rt2x00_rt(rt2x00dev, RT3290) &&
+	    !rt2x00_rt(rt2x00dev, RT5390)) {
+		rt2800_bbp_read(rt2x00dev, 1, &r1);
+		if (delta <= -12) {
+			power_ctrl = 2;
+			delta += 12;
+		} else if (delta <= -6) {
+			power_ctrl = 1;
+			delta += 6;
+		} else {
+			power_ctrl = 0;
+		}
+		rt2x00_set_field8(&r1, BBP1_TX_POWER_CTRL, power_ctrl);
+		rt2800_bbp_write(rt2x00dev, 1, r1);
 	}
-	rt2x00_set_field8(&r1, BBP1_TX_POWER_CTRL, power_ctrl);
-	rt2800_bbp_write(rt2x00dev, 1, r1);
+
 	offset = TX_PWR_CFG_0;
 
 	for (i = 0; i < EEPROM_TXPOWER_BYRATE_SIZE; i += 2) {

diff --git a/drivers/pci/hotplug/acpiphp_glue.c b/drivers/pci/hotplug/acpiphp_glue.c
index 716aa93..59df857 100644
--- a/drivers/pci/hotplug/acpiphp_glue.c
+++ b/drivers/pci/hotplug/acpiphp_glue.c

@@ -61,6 +61,7 @@
 static void handle_hotplug_event_bridge (acpi_handle, u32, void *);
 static void acpiphp_sanitize_bus(struct pci_bus *bus);
 static void acpiphp_set_hpp_values(struct pci_bus *bus);
+static void hotplug_event_func(acpi_handle handle, u32 type, void *context);
 static void handle_hotplug_event_func(acpi_handle handle, u32 type, void *context);
 static void free_bridge(struct kref *kref);
 
@@ -147,7 +148,7 @@
 
 
 static const struct acpi_dock_ops acpiphp_dock_ops = {
-	.handler = handle_hotplug_event_func,
+	.handler = hotplug_event_func,
 };
 
 /* Check whether the PCI device is managed by native PCIe hotplug driver */
@@ -179,6 +180,20 @@
 	return true;
 }
 
+static void acpiphp_dock_init(void *data)
+{
+	struct acpiphp_func *func = data;
+
+	get_bridge(func->slot->bridge);
+}
+
+static void acpiphp_dock_release(void *data)
+{
+	struct acpiphp_func *func = data;
+
+	put_bridge(func->slot->bridge);
+}
+
 /* callback routine to register each ACPI PCI slot object */
 static acpi_status
 register_slot(acpi_handle handle, u32 lvl, void *context, void **rv)
@@ -298,7 +313,8 @@
 		 */
 		newfunc->flags &= ~FUNC_HAS_EJ0;
 		if (register_hotplug_dock_device(handle,
-			&acpiphp_dock_ops, newfunc))
+			&acpiphp_dock_ops, newfunc,
+			acpiphp_dock_init, acpiphp_dock_release))
 			dbg("failed to register dock device\n");
 
 		/* we need to be notified when dock events happen
@@ -670,6 +686,7 @@
 	struct pci_bus *bus = slot->bridge->pci_bus;
 	struct acpiphp_func *func;
 	int num, max, pass;
+	LIST_HEAD(add_list);
 
 	if (slot->flags & SLOT_ENABLED)
 		goto err_exit;
@@ -694,13 +711,15 @@
 				max = pci_scan_bridge(bus, dev, max, pass);
 				if (pass && dev->subordinate) {
 					check_hotplug_bridge(slot, dev);
-					pci_bus_size_bridges(dev->subordinate);
+					pcibios_resource_survey_bus(dev->subordinate);
+					__pci_bus_size_bridges(dev->subordinate,
+							       &add_list);
 				}
 			}
 		}
 	}
 
-	pci_bus_assign_resources(bus);
+	__pci_bus_assign_resources(bus, &add_list, NULL);
 	acpiphp_sanitize_bus(bus);
 	acpiphp_set_hpp_values(bus);
 	acpiphp_set_acpi_region(slot);
@@ -1065,22 +1084,12 @@
 	alloc_acpi_hp_work(handle, type, context, _handle_hotplug_event_bridge);
 }
 
-static void _handle_hotplug_event_func(struct work_struct *work)
+static void hotplug_event_func(acpi_handle handle, u32 type, void *context)
 {
-	struct acpiphp_func *func;
+	struct acpiphp_func *func = context;
 	char objname[64];
 	struct acpi_buffer buffer = { .length = sizeof(objname),
 				      .pointer = objname };
-	struct acpi_hp_work *hp_work;
-	acpi_handle handle;
-	u32 type;
-
-	hp_work = container_of(work, struct acpi_hp_work, work);
-	handle = hp_work->handle;
-	type = hp_work->type;
-	func = (struct acpiphp_func *)hp_work->context;
-
-	acpi_scan_lock_acquire();
 
 	acpi_get_name(handle, ACPI_FULL_PATHNAME, &buffer);
 
@@ -1113,6 +1122,18 @@
 		warn("notify_handler: unknown event type 0x%x for %s\n", type, objname);
 		break;
 	}
+}
+
+static void _handle_hotplug_event_func(struct work_struct *work)
+{
+	struct acpi_hp_work *hp_work;
+	struct acpiphp_func *func;
+
+	hp_work = container_of(work, struct acpi_hp_work, work);
+	func = hp_work->context;
+	acpi_scan_lock_acquire();
+
+	hotplug_event_func(hp_work->handle, hp_work->type, func);
 
 	acpi_scan_lock_release();
 	kfree(hp_work); /* allocated in handle_hotplug_event_func */

diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 68678ed..d1182c4 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h

@@ -202,6 +202,11 @@
 		    struct resource *res, unsigned int reg);
 int pci_resource_bar(struct pci_dev *dev, int resno, enum pci_bar_type *type);
 void pci_configure_ari(struct pci_dev *dev);
+void __ref __pci_bus_size_bridges(struct pci_bus *bus,
+			struct list_head *realloc_head);
+void __ref __pci_bus_assign_resources(const struct pci_bus *bus,
+				      struct list_head *realloc_head,
+				      struct list_head *fail_head);
 
 /**
  * pci_ari_enabled - query ARI forwarding status

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 16abaaa..d254e23 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c

@@ -1044,7 +1044,7 @@
 	;
 }
 
-static void __ref __pci_bus_size_bridges(struct pci_bus *bus,
+void __ref __pci_bus_size_bridges(struct pci_bus *bus,
 			struct list_head *realloc_head)
 {
 	struct pci_dev *dev;
@@ -1115,9 +1115,9 @@
 }
 EXPORT_SYMBOL(pci_bus_size_bridges);
 
-static void __ref __pci_bus_assign_resources(const struct pci_bus *bus,
-					 struct list_head *realloc_head,
-					 struct list_head *fail_head)
+void __ref __pci_bus_assign_resources(const struct pci_bus *bus,
+				      struct list_head *realloc_head,
+				      struct list_head *fail_head)
 {
 	struct pci_bus *b;
 	struct pci_dev *dev;

diff --git a/drivers/pcmcia/Kconfig b/drivers/pcmcia/Kconfig
index 1c63624..0c657d6a 100644
--- a/drivers/pcmcia/Kconfig
+++ b/drivers/pcmcia/Kconfig

@@ -242,7 +242,7 @@
 
 config PCMCIA_PROBE
 	bool
-	default y if ISA && !ARCH_SA1100 && !ARCH_CLPS711X && !PARISC
+	default y if ISA && !ARCH_SA1100 && !PARISC
 
 config M32R_PCC
 	bool "M32R PCMCIA I/F"
@@ -287,7 +287,7 @@
 
 config AT91_CF
 	tristate "AT91 CompactFlash Controller"
-	depends on PCMCIA && ARCH_AT91RM9200
+	depends on PCMCIA && ARCH_AT91
 	help
 	  Say Y here to support the CompactFlash controller on AT91 chips.
 	  Or choose M to compile the driver as a module named "at91_cf".

diff --git a/drivers/pcmcia/at91_cf.c b/drivers/pcmcia/at91_cf.c
index 01463c7..b8f5acf 100644
--- a/drivers/pcmcia/at91_cf.c
+++ b/drivers/pcmcia/at91_cf.c

@@ -18,13 +18,14 @@
 #include <linux/slab.h>
 #include <linux/gpio.h>
 #include <linux/platform_data/atmel.h>
+#include <linux/io.h>
+#include <linux/sizes.h>
+#include <linux/of.h>
+#include <linux/of_device.h>
+#include <linux/of_gpio.h>
 
 #include <pcmcia/ss.h>
 
-#include <mach/hardware.h>
-#include <asm/io.h>
-#include <asm/sizes.h>
-
 #include <mach/at91rm9200_mc.h>
 #include <mach/at91_ramc.h>
 
@@ -41,8 +42,6 @@
 
 /*--------------------------------------------------------------------------*/
 
-static const char driver_name[] = "at91_cf";
-
 struct at91_cf_socket {
 	struct pcmcia_socket	socket;
 
@@ -76,7 +75,7 @@
 		/* kick pccard as needed */
 		if (present != cf->present) {
 			cf->present = present;
-			pr_debug("%s: card %s\n", driver_name,
+			dev_dbg(&cf->pdev->dev, "card %s\n",
 					present ? "present" : "gone");
 			pcmcia_parse_events(&cf->socket, SS_DETECT);
 		}
@@ -100,9 +99,9 @@
 		int vcc	= gpio_is_valid(cf->board->vcc_pin);
 
 		*sp = SS_DETECT | SS_3VCARD;
-		if (!rdy || gpio_get_value(rdy))
+		if (!rdy || gpio_get_value(cf->board->irq_pin))
 			*sp |= SS_READY;
-		if (!vcc || gpio_get_value(vcc))
+		if (!vcc || gpio_get_value(cf->board->vcc_pin))
 			*sp |= SS_POWERON;
 	} else
 		*sp = 0;
@@ -120,22 +119,22 @@
 	/* switch Vcc if needed and possible */
 	if (gpio_is_valid(cf->board->vcc_pin)) {
 		switch (s->Vcc) {
-			case 0:
-				gpio_set_value(cf->board->vcc_pin, 0);
-				break;
-			case 33:
-				gpio_set_value(cf->board->vcc_pin, 1);
-				break;
-			default:
-				return -EINVAL;
+		case 0:
+			gpio_set_value(cf->board->vcc_pin, 0);
+			break;
+		case 33:
+			gpio_set_value(cf->board->vcc_pin, 1);
+			break;
+		default:
+			return -EINVAL;
 		}
 	}
 
 	/* toggle reset if needed */
 	gpio_set_value(cf->board->rst_pin, s->flags & SS_RESET);
 
-	pr_debug("%s: Vcc %d, io_irq %d, flags %04x csc %04x\n",
-		driver_name, s->Vcc, s->io_irq, s->flags, s->csc_mask);
+	dev_dbg(&cf->pdev->dev, "Vcc %d, io_irq %d, flags %04x csc %04x\n",
+				s->Vcc, s->io_irq, s->flags, s->csc_mask);
 
 	return 0;
 }
@@ -171,10 +170,10 @@
 	 */
 	if (!(io->flags & (MAP_16BIT | MAP_AUTOSZ))) {
 		csr |= AT91_SMC_DBW_8;
-		pr_debug("%s: 8bit i/o bus\n", driver_name);
+		dev_dbg(&cf->pdev->dev, "8bit i/o bus\n");
 	} else {
 		csr |= AT91_SMC_DBW_16;
-		pr_debug("%s: 16bit i/o bus\n", driver_name);
+		dev_dbg(&cf->pdev->dev, "16bit i/o bus\n");
 	}
 	at91_ramc_write(0, AT91_SMC_CSR(cf->board->chipselect), csr);
 
@@ -215,6 +214,37 @@
 
 /*--------------------------------------------------------------------------*/
 
+#if defined(CONFIG_OF)
+static const struct of_device_id at91_cf_dt_ids[] = {
+	{ .compatible = "atmel,at91rm9200-cf" },
+	{ /* sentinel */ }
+};
+MODULE_DEVICE_TABLE(of, at91_cf_dt_ids);
+
+static int at91_cf_dt_init(struct platform_device *pdev)
+{
+	struct at91_cf_data *board;
+
+	board = devm_kzalloc(&pdev->dev, sizeof(*board), GFP_KERNEL);
+	if (!board)
+		return -ENOMEM;
+
+	board->irq_pin = of_get_gpio(pdev->dev.of_node, 0);
+	board->det_pin = of_get_gpio(pdev->dev.of_node, 1);
+	board->vcc_pin = of_get_gpio(pdev->dev.of_node, 2);
+	board->rst_pin = of_get_gpio(pdev->dev.of_node, 3);
+
+	pdev->dev.platform_data = board;
+
+	return 0;
+}
+#else
+static int at91_cf_dt_init(struct platform_device *pdev)
+{
+	return -ENODEV;
+}
+#endif
+
 static int __init at91_cf_probe(struct platform_device *pdev)
 {
 	struct at91_cf_socket	*cf;
@@ -222,14 +252,22 @@
 	struct resource		*io;
 	int			status;
 
-	if (!board || !gpio_is_valid(board->det_pin) || !gpio_is_valid(board->rst_pin))
+	if (!board) {
+		status = at91_cf_dt_init(pdev);
+		if (status)
+			return status;
+
+		board = pdev->dev.platform_data;
+	}
+
+	if (!gpio_is_valid(board->det_pin) || !gpio_is_valid(board->rst_pin))
 		return -ENODEV;
 
 	io = platform_get_resource(pdev, IORESOURCE_MEM, 0);
 	if (!io)
 		return -ENODEV;
 
-	cf = kzalloc(sizeof *cf, GFP_KERNEL);
+	cf = devm_kzalloc(&pdev->dev, sizeof(*cf), GFP_KERNEL);
 	if (!cf)
 		return -ENOMEM;
 
@@ -239,22 +277,25 @@
 	platform_set_drvdata(pdev, cf);
 
 	/* must be a GPIO; ergo must trigger on both edges */
-	status = gpio_request(board->det_pin, "cf_det");
+	status = devm_gpio_request(&pdev->dev, board->det_pin, "cf_det");
 	if (status < 0)
-		goto fail0;
-	status = request_irq(gpio_to_irq(board->det_pin), at91_cf_irq, 0, driver_name, cf);
+		return status;
+
+	status = devm_request_irq(&pdev->dev, gpio_to_irq(board->det_pin),
+					at91_cf_irq, 0, "at91_cf detect", cf);
 	if (status < 0)
-		goto fail00;
+		return status;
+
 	device_init_wakeup(&pdev->dev, 1);
 
-	status = gpio_request(board->rst_pin, "cf_rst");
+	status = devm_gpio_request(&pdev->dev, board->rst_pin, "cf_rst");
 	if (status < 0)
 		goto fail0a;
 
 	if (gpio_is_valid(board->vcc_pin)) {
-		status = gpio_request(board->vcc_pin, "cf_vcc");
+		status = devm_gpio_request(&pdev->dev, board->vcc_pin, "cf_vcc");
 		if (status < 0)
-			goto fail0b;
+			goto fail0a;
 	}
 
 	/*
@@ -264,32 +305,33 @@
 	 * (Note:  DK board doesn't wire the IRQ pin...)
 	 */
 	if (gpio_is_valid(board->irq_pin)) {
-		status = gpio_request(board->irq_pin, "cf_irq");
+		status = devm_gpio_request(&pdev->dev, board->irq_pin, "cf_irq");
 		if (status < 0)
-			goto fail0c;
-		status = request_irq(gpio_to_irq(board->irq_pin), at91_cf_irq,
-				IRQF_SHARED, driver_name, cf);
+			goto fail0a;
+
+		status = devm_request_irq(&pdev->dev, gpio_to_irq(board->irq_pin),
+					at91_cf_irq, IRQF_SHARED, "at91_cf", cf);
 		if (status < 0)
-			goto fail0d;
+			goto fail0a;
 		cf->socket.pci_irq = gpio_to_irq(board->irq_pin);
 	} else
 		cf->socket.pci_irq = nr_irqs + 1;
 
 	/* pcmcia layer only remaps "real" memory not iospace */
-	cf->socket.io_offset = (unsigned long)
-			ioremap(cf->phys_baseaddr + CF_IO_PHYS, SZ_2K);
+	cf->socket.io_offset = (unsigned long) devm_ioremap(&pdev->dev,
+					cf->phys_baseaddr + CF_IO_PHYS, SZ_2K);
 	if (!cf->socket.io_offset) {
 		status = -ENXIO;
-		goto fail1;
+		goto fail0a;
 	}
 
 	/* reserve chip-select regions */
-	if (!request_mem_region(io->start, resource_size(io), driver_name)) {
+	if (!devm_request_mem_region(&pdev->dev, io->start, resource_size(io), "at91_cf")) {
 		status = -ENXIO;
-		goto fail1;
+		goto fail0a;
 	}
 
-	pr_info("%s: irqs det #%d, io #%d\n", driver_name,
+	dev_info(&pdev->dev, "irqs det #%d, io #%d\n",
 		gpio_to_irq(board->det_pin), gpio_to_irq(board->irq_pin));
 
 	cf->socket.owner = THIS_MODULE;
@@ -303,55 +345,22 @@
 
 	status = pcmcia_register_socket(&cf->socket);
 	if (status < 0)
-		goto fail2;
+		goto fail0a;
 
 	return 0;
 
-fail2:
-	release_mem_region(io->start, resource_size(io));
-fail1:
-	if (cf->socket.io_offset)
-		iounmap((void __iomem *) cf->socket.io_offset);
-	if (gpio_is_valid(board->irq_pin)) {
-		free_irq(gpio_to_irq(board->irq_pin), cf);
-fail0d:
-		gpio_free(board->irq_pin);
-	}
-fail0c:
-	if (gpio_is_valid(board->vcc_pin))
-		gpio_free(board->vcc_pin);
-fail0b:
-	gpio_free(board->rst_pin);
 fail0a:
 	device_init_wakeup(&pdev->dev, 0);
-	free_irq(gpio_to_irq(board->det_pin), cf);
-fail00:
-	gpio_free(board->det_pin);
-fail0:
-	kfree(cf);
 	return status;
 }
 
 static int __exit at91_cf_remove(struct platform_device *pdev)
 {
 	struct at91_cf_socket	*cf = platform_get_drvdata(pdev);
-	struct at91_cf_data	*board = cf->board;
-	struct resource		*io = cf->socket.io[0].res;
 
 	pcmcia_unregister_socket(&cf->socket);
-	release_mem_region(io->start, resource_size(io));
-	iounmap((void __iomem *) cf->socket.io_offset);
-	if (gpio_is_valid(board->irq_pin)) {
-		free_irq(gpio_to_irq(board->irq_pin), cf);
-		gpio_free(board->irq_pin);
-	}
-	if (gpio_is_valid(board->vcc_pin))
-		gpio_free(board->vcc_pin);
-	gpio_free(board->rst_pin);
 	device_init_wakeup(&pdev->dev, 0);
-	free_irq(gpio_to_irq(board->det_pin), cf);
-	gpio_free(board->det_pin);
-	kfree(cf);
+
 	return 0;
 }
 
@@ -391,8 +400,9 @@
 
 static struct platform_driver at91_cf_driver = {
 	.driver = {
-		.name		= (char *) driver_name,
+		.name		= "at91_cf",
 		.owner		= THIS_MODULE,
+		.of_match_table = of_match_ptr(at91_cf_dt_ids),
 	},
 	.remove		= __exit_p(at91_cf_remove),
 	.suspend	= at91_cf_suspend,
@@ -401,17 +411,7 @@
 
 /*--------------------------------------------------------------------------*/
 
-static int __init at91_cf_init(void)
-{
-	return platform_driver_probe(&at91_cf_driver, at91_cf_probe);
-}
-module_init(at91_cf_init);
-
-static void __exit at91_cf_exit(void)
-{
-	platform_driver_unregister(&at91_cf_driver);
-}
-module_exit(at91_cf_exit);
+module_platform_driver_probe(at91_cf_driver, at91_cf_probe);
 
 MODULE_DESCRIPTION("AT91 Compact Flash Driver");
 MODULE_AUTHOR("David Brownell");

diff --git a/drivers/pcmcia/pd6729.c b/drivers/pcmcia/pd6729.c
index b29d97e..a4c16ee 100644
--- a/drivers/pcmcia/pd6729.c
+++ b/drivers/pcmcia/pd6729.c

@@ -644,6 +644,7 @@
 	if (!pci_resource_start(dev, 0)) {
 		dev_warn(&dev->dev, "refusing to load the driver as the "
 			"io_base is NULL.\n");
+		ret = -ENOMEM;
 		goto err_out_disable;
 	}
 
@@ -673,6 +674,7 @@
 	mask = pd6729_isa_scan();
 	if (irq_mode == 0 && mask == 0) {
 		dev_warn(&dev->dev, "no ISA interrupt is available.\n");
+		ret = -ENODEV;
 		goto err_out_free_res;
 	}
 

diff --git a/drivers/regulator/tps6586x-regulator.c b/drivers/regulator/tps6586x-regulator.c
index d8fa37d..2c9155b 100644
--- a/drivers/regulator/tps6586x-regulator.c
+++ b/drivers/regulator/tps6586x-regulator.c

@@ -439,7 +439,7 @@
 
 static struct platform_driver tps6586x_regulator_driver = {
 	.driver	= {
-		.name	= "tps6586x-pmic",
+		.name	= "tps6586x-regulator",
 		.owner	= THIS_MODULE,
 	},
 	.probe		= tps6586x_regulator_probe,

diff --git a/drivers/scsi/fcoe/fcoe.c b/drivers/scsi/fcoe/fcoe.c
index 292b24f..32ae6c6 100644
--- a/drivers/scsi/fcoe/fcoe.c
+++ b/drivers/scsi/fcoe/fcoe.c

@@ -1656,9 +1656,12 @@
 
 	if (fcoe->netdev->priv_flags & IFF_802_1Q_VLAN &&
 	    fcoe->realdev->features & NETIF_F_HW_VLAN_CTAG_TX) {
-		skb->vlan_tci = VLAN_TAG_PRESENT |
-				vlan_dev_vlan_id(fcoe->netdev);
+		/* must set skb->dev before calling vlan_put_tag */
 		skb->dev = fcoe->realdev;
+		skb = __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q),
+					     vlan_dev_vlan_id(fcoe->netdev));
+		if (!skb)
+			return -ENOMEM;
 	} else
 		skb->dev = fcoe->netdev;
 

diff --git a/drivers/scsi/fcoe/fcoe_ctlr.c b/drivers/scsi/fcoe/fcoe_ctlr.c
index cd743c5..795843d 100644
--- a/drivers/scsi/fcoe/fcoe_ctlr.c
+++ b/drivers/scsi/fcoe/fcoe_ctlr.c

@@ -1548,9 +1548,6 @@
 {
 	struct fcoe_fcf *fcf;
 	struct fcoe_fcf *best = fip->sel_fcf;
-	struct fcoe_fcf *first;
-
-	first = list_first_entry(&fip->fcfs, struct fcoe_fcf, list);
 
 	list_for_each_entry(fcf, &fip->fcfs, list) {
 		LIBFCOE_FIP_DBG(fip, "consider FCF fab %16.16llx "
@@ -1568,17 +1565,15 @@
 					"" : "un");
 			continue;
 		}
-		if (fcf->fabric_name != first->fabric_name ||
-		    fcf->vfid != first->vfid ||
-		    fcf->fc_map != first->fc_map) {
+		if (!best || fcf->pri < best->pri || best->flogi_sent)
+			best = fcf;
+		if (fcf->fabric_name != best->fabric_name ||
+		    fcf->vfid != best->vfid ||
+		    fcf->fc_map != best->fc_map) {
 			LIBFCOE_FIP_DBG(fip, "Conflicting fabric, VFID, "
 					"or FC-MAP\n");
 			return NULL;
 		}
-		if (fcf->flogi_sent)
-			continue;
-		if (!best || fcf->pri < best->pri || best->flogi_sent)
-			best = fcf;
 	}
 	fip->sel_fcf = best;
 	if (best) {

diff --git a/drivers/scsi/ipr.c b/drivers/scsi/ipr.c
index 82a3c1e..6c4cedb 100644
--- a/drivers/scsi/ipr.c
+++ b/drivers/scsi/ipr.c

@@ -8980,19 +8980,6 @@
 	if (!ioa_cfg->res_entries)
 		goto out;
 
-	if (ioa_cfg->sis64) {
-		ioa_cfg->target_ids = kzalloc(sizeof(unsigned long) *
-					      BITS_TO_LONGS(ioa_cfg->max_devs_supported), GFP_KERNEL);
-		ioa_cfg->array_ids = kzalloc(sizeof(unsigned long) *
-					     BITS_TO_LONGS(ioa_cfg->max_devs_supported), GFP_KERNEL);
-		ioa_cfg->vset_ids = kzalloc(sizeof(unsigned long) *
-					    BITS_TO_LONGS(ioa_cfg->max_devs_supported), GFP_KERNEL);
-
-		if (!ioa_cfg->target_ids || !ioa_cfg->array_ids
-			|| !ioa_cfg->vset_ids)
-			goto out_free_res_entries;
-	}
-
 	for (i = 0; i < ioa_cfg->max_devs_supported; i++) {
 		list_add_tail(&ioa_cfg->res_entries[i].queue, &ioa_cfg->free_res_q);
 		ioa_cfg->res_entries[i].ioa_cfg = ioa_cfg;
@@ -9089,9 +9076,6 @@
 			    ioa_cfg->vpd_cbs, ioa_cfg->vpd_cbs_dma);
 out_free_res_entries:
 	kfree(ioa_cfg->res_entries);
-	kfree(ioa_cfg->target_ids);
-	kfree(ioa_cfg->array_ids);
-	kfree(ioa_cfg->vset_ids);
 	goto out;
 }
 

diff --git a/drivers/scsi/ipr.h b/drivers/scsi/ipr.h
index a1fb8405..07a85ce 100644
--- a/drivers/scsi/ipr.h
+++ b/drivers/scsi/ipr.h

@@ -1440,9 +1440,9 @@
 	/*
 	 * Bitmaps for SIS64 generated target values
 	 */
-	unsigned long *target_ids;
-	unsigned long *array_ids;
-	unsigned long *vset_ids;
+	unsigned long target_ids[BITS_TO_LONGS(IPR_MAX_SIS64_DEVS)];
+	unsigned long array_ids[BITS_TO_LONGS(IPR_MAX_SIS64_DEVS)];
+	unsigned long vset_ids[BITS_TO_LONGS(IPR_MAX_SIS64_DEVS)];
 
 	u16 type; /* CCIN of the card */
 

diff --git a/drivers/scsi/libfc/fc_exch.c b/drivers/scsi/libfc/fc_exch.c
index c772d8d..8b928c6 100644
--- a/drivers/scsi/libfc/fc_exch.c
+++ b/drivers/scsi/libfc/fc_exch.c

@@ -463,13 +463,7 @@
 	fc_exch_release(ep);	/* drop hold for exch in mp */
 }
 
-/**
- * fc_seq_send() - Send a frame using existing sequence/exchange pair
- * @lport: The local port that the exchange will be sent on
- * @sp:	   The sequence to be sent
- * @fp:	   The frame to be sent on the exchange
- */
-static int fc_seq_send(struct fc_lport *lport, struct fc_seq *sp,
+static int fc_seq_send_locked(struct fc_lport *lport, struct fc_seq *sp,
 		       struct fc_frame *fp)
 {
 	struct fc_exch *ep;
@@ -479,7 +473,7 @@
 	u8 fh_type = fh->fh_type;
 
 	ep = fc_seq_exch(sp);
-	WARN_ON((ep->esb_stat & ESB_ST_SEQ_INIT) != ESB_ST_SEQ_INIT);
+	WARN_ON(!(ep->esb_stat & ESB_ST_SEQ_INIT));
 
 	f_ctl = ntoh24(fh->fh_f_ctl);
 	fc_exch_setup_hdr(ep, fp, f_ctl);
@@ -502,17 +496,34 @@
 	error = lport->tt.frame_send(lport, fp);
 
 	if (fh_type == FC_TYPE_BLS)
-		return error;
+		goto out;
 
 	/*
 	 * Update the exchange and sequence flags,
 	 * assuming all frames for the sequence have been sent.
 	 * We can only be called to send once for each sequence.
 	 */
-	spin_lock_bh(&ep->ex_lock);
 	ep->f_ctl = f_ctl & ~FC_FC_FIRST_SEQ;	/* not first seq */
 	if (f_ctl & FC_FC_SEQ_INIT)
 		ep->esb_stat &= ~ESB_ST_SEQ_INIT;
+out:
+	return error;
+}
+
+/**
+ * fc_seq_send() - Send a frame using existing sequence/exchange pair
+ * @lport: The local port that the exchange will be sent on
+ * @sp:	   The sequence to be sent
+ * @fp:	   The frame to be sent on the exchange
+ */
+static int fc_seq_send(struct fc_lport *lport, struct fc_seq *sp,
+		       struct fc_frame *fp)
+{
+	struct fc_exch *ep;
+	int error;
+	ep = fc_seq_exch(sp);
+	spin_lock_bh(&ep->ex_lock);
+	error = fc_seq_send_locked(lport, sp, fp);
 	spin_unlock_bh(&ep->ex_lock);
 	return error;
 }
@@ -629,7 +640,7 @@
 	if (fp) {
 		fc_fill_fc_hdr(fp, FC_RCTL_BA_ABTS, ep->did, ep->sid,
 			       FC_TYPE_BLS, FC_FC_END_SEQ | FC_FC_SEQ_INIT, 0);
-		error = fc_seq_send(ep->lp, sp, fp);
+		error = fc_seq_send_locked(ep->lp, sp, fp);
 	} else
 		error = -ENOBUFS;
 	return error;
@@ -1132,7 +1143,7 @@
 	f_ctl = FC_FC_LAST_SEQ | FC_FC_END_SEQ | FC_FC_SEQ_INIT;
 	f_ctl |= ep->f_ctl;
 	fc_fill_fc_hdr(fp, rctl, ep->did, ep->sid, fh_type, f_ctl, 0);
-	fc_seq_send(ep->lp, sp, fp);
+	fc_seq_send_locked(ep->lp, sp, fp);
 }
 
 /**
@@ -1307,8 +1318,8 @@
 		ap->ba_low_seq_cnt = htons(sp->cnt);
 	}
 	sp = fc_seq_start_next_locked(sp);
-	spin_unlock_bh(&ep->ex_lock);
 	fc_seq_send_last(sp, fp, FC_RCTL_BA_ACC, FC_TYPE_BLS);
+	spin_unlock_bh(&ep->ex_lock);
 	fc_frame_free(rx_fp);
 	return;
 

diff --git a/drivers/scsi/libfc/fc_rport.c b/drivers/scsi/libfc/fc_rport.c
index d518d17..6bbb944 100644
--- a/drivers/scsi/libfc/fc_rport.c
+++ b/drivers/scsi/libfc/fc_rport.c

@@ -1962,7 +1962,7 @@
 		rdata->flags |= FC_RP_FLAGS_RETRY;
 	rdata->supported_classes = FC_COS_CLASS3;
 
-	if (!(lport->service_params & FC_RPORT_ROLE_FCP_INITIATOR))
+	if (!(lport->service_params & FCP_SPPF_INIT_FCN))
 		return 0;
 
 	spp->spp_flags |= rspp->spp_flags & FC_SPP_EST_IMG_PAIR;

diff --git a/drivers/scsi/qla2xxx/qla_inline.h b/drivers/scsi/qla2xxx/qla_inline.h
index 98ab921..0a5c895 100644
--- a/drivers/scsi/qla2xxx/qla_inline.h
+++ b/drivers/scsi/qla2xxx/qla_inline.h

@@ -278,3 +278,14 @@
 
 	set_bit(HOST_RAMP_UP_QUEUE_DEPTH, &vha->dpc_flags);
 }
+
+static inline void
+qla2x00_handle_mbx_completion(struct qla_hw_data *ha, int status)
+{
+	if (test_bit(MBX_INTR_WAIT, &ha->mbx_cmd_flags) &&
+	    (status & MBX_INTERRUPT) && ha->flags.mbox_int) {
+		set_bit(MBX_INTERRUPT, &ha->mbx_cmd_flags);
+		clear_bit(MBX_INTR_WAIT, &ha->mbx_cmd_flags);
+		complete(&ha->mbx_intr_comp);
+	}
+}

diff --git a/drivers/scsi/qla2xxx/qla_isr.c b/drivers/scsi/qla2xxx/qla_isr.c
index 259d920..d2a4c75 100644
--- a/drivers/scsi/qla2xxx/qla_isr.c
+++ b/drivers/scsi/qla2xxx/qla_isr.c

@@ -104,14 +104,9 @@
 			RD_REG_WORD(&reg->hccr);
 		}
 	}
+	qla2x00_handle_mbx_completion(ha, status);
 	spin_unlock_irqrestore(&ha->hardware_lock, flags);
 
-	if (test_bit(MBX_INTR_WAIT, &ha->mbx_cmd_flags) &&
-	    (status & MBX_INTERRUPT) && ha->flags.mbox_int) {
-		set_bit(MBX_INTERRUPT, &ha->mbx_cmd_flags);
-		complete(&ha->mbx_intr_comp);
-	}
-
 	return (IRQ_HANDLED);
 }
 
@@ -221,14 +216,9 @@
 		WRT_REG_WORD(&reg->hccr, HCCR_CLR_RISC_INT);
 		RD_REG_WORD_RELAXED(&reg->hccr);
 	}
+	qla2x00_handle_mbx_completion(ha, status);
 	spin_unlock_irqrestore(&ha->hardware_lock, flags);
 
-	if (test_bit(MBX_INTR_WAIT, &ha->mbx_cmd_flags) &&
-	    (status & MBX_INTERRUPT) && ha->flags.mbox_int) {
-		set_bit(MBX_INTERRUPT, &ha->mbx_cmd_flags);
-		complete(&ha->mbx_intr_comp);
-	}
-
 	return (IRQ_HANDLED);
 }
 
@@ -2613,14 +2603,9 @@
 		if (unlikely(IS_QLA83XX(ha) && (ha->pdev->revision == 1)))
 			ndelay(3500);
 	}
+	qla2x00_handle_mbx_completion(ha, status);
 	spin_unlock_irqrestore(&ha->hardware_lock, flags);
 
-	if (test_bit(MBX_INTR_WAIT, &ha->mbx_cmd_flags) &&
-	    (status & MBX_INTERRUPT) && ha->flags.mbox_int) {
-		set_bit(MBX_INTERRUPT, &ha->mbx_cmd_flags);
-		complete(&ha->mbx_intr_comp);
-	}
-
 	return IRQ_HANDLED;
 }
 
@@ -2763,13 +2748,9 @@
 		}
 		WRT_REG_DWORD(&reg->hccr, HCCRX_CLR_RISC_INT);
 	} while (0);
+	qla2x00_handle_mbx_completion(ha, status);
 	spin_unlock_irqrestore(&ha->hardware_lock, flags);
 
-	if (test_bit(MBX_INTR_WAIT, &ha->mbx_cmd_flags) &&
-	    (status & MBX_INTERRUPT) && ha->flags.mbox_int) {
-		set_bit(MBX_INTERRUPT, &ha->mbx_cmd_flags);
-		complete(&ha->mbx_intr_comp);
-	}
 	return IRQ_HANDLED;
 }
 

diff --git a/drivers/scsi/qla2xxx/qla_mbx.c b/drivers/scsi/qla2xxx/qla_mbx.c
index 9e5d89d..3587ec2 100644
--- a/drivers/scsi/qla2xxx/qla_mbx.c
+++ b/drivers/scsi/qla2xxx/qla_mbx.c

@@ -179,8 +179,6 @@
 
 		wait_for_completion_timeout(&ha->mbx_intr_comp, mcp->tov * HZ);
 
-		clear_bit(MBX_INTR_WAIT, &ha->mbx_cmd_flags);
-
 	} else {
 		ql_dbg(ql_dbg_mbx, vha, 0x1011,
 		    "Cmd=%x Polling Mode.\n", command);

diff --git a/drivers/scsi/qla2xxx/qla_mr.c b/drivers/scsi/qla2xxx/qla_mr.c
index 937fed8..a6df558 100644
--- a/drivers/scsi/qla2xxx/qla_mr.c
+++ b/drivers/scsi/qla2xxx/qla_mr.c

@@ -148,9 +148,6 @@
 		spin_unlock_irqrestore(&ha->hardware_lock, flags);
 
 		wait_for_completion_timeout(&ha->mbx_intr_comp, mcp->tov * HZ);
-
-		clear_bit(MBX_INTR_WAIT, &ha->mbx_cmd_flags);
-
 	} else {
 		ql_dbg(ql_dbg_mbx, vha, 0x112c,
 		    "Cmd=%x Polling Mode.\n", command);
@@ -2934,13 +2931,10 @@
 		QLAFX00_CLR_INTR_REG(ha, clr_intr);
 		QLAFX00_RD_INTR_REG(ha);
 	}
+
+	qla2x00_handle_mbx_completion(ha, status);
 	spin_unlock_irqrestore(&ha->hardware_lock, flags);
 
-	if (test_bit(MBX_INTR_WAIT, &ha->mbx_cmd_flags) &&
-	    (status & MBX_INTERRUPT) && ha->flags.mbox_int) {
-		set_bit(MBX_INTERRUPT, &ha->mbx_cmd_flags);
-		complete(&ha->mbx_intr_comp);
-	}
 	return IRQ_HANDLED;
 }
 

diff --git a/drivers/scsi/qla2xxx/qla_nx.c b/drivers/scsi/qla2xxx/qla_nx.c
index 10754f51..cce0cd0 100644
--- a/drivers/scsi/qla2xxx/qla_nx.c
+++ b/drivers/scsi/qla2xxx/qla_nx.c

@@ -2074,9 +2074,6 @@
 		}
 		WRT_REG_DWORD(&reg->host_int, 0);
 	}
-	spin_unlock_irqrestore(&ha->hardware_lock, flags);
-	if (!ha->flags.msi_enabled)
-		qla82xx_wr_32(ha, ha->nx_legacy_intr.tgt_mask_reg, 0xfbff);
 
 #ifdef QL_DEBUG_LEVEL_17
 	if (!irq && ha->flags.eeh_busy)
@@ -2085,11 +2082,12 @@
 		    status, ha->mbx_cmd_flags, ha->flags.mbox_int, stat);
 #endif
 
-	if (test_bit(MBX_INTR_WAIT, &ha->mbx_cmd_flags) &&
-	    (status & MBX_INTERRUPT) && ha->flags.mbox_int) {
-		set_bit(MBX_INTERRUPT, &ha->mbx_cmd_flags);
-		complete(&ha->mbx_intr_comp);
-	}
+	qla2x00_handle_mbx_completion(ha, status);
+	spin_unlock_irqrestore(&ha->hardware_lock, flags);
+
+	if (!ha->flags.msi_enabled)
+		qla82xx_wr_32(ha, ha->nx_legacy_intr.tgt_mask_reg, 0xfbff);
+
 	return IRQ_HANDLED;
 }
 
@@ -2149,8 +2147,6 @@
 		WRT_REG_DWORD(&reg->host_int, 0);
 	} while (0);
 
-	spin_unlock_irqrestore(&ha->hardware_lock, flags);
-
 #ifdef QL_DEBUG_LEVEL_17
 	if (!irq && ha->flags.eeh_busy)
 		ql_log(ql_log_warn, vha, 0x5044,
@@ -2158,11 +2154,9 @@
 		    status, ha->mbx_cmd_flags, ha->flags.mbox_int, stat);
 #endif
 
-	if (test_bit(MBX_INTR_WAIT, &ha->mbx_cmd_flags) &&
-		(status & MBX_INTERRUPT) && ha->flags.mbox_int) {
-			set_bit(MBX_INTERRUPT, &ha->mbx_cmd_flags);
-			complete(&ha->mbx_intr_comp);
-	}
+	qla2x00_handle_mbx_completion(ha, status);
+	spin_unlock_irqrestore(&ha->hardware_lock, flags);
+
 	return IRQ_HANDLED;
 }
 
@@ -3345,7 +3339,7 @@
 		ha->flags.mbox_busy = 0;
 		ql_log(ql_log_warn, vha, 0x6010,
 		    "Doing premature completion of mbx command.\n");
-		if (test_bit(MBX_INTR_WAIT, &ha->mbx_cmd_flags))
+		if (test_and_clear_bit(MBX_INTR_WAIT, &ha->mbx_cmd_flags))
 			complete(&ha->mbx_intr_comp);
 	}
 }

diff --git a/drivers/spi/spi-pxa2xx-dma.c b/drivers/spi/spi-pxa2xx-dma.c
index c735c5a..6427600 100644
--- a/drivers/spi/spi-pxa2xx-dma.c
+++ b/drivers/spi/spi-pxa2xx-dma.c

@@ -59,7 +59,7 @@
 		int ret;
 
 		sg_free_table(sgt);
-		ret = sg_alloc_table(sgt, nents, GFP_KERNEL);
+		ret = sg_alloc_table(sgt, nents, GFP_ATOMIC);
 		if (ret)
 			return ret;
 	}

diff --git a/drivers/spi/spi-pxa2xx.c b/drivers/spi/spi-pxa2xx.c
index f5d84d6..48b396f 100644
--- a/drivers/spi/spi-pxa2xx.c
+++ b/drivers/spi/spi-pxa2xx.c

@@ -1075,7 +1075,7 @@
 	    acpi_bus_get_device(ACPI_HANDLE(&pdev->dev), &adev))
 		return NULL;
 
-	pdata = devm_kzalloc(&pdev->dev, sizeof(*ssp), GFP_KERNEL);
+	pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL);
 	if (!pdata) {
 		dev_err(&pdev->dev,
 			"failed to allocate memory for platform data\n");

diff --git a/drivers/spi/spi-s3c64xx.c b/drivers/spi/spi-s3c64xx.c
index 5000586..71cc3e6 100644
--- a/drivers/spi/spi-s3c64xx.c
+++ b/drivers/spi/spi-s3c64xx.c

@@ -444,7 +444,7 @@
 	}
 
 	ret = pm_runtime_get_sync(&sdd->pdev->dev);
-	if (ret != 0) {
+	if (ret < 0) {
 		dev_err(dev, "Failed to enable device: %d\n", ret);
 		goto out_tx;
 	}

diff --git a/drivers/staging/Kconfig b/drivers/staging/Kconfig
index aefe820..f64b662 100644
--- a/drivers/staging/Kconfig
+++ b/drivers/staging/Kconfig

@@ -62,6 +62,8 @@
 
 source "drivers/staging/octeon/Kconfig"
 
+source "drivers/staging/octeon-usb/Kconfig"
+
 source "drivers/staging/serqt_usb2/Kconfig"
 
 source "drivers/staging/vt6655/Kconfig"
@@ -140,4 +142,8 @@
 
 source "drivers/staging/dwc2/Kconfig"
 
+source "drivers/staging/lustre/Kconfig"
+
+source "drivers/staging/btmtk_usb/Kconfig"
+
 endif # STAGING

diff --git a/drivers/staging/Makefile b/drivers/staging/Makefile
index 415772e..1fb58a1 100644
--- a/drivers/staging/Makefile
+++ b/drivers/staging/Makefile

@@ -25,6 +25,7 @@
 obj-$(CONFIG_NETLOGIC_XLR_NET)	+= netlogic/
 obj-$(CONFIG_USB_SERIAL_QUATECH2)	+= serqt_usb2/
 obj-$(CONFIG_OCTEON_ETHERNET)	+= octeon/
+obj-$(CONFIG_OCTEON_USB)	+= octeon-usb/
 obj-$(CONFIG_VT6655)		+= vt6655/
 obj-$(CONFIG_VT6656)		+= vt6656/
 obj-$(CONFIG_VME_BUS)		+= vme/
@@ -62,3 +63,5 @@
 obj-$(CONFIG_ZCACHE)		+= zcache/
 obj-$(CONFIG_GOLDFISH)		+= goldfish/
 obj-$(CONFIG_USB_DWC2)		+= dwc2/
+obj-$(CONFIG_LUSTRE_FS)		+= lustre/
+obj-$(CONFIG_USB_BTMTK)		+= btmtk_usb/

diff --git a/drivers/staging/android/ashmem.c b/drivers/staging/android/ashmem.c
index e681bdd..21a3f72 100644
--- a/drivers/staging/android/ashmem.c
+++ b/drivers/staging/android/ashmem.c

@@ -704,7 +704,8 @@
 
 /* support of 32bit userspace on 64bit platforms */
 #ifdef CONFIG_COMPAT
-static long compat_ashmem_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+static long compat_ashmem_ioctl(struct file *file, unsigned int cmd,
+				unsigned long arg)
 {
 
 	switch (cmd) {

diff --git a/drivers/staging/android/binder.c b/drivers/staging/android/binder.c
index 1567ac2..c6dc184 100644
--- a/drivers/staging/android/binder.c
+++ b/drivers/staging/android/binder.c

@@ -790,7 +790,7 @@
 	list_del(&buffer->entry);
 	if (free_page_start || free_page_end) {
 		binder_debug(BINDER_DEBUG_BUFFER_ALLOC,
-			     "%d: merge free, buffer %p do not share page%s%s with with %p or %p\n",
+			     "%d: merge free, buffer %p do not share page%s%s with %p or %p\n",
 			     proc->pid, buffer, free_page_start ? "" : " end",
 			     free_page_end ? "" : " start", prev, next);
 		binder_update_page_range(proc, 0, free_page_start ?

diff --git a/drivers/staging/android/sw_sync.c b/drivers/staging/android/sw_sync.c
index 4928f93..765c757 100644
--- a/drivers/staging/android/sw_sync.c
+++ b/drivers/staging/android/sw_sync.c

@@ -160,7 +160,8 @@
 	return 0;
 }
 
-static long sw_sync_ioctl_create_fence(struct sw_sync_timeline *obj, unsigned long arg)
+static long sw_sync_ioctl_create_fence(struct sw_sync_timeline *obj,
+				       unsigned long arg)
 {
 	int fd = get_unused_fd();
 	int err;
@@ -218,7 +219,8 @@
 	return 0;
 }
 
-static long sw_sync_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+static long sw_sync_ioctl(struct file *file, unsigned int cmd,
+			  unsigned long arg)
 {
 	struct sw_sync_timeline *obj = file->private_data;
 

diff --git a/drivers/staging/android/sync.c b/drivers/staging/android/sync.c
index 3893a35..2996077 100644
--- a/drivers/staging/android/sync.c
+++ b/drivers/staging/android/sync.c

@@ -125,9 +125,9 @@
 	spin_unlock_irqrestore(&obj->active_list_lock, flags);
 
 	spin_lock_irqsave(&obj->child_list_lock, flags);
-	if (!list_empty(&pt->child_list)) {
+	if (!list_empty(&pt->child_list))
 		list_del_init(&pt->child_list);
-	}
+
 	spin_unlock_irqrestore(&obj->child_list_lock, flags);
 }
 
@@ -876,11 +876,11 @@
 			seq_printf(s, " / %s", value);
 		}
 	} else if (pt->parent->ops->print_pt) {
-		seq_printf(s, ": ");
+		seq_puts(s, ": ");
 		pt->parent->ops->print_pt(s, pt);
 	}
 
-	seq_printf(s, "\n");
+	seq_puts(s, "\n");
 }
 
 static void sync_print_obj(struct seq_file *s, struct sync_timeline *obj)
@@ -895,11 +895,11 @@
 		obj->ops->timeline_value_str(obj, value, sizeof(value));
 		seq_printf(s, ": %s", value);
 	} else if (obj->ops->print_obj) {
-		seq_printf(s, ": ");
+		seq_puts(s, ": ");
 		obj->ops->print_obj(s, obj);
 	}
 
-	seq_printf(s, "\n");
+	seq_puts(s, "\n");
 
 	spin_lock_irqsave(&obj->child_list_lock, flags);
 	list_for_each(pos, &obj->child_list_head) {
@@ -940,7 +940,7 @@
 	unsigned long flags;
 	struct list_head *pos;
 
-	seq_printf(s, "objs:\n--------------\n");
+	seq_puts(s, "objs:\n--------------\n");
 
 	spin_lock_irqsave(&sync_timeline_list_lock, flags);
 	list_for_each(pos, &sync_timeline_list_head) {
@@ -949,11 +949,11 @@
 				     sync_timeline_list);
 
 		sync_print_obj(s, obj);
-		seq_printf(s, "\n");
+		seq_puts(s, "\n");
 	}
 	spin_unlock_irqrestore(&sync_timeline_list_lock, flags);
 
-	seq_printf(s, "fences:\n--------------\n");
+	seq_puts(s, "fences:\n--------------\n");
 
 	spin_lock_irqsave(&sync_fence_list_lock, flags);
 	list_for_each(pos, &sync_fence_list_head) {
@@ -961,7 +961,7 @@
 			container_of(pos, struct sync_fence, sync_fence_list);
 
 		sync_print_fence(s, fence);
-		seq_printf(s, "\n");
+		seq_puts(s, "\n");
 	}
 	spin_unlock_irqrestore(&sync_fence_list_lock, flags);
 	return 0;
@@ -988,7 +988,7 @@
 
 #define DUMP_CHUNK 256
 static char sync_dump_buf[64 * 1024];
-void sync_dump(void)
+static void sync_dump(void)
 {
 	struct seq_file s = {
 		.buf = sync_dump_buf,

diff --git a/drivers/staging/asus_oled/asus_oled.c b/drivers/staging/asus_oled/asus_oled.c
index d0a5a28..3654dc3 100644
--- a/drivers/staging/asus_oled/asus_oled.c
+++ b/drivers/staging/asus_oled/asus_oled.c

@@ -50,9 +50,9 @@
 #define ASUS_OLED_DISP_HEIGHT		32
 #define ASUS_OLED_PACKET_BUF_SIZE	256
 
-#define USB_VENDOR_ID_ASUS      0x0b05
-#define USB_DEVICE_ID_ASUS_LCM      0x1726
-#define USB_DEVICE_ID_ASUS_LCM2     0x175b
+#define USB_VENDOR_ID_ASUS		0x0b05
+#define USB_DEVICE_ID_ASUS_LCM		0x1726
+#define USB_DEVICE_ID_ASUS_LCM2		0x175b
 
 MODULE_AUTHOR("Jakub Schmidtke, sjakub@gmail.com");
 MODULE_DESCRIPTION("Asus OLED Driver");
@@ -324,9 +324,11 @@
 		return;
 
 	if (odev->pack_mode == PACK_MODE_G1) {
-		/* When sending roll-mode data the display updated only
-		   first packet.  I have no idea why, but when static picture
-		   is sent just before rolling picture everything works fine. */
+		/*
+		 * When sending roll-mode data the display updated only
+		 * first packet.  I have no idea why, but when static picture
+		 * is sent just before rolling picture everything works fine.
+		 */
 		if (odev->pic_mode == ASUS_OLED_ROLL)
 			send_packets(odev->udev, packet, odev->buf,
 				     ASUS_OLED_STATIC, 2);
@@ -363,9 +365,11 @@
 
 		switch (odev->pack_mode) {
 		case PACK_MODE_G1:
-			/* i = (x/128)*640 + 127 - x + (y/8)*128;
-			   This one for 128 is the same, but might be better
-			   for different widths? */
+			/*
+			 * i = (x/128)*640 + 127 - x + (y/8)*128;
+			 * This one for 128 is the same, but might be better
+			 * for different widths?
+			 */
 			i = (x/odev->dev_width)*640 +
 				odev->dev_width - 1 - x +
 				(y/8)*odev->dev_width;
@@ -383,10 +387,8 @@
 		}
 
 		if (i >= odev->buf_size) {
-			dev_err(odev->dev, "Buffer overflow! Report a bug:"
-			       "offs: %d >= %d i: %d (x: %d y: %d)\n",
-			       (int) odev->buf_offs, (int) odev->buf_size,
-			       (int) i, (int) x, (int) y);
+			dev_err(odev->dev, "Buffer overflow! Report a bug: offs: %zu >= %zu i: %zu (x: %zu y: %zu)\n",
+			       odev->buf_offs, odev->buf_size, i, x, y);
 			return -EIO;
 		}
 
@@ -401,7 +403,7 @@
 
 		default:
 			/* cannot get here; stops gcc complaining*/
-			;
+			break;
 		}
 
 		odev->buf_offs++;
@@ -566,9 +568,11 @@
 			if (ret < 0)
 				return ret;
 		} else if (buf[offs] == '\n') {
-			/* New line detected. Lets assume, that all characters
-			   till the end of the line were equal to the last
-			   character in this line.*/
+			/*
+			 * New line detected. Lets assume, that all characters
+			 * till the end of the line were equal to the last
+			 * character in this line.
+			 */
 			if (odev->buf_offs % odev->width != 0)
 				ret = append_values(odev, odev->last_val,
 						    odev->width -

diff --git a/drivers/staging/bcm/Bcmchar.c b/drivers/staging/bcm/Bcmchar.c
index 35641e5..f67a225 100644
--- a/drivers/staging/bcm/Bcmchar.c
+++ b/drivers/staging/bcm/Bcmchar.c

@@ -13,7 +13,7 @@
 * Returns	  - Zero(Success)
 ****************************************************************/
 
-static int bcm_char_open(struct inode *inode, struct file * filp)
+static int bcm_char_open(struct inode *inode, struct file *filp)
 {
 	struct bcm_mini_adapter *Adapter = NULL;
 	struct bcm_tarang_data *pTarang = NULL;

diff --git a/drivers/staging/bcm/InterfaceIdleMode.c b/drivers/staging/bcm/InterfaceIdleMode.c
index a1bf215..5347828 100644
--- a/drivers/staging/bcm/InterfaceIdleMode.c
+++ b/drivers/staging/bcm/InterfaceIdleMode.c

@@ -42,107 +42,95 @@
 */
 
 
-int InterfaceIdleModeRespond(struct bcm_mini_adapter *Adapter, unsigned int* puiBuffer)
+int InterfaceIdleModeRespond(struct bcm_mini_adapter *Adapter, unsigned int *puiBuffer)
 {
 	int	status = STATUS_SUCCESS;
 	unsigned int	uiRegRead = 0;
 	int bytes;
 
-	BCM_DEBUG_PRINT(Adapter,DBG_TYPE_OTHERS, IDLE_MODE, DBG_LVL_ALL,"SubType of Message :0x%X", ntohl(*puiBuffer));
+	BCM_DEBUG_PRINT(Adapter, DBG_TYPE_OTHERS, IDLE_MODE, DBG_LVL_ALL, "SubType of Message :0x%X", ntohl(*puiBuffer));
 
-	if(ntohl(*puiBuffer) == GO_TO_IDLE_MODE_PAYLOAD)
-	{
-		BCM_DEBUG_PRINT(Adapter,DBG_TYPE_OTHERS, IDLE_MODE, DBG_LVL_ALL," Got GO_TO_IDLE_MODE_PAYLOAD(210) Msg Subtype");
-		if(ntohl(*(puiBuffer+1)) == 0 )
-		{
-			BCM_DEBUG_PRINT(Adapter,DBG_TYPE_OTHERS, IDLE_MODE, DBG_LVL_ALL,"Got IDLE MODE WAKE UP Response From F/W");
+	if (ntohl(*puiBuffer) == GO_TO_IDLE_MODE_PAYLOAD) {
+		BCM_DEBUG_PRINT(Adapter, DBG_TYPE_OTHERS, IDLE_MODE, DBG_LVL_ALL, " Got GO_TO_IDLE_MODE_PAYLOAD(210) Msg Subtype");
+		if (ntohl(*(puiBuffer+1)) == 0 ) {
+			BCM_DEBUG_PRINT(Adapter, DBG_TYPE_OTHERS, IDLE_MODE, DBG_LVL_ALL, "Got IDLE MODE WAKE UP Response From F/W");
 
-			status = wrmalt (Adapter,SW_ABORT_IDLEMODE_LOC, &uiRegRead, sizeof(uiRegRead));
-			if(status)
-			{
-				BCM_DEBUG_PRINT(Adapter,DBG_TYPE_PRINTK, 0, 0, "wrm failed while clearing Idle Mode Reg");
+			status = wrmalt (Adapter, SW_ABORT_IDLEMODE_LOC, &uiRegRead, sizeof(uiRegRead));
+			if (status) {
+				BCM_DEBUG_PRINT(Adapter, DBG_TYPE_PRINTK, 0, 0, "wrm failed while clearing Idle Mode Reg");
 				return status;
 			}
 
-			if(Adapter->ulPowerSaveMode == DEVICE_POWERSAVE_MODE_AS_MANUAL_CLOCK_GATING)
-			{
+			if (Adapter->ulPowerSaveMode == DEVICE_POWERSAVE_MODE_AS_MANUAL_CLOCK_GATING) {
 				uiRegRead = 0x00000000 ;
-				status = wrmalt (Adapter,DEBUG_INTERRUPT_GENERATOR_REGISTOR, &uiRegRead, sizeof(uiRegRead));
-				if(status)
-				{
-					BCM_DEBUG_PRINT(Adapter,DBG_TYPE_PRINTK, 0, 0, "wrm failed while clearing Idle Mode	Reg");
+				status = wrmalt (Adapter, DEBUG_INTERRUPT_GENERATOR_REGISTOR, &uiRegRead, sizeof(uiRegRead));
+				if (status) {
+					BCM_DEBUG_PRINT(Adapter, DBG_TYPE_PRINTK, 0, 0, "wrm failed while clearing Idle Mode	Reg");
 					return status;
 				}
 			}
-			//Below Register should not br read in case of Manual and Protocol Idle mode.
-			else if(Adapter->ulPowerSaveMode != DEVICE_POWERSAVE_MODE_AS_PROTOCOL_IDLE_MODE)
-			{
-				//clear on read Register
+			/* Below Register should not be read in case of Manual and Protocol Idle mode */
+			else if (Adapter->ulPowerSaveMode != DEVICE_POWERSAVE_MODE_AS_PROTOCOL_IDLE_MODE) {
+				/* clear on read Register */
 				bytes = rdmalt(Adapter, DEVICE_INT_OUT_EP_REG0, &uiRegRead, sizeof(uiRegRead));
 				if (bytes < 0) {
 					status = bytes;
-					BCM_DEBUG_PRINT(Adapter,DBG_TYPE_PRINTK, 0, 0, "rdm failed while clearing H/W Abort Reg0");
+					BCM_DEBUG_PRINT(Adapter, DBG_TYPE_PRINTK, 0, 0, "rdm failed while clearing H/W Abort Reg0");
 					return status;
 				}
-				//clear on read Register
+				/* clear on read Register */
 				bytes = rdmalt(Adapter, DEVICE_INT_OUT_EP_REG1, &uiRegRead, sizeof(uiRegRead));
 				if (bytes < 0) {
 					status = bytes;
-					BCM_DEBUG_PRINT(Adapter,DBG_TYPE_PRINTK, 0, 0, "rdm failed while clearing H/W Abort	Reg1");
+					BCM_DEBUG_PRINT(Adapter, DBG_TYPE_PRINTK, 0, 0, "rdm failed while clearing H/W Abort	Reg1");
 					return status;
 				}
 			}
-			BCM_DEBUG_PRINT(Adapter,DBG_TYPE_OTHERS, IDLE_MODE, DBG_LVL_ALL, "Device Up from Idle Mode");
+			BCM_DEBUG_PRINT(Adapter, DBG_TYPE_OTHERS, IDLE_MODE, DBG_LVL_ALL, "Device Up from Idle Mode");
 
-			// Set Idle Mode Flag to False and Clear IdleMode reg.
+			/* Set Idle Mode Flag to False and Clear IdleMode reg. */
 			Adapter->IdleMode = FALSE;
 			Adapter->bTriedToWakeUpFromlowPowerMode = FALSE;
 
 			wake_up(&Adapter->lowpower_mode_wait_queue);
 
-		}
-		else
-		{
-			if(TRUE == Adapter->IdleMode)
+		} else {
+			if (TRUE == Adapter->IdleMode)
 			{
-				BCM_DEBUG_PRINT(Adapter,DBG_TYPE_OTHERS, IDLE_MODE, DBG_LVL_ALL,"Device is already in Idle mode....");
+				BCM_DEBUG_PRINT(Adapter, DBG_TYPE_OTHERS, IDLE_MODE, DBG_LVL_ALL, "Device is already in Idle mode....");
 				return status ;
 			}
 
 			uiRegRead = 0;
-			BCM_DEBUG_PRINT(Adapter,DBG_TYPE_OTHERS, IDLE_MODE, DBG_LVL_ALL, "Got Req from F/W to go in IDLE mode \n");
+			BCM_DEBUG_PRINT(Adapter, DBG_TYPE_OTHERS, IDLE_MODE, DBG_LVL_ALL, "Got Req from F/W to go in IDLE mode \n");
 
-			if (Adapter->chip_id== BCS220_2 ||
+			if (Adapter->chip_id == BCS220_2 ||
 				Adapter->chip_id == BCS220_2BC ||
-					Adapter->chip_id== BCS250_BC ||
-					Adapter->chip_id== BCS220_3)
-			{
+					Adapter->chip_id == BCS250_BC ||
+					Adapter->chip_id == BCS220_3) {
 
 				bytes = rdmalt(Adapter, HPM_CONFIG_MSW, &uiRegRead, sizeof(uiRegRead));
 				if (bytes < 0) {
 					status = bytes;
-					BCM_DEBUG_PRINT(Adapter,DBG_TYPE_OTHERS, IDLE_MODE, DBG_LVL_ALL, "rdm failed while Reading HPM_CONFIG_LDO145 Reg 0\n");
+					BCM_DEBUG_PRINT(Adapter, DBG_TYPE_OTHERS, IDLE_MODE, DBG_LVL_ALL, "rdm failed while Reading HPM_CONFIG_LDO145 Reg 0\n");
 					return status;
 				}
 
 
 				uiRegRead |= (1<<17);
 
-				status = wrmalt (Adapter,HPM_CONFIG_MSW, &uiRegRead, sizeof(uiRegRead));
-				if(status)
-				{
-					BCM_DEBUG_PRINT(Adapter,DBG_TYPE_PRINTK, 0, 0, "wrm failed while clearing Idle Mode Reg\n");
+				status = wrmalt (Adapter, HPM_CONFIG_MSW, &uiRegRead, sizeof(uiRegRead));
+				if (status) {
+					BCM_DEBUG_PRINT(Adapter, DBG_TYPE_PRINTK, 0, 0, "wrm failed while clearing Idle Mode Reg\n");
 					return status;
 				}
 
 			}
 			SendIdleModeResponse(Adapter);
 		}
-	}
-	else if(ntohl(*puiBuffer) == IDLE_MODE_SF_UPDATE_MSG)
-	{
-		BCM_DEBUG_PRINT(Adapter,DBG_TYPE_OTHERS, IDLE_MODE, DBG_LVL_ALL, "OverRiding Service Flow Params");
-		OverrideServiceFlowParams(Adapter,puiBuffer);
+	} else if (ntohl(*puiBuffer) == IDLE_MODE_SF_UPDATE_MSG) {
+		BCM_DEBUG_PRINT(Adapter, DBG_TYPE_OTHERS, IDLE_MODE, DBG_LVL_ALL, "OverRiding Service Flow Params");
+		OverrideServiceFlowParams(Adapter, puiBuffer);
 	}
 	return status;
 }
@@ -152,46 +140,40 @@
 	int 	status = STATUS_SUCCESS;
 	unsigned int value;
 	unsigned int chip_id ;
-	unsigned long timeout = 0 ,itr = 0;
+	unsigned long timeout = 0, itr = 0;
 
 	int 	lenwritten = 0;
-	unsigned char aucAbortPattern[8]={0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF};
+	unsigned char aucAbortPattern[8] = {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF};
 	struct bcm_interface_adapter *psInterfaceAdapter = Adapter->pvInterfaceAdapter;
 
-	//Abort Bus suspend if its already suspended
-	if((TRUE == psInterfaceAdapter->bSuspended) && (TRUE == Adapter->bDoSuspend))
-	{
+	/* Abort Bus suspend if its already suspended */
+	if ((TRUE == psInterfaceAdapter->bSuspended) && (TRUE == Adapter->bDoSuspend)) {
 		status = usb_autopm_get_interface(psInterfaceAdapter->interface);
-		BCM_DEBUG_PRINT(Adapter,DBG_TYPE_OTHERS, IDLE_MODE, DBG_LVL_ALL,"Bus got wakeup..Aborting Idle mode... status:%d \n",status);
+		BCM_DEBUG_PRINT(Adapter, DBG_TYPE_OTHERS, IDLE_MODE, DBG_LVL_ALL, "Bus got wakeup..Aborting Idle mode... status:%d \n", status);
 
 	}
 
-	if((Adapter->ulPowerSaveMode == DEVICE_POWERSAVE_MODE_AS_MANUAL_CLOCK_GATING)
+	if ((Adapter->ulPowerSaveMode == DEVICE_POWERSAVE_MODE_AS_MANUAL_CLOCK_GATING)
 									||
-	   (Adapter->ulPowerSaveMode == DEVICE_POWERSAVE_MODE_AS_PROTOCOL_IDLE_MODE))
-	{
-		//write the SW abort pattern.
-		BCM_DEBUG_PRINT(Adapter,DBG_TYPE_OTHERS, IDLE_MODE, DBG_LVL_ALL, "Writing pattern<%d> to SW_ABORT_IDLEMODE_LOC\n", Pattern);
-		status = wrmalt(Adapter,SW_ABORT_IDLEMODE_LOC, &Pattern, sizeof(Pattern));
-		if(status)
-		{
-				BCM_DEBUG_PRINT(Adapter,DBG_TYPE_OTHERS, IDLE_MODE, DBG_LVL_ALL,"WRM to Register SW_ABORT_IDLEMODE_LOC failed..");
+	   (Adapter->ulPowerSaveMode == DEVICE_POWERSAVE_MODE_AS_PROTOCOL_IDLE_MODE)) {
+		/* write the SW abort pattern. */
+		BCM_DEBUG_PRINT(Adapter, DBG_TYPE_OTHERS, IDLE_MODE, DBG_LVL_ALL, "Writing pattern<%d> to SW_ABORT_IDLEMODE_LOC\n", Pattern);
+		status = wrmalt(Adapter, SW_ABORT_IDLEMODE_LOC, &Pattern, sizeof(Pattern));
+		if (status) {
+				BCM_DEBUG_PRINT(Adapter, DBG_TYPE_OTHERS, IDLE_MODE, DBG_LVL_ALL, "WRM to Register SW_ABORT_IDLEMODE_LOC failed..");
 				return status;
 		}
 	}
 
-	if(Adapter->ulPowerSaveMode == DEVICE_POWERSAVE_MODE_AS_MANUAL_CLOCK_GATING)
-	{
+	if (Adapter->ulPowerSaveMode == DEVICE_POWERSAVE_MODE_AS_MANUAL_CLOCK_GATING) {
 		value = 0x80000000;
-		status = wrmalt(Adapter,DEBUG_INTERRUPT_GENERATOR_REGISTOR, &value, sizeof(value));
-		if(status)
+		status = wrmalt(Adapter, DEBUG_INTERRUPT_GENERATOR_REGISTOR, &value, sizeof(value));
+		if (status)
 		{
-			BCM_DEBUG_PRINT(Adapter,DBG_TYPE_OTHERS, IDLE_MODE, DBG_LVL_ALL,"WRM to DEBUG_INTERRUPT_GENERATOR_REGISTOR Register failed");
+			BCM_DEBUG_PRINT(Adapter, DBG_TYPE_OTHERS, IDLE_MODE, DBG_LVL_ALL, "WRM to DEBUG_INTERRUPT_GENERATOR_REGISTOR Register failed");
 			return status;
 		}
-	}
-	else if(Adapter->ulPowerSaveMode != DEVICE_POWERSAVE_MODE_AS_PROTOCOL_IDLE_MODE)
-	{
+	} else if (Adapter->ulPowerSaveMode != DEVICE_POWERSAVE_MODE_AS_PROTOCOL_IDLE_MODE) {
 		/*
 		 * Get a Interrupt Out URB and send 8 Bytes Down
 		 * To be Done in Thread Context.
@@ -204,43 +186,32 @@
 			8,
 			&lenwritten,
 			5000);
-		if(status)
-		{
-			BCM_DEBUG_PRINT(Adapter,DBG_TYPE_OTHERS, IDLE_MODE, DBG_LVL_ALL, "Sending Abort pattern down fails with status:%d..\n",status);
+		if (status) {
+			BCM_DEBUG_PRINT(Adapter, DBG_TYPE_OTHERS, IDLE_MODE, DBG_LVL_ALL, "Sending Abort pattern down fails with status:%d..\n", status);
 			return status;
-		}
-		else
-		{
-			BCM_DEBUG_PRINT(Adapter,DBG_TYPE_OTHERS, IDLE_MODE, DBG_LVL_ALL, "NOB Sent down :%d", lenwritten);
+		} else {
+			BCM_DEBUG_PRINT(Adapter, DBG_TYPE_OTHERS, IDLE_MODE, DBG_LVL_ALL, "NOB Sent down :%d", lenwritten);
 		}
 
-		//mdelay(25);
+		/* mdelay(25); */
 
-		timeout= jiffies +  msecs_to_jiffies(50) ;
-		while( timeout > jiffies )
-		{
+		timeout = jiffies +  msecs_to_jiffies(50) ;
+		while ( timeout > jiffies ) {
 			itr++ ;
 			rdmalt(Adapter, CHIP_ID_REG, &chip_id, sizeof(UINT));
-			if(0xbece3200==(chip_id&~(0xF0)))
-			{
+			if (0xbece3200 == (chip_id&~(0xF0)))
 				chip_id = chip_id&~(0xF0);
-			}
-			if(chip_id == Adapter->chip_id)
+			if (chip_id == Adapter->chip_id)
 				break;
 		}
-		if(timeout < jiffies )
-		{
-			BCM_DEBUG_PRINT(Adapter,DBG_TYPE_OTHERS, IDLE_MODE, DBG_LVL_ALL,"Not able to read chip-id even after 25 msec");
-		}
+		if (timeout < jiffies )
+			BCM_DEBUG_PRINT(Adapter, DBG_TYPE_OTHERS, IDLE_MODE, DBG_LVL_ALL, "Not able to read chip-id even after 25 msec");
 		else
-		{
-			BCM_DEBUG_PRINT(Adapter,DBG_TYPE_OTHERS, IDLE_MODE, DBG_LVL_ALL,"Number of completed iteration to read chip-id :%lu", itr);
-		}
+			BCM_DEBUG_PRINT(Adapter, DBG_TYPE_OTHERS, IDLE_MODE, DBG_LVL_ALL, "Number of completed iteration to read chip-id :%lu", itr);
 
-		status = wrmalt(Adapter,SW_ABORT_IDLEMODE_LOC, &Pattern, sizeof(status));
-		if(status)
-		{
-			BCM_DEBUG_PRINT(Adapter,DBG_TYPE_PRINTK, 0, 0,"WRM to Register SW_ABORT_IDLEMODE_LOC failed..");
+		status = wrmalt(Adapter, SW_ABORT_IDLEMODE_LOC, &Pattern, sizeof(status));
+		if (status) {
+			BCM_DEBUG_PRINT(Adapter, DBG_TYPE_PRINTK, 0, 0, "WRM to Register SW_ABORT_IDLEMODE_LOC failed..");
 			return status;
 		}
 	}
@@ -249,13 +220,10 @@
 int InterfaceIdleModeWakeup(struct bcm_mini_adapter *Adapter)
 {
 	ULONG	Status = 0;
-	if(Adapter->bTriedToWakeUpFromlowPowerMode)
-	{
-		BCM_DEBUG_PRINT(Adapter,DBG_TYPE_OTHERS, IDLE_MODE, DBG_LVL_ALL, "Wake up already attempted.. ignoring\n");
-	}
-	else
-	{
-		BCM_DEBUG_PRINT(Adapter,DBG_TYPE_OTHERS, IDLE_MODE, DBG_LVL_ALL,"Writing Low Power Mode Abort pattern to the Device\n");
+	if (Adapter->bTriedToWakeUpFromlowPowerMode) {
+		BCM_DEBUG_PRINT(Adapter, DBG_TYPE_OTHERS, IDLE_MODE, DBG_LVL_ALL, "Wake up already attempted.. ignoring\n");
+	} else {
+		BCM_DEBUG_PRINT(Adapter, DBG_TYPE_OTHERS, IDLE_MODE, DBG_LVL_ALL, "Writing Low Power Mode Abort pattern to the Device\n");
 		Adapter->bTriedToWakeUpFromlowPowerMode = TRUE;
 		InterfaceAbortIdlemode(Adapter, Adapter->usIdleModePattern);
 
@@ -269,33 +237,30 @@
 	INT Status = 0;
 	int bytes;
 
-	if(Adapter->ulPowerSaveMode == DEVICE_POWERSAVE_MODE_AS_MANUAL_CLOCK_GATING)
-	{
-		// clear idlemode interrupt.
+	if (Adapter->ulPowerSaveMode == DEVICE_POWERSAVE_MODE_AS_MANUAL_CLOCK_GATING) {
+		/* clear idlemode interrupt. */
 		uiRegVal = 0;
-		Status =wrmalt(Adapter,DEBUG_INTERRUPT_GENERATOR_REGISTOR, &uiRegVal, sizeof(uiRegVal));
-		if(Status)
-		{
-			BCM_DEBUG_PRINT(Adapter,DBG_TYPE_PRINTK, 0, 0,"WRM to DEBUG_INTERRUPT_GENERATOR_REGISTOR Failed with err :%d", Status);
+		Status = wrmalt(Adapter, DEBUG_INTERRUPT_GENERATOR_REGISTOR, &uiRegVal, sizeof(uiRegVal));
+		if (Status) {
+			BCM_DEBUG_PRINT(Adapter, DBG_TYPE_PRINTK, 0, 0,"WRM to DEBUG_INTERRUPT_GENERATOR_REGISTOR Failed with err :%d", Status);
 			return;
 		}
 	}
 
-    else
-	{
+    else {
 
-        //clear Interrupt EP registers.
-		bytes = rdmalt(Adapter,DEVICE_INT_OUT_EP_REG0, &uiRegVal, sizeof(uiRegVal));
+        /* clear Interrupt EP registers. */
+		bytes = rdmalt(Adapter, DEVICE_INT_OUT_EP_REG0, &uiRegVal, sizeof(uiRegVal));
 		if (bytes < 0) {
 			Status = bytes;
-			BCM_DEBUG_PRINT(Adapter,DBG_TYPE_PRINTK, 0, 0,"RDM of DEVICE_INT_OUT_EP_REG0 failed with Err :%d", Status);
+			BCM_DEBUG_PRINT(Adapter, DBG_TYPE_PRINTK, 0, 0, "RDM of DEVICE_INT_OUT_EP_REG0 failed with Err :%d", Status);
 			return;
 		}
 
-		bytes = rdmalt(Adapter,DEVICE_INT_OUT_EP_REG1, &uiRegVal, sizeof(uiRegVal));
+		bytes = rdmalt(Adapter, DEVICE_INT_OUT_EP_REG1, &uiRegVal, sizeof(uiRegVal));
 		if (bytes < 0) {
 			Status = bytes;
-			BCM_DEBUG_PRINT(Adapter,DBG_TYPE_PRINTK, 0, 0,"RDM of DEVICE_INT_OUT_EP_REG1 failed with Err :%d", Status);
+			BCM_DEBUG_PRINT(Adapter, DBG_TYPE_PRINTK, 0, 0, "RDM of DEVICE_INT_OUT_EP_REG1 failed with Err :%d", Status);
 			return;
 		}
 	}

diff --git a/drivers/staging/bcm/Version.h b/drivers/staging/bcm/Version.h
index a07b956..f1cb9de 100644
--- a/drivers/staging/bcm/Version.h
+++ b/drivers/staging/bcm/Version.h

@@ -1,4 +1,3 @@
-
 /*Copyright (c) 2005 Beceem Communications Inc.
 
 Module Name:
@@ -17,7 +16,6 @@
 #define VER_FILETYPE                VFT_DRV
 #define VER_FILESUBTYPE             VFT2_DRV_NETWORK
 
-
 #define VER_FILEVERSION             5.2.45
 #define VER_FILEVERSION_STR         "5.2.45"
 
@@ -28,8 +26,4 @@
 #define VER_PRODUCTVERSION_STR      VER_FILEVERSION_STR
 
 
-
-
-//#include "common.ver"
-
-#endif 	//VERSION_H
+#endif /* VERSION_H */

diff --git a/drivers/staging/bcm/vendorspecificextn.c b/drivers/staging/bcm/vendorspecificextn.c
index be1f91d..d38a06f 100644
--- a/drivers/staging/bcm/vendorspecificextn.c
+++ b/drivers/staging/bcm/vendorspecificextn.c

@@ -1,70 +1,70 @@
 #include "headers.h"
-//-----------------------------------------------------------------------------
-// Procedure:	vendorextnGetSectionInfo
-//
-// Description: Finds the type of NVM used.
-//
-// Arguments:
-//		Adapter    - ptr to Adapter object instance
-//		pNVMType   - ptr to NVM type.
-// Returns:
-//		STATUS_SUCCESS/STATUS_FAILURE
-//
-//-----------------------------------------------------------------------------
+/*
+ * Procedure:	vendorextnGetSectionInfo
+ *
+ * Description: Finds the type of NVM used.
+ *
+ * Arguments:
+ *		Adapter    - ptr to Adapter object instance
+ *		pNVMType   - ptr to NVM type.
+ * Returns:
+ *		STATUS_SUCCESS/STATUS_FAILURE
+ *
+ */
 INT vendorextnGetSectionInfo(PVOID  pContext, struct bcm_flash2x_vendor_info *pVendorInfo)
 {
 	return STATUS_FAILURE;
 }
 
-//-----------------------------------------------------------------------------
-// Procedure:   vendorextnInit
-//
-// Description: Initializing the vendor extension NVM interface
-//
-// Arguments:
-//              Adapter   - Pointer to MINI Adapter Structure.
-
-// Returns:
-//              STATUS_SUCCESS/STATUS_FAILURE
-//
-//-----------------------------------------------------------------------------
+/*
+ * Procedure:   vendorextnInit
+ *
+ * Description: Initializing the vendor extension NVM interface
+ *
+ * Arguments:
+ *              Adapter   - Pointer to MINI Adapter Structure
+ * Returns:
+ *             STATUS_SUCCESS/STATUS_FAILURE
+ *
+ *
+ */
 INT vendorextnInit(struct bcm_mini_adapter *Adapter)
 {
 	return STATUS_SUCCESS;
 }
 
-//-----------------------------------------------------------------------------
-// Procedure:   vendorextnExit
-//
-// Description: Free the resource associated with vendor extension NVM interface
-//
-// Arguments:
-//              Adapter   - Pointer to MINI Adapter Structure.
-
-// Returns:
-//              STATUS_SUCCESS/STATUS_FAILURE
-//
-//-----------------------------------------------------------------------------
+/*
+ * Procedure:   vendorextnExit
+ *
+ * Description: Free the resource associated with vendor extension NVM interface
+ *
+ * Arguments:
+ *              Adapter   - Pointer to MINI Adapter Structure
+ * Returns:
+ *              STATUS_SUCCESS/STATUS_FAILURE
+ *
+ *
+ */
 INT vendorextnExit(struct bcm_mini_adapter *Adapter)
 {
 	return STATUS_SUCCESS;
 }
 
-//------------------------------------------------------------------------
-// Procedure:	vendorextnIoctl
-//
-// Description: 	execute the vendor extension specific ioctl
-//
-//Arguments:
-//		Adapter -Beceem private Adapter Structure
-//		cmd 	-vendor extension specific Ioctl commad
-//		arg		-input parameter sent by vendor
-//
-// Returns:
-//		CONTINUE_COMMON_PATH in case it is not meant to be processed by vendor ioctls
-//		STATUS_SUCCESS/STATUS_FAILURE as per the IOCTL return value
-//
-//--------------------------------------------------------------------------
+/*
+ * Procedure:	vendorextnIoctl
+ *
+ * Description: execute the vendor extension specific ioctl
+ *
+ * Arguments:
+ *		Adapter -Beceem private Adapter Structure
+ *		cmd	-vendor extension specific Ioctl command
+ *		arg	-input parameter sent by vendor
+ *
+ * Returns:
+ *		CONTINUE_COMMON_PATH in case it is not meant to be processed by vendor ioctls
+ *		STATUS_SUCCESS/STATUS_FAILURE as per the IOCTL return value
+ */
+
 INT vendorextnIoctl(struct bcm_mini_adapter *Adapter, UINT cmd, ULONG arg)
 {
 	return CONTINUE_COMMON_PATH;
@@ -72,22 +72,21 @@
 
 
 
-//------------------------------------------------------------------
-// Procedure:	vendorextnReadSection
-//
-// Description: Reads from a section of NVM
-//
-// Arguments:
-//		pContext - ptr to Adapter object instance
-//		pBuffer - Read the data from Vendor Area to this buffer
-//		SectionVal   - Value of type of Section
-//		Offset - Read from the Offset of the Vendor Section.
-//		numOfBytes - Read numOfBytes from the Vendor section to Buffer
-//
-// Returns:
-//		STATUS_SUCCESS/STATUS_FAILURE
-//
-//------------------------------------------------------------------
+/*
+ * Procedure:	vendorextnReadSection
+ *
+ * Description: Reads from a section of NVM
+ *
+ * Arguments:
+ *		pContext - ptr to Adapter object instance
+ *		pBuffer - Read the data from Vendor Area to this buffer
+ *		SectionVal   - Value of type of Section
+ *		Offset - Read from the Offset of the Vendor Section.
+ *		numOfBytes - Read numOfBytes from the Vendor section to Buffer
+ *
+ * Returns:
+ *		STATUS_SUCCESS/STATUS_FAILURE
+ */
 
 INT vendorextnReadSection(PVOID  pContext, PUCHAR pBuffer, enum bcm_flash2x_section_val SectionVal,
 			UINT offset, UINT numOfBytes)
@@ -97,23 +96,22 @@
 
 
 
-//------------------------------------------------------------------
-// Procedure:	vendorextnWriteSection
-//
-// Description: Write to a Section of NVM
-//
-// Arguments:
-//		pContext - ptr to Adapter object instance
-//		pBuffer - Write the data provided in the buffer
-//		SectionVal   - Value of type of Section
-//		Offset - Writes to the Offset of the Vendor Section.
-//		numOfBytes - Write num Bytes after reading from pBuffer.
-//		bVerify - the Buffer Written should be verified.
-//
-// Returns:
-//		STATUS_SUCCESS/STATUS_FAILURE
-//
-//------------------------------------------------------------------
+/*
+ * Procedure:	vendorextnWriteSection
+ *
+ * Description: Write to a Section of NVM
+ *
+ * Arguments:
+ *		pContext - ptr to Adapter object instance
+ *		pBuffer - Write the data provided in the buffer
+ *		SectionVal   - Value of type of Section
+ *		Offset - Writes to the Offset of the Vendor Section.
+ *		numOfBytes - Write num Bytes after reading from pBuffer.
+ *		bVerify - the Buffer Written should be verified.
+ *
+ * Returns:
+ *		STATUS_SUCCESS/STATUS_FAILURE
+ */
 INT vendorextnWriteSection(PVOID  pContext, PUCHAR pBuffer, enum bcm_flash2x_section_val SectionVal,
 			UINT offset, UINT numOfBytes, BOOLEAN bVerify)
 {
@@ -122,25 +120,23 @@
 
 
 
-//------------------------------------------------------------------
-// Procedure:	vendorextnWriteSectionWithoutErase
-//
-// Description: Write to a Section of NVM without erasing the sector
-//
-// Arguments:
-//		pContext - ptr to Adapter object instance
-//		pBuffer - Write the data provided in the buffer
-//		SectionVal   - Value of type of Section
-//		Offset - Writes to the Offset of the Vendor Section.
-//		numOfBytes - Write num Bytes after reading from pBuffer.
-//
-// Returns:
-//		STATUS_SUCCESS/STATUS_FAILURE
-//
-//------------------------------------------------------------------
+/*
+ * Procedure:	vendorextnWriteSectionWithoutErase
+ *
+ * Description: Write to a Section of NVM without erasing the sector
+ *
+ * Arguments:
+ *		pContext - ptr to Adapter object instance
+ *		pBuffer - Write the data provided in the buffer
+ *		SectionVal   - Value of type of Section
+ *		Offset - Writes to the Offset of the Vendor Section.
+ *		numOfBytes - Write num Bytes after reading from pBuffer.
+ *
+ * Returns:
+ *		STATUS_SUCCESS/STATUS_FAILURE
+ */
 INT vendorextnWriteSectionWithoutErase(PVOID  pContext, PUCHAR pBuffer, enum bcm_flash2x_section_val SectionVal,
 			UINT offset, UINT numOfBytes)
 {
 	return STATUS_FAILURE;
 }
-

diff --git a/drivers/staging/btmtk_usb/Kconfig b/drivers/staging/btmtk_usb/Kconfig
new file mode 100644
index 0000000..a425ebd
--- /dev/null
+++ b/drivers/staging/btmtk_usb/Kconfig

@@ -0,0 +1,11 @@
+config USB_BTMTK
+	tristate "Mediatek Bluetooth support"
+	depends on USB && BT && m
+	---help---
+	  Say Y here if you wish to control a MediaTek USB Bluetooth device.
+
+	  This option depends on 'USB' support being enabled.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called btmtk_usb.
+

diff --git a/drivers/staging/btmtk_usb/Makefile b/drivers/staging/btmtk_usb/Makefile
new file mode 100644
index 0000000..4d6c9d7
--- /dev/null
+++ b/drivers/staging/btmtk_usb/Makefile

@@ -0,0 +1 @@
+obj-$(CONFIG_USB_BTMTK)	+= btmtk_usb.o

diff --git a/drivers/staging/btmtk_usb/README b/drivers/staging/btmtk_usb/README
new file mode 100644
index 0000000..c046c8e
--- /dev/null
+++ b/drivers/staging/btmtk_usb/README

@@ -0,0 +1,14 @@
+-build driver modules
+	make
+
+-install driver modules
+	make install
+
+-remove driver modules
+	make clean
+
+-dynamic debug message
+	enable the CONFIG_DYNAMIC_DEBUG option in the configuration of the current kernel
+	mount -t debugfs none /sys/kernel/debug/
+	echo "module module_name +p" > /sys/kernel/debug/dynamic_debug/control (turns on debug messages; module_name is e.g. btmtk_usb)
+	echo "module module_name -p" > /sys/kernel/debug/dynamic_debug/control (turns off debug messages; module_name is e.g. btmtk_usb)

diff --git a/drivers/staging/btmtk_usb/TODO b/drivers/staging/btmtk_usb/TODO
new file mode 100644
index 0000000..a71d129
--- /dev/null
+++ b/drivers/staging/btmtk_usb/TODO

@@ -0,0 +1,10 @@
+TODO:
+        - checkpatch.pl clean
+	- determine if the driver should not be using a duplicate
+          version of the usb-bluetooth interface code, but should
+          be merged into the drivers/bluetooth/ directory and
+          infrastructure instead.
+	- review by the bluetooth developer community
+
+Please send any patches for this driver to Yu-Chen, Cho <acho@suse.com> and
+jay.hung@mediatek.com

diff --git a/drivers/staging/btmtk_usb/btmtk_usb.c b/drivers/staging/btmtk_usb/btmtk_usb.c
new file mode 100644
index 0000000..0e783e8
--- /dev/null
+++ b/drivers/staging/btmtk_usb/btmtk_usb.c

@@ -0,0 +1,1784 @@
+/*
+ *  MediaTek Bluetooth USB Driver
+ *
+ *  Copyright (C) 2013, MediaTek co.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *  or on the worldwide web at http://www.gnu.org/licenses/old-licenses/gpl-2.0.txt.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/completion.h>
+#include <linux/firmware.h>
+#include <linux/usb.h>
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_core.h>
+
+#include "btmtk_usb.h"
+
+#define VERSION "1.0.4"
+#define MT7650_FIRMWARE	"mt7650.bin"
+#define MT7662_FIRMWARE	"mt7662.bin"
+
+static struct usb_driver btmtk_usb_driver;
+
+
+static int btmtk_usb_load_rom_patch(struct btmtk_usb_data *);
+static int btmtk_usb_load_fw(struct btmtk_usb_data *);
+
+/*
+ * hex_dump - emit a buffer as hexadecimal bytes through BT_DBG
+ * @str:         label printed in the header line
+ * @src_buf:     first byte of the buffer to dump
+ * @src_buf_len: number of bytes to dump
+ *
+ * Emits a header with the label, the buffer address and the length, then one
+ * BT_DBG call per byte.  An offset marker is emitted before every 16th byte
+ * and a line-break call after every 16 bytes.
+ */
+static void hex_dump(char *str, u8 *src_buf, u32 src_buf_len)
+{
+	/* unsigned index: the loop bound is a u32, avoid a mixed-sign compare */
+	u32 x;
+
+	/* %u, not %d: the length is unsigned */
+	BT_DBG("%s: %p, len = %u\n", str, src_buf, src_buf_len);
+
+	for (x = 0; x < src_buf_len; x++) {
+		if (x % 16 == 0)
+			BT_DBG("0x%04x : ", x);
+		BT_DBG("%02x ", src_buf[x]);
+		if (x % 16 == 15)
+			BT_DBG("\n");
+	}
+
+	BT_DBG("\n");
+}
+
+/*
+ * btmtk_usb_reset - send the vendor reset control request
+ * @udev: USB device the request is sent to
+ *
+ * Issues vendor request 0x01 with value 0x01 and no data stage on the
+ * default control pipe.
+ *
+ * Returns 0 when usb_control_msg() reports no error, otherwise the negative
+ * code it returned (which is also logged).
+ */
+static int btmtk_usb_reset(struct usb_device *udev)
+{
+	int status;
+
+	BT_DBG("%s\n", __func__);
+
+	status = usb_control_msg(udev, usb_sndctrlpipe(udev, 0), 0x01,
+				 DEVICE_VENDOR_REQUEST_OUT, 0x01, 0x00,
+				 NULL, 0x00, CONTROL_TIMEOUT_JIFFIES);
+
+	/* any non-negative result is collapsed to plain success */
+	if (status >= 0)
+		return 0;
+
+	BT_ERR("%s error(%d)\n", __func__, status);
+	return status;
+}
+
+/*
+ * btmtk_usb_io_read32 - read one 32-bit register over the control pipe
+ * @data: driver state; supplies the device, the read request code
+ *        (r_request) and the transfer buffer (io_buf)
+ * @reg:  register offset, passed as the index of the control request
+ * @val:  receives the register value converted from little endian to CPU
+ *        order; set to 0xffffffff when the read fails
+ *
+ * Returns 0 on success, the negative code from usb_control_msg() on a
+ * transfer error, or -EIO when fewer than four bytes were received.
+ */
+static int btmtk_usb_io_read32(struct btmtk_usb_data *data, u32 reg, u32 *val)
+{
+	u8 request = data->r_request;
+	struct usb_device *udev = data->udev;
+	__le32 raw;
+	int ret;
+
+	ret = usb_control_msg(udev, usb_rcvctrlpipe(udev, 0), request, DEVICE_VENDOR_REQUEST_IN,
+						  0x0, reg, data->io_buf, 4,
+						  CONTROL_TIMEOUT_JIFFIES);
+
+	/*
+	 * A short transfer would leave stale bytes in io_buf; report it
+	 * instead of decoding them as a register value.
+	 */
+	if (ret >= 0 && ret < (int)sizeof(raw))
+		ret = -EIO;
+
+	if (ret < 0) {
+		*val = 0xffffffff;
+		BT_ERR("%s error(%d), reg=%x, value=%x\n", __func__, ret, reg, *val);
+		return ret;
+	}
+
+	/* keep the wire-order value in a __le32 until it is converted */
+	memcpy(&raw, data->io_buf, sizeof(raw));
+	*val = le32_to_cpu(raw);
+
+	return 0;
+}
+
+/*
+ * btmtk_usb_io_write32 - write one 32-bit register over the control pipe
+ * @data: driver state; supplies the device and the write request code
+ *        (w_request)
+ * @reg:  register offset
+ * @val:  value to write
+ *
+ * The value is sent as two control requests without a data stage: the low
+ * 16 bits with index reg, then the high 16 bits with index reg + 2.  The
+ * second request is not sent when the first one fails.
+ *
+ * Returns 0 on success or the negative code from the failing
+ * usb_control_msg() call.
+ */
+static int btmtk_usb_io_write32(struct btmtk_usb_data *data, u32 reg, u32 val)
+{
+	u8 request = data->w_request;
+	struct usb_device *udev = data->udev;
+	int half;
+
+	for (half = 0; half < 2; half++) {
+		u16 index = (u16)(reg + 2 * half);
+		u16 value = (val >> (16 * half)) & 0xffff;
+		int ret;
+
+		ret = usb_control_msg(udev, usb_sndctrlpipe(udev, 0), request,
+				      DEVICE_VENDOR_REQUEST_OUT, value, index,
+				      NULL, 0, CONTROL_TIMEOUT_JIFFIES);
+		if (ret < 0) {
+			BT_ERR("%s error(%d), reg=%x, value=%x\n", __func__, ret, reg, val);
+			return ret;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * btmtk_usb_switch_iobase - select the request codes used for register I/O
+ * @data: driver state whose w_request / r_request fields are updated
+ * @base: SYSCTL or WLAN
+ *
+ * Returns 0 on success, or -EINVAL (leaving @data untouched) for any other
+ * @base value.
+ */
+static int btmtk_usb_switch_iobase(struct btmtk_usb_data *data, int base)
+{
+	if (base == SYSCTL) {
+		data->w_request = 0x42;
+		data->r_request = 0x47;
+	} else if (base == WLAN) {
+		data->w_request = 0x02;
+		data->r_request = 0x07;
+	} else {
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * btmtk_usb_cap_init - identify the chip and load its firmware or ROM patch
+ * @data: driver state; chip_id, need_load_fw, need_load_rom_patch,
+ *        rom_patch_offset, rom_patch_len and firmware are written here
+ *
+ * Reads register 0x00 into data->chip_id and then, depending on the chip
+ * family reported by the is_mt76xx() checks:
+ *  - MT7630/MT7650: requests MT7650_FIRMWARE and calls btmtk_usb_load_fw()
+ *  - MT7632/MT7662: requests MT7662_FIRMWARE and calls
+ *    btmtk_usb_load_rom_patch() with patch offset 0x90000
+ * Any other chip id is only logged.  Failures are logged, not returned.
+ */
+static void btmtk_usb_cap_init(struct btmtk_usb_data *data)
+{
+	/* start from NULL so the pointer is never read uninitialised */
+	const struct firmware	*firmware = NULL;
+	struct usb_device   *udev = data->udev;
+	int ret;
+
+	btmtk_usb_io_read32(data, 0x00, &data->chip_id);
+
+	BT_DBG("chip id = %x\n", data->chip_id);
+
+	if (is_mt7630(data) || is_mt7650(data)) {
+		data->need_load_fw = 1;
+		data->need_load_rom_patch = 0;
+		ret = request_firmware(&firmware, MT7650_FIRMWARE, &udev->dev);
+		if (ret == -ENOENT) {
+			BT_ERR("Firmware file \"%s\" not found \n", MT7650_FIRMWARE);
+		} else if (ret < 0) {
+			BT_ERR("Firmware file \"%s\" request failed (err=%d) \n",
+				MT7650_FIRMWARE, ret);
+		} else {
+			BT_DBG("Firmware file \"%s\" Found \n", MT7650_FIRMWARE);
+			/* load firmware here */
+			data->firmware = firmware;
+			btmtk_usb_load_fw(data);
+			/*
+			 * Release only an image that was actually obtained.
+			 * NOTE(review): data->firmware still points at the
+			 * released image afterwards - confirm nothing reads
+			 * it once this function has returned.
+			 */
+			release_firmware(firmware);
+		}
+	} else if (is_mt7632(data) || is_mt7662(data)) {
+		data->need_load_fw = 0;
+		data->need_load_rom_patch = 1;
+		data->rom_patch_offset = 0x90000;
+		ret = request_firmware(&firmware, MT7662_FIRMWARE, &udev->dev);
+		if (ret == -ENOENT) {
+			BT_ERR("Firmware file \"%s\" not found\n", MT7662_FIRMWARE);
+		} else if (ret < 0) {
+			BT_ERR("Firmware file \"%s\" request failed (err=%d)\n",
+				MT7662_FIRMWARE, ret);
+		} else {
+			BT_DBG("Firmware file \"%s\" Found\n", MT7662_FIRMWARE);
+			/* load rom patch here */
+			data->firmware = firmware;
+			data->rom_patch_len = firmware->size;
+			btmtk_usb_load_rom_patch(data);
+			/*
+			 * NOTE(review): same dangling data->firmware concern
+			 * as in the MT7650 branch above.
+			 */
+			release_firmware(firmware);
+		}
+	} else {
+		BT_ERR("unknow chip(%x)\n", data->chip_id);
+	}
+}
+
+/*
+ * checksume16 - 16-bit one's-complement checksum over a byte buffer
+ * @pData: start of the buffer
+ * @len:   number of bytes to sum; values <= 0 sum nothing
+ *
+ * Adds the buffer as native-endian 16-bit words (plus a trailing single
+ * byte when @len is odd), folds the carries back into the low 16 bits and
+ * returns the bitwise complement of the result.
+ */
+static u16 checksume16(u8 *pData, int len)
+{
+	/*
+	 * Unsigned accumulator: a signed one could overflow, and right
+	 * shifting a negative value would corrupt the carry folding.
+	 */
+	u32 sum = 0;
+	u16 word;
+
+	while (len > 1) {
+		/* copy rather than cast: pData need not be 2-byte aligned */
+		memcpy(&word, pData, sizeof(word));
+		sum += word;
+
+		pData = pData + 2;
+
+		/* fold early so the accumulator can never wrap */
+		if (sum & 0x80000000)
+			sum = (sum & 0xFFFF) + (sum >> 16);
+
+		len -= 2;
+	}
+
+	/* trailing odd byte; a negative length must not read anything */
+	if (len > 0)
+		sum += *pData;
+
+	while (sum >> 16)
+		sum = (sum & 0xFFFF) + (sum >> 16);
+
+	return ~sum;
+}
+
+/*
+ * Send the "compute checksum" control request: an 8-byte payload made of
+ * data->rom_patch_offset followed by @checksum_len (both copied in host
+ * byte order) is written through data->io_buf.
+ *
+ * Returns the usb_control_msg() result: negative errno on failure (also
+ * logged), otherwise the non-negative transfer result.
+ *
+ * NOTE(review): the request uses a send pipe together with
+ * DEVICE_VENDOR_REQUEST_IN -- confirm this is what the device expects.
+ */
+static int btmtk_usb_chk_crc(struct btmtk_usb_data *data, u32 checksum_len)
+{
+	struct usb_device *udev = data->udev;
+	int err;
+
+	BT_DBG("%s\n", __func__);
+
+	memcpy(&data->io_buf[0], &data->rom_patch_offset, 4);
+	memcpy(&data->io_buf[4], &checksum_len, 4);
+
+	err = usb_control_msg(udev, usb_sndctrlpipe(udev, 0), 0x1,
+			      DEVICE_VENDOR_REQUEST_IN, 0x20, 0x00,
+			      data->io_buf, 8, CONTROL_TIMEOUT_JIFFIES);
+	if (err < 0)
+		BT_ERR("%s error(%d)\n", __func__, err);
+
+	return err;
+}
+
+/*
+ * Poll the device for the 16-bit checksum it computed.
+ *
+ * The value 0xFFFF is treated as "not available yet": the request is
+ * repeated every 100 ms until a different value is read or the retry
+ * budget (a little over 100 attempts) is used up.
+ *
+ * A failed control transfer is handled like "not available yet"; the
+ * stale contents of data->io_buf are no longer parsed in that case.
+ *
+ * Returns the checksum, or 0xFFFF if none could be obtained.
+ */
+static u16 btmtk_usb_get_crc(struct btmtk_usb_data *data)
+{
+	int ret = 0;
+	struct usb_device *udev = data->udev;
+	u16 crc, count = 0;
+
+	BT_DBG("%s\n", __func__);
+
+	while (1) {
+		ret = usb_control_msg(udev, usb_rcvctrlpipe(udev, 0),
+					0x01, DEVICE_VENDOR_REQUEST_IN,
+					0x21, 0x00, data->io_buf, 2,
+					CONTROL_TIMEOUT_JIFFIES);
+
+		if (ret < 0) {
+			/* nothing was read: keep the "not ready" marker */
+			crc = 0xFFFF;
+			BT_ERR("%s error(%d)\n", __func__, ret);
+		} else {
+			memmove(&crc, data->io_buf, 2);
+
+			crc = le16_to_cpu(crc);
+		}
+
+		if (crc != 0xFFFF)
+			break;
+
+		mdelay(100);
+
+		if (count++ > 100) {
+			BT_ERR("Query CRC over %d times\n", count);
+			break;
+		}
+	}
+
+	return crc;
+}
+
+/*
+ * Send the 8-byte WMT reset command to the device over the control pipe.
+ *
+ * usb_control_msg() returns the number of bytes transferred on success,
+ * so only a negative value is an error.  Success is normalised to 0, the
+ * same way load_fw_iv() reports it.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int btmtk_usb_reset_wmt(struct btmtk_usb_data *data)
+{
+	int ret = 0;
+
+	/* reset command */
+	u8 cmd[8] = {0x6F, 0xFC, 0x05, 0x01, 0x07, 0x01, 0x00, 0x04};
+
+	memmove(data->io_buf, cmd, 8);
+
+	BT_DBG("%s\n", __func__);
+
+	ret = usb_control_msg(data->udev, usb_sndctrlpipe(data->udev, 0), 0x01,
+				DEVICE_CLASS_REQUEST_OUT, 0x12, 0x00, data->io_buf,
+				8, CONTROL_TIMEOUT_JIFFIES);
+
+	if (ret < 0) {
+		BT_ERR("%s:(%d)\n", __func__, ret);
+		return ret;
+	}
+
+	return 0;
+}
+
+/*
+ * URB completion callback used while uploading the ROM patch: signals the
+ * completion object that was handed over as the URB context.
+ */
+static void load_rom_patch_complete(struct urb *urb)
+{
+	complete((struct completion *)urb->context);
+}
+
+/*
+ * Upload the ROM patch held in data->firmware to the device.
+ *
+ * Steps, in order:
+ *  - poll SEMAPHORE_03 (up to 600 times, 1 ms apart) and set up the DMA
+ *    related registers;
+ *  - stop early (returning 0) if COM_REG0 bit 1 reports that a patch is
+ *    already present;
+ *  - send the image, minus its first PATCH_INFO_SIZE bytes, in chunks over
+ *    the bulk OUT endpoint, each chunk prefixed with a PATCH_HEADER_SIZE
+ *    byte HCI + WMT header carrying the phase (first / middle / last);
+ *  - ask the device for its checksum and compare it with the local one;
+ *  - send the WMT reset command.
+ *
+ * SEMAPHORE_03 is written back with 0x1 on every exit path taken after the
+ * polling loop.
+ *
+ * Returns 0 on success, -1 if no image is assigned or the checksum does
+ * not match, -EINVAL if the image is shorter than PATCH_INFO_SIZE,
+ * -ETIMEDOUT if a chunk is not acknowledged within one second, or another
+ * negative errno from allocation / URB submission / the reset command.
+ *
+ * NOTE(review): if the semaphore is never obtained the upload still
+ * proceeds after 600 polls -- confirm that this is intended.
+ */
+static int btmtk_usb_load_rom_patch(struct btmtk_usb_data *data)
+{
+	u32 loop = 0;
+	u32 value;
+	s32 sent_len;
+	int ret = 0, total_checksum = 0;
+	struct urb *urb;
+	u32 patch_len = 0;
+	u32 cur_len = 0;
+	dma_addr_t data_dma;
+	struct completion sent_to_mcu_done;
+	int first_block = 1;
+	unsigned char phase;
+	void *buf;
+	char *pos;
+	u16 fw_crc;
+	unsigned int pipe = usb_sndbulkpipe(data->udev, data->bulk_tx_ep->bEndpointAddress);
+
+	if (!data->firmware) {
+		BT_ERR("%s:please assign a rom patch\n", __func__);
+		return -1;
+	}
+
+	/* patch_len below is unsigned: reject images that would underflow it */
+	if (data->rom_patch_len < PATCH_INFO_SIZE) {
+		BT_ERR("%s:rom patch is too short\n", __func__);
+		return -EINVAL;
+	}
+
+load_patch_protect:
+	btmtk_usb_switch_iobase(data, WLAN);
+	btmtk_usb_io_read32(data, SEMAPHORE_03, &value);
+	loop++;
+
+	if (((value & 0x01) == 0x00) && (loop < 600)) {
+		mdelay(1);
+		goto load_patch_protect;
+	}
+
+	btmtk_usb_io_write32(data, 0x1004, 0x2c);
+
+	btmtk_usb_switch_iobase(data, SYSCTL);
+
+	btmtk_usb_io_write32(data, 0x1c, 0x30);
+
+	/* Enable USB_DMA_CFG */
+	btmtk_usb_io_write32(data, 0x9018, 0x00c00020);
+
+	btmtk_usb_switch_iobase(data, WLAN);
+
+	/* check ROM patch if upgrade */
+	btmtk_usb_io_read32(data, COM_REG0, &value);
+
+	if ((value & 0x02) == 0x02)
+		goto error0;
+
+	urb = usb_alloc_urb(0, GFP_ATOMIC);
+
+	if (!urb) {
+		ret = -ENOMEM;
+		goto error0;
+	}
+
+	buf = usb_alloc_coherent(data->udev, UPLOAD_PATCH_UNIT, GFP_ATOMIC, &data_dma);
+
+	if (!buf) {
+		ret = -ENOMEM;
+		goto error1;
+	}
+
+	pos = buf;
+	BT_DBG("loading rom patch");
+
+	init_completion(&sent_to_mcu_done);
+
+	cur_len = 0x00;
+	patch_len = data->rom_patch_len - PATCH_INFO_SIZE;
+
+	/* loading rom patch */
+	while (1) {
+		s32 sent_len_max = UPLOAD_PATCH_UNIT - PATCH_HEADER_SIZE;
+		sent_len = (patch_len - cur_len) >= sent_len_max ? sent_len_max : (patch_len - cur_len);
+
+		BT_DBG("patch_len = %d\n", patch_len);
+		BT_DBG("cur_len = %d\n", cur_len);
+		BT_DBG("sent_len = %d\n", sent_len);
+
+		if (sent_len > 0) {
+			/* phase 1: first of several, 2: middle, 3: last (or only) */
+			if (first_block == 1) {
+				if (sent_len < sent_len_max)
+					phase = PATCH_PHASE3;
+				else
+					phase = PATCH_PHASE1;
+				first_block = 0;
+			} else if (sent_len == sent_len_max) {
+				phase = PATCH_PHASE2;
+			} else {
+				phase = PATCH_PHASE3;
+			}
+
+			/* prepare HCI header */
+			pos[0] = 0x6F;
+			pos[1] = 0xFC;
+			pos[2] = (sent_len + 5) & 0xFF;
+			pos[3] = ((sent_len + 5) >> 8) & 0xFF;
+
+			/* prepare WMT header */
+			pos[4] = 0x01;
+			pos[5] = 0x01;
+			pos[6] = (sent_len + 1) & 0xFF;
+			pos[7] = ((sent_len + 1) >> 8) & 0xFF;
+
+			pos[8] = phase;
+
+			memcpy(&pos[9], data->firmware->data + PATCH_INFO_SIZE + cur_len, sent_len);
+
+			BT_DBG("sent_len + PATCH_HEADER_SIZE = %d, phase = %d\n",
+					sent_len + PATCH_HEADER_SIZE, phase);
+
+			usb_fill_bulk_urb(urb,
+					data->udev,
+					pipe,
+					buf,
+					sent_len + PATCH_HEADER_SIZE,
+					load_rom_patch_complete,
+					&sent_to_mcu_done);
+
+			urb->transfer_dma = data_dma;
+			urb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP;
+
+			ret = usb_submit_urb(urb, GFP_ATOMIC);
+
+			if (ret)
+				goto error2;
+
+			if (!wait_for_completion_timeout(&sent_to_mcu_done, msecs_to_jiffies(1000))) {
+				usb_kill_urb(urb);
+				BT_ERR("upload rom_patch timeout\n");
+				/* ret is still 0 from the submit: report the failure */
+				ret = -ETIMEDOUT;
+				goto error2;
+			}
+
+			BT_DBG(".");
+
+			mdelay(200);
+
+			cur_len += sent_len;
+
+		} else {
+			break;
+		}
+	}
+
+	total_checksum = checksume16((u8 *)data->firmware->data + PATCH_INFO_SIZE, patch_len);
+
+	BT_DBG("Send checksum req..\n");
+
+	btmtk_usb_chk_crc(data, patch_len);
+
+	mdelay(20);
+
+	/* query once and reuse the value for the log message */
+	fw_crc = btmtk_usb_get_crc(data);
+
+	if (total_checksum != fw_crc) {
+		BT_ERR("checksum fail!, local(0x%x) <> fw(0x%x)\n",
+				total_checksum, fw_crc);
+		ret = -1;
+		goto error2;
+	}
+
+	mdelay(20);
+
+	ret = btmtk_usb_reset_wmt(data);
+
+	mdelay(20);
+
+error2:
+	usb_free_coherent(data->udev, UPLOAD_PATCH_UNIT, buf, data_dma);
+error1:
+	usb_free_urb(urb);
+error0:
+	btmtk_usb_io_write32(data, SEMAPHORE_03, 0x1);
+	return ret;
+}
+
+
+/*
+ * Upload the 64 bytes found at offset 32 of the firmware image (the
+ * "interrupt vector" according to the caller) with a vendor control
+ * request.
+ *
+ * The bytes are copied into a temporary heap buffer for the transfer.
+ *
+ * Returns 0 on success, -ENOMEM if the buffer cannot be allocated, or the
+ * negative errno reported by usb_control_msg().
+ *
+ * NOTE(review): the image size is not checked here; the caller is assumed
+ * to hand in an image of at least 96 bytes.
+ */
+static int load_fw_iv(struct btmtk_usb_data *data)
+{
+	int ret;
+	struct usb_device *udev = data->udev;
+	char *buf = kmalloc(64, GFP_ATOMIC);
+
+	if (!buf)
+		return -ENOMEM;
+
+	memmove(buf, data->firmware->data + 32, 64);
+
+	ret = usb_control_msg(udev, usb_sndctrlpipe(udev, 0), 0x01,
+						  DEVICE_VENDOR_REQUEST_OUT, 0x12, 0x0, buf, 64,
+						  CONTROL_TIMEOUT_JIFFIES);
+
+	kfree(buf);
+
+	if (ret < 0) {
+		BT_ERR("%s error(%d) step4\n", __func__, ret);
+		return ret;
+	}
+
+	/* a non-negative result is the transfer length: report plain success */
+	return 0;
+}
+
+/*
+ * URB completion callback used while uploading the firmware: signals the
+ * completion object that was handed over as the URB context.
+ */
+static void load_fw_complete(struct urb *urb)
+{
+	complete((struct completion *)urb->context);
+}
+
+/*
+ * Upload the firmware image held in data->firmware to the device.
+ *
+ * Steps, in order:
+ *  - poll SEMAPHORE_00 (up to 10000 times);
+ *  - stop early (returning 0) if COM_REG0 bit 0 reports the MCU as ready;
+ *  - switch the device into firmware-load mode and reset it;
+ *  - parse the header: ILM length (bytes 0-3), DLM length (bytes 4-7),
+ *    build version (bytes 8-9), firmware version (bytes 10-11) and the
+ *    build time string (bytes 16-31), all little endian;
+ *  - send the ILM and then the DLM section in chunks of at most 14336
+ *    bytes over the bulk OUT endpoint, each chunk prefixed by a 4-byte
+ *    header that carries the chunk length;
+ *  - upload the 64-byte interrupt vector via load_fw_iv();
+ *  - poll COM_REG0 (about 100 times, 10 ms apart) until it reads 0x01.
+ *
+ * Firmware-load mode is switched off again and SEMAPHORE_00 is written
+ * back with 0x1 on the way out.
+ *
+ * Returns 0 on success, -1 if no image is assigned, -ETIMEDOUT if a chunk
+ * is not acknowledged within one second, -ENODEV if the MCU does not
+ * report ready, or another negative errno from allocation, URB submission
+ * or load_fw_iv().
+ *
+ * NOTE(review): the image size is never validated against the header
+ * fields or the fixed offsets used below -- confirm whether a malformed
+ * image has to be rejected here.
+ */
+static int btmtk_usb_load_fw(struct btmtk_usb_data *data)
+{
+	struct usb_device *udev = data->udev;
+	struct urb *urb;
+	void *buf;
+	u32 cur_len = 0;
+	u32 packet_header = 0;
+	u32 value;
+	u32 ilm_len = 0, dlm_len = 0;
+	u16 fw_ver, build_ver;
+	u32 loop = 0;
+	dma_addr_t data_dma;
+	int ret = 0, sent_len;
+	struct completion sent_to_mcu_done;
+	unsigned int pipe = usb_sndbulkpipe(data->udev, data->bulk_tx_ep->bEndpointAddress);
+
+	if (!data->firmware) {
+		BT_ERR("%s:please assign a fw\n", __func__);
+		return -1;
+	}
+
+	BT_DBG("bulk_tx_ep = %x\n", data->bulk_tx_ep->bEndpointAddress);
+
+loadfw_protect:
+	btmtk_usb_switch_iobase(data, WLAN);
+	btmtk_usb_io_read32(data, SEMAPHORE_00, &value);
+	loop++;
+
+	if (((value & 0x1) == 0) && (loop < 10000))
+		goto loadfw_protect;
+
+	/* check MCU if ready */
+	btmtk_usb_io_read32(data, COM_REG0, &value);
+
+	if ((value & 0x01) == 0x01)
+		goto error0;
+
+	/* Enable MPDMA TX and EP2 load FW mode */
+	btmtk_usb_io_write32(data, 0x238, 0x1c000000);
+
+	btmtk_usb_reset(udev);
+	mdelay(100);
+
+	ilm_len = (*(data->firmware->data + 3) << 24)
+			| (*(data->firmware->data + 2) << 16)
+			| (*(data->firmware->data + 1) << 8)
+			| (*data->firmware->data);
+
+	dlm_len = (*(data->firmware->data + 7) << 24)
+			| (*(data->firmware->data + 6) << 16)
+			| (*(data->firmware->data + 5) << 8)
+			| (*(data->firmware->data + 4));
+
+	fw_ver = (*(data->firmware->data + 11) << 8) | (*(data->firmware->data + 10));
+
+	build_ver = (*(data->firmware->data + 9) << 8) | (*(data->firmware->data + 8));
+
+	/* the major version is the top nibble, so it is shifted down by 12 */
+	BT_DBG("fw version:%d.%d.%02d ",
+			(fw_ver & 0xf000) >> 12,
+			(fw_ver & 0x0f00) >> 8,
+			(fw_ver & 0x00ff));
+
+	BT_DBG("build:%x\n", build_ver);
+
+	BT_DBG("build Time =");
+
+	for (loop = 0; loop < 16; loop++)
+		BT_DBG("%c", *(data->firmware->data + 16 + loop));
+
+	BT_DBG("\n");
+
+	BT_DBG("ILM length = %d(bytes)\n", ilm_len);
+	BT_DBG("DLM length = %d(bytes)\n", dlm_len);
+
+	btmtk_usb_switch_iobase(data, SYSCTL);
+
+	/* U2M_PDMA rx_ring_base_ptr */
+	btmtk_usb_io_write32(data, 0x790, 0x400230);
+
+	/* U2M_PDMA rx_ring_max_cnt */
+	btmtk_usb_io_write32(data, 0x794, 0x1);
+
+	/* U2M_PDMA cpu_idx */
+	btmtk_usb_io_write32(data, 0x798, 0x1);
+
+	/* U2M_PDMA enable */
+	btmtk_usb_io_write32(data, 0x704, 0x44);
+
+	urb = usb_alloc_urb(0, GFP_ATOMIC);
+
+	if (!urb) {
+		ret = -ENOMEM;
+		goto error1;
+	}
+
+	buf = usb_alloc_coherent(udev, 14592, GFP_ATOMIC, &data_dma);
+
+	if (!buf) {
+		ret = -ENOMEM;
+		goto error2;
+	}
+
+	BT_DBG("loading fw");
+
+	init_completion(&sent_to_mcu_done);
+
+	btmtk_usb_switch_iobase(data, SYSCTL);
+
+	cur_len = 0x40;
+
+	/* Loading ILM */
+	while (1) {
+		sent_len = (ilm_len - cur_len) >= 14336 ? 14336 : (ilm_len - cur_len);
+
+		if (sent_len > 0) {
+			packet_header &= ~(0xffffffff);
+			packet_header |= (sent_len << 16);
+			packet_header = cpu_to_le32(packet_header);
+
+			memmove(buf, &packet_header, 4);
+			memmove(buf + 4, data->firmware->data + 32 + cur_len, sent_len);
+
+			/* U2M_PDMA descriptor */
+			btmtk_usb_io_write32(data, 0x230, cur_len);
+
+			/* pad the transfer length up to a multiple of four */
+			while ((sent_len % 4) != 0) {
+				sent_len++;
+			}
+
+			/* U2M_PDMA length */
+			btmtk_usb_io_write32(data, 0x234, sent_len << 16);
+
+			usb_fill_bulk_urb(urb,
+					udev,
+					pipe,
+					buf,
+					sent_len + 4,
+					load_fw_complete,
+					&sent_to_mcu_done);
+
+			urb->transfer_dma = data_dma;
+			urb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP;
+
+			ret = usb_submit_urb(urb, GFP_ATOMIC);
+
+			if (ret)
+				goto error3;
+
+			if (!wait_for_completion_timeout(&sent_to_mcu_done, msecs_to_jiffies(1000))) {
+				usb_kill_urb(urb);
+				BT_ERR("upload ilm fw timeout\n");
+				/* ret is still 0 from the submit: report the failure */
+				ret = -ETIMEDOUT;
+				goto error3;
+			}
+
+			BT_DBG(".");
+
+			mdelay(200);
+
+			cur_len += sent_len;
+		} else {
+			break;
+		}
+	}
+
+	init_completion(&sent_to_mcu_done);
+	cur_len = 0x00;
+
+	/* Loading DLM */
+	while (1) {
+		sent_len = (dlm_len - cur_len) >= 14336 ? 14336 : (dlm_len - cur_len);
+
+		if (sent_len > 0) {
+			packet_header &= ~(0xffffffff);
+			packet_header |= (sent_len << 16);
+			packet_header = cpu_to_le32(packet_header);
+
+			memmove(buf, &packet_header, 4);
+			memmove(buf + 4, data->firmware->data + 32 + ilm_len + cur_len, sent_len);
+
+			/* U2M_PDMA descriptor */
+			btmtk_usb_io_write32(data, 0x230, 0x80000 + cur_len);
+
+			/* pad the transfer length up to a multiple of four */
+			while ((sent_len % 4) != 0) {
+				BT_DBG("sent_len is not divided by 4\n");
+				sent_len++;
+			}
+
+			/* U2M_PDMA length */
+			btmtk_usb_io_write32(data, 0x234, sent_len << 16);
+
+			usb_fill_bulk_urb(urb,
+					udev,
+					pipe,
+					buf,
+					sent_len + 4,
+					load_fw_complete,
+					&sent_to_mcu_done);
+
+			urb->transfer_dma = data_dma;
+			urb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP;
+
+			ret = usb_submit_urb(urb, GFP_ATOMIC);
+
+			if (ret)
+				goto error3;
+
+			if (!wait_for_completion_timeout(&sent_to_mcu_done, msecs_to_jiffies(1000))) {
+				usb_kill_urb(urb);
+				BT_ERR("upload dlm fw timeout\n");
+				/* ret is still 0 from the submit: report the failure */
+				ret = -ETIMEDOUT;
+				goto error3;
+			}
+
+			BT_DBG(".");
+
+			mdelay(500);
+
+			cur_len += sent_len;
+
+		} else {
+			break;
+		}
+	}
+
+	/* upload 64bytes interrupt vector */
+	ret = load_fw_iv(data);
+	mdelay(100);
+
+	btmtk_usb_switch_iobase(data, WLAN);
+
+	/* check MCU if ready */
+	loop = 0;
+
+	do {
+		btmtk_usb_io_read32(data, COM_REG0, &value);
+
+		if (value == 0x01)
+			break;
+
+		mdelay(10);
+		loop++;
+	} while (loop <= 100);
+
+	/* the loop above only runs past 100 when the MCU never became ready */
+	if (loop > 100) {
+		BT_ERR("wait for 100 times\n");
+		ret = -ENODEV;
+	}
+
+error3:
+	usb_free_coherent(udev, 14592, buf, data_dma);
+error2:
+	usb_free_urb(urb);
+error1:
+	/* Disbale load fw mode */
+	btmtk_usb_io_read32(data, 0x238, &value);
+	value = value & ~(0x10000000);
+	btmtk_usb_io_write32(data,  0x238, value);
+error0:
+	btmtk_usb_io_write32(data, SEMAPHORE_00, 0x1);
+	return ret;
+}
+
+/*
+ * Account for one more transmission unless a suspend is in progress.
+ *
+ * Under data->txlock: if BTUSB_SUSPENDING is clear, data->tx_in_flight is
+ * incremented.  Returns the state of the BTUSB_SUSPENDING bit, i.e. zero
+ * when the transmission was accounted for and non-zero when it was not.
+ */
+static int inc_tx(struct btmtk_usb_data *data)
+{
+	unsigned long irq_flags;
+	int suspending;
+
+	spin_lock_irqsave(&data->txlock, irq_flags);
+	suspending = test_bit(BTUSB_SUSPENDING, &data->flags);
+	if (suspending == 0)
+		data->tx_in_flight++;
+	spin_unlock_irqrestore(&data->txlock, irq_flags);
+
+	return suspending;
+}
+
+/*
+ * Completion handler for the interrupt IN URB.
+ *
+ * On a successful transfer the received bytes are handed to
+ * hci_recv_fragment() as HCI_EVENT_PKT data.  As long as the device is
+ * HCI_RUNNING and BTUSB_INTR_RUNNING is set, the same URB is anchored on
+ * data->intr_anchor and submitted again.
+ */
+static void btmtk_usb_intr_complete(struct urb *urb)
+{
+	struct hci_dev *hdev = urb->context;
+	struct btmtk_usb_data *data = hci_get_drvdata(hdev);
+	int err;
+
+	BT_DBG("%s: %s urb %p status %d count %d\n", __func__, hdev->name,
+					urb, urb->status, urb->actual_length);
+
+	if (!test_bit(HCI_RUNNING, &hdev->flags))
+		return;
+
+	if (urb->status == 0) {
+		hdev->stat.byte_rx += urb->actual_length;
+
+		hex_dump("hci event", urb->transfer_buffer, urb->actual_length);
+
+		if (hci_recv_fragment(hdev, HCI_EVENT_PKT,
+						urb->transfer_buffer,
+						urb->actual_length) < 0) {
+			BT_ERR("%s corrupted event packet", hdev->name);
+			hdev->stat.err_rx++;
+		}
+	}
+
+	/* interrupt traffic was stopped: do not resubmit */
+	if (!test_bit(BTUSB_INTR_RUNNING, &data->flags))
+		return;
+
+	usb_mark_last_busy(data->udev);
+	usb_anchor_urb(urb, &data->intr_anchor);
+
+	err = usb_submit_urb(urb, GFP_ATOMIC);
+
+	if (err < 0) {
+		/* -EPERM: urb is being killed;
+		 * -ENODEV: device got disconnected */
+		if (err != -EPERM && err != -ENODEV)
+			BT_ERR("%s urb %p failed to resubmit (%d)",
+						hdev->name, urb, -err);
+		usb_unanchor_urb(urb);
+	}
+}
+
+/*
+ * Allocate and submit one interrupt IN URB sized to the endpoint's
+ * wMaxPacketSize, with btmtk_usb_intr_complete() as completion handler.
+ *
+ * @mem_flags is used for both allocations and for the submission.
+ *
+ * Returns the usb_submit_urb() result, -ENODEV if there is no interrupt
+ * endpoint, or -ENOMEM if an allocation fails.
+ */
+static int btmtk_usb_submit_intr_urb(struct hci_dev *hdev, gfp_t mem_flags)
+{
+	struct btmtk_usb_data *data = hci_get_drvdata(hdev);
+	struct urb *urb;
+	unsigned char *buf;
+	unsigned int pipe;
+	int err, size;
+
+	BT_DBG("%s\n", __func__);
+
+	if (!data->intr_ep)
+		return -ENODEV;
+
+	urb = usb_alloc_urb(0, mem_flags);
+	if (!urb)
+		return -ENOMEM;
+
+	size = le16_to_cpu(data->intr_ep->wMaxPacketSize);
+
+	buf = kmalloc(size, mem_flags);
+	if (!buf) {
+		usb_free_urb(urb);
+		return -ENOMEM;
+	}
+
+	pipe = usb_rcvintpipe(data->udev, data->intr_ep->bEndpointAddress);
+
+	usb_fill_int_urb(urb, data->udev, pipe, buf, size,
+						btmtk_usb_intr_complete, hdev,
+						data->intr_ep->bInterval);
+
+	/* buf is released together with the URB */
+	urb->transfer_flags |= URB_FREE_BUFFER;
+
+	usb_anchor_urb(urb, &data->intr_anchor);
+
+	err = usb_submit_urb(urb, mem_flags);
+	if (err < 0) {
+		if (err != -EPERM && err != -ENODEV)
+			BT_ERR("%s urb %p submission failed (%d)",
+						hdev->name, urb, -err);
+		usb_unanchor_urb(urb);
+	}
+
+	/* drop the local reference taken by usb_alloc_urb() */
+	usb_free_urb(urb);
+
+	return err;
+
+}
+
+/*
+ * Completion handler for bulk IN URBs.
+ *
+ * On a successful transfer the received bytes are handed to
+ * hci_recv_fragment() as HCI_ACLDATA_PKT data.  As long as the device is
+ * HCI_RUNNING and BTUSB_BULK_RUNNING is set, the same URB is anchored on
+ * data->bulk_anchor and submitted again.
+ */
+static void btmtk_usb_bulk_in_complete(struct urb *urb)
+{
+	struct hci_dev *hdev = urb->context;
+	struct btmtk_usb_data *data = hci_get_drvdata(hdev);
+	int err;
+
+	BT_DBG("%s:%s urb %p status %d count %d", __func__, hdev->name,
+					urb, urb->status, urb->actual_length);
+
+	if (!test_bit(HCI_RUNNING, &hdev->flags)) {
+		return;
+	}
+
+	if (urb->status == 0) {
+		hdev->stat.byte_rx += urb->actual_length;
+
+		if (hci_recv_fragment(hdev, HCI_ACLDATA_PKT,
+						urb->transfer_buffer,
+						urb->actual_length) < 0) {
+			BT_ERR("%s corrupted ACL packet", hdev->name);
+			hdev->stat.err_rx++;
+		}
+	}
+
+	/* bulk traffic was stopped: do not resubmit */
+	if (!test_bit(BTUSB_BULK_RUNNING, &data->flags))
+		return;
+
+	usb_anchor_urb(urb, &data->bulk_anchor);
+	usb_mark_last_busy(data->udev);
+
+	err = usb_submit_urb(urb, GFP_ATOMIC);
+	if (err < 0) {
+		/* -EPERM: urb is being killed;
+		 * -ENODEV: device got disconnected */
+		if (err != -EPERM && err != -ENODEV)
+			BT_ERR("%s urb %p failed to resubmit (%d)",
+						hdev->name, urb, -err);
+		usb_unanchor_urb(urb);
+	}
+}
+
+/*
+ * Allocate and submit one bulk IN URB with an HCI_MAX_FRAME_SIZE buffer
+ * and btmtk_usb_bulk_in_complete() as completion handler.
+ *
+ * @mem_flags is used for both allocations and for the submission.
+ *
+ * Returns the usb_submit_urb() result, -ENODEV if there is no bulk IN
+ * endpoint, or -ENOMEM if an allocation fails.
+ */
+static int btmtk_usb_submit_bulk_in_urb(struct hci_dev *hdev, gfp_t mem_flags)
+{
+	struct btmtk_usb_data *data = hci_get_drvdata(hdev);
+	struct urb *urb;
+	unsigned char *buf;
+	unsigned int pipe;
+	int err, size = HCI_MAX_FRAME_SIZE;
+
+	BT_DBG("%s:%s\n", __func__, hdev->name);
+
+	if (!data->bulk_rx_ep)
+		return -ENODEV;
+
+	urb = usb_alloc_urb(0, mem_flags);
+	if (!urb)
+		return -ENOMEM;
+
+	buf = kmalloc(size, mem_flags);
+	if (!buf) {
+		usb_free_urb(urb);
+		return -ENOMEM;
+	}
+
+	pipe = usb_rcvbulkpipe(data->udev, data->bulk_rx_ep->bEndpointAddress);
+
+	usb_fill_bulk_urb(urb, data->udev, pipe,
+					buf, size, btmtk_usb_bulk_in_complete, hdev);
+
+	/* buf is released together with the URB */
+	urb->transfer_flags |= URB_FREE_BUFFER;
+
+	usb_mark_last_busy(data->udev);
+	usb_anchor_urb(urb, &data->bulk_anchor);
+
+	err = usb_submit_urb(urb, mem_flags);
+	if (err < 0) {
+		if (err != -EPERM && err != -ENODEV)
+			BT_ERR("%s urb %p submission failed (%d)",
+						hdev->name, urb, -err);
+		usb_unanchor_urb(urb);
+	}
+
+	/* drop the local reference taken by usb_alloc_urb() */
+	usb_free_urb(urb);
+
+	return err;
+}
+
+/*
+ * Completion handler for isochronous IN URBs.
+ *
+ * On a successful transfer every frame whose own status is zero is handed
+ * to hci_recv_fragment() as HCI_SCODATA_PKT data.  As long as the device
+ * is HCI_RUNNING and BTUSB_ISOC_RUNNING is set, the same URB is anchored
+ * on data->isoc_anchor and submitted again.
+ */
+static void btmtk_usb_isoc_in_complete(struct urb *urb)
+
+{
+	struct hci_dev *hdev = urb->context;
+	struct btmtk_usb_data *data = hci_get_drvdata(hdev);
+	int i, err;
+
+	BT_DBG("%s: %s urb %p status %d count %d", __func__, hdev->name,
+					urb, urb->status, urb->actual_length);
+
+	if (!test_bit(HCI_RUNNING, &hdev->flags))
+		return;
+
+	if (urb->status == 0) {
+		for (i = 0; i < urb->number_of_packets; i++) {
+			unsigned int offset = urb->iso_frame_desc[i].offset;
+			unsigned int length = urb->iso_frame_desc[i].actual_length;
+
+			/* skip frames that completed with an error */
+			if (urb->iso_frame_desc[i].status)
+				continue;
+
+			hdev->stat.byte_rx += length;
+
+			if (hci_recv_fragment(hdev, HCI_SCODATA_PKT,
+						urb->transfer_buffer + offset,
+								length) < 0) {
+				BT_ERR("%s corrupted SCO packet", hdev->name);
+				hdev->stat.err_rx++;
+			}
+		}
+	}
+
+	/* isochronous traffic was stopped: do not resubmit */
+	if (!test_bit(BTUSB_ISOC_RUNNING, &data->flags))
+		return;
+
+	usb_anchor_urb(urb, &data->isoc_anchor);
+
+	err = usb_submit_urb(urb, GFP_ATOMIC);
+	if (err < 0) {
+		/* -EPERM: urb is being killed;
+		 * -ENODEV: device got disconnected */
+		if (err != -EPERM && err != -ENODEV)
+			BT_ERR("%s urb %p failed to resubmit (%d)",
+						hdev->name, urb, -err);
+		usb_unanchor_urb(urb);
+	}
+}
+
+/*
+ * Split a buffer of @len bytes into isochronous frame descriptors of @mtu
+ * bytes each, using at most BTUSB_MAX_ISOC_FRAMES descriptors.  A shorter
+ * final frame is added for the remainder when a descriptor is still free.
+ * The number of descriptors used is stored in urb->number_of_packets.
+ */
+static inline void __fill_isoc_descriptor(struct urb *urb, int len, int mtu)
+{
+	int nframes = 0;
+	int pos = 0;
+
+	BT_DBG("len %d mtu %d", len, mtu);
+
+	/* as many full-sized frames as fit */
+	while (nframes < BTUSB_MAX_ISOC_FRAMES && len >= mtu) {
+		urb->iso_frame_desc[nframes].offset = pos;
+		urb->iso_frame_desc[nframes].length = mtu;
+		nframes++;
+		pos += mtu;
+		len -= mtu;
+	}
+
+	/* trailing short frame, if there is room left for one */
+	if (len && nframes < BTUSB_MAX_ISOC_FRAMES) {
+		urb->iso_frame_desc[nframes].offset = pos;
+		urb->iso_frame_desc[nframes].length = len;
+		nframes++;
+	}
+
+	urb->number_of_packets = nframes;
+}
+
+/*
+ * Allocate and submit one isochronous IN URB holding
+ * BTUSB_MAX_ISOC_FRAMES frames of wMaxPacketSize bytes each, with
+ * btmtk_usb_isoc_in_complete() as completion handler.
+ *
+ * @mem_flags is used for both allocations and for the submission.
+ *
+ * Returns the usb_submit_urb() result, -ENODEV if there is no
+ * isochronous IN endpoint, or -ENOMEM if an allocation fails.
+ */
+static int btmtk_usb_submit_isoc_in_urb(struct hci_dev *hdev, gfp_t mem_flags)
+{
+	struct btmtk_usb_data *data = hci_get_drvdata(hdev);
+	struct urb *urb;
+	unsigned char *buf;
+	unsigned int pipe;
+	int err, size;
+
+	BT_DBG("%s\n", __func__);
+
+	if (!data->isoc_rx_ep)
+		return -ENODEV;
+
+	urb = usb_alloc_urb(BTUSB_MAX_ISOC_FRAMES, mem_flags);
+	if (!urb)
+		return -ENOMEM;
+
+	size = le16_to_cpu(data->isoc_rx_ep->wMaxPacketSize) *
+						BTUSB_MAX_ISOC_FRAMES;
+
+	buf = kmalloc(size, mem_flags);
+	if (!buf) {
+		usb_free_urb(urb);
+		return -ENOMEM;
+	}
+
+	pipe = usb_rcvisocpipe(data->udev, data->isoc_rx_ep->bEndpointAddress);
+
+	/* the interrupt-URB helper is used to fill in the common fields */
+	usb_fill_int_urb(urb, data->udev, pipe, buf, size, btmtk_usb_isoc_in_complete,
+				hdev, data->isoc_rx_ep->bInterval);
+
+	urb->transfer_flags  = URB_FREE_BUFFER | URB_ISO_ASAP;
+
+	__fill_isoc_descriptor(urb, size,
+			le16_to_cpu(data->isoc_rx_ep->wMaxPacketSize));
+
+	usb_anchor_urb(urb, &data->isoc_anchor);
+
+	err = usb_submit_urb(urb, mem_flags);
+	if (err < 0) {
+		if (err != -EPERM && err != -ENODEV)
+			BT_ERR("%s urb %p submission failed (%d)",
+						hdev->name, urb, -err);
+		usb_unanchor_urb(urb);
+	}
+
+	/* drop the local reference taken by usb_alloc_urb() */
+	usb_free_urb(urb);
+
+	return err;
+}
+
+/*
+ * hci_dev open callback: mark the device HCI_RUNNING and start the
+ * interrupt and bulk IN traffic (one interrupt URB, two bulk URBs).
+ *
+ * Returns 0 on success (also when the device was already running), or a
+ * negative errno if the interface cannot be resumed or the first
+ * interrupt or bulk URB cannot be submitted; in the latter case the
+ * running flags are cleared again.
+ */
+static int btmtk_usb_open(struct hci_dev *hdev)
+{
+	struct btmtk_usb_data *data = hci_get_drvdata(hdev);
+	int err;
+
+	BT_DBG("%s\n", __func__);
+
+	err = usb_autopm_get_interface(data->intf);
+	if (err < 0)
+		return err;
+
+	data->intf->needs_remote_wakeup = 1;
+
+	if (test_and_set_bit(HCI_RUNNING, &hdev->flags))
+		goto done;
+
+	if (test_and_set_bit(BTUSB_INTR_RUNNING, &data->flags))
+		goto done;
+
+	err = btmtk_usb_submit_intr_urb(hdev, GFP_KERNEL);
+	if (err < 0)
+		goto failed;
+
+	err = btmtk_usb_submit_bulk_in_urb(hdev, GFP_KERNEL);
+	if (err < 0) {
+		usb_kill_anchored_urbs(&data->intr_anchor);
+		goto failed;
+	}
+
+	set_bit(BTUSB_BULK_RUNNING, &data->flags);
+	/* second bulk URB; its result is deliberately not checked */
+	btmtk_usb_submit_bulk_in_urb(hdev, GFP_KERNEL);
+
+done:
+	usb_autopm_put_interface(data->intf);
+	return 0;
+
+failed:
+	clear_bit(BTUSB_INTR_RUNNING, &data->flags);
+	clear_bit(HCI_RUNNING, &hdev->flags);
+	usb_autopm_put_interface(data->intf);
+	return err;
+}
+
+/*
+ * Kill every URB anchored on the interrupt, bulk and isochronous receive
+ * anchors.  The transmit anchor is not touched here.
+ */
+static void btmtk_usb_stop_traffic(struct btmtk_usb_data *data)
+{
+	BT_DBG("%s\n", __func__);
+
+	usb_kill_anchored_urbs(&data->intr_anchor);
+	usb_kill_anchored_urbs(&data->bulk_anchor);
+	usb_kill_anchored_urbs(&data->isoc_anchor);
+}
+
+/*
+ * hci_dev close callback: clear HCI_RUNNING, cancel the pending work
+ * items, clear the *_RUNNING flags, kill the receive URBs and drop any
+ * deferred transmit URBs.
+ *
+ * Always returns 0, including when the device was not running.
+ */
+static int btmtk_usb_close(struct hci_dev *hdev)
+{
+	struct btmtk_usb_data *data = hci_get_drvdata(hdev);
+	int err;
+
+	BT_DBG("%s\n", __func__);
+
+	if (!test_and_clear_bit(HCI_RUNNING, &hdev->flags))
+		return 0;
+
+	cancel_work_sync(&data->work);
+	cancel_work_sync(&data->waker);
+
+	clear_bit(BTUSB_ISOC_RUNNING, &data->flags);
+	clear_bit(BTUSB_BULK_RUNNING, &data->flags);
+	clear_bit(BTUSB_INTR_RUNNING, &data->flags);
+
+	btmtk_usb_stop_traffic(data);
+
+	/* needs_remote_wakeup is only cleared if the interface can be resumed */
+	err = usb_autopm_get_interface(data->intf);
+	if (err < 0)
+		goto failed;
+
+	data->intf->needs_remote_wakeup = 0;
+	usb_autopm_put_interface(data->intf);
+
+failed:
+	usb_scuttle_anchored_urbs(&data->deferred);
+	return 0;
+}
+
+/*
+ * hci_dev flush callback: kill every URB anchored on the transmit anchor.
+ * Always returns 0.
+ */
+static int btmtk_usb_flush(struct hci_dev *hdev)
+{
+	struct btmtk_usb_data *data = hci_get_drvdata(hdev);
+
+	BT_DBG("%s\n", __func__);
+
+	usb_kill_anchored_urbs(&data->tx_anchor);
+
+	return 0;
+}
+
+/*
+ * Completion handler for command and ACL transmit URBs.
+ *
+ * Updates the transmit statistics while the device is HCI_RUNNING,
+ * decrements data->tx_in_flight under data->txlock and frees the setup
+ * packet and the skb that was passed as URB context.
+ */
+static void btmtk_usb_tx_complete(struct urb *urb)
+{
+	struct sk_buff *skb = urb->context;
+	struct hci_dev *hdev = (struct hci_dev *)skb->dev;
+	struct btmtk_usb_data *data = hci_get_drvdata(hdev);
+
+	BT_DBG("%s: %s urb %p status %d count %d\n", __func__, hdev->name,
+					urb, urb->status, urb->actual_length);
+
+	if (test_bit(HCI_RUNNING, &hdev->flags)) {
+		if (urb->status)
+			hdev->stat.err_tx++;
+		else
+			hdev->stat.byte_tx += urb->transfer_buffer_length;
+	}
+
+	spin_lock(&data->txlock);
+	data->tx_in_flight--;
+	spin_unlock(&data->txlock);
+
+	kfree(urb->setup_packet);
+	kfree_skb(skb);
+}
+
+/*
+ * Completion handler for isochronous (SCO) transmit URBs.
+ *
+ * Updates the transmit statistics while the device is HCI_RUNNING and
+ * frees the setup packet and the skb that was passed as URB context.
+ * Unlike btmtk_usb_tx_complete() it does not touch tx_in_flight.
+ */
+static void btmtk_usb_isoc_tx_complete(struct urb *urb)
+{
+	struct sk_buff *skb = urb->context;
+	struct hci_dev *hdev = (struct hci_dev *) skb->dev;
+
+	BT_DBG("%s: %s urb %p status %d count %d", __func__, hdev->name,
+					urb, urb->status, urb->actual_length);
+
+	if (test_bit(HCI_RUNNING, &hdev->flags)) {
+		if (urb->status)
+			hdev->stat.err_tx++;
+		else
+			hdev->stat.byte_tx += urb->transfer_buffer_length;
+	}
+
+	kfree(urb->setup_packet);
+	kfree_skb(skb);
+}
+
+/*
+ * hci_dev send callback: wrap @skb in an URB that matches its packet type
+ * and submit it.
+ *
+ *  - HCI_COMMAND_PKT: control transfer on endpoint 0 with a freshly
+ *    allocated setup packet;
+ *  - HCI_ACLDATA_PKT: bulk OUT transfer;
+ *  - HCI_SCODATA_PKT: isochronous OUT transfer (skips the suspend check).
+ *
+ * For command and ACL packets, if inc_tx() reports that a suspend is in
+ * progress, the URB is parked on data->deferred, the waker work is
+ * scheduled and 0 is returned.
+ *
+ * Returns 0 on success or a negative errno: -EBUSY if the device is not
+ * running, -ENODEV if the required endpoint is missing (or no SCO
+ * connection exists), -ENOMEM on allocation failure, -EILSEQ for an
+ * unknown packet type, or the usb_submit_urb() error.
+ */
+static int btmtk_usb_send_frame(struct sk_buff *skb)
+{
+	struct hci_dev *hdev = (struct hci_dev *)skb->dev;
+	struct btmtk_usb_data *data = hci_get_drvdata(hdev);
+	struct usb_ctrlrequest *dr;
+	struct urb *urb;
+	unsigned int pipe;
+	int err;
+
+	BT_DBG("%s\n", __func__);
+
+	if (!test_bit(HCI_RUNNING, &hdev->flags))
+		return -EBUSY;
+
+	switch (bt_cb(skb)->pkt_type) {
+	case HCI_COMMAND_PKT:
+		urb = usb_alloc_urb(0, GFP_ATOMIC);
+		if (!urb)
+			return -ENOMEM;
+
+		dr = kmalloc(sizeof(*dr), GFP_ATOMIC);
+		if (!dr) {
+			usb_free_urb(urb);
+			return -ENOMEM;
+		}
+
+		dr->bRequestType = data->cmdreq_type;
+		dr->bRequest     = 0;
+		dr->wIndex       = 0;
+		dr->wValue       = 0;
+		dr->wLength      = __cpu_to_le16(skb->len);
+
+		pipe = usb_sndctrlpipe(data->udev, 0x00);
+
+		/* debug output only: dump the opcode fields and the raw command */
+		if (test_bit(HCI_RUNNING, &hdev->flags)) {
+			u16 op_code;
+			memcpy(&op_code, skb->data, 2);
+			BT_DBG("ogf = %x\n", (op_code & 0xfc00) >> 10);
+			BT_DBG("ocf = %x\n", op_code & 0x03ff);
+			hex_dump("hci command", skb->data, skb->len);
+
+		}
+
+		usb_fill_control_urb(urb, data->udev, pipe, (void *) dr,
+				skb->data, skb->len, btmtk_usb_tx_complete, skb);
+
+		hdev->stat.cmd_tx++;
+		break;
+
+	case HCI_ACLDATA_PKT:
+		if (!data->bulk_tx_ep)
+			return -ENODEV;
+
+		urb = usb_alloc_urb(0, GFP_ATOMIC);
+		if (!urb)
+			return -ENOMEM;
+
+		pipe = usb_sndbulkpipe(data->udev,
+					data->bulk_tx_ep->bEndpointAddress);
+
+		usb_fill_bulk_urb(urb, data->udev, pipe,
+				skb->data, skb->len, btmtk_usb_tx_complete, skb);
+
+		hdev->stat.acl_tx++;
+		BT_DBG("HCI_ACLDATA_PKT:\n");
+		break;
+
+	case HCI_SCODATA_PKT:
+		if (!data->isoc_tx_ep || hdev->conn_hash.sco_num < 1)
+			return -ENODEV;
+
+		urb = usb_alloc_urb(BTUSB_MAX_ISOC_FRAMES, GFP_ATOMIC);
+		if (!urb)
+			return -ENOMEM;
+
+		pipe = usb_sndisocpipe(data->udev,
+					data->isoc_tx_ep->bEndpointAddress);
+
+		usb_fill_int_urb(urb, data->udev, pipe,
+				skb->data, skb->len, btmtk_usb_isoc_tx_complete,
+				skb, data->isoc_tx_ep->bInterval);
+
+		urb->transfer_flags  = URB_ISO_ASAP;
+
+		__fill_isoc_descriptor(urb, skb->len,
+				le16_to_cpu(data->isoc_tx_ep->wMaxPacketSize));
+
+		hdev->stat.sco_tx++;
+		BT_DBG("HCI_SCODATA_PKT:\n");
+		/* SCO data is not counted in tx_in_flight and is never deferred */
+		goto skip_waking;
+
+	default:
+		return -EILSEQ;
+	}
+
+	err = inc_tx(data);
+
+	if (err) {
+		/* suspend in progress: park the URB until resume replays it */
+		usb_anchor_urb(urb, &data->deferred);
+		schedule_work(&data->waker);
+		err = 0;
+		goto done;
+	}
+
+skip_waking:
+	usb_anchor_urb(urb, &data->tx_anchor);
+
+	err = usb_submit_urb(urb, GFP_ATOMIC);
+	if (err < 0) {
+		if (err != -EPERM && err != -ENODEV)
+			BT_ERR("%s urb %p submission failed (%d)",
+						hdev->name, urb, -err);
+		kfree(urb->setup_packet);
+		usb_unanchor_urb(urb);
+	} else {
+		usb_mark_last_busy(data->udev);
+	}
+
+done:
+	/* drop the local reference taken by usb_alloc_urb() */
+	usb_free_urb(urb);
+	return err;
+}
+
+/*
+ * hci_dev notify callback: when the number of SCO connections differs
+ * from the cached data->sco_num, update the cache and schedule the work
+ * item that reconfigures the isochronous interface.
+ */
+static void btmtk_usb_notify(struct hci_dev *hdev, unsigned int evt)
+{
+	struct btmtk_usb_data *data = hci_get_drvdata(hdev);
+
+	BT_DBG("%s evt %d", hdev->name, evt);
+
+	if (hdev->conn_hash.sco_num == data->sco_num)
+		return;
+
+	data->sco_num = hdev->conn_hash.sco_num;
+	schedule_work(&data->work);
+}
+
+/*
+ * Select alternate setting @altsetting on USB interface 1 and look up the
+ * isochronous IN and OUT endpoints of that setting again.
+ *
+ * Returns 0 on success, -ENODEV if there is no isochronous interface or
+ * the new setting lacks one of the two endpoints, or the
+ * usb_set_interface() error.
+ */
+static inline int __set_isoc_interface(struct hci_dev *hdev, int altsetting)
+{
+	struct btmtk_usb_data *data = hci_get_drvdata(hdev);
+	struct usb_interface *intf = data->isoc;
+	struct usb_endpoint_descriptor *ep_desc;
+	int i, err;
+
+	if (!data->isoc)
+		return -ENODEV;
+
+	err = usb_set_interface(data->udev, 1, altsetting);
+	if (err < 0) {
+		BT_ERR("%s setting interface failed (%d)", hdev->name, -err);
+		return err;
+	}
+
+	data->isoc_altsetting = altsetting;
+
+	/* forget the endpoints of the previous setting */
+	data->isoc_tx_ep = NULL;
+	data->isoc_rx_ep = NULL;
+
+	/* the first matching endpoint of each direction wins */
+	for (i = 0; i < intf->cur_altsetting->desc.bNumEndpoints; i++) {
+		ep_desc = &intf->cur_altsetting->endpoint[i].desc;
+
+		if (!data->isoc_tx_ep && usb_endpoint_is_isoc_out(ep_desc)) {
+			data->isoc_tx_ep = ep_desc;
+			continue;
+		}
+
+		if (!data->isoc_rx_ep && usb_endpoint_is_isoc_in(ep_desc)) {
+			data->isoc_rx_ep = ep_desc;
+			continue;
+		}
+	}
+
+	if (!data->isoc_tx_ep || !data->isoc_rx_ep) {
+		BT_ERR("%s invalid SCO descriptors", hdev->name);
+		return -ENODEV;
+	}
+
+	return 0;
+}
+
+/*
+ * Work item that brings the isochronous interface in line with the
+ * current number of SCO connections.
+ *
+ * With at least one SCO connection: take an autopm reference (once,
+ * tracked by BTUSB_DID_ISO_RESUME), pick the alternate setting, switch to
+ * it if it changed, and start two isochronous IN URBs if they are not
+ * running yet.  Without SCO connections: stop the isochronous traffic,
+ * select alternate setting 0 and drop the autopm reference.
+ */
+static void btmtk_usb_work(struct work_struct *work)
+{
+	struct btmtk_usb_data *data = container_of(work, struct btmtk_usb_data, work);
+	struct hci_dev *hdev = data->hdev;
+	int new_alts;
+	int err;
+
+	BT_DBG("%s\n", __func__);
+
+	if (hdev->conn_hash.sco_num > 0) {
+		if (!test_bit(BTUSB_DID_ISO_RESUME, &data->flags)) {
+			err = usb_autopm_get_interface(data->isoc ? data->isoc : data->intf);
+			if (err < 0) {
+				clear_bit(BTUSB_ISOC_RUNNING, &data->flags);
+				usb_kill_anchored_urbs(&data->isoc_anchor);
+				return;
+			}
+
+			set_bit(BTUSB_DID_ISO_RESUME, &data->flags);
+		}
+
+		/*
+		 * NOTE(review): alts[] has three entries, so more than three
+		 * SCO connections would index past its end -- confirm that
+		 * sco_num is bounded elsewhere.
+		 */
+		if (hdev->voice_setting & 0x0020) {
+			static const int alts[3] = { 2, 4, 5 };
+			new_alts = alts[hdev->conn_hash.sco_num - 1];
+		} else {
+			new_alts = hdev->conn_hash.sco_num;
+		}
+
+		if (data->isoc_altsetting != new_alts) {
+			clear_bit(BTUSB_ISOC_RUNNING, &data->flags);
+			usb_kill_anchored_urbs(&data->isoc_anchor);
+
+			if (__set_isoc_interface(hdev, new_alts) < 0)
+				return;
+		}
+
+		if (!test_and_set_bit(BTUSB_ISOC_RUNNING, &data->flags)) {
+			/* a second URB is only submitted if the first one went out */
+			if (btmtk_usb_submit_isoc_in_urb(hdev, GFP_KERNEL) < 0)
+				clear_bit(BTUSB_ISOC_RUNNING, &data->flags);
+			else
+				btmtk_usb_submit_isoc_in_urb(hdev, GFP_KERNEL);
+		}
+	} else {
+		clear_bit(BTUSB_ISOC_RUNNING, &data->flags);
+		usb_kill_anchored_urbs(&data->isoc_anchor);
+
+		__set_isoc_interface(hdev, 0);
+
+		if (test_and_clear_bit(BTUSB_DID_ISO_RESUME, &data->flags))
+			 usb_autopm_put_interface(data->isoc ? data->isoc : data->intf);
+	}
+}
+
+/*
+ * Work item scheduled when a transmit URB had to be deferred: takes an
+ * autopm reference on the interface and, if that succeeded, drops it
+ * again right away.
+ */
+static void btmtk_usb_waker(struct work_struct *work)
+{
+	struct btmtk_usb_data *data = container_of(work, struct btmtk_usb_data, waker);
+
+	if (usb_autopm_get_interface(data->intf) < 0)
+		return;
+
+	usb_autopm_put_interface(data->intf);
+}
+
+/*
+ * USB probe callback, bound to interface 0 only.
+ *
+ * Locates the interrupt IN and bulk IN/OUT endpoints, allocates and
+ * initialises the driver state and the hci_dev, claims interface 1 (the
+ * isochronous interface) when it exists, allocates the 256-byte control
+ * transfer buffer, uploads the firmware / ROM patch via
+ * btmtk_usb_cap_init() and finally registers the HCI device.
+ *
+ * Returns 0 on success, -ENODEV for other interfaces or when an endpoint
+ * is missing, -ENOMEM on allocation failure, or the error returned by
+ * usb_driver_claim_interface() / hci_register_dev().
+ *
+ * NOTE(review): on the failure paths after interface 1 has been claimed
+ * the claim is not released -- confirm whether that needs handling.
+ */
+static int btmtk_usb_probe(struct usb_interface *intf,
+					const struct usb_device_id *id)
+{
+	struct btmtk_usb_data *data;
+	struct usb_endpoint_descriptor *ep_desc;
+	int i, err;
+	struct hci_dev *hdev;
+
+	/* interface numbers are hardcoded in the spec */
+	if (intf->cur_altsetting->desc.bInterfaceNumber != 0)
+		return -ENODEV;
+
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
+
+	if (!data)
+		return -ENOMEM;
+
+	/* the first matching endpoint of each kind wins */
+	for (i = 0; i < intf->cur_altsetting->desc.bNumEndpoints; i++) {
+		ep_desc = &intf->cur_altsetting->endpoint[i].desc;
+
+		if (!data->intr_ep && usb_endpoint_is_int_in(ep_desc)) {
+			data->intr_ep = ep_desc;
+			continue;
+		}
+
+		if (!data->bulk_tx_ep && usb_endpoint_is_bulk_out(ep_desc)) {
+			data->bulk_tx_ep = ep_desc;
+			continue;
+		}
+
+		if (!data->bulk_rx_ep && usb_endpoint_is_bulk_in(ep_desc)) {
+			data->bulk_rx_ep = ep_desc;
+			continue;
+		}
+	}
+
+	if (!data->intr_ep || !data->bulk_tx_ep || !data->bulk_rx_ep) {
+		kfree(data);
+		return -ENODEV;
+	}
+
+	data->cmdreq_type = USB_TYPE_CLASS;
+
+	data->udev = interface_to_usbdev(intf);
+	data->intf = intf;
+
+	spin_lock_init(&data->lock);
+	INIT_WORK(&data->work, btmtk_usb_work);
+	INIT_WORK(&data->waker, btmtk_usb_waker);
+	spin_lock_init(&data->txlock);
+
+	init_usb_anchor(&data->tx_anchor);
+	init_usb_anchor(&data->intr_anchor);
+	init_usb_anchor(&data->bulk_anchor);
+	init_usb_anchor(&data->isoc_anchor);
+	init_usb_anchor(&data->deferred);
+
+	hdev = hci_alloc_dev();
+	if (!hdev) {
+		kfree(data);
+		return -ENOMEM;
+	}
+
+	hdev->bus = HCI_USB;
+
+	hci_set_drvdata(hdev, data);
+
+	data->hdev = hdev;
+
+	SET_HCIDEV_DEV(hdev, &intf->dev);
+
+	hdev->open     = btmtk_usb_open;
+	hdev->close    = btmtk_usb_close;
+	hdev->flush    = btmtk_usb_flush;
+	hdev->send     = btmtk_usb_send_frame;
+	hdev->notify   = btmtk_usb_notify;
+
+	/* Interface numbers are hardcoded in the specification */
+	data->isoc = usb_ifnum_to_if(data->udev, 1);
+
+	if (data->isoc) {
+		err = usb_driver_claim_interface(&btmtk_usb_driver,
+							data->isoc, data);
+		if (err < 0) {
+			hci_free_dev(hdev);
+			kfree(data);
+			return err;
+		}
+	}
+
+	data->io_buf = kmalloc(256, GFP_KERNEL);
+	if (!data->io_buf) {
+		hci_free_dev(hdev);
+		kfree(data);
+		return -ENOMEM;
+	}
+
+	btmtk_usb_switch_iobase(data, WLAN);
+
+	btmtk_usb_cap_init(data);
+
+	err = hci_register_dev(hdev);
+	if (err < 0) {
+		/* io_buf is owned by data: free it before data goes away */
+		kfree(data->io_buf);
+		hci_free_dev(hdev);
+		kfree(data);
+		return err;
+	}
+
+	usb_set_intfdata(intf, data);
+
+	return 0;
+}
+
+/*
+ * USB disconnect callback: unregister the HCI device, release the other
+ * claimed interface and free the hci_dev, the control transfer buffer
+ * and the driver state.
+ *
+ * Does nothing when no driver data is attached to @intf.
+ */
+static void btmtk_usb_disconnect(struct usb_interface *intf)
+{
+	struct btmtk_usb_data *data = usb_get_intfdata(intf);
+	struct hci_dev *hdev;
+
+	BT_DBG("%s\n", __func__);
+
+	if (!data)
+		return;
+
+	hdev = data->hdev;
+	/* clear the driver data on both interfaces before tearing down */
+	usb_set_intfdata(data->intf, NULL);
+
+	if (data->isoc)
+		usb_set_intfdata(data->isoc, NULL);
+
+	hci_unregister_dev(hdev);
+
+	/* release whichever interface is not the one being disconnected */
+	if (intf == data->isoc)
+		usb_driver_release_interface(&btmtk_usb_driver, data->intf);
+	else if (data->isoc)
+		usb_driver_release_interface(&btmtk_usb_driver, data->isoc);
+
+	hci_free_dev(hdev);
+
+	kfree(data->io_buf);
+
+	kfree(data);
+}
+
+#ifdef CONFIG_PM
+/*
+ * USB suspend callback.
+ *
+ * Only the first of nested suspend calls does any work (tracked by
+ * data->suspend_count).  An automatic suspend is refused with -EBUSY
+ * while transmissions are in flight; otherwise BTUSB_SUSPENDING is set,
+ * the pending work is cancelled and all receive and transmit URBs are
+ * killed.
+ *
+ * Returns 0 on success or -EBUSY.
+ */
+static int btmtk_usb_suspend(struct usb_interface *intf, pm_message_t message)
+{
+	struct btmtk_usb_data *data = usb_get_intfdata(intf);
+
+	BT_DBG("%s\n", __func__);
+
+	if (data->suspend_count++)
+		return 0;
+
+	spin_lock_irq(&data->txlock);
+	if (!(PMSG_IS_AUTO(message) && data->tx_in_flight)) {
+		set_bit(BTUSB_SUSPENDING, &data->flags);
+		spin_unlock_irq(&data->txlock);
+	} else {
+		spin_unlock_irq(&data->txlock);
+		/* undo the increment above: this suspend did not happen */
+		data->suspend_count--;
+		return -EBUSY;
+	}
+
+	cancel_work_sync(&data->work);
+
+	btmtk_usb_stop_traffic(data);
+	usb_kill_anchored_urbs(&data->tx_anchor);
+
+	return 0;
+}
+
+/*
+ * Submit the transmit URBs that were parked on data->deferred while the
+ * device was suspending, counting each submitted one in tx_in_flight.
+ * Submission stops at the first failure; whatever is still anchored
+ * afterwards is dropped.
+ *
+ * The visible caller (btmtk_usb_resume) holds data->txlock around this.
+ */
+static void play_deferred(struct btmtk_usb_data *data)
+{
+	struct urb *urb;
+	int err;
+
+	while ((urb = usb_get_from_anchor(&data->deferred))) {
+		err = usb_submit_urb(urb, GFP_ATOMIC);
+		if (err < 0)
+			break;
+
+		data->tx_in_flight++;
+	}
+
+	usb_scuttle_anchored_urbs(&data->deferred);
+}
+
+/*
+ * USB resume callback.
+ *
+ * Only the last of nested resume calls does any work (tracked by
+ * data->suspend_count).  If the device is HCI_RUNNING, the receive URBs
+ * of every traffic class that was running are submitted again, the
+ * deferred transmit URBs are replayed and the work item is scheduled.
+ * BTUSB_SUSPENDING is cleared on every path that gets past the
+ * suspend_count check.
+ *
+ * Returns 0 on success or the error of the failed interrupt / bulk URB
+ * submission.
+ */
+static int btmtk_usb_resume(struct usb_interface *intf)
+{
+	struct btmtk_usb_data *data = usb_get_intfdata(intf);
+	struct hci_dev *hdev = data->hdev;
+	int err = 0;
+
+	BT_DBG("%s\n", __func__);
+
+	if (--data->suspend_count)
+		return 0;
+
+	if (!test_bit(HCI_RUNNING, &hdev->flags))
+		goto done;
+
+	if (test_bit(BTUSB_INTR_RUNNING, &data->flags)) {
+		err = btmtk_usb_submit_intr_urb(hdev, GFP_NOIO);
+		if (err < 0) {
+			clear_bit(BTUSB_INTR_RUNNING, &data->flags);
+			goto failed;
+		}
+	}
+
+	if (test_bit(BTUSB_BULK_RUNNING, &data->flags)) {
+		err = btmtk_usb_submit_bulk_in_urb(hdev, GFP_NOIO);
+		if (err < 0) {
+			clear_bit(BTUSB_BULK_RUNNING, &data->flags);
+			goto failed;
+		}
+
+		/* second bulk URB; its result is deliberately not checked */
+		btmtk_usb_submit_bulk_in_urb(hdev, GFP_NOIO);
+	}
+
+	/* a failing isochronous submission does not fail the resume */
+	if (test_bit(BTUSB_ISOC_RUNNING, &data->flags)) {
+		if (btmtk_usb_submit_isoc_in_urb(hdev, GFP_NOIO) < 0)
+			clear_bit(BTUSB_ISOC_RUNNING, &data->flags);
+		else
+			btmtk_usb_submit_isoc_in_urb(hdev, GFP_NOIO);
+	}
+
+	spin_lock_irq(&data->txlock);
+	play_deferred(data);
+	clear_bit(BTUSB_SUSPENDING, &data->flags);
+	spin_unlock_irq(&data->txlock);
+	schedule_work(&data->work);
+
+	return 0;
+
+failed:
+	usb_scuttle_anchored_urbs(&data->deferred);
+done:
+	spin_lock_irq(&data->txlock);
+	clear_bit(BTUSB_SUSPENDING, &data->flags);
+	spin_unlock_irq(&data->txlock);
+
+	return err;
+}
+#endif
+
+/* USB vendor/product ids handled by this driver */
+static struct usb_device_id btmtk_usb_table[] = {
+	/* Mediatek MT7650 */
+	{ USB_DEVICE(0x0e8d, 0x7650) },
+	{ USB_DEVICE(0x0e8d, 0x7630) },
+	{ USB_DEVICE(0x0e8d, 0x763e) },
+	/* Mediatek MT7662 */
+	{ USB_DEVICE(0x0e8d, 0x7662) },
+	{ USB_DEVICE(0x0e8d, 0x7632) },
+	{ }	/* Terminating entry */
+};
+
+/*
+ * USB driver description: probe/disconnect entry points, the id table
+ * and, when power management is configured, the suspend/resume callbacks.
+ */
+static struct usb_driver btmtk_usb_driver = {
+	.name		= "btmtk_usb",
+	.probe		= btmtk_usb_probe,
+	.disconnect	= btmtk_usb_disconnect,
+#ifdef CONFIG_PM
+	.suspend	= btmtk_usb_suspend,
+	.resume		= btmtk_usb_resume,
+#endif
+	.id_table	= btmtk_usb_table,
+	.supports_autosuspend = 1,
+	.disable_hub_initiated_lpm = 1,
+};
+
+/* Module registration and metadata, including the firmware files it requests */
+module_usb_driver(btmtk_usb_driver);
+
+MODULE_DESCRIPTION("Mediatek Bluetooth USB driver ver " VERSION);
+MODULE_VERSION(VERSION);
+MODULE_LICENSE("GPL");
+MODULE_FIRMWARE(MT7650_FIRMWARE);
+MODULE_FIRMWARE(MT7662_FIRMWARE);

diff --git a/drivers/staging/btmtk_usb/btmtk_usb.h b/drivers/staging/btmtk_usb/btmtk_usb.h
new file mode 100644
index 0000000..12f0d3b
--- /dev/null
+++ b/drivers/staging/btmtk_usb/btmtk_usb.h

@@ -0,0 +1,138 @@
+/*
+ *  MediaTek Bluetooth USB Driver
+ *
+ *  Copyright (C) 2013, MediaTek co.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *  or on the worldwide web at
+ *  http://www.gnu.org/licenses/old-licenses/gpl-2.0.txt.
+ *
+ */
+
+#ifndef __BTMTK_USB_H__
+#define __BTMTK_USB_H__	/* must match the #ifndef or the guard is ineffective */
+
+/* Memory map for MTK BT */
+
+/* SYS Control */
+#define SYSCTL	0x400000
+
+/* WLAN */
+#define WLAN		0x410000
+
+/* MCUCTL */
+#define INT_LEVEL		0x0718
+#define COM_REG0		0x0730
+#define SEMAPHORE_00	0x07B0
+#define SEMAPHORE_01	0x07B4
+#define SEMAPHORE_02	0x07B8
+#define SEMAPHORE_03	0x07BC
+
+/* Chip definition */
+
+#define CONTROL_TIMEOUT_JIFFIES ((300 * HZ) / 100)
+#define DEVICE_VENDOR_REQUEST_OUT	0x40
+#define DEVICE_VENDOR_REQUEST_IN	0xc0
+#define DEVICE_CLASS_REQUEST_OUT	0x20
+
+#define BTUSB_MAX_ISOC_FRAMES	10
+#define BTUSB_INTR_RUNNING	0
+#define BTUSB_BULK_RUNNING	1
+#define BTUSB_ISOC_RUNNING	2
+#define BTUSB_SUSPENDING	3
+#define BTUSB_DID_ISO_RESUME	4
+
+/* ROM Patch */
+#define PATCH_HCI_HEADER_SIZE 4
+#define PATCH_WMT_HEADER_SIZE 5
+#define PATCH_HEADER_SIZE (PATCH_HCI_HEADER_SIZE + PATCH_WMT_HEADER_SIZE)
+#define UPLOAD_PATCH_UNIT 2048
+#define PATCH_INFO_SIZE 30
+#define PATCH_PHASE1 1
+#define PATCH_PHASE2 2
+#define PATCH_PHASE3 3
+
+struct btmtk_usb_data {
+	struct hci_dev *hdev;
+	struct usb_device    *udev;
+	struct usb_interface *intf;
+	struct usb_interface *isoc;
+
+	spinlock_t lock;
+
+	unsigned long flags;	/* BTUSB_* bit flags, e.g. BTUSB_INTR_RUNNING */
+	struct work_struct work;	/* scheduled at the end of a successful resume */
+	struct work_struct waker;
+
+	struct usb_anchor tx_anchor;
+	struct usb_anchor intr_anchor;
+	struct usb_anchor bulk_anchor;
+	struct usb_anchor isoc_anchor;
+	struct usb_anchor deferred;	/* scuttled if resume fails */
+	int tx_in_flight;
+	spinlock_t txlock;	/* held while BTUSB_SUSPENDING is cleared on resume */
+
+	struct usb_endpoint_descriptor *intr_ep;
+	struct usb_endpoint_descriptor *bulk_tx_ep;
+	struct usb_endpoint_descriptor *bulk_rx_ep;
+	struct usb_endpoint_descriptor *isoc_tx_ep;
+	struct usb_endpoint_descriptor *isoc_rx_ep;
+
+	__u8 cmdreq_type;
+
+	unsigned int sco_num;
+	int isoc_altsetting;
+	int suspend_count;
+
+	/* request for different io operation */
+	u8 w_request;
+	u8 r_request;
+
+	/* io buffer for usb control transfer */
+	char *io_buf;
+
+	struct semaphore fw_upload_sem;
+
+	/* unsigned char *fw_image; */
+	/* unsigned char *rom_patch; */
+	const struct firmware *firmware;
+	u32 chip_id;	/* upper 16 bits are tested by the is_mt76xx() helpers */
+	u8 need_load_fw;
+	u8 need_load_rom_patch;
+	u32 rom_patch_offset;
+	u32 rom_patch_len;
+};
+
+static inline int is_mt7630(struct btmtk_usb_data *data)
+{
+	return (data->chip_id >> 16) == 0x7630;	/* match on upper 16 bits only */
+}
+
+static inline int is_mt7650(struct btmtk_usb_data *data)
+{
+	return (data->chip_id >> 16) == 0x7650;	/* match on upper 16 bits only */
+}
+
+static inline int is_mt7632(struct btmtk_usb_data *data)
+{
+	return (data->chip_id >> 16) == 0x7632;	/* match on upper 16 bits only */
+}
+
+static inline int is_mt7662(struct btmtk_usb_data *data)
+{
+	return (data->chip_id >> 16) == 0x7662;	/* match on upper 16 bits only */
+}
+
+#endif

diff --git a/drivers/staging/ced1401/ced_ioc.c b/drivers/staging/ced1401/ced_ioc.c
index 82a333f..2dbaf39e 100644
--- a/drivers/staging/ced1401/ced_ioc.c
+++ b/drivers/staging/ced1401/ced_ioc.c

@@ -37,13 +37,14 @@
 **
 ** Empties the Output buffer and sets int lines. Used from user level only
 ****************************************************************************/
-void FlushOutBuff(DEVICE_EXTENSION * pdx)
+static void FlushOutBuff(DEVICE_EXTENSION *pdx)
 {
 	dev_dbg(&pdx->interface->dev, "%s currentState=%d", __func__,
 		pdx->sCurrentState);
 	if (pdx->sCurrentState == U14ERR_TIME)	/* Do nothing if hardware in trouble */
 		return;
-//    CharSend_Cancel(pdx);                   /* Kill off any pending I/O */
+	/* Kill off any pending I/O */
+	/* CharSend_Cancel(pdx);  */
 	spin_lock_irq(&pdx->charOutLock);
 	pdx->dwNumOutput = 0;
 	pdx->dwOutBuffGet = 0;
@@ -57,13 +58,14 @@
 **
 ** Empties the input buffer and sets int lines
 ****************************************************************************/
-void FlushInBuff(DEVICE_EXTENSION * pdx)
+static void FlushInBuff(DEVICE_EXTENSION *pdx)
 {
 	dev_dbg(&pdx->interface->dev, "%s currentState=%d", __func__,
 		pdx->sCurrentState);
 	if (pdx->sCurrentState == U14ERR_TIME)	/* Do nothing if hardware in trouble */
 		return;
-//    CharRead_Cancel(pDevObject);            /* Kill off any pending I/O */
+	/* Kill off any pending I/O */
+	/*     CharRead_Cancel(pDevObject);  */
 	spin_lock_irq(&pdx->charInLock);
 	pdx->dwNumInput = 0;
 	pdx->dwInBuffGet = 0;
@@ -77,11 +79,11 @@
 ** Utility routine to copy chars into the output buffer and fire them off.
 ** called from user mode, holds charOutLock.
 ****************************************************************************/
-static int PutChars(DEVICE_EXTENSION * pdx, const char *pCh,
+static int PutChars(DEVICE_EXTENSION *pdx, const char *pCh,
 		    unsigned int uCount)
 {
 	int iReturn;
-	spin_lock_irq(&pdx->charOutLock);	// get the output spin lock
+	spin_lock_irq(&pdx->charOutLock);	/*  get the output spin lock */
 	if ((OUTBUF_SZ - pdx->dwNumOutput) >= uCount) {
 		unsigned int u;
 		for (u = 0; u < uCount; u++) {
@@ -91,9 +93,9 @@
 		}
 		pdx->dwNumOutput += uCount;
 		spin_unlock_irq(&pdx->charOutLock);
-		iReturn = SendChars(pdx);	// ...give a chance to transmit data
+		iReturn = SendChars(pdx);	/*  ...give a chance to transmit data */
 	} else {
-		iReturn = U14ERR_NOOUT;	// no room at the out (ha-ha)
+		iReturn = U14ERR_NOOUT;	/*  no room at the out (ha-ha) */
 		spin_unlock_irq(&pdx->charOutLock);
 	}
 	return iReturn;
@@ -104,26 +106,25 @@
 ** trigger an output transfer if this is appropriate. User mode.
 ** Holds the io_mutex
 *****************************************************************************/
-int SendString(DEVICE_EXTENSION * pdx, const char __user * pData,
+int SendString(DEVICE_EXTENSION *pdx, const char __user *pData,
 	       unsigned int n)
 {
-	int iReturn = U14ERR_NOERROR;	// assume all will be well
-	char buffer[OUTBUF_SZ + 1];	// space in our address space for characters
-	if (n > OUTBUF_SZ)	// check space in local buffer...
-		return U14ERR_NOOUT;	// ...too many characters
+	int iReturn = U14ERR_NOERROR;	/*  assume all will be well */
+	char buffer[OUTBUF_SZ + 1];	/*  space in our address space for characters */
+	if (n > OUTBUF_SZ)	/*  check space in local buffer... */
+		return U14ERR_NOOUT;	/*  ...too many characters */
 	if (copy_from_user(buffer, pData, n))
 		return -EFAULT;
-	buffer[n] = 0;		// terminate for debug purposes
+	buffer[n] = 0;		/*  terminate for debug purposes */
 
-	mutex_lock(&pdx->io_mutex);	// Protect disconnect from new i/o
-	if (n > 0)		// do nothing if nowt to do!
-	{
+	mutex_lock(&pdx->io_mutex);	/*  Protect disconnect from new i/o */
+	if (n > 0) {		/*  do nothing if nowt to do! */
 		dev_dbg(&pdx->interface->dev, "%s n=%d>%s<", __func__, n,
 			buffer);
 		iReturn = PutChars(pdx, buffer, n);
 	}
 
-	Allowi(pdx);		// make sure we have input int
+	Allowi(pdx);		/*  make sure we have input int */
 	mutex_unlock(&pdx->io_mutex);
 
 	return iReturn;
@@ -134,13 +135,13 @@
 **
 ** Sends a single character to the 1401. User mode, holds io_mutex.
 ****************************************************************************/
-int SendChar(DEVICE_EXTENSION * pdx, char c)
+int SendChar(DEVICE_EXTENSION *pdx, char c)
 {
 	int iReturn;
-	mutex_lock(&pdx->io_mutex);	// Protect disconnect from new i/o
+	mutex_lock(&pdx->io_mutex);	/*  Protect disconnect from new i/o */
 	iReturn = PutChars(pdx, &c, 1);
 	dev_dbg(&pdx->interface->dev, "SendChar >%c< (0x%02x)", c, c);
-	Allowi(pdx);	// Make sure char reads are running
+	Allowi(pdx);	/*  Make sure char reads are running */
 	mutex_unlock(&pdx->io_mutex);
 	return iReturn;
 }
@@ -171,20 +172,20 @@
 **
 ** return error code (U14ERR_NOERROR for OK)
 */
-int Get1401State(DEVICE_EXTENSION * pdx, __u32 * state, __u32 * error)
+int Get1401State(DEVICE_EXTENSION *pdx, __u32 *state, __u32 *error)
 {
 	int nGot;
 	dev_dbg(&pdx->interface->dev, "Get1401State() entry");
 
-	*state = 0xFFFFFFFF;	// Start off with invalid state
+	*state = 0xFFFFFFFF;	/*  Start off with invalid state */
 	nGot = usb_control_msg(pdx->udev, usb_rcvctrlpipe(pdx->udev, 0),
 			       GET_STATUS, (D_TO_H | VENDOR | DEVREQ), 0, 0,
 			       pdx->statBuf, sizeof(pdx->statBuf), HZ);
 	if (nGot != sizeof(pdx->statBuf)) {
 		dev_err(&pdx->interface->dev,
 			"Get1401State() FAILED, return code %d", nGot);
-		pdx->sCurrentState = U14ERR_TIME;	// Indicate that things are very wrong indeed
-		*state = 0;	// Force status values to a known state
+		pdx->sCurrentState = U14ERR_TIME;	/*  Indicate that things are very wrong indeed */
+		*state = 0;	/*  Force status values to a known state */
 		*error = 0;
 	} else {
 		int nDevice;
@@ -192,17 +193,16 @@
 			"Get1401State() Success, state: 0x%x, 0x%x",
 			pdx->statBuf[0], pdx->statBuf[1]);
 
-		*state = pdx->statBuf[0];	// Return the state values to the calling code
+		*state = pdx->statBuf[0];	/*  Return the state values to the calling code */
 		*error = pdx->statBuf[1];
 
-		nDevice = pdx->udev->descriptor.bcdDevice >> 8;	// 1401 type code value
-		switch (nDevice)	// so we can clean up current state
-		{
+		nDevice = pdx->udev->descriptor.bcdDevice >> 8;	/*  1401 type code value */
+		switch (nDevice) {	/*  so we can clean up current state */
 		case 0:
 			pdx->sCurrentState = U14ERR_U1401;
 			break;
 
-		default:	// allow lots of device codes for future 1401s
+		default:	/*  allow lots of device codes for future 1401s */
 			if ((nDevice >= 1) && (nDevice <= 23))
 				pdx->sCurrentState = (short)(nDevice + 6);
 			else
@@ -219,7 +219,7 @@
 **
 ** Kills off staged read\write request from the USB if one is pending.
 ****************************************************************************/
-int ReadWrite_Cancel(DEVICE_EXTENSION * pdx)
+int ReadWrite_Cancel(DEVICE_EXTENSION *pdx)
 {
 	dev_dbg(&pdx->interface->dev, "ReadWrite_Cancel entry %d",
 		pdx->bStagedUrbPending);
@@ -227,24 +227,23 @@
 	int ntStatus = STATUS_SUCCESS;
 	bool bResult = false;
 	unsigned int i;
-	// We can fill this in when we know how we will implement the staged transfer stuff
+	/*  We can fill this in when we know how we will implement the staged transfer stuff */
 	spin_lock_irq(&pdx->stagedLock);
 
-	if (pdx->bStagedUrbPending)	// anything to be cancelled? May need more...
-	{
+	if (pdx->bStagedUrbPending) {	/*  anything to be cancelled? May need more... */
 		dev_info(&pdx->interface - dev,
 			 "ReadWrite_Cancel about to cancel Urb");
-
-		//       KeClearEvent(&pdx->StagingDoneEvent);   // Clear the staging done flag
+		/* Clear the staging done flag */
+		/* KeClearEvent(&pdx->StagingDoneEvent); */
 		USB_ASSERT(pdx->pStagedIrp != NULL);
 
-		// Release the spinlock first otherwise the completion routine may hang
-		//  on the spinlock while this function hands waiting for the event.
+		/*  Release the spinlock first otherwise the completion routine may hang */
+		/*   on the spinlock while this function hangs waiting for the event. */
 		spin_unlock_irq(&pdx->stagedLock);
-		bResult = IoCancelIrp(pdx->pStagedIrp);	// Actually do the cancel
+		bResult = IoCancelIrp(pdx->pStagedIrp);	/*  Actually do the cancel */
 		if (bResult) {
 			LARGE_INTEGER timeout;
-			timeout.QuadPart = -10000000;	// Use a timeout of 1 second
+			timeout.QuadPart = -10000000;	/*  Use a timeout of 1 second */
 			dev_info(&pdx->interface - dev,
 				 "ReadWrite_Cancel about to wait till done");
 			ntStatus =
@@ -274,14 +273,14 @@
 ** InSelfTest - utility to check in self test. Return 1 for ST, 0 for not or
 ** a -ve error code if we failed for some reason.
 ***************************************************************************/
-static int InSelfTest(DEVICE_EXTENSION * pdx, unsigned int *pState)
+static int InSelfTest(DEVICE_EXTENSION *pdx, unsigned int *pState)
 {
 	unsigned int state, error;
-	int iReturn = Get1401State(pdx, &state, &error);	// see if in self-test
-	if (iReturn == U14ERR_NOERROR)	// if all still OK
-		iReturn = (state == (unsigned int)-1) ||	// TX problem or...
-		    ((state & 0xff) == 0x80);	// ...self test
-	*pState = state;	// return actual state
+	int iReturn = Get1401State(pdx, &state, &error);	/*  see if in self-test */
+	if (iReturn == U14ERR_NOERROR)	/*  if all still OK */
+		iReturn = (state == (unsigned int)-1) ||	/*  TX problem or... */
+		    ((state & 0xff) == 0x80);	/*  ...self test */
+	*pState = state;	/*  return actual state */
 	return iReturn;
 }
 
@@ -303,48 +302,45 @@
 **
 **  Returns TRUE if a 1401 detected and OK, else FALSE
 ****************************************************************************/
-bool Is1401(DEVICE_EXTENSION * pdx)
+bool Is1401(DEVICE_EXTENSION *pdx)
 {
 	int iReturn;
 	dev_dbg(&pdx->interface->dev, "%s", __func__);
 
-	ced_draw_down(pdx);	// wait for, then kill outstanding Urbs
-	FlushInBuff(pdx);	// Clear out input buffer & pipe
-	FlushOutBuff(pdx);	// Clear output buffer & pipe
+	ced_draw_down(pdx);	/*  wait for, then kill outstanding Urbs */
+	FlushInBuff(pdx);	/*  Clear out input buffer & pipe */
+	FlushOutBuff(pdx);	/*  Clear output buffer & pipe */
 
-	// The next call returns 0 if OK, but has returned 1 in the past, meaning that
-	// usb_unlock_device() is needed... now it always is
+	/*  The next call returns 0 if OK, but has returned 1 in the past, meaning that */
+	/*  usb_unlock_device() is needed... now it always is */
 	iReturn = usb_lock_device_for_reset(pdx->udev, pdx->interface);
 
-	// release the io_mutex because if we don't, we will deadlock due to system
-	// calls back into the driver.
-	mutex_unlock(&pdx->io_mutex);	// locked, so we will not get system calls
-	if (iReturn >= 0)	// if we failed
-	{
-		iReturn = usb_reset_device(pdx->udev);	// try to do the reset
-		usb_unlock_device(pdx->udev);	// undo the lock
+	/*  release the io_mutex because if we don't, we will deadlock due to system */
+	/*  calls back into the driver. */
+	mutex_unlock(&pdx->io_mutex);	/*  locked, so we will not get system calls */
+	if (iReturn >= 0) {	/*  if we got the lock */
+		iReturn = usb_reset_device(pdx->udev);	/*  try to do the reset */
+		usb_unlock_device(pdx->udev);	/*  undo the lock */
 	}
 
-	mutex_lock(&pdx->io_mutex);	// hold stuff off while we wait
-	pdx->dwDMAFlag = MODE_CHAR;	// Clear DMA mode flag regardless!
-	if (iReturn == 0)	// if all is OK still
-	{
+	mutex_lock(&pdx->io_mutex);	/*  hold stuff off while we wait */
+	pdx->dwDMAFlag = MODE_CHAR;	/*  Clear DMA mode flag regardless! */
+	if (iReturn == 0) {	/*  if all is OK still */
 		unsigned int state;
-		iReturn = InSelfTest(pdx, &state);	// see if likely in self test
-		if (iReturn > 0)	// do we need to wait for self-test?
-		{
-			unsigned long ulTimeOut = jiffies + 30 * HZ;	// when to give up
+		iReturn = InSelfTest(pdx, &state);	/*  see if likely in self test */
+		if (iReturn > 0) {	/*  do we need to wait for self-test? */
+			unsigned long ulTimeOut = jiffies + 30 * HZ;	/*  when to give up */
 			while ((iReturn > 0) && time_before(jiffies, ulTimeOut)) {
-				schedule();	// let other stuff run
-				iReturn = InSelfTest(pdx, &state);	// see if done yet
+				schedule();	/*  let other stuff run */
+				iReturn = InSelfTest(pdx, &state);	/*  see if done yet */
 			}
 		}
 
-		if (iReturn == 0)	// if all is OK...
-			iReturn = state == 0;	// then success is that the state is 0
+		if (iReturn == 0)	/*  if all is OK... */
+			iReturn = state == 0;	/*  then success is that the state is 0 */
 	} else
-		iReturn = 0;	// we failed
-	pdx->bForceReset = false;	// Clear forced reset flag now
+		iReturn = 0;	/*  we failed */
+	pdx->bForceReset = false;	/*  Clear forced reset flag now */
 
 	return iReturn > 0;
 }
@@ -363,45 +359,42 @@
 **
 ** The return value is TRUE if a useable 1401 is found, FALSE if not
 */
-bool QuickCheck(DEVICE_EXTENSION * pdx, bool bTestBuff, bool bCanReset)
+bool QuickCheck(DEVICE_EXTENSION *pdx, bool bTestBuff, bool bCanReset)
 {
-	bool bRet = false;	// assume it will fail and we will reset
+	bool bRet = false;	/*  assume it will fail and we will reset */
 	bool bShortTest;
 
-	bShortTest = ((pdx->dwDMAFlag == MODE_CHAR) &&	// no DMA running
-		      (!pdx->bForceReset) &&	// Not had a real reset forced
-		      (pdx->sCurrentState >= U14ERR_STD));	// No 1401 errors stored
+	bShortTest = ((pdx->dwDMAFlag == MODE_CHAR) &&	/*  no DMA running */
+		      (!pdx->bForceReset) &&	/*  Not had a real reset forced */
+		      (pdx->sCurrentState >= U14ERR_STD));	/*  No 1401 errors stored */
 
 	dev_dbg(&pdx->interface->dev,
 		"%s DMAFlag:%d, state:%d, force:%d, testBuff:%d, short:%d",
 		__func__, pdx->dwDMAFlag, pdx->sCurrentState, pdx->bForceReset,
 		bTestBuff, bShortTest);
 
-	if ((bTestBuff) &&	// Buffer check requested, and...
-	    (pdx->dwNumInput || pdx->dwNumOutput))	// ...characters were in the buffer?
-	{
-		bShortTest = false;	// Then do the full test
+	if ((bTestBuff) &&	/*  Buffer check requested, and... */
+	    (pdx->dwNumInput || pdx->dwNumOutput)) {	/*  ...characters were in the buffer? */
+		bShortTest = false;	/*  Then do the full test */
 		dev_dbg(&pdx->interface->dev,
 			"%s will reset as buffers not empty", __func__);
 	}
 
-	if (bShortTest || !bCanReset)	// Still OK to try the short test?
-	{			// Always test if no reset - we want state update
+	if (bShortTest || !bCanReset) {	/*  Still OK to try the short test? */
+				/*  Always test if no reset - we want state update */
 		unsigned int state, error;
 		dev_dbg(&pdx->interface->dev, "%s->Get1401State", __func__);
-		if (Get1401State(pdx, &state, &error) == U14ERR_NOERROR)	// Check on the 1401 state
-		{
-			if ((state & 0xFF) == 0)	// If call worked, check the status value
-				bRet = true;	// If that was zero, all is OK, no reset needed
+		if (Get1401State(pdx, &state, &error) == U14ERR_NOERROR) {	/*  Check on the 1401 state */
+			if ((state & 0xFF) == 0)	/*  If call worked, check the status value */
+				bRet = true;	/*  If that was zero, all is OK, no reset needed */
 		}
 	}
 
-	if (!bRet && bCanReset)	// If all not OK, then
-	{
+	if (!bRet && bCanReset)	{ /*  If all not OK, then */
 		dev_info(&pdx->interface->dev, "%s->Is1401 %d %d %d %d",
 			 __func__, bShortTest, pdx->sCurrentState, bTestBuff,
 			 pdx->bForceReset);
-		bRet = Is1401(pdx);	//  do full test
+		bRet = Is1401(pdx);	/*   do full test */
 	}
 
 	return bRet;
@@ -412,11 +405,11 @@
 **
 ** Resets the 1401 and empties the i/o buffers
 *****************************************************************************/
-int Reset1401(DEVICE_EXTENSION * pdx)
+int Reset1401(DEVICE_EXTENSION *pdx)
 {
-	mutex_lock(&pdx->io_mutex);	// Protect disconnect from new i/o
+	mutex_lock(&pdx->io_mutex);	/*  Protect disconnect from new i/o */
 	dev_dbg(&pdx->interface->dev, "ABout to call QuickCheck");
-	QuickCheck(pdx, true, true);	// Check 1401, reset if not OK
+	QuickCheck(pdx, true, true);	/*  Check 1401, reset if not OK */
 	mutex_unlock(&pdx->io_mutex);
 	return U14ERR_NOERROR;
 }
@@ -426,30 +419,29 @@
 **
 ** Gets a single character from the 1401
 ****************************************************************************/
-int GetChar(DEVICE_EXTENSION * pdx)
+int GetChar(DEVICE_EXTENSION *pdx)
 {
-	int iReturn = U14ERR_NOIN;	// assume we will get  nothing
-	mutex_lock(&pdx->io_mutex);	// Protect disconnect from new i/o
+	int iReturn = U14ERR_NOIN;	/*  assume we will get  nothing */
+	mutex_lock(&pdx->io_mutex);	/*  Protect disconnect from new i/o */
 
 	dev_dbg(&pdx->interface->dev, "GetChar");
 
-	Allowi(pdx);	// Make sure char reads are running
-	SendChars(pdx);	// and send any buffered chars
+	Allowi(pdx);	/*  Make sure char reads are running */
+	SendChars(pdx);	/*  and send any buffered chars */
 
 	spin_lock_irq(&pdx->charInLock);
-	if (pdx->dwNumInput > 0)	// worth looking
-	{
+	if (pdx->dwNumInput > 0) {	/*  worth looking */
 		iReturn = pdx->inputBuffer[pdx->dwInBuffGet++];
 		if (pdx->dwInBuffGet >= INBUF_SZ)
 			pdx->dwInBuffGet = 0;
 		pdx->dwNumInput--;
 	} else
-		iReturn = U14ERR_NOIN;	// no input data to read
+		iReturn = U14ERR_NOIN;	/*  no input data to read */
 	spin_unlock_irq(&pdx->charInLock);
 
-	Allowi(pdx);	// Make sure char reads are running
+	Allowi(pdx);	/*  Make sure char reads are running */
 
-	mutex_unlock(&pdx->io_mutex);	// Protect disconnect from new i/o
+	mutex_unlock(&pdx->io_mutex);	/*  Protect disconnect from new i/o */
 	return iReturn;
 }
 
@@ -464,46 +456,43 @@
 ** returns the count of characters (including the terminator, or 0 if none
 ** or a negative error code.
 ****************************************************************************/
-int GetString(DEVICE_EXTENSION * pdx, char __user * pUser, int n)
+int GetString(DEVICE_EXTENSION *pdx, char __user *pUser, int n)
 {
-	int nAvailable;		// character in the buffer
+	int nAvailable;		/*  character in the buffer */
 	int iReturn = U14ERR_NOIN;
 	if (n <= 0)
 		return -ENOMEM;
 
-	mutex_lock(&pdx->io_mutex);	// Protect disconnect from new i/o
-	Allowi(pdx);	// Make sure char reads are running
-	SendChars(pdx);		// and send any buffered chars
+	mutex_lock(&pdx->io_mutex);	/*  Protect disconnect from new i/o */
+	Allowi(pdx);	/*  Make sure char reads are running */
+	SendChars(pdx);		/*  and send any buffered chars */
 
 	spin_lock_irq(&pdx->charInLock);
-	nAvailable = pdx->dwNumInput;	// characters available now
-	if (nAvailable > n)	// read max of space in pUser...
-		nAvailable = n;	// ...or input characters
+	nAvailable = pdx->dwNumInput;	/*  characters available now */
+	if (nAvailable > n)	/*  read max of space in pUser... */
+		nAvailable = n;	/*  ...or input characters */
 
-	if (nAvailable > 0)	// worth looking?
-	{
-		char buffer[INBUF_SZ + 1];	// space for a linear copy of data
+	if (nAvailable > 0) {	/*  worth looking? */
+		char buffer[INBUF_SZ + 1];	/*  space for a linear copy of data */
 		int nGot = 0;
-		int nCopyToUser;	// number to copy to user
+		int nCopyToUser;	/*  number to copy to user */
 		char cData;
 		do {
 			cData = pdx->inputBuffer[pdx->dwInBuffGet++];
-			if (cData == CR_CHAR)	// replace CR with zero
+			if (cData == CR_CHAR)	/*  replace CR with zero */
 				cData = (char)0;
 
 			if (pdx->dwInBuffGet >= INBUF_SZ)
-				pdx->dwInBuffGet = 0;	// wrap buffer pointer
+				pdx->dwInBuffGet = 0;	/*  wrap buffer pointer */
 
-			buffer[nGot++] = cData;	// save the output
-		}
-		while ((nGot < nAvailable) && cData);
+			buffer[nGot++] = cData;	/*  save the output */
+		} while ((nGot < nAvailable) && cData);
 
-		nCopyToUser = nGot;	// what to copy...
-		if (cData)	// do we need null
-		{
-			buffer[nGot] = (char)0;	// make it tidy
-			if (nGot < n)	// if space in user buffer...
-				++nCopyToUser;	// ...copy the 0 as well.
+		nCopyToUser = nGot;	/*  what to copy... */
+		if (cData) {	/*  do we need null */
+			buffer[nGot] = (char)0;	/*  make it tidy */
+			if (nGot < n)	/*  if space in user buffer... */
+				++nCopyToUser;	/*  ...copy the 0 as well. */
 		}
 
 		pdx->dwNumInput -= nGot;
@@ -514,12 +503,12 @@
 		if (copy_to_user(pUser, buffer, nCopyToUser))
 			iReturn = -EFAULT;
 		else
-			iReturn = nGot;		// report characters read
+			iReturn = nGot;		/*  report characters read */
 	} else
 		spin_unlock_irq(&pdx->charInLock);
 
-	Allowi(pdx);	// Make sure char reads are running
-	mutex_unlock(&pdx->io_mutex);	// Protect disconnect from new i/o
+	Allowi(pdx);	/*  Make sure char reads are running */
+	mutex_unlock(&pdx->io_mutex);	/*  Protect disconnect from new i/o */
 
 	return iReturn;
 }
@@ -527,14 +516,14 @@
 /*******************************************************************************
 ** Get count of characters in the inout buffer.
 *******************************************************************************/
-int Stat1401(DEVICE_EXTENSION * pdx)
+int Stat1401(DEVICE_EXTENSION *pdx)
 {
 	int iReturn;
-	mutex_lock(&pdx->io_mutex);	// Protect disconnect from new i/o
-	Allowi(pdx);		// make sure we allow pending chars
-	SendChars(pdx);		// in both directions
-	iReturn = pdx->dwNumInput;	// no lock as single read
-	mutex_unlock(&pdx->io_mutex);	// Protect disconnect from new i/o
+	mutex_lock(&pdx->io_mutex);	/*  Protect disconnect from new i/o */
+	Allowi(pdx);		/*  make sure we allow pending chars */
+	SendChars(pdx);		/*  in both directions */
+	iReturn = pdx->dwNumInput;	/*  no lock as single read */
+	mutex_unlock(&pdx->io_mutex);	/*  Protect disconnect from new i/o */
 	return iReturn;
 }
 
@@ -545,32 +534,30 @@
 ** any fancy interlocks as we only read the interrupt routine data, and the
 ** system is arranged so nothing can be destroyed.
 ****************************************************************************/
-int LineCount(DEVICE_EXTENSION * pdx)
+int LineCount(DEVICE_EXTENSION *pdx)
 {
-	int iReturn = 0;	// will be count of line ends
+	int iReturn = 0;	/*  will be count of line ends */
 
-	mutex_lock(&pdx->io_mutex);	// Protect disconnect from new i/o
-	Allowi(pdx);		// Make sure char reads are running
-	SendChars(pdx);		// and send any buffered chars
-	spin_lock_irq(&pdx->charInLock);	// Get protection
+	mutex_lock(&pdx->io_mutex);	/*  Protect disconnect from new i/o */
+	Allowi(pdx);		/*  Make sure char reads are running */
+	SendChars(pdx);		/*  and send any buffered chars */
+	spin_lock_irq(&pdx->charInLock);	/*  Get protection */
 
-	if (pdx->dwNumInput > 0)	// worth looking?
-	{
-		unsigned int dwIndex = pdx->dwInBuffGet;	// start at first available
-		unsigned int dwEnd = pdx->dwInBuffPut;	// Position for search end
+	if (pdx->dwNumInput > 0) {	/*  worth looking? */
+		unsigned int dwIndex = pdx->dwInBuffGet;	/*  start at first available */
+		unsigned int dwEnd = pdx->dwInBuffPut;	/*  Position for search end */
 		do {
 			if (pdx->inputBuffer[dwIndex++] == CR_CHAR)
-				++iReturn;	// inc count if CR
+				++iReturn;	/*  inc count if CR */
 
-			if (dwIndex >= INBUF_SZ)	// see if we fall off buff
+			if (dwIndex >= INBUF_SZ)	/*  see if we fall off buff */
 				dwIndex = 0;
-		}
-		while (dwIndex != dwEnd);	// go to last available
+		} while (dwIndex != dwEnd);	/*  go to last available */
 	}
 
 	spin_unlock_irq(&pdx->charInLock);
 	dev_dbg(&pdx->interface->dev, "LineCount returned %d", iReturn);
-	mutex_unlock(&pdx->io_mutex);	// Protect disconnect from new i/o
+	mutex_unlock(&pdx->io_mutex);	/*  Protect disconnect from new i/o */
 	return iReturn;
 }
 
@@ -579,14 +566,14 @@
 **
 ** Gets the space in the output buffer. Called from user code.
 *****************************************************************************/
-int GetOutBufSpace(DEVICE_EXTENSION * pdx)
+int GetOutBufSpace(DEVICE_EXTENSION *pdx)
 {
 	int iReturn;
-	mutex_lock(&pdx->io_mutex);	// Protect disconnect from new i/o
-	SendChars(pdx);		// send any buffered chars
-	iReturn = (int)(OUTBUF_SZ - pdx->dwNumOutput);	// no lock needed for single read
+	mutex_lock(&pdx->io_mutex);	/*  Protect disconnect from new i/o */
+	SendChars(pdx);		/*  send any buffered chars */
+	iReturn = (int)(OUTBUF_SZ - pdx->dwNumOutput);	/*  no lock needed for single read */
 	dev_dbg(&pdx->interface->dev, "OutBufSpace %d", iReturn);
-	mutex_unlock(&pdx->io_mutex);	// Protect disconnect from new i/o
+	mutex_unlock(&pdx->io_mutex);	/*  Protect disconnect from new i/o */
 	return iReturn;
 }
 
@@ -597,7 +584,7 @@
 ** Clears up a transfer area. This is always called in the context of a user
 ** request, never from a call-back.
 ****************************************************************************/
-int ClearArea(DEVICE_EXTENSION * pdx, int nArea)
+int ClearArea(DEVICE_EXTENSION *pdx, int nArea)
 {
 	int iReturn = U14ERR_NOERROR;
 
@@ -606,14 +593,14 @@
 		dev_err(&pdx->interface->dev, "%s Attempt to clear area %d",
 			__func__, nArea);
 	} else {
-		TRANSAREA *pTA = &pdx->rTransDef[nArea];	// to save typing
-		if (!pTA->bUsed)	// if not used...
-			iReturn = U14ERR_NOTSET;	// ...nothing to be done
+		TRANSAREA *pTA = &pdx->rTransDef[nArea];	/*  to save typing */
+		if (!pTA->bUsed)	/*  if not used... */
+			iReturn = U14ERR_NOTSET;	/*  ...nothing to be done */
 		else {
-			// We must save the memory we return as we shouldn't mess with memory while
-			// holding a spin lock.
-			struct page **pPages = 0;	// save page address list
-			int nPages = 0;	// and number of pages
+			/*  We must save the memory we return as we shouldn't mess with memory while */
+			/*  holding a spin lock. */
+			struct page **pPages = NULL; /*save page address list*/
+			int nPages = 0;	/*  and number of pages */
 			int np;
 
 			dev_dbg(&pdx->interface->dev, "%s area %d", __func__,
@@ -621,33 +608,32 @@
 			spin_lock_irq(&pdx->stagedLock);
 			if ((pdx->StagedId == nArea)
 			    && (pdx->dwDMAFlag > MODE_CHAR)) {
-				iReturn = U14ERR_UNLOCKFAIL;	// cannot delete as in use
+				iReturn = U14ERR_UNLOCKFAIL;	/*  cannot delete as in use */
 				dev_err(&pdx->interface->dev,
 					"%s call on area %d while active",
 					__func__, nArea);
 			} else {
-				pPages = pTA->pPages;	// save page address list
-				nPages = pTA->nPages;	// and page count
-				if (pTA->dwEventSz)	// if events flagging in use
-					wake_up_interruptible(&pTA->wqEvent);	// release anything that was waiting
+				pPages = pTA->pPages;	/*  save page address list */
+				nPages = pTA->nPages;	/*  and page count */
+				if (pTA->dwEventSz)	/*  if events flagging in use */
+					wake_up_interruptible(&pTA->wqEvent);	/*  release anything that was waiting */
 
 				if (pdx->bXFerWaiting
 				    && (pdx->rDMAInfo.wIdent == nArea))
-					pdx->bXFerWaiting = false;	// Cannot have pending xfer if area cleared
+					pdx->bXFerWaiting = false;	/*  Cannot have pending xfer if area cleared */
 
-				// Clean out the TRANSAREA except for the wait queue, which is at the end
-				// This sets bUsed to false and dwEventSz to 0 to say area not used and no events.
+				/*  Clean out the TRANSAREA except for the wait queue, which is at the end */
+				/*  This sets bUsed to false and dwEventSz to 0 to say area not used and no events. */
 				memset(pTA, 0,
 				       sizeof(TRANSAREA) -
 				       sizeof(wait_queue_head_t));
 			}
 			spin_unlock_irq(&pdx->stagedLock);
 
-			if (pPages)	// if we decided to release the memory
-			{
-				// Now we must undo the pinning down of the pages. We will assume the worst and mark
-				// all the pages as dirty. Don't be tempted to move this up above as you must not be
-				// holding a spin lock to do this stuff as it is not atomic.
+			if (pPages) { 	/*  if we decided to release the memory */
+				/*  Now we must undo the pinning down of the pages. We will assume the worst and mark */
+				/*  all the pages as dirty. Don't be tempted to move this up above as you must not be */
+				/*  holding a spin lock to do this stuff as it is not atomic. */
 				dev_dbg(&pdx->interface->dev, "%s nPages=%d",
 					__func__, nPages);
 
@@ -674,29 +660,29 @@
 ** Sets up a transfer area - the functional part. Called by both
 ** SetTransfer and SetCircular.
 ****************************************************************************/
-static int SetArea(DEVICE_EXTENSION * pdx, int nArea, char __user * puBuf,
+static int SetArea(DEVICE_EXTENSION *pdx, int nArea, char __user *puBuf,
 		   unsigned int dwLength, bool bCircular, bool bCircToHost)
 {
-	// Start by working out the page aligned start of the area and the size
-	// of the area in pages, allowing for the start not being aligned and the
-	// end needing to be rounded up to a page boundary.
+	/*  Start by working out the page aligned start of the area and the size */
+	/*  of the area in pages, allowing for the start not being aligned and the */
+	/*  end needing to be rounded up to a page boundary. */
 	unsigned long ulStart = ((unsigned long)puBuf) & PAGE_MASK;
 	unsigned int ulOffset = ((unsigned long)puBuf) & (PAGE_SIZE - 1);
 	int len = (dwLength + ulOffset + PAGE_SIZE - 1) >> PAGE_SHIFT;
 
-	TRANSAREA *pTA = &pdx->rTransDef[nArea];	// to save typing
-	struct page **pPages = 0;	// space for page tables
-	int nPages = 0;		// and number of pages
+	TRANSAREA *pTA = &pdx->rTransDef[nArea];	/*  to save typing */
+	struct page **pPages = NULL;	/*  space for page tables */
+	int nPages = 0;		/*  and number of pages */
 
-	int iReturn = ClearArea(pdx, nArea);	// see if OK to use this area
-	if ((iReturn != U14ERR_NOTSET) &&	// if not area unused and...
-	    (iReturn != U14ERR_NOERROR))	// ...not all OK, then...
-		return iReturn;	// ...we cannot use this area
+	int iReturn = ClearArea(pdx, nArea);	/*  see if OK to use this area */
+	if ((iReturn != U14ERR_NOTSET) &&	/*  if not area unused and... */
+	    (iReturn != U14ERR_NOERROR))	/*  ...not all OK, then... */
+		return iReturn;	/*  ...we cannot use this area */
 
-	if (!access_ok(VERIFY_WRITE, puBuf, dwLength))	// if we cannot access the memory...
-		return -EFAULT;	// ...then we are done
+	if (!access_ok(VERIFY_WRITE, puBuf, dwLength))	/*  if we cannot access the memory... */
+		return -EFAULT;	/*  ...then we are done */
 
-	// Now allocate space to hold the page pointer and virtual address pointer tables
+	/*  Now allocate space to hold the page pointer and virtual address pointer tables */
 	pPages = kmalloc(len * sizeof(struct page *), GFP_KERNEL);
 	if (!pPages) {
 		iReturn = U14ERR_NOMEMORY;
@@ -705,24 +691,23 @@
 	dev_dbg(&pdx->interface->dev, "%s %p, length=%06x, circular %d",
 		__func__, puBuf, dwLength, bCircular);
 
-	// To pin down user pages we must first acquire the mapping semaphore.
-	down_read(&current->mm->mmap_sem);	// get memory map semaphore
-	nPages =
-	    get_user_pages(current, current->mm, ulStart, len, 1, 0, pPages, 0);
-	up_read(&current->mm->mmap_sem);	// release the semaphore
+	/*  To pin down user pages we must first acquire the mapping semaphore. */
+	down_read(&current->mm->mmap_sem);	/*  get memory map semaphore */
+	nPages = get_user_pages(current, current->mm, ulStart, len, 1, 0,
+				pPages, NULL);
+	up_read(&current->mm->mmap_sem);	/*  release the semaphore */
 	dev_dbg(&pdx->interface->dev, "%s nPages = %d", __func__, nPages);
 
-	if (nPages > 0)		// if we succeeded
-	{
-		// If you are tempted to use page_address (form LDD3), forget it. You MUST use
-		// kmap() or kmap_atomic() to get a virtual address. page_address will give you
-		// (null) or at least it does in this context with an x86 machine.
+	if (nPages > 0) {		/*  if we succeeded */
+		/*  If you are tempted to use page_address (from LDD3), forget it. You MUST use */
+		/*  kmap() or kmap_atomic() to get a virtual address. page_address will give you */
+		/*  (null) or at least it does in this context with an x86 machine. */
 		spin_lock_irq(&pdx->stagedLock);
-		pTA->lpvBuff = puBuf;	// keep start of region (user address)
-		pTA->dwBaseOffset = ulOffset;	// save offset in first page to start of xfer
-		pTA->dwLength = dwLength;	// Size if the region in bytes
-		pTA->pPages = pPages;	// list of pages that are used by buffer
-		pTA->nPages = nPages;	// number of pages
+		pTA->lpvBuff = puBuf;	/*  keep start of region (user address) */
+		pTA->dwBaseOffset = ulOffset;	/*  save offset in first page to start of xfer */
+		pTA->dwLength = dwLength;	/*  Size of the region in bytes */
+		pTA->pPages = pPages;	/*  list of pages that are used by buffer */
+		pTA->nPages = nPages;	/*  number of pages */
 
 		pTA->bCircular = bCircular;
 		pTA->bCircToHost = bCircToHost;
@@ -731,10 +716,10 @@
 		pTA->aBlocks[0].dwSize = 0;
 		pTA->aBlocks[1].dwOffset = 0;
 		pTA->aBlocks[1].dwSize = 0;
-		pTA->bUsed = true;	// This is now a used block
+		pTA->bUsed = true;	/*  This is now a used block */
 
 		spin_unlock_irq(&pdx->stagedLock);
-		iReturn = U14ERR_NOERROR;	// say all was well
+		iReturn = U14ERR_NOERROR;	/*  say all was well */
 	} else {
 		iReturn = U14ERR_LOCKFAIL;
 		goto error;
@@ -754,7 +739,7 @@
 ** unset it. Unsetting will fail if the area is booked, and a transfer to that
 ** area is in progress. Otherwise, we will release the area and re-assign it.
 ****************************************************************************/
-int SetTransfer(DEVICE_EXTENSION * pdx, TRANSFERDESC __user * pTD)
+int SetTransfer(DEVICE_EXTENSION *pdx, TRANSFERDESC __user *pTD)
 {
 	int iReturn;
 	TRANSFERDESC td;
@@ -765,9 +750,9 @@
 	mutex_lock(&pdx->io_mutex);
 	dev_dbg(&pdx->interface->dev, "%s area:%d, size:%08x", __func__,
 		td.wAreaNum, td.dwLength);
-	// The strange cast is done so that we don't get warnings in 32-bit linux about the size of the
-	// pointer. The pointer is always passed as a 64-bit object so that we don't have problems using
-	// a 32-bit program on a 64-bit system. unsigned long is 64-bits on a 64-bit system.
+	/*  The strange cast is done so that we don't get warnings in 32-bit linux about the size of the */
+	/*  pointer. The pointer is always passed as a 64-bit object so that we don't have problems using */
+	/*  a 32-bit program on a 64-bit system. unsigned long is 64-bits on a 64-bit system. */
 	iReturn =
 	    SetArea(pdx, td.wAreaNum,
 		    (char __user *)((unsigned long)td.lpvBuff), td.dwLength,
@@ -780,7 +765,7 @@
 ** UnSetTransfer
 ** Erases a transfer area record
 ****************************************************************************/
-int UnsetTransfer(DEVICE_EXTENSION * pdx, int nArea)
+int UnsetTransfer(DEVICE_EXTENSION *pdx, int nArea)
 {
 	int iReturn;
 	mutex_lock(&pdx->io_mutex);
@@ -797,27 +782,26 @@
 ** pretend that whatever the user asked for was achieved, so we return 1 if
 ** try to create one, and 0 if they ask to remove (assuming all else was OK).
 ****************************************************************************/
-int SetEvent(DEVICE_EXTENSION * pdx, TRANSFEREVENT __user * pTE)
+int SetEvent(DEVICE_EXTENSION *pdx, TRANSFEREVENT __user *pTE)
 {
 	int iReturn = U14ERR_NOERROR;
 	TRANSFEREVENT te;
 
-	// get a local copy of the data
+	/*  get a local copy of the data */
 	if (copy_from_user(&te, pTE, sizeof(te)))
 		return -EFAULT;
 
-	if (te.wAreaNum >= MAX_TRANSAREAS)	// the area must exist
+	if (te.wAreaNum >= MAX_TRANSAREAS)	/*  the area must exist */
 		return U14ERR_BADAREA;
 	else {
 		TRANSAREA *pTA = &pdx->rTransDef[te.wAreaNum];
-		mutex_lock(&pdx->io_mutex);	// make sure we have no competitor
+		mutex_lock(&pdx->io_mutex);	/*  make sure we have no competitor */
 		spin_lock_irq(&pdx->stagedLock);
-		if (pTA->bUsed)	// area must be in use
-		{
-			pTA->dwEventSt = te.dwStart;	// set area regions
-			pTA->dwEventSz = te.dwLength;	// set size (0 cancels it)
-			pTA->bEventToHost = te.wFlags & 1;	// set the direction
-			pTA->iWakeUp = 0;	// zero the wake up count
+		if (pTA->bUsed) {	/*  area must be in use */
+			pTA->dwEventSt = te.dwStart;	/*  set area regions */
+			pTA->dwEventSz = te.dwLength;	/*  set size (0 cancels it) */
+			pTA->bEventToHost = te.wFlags & 1;	/*  set the direction */
+			pTA->iWakeUp = 0;	/*  zero the wake up count */
 		} else
 			iReturn = U14ERR_NOTSET;
 		spin_unlock_irq(&pdx->stagedLock);
@@ -833,7 +817,7 @@
 ** of times that a block met the event condition since we last cleared it or
 ** 0 if timed out, or -ve error (bad area or not set, or signal).
 ****************************************************************************/
-int WaitEvent(DEVICE_EXTENSION * pdx, int nArea, int msTimeOut)
+int WaitEvent(DEVICE_EXTENSION *pdx, int nArea, int msTimeOut)
 {
 	int iReturn;
 	if ((unsigned)nArea >= MAX_TRANSAREAS)
@@ -841,15 +825,15 @@
 	else {
 		int iWait;
 		TRANSAREA *pTA = &pdx->rTransDef[nArea];
-		msTimeOut = (msTimeOut * HZ + 999) / 1000;	// convert timeout to jiffies
+		msTimeOut = (msTimeOut * HZ + 999) / 1000;	/*  convert timeout to jiffies */
 
-		// We cannot wait holding the mutex, but we check the flags while holding
-		// it. This may well be pointless as another thread could get in between
-		// releasing it and the wait call. However, this would have to clear the
-		// iWakeUp flag. However, the !pTA-bUsed may help us in this case.
-		mutex_lock(&pdx->io_mutex);	// make sure we have no competitor
-		if (!pTA->bUsed || !pTA->dwEventSz)	// check something to wait for...
-			return U14ERR_NOTSET;	// ...else we do nothing
+		/*  We cannot wait holding the mutex, but we check the flags while holding */
+		/*  it. This may well be pointless as another thread could get in between */
+		/*  releasing it and the wait call. However, this would have to clear the */
+		/*  iWakeUp flag. However, the !pTA->bUsed check may help us in this case. */
+		mutex_lock(&pdx->io_mutex);	/*  make sure we have no competitor */
+		if (!pTA->bUsed || !pTA->dwEventSz)	/*  check something to wait for... */
+			return U14ERR_NOTSET;	/*  ...else we do nothing */
 		mutex_unlock(&pdx->io_mutex);
 
 		if (msTimeOut)
@@ -863,12 +847,12 @@
 			    wait_event_interruptible(pTA->wqEvent, pTA->iWakeUp
 						     || !pTA->bUsed);
 		if (iWait)
-			iReturn = -ERESTARTSYS;	// oops - we have had a SIGNAL
+			iReturn = -ERESTARTSYS;	/*  oops - we have had a SIGNAL */
 		else
-			iReturn = pTA->iWakeUp;	// else the wakeup count
+			iReturn = pTA->iWakeUp;	/*  else the wakeup count */
 
 		spin_lock_irq(&pdx->stagedLock);
-		pTA->iWakeUp = 0;	// clear the flag
+		pTA->iWakeUp = 0;	/*  clear the flag */
 		spin_unlock_irq(&pdx->stagedLock);
 	}
 	return iReturn;
@@ -880,17 +864,17 @@
 ** number of times a block completed since the last call, or 0 if none or a
 ** negative error.
 ****************************************************************************/
-int TestEvent(DEVICE_EXTENSION * pdx, int nArea)
+int TestEvent(DEVICE_EXTENSION *pdx, int nArea)
 {
 	int iReturn;
 	if ((unsigned)nArea >= MAX_TRANSAREAS)
 		iReturn = U14ERR_BADAREA;
 	else {
 		TRANSAREA *pTA = &pdx->rTransDef[nArea];
-		mutex_lock(&pdx->io_mutex);	// make sure we have no competitor
+		mutex_lock(&pdx->io_mutex);	/*  make sure we have no competitor */
 		spin_lock_irq(&pdx->stagedLock);
-		iReturn = pTA->iWakeUp;	// get wakeup count since last call
-		pTA->iWakeUp = 0;	// clear the count
+		iReturn = pTA->iWakeUp;	/*  get wakeup count since last call */
+		pTA->iWakeUp = 0;	/*  clear the count */
 		spin_unlock_irq(&pdx->stagedLock);
 		mutex_unlock(&pdx->io_mutex);
 	}
@@ -901,17 +885,17 @@
 ** GetTransferInfo
 ** Puts the current state of the 1401 in a TGET_TX_BLOCK.
 *****************************************************************************/
-int GetTransfer(DEVICE_EXTENSION * pdx, TGET_TX_BLOCK __user * pTX)
+int GetTransfer(DEVICE_EXTENSION *pdx, TGET_TX_BLOCK __user *pTX)
 {
 	int iReturn = U14ERR_NOERROR;
 	unsigned int dwIdent;
 
 	mutex_lock(&pdx->io_mutex);
-	dwIdent = pdx->StagedId;	// area ident for last xfer
+	dwIdent = pdx->StagedId;	/*  area ident for last xfer */
 	if (dwIdent >= MAX_TRANSAREAS)
 		iReturn = U14ERR_BADAREA;
 	else {
-		// Return the best information we have - we don't have physical addresses
+		/*  Return the best information we have - we don't have physical addresses */
 		TGET_TX_BLOCK *tx;
 
 		tx = kzalloc(sizeof(*tx), GFP_KERNEL);
@@ -921,8 +905,8 @@
 		}
 		tx->size = pdx->rTransDef[dwIdent].dwLength;
 		tx->linear = (long long)((long)pdx->rTransDef[dwIdent].lpvBuff);
-		tx->avail = GET_TX_MAXENTRIES;	// how many blocks we could return
-		tx->used = 1;	// number we actually return
+		tx->avail = GET_TX_MAXENTRIES;	/*  how many blocks we could return */
+		tx->used = 1;	/*  number we actually return */
 		tx->entries[0].physical =
 		    (long long)(tx->linear + pdx->StagedOffset);
 		tx->entries[0].size = tx->size;
@@ -940,7 +924,7 @@
 **
 ** Empties the host i/o buffers
 ****************************************************************************/
-int KillIO1401(DEVICE_EXTENSION * pdx)
+int KillIO1401(DEVICE_EXTENSION *pdx)
 {
 	dev_dbg(&pdx->interface->dev, "%s", __func__);
 	mutex_lock(&pdx->io_mutex);
@@ -955,7 +939,7 @@
 ** Returns a 0 or a 1 for whether DMA is happening. No point holding a mutex
 ** for this as it only does one read.
 *****************************************************************************/
-int BlkTransState(DEVICE_EXTENSION * pdx)
+int BlkTransState(DEVICE_EXTENSION *pdx)
 {
 	int iReturn = pdx->dwDMAFlag != MODE_CHAR;
 	dev_dbg(&pdx->interface->dev, "%s = %d", __func__, iReturn);
@@ -967,12 +951,12 @@
 **
 ** Puts the current state of the 1401 in the Irp return buffer.
 *****************************************************************************/
-int StateOf1401(DEVICE_EXTENSION * pdx)
+int StateOf1401(DEVICE_EXTENSION *pdx)
 {
 	int iReturn;
 	mutex_lock(&pdx->io_mutex);
 
-	QuickCheck(pdx, false, false);	// get state up to date, no reset
+	QuickCheck(pdx, false, false);	/*  get state up to date, no reset */
 	iReturn = pdx->sCurrentState;
 
 	mutex_unlock(&pdx->io_mutex);
@@ -987,20 +971,23 @@
 ** Initiates a self-test cycle. The assumption is that we have no interrupts
 ** active, so we should make sure that this is the case.
 *****************************************************************************/
-int StartSelfTest(DEVICE_EXTENSION * pdx)
+int StartSelfTest(DEVICE_EXTENSION *pdx)
 {
 	int nGot;
 	mutex_lock(&pdx->io_mutex);
 	dev_dbg(&pdx->interface->dev, "%s", __func__);
 
-	ced_draw_down(pdx);	// wait for, then kill outstanding Urbs
-	FlushInBuff(pdx);	// Clear out input buffer & pipe
-	FlushOutBuff(pdx);	// Clear output buffer & pipe
-//    ReadWrite_Cancel(pDeviceObject);        /* so things stay tidy */
+	ced_draw_down(pdx);	/*  wait for, then kill outstanding Urbs */
+	FlushInBuff(pdx);	/*  Clear out input buffer & pipe */
+	FlushOutBuff(pdx);	/*  Clear output buffer & pipe */
+	/* so things stay tidy */
+	/* ReadWrite_Cancel(pDeviceObject); */
 	pdx->dwDMAFlag = MODE_CHAR;	/* Clear DMA mode flags here */
 
-	nGot = usb_control_msg(pdx->udev, usb_rcvctrlpipe(pdx->udev, 0), DB_SELFTEST, (H_TO_D | VENDOR | DEVREQ), 0, 0, 0, 0, HZ);	// allow 1 second timeout
-	pdx->ulSelfTestTime = jiffies + HZ * 30;	// 30 seconds into the future
+	nGot = usb_control_msg(pdx->udev, usb_rcvctrlpipe(pdx->udev, 0),
+			       DB_SELFTEST, (H_TO_D | VENDOR | DEVREQ),
+			       0, 0, NULL, 0, HZ); /* allow 1 second timeout */
+	pdx->ulSelfTestTime = jiffies + HZ * 30;	/*  30 seconds into the future */
 
 	mutex_unlock(&pdx->io_mutex);
 	if (nGot < 0)
@@ -1013,53 +1000,49 @@
 **
 ** Check progress of a self-test cycle
 ****************************************************************************/
-int CheckSelfTest(DEVICE_EXTENSION * pdx, TGET_SELFTEST __user * pGST)
+int CheckSelfTest(DEVICE_EXTENSION *pdx, TGET_SELFTEST __user *pGST)
 {
 	unsigned int state, error;
 	int iReturn;
-	TGET_SELFTEST gst;	// local work space
-	memset(&gst, 0, sizeof(gst));	// clear out the space (sets code 0)
+	TGET_SELFTEST gst;	/*  local work space */
+	memset(&gst, 0, sizeof(gst));	/*  clear out the space (sets code 0) */
 
 	mutex_lock(&pdx->io_mutex);
 
 	dev_dbg(&pdx->interface->dev, "%s", __func__);
 	iReturn = Get1401State(pdx, &state, &error);
-	if (iReturn == U14ERR_NOERROR)	// Only accept zero if it happens twice
+	if (iReturn == U14ERR_NOERROR)	/*  Only accept zero if it happens twice */
 		iReturn = Get1401State(pdx, &state, &error);
 
-	if (iReturn != U14ERR_NOERROR)	// Self-test can cause comms errors
-	{			// so we assume still testing
+	if (iReturn != U14ERR_NOERROR) {	/*  Self-test can cause comms errors */
+				/*  so we assume still testing */
 		dev_err(&pdx->interface->dev,
 			"%s Get1401State=%d, assuming still testing", __func__,
 			iReturn);
-		state = 0x80;	// Force still-testing, no error
+		state = 0x80;	/*  Force still-testing, no error */
 		error = 0;
 		iReturn = U14ERR_NOERROR;
 	}
 
-	if ((state == -1) && (error == -1))	// If Get1401State had problems
-	{
+	if ((state == -1) && (error == -1)) {	/*  If Get1401State had problems */
 		dev_err(&pdx->interface->dev,
 			"%s Get1401State failed, assuming still testing",
 			__func__);
-		state = 0x80;	// Force still-testing, no error
+		state = 0x80;	/*  Force still-testing, no error */
 		error = 0;
 	}
 
-	if ((state & 0xFF) == 0x80)	// If we are still in self-test
-	{
-		if (state & 0x00FF0000)	// Have we got an error?
-		{
-			gst.code = (state & 0x00FF0000) >> 16;	// read the error code
-			gst.x = error & 0x0000FFFF;	// Error data X
-			gst.y = (error & 0xFFFF0000) >> 16;	// and data Y
+	if ((state & 0xFF) == 0x80) {	/*  If we are still in self-test */
+		if (state & 0x00FF0000)	{ /*  Have we got an error? */
+			gst.code = (state & 0x00FF0000) >> 16;	/*  read the error code */
+			gst.x = error & 0x0000FFFF;	/*  Error data X */
+			gst.y = (error & 0xFFFF0000) >> 16;	/*  and data Y */
 			dev_dbg(&pdx->interface->dev, "Self-test error code %d",
 				gst.code);
-		} else		// No error, check for timeout
-		{
-			unsigned long ulNow = jiffies;	// get current time
+		} else {		/*  No error, check for timeout */
+			unsigned long ulNow = jiffies;	/*  get current time */
 			if (time_after(ulNow, pdx->ulSelfTestTime)) {
-				gst.code = -2;	// Flag the timeout
+				gst.code = -2;	/*  Flag the timeout */
 				dev_dbg(&pdx->interface->dev,
 					"Self-test timed-out");
 			} else
@@ -1067,16 +1050,16 @@
 					"Self-test on-going");
 		}
 	} else {
-		gst.code = -1;	// Flag the test is done
+		gst.code = -1;	/*  Flag the test is done */
 		dev_dbg(&pdx->interface->dev, "Self-test done");
 	}
 
-	if (gst.code < 0)	// If we have a problem or finished
-	{			// If using the 2890 we should reset properly
+	if (gst.code < 0) {	/*  If we have a problem or finished */
+				/*  If using the 2890 we should reset properly */
 		if ((pdx->nPipes == 4) && (pdx->s1401Type <= TYPEPOWER))
-			Is1401(pdx);	// Get 1401 reset and OK
+			Is1401(pdx);	/*  Get 1401 reset and OK */
 		else
-			QuickCheck(pdx, true, true);	// Otherwise check without reset unless problems
+			QuickCheck(pdx, true, true);	/*  Otherwise check without reset unless problems */
 	}
 	mutex_unlock(&pdx->io_mutex);
 
@@ -1091,7 +1074,7 @@
 **
 ** Returns code for standard, plus, micro1401, power1401 or none
 ****************************************************************************/
-int TypeOf1401(DEVICE_EXTENSION * pdx)
+int TypeOf1401(DEVICE_EXTENSION *pdx)
 {
 	int iReturn = TYPEUNKNOWN;
 	mutex_lock(&pdx->io_mutex);
@@ -1100,7 +1083,7 @@
 	switch (pdx->s1401Type) {
 	case TYPE1401:
 		iReturn = U14ERR_STD;
-		break;		// Handle these types directly
+		break;		/*  Handle these types directly */
 	case TYPEPLUS:
 		iReturn = U14ERR_PLUS;
 		break;
@@ -1109,9 +1092,9 @@
 		break;
 	default:
 		if ((pdx->s1401Type >= TYPEPOWER) && (pdx->s1401Type <= 25))
-			iReturn = pdx->s1401Type + 4;	// We can calculate types
-		else		//  for up-coming 1401 designs
-			iReturn = TYPEUNKNOWN;	// Don't know or not there
+			iReturn = pdx->s1401Type + 4;	/*  We can calculate types */
+		else		/*   for up-coming 1401 designs */
+			iReturn = TYPEUNKNOWN;	/*  Don't know or not there */
 	}
 	dev_dbg(&pdx->interface->dev, "%s %d", __func__, iReturn);
 	mutex_unlock(&pdx->io_mutex);
@@ -1124,13 +1107,13 @@
 **
 ** Returns flags on block transfer abilities
 ****************************************************************************/
-int TransferFlags(DEVICE_EXTENSION * pdx)
+int TransferFlags(DEVICE_EXTENSION *pdx)
 {
-	int iReturn = U14TF_MULTIA | U14TF_DIAG |	// we always have multiple DMA area
-	    U14TF_NOTIFY | U14TF_CIRCTH;	// diagnostics, notify and circular
+	int iReturn = U14TF_MULTIA | U14TF_DIAG |	/*  we always have multiple DMA area */
+	    U14TF_NOTIFY | U14TF_CIRCTH;	/*  diagnostics, notify and circular */
 	dev_dbg(&pdx->interface->dev, "%s", __func__);
 	mutex_lock(&pdx->io_mutex);
-	if (pdx->bIsUSB2)	// Set flag for USB2 if appropriate
+	if (pdx->bIsUSB2)	/*  Set flag for USB2 if appropriate */
 		iReturn |= U14TF_USB2;
 	mutex_unlock(&pdx->io_mutex);
 
@@ -1142,12 +1125,16 @@
 ** Issues a debug\diagnostic command to the 1401 along with a 32-bit datum
 ** This is a utility command used for dbg operations.
 */
-static int DbgCmd1401(DEVICE_EXTENSION * pdx, unsigned char cmd,
+static int DbgCmd1401(DEVICE_EXTENSION *pdx, unsigned char cmd,
 		      unsigned int data)
 {
 	int iReturn;
 	dev_dbg(&pdx->interface->dev, "%s entry", __func__);
-	iReturn = usb_control_msg(pdx->udev, usb_sndctrlpipe(pdx->udev, 0), cmd, (H_TO_D | VENDOR | DEVREQ), (unsigned short)data, (unsigned short)(data >> 16), 0, 0, HZ);	// allow 1 second timeout
+	iReturn = usb_control_msg(pdx->udev, usb_sndctrlpipe(pdx->udev, 0), cmd,
+				  (H_TO_D | VENDOR | DEVREQ),
+				  (unsigned short)data,
+				  (unsigned short)(data >> 16), NULL, 0, HZ);
+						/* allow 1 second timeout */
 	if (iReturn < 0)
 		dev_err(&pdx->interface->dev, "%s fail code=%d", __func__,
 			iReturn);
@@ -1160,7 +1147,7 @@
 **
 ** Execute the diagnostic peek operation. Uses address, width and repeats.
 ****************************************************************************/
-int DbgPeek(DEVICE_EXTENSION * pdx, TDBGBLOCK __user * pDB)
+int DbgPeek(DEVICE_EXTENSION *pdx, TDBGBLOCK __user *pDB)
 {
 	int iReturn;
 	TDBGBLOCK db;
@@ -1189,7 +1176,7 @@
 ** Execute the diagnostic poke operation. Parameters are in the CSBLOCK struct
 ** in order address, size, repeats and value to poke.
 ****************************************************************************/
-int DbgPoke(DEVICE_EXTENSION * pdx, TDBGBLOCK __user * pDB)
+int DbgPoke(DEVICE_EXTENSION *pdx, TDBGBLOCK __user *pDB)
 {
 	int iReturn;
 	TDBGBLOCK db;
@@ -1218,7 +1205,7 @@
 ** Execute the diagnostic ramp data operation. Parameters are in the CSBLOCK struct
 ** in order address, default, enable mask, size and repeats.
 ****************************************************************************/
-int DbgRampData(DEVICE_EXTENSION * pdx, TDBGBLOCK __user * pDB)
+int DbgRampData(DEVICE_EXTENSION *pdx, TDBGBLOCK __user *pDB)
 {
 	int iReturn;
 	TDBGBLOCK db;
@@ -1250,7 +1237,7 @@
 **
 ** Execute the diagnostic ramp address operation
 ****************************************************************************/
-int DbgRampAddr(DEVICE_EXTENSION * pdx, TDBGBLOCK __user * pDB)
+int DbgRampAddr(DEVICE_EXTENSION *pdx, TDBGBLOCK __user *pDB)
 {
 	int iReturn;
 	TDBGBLOCK db;
@@ -1280,16 +1267,16 @@
 **
 ** Retrieve the data resulting from the last debug Peek operation
 ****************************************************************************/
-int DbgGetData(DEVICE_EXTENSION * pdx, TDBGBLOCK __user * pDB)
+int DbgGetData(DEVICE_EXTENSION *pdx, TDBGBLOCK __user *pDB)
 {
 	int iReturn;
 	TDBGBLOCK db;
-	memset(&db, 0, sizeof(db));	// fill returned block with 0s
+	memset(&db, 0, sizeof(db));	/*  fill returned block with 0s */
 
 	mutex_lock(&pdx->io_mutex);
 	dev_dbg(&pdx->interface->dev, "%s", __func__);
 
-	// Read back the last peeked value from the 1401.
+	/*  Read back the last peeked value from the 1401. */
 	iReturn = usb_control_msg(pdx->udev, usb_rcvctrlpipe(pdx->udev, 0),
 				  DB_DATA, (D_TO_H | VENDOR | DEVREQ), 0, 0,
 				  &db.iData, sizeof(db.iData), HZ);
@@ -1313,7 +1300,7 @@
 ** Stop any never-ending debug loop, we just call Get1401State for USB
 **
 ****************************************************************************/
-int DbgStopLoop(DEVICE_EXTENSION * pdx)
+int DbgStopLoop(DEVICE_EXTENSION *pdx)
 {
 	int iReturn;
 	unsigned int uState, uErr;
@@ -1334,7 +1321,7 @@
 ** booked and a transfer to that area is in progress. Otherwise, we will
 ** release the area and re-assign it.
 ****************************************************************************/
-int SetCircular(DEVICE_EXTENSION * pdx, TRANSFERDESC __user * pTD)
+int SetCircular(DEVICE_EXTENSION *pdx, TRANSFERDESC __user *pTD)
 {
 	int iReturn;
 	bool bToHost;
@@ -1346,11 +1333,11 @@
 	mutex_lock(&pdx->io_mutex);
 	dev_dbg(&pdx->interface->dev, "%s area:%d, size:%08x", __func__,
 		td.wAreaNum, td.dwLength);
-	bToHost = td.eSize != 0;	// this is used as the tohost flag
+	bToHost = td.eSize != 0;	/*  this is used as the tohost flag */
 
-	// The strange cast is done so that we don't get warnings in 32-bit linux about the size of the
-	// pointer. The pointer is always passed as a 64-bit object so that we don't have problems using
-	// a 32-bit program on a 64-bit system. unsigned long is 64-bits on a 64-bit system.
+	/*  The strange cast is done so that we don't get warnings in 32-bit linux about the size of the */
+	/*  pointer. The pointer is always passed as a 64-bit object so that we don't have problems using */
+	/*  a 32-bit program on a 64-bit system. unsigned long is 64-bits on a 64-bit system. */
 	iReturn =
 	    SetArea(pdx, td.wAreaNum,
 		    (char __user *)((unsigned long)td.lpvBuff), td.dwLength,
@@ -1364,7 +1351,7 @@
 **
 ** Return the next available block of circularly-transferred data.
 ****************************************************************************/
-int GetCircBlock(DEVICE_EXTENSION * pdx, TCIRCBLOCK __user * pCB)
+int GetCircBlock(DEVICE_EXTENSION *pdx, TCIRCBLOCK __user *pCB)
 {
 	int iReturn = U14ERR_NOERROR;
 	unsigned int nArea;
@@ -1377,20 +1364,17 @@
 
 	mutex_lock(&pdx->io_mutex);
 
-	nArea = cb.nArea;	// Retrieve parameters first
-	cb.dwOffset = 0;	// set default result (nothing)
+	nArea = cb.nArea;	/*  Retrieve parameters first */
+	cb.dwOffset = 0;	/*  set default result (nothing) */
 	cb.dwSize = 0;
 
-	if (nArea < MAX_TRANSAREAS)	// The area number must be OK
-	{
-		TRANSAREA *pArea = &pdx->rTransDef[nArea];	// Pointer to relevant info
-		spin_lock_irq(&pdx->stagedLock);	// Lock others out
+	if (nArea < MAX_TRANSAREAS) {	/*  The area number must be OK */
+		TRANSAREA *pArea = &pdx->rTransDef[nArea];	/*  Pointer to relevant info */
+		spin_lock_irq(&pdx->stagedLock);	/*  Lock others out */
 
-		if ((pArea->bUsed) && (pArea->bCircular) &&	// Must be circular area
-		    (pArea->bCircToHost))	// For now at least must be to host
-		{
-			if (pArea->aBlocks[0].dwSize > 0)	// Got anything?
-			{
+		if ((pArea->bUsed) && (pArea->bCircular) &&	/*  Must be circular area */
+		    (pArea->bCircToHost)) {	/*  For now at least must be to host */
+			if (pArea->aBlocks[0].dwSize > 0) {	/*  Got anything? */
 				cb.dwOffset = pArea->aBlocks[0].dwOffset;
 				cb.dwSize = pArea->aBlocks[0].dwSize;
 				dev_dbg(&pdx->interface->dev,
@@ -1416,7 +1400,7 @@
 **
 ** Frees a block of circularly-transferred data and returns the next one.
 ****************************************************************************/
-int FreeCircBlock(DEVICE_EXTENSION * pdx, TCIRCBLOCK __user * pCB)
+int FreeCircBlock(DEVICE_EXTENSION *pdx, TCIRCBLOCK __user *pCB)
 {
 	int iReturn = U14ERR_NOERROR;
 	unsigned int nArea, uStart, uSize;
@@ -1429,33 +1413,28 @@
 
 	mutex_lock(&pdx->io_mutex);
 
-	nArea = cb.nArea;	// Retrieve parameters first
+	nArea = cb.nArea;	/*  Retrieve parameters first */
 	uStart = cb.dwOffset;
 	uSize = cb.dwSize;
-	cb.dwOffset = 0;	// then set default result (nothing)
+	cb.dwOffset = 0;	/*  then set default result (nothing) */
 	cb.dwSize = 0;
 
-	if (nArea < MAX_TRANSAREAS)	// The area number must be OK
-	{
-		TRANSAREA *pArea = &pdx->rTransDef[nArea];	// Pointer to relevant info
-		spin_lock_irq(&pdx->stagedLock);	// Lock others out
+	if (nArea < MAX_TRANSAREAS) {	/*  The area number must be OK */
+		TRANSAREA *pArea = &pdx->rTransDef[nArea];	/*  Pointer to relevant info */
+		spin_lock_irq(&pdx->stagedLock);	/*  Lock others out */
 
-		if ((pArea->bUsed) && (pArea->bCircular) &&	// Must be circular area
-		    (pArea->bCircToHost))	// For now at least must be to host
-		{
+		if ((pArea->bUsed) && (pArea->bCircular) &&	/*  Must be circular area */
+		    (pArea->bCircToHost)) {	/*  For now at least must be to host */
 			bool bWaiting = false;
 
-			if ((pArea->aBlocks[0].dwSize >= uSize) &&	// Got anything?
-			    (pArea->aBlocks[0].dwOffset == uStart))	// Must be legal data
-			{
+			if ((pArea->aBlocks[0].dwSize >= uSize) &&	/*  Got anything? */
+			    (pArea->aBlocks[0].dwOffset == uStart)) {	/*  Must be legal data */
 				pArea->aBlocks[0].dwSize -= uSize;
 				pArea->aBlocks[0].dwOffset += uSize;
-				if (pArea->aBlocks[0].dwSize == 0)	// Have we emptied this block?
-				{
-					if (pArea->aBlocks[1].dwSize)	// Is there a second block?
-					{
-						pArea->aBlocks[0] = pArea->aBlocks[1];	// Copy down block 2 data
-						pArea->aBlocks[1].dwSize = 0;	// and mark the second block as unused
+				if (pArea->aBlocks[0].dwSize == 0) {	/*  Have we emptied this block? */
+					if (pArea->aBlocks[1].dwSize) {	/*  Is there a second block? */
+						pArea->aBlocks[0] = pArea->aBlocks[1];	/*  Copy down block 2 data */
+						pArea->aBlocks[1].dwSize = 0;	/*  and mark the second block as unused */
 						pArea->aBlocks[1].dwOffset = 0;
 					} else
 						pArea->aBlocks[0].dwOffset = 0;
@@ -1468,9 +1447,8 @@
 					pArea->aBlocks[0].dwOffset,
 					pdx->bXFerWaiting);
 
-				// Return the next available block of memory as well
-				if (pArea->aBlocks[0].dwSize > 0)	// Got anything?
-				{
+				/*  Return the next available block of memory as well */
+				if (pArea->aBlocks[0].dwSize > 0) {	/*  Got anything? */
 					cb.dwOffset =
 					    pArea->aBlocks[0].dwOffset;
 					cb.dwSize = pArea->aBlocks[0].dwSize;
@@ -1492,9 +1470,8 @@
 				iReturn = U14ERR_NOMEMORY;
 			}
 
-			// If we have one, kick off pending transfer
-			if (bWaiting)	// Got a block xfer waiting?
-			{
+			/*  If we have one, kick off pending transfer */
+			if (bWaiting) {	/*  Got a block xfer waiting? */
 				int RWMStat =
 				    ReadWriteMem(pdx, !pdx->rDMAInfo.bOutWard,
 						 pdx->rDMAInfo.wIdent,

diff --git a/drivers/staging/ced1401/ced_ioctl.h b/drivers/staging/ced1401/ced_ioctl.h
index 0895c941..aa68878 100644
--- a/drivers/staging/ced1401/ced_ioctl.h
+++ b/drivers/staging/ced1401/ced_ioctl.h

@@ -35,7 +35,7 @@
 	short eSize;		/* element size - is tohost flag for circular */
 } TRANSFERDESC;
 
-typedef TRANSFERDESC * LPTRANSFERDESC;
+typedef TRANSFERDESC *LPTRANSFERDESC;
 
 typedef struct TransferEvent {
 	unsigned int dwStart;		/* offset into the area */

diff --git a/drivers/staging/ced1401/machine.h b/drivers/staging/ced1401/machine.h
index af07379..dbd4036d 100644
--- a/drivers/staging/ced1401/machine.h
+++ b/drivers/staging/ced1401/machine.h

@@ -77,20 +77,13 @@
 #endif
 
 #if defined(LINUX) || defined(MAXOSX)
-    #define FAR
+	#define FAR
 
-    typedef int BOOL;       // To match Windows
-    typedef char * LPSTR;
-    typedef const char * LPCSTR;
-    typedef unsigned short WORD;
-    typedef unsigned int  DWORD;
-    typedef unsigned char  BYTE;
-    typedef BYTE  BOOLEAN;
-    typedef unsigned char UCHAR;
-    #define __packed __attribute__((packed))
-    typedef BYTE * LPBYTE;
-    #define HIWORD(x) (WORD)(((x)>>16) & 0xffff)
-    #define LOWORD(x) (WORD)((x) & 0xffff)
+	typedef int BOOL;       /*  To match Windows */
+	typedef unsigned char  BYTE;
+	#define __packed __attribute__((packed))
+	#define HIWORD(x) (unsigned short)(((x)>>16) & 0xffff)
+	#define LOWORD(x) (unsigned short)((x) & 0xffff)
 #endif
 
 #ifdef _IS_WINDOWS_
@@ -104,21 +97,20 @@
 ** a synonym.
 */
 #ifdef GNUC
-    #define DllExport __attribute__((dllexport))
-    #define DllImport __attribute__((dllimport))
+	#define DllExport __attribute__((dllexport))
+	#define DllImport __attribute__((dllimport))
 #endif
 
 #ifndef DllExport
 #ifdef _IS_WINDOWS_
-    #define DllExport __declspec(dllexport)
-    #define DllImport __declspec(dllimport)
+	#define DllExport __declspec(dllexport)
+	#define DllImport __declspec(dllimport)
 #else
-    #define DllExport
-    #define DllImport
+	#define DllExport
+	#define DllImport
 #endif
 #endif /* _IS_WINDOWS_ */
 
-    
 #ifndef TRUE
    #define TRUE 1
    #define FALSE 0

diff --git a/drivers/staging/ced1401/usb1401.c b/drivers/staging/ced1401/usb1401.c
index 254131d..97c55f9 100644
--- a/drivers/staging/ced1401/usb1401.c
+++ b/drivers/staging/ced1401/usb1401.c

@@ -126,18 +126,18 @@
 {
 	DEVICE_EXTENSION *pdx = to_DEVICE_EXTENSION(kref);
 
-	// Free up the output buffer, then free the output urb. Note that the interface member
-	// of pdx will probably be NULL, so cannot be used to get to dev.
+	/*  Free up the output buffer, then free the output urb. Note that the interface member */
+	/*  of pdx will probably be NULL, so cannot be used to get to dev. */
 	usb_free_coherent(pdx->udev, OUTBUF_SZ, pdx->pCoherCharOut,
 			  pdx->pUrbCharOut->transfer_dma);
 	usb_free_urb(pdx->pUrbCharOut);
 
-	// Do the same for chan input
+	/*  Do the same for chan input */
 	usb_free_coherent(pdx->udev, INBUF_SZ, pdx->pCoherCharIn,
 			  pdx->pUrbCharIn->transfer_dma);
 	usb_free_urb(pdx->pUrbCharIn);
 
-	// Do the same for the block transfers
+	/*  Do the same for the block transfers */
 	usb_free_coherent(pdx->udev, STAGED_SZ, pdx->pCoherStagedIO,
 			  pdx->pStagedUrb->transfer_dma);
 	usb_free_urb(pdx->pStagedUrb);
@@ -146,7 +146,7 @@
 	kfree(pdx);
 }
 
-// This is the driver end of the open() call from user space.
+/*  This is the driver end of the open() call from user space. */
 static int ced_open(struct inode *inode, struct file *file)
 {
 	DEVICE_EXTENSION *pdx;
@@ -184,7 +184,7 @@
 			kref_put(&pdx->kref, ced_delete);
 			goto exit;
 		}
-	} else {		//uncomment this block if you want exclusive open
+	} else {		/* uncomment this block if you want exclusive open */
 		dev_err(&interface->dev, "%s fail: already open", __func__);
 		retval = -EBUSY;
 		pdx->open_count--;
@@ -210,11 +210,11 @@
 
 	dev_dbg(&pdx->interface->dev, "%s called", __func__);
 	mutex_lock(&pdx->io_mutex);
-	if (!--pdx->open_count && pdx->interface)	// Allow autosuspend
+	if (!--pdx->open_count && pdx->interface)	/*  Allow autosuspend */
 		usb_autopm_put_interface(pdx->interface);
 	mutex_unlock(&pdx->io_mutex);
 
-	kref_put(&pdx->kref, ced_delete);	// decrement the count on our device
+	kref_put(&pdx->kref, ced_delete);	/*  decrement the count on our device */
 	return 0;
 }
 
@@ -252,9 +252,9 @@
 ** not help with a device extension held by a file.
 ** return true if can accept new io requests, else false
 */
-static bool CanAcceptIoRequests(DEVICE_EXTENSION * pdx)
+static bool CanAcceptIoRequests(DEVICE_EXTENSION *pdx)
 {
-	return pdx && pdx->interface;	// Can we accept IO requests
+	return pdx && pdx->interface;	/*  Can we accept IO requests */
 }
 
 /****************************************************************************
@@ -264,9 +264,9 @@
 static void ced_writechar_callback(struct urb *pUrb)
 {
 	DEVICE_EXTENSION *pdx = pUrb->context;
-	int nGot = pUrb->actual_length;	// what we transferred
+	int nGot = pUrb->actual_length;	/*  what we transferred */
 
-	if (pUrb->status) {	// sync/async unlink faults aren't errors
+	if (pUrb->status) {	/*  sync/async unlink faults aren't errors */
 		if (!
 		    (pUrb->status == -ENOENT || pUrb->status == -ECONNRESET
 		     || pUrb->status == -ESHUTDOWN)) {
@@ -278,36 +278,35 @@
 		spin_lock(&pdx->err_lock);
 		pdx->errors = pUrb->status;
 		spin_unlock(&pdx->err_lock);
-		nGot = 0;	//  and tidy up again if so
+		nGot = 0;	/*   and tidy up again if so */
 
-		spin_lock(&pdx->charOutLock);	// already at irq level
-		pdx->dwOutBuffGet = 0;	// Reset the output buffer
+		spin_lock(&pdx->charOutLock);	/*  already at irq level */
+		pdx->dwOutBuffGet = 0;	/*  Reset the output buffer */
 		pdx->dwOutBuffPut = 0;
-		pdx->dwNumOutput = 0;	// Clear the char count
-		pdx->bPipeError[0] = 1;	// Flag an error for later
-		pdx->bSendCharsPending = false;	// Allow other threads again
-		spin_unlock(&pdx->charOutLock);	// already at irq level
+		pdx->dwNumOutput = 0;	/*  Clear the char count */
+		pdx->bPipeError[0] = 1;	/*  Flag an error for later */
+		pdx->bSendCharsPending = false;	/*  Allow other threads again */
+		spin_unlock(&pdx->charOutLock);	/*  already at irq level */
 		dev_dbg(&pdx->interface->dev,
 			"%s - char out done, 0 chars sent", __func__);
 	} else {
 		dev_dbg(&pdx->interface->dev,
 			"%s - char out done, %d chars sent", __func__, nGot);
-		spin_lock(&pdx->charOutLock);	// already at irq level
-		pdx->dwNumOutput -= nGot;	// Now adjust the char send buffer
-		pdx->dwOutBuffGet += nGot;	// to match what we did
-		if (pdx->dwOutBuffGet >= OUTBUF_SZ)	// Can't do this any earlier as data could be overwritten
+		spin_lock(&pdx->charOutLock);	/*  already at irq level */
+		pdx->dwNumOutput -= nGot;	/*  Now adjust the char send buffer */
+		pdx->dwOutBuffGet += nGot;	/*  to match what we did */
+		if (pdx->dwOutBuffGet >= OUTBUF_SZ)	/*  Can't do this any earlier as data could be overwritten */
 			pdx->dwOutBuffGet = 0;
 
-		if (pdx->dwNumOutput > 0)	// if more to be done...
-		{
-			int nPipe = 0;	// The pipe number to use
+		if (pdx->dwNumOutput > 0) {	/*  if more to be done... */
+			int nPipe = 0;	/*  The pipe number to use */
 			int iReturn;
 			char *pDat = &pdx->outputBuffer[pdx->dwOutBuffGet];
-			unsigned int dwCount = pdx->dwNumOutput;	// maximum to send
-			if ((pdx->dwOutBuffGet + dwCount) > OUTBUF_SZ)	// does it cross buffer end?
+			unsigned int dwCount = pdx->dwNumOutput;	/*  maximum to send */
+			if ((pdx->dwOutBuffGet + dwCount) > OUTBUF_SZ)	/*  does it cross buffer end? */
 				dwCount = OUTBUF_SZ - pdx->dwOutBuffGet;
-			spin_unlock(&pdx->charOutLock);	// we are done with stuff that changes
-			memcpy(pdx->pCoherCharOut, pDat, dwCount);	// copy output data to the buffer
+			spin_unlock(&pdx->charOutLock);	/*  we are done with stuff that changes */
+			memcpy(pdx->pCoherCharOut, pDat, dwCount);	/*  copy output data to the buffer */
 			usb_fill_bulk_urb(pdx->pUrbCharOut, pdx->udev,
 					  usb_sndbulkpipe(pdx->udev,
 							  pdx->epAddr[0]),
@@ -315,22 +314,22 @@
 					  ced_writechar_callback, pdx);
 			pdx->pUrbCharOut->transfer_flags |=
 			    URB_NO_TRANSFER_DMA_MAP;
-			usb_anchor_urb(pdx->pUrbCharOut, &pdx->submitted);	// in case we need to kill it
+			usb_anchor_urb(pdx->pUrbCharOut, &pdx->submitted);	/*  in case we need to kill it */
 			iReturn = usb_submit_urb(pdx->pUrbCharOut, GFP_ATOMIC);
 			dev_dbg(&pdx->interface->dev, "%s n=%d>%s<", __func__,
 				dwCount, pDat);
-			spin_lock(&pdx->charOutLock);	// grab lock for errors
+			spin_lock(&pdx->charOutLock);	/*  grab lock for errors */
 			if (iReturn) {
-				pdx->bPipeError[nPipe] = 1;	// Flag an error to be handled later
-				pdx->bSendCharsPending = false;	// Allow other threads again
+				pdx->bPipeError[nPipe] = 1;	/*  Flag an error to be handled later */
+				pdx->bSendCharsPending = false;	/*  Allow other threads again */
 				usb_unanchor_urb(pdx->pUrbCharOut);
 				dev_err(&pdx->interface->dev,
 					"%s usb_submit_urb() returned %d",
 					__func__, iReturn);
 			}
 		} else
-			pdx->bSendCharsPending = false;	// Allow other threads again
-		spin_unlock(&pdx->charOutLock);	// already at irq level
+			pdx->bSendCharsPending = false;	/*  Allow other threads again */
+		spin_unlock(&pdx->charOutLock);	/*  already at irq level */
 	}
 }
 
@@ -339,44 +338,43 @@
 ** Transmit the characters in the output buffer to the 1401. This may need
 ** breaking down into multiple transfers.
 ****************************************************************************/
-int SendChars(DEVICE_EXTENSION * pdx)
+int SendChars(DEVICE_EXTENSION *pdx)
 {
 	int iReturn = U14ERR_NOERROR;
 
-	spin_lock_irq(&pdx->charOutLock);	// Protect ourselves
+	spin_lock_irq(&pdx->charOutLock);	/*  Protect ourselves */
 
-	if ((!pdx->bSendCharsPending) &&	// Not currently sending
-	    (pdx->dwNumOutput > 0) &&	//  has characters to output
-	    (CanAcceptIoRequests(pdx)))	//  and current activity is OK
-	{
-		unsigned int dwCount = pdx->dwNumOutput;	// Get a copy of the character count
-		pdx->bSendCharsPending = true;	// Set flag to lock out other threads
+	if ((!pdx->bSendCharsPending) &&	/*  Not currently sending */
+	    (pdx->dwNumOutput > 0) &&	/*   has characters to output */
+	    (CanAcceptIoRequests(pdx)))	{ /*   and current activity is OK */
+		unsigned int dwCount = pdx->dwNumOutput;	/*  Get a copy of the character count */
+		pdx->bSendCharsPending = true;	/*  Set flag to lock out other threads */
 
 		dev_dbg(&pdx->interface->dev,
 			"Send %d chars to 1401, EP0 flag %d\n", dwCount,
 			pdx->nPipes == 3);
-		// If we have only 3 end points we must send the characters to the 1401 using EP0.
+		/*  If we have only 3 end points we must send the characters to the 1401 using EP0. */
 		if (pdx->nPipes == 3) {
-			// For EP0 character transmissions to the 1401, we have to hang about until they
-			// are gone, as otherwise without more character IO activity they will never go.
-			unsigned int count = dwCount;	// Local char counter
-			unsigned int index = 0;	// The index into the char buffer
+			/*  For EP0 character transmissions to the 1401, we have to hang about until they */
+			/*  are gone, as otherwise without more character IO activity they will never go. */
+			unsigned int count = dwCount;	/*  Local char counter */
+			unsigned int index = 0;	/*  The index into the char buffer */
 
-			spin_unlock_irq(&pdx->charOutLock);	// Free spinlock as we call USBD
+			spin_unlock_irq(&pdx->charOutLock);	/*  Free spinlock as we call USBD */
 
 			while ((count > 0) && (iReturn == U14ERR_NOERROR)) {
-				// We have to break the transfer up into 64-byte chunks because of a 2270 problem
-				int n = count > 64 ? 64 : count;	// Chars for this xfer, max of 64
+				/*  We have to break the transfer up into 64-byte chunks because of a 2270 problem */
+				int n = count > 64 ? 64 : count;	/*  Chars for this xfer, max of 64 */
 				int nSent = usb_control_msg(pdx->udev,
-							    usb_sndctrlpipe(pdx->udev, 0),	// use end point 0
-							    DB_CHARS,	// bRequest
-							    (H_TO_D | VENDOR | DEVREQ),	// to the device, vendor request to the device
-							    0, 0,	// value and index are both 0
-							    &pdx->outputBuffer[index],	// where to send from
-							    n,	// how much to send
-							    1000);	// timeout in jiffies
+							    usb_sndctrlpipe(pdx->udev, 0),	/*  use end point 0 */
+							    DB_CHARS,	/*  bRequest */
+							    (H_TO_D | VENDOR | DEVREQ),	/*  to the device, vendor request to the device */
+							    0, 0,	/*  value and index are both 0 */
+							    &pdx->outputBuffer[index],	/*  where to send from */
+							    n,	/*  how much to send */
+							    1000);	/*  timeout in milliseconds */
 				if (nSent <= 0) {
-					iReturn = nSent ? nSent : -ETIMEDOUT;	// if 0 chars says we timed out
+					iReturn = nSent ? nSent : -ETIMEDOUT;	/*  if 0 chars says we timed out */
 					dev_err(&pdx->interface->dev,
 						"Send %d chars by EP0 failed: %d",
 						n, iReturn);
@@ -388,19 +386,19 @@
 				}
 			}
 
-			spin_lock_irq(&pdx->charOutLock);	// Protect pdx changes, released by general code
-			pdx->dwOutBuffGet = 0;	// so reset the output buffer
+			spin_lock_irq(&pdx->charOutLock);	/*  Protect pdx changes, released by general code */
+			pdx->dwOutBuffGet = 0;	/*  so reset the output buffer */
 			pdx->dwOutBuffPut = 0;
-			pdx->dwNumOutput = 0;	// and clear the buffer count
-			pdx->bSendCharsPending = false;	// Allow other threads again
-		} else {	// Here for sending chars normally - we hold the spin lock
-			int nPipe = 0;	// The pipe number to use
+			pdx->dwNumOutput = 0;	/*  and clear the buffer count */
+			pdx->bSendCharsPending = false;	/*  Allow other threads again */
+		} else {	/*  Here for sending chars normally - we hold the spin lock */
+			int nPipe = 0;	/*  The pipe number to use */
 			char *pDat = &pdx->outputBuffer[pdx->dwOutBuffGet];
 
-			if ((pdx->dwOutBuffGet + dwCount) > OUTBUF_SZ)	// does it cross buffer end?
+			if ((pdx->dwOutBuffGet + dwCount) > OUTBUF_SZ)	/*  does it cross buffer end? */
 				dwCount = OUTBUF_SZ - pdx->dwOutBuffGet;
-			spin_unlock_irq(&pdx->charOutLock);	// we are done with stuff that changes
-			memcpy(pdx->pCoherCharOut, pDat, dwCount);	// copy output data to the buffer
+			spin_unlock_irq(&pdx->charOutLock);	/*  we are done with stuff that changes */
+			memcpy(pdx->pCoherCharOut, pDat, dwCount);	/*  copy output data to the buffer */
 			usb_fill_bulk_urb(pdx->pUrbCharOut, pdx->udev,
 					  usb_sndbulkpipe(pdx->udev,
 							  pdx->epAddr[0]),
@@ -410,11 +408,11 @@
 			    URB_NO_TRANSFER_DMA_MAP;
 			usb_anchor_urb(pdx->pUrbCharOut, &pdx->submitted);
 			iReturn = usb_submit_urb(pdx->pUrbCharOut, GFP_KERNEL);
-			spin_lock_irq(&pdx->charOutLock);	// grab lock for errors
+			spin_lock_irq(&pdx->charOutLock);	/*  grab lock for errors */
 			if (iReturn) {
-				pdx->bPipeError[nPipe] = 1;	// Flag an error to be handled later
-				pdx->bSendCharsPending = false;	// Allow other threads again
-				usb_unanchor_urb(pdx->pUrbCharOut);	// remove from list of active urbs
+				pdx->bPipeError[nPipe] = 1;	/*  Flag an error to be handled later */
+				pdx->bSendCharsPending = false;	/*  Allow other threads again */
+				usb_unanchor_urb(pdx->pUrbCharOut);	/*  remove from list of active urbs */
 			}
 		}
 	} else if (pdx->bSendCharsPending && (pdx->dwNumOutput > 0))
@@ -422,7 +420,7 @@
 			"SendChars bSendCharsPending:true");
 
 	dev_dbg(&pdx->interface->dev, "SendChars exit code: %d", iReturn);
-	spin_unlock_irq(&pdx->charOutLock);	// Now let go of the spinlock
+	spin_unlock_irq(&pdx->charOutLock);	/*  Now let go of the spinlock */
 	return iReturn;
 }
 
@@ -440,14 +438,14 @@
 ** pdx  Is our device extension which holds all we know about the transfer.
 ** n    The number of bytes to move one way or the other.
 ***************************************************************************/
-static void CopyUserSpace(DEVICE_EXTENSION * pdx, int n)
+static void CopyUserSpace(DEVICE_EXTENSION *pdx, int n)
 {
 	unsigned int nArea = pdx->StagedId;
 	if (nArea < MAX_TRANSAREAS) {
-		TRANSAREA *pArea = &pdx->rTransDef[nArea];	// area to be used
+		TRANSAREA *pArea = &pdx->rTransDef[nArea];	/*  area to be used */
 		unsigned int dwOffset =
 		    pdx->StagedDone + pdx->StagedOffset + pArea->dwBaseOffset;
-		char *pCoherBuf = pdx->pCoherStagedIO;	// coherent buffer
+		char *pCoherBuf = pdx->pCoherStagedIO;	/*  coherent buffer */
 		if (!pArea->bUsed) {
 			dev_err(&pdx->interface->dev, "%s area %d unused",
 				__func__, nArea);
@@ -455,15 +453,15 @@
 		}
 
 		while (n) {
-			int nPage = dwOffset >> PAGE_SHIFT;	// page number in table
+			int nPage = dwOffset >> PAGE_SHIFT;	/*  page number in table */
 			if (nPage < pArea->nPages) {
 				char *pvAddress =
 				    (char *)kmap_atomic(pArea->pPages[nPage]);
 				if (pvAddress) {
-					unsigned int uiPageOff = dwOffset & (PAGE_SIZE - 1);	// offset into the page
-					size_t uiXfer = PAGE_SIZE - uiPageOff;	// max to transfer on this page
-					if (uiXfer > n)	// limit byte count if too much
-						uiXfer = n;	// for the page
+					unsigned int uiPageOff = dwOffset & (PAGE_SIZE - 1);	/*  offset into the page */
+					size_t uiXfer = PAGE_SIZE - uiPageOff;	/*  max to transfer on this page */
+					if (uiXfer > n)	/*  limit byte count if too much */
+						uiXfer = n;	/*  for the page */
 					if (pdx->StagedRead)
 						memcpy(pvAddress + uiPageOff,
 						       pCoherBuf, uiXfer);
@@ -494,8 +492,8 @@
 			nArea);
 }
 
-// Forward declarations for stuff used circularly
-static int StageChunk(DEVICE_EXTENSION * pdx);
+/*  Forward declarations for stuff used circularly */
+static int StageChunk(DEVICE_EXTENSION *pdx);
 /***************************************************************************
 ** ReadWrite_Complete
 **
@@ -504,14 +502,14 @@
 static void staged_callback(struct urb *pUrb)
 {
 	DEVICE_EXTENSION *pdx = pUrb->context;
-	unsigned int nGot = pUrb->actual_length;	// what we transferred
+	unsigned int nGot = pUrb->actual_length;	/*  what we transferred */
 	bool bCancel = false;
-	bool bRestartCharInput;	// used at the end
+	bool bRestartCharInput;	/*  used at the end */
 
-	spin_lock(&pdx->stagedLock);	// stop ReadWriteMem() action while this routine is running
-	pdx->bStagedUrbPending = false;	// clear the flag for staged IRP pending
+	spin_lock(&pdx->stagedLock);	/*  stop ReadWriteMem() action while this routine is running */
+	pdx->bStagedUrbPending = false;	/*  clear the flag for staged IRP pending */
 
-	if (pUrb->status) {	// sync/async unlink faults aren't errors
+	if (pUrb->status) {	/*  sync/async unlink faults aren't errors */
 		if (!
 		    (pUrb->status == -ENOENT || pUrb->status == -ECONNRESET
 		     || pUrb->status == -ESHUTDOWN)) {
@@ -525,40 +523,37 @@
 		spin_lock(&pdx->err_lock);
 		pdx->errors = pUrb->status;
 		spin_unlock(&pdx->err_lock);
-		nGot = 0;	//  and tidy up again if so
+		nGot = 0;	/*   and tidy up again if so */
 		bCancel = true;
 	} else {
 		dev_dbg(&pdx->interface->dev, "%s %d chars xferred", __func__,
 			nGot);
-		if (pdx->StagedRead)	// if reading, save to user space
-			CopyUserSpace(pdx, nGot);	// copy from buffer to user
+		if (pdx->StagedRead)	/*  if reading, save to user space */
+			CopyUserSpace(pdx, nGot);	/*  copy from buffer to user */
 		if (nGot == 0)
 			dev_dbg(&pdx->interface->dev, "%s ZLP", __func__);
 	}
 
-	// Update the transfer length based on the TransferBufferLength value in the URB
+	/*  Update the transfer length based on the TransferBufferLength value in the URB */
 	pdx->StagedDone += nGot;
 
 	dev_dbg(&pdx->interface->dev, "%s, done %d bytes of %d", __func__,
 		pdx->StagedDone, pdx->StagedLength);
 
-	if ((pdx->StagedDone == pdx->StagedLength) ||	// If no more to do
-	    (bCancel))		// or this IRP was cancelled
-	{
-		TRANSAREA *pArea = &pdx->rTransDef[pdx->StagedId];	// Transfer area info
+	if ((pdx->StagedDone == pdx->StagedLength) ||	/*  If no more to do */
+	    (bCancel)) {		/*  or this IRP was cancelled */
+		TRANSAREA *pArea = &pdx->rTransDef[pdx->StagedId];	/*  Transfer area info */
 		dev_dbg(&pdx->interface->dev,
 			"%s transfer done, bytes %d, cancel %d", __func__,
 			pdx->StagedDone, bCancel);
 
-		// Here is where we sort out what to do with this transfer if using a circular buffer. We have
-		//  a completed transfer that can be assumed to fit into the transfer area. We should be able to
-		//  add this to the end of a growing block or to use it to start a new block unless the code
-		//  that calculates the offset to use (in ReadWriteMem) is totally duff.
-		if ((pArea->bCircular) && (pArea->bCircToHost) && (!bCancel) &&	// Time to sort out circular buffer info?
-		    (pdx->StagedRead))	// Only for tohost transfers for now
-		{
-			if (pArea->aBlocks[1].dwSize > 0)	// If block 1 is in use we must append to it
-			{
+		/*  Here is where we sort out what to do with this transfer if using a circular buffer. We have */
+		/*   a completed transfer that can be assumed to fit into the transfer area. We should be able to */
+		/*   add this to the end of a growing block or to use it to start a new block unless the code */
+		/*   that calculates the offset to use (in ReadWriteMem) is totally duff. */
+		if ((pArea->bCircular) && (pArea->bCircToHost) && (!bCancel) &&	/*  Time to sort out circular buffer info? */
+		    (pdx->StagedRead)) {	/*  Only for tohost transfers for now */
+			if (pArea->aBlocks[1].dwSize > 0) {	/*  If block 1 is in use we must append to it */
 				if (pdx->StagedOffset ==
 				    (pArea->aBlocks[1].dwOffset +
 				     pArea->aBlocks[1].dwSize)) {
@@ -569,7 +564,7 @@
 						pArea->aBlocks[1].dwSize,
 						pArea->aBlocks[1].dwOffset);
 				} else {
-					// Here things have gone very, very, wrong, but I cannot see how this can actually be achieved
+					/*  Here things have gone very, very, wrong, but I cannot see how this can actually be achieved */
 					pArea->aBlocks[1].dwOffset =
 					    pdx->StagedOffset;
 					pArea->aBlocks[1].dwSize =
@@ -580,22 +575,20 @@
 						pArea->aBlocks[1].dwSize,
 						pArea->aBlocks[1].dwOffset);
 				}
-			} else	// If block 1 is not used, we try to add to block 0
-			{
-				if (pArea->aBlocks[0].dwSize > 0)	// Got stored block 0 information?
-				{	// Must append onto the existing block 0
+			} else {	/*  If block 1 is not used, we try to add to block 0 */
+				if (pArea->aBlocks[0].dwSize > 0) {	/*  Got stored block 0 information? */
+					/*  Must append onto the existing block 0 */
 					if (pdx->StagedOffset ==
 					    (pArea->aBlocks[0].dwOffset +
 					     pArea->aBlocks[0].dwSize)) {
-						pArea->aBlocks[0].dwSize += pdx->StagedLength;	// Just add this transfer in
+						pArea->aBlocks[0].dwSize += pdx->StagedLength;	/*  Just add this transfer in */
 						dev_dbg(&pdx->interface->dev,
 							"RWM_Complete, circ block 0 now %d bytes at %d",
 							pArea->aBlocks[0].
 							dwSize,
 							pArea->aBlocks[0].
 							dwOffset);
-					} else	// If it doesn't append, put into new block 1
-					{
+					} else {	/*  If it doesn't append, put into new block 1 */
 						pArea->aBlocks[1].dwOffset =
 						    pdx->StagedOffset;
 						pArea->aBlocks[1].dwSize =
@@ -607,8 +600,7 @@
 							pArea->aBlocks[1].
 							dwOffset);
 					}
-				} else	// No info stored yet, just save in block 0
-				{
+				} else	{ /*  No info stored yet, just save in block 0 */
 					pArea->aBlocks[0].dwOffset =
 					    pdx->StagedOffset;
 					pArea->aBlocks[0].dwSize =
@@ -621,21 +613,19 @@
 			}
 		}
 
-		if (!bCancel)	// Don't generate an event if cancelled
-		{
+		if (!bCancel) { /*  Don't generate an event if cancelled */
 			dev_dbg(&pdx->interface->dev,
 				"RWM_Complete,  bCircular %d, bToHost %d, eStart %d, eSize %d",
 				pArea->bCircular, pArea->bEventToHost,
 				pArea->dwEventSt, pArea->dwEventSz);
-			if ((pArea->dwEventSz) &&	// Set a user-mode event...
-			    (pdx->StagedRead == pArea->bEventToHost))	// ...on transfers in this direction?
-			{
-				int iWakeUp = 0;	// assume
-				// If we have completed the right sort of DMA transfer then set the event to notify
-				//   the user code to wake up anyone that is waiting.
-				if ((pArea->bCircular) &&	// Circular areas use a simpler test
-				    (pArea->bCircToHost))	// only in supported direction
-				{	// Is total data waiting up to size limit?
+			if ((pArea->dwEventSz) &&	/*  Set a user-mode event... */
+			    (pdx->StagedRead == pArea->bEventToHost)) {	/*  ...on transfers in this direction? */
+				int iWakeUp = 0;	/*  assume */
+				/*  If we have completed the right sort of DMA transfer then set the event to notify */
+				/*    the user code to wake up anyone that is waiting. */
+				if ((pArea->bCircular) &&	/*  Circular areas use a simpler test */
+				    (pArea->bCircToHost)) {	/*  only in supported direction */
+					/*  Is total data waiting up to size limit? */
 					unsigned int dwTotal =
 					    pArea->aBlocks[0].dwSize +
 					    pArea->aBlocks[1].dwSize;
@@ -653,19 +643,17 @@
 				if (iWakeUp) {
 					dev_dbg(&pdx->interface->dev,
 						"About to set event to notify app");
-					wake_up_interruptible(&pArea->wqEvent);	// wake up waiting processes
-					++pArea->iWakeUp;	// increment wakeup count
+					wake_up_interruptible(&pArea->wqEvent);	/*  wake up waiting processes */
+					++pArea->iWakeUp;	/*  increment wakeup count */
 				}
 			}
 		}
 
-		pdx->dwDMAFlag = MODE_CHAR;	// Switch back to char mode before ReadWriteMem call
+		pdx->dwDMAFlag = MODE_CHAR;	/*  Switch back to char mode before ReadWriteMem call */
 
-		if (!bCancel)	// Don't look for waiting transfer if cancelled
-		{
-			// If we have a transfer waiting, kick it off
-			if (pdx->bXFerWaiting)	// Got a block xfer waiting?
-			{
+		if (!bCancel) {	/*  Don't look for waiting transfer if cancelled */
+			/*  If we have a transfer waiting, kick it off */
+			if (pdx->bXFerWaiting) {	/*  Got a block xfer waiting? */
 				int iReturn;
 				dev_info(&pdx->interface->dev,
 					 "*** RWM_Complete *** pending transfer will now be set up!!!");
@@ -682,22 +670,22 @@
 			}
 		}
 
-	} else			// Here for more to do
-		StageChunk(pdx);	// fire off the next bit
+	} else			/*  Here for more to do */
+		StageChunk(pdx);	/*  fire off the next bit */
 
-	// While we hold the stagedLock, see if we should reallow character input ints
-	// Don't allow if cancelled, or if a new block has started or if there is a waiting block.
-	// This feels wrong as we should ask which spin lock protects dwDMAFlag.
+	/*  While we hold the stagedLock, see if we should reallow character input ints */
+	/*  Don't allow if cancelled, or if a new block has started or if there is a waiting block. */
+	/*  This feels wrong as we should ask which spin lock protects dwDMAFlag. */
 	bRestartCharInput = !bCancel && (pdx->dwDMAFlag == MODE_CHAR)
 	    && !pdx->bXFerWaiting;
 
-	spin_unlock(&pdx->stagedLock);	// Finally release the lock again
+	spin_unlock(&pdx->stagedLock);	/*  Finally release the lock again */
 
-	// This is not correct as dwDMAFlag is protected by the staged lock, but it is treated
-	// in Allowi as if it were protected by the char lock. In any case, most systems will
-	// not be upset by char input during DMA... sigh. Needs sorting out.
-	if (bRestartCharInput)	// may be out of date, but...
-		Allowi(pdx);	// ...Allowi tests a lock too.
+	/*  This is not correct as dwDMAFlag is protected by the staged lock, but it is treated */
+	/*  in Allowi as if it were protected by the char lock. In any case, most systems will */
+	/*  not be upset by char input during DMA... sigh. Needs sorting out. */
+	if (bRestartCharInput)	/*  may be out of date, but... */
+		Allowi(pdx);	/*  ...Allowi tests a lock too. */
 	dev_dbg(&pdx->interface->dev, "%s done", __func__);
 }
 
@@ -709,29 +697,28 @@
 ** The calling code must have acquired the staging spinlock before calling
 **  this function, and is responsible for releasing it. We are at callback level.
 ****************************************************************************/
-static int StageChunk(DEVICE_EXTENSION * pdx)
+static int StageChunk(DEVICE_EXTENSION *pdx)
 {
 	int iReturn = U14ERR_NOERROR;
 	unsigned int ChunkSize;
-	int nPipe = pdx->StagedRead ? 3 : 2;	// The pipe number to use for reads or writes
+	int nPipe = pdx->StagedRead ? 3 : 2;	/*  The pipe number to use for reads or writes */
 	if (pdx->nPipes == 3)
-		nPipe--;	// Adjust for the 3-pipe case
-	if (nPipe < 0)		// and trap case that should never happen
+		nPipe--;	/*  Adjust for the 3-pipe case */
+	if (nPipe < 0)		/*  and trap case that should never happen */
 		return U14ERR_FAIL;
 
-	if (!CanAcceptIoRequests(pdx))	// got sudden remove?
-	{
+	if (!CanAcceptIoRequests(pdx)) {	/*  got sudden remove? */
 		dev_info(&pdx->interface->dev, "%s sudden remove, giving up",
 			 __func__);
-		return U14ERR_FAIL;	// could do with a better error
+		return U14ERR_FAIL;	/*  could do with a better error */
 	}
 
-	ChunkSize = (pdx->StagedLength - pdx->StagedDone);	// transfer length remaining
-	if (ChunkSize > STAGED_SZ)	// make sure to keep legal
-		ChunkSize = STAGED_SZ;	//  limit to max allowed
+	ChunkSize = (pdx->StagedLength - pdx->StagedDone);	/*  transfer length remaining */
+	if (ChunkSize > STAGED_SZ)	/*  make sure to keep legal */
+		ChunkSize = STAGED_SZ;	/*   limit to max allowed */
 
-	if (!pdx->StagedRead)	// if writing...
-		CopyUserSpace(pdx, ChunkSize);	// ...copy data into the buffer
+	if (!pdx->StagedRead)	/*  if writing... */
+		CopyUserSpace(pdx, ChunkSize);	/*  ...copy data into the buffer */
 
 	usb_fill_bulk_urb(pdx->pStagedUrb, pdx->udev,
 			  pdx->StagedRead ? usb_rcvbulkpipe(pdx->udev,
@@ -740,15 +727,15 @@
 			  usb_sndbulkpipe(pdx->udev, pdx->epAddr[nPipe]),
 			  pdx->pCoherStagedIO, ChunkSize, staged_callback, pdx);
 	pdx->pStagedUrb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP;
-	usb_anchor_urb(pdx->pStagedUrb, &pdx->submitted);	// in case we need to kill it
+	usb_anchor_urb(pdx->pStagedUrb, &pdx->submitted);	/*  in case we need to kill it */
 	iReturn = usb_submit_urb(pdx->pStagedUrb, GFP_ATOMIC);
 	if (iReturn) {
-		usb_unanchor_urb(pdx->pStagedUrb);	// kill it
-		pdx->bPipeError[nPipe] = 1;	// Flag an error to be handled later
+		usb_unanchor_urb(pdx->pStagedUrb);	/*  kill it */
+		pdx->bPipeError[nPipe] = 1;	/*  Flag an error to be handled later */
 		dev_err(&pdx->interface->dev, "%s submit urb failed, code %d",
 			__func__, iReturn);
 	} else
-		pdx->bStagedUrbPending = true;	// Set the flag for staged URB pending
+		pdx->bStagedUrbPending = true;	/*  Set the flag for staged URB pending */
 	dev_dbg(&pdx->interface->dev, "%s done so far:%d, this size:%d",
 		__func__, pdx->StagedDone, ChunkSize);
 
@@ -772,13 +759,12 @@
 **             transfer.
 **    dwLen - the number of bytes to transfer.
 */
-int ReadWriteMem(DEVICE_EXTENSION * pdx, bool Read, unsigned short wIdent,
+int ReadWriteMem(DEVICE_EXTENSION *pdx, bool Read, unsigned short wIdent,
 		 unsigned int dwOffs, unsigned int dwLen)
 {
-	TRANSAREA *pArea = &pdx->rTransDef[wIdent];	// Transfer area info
+	TRANSAREA *pArea = &pdx->rTransDef[wIdent];	/*  Transfer area info */
 
-	if (!CanAcceptIoRequests(pdx))	// Are we in a state to accept new requests?
-	{
+	if (!CanAcceptIoRequests(pdx)) {	/*  Are we in a state to accept new requests? */
 		dev_err(&pdx->interface->dev, "%s can't accept requests",
 			__func__);
 		return U14ERR_FAIL;
@@ -788,56 +774,51 @@
 		"%s xfer %d bytes to %s, offset %d, area %d", __func__, dwLen,
 		Read ? "host" : "1401", dwOffs, wIdent);
 
-	// Amazingly, we can get an escape sequence back before the current staged Urb is done, so we
-	//  have to check for this situation and, if so, wait until all is OK.
+	/*  Amazingly, we can get an escape sequence back before the current staged Urb is done, so we */
+	/*   have to check for this situation and, if so, wait until all is OK. */
 	if (pdx->bStagedUrbPending) {
-		pdx->bXFerWaiting = true;	// Flag we are waiting
+		pdx->bXFerWaiting = true;	/*  Flag we are waiting */
 		dev_info(&pdx->interface->dev,
 			 "%s xfer is waiting, as previous staged pending",
 			 __func__);
 		return U14ERR_NOERROR;
 	}
 
-	if (dwLen == 0)		// allow 0-len read or write; just return success
-	{
+	if (dwLen == 0) {		/*  allow 0-len read or write; just return success */
 		dev_dbg(&pdx->interface->dev,
 			"%s OK; zero-len read/write request", __func__);
 		return U14ERR_NOERROR;
 	}
 
-	if ((pArea->bCircular) &&	// Circular transfer?
-	    (pArea->bCircToHost) && (Read))	// In a supported direction
-	{			// If so, we sort out offset ourself
-		bool bWait = false;	// Flag for transfer having to wait
+	if ((pArea->bCircular) &&	/*  Circular transfer? */
+	    (pArea->bCircToHost) && (Read)) {	/*  In a supported direction */
+				/*  If so, we sort out offset ourself */
+		bool bWait = false;	/*  Flag for transfer having to wait */
 
 		dev_dbg(&pdx->interface->dev,
 			"Circular buffers are %d at %d and %d at %d",
 			pArea->aBlocks[0].dwSize, pArea->aBlocks[0].dwOffset,
 			pArea->aBlocks[1].dwSize, pArea->aBlocks[1].dwOffset);
-		if (pArea->aBlocks[1].dwSize > 0)	// Using the second block already?
-		{
-			dwOffs = pArea->aBlocks[1].dwOffset + pArea->aBlocks[1].dwSize;	// take offset from that
-			bWait = (dwOffs + dwLen) > pArea->aBlocks[0].dwOffset;	// Wait if will overwrite block 0?
-			bWait |= (dwOffs + dwLen) > pArea->dwLength;	// or if it overflows the buffer
-		} else		// Area 1 not in use, try to use area 0
-		{
-			if (pArea->aBlocks[0].dwSize == 0)	// Reset block 0 if not in use
+		if (pArea->aBlocks[1].dwSize > 0) {	/*  Using the second block already? */
+			dwOffs = pArea->aBlocks[1].dwOffset + pArea->aBlocks[1].dwSize;	/*  take offset from that */
+			bWait = (dwOffs + dwLen) > pArea->aBlocks[0].dwOffset;	/*  Wait if will overwrite block 0? */
+			bWait |= (dwOffs + dwLen) > pArea->dwLength;	/*  or if it overflows the buffer */
+		} else {		/*  Area 1 not in use, try to use area 0 */
+			if (pArea->aBlocks[0].dwSize == 0)	/*  Reset block 0 if not in use */
 				pArea->aBlocks[0].dwOffset = 0;
 			dwOffs =
 			    pArea->aBlocks[0].dwOffset +
 			    pArea->aBlocks[0].dwSize;
-			if ((dwOffs + dwLen) > pArea->dwLength)	// Off the end of the buffer?
-			{
-				pArea->aBlocks[1].dwOffset = 0;	// Set up to use second block
+			if ((dwOffs + dwLen) > pArea->dwLength) {	/*  Off the end of the buffer? */
+				pArea->aBlocks[1].dwOffset = 0;	/*  Set up to use second block */
 				dwOffs = 0;
-				bWait = (dwOffs + dwLen) > pArea->aBlocks[0].dwOffset;	// Wait if will overwrite block 0?
-				bWait |= (dwOffs + dwLen) > pArea->dwLength;	// or if it overflows the buffer
+				bWait = (dwOffs + dwLen) > pArea->aBlocks[0].dwOffset;	/*  Wait if will overwrite block 0? */
+				bWait |= (dwOffs + dwLen) > pArea->dwLength;	/*  or if it overflows the buffer */
 			}
 		}
 
-		if (bWait)	// This transfer will have to wait?
-		{
-			pdx->bXFerWaiting = true;	// Flag we are waiting
+		if (bWait) {	/*  This transfer will have to wait? */
+			pdx->bXFerWaiting = true;	/*  Flag we are waiting */
 			dev_dbg(&pdx->interface->dev,
 				"%s xfer waiting for circular buffer space",
 				__func__);
@@ -848,17 +829,17 @@
 			"%s circular xfer, %d bytes starting at %d", __func__,
 			dwLen, dwOffs);
 	}
-	// Save the parameters for the read\write transfer
-	pdx->StagedRead = Read;	// Save the parameters for this read
-	pdx->StagedId = wIdent;	// ID allows us to get transfer area info
-	pdx->StagedOffset = dwOffs;	// The area within the transfer area
+	/*  Save the parameters for the read\write transfer */
+	pdx->StagedRead = Read;	/*  Save the parameters for this read */
+	pdx->StagedId = wIdent;	/*  ID allows us to get transfer area info */
+	pdx->StagedOffset = dwOffs;	/*  The area within the transfer area */
 	pdx->StagedLength = dwLen;
-	pdx->StagedDone = 0;	// Initialise the byte count
-	pdx->dwDMAFlag = MODE_LINEAR;	// Set DMA mode flag at this point
-	pdx->bXFerWaiting = false;	// Clearly not a transfer waiting now
+	pdx->StagedDone = 0;	/*  Initialise the byte count */
+	pdx->dwDMAFlag = MODE_LINEAR;	/*  Set DMA mode flag at this point */
+	pdx->bXFerWaiting = false;	/*  Clearly not a transfer waiting now */
 
-//    KeClearEvent(&pdx->StagingDoneEvent);           // Clear the transfer done event
-	StageChunk(pdx);	// fire off the first chunk
+/*     KeClearEvent(&pdx->StagingDoneEvent);           // Clear the transfer done event */
+	StageChunk(pdx);	/*  fire off the first chunk */
 
 	return U14ERR_NOERROR;
 }
@@ -877,12 +858,11 @@
 	bool bRead = false;
 	unsigned int dDone = *pdDone;
 
-	if (dDone < dGot)	// If there is more data
-	{
-		*pChar = (unsigned char)pBuf[dDone];	// Extract the next char
-		dDone++;	// Increment the done count
+	if (dDone < dGot) {	/*  If there is more data */
+		*pChar = (unsigned char)pBuf[dDone];	/*  Extract the next char */
+		dDone++;	/*  Increment the done count */
 		*pdDone = dDone;
-		bRead = true;	// and flag success
+		bRead = true;	/*  and flag success */
 	}
 
 	return bRead;
@@ -962,32 +942,32 @@
 **  we start handling the data at offset zero.
 **
 *****************************************************************************/
-static bool ReadDMAInfo(volatile DMADESC * pDmaDesc, DEVICE_EXTENSION * pdx,
+static bool ReadDMAInfo(volatile DMADESC *pDmaDesc, DEVICE_EXTENSION *pdx,
 			char *pBuf, unsigned int dwCount)
 {
-	bool bResult = false;	// assume we won't succeed
+	bool bResult = false;	/*  assume we won't succeed */
 	unsigned char ucData;
-	unsigned int dDone = 0;	// We haven't parsed anything so far
+	unsigned int dDone = 0;	/*  We haven't parsed anything so far */
 
 	dev_dbg(&pdx->interface->dev, "%s", __func__);
 
 	if (ReadChar(&ucData, pBuf, &dDone, dwCount)) {
-		unsigned char ucTransCode = (ucData & 0x0F);	// get code for transfer type
-		unsigned short wIdent = ((ucData >> 4) & 0x07);	// and area identifier
+		unsigned char ucTransCode = (ucData & 0x0F);	/*  get code for transfer type */
+		unsigned short wIdent = ((ucData >> 4) & 0x07);	/*  and area identifier */
 
-		// fill in the structure we were given
-		pDmaDesc->wTransType = ucTransCode;	// type of transfer
-		pDmaDesc->wIdent = wIdent;	// area to use
-		pDmaDesc->dwSize = 0;	// initialise other bits
+		/*  fill in the structure we were given */
+		pDmaDesc->wTransType = ucTransCode;	/*  type of transfer */
+		pDmaDesc->wIdent = wIdent;	/*  area to use */
+		pDmaDesc->dwSize = 0;	/*  initialise other bits */
 		pDmaDesc->dwOffset = 0;
 
 		dev_dbg(&pdx->interface->dev, "%s type: %d ident: %d", __func__,
 			pDmaDesc->wTransType, pDmaDesc->wIdent);
 
-		pDmaDesc->bOutWard = (ucTransCode != TM_EXTTOHOST);	// set transfer direction
+		pDmaDesc->bOutWard = (ucTransCode != TM_EXTTOHOST);	/*  set transfer direction */
 
 		switch (ucTransCode) {
-		case TM_EXTTOHOST:	// Extended linear transfer modes (the only ones!)
+		case TM_EXTTOHOST:	/*  Extended linear transfer modes (the only ones!) */
 		case TM_EXTTO1401:
 			{
 				bResult =
@@ -1001,14 +981,14 @@
 						__func__, pDmaDesc->dwOffset,
 						pDmaDesc->dwSize);
 
-					if ((wIdent >= MAX_TRANSAREAS) ||	// Illegal area number, or...
-					    (!pdx->rTransDef[wIdent].bUsed) ||	// area not set up, or...
-					    (pDmaDesc->dwOffset > pdx->rTransDef[wIdent].dwLength) ||	// range/size
+					if ((wIdent >= MAX_TRANSAREAS) ||	/*  Illegal area number, or... */
+					    (!pdx->rTransDef[wIdent].bUsed) ||	/*  area not set up, or... */
+					    (pDmaDesc->dwOffset > pdx->rTransDef[wIdent].dwLength) ||	/*  range/size */
 					    ((pDmaDesc->dwOffset +
 					      pDmaDesc->dwSize) >
 					     (pdx->rTransDef[wIdent].
 					      dwLength))) {
-						bResult = false;	// bad parameter(s)
+						bResult = false;	/*  bad parameter(s) */
 						dev_dbg(&pdx->interface->dev,
 							"%s bad param - id %d, bUsed %d, offset %d, size %d, area length %d",
 							__func__, wIdent,
@@ -1028,7 +1008,7 @@
 	} else
 		bResult = false;
 
-	if (!bResult)		// now check parameters for validity
+	if (!bResult)		/*  now check parameters for validity */
 		dev_err(&pdx->interface->dev, "%s error reading Esc sequence",
 			__func__);
 
@@ -1049,30 +1029,29 @@
 **           this is known to be at least 2 or we will not be called.
 **
 ****************************************************************************/
-static int Handle1401Esc(DEVICE_EXTENSION * pdx, char *pCh,
+static int Handle1401Esc(DEVICE_EXTENSION *pdx, char *pCh,
 			 unsigned int dwCount)
 {
 	int iReturn = U14ERR_FAIL;
 
-	// I have no idea what this next test is about. '?' is 0x3f, which is area 3, code
-	// 15. At the moment, this is not used, so it does no harm, but unless someone can
-	// tell me what this is for, it should be removed from this and the Windows driver.
-	if (pCh[0] == '?')	// Is this an information response
-	{			// Parse and save the information
+	/*  I have no idea what this next test is about. '?' is 0x3f, which is area 3, code */
+	/*  15. At the moment, this is not used, so it does no harm, but unless someone can */
+	/*  tell me what this is for, it should be removed from this and the Windows driver. */
+	if (pCh[0] == '?') {	/*  Is this an information response */
+				/*  Parse and save the information */
 	} else {
-		spin_lock(&pdx->stagedLock);	// Lock others out
+		spin_lock(&pdx->stagedLock);	/*  Lock others out */
 
-		if (ReadDMAInfo(&pdx->rDMAInfo, pdx, pCh, dwCount))	// Get DMA parameters
-		{
-			unsigned short wTransType = pdx->rDMAInfo.wTransType;	// check transfer type
+		if (ReadDMAInfo(&pdx->rDMAInfo, pdx, pCh, dwCount)) {	/*  Get DMA parameters */
+			unsigned short wTransType = pdx->rDMAInfo.wTransType;	/*  check transfer type */
 
 			dev_dbg(&pdx->interface->dev,
 				"%s xfer to %s, offset %d, length %d", __func__,
 				pdx->rDMAInfo.bOutWard ? "1401" : "host",
 				pdx->rDMAInfo.dwOffset, pdx->rDMAInfo.dwSize);
 
-			if (pdx->bXFerWaiting)	// Check here for badly out of kilter...
-			{	// This can never happen, really
+			if (pdx->bXFerWaiting) { /*  Check here for badly out of kilter... */
+				/*  This can never happen, really */
 				dev_err(&pdx->interface->dev,
 					"ERROR: DMA setup while transfer still waiting");
 				spin_unlock(&pdx->stagedLock);
@@ -1090,16 +1069,16 @@
 						dev_err(&pdx->interface->dev,
 							"%s ReadWriteMem() failed %d",
 							__func__, iReturn);
-				} else	// This covers non-linear transfer setup
+				} else	/*  This covers non-linear transfer setup */
 					dev_err(&pdx->interface->dev,
 						"%s Unknown block xfer type %d",
 						__func__, wTransType);
 			}
-		} else		// Failed to read parameters
+		} else		/*  Failed to read parameters */
 			dev_err(&pdx->interface->dev, "%s ReadDMAInfo() fail",
 				__func__);
 
-		spin_unlock(&pdx->stagedLock);	// OK here
+		spin_unlock(&pdx->stagedLock);	/*  OK here */
 	}
 
 	dev_dbg(&pdx->interface->dev, "%s returns %d", __func__, iReturn);
@@ -1113,12 +1092,11 @@
 static void ced_readchar_callback(struct urb *pUrb)
 {
 	DEVICE_EXTENSION *pdx = pUrb->context;
-	int nGot = pUrb->actual_length;	// what we transferred
+	int nGot = pUrb->actual_length;	/*  what we transferred */
 
-	if (pUrb->status)	// Do we have a problem to handle?
-	{
-		int nPipe = pdx->nPipes == 4 ? 1 : 0;	// The pipe number to use for error
-		// sync/async unlink faults aren't errors... just saying device removed or stopped
+	if (pUrb->status) {	/*  Do we have a problem to handle? */
+		int nPipe = pdx->nPipes == 4 ? 1 : 0;	/*  The pipe number to use for error */
+		/*  sync/async unlink faults aren't errors... just saying device removed or stopped */
 		if (!
 		    (pUrb->status == -ENOENT || pUrb->status == -ECONNRESET
 		     || pUrb->status == -ESHUTDOWN)) {
@@ -1133,27 +1111,26 @@
 		spin_lock(&pdx->err_lock);
 		pdx->errors = pUrb->status;
 		spin_unlock(&pdx->err_lock);
-		nGot = 0;	//  and tidy up again if so
+		nGot = 0;	/*   and tidy up again if so */
 
-		spin_lock(&pdx->charInLock);	// already at irq level
-		pdx->bPipeError[nPipe] = 1;	// Flag an error for later
+		spin_lock(&pdx->charInLock);	/*  already at irq level */
+		pdx->bPipeError[nPipe] = 1;	/*  Flag an error for later */
 	} else {
-		if ((nGot > 1) && ((pdx->pCoherCharIn[0] & 0x7f) == 0x1b))	// Esc sequence?
-		{
-			Handle1401Esc(pdx, &pdx->pCoherCharIn[1], nGot - 1);	// handle it
-			spin_lock(&pdx->charInLock);	// already at irq level
+		if ((nGot > 1) && ((pdx->pCoherCharIn[0] & 0x7f) == 0x1b)) {	/*  Esc sequence? */
+			Handle1401Esc(pdx, &pdx->pCoherCharIn[1], nGot - 1);	/*  handle it */
+			spin_lock(&pdx->charInLock);	/*  already at irq level */
 		} else {
-			spin_lock(&pdx->charInLock);	// already at irq level
+			spin_lock(&pdx->charInLock);	/*  already at irq level */
 			if (nGot > 0) {
 				unsigned int i;
 				if (nGot < INBUF_SZ) {
-					pdx->pCoherCharIn[nGot] = 0;	// tidy the string
+					pdx->pCoherCharIn[nGot] = 0;	/*  tidy the string */
 					dev_dbg(&pdx->interface->dev,
 						"%s got %d chars >%s<",
 						__func__, nGot,
 						pdx->pCoherCharIn);
 				}
-				// We know that whatever we read must fit in the input buffer
+				/*  We know that whatever we read must fit in the input buffer */
 				for (i = 0; i < nGot; i++) {
 					pdx->inputBuffer[pdx->dwInBuffPut++] =
 					    pdx->pCoherCharIn[i] & 0x7F;
@@ -1162,17 +1139,17 @@
 				}
 
 				if ((pdx->dwNumInput + nGot) <= INBUF_SZ)
-					pdx->dwNumInput += nGot;	// Adjust the buffer count accordingly
+					pdx->dwNumInput += nGot;	/*  Adjust the buffer count accordingly */
 			} else
 				dev_dbg(&pdx->interface->dev, "%s read ZLP",
 					__func__);
 		}
 	}
 
-	pdx->bReadCharsPending = false;	// No longer have a pending read
-	spin_unlock(&pdx->charInLock);	// already at irq level
+	pdx->bReadCharsPending = false;	/*  No longer have a pending read */
+	spin_unlock(&pdx->charInLock);	/*  already at irq level */
 
-	Allowi(pdx);	// see if we can do the next one
+	Allowi(pdx);	/*  see if we can do the next one */
 }
 
 /****************************************************************************
@@ -1182,25 +1159,25 @@
 ** we can pick up any inward transfers. This can be called in multiple contexts
 ** so we use the irqsave version of the spinlock.
 ****************************************************************************/
-int Allowi(DEVICE_EXTENSION * pdx)
+int Allowi(DEVICE_EXTENSION *pdx)
 {
 	int iReturn = U14ERR_NOERROR;
 	unsigned long flags;
-	spin_lock_irqsave(&pdx->charInLock, flags);	// can be called in multiple contexts
+	spin_lock_irqsave(&pdx->charInLock, flags);	/*  can be called in multiple contexts */
 
-	// We don't want char input running while DMA is in progress as we know that this
-	//  can cause sequencing problems for the 2270. So don't. It will also allow the
-	//  ERR response to get back to the host code too early on some PCs, even if there
-	//  is no actual driver failure, so we don't allow this at all.
-	if (!pdx->bInDrawDown &&	// stop input if
-	    !pdx->bReadCharsPending &&	// If no read request outstanding
-	    (pdx->dwNumInput < (INBUF_SZ / 2)) &&	//  and there is some space
-	    (pdx->dwDMAFlag == MODE_CHAR) &&	//  not doing any DMA
-	    (!pdx->bXFerWaiting) &&	//  no xfer waiting to start
-	    (CanAcceptIoRequests(pdx)))	//  and activity is generally OK
-	{			//  then off we go
-		unsigned int nMax = INBUF_SZ - pdx->dwNumInput;	// max we could read
-		int nPipe = pdx->nPipes == 4 ? 1 : 0;	// The pipe number to use
+	/*  We don't want char input running while DMA is in progress as we know that this */
+	/*   can cause sequencing problems for the 2270. So don't. It will also allow the */
+	/*   ERR response to get back to the host code too early on some PCs, even if there */
+	/*   is no actual driver failure, so we don't allow this at all. */
+	if (!pdx->bInDrawDown &&	/*  stop input if */
+	    !pdx->bReadCharsPending &&	/*  If no read request outstanding */
+	    (pdx->dwNumInput < (INBUF_SZ / 2)) &&	/*   and there is some space */
+	    (pdx->dwDMAFlag == MODE_CHAR) &&	/*   not doing any DMA */
+	    (!pdx->bXFerWaiting) &&	/*   no xfer waiting to start */
+	    (CanAcceptIoRequests(pdx)))	{ /*   and activity is generally OK */
+				/*   then off we go */
+		unsigned int nMax = INBUF_SZ - pdx->dwNumInput;	/*  max we could read */
+		int nPipe = pdx->nPipes == 4 ? 1 : 0;	/*  The pipe number to use */
 
 		dev_dbg(&pdx->interface->dev, "%s %d chars in input buffer",
 			__func__, pdx->dwNumInput);
@@ -1209,16 +1186,16 @@
 				 usb_rcvintpipe(pdx->udev, pdx->epAddr[nPipe]),
 				 pdx->pCoherCharIn, nMax, ced_readchar_callback,
 				 pdx, pdx->bInterval);
-		pdx->pUrbCharIn->transfer_flags |= URB_NO_TRANSFER_DMA_MAP;	// short xfers are OK by default
-		usb_anchor_urb(pdx->pUrbCharIn, &pdx->submitted);	// in case we need to kill it
+		pdx->pUrbCharIn->transfer_flags |= URB_NO_TRANSFER_DMA_MAP;	/*  short xfers are OK by default */
+		usb_anchor_urb(pdx->pUrbCharIn, &pdx->submitted);	/*  in case we need to kill it */
 		iReturn = usb_submit_urb(pdx->pUrbCharIn, GFP_ATOMIC);
 		if (iReturn) {
-			usb_unanchor_urb(pdx->pUrbCharIn);	// remove from list of active Urbs
-			pdx->bPipeError[nPipe] = 1;	// Flag an error to be handled later
+			usb_unanchor_urb(pdx->pUrbCharIn);	/*  remove from list of active Urbs */
+			pdx->bPipeError[nPipe] = 1;	/*  Flag an error to be handled later */
 			dev_err(&pdx->interface->dev,
 				"%s submit urb failed: %d", __func__, iReturn);
 		} else
-			pdx->bReadCharsPending = true;	// Flag that we are active here
+			pdx->bReadCharsPending = true;	/*  Flag that we are active here */
 	}
 
 	spin_unlock_irqrestore(&pdx->charInLock, flags);
@@ -1238,15 +1215,15 @@
 {
 	int err = 0;
 	DEVICE_EXTENSION *pdx = file->private_data;
-	if (!CanAcceptIoRequests(pdx))	// check we still exist
+	if (!CanAcceptIoRequests(pdx))	/*  check we still exist */
 		return -ENODEV;
 
-	// Check that access is allowed, where is is needed. Anything that would have an indeterminate
-	// size will be checked by the specific command.
-	if (_IOC_DIR(cmd) & _IOC_READ)	// read from point of view of user...
-		err = !access_ok(VERIFY_WRITE, (void __user *)ulArg, _IOC_SIZE(cmd));	// is kernel write
-	else if (_IOC_DIR(cmd) & _IOC_WRITE)	// and write from point of view of user...
-		err = !access_ok(VERIFY_READ, (void __user *)ulArg, _IOC_SIZE(cmd));	// is kernel read
+	/*  Check that access is allowed, where it is needed. Anything that would have an indeterminate */
+	/*  size will be checked by the specific command. */
+	if (_IOC_DIR(cmd) & _IOC_READ)	/*  read from point of view of user... */
+		err = !access_ok(VERIFY_WRITE, (void __user *)ulArg, _IOC_SIZE(cmd));	/*  is kernel write */
+	else if (_IOC_DIR(cmd) & _IOC_WRITE)	/*  and write from point of view of user... */
+		err = !access_ok(VERIFY_READ, (void __user *)ulArg, _IOC_SIZE(cmd));	/*  is kernel read */
 	if (err)
 		return -EFAULT;
 
@@ -1289,7 +1266,7 @@
 		return -1;
 
 	case _IOC_NR(IOCTL_CED_GETDRIVERREVISION):
-		return (2 << 24) | (DRIVERMAJREV << 16) | DRIVERMINREV;	// USB | MAJOR | MINOR
+		return (2 << 24) | (DRIVERMAJREV << 16) | DRIVERMINREV;	/*  USB | MAJOR | MINOR */
 
 	case _IOC_NR(IOCTL_CED_GETTRANSFER):
 		return GetTransfer(pdx, (TGET_TX_BLOCK __user *) ulArg);
@@ -1335,7 +1312,7 @@
 		return DbgStopLoop(pdx);
 
 	case _IOC_NR(IOCTL_CED_FULLRESET):
-		pdx->bForceReset = true;	// Set a flag for a full reset
+		pdx->bForceReset = true;	/*  Set a flag for a full reset */
 		break;
 
 	case _IOC_NR(IOCTL_CED_SETCIRCULAR):
@@ -1378,8 +1355,8 @@
 	.minor_base = USB_CED_MINOR_BASE,
 };
 
-// Check that the device that matches a 1401 vendor and product ID is OK to use and
-// initialise our DEVICE_EXTENSION.
+/*  Check that the device that matches a 1401 vendor and product ID is OK to use and */
+/*  initialise our DEVICE_EXTENSION. */
 static int ced_probe(struct usb_interface *interface,
 		     const struct usb_device_id *id)
 {
@@ -1389,23 +1366,22 @@
 	int i, bcdDevice;
 	int retval = -ENOMEM;
 
-	// allocate memory for our device extension and initialize it
+	/*  allocate memory for our device extension and initialize it */
 	pdx = kzalloc(sizeof(*pdx), GFP_KERNEL);
 	if (!pdx)
 		goto error;
 
-	for (i = 0; i < MAX_TRANSAREAS; ++i)	// Initialise the wait queues
-	{
+	for (i = 0; i < MAX_TRANSAREAS; ++i) {	/*  Initialise the wait queues */
 		init_waitqueue_head(&pdx->rTransDef[i].wqEvent);
 	}
 
-	// Put initialises for our stuff here. Note that all of *pdx is zero, so
-	// no need to explicitly zero it.
+	/*  Put initialises for our stuff here. Note that all of *pdx is zero, so */
+	/*  no need to explicitly zero it. */
 	spin_lock_init(&pdx->charOutLock);
 	spin_lock_init(&pdx->charInLock);
 	spin_lock_init(&pdx->stagedLock);
 
-	// Initialises from the skeleton stuff
+	/*  Initialises from the skeleton stuff */
 	kref_init(&pdx->kref);
 	mutex_init(&pdx->io_mutex);
 	spin_lock_init(&pdx->err_lock);
@@ -1414,7 +1390,7 @@
 	pdx->udev = usb_get_dev(interface_to_usbdev(interface));
 	pdx->interface = interface;
 
-	// Attempt to identify the device
+	/*  Attempt to identify the device */
 	bcdDevice = pdx->udev->descriptor.bcdDevice;
 	i = (bcdDevice >> 8);
 	if (i == 0)
@@ -1426,8 +1402,8 @@
 			__func__, bcdDevice);
 		goto error;
 	}
-	// set up the endpoint information. We only care about the number of EP as
-	// we know that we are dealing with a 1401 device.
+	/*  set up the endpoint information. We only care about the number of EP as */
+	/*  we know that we are dealing with a 1401 device. */
 	iface_desc = interface->cur_altsetting;
 	pdx->nPipes = iface_desc->desc.bNumEndpoints;
 	dev_info(&interface->dev, "1401Type=%d with %d End Points",
@@ -1435,10 +1411,10 @@
 	if ((pdx->nPipes < 3) || (pdx->nPipes > 4))
 		goto error;
 
-	// Allocate the URBs we hold for performing transfers
-	pdx->pUrbCharOut = usb_alloc_urb(0, GFP_KERNEL);	// character output URB
-	pdx->pUrbCharIn = usb_alloc_urb(0, GFP_KERNEL);	// character input URB
-	pdx->pStagedUrb = usb_alloc_urb(0, GFP_KERNEL);	// block transfer URB
+	/*  Allocate the URBs we hold for performing transfers */
+	pdx->pUrbCharOut = usb_alloc_urb(0, GFP_KERNEL);	/*  character output URB */
+	pdx->pUrbCharIn = usb_alloc_urb(0, GFP_KERNEL);	/*  character input URB */
+	pdx->pStagedUrb = usb_alloc_urb(0, GFP_KERNEL);	/*  block transfer URB */
 	if (!pdx->pUrbCharOut || !pdx->pUrbCharIn || !pdx->pStagedUrb) {
 		dev_err(&interface->dev, "%s URB alloc failed", __func__);
 		goto error;
@@ -1464,15 +1440,14 @@
 		pdx->epAddr[i] = endpoint->bEndpointAddress;
 		dev_info(&interface->dev, "Pipe %d, ep address %02x", i,
 			 pdx->epAddr[i]);
-		if (((pdx->nPipes == 3) && (i == 0)) ||	// if char input end point
+		if (((pdx->nPipes == 3) && (i == 0)) ||	/*  if char input end point */
 		    ((pdx->nPipes == 4) && (i == 1))) {
-			pdx->bInterval = endpoint->bInterval;	// save the endpoint interrupt interval
+			pdx->bInterval = endpoint->bInterval;	/*  save the endpoint interrupt interval */
 			dev_info(&interface->dev, "Pipe %d, bInterval = %d", i,
 				 pdx->bInterval);
 		}
-		// Detect USB2 by checking last ep size (64 if USB1)
-		if (i == pdx->nPipes - 1)	// if this is the last ep (bulk)
-		{
+		/*  Detect USB2 by checking last ep size (64 if USB1) */
+		if (i == pdx->nPipes - 1) {	/*  if this is the last ep (bulk) */
 			pdx->bIsUSB2 =
 			    le16_to_cpu(endpoint->wMaxPacketSize) > 64;
 			dev_info(&pdx->interface->dev, "USB%d",
@@ -1501,7 +1476,7 @@
 
 error:
 	if (pdx)
-		kref_put(&pdx->kref, ced_delete);	// frees allocated memory
+		kref_put(&pdx->kref, ced_delete);	/*  frees allocated memory */
 	return retval;
 }
 
@@ -1511,39 +1486,39 @@
 	int minor = interface->minor;
 	int i;
 
-	usb_set_intfdata(interface, NULL);	// remove the pdx from the interface
-	usb_deregister_dev(interface, &ced_class);	// give back our minor device number
+	usb_set_intfdata(interface, NULL);	/*  remove the pdx from the interface */
+	usb_deregister_dev(interface, &ced_class);	/*  give back our minor device number */
 
-	mutex_lock(&pdx->io_mutex);	// stop more I/O starting while...
-	ced_draw_down(pdx);	// ...wait for then kill any io
+	mutex_lock(&pdx->io_mutex);	/*  stop more I/O starting while... */
+	ced_draw_down(pdx);	/*  ...wait for then kill any io */
 	for (i = 0; i < MAX_TRANSAREAS; ++i) {
-		int iErr = ClearArea(pdx, i);	// ...release any used memory
+		int iErr = ClearArea(pdx, i);	/*  ...release any used memory */
 		if (iErr == U14ERR_UNLOCKFAIL)
 			dev_err(&pdx->interface->dev, "%s Area %d was in used",
 				__func__, i);
 	}
-	pdx->interface = NULL;	// ...we kill off link to interface
+	pdx->interface = NULL;	/*  ...we kill off link to interface */
 	mutex_unlock(&pdx->io_mutex);
 
 	usb_kill_anchored_urbs(&pdx->submitted);
 
-	kref_put(&pdx->kref, ced_delete);	// decrement our usage count
+	kref_put(&pdx->kref, ced_delete);	/*  decrement our usage count */
 
 	dev_info(&interface->dev, "USB cedusb #%d now disconnected", minor);
 }
 
-// Wait for all the urbs we know of to be done with, then kill off any that
-// are left. NBNB we will need to have a mechanism to stop circular xfers
-// from trying to fire off more urbs. We will wait up to 3 seconds for Urbs
-// to be done.
-void ced_draw_down(DEVICE_EXTENSION * pdx)
+/*  Wait for all the urbs we know of to be done with, then kill off any that */
+/*  are left. NBNB we will need to have a mechanism to stop circular xfers */
+/*  from trying to fire off more urbs. We will wait up to 3 seconds for Urbs */
+/*  to be done. */
+void ced_draw_down(DEVICE_EXTENSION *pdx)
 {
 	int time;
 	dev_dbg(&pdx->interface->dev, "%s called", __func__);
 
 	pdx->bInDrawDown = true;
 	time = usb_wait_anchor_empty_timeout(&pdx->submitted, 3000);
-	if (!time) {		// if we timed out we kill the urbs
+	if (!time) {		/*  if we timed out we kill the urbs */
 		usb_kill_anchored_urbs(&pdx->submitted);
 		dev_err(&pdx->interface->dev, "%s timed out", __func__);
 	}

diff --git a/drivers/staging/ced1401/usb1401.h b/drivers/staging/ced1401/usb1401.h
index 8fc6958..f031e3a 100644
--- a/drivers/staging/ced1401/usb1401.h
+++ b/drivers/staging/ced1401/usb1401.h

@@ -26,31 +26,32 @@
 #define UINT unsigned int
 #endif
 
-/// Device type codes, but these don't need to be extended - a succession is assumed
-/// These are set for usb from the bcdDevice field (suitably mangled). Future devices
-/// will be added in order of device creation to the list, so the names here are just
-/// to help use remember which device is which. The U14ERR_... values follow the same
-/// pattern for modern devices.
-#define TYPEUNKNOWN        -1             // dont know
-#define TYPE1401           0              // standard 1401
-#define TYPEPLUS           1              // 1401 plus
-#define TYPEU1401          2              // u1401
-#define TYPEPOWER          3              // Power1401
-#define TYPEU14012         4              // u1401 mkII
-#define TYPEPOWER2         5              // Power1401 mk II
-#define TYPEMICRO3         6              // Micro1401-3
-#define TYPEPOWER3         7              // Power1401-3
+/** Device type codes, but these don't need to be extended - a succession is assumed
+** These are set for usb from the bcdDevice field (suitably mangled). Future devices
+** will be added in order of device creation to the list, so the names here are just
+** to help us remember which device is which. The U14ERR_... values follow the same
+** pattern for modern devices.
+**/
+#define TYPEUNKNOWN        -1             /*  dont know */
+#define TYPE1401           0              /*  standard 1401 */
+#define TYPEPLUS           1              /*  1401 plus */
+#define TYPEU1401          2              /*  u1401 */
+#define TYPEPOWER          3              /*  Power1401 */
+#define TYPEU14012         4              /*  u1401 mkII */
+#define TYPEPOWER2         5              /*  Power1401 mk II */
+#define TYPEMICRO3         6              /*  Micro1401-3 */
+#define TYPEPOWER3         7              /*  Power1401-3 */
 
-/// Some useful defines of constants. DONT FORGET to change the version in the
-/// resources whenever you change it here!.
-#define DRIVERMAJREV      2             // driver revision level major (match windows)
-#define DRIVERMINREV      0             // driver revision level minor
+/*  Some useful defines of constants. DONT FORGET to change the version in the */
+/*  resources whenever you change it here!. */
+#define DRIVERMAJREV      2             /*  driver revision level major (match windows) */
+#define DRIVERMINREV      0             /*  driver revision level minor */
 
-/// Definitions of the various block transfer command codes
-#define TM_EXTTOHOST    8               // extended tohost
-#define TM_EXTTO1401    9               // extended to1401
+/*  Definitions of the various block transfer command codes */
+#define TM_EXTTOHOST    8               /*  extended tohost */
+#define TM_EXTTO1401    9               /*  extended to1401 */
 
-/// Definitions of values in usbReqtype. Used in sorting out setup actions
+/*  Definitions of values in usbReqtype. Used in sorting out setup actions */
 #define H_TO_D 0x00
 #define D_TO_H 0x80
 #define VENDOR 0x40
@@ -58,7 +59,7 @@
 #define INTREQ 0x01
 #define ENDREQ 0x02
 
-/// Definition of values in usbRequest, again used to sort out setup
+/*  Definition of values in usbRequest, again used to sort out setup */
 #define GET_STATUS      0x00
 #define CLEAR_FEATURE   0x01
 #define SET_FEATURE     0x03
@@ -71,8 +72,8 @@
 #define SET_INTERFACE   0x0b
 #define SYNCH_FRAME     0x0c
 
-/// Definitions of the various debug command codes understood by the 1401. These
-/// are used in various vendor-specific commands to achieve the desired effect
+/*  Definitions of the various debug command codes understood by the 1401. These */
+/*  are used in various vendor-specific commands to achieve the desired effect */
 #define DB_GRAB         0x50            /* Grab is a NOP for USB */
 #define DB_FREE         0x51            /* Free is a NOP for the USB */
 #define DB_SETADD       0x52            /* Set debug address (double) */
@@ -91,139 +92,135 @@
 #define CR_CHAR          0x0D           /* The carriage return character */
 #define CR_CHAR_80       0x8d           /*  and with bit 7 set */
 
-/// A structure holding information about a block of memory for use in circular transfers
-typedef struct circBlk
-{
-    volatile UINT dwOffset;             /* Offset within area of block start */
-    volatile UINT dwSize;               /* Size of the block, in bytes (0 = unused) */
+/*  A structure holding information about a block of memory for use in circular transfers */
+typedef struct circBlk {
+	volatile UINT dwOffset;             /* Offset within area of block start */
+	volatile UINT dwSize;               /* Size of the block, in bytes (0 = unused) */
 } CIRCBLK;
 
-/// A structure holding all of the information about a transfer area - an area of
-///  memory set up for use either as a source or destination in DMA transfers.
-typedef struct transarea
-{
-    void*       lpvBuff;                // User address of xfer area saved for completeness
-    UINT        dwBaseOffset;           // offset to start of xfer area in first page
-    UINT        dwLength;               // Length of xfer area, in bytes
-    struct page **pPages;               // Points at array of locked down pages
-    int         nPages;                 // number of pages that are locked down
-    bool        bUsed;                  // Is this structure in use?
-    bool        bCircular;              // Is this area for circular transfers?
-    bool        bCircToHost;            // Flag for direction of circular transfer
-    bool        bEventToHost;           // Set event on transfer to host?
-    int         iWakeUp;                // Set 1 on event, cleared by TestEvent()
-    UINT        dwEventSt;              // Defines section within xfer area for...
-    UINT        dwEventSz;              // ...notification by the event SZ is 0 if unset
-    CIRCBLK     aBlocks[2];             // Info on a pair of circular blocks
-    wait_queue_head_t wqEvent;          // The wait queue for events in this area MUST BE LAST
+/*  A structure holding all of the information about a transfer area - an area of */
+/*   memory set up for use either as a source or destination in DMA transfers. */
+typedef struct transarea {
+	void	*lpvBuff;                /*  User address of xfer area saved for completeness */
+	UINT        dwBaseOffset;           /*  offset to start of xfer area in first page */
+	UINT        dwLength;               /*  Length of xfer area, in bytes */
+	struct page **pPages;               /*  Points at array of locked down pages */
+	int         nPages;                 /*  number of pages that are locked down */
+	bool        bUsed;                  /*  Is this structure in use? */
+	bool        bCircular;              /*  Is this area for circular transfers? */
+	bool        bCircToHost;            /*  Flag for direction of circular transfer */
+	bool        bEventToHost;           /*  Set event on transfer to host? */
+	int         iWakeUp;                /*  Set 1 on event, cleared by TestEvent() */
+	UINT        dwEventSt;              /*  Defines section within xfer area for... */
+	UINT        dwEventSz;              /*  ...notification by the event SZ is 0 if unset */
+	CIRCBLK     aBlocks[2];             /*  Info on a pair of circular blocks */
+	wait_queue_head_t wqEvent;          /*  The wait queue for events in this area MUST BE LAST */
 } TRANSAREA;
 
-/// The DMADESC structure is used to hold information on the transfer in progress. It
-/// is set up by ReadDMAInfo, using information sent by the 1401 in an escape sequence.
-typedef struct dmadesc
-{
-    unsigned short wTransType;          /* transfer type as TM_xxx above        */
-    unsigned short wIdent;              /* identifier word                      */
-    unsigned int   dwSize;              /* bytes to transfer                    */
-    unsigned int   dwOffset;            /* offset into transfer area for trans  */
-    bool           bOutWard;            /* true when data is going TO 1401      */
+/*  The DMADESC structure is used to hold information on the transfer in progress. It */
+/*  is set up by ReadDMAInfo, using information sent by the 1401 in an escape sequence. */
+typedef struct dmadesc {
+	unsigned short wTransType;          /* transfer type as TM_xxx above        */
+	unsigned short wIdent;              /* identifier word                      */
+	unsigned int   dwSize;              /* bytes to transfer                    */
+	unsigned int   dwOffset;            /* offset into transfer area for trans  */
+	bool           bOutWard;            /* true when data is going TO 1401      */
 } DMADESC;
 
 #define INBUF_SZ         256            /* input buffer size */
 #define OUTBUF_SZ        256            /* output buffer size */
-#define STAGED_SZ 0x10000               // size of coherent buffer for staged transfers
+#define STAGED_SZ 0x10000               /*  size of coherent buffer for staged transfers */
 
-/// Structure to hold all of our device specific stuff. We are making this as similar as we
-/// can to the Windows driver to help in our understanding of what is going on.
-typedef struct _DEVICE_EXTENSION
-{
-    char inputBuffer[INBUF_SZ];         /* The two buffers */
-    char outputBuffer[OUTBUF_SZ];       /* accessed by the host functions */
-    volatile unsigned int dwNumInput;   /* num of chars in input buffer   */
-    volatile unsigned int dwInBuffGet;  /* where to get from input buffer */
-    volatile unsigned int dwInBuffPut;  /* where to put into input buffer */
-    volatile unsigned int dwNumOutput;  /* num of chars in output buffer  */
-    volatile unsigned int dwOutBuffGet; /* where to get from output buffer*/
-    volatile unsigned int dwOutBuffPut; /* where to put into output buffer*/
+/*  Structure to hold all of our device specific stuff. We are making this as similar as we */
+/*  can to the Windows driver to help in our understanding of what is going on. */
+typedef struct _DEVICE_EXTENSION {
+	char inputBuffer[INBUF_SZ];         /* The two buffers */
+	char outputBuffer[OUTBUF_SZ];       /* accessed by the host functions */
+	volatile unsigned int dwNumInput;   /* num of chars in input buffer   */
+	volatile unsigned int dwInBuffGet;  /* where to get from input buffer */
+	volatile unsigned int dwInBuffPut;  /* where to put into input buffer */
+	volatile unsigned int dwNumOutput;  /* num of chars in output buffer  */
+	volatile unsigned int dwOutBuffGet; /* where to get from output buffer*/
+	volatile unsigned int dwOutBuffPut; /* where to put into output buffer*/
 
-    volatile bool bSendCharsPending;    /* Flag to indicate sendchar active */
-    volatile bool bReadCharsPending;    /* Flag to indicate a read is primed */
-    char* pCoherCharOut;                /* special aligned buffer for chars to 1401 */
-    struct urb* pUrbCharOut;            /* urb used for chars to 1401 */
-    char* pCoherCharIn;                 /* special aligned buffer for chars to host */
-    struct urb* pUrbCharIn;             /* urb used for chars to host */
+	volatile bool bSendCharsPending;    /* Flag to indicate sendchar active */
+	volatile bool bReadCharsPending;    /* Flag to indicate a read is primed */
+	char *pCoherCharOut;                /* special aligned buffer for chars to 1401 */
+	struct urb *pUrbCharOut;            /* urb used for chars to 1401 */
+	char *pCoherCharIn;                 /* special aligned buffer for chars to host */
+	struct urb *pUrbCharIn;             /* urb used for chars to host */
 
-    spinlock_t charOutLock;             /* to protect the outputBuffer and outputting */
-    spinlock_t charInLock;              /* to protect the inputBuffer and char reads */
-    __u8 bInterval;                     /* Interrupt end point interval */
+	spinlock_t charOutLock;             /* to protect the outputBuffer and outputting */
+	spinlock_t charInLock;              /* to protect the inputBuffer and char reads */
+	__u8 bInterval;                     /* Interrupt end point interval */
 
-    volatile unsigned int dwDMAFlag;    /* state of DMA */
-    TRANSAREA rTransDef[MAX_TRANSAREAS];/* transfer area info */
-    volatile DMADESC rDMAInfo;          // info on current DMA transfer
-    volatile bool bXFerWaiting;         // Flag set if DMA transfer stalled
-    volatile bool bInDrawDown;          // Flag that we want to halt transfers
+	volatile unsigned int dwDMAFlag;    /* state of DMA */
+	TRANSAREA rTransDef[MAX_TRANSAREAS];/* transfer area info */
+	volatile DMADESC rDMAInfo;          /*  info on current DMA transfer */
+	volatile bool bXFerWaiting;         /*  Flag set if DMA transfer stalled */
+	volatile bool bInDrawDown;          /*  Flag that we want to halt transfers */
 
-    // Parameters relating to a block read\write that is in progress. Some of these values
-    //  are equivalent to values in rDMAInfo. The values here are those in use, while those
-    //  in rDMAInfo are those received from the 1401 via an escape sequence. If another
-    //  escape sequence arrives before the previous xfer ends, rDMAInfo values are updated while these
-    //  are used to finish off the current transfer.
-    volatile short StagedId;            // The transfer area id for this transfer
-    volatile bool StagedRead;           // Flag TRUE for read from 1401, FALSE for write
-    volatile unsigned int StagedLength; // Total length of this transfer
-    volatile unsigned int StagedOffset; // Offset within memory area for transfer start
-    volatile unsigned int StagedDone;   // Bytes transferred so far
-    volatile bool bStagedUrbPending;    // Flag to indicate active
-    char* pCoherStagedIO;               // buffer used for block transfers
-    struct urb* pStagedUrb;             // The URB to use
-    spinlock_t stagedLock;              // protects ReadWriteMem() and circular buffer stuff
+	/*  Parameters relating to a block read/write that is in progress. Some of these values */
+	/*   are equivalent to values in rDMAInfo. The values here are those in use, while those */
+	/*   in rDMAInfo are those received from the 1401 via an escape sequence. If another */
+	/*   escape sequence arrives before the previous xfer ends, rDMAInfo values are updated while these */
+	/*   are used to finish off the current transfer. */
+	volatile short StagedId;            /*  The transfer area id for this transfer */
+	volatile bool StagedRead;           /*  Flag TRUE for read from 1401, FALSE for write */
+	volatile unsigned int StagedLength; /*  Total length of this transfer */
+	volatile unsigned int StagedOffset; /*  Offset within memory area for transfer start */
+	volatile unsigned int StagedDone;   /*  Bytes transferred so far */
+	volatile bool bStagedUrbPending;    /*  Flag to indicate active */
+	char *pCoherStagedIO;               /*  buffer used for block transfers */
+	struct urb *pStagedUrb;             /*  The URB to use */
+	spinlock_t stagedLock;              /*  protects ReadWriteMem() and circular buffer stuff */
 
-    short s1401Type;                    // type of 1401 attached
-    short sCurrentState;                // current error state
-    bool bIsUSB2;                       // type of the interface we connect to
-    bool bForceReset;                   // Flag to make sure we get a real reset
-    __u32 statBuf[2];                   // buffer for 1401 state info
+	short s1401Type;                    /*  type of 1401 attached */
+	short sCurrentState;                /*  current error state */
+	bool bIsUSB2;                       /*  type of the interface we connect to */
+	bool bForceReset;                   /*  Flag to make sure we get a real reset */
+	__u32 statBuf[2];                   /*  buffer for 1401 state info */
 
-    unsigned long ulSelfTestTime;       // used to timeout self test
+	unsigned long ulSelfTestTime;       /*  used to timeout self test */
 
-    int nPipes;                         // Should be 3 or 4 depending on 1401 usb chip
-    int bPipeError[4];                  // set non-zero if an error on one of the pipe
-    __u8 epAddr[4];                     // addresses of the 3/4 end points
+	int nPipes;                         /*  Should be 3 or 4 depending on 1401 usb chip */
+	int bPipeError[4];                  /*  set non-zero if an error on one of the pipe */
+	__u8 epAddr[4];                     /*  addresses of the 3/4 end points */
 
-    struct usb_device *udev;            // the usb device for this device
-    struct usb_interface *interface;    // the interface for this device, NULL if removed
-    struct usb_anchor submitted;        // in case we need to retract our submissions
-    struct mutex io_mutex;              // synchronize I/O with disconnect, one user-mode caller at a time
+	struct usb_device *udev;            /*  the usb device for this device */
+	struct usb_interface *interface;    /*  the interface for this device, NULL if removed */
+	struct usb_anchor submitted;        /*  in case we need to retract our submissions */
+	struct mutex io_mutex;              /*  synchronize I/O with disconnect, one user-mode caller at a time */
 
-    int    errors;                      // the last request tanked
-    int    open_count;                  // count the number of openers
-    spinlock_t err_lock;                // lock for errors
-    struct kref kref;
-}DEVICE_EXTENSION, *PDEVICE_EXTENSION;
+	int    errors;                      /*  the last request tanked */
+	int    open_count;                  /*  count the number of openers */
+	spinlock_t err_lock;                /*  lock for errors */
+	struct kref kref;
+} DEVICE_EXTENSION, *PDEVICE_EXTENSION;
 #define to_DEVICE_EXTENSION(d) container_of(d, DEVICE_EXTENSION, kref)
 
-/// Definitions of routimes used between compilation object files
-// in usb1401.c
-extern int Allowi(DEVICE_EXTENSION* pdx);
-extern int SendChars(DEVICE_EXTENSION* pdx);
+/*  Definitions of routines used between compilation object files */
+/*  in usb1401.c */
+extern int Allowi(DEVICE_EXTENSION *pdx);
+extern int SendChars(DEVICE_EXTENSION *pdx);
 extern void ced_draw_down(DEVICE_EXTENSION *pdx);
 extern int ReadWriteMem(DEVICE_EXTENSION *pdx, bool Read, unsigned short wIdent,
-                      unsigned int dwOffs, unsigned int dwLen);
+				unsigned int dwOffs, unsigned int dwLen);
 
-// in ced_ioc.c
+/*  in ced_ioc.c */
 extern int ClearArea(DEVICE_EXTENSION *pdx, int nArea);
-extern int SendString(DEVICE_EXTENSION* pdx, const char __user* pData, unsigned int n);
+extern int SendString(DEVICE_EXTENSION *pdx, const char __user *pData, unsigned int n);
 extern int SendChar(DEVICE_EXTENSION *pdx, char c);
-extern int Get1401State(DEVICE_EXTENSION* pdx, __u32* state, __u32* error);
+extern int Get1401State(DEVICE_EXTENSION *pdx, __u32 *state, __u32 *error);
 extern int ReadWrite_Cancel(DEVICE_EXTENSION *pdx);
-extern bool Is1401(DEVICE_EXTENSION* pdx);
-extern bool QuickCheck(DEVICE_EXTENSION* pdx, bool bTestBuff, bool bCanReset);
+extern bool Is1401(DEVICE_EXTENSION *pdx);
+extern bool QuickCheck(DEVICE_EXTENSION *pdx, bool bTestBuff, bool bCanReset);
 extern int Reset1401(DEVICE_EXTENSION *pdx);
 extern int GetChar(DEVICE_EXTENSION *pdx);
-extern int GetString(DEVICE_EXTENSION *pdx, char __user* pUser, int n);
+extern int GetString(DEVICE_EXTENSION *pdx, char __user *pUser, int n);
 extern int SetTransfer(DEVICE_EXTENSION *pdx, TRANSFERDESC __user *pTD);
 extern int UnsetTransfer(DEVICE_EXTENSION *pdx, int nArea);
-extern int SetEvent(DEVICE_EXTENSION *pdx, TRANSFEREVENT __user*pTE);
+extern int SetEvent(DEVICE_EXTENSION *pdx, TRANSFEREVENT __user *pTE);
 extern int Stat1401(DEVICE_EXTENSION *pdx);
 extern int LineCount(DEVICE_EXTENSION *pdx);
 extern int GetOutBufSpace(DEVICE_EXTENSION *pdx);
@@ -235,15 +232,15 @@
 extern int CheckSelfTest(DEVICE_EXTENSION *pdx, TGET_SELFTEST __user *pGST);
 extern int TypeOf1401(DEVICE_EXTENSION *pdx);
 extern int TransferFlags(DEVICE_EXTENSION *pdx);
-extern int DbgPeek(DEVICE_EXTENSION *pdx, TDBGBLOCK __user* pDB);
+extern int DbgPeek(DEVICE_EXTENSION *pdx, TDBGBLOCK __user *pDB);
 extern int DbgPoke(DEVICE_EXTENSION *pdx, TDBGBLOCK __user *pDB);
 extern int DbgRampData(DEVICE_EXTENSION *pdx, TDBGBLOCK __user *pDB);
 extern int DbgRampAddr(DEVICE_EXTENSION *pdx, TDBGBLOCK __user *pDB);
 extern int DbgGetData(DEVICE_EXTENSION *pdx, TDBGBLOCK __user *pDB);
 extern int DbgStopLoop(DEVICE_EXTENSION *pdx);
 extern int SetCircular(DEVICE_EXTENSION *pdx, TRANSFERDESC __user *pTD);
-extern int GetCircBlock(DEVICE_EXTENSION *pdx, TCIRCBLOCK __user* pCB);
-extern int FreeCircBlock(DEVICE_EXTENSION *pdx, TCIRCBLOCK __user* pCB);
+extern int GetCircBlock(DEVICE_EXTENSION *pdx, TCIRCBLOCK __user *pCB);
+extern int FreeCircBlock(DEVICE_EXTENSION *pdx, TCIRCBLOCK __user *pCB);
 extern int WaitEvent(DEVICE_EXTENSION *pdx, int nArea, int msTimeOut);
 extern int TestEvent(DEVICE_EXTENSION *pdx, int nArea);
 #endif

diff --git a/drivers/staging/ced1401/use1401.h b/drivers/staging/ced1401/use1401.h
index 86294e2..b7997c9 100644
--- a/drivers/staging/ced1401/use1401.h
+++ b/drivers/staging/ced1401/use1401.h

@@ -11,16 +11,16 @@
 #define __USE1401_H__
 #include "machine.h"
 
-// Some definitions to make things compatible. If you want to use Use1401 directly
-//  from a Windows program you should define U14_NOT_DLL, in which case you also
-//  MUST make sure that your application startup code calls U14InitLib().
-// DLL_USE1401 is defined when you are building the Use1401 dll, not otherwise.
+/*  Some definitions to make things compatible. If you want to use Use1401 directly */
+/*   from a Windows program you should define U14_NOT_DLL, in which case you also */
+/*   MUST make sure that your application startup code calls U14InitLib(). */
+/*  DLL_USE1401 is defined when you are building the Use1401 dll, not otherwise. */
 #ifdef _IS_WINDOWS_
 #ifndef U14_NOT_DLL
 #ifdef DLL_USE1401
-#define U14API(retType) retType DllExport __stdcall
+#define U14API(retType) retType DllExport __stdcall /* no parentheses: expands in front of a function name */
 #else
-#define U14API(retType) retType DllImport __stdcall
+#define U14API(retType) retType DllImport __stdcall /* no parentheses: expands in front of a function name */
 #endif
 #endif
 
@@ -36,7 +36,7 @@
 #ifdef _QT
 #ifndef U14_NOT_DLL
 #undef U14API
-#define U14API(retType) retType __declspec(dllimport) __stdcall
+#define U14API(retType) retType __declspec(dllimport) __stdcall /* no parentheses: expands in front of a function name */
 #endif
 #undef U14LONG
 #define U14LONG int
@@ -50,20 +50,20 @@
 #define U14LONG long
 #endif
 
-/// Error codes: We need them here as user space can see them.
-#define U14ERR_NOERROR        0             // no problems
+/* Error codes: We need them here as user space can see them. */
+#define U14ERR_NOERROR        0             /*  no problems */
 
-/// Device error codes, but these don't need to be extended - a succession is assumed
-#define U14ERR_STD            4              // standard 1401 connected
-#define U14ERR_U1401          5              // u1401 connected
-#define U14ERR_PLUS           6              // 1401 plus connected
-#define U14ERR_POWER          7              // Power1401 connected
-#define U14ERR_U14012         8              // u1401 mkII connected
+/* Device error codes, but these don't need to be extended - a succession is assumed */
+#define U14ERR_STD            4              /*  standard 1401 connected */
+#define U14ERR_U1401          5              /*  u1401 connected */
+#define U14ERR_PLUS           6              /*  1401 plus connected */
+#define U14ERR_POWER          7              /*  Power1401 connected */
+#define U14ERR_U14012         8              /*  u1401 mkII connected */
 #define U14ERR_POWER2         9
 #define U14ERR_U14013        10
 #define U14ERR_POWER3        11
 
-/// NBNB Error numbers need shifting as some linux error codes start at 512
+/* NBNB Error numbers need shifting as some linux error codes start at 512 */
 #define U14ERR(n)             (n+U14ERRBASE)
 #define U14ERR_OFF            U14ERR(0)      /* 1401 there but switched off    */
 #define U14ERR_NC             U14ERR(-1)     /* 1401 not connected             */
@@ -113,7 +113,7 @@
 #define U14ERR_DRIVCOMMS      U14ERR(-110)   /* failed talking to driver       */
 #define U14ERR_OUTOFMEMORY    U14ERR(-111)   /* needed memory and couldnt get it*/
 
-/// 1401 type codes.
+/* 1401 type codes. */
 #define U14TYPE1401           0           /* standard 1401                  */
 #define U14TYPEPLUS           1           /* 1401 plus                      */
 #define U14TYPEU1401          2           /* u1401                          */
@@ -124,9 +124,9 @@
 #define U14TYPEPOWER3         7           /* power1401-3                    */
 #define U14TYPEUNKNOWN        -1          /* dont know                      */
 
-/// Transfer flags to allow driver capabilities to be interrogated
+/* Transfer flags to allow driver capabilities to be interrogated */
 
-/// Constants for transfer flags
+/* Constants for transfer flags */
 #define U14TF_USEDMA          1           /* Transfer flag for use DMA      */
 #define U14TF_MULTIA          2           /* Transfer flag for multi areas  */
 #define U14TF_FIFO            4           /* for FIFO interface card        */
@@ -138,18 +138,18 @@
 #define U14TF_DIAG            256         /* Diagnostics/debug functions    */
 #define U14TF_CIRC14          512         /* Circular-mode to 1401          */
 
-/// Definitions of element sizes for DMA transfers - to allow byte-swapping
+/* Definitions of element sizes for DMA transfers - to allow byte-swapping */
 #define ESZBYTES              0           /* BYTE element size value        */
-#define ESZWORDS              1           /* WORD element size value        */
+#define ESZWORDS              1           /* unsigned short element size value        */
 #define ESZLONGS              2           /* long element size value        */
 #define ESZUNKNOWN            0           /* unknown element size value     */
 
-/// These define required access types for the debug/diagnostics function
+/* These define required access types for the debug/diagnostics function */
 #define BYTE_SIZE             1           /* 8-bit access                   */
 #define WORD_SIZE             2           /* 16-bit access                  */
 #define LONG_SIZE             3           /* 32-bit access                  */
 
-/// Stuff used by U14_GetTransfer
+/* Stuff used by U14_GetTransfer */
 #define GET_TX_MAXENTRIES  257          /* (max length / page size + 1) */
 
 #ifdef _IS_WINDOWS_
@@ -157,19 +157,19 @@
 
 typedef struct                          /* used for U14_GetTransfer results */
 {                                          /* Info on a single mapped block */
-   U14LONG physical;
-   U14LONG size;
+	U14LONG physical;
+	U14LONG size;
 } TXENTRY;
 
 typedef struct TGetTxBlock              /* used for U14_GetTransfer results */
 {                                               /* matches structure in VXD */
-   U14LONG size;
-   U14LONG linear;
-   short   seg;
-   short   reserved;
-   short   avail;                      /* number of available entries */
-   short   used;                       /* number of used entries */
-   TXENTRY entries[GET_TX_MAXENTRIES];       /* Array of mapped block info */
+	U14LONG size;
+	U14LONG linear;
+	short   seg;
+	short   reserved;
+	short   avail;                      /* number of available entries */
+	short   used;                       /* number of used entries */
+	TXENTRY entries[GET_TX_MAXENTRIES];       /* Array of mapped block info */
 } TGET_TX_BLOCK;
 
 typedef TGET_TX_BLOCK *LPGET_TX_BLOCK;
@@ -180,19 +180,19 @@
 #ifdef LINUX
 typedef struct                          /* used for U14_GetTransfer results */
 {                                       /* Info on a single mapped block */
-   long long physical;
-   long     size;
+	long long physical;
+	long     size;
 } TXENTRY;
 
 typedef struct TGetTxBlock              /* used for U14_GetTransfer results */
 {                                       /* matches structure in VXD */
-   long long linear;                    /* linear address */
-   long     size;                       /* total size of the mapped area, holds id when called */
-   short    seg;                        /* segment of the address for Win16 */
-   short    reserved;
-   short    avail;                      /* number of available entries */
-   short    used;                       /* number of used entries */
-   TXENTRY  entries[GET_TX_MAXENTRIES]; /* Array of mapped block info */
+	long long linear;                    /* linear address */
+	long     size;                       /* total size of the mapped area, holds id when called */
+	short    seg;                        /* segment of the address for Win16 */
+	short    reserved;
+	short    avail;                      /* number of available entries */
+	short    used;                       /* number of used entries */
+	TXENTRY  entries[GET_TX_MAXENTRIES]; /* Array of mapped block info */
 } TGET_TX_BLOCK;
 #endif
 
@@ -200,84 +200,84 @@
 extern "C" {
 #endif
 
-U14API(int)   U14WhenToTimeOut(short hand);         // when to timeout in ms
-U14API(short) U14PassedTime(int iTime);             // non-zero if iTime passed
+U14API(int)   U14WhenToTimeOut(short hand);         /*  when to timeout in ms */
+U14API(short)	U14PassedTime(int iTime);             /*  non-zero if iTime passed */
 
-U14API(short) U14LastErrCode(short hand);
+U14API(short)	U14LastErrCode(short hand);
 
-U14API(short) U14Open1401(short n1401);
-U14API(short) U14Close1401(short hand);
-U14API(short) U14Reset1401(short hand);
-U14API(short) U14ForceReset(short hand);
-U14API(short) U14TypeOf1401(short hand);
-U14API(short) U14NameOf1401(short hand, char* pBuf, WORD wMax);
+U14API(short)	U14Open1401(short n1401);
+U14API(short)	U14Close1401(short hand);
+U14API(short)	U14Reset1401(short hand);
+U14API(short)	U14ForceReset(short hand);
+U14API(short)	U14TypeOf1401(short hand);
+U14API(short)	U14NameOf1401(short hand, char *pBuf, unsigned short wMax);
 
-U14API(short) U14Stat1401(short hand);
-U14API(short) U14CharCount(short hand);
-U14API(short) U14LineCount(short hand);
+U14API(short)	U14Stat1401(short hand);
+U14API(short)	U14CharCount(short hand);
+U14API(short)	U14LineCount(short hand);
 
-U14API(short) U14SendString(short hand, const char* pString);
-U14API(short) U14GetString(short hand, char* pBuffer, WORD wMaxLen);
-U14API(short) U14SendChar(short hand, char cChar);
-U14API(short) U14GetChar(short hand, char* pcChar);
+U14API(short)	U14SendString(short hand, const char *pString);
+U14API(short)	U14GetString(short hand, char *pBuffer, unsigned short wMaxLen);
+U14API(short)	U14SendChar(short hand, char cChar);
+U14API(short)	U14GetChar(short hand, char *pcChar);
 
-U14API(short) U14LdCmd(short hand, const char* command);
-U14API(DWORD) U14Ld(short hand, const char* vl, const char* str);
+U14API(short)	U14LdCmd(short hand, const char *command);
+U14API(unsigned int) U14Ld(short hand, const char *vl, const char *str);
 
-U14API(short) U14SetTransArea(short hand, WORD wArea, void *pvBuff,
-                                            DWORD dwLength, short eSz);
-U14API(short) U14UnSetTransfer(short hand, WORD wArea);
-U14API(short) U14SetTransferEvent(short hand, WORD wArea, BOOL bEvent,
-                                  BOOL bToHost, DWORD dwStart, DWORD dwLength);
-U14API(int)   U14TestTransferEvent(short hand, WORD wArea);
-U14API(int)   U14WaitTransferEvent(short hand, WORD wArea, int msTimeOut);
-U14API(short) U14GetTransfer(short hand, TGET_TX_BLOCK *pTransBlock);
+U14API(short)	U14SetTransArea(short hand, unsigned short wArea, void *pvBuff,
+					unsigned int dwLength, short eSz);
+U14API(short)	U14UnSetTransfer(short hand, unsigned short wArea);
+U14API(short)	U14SetTransferEvent(short hand, unsigned short wArea, BOOL bEvent,
+					BOOL bToHost, unsigned int dwStart, unsigned int dwLength);
+U14API(int)   U14TestTransferEvent(short hand, unsigned short wArea);
+U14API(int)   U14WaitTransferEvent(short hand, unsigned short wArea, int msTimeOut);
+U14API(short)	U14GetTransfer(short hand, TGET_TX_BLOCK *pTransBlock);
 
-U14API(short) U14ToHost(short hand, char* pAddrHost,DWORD dwSize,DWORD dw1401,
-                                                            short eSz);
-U14API(short) U14To1401(short hand, const char* pAddrHost,DWORD dwSize,DWORD dw1401,
-                                                            short eSz);
+U14API(short)	U14ToHost(short hand, char *pAddrHost, unsigned int dwSize, unsigned int dw1401,
+								short eSz);
+U14API(short)	U14To1401(short hand, const char *pAddrHost, unsigned int dwSize, unsigned int dw1401,
+								short eSz);
 
-U14API(short) U14SetCircular(short hand, WORD wArea, BOOL bToHost, void *pvBuff,
-                                         DWORD dwLength);
+U14API(short)	U14SetCircular(short hand, unsigned short wArea, BOOL bToHost, void *pvBuff,
+							unsigned int dwLength);
 
-U14API(int)   U14GetCircBlk(short hand, WORD wArea, DWORD *pdwOffs);
-U14API(int)   U14FreeCircBlk(short hand, WORD wArea, DWORD dwOffs, DWORD dwSize,
-                                         DWORD *pdwOffs);
+U14API(int)   U14GetCircBlk(short hand, unsigned short wArea, unsigned int *pdwOffs);
+U14API(int)   U14FreeCircBlk(short hand, unsigned short wArea, unsigned int dwOffs, unsigned int dwSize,
+							unsigned int *pdwOffs);
 
-U14API(short) U14StrToLongs(const char* pszBuff, U14LONG *palNums, short sMaxLongs);
-U14API(short) U14LongsFrom1401(short hand, U14LONG *palBuff, short sMaxLongs);
+U14API(short)	U14StrToLongs(const char *pszBuff, U14LONG *palNums, short sMaxLongs);
+U14API(short)	U14LongsFrom1401(short hand, U14LONG *palBuff, short sMaxLongs);
 
 U14API(void)  U14SetTimeout(short hand, int lTimeout);
 U14API(int)   U14GetTimeout(short hand);
-U14API(short) U14OutBufSpace(short hand);
+U14API(short)	U14OutBufSpace(short hand);
 U14API(int)   U14BaseAddr1401(short hand);
 U14API(int)   U14DriverVersion(short hand);
 U14API(int)   U14DriverType(short hand);
-U14API(short) U14DriverName(short hand, char* pBuf, WORD wMax);
-U14API(short) U14GetUserMemorySize(short hand, DWORD *pMemorySize);
-U14API(short) U14KillIO1401(short hand);
+U14API(short)	U14DriverName(short hand, char *pBuf, unsigned short wMax);
+U14API(short)	U14GetUserMemorySize(short hand, unsigned int *pMemorySize);
+U14API(short)	U14KillIO1401(short hand);
 
-U14API(short) U14BlkTransState(short hand);
-U14API(short) U14StateOf1401(short hand);
+U14API(short)	U14BlkTransState(short hand);
+U14API(short)	U14StateOf1401(short hand);
 
-U14API(short) U14Grab1401(short hand);
-U14API(short) U14Free1401(short hand);
-U14API(short) U14Peek1401(short hand, DWORD dwAddr, int nSize, int nRepeats);
-U14API(short) U14Poke1401(short hand, DWORD dwAddr, DWORD dwValue, int nSize, int nRepeats);
-U14API(short) U14Ramp1401(short hand, DWORD dwAddr, DWORD dwDef, DWORD dwEnable, int nSize, int nRepeats);
-U14API(short) U14RampAddr(short hand, DWORD dwDef, DWORD dwEnable, int nSize, int nRepeats);
-U14API(short) U14StopDebugLoop(short hand);
-U14API(short) U14GetDebugData(short hand, U14LONG *plValue);
+U14API(short)	U14Grab1401(short hand);
+U14API(short)	U14Free1401(short hand);
+U14API(short)	U14Peek1401(short hand, unsigned int dwAddr, int nSize, int nRepeats);
+U14API(short)	U14Poke1401(short hand, unsigned int dwAddr, unsigned int dwValue, int nSize, int nRepeats);
+U14API(short)	U14Ramp1401(short hand, unsigned int dwAddr, unsigned int dwDef, unsigned int dwEnable, int nSize, int nRepeats);
+U14API(short)	U14RampAddr(short hand, unsigned int dwDef, unsigned int dwEnable, int nSize, int nRepeats);
+U14API(short)	U14StopDebugLoop(short hand);
+U14API(short)	U14GetDebugData(short hand, U14LONG *plValue);
 
-U14API(short) U14StartSelfTest(short hand);
-U14API(short) U14CheckSelfTest(short hand, U14LONG *pData);
-U14API(short) U14TransferFlags(short hand);
-U14API(void)  U14GetErrorString(short nErr, char* pStr, WORD wMax);
+U14API(short)	U14StartSelfTest(short hand);
+U14API(short)	U14CheckSelfTest(short hand, U14LONG *pData);
+U14API(short)	U14TransferFlags(short hand);
+U14API(void)  U14GetErrorString(short nErr, char *pStr, unsigned short wMax);
 U14API(int)   U14MonitorRev(short hand);
 U14API(void)  U14CloseAll(void);
 
-U14API(short) U14WorkingSet(DWORD dwMinKb, DWORD dwMaxKb);
+U14API(short)	U14WorkingSet(unsigned int dwMinKb, unsigned int dwMaxKb);
 U14API(int)   U14InitLib(void);
 
 #ifdef __cplusplus
@@ -285,3 +285,4 @@
 #endif
 
 #endif /* End of ifndef __USE1401_H__ */
+

diff --git a/drivers/staging/ced1401/use14_ioc.h b/drivers/staging/ced1401/use14_ioc.h
index 15ca638..97d7913 100644
--- a/drivers/staging/ced1401/use14_ioc.h
+++ b/drivers/staging/ced1401/use14_ioc.h

@@ -19,283 +19,282 @@
 ** The IOCTL function codes from 0x80 to 0xFF are for developer use.
 */
 #define  FILE_DEVICE_CED1401    0x8001
-#define  FNNUMBASE              0x800
+#define  FNNUMBASE              0x800	/* base function number; each U14_* code below is FNNUMBASE+n */
 
-#define  U14_OPEN1401            CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE,               \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
+#define  U14_OPEN1401            CTL_CODE(FILE_DEVICE_CED1401,     \
+						FNNUMBASE,               \
+						METHOD_BUFFERED,         \
+						FILE_ANY_ACCESS)
 
-#define  U14_CLOSE1401           CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+1,             \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
+#define  U14_CLOSE1401           CTL_CODE(FILE_DEVICE_CED1401,     \
+						FNNUMBASE+1,             \
+						METHOD_BUFFERED,         \
+						FILE_ANY_ACCESS)
 
-#define  U14_SENDSTRING          CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+2,             \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
+#define  U14_SENDSTRING          CTL_CODE(FILE_DEVICE_CED1401,     \
+						FNNUMBASE+2,             \
+						METHOD_BUFFERED,         \
+						FILE_ANY_ACCESS)
 
-#define  U14_RESET1401           CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+3,             \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
+#define  U14_RESET1401           CTL_CODE(FILE_DEVICE_CED1401,     \
+						FNNUMBASE+3,             \
+						METHOD_BUFFERED,         \
+						FILE_ANY_ACCESS)
 
-#define  U14_GETCHAR             CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+4,             \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
+#define  U14_GETCHAR             CTL_CODE(FILE_DEVICE_CED1401,     \
+						FNNUMBASE+4,             \
+						METHOD_BUFFERED,         \
+						FILE_ANY_ACCESS)
 
-#define  U14_SENDCHAR            CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+5,             \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
+#define  U14_SENDCHAR            CTL_CODE(FILE_DEVICE_CED1401,     \
+						FNNUMBASE+5,             \
+						METHOD_BUFFERED,         \
+						FILE_ANY_ACCESS)
 
-#define  U14_STAT1401            CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+6,             \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
+#define  U14_STAT1401            CTL_CODE(FILE_DEVICE_CED1401,     \
+						FNNUMBASE+6,             \
+						METHOD_BUFFERED,         \
+						FILE_ANY_ACCESS)
 
-#define  U14_LINECOUNT           CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+7,             \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
+#define  U14_LINECOUNT           CTL_CODE(FILE_DEVICE_CED1401,     \
+						FNNUMBASE+7,             \
+						METHOD_BUFFERED,         \
+						FILE_ANY_ACCESS)
 
-#define  U14_GETSTRING           CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+8,             \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
+#define  U14_GETSTRING           CTL_CODE(FILE_DEVICE_CED1401,     \
+						FNNUMBASE+8,             \
+						METHOD_BUFFERED,         \
+						FILE_ANY_ACCESS)
 
-#define  U14_REGCALLBACK         CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+9,             \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
+#define  U14_REGCALLBACK         CTL_CODE(FILE_DEVICE_CED1401,     \
+						FNNUMBASE+9,             \
+						METHOD_BUFFERED,         \
+						FILE_ANY_ACCESS)
 
-#define  U14_GETMONITORBUF       CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+10,            \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
+#define  U14_GETMONITORBUF       CTL_CODE(FILE_DEVICE_CED1401,     \
+						FNNUMBASE+10,            \
+						METHOD_BUFFERED,         \
+						FILE_ANY_ACCESS)
 
-#define  U14_SETTRANSFER         CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+11,            \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
+#define  U14_SETTRANSFER         CTL_CODE(FILE_DEVICE_CED1401,     \
+						FNNUMBASE+11,            \
+						METHOD_BUFFERED,         \
+						FILE_ANY_ACCESS)
 
-#define  U14_UNSETTRANSFER       CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+12,            \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
+#define  U14_UNSETTRANSFER       CTL_CODE(FILE_DEVICE_CED1401,     \
+						FNNUMBASE+12,            \
+						METHOD_BUFFERED,         \
+						FILE_ANY_ACCESS)
 
-#define  U14_SETTRANSEVENT       CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+13,            \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
+#define  U14_SETTRANSEVENT       CTL_CODE(FILE_DEVICE_CED1401,     \
+						FNNUMBASE+13,            \
+						METHOD_BUFFERED,         \
+						FILE_ANY_ACCESS)
 
-#define  U14_GETOUTBUFSPACE      CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+14,            \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
+#define  U14_GETOUTBUFSPACE      CTL_CODE(FILE_DEVICE_CED1401,     \
+						FNNUMBASE+14,            \
+						METHOD_BUFFERED,         \
+						FILE_ANY_ACCESS)
 
-#define  U14_GETBASEADDRESS      CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+15,            \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
+#define  U14_GETBASEADDRESS      CTL_CODE(FILE_DEVICE_CED1401,     \
+						FNNUMBASE+15,            \
+						METHOD_BUFFERED,         \
+						FILE_ANY_ACCESS)
 
-#define  U14_GETDRIVERREVISION   CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+16,            \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
+#define  U14_GETDRIVERREVISION   CTL_CODE(FILE_DEVICE_CED1401,     \
+						FNNUMBASE+16,            \
+						METHOD_BUFFERED,         \
+						FILE_ANY_ACCESS)
 
-#define  U14_GETTRANSFER         CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+17,            \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
+#define  U14_GETTRANSFER         CTL_CODE(FILE_DEVICE_CED1401,     \
+						FNNUMBASE+17,            \
+						METHOD_BUFFERED,         \
+						FILE_ANY_ACCESS)
 
-#define  U14_KILLIO1401          CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+18,            \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
+#define  U14_KILLIO1401          CTL_CODE(FILE_DEVICE_CED1401,     \
+						FNNUMBASE+18,            \
+						METHOD_BUFFERED,         \
+						FILE_ANY_ACCESS)
 
-#define  U14_BLKTRANSSTATE       CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+19,            \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
+#define  U14_BLKTRANSSTATE       CTL_CODE(FILE_DEVICE_CED1401,     \
+						FNNUMBASE+19,            \
+						METHOD_BUFFERED,         \
+						FILE_ANY_ACCESS)
 
-#define  U14_BYTECOUNT           CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+20,            \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
+#define  U14_BYTECOUNT           CTL_CODE(FILE_DEVICE_CED1401,     \
+						FNNUMBASE+20,            \
+						METHOD_BUFFERED,         \
+						FILE_ANY_ACCESS)
 
-#define  U14_ZEROBLOCKCOUNT      CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+21,            \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
+#define  U14_ZEROBLOCKCOUNT      CTL_CODE(FILE_DEVICE_CED1401,     \
+						FNNUMBASE+21,            \
+						METHOD_BUFFERED,         \
+						FILE_ANY_ACCESS)
 
-#define  U14_STOPCIRCULAR        CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+22,            \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
+#define  U14_STOPCIRCULAR        CTL_CODE(FILE_DEVICE_CED1401,     \
+						FNNUMBASE+22,            \
+						METHOD_BUFFERED,         \
+						FILE_ANY_ACCESS)
 
-#define  U14_STATEOF1401         CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+23,            \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
+#define  U14_STATEOF1401         CTL_CODE(FILE_DEVICE_CED1401,     \
+						FNNUMBASE+23,            \
+						METHOD_BUFFERED,         \
+						FILE_ANY_ACCESS)
 
-#define  U14_REGISTERS1401       CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+24,            \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
+#define  U14_REGISTERS1401       CTL_CODE(FILE_DEVICE_CED1401,     \
+						FNNUMBASE+24,            \
+						METHOD_BUFFERED,         \
+						FILE_ANY_ACCESS)
 
-#define  U14_GRAB1401            CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+25,            \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
+#define  U14_GRAB1401            CTL_CODE(FILE_DEVICE_CED1401,     \
+						FNNUMBASE+25,            \
+						METHOD_BUFFERED,         \
+						FILE_ANY_ACCESS)
 
-#define  U14_FREE1401            CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+26,            \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
+#define  U14_FREE1401            CTL_CODE(FILE_DEVICE_CED1401,     \
+						FNNUMBASE+26,            \
+						METHOD_BUFFERED,         \
+						FILE_ANY_ACCESS)
 
-#define  U14_STEP1401            CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+27,            \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
+#define  U14_STEP1401            CTL_CODE(FILE_DEVICE_CED1401,     \
+						FNNUMBASE+27,            \
+						METHOD_BUFFERED,         \
+						FILE_ANY_ACCESS)
 
-#define  U14_SET1401REGISTERS    CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+28,            \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
+#define  U14_SET1401REGISTERS    CTL_CODE(FILE_DEVICE_CED1401,     \
+						FNNUMBASE+28,            \
+						METHOD_BUFFERED,         \
+						FILE_ANY_ACCESS)
 
-#define  U14_STEPTILL1401        CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+29,            \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
+#define  U14_STEPTILL1401        CTL_CODE(FILE_DEVICE_CED1401,     \
+						FNNUMBASE+29,            \
+						METHOD_BUFFERED,         \
+						FILE_ANY_ACCESS)
 
-#define  U14_SETORIN             CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+30,            \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
+#define  U14_SETORIN             CTL_CODE(FILE_DEVICE_CED1401,     \
+						FNNUMBASE+30,            \
+						METHOD_BUFFERED,         \
+						FILE_ANY_ACCESS)
 
-#define  U14_STARTSELFTEST       CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+31,            \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
+#define  U14_STARTSELFTEST       CTL_CODE(FILE_DEVICE_CED1401,     \
+						FNNUMBASE+31,            \
+						METHOD_BUFFERED,         \
+						FILE_ANY_ACCESS)
 
-#define  U14_CHECKSELFTEST       CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+32,            \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
+#define  U14_CHECKSELFTEST       CTL_CODE(FILE_DEVICE_CED1401,     \
+						FNNUMBASE+32,            \
+						METHOD_BUFFERED,         \
+						FILE_ANY_ACCESS)
 
-#define  U14_TYPEOF1401          CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+33,            \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
+#define  U14_TYPEOF1401          CTL_CODE(FILE_DEVICE_CED1401,     \
+						FNNUMBASE+33,            \
+						METHOD_BUFFERED,         \
+						FILE_ANY_ACCESS)
 
-#define  U14_TRANSFERFLAGS       CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+34,            \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
+#define  U14_TRANSFERFLAGS       CTL_CODE(FILE_DEVICE_CED1401,     \
+						FNNUMBASE+34,            \
+						METHOD_BUFFERED,         \
+						FILE_ANY_ACCESS)
 
-#define  U14_DBGPEEK             CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+35,            \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
+#define  U14_DBGPEEK             CTL_CODE(FILE_DEVICE_CED1401,     \
+						FNNUMBASE+35,            \
+						METHOD_BUFFERED,         \
+						FILE_ANY_ACCESS)
 
-#define  U14_DBGPOKE             CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+36,            \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
+#define  U14_DBGPOKE             CTL_CODE(FILE_DEVICE_CED1401,     \
+						FNNUMBASE+36,            \
+						METHOD_BUFFERED,         \
+						FILE_ANY_ACCESS)
 
-#define  U14_DBGRAMPDATA         CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+37,            \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
+#define  U14_DBGRAMPDATA         CTL_CODE(FILE_DEVICE_CED1401,     \
+						FNNUMBASE+37,            \
+						METHOD_BUFFERED,         \
+						FILE_ANY_ACCESS)
 
-#define  U14_DBGRAMPADDR         CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+38,            \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
+#define  U14_DBGRAMPADDR         CTL_CODE(FILE_DEVICE_CED1401,     \
+						FNNUMBASE+38,            \
+						METHOD_BUFFERED,         \
+						FILE_ANY_ACCESS)
 
-#define  U14_DBGGETDATA          CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+39,            \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
+#define  U14_DBGGETDATA          CTL_CODE(FILE_DEVICE_CED1401,     \
+						FNNUMBASE+39,            \
+						METHOD_BUFFERED,         \
+						FILE_ANY_ACCESS)
 
-#define  U14_DBGSTOPLOOP         CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+40,            \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
+#define  U14_DBGSTOPLOOP         CTL_CODE(FILE_DEVICE_CED1401,     \
+						FNNUMBASE+40,            \
+						METHOD_BUFFERED,         \
+						FILE_ANY_ACCESS)
 
-#define  U14_FULLRESET           CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+41,            \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
+#define  U14_FULLRESET           CTL_CODE(FILE_DEVICE_CED1401,     \
+						FNNUMBASE+41,            \
+						METHOD_BUFFERED,         \
+						FILE_ANY_ACCESS)
 
-#define  U14_SETCIRCULAR         CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+42,            \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
+#define  U14_SETCIRCULAR         CTL_CODE(FILE_DEVICE_CED1401,     \
+						FNNUMBASE+42,            \
+						METHOD_BUFFERED,         \
+						FILE_ANY_ACCESS)
 
-#define  U14_GETCIRCBLK          CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+43,            \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
+#define  U14_GETCIRCBLK          CTL_CODE(FILE_DEVICE_CED1401,     \
+						FNNUMBASE+43,            \
+						METHOD_BUFFERED,         \
+						FILE_ANY_ACCESS)
 
-#define  U14_FREECIRCBLK         CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+44,            \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
+#define  U14_FREECIRCBLK         CTL_CODE(FILE_DEVICE_CED1401,     \
+						FNNUMBASE+44,            \
+						METHOD_BUFFERED,         \
+						FILE_ANY_ACCESS)
 
-//--------------- Structures that are shared with the driver -------------
+/*--------------- Structures that are shared with the driver ------------- */
 #pragma pack(1)
 
 typedef struct                  /* used for get/set standard 1401 registers */
 {
-   short   sPC;
-   char    A;
-   char    X;
-   char    Y;
-   char    stat;
-   char    rubbish;
+	short   sPC;
+	char    A;
+	char    X;
+	char    Y;
+	char    stat;
+	char    rubbish;
 } T1401REGISTERS;
 
 typedef union     /* to communicate with 1401 driver status & control funcs */
 {
-   char           chrs[22];
-   short          ints[11];
-   long           longs[5];
-   T1401REGISTERS registers;
+	char           chrs[22];
+	short          ints[11];
+	long           longs[5];
+	T1401REGISTERS registers;
 } TCSBLOCK;
 
 typedef TCSBLOCK*  LPTCSBLOCK;
 
-typedef struct paramBlk
-{
-    short       sState;
-    TCSBLOCK    csBlock;
+typedef struct paramBlk {
+	 short       sState;
+	 TCSBLOCK    csBlock;
 } PARAMBLK;
 
 typedef PARAMBLK*   PPARAMBLK;
 
 typedef struct TransferDesc          /* Structure and type for SetTransArea */
 {
-   WORD        wArea;            /* number of transfer area to set up       */
-   void FAR *  lpvBuff;          /* address of transfer area                */
-   DWORD       dwLength;         /* length of area to set up                */
-   short       eSize;            /* size to move (for swapping on MAC)      */
+	unsigned short        wArea;            /* number of transfer area to set up       */
+	void FAR *lpvBuff;          /* address of transfer area                */
+	unsigned int       dwLength;         /* length of area to set up                */
+	short       eSize;            /* size to move (for swapping on MAC)      */
 } TRANSFERDESC;
 
-typedef TRANSFERDESC FAR *    LPTRANSFERDESC;
+typedef TRANSFERDESC FAR *LPTRANSFERDESC;
 
 /* This is the structure used to set up a transfer area */
 typedef struct VXTransferDesc    /* use1401.c and use1432x.x use only       */
 {
-   WORD        wArea;            /* number of transfer area to set up       */
-   WORD        wAddrSel;         /* 16 bit selector for area                */
-   DWORD       dwAddrOfs;        /* 32 bit offset for area start            */
-   DWORD       dwLength;         /* length of area to set up                */
+	unsigned short        wArea;            /* number of transfer area to set up       */
+	unsigned short        wAddrSel;         /* 16 bit selector for area                */
+	unsigned int       dwAddrOfs;        /* 32 bit offset for area start            */
+	unsigned int       dwLength;         /* length of area to set up                */
 } VXTRANSFERDESC;
 
 #pragma pack()
 
-#endif
\ No newline at end of file
+#endif

diff --git a/drivers/staging/ced1401/userspace/use1401.c b/drivers/staging/ced1401/userspace/use1401.c
index 38e7c1c..c9bc2eb 100644
--- a/drivers/staging/ced1401/userspace/use1401.c
+++ b/drivers/staging/ced1401/userspace/use1401.c

@@ -36,7 +36,7 @@
 ** Under Windows 9x and NT, Use1401 uses DeviceIoControl to get access to
 ** the 1401 driver. This has parameters for the device handle, the function
 ** code, an input pointer and byte count, an output pointer and byte count
-** and a pointer to a DWORD to hold the output byte count. Note that input
+** and a pointer to an unsigned int to hold the output byte count. Note that input
 ** and output are from the point-of-view of the driver, so the output stuff
 ** is used to read values from the 1401, not send to the 1401. The use of
 ** these parameters varies with the function in use and the operating
@@ -250,7 +250,7 @@
 static HANDLE aHand1401[MAX1401] = {0};         // handles for 1401s
 static HANDLE aXferEvent[MAX1401] = {0};        // transfer events for the 1401s
 static LPVOID apAreas[MAX1401][MAX_TRANSAREAS]; // Locked areas
-static DWORD  auAreas[MAX1401][MAX_TRANSAREAS]; // Size of locked areas
+static unsigned int  auAreas[MAX1401][MAX_TRANSAREAS]; // Size of locked areas
 static BOOL   bWindows9x = FALSE;               // if we are Windows 95 or better
 #ifdef _WIN64
 #define USE_NT_DIOC(ind) TRUE
@@ -276,8 +276,8 @@
 typedef struct CmdHead          // defines header block on command
 {                               // for PC commands
    char   acBasic[5];           // BASIC information - needed to align things
-   WORD   wBasicSz;             // size as seen by BASIC
-   WORD   wCmdSize;             // size of the following info
+   unsigned short   wBasicSz;             // size as seen by BASIC
+   unsigned short   wCmdSize;             // size of the following info
 } __packed CMDHEAD;
 #pragma pack()                  // back to normal
 
@@ -311,7 +311,7 @@
 ****************************************************************************/
 static short U14Status1401(short sHand, LONG lCode, TCSBLOCK* pBlk)
 {
-    DWORD dwBytes = 0;
+    unsigned int dwBytes = 0;
 
     if ((sHand < 0) || (sHand >= MAX1401))  /* Check parameters */
         return U14ERR_BADHAND;
@@ -345,7 +345,7 @@
 ****************************************************************************/
 static short U14Control1401(short sHand, LONG lCode, TCSBLOCK* pBlk)
 {
-    DWORD dwBytes = 0;
+    unsigned int dwBytes = 0;
 
     if ((sHand < 0) || (sHand >= MAX1401))              /* Check parameters */
         return U14ERR_BADHAND;
@@ -455,7 +455,7 @@
 ****************************************************************************/
 U14API(short) U14StrToLongs(const char* pszBuff, U14LONG *palNums, short sMaxLongs)
 {
-    WORD wChInd = 0;                // index into source
+    unsigned short wChInd = 0;                // index into source
     short sLgInd = 0;               // index into result longs
 
     while (pszBuff[wChInd] &&       // until we get to end of string...
@@ -681,7 +681,7 @@
 ** U14DriverName
 ** Returns the driver type as 3 character (ISA, PCI, USB or HSS))
 ****************************************************************************/
-U14API(short) U14DriverName(short hand, char* pBuf, WORD wMax)
+U14API(short) U14DriverName(short hand, char* pBuf, unsigned short wMax)
 {
     char* pName;
     *pBuf = 0;                             // Start off with a blank string
@@ -779,7 +779,7 @@
 ** is called. After the peek is done, use U14GetDebugData to retrieve
 ** the results of the peek.
 ****************************************************************************/
-U14API(short) U14Peek1401(short hand, DWORD dwAddr, int nSize, int nRepeats)
+U14API(short) U14Peek1401(short hand, unsigned int dwAddr, int nSize, int nRepeats)
 {
     short sErr = CheckHandle(hand);
     if (sErr == U14ERR_NOERROR)
@@ -813,7 +813,7 @@
 ** If lRepeats is zero, the loop will continue until U14StopDebugLoop
 ** is called.
 ****************************************************************************/
-U14API(short) U14Poke1401(short hand, DWORD dwAddr, DWORD dwValue,
+U14API(short) U14Poke1401(short hand, unsigned int dwAddr, unsigned int dwValue,
                                       int nSize, int nRepeats)
 {
     short sErr = CheckHandle(hand);
@@ -849,7 +849,7 @@
 ** DESCRIPTION  Cause the 1401 to loop, writing a ramp to a location.
 ** If lRepeats is zero, the loop will continue until U14StopDebugLoop.
 ****************************************************************************/
-U14API(short) U14Ramp1401(short hand, DWORD dwAddr, DWORD dwDef, DWORD dwEnable,
+U14API(short) U14Ramp1401(short hand, unsigned int dwAddr, unsigned int dwDef, unsigned int dwEnable,
                                       int nSize, int nRepeats)
 {
     short sErr = CheckHandle(hand);
@@ -887,7 +887,7 @@
 ** DESCRIPTION  Cause the 1401 to loop, reading from a ramping location.
 ** If lRepeats is zero, the loop will continue until U14StopDebugLoop
 ****************************************************************************/
-U14API(short) U14RampAddr(short hand, DWORD dwDef, DWORD dwEnable,
+U14API(short) U14RampAddr(short hand, unsigned int dwDef, unsigned int dwEnable,
                                       int nSize, int nRepeats)
 {
     short sErr = CheckHandle(hand);
@@ -1024,7 +1024,7 @@
 /****************************************************************************
 ** U14GetUserMemorySize
 ****************************************************************************/
-U14API(short) U14GetUserMemorySize(short hand, DWORD *pMemorySize)
+U14API(short) U14GetUserMemorySize(short hand, unsigned int *pMemorySize)
 {
     // The original 1401 used a different command for getting the size
     short sErr = U14SendString(hand, (asType1401[hand] == U14TYPE1401) ? "MEMTOP;" : "MEMTOP,?;");
@@ -1061,7 +1061,7 @@
 ** U14NameOf1401
 ** Returns the type of the 1401 as a string, blank if unknown
 ****************************************************************************/
-U14API(short) U14NameOf1401(short hand, char* pBuf, WORD wMax)
+U14API(short) U14NameOf1401(short hand, char* pBuf, unsigned short wMax)
 {
     short sErr = CheckHandle(hand);
     if (sErr == U14ERR_NOERROR)
@@ -1207,7 +1207,7 @@
 {
     short sErr = U14ERR_NOERROR;
     HANDLE hDevice = INVALID_HANDLE_VALUE;
-    DWORD dwErr = 0;
+    unsigned int dwErr = 0;
     int nFirst, nLast, nDev = 0;        /* Used for the search for a 1401 */
     BOOL bOldName = FALSE;               /* start by looking for a modern driver */
 
@@ -1262,7 +1262,7 @@
             }
             else
             {
-                DWORD dwe = GetLastError();     /* Get error code otherwise */
+                unsigned int dwe = GetLastError();     /* Get error code otherwise */
                 if ((dwe != ERROR_FILE_NOT_FOUND) || (dwErr == 0))
                     dwErr = dwe;                /* Ignore repeats of 'not found' */
             }
@@ -1454,7 +1454,7 @@
         U14Reset1401(hand);                     // in case an active transfer running
         for (j = 0; j < MAX_TRANSAREAS; ++j)    // Locate locked areas
             if (iAreaMask & (1 << j))           // And kill off any transfers
-                U14UnSetTransfer(hand, (WORD)j);
+                U14UnSetTransfer(hand, (unsigned short)j);
     }
 
 #ifdef _IS_WINDOWS_
@@ -1581,7 +1581,7 @@
         if (bSpaceToSend)
         {
             PARAMBLK    rData;
-            DWORD       dwBytes;
+            unsigned int       dwBytes;
             char        tstr[MAXSTRLEN+5];          /* Buffer for chars */
 
             if ((hand < 0) || (hand >= MAX1401))
@@ -1592,18 +1592,18 @@
 #ifndef _WIN64
                 if (!USE_NT_DIOC(hand))             /* Using WIN 95 driver access? */
                 {
-                    int iOK = DeviceIoControl(aHand1401[hand], (DWORD)U14_SENDSTRING,
+                    int iOK = DeviceIoControl(aHand1401[hand], (unsigned int)U14_SENDSTRING,
                                     NULL, 0, tstr, nChars,
                                     &dwBytes, NULL);
                     if (iOK)
-                        sErr = (dwBytes >= (DWORD)nChars) ? U14ERR_NOERROR : U14ERR_DRIVCOMMS;
+                        sErr = (dwBytes >= (unsigned int)nChars) ? U14ERR_NOERROR : U14ERR_DRIVCOMMS;
                     else
                         sErr = (short)GetLastError();
                 }
                 else
 #endif
                 {
-                    int iOK = DeviceIoControl(aHand1401[hand],(DWORD)U14_SENDSTRING,
+                    int iOK = DeviceIoControl(aHand1401[hand],(unsigned int)U14_SENDSTRING,
                                     tstr, nChars,
                                     &rData,sizeof(PARAMBLK),&dwBytes,NULL);
                     if (iOK && (dwBytes >= sizeof(PARAMBLK)))
@@ -1697,7 +1697,7 @@
 **          error code. Any error from the device causes us to set up for
 **          a full reset.
 ****************************************************************************/
-U14API(short) U14GetString(short hand, char* pBuffer, WORD wMaxLen)
+U14API(short) U14GetString(short hand, char* pBuffer, unsigned short wMaxLen)
 {
     short sErr = CheckHandle(hand);
     if (sErr != U14ERR_NOERROR)             // If an error...
@@ -1726,8 +1726,8 @@
         {
             if (asLastRetCode[hand] == U14ERR_NOERROR)     /* all ok so far */
             {
-                DWORD       dwBytes = 0;
-                *((WORD *)pBuffer) = wMaxLen;       /* set up length */
+                unsigned int       dwBytes = 0;
+                *((unsigned short *)pBuffer) = wMaxLen;       /* set up length */
 #ifndef _WIN64
                 if (!USE_NT_DIOC(hand))             /* Win 95 DIOC here ? */
                 {
@@ -1737,9 +1737,9 @@
                     if (wMaxLen > MAXSTRLEN)        /* Truncate length */
                         wMaxLen = MAXSTRLEN;    
 
-                    *((WORD *)tstr) = wMaxLen;      /* set len */
+                    *((unsigned short *)tstr) = wMaxLen;      /* set len */
 
-                    iOK = DeviceIoControl(aHand1401[hand],(DWORD)U14_GETSTRING,
+                    iOK = DeviceIoControl(aHand1401[hand],(unsigned int)U14_GETSTRING,
                                     NULL, 0, tstr, wMaxLen+sizeof(short),
                                     &dwBytes, NULL);
                     if (iOK)                        /* Device IO control OK ? */
@@ -1768,7 +1768,7 @@
                         char* pMem = (char*)GlobalLock(hMem);
                         if (pMem)
                         {
-                            int iOK = DeviceIoControl(aHand1401[hand],(DWORD)U14_GETSTRING,
+                            int iOK = DeviceIoControl(aHand1401[hand],(unsigned int)U14_GETSTRING,
                                             NULL, 0, pMem, wMaxLen+sizeof(short),
                                             &dwBytes, NULL);
                             if (iOK)                /* Device IO control OK ? */
@@ -1946,7 +1946,7 @@
 **       other functions after getting an error and before using
 **       this function.
 ****************************************************************************/
-U14API(void)  U14GetErrorString(short nErr, char* pStr, WORD wMax)
+U14API(void)  U14GetErrorString(short nErr, char* pStr, unsigned short wMax)
 {
     char    wstr[150];
 
@@ -2105,7 +2105,7 @@
         break;
 
     }
-    if ((WORD)strlen(wstr) >= wMax-1)  /* Check for string being too long */
+    if ((unsigned short)strlen(wstr) >= wMax-1)  /* Check for string being too long */
         wstr[wMax-1] = 0;                          /* and truncate it if so */
     strcpy(pStr, wstr);                       /* Return the error string */
 }
@@ -2120,8 +2120,8 @@
 #ifdef _IS_WINDOWS_
     if (sErr == U14ERR_NOERROR)
     { 
-        DWORD dwBytes = 0;
-        BOOL bOK = DeviceIoControl(aHand1401[hand], (DWORD)U14_GETTRANSFER, NULL, 0, pTransBlock,
+        unsigned int dwBytes = 0;
+        BOOL bOK = DeviceIoControl(aHand1401[hand], (unsigned int)U14_GETTRANSFER, NULL, 0, pTransBlock,
                               sizeof(TGET_TX_BLOCK), &dwBytes, NULL);
     
         if (bOK && (dwBytes >= sizeof(TGET_TX_BLOCK)))
@@ -2145,12 +2145,12 @@
 //     1 unable to access process (insufficient rights?)
 //     2 unable to read process working set
 //     3 unable to set process working set - bad parameters?
-U14API(short) U14WorkingSet(DWORD dwMinKb, DWORD dwMaxKb)
+U14API(short) U14WorkingSet(unsigned int dwMinKb, unsigned int dwMaxKb)
 {
 #ifdef _IS_WINDOWS_
     short sRetVal = 0;                      // 0 means all is OK
     HANDLE hProcess;
-    DWORD dwVer = GetVersion();
+    unsigned int dwVer = GetVersion();
 	if (dwVer & 0x80000000)                 // is this not NT?
         return 0;                           // then give up right now
 
@@ -2164,8 +2164,8 @@
         SIZE_T dwMinSize,dwMaxSize;
         if (GetProcessWorkingSetSize(hProcess, &dwMinSize, &dwMaxSize))
         {
-            DWORD dwMin = dwMinKb << 10;    // convert from kb to bytes
-            DWORD dwMax = dwMaxKb << 10;
+            unsigned int dwMin = dwMinKb << 10;    // convert from kb to bytes
+            unsigned int dwMax = dwMaxKb << 10;
 
             // if we get here, we have managed to read the current size
             if (dwMin > dwMinSize)          // need to change sizes?
@@ -2200,7 +2200,7 @@
 ** U14UnSetTransfer  Cancels a transfer area
 ** wArea    The index of a block previously used in by SetTransfer
 *****************************************************************************/
-U14API(short) U14UnSetTransfer(short hand, WORD wArea)
+U14API(short) U14UnSetTransfer(short hand, unsigned short wArea)
 {
     short sErr = CheckHandle(hand);
 #ifdef _IS_WINDOWS_
@@ -2223,13 +2223,13 @@
 
 /****************************************************************************
 ** U14SetTransArea      Sets an area up to be used for transfers
-** WORD  wArea     The area number to set up
+** unsigned short  wArea     The area number to set up
 ** void *pvBuff    The address of the buffer for the data.
-** DWORD dwLength  The length of the buffer for the data
+** unsigned int dwLength  The length of the buffer for the data
 ** short eSz       The element size (used for byte swapping on the Mac)
 ****************************************************************************/
-U14API(short) U14SetTransArea(short hand, WORD wArea, void *pvBuff,
-                                          DWORD dwLength, short eSz)
+U14API(short) U14SetTransArea(short hand, unsigned short wArea, void *pvBuff,
+                                          unsigned int dwLength, short eSz)
 {
     TRANSFERDESC td;
     short sErr = CheckHandle(hand);
@@ -2254,7 +2254,7 @@
 #ifndef _WIN64
     if (!USE_NT_DIOC(hand))                         /* Use Win 9x DIOC? */
     {
-        DWORD dwBytes;
+        unsigned int dwBytes;
         VXTRANSFERDESC vxDesc;                      /* Structure to pass to VXD */
         vxDesc.wArea = wArea;                       /* Copy across simple params */
         vxDesc.dwLength = dwLength;
@@ -2264,10 +2264,10 @@
             sErr = U14ERR_DRIVTOOOLD;
         else
         {
-            vxDesc.dwAddrOfs = (DWORD)pvBuff;       /* 32 bit offset */
+            vxDesc.dwAddrOfs = (unsigned int)pvBuff;       /* 32 bit offset */
             vxDesc.wAddrSel  = 0;
 
-            if (DeviceIoControl(aHand1401[hand], (DWORD)U14_SETTRANSFER,
+            if (DeviceIoControl(aHand1401[hand], (unsigned int)U14_SETTRANSFER,
                                 pvBuff,dwLength,    /* Will translate pointer */
                                 &vxDesc,sizeof(VXTRANSFERDESC),
                                 &dwBytes,NULL))
@@ -2285,13 +2285,13 @@
 #endif
     {
         PARAMBLK rWork;
-        DWORD dwBytes;
+        unsigned int dwBytes;
         td.wArea = wArea;     /* Pure NT - put data into struct */
         td.lpvBuff = pvBuff;
         td.dwLength = dwLength;
         td.eSize = 0;                // Dummy element size
 
-        if (DeviceIoControl(aHand1401[hand],(DWORD)U14_SETTRANSFER,
+        if (DeviceIoControl(aHand1401[hand],(unsigned int)U14_SETTRANSFER,
                             &td,sizeof(TRANSFERDESC),
                             &rWork,sizeof(PARAMBLK),&dwBytes,NULL))
         {
@@ -2344,8 +2344,8 @@
 ** Returns 1 if an event handle exists, 0 if all OK and no event handle or
 ** a negative code for an error.
 ****************************************************************************/
-U14API(short) U14SetTransferEvent(short hand, WORD wArea, BOOL bEvent,
-                                  BOOL bToHost, DWORD dwStart, DWORD dwLength)
+U14API(short) U14SetTransferEvent(short hand, unsigned short wArea, BOOL bEvent,
+                                  BOOL bToHost, unsigned int dwStart, unsigned int dwLength)
 {
 #ifdef _IS_WINDOWS_
     TCSBLOCK csBlock;
@@ -2416,7 +2416,7 @@
 ** Would a U14WaitTransferEvent() call return immediately? return 1 if so,
 ** 0 if not or a negative code if a problem.
 ****************************************************************************/
-U14API(int) U14TestTransferEvent(short hand, WORD wArea)
+U14API(int) U14TestTransferEvent(short hand, unsigned short wArea)
 {
 #ifdef _IS_WINDOWS_
     int iErr = CheckHandle(hand);
@@ -2441,7 +2441,7 @@
 ** Returns   If no event handle then return immediately. Else return 1 if
 **           timed out or 0=event, and a negative code if a problem.
 ****************************************************************************/
-U14API(int) U14WaitTransferEvent(short hand, WORD wArea, int msTimeOut)
+U14API(int) U14WaitTransferEvent(short hand, unsigned short wArea, int msTimeOut)
 {
 #ifdef _IS_WINDOWS_
     int iErr = CheckHandle(hand);
@@ -2466,13 +2466,13 @@
 
 /****************************************************************************
 ** U14SetCircular    Sets an area up for circular DMA transfers
-** WORD  wArea          The area number to set up
+** unsigned short  wArea          The area number to set up
 ** BOOL  bToHost        Sets the direction of data transfer
 ** void *pvBuff        The address of the buffer for the data
-** DWORD dwLength       The length of the buffer for the data
+** unsigned int dwLength       The length of the buffer for the data
 ****************************************************************************/
-U14API(short) U14SetCircular(short hand, WORD wArea, BOOL bToHost,
-									void *pvBuff, DWORD dwLength)
+U14API(short) U14SetCircular(short hand, unsigned short wArea, BOOL bToHost,
+									void *pvBuff, unsigned int dwLength)
 {
     short sErr = CheckHandle(hand);
     if (sErr != U14ERR_NOERROR)
@@ -2495,14 +2495,14 @@
     else
     {
         PARAMBLK rWork;
-        DWORD dwBytes;
+        unsigned int dwBytes;
         TRANSFERDESC txDesc;
         txDesc.wArea = wArea;             /* Pure NT - put data into struct */
         txDesc.lpvBuff = pvBuff;
         txDesc.dwLength = dwLength;
         txDesc.eSize = (short)bToHost;       /* Use this for direction flag */
    
-        if (DeviceIoControl(aHand1401[hand],(DWORD)U14_SETCIRCULAR,
+        if (DeviceIoControl(aHand1401[hand],(unsigned int)U14_SETCIRCULAR,
                            &txDesc, sizeof(TRANSFERDESC),
                            &rWork, sizeof(PARAMBLK),&dwBytes,NULL))
         {
@@ -2542,7 +2542,7 @@
 ** Function  GetCircBlk returns the size (& start offset) of the next
 **           available block of circular data.
 ****************************************************************************/
-U14API(int) U14GetCircBlk(short hand, WORD wArea, DWORD *pdwOffs)
+U14API(int) U14GetCircBlk(short hand, unsigned short wArea, unsigned int *pdwOffs)
 {
     int lErr = CheckHandle(hand);
     if (lErr != U14ERR_NOERROR)
@@ -2555,10 +2555,10 @@
 #ifdef _IS_WINDOWS_
         PARAMBLK rWork;
         TCSBLOCK csBlock;
-        DWORD dwBytes;
+        unsigned int dwBytes;
         csBlock.longs[0] = wArea;               // Area number into control block
         rWork.sState = U14ERR_DRIVCOMMS;
-        if (DeviceIoControl(aHand1401[hand], (DWORD)U14_GETCIRCBLK, &csBlock, sizeof(TCSBLOCK), &rWork, sizeof(PARAMBLK), &dwBytes, NULL) &&
+        if (DeviceIoControl(aHand1401[hand], (unsigned int)U14_GETCIRCBLK, &csBlock, sizeof(TCSBLOCK), &rWork, sizeof(PARAMBLK), &dwBytes, NULL) &&
            (dwBytes >= sizeof(PARAMBLK)))
             lErr = rWork.sState;
         else
@@ -2591,8 +2591,8 @@
 **           resuse for circular transfers and returns the size (& start
 **           offset) of the next available block of circular data.
 ****************************************************************************/
-U14API(int) U14FreeCircBlk(short hand, WORD wArea, DWORD dwOffs, DWORD dwSize,
-                                        DWORD *pdwOffs)
+U14API(int) U14FreeCircBlk(short hand, unsigned short wArea, unsigned int dwOffs, unsigned int dwSize,
+                                        unsigned int *pdwOffs)
 {
     int lErr = CheckHandle(hand);
     if (lErr != U14ERR_NOERROR)
@@ -2603,12 +2603,12 @@
 #ifdef _IS_WINDOWS_
         PARAMBLK rWork;
         TCSBLOCK csBlock;
-        DWORD dwBytes;
+        unsigned int dwBytes;
         csBlock.longs[0] = wArea;               // Area number into control block
         csBlock.longs[1] = dwOffs;
         csBlock.longs[2] = dwSize;
         rWork.sState = U14ERR_DRIVCOMMS;
-        if (DeviceIoControl(aHand1401[hand], (DWORD)U14_FREECIRCBLK, &csBlock, sizeof(TCSBLOCK),
+        if (DeviceIoControl(aHand1401[hand], (unsigned int)U14_FREECIRCBLK, &csBlock, sizeof(TCSBLOCK),
                            &rWork, sizeof(PARAMBLK), &dwBytes, NULL) &&
            (dwBytes >= sizeof(PARAMBLK)))
            lErr = rWork.sState;
@@ -2647,7 +2647,7 @@
 ** which it should be to get a pointer
 *****************************************************************************/
 static short Transfer(short hand, BOOL bTo1401, char* pData,
-                       DWORD dwSize, DWORD dw1401, short eSz)
+                       unsigned int dwSize, unsigned int dw1401, short eSz)
 {
     char strcopy[MAXSTRLEN+1];          // to hold copy of work string
     short sResult = U14SetTransArea(hand, 0, (void *)pData, dwSize, eSz);
@@ -2670,8 +2670,8 @@
 /****************************************************************************
 ** Function  ToHost transfers data into the host from the 1401
 ****************************************************************************/
-U14API(short) U14ToHost(short hand, char* pAddrHost, DWORD dwSize,
-                                            DWORD dw1401, short eSz)
+U14API(short) U14ToHost(short hand, char* pAddrHost, unsigned int dwSize,
+                                            unsigned int dw1401, short eSz)
 {
     short sErr = CheckHandle(hand);
     if ((sErr == U14ERR_NOERROR) && dwSize) // TOHOST is a constant
@@ -2682,8 +2682,8 @@
 /****************************************************************************
 ** Function  To1401 transfers data into the 1401 from the host
 ****************************************************************************/
-U14API(short) U14To1401(short hand, const char* pAddrHost,DWORD dwSize,
-                                    DWORD dw1401, short eSz)
+U14API(short) U14To1401(short hand, const char* pAddrHost,unsigned int dwSize,
+                                    unsigned int dw1401, short eSz)
 {
     short sErr = CheckHandle(hand);
     if ((sErr == U14ERR_NOERROR) && dwSize) // TO1401 is a constant
@@ -2707,7 +2707,7 @@
 #define file_close(h)   close(h)
 #define file_seek(h, pos) lseek(h, pos, SEEK_SET) 
 #define file_read(h, buffer, size) (read(h, buffer, size) == (ssize_t)size)
-static DWORD GetModuleFileName(void* dummy, char* buffer, int max)
+static unsigned int GetModuleFileName(void* dummy, char* buffer, int max)
 {
     // The following works for Linux systems with a /proc file system.
     char szProcPath[32];
@@ -2766,7 +2766,7 @@
     // application was run from.
     if (!bGotIt)                            // Still not got it?
     {
-        DWORD dwLen = GetModuleFileName(NULL, filnam, FNSZ); // Get app path
+        unsigned int dwLen = GetModuleFileName(NULL, filnam, FNSZ); // Get app path
         if (dwLen > 0)                      // and use it as path if found
         {
             char* pStr = strrchr(filnam, PATHSEP);    // Point to last separator
@@ -2821,7 +2821,7 @@
                 file_seek(iFHandle, sizeof(CMDHEAD));
                 if (file_read(iFHandle, pMem, (UINT)nComSize))
                 {
-                    sErr = U14SetTransArea(hand, 0, (void *)pMem, (DWORD)nComSize, ESZBYTES);
+                    sErr = U14SetTransArea(hand, 0, (void *)pMem, (unsigned int)nComSize, ESZBYTES);
                     if (sErr == U14ERR_NOERROR)
                     {
                         sprintf(strcopy, "CLOAD,0,$%X;", (int)nComSize);
@@ -2858,9 +2858,9 @@
 ** Returns NOERROR code or a long with error in lo word and index of
 ** command that failed in high word
 ****************************************************************************/
-U14API(DWORD) U14Ld(short hand, const char* vl, const char* str)
+U14API(unsigned int) U14Ld(short hand, const char* vl, const char* str)
 {
-    DWORD dwIndex = 0;              // index to current command
+    unsigned int dwIndex = 0;              // index to current command
     long lErr = U14ERR_NOERROR;     // what the error was that went wrong
     char strcopy[MAXSTRLEN+1];      // stores unmodified str parameter
     char szFExt[8];                 // The command file extension
@@ -2939,7 +2939,7 @@
         return lErr;
     }
     else
-        return ((dwIndex<<16) | ((DWORD)lErr & 0x0000FFFF));
+        return ((dwIndex<<16) | ((unsigned int)lErr & 0x0000FFFF));
 }
 
 // Initialise the library (if not initialised) and return the library version
@@ -2951,7 +2951,7 @@
         int i;
 #ifdef _IS_WINDOWS_
         int j;
-        DWORD   dwVersion = GetVersion();
+        unsigned int   dwVersion = GetVersion();
         bWindows9x = FALSE;                  // Assume not Win9x
 
         if (dwVersion & 0x80000000)                 // if not windows NT
@@ -2993,12 +2993,12 @@
 #ifdef _IS_WINDOWS_
 #ifndef U14_NOT_DLL
 /****************************************************************************
-** FUNCTION: DllMain(HANDLE, DWORD, LPVOID)
+** FUNCTION: DllMain(HANDLE, unsigned int, LPVOID)
 ** LibMain is called by Windows when the DLL is initialized, Thread Attached,
 ** and other times. Refer to SDK documentation, as to the different ways this
 ** may be called.
 ****************************************************************************/
-INT APIENTRY DllMain(HANDLE hInst, DWORD ul_reason_being_called, LPVOID lpReserved)
+INT APIENTRY DllMain(HANDLE hInst, unsigned int ul_reason_being_called, LPVOID lpReserved)
 {
     int iRetVal = 1;
 

diff --git a/drivers/staging/comedi/Kconfig b/drivers/staging/comedi/Kconfig
index 87e852a..8c8a551 100644
--- a/drivers/staging/comedi/Kconfig
+++ b/drivers/staging/comedi/Kconfig

@@ -110,15 +110,6 @@
 
 if COMEDI_ISA_DRIVERS
 
-config COMEDI_ACL7225B
-	tristate "ADlink NuDAQ ACL-7225b and compatibles support"
-	---help---
-	  Enable support for ADlink NuDAQ ACL-7225b and compatibles,
-	  ADlink ACL-7225b (acl7225b), ICP P16R16DIO (p16r16dio)
-
-	  To compile this driver as a module, choose M here: the module will be
-	  called acl7225b.
-
 config COMEDI_PCL711
 	tristate "Advantech PCL-711/711b and ADlink ACL-8112 ISA card support"
 	---help---
@@ -137,14 +128,6 @@
 	  To compile this driver as a module, choose M here: the module will be
 	  called pcl724.
 
-config COMEDI_PCL725
-	tristate "Advantech PCL-725 and compatible ISA card support"
-	---help---
-	  Enable support for Advantech PCL-725 and compatible ISA cards.
-
-	  To compile this driver as a module, choose M here: the module will be
-	  called pcl725.
-
 config COMEDI_PCL726
 	tristate "Advantech PCL-726 and compatible ISA card support"
 	---help---
@@ -154,10 +137,21 @@
 	  called pcl726.
 
 config COMEDI_PCL730
-	tristate "Advantech PCL-730 and ADlink ACL-7130 ISA card support"
+	tristate "Simple Digital I/O board support (8-bit ports)"
 	---help---
-	  Enable support for Advantech PCL-730, ICP ISO-730 and ADlink
-	  ACL-7130 ISA cards
+	  Enable support for various simple ISA or PC/104 Digital I/O boards.
+	  These boards all use 8-bit I/O ports.
+
+	  Advantech PCL-730   isolated - 16 in/16 out  ttl - 16 in/16 out
+	  ICP ISO-730         isolated - 16 in/16 out  ttl - 16 in/16 out
+	  ADlink ACL-7130     isolated - 16 in/16 out  ttl - 16 in/16 out
+	  Advantech PCM-3730  isolated - 8 in/8 out    ttl - 16 in/16 out
+	  Advantech PCL-725   isolated - 8 in/8 out
+	  ICP P8R8-DIO        isolated - 8 in/8 out
+	  ADlink ACL-7225b    isolated - 16 in/16 out
+	  ICP P16R16-DIO      isolated - 16 in/16 out
+	  Advantech PCL-733   isolated - 32 in
+	  Advantech PCL-734   isolated - 32 out
 
 	  To compile this driver as a module, choose M here: the module will be
 	  called pcl730.
@@ -201,14 +195,6 @@
 	  To compile this driver as a module, choose M here: the module will be
 	  called pcm3724.
 
-config COMEDI_PCM3730
-	tristate "Advantech PCM-3730 and clone PC/104 board support"
-	---help---
-	  Enable support for Advantech PCM-3730 and clone PC/104 boards
-
-	  To compile this driver as a module, choose M here: the module will be
-	  called pcm3730.
-
 config COMEDI_AMPLC_DIO200_ISA
 	tristate "Amplicon PC212E/PC214E/PC215E/PC218E/PC272E"
 	select COMEDI_AMPLC_DIO200
@@ -543,12 +529,19 @@
 	tristate "Generic driver for very simple devices"
 	---help---
 	  Enable generic support for very simple / POC (Piece of Crap) boards,
-	  Keithley Metrabyte DAC-02 (dac02), Advantech PCL-733 (pcl733) and
-	  PCL-734 (pcl734)
+	  Keithley Metrabyte DAC-02 (dac02).
 
 	  To compile this driver as a module, choose M here: the module will be
 	  called poc.
 
+config COMEDI_S526
+	tristate "Sensoray s526 support"
+	---help---
+	  Enable support for Sensoray s526
+
+	  To compile this driver as a module, choose M here: the module will be
+	  called s526.
+
 endif # COMEDI_ISA_DRIVERS
 
 menuconfig COMEDI_PCI_DRIVERS
@@ -1076,14 +1069,6 @@
 	  To compile this driver as a module, choose M here: the module will be
 	  called rtd520.
 
-config COMEDI_S526
-	tristate "Sensoray s526 support"
-	---help---
-	  Enable support for Sensoray s526
-
-	  To compile this driver as a module, choose M here: the module will be
-	  called s526.
-
 config COMEDI_S626
 	tristate "Sensoray 626 support"
 	select COMEDI_FC

diff --git a/drivers/staging/comedi/comedi.h b/drivers/staging/comedi/comedi.h
index 4233605..6bbbe5b 100644
--- a/drivers/staging/comedi/comedi.h
+++ b/drivers/staging/comedi/comedi.h

@@ -14,11 +14,6 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 
 #ifndef _COMEDI_H

diff --git a/drivers/staging/comedi/comedi_buf.c b/drivers/staging/comedi/comedi_buf.c
index d4be0e6..b4c001b 100644
--- a/drivers/staging/comedi/comedi_buf.c
+++ b/drivers/staging/comedi/comedi_buf.c

@@ -13,10 +13,6 @@
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
 #include "comedidev.h"

diff --git a/drivers/staging/comedi/comedi_compat32.c b/drivers/staging/comedi/comedi_compat32.c
index ad208cd..2dfb06a 100644
--- a/drivers/staging/comedi/comedi_compat32.c
+++ b/drivers/staging/comedi/comedi_compat32.c

@@ -17,11 +17,6 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 
 #include <linux/uaccess.h>

diff --git a/drivers/staging/comedi/comedi_compat32.h b/drivers/staging/comedi/comedi_compat32.h
index 60cf51c..28e3c30 100644
--- a/drivers/staging/comedi/comedi_compat32.h
+++ b/drivers/staging/comedi/comedi_compat32.h

@@ -17,11 +17,6 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 
 #ifndef _COMEDI_COMPAT32_H

diff --git a/drivers/staging/comedi/comedi_fops.c b/drivers/staging/comedi/comedi_fops.c
index 924c54c..0794aac 100644
--- a/drivers/staging/comedi/comedi_fops.c
+++ b/drivers/staging/comedi/comedi_fops.c

@@ -14,11 +14,6 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 
 #undef DEBUG
@@ -536,6 +531,23 @@
 	return (runflags & (SRF_ERROR | SRF_RUNNING)) ? false : true;
 }
 
+/**
+ * comedi_alloc_spriv() - Allocate memory for the subdevice private data.
+ * @s: comedi_subdevice struct
+ * @size: size of the memory to allocate
+ *
+ * This also sets the subdevice runflags to allow the core to automatically
+ * free the private data during the detach.
+ */
+void *comedi_alloc_spriv(struct comedi_subdevice *s, size_t size)
+{
+	s->private = kzalloc(size, GFP_KERNEL);
+	if (s->private)
+		comedi_set_subdevice_runflags(s, ~0, SRF_FREE_SPRIV);
+	return s->private;
+}
+EXPORT_SYMBOL_GPL(comedi_alloc_spriv);
+
 /*
    This function restores a subdevice to an idle state.
  */
@@ -665,7 +677,7 @@
 	if (copy_from_user(&bc, arg, sizeof(bc)))
 		return -EFAULT;
 
-	if (bc.subdevice >= dev->n_subdevices || bc.subdevice < 0)
+	if (bc.subdevice >= dev->n_subdevices)
 		return -EINVAL;
 
 	s = &dev->subdevices[bc.subdevice];
@@ -918,7 +930,7 @@
 	if (copy_from_user(&bi, arg, sizeof(bi)))
 		return -EFAULT;
 
-	if (bi.subdevice >= dev->n_subdevices || bi.subdevice < 0)
+	if (bi.subdevice >= dev->n_subdevices)
 		return -EINVAL;
 
 	s = &dev->subdevices[bi.subdevice];

diff --git a/drivers/staging/comedi/comedi_pci.c b/drivers/staging/comedi/comedi_pci.c
index 5fad084..abbc0e4 100644
--- a/drivers/staging/comedi/comedi_pci.c
+++ b/drivers/staging/comedi/comedi_pci.c

@@ -14,10 +14,6 @@
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
 #include <linux/pci.h>

diff --git a/drivers/staging/comedi/comedi_pcmcia.c b/drivers/staging/comedi/comedi_pcmcia.c
index 453ff3b..9d49d5d 100644
--- a/drivers/staging/comedi/comedi_pcmcia.c
+++ b/drivers/staging/comedi/comedi_pcmcia.c

@@ -14,10 +14,6 @@
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
 #include <linux/kernel.h>

diff --git a/drivers/staging/comedi/comedi_usb.c b/drivers/staging/comedi/comedi_usb.c
index 9d9716a..13f18be 100644
--- a/drivers/staging/comedi/comedi_usb.c
+++ b/drivers/staging/comedi/comedi_usb.c

@@ -14,10 +14,6 @@
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
 #include <linux/usb.h>
@@ -35,6 +31,18 @@
 EXPORT_SYMBOL_GPL(comedi_to_usb_interface);
 
 /**
+ * comedi_to_usb_dev() - comedi_device pointer to usb_device pointer.
+ * @dev: comedi_device struct
+ */
+struct usb_device *comedi_to_usb_dev(struct comedi_device *dev)
+{
+	struct usb_interface *intf = comedi_to_usb_interface(dev);
+
+	return intf ? interface_to_usbdev(intf) : NULL;
+}
+EXPORT_SYMBOL_GPL(comedi_to_usb_dev);
+
+/**
  * comedi_usb_auto_config() - Configure/probe a comedi USB driver.
  * @intf: usb_interface struct
  * @driver: comedi_driver struct

diff --git a/drivers/staging/comedi/comedidev.h b/drivers/staging/comedi/comedidev.h
index cdd4720..b75915f 100644
--- a/drivers/staging/comedi/comedidev.h
+++ b/drivers/staging/comedi/comedidev.h

@@ -14,11 +14,6 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 
 #ifndef _COMEDIDEV_H
@@ -270,11 +265,14 @@
 	/* indicates an COMEDI_CB_ERROR event has occurred since the last
 	 * command was started */
 	SRF_ERROR = 0x00000004,
-	SRF_RUNNING = 0x08000000
+	SRF_RUNNING = 0x08000000,
+	SRF_FREE_SPRIV = 0x80000000,	/* free s->private on detach */
 };
 
 bool comedi_is_subdevice_running(struct comedi_subdevice *s);
 
+void *comedi_alloc_spriv(struct comedi_subdevice *s, size_t size);
+
 int comedi_check_chanlist(struct comedi_subdevice *s,
 			  int n,
 			  unsigned int *chanlist);
@@ -312,6 +310,18 @@
 	struct comedi_krange range[GCC_ZERO_LENGTH_ARRAY];
 };
 
+static inline bool comedi_range_is_bipolar(struct comedi_subdevice *s,
+					   unsigned int range)
+{
+	return s->range_table->range[range].min < 0;
+}
+
+static inline bool comedi_range_is_unipolar(struct comedi_subdevice *s,
+					    unsigned int range)
+{
+	return s->range_table->range[range].min >= 0;
+}
+
 /* some silly little inline functions */
 
 static inline unsigned int bytes_per_sample(const struct comedi_subdevice *subd)
@@ -349,7 +359,12 @@
 
 int comedi_alloc_subdevices(struct comedi_device *, int);
 
-void comedi_spriv_free(struct comedi_device *, int subdev_num);
+int comedi_load_firmware(struct comedi_device *, struct device *,
+			 const char *name,
+			 int (*cb)(struct comedi_device *,
+				   const u8 *data, size_t size,
+				   unsigned long context),
+			 unsigned long context);
 
 int __comedi_request_region(struct comedi_device *,
 			    unsigned long start, unsigned long len);
@@ -489,6 +504,7 @@
 struct usb_interface;
 
 struct usb_interface *comedi_to_usb_interface(struct comedi_device *);
+struct usb_device *comedi_to_usb_dev(struct comedi_device *);
 
 int comedi_usb_auto_config(struct usb_interface *, struct comedi_driver *,
 			   unsigned long context);

diff --git a/drivers/staging/comedi/comedilib.h b/drivers/staging/comedi/comedilib.h
index ca92c43..1a78b15 100644
--- a/drivers/staging/comedi/comedilib.h
+++ b/drivers/staging/comedi/comedilib.h

@@ -14,11 +14,6 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 
 #ifndef _LINUX_COMEDILIB_H

diff --git a/drivers/staging/comedi/drivers.c b/drivers/staging/comedi/drivers.c
index 06d190f..e25eba5 100644
--- a/drivers/staging/comedi/drivers.c
+++ b/drivers/staging/comedi/drivers.c

@@ -14,11 +14,6 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 
 #include <linux/device.h>
@@ -38,6 +33,7 @@
 #include <linux/dma-mapping.h>
 #include <linux/io.h>
 #include <linux/interrupt.h>
+#include <linux/firmware.h>
 
 #include "comedidev.h"
 #include "comedi_internal.h"
@@ -87,18 +83,6 @@
 }
 EXPORT_SYMBOL_GPL(comedi_alloc_subdevices);
 
-void comedi_spriv_free(struct comedi_device *dev, int subdev_num)
-{
-	struct comedi_subdevice *s;
-
-	if (dev->subdevices && subdev_num < dev->n_subdevices) {
-		s = &dev->subdevices[subdev_num];
-		kfree(s->private);
-		s->private = NULL;
-	}
-}
-EXPORT_SYMBOL_GPL(comedi_spriv_free);
-
 static void cleanup_device(struct comedi_device *dev)
 {
 	int i;
@@ -107,6 +91,8 @@
 	if (dev->subdevices) {
 		for (i = 0; i < dev->n_subdevices; i++) {
 			s = &dev->subdevices[i];
+			if (s->runflags & SRF_FREE_SPRIV)
+				kfree(s->private);
 			comedi_free_subdevice_minor(s);
 			if (s->async) {
 				comedi_buf_alloc(dev, s, 0);
@@ -352,6 +338,38 @@
 }
 
 /**
+ * comedi_load_firmware() - Request and load firmware for a device.
+ * @dev: comedi_device struct
+ * @device: device struct for the comedi_device
+ * @name: the name of the firmware image
+ * @cb: callback to upload the firmware image
+ * @context: private context from the driver
+ */
+int comedi_load_firmware(struct comedi_device *dev,
+			 struct device *device,
+			 const char *name,
+			 int (*cb)(struct comedi_device *dev,
+				   const u8 *data, size_t size,
+				   unsigned long context),
+			 unsigned long context)
+{
+	const struct firmware *fw;
+	int ret;
+
+	if (!cb)
+		return -EINVAL;
+
+	ret = request_firmware(&fw, name, device);
+	if (ret == 0) {
+		ret = cb(dev, fw->data, fw->size, context);
+		release_firmware(fw);
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(comedi_load_firmware);
+
+/**
  * __comedi_request_region() - Request an I/O reqion for a legacy driver.
  * @dev: comedi_device struct
  * @start: base address of the I/O reqion

diff --git a/drivers/staging/comedi/drivers/8253.h b/drivers/staging/comedi/drivers/8253.h
index 429e0d6..3abedcd 100644
--- a/drivers/staging/comedi/drivers/8253.h
+++ b/drivers/staging/comedi/drivers/8253.h

@@ -14,11 +14,6 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 
 #ifndef _8253_H

diff --git a/drivers/staging/comedi/drivers/8255.c b/drivers/staging/comedi/drivers/8255.c
index 1d48aa6..94e1750 100644
--- a/drivers/staging/comedi/drivers/8255.c
+++ b/drivers/staging/comedi/drivers/8255.c

@@ -14,11 +14,6 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 /*
 Driver: 8255
@@ -81,7 +76,6 @@
 #include "../comedidev.h"
 
 #include <linux/ioport.h>
-#include <linux/slab.h>
 
 #include "comedi_fc.h"
 #include "8255.h"
@@ -290,15 +284,13 @@
 {
 	struct subdev_8255_private *spriv;
 
-	spriv = kzalloc(sizeof(*spriv), GFP_KERNEL);
+	spriv = comedi_alloc_spriv(s, sizeof(*spriv));
 	if (!spriv)
 		return -ENOMEM;
 
 	spriv->iobase	= iobase;
 	spriv->io	= io ? io : subdev_8255_io;
 
-	s->private	= spriv;
-
 	s->type		= COMEDI_SUBD_DIO;
 	s->subdev_flags	= SDF_READABLE | SDF_WRITABLE;
 	s->n_chan	= 24;
@@ -391,7 +383,6 @@
 			spriv = s->private;
 			release_region(spriv->iobase, _8255_SIZE);
 		}
-		comedi_spriv_free(dev, i);
 	}
 }
 

diff --git a/drivers/staging/comedi/drivers/8255.h b/drivers/staging/comedi/drivers/8255.h
index 0f6e749..4f16ea7 100644
--- a/drivers/staging/comedi/drivers/8255.h
+++ b/drivers/staging/comedi/drivers/8255.h

@@ -14,11 +14,6 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 
 #ifndef _8255_H

diff --git a/drivers/staging/comedi/drivers/8255_pci.c b/drivers/staging/comedi/drivers/8255_pci.c
index 76dec96..3d3547c 100644
--- a/drivers/staging/comedi/drivers/8255_pci.c
+++ b/drivers/staging/comedi/drivers/8255_pci.c

@@ -19,10 +19,6 @@
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
 /*
@@ -242,10 +238,7 @@
 static void pci_8255_detach(struct comedi_device *dev)
 {
 	struct pci_8255_private *devpriv = dev->private;
-	int i;
 
-	for (i = 0; i < dev->n_subdevices; i++)
-		comedi_spriv_free(dev, i);
 	if (devpriv && devpriv->mmio_base)
 		iounmap(devpriv->mmio_base);
 	comedi_pci_disable(dev);

diff --git a/drivers/staging/comedi/drivers/Makefile b/drivers/staging/comedi/drivers/Makefile
index 57e984f..dbb93e3 100644
--- a/drivers/staging/comedi/drivers/Makefile
+++ b/drivers/staging/comedi/drivers/Makefile

@@ -11,19 +11,16 @@
 obj-$(CONFIG_COMEDI_SKEL)		+= skel.o
 
 # Comedi ISA drivers
-obj-$(CONFIG_COMEDI_ACL7225B)		+= acl7225b.o
 obj-$(CONFIG_COMEDI_AMPLC_DIO200_ISA)	+= amplc_dio200.o
 obj-$(CONFIG_COMEDI_AMPLC_PC263_ISA)	+= amplc_pc263.o
 obj-$(CONFIG_COMEDI_PCL711)		+= pcl711.o
 obj-$(CONFIG_COMEDI_PCL724)		+= pcl724.o
-obj-$(CONFIG_COMEDI_PCL725)		+= pcl725.o
 obj-$(CONFIG_COMEDI_PCL726)		+= pcl726.o
 obj-$(CONFIG_COMEDI_PCL730)		+= pcl730.o
 obj-$(CONFIG_COMEDI_PCL812)		+= pcl812.o
 obj-$(CONFIG_COMEDI_PCL816)		+= pcl816.o
 obj-$(CONFIG_COMEDI_PCL818)		+= pcl818.o
 obj-$(CONFIG_COMEDI_PCM3724)		+= pcm3724.o
-obj-$(CONFIG_COMEDI_PCM3730)		+= pcm3730.o
 obj-$(CONFIG_COMEDI_RTI800)		+= rti800.o
 obj-$(CONFIG_COMEDI_RTI802)		+= rti802.o
 obj-$(CONFIG_COMEDI_DAS16M1)		+= das16m1.o
@@ -55,6 +52,7 @@
 obj-$(CONFIG_COMEDI_PCMUIO)		+= pcmuio.o
 obj-$(CONFIG_COMEDI_MULTIQ3)		+= multiq3.o
 obj-$(CONFIG_COMEDI_POC)		+= poc.o
+obj-$(CONFIG_COMEDI_S526)		+= s526.o
 
 # Comedi PCI drivers
 obj-$(CONFIG_COMEDI_8255_PCI)		+= 8255_pci.o
@@ -110,7 +108,6 @@
 obj-$(CONFIG_COMEDI_NI_PCIDIO)		+= ni_pcidio.o
 obj-$(CONFIG_COMEDI_NI_PCIMIO)		+= ni_pcimio.o
 obj-$(CONFIG_COMEDI_RTD520)		+= rtd520.o
-obj-$(CONFIG_COMEDI_S526)		+= s526.o
 obj-$(CONFIG_COMEDI_S626)		+= s626.o
 obj-$(CONFIG_COMEDI_SSV_DNP)		+= ssv_dnp.o
 

diff --git a/drivers/staging/comedi/drivers/acl7225b.c b/drivers/staging/comedi/drivers/acl7225b.c
deleted file mode 100644
index 9e2c7ae..0000000
--- a/drivers/staging/comedi/drivers/acl7225b.c
+++ /dev/null

@@ -1,136 +0,0 @@
-/*
- * comedi/drivers/acl7225b.c
- * Driver for Adlink NuDAQ ACL-7225b and clones
- * José Luis Sánchez
- */
-/*
-Driver: acl7225b
-Description: Adlink NuDAQ ACL-7225b & compatibles
-Author: José Luis Sánchez (jsanchezv@teleline.es)
-Status: testing
-Devices: [Adlink] ACL-7225b (acl7225b), [ICP] P16R16DIO (p16r16dio)
-*/
-
-#include "../comedidev.h"
-
-#include <linux/ioport.h>
-
-#define ACL7225_RIO_LO 0	/* Relays input/output low byte (R0-R7) */
-#define ACL7225_RIO_HI 1	/* Relays input/output high byte (R8-R15) */
-#define ACL7225_DI_LO  2	/* Digital input low byte (DI0-DI7) */
-#define ACL7225_DI_HI  3	/* Digital input high byte (DI8-DI15) */
-
-struct acl7225b_boardinfo {
-	const char *name;
-	int io_range;
-};
-
-static const struct acl7225b_boardinfo acl7225b_boards[] = {
-	{
-		.name		= "acl7225b",
-		.io_range	= 8,		/* only 4 are used */
-	}, {
-		.name		= "p16r16dio",
-		.io_range	= 4,
-	},
-};
-
-static int acl7225b_do_insn_bits(struct comedi_device *dev,
-				 struct comedi_subdevice *s,
-				 struct comedi_insn *insn,
-				 unsigned int *data)
-{
-	unsigned long reg = (unsigned long)s->private;
-	unsigned int mask = data[0];
-	unsigned int bits = data[1];
-
-	if (mask) {
-		s->state &= ~mask;
-		s->state |= (bits & mask);
-
-		if (mask & 0x00ff)
-			outb(s->state & 0xff, dev->iobase + reg);
-		if (mask & 0xff00)
-			outb((s->state >> 8), dev->iobase + reg + 1);
-	}
-
-	data[1] = s->state;
-
-	return insn->n;
-}
-
-static int acl7225b_di_insn_bits(struct comedi_device *dev,
-				 struct comedi_subdevice *s,
-				 struct comedi_insn *insn,
-				 unsigned int *data)
-{
-	unsigned long reg = (unsigned long)s->private;
-
-	data[1] = inb(dev->iobase + reg) |
-		  (inb(dev->iobase + reg + 1) << 8);
-
-	return insn->n;
-}
-
-static int acl7225b_attach(struct comedi_device *dev,
-			   struct comedi_devconfig *it)
-{
-	const struct acl7225b_boardinfo *board = comedi_board(dev);
-	struct comedi_subdevice *s;
-	int ret;
-
-	ret = comedi_request_region(dev, it->options[0], board->io_range);
-	if (ret)
-		return ret;
-
-	ret = comedi_alloc_subdevices(dev, 3);
-	if (ret)
-		return ret;
-
-	s = &dev->subdevices[0];
-	/* Relays outputs */
-	s->type		= COMEDI_SUBD_DO;
-	s->subdev_flags	= SDF_WRITABLE;
-	s->maxdata	= 1;
-	s->n_chan	= 16;
-	s->insn_bits	= acl7225b_do_insn_bits;
-	s->range_table	= &range_digital;
-	s->private	= (void *)ACL7225_RIO_LO;
-
-	s = &dev->subdevices[1];
-	/* Relays status */
-	s->type		= COMEDI_SUBD_DI;
-	s->subdev_flags	= SDF_READABLE;
-	s->maxdata	= 1;
-	s->n_chan	= 16;
-	s->insn_bits	= acl7225b_di_insn_bits;
-	s->range_table	= &range_digital;
-	s->private	= (void *)ACL7225_RIO_LO;
-
-	s = &dev->subdevices[2];
-	/* Isolated digital inputs */
-	s->type		= COMEDI_SUBD_DI;
-	s->subdev_flags	= SDF_READABLE;
-	s->maxdata	= 1;
-	s->n_chan	= 16;
-	s->insn_bits	= acl7225b_di_insn_bits;
-	s->range_table	= &range_digital;
-	s->private	= (void *)ACL7225_DI_LO;
-
-	return 0;
-}
-
-static struct comedi_driver acl7225b_driver = {
-	.driver_name	= "acl7225b",
-	.module		= THIS_MODULE,
-	.attach		= acl7225b_attach,
-	.detach		= comedi_legacy_detach,
-	.board_name	= &acl7225b_boards[0].name,
-	.num_names	= ARRAY_SIZE(acl7225b_boards),
-	.offset		= sizeof(struct acl7225b_boardinfo),
-};
-module_comedi_driver(acl7225b_driver);
-
-MODULE_DESCRIPTION("Comedi: NuDAQ ACL-7225B, 16 Relay & 16 Isolated DI Card");
-MODULE_AUTHOR("Comedi http://www.comedi.org");
-MODULE_LICENSE("GPL");

diff --git a/drivers/staging/comedi/drivers/addi-data/APCI1710_Chrono.c b/drivers/staging/comedi/drivers/addi-data/APCI1710_Chrono.c
index 5bd7fe6..d91f586 100644
--- a/drivers/staging/comedi/drivers/addi-data/APCI1710_Chrono.c
+++ b/drivers/staging/comedi/drivers/addi-data/APCI1710_Chrono.c

@@ -15,10 +15,6 @@
 
 This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
-You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-
-You should also find the complete GPL in the COPYING file accompanying this source code.
-
 @endverbatim
 */
 /*

diff --git a/drivers/staging/comedi/drivers/addi-data/APCI1710_Dig_io.c b/drivers/staging/comedi/drivers/addi-data/APCI1710_Dig_io.c
index 6b38ce7..27de18e 100644
--- a/drivers/staging/comedi/drivers/addi-data/APCI1710_Dig_io.c
+++ b/drivers/staging/comedi/drivers/addi-data/APCI1710_Dig_io.c

@@ -15,10 +15,6 @@
 
 This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
-You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-
-You should also find the complete GPL in the COPYING file accompanying this source code.
-
 @endverbatim
 */
 /*

diff --git a/drivers/staging/comedi/drivers/addi-data/APCI1710_INCCPT.c b/drivers/staging/comedi/drivers/addi-data/APCI1710_INCCPT.c
index 70a7f95..c9db601 100644
--- a/drivers/staging/comedi/drivers/addi-data/APCI1710_INCCPT.c
+++ b/drivers/staging/comedi/drivers/addi-data/APCI1710_INCCPT.c

@@ -15,10 +15,6 @@
 
 This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
-You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-
-You should also find the complete GPL in the COPYING file accompanying this source code.
-
 @endverbatim
 */
 /*

diff --git a/drivers/staging/comedi/drivers/addi-data/APCI1710_Inp_cpt.c b/drivers/staging/comedi/drivers/addi-data/APCI1710_Inp_cpt.c
index be0c6ad..6bbcb06 100644
--- a/drivers/staging/comedi/drivers/addi-data/APCI1710_Inp_cpt.c
+++ b/drivers/staging/comedi/drivers/addi-data/APCI1710_Inp_cpt.c

@@ -15,10 +15,6 @@
 
 This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
-You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-
-You should also find the complete GPL in the COPYING file accompanying this source code.
-
 @endverbatim
 */
 /*

diff --git a/drivers/staging/comedi/drivers/addi-data/APCI1710_Pwm.c b/drivers/staging/comedi/drivers/addi-data/APCI1710_Pwm.c
index a211e78..5c83033 100644
--- a/drivers/staging/comedi/drivers/addi-data/APCI1710_Pwm.c
+++ b/drivers/staging/comedi/drivers/addi-data/APCI1710_Pwm.c

@@ -15,10 +15,6 @@
 
 This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
-You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-
-You should also find the complete GPL in the COPYING file accompanying this source code.
-
 @endverbatim
 */
 /*

diff --git a/drivers/staging/comedi/drivers/addi-data/APCI1710_Ssi.c b/drivers/staging/comedi/drivers/addi-data/APCI1710_Ssi.c
index 97e7eec..6ef1d6a 100644
--- a/drivers/staging/comedi/drivers/addi-data/APCI1710_Ssi.c
+++ b/drivers/staging/comedi/drivers/addi-data/APCI1710_Ssi.c

@@ -15,10 +15,6 @@
 
 This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
-You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-
-You should also find the complete GPL in the COPYING file accompanying this source code.
-
 @endverbatim
 */
 /*

diff --git a/drivers/staging/comedi/drivers/addi-data/APCI1710_Tor.c b/drivers/staging/comedi/drivers/addi-data/APCI1710_Tor.c
index 3bc9826..0b79531 100644
--- a/drivers/staging/comedi/drivers/addi-data/APCI1710_Tor.c
+++ b/drivers/staging/comedi/drivers/addi-data/APCI1710_Tor.c

@@ -15,10 +15,6 @@
 
 This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
-You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-
-You should also find the complete GPL in the COPYING file accompanying this source code.
-
 @endverbatim
 */
 /*

diff --git a/drivers/staging/comedi/drivers/addi-data/APCI1710_Ttl.c b/drivers/staging/comedi/drivers/addi-data/APCI1710_Ttl.c
index c8238b8..fb56360 100644
--- a/drivers/staging/comedi/drivers/addi-data/APCI1710_Ttl.c
+++ b/drivers/staging/comedi/drivers/addi-data/APCI1710_Ttl.c

@@ -15,10 +15,6 @@
 
 This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
-You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-
-You should also find the complete GPL in the COPYING file accompanying this source code.
-
 @endverbatim
 */
 /*

diff --git a/drivers/staging/comedi/drivers/addi-data/addi_common.c b/drivers/staging/comedi/drivers/addi-data/addi_common.c
index 0c3db57..f25e008 100644
--- a/drivers/staging/comedi/drivers/addi-data/addi_common.c
+++ b/drivers/staging/comedi/drivers/addi-data/addi_common.c

@@ -20,13 +20,6 @@
 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
 PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
-You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc.,
-59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-
-You should also find the complete GPL in the COPYING file accompanying this
-source code.
-
 @endverbatim
 */
 /*
@@ -46,10 +39,6 @@
   +-----------------------------------------------------------------------+
 */
 
-#ifndef COMEDI_SUBD_TTLIO
-#define COMEDI_SUBD_TTLIO   11	/* Digital Input Output But TTL */
-#endif
-
 static int i_ADDIDATA_InsnReadEeprom(struct comedi_device *dev,
 				     struct comedi_subdevice *s,
 				     struct comedi_insn *insn,
@@ -105,23 +94,14 @@
 	if (ret)
 		return ret;
 
-	if (!this_board->pc_EepromChip ||
-	    strcmp(this_board->pc_EepromChip, ADDIDATA_9054)) {
-		/* board does not have an eeprom or is not ADDIDATA_9054 */
-		if (this_board->i_IorangeBase1)
-			dev->iobase = pci_resource_start(pcidev, 1);
-		else
-			dev->iobase = pci_resource_start(pcidev, 0);
+	if (this_board->i_IorangeBase1)
+		dev->iobase = pci_resource_start(pcidev, 1);
+	else
+		dev->iobase = pci_resource_start(pcidev, 0);
 
-		devpriv->iobase = dev->iobase;
-		devpriv->i_IobaseAmcc = pci_resource_start(pcidev, 0);
-		devpriv->i_IobaseAddon = pci_resource_start(pcidev, 2);
-	} else {
-		/* board has an ADDIDATA_9054 eeprom */
-		dev->iobase = pci_resource_start(pcidev, 2);
-		devpriv->iobase = pci_resource_start(pcidev, 2);
-		devpriv->dw_AiBase = pci_ioremap_bar(pcidev, 3);
-	}
+	devpriv->iobase = dev->iobase;
+	devpriv->i_IobaseAmcc = pci_resource_start(pcidev, 0);
+	devpriv->i_IobaseAddon = pci_resource_start(pcidev, 2);
 	devpriv->i_IobaseReserved = pci_resource_start(pcidev, 3);
 
 	/* Initialize parameters that can be overridden in EEPROM */
@@ -132,7 +112,6 @@
 	devpriv->s_EeParameters.i_NbrDiChannel = this_board->i_NbrDiChannel;
 	devpriv->s_EeParameters.i_NbrDoChannel = this_board->i_NbrDoChannel;
 	devpriv->s_EeParameters.i_DoMaxdata = this_board->i_DoMaxdata;
-	devpriv->s_EeParameters.i_Dma = this_board->i_Dma;
 	devpriv->s_EeParameters.i_Timer = this_board->i_Timer;
 	devpriv->s_EeParameters.ui_MinAcquisitiontimeNs =
 		this_board->ui_MinAcquisitiontimeNs;
@@ -191,9 +170,6 @@
 		s->len_chanlist = this_board->i_AiChannelList;
 		s->range_table = this_board->pr_AiRangelist;
 
-		/* Set the initialisation flag */
-		devpriv->b_AiInitialisation = 1;
-
 		s->insn_config = this_board->ai_config;
 		s->insn_read = this_board->ai_read;
 		s->insn_write = this_board->ai_write;
@@ -215,8 +191,6 @@
 		s->maxdata = devpriv->s_EeParameters.i_AoMaxdata;
 		s->len_chanlist =
 			devpriv->s_EeParameters.i_NbrAoChannel;
-		s->range_table = this_board->pr_AoRangelist;
-		s->insn_config = this_board->ao_config;
 		s->insn_write = this_board->ao_write;
 	} else {
 		s->type = COMEDI_SUBD_UNUSED;
@@ -281,22 +255,7 @@
 
 	/*  Allocate and Initialise TTL */
 	s = &dev->subdevices[5];
-	if (this_board->i_NbrTTLChannel) {
-		s->type = COMEDI_SUBD_TTLIO;
-		s->subdev_flags =
-			SDF_WRITEABLE | SDF_READABLE | SDF_GROUND | SDF_COMMON;
-		s->n_chan = this_board->i_NbrTTLChannel;
-		s->maxdata = 1;
-		s->io_bits = 0;	/* all bits input */
-		s->len_chanlist = this_board->i_NbrTTLChannel;
-		s->range_table = &range_digital;
-		s->insn_config = this_board->ttl_config;
-		s->insn_bits = this_board->ttl_bits;
-		s->insn_read = this_board->ttl_read;
-		s->insn_write = this_board->ttl_write;
-	} else {
-		s->type = COMEDI_SUBD_UNUSED;
-	}
+	s->type = COMEDI_SUBD_UNUSED;
 
 	/* EEPROM */
 	s = &dev->subdevices[6];
@@ -323,8 +282,6 @@
 			i_ADDI_Reset(dev);
 		if (dev->irq)
 			free_irq(dev->irq, dev);
-		if (devpriv->dw_AiBase)
-			iounmap(devpriv->dw_AiBase);
 	}
 	comedi_pci_disable(dev);
 }

diff --git a/drivers/staging/comedi/drivers/addi-data/addi_common.h b/drivers/staging/comedi/drivers/addi-data/addi_common.h
index c034bf1..f1be5ad 100644
--- a/drivers/staging/comedi/drivers/addi-data/addi_common.h
+++ b/drivers/staging/comedi/drivers/addi-data/addi_common.h

@@ -18,12 +18,8 @@
 #include <linux/sched.h>
 #include <linux/interrupt.h>
 
-#define LOBYTE(W)	(unsigned char)((W) & 0xFF)
-#define HIBYTE(W)	(unsigned char)(((W) >> 8) & 0xFF)
-#define MAKEWORD(H, L)	(unsigned short)((L) | ((H) << 8))
 #define LOWORD(W)	(unsigned short)((W) & 0xFFFF)
 #define HIWORD(W)	(unsigned short)(((W) >> 16) & 0xFFFF)
-#define MAKEDWORD(H, L)	(unsigned int)((L) | ((H) << 16))
 
 #define ADDI_ENABLE		1
 #define ADDI_DISABLE		0
@@ -33,8 +29,6 @@
 #define ADDIDATA_NO_EEPROM	0
 #define ADDIDATA_93C76		"93C76"
 #define ADDIDATA_S5920		"S5920"
-#define ADDIDATA_S5933		"S5933"
-#define ADDIDATA_9054		"9054"
 
 /* ADDIDATA Enable Disable */
 #define ADDIDATA_ENABLE		1
@@ -55,17 +49,12 @@
 	int i_AiMaxdata;	/*  resolution of A/D */
 	int i_AoMaxdata;	/*  resolution of D/A */
 	const struct comedi_lrange *pr_AiRangelist;	/* rangelist for A/D */
-	const struct comedi_lrange *pr_AoRangelist;	/* rangelist for D/A */
 
 	int i_NbrDiChannel;	/*  Number of DI channels */
 	int i_NbrDoChannel;	/*  Number of DO channels */
 	int i_DoMaxdata;	/*  data to set all channels high */
 
-	int i_NbrTTLChannel;	/*  Number of TTL channels */
-
-	int i_Dma;		/*  dma present or not */
 	int i_Timer;		/*    timer subdevice present or not */
-	unsigned char b_AvailableConvertUnit;
 	unsigned int ui_MinAcquisitiontimeNs;	/*  Minimum Acquisition in Nano secs */
 	unsigned int ui_MinDelaytimeNs;	/*  Minimum Delay in Nano secs */
 
@@ -90,12 +79,8 @@
 	int (*ai_cancel)(struct comedi_device *, struct comedi_subdevice *);
 
 	/* Analog Output */
-	int (*ao_config)(struct comedi_device *, struct comedi_subdevice *,
-			 struct comedi_insn *, unsigned int *);
 	int (*ao_write)(struct comedi_device *, struct comedi_subdevice *,
 			struct comedi_insn *, unsigned int *);
-	int (*ao_bits)(struct comedi_device *, struct comedi_subdevice *,
-		       struct comedi_insn *, unsigned int *);
 
 	/* Digital Input */
 	int (*di_config)(struct comedi_device *, struct comedi_subdevice *,
@@ -126,16 +111,6 @@
 			  struct comedi_insn *, unsigned int *);
 	int (*timer_bits)(struct comedi_device *, struct comedi_subdevice *,
 			  struct comedi_insn *, unsigned int *);
-
-	/* TTL IO */
-	int (*ttl_config)(struct comedi_device *, struct comedi_subdevice *,
-			  struct comedi_insn *, unsigned int *);
-	int (*ttl_bits)(struct comedi_device *, struct comedi_subdevice *,
-			struct comedi_insn *, unsigned int *);
-	int (*ttl_read)(struct comedi_device *, struct comedi_subdevice *,
-			struct comedi_insn *, unsigned int *);
-	int (*ttl_write)(struct comedi_device *, struct comedi_subdevice *,
-			 struct comedi_insn *, unsigned int *);
 };
 
 /* MODULE INFO STRUCTURE */
@@ -283,58 +258,41 @@
 
 /* Private structure for the addi_apci3120 driver */
 struct addi_private {
-
 	int iobase;
 	int i_IobaseAmcc;	/*  base+size for AMCC chip */
 	int i_IobaseAddon;	/* addon base address */
 	int i_IobaseReserved;
-	void __iomem *dw_AiBase;
 	unsigned char b_AiContinuous;	/*  we do unlimited AI */
-	unsigned char b_AiInitialisation;
 	unsigned int ui_AiActualScan;	/* how many scans we finished */
-	unsigned int ui_AiBufferPtr;	/*  data buffer ptr in samples */
 	unsigned int ui_AiNbrofChannels;	/*  how many channels is measured */
 	unsigned int ui_AiScanLength;	/*  Length of actual scanlist */
-	unsigned int ui_AiActualScanPosition;	/*  position in actual scan */
 	unsigned int *pui_AiChannelList;	/*  actual chanlist */
 	unsigned int ui_AiChannelList[32];	/*  actual chanlist */
-	unsigned char b_AiChannelConfiguration[32];	/*  actual chanlist */
 	unsigned int ui_AiReadData[32];
-	unsigned int dw_AiInitialised;
 	unsigned int ui_AiTimer0;	/* Timer Constant for Timer0 */
 	unsigned int ui_AiTimer1;	/* Timer constant for Timer1 */
 	unsigned int ui_AiFlags;
 	unsigned int ui_AiDataLength;
-	short *AiData;	/*  Pointer to sample data */
 	unsigned int ui_AiNbrofScans;	/*  number of scans to do */
 	unsigned short us_UseDma;	/*  To use Dma or not */
 	unsigned char b_DmaDoubleBuffer;	/*  we can use double buffering */
 	unsigned int ui_DmaActualBuffer;	/*  which buffer is used now */
-	/* UPDATE-0.7.57->0.7.68 */
-	/* unsigned int               ul_DmaBufferVirtual[2]; pointers to begin of DMA buffer */
 	short *ul_DmaBufferVirtual[2];	/*  pointers to begin of DMA buffer */
 	unsigned int ul_DmaBufferHw[2];	/*  hw address of DMA buff */
 	unsigned int ui_DmaBufferSize[2];	/*  size of dma buffer in bytes */
 	unsigned int ui_DmaBufferUsesize[2];	/*  which size we may now used for transfer */
-	unsigned int ui_DmaBufferSamples[2];	/*  size in samples */
 	unsigned int ui_DmaBufferPages[2];	/*  number of pages in buffer */
 	unsigned char b_DigitalOutputRegister;	/*  Digital Output Register */
 	unsigned char b_OutputMemoryStatus;
-	unsigned char b_AnalogInputChannelNbr;	/*  Analog input channel Nbr */
-	unsigned char b_AnalogOutputChannelNbr;	/*  Analog input Output  Nbr */
 	unsigned char b_TimerSelectMode;	/*  Contain data written at iobase + 0C */
 	unsigned char b_ModeSelectRegister;	/*  Contain data written at iobase + 0E */
 	unsigned short us_OutputRegister;	/*  Contain data written at iobase + 0 */
-	unsigned char b_InterruptState;
-	unsigned char b_TimerInit;	/*  Specify if InitTimerWatchdog was load */
-	unsigned char b_TimerStarted;	/*  Specify if timer 2 is running or not */
 	unsigned char b_Timer2Mode;	/*  Specify the timer 2 mode */
 	unsigned char b_Timer2Interrupt;	/* Timer2  interrupt enable or disable */
 	unsigned char b_AiCyclicAcquisition;	/*  indicate cyclic acquisition */
 	unsigned char b_InterruptMode;	/*  eoc eos or dma */
 	unsigned char b_EocEosInterrupt;	/*  Enable disable eoc eos interrupt */
 	unsigned int ui_EocEosConversionTime;
-	unsigned char b_EocEosConversionTimeBase;
 	unsigned char b_SingelDiff;
 	unsigned char b_ExttrigEnable;	/* To enable or disable external trigger */
 
@@ -365,7 +323,6 @@
 	} s_InterruptParameters;
 
 	union str_ModuleInfo s_ModuleInfo[4];
-	unsigned int ul_TTLPortConfiguration[10];
 
 	/* Parameters read from EEPROM overriding static board info */
 	struct {
@@ -376,7 +333,6 @@
 		int i_NbrDiChannel;	/*  Number of DI channels */
 		int i_NbrDoChannel;	/*  Number of DO channels */
 		int i_DoMaxdata;	/*  data to set all channels high */
-		int i_Dma;		/*  dma present or not */
 		int i_Timer;		/*  timer subdevice present or not */
 		unsigned int ui_MinAcquisitiontimeNs;
 					/*  Minimum Acquisition in Nano secs */

diff --git a/drivers/staging/comedi/drivers/addi-data/addi_eeprom.c b/drivers/staging/comedi/drivers/addi-data/addi_eeprom.c
index 5124ac9..dc031c4 100644
--- a/drivers/staging/comedi/drivers/addi-data/addi_eeprom.c
+++ b/drivers/staging/comedi/drivers/addi-data/addi_eeprom.c

@@ -20,13 +20,6 @@
  * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
  * for more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- * You should also find the complete GPL in the COPYING file accompanying
- * this source code.
  */
 
 #define NVRAM_USER_DATA_START	0x100
@@ -302,7 +295,7 @@
 	devpriv->s_EeParameters.ui_MinDelaytimeNs = tmp * 1000;
 
 	tmp = addi_eeprom_readw(iobase, type, addr + 20);
-	devpriv->s_EeParameters.i_Dma = (tmp >> 13) & 0x01;
+	/* dma = (tmp >> 13) & 0x01; */
 
 	tmp = addi_eeprom_readw(iobase, type, addr + 72) & 0xff;
 	if (tmp) {		/* > 0 */

diff --git a/drivers/staging/comedi/drivers/addi-data/hwdrv_APCI1710.c b/drivers/staging/comedi/drivers/addi-data/hwdrv_APCI1710.c
index b05f850..b1a7ec1 100644
--- a/drivers/staging/comedi/drivers/addi-data/hwdrv_APCI1710.c
+++ b/drivers/staging/comedi/drivers/addi-data/hwdrv_APCI1710.c

@@ -15,10 +15,6 @@
 
 This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
-You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-
-You should also find the complete GPL in the COPYING file accompanying this source code.
-
 @endverbatim
 */
 /*

diff --git a/drivers/staging/comedi/drivers/addi-data/hwdrv_apci035.c b/drivers/staging/comedi/drivers/addi-data/hwdrv_apci035.c
index 3d66e48..1128c22 100644
--- a/drivers/staging/comedi/drivers/addi-data/hwdrv_apci035.c
+++ b/drivers/staging/comedi/drivers/addi-data/hwdrv_apci035.c

@@ -15,10 +15,6 @@
 
 This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
-You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-
-You should also find the complete GPL in the COPYING file accompanying this source code.
-
 @endverbatim
 */
 /*

diff --git a/drivers/staging/comedi/drivers/addi-data/hwdrv_apci1500.c b/drivers/staging/comedi/drivers/addi-data/hwdrv_apci1500.c
index 24c4c98..0549105 100644
--- a/drivers/staging/comedi/drivers/addi-data/hwdrv_apci1500.c
+++ b/drivers/staging/comedi/drivers/addi-data/hwdrv_apci1500.c

@@ -15,10 +15,6 @@
 
 This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
-You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-
-You should also find the complete GPL in the COPYING file accompanying this source code.
-
 @endverbatim
 */
 /*

diff --git a/drivers/staging/comedi/drivers/addi-data/hwdrv_apci1564.c b/drivers/staging/comedi/drivers/addi-data/hwdrv_apci1564.c
index fc31c4b..e3cc429 100644
--- a/drivers/staging/comedi/drivers/addi-data/hwdrv_apci1564.c
+++ b/drivers/staging/comedi/drivers/addi-data/hwdrv_apci1564.c

@@ -15,10 +15,6 @@
 
 This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
-You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-
-You should also find the complete GPL in the COPYING file accompanying this source code.
-
 @endverbatim
 */
 /*

diff --git a/drivers/staging/comedi/drivers/addi-data/hwdrv_apci3120.c b/drivers/staging/comedi/drivers/addi-data/hwdrv_apci3120.c
index 74065ba..a89e505 100644
--- a/drivers/staging/comedi/drivers/addi-data/hwdrv_apci3120.c
+++ b/drivers/staging/comedi/drivers/addi-data/hwdrv_apci3120.c

@@ -15,10 +15,6 @@
 
 This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
-You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-
-You should also find the complete GPL in the COPYING file accompanying this source code.
-
 @endverbatim
 */
 /*
@@ -724,9 +720,7 @@
 	inb(dev->iobase + APCI3120_RESET_FIFO);
 	inw(dev->iobase + APCI3120_RD_STATUS);
 	devpriv->ui_AiActualScan = 0;
-	devpriv->ui_AiActualScanPosition = 0;
 	s->async->cur_chan = 0;
-	devpriv->ui_AiBufferPtr = 0;
 	devpriv->b_AiContinuous = 0;
 	devpriv->ui_DmaActualBuffer = 0;
 
@@ -895,9 +889,7 @@
 	/* END JK 07.05.04: Comparison between WIN32 and Linux driver */
 
 	devpriv->ui_AiActualScan = 0;
-	devpriv->ui_AiActualScanPosition = 0;
 	s->async->cur_chan = 0;
-	devpriv->ui_AiBufferPtr = 0;
 	devpriv->ui_DmaActualBuffer = 0;
 
 	/*  value for timer2  minus -2 has to be done .....dunno y?? */
@@ -1351,8 +1343,6 @@
 	devpriv->ui_AiScanLength = cmd->scan_end_arg;
 	devpriv->pui_AiChannelList = cmd->chanlist;
 
-	/* UPDATE-0.7.57->0.7.68devpriv->AiData=s->async->data; */
-	devpriv->AiData = s->async->prealloc_buf;
 	/* UPDATE-0.7.57->0.7.68devpriv->ui_AiDataLength=s->async->data_len; */
 	devpriv->ui_AiDataLength = s->async->prealloc_bufsz;
 

diff --git a/drivers/staging/comedi/drivers/addi-data/hwdrv_apci3200.c b/drivers/staging/comedi/drivers/addi-data/hwdrv_apci3200.c
index c790873..32dce03 100644
--- a/drivers/staging/comedi/drivers/addi-data/hwdrv_apci3200.c
+++ b/drivers/staging/comedi/drivers/addi-data/hwdrv_apci3200.c

@@ -15,10 +15,6 @@
 
 This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
-You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-
-You should also find the complete GPL in the COPYING file accompanying this source code.
-
 @endverbatim
 */
 /*

diff --git a/drivers/staging/comedi/drivers/addi-data/hwdrv_apci3xxx.c b/drivers/staging/comedi/drivers/addi-data/hwdrv_apci3xxx.c
deleted file mode 100644
index a45a2a2..0000000
--- a/drivers/staging/comedi/drivers/addi-data/hwdrv_apci3xxx.c
+++ /dev/null

@@ -1,1376 +0,0 @@
-/**
-@verbatim
-
-Copyright (C) 2004,2005  ADDI-DATA GmbH for the source code of this module.
-
-	ADDI-DATA GmbH
-	Dieselstrasse 3
-	D-77833 Ottersweier
-	Tel: +19(0)7223/9493-0
-	Fax: +49(0)7223/9493-92
-	http://www.addi-data.com
-	info@addi-data.com
-
-This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version.
-
-This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-
-You should also find the complete GPL in the COPYING file accompanying this source code.
-
-@endverbatim
-*/
-/*
-  +-----------------------------------------------------------------------+
-  | (C) ADDI-DATA GmbH          Dieselstrasse 3      D-77833 Ottersweier  |
-  +-----------------------------------------------------------------------+
-  | Tel : +49 (0) 7223/9493-0     | email    : info@addi-data.com         |
-  | Fax : +49 (0) 7223/9493-92    | Internet : http://www.addi-data.com   |
-  +-----------------------------------------------------------------------+
-  | Project     : APCI-3XXX       | Compiler   : GCC                      |
-  | Module name : hwdrv_apci3xxx.c| Version    : 2.96                     |
-  +-------------------------------+---------------------------------------+
-  | Project manager: S. Weber     | Date       :  15/09/2005              |
-  +-----------------------------------------------------------------------+
-  | Description :APCI3XXX Module.  Hardware abstraction Layer for APCI3XXX|
-  +-----------------------------------------------------------------------+
-  |                             UPDATE'S                                  |
-  +-----------------------------------------------------------------------+
-  |   Date   |   Author  |          Description of updates                |
-  +----------+-----------+------------------------------------------------+
-  |          | 		 | 						  |
-  |          |           |						  |
-  +----------+-----------+------------------------------------------------+
-*/
-
-#ifndef COMEDI_SUBD_TTLIO
-#define COMEDI_SUBD_TTLIO	11	/* Digital Input Output But TTL */
-#endif
-
-#define APCI3XXX_SINGLE				0
-#define APCI3XXX_DIFF				1
-#define APCI3XXX_CONFIGURATION			0
-
-#define APCI3XXX_TTL_INIT_DIRECTION_PORT2	0
-
-static const struct comedi_lrange range_apci3XXX_ai = {
-	8, {
-		BIP_RANGE(10),
-		BIP_RANGE(5),
-		BIP_RANGE(2),
-		BIP_RANGE(1),
-		UNI_RANGE(10),
-		UNI_RANGE(5),
-		UNI_RANGE(2),
-		UNI_RANGE(1)
-	}
-};
-
-static const struct comedi_lrange range_apci3XXX_ao = {
-	2, {
-		BIP_RANGE(10),
-		UNI_RANGE(10)
-	}
-};
-
-/*
-+----------------------------------------------------------------------------+
-|                         ANALOG INPUT FUNCTIONS                             |
-+----------------------------------------------------------------------------+
-*/
-
-/*
-+----------------------------------------------------------------------------+
-| Function Name     : int   i_APCI3XXX_TestConversionStarted                 |
-|                          (struct comedi_device    *dev)                           |
-+----------------------------------------------------------------------------+
-| Task                Test if any conversion started                         |
-+----------------------------------------------------------------------------+
-| Input Parameters  : -                                                      |
-+----------------------------------------------------------------------------+
-| Output Parameters : -                                                      |
-+----------------------------------------------------------------------------+
-| Return Value      : 0 : Conversion not started                             |
-|                     1 : Conversion started                                 |
-+----------------------------------------------------------------------------+
-*/
-static int i_APCI3XXX_TestConversionStarted(struct comedi_device *dev)
-{
-	struct addi_private *devpriv = dev->private;
-
-	if ((readl(devpriv->dw_AiBase + 8) & 0x80000UL) == 0x80000UL)
-		return 1;
-	else
-		return 0;
-
-}
-
-/*
-+----------------------------------------------------------------------------+
-| Function Name     : int   i_APCI3XXX_AnalogInputConfigOperatingMode        |
-|                          (struct comedi_device    *dev,                           |
-|                           struct comedi_subdevice *s,                             |
-|                           struct comedi_insn      *insn,                          |
-|                           unsigned int         *data)                          |
-+----------------------------------------------------------------------------+
-| Task           Converting mode and convert time selection                  |
-+----------------------------------------------------------------------------+
-| Input Parameters  : b_SingleDiff  = (unsigned char)  data[1];                       |
-|                     b_TimeBase    = (unsigned char)  data[2]; (0: ns, 1:micros 2:ms)|
-|                    dw_ReloadValue = (unsigned int) data[3];                       |
-|                     ........                                               |
-+----------------------------------------------------------------------------+
-| Output Parameters : -                                                      |
-+----------------------------------------------------------------------------+
-| Return Value      :>0 : No error                                           |
-|                    -1 : Single/Diff selection error                        |
-|                    -2 : Convert time base unity selection error            |
-|                    -3 : Convert time value selection error                 |
-|                    -10: Any conversion started                             |
-|                    ....                                                    |
-|                    -100 : Config command error                             |
-|                    -101 : Data size error                                  |
-+----------------------------------------------------------------------------+
-*/
-static int i_APCI3XXX_AnalogInputConfigOperatingMode(struct comedi_device *dev,
-						     struct comedi_subdevice *s,
-						     struct comedi_insn *insn,
-						     unsigned int *data)
-{
-	const struct addi_board *this_board = comedi_board(dev);
-	struct addi_private *devpriv = dev->private;
-	int i_ReturnValue = insn->n;
-	unsigned char b_TimeBase = 0;
-	unsigned char b_SingleDiff = 0;
-	unsigned int dw_ReloadValue = 0;
-	unsigned int dw_TestReloadValue = 0;
-
-	/************************/
-	/* Test the buffer size */
-	/************************/
-
-	if (insn->n == 4) {
-	   /****************************/
-		/* Get the Singel/Diff flag */
-	   /****************************/
-
-		b_SingleDiff = (unsigned char) data[1];
-
-	   /****************************/
-		/* Get the time base unitiy */
-	   /****************************/
-
-		b_TimeBase = (unsigned char) data[2];
-
-	   /*************************************/
-		/* Get the convert time reload value */
-	   /*************************************/
-
-		dw_ReloadValue = (unsigned int) data[3];
-
-	   /**********************/
-		/* Test the time base */
-	   /**********************/
-
-		if ((this_board->b_AvailableConvertUnit & (1 << b_TimeBase)) !=
-			0) {
-	      /*******************************/
-			/* Test the convert time value */
-	      /*******************************/
-
-			if (dw_ReloadValue <= 65535) {
-				dw_TestReloadValue = dw_ReloadValue;
-
-				if (b_TimeBase == 1) {
-					dw_TestReloadValue =
-						dw_TestReloadValue * 1000UL;
-				}
-				if (b_TimeBase == 2) {
-					dw_TestReloadValue =
-						dw_TestReloadValue * 1000000UL;
-				}
-
-		 /*******************************/
-				/* Test the convert time value */
-		 /*******************************/
-
-				if (dw_TestReloadValue >=
-					devpriv->s_EeParameters.
-					ui_MinAcquisitiontimeNs) {
-					if ((b_SingleDiff == APCI3XXX_SINGLE)
-						|| (b_SingleDiff ==
-							APCI3XXX_DIFF)) {
-						if (((b_SingleDiff == APCI3XXX_SINGLE)
-						        && (devpriv->s_EeParameters.i_NbrAiChannel == 0))
-						    || ((b_SingleDiff == APCI3XXX_DIFF)
-							&& (this_board->i_NbrAiChannelDiff == 0))
-						    ) {
-			   /*******************************/
-							/* Single/Diff selection error */
-			   /*******************************/
-
-							printk("Single/Diff selection error\n");
-							i_ReturnValue = -1;
-						} else {
-			   /**********************************/
-							/* Test if conversion not started */
-			   /**********************************/
-
-							if (i_APCI3XXX_TestConversionStarted(dev) == 0) {
-								devpriv->
-									ui_EocEosConversionTime
-									=
-									(unsigned int)
-									dw_ReloadValue;
-								devpriv->
-									b_EocEosConversionTimeBase
-									=
-									b_TimeBase;
-								devpriv->
-									b_SingelDiff
-									=
-									b_SingleDiff;
-								devpriv->
-									b_AiInitialisation
-									= 1;
-
-			      /*******************************/
-								/* Set the convert timing unit */
-			      /*******************************/
-
-								writel((unsigned int)b_TimeBase,
-									devpriv->dw_AiBase + 36);
-
-			      /**************************/
-								/* Set the convert timing */
-			      /*************************/
-
-								writel(dw_ReloadValue, devpriv->dw_AiBase + 32);
-							} else {
-			      /**************************/
-								/* Any conversion started */
-			      /**************************/
-
-								printk("Any conversion started\n");
-								i_ReturnValue =
-									-10;
-							}
-						}
-					} else {
-		       /*******************************/
-						/* Single/Diff selection error */
-		       /*******************************/
-
-						printk("Single/Diff selection error\n");
-						i_ReturnValue = -1;
-					}
-				} else {
-		    /************************/
-					/* Time selection error */
-		    /************************/
-
-					printk("Convert time value selection error\n");
-					i_ReturnValue = -3;
-				}
-			} else {
-		 /************************/
-				/* Time selection error */
-		 /************************/
-
-				printk("Convert time value selection error\n");
-				i_ReturnValue = -3;
-			}
-		} else {
-	      /*****************************/
-			/* Time base selection error */
-	      /*****************************/
-
-			printk("Convert time base unity selection error\n");
-			i_ReturnValue = -2;
-		}
-	} else {
-	   /*******************/
-		/* Data size error */
-	   /*******************/
-
-		printk("Buffer size error\n");
-		i_ReturnValue = -101;
-	}
-
-	return i_ReturnValue;
-}
-
-/*
-+----------------------------------------------------------------------------+
-| Function Name     : int   i_APCI3XXX_InsnConfigAnalogInput                 |
-|                          (struct comedi_device    *dev,                           |
-|                           struct comedi_subdevice *s,                             |
-|                           struct comedi_insn      *insn,                          |
-|                           unsigned int         *data)                          |
-+----------------------------------------------------------------------------+
-| Task           Converting mode and convert time selection                  |
-+----------------------------------------------------------------------------+
-| Input Parameters  : b_ConvertMode = (unsigned char)  data[0];                       |
-|                     b_TimeBase    = (unsigned char)  data[1]; (0: ns, 1:micros 2:ms)|
-|                    dw_ReloadValue = (unsigned int) data[2];                       |
-|                     ........                                               |
-+----------------------------------------------------------------------------+
-| Output Parameters : -                                                      |
-+----------------------------------------------------------------------------+
-| Return Value      :>0: No error                                            |
-|                    ....                                                    |
-|                    -100 : Config command error                             |
-|                    -101 : Data size error                                  |
-+----------------------------------------------------------------------------+
-*/
-static int i_APCI3XXX_InsnConfigAnalogInput(struct comedi_device *dev,
-					    struct comedi_subdevice *s,
-					    struct comedi_insn *insn,
-					    unsigned int *data)
-{
-	int i_ReturnValue = insn->n;
-
-	/************************/
-	/* Test the buffer size */
-	/************************/
-
-	if (insn->n >= 1) {
-		switch ((unsigned char) data[0]) {
-		case APCI3XXX_CONFIGURATION:
-			i_ReturnValue =
-				i_APCI3XXX_AnalogInputConfigOperatingMode(dev,
-				s, insn, data);
-			break;
-
-		default:
-			i_ReturnValue = -100;
-			printk("Config command error %d\n", data[0]);
-			break;
-		}
-	} else {
-	   /*******************/
-		/* Data size error */
-	   /*******************/
-
-		printk("Buffer size error\n");
-		i_ReturnValue = -101;
-	}
-
-	return i_ReturnValue;
-}
-
-/*
-+----------------------------------------------------------------------------+
-| Function Name     : int   i_APCI3XXX_InsnReadAnalogInput                   |
-|                          (struct comedi_device    *dev,                           |
-|                           struct comedi_subdevice *s,                             |
-|                           struct comedi_insn      *insn,                          |
-|                           unsigned int         *data)                          |
-+----------------------------------------------------------------------------+
-| Task                Read 1 analog input                                    |
-+----------------------------------------------------------------------------+
-| Input Parameters  : b_Range             = CR_RANGE(insn->chanspec);        |
-|                     b_Channel           = CR_CHAN(insn->chanspec);         |
-|                     dw_NbrOfAcquisition = insn->n;                         |
-+----------------------------------------------------------------------------+
-| Output Parameters : -                                                      |
-+----------------------------------------------------------------------------+
-| Return Value      :>0: No error                                            |
-|                    -3 : Channel selection error                            |
-|                    -4 : Configuration selelection error                    |
-|                    -10: Any conversion started                             |
-|                    ....                                                    |
-|                    -100 : Config command error                             |
-|                    -101 : Data size error                                  |
-+----------------------------------------------------------------------------+
-*/
-static int i_APCI3XXX_InsnReadAnalogInput(struct comedi_device *dev,
-					  struct comedi_subdevice *s,
-					  struct comedi_insn *insn,
-					  unsigned int *data)
-{
-	const struct addi_board *this_board = comedi_board(dev);
-	struct addi_private *devpriv = dev->private;
-	int i_ReturnValue = insn->n;
-	unsigned char b_Configuration = (unsigned char) CR_RANGE(insn->chanspec);
-	unsigned char b_Channel = (unsigned char) CR_CHAN(insn->chanspec);
-	unsigned int dw_Temp = 0;
-	unsigned int dw_Configuration = 0;
-	unsigned int dw_AcquisitionCpt = 0;
-	unsigned char b_Interrupt = 0;
-
-	/*************************************/
-	/* Test if operating mode configured */
-	/*************************************/
-
-	if (devpriv->b_AiInitialisation) {
-	   /***************************/
-		/* Test the channel number */
-	   /***************************/
-
-		if (((b_Channel < devpriv->s_EeParameters.i_NbrAiChannel)
-				&& (devpriv->b_SingelDiff == APCI3XXX_SINGLE))
-			|| ((b_Channel < this_board->i_NbrAiChannelDiff)
-				&& (devpriv->b_SingelDiff == APCI3XXX_DIFF))) {
-	      /**********************************/
-			/* Test the channel configuration */
-	      /**********************************/
-
-			if (b_Configuration > 7) {
-		 /***************************/
-				/* Channel not initialised */
-		 /***************************/
-
-				i_ReturnValue = -4;
-				printk("Channel %d range %d selection error\n",
-					b_Channel, b_Configuration);
-			}
-		} else {
-	      /***************************/
-			/* Channel selection error */
-	      /***************************/
-
-			i_ReturnValue = -3;
-			printk("Channel %d selection error\n", b_Channel);
-		}
-
-	   /**************************/
-		/* Test if no error occur */
-	   /**************************/
-
-		if (i_ReturnValue >= 0) {
-	      /************************/
-			/* Test the buffer size */
-	      /************************/
-
-			if ((b_Interrupt != 0) || ((b_Interrupt == 0)
-					&& (insn->n >= 1))) {
-		 /**********************************/
-				/* Test if conversion not started */
-		 /**********************************/
-
-				if (i_APCI3XXX_TestConversionStarted(dev) == 0) {
-		    /******************/
-					/* Clear the FIFO */
-		    /******************/
-
-					writel(0x10000UL, devpriv->dw_AiBase + 12);
-
-		    /*******************************/
-					/* Get and save the delay mode */
-		    /*******************************/
-
-					dw_Temp = readl(devpriv->dw_AiBase + 4);
-					dw_Temp = dw_Temp & 0xFFFFFEF0UL;
-
-		    /***********************************/
-					/* Channel configuration selection */
-		    /***********************************/
-
-					writel(dw_Temp, devpriv->dw_AiBase + 4);
-
-		    /**************************/
-					/* Make the configuration */
-		    /**************************/
-
-					dw_Configuration =
-						(b_Configuration & 3) |
-						((unsigned int) (b_Configuration >> 2)
-						<< 6) | ((unsigned int) devpriv->
-						b_SingelDiff << 7);
-
-		    /***************************/
-					/* Write the configuration */
-		    /***************************/
-
-					writel(dw_Configuration,
-					       devpriv->dw_AiBase + 0);
-
-		    /*********************/
-					/* Channel selection */
-		    /*********************/
-
-					writel(dw_Temp | 0x100UL,
-					       devpriv->dw_AiBase + 4);
-					writel((unsigned int) b_Channel,
-					       devpriv->dw_AiBase + 0);
-
-		    /***********************/
-					/* Restaure delay mode */
-		    /***********************/
-
-					writel(dw_Temp, devpriv->dw_AiBase + 4);
-
-		    /***********************************/
-					/* Set the number of sequence to 1 */
-		    /***********************************/
-
-					writel(1, devpriv->dw_AiBase + 48);
-
-		    /***************************/
-					/* Save the interrupt flag */
-		    /***************************/
-
-					devpriv->b_EocEosInterrupt =
-						b_Interrupt;
-
-		    /*******************************/
-					/* Save the number of channels */
-		    /*******************************/
-
-					devpriv->ui_AiNbrofChannels = 1;
-
-		    /******************************/
-					/* Test if interrupt not used */
-		    /******************************/
-
-					if (b_Interrupt == 0) {
-						for (dw_AcquisitionCpt = 0;
-							dw_AcquisitionCpt <
-							insn->n;
-							dw_AcquisitionCpt++) {
-			  /************************/
-							/* Start the conversion */
-			  /************************/
-
-							writel(0x80000UL, devpriv->dw_AiBase + 8);
-
-			  /****************/
-							/* Wait the EOS */
-			  /****************/
-
-							do {
-								dw_Temp = readl(devpriv->dw_AiBase + 20);
-								dw_Temp = dw_Temp & 1;
-							} while (dw_Temp != 1);
-
-			  /*************************/
-							/* Read the analog value */
-			  /*************************/
-
-							data[dw_AcquisitionCpt] = (unsigned int)readl(devpriv->dw_AiBase + 28);
-						}
-					} else {
-		       /************************/
-						/* Start the conversion */
-		       /************************/
-
-						writel(0x180000UL, devpriv->dw_AiBase + 8);
-					}
-				} else {
-		    /**************************/
-					/* Any conversion started */
-		    /**************************/
-
-					printk("Any conversion started\n");
-					i_ReturnValue = -10;
-				}
-			} else {
-		 /*******************/
-				/* Data size error */
-		 /*******************/
-
-				printk("Buffer size error\n");
-				i_ReturnValue = -101;
-			}
-		}
-	} else {
-	   /***************************/
-		/* Channel selection error */
-	   /***************************/
-
-		printk("Operating mode not configured\n");
-		i_ReturnValue = -1;
-	}
-	return i_ReturnValue;
-}
-
-/*
-+----------------------------------------------------------------------------+
-| Function name     : void v_APCI3XXX_Interrupt (int            irq,         |
-|                                                void           *d)       |
-+----------------------------------------------------------------------------+
-| Task              :Interrupt handler for APCI3XXX                          |
-|                    When interrupt occurs this gets called.                 |
-|                    First it finds which interrupt has been generated and   |
-|                    handles  corresponding interrupt                        |
-+----------------------------------------------------------------------------+
-| Input Parameters  : -                                                      |
-+----------------------------------------------------------------------------+
-| Return Value      : -                                                      |
-+----------------------------------------------------------------------------+
-*/
-
-static void v_APCI3XXX_Interrupt(int irq, void *d)
-{
-	struct comedi_device *dev = d;
-	struct addi_private *devpriv = dev->private;
-	unsigned char b_CopyCpt = 0;
-	unsigned int dw_Status = 0;
-
-	/***************************/
-	/* Test if interrupt occur */
-	/***************************/
-
-	dw_Status = readl(devpriv->dw_AiBase + 16);
-	if ( (dw_Status & 0x2UL) == 0x2UL) {
-	   /***********************/
-		/* Reset the interrupt */
-	   /***********************/
-
-		writel(dw_Status, devpriv->dw_AiBase + 16);
-
-	   /*****************************/
-		/* Test if interrupt enabled */
-	   /*****************************/
-
-		if (devpriv->b_EocEosInterrupt == 1) {
-	      /********************************/
-			/* Read all analog inputs value */
-	      /********************************/
-
-			for (b_CopyCpt = 0;
-				b_CopyCpt < devpriv->ui_AiNbrofChannels;
-				b_CopyCpt++) {
-				devpriv->ui_AiReadData[b_CopyCpt] =
-					(unsigned int)readl(devpriv->dw_AiBase + 28);
-			}
-
-	      /**************************/
-			/* Set the interrupt flag */
-	      /**************************/
-
-			devpriv->b_EocEosInterrupt = 2;
-
-	      /**********************************************/
-			/* Send a signal to from kernel to user space */
-	      /**********************************************/
-
-			send_sig(SIGIO, devpriv->tsk_Current, 0);
-		}
-	}
-}
-
-/*
-+----------------------------------------------------------------------------+
-|                            ANALOG OUTPUT SUBDEVICE                         |
-+----------------------------------------------------------------------------+
-*/
-
-/*
-+----------------------------------------------------------------------------+
-| Function Name     : int   i_APCI3XXX_InsnWriteAnalogOutput                 |
-|                          (struct comedi_device    *dev,                           |
-|                           struct comedi_subdevice *s,                             |
-|                           struct comedi_insn      *insn,                          |
-|                           unsigned int         *data)                          |
-+----------------------------------------------------------------------------+
-| Task                Read 1 analog input                                    |
-+----------------------------------------------------------------------------+
-| Input Parameters  : b_Range    = CR_RANGE(insn->chanspec);                 |
-|                     b_Channel  = CR_CHAN(insn->chanspec);                  |
-|                     data[0]    = analog value;                             |
-+----------------------------------------------------------------------------+
-| Output Parameters : -                                                      |
-+----------------------------------------------------------------------------+
-| Return Value      :>0: No error                                            |
-|                    -3 : Channel selection error                            |
-|                    -4 : Configuration selelection error                    |
-|                    ....                                                    |
-|                    -101 : Data size error                                  |
-+----------------------------------------------------------------------------+
-*/
-static int i_APCI3XXX_InsnWriteAnalogOutput(struct comedi_device *dev,
-					    struct comedi_subdevice *s,
-					    struct comedi_insn *insn,
-					    unsigned int *data)
-{
-	struct addi_private *devpriv = dev->private;
-	unsigned char range = (unsigned char) CR_RANGE(insn->chanspec);
-	unsigned char chan = (unsigned char) CR_CHAN(insn->chanspec);
-
-	/* At least one data word is required; only data[0] is written. */
-	if (insn->n < 1) {
-		printk("Buffer size error\n");
-		return -101;
-	}
-
-	/* The channel must be below the board's analog output count. */
-	if (chan >= devpriv->s_EeParameters.i_NbrAoChannel) {
-		printk("Channel %d selection error\n", chan);
-		return -3;
-	}
-
-	/* Only range selections 0 and 1 are accepted. */
-	if (range >= 2) {
-		printk("Channel %d range %d selection error\n",
-			chan, range);
-		return -4;
-	}
-
-	/* Set the range selection. */
-	writel(range, devpriv->dw_AiBase + 96);
-
-	/* Write the analog value to the selected channel. */
-	writel((data[0] << 8) | chan, devpriv->dw_AiBase + 100);
-
-	/*
-	 * Wait for the end of the transfer: poll the register at offset 96
-	 * until bit 0x100 is set.  There is no timeout.
-	 */
-	while (!(readl(devpriv->dw_AiBase + 96) & 0x100))
-		;
-
-	return insn->n;
-}
-
-/*
-+----------------------------------------------------------------------------+
-|                              TTL FUNCTIONS                                 |
-+----------------------------------------------------------------------------+
-*/
-
-/*
- * i_APCI3XXX_InsnConfigInitTTLIO() - configure the direction of TTL port 2
- *
- * This must be called before any other function that accesses the TTL I/O.
- *
- * Input:
- *   data[0] - command; only APCI3XXX_TTL_INIT_DIRECTION_PORT2 is accepted
- *   data[1] - port 2 direction: 0 (treated as input by the TTL read/write
- *             functions) or 0xFF (treated as output)
- *
- * insn->n must be exactly 2.
- *
- * Return: insn->n on success,
- *         -1   if the port 2 direction is neither 0 nor 0xFF,
- *         -100 if the command is not recognised,
- *         -101 if the number of data words is wrong.
- */
-static int i_APCI3XXX_InsnConfigInitTTLIO(struct comedi_device *dev,
-					  struct comedi_subdevice *s,
-					  struct comedi_insn *insn,
-					  unsigned int *data)
-{
-	struct addi_private *devpriv = dev->private;
-	unsigned char cmd;
-
-	/* The command word itself must be present. */
-	if (insn->n < 1) {
-		printk("Buffer size error\n");
-		return -101;
-	}
-
-	cmd = (unsigned char) data[0];
-	if (cmd != APCI3XXX_TTL_INIT_DIRECTION_PORT2) {
-		printk("Command selection error\n");
-		return -100;
-	}
-
-	/* This command takes exactly one argument after the command word. */
-	if (insn->n != 2) {
-		printk("Buffer size error\n");
-		return -101;
-	}
-
-	/* The whole port is switched at once: all inputs or all outputs. */
-	if (data[1] != 0 && data[1] != 0xFF) {
-		printk("Port 2 direction selection error\n");
-		return -1;
-	}
-
-	/*
-	 * Save the configuration.  The low byte is replaced, not OR-ed in:
-	 * OR-ing made it impossible to switch port 2 back to input (0) once
-	 * it had been set to output (0xFF), leaving the saved state out of
-	 * step with the value written to the hardware below.  Bits above the
-	 * low byte are left as they were.
-	 */
-	devpriv->ul_TTLPortConfiguration[0] =
-		(devpriv->ul_TTLPortConfiguration[0] & ~0xFFUL) | data[1];
-
-	/* Set the configuration in the hardware. */
-	outl(data[1], devpriv->iobase + 224);
-
-	return insn->n;
-}
-
-/*
-+----------------------------------------------------------------------------+
-|                        TTL INPUT FUNCTIONS                                 |
-+----------------------------------------------------------------------------+
-*/
-
-/*
- * i_APCI3XXX_InsnBitsTTLIO() - update selected TTL outputs and read back
- *				 the TTL ports
- *
- * Input:
- *   data[0] - channel mask: bits 0-7 select port 0 channels, bits 16-23
- *             select port 2 channels
- *   data[1] - new states for the selected channels
- *
- * The mask is rejected if any of bits 8-15 or 24-31 is set, if port 2
- * channels are selected while the saved port 2 configuration is 0, or if
- * the saved port 2 configuration is neither 0 nor 0xFF.
- *
- * Output:
- *   data[1] - register at offset 80 (port 0), OR-ed with the register at
- *             offset 64 (port 1) shifted left by 8 and the port 2 register
- *             (offset 96 when configured as input, else 112) shifted left
- *             by 16
- *
- * Return: insn->n on success,
- *         -4   on a channel mask error,
- *         -101 if fewer than two data words were supplied.
- */
-static int i_APCI3XXX_InsnBitsTTLIO(struct comedi_device *dev,
-				    struct comedi_subdevice *s,
-				    struct comedi_insn *insn,
-				    unsigned int *data)
-{
-	struct addi_private *devpriv = dev->private;
-	unsigned int port2_cfg = devpriv->ul_TTLPortConfiguration[0] & 0xFF;
-	unsigned int mask;
-	unsigned int bits;
-	unsigned int sel;
-	unsigned int val;
-
-	if (insn->n < 2) {
-		printk("Buffer size error\n");
-		return -101;
-	}
-
-	mask = data[0];
-	bits = data[1];
-
-	if ((mask & 0XFF00FF00) != 0 ||
-	    (port2_cfg != 0xFF &&
-	     (port2_cfg != 0 || (mask & 0XFF0000) != 0))) {
-		printk("Channel mask error\n");
-		return -4;
-	}
-
-	/* Port 0: replace the selected bits, keep the others (low byte). */
-	sel = mask & 0xFF;
-	if (sel) {
-		val = inl(devpriv->iobase + 80);
-		val = (val & 0xFF & ~sel) | (bits & sel);
-		outl(val, devpriv->iobase + 80);
-	}
-
-	/* Port 2: same update, using bits 16-23 of mask and states. */
-	sel = (mask >> 16) & 0xFF;
-	if (sel) {
-		val = inl(devpriv->iobase + 112);
-		val = (val & 0xFF & ~sel) | ((bits >> 16) & sel);
-		outl(val, devpriv->iobase + 112);
-	}
-
-	/* Read back port 0, then port 1, then port 2. */
-	val = inl(devpriv->iobase + 80);
-	val |= inl(devpriv->iobase + 64) << 8;
-	if (port2_cfg == 0)
-		val |= inl(devpriv->iobase + 96) << 16;
-	else
-		val |= inl(devpriv->iobase + 112) << 16;
-	data[1] = val;
-
-	return insn->n;
-}
-
-/*
- * i_APCI3XXX_InsnReadTTLIO() - read the state of one TTL channel
- *
- * Input:
- *   CR_CHAN(insn->chanspec) - channel number, 0-23
- *
- * Output:
- *   data[0] - state (0 or 1) of the selected channel
- *
- * Channels 0-7 are read from the register at offset 80, channels 8-15 from
- * offset 64, and channels 16-23 from offset 96 when the saved port 2
- * configuration is 0 (input) or from offset 112 otherwise.
- *
- * Return: insn->n on success,
- *         -3   on a channel selection error,
- *         -101 if no data word was supplied.
- */
-static int i_APCI3XXX_InsnReadTTLIO(struct comedi_device *dev,
-				    struct comedi_subdevice *s,
-				    struct comedi_insn *insn,
-				    unsigned int *data)
-{
-	struct addi_private *devpriv = dev->private;
-	unsigned char chan = (unsigned char) CR_CHAN(insn->chanspec);
-	int reg;
-
-	if (insn->n < 1) {
-		printk("Buffer size error\n");
-		return -101;
-	}
-
-	/* Eight channels per port: chan >> 3 is the port index. */
-	switch (chan >> 3) {
-	case 0:
-		reg = 80;
-		break;
-	case 1:
-		reg = 64;
-		break;
-	case 2:
-		if ((devpriv->ul_TTLPortConfiguration[0] & 0xFF) == 0)
-			reg = 96;
-		else
-			reg = 112;
-		break;
-	default:
-		printk("Channel %d selection error\n", chan);
-		return -3;
-	}
-
-	/* chan & 7 is the bit position of the channel inside its port. */
-	data[0] = (inl(devpriv->iobase + reg) >> (chan & 7)) & 1;
-
-	return insn->n;
-}
-
-/*
-+----------------------------------------------------------------------------+
-|                        TTL OUTPUT FUNCTIONS                                |
-+----------------------------------------------------------------------------+
-*/
-
-/*
- * i_APCI3XXX_InsnWriteTTLIO() - set the state of one TTL output channel
- *
- * Input:
- *   CR_CHAN(insn->chanspec) - channel number
- *   data[0]                 - new state; only bit 0 is used
- *
- * Channels 0-7 (port 0) are always writable.  Channels 16-23 (port 2) are
- * writable only while the saved port 2 configuration is 0xFF (output).
- * Every other channel is rejected.
- *
- * Return: insn->n on success,
- *         -3   on a channel selection error,
- *         -101 if no data word was supplied.
- */
-static int i_APCI3XXX_InsnWriteTTLIO(struct comedi_device *dev,
-				     struct comedi_subdevice *s,
-				     struct comedi_insn *insn,
-				     unsigned int *data)
-{
-	struct addi_private *devpriv = dev->private;
-	unsigned char chan = (unsigned char) CR_CHAN(insn->chanspec);
-	unsigned int val;
-	int reg;
-	int bit;
-
-	if (insn->n < 1) {
-		printk("Buffer size error\n");
-		return -101;
-	}
-
-	if (chan < 8) {
-		/* Port 0 */
-		reg = 80;
-		bit = chan;
-	} else if (chan > 15 && chan < 24 &&
-		   (devpriv->ul_TTLPortConfiguration[0] & 0xFF) == 0xFF) {
-		/* Port 2, configured as output */
-		reg = 112;
-		bit = chan - 16;
-	} else {
-		printk("Channel %d selection error\n", chan);
-		return -3;
-	}
-
-	/* Read the port, replace the selected bit, write the port back. */
-	val = inl(devpriv->iobase + reg);
-	val = (val & (0xFF & ~(1 << bit))) | ((data[0] & 1) << bit);
-	outl(val, devpriv->iobase + reg);
-
-	return insn->n;
-}
-
-/*
- * Read the digital inputs: data[1] receives the low four bits of the
- * register at offset 32.  Returns insn->n.
- */
-static int apci3xxx_di_insn_bits(struct comedi_device *dev,
-				 struct comedi_subdevice *s,
-				 struct comedi_insn *insn,
-				 unsigned int *data)
-{
-	struct addi_private *devpriv = dev->private;
-	unsigned int inputs = inl(devpriv->iobase + 32);
-
-	data[1] = inputs & 0xf;
-
-	return insn->n;
-}
-
-/*
- * Update and read back the digital outputs.
- *
- * data[0] is the mask of outputs to change and data[1] their new states.
- * The current state is taken from the low four bits of the register at
- * offset 48; the register is written only when the mask is non-zero.
- * On return data[1] holds the resulting state.  Returns insn->n.
- */
-static int apci3xxx_do_insn_bits(struct comedi_device *dev,
-				 struct comedi_subdevice *s,
-				 struct comedi_insn *insn,
-				 unsigned int *data)
-{
-	struct addi_private *devpriv = dev->private;
-	unsigned int sel = data[0];
-	unsigned int cur = inl(devpriv->iobase + 48) & 0xf;
-
-	if (sel) {
-		cur = (cur & ~sel) | (data[1] & sel);
-		s->state = cur;
-		outl(s->state, devpriv->iobase + 48);
-	} else {
-		s->state = cur;
-	}
-
-	data[1] = s->state;
-
-	return insn->n;
-}
-
-/*
- * i_APCI3XXX_Reset() - reset the board registers
- *
- * With the interrupt line disabled, clears the saved EOC/EOS interrupt
- * flag, clears the start command, resets the interrupt flags, clears the
- * EOS and empties the FIFO.
- *
- * Input:  dev - the comedi device
- * Return: always 0
- */
-static int i_APCI3XXX_Reset(struct comedi_device *dev)
-{
-	struct addi_private *devpriv = dev->private;
-	unsigned int i;
-
-	/* Disable the interrupt while the registers are being reset. */
-	disable_irq(dev->irq);
-
-	/* Reset the interrupt flag kept in the private data. */
-	devpriv->b_EocEosInterrupt = 0;
-
-	/* Clear the start command. */
-	writel(0, devpriv->dw_AiBase + 8);
-
-	/* Reset the interrupt flags by writing back the value just read. */
-	writel(readl(devpriv->dw_AiBase + 16), devpriv->dw_AiBase + 16);
-
-	/* Clear the EOS; the value read is discarded. */
-	readl(devpriv->dw_AiBase + 20);
-
-	/* Clear the FIFO with 16 discarded reads. */
-	for (i = 0; i < 16; i++)
-		readl(devpriv->dw_AiBase + 28);
-
-	/* Enable the interrupt again. */
-	enable_irq(dev->irq);
-
-	return 0;
-}

diff --git a/drivers/staging/comedi/drivers/addi_apci_1032.c b/drivers/staging/comedi/drivers/addi_apci_1032.c
index 3d4878f..8a93542 100644
--- a/drivers/staging/comedi/drivers/addi_apci_1032.c
+++ b/drivers/staging/comedi/drivers/addi_apci_1032.c

@@ -20,13 +20,6 @@
  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
  * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place, Suite 330, Boston, MA 02111-1307 USA
- *
- * You should also find the complete GPL in the COPYING file accompanying this
- * source code.
  */
 
 #include <linux/pci.h>

diff --git a/drivers/staging/comedi/drivers/addi_apci_1516.c b/drivers/staging/comedi/drivers/addi_apci_1516.c
index ed01c56..b626738 100644
--- a/drivers/staging/comedi/drivers/addi_apci_1516.c
+++ b/drivers/staging/comedi/drivers/addi_apci_1516.c

@@ -20,13 +20,6 @@
  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
  * more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- * You should also find the complete GPL in the COPYING file accompanying
- * this source code.
  */
 
 #include <linux/pci.h>
@@ -203,7 +196,6 @@
 {
 	if (dev->iobase)
 		apci1516_reset(dev);
-	comedi_spriv_free(dev, 2);
 	comedi_pci_disable(dev);
 }
 

diff --git a/drivers/staging/comedi/drivers/addi_apci_16xx.c b/drivers/staging/comedi/drivers/addi_apci_16xx.c
index 4c6a9b5..1f7bed9 100644
--- a/drivers/staging/comedi/drivers/addi_apci_16xx.c
+++ b/drivers/staging/comedi/drivers/addi_apci_16xx.c

@@ -20,13 +20,6 @@
  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
  * more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- * You should also find the complete GPL in the COPYING file accompanying
- * this source code.
  */
 
 #include <linux/pci.h>

diff --git a/drivers/staging/comedi/drivers/addi_apci_2032.c b/drivers/staging/comedi/drivers/addi_apci_2032.c
index b666637..89ead8e 100644
--- a/drivers/staging/comedi/drivers/addi_apci_2032.c
+++ b/drivers/staging/comedi/drivers/addi_apci_2032.c

@@ -20,13 +20,6 @@
  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
  * more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- * You should also find the complete GPL in the COPYING file accompanying
- * this source code.
  */
 
 #include <linux/pci.h>
@@ -354,7 +347,6 @@
 		free_irq(dev->irq, dev);
 	if (dev->read_subdev)
 		kfree(dev->read_subdev->private);
-	comedi_spriv_free(dev, 1);
 	comedi_pci_disable(dev);
 }
 

diff --git a/drivers/staging/comedi/drivers/addi_apci_2200.c b/drivers/staging/comedi/drivers/addi_apci_2200.c
index 1cdc08d..ca1bd92 100644
--- a/drivers/staging/comedi/drivers/addi_apci_2200.c
+++ b/drivers/staging/comedi/drivers/addi_apci_2200.c

@@ -20,13 +20,6 @@
  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
  * more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- * You should also find the complete GPL in the COPYING file accompanying
- * this source code.
  */
 
 #include <linux/pci.h>
@@ -130,7 +123,6 @@
 {
 	if (dev->iobase)
 		apci2200_reset(dev);
-	comedi_spriv_free(dev, 2);
 	comedi_pci_disable(dev);
 }
 

diff --git a/drivers/staging/comedi/drivers/addi_apci_3120.c b/drivers/staging/comedi/drivers/addi_apci_3120.c
index 317a26d..6145284 100644
--- a/drivers/staging/comedi/drivers/addi_apci_3120.c
+++ b/drivers/staging/comedi/drivers/addi_apci_3120.c

@@ -103,8 +103,6 @@
 		if (devpriv->ul_DmaBufferVirtual[i]) {
 			devpriv->ui_DmaBufferPages[i] = pages;
 			devpriv->ui_DmaBufferSize[i] = PAGE_SIZE * pages;
-			devpriv->ui_DmaBufferSamples[i] =
-				devpriv->ui_DmaBufferSize[i] >> 1;
 			devpriv->ul_DmaBufferHw[i] =
 				virt_to_bus((void *)devpriv->
 				ul_DmaBufferVirtual[i]);
@@ -138,9 +136,6 @@
 	s->len_chanlist = this_board->i_AiChannelList;
 	s->range_table = &range_apci3120_ai;
 
-	/* Set the initialisation flag */
-	devpriv->b_AiInitialisation = 1;
-
 	s->insn_config = i_APCI3120_InsnConfigAnalogInput;
 	s->insn_read = i_APCI3120_InsnReadAnalogInput;
 	s->do_cmdtest = i_APCI3120_CommandTestAnalogInput;

diff --git a/drivers/staging/comedi/drivers/addi_apci_3501.c b/drivers/staging/comedi/drivers/addi_apci_3501.c
index a0cf6ec..f9b6368 100644
--- a/drivers/staging/comedi/drivers/addi_apci_3501.c
+++ b/drivers/staging/comedi/drivers/addi_apci_3501.c

@@ -20,13 +20,6 @@
  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
  * more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- * You should also find the complete GPL in the COPYING file accompanying
- * this source code.
  */
 
 #include <linux/pci.h>

diff --git a/drivers/staging/comedi/drivers/addi_apci_3xxx.c b/drivers/staging/comedi/drivers/addi_apci_3xxx.c
index ec4d6ca..5b37cbf 100644
--- a/drivers/staging/comedi/drivers/addi_apci_3xxx.c
+++ b/drivers/staging/comedi/drivers/addi_apci_3xxx.c

@@ -1,14 +1,57 @@
+/*
+ * addi_apci_3xxx.c
+ * Copyright (C) 2004,2005  ADDI-DATA GmbH for the source code of this module.
+ * Project manager: S. Weber
+ *
+ *	ADDI-DATA GmbH
+ *	Dieselstrasse 3
+ *	D-77833 Ottersweier
+ *	Tel: +49(0)7223/9493-0
+ *	Fax: +49(0)7223/9493-92
+ *	http://www.addi-data.com
+ *	info@addi-data.com
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
 #include <linux/pci.h>
+#include <linux/interrupt.h>
 
 #include "../comedidev.h"
+
 #include "comedi_fc.h"
-#include "amcc_s5933.h"
 
-#include "addi-data/addi_common.h"
+#define CONV_UNIT_NS		(1 << 0)	/* bit flags OR-ed into apci3xxx_boardinfo.ai_conv_units */
+#define CONV_UNIT_US		(1 << 1)	/* NOTE(review): NS/US/MS presumably name time units - confirm */
+#define CONV_UNIT_MS		(1 << 2)	/* MS | US == 6, the old b_AvailableConvertUnit value */
 
-#include "addi-data/addi_eeprom.c"
-#include "addi-data/hwdrv_apci3xxx.c"
-#include "addi-data/addi_common.c"
+static const struct comedi_lrange apci3xxx_ai_range = {	/* range table; by its name, for analog input */
+	8, {	/* number of entries that follow */
+		BIP_RANGE(10),	/* NOTE(review): BIP_/UNI_ presumably bipolar/unipolar - confirm in comedidev.h */
+		BIP_RANGE(5),
+		BIP_RANGE(2),
+		BIP_RANGE(1),
+		UNI_RANGE(10),
+		UNI_RANGE(5),
+		UNI_RANGE(2),
+		UNI_RANGE(1)
+	}
+};
+
+static const struct comedi_lrange apci3xxx_ao_range = {	/* range table; by its name, for analog output */
+	2, {	/* number of entries that follow */
+		BIP_RANGE(10),	/* index 0 */
+		UNI_RANGE(10)	/* index 1 */
+	}
+};
 
 enum apci3xxx_boardid {
 	BOARD_APCI3000_16,
@@ -38,651 +81,853 @@
 	BOARD_APCI3500,
 };
 
-static const struct addi_board apci3xxx_boardtypes[] = {
+struct apci3xxx_boardinfo {
+	const char *name;		/* board name, e.g. "apci3000-16" */
+	int ai_subdev_flags;		/* SDF_* flags for the analog input */
+	int ai_n_chan;			/* number of analog input channels */
+	unsigned int ai_maxdata;	/* largest analog input sample value */
+	unsigned char ai_conv_units;	/* mask of CONV_UNIT_* time bases */
+	unsigned int ai_min_acq_ns;	/* smallest accepted convert_arg (ns) */
+	unsigned int has_ao:1;		/* set on boards with analog outputs */
+	unsigned int has_dig_in:1;	/* set on boards with digital inputs */
+	unsigned int has_dig_out:1;	/* set on boards with digital outputs */
+	unsigned int has_ttl_io:1;	/* set on boards with TTL I/O */
+};
+
+static const struct apci3xxx_boardinfo apci3xxx_boardtypes[] = {
 	[BOARD_APCI3000_16] = {
-		.pc_DriverName		= "apci3000-16",
-		.i_IorangeBase1		= 256,
-		.i_PCIEeprom		= ADDIDATA_NO_EEPROM,
-		.pc_EepromChip		= ADDIDATA_9054,
-		.i_NbrAiChannel		= 16,
-		.i_NbrAiChannelDiff	= 8,
-		.i_AiChannelList	= 16,
-		.i_AiMaxdata		= 4095,
-		.pr_AiRangelist		= &range_apci3XXX_ai,
-		.i_NbrTTLChannel	= 24,
-		.b_AvailableConvertUnit	= 6,
-		.ui_MinAcquisitiontimeNs = 10000,
-		.interrupt		= v_APCI3XXX_Interrupt,
-		.reset			= i_APCI3XXX_Reset,
-		.ai_config		= i_APCI3XXX_InsnConfigAnalogInput,
-		.ai_read		= i_APCI3XXX_InsnReadAnalogInput,
-		.ttl_config		= i_APCI3XXX_InsnConfigInitTTLIO,
-		.ttl_bits		= i_APCI3XXX_InsnBitsTTLIO,
-		.ttl_read		= i_APCI3XXX_InsnReadTTLIO,
-		.ttl_write		= i_APCI3XXX_InsnWriteTTLIO,
+		.name			= "apci3000-16",
+		.ai_subdev_flags	= SDF_COMMON | SDF_GROUND | SDF_DIFF,
+		.ai_n_chan		= 16,
+		.ai_maxdata		= 0x0fff,
+		.ai_conv_units		= CONV_UNIT_MS | CONV_UNIT_US,
+		.ai_min_acq_ns		= 10000,
+		.has_ttl_io		= 1,
 	},
 	[BOARD_APCI3000_8] = {
-		.pc_DriverName		= "apci3000-8",
-		.i_IorangeBase1		= 256,
-		.i_PCIEeprom		= ADDIDATA_NO_EEPROM,
-		.pc_EepromChip		= ADDIDATA_9054,
-		.i_NbrAiChannel		= 8,
-		.i_NbrAiChannelDiff	= 4,
-		.i_AiChannelList	= 8,
-		.i_AiMaxdata		= 4095,
-		.pr_AiRangelist		= &range_apci3XXX_ai,
-		.i_NbrTTLChannel	= 24,
-		.b_AvailableConvertUnit	= 6,
-		.ui_MinAcquisitiontimeNs = 10000,
-		.interrupt		= v_APCI3XXX_Interrupt,
-		.reset			= i_APCI3XXX_Reset,
-		.ai_config		= i_APCI3XXX_InsnConfigAnalogInput,
-		.ai_read		= i_APCI3XXX_InsnReadAnalogInput,
-		.ttl_config		= i_APCI3XXX_InsnConfigInitTTLIO,
-		.ttl_bits		= i_APCI3XXX_InsnBitsTTLIO,
-		.ttl_read		= i_APCI3XXX_InsnReadTTLIO,
-		.ttl_write		= i_APCI3XXX_InsnWriteTTLIO,
+		.name			= "apci3000-8",
+		.ai_subdev_flags	= SDF_COMMON | SDF_GROUND | SDF_DIFF,
+		.ai_n_chan		= 8,
+		.ai_maxdata		= 0x0fff,
+		.ai_conv_units		= CONV_UNIT_MS | CONV_UNIT_US,
+		.ai_min_acq_ns		= 10000,
+		.has_ttl_io		= 1,
 	},
 	[BOARD_APCI3000_4] = {
-		.pc_DriverName		= "apci3000-4",
-		.i_IorangeBase1		= 256,
-		.i_PCIEeprom		= ADDIDATA_NO_EEPROM,
-		.pc_EepromChip		= ADDIDATA_9054,
-		.i_NbrAiChannel		= 4,
-		.i_NbrAiChannelDiff	= 2,
-		.i_AiChannelList	= 4,
-		.i_AiMaxdata		= 4095,
-		.pr_AiRangelist		= &range_apci3XXX_ai,
-		.i_NbrTTLChannel	= 24,
-		.b_AvailableConvertUnit	= 6,
-		.ui_MinAcquisitiontimeNs = 10000,
-		.interrupt		= v_APCI3XXX_Interrupt,
-		.reset			= i_APCI3XXX_Reset,
-		.ai_config		= i_APCI3XXX_InsnConfigAnalogInput,
-		.ai_read		= i_APCI3XXX_InsnReadAnalogInput,
-		.ttl_config		= i_APCI3XXX_InsnConfigInitTTLIO,
-		.ttl_bits		= i_APCI3XXX_InsnBitsTTLIO,
-		.ttl_read		= i_APCI3XXX_InsnReadTTLIO,
-		.ttl_write		= i_APCI3XXX_InsnWriteTTLIO,
+		.name			= "apci3000-4",
+		.ai_subdev_flags	= SDF_COMMON | SDF_GROUND | SDF_DIFF,
+		.ai_n_chan		= 4,
+		.ai_maxdata		= 0x0fff,
+		.ai_conv_units		= CONV_UNIT_MS | CONV_UNIT_US,
+		.ai_min_acq_ns		= 10000,
+		.has_ttl_io		= 1,
 	},
 	[BOARD_APCI3006_16] = {
-		.pc_DriverName		= "apci3006-16",
-		.i_IorangeBase1		= 256,
-		.i_PCIEeprom		= ADDIDATA_NO_EEPROM,
-		.pc_EepromChip		= ADDIDATA_9054,
-		.i_NbrAiChannel		= 16,
-		.i_NbrAiChannelDiff	= 8,
-		.i_AiChannelList	= 16,
-		.i_AiMaxdata		= 65535,
-		.pr_AiRangelist		= &range_apci3XXX_ai,
-		.i_NbrTTLChannel	= 24,
-		.b_AvailableConvertUnit	= 6,
-		.ui_MinAcquisitiontimeNs = 10000,
-		.interrupt		= v_APCI3XXX_Interrupt,
-		.reset			= i_APCI3XXX_Reset,
-		.ai_config		= i_APCI3XXX_InsnConfigAnalogInput,
-		.ai_read		= i_APCI3XXX_InsnReadAnalogInput,
-		.ttl_config		= i_APCI3XXX_InsnConfigInitTTLIO,
-		.ttl_bits		= i_APCI3XXX_InsnBitsTTLIO,
-		.ttl_read		= i_APCI3XXX_InsnReadTTLIO,
-		.ttl_write		= i_APCI3XXX_InsnWriteTTLIO,
+		.name			= "apci3006-16",
+		.ai_subdev_flags	= SDF_COMMON | SDF_GROUND | SDF_DIFF,
+		.ai_n_chan		= 16,
+		.ai_maxdata		= 0xffff,
+		.ai_conv_units		= CONV_UNIT_MS | CONV_UNIT_US,
+		.ai_min_acq_ns		= 10000,
+		.has_ttl_io		= 1,
 	},
 	[BOARD_APCI3006_8] = {
-		.pc_DriverName		= "apci3006-8",
-		.i_IorangeBase1		= 256,
-		.i_PCIEeprom		= ADDIDATA_NO_EEPROM,
-		.pc_EepromChip		= ADDIDATA_9054,
-		.i_NbrAiChannel		= 8,
-		.i_NbrAiChannelDiff	= 4,
-		.i_AiChannelList	= 8,
-		.i_AiMaxdata		= 65535,
-		.pr_AiRangelist		= &range_apci3XXX_ai,
-		.i_NbrTTLChannel	= 24,
-		.b_AvailableConvertUnit	= 6,
-		.ui_MinAcquisitiontimeNs = 10000,
-		.interrupt		= v_APCI3XXX_Interrupt,
-		.reset			= i_APCI3XXX_Reset,
-		.ai_config		= i_APCI3XXX_InsnConfigAnalogInput,
-		.ai_read		= i_APCI3XXX_InsnReadAnalogInput,
-		.ttl_config		= i_APCI3XXX_InsnConfigInitTTLIO,
-		.ttl_bits		= i_APCI3XXX_InsnBitsTTLIO,
-		.ttl_read		= i_APCI3XXX_InsnReadTTLIO,
-		.ttl_write		= i_APCI3XXX_InsnWriteTTLIO,
+		.name			= "apci3006-8",
+		.ai_subdev_flags	= SDF_COMMON | SDF_GROUND | SDF_DIFF,
+		.ai_n_chan		= 8,
+		.ai_maxdata		= 0xffff,
+		.ai_conv_units		= CONV_UNIT_MS | CONV_UNIT_US,
+		.ai_min_acq_ns		= 10000,
+		.has_ttl_io		= 1,
 	},
 	[BOARD_APCI3006_4] = {
-		.pc_DriverName		= "apci3006-4",
-		.i_IorangeBase1		= 256,
-		.i_PCIEeprom		= ADDIDATA_NO_EEPROM,
-		.pc_EepromChip		= ADDIDATA_9054,
-		.i_NbrAiChannel		= 4,
-		.i_NbrAiChannelDiff	= 2,
-		.i_AiChannelList	= 4,
-		.i_AiMaxdata		= 65535,
-		.pr_AiRangelist		= &range_apci3XXX_ai,
-		.i_NbrTTLChannel	= 24,
-		.b_AvailableConvertUnit	= 6,
-		.ui_MinAcquisitiontimeNs = 10000,
-		.interrupt		= v_APCI3XXX_Interrupt,
-		.reset			= i_APCI3XXX_Reset,
-		.ai_config		= i_APCI3XXX_InsnConfigAnalogInput,
-		.ai_read		= i_APCI3XXX_InsnReadAnalogInput,
-		.ttl_config		= i_APCI3XXX_InsnConfigInitTTLIO,
-		.ttl_bits		= i_APCI3XXX_InsnBitsTTLIO,
-		.ttl_read		= i_APCI3XXX_InsnReadTTLIO,
-		.ttl_write		= i_APCI3XXX_InsnWriteTTLIO,
+		.name			= "apci3006-4",
+		.ai_subdev_flags	= SDF_COMMON | SDF_GROUND | SDF_DIFF,
+		.ai_n_chan		= 4,
+		.ai_maxdata		= 0xffff,
+		.ai_conv_units		= CONV_UNIT_MS | CONV_UNIT_US,
+		.ai_min_acq_ns		= 10000,
+		.has_ttl_io		= 1,
 	},
 	[BOARD_APCI3010_16] = {
-		.pc_DriverName		= "apci3010-16",
-		.i_IorangeBase1		= 256,
-		.i_PCIEeprom		= ADDIDATA_NO_EEPROM,
-		.pc_EepromChip		= ADDIDATA_9054,
-		.i_NbrAiChannel		= 16,
-		.i_NbrAiChannelDiff	= 8,
-		.i_AiChannelList	= 16,
-		.i_AiMaxdata		= 4095,
-		.pr_AiRangelist		= &range_apci3XXX_ai,
-		.i_NbrDiChannel		= 4,
-		.i_NbrDoChannel		= 4,
-		.i_DoMaxdata		= 1,
-		.i_NbrTTLChannel	= 24,
-		.b_AvailableConvertUnit	= 6,
-		.ui_MinAcquisitiontimeNs = 5000,
-		.interrupt		= v_APCI3XXX_Interrupt,
-		.reset			= i_APCI3XXX_Reset,
-		.ai_config		= i_APCI3XXX_InsnConfigAnalogInput,
-		.ai_read		= i_APCI3XXX_InsnReadAnalogInput,
-		.di_bits		= apci3xxx_di_insn_bits,
-		.do_bits		= apci3xxx_do_insn_bits,
-		.ttl_config		= i_APCI3XXX_InsnConfigInitTTLIO,
-		.ttl_bits		= i_APCI3XXX_InsnBitsTTLIO,
-		.ttl_read		= i_APCI3XXX_InsnReadTTLIO,
-		.ttl_write		= i_APCI3XXX_InsnWriteTTLIO,
+		.name			= "apci3010-16",
+		.ai_subdev_flags	= SDF_COMMON | SDF_GROUND | SDF_DIFF,
+		.ai_n_chan		= 16,
+		.ai_maxdata		= 0x0fff,
+		.ai_conv_units		= CONV_UNIT_MS | CONV_UNIT_US,
+		.ai_min_acq_ns		= 5000,
+		.has_dig_in		= 1,
+		.has_dig_out		= 1,
+		.has_ttl_io		= 1,
 	},
 	[BOARD_APCI3010_8] = {
-		.pc_DriverName		= "apci3010-8",
-		.i_IorangeBase1		= 256,
-		.i_PCIEeprom		= ADDIDATA_NO_EEPROM,
-		.pc_EepromChip		= ADDIDATA_9054,
-		.i_NbrAiChannel		= 8,
-		.i_NbrAiChannelDiff	= 4,
-		.i_AiChannelList	= 8,
-		.i_AiMaxdata		= 4095,
-		.pr_AiRangelist		= &range_apci3XXX_ai,
-		.i_NbrDiChannel		= 4,
-		.i_NbrDoChannel		= 4,
-		.i_DoMaxdata		= 1,
-		.i_NbrTTLChannel	= 24,
-		.b_AvailableConvertUnit	= 6,
-		.ui_MinAcquisitiontimeNs = 5000,
-		.interrupt		= v_APCI3XXX_Interrupt,
-		.reset			= i_APCI3XXX_Reset,
-		.ai_config		= i_APCI3XXX_InsnConfigAnalogInput,
-		.ai_read		= i_APCI3XXX_InsnReadAnalogInput,
-		.di_bits		= apci3xxx_di_insn_bits,
-		.do_bits		= apci3xxx_do_insn_bits,
-		.ttl_config		= i_APCI3XXX_InsnConfigInitTTLIO,
-		.ttl_bits		= i_APCI3XXX_InsnBitsTTLIO,
-		.ttl_read		= i_APCI3XXX_InsnReadTTLIO,
-		.ttl_write		= i_APCI3XXX_InsnWriteTTLIO,
+		.name			= "apci3010-8",
+		.ai_subdev_flags	= SDF_COMMON | SDF_GROUND | SDF_DIFF,
+		.ai_n_chan		= 8,
+		.ai_maxdata		= 0x0fff,
+		.ai_conv_units		= CONV_UNIT_MS | CONV_UNIT_US,
+		.ai_min_acq_ns		= 5000,
+		.has_dig_in		= 1,
+		.has_dig_out		= 1,
+		.has_ttl_io		= 1,
 	},
 	[BOARD_APCI3010_4] = {
-		.pc_DriverName		= "apci3010-4",
-		.i_IorangeBase1		= 256,
-		.i_PCIEeprom		= ADDIDATA_NO_EEPROM,
-		.pc_EepromChip		= ADDIDATA_9054,
-		.i_NbrAiChannel		= 4,
-		.i_NbrAiChannelDiff	= 2,
-		.i_AiChannelList	= 4,
-		.i_AiMaxdata		= 4095,
-		.pr_AiRangelist		= &range_apci3XXX_ai,
-		.i_NbrDiChannel		= 4,
-		.i_NbrDoChannel		= 4,
-		.i_DoMaxdata		= 1,
-		.i_NbrTTLChannel	= 24,
-		.b_AvailableConvertUnit	= 6,
-		.ui_MinAcquisitiontimeNs = 5000,
-		.interrupt		= v_APCI3XXX_Interrupt,
-		.reset			= i_APCI3XXX_Reset,
-		.ai_config		= i_APCI3XXX_InsnConfigAnalogInput,
-		.ai_read		= i_APCI3XXX_InsnReadAnalogInput,
-		.di_bits		= apci3xxx_di_insn_bits,
-		.do_bits		= apci3xxx_do_insn_bits,
-		.ttl_config		= i_APCI3XXX_InsnConfigInitTTLIO,
-		.ttl_bits		= i_APCI3XXX_InsnBitsTTLIO,
-		.ttl_read		= i_APCI3XXX_InsnReadTTLIO,
-		.ttl_write		= i_APCI3XXX_InsnWriteTTLIO,
+		.name			= "apci3010-4",
+		.ai_subdev_flags	= SDF_COMMON | SDF_GROUND | SDF_DIFF,
+		.ai_n_chan		= 4,
+		.ai_maxdata		= 0x0fff,
+		.ai_conv_units		= CONV_UNIT_MS | CONV_UNIT_US,
+		.ai_min_acq_ns		= 5000,
+		.has_dig_in		= 1,
+		.has_dig_out		= 1,
+		.has_ttl_io		= 1,
 	},
 	[BOARD_APCI3016_16] = {
-		.pc_DriverName		= "apci3016-16",
-		.i_IorangeBase1		= 256,
-		.i_PCIEeprom		= ADDIDATA_NO_EEPROM,
-		.pc_EepromChip		= ADDIDATA_9054,
-		.i_NbrAiChannel		= 16,
-		.i_NbrAiChannelDiff	= 8,
-		.i_AiChannelList	= 16,
-		.i_AiMaxdata		= 65535,
-		.pr_AiRangelist		= &range_apci3XXX_ai,
-		.i_NbrDiChannel		= 4,
-		.i_NbrDoChannel		= 4,
-		.i_DoMaxdata		= 1,
-		.i_NbrTTLChannel	= 24,
-		.b_AvailableConvertUnit	= 6,
-		.ui_MinAcquisitiontimeNs = 5000,
-		.interrupt		= v_APCI3XXX_Interrupt,
-		.reset			= i_APCI3XXX_Reset,
-		.ai_config		= i_APCI3XXX_InsnConfigAnalogInput,
-		.ai_read		= i_APCI3XXX_InsnReadAnalogInput,
-		.di_bits		= apci3xxx_di_insn_bits,
-		.do_bits		= apci3xxx_do_insn_bits,
-		.ttl_config		= i_APCI3XXX_InsnConfigInitTTLIO,
-		.ttl_bits		= i_APCI3XXX_InsnBitsTTLIO,
-		.ttl_read		= i_APCI3XXX_InsnReadTTLIO,
-		.ttl_write		= i_APCI3XXX_InsnWriteTTLIO,
+		.name			= "apci3016-16",
+		.ai_subdev_flags	= SDF_COMMON | SDF_GROUND | SDF_DIFF,
+		.ai_n_chan		= 16,
+		.ai_maxdata		= 0xffff,
+		.ai_conv_units		= CONV_UNIT_MS | CONV_UNIT_US,
+		.ai_min_acq_ns		= 5000,
+		.has_dig_in		= 1,
+		.has_dig_out		= 1,
+		.has_ttl_io		= 1,
 	},
 	[BOARD_APCI3016_8] = {
-		.pc_DriverName		= "apci3016-8",
-		.i_IorangeBase1		= 256,
-		.i_PCIEeprom		= ADDIDATA_NO_EEPROM,
-		.pc_EepromChip		= ADDIDATA_9054,
-		.i_NbrAiChannel		= 8,
-		.i_NbrAiChannelDiff	= 4,
-		.i_AiChannelList	= 8,
-		.i_AiMaxdata		= 65535,
-		.pr_AiRangelist		= &range_apci3XXX_ai,
-		.i_NbrDiChannel		= 4,
-		.i_NbrDoChannel		= 4,
-		.i_DoMaxdata		= 1,
-		.i_NbrTTLChannel	= 24,
-		.b_AvailableConvertUnit	= 6,
-		.ui_MinAcquisitiontimeNs = 5000,
-		.interrupt		= v_APCI3XXX_Interrupt,
-		.reset			= i_APCI3XXX_Reset,
-		.ai_config		= i_APCI3XXX_InsnConfigAnalogInput,
-		.ai_read		= i_APCI3XXX_InsnReadAnalogInput,
-		.di_bits		= apci3xxx_di_insn_bits,
-		.do_bits		= apci3xxx_do_insn_bits,
-		.ttl_config		= i_APCI3XXX_InsnConfigInitTTLIO,
-		.ttl_bits		= i_APCI3XXX_InsnBitsTTLIO,
-		.ttl_read		= i_APCI3XXX_InsnReadTTLIO,
-		.ttl_write		= i_APCI3XXX_InsnWriteTTLIO,
+		.name			= "apci3016-8",
+		.ai_subdev_flags	= SDF_COMMON | SDF_GROUND | SDF_DIFF,
+		.ai_n_chan		= 8,
+		.ai_maxdata		= 0xffff,
+		.ai_conv_units		= CONV_UNIT_MS | CONV_UNIT_US,
+		.ai_min_acq_ns		= 5000,
+		.has_dig_in		= 1,
+		.has_dig_out		= 1,
+		.has_ttl_io		= 1,
 	},
 	[BOARD_APCI3016_4] = {
-		.pc_DriverName		= "apci3016-4",
-		.i_IorangeBase1		= 256,
-		.i_PCIEeprom		= ADDIDATA_NO_EEPROM,
-		.pc_EepromChip		= ADDIDATA_9054,
-		.i_NbrAiChannel		= 4,
-		.i_NbrAiChannelDiff	= 2,
-		.i_AiChannelList	= 4,
-		.i_AiMaxdata		= 65535,
-		.pr_AiRangelist		= &range_apci3XXX_ai,
-		.i_NbrDiChannel		= 4,
-		.i_NbrDoChannel		= 4,
-		.i_DoMaxdata		= 1,
-		.i_NbrTTLChannel	= 24,
-		.b_AvailableConvertUnit	= 6,
-		.ui_MinAcquisitiontimeNs = 5000,
-		.interrupt		= v_APCI3XXX_Interrupt,
-		.reset			= i_APCI3XXX_Reset,
-		.ai_config		= i_APCI3XXX_InsnConfigAnalogInput,
-		.ai_read		= i_APCI3XXX_InsnReadAnalogInput,
-		.di_bits		= apci3xxx_di_insn_bits,
-		.do_bits		= apci3xxx_do_insn_bits,
-		.ttl_config		= i_APCI3XXX_InsnConfigInitTTLIO,
-		.ttl_bits		= i_APCI3XXX_InsnBitsTTLIO,
-		.ttl_read		= i_APCI3XXX_InsnReadTTLIO,
-		.ttl_write		= i_APCI3XXX_InsnWriteTTLIO,
+		.name			= "apci3016-4",
+		.ai_subdev_flags	= SDF_COMMON | SDF_GROUND | SDF_DIFF,
+		.ai_n_chan		= 4,
+		.ai_maxdata		= 0xffff,
+		.ai_conv_units		= CONV_UNIT_MS | CONV_UNIT_US,
+		.ai_min_acq_ns		= 5000,
+		.has_dig_in		= 1,
+		.has_dig_out		= 1,
+		.has_ttl_io		= 1,
 	},
 	[BOARD_APCI3100_16_4] = {
-		.pc_DriverName		= "apci3100-16-4",
-		.i_IorangeBase1		= 256,
-		.i_PCIEeprom		= ADDIDATA_NO_EEPROM,
-		.pc_EepromChip		= ADDIDATA_9054,
-		.i_NbrAiChannel		= 16,
-		.i_NbrAiChannelDiff	= 8,
-		.i_AiChannelList	= 16,
-		.i_NbrAoChannel		= 4,
-		.i_AiMaxdata		= 4095,
-		.i_AoMaxdata		= 4095,
-		.pr_AiRangelist		= &range_apci3XXX_ai,
-		.pr_AoRangelist		= &range_apci3XXX_ao,
-		.i_NbrTTLChannel	= 24,
-		.b_AvailableConvertUnit	= 6,
-		.ui_MinAcquisitiontimeNs = 10000,
-		.interrupt		= v_APCI3XXX_Interrupt,
-		.reset			= i_APCI3XXX_Reset,
-		.ai_config		= i_APCI3XXX_InsnConfigAnalogInput,
-		.ai_read		= i_APCI3XXX_InsnReadAnalogInput,
-		.ao_write		= i_APCI3XXX_InsnWriteAnalogOutput,
-		.ttl_config		= i_APCI3XXX_InsnConfigInitTTLIO,
-		.ttl_bits		= i_APCI3XXX_InsnBitsTTLIO,
-		.ttl_read		= i_APCI3XXX_InsnReadTTLIO,
-		.ttl_write		= i_APCI3XXX_InsnWriteTTLIO,
+		.name			= "apci3100-16-4",
+		.ai_subdev_flags	= SDF_COMMON | SDF_GROUND | SDF_DIFF,
+		.ai_n_chan		= 16,
+		.ai_maxdata		= 0x0fff,
+		.ai_conv_units		= CONV_UNIT_MS | CONV_UNIT_US,
+		.ai_min_acq_ns		= 10000,
+		.has_ao			= 1,
+		.has_ttl_io		= 1,
 	},
 	[BOARD_APCI3100_8_4] = {
-		.pc_DriverName		= "apci3100-8-4",
-		.i_IorangeBase1		= 256,
-		.i_PCIEeprom		= ADDIDATA_NO_EEPROM,
-		.pc_EepromChip		= ADDIDATA_9054,
-		.i_NbrAiChannel		= 8,
-		.i_NbrAiChannelDiff	= 4,
-		.i_AiChannelList	= 8,
-		.i_NbrAoChannel		= 4,
-		.i_AiMaxdata		= 4095,
-		.i_AoMaxdata		= 4095,
-		.pr_AiRangelist		= &range_apci3XXX_ai,
-		.pr_AoRangelist		= &range_apci3XXX_ao,
-		.i_NbrTTLChannel	= 24,
-		.b_AvailableConvertUnit	= 6,
-		.ui_MinAcquisitiontimeNs = 10000,
-		.interrupt		= v_APCI3XXX_Interrupt,
-		.reset			= i_APCI3XXX_Reset,
-		.ai_config		= i_APCI3XXX_InsnConfigAnalogInput,
-		.ai_read		= i_APCI3XXX_InsnReadAnalogInput,
-		.ao_write		= i_APCI3XXX_InsnWriteAnalogOutput,
-		.ttl_config		= i_APCI3XXX_InsnConfigInitTTLIO,
-		.ttl_bits		= i_APCI3XXX_InsnBitsTTLIO,
-		.ttl_read		= i_APCI3XXX_InsnReadTTLIO,
-		.ttl_write		= i_APCI3XXX_InsnWriteTTLIO,
+		.name			= "apci3100-8-4",
+		.ai_subdev_flags	= SDF_COMMON | SDF_GROUND | SDF_DIFF,
+		.ai_n_chan		= 8,
+		.ai_maxdata		= 0x0fff,
+		.ai_conv_units		= CONV_UNIT_MS | CONV_UNIT_US,
+		.ai_min_acq_ns		= 10000,
+		.has_ao			= 1,
+		.has_ttl_io		= 1,
 	},
 	[BOARD_APCI3106_16_4] = {
-		.pc_DriverName		= "apci3106-16-4",
-		.i_IorangeBase1		= 256,
-		.i_PCIEeprom		= ADDIDATA_NO_EEPROM,
-		.pc_EepromChip		= ADDIDATA_9054,
-		.i_NbrAiChannel		= 16,
-		.i_NbrAiChannelDiff	= 8,
-		.i_AiChannelList	= 16,
-		.i_NbrAoChannel		= 4,
-		.i_AiMaxdata		= 65535,
-		.i_AoMaxdata		= 4095,
-		.pr_AiRangelist		= &range_apci3XXX_ai,
-		.pr_AoRangelist		= &range_apci3XXX_ao,
-		.i_NbrTTLChannel	= 24,
-		.b_AvailableConvertUnit	= 6,
-		.ui_MinAcquisitiontimeNs = 10000,
-		.interrupt		= v_APCI3XXX_Interrupt,
-		.reset			= i_APCI3XXX_Reset,
-		.ai_config		= i_APCI3XXX_InsnConfigAnalogInput,
-		.ai_read		= i_APCI3XXX_InsnReadAnalogInput,
-		.ao_write		= i_APCI3XXX_InsnWriteAnalogOutput,
-		.ttl_config		= i_APCI3XXX_InsnConfigInitTTLIO,
-		.ttl_bits		= i_APCI3XXX_InsnBitsTTLIO,
-		.ttl_read		= i_APCI3XXX_InsnReadTTLIO,
-		.ttl_write		= i_APCI3XXX_InsnWriteTTLIO,
+		.name			= "apci3106-16-4",
+		.ai_subdev_flags	= SDF_COMMON | SDF_GROUND | SDF_DIFF,
+		.ai_n_chan		= 16,
+		.ai_maxdata		= 0xffff,
+		.ai_conv_units		= CONV_UNIT_MS | CONV_UNIT_US,
+		.ai_min_acq_ns		= 10000,
+		.has_ao			= 1,
+		.has_ttl_io		= 1,
 	},
 	[BOARD_APCI3106_8_4] = {
-		.pc_DriverName		= "apci3106-8-4",
-		.i_IorangeBase1		= 256,
-		.i_PCIEeprom		= ADDIDATA_NO_EEPROM,
-		.pc_EepromChip		= ADDIDATA_9054,
-		.i_NbrAiChannel		= 8,
-		.i_NbrAiChannelDiff	= 4,
-		.i_AiChannelList	= 8,
-		.i_NbrAoChannel		= 4,
-		.i_AiMaxdata		= 65535,
-		.i_AoMaxdata		= 4095,
-		.pr_AiRangelist		= &range_apci3XXX_ai,
-		.pr_AoRangelist		= &range_apci3XXX_ao,
-		.i_NbrTTLChannel	= 24,
-		.b_AvailableConvertUnit	= 6,
-		.ui_MinAcquisitiontimeNs = 10000,
-		.interrupt		= v_APCI3XXX_Interrupt,
-		.reset			= i_APCI3XXX_Reset,
-		.ai_config		= i_APCI3XXX_InsnConfigAnalogInput,
-		.ai_read		= i_APCI3XXX_InsnReadAnalogInput,
-		.ao_write		= i_APCI3XXX_InsnWriteAnalogOutput,
-		.ttl_config		= i_APCI3XXX_InsnConfigInitTTLIO,
-		.ttl_bits		= i_APCI3XXX_InsnBitsTTLIO,
-		.ttl_read		= i_APCI3XXX_InsnReadTTLIO,
-		.ttl_write		= i_APCI3XXX_InsnWriteTTLIO,
+		.name			= "apci3106-8-4",
+		.ai_subdev_flags	= SDF_COMMON | SDF_GROUND | SDF_DIFF,
+		.ai_n_chan		= 8,
+		.ai_maxdata		= 0xffff,
+		.ai_conv_units		= CONV_UNIT_MS | CONV_UNIT_US,
+		.ai_min_acq_ns		= 10000,
+		.has_ao			= 1,
+		.has_ttl_io		= 1,
 	},
 	[BOARD_APCI3110_16_4] = {
-		.pc_DriverName		= "apci3110-16-4",
-		.i_IorangeBase1		= 256,
-		.i_PCIEeprom		= ADDIDATA_NO_EEPROM,
-		.pc_EepromChip		= ADDIDATA_9054,
-		.i_NbrAiChannel		= 16,
-		.i_NbrAiChannelDiff	= 8,
-		.i_AiChannelList	= 16,
-		.i_NbrAoChannel		= 4,
-		.i_AiMaxdata		= 4095,
-		.i_AoMaxdata		= 4095,
-		.pr_AiRangelist		= &range_apci3XXX_ai,
-		.pr_AoRangelist		= &range_apci3XXX_ao,
-		.i_NbrDiChannel		= 4,
-		.i_NbrDoChannel		= 4,
-		.i_DoMaxdata		= 1,
-		.i_NbrTTLChannel	= 24,
-		.b_AvailableConvertUnit	= 6,
-		.ui_MinAcquisitiontimeNs = 5000,
-		.interrupt		= v_APCI3XXX_Interrupt,
-		.reset			= i_APCI3XXX_Reset,
-		.ai_config		= i_APCI3XXX_InsnConfigAnalogInput,
-		.ai_read		= i_APCI3XXX_InsnReadAnalogInput,
-		.ao_write		= i_APCI3XXX_InsnWriteAnalogOutput,
-		.di_bits		= apci3xxx_di_insn_bits,
-		.do_bits		= apci3xxx_do_insn_bits,
-		.ttl_config		= i_APCI3XXX_InsnConfigInitTTLIO,
-		.ttl_bits		= i_APCI3XXX_InsnBitsTTLIO,
-		.ttl_read		= i_APCI3XXX_InsnReadTTLIO,
-		.ttl_write		= i_APCI3XXX_InsnWriteTTLIO,
+		.name			= "apci3110-16-4",
+		.ai_subdev_flags	= SDF_COMMON | SDF_GROUND | SDF_DIFF,
+		.ai_n_chan		= 16,
+		.ai_maxdata		= 0x0fff,
+		.ai_conv_units		= CONV_UNIT_MS | CONV_UNIT_US,
+		.ai_min_acq_ns		= 5000,
+		.has_ao			= 1,
+		.has_dig_in		= 1,
+		.has_dig_out		= 1,
+		.has_ttl_io		= 1,
 	},
 	[BOARD_APCI3110_8_4] = {
-		.pc_DriverName		= "apci3110-8-4",
-		.i_IorangeBase1		= 256,
-		.i_PCIEeprom		= ADDIDATA_NO_EEPROM,
-		.pc_EepromChip		= ADDIDATA_9054,
-		.i_NbrAiChannel		= 8,
-		.i_NbrAiChannelDiff	= 4,
-		.i_AiChannelList	= 8,
-		.i_NbrAoChannel		= 4,
-		.i_AiMaxdata		= 4095,
-		.i_AoMaxdata		= 4095,
-		.pr_AiRangelist		= &range_apci3XXX_ai,
-		.pr_AoRangelist		= &range_apci3XXX_ao,
-		.i_NbrDiChannel		= 4,
-		.i_NbrDoChannel		= 4,
-		.i_DoMaxdata		= 1,
-		.i_NbrTTLChannel	= 24,
-		.b_AvailableConvertUnit	= 6,
-		.ui_MinAcquisitiontimeNs = 5000,
-		.interrupt		= v_APCI3XXX_Interrupt,
-		.reset			= i_APCI3XXX_Reset,
-		.ai_config		= i_APCI3XXX_InsnConfigAnalogInput,
-		.ai_read		= i_APCI3XXX_InsnReadAnalogInput,
-		.ao_write		= i_APCI3XXX_InsnWriteAnalogOutput,
-		.di_bits		= apci3xxx_di_insn_bits,
-		.do_bits		= apci3xxx_do_insn_bits,
-		.ttl_config		= i_APCI3XXX_InsnConfigInitTTLIO,
-		.ttl_bits		= i_APCI3XXX_InsnBitsTTLIO,
-		.ttl_read		= i_APCI3XXX_InsnReadTTLIO,
-		.ttl_write		= i_APCI3XXX_InsnWriteTTLIO,
+		.name			= "apci3110-8-4",
+		.ai_subdev_flags	= SDF_COMMON | SDF_GROUND | SDF_DIFF,
+		.ai_n_chan		= 8,
+		.ai_maxdata		= 0x0fff,
+		.ai_conv_units		= CONV_UNIT_MS | CONV_UNIT_US,
+		.ai_min_acq_ns		= 5000,
+		.has_ao			= 1,
+		.has_dig_in		= 1,
+		.has_dig_out		= 1,
+		.has_ttl_io		= 1,
 	},
 	[BOARD_APCI3116_16_4] = {
-		.pc_DriverName		= "apci3116-16-4",
-		.i_IorangeBase1		= 256,
-		.i_PCIEeprom		= ADDIDATA_NO_EEPROM,
-		.pc_EepromChip		= ADDIDATA_9054,
-		.i_NbrAiChannel		= 16,
-		.i_NbrAiChannelDiff	= 8,
-		.i_AiChannelList	= 16,
-		.i_NbrAoChannel		= 4,
-		.i_AiMaxdata		= 65535,
-		.i_AoMaxdata		= 4095,
-		.pr_AiRangelist		= &range_apci3XXX_ai,
-		.pr_AoRangelist		= &range_apci3XXX_ao,
-		.i_NbrDiChannel		= 4,
-		.i_NbrDoChannel		= 4,
-		.i_DoMaxdata		= 1,
-		.i_NbrTTLChannel	= 24,
-		.b_AvailableConvertUnit	= 6,
-		.ui_MinAcquisitiontimeNs = 5000,
-		.interrupt		= v_APCI3XXX_Interrupt,
-		.reset			= i_APCI3XXX_Reset,
-		.ai_config		= i_APCI3XXX_InsnConfigAnalogInput,
-		.ai_read		= i_APCI3XXX_InsnReadAnalogInput,
-		.ao_write		= i_APCI3XXX_InsnWriteAnalogOutput,
-		.di_bits		= apci3xxx_di_insn_bits,
-		.do_bits		= apci3xxx_do_insn_bits,
-		.ttl_config		= i_APCI3XXX_InsnConfigInitTTLIO,
-		.ttl_bits		= i_APCI3XXX_InsnBitsTTLIO,
-		.ttl_read		= i_APCI3XXX_InsnReadTTLIO,
-		.ttl_write		= i_APCI3XXX_InsnWriteTTLIO,
+		.name			= "apci3116-16-4",
+		.ai_subdev_flags	= SDF_COMMON | SDF_GROUND | SDF_DIFF,
+		.ai_n_chan		= 16,
+		.ai_maxdata		= 0xffff,
+		.ai_conv_units		= CONV_UNIT_MS | CONV_UNIT_US,
+		.ai_min_acq_ns		= 5000,
+		.has_ao			= 1,
+		.has_dig_in		= 1,
+		.has_dig_out		= 1,
+		.has_ttl_io		= 1,
 	},
 	[BOARD_APCI3116_8_4] = {
-		.pc_DriverName		= "apci3116-8-4",
-		.i_IorangeBase1		= 256,
-		.i_PCIEeprom		= ADDIDATA_NO_EEPROM,
-		.pc_EepromChip		= ADDIDATA_9054,
-		.i_NbrAiChannel		= 8,
-		.i_NbrAiChannelDiff	= 4,
-		.i_AiChannelList	= 8,
-		.i_NbrAoChannel		= 4,
-		.i_AiMaxdata		= 65535,
-		.i_AoMaxdata		= 4095,
-		.pr_AiRangelist		= &range_apci3XXX_ai,
-		.pr_AoRangelist		= &range_apci3XXX_ao,
-		.i_NbrDiChannel		= 4,
-		.i_NbrDoChannel		= 4,
-		.i_DoMaxdata		= 1,
-		.i_NbrTTLChannel	= 24,
-		.b_AvailableConvertUnit	= 6,
-		.ui_MinAcquisitiontimeNs = 5000,
-		.interrupt		= v_APCI3XXX_Interrupt,
-		.reset			= i_APCI3XXX_Reset,
-		.ai_config		= i_APCI3XXX_InsnConfigAnalogInput,
-		.ai_read		= i_APCI3XXX_InsnReadAnalogInput,
-		.ao_write		= i_APCI3XXX_InsnWriteAnalogOutput,
-		.di_bits		= apci3xxx_di_insn_bits,
-		.do_bits		= apci3xxx_do_insn_bits,
-		.ttl_config		= i_APCI3XXX_InsnConfigInitTTLIO,
-		.ttl_bits		= i_APCI3XXX_InsnBitsTTLIO,
-		.ttl_read		= i_APCI3XXX_InsnReadTTLIO,
-		.ttl_write		= i_APCI3XXX_InsnWriteTTLIO,
+		.name			= "apci3116-8-4",
+		.ai_subdev_flags	= SDF_COMMON | SDF_GROUND | SDF_DIFF,
+		.ai_n_chan		= 8,
+		.ai_maxdata		= 0xffff,
+		.ai_conv_units		= CONV_UNIT_MS | CONV_UNIT_US,
+		.ai_min_acq_ns		= 5000,
+		.has_ao			= 1,
+		.has_dig_in		= 1,
+		.has_dig_out		= 1,
+		.has_ttl_io		= 1,
 	},
 	[BOARD_APCI3003] = {
-		.pc_DriverName		= "apci3003",
-		.i_IorangeBase1		= 256,
-		.i_PCIEeprom		= ADDIDATA_NO_EEPROM,
-		.pc_EepromChip		= ADDIDATA_9054,
-		.i_NbrAiChannelDiff	= 4,
-		.i_AiChannelList	= 4,
-		.i_AiMaxdata		= 65535,
-		.pr_AiRangelist		= &range_apci3XXX_ai,
-		.i_NbrDiChannel		= 4,
-		.i_NbrDoChannel		= 4,
-		.i_DoMaxdata		= 1,
-		.b_AvailableConvertUnit	= 7,
-		.ui_MinAcquisitiontimeNs = 2500,
-		.interrupt		= v_APCI3XXX_Interrupt,
-		.reset			= i_APCI3XXX_Reset,
-		.ai_config		= i_APCI3XXX_InsnConfigAnalogInput,
-		.ai_read		= i_APCI3XXX_InsnReadAnalogInput,
-		.di_bits		= apci3xxx_di_insn_bits,
-		.do_bits		= apci3xxx_do_insn_bits,
+		.name			= "apci3003",
+		.ai_subdev_flags	= SDF_DIFF,
+		.ai_n_chan		= 4,
+		.ai_maxdata		= 0xffff,
+		.ai_conv_units		= CONV_UNIT_MS | CONV_UNIT_US |
+					  CONV_UNIT_NS,
+		.ai_min_acq_ns		= 2500,
+		.has_dig_in		= 1,
+		.has_dig_out		= 1,
 	},
 	[BOARD_APCI3002_16] = {
-		.pc_DriverName		= "apci3002-16",
-		.i_IorangeBase1		= 256,
-		.i_PCIEeprom		= ADDIDATA_NO_EEPROM,
-		.pc_EepromChip		= ADDIDATA_9054,
-		.i_NbrAiChannelDiff	= 16,
-		.i_AiChannelList	= 16,
-		.i_AiMaxdata		= 65535,
-		.pr_AiRangelist		= &range_apci3XXX_ai,
-		.i_NbrDiChannel		= 4,
-		.i_NbrDoChannel		= 4,
-		.i_DoMaxdata		= 1,
-		.b_AvailableConvertUnit	= 6,
-		.ui_MinAcquisitiontimeNs = 5000,
-		.interrupt		= v_APCI3XXX_Interrupt,
-		.reset			= i_APCI3XXX_Reset,
-		.ai_config		= i_APCI3XXX_InsnConfigAnalogInput,
-		.ai_read		= i_APCI3XXX_InsnReadAnalogInput,
-		.di_bits		= apci3xxx_di_insn_bits,
-		.do_bits		= apci3xxx_do_insn_bits,
+		.name			= "apci3002-16",
+		.ai_subdev_flags	= SDF_DIFF,
+		.ai_n_chan		= 16,
+		.ai_maxdata		= 0xffff,
+		.ai_conv_units		= CONV_UNIT_MS | CONV_UNIT_US,
+		.ai_min_acq_ns		= 5000,
+		.has_dig_in		= 1,
+		.has_dig_out		= 1,
 	},
 	[BOARD_APCI3002_8] = {
-		.pc_DriverName		= "apci3002-8",
-		.i_IorangeBase1		= 256,
-		.i_PCIEeprom		= ADDIDATA_NO_EEPROM,
-		.pc_EepromChip		= ADDIDATA_9054,
-		.i_NbrAiChannelDiff	= 8,
-		.i_AiChannelList	= 8,
-		.i_AiMaxdata		= 65535,
-		.pr_AiRangelist		= &range_apci3XXX_ai,
-		.i_NbrDiChannel		= 4,
-		.i_NbrDoChannel		= 4,
-		.i_DoMaxdata		= 1,
-		.b_AvailableConvertUnit	= 6,
-		.ui_MinAcquisitiontimeNs = 5000,
-		.interrupt		= v_APCI3XXX_Interrupt,
-		.reset			= i_APCI3XXX_Reset,
-		.ai_config		= i_APCI3XXX_InsnConfigAnalogInput,
-		.ai_read		= i_APCI3XXX_InsnReadAnalogInput,
-		.di_bits		= apci3xxx_di_insn_bits,
-		.do_bits		= apci3xxx_do_insn_bits,
+		.name			= "apci3002-8",
+		.ai_subdev_flags	= SDF_DIFF,
+		.ai_n_chan		= 8,
+		.ai_maxdata		= 0xffff,
+		.ai_conv_units		= CONV_UNIT_MS | CONV_UNIT_US,
+		.ai_min_acq_ns		= 5000,
+		.has_dig_in		= 1,
+		.has_dig_out		= 1,
 	},
 	[BOARD_APCI3002_4] = {
-		.pc_DriverName		= "apci3002-4",
-		.i_IorangeBase1		= 256,
-		.i_PCIEeprom		= ADDIDATA_NO_EEPROM,
-		.pc_EepromChip		= ADDIDATA_9054,
-		.i_NbrAiChannelDiff	= 4,
-		.i_AiChannelList	= 4,
-		.i_AiMaxdata		= 65535,
-		.pr_AiRangelist		= &range_apci3XXX_ai,
-		.i_NbrDiChannel		= 4,
-		.i_NbrDoChannel		= 4,
-		.i_DoMaxdata		= 1,
-		.b_AvailableConvertUnit	= 6,
-		.ui_MinAcquisitiontimeNs = 5000,
-		.interrupt		= v_APCI3XXX_Interrupt,
-		.reset			= i_APCI3XXX_Reset,
-		.ai_config		= i_APCI3XXX_InsnConfigAnalogInput,
-		.ai_read		= i_APCI3XXX_InsnReadAnalogInput,
-		.di_bits		= apci3xxx_di_insn_bits,
-		.do_bits		= apci3xxx_do_insn_bits,
+		.name			= "apci3002-4",
+		.ai_subdev_flags	= SDF_DIFF,
+		.ai_n_chan		= 4,
+		.ai_maxdata		= 0xffff,
+		.ai_conv_units		= CONV_UNIT_MS | CONV_UNIT_US,
+		.ai_min_acq_ns		= 5000,
+		.has_dig_in		= 1,
+		.has_dig_out		= 1,
 	},
 	[BOARD_APCI3500] = {
-		.pc_DriverName		= "apci3500",
-		.i_IorangeBase1		= 256,
-		.i_PCIEeprom		= ADDIDATA_NO_EEPROM,
-		.pc_EepromChip		= ADDIDATA_9054,
-		.i_NbrAoChannel		= 4,
-		.i_AoMaxdata		= 4095,
-		.pr_AoRangelist		= &range_apci3XXX_ao,
-		.i_NbrTTLChannel	= 24,
-		.interrupt		= v_APCI3XXX_Interrupt,
-		.reset			= i_APCI3XXX_Reset,
-		.ao_write		= i_APCI3XXX_InsnWriteAnalogOutput,
-		.ttl_config		= i_APCI3XXX_InsnConfigInitTTLIO,
-		.ttl_bits		= i_APCI3XXX_InsnBitsTTLIO,
-		.ttl_read		= i_APCI3XXX_InsnReadTTLIO,
-		.ttl_write		= i_APCI3XXX_InsnWriteTTLIO,
+		.name			= "apci3500",
+		.has_ao			= 1,
+		.has_ttl_io		= 1,
 	},
 };
 
+struct apci3xxx_private {
+	void __iomem *mmio;		/* base for readl()/writel() accesses */
+	unsigned int ai_timer;		/* convert timer count (written at +32) */
+	unsigned char ai_time_base;	/* 0 = ns, 1 = us, 2 = ms (written at +36) */
+};
+
+static irqreturn_t apci3xxx_irq_handler(int irq, void *d)
+{
+	struct comedi_device *dev = d;
+	struct apci3xxx_private *devpriv = dev->private;
+	struct comedi_subdevice *s = dev->read_subdev;
+	unsigned int status;
+	unsigned int val;
+
+	/* Bit 1 of the status register (offset 16) flags a pending interrupt */
+	status = readl(devpriv->mmio + 16);
+	if ((status & 0x2) == 0x2) {
+		/* Reset the interrupt by writing the status value back */
+		writel(status, devpriv->mmio + 16);
+
+		val = readl(devpriv->mmio + 28);	/* the analog value */
+		comedi_buf_put(s->async, val);
+
+		s->async->events |= COMEDI_CB_EOA;	/* ends after this sample */
+		comedi_event(dev, s);
+
+		return IRQ_HANDLED;
+	}
+	return IRQ_NONE;	/* status bit clear: nothing was handled */
+}
+
+static int apci3xxx_ai_started(struct comedi_device *dev)
+{
+	struct apci3xxx_private *devpriv = dev->private;
+
+	if ((readl(devpriv->mmio + 8) & 0x80000) == 0x80000)
+		return 1;	/* bit 19 at offset 8 (the start bit) is set */
+	else
+		return 0;	/* start bit is clear */
+
+}
+
+static int apci3xxx_ai_setup(struct comedi_device *dev, unsigned int chanspec)
+{
+	struct apci3xxx_private *devpriv = dev->private;
+	unsigned int chan = CR_CHAN(chanspec);
+	unsigned int range = CR_RANGE(chanspec);
+	unsigned int aref = CR_AREF(chanspec);
+	unsigned int delay_mode;
+	unsigned int val;
+
+	if (apci3xxx_ai_started(dev))
+		return -EBUSY;	/* refuse to reconfigure while the start bit is set */
+
+	/* Clear the FIFO */
+	writel(0x10000, devpriv->mmio + 12);
+
+	/* Read the delay mode register, dropping bits 0-3 and bit 8 */
+	delay_mode = readl(devpriv->mmio + 4);
+	delay_mode &= 0xfffffef0;
+
+	/* Channel configuration selection (bit 8 clear) */
+	writel(delay_mode, devpriv->mmio + 4);
+
+	/* Configuration word: range[1:0], range >> 2 at bit 6, AREF_DIFF at bit 7 */
+	val = (range & 3) | ((range >> 2) << 6) |
+	      ((aref == AREF_DIFF) << 7);
+	writel(val, devpriv->mmio + 0);
+
+	/* Channel selection (bit 8 set), then write the channel number */
+	writel(delay_mode | 0x100, devpriv->mmio + 4);
+	writel(chan, devpriv->mmio + 0);
+
+	/* Restore delay mode */
+	writel(delay_mode, devpriv->mmio + 4);
+
+	/* Set the number of sequences to 1 */
+	writel(1, devpriv->mmio + 48);
+
+	return 0;
+}
+
+static int apci3xxx_ai_insn_read(struct comedi_device *dev,
+				 struct comedi_subdevice *s,
+				 struct comedi_insn *insn,
+				 unsigned int *data)
+{
+	struct apci3xxx_private *devpriv = dev->private;
+	unsigned int val;
+	int ret;
+	int i;
+
+	ret = apci3xxx_ai_setup(dev, insn->chanspec);
+	if (ret)
+		return ret;	/* setup failed, e.g. -EBUSY */
+
+	for (i = 0; i < insn->n; i++) {
+		/* Start the conversion */
+		writel(0x80000, devpriv->mmio + 8);
+
+		/* Wait for EOS (bit 0 at +20); NOTE(review): no timeout here */
+		do {
+			val = readl(devpriv->mmio + 20);
+			val &= 0x1;
+		} while (!val);
+
+		/* Read the analog value */
+		data[i] = readl(devpriv->mmio + 28);
+	}
+
+	return insn->n;	/* number of samples stored in data[] */
+}
+
+static int apci3xxx_ai_ns_to_timer(struct comedi_device *dev,
+				   unsigned int *ns, int round_mode)
+{
+	const struct apci3xxx_boardinfo *board = comedi_board(dev);
+	struct apci3xxx_private *devpriv = dev->private;
+	unsigned int base;
+	unsigned int timer;
+	int time_base;
+
+	/* Try each time base in turn; time_base: 0 = ns, 1 = us, 2 = ms */
+	for (time_base = 0; time_base < 3; time_base++) {
+		/* skip unsupported time bases */
+		if (!(board->ai_conv_units & (1 << time_base)))
+			continue;
+
+		switch (time_base) {	/* base = ns per tick of this time base */
+		case 0:
+			base = 1;
+			break;
+		case 1:
+			base = 1000;
+			break;
+		case 2:
+			base = 1000000;
+			break;
+		}
+
+		switch (round_mode) {	/* divide *ns by base, rounding as asked */
+		case TRIG_ROUND_NEAREST:
+		default:
+			timer = (*ns + base / 2) / base;
+			break;
+		case TRIG_ROUND_DOWN:
+			timer = *ns / base;
+			break;
+		case TRIG_ROUND_UP:
+			timer = (*ns + base - 1) / base;
+			break;
+		}
+
+		if (timer < 0x10000) {	/* accept the first count below 0x10000 */
+			devpriv->ai_time_base = time_base;
+			devpriv->ai_timer = timer;
+			*ns = timer * base;	/* rounded period in ns, not the index */
+			return 0;
+		}
+	}
+	return -EINVAL;	/* no supported time base gives a count below 0x10000 */
+}
+
+static int apci3xxx_ai_cmdtest(struct comedi_device *dev,
+			       struct comedi_subdevice *s,
+			       struct comedi_cmd *cmd)
+{
+	const struct apci3xxx_boardinfo *board = comedi_board(dev);
+	int err = 0;
+	unsigned int tmp;
+
+	/* Step 1 : check if triggers are trivially valid */
+
+	err |= cfc_check_trigger_src(&cmd->start_src, TRIG_NOW);
+	err |= cfc_check_trigger_src(&cmd->scan_begin_src, TRIG_FOLLOW);
+	err |= cfc_check_trigger_src(&cmd->convert_src, TRIG_TIMER);
+	err |= cfc_check_trigger_src(&cmd->scan_end_src, TRIG_COUNT);
+	err |= cfc_check_trigger_src(&cmd->stop_src, TRIG_COUNT | TRIG_NONE);
+
+	if (err)
+		return 1;
+
+	/* Step 2a : make sure trigger sources are unique */
+
+	err |= cfc_check_trigger_is_unique(cmd->stop_src);
+
+	/* Step 2b : and mutually compatible (nothing is checked here) */
+
+	if (err)
+		return 2;
+
+	/* Step 3: check if arguments are trivially valid */
+
+	err |= cfc_check_trigger_arg_is(&cmd->start_arg, 0);
+	err |= cfc_check_trigger_arg_is(&cmd->scan_begin_arg, 0);
+	err |= cfc_check_trigger_arg_min(&cmd->convert_arg,
+					 board->ai_min_acq_ns);
+	err |= cfc_check_trigger_arg_is(&cmd->scan_end_arg, cmd->chanlist_len);
+
+	if (cmd->stop_src == TRIG_COUNT)
+		err |= cfc_check_trigger_arg_min(&cmd->stop_arg, 1);
+	else	/* TRIG_NONE */
+		err |= cfc_check_trigger_arg_is(&cmd->stop_arg, 0);
+
+	if (err)
+		return 3;
+
+	/* Step 4: fix up any arguments */
+
+	/*
+	 * FIXME: The hardware supports multiple scan modes but the original
+	 * addi-data driver only supported reading a single channel with
+	 * interrupts. Need a proper datasheet to fix this.
+	 *
+	 * The following scan modes are supported by the hardware:
+	 * 1) Single software scan
+	 * 2) Single hardware triggered scan
+	 * 3) Continuous software scan
+	 * 4) Continuous software scan with timer delay
+	 * 5) Continuous hardware triggered scan
+	 * 6) Continuous hardware triggered scan with timer delay
+	 *
+	 * For now, limit the chanlist to a single channel.
+	 */
+	if (cmd->chanlist_len > 1) {
+		cmd->chanlist_len = 1;
+		err |= -EINVAL;
+	}
+
+	tmp = cmd->convert_arg;	/* keep the requested value for comparison */
+	err |= apci3xxx_ai_ns_to_timer(dev, &cmd->convert_arg,
+				       cmd->flags & TRIG_ROUND_MASK);
+	if (tmp != cmd->convert_arg)	/* the conversion changed convert_arg */
+		err |= -EINVAL;
+
+	if (err)
+		return 4;
+
+	return 0;
+}
+
+/*
+ * Start an analog input async command.
+ *
+ * Sets up the first (and only supported) chanlist entry, programs the
+ * time base and timer count previously stored in the private data, then
+ * writes the start command.
+ *
+ * Returns 0 on success or the error returned by apci3xxx_ai_setup().
+ */
+static int apci3xxx_ai_cmd(struct comedi_device *dev,
+			   struct comedi_subdevice *s)
+{
+	struct apci3xxx_private *devpriv = dev->private;
+	struct comedi_cmd *cmd = &s->async->cmd;
+	int ret;
+
+	ret = apci3xxx_ai_setup(dev, cmd->chanlist[0]);
+	if (ret)
+		return ret;
+
+	/* Set the convert timing unit */
+	writel(devpriv->ai_time_base, devpriv->mmio + 36);
+
+	/* Set the convert timing */
+	writel(devpriv->ai_timer, devpriv->mmio + 32);
+
+	/*
+	 * Start the conversion
+	 * NOTE(review): 0x180000 differs from the 0x80000 used by the
+	 * single-shot read; presumably the extra bit selects the
+	 * timer-driven mode -- confirm against the datasheet.
+	 */
+	writel(0x180000, devpriv->mmio + 8);
+
+	return 0;
+}
+
+/*
+ * Cancel hook for the analog input async command.
+ *
+ * Currently a stub: it reports success without touching the hardware.
+ * NOTE(review): the running conversion is not stopped here -- confirm
+ * whether the start command should be cleared.
+ */
+static int apci3xxx_ai_cancel(struct comedi_device *dev,
+			      struct comedi_subdevice *s)
+{
+	return 0;
+}
+
+/*
+ * Analog output write instruction.
+ *
+ * For every sample: select the range, write the value combined with the
+ * channel number, then poll the register at offset 96 until bit 0x100 is
+ * set. Returns the number of samples requested (insn->n).
+ */
+static int apci3xxx_ao_insn_write(struct comedi_device *dev,
+				  struct comedi_subdevice *s,
+				  struct comedi_insn *insn,
+				  unsigned int *data)
+{
+	struct apci3xxx_private *devpriv = dev->private;
+	unsigned int range = CR_RANGE(insn->chanspec);
+	unsigned int chan = CR_CHAN(insn->chanspec);
+	int n;
+
+	for (n = 0; n < insn->n; n++) {
+		/* sample value sits above the channel number in the low byte */
+		unsigned int sample = (data[n] << 8) | chan;
+
+		/* Set the range selection */
+		writel(range, devpriv->mmio + 96);
+
+		/* Write the analog value to the selected channel */
+		writel(sample, devpriv->mmio + 100);
+
+		/* Wait the end of transfer */
+		while (!(readl(devpriv->mmio + 96) & 0x100))
+			;
+	}
+
+	return insn->n;
+}
+
+/*
+ * Digital input read: return the low four bits of the input register
+ * (I/O offset 32) in data[1].
+ */
+static int apci3xxx_di_insn_bits(struct comedi_device *dev,
+				 struct comedi_subdevice *s,
+				 struct comedi_insn *insn,
+				 unsigned int *data)
+{
+	unsigned int inputs = inl(dev->iobase + 32);
+
+	/* only four input channels exist */
+	data[1] = inputs & 0xf;
+
+	return insn->n;
+}
+
+/*
+ * Digital output read/modify/write.
+ *
+ * data[0] is the mask of channels to change and data[1] the new values.
+ * The current output state is read back from the hardware first; the
+ * resulting state is returned in data[1].
+ */
+static int apci3xxx_do_insn_bits(struct comedi_device *dev,
+				 struct comedi_subdevice *s,
+				 struct comedi_insn *insn,
+				 unsigned int *data)
+{
+	unsigned int mask = data[0];
+	unsigned int state = inl(dev->iobase + 48) & 0xf;
+
+	if (mask) {
+		/* replace only the masked bits, then push to the hardware */
+		state = (state & ~mask) | (data[1] & mask);
+		outl(state, dev->iobase + 48);
+	}
+
+	s->state = state;
+	data[1] = state;
+
+	return insn->n;
+}
+
+/*
+ * Configure or query the direction of a TTL digital I/O channel.
+ *
+ * Only port 2 (channels 16-23) is programmable, and only as a whole port.
+ * Requests that address the fixed ports leave s->io_bits unchanged and do
+ * not write the direction register.
+ *
+ * Returns insn->n on success, -EINVAL for an unsupported instruction.
+ */
+static int apci3xxx_dio_insn_config(struct comedi_device *dev,
+				    struct comedi_subdevice *s,
+				    struct comedi_insn *insn,
+				    unsigned int *data)
+{
+	unsigned int chan = CR_CHAN(insn->chanspec);
+	unsigned int mask = 1 << chan;
+	unsigned int bits;
+
+	/*
+	 * Port 0 (channels 0-7) are always inputs
+	 * Port 1 (channels 8-15) are always outputs
+	 * Port 2 (channels 16-23) are programmable i/o
+	 *
+	 * Changing any channel in port 2 changes the entire port.
+	 */
+	if (mask & 0xff0000)
+		bits = 0xff0000;
+	else
+		bits = 0;
+
+	switch (data[0]) {
+	case INSN_CONFIG_DIO_INPUT:
+		s->io_bits &= ~bits;
+		break;
+	case INSN_CONFIG_DIO_OUTPUT:
+		s->io_bits |= bits;
+		break;
+	case INSN_CONFIG_DIO_QUERY:
+		/*
+		 * NOTE(review): for channels 0-15 'bits' is 0, so this always
+		 * reports COMEDI_INPUT, including for the always-output
+		 * port -- confirm whether that is intended.
+		 */
+		data[1] = (s->io_bits & bits) ? COMEDI_OUTPUT : COMEDI_INPUT;
+		return insn->n;
+	default:
+		return -EINVAL;
+	}
+
+	/*
+	 * update port 2 configuration
+	 *
+	 * The port 2 direction lives in bits 16-23 of io_bits, so shift by
+	 * 16 to bring it into the low byte: the register is written with
+	 * 0x00 when the port is an input and 0xff when it is an output.
+	 */
+	if (bits)
+		outl((s->io_bits >> 16) & 0xff, dev->iobase + 224);
+
+	return insn->n;
+}
+
+/*
+ * TTL digital I/O read/modify/write.
+ *
+ * data[0] is the mask of channels to change and data[1] the new values;
+ * channels not configured as outputs in s->io_bits are ignored. The state
+ * of all three ports is then read back and returned in data[1].
+ */
+static int apci3xxx_dio_insn_bits(struct comedi_device *dev,
+				  struct comedi_subdevice *s,
+				  struct comedi_insn *insn,
+				  unsigned int *data)
+{
+	/* only update output channels */
+	unsigned int mask = data[0] & s->io_bits;
+	unsigned int port2_reg;
+	unsigned int val;
+
+	if (mask) {
+		s->state = (s->state & ~mask) | (data[1] & mask);
+
+		if (mask & 0xff)
+			outl(s->state & 0xff, dev->iobase + 80);
+		if (mask & 0xff0000)
+			outl((s->state >> 16) & 0xff, dev->iobase + 112);
+	}
+
+	/* port 2 is read from a different register when set as output */
+	port2_reg = (s->io_bits & 0xff0000) ? 112 : 96;
+
+	val = inl(dev->iobase + 80);
+	val |= inl(dev->iobase + 64) << 8;
+	val |= inl(dev->iobase + port2_reg) << 16;
+
+	data[1] = val;
+
+	return insn->n;
+}
+
+/*
+ * Quiesce the analog input logic: clear the start command, acknowledge
+ * pending interrupt flags, clear the end-of-sequence status and drain the
+ * FIFO. Requires devpriv->mmio to be mapped. Always returns 0.
+ */
+static int apci3xxx_reset(struct comedi_device *dev)
+{
+	struct apci3xxx_private *devpriv = dev->private;
+	unsigned int val;
+	int i;
+
+	/*
+	 * Disable the interrupt
+	 *
+	 * dev->irq is only set once request_irq() has succeeded; when it is
+	 * still 0 the driver owns no interrupt line and must not mask one.
+	 */
+	if (dev->irq)
+		disable_irq(dev->irq);
+
+	/* Clear the start command */
+	writel(0, devpriv->mmio + 8);
+
+	/* Reset the interrupt flags */
+	val = readl(devpriv->mmio + 16);
+	writel(val, devpriv->mmio + 16);
+
+	/* clear the EOS */
+	readl(devpriv->mmio + 20);
+
+	/* Clear the FIFO */
+	for (i = 0; i < 16; i++)
+		val = readl(devpriv->mmio + 28);
+
+	/* Enable the interrupt */
+	if (dev->irq)
+		enable_irq(dev->irq);
+
+	return 0;
+}
+
 static int apci3xxx_auto_attach(struct comedi_device *dev,
 				unsigned long context)
 {
-	const struct addi_board *board = NULL;
+	struct pci_dev *pcidev = comedi_to_pci_dev(dev);
+	const struct apci3xxx_boardinfo *board = NULL;
+	struct apci3xxx_private *devpriv;
+	struct comedi_subdevice *s;
+	int n_subdevices;
+	int subdev;
+	int ret;
 
 	if (context < ARRAY_SIZE(apci3xxx_boardtypes))
 		board = &apci3xxx_boardtypes[context];
 	if (!board)
 		return -ENODEV;
 	dev->board_ptr = board;
+	dev->board_name = board->name;
 
-	return addi_auto_attach(dev, context);
+	devpriv = kzalloc(sizeof(*devpriv), GFP_KERNEL);
+	if (!devpriv)
+		return -ENOMEM;
+	dev->private = devpriv;
+
+	ret = comedi_pci_enable(dev);
+	if (ret)
+		return ret;
+
+	dev->iobase = pci_resource_start(pcidev, 2);
+	devpriv->mmio = pci_ioremap_bar(pcidev, 3);
+
+	if (pcidev->irq > 0) {
+		ret = request_irq(pcidev->irq, apci3xxx_irq_handler,
+				  IRQF_SHARED, dev->board_name, dev);
+		if (ret == 0)
+			dev->irq = pcidev->irq;
+	}
+
+	n_subdevices = (board->ai_n_chan ? 0 : 1) + board->has_ao +
+		       board->has_dig_in + board->has_dig_out +
+		       board->has_ttl_io;
+	ret = comedi_alloc_subdevices(dev, n_subdevices);
+	if (ret)
+		return ret;
+
+	subdev = 0;
+
+	/* Analog Input subdevice */
+	if (board->ai_n_chan) {
+		s = &dev->subdevices[subdev];
+		s->type		= COMEDI_SUBD_AI;
+		s->subdev_flags	= SDF_READABLE | board->ai_subdev_flags;
+		s->n_chan	= board->ai_n_chan;
+		s->maxdata	= board->ai_maxdata;
+		s->len_chanlist	= s->n_chan;
+		s->range_table	= &apci3xxx_ai_range;
+		s->insn_read	= apci3xxx_ai_insn_read;
+		if (dev->irq) {
+			dev->read_subdev = s;
+			s->subdev_flags	|= SDF_CMD_READ;
+			s->do_cmdtest	= apci3xxx_ai_cmdtest;
+			s->do_cmd	= apci3xxx_ai_cmd;
+			s->cancel	= apci3xxx_ai_cancel;
+		}
+
+		subdev++;
+	}
+
+	/* Analog Output subdevice */
+	if (board->has_ao) {
+		s = &dev->subdevices[subdev];
+		s->type		= COMEDI_SUBD_AO;
+		s->subdev_flags	= SDF_WRITEABLE | SDF_GROUND | SDF_COMMON;
+		s->n_chan	= 4;
+		s->maxdata	= 0x0fff;
+		s->range_table	= &apci3xxx_ao_range;
+		s->insn_write	= apci3xxx_ao_insn_write;
+
+		subdev++;
+	}
+
+	/* Digital Input subdevice */
+	if (board->has_dig_in) {
+		s = &dev->subdevices[subdev];
+		s->type		= COMEDI_SUBD_DI;
+		s->subdev_flags	= SDF_READABLE;
+		s->n_chan	= 4;
+		s->maxdata	= 1;
+		s->range_table	= &range_digital;
+		s->insn_bits	= apci3xxx_di_insn_bits;
+
+		subdev++;
+	}
+
+	/* Digital Output subdevice */
+	if (board->has_dig_out) {
+		s = &dev->subdevices[subdev];
+		s->type		= COMEDI_SUBD_DO;
+		s->subdev_flags	= SDF_WRITEABLE;
+		s->n_chan	= 4;
+		s->maxdata	= 1;
+		s->range_table	= &range_digital;
+		s->insn_bits	= apci3xxx_do_insn_bits;
+
+		subdev++;
+	}
+
+	/* TTL Digital I/O subdevice */
+	if (board->has_ttl_io) {
+		s = &dev->subdevices[subdev];
+		s->type		= COMEDI_SUBD_DIO;
+		s->subdev_flags	= SDF_READABLE | SDF_WRITEABLE;
+		s->n_chan	= 24;
+		s->maxdata	= 1;
+		s->io_bits	= 0xff;	/* channels 0-7 are always outputs */
+		s->range_table	= &range_digital;
+		s->insn_config	= apci3xxx_dio_insn_config;
+		s->insn_bits	= apci3xxx_dio_insn_bits;
+
+		subdev++;
+	}
+
+	apci3xxx_reset(dev);
+	return 0;
+}
+
+/*
+ * Detach hook: quiesce the board, then release the interrupt, the MMIO
+ * mapping and the PCI device. Each step is skipped if the matching
+ * resource was never acquired.
+ */
+static void apci3xxx_detach(struct comedi_device *dev)
+{
+	struct apci3xxx_private *devpriv = dev->private;
+
+	if (devpriv) {
+		/* the reset writes MMIO registers, so it needs the mapping */
+		if (dev->iobase && devpriv->mmio)
+			apci3xxx_reset(dev);
+		if (dev->irq)
+			free_irq(dev->irq, dev);
+		if (devpriv->mmio)
+			iounmap(devpriv->mmio);
+	}
+	comedi_pci_disable(dev);
 }
 
+/* comedi driver descriptor: auto-attach only, with a driver-local detach */
 static struct comedi_driver apci3xxx_driver = {
 	.driver_name	= "addi_apci_3xxx",
 	.module		= THIS_MODULE,
 	.auto_attach	= apci3xxx_auto_attach,
-	.detach		= i_ADDI_Detach,
+	.detach		= apci3xxx_detach,
 };
 
 static int apci3xxx_pci_probe(struct pci_dev *dev,

diff --git a/drivers/staging/comedi/drivers/addi_watchdog.c b/drivers/staging/comedi/drivers/addi_watchdog.c
index 1666b5f..7b21acc 100644
--- a/drivers/staging/comedi/drivers/addi_watchdog.c
+++ b/drivers/staging/comedi/drivers/addi_watchdog.c

@@ -16,10 +16,6 @@
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
 #include "../comedidev.h"
@@ -130,14 +126,12 @@
 {
 	struct addi_watchdog_private *spriv;
 
-	spriv = kzalloc(sizeof(*spriv), GFP_KERNEL);
+	spriv = comedi_alloc_spriv(s, sizeof(*spriv));
 	if (!spriv)
 		return -ENOMEM;
 
 	spriv->iobase = iobase;
 
-	s->private	= spriv;
-
 	s->type		= COMEDI_SUBD_TIMER;
 	s->subdev_flags	= SDF_WRITEABLE;
 	s->n_chan	= 1;

diff --git a/drivers/staging/comedi/drivers/adl_pci6208.c b/drivers/staging/comedi/drivers/adl_pci6208.c
index 8a438ff..b5e4e53 100644
--- a/drivers/staging/comedi/drivers/adl_pci6208.c
+++ b/drivers/staging/comedi/drivers/adl_pci6208.c

@@ -20,10 +20,6 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
 /*
 Driver: adl_pci6208

diff --git a/drivers/staging/comedi/drivers/adl_pci7x3x.c b/drivers/staging/comedi/drivers/adl_pci7x3x.c
index e396074..0d9243a 100644
--- a/drivers/staging/comedi/drivers/adl_pci7x3x.c
+++ b/drivers/staging/comedi/drivers/adl_pci7x3x.c

@@ -19,10 +19,6 @@
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
 /*

diff --git a/drivers/staging/comedi/drivers/adl_pci8164.c b/drivers/staging/comedi/drivers/adl_pci8164.c
index b3ec60a..0b591b0 100644
--- a/drivers/staging/comedi/drivers/adl_pci8164.c
+++ b/drivers/staging/comedi/drivers/adl_pci8164.c

@@ -13,10 +13,6 @@
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
 /*

diff --git a/drivers/staging/comedi/drivers/adl_pci9111.c b/drivers/staging/comedi/drivers/adl_pci9111.c
index 6247fdc..af51c74 100644
--- a/drivers/staging/comedi/drivers/adl_pci9111.c
+++ b/drivers/staging/comedi/drivers/adl_pci9111.c

@@ -17,10 +17,6 @@
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
-Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
 
 /*

diff --git a/drivers/staging/comedi/drivers/adq12b.c b/drivers/staging/comedi/drivers/adq12b.c
index 71142e3..d187a7b 100644
--- a/drivers/staging/comedi/drivers/adq12b.c
+++ b/drivers/staging/comedi/drivers/adq12b.c

@@ -14,11 +14,6 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 /*
 Driver: adq12b

diff --git a/drivers/staging/comedi/drivers/adv_pci1723.c b/drivers/staging/comedi/drivers/adv_pci1723.c
index ccc114d..8430a27 100644
--- a/drivers/staging/comedi/drivers/adv_pci1723.c
+++ b/drivers/staging/comedi/drivers/adv_pci1723.c

@@ -1,4 +1,4 @@
-/*******************************************************************************
+/*
    comedi/drivers/pci1723.c
 
    COMEDI - Linux Control and Measurement Device Interface
@@ -13,12 +13,7 @@
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
-*******************************************************************************/
+*/
 /*
 Driver: adv_pci1723
 Description: Advantech PCI-1723

diff --git a/drivers/staging/comedi/drivers/adv_pci1724.c b/drivers/staging/comedi/drivers/adv_pci1724.c
index e60f125..da7462e 100644
--- a/drivers/staging/comedi/drivers/adv_pci1724.c
+++ b/drivers/staging/comedi/drivers/adv_pci1724.c

@@ -17,12 +17,7 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
-************************************************************************/
+*/
 
 /*
 

diff --git a/drivers/staging/comedi/drivers/adv_pci_dio.c b/drivers/staging/comedi/drivers/adv_pci_dio.c
index f70c6747..8e6ec75 100644
--- a/drivers/staging/comedi/drivers/adv_pci_dio.c
+++ b/drivers/staging/comedi/drivers/adv_pci_dio.c

@@ -1173,19 +1173,11 @@
+/*
+ * Detach hook: reset the board if the private data marks it valid, then
+ * disable the PCI device. Subdevice private data is no longer freed here.
+ */
 static void pci_dio_detach(struct comedi_device *dev)
 {
 	struct pci_dio_private *devpriv = dev->private;
-	struct comedi_subdevice *s;
-	int i;
 
 	if (devpriv) {
 		if (devpriv->valid)
 			pci_dio_reset(dev);
 	}
-	for (i = 0; i < dev->n_subdevices; i++) {
-		s = &dev->subdevices[i];
-		if (s->type == COMEDI_SUBD_DIO)
-			comedi_spriv_free(dev, i);
-		s->private = NULL; /* some private data is static */
-	}
 	comedi_pci_disable(dev);
 }
 

diff --git a/drivers/staging/comedi/drivers/aio_aio12_8.c b/drivers/staging/comedi/drivers/aio_aio12_8.c
index e2dc08a..279dfe8 100644
--- a/drivers/staging/comedi/drivers/aio_aio12_8.c
+++ b/drivers/staging/comedi/drivers/aio_aio12_8.c

@@ -14,10 +14,6 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
 
 /*
@@ -259,17 +255,11 @@
 	return 0;
 }
 
-static void aio_aio12_8_detach(struct comedi_device *dev)
-{
-	comedi_spriv_free(dev, 2);
-	comedi_legacy_detach(dev);
-}
-
 static struct comedi_driver aio_aio12_8_driver = {
 	.driver_name	= "aio_aio12_8",
 	.module		= THIS_MODULE,
 	.attach		= aio_aio12_8_attach,
-	.detach		= aio_aio12_8_detach,
+	.detach		= comedi_legacy_detach,
 	.board_name	= &board_types[0].name,
 	.num_names	= ARRAY_SIZE(board_types),
 	.offset		= sizeof(struct aio12_8_boardtype),

diff --git a/drivers/staging/comedi/drivers/aio_iiro_16.c b/drivers/staging/comedi/drivers/aio_iiro_16.c
index 126854c..029834d 100644
--- a/drivers/staging/comedi/drivers/aio_iiro_16.c
+++ b/drivers/staging/comedi/drivers/aio_iiro_16.c

@@ -14,10 +14,6 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
 
 /*

diff --git a/drivers/staging/comedi/drivers/amplc_dio200.c b/drivers/staging/comedi/drivers/amplc_dio200.c
index 297750b..e247810 100644
--- a/drivers/staging/comedi/drivers/amplc_dio200.c
+++ b/drivers/staging/comedi/drivers/amplc_dio200.c

@@ -17,11 +17,6 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 /*
  * Driver: amplc_dio200

diff --git a/drivers/staging/comedi/drivers/amplc_dio200.h b/drivers/staging/comedi/drivers/amplc_dio200.h
index cf2e726..43160b9 100644
--- a/drivers/staging/comedi/drivers/amplc_dio200.h
+++ b/drivers/staging/comedi/drivers/amplc_dio200.h

@@ -18,11 +18,6 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 
 #ifndef AMPLC_DIO200_H_INCLUDED

diff --git a/drivers/staging/comedi/drivers/amplc_dio200_common.c b/drivers/staging/comedi/drivers/amplc_dio200_common.c
index 3403e5c..649fc69 100644
--- a/drivers/staging/comedi/drivers/amplc_dio200_common.c
+++ b/drivers/staging/comedi/drivers/amplc_dio200_common.c

@@ -17,11 +17,6 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 
 #include <linux/interrupt.h>
@@ -561,7 +556,7 @@
 	const struct dio200_layout *layout = dio200_dev_layout(dev);
 	struct dio200_subdev_intr *subpriv;
 
-	subpriv = kzalloc(sizeof(*subpriv), GFP_KERNEL);
+	subpriv = comedi_alloc_spriv(s, sizeof(*subpriv));
 	if (!subpriv)
 		return -ENOMEM;
 
@@ -573,7 +568,6 @@
 		/* Disable interrupt sources. */
 		dio200_write8(dev, subpriv->ofs, 0);
 
-	s->private = subpriv;
 	s->type = COMEDI_SUBD_DI;
 	s->subdev_flags = SDF_READABLE | SDF_CMD_READ;
 	if (layout->has_int_sce) {
@@ -888,11 +882,10 @@
 	struct dio200_subdev_8254 *subpriv;
 	unsigned int chan;
 
-	subpriv = kzalloc(sizeof(*subpriv), GFP_KERNEL);
+	subpriv = comedi_alloc_spriv(s, sizeof(*subpriv));
 	if (!subpriv)
 		return -ENOMEM;
 
-	s->private = subpriv;
 	s->type = COMEDI_SUBD_COUNTER;
 	s->subdev_flags = SDF_WRITABLE | SDF_READABLE;
 	s->n_chan = 3;
@@ -1024,11 +1017,12 @@
 {
 	struct dio200_subdev_8255 *subpriv;
 
-	subpriv = kzalloc(sizeof(*subpriv), GFP_KERNEL);
+	subpriv = comedi_alloc_spriv(s, sizeof(*subpriv));
 	if (!subpriv)
 		return -ENOMEM;
+
 	subpriv->ofs = offset;
-	s->private = subpriv;
+
 	s->type = COMEDI_SUBD_DIO;
 	s->subdev_flags = SDF_READABLE | SDF_WRITABLE;
 	s->n_chan = 24;
@@ -1230,28 +1224,11 @@
 {
 	const struct dio200_board *thisboard = comedi_board(dev);
 	struct dio200_private *devpriv = dev->private;
-	const struct dio200_layout *layout;
-	unsigned n;
 
 	if (!thisboard || !devpriv)
 		return;
 	if (dev->irq)
 		free_irq(dev->irq, dev);
-	if (dev->subdevices) {
-		layout = dio200_board_layout(thisboard);
-		for (n = 0; n < dev->n_subdevices; n++) {
-			switch (layout->sdtype[n]) {
-			case sd_8254:
-			case sd_8255:
-			case sd_intr:
-				comedi_spriv_free(dev, n);
-				break;
-			case sd_timer:
-			default:
-				break;
-			}
-		}
-	}
 }
 EXPORT_SYMBOL_GPL(amplc_dio200_common_detach);
 

diff --git a/drivers/staging/comedi/drivers/amplc_dio200_pci.c b/drivers/staging/comedi/drivers/amplc_dio200_pci.c
index 4be44e8..d7d9f5c 100644
--- a/drivers/staging/comedi/drivers/amplc_dio200_pci.c
+++ b/drivers/staging/comedi/drivers/amplc_dio200_pci.c

@@ -16,11 +16,6 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 /*
  * Driver: amplc_dio200_pci

diff --git a/drivers/staging/comedi/drivers/amplc_pc236.c b/drivers/staging/comedi/drivers/amplc_pc236.c
index 115ecd5..4e889b8 100644
--- a/drivers/staging/comedi/drivers/amplc_pc236.c
+++ b/drivers/staging/comedi/drivers/amplc_pc236.c

@@ -16,11 +16,6 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 /*
 Driver: amplc_pc236
@@ -543,7 +538,6 @@
 		return;
 	if (dev->iobase)
 		pc236_intr_disable(dev);
-	comedi_spriv_free(dev, 0);
 	if (is_isa_board(thisboard)) {
 		comedi_legacy_detach(dev);
 	} else if (is_pci_board(thisboard)) {

diff --git a/drivers/staging/comedi/drivers/amplc_pc263.c b/drivers/staging/comedi/drivers/amplc_pc263.c
index 94a752d8..6546095 100644
--- a/drivers/staging/comedi/drivers/amplc_pc263.c
+++ b/drivers/staging/comedi/drivers/amplc_pc263.c

@@ -16,11 +16,6 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 /*
 Driver: amplc_pc263

diff --git a/drivers/staging/comedi/drivers/amplc_pci224.c b/drivers/staging/comedi/drivers/amplc_pci224.c
index 4d7eab9..f1e36f0 100644
--- a/drivers/staging/comedi/drivers/amplc_pci224.c
+++ b/drivers/staging/comedi/drivers/amplc_pci224.c

@@ -16,11 +16,6 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 /*
 Driver: amplc_pci224

diff --git a/drivers/staging/comedi/drivers/amplc_pci230.c b/drivers/staging/comedi/drivers/amplc_pci230.c
index 49200fb..846d644 100644
--- a/drivers/staging/comedi/drivers/amplc_pci230.c
+++ b/drivers/staging/comedi/drivers/amplc_pci230.c

@@ -16,10 +16,6 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
   */
 /*
 Driver: amplc_pci230
@@ -2834,7 +2830,6 @@
 {
 	struct pci_dev *pcidev = comedi_to_pci_dev(dev);
 
-	comedi_spriv_free(dev, 2);
 	if (dev->irq)
 		free_irq(dev->irq, dev);
 	comedi_pci_disable(dev);

diff --git a/drivers/staging/comedi/drivers/amplc_pci263.c b/drivers/staging/comedi/drivers/amplc_pci263.c
index 8b57533..4da900c 100644
--- a/drivers/staging/comedi/drivers/amplc_pci263.c
+++ b/drivers/staging/comedi/drivers/amplc_pci263.c

@@ -16,11 +16,6 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 /*
 Driver: amplc_pci263

diff --git a/drivers/staging/comedi/drivers/c6xdigio.c b/drivers/staging/comedi/drivers/c6xdigio.c
index 92376dc..929218a3 100644
--- a/drivers/staging/comedi/drivers/c6xdigio.c
+++ b/drivers/staging/comedi/drivers/c6xdigio.c

@@ -16,11 +16,6 @@
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
  */
 /*
 Driver: c6xdigio

diff --git a/drivers/staging/comedi/drivers/cb_das16_cs.c b/drivers/staging/comedi/drivers/cb_das16_cs.c
index f874fff..ae9a208 100644
--- a/drivers/staging/comedi/drivers/cb_das16_cs.c
+++ b/drivers/staging/comedi/drivers/cb_das16_cs.c

@@ -15,10 +15,6 @@
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
 
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
     PCMCIA support code for this driver is adapted from the dummy_cs.c
     driver of the Linux PCMCIA Card Services package.
 

diff --git a/drivers/staging/comedi/drivers/cb_pcidas.c b/drivers/staging/comedi/drivers/cb_pcidas.c
index 53dd298..58bca18 100644
--- a/drivers/staging/comedi/drivers/cb_pcidas.c
+++ b/drivers/staging/comedi/drivers/cb_pcidas.c

@@ -19,12 +19,6 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
-************************************************************************
 */
 /*
 Driver: cb_pcidas
@@ -1608,7 +1602,6 @@
 	}
 	if (dev->irq)
 		free_irq(dev->irq, dev);
-	comedi_spriv_free(dev, 2);
 	comedi_pci_disable(dev);
 }
 

diff --git a/drivers/staging/comedi/drivers/cb_pcidas64.c b/drivers/staging/comedi/drivers/cb_pcidas64.c
index c3e5495..43c0bf5 100644
--- a/drivers/staging/comedi/drivers/cb_pcidas64.c
+++ b/drivers/staging/comedi/drivers/cb_pcidas64.c

@@ -28,12 +28,7 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
-************************************************************************/
+*/
 
 /*
  * Driver: cb_pcidas64
@@ -4163,7 +4158,6 @@
 					devpriv->ao_dma_desc_bus_addr);
 		}
 	}
-	comedi_spriv_free(dev, 4);
 	comedi_pci_disable(dev);
 }
 

diff --git a/drivers/staging/comedi/drivers/cb_pcidda.c b/drivers/staging/comedi/drivers/cb_pcidda.c
index f9b4598..2d3e920 100644
--- a/drivers/staging/comedi/drivers/cb_pcidda.c
+++ b/drivers/staging/comedi/drivers/cb_pcidda.c

@@ -17,10 +17,6 @@
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
 /*
@@ -397,18 +393,11 @@
 	return 0;
 }
 
-static void cb_pcidda_detach(struct comedi_device *dev)
-{
-	comedi_spriv_free(dev, 1);
-	comedi_spriv_free(dev, 2);
-	comedi_pci_disable(dev);
-}
-
+/* comedi driver descriptor; detach only needs to disable the PCI device */
 static struct comedi_driver cb_pcidda_driver = {
 	.driver_name	= "cb_pcidda",
 	.module		= THIS_MODULE,
 	.auto_attach	= cb_pcidda_auto_attach,
-	.detach		= cb_pcidda_detach,
+	.detach		= comedi_pci_disable,
 };
 
 static int cb_pcidda_pci_probe(struct pci_dev *dev,

diff --git a/drivers/staging/comedi/drivers/cb_pcimdas.c b/drivers/staging/comedi/drivers/cb_pcimdas.c
index 29813c9..8b5c198 100644
--- a/drivers/staging/comedi/drivers/cb_pcimdas.c
+++ b/drivers/staging/comedi/drivers/cb_pcimdas.c

@@ -14,11 +14,6 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 /*
 Driver: cb_pcimdas

diff --git a/drivers/staging/comedi/drivers/cb_pcimdda.c b/drivers/staging/comedi/drivers/cb_pcimdda.c
index 88f03ae..406cba8c 100644
--- a/drivers/staging/comedi/drivers/cb_pcimdda.c
+++ b/drivers/staging/comedi/drivers/cb_pcimdda.c

@@ -15,11 +15,6 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 /*
 Driver: cb_pcimdda
@@ -197,17 +192,11 @@
 	return 1;
 }
 
-static void cb_pcimdda_detach(struct comedi_device *dev)
-{
-	comedi_spriv_free(dev, 1);
-	comedi_pci_disable(dev);
-}
-
 static struct comedi_driver cb_pcimdda_driver = {
 	.driver_name	= "cb_pcimdda",
 	.module		= THIS_MODULE,
 	.auto_attach	= cb_pcimdda_auto_attach,
-	.detach		= cb_pcimdda_detach,
+	.detach		= comedi_pci_disable,
 };
 
 static int cb_pcimdda_pci_probe(struct pci_dev *dev,

diff --git a/drivers/staging/comedi/drivers/comedi_bond.c b/drivers/staging/comedi/drivers/comedi_bond.c
index 1bb5381..1a51866 100644
--- a/drivers/staging/comedi/drivers/comedi_bond.c
+++ b/drivers/staging/comedi/drivers/comedi_bond.c

@@ -15,11 +15,6 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 /*
 Driver: comedi_bond

diff --git a/drivers/staging/comedi/drivers/comedi_fc.c b/drivers/staging/comedi/drivers/comedi_fc.c
index 37dc796..b3d89c8 100644
--- a/drivers/staging/comedi/drivers/comedi_fc.c
+++ b/drivers/staging/comedi/drivers/comedi_fc.c

@@ -17,12 +17,7 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
-************************************************************************/
+*/
 
 #include "../comedidev.h"
 

diff --git a/drivers/staging/comedi/drivers/comedi_fc.h b/drivers/staging/comedi/drivers/comedi_fc.h
index 31afab7..a4dea7c 100644
--- a/drivers/staging/comedi/drivers/comedi_fc.h
+++ b/drivers/staging/comedi/drivers/comedi_fc.h

@@ -17,12 +17,7 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
-************************************************************************/
+*/
 
 #ifndef _COMEDI_FC_H
 #define _COMEDI_FC_H

diff --git a/drivers/staging/comedi/drivers/comedi_parport.c b/drivers/staging/comedi/drivers/comedi_parport.c
index 3e061cc..772a8f5 100644
--- a/drivers/staging/comedi/drivers/comedi_parport.c
+++ b/drivers/staging/comedi/drivers/comedi_parport.c

@@ -14,11 +14,6 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 /*
 Driver: comedi_parport

diff --git a/drivers/staging/comedi/drivers/comedi_test.c b/drivers/staging/comedi/drivers/comedi_test.c
index c1d8e86..907e7a3 100644
--- a/drivers/staging/comedi/drivers/comedi_test.c
+++ b/drivers/staging/comedi/drivers/comedi_test.c

@@ -21,12 +21,7 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
-************************************************************************/
+*/
 /*
 Driver: comedi_test
 Description: generates fake waveforms

diff --git a/drivers/staging/comedi/drivers/contec_pci_dio.c b/drivers/staging/comedi/drivers/contec_pci_dio.c
index f2230bf..0fb9027 100644
--- a/drivers/staging/comedi/drivers/contec_pci_dio.c
+++ b/drivers/staging/comedi/drivers/contec_pci_dio.c

@@ -13,11 +13,6 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 /*
 Driver: contec_pci_dio

diff --git a/drivers/staging/comedi/drivers/daqboard2000.c b/drivers/staging/comedi/drivers/daqboard2000.c
index b87f95c..44c912b 100644
--- a/drivers/staging/comedi/drivers/daqboard2000.c
+++ b/drivers/staging/comedi/drivers/daqboard2000.c

@@ -14,11 +14,6 @@
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
  */
 /*
 Driver: daqboard2000
@@ -110,7 +105,6 @@
 #include <linux/pci.h>
 #include <linux/delay.h>
 #include <linux/interrupt.h>
-#include <linux/firmware.h>
 
 #include "../comedidev.h"
 
@@ -524,7 +518,8 @@
 }
 
 static int initialize_daqboard2000(struct comedi_device *dev,
-				   const u8 *cpld_array, size_t len)
+				   const u8 *cpld_array, size_t len,
+				   unsigned long context)
 {
 	struct daqboard2000_private *devpriv = dev->private;
 	int result = -EIO;
@@ -565,22 +560,6 @@
 	return result;
 }
 
-static int daqboard2000_upload_firmware(struct comedi_device *dev)
-{
-	struct pci_dev *pcidev = comedi_to_pci_dev(dev);
-	const struct firmware *fw;
-	int ret;
-
-	ret = request_firmware(&fw, DAQBOARD2000_FIRMWARE, &pcidev->dev);
-	if (ret)
-		return ret;
-
-	ret = initialize_daqboard2000(dev, fw->data, fw->size);
-	release_firmware(fw);
-
-	return ret;
-}
-
 static void daqboard2000_adcStopDmaTransfer(struct comedi_device *dev)
 {
 }
@@ -724,7 +703,9 @@
 
 	readl(devpriv->plx + 0x6c);
 
-	result = daqboard2000_upload_firmware(dev);
+	result = comedi_load_firmware(dev, &comedi_to_pci_dev(dev)->dev,
+				      DAQBOARD2000_FIRMWARE,
+				      initialize_daqboard2000, 0);
 	if (result < 0)
 		return result;
 
@@ -766,7 +747,6 @@
 {
 	struct daqboard2000_private *devpriv = dev->private;
 
-	comedi_spriv_free(dev, 2);
 	if (dev->irq)
 		free_irq(dev->irq, dev);
 	if (devpriv) {

diff --git a/drivers/staging/comedi/drivers/das08.c b/drivers/staging/comedi/drivers/das08.c
index ba12c1d..2e7e3e2 100644
--- a/drivers/staging/comedi/drivers/das08.c
+++ b/drivers/staging/comedi/drivers/das08.c

@@ -16,12 +16,6 @@
  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with this program; if not, write to the Free Software
- *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- *****************************************************************
  */
 
 /*
@@ -566,12 +560,6 @@
 }
 EXPORT_SYMBOL_GPL(das08_common_attach);
 
-void das08_common_detach(struct comedi_device *dev)
-{
-	comedi_spriv_free(dev, 4);
-}
-EXPORT_SYMBOL_GPL(das08_common_detach);
-
 static int __init das08_init(void)
 {
 	return 0;

diff --git a/drivers/staging/comedi/drivers/das08.h b/drivers/staging/comedi/drivers/das08.h
index 89bb8d6..cce1b58 100644
--- a/drivers/staging/comedi/drivers/das08.h
+++ b/drivers/staging/comedi/drivers/das08.h

@@ -14,11 +14,6 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 
 #ifndef _DAS08_H
@@ -52,6 +47,5 @@
 };
 
 int das08_common_attach(struct comedi_device *dev, unsigned long iobase);
-void das08_common_detach(struct comedi_device *dev);
 
 #endif /* _DAS08_H */

diff --git a/drivers/staging/comedi/drivers/das08_cs.c b/drivers/staging/comedi/drivers/das08_cs.c
index d9f3e92..885fb17 100644
--- a/drivers/staging/comedi/drivers/das08_cs.c
+++ b/drivers/staging/comedi/drivers/das08_cs.c

@@ -16,19 +16,12 @@
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
 
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
     PCMCIA support code for this driver is adapted from the dummy_cs.c
     driver of the Linux PCMCIA Card Services package.
 
     The initial developer of the original code is David A. Hinds
     <dahinds@users.sourceforge.net>.  Portions created by David A. Hinds
     are Copyright (C) 1999 David A. Hinds.  All Rights Reserved.
-
-*****************************************************************
-
 */
 /*
 Driver: das08_cs
@@ -93,17 +86,11 @@
 	return das08_common_attach(dev, iobase);
 }
 
-static void das08_cs_detach(struct comedi_device *dev)
-{
-	das08_common_detach(dev);
-	comedi_pcmcia_disable(dev);
-}
-
 static struct comedi_driver driver_das08_cs = {
 	.driver_name	= "das08_cs",
 	.module		= THIS_MODULE,
 	.auto_attach	= das08_cs_auto_attach,
-	.detach		= das08_cs_detach,
+	.detach		= comedi_pcmcia_disable,
 };
 
 static int das08_pcmcia_attach(struct pcmcia_device *link)

diff --git a/drivers/staging/comedi/drivers/das08_isa.c b/drivers/staging/comedi/drivers/das08_isa.c
index f09f696..21a94389 100644
--- a/drivers/staging/comedi/drivers/das08_isa.c
+++ b/drivers/staging/comedi/drivers/das08_isa.c

@@ -16,10 +16,6 @@
  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with this program; if not, write to the Free Software
- *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
 /*
@@ -193,17 +189,11 @@
 	return das08_common_attach(dev, dev->iobase);
 }
 
-static void das08_isa_detach(struct comedi_device *dev)
-{
-	das08_common_detach(dev);
-	comedi_legacy_detach(dev);
-}
-
 static struct comedi_driver das08_isa_driver = {
 	.driver_name	= "isa-das08",
 	.module		= THIS_MODULE,
 	.attach		= das08_isa_attach,
-	.detach		= das08_isa_detach,
+	.detach		= comedi_legacy_detach,
 	.board_name	= &das08_isa_boards[0].name,
 	.num_names	= ARRAY_SIZE(das08_isa_boards),
 	.offset		= sizeof(das08_isa_boards[0]),

diff --git a/drivers/staging/comedi/drivers/das08_pci.c b/drivers/staging/comedi/drivers/das08_pci.c
index 53fa943..9c5d234 100644
--- a/drivers/staging/comedi/drivers/das08_pci.c
+++ b/drivers/staging/comedi/drivers/das08_pci.c

@@ -16,10 +16,6 @@
  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with this program; if not, write to the Free Software
- *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
 /*
@@ -79,17 +75,11 @@
 	return das08_common_attach(dev, dev->iobase);
 }
 
-static void das08_pci_detach(struct comedi_device *dev)
-{
-	das08_common_detach(dev);
-	comedi_pci_disable(dev);
-}
-
 static struct comedi_driver das08_pci_comedi_driver = {
 	.driver_name	= "pci-das08",
 	.module		= THIS_MODULE,
 	.auto_attach	= das08_pci_auto_attach,
-	.detach		= das08_pci_detach,
+	.detach		= comedi_pci_disable,
 };
 
 static int das08_pci_probe(struct pci_dev *dev,

diff --git a/drivers/staging/comedi/drivers/das16.c b/drivers/staging/comedi/drivers/das16.c
index 762b5a6..dbec3ba 100644
--- a/drivers/staging/comedi/drivers/das16.c
+++ b/drivers/staging/comedi/drivers/das16.c

@@ -16,12 +16,6 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
-************************************************************************
 */
 /*
 Driver: das16
@@ -1339,7 +1333,6 @@
 	struct das16_private_struct *devpriv = dev->private;
 
 	das16_reset(dev);
-	comedi_spriv_free(dev, 4);
 	if (devpriv) {
 		int i;
 		for (i = 0; i < 2; i++) {

diff --git a/drivers/staging/comedi/drivers/das16m1.c b/drivers/staging/comedi/drivers/das16m1.c
index 9cb9c3b..0b33808 100644
--- a/drivers/staging/comedi/drivers/das16m1.c
+++ b/drivers/staging/comedi/drivers/das16m1.c

@@ -17,12 +17,6 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
-************************************************************************
 */
 /*
 Driver: das16m1
@@ -672,7 +666,6 @@
 {
 	struct das16m1_private_struct *devpriv = dev->private;
 
-	comedi_spriv_free(dev, 3);
 	if (devpriv && devpriv->extra_iobase)
 		release_region(devpriv->extra_iobase, DAS16M1_SIZE2);
 	comedi_legacy_detach(dev);

diff --git a/drivers/staging/comedi/drivers/das1800.c b/drivers/staging/comedi/drivers/das1800.c
index abf7638..23b4a66 100644
--- a/drivers/staging/comedi/drivers/das1800.c
+++ b/drivers/staging/comedi/drivers/das1800.c

@@ -15,12 +15,6 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
-************************************************************************
 */
 /*
 Driver: das1800

diff --git a/drivers/staging/comedi/drivers/das6402.c b/drivers/staging/comedi/drivers/das6402.c
index 11424fb..f053077 100644
--- a/drivers/staging/comedi/drivers/das6402.c
+++ b/drivers/staging/comedi/drivers/das6402.c

@@ -22,11 +22,6 @@
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
  */
 /*
 Driver: das6402

diff --git a/drivers/staging/comedi/drivers/das800.c b/drivers/staging/comedi/drivers/das800.c
index 9ce6cbc..091cd91 100644
--- a/drivers/staging/comedi/drivers/das800.c
+++ b/drivers/staging/comedi/drivers/das800.c

@@ -15,12 +15,6 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
-************************************************************************
 */
 /*
 Driver: das800

diff --git a/drivers/staging/comedi/drivers/dmm32at.c b/drivers/staging/comedi/drivers/dmm32at.c
index 6c85dd2..e29847d 100644
--- a/drivers/staging/comedi/drivers/dmm32at.c
+++ b/drivers/staging/comedi/drivers/dmm32at.c

@@ -14,11 +14,6 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 /*
 Driver: dmm32at

diff --git a/drivers/staging/comedi/drivers/dt2811.c b/drivers/staging/comedi/drivers/dt2811.c
index 8757b54..5348cda 100644
--- a/drivers/staging/comedi/drivers/dt2811.c
+++ b/drivers/staging/comedi/drivers/dt2811.c

@@ -18,10 +18,6 @@
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 /*
 Driver: dt2811

diff --git a/drivers/staging/comedi/drivers/dt2814.c b/drivers/staging/comedi/drivers/dt2814.c
index 7c95b3b..87e9749 100644
--- a/drivers/staging/comedi/drivers/dt2814.c
+++ b/drivers/staging/comedi/drivers/dt2814.c

@@ -14,11 +14,6 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 /*
 Driver: dt2814

diff --git a/drivers/staging/comedi/drivers/dt2815.c b/drivers/staging/comedi/drivers/dt2815.c
index b24e876..0fcd4fe 100644
--- a/drivers/staging/comedi/drivers/dt2815.c
+++ b/drivers/staging/comedi/drivers/dt2815.c

@@ -14,11 +14,6 @@
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
  */
 /*
 Driver: dt2815

diff --git a/drivers/staging/comedi/drivers/dt2817.c b/drivers/staging/comedi/drivers/dt2817.c
index b5c8e82..2f46be7 100644
--- a/drivers/staging/comedi/drivers/dt2817.c
+++ b/drivers/staging/comedi/drivers/dt2817.c

@@ -14,11 +14,6 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 /*
 Driver: dt2817

diff --git a/drivers/staging/comedi/drivers/dt282x.c b/drivers/staging/comedi/drivers/dt282x.c
index 90f2de9..c1950e3 100644
--- a/drivers/staging/comedi/drivers/dt282x.c
+++ b/drivers/staging/comedi/drivers/dt282x.c

@@ -14,11 +14,6 @@
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
  */
 /*
 Driver: dt282x

diff --git a/drivers/staging/comedi/drivers/dt3000.c b/drivers/staging/comedi/drivers/dt3000.c
index 7e03929..01a2f88 100644
--- a/drivers/staging/comedi/drivers/dt3000.c
+++ b/drivers/staging/comedi/drivers/dt3000.c

@@ -14,11 +14,6 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 /*
 Driver: dt3000

diff --git a/drivers/staging/comedi/drivers/dt9812.c b/drivers/staging/comedi/drivers/dt9812.c
index 81eb5ed..6c60949 100644
--- a/drivers/staging/comedi/drivers/dt9812.c
+++ b/drivers/staging/comedi/drivers/dt9812.c

@@ -15,11 +15,6 @@
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
  */
 
 /*
@@ -43,14 +38,11 @@
  *      says P1).
  */
 
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
 #include <linux/kernel.h>
 #include <linux/errno.h>
 #include <linux/init.h>
 #include <linux/slab.h>
 #include <linux/module.h>
-#include <linux/kref.h>
 #include <linux/uaccess.h>
 #include <linux/usb.h>
 
@@ -60,6 +52,9 @@
 #define DT9812_MAX_WRITE_CMD_PIPE_SIZE	32
 #define DT9812_MAX_READ_CMD_PIPE_SIZE	32
 
+/* usb_bulk_msg() timeout in milliseconds */
+#define DT9812_USB_TIMEOUT		1000
+
 /*
  * See Silican Laboratories C8051F020/1/2/3 manual
  */
@@ -242,87 +237,25 @@
 		struct dt9812_write_multi write_multi_info;
 		struct dt9812_rmw_multi rmw_multi_info;
 	} u;
-#if 0
-	WRITE_BYTE_INFO WriteByteInfo;
-	READ_BYTE_INFO ReadByteInfo;
-	WRITE_MULTI_INFO WriteMultiInfo;
-	READ_MULTI_INFO ReadMultiInfo;
-	RMW_BYTE_INFO RMWByteInfo;
-	RMW_MULTI_INFO RMWMultiInfo;
-	DAC_THRESHOLD_INFO DacThresholdInfo;
-	INT_ON_CHANGE_MASK_INFO IntOnChangeMaskInfo;
-	CGL_INFO CglInfo;
-	SUBSYSTEM_INFO SubsystemInfo;
-	CAL_POT_CMD CalPotCmd;
-	WRITE_DEV_BYTE_INFO WriteDevByteInfo;
-	READ_DEV_BYTE_INFO ReadDevByteInfo;
-	WRITE_DEV_MULTI_INFO WriteDevMultiInfo;
-	READ_DEV_MULTI_INFO ReadDevMultiInfo;
-	READ_SINGLE_VALUE_INFO ReadSingleValueInfo;
-	WRITE_SINGLE_VALUE_INFO WriteSingleValueInfo;
-#endif
 };
 
-#define DT9812_NUM_SLOTS	16
-
-static DEFINE_SEMAPHORE(dt9812_mutex);
-
-static const struct usb_device_id dt9812_table[] = {
-	{USB_DEVICE(0x0867, 0x9812)},
-	{}			/* Terminating entry */
-};
-
-MODULE_DEVICE_TABLE(usb, dt9812_table);
-
-struct usb_dt9812 {
-	struct slot_dt9812 *slot;
-	struct usb_device *udev;
-	struct usb_interface *interface;
-	u16 vendor;
-	u16 product;
-	u16 device;
-	u32 serial;
+struct dt9812_private {
+	struct semaphore sem;
 	struct {
 		__u8 addr;
 		size_t size;
-	} message_pipe, command_write, command_read, write_stream, read_stream;
-	struct kref kref;
-	u16 analog_out_shadow[2];
-	u8 digital_out_shadow;
+	} cmd_wr, cmd_rd;
+	u16 device;
+	u16 ao_shadow[2];
 };
 
-struct comedi_dt9812 {
-	struct slot_dt9812 *slot;
-	u32 serial;
-};
-
-struct slot_dt9812 {
-	struct semaphore mutex;
-	u32 serial;
-	struct usb_dt9812 *usb;
-	struct comedi_dt9812 *comedi;
-};
-
-static struct slot_dt9812 dt9812[DT9812_NUM_SLOTS];
-
-static inline struct usb_dt9812 *to_dt9812_dev(struct kref *d)
+static int dt9812_read_info(struct comedi_device *dev,
+			    int offset, void *buf, size_t buf_size)
 {
-	return container_of(d, struct usb_dt9812, kref);
-}
-
-static void dt9812_delete(struct kref *kref)
-{
-	struct usb_dt9812 *dev = to_dt9812_dev(kref);
-
-	usb_put_dev(dev->udev);
-	kfree(dev);
-}
-
-static int dt9812_read_info(struct usb_dt9812 *dev, int offset, void *buf,
-			    size_t buf_size)
-{
+	struct usb_device *usb = comedi_to_usb_dev(dev);
+	struct dt9812_private *devpriv = dev->private;
 	struct dt9812_usb_cmd cmd;
-	int count, retval;
+	int count, ret;
 
 	cmd.cmd = cpu_to_le32(DT9812_R_FLASH_DATA);
 	cmd.u.flash_data_info.address =
@@ -330,25 +263,23 @@
 	cmd.u.flash_data_info.numbytes = cpu_to_le16(buf_size);
 
 	/* DT9812 only responds to 32 byte writes!! */
-	count = 32;
-	retval = usb_bulk_msg(dev->udev,
-			      usb_sndbulkpipe(dev->udev,
-					      dev->command_write.addr),
-			      &cmd, 32, &count, HZ * 1);
-	if (retval)
-		return retval;
-	retval = usb_bulk_msg(dev->udev,
-			      usb_rcvbulkpipe(dev->udev,
-					      dev->command_read.addr),
-			      buf, buf_size, &count, HZ * 1);
-	return retval;
+	ret = usb_bulk_msg(usb, usb_sndbulkpipe(usb, devpriv->cmd_wr.addr),
+			   &cmd, 32, &count, DT9812_USB_TIMEOUT);
+	if (ret)
+		return ret;
+
+	return usb_bulk_msg(usb, usb_rcvbulkpipe(usb, devpriv->cmd_rd.addr),
+			    buf, buf_size, &count, DT9812_USB_TIMEOUT);
 }
 
-static int dt9812_read_multiple_registers(struct usb_dt9812 *dev, int reg_count,
-					  u8 *address, u8 *value)
+static int dt9812_read_multiple_registers(struct comedi_device *dev,
+					  int reg_count, u8 *address,
+					  u8 *value)
 {
+	struct usb_device *usb = comedi_to_usb_dev(dev);
+	struct dt9812_private *devpriv = dev->private;
 	struct dt9812_usb_cmd cmd;
-	int i, count, retval;
+	int i, count, ret;
 
 	cmd.cmd = cpu_to_le32(DT9812_R_MULTI_BYTE_REG);
 	cmd.u.read_multi_info.count = reg_count;
@@ -356,26 +287,23 @@
 		cmd.u.read_multi_info.address[i] = address[i];
 
 	/* DT9812 only responds to 32 byte writes!! */
-	count = 32;
-	retval = usb_bulk_msg(dev->udev,
-			      usb_sndbulkpipe(dev->udev,
-					      dev->command_write.addr),
-			      &cmd, 32, &count, HZ * 1);
-	if (retval)
-		return retval;
-	retval = usb_bulk_msg(dev->udev,
-			      usb_rcvbulkpipe(dev->udev,
-					      dev->command_read.addr),
-			      value, reg_count, &count, HZ * 1);
-	return retval;
+	ret = usb_bulk_msg(usb, usb_sndbulkpipe(usb, devpriv->cmd_wr.addr),
+			   &cmd, 32, &count, DT9812_USB_TIMEOUT);
+	if (ret)
+		return ret;
+
+	return usb_bulk_msg(usb, usb_rcvbulkpipe(usb, devpriv->cmd_rd.addr),
+			    value, reg_count, &count, DT9812_USB_TIMEOUT);
 }
 
-static int dt9812_write_multiple_registers(struct usb_dt9812 *dev,
+static int dt9812_write_multiple_registers(struct comedi_device *dev,
 					   int reg_count, u8 *address,
 					   u8 *value)
 {
+	struct usb_device *usb = comedi_to_usb_dev(dev);
+	struct dt9812_private *devpriv = dev->private;
 	struct dt9812_usb_cmd cmd;
-	int i, count, retval;
+	int i, count;
 
 	cmd.cmd = cpu_to_le32(DT9812_W_MULTI_BYTE_REG);
 	cmd.u.read_multi_info.count = reg_count;
@@ -383,19 +311,20 @@
 		cmd.u.write_multi_info.write[i].address = address[i];
 		cmd.u.write_multi_info.write[i].value = value[i];
 	}
+
 	/* DT9812 only responds to 32 byte writes!! */
-	retval = usb_bulk_msg(dev->udev,
-			      usb_sndbulkpipe(dev->udev,
-					      dev->command_write.addr),
-			      &cmd, 32, &count, HZ * 1);
-	return retval;
+	return usb_bulk_msg(usb, usb_sndbulkpipe(usb, devpriv->cmd_wr.addr),
+			    &cmd, 32, &count, DT9812_USB_TIMEOUT);
 }
 
-static int dt9812_rmw_multiple_registers(struct usb_dt9812 *dev, int reg_count,
+static int dt9812_rmw_multiple_registers(struct comedi_device *dev,
+					 int reg_count,
 					 struct dt9812_rmw_byte *rmw)
 {
+	struct usb_device *usb = comedi_to_usb_dev(dev);
+	struct dt9812_private *devpriv = dev->private;
 	struct dt9812_usb_cmd cmd;
-	int i, count, retval;
+	int i, count;
 
 	cmd.cmd = cpu_to_le32(DT9812_RMW_MULTI_BYTE_REG);
 	cmd.u.rmw_multi_info.count = reg_count;
@@ -403,76 +332,52 @@
 		cmd.u.rmw_multi_info.rmw[i] = rmw[i];
 
 	/* DT9812 only responds to 32 byte writes!! */
-	retval = usb_bulk_msg(dev->udev,
-			      usb_sndbulkpipe(dev->udev,
-					      dev->command_write.addr),
-			      &cmd, 32, &count, HZ * 1);
-	return retval;
+	return usb_bulk_msg(usb, usb_sndbulkpipe(usb, devpriv->cmd_wr.addr),
+			    &cmd, 32, &count, DT9812_USB_TIMEOUT);
 }
 
-static int dt9812_digital_in(struct slot_dt9812 *slot, u8 *bits)
+static int dt9812_digital_in(struct comedi_device *dev, u8 *bits)
 {
-	int result = -ENODEV;
+	struct dt9812_private *devpriv = dev->private;
+	u8 reg[2] = { F020_SFR_P3, F020_SFR_P1 };
+	u8 value[2];
+	int ret;
 
-	down(&slot->mutex);
-	if (slot->usb) {
-		u8 reg[2] = { F020_SFR_P3, F020_SFR_P1 };
-		u8 value[2];
-
-		result = dt9812_read_multiple_registers(slot->usb, 2, reg,
-							value);
-		if (result == 0) {
-			/*
-			 * bits 0-6 in F020_SFR_P3 are bits 0-6 in the digital
-			 * input port bit 3 in F020_SFR_P1 is bit 7 in the
-			 * digital input port
-			 */
-			*bits = (value[0] & 0x7f) | ((value[1] & 0x08) << 4);
-			/* printk("%2.2x, %2.2x -> %2.2x\n",
-			   value[0], value[1], *bits); */
-		}
+	down(&devpriv->sem);
+	ret = dt9812_read_multiple_registers(dev, 2, reg, value);
+	if (ret == 0) {
+		/*
+		 * bits 0-6 in F020_SFR_P3 are bits 0-6 in the digital
+		 * input port bit 3 in F020_SFR_P1 is bit 7 in the
+		 * digital input port
+		 */
+		*bits = (value[0] & 0x7f) | ((value[1] & 0x08) << 4);
 	}
-	up(&slot->mutex);
+	up(&devpriv->sem);
 
-	return result;
+	return ret;
 }
 
-static int dt9812_digital_out(struct slot_dt9812 *slot, u8 bits)
+static int dt9812_digital_out(struct comedi_device *dev, u8 bits)
 {
-	int result = -ENODEV;
+	struct dt9812_private *devpriv = dev->private;
+	u8 reg[1] = { F020_SFR_P2 };
+	u8 value[1] = { bits };
+	int ret;
 
-	down(&slot->mutex);
-	if (slot->usb) {
-		u8 reg[1];
-		u8 value[1];
+	down(&devpriv->sem);
+	ret = dt9812_write_multiple_registers(dev, 1, reg, value);
+	up(&devpriv->sem);
 
-		reg[0] = F020_SFR_P2;
-		value[0] = bits;
-		result = dt9812_write_multiple_registers(slot->usb, 1, reg,
-							 value);
-		slot->usb->digital_out_shadow = bits;
-	}
-	up(&slot->mutex);
-	return result;
+	return ret;
 }
 
-static int dt9812_digital_out_shadow(struct slot_dt9812 *slot, u8 *bits)
-{
-	int result = -ENODEV;
-
-	down(&slot->mutex);
-	if (slot->usb) {
-		*bits = slot->usb->digital_out_shadow;
-		result = 0;
-	}
-	up(&slot->mutex);
-	return result;
-}
-
-static void dt9812_configure_mux(struct usb_dt9812 *dev,
+static void dt9812_configure_mux(struct comedi_device *dev,
 				 struct dt9812_rmw_byte *rmw, int channel)
 {
-	if (dev->device == DT9812_DEVID_DT9812_10) {
+	struct dt9812_private *devpriv = dev->private;
+
+	if (devpriv->device == DT9812_DEVID_DT9812_10) {
 		/* In the DT9812/10V MUX is selected by P1.5-7 */
 		rmw->address = F020_SFR_P1;
 		rmw->and_mask = 0xe0;
@@ -485,18 +390,21 @@
 	}
 }
 
-static void dt9812_configure_gain(struct usb_dt9812 *dev,
+static void dt9812_configure_gain(struct comedi_device *dev,
 				  struct dt9812_rmw_byte *rmw,
 				  enum dt9812_gain gain)
 {
-	if (dev->device == DT9812_DEVID_DT9812_10) {
-		/* In the DT9812/10V, there is an external gain of 0.5 */
+	struct dt9812_private *devpriv = dev->private;
+
+	/* In the DT9812/10V, there is an external gain of 0.5 */
+	if (devpriv->device == DT9812_DEVID_DT9812_10)
 		gain <<= 1;
-	}
 
 	rmw->address = F020_SFR_ADC0CF;
 	rmw->and_mask = F020_MASK_ADC0CF_AMP0GN2 |
-	    F020_MASK_ADC0CF_AMP0GN1 | F020_MASK_ADC0CF_AMP0GN0;
+			F020_MASK_ADC0CF_AMP0GN1 |
+			F020_MASK_ADC0CF_AMP0GN0;
+
 	switch (gain) {
 		/*
 		 * 000 -> Gain =  1
@@ -508,8 +416,10 @@
 		 */
 	case DT9812_GAIN_0PT5:
 		rmw->or_value = F020_MASK_ADC0CF_AMP0GN2 |
-		    F020_MASK_ADC0CF_AMP0GN1;
+				F020_MASK_ADC0CF_AMP0GN1;
 		break;
+	default:
+		/* this should never happen, just use a gain of 1 */
 	case DT9812_GAIN_1:
 		rmw->or_value = 0x00;
 		break;
@@ -521,20 +431,18 @@
 		break;
 	case DT9812_GAIN_8:
 		rmw->or_value = F020_MASK_ADC0CF_AMP0GN1 |
-		    F020_MASK_ADC0CF_AMP0GN0;
+				F020_MASK_ADC0CF_AMP0GN0;
 		break;
 	case DT9812_GAIN_16:
 		rmw->or_value = F020_MASK_ADC0CF_AMP0GN2;
 		break;
-	default:
-		dev_err(&dev->interface->dev, "Illegal gain %d\n", gain);
-
 	}
 }
 
-static int dt9812_analog_in(struct slot_dt9812 *slot, int channel, u16 *value,
-			    enum dt9812_gain gain)
+static int dt9812_analog_in(struct comedi_device *dev,
+			    int channel, u16 *value, enum dt9812_gain gain)
 {
+	struct dt9812_private *devpriv = dev->private;
 	struct dt9812_rmw_byte rmw[3];
 	u8 reg[3] = {
 		F020_SFR_ADC0CN,
@@ -542,31 +450,30 @@
 		F020_SFR_ADC0L
 	};
 	u8 val[3];
-	int result = -ENODEV;
+	int ret;
 
-	down(&slot->mutex);
-	if (!slot->usb)
-		goto exit;
+	down(&devpriv->sem);
 
 	/* 1 select the gain */
-	dt9812_configure_gain(slot->usb, &rmw[0], gain);
+	dt9812_configure_gain(dev, &rmw[0], gain);
 
 	/* 2 set the MUX to select the channel */
-	dt9812_configure_mux(slot->usb, &rmw[1], channel);
+	dt9812_configure_mux(dev, &rmw[1], channel);
 
 	/* 3 start conversion */
 	rmw[2].address = F020_SFR_ADC0CN;
 	rmw[2].and_mask = 0xff;
 	rmw[2].or_value = F020_MASK_ADC0CN_AD0EN | F020_MASK_ADC0CN_AD0BUSY;
 
-	result = dt9812_rmw_multiple_registers(slot->usb, 3, rmw);
-	if (result)
+	ret = dt9812_rmw_multiple_registers(dev, 3, rmw);
+	if (ret)
 		goto exit;
 
 	/* read the status and ADC */
-	result = dt9812_read_multiple_registers(slot->usb, 3, reg, val);
-	if (result)
+	ret = dt9812_read_multiple_registers(dev, 3, reg, val);
+	if (ret)
 		goto exit;
+
 	/*
 	 * An ADC conversion takes 16 SAR clocks cycles, i.e. about 9us.
 	 * Therefore, between the instant that AD0BUSY was set via
@@ -578,7 +485,7 @@
 	 */
 	if ((val[0] & (F020_MASK_ADC0CN_AD0INT | F020_MASK_ADC0CN_AD0BUSY)) ==
 	    F020_MASK_ADC0CN_AD0INT) {
-		switch (slot->usb->device) {
+		switch (devpriv->device) {
 		case DT9812_DEVID_DT9812_10:
 			/*
 			 * For DT9812-10V the personality module set the
@@ -594,422 +501,284 @@
 	}
 
 exit:
-	up(&slot->mutex);
-	return result;
+	up(&devpriv->sem);
+
+	return ret;
 }
 
-static int dt9812_analog_out_shadow(struct slot_dt9812 *slot, int channel,
-				    u16 *value)
+static int dt9812_analog_out(struct comedi_device *dev, int channel, u16 value)
 {
-	int result = -ENODEV;
+	struct dt9812_private *devpriv = dev->private;
+	struct dt9812_rmw_byte rmw[3];
+	int ret;
 
-	down(&slot->mutex);
-	if (slot->usb) {
-		*value = slot->usb->analog_out_shadow[channel];
-		result = 0;
+	down(&devpriv->sem);
+
+	switch (channel) {
+	case 0:
+		/* 1. Set DAC mode */
+		rmw[0].address = F020_SFR_DAC0CN;
+		rmw[0].and_mask = 0xff;
+		rmw[0].or_value = F020_MASK_DACxCN_DACxEN;
+
+		/* 2 load low byte of DAC value first */
+		rmw[1].address = F020_SFR_DAC0L;
+		rmw[1].and_mask = 0xff;
+		rmw[1].or_value = value & 0xff;
+
+		/* 3 load high byte of DAC value next to latch the
+			12-bit value */
+		rmw[2].address = F020_SFR_DAC0H;
+		rmw[2].and_mask = 0xff;
+		rmw[2].or_value = (value >> 8) & 0xf;
+		break;
+
+	case 1:
+		/* 1. Set DAC mode */
+		rmw[0].address = F020_SFR_DAC1CN;
+		rmw[0].and_mask = 0xff;
+		rmw[0].or_value = F020_MASK_DACxCN_DACxEN;
+
+		/* 2 load low byte of DAC value first */
+		rmw[1].address = F020_SFR_DAC1L;
+		rmw[1].and_mask = 0xff;
+		rmw[1].or_value = value & 0xff;
+
+		/* 3 load high byte of DAC value next to latch the
+			12-bit value */
+		rmw[2].address = F020_SFR_DAC1H;
+		rmw[2].and_mask = 0xff;
+		rmw[2].or_value = (value >> 8) & 0xf;
+		break;
 	}
-	up(&slot->mutex);
+	ret = dt9812_rmw_multiple_registers(dev, 3, rmw);
+	devpriv->ao_shadow[channel] = value;
 
-	return result;
+	up(&devpriv->sem);
+
+	return ret;
 }
 
-static int dt9812_analog_out(struct slot_dt9812 *slot, int channel, u16 value)
+static int dt9812_di_insn_bits(struct comedi_device *dev,
+			       struct comedi_subdevice *s,
+			       struct comedi_insn *insn,
+			       unsigned int *data)
 {
-	int result = -ENODEV;
+	u8 bits = 0;
+	int ret;
 
-	down(&slot->mutex);
-	if (slot->usb) {
-		struct dt9812_rmw_byte rmw[3];
+	ret = dt9812_digital_in(dev, &bits);
+	if (ret)
+		return ret;
 
-		switch (channel) {
-		case 0:
-			/* 1. Set DAC mode */
-			rmw[0].address = F020_SFR_DAC0CN;
-			rmw[0].and_mask = 0xff;
-			rmw[0].or_value = F020_MASK_DACxCN_DACxEN;
+	data[1] = bits;
 
-			/* 2 load low byte of DAC value first */
-			rmw[1].address = F020_SFR_DAC0L;
-			rmw[1].and_mask = 0xff;
-			rmw[1].or_value = value & 0xff;
+	return insn->n;
+}
 
-			/* 3 load high byte of DAC value next to latch the
-			   12-bit value */
-			rmw[2].address = F020_SFR_DAC0H;
-			rmw[2].and_mask = 0xff;
-			rmw[2].or_value = (value >> 8) & 0xf;
-			break;
+static int dt9812_do_insn_bits(struct comedi_device *dev,
+			       struct comedi_subdevice *s,
+			       struct comedi_insn *insn,
+			       unsigned int *data)
+{
+	unsigned int mask = data[0];
+	unsigned int bits = data[1];
 
-		case 1:
-			/* 1. Set DAC mode */
-			rmw[0].address = F020_SFR_DAC1CN;
-			rmw[0].and_mask = 0xff;
-			rmw[0].or_value = F020_MASK_DACxCN_DACxEN;
+	if (mask) {
+		s->state &= ~mask;
+		s->state |= (bits & mask);
 
-			/* 2 load low byte of DAC value first */
-			rmw[1].address = F020_SFR_DAC1L;
-			rmw[1].and_mask = 0xff;
-			rmw[1].or_value = value & 0xff;
-
-			/* 3 load high byte of DAC value next to latch the
-			   12-bit value */
-			rmw[2].address = F020_SFR_DAC1H;
-			rmw[2].and_mask = 0xff;
-			rmw[2].or_value = (value >> 8) & 0xf;
-			break;
-		}
-		result = dt9812_rmw_multiple_registers(slot->usb, 3, rmw);
-		slot->usb->analog_out_shadow[channel] = value;
+		dt9812_digital_out(dev, s->state);
 	}
-	up(&slot->mutex);
 
-	return result;
+	data[1] = s->state;
+
+	return insn->n;
 }
 
-/*
- * USB framework functions
- */
-
-static int dt9812_probe(struct usb_interface *interface,
-			const struct usb_device_id *id)
+static int dt9812_ai_insn_read(struct comedi_device *dev,
+			       struct comedi_subdevice *s,
+			       struct comedi_insn *insn,
+			       unsigned int *data)
 {
-	int retval = -ENOMEM;
-	struct usb_dt9812 *dev = NULL;
-	struct usb_host_interface *iface_desc;
-	struct usb_endpoint_descriptor *endpoint;
+	unsigned int chan = CR_CHAN(insn->chanspec);
+	u16 val = 0;
+	int ret;
 	int i;
-	u8 fw;
 
-	/* allocate memory for our device state and initialize it */
-	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
-	if (dev == NULL)
-		goto error;
-
-	kref_init(&dev->kref);
-
-	dev->udev = usb_get_dev(interface_to_usbdev(interface));
-	dev->interface = interface;
-
-	/* Check endpoints */
-	iface_desc = interface->cur_altsetting;
-
-	if (iface_desc->desc.bNumEndpoints != 5) {
-		dev_err(&interface->dev, "Wrong number of endpoints.\n");
-		retval = -ENODEV;
-		goto error;
+	for (i = 0; i < insn->n; i++) {
+		ret = dt9812_analog_in(dev, chan, &val, DT9812_GAIN_1);
+		if (ret)
+			return ret;
+		data[i] = val;
 	}
 
-	for (i = 0; i < iface_desc->desc.bNumEndpoints; ++i) {
-		int direction = -1;
-		endpoint = &iface_desc->endpoint[i].desc;
+	return insn->n;
+}
+
+static int dt9812_ao_insn_read(struct comedi_device *dev,
+			       struct comedi_subdevice *s,
+			       struct comedi_insn *insn,
+			       unsigned int *data)
+{
+	struct dt9812_private *devpriv = dev->private;
+	unsigned int chan = CR_CHAN(insn->chanspec);
+	int i;
+
+	down(&devpriv->sem);
+	for (i = 0; i < insn->n; i++)
+		data[i] = devpriv->ao_shadow[chan];
+	up(&devpriv->sem);
+
+	return insn->n;
+}
+
+static int dt9812_ao_insn_write(struct comedi_device *dev,
+				struct comedi_subdevice *s,
+				struct comedi_insn *insn,
+				unsigned int *data)
+{
+	unsigned int chan = CR_CHAN(insn->chanspec);
+	int ret;
+	int i;
+
+	for (i = 0; i < insn->n; i++) {
+		ret = dt9812_analog_out(dev, chan, data[i]);
+		if (ret)
+			return ret;
+	}
+
+	return insn->n;
+}
+
+static int dt9812_find_endpoints(struct comedi_device *dev)
+{
+	struct usb_interface *intf = comedi_to_usb_interface(dev);
+	struct usb_host_interface *host = intf->cur_altsetting;
+	struct dt9812_private *devpriv = dev->private;
+	struct usb_endpoint_descriptor *ep;
+	int i;
+
+	if (host->desc.bNumEndpoints != 5) {
+		dev_err(dev->class_dev, "Wrong number of endpoints\n");
+		return -ENODEV;
+	}
+
+	for (i = 0; i < host->desc.bNumEndpoints; ++i) {
+		int dir = -1;
+		ep = &host->endpoint[i].desc;
 		switch (i) {
 		case 0:
-			direction = USB_DIR_IN;
-			dev->message_pipe.addr = endpoint->bEndpointAddress;
-			dev->message_pipe.size =
-			    le16_to_cpu(endpoint->wMaxPacketSize);
-
+			/* unused message pipe */
+			dir = USB_DIR_IN;
 			break;
 		case 1:
-			direction = USB_DIR_OUT;
-			dev->command_write.addr = endpoint->bEndpointAddress;
-			dev->command_write.size =
-			    le16_to_cpu(endpoint->wMaxPacketSize);
+			dir = USB_DIR_OUT;
+			devpriv->cmd_wr.addr = ep->bEndpointAddress;
+			devpriv->cmd_wr.size = le16_to_cpu(ep->wMaxPacketSize);
 			break;
 		case 2:
-			direction = USB_DIR_IN;
-			dev->command_read.addr = endpoint->bEndpointAddress;
-			dev->command_read.size =
-			    le16_to_cpu(endpoint->wMaxPacketSize);
+			dir = USB_DIR_IN;
+			devpriv->cmd_rd.addr = ep->bEndpointAddress;
+			devpriv->cmd_rd.size = le16_to_cpu(ep->wMaxPacketSize);
 			break;
 		case 3:
-			direction = USB_DIR_OUT;
-			dev->write_stream.addr = endpoint->bEndpointAddress;
-			dev->write_stream.size =
-			    le16_to_cpu(endpoint->wMaxPacketSize);
+			/* unused write stream */
+			dir = USB_DIR_OUT;
 			break;
 		case 4:
-			direction = USB_DIR_IN;
-			dev->read_stream.addr = endpoint->bEndpointAddress;
-			dev->read_stream.size =
-			    le16_to_cpu(endpoint->wMaxPacketSize);
+			/* unused read stream */
+			dir = USB_DIR_IN;
 			break;
 		}
-		if ((endpoint->bEndpointAddress & USB_DIR_IN) != direction) {
-			dev_err(&interface->dev,
-				"Endpoint has wrong direction.\n");
-			retval = -ENODEV;
-			goto error;
+		if ((ep->bEndpointAddress & USB_DIR_IN) != dir) {
+			dev_err(dev->class_dev,
+				"Endpoint has wrong direction\n");
+			return -ENODEV;
 		}
 	}
-	if (dt9812_read_info(dev, 0, &fw, sizeof(fw)) != 0) {
+	return 0;
+}
+
+static int dt9812_reset_device(struct comedi_device *dev)
+{
+	struct usb_device *usb = comedi_to_usb_dev(dev);
+	struct dt9812_private *devpriv = dev->private;
+	u32 serial;
+	u16 vendor;
+	u16 product;
+	u16 tmp16;
+	u8 tmp8;
+	int ret;
+	int i;
+
+	ret = dt9812_read_info(dev, 0, &tmp8, sizeof(tmp8));
+	if (ret) {
 		/*
 		 * Seems like a configuration reset is necessary if driver is
 		 * reloaded while device is attached
 		 */
-		usb_reset_configuration(dev->udev);
+		usb_reset_configuration(usb);
 		for (i = 0; i < 10; i++) {
-			retval = dt9812_read_info(dev, 1, &fw, sizeof(fw));
-			if (retval == 0) {
-				dev_info(&interface->dev,
-					 "usb_reset_configuration succeeded "
-					 "after %d iterations\n", i);
+			ret = dt9812_read_info(dev, 1, &tmp8, sizeof(tmp8));
+			if (ret == 0)
 				break;
-			}
+		}
+		if (ret) {
+			dev_err(dev->class_dev,
+				"unable to reset configuration\n");
+			return ret;
 		}
 	}
 
-	if (dt9812_read_info(dev, 1, &dev->vendor, sizeof(dev->vendor)) != 0) {
-		dev_err(&interface->dev, "Failed to read vendor.\n");
-		retval = -ENODEV;
-		goto error;
+	ret = dt9812_read_info(dev, 1, &vendor, sizeof(vendor));
+	if (ret) {
+		dev_err(dev->class_dev, "failed to read vendor id\n");
+		return ret;
 	}
-	if (dt9812_read_info(dev, 3, &dev->product, sizeof(dev->product)) != 0) {
-		dev_err(&interface->dev, "Failed to read product.\n");
-		retval = -ENODEV;
-		goto error;
-	}
-	if (dt9812_read_info(dev, 5, &dev->device, sizeof(dev->device)) != 0) {
-		dev_err(&interface->dev, "Failed to read device.\n");
-		retval = -ENODEV;
-		goto error;
-	}
-	if (dt9812_read_info(dev, 7, &dev->serial, sizeof(dev->serial)) != 0) {
-		dev_err(&interface->dev, "Failed to read serial.\n");
-		retval = -ENODEV;
-		goto error;
-	}
+	vendor = le16_to_cpu(vendor);
 
-	dev->vendor = le16_to_cpu(dev->vendor);
-	dev->product = le16_to_cpu(dev->product);
-	dev->device = le16_to_cpu(dev->device);
-	dev->serial = le32_to_cpu(dev->serial);
-	switch (dev->device) {
-	case DT9812_DEVID_DT9812_10:
-		dev->analog_out_shadow[0] = 0x0800;
-		dev->analog_out_shadow[1] = 0x800;
-		break;
-	case DT9812_DEVID_DT9812_2PT5:
-		dev->analog_out_shadow[0] = 0x0000;
-		dev->analog_out_shadow[1] = 0x0000;
-		break;
+	ret = dt9812_read_info(dev, 3, &product, sizeof(product));
+	if (ret) {
+		dev_err(dev->class_dev, "failed to read product id\n");
+		return ret;
 	}
-	dev->digital_out_shadow = 0;
+	product = le16_to_cpu(product);
 
-	/* save our data pointer in this interface device */
-	usb_set_intfdata(interface, dev);
+	ret = dt9812_read_info(dev, 5, &tmp16, sizeof(tmp16));
+	if (ret) {
+		dev_err(dev->class_dev, "failed to read device id\n");
+		return ret;
+	}
+	devpriv->device = le16_to_cpu(tmp16);
+
+	ret = dt9812_read_info(dev, 7, &serial, sizeof(serial));
+	if (ret) {
+		dev_err(dev->class_dev, "failed to read serial number\n");
+		return ret;
+	}
+	serial = le32_to_cpu(serial);
 
 	/* let the user know what node this device is now attached to */
-	dev_info(&interface->dev, "USB DT9812 (%4.4x.%4.4x.%4.4x) #0x%8.8x\n",
-		 dev->vendor, dev->product, dev->device, dev->serial);
+	dev_info(dev->class_dev, "USB DT9812 (%4.4x.%4.4x.%4.4x) #0x%8.8x\n",
+		 vendor, product, devpriv->device, serial);
 
-	down(&dt9812_mutex);
-	{
-		/* Find a slot for the USB device */
-		struct slot_dt9812 *first = NULL;
-		struct slot_dt9812 *best = NULL;
-
-		for (i = 0; i < DT9812_NUM_SLOTS; i++) {
-			if (!first && !dt9812[i].usb && dt9812[i].serial == 0)
-				first = &dt9812[i];
-			if (!best && dt9812[i].serial == dev->serial)
-				best = &dt9812[i];
-		}
-
-		if (!best)
-			best = first;
-
-		if (best) {
-			down(&best->mutex);
-			best->usb = dev;
-			dev->slot = best;
-			up(&best->mutex);
-		}
+	if (devpriv->device != DT9812_DEVID_DT9812_10 &&
+	    devpriv->device != DT9812_DEVID_DT9812_2PT5) {
+		dev_err(dev->class_dev, "Unsupported device!\n");
+		return -EINVAL;
 	}
-	up(&dt9812_mutex);
 
 	return 0;
-
-error:
-	if (dev)
-		kref_put(&dev->kref, dt9812_delete);
-	return retval;
 }
 
-static void dt9812_disconnect(struct usb_interface *interface)
+static int dt9812_auto_attach(struct comedi_device *dev,
+			      unsigned long context)
 {
-	struct usb_dt9812 *dev;
-	int minor = interface->minor;
-
-	down(&dt9812_mutex);
-	dev = usb_get_intfdata(interface);
-	if (dev->slot) {
-		down(&dev->slot->mutex);
-		dev->slot->usb = NULL;
-		up(&dev->slot->mutex);
-		dev->slot = NULL;
-	}
-	usb_set_intfdata(interface, NULL);
-	up(&dt9812_mutex);
-
-	/* queue final destruction */
-	kref_put(&dev->kref, dt9812_delete);
-
-	dev_info(&interface->dev, "USB Dt9812 #%d now disconnected\n", minor);
-}
-
-static struct usb_driver dt9812_usb_driver = {
-	.name = "dt9812",
-	.probe = dt9812_probe,
-	.disconnect = dt9812_disconnect,
-	.id_table = dt9812_table,
-};
-
-/*
- * Comedi functions
- */
-
-static int dt9812_comedi_open(struct comedi_device *dev)
-{
-	struct comedi_dt9812 *devpriv = dev->private;
-	int result = -ENODEV;
-
-	down(&devpriv->slot->mutex);
-	if (devpriv->slot->usb) {
-		/* We have an attached device, fill in current range info */
-		struct comedi_subdevice *s;
-
-		s = &dev->subdevices[0];
-		s->n_chan = 8;
-		s->maxdata = 1;
-
-		s = &dev->subdevices[1];
-		s->n_chan = 8;
-		s->maxdata = 1;
-
-		s = &dev->subdevices[2];
-		s->n_chan = 8;
-		switch (devpriv->slot->usb->device) {
-		case 0:{
-				s->maxdata = 4095;
-				s->range_table = &range_bipolar10;
-			}
-			break;
-		case 1:{
-				s->maxdata = 4095;
-				s->range_table = &range_unipolar2_5;
-			}
-			break;
-		}
-
-		s = &dev->subdevices[3];
-		s->n_chan = 2;
-		switch (devpriv->slot->usb->device) {
-		case 0:{
-				s->maxdata = 4095;
-				s->range_table = &range_bipolar10;
-			}
-			break;
-		case 1:{
-				s->maxdata = 4095;
-				s->range_table = &range_unipolar2_5;
-			}
-			break;
-		}
-		result = 0;
-	}
-	up(&devpriv->slot->mutex);
-	return result;
-}
-
-static int dt9812_di_rinsn(struct comedi_device *dev,
-			   struct comedi_subdevice *s, struct comedi_insn *insn,
-			   unsigned int *data)
-{
-	struct comedi_dt9812 *devpriv = dev->private;
-	unsigned int channel = CR_CHAN(insn->chanspec);
-	int n;
-	u8 bits = 0;
-
-	dt9812_digital_in(devpriv->slot, &bits);
-	for (n = 0; n < insn->n; n++)
-		data[n] = ((1 << channel) & bits) != 0;
-	return n;
-}
-
-static int dt9812_do_winsn(struct comedi_device *dev,
-			   struct comedi_subdevice *s, struct comedi_insn *insn,
-			   unsigned int *data)
-{
-	struct comedi_dt9812 *devpriv = dev->private;
-	unsigned int channel = CR_CHAN(insn->chanspec);
-	int n;
-	u8 bits = 0;
-
-	dt9812_digital_out_shadow(devpriv->slot, &bits);
-	for (n = 0; n < insn->n; n++) {
-		u8 mask = 1 << channel;
-
-		bits &= ~mask;
-		if (data[n])
-			bits |= mask;
-	}
-	dt9812_digital_out(devpriv->slot, bits);
-	return n;
-}
-
-static int dt9812_ai_rinsn(struct comedi_device *dev,
-			   struct comedi_subdevice *s, struct comedi_insn *insn,
-			   unsigned int *data)
-{
-	struct comedi_dt9812 *devpriv = dev->private;
-	unsigned int channel = CR_CHAN(insn->chanspec);
-	int n;
-
-	for (n = 0; n < insn->n; n++) {
-		u16 value = 0;
-
-		dt9812_analog_in(devpriv->slot, channel, &value, DT9812_GAIN_1);
-		data[n] = value;
-	}
-	return n;
-}
-
-static int dt9812_ao_rinsn(struct comedi_device *dev,
-			   struct comedi_subdevice *s, struct comedi_insn *insn,
-			   unsigned int *data)
-{
-	struct comedi_dt9812 *devpriv = dev->private;
-	unsigned int channel = CR_CHAN(insn->chanspec);
-	int n;
-	u16 value;
-
-	for (n = 0; n < insn->n; n++) {
-		value = 0;
-		dt9812_analog_out_shadow(devpriv->slot, channel, &value);
-		data[n] = value;
-	}
-	return n;
-}
-
-static int dt9812_ao_winsn(struct comedi_device *dev,
-			   struct comedi_subdevice *s, struct comedi_insn *insn,
-			   unsigned int *data)
-{
-	struct comedi_dt9812 *devpriv = dev->private;
-	unsigned int channel = CR_CHAN(insn->chanspec);
-	int n;
-
-	for (n = 0; n < insn->n; n++)
-		dt9812_analog_out(devpriv->slot, channel, data[n]);
-	return n;
-}
-
-static int dt9812_attach(struct comedi_device *dev, struct comedi_devconfig *it)
-{
-	struct comedi_dt9812 *devpriv;
-	int i;
+	struct usb_interface *intf = comedi_to_usb_interface(dev);
+	struct dt9812_private *devpriv;
 	struct comedi_subdevice *s;
+	bool is_unipolar;
 	int ret;
 
 	devpriv = kzalloc(sizeof(*devpriv), GFP_KERNEL);
@@ -1017,125 +786,107 @@
 		return -ENOMEM;
 	dev->private = devpriv;
 
-	/*
-	 * Special open routine, since USB unit may be unattached at
-	 * comedi_config time, hence range can not be determined
-	 */
-	dev->open = dt9812_comedi_open;
+	sema_init(&devpriv->sem, 1);
+	usb_set_intfdata(intf, devpriv);
 
-	devpriv->serial = it->options[0];
+	ret = dt9812_find_endpoints(dev);
+	if (ret)
+		return ret;
+
+	ret = dt9812_reset_device(dev);
+	if (ret)
+		return ret;
+
+	is_unipolar = (devpriv->device == DT9812_DEVID_DT9812_2PT5);
 
 	ret = comedi_alloc_subdevices(dev, 4);
 	if (ret)
 		return ret;
 
-	/* digital input subdevice */
+	/* Digital Input subdevice */
 	s = &dev->subdevices[0];
-	s->type = COMEDI_SUBD_DI;
-	s->subdev_flags = SDF_READABLE;
-	s->n_chan = 0;
-	s->maxdata = 1;
-	s->range_table = &range_digital;
-	s->insn_read = &dt9812_di_rinsn;
+	s->type		= COMEDI_SUBD_DI;
+	s->subdev_flags	= SDF_READABLE;
+	s->n_chan	= 8;
+	s->maxdata	= 1;
+	s->range_table	= &range_digital;
+	s->insn_bits	= dt9812_di_insn_bits;
 
-	/* digital output subdevice */
+	/* Digital Output subdevice */
 	s = &dev->subdevices[1];
-	s->type = COMEDI_SUBD_DO;
-	s->subdev_flags = SDF_WRITEABLE;
-	s->n_chan = 0;
-	s->maxdata = 1;
-	s->range_table = &range_digital;
-	s->insn_write = &dt9812_do_winsn;
+	s->type		= COMEDI_SUBD_DO;
+	s->subdev_flags	= SDF_WRITEABLE;
+	s->n_chan	= 8;
+	s->maxdata	= 1;
+	s->range_table	= &range_digital;
+	s->insn_bits	= dt9812_do_insn_bits;
 
-	/* analog input subdevice */
+	/* Analog Input subdevice */
 	s = &dev->subdevices[2];
-	s->type = COMEDI_SUBD_AI;
-	s->subdev_flags = SDF_READABLE | SDF_GROUND;
-	s->n_chan = 0;
-	s->maxdata = 1;
-	s->range_table = NULL;
-	s->insn_read = &dt9812_ai_rinsn;
+	s->type		= COMEDI_SUBD_AI;
+	s->subdev_flags	= SDF_READABLE | SDF_GROUND;
+	s->n_chan	= 8;
+	s->maxdata	= 0x0fff;
+	s->range_table	= is_unipolar ? &range_unipolar2_5 : &range_bipolar10;
+	s->insn_read	= dt9812_ai_insn_read;
 
-	/* analog output subdevice */
+	/* Analog Output subdevice */
 	s = &dev->subdevices[3];
-	s->type = COMEDI_SUBD_AO;
-	s->subdev_flags = SDF_WRITEABLE;
-	s->n_chan = 0;
-	s->maxdata = 1;
-	s->range_table = NULL;
-	s->insn_write = &dt9812_ao_winsn;
-	s->insn_read = &dt9812_ao_rinsn;
+	s->type		= COMEDI_SUBD_AO;
+	s->subdev_flags	= SDF_WRITEABLE;
+	s->n_chan	= 2;
+	s->maxdata	= 0x0fff;
+	s->range_table	= is_unipolar ? &range_unipolar2_5 : &range_bipolar10;
+	s->insn_write	= dt9812_ao_insn_write;
+	s->insn_read	= dt9812_ao_insn_read;
 
-	dev_info(dev->class_dev, "successfully attached to dt9812.\n");
-
-	down(&dt9812_mutex);
-	/* Find a slot for the comedi device */
-	{
-		struct slot_dt9812 *first = NULL;
-		struct slot_dt9812 *best = NULL;
-		for (i = 0; i < DT9812_NUM_SLOTS; i++) {
-			if (!first && !dt9812[i].comedi) {
-				/* First free slot from comedi side */
-				first = &dt9812[i];
-			}
-			if (!best &&
-			    dt9812[i].usb &&
-			    dt9812[i].usb->serial == devpriv->serial) {
-				/* We have an attaced device with matching ID */
-				best = &dt9812[i];
-			}
-		}
-		if (!best)
-			best = first;
-		if (best) {
-			down(&best->mutex);
-			best->comedi = devpriv;
-			best->serial = devpriv->serial;
-			devpriv->slot = best;
-			up(&best->mutex);
-		}
-	}
-	up(&dt9812_mutex);
+	devpriv->ao_shadow[0] = is_unipolar ? 0x0000 : 0x0800;
+	devpriv->ao_shadow[1] = is_unipolar ? 0x0000 : 0x0800;
 
 	return 0;
 }
 
 static void dt9812_detach(struct comedi_device *dev)
 {
-	/* Nothing to cleanup */
+	struct usb_interface *intf = comedi_to_usb_interface(dev);
+	struct dt9812_private *devpriv = dev->private;
+
+	if (!devpriv)
+		return;
+
+	down(&devpriv->sem);
+
+	usb_set_intfdata(intf, NULL);
+
+	up(&devpriv->sem);
 }
 
-static struct comedi_driver dt9812_comedi_driver = {
-	.module = THIS_MODULE,
-	.driver_name = "dt9812",
-	.attach = dt9812_attach,
-	.detach = dt9812_detach,
+static struct comedi_driver dt9812_driver = {
+	.driver_name	= "dt9812",
+	.module		= THIS_MODULE,
+	.auto_attach	= dt9812_auto_attach,
+	.detach		= dt9812_detach,
 };
 
-static int __init usb_dt9812_init(void)
+static int dt9812_usb_probe(struct usb_interface *intf,
+			    const struct usb_device_id *id)
 {
-	int i;
-
-	/* Initialize all driver slots */
-	for (i = 0; i < DT9812_NUM_SLOTS; i++) {
-		sema_init(&dt9812[i].mutex, 1);
-		dt9812[i].serial = 0;
-		dt9812[i].usb = NULL;
-		dt9812[i].comedi = NULL;
-	}
-	dt9812[12].serial = 0x0;
-
-	return comedi_usb_driver_register(&dt9812_comedi_driver,
-						&dt9812_usb_driver);
+	return comedi_usb_auto_config(intf, &dt9812_driver, id->driver_info);
 }
 
-static void __exit usb_dt9812_exit(void)
-{
-	comedi_usb_driver_unregister(&dt9812_comedi_driver, &dt9812_usb_driver);
-}
+static const struct usb_device_id dt9812_usb_table[] = {
+	{ USB_DEVICE(0x0867, 0x9812) },
+	{ }
+};
+MODULE_DEVICE_TABLE(usb, dt9812_usb_table);
 
-module_init(usb_dt9812_init);
-module_exit(usb_dt9812_exit);
+static struct usb_driver dt9812_usb_driver = {
+	.name		= "dt9812",
+	.id_table	= dt9812_usb_table,
+	.probe		= dt9812_usb_probe,
+	.disconnect	= comedi_usb_auto_unconfig,
+};
+module_comedi_usb_driver(dt9812_driver, dt9812_usb_driver);
 
 MODULE_AUTHOR("Anders Blomdell <anders.blomdell@control.lth.se>");
 MODULE_DESCRIPTION("Comedi DT9812 driver");

diff --git a/drivers/staging/comedi/drivers/dyna_pci10xx.c b/drivers/staging/comedi/drivers/dyna_pci10xx.c
index 93ec8e4..e14dd3a 100644
--- a/drivers/staging/comedi/drivers/dyna_pci10xx.c
+++ b/drivers/staging/comedi/drivers/dyna_pci10xx.c

@@ -11,10 +11,6 @@
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
 /*

diff --git a/drivers/staging/comedi/drivers/gsc_hpdi.c b/drivers/staging/comedi/drivers/gsc_hpdi.c
index 0c061df..2fceff9 100644
--- a/drivers/staging/comedi/drivers/gsc_hpdi.c
+++ b/drivers/staging/comedi/drivers/gsc_hpdi.c

@@ -18,12 +18,7 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
-************************************************************************/
+*/
 
 /*
  * Driver: gsc_hpdi

diff --git a/drivers/staging/comedi/drivers/icp_multi.c b/drivers/staging/comedi/drivers/icp_multi.c
index 08ab9d6..a11e015 100644
--- a/drivers/staging/comedi/drivers/icp_multi.c
+++ b/drivers/staging/comedi/drivers/icp_multi.c

@@ -13,11 +13,6 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 
 /*

diff --git a/drivers/staging/comedi/drivers/jr3_pci.c b/drivers/staging/comedi/drivers/jr3_pci.c
index 90b303a..94609f4 100644
--- a/drivers/staging/comedi/drivers/jr3_pci.c
+++ b/drivers/staging/comedi/drivers/jr3_pci.c

@@ -14,11 +14,6 @@
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program; if not, write to the Free Software
-  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 /*
  * Driver: jr3_pci
@@ -46,7 +41,6 @@
 #include <linux/pci.h>
 #include <linux/delay.h>
 #include <linux/ctype.h>
-#include <linux/firmware.h>
 #include <linux/jiffies.h>
 #include <linux/slab.h>
 #include <linux/timer.h>
@@ -97,37 +91,6 @@
 	int retries;
 };
 
-/* Hotplug firmware loading stuff */
-static int comedi_load_firmware(struct comedi_device *dev, const char *name,
-				int (*cb)(struct comedi_device *dev,
-					const u8 *data, size_t size))
-{
-	struct pci_dev *pcidev = comedi_to_pci_dev(dev);
-	int result = 0;
-	const struct firmware *fw;
-	char *firmware_path;
-	static const char *prefix = "comedi/";
-
-	firmware_path = kmalloc(strlen(prefix) + strlen(name) + 1, GFP_KERNEL);
-	if (!firmware_path) {
-		result = -ENOMEM;
-	} else {
-		firmware_path[0] = '\0';
-		strcat(firmware_path, prefix);
-		strcat(firmware_path, name);
-		result = request_firmware(&fw, firmware_path, &pcidev->dev);
-		if (result == 0) {
-			if (!cb)
-				result = -EINVAL;
-			else
-				result = cb(dev, fw->data, fw->size);
-			release_firmware(fw);
-		}
-		kfree(firmware_path);
-	}
-	return result;
-}
-
 static struct poll_delay_t poll_delay_min_max(int min, int max)
 {
 	struct poll_delay_t result;
@@ -362,8 +325,9 @@
 	return result;
 }
 
-static int jr3_download_firmware(struct comedi_device *dev, const u8 *data,
-				 size_t size)
+static int jr3_download_firmware(struct comedi_device *dev,
+				 const u8 *data, size_t size,
+				 unsigned long context)
 {
 	/*
 	 * IDM file format is:
@@ -768,7 +732,9 @@
 	/*  Reset DSP card */
 	writel(0, &devpriv->iobase->channel[0].reset);
 
-	result = comedi_load_firmware(dev, "jr3pci.idm", jr3_download_firmware);
+	result = comedi_load_firmware(dev, &comedi_to_pci_dev(dev)->dev,
+				      "comedi/jr3pci.idm",
+				      jr3_download_firmware, 0);
 	dev_dbg(dev->class_dev, "Firmare load %d\n", result);
 
 	if (result < 0)
@@ -778,8 +744,9 @@
 	 * format:
 	 *     model serial Fx Fy Fz Mx My Mz\n
 	 *
-	 *     comedi_load_firmware(dev, "jr3_offsets_table",
-	 *                          jr3_download_firmware);
+	 *     comedi_load_firmware(dev, &comedi_to_pci_dev(dev)->dev,
+	 *                          "comedi/jr3_offsets_table",
+	 *                          jr3_download_firmware, 1);
 	 */
 
 	/*

diff --git a/drivers/staging/comedi/drivers/ke_counter.c b/drivers/staging/comedi/drivers/ke_counter.c
index e0e64752..f10cf10 100644
--- a/drivers/staging/comedi/drivers/ke_counter.c
+++ b/drivers/staging/comedi/drivers/ke_counter.c

@@ -14,11 +14,6 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 /*
 Driver: ke_counter

diff --git a/drivers/staging/comedi/drivers/me4000.c b/drivers/staging/comedi/drivers/me4000.c
index 641e693..c2308fd 100644
--- a/drivers/staging/comedi/drivers/me4000.c
+++ b/drivers/staging/comedi/drivers/me4000.c

@@ -14,11 +14,6 @@
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
  */
 /*
 Driver: me4000

diff --git a/drivers/staging/comedi/drivers/me_daq.c b/drivers/staging/comedi/drivers/me_daq.c
index 09f2a9fe..7533ece 100644
--- a/drivers/staging/comedi/drivers/me_daq.c
+++ b/drivers/staging/comedi/drivers/me_daq.c

@@ -14,10 +14,6 @@
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
 /*
@@ -37,7 +33,6 @@
 #include <linux/pci.h>
 #include <linux/interrupt.h>
 #include <linux/sched.h>
-#include <linux/firmware.h>
 
 #include "../comedidev.h"
 
@@ -391,7 +386,8 @@
 }
 
 static int me2600_xilinx_download(struct comedi_device *dev,
-				  const u8 *data, size_t size)
+				  const u8 *data, size_t size,
+				  unsigned long context)
 {
 	struct me_private_data *dev_private = dev->private;
 	unsigned int value;
@@ -460,22 +456,6 @@
 	return 0;
 }
 
-static int me2600_upload_firmware(struct comedi_device *dev)
-{
-	struct pci_dev *pcidev = comedi_to_pci_dev(dev);
-	const struct firmware *fw;
-	int ret;
-
-	ret = request_firmware(&fw, ME2600_FIRMWARE, &pcidev->dev);
-	if (ret)
-		return ret;
-
-	ret = me2600_xilinx_download(dev, fw->data, fw->size);
-	release_firmware(fw);
-
-	return ret;
-}
-
 static int me_reset(struct comedi_device *dev)
 {
 	struct me_private_data *dev_private = dev->private;
@@ -529,7 +509,9 @@
 
 	/* Download firmware and reset card */
 	if (board->needs_firmware) {
-		ret = me2600_upload_firmware(dev);
+		ret = comedi_load_firmware(dev, &comedi_to_pci_dev(dev)->dev,
+					   ME2600_FIRMWARE,
+					   me2600_xilinx_download, 0);
 		if (ret < 0)
 			return ret;
 	}

diff --git a/drivers/staging/comedi/drivers/mite.c b/drivers/staging/comedi/drivers/mite.c
index 523c656..12c34db 100644
--- a/drivers/staging/comedi/drivers/mite.c
+++ b/drivers/staging/comedi/drivers/mite.c

@@ -14,11 +14,6 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 
 /*

diff --git a/drivers/staging/comedi/drivers/mite.h b/drivers/staging/comedi/drivers/mite.h
index 255b8ba..d4487e8 100644
--- a/drivers/staging/comedi/drivers/mite.h
+++ b/drivers/staging/comedi/drivers/mite.h

@@ -14,11 +14,6 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 
 #ifndef _MITE_H_

diff --git a/drivers/staging/comedi/drivers/mpc624.c b/drivers/staging/comedi/drivers/mpc624.c
index 4717be4..713842a 100644
--- a/drivers/staging/comedi/drivers/mpc624.c
+++ b/drivers/staging/comedi/drivers/mpc624.c

@@ -14,11 +14,6 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 /*
 Driver: mpc624

diff --git a/drivers/staging/comedi/drivers/multiq3.c b/drivers/staging/comedi/drivers/multiq3.c
index 7a82920..5ecd1b1 100644
--- a/drivers/staging/comedi/drivers/multiq3.c
+++ b/drivers/staging/comedi/drivers/multiq3.c

@@ -14,11 +14,6 @@
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
  */
 /*
 Driver: multiq3

diff --git a/drivers/staging/comedi/drivers/ni_6527.c b/drivers/staging/comedi/drivers/ni_6527.c
index d10f777..903c2ef 100644
--- a/drivers/staging/comedi/drivers/ni_6527.c
+++ b/drivers/staging/comedi/drivers/ni_6527.c

@@ -14,11 +14,6 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 /*
 Driver: ni_6527

diff --git a/drivers/staging/comedi/drivers/ni_65xx.c b/drivers/staging/comedi/drivers/ni_65xx.c
index 3f71f0f..42a78de 100644
--- a/drivers/staging/comedi/drivers/ni_65xx.c
+++ b/drivers/staging/comedi/drivers/ni_65xx.c

@@ -17,11 +17,6 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 /*
 Driver: ni_65xx
@@ -286,15 +281,6 @@
 	return subdev->private;
 }
 
-static struct ni_65xx_subdevice_private *ni_65xx_alloc_subdevice_private(void)
-{
-	struct ni_65xx_subdevice_private *subdev_private =
-	    kzalloc(sizeof(struct ni_65xx_subdevice_private), GFP_KERNEL);
-	if (subdev_private == NULL)
-		return NULL;
-	return subdev_private;
-}
-
 static int ni_65xx_config_filter(struct comedi_device *dev,
 				 struct comedi_subdevice *s,
 				 struct comedi_insn *insn, unsigned int *data)
@@ -589,6 +575,7 @@
 	struct pci_dev *pcidev = comedi_to_pci_dev(dev);
 	const struct ni_65xx_board *board = NULL;
 	struct ni_65xx_private *devpriv;
+	struct ni_65xx_subdevice_private *spriv;
 	struct comedi_subdevice *s;
 	unsigned i;
 	int ret;
@@ -637,10 +624,10 @@
 		s->maxdata = 1;
 		s->insn_config = ni_65xx_dio_insn_config;
 		s->insn_bits = ni_65xx_dio_insn_bits;
-		s->private = ni_65xx_alloc_subdevice_private();
-		if (s->private == NULL)
+		spriv = comedi_alloc_spriv(s, sizeof(*spriv));
+		if (!spriv)
 			return -ENOMEM;
-		sprivate(s)->base_port = 0;
+		spriv->base_port = 0;
 	} else {
 		s->type = COMEDI_SUBD_UNUSED;
 	}
@@ -654,10 +641,10 @@
 		s->range_table = &range_digital;
 		s->maxdata = 1;
 		s->insn_bits = ni_65xx_dio_insn_bits;
-		s->private = ni_65xx_alloc_subdevice_private();
-		if (s->private == NULL)
+		spriv = comedi_alloc_spriv(s, sizeof(*spriv));
+		if (!spriv)
 			return -ENOMEM;
-		sprivate(s)->base_port = board->num_di_ports;
+		spriv->base_port = board->num_di_ports;
 	} else {
 		s->type = COMEDI_SUBD_UNUSED;
 	}
@@ -672,10 +659,10 @@
 		s->maxdata = 1;
 		s->insn_config = ni_65xx_dio_insn_config;
 		s->insn_bits = ni_65xx_dio_insn_bits;
-		s->private = ni_65xx_alloc_subdevice_private();
-		if (s->private == NULL)
+		spriv = comedi_alloc_spriv(s, sizeof(*spriv));
+		if (!spriv)
 			return -ENOMEM;
-		sprivate(s)->base_port = 0;
+		spriv->base_port = 0;
 		for (i = 0; i < board->num_dio_ports; ++i) {
 			/*  configure all ports for input */
 			writeb(0x1,
@@ -730,7 +717,6 @@
 static void ni_65xx_detach(struct comedi_device *dev)
 {
 	struct ni_65xx_private *devpriv = dev->private;
-	int i;
 
 	if (devpriv && devpriv->mite && devpriv->mite->daq_io_addr) {
 		writeb(0x00,
@@ -739,8 +725,6 @@
 	}
 	if (dev->irq)
 		free_irq(dev->irq, dev);
-	for (i = 0; i < dev->n_subdevices; ++i)
-		comedi_spriv_free(dev, i);
 	if (devpriv) {
 		if (devpriv->mite) {
 			mite_unsetup(devpriv->mite);

diff --git a/drivers/staging/comedi/drivers/ni_660x.c b/drivers/staging/comedi/drivers/ni_660x.c
index 5cdda7f..a9e0004 100644
--- a/drivers/staging/comedi/drivers/ni_660x.c
+++ b/drivers/staging/comedi/drivers/ni_660x.c

@@ -11,10 +11,6 @@
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program; if not, write to the Free Software
-  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
 
 /*

diff --git a/drivers/staging/comedi/drivers/ni_670x.c b/drivers/staging/comedi/drivers/ni_670x.c
index 42ab6db..1a185b9 100644
--- a/drivers/staging/comedi/drivers/ni_670x.c
+++ b/drivers/staging/comedi/drivers/ni_670x.c

@@ -14,11 +14,6 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 /*
 Driver: ni_670x

diff --git a/drivers/staging/comedi/drivers/ni_at_a2150.c b/drivers/staging/comedi/drivers/ni_at_a2150.c
index 2d37516..7ea5aa3 100644
--- a/drivers/staging/comedi/drivers/ni_at_a2150.c
+++ b/drivers/staging/comedi/drivers/ni_at_a2150.c

@@ -15,12 +15,6 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
-************************************************************************
 */
 /*
 Driver: ni_at_a2150

diff --git a/drivers/staging/comedi/drivers/ni_at_ao.c b/drivers/staging/comedi/drivers/ni_at_ao.c
index 7e5783a..e080053 100644
--- a/drivers/staging/comedi/drivers/ni_at_ao.c
+++ b/drivers/staging/comedi/drivers/ni_at_ao.c

@@ -14,11 +14,6 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 /*
 Driver: ni_at_ao

diff --git a/drivers/staging/comedi/drivers/ni_atmio.c b/drivers/staging/comedi/drivers/ni_atmio.c
index 4ced7ba..713edd5 100644
--- a/drivers/staging/comedi/drivers/ni_atmio.c
+++ b/drivers/staging/comedi/drivers/ni_atmio.c

@@ -14,10 +14,6 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
 /*
 Driver: ni_atmio

diff --git a/drivers/staging/comedi/drivers/ni_atmio16d.c b/drivers/staging/comedi/drivers/ni_atmio16d.c
index 6c97a09..da7396f 100644
--- a/drivers/staging/comedi/drivers/ni_atmio16d.c
+++ b/drivers/staging/comedi/drivers/ni_atmio16d.c

@@ -12,11 +12,6 @@
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
  */
 /*
 Driver: ni_atmio16d
@@ -767,7 +762,6 @@
 
 static void atmio16d_detach(struct comedi_device *dev)
 {
-	comedi_spriv_free(dev, 3);
 	reset_atmio16d(dev);
 	comedi_legacy_detach(dev);
 }

diff --git a/drivers/staging/comedi/drivers/ni_daq_700.c b/drivers/staging/comedi/drivers/ni_daq_700.c
index d067ef7..3c50e31 100644
--- a/drivers/staging/comedi/drivers/ni_daq_700.c
+++ b/drivers/staging/comedi/drivers/ni_daq_700.c

@@ -15,11 +15,6 @@
  *     but WITHOUT ANY WARRANTY; without even the implied warranty of
  *     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  *     GNU General Public License for more details.
- *
- *     You should have received a copy of the GNU General Public License
- *     along with this program; if not, write to the Free Software
- *     Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
  */
 
 /*
@@ -40,7 +35,7 @@
 Digital direction configuration: channels 0-7 output, 8-15 input (8225 device
 emu as port A output, port B input, port C N/A).
 
-Analog: The input  range is 0 to 4095 for -10 to +10 volts 
+Analog: The input  range is 0 to 4095 for -10 to +10 volts
 IRQ is assigned but not used.
 
 Version 0.1	Original DIO only driver
@@ -183,7 +178,7 @@
  */
 static void daq700_ai_config(struct comedi_device *dev,
 			     struct comedi_subdevice *s)
-{			
+{
 	unsigned long iobase = dev->iobase;
 
 	outb(0x80, iobase + CMD_R1);	/* disable scanning, ADC to chan 0 */

diff --git a/drivers/staging/comedi/drivers/ni_daq_dio24.c b/drivers/staging/comedi/drivers/ni_daq_dio24.c
index 9b7805f..d3d4eb9 100644
--- a/drivers/staging/comedi/drivers/ni_daq_dio24.c
+++ b/drivers/staging/comedi/drivers/ni_daq_dio24.c

@@ -18,12 +18,6 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
-************************************************************************
 */
 /*
 Driver: ni_daq_dio24
@@ -71,17 +65,11 @@
 	return 0;
 }
 
-static void dio24_detach(struct comedi_device *dev)
-{
-	comedi_spriv_free(dev, 0);
-	comedi_pcmcia_disable(dev);
-}
-
 static struct comedi_driver driver_dio24 = {
 	.driver_name	= "ni_daq_dio24",
 	.module		= THIS_MODULE,
 	.auto_attach	= dio24_auto_attach,
-	.detach		= dio24_detach,
+	.detach		= comedi_pcmcia_disable,
 };
 
 static int dio24_cs_attach(struct pcmcia_device *link)

diff --git a/drivers/staging/comedi/drivers/ni_labpc.c b/drivers/staging/comedi/drivers/ni_labpc.c
index 77a7bb6..f161e70 100644
--- a/drivers/staging/comedi/drivers/ni_labpc.c
+++ b/drivers/staging/comedi/drivers/ni_labpc.c

@@ -12,10 +12,6 @@
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
 /*
@@ -88,7 +84,7 @@
 #define CMD1_REG		0x00	/* W: Command 1 reg */
 #define CMD1_MA(x)		(((x) & 0x7) << 0)
 #define CMD1_TWOSCMP		(1 << 3)
-#define CMD1_GAIN_MASK		(7 << 4)
+#define CMD1_GAIN(x)		(((x) & 0x7) << 4)
 #define CMD1_SCANEN		(1 << 7)
 #define CMD2_REG		0x01	/* W: Command 2 reg */
 #define CMD2_PRETRIG		(1 << 0)
@@ -153,11 +149,6 @@
 	MODE_MULT_CHAN_DOWN,
 };
 
-static const int labpc_plus_ai_gain_bits[] = {
-	0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
-	0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
-};
-
 static const struct comedi_lrange range_labpc_plus_ai = {
 	16, {
 		BIP_RANGE(5),
@@ -179,13 +170,7 @@
 	}
 };
 
-const int labpc_1200_ai_gain_bits[] = {
-	0x00, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
-	0x00, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
-};
-EXPORT_SYMBOL_GPL(labpc_1200_ai_gain_bits);
-
-const struct comedi_lrange range_labpc_1200_ai = {
+static const struct comedi_lrange range_labpc_1200_ai = {
 	14, {
 		BIP_RANGE(5),
 		BIP_RANGE(2.5),
@@ -203,7 +188,6 @@
 		UNI_RANGE(0.1)
 	}
 };
-EXPORT_SYMBOL_GPL(range_labpc_1200_ai);
 
 static const struct comedi_lrange range_labpc_ao = {
 	2, {
@@ -239,25 +223,18 @@
 	{
 		.name			= "lab-pc-1200",
 		.ai_speed		= 10000,
-		.register_layout	= labpc_1200_layout,
-		.has_ao			= 1,
-		.ai_range_table		= &range_labpc_1200_ai,
-		.ai_range_code		= labpc_1200_ai_gain_bits,
 		.ai_scan_up		= 1,
+		.has_ao			= 1,
+		.is_labpc1200		= 1,
 	}, {
 		.name			= "lab-pc-1200ai",
 		.ai_speed		= 10000,
-		.register_layout	= labpc_1200_layout,
-		.ai_range_table		= &range_labpc_1200_ai,
-		.ai_range_code		= labpc_1200_ai_gain_bits,
 		.ai_scan_up		= 1,
+		.is_labpc1200		= 1,
 	}, {
 		.name			= "lab-pc+",
 		.ai_speed		= 12000,
-		.register_layout	= labpc_plus_layout,
 		.has_ao			= 1,
-		.ai_range_table		= &range_labpc_plus_ai,
-		.ai_range_code		= labpc_plus_ai_gain_bits,
 	},
 };
 #endif
@@ -326,12 +303,21 @@
 	const struct labpc_boardinfo *board = comedi_board(dev);
 	struct labpc_private *devpriv = dev->private;
 
+	if (board->is_labpc1200) {
+		/*
+		 * The LabPC-1200 boards do not have a gain
+		 * of '0x10'. Skip the range values that would
+		 * result in this gain.
+		 */
+		range += (range > 0) + (range > 7);
+	}
+
 	/* munge channel bits for differential/scan disabled mode */
 	if ((mode == MODE_SINGLE_CHAN || mode == MODE_SINGLE_CHAN_INTERVAL) &&
 	    aref == AREF_DIFF)
 		chan *= 2;
 	devpriv->cmd1 = CMD1_MA(chan);
-	devpriv->cmd1 |= board->ai_range_code[range];
+	devpriv->cmd1 |= CMD1_GAIN(range);
 
 	devpriv->write_byte(devpriv->cmd1, dev->iobase + CMD1_REG);
 }
@@ -347,7 +333,7 @@
 	const struct labpc_boardinfo *board = comedi_board(dev);
 	struct labpc_private *devpriv = dev->private;
 
-	if (board->register_layout != labpc_1200_layout)
+	if (!board->is_labpc1200)
 		return;
 
 	/* reference inputs to ground or common? */
@@ -759,7 +745,7 @@
 	err |= cfc_check_trigger_src(&cmd->scan_end_src, TRIG_COUNT);
 
 	stop_mask = TRIG_COUNT | TRIG_NONE;
-	if (board->register_layout == labpc_1200_layout)
+	if (board->is_labpc1200)
 		stop_mask |= TRIG_EXT;
 	err |= cfc_check_trigger_src(&cmd->stop_src, stop_mask);
 
@@ -895,7 +881,7 @@
 		/* pc-plus has no fifo-half full interrupt */
 	} else
 #endif
-	if (board->register_layout == labpc_1200_layout &&
+	if (board->is_labpc1200 &&
 		   /*  wake-end-of-scan should interrupt on fifo not empty */
 		   (cmd->flags & TRIG_WAKE_EOS) == 0 &&
 		   /*  make sure we are taking more than just a few points */
@@ -1175,7 +1161,7 @@
 
 	/* read board status */
 	devpriv->stat1 = devpriv->read_byte(dev->iobase + STAT1_REG);
-	if (board->register_layout == labpc_1200_layout)
+	if (board->is_labpc1200)
 		devpriv->stat2 = devpriv->read_byte(dev->iobase + STAT2_REG);
 
 	if ((devpriv->stat1 & (STAT1_GATA0 | STAT1_CNTINT | STAT1_OVERFLOW |
@@ -1201,8 +1187,7 @@
 		 * has occurred
 		 */
 		if (devpriv->stat1 & STAT1_GATA0 ||
-		    (board->register_layout == labpc_1200_layout
-		     && devpriv->stat2 & STAT2_OUTA1)) {
+		    (board->is_labpc1200 && devpriv->stat2 & STAT2_OUTA1)) {
 			handle_isa_dma(dev);
 		}
 	} else
@@ -1266,7 +1251,7 @@
 	spin_unlock_irqrestore(&dev->spinlock, flags);
 
 	/* set range */
-	if (board->register_layout == labpc_1200_layout) {
+	if (board->is_labpc1200) {
 		range = CR_RANGE(insn->chanspec);
 		if (labpc_range_is_unipolar(s, range))
 			devpriv->cmd6 |= CMD6_DACUNI(channel);
@@ -1603,7 +1588,7 @@
 	devpriv->write_byte(devpriv->cmd2, dev->iobase + CMD2_REG);
 	devpriv->write_byte(devpriv->cmd3, dev->iobase + CMD3_REG);
 	devpriv->write_byte(devpriv->cmd4, dev->iobase + CMD4_REG);
-	if (board->register_layout == labpc_1200_layout) {
+	if (board->is_labpc1200) {
 		devpriv->write_byte(devpriv->cmd5, dev->iobase + CMD5_REG);
 		devpriv->write_byte(devpriv->cmd6, dev->iobase + CMD6_REG);
 	}
@@ -1626,7 +1611,8 @@
 	s->n_chan	= 8;
 	s->len_chanlist	= 8;
 	s->maxdata	= 0x0fff;
-	s->range_table	= board->ai_range_table;
+	s->range_table	= board->is_labpc1200
+				? &range_labpc_1200_ai : &range_labpc_plus_ai;
 	s->insn_read	= labpc_ai_insn_read;
 	if (dev->irq) {
 		dev->read_subdev = s;
@@ -1671,7 +1657,7 @@
 
 	/*  calibration subdevices for boards that have one */
 	s = &dev->subdevices[3];
-	if (board->register_layout == labpc_1200_layout) {
+	if (board->is_labpc1200) {
 		s->type		= COMEDI_SUBD_CALIB;
 		s->subdev_flags	= SDF_READABLE | SDF_WRITABLE | SDF_INTERNAL;
 		s->n_chan	= 16;
@@ -1686,7 +1672,7 @@
 
 	/* EEPROM */
 	s = &dev->subdevices[4];
-	if (board->register_layout == labpc_1200_layout) {
+	if (board->is_labpc1200) {
 		s->type		= COMEDI_SUBD_MEMORY;
 		s->subdev_flags	= SDF_READABLE | SDF_WRITABLE | SDF_INTERNAL;
 		s->n_chan	= EEPROM_SIZE;
@@ -1703,12 +1689,6 @@
 }
 EXPORT_SYMBOL_GPL(labpc_common_attach);
 
-void labpc_common_detach(struct comedi_device *dev)
-{
-	comedi_spriv_free(dev, 2);
-}
-EXPORT_SYMBOL_GPL(labpc_common_detach);
-
 #if IS_ENABLED(CONFIG_COMEDI_NI_LABPC_ISA)
 static int labpc_attach(struct comedi_device *dev, struct comedi_devconfig *it)
 {
@@ -1761,8 +1741,6 @@
 {
 	struct labpc_private *devpriv = dev->private;
 
-	labpc_common_detach(dev);
-
 	if (devpriv) {
 		kfree(devpriv->dma_buffer);
 		if (devpriv->dma_chan)

diff --git a/drivers/staging/comedi/drivers/ni_labpc.h b/drivers/staging/comedi/drivers/ni_labpc.h
index 4b691f5..486589f 100644
--- a/drivers/staging/comedi/drivers/ni_labpc.h
+++ b/drivers/staging/comedi/drivers/ni_labpc.h

@@ -14,11 +14,6 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 
 #ifndef _NI_LABPC_H
@@ -27,27 +22,17 @@
 #define EEPROM_SIZE	256	/*  256 byte eeprom */
 #define NUM_AO_CHAN	2	/*  boards have two analog output channels */
 
-enum labpc_register_layout { labpc_plus_layout, labpc_1200_layout };
 enum transfer_type { fifo_not_empty_transfer, fifo_half_full_transfer,
 	isa_dma_transfer
 };
 
 struct labpc_boardinfo {
 	const char *name;
-	int device_id;		/*  device id for pci and pcmcia boards */
-	int ai_speed;		/*  maximum input speed in nanoseconds */
-
-	/*  1200 has extra registers compared to pc+ */
-	enum labpc_register_layout register_layout;
-	int has_ao;		/*  has analog output true/false */
-	const struct comedi_lrange *ai_range_table;
-	const int *ai_range_code;
-
-	/*  board can auto scan up in ai channels, not just down */
-	unsigned ai_scan_up:1;
-
-	/* uses memory mapped io instead of ioports */
-	unsigned has_mmio:1;
+	int ai_speed;			/* maximum input speed in ns */
+	unsigned ai_scan_up:1;		/* can auto scan up in ai channels */
+	unsigned has_ao:1;		/* has analog outputs */
+	unsigned is_labpc1200:1;	/* has extra regs compared to pc+ */
+	unsigned has_mmio:1;		/* uses memory mapped io */
 };
 
 struct labpc_private {
@@ -101,9 +86,5 @@
 
 int labpc_common_attach(struct comedi_device *dev,
 			unsigned int irq, unsigned long isr_flags);
-void labpc_common_detach(struct comedi_device *dev);
-
-extern const int labpc_1200_ai_gain_bits[];
-extern const struct comedi_lrange range_labpc_1200_ai;
 
 #endif /* _NI_LABPC_H */

diff --git a/drivers/staging/comedi/drivers/ni_labpc_cs.c b/drivers/staging/comedi/drivers/ni_labpc_cs.c
index 9e3737c..ce67f4b 100644
--- a/drivers/staging/comedi/drivers/ni_labpc_cs.c
+++ b/drivers/staging/comedi/drivers/ni_labpc_cs.c

@@ -18,12 +18,6 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
-************************************************************************
 */
 /*
 Driver: ni_labpc_cs
@@ -76,12 +70,9 @@
 static const struct labpc_boardinfo labpc_cs_boards[] = {
 	{
 		.name			= "daqcard-1200",
-		.device_id		= 0x103,
 		.ai_speed		= 10000,
-		.register_layout	= labpc_1200_layout,
 		.has_ao			= 1,
-		.ai_range_table		= &range_labpc_1200_ai,
-		.ai_range_code		= labpc_1200_ai_gain_bits,
+		.is_labpc1200		= 1,
 	},
 };
 
@@ -113,17 +104,11 @@
 	return labpc_common_attach(dev, link->irq, IRQF_SHARED);
 }
 
-static void labpc_detach(struct comedi_device *dev)
-{
-	labpc_common_detach(dev);
-	comedi_pcmcia_disable(dev);
-}
-
 static struct comedi_driver driver_labpc_cs = {
 	.driver_name	= "ni_labpc_cs",
 	.module		= THIS_MODULE,
 	.auto_attach	= labpc_auto_attach,
-	.detach		= labpc_detach,
+	.detach		= comedi_pcmcia_disable,
 };
 
 static int labpc_cs_attach(struct pcmcia_device *link)

diff --git a/drivers/staging/comedi/drivers/ni_labpc_pci.c b/drivers/staging/comedi/drivers/ni_labpc_pci.c
index 8e916f8..6c79237 100644
--- a/drivers/staging/comedi/drivers/ni_labpc_pci.c
+++ b/drivers/staging/comedi/drivers/ni_labpc_pci.c

@@ -12,10 +12,6 @@
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
 /*
@@ -50,11 +46,9 @@
 	[BOARD_NI_PCI1200] = {
 		.name			= "ni_pci-1200",
 		.ai_speed		= 10000,
-		.register_layout	= labpc_1200_layout,
-		.has_ao			= 1,
-		.ai_range_table		= &range_labpc_1200_ai,
-		.ai_range_code		= labpc_1200_ai_gain_bits,
 		.ai_scan_up		= 1,
+		.has_ao			= 1,
+		.is_labpc1200		= 1,
 		.has_mmio		= 1,
 	},
 };
@@ -98,8 +92,6 @@
 {
 	struct labpc_private *devpriv = dev->private;
 
-	labpc_common_detach(dev);
-
 	if (devpriv && devpriv->mite) {
 		mite_unsetup(devpriv->mite);
 		mite_free(devpriv->mite);

diff --git a/drivers/staging/comedi/drivers/ni_mio_common.c b/drivers/staging/comedi/drivers/ni_mio_common.c
index 8c5dee9..3e9f544 100644
--- a/drivers/staging/comedi/drivers/ni_mio_common.c
+++ b/drivers/staging/comedi/drivers/ni_mio_common.c

@@ -15,11 +15,6 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 
 /*
@@ -4077,7 +4072,6 @@
 			ni_gpct_device_destroy(devpriv->counter_dev);
 		}
 	}
-	comedi_spriv_free(dev, NI_8255_DIO_SUBDEV);
 }
 
 static void init_ao_67xx(struct comedi_device *dev, struct comedi_subdevice *s)

diff --git a/drivers/staging/comedi/drivers/ni_mio_cs.c b/drivers/staging/comedi/drivers/ni_mio_cs.c
index 888be7b..f813f57 100644
--- a/drivers/staging/comedi/drivers/ni_mio_cs.c
+++ b/drivers/staging/comedi/drivers/ni_mio_cs.c

@@ -14,11 +14,6 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 /*
 Driver: ni_mio_cs

diff --git a/drivers/staging/comedi/drivers/ni_pcidio.c b/drivers/staging/comedi/drivers/ni_pcidio.c
index b5f340c..5b2f72e 100644
--- a/drivers/staging/comedi/drivers/ni_pcidio.c
+++ b/drivers/staging/comedi/drivers/ni_pcidio.c

@@ -14,11 +14,6 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 /*
 Driver: ni_pcidio
@@ -58,7 +53,6 @@
 #include <linux/delay.h>
 #include <linux/interrupt.h>
 #include <linux/sched.h>
-#include <linux/firmware.h>
 
 #include "../comedidev.h"
 
@@ -971,11 +965,13 @@
 	return 0;
 }
 
-static int pci_6534_load_fpga(struct comedi_device *dev, int fpga_index,
-			      const u8 *data, size_t data_len)
+static int pci_6534_load_fpga(struct comedi_device *dev,
+			      const u8 *data, size_t data_len,
+			      unsigned long context)
 {
 	struct nidio96_private *devpriv = dev->private;
 	static const int timeout = 1000;
+	int fpga_index = context;
 	int i;
 	size_t j;
 
@@ -1033,7 +1029,7 @@
 
 static int pci_6534_reset_fpga(struct comedi_device *dev, int fpga_index)
 {
-	return pci_6534_load_fpga(dev, fpga_index, NULL, 0);
+	return pci_6534_load_fpga(dev, NULL, 0, fpga_index);
 }
 
 static int pci_6534_reset_fpgas(struct comedi_device *dev)
@@ -1067,13 +1063,12 @@
 static int pci_6534_upload_firmware(struct comedi_device *dev)
 {
 	struct nidio96_private *devpriv = dev->private;
-	int ret;
-	const struct firmware *fw;
 	static const char *const fw_file[3] = {
 		FW_PCI_6534_SCARAB_DI,	/* loaded into scarab A for DI */
 		FW_PCI_6534_SCARAB_DO,	/* loaded into scarab B for DO */
 		FW_PCI_6534_MAIN,	/* loaded into main FPGA */
 	};
+	int ret;
 	int n;
 
 	ret = pci_6534_reset_fpgas(dev);
@@ -1081,14 +1076,11 @@
 		return ret;
 	/* load main FPGA first, then the two scarabs */
 	for (n = 2; n >= 0; n--) {
-		ret = request_firmware(&fw, fw_file[n],
-				       &devpriv->mite->pcidev->dev);
-		if (ret == 0) {
-			ret = pci_6534_load_fpga(dev, n, fw->data, fw->size);
-			if (ret == 0 && n == 2)
-				pci_6534_init_main_fpga(dev);
-			release_firmware(fw);
-		}
+		ret = comedi_load_firmware(dev, &devpriv->mite->pcidev->dev,
+					   fw_file[n],
+					   pci_6534_load_fpga, n);
+		if (ret == 0 && n == 2)
+			pci_6534_init_main_fpga(dev);
 		if (ret < 0)
 			break;
 	}

diff --git a/drivers/staging/comedi/drivers/ni_pcimio.c b/drivers/staging/comedi/drivers/ni_pcimio.c
index 634d023..35681ba 100644
--- a/drivers/staging/comedi/drivers/ni_pcimio.c
+++ b/drivers/staging/comedi/drivers/ni_pcimio.c

@@ -14,10 +14,6 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
 /*
 Driver: ni_pcimio

diff --git a/drivers/staging/comedi/drivers/ni_stc.h b/drivers/staging/comedi/drivers/ni_stc.h
index 0a613c0..11bf0aa 100644
--- a/drivers/staging/comedi/drivers/ni_stc.h
+++ b/drivers/staging/comedi/drivers/ni_stc.h

@@ -14,11 +14,6 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 
 /*

diff --git a/drivers/staging/comedi/drivers/ni_tio.c b/drivers/staging/comedi/drivers/ni_tio.c
index 2252877..f2cf76d 100644
--- a/drivers/staging/comedi/drivers/ni_tio.c
+++ b/drivers/staging/comedi/drivers/ni_tio.c

@@ -13,10 +13,6 @@
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program; if not, write to the Free Software
-  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
 
 /*

diff --git a/drivers/staging/comedi/drivers/ni_tio.h b/drivers/staging/comedi/drivers/ni_tio.h
index 8572996..7e13697 100644
--- a/drivers/staging/comedi/drivers/ni_tio.h
+++ b/drivers/staging/comedi/drivers/ni_tio.h

@@ -13,11 +13,6 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 
 #ifndef _COMEDI_NI_TIO_H

diff --git a/drivers/staging/comedi/drivers/ni_tio_internal.h b/drivers/staging/comedi/drivers/ni_tio_internal.h
index 5e00212..b009876 100644
--- a/drivers/staging/comedi/drivers/ni_tio_internal.h
+++ b/drivers/staging/comedi/drivers/ni_tio_internal.h

@@ -14,11 +14,6 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 
 #ifndef _COMEDI_NI_TIO_INTERNAL_H

diff --git a/drivers/staging/comedi/drivers/ni_tiocmd.c b/drivers/staging/comedi/drivers/ni_tiocmd.c
index 13747f3..cff50bc 100644
--- a/drivers/staging/comedi/drivers/ni_tiocmd.c
+++ b/drivers/staging/comedi/drivers/ni_tiocmd.c

@@ -13,10 +13,6 @@
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program; if not, write to the Free Software
-  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
 
 /*

diff --git a/drivers/staging/comedi/drivers/pcl711.c b/drivers/staging/comedi/drivers/pcl711.c
index 8be2a4c..7abf3f7 100644
--- a/drivers/staging/comedi/drivers/pcl711.c
+++ b/drivers/staging/comedi/drivers/pcl711.c

@@ -17,11 +17,6 @@
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
  */
 /*
 Driver: pcl711

diff --git a/drivers/staging/comedi/drivers/pcl724.c b/drivers/staging/comedi/drivers/pcl724.c
index 4f033d8..cea657c 100644
--- a/drivers/staging/comedi/drivers/pcl724.c
+++ b/drivers/staging/comedi/drivers/pcl724.c

@@ -1,42 +1,28 @@
 /*
-    comedi/drivers/pcl724.c
+ * pcl724.c
+ * Comedi driver for 8255 based ISA DIO boards
+ *
+ * Michal Dobes <dobes@tesnet.cz>
+ */
 
-    Michal Dobes <dobes@tesnet.cz>
-
-    hardware driver for Advantech cards:
-     card:   PCL-724, PCL-722, PCL-731
-     driver: pcl724,  pcl722,  pcl731
-    and ADLink cards:
-     card:   ACL-7122, ACL-7124, PET-48DIO
-     driver: acl7122,  acl7124,  pet48dio
-
-    Options for PCL-724, PCL-731, ACL-7124 and PET-48DIO:
-     [0] - IO Base
-
-    Options for PCL-722 and ACL-7122:
-     [0] - IO Base
-     [1] - IRQ (0=disable IRQ) IRQ isn't supported at this time!
-     [2] -number of DIO:
-	      0, 144: 144 DIO configuration
-	      1,  96:  96 DIO configuration
-*/
 /*
-Driver: pcl724
-Description: Advantech PCL-724, PCL-722, PCL-731 ADLink ACL-7122, ACL-7124,
-  PET-48DIO
-Author: Michal Dobes <dobes@tesnet.cz>
-Devices: [Advantech] PCL-724 (pcl724), PCL-722 (pcl722), PCL-731 (pcl731),
-  [ADLink] ACL-7122 (acl7122), ACL-7124 (acl7124), PET-48DIO (pet48dio)
-Status: untested
-
-This is driver for digital I/O boards PCL-722/724/731 with 144/24/48 DIO
-and for digital I/O boards ACL-7122/7124/PET-48DIO with 144/24/48 DIO.
-It need 8255.o for operations and only immediate mode is supported.
-See the source for configuration details.
-*/
-/*
- * check_driver overrides:
- *   struct comedi_insn
+ * Driver: pcl724
+ * Description: Comedi driver for 8255 based ISA DIO boards
+ * Devices: (Advantech) PCL-724 [pcl724]
+ *	    (Advantech) PCL-722 [pcl722]
+ *	    (Advantech) PCL-731 [pcl731]
+ *	    (ADLink) ACL-7122 [acl7122]
+ *	    (ADLink) ACL-7124 [acl7124]
+ *	    (ADLink) PET-48DIO [pet48dio]
+ * Author: Michal Dobes <dobes@tesnet.cz>
+ * Status: untested
+ *
+ * Configuration options:
+ *   [0] - IO Base
+ *   [1] - IRQ (not supported)
+ *   [2] - number of DIO (pcl722 and acl7122 boards)
+ *	   0, 144: 144 DIO configuration
+ *	   1,  96:  96 DIO configuration
  */
 
 #include "../comedidev.h"
@@ -46,40 +32,48 @@
 
 #include "8255.h"
 
-#define PCL722_SIZE    32
-#define PCL722_96_SIZE 16
-#define PCL724_SIZE     4
-#define PCL731_SIZE     8
-#define PET48_SIZE      2
-
 #define SIZE_8255	4
 
-/* #define PCL724_IRQ   1  no IRQ support now */
-
 struct pcl724_board {
-
-	const char *name;	/*  board name */
-	int dio;		/*  num of DIO */
-	int numofports;		/*  num of 8255 subdevices */
-	unsigned int IRQbits;	/*  allowed interrupts */
-	unsigned int io_range;	/*  len of IO space */
-	char can_have96;
-	char is_pet48;
+	const char *name;
+	unsigned int io_range;
+	unsigned int can_have96:1;
+	unsigned int is_pet48:1;
+	int numofports;
 };
 
-static int subdev_8255_cb(int dir, int port, int data, unsigned long arg)
-{
-	unsigned long iobase = arg;
+static const struct pcl724_board boardtypes[] = {
+	{
+		.name		= "pcl724",
+		.io_range	= 0x04,
+		.numofports	= 1,	/* 24 DIO channels */
+	}, {
+		.name		= "pcl722",
+		.io_range	= 0x20,
+		.can_have96	= 1,
+		.numofports	= 6,	/* 144 (or 96) DIO channels */
+	}, {
+		.name		= "pcl731",
+		.io_range	= 0x08,
+		.numofports	= 2,	/* 48 DIO channels */
+	}, {
+		.name		= "acl7122",
+		.io_range	= 0x20,
+		.can_have96	= 1,
+		.numofports	= 6,	/* 144 (or 96) DIO channels */
+	}, {
+		.name		= "acl7124",
+		.io_range	= 0x04,
+		.numofports	= 1,	/* 24 DIO channels */
+	}, {
+		.name		= "pet48dio",
+		.io_range	= 0x02,
+		.is_pet48	= 1,
+		.numofports	= 2,	/* 48 DIO channels */
+	},
+};
 
-	if (dir) {
-		outb(data, iobase + port);
-		return 0;
-	} else {
-		return inb(iobase + port);
-	}
-}
-
-static int subdev_8255mapped_cb(int dir, int port, int data,
+static int pcl724_8255mapped_io(int dir, int port, int data,
 				unsigned long iobase)
 {
 	int movport = SIZE_8255 * (iobase >> 12);
@@ -96,58 +90,31 @@
 	}
 }
 
-static int pcl724_attach(struct comedi_device *dev, struct comedi_devconfig *it)
+static int pcl724_attach(struct comedi_device *dev,
+			 struct comedi_devconfig *it)
 {
 	const struct pcl724_board *board = comedi_board(dev);
 	struct comedi_subdevice *s;
+	unsigned long iobase;
 	unsigned int iorange;
-	int ret, i, n_subdevices;
-#ifdef PCL724_IRQ
-	unsigned int irq;
-#endif
+	int n_subdevices;
+	int ret;
+	int i;
 
 	iorange = board->io_range;
-	if ((board->can_have96) &&
-	    ((it->options[1] == 1) || (it->options[1] == 96)))
-		iorange = PCL722_96_SIZE; /* PCL-724 in 96 DIO configuration */
+	n_subdevices = board->numofports;
+
+	/* Handle PCL-722/ACL-7122 in 96 DIO configuration */
+	if (board->can_have96 &&
+	    (it->options[2] == 1 || it->options[2] == 96)) {
+		iorange = 0x10;
+		n_subdevices = 4;
+	}
+
 	ret = comedi_request_region(dev, it->options[0], iorange);
 	if (ret)
 		return ret;
 
-#ifdef PCL724_IRQ
-	irq = 0;
-	if (board->IRQbits != 0) {	/* board support IRQ */
-		irq = it->options[1];
-		if (irq) {	/* we want to use IRQ */
-			if (((1 << irq) & board->IRQbits) == 0) {
-				printk(KERN_WARNING
-				       ", IRQ %u is out of allowed range, "
-				       "DISABLING IT", irq);
-				irq = 0;	/* Bad IRQ */
-			} else {
-				if (request_irq(irq, interrupt_pcl724, 0,
-					        dev->board_name, dev)) {
-					printk(KERN_WARNING
-					       ", unable to allocate IRQ %u, "
-					       "DISABLING IT", irq);
-					irq = 0;	/* Can't use IRQ */
-				} else {
-					printk(", irq=%u", irq);
-				}
-			}
-		}
-	}
-
-	dev->irq = irq;
-#endif
-
-	printk("\n");
-
-	n_subdevices = board->numofports;
-	if ((board->can_have96) && ((it->options[1] == 1)
-					 || (it->options[1] == 96)))
-		n_subdevices = 4;	/*  PCL-724 in 96 DIO configuration */
-
 	ret = comedi_alloc_subdevices(dev, n_subdevices);
 	if (ret)
 		return ret;
@@ -155,41 +122,25 @@
 	for (i = 0; i < dev->n_subdevices; i++) {
 		s = &dev->subdevices[i];
 		if (board->is_pet48) {
-			subdev_8255_init(dev, s, subdev_8255mapped_cb,
-					 (unsigned long)(dev->iobase +
-							 i * 0x1000));
-		} else
-			subdev_8255_init(dev, s, subdev_8255_cb,
-					 (unsigned long)(dev->iobase +
-							 SIZE_8255 * i));
+			iobase = dev->iobase + (i * 0x1000);
+			ret = subdev_8255_init(dev, s, pcl724_8255mapped_io,
+					       iobase);
+		} else {
+			iobase = dev->iobase + (i * SIZE_8255);
+			ret = subdev_8255_init(dev, s, NULL, iobase);
+		}
+		if (ret)
+			return ret;
 	}
 
 	return 0;
 }
 
-static void pcl724_detach(struct comedi_device *dev)
-{
-	int i;
-
-	for (i = 0; i < dev->n_subdevices; i++)
-		comedi_spriv_free(dev, i);
-	comedi_legacy_detach(dev);
-}
-
-static const struct pcl724_board boardtypes[] = {
-	{ "pcl724", 24, 1, 0x00fc, PCL724_SIZE, 0, 0, },
-	{ "pcl722", 144, 6, 0x00fc, PCL722_SIZE, 1, 0, },
-	{ "pcl731", 48, 2, 0x9cfc, PCL731_SIZE, 0, 0, },
-	{ "acl7122", 144, 6, 0x9ee8, PCL722_SIZE, 1, 0, },
-	{ "acl7124", 24, 1, 0x00fc, PCL724_SIZE, 0, 0, },
-	{ "pet48dio", 48, 2, 0x9eb8, PET48_SIZE, 0, 1, },
-};
-
 static struct comedi_driver pcl724_driver = {
 	.driver_name	= "pcl724",
 	.module		= THIS_MODULE,
 	.attach		= pcl724_attach,
-	.detach		= pcl724_detach,
+	.detach		= comedi_legacy_detach,
 	.board_name	= &boardtypes[0].name,
 	.num_names	= ARRAY_SIZE(boardtypes),
 	.offset		= sizeof(struct pcl724_board),
@@ -197,5 +148,5 @@
 module_comedi_driver(pcl724_driver);
 
 MODULE_AUTHOR("Comedi http://www.comedi.org");
-MODULE_DESCRIPTION("Comedi low-level driver");
+MODULE_DESCRIPTION("Comedi driver for 8255 based ISA DIO boards");
 MODULE_LICENSE("GPL");

diff --git a/drivers/staging/comedi/drivers/pcl725.c b/drivers/staging/comedi/drivers/pcl725.c
deleted file mode 100644
index 6b02f06..0000000
--- a/drivers/staging/comedi/drivers/pcl725.c
+++ /dev/null

@@ -1,91 +0,0 @@
-/*
- * comedi/drivers/pcl725.c
- * Driver for PCL725 and clones
- * David A. Schleef
- */
-/*
-Driver: pcl725
-Description: Advantech PCL-725 (& compatibles)
-Author: ds
-Status: unknown
-Devices: [Advantech] PCL-725 (pcl725)
-*/
-
-#include "../comedidev.h"
-
-#include <linux/ioport.h>
-
-#define PCL725_SIZE 2
-
-#define PCL725_DO 0
-#define PCL725_DI 1
-
-static int pcl725_do_insn(struct comedi_device *dev, struct comedi_subdevice *s,
-			  struct comedi_insn *insn, unsigned int *data)
-{
-	if (data[0]) {
-		s->state &= ~data[0];
-		s->state |= (data[0] & data[1]);
-		outb(s->state, dev->iobase + PCL725_DO);
-	}
-
-	data[1] = s->state;
-
-	return insn->n;
-}
-
-static int pcl725_di_insn(struct comedi_device *dev, struct comedi_subdevice *s,
-			  struct comedi_insn *insn, unsigned int *data)
-{
-	data[1] = inb(dev->iobase + PCL725_DI);
-
-	return insn->n;
-}
-
-static int pcl725_attach(struct comedi_device *dev, struct comedi_devconfig *it)
-{
-	struct comedi_subdevice *s;
-	int ret;
-
-	ret = comedi_request_region(dev, it->options[0], PCL725_SIZE);
-	if (ret)
-		return ret;
-
-	ret = comedi_alloc_subdevices(dev, 2);
-	if (ret)
-		return ret;
-
-	s = &dev->subdevices[0];
-	/* do */
-	s->type = COMEDI_SUBD_DO;
-	s->subdev_flags = SDF_WRITABLE;
-	s->maxdata = 1;
-	s->n_chan = 8;
-	s->insn_bits = pcl725_do_insn;
-	s->range_table = &range_digital;
-
-	s = &dev->subdevices[1];
-	/* di */
-	s->type = COMEDI_SUBD_DI;
-	s->subdev_flags = SDF_READABLE;
-	s->maxdata = 1;
-	s->n_chan = 8;
-	s->insn_bits = pcl725_di_insn;
-	s->range_table = &range_digital;
-
-	printk(KERN_INFO "\n");
-
-	return 0;
-}
-
-static struct comedi_driver pcl725_driver = {
-	.driver_name	= "pcl725",
-	.module		= THIS_MODULE,
-	.attach		= pcl725_attach,
-	.detach		= comedi_legacy_detach,
-};
-module_comedi_driver(pcl725_driver);
-
-MODULE_AUTHOR("Comedi http://www.comedi.org");
-MODULE_DESCRIPTION("Comedi low-level driver");
-MODULE_LICENSE("GPL");

diff --git a/drivers/staging/comedi/drivers/pcl726.c b/drivers/staging/comedi/drivers/pcl726.c
index 4aa9943..893f012 100644
--- a/drivers/staging/comedi/drivers/pcl726.c
+++ b/drivers/staging/comedi/drivers/pcl726.c

@@ -20,11 +20,6 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 /*
 Driver: pcl726

diff --git a/drivers/staging/comedi/drivers/pcl730.c b/drivers/staging/comedi/drivers/pcl730.c
index 2879db7..862e75f 100644
--- a/drivers/staging/comedi/drivers/pcl730.c
+++ b/drivers/staging/comedi/drivers/pcl730.c

@@ -3,135 +3,299 @@
  * Driver for Advantech PCL-730 and clones
  * José Luis Sánchez
  */
-/*
-Driver: pcl730
-Description: Advantech PCL-730 (& compatibles)
-Author: José Luis Sánchez (jsanchezv@teleline.es)
-Status: untested
-Devices: [Advantech] PCL-730 (pcl730), [ICP] ISO-730 (iso730),
-		 [Adlink] ACL-7130 (acl7130)
 
-Interrupts are not supported.
-The ACL-7130 card have an 8254 timer/counter not supported by this driver.
-*/
+/*
+ * Driver: pcl730
+ * Description: Advantech PCL-730 (& compatibles)
+ * Devices: (Advantech) PCL-730 [pcl730]
+ *	    (ICP) ISO-730 [iso730]
+ *	    (Adlink) ACL-7130 [acl7130]
+ *	    (Advantech) PCM-3730 [pcm3730]
+ *	    (Advantech) PCL-725 [pcl725]
+ *	    (ICP) P8R8-DIO [p8r8dio]
+ *	    (Adlink) ACL-7225b [acl7225b]
+ *	    (ICP) P16R16-DIO [p16r16dio]
+ *	    (Advantech) PCL-733 [pcl733]
+ *	    (Advantech) PCL-734 [pcl734]
+ * Author: José Luis Sánchez (jsanchezv@teleline.es)
+ * Status: untested
+ *
+ * Configuration options:
+ *   [0] - I/O port base
+ *
+ * Interrupts are not supported.
+ * The ACL-7130 card has an 8254 timer/counter not supported by this driver.
+ */
 
 #include "../comedidev.h"
 
 #include <linux/ioport.h>
 
-#define PCL730_SIZE		4
-#define ACL7130_SIZE	8
-#define PCL730_IDIO_LO	0	/* Isolated Digital I/O low byte (ID0-ID7) */
-#define PCL730_IDIO_HI	1	/* Isolated Digital I/O high byte (ID8-ID15) */
-#define PCL730_DIO_LO	2	/* TTL Digital I/O low byte (D0-D7) */
-#define PCL730_DIO_HI	3	/* TTL Digital I/O high byte (D8-D15) */
+/*
+ * Register map
+ *
+ * The register map varies slightly depending on the board type but
+ * all registers are 8-bit.
+ *
+ * The boardinfo 'io_range' is used to allow comedi to request the
+ * proper range required by the board.
+ *
+ * The comedi_subdevice 'private' data is used to pass the register
+ * offset to the (*insn_bits) functions to read/write the correct
+ * registers.
+ *
+ * The basic register mapping looks like this:
+ *
+ *     BASE+0  Isolated outputs 0-7 (write) / inputs 0-7 (read)
+ *     BASE+1  Isolated outputs 8-15 (write) / inputs 8-15 (read)
+ *     BASE+2  TTL outputs 0-7 (write) / inputs 0-7 (read)
+ *     BASE+3  TTL outputs 8-15 (write) / inputs 8-15 (read)
+ *
+ * The pcm3730 board does not have register BASE+1.
+ *
+ * The pcl725 and p8r8dio only have registers BASE+0 and BASE+1:
+ *
+ *     BASE+0  Isolated outputs 0-7 (write) (read back on p8r8dio)
+ *     BASE+1  Isolated inputs 0-7 (read)
+ *
+ * The acl7225b and p16r16dio boards have this register mapping:
+ *
+ *     BASE+0  Isolated outputs 0-7 (write) (read back)
+ *     BASE+1  Isolated outputs 8-15 (write) (read back)
+ *     BASE+2  Isolated inputs 0-7 (read)
+ *     BASE+3  Isolated inputs 8-15 (read)
+ *
+ * The pcl733 and pcl734 boards have this register mapping:
+ *
+ *     BASE+0  Isolated outputs 0-7 (write) or inputs 0-7 (read)
+ *     BASE+1  Isolated outputs 8-15 (write) or inputs 8-15 (read)
+ *     BASE+2  Isolated outputs 16-23 (write) or inputs 16-23 (read)
+ *     BASE+3  Isolated outputs 24-31 (write) or inputs 24-31 (read)
+ */
 
 struct pcl730_board {
-
-	const char *name;	/*  board name */
-	unsigned int io_range;	/*  len of I/O space */
+	const char *name;
+	unsigned int io_range;
+	unsigned is_pcl725:1;
+	unsigned is_acl7225b:1;
+	unsigned has_readback:1;
+	unsigned has_ttl_io:1;
+	int n_subdevs;
+	int n_iso_out_chan;
+	int n_iso_in_chan;
+	int n_ttl_chan;
 };
 
-static int pcl730_do_insn(struct comedi_device *dev, struct comedi_subdevice *s,
-			  struct comedi_insn *insn, unsigned int *data)
+/*
+ * Per-board capabilities.  has_ttl_io must be set for the attach code
+ * to create the TTL subdevices that n_subdevs/n_ttl_chan account for.
+ */
+static const struct pcl730_board pcl730_boards[] = {
+	{
+		.name		= "pcl730",
+		.io_range	= 0x04,
+		.has_ttl_io	= 1,
+		.n_subdevs	= 4,
+		.n_iso_out_chan	= 16,
+		.n_iso_in_chan	= 16,
+		.n_ttl_chan	= 16,
+	}, {
+		.name		= "iso730",
+		.io_range	= 0x04,
+		.has_ttl_io	= 1,
+		.n_subdevs	= 4,
+		.n_iso_out_chan	= 16,
+		.n_iso_in_chan	= 16,
+		.n_ttl_chan	= 16,
+	}, {
+		.name		= "acl7130",
+		.io_range	= 0x08,
+		.has_ttl_io	= 1,
+		.n_subdevs	= 4,
+		.n_iso_out_chan	= 16,
+		.n_iso_in_chan	= 16,
+		.n_ttl_chan	= 16,
+	}, {
+		.name		= "pcm3730",
+		.io_range	= 0x04,
+		.has_ttl_io	= 1,
+		.n_subdevs	= 4,
+		.n_iso_out_chan	= 8,
+		.n_iso_in_chan	= 8,
+		.n_ttl_chan	= 16,
+	}, {
+		.name		= "pcl725",
+		.io_range	= 0x02,
+		.is_pcl725	= 1,
+		.n_subdevs	= 2,
+		.n_iso_out_chan	= 8,
+		.n_iso_in_chan	= 8,
+	}, {
+		.name		= "p8r8dio",
+		.io_range	= 0x02,
+		.is_pcl725	= 1,
+		.has_readback	= 1,
+		.n_subdevs	= 2,
+		.n_iso_out_chan	= 8,
+		.n_iso_in_chan	= 8,
+	}, {
+		.name		= "acl7225b",
+		.io_range	= 0x08,		/* only 4 are used */
+		.is_acl7225b	= 1,
+		.has_readback	= 1,
+		.n_subdevs	= 2,
+		.n_iso_out_chan	= 16,
+		.n_iso_in_chan	= 16,
+	}, {
+		.name		= "p16r16dio",
+		.io_range	= 0x04,
+		.is_acl7225b	= 1,
+		.has_readback	= 1,
+		.n_subdevs	= 2,
+		.n_iso_out_chan	= 16,
+		.n_iso_in_chan	= 16,
+	}, {
+		.name		= "pcl733",
+		.io_range	= 0x04,
+		.n_subdevs	= 1,
+		.n_iso_in_chan	= 32,
+	}, {
+		.name		= "pcl734",
+		.io_range	= 0x04,
+		.n_subdevs	= 1,
+		.n_iso_out_chan	= 32,
+	},
+};
+
+static int pcl730_do_insn_bits(struct comedi_device *dev,
+			       struct comedi_subdevice *s,
+			       struct comedi_insn *insn,
+			       unsigned int *data)
 {
-	if (data[0]) {
-		s->state &= ~data[0];
-		s->state |= (data[0] & data[1]);
+	unsigned long reg = (unsigned long)s->private;
+	unsigned int mask = data[0];
+	unsigned int bits = data[1];
+
+	if (mask) {
+		s->state &= ~mask;
+		s->state |= (bits & mask);
+
+		if (mask & 0x00ff)
+			outb(s->state & 0xff, dev->iobase + reg);
+		if ((mask & 0xff00) && (s->n_chan > 8))
+			outb((s->state >> 8) & 0xff, dev->iobase + reg + 1);
+		if ((mask & 0xff0000) && (s->n_chan > 16))
+			outb((s->state >> 16) & 0xff, dev->iobase + reg + 2);
+		if ((mask & 0xff000000) && (s->n_chan > 24))
+			outb((s->state >> 24) & 0xff, dev->iobase + reg + 3);
 	}
-	if (data[0] & 0x00ff)
-		outb(s->state & 0xff,
-		     dev->iobase + ((unsigned long)s->private));
-	if (data[0] & 0xff00)
-		outb((s->state >> 8),
-		     dev->iobase + ((unsigned long)s->private) + 1);
 
 	data[1] = s->state;
 
 	return insn->n;
 }
 
-static int pcl730_di_insn(struct comedi_device *dev, struct comedi_subdevice *s,
-			  struct comedi_insn *insn, unsigned int *data)
+static unsigned int pcl730_get_bits(struct comedi_device *dev,
+				    struct comedi_subdevice *s)
 {
-	data[1] = inb(dev->iobase + ((unsigned long)s->private)) |
-	    (inb(dev->iobase + ((unsigned long)s->private) + 1) << 8);
+	unsigned long reg = (unsigned long)s->private;
+	unsigned int val;
+
+	val = inb(dev->iobase + reg);
+	if (s->n_chan > 8)
+		val |= (inb(dev->iobase + reg + 1) << 8);
+	if (s->n_chan > 16)
+		val |= (inb(dev->iobase + reg + 2) << 16);
+	if (s->n_chan > 24)
+		val |= (inb(dev->iobase + reg + 3) << 24);
+
+	return val;
+}
+
+static int pcl730_di_insn_bits(struct comedi_device *dev,
+			       struct comedi_subdevice *s,
+			       struct comedi_insn *insn,
+			       unsigned int *data)
+{
+	data[1] = pcl730_get_bits(dev, s);
 
 	return insn->n;
 }
 
-static int pcl730_attach(struct comedi_device *dev, struct comedi_devconfig *it)
+static int pcl730_attach(struct comedi_device *dev,
+			 struct comedi_devconfig *it)
 {
 	const struct pcl730_board *board = comedi_board(dev);
 	struct comedi_subdevice *s;
+	int subdev;
 	int ret;
 
 	ret = comedi_request_region(dev, it->options[0], board->io_range);
 	if (ret)
 		return ret;
 
-	ret = comedi_alloc_subdevices(dev, 4);
+	ret = comedi_alloc_subdevices(dev, board->n_subdevs);
 	if (ret)
 		return ret;
 
-	s = &dev->subdevices[0];
-	/* Isolated do */
-	s->type = COMEDI_SUBD_DO;
-	s->subdev_flags = SDF_WRITABLE;
-	s->maxdata = 1;
-	s->n_chan = 16;
-	s->insn_bits = pcl730_do_insn;
-	s->range_table = &range_digital;
-	s->private = (void *)PCL730_IDIO_LO;
+	subdev = 0;
 
-	s = &dev->subdevices[1];
-	/* Isolated di */
-	s->type = COMEDI_SUBD_DI;
-	s->subdev_flags = SDF_READABLE;
-	s->maxdata = 1;
-	s->n_chan = 16;
-	s->insn_bits = pcl730_di_insn;
-	s->range_table = &range_digital;
-	s->private = (void *)PCL730_IDIO_LO;
+	if (board->n_iso_out_chan) {
+		/* Isolated Digital Outputs */
+		s = &dev->subdevices[subdev++];
+		s->type		= COMEDI_SUBD_DO;
+		s->subdev_flags	= SDF_WRITABLE;
+		s->n_chan	= board->n_iso_out_chan;
+		s->maxdata	= 1;
+		s->range_table	= &range_digital;
+		s->insn_bits	= pcl730_do_insn_bits;
+		s->private	= (void *)0;
 
-	s = &dev->subdevices[2];
-	/* TTL do */
-	s->type = COMEDI_SUBD_DO;
-	s->subdev_flags = SDF_WRITABLE;
-	s->maxdata = 1;
-	s->n_chan = 16;
-	s->insn_bits = pcl730_do_insn;
-	s->range_table = &range_digital;
-	s->private = (void *)PCL730_DIO_LO;
+		/* get the initial state if supported */
+		if (board->has_readback)
+			s->state = pcl730_get_bits(dev, s);
+	}
 
-	s = &dev->subdevices[3];
-	/* TTL di */
-	s->type = COMEDI_SUBD_DI;
-	s->subdev_flags = SDF_READABLE;
-	s->maxdata = 1;
-	s->n_chan = 16;
-	s->insn_bits = pcl730_di_insn;
-	s->range_table = &range_digital;
-	s->private = (void *)PCL730_DIO_LO;
+	if (board->n_iso_in_chan) {
+		/* Isolated Digital Inputs */
+		s = &dev->subdevices[subdev++];
+		s->type		= COMEDI_SUBD_DI;
+		s->subdev_flags	= SDF_READABLE;
+		s->n_chan	= board->n_iso_in_chan;
+		s->maxdata	= 1;
+		s->range_table	= &range_digital;
+		s->insn_bits	= pcl730_di_insn_bits;
+		s->private	= board->is_acl7225b ? (void *)2 :
+				  board->is_pcl725 ? (void *)1 : (void *)0;
+	}
 
-	printk(KERN_INFO "\n");
+	if (board->has_ttl_io) {
+		/* TTL Digital Outputs */
+		s = &dev->subdevices[subdev++];
+		s->type		= COMEDI_SUBD_DO;
+		s->subdev_flags	= SDF_WRITABLE;
+		s->n_chan	= board->n_ttl_chan;
+		s->maxdata	= 1;
+		s->range_table	= &range_digital;
+		s->insn_bits	= pcl730_do_insn_bits;
+		s->private	= (void *)2;
+
+		/* TTL Digital Inputs */
+		s = &dev->subdevices[subdev++];
+		s->type		= COMEDI_SUBD_DI;
+		s->subdev_flags	= SDF_READABLE;
+		s->n_chan	= board->n_ttl_chan;
+		s->maxdata	= 1;
+		s->range_table	= &range_digital;
+		s->insn_bits	= pcl730_di_insn_bits;
+		s->private	= (void *)2;
+	}
 
 	return 0;
 }
 
-static const struct pcl730_board boardtypes[] = {
-	{ "pcl730", PCL730_SIZE, },
-	{ "iso730", PCL730_SIZE, },
-	{ "acl7130", ACL7130_SIZE, },
-};
-
 static struct comedi_driver pcl730_driver = {
 	.driver_name	= "pcl730",
 	.module		= THIS_MODULE,
 	.attach		= pcl730_attach,
 	.detach		= comedi_legacy_detach,
-	.board_name	= &boardtypes[0].name,
-	.num_names	= ARRAY_SIZE(boardtypes),
+	.board_name	= &pcl730_boards[0].name,
+	.num_names	= ARRAY_SIZE(pcl730_boards),
 	.offset		= sizeof(struct pcl730_board),
 };
 module_comedi_driver(pcl730_driver);

diff --git a/drivers/staging/comedi/drivers/pcm3724.c b/drivers/staging/comedi/drivers/pcm3724.c
index 4ef0df30b..5a9cd38 100644
--- a/drivers/staging/comedi/drivers/pcm3724.c
+++ b/drivers/staging/comedi/drivers/pcm3724.c

@@ -250,20 +250,11 @@
 	return 0;
 }
 
-static void pcm3724_detach(struct comedi_device *dev)
-{
-	int i;
-
-	for (i = 0; i < dev->n_subdevices; i++)
-		comedi_spriv_free(dev, i);
-	comedi_legacy_detach(dev);
-}
-
 static struct comedi_driver pcm3724_driver = {
 	.driver_name	= "pcm3724",
 	.module		= THIS_MODULE,
 	.attach		= pcm3724_attach,
-	.detach		= pcm3724_detach,
+	.detach		= comedi_legacy_detach,
 };
 module_comedi_driver(pcm3724_driver);
 

diff --git a/drivers/staging/comedi/drivers/pcm3730.c b/drivers/staging/comedi/drivers/pcm3730.c
deleted file mode 100644
index 3a3ce2c7..0000000
--- a/drivers/staging/comedi/drivers/pcm3730.c
+++ /dev/null

@@ -1,136 +0,0 @@
-/*
- * comedi/drivers/pcm3730.c
- * Driver for PCM3730 and clones
- * Blaine Lee
- * from pcl725 by David S.
- */
-/*
-Driver: pcm3730
-Description: PCM3730
-Author: Blaine Lee
-Devices: [Advantech] PCM-3730 (pcm3730)
-Status: unknown
-
-Configuration options:
-  [0] - I/O port base
-*/
-
-#include "../comedidev.h"
-
-#include <linux/ioport.h>
-
-#define PCM3730_SIZE 4		/*  consecutive io port addresses */
-
-#define PCM3730_DOA 0		/*  offsets for each port */
-#define PCM3730_DOB 2
-#define PCM3730_DOC 3
-#define PCM3730_DIA 0
-#define PCM3730_DIB 2
-#define PCM3730_DIC 3
-
-static int pcm3730_do_insn_bits(struct comedi_device *dev,
-				struct comedi_subdevice *s,
-				struct comedi_insn *insn, unsigned int *data)
-{
-	if (data[0]) {
-		s->state &= ~data[0];
-		s->state |= (data[0] & data[1]);
-		outb(s->state, dev->iobase + (unsigned long)(s->private));
-	}
-	data[1] = s->state;
-
-	return insn->n;
-}
-
-static int pcm3730_di_insn_bits(struct comedi_device *dev,
-				struct comedi_subdevice *s,
-				struct comedi_insn *insn, unsigned int *data)
-{
-	data[1] = inb(dev->iobase + (unsigned long)(s->private));
-	return insn->n;
-}
-
-static int pcm3730_attach(struct comedi_device *dev,
-			  struct comedi_devconfig *it)
-{
-	struct comedi_subdevice *s;
-	int ret;
-
-	ret = comedi_request_region(dev, it->options[0], PCM3730_SIZE);
-	if (ret)
-		return ret;
-
-	ret = comedi_alloc_subdevices(dev, 6);
-	if (ret)
-		return ret;
-
-	s = &dev->subdevices[0];
-	s->type = COMEDI_SUBD_DO;
-	s->subdev_flags = SDF_WRITABLE;
-	s->maxdata = 1;
-	s->n_chan = 8;
-	s->insn_bits = pcm3730_do_insn_bits;
-	s->range_table = &range_digital;
-	s->private = (void *)PCM3730_DOA;
-
-	s = &dev->subdevices[1];
-	s->type = COMEDI_SUBD_DO;
-	s->subdev_flags = SDF_WRITABLE;
-	s->maxdata = 1;
-	s->n_chan = 8;
-	s->insn_bits = pcm3730_do_insn_bits;
-	s->range_table = &range_digital;
-	s->private = (void *)PCM3730_DOB;
-
-	s = &dev->subdevices[2];
-	s->type = COMEDI_SUBD_DO;
-	s->subdev_flags = SDF_WRITABLE;
-	s->maxdata = 1;
-	s->n_chan = 8;
-	s->insn_bits = pcm3730_do_insn_bits;
-	s->range_table = &range_digital;
-	s->private = (void *)PCM3730_DOC;
-
-	s = &dev->subdevices[3];
-	s->type = COMEDI_SUBD_DI;
-	s->subdev_flags = SDF_READABLE;
-	s->maxdata = 1;
-	s->n_chan = 8;
-	s->insn_bits = pcm3730_di_insn_bits;
-	s->range_table = &range_digital;
-	s->private = (void *)PCM3730_DIA;
-
-	s = &dev->subdevices[4];
-	s->type = COMEDI_SUBD_DI;
-	s->subdev_flags = SDF_READABLE;
-	s->maxdata = 1;
-	s->n_chan = 8;
-	s->insn_bits = pcm3730_di_insn_bits;
-	s->range_table = &range_digital;
-	s->private = (void *)PCM3730_DIB;
-
-	s = &dev->subdevices[5];
-	s->type = COMEDI_SUBD_DI;
-	s->subdev_flags = SDF_READABLE;
-	s->maxdata = 1;
-	s->n_chan = 8;
-	s->insn_bits = pcm3730_di_insn_bits;
-	s->range_table = &range_digital;
-	s->private = (void *)PCM3730_DIC;
-
-	printk(KERN_INFO "\n");
-
-	return 0;
-}
-
-static struct comedi_driver pcm3730_driver = {
-	.driver_name	= "pcm3730",
-	.module		= THIS_MODULE,
-	.attach		= pcm3730_attach,
-	.detach		= comedi_legacy_detach,
-};
-module_comedi_driver(pcm3730_driver);
-
-MODULE_AUTHOR("Comedi http://www.comedi.org");
-MODULE_DESCRIPTION("Comedi low-level driver");
-MODULE_LICENSE("GPL");

diff --git a/drivers/staging/comedi/drivers/pcmad.c b/drivers/staging/comedi/drivers/pcmad.c
index b7c932e..d5c728d 100644
--- a/drivers/staging/comedi/drivers/pcmad.c
+++ b/drivers/staging/comedi/drivers/pcmad.c

@@ -1,53 +1,45 @@
 /*
-    comedi/drivers/pcmad.c
-    Hardware driver for Winsystems PCM-A/D12 and PCM-A/D16
+ * pcmad.c
+ * Hardware driver for Winsystems PCM-A/D12 and PCM-A/D16
+ *
+ * COMEDI - Linux Control and Measurement Device Interface
+ * Copyright (C) 2000,2001 David A. Schleef <ds@schleef.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
 
-    COMEDI - Linux Control and Measurement Device Interface
-    Copyright (C) 2000,2001 David A. Schleef <ds@schleef.org>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
-*/
 /*
-Driver: pcmad
-Description: Winsystems PCM-A/D12, PCM-A/D16
-Author: ds
-Devices: [Winsystems] PCM-A/D12 (pcmad12), PCM-A/D16 (pcmad16)
-Status: untested
+ * Driver: pcmad
+ * Description: Winsystems PCM-A/D12, PCM-A/D16
+ * Devices: (Winsystems) PCM-A/D12 [pcmad12]
+ *	    (Winsystems) PCM-A/D16 [pcmad16]
+ * Author: ds
+ * Status: untested
+ *
+ * This driver was written on a bet that I couldn't write a driver
+ * in less than 2 hours.  I won the bet, but never got paid.  =(
+ *
+ * Configuration options:
+ *   [0] - I/O port base
+ *   [1] - IRQ (unused)
+ *   [2] - Analog input reference (must match jumpers)
+ *	   0 = single-ended (16 channels)
+ *	   1 = differential (8 channels)
+ *   [3] - Analog input encoding (must match jumpers)
+ *	   0 = straight binary (0-5V input range)
+ *	   1 = two's complement (+-10V input range)
+ */
 
-This driver was written on a bet that I couldn't write a driver
-in less than 2 hours.  I won the bet, but never got paid.  =(
-
-Configuration options:
-  [0] - I/O port base
-  [1] - unused
-  [2] - Analog input reference
-	0 = single ended
-	1 = differential
-  [3] - Analog input encoding (must match jumpers)
-	0 = straight binary
-	1 = two's complement
-*/
-
-#include <linux/interrupt.h>
 #include "../comedidev.h"
 
-#include <linux/ioport.h>
-
-#define PCMAD_SIZE		4
-
 #define PCMAD_STATUS		0
 #define PCMAD_LSB		1
 #define PCMAD_MSB		2
@@ -55,60 +47,82 @@
 
 struct pcmad_board_struct {
 	const char *name;
-	int n_ai_bits;
+	unsigned int ai_maxdata;
 };
 
-struct pcmad_priv_struct {
-	int differential;
-	int twos_comp;
+static const struct pcmad_board_struct pcmad_boards[] = {
+	{
+		.name		= "pcmad12",
+		.ai_maxdata	= 0x0fff,
+	}, {
+		.name		= "pcmad16",
+		.ai_maxdata	= 0xffff,
+	},
 };
 
 #define TIMEOUT	100
 
-static int pcmad_ai_insn_read(struct comedi_device *dev,
-			      struct comedi_subdevice *s,
-			      struct comedi_insn *insn, unsigned int *data)
+static int pcmad_ai_wait_for_eoc(struct comedi_device *dev,
+				 int timeout)
 {
-	const struct pcmad_board_struct *board = comedi_board(dev);
-	struct pcmad_priv_struct *devpriv = dev->private;
 	int i;
-	int chan;
-	int n;
 
-	chan = CR_CHAN(insn->chanspec);
-
-	for (n = 0; n < insn->n; n++) {
-		outb(chan, dev->iobase + PCMAD_CONVERT);
-
-		for (i = 0; i < TIMEOUT; i++) {
-			if ((inb(dev->iobase + PCMAD_STATUS) & 0x3) == 0x3)
-				break;
-		}
-		data[n] = inb(dev->iobase + PCMAD_LSB);
-		data[n] |= (inb(dev->iobase + PCMAD_MSB) << 8);
-
-		if (devpriv->twos_comp)
-			data[n] ^= (1 << (board->n_ai_bits - 1));
+	for (i = 0; i < timeout; i++) {
+		if ((inb(dev->iobase + PCMAD_STATUS) & 0x3) == 0x3)
+			return 0;
 	}
-
-	return n;
+	return -ETIME;
 }
 
-/*
- * options:
- * 0	i/o base
- * 1	unused
- * 2	0=single ended 1=differential
- * 3	0=straight binary 1=two's comp
- */
+static bool pcmad_range_is_bipolar(struct comedi_subdevice *s,
+				   unsigned int range)
+{
+	return s->range_table->range[range].min < 0;
+}
+
+static int pcmad_ai_insn_read(struct comedi_device *dev,
+			      struct comedi_subdevice *s,
+			      struct comedi_insn *insn,
+			      unsigned int *data)
+{
+	unsigned int chan = CR_CHAN(insn->chanspec);
+	unsigned int range = CR_RANGE(insn->chanspec);
+	unsigned int val;
+	int ret;
+	int i;
+
+	for (i = 0; i < insn->n; i++) {
+		outb(chan, dev->iobase + PCMAD_CONVERT);
+
+		ret = pcmad_ai_wait_for_eoc(dev, TIMEOUT);
+		if (ret)
+			return ret;
+
+		val = inb(dev->iobase + PCMAD_LSB) |
+		      (inb(dev->iobase + PCMAD_MSB) << 8);
+
+		/* data is shifted on the pcmad12, fix it */
+		if (s->maxdata == 0x0fff)
+			val >>= 4;
+
+		if (pcmad_range_is_bipolar(s, range)) {
+			/* munge the two's complement value */
+			val ^= ((s->maxdata + 1) >> 1);
+		}
+
+		data[i] = val;
+	}
+
+	return insn->n;
+}
+
 static int pcmad_attach(struct comedi_device *dev, struct comedi_devconfig *it)
 {
 	const struct pcmad_board_struct *board = comedi_board(dev);
-	struct pcmad_priv_struct *devpriv;
 	struct comedi_subdevice *s;
 	int ret;
 
-	ret = comedi_request_region(dev, it->options[0], PCMAD_SIZE);
+	ret = comedi_request_region(dev, it->options[0], 0x04);
 	if (ret)
 		return ret;
 
@@ -116,32 +130,25 @@
 	if (ret)
 		return ret;
 
-	devpriv = kzalloc(sizeof(*devpriv), GFP_KERNEL);
-	if (!devpriv)
-		return -ENOMEM;
-	dev->private = devpriv;
-
 	s = &dev->subdevices[0];
-	s->type = COMEDI_SUBD_AI;
-	s->subdev_flags = SDF_READABLE | AREF_GROUND;
-	s->n_chan = 16;		/* XXX */
-	s->len_chanlist = 1;
-	s->insn_read = pcmad_ai_insn_read;
-	s->maxdata = (1 << board->n_ai_bits) - 1;
-	s->range_table = &range_unknown;
+	s->type		= COMEDI_SUBD_AI;
+	if (it->options[1]) {
+		/* 8 differential channels */
+		s->subdev_flags	= SDF_READABLE | AREF_DIFF;
+		s->n_chan	= 8;
+	} else {
+		/* 16 single-ended channels */
+		s->subdev_flags	= SDF_READABLE | AREF_GROUND;
+		s->n_chan	= 16;
+	}
+	s->len_chanlist	= 1;
+	s->maxdata	= board->ai_maxdata;
+	s->range_table	= it->options[2] ? &range_bipolar10 : &range_unipolar5;
+	s->insn_read	= pcmad_ai_insn_read;
 
 	return 0;
 }
 
-static const struct pcmad_board_struct pcmad_boards[] = {
-	{
-		.name		= "pcmad12",
-		.n_ai_bits	= 12,
-	}, {
-		.name		= "pcmad16",
-		.n_ai_bits	= 16,
-	},
-};
 static struct comedi_driver pcmad_driver = {
 	.driver_name	= "pcmad",
 	.module		= THIS_MODULE,

diff --git a/drivers/staging/comedi/drivers/pcmda12.c b/drivers/staging/comedi/drivers/pcmda12.c
index 61e7fd1..774a63d 100644
--- a/drivers/staging/comedi/drivers/pcmda12.c
+++ b/drivers/staging/comedi/drivers/pcmda12.c

@@ -1,152 +1,130 @@
 /*
-    comedi/drivers/pcmda12.c
-    Driver for Winsystems PC-104 based PCM-D/A-12 8-channel AO board.
+ * pcmda12.c
+ * Driver for Winsystems PC-104 based PCM-D/A-12 8-channel AO board.
+ *
+ * COMEDI - Linux Control and Measurement Device Interface
+ * Copyright (C) 2006 Calin A. Culianu <calin@ajvar.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
 
-    COMEDI - Linux Control and Measurement Device Interface
-    Copyright (C) 2006 Calin A. Culianu <calin@ajvar.org>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-*/
 /*
-Driver: pcmda12
-Description: A driver for the Winsystems PCM-D/A-12
-Devices: [Winsystems] PCM-D/A-12 (pcmda12)
-Author: Calin Culianu <calin@ajvar.org>
-Updated: Fri, 13 Jan 2006 12:01:01 -0500
-Status: works
-
-A driver for the relatively straightforward-to-program PCM-D/A-12.
-This board doesn't support commands, and the only way to set its
-analog output range is to jumper the board.  As such,
-comedi_data_write() ignores the range value specified.
-
-The board uses 16 consecutive I/O addresses starting at the I/O port
-base address.  Each address corresponds to the LSB then MSB of a
-particular channel from 0-7.
-
-Note that the board is not ISA-PNP capable and thus
-needs the I/O port comedi_config parameter.
-
-Note that passing a nonzero value as the second config option will
-enable "simultaneous xfer" mode for this board, in which AO writes
-will not take effect until a subsequent read of any AO channel.  This
-is so that one can speed up programming by preloading all AO registers
-with values before simultaneously setting them to take effect with one
-read command.
-
-Configuration Options:
-  [0] - I/O port base address
-  [1] - Do Simultaneous Xfer (see description)
-*/
+ * Driver: pcmda12
+ * Description: A driver for the Winsystems PCM-D/A-12
+ * Devices: (Winsystems) PCM-D/A-12 [pcmda12]
+ * Author: Calin Culianu <calin@ajvar.org>
+ * Updated: Fri, 13 Jan 2006 12:01:01 -0500
+ * Status: works
+ *
+ * A driver for the relatively straightforward-to-program PCM-D/A-12.
+ * This board doesn't support commands, and the only way to set its
+ * analog output range is to jumper the board. As such,
+ * comedi_data_write() ignores the range value specified.
+ *
+ * The board uses 16 consecutive I/O addresses starting at the I/O port
+ * base address. Each address corresponds to the LSB then MSB of a
+ * particular channel from 0-7.
+ *
+ * Note that the board is not ISA-PNP capable and thus needs the I/O
+ * port comedi_config parameter.
+ *
+ * Note that passing a nonzero value as the second config option will
+ * enable "simultaneous xfer" mode for this board, in which AO writes
+ * will not take effect until a subsequent read of any AO channel. This
+ * is so that one can speed up programming by preloading all AO registers
+ * with values before simultaneously setting them to take effect with one
+ * read command.
+ *
+ * Configuration Options:
+ *   [0] - I/O port base address
+ *   [1] - Do Simultaneous Xfer (see description)
+ */
 
 #include "../comedidev.h"
 
-#define CHANS 8
-#define IOSIZE 16
-#define LSB(x) ((unsigned char)((x) & 0xff))
-#define MSB(x) ((unsigned char)((((unsigned short)(x))>>8) & 0xff))
-#define LSB_PORT(chan) (dev->iobase + (chan)*2)
-#define MSB_PORT(chan) (LSB_PORT(chan)+1)
-#define BITS 12
-
-/* note these have no effect and are merely here for reference..
-   these are configured by jumpering the board! */
+/* AO range is not configurable, it's set by jumpers on the board */
 static const struct comedi_lrange pcmda12_ranges = {
-	3,
-	{
-	 UNI_RANGE(5), UNI_RANGE(10), BIP_RANGE(5)
-	 }
+	3, {
+		UNI_RANGE(5),
+		UNI_RANGE(10),
+		BIP_RANGE(5)
+	}
 };
 
 struct pcmda12_private {
-
-	unsigned int ao_readback[CHANS];
+	unsigned int ao_readback[8];
 	int simultaneous_xfer_mode;
 };
 
-static void zero_chans(struct comedi_device *dev)
-{				/* sets up an
-				   ASIC chip to defaults */
-	int i;
-	for (i = 0; i < CHANS; ++i) {
-/*      /\* do this as one instruction?? *\/ */
-/*      outw(0, LSB_PORT(chan)); */
-		outb(0, LSB_PORT(i));
-		outb(0, MSB_PORT(i));
-	}
-	inb(LSB_PORT(0));	/* update chans. */
-}
-
-static int ao_winsn(struct comedi_device *dev, struct comedi_subdevice *s,
-		    struct comedi_insn *insn, unsigned int *data)
+static int pcmda12_ao_insn_write(struct comedi_device *dev,
+				 struct comedi_subdevice *s,
+				 struct comedi_insn *insn,
+				 unsigned int *data)
 {
 	struct pcmda12_private *devpriv = dev->private;
+	unsigned int chan = CR_CHAN(insn->chanspec);
+	unsigned int val = devpriv->ao_readback[chan];
+	unsigned long ioreg = dev->iobase + (chan * 2);
 	int i;
-	int chan = CR_CHAN(insn->chanspec);
 
-	/* Writing a list of values to an AO channel is probably not
-	 * very useful, but that's how the interface is defined. */
 	for (i = 0; i < insn->n; ++i) {
+		val = data[i];
+		outb(val & 0xff, ioreg);
+		outb((val >> 8) & 0xff, ioreg + 1);
 
-/*      /\* do this as one instruction?? *\/ */
-/*      outw(data[i], LSB_PORT(chan)); */
-
-		/* Need to do this as two instructions due to 8-bit bus?? */
-		/*  first, load the low byte */
-		outb(LSB(data[i]), LSB_PORT(chan));
-		/*  next, write the high byte */
-		outb(MSB(data[i]), MSB_PORT(chan));
-
-		/* save shadow register */
-		devpriv->ao_readback[chan] = data[i];
-
+		/*
+		 * Initiate transfer if not in simultaneous xfer
+		 * mode by reading one of the AO registers.
+		 */
 		if (!devpriv->simultaneous_xfer_mode)
-			inb(LSB_PORT(chan));
+			inb(ioreg);
 	}
+	devpriv->ao_readback[chan] = val;
 
-	/* return the number of samples written */
-	return i;
+	return insn->n;
 }
 
-/* AO subdevices should have a read insn as well as a write insn.
-
-   Usually this means copying a value stored in devpriv->ao_readback.
-   However, since this driver supports simultaneous xfer then sometimes
-   this function actually accomplishes work.
-
-   Simultaneaous xfer mode is accomplished by loading ALL the values
-   you want for AO in all the channels, then READing off one of the AO
-   registers to initiate the instantaneous simultaneous update of all
-   DAC outputs, which makes all AO channels update simultaneously.
-   This is useful for some control applications, I would imagine.
-*/
-static int ao_rinsn(struct comedi_device *dev, struct comedi_subdevice *s,
-		    struct comedi_insn *insn, unsigned int *data)
+static int pcmda12_ao_insn_read(struct comedi_device *dev,
+				struct comedi_subdevice *s,
+				struct comedi_insn *insn,
+				unsigned int *data)
 {
 	struct pcmda12_private *devpriv = dev->private;
+	unsigned int chan = CR_CHAN(insn->chanspec);
 	int i;
-	int chan = CR_CHAN(insn->chanspec);
 
-	for (i = 0; i < insn->n; i++) {
-		if (devpriv->simultaneous_xfer_mode)
-			inb(LSB_PORT(chan));
-		/* read back shadow register */
+	/*
+	 * Initiate simultaneous xfer mode by reading one of the
+	 * AO registers. All analog outputs will then be updated.
+	 */
+	if (devpriv->simultaneous_xfer_mode)
+		inb(dev->iobase);
+
+	for (i = 0; i < insn->n; i++)
 		data[i] = devpriv->ao_readback[chan];
-	}
 
-	return i;
+	return insn->n;
+}
+
+static void pcmda12_ao_reset(struct comedi_device *dev,
+			     struct comedi_subdevice *s)
+{
+	int i;
+
+	for (i = 0; i < s->n_chan; ++i) {
+		outb(0, dev->iobase + (i * 2));
+		outb(0, dev->iobase + (i * 2) + 1);
+	}
+	/* Initiate transfer by reading one of the AO registers. */
+	inb(dev->iobase);
 }
 
 static int pcmda12_attach(struct comedi_device *dev,
@@ -156,7 +134,7 @@
 	struct comedi_subdevice *s;
 	int ret;
 
-	ret = comedi_request_region(dev, it->options[0], IOSIZE);
+	ret = comedi_request_region(dev, it->options[0], 0x10);
 	if (ret)
 		return ret;
 
@@ -172,18 +150,17 @@
 		return ret;
 
 	s = &dev->subdevices[0];
-	s->private = NULL;
-	s->maxdata = (0x1 << BITS) - 1;
-	s->range_table = &pcmda12_ranges;
-	s->type = COMEDI_SUBD_AO;
-	s->subdev_flags = SDF_READABLE | SDF_WRITABLE;
-	s->n_chan = CHANS;
-	s->insn_write = &ao_winsn;
-	s->insn_read = &ao_rinsn;
+	s->type		= COMEDI_SUBD_AO;
+	s->subdev_flags	= SDF_READABLE | SDF_WRITABLE;
+	s->n_chan	= 8;
+	s->maxdata	= 0x0fff;
+	s->range_table	= &pcmda12_ranges;
+	s->insn_write	= pcmda12_ao_insn_write;
+	s->insn_read	= pcmda12_ao_insn_read;
 
-	zero_chans(dev);	/* clear out all the registers, basically */
+	pcmda12_ao_reset(dev, s);
 
-	return 1;
+	return 0;
 }
 
 static struct comedi_driver pcmda12_driver = {

diff --git a/drivers/staging/comedi/drivers/pcmmio.c b/drivers/staging/comedi/drivers/pcmmio.c
index 5a236cd..9f76b1f 100644
--- a/drivers/staging/comedi/drivers/pcmmio.c
+++ b/drivers/staging/comedi/drivers/pcmmio.c

@@ -14,10 +14,6 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
 /*
 Driver: pcmmio

diff --git a/drivers/staging/comedi/drivers/pcmuio.c b/drivers/staging/comedi/drivers/pcmuio.c
index 0c98e26..c43b633 100644
--- a/drivers/staging/comedi/drivers/pcmuio.c
+++ b/drivers/staging/comedi/drivers/pcmuio.c

@@ -1,79 +1,77 @@
 /*
-    comedi/drivers/pcmuio.c
-    Driver for Winsystems PC-104 based 48-channel and 96-channel DIO boards.
+ * pcmuio.c
+ * Comedi driver for Winsystems PC-104 based 48/96-channel DIO boards.
+ *
+ * COMEDI - Linux Control and Measurement Device Interface
+ * Copyright (C) 2006 Calin A. Culianu <calin@ajvar.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
 
-    COMEDI - Linux Control and Measurement Device Interface
-    Copyright (C) 2006 Calin A. Culianu <calin@ajvar.org>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-*/
 /*
-Driver: pcmuio
-Description: A driver for the PCM-UIO48A and PCM-UIO96A boards from Winsystems.
-Devices: [Winsystems] PCM-UIO48A (pcmuio48), PCM-UIO96A (pcmuio96)
-Author: Calin Culianu <calin@ajvar.org>
-Updated: Fri, 13 Jan 2006 12:01:01 -0500
-Status: works
-
-A driver for the relatively straightforward-to-program PCM-UIO48A and
-PCM-UIO96A boards from Winsystems.  These boards use either one or two
-(in the 96-DIO version) WS16C48 ASIC HighDensity I/O Chips (HDIO).
-This chip is interesting in that each I/O line is individually
-programmable for INPUT or OUTPUT (thus comedi_dio_config can be done
-on a per-channel basis).  Also, each chip supports edge-triggered
-interrupts for the first 24 I/O lines.  Of course, since the
-96-channel version of the board has two ASICs, it can detect polarity
-changes on up to 48 I/O lines.  Since this is essentially an (non-PnP)
-ISA board, I/O Address and IRQ selection are done through jumpers on
-the board.  You need to pass that information to this driver as the
-first and second comedi_config option, respectively.  Note that the
-48-channel version uses 16 bytes of IO memory and the 96-channel
-version uses 32-bytes (in case you are worried about conflicts).  The
-48-channel board is split into two 24-channel comedi subdevices.
-The 96-channel board is split into 4 24-channel DIO subdevices.
-
-Note that IRQ support has been added, but it is untested.
-
-To use edge-detection IRQ support, pass the IRQs of both ASICS
-(for the 96 channel version) or just 1 ASIC (for 48-channel version).
-Then, use use comedi_commands with TRIG_NOW.
-Your callback will be called each time an edge is triggered, and the data
-values will be two sample_t's, which should be concatenated to form one
-32-bit unsigned int.  This value is the mask of channels that had
-edges detected from your channel list.  Note that the bits positions
-in the mask correspond to positions in your chanlist when you specified
-the command and *not* channel id's!
-
-To set the polarity of the edge-detection interrupts pass a nonzero value for
-either CR_RANGE or CR_AREF for edge-up polarity, or a zero value for both
-CR_RANGE and CR_AREF if you want edge-down polarity.
-
-In the 48-channel version:
-
-On subdev 0, the first 24 channels channels are edge-detect channels.
-
-In the 96-channel board you have the collowing channels that can do edge detection:
-
-subdev 0, channels 0-24  (first 24 channels of 1st ASIC)
-subdev 2, channels 0-24  (first 24 channels of 2nd ASIC)
-
-Configuration Options:
-  [0] - I/O port base address
-  [1] - IRQ (for first ASIC, or first 24 channels)
-  [2] - IRQ for second ASIC (pcmuio96 only - IRQ for chans 48-72 .. can be the same as first irq!)
-*/
+ * Driver: pcmuio
+ * Description: Winsystems PC-104 based 48/96-channel DIO boards.
+ * Devices: (Winsystems) PCM-UIO48A [pcmuio48]
+ *	    (Winsystems) PCM-UIO96A [pcmuio96]
+ * Author: Calin Culianu <calin@ajvar.org>
+ * Updated: Fri, 13 Jan 2006 12:01:01 -0500
+ * Status: works
+ *
+ * A driver for the relatively straightforward-to-program PCM-UIO48A and
+ * PCM-UIO96A boards from Winsystems. These boards use either one or two
+ * (in the 96-DIO version) WS16C48 ASIC HighDensity I/O Chips (HDIO). This
+ * chip is interesting in that each I/O line is individually programmable
+ * for INPUT or OUTPUT (thus comedi_dio_config can be done on a per-channel
+ * basis). Also, each chip supports edge-triggered interrupts for the first
+ * 24 I/O lines. Of course, since the 96-channel version of the board has
+ * two ASICs, it can detect polarity changes on up to 48 I/O lines. Since
+ * this is essentially a (non-PnP) ISA board, I/O Address and IRQ selection
+ * are done through jumpers on the board. You need to pass that information
+ * to this driver as the first and second comedi_config option, respectively.
+ * Note that the 48-channel version uses 16 bytes of IO memory and the 96-
+ * channel version uses 32-bytes (in case you are worried about conflicts).
+ * The 48-channel board is split into two 24-channel comedi subdevices. The
+ * 96-channel board is split into 4 24-channel DIO subdevices.
+ *
+ * Note that IRQ support has been added, but it is untested.
+ *
+ * To use edge-detection IRQ support, pass the IRQs of both ASICS (for the
+ * 96 channel version) or just 1 ASIC (for 48-channel version). Then, use
+ * comedi_commands with TRIG_NOW. Your callback will be called each time an
+ * edge is triggered, and the data values will be two sample_t's, which
+ * should be concatenated to form one 32-bit unsigned int.  This value is
+ * the mask of channels that had edges detected from your channel list. Note
+ * that the bit positions in the mask correspond to positions in your
+ * chanlist when you specified the command and *not* channel id's!
+ *
+ * To set the polarity of the edge-detection interrupts pass a nonzero value
+ * for either CR_RANGE or CR_AREF for edge-up polarity, or a zero value for
+ * both CR_RANGE and CR_AREF if you want edge-down polarity.
+ *
+ * In the 48-channel version:
+ *
+ * On subdev 0, the first 24 channels are edge-detect channels.
+ *
+ * In the 96-channel board you have the following channels that can do edge
+ * detection:
+ *
+ * subdev 0, channels 0-23  (first 24 channels of 1st ASIC)
+ * subdev 2, channels 0-23  (first 24 channels of 2nd ASIC)
+ *
+ * Configuration Options:
+ *  [0] - I/O port base address
+ *  [1] - IRQ (for first ASIC, or first 24 channels)
+ *  [2] - IRQ (for second ASIC, pcmuio96 only - IRQ for chans 48-71
+ *             can be the same as first irq!)
+ */
 
 #include <linux/interrupt.h>
 #include <linux/slab.h>
@@ -82,94 +80,62 @@
 
 #include "comedi_fc.h"
 
-#define CHANS_PER_PORT   8
-#define PORTS_PER_ASIC   6
-#define INTR_PORTS_PER_ASIC   3
-#define MAX_CHANS_PER_SUBDEV 24	/* number of channels per comedi subdevice */
-#define PORTS_PER_SUBDEV (MAX_CHANS_PER_SUBDEV/CHANS_PER_PORT)
-#define CHANS_PER_ASIC (CHANS_PER_PORT*PORTS_PER_ASIC)
-#define INTR_CHANS_PER_ASIC 24
-#define INTR_PORTS_PER_SUBDEV (INTR_CHANS_PER_ASIC/CHANS_PER_PORT)
-#define MAX_DIO_CHANS   (PORTS_PER_ASIC*2*CHANS_PER_PORT)
-#define MAX_ASICS       (MAX_DIO_CHANS/CHANS_PER_ASIC)
-#define CALC_N_SUBDEVS(nchans) ((nchans)/MAX_CHANS_PER_SUBDEV + (!!((nchans)%MAX_CHANS_PER_SUBDEV)) /*+ (nchans > INTR_CHANS_PER_ASIC ? 2 : 1)*/)
-/* IO Memory sizes */
-#define ASIC_IOSIZE (0x10)
-#define PCMUIO48_IOSIZE ASIC_IOSIZE
-#define PCMUIO96_IOSIZE (ASIC_IOSIZE*2)
-
-/* Some offsets - these are all in the 16byte IO memory offset from
-   the base address.  Note that there is a paging scheme to swap out
-   offsets 0x8-0xA using the PAGELOCK register.  See the table below.
-
-  Register(s)       Pages        R/W?        Description
-  --------------------------------------------------------------
-  REG_PORTx         All          R/W         Read/Write/Configure IO
-  REG_INT_PENDING   All          ReadOnly    Quickly see which INT_IDx has int.
-  REG_PAGELOCK      All          WriteOnly   Select a page
-  REG_POLx          Pg. 1 only   WriteOnly   Select edge-detection polarity
-  REG_ENABx         Pg. 2 only   WriteOnly   Enable/Disable edge-detect. int.
-  REG_INT_IDx       Pg. 3 only   R/W         See which ports/bits have ints.
- */
-#define REG_PORT0 0x0
-#define REG_PORT1 0x1
-#define REG_PORT2 0x2
-#define REG_PORT3 0x3
-#define REG_PORT4 0x4
-#define REG_PORT5 0x5
-#define REG_INT_PENDING 0x6
-#define REG_PAGELOCK 0x7	/* page selector register, upper 2 bits select a page
-				   and bits 0-5 are used to 'lock down' a particular
-				   port above to make it readonly.  */
-#define REG_POL0 0x8
-#define REG_POL1 0x9
-#define REG_POL2 0xA
-#define REG_ENAB0 0x8
-#define REG_ENAB1 0x9
-#define REG_ENAB2 0xA
-#define REG_INT_ID0 0x8
-#define REG_INT_ID1 0x9
-#define REG_INT_ID2 0xA
-
-#define NUM_PAGED_REGS 3
-#define NUM_PAGES 4
-#define FIRST_PAGED_REG 0x8
-#define REG_PAGE_BITOFFSET 6
-#define REG_LOCK_BITOFFSET 0
-#define REG_PAGE_MASK (~((0x1<<REG_PAGE_BITOFFSET)-1))
-#define REG_LOCK_MASK ~(REG_PAGE_MASK)
-#define PAGE_POL 1
-#define PAGE_ENAB 2
-#define PAGE_INT_ID 3
-
 /*
- * Board descriptions for two imaginary boards.  Describing the
- * boards in this way is optional, and completely driver-dependent.
- * Some drivers use arrays such as this, other do not.
+ * Register I/O map
+ *
+ * Offset    Page 0       Page 1       Page 2       Page 3
+ * ------  -----------  -----------  -----------  -----------
+ *  0x00   Port 0 I/O   Port 0 I/O   Port 0 I/O   Port 0 I/O
+ *  0x01   Port 1 I/O   Port 1 I/O   Port 1 I/O   Port 1 I/O
+ *  0x02   Port 2 I/O   Port 2 I/O   Port 2 I/O   Port 2 I/O
+ *  0x03   Port 3 I/O   Port 3 I/O   Port 3 I/O   Port 3 I/O
+ *  0x04   Port 4 I/O   Port 4 I/O   Port 4 I/O   Port 4 I/O
+ *  0x05   Port 5 I/O   Port 5 I/O   Port 5 I/O   Port 5 I/O
+ *  0x06   INT_PENDING  INT_PENDING  INT_PENDING  INT_PENDING
+ *  0x07    Page/Lock    Page/Lock    Page/Lock    Page/Lock
+ *  0x08       N/A         POL_0       ENAB_0       INT_ID0
+ *  0x09       N/A         POL_1       ENAB_1       INT_ID1
+ *  0x0a       N/A         POL_2       ENAB_2       INT_ID2
  */
+#define PCMUIO_PORT_REG(x)		(0x00 + (x))
+#define PCMUIO_INT_PENDING_REG		0x06
+#define PCMUIO_PAGE_LOCK_REG		0x07
+#define PCMUIO_LOCK_PORT(x)		((1 << (x)) & 0x3f)
+#define PCMUIO_PAGE(x)			(((x) & 0x3) << 6)
+#define PCMUIO_PAGE_MASK		PCMUIO_PAGE(3)
+#define PCMUIO_PAGE_POL			1
+#define PCMUIO_PAGE_ENAB		2
+#define PCMUIO_PAGE_INT_ID		3
+#define PCMUIO_PAGE_REG(x)		(0x08 + (x))
+
+#define PCMUIO_ASIC_IOSIZE		0x10
+#define PCMUIO_MAX_ASICS		2
+
 struct pcmuio_board {
 	const char *name;
 	const int num_asics;
-	const int num_channels_per_port;
-	const int num_ports;
 };
 
-/* this structure is for data unique to this subdevice.  */
-struct pcmuio_subdev_private {
-	/* mapping of halfwords (bytes) in port/chanarray to iobase */
-	unsigned long iobases[PORTS_PER_SUBDEV];
+static const struct pcmuio_board pcmuio_boards[] = {
+	{
+		.name		= "pcmuio48",
+		.num_asics	= 1,
+	}, {
+		.name		= "pcmuio96",
+		.num_asics	= 2,
+	},
+};
 
+struct pcmuio_subdev_private {
 	/* The below is only used for intr subdevices */
 	struct {
-		int asic;	/* if non-negative, this subdev has an interrupt asic */
-		int first_chan;	/* if nonnegative, the first channel id for
-				   interrupts. */
-		int num_asic_chans;	/* the number of asic channels in this subdev
-					   that have interrutps */
-		int asic_chan;	/* if nonnegative, the first channel id with
-				   respect to the asic that has interrupts */
-		int enabled_mask;	/* subdev-relative channel mask for channels
-					   we are interested in */
+		/* if non-negative, this subdev has an interrupt asic */
+		int asic;
+		/*
+		 * subdev-relative channel mask for channels
+		 * we are interested in
+		 */
+		int enabled_mask;
 		int active;
 		int stop_count;
 		int continuous;
@@ -177,160 +143,112 @@
 	} intr;
 };
 
-/* this structure is for data unique to this hardware driver.  If
-   several hardware drivers keep similar information in this structure,
-   feel free to suggest moving the variable to the struct comedi_device struct.  */
 struct pcmuio_private {
 	struct {
-		unsigned char pagelock;	/* current page and lock */
-		unsigned char pol[NUM_PAGED_REGS];	/* shadow of POLx registers */
-		unsigned char enab[NUM_PAGED_REGS];	/* shadow of ENABx registers */
-		int num;
-		unsigned long iobase;
 		unsigned int irq;
 		spinlock_t spinlock;
-	} asics[MAX_ASICS];
+	} asics[PCMUIO_MAX_ASICS];
 	struct pcmuio_subdev_private *sprivs;
 };
 
-#define subpriv ((struct pcmuio_subdev_private *)s->private)
+static void pcmuio_write(struct comedi_device *dev, unsigned int val,
+			 int asic, int page, int port)
+{
+	unsigned long iobase = dev->iobase + (asic * PCMUIO_ASIC_IOSIZE);
 
-/* DIO devices are slightly special.  Although it is possible to
- * implement the insn_read/insn_write interface, it is much more
- * useful to applications if you implement the insn_bits interface.
- * This allows packed reading/writing of the DIO channels.  The
- * comedi core can convert between insn_bits and insn_read/write */
+	if (page == 0) {
+		/* Port registers are valid for any page */
+		outb(val & 0xff, iobase + PCMUIO_PORT_REG(port + 0));
+		outb((val >> 8) & 0xff, iobase + PCMUIO_PORT_REG(port + 1));
+		outb((val >> 16) & 0xff, iobase + PCMUIO_PORT_REG(port + 2));
+	} else {
+		outb(PCMUIO_PAGE(page), iobase + PCMUIO_PAGE_LOCK_REG);
+		outb(val & 0xff, iobase + PCMUIO_PAGE_REG(0));
+		outb((val >> 8) & 0xff, iobase + PCMUIO_PAGE_REG(1));
+		outb((val >> 16) & 0xff, iobase + PCMUIO_PAGE_REG(2));
+	}
+}
+
+static unsigned int pcmuio_read(struct comedi_device *dev,
+				int asic, int page, int port)
+{
+	unsigned long iobase = dev->iobase + (asic * PCMUIO_ASIC_IOSIZE);
+	unsigned int val;
+
+	if (page == 0) {
+		/* Port registers are valid for any page */
+		val = inb(iobase + PCMUIO_PORT_REG(port + 0));
+		val |= (inb(iobase + PCMUIO_PORT_REG(port + 1)) << 8);
+		val |= (inb(iobase + PCMUIO_PORT_REG(port + 2)) << 16);
+	} else {
+		outb(PCMUIO_PAGE(page), iobase + PCMUIO_PAGE_LOCK_REG);
+		val = inb(iobase + PCMUIO_PAGE_REG(0));
+		val |= (inb(iobase + PCMUIO_PAGE_REG(1)) << 8);
+		val |= (inb(iobase + PCMUIO_PAGE_REG(2)) << 16);
+	}
+
+	return val;
+}
+
+/*
+ * Each channel can be individually programmed for input or output.
+ * Writing a '0' to a channel causes the corresponding output pin
+ * to go to a high-z state (pulled high by an external 10K resistor).
+ * This allows it to be used as an input. When used in the input mode,
+ * a read reflects the inverted state of the I/O pin, such that a
+ * high on the pin will read as a '0' in the register. Writing a '1'
+ * to a bit position causes the pin to sink current (up to 12mA),
+ * effectively pulling it low.
+ */
 static int pcmuio_dio_insn_bits(struct comedi_device *dev,
 				struct comedi_subdevice *s,
 				struct comedi_insn *insn, unsigned int *data)
 {
-	int byte_no;
+	unsigned int mask = data[0] & s->io_bits;	/* outputs only */
+	unsigned int bits = data[1];
+	int asic = s->index / 2;
+	int port = (s->index % 2) ? 3 : 0;
+	unsigned int val;
 
-	/* NOTE:
-	   reading a 0 means this channel was high
-	   writine a 0 sets the channel high
-	   reading a 1 means this channel was low
-	   writing a 1 means set this channel low
+	/* get inverted state of the channels from the port */
+	val = pcmuio_read(dev, asic, 0, port);
 
-	   Therefore everything is always inverted. */
+	/* get the true state of the channels */
+	s->state = val ^ ((0x1 << s->n_chan) - 1);
 
-	/* The insn data is a mask in data[0] and the new data
-	 * in data[1], each channel cooresponding to a bit. */
+	if (mask) {
+		s->state &= ~mask;
+		s->state |= (mask & bits);
 
-#ifdef DAMMIT_ITS_BROKEN
-	/* DEBUG */
-	dev_dbg(dev->class_dev, "write mask: %08x  data: %08x\n", data[0],
-		data[1]);
-#endif
-
-	s->state = 0;
-
-	for (byte_no = 0; byte_no < s->n_chan / CHANS_PER_PORT; ++byte_no) {
-		/* address of 8-bit port */
-		unsigned long ioaddr = subpriv->iobases[byte_no],
-		    /* bit offset of port in 32-bit doubleword */
-		    offset = byte_no * 8;
-		/* this 8-bit port's data */
-		unsigned char byte = 0,
-		    /* The write mask for this port (if any) */
-		    write_mask_byte = (data[0] >> offset) & 0xff,
-		    /* The data byte for this port */
-		    data_byte = (data[1] >> offset) & 0xff;
-
-		byte = inb(ioaddr);	/* read all 8-bits for this port */
-
-#ifdef DAMMIT_ITS_BROKEN
-		/* DEBUG */
-		printk
-		    ("byte %d wmb %02x db %02x offset %02d io %04x, data_in %02x ",
-		     byte_no, (unsigned)write_mask_byte, (unsigned)data_byte,
-		     offset, ioaddr, (unsigned)byte);
-#endif
-
-		if (write_mask_byte) {
-			/* this byte has some write_bits -- so set the output lines */
-			byte &= ~write_mask_byte;	/* clear bits for write mask */
-			byte |= ~data_byte & write_mask_byte;	/* set to inverted data_byte */
-			/* Write out the new digital output state */
-			outb(byte, ioaddr);
-		}
-#ifdef DAMMIT_ITS_BROKEN
-		/* DEBUG */
-		dev_dbg(dev->class_dev, "data_out_byte %02x\n", (unsigned)byte);
-#endif
-		/* save the digital input lines for this byte.. */
-		s->state |= ((unsigned int)byte) << offset;
+		/* invert the state and update the channels */
+		val = s->state ^ ((0x1 << s->n_chan) - 1);
+		pcmuio_write(dev, val, asic, 0, port);
 	}
 
-	/* now return the DIO lines to data[1] - note they came inverted! */
-	data[1] = ~s->state;
-
-#ifdef DAMMIT_ITS_BROKEN
-	/* DEBUG */
-	dev_dbg(dev->class_dev, "s->state %08x data_out %08x\n", s->state,
-		data[1]);
-#endif
+	data[1] = s->state;
 
 	return insn->n;
 }
 
-/* The input or output configuration of each digital line is
- * configured by a special insn_config instruction.  chanspec
- * contains the channel to be changed, and data[0] contains the
- * value COMEDI_INPUT or COMEDI_OUTPUT. */
 static int pcmuio_dio_insn_config(struct comedi_device *dev,
 				  struct comedi_subdevice *s,
 				  struct comedi_insn *insn, unsigned int *data)
 {
-	int chan = CR_CHAN(insn->chanspec), byte_no = chan / 8, bit_no =
-	    chan % 8;
-	unsigned long ioaddr;
-	unsigned char byte;
-
-	/* Compute ioaddr for this channel */
-	ioaddr = subpriv->iobases[byte_no];
-
-	/* NOTE:
-	   writing a 0 an IO channel's bit sets the channel to INPUT
-	   and pulls the line high as well
-
-	   writing a 1 to an IO channel's  bit pulls the line low
-
-	   All channels are implicitly always in OUTPUT mode -- but when
-	   they are high they can be considered to be in INPUT mode..
-
-	   Thus, we only force channels low if the config request was INPUT,
-	   otherwise we do nothing to the hardware.    */
+	unsigned int chan_mask = 1 << CR_CHAN(insn->chanspec);
+	int asic = s->index / 2;
+	int port = (s->index % 2) ? 3 : 0;
 
 	switch (data[0]) {
 	case INSN_CONFIG_DIO_OUTPUT:
-		/* save to io_bits -- don't actually do anything since
-		   all input channels are also output channels... */
-		s->io_bits |= 1 << chan;
+		s->io_bits |= chan_mask;
 		break;
 	case INSN_CONFIG_DIO_INPUT:
-		/* write a 0 to the actual register representing the channel
-		   to set it to 'input'.  0 means "float high". */
-		byte = inb(ioaddr);
-		byte &= ~(1 << bit_no);
-				/**< set input channel to '0' */
-
-		/* write out byte -- this is the only time we actually affect the
-		   hardware as all channels are implicitly output -- but input
-		   channels are set to float-high */
-		outb(byte, ioaddr);
-
-		/* save to io_bits */
-		s->io_bits &= ~(1 << chan);
+		s->io_bits &= ~chan_mask;
+		pcmuio_write(dev, s->io_bits, asic, 0, port);
 		break;
-
 	case INSN_CONFIG_DIO_QUERY:
-		/* retrieve from shadow register */
-		data[1] =
-		    (s->io_bits & (1 << chan)) ? COMEDI_OUTPUT : COMEDI_INPUT;
-		return insn->n;
+		data[1] = (s->io_bits & chan_mask) ? COMEDI_OUTPUT : COMEDI_INPUT;
 		break;
-
 	default:
 		return -EINVAL;
 		break;
@@ -339,100 +257,28 @@
 	return insn->n;
 }
 
-static void switch_page(struct comedi_device *dev, int asic, int page)
+static void pcmuio_reset(struct comedi_device *dev)
 {
 	const struct pcmuio_board *board = comedi_board(dev);
-	struct pcmuio_private *devpriv = dev->private;
-
-	if (asic < 0 || asic >= board->num_asics)
-		return;		/* paranoia */
-	if (page < 0 || page >= NUM_PAGES)
-		return;		/* more paranoia */
-
-	devpriv->asics[asic].pagelock &= ~REG_PAGE_MASK;
-	devpriv->asics[asic].pagelock |= page << REG_PAGE_BITOFFSET;
-
-	/* now write out the shadow register */
-	outb(devpriv->asics[asic].pagelock,
-	     dev->iobase + ASIC_IOSIZE * asic + REG_PAGELOCK);
-}
-
-static void init_asics(struct comedi_device *dev)
-{				/* sets up an
-				   ASIC chip to defaults */
-	const struct pcmuio_board *board = comedi_board(dev);
 	int asic;
 
 	for (asic = 0; asic < board->num_asics; ++asic) {
-		int port, page;
-		unsigned long baseaddr = dev->iobase + asic * ASIC_IOSIZE;
-
-		switch_page(dev, asic, 0);	/* switch back to page 0 */
-
 		/* first, clear all the DIO port bits */
-		for (port = 0; port < PORTS_PER_ASIC; ++port)
-			outb(0, baseaddr + REG_PORT0 + port);
+		pcmuio_write(dev, 0, asic, 0, 0);
+		pcmuio_write(dev, 0, asic, 0, 3);
 
 		/* Next, clear all the paged registers for each page */
-		for (page = 1; page < NUM_PAGES; ++page) {
-			int reg;
-			/* now clear all the paged registers */
-			switch_page(dev, asic, page);
-			for (reg = FIRST_PAGED_REG;
-			     reg < FIRST_PAGED_REG + NUM_PAGED_REGS; ++reg)
-				outb(0, baseaddr + reg);
-		}
-
-		/* DEBUG  set rising edge interrupts on port0 of both asics */
-		/*switch_page(dev, asic, PAGE_POL);
-		   outb(0xff, baseaddr + REG_POL0);
-		   switch_page(dev, asic, PAGE_ENAB);
-		   outb(0xff, baseaddr + REG_ENAB0); */
-		/* END DEBUG */
-
-		switch_page(dev, asic, 0);	/* switch back to default page 0 */
-
+		pcmuio_write(dev, 0, asic, PCMUIO_PAGE_POL, 0);
+		pcmuio_write(dev, 0, asic, PCMUIO_PAGE_ENAB, 0);
+		pcmuio_write(dev, 0, asic, PCMUIO_PAGE_INT_ID, 0);
 	}
 }
 
-#ifdef notused
-static void lock_port(struct comedi_device *dev, int asic, int port)
-{
-	const struct pcmuio_board *board = comedi_board(dev);
-	struct pcmuio_private *devpriv = dev->private;
-
-	if (asic < 0 || asic >= board->num_asics)
-		return;		/* paranoia */
-	if (port < 0 || port >= PORTS_PER_ASIC)
-		return;		/* more paranoia */
-
-	devpriv->asics[asic].pagelock |= 0x1 << port;
-	/* now write out the shadow register */
-	outb(devpriv->asics[asic].pagelock,
-	     dev->iobase + ASIC_IOSIZE * asic + REG_PAGELOCK);
-}
-
-static void unlock_port(struct comedi_device *dev, int asic, int port)
-{
-	const struct pcmuio_board *board = comedi_board(dev);
-	struct pcmuio_private *devpriv = dev->private;
-
-	if (asic < 0 || asic >= board->num_asics)
-		return;		/* paranoia */
-	if (port < 0 || port >= PORTS_PER_ASIC)
-		return;		/* more paranoia */
-	devpriv->asics[asic].pagelock &= ~(0x1 << port) | REG_LOCK_MASK;
-	/* now write out the shadow register */
-	outb(devpriv->asics[asic].pagelock,
-	     dev->iobase + ASIC_IOSIZE * asic + REG_PAGELOCK);
-}
-#endif /* notused */
-
 static void pcmuio_stop_intr(struct comedi_device *dev,
 			     struct comedi_subdevice *s)
 {
-	int nports, firstport, asic, port;
-	struct pcmuio_private *devpriv = dev->private;
+	struct pcmuio_subdev_private *subpriv = s->private;
+	int asic;
 
 	asic = subpriv->intr.asic;
 	if (asic < 0)
@@ -441,161 +287,124 @@
 	subpriv->intr.enabled_mask = 0;
 	subpriv->intr.active = 0;
 	s->async->inttrig = NULL;
-	nports = subpriv->intr.num_asic_chans / CHANS_PER_PORT;
-	firstport = subpriv->intr.asic_chan / CHANS_PER_PORT;
-	switch_page(dev, asic, PAGE_ENAB);
-	for (port = firstport; port < firstport + nports; ++port) {
-		/* disable all intrs for this subdev.. */
-		outb(0, devpriv->asics[asic].iobase + REG_ENAB0 + port);
-	}
+
+	/* disable all intrs for this subdev.. */
+	pcmuio_write(dev, 0, asic, PCMUIO_PAGE_ENAB, 0);
 }
 
-static irqreturn_t interrupt_pcmuio(int irq, void *d)
+static void pcmuio_handle_intr_subdev(struct comedi_device *dev,
+				      struct comedi_subdevice *s,
+				      unsigned triggered)
 {
-	int asic, got1 = 0;
-	struct comedi_device *dev = (struct comedi_device *)d;
+	struct pcmuio_subdev_private *subpriv = s->private;
+	unsigned int len = s->async->cmd.chanlist_len;
+	unsigned oldevents = s->async->events;
+	unsigned int val = 0;
+	unsigned long flags;
+	unsigned mytrig;
+	unsigned int i;
+
+	spin_lock_irqsave(&subpriv->intr.spinlock, flags);
+
+	if (!subpriv->intr.active)
+		goto done;
+
+	mytrig = triggered;
+	mytrig &= ((0x1 << s->n_chan) - 1);
+
+	if (!(mytrig & subpriv->intr.enabled_mask))
+		goto done;
+
+	for (i = 0; i < len; i++) {
+		unsigned int chan = CR_CHAN(s->async->cmd.chanlist[i]);
+		if (mytrig & (1U << chan))
+			val |= (1U << i);
+	}
+
+	/* Write the scan to the buffer. */
+	if (comedi_buf_put(s->async, ((short *)&val)[0]) &&
+	    comedi_buf_put(s->async, ((short *)&val)[1])) {
+		s->async->events |= (COMEDI_CB_BLOCK | COMEDI_CB_EOS);
+	} else {
+		/* Overflow! Stop acquisition!! */
+		/* TODO: STOP_ACQUISITION_CALL_HERE!! */
+		pcmuio_stop_intr(dev, s);
+	}
+
+	/* Check for end of acquisition. */
+	if (!subpriv->intr.continuous) {
+		/* stop_src == TRIG_COUNT */
+		if (subpriv->intr.stop_count > 0) {
+			subpriv->intr.stop_count--;
+			if (subpriv->intr.stop_count == 0) {
+				s->async->events |= COMEDI_CB_EOA;
+				/* TODO: STOP_ACQUISITION_CALL_HERE!! */
+				pcmuio_stop_intr(dev, s);
+			}
+		}
+	}
+
+done:
+	spin_unlock_irqrestore(&subpriv->intr.spinlock, flags);
+
+	if (oldevents != s->async->events)
+		comedi_event(dev, s);
+}
+
+static int pcmuio_handle_asic_interrupt(struct comedi_device *dev, int asic)
+{
 	struct pcmuio_private *devpriv = dev->private;
+	struct pcmuio_subdev_private *subpriv;
+	unsigned long iobase = dev->iobase + (asic * PCMUIO_ASIC_IOSIZE);
+	unsigned int triggered = 0;
+	int got1 = 0;
+	unsigned long flags;
+	unsigned char int_pend;
 	int i;
 
-	for (asic = 0; asic < MAX_ASICS; ++asic) {
+	spin_lock_irqsave(&devpriv->asics[asic].spinlock, flags);
+
+	int_pend = inb(iobase + PCMUIO_INT_PENDING_REG) & 0x07;
+	if (int_pend) {
+		triggered = pcmuio_read(dev, asic, PCMUIO_PAGE_INT_ID, 0);
+		pcmuio_write(dev, 0, asic, PCMUIO_PAGE_INT_ID, 0);
+
+		++got1;
+	}
+
+	spin_unlock_irqrestore(&devpriv->asics[asic].spinlock, flags);
+
+	if (triggered) {
+		struct comedi_subdevice *s;
+		/* TODO here: dispatch io lines to subdevs with commands.. */
+		for (i = 0; i < dev->n_subdevices; i++) {
+			s = &dev->subdevices[i];
+			subpriv = s->private;
+			if (subpriv->intr.asic == asic) {
+				/*
+				 * This is an interrupt subdev, and it
+				 * matches this asic!
+				 */
+				pcmuio_handle_intr_subdev(dev, s,
+							  triggered);
+			}
+		}
+	}
+	return got1;
+}
+
+static irqreturn_t pcmuio_interrupt(int irq, void *d)
+{
+	struct comedi_device *dev = d;
+	struct pcmuio_private *devpriv = dev->private;
+	int got1 = 0;
+	int asic;
+
+	for (asic = 0; asic < PCMUIO_MAX_ASICS; ++asic) {
 		if (irq == devpriv->asics[asic].irq) {
-			unsigned long flags;
-			unsigned triggered = 0;
-			unsigned long iobase = devpriv->asics[asic].iobase;
 			/* it is an interrupt for ASIC #asic */
-			unsigned char int_pend;
-
-			spin_lock_irqsave(&devpriv->asics[asic].spinlock,
-					  flags);
-
-			int_pend = inb(iobase + REG_INT_PENDING) & 0x07;
-
-			if (int_pend) {
-				int port;
-				for (port = 0; port < INTR_PORTS_PER_ASIC;
-				     ++port) {
-					if (int_pend & (0x1 << port)) {
-						unsigned char
-						    io_lines_with_edges = 0;
-						switch_page(dev, asic,
-							    PAGE_INT_ID);
-						io_lines_with_edges =
-						    inb(iobase +
-							REG_INT_ID0 + port);
-
-						if (io_lines_with_edges)
-							/* clear pending interrupt */
-							outb(0, iobase +
-							     REG_INT_ID0 +
-							     port);
-
-						triggered |=
-						    io_lines_with_edges <<
-						    port * 8;
-					}
-				}
-
-				++got1;
-			}
-
-			spin_unlock_irqrestore(&devpriv->asics[asic].spinlock,
-					       flags);
-
-			if (triggered) {
-				struct comedi_subdevice *s;
-				/* TODO here: dispatch io lines to subdevs with commands.. */
-				printk
-				    ("PCMUIO DEBUG: got edge detect interrupt %d asic %d which_chans: %06x\n",
-				     irq, asic, triggered);
-				for (i = 0; i < dev->n_subdevices; i++) {
-					s = &dev->subdevices[i];
-					if (subpriv->intr.asic == asic) {	/* this is an interrupt subdev, and it matches this asic! */
-						unsigned long flags;
-						unsigned oldevents;
-
-						spin_lock_irqsave(&subpriv->
-								  intr.spinlock,
-								  flags);
-
-						oldevents = s->async->events;
-
-						if (subpriv->intr.active) {
-							unsigned mytrig =
-							    ((triggered >>
-							      subpriv->intr.asic_chan)
-							     &
-							     ((0x1 << subpriv->
-							       intr.
-							       num_asic_chans) -
-							      1)) << subpriv->
-							    intr.first_chan;
-							if (mytrig &
-							    subpriv->intr.enabled_mask)
-							{
-								unsigned int val
-								    = 0;
-								unsigned int n,
-								    ch, len;
-
-								len =
-								    s->
-								    async->cmd.chanlist_len;
-								for (n = 0;
-								     n < len;
-								     n++) {
-									ch = CR_CHAN(s->async->cmd.chanlist[n]);
-									if (mytrig & (1U << ch)) {
-										val |= (1U << n);
-									}
-								}
-								/* Write the scan to the buffer. */
-								if (comedi_buf_put(s->async, ((short *)&val)[0])
-								    &&
-								    comedi_buf_put
-								    (s->async,
-								     ((short *)
-								      &val)[1]))
-								{
-									s->async->events |= (COMEDI_CB_BLOCK | COMEDI_CB_EOS);
-								} else {
-									/* Overflow! Stop acquisition!! */
-									/* TODO: STOP_ACQUISITION_CALL_HERE!! */
-									pcmuio_stop_intr
-									    (dev,
-									     s);
-								}
-
-								/* Check for end of acquisition. */
-								if (!subpriv->intr.continuous) {
-									/* stop_src == TRIG_COUNT */
-									if (subpriv->intr.stop_count > 0) {
-										subpriv->intr.stop_count--;
-										if (subpriv->intr.stop_count == 0) {
-											s->async->events |= COMEDI_CB_EOA;
-											/* TODO: STOP_ACQUISITION_CALL_HERE!! */
-											pcmuio_stop_intr
-											    (dev,
-											     s);
-										}
-									}
-								}
-							}
-						}
-
-						spin_unlock_irqrestore
-						    (&subpriv->intr.spinlock,
-						     flags);
-
-						if (oldevents !=
-						    s->async->events) {
-							comedi_event(dev, s);
-						}
-
-					}
-
-				}
-			}
-
+			if (pcmuio_handle_asic_interrupt(dev, asic))
+				got1++;
 		}
 	}
 	if (!got1)
@@ -606,7 +415,7 @@
 static int pcmuio_start_intr(struct comedi_device *dev,
 			     struct comedi_subdevice *s)
 {
-	struct pcmuio_private *devpriv = dev->private;
+	struct pcmuio_subdev_private *subpriv = s->private;
 
 	if (!subpriv->intr.continuous && subpriv->intr.stop_count == 0) {
 		/* An empty acquisition! */
@@ -615,7 +424,7 @@
 		return 1;
 	} else {
 		unsigned bits = 0, pol_bits = 0, n;
-		int nports, firstport, asic, port;
+		int asic;
 		struct comedi_cmd *cmd = &s->async->cmd;
 
 		asic = subpriv->intr.asic;
@@ -624,8 +433,6 @@
 					   subdev */
 		subpriv->intr.enabled_mask = 0;
 		subpriv->intr.active = 1;
-		nports = subpriv->intr.num_asic_chans / CHANS_PER_PORT;
-		firstport = subpriv->intr.asic_chan / CHANS_PER_PORT;
 		if (cmd->chanlist) {
 			for (n = 0; n < cmd->chanlist_len; n++) {
 				bits |= (1U << CR_CHAN(cmd->chanlist[n]));
@@ -635,31 +442,19 @@
 				    << CR_CHAN(cmd->chanlist[n]);
 			}
 		}
-		bits &= ((0x1 << subpriv->intr.num_asic_chans) -
-			 1) << subpriv->intr.first_chan;
+		bits &= ((0x1 << s->n_chan) - 1);
 		subpriv->intr.enabled_mask = bits;
 
-		switch_page(dev, asic, PAGE_ENAB);
-		for (port = firstport; port < firstport + nports; ++port) {
-			unsigned enab =
-			    bits >> (subpriv->intr.first_chan + (port -
-								 firstport) *
-				     8) & 0xff, pol =
-			    pol_bits >> (subpriv->intr.first_chan +
-					 (port - firstport) * 8) & 0xff;
-			/* set enab intrs for this subdev.. */
-			outb(enab,
-			     devpriv->asics[asic].iobase + REG_ENAB0 + port);
-			switch_page(dev, asic, PAGE_POL);
-			outb(pol,
-			     devpriv->asics[asic].iobase + REG_ENAB0 + port);
-		}
+		/* set pol and enab intrs for this subdev.. */
+		pcmuio_write(dev, pol_bits, asic, PCMUIO_PAGE_POL, 0);
+		pcmuio_write(dev, bits, asic, PCMUIO_PAGE_ENAB, 0);
 	}
 	return 0;
 }
 
 static int pcmuio_cancel(struct comedi_device *dev, struct comedi_subdevice *s)
 {
+	struct pcmuio_subdev_private *subpriv = s->private;
 	unsigned long flags;
 
 	spin_lock_irqsave(&subpriv->intr.spinlock, flags);
@@ -677,6 +472,7 @@
 pcmuio_inttrig_start_intr(struct comedi_device *dev, struct comedi_subdevice *s,
 			  unsigned int trignum)
 {
+	struct pcmuio_subdev_private *subpriv = s->private;
 	unsigned long flags;
 	int event = 0;
 
@@ -701,6 +497,7 @@
  */
 static int pcmuio_cmd(struct comedi_device *dev, struct comedi_subdevice *s)
 {
+	struct pcmuio_subdev_private *subpriv = s->private;
 	struct comedi_cmd *cmd = &s->async->cmd;
 	unsigned long flags;
 	int event = 0;
@@ -797,17 +594,18 @@
 static int pcmuio_attach(struct comedi_device *dev, struct comedi_devconfig *it)
 {
 	const struct pcmuio_board *board = comedi_board(dev);
-	struct pcmuio_private *devpriv;
 	struct comedi_subdevice *s;
-	int sdev_no, chans_left, n_subdevs, port, asic, thisasic_chanct = 0;
-	unsigned int irq[MAX_ASICS];
+	struct pcmuio_private *devpriv;
+	struct pcmuio_subdev_private *subpriv;
+	int sdev_no, n_subdevs, asic;
+	unsigned int irq[PCMUIO_MAX_ASICS];
 	int ret;
 
 	irq[0] = it->options[1];
 	irq[1] = it->options[2];
 
 	ret = comedi_request_region(dev, it->options[0],
-				    board->num_asics * ASIC_IOSIZE);
+				    board->num_asics * PCMUIO_ASIC_IOSIZE);
 	if (ret)
 		return ret;
 
@@ -816,20 +614,11 @@
 		return -ENOMEM;
 	dev->private = devpriv;
 
-	for (asic = 0; asic < MAX_ASICS; ++asic) {
-		devpriv->asics[asic].num = asic;
-		devpriv->asics[asic].iobase = dev->iobase + asic * ASIC_IOSIZE;
-		devpriv->asics[asic].irq = 0;	/* this gets actually set at the end of
-						   this function when we
-						   request_irqs */
+	for (asic = 0; asic < PCMUIO_MAX_ASICS; ++asic)
 		spin_lock_init(&devpriv->asics[asic].spinlock);
-	}
 
-	chans_left = CHANS_PER_ASIC * board->num_asics;
-	n_subdevs = CALC_N_SUBDEVS(chans_left);
-	devpriv->sprivs = kcalloc(n_subdevs,
-				  sizeof(struct pcmuio_subdev_private),
-				  GFP_KERNEL);
+	n_subdevs = board->num_asics * 2;
+	devpriv->sprivs = kcalloc(n_subdevs, sizeof(*subpriv), GFP_KERNEL);
 	if (!devpriv->sprivs)
 		return -ENOMEM;
 
@@ -837,74 +626,40 @@
 	if (ret)
 		return ret;
 
-	port = 0;
-	asic = 0;
 	for (sdev_no = 0; sdev_no < (int)dev->n_subdevices; ++sdev_no) {
-		int byte_no;
-
 		s = &dev->subdevices[sdev_no];
-		s->private = &devpriv->sprivs[sdev_no];
+		subpriv = &devpriv->sprivs[sdev_no];
+		s->private = subpriv;
 		s->maxdata = 1;
 		s->range_table = &range_digital;
 		s->subdev_flags = SDF_READABLE | SDF_WRITABLE;
 		s->type = COMEDI_SUBD_DIO;
 		s->insn_bits = pcmuio_dio_insn_bits;
 		s->insn_config = pcmuio_dio_insn_config;
-		s->n_chan = min(chans_left, MAX_CHANS_PER_SUBDEV);
-		subpriv->intr.asic = -1;
-		subpriv->intr.first_chan = -1;
-		subpriv->intr.asic_chan = -1;
-		subpriv->intr.num_asic_chans = -1;
-		subpriv->intr.active = 0;
-		s->len_chanlist = 1;
+		s->n_chan = 24;
 
-		/* save the ioport address for each 'port' of 8 channels in the
-		   subdevice */
-		for (byte_no = 0; byte_no < PORTS_PER_SUBDEV; ++byte_no, ++port) {
-			if (port >= PORTS_PER_ASIC) {
-				port = 0;
-				++asic;
-				thisasic_chanct = 0;
-			}
-			subpriv->iobases[byte_no] =
-			    devpriv->asics[asic].iobase + port;
-
-			if (thisasic_chanct <
-			    CHANS_PER_PORT * INTR_PORTS_PER_ASIC
-			    && subpriv->intr.asic < 0) {
-				/* this is an interrupt subdevice, so setup the struct */
-				subpriv->intr.asic = asic;
-				subpriv->intr.active = 0;
-				subpriv->intr.stop_count = 0;
-				subpriv->intr.first_chan = byte_no * 8;
-				subpriv->intr.asic_chan = thisasic_chanct;
-				subpriv->intr.num_asic_chans =
-				    s->n_chan - subpriv->intr.first_chan;
-				dev->read_subdev = s;
-				s->subdev_flags |= SDF_CMD_READ;
-				s->cancel = pcmuio_cancel;
-				s->do_cmd = pcmuio_cmd;
-				s->do_cmdtest = pcmuio_cmdtest;
-				s->len_chanlist = subpriv->intr.num_asic_chans;
-			}
-			thisasic_chanct += CHANS_PER_PORT;
+		/* subdevices 0 and 2 support interrupts */
+		if ((sdev_no % 2) == 0) {
+			/* setup the interrupt subdevice */
+			subpriv->intr.asic = sdev_no / 2;
+			dev->read_subdev = s;
+			s->subdev_flags |= SDF_CMD_READ;
+			s->cancel = pcmuio_cancel;
+			s->do_cmd = pcmuio_cmd;
+			s->do_cmdtest = pcmuio_cmdtest;
+			s->len_chanlist = s->n_chan;
+		} else {
+			subpriv->intr.asic = -1;
+			s->len_chanlist = 1;
 		}
 		spin_lock_init(&subpriv->intr.spinlock);
-
-		chans_left -= s->n_chan;
-
-		if (!chans_left) {
-			asic = 0;	/* reset the asic to our first asic, to do intr subdevs */
-			port = 0;
-		}
-
 	}
 
-	init_asics(dev);	/* clear out all the registers, basically */
+	pcmuio_reset(dev);
 
-	for (asic = 0; irq[0] && asic < MAX_ASICS; ++asic) {
+	for (asic = 0; irq[0] && asic < PCMUIO_MAX_ASICS; ++asic) {
 		if (irq[asic]
-		    && request_irq(irq[asic], interrupt_pcmuio,
+		    && request_irq(irq[asic], pcmuio_interrupt,
 				   IRQF_SHARED, board->name, dev)) {
 			int i;
 			/* unroll the allocated irqs.. */
@@ -917,17 +672,7 @@
 		devpriv->asics[asic].irq = irq[asic];
 	}
 
-	if (irq[0]) {
-		dev_dbg(dev->class_dev, "irq: %u\n", irq[0]);
-		if (irq[1] && board->num_asics == 2)
-			dev_dbg(dev->class_dev, "second ASIC irq: %u\n",
-				irq[1]);
-	} else {
-		dev_dbg(dev->class_dev, "(IRQ mode disabled)\n");
-	}
-
-
-	return 1;
+	return 0;
 }
 
 static void pcmuio_detach(struct comedi_device *dev)
@@ -935,7 +680,7 @@
 	struct pcmuio_private *devpriv = dev->private;
 	int i;
 
-	for (i = 0; i < MAX_ASICS; ++i) {
+	for (i = 0; i < PCMUIO_MAX_ASICS; ++i) {
 		if (devpriv->asics[i].irq)
 			free_irq(devpriv->asics[i].irq, dev);
 	}
@@ -944,18 +689,6 @@
 	comedi_legacy_detach(dev);
 }
 
-static const struct pcmuio_board pcmuio_boards[] = {
-	{
-		.name		= "pcmuio48",
-		.num_asics	= 1,
-		.num_ports	= 6,
-	}, {
-		.name		= "pcmuio96",
-		.num_asics	= 2,
-		.num_ports	= 12,
-	},
-};
-
 static struct comedi_driver pcmuio_driver = {
 	.driver_name	= "pcmuio",
 	.module		= THIS_MODULE,

diff --git a/drivers/staging/comedi/drivers/plx9052.h b/drivers/staging/comedi/drivers/plx9052.h
index ff76fbb..fbcf250 100644
--- a/drivers/staging/comedi/drivers/plx9052.h
+++ b/drivers/staging/comedi/drivers/plx9052.h

@@ -16,11 +16,6 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 
 #ifndef _PLX9052_H_

diff --git a/drivers/staging/comedi/drivers/poc.c b/drivers/staging/comedi/drivers/poc.c
index b55a16b..005fbef 100644
--- a/drivers/staging/comedi/drivers/poc.c
+++ b/drivers/staging/comedi/drivers/poc.c

@@ -13,25 +13,18 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
 /*
 Driver: poc
 Description: Generic driver for very simple devices
 Author: ds
-Devices: [Keithley Metrabyte] DAC-02 (dac02), [Advantech] PCL-733 (pcl733),
-  PCL-734 (pcl734)
+Devices: [Keithley Metrabyte] DAC-02 (dac02)
 Updated: Sat, 16 Mar 2002 17:34:48 -0800
 Status: unknown
 
 This driver is indended to support very simple ISA-based devices,
 including:
   dac02 - Keithley DAC-02 analog output board
-  pcl733 - Advantech PCL-733
-  pcl734 - Advantech PCL-734
 
 Configuration options:
   [0] - I/O port base
@@ -101,39 +94,6 @@
 	return 1;
 }
 
-static int pcl733_insn_bits(struct comedi_device *dev,
-			    struct comedi_subdevice *s,
-			    struct comedi_insn *insn, unsigned int *data)
-{
-	data[1] = inb(dev->iobase + 0);
-	data[1] |= (inb(dev->iobase + 1) << 8);
-	data[1] |= (inb(dev->iobase + 2) << 16);
-	data[1] |= (inb(dev->iobase + 3) << 24);
-
-	return insn->n;
-}
-
-static int pcl734_insn_bits(struct comedi_device *dev,
-			    struct comedi_subdevice *s,
-			    struct comedi_insn *insn, unsigned int *data)
-{
-	if (data[0]) {
-		s->state &= ~data[0];
-		s->state |= (data[0] & data[1]);
-		if ((data[0] >> 0) & 0xff)
-			outb((s->state >> 0) & 0xff, dev->iobase + 0);
-		if ((data[0] >> 8) & 0xff)
-			outb((s->state >> 8) & 0xff, dev->iobase + 1);
-		if ((data[0] >> 16) & 0xff)
-			outb((s->state >> 16) & 0xff, dev->iobase + 2);
-		if ((data[0] >> 24) & 0xff)
-			outb((s->state >> 24) & 0xff, dev->iobase + 3);
-	}
-	data[1] = s->state;
-
-	return insn->n;
-}
-
 static int poc_attach(struct comedi_device *dev, struct comedi_devconfig *it)
 {
 	const struct boarddef_struct *board = comedi_board(dev);
@@ -180,22 +140,6 @@
 		.winsn		= dac02_ao_winsn,
 		.rinsn		= readback_insn,
 		.range		= &range_unknown,
-	}, {
-		.name		= "pcl733",
-		.iosize		= 4,
-		.type		= COMEDI_SUBD_DI,
-		.n_chan		= 32,
-		.n_bits		= 1,
-		.insnbits	= pcl733_insn_bits,
-		.range		= &range_digital,
-	}, {
-		.name		= "pcl734",
-		.iosize		= 4,
-		.type		= COMEDI_SUBD_DO,
-		.n_chan		= 32,
-		.n_bits		= 1,
-		.insnbits	= pcl734_insn_bits,
-		.range		= &range_digital,
 	},
 };
 

diff --git a/drivers/staging/comedi/drivers/rtd520.c b/drivers/staging/comedi/drivers/rtd520.c
index 30a1728..9b93a1f 100644
--- a/drivers/staging/comedi/drivers/rtd520.c
+++ b/drivers/staging/comedi/drivers/rtd520.c

@@ -14,10 +14,6 @@
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
 /*

diff --git a/drivers/staging/comedi/drivers/rti800.c b/drivers/staging/comedi/drivers/rti800.c
index f4163fd..f698c7f 100644
--- a/drivers/staging/comedi/drivers/rti800.c
+++ b/drivers/staging/comedi/drivers/rti800.c

@@ -14,10 +14,6 @@
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
 /*

diff --git a/drivers/staging/comedi/drivers/rti802.c b/drivers/staging/comedi/drivers/rti802.c
index 46dbbe6..9e74450 100644
--- a/drivers/staging/comedi/drivers/rti802.c
+++ b/drivers/staging/comedi/drivers/rti802.c

@@ -14,11 +14,6 @@
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
  */
 /*
 Driver: rti802

diff --git a/drivers/staging/comedi/drivers/s526.c b/drivers/staging/comedi/drivers/s526.c
index d240ce8..e1587e5 100644
--- a/drivers/staging/comedi/drivers/s526.c
+++ b/drivers/staging/comedi/drivers/s526.c

@@ -14,11 +14,6 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 /*
 Driver: s526

diff --git a/drivers/staging/comedi/drivers/s626.c b/drivers/staging/comedi/drivers/s626.c
index 0cf4b3d..48c4b70 100644
--- a/drivers/staging/comedi/drivers/s626.c
+++ b/drivers/staging/comedi/drivers/s626.c

@@ -17,11 +17,6 @@
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program; if not, write to the Free Software
-  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 
 /*

diff --git a/drivers/staging/comedi/drivers/s626.h b/drivers/staging/comedi/drivers/s626.h
index 99cd57b..d2756b83 100644
--- a/drivers/staging/comedi/drivers/s626.h
+++ b/drivers/staging/comedi/drivers/s626.h

@@ -17,11 +17,6 @@
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program; if not, write to the Free Software
-  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 
 /*

diff --git a/drivers/staging/comedi/drivers/serial2002.c b/drivers/staging/comedi/drivers/serial2002.c
index 8900086..b4f5fe3 100644
--- a/drivers/staging/comedi/drivers/serial2002.c
+++ b/drivers/staging/comedi/drivers/serial2002.c

@@ -14,11 +14,6 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 
 /*

diff --git a/drivers/staging/comedi/drivers/skel.c b/drivers/staging/comedi/drivers/skel.c
index dbc8c54d..06aee30 100644
--- a/drivers/staging/comedi/drivers/skel.c
+++ b/drivers/staging/comedi/drivers/skel.c

@@ -14,11 +14,6 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 /*
 Driver: skel

diff --git a/drivers/staging/comedi/drivers/ssv_dnp.c b/drivers/staging/comedi/drivers/ssv_dnp.c
index a76df09..45c661c 100644
--- a/drivers/staging/comedi/drivers/ssv_dnp.c
+++ b/drivers/staging/comedi/drivers/ssv_dnp.c

@@ -15,11 +15,6 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 /*
 Driver: ssv_dnp

diff --git a/drivers/staging/comedi/drivers/unioxx5.c b/drivers/staging/comedi/drivers/unioxx5.c
index 0c243477..c9201d8 100644
--- a/drivers/staging/comedi/drivers/unioxx5.c
+++ b/drivers/staging/comedi/drivers/unioxx5.c

@@ -18,10 +18,6 @@
  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          *
  *  GNU General Public License for more details.                           *
  *                                                                         *
- *  You should have received a copy of the GNU General Public License      *
- *  along with this program; if not, write to the Free Software            *
- *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.              *
- *                                                                         *
  ***************************************************************************/
 /*
 
@@ -375,15 +371,13 @@
 	int i, to, ndef_flag = 0;
 	int ret;
 
-	usp = kzalloc(sizeof(*usp), GFP_KERNEL);
-	if (usp == NULL)
+	usp = comedi_alloc_spriv(s, sizeof(*usp));
+	if (!usp)
 		return -ENOMEM;
 
 	ret = __comedi_request_region(dev, iobase, UNIOXX5_SIZE);
-	if (ret) {
-		kfree(usp);
+	if (ret)
 		return ret;
-	}
 	usp->usp_iobase = iobase;
 
 	/* defining modules types */
@@ -417,7 +411,6 @@
 
 	/* initial subdevice for digital or analog i/o */
 	s->type = COMEDI_SUBD_DIO;
-	s->private = usp;
 	s->subdev_flags = SDF_READABLE | SDF_WRITABLE;
 	s->n_chan = UNIOXX5_NUM_OF_CHANS;
 	s->maxdata = 0xFFF;
@@ -478,15 +471,15 @@
 
 static void unioxx5_detach(struct comedi_device *dev)
 {
+	struct comedi_subdevice *s;
+	struct unioxx5_subd_priv *spriv;
 	int i;
-	struct comedi_subdevice *subdev;
-	struct unioxx5_subd_priv *usp;
 
 	for (i = 0; i < dev->n_subdevices; i++) {
-		subdev = &dev->subdevices[i];
-		usp = subdev->private;
-		release_region(usp->usp_iobase, UNIOXX5_SIZE);
-		kfree(subdev->private);
+		s = &dev->subdevices[i];
+		spriv = s->private;
+		if (spriv && spriv->usp_iobase)
+			release_region(spriv->usp_iobase, UNIOXX5_SIZE);
 	}
 }
 

diff --git a/drivers/staging/comedi/drivers/usbdux.c b/drivers/staging/comedi/drivers/usbdux.c
index 6f5da67..279e5bd 100644
--- a/drivers/staging/comedi/drivers/usbdux.c
+++ b/drivers/staging/comedi/drivers/usbdux.c

@@ -11,11 +11,6 @@
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
  */
 /*
 Driver: usbdux
@@ -94,7 +89,6 @@
 #include <linux/usb.h>
 #include <linux/fcntl.h>
 #include <linux/compiler.h>
-#include <linux/firmware.h>
 
 #include "../comedidev.h"
 
@@ -727,154 +721,82 @@
 	}
 }
 
-static int usbduxsub_start(struct usbduxsub *usbduxsub)
-{
-	int errcode = 0;
-	uint8_t *local_transfer_buffer;
-
-	local_transfer_buffer = kmalloc(1, GFP_KERNEL);
-	if (!local_transfer_buffer)
-		return -ENOMEM;
-
-	/* 7f92 to zero */
-	*local_transfer_buffer = 0;
-	errcode = usb_control_msg(usbduxsub->usbdev,
-				  /* create a pipe for a control transfer */
-				  usb_sndctrlpipe(usbduxsub->usbdev, 0),
-				  /* bRequest, "Firmware" */
-				  USBDUXSUB_FIRMWARE,
-				  /* bmRequestType */
-				  VENDOR_DIR_OUT,
-				  /* Value */
-				  USBDUXSUB_CPUCS,
-				  /* Index */
-				  0x0000,
-				  /* address of the transfer buffer */
-				  local_transfer_buffer,
-				  /* Length */
-				  1,
-				  /* Timeout */
-				  BULK_TIMEOUT);
-	if (errcode < 0)
-		dev_err(&usbduxsub->interface->dev,
-			"comedi_: control msg failed (start)\n");
-
-	kfree(local_transfer_buffer);
-	return errcode;
-}
-
-static int usbduxsub_stop(struct usbduxsub *usbduxsub)
-{
-	int errcode = 0;
-	uint8_t *local_transfer_buffer;
-
-	local_transfer_buffer = kmalloc(1, GFP_KERNEL);
-	if (!local_transfer_buffer)
-		return -ENOMEM;
-
-	/* 7f92 to one */
-	*local_transfer_buffer = 1;
-	errcode = usb_control_msg(usbduxsub->usbdev,
-				  usb_sndctrlpipe(usbduxsub->usbdev, 0),
-				  /* bRequest, "Firmware" */
-				  USBDUXSUB_FIRMWARE,
-				  /* bmRequestType */
-				  VENDOR_DIR_OUT,
-				  /* Value */
-				  USBDUXSUB_CPUCS,
-				  /* Index */
-				  0x0000, local_transfer_buffer,
-				  /* Length */
-				  1,
-				  /* Timeout */
-				  BULK_TIMEOUT);
-	if (errcode < 0)
-		dev_err(&usbduxsub->interface->dev,
-			"comedi_: control msg failed (stop)\n");
-
-	kfree(local_transfer_buffer);
-	return errcode;
-}
-
-static int usbduxsub_upload(struct usbduxsub *usbduxsub,
-			    uint8_t *local_transfer_buffer,
-			    unsigned int start_addr, unsigned int len)
-{
-	int errcode;
-
-	errcode = usb_control_msg(usbduxsub->usbdev,
-				  usb_sndctrlpipe(usbduxsub->usbdev, 0),
-				  /* brequest, firmware */
-				  USBDUXSUB_FIRMWARE,
-				  /* bmRequestType */
-				  VENDOR_DIR_OUT,
-				  /* value */
-				  start_addr,
-				  /* index */
-				  0x0000,
-				  /* our local safe buffer */
-				  local_transfer_buffer,
-				  /* length */
-				  len,
-				  /* timeout */
-				  BULK_TIMEOUT);
-	dev_dbg(&usbduxsub->interface->dev, "comedi_: result=%d\n", errcode);
-	if (errcode < 0) {
-		dev_err(&usbduxsub->interface->dev, "comedi_: upload failed\n");
-		return errcode;
-	}
-	return 0;
-}
-
 #define FIRMWARE_MAX_LEN 0x2000
 
-static int firmware_upload(struct usbduxsub *usbduxsub,
-			  const u8 *firmware_binary, int size_firmware)
+static int usbdux_firmware_upload(struct comedi_device *dev,
+				  const u8 *data, size_t size,
+				  unsigned long context)
 {
+	struct usbduxsub *usbduxsub = dev->private;
+	struct usb_device *usb = usbduxsub->usbdev;
+	uint8_t *buf;
+	uint8_t *tmp;
 	int ret;
-	uint8_t *fw_buf;
 
-	if (!firmware_binary)
+	if (!data)
 		return 0;
 
-	if (size_firmware > FIRMWARE_MAX_LEN) {
+	if (size > FIRMWARE_MAX_LEN) {
 		dev_err(&usbduxsub->interface->dev,
 			"usbdux firmware binary it too large for FX2.\n");
 		return -ENOMEM;
 	}
 
 	/* we generate a local buffer for the firmware */
-	fw_buf = kmemdup(firmware_binary, size_firmware, GFP_KERNEL);
-	if (!fw_buf) {
-		dev_err(&usbduxsub->interface->dev,
-			"comedi_: mem alloc for firmware failed\n");
+	buf = kmemdup(data, size, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	/* we need a malloc'ed buffer for usb_control_msg() */
+	tmp = kmalloc(1, GFP_KERNEL);
+	if (!tmp) {
+		kfree(buf);
 		return -ENOMEM;
 	}
 
-	ret = usbduxsub_stop(usbduxsub);
+	/* stop the current firmware on the device */
+	*tmp = 1;	/* 7f92 to one */
+	ret = usb_control_msg(usb, usb_sndctrlpipe(usb, 0),
+			      USBDUXSUB_FIRMWARE,
+			      VENDOR_DIR_OUT,
+			      USBDUXSUB_CPUCS, 0x0000,
+			      tmp, 1,
+			      BULK_TIMEOUT);
 	if (ret < 0) {
 		dev_err(&usbduxsub->interface->dev,
 			"comedi_: can not stop firmware\n");
-		kfree(fw_buf);
-		return ret;
+		goto done;
 	}
 
-	ret = usbduxsub_upload(usbduxsub, fw_buf, 0, size_firmware);
+	/* upload the new firmware to the device */
+	ret = usb_control_msg(usb, usb_sndctrlpipe(usb, 0),
+			      USBDUXSUB_FIRMWARE,
+			      VENDOR_DIR_OUT,
+			      0, 0x0000,
+			      buf, size,
+			      BULK_TIMEOUT);
 	if (ret < 0) {
 		dev_err(&usbduxsub->interface->dev,
 			"comedi_: firmware upload failed\n");
-		kfree(fw_buf);
-		return ret;
+		goto done;
 	}
-	ret = usbduxsub_start(usbduxsub);
-	if (ret < 0) {
+
+	/* start the new firmware on the device */
+	*tmp = 0;	/* 7f92 to zero */
+	ret = usb_control_msg(usb, usb_sndctrlpipe(usb, 0),
+			      USBDUXSUB_FIRMWARE,
+			      VENDOR_DIR_OUT,
+			      USBDUXSUB_CPUCS, 0x0000,
+			      tmp, 1,
+			      BULK_TIMEOUT);
+	if (ret < 0)
 		dev_err(&usbduxsub->interface->dev,
 			"comedi_: can not start firmware\n");
-		kfree(fw_buf);
-		return ret;
-	}
-	kfree(fw_buf);
-	return 0;
+
+done:
+	kfree(tmp);
+	kfree(buf);
+	return ret;
 }
 
 static int usbduxsub_submit_inurbs(struct usbduxsub *usbduxsub)
@@ -2328,13 +2250,21 @@
 			      unsigned long context_unused)
 {
 	struct usb_interface *uinterf = comedi_to_usb_interface(dev);
+	struct usbduxsub *this_usbduxsub = usb_get_intfdata(uinterf);
+	struct usb_device *usb = usbduxsub->usbdev;
 	int ret;
-	struct usbduxsub *this_usbduxsub;
+
+	dev->private = this_usbduxsub;	/* This is temporary... */
+	ret = comedi_load_firmware(dev, &usb->dev, FIRMWARE,
+				   usbdux_firmware_upload, 0);
+	if (ret < 0) {
+		dev->private = NULL;
+		return ret;
+	}
 
 	dev->private = NULL;
 
 	down(&start_stop_sem);
-	this_usbduxsub = usb_get_intfdata(uinterf);
 	if (!this_usbduxsub || !this_usbduxsub->probed) {
 		dev_err(dev->class_dev,
 			"usbdux: error: auto_attach failed, not connected\n");
@@ -2369,35 +2299,6 @@
 	.detach		= usbdux_detach,
 };
 
-static void usbdux_firmware_request_complete_handler(const struct firmware *fw,
-						     void *context)
-{
-	struct usbduxsub *usbduxsub_tmp = context;
-	struct usb_interface *uinterf = usbduxsub_tmp->interface;
-	int ret;
-
-	if (fw == NULL) {
-		dev_err(&uinterf->dev,
-			"Firmware complete handler without firmware!\n");
-		return;
-	}
-
-	/*
-	 * we need to upload the firmware here because fw will be
-	 * freed once we've left this function
-	 */
-	ret = firmware_upload(usbduxsub_tmp, fw->data, fw->size);
-
-	if (ret) {
-		dev_err(&uinterf->dev,
-			"Could not upload firmware (err=%d)\n", ret);
-		goto out;
-	}
-	comedi_usb_auto_config(uinterf, &usbdux_driver, 0);
- out:
-	release_firmware(fw);
-}
-
 static int usbdux_usb_probe(struct usb_interface *uinterf,
 			    const struct usb_device_id *id)
 {
@@ -2405,7 +2306,6 @@
 	struct device *dev = &uinterf->dev;
 	int i;
 	int index;
-	int ret;
 
 	dev_dbg(dev, "comedi_: usbdux_: "
 		"finding a free structure for the usb-device\n");
@@ -2622,23 +2522,7 @@
 	usbduxsub[index].probed = 1;
 	up(&start_stop_sem);
 
-	ret = request_firmware_nowait(THIS_MODULE,
-				      FW_ACTION_HOTPLUG,
-				      FIRMWARE,
-				      &udev->dev,
-				      GFP_KERNEL,
-				      usbduxsub + index,
-				      usbdux_firmware_request_complete_handler);
-
-	if (ret) {
-		dev_err(dev, "Could not load firmware (err=%d)\n", ret);
-		return ret;
-	}
-
-	dev_info(dev, "comedi_: usbdux%d "
-		 "has been successfully initialised.\n", index);
-	/* success */
-	return 0;
+	return comedi_usb_auto_config(uinterf, &usbdux_driver, 0);
 }
 
 static void usbdux_usb_disconnect(struct usb_interface *intf)

diff --git a/drivers/staging/comedi/drivers/usbduxfast.c b/drivers/staging/comedi/drivers/usbduxfast.c
index 7f95af3..27898c4 100644
--- a/drivers/staging/comedi/drivers/usbduxfast.c
+++ b/drivers/staging/comedi/drivers/usbduxfast.c

@@ -10,10 +10,6 @@
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
 /*
@@ -40,7 +36,6 @@
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
 #include <linux/kernel.h>
-#include <linux/firmware.h>
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/slab.h>
@@ -60,6 +55,7 @@
  * constants for "firmware" upload and download
  */
 #define FIRMWARE		"usbduxfast_firmware.bin"
+#define FIRMWARE_MAX_LEN	0x2000
 #define USBDUXFASTSUB_FIRMWARE	0xA0
 #define VENDOR_DIR_IN		0xC0
 #define VENDOR_DIR_OUT		0x40
@@ -112,7 +108,7 @@
 /*
  * size of the buffer for the dux commands in bytes
  */
-#define SIZEOFDUXBUFFER	256
+#define SIZEOFDUXBUF	256
 
 /*
  * number of in-URBs which receive the data: min=5
@@ -120,16 +116,6 @@
 #define NUMOFINBUFFERSHIGH	10
 
 /*
- * total number of usbduxfast devices
- */
-#define NUMUSBDUXFAST	16
-
-/*
- * analogue in subdevice
- */
-#define SUBDEV_AD	0
-
-/*
  * min delay steps for more than one channel
  * basically when the mux gives up ;-)
  *
@@ -161,143 +147,83 @@
  * this is the structure which holds all the data of this driver
  * one sub device just now: A/D
  */
-struct usbduxfastsub_s {
-	int attached;		/* is attached? */
-	int probed;		/* is it associated with a subdevice? */
-	struct usb_device *usbdev;	/* pointer to the usb-device */
-	struct urb *urbIn;	/* BULK-transfer handling: urb */
-	int8_t *transfer_buffer;
-	int16_t *insnBuffer;	/* input buffer for single insn */
-	int ifnum;		/* interface number */
-	struct usb_interface *interface;	/* interface structure */
-	/* comedi device for the interrupt context */
-	struct comedi_device *comedidev;
+struct usbduxfast_private {
+	struct urb *urb;	/* BULK-transfer handling: urb */
+	uint8_t *duxbuf;
+	int8_t *inbuf;
 	short int ai_cmd_running;	/* asynchronous command is running */
 	short int ai_continous;	/* continous acquisition */
 	long int ai_sample_count;	/* number of samples to acquire */
-	uint8_t *dux_commands;	/* commands */
 	int ignore;		/* counter which ignores the first
 				   buffers */
 	struct semaphore sem;
 };
 
 /*
- * The pointer to the private usb-data of the driver
- * is also the private data for the comedi-device.
- * This has to be global as the usb subsystem needs
- * global variables. The other reason is that this
- * structure must be there _before_ any comedi
- * command is issued. The usb subsystem must be
- * initialised before comedi can access it.
- */
-static struct usbduxfastsub_s usbduxfastsub[NUMUSBDUXFAST];
-
-static DEFINE_SEMAPHORE(start_stop_sem);
-
-/*
  * bulk transfers to usbduxfast
  */
 #define SENDADCOMMANDS            0
 #define SENDINITEP6               1
 
-static int send_dux_commands(struct usbduxfastsub_s *udfs, int cmd_type)
+static int usbduxfast_send_cmd(struct comedi_device *dev, int cmd_type)
 {
-	int tmp, nsent;
+	struct usb_device *usb = comedi_to_usb_dev(dev);
+	struct usbduxfast_private *devpriv = dev->private;
+	int nsent;
+	int ret;
 
-	udfs->dux_commands[0] = cmd_type;
+	devpriv->duxbuf[0] = cmd_type;
 
-#ifdef CONFIG_COMEDI_DEBUG
-	printk(KERN_DEBUG "comedi%d: usbduxfast: dux_commands: ",
-	       udfs->comedidev->minor);
-	for (tmp = 0; tmp < SIZEOFDUXBUFFER; tmp++)
-		printk(" %02x", udfs->dux_commands[tmp]);
-	printk("\n");
-#endif
-
-	tmp = usb_bulk_msg(udfs->usbdev,
-			   usb_sndbulkpipe(udfs->usbdev, CHANNELLISTEP),
-			   udfs->dux_commands, SIZEOFDUXBUFFER, &nsent, 10000);
-	if (tmp < 0)
-		dev_err(&udfs->interface->dev,
-			"could not transmit dux_commands to the usb-device, err=%d\n",
-			tmp);
-	return tmp;
-}
-
-/*
- * Stops the data acquision.
- * It should be safe to call this function from any context.
- */
-static int usbduxfastsub_unlink_InURBs(struct usbduxfastsub_s *udfs)
-{
-	int j = 0;
-	int err = 0;
-
-	if (udfs && udfs->urbIn) {
-		udfs->ai_cmd_running = 0;
-		/* waits until a running transfer is over */
-		usb_kill_urb(udfs->urbIn);
-		j = 0;
-	}
-#ifdef CONFIG_COMEDI_DEBUG
-	printk(KERN_DEBUG "comedi: usbduxfast: unlinked InURB: res=%d\n", j);
-#endif
-	return err;
-}
-
-/*
- * This will stop a running acquisition operation.
- * Is called from within this driver from both the
- * interrupt context and from comedi.
- */
-static int usbduxfast_ai_stop(struct usbduxfastsub_s *udfs, int do_unlink)
-{
-	int ret = 0;
-
-	if (!udfs) {
-		pr_err("%s: udfs=NULL!\n", __func__);
-		return -EFAULT;
-	}
-#ifdef CONFIG_COMEDI_DEBUG
-	printk(KERN_DEBUG "comedi: usbduxfast_ai_stop\n");
-#endif
-
-	udfs->ai_cmd_running = 0;
-
-	if (do_unlink)
-		/* stop aquistion */
-		ret = usbduxfastsub_unlink_InURBs(udfs);
-
+	ret = usb_bulk_msg(usb, usb_sndbulkpipe(usb, CHANNELLISTEP),
+			   devpriv->duxbuf, SIZEOFDUXBUF,
+			   &nsent, 10000);
+	if (ret < 0)
+		dev_err(dev->class_dev,
+			"could not transmit command to the usb-device, err=%d\n",
+			ret);
 	return ret;
 }
 
-/*
- * This will cancel a running acquisition operation.
- * This is called by comedi but never from inside the driver.
- */
+static void usbduxfast_cmd_data(struct comedi_device *dev, int index,
+				uint8_t len, uint8_t op, uint8_t out,
+				uint8_t log)
+{
+	struct usbduxfast_private *devpriv = dev->private;
+
+	/* Set the GPIF bytes, the first byte is the command byte */
+	devpriv->duxbuf[1 + 0x00 + index] = len;
+	devpriv->duxbuf[1 + 0x08 + index] = op;
+	devpriv->duxbuf[1 + 0x10 + index] = out;
+	devpriv->duxbuf[1 + 0x18 + index] = log;
+}
+
+static int usbduxfast_ai_stop(struct comedi_device *dev, int do_unlink)
+{
+	struct usbduxfast_private *devpriv = dev->private;
+
+	/* stop aquistion */
+	devpriv->ai_cmd_running = 0;
+
+	if (do_unlink && devpriv->urb) {
+		/* kill the running transfer */
+		usb_kill_urb(devpriv->urb);
+	}
+
+	return 0;
+}
+
 static int usbduxfast_ai_cancel(struct comedi_device *dev,
 				struct comedi_subdevice *s)
 {
-	struct usbduxfastsub_s *udfs;
+	struct usbduxfast_private *devpriv = dev->private;
 	int ret;
 
-	/* force unlink of all urbs */
-#ifdef CONFIG_COMEDI_DEBUG
-	printk(KERN_DEBUG "comedi: usbduxfast_ai_cancel\n");
-#endif
-	udfs = dev->private;
-	if (!udfs) {
-		dev_err(dev->class_dev, "%s: udfs=NULL\n", __func__);
+	if (!devpriv)
 		return -EFAULT;
-	}
-	down(&udfs->sem);
-	if (!udfs->probed) {
-		up(&udfs->sem);
-		return -ENODEV;
-	}
-	/* unlink */
-	ret = usbduxfast_ai_stop(udfs, 1);
-	up(&udfs->sem);
+
+	down(&devpriv->sem);
+	ret = usbduxfast_ai_stop(dev, 1);
+	up(&devpriv->sem);
 
 	return ret;
 }
@@ -306,32 +232,17 @@
  * analogue IN
  * interrupt service routine
  */
-static void usbduxfastsub_ai_Irq(struct urb *urb)
+static void usbduxfast_ai_interrupt(struct urb *urb)
 {
+	struct comedi_device *dev = urb->context;
+	struct comedi_subdevice *s = dev->read_subdev;
+	struct comedi_async *async = s->async;
+	struct usb_device *usb = comedi_to_usb_dev(dev);
+	struct usbduxfast_private *devpriv = dev->private;
 	int n, err;
-	struct usbduxfastsub_s *udfs;
-	struct comedi_device *this_comedidev;
-	struct comedi_subdevice *s;
 
-	/* sanity checks - is the urb there? */
-	if (!urb) {
-		pr_err("ao int-handler called with urb=NULL!\n");
-		return;
-	}
-	/* the context variable points to the subdevice */
-	this_comedidev = urb->context;
-	if (!this_comedidev) {
-		pr_err("urb context is a NULL pointer!\n");
-		return;
-	}
-	/* the private structure of the subdevice is usbduxfastsub_s */
-	udfs = this_comedidev->private;
-	if (!udfs) {
-		pr_err("private of comedi subdev is a NULL pointer!\n");
-		return;
-	}
 	/* are we running a command? */
-	if (unlikely(!udfs->ai_cmd_running)) {
+	if (unlikely(!devpriv->ai_cmd_running)) {
 		/*
 		 * not running a command
 		 * do not continue execution if no asynchronous command
@@ -340,13 +251,6 @@
 		return;
 	}
 
-	if (unlikely(!udfs->attached)) {
-		/* no comedi device there */
-		return;
-	}
-	/* subdevice which is the AD converter */
-	s = &this_comedidev->subdevices[SUBDEV_AD];
-
 	/* first we test if something unusual has just happened */
 	switch (urb->status) {
 	case 0:
@@ -361,189 +265,93 @@
 	case -ESHUTDOWN:
 	case -ECONNABORTED:
 		/* tell this comedi */
-		s->async->events |= COMEDI_CB_EOA;
-		s->async->events |= COMEDI_CB_ERROR;
-		comedi_event(udfs->comedidev, s);
+		async->events |= COMEDI_CB_EOA;
+		async->events |= COMEDI_CB_ERROR;
+		comedi_event(dev, s);
 		/* stop the transfer w/o unlink */
-		usbduxfast_ai_stop(udfs, 0);
+		usbduxfast_ai_stop(dev, 0);
 		return;
 
 	default:
 		pr_err("non-zero urb status received in ai intr context: %d\n",
 		       urb->status);
-		s->async->events |= COMEDI_CB_EOA;
-		s->async->events |= COMEDI_CB_ERROR;
-		comedi_event(udfs->comedidev, s);
-		usbduxfast_ai_stop(udfs, 0);
+		async->events |= COMEDI_CB_EOA;
+		async->events |= COMEDI_CB_ERROR;
+		comedi_event(dev, s);
+		usbduxfast_ai_stop(dev, 0);
 		return;
 	}
 
-	if (!udfs->ignore) {
-		if (!udfs->ai_continous) {
+	if (!devpriv->ignore) {
+		if (!devpriv->ai_continous) {
 			/* not continuous, fixed number of samples */
 			n = urb->actual_length / sizeof(uint16_t);
-			if (unlikely(udfs->ai_sample_count < n)) {
-				/*
-				 * we have send only a fraction of the bytes
-				 * received
-				 */
+			if (unlikely(devpriv->ai_sample_count < n)) {
+				unsigned int num_bytes;
+
+				/* partial sample received */
+				num_bytes = devpriv->ai_sample_count *
+					    sizeof(uint16_t);
 				cfc_write_array_to_buffer(s,
 							  urb->transfer_buffer,
-							  udfs->ai_sample_count
-							  * sizeof(uint16_t));
-				usbduxfast_ai_stop(udfs, 0);
+							  num_bytes);
+				usbduxfast_ai_stop(dev, 0);
 				/* tell comedi that the acquistion is over */
-				s->async->events |= COMEDI_CB_EOA;
-				comedi_event(udfs->comedidev, s);
+				async->events |= COMEDI_CB_EOA;
+				comedi_event(dev, s);
 				return;
 			}
-			udfs->ai_sample_count -= n;
+			devpriv->ai_sample_count -= n;
 		}
 		/* write the full buffer to comedi */
 		err = cfc_write_array_to_buffer(s, urb->transfer_buffer,
 						urb->actual_length);
 		if (unlikely(err == 0)) {
 			/* buffer overflow */
-			usbduxfast_ai_stop(udfs, 0);
+			usbduxfast_ai_stop(dev, 0);
 			return;
 		}
 
 		/* tell comedi that data is there */
-		comedi_event(udfs->comedidev, s);
-
+		comedi_event(dev, s);
 	} else {
 		/* ignore this packet */
-		udfs->ignore--;
+		devpriv->ignore--;
 	}
 
 	/*
 	 * command is still running
 	 * resubmit urb for BULK transfer
 	 */
-	urb->dev = udfs->usbdev;
+	urb->dev = usb;
 	urb->status = 0;
 	err = usb_submit_urb(urb, GFP_ATOMIC);
 	if (err < 0) {
-		dev_err(&urb->dev->dev,
+		dev_err(dev->class_dev,
 			"urb resubm failed: %d", err);
-		s->async->events |= COMEDI_CB_EOA;
-		s->async->events |= COMEDI_CB_ERROR;
-		comedi_event(udfs->comedidev, s);
-		usbduxfast_ai_stop(udfs, 0);
+		async->events |= COMEDI_CB_EOA;
+		async->events |= COMEDI_CB_ERROR;
+		comedi_event(dev, s);
+		usbduxfast_ai_stop(dev, 0);
 	}
 }
 
-static int usbduxfastsub_start(struct usbduxfastsub_s *udfs)
+static int usbduxfast_submit_urb(struct comedi_device *dev)
 {
-	int ret;
-	unsigned char *local_transfer_buffer;
-
-	local_transfer_buffer = kmalloc(1, GFP_KERNEL);
-	if (!local_transfer_buffer)
-		return -ENOMEM;
-
-	/* 7f92 to zero */
-	*local_transfer_buffer = 0;
-	/* bRequest, "Firmware" */
-	ret = usb_control_msg(udfs->usbdev, usb_sndctrlpipe(udfs->usbdev, 0),
-			      USBDUXFASTSUB_FIRMWARE,
-			      VENDOR_DIR_OUT,	  /* bmRequestType */
-			      USBDUXFASTSUB_CPUCS,    /* Value */
-			      0x0000,	/* Index */
-			      /* address of the transfer buffer */
-			      local_transfer_buffer,
-			      1,      /* Length */
-			      EZTIMEOUT);    /* Timeout */
-	if (ret < 0)
-		dev_err(&udfs->interface->dev,
-			"control msg failed (start)\n");
-
-	kfree(local_transfer_buffer);
-	return ret;
-}
-
-static int usbduxfastsub_stop(struct usbduxfastsub_s *udfs)
-{
-	int ret;
-	unsigned char *local_transfer_buffer;
-
-	local_transfer_buffer = kmalloc(1, GFP_KERNEL);
-	if (!local_transfer_buffer)
-		return -ENOMEM;
-
-	/* 7f92 to one */
-	*local_transfer_buffer = 1;
-	/* bRequest, "Firmware" */
-	ret = usb_control_msg(udfs->usbdev, usb_sndctrlpipe(udfs->usbdev, 0),
-			      USBDUXFASTSUB_FIRMWARE,
-			      VENDOR_DIR_OUT,	/* bmRequestType */
-			      USBDUXFASTSUB_CPUCS,	/* Value */
-			      0x0000,	/* Index */
-			      local_transfer_buffer, 1,	/* Length */
-			      EZTIMEOUT);	/* Timeout */
-	if (ret < 0)
-		dev_err(&udfs->interface->dev,
-			"control msg failed (stop)\n");
-
-	kfree(local_transfer_buffer);
-	return ret;
-}
-
-static int usbduxfastsub_upload(struct usbduxfastsub_s *udfs,
-				unsigned char *local_transfer_buffer,
-				unsigned int startAddr, unsigned int len)
-{
+	struct usb_device *usb = comedi_to_usb_dev(dev);
+	struct usbduxfast_private *devpriv = dev->private;
 	int ret;
 
-#ifdef CONFIG_COMEDI_DEBUG
-	printk(KERN_DEBUG "comedi: usbduxfast: uploading %d bytes", len);
-	printk(KERN_DEBUG " to addr %d, first byte=%d.\n",
-	       startAddr, local_transfer_buffer[0]);
-#endif
-	/* brequest, firmware */
-	ret = usb_control_msg(udfs->usbdev, usb_sndctrlpipe(udfs->usbdev, 0),
-			      USBDUXFASTSUB_FIRMWARE,
-			      VENDOR_DIR_OUT,	/* bmRequestType */
-			      startAddr,	/* value */
-			      0x0000,	 /* index */
-			      /* our local safe buffer */
-			      local_transfer_buffer,
-			      len,	/* length */
-			      EZTIMEOUT);      /* timeout */
-
-#ifdef CONFIG_COMEDI_DEBUG
-	printk(KERN_DEBUG "comedi_: usbduxfast: result=%d\n", ret);
-#endif
-
-	if (ret < 0) {
-		dev_err(&udfs->interface->dev, "uppload failed\n");
-		return ret;
-	}
-
-	return 0;
-}
-
-static int usbduxfastsub_submit_InURBs(struct usbduxfastsub_s *udfs)
-{
-	int ret;
-
-	if (!udfs)
+	if (!devpriv)
 		return -EFAULT;
 
-	usb_fill_bulk_urb(udfs->urbIn, udfs->usbdev,
-			  usb_rcvbulkpipe(udfs->usbdev, BULKINEP),
-			  udfs->transfer_buffer,
-			  SIZEINBUF, usbduxfastsub_ai_Irq, udfs->comedidev);
+	usb_fill_bulk_urb(devpriv->urb, usb, usb_rcvbulkpipe(usb, BULKINEP),
+			  devpriv->inbuf, SIZEINBUF,
+			  usbduxfast_ai_interrupt, dev);
 
-#ifdef CONFIG_COMEDI_DEBUG
-	printk(KERN_DEBUG "comedi%d: usbduxfast: submitting in-urb: "
-	       "0x%p,0x%p\n", udfs->comedidev->minor, udfs->urbIn->context,
-	       udfs->urbIn->dev);
-#endif
-	ret = usb_submit_urb(udfs->urbIn, GFP_ATOMIC);
+	ret = usb_submit_urb(devpriv->urb, GFP_ATOMIC);
 	if (ret) {
-		dev_err(&udfs->interface->dev,
-			"ai: usb_submit_urb error %d\n", ret);
+		dev_err(dev->class_dev, "usb_submit_urb error %d\n", ret);
 		return ret;
 	}
 	return 0;
@@ -553,13 +361,9 @@
 				 struct comedi_subdevice *s,
 				 struct comedi_cmd *cmd)
 {
-	struct usbduxfastsub_s *udfs = dev->private;
 	int err = 0;
 	long int steps, tmp;
-	int minSamplPer;
-
-	if (!udfs->probed)
-		return -ENODEV;
+	int min_sample_period;
 
 	/* Step 1 : check if triggers are trivially valid */
 
@@ -601,14 +405,14 @@
 	err |= cfc_check_trigger_arg_is(&cmd->scan_end_arg, cmd->chanlist_len);
 
 	if (cmd->chanlist_len == 1)
-		minSamplPer = 1;
+		min_sample_period = 1;
 	else
-		minSamplPer = MIN_SAMPLING_PERIOD;
+		min_sample_period = MIN_SAMPLING_PERIOD;
 
 	if (cmd->convert_src == TRIG_TIMER) {
 		steps = cmd->convert_arg * 30;
-		if (steps < (minSamplPer * 1000))
-			steps = minSamplPer * 1000;
+		if (steps < (min_sample_period * 1000))
+			steps = min_sample_period * 1000;
 
 		if (steps > (MAX_SAMPLING_PERIOD * 1000))
 			steps = MAX_SAMPLING_PERIOD * 1000;
@@ -650,80 +454,53 @@
 				 struct comedi_subdevice *s,
 				 unsigned int trignum)
 {
+	struct usbduxfast_private *devpriv = dev->private;
 	int ret;
-	struct usbduxfastsub_s *udfs = dev->private;
 
-	if (!udfs)
+	if (!devpriv)
 		return -EFAULT;
 
-	down(&udfs->sem);
-	if (!udfs->probed) {
-		up(&udfs->sem);
-		return -ENODEV;
-	}
-#ifdef CONFIG_COMEDI_DEBUG
-	printk(KERN_DEBUG "comedi%d: usbduxfast_ai_inttrig\n", dev->minor);
-#endif
+	down(&devpriv->sem);
 
 	if (trignum != 0) {
-		dev_err(dev->class_dev, "%s: invalid trignum\n", __func__);
-		up(&udfs->sem);
+		dev_err(dev->class_dev, "invalid trignum\n");
+		up(&devpriv->sem);
 		return -EINVAL;
 	}
-	if (!udfs->ai_cmd_running) {
-		udfs->ai_cmd_running = 1;
-		ret = usbduxfastsub_submit_InURBs(udfs);
+	if (!devpriv->ai_cmd_running) {
+		devpriv->ai_cmd_running = 1;
+		ret = usbduxfast_submit_urb(dev);
 		if (ret < 0) {
-			dev_err(dev->class_dev,
-				"%s: urbSubmit: err=%d\n", __func__, ret);
-			udfs->ai_cmd_running = 0;
-			up(&udfs->sem);
+			dev_err(dev->class_dev, "urbSubmit: err=%d\n", ret);
+			devpriv->ai_cmd_running = 0;
+			up(&devpriv->sem);
 			return ret;
 		}
 		s->async->inttrig = NULL;
 	} else {
-		dev_err(dev->class_dev,
-			"ai_inttrig but acqu is already running\n");
+		dev_err(dev->class_dev, "ai is already running\n");
 	}
-	up(&udfs->sem);
+	up(&devpriv->sem);
 	return 1;
 }
 
-/*
- * offsets for the GPIF bytes
- * the first byte is the command byte
- */
-#define LENBASE	(1+0x00)
-#define OPBASE	(1+0x08)
-#define OUTBASE	(1+0x10)
-#define LOGBASE	(1+0x18)
-
 static int usbduxfast_ai_cmd(struct comedi_device *dev,
 			     struct comedi_subdevice *s)
 {
+	struct usbduxfast_private *devpriv = dev->private;
 	struct comedi_cmd *cmd = &s->async->cmd;
 	unsigned int chan, gain, rngmask = 0xff;
 	int i, j, ret;
-	struct usbduxfastsub_s *udfs;
 	int result;
 	long steps, steps_tmp;
 
-#ifdef CONFIG_COMEDI_DEBUG
-	printk(KERN_DEBUG "comedi%d: usbduxfast_ai_cmd\n", dev->minor);
-#endif
-	udfs = dev->private;
-	if (!udfs)
+	if (!devpriv)
 		return -EFAULT;
 
-	down(&udfs->sem);
-	if (!udfs->probed) {
-		up(&udfs->sem);
-		return -ENODEV;
-	}
-	if (udfs->ai_cmd_running) {
-		dev_err(dev->class_dev,
-			"ai_cmd not possible. Another ai_cmd is running.\n");
-		up(&udfs->sem);
+	down(&devpriv->sem);
+	if (devpriv->ai_cmd_running) {
+		dev_err(dev->class_dev, "ai_cmd not possible\n");
+		up(&devpriv->sem);
 		return -EBUSY;
 	}
 	/* set current channel of the running acquisition to zero */
@@ -733,7 +510,7 @@
 	 * ignore the first buffers from the device if there
 	 * is an error condition
 	 */
-	udfs->ignore = PACKETS_TO_IGNORE;
+	devpriv->ignore = PACKETS_TO_IGNORE;
 
 	if (cmd->chanlist_len > 0) {
 		gain = CR_RANGE(cmd->chanlist[0]);
@@ -741,20 +518,19 @@
 			chan = CR_CHAN(cmd->chanlist[i]);
 			if (chan != i) {
 				dev_err(dev->class_dev,
-					"cmd is accepting only consecutive channels.\n");
-				up(&udfs->sem);
+					"channels are not consecutive\n");
+				up(&devpriv->sem);
 				return -EINVAL;
 			}
 			if ((gain != CR_RANGE(cmd->chanlist[i]))
 			    && (cmd->chanlist_len > 3)) {
 				dev_err(dev->class_dev,
-					"the gain must be the same for all channels.\n");
-				up(&udfs->sem);
+					"gain must be the same for all channels\n");
+				up(&devpriv->sem);
 				return -EINVAL;
 			}
 			if (i >= NUMCHANNELS) {
-				dev_err(dev->class_dev,
-					"channel list too long\n");
+				dev_err(dev->class_dev, "chanlist too long\n");
 				break;
 			}
 		}
@@ -762,8 +538,8 @@
 	steps = 0;
 	if (cmd->scan_begin_src == TRIG_TIMER) {
 		dev_err(dev->class_dev,
-			"scan_begin_src==TRIG_TIMER not valid.\n");
-		up(&udfs->sem);
+			"scan_begin_src==TRIG_TIMER not valid\n");
+		up(&devpriv->sem);
 		return -EINVAL;
 	}
 	if (cmd->convert_src == TRIG_TIMER)
@@ -771,27 +547,23 @@
 
 	if ((steps < MIN_SAMPLING_PERIOD) && (cmd->chanlist_len != 1)) {
 		dev_err(dev->class_dev,
-			"ai_cmd: steps=%ld, scan_begin_arg=%d. Not properly tested by cmdtest?\n",
+			"steps=%ld, scan_begin_arg=%d. Not properly tested by cmdtest?\n",
 			steps, cmd->scan_begin_arg);
-		up(&udfs->sem);
+		up(&devpriv->sem);
 		return -EINVAL;
 	}
 	if (steps > MAX_SAMPLING_PERIOD) {
-		dev_err(dev->class_dev, "ai_cmd: sampling rate too low.\n");
-		up(&udfs->sem);
+		dev_err(dev->class_dev, "sampling rate too low\n");
+		up(&devpriv->sem);
 		return -EINVAL;
 	}
 	if ((cmd->start_src == TRIG_EXT) && (cmd->chanlist_len != 1)
 	    && (cmd->chanlist_len != 16)) {
 		dev_err(dev->class_dev,
-			"ai_cmd: TRIG_EXT only with 1 or 16 channels possible.\n");
-		up(&udfs->sem);
+			"TRIG_EXT only with 1 or 16 channels possible\n");
+		up(&devpriv->sem);
 		return -EINVAL;
 	}
-#ifdef CONFIG_COMEDI_DEBUG
-	printk(KERN_DEBUG "comedi%d: usbduxfast: steps=%ld, convert_arg=%u\n",
-	       dev->minor, steps, cmd->convert_arg);
-#endif
 
 	switch (cmd->chanlist_len) {
 	case 1:
@@ -812,17 +584,11 @@
 		/* we loop here until ready has been set */
 		if (cmd->start_src == TRIG_EXT) {
 			/* branch back to state 0 */
-			udfs->dux_commands[LENBASE + 0] = 0x01;
 			/* deceision state w/o data */
-			udfs->dux_commands[OPBASE + 0] = 0x01;
-			udfs->dux_commands[OUTBASE + 0] = 0xFF & rngmask;
 			/* RDY0 = 0 */
-			udfs->dux_commands[LOGBASE + 0] = 0x00;
+			usbduxfast_cmd_data(dev, 0, 0x01, 0x01, rngmask, 0x00);
 		} else {	/* we just proceed to state 1 */
-			udfs->dux_commands[LENBASE + 0] = 1;
-			udfs->dux_commands[OPBASE + 0] = 0;
-			udfs->dux_commands[OUTBASE + 0] = 0xFF & rngmask;
-			udfs->dux_commands[LOGBASE + 0] = 0;
+			usbduxfast_cmd_data(dev, 0, 0x01, 0x00, rngmask, 0x00);
 		}
 
 		if (steps < MIN_SAMPLING_PERIOD) {
@@ -835,33 +601,25 @@
 				 */
 
 				/* branch back to state 1 */
-				udfs->dux_commands[LENBASE + 1] = 0x89;
 				/* deceision state with data */
-				udfs->dux_commands[OPBASE + 1] = 0x03;
-				udfs->dux_commands[OUTBASE + 1] =
-				    0xFF & rngmask;
 				/* doesn't matter */
-				udfs->dux_commands[LOGBASE + 1] = 0xFF;
+				usbduxfast_cmd_data(dev, 1,
+						    0x89, 0x03, rngmask, 0xff);
 			} else {
 				/*
 				 * we loop through two states: data and delay
 				 * max rate is 15MHz
 				 */
-				udfs->dux_commands[LENBASE + 1] = steps - 1;
 				/* data */
-				udfs->dux_commands[OPBASE + 1] = 0x02;
-				udfs->dux_commands[OUTBASE + 1] =
-				    0xFF & rngmask;
 				/* doesn't matter */
-				udfs->dux_commands[LOGBASE + 1] = 0;
+				usbduxfast_cmd_data(dev, 1, steps - 1,
+						    0x02, rngmask, 0x00);
+
 				/* branch back to state 1 */
-				udfs->dux_commands[LENBASE + 2] = 0x09;
 				/* deceision state w/o data */
-				udfs->dux_commands[OPBASE + 2] = 0x01;
-				udfs->dux_commands[OUTBASE + 2] =
-				    0xFF & rngmask;
 				/* doesn't matter */
-				udfs->dux_commands[LOGBASE + 2] = 0xFF;
+				usbduxfast_cmd_data(dev, 2,
+						    0x09, 0x01, rngmask, 0xff);
 			}
 		} else {
 			/*
@@ -873,26 +631,20 @@
 			steps = steps - 1;
 
 			/* do the first part of the delay */
-			udfs->dux_commands[LENBASE + 1] = steps / 2;
-			udfs->dux_commands[OPBASE + 1] = 0;
-			udfs->dux_commands[OUTBASE + 1] = 0xFF & rngmask;
-			udfs->dux_commands[LOGBASE + 1] = 0;
+			usbduxfast_cmd_data(dev, 1,
+					    steps / 2, 0x00, rngmask, 0x00);
 
 			/* and the second part */
-			udfs->dux_commands[LENBASE + 2] = steps - steps / 2;
-			udfs->dux_commands[OPBASE + 2] = 0;
-			udfs->dux_commands[OUTBASE + 2] = 0xFF & rngmask;
-			udfs->dux_commands[LOGBASE + 2] = 0;
+			usbduxfast_cmd_data(dev, 2, steps - steps / 2,
+					    0x00, rngmask, 0x00);
 
 			/* get the data and branch back */
 
 			/* branch back to state 1 */
-			udfs->dux_commands[LENBASE + 3] = 0x09;
 			/* deceision state w data */
-			udfs->dux_commands[OPBASE + 3] = 0x03;
-			udfs->dux_commands[OUTBASE + 3] = 0xFF & rngmask;
 			/* doesn't matter */
-			udfs->dux_commands[LOGBASE + 3] = 0xFF;
+			usbduxfast_cmd_data(dev, 3,
+					    0x09, 0x03, rngmask, 0xff);
 		}
 		break;
 
@@ -907,11 +659,8 @@
 		else
 			rngmask = 0xff;
 
-		udfs->dux_commands[LENBASE + 0] = 1;
 		/* data */
-		udfs->dux_commands[OPBASE + 0] = 0x02;
-		udfs->dux_commands[OUTBASE + 0] = 0xFF & rngmask;
-		udfs->dux_commands[LOGBASE + 0] = 0;
+		usbduxfast_cmd_data(dev, 0, 0x01, 0x02, rngmask, 0x00);
 
 		/* we have 1 state with duration 1: state 0 */
 		steps_tmp = steps - 1;
@@ -922,23 +671,16 @@
 			rngmask = 0xff;
 
 		/* do the first part of the delay */
-		udfs->dux_commands[LENBASE + 1] = steps_tmp / 2;
-		udfs->dux_commands[OPBASE + 1] = 0;
 		/* count */
-		udfs->dux_commands[OUTBASE + 1] = 0xFE & rngmask;
-		udfs->dux_commands[LOGBASE + 1] = 0;
+		usbduxfast_cmd_data(dev, 1, steps_tmp / 2,
+				    0x00, 0xfe & rngmask, 0x00);
 
 		/* and the second part */
-		udfs->dux_commands[LENBASE + 2] = steps_tmp - steps_tmp / 2;
-		udfs->dux_commands[OPBASE + 2] = 0;
-		udfs->dux_commands[OUTBASE + 2] = 0xFF & rngmask;
-		udfs->dux_commands[LOGBASE + 2] = 0;
+		usbduxfast_cmd_data(dev, 2, steps_tmp  - steps_tmp / 2,
+				    0x00, rngmask, 0x00);
 
-		udfs->dux_commands[LENBASE + 3] = 1;
 		/* data */
-		udfs->dux_commands[OPBASE + 3] = 0x02;
-		udfs->dux_commands[OUTBASE + 3] = 0xFF & rngmask;
-		udfs->dux_commands[LOGBASE + 3] = 0;
+		usbduxfast_cmd_data(dev, 3, 0x01, 0x02, rngmask, 0x00);
 
 		/*
 		 * we have 2 states with duration 1: step 6 and
@@ -952,22 +694,15 @@
 			rngmask = 0xff;
 
 		/* do the first part of the delay */
-		udfs->dux_commands[LENBASE + 4] = steps_tmp / 2;
-		udfs->dux_commands[OPBASE + 4] = 0;
 		/* reset */
-		udfs->dux_commands[OUTBASE + 4] = (0xFF - 0x02) & rngmask;
-		udfs->dux_commands[LOGBASE + 4] = 0;
+		usbduxfast_cmd_data(dev, 4, steps_tmp / 2,
+				    0x00, (0xff - 0x02) & rngmask, 0x00);
 
 		/* and the second part */
-		udfs->dux_commands[LENBASE + 5] = steps_tmp - steps_tmp / 2;
-		udfs->dux_commands[OPBASE + 5] = 0;
-		udfs->dux_commands[OUTBASE + 5] = 0xFF & rngmask;
-		udfs->dux_commands[LOGBASE + 5] = 0;
+		usbduxfast_cmd_data(dev, 5, steps_tmp - steps_tmp / 2,
+				    0x00, rngmask, 0x00);
 
-		udfs->dux_commands[LENBASE + 6] = 1;
-		udfs->dux_commands[OPBASE + 6] = 0;
-		udfs->dux_commands[OUTBASE + 6] = 0xFF & rngmask;
-		udfs->dux_commands[LOGBASE + 6] = 0;
+		usbduxfast_cmd_data(dev, 6, 0x01, 0x00, rngmask, 0x00);
 		break;
 
 	case 3:
@@ -975,6 +710,8 @@
 		 * three channels
 		 */
 		for (j = 0; j < 1; j++) {
+			int index = j * 2;
+
 			if (CR_RANGE(cmd->chanlist[j]) > 0)
 				rngmask = 0xff - 0x04;
 			else
@@ -983,12 +720,10 @@
 			 * commit data to the FIFO and do the first part
 			 * of the delay
 			 */
-			udfs->dux_commands[LENBASE + j * 2] = steps / 2;
 			/* data */
-			udfs->dux_commands[OPBASE + j * 2] = 0x02;
 			/* no change */
-			udfs->dux_commands[OUTBASE + j * 2] = 0xFF & rngmask;
-			udfs->dux_commands[LOGBASE + j * 2] = 0;
+			usbduxfast_cmd_data(dev, index, steps / 2,
+					    0x02, rngmask, 0x00);
 
 			if (CR_RANGE(cmd->chanlist[j + 1]) > 0)
 				rngmask = 0xff - 0x04;
@@ -996,25 +731,19 @@
 				rngmask = 0xff;
 
 			/* do the second part of the delay */
-			udfs->dux_commands[LENBASE + j * 2 + 1] =
-			    steps - steps / 2;
 			/* no data */
-			udfs->dux_commands[OPBASE + j * 2 + 1] = 0;
 			/* count */
-			udfs->dux_commands[OUTBASE + j * 2 + 1] =
-			    0xFE & rngmask;
-			udfs->dux_commands[LOGBASE + j * 2 + 1] = 0;
+			usbduxfast_cmd_data(dev, index + 1, steps - steps / 2,
+					    0x00, 0xfe & rngmask, 0x00);
 		}
 
 		/* 2 steps with duration 1: the idele step and step 6: */
 		steps_tmp = steps - 2;
 
 		/* commit data to the FIFO and do the first part of the delay */
-		udfs->dux_commands[LENBASE + 4] = steps_tmp / 2;
 		/* data */
-		udfs->dux_commands[OPBASE + 4] = 0x02;
-		udfs->dux_commands[OUTBASE + 4] = 0xFF & rngmask;
-		udfs->dux_commands[LOGBASE + 4] = 0;
+		usbduxfast_cmd_data(dev, 4, steps_tmp / 2,
+				    0x02, rngmask, 0x00);
 
 		if (CR_RANGE(cmd->chanlist[0]) > 0)
 			rngmask = 0xff - 0x04;
@@ -1022,17 +751,12 @@
 			rngmask = 0xff;
 
 		/* do the second part of the delay */
-		udfs->dux_commands[LENBASE + 5] = steps_tmp - steps_tmp / 2;
 		/* no data */
-		udfs->dux_commands[OPBASE + 5] = 0;
 		/* reset */
-		udfs->dux_commands[OUTBASE + 5] = (0xFF - 0x02) & rngmask;
-		udfs->dux_commands[LOGBASE + 5] = 0;
+		usbduxfast_cmd_data(dev, 5, steps_tmp - steps_tmp / 2,
+				    0x00, (0xff - 0x02) & rngmask, 0x00);
 
-		udfs->dux_commands[LENBASE + 6] = 1;
-		udfs->dux_commands[OPBASE + 6] = 0;
-		udfs->dux_commands[OUTBASE + 6] = 0xFF & rngmask;
-		udfs->dux_commands[LOGBASE + 6] = 0;
+		usbduxfast_cmd_data(dev, 6, 0x01, 0x00, rngmask, 0x00);
 
 	case 16:
 		if (CR_RANGE(cmd->chanlist[0]) > 0)
@@ -1046,101 +770,79 @@
 			 */
 
 			/* branch back to state 0 */
-			udfs->dux_commands[LENBASE + 0] = 0x01;
 			/* deceision state w/o data */
-			udfs->dux_commands[OPBASE + 0] = 0x01;
 			/* reset */
-			udfs->dux_commands[OUTBASE + 0] =
-			    (0xFF - 0x02) & rngmask;
 			/* RDY0 = 0 */
-			udfs->dux_commands[LOGBASE + 0] = 0x00;
+			usbduxfast_cmd_data(dev, 0, 0x01, 0x01,
+					    (0xff - 0x02) & rngmask, 0x00);
 		} else {
 			/*
 			 * we just proceed to state 1
 			 */
 
 			/* 30us reset pulse */
-			udfs->dux_commands[LENBASE + 0] = 255;
-			udfs->dux_commands[OPBASE + 0] = 0;
 			/* reset */
-			udfs->dux_commands[OUTBASE + 0] =
-			    (0xFF - 0x02) & rngmask;
-			udfs->dux_commands[LOGBASE + 0] = 0;
+			usbduxfast_cmd_data(dev, 0, 0xff, 0x00,
+					    (0xff - 0x02) & rngmask, 0x00);
 		}
 
 		/* commit data to the FIFO */
-		udfs->dux_commands[LENBASE + 1] = 1;
 		/* data */
-		udfs->dux_commands[OPBASE + 1] = 0x02;
-		udfs->dux_commands[OUTBASE + 1] = 0xFF & rngmask;
-		udfs->dux_commands[LOGBASE + 1] = 0;
+		usbduxfast_cmd_data(dev, 1, 0x01, 0x02, rngmask, 0x00);
 
 		/* we have 2 states with duration 1 */
 		steps = steps - 2;
 
 		/* do the first part of the delay */
-		udfs->dux_commands[LENBASE + 2] = steps / 2;
-		udfs->dux_commands[OPBASE + 2] = 0;
-		udfs->dux_commands[OUTBASE + 2] = 0xFE & rngmask;
-		udfs->dux_commands[LOGBASE + 2] = 0;
+		usbduxfast_cmd_data(dev, 2, steps / 2,
+				    0x00, 0xfe & rngmask, 0x00);
 
 		/* and the second part */
-		udfs->dux_commands[LENBASE + 3] = steps - steps / 2;
-		udfs->dux_commands[OPBASE + 3] = 0;
-		udfs->dux_commands[OUTBASE + 3] = 0xFF & rngmask;
-		udfs->dux_commands[LOGBASE + 3] = 0;
+		usbduxfast_cmd_data(dev, 3, steps - steps / 2,
+				    0x00, rngmask, 0x00);
 
 		/* branch back to state 1 */
-		udfs->dux_commands[LENBASE + 4] = 0x09;
 		/* deceision state w/o data */
-		udfs->dux_commands[OPBASE + 4] = 0x01;
-		udfs->dux_commands[OUTBASE + 4] = 0xFF & rngmask;
 		/* doesn't matter */
-		udfs->dux_commands[LOGBASE + 4] = 0xFF;
+		usbduxfast_cmd_data(dev, 4, 0x09, 0x01, rngmask, 0xff);
 
 		break;
 
 	default:
 		dev_err(dev->class_dev, "unsupported combination of channels\n");
-		up(&udfs->sem);
+		up(&devpriv->sem);
 		return -EFAULT;
 	}
 
-#ifdef CONFIG_COMEDI_DEBUG
-	printk(KERN_DEBUG "comedi %d: sending commands to the usb device\n",
-	       dev->minor);
-#endif
 	/* 0 means that the AD commands are sent */
-	result = send_dux_commands(udfs, SENDADCOMMANDS);
+	result = usbduxfast_send_cmd(dev, SENDADCOMMANDS);
 	if (result < 0) {
-		dev_err(dev->class_dev,
-			"adc command could not be submitted. Aborting...\n");
-		up(&udfs->sem);
+		up(&devpriv->sem);
 		return result;
 	}
 	if (cmd->stop_src == TRIG_COUNT) {
-		udfs->ai_sample_count = cmd->stop_arg * cmd->scan_end_arg;
-		if (udfs->ai_sample_count < 1) {
+		devpriv->ai_sample_count = cmd->stop_arg * cmd->scan_end_arg;
+		if (devpriv->ai_sample_count < 1) {
 			dev_err(dev->class_dev,
-				"(cmd->stop_arg)*(cmd->scan_end_arg)<1, aborting.\n");
-			up(&udfs->sem);
+				"(cmd->stop_arg)*(cmd->scan_end_arg)<1, aborting\n");
+			up(&devpriv->sem);
 			return -EFAULT;
 		}
-		udfs->ai_continous = 0;
+		devpriv->ai_continous = 0;
 	} else {
 		/* continous acquisition */
-		udfs->ai_continous = 1;
-		udfs->ai_sample_count = 0;
+		devpriv->ai_continous = 1;
+		devpriv->ai_sample_count = 0;
 	}
 
 	if ((cmd->start_src == TRIG_NOW) || (cmd->start_src == TRIG_EXT)) {
 		/* enable this acquisition operation */
-		udfs->ai_cmd_running = 1;
-		ret = usbduxfastsub_submit_InURBs(udfs);
+		devpriv->ai_cmd_running = 1;
+		ret = usbduxfast_submit_urb(dev);
 		if (ret < 0) {
-			udfs->ai_cmd_running = 0;
+			devpriv->ai_cmd_running = 0;
 			/* fixme: unlink here?? */
-			up(&udfs->sem);
+			up(&devpriv->sem);
 			return ret;
 		}
 		s->async->inttrig = NULL;
@@ -1152,7 +854,7 @@
 		 */
 		s->async->inttrig = usbduxfast_ai_inttrig;
 	}
-	up(&udfs->sem);
+	up(&devpriv->sem);
 
 	return 0;
 }
@@ -1162,309 +864,270 @@
  */
 static int usbduxfast_ai_insn_read(struct comedi_device *dev,
 				   struct comedi_subdevice *s,
-				   struct comedi_insn *insn, unsigned int *data)
+				   struct comedi_insn *insn,
+				   unsigned int *data)
 {
+	struct usb_device *usb = comedi_to_usb_dev(dev);
+	struct usbduxfast_private *devpriv = dev->private;
+	unsigned int chan = CR_CHAN(insn->chanspec);
+	unsigned int range = CR_RANGE(insn->chanspec);
+	uint8_t rngmask = range ? (0xff - 0x04) : 0xff;
 	int i, j, n, actual_length;
-	int chan, range, rngmask;
-	int err;
-	struct usbduxfastsub_s *udfs;
+	int ret;
 
-	udfs = dev->private;
-	if (!udfs) {
-		dev_err(dev->class_dev, "%s: no usb dev.\n", __func__);
-		return -ENODEV;
-	}
-#ifdef CONFIG_COMEDI_DEBUG
-	printk(KERN_DEBUG "comedi%d: ai_insn_read, insn->n=%d, "
-	       "insn->subdev=%d\n", dev->minor, insn->n, insn->subdev);
-#endif
-	down(&udfs->sem);
-	if (!udfs->probed) {
-		up(&udfs->sem);
-		return -ENODEV;
-	}
-	if (udfs->ai_cmd_running) {
+	down(&devpriv->sem);
+
+	if (devpriv->ai_cmd_running) {
 		dev_err(dev->class_dev,
-			"ai_insn_read not possible. Async Command is running.\n");
-		up(&udfs->sem);
+			"ai_insn_read not possible, async cmd is running\n");
+		up(&devpriv->sem);
 		return -EBUSY;
 	}
-	/* sample one channel */
-	chan = CR_CHAN(insn->chanspec);
-	range = CR_RANGE(insn->chanspec);
+
 	/* set command for the first channel */
 
-	if (range > 0)
-		rngmask = 0xff - 0x04;
-	else
-		rngmask = 0xff;
-
 	/* commit data to the FIFO */
-	udfs->dux_commands[LENBASE + 0] = 1;
 	/* data */
-	udfs->dux_commands[OPBASE + 0] = 0x02;
-	udfs->dux_commands[OUTBASE + 0] = 0xFF & rngmask;
-	udfs->dux_commands[LOGBASE + 0] = 0;
+	usbduxfast_cmd_data(dev, 0, 0x01, 0x02, rngmask, 0x00);
 
 	/* do the first part of the delay */
-	udfs->dux_commands[LENBASE + 1] = 12;
-	udfs->dux_commands[OPBASE + 1] = 0;
-	udfs->dux_commands[OUTBASE + 1] = 0xFE & rngmask;
-	udfs->dux_commands[LOGBASE + 1] = 0;
-
-	udfs->dux_commands[LENBASE + 2] = 1;
-	udfs->dux_commands[OPBASE + 2] = 0;
-	udfs->dux_commands[OUTBASE + 2] = 0xFE & rngmask;
-	udfs->dux_commands[LOGBASE + 2] = 0;
-
-	udfs->dux_commands[LENBASE + 3] = 1;
-	udfs->dux_commands[OPBASE + 3] = 0;
-	udfs->dux_commands[OUTBASE + 3] = 0xFE & rngmask;
-	udfs->dux_commands[LOGBASE + 3] = 0;
-
-	udfs->dux_commands[LENBASE + 4] = 1;
-	udfs->dux_commands[OPBASE + 4] = 0;
-	udfs->dux_commands[OUTBASE + 4] = 0xFE & rngmask;
-	udfs->dux_commands[LOGBASE + 4] = 0;
+	usbduxfast_cmd_data(dev, 1, 0x0c, 0x00, 0xfe & rngmask, 0x00);
+	usbduxfast_cmd_data(dev, 2, 0x01, 0x00, 0xfe & rngmask, 0x00);
+	usbduxfast_cmd_data(dev, 3, 0x01, 0x00, 0xfe & rngmask, 0x00);
+	usbduxfast_cmd_data(dev, 4, 0x01, 0x00, 0xfe & rngmask, 0x00);
 
 	/* second part */
-	udfs->dux_commands[LENBASE + 5] = 12;
-	udfs->dux_commands[OPBASE + 5] = 0;
-	udfs->dux_commands[OUTBASE + 5] = 0xFF & rngmask;
-	udfs->dux_commands[LOGBASE + 5] = 0;
+	usbduxfast_cmd_data(dev, 5, 0x0c, 0x00, rngmask, 0x00);
+	usbduxfast_cmd_data(dev, 6, 0x01, 0x00, rngmask, 0x00);
 
-	udfs->dux_commands[LENBASE + 6] = 1;
-	udfs->dux_commands[OPBASE + 6] = 0;
-	udfs->dux_commands[OUTBASE + 6] = 0xFF & rngmask;
-	udfs->dux_commands[LOGBASE + 0] = 0;
-
-#ifdef CONFIG_COMEDI_DEBUG
-	printk(KERN_DEBUG "comedi %d: sending commands to the usb device\n",
-	       dev->minor);
-#endif
-	/* 0 means that the AD commands are sent */
-	err = send_dux_commands(udfs, SENDADCOMMANDS);
-	if (err < 0) {
-		dev_err(dev->class_dev,
-			"adc command could not be submitted. Aborting...\n");
-		up(&udfs->sem);
-		return err;
+	ret = usbduxfast_send_cmd(dev, SENDADCOMMANDS);
+	if (ret < 0) {
+		up(&devpriv->sem);
+		return ret;
 	}
-#ifdef CONFIG_COMEDI_DEBUG
-	printk(KERN_DEBUG "comedi%d: usbduxfast: submitting in-urb: "
-	       "0x%p,0x%p\n", udfs->comedidev->minor, udfs->urbIn->context,
-	       udfs->urbIn->dev);
-#endif
+
 	for (i = 0; i < PACKETS_TO_IGNORE; i++) {
-		err = usb_bulk_msg(udfs->usbdev,
-				   usb_rcvbulkpipe(udfs->usbdev, BULKINEP),
-				   udfs->transfer_buffer, SIZEINBUF,
+		ret = usb_bulk_msg(usb, usb_rcvbulkpipe(usb, BULKINEP),
+				   devpriv->inbuf, SIZEINBUF,
 				   &actual_length, 10000);
-		if (err < 0) {
-			dev_err(dev->class_dev, "insn timeout. No data.\n");
-			up(&udfs->sem);
-			return err;
+		if (ret < 0) {
+			dev_err(dev->class_dev, "insn timeout, no data\n");
+			up(&devpriv->sem);
+			return ret;
 		}
 	}
-	/* data points */
+
 	for (i = 0; i < insn->n;) {
-		err = usb_bulk_msg(udfs->usbdev,
-				   usb_rcvbulkpipe(udfs->usbdev, BULKINEP),
-				   udfs->transfer_buffer, SIZEINBUF,
+		ret = usb_bulk_msg(usb, usb_rcvbulkpipe(usb, BULKINEP),
+				   devpriv->inbuf, SIZEINBUF,
 				   &actual_length, 10000);
-		if (err < 0) {
-			dev_err(dev->class_dev, "insn data error: %d\n", err);
-			up(&udfs->sem);
-			return err;
+		if (ret < 0) {
+			dev_err(dev->class_dev, "insn data error: %d\n", ret);
+			up(&devpriv->sem);
+			return ret;
 		}
 		n = actual_length / sizeof(uint16_t);
 		if ((n % 16) != 0) {
-			dev_err(dev->class_dev, "insn data packet corrupted.\n");
-			up(&udfs->sem);
+			dev_err(dev->class_dev, "insn data packet corrupted\n");
+			up(&devpriv->sem);
 			return -EINVAL;
 		}
 		for (j = chan; (j < n) && (i < insn->n); j = j + 16) {
-			data[i] = ((uint16_t *) (udfs->transfer_buffer))[j];
+			data[i] = ((uint16_t *) (devpriv->inbuf))[j];
 			i++;
 		}
 	}
-	up(&udfs->sem);
-	return i;
+
+	up(&devpriv->sem);
+
+	return insn->n;
 }
 
-#define FIRMWARE_MAX_LEN 0x2000
-
-static int firmwareUpload(struct usbduxfastsub_s *usbduxfastsub,
-			  const u8 *firmwareBinary, int sizeFirmware)
+static int usbduxfast_attach_common(struct comedi_device *dev)
 {
+	struct usbduxfast_private *devpriv = dev->private;
+	struct comedi_subdevice *s;
 	int ret;
-	uint8_t *fwBuf;
 
-	if (!firmwareBinary)
+	down(&devpriv->sem);
+
+	ret = comedi_alloc_subdevices(dev, 1);
+	if (ret) {
+		up(&devpriv->sem);
+		return ret;
+	}
+
+	/* Analog Input subdevice */
+	s = &dev->subdevices[0];
+	dev->read_subdev = s;
+	s->type		= COMEDI_SUBD_AI;
+	s->subdev_flags	= SDF_READABLE | SDF_GROUND | SDF_CMD_READ;
+	s->n_chan	= 16;
+	s->len_chanlist	= 16;
+	s->insn_read	= usbduxfast_ai_insn_read;
+	s->do_cmdtest	= usbduxfast_ai_cmdtest;
+	s->do_cmd	= usbduxfast_ai_cmd;
+	s->cancel	= usbduxfast_ai_cancel;
+	s->maxdata	= 0x1000;
+	s->range_table	= &range_usbduxfast_ai_range;
+
+	up(&devpriv->sem);
+
+	return 0;
+}
+
+static int usbduxfast_upload_firmware(struct comedi_device *dev,
+				      const u8 *data, size_t size,
+				      unsigned long context)
+{
+	struct usb_device *usb = comedi_to_usb_dev(dev);
+	uint8_t *buf;
+	unsigned char *tmp;
+	int ret;
+
+	if (!data)
 		return 0;
 
-	if (sizeFirmware > FIRMWARE_MAX_LEN) {
-		dev_err(&usbduxfastsub->interface->dev,
-			"comedi_: usbduxfast firmware binary it too large for FX2.\n");
+	if (size > FIRMWARE_MAX_LEN) {
+		dev_err(dev->class_dev, "firmware binary too large for FX2\n");
 		return -ENOMEM;
 	}
 
 	/* we generate a local buffer for the firmware */
-	fwBuf = kmemdup(firmwareBinary, sizeFirmware, GFP_KERNEL);
-	if (!fwBuf) {
-		dev_err(&usbduxfastsub->interface->dev,
-			"comedi_: mem alloc for firmware failed\n");
+	buf = kmemdup(data, size, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	/* we need a malloc'ed buffer for usb_control_msg() */
+	tmp = kmalloc(1, GFP_KERNEL);
+	if (!tmp) {
+		kfree(buf);
 		return -ENOMEM;
 	}
 
-	ret = usbduxfastsub_stop(usbduxfastsub);
+	/* stop the current firmware on the device */
+	*tmp = 1;	/* 7f92 to one */
+	ret = usb_control_msg(usb, usb_sndctrlpipe(usb, 0),
+			      USBDUXFASTSUB_FIRMWARE,
+			      VENDOR_DIR_OUT,
+			      USBDUXFASTSUB_CPUCS, 0x0000,
+			      tmp, 1,
+			      EZTIMEOUT);
 	if (ret < 0) {
-		dev_err(&usbduxfastsub->interface->dev,
-			"comedi_: can not stop firmware\n");
-		kfree(fwBuf);
-		return ret;
+		dev_err(dev->class_dev, "can not stop firmware\n");
+		goto done;
 	}
 
-	ret = usbduxfastsub_upload(usbduxfastsub, fwBuf, 0, sizeFirmware);
+	/* upload the new firmware to the device */
+	ret = usb_control_msg(usb, usb_sndctrlpipe(usb, 0),
+			      USBDUXFASTSUB_FIRMWARE,
+			      VENDOR_DIR_OUT,
+			      0, 0x0000,
+			      buf, size,
+			      EZTIMEOUT);
 	if (ret < 0) {
-		dev_err(&usbduxfastsub->interface->dev,
-			"comedi_: firmware upload failed\n");
-		kfree(fwBuf);
-		return ret;
-	}
-	ret = usbduxfastsub_start(usbduxfastsub);
-	if (ret < 0) {
-		dev_err(&usbduxfastsub->interface->dev,
-			"comedi_: can not start firmware\n");
-		kfree(fwBuf);
-		return ret;
-	}
-	kfree(fwBuf);
-	return 0;
-}
-
-static void tidy_up(struct usbduxfastsub_s *udfs)
-{
-#ifdef CONFIG_COMEDI_DEBUG
-	printk(KERN_DEBUG "comedi_: usbduxfast: tiding up\n");
-#endif
-
-	if (!udfs)
-		return;
-
-	/* shows the usb subsystem that the driver is down */
-	if (udfs->interface)
-		usb_set_intfdata(udfs->interface, NULL);
-
-	udfs->probed = 0;
-
-	if (udfs->urbIn) {
-		/* waits until a running transfer is over */
-		usb_kill_urb(udfs->urbIn);
-
-		kfree(udfs->transfer_buffer);
-		udfs->transfer_buffer = NULL;
-
-		usb_free_urb(udfs->urbIn);
-		udfs->urbIn = NULL;
+		dev_err(dev->class_dev, "firmware upload failed\n");
+		goto done;
 	}
 
-	kfree(udfs->insnBuffer);
-	udfs->insnBuffer = NULL;
+	/* start the new firmware on the device */
+	*tmp = 0;	/* 7f92 to zero */
+	ret = usb_control_msg(usb, usb_sndctrlpipe(usb, 0),
+			      USBDUXFASTSUB_FIRMWARE,
+			      VENDOR_DIR_OUT,
+			      USBDUXFASTSUB_CPUCS, 0x0000,
+			      tmp, 1,
+			      EZTIMEOUT);
+	if (ret < 0)
+		dev_err(dev->class_dev, "can not start firmware\n");
 
-	kfree(udfs->dux_commands);
-	udfs->dux_commands = NULL;
-
-	udfs->ai_cmd_running = 0;
-}
-
-static int usbduxfast_attach_common(struct comedi_device *dev,
-				    struct usbduxfastsub_s *udfs)
-{
-	int ret;
-	struct comedi_subdevice *s;
-
-	down(&udfs->sem);
-	/* pointer back to the corresponding comedi device */
-	udfs->comedidev = dev;
-
-	ret = comedi_alloc_subdevices(dev, 1);
-	if (ret) {
-		up(&udfs->sem);
-		return ret;
-	}
-	/* private structure is also simply the usb-structure */
-	dev->private = udfs;
-	/* the first subdevice is the A/D converter */
-	s = &dev->subdevices[SUBDEV_AD];
-	/*
-	 * the URBs get the comedi subdevice which is responsible for reading
-	 * this is the subdevice which reads data
-	 */
-	dev->read_subdev = s;
-	/* the subdevice receives as private structure the usb-structure */
-	s->private = NULL;
-	/* analog input */
-	s->type = COMEDI_SUBD_AI;
-	/* readable and ref is to ground */
-	s->subdev_flags = SDF_READABLE | SDF_GROUND | SDF_CMD_READ;
-	/* 16 channels */
-	s->n_chan = 16;
-	/* length of the channellist */
-	s->len_chanlist = 16;
-	/* callback functions */
-	s->insn_read = usbduxfast_ai_insn_read;
-	s->do_cmdtest = usbduxfast_ai_cmdtest;
-	s->do_cmd = usbduxfast_ai_cmd;
-	s->cancel = usbduxfast_ai_cancel;
-	/* max value from the A/D converter (12bit+1 bit for overflow) */
-	s->maxdata = 0x1000;
-	/* range table to convert to physical units */
-	s->range_table = &range_usbduxfast_ai_range;
-	/* finally decide that it's attached */
-	udfs->attached = 1;
-	up(&udfs->sem);
-	dev_info(dev->class_dev, "successfully attached to usbduxfast.\n");
-	return 0;
+done:
+	kfree(tmp);
+	kfree(buf);
+	return ret;
 }
 
 static int usbduxfast_auto_attach(struct comedi_device *dev,
 				  unsigned long context_unused)
 {
-	struct usb_interface *uinterf = comedi_to_usb_interface(dev);
+	struct usb_interface *intf = comedi_to_usb_interface(dev);
+	struct usb_device *usb = comedi_to_usb_dev(dev);
+	struct usbduxfast_private *devpriv;
 	int ret;
-	struct usbduxfastsub_s *udfs;
 
-	dev->private = NULL;
-	down(&start_stop_sem);
-	udfs = usb_get_intfdata(uinterf);
-	if (!udfs || !udfs->probed) {
+	if (usb->speed != USB_SPEED_HIGH) {
 		dev_err(dev->class_dev,
-			"usbduxfast: error: auto_attach failed, not connected\n");
-		ret = -ENODEV;
-	} else if (udfs->attached) {
+			"This driver needs USB 2.0 to operate. Aborting...\n");
+		return -ENODEV;
+	}
+
+	devpriv = kzalloc(sizeof(*devpriv), GFP_KERNEL);
+	if (!devpriv)
+		return -ENOMEM;
+	dev->private = devpriv;
+
+	sema_init(&devpriv->sem, 1);
+	usb_set_intfdata(intf, devpriv);
+
+	devpriv->duxbuf = kmalloc(SIZEOFDUXBUF, GFP_KERNEL);
+	if (!devpriv->duxbuf)
+		return -ENOMEM;
+
+	ret = usb_set_interface(usb,
+				intf->altsetting->desc.bInterfaceNumber, 1);
+	if (ret < 0) {
 		dev_err(dev->class_dev,
-		       "usbduxfast: error: auto_attach failed, already attached\n");
-		ret = -ENODEV;
-	} else
-		ret = usbduxfast_attach_common(dev, udfs);
-	up(&start_stop_sem);
-	return ret;
+			"could not switch to alternate setting 1\n");
+		return -ENODEV;
+	}
+
+	devpriv->urb = usb_alloc_urb(0, GFP_KERNEL);
+	if (!devpriv->urb) {
+		dev_err(dev->class_dev, "Could not alloc. urb\n");
+		return -ENOMEM;
+	}
+
+	devpriv->inbuf = kmalloc(SIZEINBUF, GFP_KERNEL);
+	if (!devpriv->inbuf)
+		return -ENOMEM;
+
+	ret = comedi_load_firmware(dev, &usb->dev, FIRMWARE,
+				   usbduxfast_upload_firmware, 0);
+	if (ret)
+		return ret;
+
+	return usbduxfast_attach_common(dev);
 }
 
 static void usbduxfast_detach(struct comedi_device *dev)
 {
-	struct usbduxfastsub_s *usb = dev->private;
+	struct usb_interface *intf = comedi_to_usb_interface(dev);
+	struct usbduxfast_private *devpriv = dev->private;
 
-	if (usb) {
-		down(&usb->sem);
-		down(&start_stop_sem);
-		dev->private = NULL;
-		usb->attached = 0;
-		usb->comedidev = NULL;
-		up(&start_stop_sem);
-		up(&usb->sem);
+	if (!devpriv)
+		return;
+
+	down(&devpriv->sem);
+
+	usb_set_intfdata(intf, NULL);
+
+	if (devpriv->urb) {
+		/* waits until a running transfer is over */
+		usb_kill_urb(devpriv->urb);
+
+		kfree(devpriv->inbuf);
+		devpriv->inbuf = NULL;
+
+		usb_free_urb(devpriv->urb);
+		devpriv->urb = NULL;
 	}
+
+	kfree(devpriv->duxbuf);
+	devpriv->duxbuf = NULL;
+
+	devpriv->ai_cmd_running = 0;
+
+	up(&devpriv->sem);
 }
 
 static struct comedi_driver usbduxfast_driver = {
@@ -1474,178 +1137,10 @@
 	.detach		= usbduxfast_detach,
 };
 
-static void usbduxfast_firmware_request_complete_handler(const struct firmware
-							 *fw, void *context)
-{
-	struct usbduxfastsub_s *usbduxfastsub_tmp = context;
-	struct usb_interface *uinterf = usbduxfastsub_tmp->interface;
-	int ret;
-
-	if (fw == NULL)
-		return;
-
-	/*
-	 * we need to upload the firmware here because fw will be
-	 * freed once we've left this function
-	 */
-	ret = firmwareUpload(usbduxfastsub_tmp, fw->data, fw->size);
-
-	if (ret) {
-		dev_err(&uinterf->dev,
-			"Could not upload firmware (err=%d)\n", ret);
-		goto out;
-	}
-
-	comedi_usb_auto_config(uinterf, &usbduxfast_driver, 0);
- out:
-	release_firmware(fw);
-}
-
-static int usbduxfast_usb_probe(struct usb_interface *uinterf,
+static int usbduxfast_usb_probe(struct usb_interface *intf,
 				const struct usb_device_id *id)
 {
-	struct usb_device *udev = interface_to_usbdev(uinterf);
-	int i;
-	int index;
-	int ret;
-
-	if (udev->speed != USB_SPEED_HIGH) {
-		dev_err(&uinterf->dev,
-			"This driver needs USB 2.0 to operate. Aborting...\n");
-		return -ENODEV;
-	}
-#ifdef CONFIG_COMEDI_DEBUG
-	printk(KERN_DEBUG "comedi_: usbduxfast_: finding a free structure for "
-	       "the usb-device\n");
-#endif
-	down(&start_stop_sem);
-	/* look for a free place in the usbduxfast array */
-	index = -1;
-	for (i = 0; i < NUMUSBDUXFAST; i++) {
-		if (!usbduxfastsub[i].probed) {
-			index = i;
-			break;
-		}
-	}
-
-	/* no more space */
-	if (index == -1) {
-		dev_err(&uinterf->dev,
-			"Too many usbduxfast-devices connected.\n");
-		up(&start_stop_sem);
-		return -EMFILE;
-	}
-#ifdef CONFIG_COMEDI_DEBUG
-	printk(KERN_DEBUG "comedi_: usbduxfast: usbduxfastsub[%d] is ready to "
-	       "connect to comedi.\n", index);
-#endif
-
-	sema_init(&(usbduxfastsub[index].sem), 1);
-	/* save a pointer to the usb device */
-	usbduxfastsub[index].usbdev = udev;
-
-	/* save the interface itself */
-	usbduxfastsub[index].interface = uinterf;
-	/* get the interface number from the interface */
-	usbduxfastsub[index].ifnum = uinterf->altsetting->desc.bInterfaceNumber;
-	/*
-	 * hand the private data over to the usb subsystem
-	 * will be needed for disconnect
-	 */
-	usb_set_intfdata(uinterf, &(usbduxfastsub[index]));
-
-#ifdef CONFIG_COMEDI_DEBUG
-	printk(KERN_DEBUG "comedi_: usbduxfast: ifnum=%d\n",
-	       usbduxfastsub[index].ifnum);
-#endif
-	/* create space for the commands going to the usb device */
-	usbduxfastsub[index].dux_commands = kmalloc(SIZEOFDUXBUFFER,
-						    GFP_KERNEL);
-	if (!usbduxfastsub[index].dux_commands) {
-		tidy_up(&(usbduxfastsub[index]));
-		up(&start_stop_sem);
-		return -ENOMEM;
-	}
-	/* create space of the instruction buffer */
-	usbduxfastsub[index].insnBuffer = kmalloc(SIZEINSNBUF, GFP_KERNEL);
-	if (!usbduxfastsub[index].insnBuffer) {
-		tidy_up(&(usbduxfastsub[index]));
-		up(&start_stop_sem);
-		return -ENOMEM;
-	}
-	/* setting to alternate setting 1: enabling bulk ep */
-	i = usb_set_interface(usbduxfastsub[index].usbdev,
-			      usbduxfastsub[index].ifnum, 1);
-	if (i < 0) {
-		dev_err(&uinterf->dev,
-			"usbduxfast%d: could not switch to alternate setting 1.\n",
-			index);
-		tidy_up(&(usbduxfastsub[index]));
-		up(&start_stop_sem);
-		return -ENODEV;
-	}
-	usbduxfastsub[index].urbIn = usb_alloc_urb(0, GFP_KERNEL);
-	if (!usbduxfastsub[index].urbIn) {
-		dev_err(&uinterf->dev,
-			"usbduxfast%d: Could not alloc. urb\n", index);
-		tidy_up(&(usbduxfastsub[index]));
-		up(&start_stop_sem);
-		return -ENOMEM;
-	}
-	usbduxfastsub[index].transfer_buffer = kmalloc(SIZEINBUF, GFP_KERNEL);
-	if (!usbduxfastsub[index].transfer_buffer) {
-		tidy_up(&(usbduxfastsub[index]));
-		up(&start_stop_sem);
-		return -ENOMEM;
-	}
-	/* we've reached the bottom of the function */
-	usbduxfastsub[index].probed = 1;
-	up(&start_stop_sem);
-
-	ret = request_firmware_nowait(THIS_MODULE,
-				      FW_ACTION_HOTPLUG,
-				      FIRMWARE,
-				      &udev->dev,
-				      GFP_KERNEL,
-				      usbduxfastsub + index,
-				      usbduxfast_firmware_request_complete_handler);
-
-	if (ret) {
-		dev_err(&uinterf->dev, "could not load firmware (err=%d)\n", ret);
-		return ret;
-	}
-
-	dev_info(&uinterf->dev,
-		 "usbduxfast%d has been successfully initialized.\n", index);
-	/* success */
-	return 0;
-}
-
-static void usbduxfast_usb_disconnect(struct usb_interface *intf)
-{
-	struct usbduxfastsub_s *udfs = usb_get_intfdata(intf);
-	struct usb_device *udev = interface_to_usbdev(intf);
-
-	if (!udfs) {
-		dev_err(&intf->dev, "disconnect called with null pointer.\n");
-		return;
-	}
-	if (udfs->usbdev != udev) {
-		dev_err(&intf->dev, "BUG! called with wrong ptr!!!\n");
-		return;
-	}
-
-	comedi_usb_auto_unconfig(intf);
-
-	down(&start_stop_sem);
-	down(&udfs->sem);
-	tidy_up(udfs);
-	up(&udfs->sem);
-	up(&start_stop_sem);
-
-#ifdef CONFIG_COMEDI_DEBUG
-	printk(KERN_DEBUG "comedi_: usbduxfast: disconnected from the usb\n");
-#endif
+	return comedi_usb_auto_config(intf, &usbduxfast_driver, 0);
 }
 
 static const struct usb_device_id usbduxfast_usb_table[] = {
@@ -1657,12 +1152,9 @@
 MODULE_DEVICE_TABLE(usb, usbduxfast_usb_table);
 
 static struct usb_driver usbduxfast_usb_driver = {
-#ifdef COMEDI_HAVE_USB_DRIVER_OWNER
-	.owner		= THIS_MODULE,
-#endif
 	.name		= "usbduxfast",
 	.probe		= usbduxfast_usb_probe,
-	.disconnect	= usbduxfast_usb_disconnect,
+	.disconnect	= comedi_usb_auto_unconfig,
 	.id_table	= usbduxfast_usb_table,
 };
 module_comedi_usb_driver(usbduxfast_driver, usbduxfast_usb_driver);

diff --git a/drivers/staging/comedi/drivers/usbduxsigma.c b/drivers/staging/comedi/drivers/usbduxsigma.c
index d3bc1b9..898c3c4 100644
--- a/drivers/staging/comedi/drivers/usbduxsigma.c
+++ b/drivers/staging/comedi/drivers/usbduxsigma.c

@@ -1,30 +1,27 @@
 /*
-   comedi/drivers/usbdux.c
-   Copyright (C) 2011 Bernd Porr, Bernd.Porr@f2s.com
-
-   This program is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published by
-   the Free Software Foundation; either version 2 of the License, or
-   (at your option) any later version.
-
-   This program is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-   GNU General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
+ * usbduxsigma.c
+ * Copyright (C) 2011 Bernd Porr, Bernd.Porr@f2s.com
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
  */
+
 /*
-Driver: usbduxsigma
-Description: University of Stirling USB DAQ & INCITE Technology Limited
-Devices: [ITL] USB-DUX (usbduxsigma.o)
-Author: Bernd Porr <BerndPorr@f2s.com>
-Updated: 8 Nov 2011
-Status: testing
-*/
+ * Driver: usbduxsigma
+ * Description: University of Stirling USB DAQ & INCITE Technology Limited
+ * Devices: (ITL) USB-DUX [usbduxsigma]
+ * Author: Bernd Porr <BerndPorr@f2s.com>
+ * Updated: 8 Nov 2011
+ * Status: testing
+ */
+
 /*
  * I must give credit here to Chris Baugher who
  * wrote the driver for AT-MIO-16d. I used some parts of this
@@ -44,9 +41,6 @@
  *   0.6: corrected wrong input range
  */
 
-/* generates loads of debug info */
-/* #define NOISY_DUX_DEBUGBUG */
-
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/init.h>
@@ -55,7 +49,7 @@
 #include <linux/usb.h>
 #include <linux/fcntl.h>
 #include <linux/compiler.h>
-#include <linux/firmware.h>
+
 #include "comedi_fc.h"
 #include "../comedidev.h"
 
@@ -63,38 +57,21 @@
 #define BULK_TIMEOUT 1000
 
 /* constants for "firmware" upload and download */
-#define FIRMWARE "usbduxsigma_firmware.bin"
-#define USBDUXSUB_FIRMWARE 0xA0
-#define VENDOR_DIR_IN  0xC0
-#define VENDOR_DIR_OUT 0x40
+#define FIRMWARE		"usbduxsigma_firmware.bin"
+#define FIRMWARE_MAX_LEN	0x4000
+#define USBDUXSUB_FIRMWARE	0xa0
+#define VENDOR_DIR_IN		0xc0
+#define VENDOR_DIR_OUT		0x40
 
 /* internal addresses of the 8051 processor */
 #define USBDUXSUB_CPUCS 0xE600
 
-/*
- * the minor device number, major is 180 only for debugging purposes and to
- * upload special firmware (programming the eeprom etc) which is not
- * compatible with the comedi framwork
- */
-#define USBDUXSUB_MINOR 32
-
-/* max lenghth of the transfer-buffer for software upload */
-#define TB_LEN 0x2000
-
-/* Input endpoint number: ISO/IRQ */
-#define ISOINEP           6
-
-/* Output endpoint number: ISO/IRQ */
-#define ISOOUTEP          2
-
-/* This EP sends DUX commands to USBDUX */
-#define COMMAND_OUT_EP     1
-
-/* This EP receives the DUX commands from USBDUX */
-#define COMMAND_IN_EP        8
-
-/* Output endpoint for PWM */
-#define PWM_EP         4
+/* USB endpoints */
+#define USBDUXSIGMA_CMD_OUT_EP		1	/* command output */
+#define USBDUXSIGMA_ISO_OUT_EP		2	/* analog output ISO/IRQ */
+#define USBDUXSIGMA_PWM_OUT_EP		4	/* pwm output */
+#define USBDUXSIGMA_ISO_IN_EP		6	/* analog input ISO/IRQ */
+#define USBDUXSIGMA_CMD_IN_EP		8	/* command input */
 
 /* 300Hz max frequ under PWM */
 #define MIN_PWM_PERIOD  ((long)(1E9/300))
@@ -105,6 +82,8 @@
 /* Number of channels (16 AD and offset)*/
 #define NUMCHANNELS 16
 
+#define USBDUXSIGMA_NUM_AO_CHAN		4
+
 /* Size of one A/D value */
 #define SIZEADIN          ((sizeof(int32_t)))
 
@@ -150,84 +129,54 @@
 /* must have more buffers due to buggy USB ctr */
 #define NUMOFOUTBUFFERSHIGH    10
 
-/* Total number of usbdux devices */
-#define NUMUSBDUX             16
-
-/* Analogue in subdevice */
-#define SUBDEV_AD             0
-
-/* Analogue out subdevice */
-#define SUBDEV_DA             1
-
-/* Digital I/O */
-#define SUBDEV_DIO            2
-
-/* timer aka pwm output */
-#define SUBDEV_PWM            3
-
 /* number of retries to get the right dux command */
 #define RETRIES 10
 
-/**************************************************/
-/* comedi constants */
-static const struct comedi_lrange range_usbdux_ai_range = { 1, {
-								BIP_RANGE
-								(2.65/2.0)
-								}
+/* bulk transfer commands to usbduxsigma */
+#define USBBUXSIGMA_AD_CMD		0
+#define USBDUXSIGMA_DA_CMD		1
+#define USBDUXSIGMA_DIO_CFG_CMD		2
+#define USBDUXSIGMA_DIO_BITS_CMD	3
+#define USBDUXSIGMA_SINGLE_AD_CMD	4
+#define USBDUXSIGMA_PWM_ON_CMD		7
+#define USBDUXSIGMA_PWM_OFF_CMD		8
+
+static const struct comedi_lrange usbduxsigma_ai_range = {
+	1, {
+		BIP_RANGE(2.65 / 2.0)
+	}
 };
 
-/*
- * private structure of one subdevice
- */
-
-/*
- * This is the structure which holds all the data of
- * this driver one sub device just now: A/D
- */
-struct usbduxsub {
-	/* attached? */
-	int attached;
-	/* is it associated with a subdevice? */
-	int probed;
-	/* pointer to the usb-device */
-	struct usb_device *usbdev;
+struct usbduxsigma_private {
 	/* actual number of in-buffers */
-	int numOfInBuffers;
+	int n_ai_urbs;
 	/* actual number of out-buffers */
-	int numOfOutBuffers;
+	int n_ao_urbs;
 	/* ISO-transfer handling: buffers */
-	struct urb **urbIn;
-	struct urb **urbOut;
+	struct urb **ai_urbs;
+	struct urb **ao_urbs;
 	/* pwm-transfer handling */
-	struct urb *urbPwm;
+	struct urb *pwm_urb;
 	/* PWM period */
-	unsigned int pwmPeriod;
+	unsigned int pwm_period;
 	/* PWM internal delay for the GPIF in the FX2 */
-	uint8_t pwmDelay;
+	uint8_t pwm_delay;
 	/* size of the PWM buffer which holds the bit pattern */
-	int sizePwmBuf;
+	int pwm_buf_sz;
 	/* input buffer for the ISO-transfer */
-	int32_t *inBuffer;
+	int32_t *in_buf;
 	/* input buffer for single insn */
-	int8_t *insnBuffer;
-	/* output buffer for single DA outputs */
-	int16_t *outBuffer;
-	/* interface number */
-	int ifnum;
-	/* interface structure in 2.6 */
-	struct usb_interface *interface;
-	/* comedi device for the interrupt context */
-	struct comedi_device *comedidev;
-	/* is it USB_SPEED_HIGH or not? */
-	short int high_speed;
-	/* asynchronous command is running */
-	short int ai_cmd_running;
-	short int ao_cmd_running;
-	/* pwm is running */
-	short int pwm_cmd_running;
-	/* continuous acquisition */
-	short int ai_continuous;
-	short int ao_continuous;
+	int8_t *insn_buf;
+
+	unsigned int ao_readback[USBDUXSIGMA_NUM_AO_CHAN];
+
+	unsigned high_speed:1;
+	unsigned ai_cmd_running:1;
+	unsigned ai_continuous:1;
+	unsigned ao_cmd_running:1;
+	unsigned ao_continuous:1;
+	unsigned pwm_cmd_running:1;
+
 	/* number of samples to acquire */
 	int ai_sample_count;
 	int ao_sample_count;
@@ -246,126 +195,58 @@
 	struct semaphore sem;
 };
 
-/*
- * The pointer to the private usb-data of the driver is also the private data
- * for the comedi-device.  This has to be global as the usb subsystem needs
- * global variables. The other reason is that this structure must be there
- * _before_ any comedi command is issued. The usb subsystem must be initialised
- * before comedi can access it.
- */
-static struct usbduxsub usbduxsub[NUMUSBDUX];
-
-static DEFINE_SEMAPHORE(start_stop_sem);
-
-/*
- * Stops the data acquision
- * It should be safe to call this function from any context
- */
-static int usbduxsub_unlink_InURBs(struct usbduxsub *usbduxsub_tmp)
+static void usbduxsigma_ai_stop(struct comedi_device *dev, int do_unlink)
 {
-	int i = 0;
-	int err = 0;
-
-	if (usbduxsub_tmp && usbduxsub_tmp->urbIn) {
-		for (i = 0; i < usbduxsub_tmp->numOfInBuffers; i++) {
-			if (usbduxsub_tmp->urbIn[i]) {
-				/* We wait here until all transfers have been
-				 * cancelled. */
-				usb_kill_urb(usbduxsub_tmp->urbIn[i]);
-			}
-			dev_dbg(&usbduxsub_tmp->interface->dev,
-				"comedi: usbdux: unlinked InURB %d, err=%d\n",
-				i, err);
-		}
-	}
-	return err;
-}
-
-/*
- * This will stop a running acquisition operation
- * Is called from within this driver from both the
- * interrupt context and from comedi
- */
-static int usbdux_ai_stop(struct usbduxsub *this_usbduxsub, int do_unlink)
-{
-	int ret = 0;
-
-	if (!this_usbduxsub) {
-		pr_err("comedi?: usbdux_ai_stop: this_usbduxsub=NULL!\n");
-		return -EFAULT;
-	}
-	dev_dbg(&this_usbduxsub->interface->dev, "comedi: usbdux_ai_stop\n");
+	struct usbduxsigma_private *devpriv = dev->private;
 
 	if (do_unlink) {
-		/* stop aquistion */
-		ret = usbduxsub_unlink_InURBs(this_usbduxsub);
+		int i;
+
+		for (i = 0; i < devpriv->n_ai_urbs; i++) {
+			if (devpriv->ai_urbs[i])
+				usb_kill_urb(devpriv->ai_urbs[i]);
+		}
 	}
 
-	this_usbduxsub->ai_cmd_running = 0;
-
-	return ret;
+	devpriv->ai_cmd_running = 0;
 }
 
-/*
- * This will cancel a running acquisition operation.
- * This is called by comedi but never from inside the driver.
- */
-static int usbdux_ai_cancel(struct comedi_device *dev,
-			    struct comedi_subdevice *s)
+static int usbduxsigma_ai_cancel(struct comedi_device *dev,
+				 struct comedi_subdevice *s)
 {
-	struct usbduxsub *this_usbduxsub;
-	int res = 0;
+	struct usbduxsigma_private *devpriv = dev->private;
 
-	/* force unlink of all urbs */
-	this_usbduxsub = dev->private;
-	if (!this_usbduxsub)
-		return -EFAULT;
+	down(&devpriv->sem);
+	/* unlink only if it is really running */
+	usbduxsigma_ai_stop(dev, devpriv->ai_cmd_running);
+	up(&devpriv->sem);
 
-	dev_dbg(&this_usbduxsub->interface->dev, "comedi: usbdux_ai_cancel\n");
-
-	/* prevent other CPUs from submitting new commands just now */
-	down(&this_usbduxsub->sem);
-	if (!(this_usbduxsub->probed)) {
-		up(&this_usbduxsub->sem);
-		return -ENODEV;
-	}
-	/* unlink only if the urb really has been submitted */
-	res = usbdux_ai_stop(this_usbduxsub, this_usbduxsub->ai_cmd_running);
-	up(&this_usbduxsub->sem);
-	return res;
+	return 0;
 }
 
-/* analogue IN - interrupt service routine */
-static void usbduxsub_ai_IsocIrq(struct urb *urb)
+static void usbduxsigma_ai_urb_complete(struct urb *urb)
 {
-	int i, err, n;
-	struct usbduxsub *this_usbduxsub;
-	struct comedi_device *this_comedidev;
-	struct comedi_subdevice *s;
-	int32_t v;
+	struct comedi_device *dev = urb->context;
+	struct usbduxsigma_private *devpriv = dev->private;
+	struct comedi_subdevice *s = dev->read_subdev;
 	unsigned int dio_state;
-
-	/* the context variable points to the comedi device */
-	this_comedidev = urb->context;
-	/* the private structure of the subdevice is struct usbduxsub */
-	this_usbduxsub = this_comedidev->private;
-	/* subdevice which is the AD converter */
-	s = &this_comedidev->subdevices[SUBDEV_AD];
+	int32_t val;
+	int ret;
+	int i;
 
 	/* first we test if something unusual has just happened */
 	switch (urb->status) {
 	case 0:
 		/* copy the result in the transfer buffer */
-		memcpy(this_usbduxsub->inBuffer,
-		       urb->transfer_buffer, SIZEINBUF);
+		memcpy(devpriv->in_buf, urb->transfer_buffer, SIZEINBUF);
 		break;
 	case -EILSEQ:
-		/* error in the ISOchronous data */
-		/* we don't copy the data into the transfer buffer */
-		/* and recycle the last data byte */
-		dev_dbg(&urb->dev->dev,
-			"comedi%d: usbdux: CRC error in ISO IN stream.\n",
-			this_usbduxsub->comedidev->minor);
+		/*
+		 * error in the ISOchronous data
+		 * we don't copy the data into the transfer buffer
+		 * and recycle the last data byte
+		 */
+		dev_dbg(dev->class_dev, "CRC error in ISO IN stream\n");
 
 		break;
 
@@ -374,185 +255,127 @@
 	case -ESHUTDOWN:
 	case -ECONNABORTED:
 		/* happens after an unlink command */
-		if (this_usbduxsub->ai_cmd_running) {
-			/* we are still running a command */
-			/* tell this comedi */
-			s->async->events |= COMEDI_CB_EOA;
-			s->async->events |= COMEDI_CB_ERROR;
-			comedi_event(this_usbduxsub->comedidev, s);
-			/* stop the transfer w/o unlink */
-			usbdux_ai_stop(this_usbduxsub, 0);
+		if (devpriv->ai_cmd_running) {
+			usbduxsigma_ai_stop(dev, 0);	/* w/o unlink */
+			/* we are still running a command, tell comedi */
+			s->async->events |= (COMEDI_CB_EOA | COMEDI_CB_ERROR);
+			comedi_event(dev, s);
 		}
 		return;
 
 	default:
-		/* a real error on the bus */
-		/* pass error to comedi if we are really running a command */
-		if (this_usbduxsub->ai_cmd_running) {
-			dev_err(&urb->dev->dev,
-				"Non-zero urb status received in ai intr "
-				"context: %d\n", urb->status);
-			s->async->events |= COMEDI_CB_EOA;
-			s->async->events |= COMEDI_CB_ERROR;
-			comedi_event(this_usbduxsub->comedidev, s);
-			/* don't do an unlink here */
-			usbdux_ai_stop(this_usbduxsub, 0);
+		/*
+		 * a real error on the bus
+		 * pass error to comedi if we are really running a command
+		 */
+		if (devpriv->ai_cmd_running) {
+			dev_err(dev->class_dev,
+				"%s: non-zero urb status (%d)\n",
+				__func__, urb->status);
+			usbduxsigma_ai_stop(dev, 0);	/* w/o unlink */
+			s->async->events |= (COMEDI_CB_EOA | COMEDI_CB_ERROR);
+			comedi_event(dev, s);
 		}
 		return;
 	}
 
-	/*
-	 * at this point we are reasonably sure that nothing dodgy has happened
-	 * are we running a command?
-	 */
-	if (unlikely((!(this_usbduxsub->ai_cmd_running)))) {
-		/*
-		 * not running a command, do not continue execution if no
-		 * asynchronous command is running in particular not resubmit
-		 */
+	if (unlikely(!devpriv->ai_cmd_running))
 		return;
-	}
 
-	urb->dev = this_usbduxsub->usbdev;
+	urb->dev = comedi_to_usb_dev(dev);
 
-	/* resubmit the urb */
-	err = usb_submit_urb(urb, GFP_ATOMIC);
-	if (unlikely(err < 0)) {
-		dev_err(&urb->dev->dev,
-			"comedi_: urb resubmit failed in int-context!"
-			"err=%d\n",
-			err);
-		if (err == -EL2NSYNC)
-			dev_err(&urb->dev->dev,
-				"buggy USB host controller or bug in IRQ "
-				"handler!\n");
-		s->async->events |= COMEDI_CB_EOA;
-		s->async->events |= COMEDI_CB_ERROR;
-		comedi_event(this_usbduxsub->comedidev, s);
-		/* don't do an unlink here */
-		usbdux_ai_stop(this_usbduxsub, 0);
+	ret = usb_submit_urb(urb, GFP_ATOMIC);
+	if (unlikely(ret < 0)) {
+		dev_err(dev->class_dev, "%s: urb resubmit failed (%d)\n",
+			__func__, ret);
+		if (ret == -EL2NSYNC)
+			dev_err(dev->class_dev,
+				"buggy USB host controller or bug in IRQ handler\n");
+		usbduxsigma_ai_stop(dev, 0);	/* w/o unlink */
+		s->async->events |= (COMEDI_CB_EOA | COMEDI_CB_ERROR);
+		comedi_event(dev, s);
 		return;
 	}
 
 	/* get the state of the dio pins to allow external trigger */
-	dio_state = be32_to_cpu(this_usbduxsub->inBuffer[0]);
+	dio_state = be32_to_cpu(devpriv->in_buf[0]);
 
-	this_usbduxsub->ai_counter--;
-	if (likely(this_usbduxsub->ai_counter > 0))
+	devpriv->ai_counter--;
+	if (likely(devpriv->ai_counter > 0))
 		return;
 
 	/* timer zero, transfer measurements to comedi */
-	this_usbduxsub->ai_counter = this_usbduxsub->ai_timer;
+	devpriv->ai_counter = devpriv->ai_timer;
 
-	/* test, if we transmit only a fixed number of samples */
-	if (!(this_usbduxsub->ai_continuous)) {
+	if (!devpriv->ai_continuous) {
 		/* not continuous, fixed number of samples */
-		this_usbduxsub->ai_sample_count--;
-		/* all samples received? */
-		if (this_usbduxsub->ai_sample_count < 0) {
-			/* prevent a resubmit next time */
-			usbdux_ai_stop(this_usbduxsub, 0);
-			/* say comedi that the acquistion is over */
+		devpriv->ai_sample_count--;
+		if (devpriv->ai_sample_count < 0) {
+			usbduxsigma_ai_stop(dev, 0);	/* w/o unlink */
+			/* acquisition is over, tell comedi */
 			s->async->events |= COMEDI_CB_EOA;
-			comedi_event(this_usbduxsub->comedidev, s);
+			comedi_event(dev, s);
 			return;
 		}
 	}
+
 	/* get the data from the USB bus and hand it over to comedi */
-	n = s->async->cmd.chanlist_len;
-	for (i = 0; i < n; i++) {
+	for (i = 0; i < s->async->cmd.chanlist_len; i++) {
 		/* transfer data, note first byte is the DIO state */
-		v = be32_to_cpu(this_usbduxsub->inBuffer[i+1]);
-		/* strip status byte */
-		v = v & 0x00ffffff;
-		/* convert to unsigned */
-		v = v ^ 0x00800000;
-		/* write the byte to the buffer */
-		err = cfc_write_array_to_buffer(s, &v, sizeof(uint32_t));
-		if (unlikely(err == 0)) {
+		val = be32_to_cpu(devpriv->in_buf[i+1]);
+		val &= 0x00ffffff;	/* strip status byte */
+		val ^= 0x00800000;	/* convert to unsigned */
+
+		ret = cfc_write_array_to_buffer(s, &val, sizeof(uint32_t));
+		if (unlikely(ret == 0)) {
 			/* buffer overflow */
-			usbdux_ai_stop(this_usbduxsub, 0);
+			usbduxsigma_ai_stop(dev, 0);	/* w/o unlink */
 			return;
 		}
 	}
 	/* tell comedi that data is there */
-	s->async->events |= COMEDI_CB_BLOCK | COMEDI_CB_EOS;
-	comedi_event(this_usbduxsub->comedidev, s);
+	s->async->events |= (COMEDI_CB_BLOCK | COMEDI_CB_EOS);
+	comedi_event(dev, s);
 }
 
-static int usbduxsub_unlink_OutURBs(struct usbduxsub *usbduxsub_tmp)
+static void usbduxsigma_ao_stop(struct comedi_device *dev, int do_unlink)
 {
-	int i = 0;
-	int err = 0;
+	struct usbduxsigma_private *devpriv = dev->private;
 
-	if (usbduxsub_tmp && usbduxsub_tmp->urbOut) {
-		for (i = 0; i < usbduxsub_tmp->numOfOutBuffers; i++) {
-			if (usbduxsub_tmp->urbOut[i])
-				usb_kill_urb(usbduxsub_tmp->urbOut[i]);
+	if (do_unlink) {
+		int i;
 
-			dev_dbg(&usbduxsub_tmp->interface->dev,
-				"comedi: usbdux: unlinked OutURB %d: res=%d\n",
-				i, err);
+		for (i = 0; i < devpriv->n_ao_urbs; i++) {
+			if (devpriv->ao_urbs[i])
+				usb_kill_urb(devpriv->ao_urbs[i]);
 		}
 	}
-	return err;
+
+	devpriv->ao_cmd_running = 0;
 }
 
-/* This will cancel a running acquisition operation
- * in any context.
- */
-static int usbdux_ao_stop(struct usbduxsub *this_usbduxsub, int do_unlink)
+static int usbduxsigma_ao_cancel(struct comedi_device *dev,
+				 struct comedi_subdevice *s)
 {
-	int ret = 0;
+	struct usbduxsigma_private *devpriv = dev->private;
 
-	if (!this_usbduxsub)
-		return -EFAULT;
-	dev_dbg(&this_usbduxsub->interface->dev, "comedi: usbdux_ao_cancel\n");
-
-	if (do_unlink)
-		ret = usbduxsub_unlink_OutURBs(this_usbduxsub);
-
-	this_usbduxsub->ao_cmd_running = 0;
-
-	return ret;
-}
-
-/* force unlink, is called by comedi */
-static int usbdux_ao_cancel(struct comedi_device *dev,
-			    struct comedi_subdevice *s)
-{
-	struct usbduxsub *this_usbduxsub = dev->private;
-	int res = 0;
-
-	if (!this_usbduxsub)
-		return -EFAULT;
-
-	/* prevent other CPUs from submitting a command just now */
-	down(&this_usbduxsub->sem);
-	if (!(this_usbduxsub->probed)) {
-		up(&this_usbduxsub->sem);
-		return -ENODEV;
-	}
+	down(&devpriv->sem);
 	/* unlink only if it is really running */
-	res = usbdux_ao_stop(this_usbduxsub, this_usbduxsub->ao_cmd_running);
-	up(&this_usbduxsub->sem);
-	return res;
+	usbduxsigma_ao_stop(dev, devpriv->ao_cmd_running);
+	up(&devpriv->sem);
+
+	return 0;
 }
 
-static void usbduxsub_ao_IsocIrq(struct urb *urb)
+static void usbduxsigma_ao_urb_complete(struct urb *urb)
 {
-	int i, ret;
+	struct comedi_device *dev = urb->context;
+	struct usbduxsigma_private *devpriv = dev->private;
+	struct comedi_subdevice *s = dev->write_subdev;
 	uint8_t *datap;
-	struct usbduxsub *this_usbduxsub;
-	struct comedi_device *this_comedidev;
-	struct comedi_subdevice *s;
-
-	/* the context variable points to the subdevice */
-	this_comedidev = urb->context;
-	/* the private structure of the subdevice is struct usbduxsub */
-	this_usbduxsub = this_comedidev->private;
-
-	s = &this_comedidev->subdevices[SUBDEV_DA];
+	int len;
+	int ret;
+	int i;
 
 	switch (urb->status) {
 	case 0:
@@ -563,347 +386,141 @@
 	case -ENOENT:
 	case -ESHUTDOWN:
 	case -ECONNABORTED:
-		/* after an unlink command, unplug, ... etc */
-		/* no unlink needed here. Already shutting down. */
-		if (this_usbduxsub->ao_cmd_running) {
+		/* happens after an unlink command */
+		if (devpriv->ao_cmd_running) {
+			usbduxsigma_ao_stop(dev, 0);	/* w/o unlink */
 			s->async->events |= COMEDI_CB_EOA;
-			comedi_event(this_usbduxsub->comedidev, s);
-			usbdux_ao_stop(this_usbduxsub, 0);
+			comedi_event(dev, s);
 		}
 		return;
 
 	default:
 		/* a real error */
-		if (this_usbduxsub->ao_cmd_running) {
-			dev_err(&urb->dev->dev,
-				"comedi_: Non-zero urb status received in ao "
-				"intr context: %d\n", urb->status);
-			s->async->events |= COMEDI_CB_ERROR;
-			s->async->events |= COMEDI_CB_EOA;
-			comedi_event(this_usbduxsub->comedidev, s);
-			/* we do an unlink if we are in the high speed mode */
-			usbdux_ao_stop(this_usbduxsub, 0);
+		if (devpriv->ao_cmd_running) {
+			dev_err(dev->class_dev,
+				"%s: non-zero urb status (%d)\n",
+				__func__, urb->status);
+			usbduxsigma_ao_stop(dev, 0);	/* w/o unlink */
+			s->async->events |= (COMEDI_CB_ERROR | COMEDI_CB_EOA);
+			comedi_event(dev, s);
 		}
 		return;
 	}
 
-	/* are we actually running? */
-	if (!(this_usbduxsub->ao_cmd_running))
+	if (!devpriv->ao_cmd_running)
 		return;
 
-	/* normal operation: executing a command in this subdevice */
-	this_usbduxsub->ao_counter--;
-	if ((int)this_usbduxsub->ao_counter <= 0) {
-		/* timer zero */
-		this_usbduxsub->ao_counter = this_usbduxsub->ao_timer;
+	devpriv->ao_counter--;
+	if ((int)devpriv->ao_counter <= 0) {
+		/* timer zero, transfer from comedi */
+		devpriv->ao_counter = devpriv->ao_timer;
 
-		/* handle non continuous acquisition */
-		if (!(this_usbduxsub->ao_continuous)) {
-			/* fixed number of samples */
-			this_usbduxsub->ao_sample_count--;
-			if (this_usbduxsub->ao_sample_count < 0) {
-				/* all samples transmitted */
-				usbdux_ao_stop(this_usbduxsub, 0);
+		if (!devpriv->ao_continuous) {
+			/* not continuous, fixed number of samples */
+			devpriv->ao_sample_count--;
+			if (devpriv->ao_sample_count < 0) {
+				usbduxsigma_ao_stop(dev, 0);	/* w/o unlink */
+				/* acquisition is over, tell comedi */
 				s->async->events |= COMEDI_CB_EOA;
-				comedi_event(this_usbduxsub->comedidev, s);
-				/* no resubmit of the urb */
+				comedi_event(dev, s);
 				return;
 			}
 		}
+
 		/* transmit data to the USB bus */
-		((uint8_t *) (urb->transfer_buffer))[0] =
-		    s->async->cmd.chanlist_len;
-		for (i = 0; i < s->async->cmd.chanlist_len; i++) {
-			short temp;
-			if (i >= NUMOUTCHANNELS)
-				break;
+		datap = urb->transfer_buffer;
+		len = s->async->cmd.chanlist_len;
+		*datap++ = len;
+		for (i = 0; i < len; i++) {
+			unsigned int chan = devpriv->dac_commands[i];
+			short val;
 
-			/* pointer to the DA */
-			datap =
-			    (&(((uint8_t *) urb->transfer_buffer)[i * 2 + 1]));
-			/* get the data from comedi */
-			ret = comedi_buf_get(s->async, &temp);
-			datap[0] = temp;
-			datap[1] = this_usbduxsub->dac_commands[i];
-			/* printk("data[0]=%x, data[1]=%x, data[2]=%x\n", */
-			/* datap[0],datap[1],datap[2]); */
+			ret = comedi_buf_get(s->async, &val);
 			if (ret < 0) {
-				dev_err(&urb->dev->dev,
-					"comedi: buffer underflow\n");
-				s->async->events |= COMEDI_CB_EOA;
-				s->async->events |= COMEDI_CB_OVERFLOW;
+				dev_err(dev->class_dev, "buffer underflow\n");
+				s->async->events |= (COMEDI_CB_EOA |
+						     COMEDI_CB_OVERFLOW);
 			}
-			/* transmit data to comedi */
+			*datap++ = val;
+			*datap++ = chan;
+			devpriv->ao_readback[chan] = val;
+
 			s->async->events |= COMEDI_CB_BLOCK;
-			comedi_event(this_usbduxsub->comedidev, s);
+			comedi_event(dev, s);
 		}
 	}
+
 	urb->transfer_buffer_length = SIZEOUTBUF;
-	urb->dev = this_usbduxsub->usbdev;
+	urb->dev = comedi_to_usb_dev(dev);
 	urb->status = 0;
-	if (this_usbduxsub->ao_cmd_running) {
-		if (this_usbduxsub->high_speed) {
-			/* uframes */
-			urb->interval = 8;
-		} else {
-			/* frames */
-			urb->interval = 1;
-		}
-		urb->number_of_packets = 1;
-		urb->iso_frame_desc[0].offset = 0;
-		urb->iso_frame_desc[0].length = SIZEOUTBUF;
-		urb->iso_frame_desc[0].status = 0;
-		ret = usb_submit_urb(urb, GFP_ATOMIC);
-		if (ret < 0) {
-			dev_err(&urb->dev->dev,
-				"comedi_: ao urb resubm failed in int-cont. "
-				"ret=%d", ret);
-			if (ret == EL2NSYNC)
-				dev_err(&urb->dev->dev,
-					"buggy USB host controller or bug in "
-					"IRQ handling!\n");
-
-			s->async->events |= COMEDI_CB_EOA;
-			s->async->events |= COMEDI_CB_ERROR;
-			comedi_event(this_usbduxsub->comedidev, s);
-			/* don't do an unlink here */
-			usbdux_ao_stop(this_usbduxsub, 0);
-		}
+	if (devpriv->high_speed)
+		urb->interval = 8;	/* uframes */
+	else
+		urb->interval = 1;	/* frames */
+	urb->number_of_packets = 1;
+	urb->iso_frame_desc[0].offset = 0;
+	urb->iso_frame_desc[0].length = SIZEOUTBUF;
+	urb->iso_frame_desc[0].status = 0;
+	ret = usb_submit_urb(urb, GFP_ATOMIC);
+	if (ret < 0) {
+		dev_err(dev->class_dev,
+			"%s: urb resubmit failed (%d)\n",
+			__func__, ret);
+		if (ret == EL2NSYNC)
+			dev_err(dev->class_dev,
+				"buggy USB host controller or bug in IRQ handler\n");
+		usbduxsigma_ao_stop(dev, 0);	/* w/o unlink */
+		s->async->events |= (COMEDI_CB_EOA | COMEDI_CB_ERROR);
+		comedi_event(dev, s);
 	}
 }
 
-static int usbduxsub_start(struct usbduxsub *usbduxsub)
+static int usbduxsigma_submit_urbs(struct comedi_device *dev,
+				   struct urb **urbs, int num_urbs,
+				   int input_urb)
 {
-	int errcode = 0;
-	uint8_t *local_transfer_buffer;
-
-	local_transfer_buffer = kmalloc(16, GFP_KERNEL);
-	if (!local_transfer_buffer)
-		return -ENOMEM;
-
-	/* 7f92 to zero */
-	local_transfer_buffer[0] = 0;
-	errcode = usb_control_msg(usbduxsub->usbdev,
-				  /* create a pipe for a control transfer */
-				  usb_sndctrlpipe(usbduxsub->usbdev, 0),
-				  /* bRequest, "Firmware" */
-				  USBDUXSUB_FIRMWARE,
-				  /* bmRequestType */
-				  VENDOR_DIR_OUT,
-				  /* Value */
-				  USBDUXSUB_CPUCS,
-				  /* Index */
-				  0x0000,
-				  /* address of the transfer buffer */
-				  local_transfer_buffer,
-				  /* Length */
-				  1,
-				  /* Timeout */
-				  BULK_TIMEOUT);
-	if (errcode < 0)
-		dev_err(&usbduxsub->interface->dev,
-			"comedi_: control msg failed (start)\n");
-
-	kfree(local_transfer_buffer);
-	return errcode;
-}
-
-static int usbduxsub_stop(struct usbduxsub *usbduxsub)
-{
-	int errcode = 0;
-	uint8_t *local_transfer_buffer;
-
-	local_transfer_buffer = kmalloc(16, GFP_KERNEL);
-	if (!local_transfer_buffer)
-		return -ENOMEM;
-
-	/* 7f92 to one */
-	local_transfer_buffer[0] = 1;
-	errcode = usb_control_msg(usbduxsub->usbdev,
-				  usb_sndctrlpipe(usbduxsub->usbdev, 0),
-				  /* bRequest, "Firmware" */
-				  USBDUXSUB_FIRMWARE,
-				  /* bmRequestType */
-				  VENDOR_DIR_OUT,
-				  /* Value */
-				  USBDUXSUB_CPUCS,
-				  /* Index */
-				  0x0000, local_transfer_buffer,
-				  /* Length */
-				  1,
-				  /* Timeout */
-				  BULK_TIMEOUT);
-	if (errcode < 0)
-		dev_err(&usbduxsub->interface->dev,
-			"comedi_: control msg failed (stop)\n");
-
-	kfree(local_transfer_buffer);
-	return errcode;
-}
-
-static int usbduxsub_upload(struct usbduxsub *usbduxsub,
-			    uint8_t *local_transfer_buffer,
-			    unsigned int startAddr, unsigned int len)
-{
-	int errcode;
-
-	errcode = usb_control_msg(usbduxsub->usbdev,
-				  usb_sndctrlpipe(usbduxsub->usbdev, 0),
-				  /* brequest, firmware */
-				  USBDUXSUB_FIRMWARE,
-				  /* bmRequestType */
-				  VENDOR_DIR_OUT,
-				  /* value */
-				  startAddr,
-				  /* index */
-				  0x0000,
-				  /* our local safe buffer */
-				  local_transfer_buffer,
-				  /* length */
-				  len,
-				  /* timeout */
-				  BULK_TIMEOUT);
-	dev_dbg(&usbduxsub->interface->dev, "comedi_: result=%d\n", errcode);
-	if (errcode < 0) {
-		dev_err(&usbduxsub->interface->dev,
-			"comedi_: upload failed\n");
-		return errcode;
-	}
-	return 0;
-}
-
-/* the FX2LP has twice as much as the standard FX2 */
-#define FIRMWARE_MAX_LEN 0x4000
-
-static int firmwareUpload(struct usbduxsub *usbduxsub,
-			  const u8 *firmwareBinary, int sizeFirmware)
-{
+	struct usb_device *usb = comedi_to_usb_dev(dev);
+	struct usbduxsigma_private *devpriv = dev->private;
+	struct urb *urb;
 	int ret;
-	uint8_t *fwBuf;
-
-	if (!firmwareBinary)
-		return 0;
-
-	if (sizeFirmware > FIRMWARE_MAX_LEN) {
-		dev_err(&usbduxsub->interface->dev,
-			"usbduxsigma firmware binary it too large for FX2.\n");
-		return -ENOMEM;
-	}
-
-	/* we generate a local buffer for the firmware */
-	fwBuf = kmemdup(firmwareBinary, sizeFirmware, GFP_KERNEL);
-	if (!fwBuf) {
-		dev_err(&usbduxsub->interface->dev,
-			"comedi_: mem alloc for firmware failed\n");
-		return -ENOMEM;
-	}
-
-	ret = usbduxsub_stop(usbduxsub);
-	if (ret < 0) {
-		dev_err(&usbduxsub->interface->dev,
-			"comedi_: can not stop firmware\n");
-		kfree(fwBuf);
-		return ret;
-	}
-
-	ret = usbduxsub_upload(usbduxsub, fwBuf, 0, sizeFirmware);
-	if (ret < 0) {
-		dev_err(&usbduxsub->interface->dev,
-			"comedi_: firmware upload failed\n");
-		kfree(fwBuf);
-		return ret;
-	}
-	ret = usbduxsub_start(usbduxsub);
-	if (ret < 0) {
-		dev_err(&usbduxsub->interface->dev,
-			"comedi_: can not start firmware\n");
-		kfree(fwBuf);
-		return ret;
-	}
-	kfree(fwBuf);
-	return 0;
-}
-
-static int usbduxsub_submit_InURBs(struct usbduxsub *usbduxsub)
-{
-	int i, errFlag;
-
-	if (!usbduxsub)
-		return -EFAULT;
+	int i;
 
 	/* Submit all URBs and start the transfer on the bus */
-	for (i = 0; i < usbduxsub->numOfInBuffers; i++) {
+	for (i = 0; i < num_urbs; i++) {
+		urb = urbs[i];
+
 		/* in case of a resubmission after an unlink... */
-		usbduxsub->urbIn[i]->interval = usbduxsub->ai_interval;
-		usbduxsub->urbIn[i]->context = usbduxsub->comedidev;
-		usbduxsub->urbIn[i]->dev = usbduxsub->usbdev;
-		usbduxsub->urbIn[i]->status = 0;
-		usbduxsub->urbIn[i]->transfer_flags = URB_ISO_ASAP;
-		dev_dbg(&usbduxsub->interface->dev,
-			"comedi%d: submitting in-urb[%d]: %p,%p intv=%d\n",
-			usbduxsub->comedidev->minor, i,
-			(usbduxsub->urbIn[i]->context),
-			(usbduxsub->urbIn[i]->dev),
-			(usbduxsub->urbIn[i]->interval));
-		errFlag = usb_submit_urb(usbduxsub->urbIn[i], GFP_ATOMIC);
-		if (errFlag) {
-			dev_err(&usbduxsub->interface->dev,
-				"comedi_: ai: usb_submit_urb(%d) error %d\n",
-				i, errFlag);
-			return errFlag;
-		}
+		if (input_urb)
+			urb->interval = devpriv->ai_interval;
+		urb->context = dev;
+		urb->dev = usb;
+		urb->status = 0;
+		urb->transfer_flags = URB_ISO_ASAP;
+
+		ret = usb_submit_urb(urb, GFP_ATOMIC);
+		if (ret)
+			return ret;
 	}
 	return 0;
 }
 
-static int usbduxsub_submit_OutURBs(struct usbduxsub *usbduxsub)
+static int usbduxsigma_chans_to_interval(int num_chan)
 {
-	int i, errFlag;
-
-	if (!usbduxsub)
-		return -EFAULT;
-
-	for (i = 0; i < usbduxsub->numOfOutBuffers; i++) {
-		dev_dbg(&usbduxsub->interface->dev,
-			"comedi_: submitting out-urb[%d]\n", i);
-		/* in case of a resubmission after an unlink... */
-		usbduxsub->urbOut[i]->context = usbduxsub->comedidev;
-		usbduxsub->urbOut[i]->dev = usbduxsub->usbdev;
-		usbduxsub->urbOut[i]->status = 0;
-		usbduxsub->urbOut[i]->transfer_flags = URB_ISO_ASAP;
-		errFlag = usb_submit_urb(usbduxsub->urbOut[i], GFP_ATOMIC);
-		if (errFlag) {
-			dev_err(&usbduxsub->interface->dev,
-				"comedi_: ao: usb_submit_urb(%d) error %d\n",
-				i, errFlag);
-			return errFlag;
-		}
-	}
-	return 0;
+	if (num_chan <= 2)
+		return 2;	/* 4kHz */
+	if (num_chan <= 8)
+		return 4;	/* 2kHz */
+	return 8;		/* 1kHz */
 }
 
-static int chanToInterval(int nChannels)
+static int usbduxsigma_ai_cmdtest(struct comedi_device *dev,
+				  struct comedi_subdevice *s,
+				  struct comedi_cmd *cmd)
 {
-	if (nChannels <= 2)
-		/* 4kHz */
-		return 2;
-	if (nChannels <= 8)
-		/* 2kHz */
-		return 4;
-	/* 1kHz */
-	return 8;
-}
-
-static int usbdux_ai_cmdtest(struct comedi_device *dev,
-			     struct comedi_subdevice *s,
-			     struct comedi_cmd *cmd)
-{
-	struct usbduxsub *this_usbduxsub = dev->private;
-	int err = 0, i;
-	unsigned int tmpTimer;
-
-	if (!(this_usbduxsub->probed))
-		return -ENODEV;
+	struct usbduxsigma_private *devpriv = dev->private;
+	int high_speed = devpriv->high_speed;
+	int interval = usbduxsigma_chans_to_interval(cmd->chanlist_len);
+	int err = 0;
 
 	/* Step 1 : check if triggers are trivially valid */
 
@@ -934,34 +551,28 @@
 		err |= cfc_check_trigger_arg_is(&cmd->scan_begin_arg, 0);
 
 	if (cmd->scan_begin_src == TRIG_TIMER) {
-		if (this_usbduxsub->high_speed) {
+		unsigned int tmp;
+
+		if (high_speed) {
 			/*
 			 * In high speed mode microframes are possible.
 			 * However, during one microframe we can roughly
 			 * sample two channels. Thus, the more channels
 			 * are in the channel list the more time we need.
 			 */
-			i = chanToInterval(cmd->chanlist_len);
 			err |= cfc_check_trigger_arg_min(&cmd->scan_begin_arg,
-							 (1000000 / 8 * i));
-			/* now calc the real sampling rate with all the
-			 * rounding errors */
-			tmpTimer =
-			    ((unsigned int)(cmd->scan_begin_arg / 125000)) *
-			    125000;
+						(1000000 / 8 * interval));
+
+			tmp = (cmd->scan_begin_arg / 125000) * 125000;
 		} else {
 			/* full speed */
 			/* 1kHz scans every USB frame */
 			err |= cfc_check_trigger_arg_min(&cmd->scan_begin_arg,
 							 1000000);
-			/*
-			 * calc the real sampling rate with the rounding errors
-			 */
-			tmpTimer = ((unsigned int)(cmd->scan_begin_arg /
-						   1000000)) * 1000000;
+
+			tmp = (cmd->scan_begin_arg / 1000000) * 1000000;
 		}
-		err |= cfc_check_trigger_arg_is(&cmd->scan_begin_arg,
-						tmpTimer);
+		err |= cfc_check_trigger_arg_is(&cmd->scan_begin_arg, tmp);
 	}
 
 	err |= cfc_check_trigger_arg_is(&cmd->scan_end_arg, cmd->chanlist_len);
@@ -976,6 +587,37 @@
 	if (err)
 		return 3;
 
+	/* Step 4: fix up any arguments */
+
+	if (high_speed) {
+		/*
+		 * every 2 channels get a time window of 125us. Thus, if we
+		 * sample all 16 channels we need 1ms. If we sample only one
+		 * channel we need only 125us
+		 */
+		devpriv->ai_interval = interval;
+		devpriv->ai_timer = cmd->scan_begin_arg / (125000 * interval);
+	} else {
+		/* interval always 1ms */
+		devpriv->ai_interval = 1;
+		devpriv->ai_timer = cmd->scan_begin_arg / 1000000;
+	}
+	if (devpriv->ai_timer < 1)
+		err |= -EINVAL;
+
+	if (cmd->stop_src == TRIG_COUNT) {
+		/* data arrives as one packet */
+		devpriv->ai_sample_count = cmd->stop_arg;
+		devpriv->ai_continuous = 0;
+	} else {
+		/* continuous acquisition */
+		devpriv->ai_continuous = 1;
+		devpriv->ai_sample_count = 0;
+	}
+
+	if (err)
+		return 4;
+
 	return 0;
 }
 
@@ -993,536 +635,278 @@
 		(*muxsg1) = (*muxsg1) | (1 << (chan-8));
 }
 
-
-/* bulk transfers to usbdux */
-
-#define SENDADCOMMANDS            0
-#define SENDDACOMMANDS            1
-#define SENDDIOCONFIGCOMMAND      2
-#define SENDDIOBITSCOMMAND        3
-#define SENDSINGLEAD              4
-#define SENDPWMON                 7
-#define SENDPWMOFF                8
-
-static int send_dux_commands(struct usbduxsub *this_usbduxsub, int cmd_type)
+static int usbbuxsigma_send_cmd(struct comedi_device *dev, int cmd_type)
 {
-	int result, nsent;
+	struct usb_device *usb = comedi_to_usb_dev(dev);
+	struct usbduxsigma_private *devpriv = dev->private;
+	int nsent;
 
-	this_usbduxsub->dux_commands[0] = cmd_type;
-#ifdef NOISY_DUX_DEBUGBUG
-	printk(KERN_DEBUG "comedi%d: usbdux: dux_commands: ",
-	       this_usbduxsub->comedidev->minor);
-	for (result = 0; result < SIZEOFDUXBUFFER; result++)
-		printk(" %02x", this_usbduxsub->dux_commands[result]);
-	printk("\n");
-#endif
-	result = usb_bulk_msg(this_usbduxsub->usbdev,
-			      usb_sndbulkpipe(this_usbduxsub->usbdev,
-					      COMMAND_OUT_EP),
-			      this_usbduxsub->dux_commands, SIZEOFDUXBUFFER,
-			      &nsent, BULK_TIMEOUT);
-	if (result < 0)
-		dev_err(&this_usbduxsub->interface->dev, "comedi%d: "
-			"could not transmit dux_command to the usb-device, "
-			"err=%d\n", this_usbduxsub->comedidev->minor, result);
+	devpriv->dux_commands[0] = cmd_type;
 
-	return result;
+	return usb_bulk_msg(usb, usb_sndbulkpipe(usb, USBDUXSIGMA_CMD_OUT_EP),
+			    devpriv->dux_commands, SIZEOFDUXBUFFER,
+			    &nsent, BULK_TIMEOUT);
 }
 
-static int receive_dux_commands(struct usbduxsub *this_usbduxsub, int command)
+static int usbduxsigma_receive_cmd(struct comedi_device *dev, int command)
 {
-	int result = (-EFAULT);
+	struct usb_device *usb = comedi_to_usb_dev(dev);
+	struct usbduxsigma_private *devpriv = dev->private;
 	int nrec;
+	int ret;
 	int i;
 
 	for (i = 0; i < RETRIES; i++) {
-		result = usb_bulk_msg(this_usbduxsub->usbdev,
-				      usb_rcvbulkpipe(this_usbduxsub->usbdev,
-						      COMMAND_IN_EP),
-				      this_usbduxsub->insnBuffer, SIZEINSNBUF,
-				      &nrec, BULK_TIMEOUT);
-		if (result < 0) {
-			dev_err(&this_usbduxsub->interface->dev, "comedi%d: "
-				"insn: USB error %d "
-				"while receiving DUX command"
-				"\n", this_usbduxsub->comedidev->minor,
-				result);
-			return result;
-		}
-		if (this_usbduxsub->insnBuffer[0] == command)
-			return result;
+		ret = usb_bulk_msg(usb,
+				   usb_rcvbulkpipe(usb, USBDUXSIGMA_CMD_IN_EP),
+				   devpriv->insn_buf, SIZEINSNBUF,
+				   &nrec, BULK_TIMEOUT);
+		if (ret < 0)
+			return ret;
+
+		if (devpriv->insn_buf[0] == command)
+			return 0;
 	}
-	/* this is only reached if the data has been requested a couple of
-	 * times */
-	dev_err(&this_usbduxsub->interface->dev, "comedi%d: insn: "
-		"wrong data returned from firmware: want %d, got %d.\n",
-		this_usbduxsub->comedidev->minor, command,
-		this_usbduxsub->insnBuffer[0]);
+	/*
+	 * This is only reached if the data has been requested a
+	 * couple of times and the command was not received.
+	 */
 	return -EFAULT;
 }
 
-static int usbdux_ai_inttrig(struct comedi_device *dev,
-			     struct comedi_subdevice *s, unsigned int trignum)
+static int usbduxsigma_ai_inttrig(struct comedi_device *dev,
+				  struct comedi_subdevice *s,
+				  unsigned int trignum)
 {
+	struct usbduxsigma_private *devpriv = dev->private;
 	int ret;
-	struct usbduxsub *this_usbduxsub = dev->private;
-	if (!this_usbduxsub)
-		return -EFAULT;
 
-	down(&this_usbduxsub->sem);
-	if (!(this_usbduxsub->probed)) {
-		up(&this_usbduxsub->sem);
-		return -ENODEV;
-	}
-	dev_dbg(&this_usbduxsub->interface->dev,
-		"comedi%d: usbdux_ai_inttrig\n", dev->minor);
-
-	if (trignum != 0) {
-		dev_err(&this_usbduxsub->interface->dev,
-			"comedi%d: usbdux_ai_inttrig: invalid trignum\n",
-			dev->minor);
-		up(&this_usbduxsub->sem);
+	if (trignum != 0)
 		return -EINVAL;
-	}
-	if (!(this_usbduxsub->ai_cmd_running)) {
-		this_usbduxsub->ai_cmd_running = 1;
-		ret = usbduxsub_submit_InURBs(this_usbduxsub);
+
+	down(&devpriv->sem);
+	if (!devpriv->ai_cmd_running) {
+		ret = usbduxsigma_submit_urbs(dev, devpriv->ai_urbs,
+					      devpriv->n_ai_urbs, 1);
 		if (ret < 0) {
-			dev_err(&this_usbduxsub->interface->dev,
-				"comedi%d: usbdux_ai_inttrig: "
-				"urbSubmit: err=%d\n", dev->minor, ret);
-			this_usbduxsub->ai_cmd_running = 0;
-			up(&this_usbduxsub->sem);
+			up(&devpriv->sem);
 			return ret;
 		}
+		devpriv->ai_cmd_running = 1;
 		s->async->inttrig = NULL;
-	} else {
-		dev_err(&this_usbduxsub->interface->dev,
-			"comedi%d: ai_inttrig but acqu is already running\n",
-			dev->minor);
 	}
-	up(&this_usbduxsub->sem);
+	up(&devpriv->sem);
+
 	return 1;
 }
 
-static int usbdux_ai_cmd(struct comedi_device *dev, struct comedi_subdevice *s)
+static int usbduxsigma_ai_cmd(struct comedi_device *dev,
+			      struct comedi_subdevice *s)
 {
+	struct usbduxsigma_private *devpriv = dev->private;
 	struct comedi_cmd *cmd = &s->async->cmd;
-	unsigned int chan;
-	int i, ret;
-	struct usbduxsub *this_usbduxsub = dev->private;
-	int result;
+	unsigned int len = cmd->chanlist_len;
 	uint8_t muxsg0 = 0;
 	uint8_t muxsg1 = 0;
 	uint8_t sysred = 0;
+	int ret;
+	int i;
 
-	if (!this_usbduxsub)
-		return -EFAULT;
+	down(&devpriv->sem);
 
-	dev_dbg(&this_usbduxsub->interface->dev,
-		"comedi%d: usbdux_ai_cmd\n", dev->minor);
-
-	/* block other CPUs from starting an ai_cmd */
-	down(&this_usbduxsub->sem);
-
-	if (!(this_usbduxsub->probed)) {
-		up(&this_usbduxsub->sem);
-		return -ENODEV;
-	}
-	if (this_usbduxsub->ai_cmd_running) {
-		dev_err(&this_usbduxsub->interface->dev, "comedi%d: "
-			"ai_cmd not possible. Another ai_cmd is running.\n",
-			dev->minor);
-		up(&this_usbduxsub->sem);
-		return -EBUSY;
-	}
 	/* set current channel of the running acquisition to zero */
 	s->async->cur_chan = 0;
+	for (i = 0; i < len; i++) {
+		unsigned int chan  = CR_CHAN(cmd->chanlist[i]);
 
-	/* first the number of channels per time step */
-	this_usbduxsub->dux_commands[1] = cmd->chanlist_len;
-
-	/* CONFIG0 */
-	this_usbduxsub->dux_commands[2] = 0x12;
-
-	/* CONFIG1: 23kHz sampling rate, delay = 0us,  */
-	this_usbduxsub->dux_commands[3] = 0x03;
-
-	/* CONFIG3: differential channels off */
-	this_usbduxsub->dux_commands[4] = 0x00;
-
-	for (i = 0; i < cmd->chanlist_len; i++) {
-		chan = CR_CHAN(cmd->chanlist[i]);
 		create_adc_command(chan, &muxsg0, &muxsg1);
-		if (i >= NUMCHANNELS) {
-			dev_err(&this_usbduxsub->interface->dev,
-				"comedi%d: channel list too long\n",
-				dev->minor);
-			break;
-		}
-	}
-	this_usbduxsub->dux_commands[5] = muxsg0;
-	this_usbduxsub->dux_commands[6] = muxsg1;
-	this_usbduxsub->dux_commands[7] = sysred;
-
-	dev_dbg(&this_usbduxsub->interface->dev,
-		"comedi %d: sending commands to the usb device: size=%u\n",
-		dev->minor, NUMCHANNELS);
-
-	result = send_dux_commands(this_usbduxsub, SENDADCOMMANDS);
-	if (result < 0) {
-		up(&this_usbduxsub->sem);
-		return result;
 	}
 
-	if (this_usbduxsub->high_speed) {
-		/*
-		 * every 2 channels get a time window of 125us. Thus, if we
-		 * sample all 16 channels we need 1ms. If we sample only one
-		 * channel we need only 125us
-		 */
-		this_usbduxsub->ai_interval =
-			chanToInterval(cmd->chanlist_len);
-		this_usbduxsub->ai_timer = cmd->scan_begin_arg / (125000 *
-							  (this_usbduxsub->
-							   ai_interval));
-	} else {
-		/* interval always 1ms */
-		this_usbduxsub->ai_interval = 1;
-		this_usbduxsub->ai_timer = cmd->scan_begin_arg / 1000000;
-	}
-	if (this_usbduxsub->ai_timer < 1) {
-		dev_err(&this_usbduxsub->interface->dev, "comedi%d: ai_cmd: "
-			"timer=%d, scan_begin_arg=%d. "
-			"Not properly tested by cmdtest?\n", dev->minor,
-			this_usbduxsub->ai_timer, cmd->scan_begin_arg);
-		up(&this_usbduxsub->sem);
-		return -EINVAL;
-	}
-	this_usbduxsub->ai_counter = this_usbduxsub->ai_timer;
+	devpriv->dux_commands[1] = len;  /* num channels per time step */
+	devpriv->dux_commands[2] = 0x12; /* CONFIG0 */
+	devpriv->dux_commands[3] = 0x03; /* CONFIG1: 23kHz sample, delay 0us */
+	devpriv->dux_commands[4] = 0x00; /* CONFIG3: diff. channels off */
+	devpriv->dux_commands[5] = muxsg0;
+	devpriv->dux_commands[6] = muxsg1;
+	devpriv->dux_commands[7] = sysred;
 
-	if (cmd->stop_src == TRIG_COUNT) {
-		/* data arrives as one packet */
-		this_usbduxsub->ai_sample_count = cmd->stop_arg;
-		this_usbduxsub->ai_continuous = 0;
-	} else {
-		/* continuous acquisition */
-		this_usbduxsub->ai_continuous = 1;
-		this_usbduxsub->ai_sample_count = 0;
+	ret = usbbuxsigma_send_cmd(dev, USBBUXSIGMA_AD_CMD);
+	if (ret < 0) {
+		up(&devpriv->sem);
+		return ret;
 	}
 
+	devpriv->ai_counter = devpriv->ai_timer;
+
 	if (cmd->start_src == TRIG_NOW) {
 		/* enable this acquisition operation */
-		this_usbduxsub->ai_cmd_running = 1;
-		ret = usbduxsub_submit_InURBs(this_usbduxsub);
+		ret = usbduxsigma_submit_urbs(dev, devpriv->ai_urbs,
+					      devpriv->n_ai_urbs, 1);
 		if (ret < 0) {
-			this_usbduxsub->ai_cmd_running = 0;
-			/* fixme: unlink here?? */
-			up(&this_usbduxsub->sem);
+			up(&devpriv->sem);
 			return ret;
 		}
 		s->async->inttrig = NULL;
-	} else {
-		/* TRIG_INT */
-		/* don't enable the acquision operation */
-		/* wait for an internal signal */
-		s->async->inttrig = usbdux_ai_inttrig;
+		devpriv->ai_cmd_running = 1;
+	} else {	/* TRIG_INT */
+		/* wait for an internal signal and submit the urbs later */
+		s->async->inttrig = usbduxsigma_ai_inttrig;
 	}
-	up(&this_usbduxsub->sem);
+
+	up(&devpriv->sem);
+
 	return 0;
 }
 
-/* Mode 0 is used to get a single conversion on demand */
-static int usbdux_ai_insn_read(struct comedi_device *dev,
-			       struct comedi_subdevice *s,
-			       struct comedi_insn *insn, unsigned int *data)
+static int usbduxsigma_ai_insn_read(struct comedi_device *dev,
+				    struct comedi_subdevice *s,
+				    struct comedi_insn *insn,
+				    unsigned int *data)
 {
-	int i;
-	int32_t one = 0;
-	int chan;
-	int err;
-	struct usbduxsub *this_usbduxsub = dev->private;
+	struct usbduxsigma_private *devpriv = dev->private;
+	unsigned int chan = CR_CHAN(insn->chanspec);
 	uint8_t muxsg0 = 0;
 	uint8_t muxsg1 = 0;
 	uint8_t sysred = 0;
+	int ret;
+	int i;
 
-	if (!this_usbduxsub)
-		return 0;
-
-	dev_dbg(&this_usbduxsub->interface->dev,
-		"comedi%d: ai_insn_read, insn->n=%d, insn->subdev=%d\n",
-		dev->minor, insn->n, insn->subdev);
-
-	down(&this_usbduxsub->sem);
-	if (!(this_usbduxsub->probed)) {
-		up(&this_usbduxsub->sem);
-		return -ENODEV;
-	}
-	if (this_usbduxsub->ai_cmd_running) {
-		dev_err(&this_usbduxsub->interface->dev,
-			"comedi%d: ai_insn_read not possible. "
-			"Async Command is running.\n", dev->minor);
-		up(&this_usbduxsub->sem);
-		return 0;
+	down(&devpriv->sem);
+	if (devpriv->ai_cmd_running) {
+		up(&devpriv->sem);
+		return -EBUSY;
 	}
 
-	/* sample one channel */
-	/* CONFIG0: chopper on */
-	this_usbduxsub->dux_commands[1] = 0x16;
-
-	/* CONFIG1: 2kHz sampling rate */
-	this_usbduxsub->dux_commands[2] = 0x80;
-
-	/* CONFIG3: differential channels off */
-	this_usbduxsub->dux_commands[3] = 0x00;
-
-	chan = CR_CHAN(insn->chanspec);
 	create_adc_command(chan, &muxsg0, &muxsg1);
 
-	this_usbduxsub->dux_commands[4] = muxsg0;
-	this_usbduxsub->dux_commands[5] = muxsg1;
-	this_usbduxsub->dux_commands[6] = sysred;
+	/* Mode 0 is used to get a single conversion on demand */
+	devpriv->dux_commands[1] = 0x16; /* CONFIG0: chopper on */
+	devpriv->dux_commands[2] = 0x80; /* CONFIG1: 2kHz sampling rate */
+	devpriv->dux_commands[3] = 0x00; /* CONFIG3: diff. channels off */
+	devpriv->dux_commands[4] = muxsg0;
+	devpriv->dux_commands[5] = muxsg1;
+	devpriv->dux_commands[6] = sysred;
 
 	/* adc commands */
-	err = send_dux_commands(this_usbduxsub, SENDSINGLEAD);
-	if (err < 0) {
-		up(&this_usbduxsub->sem);
-		return err;
+	ret = usbbuxsigma_send_cmd(dev, USBDUXSIGMA_SINGLE_AD_CMD);
+	if (ret < 0) {
+		up(&devpriv->sem);
+		return ret;
 	}
 
 	for (i = 0; i < insn->n; i++) {
-		err = receive_dux_commands(this_usbduxsub, SENDSINGLEAD);
-		if (err < 0) {
-			up(&this_usbduxsub->sem);
-			return 0;
-		}
-		/* 32 bits big endian from the A/D converter */
-		one = be32_to_cpu(*((int32_t *)
-				    ((this_usbduxsub->insnBuffer)+1)));
-		/* mask out the status byte */
-		one = one & 0x00ffffff;
-		/* turn it into an unsigned integer */
-		one = one ^ 0x00800000;
-		data[i] = one;
-	}
-	up(&this_usbduxsub->sem);
-	return i;
-}
+		int32_t val;
 
-
-
-
-static int usbdux_getstatusinfo(struct comedi_device *dev, int chan)
-{
-	struct usbduxsub *this_usbduxsub = dev->private;
-	uint8_t sysred = 0;
-	uint32_t one;
-	int err;
-
-	if (!this_usbduxsub)
-		return 0;
-
-	if (this_usbduxsub->ai_cmd_running) {
-		dev_err(&this_usbduxsub->interface->dev,
-			"comedi%d: status read not possible. "
-			"Async Command is running.\n", dev->minor);
-		return 0;
-	}
-
-	/* CONFIG0 */
-	this_usbduxsub->dux_commands[1] = 0x12;
-
-	/* CONFIG1: 2kHz sampling rate */
-	this_usbduxsub->dux_commands[2] = 0x80;
-
-	/* CONFIG3: differential channels off */
-	this_usbduxsub->dux_commands[3] = 0x00;
-
-	if (chan == 1) {
-		/* ADC offset */
-		sysred = sysred | 1;
-	} else if (chan == 2) {
-		/* VCC */
-		sysred = sysred | 4;
-	} else if (chan == 3) {
-		/* temperature */
-		sysred = sysred | 8;
-	} else if (chan == 4) {
-		/* gain */
-		sysred = sysred | 16;
-	} else if (chan == 5) {
-		/* ref */
-		sysred = sysred | 32;
-	}
-
-	this_usbduxsub->dux_commands[4] = 0;
-	this_usbduxsub->dux_commands[5] = 0;
-	this_usbduxsub->dux_commands[6] = sysred;
-
-	/* adc commands */
-	err = send_dux_commands(this_usbduxsub, SENDSINGLEAD);
-	if (err < 0)
-		return err;
-
-	err = receive_dux_commands(this_usbduxsub, SENDSINGLEAD);
-	if (err < 0)
-		return err;
-
-	/* 32 bits big endian from the A/D converter */
-	one = be32_to_cpu(*((int32_t *)((this_usbduxsub->insnBuffer)+1)));
-	/* mask out the status byte */
-	one = one & 0x00ffffff;
-	one = one ^ 0x00800000;
-
-	return (int)one;
-}
-
-
-
-
-
-
-/************************************/
-/* analog out */
-
-static int usbdux_ao_insn_read(struct comedi_device *dev,
-			       struct comedi_subdevice *s,
-			       struct comedi_insn *insn, unsigned int *data)
-{
-	int i;
-	int chan = CR_CHAN(insn->chanspec);
-	struct usbduxsub *this_usbduxsub = dev->private;
-
-	if (!this_usbduxsub)
-		return -EFAULT;
-
-	down(&this_usbduxsub->sem);
-	if (!(this_usbduxsub->probed)) {
-		up(&this_usbduxsub->sem);
-		return -ENODEV;
-	}
-	for (i = 0; i < insn->n; i++)
-		data[i] = this_usbduxsub->outBuffer[chan];
-
-	up(&this_usbduxsub->sem);
-	return i;
-}
-
-static int usbdux_ao_insn_write(struct comedi_device *dev,
-				struct comedi_subdevice *s,
-				struct comedi_insn *insn, unsigned int *data)
-{
-	int i, err;
-	int chan = CR_CHAN(insn->chanspec);
-	struct usbduxsub *this_usbduxsub = dev->private;
-
-	if (!this_usbduxsub)
-		return -EFAULT;
-
-	dev_dbg(&this_usbduxsub->interface->dev,
-		"comedi%d: ao_insn_write\n", dev->minor);
-
-	down(&this_usbduxsub->sem);
-	if (!(this_usbduxsub->probed)) {
-		up(&this_usbduxsub->sem);
-		return -ENODEV;
-	}
-	if (this_usbduxsub->ao_cmd_running) {
-		dev_err(&this_usbduxsub->interface->dev,
-			"comedi%d: ao_insn_write: "
-			"ERROR: asynchronous ao_cmd is running\n", dev->minor);
-		up(&this_usbduxsub->sem);
-		return 0;
-	}
-
-	for (i = 0; i < insn->n; i++) {
-		dev_dbg(&this_usbduxsub->interface->dev,
-			"comedi%d: ao_insn_write: data[chan=%d,i=%d]=%d\n",
-			dev->minor, chan, i, data[i]);
-
-		/* number of channels: 1 */
-		this_usbduxsub->dux_commands[1] = 1;
-		/* channel number */
-		this_usbduxsub->dux_commands[2] = data[i];
-		this_usbduxsub->outBuffer[chan] = data[i];
-		this_usbduxsub->dux_commands[3] = chan;
-		err = send_dux_commands(this_usbduxsub, SENDDACOMMANDS);
-		if (err < 0) {
-			up(&this_usbduxsub->sem);
-			return err;
-		}
-	}
-	up(&this_usbduxsub->sem);
-
-	return i;
-}
-
-static int usbdux_ao_inttrig(struct comedi_device *dev,
-			     struct comedi_subdevice *s, unsigned int trignum)
-{
-	int ret;
-	struct usbduxsub *this_usbduxsub = dev->private;
-
-	if (!this_usbduxsub)
-		return -EFAULT;
-
-	down(&this_usbduxsub->sem);
-
-	if (!(this_usbduxsub->probed)) {
-		ret = -ENODEV;
-		goto out;
-	}
-	if (trignum != 0) {
-		dev_err(&this_usbduxsub->interface->dev,
-			"comedi%d: usbdux_ao_inttrig: invalid trignum\n",
-			dev->minor);
-		ret = -EINVAL;
-		goto out;
-	}
-	if (!(this_usbduxsub->ao_cmd_running)) {
-		this_usbduxsub->ao_cmd_running = 1;
-		ret = usbduxsub_submit_OutURBs(this_usbduxsub);
+		ret = usbduxsigma_receive_cmd(dev, USBDUXSIGMA_SINGLE_AD_CMD);
 		if (ret < 0) {
-			dev_err(&this_usbduxsub->interface->dev,
-				"comedi%d: usbdux_ao_inttrig: submitURB: "
-				"err=%d\n", dev->minor, ret);
-			this_usbduxsub->ao_cmd_running = 0;
-			goto out;
+			up(&devpriv->sem);
+			return ret;
 		}
-		s->async->inttrig = NULL;
-	} else {
-		dev_err(&this_usbduxsub->interface->dev,
-			"comedi%d: ao_inttrig but acqu is already running.\n",
-			dev->minor);
+
+		/* 32 bits big endian from the A/D converter */
+		val = be32_to_cpu(*((int32_t *)((devpriv->insn_buf) + 1)));
+		val &= 0x00ffffff;	/* strip status byte */
+		val ^= 0x00800000;	/* convert to unsigned */
+
+		data[i] = val;
 	}
-	ret = 1;
-out:
-	up(&this_usbduxsub->sem);
-	return ret;
+	up(&devpriv->sem);
+
+	return insn->n;
 }
 
-static int usbdux_ao_cmdtest(struct comedi_device *dev,
-			     struct comedi_subdevice *s,
-			     struct comedi_cmd *cmd)
+static int usbduxsigma_ao_insn_read(struct comedi_device *dev,
+				    struct comedi_subdevice *s,
+				    struct comedi_insn *insn,
+				    unsigned int *data)
 {
-	struct usbduxsub *this_usbduxsub = dev->private;
+	struct usbduxsigma_private *devpriv = dev->private;
+	unsigned int chan = CR_CHAN(insn->chanspec);
+	int i;
+
+	down(&devpriv->sem);
+	for (i = 0; i < insn->n; i++)
+		data[i] = devpriv->ao_readback[chan];
+	up(&devpriv->sem);
+
+	return insn->n;
+}
+
+static int usbduxsigma_ao_insn_write(struct comedi_device *dev,
+				     struct comedi_subdevice *s,
+				     struct comedi_insn *insn,
+				     unsigned int *data)
+{
+	struct usbduxsigma_private *devpriv = dev->private;
+	unsigned int chan = CR_CHAN(insn->chanspec);
+	int ret;
+	int i;
+
+	down(&devpriv->sem);
+	if (devpriv->ao_cmd_running) {
+		up(&devpriv->sem);
+		return -EBUSY;
+	}
+
+	for (i = 0; i < insn->n; i++) {
+		devpriv->dux_commands[1] = 1;		/* num channels */
+		devpriv->dux_commands[2] = data[i];	/* value */
+		devpriv->dux_commands[3] = chan;	/* channel number */
+		ret = usbbuxsigma_send_cmd(dev, USBDUXSIGMA_DA_CMD);
+		if (ret < 0) {
+			up(&devpriv->sem);
+			return ret;
+		}
+		devpriv->ao_readback[chan] = data[i];
+	}
+	up(&devpriv->sem);
+
+	return insn->n;
+}
+
+static int usbduxsigma_ao_inttrig(struct comedi_device *dev,
+				  struct comedi_subdevice *s,
+				  unsigned int trignum)
+{
+	struct usbduxsigma_private *devpriv = dev->private;
+	int ret;
+
+	if (trignum != 0)
+		return -EINVAL;
+
+	down(&devpriv->sem);
+	if (!devpriv->ao_cmd_running) {
+		ret = usbduxsigma_submit_urbs(dev, devpriv->ao_urbs,
+					      devpriv->n_ao_urbs, 0);
+		if (ret < 0) {
+			up(&devpriv->sem);
+			return ret;
+		}
+		devpriv->ao_cmd_running = 1;
+		s->async->inttrig = NULL;
+	}
+	up(&devpriv->sem);
+
+	return 1;
+}
+
+static int usbduxsigma_ao_cmdtest(struct comedi_device *dev,
+				  struct comedi_subdevice *s,
+				  struct comedi_cmd *cmd)
+{
+	struct usbduxsigma_private *devpriv = dev->private;
 	int err = 0;
+	int high_speed;
 	unsigned int flags;
 
-	if (!this_usbduxsub)
-		return -EFAULT;
-
-	if (!(this_usbduxsub->probed))
-		return -ENODEV;
-
-	dev_dbg(&this_usbduxsub->interface->dev,
-		"comedi%d: usbdux_ao_cmdtest\n", dev->minor);
+	/* high speed conversions are not used yet */
+	high_speed = 0;		/* (devpriv->high_speed) */
 
 	/* Step 1 : check if triggers are trivially valid */
 
 	err |= cfc_check_trigger_src(&cmd->start_src, TRIG_NOW | TRIG_INT);
 
-	if (0) {		/* (this_usbduxsub->high_speed) */
+	if (high_speed) {
 		/*
 		 * start immediately a new scan
 		 * the sampling rate is set by the coversion rate
@@ -1538,8 +922,10 @@
 	err |= cfc_check_trigger_src(&cmd->scan_end_src, TRIG_COUNT);
 	err |= cfc_check_trigger_src(&cmd->stop_src, TRIG_COUNT | TRIG_NONE);
 
-	if (err)
+	if (err) {
+		up(&devpriv->sem);
 		return 1;
+	}
 
 	/* Step 2a : make sure trigger sources are unique */
 
@@ -1578,272 +964,186 @@
 	if (err)
 		return 3;
 
+	/* Step 4: fix up any arguments */
+
+	/* we count in timer steps */
+	if (high_speed) {
+		/* timing of the conversion itself: every 125 us */
+		devpriv->ao_timer = cmd->convert_arg / 125000;
+	} else {
+		/*
+		 * timing of the scan: every 1ms
+		 * we get all channels at once
+		 */
+		devpriv->ao_timer = cmd->scan_begin_arg / 1000000;
+	}
+	if (devpriv->ao_timer < 1)
+		err |= -EINVAL;
+
+	if (cmd->stop_src == TRIG_COUNT) {
+		/* not continuous, use counter */
+		if (high_speed) {
+			/* high speed also scans everything at once */
+			devpriv->ao_sample_count = cmd->stop_arg *
+						   cmd->scan_end_arg;
+		} else {
+			/*
+			 * There's no scan as the scan has been
+			 * handled inside the FX2. Data arrives as
+			 * one packet.
+			 */
+			devpriv->ao_sample_count = cmd->stop_arg;
+		}
+		devpriv->ao_continuous = 0;
+	} else {
+		/* continuous acquisition */
+		devpriv->ao_continuous = 1;
+		devpriv->ao_sample_count = 0;
+	}
+
+	if (err)
+		return 4;
+
 	return 0;
 }
 
-static int usbdux_ao_cmd(struct comedi_device *dev, struct comedi_subdevice *s)
+static int usbduxsigma_ao_cmd(struct comedi_device *dev,
+			      struct comedi_subdevice *s)
 {
+	struct usbduxsigma_private *devpriv = dev->private;
 	struct comedi_cmd *cmd = &s->async->cmd;
-	unsigned int chan, gain;
-	int i, ret;
-	struct usbduxsub *this_usbduxsub = dev->private;
+	int ret;
+	int i;
 
-	if (!this_usbduxsub)
-		return -EFAULT;
-
-	down(&this_usbduxsub->sem);
-	if (!(this_usbduxsub->probed)) {
-		up(&this_usbduxsub->sem);
-		return -ENODEV;
-	}
-	dev_dbg(&this_usbduxsub->interface->dev,
-		"comedi%d: %s\n", dev->minor, __func__);
+	down(&devpriv->sem);
 
 	/* set current channel of the running acquisition to zero */
 	s->async->cur_chan = 0;
-	for (i = 0; i < cmd->chanlist_len; ++i) {
-		chan = CR_CHAN(cmd->chanlist[i]);
-		gain = CR_RANGE(cmd->chanlist[i]);
-		if (i >= NUMOUTCHANNELS) {
-			dev_err(&this_usbduxsub->interface->dev,
-				"comedi%d: %s: channel list too long\n",
-				dev->minor, __func__);
-			break;
-		}
-		this_usbduxsub->dac_commands[i] = chan;
-		dev_dbg(&this_usbduxsub->interface->dev,
-			"comedi%d: dac command for ch %d is %x\n",
-			dev->minor, i, this_usbduxsub->dac_commands[i]);
-	}
+	for (i = 0; i < cmd->chanlist_len; ++i)
+		devpriv->dac_commands[i] = CR_CHAN(cmd->chanlist[i]);
 
-	/* we count in steps of 1ms (125us) */
-	/* 125us mode not used yet */
-	if (0) {		/* (this_usbduxsub->high_speed) */
-		/* 125us */
-		/* timing of the conversion itself: every 125 us */
-		this_usbduxsub->ao_timer = cmd->convert_arg / 125000;
-	} else {
-		/* 1ms */
-		/* timing of the scan: we get all channels at once */
-		this_usbduxsub->ao_timer = cmd->scan_begin_arg / 1000000;
-		dev_dbg(&this_usbduxsub->interface->dev,
-			"comedi%d: scan_begin_src=%d, scan_begin_arg=%d, "
-			"convert_src=%d, convert_arg=%d\n", dev->minor,
-			cmd->scan_begin_src, cmd->scan_begin_arg,
-			cmd->convert_src, cmd->convert_arg);
-		dev_dbg(&this_usbduxsub->interface->dev,
-			"comedi%d: ao_timer=%d (ms)\n",
-			dev->minor, this_usbduxsub->ao_timer);
-		if (this_usbduxsub->ao_timer < 1) {
-			dev_err(&this_usbduxsub->interface->dev,
-				"comedi%d: usbdux: ao_timer=%d, "
-				"scan_begin_arg=%d. "
-				"Not properly tested by cmdtest?\n",
-				dev->minor, this_usbduxsub->ao_timer,
-				cmd->scan_begin_arg);
-			up(&this_usbduxsub->sem);
-			return -EINVAL;
-		}
-	}
-	this_usbduxsub->ao_counter = this_usbduxsub->ao_timer;
-
-	if (cmd->stop_src == TRIG_COUNT) {
-		/* not continuous */
-		/* counter */
-		/* high speed also scans everything at once */
-		if (0) {	/* (this_usbduxsub->high_speed) */
-			this_usbduxsub->ao_sample_count =
-			    (cmd->stop_arg) * (cmd->scan_end_arg);
-		} else {
-			/* there's no scan as the scan has been */
-			/* perf inside the FX2 */
-			/* data arrives as one packet */
-			this_usbduxsub->ao_sample_count = cmd->stop_arg;
-		}
-		this_usbduxsub->ao_continuous = 0;
-	} else {
-		/* continuous acquisition */
-		this_usbduxsub->ao_continuous = 1;
-		this_usbduxsub->ao_sample_count = 0;
-	}
+	devpriv->ao_counter = devpriv->ao_timer;
 
 	if (cmd->start_src == TRIG_NOW) {
 		/* enable this acquisition operation */
-		this_usbduxsub->ao_cmd_running = 1;
-		ret = usbduxsub_submit_OutURBs(this_usbduxsub);
+		ret = usbduxsigma_submit_urbs(dev, devpriv->ao_urbs,
+					      devpriv->n_ao_urbs, 0);
 		if (ret < 0) {
-			this_usbduxsub->ao_cmd_running = 0;
-			/* fixme: unlink here?? */
-			up(&this_usbduxsub->sem);
+			up(&devpriv->sem);
 			return ret;
 		}
 		s->async->inttrig = NULL;
-	} else {
-		/* TRIG_INT */
-		/* submit the urbs later */
-		/* wait for an internal signal */
-		s->async->inttrig = usbdux_ao_inttrig;
+		devpriv->ao_cmd_running = 1;
+	} else {	/* TRIG_INT */
+		/* wait for an internal signal and submit the urbs later */
+		s->async->inttrig = usbduxsigma_ao_inttrig;
 	}
 
-	up(&this_usbduxsub->sem);
+	up(&devpriv->sem);
+
 	return 0;
 }
 
-static int usbdux_dio_insn_config(struct comedi_device *dev,
-				  struct comedi_subdevice *s,
-				  struct comedi_insn *insn, unsigned int *data)
+static int usbduxsigma_dio_insn_config(struct comedi_device *dev,
+				       struct comedi_subdevice *s,
+				       struct comedi_insn *insn,
+				       unsigned int *data)
 {
-	int chan = CR_CHAN(insn->chanspec);
-
-	/* The input or output configuration of each digital line is
-	 * configured by a special insn_config instruction.  chanspec
-	 * contains the channel to be changed, and data[0] contains the
-	 * value COMEDI_INPUT or COMEDI_OUTPUT. */
+	unsigned int chan = CR_CHAN(insn->chanspec);
+	unsigned int mask = 1 << chan;
 
 	switch (data[0]) {
 	case INSN_CONFIG_DIO_OUTPUT:
-		s->io_bits |= 1 << chan;	/* 1 means Out */
+		s->io_bits |= mask;
 		break;
 	case INSN_CONFIG_DIO_INPUT:
-		s->io_bits &= ~(1 << chan);
+		s->io_bits &= ~mask;
 		break;
 	case INSN_CONFIG_DIO_QUERY:
-		data[1] =
-		    (s->io_bits & (1 << chan)) ? COMEDI_OUTPUT : COMEDI_INPUT;
+		data[1] = (s->io_bits & mask) ? COMEDI_OUTPUT : COMEDI_INPUT;
 		break;
 	default:
 		return -EINVAL;
 		break;
 	}
-	/* we don't tell the firmware here as it would take 8 frames */
-	/* to submit the information. We do it in the insn_bits. */
+
+	/*
+	 * We don't tell the firmware here as it would take 8 frames
+	 * to submit the information. We do it in the (*insn_bits).
+	 */
 	return insn->n;
 }
 
-static int usbdux_dio_insn_bits(struct comedi_device *dev,
-				struct comedi_subdevice *s,
-				struct comedi_insn *insn,
-				unsigned int *data)
+static int usbduxsigma_dio_insn_bits(struct comedi_device *dev,
+				     struct comedi_subdevice *s,
+				     struct comedi_insn *insn,
+				     unsigned int *data)
 {
+	struct usbduxsigma_private *devpriv = dev->private;
+	unsigned int mask = data[0];
+	unsigned int bits = data[1];
+	int ret;
 
-	struct usbduxsub *this_usbduxsub = dev->private;
-	int err;
+	down(&devpriv->sem);
 
-	if (!this_usbduxsub)
-		return -EFAULT;
+	s->state &= ~mask;
+	s->state |= (bits & mask);
 
-	down(&this_usbduxsub->sem);
+	devpriv->dux_commands[1] = s->io_bits & 0xff;
+	devpriv->dux_commands[4] = s->state & 0xff;
+	devpriv->dux_commands[2] = (s->io_bits >> 8) & 0xff;
+	devpriv->dux_commands[5] = (s->state >> 8) & 0xff;
+	devpriv->dux_commands[3] = (s->io_bits >> 16) & 0xff;
+	devpriv->dux_commands[6] = (s->state >> 16) & 0xff;
 
-	if (!(this_usbduxsub->probed)) {
-		up(&this_usbduxsub->sem);
-		return -ENODEV;
-	}
+	ret = usbbuxsigma_send_cmd(dev, USBDUXSIGMA_DIO_BITS_CMD);
+	if (ret < 0)
+		goto done;
+	ret = usbduxsigma_receive_cmd(dev, USBDUXSIGMA_DIO_BITS_CMD);
+	if (ret < 0)
+		goto done;
 
-	/* The insn data is a mask in data[0] and the new data
-	 * in data[1], each channel cooresponding to a bit. */
-	s->state &= ~data[0];
-	s->state |= data[0] & data[1];
-	/* The commands are 8 bits wide */
-	this_usbduxsub->dux_commands[1] = (s->io_bits) & 0x000000FF;
-	this_usbduxsub->dux_commands[4] = (s->state) & 0x000000FF;
-	this_usbduxsub->dux_commands[2] = ((s->io_bits) & 0x0000FF00) >> 8;
-	this_usbduxsub->dux_commands[5] = ((s->state) & 0x0000FF00) >> 8;
-	this_usbduxsub->dux_commands[3] = ((s->io_bits) & 0x00FF0000) >> 16;
-	this_usbduxsub->dux_commands[6] = ((s->state) & 0x00FF0000) >> 16;
+	s->state = devpriv->insn_buf[1] |
+		   (devpriv->insn_buf[2] << 8) |
+		   (devpriv->insn_buf[3] << 16);
 
-	/* This command also tells the firmware to return */
-	/* the digital input lines */
-	err = send_dux_commands(this_usbduxsub, SENDDIOBITSCOMMAND);
-	if (err < 0) {
-		up(&this_usbduxsub->sem);
-		return err;
-	}
-	err = receive_dux_commands(this_usbduxsub, SENDDIOBITSCOMMAND);
-	if (err < 0) {
-		up(&this_usbduxsub->sem);
-		return err;
-	}
+	data[1] = s->state;
+	ret = insn->n;
 
-	data[1] = (((unsigned int)(this_usbduxsub->insnBuffer[1]))&0xff) |
-		((((unsigned int)(this_usbduxsub->insnBuffer[2]))&0xff) << 8) |
-		((((unsigned int)(this_usbduxsub->insnBuffer[3]))&0xff) << 16);
-
-	s->state = data[1];
-
-	up(&this_usbduxsub->sem);
-	return insn->n;
-}
-
-/***********************************/
-/* PWM */
-
-static int usbduxsub_unlink_PwmURBs(struct usbduxsub *usbduxsub_tmp)
-{
-	int err = 0;
-
-	if (usbduxsub_tmp && usbduxsub_tmp->urbPwm) {
-		if (usbduxsub_tmp->urbPwm)
-			usb_kill_urb(usbduxsub_tmp->urbPwm);
-		dev_dbg(&usbduxsub_tmp->interface->dev,
-			"comedi: unlinked PwmURB: res=%d\n", err);
-	}
-	return err;
-}
-
-/* This cancels a running acquisition operation
- * in any context.
- */
-static int usbdux_pwm_stop(struct usbduxsub *this_usbduxsub, int do_unlink)
-{
-	int ret = 0;
-
-	if (!this_usbduxsub)
-		return -EFAULT;
-
-	dev_dbg(&this_usbduxsub->interface->dev, "comedi: %s\n", __func__);
-	if (do_unlink)
-		ret = usbduxsub_unlink_PwmURBs(this_usbduxsub);
-
-	this_usbduxsub->pwm_cmd_running = 0;
+done:
+	up(&devpriv->sem);
 
 	return ret;
 }
 
-/* force unlink - is called by comedi */
-static int usbdux_pwm_cancel(struct comedi_device *dev,
-			     struct comedi_subdevice *s)
+static void usbduxsigma_pwm_stop(struct comedi_device *dev, int do_unlink)
 {
-	struct usbduxsub *this_usbduxsub = dev->private;
-	int res = 0;
+	struct usbduxsigma_private *devpriv = dev->private;
 
-	/* unlink only if it is really running */
-	res = usbdux_pwm_stop(this_usbduxsub, this_usbduxsub->pwm_cmd_running);
+	if (do_unlink) {
+		if (devpriv->pwm_urb)
+			usb_kill_urb(devpriv->pwm_urb);
+	}
 
-	dev_dbg(&this_usbduxsub->interface->dev,
-		"comedi %d: sending pwm off command to the usb device.\n",
-		dev->minor);
-	res = send_dux_commands(this_usbduxsub, SENDPWMOFF);
-	if (res < 0)
-		return res;
-
-	return res;
+	devpriv->pwm_cmd_running = 0;
 }
 
-static void usbduxsub_pwm_irq(struct urb *urb)
+static int usbduxsigma_pwm_cancel(struct comedi_device *dev,
+				  struct comedi_subdevice *s)
 {
+	struct usbduxsigma_private *devpriv = dev->private;
+
+	/* unlink only if it is really running */
+	usbduxsigma_pwm_stop(dev, devpriv->pwm_cmd_running);
+
+	return usbbuxsigma_send_cmd(dev, USBDUXSIGMA_PWM_OFF_CMD);
+}
+
+/*
+ * Completion handler of the PWM urb.
+ *
+ * On an unlink/shutdown status or any other error status the PWM is
+ * flagged as stopped (without killing the urb again) and the handler
+ * returns. Otherwise, while pwm_cmd_running is set, the same urb is
+ * resubmitted so the pattern buffer keeps being sent.
+ */
+static void usbduxsigma_pwm_urb_complete(struct urb *urb)
+{
+	struct comedi_device *dev = urb->context;
+	struct usbduxsigma_private *devpriv = dev->private;
 	int ret;
-	struct usbduxsub *this_usbduxsub;
-	struct comedi_device *this_comedidev;
-	struct comedi_subdevice *s;
-
-	/* printk(KERN_DEBUG "PWM: IRQ\n"); */
-
-	/* the context variable points to the subdevice */
-	this_comedidev = urb->context;
-	/* the private structure of the subdevice is struct usbduxsub */
-	this_usbduxsub = this_comedidev->private;
-
-	s = &this_comedidev->subdevices[SUBDEV_DA];
 
 	switch (urb->status) {
 	case 0:
@@ -1854,260 +1154,180 @@
 	case -ENOENT:
 	case -ESHUTDOWN:
 	case -ECONNABORTED:
-		/*
-		 * after an unlink command, unplug, ... etc
-		 * no unlink needed here. Already shutting down.
-		 */
-		if (this_usbduxsub->pwm_cmd_running)
-			usbdux_pwm_stop(this_usbduxsub, 0);
-
+		/* happens after an unlink command */
+		if (devpriv->pwm_cmd_running)
+			usbduxsigma_pwm_stop(dev, 0);	/* w/o unlink */
 		return;
 
 	default:
 		/* a real error */
-		if (this_usbduxsub->pwm_cmd_running) {
-			dev_err(&this_usbduxsub->interface->dev,
-				"comedi_: Non-zero urb status received in "
-				"pwm intr context: %d\n", urb->status);
-			usbdux_pwm_stop(this_usbduxsub, 0);
+		if (devpriv->pwm_cmd_running) {
+			dev_err(dev->class_dev,
+				"%s: non-zero urb status (%d)\n",
+				__func__, urb->status);
+			usbduxsigma_pwm_stop(dev, 0);	/* w/o unlink */
 		}
 		return;
 	}
 
-	/* are we actually running? */
-	if (!(this_usbduxsub->pwm_cmd_running))
+	if (!devpriv->pwm_cmd_running)
 		return;
 
-	urb->transfer_buffer_length = this_usbduxsub->sizePwmBuf;
-	urb->dev = this_usbduxsub->usbdev;
+	urb->transfer_buffer_length = devpriv->pwm_buf_sz;
+	urb->dev = comedi_to_usb_dev(dev);
 	urb->status = 0;
-	if (this_usbduxsub->pwm_cmd_running) {
-		ret = usb_submit_urb(urb, GFP_ATOMIC);
-		if (ret < 0) {
-			dev_err(&this_usbduxsub->interface->dev,
-				"comedi_: pwm urb resubm failed in int-cont. "
-				"ret=%d", ret);
-			if (ret == EL2NSYNC)
-				dev_err(&this_usbduxsub->interface->dev,
-					"buggy USB host controller or bug in "
-					"IRQ handling!\n");
-
-			/* don't do an unlink here */
-			usbdux_pwm_stop(this_usbduxsub, 0);
-		}
+	ret = usb_submit_urb(urb, GFP_ATOMIC);
+	if (ret < 0) {
+		dev_err(dev->class_dev, "%s: urb resubmit failed (%d)\n",
+			__func__, ret);
+		/* ret is negative here, so compare against -EL2NSYNC */
+		if (ret == -EL2NSYNC)
+			dev_err(dev->class_dev,
+				"buggy USB host controller or bug in IRQ handler\n");
+		usbduxsigma_pwm_stop(dev, 0);	/* w/o unlink */
 	}
 }
 
-static int usbduxsub_submit_PwmURBs(struct usbduxsub *usbduxsub)
+/*
+ * (Re)initialise the PWM bulk-out urb and submit it.
+ *
+ * The urb keeps its existing transfer buffer; length, endpoint,
+ * completion handler and context (the comedi device) are set again on
+ * every call.
+ *
+ * Returns the result of usb_submit_urb().
+ */
+static int usbduxsigma_submit_pwm_urb(struct comedi_device *dev)
 {
-	int errFlag;
-
-	if (!usbduxsub)
-		return -EFAULT;
-
-	dev_dbg(&usbduxsub->interface->dev, "comedi_: submitting pwm-urb\n");
+	struct usb_device *usb = comedi_to_usb_dev(dev);
+	struct usbduxsigma_private *devpriv = dev->private;
+	struct urb *urb = devpriv->pwm_urb;
 
 	/* in case of a resubmission after an unlink... */
-	usb_fill_bulk_urb(usbduxsub->urbPwm,
-			  usbduxsub->usbdev,
-			  usb_sndbulkpipe(usbduxsub->usbdev, PWM_EP),
-			  usbduxsub->urbPwm->transfer_buffer,
-			  usbduxsub->sizePwmBuf, usbduxsub_pwm_irq,
-			  usbduxsub->comedidev);
+	usb_fill_bulk_urb(urb,
+			  usb, usb_sndbulkpipe(usb, USBDUXSIGMA_PWM_OUT_EP),
+			  urb->transfer_buffer, devpriv->pwm_buf_sz,
+			  usbduxsigma_pwm_urb_complete, dev);
 
-	errFlag = usb_submit_urb(usbduxsub->urbPwm, GFP_ATOMIC);
-	if (errFlag) {
-		dev_err(&usbduxsub->interface->dev,
-			"comedi_: usbduxsigma: pwm: usb_submit_urb error %d\n",
-			errFlag);
-		return errFlag;
-	}
-	return 0;
+	return usb_submit_urb(urb, GFP_ATOMIC);
 }
 
-static int usbdux_pwm_period(struct comedi_device *dev,
-			     struct comedi_subdevice *s, unsigned int period)
+/*
+ * Validate a requested PWM period and store it, together with the
+ * delay value derived from it, in the private data.
+ *
+ * Returns 0 on success, or -EAGAIN when the period is below
+ * MIN_PWM_PERIOD or the derived delay exceeds 255.
+ */
+static int usbduxsigma_pwm_period(struct comedi_device *dev,
+				  struct comedi_subdevice *s,
+				  unsigned int period)
 {
-	struct usbduxsub *this_usbduxsub = dev->private;
+	struct usbduxsigma_private *devpriv = dev->private;
 	int fx2delay = 255;
 
 	if (period < MIN_PWM_PERIOD) {
-		dev_err(&this_usbduxsub->interface->dev,
-			"comedi%d: illegal period setting for pwm.\n",
-			dev->minor);
 		return -EAGAIN;
 	} else {
-		fx2delay = period / ((int)(6 * 512 * (1.0 / 0.033))) - 6;
-		if (fx2delay > 255) {
-			dev_err(&this_usbduxsub->interface->dev,
-				"comedi%d: period %d for pwm is too low.\n",
-				dev->minor, period);
+		/* integer-only form of the divisor 6 * 512 * (1.0 / 0.033) */
+		fx2delay = (period / (6 * 512 * 1000 / 33)) - 6;
+		if (fx2delay > 255)
 			return -EAGAIN;
-		}
 	}
-	this_usbduxsub->pwmDelay = fx2delay;
-	this_usbduxsub->pwmPeriod = period;
-	dev_dbg(&this_usbduxsub->interface->dev, "%s: frequ=%d, period=%d\n",
-		__func__, period, fx2delay);
+	devpriv->pwm_delay = fx2delay;
+	devpriv->pwm_period = period;
 	return 0;
 }
 
-/* is called from insn so there's no need to do all the sanity checks */
-static int usbdux_pwm_start(struct comedi_device *dev,
-			    struct comedi_subdevice *s)
+/*
+ * Start the PWM output.
+ *
+ * Sends the PWM-on command (with the stored delay value) to the device,
+ * clears the pattern buffer and submits the PWM urb. Does nothing when
+ * the PWM is already flagged as running.
+ *
+ * Returns 0 on success or a negative error code from the command
+ * transfer or the urb submission.
+ */
+static int usbduxsigma_pwm_start(struct comedi_device *dev,
+				 struct comedi_subdevice *s)
 {
-	int ret, i;
-	struct usbduxsub *this_usbduxsub = dev->private;
+	struct usbduxsigma_private *devpriv = dev->private;
+	int ret;
 
-	dev_dbg(&this_usbduxsub->interface->dev, "comedi%d: %s\n",
-		dev->minor, __func__);
-
-	if (this_usbduxsub->pwm_cmd_running) {
-		/* already running */
+	if (devpriv->pwm_cmd_running)
 		return 0;
-	}
 
-	this_usbduxsub->dux_commands[1] = ((uint8_t) this_usbduxsub->pwmDelay);
-	ret = send_dux_commands(this_usbduxsub, SENDPWMON);
+	devpriv->dux_commands[1] = devpriv->pwm_delay;
+	ret = usbbuxsigma_send_cmd(dev, USBDUXSIGMA_PWM_ON_CMD);
 	if (ret < 0)
 		return ret;
 
-	/* initialise the buffer */
-	for (i = 0; i < this_usbduxsub->sizePwmBuf; i++)
-		((char *)(this_usbduxsub->urbPwm->transfer_buffer))[i] = 0;
+	memset(devpriv->pwm_urb->transfer_buffer, 0, devpriv->pwm_buf_sz);
 
-	this_usbduxsub->pwm_cmd_running = 1;
-	ret = usbduxsub_submit_PwmURBs(this_usbduxsub);
+	/*
+	 * Set the running flag before the urb is submitted: the completion
+	 * handler only resubmits while the flag is set, so it must already
+	 * be visible if the urb completes right away.
+	 */
+	devpriv->pwm_cmd_running = 1;
+	ret = usbduxsigma_submit_pwm_urb(dev);
 	if (ret < 0) {
-		this_usbduxsub->pwm_cmd_running = 0;
+		devpriv->pwm_cmd_running = 0;
 		return ret;
 	}
+
 	return 0;
 }
 
-/* generates the bit pattern for PWM with the optional sign bit */
-static int usbdux_pwm_pattern(struct comedi_device *dev,
-			      struct comedi_subdevice *s, int channel,
-			      unsigned int value, unsigned int sign)
+/*
+ * Write the PWM bit pattern for one channel into the PWM urb buffer.
+ *
+ * For every byte of the buffer the channel's data bit is set while the
+ * byte index is below @value and cleared otherwise; the channel's sign
+ * bit is set when @sign is non-zero and cleared otherwise. Bits of
+ * other channels are left untouched.
+ *
+ * Always returns 1.
+ */
+static int usbduxsigma_pwm_pattern(struct comedi_device *dev,
+				   struct comedi_subdevice *s,
+				   unsigned int chan,
+				   unsigned int value,
+				   unsigned int sign)
 {
-	struct usbduxsub *this_usbduxsub = dev->private;
-	int i, szbuf;
-	char *pBuf;
-	char pwm_mask;
-	char sgn_mask;
-	char c;
+	struct usbduxsigma_private *devpriv = dev->private;
+	char pwm_mask = (1 << chan);	/* DIO bit for the PWM data */
+	char sgn_mask = (16 << chan);	/* DIO bit for the sign */
+	char *buf = (char *)(devpriv->pwm_urb->transfer_buffer);
+	int szbuf = devpriv->pwm_buf_sz;
+	int i;
 
-	if (!this_usbduxsub)
-		return -EFAULT;
-
-	/* this is the DIO bit which carries the PWM data */
-	pwm_mask = (1 << channel);
-	/* this is the DIO bit which carries the optional direction bit */
-	sgn_mask = (16 << channel);
-	/* this is the buffer which will be filled with the with bit */
-	/* pattern for one period */
-	szbuf = this_usbduxsub->sizePwmBuf;
-	pBuf = (char *)(this_usbduxsub->urbPwm->transfer_buffer);
 	for (i = 0; i < szbuf; i++) {
-		c = *pBuf;
-		/* reset bits */
-		c = c & (~pwm_mask);
-		/* set the bit as long as the index is lower than the value */
+		char c = *buf;
+
+		c &= ~pwm_mask;	/* start from a cleared data bit */
 		if (i < value)
-			c = c | pwm_mask;
-		/* set the optional sign bit for a relay */
-		if (!sign) {
-			/* positive value */
-			c = c & (~sgn_mask);
-		} else {
-			/* negative value */
-			c = c | sgn_mask;
-		}
-		*(pBuf++) = c;
+			c |= pwm_mask;
+		if (!sign)
+			c &= ~sgn_mask;
+		else
+			c |= sgn_mask;
+		*buf++ = c;
 	}
 	return 1;
 }
 
-static int usbdux_pwm_write(struct comedi_device *dev,
-			    struct comedi_subdevice *s,
-			    struct comedi_insn *insn, unsigned int *data)
+/*
+ * insn_write handler of the PWM subdevice: store data[0] as the PWM
+ * value of the addressed channel, with the sign cleared.
+ *
+ * Returns -EINVAL unless exactly one sample is written, otherwise the
+ * result of usbduxsigma_pwm_pattern().
+ */
+static int usbduxsigma_pwm_write(struct comedi_device *dev,
+				 struct comedi_subdevice *s,
+				 struct comedi_insn *insn,
+				 unsigned int *data)
 {
-	struct usbduxsub *this_usbduxsub = dev->private;
-
-	if (!this_usbduxsub)
-		return -EFAULT;
-
-	if ((insn->n) != 1) {
-		/*
-		 * doesn't make sense to have more than one value here because
-		 * it would just overwrite the PWM buffer a couple of times
-		 */
-		return -EINVAL;
-	}
+	unsigned int chan = CR_CHAN(insn->chanspec);
 
 	/*
-	 * the sign is set via a special INSN only, this gives us 8 bits for
-	 * normal operation
-	 * relay sign 0 by default
+	 * It doesn't make sense to support more than one value here
+	 * because it would just overwrite the PWM buffer.
 	 */
-	return usbdux_pwm_pattern(dev, s, CR_CHAN(insn->chanspec), data[0], 0);
+	if (insn->n != 1)
+		return -EINVAL;
+
+	/*
+	 * The sign is set via a special INSN only, this gives us 8 bits
+	 * for normal operation, sign is 0 by default.
+	 */
+	return usbduxsigma_pwm_pattern(dev, s, chan, data[0], 0);
 }
 
-static int usbdux_pwm_read(struct comedi_device *x1,
-			   struct comedi_subdevice *x2, struct comedi_insn *x3,
-			   unsigned int *x4)
+/*
+ * insn_config handler of the PWM subdevice, dispatching on data[0]:
+ * arm/disarm the PWM, report the running state, set/get the period and
+ * set the H-bridge value and sign of a channel.
+ *
+ * Returns -EINVAL for requests that are not handled.
+ */
+static int usbduxsigma_pwm_config(struct comedi_device *dev,
+				  struct comedi_subdevice *s,
+				  struct comedi_insn *insn,
+				  unsigned int *data)
 {
-	/* not needed */
-	return -EINVAL;
-};
+	struct usbduxsigma_private *devpriv = dev->private;
+	unsigned int chan = CR_CHAN(insn->chanspec);
 
-/* switches on/off PWM */
-static int usbdux_pwm_config(struct comedi_device *dev,
-			     struct comedi_subdevice *s,
-			     struct comedi_insn *insn, unsigned int *data)
-{
-	struct usbduxsub *this_usbduxsub = dev->private;
 	switch (data[0]) {
 	case INSN_CONFIG_ARM:
-		/* switch it on */
-		dev_dbg(&this_usbduxsub->interface->dev,
-			"comedi%d: %s: pwm on\n", dev->minor, __func__);
 		/*
 		 * if not zero the PWM is limited to a certain time which is
 		 * not supported here
 		 */
 		if (data[1] != 0)
 			return -EINVAL;
-		return usbdux_pwm_start(dev, s);
+		return usbduxsigma_pwm_start(dev, s);
 	case INSN_CONFIG_DISARM:
-		dev_dbg(&this_usbduxsub->interface->dev,
-			"comedi%d: %s: pwm off\n", dev->minor, __func__);
-		return usbdux_pwm_cancel(dev, s);
+		return usbduxsigma_pwm_cancel(dev, s);
 	case INSN_CONFIG_GET_PWM_STATUS:
-		/*
-		 * to check if the USB transmission has failed or in case PWM
-		 * was limited to n cycles to check if it has terminated
-		 */
-		data[1] = this_usbduxsub->pwm_cmd_running;
+		data[1] = devpriv->pwm_cmd_running;	/* cleared again on urb errors */
 		return 0;
 	case INSN_CONFIG_PWM_SET_PERIOD:
-		dev_dbg(&this_usbduxsub->interface->dev,
-			"comedi%d: %s: setting period\n", dev->minor,
-			__func__);
-		return usbdux_pwm_period(dev, s, data[1]);
+		return usbduxsigma_pwm_period(dev, s, data[1]);
 	case INSN_CONFIG_PWM_GET_PERIOD:
-		data[1] = this_usbduxsub->pwmPeriod;
+		data[1] = devpriv->pwm_period;
 		return 0;
 	case INSN_CONFIG_PWM_SET_H_BRIDGE:
-		/* value in the first byte and the sign in the second for a
-		   relay */
-		return usbdux_pwm_pattern(dev, s,
-					  /* the channel number */
-					  CR_CHAN(insn->chanspec),
-					  /* actual PWM data */
-					  data[1],
-					  /* just a sign */
-					  (data[2] != 0));
+		/*
+		 * data[1] = value
+		 * data[2] = sign (for a relay)
+		 */
+		return usbduxsigma_pwm_pattern(dev, s, chan,
+					       data[1], (data[2] != 0));
 	case INSN_CONFIG_PWM_GET_H_BRIDGE:
 		/* values are not kept in this driver, nothing to return */
 		return -EINVAL;
@@ -2115,233 +1335,399 @@
 	return -EINVAL;
 }
 
-/* end of PWM */
-/*****************************************************************/
-
-static void tidy_up(struct usbduxsub *usbduxsub_tmp)
+/*
+ * Read one of the A/D converter's system values with a single
+ * conversion.
+ *
+ * @chan selects the value: 0 ADC zero, 1 ADC offset, 2 VCC,
+ * 3 temperature, 4 gain, 5 ref; any other value is treated like 0.
+ *
+ * Returns the 24 bit reading converted to unsigned (status byte
+ * stripped), or a negative error code if the command transfer fails.
+ */
+static int usbduxsigma_getstatusinfo(struct comedi_device *dev, int chan)
 {
-	int i;
+	struct usbduxsigma_private *devpriv = dev->private;
+	uint8_t sysred;
+	__be32 raw;
+	uint32_t val;
+	int ret;
 
-	if (!usbduxsub_tmp)
-		return;
-	dev_dbg(&usbduxsub_tmp->interface->dev, "comedi_: tiding up\n");
-
-	/* shows the usb subsystem that the driver is down */
-	if (usbduxsub_tmp->interface)
-		usb_set_intfdata(usbduxsub_tmp->interface, NULL);
-
-	usbduxsub_tmp->probed = 0;
-
-	if (usbduxsub_tmp->urbIn) {
-		if (usbduxsub_tmp->ai_cmd_running) {
-			usbduxsub_tmp->ai_cmd_running = 0;
-			usbduxsub_unlink_InURBs(usbduxsub_tmp);
-		}
-		for (i = 0; i < usbduxsub_tmp->numOfInBuffers; i++) {
-			kfree(usbduxsub_tmp->urbIn[i]->transfer_buffer);
-			usbduxsub_tmp->urbIn[i]->transfer_buffer = NULL;
-			usb_kill_urb(usbduxsub_tmp->urbIn[i]);
-			usb_free_urb(usbduxsub_tmp->urbIn[i]);
-			usbduxsub_tmp->urbIn[i] = NULL;
-		}
-		kfree(usbduxsub_tmp->urbIn);
-		usbduxsub_tmp->urbIn = NULL;
+	switch (chan) {
+	default:
+	case 0:
+		sysred = 0;		/* ADC zero */
+		break;
+	case 1:
+		sysred = 1;		/* ADC offset */
+		break;
+	case 2:
+		sysred = 4;		/* VCC */
+		break;
+	case 3:
+		sysred = 8;		/* temperature */
+		break;
+	case 4:
+		sysred = 16;		/* gain */
+		break;
+	case 5:
+		sysred = 32;		/* ref */
+		break;
 	}
-	if (usbduxsub_tmp->urbOut) {
-		if (usbduxsub_tmp->ao_cmd_running) {
-			usbduxsub_tmp->ao_cmd_running = 0;
-			usbduxsub_unlink_OutURBs(usbduxsub_tmp);
-		}
-		for (i = 0; i < usbduxsub_tmp->numOfOutBuffers; i++) {
-			if (usbduxsub_tmp->urbOut[i]->transfer_buffer) {
-				kfree(usbduxsub_tmp->
-				      urbOut[i]->transfer_buffer);
-				usbduxsub_tmp->urbOut[i]->transfer_buffer =
-				    NULL;
-			}
-			if (usbduxsub_tmp->urbOut[i]) {
-				usb_kill_urb(usbduxsub_tmp->urbOut[i]);
-				usb_free_urb(usbduxsub_tmp->urbOut[i]);
-				usbduxsub_tmp->urbOut[i] = NULL;
-			}
-		}
-		kfree(usbduxsub_tmp->urbOut);
-		usbduxsub_tmp->urbOut = NULL;
-	}
-	if (usbduxsub_tmp->urbPwm) {
-		if (usbduxsub_tmp->pwm_cmd_running) {
-			usbduxsub_tmp->pwm_cmd_running = 0;
-			usbduxsub_unlink_PwmURBs(usbduxsub_tmp);
-		}
-		kfree(usbduxsub_tmp->urbPwm->transfer_buffer);
-		usbduxsub_tmp->urbPwm->transfer_buffer = NULL;
-		usb_kill_urb(usbduxsub_tmp->urbPwm);
-		usb_free_urb(usbduxsub_tmp->urbPwm);
-		usbduxsub_tmp->urbPwm = NULL;
-	}
-	kfree(usbduxsub_tmp->inBuffer);
-	usbduxsub_tmp->inBuffer = NULL;
-	kfree(usbduxsub_tmp->insnBuffer);
-	usbduxsub_tmp->insnBuffer = NULL;
-	kfree(usbduxsub_tmp->outBuffer);
-	usbduxsub_tmp->outBuffer = NULL;
-	kfree(usbduxsub_tmp->dac_commands);
-	usbduxsub_tmp->dac_commands = NULL;
-	kfree(usbduxsub_tmp->dux_commands);
-	usbduxsub_tmp->dux_commands = NULL;
-	usbduxsub_tmp->ai_cmd_running = 0;
-	usbduxsub_tmp->ao_cmd_running = 0;
-	usbduxsub_tmp->pwm_cmd_running = 0;
+
+	devpriv->dux_commands[1] = 0x12; /* CONFIG0 */
+	devpriv->dux_commands[2] = 0x80; /* CONFIG1: 2kHz sampling rate */
+	devpriv->dux_commands[3] = 0x00; /* CONFIG3: diff. channels off */
+	devpriv->dux_commands[4] = 0;
+	devpriv->dux_commands[5] = 0;
+	devpriv->dux_commands[6] = sysred;
+	ret = usbbuxsigma_send_cmd(dev, USBDUXSIGMA_SINGLE_AD_CMD);
+	if (ret < 0)
+		return ret;
+
+	ret = usbduxsigma_receive_cmd(dev, USBDUXSIGMA_SINGLE_AD_CMD);
+	if (ret < 0)
+		return ret;
+
+	/*
+	 * 32 bits big endian from the A/D converter. The value does not
+	 * sit at the start of insn_buf, so copy it out instead of
+	 * dereferencing a cast (potentially unaligned) pointer.
+	 */
+	memcpy(&raw, devpriv->insn_buf + 1, sizeof(raw));
+	val = be32_to_cpu(raw);
+	val &= 0x00ffffff;	/* strip status byte */
+	val ^= 0x00800000;	/* convert to unsigned */
+
+	return (int)val;
 }
 
-static int usbduxsigma_attach_common(struct comedi_device *dev,
-				     struct usbduxsub *uds)
+/*
+ * Set up the comedi subdevices (analog in, analog out, digital I/O and,
+ * on high speed devices only, PWM) and check that the device answers.
+ *
+ * Returns 0 on success, the error of comedi_alloc_subdevices(), or the
+ * negative error of the initial status read when the device does not
+ * respond.
+ */
+static int usbduxsigma_attach_common(struct comedi_device *dev)
 {
-	int ret;
+	struct usbduxsigma_private *devpriv = dev->private;
 	struct comedi_subdevice *s;
 	int n_subdevs;
 	int offset;
+	int ret;
 
-	down(&uds->sem);
-	/* pointer back to the corresponding comedi device */
-	uds->comedidev = dev;
+	down(&devpriv->sem);
 
-	/* set number of subdevices */
-	if (uds->high_speed)
+	if (devpriv->high_speed)
 		n_subdevs = 4;	/* with pwm */
 	else
 		n_subdevs = 3;	/* without pwm */
 	ret = comedi_alloc_subdevices(dev, n_subdevs);
 	if (ret) {
-		up(&uds->sem);
+		up(&devpriv->sem);
 		return ret;
 	}
-	/* private structure is also simply the usb-structure */
-	dev->private = uds;
-	/* the first subdevice is the A/D converter */
-	s = &dev->subdevices[SUBDEV_AD];
-	/* the URBs get the comedi subdevice */
-	/* which is responsible for reading */
-	/* this is the subdevice which reads data */
+
+	/* Analog Input subdevice */
+	s = &dev->subdevices[0];
 	dev->read_subdev = s;
-	/* the subdevice receives as private structure the */
-	/* usb-structure */
-	s->private = NULL;
-	/* analog input */
-	s->type = COMEDI_SUBD_AI;
-	/* readable and ref is to ground, 32 bit wide data! */
-	s->subdev_flags = SDF_READABLE | SDF_GROUND |
-		SDF_CMD_READ | SDF_LSAMPL;
-	/* 16 A/D channels */
-	s->n_chan = NUMCHANNELS;
-	/* length of the channellist */
-	s->len_chanlist = NUMCHANNELS;
-	/* callback functions */
-	s->insn_read = usbdux_ai_insn_read;
-	s->do_cmdtest = usbdux_ai_cmdtest;
-	s->do_cmd = usbdux_ai_cmd;
-	s->cancel = usbdux_ai_cancel;
-	/* max value from the A/D converter (24bit) */
-	s->maxdata = 0x00FFFFFF;
-	/* range table to convert to physical units */
-	s->range_table = (&range_usbdux_ai_range);
-	/* analog output subdevice */
-	s = &dev->subdevices[SUBDEV_DA];
-	/* analog out */
-	s->type = COMEDI_SUBD_AO;
-	/* backward pointer */
+	s->type		= COMEDI_SUBD_AI;
+	s->subdev_flags	= SDF_READABLE | SDF_GROUND | SDF_CMD_READ | SDF_LSAMPL;
+	s->n_chan	= NUMCHANNELS;
+	s->len_chanlist	= NUMCHANNELS;
+	s->maxdata	= 0x00ffffff;
+	s->range_table	= &usbduxsigma_ai_range;
+	s->insn_read	= usbduxsigma_ai_insn_read;
+	s->do_cmdtest	= usbduxsigma_ai_cmdtest;
+	s->do_cmd	= usbduxsigma_ai_cmd;
+	s->cancel	= usbduxsigma_ai_cancel;
+
+	/* Analog Output subdevice */
+	s = &dev->subdevices[1];
 	dev->write_subdev = s;
-	/* the subdevice receives as private structure the */
-	/* usb-structure */
-	s->private = NULL;
-	/* are writable */
-	s->subdev_flags = SDF_WRITABLE | SDF_GROUND | SDF_CMD_WRITE;
-	/* 4 channels */
-	s->n_chan = 4;
-	/* length of the channellist */
-	s->len_chanlist = 4;
-	/* 8 bit resolution */
-	s->maxdata = 0x00ff;
-	/* unipolar range */
-	s->range_table = &range_unipolar2_5;
-	/* callback */
-	s->do_cmdtest = usbdux_ao_cmdtest;
-	s->do_cmd = usbdux_ao_cmd;
-	s->cancel = usbdux_ao_cancel;
-	s->insn_read = usbdux_ao_insn_read;
-	s->insn_write = usbdux_ao_insn_write;
-	/* digital I/O subdevice */
-	s = &dev->subdevices[SUBDEV_DIO];
-	s->type = COMEDI_SUBD_DIO;
-	s->subdev_flags = SDF_READABLE | SDF_WRITABLE;
-	/* 8 external and 16 internal channels */
-	s->n_chan = 24;
-	s->maxdata = 1;
-	s->range_table = (&range_digital);
-	s->insn_bits = usbdux_dio_insn_bits;
-	s->insn_config = usbdux_dio_insn_config;
-	/* we don't use it */
-	s->private = NULL;
-	if (uds->high_speed) {
-		/* timer / pwm subdevice */
-		s = &dev->subdevices[SUBDEV_PWM];
-		s->type = COMEDI_SUBD_PWM;
-		s->subdev_flags = SDF_WRITABLE | SDF_PWM_HBRIDGE;
-		s->n_chan = 8;
-		/* this defines the max duty cycle resolution */
-		s->maxdata = uds->sizePwmBuf;
-		s->insn_write = usbdux_pwm_write;
-		s->insn_read = usbdux_pwm_read;
-		s->insn_config = usbdux_pwm_config;
-		usbdux_pwm_period(dev, s, PWM_DEFAULT_PERIOD);
+	s->type		= COMEDI_SUBD_AO;
+	s->subdev_flags	= SDF_WRITABLE | SDF_GROUND | SDF_CMD_WRITE;
+	s->n_chan	= USBDUXSIGMA_NUM_AO_CHAN;
+	s->len_chanlist	= s->n_chan;
+	s->maxdata	= 0x00ff;
+	s->range_table	= &range_unipolar2_5;
+	s->insn_write	= usbduxsigma_ao_insn_write;
+	s->insn_read	= usbduxsigma_ao_insn_read;
+	s->do_cmdtest	= usbduxsigma_ao_cmdtest;
+	s->do_cmd	= usbduxsigma_ao_cmd;
+	s->cancel	= usbduxsigma_ao_cancel;
+
+	/* Digital I/O subdevice */
+	s = &dev->subdevices[2];
+	s->type		= COMEDI_SUBD_DIO;
+	s->subdev_flags	= SDF_READABLE | SDF_WRITABLE;
+	s->n_chan	= 24;
+	s->maxdata	= 1;
+	s->range_table	= &range_digital;
+	s->insn_bits	= usbduxsigma_dio_insn_bits;
+	s->insn_config	= usbduxsigma_dio_insn_config;
+
+	if (devpriv->high_speed) {
+		/* Timer / pwm subdevice */
+		s = &dev->subdevices[3];
+		s->type		= COMEDI_SUBD_PWM;
+		s->subdev_flags	= SDF_WRITABLE | SDF_PWM_HBRIDGE;
+		s->n_chan	= 8;
+		s->maxdata	= devpriv->pwm_buf_sz;
+		s->insn_write	= usbduxsigma_pwm_write;
+		s->insn_config	= usbduxsigma_pwm_config;
+
+		usbduxsigma_pwm_period(dev, s, PWM_DEFAULT_PERIOD);
 	}
-	/* finally decide that it's attached */
-	uds->attached = 1;
-	up(&uds->sem);
-	offset = usbdux_getstatusinfo(dev, 0);
-	if (offset < 0)
-		dev_err(&uds->interface->dev,
-			"Communication to USBDUXSIGMA failed! Check firmware and cabling.");
-	dev_info(&uds->interface->dev,
-		 "comedi%d: attached, ADC_zero = %x\n", dev->minor, offset);
+
+	up(&devpriv->sem);
+
+	/*
+	 * Read the ADC zero value as a communication check. A failure
+	 * means the device cannot be used, so fail the attach instead of
+	 * reporting a negative error code as the ADC zero value.
+	 */
+	offset = usbduxsigma_getstatusinfo(dev, 0);
+	if (offset < 0) {
+		dev_err(dev->class_dev,
+			"Communication to USBDUXSIGMA failed! Check firmware and cabling\n");
+		return offset;
+	}
+
+	dev_info(dev->class_dev, "attached, ADC_zero = %x\n", offset);
+
 	return 0;
 }
 
+/*
+ * Upload a firmware image to the device: stop the running firmware,
+ * transfer the image, then start the new firmware.
+ *
+ * Returns 0 when @data is NULL, -ENOMEM when the image is larger than
+ * FIRMWARE_MAX_LEN or an allocation fails, a negative error code when a
+ * control transfer fails, and otherwise the result of the final
+ * usb_control_msg() call.
+ */
+static int usbduxsigma_firmware_upload(struct comedi_device *dev,
+				       const u8 *data, size_t size,
+				       unsigned long context)
+{
+	struct usb_device *usb = comedi_to_usb_dev(dev);
+	uint8_t *image;
+	uint8_t *cpucs;
+	int ret;
+
+	/* nothing to upload */
+	if (!data)
+		return 0;
+
+	if (size > FIRMWARE_MAX_LEN) {
+		dev_err(dev->class_dev, "firmware binary too large for FX2\n");
+		return -ENOMEM;
+	}
+
+	/* private copy of the firmware image for the transfer */
+	image = kmemdup(data, size, GFP_KERNEL);
+	if (!image)
+		return -ENOMEM;
+
+	/* usb_control_msg() needs a malloc'ed buffer, even for one byte */
+	cpucs = kmalloc(1, GFP_KERNEL);
+	if (!cpucs) {
+		ret = -ENOMEM;
+		goto free_image;
+	}
+
+	/* halt the firmware currently running on the device: 7f92 to one */
+	*cpucs = 1;
+	ret = usb_control_msg(usb, usb_sndctrlpipe(usb, 0),
+			      USBDUXSUB_FIRMWARE,
+			      VENDOR_DIR_OUT,
+			      USBDUXSUB_CPUCS, 0x0000,
+			      cpucs, 1,
+			      BULK_TIMEOUT);
+	if (ret < 0) {
+		dev_err(dev->class_dev, "can not stop firmware\n");
+		goto free_all;
+	}
+
+	/* transfer the new image */
+	ret = usb_control_msg(usb, usb_sndctrlpipe(usb, 0),
+			      USBDUXSUB_FIRMWARE,
+			      VENDOR_DIR_OUT,
+			      0, 0x0000,
+			      image, size,
+			      BULK_TIMEOUT);
+	if (ret < 0) {
+		dev_err(dev->class_dev, "firmware upload failed\n");
+		goto free_all;
+	}
+
+	/* let the new firmware run: 7f92 to zero */
+	*cpucs = 0;
+	ret = usb_control_msg(usb, usb_sndctrlpipe(usb, 0),
+			      USBDUXSUB_FIRMWARE,
+			      VENDOR_DIR_OUT,
+			      USBDUXSUB_CPUCS, 0x0000,
+			      cpucs, 1,
+			      BULK_TIMEOUT);
+	if (ret < 0)
+		dev_err(dev->class_dev, "can not start firmware\n");
+
+free_all:
+	kfree(cpucs);
+free_image:
+	kfree(image);
+	return ret;
+}
+
+/*
+ * Allocate the command/data buffers and all urbs used by the driver.
+ *
+ * The isochronous in/out urbs are allocated and pre-initialised but not
+ * submitted; their context is left NULL and has to be set before they
+ * are submitted. The PWM urb and its buffer only exist on high speed
+ * devices, otherwise pwm_urb is NULL and pwm_buf_sz is 0.
+ *
+ * Returns 0 on success or -ENOMEM. Nothing is freed here on failure;
+ * usbduxsigma_free_usb_buffers() copes with a partially filled state.
+ */
+static int usbduxsigma_alloc_usb_buffers(struct comedi_device *dev)
+{
+	struct usb_device *usb = comedi_to_usb_dev(dev);
+	struct usbduxsigma_private *devpriv = dev->private;
+	struct urb *urb;
+	int i;
+
+	devpriv->dac_commands = kzalloc(NUMOUTCHANNELS, GFP_KERNEL);
+	devpriv->dux_commands = kzalloc(SIZEOFDUXBUFFER, GFP_KERNEL);
+	devpriv->in_buf = kzalloc(SIZEINBUF, GFP_KERNEL);
+	devpriv->insn_buf = kzalloc(SIZEINSNBUF, GFP_KERNEL);
+	/* arrays of urb pointers: size them by the element they hold */
+	devpriv->ai_urbs = kcalloc(devpriv->n_ai_urbs,
+				   sizeof(*devpriv->ai_urbs), GFP_KERNEL);
+	devpriv->ao_urbs = kcalloc(devpriv->n_ao_urbs,
+				   sizeof(*devpriv->ao_urbs), GFP_KERNEL);
+	if (!devpriv->dac_commands || !devpriv->dux_commands ||
+	    !devpriv->in_buf || !devpriv->insn_buf ||
+	    !devpriv->ai_urbs || !devpriv->ao_urbs)
+		return -ENOMEM;
+
+	for (i = 0; i < devpriv->n_ai_urbs; i++) {
+		/* one frame: 1ms */
+		urb = usb_alloc_urb(1, GFP_KERNEL);
+		if (!urb)
+			return -ENOMEM;
+		devpriv->ai_urbs[i] = urb;
+		urb->dev = usb;
+		/* will be filled later with a pointer to the comedi-device */
+		/* and ONLY then the urb should be submitted */
+		urb->context = NULL;
+		urb->pipe = usb_rcvisocpipe(usb, USBDUXSIGMA_ISO_IN_EP);
+		urb->transfer_flags = URB_ISO_ASAP;
+		urb->transfer_buffer = kzalloc(SIZEINBUF, GFP_KERNEL);
+		if (!urb->transfer_buffer)
+			return -ENOMEM;
+		urb->complete = usbduxsigma_ai_urb_complete;
+		urb->number_of_packets = 1;
+		urb->transfer_buffer_length = SIZEINBUF;
+		urb->iso_frame_desc[0].offset = 0;
+		urb->iso_frame_desc[0].length = SIZEINBUF;
+	}
+
+	for (i = 0; i < devpriv->n_ao_urbs; i++) {
+		/* one frame: 1ms */
+		urb = usb_alloc_urb(1, GFP_KERNEL);
+		if (!urb)
+			return -ENOMEM;
+		devpriv->ao_urbs[i] = urb;
+		urb->dev = usb;
+		/* will be filled later with a pointer to the comedi-device */
+		/* and ONLY then the urb should be submitted */
+		urb->context = NULL;
+		urb->pipe = usb_sndisocpipe(usb, USBDUXSIGMA_ISO_OUT_EP);
+		urb->transfer_flags = URB_ISO_ASAP;
+		urb->transfer_buffer = kzalloc(SIZEOUTBUF, GFP_KERNEL);
+		if (!urb->transfer_buffer)
+			return -ENOMEM;
+		urb->complete = usbduxsigma_ao_urb_complete;
+		urb->number_of_packets = 1;
+		urb->transfer_buffer_length = SIZEOUTBUF;
+		urb->iso_frame_desc[0].offset = 0;
+		urb->iso_frame_desc[0].length = SIZEOUTBUF;
+		if (devpriv->high_speed)
+			urb->interval = 8;	/* uframes */
+		else
+			urb->interval = 1;	/* frames */
+	}
+
+	if (devpriv->high_speed) {
+		/* max bulk ep size in high speed */
+		devpriv->pwm_buf_sz = 512;
+		urb = usb_alloc_urb(0, GFP_KERNEL);
+		if (!urb)
+			return -ENOMEM;
+		devpriv->pwm_urb = urb;
+		urb->transfer_buffer = kzalloc(devpriv->pwm_buf_sz, GFP_KERNEL);
+		if (!urb->transfer_buffer)
+			return -ENOMEM;
+	} else {
+		devpriv->pwm_urb = NULL;
+		devpriv->pwm_buf_sz = 0;
+	}
+
+	return 0;
+}
+
+/*
+ * Stop all transfers and release every urb and buffer set up by
+ * usbduxsigma_alloc_usb_buffers(). Entries that were never allocated
+ * (NULL) are skipped.
+ */
+static void usbduxsigma_free_usb_buffers(struct comedi_device *dev)
+{
+	struct usbduxsigma_private *devpriv = dev->private;
+	struct urb **urbs;
+	struct urb *urb;
+	int n;
+
+	/* force unlink all urbs before their memory goes away */
+	usbduxsigma_ai_stop(dev, 1);
+	usbduxsigma_ao_stop(dev, 1);
+	usbduxsigma_pwm_stop(dev, 1);
+
+	/* PWM urb */
+	urb = devpriv->pwm_urb;
+	if (urb) {
+		kfree(urb->transfer_buffer);
+		usb_free_urb(urb);
+	}
+
+	/* analog output urbs and the array holding them */
+	urbs = devpriv->ao_urbs;
+	if (urbs) {
+		for (n = 0; n < devpriv->n_ao_urbs; n++) {
+			urb = urbs[n];
+			if (!urb)
+				continue;
+			kfree(urb->transfer_buffer);
+			usb_free_urb(urb);
+		}
+		kfree(urbs);
+	}
+
+	/* analog input urbs and the array holding them */
+	urbs = devpriv->ai_urbs;
+	if (urbs) {
+		for (n = 0; n < devpriv->n_ai_urbs; n++) {
+			urb = urbs[n];
+			if (!urb)
+				continue;
+			kfree(urb->transfer_buffer);
+			usb_free_urb(urb);
+		}
+		kfree(urbs);
+	}
+
+	/* plain buffers */
+	kfree(devpriv->insn_buf);
+	kfree(devpriv->in_buf);
+	kfree(devpriv->dux_commands);
+	kfree(devpriv->dac_commands);
+}
+
+/*
+ * comedi auto-attach: allocate the private data, select alternate
+ * setting 3, allocate the usb buffers, load the firmware and set up the
+ * subdevices.
+ *
+ * NOTE(review): the error returns below free nothing here; presumably
+ * the comedi core calls the driver's detach on failure - confirm.
+ */
 static int usbduxsigma_auto_attach(struct comedi_device *dev,
 				   unsigned long context_unused)
 {
-	struct usb_interface *uinterf = comedi_to_usb_interface(dev);
+	struct usb_interface *intf = comedi_to_usb_interface(dev);
+	struct usb_device *usb = comedi_to_usb_dev(dev);
+	struct usbduxsigma_private *devpriv;
 	int ret;
-	struct usbduxsub *uds;
 
-	dev->private = NULL;
-	down(&start_stop_sem);
-	uds = usb_get_intfdata(uinterf);
-	if (!uds || !uds->probed) {
+	devpriv = kzalloc(sizeof(*devpriv), GFP_KERNEL);
+	if (!devpriv)
+		return -ENOMEM;
+	dev->private = devpriv;
+
+	sema_init(&devpriv->sem, 1);
+	usb_set_intfdata(intf, devpriv);
+
+	ret = usb_set_interface(usb,
+				intf->altsetting->desc.bInterfaceNumber, 3);
+	if (ret < 0) {
 		dev_err(dev->class_dev,
-			"usbduxsigma: error: auto_attach failed, not connected\n");
-		ret = -ENODEV;
-	} else if (uds->attached) {
-		dev_err(dev->class_dev,
-		       "usbduxsigma: error: auto_attach failed, already attached\n");
-		ret = -ENODEV;
-	} else
-		ret = usbduxsigma_attach_common(dev, uds);
-	up(&start_stop_sem);
-	return ret;
+			"could not set alternate setting 3 in high speed\n");
+		return -ENODEV;
+	}
+
+	/* test if it is high speed (USB 2.0) */
+	devpriv->high_speed = (usb->speed == USB_SPEED_HIGH);
+	if (devpriv->high_speed) {
+		devpriv->n_ai_urbs = NUMOFINBUFFERSHIGH;
+		devpriv->n_ao_urbs = NUMOFOUTBUFFERSHIGH;
+	} else {
+		devpriv->n_ai_urbs = NUMOFINBUFFERSFULL;
+		devpriv->n_ao_urbs = NUMOFOUTBUFFERSFULL;
+	}
+
+	ret = usbduxsigma_alloc_usb_buffers(dev);
+	if (ret)
+		return ret;
+
+	ret = comedi_load_firmware(dev, &usb->dev, FIRMWARE,
+				   usbduxsigma_firmware_upload, 0);
+	if (ret)
+		return ret;
+
+	return usbduxsigma_attach_common(dev);
 }
 
+/*
+ * comedi detach: clear the interface data and, holding the device
+ * semaphore, stop all transfers and free the urbs and buffers. Does
+ * nothing when no private data was allocated.
+ */
 static void usbduxsigma_detach(struct comedi_device *dev)
 {
-	struct usbduxsub *usb = dev->private;
+	struct usb_interface *intf = comedi_to_usb_interface(dev);
+	struct usbduxsigma_private *devpriv = dev->private;
 
-	if (usb) {
-		down(&usb->sem);
-		dev->private = NULL;
-		usb->attached = 0;
-		usb->comedidev = NULL;
-		up(&usb->sem);
-	}
+	if (!devpriv)
+		return;
+
+	usb_set_intfdata(intf, NULL);
+
+	down(&devpriv->sem);
+	usbduxsigma_free_usb_buffers(dev);
+	up(&devpriv->sem);
 }
 
 static struct comedi_driver usbduxsigma_driver = {
@@ -2351,306 +1737,10 @@
 	.detach		= usbduxsigma_detach,
 };
 
-static void usbdux_firmware_request_complete_handler(const struct firmware *fw,
-						     void *context)
-{
-	struct usbduxsub *usbduxsub_tmp = context;
-	struct usb_interface *uinterf = usbduxsub_tmp->interface;
-	int ret;
-
-	if (fw == NULL) {
-		dev_err(&uinterf->dev,
-			"Firmware complete handler without firmware!\n");
-		return;
-	}
-
-	/*
-	 * we need to upload the firmware here because fw will be
-	 * freed once we've left this function
-	 */
-	ret = firmwareUpload(usbduxsub_tmp, fw->data, fw->size);
-
-	if (ret) {
-		dev_err(&uinterf->dev,
-			"Could not upload firmware (err=%d)\n", ret);
-		goto out;
-	}
-	comedi_usb_auto_config(uinterf, &usbduxsigma_driver, 0);
-out:
-	release_firmware(fw);
-}
-
-static int usbduxsigma_usb_probe(struct usb_interface *uinterf,
+static int usbduxsigma_usb_probe(struct usb_interface *intf,
 				 const struct usb_device_id *id)
 {
-	struct usb_device *udev = interface_to_usbdev(uinterf);
-	struct device *dev = &uinterf->dev;
-	int i;
-	int index;
-	int ret;
-
-	dev_dbg(dev, "comedi_: usbdux_: "
-		"finding a free structure for the usb-device\n");
-
-	down(&start_stop_sem);
-	/* look for a free place in the usbdux array */
-	index = -1;
-	for (i = 0; i < NUMUSBDUX; i++) {
-		if (!(usbduxsub[i].probed)) {
-			index = i;
-			break;
-		}
-	}
-
-	/* no more space */
-	if (index == -1) {
-		dev_err(dev, "Too many usbduxsigma-devices connected.\n");
-		up(&start_stop_sem);
-		return -EMFILE;
-	}
-	dev_dbg(dev, "comedi_: usbdux: "
-		"usbduxsub[%d] is ready to connect to comedi.\n", index);
-
-	sema_init(&(usbduxsub[index].sem), 1);
-	/* save a pointer to the usb device */
-	usbduxsub[index].usbdev = udev;
-
-	/* save the interface itself */
-	usbduxsub[index].interface = uinterf;
-	/* get the interface number from the interface */
-	usbduxsub[index].ifnum = uinterf->altsetting->desc.bInterfaceNumber;
-	/* hand the private data over to the usb subsystem */
-	/* will be needed for disconnect */
-	usb_set_intfdata(uinterf, &(usbduxsub[index]));
-
-	dev_dbg(dev, "comedi_: usbdux: ifnum=%d\n", usbduxsub[index].ifnum);
-
-	/* test if it is high speed (USB 2.0) */
-	usbduxsub[index].high_speed =
-	    (usbduxsub[index].usbdev->speed == USB_SPEED_HIGH);
-
-	/* create space for the commands of the DA converter */
-	usbduxsub[index].dac_commands = kzalloc(NUMOUTCHANNELS, GFP_KERNEL);
-	if (!usbduxsub[index].dac_commands) {
-		tidy_up(&(usbduxsub[index]));
-		up(&start_stop_sem);
-		return -ENOMEM;
-	}
-	/* create space for the commands going to the usb device */
-	usbduxsub[index].dux_commands = kzalloc(SIZEOFDUXBUFFER, GFP_KERNEL);
-	if (!usbduxsub[index].dux_commands) {
-		tidy_up(&(usbduxsub[index]));
-		up(&start_stop_sem);
-		return -ENOMEM;
-	}
-	/* create space for the in buffer and set it to zero */
-	usbduxsub[index].inBuffer = kzalloc(SIZEINBUF, GFP_KERNEL);
-	if (!(usbduxsub[index].inBuffer)) {
-		tidy_up(&(usbduxsub[index]));
-		up(&start_stop_sem);
-		return -ENOMEM;
-	}
-	/* create space of the instruction buffer */
-	usbduxsub[index].insnBuffer = kzalloc(SIZEINSNBUF, GFP_KERNEL);
-	if (!(usbduxsub[index].insnBuffer)) {
-		tidy_up(&(usbduxsub[index]));
-		up(&start_stop_sem);
-		return -ENOMEM;
-	}
-	/* create space for the outbuffer */
-	usbduxsub[index].outBuffer = kzalloc(SIZEOUTBUF, GFP_KERNEL);
-	if (!(usbduxsub[index].outBuffer)) {
-		tidy_up(&(usbduxsub[index]));
-		up(&start_stop_sem);
-		return -ENOMEM;
-	}
-	/* setting to alternate setting 3: enabling iso ep and bulk ep. */
-	i = usb_set_interface(usbduxsub[index].usbdev,
-			      usbduxsub[index].ifnum, 3);
-	if (i < 0) {
-		dev_err(dev, "comedi_: usbduxsigma%d: "
-			"could not set alternate setting 3 in high speed.\n",
-			index);
-		tidy_up(&(usbduxsub[index]));
-		up(&start_stop_sem);
-		return -ENODEV;
-	}
-	if (usbduxsub[index].high_speed)
-		usbduxsub[index].numOfInBuffers = NUMOFINBUFFERSHIGH;
-	else
-		usbduxsub[index].numOfInBuffers = NUMOFINBUFFERSFULL;
-
-	usbduxsub[index].urbIn = kcalloc(usbduxsub[index].numOfInBuffers,
-					 sizeof(struct urb *),
-					 GFP_KERNEL);
-	if (!(usbduxsub[index].urbIn)) {
-		tidy_up(&(usbduxsub[index]));
-		up(&start_stop_sem);
-		return -ENOMEM;
-	}
-	for (i = 0; i < usbduxsub[index].numOfInBuffers; i++) {
-		/* one frame: 1ms */
-		usbduxsub[index].urbIn[i] = usb_alloc_urb(1, GFP_KERNEL);
-		if (usbduxsub[index].urbIn[i] == NULL) {
-			dev_err(dev, "comedi_: usbduxsigma%d: "
-				"Could not alloc. urb(%d)\n", index, i);
-			tidy_up(&(usbduxsub[index]));
-			up(&start_stop_sem);
-			return -ENOMEM;
-		}
-		usbduxsub[index].urbIn[i]->dev = usbduxsub[index].usbdev;
-		/* will be filled later with a pointer to the comedi-device */
-		/* and ONLY then the urb should be submitted */
-		usbduxsub[index].urbIn[i]->context = NULL;
-		usbduxsub[index].urbIn[i]->pipe =
-		    usb_rcvisocpipe(usbduxsub[index].usbdev, ISOINEP);
-		usbduxsub[index].urbIn[i]->transfer_flags = URB_ISO_ASAP;
-		usbduxsub[index].urbIn[i]->transfer_buffer =
-		    kzalloc(SIZEINBUF, GFP_KERNEL);
-		if (!(usbduxsub[index].urbIn[i]->transfer_buffer)) {
-			tidy_up(&(usbduxsub[index]));
-			up(&start_stop_sem);
-			return -ENOMEM;
-		}
-		usbduxsub[index].urbIn[i]->complete = usbduxsub_ai_IsocIrq;
-		usbduxsub[index].urbIn[i]->number_of_packets = 1;
-		usbduxsub[index].urbIn[i]->transfer_buffer_length = SIZEINBUF;
-		usbduxsub[index].urbIn[i]->iso_frame_desc[0].offset = 0;
-		usbduxsub[index].urbIn[i]->iso_frame_desc[0].length =
-			SIZEINBUF;
-	}
-
-	/* out */
-	if (usbduxsub[index].high_speed)
-		usbduxsub[index].numOfOutBuffers = NUMOFOUTBUFFERSHIGH;
-	else
-		usbduxsub[index].numOfOutBuffers = NUMOFOUTBUFFERSFULL;
-
-	usbduxsub[index].urbOut = kcalloc(usbduxsub[index].numOfOutBuffers,
-					  sizeof(struct urb *), GFP_KERNEL);
-	if (!(usbduxsub[index].urbOut)) {
-		tidy_up(&(usbduxsub[index]));
-		up(&start_stop_sem);
-		return -ENOMEM;
-	}
-	for (i = 0; i < usbduxsub[index].numOfOutBuffers; i++) {
-		/* one frame: 1ms */
-		usbduxsub[index].urbOut[i] = usb_alloc_urb(1, GFP_KERNEL);
-		if (usbduxsub[index].urbOut[i] == NULL) {
-			dev_err(dev, "comedi_: usbduxsigma%d: "
-				"Could not alloc. urb(%d)\n", index, i);
-			tidy_up(&(usbduxsub[index]));
-			up(&start_stop_sem);
-			return -ENOMEM;
-		}
-		usbduxsub[index].urbOut[i]->dev = usbduxsub[index].usbdev;
-		/* will be filled later with a pointer to the comedi-device */
-		/* and ONLY then the urb should be submitted */
-		usbduxsub[index].urbOut[i]->context = NULL;
-		usbduxsub[index].urbOut[i]->pipe =
-		    usb_sndisocpipe(usbduxsub[index].usbdev, ISOOUTEP);
-		usbduxsub[index].urbOut[i]->transfer_flags = URB_ISO_ASAP;
-		usbduxsub[index].urbOut[i]->transfer_buffer =
-		    kzalloc(SIZEOUTBUF, GFP_KERNEL);
-		if (!(usbduxsub[index].urbOut[i]->transfer_buffer)) {
-			tidy_up(&(usbduxsub[index]));
-			up(&start_stop_sem);
-			return -ENOMEM;
-		}
-		usbduxsub[index].urbOut[i]->complete = usbduxsub_ao_IsocIrq;
-		usbduxsub[index].urbOut[i]->number_of_packets = 1;
-		usbduxsub[index].urbOut[i]->transfer_buffer_length =
-			SIZEOUTBUF;
-		usbduxsub[index].urbOut[i]->iso_frame_desc[0].offset = 0;
-		usbduxsub[index].urbOut[i]->iso_frame_desc[0].length =
-		    SIZEOUTBUF;
-		if (usbduxsub[index].high_speed) {
-			/* uframes */
-			usbduxsub[index].urbOut[i]->interval = 8;
-		} else {
-			/* frames */
-			usbduxsub[index].urbOut[i]->interval = 1;
-		}
-	}
-
-	/* pwm */
-	if (usbduxsub[index].high_speed) {
-		/* max bulk ep size in high speed */
-		usbduxsub[index].sizePwmBuf = 512;
-		usbduxsub[index].urbPwm = usb_alloc_urb(0, GFP_KERNEL);
-		if (usbduxsub[index].urbPwm == NULL) {
-			dev_err(dev, "comedi_: usbduxsigma%d: "
-				"Could not alloc. pwm urb\n", index);
-			tidy_up(&(usbduxsub[index]));
-			up(&start_stop_sem);
-			return -ENOMEM;
-		}
-		usbduxsub[index].urbPwm->transfer_buffer =
-		    kzalloc(usbduxsub[index].sizePwmBuf, GFP_KERNEL);
-		if (!(usbduxsub[index].urbPwm->transfer_buffer)) {
-			tidy_up(&(usbduxsub[index]));
-			up(&start_stop_sem);
-			return -ENOMEM;
-		}
-	} else {
-		usbduxsub[index].urbPwm = NULL;
-		usbduxsub[index].sizePwmBuf = 0;
-	}
-
-	usbduxsub[index].ai_cmd_running = 0;
-	usbduxsub[index].ao_cmd_running = 0;
-	usbduxsub[index].pwm_cmd_running = 0;
-
-	/* we've reached the bottom of the function */
-	usbduxsub[index].probed = 1;
-	up(&start_stop_sem);
-
-	ret = request_firmware_nowait(THIS_MODULE,
-				      FW_ACTION_HOTPLUG,
-				      FIRMWARE,
-				      &udev->dev,
-				      GFP_KERNEL,
-				      usbduxsub + index,
-				      usbdux_firmware_request_complete_handler
-				      );
-
-	if (ret) {
-		dev_err(dev, "Could not load firmware (err=%d)\n", ret);
-		return ret;
-	}
-
-	dev_info(dev, "comedi_: successfully initialised.\n");
-	/* success */
-	return 0;
-}
-
-static void usbduxsigma_usb_disconnect(struct usb_interface *intf)
-{
-	struct usbduxsub *usbduxsub_tmp = usb_get_intfdata(intf);
-	struct usb_device *udev = interface_to_usbdev(intf);
-
-	if (!usbduxsub_tmp) {
-		dev_err(&intf->dev,
-			"comedi_: disconnect called with null pointer.\n");
-		return;
-	}
-	if (usbduxsub_tmp->usbdev != udev) {
-		dev_err(&intf->dev, "comedi_: BUG! wrong ptr!\n");
-		return;
-	}
-	if (usbduxsub_tmp->ai_cmd_running)
-		/* we are still running a command */
-		usbdux_ai_stop(usbduxsub_tmp, 1);
-	if (usbduxsub_tmp->ao_cmd_running)
-		/* we are still running a command */
-		usbdux_ao_stop(usbduxsub_tmp, 1);
-	comedi_usb_auto_unconfig(intf);
-	down(&start_stop_sem);
-	down(&usbduxsub_tmp->sem);
-	tidy_up(usbduxsub_tmp);
-	up(&usbduxsub_tmp->sem);
-	up(&start_stop_sem);
-	dev_info(&intf->dev, "comedi_: disconnected from the usb\n");
+	return comedi_usb_auto_config(intf, &usbduxsigma_driver, 0);
 }
 
 static const struct usb_device_id usbduxsigma_usb_table[] = {
@@ -2664,7 +1754,7 @@
 static struct usb_driver usbduxsigma_usb_driver = {
 	.name		= "usbduxsigma",
 	.probe		= usbduxsigma_usb_probe,
-	.disconnect	= usbduxsigma_usb_disconnect,
+	.disconnect	= comedi_usb_auto_unconfig,
 	.id_table	= usbduxsigma_usb_table,
 };
 module_comedi_usb_driver(usbduxsigma_driver, usbduxsigma_usb_driver);

diff --git a/drivers/staging/comedi/drivers/vmk80xx.c b/drivers/staging/comedi/drivers/vmk80xx.c
index 2be5087..0ab04c0 100644
--- a/drivers/staging/comedi/drivers/vmk80xx.c
+++ b/drivers/staging/comedi/drivers/vmk80xx.c

@@ -16,11 +16,6 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 /*
 Driver: vmk80xx
@@ -159,8 +154,6 @@
 };
 
 struct vmk80xx_private {
-	struct usb_device *usb;
-	struct usb_interface *intf;
 	struct usb_endpoint_descriptor *ep_rx;
 	struct usb_endpoint_descriptor *ep_tx;
 	struct firmware_version fw;
@@ -170,9 +163,10 @@
 	enum vmk80xx_model model;
 };
 
-static int vmk80xx_check_data_link(struct vmk80xx_private *devpriv)
+static int vmk80xx_check_data_link(struct comedi_device *dev)
 {
-	struct usb_device *usb = devpriv->usb;
+	struct vmk80xx_private *devpriv = dev->private;
+	struct usb_device *usb = comedi_to_usb_dev(dev);
 	unsigned int tx_pipe;
 	unsigned int rx_pipe;
 	unsigned char tx[1];
@@ -194,9 +188,10 @@
 	return (int)rx[1];
 }
 
-static void vmk80xx_read_eeprom(struct vmk80xx_private *devpriv, int flag)
+static void vmk80xx_read_eeprom(struct comedi_device *dev, int flag)
 {
-	struct usb_device *usb = devpriv->usb;
+	struct vmk80xx_private *devpriv = dev->private;
+	struct usb_device *usb = comedi_to_usb_dev(dev);
 	unsigned int tx_pipe;
 	unsigned int rx_pipe;
 	unsigned char tx[1];
@@ -223,9 +218,10 @@
 		strncpy(devpriv->fw.ic6_vers, rx + 25, 24);
 }
 
-static void vmk80xx_do_bulk_msg(struct vmk80xx_private *devpriv)
+static void vmk80xx_do_bulk_msg(struct comedi_device *dev)
 {
-	struct usb_device *usb = devpriv->usb;
+	struct vmk80xx_private *devpriv = dev->private;
+	struct usb_device *usb = comedi_to_usb_dev(dev);
 	__u8 tx_addr;
 	__u8 rx_addr;
 	unsigned int tx_pipe;
@@ -248,21 +244,18 @@
 	usb_bulk_msg(usb, rx_pipe, devpriv->usb_rx_buf, size, NULL, HZ * 10);
 }
 
-static int vmk80xx_read_packet(struct vmk80xx_private *devpriv)
+static int vmk80xx_read_packet(struct comedi_device *dev)
 {
-	struct usb_device *usb;
+	struct vmk80xx_private *devpriv = dev->private;
+	struct usb_device *usb = comedi_to_usb_dev(dev);
 	struct usb_endpoint_descriptor *ep;
 	unsigned int pipe;
 
-	if (!devpriv->intf)
-		return -ENODEV;
-
 	if (devpriv->model == VMK8061_MODEL) {
-		vmk80xx_do_bulk_msg(devpriv);
+		vmk80xx_do_bulk_msg(dev);
 		return 0;
 	}
 
-	usb = devpriv->usb;
 	ep = devpriv->ep_rx;
 	pipe = usb_rcvintpipe(usb, ep->bEndpointAddress);
 	return usb_interrupt_msg(usb, pipe, devpriv->usb_rx_buf,
@@ -270,23 +263,20 @@
 				 HZ * 10);
 }
 
-static int vmk80xx_write_packet(struct vmk80xx_private *devpriv, int cmd)
+static int vmk80xx_write_packet(struct comedi_device *dev, int cmd)
 {
-	struct usb_device *usb;
+	struct vmk80xx_private *devpriv = dev->private;
+	struct usb_device *usb = comedi_to_usb_dev(dev);
 	struct usb_endpoint_descriptor *ep;
 	unsigned int pipe;
 
-	if (!devpriv->intf)
-		return -ENODEV;
-
 	devpriv->usb_tx_buf[0] = cmd;
 
 	if (devpriv->model == VMK8061_MODEL) {
-		vmk80xx_do_bulk_msg(devpriv);
+		vmk80xx_do_bulk_msg(dev);
 		return 0;
 	}
 
-	usb = devpriv->usb;
 	ep = devpriv->ep_tx;
 	pipe = usb_sndintpipe(usb, ep->bEndpointAddress);
 	return usb_interrupt_msg(usb, pipe, devpriv->usb_tx_buf,
@@ -294,18 +284,19 @@
 				 HZ * 10);
 }
 
-static int vmk80xx_reset_device(struct vmk80xx_private *devpriv)
+static int vmk80xx_reset_device(struct comedi_device *dev)
 {
+	struct vmk80xx_private *devpriv = dev->private;
 	size_t size;
 	int retval;
 
 	size = le16_to_cpu(devpriv->ep_tx->wMaxPacketSize);
 	memset(devpriv->usb_tx_buf, 0, size);
-	retval = vmk80xx_write_packet(devpriv, VMK8055_CMD_RST);
+	retval = vmk80xx_write_packet(dev, VMK8055_CMD_RST);
 	if (retval)
 		return retval;
 	/* set outputs to known state as we cannot read them */
-	return vmk80xx_write_packet(devpriv, VMK8055_CMD_WRT_AD);
+	return vmk80xx_write_packet(dev, VMK8055_CMD_WRT_AD);
 }
 
 static int vmk80xx_ai_insn_read(struct comedi_device *dev,
@@ -338,7 +329,7 @@
 	}
 
 	for (n = 0; n < insn->n; n++) {
-		if (vmk80xx_read_packet(devpriv))
+		if (vmk80xx_read_packet(dev))
 			break;
 
 		if (devpriv->model == VMK8055_MODEL) {
@@ -388,7 +379,7 @@
 	for (n = 0; n < insn->n; n++) {
 		devpriv->usb_tx_buf[reg] = data[n];
 
-		if (vmk80xx_write_packet(devpriv, cmd))
+		if (vmk80xx_write_packet(dev, cmd))
 			break;
 	}
 
@@ -415,7 +406,7 @@
 	devpriv->usb_tx_buf[0] = VMK8061_CMD_RD_AO;
 
 	for (n = 0; n < insn->n; n++) {
-		if (vmk80xx_read_packet(devpriv))
+		if (vmk80xx_read_packet(dev))
 			break;
 
 		data[n] = devpriv->usb_rx_buf[reg + chan];
@@ -447,7 +438,7 @@
 		reg = VMK8055_DI_REG;
 	}
 
-	retval = vmk80xx_read_packet(devpriv);
+	retval = vmk80xx_read_packet(dev);
 
 	if (!retval) {
 		if (devpriv->model == VMK8055_MODEL)
@@ -492,7 +483,7 @@
 		tx_buf[reg] &= ~data[0];
 		tx_buf[reg] |= (data[0] & data[1]);
 
-		retval = vmk80xx_write_packet(devpriv, cmd);
+		retval = vmk80xx_write_packet(dev, cmd);
 
 		if (retval)
 			goto out;
@@ -501,7 +492,7 @@
 	if (devpriv->model == VMK8061_MODEL) {
 		tx_buf[0] = VMK8061_CMD_RD_DO;
 
-		retval = vmk80xx_read_packet(devpriv);
+		retval = vmk80xx_read_packet(dev);
 
 		if (!retval) {
 			data[1] = rx_buf[reg];
@@ -547,7 +538,7 @@
 	}
 
 	for (n = 0; n < insn->n; n++) {
-		if (vmk80xx_read_packet(devpriv))
+		if (vmk80xx_read_packet(dev))
 			break;
 
 		if (devpriv->model == VMK8055_MODEL)
@@ -597,7 +588,7 @@
 	}
 
 	for (n = 0; n < insn->n; n++)
-		if (vmk80xx_write_packet(devpriv, cmd))
+		if (vmk80xx_write_packet(dev, cmd))
 			break;
 
 	up(&devpriv->limit_sem);
@@ -640,7 +631,7 @@
 
 		devpriv->usb_tx_buf[6 + chan] = val;
 
-		if (vmk80xx_write_packet(devpriv, cmd))
+		if (vmk80xx_write_packet(dev, cmd))
 			break;
 	}
 
@@ -671,7 +662,7 @@
 	tx_buf[0] = VMK8061_CMD_RD_PWM;
 
 	for (n = 0; n < insn->n; n++) {
-		if (vmk80xx_read_packet(devpriv))
+		if (vmk80xx_read_packet(dev))
 			break;
 
 		data[n] = rx_buf[reg[0]] + 4 * rx_buf[reg[1]];
@@ -719,7 +710,7 @@
 		tx_buf[reg[0]] = (unsigned char)(data[n] & 0x03);
 		tx_buf[reg[1]] = (unsigned char)(data[n] >> 2) & 0xff;
 
-		if (vmk80xx_write_packet(devpriv, cmd))
+		if (vmk80xx_write_packet(dev, cmd))
 			break;
 	}
 
@@ -731,7 +722,7 @@
 static int vmk80xx_find_usb_endpoints(struct comedi_device *dev)
 {
 	struct vmk80xx_private *devpriv = dev->private;
-	struct usb_interface *intf = devpriv->intf;
+	struct usb_interface *intf = comedi_to_usb_interface(dev);
 	struct usb_host_interface *iface_desc = intf->cur_altsetting;
 	struct usb_endpoint_descriptor *ep_desc;
 	int i;
@@ -889,8 +880,6 @@
 		return -ENOMEM;
 	dev->private = devpriv;
 
-	devpriv->usb = interface_to_usbdev(intf);
-	devpriv->intf = intf;
 	devpriv->model = boardinfo->model;
 
 	ret = vmk80xx_find_usb_endpoints(dev);
@@ -906,23 +895,24 @@
 	usb_set_intfdata(intf, devpriv);
 
 	if (devpriv->model == VMK8061_MODEL) {
-		vmk80xx_read_eeprom(devpriv, IC3_VERSION);
+		vmk80xx_read_eeprom(dev, IC3_VERSION);
 		dev_info(&intf->dev, "%s\n", devpriv->fw.ic3_vers);
 
-		if (vmk80xx_check_data_link(devpriv)) {
-			vmk80xx_read_eeprom(devpriv, IC6_VERSION);
+		if (vmk80xx_check_data_link(dev)) {
+			vmk80xx_read_eeprom(dev, IC6_VERSION);
 			dev_info(&intf->dev, "%s\n", devpriv->fw.ic6_vers);
 		}
 	}
 
 	if (devpriv->model == VMK8055_MODEL)
-		vmk80xx_reset_device(devpriv);
+		vmk80xx_reset_device(dev);
 
 	return vmk80xx_init_subdevices(dev);
 }
 
 static void vmk80xx_detach(struct comedi_device *dev)
 {
+	struct usb_interface *intf = comedi_to_usb_interface(dev);
 	struct vmk80xx_private *devpriv = dev->private;
 
 	if (!devpriv)
@@ -930,7 +920,7 @@
 
 	down(&devpriv->limit_sem);
 
-	usb_set_intfdata(devpriv->intf, NULL);
+	usb_set_intfdata(intf, NULL);
 
 	kfree(devpriv->usb_rx_buf);
 	kfree(devpriv->usb_tx_buf);

diff --git a/drivers/staging/comedi/kcomedilib/kcomedilib_main.c b/drivers/staging/comedi/kcomedilib/kcomedilib_main.c
index 3231a48..da8988c 100644
--- a/drivers/staging/comedi/kcomedilib/kcomedilib_main.c
+++ b/drivers/staging/comedi/kcomedilib/kcomedilib_main.c

@@ -14,11 +14,6 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 
 #include <linux/module.h>

diff --git a/drivers/staging/comedi/proc.c b/drivers/staging/comedi/proc.c
index 886c202..8ee9442 100644
--- a/drivers/staging/comedi/proc.c
+++ b/drivers/staging/comedi/proc.c

@@ -14,11 +14,6 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 
 /*

diff --git a/drivers/staging/comedi/range.c b/drivers/staging/comedi/range.c
index 1dc391b..1f20332 100644
--- a/drivers/staging/comedi/range.c
+++ b/drivers/staging/comedi/range.c

@@ -14,11 +14,6 @@
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 
 #include <linux/uaccess.h>

diff --git a/drivers/staging/cptm1217/clearpad_tm1217.c b/drivers/staging/cptm1217/clearpad_tm1217.c
index e96eee3..42a5f5c 100644
--- a/drivers/staging/cptm1217/clearpad_tm1217.c
+++ b/drivers/staging/cptm1217/clearpad_tm1217.c

@@ -547,10 +547,8 @@
 fail:
 	/* Clean up before returning failure */
 	for (i = 0; i < TOUCH_SUPPORTED; i++) {
-		if (ts->cp_input_info[i].input) {
+		if (ts->cp_input_info[i].input)
 			input_unregister_device(ts->cp_input_info[i].input);
-			input_free_device(ts->cp_input_info[i].input);
-		}
 	}
 	kfree(ts);
 	return retval;

diff --git a/drivers/staging/crystalhd/bc_dts_glob_lnx.h b/drivers/staging/crystalhd/bc_dts_glob_lnx.h
index fd1a6e6..981708f 100644
--- a/drivers/staging/crystalhd/bc_dts_glob_lnx.h
+++ b/drivers/staging/crystalhd/bc_dts_glob_lnx.h

@@ -58,11 +58,11 @@
  * between the driver and the application.
  */
 enum BC_DTS_GLOBALS {
-	BC_MAX_FW_CMD_BUFF_SZ	= 0x40,		/* FW passthrough cmd/rsp buffer size */
+	BC_MAX_FW_CMD_BUFF_SZ = 0x40, /* FW passthrough cmd/rsp buffer size */
 	PCI_CFG_SIZE		= 256,		/* PCI config size buffer */
 	BC_IOCTL_DATA_POOL_SIZE	= 8,		/* BC_IOCTL_DATA Pool size */
-	BC_LINK_MAX_OPENS	= 3,		/* Maximum simultaneous opens*/
-	BC_LINK_MAX_SGLS	= 1024,		/* Maximum SG elements 4M/4K */
+	BC_LINK_MAX_OPENS	= 3,	/* Maximum simultaneous opens*/
+	BC_LINK_MAX_SGLS	= 1024,	/* Maximum SG elements 4M/4K */
 	BC_TX_LIST_CNT		= 2,		/* Max Tx DMA Rings */
 	BC_RX_LIST_CNT		= 8,		/* Max Rx DMA Rings*/
 	BC_PROC_OUTPUT_TIMEOUT	= 3000,		/* Milliseconds */
@@ -240,11 +240,14 @@
 	DRV_CMD_ADD_RXBUFFS,	/* Add Rx side buffers to driver pool */
 	DRV_CMD_FETCH_RXBUFF,	/* Get Rx DMAed buffer */
 	DRV_CMD_START_RX_CAP,	/* Start Rx Buffer Capture */
-	DRV_CMD_FLUSH_RX_CAP,	/* Stop the capture for now...we will enhance this later*/
+	DRV_CMD_FLUSH_RX_CAP,	/* Stop the capture for now...
+			we will enhance this later*/
 	DRV_CMD_GET_DRV_STAT,	/* Get Driver Internal Statistics */
 	DRV_CMD_RST_DRV_STAT,	/* Reset Driver Internal Statistics */
-	DRV_CMD_NOTIFY_MODE,	/* Notify the Mode to driver in which the application is Operating*/
-	DRV_CMD_CHANGE_CLOCK,	/* Change the core clock to either save power or improve performance */
+	DRV_CMD_NOTIFY_MODE,	/* Notify the Mode to driver
+			in which the application is Operating*/
+	DRV_CMD_CHANGE_CLOCK,	/* Change the core clock to either save power
+			or improve performance */
 
 	/* MUST be the last one.. */
 	DRV_CMD_END,			/* End of the List.. */
@@ -283,8 +286,8 @@
 	struct BC_IOCTL_DATA	udata;		/* IOCTL from App..*/
 	uint32_t		u_id;		/* Driver specific user ID */
 	uint32_t		cmd;		/* Cmd ID for driver's use. */
-	void			*add_cdata;	/* Additional command specific data..*/
-	uint32_t		add_cdata_sz;	/* Additional command specific data size */
+	void	 *add_cdata;	/* Additional command specific data..*/
+	uint32_t add_cdata_sz;	/* Additional command specific data size */
 	struct crystalhd_ioctl_data *next;	/* List/Fifo management */
 };
 

diff --git a/drivers/staging/crystalhd/crystalhd_cmds.c b/drivers/staging/crystalhd/crystalhd_cmds.c
index ed99daa6..3ab502b 100644
--- a/drivers/staging/crystalhd/crystalhd_cmds.c
+++ b/drivers/staging/crystalhd/crystalhd_cmds.c

@@ -472,8 +472,8 @@
 }
 
 /* Helper function to check on user buffers */
-static enum BC_STATUS bc_cproc_check_inbuffs(bool pin, void *ubuff, uint32_t ub_sz,
-					uint32_t uv_off, bool en_422)
+static enum BC_STATUS bc_cproc_check_inbuffs(bool pin, void *ubuff,
+				 uint32_t ub_sz, uint32_t uv_off, bool en_422)
 {
 	if (!ubuff || !ub_sz) {
 		BCMLOG_ERR("%s->Invalid Arg %p %x\n",
@@ -483,8 +483,9 @@
 
 	/* Check for alignment */
 	if (((uintptr_t)ubuff) & 0x03) {
-		BCMLOG_ERR("%s-->Un-aligned address not implemented yet.. %p\n",
-				((pin) ? "TX" : "RX"), ubuff);
+		BCMLOG_ERR(
+			"%s-->Un-aligned address not implemented yet.. %p\n",
+			 ((pin) ? "TX" : "RX"), ubuff);
 		return BC_STS_NOT_IMPL;
 	}
 	if (pin)
@@ -572,7 +573,8 @@
 	if (!dio_hnd)
 		return BC_STS_ERROR;
 
-	sts = crystalhd_hw_add_cap_buffer(&ctx->hw_ctx, dio_hnd, (ctx->state == BC_LINK_READY));
+	sts = crystalhd_hw_add_cap_buffer(&ctx->hw_ctx, dio_hnd,
+					 (ctx->state == BC_LINK_READY));
 	if ((sts != BC_STS_SUCCESS) && (sts != BC_STS_BUSY)) {
 		crystalhd_unmap_dio(ctx->adp, dio_hnd);
 		return sts;
@@ -618,7 +620,8 @@
 
 	sts = crystalhd_hw_get_cap_buffer(&ctx->hw_ctx, &frame->PibInfo, &dio);
 	if (sts != BC_STS_SUCCESS)
-		return (ctx->state & BC_LINK_SUSPEND) ? BC_STS_IO_USER_ABORT : sts;
+		return (ctx->state & BC_LINK_SUSPEND) ?
+					 BC_STS_IO_USER_ABORT : sts;
 
 	frame->Flags = dio->uinfo.comp_flags;
 
@@ -673,7 +676,8 @@
 	frame = &idata->udata.u.DecOutData;
 	for (count = 0; count < BC_RX_LIST_CNT; count++) {
 
-		sts = crystalhd_hw_get_cap_buffer(&ctx->hw_ctx, &frame->PibInfo, &dio);
+		sts = crystalhd_hw_get_cap_buffer(&ctx->hw_ctx,
+					 &frame->PibInfo, &dio);
 		if (sts != BC_STS_SUCCESS)
 			break;
 
@@ -916,7 +920,8 @@
  * Closer application handle and release app specific
  * resources.
  */
-enum BC_STATUS crystalhd_user_close(struct crystalhd_cmd *ctx, struct crystalhd_user *uc)
+enum BC_STATUS crystalhd_user_close(struct crystalhd_cmd *ctx,
+					 struct crystalhd_user *uc)
 {
 	uint32_t mode = uc->mode;
 
@@ -1008,8 +1013,8 @@
  * mode of operation and returns the function pointer
  * from the cproc table.
  */
-crystalhd_cmd_proc crystalhd_get_cmd_proc(struct crystalhd_cmd *ctx, uint32_t cmd,
-				      struct crystalhd_user *uc)
+crystalhd_cmd_proc crystalhd_get_cmd_proc(struct crystalhd_cmd *ctx,
+				 uint32_t cmd, struct crystalhd_user *uc)
 {
 	crystalhd_cmd_proc cproc = NULL;
 	unsigned int i, tbl_sz;
@@ -1024,7 +1029,8 @@
 		return NULL;
 	}
 
-	tbl_sz = sizeof(g_crystalhd_cproc_tbl) / sizeof(struct crystalhd_cmd_tbl);
+	tbl_sz = sizeof(g_crystalhd_cproc_tbl) /
+				 sizeof(struct crystalhd_cmd_tbl);
 	for (i = 0; i < tbl_sz; i++) {
 		if (g_crystalhd_cproc_tbl[i].cmd_id == cmd) {
 			if ((uc->mode == DTS_MONITOR_MODE) &&

diff --git a/drivers/staging/crystalhd/crystalhd_cmds.h b/drivers/staging/crystalhd/crystalhd_cmds.h
index 4066ba3..377cd9d 100644
--- a/drivers/staging/crystalhd/crystalhd_cmds.h
+++ b/drivers/staging/crystalhd/crystalhd_cmds.h

@@ -66,7 +66,8 @@
 	struct crystalhd_hw	hw_ctx;
 };
 
-typedef enum BC_STATUS(*crystalhd_cmd_proc)(struct crystalhd_cmd *, struct crystalhd_ioctl_data *);
+typedef enum BC_STATUS(*crystalhd_cmd_proc)(struct crystalhd_cmd *,
+					 struct crystalhd_ioctl_data *);
 
 struct crystalhd_cmd_tbl {
 	uint32_t		cmd_id;
@@ -74,13 +75,17 @@
 	uint32_t		block_mon;
 };
 
-enum BC_STATUS crystalhd_suspend(struct crystalhd_cmd *ctx, struct crystalhd_ioctl_data *idata);
+enum BC_STATUS crystalhd_suspend(struct crystalhd_cmd *ctx,
+				 struct crystalhd_ioctl_data *idata);
 enum BC_STATUS crystalhd_resume(struct crystalhd_cmd *ctx);
-crystalhd_cmd_proc crystalhd_get_cmd_proc(struct crystalhd_cmd *ctx, uint32_t cmd,
-				      struct crystalhd_user *uc);
-enum BC_STATUS crystalhd_user_open(struct crystalhd_cmd *ctx, struct crystalhd_user **user_ctx);
-enum BC_STATUS crystalhd_user_close(struct crystalhd_cmd *ctx, struct crystalhd_user *uc);
-enum BC_STATUS crystalhd_setup_cmd_context(struct crystalhd_cmd *ctx, struct crystalhd_adp *adp);
+crystalhd_cmd_proc crystalhd_get_cmd_proc(struct crystalhd_cmd *ctx,
+				 uint32_t cmd, struct crystalhd_user *uc);
+enum BC_STATUS crystalhd_user_open(struct crystalhd_cmd *ctx,
+				 struct crystalhd_user **user_ctx);
+enum BC_STATUS crystalhd_user_close(struct crystalhd_cmd *ctx,
+				 struct crystalhd_user *uc);
+enum BC_STATUS crystalhd_setup_cmd_context(struct crystalhd_cmd *ctx,
+				 struct crystalhd_adp *adp);
 enum BC_STATUS crystalhd_delete_cmd_context(struct crystalhd_cmd *ctx);
 bool crystalhd_cmd_interrupt(struct crystalhd_cmd *ctx);
 

diff --git a/drivers/staging/crystalhd/crystalhd_fw_if.h b/drivers/staging/crystalhd/crystalhd_fw_if.h
index 9e2831e..4b363a5 100644
--- a/drivers/staging/crystalhd/crystalhd_fw_if.h
+++ b/drivers/staging/crystalhd/crystalhd_fw_if.h

@@ -106,7 +106,8 @@
 
 struct fgt_sei {
 	struct fgt_sei *next;
-	unsigned char model_values[3][MAX_FGT_VALUE_INTERVAL][MAX_FGT_MODEL_VALUE];
+	unsigned char
+		 model_values[3][MAX_FGT_VALUE_INTERVAL][MAX_FGT_MODEL_VALUE];
 	unsigned char upper_bound[3][MAX_FGT_VALUE_INTERVAL];
 	unsigned char lower_bound[3][MAX_FGT_VALUE_INTERVAL];
 
@@ -125,10 +126,12 @@
 
 	unsigned char blending_mode_id;	/* Blending mode. */
 	unsigned char log2_scale_factor;	/* Log2 scale factor (2-7). */
-	unsigned char comp_flag[3];		/* Components [0,2] parameters present flag. */
-	unsigned char num_intervals_minus1[3]; /* Number of intensity level intervals. */
+	unsigned char comp_flag[3];	/* Components [0,2]
+					 parameters present flag. */
+	unsigned char num_intervals_minus1[3]; /* Number of
+					 intensity level intervals. */
 	unsigned char num_model_values[3];	/* Number of model values. */
-	uint16_t      repetition_period;	/* Repetition period (0-16384) */
+	uint16_t      repetition_period; /* Repetition period (0-16384) */
 
 };
 
@@ -266,40 +269,40 @@
 
 	/* Decoding commands */
 	eCMD_C011_DEC_CHAN_OPEN			= eCMD_C011_CMD_BASE + 0x100,
-	eCMD_C011_DEC_CHAN_CLOSE			= eCMD_C011_CMD_BASE + 0x101,
-	eCMD_C011_DEC_CHAN_ACTIVATE			= eCMD_C011_CMD_BASE + 0x102,
-	eCMD_C011_DEC_CHAN_STATUS			= eCMD_C011_CMD_BASE + 0x103,
-	eCMD_C011_DEC_CHAN_FLUSH			= eCMD_C011_CMD_BASE + 0x104,
+	eCMD_C011_DEC_CHAN_CLOSE		= eCMD_C011_CMD_BASE + 0x101,
+	eCMD_C011_DEC_CHAN_ACTIVATE		= eCMD_C011_CMD_BASE + 0x102,
+	eCMD_C011_DEC_CHAN_STATUS		= eCMD_C011_CMD_BASE + 0x103,
+	eCMD_C011_DEC_CHAN_FLUSH		= eCMD_C011_CMD_BASE + 0x104,
 	eCMD_C011_DEC_CHAN_TRICK_PLAY		= eCMD_C011_CMD_BASE + 0x105,
-	eCMD_C011_DEC_CHAN_TS_PIDS			= eCMD_C011_CMD_BASE + 0x106,
+	eCMD_C011_DEC_CHAN_TS_PIDS		= eCMD_C011_CMD_BASE + 0x106,
 	eCMD_C011_DEC_CHAN_PS_STREAM_ID		= eCMD_C011_CMD_BASE + 0x107,
 	eCMD_C011_DEC_CHAN_INPUT_PARAMS		= eCMD_C011_CMD_BASE + 0x108,
 	eCMD_C011_DEC_CHAN_VIDEO_OUTPUT		= eCMD_C011_CMD_BASE + 0x109,
-	eCMD_C011_DEC_CHAN_OUTPUT_FORMAT		= eCMD_C011_CMD_BASE + 0x10A,
-	eCMD_C011_DEC_CHAN_SCALING_FILTERS		= eCMD_C011_CMD_BASE + 0x10B,
-	eCMD_C011_DEC_CHAN_OSD_MODE			= eCMD_C011_CMD_BASE + 0x10D,
+	eCMD_C011_DEC_CHAN_OUTPUT_FORMAT	= eCMD_C011_CMD_BASE + 0x10A,
+	eCMD_C011_DEC_CHAN_SCALING_FILTERS	= eCMD_C011_CMD_BASE + 0x10B,
+	eCMD_C011_DEC_CHAN_OSD_MODE		= eCMD_C011_CMD_BASE + 0x10D,
 	eCMD_C011_DEC_CHAN_DROP			= eCMD_C011_CMD_BASE + 0x10E,
-	eCMD_C011_DEC_CHAN_RELEASE			= eCMD_C011_CMD_BASE + 0x10F,
-	eCMD_C011_DEC_CHAN_STREAM_SETTINGS		= eCMD_C011_CMD_BASE + 0x110,
+	eCMD_C011_DEC_CHAN_RELEASE		= eCMD_C011_CMD_BASE + 0x10F,
+	eCMD_C011_DEC_CHAN_STREAM_SETTINGS	= eCMD_C011_CMD_BASE + 0x110,
 	eCMD_C011_DEC_CHAN_PAUSE_OUTPUT		= eCMD_C011_CMD_BASE + 0x111,
-	eCMD_C011_DEC_CHAN_CHANGE			= eCMD_C011_CMD_BASE + 0x112,
-	eCMD_C011_DEC_CHAN_SET_STC			= eCMD_C011_CMD_BASE + 0x113,
-	eCMD_C011_DEC_CHAN_SET_PTS			= eCMD_C011_CMD_BASE + 0x114,
-	eCMD_C011_DEC_CHAN_CC_MODE			= eCMD_C011_CMD_BASE + 0x115,
-	eCMD_C011_DEC_CREATE_AUDIO_CONTEXT		= eCMD_C011_CMD_BASE + 0x116,
-	eCMD_C011_DEC_COPY_AUDIO_CONTEXT		= eCMD_C011_CMD_BASE + 0x117,
-	eCMD_C011_DEC_DELETE_AUDIO_CONTEXT		= eCMD_C011_CMD_BASE + 0x118,
-	eCMD_C011_DEC_CHAN_SET_DECYPTION		= eCMD_C011_CMD_BASE + 0x119,
+	eCMD_C011_DEC_CHAN_CHANGE		= eCMD_C011_CMD_BASE + 0x112,
+	eCMD_C011_DEC_CHAN_SET_STC		= eCMD_C011_CMD_BASE + 0x113,
+	eCMD_C011_DEC_CHAN_SET_PTS		= eCMD_C011_CMD_BASE + 0x114,
+	eCMD_C011_DEC_CHAN_CC_MODE		= eCMD_C011_CMD_BASE + 0x115,
+	eCMD_C011_DEC_CREATE_AUDIO_CONTEXT	= eCMD_C011_CMD_BASE + 0x116,
+	eCMD_C011_DEC_COPY_AUDIO_CONTEXT	= eCMD_C011_CMD_BASE + 0x117,
+	eCMD_C011_DEC_DELETE_AUDIO_CONTEXT	= eCMD_C011_CMD_BASE + 0x118,
+	eCMD_C011_DEC_CHAN_SET_DECYPTION	= eCMD_C011_CMD_BASE + 0x119,
 	eCMD_C011_DEC_CHAN_START_VIDEO		= eCMD_C011_CMD_BASE + 0x11A,
 	eCMD_C011_DEC_CHAN_STOP_VIDEO		= eCMD_C011_CMD_BASE + 0x11B,
 	eCMD_C011_DEC_CHAN_PIC_CAPTURE		= eCMD_C011_CMD_BASE + 0x11C,
-	eCMD_C011_DEC_CHAN_PAUSE			= eCMD_C011_CMD_BASE + 0x11D,
+	eCMD_C011_DEC_CHAN_PAUSE		= eCMD_C011_CMD_BASE + 0x11D,
 	eCMD_C011_DEC_CHAN_PAUSE_STATE		= eCMD_C011_CMD_BASE + 0x11E,
-	eCMD_C011_DEC_CHAN_SET_SLOWM_RATE		= eCMD_C011_CMD_BASE + 0x11F,
-	eCMD_C011_DEC_CHAN_GET_SLOWM_RATE		= eCMD_C011_CMD_BASE + 0x120,
+	eCMD_C011_DEC_CHAN_SET_SLOWM_RATE	= eCMD_C011_CMD_BASE + 0x11F,
+	eCMD_C011_DEC_CHAN_GET_SLOWM_RATE	= eCMD_C011_CMD_BASE + 0x120,
 	eCMD_C011_DEC_CHAN_SET_FF_RATE		= eCMD_C011_CMD_BASE + 0x121,
 	eCMD_C011_DEC_CHAN_GET_FF_RATE		= eCMD_C011_CMD_BASE + 0x122,
-	eCMD_C011_DEC_CHAN_FRAME_ADVANCE		= eCMD_C011_CMD_BASE + 0x123,
+	eCMD_C011_DEC_CHAN_FRAME_ADVANCE	= eCMD_C011_CMD_BASE + 0x123,
 	eCMD_C011_DEC_CHAN_SET_SKIP_PIC_MODE	= eCMD_C011_CMD_BASE + 0x124,
 	eCMD_C011_DEC_CHAN_GET_SKIP_PIC_MODE	= eCMD_C011_CMD_BASE + 0x125,
 	eCMD_C011_DEC_CHAN_FILL_PIC_BUF		= eCMD_C011_CMD_BASE + 0x126,
@@ -308,15 +311,16 @@
 	eCMD_C011_DEC_CHAN_SET_BRCM_TRICK_MODE	= eCMD_C011_CMD_BASE + 0x129,
 	eCMD_C011_DEC_CHAN_GET_BRCM_TRICK_MODE	= eCMD_C011_CMD_BASE + 0x12A,
 	eCMD_C011_DEC_CHAN_REVERSE_FIELD_STATUS	= eCMD_C011_CMD_BASE + 0x12B,
-	eCMD_C011_DEC_CHAN_I_PICTURE_FOUND		= eCMD_C011_CMD_BASE + 0x12C,
-	eCMD_C011_DEC_CHAN_SET_PARAMETER		= eCMD_C011_CMD_BASE + 0x12D,
+	eCMD_C011_DEC_CHAN_I_PICTURE_FOUND	= eCMD_C011_CMD_BASE + 0x12C,
+	eCMD_C011_DEC_CHAN_SET_PARAMETER	= eCMD_C011_CMD_BASE + 0x12D,
 	eCMD_C011_DEC_CHAN_SET_USER_DATA_MODE	= eCMD_C011_CMD_BASE + 0x12E,
-	eCMD_C011_DEC_CHAN_SET_PAUSE_DISPLAY_MODE	= eCMD_C011_CMD_BASE + 0x12F,
-	eCMD_C011_DEC_CHAN_SET_SLOW_DISPLAY_MODE	= eCMD_C011_CMD_BASE + 0x130,
+	eCMD_C011_DEC_CHAN_SET_PAUSE_DISPLAY_MODE = eCMD_C011_CMD_BASE + 0x12F,
+	eCMD_C011_DEC_CHAN_SET_SLOW_DISPLAY_MODE = eCMD_C011_CMD_BASE + 0x130,
 	eCMD_C011_DEC_CHAN_SET_FF_DISPLAY_MODE	= eCMD_C011_CMD_BASE + 0x131,
-	eCMD_C011_DEC_CHAN_SET_DISPLAY_TIMING_MODE	= eCMD_C011_CMD_BASE + 0x132,
-	eCMD_C011_DEC_CHAN_SET_DISPLAY_MODE		= eCMD_C011_CMD_BASE + 0x133,
-	eCMD_C011_DEC_CHAN_GET_DISPLAY_MODE		= eCMD_C011_CMD_BASE + 0x134,
+	eCMD_C011_DEC_CHAN_SET_DISPLAY_TIMING_MODE = eCMD_C011_CMD_BASE +
+								 0x132,
+	eCMD_C011_DEC_CHAN_SET_DISPLAY_MODE	= eCMD_C011_CMD_BASE + 0x133,
+	eCMD_C011_DEC_CHAN_GET_DISPLAY_MODE	= eCMD_C011_CMD_BASE + 0x134,
 	eCMD_C011_DEC_CHAN_SET_REVERSE_FIELD	= eCMD_C011_CMD_BASE + 0x135,
 	eCMD_C011_DEC_CHAN_STREAM_OPEN		= eCMD_C011_CMD_BASE + 0x136,
 	eCMD_C011_DEC_CHAN_SET_PCR_PID		= eCMD_C011_CMD_BASE + 0x137,
@@ -328,19 +332,22 @@
 	eCMD_C011_DEC_CHAN_GET_DISPLAY_ORDER	= eCMD_C011_CMD_BASE + 0x143,
 	eCMD_C011_DEC_CHAN_SET_HOST_TRICK_MODE	= eCMD_C011_CMD_BASE + 0x144,
 	eCMD_C011_DEC_CHAN_SET_OPERATION_MODE	= eCMD_C011_CMD_BASE + 0x145,
-	eCMD_C011_DEC_CHAN_DISPLAY_PAUSE_UNTO_PTS	= eCMD_C011_CMD_BASE + 0x146,
-	eCMD_C011_DEC_CHAN_SET_PTS_STC_DIFF_THRESHOLD = eCMD_C011_CMD_BASE + 0x147,
+	eCMD_C011_DEC_CHAN_DISPLAY_PAUSE_UNTO_PTS = eCMD_C011_CMD_BASE + 0x146,
+	eCMD_C011_DEC_CHAN_SET_PTS_STC_DIFF_THRESHOLD = eCMD_C011_CMD_BASE +
+								 0x147,
 	eCMD_C011_DEC_CHAN_SEND_COMPRESSED_BUF	= eCMD_C011_CMD_BASE + 0x148,
 	eCMD_C011_DEC_CHAN_SET_CLIPPING		= eCMD_C011_CMD_BASE + 0x149,
 	eCMD_C011_DEC_CHAN_SET_PARAMETERS_FOR_HARD_RESET_INTERRUPT_TO_HOST
 		= eCMD_C011_CMD_BASE + 0x150,
 
 	/* Decoder RevD commands */
-	eCMD_C011_DEC_CHAN_SET_CSC	= eCMD_C011_CMD_BASE + 0x180, /* color space conversion */
+	eCMD_C011_DEC_CHAN_SET_CSC	= eCMD_C011_CMD_BASE + 0x180, /* color
+							 space conversion */
 	eCMD_C011_DEC_CHAN_SET_RANGE_REMAP	= eCMD_C011_CMD_BASE + 0x181,
 	eCMD_C011_DEC_CHAN_SET_FGT		= eCMD_C011_CMD_BASE + 0x182,
 	/* Note: 0x183 not implemented yet in Rev D main */
-	eCMD_C011_DEC_CHAN_SET_LASTPICTURE_PADDING = eCMD_C011_CMD_BASE + 0x183,
+	eCMD_C011_DEC_CHAN_SET_LASTPICTURE_PADDING = eCMD_C011_CMD_BASE +
+								 0x183,
 
 	/* Decoder 7412 commands (7412-only) */
 	eCMD_C011_DEC_CHAN_SET_CONTENT_KEY	= eCMD_C011_CMD_BASE + 0x190,

diff --git a/drivers/staging/crystalhd/crystalhd_hw.c b/drivers/staging/crystalhd/crystalhd_hw.c
index e617d2f..0c8cb32 100644
--- a/drivers/staging/crystalhd/crystalhd_hw.c
+++ b/drivers/staging/crystalhd/crystalhd_hw.c

@@ -94,15 +94,19 @@
 	 * Enable clocks while 7412 reset is asserted, delay
 	 * De-assert 7412 reset
 	 */
-	rst_deco_cntrl.whole_reg = crystalhd_reg_rd(adp, MISC_PERST_DECODER_CTRL);
+	rst_deco_cntrl.whole_reg = crystalhd_reg_rd(adp,
+					 MISC_PERST_DECODER_CTRL);
 	rst_deco_cntrl.stop_bcm_7412_clk = 0;
 	rst_deco_cntrl.bcm7412_rst = 1;
-	crystalhd_reg_wr(adp, MISC_PERST_DECODER_CTRL, rst_deco_cntrl.whole_reg);
+	crystalhd_reg_wr(adp, MISC_PERST_DECODER_CTRL,
+					 rst_deco_cntrl.whole_reg);
 	msleep_interruptible(10);
 
-	rst_deco_cntrl.whole_reg = crystalhd_reg_rd(adp, MISC_PERST_DECODER_CTRL);
+	rst_deco_cntrl.whole_reg = crystalhd_reg_rd(adp,
+					 MISC_PERST_DECODER_CTRL);
 	rst_deco_cntrl.bcm7412_rst = 0;
-	crystalhd_reg_wr(adp, MISC_PERST_DECODER_CTRL, rst_deco_cntrl.whole_reg);
+	crystalhd_reg_wr(adp, MISC_PERST_DECODER_CTRL,
+					 rst_deco_cntrl.whole_reg);
 	msleep_interruptible(50);
 
 	/* Disable OTP_CONTENT_MISC to 0 to disable all secure modes */
@@ -132,9 +136,11 @@
 	 * Assert 7412 reset, delay
 	 * Assert 7412 stop clock
 	 */
-	rst_deco_cntrl.whole_reg = crystalhd_reg_rd(adp, MISC_PERST_DECODER_CTRL);
+	rst_deco_cntrl.whole_reg = crystalhd_reg_rd(adp,
+					 MISC_PERST_DECODER_CTRL);
 	rst_deco_cntrl.stop_bcm_7412_clk = 1;
-	crystalhd_reg_wr(adp, MISC_PERST_DECODER_CTRL, rst_deco_cntrl.whole_reg);
+	crystalhd_reg_wr(adp, MISC_PERST_DECODER_CTRL,
+					 rst_deco_cntrl.whole_reg);
 	msleep_interruptible(50);
 
 	/* Bus Arbiter Timeout: GISB_ARBITER_TIMER
@@ -213,7 +219,8 @@
 {
 	uint32_t reg;
 
-	/* FIXME: jarod: wouldn't we want to write a 0 to the reg? Or does the write clear the bits specified? */
+	/* FIXME: jarod: wouldn't we want to write a 0 to the reg?
+	 Or does the write clear the bits specified? */
 	reg = crystalhd_reg_rd(adp, MISC1_Y_RX_ERROR_STATUS);
 	if (reg)
 		crystalhd_reg_wr(adp, MISC1_Y_RX_ERROR_STATUS, reg);
@@ -263,10 +270,12 @@
 	crystalhd_reg_wr(adp, DCI_DRAM_BASE_ADDR, (BC_DRAM_FW_CFG_ADDR >> 19));
 
 	crystalhd_reg_wr(adp, AES_CMD, 0);
-	crystalhd_reg_wr(adp, AES_CONFIG_INFO, (BC_DRAM_FW_CFG_ADDR & 0x7FFFF));
+	crystalhd_reg_wr(adp, AES_CONFIG_INFO,
+		 (BC_DRAM_FW_CFG_ADDR & 0x7FFFF));
 	crystalhd_reg_wr(adp, AES_CMD, 0x1);
 
-	/* FIXME: jarod: I've seen this fail, and introducing extra delays helps... */
+	/* FIXME: jarod: I've seen this fail,
+	 and introducing extra delays helps... */
 	for (i = 0; i < 100; ++i) {
 		reg = crystalhd_reg_rd(adp, AES_STATUS);
 		if (reg & 0x1)
@@ -349,7 +358,8 @@
 	return true;
 }
 
-static struct crystalhd_rx_dma_pkt *crystalhd_hw_alloc_rx_pkt(struct crystalhd_hw *hw)
+static struct crystalhd_rx_dma_pkt *crystalhd_hw_alloc_rx_pkt(
+					struct crystalhd_hw *hw)
 {
 	unsigned long flags = 0;
 	struct crystalhd_rx_dma_pkt *temp = NULL;
@@ -484,8 +494,8 @@
 }
 
 
-static bool crystalhd_code_in_full(struct crystalhd_adp *adp, uint32_t needed_sz,
-				 bool b_188_byte_pkts,  uint8_t flags)
+static bool crystalhd_code_in_full(struct crystalhd_adp *adp,
+		 uint32_t needed_sz, bool b_188_byte_pkts,  uint8_t flags)
 {
 	uint32_t base, end, writep, readp;
 	uint32_t cpbSize, cpbFullness, fifoSize;
@@ -525,7 +535,7 @@
 }
 
 static enum BC_STATUS crystalhd_hw_tx_req_complete(struct crystalhd_hw *hw,
-					    uint32_t list_id, enum BC_STATUS cs)
+					 uint32_t list_id, enum BC_STATUS cs)
 {
 	struct tx_dma_pkt *tx_req;
 
@@ -536,7 +546,8 @@
 
 	hw->pwr_lock--;
 
-	tx_req = (struct tx_dma_pkt *)crystalhd_dioq_find_and_fetch(hw->tx_actq, list_id);
+	tx_req = (struct tx_dma_pkt *)crystalhd_dioq_find_and_fetch(
+					hw->tx_actq, list_id);
 	if (!tx_req) {
 		if (cs != BC_STS_IO_USER_ABORT)
 			BCMLOG_ERR("Find and Fetch Did not find req\n");
@@ -559,7 +570,8 @@
 	return crystalhd_dioq_add(hw->tx_freeq, tx_req, false, 0);
 }
 
-static bool crystalhd_tx_list0_handler(struct crystalhd_hw *hw, uint32_t err_sts)
+static bool crystalhd_tx_list0_handler(struct crystalhd_hw *hw,
+					 uint32_t err_sts)
 {
 	uint32_t err_mask, tmp;
 	unsigned long flags = 0;
@@ -591,7 +603,8 @@
 	return true;
 }
 
-static bool crystalhd_tx_list1_handler(struct crystalhd_hw *hw, uint32_t err_sts)
+static bool crystalhd_tx_list1_handler(struct crystalhd_hw *hw,
+					 uint32_t err_sts)
 {
 	uint32_t err_mask, tmp;
 	unsigned long flags = 0;
@@ -663,14 +676,15 @@
 	if (!p_dma_desc || !cnt)
 		return;
 
-	/* FIXME: jarod: perhaps a modparam desc_debug to enable this, rather than
-	 * setting ll (log level, I presume) to non-zero? */
+	/* FIXME: jarod: perhaps a modparam desc_debug to enable this,
+	 rather than setting ll (log level, I presume) to non-zero? */
 	if (!ll)
 		return;
 
 	for (ix = ul_desc_index; ix < (ul_desc_index + cnt); ix++) {
-		BCMLOG(ll, "%s[%d] Buff[%x:%x] Next:[%x:%x] XferSz:%x Intr:%x,Last:%x\n",
-		       ((p_dma_desc[ul_desc_index].dma_dir) ? "TDesc" : "RDesc"),
+		BCMLOG(ll,
+		 "%s[%d] Buff[%x:%x] Next:[%x:%x] XferSz:%x Intr:%x,Last:%x\n",
+		 ((p_dma_desc[ul_desc_index].dma_dir) ? "TDesc" : "RDesc"),
 		       ul_desc_index,
 		       p_dma_desc[ul_desc_index].buff_addr_high,
 		       p_dma_desc[ul_desc_index].buff_addr_low,
@@ -707,7 +721,8 @@
 		/* Get SGLE length */
 		len = crystalhd_get_sgle_len(ioreq, sg_ix);
 		if (len % 4) {
-			BCMLOG_ERR(" len in sg %d %d %d\n", len, sg_ix, sg_cnt);
+			BCMLOG_ERR(" len in sg %d %d %d\n", len, sg_ix,
+				 sg_cnt);
 			return BC_STS_NOT_IMPL;
 		}
 		/* Setup DMA desc with Phy addr & Length at current index. */
@@ -722,7 +737,8 @@
 		desc[ix].dma_dir        = ioreq->uinfo.dir_tx;
 
 		/* Chain DMA descriptor.  */
-		addr_temp.full_addr = desc_phy_addr + sizeof(struct dma_descriptor);
+		addr_temp.full_addr = desc_phy_addr +
+					 sizeof(struct dma_descriptor);
 		desc[ix].next_desc_addr_low = addr_temp.low_part;
 		desc[ix].next_desc_addr_high = addr_temp.high_part;
 
@@ -731,8 +747,9 @@
 
 		/* Debug.. */
 		if ((!len) || (len > crystalhd_get_sgle_len(ioreq, sg_ix))) {
-			BCMLOG_ERR("inv-len(%x) Ix(%d) count:%x xfr_sz:%x sg_cnt:%d\n",
-				   len, ix, count, xfr_sz, sg_cnt);
+			BCMLOG_ERR(
+			 "inv-len(%x) Ix(%d) count:%x xfr_sz:%x sg_cnt:%d\n",
+			 len, ix, count, xfr_sz, sg_cnt);
 			return BC_STS_ERROR;
 		}
 		/* Length expects Multiple of 4 */
@@ -774,7 +791,8 @@
 	return BC_STS_SUCCESS;
 }
 
-static enum BC_STATUS crystalhd_xlat_sgl_to_dma_desc(struct crystalhd_dio_req *ioreq,
+static enum BC_STATUS crystalhd_xlat_sgl_to_dma_desc(
+					      struct crystalhd_dio_req *ioreq,
 					      struct dma_desc_mem *pdesc_mem,
 					      uint32_t *uv_desc_index)
 {
@@ -887,12 +905,14 @@
 	while ((l1 || l2) && cnt) {
 
 		if (l1) {
-			l1 = crystalhd_reg_rd(hw->adp, MISC1_TX_FIRST_DESC_L_ADDR_LIST0);
+			l1 = crystalhd_reg_rd(hw->adp,
+				 MISC1_TX_FIRST_DESC_L_ADDR_LIST0);
 			l1 &= DMA_START_BIT;
 		}
 
 		if (l2) {
-			l2 = crystalhd_reg_rd(hw->adp, MISC1_TX_FIRST_DESC_L_ADDR_LIST1);
+			l2 = crystalhd_reg_rd(hw->adp,
+				 MISC1_TX_FIRST_DESC_L_ADDR_LIST1);
 			l2 &= DMA_START_BIT;
 		}
 
@@ -986,7 +1006,8 @@
 	return addr_entry;
 }
 
-static bool crystalhd_rel_addr_to_pib_Q(struct crystalhd_hw *hw, uint32_t addr_to_rel)
+static bool crystalhd_rel_addr_to_pib_Q(struct crystalhd_hw *hw,
+					 uint32_t addr_to_rel)
 {
 	uint32_t Q_addr;
 	uint32_t r_offset, w_offset, n_offset;
@@ -1021,7 +1042,8 @@
 	return true;
 }
 
-static void cpy_pib_to_app(struct c011_pib *src_pib, struct BC_PIC_INFO_BLOCK *dst_pib)
+static void cpy_pib_to_app(struct c011_pib *src_pib,
+					 struct BC_PIC_INFO_BLOCK *dst_pib)
 {
 	if (!src_pib || !dst_pib) {
 		BCMLOG_ERR("Invalid Arguments\n");
@@ -1063,11 +1085,13 @@
 			       (uint32_t *)&src_pib);
 
 		if (src_pib.bFormatChange) {
-			rx_pkt = (struct crystalhd_rx_dma_pkt *)crystalhd_dioq_fetch(hw->rx_freeq);
+			rx_pkt = (struct crystalhd_rx_dma_pkt *)
+					crystalhd_dioq_fetch(hw->rx_freeq);
 			if (!rx_pkt)
 				return;
 			rx_pkt->flags = 0;
-			rx_pkt->flags |= COMP_FLAG_PIB_VALID | COMP_FLAG_FMT_CHANGE;
+			rx_pkt->flags |= COMP_FLAG_PIB_VALID |
+					 COMP_FLAG_FMT_CHANGE;
 			AppPib = &rx_pkt->pib;
 			cpy_pib_to_app(&src_pib, AppPib);
 
@@ -1084,7 +1108,8 @@
 			       rx_pkt->pib.pulldown,
 			       rx_pkt->pib.ycom);
 
-			crystalhd_dioq_add(hw->rx_rdyq, (void *)rx_pkt, true, rx_pkt->pkt_tag);
+			crystalhd_dioq_add(hw->rx_rdyq, (void *)rx_pkt, true,
+					 rx_pkt->pkt_tag);
 
 		}
 
@@ -1096,16 +1121,20 @@
 {
 	uint32_t        dma_cntrl;
 
-	dma_cntrl = crystalhd_reg_rd(hw->adp, MISC1_Y_RX_SW_DESC_LIST_CTRL_STS);
+	dma_cntrl = crystalhd_reg_rd(hw->adp,
+			 MISC1_Y_RX_SW_DESC_LIST_CTRL_STS);
 	if (!(dma_cntrl & DMA_START_BIT)) {
 		dma_cntrl |= DMA_START_BIT;
-		crystalhd_reg_wr(hw->adp, MISC1_Y_RX_SW_DESC_LIST_CTRL_STS, dma_cntrl);
+		crystalhd_reg_wr(hw->adp,
+			 MISC1_Y_RX_SW_DESC_LIST_CTRL_STS, dma_cntrl);
 	}
 
-	dma_cntrl = crystalhd_reg_rd(hw->adp, MISC1_UV_RX_SW_DESC_LIST_CTRL_STS);
+	dma_cntrl = crystalhd_reg_rd(hw->adp,
+			 MISC1_UV_RX_SW_DESC_LIST_CTRL_STS);
 	if (!(dma_cntrl & DMA_START_BIT)) {
 		dma_cntrl |= DMA_START_BIT;
-		crystalhd_reg_wr(hw->adp, MISC1_UV_RX_SW_DESC_LIST_CTRL_STS, dma_cntrl);
+		crystalhd_reg_wr(hw->adp,
+			 MISC1_UV_RX_SW_DESC_LIST_CTRL_STS, dma_cntrl);
 	}
 
 	return;
@@ -1116,44 +1145,52 @@
 	uint32_t dma_cntrl = 0, count = 30;
 	uint32_t l0y = 1, l0uv = 1, l1y = 1, l1uv = 1;
 
-	dma_cntrl = crystalhd_reg_rd(hw->adp, MISC1_Y_RX_SW_DESC_LIST_CTRL_STS);
+	dma_cntrl = crystalhd_reg_rd(hw->adp,
+			 MISC1_Y_RX_SW_DESC_LIST_CTRL_STS);
 	if ((dma_cntrl & DMA_START_BIT)) {
 		dma_cntrl &= ~DMA_START_BIT;
-		crystalhd_reg_wr(hw->adp, MISC1_Y_RX_SW_DESC_LIST_CTRL_STS, dma_cntrl);
+		crystalhd_reg_wr(hw->adp,
+			 MISC1_Y_RX_SW_DESC_LIST_CTRL_STS, dma_cntrl);
 	}
 
-	dma_cntrl = crystalhd_reg_rd(hw->adp, MISC1_UV_RX_SW_DESC_LIST_CTRL_STS);
+	dma_cntrl = crystalhd_reg_rd(hw->adp,
+			 MISC1_UV_RX_SW_DESC_LIST_CTRL_STS);
 	if ((dma_cntrl & DMA_START_BIT)) {
 		dma_cntrl &= ~DMA_START_BIT;
-		crystalhd_reg_wr(hw->adp, MISC1_UV_RX_SW_DESC_LIST_CTRL_STS, dma_cntrl);
+		crystalhd_reg_wr(hw->adp,
+			 MISC1_UV_RX_SW_DESC_LIST_CTRL_STS, dma_cntrl);
 	}
 
 	/* Poll for 3seconds (30 * 100ms) on both the lists..*/
 	while ((l0y || l0uv || l1y || l1uv) && count) {
 
 		if (l0y) {
-			l0y = crystalhd_reg_rd(hw->adp, MISC1_Y_RX_FIRST_DESC_L_ADDR_LIST0);
+			l0y = crystalhd_reg_rd(hw->adp,
+				 MISC1_Y_RX_FIRST_DESC_L_ADDR_LIST0);
 			l0y &= DMA_START_BIT;
 			if (!l0y)
 				hw->rx_list_sts[0] &= ~rx_waiting_y_intr;
 		}
 
 		if (l1y) {
-			l1y = crystalhd_reg_rd(hw->adp, MISC1_Y_RX_FIRST_DESC_L_ADDR_LIST1);
+			l1y = crystalhd_reg_rd(hw->adp,
+				 MISC1_Y_RX_FIRST_DESC_L_ADDR_LIST1);
 			l1y &= DMA_START_BIT;
 			if (!l1y)
 				hw->rx_list_sts[1] &= ~rx_waiting_y_intr;
 		}
 
 		if (l0uv) {
-			l0uv = crystalhd_reg_rd(hw->adp, MISC1_UV_RX_FIRST_DESC_L_ADDR_LIST0);
+			l0uv = crystalhd_reg_rd(hw->adp,
+				 MISC1_UV_RX_FIRST_DESC_L_ADDR_LIST0);
 			l0uv &= DMA_START_BIT;
 			if (!l0uv)
 				hw->rx_list_sts[0] &= ~rx_waiting_uv_intr;
 		}
 
 		if (l1uv) {
-			l1uv = crystalhd_reg_rd(hw->adp, MISC1_UV_RX_FIRST_DESC_L_ADDR_LIST1);
+			l1uv = crystalhd_reg_rd(hw->adp,
+				 MISC1_UV_RX_FIRST_DESC_L_ADDR_LIST1);
 			l1uv &= DMA_START_BIT;
 			if (!l1uv)
 				hw->rx_list_sts[1] &= ~rx_waiting_uv_intr;
@@ -1168,7 +1205,8 @@
 	       count, hw->rx_list_sts[0], hw->rx_list_sts[1]);
 }
 
-static enum BC_STATUS crystalhd_hw_prog_rxdma(struct crystalhd_hw *hw, struct crystalhd_rx_dma_pkt *rx_pkt)
+static enum BC_STATUS crystalhd_hw_prog_rxdma(struct crystalhd_hw *hw,
+					 struct crystalhd_rx_dma_pkt *rx_pkt)
 {
 	uint32_t y_low_addr_reg, y_high_addr_reg;
 	uint32_t uv_low_addr_reg, uv_high_addr_reg;
@@ -1186,7 +1224,8 @@
 	}
 
 	spin_lock_irqsave(&hw->rx_lock, flags);
-	/* FIXME: jarod: sts_free is an enum for 0, in crystalhd_hw.h... yuk... */
+	/* FIXME: jarod: sts_free is an enum for 0,
+	 in crystalhd_hw.h... yuk... */
 	if (sts_free != hw->rx_list_sts[hw->rx_list_post_index]) {
 		spin_unlock_irqrestore(&hw->rx_lock, flags);
 		return BC_STS_BUSY;
@@ -1210,7 +1249,8 @@
 	hw->rx_list_post_index = (hw->rx_list_post_index + 1) % DMA_ENGINE_CNT;
 	spin_unlock_irqrestore(&hw->rx_lock, flags);
 
-	crystalhd_dioq_add(hw->rx_actq, (void *)rx_pkt, false, rx_pkt->pkt_tag);
+	crystalhd_dioq_add(hw->rx_actq, (void *)rx_pkt, false,
+			 rx_pkt->pkt_tag);
 
 	crystalhd_start_rx_dma_engine(hw);
 	/* Program the Y descriptor */
@@ -1221,8 +1261,10 @@
 	if (rx_pkt->uv_phy_addr) {
 		/* Program the UV descriptor */
 		desc_addr.full_addr = rx_pkt->uv_phy_addr;
-		crystalhd_reg_wr(hw->adp, uv_high_addr_reg, desc_addr.high_part);
-		crystalhd_reg_wr(hw->adp, uv_low_addr_reg, desc_addr.low_part | 0x01);
+		crystalhd_reg_wr(hw->adp, uv_high_addr_reg,
+			 desc_addr.high_part);
+		crystalhd_reg_wr(hw->adp, uv_low_addr_reg,
+			 desc_addr.low_part | 0x01);
 	}
 
 	return BC_STS_SUCCESS;
@@ -1268,16 +1310,20 @@
 
 	hw->stop_pending = 0;
 
-	dma_cntrl = crystalhd_reg_rd(hw->adp, MISC1_Y_RX_SW_DESC_LIST_CTRL_STS);
+	dma_cntrl = crystalhd_reg_rd(hw->adp,
+			 MISC1_Y_RX_SW_DESC_LIST_CTRL_STS);
 	if (dma_cntrl & DMA_START_BIT) {
 		dma_cntrl &= ~DMA_START_BIT;
-		crystalhd_reg_wr(hw->adp, MISC1_Y_RX_SW_DESC_LIST_CTRL_STS, dma_cntrl);
+		crystalhd_reg_wr(hw->adp,
+			 MISC1_Y_RX_SW_DESC_LIST_CTRL_STS, dma_cntrl);
 	}
 
-	dma_cntrl = crystalhd_reg_rd(hw->adp, MISC1_UV_RX_SW_DESC_LIST_CTRL_STS);
+	dma_cntrl = crystalhd_reg_rd(hw->adp,
+			 MISC1_UV_RX_SW_DESC_LIST_CTRL_STS);
 	if (dma_cntrl & DMA_START_BIT) {
 		dma_cntrl &= ~DMA_START_BIT;
-		crystalhd_reg_wr(hw->adp, MISC1_UV_RX_SW_DESC_LIST_CTRL_STS, dma_cntrl);
+		crystalhd_reg_wr(hw->adp,
+			 MISC1_UV_RX_SW_DESC_LIST_CTRL_STS, dma_cntrl);
 	}
 	hw->rx_list_post_index = 0;
 
@@ -1287,8 +1333,8 @@
 	crystalhd_reg_wr(hw->adp, PCIE_DLL_DATA_LINK_CONTROL, aspm);
 }
 
-static enum BC_STATUS crystalhd_rx_pkt_done(struct crystalhd_hw *hw, uint32_t list_index,
-				     enum BC_STATUS comp_sts)
+static enum BC_STATUS crystalhd_rx_pkt_done(struct crystalhd_hw *hw,
+			 uint32_t list_index, enum BC_STATUS comp_sts)
 {
 	struct crystalhd_rx_dma_pkt *rx_pkt = NULL;
 	uint32_t y_dw_dnsz, uv_dw_dnsz;
@@ -1302,7 +1348,8 @@
 	rx_pkt = crystalhd_dioq_find_and_fetch(hw->rx_actq,
 					     hw->rx_pkt_tag_seed + list_index);
 	if (!rx_pkt) {
-		BCMLOG_ERR("Act-Q:PostIx:%x L0Sts:%x L1Sts:%x current L:%x tag:%x comp:%x\n",
+		BCMLOG_ERR(
+		"Act-Q:PostIx:%x L0Sts:%x L1Sts:%x current L:%x tag:%x comp:%x\n",
 			   hw->rx_list_post_index, hw->rx_list_sts[0],
 			   hw->rx_list_sts[1], list_index,
 			   hw->rx_pkt_tag_seed + list_index, comp_sts);
@@ -1324,8 +1371,8 @@
 	return crystalhd_hw_post_cap_buff(hw, rx_pkt);
 }
 
-static bool crystalhd_rx_list0_handler(struct crystalhd_hw *hw, uint32_t int_sts,
-				     uint32_t y_err_sts, uint32_t uv_err_sts)
+static bool crystalhd_rx_list0_handler(struct crystalhd_hw *hw,
+		 uint32_t int_sts, uint32_t y_err_sts, uint32_t uv_err_sts)
 {
 	uint32_t tmp;
 	enum list_sts tmp_lsts;
@@ -1367,7 +1414,8 @@
 		tmp &= ~MISC1_UV_RX_ERROR_STATUS_RX_L0_UNDERRUN_ERROR_MASK;
 	}
 
-	if (uv_err_sts & MISC1_UV_RX_ERROR_STATUS_RX_L0_FIFO_FULL_ERRORS_MASK) {
+	if (uv_err_sts &
+	 MISC1_UV_RX_ERROR_STATUS_RX_L0_FIFO_FULL_ERRORS_MASK) {
 		hw->rx_list_sts[0] &= ~rx_uv_mask;
 		hw->rx_list_sts[0] |= rx_uv_error;
 		tmp &= ~MISC1_UV_RX_ERROR_STATUS_RX_L0_FIFO_FULL_ERRORS_MASK;
@@ -1392,8 +1440,8 @@
 	return (tmp_lsts != hw->rx_list_sts[0]);
 }
 
-static bool crystalhd_rx_list1_handler(struct crystalhd_hw *hw, uint32_t int_sts,
-				     uint32_t y_err_sts, uint32_t uv_err_sts)
+static bool crystalhd_rx_list1_handler(struct crystalhd_hw *hw,
+		 uint32_t int_sts, uint32_t y_err_sts, uint32_t uv_err_sts)
 {
 	uint32_t tmp;
 	enum list_sts tmp_lsts;
@@ -1486,9 +1534,11 @@
 		/* Update States..*/
 		spin_lock_irqsave(&hw->rx_lock, flags);
 		if (i == 0)
-			ret = crystalhd_rx_list0_handler(hw, intr_sts, y_err_sts, uv_err_sts);
+			ret = crystalhd_rx_list0_handler(hw, intr_sts,
+					 y_err_sts, uv_err_sts);
 		else
-			ret = crystalhd_rx_list1_handler(hw, intr_sts, y_err_sts, uv_err_sts);
+			ret = crystalhd_rx_list1_handler(hw, intr_sts,
+					 y_err_sts, uv_err_sts);
 		if (ret) {
 			switch (hw->rx_list_sts[i]) {
 			case sts_free:
@@ -1501,11 +1551,13 @@
 				/* We got error on both or Y or uv. */
 				hw->stats.rx_errors++;
 				crystalhd_get_dnsz(hw, i, &y_dn_sz, &uv_dn_sz);
-				/* FIXME: jarod: this is where my mini pci-e card is tripping up */
+				/* FIXME: jarod: this is where
+				 my mini pci-e card is tripping up */
 				BCMLOG(BCMLOG_DBG, "list_index:%x rx[%d] Y:%x "
 				       "UV:%x Int:%x YDnSz:%x UVDnSz:%x\n",
 				       i, hw->stats.rx_errors, y_err_sts,
-				       uv_err_sts, intr_sts, y_dn_sz, uv_dn_sz);
+				       uv_err_sts, intr_sts, y_dn_sz,
+				       		 uv_dn_sz);
 				hw->rx_list_sts[i] = sts_free;
 				comp_sts = BC_STS_ERROR;
 				break;
@@ -1567,14 +1619,17 @@
 	union link_misc_perst_decoder_ctrl rst_cntrl_reg;
 
 	/* Pulse reset pin of 7412 (MISC_PERST_DECODER_CTRL) */
-	rst_cntrl_reg.whole_reg = crystalhd_reg_rd(hw->adp, MISC_PERST_DECODER_CTRL);
+	rst_cntrl_reg.whole_reg = crystalhd_reg_rd(hw->adp,
+					 MISC_PERST_DECODER_CTRL);
 
 	rst_cntrl_reg.bcm_7412_rst = 1;
-	crystalhd_reg_wr(hw->adp, MISC_PERST_DECODER_CTRL, rst_cntrl_reg.whole_reg);
+	crystalhd_reg_wr(hw->adp, MISC_PERST_DECODER_CTRL,
+					 rst_cntrl_reg.whole_reg);
 	msleep_interruptible(50);
 
 	rst_cntrl_reg.bcm_7412_rst = 0;
-	crystalhd_reg_wr(hw->adp, MISC_PERST_DECODER_CTRL, rst_cntrl_reg.whole_reg);
+	crystalhd_reg_wr(hw->adp, MISC_PERST_DECODER_CTRL,
+					 rst_cntrl_reg.whole_reg);
 
 	/* Close all banks, put DDR in idle */
 	bc_dec_reg_wr(hw->adp, SDRAM_PRECHARGE, 0);
@@ -1622,7 +1677,8 @@
 **
 *************************************************/
 
-enum BC_STATUS crystalhd_download_fw(struct crystalhd_adp *adp, void *buffer, uint32_t sz)
+enum BC_STATUS crystalhd_download_fw(struct crystalhd_adp *adp, void *buffer,
+					 uint32_t sz)
 {
 	uint32_t reg_data, cnt, *temp_buff;
 	uint32_t fw_sig_len = 36;
@@ -1828,7 +1884,8 @@
 			crystalhd_hw_proc_pib(hw);
 
 		bc_dec_reg_wr(adp, Stream2Host_Intr_Sts, deco_intr);
-		/* FIXME: jarod: No udelay? might this be the real reason mini pci-e cards were stalling out? */
+		/* FIXME: jarod: No udelay? might this be
+		 the real reason mini pci-e cards were stalling out? */
 		bc_dec_reg_wr(adp, Stream2Host_Intr_Sts, 0);
 		rc = 1;
 	}
@@ -1852,7 +1909,8 @@
 	return rc;
 }
 
-enum BC_STATUS crystalhd_hw_open(struct crystalhd_hw *hw, struct crystalhd_adp *adp)
+enum BC_STATUS crystalhd_hw_open(struct crystalhd_hw *hw,
+			 struct crystalhd_adp *adp)
 {
 	if (!hw || !adp) {
 		BCMLOG_ERR("Invalid Arguments\n");
@@ -1967,7 +2025,8 @@
 		}
 		rpkt->desc_mem.pdma_desc_start = mem;
 		rpkt->desc_mem.phy_addr = phy_addr;
-		rpkt->desc_mem.sz  = BC_LINK_MAX_SGLS * sizeof(struct dma_descriptor);
+		rpkt->desc_mem.sz  = BC_LINK_MAX_SGLS *
+					 sizeof(struct dma_descriptor);
 		rpkt->pkt_tag = hw->rx_pkt_tag_seed + i;
 		crystalhd_hw_free_rx_pkt(hw, rpkt);
 	}
@@ -2013,7 +2072,8 @@
 	return BC_STS_SUCCESS;
 }
 
-enum BC_STATUS crystalhd_hw_post_tx(struct crystalhd_hw *hw, struct crystalhd_dio_req *ioreq,
+enum BC_STATUS crystalhd_hw_post_tx(struct crystalhd_hw *hw,
+			     struct crystalhd_dio_req *ioreq,
 			     hw_comp_callback call_back,
 			     wait_queue_head_t *cb_event, uint32_t *list_id,
 			     uint8_t data_flags)
@@ -2047,7 +2107,8 @@
 	}
 
 	/* Get a list from TxFreeQ */
-	tx_dma_packet = (struct tx_dma_pkt *)crystalhd_dioq_fetch(hw->tx_freeq);
+	tx_dma_packet = (struct tx_dma_pkt *)crystalhd_dioq_fetch(
+						hw->tx_freeq);
 	if (!tx_dma_packet) {
 		BCMLOG_ERR("No empty elements..\n");
 		return BC_STS_ERR_USAGE;
@@ -2105,7 +2166,8 @@
 	crystalhd_start_tx_dma_engine(hw);
 	crystalhd_reg_wr(hw->adp, first_desc_u_addr, desc_addr.high_part);
 
-	crystalhd_reg_wr(hw->adp, first_desc_l_addr, desc_addr.low_part | 0x01);
+	crystalhd_reg_wr(hw->adp, first_desc_l_addr, desc_addr.low_part |
+					 0x01);
 					/* Be sure we set the valid bit ^^^^ */
 
 	return BC_STS_SUCCESS;
@@ -2120,7 +2182,8 @@
  *
  * FIX_ME: Not Tested the actual condition..
  */
-enum BC_STATUS crystalhd_hw_cancel_tx(struct crystalhd_hw *hw, uint32_t list_id)
+enum BC_STATUS crystalhd_hw_cancel_tx(struct crystalhd_hw *hw,
+					 uint32_t list_id)
 {
 	if (!hw || !list_id) {
 		BCMLOG_ERR("Invalid Arguments\n");
@@ -2134,7 +2197,7 @@
 }
 
 enum BC_STATUS crystalhd_hw_add_cap_buffer(struct crystalhd_hw *hw,
-				    struct crystalhd_dio_req *ioreq, bool en_post)
+				 struct crystalhd_dio_req *ioreq, bool en_post)
 {
 	struct crystalhd_rx_dma_pkt *rpkt;
 	uint32_t tag, uv_desc_ix = 0;
@@ -2154,7 +2217,8 @@
 	rpkt->dio_req = ioreq;
 	tag = rpkt->pkt_tag;
 
-	sts = crystalhd_xlat_sgl_to_dma_desc(ioreq, &rpkt->desc_mem, &uv_desc_ix);
+	sts = crystalhd_xlat_sgl_to_dma_desc(ioreq, &rpkt->desc_mem,
+					 &uv_desc_ix);
 	if (sts != BC_STS_SUCCESS)
 		return sts;
 
@@ -2163,7 +2227,7 @@
 	/* Store the address of UV in the rx packet for post*/
 	if (uv_desc_ix)
 		rpkt->uv_phy_addr = rpkt->desc_mem.phy_addr +
-				    (sizeof(struct dma_descriptor) * (uv_desc_ix + 1));
+			 (sizeof(struct dma_descriptor) * (uv_desc_ix + 1));
 
 	if (en_post)
 		sts = crystalhd_hw_post_cap_buff(hw, rpkt);
@@ -2190,7 +2254,8 @@
 	rpkt = crystalhd_dioq_fetch_wait(hw->rx_rdyq, timeout, &sig_pending);
 	if (!rpkt) {
 		if (sig_pending) {
-			BCMLOG(BCMLOG_INFO, "wait on frame time out %d\n", sig_pending);
+			BCMLOG(BCMLOG_INFO, "wait on frame time out %d\n",
+					 sig_pending);
 			return BC_STS_IO_USER_ABORT;
 		} else {
 			return BC_STS_TIMEOUT;
@@ -2305,7 +2370,8 @@
 	return BC_STS_SUCCESS;
 }
 
-void crystalhd_hw_stats(struct crystalhd_hw *hw, struct crystalhd_hw_stats *stats)
+void crystalhd_hw_stats(struct crystalhd_hw *hw,
+		 struct crystalhd_hw_stats *stats)
 {
 	if (!hw) {
 		BCMLOG_ERR("Invalid Arguments\n");
@@ -2378,7 +2444,8 @@
 
 		if (reg & 0x00020000) {
 			hw->prev_n = n;
-			/* FIXME: jarod: outputting a random "C" is... confusing... */
+			/* FIXME: jarod: outputting
+			 a random "C" is... confusing... */
 			BCMLOG(BCMLOG_INFO, "C");
 			return BC_STS_SUCCESS;
 		} else {

diff --git a/drivers/staging/crystalhd/crystalhd_hw.h b/drivers/staging/crystalhd/crystalhd_hw.h
index 2d0e6c6..3780944 100644
--- a/drivers/staging/crystalhd/crystalhd_hw.h
+++ b/drivers/staging/crystalhd/crystalhd_hw.h

@@ -46,7 +46,7 @@
 #define Cpu2HstMbx1		0x00100F04
 #define MbxStat1		0x00100F08
 #define Stream2Host_Intr_Sts	0x00100F24
-#define C011_RET_SUCCESS	0x0	/* Reutrn status of firmware command. */
+#define C011_RET_SUCCESS	0x0 /* Return status of firmware command. */
 
 /* TS input status register */
 #define TS_StreamAFIFOStatus	0x0010044C
@@ -103,7 +103,7 @@
 #define BC_FWIMG_ST_ADDR	0x00000000
 /* FIXME: jarod: there's a kernel function that'll do this for us... */
 #define rotr32_1(x, n)		(((x) >> n) | ((x) << (32 - n)))
-#define bswap_32_1(x)		((rotr32_1((x), 24) & 0x00ff00ff) | (rotr32_1((x), 8) & 0xff00ff00))
+#define bswap_32_1(x) ((rotr32_1((x), 24) & 0x00ff00ff) | (rotr32_1((x), 8) & 0xff00ff00))
 
 #define DecHt_HostSwReset	0x340000
 #define BC_DRAM_FW_CFG_ADDR	0x001c2000
@@ -136,9 +136,11 @@
 
 union link_misc_perst_deco_ctrl {
 	struct {
-		uint32_t	bcm7412_rst:1;		/* 1 -> BCM7412 is held in reset. Reset value 1.*/
+		uint32_t	bcm7412_rst:1;	/* 1 -> BCM7412 is held
+						in reset. Reset value 1.*/
 		uint32_t	reserved0:3;		/* Reserved.No Effect*/
-		uint32_t	stop_bcm_7412_clk:1;	/* 1 ->Stops branch of 27MHz clk used to clk BCM7412*/
+		uint32_t	stop_bcm_7412_clk:1;	/* 1 ->Stops branch of
+						27MHz clk used to clk BCM7412*/
 		uint32_t	reserved1:27;		/* Reseved. No Effect*/
 	};
 
@@ -148,13 +150,18 @@
 
 union link_misc_perst_clk_ctrl {
 	struct {
-		uint32_t	sel_alt_clk:1;	  /* When set, selects a 6.75MHz clock as the source of core_clk */
-		uint32_t	stop_core_clk:1;  /* When set, stops the branch of core_clk that is not needed for low power operation */
-		uint32_t	pll_pwr_dn:1;	  /* When set, powers down the main PLL. The alternate clock bit should be set
-						     to select an alternate clock before setting this bit.*/
+		uint32_t	sel_alt_clk:1;	  /* When set, selects a
+				 6.75MHz clock as the source of core_clk */
+		uint32_t	stop_core_clk:1;  /* When set, stops the branch
+		 of core_clk that is not needed for low power operation */
+		uint32_t	pll_pwr_dn:1;	  /* When set, powers down the
+			 main PLL. The alternate clock bit should be set to
+			 select an alternate clock before setting this bit.*/
 		uint32_t	reserved0:5;	  /* Reserved */
-		uint32_t	pll_mult:8;	  /* This setting controls the multiplier for the PLL. */
-		uint32_t	pll_div:4;	  /* This setting controls the divider for the PLL. */
+		uint32_t	pll_mult:8;	  /* This setting controls
+						 the multiplier for the PLL. */
+		uint32_t	pll_div:4;	  /* This setting controls
+						 the divider for the PLL. */
 		uint32_t	reserved1:12;	  /* Reserved */
 	};
 
@@ -164,9 +171,11 @@
 
 union link_misc_perst_decoder_ctrl {
 	struct {
-		uint32_t	bcm_7412_rst:1; /* 1 -> BCM7412 is held in reset. Reset value 1.*/
+		uint32_t	bcm_7412_rst:1; /* 1 -> BCM7412 is held
+						 in reset. Reset value 1.*/
 		uint32_t	res0:3; /* Reserved.No Effect*/
-		uint32_t	stop_7412_clk:1; /* 1 ->Stops branch of 27MHz clk used to clk BCM7412*/
+		uint32_t	stop_7412_clk:1; /* 1 ->Stops branch of 27MHz
+						 clk used to clk BCM7412*/
 		uint32_t	res1:27; /* Reseved. No Effect */
 	};
 
@@ -225,10 +234,12 @@
  * The  virtual address will determine what should be freed.
  */
 struct dma_desc_mem {
-	struct dma_descriptor	*pdma_desc_start; /* 32-bytes for dma descriptor. should be first element */
-	dma_addr_t		phy_addr;	/* physical address of each DMA desc */
+	struct dma_descriptor	*pdma_desc_start; /* 32-bytes for dma
+				 descriptor. should be first element */
+	dma_addr_t		phy_addr;	/* physical address
+						 of each DMA desc */
 	uint32_t		sz;
-	struct _dma_desc_mem_	*Next;		/* points to Next Descriptor in chain */
+	struct _dma_desc_mem_	*Next; /* points to Next Descriptor in chain */
 
 };
 
@@ -323,50 +334,54 @@
 #define CLOCK_PRESET 175
 
 /* DMA engine register BIT mask wrappers.. */
-#define DMA_START_BIT		MISC1_TX_SW_DESC_LIST_CTRL_STS_TX_DMA_RUN_STOP_MASK
+#define DMA_START_BIT	MISC1_TX_SW_DESC_LIST_CTRL_STS_TX_DMA_RUN_STOP_MASK
 
-#define GET_RX_INTR_MASK (INTR_INTR_STATUS_L1_UV_RX_DMA_ERR_INTR_MASK |		\
-			  INTR_INTR_STATUS_L1_UV_RX_DMA_DONE_INTR_MASK |	\
-			  INTR_INTR_STATUS_L1_Y_RX_DMA_ERR_INTR_MASK |		\
-			  INTR_INTR_STATUS_L1_Y_RX_DMA_DONE_INTR_MASK |		\
-			  INTR_INTR_STATUS_L0_UV_RX_DMA_ERR_INTR_MASK |		\
-			  INTR_INTR_STATUS_L0_UV_RX_DMA_DONE_INTR_MASK |	\
-			  INTR_INTR_STATUS_L0_Y_RX_DMA_ERR_INTR_MASK |		\
-			  INTR_INTR_STATUS_L0_Y_RX_DMA_DONE_INTR_MASK)
+#define GET_RX_INTR_MASK (INTR_INTR_STATUS_L1_UV_RX_DMA_ERR_INTR_MASK |	\
+	INTR_INTR_STATUS_L1_UV_RX_DMA_DONE_INTR_MASK |	\
+	INTR_INTR_STATUS_L1_Y_RX_DMA_ERR_INTR_MASK |		\
+	INTR_INTR_STATUS_L1_Y_RX_DMA_DONE_INTR_MASK |		\
+	INTR_INTR_STATUS_L0_UV_RX_DMA_ERR_INTR_MASK |		\
+	INTR_INTR_STATUS_L0_UV_RX_DMA_DONE_INTR_MASK |	\
+	INTR_INTR_STATUS_L0_Y_RX_DMA_ERR_INTR_MASK |		\
+	INTR_INTR_STATUS_L0_Y_RX_DMA_DONE_INTR_MASK)
 
-#define GET_Y0_ERR_MSK (MISC1_Y_RX_ERROR_STATUS_RX_L0_OVERRUN_ERROR_MASK |		\
-			MISC1_Y_RX_ERROR_STATUS_RX_L0_UNDERRUN_ERROR_MASK |		\
-			MISC1_Y_RX_ERROR_STATUS_RX_L0_DESC_TX_ABORT_ERRORS_MASK |	\
-			MISC1_Y_RX_ERROR_STATUS_RX_L0_FIFO_FULL_ERRORS_MASK)
+#define GET_Y0_ERR_MSK (MISC1_Y_RX_ERROR_STATUS_RX_L0_OVERRUN_ERROR_MASK | \
+	MISC1_Y_RX_ERROR_STATUS_RX_L0_UNDERRUN_ERROR_MASK |		\
+	MISC1_Y_RX_ERROR_STATUS_RX_L0_DESC_TX_ABORT_ERRORS_MASK |	\
+	MISC1_Y_RX_ERROR_STATUS_RX_L0_FIFO_FULL_ERRORS_MASK)
 
-#define GET_UV0_ERR_MSK (MISC1_UV_RX_ERROR_STATUS_RX_L0_OVERRUN_ERROR_MASK |		\
-			 MISC1_UV_RX_ERROR_STATUS_RX_L0_UNDERRUN_ERROR_MASK |		\
-			 MISC1_UV_RX_ERROR_STATUS_RX_L0_DESC_TX_ABORT_ERRORS_MASK |	\
-			 MISC1_UV_RX_ERROR_STATUS_RX_L0_FIFO_FULL_ERRORS_MASK)
+#define GET_UV0_ERR_MSK (MISC1_UV_RX_ERROR_STATUS_RX_L0_OVERRUN_ERROR_MASK | \
+	MISC1_UV_RX_ERROR_STATUS_RX_L0_UNDERRUN_ERROR_MASK |		\
+	MISC1_UV_RX_ERROR_STATUS_RX_L0_DESC_TX_ABORT_ERRORS_MASK |	\
+	MISC1_UV_RX_ERROR_STATUS_RX_L0_FIFO_FULL_ERRORS_MASK)
 
-#define GET_Y1_ERR_MSK (MISC1_Y_RX_ERROR_STATUS_RX_L1_OVERRUN_ERROR_MASK |		\
-			MISC1_Y_RX_ERROR_STATUS_RX_L1_UNDERRUN_ERROR_MASK |		\
-			MISC1_Y_RX_ERROR_STATUS_RX_L1_DESC_TX_ABORT_ERRORS_MASK |	\
-			MISC1_Y_RX_ERROR_STATUS_RX_L1_FIFO_FULL_ERRORS_MASK)
+#define GET_Y1_ERR_MSK (MISC1_Y_RX_ERROR_STATUS_RX_L1_OVERRUN_ERROR_MASK | \
+	MISC1_Y_RX_ERROR_STATUS_RX_L1_UNDERRUN_ERROR_MASK |		\
+	MISC1_Y_RX_ERROR_STATUS_RX_L1_DESC_TX_ABORT_ERRORS_MASK |	\
+	MISC1_Y_RX_ERROR_STATUS_RX_L1_FIFO_FULL_ERRORS_MASK)
 
-#define GET_UV1_ERR_MSK	(MISC1_UV_RX_ERROR_STATUS_RX_L1_OVERRUN_ERROR_MASK |		\
-			 MISC1_UV_RX_ERROR_STATUS_RX_L1_UNDERRUN_ERROR_MASK |		\
-			 MISC1_UV_RX_ERROR_STATUS_RX_L1_DESC_TX_ABORT_ERRORS_MASK |	\
-			 MISC1_UV_RX_ERROR_STATUS_RX_L1_FIFO_FULL_ERRORS_MASK)
+#define GET_UV1_ERR_MSK	(MISC1_UV_RX_ERROR_STATUS_RX_L1_OVERRUN_ERROR_MASK | \
+	MISC1_UV_RX_ERROR_STATUS_RX_L1_UNDERRUN_ERROR_MASK |		\
+	MISC1_UV_RX_ERROR_STATUS_RX_L1_DESC_TX_ABORT_ERRORS_MASK |	\
+	MISC1_UV_RX_ERROR_STATUS_RX_L1_FIFO_FULL_ERRORS_MASK)
 
 
 /**** API Exposed to the other layers ****/
 enum BC_STATUS crystalhd_download_fw(struct crystalhd_adp *adp,
 			      void *buffer, uint32_t sz);
-enum BC_STATUS crystalhd_do_fw_cmd(struct crystalhd_hw *hw, struct BC_FW_CMD *fw_cmd);
-bool crystalhd_hw_interrupt(struct crystalhd_adp *adp, struct crystalhd_hw *hw);
-enum BC_STATUS crystalhd_hw_open(struct crystalhd_hw *, struct crystalhd_adp *);
+enum BC_STATUS crystalhd_do_fw_cmd(struct crystalhd_hw *hw,
+				 struct BC_FW_CMD *fw_cmd);
+bool crystalhd_hw_interrupt(struct crystalhd_adp *adp,
+				 struct crystalhd_hw *hw);
+enum BC_STATUS crystalhd_hw_open(struct crystalhd_hw *,
+				 struct crystalhd_adp *);
 enum BC_STATUS crystalhd_hw_close(struct crystalhd_hw *);
 enum BC_STATUS crystalhd_hw_setup_dma_rings(struct crystalhd_hw *);
 enum BC_STATUS crystalhd_hw_free_dma_rings(struct crystalhd_hw *);
 
 
-enum BC_STATUS crystalhd_hw_post_tx(struct crystalhd_hw *hw, struct crystalhd_dio_req *ioreq,
+enum BC_STATUS crystalhd_hw_post_tx(struct crystalhd_hw *hw,
+			     struct crystalhd_dio_req *ioreq,
 			     hw_comp_callback call_back,
 			     wait_queue_head_t *cb_event,
 			     uint32_t *list_id, uint8_t data_flags);
@@ -374,15 +389,17 @@
 enum BC_STATUS crystalhd_hw_pause(struct crystalhd_hw *hw);
 enum BC_STATUS crystalhd_hw_unpause(struct crystalhd_hw *hw);
 enum BC_STATUS crystalhd_hw_suspend(struct crystalhd_hw *hw);
-enum BC_STATUS crystalhd_hw_cancel_tx(struct crystalhd_hw *hw, uint32_t list_id);
+enum BC_STATUS crystalhd_hw_cancel_tx(struct crystalhd_hw *hw,
+				 uint32_t list_id);
 enum BC_STATUS crystalhd_hw_add_cap_buffer(struct crystalhd_hw *hw,
-				    struct crystalhd_dio_req *ioreq, bool en_post);
+			 struct crystalhd_dio_req *ioreq, bool en_post);
 enum BC_STATUS crystalhd_hw_get_cap_buffer(struct crystalhd_hw *hw,
 				    struct BC_PIC_INFO_BLOCK *pib,
 				    struct crystalhd_dio_req **ioreq);
 enum BC_STATUS crystalhd_hw_stop_capture(struct crystalhd_hw *hw);
 enum BC_STATUS crystalhd_hw_start_capture(struct crystalhd_hw *hw);
-void crystalhd_hw_stats(struct crystalhd_hw *hw, struct crystalhd_hw_stats *stats);
+void crystalhd_hw_stats(struct crystalhd_hw *hw,
+			 struct crystalhd_hw_stats *stats);
 
 /* API to program the core clock on the decoder */
 enum BC_STATUS crystalhd_hw_set_core_clock(struct crystalhd_hw *);

diff --git a/drivers/staging/crystalhd/crystalhd_lnx.c b/drivers/staging/crystalhd/crystalhd_lnx.c
index 85f51fb..c1f6163 100644
--- a/drivers/staging/crystalhd/crystalhd_lnx.c
+++ b/drivers/staging/crystalhd/crystalhd_lnx.c

@@ -75,7 +75,8 @@
 	return 0;
 }
 
-struct crystalhd_ioctl_data *chd_dec_alloc_iodata(struct crystalhd_adp *adp, bool isr)
+struct crystalhd_ioctl_data *chd_dec_alloc_iodata(struct crystalhd_adp *adp,
+					 bool isr)
 {
 	unsigned long flags = 0;
 	struct crystalhd_ioctl_data *temp;
@@ -95,8 +96,8 @@
 	return temp;
 }
 
-void chd_dec_free_iodata(struct crystalhd_adp *adp, struct crystalhd_ioctl_data *iodata,
-			 bool isr)
+void chd_dec_free_iodata(struct crystalhd_adp *adp,
+			 struct crystalhd_ioctl_data *iodata, bool isr)
 {
 	unsigned long flags = 0;
 
@@ -109,7 +110,8 @@
 	spin_unlock_irqrestore(&adp->lock, flags);
 }
 
-static inline int crystalhd_user_data(unsigned long ud, void *dr, int size, int set)
+static inline int crystalhd_user_data(unsigned long ud, void *dr,
+			 int size, int set)
 {
 	int rc;
 
@@ -131,8 +133,8 @@
 	return rc;
 }
 
-static int chd_dec_fetch_cdata(struct crystalhd_adp *adp, struct crystalhd_ioctl_data *io,
-			       uint32_t m_sz, unsigned long ua)
+static int chd_dec_fetch_cdata(struct crystalhd_adp *adp,
+	 struct crystalhd_ioctl_data *io, uint32_t m_sz, unsigned long ua)
 {
 	unsigned long ua_off;
 	int rc = 0;
@@ -163,7 +165,7 @@
 }
 
 static int chd_dec_release_cdata(struct crystalhd_adp *adp,
-				 struct crystalhd_ioctl_data *io, unsigned long ua)
+			 struct crystalhd_ioctl_data *io, unsigned long ua)
 {
 	unsigned long ua_off;
 	int rc;
@@ -178,8 +180,9 @@
 		rc = crystalhd_user_data(ua_off, io->add_cdata,
 					io->add_cdata_sz, 1);
 		if (rc) {
-			BCMLOG_ERR("failed to push add_cdata sz:%x ua_off:%x\n",
-				   io->add_cdata_sz, (unsigned int)ua_off);
+			BCMLOG_ERR(
+				"failed to push add_cdata sz:%x ua_off:%x\n",
+				 io->add_cdata_sz, (unsigned int)ua_off);
 			return -ENODATA;
 		}
 	}
@@ -252,10 +255,7 @@
 		rc = chd_dec_proc_user_data(adp, temp, ua, 1);
 	}
 
-	if (temp) {
-		chd_dec_free_iodata(adp, temp, 0);
-		temp = NULL;
-	}
+	chd_dec_free_iodata(adp, temp, 0);
 
 	return rc;
 }
@@ -378,8 +378,8 @@
 		goto class_create_fail;
 	}
 
-	dev = device_create(crystalhd_class, NULL, MKDEV(adp->chd_dec_major, 0),
-			    NULL, "crystalhd");
+	dev = device_create(crystalhd_class, NULL,
+			 MKDEV(adp->chd_dec_major, 0), NULL, "crystalhd");
 	if (IS_ERR(dev)) {
 		rc = PTR_ERR(dev);
 		BCMLOG_ERR("failed to create device\n");
@@ -394,7 +394,8 @@
 
 	/* Allocate general purpose ioctl pool. */
 	for (i = 0; i < CHD_IODATA_POOL_SZ; i++) {
-		temp = kzalloc(sizeof(struct crystalhd_ioctl_data), GFP_KERNEL);
+		temp = kzalloc(sizeof(struct crystalhd_ioctl_data),
+					 GFP_KERNEL);
 		if (!temp) {
 			BCMLOG_ERR("ioctl data pool kzalloc failed\n");
 			rc = -ENOMEM;

diff --git a/drivers/staging/crystalhd/crystalhd_lnx.h b/drivers/staging/crystalhd/crystalhd_lnx.h
index a9e3633..bac572a 100644
--- a/drivers/staging/crystalhd/crystalhd_lnx.h
+++ b/drivers/staging/crystalhd/crystalhd_lnx.h

@@ -77,8 +77,8 @@
 	int		chd_dec_major;
 	unsigned int		cfg_users;
 
-	struct crystalhd_ioctl_data	*idata_free_head;	/* ioctl data pool */
-	struct crystalhd_elem		*elem_pool_head;	/* Queue element pool */
+	struct crystalhd_ioctl_data	*idata_free_head; /* ioctl data pool */
+	struct crystalhd_elem	*elem_pool_head; /* Queue element pool */
 
 	struct crystalhd_cmd	cmds;
 

diff --git a/drivers/staging/crystalhd/crystalhd_misc.c b/drivers/staging/crystalhd/crystalhd_misc.c
index a5f109c..51f6980 100644
--- a/drivers/staging/crystalhd/crystalhd_misc.c
+++ b/drivers/staging/crystalhd/crystalhd_misc.c

@@ -30,19 +30,22 @@
 
 uint32_t g_linklog_level;
 
-static inline uint32_t crystalhd_dram_rd(struct crystalhd_adp *adp, uint32_t mem_off)
+static inline uint32_t crystalhd_dram_rd(struct crystalhd_adp *adp,
+					 uint32_t mem_off)
 {
 	crystalhd_reg_wr(adp, DCI_DRAM_BASE_ADDR, (mem_off >> 19));
 	return bc_dec_reg_rd(adp, (0x00380000 | (mem_off & 0x0007FFFF)));
 }
 
-static inline void crystalhd_dram_wr(struct crystalhd_adp *adp, uint32_t mem_off, uint32_t val)
+static inline void crystalhd_dram_wr(struct crystalhd_adp *adp,
+					 uint32_t mem_off, uint32_t val)
 {
 	crystalhd_reg_wr(adp, DCI_DRAM_BASE_ADDR, (mem_off >> 19));
 	bc_dec_reg_wr(adp, (0x00380000 | (mem_off & 0x0007FFFF)), val);
 }
 
-static inline enum BC_STATUS bc_chk_dram_range(struct crystalhd_adp *adp, uint32_t start_off, uint32_t cnt)
+static inline enum BC_STATUS bc_chk_dram_range(struct crystalhd_adp *adp,
+					 uint32_t start_off, uint32_t cnt)
 {
 	return BC_STS_SUCCESS;
 }
@@ -66,7 +69,8 @@
 	return temp;
 }
 
-static void crystalhd_free_dio(struct crystalhd_adp *adp, struct crystalhd_dio_req *dio)
+static void crystalhd_free_dio(struct crystalhd_adp *adp,
+					 struct crystalhd_dio_req *dio)
 {
 	unsigned long flags = 0;
 
@@ -99,7 +103,8 @@
 
 	return temp;
 }
-static void crystalhd_free_elem(struct crystalhd_adp *adp, struct crystalhd_elem *elem)
+static void crystalhd_free_elem(struct crystalhd_adp *adp,
+					 struct crystalhd_elem *elem)
 {
 	unsigned long flags = 0;
 
@@ -120,7 +125,8 @@
 #endif
 }
 
-static inline void crystalhd_init_sg(struct scatterlist *sg, unsigned int entries)
+static inline void crystalhd_init_sg(struct scatterlist *sg,
+					 unsigned int entries)
 {
 	/* http://lkml.org/lkml/2007/11/27/68 */
 	sg_init_table(sg, entries);
@@ -208,7 +214,8 @@
  * configuration space.
  *
  */
-void crystalhd_reg_wr(struct crystalhd_adp *adp, uint32_t reg_off, uint32_t val)
+void crystalhd_reg_wr(struct crystalhd_adp *adp, uint32_t reg_off,
+					 uint32_t val)
 {
 	if (!adp || (reg_off > adp->pci_i2o_len)) {
 		BCMLOG_ERR("link_wr_reg_off outof range: 0x%08x\n", reg_off);
@@ -469,7 +476,8 @@
  * by calling the call back provided during creation.
  *
  */
-void crystalhd_delete_dioq(struct crystalhd_adp *adp, struct crystalhd_dioq *dioq)
+void crystalhd_delete_dioq(struct crystalhd_adp *adp,
+			 struct crystalhd_dioq *dioq)
 {
 	void *temp;
 
@@ -639,7 +647,8 @@
 	while ((ioq->count == 0) && count) {
 		spin_unlock_irqrestore(&ioq->lock, flags);
 
-		crystalhd_wait_on_event(&ioq->event, (ioq->count > 0), 1000, rc, 0);
+		crystalhd_wait_on_event(&ioq->event,
+				 (ioq->count > 0), 1000, rc, 0);
 		if (rc == 0) {
 			goto out;
 		} else if (rc == -EINTR) {
@@ -678,7 +687,8 @@
 			  struct crystalhd_dio_req **dio_hnd)
 {
 	struct crystalhd_dio_req	*dio;
-	/* FIXME: jarod: should some of these unsigned longs be uint32_t or uintptr_t? */
+	/* FIXME: jarod: should some of these
+	 unsigned longs be uint32_t or uintptr_t? */
 	unsigned long start = 0, end = 0, uaddr = 0, count = 0;
 	unsigned long spsz = 0, uv_start = 0;
 	int i = 0, rw = 0, res = 0, nr_pages = 0, skip_fb_sg = 0;
@@ -723,7 +733,8 @@
 	if (uv_offset) {
 		uv_start = (uaddr + (unsigned long)uv_offset)  >> PAGE_SHIFT;
 		dio->uinfo.uv_sg_ix = uv_start - start;
-		dio->uinfo.uv_sg_off = ((uaddr + (unsigned long)uv_offset) & ~PAGE_MASK);
+		dio->uinfo.uv_sg_off = ((uaddr + (unsigned long)uv_offset) &
+					 ~PAGE_MASK);
 	}
 
 	dio->fb_size = ubuff_sz & 0x03;
@@ -819,7 +830,8 @@
  *
  * This routine is to unmap the user buffer pages.
  */
-enum BC_STATUS crystalhd_unmap_dio(struct crystalhd_adp *adp, struct crystalhd_dio_req *dio)
+enum BC_STATUS crystalhd_unmap_dio(struct crystalhd_adp *adp,
+				 struct crystalhd_dio_req *dio)
 {
 	struct page *page = NULL;
 	int j = 0;
@@ -841,7 +853,8 @@
 		}
 	}
 	if (dio->sig == crystalhd_dio_sg_mapped)
-		pci_unmap_sg(adp->pdev, dio->sg, dio->page_cnt, dio->direction);
+		pci_unmap_sg(adp->pdev, dio->sg, dio->page_cnt,
+			 dio->direction);
 
 	crystalhd_free_dio(adp, dio);
 

diff --git a/drivers/staging/crystalhd/crystalhd_misc.h b/drivers/staging/crystalhd/crystalhd_misc.h
index 8cdaa7a..4dae3a7 100644
--- a/drivers/staging/crystalhd/crystalhd_misc.h
+++ b/drivers/staging/crystalhd/crystalhd_misc.h

@@ -127,12 +127,16 @@
 void crystalhd_reg_wr(struct crystalhd_adp *, uint32_t, uint32_t);
 
 /*========= Decoder (7412) memory access routines..=================*/
-enum BC_STATUS crystalhd_mem_rd(struct crystalhd_adp *, uint32_t, uint32_t, uint32_t *);
-enum BC_STATUS crystalhd_mem_wr(struct crystalhd_adp *, uint32_t, uint32_t, uint32_t *);
+enum BC_STATUS crystalhd_mem_rd(struct crystalhd_adp *,
+			 uint32_t, uint32_t, uint32_t *);
+enum BC_STATUS crystalhd_mem_wr(struct crystalhd_adp *,
+			 uint32_t, uint32_t, uint32_t *);
 
 /*==========Link (70012) PCIe Config access routines.================*/
-enum BC_STATUS crystalhd_pci_cfg_rd(struct crystalhd_adp *, uint32_t, uint32_t, uint32_t *);
-enum BC_STATUS crystalhd_pci_cfg_wr(struct crystalhd_adp *, uint32_t, uint32_t, uint32_t);
+enum BC_STATUS crystalhd_pci_cfg_rd(struct crystalhd_adp *,
+			 uint32_t, uint32_t, uint32_t *);
+enum BC_STATUS crystalhd_pci_cfg_wr(struct crystalhd_adp *,
+			 uint32_t, uint32_t, uint32_t);
 
 /*========= Linux Kernel Interface routines. ======================= */
 void *bc_kern_dma_alloc(struct crystalhd_adp *, uint32_t, dma_addr_t *);
@@ -168,20 +172,26 @@
 /*================ Direct IO mapping routines ==================*/
 extern int crystalhd_create_dio_pool(struct crystalhd_adp *, uint32_t);
 extern void crystalhd_destroy_dio_pool(struct crystalhd_adp *);
-extern enum BC_STATUS crystalhd_map_dio(struct crystalhd_adp *, void *, uint32_t,
-				   uint32_t, bool, bool, struct crystalhd_dio_req**);
+extern enum BC_STATUS crystalhd_map_dio(struct crystalhd_adp *, void *,
+		 uint32_t, uint32_t, bool, bool, struct crystalhd_dio_req**);
 
-extern enum BC_STATUS crystalhd_unmap_dio(struct crystalhd_adp *, struct crystalhd_dio_req*);
+extern enum BC_STATUS crystalhd_unmap_dio(struct crystalhd_adp *,
+					 struct crystalhd_dio_req*);
 #define crystalhd_get_sgle_paddr(_dio, _ix) (cpu_to_le64(sg_dma_address(&_dio->sg[_ix])))
 #define crystalhd_get_sgle_len(_dio, _ix) (cpu_to_le32(sg_dma_len(&_dio->sg[_ix])))
 
 /*================ General Purpose Queues ==================*/
-extern enum BC_STATUS crystalhd_create_dioq(struct crystalhd_adp *, struct crystalhd_dioq **, crystalhd_data_free_cb , void *);
-extern void crystalhd_delete_dioq(struct crystalhd_adp *, struct crystalhd_dioq *);
-extern enum BC_STATUS crystalhd_dioq_add(struct crystalhd_dioq *ioq, void *data, bool wake, uint32_t tag);
+extern enum BC_STATUS crystalhd_create_dioq(struct crystalhd_adp *,
+		 struct crystalhd_dioq **, crystalhd_data_free_cb , void *);
+extern void crystalhd_delete_dioq(struct crystalhd_adp *,
+		 struct crystalhd_dioq *);
+extern enum BC_STATUS crystalhd_dioq_add(struct crystalhd_dioq *ioq,
+		 void *data, bool wake, uint32_t tag);
 extern void *crystalhd_dioq_fetch(struct crystalhd_dioq *ioq);
-extern void *crystalhd_dioq_find_and_fetch(struct crystalhd_dioq *ioq, uint32_t tag);
-extern void *crystalhd_dioq_fetch_wait(struct crystalhd_dioq *ioq, uint32_t to_secs, uint32_t *sig_pend);
+extern void *crystalhd_dioq_find_and_fetch(struct crystalhd_dioq *ioq,
+		 uint32_t tag);
+extern void *crystalhd_dioq_fetch_wait(struct crystalhd_dioq *ioq,
+		 uint32_t to_secs, uint32_t *sig_pend);
 
 #define crystalhd_dioq_count(_ioq)	((_ioq) ? _ioq->count : 0)
 
@@ -190,7 +200,8 @@
 
 
 /*================ Debug routines/macros .. ================================*/
-extern void crystalhd_show_buffer(uint32_t off, uint8_t *buff, uint32_t dwcount);
+extern void crystalhd_show_buffer(uint32_t off, uint8_t *buff,
+		 uint32_t dwcount);
 
 enum _chd_log_levels {
 	BCMLOG_ERROR		= 0x80000000,	/* Don't disable this option */

diff --git a/drivers/staging/csr/bh.c b/drivers/staging/csr/bh.c
index b53a9e2..d795852 100644
--- a/drivers/staging/csr/bh.c
+++ b/drivers/staging/csr/bh.c

@@ -241,67 +241,72 @@
 	this_thread = &priv->bh_thread;
 
 	t = timeout = 0;
-    while (!kthread_should_stop()) {
-        /* wait until an error occurs, or we need to process something. */
-        unifi_trace(priv, UDBG3, "bh_thread goes to sleep.\n");
+	while (!kthread_should_stop()) {
+		/*
+		* wait until an error occurs,
+		* or we need to process something.
+		*/
+		unifi_trace(priv, UDBG3, "bh_thread goes to sleep.\n");
 
-        if (timeout > 0) {
-            /* Convert t in ms to jiffies */
-            t = msecs_to_jiffies(timeout);
-            ret = wait_event_interruptible_timeout(this_thread->wakeup_q,
-                    (this_thread->wakeup_flag && !this_thread->block_thread) ||
-                    kthread_should_stop(),
-                    t);
-            timeout = (ret > 0) ? jiffies_to_msecs(ret) : 0;
-        } else {
-            ret = wait_event_interruptible(this_thread->wakeup_q,
-                    (this_thread->wakeup_flag && !this_thread->block_thread) ||
-                    kthread_should_stop());
-        }
+		if (timeout > 0) {
+			/* Convert t in ms to jiffies */
+			t = msecs_to_jiffies(timeout);
+			ret = wait_event_interruptible_timeout(
+				this_thread->wakeup_q,
+				(this_thread->wakeup_flag && !this_thread->block_thread) ||
+				kthread_should_stop(),
+				t);
+			timeout = (ret > 0) ? jiffies_to_msecs(ret) : 0;
+		} else {
+			ret = wait_event_interruptible(this_thread->wakeup_q,
+				(this_thread->wakeup_flag && !this_thread->block_thread) ||
+				kthread_should_stop());
+		}
 
-        if (kthread_should_stop()) {
-            unifi_trace(priv, UDBG2, "bh_thread: signalled to exit\n");
-            break;
-        }
+		if (kthread_should_stop()) {
+			unifi_trace(priv, UDBG2,
+				"bh_thread: signalled to exit\n");
+			break;
+		}
 
-        if (ret < 0) {
-            unifi_notice(priv,
-                    "bh_thread: wait_event returned %d, thread will exit\n",
-                    ret);
-            uf_wait_for_thread_to_stop(priv, this_thread);
-            break;
-        }
+		if (ret < 0) {
+			unifi_notice(priv,
+				"bh_thread: wait_event returned %d, thread will exit\n",
+				ret);
+			uf_wait_for_thread_to_stop(priv, this_thread);
+			break;
+		}
 
-        this_thread->wakeup_flag = 0;
+		this_thread->wakeup_flag = 0;
 
-        unifi_trace(priv, UDBG3, "bh_thread calls unifi_bh().\n");
+		unifi_trace(priv, UDBG3, "bh_thread calls unifi_bh().\n");
 
-        CsrSdioClaim(priv->sdio);
-        csrResult = unifi_bh(priv->card, &timeout);
-        if(csrResult != CSR_RESULT_SUCCESS) {
-            if (csrResult == CSR_WIFI_HIP_RESULT_NO_DEVICE) {
-                CsrSdioRelease(priv->sdio);
-                uf_wait_for_thread_to_stop(priv, this_thread);
-                break;
-            }
-            /* Errors must be delivered to the error task */
-            handle_bh_error(priv);
-        }
-        CsrSdioRelease(priv->sdio);
-    }
+		CsrSdioClaim(priv->sdio);
+		csrResult = unifi_bh(priv->card, &timeout);
+		if (csrResult != CSR_RESULT_SUCCESS) {
+			if (csrResult == CSR_WIFI_HIP_RESULT_NO_DEVICE) {
+				CsrSdioRelease(priv->sdio);
+				uf_wait_for_thread_to_stop(priv, this_thread);
+				break;
+			}
+			/* Errors must be delivered to the error task */
+			handle_bh_error(priv);
+		}
+		CsrSdioRelease(priv->sdio);
+	}
 
-    /*
-     * I would normally try to call csr_sdio_remove_irq() here to make sure
-     * that we do not get any interrupts while this thread is not running.
-     * However, the MMC/SDIO driver tries to kill its' interrupt thread.
-     * The kernel threads implementation does not allow to kill threads
-     * from a signalled to stop thread.
-     * So, instead call csr_sdio_linux_remove_irq() always after calling
-     * uf_stop_thread() to kill this thread.
-     */
+	/*
+	 * I would normally try to call csr_sdio_remove_irq() here to make sure
+	* that we do not get any interrupts while this thread is not running.
+	* However, the MMC/SDIO driver tries to kill its' interrupt thread.
+	* The kernel threads implementation does not allow to kill threads
+	* from a signalled to stop thread.
+	* So, instead call csr_sdio_linux_remove_irq() always after calling
+	* uf_stop_thread() to kill this thread.
+	*/
 
-    unifi_trace(priv, UDBG2, "bh_thread exiting....\n");
-    return 0;
+	unifi_trace(priv, UDBG2, "bh_thread exiting....\n");
+	return 0;
 } /* bh_thread_function() */
 
 
@@ -319,33 +324,33 @@
  *      0 on success or else a Linux error code.
  * ---------------------------------------------------------------------------
  */
-    int
+int
 uf_init_bh(unifi_priv_t *priv)
 {
-    int r;
+	int r;
 
-    /* Enable mlme interface. */
-    priv->io_aborted = 0;
+	/* Enable mlme interface. */
+	priv->io_aborted = 0;
 
 
-    /* Start the BH thread */
-    r = uf_start_thread(priv, &priv->bh_thread, bh_thread_function);
-    if (r) {
-        unifi_error(priv,
-                "uf_init_bh: failed to start the BH thread.\n");
-        return r;
-    }
+	/* Start the BH thread */
+	r = uf_start_thread(priv, &priv->bh_thread, bh_thread_function);
+	if (r) {
+		unifi_error(priv,
+			"uf_init_bh: failed to start the BH thread.\n");
+		return r;
+	}
 
-    /* Allow interrupts */
-    r = csr_sdio_linux_install_irq(priv->sdio);
-    if (r) {
-        unifi_error(priv,
-                "uf_init_bh: failed to install the IRQ.\n");
+	/* Allow interrupts */
+	r = csr_sdio_linux_install_irq(priv->sdio);
+	if (r) {
+		unifi_error(priv,
+			"uf_init_bh: failed to install the IRQ.\n");
 
-        uf_stop_thread(priv, &priv->bh_thread);
-    }
+		uf_stop_thread(priv, &priv->bh_thread);
+	}
 
-    return r;
+	return r;
 } /* uf_init_bh() */
 
 
@@ -370,28 +375,30 @@
  */
 CsrResult unifi_run_bh(void *ospriv)
 {
-    unifi_priv_t *priv = ospriv;
+	unifi_priv_t *priv = ospriv;
 
-    /*
-     * If an error has occurred, we discard silently all messages from the bh
-     * until the error has been processed and the unifi has been reinitialised.
-     */
-    if (priv->bh_thread.block_thread == 1) {
-        unifi_trace(priv, UDBG3, "unifi_run_bh: discard message.\n");
-        /*
-         * Do not try to acknowledge a pending interrupt here.
-         * This function is called by unifi_send_signal() which in turn can be
-         * running in an atomic or 'disabled irq' level if a signal is sent
-         * from a workqueue task (i.e multicass addresses set).
-         * We can not hold the SDIO lock because it might sleep.
-         */
-        return CSR_RESULT_FAILURE;
-    }
+	/*
+	* If an error has occurred, we discard silently all messages from the bh
+	* until the error has been processed and the unifi has been
+	* reinitialised.
+	*/
+	if (priv->bh_thread.block_thread == 1) {
+		unifi_trace(priv, UDBG3, "unifi_run_bh: discard message.\n");
+		/*
+		* Do not try to acknowledge a pending interrupt here.
+		* This function is called by unifi_send_signal()
+		* which in turn can be running in an atomic or 'disabled irq'
+		* level if a signal is sent from a workqueue task
+		* (i.e multicass addresses set). We can not hold the SDIO lock
+		* because it might sleep.
+		*/
+		return CSR_RESULT_FAILURE;
+	}
 
-    priv->bh_thread.wakeup_flag = 1;
-    /* wake up I/O thread */
-    wake_up_interruptible(&priv->bh_thread.wakeup_q);
+	priv->bh_thread.wakeup_flag = 1;
+	/* wake up I/O thread */
+	wake_up_interruptible(&priv->bh_thread.wakeup_q);
 
-    return CSR_RESULT_SUCCESS;
+	return CSR_RESULT_SUCCESS;
 } /* unifi_run_bh() */
 

diff --git a/drivers/staging/csr/csr_framework_ext.c b/drivers/staging/csr/csr_framework_ext.c
index 2aabb6c..98122bc 100644
--- a/drivers/staging/csr/csr_framework_ext.c
+++ b/drivers/staging/csr/csr_framework_ext.c

@@ -1,10 +1,10 @@
 /*****************************************************************************
 
-            (c) Cambridge Silicon Radio Limited 2010
-            All rights reserved and confidential information of CSR
+		(c) Cambridge Silicon Radio Limited 2010
+		All rights reserved and confidential information of CSR
 
-            Refer to LICENSE.txt included with this source for details
-            on the license terms.
+		Refer to LICENSE.txt included with this source for details
+		on the license terms.
 
 *****************************************************************************/
 
@@ -31,10 +31,10 @@
  *----------------------------------------------------------------------------*/
 void CsrThreadSleep(u16 sleepTimeInMs)
 {
-    unsigned long t;
+	unsigned long t;
 
-    /* Convert t in ms to jiffies and round up */
-    t = ((sleepTimeInMs * HZ) + 999) / 1000;
-    schedule_timeout_uninterruptible(t);
+	/* Convert t in ms to jiffies and round up */
+	t = ((sleepTimeInMs * HZ) + 999) / 1000;
+	schedule_timeout_uninterruptible(t);
 }
 EXPORT_SYMBOL_GPL(CsrThreadSleep);

diff --git a/drivers/staging/csr/csr_framework_ext.h b/drivers/staging/csr/csr_framework_ext.h
index e8ae490..6d26ac6 100644
--- a/drivers/staging/csr/csr_framework_ext.h
+++ b/drivers/staging/csr/csr_framework_ext.h

@@ -2,11 +2,11 @@
 #define CSR_FRAMEWORK_EXT_H__
 /*****************************************************************************
 
-            (c) Cambridge Silicon Radio Limited 2010
-            All rights reserved and confidential information of CSR
+		(c) Cambridge Silicon Radio Limited 2010
+	All rights reserved and confidential information of CSR
 
-            Refer to LICENSE.txt included with this source for details
-            on the license terms.
+		Refer to LICENSE.txt included with this source for details
+		on the license terms.
 
 *****************************************************************************/
 

diff --git a/drivers/staging/csr/csr_wifi_nme_ap_sef.c b/drivers/staging/csr/csr_wifi_nme_ap_sef.c
index e048848..bfebb15 100644
--- a/drivers/staging/csr/csr_wifi_nme_ap_sef.c
+++ b/drivers/staging/csr/csr_wifi_nme_ap_sef.c

@@ -21,10 +21,10 @@
             CsrWifiNmeApStopCfmHandler(drvpriv, msg);
             break;
         case CSR_WIFI_NME_AP_CONFIG_SET_CFM:
-            CsrWifiNmeApConfigSetCfmHandler(drvpriv,msg);
+            CsrWifiNmeApConfigSetCfmHandler(drvpriv, msg);
             break;
         default:
-	    unifi_error(drvpriv, "CsrWifiNmeApUpstreamStateHandlers: unhandled NME_AP message type 0x%.4X\n",msg->type);
+	    unifi_error(drvpriv, "CsrWifiNmeApUpstreamStateHandlers: unhandled NME_AP message type 0x%.4X\n", msg->type);
             break;
     }
 }

diff --git a/drivers/staging/csr/drv.c b/drivers/staging/csr/drv.c
index bdc2523..92898de 100644
--- a/drivers/staging/csr/drv.c
+++ b/drivers/staging/csr/drv.c

@@ -1159,13 +1159,13 @@
             break;
 #ifdef CSR_SUPPORT_SME
           case UNIFI_CFG_CORE_DUMP:
-            CsrWifiRouterCtrlWifiOffIndSend(priv->CSR_WIFI_SME_IFACEQUEUE,0,CSR_WIFI_SME_CONTROL_INDICATION_ERROR);
+            CsrWifiRouterCtrlWifiOffIndSend(priv->CSR_WIFI_SME_IFACEQUEUE, 0, CSR_WIFI_SME_CONTROL_INDICATION_ERROR);
             unifi_trace(priv, UDBG2, "UNIFI_CFG_CORE_DUMP: sent wifi off indication\n");
             break;
 #endif
 #ifdef CSR_SUPPORT_WEXT_AP
           case UNIFI_CFG_SET_AP_CONFIG:
-            r= unifi_cfg_set_ap_config(priv,(unsigned char*)arg);
+            r= unifi_cfg_set_ap_config(priv, (unsigned char*)arg);
             break;
 #endif
           default:
@@ -1275,7 +1275,7 @@
             /* Attach the network device to the stack */
             if (!interfacePriv->netdev_registered)
             {
-                r = uf_register_netdev(priv,interfaceTag);
+                r = uf_register_netdev(priv, interfaceTag);
                 if (r) {
                     unifi_error(priv, "Failed to register the network device.\n");
                     goto out;

diff --git a/drivers/staging/csr/io.c b/drivers/staging/csr/io.c
index fe4a7ba..f903022 100644
--- a/drivers/staging/csr/io.c
+++ b/drivers/staging/csr/io.c

@@ -117,7 +117,7 @@
          if (priv->rxSignalBuffer.rx_buff[i].bufptr == NULL)
          {
              int j;
-             unifi_error(priv,"signal_buffer_init:Failed to Allocate shared memory for T-H signals \n");
+             unifi_error(priv, "signal_buffer_init:Failed to Allocate shared memory for T-H signals \n");
              for(j=0;j<i;j++)
              {
                  priv->rxSignalBuffer.rx_buff[j].sig_len=0;
@@ -360,13 +360,13 @@
 
         for(i=1;i<CSR_WIFI_NUM_INTERFACES;i++)
         {
-            if( !uf_alloc_netdevice_for_other_interfaces(priv,i) )
+            if( !uf_alloc_netdevice_for_other_interfaces(priv, i) )
             {
                 /* error occured while allocating the net_device for interface[i]. The net_device are
                  * allocated for the interfaces with id<i. Dont worry, all the allocated net_device will
                  * be releasing chen the control goes to the label failed0.
                  */
-                unifi_error(priv, "Failed to allocate driver private for interface[%d]\n",i);
+                unifi_error(priv, "Failed to allocate driver private for interface[%d]\n", i);
                 goto failed0;
             }
             else
@@ -391,12 +391,12 @@
 #ifdef CSR_WIFI_RX_PATH_SPLIT
     if (signal_buffer_init(priv, CSR_WIFI_RX_SIGNAL_BUFFER_SIZE))
     {
-        unifi_error(priv,"Failed to allocate shared memory for T-H signals\n");
+        unifi_error(priv, "Failed to allocate shared memory for T-H signals\n");
         goto failed2;
     }
     priv->rx_workqueue = create_singlethread_workqueue("rx_workq");
     if (priv->rx_workqueue == NULL) {
-        unifi_error(priv,"create_singlethread_workqueue failed \n");
+        unifi_error(priv, "create_singlethread_workqueue failed \n");
         goto failed3;
     }
     INIT_WORK(&priv->rx_work_struct, rx_wq_handler);
@@ -442,7 +442,7 @@
     flush_workqueue(priv->rx_workqueue);
     destroy_workqueue(priv->rx_workqueue);
 failed3:
-    signal_buffer_free(priv,CSR_WIFI_RX_SIGNAL_BUFFER_SIZE);
+    signal_buffer_free(priv, CSR_WIFI_RX_SIGNAL_BUFFER_SIZE);
 failed2:
 #endif
     /* Remove the device nodes */
@@ -558,8 +558,8 @@
     /* Free any packets left in the Rx queues */
     for(i=0;i<CSR_WIFI_NUM_INTERFACES;i++)
     {
-        uf_free_pending_rx_packets(priv, UF_UNCONTROLLED_PORT_Q, broadcast_address,i);
-        uf_free_pending_rx_packets(priv, UF_CONTROLLED_PORT_Q, broadcast_address,i);
+        uf_free_pending_rx_packets(priv, UF_UNCONTROLLED_PORT_Q, broadcast_address, i);
+        uf_free_pending_rx_packets(priv, UF_CONTROLLED_PORT_Q, broadcast_address, i);
     }
     /*
      * We need to free the resources held by the core, which include tx skbs,
@@ -595,7 +595,7 @@
 #ifdef CSR_WIFI_RX_PATH_SPLIT
     flush_workqueue(priv->rx_workqueue);
     destroy_workqueue(priv->rx_workqueue);
-    signal_buffer_free(priv,CSR_WIFI_RX_SIGNAL_BUFFER_SIZE);
+    signal_buffer_free(priv, CSR_WIFI_RX_SIGNAL_BUFFER_SIZE);
 #endif
 
     /* Priv is freed as part of the net_device */

diff --git a/drivers/staging/csr/monitor.c b/drivers/staging/csr/monitor.c
index c8e20e4..e11f6cb 100644
--- a/drivers/staging/csr/monitor.c
+++ b/drivers/staging/csr/monitor.c

@@ -188,7 +188,7 @@
 
 
     skb->dev = dev;
-    skb->mac_header = skb->data;
+    skb_reset_mac_header(skb);
     skb->pkt_type = PACKET_OTHERHOST;
     skb->protocol = __constant_htons(ETH_P_80211_RAW);
     memset(skb->cb, 0, sizeof(skb->cb));

diff --git a/drivers/staging/csr/netdev.c b/drivers/staging/csr/netdev.c
index a0177d9..5ead2d4 100644
--- a/drivers/staging/csr/netdev.c
+++ b/drivers/staging/csr/netdev.c

@@ -754,7 +754,7 @@
         case CSR_WIFI_ROUTER_CTRL_MODE_IBSS:
             {
                 CsrWifiRouterCtrlStaInfo_t * dstStaInfo =
-                    CsrWifiRouterCtrlGetStationRecordFromPeerMacAddress(priv,ehdr->h_dest, interfacePriv->InterfaceTag);
+                    CsrWifiRouterCtrlGetStationRecordFromPeerMacAddress(priv, ehdr->h_dest, interfacePriv->InterfaceTag);
                 unifi_trace(priv, UDBG4, "mode is AP \n");
                 if (!(ehdr->h_dest[0] & 0x01) && dstStaInfo && dstStaInfo->wmmOrQosEnabled) {
                     /* If packet is not Broadcast/multicast */
@@ -1011,7 +1011,7 @@
 #endif
 
     if(skb== NULL || daddr == NULL || saddr == NULL){
-        unifi_error(priv,"skb_80211_to_ether: PBC fail\n");
+        unifi_error(priv, "skb_80211_to_ether: PBC fail\n");
         return 1;
     }
 
@@ -1198,7 +1198,7 @@
     u8 bQosNull = false;
 
     if (skb == NULL) {
-        unifi_error(priv,"prepare_and_add_macheader: Invalid SKB reference\n");
+        unifi_error(priv, "prepare_and_add_macheader: Invalid SKB reference\n");
         return -1;
     }
 
@@ -1383,7 +1383,7 @@
             macHeaderLengthInBytes -= ETH_ALEN;
             break;
         default:
-            unifi_error(priv,"Unknown direction =%d : Not handled now\n",direction);
+            unifi_error(priv, "Unknown direction =%d : Not handled now\n", direction);
             return -1;
     }
     /* 2 bytes of frame control field, appended by firmware */
@@ -1569,8 +1569,8 @@
     memcpy(peerAddress.a, ((u8 *) bulkdata.d[0].os_data_ptr) + 4, ETH_ALEN);
 
     unifi_trace(priv, UDBG5, "RA[0]=%x, RA[1]=%x, RA[2]=%x, RA[3]=%x, RA[4]=%x, RA[5]=%x\n",
-                peerAddress.a[0],peerAddress.a[1], peerAddress.a[2], peerAddress.a[3],
-                peerAddress.a[4],peerAddress.a[5]);
+                peerAddress.a[0], peerAddress.a[1], peerAddress.a[2], peerAddress.a[3],
+                peerAddress.a[4], peerAddress.a[5]);
 
 
     if ((proto == ETH_P_PAE)
@@ -1865,10 +1865,10 @@
 
 #ifdef CSR_SUPPORT_SME
     if(queue<=3) {
-        routerStartBuffering(priv,queue);
-        unifi_trace(priv,UDBG2,"Start buffering %d\n", queue);
+        routerStartBuffering(priv, queue);
+        unifi_trace(priv, UDBG2, "Start buffering %d\n", queue);
      } else {
-        routerStartBuffering(priv,0);
+        routerStartBuffering(priv, 0);
         unifi_error(priv, "Start buffering %d defaulting to 0\n", queue);
      }
 #endif
@@ -1893,11 +1893,11 @@
 
 #ifdef CSR_SUPPORT_SME
     if(queue <=3) {
-        routerStopBuffering(priv,queue);
-        uf_send_buffered_frames(priv,queue);
+        routerStopBuffering(priv, queue);
+        uf_send_buffered_frames(priv, queue);
     } else {
-        routerStopBuffering(priv,0);
-        uf_send_buffered_frames(priv,0);
+        routerStopBuffering(priv, 0);
+        uf_send_buffered_frames(priv, 0);
     }
 #endif
 } /* unifi_restart_xmit() */
@@ -2102,14 +2102,14 @@
             netif_tx_schedule_all(priv->netdev[interfaceTag]);
         }
 #endif
-        uf_process_rx_pending_queue(priv, queue, peer_address, 1,interfaceTag);
+        uf_process_rx_pending_queue(priv, queue, peer_address, 1, interfaceTag);
     }
 } /* uf_resume_data_plane() */
 
 
-void uf_free_pending_rx_packets(unifi_priv_t *priv, int queue, CsrWifiMacAddress peer_address,u16 interfaceTag)
+void uf_free_pending_rx_packets(unifi_priv_t *priv, int queue, CsrWifiMacAddress peer_address, u16 interfaceTag)
 {
-    uf_process_rx_pending_queue(priv, queue, peer_address, 0,interfaceTag);
+    uf_process_rx_pending_queue(priv, queue, peer_address, 0, interfaceTag);
 
 } /* uf_free_pending_rx_packets() */
 
@@ -2153,7 +2153,7 @@
     if (interfaceTag >= CSR_WIFI_NUM_INTERFACES)
     {
         unifi_error(priv, "%s: MA-PACKET indication with bad interfaceTag %d\n", __FUNCTION__, interfaceTag);
-        unifi_net_data_free(priv,&bulkdata->d[0]);
+        unifi_net_data_free(priv, &bulkdata->d[0]);
         return;
     }
 
@@ -2167,7 +2167,7 @@
 
     if (bulkdata->d[0].data_length == 0) {
         unifi_warning(priv, "%s: MA-PACKET indication with zero bulk data\n", __FUNCTION__);
-        unifi_net_data_free(priv,&bulkdata->d[0]);
+        unifi_net_data_free(priv, &bulkdata->d[0]);
         return;
     }
 
@@ -2179,8 +2179,8 @@
     toDs = (skb->data[1] & 0x01) ? 1 : 0;
     fromDs = (skb->data[1] & 0x02) ? 1 : 0;
 
-    memcpy(da,(skb->data+4+toDs*12),ETH_ALEN);/* Address1 or 3 */
-    memcpy(sa,(skb->data+10+fromDs*(6+toDs*8)),ETH_ALEN); /* Address2, 3 or 4 */
+    memcpy(da, (skb->data+4+toDs*12), ETH_ALEN);/* Address1 or 3 */
+    memcpy(sa, (skb->data+10+fromDs*(6+toDs*8)), ETH_ALEN); /* Address2, 3 or 4 */
 
 
     pData = &bulkdata->d[0];
@@ -2189,7 +2189,7 @@
 
     dataFrameType =((frameControl & 0x00f0) >> 4);
     unifi_trace(priv, UDBG6,
-                "%s: Receive Data Frame Type %d \n", __FUNCTION__,dataFrameType);
+                "%s: Receive Data Frame Type %d \n", __FUNCTION__, dataFrameType);
 
     switch(dataFrameType)
     {
@@ -2276,7 +2276,7 @@
 
         /* AP/P2PGO specific handling here */
         CsrWifiRouterCtrlStaInfo_t * srcStaInfo =
-            CsrWifiRouterCtrlGetStationRecordFromPeerMacAddress(priv,sa,interfaceTag);
+            CsrWifiRouterCtrlGetStationRecordFromPeerMacAddress(priv, sa, interfaceTag);
 
         /* Defensive check only; Source address is already checked in
         process_ma_packet_ind and we should have a valid source address here */
@@ -2284,10 +2284,10 @@
          if(srcStaInfo == NULL) {
             CsrWifiMacAddress peerMacAddress;
             /* Unknown data PDU */
-            memcpy(peerMacAddress.a,sa,ETH_ALEN);
+            memcpy(peerMacAddress.a, sa, ETH_ALEN);
             unifi_trace(priv, UDBG1, "%s: Unexpected frame from peer = %x:%x:%x:%x:%x:%x\n", __FUNCTION__,
-            sa[0], sa[1],sa[2], sa[3], sa[4],sa[5]);
-            CsrWifiRouterCtrlUnexpectedFrameIndSend(priv->CSR_WIFI_SME_IFACEQUEUE,0,interfaceTag,peerMacAddress);
+            sa[0], sa[1], sa[2], sa[3], sa[4], sa[5]);
+            CsrWifiRouterCtrlUnexpectedFrameIndSend(priv->CSR_WIFI_SME_IFACEQUEUE, 0, interfaceTag, peerMacAddress);
             unifi_net_data_free(priv, &bulkdata->d[0]);
             return;
         }
@@ -2296,11 +2296,11 @@
         if (port_action != CSR_WIFI_ROUTER_CTRL_PORT_ACTION_8021X_PORT_OPEN) {
             /* Drop the packet and return */
             CsrWifiMacAddress peerMacAddress;
-            memcpy(peerMacAddress.a,sa,ETH_ALEN);
+            memcpy(peerMacAddress.a, sa, ETH_ALEN);
             unifi_trace(priv, UDBG3, "%s: Port is not open: unexpected frame from peer = %x:%x:%x:%x:%x:%x\n",
-                        __FUNCTION__, sa[0], sa[1],sa[2], sa[3], sa[4],sa[5]);
+                        __FUNCTION__, sa[0], sa[1], sa[2], sa[3], sa[4], sa[5]);
 
-            CsrWifiRouterCtrlUnexpectedFrameIndSend(priv->CSR_WIFI_SME_IFACEQUEUE,0,interfaceTag,peerMacAddress);
+            CsrWifiRouterCtrlUnexpectedFrameIndSend(priv->CSR_WIFI_SME_IFACEQUEUE, 0, interfaceTag, peerMacAddress);
             interfacePriv->stats.rx_dropped++;
             unifi_net_data_free(priv, &bulkdata->d[0]);
             unifi_notice(priv, "%s: Dropping packet, proto=0x%04x, %s port\n", __FUNCTION__,
@@ -2328,7 +2328,7 @@
         {
             return;
         }
-        unifi_trace(priv, UDBG5, "unifi_rx: no specific AP handling process as normal frame, MAC Header len %d\n",macHeaderLengthInBytes);
+        unifi_trace(priv, UDBG5, "unifi_rx: no specific AP handling process as normal frame, MAC Header len %d\n", macHeaderLengthInBytes);
         /* Remove the MAC header for subsequent conversion */
         skb_pull(skb, macHeaderLengthInBytes);
         pData->os_data_ptr = skb->data;
@@ -2422,7 +2422,7 @@
     if(interfacePriv->interfaceMode == CSR_WIFI_ROUTER_CTRL_MODE_AP ||
        interfacePriv->interfaceMode == CSR_WIFI_ROUTER_CTRL_MODE_P2PGO) {
 
-        uf_process_ma_pkt_cfm_for_ap(priv,interfaceTag,pkt_cfm);
+        uf_process_ma_pkt_cfm_for_ap(priv, interfaceTag, pkt_cfm);
     } else if (interfacePriv->m4_sent && (pkt_cfm->HostTag == interfacePriv->m4_hostTag)) {
         /* Check if this is a confirm for EAPOL M4 frame and we need to send transmistted ind*/
         CsrResult result = pkt_cfm->TransmissionStatus == CSR_TX_SUCCESSFUL?CSR_RESULT_SUCCESS:CSR_RESULT_FAILURE;
@@ -2486,7 +2486,7 @@
     if (interfaceTag >= CSR_WIFI_NUM_INTERFACES)
     {
         unifi_error(priv, "%s: MA-PACKET indication with bad interfaceTag %d\n", __FUNCTION__, interfaceTag);
-        unifi_net_data_free(priv,&bulkdata->d[0]);
+        unifi_net_data_free(priv, &bulkdata->d[0]);
         return;
     }
 
@@ -2500,7 +2500,7 @@
 
     if (bulkdata->d[0].data_length == 0) {
         unifi_warning(priv, "%s: MA-PACKET indication with zero bulk data\n", __FUNCTION__);
-        unifi_net_data_free(priv,&bulkdata->d[0]);
+        unifi_net_data_free(priv, &bulkdata->d[0]);
         return;
     }
     /* For monitor mode we need to pass this indication to the registered application
@@ -2508,8 +2508,8 @@
     /* MIC failure is already taken care of so no need to send the PDUs which are not successfully received in non-monitor mode*/
     if(pkt_ind->ReceptionStatus != CSR_RX_SUCCESS)
     {
-        unifi_warning(priv, "%s: MA-PACKET indication with status = %d\n",__FUNCTION__, pkt_ind->ReceptionStatus);
-        unifi_net_data_free(priv,&bulkdata->d[0]);
+        unifi_warning(priv, "%s: MA-PACKET indication with status = %d\n", __FUNCTION__, pkt_ind->ReceptionStatus);
+        unifi_net_data_free(priv, &bulkdata->d[0]);
         return;
     }
 
@@ -2521,8 +2521,8 @@
     toDs = (skb->data[1] & 0x01) ? 1 : 0;
     fromDs = (skb->data[1] & 0x02) ? 1 : 0;
 
-    memcpy(da,(skb->data+4+toDs*12),ETH_ALEN);/* Address1 or 3 */
-    memcpy(sa,(skb->data+10+fromDs*(6+toDs*8)),ETH_ALEN); /* Address2, 3 or 4 */
+    memcpy(da, (skb->data+4+toDs*12), ETH_ALEN);/* Address1 or 3 */
+    memcpy(sa, (skb->data+10+fromDs*(6+toDs*8)), ETH_ALEN); /* Address2, 3 or 4 */
 
     /* Find the BSSID, which will be used to match the BA session */
     if (toDs && fromDs)
@@ -2539,7 +2539,7 @@
     frameControl = CSR_GET_UINT16_FROM_LITTLE_ENDIAN(pData->os_data_ptr);
     frameType = ((frameControl & 0x000C) >> 2);
 
-    unifi_trace(priv, UDBG3, "Rx Frame Type: %d sn: %d\n",frameType,
+    unifi_trace(priv, UDBG3, "Rx Frame Type: %d sn: %d\n", frameType,
          (le16_to_cpu(*((u16*)(bulkdata->d[0].os_data_ptr + IEEE802_11_SEQUENCE_CONTROL_OFFSET))) >> 4) & 0xfff);
     if(frameType == IEEE802_11_FRAMETYPE_CONTROL){
 #ifdef CSR_SUPPORT_SME
@@ -2550,18 +2550,18 @@
             u8 pmBit = (frameControl & 0x1000)?0x01:0x00;
             unifi_trace(priv, UDBG6, "%s: Received PS-POLL Frame\n", __FUNCTION__);
 
-            uf_process_ps_poll(priv,sa,da,pmBit,interfaceTag);
+            uf_process_ps_poll(priv, sa, da, pmBit, interfaceTag);
         }
         else {
             unifi_warning(priv, "%s: Non PS-POLL control frame is received\n", __FUNCTION__);
         }
 #endif
-        unifi_net_data_free(priv,&bulkdata->d[0]);
+        unifi_net_data_free(priv, &bulkdata->d[0]);
         return;
     }
     if(frameType != IEEE802_11_FRAMETYPE_DATA) {
-        unifi_warning(priv, "%s: Non control Non Data frame is received\n",__FUNCTION__);
-        unifi_net_data_free(priv,&bulkdata->d[0]);
+        unifi_warning(priv, "%s: Non control Non Data frame is received\n", __FUNCTION__);
+        unifi_net_data_free(priv, &bulkdata->d[0]);
         return;
     }
 
@@ -2569,15 +2569,15 @@
     if((interfacePriv->interfaceMode == CSR_WIFI_ROUTER_CTRL_MODE_AP) ||
        (interfacePriv->interfaceMode == CSR_WIFI_ROUTER_CTRL_MODE_P2PGO)){
 
-        srcStaInfo = CsrWifiRouterCtrlGetStationRecordFromPeerMacAddress(priv,sa,interfaceTag);
+        srcStaInfo = CsrWifiRouterCtrlGetStationRecordFromPeerMacAddress(priv, sa, interfaceTag);
 
         if(srcStaInfo == NULL) {
             CsrWifiMacAddress peerMacAddress;
             /* Unknown data PDU */
-            memcpy(peerMacAddress.a,sa,ETH_ALEN);
+            memcpy(peerMacAddress.a, sa, ETH_ALEN);
             unifi_trace(priv, UDBG1, "%s: Unexpected frame from peer = %x:%x:%x:%x:%x:%x\n", __FUNCTION__,
-            sa[0], sa[1],sa[2], sa[3], sa[4],sa[5]);
-            CsrWifiRouterCtrlUnexpectedFrameIndSend(priv->CSR_WIFI_SME_IFACEQUEUE,0,interfaceTag,peerMacAddress);
+            sa[0], sa[1], sa[2], sa[3], sa[4], sa[5]);
+            CsrWifiRouterCtrlUnexpectedFrameIndSend(priv->CSR_WIFI_SME_IFACEQUEUE, 0, interfaceTag, peerMacAddress);
             unifi_net_data_free(priv, &bulkdata->d[0]);
             return;
         }
@@ -2591,7 +2591,7 @@
         */
 
         pmBit = (frameControl & 0x1000)?0x01:0x00;
-        powerSaveChanged = uf_process_pm_bit_for_peer(priv,srcStaInfo,pmBit,interfaceTag);
+        powerSaveChanged = uf_process_pm_bit_for_peer(priv, srcStaInfo, pmBit, interfaceTag);
 
         /* Update station last activity time */
         srcStaInfo->activity_flag = TRUE;
@@ -2616,8 +2616,8 @@
                 else{
                     qosControl = CSR_GET_UINT16_FROM_LITTLE_ENDIAN(pData->os_data_ptr + 24);
                 }
-                unifi_trace(priv, UDBG5, "%s: Check if U-APSD operations are triggered for qosControl: 0x%x\n",__FUNCTION__,qosControl);
-                uf_process_wmm_deliver_ac_uapsd(priv,srcStaInfo,qosControl,interfaceTag);
+                unifi_trace(priv, UDBG5, "%s: Check if U-APSD operations are triggered for qosControl: 0x%x\n", __FUNCTION__, qosControl);
+                uf_process_wmm_deliver_ac_uapsd(priv, srcStaInfo, qosControl, interfaceTag);
             }
         }
     }
@@ -2918,8 +2918,8 @@
             interfacePriv->connected = UnifiConnected;
             interfacePriv->wait_netdev_change = FALSE;
             /* Note: passing the broadcast address here will allow anyone to attempt to join our adhoc network */
-            uf_process_rx_pending_queue(priv, UF_UNCONTROLLED_PORT_Q, broadcast_address, 1,interfacePriv->InterfaceTag);
-            uf_process_rx_pending_queue(priv, UF_CONTROLLED_PORT_Q, broadcast_address, 1,interfacePriv->InterfaceTag);
+            uf_process_rx_pending_queue(priv, UF_UNCONTROLLED_PORT_Q, broadcast_address, 1, interfacePriv->InterfaceTag);
+            uf_process_rx_pending_queue(priv, UF_CONTROLLED_PORT_Q, broadcast_address, 1, interfacePriv->InterfaceTag);
         }
         break;
 

diff --git a/drivers/staging/csr/sdio_mmc.c b/drivers/staging/csr/sdio_mmc.c
index 30271d3..2b503c2 100644
--- a/drivers/staging/csr/sdio_mmc.c
+++ b/drivers/staging/csr/sdio_mmc.c

@@ -1135,8 +1135,8 @@
  * them from the list passed in csr_sdio_register_driver().
  */
 static const struct sdio_device_id unifi_ids[] = {
-    { SDIO_DEVICE(SDIO_MANF_ID_CSR,SDIO_CARD_ID_UNIFI_3) },
-    { SDIO_DEVICE(SDIO_MANF_ID_CSR,SDIO_CARD_ID_UNIFI_4) },
+    { SDIO_DEVICE(SDIO_MANF_ID_CSR, SDIO_CARD_ID_UNIFI_3) },
+    { SDIO_DEVICE(SDIO_MANF_ID_CSR, SDIO_CARD_ID_UNIFI_4) },
     { /* end: all zeroes */				},
 };
 

diff --git a/drivers/staging/csr/sme_blocking.c b/drivers/staging/csr/sme_blocking.c
index d88ccd5..0c6e216 100644
--- a/drivers/staging/csr/sme_blocking.c
+++ b/drivers/staging/csr/sme_blocking.c

@@ -1280,7 +1280,7 @@
         return -EIO;
 
     /* Suspend the SME, which MAY cause it to power down UniFi */
-    CsrWifiRouterCtrlSuspendIndSend(priv->CSR_WIFI_SME_IFACEQUEUE,0, 0, priv->wol_suspend);
+    CsrWifiRouterCtrlSuspendIndSend(priv->CSR_WIFI_SME_IFACEQUEUE, 0, 0, priv->wol_suspend);
     r = sme_wait_for_reply(priv, UNIFI_SME_SYS_LONG_TIMEOUT);
     if (r) {
         /* No reply - forcibly power down in case the request wasn't processed */
@@ -1366,7 +1366,7 @@
     if (r)
         return -EIO;
 
-    CsrWifiRouterCtrlResumeIndSend(priv->CSR_WIFI_SME_IFACEQUEUE,0, priv->wol_suspend);
+    CsrWifiRouterCtrlResumeIndSend(priv->CSR_WIFI_SME_IFACEQUEUE, 0, priv->wol_suspend);
 
     r = sme_wait_for_reply(priv, UNIFI_SME_SYS_LONG_TIMEOUT);
     if (r)
@@ -1377,7 +1377,7 @@
 }
 
 #ifdef CSR_SUPPORT_WEXT_AP
-int sme_ap_stop(unifi_priv_t *priv,u16 interface_tag)
+int sme_ap_stop(unifi_priv_t *priv, u16 interface_tag)
 {
     int r;
 
@@ -1390,7 +1390,7 @@
     if (r)
         return -EIO;
 
-    CsrWifiNmeApStopReqSend(0,interface_tag);
+    CsrWifiNmeApStopReqSend(0, interface_tag);
 
     r = sme_wait_for_reply(priv, UNIFI_SME_MGT_SHORT_TIMEOUT);
     if (r)
@@ -1403,12 +1403,12 @@
 
 }
 
-int sme_ap_start(unifi_priv_t *priv,u16 interface_tag,
+int sme_ap_start(unifi_priv_t *priv, u16 interface_tag,
                  CsrWifiSmeApConfig_t * ap_config)
 {
     int r;
     CsrWifiSmeApP2pGoConfig p2p_go_param;
-    memset(&p2p_go_param,0,sizeof(CsrWifiSmeApP2pGoConfig));
+    memset(&p2p_go_param, 0, sizeof(CsrWifiSmeApP2pGoConfig));
 
     if (priv->smepriv == NULL) {
         unifi_error(priv, "sme_ap_start: invalid smepriv\n");
@@ -1419,10 +1419,10 @@
     if (r)
         return -EIO;
 
-    CsrWifiNmeApStartReqSend(0,interface_tag,CSR_WIFI_AP_TYPE_LEGACY,FALSE,
-                             ap_config->ssid,1,ap_config->channel,
-                             ap_config->credentials,ap_config->max_connections,
-                             p2p_go_param,FALSE);
+    CsrWifiNmeApStartReqSend(0, interface_tag, CSR_WIFI_AP_TYPE_LEGACY, FALSE,
+                             ap_config->ssid, 1, ap_config->channel,
+                             ap_config->credentials, ap_config->max_connections,
+                             p2p_go_param, FALSE);
 
     r = sme_wait_for_reply(priv, UNIFI_SME_MGT_SHORT_TIMEOUT);
     if (r)
@@ -1440,7 +1440,7 @@
 {
     int r;
     CsrWifiSmeApP2pGoConfig p2p_go_param;
-    memset(&p2p_go_param,0,sizeof(CsrWifiSmeApP2pGoConfig));
+    memset(&p2p_go_param, 0, sizeof(CsrWifiSmeApP2pGoConfig));
 
     if (priv->smepriv == NULL) {
         unifi_error(priv, "sme_ap_config: invalid smepriv\n");
@@ -1451,7 +1451,7 @@
     if (r)
         return -EIO;
 
-    CsrWifiNmeApConfigSetReqSend(0,*group_security_config,
+    CsrWifiNmeApConfigSetReqSend(0, *group_security_config,
                                  *ap_mac_config);
 
     r = sme_wait_for_reply(priv, UNIFI_SME_MGT_SHORT_TIMEOUT);

diff --git a/drivers/staging/csr/sme_native.c b/drivers/staging/csr/sme_native.c
index ca55249..d0b9be3 100644
--- a/drivers/staging/csr/sme_native.c
+++ b/drivers/staging/csr/sme_native.c

@@ -55,7 +55,7 @@
 
 int sme_mgt_wifi_on(unifi_priv_t *priv)
 {
-    int r,i;
+    int r, i;
     s32 csrResult;
 
     if (priv == NULL) {

diff --git a/drivers/staging/csr/sme_sys.c b/drivers/staging/csr/sme_sys.c
index b1151a2..b5258d7 100644
--- a/drivers/staging/csr/sme_sys.c
+++ b/drivers/staging/csr/sme_sys.c

@@ -158,7 +158,7 @@
         unifi_error(priv, "CsrWifiRouterCtrlMediaStatusReqHandler: invalid interfaceTag\n");
         return;
     }
-    unifi_trace(priv, UDBG3, "CsrWifiRouterCtrlMediaStatusReqHandler: Mode = %d req->mediaStatus = %d\n",interfacePriv->interfaceMode,req->mediaStatus);
+    unifi_trace(priv, UDBG3, "CsrWifiRouterCtrlMediaStatusReqHandler: Mode = %d req->mediaStatus = %d\n", interfacePriv->interfaceMode, req->mediaStatus);
     if (interfacePriv->interfaceMode != CSR_WIFI_ROUTER_CTRL_MODE_AMP) {
         bulk_data_desc_t bulk_data;
 
@@ -389,7 +389,7 @@
         unifi_error(priv,
                     "CsrWifiRouterCtrlHipReqHandler: Failed to send signal (0x%.4X - %u)\n",
                     *((u16*)signal_ptr), r);
-        CsrWifiRouterCtrlWifiOffIndSend(priv->CSR_WIFI_SME_IFACEQUEUE,0,CSR_WIFI_SME_CONTROL_INDICATION_ERROR);
+        CsrWifiRouterCtrlWifiOffIndSend(priv->CSR_WIFI_SME_IFACEQUEUE, 0, CSR_WIFI_SME_CONTROL_INDICATION_ERROR);
     }
 
     unifi_trace(priv, UDBG4, "CsrWifiRouterCtrlHipReqHandler: <----\n");
@@ -474,7 +474,7 @@
     r = ul_send_signal_unpacked(priv, &signal, &bulkdata);
     if (r)
     {
-        unifi_error(priv, "CsrWifiSmeRoamCompleteIndHandler: failed to send QOS data null packet result: %d\n",r);
+        unifi_error(priv, "CsrWifiSmeRoamCompleteIndHandler: failed to send QOS data null packet result: %d\n", r);
         unifi_net_data_free(priv, &bulkdata.d[0]);
         return;
     }
@@ -574,7 +574,7 @@
 
             /* If port is closed, discard all the pending Rx packets */
             if (port_action == CSR_WIFI_ROUTER_CTRL_PORT_ACTION_8021X_PORT_CLOSED_DISCARD) {
-                uf_free_pending_rx_packets(priv, queue, *macAddress,interfaceTag);
+                uf_free_pending_rx_packets(priv, queue, *macAddress, interfaceTag);
             }
         }
     } else {
@@ -645,7 +645,7 @@
          * coming from the peer station.
          */
         if (port_action == CSR_WIFI_ROUTER_CTRL_PORT_ACTION_8021X_PORT_CLOSED_DISCARD) {
-            uf_free_pending_rx_packets(priv, queue, *macAddress,interfaceTag);
+            uf_free_pending_rx_packets(priv, queue, *macAddress, interfaceTag);
         }
 
 	unifi_trace(priv, UDBG2,
@@ -712,7 +712,7 @@
     configure_data_port(priv, req->controlledPortAction, (const CsrWifiMacAddress *)&req->macAddress,
                         UF_CONTROLLED_PORT_Q, req->interfaceTag);
 
-    CsrWifiRouterCtrlPortConfigureCfmSend(msg->source,req->clientData,req->interfaceTag,
+    CsrWifiRouterCtrlPortConfigureCfmSend(msg->source, req->clientData, req->interfaceTag,
                                       CSR_RESULT_SUCCESS, req->macAddress);
     unifi_trace(priv, UDBG3, "leaving CsrWifiRouterCtrlPortConfigureReqHandler\n");
 }
@@ -723,7 +723,7 @@
     unifi_priv_t *priv = (unifi_priv_t*)drvpriv;
     CsrWifiRouterCtrlVersions versions;
     CsrWifiRouterCtrlWifiOnReq* req = (CsrWifiRouterCtrlWifiOnReq*)msg;
-    int r,i;
+    int r, i;
     CsrResult csrResult;
 
     if (priv == NULL) {
@@ -963,7 +963,7 @@
     }
     wifi_off(priv);
 
-    CsrWifiRouterCtrlWifiOffCfmSend(msg->source,req->clientData);
+    CsrWifiRouterCtrlWifiOffCfmSend(msg->source, req->clientData);
 
     /* If this is called in response to closing the character device, the
      * caller must use uf_sme_cancel_request() to terminate any pending SME
@@ -1239,7 +1239,7 @@
     unifi_trace(priv, UDBG1,
                 "subscribe_req: encap=%d, handle=%d, result=%d\n",
                 req->encapsulation, i, result);
-    CsrWifiRouterMaPacketSubscribeCfmSend(msg->source,req->interfaceTag, i, result, 0);
+    CsrWifiRouterMaPacketSubscribeCfmSend(msg->source, req->interfaceTag, i, result, 0);
 }
 
 
@@ -1268,7 +1268,7 @@
     unifi_trace(priv, UDBG1,
                 "unsubscribe_req: handle=%d, result=%d\n",
                 req->subscriptionHandle, result);
-    CsrWifiRouterMaPacketUnsubscribeCfmSend(msg->source,req->interfaceTag, result);
+    CsrWifiRouterMaPacketUnsubscribeCfmSend(msg->source, req->interfaceTag, result);
 }
 
 
@@ -1282,7 +1282,7 @@
         return;
     }
 
-    CsrWifiRouterCtrlCapabilitiesCfmSend(msg->source,req->clientData,
+    CsrWifiRouterCtrlCapabilitiesCfmSend(msg->source, req->clientData,
             UNIFI_SOFT_COMMAND_Q_LENGTH - 1,
             UNIFI_SOFT_TRAFFIC_Q_LENGTH - 1);
 }
@@ -1404,7 +1404,7 @@
         if (r) {
             unifi_error(priv,
                         "_sys_packet_req: failed to translate eth frame.\n");
-            unifi_net_data_free(priv,&bulkdata.d[0]);
+            unifi_net_data_free(priv, &bulkdata.d[0]);
             return r;
         }
 
@@ -1439,7 +1439,7 @@
 #ifdef CSR_SUPPORT_SME
     if ((protection = uf_get_protection_bit_from_interfacemode(priv, interfaceTag, peerMacAddress.a)) < 0) {
         unifi_error(priv, "unicast address, but destination not in station record database\n");
-        unifi_net_data_free(priv,&bulkdata.d[0]);
+        unifi_net_data_free(priv, &bulkdata.d[0]);
         return -1;
     }
 #else
@@ -1453,7 +1453,7 @@
     /* add Mac header */
     if (prepare_and_add_macheader(priv, skb, newSkb, req.Priority, &bulkdata, interfaceTag, frame, frame + ETH_ALEN, protection)) {
         unifi_error(priv, "failed to create MAC header\n");
-        unifi_net_data_free(priv,&bulkdata.d[0]);
+        unifi_net_data_free(priv, &bulkdata.d[0]);
         return -1;
     }
 
@@ -1479,7 +1479,7 @@
     if (r) {
         unifi_error(priv,
                     "_sys_packet_req: failed to send signal.\n");
-        unifi_net_data_free(priv,&bulkdata.d[0]);
+        unifi_net_data_free(priv, &bulkdata.d[0]);
         return r;
     }
     /* The final CsrWifiRouterMaPacketCfmSend() will called when the actual MA-PACKET.cfm is received from the chip */
@@ -1558,7 +1558,7 @@
     memcpy(req->Ra.x, daddr, ETH_ALEN);
     req->Priority = mareq->priority;
     req->TransmitRate = 0; /* Let firmware select the rate*/
-    req->VirtualInterfaceIdentifier = uf_get_vif_identifier(interfacePriv->interfaceMode,interfaceTag);
+    req->VirtualInterfaceIdentifier = uf_get_vif_identifier(interfacePriv->interfaceMode, interfaceTag);
     req->HostTag = mareq->hostTag;
 
     if(mareq->cfmRequested)
@@ -1571,7 +1571,7 @@
 
     if (r && mareq->cfmRequested)
     {
-        CsrWifiRouterMaPacketCfmSend(msg->source,interfaceTag,
+        CsrWifiRouterMaPacketCfmSend(msg->source, interfaceTag,
                                      CSR_RESULT_FAILURE,
                                      mareq->hostTag, 0);
     }
@@ -1637,7 +1637,7 @@
 /* reset the station records when the mode is set as CSR_WIFI_ROUTER_CTRL_MODE_NONE */
 static void CsrWifiRouterCtrlResetStationRecordList(unifi_priv_t *priv, u16 interfaceTag)
 {
-    u8 i,j;
+    u8 i, j;
     CsrWifiRouterCtrlStaInfo_t *staInfo=NULL;
     netInterface_priv_t *interfacePriv = priv->interfacePriv[interfaceTag];
     unsigned long lock_flags;
@@ -1658,15 +1658,15 @@
             uf_prepare_send_cfm_list_for_queued_pkts(priv,
                                                  &send_cfm_list,
                                                  &(staInfo->mgtFrames));
-            uf_flush_list(priv,&(staInfo->mgtFrames));
+            uf_flush_list(priv, &(staInfo->mgtFrames));
             for(j=0;j<MAX_ACCESS_CATOGORY;j++){
                 uf_prepare_send_cfm_list_for_queued_pkts(priv,
                                                      &send_cfm_list,
                                                      &(staInfo->dataPdu[j]));
-                uf_flush_list(priv,&(staInfo->dataPdu[j]));
+                uf_flush_list(priv, &(staInfo->dataPdu[j]));
             }
 
-            spin_lock_irqsave(&priv->staRecord_lock,lock_flags);
+            spin_lock_irqsave(&priv->staRecord_lock, lock_flags);
             /* Removing station record information from port config array */
             memset(staInfo->peerControlledPort, 0, sizeof(unifi_port_cfg_t));
             staInfo->peerControlledPort->port_action = CSR_WIFI_ROUTER_CTRL_PORT_ACTION_8021X_PORT_CLOSED_DISCARD;
@@ -1680,7 +1680,7 @@
 
             kfree(interfacePriv->staInfo[i]);
             interfacePriv->staInfo[i] = NULL;
-            spin_unlock_irqrestore(&priv->staRecord_lock,lock_flags);
+            spin_unlock_irqrestore(&priv->staRecord_lock, lock_flags);
         }
     }
     /* after the critical region process the list of frames that requested cfm
@@ -1697,9 +1697,9 @@
         case CSR_WIFI_ROUTER_CTRL_MODE_NONE:
             if (priv->noOfPktQueuedInDriver) {
                 unifi_warning(priv, "After reset the noOfPktQueuedInDriver = %x\n", priv->noOfPktQueuedInDriver);
-                spin_lock_irqsave(&priv->tx_q_lock,lock_flags);
+                spin_lock_irqsave(&priv->tx_q_lock, lock_flags);
                 priv->noOfPktQueuedInDriver = 0;
-                spin_unlock_irqrestore(&priv->tx_q_lock,lock_flags);
+                spin_unlock_irqrestore(&priv->tx_q_lock, lock_flags);
             }
             break;
         case CSR_WIFI_ROUTER_CTRL_MODE_IBSS:
@@ -1745,18 +1745,18 @@
     uf_prepare_send_cfm_list_for_queued_pkts(priv,
                                              &send_cfm_list,
                                              &(interfacePriv->genericMgtFrames));
-    uf_flush_list(priv,&(interfacePriv->genericMgtFrames));
+    uf_flush_list(priv, &(interfacePriv->genericMgtFrames));
 
     uf_prepare_send_cfm_list_for_queued_pkts(priv,
                                              &send_cfm_list,
                                              &(interfacePriv->genericMulticastOrBroadCastMgtFrames));
-    uf_flush_list(priv,&(interfacePriv->genericMulticastOrBroadCastMgtFrames));
+    uf_flush_list(priv, &(interfacePriv->genericMulticastOrBroadCastMgtFrames));
 
     uf_prepare_send_cfm_list_for_queued_pkts(priv,
                                              &send_cfm_list,
                                              &(interfacePriv->genericMulticastOrBroadCastFrames));
 
-    uf_flush_list(priv,&(interfacePriv->genericMulticastOrBroadCastFrames));
+    uf_flush_list(priv, &(interfacePriv->genericMulticastOrBroadCastFrames));
 
     /*  process the list of frames that requested cfm
     and send cfm to requestor one by one */
@@ -1772,7 +1772,7 @@
             /* station records not available in these modes */
             break;
         default:
-            CsrWifiRouterCtrlResetStationRecordList(priv,interfaceTag);
+            CsrWifiRouterCtrlResetStationRecordList(priv, interfaceTag);
     }
 
     interfacePriv->num_stations_joined = 0;
@@ -1880,7 +1880,7 @@
          * other then CSR_WIFI_TIM_SET or CSR_WIFI_TIM_RESET value
          */
         interfacePriv->bcTimSetReqQueued =0xFF;
-        CsrWifiRouterCtrlInterfaceReset(priv,req->interfaceTag);
+        CsrWifiRouterCtrlInterfaceReset(priv, req->interfaceTag);
 
         if(req->mode == CSR_WIFI_ROUTER_CTRL_MODE_AP ||
            req->mode == CSR_WIFI_ROUTER_CTRL_MODE_P2PGO) {
@@ -1900,7 +1900,7 @@
         }
     }
     else {
-        unifi_error(priv, "CsrWifiRouterCtrlModeSetReqHandler: invalid interfaceTag :%d\n",req->interfaceTag);
+        unifi_error(priv, "CsrWifiRouterCtrlModeSetReqHandler: invalid interfaceTag :%d\n", req->interfaceTag);
     }
 }
 
@@ -1941,15 +1941,15 @@
                                                  &send_cfm_list,
                                                  &(staInfo->mgtFrames));
 
-        uf_flush_list(priv,&(staInfo->mgtFrames));
+        uf_flush_list(priv, &(staInfo->mgtFrames));
         for(j=0;j<MAX_ACCESS_CATOGORY;j++){
             uf_prepare_send_cfm_list_for_queued_pkts(priv,
                                                      &send_cfm_list,
                                                      &(staInfo->dataPdu[j]));
-            uf_flush_list(priv,&(staInfo->dataPdu[j]));
+            uf_flush_list(priv, &(staInfo->dataPdu[j]));
         }
 
-        spin_lock_irqsave(&priv->staRecord_lock,lock_flags);
+        spin_lock_irqsave(&priv->staRecord_lock, lock_flags);
         /* clear the port configure array info, for the corresponding peer entry */
         controlledPort = &interfacePriv->controlled_data_port;
         unControlledPort = &interfacePriv->uncontrolled_data_port;
@@ -1975,12 +1975,12 @@
             unifi_warning(priv, "number of uncontrolled port entries is zero, trying to decrement: debug\n");
         }
 
-        spin_unlock_irqrestore(&priv->staRecord_lock,lock_flags);
+        spin_unlock_irqrestore(&priv->staRecord_lock, lock_flags);
         /* update the TIM with zero */
         if (interfacePriv->interfaceMode != CSR_WIFI_ROUTER_CTRL_MODE_IBSS &&
                 staInfo->timSet == CSR_WIFI_TIM_SET) {
             unifi_trace(priv, UDBG3, "peer is deleted so TIM updated to 0, in firmware\n");
-            update_tim(priv,staInfo->aid,0,req->interfaceTag, req->peerRecordHandle);
+            update_tim(priv, staInfo->aid, 0, req->interfaceTag, req->peerRecordHandle);
         }
 
 
@@ -2021,7 +2021,7 @@
         cancel_work_sync(&staInfo->send_disconnected_ind_task);
 #endif
 
-        spin_lock_irqsave(&priv->staRecord_lock,lock_flags);
+        spin_lock_irqsave(&priv->staRecord_lock, lock_flags);
 #ifdef CSR_SUPPORT_SME
         interfacePriv->num_stations_joined--;
 
@@ -2039,7 +2039,7 @@
         /* Free the station record for corresponding peer */
         kfree(interfacePriv->staInfo[req->peerRecordHandle]);
         interfacePriv->staInfo[req->peerRecordHandle] = NULL;
-        spin_unlock_irqrestore(&priv->staRecord_lock,lock_flags);
+        spin_unlock_irqrestore(&priv->staRecord_lock, lock_flags);
 
         /* after the critical region process the list of frames that requested cfm
         and send cfm to requestor one by one */
@@ -2092,12 +2092,12 @@
             break;
     }
 
-    CsrWifiRouterCtrlPeerDelCfmSend(msg->source,req->clientData,req->interfaceTag,status);
+    CsrWifiRouterCtrlPeerDelCfmSend(msg->source, req->clientData, req->interfaceTag, status);
     unifi_trace(priv, UDBG2, "leaving CsrWifiRouterCtrlPeerDelReqHandler \n");
 }
 
 /* Add the new station to the station record data base */
-static int peer_add_new_record(unifi_priv_t *priv,CsrWifiRouterCtrlPeerAddReq *req,u32 *handle)
+static int peer_add_new_record(unifi_priv_t *priv, CsrWifiRouterCtrlPeerAddReq *req, u32 *handle)
 {
     u8 i, powerModeTemp = 0;
     u8 freeSlotFound = FALSE;
@@ -2135,11 +2135,11 @@
                         req->staInfo.listenIntervalInTus);
 
             /* disable the preemption until station record updated */
-            spin_lock_irqsave(&priv->staRecord_lock,lock_flags);
+            spin_lock_irqsave(&priv->staRecord_lock, lock_flags);
 
             interfacePriv->staInfo[i] = newRecord;
             /* Initialize the record*/
-            memset(newRecord,0,sizeof(CsrWifiRouterCtrlStaInfo_t));
+            memset(newRecord, 0, sizeof(CsrWifiRouterCtrlStaInfo_t));
             /* update the station record */
             memcpy(newRecord->peerMacAddress.a, req->peerMacAddress.a, ETH_ALEN);
             newRecord->wmmOrQosEnabled = req->staInfo.wmmOrQosEnabled;
@@ -2182,11 +2182,11 @@
                 u8 k;
                 for(k=0; k< MAX_ACCESS_CATOGORY ;k++)
                     unifi_trace(priv, UDBG2, "peer_add_new_record: WMM : %d ,AC %d, powersaveMode %x \n",
-                            req->staInfo.wmmOrQosEnabled,k,newRecord->powersaveMode[k]);
+                            req->staInfo.wmmOrQosEnabled, k, newRecord->powersaveMode[k]);
             }
 
             unifi_trace(priv, UDBG3, "newRecord->wmmOrQosEnabled : %d , MAX SP : %d\n",
-                    newRecord->wmmOrQosEnabled,newRecord->maxSpLength);
+                    newRecord->wmmOrQosEnabled, newRecord->maxSpLength);
 
             /* Initialize the mgtFrames & data Pdu list */
             {
@@ -2201,7 +2201,7 @@
             newRecord->activity_flag = TRUE;
 
             /* enable the preemption as station record updated */
-            spin_unlock_irqrestore(&priv->staRecord_lock,lock_flags);
+            spin_unlock_irqrestore(&priv->staRecord_lock, lock_flags);
 
             /* First time port actions are set for the peer with below information */
             configure_data_port(priv, CSR_WIFI_ROUTER_CTRL_PORT_ACTION_8021X_PORT_OPEN, &newRecord->peerMacAddress,
@@ -2216,7 +2216,7 @@
             }
 
 
-            spin_lock_irqsave(&priv->staRecord_lock,lock_flags);
+            spin_lock_irqsave(&priv->staRecord_lock, lock_flags);
             /* Port status must be already set before calling the Add Peer request */
             newRecord->peerControlledPort = uf_sme_port_config_handle(priv, newRecord->peerMacAddress.a,
                                                                       UF_CONTROLLED_PORT_Q, req->interfaceTag);
@@ -2228,7 +2228,7 @@
                 unifi_warning(priv, "Un/ControlledPort record not found in port configuration array index = %d\n", i);
                 kfree(interfacePriv->staInfo[i]);
                 interfacePriv->staInfo[i] = NULL;
-                spin_unlock_irqrestore(&priv->staRecord_lock,lock_flags);
+                spin_unlock_irqrestore(&priv->staRecord_lock, lock_flags);
                 return CSR_RESULT_FAILURE;
             }
 
@@ -2279,7 +2279,7 @@
 
             }
 #endif
-            spin_unlock_irqrestore(&priv->staRecord_lock,lock_flags);
+            spin_unlock_irqrestore(&priv->staRecord_lock, lock_flags);
             break;
         }
     }
@@ -2446,7 +2446,7 @@
                                                  &send_cfm_list,
                                                  &(staInfo->dataPdu[j]));
 
-        uf_flush_list(priv,&(staInfo->dataPdu[j]));
+        uf_flush_list(priv, &(staInfo->dataPdu[j]));
     }
 
     send_auto_ma_packet_confirm(priv, staInfo->interfacePriv, &send_cfm_list);
@@ -2471,7 +2471,7 @@
 
 
 #endif
-void CsrWifiRouterCtrlPeerAddReqHandler(void* drvpriv,CsrWifiFsmEvent* msg)
+void CsrWifiRouterCtrlPeerAddReqHandler(void* drvpriv, CsrWifiFsmEvent* msg)
 {
     CsrWifiRouterCtrlPeerAddReq* req = (CsrWifiRouterCtrlPeerAddReq*)msg;
     CsrResult status = CSR_RESULT_SUCCESS;
@@ -2500,7 +2500,7 @@
         case CSR_WIFI_ROUTER_CTRL_MODE_IBSS:
         case CSR_WIFI_ROUTER_CTRL_MODE_P2PGO:
             /* Add station record */
-            status = peer_add_new_record(priv,req,&handle);
+            status = peer_add_new_record(priv, req, &handle);
             break;
         case CSR_WIFI_ROUTER_CTRL_MODE_STA:
         case CSR_WIFI_ROUTER_CTRL_MODE_P2PCLI:
@@ -2509,11 +2509,11 @@
             break;
     }
 
-    CsrWifiRouterCtrlPeerAddCfmSend(msg->source,req->clientData,req->interfaceTag,req->peerMacAddress,handle,status);
+    CsrWifiRouterCtrlPeerAddCfmSend(msg->source, req->clientData, req->interfaceTag, req->peerMacAddress, handle, status);
     unifi_trace(priv, UDBG2, "leaving CsrWifiRouterCtrlPeerAddReqHandler \n");
 }
 
-void CsrWifiRouterCtrlPeerUpdateReqHandler(void* drvpriv,CsrWifiFsmEvent* msg)
+void CsrWifiRouterCtrlPeerUpdateReqHandler(void* drvpriv, CsrWifiFsmEvent* msg)
 {
     CsrWifiRouterCtrlPeerUpdateReq* req = (CsrWifiRouterCtrlPeerUpdateReq*)msg;
     CsrResult status = CSR_RESULT_SUCCESS;
@@ -2526,7 +2526,7 @@
         return;
     }
 
-    CsrWifiRouterCtrlPeerUpdateCfmSend(msg->source,req->clientData,req->interfaceTag,status);
+    CsrWifiRouterCtrlPeerUpdateCfmSend(msg->source, req->clientData, req->interfaceTag, status);
     unifi_trace(priv, UDBG2, "leaving CsrWifiRouterCtrlPeerUpdateReqHandler \n");
 }
 
@@ -2986,13 +2986,13 @@
         unifi_trace(priv, UDBG6, "<<%s\n", __FUNCTION__);
     } else {
 
-    	unifi_warning(priv, "%s is NOT applicable for interface mode - %d\n", __FUNCTION__,interfacePriv->interfaceMode);
+    	unifi_warning(priv, "%s is NOT applicable for interface mode - %d\n", __FUNCTION__, interfacePriv->interfaceMode);
 
     }
 #elif defined(UNIFI_DEBUG)
     /*WAPI Disabled*/
     unifi_priv_t *priv = (unifi_priv_t*)drvpriv;
-    unifi_error(priv,"CsrWifiRouterCtrlWapiMulticastFilterReqHandler: called when WAPI isn't enabled\n");
+    unifi_error(priv, "CsrWifiRouterCtrlWapiMulticastFilterReqHandler: called when WAPI isn't enabled\n");
 #endif
 }
 
@@ -3022,13 +3022,13 @@
         unifi_trace(priv, UDBG6, "<<%s\n", __FUNCTION__);
     } else {
 
-    	 unifi_warning(priv, "%s is NOT applicable for interface mode - %d\n", __FUNCTION__,interfacePriv->interfaceMode);
+    	 unifi_warning(priv, "%s is NOT applicable for interface mode - %d\n", __FUNCTION__, interfacePriv->interfaceMode);
 
     }
 #elif defined(UNIFI_DEBUG)
     /*WAPI Disabled*/
     unifi_priv_t *priv = (unifi_priv_t*)drvpriv;
-    unifi_error(priv,"CsrWifiRouterCtrlWapiUnicastFilterReqHandler: called when WAPI isn't enabled\n");
+    unifi_error(priv, "CsrWifiRouterCtrlWapiUnicastFilterReqHandler: called when WAPI isn't enabled\n");
 #endif
 }
 
@@ -3064,13 +3064,13 @@
 
 
         if (req->dataLength == 0 || req->data == NULL) {
-             unifi_error(priv, "CsrWifiRouterCtrlWapiRxPktReq: invalid request\n",__FUNCTION__);
+             unifi_error(priv, "CsrWifiRouterCtrlWapiRxPktReq: invalid request\n", __FUNCTION__);
              return;
         }
 
         res = unifi_net_data_malloc(priv, &bulkdata.d[0], req->dataLength);
         if (res != CSR_RESULT_SUCCESS) {
-             unifi_error(priv, "CsrWifiRouterCtrlWapiRxPktReq: Could not allocate net data\n",__FUNCTION__);
+             unifi_error(priv, "CsrWifiRouterCtrlWapiRxPktReq: Could not allocate net data\n", __FUNCTION__);
              return;
         }
 
@@ -3078,15 +3078,15 @@
          * So reset the reception status to rx_success */
         res = read_unpack_signal(req->signal, &signal);
         if (res) {
-	          unifi_error(priv,"CsrWifiRouterCtrlWapiRxPktReqHandler: Received unknown or corrupted signal.\n");
+	          unifi_error(priv, "CsrWifiRouterCtrlWapiRxPktReqHandler: Received unknown or corrupted signal.\n");
 	          return;
         }
         pkt_ind = (CSR_MA_PACKET_INDICATION*) (&((&signal)->u).MaPacketIndication);
         if (pkt_ind->ReceptionStatus != CSR_MICHAEL_MIC_ERROR) {
-	          unifi_error(priv,"CsrWifiRouterCtrlWapiRxPktReqHandler: Unknown signal with reception status = %d\n",pkt_ind->ReceptionStatus);
+	          unifi_error(priv, "CsrWifiRouterCtrlWapiRxPktReqHandler: Unknown signal with reception status = %d\n", pkt_ind->ReceptionStatus);
 	          return;
         } else {
-	          unifi_trace(priv, UDBG4,"CsrWifiRouterCtrlWapiRxPktReqHandler: MIC verified , RX_SUCCESS \n",__FUNCTION__);
+	          unifi_trace(priv, UDBG4, "CsrWifiRouterCtrlWapiRxPktReqHandler: MIC verified , RX_SUCCESS \n", __FUNCTION__);
 	          pkt_ind->ReceptionStatus = CSR_RX_SUCCESS;
 	          write_pack(&signal, req->signal, &(req->signalLength));
         }
@@ -3113,12 +3113,12 @@
 
         unifi_trace(priv, UDBG6, "<<%s\n", __FUNCTION__);
     } else {
-    	unifi_warning(priv, "%s is NOT applicable for interface mode - %d\n", __FUNCTION__,interfacePriv->interfaceMode);
+    	unifi_warning(priv, "%s is NOT applicable for interface mode - %d\n", __FUNCTION__, interfacePriv->interfaceMode);
     }
 #elif defined(UNIFI_DEBUG)
     /*WAPI Disabled*/
     unifi_priv_t *priv = (unifi_priv_t*)drvpriv;
-    unifi_error(priv,"CsrWifiRouterCtrlWapiRxPktReqHandler: called when WAPI isn't enabled\n");
+    unifi_error(priv, "CsrWifiRouterCtrlWapiRxPktReqHandler: called when WAPI isn't enabled\n");
 #endif
 }
 
@@ -3142,15 +3142,15 @@
         unifi_trace(priv, UDBG6, ">>%s\n", __FUNCTION__);
 
         if (priv == NULL) {
-            unifi_error(priv, "CsrWifiRouterCtrlWapiUnicastTxPktReqHandler : invalid priv\n",__FUNCTION__);
+            unifi_error(priv, "CsrWifiRouterCtrlWapiUnicastTxPktReqHandler : invalid priv\n", __FUNCTION__);
             return;
         }
         if (priv->smepriv == NULL) {
-            unifi_error(priv, "CsrWifiRouterCtrlWapiUnicastTxPktReqHandler : invalid sme priv\n",__FUNCTION__);
+            unifi_error(priv, "CsrWifiRouterCtrlWapiUnicastTxPktReqHandler : invalid sme priv\n", __FUNCTION__);
             return;
         }
         if (req->data == NULL) {
-            unifi_error(priv, "CsrWifiRouterCtrlWapiUnicastTxPktReqHandler: invalid request\n",__FUNCTION__);
+            unifi_error(priv, "CsrWifiRouterCtrlWapiUnicastTxPktReqHandler: invalid request\n", __FUNCTION__);
             return;
         } else {
             /* If it is QoS data (type = data subtype = QoS), frame header contains QoS control field */
@@ -3159,7 +3159,7 @@
             }
         }
         if ( !(req->dataLength>(macHeaderLengthInBytes+appendedCryptoFields)) ) {
-            unifi_error(priv, "CsrWifiRouterCtrlWapiUnicastTxPktReqHandler: invalid dataLength\n",__FUNCTION__);
+            unifi_error(priv, "CsrWifiRouterCtrlWapiUnicastTxPktReqHandler: invalid dataLength\n", __FUNCTION__);
             return;
         }
 
@@ -3174,7 +3174,7 @@
          */
         result = unifi_net_data_malloc(priv, &bulkdata.d[0], req->dataLength);
         if (result != CSR_RESULT_SUCCESS) {
-             unifi_error(priv, "CsrWifiRouterCtrlWapiUnicastTxPktReqHandler: Could not allocate net data\n",__FUNCTION__);
+             unifi_error(priv, "CsrWifiRouterCtrlWapiUnicastTxPktReqHandler: Could not allocate net data\n", __FUNCTION__);
              return;
         }
         memcpy((void*)bulkdata.d[0].os_data_ptr, req->data, req->dataLength);
@@ -3217,13 +3217,13 @@
 
     } else {
 
-    	unifi_warning(priv, "%s is NOT applicable for interface mode - %d\n", __FUNCTION__,interfacePriv->interfaceMode);
+    	unifi_warning(priv, "%s is NOT applicable for interface mode - %d\n", __FUNCTION__, interfacePriv->interfaceMode);
 
     }
 #elif defined(UNIFI_DEBUG)
     /*WAPI Disabled*/
     unifi_priv_t *priv = (unifi_priv_t*)drvpriv;
-    unifi_error(priv,"CsrWifiRouterCtrlWapiUnicastTxPktReqHandler: called when WAPI SW ENCRYPTION isn't enabled\n");
+    unifi_error(priv, "CsrWifiRouterCtrlWapiUnicastTxPktReqHandler: called when WAPI SW ENCRYPTION isn't enabled\n");
 #endif
 }
 
@@ -3240,14 +3240,14 @@
 
         unifi_trace(priv, UDBG6, ">>%s\n", __FUNCTION__);
 
-        unifi_trace(priv, UDBG1, "CsrWifiRouterCtrlWapiFilterReq: req->isWapiConnected [0/1] = %d \n",req->isWapiConnected);
+        unifi_trace(priv, UDBG1, "CsrWifiRouterCtrlWapiFilterReq: req->isWapiConnected [0/1] = %d \n", req->isWapiConnected);
 
         priv->isWapiConnection = req->isWapiConnected;
 
         unifi_trace(priv, UDBG6, "<<%s\n", __FUNCTION__);
     } else {
 
-    	unifi_warning(priv, "%s is NOT applicable for interface mode - %d\n", __FUNCTION__,interfacePriv->interfaceMode);
+    	unifi_warning(priv, "%s is NOT applicable for interface mode - %d\n", __FUNCTION__, interfacePriv->interfaceMode);
 
     }
 #endif
@@ -3255,6 +3255,6 @@
 #elif defined(UNIFI_DEBUG)
     /*WAPI Disabled*/
     unifi_priv_t *priv = (unifi_priv_t*)drvpriv;
-    unifi_error(priv,"CsrWifiRouterCtrlWapiFilterReq: called when WAPI isn't enabled\n");
+    unifi_error(priv, "CsrWifiRouterCtrlWapiFilterReq: called when WAPI isn't enabled\n");
 #endif
 }

diff --git a/drivers/staging/csr/sme_userspace.c b/drivers/staging/csr/sme_userspace.c
index abcb446..b919b00 100644
--- a/drivers/staging/csr/sme_userspace.c
+++ b/drivers/staging/csr/sme_userspace.c

@@ -118,7 +118,7 @@
 void
 uf_sme_deinit(unifi_priv_t *priv)
 {
-    int i,j;
+    int i, j;
     u8 ba_session_idx;
     ba_session_rx_struct *ba_session_rx = NULL;
     ba_session_tx_struct *ba_session_tx = NULL;
@@ -224,7 +224,7 @@
     if (CSR_WIFI_ROUTER_CTRL_PROTOCOL_DIRECTION_RX == direction)
     {
         u16 interfaceTag = 0;
-        CsrWifiRouterCtrlTrafficProtocolIndSend(priv->CSR_WIFI_SME_IFACEQUEUE,0,
+        CsrWifiRouterCtrlTrafficProtocolIndSend(priv->CSR_WIFI_SME_IFACEQUEUE, 0,
                 interfaceTag,
                 packet_type,
                 direction,

diff --git a/drivers/staging/csr/sme_wext.c b/drivers/staging/csr/sme_wext.c
index 4129a643..84f11cb 100644
--- a/drivers/staging/csr/sme_wext.c
+++ b/drivers/staging/csr/sme_wext.c

@@ -120,7 +120,7 @@
 #ifdef CSR_SUPPORT_WEXT_AP
 void uf_sme_wext_ap_set_defaults(unifi_priv_t *priv)
 {
-    memcpy(priv->ap_config.ssid.ssid,"defaultssid",sizeof("defaultssid"));
+    memcpy(priv->ap_config.ssid.ssid, "defaultssid", sizeof("defaultssid"));
 
     priv->ap_config.ssid.length = 8;
     priv->ap_config.channel = 6;
@@ -202,7 +202,7 @@
                                                     to enable different types of
                                                     devices to join us */
     priv->ap_mac_config.supportedRatesCount =
-           uf_configure_supported_rates(priv->ap_mac_config.supportedRates,priv->ap_mac_config.phySupportedBitmap);
+           uf_configure_supported_rates(priv->ap_mac_config.supportedRates, priv->ap_mac_config.phySupportedBitmap);
 }
 #endif
 /*
@@ -459,7 +459,7 @@
 {
     u8 int_str[7] = "0";
     u32 param_str_len;
-    u8  *param_str_begin,*param_str_end;
+    u8  *param_str_begin, *param_str_end;
     u8  *orig_str = *str_ptr;
 
     if (!strncmp(*str_ptr, token, strlen(token))) {
@@ -472,41 +472,41 @@
             param_str_end = *str_ptr-1;
             param_str_len = param_str_end - param_str_begin;
         }
-        unifi_trace(priv,UDBG2,"'token:%s', len:%d, ", token, param_str_len);
+        unifi_trace(priv, UDBG2, "'token:%s', len:%d, ", token, param_str_len);
         if (param_str_len > param_max_len) {
-            unifi_notice(priv,"extracted param len:%d is > MAX:%d\n",param_str_len, param_max_len);
+            unifi_notice(priv, "extracted param len:%d is > MAX:%d\n", param_str_len, param_max_len);
             param_str_len = param_max_len;
         }
         switch (param_type) {
             case PARAM_TYPE_INT:
             {
-                u32 *pdst_int = dst,num =0;
-                int i,j=0;
+                u32 *pdst_int = dst, num =0;
+                int i, j=0;
                 if (param_str_len > sizeof(int_str)) {
                     param_str_len = sizeof(int_str);
                 }
                 memcpy(int_str, param_str_begin, param_str_len);
                 for(i = param_str_len; i>0;i--) {
                     if(int_str[i-1] >= '0' && int_str[i-1] <='9') {
-                        num += ((int_str[i-1]-'0')*power(10,j));
+                        num += ((int_str[i-1]-'0')*power(10, j));
                         j++;
                     } else {
-                        unifi_error(priv,"decode_parameter_from_string:not a number %c\n",(int_str[i-1]));
+                        unifi_error(priv, "decode_parameter_from_string:not a number %c\n", (int_str[i-1]));
                         return -1;
                     }
                 }
                 *pdst_int = num;
-                unifi_trace(priv,UDBG2,"decode_parameter_from_string:decoded int = %d\n",*pdst_int);
+                unifi_trace(priv, UDBG2, "decode_parameter_from_string:decoded int = %d\n", *pdst_int);
             }
             break;
             default:
                 memcpy(dst, param_str_begin, param_str_len);
                 *((char *)dst + param_str_len) = 0;
-                unifi_trace(priv,UDBG2,"decode_parameter_from_string:decoded string = %s\n",(char *)dst);
+                unifi_trace(priv, UDBG2, "decode_parameter_from_string:decoded string = %s\n", (char *)dst);
             break;
         }
     } else {
-        unifi_error(priv,"decode_parameter_from_string: Token:%s not found in %s \n",token,orig_str);
+        unifi_error(priv, "decode_parameter_from_string: Token:%s not found in %s \n", token, orig_str);
         return -1;
     }
     return 0;
@@ -514,7 +514,7 @@
 static int store_ap_advanced_config_from_string(unifi_priv_t *priv, char *param_str)
 {
     char * str_ptr=param_str;
-    int ret = 0,tmp_var;
+    int ret = 0, tmp_var;
     char phy_mode[6];
     CsrWifiSmeApMacConfig * ap_mac_config = &priv->ap_mac_config;
 
@@ -522,36 +522,36 @@
     ret = decode_parameter_from_string(priv, &str_ptr, "BI=",
                                        PARAM_TYPE_INT, &tmp_var, 5);
     if(ret) {
-        unifi_error(priv,"store_ap_advanced_config_from_string: BI not found\n");
+        unifi_error(priv, "store_ap_advanced_config_from_string: BI not found\n");
         return -1;
     }
     ap_mac_config->beaconInterval = tmp_var;
     ret = decode_parameter_from_string(priv, &str_ptr, "DTIM_PER=",
                                         PARAM_TYPE_INT, &tmp_var, 5);
     if(ret) {
-        unifi_error(priv,"store_ap_advanced_config_from_string: DTIM_PER not found\n");
+        unifi_error(priv, "store_ap_advanced_config_from_string: DTIM_PER not found\n");
         return -1;
     }
     ap_mac_config->dtimPeriod = tmp_var;
     ret = decode_parameter_from_string(priv, &str_ptr, "WMM=",
                                         PARAM_TYPE_INT, &tmp_var, 5);
     if(ret) {
-        unifi_error(priv,"store_ap_advanced_config_from_string: WMM not found\n");
+        unifi_error(priv, "store_ap_advanced_config_from_string: WMM not found\n");
         return -1;
     }
     ap_mac_config->wmmEnabled = tmp_var;
     ret = decode_parameter_from_string(priv, &str_ptr, "PHY=",
                                         PARAM_TYPE_STRING, phy_mode, 5);
     if(ret) {
-        unifi_error(priv,"store_ap_advanced_config_from_string: PHY not found\n");
+        unifi_error(priv, "store_ap_advanced_config_from_string: PHY not found\n");
     } else {
-       if(strstr(phy_mode,"b")){
+       if(strstr(phy_mode, "b")){
            ap_mac_config->phySupportedBitmap = CSR_WIFI_SME_AP_PHY_SUPPORT_B;
        }
-       if(strstr(phy_mode,"g")) {
+       if(strstr(phy_mode, "g")) {
            ap_mac_config->phySupportedBitmap |= CSR_WIFI_SME_AP_PHY_SUPPORT_G;
        }
-       if(strstr(phy_mode,"n")) {
+       if(strstr(phy_mode, "n")) {
            ap_mac_config->phySupportedBitmap |= CSR_WIFI_SME_AP_PHY_SUPPORT_N;
        }
        ap_mac_config->supportedRatesCount =
@@ -560,39 +560,39 @@
     return ret;
 }
 
-static int store_ap_config_from_string( unifi_priv_t * priv,char *param_str)
+static int store_ap_config_from_string( unifi_priv_t * priv, char *param_str)
 
 {
     char *str_ptr = param_str;
     char sub_cmd[16];
     char sec[CSR_WIFI_MAX_SEC_LEN];
     char key[CSR_WIFI_MAX_KEY_LEN];
-    int ret = 0,tmp_var;
+    int ret = 0, tmp_var;
     CsrWifiSmeApConfig_t *ap_config = &priv->ap_config;
     CsrWifiSmeApMacConfig * ap_mac_config = &priv->ap_mac_config;
     memset(sub_cmd, 0, sizeof(sub_cmd));
-    if(!strstr(param_str,"END")) {
-        unifi_error(priv,"store_ap_config_from_string:Invalid config string:%s\n",param_str);
+    if(!strstr(param_str, "END")) {
+        unifi_error(priv, "store_ap_config_from_string:Invalid config string:%s\n", param_str);
         return -1;
     }
-    if (decode_parameter_from_string(priv,&str_ptr, "ASCII_CMD=",
+    if (decode_parameter_from_string(priv, &str_ptr, "ASCII_CMD=",
         PARAM_TYPE_STRING, sub_cmd, 6) != 0) {
          return -1;
     }
     if (strncmp(sub_cmd, "AP_CFG", 6)) {
 
-        if(!strncmp(sub_cmd ,"ADVCFG", 6)) {
+        if(!strncmp(sub_cmd , "ADVCFG", 6)) {
            return store_ap_advanced_config_from_string(priv, str_ptr);
         }
-        unifi_error(priv,"store_ap_config_from_string: sub_cmd:%s != 'AP_CFG or ADVCFG'!\n", sub_cmd);
+        unifi_error(priv, "store_ap_config_from_string: sub_cmd:%s != 'AP_CFG or ADVCFG'!\n", sub_cmd);
         return -1;
     }
     memset(ap_config, 0, sizeof(CsrWifiSmeApConfig_t));
-    ret = decode_parameter_from_string(priv,&str_ptr, "SSID=",
+    ret = decode_parameter_from_string(priv, &str_ptr, "SSID=",
                                        PARAM_TYPE_STRING, ap_config->ssid.ssid,
                                        CSR_WIFI_MAX_SSID_LEN);
     if(ret) {
-        unifi_error(priv,"store_ap_config_from_string: SSID not found\n");
+        unifi_error(priv, "store_ap_config_from_string: SSID not found\n");
         return -1;
     }
     ap_config->ssid.length = strlen(ap_config->ssid.ssid);
@@ -600,27 +600,27 @@
     ret = decode_parameter_from_string(priv, &str_ptr, "SEC=",
                                        PARAM_TYPE_STRING, sec, CSR_WIFI_MAX_SEC_LEN);
     if(ret) {
-        unifi_error(priv,"store_ap_config_from_string: SEC not found\n");
+        unifi_error(priv, "store_ap_config_from_string: SEC not found\n");
         return -1;
     }
-    ret = decode_parameter_from_string(priv,&str_ptr, "KEY=",
-                         PARAM_TYPE_STRING,  key, CSR_WIFI_MAX_KEY_LEN);
-    if(!strcasecmp(sec,"open")) {
-        unifi_trace(priv,UDBG2,"store_ap_config_from_string: security open");
+    ret = decode_parameter_from_string(priv, &str_ptr, "KEY=",
+                         PARAM_TYPE_STRING, key, CSR_WIFI_MAX_KEY_LEN);
+    if(!strcasecmp(sec, "open")) {
+        unifi_trace(priv, UDBG2, "store_ap_config_from_string: security open");
         ap_config->credentials.authType = CSR_WIFI_SME_AP_AUTH_TYPE_OPEN_SYSTEM;
         if(ret) {
-            unifi_notice(priv,"store_ap_config_from_string: KEY not found:fine with Open\n");
+            unifi_notice(priv, "store_ap_config_from_string: KEY not found:fine with Open\n");
         }
     }
-    else if(!strcasecmp(sec,"wpa2-psk")) {
-        int i,j=0;
+    else if(!strcasecmp(sec, "wpa2-psk")) {
+        int i, j=0;
         CsrWifiNmeApAuthPers *pers =
                             ((CsrWifiNmeApAuthPers *)&(ap_config->credentials.nmeAuthType.authTypePersonal));
         u8 *psk = pers->authPers_credentials.psk.psk;
 
-        unifi_trace(priv,UDBG2,"store_ap_config_from_string: security WPA2");
+        unifi_trace(priv, UDBG2, "store_ap_config_from_string: security WPA2");
         if(ret) {
-            unifi_error(priv,"store_ap_config_from_string: KEY not found for WPA2\n");
+            unifi_error(priv, "store_ap_config_from_string: KEY not found for WPA2\n");
             return -1;
         }
         ap_config->credentials.authType = CSR_WIFI_SME_AP_AUTH_TYPE_PERSONAL;
@@ -636,21 +636,21 @@
         }
 
     } else {
-       unifi_notice(priv,"store_ap_config_from_string: Unknown security: Assuming Open");
+       unifi_notice(priv, "store_ap_config_from_string: Unknown security: Assuming Open");
        ap_config->credentials.authType = CSR_WIFI_SME_AP_AUTH_TYPE_OPEN_SYSTEM;
        return -1;
     }
    /* Get the decoded value in a temp int variable to ensure that other fields within the struct
       which are of type other than int are not over written */
-    ret = decode_parameter_from_string(priv,&str_ptr, "CHANNEL=", PARAM_TYPE_INT, &tmp_var, 5);
+    ret = decode_parameter_from_string(priv, &str_ptr, "CHANNEL=", PARAM_TYPE_INT, &tmp_var, 5);
     if(ret)
         return -1;
     ap_config->channel = tmp_var;
-    ret = decode_parameter_from_string(priv,&str_ptr, "PREAMBLE=", PARAM_TYPE_INT, &tmp_var, 5);
+    ret = decode_parameter_from_string(priv, &str_ptr, "PREAMBLE=", PARAM_TYPE_INT, &tmp_var, 5);
     if(ret)
         return -1;
     ap_mac_config->preamble = tmp_var;
-    ret = decode_parameter_from_string(priv,&str_ptr, "MAX_SCB=", PARAM_TYPE_INT,  &tmp_var, 5);
+    ret = decode_parameter_from_string(priv, &str_ptr, "MAX_SCB=", PARAM_TYPE_INT, &tmp_var, 5);
     ap_config->max_connections = tmp_var;
     return ret;
 }
@@ -664,9 +664,9 @@
     int r;
 
     unifi_trace(priv, UDBG1, "iwprivsapstart\n" );
-    r = sme_ap_start(priv,interfacePriv->InterfaceTag,&priv->ap_config);
+    r = sme_ap_start(priv, interfacePriv->InterfaceTag, &priv->ap_config);
     if(r) {
-        unifi_error(priv,"iwprivsapstart AP START failed : %d\n",-r);
+        unifi_error(priv, "iwprivsapstart AP START failed : %d\n", -r);
     }
     return r;
 }
@@ -692,28 +692,28 @@
             return -EFAULT;
         }
         cfg_str[wrqu->data.length] = 0;
-        unifi_trace(priv,UDBG2,"length:%d\n",wrqu->data.length);
-        unifi_trace(priv,UDBG2,"AP configuration string:%s\n",cfg_str);
+        unifi_trace(priv, UDBG2, "length:%d\n", wrqu->data.length);
+        unifi_trace(priv, UDBG2, "AP configuration string:%s\n", cfg_str);
         str = cfg_str;
-       if ((r = store_ap_config_from_string(priv,str))) {
-           unifi_error(priv, "iwprivsapconfig:Failed  to decode the string %d\n",r);
+       if ((r = store_ap_config_from_string(priv, str))) {
+           unifi_error(priv, "iwprivsapconfig:Failed  to decode the string %d\n", r);
            kfree(cfg_str);
            return -EIO;
 
        }
     } else {
-        unifi_error(priv,"iwprivsapconfig argument length = 0 \n");
+        unifi_error(priv, "iwprivsapconfig argument length = 0 \n");
         return -EIO;
     }
     r = sme_ap_config(priv, &priv->ap_mac_config, &priv->group_sec_config);
     if(r) {
-        unifi_error(priv,"iwprivsapstop AP Config failed : %d\n",-r);
+        unifi_error(priv, "iwprivsapstop AP Config failed : %d\n", -r);
     } else if(interfacePriv->interfaceMode == CSR_WIFI_ROUTER_CTRL_MODE_AP ||
         interfacePriv->interfaceMode == CSR_WIFI_ROUTER_CTRL_MODE_P2PGO) {
         unifi_trace(priv, UDBG1, "iwprivsapconfig: Starting the AP");
-        r = sme_ap_start(priv,interfacePriv->InterfaceTag,&priv->ap_config);
+        r = sme_ap_start(priv, interfacePriv->InterfaceTag, &priv->ap_config);
         if(r) {
-            unifi_error(priv,"iwprivsapstart AP START failed : %d\n",-r);
+            unifi_error(priv, "iwprivsapstart AP START failed : %d\n", -r);
         }
     }
     kfree(cfg_str);
@@ -730,9 +730,9 @@
     u16 interface_tag = interfacePriv->InterfaceTag;
 
     unifi_trace(priv, UDBG1, "iwprivsapstop\n" );
-    r = sme_ap_stop(priv,interface_tag);
+    r = sme_ap_stop(priv, interface_tag);
     if(r) {
-        unifi_error(priv,"iwprivsapstop AP STOP failed : %d\n",-r);
+        unifi_error(priv, "iwprivsapstop AP STOP failed : %d\n", -r);
     }
     return r;
 }
@@ -778,14 +778,14 @@
             break;
         case CSR_WIFI_ROUTER_CTRL_MODE_AP:
         case CSR_WIFI_ROUTER_CTRL_MODE_P2PGO:
-            r = sme_ap_stop(priv,interface_tag);
+            r = sme_ap_stop(priv, interface_tag);
             break;
         default :
             break;
     }
 
     if(r) {
-        unifi_error(priv,"iwprivsstackstop Stack stop failed : %d\n",-r);
+        unifi_error(priv, "iwprivsstackstop Stack stop failed : %d\n", -r);
     }
     return 0;
 }
@@ -3167,7 +3167,7 @@
 #endif
 #ifdef CSR_SUPPORT_WEXT_AP
     { SIOCIWSAPCFGPRIV, IW_PRIV_TYPE_CHAR | 256, IW_PRIV_TYPE_NONE, "AP_SET_CFG" },
-    { SIOCIWSAPSTARTPRIV, 0,IW_PRIV_TYPE_CHAR | IW_PRIV_SIZE_FIXED|IWPRIV_SME_MAX_STRING,"AP_BSS_START" },
+    { SIOCIWSAPSTARTPRIV, 0, IW_PRIV_TYPE_CHAR | IW_PRIV_SIZE_FIXED|IWPRIV_SME_MAX_STRING, "AP_BSS_START" },
     { SIOCIWSAPSTOPPRIV, IW_PRIV_TYPE_CHAR |IW_PRIV_SIZE_FIXED|0,
       IW_PRIV_TYPE_CHAR |IW_PRIV_SIZE_FIXED|0, "AP_BSS_STOP" },
 #ifdef ANDROID_BUILD

diff --git a/drivers/staging/csr/ul_int.c b/drivers/staging/csr/ul_int.c
index 0fae6f4..eb286e5 100644
--- a/drivers/staging/csr/ul_int.c
+++ b/drivers/staging/csr/ul_int.c

@@ -258,7 +258,7 @@
         unifi_notice(priv, "ul_log_config_ind: wifi on in progress, suppress error\n");
     } else {
         /* wifi_off_ind (error or exit) */
-        CsrWifiRouterCtrlWifiOffIndSend(priv->CSR_WIFI_SME_IFACEQUEUE,0, (CsrWifiRouterCtrlControlIndication)(*conf_param));
+        CsrWifiRouterCtrlWifiOffIndSend(priv->CSR_WIFI_SME_IFACEQUEUE, 0, (CsrWifiRouterCtrlControlIndication)(*conf_param));
     }
 #ifdef CSR_WIFI_HIP_DEBUG_OFFLINE
     unifi_debug_buf_dump();

diff --git a/drivers/staging/csr/unifi_event.c b/drivers/staging/csr/unifi_event.c
index e81a998..71fdb21 100644
--- a/drivers/staging/csr/unifi_event.c
+++ b/drivers/staging/csr/unifi_event.c

@@ -105,7 +105,7 @@
     u8 isDataFrameSubTypeNoData = FALSE;
 
 #ifdef CSR_WIFI_SECURITY_WAPI_ENABLE
-    static const u8 wapiProtocolIdSNAPHeader[] = {0x88,0xb4};
+    static const u8 wapiProtocolIdSNAPHeader[] = {0x88, 0xb4};
     static const u8 wapiProtocolIdSNAPHeaderOffset = 6;
     u8 *destAddr;
     u8 *srcAddr;
@@ -206,7 +206,7 @@
                 unifi_trace(priv, UDBG4, "Discarding the contents of the frame with MIC failure \n");
 
                 if (isWapiUnicastPkt &&
-                    ((uf_sme_port_state(priv,srcAddr,UF_CONTROLLED_PORT_Q,interfaceTag) != CSR_WIFI_ROUTER_CTRL_PORT_ACTION_8021X_PORT_OPEN)||
+                    ((uf_sme_port_state(priv, srcAddr, UF_CONTROLLED_PORT_Q, interfaceTag) != CSR_WIFI_ROUTER_CTRL_PORT_ACTION_8021X_PORT_OPEN)||
 #ifndef CSR_WIFI_SECURITY_WAPI_SW_ENCRYPTION
                     (priv->wapi_unicast_filter) ||
 #endif
@@ -231,7 +231,7 @@
                 unifi_trace(priv, UDBG6, "check_routing_pkt_data_ind - MIC FAILURE : Dest Addr %x:%x:%x:%x:%x:%x\n",
                             destAddr[0], destAddr[1], destAddr[2], destAddr[3], destAddr[4], destAddr[5]);
                 unifi_trace(priv, UDBG6, "check_routing_pkt_data_ind - MIC FAILURE : Control Port State - 0x%.4X \n",
-                            uf_sme_port_state(priv,srcAddr,UF_CONTROLLED_PORT_Q,interfaceTag));
+                            uf_sme_port_state(priv, srcAddr, UF_CONTROLLED_PORT_Q, interfaceTag));
 
                 unifi_error(priv, "MIC failure in %s\n", __FUNCTION__);
 
@@ -285,9 +285,9 @@
 
         if (llcSnapHeaderOffset > 0) {
         	/* QoS data or Data */
-            unifi_trace(priv, UDBG6, "check_routing_pkt_data_ind(): SNAP header found & its offset %d\n",llcSnapHeaderOffset);
+            unifi_trace(priv, UDBG6, "check_routing_pkt_data_ind(): SNAP header found & its offset %d\n", llcSnapHeaderOffset);
             if (memcmp((u8 *)(bulkdata->d[0].os_data_ptr+llcSnapHeaderOffset+wapiProtocolIdSNAPHeaderOffset),
-                       wapiProtocolIdSNAPHeader,sizeof(wapiProtocolIdSNAPHeader))) {
+                       wapiProtocolIdSNAPHeader, sizeof(wapiProtocolIdSNAPHeader))) {
 
             	unifi_trace(priv, UDBG6, "check_routing_pkt_data_ind(): This is a data & NOT a WAI protocol packet\n");
                 /* On the first unicast data pkt that is decrypted successfully after re-keying, reset the filter */
@@ -584,14 +584,14 @@
     unifi_priv_t *priv = (unifi_priv_t*)ospriv;
 
     unifi_trace(priv, UDBG4, "rx_wq_handler: RdPtr = %d WritePtr =  %d\n",
-                priv->rxSignalBuffer.readPointer,priv->rxSignalBuffer.writePointer);
+                priv->rxSignalBuffer.readPointer, priv->rxSignalBuffer.writePointer);
     if(priv != NULL) {
         u8 readPointer = priv->rxSignalBuffer.readPointer;
         while (readPointer != priv->rxSignalBuffer.writePointer)
         {
              rx_buff_struct_t *buf = &priv->rxSignalBuffer.rx_buff[readPointer];
              unifi_trace(priv, UDBG6, "rx_wq_handler: RdPtr = %d WritePtr =  %d\n",
-                         readPointer,priv->rxSignalBuffer.writePointer);
+                         readPointer, priv->rxSignalBuffer.writePointer);
              unifi_process_receive_event(priv, buf->bufptr, buf->sig_len, &buf->data_ptrs);
              readPointer ++;
              if(readPointer >= priv->rxSignalBuffer.size) {
@@ -661,7 +661,7 @@
             CSR_GET_UINT16_FROM_LITTLE_ENDIAN((sigdata) + sizeof(s16)*6) & 0xFFFF,
             CSR_GET_UINT16_FROM_LITTLE_ENDIAN((sigdata) + sizeof(s16)*7) & 0xFFFF, siglen);
     if(signal_buffer_is_full(priv)) {
-        unifi_error(priv,"TO HOST signal queue FULL dropping the PDU\n");
+        unifi_error(priv, "TO HOST signal queue FULL dropping the PDU\n");
         for (i = 0; i < UNIFI_MAX_DATA_REFERENCES; i++) {
             if (bulkdata->d[i].data_length != 0) {
                 unifi_net_data_free(priv, (void *)&bulkdata->d[i]);
@@ -671,14 +671,14 @@
     }
     writePointer = priv->rxSignalBuffer.writePointer;
     rx_buff = &priv->rxSignalBuffer.rx_buff[writePointer];
-    memcpy(rx_buff->bufptr,sigdata,siglen);
+    memcpy(rx_buff->bufptr, sigdata, siglen);
     rx_buff->sig_len = siglen;
     rx_buff->data_ptrs = *bulkdata;
     writePointer++;
     if(writePointer >= priv->rxSignalBuffer.size) {
         writePointer =0;
     }
-    unifi_trace(priv, UDBG4, "unifi_receive_event:writePtr = %d\n",priv->rxSignalBuffer.writePointer);
+    unifi_trace(priv, UDBG4, "unifi_receive_event:writePtr = %d\n", priv->rxSignalBuffer.writePointer);
     priv->rxSignalBuffer.writePointer = writePointer;
 
 #ifndef CSR_WIFI_RX_PATH_SPLIT_DONT_USE_WQ

diff --git a/drivers/staging/csr/unifi_pdu_processing.c b/drivers/staging/csr/unifi_pdu_processing.c
index f9b421b..04fe9e2 100644
--- a/drivers/staging/csr/unifi_pdu_processing.c
+++ b/drivers/staging/csr/unifi_pdu_processing.c

@@ -38,7 +38,7 @@
     skb = (struct sk_buff*)bulkdata->d[0].os_net_buf_ptr;
     align_offset = (u32)(long)(bulkdata->d[0].os_data_ptr) & (CSR_WIFI_ALIGN_BYTES-1);
     if(align_offset){
-        skb_pull(skb,align_offset);
+        skb_pull(skb, align_offset);
     }
 
     buffered_pkt->bulkdata.os_data_ptr = bulkdata->d[0].os_data_ptr;
@@ -86,7 +86,7 @@
      */
     req->TransmissionControl = transmissionControl;
     req->VirtualInterfaceIdentifier =
-           uf_get_vif_identifier(interfacePriv->interfaceMode,interfaceTag);
+           uf_get_vif_identifier(interfacePriv->interfaceMode, interfaceTag);
     memcpy(req->Ra.x, peerMacAddress, ETH_ALEN);
 
     if (hostTag == 0xffffffff) {
@@ -124,8 +124,8 @@
 #define TRANSMISSION_CONTROL_EOSP_MASK 0x0002
 
 static
-int frame_and_send_queued_pdu(unifi_priv_t* priv,tx_buffered_packets_t* buffered_pkt,
-            CsrWifiRouterCtrlStaInfo_t *staRecord,u8 moreData , u8 eosp)
+int frame_and_send_queued_pdu(unifi_priv_t* priv, tx_buffered_packets_t* buffered_pkt,
+            CsrWifiRouterCtrlStaInfo_t *staRecord, u8 moreData , u8 eosp)
 {
 
     CSR_SIGNAL signal;
@@ -135,7 +135,7 @@
     u8 *qc;
     u16 *fc = (u16*)(buffered_pkt->bulkdata.os_data_ptr);
     unsigned long lock_flags;
-    unifi_trace(priv, UDBG3, "frame_and_send_queued_pdu with moreData: %d , EOSP: %d\n",moreData,eosp);
+    unifi_trace(priv, UDBG3, "frame_and_send_queued_pdu with moreData: %d , EOSP: %d\n", moreData, eosp);
     unifi_frame_ma_packet_req(priv, buffered_pkt->priority, buffered_pkt->rate, buffered_pkt->hostTag,
                buffered_pkt->interfaceTag, buffered_pkt->transmissionControl,
                buffered_pkt->leSenderProcessId, buffered_pkt->peerMacAddress.a, &signal);
@@ -156,7 +156,7 @@
 
     if((staRecord != NULL)&& (staRecord->wmmOrQosEnabled == TRUE))
     {
-        unifi_trace(priv, UDBG3, "frame_and_send_queued_pdu WMM Enabled: %d \n",staRecord->wmmOrQosEnabled);
+        unifi_trace(priv, UDBG3, "frame_and_send_queued_pdu WMM Enabled: %d \n", staRecord->wmmOrQosEnabled);
 
         toDs = (*fc & cpu_to_le16(IEEE802_11_FC_TO_DS_MASK))?1 : 0;
         fromDs = (*fc & cpu_to_le16(IEEE802_11_FC_FROM_DS_MASK))? 1: 0;
@@ -190,7 +190,7 @@
     }
     result = ul_send_signal_unpacked(priv, &signal, &bulkdata);
     if(result){
-        _update_buffered_pkt_params_after_alignment(priv, &bulkdata,buffered_pkt);
+        _update_buffered_pkt_params_after_alignment(priv, &bulkdata, buffered_pkt);
     }
 
  /* Decrement the packet counts queued in driver */
@@ -199,13 +199,13 @@
         if (!priv->noOfPktQueuedInDriver) {
             unifi_error(priv, "packets queued in driver 0 still decrementing\n");
         } else {
-            spin_lock_irqsave(&priv->tx_q_lock,lock_flags);
+            spin_lock_irqsave(&priv->tx_q_lock, lock_flags);
             priv->noOfPktQueuedInDriver--;
-            spin_unlock_irqrestore(&priv->tx_q_lock,lock_flags);
+            spin_unlock_irqrestore(&priv->tx_q_lock, lock_flags);
         }
         /* Sta Record is available for all unicast (except genericMgt Frames) & in other case its NULL */
         if (staRecord) {
-            spin_lock_irqsave(&priv->staRecord_lock,lock_flags);
+            spin_lock_irqsave(&priv->staRecord_lock, lock_flags);
             if (!staRecord->noOfPktQueued) {
                 unifi_error(priv, "packets queued in driver per station is 0 still decrementing\n");
             } else {
@@ -217,7 +217,7 @@
                     staRecord->nullDataHostTag = INVALID_HOST_TAG;
                 }
             }
-            spin_unlock_irqrestore(&priv->staRecord_lock,lock_flags);
+            spin_unlock_irqrestore(&priv->staRecord_lock, lock_flags);
         }
 
     }
@@ -243,24 +243,24 @@
 
     /* return the last node , and modify it. */
 
-    spin_lock_irqsave(&priv->tx_q_lock,lock_flags);
+    spin_lock_irqsave(&priv->tx_q_lock, lock_flags);
     list_for_each_prev_safe(listHead, placeHolder, txList) {
         tx_q_item = list_entry(listHead, tx_buffered_packets_t, q);
         tx_q_item->transmissionControl |= TRANSMISSION_CONTROL_EOSP_MASK;
         tx_q_item->transmissionControl = (tx_q_item->transmissionControl & ~(CSR_NO_CONFIRM_REQUIRED));
         unifi_trace(priv, UDBG1,
-                "set_eosp_transmit_ctrl Transmission Control = 0x%x hostTag = 0x%x \n",tx_q_item->transmissionControl,tx_q_item->hostTag);
-        unifi_trace(priv,UDBG3,"in set_eosp_transmit_ctrl no.of buffered frames %d\n",priv->noOfPktQueuedInDriver);
+                "set_eosp_transmit_ctrl Transmission Control = 0x%x hostTag = 0x%x \n", tx_q_item->transmissionControl, tx_q_item->hostTag);
+        unifi_trace(priv, UDBG3, "in set_eosp_transmit_ctrl no.of buffered frames %d\n", priv->noOfPktQueuedInDriver);
         break;
     }
-    spin_unlock_irqrestore(&priv->tx_q_lock,lock_flags);
-    unifi_trace(priv, UDBG1,"List Empty %d\n",list_empty(txList));
+    spin_unlock_irqrestore(&priv->tx_q_lock, lock_flags);
+    unifi_trace(priv, UDBG1, "List Empty %d\n", list_empty(txList));
     unifi_trace(priv, UDBG5, "leaving set_eosp_transmit_ctrl\n");
     return;
 }
 
 static
-void send_vif_availibility_rsp(unifi_priv_t *priv,CSR_VIF_IDENTIFIER vif,CSR_RESULT_CODE resultCode)
+void send_vif_availibility_rsp(unifi_priv_t *priv, CSR_VIF_IDENTIFIER vif, CSR_RESULT_CODE resultCode)
 {
     CSR_SIGNAL signal;
     CSR_MA_VIF_AVAILABILITY_RESPONSE *rsp;
@@ -269,7 +269,7 @@
 
     unifi_trace(priv, UDBG3, "send_vif_availibility_rsp : invoked with resultCode = %d \n", resultCode);
 
-    memset(&signal,0,sizeof(CSR_SIGNAL));
+    memset(&signal, 0, sizeof(CSR_SIGNAL));
     rsp = &signal.u.MaVifAvailabilityResponse;
     rsp->VirtualInterfaceIdentifier = vif;
     rsp->ResultCode = resultCode;
@@ -280,7 +280,7 @@
     /* Send the signal to UniFi */
     r = ul_send_signal_unpacked(priv, &signal, bulkdata);
     if(r) {
-        unifi_error(priv,"Availibility response sending failed %x status %d\n",vif,r);
+        unifi_error(priv, "Availibility response sending failed %x status %d\n", vif, r);
     }
     else {
         unifi_trace(priv, UDBG3, "send_vif_availibility_rsp : status = %d \n", r);
@@ -295,7 +295,7 @@
     unsigned long lock_flags;
     struct list_head *listHead, *list;
     struct list_head *placeHolder;
-    u8 i, j,eospFramedeleted=0;
+    u8 i, j, eospFramedeleted=0;
     u8 thresholdExcedeDueToBroadcast = TRUE;
     /* it will be made it interface Specific in the future when multi interfaces are supported ,
     right now interface 0 is considered */
@@ -311,10 +311,10 @@
              * packets for station record crossed the threshold limit (64 for AP supporting
              * 8 peers)
              */
-            unifi_trace(priv,UDBG3,"number of station pkts queued=  %d for sta id = %d\n", staInfo->noOfPktQueued, staInfo->aid);
+            unifi_trace(priv, UDBG3, "number of station pkts queued=  %d for sta id = %d\n", staInfo->noOfPktQueued, staInfo->aid);
             for(j = 0; j < MAX_ACCESS_CATOGORY; j++) {
                 list = &staInfo->dataPdu[j];
-                spin_lock_irqsave(&priv->tx_q_lock,lock_flags);
+                spin_lock_irqsave(&priv->tx_q_lock, lock_flags);
                 list_for_each_safe(listHead, placeHolder, list) {
                     tx_q_item = list_entry(listHead, tx_buffered_packets_t, q);
                     list_del(listHead);
@@ -339,7 +339,7 @@
                     }
                     break;
                 }
-                spin_unlock_irqrestore(&priv->tx_q_lock,lock_flags);
+                spin_unlock_irqrestore(&priv->tx_q_lock, lock_flags);
             }
         }
     }
@@ -347,13 +347,13 @@
         /* Remove the packets from genericMulticastOrBroadCastFrames queue
          * (the max packets in driver is reached due to broadcast/multicast frames)
          */
-        spin_lock_irqsave(&priv->tx_q_lock,lock_flags);
+        spin_lock_irqsave(&priv->tx_q_lock, lock_flags);
         list_for_each_safe(listHead, placeHolder, &interfacePriv->genericMulticastOrBroadCastFrames) {
             tx_q_item = list_entry(listHead, tx_buffered_packets_t, q);
             if(eospFramedeleted){
                 tx_q_item->transmissionControl |= TRANSMISSION_CONTROL_EOSP_MASK;
                 tx_q_item->transmissionControl = (tx_q_item->transmissionControl & ~(CSR_NO_CONFIRM_REQUIRED));
-                unifi_trace(priv, UDBG1,"updating eosp for next packet hostTag:= 0x%x ",tx_q_item->hostTag);
+                unifi_trace(priv, UDBG1, "updating eosp for next packet hostTag:= 0x%x ", tx_q_item->hostTag);
                 eospFramedeleted =0;
                 break;
             }
@@ -361,7 +361,7 @@
             if(tx_q_item->transmissionControl & TRANSMISSION_CONTROL_EOSP_MASK ){
                eospFramedeleted = 1;
             }
-            unifi_trace(priv,UDBG1, "freeing of multicast packets ToC = 0x%x hostTag = 0x%x \n",tx_q_item->transmissionControl,tx_q_item->hostTag);
+            unifi_trace(priv, UDBG1, "freeing of multicast packets ToC = 0x%x hostTag = 0x%x \n", tx_q_item->transmissionControl, tx_q_item->hostTag);
             list_del(listHead);
             unifi_net_data_free(priv, &tx_q_item->bulkdata);
             kfree(tx_q_item);
@@ -373,7 +373,7 @@
                 break;
             }
         }
-        spin_unlock_irqrestore(&priv->tx_q_lock,lock_flags);
+        spin_unlock_irqrestore(&priv->tx_q_lock, lock_flags);
     }
     unifi_trace(priv, UDBG3, "leaving verify_and_accomodate_tx_packet\n");
 }
@@ -391,13 +391,13 @@
 
     unifi_trace(priv, UDBG5, "entering enque_tx_data_pdu\n");
     if(!list) {
-       unifi_error(priv,"List is not specified\n");
+       unifi_error(priv, "List is not specified\n");
        return CSR_RESULT_FAILURE;
     }
 
     /* Removes aged packets & adds the incoming packet */
     if (priv->noOfPktQueuedInDriver >= CSR_WIFI_DRIVER_SUPPORT_FOR_MAX_PKT_QUEUEING) {
-        unifi_trace(priv,UDBG3,"number of pkts queued=  %d \n", priv->noOfPktQueuedInDriver);
+        unifi_trace(priv, UDBG3, "number of pkts queued=  %d \n", priv->noOfPktQueuedInDriver);
         verify_and_accomodate_tx_packet(priv);
     }
 
@@ -412,7 +412,7 @@
     }
 
     /* disable the preemption */
-    spin_lock_irqsave(&priv->tx_q_lock,lock_flags);
+    spin_lock_irqsave(&priv->tx_q_lock, lock_flags);
     INIT_LIST_HEAD(&tx_q_item->q);
     /* fill the tx_q structure members */
     tx_q_item->bulkdata.os_data_ptr = bulkdata->d[0].os_data_ptr;
@@ -437,7 +437,7 @@
 
     /* Count of packet queued in driver */
     priv->noOfPktQueuedInDriver++;
-    spin_unlock_irqrestore(&priv->tx_q_lock,lock_flags);
+    spin_unlock_irqrestore(&priv->tx_q_lock, lock_flags);
     unifi_trace(priv, UDBG5, "leaving enque_tx_data_pdu\n");
     return CSR_RESULT_SUCCESS;
 }
@@ -655,13 +655,13 @@
     }
 
     if (handle != CSR_WIFI_BROADCAST_OR_MULTICAST_HANDLE) {
-        spin_lock_irqsave(&priv->staRecord_lock,lock_flags);
+        spin_lock_irqsave(&priv->staRecord_lock, lock_flags);
         if ((staRecord = ((CsrWifiRouterCtrlStaInfo_t *) (interfacePriv->staInfo[handle]))) == NULL) {
-            spin_unlock_irqrestore(&priv->staRecord_lock,lock_flags);
+            spin_unlock_irqrestore(&priv->staRecord_lock, lock_flags);
             unifi_warning(priv, "uf_handle_tim_cfm: station record is NULL  handle = %x\n", handle);
             return;
         }
-       spin_unlock_irqrestore(&priv->staRecord_lock,lock_flags);
+       spin_unlock_irqrestore(&priv->staRecord_lock, lock_flags);
     }
     switch(timSetStatus)
     {
@@ -909,13 +909,13 @@
                    (u8*)&signal.SignalPrimitiveHeader.SenderProcessId);
 
     /* set The virtual interfaceIdentifier, aid, tim value */
-    req->VirtualInterfaceIdentifier = uf_get_vif_identifier(interfacePriv->interfaceMode,interfaceTag);
+    req->VirtualInterfaceIdentifier = uf_get_vif_identifier(interfacePriv->interfaceMode, interfaceTag);
     req->AssociationId = aid;
     req->TimValue = setTim;
 
 
     unifi_trace(priv, UDBG2, "update_tim:AID %x,senderIdLsb = 0x%x, handle = 0x%x, timSetStatus = %x, sender proceesID = %x \n",
-                aid,senderIdLsb, handle, timSetStatus, signal.SignalPrimitiveHeader.SenderProcessId);
+                aid, senderIdLsb, handle, timSetStatus, signal.SignalPrimitiveHeader.SenderProcessId);
 
     /* Send the signal to UniFi */
     r = ul_send_signal_unpacked(priv, &signal, bulkdata);
@@ -953,17 +953,17 @@
                                     CsrWifiRouterCtrlStaInfo_t *staRecord,
                                     u16 interfaceTag)
 {
-    int r,i;
-    u8 spaceAvail[4] = {TRUE,TRUE,TRUE,TRUE};
+    int r, i;
+    u8 spaceAvail[4] = {TRUE, TRUE, TRUE, TRUE};
     tx_buffered_packets_t * buffered_pkt = NULL;
     unsigned long lock_flags;
     netInterface_priv_t *interfacePriv = priv->interfacePriv[interfaceTag];
 
     unifi_trace(priv, UDBG5, "entering process_peer_active_transition\n");
 
-    if(IS_DTIM_ACTIVE(interfacePriv->dtimActive,interfacePriv->multicastPduHostTag)) {
+    if(IS_DTIM_ACTIVE(interfacePriv->dtimActive, interfacePriv->multicastPduHostTag)) {
         /* giving more priority to multicast packets so delaying unicast packets*/
-        unifi_trace(priv,UDBG2, "Multicast transmission is going on so resume unicast transmission after DTIM over\n");
+        unifi_trace(priv, UDBG2, "Multicast transmission is going on so resume unicast transmission after DTIM over\n");
 
         /* As station is active now, even though AP is not able to send frames to it
          * because of DTIM, it needs to reset the TIM here
@@ -987,12 +987,12 @@
     while((buffered_pkt=dequeue_tx_data_pdu(priv, &staRecord->mgtFrames))) {
         buffered_pkt->transmissionControl &=
                      ~(TRANSMISSION_CONTROL_TRIGGER_MASK|TRANSMISSION_CONTROL_EOSP_MASK);
-        if((r=frame_and_send_queued_pdu(priv,buffered_pkt,staRecord,0,FALSE)) == -ENOSPC) {
+        if((r=frame_and_send_queued_pdu(priv, buffered_pkt, staRecord, 0, FALSE)) == -ENOSPC) {
             unifi_trace(priv, UDBG2, "p_p_a_t:(ENOSPC) Mgt Frame queueing \n");
             /* Enqueue at the head of the queue */
-            spin_lock_irqsave(&priv->tx_q_lock,lock_flags);
+            spin_lock_irqsave(&priv->tx_q_lock, lock_flags);
             list_add(&buffered_pkt->q, &staRecord->mgtFrames);
-            spin_unlock_irqrestore(&priv->tx_q_lock,lock_flags);
+            spin_unlock_irqrestore(&priv->tx_q_lock, lock_flags);
             priv->pausedStaHandle[3]=(u8)(staRecord->assignedHandle);
             spaceAvail[3] = FALSE;
             break;
@@ -1008,7 +1008,7 @@
     if (!staRecord->timRequestPendingFlag) {
         if (staRecord->txSuspend) {
             if(staRecord->timSet == CSR_WIFI_TIM_SET) {
-                update_tim(priv,staRecord->aid,0,interfaceTag, staRecord->assignedHandle);
+                update_tim(priv, staRecord->aid, 0, interfaceTag, staRecord->assignedHandle);
             }
             return;
         }
@@ -1025,16 +1025,16 @@
     for(i=3;i>=0;i--) {
         if(!spaceAvail[i])
             continue;
-        unifi_trace(priv, UDBG6, "p_p_a_t:data pkt sending for AC %d \n",i);
+        unifi_trace(priv, UDBG6, "p_p_a_t:data pkt sending for AC %d \n", i);
         while((buffered_pkt=dequeue_tx_data_pdu(priv, &staRecord->dataPdu[i]))) {
            buffered_pkt->transmissionControl &=
                       ~(TRANSMISSION_CONTROL_TRIGGER_MASK|TRANSMISSION_CONTROL_EOSP_MASK);
-           if((r=frame_and_send_queued_pdu(priv,buffered_pkt,staRecord,0,FALSE)) == -ENOSPC) {
+           if((r=frame_and_send_queued_pdu(priv, buffered_pkt, staRecord, 0, FALSE)) == -ENOSPC) {
                /* Clear the trigger bit transmission control*/
                /* Enqueue at the head of the queue */
-               spin_lock_irqsave(&priv->tx_q_lock,lock_flags);
+               spin_lock_irqsave(&priv->tx_q_lock, lock_flags);
                list_add(&buffered_pkt->q, &staRecord->dataPdu[i]);
-               spin_unlock_irqrestore(&priv->tx_q_lock,lock_flags);
+               spin_unlock_irqrestore(&priv->tx_q_lock, lock_flags);
                priv->pausedStaHandle[i]=(u8)(staRecord->assignedHandle);
                break;
            } else {
@@ -1050,7 +1050,7 @@
     if (!staRecord->timRequestPendingFlag){
         if((staRecord->timSet  == CSR_WIFI_TIM_SET) || (staRecord->timSet  == CSR_WIFI_TIM_SETTING)) {
             unifi_trace(priv, UDBG3, "p_p_a_t:resetting tim .....\n");
-            update_tim(priv,staRecord->aid,0,interfaceTag, staRecord->assignedHandle);
+            update_tim(priv, staRecord->aid, 0, interfaceTag, staRecord->assignedHandle);
         }
     }
     else
@@ -1067,7 +1067,7 @@
 
 
 
-void uf_process_ma_pkt_cfm_for_ap(unifi_priv_t *priv,u16 interfaceTag, const CSR_MA_PACKET_CONFIRM *pkt_cfm)
+void uf_process_ma_pkt_cfm_for_ap(unifi_priv_t *priv, u16 interfaceTag, const CSR_MA_PACKET_CONFIRM *pkt_cfm)
 {
     netInterface_priv_t *interfacePriv;
     u8 i;
@@ -1076,16 +1076,16 @@
 
 
     if(pkt_cfm->HostTag == interfacePriv->multicastPduHostTag) {
-         unifi_trace(priv,UDBG2,"CFM for marked Multicast Tag = %x\n",interfacePriv->multicastPduHostTag);
+         unifi_trace(priv, UDBG2, "CFM for marked Multicast Tag = %x\n", interfacePriv->multicastPduHostTag);
          interfacePriv->multicastPduHostTag = 0xffffffff;
-         resume_suspended_uapsd(priv,interfaceTag);
-         resume_unicast_buffered_frames(priv,interfaceTag);
+         resume_suspended_uapsd(priv, interfaceTag);
+         resume_unicast_buffered_frames(priv, interfaceTag);
          if(list_empty(&interfacePriv->genericMulticastOrBroadCastMgtFrames) &&
               list_empty(&interfacePriv->genericMulticastOrBroadCastFrames)) {
-            unifi_trace(priv,UDBG1,"Resetting multicastTIM");
+            unifi_trace(priv, UDBG1, "Resetting multicastTIM");
             if (!interfacePriv->bcTimSetReqPendingFlag)
             {
-                update_tim(priv,0,CSR_WIFI_TIM_RESET,interfaceTag, 0xFFFFFFFF);
+                update_tim(priv, 0, CSR_WIFI_TIM_RESET, interfaceTag, 0xFFFFFFFF);
             }
             else
             {
@@ -1164,7 +1164,7 @@
                                                                  &send_cfm_list,
                                                                  &(staRecord->dataPdu[j]));
 
-                        uf_flush_list(priv,&(staRecord->dataPdu[j]));
+                        uf_flush_list(priv, &(staRecord->dataPdu[j]));
                     }
 
                     send_auto_ma_packet_confirm(priv, staRecord->interfacePriv, &send_cfm_list);
@@ -1469,7 +1469,7 @@
     }
 
     /* prepare the complete skb, by pushing the MAC header to the beginning of the skb->data */
-    unifi_trace(priv, UDBG5, "updated Mac Header: %d \n",macHeaderLengthInBytes);
+    unifi_trace(priv, UDBG5, "updated Mac Header: %d \n", macHeaderLengthInBytes);
     memcpy(bufPtr, macHeaderBuf, macHeaderLengthInBytes);
 
     unifi_trace(priv, UDBG5, "leaving the update_macheader function\n");
@@ -1515,7 +1515,7 @@
     CsrWifiRouterCtrlStaInfo_t *dstStaInfo = NULL;
     netInterface_priv_t *interfacePriv;
 
-    unifi_trace(priv, UDBG5, "entering  uf_ap_process_data_pdu %d\n",macHeaderLengthInBytes);
+    unifi_trace(priv, UDBG5, "entering  uf_ap_process_data_pdu %d\n", macHeaderLengthInBytes);
     /* InterfaceTag validation from MA_PACKET.indication */
     if (interfaceTag >= CSR_WIFI_NUM_INTERFACES) {
         unifi_trace(priv, UDBG1, "Interface Tag is Invalid in uf_ap_process_data_pdu\n");
@@ -1608,7 +1608,7 @@
     unifi_trace(priv, UDBG3, "Mac Header updated...calling uf_process_ma_packet_req \n");
 
     /* Packet is ready to send to unifi ,transmissionControl = 0x0004, confirmation is not needed for data packets */
-    if (uf_process_ma_packet_req(priv,  ehdr->h_dest, 0xffffffff, interfaceTag, CSR_NO_CONFIRM_REQUIRED, (CSR_RATE)0,priority, priv->netdev_client->sender_id, bulkdata)) {
+    if (uf_process_ma_packet_req(priv, ehdr->h_dest, 0xffffffff, interfaceTag, CSR_NO_CONFIRM_REQUIRED, (CSR_RATE)0, priority, priv->netdev_client->sender_id, bulkdata)) {
         if (sendToNetdev) {
             unifi_trace(priv, UDBG1, "In uf_ap_process_data_pdu, (Packet Drop) uf_process_ma_packet_req failed. freeing skb_copy data (original data sent to Netdev)\n");
             /*  Free's the skb_copy(skbPtr) data since packet processing failed */
@@ -1750,7 +1750,7 @@
                         /* push the packet to the unifi if list is empty (if packet lost how to re-enque) */
                         if (list_empty(&interfacePriv->genericMgtFrames)) {
 #ifdef CSR_SUPPORT_SME
-                            if(!(IS_DTIM_ACTIVE(interfacePriv->dtimActive,interfacePriv->multicastPduHostTag))) {
+                            if(!(IS_DTIM_ACTIVE(interfacePriv->dtimActive, interfacePriv->multicastPduHostTag))) {
 #endif
 
                             unifi_trace(priv, UDBG3, "genericMgtFrames list is empty uf_process_ma_packet_req \n");
@@ -1765,8 +1765,8 @@
 #ifdef CSR_SUPPORT_SME
                             }else{
                                 list = &interfacePriv->genericMgtFrames;
-                                unifi_trace(priv, UDBG3, "genericMgtFrames queue empty and dtim started\n hosttag is 0x%x,\n",signal.u.MaPacketRequest.HostTag);
-                                update_eosp_to_head_of_broadcast_list_head(priv,interfaceTag);
+                                unifi_trace(priv, UDBG3, "genericMgtFrames queue empty and dtim started\n hosttag is 0x%x,\n", signal.u.MaPacketRequest.HostTag);
+                                update_eosp_to_head_of_broadcast_list_head(priv, interfaceTag);
                            }
 #endif
                         } else {
@@ -1776,15 +1776,15 @@
                         }
                     } else {
                         /* check peer power state */
-                        if (queuePacketDozing || !list_empty(&staRecord->mgtFrames) || IS_DTIM_ACTIVE(interfacePriv->dtimActive,interfacePriv->multicastPduHostTag)) {
+                        if (queuePacketDozing || !list_empty(&staRecord->mgtFrames) || IS_DTIM_ACTIVE(interfacePriv->dtimActive, interfacePriv->multicastPduHostTag)) {
                             /* peer is in dozing mode, so queue packet in mgt frame list of station record */
                            /*if multicast traffic is going on, buffer the unicast packets*/
                             list = &staRecord->mgtFrames;
 
                             unifi_trace(priv, UDBG1, "staRecord->MgtFrames list empty? = %s, handle = %d, queuePacketDozing = %d\n",
                                         (list_empty(&staRecord->mgtFrames))? "YES": "NO", staRecord->assignedHandle, queuePacketDozing);
-                            if(IS_DTIM_ACTIVE(interfacePriv->dtimActive,interfacePriv->multicastPduHostTag)){
-                                update_eosp_to_head_of_broadcast_list_head(priv,interfaceTag);
+                            if(IS_DTIM_ACTIVE(interfacePriv->dtimActive, interfacePriv->multicastPduHostTag)){
+                                update_eosp_to_head_of_broadcast_list_head(priv, interfaceTag);
                             }
 
                         } else {
@@ -1794,7 +1794,7 @@
                                 /* requeue the failed packet to staRecord->mgtFrames with same position */
                                 list = &staRecord->mgtFrames;
                                 requeueOnSamePos = TRUE;
-                                unifi_trace(priv, UDBG1, "(ENOSPC) Sending MgtFrames Failed handle = %d so buffering\n",staRecord->assignedHandle);
+                                unifi_trace(priv, UDBG1, "(ENOSPC) Sending MgtFrames Failed handle = %d so buffering\n", staRecord->assignedHandle);
                                 priv->pausedStaHandle[0]=(u8)(staRecord->assignedHandle);
                             } else if (result) {
                                 status = CSR_RESULT_FAILURE;
@@ -1827,11 +1827,11 @@
                     if(!staRecord) {
                         unifi_error(priv, "In %s unicast but staRecord = NULL\n", __FUNCTION__);
                         return CSR_RESULT_FAILURE;
-                    } else if (queuePacketDozing || isRouterBufferEnabled(priv,priority_q)|| !list_empty(&staRecord->dataPdu[priority_q]) || IS_DTIM_ACTIVE(interfacePriv->dtimActive,interfacePriv->multicastPduHostTag)) {
+                    } else if (queuePacketDozing || isRouterBufferEnabled(priv, priority_q)|| !list_empty(&staRecord->dataPdu[priority_q]) || IS_DTIM_ACTIVE(interfacePriv->dtimActive, interfacePriv->multicastPduHostTag)) {
                         /* peer is in dozing mode, so queue packet in mgt frame list of station record */
                         /* if multicast traffic is going on, buffet the unicast packets */
                         unifi_trace(priv, UDBG2, "Enqueued to staRecord->dataPdu[%d] queuePacketDozing=%d,\
-                                Buffering enabled = %d \n", priority_q,queuePacketDozing,isRouterBufferEnabled(priv,priority_q));
+                                Buffering enabled = %d \n", priority_q, queuePacketDozing, isRouterBufferEnabled(priv, priority_q));
                         list = &staRecord->dataPdu[priority_q];
                     } else {
                         unifi_trace(priv, UDBG5, "staRecord->dataPdu[%d] list is empty uf_process_ma_packet_req \n", priority_q);
@@ -1839,12 +1839,12 @@
                         result = ul_send_signal_unpacked(priv, &signal, bulkdata);
                         if(result == -ENOSPC) {
                             /* requeue the failed packet to staRecord->dataPdu[priority_q] with same position */
-                            unifi_trace(priv, UDBG1, "(ENOSPC) Sending Unicast DataPDU to queue %d Failed so buffering\n",priority_q);
+                            unifi_trace(priv, UDBG1, "(ENOSPC) Sending Unicast DataPDU to queue %d Failed so buffering\n", priority_q);
                             requeueOnSamePos = TRUE;
                             list = &staRecord->dataPdu[priority_q];
                             priv->pausedStaHandle[priority_q]=(u8)(staRecord->assignedHandle);
-                            if(!isRouterBufferEnabled(priv,priority_q)) {
-                                unifi_error(priv,"Buffering Not enabled for queue %d \n",priority_q);
+                            if(!isRouterBufferEnabled(priv, priority_q)) {
+                                unifi_error(priv, "Buffering Not enabled for queue %d \n", priority_q);
                             }
                         } else if (result) {
                             status = CSR_RESULT_FAILURE;
@@ -1869,19 +1869,19 @@
             unifi_error(priv, "unrecognized frame type\n");
     }
     if(list) {
-        status = enque_tx_data_pdu(priv, bulkdata,list, &signal,requeueOnSamePos);
+        status = enque_tx_data_pdu(priv, bulkdata, list, &signal, requeueOnSamePos);
         /* Record no. of packet queued for each peer */
         if (staRecord && (pktType == CSR_WIFI_UNICAST_PDU) && (!status)) {
-            spin_lock_irqsave(&priv->staRecord_lock,lock_flags);
+            spin_lock_irqsave(&priv->staRecord_lock, lock_flags);
             staRecord->noOfPktQueued++;
-            spin_unlock_irqrestore(&priv->staRecord_lock,lock_flags);
+            spin_unlock_irqrestore(&priv->staRecord_lock, lock_flags);
         }
         else if ((pktType == CSR_WIFI_MULTICAST_PDU) && (!status))
         {
             /* If broadcast Tim is set && queuing is successful, then only update TIM */
-            spin_lock_irqsave(&priv->staRecord_lock,lock_flags);
+            spin_lock_irqsave(&priv->staRecord_lock, lock_flags);
             interfacePriv->noOfbroadcastPktQueued++;
-            spin_unlock_irqrestore(&priv->staRecord_lock,lock_flags);
+            spin_unlock_irqrestore(&priv->staRecord_lock, lock_flags);
         }
     }
     /* If broadcast Tim is set && queuing is successful, then only update TIM */
@@ -1889,7 +1889,7 @@
         unifi_trace(priv, UDBG3, "tim set due to broadcast pkt\n");
         if (!interfacePriv->bcTimSetReqPendingFlag)
         {
-            update_tim(priv,0,CSR_WIFI_TIM_SET,interfaceTag, handle);
+            update_tim(priv, 0, CSR_WIFI_TIM_SET, interfaceTag, handle);
         }
         else
         {
@@ -1909,7 +1909,7 @@
                    !list_empty(&staRecord->dataPdu[UNIFI_TRAFFIC_Q_CONTENTION])) {
                     unifi_trace(priv, UDBG3, "tim set due to unicast pkt & peer in powersave\n");
                     if (!staRecord->timRequestPendingFlag){
-                        update_tim(priv,staRecord->aid,1,interfaceTag, handle);
+                        update_tim(priv, staRecord->aid, 1, interfaceTag, handle);
                     }
                     else
                     {
@@ -1929,7 +1929,7 @@
                 if (uf_is_more_data_for_non_delivery_ac(staRecord) || (allDeliveryEnabled && dataAvailable)
                     || (!list_empty(&staRecord->mgtFrames))) {
                     if (!staRecord->timRequestPendingFlag) {
-                        update_tim(priv,staRecord->aid,1,interfaceTag, handle);
+                        update_tim(priv, staRecord->aid, 1, interfaceTag, handle);
                     }
                     else
                     {
@@ -1945,8 +1945,8 @@
         }
     }
 
-    if((list) && (pktType == CSR_WIFI_UNICAST_PDU && !queuePacketDozing) && !(isRouterBufferEnabled(priv,priority_q)) && !(IS_DTIM_ACTIVE(interfacePriv->dtimActive,interfacePriv->multicastPduHostTag))) {
-        unifi_trace(priv, UDBG2, "buffering cleared for queue = %d So resending buffered frames\n",priority_q);
+    if((list) && (pktType == CSR_WIFI_UNICAST_PDU && !queuePacketDozing) && !(isRouterBufferEnabled(priv, priority_q)) && !(IS_DTIM_ACTIVE(interfacePriv->dtimActive, interfacePriv->multicastPduHostTag))) {
+        unifi_trace(priv, UDBG2, "buffering cleared for queue = %d So resending buffered frames\n", priority_q);
         uf_send_buffered_frames(priv, priority_q);
     }
     unifi_trace(priv, UDBG5, "leaving uf_process_ma_packet_req \n");
@@ -2022,23 +2022,23 @@
     netInterface_priv_t *interfacePriv = priv->interfacePriv[interfaceTag];
     u32 hostTag = 0xffffffff;
 
-    if(!isRouterBufferEnabled(priv,UNIFI_TRAFFIC_Q_VO)) {
-        while((interfacePriv->dtimActive)&& (buffered_pkt=dequeue_tx_data_pdu(priv,&interfacePriv->genericMulticastOrBroadCastMgtFrames))) {
+    if(!isRouterBufferEnabled(priv, UNIFI_TRAFFIC_Q_VO)) {
+        while((interfacePriv->dtimActive)&& (buffered_pkt=dequeue_tx_data_pdu(priv, &interfacePriv->genericMulticastOrBroadCastMgtFrames))) {
             buffered_pkt->transmissionControl |= (TRANSMISSION_CONTROL_TRIGGER_MASK);
             moreData = (buffered_pkt->transmissionControl & TRANSMISSION_CONTROL_EOSP_MASK)?FALSE:TRUE;
 
 
-            unifi_trace(priv,UDBG2,"DTIM Occurred for interface:sending Mgt packet %d\n",interfaceTag);
+            unifi_trace(priv, UDBG2, "DTIM Occurred for interface:sending Mgt packet %d\n", interfaceTag);
 
-            if((r=frame_and_send_queued_pdu(priv,buffered_pkt,NULL,moreData,FALSE)) == -ENOSPC) {
-               unifi_trace(priv,UDBG1,"frame_and_send_queued_pdu failed with ENOSPC for host tag = %x\n", buffered_pkt->hostTag);
+            if((r=frame_and_send_queued_pdu(priv, buffered_pkt, NULL, moreData, FALSE)) == -ENOSPC) {
+               unifi_trace(priv, UDBG1, "frame_and_send_queued_pdu failed with ENOSPC for host tag = %x\n", buffered_pkt->hostTag);
                /* Enqueue at the head of the queue */
-               spin_lock_irqsave(&priv->tx_q_lock,lock_flags);
+               spin_lock_irqsave(&priv->tx_q_lock, lock_flags);
                list_add(&buffered_pkt->q, &interfacePriv->genericMulticastOrBroadCastMgtFrames);
-               spin_unlock_irqrestore(&priv->tx_q_lock,lock_flags);
+               spin_unlock_irqrestore(&priv->tx_q_lock, lock_flags);
                break;
             } else {
-                unifi_trace(priv,UDBG1,"send_multicast_frames: Send genericMulticastOrBroadCastMgtFrames (%x, %x)\n",
+                unifi_trace(priv, UDBG1, "send_multicast_frames: Send genericMulticastOrBroadCastMgtFrames (%x, %x)\n",
                                         buffered_pkt->hostTag,
                                         r);
                 if(r) {
@@ -2051,35 +2051,35 @@
                         hostTag = buffered_pkt->hostTag;
                         pduSent++;
                     } else {
-                        send_vif_availibility_rsp(priv,uf_get_vif_identifier(interfacePriv->interfaceMode,interfaceTag),CSR_RC_UNSPECIFIED_FAILURE);
+                        send_vif_availibility_rsp(priv, uf_get_vif_identifier(interfacePriv->interfaceMode, interfaceTag), CSR_RC_UNSPECIFIED_FAILURE);
                     }
                 }
                 /* Buffered frame sent successfully */
-                spin_lock_irqsave(&priv->staRecord_lock,lock_flags);
+                spin_lock_irqsave(&priv->staRecord_lock, lock_flags);
                 interfacePriv->noOfbroadcastPktQueued--;
-                spin_unlock_irqrestore(&priv->staRecord_lock,lock_flags);
+                spin_unlock_irqrestore(&priv->staRecord_lock, lock_flags);
                 kfree(buffered_pkt);
            }
 
         }
     }
-    if(!isRouterBufferEnabled(priv,UNIFI_TRAFFIC_Q_CONTENTION)) {
-        while((interfacePriv->dtimActive)&& (buffered_pkt=dequeue_tx_data_pdu(priv,&interfacePriv->genericMulticastOrBroadCastFrames))) {
+    if(!isRouterBufferEnabled(priv, UNIFI_TRAFFIC_Q_CONTENTION)) {
+        while((interfacePriv->dtimActive)&& (buffered_pkt=dequeue_tx_data_pdu(priv, &interfacePriv->genericMulticastOrBroadCastFrames))) {
             buffered_pkt->transmissionControl |= TRANSMISSION_CONTROL_TRIGGER_MASK;
             moreData = (buffered_pkt->transmissionControl & TRANSMISSION_CONTROL_EOSP_MASK)?FALSE:TRUE;
 
 
-            if((r=frame_and_send_queued_pdu(priv,buffered_pkt,NULL,moreData,FALSE)) == -ENOSPC) {
+            if((r=frame_and_send_queued_pdu(priv, buffered_pkt, NULL, moreData, FALSE)) == -ENOSPC) {
                 /* Clear the trigger bit transmission control*/
                 buffered_pkt->transmissionControl &= ~(TRANSMISSION_CONTROL_TRIGGER_MASK);
                 /* Enqueue at the head of the queue */
-                spin_lock_irqsave(&priv->tx_q_lock,lock_flags);
+                spin_lock_irqsave(&priv->tx_q_lock, lock_flags);
                 list_add(&buffered_pkt->q, &interfacePriv->genericMulticastOrBroadCastFrames);
-                spin_unlock_irqrestore(&priv->tx_q_lock,lock_flags);
+                spin_unlock_irqrestore(&priv->tx_q_lock, lock_flags);
                 break;
             } else {
                 if(r) {
-                    unifi_trace(priv,UDBG1,"send_multicast_frames: Send genericMulticastOrBroadCastFrame failed (%x, %x)\n",
+                    unifi_trace(priv, UDBG1, "send_multicast_frames: Send genericMulticastOrBroadCastFrame failed (%x, %x)\n",
                                             buffered_pkt->hostTag,
                                             r);
                     unifi_net_data_free(priv, &buffered_pkt->bulkdata);
@@ -2090,26 +2090,26 @@
                         pduSent ++;
                         hostTag = buffered_pkt->hostTag;
                     } else {
-                        send_vif_availibility_rsp(priv,uf_get_vif_identifier(interfacePriv->interfaceMode,interfaceTag),CSR_RC_UNSPECIFIED_FAILURE);
+                        send_vif_availibility_rsp(priv, uf_get_vif_identifier(interfacePriv->interfaceMode, interfaceTag), CSR_RC_UNSPECIFIED_FAILURE);
                     }
                 }
                 /* Buffered frame sent successfully */
-                spin_lock_irqsave(&priv->staRecord_lock,lock_flags);
+                spin_lock_irqsave(&priv->staRecord_lock, lock_flags);
                 interfacePriv->noOfbroadcastPktQueued--;
-                spin_unlock_irqrestore(&priv->staRecord_lock,lock_flags);
+                spin_unlock_irqrestore(&priv->staRecord_lock, lock_flags);
                 kfree(buffered_pkt);
             }
         }
     }
     if((interfacePriv->dtimActive == FALSE)) {
         /* Record the host Tag*/
-        unifi_trace(priv,UDBG2,"send_multicast_frames: Recorded hostTag of EOSP packet: = 0x%x\n",hostTag);
+        unifi_trace(priv, UDBG2, "send_multicast_frames: Recorded hostTag of EOSP packet: = 0x%x\n", hostTag);
         interfacePriv->multicastPduHostTag = hostTag;
     }
     return pduSent;
 }
 #endif
-void uf_process_ma_vif_availibility_ind(unifi_priv_t *priv,u8 *sigdata,
+void uf_process_ma_vif_availibility_ind(unifi_priv_t *priv, u8 *sigdata,
                                         u32 siglen)
 {
 #ifdef CSR_SUPPORT_SME
@@ -2148,15 +2148,15 @@
             /* This condition can occur because of a potential race where the
                TIM is not yet reset as host is waiting for confirm but it is sent
                by firmware and DTIM occurs*/
-            unifi_notice(priv,"ma_vif_availibility_ind recevied for multicast but queues are empty%d\n",interfaceTag);
-            send_vif_availibility_rsp(priv,ind->VirtualInterfaceIdentifier,CSR_RC_NO_BUFFERED_BROADCAST_MULTICAST_FRAMES);
+            unifi_notice(priv, "ma_vif_availibility_ind recevied for multicast but queues are empty%d\n", interfaceTag);
+            send_vif_availibility_rsp(priv, ind->VirtualInterfaceIdentifier, CSR_RC_NO_BUFFERED_BROADCAST_MULTICAST_FRAMES);
             interfacePriv->dtimActive = FALSE;
             if(interfacePriv->multicastPduHostTag == 0xffffffff) {
-                unifi_notice(priv,"ma_vif_availibility_ind recevied for multicast but queues are empty%d\n",interfaceTag);
+                unifi_notice(priv, "ma_vif_availibility_ind recevied for multicast but queues are empty%d\n", interfaceTag);
                 /* This may be an extra request in very rare race conditions but it is fine as it would atleast remove the potential lock up */
                 if (!interfacePriv->bcTimSetReqPendingFlag)
                 {
-                    update_tim(priv,0,CSR_WIFI_TIM_RESET,interfaceTag, 0xFFFFFFFF);
+                    update_tim(priv, 0, CSR_WIFI_TIM_RESET, interfaceTag, 0xFFFFFFFF);
                 }
                 else
                 {
@@ -2171,23 +2171,23 @@
             return;
         }
         if(interfacePriv->dtimActive) {
-            unifi_trace(priv,UDBG2,"DTIM Occurred for already active DTIM interface %d\n",interfaceTag);
+            unifi_trace(priv, UDBG2, "DTIM Occurred for already active DTIM interface %d\n", interfaceTag);
             return;
         } else {
-            unifi_trace(priv,UDBG2,"DTIM Occurred for interface %d\n",interfaceTag);
+            unifi_trace(priv, UDBG2, "DTIM Occurred for interface %d\n", interfaceTag);
             if(list_empty(&interfacePriv->genericMulticastOrBroadCastFrames)) {
-                set_eosp_transmit_ctrl(priv,&interfacePriv->genericMulticastOrBroadCastMgtFrames);
+                set_eosp_transmit_ctrl(priv, &interfacePriv->genericMulticastOrBroadCastMgtFrames);
             } else {
-                set_eosp_transmit_ctrl(priv,&interfacePriv->genericMulticastOrBroadCastFrames);
+                set_eosp_transmit_ctrl(priv, &interfacePriv->genericMulticastOrBroadCastFrames);
             }
         }
         interfacePriv->dtimActive = TRUE;
-        pduSent = send_multicast_frames(priv,interfaceTag);
+        pduSent = send_multicast_frames(priv, interfaceTag);
     }
     else {
-        unifi_error(priv,"Interface switching is not supported %d\n",interfaceTag);
+        unifi_error(priv, "Interface switching is not supported %d\n", interfaceTag);
         resultCode = CSR_RC_NOT_SUPPORTED;
-        send_vif_availibility_rsp(priv,ind->VirtualInterfaceIdentifier,CSR_RC_NOT_SUPPORTED);
+        send_vif_availibility_rsp(priv, ind->VirtualInterfaceIdentifier, CSR_RC_NOT_SUPPORTED);
     }
 #endif
 }
@@ -2204,12 +2204,12 @@
         if(((staRecord->powersaveMode[i]==CSR_WIFI_AC_DELIVERY_ONLY_ENABLE)
              ||(staRecord->powersaveMode[i]==CSR_WIFI_AC_TRIGGER_AND_DELIVERY_ENABLED))
              &&(!list_empty(&staRecord->dataPdu[i]))) {
-            unifi_trace(priv,UDBG2,"uf_is_more_data_for_delivery_ac: Data Available AC = %d\n", i);
+            unifi_trace(priv, UDBG2, "uf_is_more_data_for_delivery_ac: Data Available AC = %d\n", i);
             return TRUE;
         }
     }
 
-    unifi_trace(priv,UDBG2,"uf_is_more_data_for_delivery_ac: Data NOT Available \n");
+    unifi_trace(priv, UDBG2, "uf_is_more_data_for_delivery_ac: Data NOT Available \n");
     return FALSE;
 }
 
@@ -2222,12 +2222,12 @@
         if(((staRecord->powersaveMode[i]==CSR_WIFI_AC_DELIVERY_ONLY_ENABLE)
              ||(staRecord->powersaveMode[i]==CSR_WIFI_AC_TRIGGER_AND_DELIVERY_ENABLED))
              &&(!list_empty(&staRecord->dataPdu[i]))) {
-            unifi_trace(priv,UDBG2,"uf_is_more_data_for_usp_delivery: Data Available AC = %d\n", i);
+            unifi_trace(priv, UDBG2, "uf_is_more_data_for_usp_delivery: Data Available AC = %d\n", i);
             return TRUE;
         }
     }
 
-    unifi_trace(priv,UDBG2,"uf_is_more_data_for_usp_delivery: Data NOT Available \n");
+    unifi_trace(priv, UDBG2, "uf_is_more_data_for_usp_delivery: Data NOT Available \n");
     return FALSE;
 }
 
@@ -2272,18 +2272,18 @@
         return;
     }
     while((buffered_pkt=dequeue_tx_data_pdu(priv, txList))) {
-        if((IS_DTIM_ACTIVE(interfacePriv->dtimActive,interfacePriv->multicastPduHostTag))) {
+        if((IS_DTIM_ACTIVE(interfacePriv->dtimActive, interfacePriv->multicastPduHostTag))) {
             unifi_trace(priv, UDBG2, "uf_send_buffered_data_from_delivery_ac: DTIM Active, suspend UAPSD, staId: 0x%x\n",
                         staInfo->aid);
 
             /* Once resume called, the U-APSD delivery operation will resume */
-            spin_lock_irqsave(&priv->staRecord_lock,lock_flags);
+            spin_lock_irqsave(&priv->staRecord_lock, lock_flags);
             staInfo->uspSuspend = TRUE;
-            spin_unlock_irqrestore(&priv->staRecord_lock,lock_flags);
+            spin_unlock_irqrestore(&priv->staRecord_lock, lock_flags);
             /* re-queueing the packet as DTIM started */
-            spin_lock_irqsave(&priv->tx_q_lock,lock_flags);
-            list_add(&buffered_pkt->q,txList);
-            spin_unlock_irqrestore(&priv->tx_q_lock,lock_flags);
+            spin_lock_irqsave(&priv->tx_q_lock, lock_flags);
+            list_add(&buffered_pkt->q, txList);
+            spin_unlock_irqrestore(&priv->tx_q_lock, lock_flags);
             break;
         }
 
@@ -2315,20 +2315,20 @@
             unifi_warning(priv, "uf_send_buffered_data_from_delivery_ac: non U-APSD !!! \n");
         }
 
-        unifi_trace(priv,UDBG2,"uf_send_buffered_data_from_delivery_ac : MoreData:%d, EOSP:%d\n",moreData,eosp);
+        unifi_trace(priv, UDBG2, "uf_send_buffered_data_from_delivery_ac : MoreData:%d, EOSP:%d\n", moreData, eosp);
 
-        if((r=frame_and_send_queued_pdu(priv,buffered_pkt,staInfo,moreData,eosp)) == -ENOSPC) {
+        if((r=frame_and_send_queued_pdu(priv, buffered_pkt, staInfo, moreData, eosp)) == -ENOSPC) {
 
             unifi_trace(priv, UDBG2, "uf_send_buffered_data_from_delivery_ac: UASPD suspended, ENOSPC in hipQ=%x\n", queue);
 
             /* Once resume called, the U-APSD delivery operation will resume */
-            spin_lock_irqsave(&priv->staRecord_lock,lock_flags);
+            spin_lock_irqsave(&priv->staRecord_lock, lock_flags);
             staInfo->uspSuspend = TRUE;
-            spin_unlock_irqrestore(&priv->staRecord_lock,lock_flags);
+            spin_unlock_irqrestore(&priv->staRecord_lock, lock_flags);
 
-            spin_lock_irqsave(&priv->tx_q_lock,lock_flags);
-            list_add(&buffered_pkt->q,txList);
-            spin_unlock_irqrestore(&priv->tx_q_lock,lock_flags);
+            spin_lock_irqsave(&priv->tx_q_lock, lock_flags);
+            list_add(&buffered_pkt->q, txList);
+            spin_unlock_irqrestore(&priv->tx_q_lock, lock_flags);
             priv->pausedStaHandle[queue]=(u8)(staInfo->assignedHandle);
             break;
         } else {
@@ -2337,17 +2337,17 @@
                 unifi_net_data_free(priv, &buffered_pkt->bulkdata);
             }
             kfree(buffered_pkt);
-            spin_lock_irqsave(&priv->staRecord_lock,lock_flags);
+            spin_lock_irqsave(&priv->staRecord_lock, lock_flags);
             staInfo->noOfSpFramesSent++;
             if((!moreData) || (staInfo->noOfSpFramesSent == staInfo->maxSpLength)) {
                 unifi_trace(priv, UDBG2, "uf_send_buffered_data_from_delivery_ac: Terminating USP\n");
                 staInfo->uapsdActive = FALSE;
                 staInfo->uspSuspend = FALSE;
                 staInfo->noOfSpFramesSent = 0;
-                spin_unlock_irqrestore(&priv->staRecord_lock,lock_flags);
+                spin_unlock_irqrestore(&priv->staRecord_lock, lock_flags);
                 break;
             }
-            spin_unlock_irqrestore(&priv->staRecord_lock,lock_flags);
+            spin_unlock_irqrestore(&priv->staRecord_lock, lock_flags);
         }
     }
     unifi_trace(priv, UDBG2, "--uf_send_buffered_data_from_delivery_ac, active=%x\n", staInfo->uapsdActive);
@@ -2364,25 +2364,25 @@
     u8 moreData = FALSE;
     s8 r =0;
 
-    unifi_trace(priv,UDBG2,"uf_send_buffered_data_from_ac :\n");
+    unifi_trace(priv, UDBG2, "uf_send_buffered_data_from_ac :\n");
 
-    while(!isRouterBufferEnabled(priv,queue) &&
+    while(!isRouterBufferEnabled(priv, queue) &&
                     ((buffered_pkt=dequeue_tx_data_pdu(priv, txList))!=NULL)){
 
         buffered_pkt->transmissionControl &=
                  ~(TRANSMISSION_CONTROL_TRIGGER_MASK|TRANSMISSION_CONTROL_EOSP_MASK);
 
-        unifi_trace(priv,UDBG3,"uf_send_buffered_data_from_ac : MoreData:%d, EOSP:%d\n",moreData,eosp);
+        unifi_trace(priv, UDBG3, "uf_send_buffered_data_from_ac : MoreData:%d, EOSP:%d\n", moreData, eosp);
 
-        if((r=frame_and_send_queued_pdu(priv,buffered_pkt,staInfo,moreData,eosp)) == -ENOSPC) {
+        if((r=frame_and_send_queued_pdu(priv, buffered_pkt, staInfo, moreData, eosp)) == -ENOSPC) {
            /* Enqueue at the head of the queue */
-           spin_lock_irqsave(&priv->tx_q_lock,lock_flags);
-           list_add(&buffered_pkt->q,txList);
-           spin_unlock_irqrestore(&priv->tx_q_lock,lock_flags);
+           spin_lock_irqsave(&priv->tx_q_lock, lock_flags);
+           list_add(&buffered_pkt->q, txList);
+           spin_unlock_irqrestore(&priv->tx_q_lock, lock_flags);
            if(staInfo != NULL){
               priv->pausedStaHandle[queue]=(u8)(staInfo->assignedHandle);
            }
-           unifi_trace(priv,UDBG3," uf_send_buffered_data_from_ac: PDU sending failed .. no space for queue %d \n",queue);
+           unifi_trace(priv, UDBG3, " uf_send_buffered_data_from_ac: PDU sending failed .. no space for queue %d \n", queue);
            } else {
             if(r){
                 /* the PDU failed where we can't do any thing so free the storage */
@@ -2394,10 +2394,10 @@
 
 }
 
-void uf_send_buffered_frames(unifi_priv_t *priv,unifi_TrafficQueue q)
+void uf_send_buffered_frames(unifi_priv_t *priv, unifi_TrafficQueue q)
 {
     u16 interfaceTag = GET_ACTIVE_INTERFACE_TAG(priv);
-    u32 startIndex=0,endIndex=0;
+    u32 startIndex=0, endIndex=0;
     CsrWifiRouterCtrlStaInfo_t * staInfo = NULL;
     u8 queue;
     u8 moreData = FALSE;
@@ -2412,14 +2412,14 @@
 
     if(interfacePriv->dtimActive) {
         /* this function updates dtimActive*/
-        send_multicast_frames(priv,interfaceTag);
+        send_multicast_frames(priv, interfaceTag);
         if(!interfacePriv->dtimActive) {
             moreData = (!list_empty(&interfacePriv->genericMulticastOrBroadCastMgtFrames) ||
              !list_empty(&interfacePriv->genericMulticastOrBroadCastFrames));
             if(!moreData) {
                 if (!interfacePriv->bcTimSetReqPendingFlag)
                 {
-                    update_tim(priv,0,CSR_WIFI_TIM_RESET,interfaceTag, 0XFFFFFFFF);
+                    update_tim(priv, 0, CSR_WIFI_TIM_RESET, interfaceTag, 0XFFFFFFFF);
                 }
                 else
                 {
@@ -2436,8 +2436,8 @@
                         !list_empty(&interfacePriv->genericMulticastOrBroadCastFrames));
            if(!moreData) {
                /* This should never happen but if it happens, we need a way out */
-               unifi_error(priv,"ERROR: No More Data but DTIM is active sending Response\n");
-               send_vif_availibility_rsp(priv,uf_get_vif_identifier(interfacePriv->interfaceMode,interfaceTag),CSR_RC_NO_BUFFERED_BROADCAST_MULTICAST_FRAMES);
+               unifi_error(priv, "ERROR: No More Data but DTIM is active sending Response\n");
+               send_vif_availibility_rsp(priv, uf_get_vif_identifier(interfacePriv->interfaceMode, interfaceTag), CSR_RC_NO_BUFFERED_BROADCAST_MULTICAST_FRAMES);
                interfacePriv->dtimActive = FALSE;
            }
         }
@@ -2450,9 +2450,9 @@
     if(queue == UNIFI_TRAFFIC_Q_VO) {
 
 
-        unifi_trace(priv,UDBG2,"uf_send_buffered_frames : trying mgt from queue=%d\n",queue);
+        unifi_trace(priv, UDBG2, "uf_send_buffered_frames : trying mgt from queue=%d\n", queue);
         for(startIndex= 0; startIndex < UNIFI_MAX_CONNECTIONS;startIndex++) {
-            staInfo =  CsrWifiRouterCtrlGetStationRecordFromHandle(priv,startIndex,interfaceTag);
+            staInfo =  CsrWifiRouterCtrlGetStationRecordFromHandle(priv, startIndex, interfaceTag);
             if(!staInfo ) {
                 continue;
             } else if((staInfo->currentPeerState == CSR_WIFI_ROUTER_CTRL_PEER_CONNECTED_POWER_SAVE)
@@ -2464,31 +2464,31 @@
                                &&(staInfo->uapsdActive == FALSE)){
                             /*Non-UAPSD case push the management frames out*/
                if(!list_empty(&staInfo->mgtFrames)){
-                    uf_send_buffered_data_from_ac(priv,staInfo, UNIFI_TRAFFIC_Q_VO, &staInfo->mgtFrames);
+                    uf_send_buffered_data_from_ac(priv, staInfo, UNIFI_TRAFFIC_Q_VO, &staInfo->mgtFrames);
                 }
             }
 
-            if(isRouterBufferEnabled(priv,queue)) {
-                unifi_notice(priv,"uf_send_buffered_frames : No space Left for queue = %d\n",queue);
+            if(isRouterBufferEnabled(priv, queue)) {
+                unifi_notice(priv, "uf_send_buffered_frames : No space Left for queue = %d\n", queue);
                 break;
             }
         }
         /*push generic management frames out*/
         if(!list_empty(&interfacePriv->genericMgtFrames)) {
-            unifi_trace(priv,UDBG2,"uf_send_buffered_frames : trying generic mgt from queue=%d\n",queue);
-            uf_send_buffered_data_from_ac(priv,staInfo, UNIFI_TRAFFIC_Q_VO, &interfacePriv->genericMgtFrames);
+            unifi_trace(priv, UDBG2, "uf_send_buffered_frames : trying generic mgt from queue=%d\n", queue);
+            uf_send_buffered_data_from_ac(priv, staInfo, UNIFI_TRAFFIC_Q_VO, &interfacePriv->genericMgtFrames);
         }
     }
 
 
-    unifi_trace(priv,UDBG2,"uf_send_buffered_frames : Resume called for Queue=%d\n",queue);
-    unifi_trace(priv,UDBG2,"uf_send_buffered_frames : start=%d end=%d\n",startIndex,endIndex);
+    unifi_trace(priv, UDBG2, "uf_send_buffered_frames : Resume called for Queue=%d\n", queue);
+    unifi_trace(priv, UDBG2, "uf_send_buffered_frames : start=%d end=%d\n", startIndex, endIndex);
 
     startIndex = priv->pausedStaHandle[queue];
     endIndex = (startIndex + UNIFI_MAX_CONNECTIONS -1) % UNIFI_MAX_CONNECTIONS;
 
     while(startIndex != endIndex) {
-        staInfo =  CsrWifiRouterCtrlGetStationRecordFromHandle(priv,startIndex,interfaceTag);
+        staInfo =  CsrWifiRouterCtrlGetStationRecordFromHandle(priv, startIndex, interfaceTag);
         if(!staInfo) {
             startIndex ++;
             if(startIndex >= UNIFI_MAX_CONNECTIONS) {
@@ -2504,7 +2504,7 @@
             continue;
         }
         /* Peer is active or U-APSD is active so send PDUs to the peer */
-        unifi_trace(priv,UDBG2,"uf_send_buffered_frames : trying data from queue=%d\n",queue);
+        unifi_trace(priv, UDBG2, "uf_send_buffered_frames : trying data from queue=%d\n", queue);
 
 
         if((staInfo != NULL)&&(staInfo->currentPeerState == CSR_WIFI_ROUTER_CTRL_PEER_CONNECTED_ACTIVE)
@@ -2520,7 +2520,7 @@
            startIndex = 0;
         }
     }
-    if(isRouterBufferEnabled(priv,queue)) {
+    if(isRouterBufferEnabled(priv, queue)) {
         priv->pausedStaHandle[queue] = endIndex;
     } else {
         priv->pausedStaHandle[queue] = 0;
@@ -2561,7 +2561,7 @@
 }
 
 
-int uf_process_station_records_for_sending_data(unifi_priv_t *priv,u16 interfaceTag,
+int uf_process_station_records_for_sending_data(unifi_priv_t *priv, u16 interfaceTag,
                                                  CsrWifiRouterCtrlStaInfo_t *srcStaInfo,
                                                  CsrWifiRouterCtrlStaInfo_t *dstStaInfo)
 {
@@ -2647,10 +2647,10 @@
         return;
     }
 
-    spin_lock_irqsave(&priv->staRecord_lock,lock_flags);
+    spin_lock_irqsave(&priv->staRecord_lock, lock_flags);
     staInfo->uapsdActive = TRUE;
     staInfo->uspSuspend = FALSE;
-    spin_unlock_irqrestore(&priv->staRecord_lock,lock_flags);
+    spin_unlock_irqrestore(&priv->staRecord_lock, lock_flags);
 
     if(((staInfo->powersaveMode[UNIFI_TRAFFIC_Q_VO]==CSR_WIFI_AC_TRIGGER_AND_DELIVERY_ENABLED)||
         (staInfo->powersaveMode[UNIFI_TRAFFIC_Q_VO]==CSR_WIFI_AC_DELIVERY_ONLY_ENABLE))
@@ -2666,9 +2666,9 @@
          * NOTE: If we have sent Mgt frame also, we must send QNULL followed to terminate USP
          */
         if (!staInfo->uspSuspend) {
-            spin_lock_irqsave(&priv->staRecord_lock,lock_flags);
+            spin_lock_irqsave(&priv->staRecord_lock, lock_flags);
             staInfo->uapsdActive = FALSE;
-            spin_unlock_irqrestore(&priv->staRecord_lock,lock_flags);
+            spin_unlock_irqrestore(&priv->staRecord_lock, lock_flags);
 
             unifi_trace(priv, UDBG2, "uf_handle_uspframes_delivery: sending QNull for trigger\n");
             uf_send_qos_null(priv, interfaceTag, staInfo->peerMacAddress.a, (CSR_PRIORITY) staInfo->triggerFramePriority, staInfo);
@@ -2687,12 +2687,12 @@
             }
 
             if ((!staInfo->uapsdActive) ||
-                    (staInfo->uspSuspend && IS_DTIM_ACTIVE(interfacePriv->dtimActive,interfacePriv->multicastPduHostTag))) {
+                    (staInfo->uspSuspend && IS_DTIM_ACTIVE(interfacePriv->dtimActive, interfacePriv->multicastPduHostTag))) {
                 /* If DTIM active found on one AC, No need to parse the remaining AC's
                  * as USP suspended. Break out of loop
                  */
                 unifi_trace(priv, UDBG2, "uf_handle_uspframes_delivery: suspend=%x,  DTIM=%x, USP terminated=%s\n",
-                           staInfo->uspSuspend, IS_DTIM_ACTIVE(interfacePriv->dtimActive,interfacePriv->multicastPduHostTag),
+                           staInfo->uspSuspend, IS_DTIM_ACTIVE(interfacePriv->dtimActive, interfacePriv->multicastPduHostTag),
                            staInfo->uapsdActive?"NO":"YES");
                 break;
             }
@@ -2704,7 +2704,7 @@
      */
     is_all_ac_deliver_enabled_and_moredata(staInfo, &allDeliveryEnabled, &dataAvailable);
     if ((allDeliveryEnabled && !dataAvailable)) {
-        if ((staInfo->timSet != CSR_WIFI_TIM_RESET) || (staInfo->timSet != CSR_WIFI_TIM_RESETTING)) {
+        if ((staInfo->timSet != CSR_WIFI_TIM_RESET) && (staInfo->timSet != CSR_WIFI_TIM_RESETTING)) {
             staInfo->updateTimReqQueued = (u8) CSR_WIFI_TIM_RESET;
             unifi_trace(priv, UDBG4, " --uf_handle_uspframes_delivery, UAPSD timset\n");
             if (!staInfo->timRequestPendingFlag) {
@@ -2734,9 +2734,9 @@
 
     if((srcStaInfo->powersaveMode[priority_q]==CSR_WIFI_AC_TRIGGER_ONLY_ENABLED)
         ||(srcStaInfo->powersaveMode[priority_q]==CSR_WIFI_AC_TRIGGER_AND_DELIVERY_ENABLED)) {
-        spin_lock_irqsave(&priv->staRecord_lock,lock_flags);
+        spin_lock_irqsave(&priv->staRecord_lock, lock_flags);
         srcStaInfo->triggerFramePriority = priority;
-        spin_unlock_irqrestore(&priv->staRecord_lock,lock_flags);
+        spin_unlock_irqrestore(&priv->staRecord_lock, lock_flags);
         unifi_trace(priv, UDBG2, "uf_process_wmm_deliver_ac_uapsd: trigger frame, Begin U-APSD, triggerQ=%x\n", priority_q);
         uf_handle_uspframes_delivery(priv, srcStaInfo, interfaceTag);
     }
@@ -2744,7 +2744,7 @@
 }
 
 
-void uf_send_qos_null(unifi_priv_t * priv,u16 interfaceTag, const u8 *da,CSR_PRIORITY priority,CsrWifiRouterCtrlStaInfo_t * srcStaInfo)
+void uf_send_qos_null(unifi_priv_t * priv, u16 interfaceTag, const u8 *da, CSR_PRIORITY priority, CsrWifiRouterCtrlStaInfo_t * srcStaInfo)
 {
     bulk_data_param_t bulkdata;
     CsrResult csrResult;
@@ -2806,14 +2806,14 @@
 
     r = ul_send_signal_unpacked(priv, &signal, &bulkdata);
     if(r) {
-        unifi_error(priv, "failed to send QOS data null packet result: %d\n",r);
+        unifi_error(priv, "failed to send QOS data null packet result: %d\n", r);
         unifi_net_data_free(priv, &bulkdata.d[0]);
     }
 
     return;
 
 }
-void uf_send_nulldata(unifi_priv_t * priv,u16 interfaceTag, const u8 *da,CSR_PRIORITY priority,CsrWifiRouterCtrlStaInfo_t * srcStaInfo)
+void uf_send_nulldata(unifi_priv_t * priv, u16 interfaceTag, const u8 *da, CSR_PRIORITY priority, CsrWifiRouterCtrlStaInfo_t * srcStaInfo)
 {
     bulk_data_param_t bulkdata;
     CsrResult csrResult;
@@ -2882,14 +2882,14 @@
     if(r == -ENOSPC) {
         unifi_trace(priv, UDBG1, "uf_send_nulldata: ENOSPC Requeue the Null frame\n");
         enque_tx_data_pdu(priv, &bulkdata, &srcStaInfo->dataPdu[priority_q], &signal, 1);
-        spin_lock_irqsave(&priv->staRecord_lock,lock_flags);
+        spin_lock_irqsave(&priv->staRecord_lock, lock_flags);
         srcStaInfo->noOfPktQueued++;
-        spin_unlock_irqrestore(&priv->staRecord_lock,lock_flags);
+        spin_unlock_irqrestore(&priv->staRecord_lock, lock_flags);
 
 
     }
     if(r && r != -ENOSPC){
-        unifi_error(priv, "uf_send_nulldata: Failed to send Null frame Error = %d\n",r);
+        unifi_error(priv, "uf_send_nulldata: Failed to send Null frame Error = %d\n", r);
         unifi_net_data_free(priv, &bulkdata.d[0]);
         srcStaInfo->nullDataHostTag = INVALID_HOST_TAG;
     }
@@ -2939,7 +2939,7 @@
 
 
 u8 uf_process_pm_bit_for_peer(unifi_priv_t * priv, CsrWifiRouterCtrlStaInfo_t * srcStaInfo,
-                                u8 pmBit,u16 interfaceTag)
+                                u8 pmBit, u16 interfaceTag)
 {
     u8 moreData = FALSE;
     u8 powerSaveChanged = FALSE;
@@ -2955,22 +2955,22 @@
         if(srcStaInfo->currentPeerState == CSR_WIFI_ROUTER_CTRL_PEER_CONNECTED_ACTIVE) {
 
             /* disable the preemption */
-            spin_lock_irqsave(&priv->staRecord_lock,lock_flags);
+            spin_lock_irqsave(&priv->staRecord_lock, lock_flags);
             srcStaInfo->currentPeerState =CSR_WIFI_ROUTER_CTRL_PEER_CONNECTED_POWER_SAVE;
             powerSaveChanged = TRUE;
             /* enable the preemption */
-            spin_unlock_irqrestore(&priv->staRecord_lock,lock_flags);
+            spin_unlock_irqrestore(&priv->staRecord_lock, lock_flags);
         } else {
             return powerSaveChanged;
         }
     } else {
         if(srcStaInfo->currentPeerState == CSR_WIFI_ROUTER_CTRL_PEER_CONNECTED_POWER_SAVE) {
             /* disable the preemption */
-            spin_lock_irqsave(&priv->staRecord_lock,lock_flags);
+            spin_lock_irqsave(&priv->staRecord_lock, lock_flags);
             srcStaInfo->currentPeerState = CSR_WIFI_ROUTER_CTRL_PEER_CONNECTED_ACTIVE;
             powerSaveChanged = TRUE;
             /* enable the preemption */
-            spin_unlock_irqrestore(&priv->staRecord_lock,lock_flags);
+            spin_unlock_irqrestore(&priv->staRecord_lock, lock_flags);
         }else {
             return powerSaveChanged;
         }
@@ -2978,10 +2978,10 @@
 
 
     if(srcStaInfo->currentPeerState == CSR_WIFI_ROUTER_CTRL_PEER_CONNECTED_ACTIVE) {
-        unifi_trace(priv,UDBG3, "Peer with AID = %d is active now\n",srcStaInfo->aid);
-        process_peer_active_transition(priv,srcStaInfo,interfaceTag);
+        unifi_trace(priv, UDBG3, "Peer with AID = %d is active now\n", srcStaInfo->aid);
+        process_peer_active_transition(priv, srcStaInfo, interfaceTag);
     } else {
-        unifi_trace(priv,UDBG3, "Peer with AID = %d is in PS Now\n",srcStaInfo->aid);
+        unifi_trace(priv, UDBG3, "Peer with AID = %d is in PS Now\n", srcStaInfo->aid);
         /* Set TIM if needed */
         if(!srcStaInfo->wmmOrQosEnabled) {
             moreData = (!list_empty(&srcStaInfo->mgtFrames) ||
@@ -2990,7 +2990,7 @@
             if(moreData && (srcStaInfo->timSet == CSR_WIFI_TIM_RESET)) {
                 unifi_trace(priv, UDBG3, "This condition should not occur\n");
                 if (!srcStaInfo->timRequestPendingFlag){
-                    update_tim(priv,srcStaInfo->aid,1,interfaceTag, srcStaInfo->assignedHandle);
+                    update_tim(priv, srcStaInfo->aid, 1, interfaceTag, srcStaInfo->assignedHandle);
                 }
                 else
                 {
@@ -3013,7 +3013,7 @@
 
             if(moreData && (srcStaInfo->timSet == CSR_WIFI_TIM_RESET)) {
                 if (!srcStaInfo->timRequestPendingFlag){
-                    update_tim(priv,srcStaInfo->aid,1,interfaceTag, srcStaInfo->assignedHandle);
+                    update_tim(priv, srcStaInfo->aid, 1, interfaceTag, srcStaInfo->assignedHandle);
                 }
                 else
                 {
@@ -3033,7 +3033,7 @@
 
 
 
-void uf_process_ps_poll(unifi_priv_t *priv,u8* sa,u8* da,u8 pmBit,u16 interfaceTag)
+void uf_process_ps_poll(unifi_priv_t *priv, u8* sa, u8* da, u8 pmBit, u16 interfaceTag)
 {
     CsrWifiRouterCtrlStaInfo_t *staRecord =
     CsrWifiRouterCtrlGetStationRecordFromPeerMacAddress(priv, sa, interfaceTag);
@@ -3046,27 +3046,27 @@
 
     unifi_trace(priv, UDBG3, "entering uf_process_ps_poll\n");
     if(!staRecord) {
-        memcpy(peerMacAddress.a,sa,ETH_ALEN);
+        memcpy(peerMacAddress.a, sa, ETH_ALEN);
         unifi_trace(priv, UDBG3, "In uf_process_ps_poll, sta record not found:unexpected frame addr = %x:%x:%x:%x:%x:%x\n",
-                sa[0], sa[1],sa[2], sa[3], sa[4],sa[5]);
-        CsrWifiRouterCtrlUnexpectedFrameIndSend(priv->CSR_WIFI_SME_IFACEQUEUE,0,interfaceTag,peerMacAddress);
+                sa[0], sa[1], sa[2], sa[3], sa[4], sa[5]);
+        CsrWifiRouterCtrlUnexpectedFrameIndSend(priv->CSR_WIFI_SME_IFACEQUEUE, 0, interfaceTag, peerMacAddress);
         return;
     }
 
-    uf_process_pm_bit_for_peer(priv,staRecord,pmBit,interfaceTag);
+    uf_process_pm_bit_for_peer(priv, staRecord, pmBit, interfaceTag);
 
     /* Update station last activity time */
     staRecord->activity_flag = TRUE;
 
     /* This should not change the PM bit as PS-POLL has PM bit always set */
     if(!pmBit) {
-        unifi_notice (priv," PM bit reset in PS-POLL\n");
+        unifi_notice (priv, " PM bit reset in PS-POLL\n");
         return;
     }
 
-    if(IS_DTIM_ACTIVE(interfacePriv->dtimActive,interfacePriv->multicastPduHostTag)) {
+    if(IS_DTIM_ACTIVE(interfacePriv->dtimActive, interfacePriv->multicastPduHostTag)) {
         /* giving more priority to multicast packets so dropping ps-poll*/
-        unifi_notice (priv," multicast transmission is going on so don't take action on PS-POLL\n");
+        unifi_notice (priv, " multicast transmission is going on so don't take action on PS-POLL\n");
         return;
     }
 
@@ -3078,13 +3078,13 @@
                         !list_empty(&staRecord->mgtFrames));
 
             buffered_pkt->transmissionControl |= (TRANSMISSION_CONTROL_TRIGGER_MASK | TRANSMISSION_CONTROL_EOSP_MASK);
-            if((r=frame_and_send_queued_pdu(priv,buffered_pkt,staRecord,moreData,FALSE)) == -ENOSPC) {
+            if((r=frame_and_send_queued_pdu(priv, buffered_pkt, staRecord, moreData, FALSE)) == -ENOSPC) {
                 /* Clear the trigger bit transmission control*/
                 buffered_pkt->transmissionControl &= ~(TRANSMISSION_CONTROL_TRIGGER_MASK | TRANSMISSION_CONTROL_EOSP_MASK);
                 /* Enqueue at the head of the queue */
-                spin_lock_irqsave(&priv->tx_q_lock,lock_flags);
+                spin_lock_irqsave(&priv->tx_q_lock, lock_flags);
                 list_add(&buffered_pkt->q, &staRecord->mgtFrames);
-                spin_unlock_irqrestore(&priv->tx_q_lock,lock_flags);
+                spin_unlock_irqrestore(&priv->tx_q_lock, lock_flags);
                 unifi_trace(priv, UDBG1, "(ENOSPC) PS-POLL received : PDU sending failed \n");
                 priv->pausedStaHandle[3]=(u8)(staRecord->assignedHandle);
             } else {
@@ -3101,13 +3101,13 @@
                         !list_empty(&staRecord->dataPdu[UNIFI_TRAFFIC_Q_VO]));
 
             buffered_pkt->transmissionControl |= (TRANSMISSION_CONTROL_TRIGGER_MASK | TRANSMISSION_CONTROL_EOSP_MASK);
-            if((r=frame_and_send_queued_pdu(priv,buffered_pkt,staRecord,moreData,FALSE)) == -ENOSPC) {
+            if((r=frame_and_send_queued_pdu(priv, buffered_pkt, staRecord, moreData, FALSE)) == -ENOSPC) {
                 /* Clear the trigger bit transmission control*/
                 buffered_pkt->transmissionControl &= ~(TRANSMISSION_CONTROL_TRIGGER_MASK | TRANSMISSION_CONTROL_EOSP_MASK);
                 /* Enqueue at the head of the queue */
-                spin_lock_irqsave(&priv->tx_q_lock,lock_flags);
+                spin_lock_irqsave(&priv->tx_q_lock, lock_flags);
                 list_add(&buffered_pkt->q, &staRecord->dataPdu[UNIFI_TRAFFIC_Q_VO]);
-                spin_unlock_irqrestore(&priv->tx_q_lock,lock_flags);
+                spin_unlock_irqrestore(&priv->tx_q_lock, lock_flags);
                 priv->pausedStaHandle[3]=(u8)(staRecord->assignedHandle);
                 unifi_trace(priv, UDBG1, "(ENOSPC) PS-POLL received : PDU sending failed \n");
             } else {
@@ -3123,13 +3123,13 @@
             moreData = !list_empty(&staRecord->dataPdu[UNIFI_TRAFFIC_Q_CONTENTION]);
 
             buffered_pkt->transmissionControl |= (TRANSMISSION_CONTROL_TRIGGER_MASK | TRANSMISSION_CONTROL_EOSP_MASK);
-            if((r=frame_and_send_queued_pdu(priv,buffered_pkt,staRecord,moreData,FALSE)) == -ENOSPC) {
+            if((r=frame_and_send_queued_pdu(priv, buffered_pkt, staRecord, moreData, FALSE)) == -ENOSPC) {
                 /* Clear the trigger bit transmission control*/
                 buffered_pkt->transmissionControl &= ~(TRANSMISSION_CONTROL_TRIGGER_MASK | TRANSMISSION_CONTROL_EOSP_MASK);
                 /* Enqueue at the head of the queue */
-                spin_lock_irqsave(&priv->tx_q_lock,lock_flags);
+                spin_lock_irqsave(&priv->tx_q_lock, lock_flags);
                 list_add(&buffered_pkt->q, &staRecord->dataPdu[UNIFI_TRAFFIC_Q_CONTENTION]);
-                spin_unlock_irqrestore(&priv->tx_q_lock,lock_flags);
+                spin_unlock_irqrestore(&priv->tx_q_lock, lock_flags);
                 priv->pausedStaHandle[0]=(u8)(staRecord->assignedHandle);
                 unifi_trace(priv, UDBG1, "(ENOSPC) PS-POLL received : PDU sending failed \n");
             } else {
@@ -3150,7 +3150,7 @@
         if(!moreData && (staRecord->timSet == CSR_WIFI_TIM_SET)) {
             unifi_trace(priv, UDBG3, "more data = NULL, set tim to 0 in uf_process_ps_poll\n");
             if (!staRecord->timRequestPendingFlag){
-                update_tim(priv,staRecord->aid,0,interfaceTag, staRecord->assignedHandle);
+                update_tim(priv, staRecord->aid, 0, interfaceTag, staRecord->assignedHandle);
             }
             else
             {
@@ -3165,7 +3165,7 @@
     } else {
 
         u8 allDeliveryEnabled = 0, dataAvailable = 0;
-        unifi_trace(priv, UDBG3,"Qos Support station.Processing PS-Poll\n");
+        unifi_trace(priv, UDBG3, "Qos Support station.Processing PS-Poll\n");
 
         /*Send Data From Management Frames*/
         /* Priority orders for delivering the buffered packets are
@@ -3179,7 +3179,7 @@
 
         if (allDeliveryEnabled) {
             unifi_trace(priv, UDBG3, "uf_process_ps_poll: All ACs are delivery enable so Sending QOS Null in response of Ps-poll\n");
-            uf_send_qos_null(priv,interfaceTag,sa,CSR_QOS_UP0,staRecord);
+            uf_send_qos_null(priv, interfaceTag, sa, CSR_QOS_UP0, staRecord);
             return;
         }
 
@@ -3192,13 +3192,13 @@
                     buffered_pkt->transmissionControl |= (TRANSMISSION_CONTROL_TRIGGER_MASK | TRANSMISSION_CONTROL_EOSP_MASK);
 
                     /* Last parameter is EOSP & its false always for PS-POLL processing */
-                    if((r=frame_and_send_queued_pdu(priv,buffered_pkt,staRecord,moreData,FALSE)) == -ENOSPC) {
+                    if((r=frame_and_send_queued_pdu(priv, buffered_pkt, staRecord, moreData, FALSE)) == -ENOSPC) {
                         /* Clear the trigger bit transmission control*/
                         buffered_pkt->transmissionControl &= ~(TRANSMISSION_CONTROL_TRIGGER_MASK | TRANSMISSION_CONTROL_EOSP_MASK);
                         /* Enqueue at the head of the queue */
-                        spin_lock_irqsave(&priv->tx_q_lock,lock_flags);
+                        spin_lock_irqsave(&priv->tx_q_lock, lock_flags);
                         list_add(&buffered_pkt->q, &staRecord->mgtFrames);
-                        spin_unlock_irqrestore(&priv->tx_q_lock,lock_flags);
+                        spin_unlock_irqrestore(&priv->tx_q_lock, lock_flags);
                         priv->pausedStaHandle[0]=(u8)(staRecord->assignedHandle);
                         unifi_trace(priv, UDBG1, "(ENOSPC) PS-POLL received : PDU sending failed \n");
                     } else {
@@ -3227,13 +3227,13 @@
                         buffered_pkt->transmissionControl |= (TRANSMISSION_CONTROL_TRIGGER_MASK | TRANSMISSION_CONTROL_EOSP_MASK);
 
                         /* Last parameter is EOSP & its false always for PS-POLL processing */
-                        if((r=frame_and_send_queued_pdu(priv,buffered_pkt,staRecord,moreData,FALSE)) == -ENOSPC) {
+                        if((r=frame_and_send_queued_pdu(priv, buffered_pkt, staRecord, moreData, FALSE)) == -ENOSPC) {
                             /* Clear the trigger bit transmission control*/
                             buffered_pkt->transmissionControl &= ~(TRANSMISSION_CONTROL_TRIGGER_MASK | TRANSMISSION_CONTROL_EOSP_MASK);
                             /* Enqueue at the head of the queue */
-                            spin_lock_irqsave(&priv->tx_q_lock,lock_flags);
+                            spin_lock_irqsave(&priv->tx_q_lock, lock_flags);
                             list_add(&buffered_pkt->q, &staRecord->dataPdu[i]);
-                            spin_unlock_irqrestore(&priv->tx_q_lock,lock_flags);
+                            spin_unlock_irqrestore(&priv->tx_q_lock, lock_flags);
                             priv->pausedStaHandle[0]=(u8)(staRecord->assignedHandle);
                             unifi_trace(priv, UDBG1, "(ENOSPC) PS-POLL received : PDU sending failed \n");
                         } else {
@@ -3256,7 +3256,7 @@
         if(!moreData && (staRecord->timSet == CSR_WIFI_TIM_SET)) {
             unifi_trace(priv, UDBG3, "more data = NULL, set tim to 0 in uf_process_ps_poll\n");
             if (!staRecord->timRequestPendingFlag){
-                update_tim(priv,staRecord->aid,0,interfaceTag, staRecord->assignedHandle);
+                update_tim(priv, staRecord->aid, 0, interfaceTag, staRecord->assignedHandle);
             }
             else
             {
@@ -3311,7 +3311,7 @@
     struct list_head *placeHolder;
     unsigned long lock_flags;
 
-    spin_lock_irqsave(&priv->tx_q_lock,lock_flags);
+    spin_lock_irqsave(&priv->tx_q_lock, lock_flags);
 
     /* Search through the list and if confirmation required for any frames,
     add it to the send_cfm list */
@@ -3337,7 +3337,7 @@
         }
     }
 
-    spin_unlock_irqrestore(&priv->tx_q_lock,lock_flags);
+    spin_unlock_irqrestore(&priv->tx_q_lock, lock_flags);
 
 }
 
@@ -3352,7 +3352,7 @@
 
     unifi_trace(priv, UDBG5, "entering the uf_flush_list \n");
 
-    spin_lock_irqsave(&priv->tx_q_lock,lock_flags);
+    spin_lock_irqsave(&priv->tx_q_lock, lock_flags);
     /* go through list, delete & free memory */
     list_for_each_safe(listHead, placeHolder, list) {
         tx_q_item = list_entry(listHead, tx_buffered_packets_t, q);
@@ -3378,7 +3378,7 @@
             priv->noOfPktQueuedInDriver--;
         }
     }
-    spin_unlock_irqrestore(&priv->tx_q_lock,lock_flags);
+    spin_unlock_irqrestore(&priv->tx_q_lock, lock_flags);
 }
 
 tx_buffered_packets_t *dequeue_tx_data_pdu(unifi_priv_t *priv, struct list_head *txList)
@@ -3403,13 +3403,13 @@
     }
 
     /* return first node after header, & delete from the list  && atleast one item exist */
-    spin_lock_irqsave(&priv->tx_q_lock,lock_flags);
+    spin_lock_irqsave(&priv->tx_q_lock, lock_flags);
     list_for_each_safe(listHead, placeHolder, txList) {
         tx_q_item = list_entry(listHead, tx_buffered_packets_t, q);
         list_del(listHead);
         break;
     }
-    spin_unlock_irqrestore(&priv->tx_q_lock,lock_flags);
+    spin_unlock_irqrestore(&priv->tx_q_lock, lock_flags);
 
     if (tx_q_item) {
         unifi_trace(priv, UDBG5,
@@ -3440,20 +3440,20 @@
     interfacePriv = priv->interfacePriv[interfaceTag];
 
     /* disable the preemption until station record is fetched */
-    spin_lock_irqsave(&priv->staRecord_lock,lock_flags);
+    spin_lock_irqsave(&priv->staRecord_lock, lock_flags);
 
     for (i = 0; i < UNIFI_MAX_CONNECTIONS; i++) {
         if (interfacePriv->staInfo[i]!= NULL) {
             if (!memcmp(((CsrWifiRouterCtrlStaInfo_t *) (interfacePriv->staInfo[i]))->peerMacAddress.a, peerMacAddress, ETH_ALEN)) {
                 /* enable the preemption as station record is fetched */
-                spin_unlock_irqrestore(&priv->staRecord_lock,lock_flags);
+                spin_unlock_irqrestore(&priv->staRecord_lock, lock_flags);
                 unifi_trace(priv, UDBG5, "peer entry found in station record\n");
                 return ((CsrWifiRouterCtrlStaInfo_t *) (interfacePriv->staInfo[i]));
             }
         }
     }
     /* enable the preemption as station record is fetched */
-    spin_unlock_irqrestore(&priv->staRecord_lock,lock_flags);
+    spin_unlock_irqrestore(&priv->staRecord_lock, lock_flags);
     unifi_trace(priv, UDBG5, "peer entry not found in station record\n");
     return NULL;
 }
@@ -3487,7 +3487,7 @@
         return;
     }
 
-    spin_lock_irqsave(&priv->staRecord_lock,lock_flags);
+    spin_lock_irqsave(&priv->staRecord_lock, lock_flags);
     /* Go through the list of stations to check for inactivity */
     for(i = 0; i < UNIFI_MAX_CONNECTIONS; i++) {
         staInfo =  CsrWifiRouterCtrlGetStationRecordFromHandle(priv, i, interfaceTag);
@@ -3502,7 +3502,7 @@
         elapsedTime = (currentTime >= staInfo->lastActivity)?
                 (currentTime - staInfo->lastActivity):
                 (~((u32)0) - staInfo->lastActivity + currentTime);
-        spin_unlock_irqrestore(&priv->staRecord_lock,lock_flags);
+        spin_unlock_irqrestore(&priv->staRecord_lock, lock_flags);
 
         if (elapsedTime > MAX_INACTIVITY_INTERVAL) {
             memcpy((u8*)&peerMacAddress, (u8*)&staInfo->peerMacAddress, sizeof(CsrWifiMacAddress));
@@ -3545,7 +3545,7 @@
         return;
     }
 
-    spin_lock_irqsave(&priv->staRecord_lock,lock_flags);
+    spin_lock_irqsave(&priv->staRecord_lock, lock_flags);
     /* Update activity */
     staInfo->lastActivity = currentTime;
 
@@ -3558,7 +3558,7 @@
                     (currentTime - interfacePriv->last_inactivity_check):
                     (~((u32)0) - interfacePriv->last_inactivity_check + currentTime);
 
-    spin_unlock_irqrestore(&priv->staRecord_lock,lock_flags);
+    spin_unlock_irqrestore(&priv->staRecord_lock, lock_flags);
 
     /* Check if it is time to run the inactivity handler */
     if (elapsedTime > INACTIVITY_CHECK_INTERVAL) {
@@ -3572,19 +3572,19 @@
    u8 i;
    int j;
    tx_buffered_packets_t * buffered_pkt = NULL;
-   u8 hipslotFree[4] = {TRUE,TRUE,TRUE,TRUE};
+   u8 hipslotFree[4] = {TRUE, TRUE, TRUE, TRUE};
    int r;
    unsigned long lock_flags;
 
-   while(!isRouterBufferEnabled(priv,3) &&
-                            ((buffered_pkt=dequeue_tx_data_pdu(priv,&interfacePriv->genericMgtFrames))!=NULL)) {
+   while(!isRouterBufferEnabled(priv, 3) &&
+                            ((buffered_pkt=dequeue_tx_data_pdu(priv, &interfacePriv->genericMgtFrames))!=NULL)) {
         buffered_pkt->transmissionControl &=
                      ~(TRANSMISSION_CONTROL_TRIGGER_MASK|TRANSMISSION_CONTROL_EOSP_MASK);
-        if((r=frame_and_send_queued_pdu(priv,buffered_pkt,NULL,0,FALSE)) == -ENOSPC) {
+        if((r=frame_and_send_queued_pdu(priv, buffered_pkt, NULL, 0, FALSE)) == -ENOSPC) {
             /* Enqueue at the head of the queue */
-            spin_lock_irqsave(&priv->tx_q_lock,lock_flags);
+            spin_lock_irqsave(&priv->tx_q_lock, lock_flags);
             list_add(&buffered_pkt->q, &interfacePriv->genericMgtFrames);
-            spin_unlock_irqrestore(&priv->tx_q_lock,lock_flags);
+            spin_unlock_irqrestore(&priv->tx_q_lock, lock_flags);
             hipslotFree[3]=FALSE;
             break;
         }else {
@@ -3606,12 +3606,12 @@
           while((( TRUE == hipslotFree[3] ) && (buffered_pkt=dequeue_tx_data_pdu(priv, &staInfo->mgtFrames)))) {
               buffered_pkt->transmissionControl &=
                            ~(TRANSMISSION_CONTROL_TRIGGER_MASK|TRANSMISSION_CONTROL_EOSP_MASK);
-              if((r=frame_and_send_queued_pdu(priv,buffered_pkt,staInfo,0,FALSE)) == -ENOSPC) {
+              if((r=frame_and_send_queued_pdu(priv, buffered_pkt, staInfo, 0, FALSE)) == -ENOSPC) {
                   unifi_trace(priv, UDBG3, "(ENOSPC) in resume_unicast_buffered_frames:: hip slots are full for voice queue\n");
                   /* Enqueue at the head of the queue */
-                  spin_lock_irqsave(&priv->tx_q_lock,lock_flags);
+                  spin_lock_irqsave(&priv->tx_q_lock, lock_flags);
                   list_add(&buffered_pkt->q, &staInfo->mgtFrames);
-                  spin_unlock_irqrestore(&priv->tx_q_lock,lock_flags);
+                  spin_unlock_irqrestore(&priv->tx_q_lock, lock_flags);
                   priv->pausedStaHandle[3]=(u8)(staInfo->assignedHandle);
                   hipslotFree[3] = FALSE;
                   break;
@@ -3632,11 +3632,11 @@
               while((buffered_pkt=dequeue_tx_data_pdu(priv, &staInfo->dataPdu[j]))) {
                  buffered_pkt->transmissionControl &=
                             ~(TRANSMISSION_CONTROL_TRIGGER_MASK|TRANSMISSION_CONTROL_EOSP_MASK);
-                 if((r=frame_and_send_queued_pdu(priv,buffered_pkt,staInfo,0,FALSE)) == -ENOSPC) {
+                 if((r=frame_and_send_queued_pdu(priv, buffered_pkt, staInfo, 0, FALSE)) == -ENOSPC) {
                      /* Enqueue at the head of the queue */
-                     spin_lock_irqsave(&priv->tx_q_lock,lock_flags);
+                     spin_lock_irqsave(&priv->tx_q_lock, lock_flags);
                      list_add(&buffered_pkt->q, &staInfo->dataPdu[j]);
-                     spin_unlock_irqrestore(&priv->tx_q_lock,lock_flags);
+                     spin_unlock_irqrestore(&priv->tx_q_lock, lock_flags);
                      priv->pausedStaHandle[j]=(u8)(staInfo->assignedHandle);
                      hipslotFree[j]=FALSE;
                      break;
@@ -3653,7 +3653,7 @@
        }
     }
 }
-void update_eosp_to_head_of_broadcast_list_head(unifi_priv_t *priv,u16 interfaceTag)
+void update_eosp_to_head_of_broadcast_list_head(unifi_priv_t *priv, u16 interfaceTag)
 {
 
     netInterface_priv_t *interfacePriv = priv->interfacePriv[interfaceTag];
@@ -3668,15 +3668,15 @@
          * because we have received any mgmt packet so it should not hold for long time
          * peer may time out.
          */
-        spin_lock_irqsave(&priv->tx_q_lock,lock_flags);
+        spin_lock_irqsave(&priv->tx_q_lock, lock_flags);
         list_for_each_safe(listHead, placeHolder, &interfacePriv->genericMulticastOrBroadCastFrames) {
             tx_q_item = list_entry(listHead, tx_buffered_packets_t, q);
             tx_q_item->transmissionControl |= TRANSMISSION_CONTROL_EOSP_MASK;
             tx_q_item->transmissionControl = (tx_q_item->transmissionControl & ~(CSR_NO_CONFIRM_REQUIRED));
-            unifi_trace(priv, UDBG1,"updating eosp for list Head hostTag:= 0x%x ",tx_q_item->hostTag);
+            unifi_trace(priv, UDBG1, "updating eosp for list Head hostTag:= 0x%x ", tx_q_item->hostTag);
             break;
         }
-        spin_unlock_irqrestore(&priv->tx_q_lock,lock_flags);
+        spin_unlock_irqrestore(&priv->tx_q_lock, lock_flags);
     }
 }
 
@@ -3692,7 +3692,7 @@
  *      interfaceTag    For which resume should happen
  * ---------------------------------------------------------------------------
  */
-void resume_suspended_uapsd(unifi_priv_t* priv,u16 interfaceTag)
+void resume_suspended_uapsd(unifi_priv_t* priv, u16 interfaceTag)
 {
 
    u8 startIndex;
@@ -3701,7 +3701,7 @@
 
     unifi_trace(priv, UDBG2, "++resume_suspended_uapsd: \n");
     for(startIndex= 0; startIndex < UNIFI_MAX_CONNECTIONS;startIndex++) {
-        staInfo =  CsrWifiRouterCtrlGetStationRecordFromHandle(priv,startIndex,interfaceTag);
+        staInfo =  CsrWifiRouterCtrlGetStationRecordFromHandle(priv, startIndex, interfaceTag);
 
         if(!staInfo || !staInfo->wmmOrQosEnabled) {
             continue;
@@ -3716,10 +3716,10 @@
                         staInfo->currentPeerState, staInfo->uapsdActive, staInfo->uspSuspend);
             if (staInfo->currentPeerState == CSR_WIFI_ROUTER_CTRL_PEER_CONNECTED_ACTIVE)
             {
-                spin_lock_irqsave(&priv->staRecord_lock,lock_flags);
+                spin_lock_irqsave(&priv->staRecord_lock, lock_flags);
                 staInfo->uapsdActive = FALSE;
                 staInfo->uspSuspend = FALSE;
-                spin_unlock_irqrestore(&priv->staRecord_lock,lock_flags);
+                spin_unlock_irqrestore(&priv->staRecord_lock, lock_flags);
             }
         }
     }

diff --git a/drivers/staging/csr/unifi_priv.h b/drivers/staging/csr/unifi_priv.h
index d20d74c..37302f3 100644
--- a/drivers/staging/csr/unifi_priv.h
+++ b/drivers/staging/csr/unifi_priv.h

@@ -259,7 +259,7 @@
 
 #define IS_DELIVERY_ENABLED(mode) (mode & CSR_WIFI_AC_DELIVERY_ONLY_ENABLE)? 1: 0
 #define IS_DELIVERY_AND_TRIGGER_ENABLED(mode) ((mode & CSR_WIFI_AC_DELIVERY_ONLY_ENABLE)||(mode & CSR_WIFI_AC_TRIGGER_AND_DELIVERY_ENABLED))? 1: 0
-#define IS_DTIM_ACTIVE(flag,hostTag) ((flag == TRUE || hostTag != INVALID_HOST_TAG))
+#define IS_DTIM_ACTIVE(flag, hostTag) ((flag == TRUE || hostTag != INVALID_HOST_TAG))
 #define INVALID_HOST_TAG 0xFFFFFFFF
 #define UNIFI_TRAFFIC_Q_CONTENTION UNIFI_TRAFFIC_Q_BE
 
@@ -767,9 +767,9 @@
 } netInterface_priv_t;
 
 #ifdef CSR_SUPPORT_SME
-#define routerStartBuffering(priv,queue) priv->routerBufferEnable[(queue)] = TRUE;
-#define routerStopBuffering(priv,queue) priv->routerBufferEnable[(queue)]  = FALSE;
-#define isRouterBufferEnabled(priv,queue) priv->routerBufferEnable[(queue)]
+#define routerStartBuffering(priv, queue) priv->routerBufferEnable[(queue)] = TRUE;
+#define routerStopBuffering(priv, queue) priv->routerBufferEnable[(queue)]  = FALSE;
+#define isRouterBufferEnabled(priv, queue) priv->routerBufferEnable[(queue)]
 #endif
 
 #ifdef USE_DRIVER_LOCK
@@ -919,8 +919,8 @@
 
 #ifdef CSR_SUPPORT_SME
 u8 uf_check_broadcast_bssid(unifi_priv_t *priv, const bulk_data_param_t *bulkdata);
-u8 uf_process_pm_bit_for_peer(unifi_priv_t * priv, CsrWifiRouterCtrlStaInfo_t * srcStaInfo,u8 pmBit,u16 interfaceTag);
-void uf_process_ps_poll(unifi_priv_t *priv,u8* sa,u8* da,u8 pmBit,u16 interfaceTag);
+u8 uf_process_pm_bit_for_peer(unifi_priv_t * priv, CsrWifiRouterCtrlStaInfo_t * srcStaInfo, u8 pmBit, u16 interfaceTag);
+void uf_process_ps_poll(unifi_priv_t *priv, u8* sa, u8* da, u8 pmBit, u16 interfaceTag);
 int uf_ap_process_data_pdu(unifi_priv_t *priv, struct sk_buff *skb,
                    struct ethhdr *ehdr, CsrWifiRouterCtrlStaInfo_t * srcStaInfo,
                    const CSR_SIGNAL *signal,
@@ -936,17 +936,17 @@
 void uf_send_buffered_data_from_delivery_ac(unifi_priv_t *priv, CsrWifiRouterCtrlStaInfo_t * staInfo, u8 queue, struct list_head *txList);
 
 void uf_continue_uapsd(unifi_priv_t *priv, CsrWifiRouterCtrlStaInfo_t * staInfo);
-void uf_send_qos_null(unifi_priv_t * priv,u16 interfaceTag, const u8 *da,CSR_PRIORITY priority,CsrWifiRouterCtrlStaInfo_t * srcStaInfo);
-void uf_send_nulldata(unifi_priv_t * priv,u16 interfaceTag, const u8 *da,CSR_PRIORITY priority,CsrWifiRouterCtrlStaInfo_t * srcStaInfo);
+void uf_send_qos_null(unifi_priv_t * priv, u16 interfaceTag, const u8 *da, CSR_PRIORITY priority, CsrWifiRouterCtrlStaInfo_t * srcStaInfo);
+void uf_send_nulldata(unifi_priv_t * priv, u16 interfaceTag, const u8 *da, CSR_PRIORITY priority, CsrWifiRouterCtrlStaInfo_t * srcStaInfo);
 
 
 
 #endif
 CsrResult uf_process_ma_packet_req(unifi_priv_t *priv,  u8 *peerMacAddress, CSR_CLIENT_TAG hostTag, u16 interfaceTag, CSR_TRANSMISSION_CONTROL transmissionControl, CSR_RATE TransmitRate, CSR_PRIORITY priority, CSR_PROCESS_ID senderId, bulk_data_param_t *bulkdata);
-void uf_process_ma_vif_availibility_ind(unifi_priv_t *priv,u8 *sigdata, u32 siglen);
+void uf_process_ma_vif_availibility_ind(unifi_priv_t *priv, u8 *sigdata, u32 siglen);
 #ifdef CSR_SUPPORT_SME
-void uf_send_buffered_frames(unifi_priv_t *priv,unifi_TrafficQueue queue);
-int uf_process_station_records_for_sending_data(unifi_priv_t *priv,u16 interfaceTag,
+void uf_send_buffered_frames(unifi_priv_t *priv, unifi_TrafficQueue queue);
+int uf_process_station_records_for_sending_data(unifi_priv_t *priv, u16 interfaceTag,
                                                  CsrWifiRouterCtrlStaInfo_t *srcStaInfo,
                                                  CsrWifiRouterCtrlStaInfo_t *dstStaInfo);
 void uf_prepare_send_cfm_list_for_queued_pkts(unifi_priv_t * priv,
@@ -958,8 +958,8 @@
 void uf_flush_list(unifi_priv_t * priv, struct list_head * list);
 tx_buffered_packets_t *dequeue_tx_data_pdu(unifi_priv_t *priv, struct list_head *txList);
 void resume_unicast_buffered_frames(unifi_priv_t *priv, u16 interfaceTag);
-void update_eosp_to_head_of_broadcast_list_head(unifi_priv_t *priv,u16 interfaceTag);
-void resume_suspended_uapsd(unifi_priv_t* priv,u16 interfaceTag);
+void update_eosp_to_head_of_broadcast_list_head(unifi_priv_t *priv, u16 interfaceTag);
+void resume_suspended_uapsd(unifi_priv_t* priv, u16 interfaceTag);
 #endif
 /*
  *      netdev.c
@@ -1048,14 +1048,14 @@
                                                                  u16 interfaceTag);
 
 void uf_update_sta_activity(unifi_priv_t *priv, u16 interfaceTag, const u8 *peerMacAddress);
-void uf_process_ma_pkt_cfm_for_ap(unifi_priv_t *priv,u16 interfaceTag, const CSR_MA_PACKET_CONFIRM *pkt_cfm);
+void uf_process_ma_pkt_cfm_for_ap(unifi_priv_t *priv, u16 interfaceTag, const CSR_MA_PACKET_CONFIRM *pkt_cfm);
 #endif
 
 void uf_resume_data_plane(unifi_priv_t *priv, int queue,
                           CsrWifiMacAddress peer_address,
                           u16 interfaceTag);
 void uf_free_pending_rx_packets(unifi_priv_t *priv, int queue,
-        CsrWifiMacAddress peer_address,u16 interfaceTag);
+        CsrWifiMacAddress peer_address, u16 interfaceTag);
 
 int uf_register_netdev(unifi_priv_t *priv, int numOfInterface);
 void uf_unregister_netdev(unifi_priv_t *priv);

diff --git a/drivers/staging/csr/unifi_sme.c b/drivers/staging/csr/unifi_sme.c
index 9029503..5090882 100644
--- a/drivers/staging/csr/unifi_sme.c
+++ b/drivers/staging/csr/unifi_sme.c

@@ -133,7 +133,7 @@
                     unicastPdu = FALSE;
 
                 CsrWifiRouterCtrlMicFailureIndSend (priv->CSR_WIFI_SME_IFACEQUEUE, 0,
-                        (ind->VirtualInterfaceIdentifier & 0xff),peerMacAddress,
+                        (ind->VirtualInterfaceIdentifier & 0xff), peerMacAddress,
                         unicastPdu);
                 return;
             }
@@ -143,10 +143,10 @@
                 {
                     u8 pmBit = (frmCtrl & 0x1000)?0x01:0x00;
                     u16 interfaceTag = (ind->VirtualInterfaceIdentifier & 0xff);
-                    CsrWifiRouterCtrlStaInfo_t *srcStaInfo =  CsrWifiRouterCtrlGetStationRecordFromPeerMacAddress(priv,taddr,interfaceTag);
+                    CsrWifiRouterCtrlStaInfo_t *srcStaInfo =  CsrWifiRouterCtrlGetStationRecordFromPeerMacAddress(priv, taddr, interfaceTag);
                     if((srcStaInfo != NULL) && (uf_check_broadcast_bssid(priv, bulkdata)== FALSE))
                     {
-                        uf_process_pm_bit_for_peer(priv,srcStaInfo,pmBit,interfaceTag);
+                        uf_process_pm_bit_for_peer(priv, srcStaInfo, pmBit, interfaceTag);
 
                         /* Update station last activity flag */
                         srcStaInfo->activity_flag = TRUE;
@@ -169,7 +169,7 @@
                 return;
             }
 
-            unifi_trace(priv,UDBG1,"MA-PACKET Confirm (%x, %x)\n", cfm->HostTag, cfm->TransmissionStatus);
+            unifi_trace(priv, UDBG1, "MA-PACKET Confirm (%x, %x)\n", cfm->HostTag, cfm->TransmissionStatus);
 
             interfacePriv = priv->interfacePriv[interfaceTag];
 #ifdef CSR_SUPPORT_SME
@@ -177,7 +177,7 @@
                  interfacePriv->interfaceMode == CSR_WIFI_ROUTER_CTRL_MODE_P2PGO) {
 
                 if(cfm->HostTag == interfacePriv->multicastPduHostTag){
-                    uf_process_ma_pkt_cfm_for_ap(priv ,interfaceTag, cfm);
+                    uf_process_ma_pkt_cfm_for_ap(priv, interfaceTag, cfm);
                 }
             }
 #endif
@@ -395,7 +395,7 @@
             interfacePriv->mc_list_count);
 
     /* Flush the current list */
-    CsrWifiRouterCtrlMulticastAddressIndSend(priv->CSR_WIFI_SME_IFACEQUEUE,0, interfaceTag, CSR_WIFI_SME_LIST_ACTION_FLUSH, 0, NULL);
+    CsrWifiRouterCtrlMulticastAddressIndSend(priv->CSR_WIFI_SME_IFACEQUEUE, 0, interfaceTag, CSR_WIFI_SME_LIST_ACTION_FLUSH, 0, NULL);
 
     mc_count = interfacePriv->mc_list_count;
     mc_list = interfacePriv->mc_list;
@@ -419,7 +419,7 @@
         return;
     }
 
-    CsrWifiRouterCtrlMulticastAddressIndSend(priv->CSR_WIFI_SME_IFACEQUEUE,0,
+    CsrWifiRouterCtrlMulticastAddressIndSend(priv->CSR_WIFI_SME_IFACEQUEUE, 0,
             interfaceTag,
             CSR_WIFI_SME_LIST_ACTION_ADD,
             mc_count, multicast_address_list);
@@ -950,7 +950,7 @@
     }
     return i;
 }
-int unifi_cfg_set_ap_config(unifi_priv_t * priv,unsigned char* arg)
+int unifi_cfg_set_ap_config(unifi_priv_t * priv, unsigned char* arg)
 {
     uf_cfg_ap_config_t cfg_ap_config;
     char *buffer;
@@ -981,7 +981,7 @@
     priv->ap_mac_config.phySupportedBitmap = cfg_ap_config.phySupportedBitmap;
     priv->ap_mac_config.maxListenInterval=cfg_ap_config.maxListenInterval;
 
-    priv->ap_mac_config.supportedRatesCount=     uf_configure_supported_rates(priv->ap_mac_config.supportedRates,priv->ap_mac_config.phySupportedBitmap);
+    priv->ap_mac_config.supportedRatesCount=     uf_configure_supported_rates(priv->ap_mac_config.supportedRates, priv->ap_mac_config.phySupportedBitmap);
 
     return 0;
 }
@@ -1051,7 +1051,7 @@
     u16 interfaceTag = 0;
 
 
-    CsrWifiRouterCtrlTrafficProtocolIndSend(priv->CSR_WIFI_SME_IFACEQUEUE,0,
+    CsrWifiRouterCtrlTrafficProtocolIndSend(priv->CSR_WIFI_SME_IFACEQUEUE, 0,
             interfaceTag,
             ind->packet_type,
             ind->direction,
@@ -1119,7 +1119,7 @@
         }
     }
 
-    CsrWifiRouterCtrlTrafficSampleIndSend(priv->CSR_WIFI_SME_IFACEQUEUE,0, interfaceTag, ind->stats);
+    CsrWifiRouterCtrlTrafficSampleIndSend(priv->CSR_WIFI_SME_IFACEQUEUE, 0, interfaceTag, ind->stats);
 
     ind->in_use = 0;
 
@@ -1219,7 +1219,7 @@
         kfree(pktBulkData); /* Would have been copied over by the SME Handler */
 
     } else {
-	    unifi_warning(priv, "uf_send_pkt_to_encrypt() is NOT applicable for interface mode - %d\n",interfacePriv->interfaceMode);
+	    unifi_warning(priv, "uf_send_pkt_to_encrypt() is NOT applicable for interface mode - %d\n", interfacePriv->interfaceMode);
     }
 }/* uf_send_pkt_to_encrypt() */
 #endif

diff --git a/drivers/staging/csr/unifi_sme.h b/drivers/staging/csr/unifi_sme.h
index b689cfe..aff9aa1 100644
--- a/drivers/staging/csr/unifi_sme.h
+++ b/drivers/staging/csr/unifi_sme.h

@@ -210,9 +210,9 @@
 int sme_mgt_mib_set(unifi_priv_t *priv,
                     unsigned char *varbind, int length);
 #ifdef CSR_SUPPORT_WEXT_AP
-int sme_ap_start(unifi_priv_t *priv,u16 interface_tag,CsrWifiSmeApConfig_t *ap_config);
-int sme_ap_stop(unifi_priv_t *priv,u16 interface_tag);
-int sme_ap_config(unifi_priv_t *priv,CsrWifiSmeApMacConfig *ap_mac_config, CsrWifiNmeApConfig *group_security_config);
+int sme_ap_start(unifi_priv_t *priv, u16 interface_tag, CsrWifiSmeApConfig_t *ap_config);
+int sme_ap_stop(unifi_priv_t *priv, u16 interface_tag);
+int sme_ap_config(unifi_priv_t *priv, CsrWifiSmeApMacConfig *ap_mac_config, CsrWifiNmeApConfig *group_security_config);
 int uf_configure_supported_rates(u8 * supportedRates, u8 phySupportedBitmap);
 #endif
 int unifi_translate_scan(struct net_device *dev,
@@ -234,7 +234,7 @@
 int unifi_cfg_strict_draft_n(unifi_priv_t *priv, unsigned char *arg);
 int unifi_cfg_enable_okc(unifi_priv_t *priv, unsigned char *arg);
 #ifdef CSR_SUPPORT_WEXT_AP
-int unifi_cfg_set_ap_config(unifi_priv_t * priv,unsigned char* arg);
+int unifi_cfg_set_ap_config(unifi_priv_t * priv, unsigned char* arg);
 #endif
 
 

diff --git a/drivers/staging/cxt1e1/comet.c b/drivers/staging/cxt1e1/comet.c
index 52224cd..fabfd77 100644
--- a/drivers/staging/cxt1e1/comet.c
+++ b/drivers/staging/cxt1e1/comet.c

@@ -13,7 +13,7 @@
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
-#include <asm/io.h>
+#include <linux/io.h>
 #include <linux/hdlc.h>
 #include "pmcc4_sysdep.h"
 #include "sbecom_inline_linux.h"
@@ -35,235 +35,253 @@
 #define COMET_NUM_UNITS     5   /* Number of points per entry in table */
 
 /* forward references */
-STATIC void SetPwrLevel (comet_t * comet);
-STATIC void WrtRcvEqualizerTbl (ci_t * ci, comet_t * comet, u_int32_t *table);
-STATIC void WrtXmtWaveformTbl (ci_t * ci, comet_t * comet, u_int8_t table[COMET_NUM_SAMPLES][COMET_NUM_UNITS]);
+STATIC void SetPwrLevel(comet_t *comet);
+STATIC void WrtRcvEqualizerTbl(ci_t *ci, comet_t *comet, u_int32_t *table);
+STATIC void WrtXmtWaveformTbl(ci_t *ci, comet_t *comet, u_int8_t table[COMET_NUM_SAMPLES][COMET_NUM_UNITS]);
 
 
 void       *TWV_table[12] = {
-    TWVLongHaul0DB, TWVLongHaul7_5DB, TWVLongHaul15DB, TWVLongHaul22_5DB,
-    TWVShortHaul0, TWVShortHaul1, TWVShortHaul2, TWVShortHaul3, TWVShortHaul4,
-    TWVShortHaul5,
-    TWV_E1_75Ohm,    /** PORT POINT - 75 Ohm not supported **/
-    TWV_E1_120Ohm
+	TWVLongHaul0DB, TWVLongHaul7_5DB, TWVLongHaul15DB, TWVLongHaul22_5DB,
+	TWVShortHaul0, TWVShortHaul1, TWVShortHaul2, TWVShortHaul3,
+	TWVShortHaul4, TWVShortHaul5,
+	/** PORT POINT - 75 Ohm not supported **/
+	TWV_E1_75Ohm,
+	TWV_E1_120Ohm
 };
 
 
 static int
-lbo_tbl_lkup (int t1, int lbo)
-{
-    if ((lbo < CFG_LBO_LH0) || (lbo > CFG_LBO_E120))    /* error switches to
-                                                         * default */
-    {
-        if (t1)
-            lbo = CFG_LBO_LH0;  /* default T1 waveform table */
-        else
-            lbo = CFG_LBO_E120;     /* default E1 waveform table */
-    }
-    return (lbo - 1);               /* make index ZERO relative */
+lbo_tbl_lkup(int t1, int lbo) {
+	/* error switches to default */
+	if ((lbo < CFG_LBO_LH0) || (lbo > CFG_LBO_E120)) {
+		if (t1)
+			/* default T1 waveform table */
+			lbo = CFG_LBO_LH0;
+		else
+			/* default E1 waveform table */
+			lbo = CFG_LBO_E120;
+	}
+	/* make index ZERO relative */
+	return lbo - 1;
 }
 
-
-void
-init_comet (void *ci, comet_t * comet, u_int32_t port_mode, int clockmaster,
-            u_int8_t moreParams)
+void init_comet(void *ci, comet_t *comet, u_int32_t port_mode, int clockmaster,
+		u_int8_t moreParams)
 {
-    u_int8_t isT1mode;
-    u_int8_t    tix = CFG_LBO_LH0;      /* T1 default */
+	u_int8_t isT1mode;
+	/* T1 default */
+	u_int8_t    tix = CFG_LBO_LH0;
+	isT1mode = IS_FRAME_ANY_T1(port_mode);
+	/* T1 or E1 */
+	if (isT1mode) {
+		/* Select T1 Mode & PIO output enabled */
+		pci_write_32((u_int32_t *) &comet->gbl_cfg, 0xa0);
+		/* default T1 waveform table */
+		tix = lbo_tbl_lkup(isT1mode, CFG_LBO_LH0);
+	} else {
+		/* Select E1 Mode & PIO output enabled */
+		pci_write_32((u_int32_t *) &comet->gbl_cfg, 0x81);
+		/* default E1 waveform table */
+		tix = lbo_tbl_lkup(isT1mode, CFG_LBO_E120);
+	}
 
-    isT1mode = IS_FRAME_ANY_T1 (port_mode);
-    /* T1 or E1 */
-    if (isT1mode)
-    {
-        pci_write_32 ((u_int32_t *) &comet->gbl_cfg, 0xa0);     /* Select T1 Mode & PIO
-                                                                 * output enabled */
-        tix = lbo_tbl_lkup (isT1mode, CFG_LBO_LH0);     /* default T1 waveform
-                                                         * table */
-    } else
-    {
-        pci_write_32 ((u_int32_t *) &comet->gbl_cfg, 0x81);     /* Select E1 Mode & PIO
-                                                                 * output enabled */
-        tix = lbo_tbl_lkup (isT1mode, CFG_LBO_E120);    /* default E1 waveform
-                                                         * table */
-    }
+	if (moreParams & CFG_LBO_MASK)
+		/* dial-in requested waveform table */
+		tix = lbo_tbl_lkup(isT1mode, moreParams & CFG_LBO_MASK);
+	/* Tx line Intfc cfg Set for analog & no special patterns */
+	/* Transmit Line Interface Config. */
+	pci_write_32((u_int32_t *) &comet->tx_line_cfg, 0x00);
+	/* master test Ignore Test settings for now */
+	/* making sure it's Default value */
+	pci_write_32((u_int32_t *) &comet->mtest, 0x00);
+	/* Turn on Center (CENT) and everything else off */
+	/* RJAT cfg */
+	pci_write_32((u_int32_t *) &comet->rjat_cfg, 0x10);
+	/* Set Jitter Attenuation to recommended T1 values */
+	if (isT1mode) {
+		/* RJAT Divider N1 Control */
+		pci_write_32((u_int32_t *) &comet->rjat_n1clk, 0x2F);
+		/* RJAT Divider N2 Control */
+		pci_write_32((u_int32_t *) &comet->rjat_n2clk, 0x2F);
+	} else {
+		/* RJAT Divider N1 Control */
+		pci_write_32((u_int32_t *) &comet->rjat_n1clk, 0xFF);
+		/* RJAT Divider N2 Control */
+		pci_write_32((u_int32_t *) &comet->rjat_n2clk, 0xFF);
+	}
 
-    if (moreParams & CFG_LBO_MASK)
-        tix = lbo_tbl_lkup (isT1mode, moreParams & CFG_LBO_MASK);       /* dial-in requested
-                                                                         * waveform table */
+	/* Turn on Center (CENT) and everything else off */
+	/* TJAT Config. */
+	pci_write_32((u_int32_t *) &comet->tjat_cfg, 0x10);
 
-    /* Tx line Intfc cfg     ** Set for analog & no special patterns */
-    pci_write_32 ((u_int32_t *) &comet->tx_line_cfg, 0x00);     /* Transmit Line
-                                                                 * Interface Config. */
+	/* Do not bypass jitter attenuation and bypass elastic store */
+	/* rx opts */
+	pci_write_32((u_int32_t *) &comet->rx_opt, 0x00);
 
-    /* master test    ** Ignore Test settings for now */
-    pci_write_32 ((u_int32_t *) &comet->mtest, 0x00);   /* making sure it's
-                                                         * Default value */
+	/* TJAT ctrl & TJAT divider ctrl */
+	/* Set Jitter Attenuation to recommended T1 values */
+	if (isT1mode) {
+		/* TJAT Divider N1 Control */
+		pci_write_32((u_int32_t *) &comet->tjat_n1clk, 0x2F);
+		/* TJAT Divider N2  Control */
+		pci_write_32((u_int32_t *) &comet->tjat_n2clk, 0x2F);
+	} else {
+		/* TJAT Divider N1 Control */
+		pci_write_32((u_int32_t *) &comet->tjat_n1clk, 0xFF);
+		/* TJAT Divider N2 Control */
+		pci_write_32((u_int32_t *) &comet->tjat_n2clk, 0xFF);
+	}
 
-    /* Turn on Center (CENT) and everything else off */
-    pci_write_32 ((u_int32_t *) &comet->rjat_cfg, 0x10);        /* RJAT cfg */
-    /* Set Jitter Attenuation to recommend T1 values */
-    if (isT1mode)
-    {
-        pci_write_32 ((u_int32_t *) &comet->rjat_n1clk, 0x2F);  /* RJAT Divider N1
-                                                                 * Control */
-        pci_write_32 ((u_int32_t *) &comet->rjat_n2clk, 0x2F);  /* RJAT Divider N2
-                                                                 * Control */
-    } else
-    {
-        pci_write_32 ((u_int32_t *) &comet->rjat_n1clk, 0xFF);  /* RJAT Divider N1
-                                                                 * Control */
-        pci_write_32 ((u_int32_t *) &comet->rjat_n2clk, 0xFF);  /* RJAT Divider N2
-                                                                 * Control */
-    }
+	/* 1c: rx ELST cfg   20: tx ELST cfg  28&38: rx&tx data link ctrl */
 
-    /* Turn on Center (CENT) and everything else off */
-    pci_write_32 ((u_int32_t *) &comet->tjat_cfg, 0x10);        /* TJAT Config. */
-
-    /* Do not bypass jitter attenuation and bypass elastic store */
-    pci_write_32 ((u_int32_t *) &comet->rx_opt, 0x00);  /* rx opts */
-
-    /* TJAT ctrl & TJAT divider ctrl */
-    /* Set Jitter Attenuation to recommended T1 values */
-    if (isT1mode)
-    {
-        pci_write_32 ((u_int32_t *) &comet->tjat_n1clk, 0x2F);  /* TJAT Divider N1
-                                                                 * Control */
-        pci_write_32 ((u_int32_t *) &comet->tjat_n2clk, 0x2F);  /* TJAT Divider N2
-                                                                 * Control */
-    } else
-    {
-        pci_write_32 ((u_int32_t *) &comet->tjat_n1clk, 0xFF);  /* TJAT Divider N1
-                                                                 * Control */
-        pci_write_32 ((u_int32_t *) &comet->tjat_n2clk, 0xFF);  /* TJAT Divider N2
-                                                                 * Control */
-    }
-
-    /* 1c: rx ELST cfg   20: tx ELST cfg  28&38: rx&tx data link ctrl */
-    if (isT1mode)
-    {                               /* Select 193-bit frame format */
-        pci_write_32 ((u_int32_t *) &comet->rx_elst_cfg, 0x00);
-        pci_write_32 ((u_int32_t *) &comet->tx_elst_cfg, 0x00);
-    } else
-    {                               /* Select 256-bit frame format */
-        pci_write_32 ((u_int32_t *) &comet->rx_elst_cfg, 0x03);
-        pci_write_32 ((u_int32_t *) &comet->tx_elst_cfg, 0x03);
-        pci_write_32 ((u_int32_t *) &comet->rxce1_ctl, 0x00);   /* disable T1 data link
-                                                                 * receive */
-        pci_write_32 ((u_int32_t *) &comet->txci1_ctl, 0x00);   /* disable T1 data link
-                                                                 * transmit */
-    }
+	/* Select 193-bit frame format */
+	if (isT1mode) {
+		pci_write_32((u_int32_t *) &comet->rx_elst_cfg, 0x00);
+		pci_write_32((u_int32_t *) &comet->tx_elst_cfg, 0x00);
+	} else {
+		/* Select 256-bit frame format */
+		pci_write_32((u_int32_t *) &comet->rx_elst_cfg, 0x03);
+		pci_write_32((u_int32_t *) &comet->tx_elst_cfg, 0x03);
+		/* disable T1 data link receive */
+		pci_write_32((u_int32_t *) &comet->rxce1_ctl, 0x00);
+		/* disable T1 data link transmit */
+		pci_write_32((u_int32_t *) &comet->txci1_ctl, 0x00);
+	}
 
     /* the following is a default value */
     /* Enable 8 out of 10 validation */
-    pci_write_32 ((u_int32_t *) &comet->t1_rboc_ena, 0x00);     /* t1RBOC
-                                                                 * enable(BOC:BitOriented
-                                                                 * Code) */
-    if (isT1mode)
-    {
+	/* t1RBOC enable (BOC: Bit Oriented Code) */
+	pci_write_32((u_int32_t *) &comet->t1_rboc_ena, 0x00);
+	if (isT1mode)
+	{
 
-        /* IBCD cfg: aka Inband Code Detection ** loopback code length set to */
-        pci_write_32 ((u_int32_t *) &comet->ibcd_cfg, 0x04);    /* 6 bit down, 5 bit up
-                                                                 * (assert)  */
-        pci_write_32 ((u_int32_t *) &comet->ibcd_act, 0x08);    /* line loopback
-                                                                 * activate pattern */
-        pci_write_32 ((u_int32_t *) &comet->ibcd_deact, 0x24);  /* deactivate code
-                                                                 * pattern (i.e.001) */
-    }
+	/* IBCD cfg: aka Inband Code Detection ** loopback code length set to */
+		/* 6 bit down, 5 bit up (assert) */
+		pci_write_32((u_int32_t *) &comet->ibcd_cfg, 0x04);
+		/* line loopback activate pattern */
+		pci_write_32((u_int32_t *) &comet->ibcd_act, 0x08);
+		/* deactivate code pattern (i.e.001) */
+		pci_write_32((u_int32_t *) &comet->ibcd_deact, 0x24);
+	}
     /* 10: CDRC cfg 28&38: rx&tx data link 1 ctrl 48: t1 frmr cfg  */
     /* 50: SIGX cfg, COSS (change of signaling state) 54: XBAS cfg  */
     /* 60: t1 ALMI cfg */
     /* Configure Line Coding */
 
-    switch (port_mode)
-    {
-    case CFG_FRAME_SF:              /* 1 - T1 B8ZS */
-        pci_write_32 ((u_int32_t *) &comet->cdrc_cfg, 0);
-        pci_write_32 ((u_int32_t *) &comet->t1_frmr_cfg, 0);
-        pci_write_32 ((u_int32_t *) &comet->sigx_cfg, 0);
-        pci_write_32 ((u_int32_t *) &comet->t1_xbas_cfg, 0x20); /* 5:B8ZS */
-        pci_write_32 ((u_int32_t *) &comet->t1_almi_cfg, 0);
-        break;
-    case CFG_FRAME_ESF:     /* 2 - T1 B8ZS */
-        pci_write_32 ((u_int32_t *) &comet->cdrc_cfg, 0);
-        pci_write_32 ((u_int32_t *) &comet->rxce1_ctl, 0x20);   /* Bit 5: T1 DataLink
-                                                                 * Enable */
-        pci_write_32 ((u_int32_t *) &comet->txci1_ctl, 0x20);   /* 5: T1 DataLink Enable */
-        pci_write_32 ((u_int32_t *) &comet->t1_frmr_cfg, 0x30); /* 4:ESF  5:ESFFA */
-        pci_write_32 ((u_int32_t *) &comet->sigx_cfg, 0x04);    /* 2:ESF */
-        pci_write_32 ((u_int32_t *) &comet->t1_xbas_cfg, 0x30); /* 4:ESF  5:B8ZS */
-        pci_write_32 ((u_int32_t *) &comet->t1_almi_cfg, 0x10); /* 4:ESF */
-        break;
-    case CFG_FRAME_E1PLAIN:         /* 3 - HDB3 */
-        pci_write_32 ((u_int32_t *) &comet->cdrc_cfg, 0);
-        pci_write_32 ((u_int32_t *) &comet->sigx_cfg, 0);
-        pci_write_32 ((u_int32_t *) &comet->e1_tran_cfg, 0);
-        pci_write_32 ((u_int32_t *) &comet->e1_frmr_aopts, 0x40);
-        break;
-    case CFG_FRAME_E1CAS:           /* 4 - HDB3 */
-        pci_write_32 ((u_int32_t *) &comet->cdrc_cfg, 0);
-        pci_write_32 ((u_int32_t *) &comet->sigx_cfg, 0);
-        pci_write_32 ((u_int32_t *) &comet->e1_tran_cfg, 0x60);
-        pci_write_32 ((u_int32_t *) &comet->e1_frmr_aopts, 0);
-        break;
-    case CFG_FRAME_E1CRC:           /* 5 - HDB3 */
-        pci_write_32 ((u_int32_t *) &comet->cdrc_cfg, 0);
-        pci_write_32 ((u_int32_t *) &comet->sigx_cfg, 0);
-        pci_write_32 ((u_int32_t *) &comet->e1_tran_cfg, 0x10);
-        pci_write_32 ((u_int32_t *) &comet->e1_frmr_aopts, 0xc2);
-        break;
-    case CFG_FRAME_E1CRC_CAS:       /* 6 - HDB3 */
-        pci_write_32 ((u_int32_t *) &comet->cdrc_cfg, 0);
-        pci_write_32 ((u_int32_t *) &comet->sigx_cfg, 0);
-        pci_write_32 ((u_int32_t *) &comet->e1_tran_cfg, 0x70);
-        pci_write_32 ((u_int32_t *) &comet->e1_frmr_aopts, 0x82);
-        break;
-    case CFG_FRAME_SF_AMI:          /* 7 - T1 AMI */
-        pci_write_32 ((u_int32_t *) &comet->cdrc_cfg, 0x80);    /* Enable AMI Line
-                                                                 * Decoding */
-        pci_write_32 ((u_int32_t *) &comet->t1_frmr_cfg, 0);
-        pci_write_32 ((u_int32_t *) &comet->t1_xbas_cfg, 0);
-        pci_write_32 ((u_int32_t *) &comet->t1_almi_cfg, 0);
-        pci_write_32 ((u_int32_t *) &comet->sigx_cfg, 0);
-        break;
-    case CFG_FRAME_ESF_AMI:         /* 8 - T1 AMI */
-        pci_write_32 ((u_int32_t *) &comet->cdrc_cfg, 0x80);    /* Enable AMI Line
-                                                                 * Decoding */
-        pci_write_32 ((u_int32_t *) &comet->rxce1_ctl, 0x20);   /* 5: T1 DataLink Enable */
-        pci_write_32 ((u_int32_t *) &comet->txci1_ctl, 0x20);   /* 5: T1 DataLink Enable */
-        pci_write_32 ((u_int32_t *) &comet->t1_frmr_cfg, 0x30); /* Bit 4:ESF  5:ESFFA */
-        pci_write_32 ((u_int32_t *) &comet->sigx_cfg, 0x04);    /* 2:ESF */
-        pci_write_32 ((u_int32_t *) &comet->t1_xbas_cfg, 0x10); /* 4:ESF */
-        pci_write_32 ((u_int32_t *) &comet->t1_almi_cfg, 0x10); /* 4:ESF */
-        break;
-    case CFG_FRAME_E1PLAIN_AMI:       /* 9 - AMI */
-        pci_write_32 ((u_int32_t *) &comet->cdrc_cfg, 0x80);    /* Enable AMI Line
-                                                                 * Decoding */
-        pci_write_32 ((u_int32_t *) &comet->sigx_cfg, 0);
-        pci_write_32 ((u_int32_t *) &comet->e1_tran_cfg, 0x80);
-        pci_write_32 ((u_int32_t *) &comet->e1_frmr_aopts, 0x40);
-        break;
-    case CFG_FRAME_E1CAS_AMI:       /* 10 - AMI */
-        pci_write_32 ((u_int32_t *) &comet->cdrc_cfg, 0x80);    /* Enable AMI Line
-                                                                 * Decoding */
-        pci_write_32 ((u_int32_t *) &comet->sigx_cfg, 0);
-        pci_write_32 ((u_int32_t *) &comet->e1_tran_cfg, 0xe0);
-        pci_write_32 ((u_int32_t *) &comet->e1_frmr_aopts, 0);
-        break;
-    case CFG_FRAME_E1CRC_AMI:       /* 11 - AMI */
-        pci_write_32 ((u_int32_t *) &comet->cdrc_cfg, 0x80);    /* Enable AMI Line
-                                                                 * Decoding */
-        pci_write_32 ((u_int32_t *) &comet->sigx_cfg, 0);
-        pci_write_32 ((u_int32_t *) &comet->e1_tran_cfg, 0x90);
-        pci_write_32 ((u_int32_t *) &comet->e1_frmr_aopts, 0xc2);
-        break;
-    case CFG_FRAME_E1CRC_CAS_AMI:   /* 12 - AMI */
-        pci_write_32 ((u_int32_t *) &comet->cdrc_cfg, 0x80);    /* Enable AMI Line
-                                                                 * Decoding */
-        pci_write_32 ((u_int32_t *) &comet->sigx_cfg, 0);
-        pci_write_32 ((u_int32_t *) &comet->e1_tran_cfg, 0xf0);
-        pci_write_32 ((u_int32_t *) &comet->e1_frmr_aopts, 0x82);
-        break;
-    }                               /* end switch */
+	switch (port_mode)
+	{
+	/* 1 - T1 B8ZS */
+	case CFG_FRAME_SF:
+		pci_write_32((u_int32_t *) &comet->cdrc_cfg, 0);
+		pci_write_32((u_int32_t *) &comet->t1_frmr_cfg, 0);
+		pci_write_32((u_int32_t *) &comet->sigx_cfg, 0);
+		/* 5:B8ZS */
+		pci_write_32((u_int32_t *) &comet->t1_xbas_cfg, 0x20);
+		pci_write_32((u_int32_t *) &comet->t1_almi_cfg, 0);
+		break;
+	/* 2 - T1 B8ZS */
+	case CFG_FRAME_ESF:
+		pci_write_32((u_int32_t *) &comet->cdrc_cfg, 0);
+		/* Bit 5: T1 DataLink Enable */
+		pci_write_32((u_int32_t *) &comet->rxce1_ctl, 0x20);
+		/* 5: T1 DataLink Enable */
+		pci_write_32((u_int32_t *) &comet->txci1_ctl, 0x20);
+		/* 4:ESF  5:ESFFA */
+		pci_write_32((u_int32_t *) &comet->t1_frmr_cfg, 0x30);
+		/* 2:ESF */
+		pci_write_32((u_int32_t *) &comet->sigx_cfg, 0x04);
+		/* 4:ESF  5:B8ZS */
+		pci_write_32((u_int32_t *) &comet->t1_xbas_cfg, 0x30);
+		/* 4:ESF */
+		pci_write_32((u_int32_t *) &comet->t1_almi_cfg, 0x10);
+		break;
+	/* 3 - HDB3 */
+	case CFG_FRAME_E1PLAIN:
+		pci_write_32((u_int32_t *) &comet->cdrc_cfg, 0);
+		pci_write_32((u_int32_t *) &comet->sigx_cfg, 0);
+		pci_write_32((u_int32_t *) &comet->e1_tran_cfg, 0);
+		pci_write_32((u_int32_t *) &comet->e1_frmr_aopts, 0x40);
+		break;
+	/* 4 - HDB3 */
+	case CFG_FRAME_E1CAS:
+		pci_write_32((u_int32_t *) &comet->cdrc_cfg, 0);
+		pci_write_32((u_int32_t *) &comet->sigx_cfg, 0);
+		pci_write_32((u_int32_t *) &comet->e1_tran_cfg, 0x60);
+		pci_write_32((u_int32_t *) &comet->e1_frmr_aopts, 0);
+		break;
+	/* 5 - HDB3 */
+	case CFG_FRAME_E1CRC:
+		pci_write_32((u_int32_t *) &comet->cdrc_cfg, 0);
+		pci_write_32((u_int32_t *) &comet->sigx_cfg, 0);
+		pci_write_32((u_int32_t *) &comet->e1_tran_cfg, 0x10);
+		pci_write_32((u_int32_t *) &comet->e1_frmr_aopts, 0xc2);
+		break;
+	/* 6 - HDB3 */
+	case CFG_FRAME_E1CRC_CAS:
+		pci_write_32((u_int32_t *) &comet->cdrc_cfg, 0);
+		pci_write_32((u_int32_t *) &comet->sigx_cfg, 0);
+		pci_write_32((u_int32_t *) &comet->e1_tran_cfg, 0x70);
+		pci_write_32((u_int32_t *) &comet->e1_frmr_aopts, 0x82);
+		break;
+	/* 7 - T1 AMI */
+	case CFG_FRAME_SF_AMI:
+		/* Enable AMI Line Decoding */
+		pci_write_32((u_int32_t *) &comet->cdrc_cfg, 0x80);
+		pci_write_32((u_int32_t *) &comet->t1_frmr_cfg, 0);
+		pci_write_32((u_int32_t *) &comet->t1_xbas_cfg, 0);
+		pci_write_32((u_int32_t *) &comet->t1_almi_cfg, 0);
+		pci_write_32((u_int32_t *) &comet->sigx_cfg, 0);
+		break;
+	/* 8 - T1 AMI */
+	case CFG_FRAME_ESF_AMI:
+		/* Enable AMI Line Decoding */
+		pci_write_32((u_int32_t *) &comet->cdrc_cfg, 0x80);
+		/* 5: T1 DataLink Enable */
+		pci_write_32((u_int32_t *) &comet->rxce1_ctl, 0x20);
+		/* 5: T1 DataLink Enable */
+		pci_write_32((u_int32_t *) &comet->txci1_ctl, 0x20);
+		/* Bit 4:ESF  5:ESFFA */
+		pci_write_32((u_int32_t *) &comet->t1_frmr_cfg, 0x30);
+		/* 2:ESF */
+		pci_write_32((u_int32_t *) &comet->sigx_cfg, 0x04);
+		/* 4:ESF */
+		pci_write_32((u_int32_t *) &comet->t1_xbas_cfg, 0x10);
+		/* 4:ESF */
+		pci_write_32((u_int32_t *) &comet->t1_almi_cfg, 0x10);
+		break;
+	/* 9 - AMI */
+	case CFG_FRAME_E1PLAIN_AMI:
+		/* Enable AMI Line Decoding */
+		pci_write_32((u_int32_t *) &comet->cdrc_cfg, 0x80);
+		pci_write_32((u_int32_t *) &comet->sigx_cfg, 0);
+		pci_write_32((u_int32_t *) &comet->e1_tran_cfg, 0x80);
+		pci_write_32((u_int32_t *) &comet->e1_frmr_aopts, 0x40);
+		break;
+	/* 10 - AMI */
+	case CFG_FRAME_E1CAS_AMI:
+		/* Enable AMI Line Decoding */
+		pci_write_32((u_int32_t *) &comet->cdrc_cfg, 0x80);
+		pci_write_32((u_int32_t *) &comet->sigx_cfg, 0);
+		pci_write_32((u_int32_t *) &comet->e1_tran_cfg, 0xe0);
+		pci_write_32((u_int32_t *) &comet->e1_frmr_aopts, 0);
+		break;
+	/* 11 - AMI */
+	case CFG_FRAME_E1CRC_AMI:
+		/* Enable AMI Line Decoding */
+		pci_write_32((u_int32_t *) &comet->cdrc_cfg, 0x80);
+		pci_write_32((u_int32_t *) &comet->sigx_cfg, 0);
+		pci_write_32((u_int32_t *) &comet->e1_tran_cfg, 0x90);
+		pci_write_32((u_int32_t *) &comet->e1_frmr_aopts, 0xc2);
+		break;
+	/* 12 - AMI */
+	case CFG_FRAME_E1CRC_CAS_AMI:
+		/* Enable AMI Line Decoding */
+		pci_write_32((u_int32_t *) &comet->cdrc_cfg, 0x80);
+		pci_write_32((u_int32_t *) &comet->sigx_cfg, 0);
+		pci_write_32((u_int32_t *) &comet->e1_tran_cfg, 0xf0);
+		pci_write_32((u_int32_t *) &comet->e1_frmr_aopts, 0x82);
+		break;
+	}	/* end switch */
 
     /***
      * Set Full Frame mode (NXDSO[1] = 0, NXDSO[0] = 0)
@@ -277,101 +295,109 @@
 
     /* 0x30: "BRIF cfg"; 0x20 is 'CMODE', 0x03 is (bit) rate */
     /* note "rate bits can only be set once after reset" */
-    if (clockmaster)
-    {                               /* CMODE == clockMode, 0=clock master (so
-                                     * all 3 others should be slave) */
-        if (isT1mode)               /* rate = 1.544 Mb/s */
-            pci_write_32 ((u_int32_t *) &comet->brif_cfg, 0x00);        /* Comet 0 Master
-                                                                         * Mode(CMODE=0) */
-        else                        /* rate = 2.048 Mb/s */
-            pci_write_32 ((u_int32_t *) &comet->brif_cfg, 0x01);        /* Comet 0 Master
-                                                                         * Mode(CMODE=0) */
+	if (clockmaster)
+		{
+		/* CMODE == clockMode, 0=clock master (so all 3 others should be slave) */
+		/* rate = 1.544 Mb/s */
+		if (isT1mode)
+			/* Comet 0 Master Mode(CMODE=0) */
+			pci_write_32((u_int32_t *) &comet->brif_cfg, 0x00);
+		/* rate = 2.048 Mb/s */
+		else
+			/* Comet 0 Master Mode(CMODE=0) */
+			pci_write_32((u_int32_t *) &comet->brif_cfg, 0x01);
 
-        /* 31: BRIF frame pulse cfg  06: tx timing options */
-        pci_write_32 ((u_int32_t *) &comet->brif_fpcfg, 0x00);  /* Master Mode
-                                                                 * i.e.FPMODE=0 (@0x20) */
-        if ((moreParams & CFG_CLK_PORT_MASK) == CFG_CLK_PORT_INTERNAL)
-        {
-            if (cxt1e1_log_level >= LOG_SBEBUG12)
-                pr_info(">> %s: clockmaster internal clock\n", __func__);
-            pci_write_32 ((u_int32_t *) &comet->tx_time, 0x0d); /* internal oscillator */
-        } else                      /* external clock source */
-        {
-            if (cxt1e1_log_level >= LOG_SBEBUG12)
-                pr_info(">> %s: clockmaster external clock\n", __func__);
-            pci_write_32 ((u_int32_t *) &comet->tx_time, 0x09); /* loop timing
-                                                                 * (external) */
-        }
+		/* 31: BRIF frame pulse cfg  06: tx timing options */
 
-    } else                          /* slave */
-    {
-        if (isT1mode)
-            pci_write_32 ((u_int32_t *) &comet->brif_cfg, 0x20);        /* Slave Mode(CMODE=1,
-                                                                         * see above) */
-        else
-            pci_write_32 ((u_int32_t *) &comet->brif_cfg, 0x21);        /* Slave Mode (CMODE=1) */
-        pci_write_32 ((u_int32_t *) &comet->brif_fpcfg, 0x20);  /* Slave Mode i.e.
-                                                                 * FPMODE=1 (@0x20) */
-        if (cxt1e1_log_level >= LOG_SBEBUG12)
-            pr_info(">> %s: clockslave internal clock\n", __func__);
-        pci_write_32 ((u_int32_t *) &comet->tx_time, 0x0d);     /* oscillator timing */
-    }
+		/* Master Mode i.e.FPMODE=0 (@0x20) */
+		pci_write_32((u_int32_t *) &comet->brif_fpcfg, 0x00);
+		if ((moreParams & CFG_CLK_PORT_MASK) == CFG_CLK_PORT_INTERNAL)
+			{
+			if (cxt1e1_log_level >= LOG_SBEBUG12)
+				pr_info(">> %s: clockmaster internal clock\n", __func__);
+			/* internal oscillator */
+			pci_write_32((u_int32_t *) &comet->tx_time, 0x0d);
+		} else {
+			/* external clock source */
+			if (cxt1e1_log_level >= LOG_SBEBUG12)
+				pr_info(">> %s: clockmaster external clock\n", __func__);
+			/* loop timing(external) */
+			pci_write_32((u_int32_t *) &comet->tx_time, 0x09);
+		}
 
-    /* 32: BRIF parity F-bit cfg */
-    /* Totem-pole operation */
-    pci_write_32 ((u_int32_t *) &comet->brif_pfcfg, 0x01);      /* Receive Backplane
-                                                                 * Parity/F-bit */
+	} else  {
+		/* slave */
+		if (isT1mode)
+			/* Slave Mode(CMODE=1, see above) */
+			pci_write_32((u_int32_t *) &comet->brif_cfg, 0x20);
+		else
+			/* Slave Mode (CMODE=1) */
+			pci_write_32((u_int32_t *) &comet->brif_cfg, 0x21);
+		/* Slave Mode i.e. FPMODE=1 (@0x20) */
+		pci_write_32((u_int32_t *) &comet->brif_fpcfg, 0x20);
+	if (cxt1e1_log_level >= LOG_SBEBUG12)
+		pr_info(">> %s: clockslave internal clock\n", __func__);
+	/* oscillator timing */
+	pci_write_32((u_int32_t *) &comet->tx_time, 0x0d);
+	}
+
+	/* 32: BRIF parity F-bit cfg */
+	/* Totem-pole operation */
+	/* Receive Backplane Parity/F-bit */
+	pci_write_32((u_int32_t *) &comet->brif_pfcfg, 0x01);
 
     /* dc: RLPS equalizer V ref */
     /* Configuration */
-    if (isT1mode)
-        pci_write_32 ((u_int32_t *) &comet->rlps_eqvr, 0x2c);   /* RLPS Equalizer
-                                                                 * Voltage  */
-    else
-        pci_write_32 ((u_int32_t *) &comet->rlps_eqvr, 0x34);   /* RLPS Equalizer
-                                                                 * Voltage  */
+	if (isT1mode)
+		/* RLPS Equalizer Voltage  */
+		pci_write_32((u_int32_t *) &comet->rlps_eqvr, 0x2c);
+	else
+		/* RLPS Equalizer Voltage  */
+		pci_write_32((u_int32_t *) &comet->rlps_eqvr, 0x34);
 
     /* Reserved bit set and SQUELCH enabled */
     /* f8: RLPS cfg & status  f9: RLPS ALOS detect/clear threshold */
-    pci_write_32 ((u_int32_t *) &comet->rlps_cfgsts, 0x11);     /* RLPS Configuration
-                                                                 * Status */
-    if (isT1mode)
-        pci_write_32 ((u_int32_t *) &comet->rlps_alos_thresh, 0x55);    /* ? */
-    else
-        pci_write_32 ((u_int32_t *) &comet->rlps_alos_thresh, 0x22);    /* ? */
+	/* RLPS Configuration Status */
+	pci_write_32((u_int32_t *) &comet->rlps_cfgsts, 0x11);
+	if (isT1mode)
+		 /* ? */
+		pci_write_32((u_int32_t *) &comet->rlps_alos_thresh, 0x55);
+	else
+		/* ? */
+		pci_write_32((u_int32_t *) &comet->rlps_alos_thresh, 0x22);
 
 
     /* Set Full Frame mode (NXDSO[1] = 0, NXDSO[0] = 0) */
     /* CMODE=0: Clock slave mode with BTCLK as an input, DE=1: Use rising */
     /* edge of BTCLK for data, FE=1: Use rising edge of BTCLK for frame, */
     /* CMS=0: Use backplane freq, RATE[1:0]=0,0: T1 */
-/***    Transmit side is always an Input, Slave Clock*/
-    /* 40: BTIF cfg  41: BTIF frame pulse cfg */
-    if (isT1mode)
-        pci_write_32 ((u_int32_t *) &comet->btif_cfg, 0x38);    /* BTIF Configuration
-                                                                 * Reg. */
-    else
-        pci_write_32 ((u_int32_t *) &comet->btif_cfg, 0x39);    /* BTIF Configuration
-                                                                 * Reg. */
-
-    pci_write_32 ((u_int32_t *) &comet->btif_fpcfg, 0x01);      /* BTIF Frame Pulse
-                                                                 * Config. */
+    /***    Transmit side is always an Input, Slave Clock*/
+    /* 40: BTIF cfg */
+	/* 41: BTIF frame pulse cfg */
+	if (isT1mode)
+		/* BTIF Configuration  Reg. */
+		pci_write_32((u_int32_t *) &comet->btif_cfg, 0x38);
+	else
+		/* BTIF Configuration  Reg. */
+		pci_write_32((u_int32_t *) &comet->btif_cfg, 0x39);
+	/* BTIF Frame Pulse Config. */
+	pci_write_32((u_int32_t *) &comet->btif_fpcfg, 0x01);
 
     /* 0a: master diag  06: tx timing options */
     /* if set Comet to loop back */
 
     /* Comets set to normal */
-    pci_write_32 ((u_int32_t *) &comet->mdiag, 0x00);
+	pci_write_32((u_int32_t *) &comet->mdiag, 0x00);
 
     /* BTCLK driven by TCLKI internally (crystal driven) and Xmt Elasted  */
     /* Store is enabled. */
 
-    WrtXmtWaveformTbl (ci, comet, TWV_table[tix]);
-    if (isT1mode)
-        WrtRcvEqualizerTbl ((ci_t *) ci, comet, &T1_Equalizer[0]);
-    else
-        WrtRcvEqualizerTbl ((ci_t *) ci, comet, &E1_Equalizer[0]);
-    SetPwrLevel (comet);
+	WrtXmtWaveformTbl(ci, comet, TWV_table[tix]);
+	if (isT1mode)
+		WrtRcvEqualizerTbl((ci_t *) ci, comet, &T1_Equalizer[0]);
+	else
+		WrtRcvEqualizerTbl((ci_t *) ci, comet, &E1_Equalizer[0]);
+	SetPwrLevel(comet);
 }
 
 /*
@@ -382,15 +408,15 @@
 ** Returns:     Nothing
 */
 STATIC void
-WrtXmtWaveform (ci_t * ci, comet_t * comet, u_int32_t sample, u_int32_t unit, u_int8_t data)
+WrtXmtWaveform(ci_t *ci, comet_t *comet, u_int32_t sample, u_int32_t unit, u_int8_t data)
 {
-    u_int8_t    WaveformAddr;
+	u_int8_t    WaveformAddr;
 
-    WaveformAddr = (sample << 3) + (unit & 7);
-    pci_write_32 ((u_int32_t *) &comet->xlpg_pwave_addr, WaveformAddr);
-    pci_flush_write (ci);           /* for write order preservation when
-                                     * Optimizing driver */
-    pci_write_32 ((u_int32_t *) &comet->xlpg_pwave_data, 0x7F & data);
+	WaveformAddr = (sample << 3) + (unit & 7);
+	pci_write_32((u_int32_t *) &comet->xlpg_pwave_addr, WaveformAddr);
+	/* for write order preservation when Optimizing driver */
+	pci_flush_write(ci);
+	pci_write_32((u_int32_t *) &comet->xlpg_pwave_data, 0x7F & data);
 }
 
 /*
@@ -400,19 +426,19 @@
 ** Returns:     Nothing
 */
 STATIC void
-WrtXmtWaveformTbl (ci_t * ci, comet_t * comet,
-                   u_int8_t table[COMET_NUM_SAMPLES][COMET_NUM_UNITS])
+WrtXmtWaveformTbl(ci_t *ci, comet_t *comet,
+		  u_int8_t table[COMET_NUM_SAMPLES][COMET_NUM_UNITS])
 {
-    u_int32_t sample, unit;
+	u_int32_t sample, unit;
 
-    for (sample = 0; sample < COMET_NUM_SAMPLES; sample++)
-    {
-        for (unit = 0; unit < COMET_NUM_UNITS; unit++)
-            WrtXmtWaveform (ci, comet, sample, unit, table[sample][unit]);
-    }
+	for (sample = 0; sample < COMET_NUM_SAMPLES; sample++)
+		{
+		for (unit = 0; unit < COMET_NUM_UNITS; unit++)
+			WrtXmtWaveform(ci, comet, sample, unit, table[sample][unit]);
+		}
 
     /* Enable transmitter and set output amplitude */
-    pci_write_32 ((u_int32_t *) &comet->xlpg_cfg, table[COMET_NUM_SAMPLES][0]);
+	pci_write_32((u_int32_t *) &comet->xlpg_cfg, table[COMET_NUM_SAMPLES][0]);
 }
 
 
@@ -427,60 +453,60 @@
 */
 
 STATIC void
-WrtRcvEqualizerTbl (ci_t * ci, comet_t * comet, u_int32_t *table)
+WrtRcvEqualizerTbl(ci_t *ci, comet_t *comet, u_int32_t *table)
 {
-    u_int32_t   ramaddr;
-    volatile u_int32_t value;
+	u_int32_t   ramaddr;
+	volatile u_int32_t value;
 
-    for (ramaddr = 0; ramaddr < 256; ramaddr++)
-    {
-        /*** the following lines are per Errata 7, 2.5 ***/
-        {
-            pci_write_32 ((u_int32_t *) &comet->rlps_eq_rwsel, 0x80);   /* Set up for a read
-                                                                         * operation */
-            pci_flush_write (ci);   /* for write order preservation when
-                                     * Optimizing driver */
-            pci_write_32 ((u_int32_t *) &comet->rlps_eq_iaddr, (u_int8_t) ramaddr); /* write the addr,
-                                                                                  * initiate a read */
-            pci_flush_write (ci);   /* for write order preservation when
-                                     * Optimizing driver */
-            /*
-             * wait 3 line rate clock cycles to ensure address bits are
-             * captured by T1/E1 clock
-             */
-            OS_uwait (4, "wret");   /* 683ns * 3 = 1366 ns, approx 2us (but
-                                     * use 4us) */
-        }
+	for (ramaddr = 0; ramaddr < 256; ramaddr++) {
+	/*** the following lines are per Errata 7, 2.5 ***/
+		{
+		/* Set up for a read operation */
+		pci_write_32((u_int32_t *) &comet->rlps_eq_rwsel, 0x80);
+		/* for write order preservation when Optimizing driver */
+		pci_flush_write(ci);
+		/* write the addr, initiate a read */
+		pci_write_32((u_int32_t *) &comet->rlps_eq_iaddr, (u_int8_t) ramaddr);
+		/* for write order preservation when Optimizing driver */
+		pci_flush_write(ci);
+		/*
+		* wait 3 line rate clock cycles to ensure address bits are
+		* captured by T1/E1 clock
+		*/
 
-        value = *table++;
-        pci_write_32 ((u_int32_t *) &comet->rlps_idata3, (u_int8_t) (value >> 24));
-        pci_write_32 ((u_int32_t *) &comet->rlps_idata2, (u_int8_t) (value >> 16));
-        pci_write_32 ((u_int32_t *) &comet->rlps_idata1, (u_int8_t) (value >> 8));
-        pci_write_32 ((u_int32_t *) &comet->rlps_idata0, (u_int8_t) value);
-        pci_flush_write (ci);       /* for write order preservation when
-                                     * Optimizing driver */
+		/* 683ns * 3 = 1366 ns, approx 2us (but use 4us) */
+		OS_uwait(4, "wret");
+	}
 
-        /* Storing RAM address, causes RAM to be updated */
+	value = *table++;
+	pci_write_32((u_int32_t *) &comet->rlps_idata3, (u_int8_t) (value >> 24));
+	pci_write_32((u_int32_t *) &comet->rlps_idata2, (u_int8_t) (value >> 16));
+	pci_write_32((u_int32_t *) &comet->rlps_idata1, (u_int8_t) (value >> 8));
+	pci_write_32((u_int32_t *) &comet->rlps_idata0, (u_int8_t) value);
+	 /* for write order preservation when Optimizing driver */
+	pci_flush_write(ci);
 
-        pci_write_32 ((u_int32_t *) &comet->rlps_eq_rwsel, 0);  /* Set up for a write
-                                                                 * operation */
-        pci_flush_write (ci);       /* for write order preservation when
-                                     * Optimizing driver */
-        pci_write_32 ((u_int32_t *) &comet->rlps_eq_iaddr, (u_int8_t) ramaddr); /* write the addr,
-                                                                                 * initiate a read */
-        pci_flush_write (ci);       /* for write order preservation when
-                                     * Optimizing driver */
-        /*
-         * wait 3 line rate clock cycles to ensure address bits are captured
-         * by T1/E1 clock
-         */
-        OS_uwait (4, "wret");       /* 683ns * 3 = 1366 ns, approx 2us (but
-                                     * use 4us) */
-    }
+	/* Storing RAM address, causes RAM to be updated */
 
-    pci_write_32 ((u_int32_t *) &comet->rlps_eq_cfg, 0xCB);     /* Enable Equalizer &
-                                                                 * set it to use 256
-                                                                 * periods */
+		/* Set up for a write operation */
+		pci_write_32((u_int32_t *) &comet->rlps_eq_rwsel, 0);
+		/* for write order preservation when optimizing driver */
+		pci_flush_write(ci);
+		/* write the addr, initiate a write */
+		pci_write_32((u_int32_t *) &comet->rlps_eq_iaddr, (u_int8_t) ramaddr);
+		 /* for write order preservation when optimizing driver */
+		pci_flush_write(ci);
+
+	/*
+	* wait 3 line rate clock cycles to ensure address bits are captured
+	* by T1/E1 clock
+	*/
+		/* 683ns * 3 = 1366 ns, approx 2us (but use 4us) */
+		OS_uwait(4, "wret");
+	}
+
+	/* Enable Equalizer & set it to use 256 periods */
+	pci_write_32((u_int32_t *) &comet->rlps_eq_cfg, 0xCB);
 }
 
 
@@ -491,9 +517,9 @@
 */
 
 STATIC void
-SetPwrLevel (comet_t * comet)
+SetPwrLevel(comet_t *comet)
 {
-    volatile u_int32_t temp;
+	volatile u_int32_t temp;
 
 /*
 **    Algorithm to Balance the Power Distribution of Ttip Tring
@@ -507,22 +533,20 @@
 **    Repeat these steps for register F5
 **    Write 0x01 to register F6
 */
-    pci_write_32 ((u_int32_t *) &comet->xlpg_fdata_sel, 0x00);  /* XLPG Fuse Data Select */
-
-    pci_write_32 ((u_int32_t *) &comet->xlpg_atest_pctl, 0x01); /* XLPG Analog Test
-                                                                 * Positive control */
-    pci_write_32 ((u_int32_t *) &comet->xlpg_atest_pctl, 0x01);
-
-    temp = pci_read_32 ((u_int32_t *) &comet->xlpg_atest_pctl) & 0xfe;
-    pci_write_32 ((u_int32_t *) &comet->xlpg_atest_pctl, temp);
-
-    pci_write_32 ((u_int32_t *) &comet->xlpg_atest_nctl, 0x01); /* XLPG Analog Test
-                                                                 * Negative control */
-    pci_write_32 ((u_int32_t *) &comet->xlpg_atest_nctl, 0x01);
-
-    temp = pci_read_32 ((u_int32_t *) &comet->xlpg_atest_nctl) & 0xfe;
-    pci_write_32 ((u_int32_t *) &comet->xlpg_atest_nctl, temp);
-    pci_write_32 ((u_int32_t *) &comet->xlpg_fdata_sel, 0x01);  /* XLPG */
+	/* XLPG Fuse Data Select */
+	pci_write_32((u_int32_t *) &comet->xlpg_fdata_sel, 0x00);
+	/* XLPG Analog Test Positive control */
+	pci_write_32((u_int32_t *) &comet->xlpg_atest_pctl, 0x01);
+	pci_write_32((u_int32_t *) &comet->xlpg_atest_pctl, 0x01);
+	temp = pci_read_32((u_int32_t *) &comet->xlpg_atest_pctl) & 0xfe;
+	pci_write_32((u_int32_t *) &comet->xlpg_atest_pctl, temp);
+	/* XLPG Analog Test Negative control */
+	pci_write_32((u_int32_t *) &comet->xlpg_atest_nctl, 0x01);
+	pci_write_32((u_int32_t *) &comet->xlpg_atest_nctl, 0x01);
+	temp = pci_read_32((u_int32_t *) &comet->xlpg_atest_nctl) & 0xfe;
+	pci_write_32((u_int32_t *) &comet->xlpg_atest_nctl, temp);
+	/* XLPG */
+	pci_write_32((u_int32_t *) &comet->xlpg_fdata_sel, 0x01);
 }
 
 
@@ -535,33 +559,30 @@
 */
 #if 0
 STATIC void
-SetCometOps (comet_t * comet)
+SetCometOps(comet_t *comet)
 {
-    volatile u_int8_t rd_value;
+	volatile u_int8_t rd_value;
 
-    if (comet == mConfig.C4Func1Base + (COMET0_OFFSET >> 2))
-    {
-        rd_value = (u_int8_t) pci_read_32 ((u_int32_t *) &comet->brif_cfg);     /* read the BRIF
-                                                                                 * Configuration */
-        rd_value &= ~0x20;
-        pci_write_32 ((u_int32_t *) &comet->brif_cfg, (u_int32_t) rd_value);
-
-        rd_value = (u_int8_t) pci_read_32 ((u_int32_t *) &comet->brif_fpcfg);   /* read the BRIF Frame
-                                                                                 * Pulse Configuration */
-        rd_value &= ~0x20;
-        pci_write_32 ((u_int32_t *) &comet->brif_fpcfg, (u_int8_t) rd_value);
-    } else
-    {
-        rd_value = (u_int8_t) pci_read_32 ((u_int32_t *) &comet->brif_cfg);     /* read the BRIF
-                                                                                 * Configuration */
-        rd_value |= 0x20;
-        pci_write_32 ((u_int32_t *) &comet->brif_cfg, (u_int32_t) rd_value);
-
-        rd_value = (u_int8_t) pci_read_32 ((u_int32_t *) &comet->brif_fpcfg);   /* read the BRIF Frame
-                                                                                 * Pulse Configuration */
-        rd_value |= 0x20;
-        pci_write_32 ((u_int32_t *) &comet->brif_fpcfg, (u_int8_t) rd_value);
-    }
+	if (comet == mConfig.C4Func1Base + (COMET0_OFFSET >> 2))
+	{
+		/* read the BRIF Configuration */
+		rd_value = (u_int8_t) pci_read_32((u_int32_t *) &comet->brif_cfg);
+		rd_value &= ~0x20;
+		pci_write_32((u_int32_t *) &comet->brif_cfg, (u_int32_t) rd_value);
+		/* read the BRIF Frame Pulse Configuration */
+		rd_value = (u_int8_t) pci_read_32((u_int32_t *) &comet->brif_fpcfg);
+		rd_value &= ~0x20;
+		pci_write_32((u_int32_t *) &comet->brif_fpcfg, (u_int8_t) rd_value);
+	} else {
+	/* read the BRIF Configuration */
+	rd_value = (u_int8_t) pci_read_32((u_int32_t *) &comet->brif_cfg);
+	rd_value |= 0x20;
+	pci_write_32((u_int32_t *) &comet->brif_cfg, (u_int32_t) rd_value);
+	/* read the BRIF Frame Pulse Configuration */
+	rd_value = (u_int8_t) pci_read_32((u_int32_t *) &comet->brif_fpcfg);
+	rd_value |= 0x20;
+	pci_write_32((u_int32_t *) &comet->brif_fpcfg, (u_int8_t) rd_value);
+	}
 }
 #endif
 

diff --git a/drivers/staging/cxt1e1/functions.c b/drivers/staging/cxt1e1/functions.c
index d9a9aa3..6167dc5 100644
--- a/drivers/staging/cxt1e1/functions.c
+++ b/drivers/staging/cxt1e1/functions.c

@@ -14,7 +14,7 @@
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
 #include <linux/slab.h>
-#include <asm/io.h>
+#include <linux/io.h>
 #include <asm/byteorder.h>
 #include <linux/netdevice.h>
 #include <linux/delay.h>
@@ -97,7 +97,7 @@
 
 
 void
-pci_flush_write (ci_t * ci)
+pci_flush_write (ci_t *ci)
 {
     volatile u_int32_t v;
 
@@ -202,7 +202,7 @@
 {
     struct net_device *ndev = (struct net_device *) user;
 
-    return (netif_carrier_ok (ndev));
+    return netif_carrier_ok (ndev);
 }
 
 void
@@ -246,7 +246,7 @@
 {
     struct net_device *ndev = (struct net_device *) user;
 
-    return (netif_queue_stopped (ndev));
+    return netif_queue_stopped (ndev);
 }
 
 void sd_recv_consume(void *token, size_t len, void *user)
@@ -279,7 +279,7 @@
 
 
 void
-VMETRO_TRIGGER (ci_t * ci, int x)
+VMETRO_TRIGGER (ci_t *ci, int x)
 {
     comet_t    *comet;
     volatile u_int32_t data;

diff --git a/drivers/staging/cxt1e1/hwprobe.c b/drivers/staging/cxt1e1/hwprobe.c
index de8ac0b..110c252 100644
--- a/drivers/staging/cxt1e1/hwprobe.c
+++ b/drivers/staging/cxt1e1/hwprobe.c

@@ -50,7 +50,7 @@
 
 
 void        __init
-show_two (hdw_info_t * hi, int brdno)
+show_two (hdw_info_t *hi, int brdno)
 {
     ci_t       *ci;
     struct pci_dev *pdev;
@@ -102,7 +102,7 @@
 
 
 void        __init
-hdw_sn_get (hdw_info_t * hi, int brdno)
+hdw_sn_get (hdw_info_t *hi, int brdno)
 {
     /* obtain hardware EEPROM information */
     long        addr;
@@ -222,7 +222,7 @@
 
 
 STATIC int  __init
-c4_hdw_init (struct pci_dev * pdev, int found)
+c4_hdw_init (struct pci_dev *pdev, int found)
 {
     hdw_info_t *hi;
     int         i;

diff --git a/drivers/staging/cxt1e1/linux.c b/drivers/staging/cxt1e1/linux.c
index a829b62..e5889ef 100644
--- a/drivers/staging/cxt1e1/linux.c
+++ b/drivers/staging/cxt1e1/linux.c

@@ -144,7 +144,7 @@
 
 
 char       *
-get_hdlc_name (hdlc_device * hdlc)
+get_hdlc_name (hdlc_device *hdlc)
 {
     struct c4_priv *priv = hdlc->priv;
     struct net_device *dev = getuserbychan (priv->channum);
@@ -185,7 +185,7 @@
  * within a port's group.
  */
 void
-c4_wk_chan_restart (mch_t * ch)
+c4_wk_chan_restart (mch_t *ch)
 {
     mpi_t      *pi = ch->up;
 
@@ -203,7 +203,7 @@
 }
 
 status_t
-c4_wk_chan_init (mpi_t * pi, mch_t * ch)
+c4_wk_chan_init (mpi_t *pi, mch_t *ch)
 {
     /*
      * this will be used to restart a stopped channel
@@ -218,7 +218,7 @@
 }
 
 status_t
-c4_wq_port_init (mpi_t * pi)
+c4_wq_port_init (mpi_t *pi)
 {
 
     char        name[16], *np;  /* NOTE: name of the queue limited by system
@@ -241,7 +241,7 @@
 }
 
 void
-c4_wq_port_cleanup (mpi_t * pi)
+c4_wq_port_cleanup (mpi_t *pi)
 {
     /*
      * PORT POINT: cannot call this if WQ is statically allocated w/in
@@ -278,7 +278,7 @@
 
 
 static int
-void_open (struct net_device * ndev)
+void_open (struct net_device *ndev)
 {
     pr_info("%s: trying to open master device !\n", ndev->name);
     return -1;
@@ -286,7 +286,7 @@
 
 
 STATIC int
-chan_open (struct net_device * ndev)
+chan_open (struct net_device *ndev)
 {
     hdlc_device *hdlc = dev_to_hdlc (ndev);
     const struct c4_priv *priv = hdlc->priv;
@@ -306,7 +306,7 @@
 
 
 STATIC int
-chan_close (struct net_device * ndev)
+chan_close (struct net_device *ndev)
 {
     hdlc_device *hdlc = dev_to_hdlc (ndev);
     const struct c4_priv *priv = hdlc->priv;
@@ -320,14 +320,14 @@
 
 
 STATIC int
-chan_dev_ioctl (struct net_device * dev, struct ifreq * ifr, int cmd)
+chan_dev_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
 {
     return hdlc_ioctl (dev, ifr, cmd);
 }
 
 
 STATIC int
-chan_attach_noop (struct net_device * ndev, unsigned short foo_1, unsigned short foo_2)
+chan_attach_noop (struct net_device *ndev, unsigned short foo_1, unsigned short foo_2)
 {
     return 0;                   /* our driver has nothing to do here, show's
                                  * over, go home */
@@ -335,7 +335,7 @@
 
 
 STATIC struct net_device_stats *
-chan_get_stats (struct net_device * ndev)
+chan_get_stats (struct net_device *ndev)
 {
     mch_t      *ch;
     struct net_device_stats *nstats;
@@ -388,14 +388,14 @@
 
 
 static ci_t *
-get_ci_by_dev (struct net_device * ndev)
+get_ci_by_dev (struct net_device *ndev)
 {
     return (ci_t *)(netdev_priv(ndev));
 }
 
 
 STATIC int
-c4_linux_xmit (struct sk_buff * skb, struct net_device * ndev)
+c4_linux_xmit (struct sk_buff *skb, struct net_device *ndev)
 {
     const struct c4_priv *priv;
     int         rval;
@@ -417,8 +417,8 @@
 };
 
 STATIC struct net_device *
-create_chan (struct net_device * ndev, ci_t * ci,
-             struct sbecom_chan_param * cp)
+create_chan (struct net_device *ndev, ci_t *ci,
+             struct sbecom_chan_param *cp)
 {
     hdlc_device *hdlc;
     struct net_device *dev;
@@ -510,7 +510,7 @@
 
 /* the idea here is to get port information and pass it back (using pointer) */
 STATIC      status_t
-do_get_port (struct net_device * ndev, void *data)
+do_get_port (struct net_device *ndev, void *data)
 {
     int         ret;
     ci_t       *ci;             /* ci stands for card information */
@@ -535,7 +535,7 @@
 
 /* this function copys the user data and then calls the real action function */
 STATIC      status_t
-do_set_port (struct net_device * ndev, void *data)
+do_set_port (struct net_device *ndev, void *data)
 {
     ci_t       *ci;             /* ci stands for card information */
     struct sbecom_port_param pp;/* copy data to kernel land */
@@ -557,7 +557,7 @@
 
 /* work the port loopback mode as per directed */
 STATIC      status_t
-do_port_loop (struct net_device * ndev, void *data)
+do_port_loop (struct net_device *ndev, void *data)
 {
     struct sbecom_port_param pp;
     ci_t       *ci;
@@ -572,7 +572,7 @@
 
 /* set the specified register with the given value / or just read it */
 STATIC      status_t
-do_framer_rw (struct net_device * ndev, void *data)
+do_framer_rw (struct net_device *ndev, void *data)
 {
     struct sbecom_port_param pp;
     ci_t       *ci;
@@ -593,7 +593,7 @@
 
 /* set the specified register with the given value / or just read it */
 STATIC      status_t
-do_pld_rw (struct net_device * ndev, void *data)
+do_pld_rw (struct net_device *ndev, void *data)
 {
     struct sbecom_port_param pp;
     ci_t       *ci;
@@ -614,7 +614,7 @@
 
 /* set the specified register with the given value / or just read it */
 STATIC      status_t
-do_musycc_rw (struct net_device * ndev, void *data)
+do_musycc_rw (struct net_device *ndev, void *data)
 {
     struct c4_musycc_param mp;
     ci_t       *ci;
@@ -634,7 +634,7 @@
 }
 
 STATIC      status_t
-do_get_chan (struct net_device * ndev, void *data)
+do_get_chan (struct net_device *ndev, void *data)
 {
     struct sbecom_chan_param cp;
     int         ret;
@@ -652,7 +652,7 @@
 }
 
 STATIC      status_t
-do_set_chan (struct net_device * ndev, void *data)
+do_set_chan (struct net_device *ndev, void *data)
 {
     struct sbecom_chan_param cp;
     int         ret;
@@ -673,7 +673,7 @@
 }
 
 STATIC      status_t
-do_create_chan (struct net_device * ndev, void *data)
+do_create_chan (struct net_device *ndev, void *data)
 {
     ci_t       *ci;
     struct net_device *dev;
@@ -700,7 +700,7 @@
 }
 
 STATIC      status_t
-do_get_chan_stats (struct net_device * ndev, void *data)
+do_get_chan_stats (struct net_device *ndev, void *data)
 {
     struct c4_chan_stats_wrap ccs;
     int         ret;
@@ -721,7 +721,7 @@
     return 0;
 }
 STATIC      status_t
-do_set_loglevel (struct net_device * ndev, void *data)
+do_set_loglevel (struct net_device *ndev, void *data)
 {
     unsigned int cxt1e1_log_level;
 
@@ -732,7 +732,7 @@
 }
 
 STATIC      status_t
-do_deluser (struct net_device * ndev, int lockit)
+do_deluser (struct net_device *ndev, int lockit)
 {
     if (ndev->flags & IFF_UP)
         return -EBUSY;
@@ -763,7 +763,7 @@
 }
 
 int
-do_del_chan (struct net_device * musycc_dev, void *data)
+do_del_chan (struct net_device *musycc_dev, void *data)
 {
     struct sbecom_chan_param cp;
     char        buf[sizeof (CHANNAME) + 3];
@@ -787,7 +787,7 @@
 int         c4_reset_board (void *);
 
 int
-do_reset (struct net_device * musycc_dev, void *data)
+do_reset (struct net_device *musycc_dev, void *data)
 {
     const struct c4_priv *priv;
     int         i;
@@ -816,7 +816,7 @@
 }
 
 int
-do_reset_chan_stats (struct net_device * musycc_dev, void *data)
+do_reset_chan_stats (struct net_device *musycc_dev, void *data)
 {
     struct sbecom_chan_param cp;
 
@@ -827,7 +827,7 @@
 }
 
 STATIC      status_t
-c4_ioctl (struct net_device * ndev, struct ifreq * ifr, int cmd)
+c4_ioctl (struct net_device *ndev, struct ifreq *ifr, int cmd)
 {
     ci_t       *ci;
     void       *data;
@@ -954,7 +954,7 @@
 }
 
 struct net_device *__init
-c4_add_dev (hdw_info_t * hi, int brdno, unsigned long f0, unsigned long f1,
+c4_add_dev (hdw_info_t *hi, int brdno, unsigned long f0, unsigned long f1,
             int irq0, int irq1)
 {
     struct net_device *ndev;

diff --git a/drivers/staging/cxt1e1/musycc.c b/drivers/staging/cxt1e1/musycc.c
index b2cc68a..1037086 100644
--- a/drivers/staging/cxt1e1/musycc.c
+++ b/drivers/staging/cxt1e1/musycc.c

@@ -74,7 +74,7 @@
 
 #if 1
 STATIC int
-musycc_dump_rxbuffer_ring(mch_t * ch, int lockit)
+musycc_dump_rxbuffer_ring(mch_t *ch, int lockit)
 {
     struct mdesc *m;
     unsigned long flags = 0;
@@ -140,7 +140,7 @@
 
 #if 1
 STATIC int
-musycc_dump_txbuffer_ring(mch_t * ch, int lockit)
+musycc_dump_txbuffer_ring(mch_t *ch, int lockit)
 {
     struct mdesc *m;
     unsigned long flags = 0;
@@ -205,7 +205,7 @@
  */
 
 status_t
-musycc_dump_ring(ci_t * ci, unsigned int chan)
+musycc_dump_ring(ci_t *ci, unsigned int chan)
 {
     mch_t      *ch;
 
@@ -248,7 +248,7 @@
 
 
 status_t
-musycc_dump_rings(ci_t * ci, unsigned int start_chan)
+musycc_dump_rings(ci_t *ci, unsigned int start_chan)
 {
     unsigned int chan;
 
@@ -264,7 +264,7 @@
  */
 
 void
-musycc_init_mdt(mpi_t * pi)
+musycc_init_mdt(mpi_t *pi)
 {
     u_int32_t  *addr, cfg;
     int         i;
@@ -288,7 +288,7 @@
 /* Set TX thp to the next unprocessed md */
 
 void
-musycc_update_tx_thp(mch_t * ch)
+musycc_update_tx_thp(mch_t *ch)
 {
     struct mdesc *md;
     unsigned long flags;
@@ -443,7 +443,7 @@
   */
 
 void
-musycc_chan_restart(mch_t * ch)
+musycc_chan_restart(mch_t *ch)
 {
 #ifdef RLD_RESTART_DEBUG
     pr_info("++ musycc_chan_restart[%d]: txd_irq_srv @ %p = sts %x\n",
@@ -461,7 +461,7 @@
 
 
 void
-rld_put_led(mpi_t * pi, u_int32_t ledval)
+rld_put_led(mpi_t *pi, u_int32_t ledval)
 {
     static u_int32_t led = 0;
 
@@ -477,7 +477,7 @@
 #define MUSYCC_SR_RETRY_CNT  9
 
 void
-musycc_serv_req(mpi_t * pi, u_int32_t req)
+musycc_serv_req(mpi_t *pi, u_int32_t req)
 {
     volatile u_int32_t r;
     int         rcnt;
@@ -578,7 +578,7 @@
 
 #ifdef  SBE_PMCC4_ENABLE
 void
-musycc_update_timeslots(mpi_t * pi)
+musycc_update_timeslots(mpi_t *pi)
 {
     int         i, ch;
     char        e1mode = IS_FRAME_ANY_E1(pi->p.port_mode);
@@ -640,7 +640,7 @@
 
 #ifdef SBE_WAN256T3_ENABLE
 void
-musycc_update_timeslots(mpi_t * pi)
+musycc_update_timeslots(mpi_t *pi)
 {
     mch_t      *ch;
 
@@ -703,7 +703,7 @@
 
 #ifdef SBE_WAN256T3_ENABLE
 STATIC void __init
-musycc_init_port(mpi_t * pi)
+musycc_init_port(mpi_t *pi)
 {
     pci_write_32((u_int32_t *) &pi->reg->gbp, OS_vtophys(pi->regram));
 
@@ -737,7 +737,7 @@
 
 
 status_t    __init
-musycc_init(ci_t * ci)
+musycc_init(ci_t *ci)
 {
     char       *regaddr;        /* temp for address boundary calculations */
     int         i, gchan;
@@ -832,7 +832,7 @@
 
 
 void
-musycc_bh_tx_eom(mpi_t * pi, int gchan)
+musycc_bh_tx_eom(mpi_t *pi, int gchan)
 {
     mch_t      *ch;
     struct mdesc *md;
@@ -1010,7 +1010,7 @@
 
 
 STATIC void
-musycc_bh_rx_eom(mpi_t * pi, int gchan)
+musycc_bh_rx_eom(mpi_t *pi, int gchan)
 {
     mch_t      *ch;
     void       *m, *m2;
@@ -1229,7 +1229,7 @@
 #else
 void
 #endif
-musycc_intr_bh_tasklet(ci_t * ci)
+musycc_intr_bh_tasklet(ci_t *ci)
 {
     mpi_t      *pi;
     mch_t      *ch;
@@ -1517,7 +1517,7 @@
 
 #if 0
 int         __init
-musycc_new_chan(ci_t * ci, int channum, void *user)
+musycc_new_chan(ci_t *ci, int channum, void *user)
 {
     mch_t      *ch;
 
@@ -1546,7 +1546,7 @@
 
 #ifdef SBE_PMCC4_ENABLE
 status_t
-musycc_chan_down(ci_t * dummy, int channum)
+musycc_chan_down(ci_t *dummy, int channum)
 {
     mpi_t      *pi;
     mch_t      *ch;
@@ -1597,7 +1597,7 @@
 
 
 int
-musycc_del_chan(ci_t * ci, int channum)
+musycc_del_chan(ci_t *ci, int channum)
 {
     mch_t      *ch;
 
@@ -1613,7 +1613,7 @@
 
 
 int
-musycc_del_chan_stats(ci_t * ci, int channum)
+musycc_del_chan_stats(ci_t *ci, int channum)
 {
     mch_t      *ch;
 
@@ -1628,7 +1628,7 @@
 
 
 int
-musycc_start_xmit(ci_t * ci, int channum, void *mem_token)
+musycc_start_xmit(ci_t *ci, int channum, void *mem_token)
 {
     mch_t      *ch;
     struct mdesc *md;

diff --git a/drivers/staging/cxt1e1/pmcc4.h b/drivers/staging/cxt1e1/pmcc4.h
index b0ed4ad..003eb86 100644
--- a/drivers/staging/cxt1e1/pmcc4.h
+++ b/drivers/staging/cxt1e1/pmcc4.h

@@ -85,15 +85,15 @@
 status_t    c4_chan_up (ci_t *, int channum);
 status_t    c4_del_chan_stats (int channum);
 status_t    c4_del_chan (int channum);
-status_t    c4_get_iidinfo (ci_t * ci, struct sbe_iid_info * iip);
+status_t    c4_get_iidinfo (ci_t *ci, struct sbe_iid_info *iip);
 int         c4_is_chan_up (int channum);
 
 void       *getuserbychan (int channum);
-void        pci_flush_write (ci_t * ci);
+void        pci_flush_write (ci_t *ci);
 void        sbecom_set_loglevel (int debuglevel);
-char       *sbeid_get_bdname (ci_t * ci);
-void        sbeid_set_bdtype (ci_t * ci);
-void        sbeid_set_hdwbid (ci_t * ci);
+char       *sbeid_get_bdname (ci_t *ci);
+void        sbeid_set_bdtype (ci_t *ci);
+void        sbeid_set_hdwbid (ci_t *ci);
 u_int32_t   sbeCrc (u_int8_t *, u_int32_t, u_int32_t, u_int32_t *);
 
 void        VMETRO_TRACE (void *);       /* put data into 8 LEDs */

diff --git a/drivers/staging/cxt1e1/pmcc4_drv.c b/drivers/staging/cxt1e1/pmcc4_drv.c
index 8d8a22b..32d7a21 100644
--- a/drivers/staging/cxt1e1/pmcc4_drv.c
+++ b/drivers/staging/cxt1e1/pmcc4_drv.c

@@ -28,7 +28,7 @@
 #include <linux/sched.h>        /* include for timer */
 #include <linux/timer.h>        /* include for timer */
 #include <linux/hdlc.h>
-#include <asm/io.h>
+#include <linux/io.h>
 
 #include "sbecom_inline_linux.h"
 #include "libsbew.h"
@@ -123,7 +123,7 @@
                 {
                     if ((ch->state != UNASSIGNED) &&
                         (ch->channum == channum))
-                        return (ch);
+                        return ch;
                 }
             }
     return 0;
@@ -193,7 +193,7 @@
 #define COMET_LBCMD_READ  0x80  /* read only (do not set, return read value) */
 
 void
-checkPorts (ci_t * ci)
+checkPorts (ci_t *ci)
 {
 #ifndef CONFIG_SBE_PMCC4_NCOMM
     /*
@@ -459,7 +459,7 @@
 
 
 STATIC void
-c4_watchdog (ci_t * ci)
+c4_watchdog (ci_t *ci)
 {
     if (drvr_state != SBE_DRVR_AVAILABLE)
     {
@@ -512,7 +512,7 @@
  */
 
 int
-c4_get_portcfg (ci_t * ci)
+c4_get_portcfg (ci_t *ci)
 {
     comet_t    *comet;
     int         portnum, mask;
@@ -536,7 +536,7 @@
 /* nothing herein should generate interrupts */
 
 status_t    __init
-c4_init (ci_t * ci, u_char *func0, u_char *func1)
+c4_init (ci_t *ci, u_char *func0, u_char *func1)
 {
     mpi_t      *pi;
     mch_t      *ch;
@@ -670,7 +670,7 @@
 /* better be fully setup to handle interrupts when you call this */
 
 status_t    __init
-c4_init2 (ci_t * ci)
+c4_init2 (ci_t *ci)
 {
     status_t    ret;
 
@@ -698,7 +698,7 @@
 /* This function sets the loopback mode (or clears it, as the case may be). */
 
 int
-c4_loop_port (ci_t * ci, int portnum, u_int8_t cmd)
+c4_loop_port (ci_t *ci, int portnum, u_int8_t cmd)
 {
     comet_t    *comet;
     volatile u_int32_t loopValue;
@@ -757,7 +757,7 @@
  */
 
 status_t
-c4_frame_rw (ci_t * ci, struct sbecom_port_param * pp)
+c4_frame_rw (ci_t *ci, struct sbecom_port_param *pp)
 {
     comet_t    *comet;
     volatile u_int32_t data;
@@ -796,7 +796,7 @@
  */
 
 status_t
-c4_pld_rw (ci_t * ci, struct sbecom_port_param * pp)
+c4_pld_rw (ci_t *ci, struct sbecom_port_param *pp)
 {
     volatile u_int32_t *regaddr;
     volatile u_int32_t data;
@@ -834,7 +834,7 @@
  */
 
 status_t
-c4_musycc_rw (ci_t * ci, struct c4_musycc_param * mcp)
+c4_musycc_rw (ci_t *ci, struct c4_musycc_param *mcp)
 {
     mpi_t      *pi;
     volatile u_int32_t *dph;    /* hardware implemented register */
@@ -898,7 +898,7 @@
 }
 
 status_t
-c4_get_port (ci_t * ci, int portnum)
+c4_get_port (ci_t *ci, int portnum)
 {
     if (portnum >= ci->max_port)    /* sanity check */
         return ENXIO;
@@ -913,7 +913,7 @@
 }
 
 status_t
-c4_set_port (ci_t * ci, int portnum)
+c4_set_port (ci_t *ci, int portnum)
 {
     mpi_t      *pi;
     struct sbecom_port_param *pp;
@@ -942,7 +942,7 @@
 
         if ((ret = c4_wq_port_init (pi)))       /* create/init
                                                  * workqueue_struct */
-            return (ret);
+            return ret;
     }
 
     init_comet (ci, pi->cometbase, pp->port_mode, 1 /* clockmaster == true */ , pp->portP);
@@ -1018,7 +1018,7 @@
 unsigned int max_int = 0;
 
 status_t
-c4_new_chan (ci_t * ci, int portnum, int channum, void *user)
+c4_new_chan (ci_t *ci, int portnum, int channum, void *user)
 {
     mpi_t      *pi;
     mch_t      *ch;
@@ -1111,7 +1111,7 @@
 
 
 status_t
-c4_set_chan (int channum, struct sbecom_chan_param * p)
+c4_set_chan (int channum, struct sbecom_chan_param *p)
 {
     mch_t      *ch;
     int         i, x = 0;
@@ -1162,7 +1162,7 @@
 
 
 status_t
-c4_get_chan (int channum, struct sbecom_chan_param * p)
+c4_get_chan (int channum, struct sbecom_chan_param *p)
 {
     mch_t      *ch;
 
@@ -1173,7 +1173,7 @@
 }
 
 status_t
-c4_get_chan_stats (int channum, struct sbecom_chan_stats * p)
+c4_get_chan_stats (int channum, struct sbecom_chan_stats *p)
 {
     mch_t      *ch;
 
@@ -1185,7 +1185,7 @@
 }
 
 STATIC int
-c4_fifo_alloc (mpi_t * pi, int chan, int *len)
+c4_fifo_alloc (mpi_t *pi, int chan, int *len)
 {
     int         i, l = 0, start = 0, max = 0, maxstart = 0;
 
@@ -1222,7 +1222,7 @@
 }
 
 void
-c4_fifo_free (mpi_t * pi, int chan)
+c4_fifo_free (mpi_t *pi, int chan)
 {
     int         i;
 
@@ -1236,7 +1236,7 @@
 
 
 status_t
-c4_chan_up (ci_t * ci, int channum)
+c4_chan_up (ci_t *ci, int channum)
 {
     mpi_t      *pi;
     mch_t      *ch;
@@ -1467,7 +1467,7 @@
 /* stop the hardware from servicing & interrupting */
 
 void
-c4_stopwd (ci_t * ci)
+c4_stopwd (ci_t *ci)
 {
     OS_stop_watchdog (&ci->wd);
     SD_SEM_TAKE (&ci->sem_wdbusy, "_stop_");    /* ensure WD not running */
@@ -1476,7 +1476,7 @@
 
 
 void
-sbecom_get_brdinfo (ci_t * ci, struct sbe_brd_info * bip, u_int8_t *bsn)
+sbecom_get_brdinfo (ci_t *ci, struct sbe_brd_info *bip, u_int8_t *bsn)
 {
     char       *np;
     u_int32_t   sn = 0;
@@ -1485,7 +1485,7 @@
     bip->brdno = ci->brdno;         /* our board number */
     bip->brd_id = ci->brd_id;
     bip->brd_hdw_id = ci->hdw_bid;
-    bip->brd_chan_cnt = MUSYCC_NCHANS * ci->max_port;   /* number of channels
+    bip->brd_chan_cnt = MUSYCC_NCHANS * ci->max_port;   /* number of channels
                                                          * being used */
     bip->brd_port_cnt = ci->max_port;   /* number of ports being used */
     bip->brd_pci_speed = BINFO_PCI_SPEED_unk;   /* PCI speed not yet
@@ -1535,7 +1535,7 @@
 
 
 status_t
-c4_get_iidinfo (ci_t * ci, struct sbe_iid_info * iip)
+c4_get_iidinfo (ci_t *ci, struct sbe_iid_info *iip)
 {
     struct net_device *dev;
     char       *np;
@@ -1624,7 +1624,7 @@
         }
         ci = ci->next;              /* next board, if any */
     }
-    return (base);
+    return base;
 }
 
 #endif                          /*** CONFIG_SBE_PMCC4_NCOMM ***/

diff --git a/drivers/staging/cxt1e1/sbecom_inline_linux.h b/drivers/staging/cxt1e1/sbecom_inline_linux.h
index 68ed445..3c6d1c0 100644
--- a/drivers/staging/cxt1e1/sbecom_inline_linux.h
+++ b/drivers/staging/cxt1e1/sbecom_inline_linux.h

@@ -177,7 +177,7 @@
 
 
 static inline int
-OS_start_watchdog (struct watchdog * wd)
+OS_start_watchdog (struct watchdog *wd)
 {
     wd->h.expires = jiffies + wd->ticks;
     add_timer (&wd->h);
@@ -186,7 +186,7 @@
 
 
 static inline int
-OS_stop_watchdog (struct watchdog * wd)
+OS_stop_watchdog (struct watchdog *wd)
 {
     del_timer_sync (&wd->h);
     return 0;
@@ -194,7 +194,7 @@
 
 
 static inline int
-OS_free_watchdog (struct watchdog * wd)
+OS_free_watchdog (struct watchdog *wd)
 {
     OS_stop_watchdog (wd);
     OS_kfree (wd);

diff --git a/drivers/staging/cxt1e1/sbeid.c b/drivers/staging/cxt1e1/sbeid.c
index a2243b1..0f9bd5f 100644
--- a/drivers/staging/cxt1e1/sbeid.c
+++ b/drivers/staging/cxt1e1/sbeid.c

@@ -27,7 +27,7 @@
 
 
 char       *
-sbeid_get_bdname (ci_t * ci)
+sbeid_get_bdname (ci_t *ci)
 {
     char       *np = 0;
 
@@ -73,7 +73,7 @@
 /* given the presetting of brd_id, set the corresponding hdw_id */
 
 void
-sbeid_set_hdwbid (ci_t * ci)
+sbeid_set_hdwbid (ci_t *ci)
 {
     /*
      * set SBE's unique hardware identification (for legacy boards might not
@@ -170,7 +170,7 @@
 /* given the presetting of hdw_bid, set the corresponding brd_id */
 
 void
-sbeid_set_bdtype (ci_t * ci)
+sbeid_set_bdtype (ci_t *ci)
 {
     /* set SBE's unique PCI VENDOR/DEVID */
     switch (ci->hdw_bid)

diff --git a/drivers/staging/cxt1e1/sbeproc.h b/drivers/staging/cxt1e1/sbeproc.h
index e5c072c..37285df 100644
--- a/drivers/staging/cxt1e1/sbeproc.h
+++ b/drivers/staging/cxt1e1/sbeproc.h

@@ -28,11 +28,11 @@
 
 #else
 
-static inline void sbecom_proc_brd_cleanup(ci_t * ci)
+static inline void sbecom_proc_brd_cleanup(ci_t *ci)
 {
 }
 
-static inline int __init sbecom_proc_brd_init(ci_t * ci)
+static inline int __init sbecom_proc_brd_init(ci_t *ci)
 {
 	return 0;
 }

diff --git a/drivers/staging/dgrp/dgrp_dpa_ops.c b/drivers/staging/dgrp/dgrp_dpa_ops.c
index 114799c..69bfe30 100644
--- a/drivers/staging/dgrp/dgrp_dpa_ops.c
+++ b/drivers/staging/dgrp/dgrp_dpa_ops.c

@@ -392,7 +392,7 @@
 		getnode.nd_rx_byte = nd->nd_rx_byte;
 
 		memset(&getnode.nd_ps_desc, 0, MAX_DESC_LEN);
-		strncpy(getnode.nd_ps_desc, nd->nd_ps_desc, MAX_DESC_LEN);
+		strlcpy(getnode.nd_ps_desc, nd->nd_ps_desc, MAX_DESC_LEN);
 
 		if (copy_to_user(uarg, &getnode, sizeof(struct digi_node)))
 			return -EFAULT;

diff --git a/drivers/staging/dgrp/dgrp_net_ops.c b/drivers/staging/dgrp/dgrp_net_ops.c
index 5b7833f..33ac7fb 100644
--- a/drivers/staging/dgrp/dgrp_net_ops.c
+++ b/drivers/staging/dgrp/dgrp_net_ops.c

@@ -278,7 +278,7 @@
 		switch (ch->ch_pscan_state) {
 		default:
 			/* reset to sanity and fall through */
-			ch->ch_pscan_state = 0 ;
+			ch->ch_pscan_state = 0;
 
 		case 0:
 			/* No FF seen yet */
@@ -1607,7 +1607,7 @@
 					if ((ch->ch_pun.un_flag & UN_LOW) != 0 ?
 					    (n <= TBUF_LOW) :
 					    (ch->ch_pun.un_flag & UN_TIME) != 0 ?
-					    ((jiffies - ch->ch_waketime) >= 0) :
+					    time_is_before_jiffies(ch->ch_waketime) :
 					    (n == 0 && ch->ch_s_tpos == ch->ch_s_tin) &&
 					    ((ch->ch_pun.un_flag & UN_EMPTY) != 0 ||
 					    ((ch->ch_tun.un_open_count &&
@@ -3083,7 +3083,7 @@
 						nd->nd_hw_ver = (b[8] << 8) | b[9];
 						nd->nd_sw_ver = (b[10] << 8) | b[11];
 						nd->nd_hw_id = b[6];
-						desclen = ((plen - 12) > MAX_DESC_LEN) ? MAX_DESC_LEN :
+						desclen = (plen - 12 > MAX_DESC_LEN - 1) ? MAX_DESC_LEN - 1 :
 							plen - 12;
 
 						if (desclen <= 0) {

diff --git a/drivers/staging/dgrp/drp.h b/drivers/staging/dgrp/drp.h
index 84a1e7b..4024b48 100644
--- a/drivers/staging/dgrp/drp.h
+++ b/drivers/staging/dgrp/drp.h

@@ -674,7 +674,7 @@
 	ushort	     nd_hw_ver;		  /* HW version returned from PS   */
 	ushort	     nd_sw_ver;		  /* SW version returned from PS   */
 	uint	     nd_hw_id;		  /* HW ID returned from PS	   */
-	u8	  nd_ps_desc[MAX_DESC_LEN+1];  /* Description from PS	*/
+	u8	  nd_ps_desc[MAX_DESC_LEN];  /* Description from PS	*/
 	uint	     nd_vpd_len;		/* VPD len, if any */
 	u8	     nd_vpd[VPDSIZE];		/* VPD, if any */
 

diff --git a/drivers/staging/dwc2/core.c b/drivers/staging/dwc2/core.c
index 3177db2..e3a0e77 100644
--- a/drivers/staging/dwc2/core.c
+++ b/drivers/staging/dwc2/core.c

@@ -506,8 +506,7 @@
 	struct dwc2_core_params *params = hsotg->core_params;
 	u32 rxfsiz, nptxfsiz, ptxfsiz, hptxfsiz, dfifocfg;
 
-	if (!(hsotg->hwcfg2 & GHWCFG2_DYNAMIC_FIFO) ||
-	    !params->enable_dynamic_fifo)
+	if (!params->enable_dynamic_fifo)
 		return;
 
 	dev_dbg(hsotg->dev, "Total FIFO Size=%d\n", hsotg->total_fifo_size);
@@ -1146,16 +1145,10 @@
 static void dwc2_hc_set_even_odd_frame(struct dwc2_hsotg *hsotg,
 				       struct dwc2_host_chan *chan, u32 *hcchar)
 {
-	u32 hfnum, frnum;
-
 	if (chan->ep_type == USB_ENDPOINT_XFER_INT ||
 	    chan->ep_type == USB_ENDPOINT_XFER_ISOC) {
-		hfnum = readl(hsotg->regs + HFNUM);
-		frnum = hfnum >> HFNUM_FRNUM_SHIFT &
-			HFNUM_FRNUM_MASK >> HFNUM_FRNUM_SHIFT;
-
 		/* 1 if _next_ frame is odd, 0 if it's even */
-		if (frnum & 0x1)
+		if (!(dwc2_hcd_get_frame_number(hsotg) & 0x1))
 			*hcchar |= HCCHAR_ODDFRM;
 	}
 }
@@ -1696,7 +1689,7 @@
 	    GHWCFG2_FS_PHY_TYPE_DEDICATED)
 		clock = 48;
 
-	if ((hprt0 & HPRT0_SPD_MASK) == 0)
+	if ((hprt0 & HPRT0_SPD_MASK) == HPRT0_SPD_HIGH_SPEED)
 		/* High speed case */
 		return 125 * clock;
 	else
@@ -1815,8 +1808,6 @@
 {
 #ifdef DEBUG
 	u32 __iomem *addr;
-	int i, ep_num;
-	char *txfsiz;
 
 	dev_dbg(hsotg->dev, "Core Global Registers\n");
 	addr = hsotg->regs + GOTGCTL;
@@ -1892,23 +1883,6 @@
 	dev_dbg(hsotg->dev, "HPTXFSIZ	 @0x%08lX : 0x%08X\n",
 		(unsigned long)addr, readl(addr));
 
-	if (hsotg->core_params->en_multiple_tx_fifo <= 0) {
-		ep_num = hsotg->hwcfg4 >> GHWCFG4_NUM_DEV_PERIO_IN_EP_SHIFT &
-			 GHWCFG4_NUM_DEV_PERIO_IN_EP_MASK >>
-					 GHWCFG4_NUM_DEV_PERIO_IN_EP_SHIFT;
-		txfsiz = "DPTXFSIZ";
-	} else {
-		ep_num = hsotg->hwcfg4 >> GHWCFG4_NUM_IN_EPS_SHIFT &
-			 GHWCFG4_NUM_IN_EPS_MASK >> GHWCFG4_NUM_IN_EPS_SHIFT;
-		txfsiz = "DIENPTXF";
-	}
-
-	for (i = 0; i < ep_num; i++) {
-		addr = hsotg->regs + DPTXFSIZN(i + 1);
-		dev_dbg(hsotg->dev, "%s[%d] @0x%08lX : 0x%08X\n", txfsiz, i + 1,
-			(unsigned long)addr, readl(addr));
-	}
-
 	addr = hsotg->regs + PCGCTL;
 	dev_dbg(hsotg->dev, "PCGCTL	 @0x%08lX : 0x%08X\n",
 		(unsigned long)addr, readl(addr));
@@ -2298,7 +2272,7 @@
 #ifndef NO_FS_PHY_HW_CHECKS
 		valid = 0;
 #else
-		val = 0;
+		val = DWC2_PHY_TYPE_PARAM_FS;
 		dev_dbg(hsotg->dev, "Setting phy_type to %d\n", val);
 		retval = -EINVAL;
 #endif
@@ -2325,7 +2299,7 @@
 			dev_err(hsotg->dev,
 				"%d invalid for phy_type. Check HW configuration.\n",
 				val);
-		val = 0;
+		val = DWC2_PHY_TYPE_PARAM_FS;
 		if (hs_phy_type != GHWCFG2_HS_PHY_TYPE_NOT_SUPPORTED) {
 			if (hs_phy_type == GHWCFG2_HS_PHY_TYPE_UTMI ||
 			    hs_phy_type == GHWCFG2_HS_PHY_TYPE_UTMI_ULPI)
@@ -2360,8 +2334,8 @@
 		valid = 0;
 	}
 
-	if (val == 0 && dwc2_get_param_phy_type(hsotg) ==
-					DWC2_PHY_TYPE_PARAM_FS)
+	if (val == DWC2_SPEED_PARAM_HIGH &&
+	    dwc2_get_param_phy_type(hsotg) == DWC2_PHY_TYPE_PARAM_FS)
 		valid = 0;
 
 	if (!valid) {
@@ -2370,7 +2344,7 @@
 				"%d invalid for speed parameter. Check HW configuration.\n",
 				val);
 		val = dwc2_get_param_phy_type(hsotg) == DWC2_PHY_TYPE_PARAM_FS ?
-				1 : 0;
+				DWC2_SPEED_PARAM_FULL : DWC2_SPEED_PARAM_HIGH;
 		dev_dbg(hsotg->dev, "Setting speed to %d\n", val);
 		retval = -EINVAL;
 	}
@@ -2668,7 +2642,7 @@
  * for the DWC_otg core. It returns non-0 if any parameters are invalid.
  */
 int dwc2_set_parameters(struct dwc2_hsotg *hsotg,
-			struct dwc2_core_params *params)
+			const struct dwc2_core_params *params)
 {
 	int retval = 0;
 

diff --git a/drivers/staging/dwc2/core_intr.c b/drivers/staging/dwc2/core_intr.c
index 4c9ad14..98c51bb 100644
--- a/drivers/staging/dwc2/core_intr.c
+++ b/drivers/staging/dwc2/core_intr.c

@@ -403,8 +403,7 @@
 #define GINTMSK_COMMON	(GINTSTS_WKUPINT | GINTSTS_SESSREQINT |		\
 			 GINTSTS_CONIDSTSCHNG | GINTSTS_OTGINT |	\
 			 GINTSTS_MODEMIS | GINTSTS_DISCONNINT |		\
-			 GINTSTS_USBSUSP | GINTSTS_RESTOREDONE |	\
-			 GINTSTS_PRTINT)
+			 GINTSTS_USBSUSP | GINTSTS_PRTINT)
 
 /*
  * This function returns the Core Interrupt register
@@ -450,7 +449,7 @@
 {
 	struct dwc2_hsotg *hsotg = dev;
 	u32 gintsts;
-	int retval = 0;
+	irqreturn_t retval = IRQ_NONE;
 
 	if (dwc2_check_core_status(hsotg) < 0) {
 		dev_warn(hsotg->dev, "Controller is disconnected\n");
@@ -461,7 +460,7 @@
 
 	gintsts = dwc2_read_common_intr(hsotg);
 	if (gintsts & ~GINTSTS_PRTINT)
-		retval = 1;
+		retval = IRQ_HANDLED;
 
 	if (gintsts & GINTSTS_MODEMIS)
 		dwc2_handle_mode_mismatch_intr(hsotg);
@@ -478,12 +477,6 @@
 	if (gintsts & GINTSTS_USBSUSP)
 		dwc2_handle_usb_suspend_intr(hsotg);
 
-	if (gintsts & GINTSTS_RESTOREDONE) {
-		gintsts = GINTSTS_RESTOREDONE;
-		writel(gintsts, hsotg->regs + GINTSTS);
-		dev_dbg(hsotg->dev, " --Restore done interrupt received--\n");
-	}
-
 	if (gintsts & GINTSTS_PRTINT) {
 		/*
 		 * The port interrupt occurs while in device mode with HPRT0
@@ -500,6 +493,6 @@
 
 	spin_unlock(&hsotg->lock);
 out:
-	return IRQ_RETVAL(retval);
+	return retval;
 }
 EXPORT_SYMBOL_GPL(dwc2_handle_common_intr);

diff --git a/drivers/staging/dwc2/hcd.c b/drivers/staging/dwc2/hcd.c
index 8551cce..2ed54b1 100644
--- a/drivers/staging/dwc2/hcd.c
+++ b/drivers/staging/dwc2/hcd.c

@@ -1563,9 +1563,9 @@
 		break;
 
 	case GetPortStatus:
-		dev_dbg(hsotg->dev,
-			"GetPortStatus wIndex=0x%04x flags=0x%08x\n", windex,
-			hsotg->flags.d32);
+		dev_vdbg(hsotg->dev,
+			 "GetPortStatus wIndex=0x%04x flags=0x%08x\n", windex,
+			 hsotg->flags.d32);
 		if (!windex || windex > 1)
 			goto error;
 
@@ -1598,7 +1598,7 @@
 		}
 
 		hprt0 = readl(hsotg->regs + HPRT0);
-		dev_dbg(hsotg->dev, "  HPRT0: 0x%08x\n", hprt0);
+		dev_vdbg(hsotg->dev, "  HPRT0: 0x%08x\n", hprt0);
 
 		if (hprt0 & HPRT0_CONNSTS)
 			port_status |= USB_PORT_STAT_CONNECTION;
@@ -1623,7 +1623,7 @@
 			port_status |= USB_PORT_STAT_TEST;
 		/* USB_PORT_FEAT_INDICATOR unsupported always 0 */
 
-		dev_dbg(hsotg->dev, "port_status=%08x\n", port_status);
+		dev_vdbg(hsotg->dev, "port_status=%08x\n", port_status);
 		*(__le32 *)buf = cpu_to_le32(port_status);
 		break;
 
@@ -2533,9 +2533,8 @@
 static irqreturn_t _dwc2_hcd_irq(struct usb_hcd *hcd)
 {
 	struct dwc2_hsotg *hsotg = dwc2_hcd_to_hsotg(hcd);
-	int retval = dwc2_hcd_intr(hsotg);
 
-	return IRQ_RETVAL(retval);
+	return dwc2_handle_hcd_intr(hsotg);
 }
 
 /*
@@ -2702,7 +2701,7 @@
  * a negative error on failure.
  */
 int dwc2_hcd_init(struct dwc2_hsotg *hsotg, int irq,
-		  struct dwc2_core_params *params)
+		  const struct dwc2_core_params *params)
 {
 	struct usb_hcd *hcd;
 	struct dwc2_host_chan *channel;
@@ -2919,7 +2918,7 @@
 	 * allocates the DMA buffer pool, registers the USB bus, requests the
 	 * IRQ line, and calls hcd_start method.
 	 */
-	retval = usb_add_hcd(hcd, irq, IRQF_SHARED | IRQF_DISABLED);
+	retval = usb_add_hcd(hcd, irq, IRQF_SHARED);
 	if (retval < 0)
 		goto error3;
 

diff --git a/drivers/staging/dwc2/hcd.h b/drivers/staging/dwc2/hcd.h
index d071f1a..cf6c055 100644
--- a/drivers/staging/dwc2/hcd.h
+++ b/drivers/staging/dwc2/hcd.h

@@ -448,10 +448,10 @@
 }
 
 extern int dwc2_hcd_init(struct dwc2_hsotg *hsotg, int irq,
-			 struct dwc2_core_params *params);
+			 const struct dwc2_core_params *params);
 extern void dwc2_hcd_remove(struct dwc2_hsotg *hsotg);
 extern int dwc2_set_parameters(struct dwc2_hsotg *hsotg,
-			       struct dwc2_core_params *params);
+			       const struct dwc2_core_params *params);
 extern void dwc2_set_all_params(struct dwc2_core_params *params, int value);
 
 /* Transaction Execution Functions */
@@ -646,14 +646,14 @@
 /* HCD Core API */
 
 /**
- * dwc2_hcd_intr() - Called on every hardware interrupt
+ * dwc2_handle_hcd_intr() - Called on every hardware interrupt
  *
  * @hsotg: The DWC2 HCD
  *
- * Returns non zero if interrupt is handled
- * Return 0 if interrupt is not handled
+ * Returns IRQ_HANDLED if interrupt is handled
+ * Return IRQ_NONE if interrupt is not handled
  */
-extern int dwc2_hcd_intr(struct dwc2_hsotg *hsotg);
+extern irqreturn_t dwc2_handle_hcd_intr(struct dwc2_hsotg *hsotg);
 
 /**
  * dwc2_hcd_stop() - Halts the DWC_otg host mode operation

diff --git a/drivers/staging/dwc2/hcd_intr.c b/drivers/staging/dwc2/hcd_intr.c
index e24062f..e75dccb 100644
--- a/drivers/staging/dwc2/hcd_intr.c
+++ b/drivers/staging/dwc2/hcd_intr.c

@@ -115,16 +115,13 @@
 {
 	struct list_head *qh_entry;
 	struct dwc2_qh *qh;
-	u32 hfnum;
 	enum dwc2_transaction_type tr_type;
 
 #ifdef DEBUG_SOF
 	dev_vdbg(hsotg->dev, "--Start of Frame Interrupt--\n");
 #endif
 
-	hfnum = readl(hsotg->regs + HFNUM);
-	hsotg->frame_number = hfnum >> HFNUM_FRNUM_SHIFT &
-			    HFNUM_FRNUM_MASK >> HFNUM_FRNUM_SHIFT;
+	hsotg->frame_number = dwc2_hcd_get_frame_number(hsotg);
 
 	dwc2_track_missed_sofs(hsotg);
 
@@ -244,6 +241,7 @@
 	u32 usbcfg;
 	u32 prtspd;
 	u32 hcfg;
+	u32 fslspclksel;
 	u32 hfir;
 
 	dev_vdbg(hsotg->dev, "%s(%p)\n", __func__, hsotg);
@@ -275,6 +273,7 @@
 		}
 
 		hcfg = readl(hsotg->regs + HCFG);
+		fslspclksel = hcfg & HCFG_FSLSPCLKSEL_MASK;
 
 		if (prtspd == HPRT0_SPD_LOW_SPEED &&
 		    params->host_ls_low_power_phy_clk ==
@@ -282,8 +281,7 @@
 			/* 6 MHZ */
 			dev_vdbg(hsotg->dev,
 				 "FS_PHY programming HCFG to 6 MHz\n");
-			if ((hcfg & HCFG_FSLSPCLKSEL_MASK) !=
-			    HCFG_FSLSPCLKSEL_6_MHZ) {
+			if (fslspclksel != HCFG_FSLSPCLKSEL_6_MHZ) {
 				hcfg &= ~HCFG_FSLSPCLKSEL_MASK;
 				hcfg |= HCFG_FSLSPCLKSEL_6_MHZ;
 				writel(hcfg, hsotg->regs + HCFG);
@@ -293,8 +291,7 @@
 			/* 48 MHZ */
 			dev_vdbg(hsotg->dev,
 				 "FS_PHY programming HCFG to 48 MHz\n");
-			if ((hcfg & HCFG_FSLSPCLKSEL_MASK) !=
-			    HCFG_FSLSPCLKSEL_48_MHZ) {
+			if (fslspclksel != HCFG_FSLSPCLKSEL_48_MHZ) {
 				hcfg &= ~HCFG_FSLSPCLKSEL_MASK;
 				hcfg |= HCFG_FSLSPCLKSEL_48_MHZ;
 				writel(hcfg, hsotg->regs + HCFG);
@@ -2060,14 +2057,14 @@
 }
 
 /* This function handles interrupts for the HCD */
-int dwc2_hcd_intr(struct dwc2_hsotg *hsotg)
+irqreturn_t dwc2_handle_hcd_intr(struct dwc2_hsotg *hsotg)
 {
 	u32 gintsts, dbg_gintsts;
-	int retval = 0;
+	irqreturn_t retval = IRQ_NONE;
 
 	if (dwc2_check_core_status(hsotg) < 0) {
 		dev_warn(hsotg->dev, "Controller is disconnected\n");
-		return 0;
+		return retval;
 	}
 
 	spin_lock(&hsotg->lock);
@@ -2077,10 +2074,10 @@
 		gintsts = dwc2_read_core_intr(hsotg);
 		if (!gintsts) {
 			spin_unlock(&hsotg->lock);
-			return 0;
+			return retval;
 		}
 
-		retval = 1;
+		retval = IRQ_HANDLED;
 
 		dbg_gintsts = gintsts;
 #ifndef DEBUG_SOF
@@ -2102,9 +2099,6 @@
 			dwc2_rx_fifo_level_intr(hsotg);
 		if (gintsts & GINTSTS_NPTXFEMP)
 			dwc2_np_tx_fifo_empty_intr(hsotg);
-		if (gintsts & GINTSTS_I2CINT)
-			/* Todo: Implement i2cintr handler */
-			writel(GINTSTS_I2CINT, hsotg->regs + GINTSTS);
 		if (gintsts & GINTSTS_PRTINT)
 			dwc2_port_intr(hsotg);
 		if (gintsts & GINTSTS_HCHINT)

diff --git a/drivers/staging/dwc2/pci.c b/drivers/staging/dwc2/pci.c
index 69c65eb..3ca54d6 100644
--- a/drivers/staging/dwc2/pci.c
+++ b/drivers/staging/dwc2/pci.c

@@ -59,7 +59,7 @@
 
 static const char dwc2_driver_name[] = "dwc2";
 
-static struct dwc2_core_params dwc2_module_params = {
+static const struct dwc2_core_params dwc2_module_params = {
 	.otg_cap			= -1,
 	.otg_ver			= -1,
 	.dma_enable			= -1,
@@ -101,8 +101,6 @@
 {
 	struct dwc2_hsotg *hsotg = pci_get_drvdata(dev);
 
-	dev_dbg(&dev->dev, "%s(%p)\n", __func__, dev);
-
 	dwc2_hcd_remove(hsotg);
 	pci_disable_device(dev);
 }
@@ -125,18 +123,14 @@
 	struct dwc2_hsotg *hsotg;
 	int retval;
 
-	dev_dbg(&dev->dev, "%s(%p)\n", __func__, dev);
-
 	hsotg = devm_kzalloc(&dev->dev, sizeof(*hsotg), GFP_KERNEL);
 	if (!hsotg)
 		return -ENOMEM;
 
-	pci_set_power_state(dev, PCI_D0);
-
 	hsotg->dev = &dev->dev;
-	hsotg->regs = devm_request_and_ioremap(&dev->dev, &dev->resource[0]);
-	if (!hsotg->regs)
-		return -ENOMEM;
+	hsotg->regs = devm_ioremap_resource(&dev->dev, &dev->resource[0]);
+	if (IS_ERR(hsotg->regs))
+		return PTR_ERR(hsotg->regs);
 
 	dev_dbg(&dev->dev, "mapped PA %08lx to VA %p\n",
 		(unsigned long)pci_resource_start(dev, 0), hsotg->regs);
@@ -153,7 +147,6 @@
 	}
 
 	pci_set_drvdata(dev, hsotg);
-	dev_dbg(&dev->dev, "hsotg=%p\n", hsotg);
 
 	return retval;
 }
@@ -162,6 +155,10 @@
 	{
 		PCI_DEVICE(PCI_VENDOR_ID_SYNOPSYS, PCI_PRODUCT_ID_HAPS_HSOTG),
 	},
+	{
+		PCI_DEVICE(PCI_VENDOR_ID_STMICRO,
+			   PCI_DEVICE_ID_STMICRO_USB_OTG),
+	},
 	{ /* end: all zeroes */ }
 };
 MODULE_DEVICE_TABLE(pci, dwc2_pci_ids);

diff --git a/drivers/staging/echo/echo.c b/drivers/staging/echo/echo.c
index 5882139..9597e95 100644
--- a/drivers/staging/echo/echo.c
+++ b/drivers/staging/echo/echo.c

@@ -267,13 +267,13 @@
 		goto error_snap;
 
 	ec->cond_met = 0;
-	ec->Pstates = 0;
-	ec->Ltxacc = ec->Lrxacc = ec->Lcleanacc = ec->Lclean_bgacc = 0;
-	ec->Ltx = ec->Lrx = ec->Lclean = ec->Lclean_bg = 0;
+	ec->pstates = 0;
+	ec->ltxacc = ec->lrxacc = ec->lcleanacc = ec->lclean_bgacc = 0;
+	ec->ltx = ec->lrx = ec->lclean = ec->lclean_bg = 0;
 	ec->tx_1 = ec->tx_2 = ec->rx_1 = ec->rx_2 = 0;
-	ec->Lbgn = ec->Lbgn_acc = 0;
-	ec->Lbgn_upper = 200;
-	ec->Lbgn_upper_acc = ec->Lbgn_upper << 13;
+	ec->lbgn = ec->lbgn_acc = 0;
+	ec->lbgn_upper = 200;
+	ec->lbgn_upper_acc = ec->lbgn_upper << 13;
 
 	return ec;
 
@@ -314,13 +314,13 @@
 {
 	int i;
 
-	ec->Ltxacc = ec->Lrxacc = ec->Lcleanacc = ec->Lclean_bgacc = 0;
-	ec->Ltx = ec->Lrx = ec->Lclean = ec->Lclean_bg = 0;
+	ec->ltxacc = ec->lrxacc = ec->lcleanacc = ec->lclean_bgacc = 0;
+	ec->ltx = ec->lrx = ec->lclean = ec->lclean_bg = 0;
 	ec->tx_1 = ec->tx_2 = ec->rx_1 = ec->rx_2 = 0;
 
-	ec->Lbgn = ec->Lbgn_acc = 0;
-	ec->Lbgn_upper = 200;
-	ec->Lbgn_upper_acc = ec->Lbgn_upper << 13;
+	ec->lbgn = ec->lbgn_acc = 0;
+	ec->lbgn_upper = 200;
+	ec->lbgn_upper_acc = ec->lbgn_upper << 13;
 
 	ec->nonupdate_dwell = 0;
 
@@ -332,7 +332,7 @@
 		memset(ec->fir_taps16[i], 0, ec->taps * sizeof(int16_t));
 
 	ec->curr_pos = ec->taps - 1;
-	ec->Pstates = 0;
+	ec->pstates = 0;
 }
 EXPORT_SYMBOL_GPL(oslec_flush);
 
@@ -418,33 +418,33 @@
 		new = (int)tx * (int)tx;
 		old = (int)ec->fir_state.history[ec->fir_state.curr_pos] *
 		    (int)ec->fir_state.history[ec->fir_state.curr_pos];
-		ec->Pstates +=
+		ec->pstates +=
 		    ((new - old) + (1 << (ec->log2taps - 1))) >> ec->log2taps;
-		if (ec->Pstates < 0)
-			ec->Pstates = 0;
+		if (ec->pstates < 0)
+			ec->pstates = 0;
 	}
 
 	/* Calculate short term average levels using simple single pole IIRs */
 
-	ec->Ltxacc += abs(tx) - ec->Ltx;
-	ec->Ltx = (ec->Ltxacc + (1 << 4)) >> 5;
-	ec->Lrxacc += abs(rx) - ec->Lrx;
-	ec->Lrx = (ec->Lrxacc + (1 << 4)) >> 5;
+	ec->ltxacc += abs(tx) - ec->ltx;
+	ec->ltx = (ec->ltxacc + (1 << 4)) >> 5;
+	ec->lrxacc += abs(rx) - ec->lrx;
+	ec->lrx = (ec->lrxacc + (1 << 4)) >> 5;
 
 	/* Foreground filter */
 
 	ec->fir_state.coeffs = ec->fir_taps16[0];
 	echo_value = fir16(&ec->fir_state, tx);
 	ec->clean = rx - echo_value;
-	ec->Lcleanacc += abs(ec->clean) - ec->Lclean;
-	ec->Lclean = (ec->Lcleanacc + (1 << 4)) >> 5;
+	ec->lcleanacc += abs(ec->clean) - ec->lclean;
+	ec->lclean = (ec->lcleanacc + (1 << 4)) >> 5;
 
 	/* Background filter */
 
 	echo_value = fir16(&ec->fir_state_bg, tx);
 	clean_bg = rx - echo_value;
-	ec->Lclean_bgacc += abs(clean_bg) - ec->Lclean_bg;
-	ec->Lclean_bg = (ec->Lclean_bgacc + (1 << 4)) >> 5;
+	ec->lclean_bgacc += abs(clean_bg) - ec->lclean_bg;
+	ec->lclean_bg = (ec->lclean_bgacc + (1 << 4)) >> 5;
 
 	/* Background Filter adaption */
 
@@ -455,7 +455,7 @@
 	ec->factor = 0;
 	ec->shift = 0;
 	if ((ec->nonupdate_dwell == 0)) {
-		int P, logP, shift;
+		int p, logp, shift;
 
 		/* Determine:
 
@@ -490,9 +490,9 @@
 		   for a divide versus a top_bit() implementation.
 		 */
 
-		P = MIN_TX_POWER_FOR_ADAPTION + ec->Pstates;
-		logP = top_bit(P) + ec->log2taps;
-		shift = 30 - 2 - logP;
+		p = MIN_TX_POWER_FOR_ADAPTION + ec->pstates;
+		logp = top_bit(p) + ec->log2taps;
+		shift = 30 - 2 - logp;
 		ec->shift = shift;
 
 		lms_adapt_bg(ec, clean_bg, shift);
@@ -502,7 +502,7 @@
 	   near end speech */
 
 	ec->adapt = 0;
-	if ((ec->Lrx > MIN_RX_POWER_FOR_ADAPTION) && (ec->Lrx > ec->Ltx))
+	if ((ec->lrx > MIN_RX_POWER_FOR_ADAPTION) && (ec->lrx > ec->ltx))
 		ec->nonupdate_dwell = DTD_HANGOVER;
 	if (ec->nonupdate_dwell)
 		ec->nonupdate_dwell--;
@@ -515,9 +515,9 @@
 	if ((ec->adaption_mode & ECHO_CAN_USE_ADAPTION) &&
 	    (ec->nonupdate_dwell == 0) &&
 	    /* (ec->Lclean_bg < 0.875*ec->Lclean) */
-	    (8 * ec->Lclean_bg < 7 * ec->Lclean) &&
+	    (8 * ec->lclean_bg < 7 * ec->lclean) &&
 	    /* (ec->Lclean_bg < 0.125*ec->Ltx) */
-	    (8 * ec->Lclean_bg < ec->Ltx)) {
+	    (8 * ec->lclean_bg < ec->ltx)) {
 		if (ec->cond_met == 6) {
 			/*
 			 * BG filter has had better results for 6 consecutive
@@ -541,14 +541,14 @@
 		 * non-linearity in the channel.".
 		 */
 
-		if ((16 * ec->Lclean < ec->Ltx)) {
+		if ((16 * ec->lclean < ec->ltx)) {
 			/*
 			 * Our e/c has improved echo by at least 24 dB (each
 			 * factor of 2 is 6dB, so 2*2*2*2=16 is the same as
 			 * 6+6+6+6=24dB)
 			 */
 			if (ec->adaption_mode & ECHO_CAN_USE_CNG) {
-				ec->cng_level = ec->Lbgn;
+				ec->cng_level = ec->lbgn;
 
 				/*
 				 * Very elementary comfort noise generation.
@@ -571,10 +571,10 @@
 
 			} else if (ec->adaption_mode & ECHO_CAN_USE_CLIP) {
 				/* This sounds much better than CNG */
-				if (ec->clean_nlp > ec->Lbgn)
-					ec->clean_nlp = ec->Lbgn;
-				if (ec->clean_nlp < -ec->Lbgn)
-					ec->clean_nlp = -ec->Lbgn;
+				if (ec->clean_nlp > ec->lbgn)
+					ec->clean_nlp = ec->lbgn;
+				if (ec->clean_nlp < -ec->lbgn)
+					ec->clean_nlp = -ec->lbgn;
 			} else {
 				/*
 				 * just mute the residual, doesn't sound very
@@ -593,9 +593,9 @@
 			 * level signals like near end speech.  When combined
 			 * with CNG or especially CLIP seems to work OK.
 			 */
-			if (ec->Lclean < 40) {
-				ec->Lbgn_acc += abs(ec->clean) - ec->Lbgn;
-				ec->Lbgn = (ec->Lbgn_acc + (1 << 11)) >> 12;
+			if (ec->lclean < 40) {
+				ec->lbgn_acc += abs(ec->clean) - ec->lbgn;
+				ec->lbgn = (ec->lbgn_acc + (1 << 11)) >> 12;
 			}
 		}
 	}

diff --git a/drivers/staging/echo/echo.h b/drivers/staging/echo/echo.h
index 32ca9de..9b08c63 100644
--- a/drivers/staging/echo/echo.h
+++ b/drivers/staging/echo/echo.h

@@ -139,24 +139,24 @@
 	int adaption_mode;
 
 	int cond_met;
-	int32_t Pstates;
+	int32_t pstates;
 	int16_t adapt;
 	int32_t factor;
 	int16_t shift;
 
 	/* Average levels and averaging filter states */
-	int Ltxacc;
-	int Lrxacc;
-	int Lcleanacc;
-	int Lclean_bgacc;
-	int Ltx;
-	int Lrx;
-	int Lclean;
-	int Lclean_bg;
-	int Lbgn;
-	int Lbgn_acc;
-	int Lbgn_upper;
-	int Lbgn_upper_acc;
+	int ltxacc;
+	int lrxacc;
+	int lcleanacc;
+	int lclean_bgacc;
+	int ltx;
+	int lrx;
+	int lclean;
+	int lclean_bg;
+	int lbgn;
+	int lbgn_acc;
+	int lbgn_upper;
+	int lbgn_upper_acc;
 
 	/* foreground and background filter states */
 	struct fir16_state_t fir_state;

diff --git a/drivers/staging/frontier/alphatrack.c b/drivers/staging/frontier/alphatrack.c
index ea9362d..5590ebf 100644
--- a/drivers/staging/frontier/alphatrack.c
+++ b/drivers/staging/frontier/alphatrack.c

@@ -24,13 +24,14 @@
  * raw interrupt reports.
  */
 
-/* Note: this currently uses a dumb ringbuffer for reads and writes.
+/*
+ * Note: this currently uses a dumb ringbuffer for reads and writes.
  * A more optimal driver would cache and kill off outstanding urbs that are
  * now invalid, and ignore ones that already were in the queue but valid
  * as we only have 30 commands for the alphatrack. In particular this is
  * key for getting lights to flash in time as otherwise many commands
  * can be buffered up before the light change makes it to the interface.
-*/
+ */
 
 #include <linux/kernel.h>
 #include <linux/errno.h>
@@ -100,7 +101,8 @@
 module_param(debug, int, S_IRUGO | S_IWUSR);
 MODULE_PARM_DESC(debug, "Debug enabled or not");
 
-/* All interrupt in transfers are collected in a ring buffer to
+/*
+ * All interrupt in transfers are collected in a ring buffer to
  * avoid racing conditions and get better performance of the driver.
  */
 
@@ -109,8 +111,7 @@
 module_param(ring_buffer_size, int, S_IRUGO);
 MODULE_PARM_DESC(ring_buffer_size, "Read ring buffer size");
 
-/* The write_buffer can one day contain more than one interrupt out transfer.
- */
+/* The write_buffer can one day contain more than one interrupt out transfer.*/
 
 static int write_buffer_size = WRITE_BUFFER_SIZE;
 module_param(write_buffer_size, int, S_IRUGO);
@@ -199,9 +200,7 @@
 			usb_kill_urb(dev->interrupt_out_urb);
 }
 
-/**
- *	usb_alphatrack_delete
- */
+/** usb_alphatrack_delete */
 static void usb_alphatrack_delete(struct usb_alphatrack *dev)
 {
 	usb_alphatrack_abort_transfers(dev);
@@ -213,9 +212,7 @@
 	kfree(dev);		/* fixme oldi_buffer */
 }
 
-/**
- *	usb_alphatrack_interrupt_in_callback
- */
+/** usb_alphatrack_interrupt_in_callback */
 
 static void usb_alphatrack_interrupt_in_callback(struct urb *urb)
 {
@@ -296,9 +293,7 @@
 	wake_up_interruptible(&dev->read_wait);
 }
 
-/**
- *	usb_alphatrack_interrupt_out_callback
- */
+/** usb_alphatrack_interrupt_out_callback */
 static void usb_alphatrack_interrupt_out_callback(struct urb *urb)
 {
 	struct usb_alphatrack *dev = urb->context;
@@ -315,9 +310,7 @@
 	wake_up_interruptible(&dev->write_wait);
 }
 
-/**
- *	usb_alphatrack_open
- */
+/** usb_alphatrack_open */
 static int usb_alphatrack_open(struct inode *inode, struct file *file)
 {
 	struct usb_alphatrack *dev;
@@ -398,9 +391,7 @@
 	return retval;
 }
 
-/**
- *	usb_alphatrack_release
- */
+/** usb_alphatrack_release */
 static int usb_alphatrack_release(struct inode *inode, struct file *file)
 {
 	struct usb_alphatrack *dev;
@@ -447,9 +438,7 @@
 	return retval;
 }
 
-/**
- *	usb_alphatrack_poll
- */
+/** usb_alphatrack_poll */
 static unsigned int usb_alphatrack_poll(struct file *file, poll_table *wait)
 {
 	struct usb_alphatrack *dev;
@@ -468,9 +457,7 @@
 	return mask;
 }
 
-/**
- *	usb_alphatrack_read
- */
+/** usb_alphatrack_read */
 static ssize_t usb_alphatrack_read(struct file *file, char __user *buffer,
 				   size_t count, loff_t *ppos)
 {
@@ -539,9 +526,7 @@
 	return retval;
 }
 
-/**
- *	usb_alphatrack_write
- */
+/** usb_alphatrack_write */
 static ssize_t usb_alphatrack_write(struct file *file,
 				    const char __user *buffer, size_t count,
 				    loff_t *ppos)
@@ -601,7 +586,7 @@
 	}
 
 	if (dev->interrupt_out_endpoint == NULL) {
-		dev_err(&dev->intf->dev, "Endpoint should not be be null!\n");
+		dev_err(&dev->intf->dev, "Endpoint should not be null!\n");
 		goto unlock_exit;
 	}
 
@@ -718,8 +703,10 @@
 
 	true_size = min(ring_buffer_size, RING_BUFFER_SIZE);
 
-	/* FIXME - there are more usb_alloc routines for dma correctness.
-	   Needed? */
+	/*
+	 * FIXME - there are more usb_alloc routines for dma correctness.
+	 * Needed?
+	 */
 	dev->ring_buffer = kmalloc_array(true_size,
 					 sizeof(struct alphatrack_icmd),
 					 GFP_KERNEL);

diff --git a/drivers/staging/frontier/alphatrack.h b/drivers/staging/frontier/alphatrack.h
index 10a7972..418c605 100644
--- a/drivers/staging/frontier/alphatrack.h
+++ b/drivers/staging/frontier/alphatrack.h

@@ -6,7 +6,8 @@
 	unsigned char cmd[8];
 };
 
-/* These are unused by the present driver but provide documentation for the
+/*
+ * These are unused by the present driver but provide documentation for the
  * userspace API.
  */
 enum LightID {
@@ -58,7 +59,8 @@
 #define BUTTONMASK_PRESS2      0x00008010
 #define BUTTONMASK_PRESS3      0x00002020
 
-/* last 3 bytes are the slider position
+/*
+ * last 3 bytes are the slider position
  * 40 is the actual slider moving, the most sig bits, and 3 lsb
  */
 

diff --git a/drivers/staging/frontier/tranzport.c b/drivers/staging/frontier/tranzport.c
index 04b5e66d..6cbf9c7 100644
--- a/drivers/staging/frontier/tranzport.c
+++ b/drivers/staging/frontier/tranzport.c

@@ -86,7 +86,8 @@
 module_param(debug, int, S_IRUGO | S_IWUSR);
 MODULE_PARM_DESC(debug, "Debug enabled or not");
 
-/* All interrupt in transfers are collected in a ring buffer to
+/*
+ * All interrupt in transfers are collected in a ring buffer to
  * avoid racing conditions and get better performance of the driver.
  */
 
@@ -95,7 +96,8 @@
 module_param(ring_buffer_size, int, S_IRUGO);
 MODULE_PARM_DESC(ring_buffer_size, "Read ring buffer size in reports");
 
-/* The write_buffer can one day contain more than one interrupt out transfer.
+/*
+ * The write_buffer can one day contain more than one interrupt out transfer.
  */
 static int write_buffer_size = WRITE_BUFFER_SIZE;
 module_param(write_buffer_size, int, S_IRUGO);
@@ -565,9 +567,9 @@
 			newwheel = (*dev->ring_buffer)[next_tail].cmd[6];
 			oldwheel = (*dev->ring_buffer)[dev->ring_tail].cmd[6];
 			/* if both are wheel events, and
-			   no buttons have changes (FIXME, do I have to check?),
-			   and we are the same sign, we can compress +- 7F
-			*/
+			 * no buttons have changes (FIXME, do I have to check?),
+			 * and we are the same sign, we can compress +- 7F
+			 */
 			dbg_info(&dev->intf->dev,
 				"%s: trying to compress: "
 				"%02x%02x%02x%02x%02x%02x%02x%02x\n",
@@ -729,7 +731,7 @@
 	}
 
 	if (dev->interrupt_out_endpoint == NULL) {
-		dev_err(&dev->intf->dev, "Endpoint should not be be null!\n");
+		dev_err(&dev->intf->dev, "Endpoint should not be null!\n");
 		goto unlock_exit;
 	}
 
@@ -842,8 +844,10 @@
 		ring_buffer_size = RING_BUFFER_SIZE;
 	true_size = min(ring_buffer_size, RING_BUFFER_SIZE);
 
-	/* FIXME - there are more usb_alloc routines for dma correctness.
-	   Needed? */
+	/*
+	 * FIXME - there are more usb_alloc routines for dma correctness.
+	 * Needed?
+	 */
 
 	dev->ring_buffer =
 	    kmalloc((true_size * sizeof(struct tranzport_cmd)) + 8, GFP_KERNEL);

diff --git a/drivers/staging/ft1000/ft1000-pcmcia/ft1000_dnld.c b/drivers/staging/ft1000/ft1000-pcmcia/ft1000_dnld.c
index 47cc365..6311b2f 100644
--- a/drivers/staging/ft1000/ft1000-pcmcia/ft1000_dnld.c
+++ b/drivers/staging/ft1000/ft1000-pcmcia/ft1000_dnld.c

@@ -132,16 +132,16 @@
 	pdata = (u32 *) bootimage;
 	size = sizeof(bootimage);
 
-	// check for odd word
-	if (size & 0x0003) {
+	/* check for odd word */
+	if (size & 0x0003)
 		size += 4;
-	}
-	// Provide mutual exclusive access while reading ASIC registers.
+
+	/* Provide mutual exclusive access while reading ASIC registers. */
 	spin_lock_irqsave(&info->dpram_lock, flags);
 
-	// need to set i/o base address initially and hardware will autoincrement
+	/* need to set i/o base address initially and hardware will autoincrement */
 	ft1000_write_reg(dev, FT1000_REG_DPRAM_ADDR, FT1000_DPRAM_BASE);
-	// write bytes
+	/* write bytes */
 	for (i = 0; i < (size >> 2); i++) {
 		templong = *pdata++;
 		outl(templong, dev->base_addr + FT1000_REG_MAG_DPDATA);
@@ -345,11 +345,10 @@
 
 			handshake = get_handshake(dev, HANDSHAKE_DSP_BL_READY);
 
-			if (handshake == HANDSHAKE_DSP_BL_READY) {
+			if (handshake == HANDSHAKE_DSP_BL_READY)
 				put_handshake(dev, HANDSHAKE_DRIVER_READY);
-			} else {
+			else
 				Status = FAILURE;
-			}
 
 			uiState = STATE_BOOT_DWNLD;
 
@@ -391,7 +390,7 @@
 						Status = FAILURE;
 						break;
 					}
-					// Provide mutual exclusive access while reading ASIC registers.
+					/* Provide mutual exclusive access while reading ASIC registers. */
 					spin_lock_irqsave(&info->dpram_lock,
 							  flags);
 					/*
@@ -505,15 +504,15 @@
 					break;
 
 				case REQUEST_MAILBOX_DATA:
-					// Convert length from byte count to word count. Make sure we round up.
+					/* Convert length from byte count to word count. Make sure we round up. */
 					word_length =
 						(long)(info->DSPInfoBlklen + 1) / 2;
 					put_request_value(dev, word_length);
 					pMailBoxData =
-						(struct drv_msg *) & info->DSPInfoBlk[0];
+						(struct drv_msg *) &info->DSPInfoBlk[0];
 					pUsData =
-						(u16 *) & pMailBoxData->data[0];
-					// Provide mutual exclusive access while reading ASIC registers.
+						(u16 *) &pMailBoxData->data[0];
+					/* Provide mutual exclusive access while reading ASIC registers. */
 					spin_lock_irqsave(&info->dpram_lock,
 							  flags);
 					if (file_version == 5) {
@@ -538,9 +537,9 @@
 						outw(DWNLD_MAG_PS_HDR_LOC,
 							 dev->base_addr +
 							 FT1000_REG_DPRAM_ADDR);
-						if (word_length & 0x01) {
+						if (word_length & 0x01)
 							word_length++;
-						}
+
 						word_length = word_length / 2;
 
 						for (; word_length > 0; word_length--) {	/* In words */
@@ -565,7 +564,7 @@
 						(u16 *) ((long)pFileStart +
 							pFileHdr5->
 							version_data_offset);
-					// Provide mutual exclusive access while reading ASIC registers.
+					/* Provide mutual exclusive access while reading ASIC registers. */
 					spin_lock_irqsave(&info->dpram_lock,
 							  flags);
 					/*
@@ -692,7 +691,7 @@
 
 			if (pHdr->portdest == 0x80	/* DspOAM */
 				&& (pHdr->portsrc == 0x00	/* Driver */
-				|| pHdr->portsrc == 0x10 /* FMM */ )) {
+				|| pHdr->portsrc == 0x10 /* FMM */)) {
 				uiState = STATE_SECTION_PROV;
 			} else {
 				DEBUG(1,
@@ -711,13 +710,13 @@
 			pHdr = (struct pseudo_hdr *) pUcFile;
 
 			if (pHdr->checksum == hdr_checksum(pHdr)) {
-				if (pHdr->portdest != 0x80 /* Dsp OAM */ ) {
+				if (pHdr->portdest != 0x80 /* Dsp OAM */) {
 					uiState = STATE_DONE_PROV;
 					break;
 				}
 				usHdrLength = ntohs(pHdr->length);	/* Byte length for PROV records */
 
-				// Get buffer for provisioning data
+				/* Get buffer for provisioning data */
 				pbuffer =
 					kmalloc((usHdrLength + sizeof(struct pseudo_hdr)),
 						GFP_ATOMIC);
@@ -725,7 +724,7 @@
 					memcpy(pbuffer, (void *)pUcFile,
 						   (u32) (usHdrLength +
 							   sizeof(struct pseudo_hdr)));
-					// link provisioning data
+					/* link provisioning data */
 					pprov_record =
 						kmalloc(sizeof(struct prov_record),
 							GFP_ATOMIC);
@@ -735,7 +734,7 @@
 						list_add_tail(&pprov_record->
 								  list,
 								  &info->prov_list);
-						// Move to next entry if available
+						/* Move to next entry if available */
 						pUcFile =
 							(u8 *) ((unsigned long) pUcFile +
 								   (unsigned long) ((usHdrLength + 1) & 0xFFFFFFFE) + sizeof(struct pseudo_hdr));

diff --git a/drivers/staging/ft1000/ft1000-usb/ft1000_debug.c b/drivers/staging/ft1000/ft1000-usb/ft1000_debug.c
index 3251d2e..68a55ce 100644
--- a/drivers/staging/ft1000/ft1000-usb/ft1000_debug.c
+++ b/drivers/staging/ft1000/ft1000-usb/ft1000_debug.c

@@ -1,29 +1,31 @@
-//---------------------------------------------------------------------------
-// FT1000 driver for Flarion Flash OFDM NIC Device
-//
-// Copyright (C) 2006 Flarion Technologies, All rights reserved.
-//
-// This program is free software; you can redistribute it and/or modify it
-// under the terms of the GNU General Public License as published by the Free
-// Software Foundation; either version 2 of the License, or (at your option) any
-// later version. This program is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-// or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-// more details. You should have received a copy of the GNU General Public
-// License along with this program; if not, write to the
-// Free Software Foundation, Inc., 59 Temple Place -
-// Suite 330, Boston, MA 02111-1307, USA.
-//---------------------------------------------------------------------------
-//
-// File:         ft1000_chdev.c
-//
-// Description:  Custom character device dispatch routines.
-//
-// History:
-// 8/29/02    Whc                Ported to Linux.
-// 6/05/06    Whc                Porting to Linux 2.6.9
-//
-//---------------------------------------------------------------------------
+/*
+*---------------------------------------------------------------------------
+* FT1000 driver for Flarion Flash OFDM NIC Device
+*
+* Copyright (C) 2006 Flarion Technologies, All rights reserved.
+*
+* This program is free software; you can redistribute it and/or modify it
+* under the terms of the GNU General Public License as published by the Free
+* Software Foundation; either version 2 of the License, or (at your option) any
+* later version. This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+* or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+* more details. You should have received a copy of the GNU General Public
+* License along with this program; if not, write to the
+* Free Software Foundation, Inc., 59 Temple Place -
+* Suite 330, Boston, MA 02111-1307, USA.
+*---------------------------------------------------------------------------
+*
+* File:         ft1000_chdev.c
+*
+* Description:  Custom character device dispatch routines.
+*
+* History:
+* 8/29/02    Whc                Ported to Linux.
+* 6/05/06    Whc                Porting to Linux 2.6.9
+*
+*---------------------------------------------------------------------------
+*/
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
@@ -38,25 +40,24 @@
 
 static int ft1000_flarion_cnt = 0;
 
-static int ft1000_open (struct inode *inode, struct file *file);
+static int ft1000_open(struct inode *inode, struct file *file);
 static unsigned int ft1000_poll_dev(struct file *file, poll_table *wait);
 static long ft1000_ioctl(struct file *file, unsigned int command,
                            unsigned long argument);
-static int ft1000_release (struct inode *inode, struct file *file);
+static int ft1000_release(struct inode *inode, struct file *file);
 
-// List to free receive command buffer pool
+/* List to free receive command buffer pool */
 struct list_head freercvpool;
 
-// lock to arbitrate free buffer list for receive command data
+/* lock to arbitrate free buffer list for receive command data */
 spinlock_t free_buff_lock;
 
 int numofmsgbuf = 0;
 
-//
-// Table of entry-point routines for char device
-//
-static const struct file_operations ft1000fops =
-{
+/*
+* Table of entry-point routines for char device
+*/
+static const struct file_operations ft1000fops = {
 	.unlocked_ioctl	= ft1000_ioctl,
 	.poll		= ft1000_poll_dev,
 	.open		= ft1000_open,
@@ -64,34 +65,35 @@
 	.llseek		= no_llseek,
 };
 
-//---------------------------------------------------------------------------
-// Function:    ft1000_get_buffer
-//
-// Parameters:
-//
-// Returns:
-//
-// Description:
-//
-// Notes:
-//
-//---------------------------------------------------------------------------
+/*
+---------------------------------------------------------------------------
+* Function:    ft1000_get_buffer
+*
+* Parameters:
+*
+* Returns:
+*
+* Description:
+*
+* Notes:
+*
+*---------------------------------------------------------------------------
+*/
 struct dpram_blk *ft1000_get_buffer(struct list_head *bufflist)
 {
     unsigned long flags;
 	struct dpram_blk *ptr;
 
     spin_lock_irqsave(&free_buff_lock, flags);
-    // Check if buffer is available
-    if ( list_empty(bufflist) ) {
+    /* Check if buffer is available */
+    if (list_empty(bufflist)) {
         DEBUG("ft1000_get_buffer:  No more buffer - %d\n", numofmsgbuf);
         ptr = NULL;
-    }
-    else {
+    } else {
         numofmsgbuf--;
 	ptr = list_entry(bufflist->next, struct dpram_blk, list);
         list_del(&ptr->list);
-        //DEBUG("ft1000_get_buffer: number of free msg buffers = %d\n", numofmsgbuf);
+        /* DEBUG("ft1000_get_buffer: number of free msg buffers = %d\n", numofmsgbuf); */
     }
     spin_unlock_irqrestore(&free_buff_lock, flags);
 
@@ -101,42 +103,46 @@
 
 
 
-//---------------------------------------------------------------------------
-// Function:    ft1000_free_buffer
-//
-// Parameters:
-//
-// Returns:
-//
-// Description:
-//
-// Notes:
-//
-//---------------------------------------------------------------------------
+/*
+*---------------------------------------------------------------------------
+* Function:    ft1000_free_buffer
+*
+* Parameters:
+*
+* Returns:
+*
+* Description:
+*
+* Notes:
+*
+*---------------------------------------------------------------------------
+*/
 void ft1000_free_buffer(struct dpram_blk *pdpram_blk, struct list_head *plist)
 {
     unsigned long flags;
 
     spin_lock_irqsave(&free_buff_lock, flags);
-    // Put memory back to list
+    /* Put memory back to list */
     list_add_tail(&pdpram_blk->list, plist);
     numofmsgbuf++;
-    //DEBUG("ft1000_free_buffer: number of free msg buffers = %d\n", numofmsgbuf);
+    /*DEBUG("ft1000_free_buffer: number of free msg buffers = %d\n", numofmsgbuf); */
     spin_unlock_irqrestore(&free_buff_lock, flags);
 }
 
-//---------------------------------------------------------------------------
-// Function:    ft1000_CreateDevice
-//
-// Parameters:  dev - pointer to adapter object
-//
-// Returns:     0 if successful
-//
-// Description: Creates a private char device.
-//
-// Notes:       Only called by init_module().
-//
-//---------------------------------------------------------------------------
+/*
+*---------------------------------------------------------------------------
+* Function:    ft1000_CreateDevice
+*
+* Parameters:  dev - pointer to adapter object
+*
+* Returns:     0 if successful
+*
+* Description: Creates a private char device.
+*
+* Notes:       Only called by init_module().
+*
+*---------------------------------------------------------------------------
+*/
 int ft1000_create_dev(struct ft1000_usb *dev)
 {
     int result;
@@ -144,20 +150,19 @@
 	struct dentry *dir, *file;
 	struct ft1000_debug_dirs *tmp;
 
-    // make a new device name
+    /* make a new device name */
     sprintf(dev->DeviceName, "%s%d", "FT1000_", dev->CardNumber);
 
     DEBUG("%s: number of instance = %d\n", __func__, ft1000_flarion_cnt);
     DEBUG("DeviceCreated = %x\n", dev->DeviceCreated);
 
-    if (dev->DeviceCreated)
-    {
+    if (dev->DeviceCreated) {
 	DEBUG("%s: \"%s\" already registered\n", __func__, dev->DeviceName);
 	return -EIO;
     }
 
 
-    // register the device
+    /* register the device */
     DEBUG("%s: \"%s\" debugfs device registration\n", __func__, dev->DeviceName);
 
 	tmp = kmalloc(sizeof(struct ft1000_debug_dirs), GFP_KERNEL);
@@ -186,7 +191,7 @@
 
     DEBUG("%s: registered debugfs directory \"%s\"\n", __func__, dev->DeviceName);
 
-    // initialize application information
+    /* initialize application information */
     dev->appcnt = 0;
     for (i=0; i<MAX_NUM_APP; i++) {
         dev->app_info[i].nTxMsg = 0;
@@ -198,7 +203,7 @@
         dev->app_info[i].DspBCMsgFlag = 0;
         dev->app_info[i].NumOfMsg = 0;
         init_waitqueue_head(&dev->app_info[i].wait_dpram_msg);
-        INIT_LIST_HEAD (&dev->app_info[i].app_sqlist);
+        INIT_LIST_HEAD(&dev->app_info[i].app_sqlist);
     }
 
     dev->DeviceCreated = TRUE;
@@ -214,16 +219,18 @@
 	return result;
 }
 
-//---------------------------------------------------------------------------
-// Function:    ft1000_DestroyDeviceDEBUG
-//
-// Parameters:  dev - pointer to adapter object
-//
-// Description: Destroys a private char device.
-//
-// Notes:       Only called by cleanup_module().
-//
-//---------------------------------------------------------------------------
+/*
+*---------------------------------------------------------------------------
+* Function:    ft1000_DestroyDeviceDEBUG
+*
+* Parameters:  dev - pointer to adapter object
+*
+* Description: Destroys a private char device.
+*
+* Notes:       Only called by cleanup_module().
+*
+*---------------------------------------------------------------------------
+*/
 void ft1000_destroy_dev(struct net_device *netdev)
 {
 	struct ft1000_info *info = netdev_priv(netdev);
@@ -238,8 +245,7 @@
 
 
 
-    if (dev->DeviceCreated)
-	{
+    if (dev->DeviceCreated) {
         ft1000_flarion_cnt--;
 		list_for_each_safe(pos, q, &dev->nodes.list) {
 			dir = list_entry(pos, struct ft1000_debug_dirs, list);
@@ -253,7 +259,7 @@
 		DEBUG("%s: unregistered device \"%s\"\n", __func__,
 					   dev->DeviceName);
 
-        // Make sure we free any memory reserve for slow Queue
+        /* Make sure we free any memory reserve for slow Queue */
         for (i=0; i<MAX_NUM_APP; i++) {
             while (list_empty(&dev->app_info[i].app_sqlist) == 0) {
                 pdpram_blk = list_entry(dev->app_info[i].app_sqlist.next, struct dpram_blk, list);
@@ -264,7 +270,7 @@
             wake_up_interruptible(&dev->app_info[i].wait_dpram_msg);
         }
 
-        // Remove buffer allocated for receive command data
+        /* Remove buffer allocated for receive command data */
         if (ft1000_flarion_cnt == 0) {
             while (list_empty(&freercvpool) == 0) {
 		ptr = list_entry(freercvpool.next, struct dpram_blk, list);
@@ -279,17 +285,19 @@
 
 }
 
-//---------------------------------------------------------------------------
-// Function:    ft1000_open
-//
-// Parameters:
-//
-// Description:
-//
-// Notes:
-//
-//---------------------------------------------------------------------------
-static int ft1000_open (struct inode *inode, struct file *file)
+/*
+*---------------------------------------------------------------------------
+* Function:    ft1000_open
+*
+* Parameters:
+*
+* Description:
+*
+* Notes:
+*
+*---------------------------------------------------------------------------
+*/
+static int ft1000_open(struct inode *inode, struct file *file)
 {
 	struct ft1000_info *info;
 	struct ft1000_usb *dev = (struct ft1000_usb *)inode->i_private;
@@ -301,22 +309,22 @@
 
 	info = file->private_data = netdev_priv(dev->net);
 
-    DEBUG("f_owner = %p number of application = %d\n", (&file->f_owner), dev->appcnt );
+    DEBUG("f_owner = %p number of application = %d\n", (&file->f_owner), dev->appcnt);
 
-    // Check if maximum number of application exceeded
+    /* Check if maximum number of application exceeded */
     if (dev->appcnt > MAX_NUM_APP) {
         DEBUG("Maximum number of application exceeded\n");
         return -EACCES;
     }
 
-    // Search for available application info block
+    /* Search for available application info block */
     for (i=0; i<MAX_NUM_APP; i++) {
-        if ( (dev->app_info[i].fileobject == NULL) ) {
+        if ((dev->app_info[i].fileobject == NULL)) {
             break;
         }
     }
 
-    // Fail due to lack of application info block
+    /* Fail due to lack of application info block */
     if (i == MAX_NUM_APP) {
         DEBUG("Could not find an application info block\n");
         return -EACCES;
@@ -334,16 +342,18 @@
 }
 
 
-//---------------------------------------------------------------------------
-// Function:    ft1000_poll_dev
-//
-// Parameters:
-//
-// Description:
-//
-// Notes:
-//
-//---------------------------------------------------------------------------
+/*
+*---------------------------------------------------------------------------
+* Function:    ft1000_poll_dev
+*
+* Parameters:
+*
+* Description:
+*
+* Notes:
+*
+*---------------------------------------------------------------------------
+*/
 
 static unsigned int ft1000_poll_dev(struct file *file, poll_table *wait)
 {
@@ -352,24 +362,24 @@
 	struct ft1000_usb *dev = info->priv;
     int i;
 
-    //DEBUG("ft1000_poll_dev called\n");
+    /* DEBUG("ft1000_poll_dev called\n"); */
     if (ft1000_flarion_cnt == 0) {
         DEBUG("FT1000:ft1000_poll_dev called when ft1000_flarion_cnt is zero\n");
         return (-EBADF);
     }
 
-    // Search for matching file object
+    /* Search for matching file object */
     for (i=0; i<MAX_NUM_APP; i++) {
-        if ( dev->app_info[i].fileobject == &file->f_owner) {
-            //DEBUG("FT1000:ft1000_ioctl: Message is for AppId = %d\n", dev->app_info[i].app_id);
+        if (dev->app_info[i].fileobject == &file->f_owner) {
+            /* DEBUG("FT1000:ft1000_ioctl: Message is for AppId = %d\n", dev->app_info[i].app_id); */
             break;
         }
     }
 
-    // Could not find application info block
+    /* Could not find application info block */
     if (i == MAX_NUM_APP) {
         DEBUG("FT1000:ft1000_ioctl:Could not find application info block\n");
-        return ( -EACCES );
+        return (-EACCES);
     }
 
     if (list_empty(&dev->app_info[i].app_sqlist) == 0) {
@@ -377,23 +387,25 @@
         return(POLLIN | POLLRDNORM | POLLPRI);
     }
 
-    poll_wait (file, &dev->app_info[i].wait_dpram_msg, wait);
-    //DEBUG("FT1000:ft1000_poll_dev:Polling for data from DSP\n");
+    poll_wait(file, &dev->app_info[i].wait_dpram_msg, wait);
+    /* DEBUG("FT1000:ft1000_poll_dev:Polling for data from DSP\n"); */
 
     return (0);
 }
 
-//---------------------------------------------------------------------------
-// Function:    ft1000_ioctl
-//
-// Parameters:
-//
-// Description:
-//
-// Notes:
-//
-//---------------------------------------------------------------------------
-static long ft1000_ioctl (struct file *file, unsigned int command,
+/*
+*---------------------------------------------------------------------------
+* Function:    ft1000_ioctl
+*
+* Parameters:
+*
+* Description:
+*
+* Notes:
+*
+*---------------------------------------------------------------------------
+*/
+static long ft1000_ioctl(struct file *file, unsigned int command,
                            unsigned long argument)
 {
     void __user *argp = (void __user *)argument;
@@ -417,21 +429,21 @@
     unsigned short ledStat=0;
     unsigned short conStat=0;
 
-    //DEBUG("ft1000_ioctl called\n");
+    /* DEBUG("ft1000_ioctl called\n"); */
 
     if (ft1000_flarion_cnt == 0) {
         DEBUG("FT1000:ft1000_ioctl called when ft1000_flarion_cnt is zero\n");
         return (-EBADF);
     }
 
-    //DEBUG("FT1000:ft1000_ioctl:command = 0x%x argument = 0x%8x\n", command, (u32)argument);
+    /* DEBUG("FT1000:ft1000_ioctl:command = 0x%x argument = 0x%8x\n", command, (u32)argument); */
 
 	info = file->private_data;
 	ft1000dev = info->priv;
     cmd = _IOC_NR(command);
-    //DEBUG("FT1000:ft1000_ioctl:cmd = 0x%x\n", cmd);
+    /* DEBUG("FT1000:ft1000_ioctl:cmd = 0x%x\n", cmd); */
 
-    // process the command
+    /* process the command */
     switch (cmd) {
     case IOCTL_REGISTER_CMD:
             DEBUG("FT1000:ft1000_ioctl: IOCTL_FT1000_REGISTER called\n");
@@ -441,7 +453,7 @@
                 break;
             }
             if (tempword == DSPBCMSGID) {
-                // Search for matching file object
+                /* Search for matching file object */
                 for (i=0; i<MAX_NUM_APP; i++) {
                     if (ft1000dev->app_info[i].fileobject == &file->f_owner) {
                         ft1000dev->app_info[i].DspBCMsgFlag = 1;
@@ -457,7 +469,7 @@
 
         get_ver_data.drv_ver = FT1000_DRV_VER;
 
-        if (copy_to_user(argp, &get_ver_data, sizeof(get_ver_data)) ) {
+        if (copy_to_user(argp, &get_ver_data, sizeof(get_ver_data))) {
             DEBUG("FT1000:ft1000_ioctl: copy fault occurred\n");
             result = -EFAULT;
             break;
@@ -467,20 +479,20 @@
 
         break;
     case IOCTL_CONNECT:
-        // Connect Message
+        /* Connect Message */
         DEBUG("FT1000:ft1000_ioctl: IOCTL_FT1000_CONNECT\n");
         ConnectionMsg[79] = 0xfc;
 			   card_send_command(ft1000dev, (unsigned short *)ConnectionMsg, 0x4c);
 
         break;
     case IOCTL_DISCONNECT:
-        // Disconnect Message
+        /* Disconnect Message */
         DEBUG("FT1000:ft1000_ioctl: IOCTL_FT1000_DISCONNECT\n");
         ConnectionMsg[79] = 0xfd;
 			   card_send_command(ft1000dev, (unsigned short *)ConnectionMsg, 0x4c);
         break;
     case IOCTL_GET_DSP_STAT_CMD:
-        //DEBUG("FT1000:ft1000_ioctl: IOCTL_FT1000_GET_DSP_STAT called\n");
+        /* DEBUG("FT1000:ft1000_ioctl: IOCTL_FT1000_GET_DSP_STAT called\n"); */
 	memset(&get_stat_data, 0, sizeof(get_stat_data));
         memcpy(get_stat_data.DspVer, info->DspVer, DSPVERSZ);
         memcpy(get_stat_data.HwSerNum, info->HwSerNum, HWSERNUMSZ);
@@ -494,8 +506,7 @@
                 ft1000_read_dpram16(ft1000dev, FT1000_MAG_DSP_CON_STATE, (u8 *)&conStat, FT1000_MAG_DSP_CON_STATE_INDX);
                 get_stat_data.ConStat = ntohs(conStat);
                 DEBUG("FT1000:ft1000_ioctl: ConStat = 0x%x\n", get_stat_data.ConStat);
-            }
-            else {
+            } else {
                 get_stat_data.ConStat = 0x0f;
             }
 
@@ -504,10 +515,10 @@
         get_stat_data.nRxPkts = info->stats.rx_packets;
         get_stat_data.nTxBytes = info->stats.tx_bytes;
         get_stat_data.nRxBytes = info->stats.rx_bytes;
-        do_gettimeofday ( &tv );
+        do_gettimeofday(&tv);
         get_stat_data.ConTm = (u32)(tv.tv_sec - info->ConTm);
         DEBUG("Connection Time = %d\n", (int)get_stat_data.ConTm);
-        if (copy_to_user(argp, &get_stat_data, sizeof(get_stat_data)) ) {
+        if (copy_to_user(argp, &get_stat_data, sizeof(get_stat_data))) {
             DEBUG("FT1000:ft1000_ioctl: copy fault occurred\n");
             result = -EFAULT;
             break;
@@ -517,7 +528,7 @@
     case IOCTL_SET_DPRAM_CMD:
         {
             IOCTL_DPRAM_BLK *dpram_data = NULL;
-            //IOCTL_DPRAM_COMMAND dpram_command;
+            /* IOCTL_DPRAM_COMMAND dpram_command; */
             u16 qtype;
             u16 msgsz;
 		struct pseudo_hdr *ppseudo_hdr;
@@ -526,7 +537,7 @@
             u16 app_index;
             u16 status;
 
-            //DEBUG("FT1000:ft1000_ioctl: IOCTL_FT1000_SET_DPRAM called\n");
+            /* DEBUG("FT1000:ft1000_ioctl: IOCTL_FT1000_SET_DPRAM called\n");*/
 
 
             if (ft1000_flarion_cnt == 0) {
@@ -545,12 +556,12 @@
 
             if (info->CardReady) {
 
-               //DEBUG("FT1000:ft1000_ioctl: try to SET_DPRAM \n");
+               /* DEBUG("FT1000:ft1000_ioctl: try to SET_DPRAM \n"); */
 
-                // Get the length field to see how many bytes to copy
+                /* Get the length field to see how many bytes to copy */
                 result = get_user(msgsz, (__u16 __user *)argp);
-                msgsz = ntohs (msgsz);
-                //DEBUG("FT1000:ft1000_ioctl: length of message = %d\n", msgsz);
+                msgsz = ntohs(msgsz);
+                /* DEBUG("FT1000:ft1000_ioctl: length of message = %d\n", msgsz); */
 
                 if (msgsz > MAX_CMD_SQSIZE) {
                     DEBUG("FT1000:ft1000_ioctl: bad message length = %d\n", msgsz);
@@ -563,12 +574,11 @@
 		if (!dpram_data)
 			break;
 
-                if ( copy_from_user(dpram_data, argp, msgsz+2) ) {
+                if (copy_from_user(dpram_data, argp, msgsz+2)) {
                     DEBUG("FT1000:ft1000_ChIoctl: copy fault occurred\n");
                     result = -EFAULT;
-                }
-                else {
-                    // Check if this message came from a registered application
+                } else {
+                    /* Check if this message came from a registered application */
                     for (i=0; i<MAX_NUM_APP; i++) {
                         if (ft1000dev->app_info[i].fileobject == &file->f_owner) {
                             break;
@@ -582,28 +592,27 @@
                     }
                     app_index = i;
 
-                    // Check message qtype type which is the lower byte within qos_class
+                    /* Check message qtype type which is the lower byte within qos_class */
                     qtype = ntohs(dpram_data->pseudohdr.qos_class) & 0xff;
-                    //DEBUG("FT1000_ft1000_ioctl: qtype = %d\n", qtype);
+                    /* DEBUG("FT1000_ft1000_ioctl: qtype = %d\n", qtype); */
                     if (qtype) {
-                    }
-                    else {
-                        // Put message into Slow Queue
-                        // Only put a message into the DPRAM if msg doorbell is available
+                    } else {
+                        /* Put message into Slow Queue */
+                        /* Only put a message into the DPRAM if msg doorbell is available */
                         status = ft1000_read_register(ft1000dev, &tempword, FT1000_REG_DOORBELL);
-                        //DEBUG("FT1000_ft1000_ioctl: READ REGISTER tempword=%x\n", tempword);
+                        /* DEBUG("FT1000_ft1000_ioctl: READ REGISTER tempword=%x\n", tempword); */
                         if (tempword & FT1000_DB_DPRAM_TX) {
-                            // Suspend for 2ms and try again due to DSP doorbell busy
+                            /* Suspend for 2ms and try again due to DSP doorbell busy */
                             mdelay(2);
                             status = ft1000_read_register(ft1000dev, &tempword, FT1000_REG_DOORBELL);
                             if (tempword & FT1000_DB_DPRAM_TX) {
-                                // Suspend for 1ms and try again due to DSP doorbell busy
+                                /* Suspend for 1ms and try again due to DSP doorbell busy */
                                 mdelay(1);
                                 status = ft1000_read_register(ft1000dev, &tempword, FT1000_REG_DOORBELL);
                                 if (tempword & FT1000_DB_DPRAM_TX) {
                                     status = ft1000_read_register(ft1000dev, &tempword, FT1000_REG_DOORBELL);
                                     if (tempword & FT1000_DB_DPRAM_TX) {
-                                        // Suspend for 3ms and try again due to DSP doorbell busy
+                                        /* Suspend for 3ms and try again due to DSP doorbell busy */
                                         mdelay(3);
                                         status = ft1000_read_register(ft1000dev, &tempword, FT1000_REG_DOORBELL);
                                         if (tempword & FT1000_DB_DPRAM_TX) {
@@ -617,11 +626,11 @@
                             }
                         }
 
-                        //DEBUG("FT1000_ft1000_ioctl: finished reading register\n");
+                        /*DEBUG("FT1000_ft1000_ioctl: finished reading register\n"); */
 
-                        // Make sure we are within the limits of the slow queue memory limitation
-                        if ( (msgsz < MAX_CMD_SQSIZE) && (msgsz > PSEUDOSZ) ) {
-                            // Need to put sequence number plus new checksum for message
+                        /* Make sure we are within the limits of the slow queue memory limitation */
+                        if ((msgsz < MAX_CMD_SQSIZE) && (msgsz > PSEUDOSZ)) {
+                            /* Need to put sequence number plus new checksum for message */
                             pmsg = (u16 *)&dpram_data->pseudohdr;
 				ppseudo_hdr = (struct pseudo_hdr *)pmsg;
                             total_len = msgsz+2;
@@ -629,15 +638,15 @@
                                 total_len++;
                             }
 
-                            // Insert slow queue sequence number
+                            /* Insert slow queue sequence number */
                             ppseudo_hdr->seq_num = info->squeseqnum++;
                             ppseudo_hdr->portsrc = ft1000dev->app_info[app_index].app_id;
-                            // Calculate new checksum
+                            /* Calculate new checksum */
                             ppseudo_hdr->checksum = *pmsg++;
-                            //DEBUG("checksum = 0x%x\n", ppseudo_hdr->checksum);
+                            /* DEBUG("checksum = 0x%x\n", ppseudo_hdr->checksum); */
                             for (i=1; i<7; i++) {
                                 ppseudo_hdr->checksum ^= *pmsg++;
-                                //DEBUG("checksum = 0x%x\n", ppseudo_hdr->checksum);
+                                /* DEBUG("checksum = 0x%x\n", ppseudo_hdr->checksum); */
                             }
                             pmsg++;
 				ppseudo_hdr = (struct pseudo_hdr *)pmsg;
@@ -645,14 +654,12 @@
 
 
                             ft1000dev->app_info[app_index].nTxMsg++;
-                        }
-                        else {
+                        } else {
                             result = -EINVAL;
                         }
                     }
                 }
-            }
-            else {
+            } else {
                 DEBUG("FT1000:ft1000_ioctl: Card not ready take messages\n");
                 result = -EACCES;
             }
@@ -666,21 +673,21 @@
             IOCTL_DPRAM_BLK __user *pioctl_dpram;
             int msglen;
 
-            //DEBUG("FT1000:ft1000_ioctl: IOCTL_FT1000_GET_DPRAM called\n");
+            /* DEBUG("FT1000:ft1000_ioctl: IOCTL_FT1000_GET_DPRAM called\n"); */
 
             if (ft1000_flarion_cnt == 0) {
                 return (-EBADF);
             }
 
-            // Search for matching file object
+            /* Search for matching file object */
             for (i=0; i<MAX_NUM_APP; i++) {
                 if (ft1000dev->app_info[i].fileobject == &file->f_owner) {
-                    //DEBUG("FT1000:ft1000_ioctl: Message is for AppId = %d\n", ft1000dev->app_info[i].app_id);
+                    /*DEBUG("FT1000:ft1000_ioctl: Message is for AppId = %d\n", ft1000dev->app_info[i].app_id); */
                     break;
                 }
             }
 
-            // Could not find application info block
+            /* Could not find application info block */
             if (i == MAX_NUM_APP) {
                 DEBUG("FT1000:ft1000_ioctl:Could not find application info block\n");
                 result = -EBADF;
@@ -690,30 +697,29 @@
             result = 0;
             pioctl_dpram = argp;
             if (list_empty(&ft1000dev->app_info[i].app_sqlist) == 0) {
-                //DEBUG("FT1000:ft1000_ioctl:Message detected in slow queue\n");
+                /* DEBUG("FT1000:ft1000_ioctl:Message detected in slow queue\n"); */
                 spin_lock_irqsave(&free_buff_lock, flags);
                 pdpram_blk = list_entry(ft1000dev->app_info[i].app_sqlist.next, struct dpram_blk, list);
                 list_del(&pdpram_blk->list);
                 ft1000dev->app_info[i].NumOfMsg--;
-                //DEBUG("FT1000:ft1000_ioctl:NumOfMsg for app %d = %d\n", i, ft1000dev->app_info[i].NumOfMsg);
+                /* DEBUG("FT1000:ft1000_ioctl:NumOfMsg for app %d = %d\n", i, ft1000dev->app_info[i].NumOfMsg); */
                 spin_unlock_irqrestore(&free_buff_lock, flags);
                 msglen = ntohs(*(u16 *)pdpram_blk->pbuffer) + PSEUDOSZ;
                 result = get_user(msglen, &pioctl_dpram->total_len);
 		if (result)
 			break;
 		msglen = htons(msglen);
-                //DEBUG("FT1000:ft1000_ioctl:msg length = %x\n", msglen);
-                if(copy_to_user (&pioctl_dpram->pseudohdr, pdpram_blk->pbuffer, msglen))
-				{
+                /* DEBUG("FT1000:ft1000_ioctl:msg length = %x\n", msglen); */
+                if (copy_to_user (&pioctl_dpram->pseudohdr, pdpram_blk->pbuffer, msglen)) {
 					DEBUG("FT1000:ft1000_ioctl: copy fault occurred\n");
-	             	result = -EFAULT;
-	             	break;
+			result = -EFAULT;
+			break;
 				}
 
                 ft1000_free_buffer(pdpram_blk, &freercvpool);
                 result = msglen;
             }
-            //DEBUG("FT1000:ft1000_ioctl: IOCTL_FT1000_GET_DPRAM no message\n");
+            /* DEBUG("FT1000:ft1000_ioctl: IOCTL_FT1000_GET_DPRAM no message\n"); */
         }
         break;
 
@@ -726,17 +732,19 @@
     return result;
 }
 
-//---------------------------------------------------------------------------
-// Function:    ft1000_release
-//
-// Parameters:
-//
-// Description:
-//
-// Notes:
-//
-//---------------------------------------------------------------------------
-static int ft1000_release (struct inode *inode, struct file *file)
+/*
+*---------------------------------------------------------------------------
+* Function:    ft1000_release
+*
+* Parameters:
+*
+* Description:
+*
+* Notes:
+*
+*---------------------------------------------------------------------------
+*/
+static int ft1000_release(struct inode *inode, struct file *file)
 {
 	struct ft1000_info *info;
     struct net_device *dev;
@@ -755,10 +763,10 @@
         return (-EBADF);
     }
 
-    // Search for matching file object
+    /* Search for matching file object */
     for (i=0; i<MAX_NUM_APP; i++) {
-        if ( ft1000dev->app_info[i].fileobject == &file->f_owner) {
-            //DEBUG("FT1000:ft1000_ioctl: Message is for AppId = %d\n", ft1000dev->app_info[i].app_id);
+        if (ft1000dev->app_info[i].fileobject == &file->f_owner) {
+            /* DEBUG("FT1000:ft1000_ioctl: Message is for AppId = %d\n", ft1000dev->app_info[i].app_id); */
             break;
         }
     }
@@ -773,11 +781,10 @@
         ft1000_free_buffer(pdpram_blk, &freercvpool);
     }
 
-    // initialize application information
+    /* initialize application information */
     ft1000dev->appcnt--;
     DEBUG("ft1000_chdev:%s:appcnt = %d\n", __FUNCTION__, ft1000dev->appcnt);
     ft1000dev->app_info[i].fileobject = NULL;
 
     return 0;
 }
-

diff --git a/drivers/staging/ft1000/ft1000-usb/ft1000_ioctl.h b/drivers/staging/ft1000/ft1000-usb/ft1000_ioctl.h
index 3f4207f..24b8d77 100644
--- a/drivers/staging/ft1000/ft1000-usb/ft1000_ioctl.h
+++ b/drivers/staging/ft1000/ft1000-usb/ft1000_ioctl.h

@@ -1,91 +1,89 @@
-//---------------------------------------------------------------------------
-// FT1000 driver for Flarion Flash OFDM NIC Device
-//
-// Copyright (C) 2002 Flarion Technologies, All rights reserved.
-//
-// This program is free software; you can redistribute it and/or modify it
-// under the terms of the GNU General Public License as published by the Free
-// Software Foundation; either version 2 of the License, or (at your option) any
-// later version. This program is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-// or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-// more details. You should have received a copy of the GNU General Public
-// License along with this program; if not, write to the
-// Free Software Foundation, Inc., 59 Temple Place -
-// Suite 330, Boston, MA 02111-1307, USA.
-//---------------------------------------------------------------------------
-//
-// File:         ft1000_ioctl.h
-//
-// Description:    Common structures and defines relating to IOCTL
-//
-// History:
-// 11/5/02    Whc                Created.
-//
-//---------------------------------------------------------------------------//---------------------------------------------------------------------------
+/*
+*---------------------------------------------------------------------------
+* FT1000 driver for Flarion Flash OFDM NIC Device
+*
+* Copyright (C) 2002 Flarion Technologies, All rights reserved.
+*
+* This program is free software; you can redistribute it and/or modify it
+* under the terms of the GNU General Public License as published by the Free
+* Software Foundation; either version 2 of the License, or (at your option) any
+* later version. This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+* or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+* more details. You should have received a copy of the GNU General Public
+* License along with this program; if not, write to the
+* Free Software Foundation, Inc., 59 Temple Place -
+* Suite 330, Boston, MA 02111-1307, USA.
+*---------------------------------------------------------------------------
+*
+* File:         ft1000_ioctl.h
+*
+* Description:    Common structures and defines relating to IOCTL
+*
+* History:
+* 11/5/02    Whc                Created.
+*
+*---------------------------------------------------------------------------
+*/
 #ifndef _FT1000IOCTLH_
 #define _FT1000IOCTLH_
 
-typedef struct _IOCTL_GET_VER
-{
+typedef struct _IOCTL_GET_VER {
     unsigned long drv_ver;
 } __attribute__ ((packed)) IOCTL_GET_VER, *PIOCTL_GET_VER;
 
-//Data structure for Dsp statistics
-typedef struct _IOCTL_GET_DSP_STAT
-{
-    unsigned char DspVer[DSPVERSZ];        // DSP version number
-    unsigned char HwSerNum[HWSERNUMSZ];    // Hardware Serial Number
-    unsigned char Sku[SKUSZ];              // SKU
-    unsigned char eui64[EUISZ];            // EUI64
-    unsigned short ConStat;                // Connection Status
-                                //    Bits 0-3 = Connection Status Field
-                                //               0000=Idle (Disconnect)
-                                //               0001=Searching
-                                //               0010=Active (Connected)
-                                //               0011=Waiting for L2 down
-                                //               0100=Sleep
-    unsigned short LedStat;                // Led Status
-                                //    Bits 0-3   = Signal Strength Field
-                                //                 0000 = -105dBm to -92dBm
-                                //                 0001 = -92dBm to -85dBm
-                                //                 0011 = -85dBm to -75dBm
-                                //                 0111 = -75dBm to -50dBm
-                                //                 1111 = -50dBm to 0dBm
-                                //    Bits 4-7   = Reserved
-                                //    Bits 8-11  = SNR Field
-                                //                 0000 = <2dB
-                                //                 0001 = 2dB to 8dB
-                                //                 0011 = 8dB to 15dB
-                                //                 0111 = 15dB to 22dB
-                                //                 1111 = >22dB
-                                //    Bits 12-15 = Reserved
-    unsigned long nTxPkts;                // Number of packets transmitted from host to dsp
-    unsigned long nRxPkts;                // Number of packets received from dsp to host
-    unsigned long nTxBytes;               // Number of bytes transmitted from host to dsp
-    unsigned long nRxBytes;               // Number of bytes received from dsp to host
-    unsigned long ConTm;                  // Current session connection time in seconds
-    unsigned char CalVer[CALVERSZ];       // Proprietary Calibration Version
-    unsigned char CalDate[CALDATESZ];     // Proprietary Calibration Date
+/* Data structure for Dsp statistics */
+typedef struct _IOCTL_GET_DSP_STAT {
+    unsigned char DspVer[DSPVERSZ];        /* DSP version number */
+    unsigned char HwSerNum[HWSERNUMSZ];    /* Hardware Serial Number */
+    unsigned char Sku[SKUSZ];              /* SKU */
+    unsigned char eui64[EUISZ];            /* EUI64 */
+    unsigned short ConStat;                /* Connection Status */
+                                /*    Bits 0-3 = Connection Status Field */
+                                /*               0000=Idle (Disconnect) */
+                                /*               0001=Searching */
+                                /*               0010=Active (Connected) */
+                                /*               0011=Waiting for L2 down */
+                                /*               0100=Sleep */
+    unsigned short LedStat;                /* Led Status */
+                                /*    Bits 0-3   = Signal Strength Field */
+                                /*                 0000 = -105dBm to -92dBm */
+                                /*                 0001 = -92dBm to -85dBm */
+                                /*                 0011 = -85dBm to -75dBm */
+                                /*                 0111 = -75dBm to -50dBm */
+                                /*                 1111 = -50dBm to 0dBm */
+                                /*    Bits 4-7   = Reserved */
+                                /*    Bits 8-11  = SNR Field */
+                                /*                 0000 = <2dB */
+                                /*                 0001 = 2dB to 8dB */
+                                /*                 0011 = 8dB to 15dB */
+                                /*                 0111 = 15dB to 22dB */
+                                /*                 1111 = >22dB */
+                                /*    Bits 12-15 = Reserved */
+    unsigned long nTxPkts;                /* Number of packets transmitted from host to dsp */
+    unsigned long nRxPkts;                /* Number of packets received from dsp to host */
+    unsigned long nTxBytes;               /* Number of bytes transmitted from host to dsp */
+    unsigned long nRxBytes;               /* Number of bytes received from dsp to host */
+    unsigned long ConTm;                  /* Current session connection time in seconds */
+    unsigned char CalVer[CALVERSZ];       /* Proprietary Calibration Version */
+    unsigned char CalDate[CALDATESZ];     /* Proprietary Calibration Date */
 } __attribute__ ((packed)) IOCTL_GET_DSP_STAT, *PIOCTL_GET_DSP_STAT;
 
-//Data structure for Dual Ported RAM messaging between Host and Dsp
-typedef struct _IOCTL_DPRAM_BLK
-{
+/* Data structure for Dual Ported RAM messaging between Host and Dsp */
+typedef struct _IOCTL_DPRAM_BLK {
     unsigned short total_len;
 	struct pseudo_hdr pseudohdr;
     unsigned char buffer[1780];
 } __attribute__ ((packed)) IOCTL_DPRAM_BLK, *PIOCTL_DPRAM_BLK;
 
-typedef struct _IOCTL_DPRAM_COMMAND
-{
+typedef struct _IOCTL_DPRAM_COMMAND {
     unsigned short extra;
     IOCTL_DPRAM_BLK dpram_blk;
 } __attribute__ ((packed)) IOCTL_DPRAM_COMMAND, *PIOCTL_DPRAM_COMMAND;
 
-//
-// Custom IOCTL command codes
-//
+/*
+* Custom IOCTL command codes
+*/
 #define FT1000_MAGIC_CODE      'F'
 
 #define IOCTL_REGISTER_CMD					0
@@ -96,12 +94,12 @@
 #define IOCTL_CONNECT               10
 #define IOCTL_DISCONNECT            11
 
-#define IOCTL_FT1000_GET_DSP_STAT _IOR (FT1000_MAGIC_CODE, IOCTL_GET_DSP_STAT_CMD, sizeof(IOCTL_GET_DSP_STAT) )
-#define IOCTL_FT1000_GET_VER _IOR (FT1000_MAGIC_CODE, IOCTL_GET_VER_CMD, sizeof(IOCTL_GET_VER) )
-#define IOCTL_FT1000_CONNECT _IOW (FT1000_MAGIC_CODE, IOCTL_CONNECT, 0 )
-#define IOCTL_FT1000_DISCONNECT _IOW (FT1000_MAGIC_CODE, IOCTL_DISCONNECT, 0 )
-#define IOCTL_FT1000_SET_DPRAM _IOW (FT1000_MAGIC_CODE, IOCTL_SET_DPRAM_CMD, sizeof(IOCTL_DPRAM_BLK) )
-#define IOCTL_FT1000_GET_DPRAM _IOR (FT1000_MAGIC_CODE, IOCTL_GET_DPRAM_CMD, sizeof(IOCTL_DPRAM_BLK) )
-#define IOCTL_FT1000_REGISTER  _IOW (FT1000_MAGIC_CODE, IOCTL_REGISTER_CMD, sizeof(unsigned short *) )
-#endif // _FT1000IOCTLH_
+#define IOCTL_FT1000_GET_DSP_STAT _IOR(FT1000_MAGIC_CODE, IOCTL_GET_DSP_STAT_CMD, sizeof(IOCTL_GET_DSP_STAT))
+#define IOCTL_FT1000_GET_VER _IOR(FT1000_MAGIC_CODE, IOCTL_GET_VER_CMD, sizeof(IOCTL_GET_VER))
+#define IOCTL_FT1000_CONNECT _IOW(FT1000_MAGIC_CODE, IOCTL_CONNECT, 0)
+#define IOCTL_FT1000_DISCONNECT _IOW(FT1000_MAGIC_CODE, IOCTL_DISCONNECT, 0)
+#define IOCTL_FT1000_SET_DPRAM _IOW(FT1000_MAGIC_CODE, IOCTL_SET_DPRAM_CMD, sizeof(IOCTL_DPRAM_BLK))
+#define IOCTL_FT1000_GET_DPRAM _IOR(FT1000_MAGIC_CODE, IOCTL_GET_DPRAM_CMD, sizeof(IOCTL_DPRAM_BLK))
+#define IOCTL_FT1000_REGISTER  _IOW(FT1000_MAGIC_CODE, IOCTL_REGISTER_CMD, sizeof(unsigned short *))
+#endif /* _FT1000IOCTLH_ */
 

diff --git a/drivers/staging/ft1000/ft1000-usb/ft1000_usb.c b/drivers/staging/ft1000/ft1000-usb/ft1000_usb.c
index 614db55..29a7cd2 100644
--- a/drivers/staging/ft1000/ft1000-usb/ft1000_usb.c
+++ b/drivers/staging/ft1000/ft1000-usb/ft1000_usb.c

@@ -79,8 +79,12 @@
 	ft1000dev->dev = dev;
 	ft1000dev->status = 0;
 	ft1000dev->net = NULL;
-	ft1000dev->tx_urb = usb_alloc_urb(0, GFP_ATOMIC);
-	ft1000dev->rx_urb = usb_alloc_urb(0, GFP_ATOMIC);
+	ft1000dev->tx_urb = usb_alloc_urb(0, GFP_KERNEL);
+	ft1000dev->rx_urb = usb_alloc_urb(0, GFP_KERNEL);
+	if (!ft1000dev->tx_urb || !ft1000dev->rx_urb) {
+		ret = -ENOMEM;
+		goto err_fw;
+	}
 
 	DEBUG("ft1000_probe is called\n");
 	numaltsetting = interface->num_altsetting;
@@ -209,6 +213,8 @@
 err_load:
 	kfree(pFileStart);
 err_fw:
+	usb_free_urb(ft1000dev->rx_urb);
+	usb_free_urb(ft1000dev->tx_urb);
 	kfree(ft1000dev);
 	return ret;
 }

diff --git a/drivers/staging/fwserial/fwserial.c b/drivers/staging/fwserial/fwserial.c
index e5818a1..4e1cd5e 100644
--- a/drivers/staging/fwserial/fwserial.c
+++ b/drivers/staging/fwserial/fwserial.c

@@ -18,6 +18,8 @@
  * Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/device.h>
@@ -101,13 +103,16 @@
 };
 
 #define to_device(a, b)			(a->b)
-#define fwtty_err(p, s, v...)		dev_err(to_device(p, device), s, ##v)
-#define fwtty_info(p, s, v...)		dev_info(to_device(p, device), s, ##v)
-#define fwtty_notice(p, s, v...)	dev_notice(to_device(p, device), s, ##v)
-#define fwtty_dbg(p, s, v...)		\
-		dev_dbg(to_device(p, device), "%s: " s, __func__, ##v)
-#define fwtty_err_ratelimited(p, s, v...) \
-		dev_err_ratelimited(to_device(p, device), s, ##v)
+#define fwtty_err(p, fmt, ...)						\
+	dev_err(to_device(p, device), fmt, ##__VA_ARGS__)
+#define fwtty_info(p, fmt, ...)						\
+	dev_info(to_device(p, device), fmt, ##__VA_ARGS__)
+#define fwtty_notice(p, fmt, ...)					\
+	dev_notice(to_device(p, device), fmt, ##__VA_ARGS__)
+#define fwtty_dbg(p, fmt, ...)						\
+	dev_dbg(to_device(p, device), "%s: " fmt, __func__, ##__VA_ARGS__)
+#define fwtty_err_ratelimited(p, fmt, ...)				\
+	dev_err_ratelimited(to_device(p, device), fmt, ##__VA_ARGS__)
 
 #ifdef DEBUG
 static inline void debug_short_write(struct fwtty_port *port, int c, int n)
@@ -118,7 +123,7 @@
 		spin_lock_bh(&port->lock);
 		avail = dma_fifo_avail(&port->tx_fifo);
 		spin_unlock_bh(&port->lock);
-		fwtty_dbg(port, "short write: avail:%d req:%d wrote:%d",
+		fwtty_dbg(port, "short write: avail:%d req:%d wrote:%d\n",
 			  avail, c, n);
 	}
 }
@@ -197,22 +202,22 @@
 {
 	switch (rcode) {
 	case RCODE_SEND_ERROR:
-		fwtty_err_ratelimited(port, "card busy");
+		fwtty_err_ratelimited(port, "card busy\n");
 		break;
 	case RCODE_ADDRESS_ERROR:
-		fwtty_err_ratelimited(port, "bad unit addr or write length");
+		fwtty_err_ratelimited(port, "bad unit addr or write length\n");
 		break;
 	case RCODE_DATA_ERROR:
-		fwtty_err_ratelimited(port, "failed rx");
+		fwtty_err_ratelimited(port, "failed rx\n");
 		break;
 	case RCODE_NO_ACK:
-		fwtty_err_ratelimited(port, "missing ack");
+		fwtty_err_ratelimited(port, "missing ack\n");
 		break;
 	case RCODE_BUSY:
-		fwtty_err_ratelimited(port, "remote busy");
+		fwtty_err_ratelimited(port, "remote busy\n");
 		break;
 	default:
-		fwtty_err_ratelimited(port, "failed tx: %d", rcode);
+		fwtty_err_ratelimited(port, "failed tx: %d\n", rcode);
 	}
 }
 
@@ -287,7 +292,7 @@
 		schedule_delayed_work(&port->drain, 0);
 	avail = dma_fifo_avail(&port->tx_fifo);
 
-	fwtty_dbg(port, "fifo len: %d avail: %d", len, avail);
+	fwtty_dbg(port, "fifo len: %d avail: %d\n", len, avail);
 }
 
 static void fwtty_restart_tx(struct fwtty_port *port)
@@ -323,7 +328,7 @@
 	if (delta & TIOCM_CTS)
 		++port->icount.cts;
 
-	fwtty_dbg(port, "status: %x delta: %x", status, delta);
+	fwtty_dbg(port, "status: %x delta: %x\n", status, delta);
 
 	if (delta & TIOCM_CAR) {
 		tty = tty_port_tty_get(&port->port);
@@ -509,7 +514,7 @@
 	n = (elapsed * port->cps) / HZ + 1;
 	port->break_last = now;
 
-	fwtty_dbg(port, "sending %d brks", n);
+	fwtty_dbg(port, "sending %d brks\n", n);
 
 	while (n) {
 		t = min(n, 16);
@@ -570,7 +575,7 @@
 	size_t size = (n + sizeof(struct buffered_rx) + 0xFF) & ~0xFF;
 
 	if (port->buffered + n > HIGH_WATERMARK) {
-		fwtty_err_ratelimited(port, "overflowed rx buffer: buffered: %d new: %zu wtrmk: %d",
+		fwtty_err_ratelimited(port, "overflowed rx buffer: buffered: %d new: %zu wtrmk: %d\n",
 				      port->buffered, n, HIGH_WATERMARK);
 		return 0;
 	}
@@ -599,7 +604,7 @@
 	unsigned lsr;
 	int err = 0;
 
-	fwtty_dbg(port, "%d", n);
+	fwtty_dbg(port, "%d\n", n);
 	profile_size_distrib(port->stats.reads, n);
 
 	if (port->write_only) {
@@ -689,7 +694,7 @@
 	rcu_read_unlock();
 	if (!peer || peer != rcu_access_pointer(port->peer)) {
 		rcode = RCODE_ADDRESS_ERROR;
-		fwtty_err_ratelimited(port, "ignoring unauthenticated data");
+		fwtty_err_ratelimited(port, "ignoring unauthenticated data\n");
 		goto respond;
 	}
 
@@ -746,7 +751,7 @@
 	struct fwtty_port *port = txn->port;
 	int len;
 
-	fwtty_dbg(port, "rcode: %d", rcode);
+	fwtty_dbg(port, "rcode: %d\n", rcode);
 
 	switch (rcode) {
 	case RCODE_COMPLETE:
@@ -809,7 +814,7 @@
 		n = dma_fifo_out_pend(&port->tx_fifo, &txn->dma_pended);
 		spin_unlock_bh(&port->lock);
 
-		fwtty_dbg(port, "out: %u rem: %d", txn->dma_pended.len, n);
+		fwtty_dbg(port, "out: %u rem: %d\n", txn->dma_pended.len, n);
 
 		if (n < 0) {
 			kmem_cache_free(fwtty_txn_cache, txn);
@@ -819,7 +824,8 @@
 				profile_size_distrib(port->stats.txns, 0);
 			else {
 				++port->stats.fifo_errs;
-				fwtty_err_ratelimited(port, "fifo err: %d", n);
+				fwtty_err_ratelimited(port, "fifo err: %d\n",
+						      n);
 			}
 			break;
 		}
@@ -877,7 +883,7 @@
 
 	++port->stats.xchars;
 
-	fwtty_dbg(port, "%02x", ch);
+	fwtty_dbg(port, "%02x\n", ch);
 
 	rcu_read_lock();
 	peer = rcu_dereference(port->peer);
@@ -964,7 +970,7 @@
 {
 	struct fwtty_port *port = to_port(tty_port, port);
 
-	fwtty_dbg(port, "on/off: %d", on);
+	fwtty_dbg(port, "on/off: %d\n", on);
 
 	spin_lock_bh(&port->lock);
 	/* Don't change carrier state if this is a console */
@@ -992,7 +998,7 @@
 
 	rc = (port->mstatus & TIOCM_CAR);
 
-	fwtty_dbg(port, "%d", rc);
+	fwtty_dbg(port, "%d\n", rc);
 
 	return rc;
 }
@@ -1177,7 +1183,7 @@
 	struct fwtty_port *port = tty->driver_data;
 	int n, len;
 
-	fwtty_dbg(port, "%d", c);
+	fwtty_dbg(port, "%d\n", c);
 	profile_size_distrib(port->stats.writes, c);
 
 	spin_lock_bh(&port->lock);
@@ -1204,7 +1210,7 @@
 	n = dma_fifo_avail(&port->tx_fifo);
 	spin_unlock_bh(&port->lock);
 
-	fwtty_dbg(port, "%d", n);
+	fwtty_dbg(port, "%d\n", n);
 
 	return n;
 }
@@ -1218,7 +1224,7 @@
 	n = dma_fifo_level(&port->tx_fifo);
 	spin_unlock_bh(&port->lock);
 
-	fwtty_dbg(port, "%d", n);
+	fwtty_dbg(port, "%d\n", n);
 
 	return n;
 }
@@ -1227,7 +1233,7 @@
 {
 	struct fwtty_port *port = tty->driver_data;
 
-	fwtty_dbg(port, "%02x", ch);
+	fwtty_dbg(port, "%02x\n", ch);
 
 	fwtty_write_xchar(port, ch);
 }
@@ -1254,7 +1260,7 @@
 {
 	struct fwtty_port *port = tty->driver_data;
 
-	fwtty_dbg(port, "CRTSCTS: %d", (C_CRTSCTS(tty) != 0));
+	fwtty_dbg(port, "CRTSCTS: %d\n", (C_CRTSCTS(tty) != 0));
 
 	profile_fifo_avail(port, port->stats.unthrottle);
 
@@ -1409,7 +1415,7 @@
 	struct fwtty_port *port = tty->driver_data;
 	long ret;
 
-	fwtty_dbg(port, "%d", state);
+	fwtty_dbg(port, "%d\n", state);
 
 	if (state == -1) {
 		set_bit(STOP_TX, &port->flags);
@@ -1446,7 +1452,7 @@
 	tiocm = (port->mctrl & MCTRL_MASK) | (port->mstatus & ~MCTRL_MASK);
 	spin_unlock_bh(&port->lock);
 
-	fwtty_dbg(port, "%x", tiocm);
+	fwtty_dbg(port, "%x\n", tiocm);
 
 	return tiocm;
 }
@@ -1455,7 +1461,7 @@
 {
 	struct fwtty_port *port = tty->driver_data;
 
-	fwtty_dbg(port, "set: %x clear: %x", set, clear);
+	fwtty_dbg(port, "set: %x clear: %x\n", set, clear);
 
 	/* TODO: simulate loopback if TIOCM_LOOP set */
 
@@ -1775,7 +1781,7 @@
 	if (port->port.console && port->fwcon_ops->notify != NULL)
 		(*port->fwcon_ops->notify)(FWCON_NOTIFY_ATTACH, port->con_data);
 
-	fwtty_info(&peer->unit, "peer (guid:%016llx) connected on %s",
+	fwtty_info(&peer->unit, "peer (guid:%016llx) connected on %s\n",
 		   (unsigned long long)peer->guid, dev_name(port->device));
 }
 
@@ -1797,7 +1803,7 @@
 					   pkt, be16_to_cpu(pkt->hdr.len));
 		if (rcode == RCODE_BUSY || rcode == RCODE_SEND_ERROR ||
 		    rcode == RCODE_GENERATION) {
-			fwtty_dbg(&peer->unit, "mgmt write error: %d", rcode);
+			fwtty_dbg(&peer->unit, "mgmt write error: %d\n", rcode);
 			continue;
 		} else
 			break;
@@ -1918,7 +1924,7 @@
 
 	port = fwserial_find_port(peer);
 	if (!port) {
-		fwtty_err(&peer->unit, "avail ports in use");
+		fwtty_err(&peer->unit, "avail ports in use\n");
 		err = -EBUSY;
 		goto free_pkt;
 	}
@@ -2056,7 +2062,7 @@
 		 * has created its remote unit device before this driver has
 		 * been probed for any unit devices...
 		 */
-		fwtty_err(card, "unknown card (guid %016llx)",
+		fwtty_err(card, "unknown card (guid %016llx)\n",
 			  (unsigned long long) card->guid);
 		return NULL;
 	}
@@ -2084,8 +2090,8 @@
 	list_for_each_entry_rcu(peer, &serial->peer_list, list) {
 		int g = peer->generation;
 		smp_rmb();
-		fwtty_dbg(card, "peer(%d:%x) guid: %016llx\n", g,
-			  peer->node_id, (unsigned long long) peer->guid);
+		fwtty_dbg(card, "peer(%d:%x) guid: %016llx\n",
+			  g, peer->node_id, (unsigned long long) peer->guid);
 	}
 }
 #else
@@ -2173,7 +2179,7 @@
 	peer->serial = serial;
 	list_add_rcu(&peer->list, &serial->peer_list);
 
-	fwtty_info(&peer->unit, "peer added (guid:%016llx)",
+	fwtty_info(&peer->unit, "peer added (guid:%016llx)\n",
 		   (unsigned long long)peer->guid);
 
 	/* identify the local unit & virt cable to loopback port */
@@ -2236,7 +2242,7 @@
 
 	list_del_rcu(&peer->list);
 
-	fwtty_info(&peer->unit, "peer removed (guid:%016llx)",
+	fwtty_info(&peer->unit, "peer removed (guid:%016llx)\n",
 		   (unsigned long long)peer->guid);
 
 	spin_unlock_bh(&peer->lock);
@@ -2324,7 +2330,7 @@
 
 	err = fwtty_ports_add(serial);
 	if (err) {
-		fwtty_err(&unit, "no space in port table");
+		fwtty_err(&unit, "no space in port table\n");
 		goto free_ports;
 	}
 
@@ -2335,7 +2341,8 @@
 						   card->device);
 		if (IS_ERR(tty_dev)) {
 			err = PTR_ERR(tty_dev);
-			fwtty_err(&unit, "register tty device error (%d)", err);
+			fwtty_err(&unit, "register tty device error (%d)\n",
+				  err);
 			goto unregister_ttys;
 		}
 
@@ -2352,7 +2359,8 @@
 						    card->device);
 		if (IS_ERR(loop_dev)) {
 			err = PTR_ERR(loop_dev);
-			fwtty_err(&unit, "create loop device failed (%d)", err);
+			fwtty_err(&unit, "create loop device failed (%d)\n",
+				  err);
 			goto unregister_ttys;
 		}
 		serial->ports[j]->device = loop_dev;
@@ -2372,14 +2380,14 @@
 
 	list_add_rcu(&serial->list, &fwserial_list);
 
-	fwtty_notice(&unit, "TTY over FireWire on device %s (guid %016llx)",
+	fwtty_notice(&unit, "TTY over FireWire on device %s (guid %016llx)\n",
 		     dev_name(card->device), (unsigned long long) card->guid);
 
 	err = fwserial_add_peer(serial, unit);
 	if (!err)
 		return 0;
 
-	fwtty_err(&unit, "unable to add peer unit device (%d)", err);
+	fwtty_err(&unit, "unable to add peer unit device (%d)\n", err);
 
 	/* fall-through to error processing */
 	debugfs_remove_recursive(serial->debugfs);
@@ -2621,7 +2629,7 @@
 	switch (peer->state) {
 	case FWPS_NOT_ATTACHED:
 		if (!port) {
-			fwtty_err(&peer->unit, "no more ports avail");
+			fwtty_err(&peer->unit, "no more ports avail\n");
 			fill_plug_rsp_nack(pkt);
 		} else {
 			peer->port = port;
@@ -2663,7 +2671,7 @@
 			fwtty_write_port_status(tmp);
 			spin_lock_bh(&peer->lock);
 		} else {
-			fwtty_err(&peer->unit, "PLUG_RSP error (%d)", rcode);
+			fwtty_err(&peer->unit, "PLUG_RSP error (%d)\n", rcode);
 			port = peer_revert_state(peer);
 		}
 	}
@@ -2715,7 +2723,8 @@
 	spin_lock_bh(&peer->lock);
 	if (peer->state == FWPS_UNPLUG_RESPONDING) {
 		if (rcode != RCODE_COMPLETE)
-			fwtty_err(&peer->unit, "UNPLUG_RSP error (%d)", rcode);
+			fwtty_err(&peer->unit, "UNPLUG_RSP error (%d)\n",
+				  rcode);
 		port = peer_revert_state(peer);
 	}
 cleanup:
@@ -2750,19 +2759,19 @@
 		 * already removed from the bus -- and the removal was
 		 * processed before we rec'd this transaction
 		 */
-		fwtty_err(&peer->unit, "peer already removed");
+		fwtty_err(&peer->unit, "peer already removed\n");
 		spin_unlock_bh(&peer->lock);
 		return RCODE_ADDRESS_ERROR;
 	}
 
 	rcode = RCODE_COMPLETE;
 
-	fwtty_dbg(&peer->unit, "mgmt: hdr.code: %04hx", pkt->hdr.code);
+	fwtty_dbg(&peer->unit, "mgmt: hdr.code: %04hx\n", pkt->hdr.code);
 
 	switch (be16_to_cpu(pkt->hdr.code) & FWSC_CODE_MASK) {
 	case FWSC_VIRT_CABLE_PLUG:
 		if (work_pending(&peer->work)) {
-			fwtty_err(&peer->unit, "plug req: busy");
+			fwtty_err(&peer->unit, "plug req: busy\n");
 			rcode = RCODE_CONFLICT_ERROR;
 
 		} else {
@@ -2777,7 +2786,7 @@
 			rcode = RCODE_CONFLICT_ERROR;
 
 		} else if (be16_to_cpu(pkt->hdr.code) & FWSC_RSP_NACK) {
-			fwtty_notice(&peer->unit, "NACK plug rsp");
+			fwtty_notice(&peer->unit, "NACK plug rsp\n");
 			port = peer_revert_state(peer);
 
 		} else {
@@ -2793,7 +2802,7 @@
 
 	case FWSC_VIRT_CABLE_UNPLUG:
 		if (work_pending(&peer->work)) {
-			fwtty_err(&peer->unit, "unplug req: busy");
+			fwtty_err(&peer->unit, "unplug req: busy\n");
 			rcode = RCODE_CONFLICT_ERROR;
 		} else {
 			PREPARE_WORK(&peer->work, fwserial_handle_unplug_req);
@@ -2806,14 +2815,14 @@
 			rcode = RCODE_CONFLICT_ERROR;
 		else {
 			if (be16_to_cpu(pkt->hdr.code) & FWSC_RSP_NACK)
-				fwtty_notice(&peer->unit, "NACK unplug?");
+				fwtty_notice(&peer->unit, "NACK unplug?\n");
 			port = peer_revert_state(peer);
 			reset = true;
 		}
 		break;
 
 	default:
-		fwtty_err(&peer->unit, "unknown mgmt code %d",
+		fwtty_err(&peer->unit, "unknown mgmt code %d\n",
 			  be16_to_cpu(pkt->hdr.code));
 		rcode = RCODE_DATA_ERROR;
 	}
@@ -2847,7 +2856,7 @@
 	rcu_read_lock();
 	peer = __fwserial_peer_by_node_id(card, generation, source);
 	if (!peer) {
-		fwtty_dbg(card, "peer(%d:%x) not found", generation, source);
+		fwtty_dbg(card, "peer(%d:%x) not found\n", generation, source);
 		__dump_peer_list(card);
 		rcode = RCODE_CONFLICT_ERROR;
 
@@ -2897,7 +2906,7 @@
 
 	err = tty_register_driver(fwtty_driver);
 	if (err) {
-		driver_err("register tty driver failed (%d)", err);
+		pr_err("register tty driver failed (%d)\n", err);
 		goto put_tty;
 	}
 
@@ -2922,7 +2931,7 @@
 
 		err = tty_register_driver(fwloop_driver);
 		if (err) {
-			driver_err("register loop driver failed (%d)", err);
+			pr_err("register loop driver failed (%d)\n", err);
 			goto put_loop;
 		}
 	}
@@ -2948,7 +2957,7 @@
 	err = fw_core_add_address_handler(&fwserial_mgmt_addr_handler,
 					  &fwserial_mgmt_addr_region);
 	if (err) {
-		driver_err("add management handler failed (%d)", err);
+		pr_err("add management handler failed (%d)\n", err);
 		goto destroy_cache;
 	}
 
@@ -2956,13 +2965,13 @@
 		FW_UNIT_ADDRESS(fwserial_mgmt_addr_handler.offset);
 	err = fw_core_add_descriptor(&fwserial_unit_directory);
 	if (err) {
-		driver_err("add unit descriptor failed (%d)", err);
+		pr_err("add unit descriptor failed (%d)\n", err);
 		goto remove_handler;
 	}
 
 	err = driver_register(&fwserial_driver.driver);
 	if (err) {
-		driver_err("register fwserial driver failed (%d)", err);
+		pr_err("register fwserial driver failed (%d)\n", err);
 		goto remove_descriptor;
 	}
 

diff --git a/drivers/staging/fwserial/fwserial.h b/drivers/staging/fwserial/fwserial.h
index 514f571..2463501 100644
--- a/drivers/staging/fwserial/fwserial.h
+++ b/drivers/staging/fwserial/fwserial.h

@@ -356,8 +356,6 @@
 
 extern struct tty_driver *fwtty_driver;
 
-#define driver_err(s, v...)	pr_err(KBUILD_MODNAME ": " s, ##v)
-
 struct fwtty_port *fwtty_port_get(unsigned index);
 void fwtty_port_put(struct fwtty_port *port);
 

diff --git a/drivers/staging/gdm72xx/Kconfig b/drivers/staging/gdm72xx/Kconfig
index 6905913..dd8a391 100644
--- a/drivers/staging/gdm72xx/Kconfig
+++ b/drivers/staging/gdm72xx/Kconfig

@@ -4,7 +4,7 @@
 
 menuconfig WIMAX_GDM72XX
 	tristate "GCT GDM72xx WiMAX support"
-	depends on NET
+	depends on NET && (USB || MMC)
 	help
 	  Support for the GCT GDM72xx WiMAX chip
 
@@ -19,7 +19,7 @@
 	default n
 
 config WIMAX_GDM72XX_WIMAX2
-	bool "Enable WIMAX2 support"
+	bool "Enable WiMAX2 support"
 	default n
 
 choice
@@ -27,18 +27,18 @@
 
 config WIMAX_GDM72XX_USB
 	bool "USB interface"
-	depends on USB
+	depends on (USB = y || USB = WIMAX_GDM72XX)
 
 config WIMAX_GDM72XX_SDIO
 	bool "SDIO interface"
-	depends on MMC
+	depends on (MMC = y || MMC = WIMAX_GDM72XX)
 
 endchoice
 
 if WIMAX_GDM72XX_USB
 
 config WIMAX_GDM72XX_USB_PM
-	bool "Enable power managerment support"
+	bool "Enable power management support"
 	depends on PM_RUNTIME
 
 endif # WIMAX_GDM72XX_USB

diff --git a/drivers/staging/gdm72xx/gdm_wimax.c b/drivers/staging/gdm72xx/gdm_wimax.c
index 41efbee..dd85497 100644
--- a/drivers/staging/gdm72xx/gdm_wimax.c
+++ b/drivers/staging/gdm72xx/gdm_wimax.c

@@ -939,8 +939,7 @@
 	struct net_device *dev;
 	int ret;
 
-	dev = (struct net_device *)alloc_netdev(sizeof(*nic),
-						"wm%d", ether_setup);
+	dev = alloc_netdev(sizeof(*nic), "wm%d", ether_setup);
 
 	if (dev == NULL) {
 		pr_err("alloc_etherdev failed\n");

diff --git a/drivers/staging/goldfish/goldfish_audio.c b/drivers/staging/goldfish/goldfish_audio.c
index d3bed21f..f96dcec 100644
--- a/drivers/staging/goldfish/goldfish_audio.c
+++ b/drivers/staging/goldfish/goldfish_audio.c

@@ -1,4 +1,5 @@
-/* drivers/misc/goldfish_audio.c
+/*
+ * drivers/misc/goldfish_audio.c
  *
  * Copyright (C) 2007 Google, Inc.
  * Copyright (C) 2012 Intel, Inc.
@@ -47,10 +48,11 @@
 	int read_supported;         /* true if we have audio input support */
 };
 
-/* We will allocate two read buffers and two write buffers.
-   Having two read buffers facilitate stereo -> mono conversion.
-   Having two write buffers facilitate interleaved IO.
-*/
+/*
+ *  We will allocate two read buffers and two write buffers.
+ *  Having two read buffers facilitate stereo -> mono conversion.
+ *  Having two write buffers facilitate interleaved IO.
+ */
 #define READ_BUFFER_SIZE        16384
 #define WRITE_BUFFER_SIZE       16384
 #define COMBINED_BUFFER_SIZE    ((2 * READ_BUFFER_SIZE) + \
@@ -59,8 +61,10 @@
 #define AUDIO_READ(data, addr)		(readl(data->reg_base + addr))
 #define AUDIO_WRITE(data, addr, x)	(writel(x, data->reg_base + addr))
 
-/* temporary variable used between goldfish_audio_probe() and
-   goldfish_audio_open() */
+/*
+ *  temporary variable used between goldfish_audio_probe() and
+ *  goldfish_audio_open()
+ */
 static struct goldfish_audio *audio_data;
 
 enum {
@@ -161,8 +165,10 @@
 		}
 
 		spin_lock_irqsave(&data->lock, irq_flags);
-		/* clear the buffer empty flag, and signal the emulator
-		 * to start writing the buffer */
+		/*
+		 *  clear the buffer empty flag, and signal the emulator
+		 *  to start writing the buffer
+		 */
 		if (kbuf == data->write_buffer1) {
 			data->buffer_status &= ~AUDIO_INT_WRITE_BUFFER_1_EMPTY;
 			AUDIO_WRITE(data, AUDIO_WRITE_BUFFER_1, copy);
@@ -225,8 +231,10 @@
 	/* read buffer status flags */
 	status = AUDIO_READ(data, AUDIO_INT_STATUS);
 	status &= AUDIO_INT_MASK;
-	/* if buffers are newly empty, wake up blocked
-	   goldfish_audio_write() call */
+	/*
+	 *  if buffers are newly empty, wake up blocked
+	 *  goldfish_audio_write() call
+	 */
 	if (status) {
 		data->buffer_status = status;
 		wake_up(&data->wait);

diff --git a/drivers/staging/goldfish/goldfish_nand.c b/drivers/staging/goldfish/goldfish_nand.c
index ab1f019..81e2ad4 100644
--- a/drivers/staging/goldfish/goldfish_nand.c
+++ b/drivers/staging/goldfish/goldfish_nand.c

@@ -326,9 +326,10 @@
 			(mtd->writesize + mtd->oobsize) * mtd->writesize;
 	do_div(mtd->size, mtd->writesize + mtd->oobsize);
 	mtd->size *= mtd->writesize;
-	dev_dbg(&pdev->dev, 
+	dev_dbg(&pdev->dev,
 		"goldfish nand dev%d: size %llx, page %d, extra %d, erase %d\n",
-		       id, mtd->size, mtd->writesize, mtd->oobsize, mtd->erasesize);
+		       id, mtd->size, mtd->writesize,
+		       mtd->oobsize, mtd->erasesize);
 	spin_unlock_irqrestore(&nand->lock, irq_flags);
 
 	mtd->priv = nand;
@@ -340,7 +341,7 @@
 	result = goldfish_nand_cmd(mtd, NAND_CMD_GET_DEV_NAME, 0, name_len,
 									name);
 	if (result != name_len) {
-		dev_err(&pdev->dev, 
+		dev_err(&pdev->dev,
 			"goldfish_nand_init_device failed to get dev name %d != %d\n",
 			       result, name_len);
 		return -ENODEV;
@@ -391,7 +392,7 @@
 
 	version = readl(base + NAND_VERSION);
 	if (version != NAND_VERSION_CURRENT) {
-		dev_err(&pdev->dev, 
+		dev_err(&pdev->dev,
 			"goldfish_nand_init: version mismatch, got %d, expected %d\n",
 				version, NAND_VERSION_CURRENT);
 		return -ENODEV;
@@ -400,7 +401,7 @@
 	if (num_dev == 0)
 		return -ENODEV;
 
-	nand = devm_kzalloc(&pdev->dev, sizeof(*nand) + 
+	nand = devm_kzalloc(&pdev->dev, sizeof(*nand) +
 				sizeof(struct mtd_info) * num_dev, GFP_KERNEL);
 	if (nand == NULL)
 		return -ENOMEM;

diff --git a/drivers/staging/goldfish/goldfish_nand_reg.h b/drivers/staging/goldfish/goldfish_nand_reg.h
index 956c6c3..ddfda71 100644
--- a/drivers/staging/goldfish/goldfish_nand_reg.h
+++ b/drivers/staging/goldfish/goldfish_nand_reg.h

@@ -1,27 +1,30 @@
-/* drivers/mtd/devices/goldfish_nand_reg.h
-**
-** Copyright (C) 2007 Google, Inc.
-**
-** This software is licensed under the terms of the GNU General Public
-** License version 2, as published by the Free Software Foundation, and
-** may be copied, distributed, and modified under those terms.
-**
-** This program is distributed in the hope that it will be useful,
-** but WITHOUT ANY WARRANTY; without even the implied warranty of
-** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-** GNU General Public License for more details.
-**
-*/
+/*
+ * drivers/mtd/devices/goldfish_nand_reg.h
+ *
+ * Copyright (C) 2007 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
 
 #ifndef GOLDFISH_NAND_REG_H
 #define GOLDFISH_NAND_REG_H
 
 enum nand_cmd {
-	NAND_CMD_GET_DEV_NAME,  /* Write device name for NAND_DEV to NAND_DATA (vaddr) */
+	/* Write device name for NAND_DEV to NAND_DATA (vaddr) */
+	NAND_CMD_GET_DEV_NAME,
 	NAND_CMD_READ,
 	NAND_CMD_WRITE,
 	NAND_CMD_ERASE,
-	NAND_CMD_BLOCK_BAD_GET, /* NAND_RESULT is 1 if block is bad, 0 if it is not */
+	/* NAND_RESULT is 1 if block is bad, 0 if it is not */
+	NAND_CMD_BLOCK_BAD_GET,
 	NAND_CMD_BLOCK_BAD_SET,
 	NAND_CMD_READ_WITH_PARAMS,
 	NAND_CMD_WRITE_WITH_PARAMS,

diff --git a/drivers/staging/iio/adc/ad7192.c b/drivers/staging/iio/adc/ad7192.c
index 5047019..3283e282 100644
--- a/drivers/staging/iio/adc/ad7192.c
+++ b/drivers/staging/iio/adc/ad7192.c

@@ -326,7 +326,7 @@
 	unsigned long lval;
 	int div, ret;
 
-	ret = strict_strtoul(buf, 10, &lval);
+	ret = kstrtoul(buf, 10, &lval);
 	if (ret)
 		return ret;
 	if (lval == 0)

diff --git a/drivers/staging/iio/adc/ad7280a.c b/drivers/staging/iio/adc/ad7280a.c
index 2fd6ee3..c19618b 100644
--- a/drivers/staging/iio/adc/ad7280a.c
+++ b/drivers/staging/iio/adc/ad7280a.c

@@ -632,7 +632,7 @@
 	long val;
 	int ret;
 
-	ret = strict_strtol(buf, 10, &val);
+	ret = kstrtol(buf, 10, &val);
 	if (ret)
 		return ret;
 

diff --git a/drivers/staging/iio/adc/ad7291.c b/drivers/staging/iio/adc/ad7291.c
index d088c66..3fc79e5 100644
--- a/drivers/staging/iio/adc/ad7291.c
+++ b/drivers/staging/iio/adc/ad7291.c

@@ -21,6 +21,8 @@
 #include <linux/iio/sysfs.h>
 #include <linux/iio/events.h>
 
+#include "ad7291.h"
+
 /*
  * Simplified handling
  *
@@ -39,33 +41,9 @@
 #define AD7291_VOLTAGE			0x01
 #define AD7291_T_SENSE			0x02
 #define AD7291_T_AVERAGE		0x03
-#define AD7291_CH0_DATA_HIGH		0x04
-#define AD7291_CH0_DATA_LOW		0x05
-#define AD7291_CH0_HYST			0x06
-#define AD7291_CH1_DATA_HIGH		0x07
-#define AD7291_CH1_DATA_LOW		0x08
-#define AD7291_CH1_HYST			0x09
-#define AD7291_CH2_DATA_HIGH		0x0A
-#define AD7291_CH2_DATA_LOW		0x0B
-#define AD7291_CH2_HYST			0x0C
-#define AD7291_CH3_DATA_HIGH		0x0D
-#define AD7291_CH3_DATA_LOW		0x0E
-#define AD7291_CH3_HYST			0x0F
-#define AD7291_CH4_DATA_HIGH		0x10
-#define AD7291_CH4_DATA_LOW		0x11
-#define AD7291_CH4_HYST			0x12
-#define AD7291_CH5_DATA_HIGH		0x13
-#define AD7291_CH5_DATA_LOW		0x14
-#define AD7291_CH5_HYST			0x15
-#define AD7291_CH6_DATA_HIGH		0x16
-#define AD7291_CH6_DATA_LOW		0x17
-#define AD7291_CH6_HYST			0x18
-#define AD7291_CH7_DATA_HIGH		0x19
-#define AD7291_CH7_DATA_LOW		0x1A
-#define AD7291_CH7_HYST			0x2B
-#define AD7291_T_SENSE_HIGH		0x1C
-#define AD7291_T_SENSE_LOW		0x1D
-#define AD7291_T_SENSE_HYST		0x1E
+#define AD7291_DATA_HIGH(x)		((x) * 3 + 0x4)
+#define AD7291_DATA_LOW(x)		((x) * 3 + 0x5)
+#define AD7291_HYST(x)			((x) * 3 + 0x6)
 #define AD7291_VOLTAGE_ALERT_STATUS	0x1F
 #define AD7291_T_ALERT_STATUS		0x20
 
@@ -100,7 +78,6 @@
 struct ad7291_chip_info {
 	struct i2c_client	*client;
 	struct regulator	*reg;
-	u16			int_vref_mv;
 	u16			command;
 	u16			c_mask;	/* Active voltage channels for events */
 	struct mutex		state_lock;
@@ -111,45 +88,22 @@
 	struct i2c_client *client = chip->client;
 	int ret = 0;
 
-	ret = i2c_smbus_read_word_data(client, reg);
+	ret = i2c_smbus_read_word_swapped(client, reg);
 	if (ret < 0) {
 		dev_err(&client->dev, "I2C read error\n");
 		return ret;
 	}
 
-	*data = swab16((u16)ret);
+	*data = ret;
 
 	return 0;
 }
 
 static int ad7291_i2c_write(struct ad7291_chip_info *chip, u8 reg, u16 data)
 {
-	return i2c_smbus_write_word_data(chip->client, reg, swab16(data));
+	return i2c_smbus_write_word_swapped(chip->client, reg, data);
 }
 
-static ssize_t ad7291_store_reset(struct device *dev,
-		struct device_attribute *attr,
-		const char *buf,
-		size_t len)
-{
-	struct iio_dev *indio_dev = dev_to_iio_dev(dev);
-	struct ad7291_chip_info *chip = iio_priv(indio_dev);
-
-	return ad7291_i2c_write(chip, AD7291_COMMAND,
-				chip->command | AD7291_RESET);
-}
-
-static IIO_DEVICE_ATTR(reset, S_IWUSR, NULL, ad7291_store_reset, 0);
-
-static struct attribute *ad7291_attributes[] = {
-	&iio_dev_attr_reset.dev_attr.attr,
-	NULL,
-};
-
-static const struct attribute_group ad7291_attribute_group = {
-	.attrs = ad7291_attributes,
-};
-
 static irqreturn_t ad7291_event_handler(int irq, void *private)
 {
 	struct iio_dev *indio_dev = private;
@@ -255,31 +209,31 @@
 static IIO_DEVICE_ATTR(in_temp0_thresh_both_hyst_raw,
 		       S_IRUGO | S_IWUSR,
 		       ad7291_show_hyst, ad7291_set_hyst,
-		       AD7291_T_SENSE_HYST);
+		       AD7291_HYST(8));
 static IIO_DEVICE_ATTR(in_voltage0_thresh_both_hyst_raw,
 		       S_IRUGO | S_IWUSR,
-		       ad7291_show_hyst, ad7291_set_hyst, AD7291_CH0_HYST);
+		       ad7291_show_hyst, ad7291_set_hyst, AD7291_HYST(0));
 static IIO_DEVICE_ATTR(in_voltage1_thresh_both_hyst_raw,
 		       S_IRUGO | S_IWUSR,
-		       ad7291_show_hyst, ad7291_set_hyst, AD7291_CH1_HYST);
+		       ad7291_show_hyst, ad7291_set_hyst, AD7291_HYST(1));
 static IIO_DEVICE_ATTR(in_voltage2_thresh_both_hyst_raw,
 		       S_IRUGO | S_IWUSR,
-		       ad7291_show_hyst, ad7291_set_hyst, AD7291_CH2_HYST);
+		       ad7291_show_hyst, ad7291_set_hyst, AD7291_HYST(2));
 static IIO_DEVICE_ATTR(in_voltage3_thresh_both_hyst_raw,
 		       S_IRUGO | S_IWUSR,
-		       ad7291_show_hyst, ad7291_set_hyst, AD7291_CH3_HYST);
+		       ad7291_show_hyst, ad7291_set_hyst, AD7291_HYST(3));
 static IIO_DEVICE_ATTR(in_voltage4_thresh_both_hyst_raw,
 		       S_IRUGO | S_IWUSR,
-		       ad7291_show_hyst, ad7291_set_hyst, AD7291_CH4_HYST);
+		       ad7291_show_hyst, ad7291_set_hyst, AD7291_HYST(4));
 static IIO_DEVICE_ATTR(in_voltage5_thresh_both_hyst_raw,
 		       S_IRUGO | S_IWUSR,
-		       ad7291_show_hyst, ad7291_set_hyst, AD7291_CH5_HYST);
+		       ad7291_show_hyst, ad7291_set_hyst, AD7291_HYST(5));
 static IIO_DEVICE_ATTR(in_voltage6_thresh_both_hyst_raw,
 		       S_IRUGO | S_IWUSR,
-		       ad7291_show_hyst, ad7291_set_hyst, AD7291_CH6_HYST);
+		       ad7291_show_hyst, ad7291_set_hyst, AD7291_HYST(6));
 static IIO_DEVICE_ATTR(in_voltage7_thresh_both_hyst_raw,
 		       S_IRUGO | S_IWUSR,
-		       ad7291_show_hyst, ad7291_set_hyst, AD7291_CH7_HYST);
+		       ad7291_show_hyst, ad7291_set_hyst, AD7291_HYST(7));
 
 static struct attribute *ad7291_event_attributes[] = {
 	&iio_dev_attr_in_temp0_thresh_both_hyst_raw.dev_attr.attr,
@@ -294,53 +248,45 @@
 	NULL,
 };
 
-/* high / low */
-static u8 ad7291_limit_regs[9][2] = {
-	{ AD7291_CH0_DATA_HIGH, AD7291_CH0_DATA_LOW },
-	{ AD7291_CH1_DATA_HIGH, AD7291_CH1_DATA_LOW },
-	{ AD7291_CH2_DATA_HIGH, AD7291_CH2_DATA_LOW },
-	{ AD7291_CH3_DATA_HIGH, AD7291_CH3_DATA_LOW }, /* FIXME: ? */
-	{ AD7291_CH4_DATA_HIGH, AD7291_CH4_DATA_LOW },
-	{ AD7291_CH5_DATA_HIGH, AD7291_CH5_DATA_LOW },
-	{ AD7291_CH6_DATA_HIGH, AD7291_CH6_DATA_LOW },
-	{ AD7291_CH7_DATA_HIGH, AD7291_CH7_DATA_LOW },
-	/* temp */
-	{ AD7291_T_SENSE_HIGH, AD7291_T_SENSE_LOW },
-};
+/* Map an IIO event code to its limit register (DATA_LOW when falling). */
+static unsigned int ad7291_threshold_reg(u64 event_code)
+{
+	unsigned int slot;	/* register triple index; temperature uses 8 */
+
+	switch (IIO_EVENT_CODE_EXTRACT_CHAN_TYPE(event_code)) {
+	case IIO_VOLTAGE:
+		slot = IIO_EVENT_CODE_EXTRACT_CHAN(event_code);
+		break;
+	case IIO_TEMP:
+		slot = 8;
+		break;
+	default:
+		return 0;	/* neither voltage nor temperature */
+	}
+
+	if (IIO_EVENT_CODE_EXTRACT_DIR(event_code) != IIO_EV_DIR_FALLING)
+		return AD7291_DATA_HIGH(slot);
+	return AD7291_DATA_LOW(slot);
+}
 
 static int ad7291_read_event_value(struct iio_dev *indio_dev,
 				   u64 event_code,
 				   int *val)
 {
 	struct ad7291_chip_info *chip = iio_priv(indio_dev);
-
 	int ret;
-	u8 reg;
 	u16 uval;
-	s16 signval;
+
+	ret = ad7291_i2c_read(chip, ad7291_threshold_reg(event_code), &uval);
+	if (ret < 0)
+		return ret;
 
 	switch (IIO_EVENT_CODE_EXTRACT_CHAN_TYPE(event_code)) {
 	case IIO_VOLTAGE:
-		reg = ad7291_limit_regs[IIO_EVENT_CODE_EXTRACT_CHAN(event_code)]
-			[!(IIO_EVENT_CODE_EXTRACT_DIR(event_code) ==
-			   IIO_EV_DIR_RISING)];
-
-		ret = ad7291_i2c_read(chip, reg, &uval);
-		if (ret < 0)
-			return ret;
 		*val = uval & AD7291_VALUE_MASK;
 		return 0;
-
 	case IIO_TEMP:
-		reg = ad7291_limit_regs[8]
-			[!(IIO_EVENT_CODE_EXTRACT_DIR(event_code) ==
-			   IIO_EV_DIR_RISING)];
-
-		ret = ad7291_i2c_read(chip, reg, &signval);
-		if (ret < 0)
-			return ret;
-		signval = (s16)((signval & AD7291_VALUE_MASK) << 4) >> 4;
-		*val = signval;
+		*val = sign_extend32(uval, 11);
 		return 0;
 	default:
 		return -EINVAL;
@@ -352,28 +298,21 @@
 				    int val)
 {
 	struct ad7291_chip_info *chip = iio_priv(indio_dev);
-	u8 reg;
-	s16 signval;
 
 	switch (IIO_EVENT_CODE_EXTRACT_CHAN_TYPE(event_code)) {
 	case IIO_VOLTAGE:
 		if (val > AD7291_VALUE_MASK || val < 0)
 			return -EINVAL;
-		reg = ad7291_limit_regs[IIO_EVENT_CODE_EXTRACT_CHAN(event_code)]
-			[!(IIO_EVENT_CODE_EXTRACT_DIR(event_code) ==
-			   IIO_EV_DIR_RISING)];
-		return ad7291_i2c_write(chip, reg, val);
+		break;
 	case IIO_TEMP:
 		if (val > 2047 || val < -2048)
 			return -EINVAL;
-		reg = ad7291_limit_regs[8]
-			[!(IIO_EVENT_CODE_EXTRACT_DIR(event_code) ==
-			   IIO_EV_DIR_RISING)];
-		signval = val;
-		return ad7291_i2c_write(chip, reg, *(u16 *)&signval);
+		break;
 	default:
 		return -EINVAL;
-	};
+	}
+
+	return ad7291_i2c_write(chip, ad7291_threshold_reg(event_code), val);
 }
 
 static int ad7291_read_event_config(struct iio_dev *indio_dev,
@@ -456,9 +395,7 @@
 {
 	int ret;
 	struct ad7291_chip_info *chip = iio_priv(indio_dev);
-	unsigned int scale_uv;
 	u16 regval;
-	s16 signval;
 
 	switch (mask) {
 	case IIO_CHAN_INFO_RAW:
@@ -479,44 +416,47 @@
 				return ret;
 			}
 			/* Read voltage */
-			ret = i2c_smbus_read_word_data(chip->client,
+			ret = i2c_smbus_read_word_swapped(chip->client,
 						       AD7291_VOLTAGE);
 			if (ret < 0) {
 				mutex_unlock(&chip->state_lock);
 				return ret;
 			}
-			*val = swab16((u16)ret) & AD7291_VALUE_MASK;
+			*val = ret & AD7291_VALUE_MASK;
 			mutex_unlock(&chip->state_lock);
 			return IIO_VAL_INT;
 		case IIO_TEMP:
 			/* Assumes tsense bit of command register always set */
-			ret = i2c_smbus_read_word_data(chip->client,
+			ret = i2c_smbus_read_word_swapped(chip->client,
 						       AD7291_T_SENSE);
 			if (ret < 0)
 				return ret;
-			signval = (s16)((swab16((u16)ret) &
-				AD7291_VALUE_MASK) << 4) >> 4;
-			*val = signval;
+			*val = sign_extend32(ret, 11);
 			return IIO_VAL_INT;
 		default:
 			return -EINVAL;
 		}
 	case IIO_CHAN_INFO_AVERAGE_RAW:
-		ret = i2c_smbus_read_word_data(chip->client,
+		ret = i2c_smbus_read_word_swapped(chip->client,
 					       AD7291_T_AVERAGE);
 			if (ret < 0)
 				return ret;
-			signval = (s16)((swab16((u16)ret) &
-				AD7291_VALUE_MASK) << 4) >> 4;
-			*val = signval;
+			*val = sign_extend32(ret, 11);
 			return IIO_VAL_INT;
 	case IIO_CHAN_INFO_SCALE:
 		switch (chan->type) {
 		case IIO_VOLTAGE:
-			scale_uv = (chip->int_vref_mv * 1000) >> AD7291_BITS;
-			*val =  scale_uv / 1000;
-			*val2 = (scale_uv % 1000) * 1000;
-			return IIO_VAL_INT_PLUS_MICRO;
+			if (chip->reg) {
+				int vref;
+				vref = regulator_get_voltage(chip->reg);
+				if (vref < 0)
+					return vref;
+				*val = vref / 1000;
+			} else {
+				*val = 2500;
+			}
+			*val2 = AD7291_BITS;
+			return IIO_VAL_FRACTIONAL_LOG2;
 		case IIO_TEMP:
 			/*
 			 * One LSB of the ADC corresponds to 0.25 deg C.
@@ -571,7 +511,6 @@
 };
 
 static const struct iio_info ad7291_info = {
-	.attrs = &ad7291_attribute_group,
 	.read_raw = &ad7291_read_raw,
 	.read_event_config = &ad7291_read_event_config,
 	.write_event_config = &ad7291_write_event_config,
@@ -583,9 +522,10 @@
 static int ad7291_probe(struct i2c_client *client,
 		const struct i2c_device_id *id)
 {
+	struct ad7291_platform_data *pdata = client->dev.platform_data;
 	struct ad7291_chip_info *chip;
 	struct iio_dev *indio_dev;
-	int ret = 0, voltage_uv = 0;
+	int ret = 0;
 
 	indio_dev = iio_device_alloc(sizeof(*chip));
 	if (indio_dev == NULL) {
@@ -594,12 +534,17 @@
 	}
 	chip = iio_priv(indio_dev);
 
-	chip->reg = regulator_get(&client->dev, "vcc");
-	if (!IS_ERR(chip->reg)) {
+	if (pdata && pdata->use_external_ref) {
+		chip->reg = regulator_get(&client->dev, "vref");
+		if (IS_ERR(chip->reg)) {
+			/* ret is still 0 here; propagate the real error */
+			ret = PTR_ERR(chip->reg);
+			goto error_free;
+		}
+
 		ret = regulator_enable(chip->reg);
 		if (ret)
 			goto error_put_reg;
-		voltage_uv = regulator_get_voltage(chip->reg);
 	}
 
 	mutex_init(&chip->state_lock);
 	mutex_init(&chip->state_lock);
@@ -612,12 +554,8 @@
 			AD7291_T_SENSE_MASK | /* Tsense always enabled */
 			AD7291_ALERT_POLARITY; /* set irq polarity low level */
 
-	if (voltage_uv) {
-		chip->int_vref_mv = voltage_uv / 1000;
+	if (pdata && pdata->use_external_ref)
 		chip->command |= AD7291_EXT_REF;
-	} else {
-		chip->int_vref_mv = 2500; /* Build-in ref */
-	}
 
 	indio_dev->name = id->name;
 	indio_dev->channels = ad7291_channels;
@@ -654,21 +592,18 @@
 	if (ret)
 		goto error_unreg_irq;
 
-	dev_info(&client->dev, "%s ADC registered.\n",
-			 id->name);
-
 	return 0;
 
 error_unreg_irq:
 	if (client->irq)
 		free_irq(client->irq, indio_dev);
 error_disable_reg:
-	if (!IS_ERR(chip->reg))
+	if (chip->reg)
 		regulator_disable(chip->reg);
 error_put_reg:
-	if (!IS_ERR(chip->reg))
+	if (chip->reg)
 		regulator_put(chip->reg);
-
+error_free:
 	iio_device_free(indio_dev);
 error_ret:
 	return ret;
@@ -684,7 +619,7 @@
 	if (client->irq)
 		free_irq(client->irq, indio_dev);
 
-	if (!IS_ERR(chip->reg)) {
+	if (chip->reg) {
 		regulator_disable(chip->reg);
 		regulator_put(chip->reg);
 	}

diff --git a/drivers/staging/iio/adc/ad7291.h b/drivers/staging/iio/adc/ad7291.h
new file mode 100644
index 0000000..bbd89fa
--- /dev/null
+++ b/drivers/staging/iio/adc/ad7291.h

@@ -0,0 +1,16 @@
+#ifndef __IIO_AD7291_H__
+#define __IIO_AD7291_H__
+
+#include <linux/types.h>
+
+/**
+ * struct ad7291_platform_data - AD7291 platform data
+ * @use_external_ref: Whether to use an external or internal reference voltage
+ *	When true, the driver requests the "vref" regulator and sets the
+ *	external-reference bit in the command register.
+ */
+struct ad7291_platform_data {
+	bool use_external_ref;
+};
+
+#endif

diff --git a/drivers/staging/iio/adc/ad7606_core.c b/drivers/staging/iio/adc/ad7606_core.c
index d104b43..72868ce 100644
--- a/drivers/staging/iio/adc/ad7606_core.c
+++ b/drivers/staging/iio/adc/ad7606_core.c

@@ -125,9 +125,12 @@
 	struct iio_dev *indio_dev = dev_to_iio_dev(dev);
 	struct ad7606_state *st = iio_priv(indio_dev);
 	unsigned long lval;
+	int ret;
 
-	if (strict_strtoul(buf, 10, &lval))
-		return -EINVAL;
+	ret = kstrtoul(buf, 10, &lval);
+	if (ret)
+		return ret;
+
 	if (!(lval == 5000 || lval == 10000)) {
 		dev_err(dev, "range is not supported\n");
 		return -EINVAL;
@@ -173,8 +176,9 @@
 	unsigned long lval;
 	int ret;
 
-	if (strict_strtoul(buf, 10, &lval))
-		return -EINVAL;
+	ret = kstrtoul(buf, 10, &lval);
+	if (ret)
+		return ret;
 
 	ret = ad7606_oversampling_get_index(lval);
 	if (ret < 0) {

diff --git a/drivers/staging/iio/adc/ad7606_par.c b/drivers/staging/iio/adc/ad7606_par.c
index 58cfdde..8a48d18 100644
--- a/drivers/staging/iio/adc/ad7606_par.c
+++ b/drivers/staging/iio/adc/ad7606_par.c

@@ -112,8 +112,6 @@
 	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
 	release_mem_region(res->start, resource_size(res));
 
-	platform_set_drvdata(pdev, NULL);
-
 	return 0;
 }
 

diff --git a/drivers/staging/iio/adc/ad7816.c b/drivers/staging/iio/adc/ad7816.c
index 9284771..8470036 100644
--- a/drivers/staging/iio/adc/ad7816.c
+++ b/drivers/staging/iio/adc/ad7816.c

@@ -175,9 +175,9 @@
 	unsigned long data;
 	int ret;
 
-	ret = strict_strtoul(buf, 10, &data);
+	ret = kstrtoul(buf, 10, &data);
 	if (ret)
-		return -EINVAL;
+		return ret;
 
 	if (data > AD7816_CS_MAX && data != AD7816_CS_MASK) {
 		dev_err(&chip->spi_dev->dev, "Invalid channel id %lu for %s.\n",
@@ -290,7 +290,9 @@
 	u8 data;
 	int ret;
 
-	ret = strict_strtol(buf, 10, &value);
+	ret = kstrtol(buf, 10, &value);
+	if (ret)
+		return ret;
 
 	if (chip->channel_id > AD7816_CS_MAX) {
 		dev_err(dev, "Invalid oti channel id %d.\n", chip->channel_id);

diff --git a/drivers/staging/iio/adc/ad799x_core.c b/drivers/staging/iio/adc/ad799x_core.c
index 8dc97b3..2b2049c 100644
--- a/drivers/staging/iio/adc/ad799x_core.c
+++ b/drivers/staging/iio/adc/ad799x_core.c

@@ -226,7 +226,7 @@
 	int ret, i;
 	u8 t;
 
-	ret = strict_strtol(buf, 10, &val);
+	ret = kstrtol(buf, 10, &val);
 	if (ret)
 		return ret;
 
@@ -337,7 +337,7 @@
 	long val;
 	int ret;
 
-	ret = strict_strtol(buf, 10, &val);
+	ret = kstrtol(buf, 10, &val);
 	if (ret)
 		return ret;
 

diff --git a/drivers/staging/iio/adc/lpc32xx_adc.c b/drivers/staging/iio/adc/lpc32xx_adc.c
index 2f2f7fd..9a4bb09 100644
--- a/drivers/staging/iio/adc/lpc32xx_adc.c
+++ b/drivers/staging/iio/adc/lpc32xx_adc.c

@@ -215,7 +215,6 @@
 
 	iio_device_unregister(iodev);
 	free_irq(irq, info);
-	platform_set_drvdata(pdev, NULL);
 	clk_put(info->clk);
 	iounmap(info->adc_base);
 	iio_device_free(iodev);

diff --git a/drivers/staging/iio/adc/mxs-lradc.c b/drivers/staging/iio/adc/mxs-lradc.c
index 163c638..d92c97a 100644
--- a/drivers/staging/iio/adc/mxs-lradc.c
+++ b/drivers/staging/iio/adc/mxs-lradc.c

@@ -620,7 +620,7 @@
 		((LRADC_DELAY_TIMER_LOOP - 1) << LRADC_CH_NUM_SAMPLES_OFFSET);
 	unsigned int i, j = 0;
 
-	for_each_set_bit(i, iio->active_scan_mask, iio->masklength) {
+	for_each_set_bit(i, iio->active_scan_mask, LRADC_MAX_TOTAL_CHANS) {
 		lradc->buffer[j] = readl(lradc->base + LRADC_CH(j));
 		writel(chan_value, lradc->base + LRADC_CH(j));
 		lradc->buffer[j] &= LRADC_CH_VALUE_MASK;
@@ -774,8 +774,7 @@
 					const unsigned long *mask)
 {
 	struct mxs_lradc *lradc = iio_priv(iio);
-	const int len = iio->masklength;
-	const int map_chans = bitmap_weight(mask, len);
+	const int map_chans = bitmap_weight(mask, LRADC_MAX_TOTAL_CHANS);
 	int rsvd_chans = 0;
 	unsigned long rsvd_mask = 0;
 
@@ -792,7 +791,7 @@
 		rsvd_chans++;
 
 	/* Test for attempts to map channels with special mode of operation. */
-	if (bitmap_intersects(mask, &rsvd_mask, len))
+	if (bitmap_intersects(mask, &rsvd_mask, LRADC_MAX_TOTAL_CHANS))
 		return false;
 
 	/* Test for attempts to map more channels then available slots. */
@@ -968,6 +967,7 @@
 	iio->modes = INDIO_DIRECT_MODE;
 	iio->channels = mxs_lradc_chan_spec;
 	iio->num_channels = ARRAY_SIZE(mxs_lradc_chan_spec);
+	iio->masklength = LRADC_MAX_TOTAL_CHANS;
 
 	ret = iio_triggered_buffer_setup(iio, &iio_pollfunc_store_time,
 				&mxs_lradc_trigger_handler,

diff --git a/drivers/staging/iio/adc/spear_adc.c b/drivers/staging/iio/adc/spear_adc.c
index f45da42..736219c 100644
--- a/drivers/staging/iio/adc/spear_adc.c
+++ b/drivers/staging/iio/adc/spear_adc.c

@@ -407,7 +407,6 @@
 	struct spear_adc_info *info = iio_priv(iodev);
 
 	iio_device_unregister(iodev);
-	platform_set_drvdata(pdev, NULL);
 	clk_disable_unprepare(info->clk);
 	clk_put(info->clk);
 	iounmap(info->adc_base_spear6xx);
@@ -416,11 +415,13 @@
 	return 0;
 }
 
+#ifdef CONFIG_OF
 static const struct of_device_id spear_adc_dt_ids[] = {
 	{ .compatible = "st,spear600-adc", },
 	{ /* sentinel */ }
 };
 MODULE_DEVICE_TABLE(of, spear_adc_dt_ids);
+#endif
 
 static struct platform_driver spear_adc_driver = {
 	.probe		= spear_adc_probe,

diff --git a/drivers/staging/iio/gyro/Kconfig b/drivers/staging/iio/gyro/Kconfig
index 8360662..b433371 100644
--- a/drivers/staging/iio/gyro/Kconfig
+++ b/drivers/staging/iio/gyro/Kconfig

@@ -10,13 +10,6 @@
 	  Say yes here to build support for Analog Devices adis16060 wide bandwidth
 	  yaw rate gyroscope with SPI.
 
-config ADIS16130
-	tristate "Analog Devices ADIS16130 High Precision Angular Rate Sensor driver"
-	depends on SPI
-	help
-	  Say yes here to build support for Analog Devices ADIS16130 High Precision
-	  Angular Rate Sensor driver.
-
 config ADIS16260
 	tristate "Analog Devices ADIS16260 Digital Gyroscope Sensor SPI driver"
 	depends on SPI

diff --git a/drivers/staging/iio/gyro/Makefile b/drivers/staging/iio/gyro/Makefile
index 98e6500..975f95b 100644
--- a/drivers/staging/iio/gyro/Makefile
+++ b/drivers/staging/iio/gyro/Makefile

@@ -5,8 +5,5 @@
 adis16060-y             := adis16060_core.o
 obj-$(CONFIG_ADIS16060) += adis16060.o
 
-adis16130-y             := adis16130_core.o
-obj-$(CONFIG_ADIS16130) += adis16130.o
-
 adis16260-y             := adis16260_core.o
 obj-$(CONFIG_ADIS16260) += adis16260.o

diff --git a/drivers/staging/iio/gyro/adis16130_core.c b/drivers/staging/iio/gyro/adis16130_core.c
deleted file mode 100644
index 531b803..0000000
--- a/drivers/staging/iio/gyro/adis16130_core.c
+++ /dev/null

@@ -1,178 +0,0 @@
-/*
- * ADIS16130 Digital Output, High Precision Angular Rate Sensor driver
- *
- * Copyright 2010 Analog Devices Inc.
- *
- * Licensed under the GPL-2 or later.
- */
-
-#include <linux/delay.h>
-#include <linux/mutex.h>
-#include <linux/device.h>
-#include <linux/kernel.h>
-#include <linux/spi/spi.h>
-#include <linux/slab.h>
-#include <linux/sysfs.h>
-#include <linux/list.h>
-#include <linux/module.h>
-
-#include <linux/iio/iio.h>
-#include <linux/iio/sysfs.h>
-
-#define ADIS16130_CON         0x0
-#define ADIS16130_CON_RD      (1 << 6)
-#define ADIS16130_IOP         0x1
-
-/* 1 = data-ready signal low when unread data on all channels; */
-#define ADIS16130_IOP_ALL_RDY (1 << 3)
-#define ADIS16130_IOP_SYNC    (1 << 0) /* 1 = synchronization enabled */
-#define ADIS16130_RATEDATA    0x8 /* Gyroscope output, rate of rotation */
-#define ADIS16130_TEMPDATA    0xA /* Temperature output */
-#define ADIS16130_RATECS      0x28 /* Gyroscope channel setup */
-#define ADIS16130_RATECS_EN   (1 << 3) /* 1 = channel enable; */
-#define ADIS16130_TEMPCS      0x2A /* Temperature channel setup */
-#define ADIS16130_TEMPCS_EN   (1 << 3)
-#define ADIS16130_RATECONV    0x30
-#define ADIS16130_TEMPCONV    0x32
-#define ADIS16130_MODE        0x38
-#define ADIS16130_MODE_24BIT  (1 << 1) /* 1 = 24-bit resolution; */
-
-/**
- * struct adis16130_state - device instance specific data
- * @us:			actual spi_device to write data
- * @buf_lock:		mutex to protect tx and rx
- * @buf:		unified tx/rx buffer
- **/
-struct adis16130_state {
-	struct spi_device		*us;
-	struct mutex			buf_lock;
-	u8				buf[4] ____cacheline_aligned;
-};
-
-static int adis16130_spi_read(struct iio_dev *indio_dev, u8 reg_addr, u32 *val)
-{
-	int ret;
-	struct adis16130_state *st = iio_priv(indio_dev);
-	struct spi_message msg;
-	struct spi_transfer xfer = {
-		.tx_buf = st->buf,
-		.rx_buf = st->buf,
-		.len = 4,
-	};
-
-	mutex_lock(&st->buf_lock);
-
-	st->buf[0] = ADIS16130_CON_RD | reg_addr;
-	st->buf[1] = st->buf[2] = st->buf[3] = 0;
-
-	spi_message_init(&msg);
-	spi_message_add_tail(&xfer, &msg);
-	ret = spi_sync(st->us, &msg);
-	ret = spi_read(st->us, st->buf, 4);
-
-	if (ret == 0)
-		*val = (st->buf[1] << 16) | (st->buf[2] << 8) | st->buf[3];
-	mutex_unlock(&st->buf_lock);
-
-	return ret;
-}
-
-static int adis16130_read_raw(struct iio_dev *indio_dev,
-			      struct iio_chan_spec const *chan,
-			      int *val, int *val2,
-			      long mask)
-{
-	int ret;
-	u32 temp;
-
-	/* Take the iio_dev status lock */
-	mutex_lock(&indio_dev->mlock);
-	ret =  adis16130_spi_read(indio_dev, chan->address, &temp);
-	mutex_unlock(&indio_dev->mlock);
-	if (ret)
-		return ret;
-	*val = temp;
-	return IIO_VAL_INT;
-}
-
-static const struct iio_chan_spec adis16130_channels[] = {
-	{
-		.type = IIO_ANGL_VEL,
-		.modified = 1,
-		.channel2 = IIO_MOD_Z,
-		.info_mask_separate = BIT(IIO_CHAN_INFO_RAW),
-		.address = ADIS16130_RATEDATA,
-	}, {
-		.type = IIO_TEMP,
-		.indexed = 1,
-		.channel = 0,
-		.info_mask_separate = BIT(IIO_CHAN_INFO_RAW),
-		.address = ADIS16130_TEMPDATA,
-	}
-};
-
-static const struct iio_info adis16130_info = {
-	.read_raw = &adis16130_read_raw,
-	.driver_module = THIS_MODULE,
-};
-
-static int adis16130_probe(struct spi_device *spi)
-{
-	int ret;
-	struct adis16130_state *st;
-	struct iio_dev *indio_dev;
-
-	/* setup the industrialio driver allocated elements */
-	indio_dev = iio_device_alloc(sizeof(*st));
-	if (indio_dev == NULL) {
-		ret = -ENOMEM;
-		goto error_ret;
-	}
-	st = iio_priv(indio_dev);
-	/* this is only used for removal purposes */
-	spi_set_drvdata(spi, indio_dev);
-	st->us = spi;
-	mutex_init(&st->buf_lock);
-	indio_dev->name = spi->dev.driver->name;
-	indio_dev->channels = adis16130_channels;
-	indio_dev->num_channels = ARRAY_SIZE(adis16130_channels);
-	indio_dev->dev.parent = &spi->dev;
-	indio_dev->info = &adis16130_info;
-	indio_dev->modes = INDIO_DIRECT_MODE;
-
-	ret = iio_device_register(indio_dev);
-	if (ret)
-		goto error_free_dev;
-
-	return 0;
-
-error_free_dev:
-	iio_device_free(indio_dev);
-
-error_ret:
-	return ret;
-}
-
-/* fixme, confirm ordering in this function */
-static int adis16130_remove(struct spi_device *spi)
-{
-	iio_device_unregister(spi_get_drvdata(spi));
-	iio_device_free(spi_get_drvdata(spi));
-
-	return 0;
-}
-
-static struct spi_driver adis16130_driver = {
-	.driver = {
-		.name = "adis16130",
-		.owner = THIS_MODULE,
-	},
-	.probe = adis16130_probe,
-	.remove = adis16130_remove,
-};
-module_spi_driver(adis16130_driver);
-
-MODULE_AUTHOR("Barry Song <21cnbao@gmail.com>");
-MODULE_DESCRIPTION("Analog Devices ADIS16130 High Precision Angular Rate");
-MODULE_LICENSE("GPL v2");
-MODULE_ALIAS("spi:adis16130");

diff --git a/drivers/staging/iio/trigger/Kconfig b/drivers/staging/iio/trigger/Kconfig
index 1a051da..2fd18c6 100644
--- a/drivers/staging/iio/trigger/Kconfig
+++ b/drivers/staging/iio/trigger/Kconfig

@@ -12,23 +12,6 @@
 	  Provides support for using periodic capable real time
 	  clocks as IIO triggers.
 
-config IIO_GPIO_TRIGGER
-	tristate "GPIO trigger"
-	depends on GPIOLIB
-	help
-	  Provides support for using GPIO pins as IIO triggers.
-
-config IIO_SYSFS_TRIGGER
-	tristate "SYSFS trigger"
-	depends on SYSFS
-	select IRQ_WORK
-	help
-	  Provides support for using SYSFS entry as IIO triggers.
-	  If unsure, say N (but it's safe to say "Y").
-
-	  To compile this driver as a module, choose M here: the
-	  module will be called iio-trig-sysfs.
-
 config IIO_BFIN_TMR_TRIGGER
 	tristate "Blackfin TIMER trigger"
 	depends on BLACKFIN

diff --git a/drivers/staging/iio/trigger/Makefile b/drivers/staging/iio/trigger/Makefile
index b088b57..238481b 100644
--- a/drivers/staging/iio/trigger/Makefile
+++ b/drivers/staging/iio/trigger/Makefile

@@ -3,6 +3,4 @@
 #
 
 obj-$(CONFIG_IIO_PERIODIC_RTC_TRIGGER) += iio-trig-periodic-rtc.o
-obj-$(CONFIG_IIO_GPIO_TRIGGER) += iio-trig-gpio.o
-obj-$(CONFIG_IIO_SYSFS_TRIGGER) += iio-trig-sysfs.o
 obj-$(CONFIG_IIO_BFIN_TMR_TRIGGER) += iio-trig-bfin-timer.o

diff --git a/drivers/staging/iio/trigger/iio-trig-gpio.c b/drivers/staging/iio/trigger/iio-trig-gpio.c
deleted file mode 100644
index 7c593d1..0000000
--- a/drivers/staging/iio/trigger/iio-trig-gpio.c
+++ /dev/null

@@ -1,167 +0,0 @@
-/*
- * Industrial I/O - gpio based trigger support
- *
- * Copyright (c) 2008 Jonathan Cameron
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published by
- * the Free Software Foundation.
- *
- * Currently this is more of a functioning proof of concept than a full
- * fledged trigger driver.
- *
- * TODO:
- *
- * Add board config elements to allow specification of startup settings.
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/platform_device.h>
-#include <linux/interrupt.h>
-#include <linux/gpio.h>
-#include <linux/slab.h>
-
-#include <linux/iio/iio.h>
-#include <linux/iio/trigger.h>
-
-static LIST_HEAD(iio_gpio_trigger_list);
-static DEFINE_MUTEX(iio_gpio_trigger_list_lock);
-
-struct iio_gpio_trigger_info {
-	struct mutex in_use;
-	unsigned int irq;
-};
-/*
- * Need to reference count these triggers and only enable gpio interrupts
- * as appropriate.
- */
-
-/* So what functionality do we want in here?... */
-/* set high / low as interrupt type? */
-
-static irqreturn_t iio_gpio_trigger_poll(int irq, void *private)
-{
-	/* Timestamp not currently provided */
-	iio_trigger_poll(private, 0);
-	return IRQ_HANDLED;
-}
-
-static const struct iio_trigger_ops iio_gpio_trigger_ops = {
-	.owner = THIS_MODULE,
-};
-
-static int iio_gpio_trigger_probe(struct platform_device *pdev)
-{
-	struct iio_gpio_trigger_info *trig_info;
-	struct iio_trigger *trig, *trig2;
-	unsigned long irqflags;
-	struct resource *irq_res;
-	int irq, ret = 0, irq_res_cnt = 0;
-
-	do {
-		irq_res = platform_get_resource(pdev,
-				IORESOURCE_IRQ, irq_res_cnt);
-
-		if (irq_res == NULL) {
-			if (irq_res_cnt == 0)
-				dev_err(&pdev->dev, "No GPIO IRQs specified");
-			break;
-		}
-		irqflags = (irq_res->flags & IRQF_TRIGGER_MASK) | IRQF_SHARED;
-
-		for (irq = irq_res->start; irq <= irq_res->end; irq++) {
-
-			trig = iio_trigger_alloc("irqtrig%d", irq);
-			if (!trig) {
-				ret = -ENOMEM;
-				goto error_free_completed_registrations;
-			}
-
-			trig_info = kzalloc(sizeof(*trig_info), GFP_KERNEL);
-			if (!trig_info) {
-				ret = -ENOMEM;
-				goto error_put_trigger;
-			}
-			iio_trigger_set_drvdata(trig, trig_info);
-			trig_info->irq = irq;
-			trig->ops = &iio_gpio_trigger_ops;
-			ret = request_irq(irq, iio_gpio_trigger_poll,
-					  irqflags, trig->name, trig);
-			if (ret) {
-				dev_err(&pdev->dev,
-					"request IRQ-%d failed", irq);
-				goto error_free_trig_info;
-			}
-
-			ret = iio_trigger_register(trig);
-			if (ret)
-				goto error_release_irq;
-
-			list_add_tail(&trig->alloc_list,
-					&iio_gpio_trigger_list);
-		}
-
-		irq_res_cnt++;
-	} while (irq_res != NULL);
-
-
-	return 0;
-
-/* First clean up the partly allocated trigger */
-error_release_irq:
-	free_irq(irq, trig);
-error_free_trig_info:
-	kfree(trig_info);
-error_put_trigger:
-	iio_trigger_put(trig);
-error_free_completed_registrations:
-	/* The rest should have been added to the iio_gpio_trigger_list */
-	list_for_each_entry_safe(trig,
-				 trig2,
-				 &iio_gpio_trigger_list,
-				 alloc_list) {
-		trig_info = iio_trigger_get_drvdata(trig);
-		free_irq(gpio_to_irq(trig_info->irq), trig);
-		kfree(trig_info);
-		iio_trigger_unregister(trig);
-	}
-
-	return ret;
-}
-
-static int iio_gpio_trigger_remove(struct platform_device *pdev)
-{
-	struct iio_trigger *trig, *trig2;
-	struct iio_gpio_trigger_info *trig_info;
-
-	mutex_lock(&iio_gpio_trigger_list_lock);
-	list_for_each_entry_safe(trig,
-				 trig2,
-				 &iio_gpio_trigger_list,
-				 alloc_list) {
-		trig_info = iio_trigger_get_drvdata(trig);
-		iio_trigger_unregister(trig);
-		free_irq(trig_info->irq, trig);
-		kfree(trig_info);
-		iio_trigger_put(trig);
-	}
-	mutex_unlock(&iio_gpio_trigger_list_lock);
-
-	return 0;
-}
-
-static struct platform_driver iio_gpio_trigger_driver = {
-	.probe = iio_gpio_trigger_probe,
-	.remove = iio_gpio_trigger_remove,
-	.driver = {
-		.name = "iio_gpio_trigger",
-		.owner = THIS_MODULE,
-	},
-};
-
-module_platform_driver(iio_gpio_trigger_driver);
-
-MODULE_AUTHOR("Jonathan Cameron <jic23@kernel.org>");
-MODULE_DESCRIPTION("Example gpio trigger for the iio subsystem");
-MODULE_LICENSE("GPL v2");

diff --git a/drivers/staging/iio/trigger/iio-trig-sysfs.c b/drivers/staging/iio/trigger/iio-trig-sysfs.c
deleted file mode 100644
index b727bde..0000000
--- a/drivers/staging/iio/trigger/iio-trig-sysfs.c
+++ /dev/null

@@ -1,227 +0,0 @@
-/*
- * Copyright 2011 Analog Devices Inc.
- *
- * Licensed under the GPL-2.
- *
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/platform_device.h>
-#include <linux/slab.h>
-#include <linux/list.h>
-#include <linux/irq_work.h>
-
-#include <linux/iio/iio.h>
-#include <linux/iio/trigger.h>
-
-struct iio_sysfs_trig {
-	struct iio_trigger *trig;
-	struct irq_work work;
-	int id;
-	struct list_head l;
-};
-
-static LIST_HEAD(iio_sysfs_trig_list);
-static DEFINE_MUTEX(iio_syfs_trig_list_mut);
-
-static int iio_sysfs_trigger_probe(int id);
-static ssize_t iio_sysfs_trig_add(struct device *dev,
-				  struct device_attribute *attr,
-				  const char *buf,
-				  size_t len)
-{
-	int ret;
-	unsigned long input;
-
-	ret = strict_strtoul(buf, 10, &input);
-	if (ret)
-		return ret;
-	ret = iio_sysfs_trigger_probe(input);
-	if (ret)
-		return ret;
-	return len;
-}
-static DEVICE_ATTR(add_trigger, S_IWUSR, NULL, &iio_sysfs_trig_add);
-
-static int iio_sysfs_trigger_remove(int id);
-static ssize_t iio_sysfs_trig_remove(struct device *dev,
-				     struct device_attribute *attr,
-				     const char *buf,
-				     size_t len)
-{
-	int ret;
-	unsigned long input;
-
-	ret = strict_strtoul(buf, 10, &input);
-	if (ret)
-		return ret;
-	ret = iio_sysfs_trigger_remove(input);
-	if (ret)
-		return ret;
-	return len;
-}
-
-static DEVICE_ATTR(remove_trigger, S_IWUSR, NULL, &iio_sysfs_trig_remove);
-
-static struct attribute *iio_sysfs_trig_attrs[] = {
-	&dev_attr_add_trigger.attr,
-	&dev_attr_remove_trigger.attr,
-	NULL,
-};
-
-static const struct attribute_group iio_sysfs_trig_group = {
-	.attrs = iio_sysfs_trig_attrs,
-};
-
-static const struct attribute_group *iio_sysfs_trig_groups[] = {
-	&iio_sysfs_trig_group,
-	NULL
-};
-
-
-/* Nothing to actually do upon release */
-static void iio_trigger_sysfs_release(struct device *dev)
-{
-}
-
-static struct device iio_sysfs_trig_dev = {
-	.bus = &iio_bus_type,
-	.groups = iio_sysfs_trig_groups,
-	.release = &iio_trigger_sysfs_release,
-};
-
-static void iio_sysfs_trigger_work(struct irq_work *work)
-{
-	struct iio_sysfs_trig *trig = container_of(work, struct iio_sysfs_trig,
-							work);
-
-	iio_trigger_poll(trig->trig, 0);
-}
-
-static ssize_t iio_sysfs_trigger_poll(struct device *dev,
-		struct device_attribute *attr, const char *buf, size_t count)
-{
-	struct iio_trigger *trig = to_iio_trigger(dev);
-	struct iio_sysfs_trig *sysfs_trig = iio_trigger_get_drvdata(trig);
-
-	irq_work_queue(&sysfs_trig->work);
-
-	return count;
-}
-
-static DEVICE_ATTR(trigger_now, S_IWUSR, NULL, iio_sysfs_trigger_poll);
-
-static struct attribute *iio_sysfs_trigger_attrs[] = {
-	&dev_attr_trigger_now.attr,
-	NULL,
-};
-
-static const struct attribute_group iio_sysfs_trigger_attr_group = {
-	.attrs = iio_sysfs_trigger_attrs,
-};
-
-static const struct attribute_group *iio_sysfs_trigger_attr_groups[] = {
-	&iio_sysfs_trigger_attr_group,
-	NULL
-};
-
-static const struct iio_trigger_ops iio_sysfs_trigger_ops = {
-	.owner = THIS_MODULE,
-};
-
-static int iio_sysfs_trigger_probe(int id)
-{
-	struct iio_sysfs_trig *t;
-	int ret;
-	bool foundit = false;
-	mutex_lock(&iio_syfs_trig_list_mut);
-	list_for_each_entry(t, &iio_sysfs_trig_list, l)
-		if (id == t->id) {
-			foundit = true;
-			break;
-		}
-	if (foundit) {
-		ret = -EINVAL;
-		goto out1;
-	}
-	t = kmalloc(sizeof(*t), GFP_KERNEL);
-	if (t == NULL) {
-		ret = -ENOMEM;
-		goto out1;
-	}
-	t->id = id;
-	t->trig = iio_trigger_alloc("sysfstrig%d", id);
-	if (!t->trig) {
-		ret = -ENOMEM;
-		goto free_t;
-	}
-
-	t->trig->dev.groups = iio_sysfs_trigger_attr_groups;
-	t->trig->ops = &iio_sysfs_trigger_ops;
-	t->trig->dev.parent = &iio_sysfs_trig_dev;
-	iio_trigger_set_drvdata(t->trig, t);
-
-	init_irq_work(&t->work, iio_sysfs_trigger_work);
-
-	ret = iio_trigger_register(t->trig);
-	if (ret)
-		goto out2;
-	list_add(&t->l, &iio_sysfs_trig_list);
-	__module_get(THIS_MODULE);
-	mutex_unlock(&iio_syfs_trig_list_mut);
-	return 0;
-
-out2:
-	iio_trigger_put(t->trig);
-free_t:
-	kfree(t);
-out1:
-	mutex_unlock(&iio_syfs_trig_list_mut);
-	return ret;
-}
-
-static int iio_sysfs_trigger_remove(int id)
-{
-	bool foundit = false;
-	struct iio_sysfs_trig *t;
-	mutex_lock(&iio_syfs_trig_list_mut);
-	list_for_each_entry(t, &iio_sysfs_trig_list, l)
-		if (id == t->id) {
-			foundit = true;
-			break;
-		}
-	if (!foundit) {
-		mutex_unlock(&iio_syfs_trig_list_mut);
-		return -EINVAL;
-	}
-
-	iio_trigger_unregister(t->trig);
-	iio_trigger_free(t->trig);
-
-	list_del(&t->l);
-	kfree(t);
-	module_put(THIS_MODULE);
-	mutex_unlock(&iio_syfs_trig_list_mut);
-	return 0;
-}
-
-
-static int __init iio_sysfs_trig_init(void)
-{
-	device_initialize(&iio_sysfs_trig_dev);
-	dev_set_name(&iio_sysfs_trig_dev, "iio_sysfs_trigger");
-	return device_add(&iio_sysfs_trig_dev);
-}
-module_init(iio_sysfs_trig_init);
-
-static void __exit iio_sysfs_trig_exit(void)
-{
-	device_unregister(&iio_sysfs_trig_dev);
-}
-module_exit(iio_sysfs_trig_exit);
-
-MODULE_AUTHOR("Michael Hennerich <hennerich@blackfin.uclinux.org>");
-MODULE_DESCRIPTION("Sysfs based trigger for the iio subsystem");
-MODULE_LICENSE("GPL v2");
-MODULE_ALIAS("platform:iio-trig-sysfs");

diff --git a/drivers/staging/imx-drm/Kconfig b/drivers/staging/imx-drm/Kconfig
index ef699f7..2233905 100644
--- a/drivers/staging/imx-drm/Kconfig
+++ b/drivers/staging/imx-drm/Kconfig

@@ -30,6 +30,14 @@
 	  Choose this to enable the internal Television Encoder (TVe)
 	  found on i.MX53 processors.
 
+config DRM_IMX_LDB
+	tristate "Support for LVDS displays"
+	depends on DRM_IMX
+	select OF_VIDEOMODE
+	help
+	  Choose this to enable the internal LVDS Display Bridge (LDB)
+	  found on i.MX53 and i.MX6 processors.
+
 config DRM_IMX_IPUV3_CORE
 	tristate "IPUv3 core support"
 	depends on DRM_IMX

diff --git a/drivers/staging/imx-drm/Makefile b/drivers/staging/imx-drm/Makefile
index 7e50184..bfaf693 100644
--- a/drivers/staging/imx-drm/Makefile
+++ b/drivers/staging/imx-drm/Makefile

@@ -5,6 +5,7 @@
 
 obj-$(CONFIG_DRM_IMX_PARALLEL_DISPLAY) += parallel-display.o
 obj-$(CONFIG_DRM_IMX_TVE) += imx-tve.o
+obj-$(CONFIG_DRM_IMX_LDB) += imx-ldb.o
 obj-$(CONFIG_DRM_IMX_FB_HELPER) += imx-fbdev.o
 obj-$(CONFIG_DRM_IMX_IPUV3_CORE) += ipu-v3/
 obj-$(CONFIG_DRM_IMX_IPUV3)	+= ipuv3-crtc.o

diff --git a/drivers/staging/imx-drm/TODO b/drivers/staging/imx-drm/TODO
index 123acbe..f806415 100644
--- a/drivers/staging/imx-drm/TODO
+++ b/drivers/staging/imx-drm/TODO

@@ -6,7 +6,6 @@
 - Factor out more code to common helper functions
 - decide where to put the base driver. It is not specific to a subsystem
   and would be used by DRM/KMS and media/V4L2
-- convert irq driver to irq_domain_add_linear
 
 Missing features (not necessarily for moving out of staging):
 

diff --git a/drivers/staging/imx-drm/imx-drm-core.c b/drivers/staging/imx-drm/imx-drm-core.c
index 6455305..9854a1d 100644
--- a/drivers/staging/imx-drm/imx-drm-core.c
+++ b/drivers/staging/imx-drm/imx-drm-core.c

@@ -144,7 +144,7 @@
 		u32 interface_pix_fmt)
 {
 	return imx_drm_crtc_panel_format_pins(crtc, encoder_type,
-					      interface_pix_fmt, 0, 0);
+					      interface_pix_fmt, 2, 3);
 }
 EXPORT_SYMBOL_GPL(imx_drm_crtc_panel_format);
 
@@ -491,7 +491,6 @@
 {
 	struct imx_drm_device *imxdrm = __imx_drm_device();
 	struct imx_drm_crtc *imx_drm_crtc;
-	const struct drm_crtc_funcs *crtc_funcs;
 	int ret;
 
 	mutex_lock(&imxdrm->mutex);
@@ -512,8 +511,6 @@
 	imx_drm_crtc->cookie.cookie = cookie;
 	imx_drm_crtc->cookie.id = id;
 
-	crtc_funcs = imx_drm_helper_funcs->crtc_funcs;
-
 	imx_drm_crtc->crtc = crtc;
 	imx_drm_crtc->imxdrm = imxdrm;
 

diff --git a/drivers/staging/imx-drm/imx-ldb.c b/drivers/staging/imx-drm/imx-ldb.c
new file mode 100644
index 0000000..8af7f3b
--- /dev/null
+++ b/drivers/staging/imx-drm/imx-ldb.c

@@ -0,0 +1,625 @@
+/*
+ * i.MX drm driver - LVDS display bridge
+ *
+ * Copyright (C) 2012 Sascha Hauer, Pengutronix
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+ * MA 02110-1301, USA.
+ */
+
+#include <linux/module.h>
+#include <linux/clk.h>
+#include <drm/drmP.h>
+#include <drm/drm_fb_helper.h>
+#include <drm/drm_crtc_helper.h>
+#include <linux/mfd/syscon.h>
+#include <linux/mfd/syscon/imx6q-iomuxc-gpr.h>
+#include <linux/of_address.h>
+#include <linux/of_device.h>
+#include <video/of_videomode.h>
+#include <linux/regmap.h>
+#include <linux/videodev2.h>
+
+#include "imx-drm.h"
+
+#define DRIVER_NAME "imx-ldb"
+
+#define LDB_CH0_MODE_EN_TO_DI0		(1 << 0)
+#define LDB_CH0_MODE_EN_TO_DI1		(3 << 0)
+#define LDB_CH0_MODE_EN_MASK		(3 << 0)
+#define LDB_CH1_MODE_EN_TO_DI0		(1 << 2)
+#define LDB_CH1_MODE_EN_TO_DI1		(3 << 2)
+#define LDB_CH1_MODE_EN_MASK		(3 << 2)
+#define LDB_SPLIT_MODE_EN		(1 << 4)
+#define LDB_DATA_WIDTH_CH0_24		(1 << 5)
+#define LDB_BIT_MAP_CH0_JEIDA		(1 << 6)
+#define LDB_DATA_WIDTH_CH1_24		(1 << 7)
+#define LDB_BIT_MAP_CH1_JEIDA		(1 << 8)
+#define LDB_DI0_VS_POL_ACT_LOW		(1 << 9)
+#define LDB_DI1_VS_POL_ACT_LOW		(1 << 10)
+#define LDB_BGREF_RMODE_INT		(1 << 15)
+
+#define con_to_imx_ldb_ch(x) container_of(x, struct imx_ldb_channel, connector)
+#define enc_to_imx_ldb_ch(x) container_of(x, struct imx_ldb_channel, encoder)
+
+struct imx_ldb;
+
+struct imx_ldb_channel {
+	struct imx_ldb *ldb;
+	struct drm_connector connector;
+	struct imx_drm_connector *imx_drm_connector;
+	struct drm_encoder encoder;
+	struct imx_drm_encoder *imx_drm_encoder;
+	int chno;
+	void *edid;
+	int edid_len;
+	struct drm_display_mode mode;
+	int mode_valid;
+};
+
+struct bus_mux {
+	int reg;
+	int shift;
+	int mask;
+};
+
+struct imx_ldb {
+	struct regmap *regmap;
+	struct device *dev;
+	struct imx_ldb_channel channel[2];
+	struct clk *clk[2]; /* our own clock */
+	struct clk *clk_sel[4]; /* parent of display clock */
+	struct clk *clk_pll[2]; /* upstream clock we can adjust */
+	u32 ldb_ctrl;
+	const struct bus_mux *lvds_mux;
+};
+
+static enum drm_connector_status imx_ldb_connector_detect(
+		struct drm_connector *connector, bool force)
+{
+	return connector_status_connected;
+}
+
+static void imx_ldb_connector_destroy(struct drm_connector *connector)
+{
+	/* do not free here */
+}
+
+static int imx_ldb_connector_get_modes(struct drm_connector *connector)
+{
+	struct imx_ldb_channel *imx_ldb_ch = con_to_imx_ldb_ch(connector);
+	int num_modes = 0;
+
+	if (imx_ldb_ch->edid) {
+		drm_mode_connector_update_edid_property(connector,
+							imx_ldb_ch->edid);
+		num_modes = drm_add_edid_modes(connector, imx_ldb_ch->edid);
+	}
+
+	if (imx_ldb_ch->mode_valid) {
+		struct drm_display_mode *mode = drm_mode_create(connector->dev);
+
+		if (!mode) /* allocation failed: report modes added so far */
+			return num_modes;
+		drm_mode_copy(mode, &imx_ldb_ch->mode);
+		mode->type |= DRM_MODE_TYPE_DRIVER | DRM_MODE_TYPE_PREFERRED;
+		drm_mode_probed_add(connector, mode);
+		num_modes++;
+	}
+	return num_modes;
+}
+
+static int imx_ldb_connector_mode_valid(struct drm_connector *connector,
+			  struct drm_display_mode *mode)
+{
+	return MODE_OK; /* no mode is rejected by this connector */
+}
+
+static struct drm_encoder *imx_ldb_connector_best_encoder(
+		struct drm_connector *connector)
+{
+	struct imx_ldb_channel *imx_ldb_ch = con_to_imx_ldb_ch(connector);
+
+	return &imx_ldb_ch->encoder;
+}
+
+static void imx_ldb_encoder_dpms(struct drm_encoder *encoder, int mode)
+{
+}
+
+static bool imx_ldb_encoder_mode_fixup(struct drm_encoder *encoder,
+			   const struct drm_display_mode *mode,
+			   struct drm_display_mode *adjusted_mode)
+{
+	return true;
+}
+
+static void imx_ldb_set_clock(struct imx_ldb *ldb, int mux, int chno,
+		unsigned long serial_clk, unsigned long di_clk)
+{
+	int ret;
+
+	dev_dbg(ldb->dev, "%s: now: %ld want: %ld\n", __func__,
+			clk_get_rate(ldb->clk_pll[chno]), serial_clk);
+	clk_set_rate(ldb->clk_pll[chno], serial_clk);
+
+	dev_dbg(ldb->dev, "%s after: %ld\n", __func__,
+			clk_get_rate(ldb->clk_pll[chno]));
+
+	dev_dbg(ldb->dev, "%s: now: %ld want: %ld\n", __func__,
+			clk_get_rate(ldb->clk[chno]),
+			(long int)di_clk);
+	clk_set_rate(ldb->clk[chno], di_clk);
+
+	dev_dbg(ldb->dev, "%s after: %ld\n", __func__,
+			clk_get_rate(ldb->clk[chno]));
+
+	/* set display clock mux to LDB input clock */
+	ret = clk_set_parent(ldb->clk_sel[mux], ldb->clk[chno]);
+	if (ret) {
+		dev_err(ldb->dev, "unable to set di%d parent clock to ldb_di%d\n", mux, chno);
+	}
+}
+
+static void imx_ldb_encoder_prepare(struct drm_encoder *encoder)
+{
+	struct imx_ldb_channel *imx_ldb_ch = enc_to_imx_ldb_ch(encoder);
+	struct imx_ldb *ldb = imx_ldb_ch->ldb;
+	struct drm_display_mode *mode = &encoder->crtc->mode;
+	u32 pixel_fmt;
+	unsigned long serial_clk;
+	unsigned long di_clk = mode->clock * 1000;
+	int mux = imx_drm_encoder_get_mux_id(imx_ldb_ch->imx_drm_encoder,
+					     encoder->crtc);
+
+	if (ldb->ldb_ctrl & LDB_SPLIT_MODE_EN) {
+		/* dual channel LVDS mode */
+		serial_clk = 3500UL * mode->clock;
+		imx_ldb_set_clock(ldb, mux, 0, serial_clk, di_clk);
+		imx_ldb_set_clock(ldb, mux, 1, serial_clk, di_clk);
+	} else {
+		serial_clk = 7000UL * mode->clock;
+		imx_ldb_set_clock(ldb, mux, imx_ldb_ch->chno, serial_clk, di_clk);
+	}
+
+	switch (imx_ldb_ch->chno) {
+	case 0:
+		pixel_fmt = (ldb->ldb_ctrl & LDB_DATA_WIDTH_CH0_24) ?
+			V4L2_PIX_FMT_RGB24 : V4L2_PIX_FMT_BGR666;
+		break;
+	case 1:
+		pixel_fmt = (ldb->ldb_ctrl & LDB_DATA_WIDTH_CH1_24) ?
+			V4L2_PIX_FMT_RGB24 : V4L2_PIX_FMT_BGR666;
+		break;
+	default:
+		dev_err(ldb->dev, "unable to config di%d panel format\n",
+			imx_ldb_ch->chno);
+		pixel_fmt = V4L2_PIX_FMT_RGB24;
+	}
+
+	imx_drm_crtc_panel_format(encoder->crtc, DRM_MODE_ENCODER_LVDS,
+			pixel_fmt);
+}
+
+static void imx_ldb_encoder_commit(struct drm_encoder *encoder)
+{
+	struct imx_ldb_channel *imx_ldb_ch = enc_to_imx_ldb_ch(encoder);
+	struct imx_ldb *ldb = imx_ldb_ch->ldb;
+	int dual = ldb->ldb_ctrl & LDB_SPLIT_MODE_EN;
+	int mux = imx_drm_encoder_get_mux_id(imx_ldb_ch->imx_drm_encoder,
+					     encoder->crtc);
+
+	if (dual) {
+		clk_prepare_enable(ldb->clk[0]);
+		clk_prepare_enable(ldb->clk[1]);
+	}
+
+	if (imx_ldb_ch == &ldb->channel[0] || dual) {
+		ldb->ldb_ctrl &= ~LDB_CH0_MODE_EN_MASK;
+		if (mux == 0 || ldb->lvds_mux)
+			ldb->ldb_ctrl |= LDB_CH0_MODE_EN_TO_DI0;
+		else if (mux == 1)
+			ldb->ldb_ctrl |= LDB_CH0_MODE_EN_TO_DI1;
+	}
+	if (imx_ldb_ch == &ldb->channel[1] || dual) {
+		ldb->ldb_ctrl &= ~LDB_CH1_MODE_EN_MASK;
+		if (mux == 1 || ldb->lvds_mux)
+			ldb->ldb_ctrl |= LDB_CH1_MODE_EN_TO_DI1;
+		else if (mux == 0)
+			ldb->ldb_ctrl |= LDB_CH1_MODE_EN_TO_DI0;
+	}
+
+	if (ldb->lvds_mux) {
+		const struct bus_mux *lvds_mux = NULL;
+
+		if (imx_ldb_ch == &ldb->channel[0])
+			lvds_mux = &ldb->lvds_mux[0];
+		else if (imx_ldb_ch == &ldb->channel[1])
+			lvds_mux = &ldb->lvds_mux[1];
+
+		regmap_update_bits(ldb->regmap, lvds_mux->reg, lvds_mux->mask,
+				   mux << lvds_mux->shift);
+	}
+
+	regmap_write(ldb->regmap, IOMUXC_GPR2, ldb->ldb_ctrl);
+}
+
+static void imx_ldb_encoder_mode_set(struct drm_encoder *encoder,
+			 struct drm_display_mode *mode,
+			 struct drm_display_mode *adjusted_mode)
+{
+	struct imx_ldb_channel *imx_ldb_ch = enc_to_imx_ldb_ch(encoder);
+	struct imx_ldb *ldb = imx_ldb_ch->ldb;
+	int dual = ldb->ldb_ctrl & LDB_SPLIT_MODE_EN;
+
+	if (mode->clock > 170000) {
+		dev_warn(ldb->dev,
+			 "%s: mode exceeds 170 MHz pixel clock\n", __func__);
+	}
+	if (mode->clock > 85000 && !dual) {
+		dev_warn(ldb->dev,
+			 "%s: mode exceeds 85 MHz pixel clock\n", __func__);
+	}
+
+	/* FIXME - assumes straight connections DI0 --> CH0, DI1 --> CH1 */
+	if (imx_ldb_ch == &ldb->channel[0]) {
+		if (mode->flags & DRM_MODE_FLAG_NVSYNC)
+			ldb->ldb_ctrl |= LDB_DI0_VS_POL_ACT_LOW;
+		else if (mode->flags & DRM_MODE_FLAG_PVSYNC)
+			ldb->ldb_ctrl &= ~LDB_DI0_VS_POL_ACT_LOW;
+	}
+	if (imx_ldb_ch == &ldb->channel[1]) {
+		if (mode->flags & DRM_MODE_FLAG_NVSYNC)
+			ldb->ldb_ctrl |= LDB_DI1_VS_POL_ACT_LOW;
+		else if (mode->flags & DRM_MODE_FLAG_PVSYNC)
+			ldb->ldb_ctrl &= ~LDB_DI1_VS_POL_ACT_LOW;
+	}
+}
+
+static void imx_ldb_encoder_disable(struct drm_encoder *encoder)
+{
+	struct imx_ldb_channel *imx_ldb_ch = enc_to_imx_ldb_ch(encoder);
+	struct imx_ldb *ldb = imx_ldb_ch->ldb;
+
+	/*
+	 * imx_ldb_encoder_disable is called by
+	 * drm_helper_disable_unused_functions without
+	 * the encoder being enabled before.
+	 */
+	if (imx_ldb_ch == &ldb->channel[0] &&
+	    (ldb->ldb_ctrl & LDB_CH0_MODE_EN_MASK) == 0)
+		return;
+	else if (imx_ldb_ch == &ldb->channel[1] &&
+		 (ldb->ldb_ctrl & LDB_CH1_MODE_EN_MASK) == 0)
+		return;
+
+	if (imx_ldb_ch == &ldb->channel[0])
+		ldb->ldb_ctrl &= ~LDB_CH0_MODE_EN_MASK;
+	else if (imx_ldb_ch == &ldb->channel[1])
+		ldb->ldb_ctrl &= ~LDB_CH1_MODE_EN_MASK;
+
+	regmap_write(ldb->regmap, IOMUXC_GPR2, ldb->ldb_ctrl);
+
+	if (ldb->ldb_ctrl & LDB_SPLIT_MODE_EN) {
+		clk_disable_unprepare(ldb->clk[0]);
+		clk_disable_unprepare(ldb->clk[1]);
+	}
+}
+
+static void imx_ldb_encoder_destroy(struct drm_encoder *encoder)
+{
+	/* do not free here */
+}
+
+static struct drm_connector_funcs imx_ldb_connector_funcs = {
+	.dpms = drm_helper_connector_dpms,
+	.fill_modes = drm_helper_probe_single_connector_modes,
+	.detect = imx_ldb_connector_detect,
+	.destroy = imx_ldb_connector_destroy,
+};
+
+static struct drm_connector_helper_funcs imx_ldb_connector_helper_funcs = {
+	.get_modes = imx_ldb_connector_get_modes,
+	.best_encoder = imx_ldb_connector_best_encoder,
+	.mode_valid = imx_ldb_connector_mode_valid,
+};
+
+static struct drm_encoder_funcs imx_ldb_encoder_funcs = {
+	.destroy = imx_ldb_encoder_destroy,
+};
+
+static struct drm_encoder_helper_funcs imx_ldb_encoder_helper_funcs = {
+	.dpms = imx_ldb_encoder_dpms,
+	.mode_fixup = imx_ldb_encoder_mode_fixup,
+	.prepare = imx_ldb_encoder_prepare,
+	.commit = imx_ldb_encoder_commit,
+	.mode_set = imx_ldb_encoder_mode_set,
+	.disable = imx_ldb_encoder_disable,
+};
+
+static int imx_ldb_get_clk(struct imx_ldb *ldb, int chno)
+{
+	char clkname[16]; /* holds "di<chno>" and "di<chno>_pll" */
+
+	snprintf(clkname, sizeof(clkname), "di%d", chno);
+	ldb->clk[chno] = devm_clk_get(ldb->dev, clkname);
+	if (IS_ERR(ldb->clk[chno]))
+		return PTR_ERR(ldb->clk[chno]);
+
+	snprintf(clkname, sizeof(clkname), "di%d_pll", chno);
+	ldb->clk_pll[chno] = devm_clk_get(ldb->dev, clkname);
+	if (IS_ERR(ldb->clk_pll[chno]))
+		return PTR_ERR(ldb->clk_pll[chno]);
+
+	return 0;
+}
+
+static int imx_ldb_register(struct imx_ldb_channel *imx_ldb_ch)
+{
+	int ret;
+	struct imx_ldb *ldb = imx_ldb_ch->ldb;
+
+	ret = imx_ldb_get_clk(ldb, imx_ldb_ch->chno);
+	if (ret)
+		return ret;
+	if (ldb->ldb_ctrl & LDB_SPLIT_MODE_EN) {
+		ret = imx_ldb_get_clk(ldb, 1); /* second channel */
+		if (ret)
+			return ret;
+	}
+
+	imx_ldb_ch->connector.funcs = &imx_ldb_connector_funcs;
+	imx_ldb_ch->encoder.funcs = &imx_ldb_encoder_funcs;
+
+	imx_ldb_ch->encoder.encoder_type = DRM_MODE_ENCODER_LVDS;
+	imx_ldb_ch->connector.connector_type = DRM_MODE_CONNECTOR_LVDS;
+
+	drm_encoder_helper_add(&imx_ldb_ch->encoder,
+			&imx_ldb_encoder_helper_funcs);
+	ret = imx_drm_add_encoder(&imx_ldb_ch->encoder,
+			&imx_ldb_ch->imx_drm_encoder, THIS_MODULE);
+	if (ret) {
+		dev_err(ldb->dev, "adding encoder failed with %d\n", ret);
+		return ret;
+	}
+
+	drm_connector_helper_add(&imx_ldb_ch->connector,
+			&imx_ldb_connector_helper_funcs);
+
+	ret = imx_drm_add_connector(&imx_ldb_ch->connector,
+			&imx_ldb_ch->imx_drm_connector, THIS_MODULE);
+	if (ret) {
+		imx_drm_remove_encoder(imx_ldb_ch->imx_drm_encoder);
+		dev_err(ldb->dev, "adding connector failed with %d\n", ret);
+		return ret;
+	}
+
+	drm_mode_connector_attach_encoder(&imx_ldb_ch->connector,
+			&imx_ldb_ch->encoder);
+
+	return 0;
+}
+
+enum {
+	LVDS_BIT_MAP_SPWG,
+	LVDS_BIT_MAP_JEIDA
+};
+
+static const char *imx_ldb_bit_mappings[] = {
+	[LVDS_BIT_MAP_SPWG]  = "spwg",
+	[LVDS_BIT_MAP_JEIDA] = "jeida",
+};
+
+static int of_get_data_mapping(struct device_node *np)
+{
+	const char *bm;
+	int ret, i;
+
+	ret = of_property_read_string(np, "fsl,data-mapping", &bm);
+	if (ret < 0)
+		return ret; /* property could not be read */
+
+	for (i = 0; i < ARRAY_SIZE(imx_ldb_bit_mappings); i++)
+		if (!strcasecmp(bm, imx_ldb_bit_mappings[i]))
+			return i; /* index doubles as LVDS_BIT_MAP_* value */
+
+	return -EINVAL;
+}
+
+static const struct bus_mux imx6q_lvds_mux[2] = {
+	{
+		.reg = IOMUXC_GPR3,
+		.shift = 6,
+		.mask = IMX6Q_GPR3_LVDS0_MUX_CTL_MASK,
+	}, {
+		.reg = IOMUXC_GPR3,
+		.shift = 8,
+		.mask = IMX6Q_GPR3_LVDS1_MUX_CTL_MASK,
+	}
+};
+
+/*
+ * For a device declaring compatible = "fsl,imx6q-ldb", "fsl,imx53-ldb",
+ * of_match_device will walk through this list and take the first entry
+ * matching any of its compatible values. Therefore, the more generic
+ * entries (in this case fsl,imx53-ldb) need to be ordered last.
+ */
+static const struct of_device_id imx_ldb_dt_ids[] = {
+	{ .compatible = "fsl,imx6q-ldb", .data = imx6q_lvds_mux, },
+	{ .compatible = "fsl,imx53-ldb", .data = NULL, },
+	{ }
+};
+MODULE_DEVICE_TABLE(of, imx_ldb_dt_ids);
+
+static int imx_ldb_probe(struct platform_device *pdev)
+{
+	struct device_node *np = pdev->dev.of_node;
+	const struct of_device_id *of_id =
+			of_match_device(of_match_ptr(imx_ldb_dt_ids),
+					&pdev->dev);
+	struct device_node *child;
+	const u8 *edidp;
+	struct imx_ldb *imx_ldb;
+	int datawidth;
+	int mapping;
+	int dual;
+	int ret;
+	int i;
+
+	imx_ldb = devm_kzalloc(&pdev->dev, sizeof(*imx_ldb), GFP_KERNEL);
+	if (!imx_ldb)
+		return -ENOMEM;
+
+	imx_ldb->regmap = syscon_regmap_lookup_by_phandle(np, "gpr");
+	if (IS_ERR(imx_ldb->regmap)) {
+		dev_err(&pdev->dev, "failed to get parent regmap\n");
+		return PTR_ERR(imx_ldb->regmap);
+	}
+
+	imx_ldb->dev = &pdev->dev;
+
+	if (of_id)
+		imx_ldb->lvds_mux = of_id->data;
+
+	dual = of_property_read_bool(np, "fsl,dual-channel");
+	if (dual)
+		imx_ldb->ldb_ctrl |= LDB_SPLIT_MODE_EN;
+
+	/*
+	 * There are three different possible clock mux configurations:
+	 * i.MX53:  ipu1_di0_sel, ipu1_di1_sel
+	 * i.MX6q:  ipu1_di0_sel, ipu1_di1_sel, ipu2_di0_sel, ipu2_di1_sel
+	 * i.MX6dl: ipu1_di0_sel, ipu1_di1_sel, lcdif_sel
+	 * Map them all to di0_sel...di3_sel.
+	 */
+	for (i = 0; i < 4; i++) {
+		char clkname[16];
+
+		sprintf(clkname, "di%d_sel", i);
+		imx_ldb->clk_sel[i] = devm_clk_get(imx_ldb->dev, clkname);
+		if (IS_ERR(imx_ldb->clk_sel[i])) {
+			ret = PTR_ERR(imx_ldb->clk_sel[i]);
+			imx_ldb->clk_sel[i] = NULL;
+			break;
+		}
+	}
+	if (i == 0)
+		return ret;
+
+	for_each_child_of_node(np, child) {
+		struct imx_ldb_channel *channel;
+
+		ret = of_property_read_u32(child, "reg", &i);
+		if (ret || i < 0 || i > 1)
+			return -EINVAL;
+
+		if (dual && i > 0) {
+			dev_warn(&pdev->dev, "dual-channel mode, ignoring second output\n");
+			continue;
+		}
+
+		if (!of_device_is_available(child))
+			continue;
+
+		channel = &imx_ldb->channel[i];
+		channel->ldb = imx_ldb;
+		channel->chno = i;
+
+		edidp = of_get_property(child, "edid", &channel->edid_len);
+		if (edidp) {
+			channel->edid = kmemdup(edidp, channel->edid_len,
+						GFP_KERNEL);
+		} else {
+			ret = of_get_drm_display_mode(child, &channel->mode, 0);
+			if (!ret)
+				channel->mode_valid = 1;
+		}
+
+		ret = of_property_read_u32(child, "fsl,data-width", &datawidth);
+		if (ret)
+			datawidth = 0;
+		else if (datawidth != 18 && datawidth != 24)
+			return -EINVAL;
+
+		mapping = of_get_data_mapping(child);
+		switch (mapping) {
+		case LVDS_BIT_MAP_SPWG:
+			if (datawidth == 24) {
+				if (i == 0 || dual)
+					imx_ldb->ldb_ctrl |= LDB_DATA_WIDTH_CH0_24;
+				if (i == 1 || dual)
+					imx_ldb->ldb_ctrl |= LDB_DATA_WIDTH_CH1_24;
+			}
+			break;
+		case LVDS_BIT_MAP_JEIDA:
+			if (datawidth == 18) {
+				dev_err(&pdev->dev, "JEIDA standard only supported in 24 bit\n");
+				return -EINVAL;
+			}
+			if (i == 0 || dual)
+				imx_ldb->ldb_ctrl |= LDB_DATA_WIDTH_CH0_24 | LDB_BIT_MAP_CH0_JEIDA;
+			if (i == 1 || dual)
+				imx_ldb->ldb_ctrl |= LDB_DATA_WIDTH_CH1_24 | LDB_BIT_MAP_CH1_JEIDA;
+			break;
+		default:
+			dev_err(&pdev->dev, "data mapping not specified or invalid\n");
+			return -EINVAL;
+		}
+
+		ret = imx_ldb_register(channel);
+		if (ret)
+			return ret;
+
+		imx_drm_encoder_add_possible_crtcs(channel->imx_drm_encoder, child);
+	}
+
+	platform_set_drvdata(pdev, imx_ldb);
+
+	return 0;
+}
+
+static int imx_ldb_remove(struct platform_device *pdev)
+{
+	struct imx_ldb *imx_ldb = platform_get_drvdata(pdev);
+	int i;
+
+	for (i = 0; i < 2; i++) {
+		struct imx_ldb_channel *channel = &imx_ldb->channel[i];
+		struct drm_connector *connector = &channel->connector;
+		struct drm_encoder *encoder = &channel->encoder;
+
+		drm_mode_connector_detach_encoder(connector, encoder);
+		imx_drm_remove_connector(channel->imx_drm_connector);
+		imx_drm_remove_encoder(channel->imx_drm_encoder);
+		kfree(channel->edid); /* kmemdup()ed in probe; NULL if unset */
+	}
+
+	return 0;
+}
+
+static struct platform_driver imx_ldb_driver = {
+	.probe		= imx_ldb_probe,
+	.remove		= imx_ldb_remove,
+	.driver		= {
+		.of_match_table = imx_ldb_dt_ids,
+		.name	= DRIVER_NAME,
+		.owner	= THIS_MODULE,
+	},
+};
+
+module_platform_driver(imx_ldb_driver);
+
+MODULE_DESCRIPTION("i.MX LVDS driver");
+MODULE_AUTHOR("Sascha Hauer, Pengutronix");
+MODULE_LICENSE("GPL");

diff --git a/drivers/staging/imx-drm/imx-tve.c b/drivers/staging/imx-drm/imx-tve.c
index 03892de..a56797d 100644
--- a/drivers/staging/imx-drm/imx-tve.c
+++ b/drivers/staging/imx-drm/imx-tve.c

@@ -22,7 +22,6 @@
 #include <linux/clk-provider.h>
 #include <linux/module.h>
 #include <linux/of_i2c.h>
-#include <linux/pinctrl/consumer.h>
 #include <linux/regmap.h>
 #include <linux/regulator/consumer.h>
 #include <linux/spinlock.h>
@@ -610,15 +609,6 @@
 	}
 
 	if (tve->mode == TVE_MODE_VGA) {
-		struct pinctrl *pinctrl;
-
-		pinctrl = devm_pinctrl_get_select_default(&pdev->dev);
-		if (IS_ERR(pinctrl)) {
-			ret = PTR_ERR(pinctrl);
-			dev_warn(&pdev->dev, "failed to setup pinctrl: %d", ret);
-			return ret;
-		}
-
 		ret = of_property_read_u32(np, "fsl,hsync-pin", &tve->hsync_pin);
 		if (ret < 0) {
 			dev_err(&pdev->dev, "failed to get vsync pin\n");
@@ -638,11 +628,9 @@
 		return -ENOENT;
 	}
 
-	base = devm_request_and_ioremap(&pdev->dev, res);
-	if (!base) {
-		dev_err(&pdev->dev, "failed to remap memory region\n");
-		return -ENOENT;
-	}
+	base = devm_ioremap_resource(&pdev->dev, res);
+	if (IS_ERR(base))
+		return PTR_ERR(base);
 
 	tve_regmap_config.lock_arg = tve;
 	tve->regmap = devm_regmap_init_mmio_clk(&pdev->dev, "tve", base,

diff --git a/drivers/staging/imx-drm/ipu-v3/ipu-common.c b/drivers/staging/imx-drm/ipu-v3/ipu-common.c
index 0127601..e35d0bf 100644
--- a/drivers/staging/imx-drm/ipu-v3/ipu-common.c
+++ b/drivers/staging/imx-drm/ipu-v3/ipu-common.c

@@ -27,6 +27,7 @@
 #include <linux/list.h>
 #include <linux/irq.h>
 #include <linux/irqchip/chained_irq.h>
+#include <linux/irqdomain.h>
 #include <linux/of_device.h>
 
 #include "imx-ipu-v3.h"
@@ -799,16 +800,18 @@
 static void ipu_irq_handle(struct ipu_soc *ipu, const int *regs, int num_regs)
 {
 	unsigned long status;
-	int i, bit, irq_base;
+	int i, bit, irq;
 
 	for (i = 0; i < num_regs; i++) {
 
 		status = ipu_cm_read(ipu, IPU_INT_STAT(regs[i]));
 		status &= ipu_cm_read(ipu, IPU_INT_CTRL(regs[i]));
 
-		irq_base = ipu->irq_start + regs[i] * 32;
-		for_each_set_bit(bit, &status, 32)
-			generic_handle_irq(irq_base + bit);
+		for_each_set_bit(bit, &status, 32) {
+			irq = irq_linear_revmap(ipu->domain, regs[i] * 32 + bit);
+			if (irq)
+				generic_handle_irq(irq);
+		}
 	}
 }
 
@@ -838,57 +841,15 @@
 	chained_irq_exit(chip, desc);
 }
 
-static void ipu_ack_irq(struct irq_data *d)
-{
-	struct ipu_soc *ipu = irq_data_get_irq_chip_data(d);
-	unsigned int irq = d->irq - ipu->irq_start;
-
-	ipu_cm_write(ipu, 1 << (irq % 32), IPU_INT_STAT(irq / 32));
-}
-
-static void ipu_unmask_irq(struct irq_data *d)
-{
-	struct ipu_soc *ipu = irq_data_get_irq_chip_data(d);
-	unsigned int irq = d->irq - ipu->irq_start;
-	unsigned long flags;
-	u32 reg;
-
-	spin_lock_irqsave(&ipu->lock, flags);
-
-	reg = ipu_cm_read(ipu, IPU_INT_CTRL(irq / 32));
-	reg |= 1 << (irq % 32);
-	ipu_cm_write(ipu, reg, IPU_INT_CTRL(irq / 32));
-
-	spin_unlock_irqrestore(&ipu->lock, flags);
-}
-
-static void ipu_mask_irq(struct irq_data *d)
-{
-	struct ipu_soc *ipu = irq_data_get_irq_chip_data(d);
-	unsigned int irq = d->irq - ipu->irq_start;
-	unsigned long flags;
-	u32 reg;
-
-	spin_lock_irqsave(&ipu->lock, flags);
-
-	reg = ipu_cm_read(ipu, IPU_INT_CTRL(irq / 32));
-	reg &= ~(1 << (irq % 32));
-	ipu_cm_write(ipu, reg, IPU_INT_CTRL(irq / 32));
-
-	spin_unlock_irqrestore(&ipu->lock, flags);
-}
-
-static struct irq_chip ipu_irq_chip = {
-	.name = "IPU",
-	.irq_ack = ipu_ack_irq,
-	.irq_mask = ipu_mask_irq,
-	.irq_unmask = ipu_unmask_irq,
-};
-
 int ipu_idmac_channel_irq(struct ipu_soc *ipu, struct ipuv3_channel *channel,
 		enum ipu_channel_irq irq_type)
 {
-	return ipu->irq_start + irq_type + channel->num;
+	int irq = irq_linear_revmap(ipu->domain, irq_type + channel->num);
+
+	if (!irq)
+		irq = irq_create_mapping(ipu->domain, irq_type + channel->num);
+
+	return irq;
 }
 EXPORT_SYMBOL_GPL(ipu_idmac_channel_irq);
 
@@ -975,18 +936,48 @@
 	return ret;
 }
 
+
 static int ipu_irq_init(struct ipu_soc *ipu)
 {
-	int i;
+	struct irq_chip_generic *gc;
+	struct irq_chip_type *ct;
+	unsigned long unused[IPU_NUM_IRQS / 32] = {
+		0x400100d0, 0xffe000fd,
+		0x400100d0, 0xffe000fd,
+		0x400100d0, 0xffe000fd,
+		0x4077ffff, 0xffe7e1fd,
+		0x23fffffe, 0x8880fff0,
+		0xf98fe7d0, 0xfff81fff,
+		0x400100d0, 0xffe000fd,
+		0x00000000,
+	};
+	int ret, i;
 
-	ipu->irq_start = irq_alloc_descs(-1, 0, IPU_NUM_IRQS, 0);
-	if (ipu->irq_start < 0)
-		return ipu->irq_start;
+	ipu->domain = irq_domain_add_linear(ipu->dev->of_node, IPU_NUM_IRQS,
+					    &irq_generic_chip_ops, ipu);
+	if (!ipu->domain) {
+		dev_err(ipu->dev, "failed to add irq domain\n");
+		return -ENODEV;
+	}
 
-	for (i = ipu->irq_start; i < ipu->irq_start + IPU_NUM_IRQS; i++) {
-		irq_set_chip_and_handler(i, &ipu_irq_chip, handle_level_irq);
-		set_irq_flags(i, IRQF_VALID);
-		irq_set_chip_data(i, ipu);
+	ret = irq_alloc_domain_generic_chips(ipu->domain, 32, 1, "IPU",
+					     handle_level_irq, 0, IRQF_VALID, 0);
+	if (ret < 0) {
+		dev_err(ipu->dev, "failed to alloc generic irq chips\n");
+		irq_domain_remove(ipu->domain);
+		return ret;
+	}
+
+	for (i = 0; i < IPU_NUM_IRQS; i += 32) {
+		gc = irq_get_domain_generic_chip(ipu->domain, i);
+		gc->reg_base = ipu->cm_reg;
+		gc->unused = unused[i / 32];
+		ct = gc->chip_types;
+		ct->chip.irq_ack = irq_gc_ack_set_bit;
+		ct->chip.irq_mask = irq_gc_mask_clr_bit;
+		ct->chip.irq_unmask = irq_gc_mask_set_bit;
+		ct->regs.ack = IPU_INT_STAT(i / 32);
+		ct->regs.mask = IPU_INT_CTRL(i / 32);
 	}
 
 	irq_set_chained_handler(ipu->irq_sync, ipu_irq_handler);
@@ -999,20 +990,22 @@
 
 static void ipu_irq_exit(struct ipu_soc *ipu)
 {
-	int i;
+	int i, irq;
 
 	irq_set_chained_handler(ipu->irq_err, NULL);
 	irq_set_handler_data(ipu->irq_err, NULL);
 	irq_set_chained_handler(ipu->irq_sync, NULL);
 	irq_set_handler_data(ipu->irq_sync, NULL);
 
-	for (i = ipu->irq_start; i < ipu->irq_start + IPU_NUM_IRQS; i++) {
-		set_irq_flags(i, 0);
-		irq_set_chip(i, NULL);
-		irq_set_chip_data(i, NULL);
+	/* TODO: remove irq_domain_generic_chips */
+
+	for (i = 0; i < IPU_NUM_IRQS; i++) {
+		irq = irq_linear_revmap(ipu->domain, i);
+		if (irq)
+			irq_dispose_mapping(irq);
 	}
 
-	irq_free_descs(ipu->irq_start, IPU_NUM_IRQS);
+	irq_domain_remove(ipu->domain);
 }
 
 static int ipu_probe(struct platform_device *pdev)

diff --git a/drivers/staging/imx-drm/ipu-v3/ipu-di.c b/drivers/staging/imx-drm/ipu-v3/ipu-di.c
index 19d777e..0b6806e 100644
--- a/drivers/staging/imx-drm/ipu-v3/ipu-di.c
+++ b/drivers/staging/imx-drm/ipu-v3/ipu-di.c

@@ -603,7 +603,12 @@
 
 		vsync_cnt = 3;
 		if (di->id == 1)
-			vsync_cnt = 6;
+			/*
+			 * TODO: change only for TVEv2, parallel display
+			 * uses pin 2 / 3
+			 */
+			if (!(sig->hsync_pin == 2 && sig->vsync_pin == 3))
+				vsync_cnt = 6;
 
 		if (sig->Hsync_pol) {
 			if (sig->hsync_pin == 2)
@@ -614,11 +619,11 @@
 				di_gen |= DI_GEN_POLARITY_7;
 		}
 		if (sig->Vsync_pol) {
-			if (sig->hsync_pin == 3)
+			if (sig->vsync_pin == 3)
 				di_gen |= DI_GEN_POLARITY_3;
-			else if (sig->hsync_pin == 6)
+			else if (sig->vsync_pin == 6)
 				di_gen |= DI_GEN_POLARITY_6;
-			else if (sig->hsync_pin == 8)
+			else if (sig->vsync_pin == 8)
 				di_gen |= DI_GEN_POLARITY_8;
 		}
 	}

diff --git a/drivers/staging/imx-drm/ipu-v3/ipu-dmfc.c b/drivers/staging/imx-drm/ipu-v3/ipu-dmfc.c
index 91821bc..2e97c33 100644
--- a/drivers/staging/imx-drm/ipu-v3/ipu-dmfc.c
+++ b/drivers/staging/imx-drm/ipu-v3/ipu-dmfc.c

@@ -61,7 +61,7 @@
 
 static const struct dmfc_channel_data dmfcdata[] = {
 	{
-		.ipu_channel	= 23,
+		.ipu_channel	= IPUV3_CHANNEL_MEM_BG_SYNC,
 		.channel_reg	= DMFC_DP_CHAN,
 		.shift		= DMFC_DP_CHAN_5B_23,
 		.eot_shift	= 20,
@@ -73,13 +73,13 @@
 		.eot_shift	= 22,
 		.max_fifo_lines	= 1,
 	}, {
-		.ipu_channel	= 27,
+		.ipu_channel	= IPUV3_CHANNEL_MEM_FG_SYNC,
 		.channel_reg	= DMFC_DP_CHAN,
 		.shift		= DMFC_DP_CHAN_5F_27,
 		.eot_shift	= 21,
 		.max_fifo_lines	= 2,
 	}, {
-		.ipu_channel	= 28,
+		.ipu_channel	= IPUV3_CHANNEL_MEM_DC_SYNC,
 		.channel_reg	= DMFC_WR_CHAN,
 		.shift		= DMFC_WR_CHAN_1_28,
 		.eot_shift	= 16,
@@ -292,7 +292,7 @@
 {
 	struct ipu_dmfc_priv *priv = dmfc->priv;
 	int slots = dmfc_bandwidth_to_slots(priv, bandwidth_pixel_per_second);
-	int segment = 0, ret = 0;
+	int segment = -1, ret = 0;
 
 	dev_dbg(priv->dev, "dmfc: trying to allocate %ldMpixel/s for IPU channel %d\n",
 			bandwidth_pixel_per_second / 1000000,
@@ -307,7 +307,17 @@
 		goto out;
 	}
 
-	segment = dmfc_find_slots(priv, slots);
+	/* Always allocate at least 128*4 bytes (2 slots) */
+	if (slots < 2)
+		slots = 2;
+
+	/* For the MEM_BG channel, first try to allocate twice the slots */
+	if (dmfc->data->ipu_channel == IPUV3_CHANNEL_MEM_BG_SYNC)
+		segment = dmfc_find_slots(priv, slots * 2);
+	if (segment >= 0)
+		slots *= 2;
+	else
+		segment = dmfc_find_slots(priv, slots);
 	if (segment < 0) {
 		ret = -EBUSY;
 		goto out;
@@ -391,7 +401,7 @@
 	 * We have a total bandwidth of clkrate * 4pixel divided
 	 * into 8 slots.
 	 */
-	priv->bandwidth_per_slot = clk_get_rate(ipu_clk) / 8;
+	priv->bandwidth_per_slot = clk_get_rate(ipu_clk) * 4 / 8;
 
 	dev_dbg(dev, "dmfc: 8 slots with %ldMpixel/s bandwidth each\n",
 			priv->bandwidth_per_slot / 1000000);

diff --git a/drivers/staging/imx-drm/ipu-v3/ipu-prv.h b/drivers/staging/imx-drm/ipu-v3/ipu-prv.h
index 5518028..4df0050 100644
--- a/drivers/staging/imx-drm/ipu-v3/ipu-prv.h
+++ b/drivers/staging/imx-drm/ipu-v3/ipu-prv.h

@@ -110,7 +110,7 @@
 #define IDMAC_BAND_EN(ch)		IPU_IDMAC_REG(0x0040 + 4 * ((ch) / 32))
 #define IDMAC_CHA_BUSY(ch)		IPU_IDMAC_REG(0x0100 + 4 * ((ch) / 32))
 
-#define IPU_NUM_IRQS	(32 * 5)
+#define IPU_NUM_IRQS	(32 * 15)
 
 enum ipu_modules {
 	IPU_CONF_CSI0_EN		= (1 << 0),
@@ -170,9 +170,9 @@
 
 	struct ipuv3_channel	channel[64];
 
-	int			irq_start;
 	int			irq_sync;
 	int			irq_err;
+	struct irq_domain	*domain;
 
 	struct ipu_dc_priv	*dc_priv;
 	struct ipu_dp_priv	*dp_priv;

diff --git a/drivers/staging/imx-drm/ipuv3-crtc.c b/drivers/staging/imx-drm/ipuv3-crtc.c
index ff5c633..4a7eedf 100644
--- a/drivers/staging/imx-drm/ipuv3-crtc.c
+++ b/drivers/staging/imx-drm/ipuv3-crtc.c

@@ -22,7 +22,6 @@
 #include <linux/device.h>
 #include <linux/platform_device.h>
 #include <drm/drmP.h>
-#include <drm/drm_fb_helper.h>
 #include <drm/drm_crtc_helper.h>
 #include <linux/fb.h>
 #include <linux/clk.h>
@@ -42,9 +41,6 @@
 };
 
 struct ipu_crtc {
-	struct drm_fb_helper	fb_helper;
-	struct ipu_framebuffer	ifb;
-	int			num_crtcs;
 	struct device		*dev;
 	struct drm_crtc		base;
 	struct imx_drm_crtc	*imx_crtc;
@@ -54,7 +50,6 @@
 	struct dmfc_channel	*dmfc;
 	struct ipu_di		*di;
 	int			enabled;
-	struct ipu_priv		*ipu_priv;
 	struct drm_pending_vblank_event *page_flip_event;
 	struct drm_framebuffer	*newfb;
 	int			irq;
@@ -152,6 +147,7 @@
 
 	ipu_crtc->newfb = fb;
 	ipu_crtc->page_flip_event = event;
+	crtc->fb = fb;
 
 	return 0;
 }
@@ -334,7 +330,6 @@
 	imx_drm_handle_vblank(ipu_crtc->imx_crtc);
 
 	if (ipu_crtc->newfb) {
-		ipu_crtc->base.fb = ipu_crtc->newfb;
 		ipu_crtc->newfb = NULL;
 		ipu_drm_set_base(&ipu_crtc->base, 0, 0);
 		ipu_crtc_handle_pageflip(ipu_crtc);

diff --git a/drivers/staging/imx-drm/parallel-display.c b/drivers/staging/imx-drm/parallel-display.c
index e7fba62..cea9f14 100644
--- a/drivers/staging/imx-drm/parallel-display.c
+++ b/drivers/staging/imx-drm/parallel-display.c

@@ -23,7 +23,6 @@
 #include <drm/drm_fb_helper.h>
 #include <drm/drm_crtc_helper.h>
 #include <linux/videodev2.h>
-#include <linux/pinctrl/consumer.h>
 
 #include "imx-drm.h"
 
@@ -206,20 +205,11 @@
 	struct imx_parallel_display *imxpd;
 	int ret;
 	const char *fmt;
-	struct pinctrl *pinctrl;
 
 	imxpd = devm_kzalloc(&pdev->dev, sizeof(*imxpd), GFP_KERNEL);
 	if (!imxpd)
 		return -ENOMEM;
 
-	pinctrl = devm_pinctrl_get_select_default(&pdev->dev);
-	if (IS_ERR(pinctrl)) {
-		ret = PTR_ERR(pinctrl);
-		dev_warn(&pdev->dev, "pinctrl_get_select_default failed with %d",
-				ret);
-		return ret;
-	}
-
 	edidp = of_get_property(np, "edid", &imxpd->edid_len);
 	if (edidp)
 		imxpd->edid = kmemdup(edidp, imxpd->edid_len, GFP_KERNEL);
@@ -265,6 +255,7 @@
 	{ .compatible = "fsl,imx-parallel-display", },
 	{ /* sentinel */ }
 };
+MODULE_DEVICE_TABLE(of, imx_pd_dt_ids);
 
 static struct platform_driver imx_pd_driver = {
 	.probe		= imx_pd_probe,

diff --git a/drivers/staging/keucr/init.c b/drivers/staging/keucr/init.c
index 231611d..f5d41e0 100644
--- a/drivers/staging/keucr/init.c
+++ b/drivers/staging/keucr/init.c

@@ -19,13 +19,13 @@
 	int	result;
 	BYTE	MiscReg03 = 0;
 
-	printk(KERN_INFO "--- Init Media ---\n");
-	result = ENE_Read_BYTE(us, REG_CARD_STATUS, &MiscReg03);
+	dev_info(&us->pusb_dev->dev, "--- Init Media ---\n");
+	result = ene_read_byte(us, REG_CARD_STATUS, &MiscReg03);
 	if (result != USB_STOR_XFER_GOOD) {
-		printk(KERN_ERR "Read register fail !!\n");
+		dev_err(&us->pusb_dev->dev, "Failed to read register\n");
 		return USB_STOR_TRANSPORT_ERROR;
 	}
-	printk(KERN_INFO "MiscReg03 = %x\n", MiscReg03);
+	dev_info(&us->pusb_dev->dev, "MiscReg03 = %x\n", MiscReg03);
 
 	if (MiscReg03 & 0x02) {
 		if (!us->SM_Status.Ready && !us->MS_Status.Ready) {
@@ -39,9 +39,9 @@
 }
 
 /*
- * ENE_Read_BYTE() :
+ * ene_read_byte() :
  */
-int ENE_Read_BYTE(struct us_data *us, WORD index, void *buf)
+int ene_read_byte(struct us_data *us, WORD index, void *buf)
 {
 	struct bulk_cb_wrap *bcb = (struct bulk_cb_wrap *) us->iobuf;
 	int result;
@@ -67,11 +67,13 @@
 	int	result;
 	BYTE	buf[0x200];
 
-	printk(KERN_INFO "transport --- ENE_SMInit\n");
+	dev_dbg(&us->pusb_dev->dev, "transport --- ENE_SMInit\n");
 
 	result = ENE_LoadBinCode(us, SM_INIT_PATTERN);
 	if (result != USB_STOR_XFER_GOOD) {
-		printk(KERN_INFO "Load SM Init Code Fail !!\n");
+		dev_info(&us->pusb_dev->dev,
+			 "Failed to load SmartMedia init code\n: result= %x\n",
+			 result);
 		return USB_STOR_TRANSPORT_ERROR;
 	}
 
@@ -84,26 +86,33 @@
 
 	result = ENE_SendScsiCmd(us, FDIR_READ, &buf, 0);
 	if (result != USB_STOR_XFER_GOOD) {
-		printk(KERN_ERR
-		       "Execution SM Init Code Fail !! result = %x\n", result);
+		dev_err(&us->pusb_dev->dev,
+			"Failed to load SmartMedia init code: result = %x\n",
+			result);
 		return USB_STOR_TRANSPORT_ERROR;
 	}
 
-	us->SM_Status = *(PSM_STATUS)&buf[0];
+	us->SM_Status = *(struct keucr_sm_status *)&buf[0];
 
 	us->SM_DeviceID = buf[1];
 	us->SM_CardID   = buf[2];
 
 	if (us->SM_Status.Insert && us->SM_Status.Ready) {
-		printk(KERN_INFO "Insert     = %x\n", us->SM_Status.Insert);
-		printk(KERN_INFO "Ready      = %x\n", us->SM_Status.Ready);
-		printk(KERN_INFO "WtP        = %x\n", us->SM_Status.WtP);
-		printk(KERN_INFO "DeviceID   = %x\n", us->SM_DeviceID);
-		printk(KERN_INFO "CardID     = %x\n", us->SM_CardID);
+		dev_info(&us->pusb_dev->dev, "Insert     = %x\n",
+					     us->SM_Status.Insert);
+		dev_info(&us->pusb_dev->dev, "Ready      = %x\n",
+					     us->SM_Status.Ready);
+		dev_info(&us->pusb_dev->dev, "WtP        = %x\n",
+					     us->SM_Status.WtP);
+		dev_info(&us->pusb_dev->dev, "DeviceID   = %x\n",
+					     us->SM_DeviceID);
+		dev_info(&us->pusb_dev->dev, "CardID     = %x\n",
+					     us->SM_CardID);
 		MediaChange = 1;
 		Check_D_MediaFmt(us);
 	} else {
-		printk(KERN_ERR "SM Card Not Ready --- %x\n", buf[0]);
+		dev_err(&us->pusb_dev->dev,
+			"SmartMedia Card Not Ready --- %x\n", buf[0]);
 		return USB_STOR_TRANSPORT_ERROR;
 	}
 
@@ -120,7 +129,7 @@
 	/* void *buf; */
 	PBYTE buf;
 
-	/* printk(KERN_INFO "transport --- ENE_LoadBinCode\n"); */
+	/* dev_info(&us->pusb_dev->dev, "transport --- ENE_LoadBinCode\n"); */
 	if (us->BIN_FLAG == flag)
 		return USB_STOR_TRANSPORT_GOOD;
 
@@ -130,11 +139,11 @@
 	switch (flag) {
 	/* For SS */
 	case SM_INIT_PATTERN:
-		printk(KERN_INFO "SM_INIT_PATTERN\n");
+		dev_dbg(&us->pusb_dev->dev, "SM_INIT_PATTERN\n");
 		memcpy(buf, SM_Init, 0x800);
 		break;
 	case SM_RW_PATTERN:
-		printk(KERN_INFO "SM_RW_PATTERN\n");
+		dev_dbg(&us->pusb_dev->dev, "SM_RW_PATTERN\n");
 		memcpy(buf, SM_Rdwr, 0x800);
 		break;
 	}
@@ -165,12 +174,13 @@
 		     cswlen = 0, partial = 0;
 	unsigned int residue;
 
-	/* printk(KERN_INFO "transport --- ENE_SendScsiCmd\n"); */
+	/* dev_dbg(&us->pusb_dev->dev, "transport --- ENE_SendScsiCmd\n"); */
 	/* send cmd to out endpoint */
 	result = usb_stor_bulk_transfer_buf(us, us->send_bulk_pipe,
 					    bcb, US_BULK_CB_WRAP_LEN, NULL);
 	if (result != USB_STOR_XFER_GOOD) {
-		printk(KERN_ERR "send cmd to out endpoint fail ---\n");
+		dev_err(&us->pusb_dev->dev,
+				"send cmd to out endpoint fail ---\n");
 		return USB_STOR_TRANSPORT_ERROR;
 	}
 
@@ -189,7 +199,7 @@
 			result = usb_stor_bulk_transfer_sg(us, pipe, buf,
 						transfer_length, 0, &partial);
 		if (result != USB_STOR_XFER_GOOD) {
-			printk(KERN_ERR "data transfer fail ---\n");
+			dev_err(&us->pusb_dev->dev, "data transfer fail ---\n");
 			return USB_STOR_TRANSPORT_ERROR;
 		}
 	}
@@ -199,14 +209,16 @@
 						US_BULK_CS_WRAP_LEN, &cswlen);
 
 	if (result == USB_STOR_XFER_SHORT && cswlen == 0) {
-		printk(KERN_WARNING "Received 0-length CSW; retrying...\n");
+		dev_warn(&us->pusb_dev->dev,
+				"Received 0-length CSW; retrying...\n");
 		result = usb_stor_bulk_transfer_buf(us, us->recv_bulk_pipe,
 					bcs, US_BULK_CS_WRAP_LEN, &cswlen);
 	}
 
 	if (result == USB_STOR_XFER_STALLED) {
 		/* get the status again */
-		printk(KERN_WARNING "Attempting to get CSW (2nd try)...\n");
+		dev_warn(&us->pusb_dev->dev,
+				"Attempting to get CSW (2nd try)...\n");
 		result = usb_stor_bulk_transfer_buf(us, us->recv_bulk_pipe,
 						bcs, US_BULK_CS_WRAP_LEN, NULL);
 	}
@@ -243,7 +255,7 @@
 	struct bulk_cs_wrap *bcs = (struct bulk_cs_wrap *) us->iobuf;
 	int result;
 
-	/* printk(KERN_INFO "transport --- ENE_Read_Data\n"); */
+	/* dev_dbg(&us->pusb_dev->dev, "transport --- ENE_Read_Data\n"); */
 	/* set up the command wrapper */
 	memset(bcb, 0, sizeof(struct bulk_cb_wrap));
 	bcb->Signature = cpu_to_le32(US_BULK_CB_SIGN);
@@ -318,55 +330,3 @@
 	return USB_STOR_TRANSPORT_GOOD;
 }
 
-/*
- * usb_stor_print_cmd():
- */
-void usb_stor_print_cmd(struct scsi_cmnd *srb)
-{
-	PBYTE	Cdb = srb->cmnd;
-	DWORD	cmd = Cdb[0];
-	DWORD	bn  =	((Cdb[2] << 24) & 0xff000000) |
-			((Cdb[3] << 16) & 0x00ff0000) |
-			((Cdb[4] << 8) & 0x0000ff00) |
-			((Cdb[5] << 0) & 0x000000ff);
-	WORD	blen = ((Cdb[7] << 8) & 0xff00) | ((Cdb[8] << 0) & 0x00ff);
-
-	switch (cmd) {
-	case TEST_UNIT_READY:
-		/* printk(KERN_INFO
-			 "scsi cmd %X --- SCSIOP_TEST_UNIT_READY\n", cmd); */
-		break;
-	case INQUIRY:
-		printk(KERN_INFO "scsi cmd %X --- SCSIOP_INQUIRY\n", cmd);
-		break;
-	case MODE_SENSE:
-		printk(KERN_INFO "scsi cmd %X --- SCSIOP_MODE_SENSE\n", cmd);
-		break;
-	case START_STOP:
-		printk(KERN_INFO "scsi cmd %X --- SCSIOP_START_STOP\n", cmd);
-		break;
-	case READ_CAPACITY:
-		printk(KERN_INFO "scsi cmd %X --- SCSIOP_READ_CAPACITY\n", cmd);
-		break;
-	case READ_10:
-		/*  printk(KERN_INFO
-			   "scsi cmd %X --- SCSIOP_READ,bn = %X, blen = %X\n"
-			   ,cmd, bn, blen); */
-		break;
-	case WRITE_10:
-		/* printk(KERN_INFO
-			  "scsi cmd %X --- SCSIOP_WRITE,
-			  bn = %X, blen = %X\n" , cmd, bn, blen); */
-		break;
-	case ALLOW_MEDIUM_REMOVAL:
-		printk(KERN_INFO
-			"scsi cmd %X --- SCSIOP_ALLOW_MEDIUM_REMOVAL\n", cmd);
-		break;
-	default:
-		printk(KERN_INFO "scsi cmd %X --- Other cmd\n", cmd);
-		break;
-	}
-	bn = 0;
-	blen = 0;
-}
-

diff --git a/drivers/staging/keucr/scsiglue.c b/drivers/staging/keucr/scsiglue.c
index 48e1005..afb00d8 100644
--- a/drivers/staging/keucr/scsiglue.c
+++ b/drivers/staging/keucr/scsiglue.c

@@ -73,7 +73,8 @@
 		if (us->fflags & US_FL_CAPACITY_HEURISTICS)
 			sdev->guess_capacity = 1;
 		if (sdev->scsi_level > SCSI_2)
-			sdev->sdev_target->scsi_level = sdev->scsi_level = SCSI_2;
+			sdev->sdev_target->scsi_level = sdev->scsi_level
+								= SCSI_2;
 		sdev->retry_hwerror = 1;
 		sdev->allow_restart = 1;
 		sdev->last_sector_bug = 1;
@@ -144,7 +145,7 @@
 	scsi_lock(us_to_host(us));
 	if (us->srb != srb) {
 		scsi_unlock(us_to_host(us));
-		printk("-- nothing to abort\n");
+		dev_info(&us->pusb_dev->dev, "-- nothing to abort\n");
 		return FAILED;
 	}
 
@@ -319,8 +320,11 @@
 	return -EINVAL;
 }
 
-static DEVICE_ATTR(max_sectors, S_IRUGO | S_IWUSR, show_max_sectors, store_max_sectors);
-static struct device_attribute *sysfs_device_attr_list[] = {&dev_attr_max_sectors, NULL, };
+static DEVICE_ATTR(max_sectors, S_IRUGO | S_IWUSR, show_max_sectors,
+							store_max_sectors);
+static struct device_attribute *sysfs_device_attr_list[] = {
+	&dev_attr_max_sectors, NULL,
+};
 
 /* this defines our host template, with which we'll allocate hosts */
 
@@ -393,8 +397,9 @@
 /*
  * usb_stor_access_xfer_buf()
  */
-unsigned int usb_stor_access_xfer_buf(struct us_data *us, unsigned char *buffer,
-	unsigned int buflen, struct scsi_cmnd *srb, struct scatterlist **sgptr,
+unsigned int usb_stor_access_xfer_buf(struct us_data *us,
+	unsigned char *buffer, unsigned int buflen,
+	struct scsi_cmnd *srb, struct scatterlist **sgptr,
 	unsigned int *offset, enum xfer_buf_dir dir)
 {
 	unsigned int cnt;
@@ -424,7 +429,7 @@
 
 		while (sglen > 0) {
 			unsigned int plen = min(sglen,
-						(unsigned int)PAGE_SIZE - poff);
+					(unsigned int)PAGE_SIZE - poff);
 			unsigned char *ptr = kmap(page);
 
 			if (dir == TO_XFER_BUF)

diff --git a/drivers/staging/keucr/smil.h b/drivers/staging/keucr/smil.h
index 24a636a..1538d7b 100644
--- a/drivers/staging/keucr/smil.h
+++ b/drivers/staging/keucr/smil.h

@@ -168,7 +168,7 @@
 /***************************************************************************
 Struct Definition
 ***************************************************************************/
-struct SSFDCTYPE {
+struct keucr_media_info {
 	BYTE Model;
 	BYTE Attribute;
 	BYTE MaxZones;
@@ -177,30 +177,14 @@
 	WORD MaxLogBlocks;
 };
 
-typedef struct SSFDCTYPE_T {
-	BYTE Model;
-	BYTE Attribute;
-	BYTE MaxZones;
-	BYTE MaxSectors;
-	WORD MaxBlocks;
-	WORD MaxLogBlocks;
-} *SSFDCTYPE_T;
-
-struct ADDRESS {
+struct keucr_media_address {
 	BYTE Zone;	/* Zone Number */
 	BYTE Sector;	/* Sector(512byte) Number on Block */
 	WORD PhyBlock;	/* Physical Block Number on Zone */
 	WORD LogBlock;	/* Logical Block Number of Zone */
 };
 
-typedef struct ADDRESS_T {
-	BYTE Zone;	/* Zone Number */
-	BYTE Sector;	/* Sector(512byte) Number on Block */
-	WORD PhyBlock;	/* Physical Block Number on Zone */
-	WORD LogBlock;	/* Logical Block Number of Zone */
-} *ADDRESS_T;
-
-struct CIS_AREA {
+struct keucr_media_area {
 	BYTE Sector;	/* Sector(512byte) Number on Block */
 	WORD PhyBlock;	/* Physical Block Number on Zone 0 */
 };
@@ -215,9 +199,9 @@
 extern WORD	WriteBlock;
 extern DWORD	MediaChange;
 
-extern struct SSFDCTYPE  Ssfdc;
-extern struct ADDRESS    Media;
-extern struct CIS_AREA   CisArea;
+extern struct keucr_media_info    Ssfdc;
+extern struct keucr_media_address Media;
+extern struct keucr_media_area    CisArea;
 
 /*
  * SMILMain.c

diff --git a/drivers/staging/keucr/smilmain.c b/drivers/staging/keucr/smilmain.c
index cc49038..2786808 100644
--- a/drivers/staging/keucr/smilmain.c
+++ b/drivers/staging/keucr/smilmain.c

@@ -4,204 +4,135 @@
 #include "smcommon.h"
 #include "smil.h"
 
-int         Check_D_LogCHS              (WORD *,BYTE *,BYTE *);
-void        Initialize_D_Media          (void);
-void        PowerOff_D_Media            (void);
-int         Check_D_MediaPower          (void);
-int         Check_D_MediaExist          (void);
-int         Check_D_MediaWP             (void);
-int         Check_D_MediaFmt            (struct us_data *);
-int         Check_D_MediaFmtForEraseAll (struct us_data *);
-int         Conv_D_MediaAddr            (struct us_data *, DWORD);
-int         Inc_D_MediaAddr             (struct us_data *);
-int         Check_D_FirstSect           (void);
-int         Check_D_LastSect            (void);
-int         Media_D_ReadOneSect         (struct us_data *, WORD, BYTE *);
-int         Media_D_WriteOneSect        (struct us_data *, WORD, BYTE *);
-int         Media_D_CopyBlockHead       (struct us_data *);
-int         Media_D_CopyBlockTail       (struct us_data *);
-int         Media_D_EraseOneBlock       (void);
-int         Media_D_EraseAllBlock       (void);
+int         Check_D_LogCHS(WORD *, BYTE *, BYTE *);
+void        Initialize_D_Media(void);
+void        PowerOff_D_Media(void);
+int         Check_D_MediaPower(void);
+int         Check_D_MediaExist(void);
+int         Check_D_MediaWP(void);
+int         Check_D_MediaFmt(struct us_data *);
+int         Check_D_MediaFmtForEraseAll(struct us_data *);
+int         Conv_D_MediaAddr(struct us_data *, DWORD);
+int         Inc_D_MediaAddr(struct us_data *);
+int         Check_D_FirstSect(void);
+int         Check_D_LastSect(void);
+int         Media_D_ReadOneSect(struct us_data *, WORD, BYTE *);
+int         Media_D_WriteOneSect(struct us_data *, WORD, BYTE *);
+int         Media_D_CopyBlockHead(struct us_data *);
+int         Media_D_CopyBlockTail(struct us_data *);
+int         Media_D_EraseOneBlock(void);
+int         Media_D_EraseAllBlock(void);
 
-int  Copy_D_BlockAll             (struct us_data *, DWORD);
-int  Copy_D_BlockHead            (struct us_data *);
-int  Copy_D_BlockTail            (struct us_data *);
-int  Reassign_D_BlockHead        (struct us_data *);
+int  Copy_D_BlockAll(struct us_data *, DWORD);
+int  Copy_D_BlockHead(struct us_data *);
+int  Copy_D_BlockTail(struct us_data *);
+int  Reassign_D_BlockHead(struct us_data *);
 
-int  Assign_D_WriteBlock         (void);
-int  Release_D_ReadBlock         (struct us_data *);
-int  Release_D_WriteBlock        (struct us_data *);
-int  Release_D_CopySector        (struct us_data *);
+int  Assign_D_WriteBlock(void);
+int  Release_D_ReadBlock(struct us_data *);
+int  Release_D_WriteBlock(struct us_data *);
+int  Release_D_CopySector(struct us_data *);
 
-int  Copy_D_PhyOneSect           (struct us_data *);
-int  Read_D_PhyOneSect           (struct us_data *, WORD, BYTE *);
-int  Write_D_PhyOneSect          (struct us_data *, WORD, BYTE *);
-int  Erase_D_PhyOneBlock         (struct us_data *);
+int  Copy_D_PhyOneSect(struct us_data *);
+int  Read_D_PhyOneSect(struct us_data *, WORD, BYTE *);
+int  Write_D_PhyOneSect(struct us_data *, WORD, BYTE *);
+int  Erase_D_PhyOneBlock(struct us_data *);
 
-int  Set_D_PhyFmtValue           (struct us_data *);
-int  Search_D_CIS                (struct us_data *);
-int  Make_D_LogTable             (struct us_data *);
-void Check_D_BlockIsFull         (void);
+int  Set_D_PhyFmtValue(struct us_data *);
+int  Search_D_CIS(struct us_data *);
+int  Make_D_LogTable(struct us_data *);
+void Check_D_BlockIsFull(void);
 
-int  MarkFail_D_PhyOneBlock      (struct us_data *);
+int  MarkFail_D_PhyOneBlock(struct us_data *);
 
 DWORD ErrXDCode;
 DWORD ErrCode;
-//BYTE  SectBuf[SECTSIZE];
 static BYTE  WorkBuf[SECTSIZE];
 static BYTE  Redundant[REDTSIZE];
 static BYTE  WorkRedund[REDTSIZE];
-//WORD  Log2Phy[MAX_ZONENUM][MAX_LOGBLOCK];
-static WORD  *Log2Phy[MAX_ZONENUM];                 // 128 x 1000,   Log2Phy[MAX_ZONENUM][MAX_LOGBLOCK];
-static BYTE  Assign[MAX_ZONENUM][MAX_BLOCKNUM/8];
+/* 128 x 1000, Log2Phy[MAX_ZONENUM][MAX_LOGBLOCK]; */
+static WORD  *Log2Phy[MAX_ZONENUM];
+static BYTE  Assign[MAX_ZONENUM][MAX_BLOCKNUM / 8];
 static WORD  AssignStart[MAX_ZONENUM];
 WORD  ReadBlock;
 WORD  WriteBlock;
 DWORD MediaChange;
 static DWORD SectCopyMode;
 
-//BIT Control Macro
-static BYTE BitData[] = { 0x01,0x02,0x04,0x08,0x10,0x20,0x40,0x80 } ;
-#define Set_D_Bit(a,b)    (a[(BYTE)((b)/8)]|= BitData[(b)%8])
-#define Clr_D_Bit(a,b)    (a[(BYTE)((b)/8)]&=~BitData[(b)%8])
-#define Chk_D_Bit(a,b)    (a[(BYTE)((b)/8)] & BitData[(b)%8])
+/* BIT Control Macro */
+static BYTE BitData[] = { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 };
+#define Set_D_Bit(a, b)    (a[(BYTE)((b) / 8)] |= BitData[(b) % 8])
+#define Clr_D_Bit(a, b)    (a[(BYTE)((b) / 8)] &= ~BitData[(b) % 8])
+#define Chk_D_Bit(a, b)    (a[(BYTE)((b) / 8)] & BitData[(b) % 8])
 
-//extern PBYTE    SMHostAddr;
 BYTE     IsSSFDCCompliance;
 BYTE     IsXDCompliance;
 
 
-//
-////Power Control & Media Exist Check Function
-////----- Init_D_SmartMedia() --------------------------------------------
-//int Init_D_SmartMedia(void)
-//{
-//    int     i;
-//
-//    EMCR_Print("Init_D_SmartMedia start\n");
-//    for (i=0; i<MAX_ZONENUM; i++)
-//    {
-//        if (Log2Phy[i]!=NULL)
-//        {
-//            EMCR_Print("ExFreePool Zone = %x, Addr = %x\n", i, Log2Phy[i]);
-//            ExFreePool(Log2Phy[i]);
-//            Log2Phy[i] = NULL;
-//        }
-//    }
-//
-//    Initialize_D_Media();
-//    return(NO_ERROR);
-//}
-
-//----- SM_FreeMem() -------------------------------------------------
+/* ----- SM_FreeMem() ------------------------------------------------- */
 int SM_FreeMem(void)
 {
 	int	i;
 
 	pr_info("SM_FreeMem start\n");
-	for (i=0; i<MAX_ZONENUM; i++)
-	{
-		if (Log2Phy[i]!=NULL)
-		{
+	for (i = 0; i < MAX_ZONENUM; i++) {
+		if (Log2Phy[i] != NULL) {
 			pr_info("Free Zone = %x, Addr = %p\n", i, Log2Phy[i]);
 			kfree(Log2Phy[i]);
 			Log2Phy[i] = NULL;
 		}
 	}
-	return(NO_ERROR);
+	return NO_ERROR;
 }
 
-////----- Pwoff_D_SmartMedia() -------------------------------------------
-//int Pwoff_D_SmartMedia(void)
-//{
-//    PowerOff_D_Media();
-//    return(NO_ERROR);
-//}
-//
-////----- Check_D_SmartMedia() -------------------------------------------
-//int Check_D_SmartMedia(void)
-//{
-//    if (Check_D_MediaExist())
-//        return(ErrCode);
-//
-//    return(NO_ERROR);
-//}
-//
-////----- Check_D_Parameter() --------------------------------------------
-//int Check_D_Parameter(PFDO_DEVICE_EXTENSION fdoExt,WORD *pcyl,BYTE *phead,BYTE *psect)
-//{
-//    if (Check_D_MediaPower())
-//        return(ErrCode);
-//
-//    if (Check_D_MediaFmt(fdoExt))
-//        return(ErrCode);
-//
-//    if (Check_D_LogCHS(pcyl,phead,psect))
-//        return(ErrCode);
-//
-//    return(NO_ERROR);
-//}
-
-//SmartMedia Read/Write/Erase Function
-//----- Media_D_ReadSector() -------------------------------------------
-int Media_D_ReadSector(struct us_data *us, DWORD start,WORD count,BYTE *buf)
+/* SmartMedia Read/Write/Erase Function */
+/* ----- Media_D_ReadSector() ------------------------------------------- */
+int Media_D_ReadSector(struct us_data *us, DWORD start, WORD count, BYTE *buf)
 {
 	WORD len, bn;
 
-	//if (Check_D_MediaPower())        ; ¦b 6250 don't care
-	//    return(ErrCode);
-	//if (Check_D_MediaFmt(fdoExt))    ;
-	//    return(ErrCode);
 	if (Conv_D_MediaAddr(us, start))
-		return(ErrCode);
+		return ErrCode;
 
-	while(1)
-	{
+	while (1) {
 		len = Ssfdc.MaxSectors - Media.Sector;
 		if (count > len)
 			bn = len;
 		else
 			bn = count;
-		//if (Media_D_ReadOneSect(fdoExt, SectBuf))
-		//if (Media_D_ReadOneSect(fdoExt, count, buf))
-		if (Media_D_ReadOneSect(us, bn, buf))
-		{
+
+		if (Media_D_ReadOneSect(us, bn, buf)) {
 			ErrCode = ERR_EccReadErr;
-			return(ErrCode);
+			return ErrCode;
 		}
 
 		Media.Sector += bn;
 		count -= bn;
 
-		if (count<=0)
+		if (count <= 0)
 			break;
 
 		buf += bn * SECTSIZE;
 
 		if (Inc_D_MediaAddr(us))
-			return(ErrCode);
+			return ErrCode;
 	}
 
-	return(NO_ERROR);
+	return NO_ERROR;
 }
-// here
-//----- Media_D_CopySector() ------------------------------------------
-int Media_D_CopySector(struct us_data *us, DWORD start,WORD count,BYTE *buf)
+/* here */
+/* ----- Media_D_CopySector() ------------------------------------------ */
+int Media_D_CopySector(struct us_data *us, DWORD start, WORD count, BYTE *buf)
 {
-	//DWORD mode;
-	//int i;
 	WORD len, bn;
-	//SSFDCTYPE_T aa = (SSFDCTYPE_T ) &Ssfdc;
-	//ADDRESS_T   bb = (ADDRESS_T) &Media;
 
 	/* pr_info("Media_D_CopySector !!!\n"); */
 	if (Conv_D_MediaAddr(us, start))
-		return(ErrCode);
+		return ErrCode;
 
-	while(1)
-	{
+	while (1) {
 		if (Assign_D_WriteBlock())
-			return(ERROR);
+			return ERROR;
 
 		len = Ssfdc.MaxSectors - Media.Sector;
 		if (count > len)
@@ -209,607 +140,137 @@
 		else
 		bn = count;
 
-		//if (Ssfdc_D_CopyBlock(fdoExt,count,buf,Redundant))
-		if (Ssfdc_D_CopyBlock(us,bn,buf,Redundant))
-		{
+		if (Ssfdc_D_CopyBlock(us, bn, buf, Redundant)) {
 			ErrCode = ERR_WriteFault;
-			return(ErrCode);
+			return ErrCode;
 		}
 
 		Media.Sector = 0x1F;
-		//if (Release_D_ReadBlock(fdoExt))
-		if (Release_D_CopySector(us))
-		{
-			if (ErrCode==ERR_HwError)
-			{
+		if (Release_D_CopySector(us)) {
+			if (ErrCode == ERR_HwError) {
 				ErrCode = ERR_WriteFault;
-				return(ErrCode);
+				return ErrCode;
 			}
 		}
 		count -= bn;
 
-		if (count<=0)
+		if (count <= 0)
 			break;
 
 		buf += bn * SECTSIZE;
 
 		if (Inc_D_MediaAddr(us))
-			return(ErrCode);
+			return ErrCode;
 
 	}
-	return(NO_ERROR);
+	return NO_ERROR;
 }
 
-//----- Release_D_CopySector() ------------------------------------------
+/* ----- Release_D_CopySector() ------------------------------------------ */
 int Release_D_CopySector(struct us_data *us)
 {
-	//SSFDCTYPE_T aa = (SSFDCTYPE_T ) &Ssfdc;
-	//ADDRESS_T   bb = (ADDRESS_T) &Media;
+	Log2Phy[Media.Zone][Media.LogBlock] = WriteBlock;
+	Media.PhyBlock = ReadBlock;
 
-	Log2Phy[Media.Zone][Media.LogBlock]=WriteBlock;
-	Media.PhyBlock=ReadBlock;
-
-	if (Media.PhyBlock==NO_ASSIGN)
-	{
-		Media.PhyBlock=WriteBlock;
-		return(SMSUCCESS);
+	if (Media.PhyBlock == NO_ASSIGN) {
+		Media.PhyBlock = WriteBlock;
+		return SMSUCCESS;
 	}
 
-	Clr_D_Bit(Assign[Media.Zone],Media.PhyBlock);
-	Media.PhyBlock=WriteBlock;
+	Clr_D_Bit(Assign[Media.Zone], Media.PhyBlock);
+	Media.PhyBlock = WriteBlock;
 
-	return(SMSUCCESS);
-}
-/*
-//----- Media_D_WriteSector() ------------------------------------------
-int Media_D_WriteSector(PFDO_DEVICE_EXTENSION fdoExt, DWORD start,WORD count,BYTE *buf)
-{
-    int i;
-    WORD len, bn;
-    SSFDCTYPE_T aa = (SSFDCTYPE_T ) &Ssfdc;
-    ADDRESS_T   bb = (ADDRESS_T) &Media;
-
-    //if (Check_D_MediaPower())
-    //    return(ErrCode);
-    //
-    //if (Check_D_MediaFmt(fdoExt))
-    //    return(ErrCode);
-    //
-    //if (Check_D_MediaWP())
-    //    return(ErrCode);
-
-    if (Conv_D_MediaAddr(fdoExt, start))
-        return(ErrCode);
-
-    //ENE_Print("Media_D_WriteSector --- Sector = %x\n", Media.Sector);
-    if (Check_D_FirstSect())
-    {
-        if (Media_D_CopyBlockHead(fdoExt))
-        {
-            ErrCode = ERR_WriteFault;
-            return(ErrCode);
-        }
-    }
-
-    while(1)
-    {
-        if (!Check_D_FirstSect())
-        {
-            if (Assign_D_WriteBlock())
-                return(ErrCode);
-        }
-
-        len = Ssfdc.MaxSectors - Media.Sector;
-        if (count > len)
-           bn = len;
-        else
-           bn = count;
-        //for(i=0;i<SECTSIZE;i++)
-        //    SectBuf[i]=*buf++;
-
-        //if (Media_D_WriteOneSect(fdoExt, SectBuf))
-        if (Media_D_WriteOneSect(fdoExt, bn, buf))
-        {
-            ErrCode = ERR_WriteFault;
-            return(ErrCode);
-        }
-
-        Media.Sector += bn - 1;
-
-        if (!Check_D_LastSect())
-        {
-            if (Release_D_ReadBlock(fdoExt))
-
-            {    if (ErrCode==ERR_HwError)
-                {
-                    ErrCode = ERR_WriteFault;
-                    return(ErrCode);
-                }
-            }
-        }
-
-        count -= bn;
-
-        if (count<=0)
-            break;
-
-        buf += bn * SECTSIZE;
-
-        //if (--count<=0)
-        //    break;
-
-        if (Inc_D_MediaAddr(fdoExt))
-            return(ErrCode);
-    }
-
-    if (!Check_D_LastSect())
-        return(NO_ERROR);
-
-    if (Inc_D_MediaAddr(fdoExt))
-        return(ErrCode);
-
-    if (Media_D_CopyBlockTail(fdoExt))
-    {
-        ErrCode = ERR_WriteFault;
-        return(ErrCode);
-    }
-
-    return(NO_ERROR);
-}
-//
-////----- Media_D_EraseBlock() -------------------------------------------
-//int Media_D_EraseBlock(PFDO_DEVICE_EXTENSION fdoExt, DWORD start,WORD count)
-//{
-//    if (Check_D_MediaPower())
-//        return(ErrCode);
-//
-//    if (Check_D_MediaFmt(fdoExt))
-//        return(ErrCode);
-//
-//    if (Check_D_MediaWP())
-//        return(ErrCode);
-//
-//    if (Conv_D_MediaAddr(start))
-//        return(ErrCode);
-//
-//    while(Check_D_FirstSect()) {
-//        if (Inc_D_MediaAddr(fdoExt))
-//            return(ErrCode);
-//
-//        if (--count<=0)
-//            return(NO_ERROR);
-//    }
-//
-//    while(1) {
-//        if (!Check_D_LastSect())
-//            if (Media_D_EraseOneBlock())
-//                if (ErrCode==ERR_HwError)
-//                {
-//                    ErrCode = ERR_WriteFault;
-//                    return(ErrCode);
-//                }
-//
-//        if (Inc_D_MediaAddr(fdoExt))
-//            return(ErrCode);
-//
-//        if (--count<=0)
-//            return(NO_ERROR);
-//    }
-//}
-//
-////----- Media_D_EraseAll() ---------------------------------------------
-//int Media_D_EraseAll(PFDO_DEVICE_EXTENSION fdoExt)
-//{
-//    if (Check_D_MediaPower())
-//        return(ErrCode);
-//
-//    if (Check_D_MediaFmtForEraseAll(fdoExt))
-//        return(ErrCode);
-//
-//    if (Check_D_MediaWP())
-//        return(ErrCode);
-//
-//    if (Media_D_EraseAllBlock())
-//        return(ErrCode);
-//
-//    return(NO_ERROR);
-//}
-
-//SmartMedia Write Function for One Sector Write Mode
-//----- Media_D_OneSectWriteStart() ------------------------------------
-int Media_D_OneSectWriteStart(PFDO_DEVICE_EXTENSION fdoExt,DWORD start,BYTE *buf)
-{
-//  int i;
-//  SSFDCTYPE_T aa = (SSFDCTYPE_T ) &Ssfdc;
-//  ADDRESS_T   bb = (ADDRESS_T) &Media;
-//
-//  //if (Check_D_MediaPower())
-//  //    return(ErrCode);
-//  //if (Check_D_MediaFmt(fdoExt))
-//  //    return(ErrCode);
-//  //if (Check_D_MediaWP())
-//  //    return(ErrCode);
-//  if (Conv_D_MediaAddr(fdoExt, start))
-//      return(ErrCode);
-//
-//  if (Check_D_FirstSect())
-//      if (Media_D_CopyBlockHead(fdoExt))
-//      {
-//          ErrCode = ERR_WriteFault;
-//          return(ErrCode);
-//      }
-//
-//  if (!Check_D_FirstSect())
-//      if (Assign_D_WriteBlock())
-//          return(ErrCode);
-//
-//  //for(i=0;i<SECTSIZE;i++)
-//  //    SectBuf[i]=*buf++;
-//
-//  //if (Media_D_WriteOneSect(fdoExt, SectBuf))
-//  if (Media_D_WriteOneSect(fdoExt, buf))
-//  {
-//      ErrCode = ERR_WriteFault;
-//      return(ErrCode);
-//  }
-//
-//  if (!Check_D_LastSect())
-//  {
-//      if (Release_D_ReadBlock(fdoExt))
-//          if (ErrCode==ERR_HwError)
-//          {
-//              ErrCode = ERR_WriteFault;
-//              return(ErrCode);
-//          }
-//  }
-
-    return(NO_ERROR);
+	return SMSUCCESS;
 }
 
-//----- Media_D_OneSectWriteNext() -------------------------------------
-int Media_D_OneSectWriteNext(PFDO_DEVICE_EXTENSION fdoExt, BYTE *buf)
-{
-//  int i;
-//  SSFDCTYPE_T aa = (SSFDCTYPE_T ) &Ssfdc;
-//  ADDRESS_T   bb = (ADDRESS_T) &Media;
-//
-//  if (Inc_D_MediaAddr(fdoExt))
-//      return(ErrCode);
-//
-//  if (!Check_D_FirstSect())
-//    if (Assign_D_WriteBlock())
-//      return(ErrCode);
-//
-//  //for(i=0;i<SECTSIZE;i++)
-//  //    SectBuf[i]=*buf++;
-//
-//  //if (Media_D_WriteOneSect(fdoExt, SectBuf))
-//  if (Media_D_WriteOneSect(fdoExt, buf))
-//  {
-//      ErrCode = ERR_WriteFault;
-//      return(ErrCode);
-//  }
-//
-//  if (!Check_D_LastSect())
-//  {
-//      if (Release_D_ReadBlock(fdoExt))
-//          if (ErrCode==ERR_HwError)
-//          {
-//              ErrCode = ERR_WriteFault;
-//              return(ErrCode);
-//          }
-//  }
-
-    return(NO_ERROR);
-}
-
-//----- Media_D_OneSectWriteFlush() ------------------------------------
-int Media_D_OneSectWriteFlush(PFDO_DEVICE_EXTENSION fdoExt)
-{
-    if (!Check_D_LastSect())
-        return(NO_ERROR);
-
-    if (Inc_D_MediaAddr(fdoExt))
-        return(ErrCode);
-
-    if (Media_D_CopyBlockTail(fdoExt))
-    {
-        ErrCode = ERR_WriteFault;
-        return(ErrCode);
-    }
-
-    return(NO_ERROR);
-}
-//
-////LED Tern On/Off Subroutine
-////----- SM_EnableLED() -----------------------------------------------
-//void SM_EnableLED(PFDO_DEVICE_EXTENSION fdoExt, BOOLEAN enable)
-//{
-//    if (fdoExt->Drive_IsSWLED)
-//    {
-//        if (enable)
-//           Led_D_TernOn();
-//        else
-//           Led_D_TernOff();
-//    }
-//}
-//
-////----- Led_D_TernOn() -------------------------------------------------
-//void Led_D_TernOn(void)
-//{
-//    if (Check_D_CardStsChg())
-//        MediaChange=ERROR;
-//
-//    Cnt_D_LedOn();
-//}
-//
-////----- Led_D_TernOff() ------------------------------------------------
-//void Led_D_TernOff(void)
-//{
-//    if (Check_D_CardStsChg())
-//        MediaChange=ERROR;
-//
-//    Cnt_D_LedOff();
-//}
-//
-////SmartMedia Logical Format Subroutine
-////----- Check_D_LogCHS() -----------------------------------------------
-//int Check_D_LogCHS(WORD *c,BYTE *h,BYTE *s)
-//{
-//    switch(Ssfdc.Model) {
-//        case SSFDC1MB:   *c=125; *h= 4; *s= 4; break;
-//        case SSFDC2MB:   *c=125; *h= 4; *s= 8; break;
-//        case SSFDC4MB:   *c=250; *h= 4; *s= 8; break;
-//        case SSFDC8MB:   *c=250; *h= 4; *s=16; break;
-//        case SSFDC16MB:  *c=500; *h= 4; *s=16; break;
-//        case SSFDC32MB:  *c=500; *h= 8; *s=16; break;
-//        case SSFDC64MB:  *c=500; *h= 8; *s=32; break;
-//        case SSFDC128MB: *c=500; *h=16; *s=32; break;
-//        default:         *c= 0;  *h= 0; *s= 0; ErrCode = ERR_NoSmartMedia;    return(ERROR);
-//    }
-//
-//    return(SMSUCCESS);
-//}
-//
-////Power Control & Media Exist Check Subroutine
-////----- Initialize_D_Media() -------------------------------------------
-//void Initialize_D_Media(void)
-//{
-//    ErrCode      = NO_ERROR;
-//    MediaChange  = ERROR;
-//    SectCopyMode = COMPLETED;
-//    Cnt_D_Reset();
-//}
-//
-////----- PowerOff_D_Media() ---------------------------------------------
-//void PowerOff_D_Media(void)
-//{
-//    Cnt_D_PowerOff();
-//}
-//
-////----- Check_D_MediaPower() -------------------------------------------
-//int Check_D_MediaPower(void)
-//{
-//    //usleep(56*1024);
-//    if (Check_D_CardStsChg())
-//        MediaChange = ERROR;
-//    //usleep(56*1024);
-//    if ((!Check_D_CntPower())&&(!MediaChange))  // ¦³ power & Media ¨S³Q change, «h return success
-//        return(SMSUCCESS);
-//    //usleep(56*1024);
-//
-//    if (Check_D_CardExist())                    // Check if card is not exist, return err
-//    {
-//        ErrCode        = ERR_NoSmartMedia;
-//        MediaChange = ERROR;
-//        return(ERROR);
-//    }
-//    //usleep(56*1024);
-//    if (Cnt_D_PowerOn())
-//    {
-//        ErrCode        = ERR_NoSmartMedia;
-//        MediaChange = ERROR;
-//        return(ERROR);
-//    }
-//    //usleep(56*1024);
-//    Ssfdc_D_Reset(fdoExt);
-//    //usleep(56*1024);
-//    return(SMSUCCESS);
-//}
-//
-////-----Check_D_MediaExist() --------------------------------------------
-//int Check_D_MediaExist(void)
-//{
-//    if (Check_D_CardStsChg())
-//        MediaChange = ERROR;
-//
-//    if (!Check_D_CardExist())
-//    {
-//        if (!MediaChange)
-//            return(SMSUCCESS);
-//
-//        ErrCode = ERR_ChangedMedia;
-//        return(ERROR);
-//    }
-//
-//    ErrCode = ERR_NoSmartMedia;
-//
-//    return(ERROR);
-//}
-//
-////----- Check_D_MediaWP() ----------------------------------------------
-//int Check_D_MediaWP(void)
-//{
-//    if (Ssfdc.Attribute &MWP)
-//    {
-//        ErrCode = ERR_WrtProtect;
-//        return(ERROR);
-//    }
-//
-//    return(SMSUCCESS);
-//}
-*/
-//SmartMedia Physical Format Test Subroutine
-//----- Check_D_MediaFmt() ---------------------------------------------
+/* SmartMedia Physical Format Test Subroutine */
+/* ----- Check_D_MediaFmt() --------------------------------------------- */
 int Check_D_MediaFmt(struct us_data *us)
 {
 	pr_info("Check_D_MediaFmt\n");
-	//ULONG i,j, result=FALSE, zone,block;
 
-	//usleep(56*1024);
 	if (!MediaChange)
-		return(SMSUCCESS);
+		return SMSUCCESS;
 
 	MediaChange  = ERROR;
 	SectCopyMode = COMPLETED;
 
-	//usleep(56*1024);
-	if (Set_D_PhyFmtValue(us))
-	{
+	if (Set_D_PhyFmtValue(us)) {
 		ErrCode = ERR_UnknownMedia;
-		return(ERROR);
+		return ERROR;
 	}
-	
-	//usleep(56*1024);
-	if (Search_D_CIS(us))
-	{
+
+	if (Search_D_CIS(us)) {
 		ErrCode = ERR_IllegalFmt;
-		return(ERROR);
+		return ERROR;
 	}
 
-
-    MediaChange = SMSUCCESS;
-    return(SMSUCCESS);
+	MediaChange = SMSUCCESS;
+	return SMSUCCESS;
 }
-/*
-////----- Check_D_BlockIsFull() ----------------------------------
-//void Check_D_BlockIsFull()
-//{
-//    ULONG i, block;
-//
-//    if (IsXDCompliance || IsSSFDCCompliance)
-//    {
-//       // If the blocks are full then return write-protect.
-//       block = Ssfdc.MaxBlocks/8;
-//       for (Media.Zone=0; Media.Zone<Ssfdc.MaxZones; Media.Zone++)
-//       {
-//           if (Log2Phy[Media.Zone]==NULL)
-//           {
-//               if (Make_D_LogTable())
-//               {
-//                   ErrCode = ERR_IllegalFmt;
-//                   return;
-//               }
-//           }
-//
-//           for (i=0; i<block; i++)
-//           {
-//               if (Assign[Media.Zone][i] != 0xFF)
-//                  return;
-//           }
-//       }
-//       Ssfdc.Attribute |= WP;
-//    }
-//}
-//
-//
-////----- Check_D_MediaFmtForEraseAll() ----------------------------------
-//int Check_D_MediaFmtForEraseAll(PFDO_DEVICE_EXTENSION fdoExt)
-//{
-//    MediaChange  = ERROR;
-//    SectCopyMode = COMPLETED;
-//
-//    if (Set_D_PhyFmtValue(fdoExt))
-//    {
-//        ErrCode = ERR_UnknownMedia;
-//        return(ERROR);
-//    }
-//
-//    if (Search_D_CIS(fdoExt))
-//    {
-//        ErrCode = ERR_IllegalFmt;
-//        return(ERROR);
-//    }
-//
-//    return(SMSUCCESS);
-//}
-*/
-//SmartMedia Physical Address Control Subroutine
-//----- Conv_D_MediaAddr() ---------------------------------------------
+
+/* SmartMedia Physical Address Control Subroutine */
+/* ----- Conv_D_MediaAddr() --------------------------------------------- */
 int Conv_D_MediaAddr(struct us_data *us, DWORD addr)
 {
 	DWORD temp;
-	//ULONG  zz;
-	//SSFDCTYPE_T aa = (SSFDCTYPE_T ) &Ssfdc;
-	//ADDRESS_T   bb = (ADDRESS_T) &Media;
 
-	temp           = addr/Ssfdc.MaxSectors;
-	Media.Zone     = (BYTE) (temp/Ssfdc.MaxLogBlocks);
+	temp           = addr / Ssfdc.MaxSectors;
+	Media.Zone     = (BYTE) (temp / Ssfdc.MaxLogBlocks);
 
-	if (Log2Phy[Media.Zone]==NULL)
-	{
-		if (Make_D_LogTable(us))
-		{
+	if (Log2Phy[Media.Zone] == NULL) {
+		if (Make_D_LogTable(us)) {
 			ErrCode = ERR_IllegalFmt;
-			return(ERROR);
+			return ERROR;
 		}
 	}
 
-	Media.Sector   = (BYTE) (addr%Ssfdc.MaxSectors);
-	Media.LogBlock = (WORD) (temp%Ssfdc.MaxLogBlocks);
+	Media.Sector   = (BYTE) (addr % Ssfdc.MaxSectors);
+	Media.LogBlock = (WORD) (temp % Ssfdc.MaxLogBlocks);
 
-	if (Media.Zone<Ssfdc.MaxZones)
-	{
+	if (Media.Zone < Ssfdc.MaxZones) {
 		Clr_D_RedundantData(Redundant);
 		Set_D_LogBlockAddr(Redundant);
 		Media.PhyBlock = Log2Phy[Media.Zone][Media.LogBlock];
-		return(SMSUCCESS);
+		return SMSUCCESS;
 	}
 
 	ErrCode = ERR_OutOfLBA;
-	return(ERROR);
+	return ERROR;
 }
 
-//----- Inc_D_MediaAddr() ----------------------------------------------
+/* ----- Inc_D_MediaAddr() ---------------------------------------------- */
 int Inc_D_MediaAddr(struct us_data *us)
 {
 	WORD        LogBlock = Media.LogBlock;
-	//SSFDCTYPE_T aa = (SSFDCTYPE_T ) &Ssfdc;
-	//ADDRESS_T   bb = (ADDRESS_T) &Media;
 
-	if (++Media.Sector<Ssfdc.MaxSectors)
-		return(SMSUCCESS);
+	if (++Media.Sector < Ssfdc.MaxSectors)
+		return SMSUCCESS;
 
-	if (Log2Phy[Media.Zone]==NULL)
-	{
-		if (Make_D_LogTable(us))
-		{
+	if (Log2Phy[Media.Zone] == NULL) {
+		if (Make_D_LogTable(us)) {
 			ErrCode = ERR_IllegalFmt;
-			return(ERROR);
+			return ERROR;
 		}
 	}
 
-	Media.Sector=0;
+	Media.Sector = 0;
 	Media.LogBlock = LogBlock;
 
-	if (++Media.LogBlock<Ssfdc.MaxLogBlocks)
-	{
+	if (++Media.LogBlock < Ssfdc.MaxLogBlocks) {
 		Clr_D_RedundantData(Redundant);
 		Set_D_LogBlockAddr(Redundant);
-		Media.PhyBlock=Log2Phy[Media.Zone][Media.LogBlock];
-		return(SMSUCCESS);
+		Media.PhyBlock = Log2Phy[Media.Zone][Media.LogBlock];
+		return SMSUCCESS;
 	}
 
-	Media.LogBlock=0;
+	Media.LogBlock = 0;
 
-	if (++Media.Zone<Ssfdc.MaxZones)
-	{
-		if (Log2Phy[Media.Zone]==NULL)
-		{
-			if (Make_D_LogTable(us))
-			{
+	if (++Media.Zone < Ssfdc.MaxZones) {
+		if (Log2Phy[Media.Zone] == NULL) {
+			if (Make_D_LogTable(us)) {
 				ErrCode = ERR_IllegalFmt;
-				return(ERROR);
+				return ERROR;
 			}
 		}
 
@@ -817,1034 +278,508 @@
 
 		Clr_D_RedundantData(Redundant);
 		Set_D_LogBlockAddr(Redundant);
-		Media.PhyBlock=Log2Phy[Media.Zone][Media.LogBlock];
-		return(SMSUCCESS);
+		Media.PhyBlock = Log2Phy[Media.Zone][Media.LogBlock];
+		return SMSUCCESS;
 	}
 
-	Media.Zone=0;
+	Media.Zone = 0;
 	ErrCode = ERR_OutOfLBA;
 
-	return(ERROR);
-}
-/*
-//----- Check_D_FirstSect() --------------------------------------------
-int Check_D_FirstSect(void)
-{
-    SSFDCTYPE_T aa = (SSFDCTYPE_T ) &Ssfdc;
-    ADDRESS_T   bb = (ADDRESS_T) &Media;
-
-    if (!Media.Sector)
-        return(SMSUCCESS);
-
-    return(ERROR);
+	return ERROR;
 }
 
-//----- Check_D_LastSect() ---------------------------------------------
-int Check_D_LastSect(void)
-{
-    SSFDCTYPE_T aa = (SSFDCTYPE_T ) &Ssfdc;
-    ADDRESS_T   bb = (ADDRESS_T) &Media;
-
-    if (Media.Sector<(Ssfdc.MaxSectors-1))
-        return(ERROR);
-
-    return(SMSUCCESS);
-}
-*/
-//SmartMedia Read/Write Subroutine with Retry
-//----- Media_D_ReadOneSect() ------------------------------------------
+/* SmartMedia Read/Write Subroutine with Retry */
+/* ----- Media_D_ReadOneSect() ------------------------------------------ */
 int Media_D_ReadOneSect(struct us_data *us, WORD count, BYTE *buf)
 {
 	DWORD err, retry;
 
 	if (!Read_D_PhyOneSect(us, count, buf))
-		return(SMSUCCESS);
-	if (ErrCode==ERR_HwError)
-		return(ERROR);
-	if (ErrCode==ERR_DataStatus)
-		return(ERROR);
+		return SMSUCCESS;
+	if (ErrCode == ERR_HwError)
+		return ERROR;
+	if (ErrCode == ERR_DataStatus)
+		return ERROR;
 
 #ifdef RDERR_REASSIGN
-	if (Ssfdc.Attribute &MWP)
-	{
-		if (ErrCode==ERR_CorReadErr)
-			return(SMSUCCESS);
-		return(ERROR);
+	if (Ssfdc.Attribute & MWP) {
+		if (ErrCode == ERR_CorReadErr)
+			return SMSUCCESS;
+		return ERROR;
 	}
 
-	err=ErrCode;
-	for(retry=0; retry<2; retry++)
-	{
-		if (Copy_D_BlockAll(us, (err==ERR_EccReadErr)?REQ_FAIL:REQ_ERASE))
-		{
-			if (ErrCode==ERR_HwError)
-				return(ERROR);
+	err = ErrCode;
+	for (retry = 0; retry < 2; retry++) {
+		if (Copy_D_BlockAll(us,
+			(err == ERR_EccReadErr) ? REQ_FAIL : REQ_ERASE)) {
+			if (ErrCode == ERR_HwError)
+				return ERROR;
 			continue;
 		}
 
 		ErrCode = err;
-		if (ErrCode==ERR_CorReadErr)
-			return(SMSUCCESS);
-		return(ERROR);
+		if (ErrCode == ERR_CorReadErr)
+			return SMSUCCESS;
+		return ERROR;
 	}
 
 	MediaChange = ERROR;
 #else
-	if (ErrCode==ERR_CorReadErr) return(SMSUCCESS);
+	if (ErrCode == ERR_CorReadErr)
+		return SMSUCCESS;
 #endif
 
-	return(ERROR);
-}
-/*
-//----- Media_D_WriteOneSect() -----------------------------------------
-int Media_D_WriteOneSect(PFDO_DEVICE_EXTENSION fdoExt, WORD count, BYTE *buf)
-{
-    DWORD retry;
-    SSFDCTYPE_T aa = (SSFDCTYPE_T ) &Ssfdc;
-    ADDRESS_T   bb = (ADDRESS_T) &Media;
-
-    if (!Write_D_PhyOneSect(fdoExt, count, buf))
-        return(SMSUCCESS);
-    if (ErrCode==ERR_HwError)
-        return(ERROR);
-
-    for(retry=1; retry<2; retry++)
-    {
-        if (Reassign_D_BlockHead(fdoExt))
-        {
-            if (ErrCode==ERR_HwError)
-                return(ERROR);
-            continue;
-        }
-
-        if (!Write_D_PhyOneSect(fdoExt, count, buf))
-            return(SMSUCCESS);
-        if (ErrCode==ERR_HwError)
-            return(ERROR);
-    }
-
-    if (Release_D_WriteBlock(fdoExt))
-        return(ERROR);
-
-    ErrCode        = ERR_WriteFault;
-    MediaChange = ERROR;
-    return(ERROR);
+	return ERROR;
 }
 
-//SmartMedia Data Copy Subroutine with Retry
-//----- Media_D_CopyBlockHead() ----------------------------------------
-int Media_D_CopyBlockHead(PFDO_DEVICE_EXTENSION fdoExt)
-{
-    DWORD retry;
-
-    for(retry=0; retry<2; retry++)
-    {
-        if (!Copy_D_BlockHead(fdoExt))
-            return(SMSUCCESS);
-        if (ErrCode==ERR_HwError)
-            return(ERROR);
-    }
-
-    MediaChange = ERROR;
-    return(ERROR);
-}
-
-//----- Media_D_CopyBlockTail() ----------------------------------------
-int Media_D_CopyBlockTail(PFDO_DEVICE_EXTENSION fdoExt)
-{
-    DWORD retry;
-
-    if (!Copy_D_BlockTail(fdoExt))
-        return(SMSUCCESS);
-    if (ErrCode==ERR_HwError)
-        return(ERROR);
-
-    for(retry=1; retry<2; retry++)
-    {
-        if (Reassign_D_BlockHead(fdoExt))
-        {
-            if (ErrCode==ERR_HwError)
-                return(ERROR);
-            continue;
-        }
-
-        if (!Copy_D_BlockTail(fdoExt))
-            return(SMSUCCESS);
-        if (ErrCode==ERR_HwError)
-            return(ERROR);
-    }
-
-    if (Release_D_WriteBlock(fdoExt))
-        return(ERROR);
-
-    ErrCode        = ERR_WriteFault;
-    MediaChange = ERROR;
-    return(ERROR);
-}
-//
-////----- Media_D_EraseOneBlock() ----------------------------------------
-//int Media_D_EraseOneBlock(void)
-//{
-//    WORD        LogBlock = Media.LogBlock;
-//    WORD        PhyBlock = Media.PhyBlock;
-//    SSFDCTYPE_T aa = (SSFDCTYPE_T ) &Ssfdc;
-//    ADDRESS_T   bb = (ADDRESS_T) &Media;
-//
-//    if (Media.PhyBlock==NO_ASSIGN)
-//        return(SMSUCCESS);
-//
-//    if (Log2Phy[Media.Zone]==NULL)
-//    {
-//        if (Make_D_LogTable())
-//        {
-//            ErrCode = ERR_IllegalFmt;
-//            return(ERROR);
-//        }
-//    }
-//    Media.LogBlock = LogBlock;
-//    Media.PhyBlock = PhyBlock;
-//
-//    Log2Phy[Media.Zone][Media.LogBlock]=NO_ASSIGN;
-//
-//    if (Erase_D_PhyOneBlock(fdoExt))
-//    {
-//        if (ErrCode==ERR_HwError)
-//            return(ERROR);
-//        if (MarkFail_D_PhyOneBlock())
-//            return(ERROR);
-//
-//        ErrCode = ERR_WriteFault;
-//        return(ERROR);
-//    }
-//
-//    Clr_D_Bit(Assign[Media.Zone],Media.PhyBlock);
-//    Media.PhyBlock=NO_ASSIGN;
-//    return(SMSUCCESS);
-//}
-//
-////SmartMedia Erase Subroutine
-////----- Media_D_EraseAllBlock() ----------------------------------------
-//int Media_D_EraseAllBlock(void)
-//{
-//    WORD cis=0;
-//
-//    SSFDCTYPE_T aa = (SSFDCTYPE_T ) &Ssfdc;
-//    ADDRESS_T   bb = (ADDRESS_T) &Media;
-//
-//    MediaChange = ERROR;
-//    Media.Sector   = 0;
-//
-//    for(Media.Zone=0; Media.Zone<Ssfdc.MaxZones; Media.Zone++)
-//        for(Media.PhyBlock=0; Media.PhyBlock<Ssfdc.MaxBlocks; Media.PhyBlock++) {
-//            if (Ssfdc_D_ReadRedtData(Redundant))
-//            {
-//                Ssfdc_D_Reset(fdoExt);
-//                return(ERROR);
-//            }
-//
-//            Ssfdc_D_Reset(fdoExt);
-//            if (!Check_D_FailBlock(Redundant))
-//            {
-//                if (cis)
-//                {
-//                    if (Ssfdc_D_EraseBlock(fdoExt))
-//                    {
-//                        ErrCode = ERR_HwError;
-//                        return(ERROR);
-//                    }
-//
-//                    if (Ssfdc_D_CheckStatus())
-//                    {
-//                        if (MarkFail_D_PhyOneBlock())
-//                            return(ERROR);
-//                    }
-//
-//                    continue;
-//                }
-//
-//                if (Media.PhyBlock!=CisArea.PhyBlock)
-//                {
-//                    ErrCode = ERR_IllegalFmt;
-//                    return(ERROR);
-//                }
-//
-//                cis++;
-//            }
-//
-//        }
-//    return(SMSUCCESS);
-//}
-*/
-//SmartMedia Physical Sector Data Copy Subroutine
-//----- Copy_D_BlockAll() ----------------------------------------------
+/* SmartMedia Physical Sector Data Copy Subroutine */
+/* ----- Copy_D_BlockAll() ---------------------------------------------- */
 int Copy_D_BlockAll(struct us_data *us, DWORD mode)
 {
 	BYTE sect;
-	//SSFDCTYPE_T aa = (SSFDCTYPE_T ) &Ssfdc;
-	//ADDRESS_T   bb = (ADDRESS_T) &Media;
 
-	sect=Media.Sector;
+	sect = Media.Sector;
 
 	if (Assign_D_WriteBlock())
-		return(ERROR);
-	if (mode==REQ_FAIL)
-		SectCopyMode=REQ_FAIL;
+		return ERROR;
+	if (mode == REQ_FAIL)
+		SectCopyMode = REQ_FAIL;
 
-	for(Media.Sector=0; Media.Sector<Ssfdc.MaxSectors; Media.Sector++)
-	{
-		if (Copy_D_PhyOneSect(us))
-		{
-			if (ErrCode==ERR_HwError)
-				return(ERROR);
+	for (Media.Sector = 0; Media.Sector < Ssfdc.MaxSectors;
+							Media.Sector++) {
+		if (Copy_D_PhyOneSect(us)) {
+			if (ErrCode == ERR_HwError)
+				return ERROR;
 			if (Release_D_WriteBlock(us))
-				return(ERROR);
+				return ERROR;
 
 			ErrCode = ERR_WriteFault;
-			Media.PhyBlock=ReadBlock;
-			Media.Sector=sect;
+			Media.PhyBlock = ReadBlock;
+			Media.Sector = sect;
 
-			return(ERROR);
+			return ERROR;
 		}
 	}
 
 	if (Release_D_ReadBlock(us))
-		return(ERROR);
+		return ERROR;
 
-	Media.PhyBlock=WriteBlock;
-	Media.Sector=sect;
-	return(SMSUCCESS);
-}
-/*
-//----- Copy_D_BlockHead() ---------------------------------------------
-int Copy_D_BlockHead(PFDO_DEVICE_EXTENSION fdoExt)
-{
-    BYTE sect;
-    SSFDCTYPE_T aa = (SSFDCTYPE_T ) &Ssfdc;
-    ADDRESS_T   bb = (ADDRESS_T) &Media;
-
-    sect=Media.Sector;
-    if (Assign_D_WriteBlock())
-        return(ERROR);
-
-    for(Media.Sector=0; Media.Sector<sect; Media.Sector++)
-    {
-        if (Copy_D_PhyOneSect(fdoExt))
-        {
-            if (ErrCode==ERR_HwError)
-                return(ERROR);
-            if (Release_D_WriteBlock(fdoExt))
-                return(ERROR);
-
-            ErrCode = ERR_WriteFault;
-            Media.PhyBlock=ReadBlock;
-            Media.Sector=sect;
-
-            return(ERROR);
-        }
-    }
-
-    Media.PhyBlock=WriteBlock;
-    Media.Sector=sect;
-    return(SMSUCCESS);
+	Media.PhyBlock = WriteBlock;
+	Media.Sector = sect;
+	return SMSUCCESS;
 }
 
-//----- Copy_D_BlockTail() ---------------------------------------------
-int Copy_D_BlockTail(PFDO_DEVICE_EXTENSION fdoExt)
-{
-    BYTE sect;
-    SSFDCTYPE_T aa = (SSFDCTYPE_T ) &Ssfdc;
-    ADDRESS_T   bb = (ADDRESS_T) &Media;
-
-    for(sect=Media.Sector; Media.Sector<Ssfdc.MaxSectors; Media.Sector++)
-    {
-        if (Copy_D_PhyOneSect(fdoExt))
-        {
-            if (ErrCode==ERR_HwError)
-                return(ERROR);
-
-            Media.PhyBlock=WriteBlock;
-            Media.Sector=sect;
-
-            return(ERROR);
-        }
-    }
-
-    if (Release_D_ReadBlock(fdoExt))
-        return(ERROR);
-
-    Media.PhyBlock=WriteBlock;
-    Media.Sector=sect;
-    return(SMSUCCESS);
-}
-
-//----- Reassign_D_BlockHead() -----------------------------------------
-int Reassign_D_BlockHead(PFDO_DEVICE_EXTENSION fdoExt)
-{
-    DWORD  mode;
-    WORD   block;
-    BYTE   sect;
-    SSFDCTYPE_T aa = (SSFDCTYPE_T ) &Ssfdc;
-    ADDRESS_T   bb = (ADDRESS_T) &Media;
-
-    mode=SectCopyMode;
-    block=ReadBlock;
-    sect=Media.Sector;
-
-    if (Assign_D_WriteBlock())
-        return(ERROR);
-
-    SectCopyMode=REQ_FAIL;
-
-    for(Media.Sector=0; Media.Sector<sect; Media.Sector++)
-    {
-        if (Copy_D_PhyOneSect(fdoExt))
-        {
-            if (ErrCode==ERR_HwError)
-                return(ERROR);
-            if (Release_D_WriteBlock(fdoExt))
-                return(ERROR);
-
-            ErrCode = ERR_WriteFault;
-            SectCopyMode=mode;
-            WriteBlock=ReadBlock;
-            ReadBlock=block;
-            Media.Sector=sect;
-            Media.PhyBlock=WriteBlock;
-
-            return(ERROR);
-        }
-    }
-
-    if (Release_D_ReadBlock(fdoExt))
-        return(ERROR);
-
-    SectCopyMode=mode;
-    ReadBlock=block;
-    Media.Sector=sect;
-    Media.PhyBlock=WriteBlock;
-    return(SMSUCCESS);
-}
-*/
-//SmartMedia Physical Block Assign/Release Subroutine
-//----- Assign_D_WriteBlock() ------------------------------------------
+/* SmartMedia Physical Block Assign/Release Subroutine */
+/* ----- Assign_D_WriteBlock() ------------------------------------------ */
 int Assign_D_WriteBlock(void)
 {
-	//SSFDCTYPE_T aa = (SSFDCTYPE_T ) &Ssfdc;
-	//ADDRESS_T   bb = (ADDRESS_T) &Media;
-	ReadBlock=Media.PhyBlock;
+	ReadBlock = Media.PhyBlock;
 
-	for(WriteBlock=AssignStart[Media.Zone]; WriteBlock<Ssfdc.MaxBlocks; WriteBlock++)
-	{
-		if (!Chk_D_Bit(Assign[Media.Zone],WriteBlock))
-		{
-			Set_D_Bit(Assign[Media.Zone],WriteBlock);
-			AssignStart[Media.Zone]=WriteBlock+1;
-			Media.PhyBlock=WriteBlock;
-			SectCopyMode=REQ_ERASE;
-			//ErrXDCode = NO_ERROR;
-			return(SMSUCCESS);
+	for (WriteBlock = AssignStart[Media.Zone];
+			WriteBlock < Ssfdc.MaxBlocks; WriteBlock++) {
+		if (!Chk_D_Bit(Assign[Media.Zone], WriteBlock)) {
+			Set_D_Bit(Assign[Media.Zone], WriteBlock);
+			AssignStart[Media.Zone] = WriteBlock + 1;
+			Media.PhyBlock = WriteBlock;
+			SectCopyMode = REQ_ERASE;
+			return SMSUCCESS;
 		}
 	}
 
-	for(WriteBlock=0; WriteBlock<AssignStart[Media.Zone]; WriteBlock++)
-	{
-		if (!Chk_D_Bit(Assign[Media.Zone],WriteBlock))
-		{
-			Set_D_Bit(Assign[Media.Zone],WriteBlock);
-			AssignStart[Media.Zone]=WriteBlock+1;
-			Media.PhyBlock=WriteBlock;
-			SectCopyMode=REQ_ERASE;
-			//ErrXDCode = NO_ERROR;
-			return(SMSUCCESS);
+	for (WriteBlock = 0;
+			WriteBlock < AssignStart[Media.Zone]; WriteBlock++) {
+		if (!Chk_D_Bit(Assign[Media.Zone], WriteBlock)) {
+			Set_D_Bit(Assign[Media.Zone], WriteBlock);
+			AssignStart[Media.Zone] = WriteBlock + 1;
+			Media.PhyBlock = WriteBlock;
+			SectCopyMode = REQ_ERASE;
+			return SMSUCCESS;
 		}
 	}
 
-	WriteBlock=NO_ASSIGN;
+	WriteBlock = NO_ASSIGN;
 	ErrCode = ERR_WriteFault;
-	// For xD test
-	//Ssfdc.Attribute |= WP;
-	//ErrXDCode = ERR_WrtProtect;
-	return(ERROR);
+
+	return ERROR;
 }
 
-//----- Release_D_ReadBlock() ------------------------------------------
+/* ----- Release_D_ReadBlock() ------------------------------------------ */
 int Release_D_ReadBlock(struct us_data *us)
 {
 	DWORD mode;
-	//SSFDCTYPE_T aa = (SSFDCTYPE_T ) &Ssfdc;
-	//ADDRESS_T   bb = (ADDRESS_T) &Media;
 
-	mode=SectCopyMode;
-	SectCopyMode=COMPLETED;
+	mode = SectCopyMode;
+	SectCopyMode = COMPLETED;
 
-	if (mode==COMPLETED)
-		return(SMSUCCESS);
+	if (mode == COMPLETED)
+		return SMSUCCESS;
 
-	Log2Phy[Media.Zone][Media.LogBlock]=WriteBlock;
-	Media.PhyBlock=ReadBlock;
+	Log2Phy[Media.Zone][Media.LogBlock] = WriteBlock;
+	Media.PhyBlock = ReadBlock;
 
-	if (Media.PhyBlock==NO_ASSIGN)
-	{
-		Media.PhyBlock=WriteBlock;
-		return(SMSUCCESS);
+	if (Media.PhyBlock == NO_ASSIGN) {
+		Media.PhyBlock = WriteBlock;
+		return SMSUCCESS;
 	}
 
-	if (mode==REQ_ERASE)
-	{
-		if (Erase_D_PhyOneBlock(us))
-		{
-			if (ErrCode==ERR_HwError) return(ERROR);
-			if (MarkFail_D_PhyOneBlock(us)) return(ERROR);
-		}
-		else
-			Clr_D_Bit(Assign[Media.Zone],Media.PhyBlock);
-	}
-	else if (MarkFail_D_PhyOneBlock(us))
-		return(ERROR);
+	if (mode == REQ_ERASE) {
+		if (Erase_D_PhyOneBlock(us)) {
+			if (ErrCode == ERR_HwError)
+				return ERROR;
+			if (MarkFail_D_PhyOneBlock(us))
+				return ERROR;
+		} else
+			Clr_D_Bit(Assign[Media.Zone], Media.PhyBlock);
+	} else if (MarkFail_D_PhyOneBlock(us))
+		return ERROR;
 
-	Media.PhyBlock=WriteBlock;
-	return(SMSUCCESS);
+	Media.PhyBlock = WriteBlock;
+	return SMSUCCESS;
 }
 
-//----- Release_D_WriteBlock() -----------------------------------------
+/* ----- Release_D_WriteBlock() ----------------------------------------- */
 int Release_D_WriteBlock(struct us_data *us)
 {
-	//SSFDCTYPE_T aa = (SSFDCTYPE_T ) &Ssfdc;
-	//ADDRESS_T   bb = (ADDRESS_T) &Media;
-	SectCopyMode=COMPLETED;
-	Media.PhyBlock=WriteBlock;
+	SectCopyMode = COMPLETED;
+	Media.PhyBlock = WriteBlock;
 
 	if (MarkFail_D_PhyOneBlock(us))
-		return(ERROR);
+		return ERROR;
 
-	Media.PhyBlock=ReadBlock;
-	return(SMSUCCESS);
+	Media.PhyBlock = ReadBlock;
+	return SMSUCCESS;
 }
 
-//SmartMedia Physical Sector Data Copy Subroutine
-//----- Copy_D_PhyOneSect() --------------------------------------------
+/* SmartMedia Physical Sector Data Copy Subroutine */
+/* ----- Copy_D_PhyOneSect() -------------------------------------------- */
 int Copy_D_PhyOneSect(struct us_data *us)
 {
 	int           i;
 	DWORD  err, retry;
-	//SSFDCTYPE_T aa = (SSFDCTYPE_T ) &Ssfdc;
-	//ADDRESS_T   bb = (ADDRESS_T) &Media;
 
 	/* pr_info("Copy_D_PhyOneSect --- Secotr = %x\n", Media.Sector); */
-	if (ReadBlock!=NO_ASSIGN)
-	{
-		Media.PhyBlock=ReadBlock;
-		for(retry=0; retry<2; retry++)
-		{
-			if (retry!=0)
-			{
+	if (ReadBlock != NO_ASSIGN) {
+		Media.PhyBlock = ReadBlock;
+		for (retry = 0; retry < 2; retry++) {
+			if (retry != 0) {
 				Ssfdc_D_Reset(us);
-				if (Ssfdc_D_ReadCisSect(us,WorkBuf,WorkRedund))
-				{ ErrCode = ERR_HwError; MediaChange=ERROR; return(ERROR); }
+				if (Ssfdc_D_ReadCisSect(us, WorkBuf,
+								WorkRedund)) {
+					ErrCode = ERR_HwError;
+					MediaChange = ERROR;
+					return ERROR;
+				}
 
-				if (Check_D_CISdata(WorkBuf,WorkRedund))
-				{ ErrCode = ERR_HwError; MediaChange=ERROR; return(ERROR); }
+				if (Check_D_CISdata(WorkBuf, WorkRedund)) {
+					ErrCode = ERR_HwError;
+					MediaChange = ERROR;
+					return ERROR;
+				}
 			}
 
-			if (Ssfdc_D_ReadSect(us,WorkBuf,WorkRedund))
-			{ ErrCode = ERR_HwError; MediaChange=ERROR; return(ERROR); }
-			if (Check_D_DataStatus(WorkRedund))
-			{ err=ERROR; break; }
-			if (!Check_D_ReadError(WorkRedund))
-			{ err=SMSUCCESS; break; }
-			if (!Check_D_Correct(WorkBuf,WorkRedund))
-			{ err=SMSUCCESS; break; }
+			if (Ssfdc_D_ReadSect(us, WorkBuf, WorkRedund)) {
+				ErrCode = ERR_HwError;
+				MediaChange = ERROR;
+				return ERROR;
+			}
+			if (Check_D_DataStatus(WorkRedund)) {
+				err = ERROR;
+				break;
+			}
+			if (!Check_D_ReadError(WorkRedund)) {
+				err = SMSUCCESS;
+				break;
+			}
+			if (!Check_D_Correct(WorkBuf, WorkRedund)) {
+				err = SMSUCCESS;
+				break;
+			}
 
-			err=ERROR;
-			SectCopyMode=REQ_FAIL;
+			err = ERROR;
+			SectCopyMode = REQ_FAIL;
 		}
-	}
-	else
-	{
-		err=SMSUCCESS;
-		for(i=0; i<SECTSIZE; i++)
-			WorkBuf[i]=DUMMY_DATA;
+	} else {
+		err = SMSUCCESS;
+		for (i = 0; i < SECTSIZE; i++)
+			WorkBuf[i] = DUMMY_DATA;
 		Clr_D_RedundantData(WorkRedund);
 	}
 
 	Set_D_LogBlockAddr(WorkRedund);
-	if (err==ERROR)
-	{
+	if (err == ERROR) {
 		Set_D_RightECC(WorkRedund);
 		Set_D_DataStaus(WorkRedund);
 	}
 
-	Media.PhyBlock=WriteBlock;
+	Media.PhyBlock = WriteBlock;
 
-	if (Ssfdc_D_WriteSectForCopy(us, WorkBuf, WorkRedund))
-	{ ErrCode = ERR_HwError; MediaChange=ERROR; return(ERROR); }
-	if (Ssfdc_D_CheckStatus())
-	{ ErrCode = ERR_WriteFault; return(ERROR); }
+	if (Ssfdc_D_WriteSectForCopy(us, WorkBuf, WorkRedund)) {
+		ErrCode = ERR_HwError;
+		MediaChange = ERROR;
+		return ERROR;
+	}
+	if (Ssfdc_D_CheckStatus()) {
+		ErrCode = ERR_WriteFault;
+		return ERROR;
+	}
 
-	Media.PhyBlock=ReadBlock;
-	return(SMSUCCESS);
+	Media.PhyBlock = ReadBlock;
+	return SMSUCCESS;
 }
 
-//SmartMedia Physical Sector Read/Write/Erase Subroutine
-//----- Read_D_PhyOneSect() --------------------------------------------
+/* SmartMedia Physical Sector Read/Write/Erase Subroutine */
+/* ----- Read_D_PhyOneSect() -------------------------------------------- */
 int Read_D_PhyOneSect(struct us_data *us, WORD count, BYTE *buf)
 {
 	int           i;
 	DWORD  retry;
-	//SSFDCTYPE_T aa = (SSFDCTYPE_T ) &Ssfdc;
-	//ADDRESS_T   bb = (ADDRESS_T) &Media;
 
-	if (Media.PhyBlock==NO_ASSIGN)
-	{
-		for(i=0; i<SECTSIZE; i++)
-			*buf++=DUMMY_DATA;
-		return(SMSUCCESS);
+	if (Media.PhyBlock == NO_ASSIGN) {
+		for (i = 0; i < SECTSIZE; i++)
+			*buf++ = DUMMY_DATA;
+		return SMSUCCESS;
 	}
 
-	for(retry=0; retry<2; retry++)
-	{
-		if (retry!=0)
-		{
+	for (retry = 0; retry < 2; retry++) {
+		if (retry != 0) {
 			Ssfdc_D_Reset(us);
 
-			if (Ssfdc_D_ReadCisSect(us,WorkBuf,WorkRedund))
-			{ ErrCode = ERR_HwError; MediaChange=ERROR; return(ERROR); }
-			if (Check_D_CISdata(WorkBuf,WorkRedund))
-			{ ErrCode = ERR_HwError; MediaChange=ERROR; return(ERROR); }
+			if (Ssfdc_D_ReadCisSect(us, WorkBuf, WorkRedund)) {
+				ErrCode = ERR_HwError;
+				MediaChange = ERROR;
+				return ERROR;
+			}
+			if (Check_D_CISdata(WorkBuf, WorkRedund)) {
+				ErrCode = ERR_HwError;
+				MediaChange = ERROR;
+				return ERROR;
+			}
 		}
 
-		//if (Ssfdc_D_ReadSect(fdoExt,buf,Redundant))
-		if (Ssfdc_D_ReadBlock(us,count,buf,Redundant))
-		{ ErrCode = ERR_HwError; MediaChange=ERROR; return(ERROR); }
-		if (Check_D_DataStatus(Redundant))
-		{ ErrCode = ERR_DataStatus; return(ERROR); }
+		if (Ssfdc_D_ReadBlock(us, count, buf, Redundant)) {
+			ErrCode = ERR_HwError;
+			MediaChange = ERROR;
+			return ERROR;
+		}
+		if (Check_D_DataStatus(Redundant)) {
+			ErrCode = ERR_DataStatus;
+			return ERROR;
+		}
 
 		if (!Check_D_ReadError(Redundant))
-			return(SMSUCCESS);
+			return SMSUCCESS;
 
-		if (!Check_D_Correct(buf,Redundant))
-		{ ErrCode = ERR_CorReadErr; return(ERROR); }
+		if (!Check_D_Correct(buf, Redundant)) {
+			ErrCode = ERR_CorReadErr;
+			return ERROR;
+		}
 	}
 
 	ErrCode = ERR_EccReadErr;
-	return(ERROR);
+	return ERROR;
 }
-/*
-//----- Write_D_PhyOneSect() -------------------------------------------
-int Write_D_PhyOneSect(PFDO_DEVICE_EXTENSION fdoExt, WORD count, BYTE *buf)
-{
-    SSFDCTYPE_T aa = (SSFDCTYPE_T ) &Ssfdc;
-    ADDRESS_T   bb = (ADDRESS_T) &Media;
 
-    //if (Ssfdc_D_WriteSect(fdoExt,buf,Redundant))
-    if (Ssfdc_D_WriteBlock(fdoExt,count,buf,Redundant))
-    { ErrCode = ERR_HwError; MediaChange=ERROR; return(ERROR); }
-    if (Ssfdc_D_CheckStatus())
-    { ErrCode = ERR_WriteFault; return(ERROR); }
-
-    return(SMSUCCESS);
-}
-*/
-//----- Erase_D_PhyOneBlock() ------------------------------------------
+/* ----- Erase_D_PhyOneBlock() ------------------------------------------ */
 int Erase_D_PhyOneBlock(struct us_data *us)
 {
-	//SSFDCTYPE_T aa = (SSFDCTYPE_T ) &Ssfdc;
-	//ADDRESS_T   bb = (ADDRESS_T) &Media;
+	if (Ssfdc_D_EraseBlock(us)) {
+		ErrCode = ERR_HwError;
+		MediaChange = ERROR;
+		return ERROR;
+	}
+	if (Ssfdc_D_CheckStatus()) {
+		ErrCode = ERR_WriteFault;
+		return ERROR;
+	}
 
-	if (Ssfdc_D_EraseBlock(us))
-	{ ErrCode = ERR_HwError; MediaChange=ERROR; return(ERROR); }
-	if (Ssfdc_D_CheckStatus())
-	{ ErrCode = ERR_WriteFault; return(ERROR); }
-
-	return(SMSUCCESS);
+	return SMSUCCESS;
 }
 
-//SmartMedia Physical Format Check Local Subroutine
-//----- Set_D_PhyFmtValue() --------------------------------------------
+/* SmartMedia Physical Format Check Local Subroutine */
+/* ----- Set_D_PhyFmtValue() -------------------------------------------- */
 int Set_D_PhyFmtValue(struct us_data *us)
 {
-//    PPDO_DEVICE_EXTENSION   pdoExt;
-//    BYTE      idcode[4];
-//    DWORD     UserDefData_1, UserDefData_2, Data, mask;
-//
-//    //if (!fdoExt->ChildDeviceObject)       return(ERROR);
-//    //pdoExt = fdoExt->ChildDeviceObject->DeviceExtension;
-//
-//    Ssfdc_D_ReadID(idcode, READ_ID_1);
-//
-    //if (Set_D_SsfdcModel(idcode[1]))
-    if (Set_D_SsfdcModel(us->SM_DeviceID))
-        return(ERROR);
+	if (Set_D_SsfdcModel(us->SM_DeviceID))
+		return ERROR;
 
-//    //Use Multi-function pin to differentiate SM and xD.
-//    UserDefData_1 = ReadPCIReg(fdoExt->BusID, fdoExt->DevID, fdoExt->FuncID, PCI_REG_USER_DEF) & 0x80;
-//    if (UserDefData_1)
-//    {
-//       if ( READ_PORT_BYTE(SM_REG_INT_STATUS) & 0x80 )      fdoExt->DiskType = DISKTYPE_XD;
-//       if ( READ_PORT_BYTE(SM_REG_INT_STATUS) & 0x40 )      fdoExt->DiskType = DISKTYPE_SM;
-//
-//       if ( IsXDCompliance && (fdoExt->DiskType == DISKTYPE_XD) )
-//       {
-//          Ssfdc_D_ReadID(idcode, READ_ID_3);
-//          if (idcode[2] != 0xB5)
-//             return(ERROR);
-//       }
-//    }
-//
-//    //Use GPIO to differentiate SM and xD.
-//    UserDefData_2 = ReadPCIReg(fdoExt->BusID, fdoExt->DevID, fdoExt->FuncID, PCI_REG_USER_DEF) >> 8;
-//    if ( UserDefData_2 )
-//    {
-//       Data = ReadPCIReg(fdoExt->BusID, fdoExt->DevID, 0, 0xAC);
-//
-//       mask = 1 << (UserDefData_2-1);
-//       // 1 : xD , 0 : SM
-//       if ( Data & mask)
-//          fdoExt->DiskType = DISKTYPE_XD;
-//       else
-//          fdoExt->DiskType = DISKTYPE_SM;
-//
-//       if ( IsXDCompliance && (fdoExt->DiskType == DISKTYPE_XD) )
-//       {
-//          Ssfdc_D_ReadID(idcode, READ_ID_3);
-//          if (idcode[2] != 0xB5)
-//             return(ERROR);
-//       }
-//    }
-//
-//    if ( !(UserDefData_1 | UserDefData_2) )
-//    {
-//      // Use UserDefine Register to differentiate SM and xD.
-//      Ssfdc_D_ReadID(idcode, READ_ID_3);
-//
-//      if (idcode[2] == 0xB5)
-//         fdoExt->DiskType = DISKTYPE_XD;
-//      else
-//      {
-//          if (!IsXDCompliance)
-//             fdoExt->DiskType = DISKTYPE_SM;
-//          else
-//             return(ERROR);
-//      }
-//
-//      if (fdoExt->UserDef_DiskType == 0x04)  fdoExt->DiskType = DISKTYPE_XD;
-//      if (fdoExt->UserDef_DiskType == 0x08)  fdoExt->DiskType = DISKTYPE_SM;
-//    }
-//
-//    if (!fdoExt->UserDef_DisableWP)
-//    {
-//       if (fdoExt->DiskType == DISKTYPE_SM)
-//       {
-//           if (Check_D_SsfdcWP())
-//              Ssfdc.Attribute|=WP;
-//       }
-//    }
-
-    return(SMSUCCESS);
+	return SMSUCCESS;
 }
 
-//----- Search_D_CIS() -------------------------------------------------
+/* ----- Search_D_CIS() ------------------------------------------------- */
 int Search_D_CIS(struct us_data *us)
 {
-	//SSFDCTYPE_T aa = (SSFDCTYPE_T ) &Ssfdc;
-	//ADDRESS_T   bb = (ADDRESS_T) &Media;
+	Media.Zone = 0;
+	Media.Sector = 0;
 
-	Media.Zone=0; Media.Sector=0;
-
-	for (Media.PhyBlock=0; Media.PhyBlock<(Ssfdc.MaxBlocks-Ssfdc.MaxLogBlocks-1); Media.PhyBlock++)
-	{
-		if (Ssfdc_D_ReadRedtData(us, Redundant))
-		{
+	for (Media.PhyBlock = 0;
+		Media.PhyBlock < (Ssfdc.MaxBlocks - Ssfdc.MaxLogBlocks - 1);
+		Media.PhyBlock++) {
+		if (Ssfdc_D_ReadRedtData(us, Redundant)) {
 			Ssfdc_D_Reset(us);
-			return(ERROR);
+			return ERROR;
 		}
 
 		if (!Check_D_FailBlock(Redundant))
 			break;
 	}
 
-	if (Media.PhyBlock==(Ssfdc.MaxBlocks-Ssfdc.MaxLogBlocks-1))
-	{
+	if (Media.PhyBlock == (Ssfdc.MaxBlocks - Ssfdc.MaxLogBlocks - 1)) {
 		Ssfdc_D_Reset(us);
-		return(ERROR);
+		return ERROR;
 	}
 
-	while (Media.Sector<CIS_SEARCH_SECT)
-	{
-		if (Media.Sector)
-		{
-			if (Ssfdc_D_ReadRedtData(us, Redundant))
-			{
+	while (Media.Sector < CIS_SEARCH_SECT) {
+		if (Media.Sector) {
+			if (Ssfdc_D_ReadRedtData(us, Redundant)) {
 				Ssfdc_D_Reset(us);
-				return(ERROR);
+				return ERROR;
 			}
 		}
-		if (!Check_D_DataStatus(Redundant))
-		{
-			if (Ssfdc_D_ReadSect(us,WorkBuf,Redundant))
-			{
+		if (!Check_D_DataStatus(Redundant)) {
+			if (Ssfdc_D_ReadSect(us, WorkBuf, Redundant)) {
 				Ssfdc_D_Reset(us);
-				return(ERROR);
+				return ERROR;
 			}
 
-			if (Check_D_CISdata(WorkBuf,Redundant))
-			{
+			if (Check_D_CISdata(WorkBuf, Redundant)) {
 				Ssfdc_D_Reset(us);
-				return(ERROR);
+				return ERROR;
 			}
 
-			CisArea.PhyBlock=Media.PhyBlock;
-			CisArea.Sector=Media.Sector;
+			CisArea.PhyBlock = Media.PhyBlock;
+			CisArea.Sector = Media.Sector;
 			Ssfdc_D_Reset(us);
-			return(SMSUCCESS);
+			return SMSUCCESS;
 		}
 
 		Media.Sector++;
 	}
 
 	Ssfdc_D_Reset(us);
-	return(ERROR);
+	return ERROR;
 }
 
-//----- Make_D_LogTable() ----------------------------------------------
+/* ----- Make_D_LogTable() ---------------------------------------------- */
 int Make_D_LogTable(struct us_data *us)
 {
-	WORD  phyblock,logblock;
-	//SSFDCTYPE_T aa = (SSFDCTYPE_T ) &Ssfdc;
-	//ADDRESS_T   bb = (ADDRESS_T) &Media;
+	WORD  phyblock, logblock;
 
-	if (Log2Phy[Media.Zone]==NULL)
-	{
-		Log2Phy[Media.Zone] = kmalloc(MAX_LOGBLOCK*sizeof(WORD), GFP_KERNEL);
+	if (Log2Phy[Media.Zone] == NULL) {
+		Log2Phy[Media.Zone] = kmalloc(MAX_LOGBLOCK * sizeof(WORD),
+								GFP_KERNEL);
 		/* pr_info("ExAllocatePool Zone = %x, Addr = %x\n",
 				Media.Zone, Log2Phy[Media.Zone]); */
-		if (Log2Phy[Media.Zone]==NULL)
-			return(ERROR);
+		if (Log2Phy[Media.Zone] == NULL)
+			return ERROR;
 	}
 
-	Media.Sector=0;
+	Media.Sector = 0;
 
-	//for(Media.Zone=0; Media.Zone<MAX_ZONENUM; Media.Zone++)
-	//for(Media.Zone=0; Media.Zone<Ssfdc.MaxZones; Media.Zone++)
-	{
-		/* pr_info("Make_D_LogTable --- MediaZone = 0x%x\n",
-							Media.Zone); */
-		for(Media.LogBlock=0; Media.LogBlock<Ssfdc.MaxLogBlocks; Media.LogBlock++)
-			Log2Phy[Media.Zone][Media.LogBlock]=NO_ASSIGN;
+	/* pr_info("Make_D_LogTable --- MediaZone = 0x%x\n",
+						Media.Zone); */
+	for (Media.LogBlock = 0; Media.LogBlock < Ssfdc.MaxLogBlocks;
+						Media.LogBlock++)
+		Log2Phy[Media.Zone][Media.LogBlock] = NO_ASSIGN;
 
-		for(Media.PhyBlock=0; Media.PhyBlock<(MAX_BLOCKNUM/8); Media.PhyBlock++)
-			Assign[Media.Zone][Media.PhyBlock]=0x00;
+	for (Media.PhyBlock = 0; Media.PhyBlock < (MAX_BLOCKNUM / 8);
+						Media.PhyBlock++)
+		Assign[Media.Zone][Media.PhyBlock] = 0x00;
 
-		for(Media.PhyBlock=0; Media.PhyBlock<Ssfdc.MaxBlocks; Media.PhyBlock++)
-		{
-			if ((!Media.Zone) && (Media.PhyBlock<=CisArea.PhyBlock))
-			{
-				Set_D_Bit(Assign[Media.Zone],Media.PhyBlock);
-				continue;
+	for (Media.PhyBlock = 0; Media.PhyBlock < Ssfdc.MaxBlocks;
+						Media.PhyBlock++) {
+		if ((!Media.Zone) && (Media.PhyBlock <= CisArea.PhyBlock)) {
+			Set_D_Bit(Assign[Media.Zone], Media.PhyBlock);
+			continue;
+		}
+
+		if (Ssfdc_D_ReadRedtData(us, Redundant)) {
+			Ssfdc_D_Reset(us);
+			return ERROR;
+		}
+
+		if (!Check_D_DataBlank(Redundant))
+			continue;
+
+		Set_D_Bit(Assign[Media.Zone], Media.PhyBlock);
+
+		if (Check_D_FailBlock(Redundant))
+			continue;
+
+		if (Load_D_LogBlockAddr(Redundant))
+			continue;
+
+		if (Media.LogBlock >= Ssfdc.MaxLogBlocks)
+			continue;
+
+		if (Log2Phy[Media.Zone][Media.LogBlock] == NO_ASSIGN) {
+			Log2Phy[Media.Zone][Media.LogBlock] = Media.PhyBlock;
+			continue;
+		}
+
+		phyblock     = Media.PhyBlock;
+		logblock     = Media.LogBlock;
+		Media.Sector = (BYTE)(Ssfdc.MaxSectors - 1);
+
+		if (Ssfdc_D_ReadRedtData(us, Redundant)) {
+			Ssfdc_D_Reset(us);
+			return ERROR;
+		}
+
+		if (!Load_D_LogBlockAddr(Redundant) &&
+				(Media.LogBlock == logblock)) {
+			Media.PhyBlock = Log2Phy[Media.Zone][logblock];
+
+			if (Ssfdc_D_ReadRedtData(us, Redundant)) {
+				Ssfdc_D_Reset(us);
+				return ERROR;
 			}
 
-			if (Ssfdc_D_ReadRedtData(us, Redundant))
-			{ Ssfdc_D_Reset(us); return(ERROR); }
+			Media.PhyBlock = phyblock;
 
-			if (!Check_D_DataBlank(Redundant))
-				continue;
-
-			Set_D_Bit(Assign[Media.Zone],Media.PhyBlock);
-
-			if (Check_D_FailBlock(Redundant))
-				continue;
-
-			//if (Check_D_DataStatus(Redundant))
-			//    continue;
-
-			if (Load_D_LogBlockAddr(Redundant))
-				continue;
-
-			if (Media.LogBlock>=Ssfdc.MaxLogBlocks)
-				continue;
-
-			if (Log2Phy[Media.Zone][Media.LogBlock]==NO_ASSIGN)
-			{
-				Log2Phy[Media.Zone][Media.LogBlock]=Media.PhyBlock;
-				continue;
-			}
-
-			phyblock     = Media.PhyBlock;
-			logblock     = Media.LogBlock;
-			Media.Sector = (BYTE)(Ssfdc.MaxSectors-1);
-
-			if (Ssfdc_D_ReadRedtData(us, Redundant))
-			{ Ssfdc_D_Reset(us); return(ERROR); }
-
-			if (!Load_D_LogBlockAddr(Redundant))
-			{
-				if (Media.LogBlock==logblock)
-				{
-					Media.PhyBlock=Log2Phy[Media.Zone][logblock];
-
-					if (Ssfdc_D_ReadRedtData(us, Redundant))
-					{ Ssfdc_D_Reset(us); return(ERROR); }
-
-					Media.PhyBlock=phyblock;
-
-					if (!Load_D_LogBlockAddr(Redundant))
-					{
-						if (Media.LogBlock!=logblock)
-						{
-							Media.PhyBlock=Log2Phy[Media.Zone][logblock];
-							Log2Phy[Media.Zone][logblock]=phyblock;
-						}
-					}
-					else
-					{
-						Media.PhyBlock=Log2Phy[Media.Zone][logblock];
-						Log2Phy[Media.Zone][logblock]=phyblock;
-					}
+			if (!Load_D_LogBlockAddr(Redundant)) {
+				if (Media.LogBlock != logblock) {
+					Media.PhyBlock =
+						Log2Phy[Media.Zone][logblock];
+					Log2Phy[Media.Zone][logblock] =
+								phyblock;
 				}
+			} else {
+				Media.PhyBlock = Log2Phy[Media.Zone][logblock];
+				Log2Phy[Media.Zone][logblock] = phyblock;
 			}
+		}
 
-			Media.Sector=0;
+		Media.Sector = 0;
+		Media.PhyBlock = phyblock;
 
-// here Not yet
-//#ifdef L2P_ERR_ERASE
-//			if (!(Ssfdc.Attribute &MWP))
-//			{
-//				Ssfdc_D_Reset(fdoExt);
-//				if (Ssfdc_D_EraseBlock(fdoExt))
-//					return(ERROR);
-//
-//				if (Ssfdc_D_CheckStatus())
-//				{
-//					if (MarkFail_D_PhyOneBlock())
-//						return(ERROR);
-//				}
-//				else
-//					Clr_D_Bit(Assign[Media.Zone],Media.PhyBlock);
-//			}
-//#else
-//			Ssfdc.Attribute|=MWP;
-//#endif
-			Media.PhyBlock=phyblock;
+	AssignStart[Media.Zone] = 0;
 
-		} // End for (Media.PhyBlock<Ssfdc.MaxBlocks)
-
-		AssignStart[Media.Zone]=0;
-
-	} // End for (Media.Zone<MAX_ZONENUM)
+	} /* End for (Media.Zone<MAX_ZONENUM) */
 
 	Ssfdc_D_Reset(us);
-	return(SMSUCCESS);
+	return SMSUCCESS;
 }
 
-//----- MarkFail_D_PhyOneBlock() ---------------------------------------
+/* ----- MarkFail_D_PhyOneBlock() --------------------------------------- */
 int MarkFail_D_PhyOneBlock(struct us_data *us)
 {
 	BYTE sect;
-	//SSFDCTYPE_T aa = (SSFDCTYPE_T ) &Ssfdc;
-	//ADDRESS_T   bb = (ADDRESS_T) &Media;
 
-	sect=Media.Sector;
+	sect = Media.Sector;
 	Set_D_FailBlock(WorkRedund);
-	//Ssfdc_D_WriteRedtMode();
 
-	for(Media.Sector=0; Media.Sector<Ssfdc.MaxSectors; Media.Sector++)
-	{
-		if (Ssfdc_D_WriteRedtData(us, WorkRedund))
-		{
+	for (Media.Sector = 0; Media.Sector < Ssfdc.MaxSectors;
+							Media.Sector++) {
+		if (Ssfdc_D_WriteRedtData(us, WorkRedund)) {
 			Ssfdc_D_Reset(us);
 			Media.Sector   = sect;
 			ErrCode        = ERR_HwError;
 			MediaChange = ERROR;
-			return(ERROR);
-		} // NO Status Check
+			return ERROR;
+		} /* NO Status Check */
 	}
 
 	Ssfdc_D_Reset(us);
-	Media.Sector=sect;
-	return(SMSUCCESS);
+	Media.Sector = sect;
+	return SMSUCCESS;
 }
-/*
-//
-////----- SM_Init() ----------------------------------------------------
-//void SM_Init(void)
-//{
-//    _Hw_D_ClrIntCardChg();
-//    _Hw_D_SetIntMask();
-//    // For DMA Interrupt
-//    _Hw_D_ClrDMAIntCardChg();
-//    _Hw_D_SetDMAIntMask();
-//}
-//
-////----- Media_D_EraseAllRedtData() -----------------------------------
-//int Media_D_EraseAllRedtData(DWORD Index, BOOLEAN CheckBlock)
-//{
-//    BYTE    i;
-//
-//    if (Check_D_MediaPower())
-//        return(ErrCode);
-//
-//    if (Check_D_MediaWP())
-//        return(ErrCode);
-//
-//    for (i=0; i<REDTSIZE; i++)
-//        WorkRedund[i] = 0xFF;
-//
-//    Media.Zone = (BYTE)Index;
-//    for (Media.PhyBlock=0; Media.PhyBlock<Ssfdc.MaxBlocks; Media.PhyBlock++)
-//    {
-//        if ((!Media.Zone) && (Media.PhyBlock<=CisArea.PhyBlock))
-//            continue;
-//
-//        if (Ssfdc_D_EraseBlock(fdoExt))
-//        {
-//            ErrCode = ERR_HwError;
-//            return(ERROR);
-//        }
-//
-//        for(Media.Sector=0; Media.Sector<Ssfdc.MaxSectors; Media.Sector++)
-//        {
-//            Ssfdc_D_WriteRedtMode();
-//
-//            if (Ssfdc_D_WriteRedtData(WorkRedund))
-//            {
-//                Ssfdc_D_Reset(fdoExt);
-//                ErrCode        = ERR_HwError;
-//                MediaChange    = ERROR;
-//                return(ERROR);
-//            } // NO Status Check
-//        }
-//
-//        Ssfdc_D_Reset(fdoExt);
-//    }
-//
-//    Ssfdc_D_Reset(fdoExt);
-//
-//    return(SMSUCCESS);
-//}
-//
-////----- Media_D_GetMediaInfo() ---------------------------------------
-//DWORD Media_D_GetMediaInfo(PFDO_DEVICE_EXTENSION fdoExt, PIOCTL_MEDIA_INFO_IN pParamIn, PIOCTL_MEDIA_INFO_OUT pParamOut)
-//{
-//    pParamOut->ErrCode = STATUS_CMD_FAIL;
-//
-//    Init_D_SmartMedia();
-//
-//    if (Check_D_MediaPower())
-//        return (ErrCode==ERR_NoSmartMedia) ? STATUS_CMD_NO_MEDIA : STATUS_CMD_FAIL;
-//
-//    if (Set_D_PhyFmtValue(fdoExt))
-//        return STATUS_CMD_FAIL;
-//
-//    //usleep(56*1024);
-//    if (Search_D_CIS(fdoExt))
-//        return STATUS_CMD_FAIL;
-//
-//    if (Check_D_MediaWP())
-//        return STATUS_CMD_MEDIA_WP;
-//
-//    pParamOut->PageSize  = Ssfdc.MaxSectors;
-//    pParamOut->BlockSize = Ssfdc.MaxBlocks;
-//    pParamOut->ZoneSize  = Ssfdc.MaxZones;
-//
-//    return STATUS_CMD_SUCCESS;
-//}*/

diff --git a/drivers/staging/keucr/smilsub.c b/drivers/staging/keucr/smilsub.c
index d4dd5ed..346c570 100644
--- a/drivers/staging/keucr/smilsub.c
+++ b/drivers/staging/keucr/smilsub.c

@@ -33,9 +33,9 @@
 void   _Calc_D_ECCdata(BYTE *);
 
 
-struct SSFDCTYPE                Ssfdc;
-struct ADDRESS                  Media;
-struct CIS_AREA                 CisArea;
+struct keucr_media_info         Ssfdc;
+struct keucr_media_address      Media;
+struct keucr_media_area         CisArea;
 
 static BYTE                            EccBuf[6];
 extern PBYTE                    SMHostAddr;
@@ -103,8 +103,10 @@
 {
 	WORD addr1, addr2;
 
-	addr1 = (WORD)*(redundant + REDT_ADDR1H)*0x0100 + (WORD)*(redundant + REDT_ADDR1L);
-	addr2 = (WORD)*(redundant + REDT_ADDR2H)*0x0100 + (WORD)*(redundant + REDT_ADDR2L);
+	addr1 = (WORD)*(redundant + REDT_ADDR1H)*0x0100 +
+					(WORD)*(redundant + REDT_ADDR1L);
+	addr2 = (WORD)*(redundant + REDT_ADDR2H)*0x0100 +
+					(WORD)*(redundant + REDT_ADDR2L);
 
 	if (addr1 == addr2)
 		if ((addr1 & 0xF000) == 0x1000) {
@@ -151,7 +153,8 @@
 	if ((hweight16(addr) % 2))
 		addr++;
 
-	*(redundant + REDT_ADDR1H) = *(redundant + REDT_ADDR2H) = (BYTE)(addr / 0x0100);
+	*(redundant + REDT_ADDR1H) = *(redundant + REDT_ADDR2H) =
+							(BYTE)(addr / 0x0100);
 	*(redundant + REDT_ADDR1L) = *(redundant + REDT_ADDR2L) = (BYTE)addr;
 }
 
@@ -191,7 +194,9 @@
 	Media.Sector = CisArea.Sector;
 
 	if (Ssfdc_D_ReadSect(us, buf, redundant)) {
-		Media.Zone = zone; Media.PhyBlock = block; Media.Sector = sector;
+		Media.Zone = zone;
+		Media.PhyBlock = block;
+		Media.Sector = sector;
 		return ERROR;
 	}
 
@@ -209,7 +214,8 @@
 
 	result = ENE_LoadBinCode(us, SM_RW_PATTERN);
 	if (result != USB_STOR_XFER_GOOD) {
-		printk("Load SM RW Code Fail !!\n");
+		dev_err(&us->pusb_dev->dev,
+			"Failed to load SmartMedia read/write code\n");
 		return USB_STOR_TRANSPORT_ERROR;
 	}
 
@@ -252,7 +258,8 @@
 }
 
 /* ----- Ssfdc_D_ReadBlock() --------------------------------------------- */
-int Ssfdc_D_ReadBlock(struct us_data *us, WORD count, BYTE *buf, BYTE *redundant)
+int Ssfdc_D_ReadBlock(struct us_data *us, WORD count, BYTE *buf,
+							BYTE *redundant)
 {
 	struct bulk_cb_wrap *bcb = (struct bulk_cb_wrap *) us->iobuf;
 	int	result;
@@ -260,7 +267,8 @@
 
 	result = ENE_LoadBinCode(us, SM_RW_PATTERN);
 	if (result != USB_STOR_XFER_GOOD) {
-		printk("Load SM RW Code Fail !!\n");
+		dev_err(&us->pusb_dev->dev,
+			"Failed to load SmartMedia read/write code\n");
 		return USB_STOR_TRANSPORT_ERROR;
 	}
 
@@ -304,7 +312,8 @@
 
 
 /* ----- Ssfdc_D_CopyBlock() -------------------------------------------- */
-int Ssfdc_D_CopyBlock(struct us_data *us, WORD count, BYTE *buf, BYTE *redundant)
+int Ssfdc_D_CopyBlock(struct us_data *us, WORD count, BYTE *buf,
+							BYTE *redundant)
 {
 	struct bulk_cb_wrap *bcb = (struct bulk_cb_wrap *) us->iobuf;
 	int	result;
@@ -312,7 +321,8 @@
 
 	result = ENE_LoadBinCode(us, SM_RW_PATTERN);
 	if (result != USB_STOR_XFER_GOOD) {
-		printk("Load SM RW Code Fail !!\n");
+		dev_err(&us->pusb_dev->dev,
+			"Failed to load SmartMedia read/write code\n");
 		return USB_STOR_TRANSPORT_ERROR;
 	}
 
@@ -358,7 +368,8 @@
 
 	result = ENE_LoadBinCode(us, SM_RW_PATTERN);
 	if (result != USB_STOR_XFER_GOOD) {
-		printk("Load SM RW Code Fail !!\n");
+		dev_err(&us->pusb_dev->dev,
+			"Failed to load SmartMedia read/write code\n");
 		return USB_STOR_TRANSPORT_ERROR;
 	}
 
@@ -396,7 +407,8 @@
 
 	result = ENE_LoadBinCode(us, SM_RW_PATTERN);
 	if (result != USB_STOR_XFER_GOOD) {
-		printk("Load SM RW Code Fail !!\n");
+		dev_err(&us->pusb_dev->dev,
+			"Failed to load SmartMedia read/write code\n");
 		return USB_STOR_TRANSPORT_ERROR;
 	}
 
@@ -431,7 +443,8 @@
 
 	result = ENE_LoadBinCode(us, SM_RW_PATTERN);
 	if (result != USB_STOR_XFER_GOOD) {
-		printk("Load SM RW Code Fail !!\n");
+		dev_err(&us->pusb_dev->dev,
+			"Failed to load SmartMedia read/write code\n");
 		return USB_STOR_TRANSPORT_ERROR;
 	}
 
@@ -470,7 +483,8 @@
 
 	result = ENE_LoadBinCode(us, SM_RW_PATTERN);
 	if (result != USB_STOR_XFER_GOOD) {
-		printk("Load SM RW Code Fail !!\n");
+		dev_err(&us->pusb_dev->dev,
+			"Failed to load SmartMedia read/write code\n");
 		return USB_STOR_TRANSPORT_ERROR;
 	}
 
@@ -611,7 +625,7 @@
 		return ERROR;
 	}
 
-    return SMSUCCESS;
+	return SMSUCCESS;
 }
 
 /* ----- _Check_D_DevCode() --------------------------------------------- */
@@ -686,8 +700,8 @@
 /* ----- Set_D_RightECC() ---------------------------------------------- */
 void Set_D_RightECC(BYTE *redundant)
 {
-    /* Driver ECC Check */
-    return;
+	/* Driver ECC Check */
+	return;
 }
 
 

diff --git a/drivers/staging/keucr/smscsi.c b/drivers/staging/keucr/smscsi.c
index 58b5555..572d648 100644
--- a/drivers/staging/keucr/smscsi.c
+++ b/drivers/staging/keucr/smscsi.c

@@ -56,7 +56,7 @@
 	return result;
 }
 
-/* ----- SM_SCSI_Test_Unit_Ready() -------------------------------------------------- */
+/* ----- SM_SCSI_Test_Unit_Ready() ------------------------------------- */
 int SM_SCSI_Test_Unit_Ready(struct us_data *us, struct scsi_cmnd *srb)
 {
 	if (us->SM_Status.Insert && us->SM_Status.Ready)
@@ -69,21 +69,27 @@
 	return USB_STOR_TRANSPORT_GOOD;
 }
 
-/* ----- SM_SCSI_Inquiry() -------------------------------------------------- */
+/* ----- SM_SCSI_Inquiry() --------------------------------------------- */
 int SM_SCSI_Inquiry(struct us_data *us, struct scsi_cmnd *srb)
 {
-	BYTE data_ptr[36] = {0x00, 0x80, 0x02, 0x00, 0x1F, 0x00, 0x00, 0x00, 0x55, 0x53, 0x42, 0x32, 0x2E, 0x30, 0x20, 0x20, 0x43, 0x61, 0x72, 0x64, 0x52, 0x65, 0x61, 0x64, 0x65, 0x72, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x30, 0x31, 0x30, 0x30};
+	BYTE data_ptr[36] = {0x00, 0x80, 0x02, 0x00, 0x1F, 0x00, 0x00, 0x00,
+				 0x55, 0x53, 0x42, 0x32, 0x2E, 0x30, 0x20,
+				 0x20, 0x43, 0x61, 0x72, 0x64, 0x52, 0x65,
+				 0x61, 0x64, 0x65, 0x72, 0x20, 0x20, 0x20,
+				 0x20, 0x20, 0x20, 0x30, 0x31, 0x30, 0x30};
 
 	usb_stor_set_xfer_buf(us, data_ptr, 36, srb, TO_XFER_BUF);
 	return USB_STOR_TRANSPORT_GOOD;
 }
 
 
-/* ----- SM_SCSI_Mode_Sense() -------------------------------------------------- */
+/* ----- SM_SCSI_Mode_Sense() ------------------------------------------ */
 int SM_SCSI_Mode_Sense(struct us_data *us, struct scsi_cmnd *srb)
 {
-	BYTE	mediaNoWP[12] = {0x0b, 0x00, 0x00, 0x08, 0x00, 0x00, 0x71, 0xc0, 0x00, 0x00, 0x02, 0x00};
-	BYTE	mediaWP[12]   = {0x0b, 0x00, 0x80, 0x08, 0x00, 0x00, 0x71, 0xc0, 0x00, 0x00, 0x02, 0x00};
+	BYTE	mediaNoWP[12] = {0x0b, 0x00, 0x00, 0x08, 0x00, 0x00,
+				0x71, 0xc0, 0x00, 0x00, 0x02, 0x00};
+	BYTE	mediaWP[12]   = {0x0b, 0x00, 0x80, 0x08, 0x00, 0x00,
+				0x71, 0xc0, 0x00, 0x00, 0x02, 0x00};
 
 	if (us->SM_Status.WtP)
 		usb_stor_set_xfer_buf(us, mediaWP, 12, srb, TO_XFER_BUF);
@@ -94,7 +100,7 @@
 	return USB_STOR_TRANSPORT_GOOD;
 }
 
-/* ----- SM_SCSI_Read_Capacity() -------------------------------------------------- */
+/* ----- SM_SCSI_Read_Capacity() --------------------------------------- */
 int SM_SCSI_Read_Capacity(struct us_data *us, struct scsi_cmnd *srb)
 {
 	unsigned int offset = 0;
@@ -103,14 +109,14 @@
 	WORD    bl_len;
 	BYTE    buf[8];
 
-	printk("SM_SCSI_Read_Capacity\n");
+	dev_dbg(&us->pusb_dev->dev, "SM_SCSI_Read_Capacity\n");
 
 	bl_len = 0x200;
 	bl_num = Ssfdc.MaxLogBlocks * Ssfdc.MaxSectors * Ssfdc.MaxZones - 1;
 
 	us->bl_num = bl_num;
-	printk("bl_len = %x\n", bl_len);
-	printk("bl_num = %x\n", bl_num);
+	dev_dbg(&us->pusb_dev->dev, "bl_len = %x\n", bl_len);
+	dev_dbg(&us->pusb_dev->dev, "bl_num = %x\n", bl_num);
 
 	buf[0] = (bl_num >> 24) & 0xff;
 	buf[1] = (bl_num >> 16) & 0xff;
@@ -131,8 +137,10 @@
 {
 	int result = 0;
 	PBYTE	Cdb = srb->cmnd;
-	DWORD bn  =  ((Cdb[2] << 24) & 0xff000000) | ((Cdb[3] << 16) & 0x00ff0000) |
-		((Cdb[4] << 8) & 0x0000ff00) | ((Cdb[5] << 0) & 0x000000ff);
+	DWORD bn  =  ((Cdb[2] << 24) & 0xff000000) |
+			((Cdb[3] << 16) & 0x00ff0000) |
+			((Cdb[4] << 8) & 0x0000ff00) |
+			((Cdb[5] << 0) & 0x000000ff);
 	WORD  blen = ((Cdb[7] << 8) & 0xff00)     | ((Cdb[8] << 0) & 0x00ff);
 	DWORD	blenByte = blen * 0x200;
 	void	*buf;
@@ -161,8 +169,10 @@
 {
 	int result = 0;
 	PBYTE	Cdb = srb->cmnd;
-	DWORD bn  =  ((Cdb[2] << 24) & 0xff000000) | ((Cdb[3] << 16) & 0x00ff0000) |
-		((Cdb[4] << 8) & 0x0000ff00) | ((Cdb[5] << 0) & 0x000000ff);
+	DWORD bn  =  ((Cdb[2] << 24) & 0xff000000) |
+			((Cdb[3] << 16) & 0x00ff0000) |
+			((Cdb[4] << 8) & 0x0000ff00) |
+			((Cdb[5] << 0) & 0x000000ff);
 	WORD  blen = ((Cdb[7] << 8) & 0xff00)     | ((Cdb[8] << 0) & 0x00ff);
 	DWORD	blenByte = blen * 0x200;
 	void	*buf;

diff --git a/drivers/staging/keucr/transport.c b/drivers/staging/keucr/transport.c
index 1a8837d..aeb2186 100644
--- a/drivers/staging/keucr/transport.c
+++ b/drivers/staging/keucr/transport.c

@@ -79,6 +79,47 @@
 }
 
 /*
+ * usb_stor_print_cmd():
+ */
+static void usb_stor_print_cmd(struct us_data *us, struct scsi_cmnd *srb)
+{
+	PBYTE   Cdb = srb->cmnd;
+	DWORD   cmd = Cdb[0];
+
+	switch (cmd) {
+	case TEST_UNIT_READY:
+		break;
+	case INQUIRY:
+		dev_dbg(&us->pusb_dev->dev,
+				"scsi cmd %X --- SCSIOP_INQUIRY\n", cmd);
+		break;
+	case MODE_SENSE:
+		dev_dbg(&us->pusb_dev->dev,
+				"scsi cmd %X --- SCSIOP_MODE_SENSE\n", cmd);
+		break;
+	case START_STOP:
+		dev_dbg(&us->pusb_dev->dev,
+				"scsi cmd %X --- SCSIOP_START_STOP\n", cmd);
+		break;
+	case READ_CAPACITY:
+		dev_dbg(&us->pusb_dev->dev,
+				"scsi cmd %X --- SCSIOP_READ_CAPACITY\n", cmd);
+		break;
+	case READ_10:
+		break;
+	case WRITE_10:
+		break;
+	case ALLOW_MEDIUM_REMOVAL:
+		dev_dbg(&us->pusb_dev->dev,
+			"scsi cmd %X --- SCSIOP_ALLOW_MEDIUM_REMOVAL\n", cmd);
+		break;
+	default:
+		dev_dbg(&us->pusb_dev->dev, "scsi cmd %X --- Other cmd\n", cmd);
+		break;
+	}
+}
+
+/*
  * usb_stor_control_msg()
  */
 int usb_stor_control_msg(struct us_data *us, unsigned int pipe,
@@ -303,7 +344,7 @@
 	int result;
 
 	/* pr_info("transport --- usb_stor_invoke_transport\n"); */
-	usb_stor_print_cmd(srb);
+	usb_stor_print_cmd(us, srb);
 	/* send the command to the transport layer */
 	scsi_set_resid(srb, 0);
 	result = us->transport(srb, us); /* usb_stor_Bulk_transport; */
@@ -429,7 +470,7 @@
 	int result = 0;
 
 	/* pr_info("transport --- ENE_stor_invoke_transport\n"); */
-	usb_stor_print_cmd(srb);
+	usb_stor_print_cmd(us, srb);
 	/* send the command to the transport layer */
 	scsi_set_resid(srb, 0);
 	if (!(us->SM_Status.Ready))
@@ -708,8 +749,8 @@
 
 		} else {
 			residue = min(residue, transfer_length);
-			scsi_set_resid(srb, max(scsi_get_resid(srb),
-							(int) residue));
+			scsi_set_resid(srb, max_t(int, scsi_get_resid(srb),
+							residue));
 		}
 	}
 

diff --git a/drivers/staging/keucr/transport.h b/drivers/staging/keucr/transport.h
index 2a11a98..df34474 100644
--- a/drivers/staging/keucr/transport.h
+++ b/drivers/staging/keucr/transport.h

@@ -29,7 +29,6 @@
 extern int usb_stor_Bulk_transport(struct scsi_cmnd *, struct us_data*);
 extern int usb_stor_Bulk_max_lun(struct us_data *);
 extern int usb_stor_Bulk_reset(struct us_data *);
-extern void usb_stor_print_cmd(struct scsi_cmnd *);
 extern void usb_stor_invoke_transport(struct scsi_cmnd *, struct us_data*);
 extern void usb_stor_stop_transport(struct us_data *);
 extern int usb_stor_control_msg(struct us_data *us, unsigned int pipe,
@@ -61,7 +60,7 @@
 extern int ENE_SMInit(struct us_data *);
 extern int ENE_SendScsiCmd(struct us_data*, BYTE, void*, int);
 extern int ENE_LoadBinCode(struct us_data*, BYTE);
-extern int ENE_Read_BYTE(struct us_data*, WORD index, void *buf);
+extern int ene_read_byte(struct us_data*, WORD index, void *buf);
 extern int ENE_Read_Data(struct us_data*, void *buf, unsigned int length);
 extern int ENE_Write_Data(struct us_data*, void *buf, unsigned int length);
 extern void BuildSenseBuffer(struct scsi_cmnd *, int);

diff --git a/drivers/staging/keucr/usb.c b/drivers/staging/keucr/usb.c
index f656f8a..ddd2e73 100644
--- a/drivers/staging/keucr/usb.c
+++ b/drivers/staging/keucr/usb.c

@@ -24,13 +24,13 @@
 
 static unsigned int delay_use = 1;
 
-static struct usb_device_id eucr_usb_ids [] = {
+static struct usb_device_id eucr_usb_ids[] = {
 	{ USB_DEVICE(0x058f, 0x6366) },
 	{ USB_DEVICE(0x0cf2, 0x6230) },
 	{ USB_DEVICE(0x0cf2, 0x6250) },
 	{ }                                            /* Terminating entry */
 };
-MODULE_DEVICE_TABLE (usb, eucr_usb_ids);
+MODULE_DEVICE_TABLE(usb, eucr_usb_ids);
 
 
 #ifdef CONFIG_PM
@@ -65,7 +65,7 @@
 
 	us->Power_IsResum = true;
 
-	us->SM_Status = *(PSM_STATUS)&tmp;
+	us->SM_Status = *(struct keucr_sm_status *)&tmp;
 
 	return 0;
 }
@@ -85,9 +85,9 @@
 	 * the device
 	 */
 
- 	us->Power_IsResum = true;
+	us->Power_IsResum = true;
 
-	us->SM_Status = *(PSM_STATUS)&tmp;
+	us->SM_Status = *(struct keucr_sm_status *)&tmp;
 
 	return 0;
 }
@@ -124,16 +124,18 @@
 	return 0;
 }
 
-void fill_inquiry_response(struct us_data *us, unsigned char *data, unsigned int data_len)
+void fill_inquiry_response(struct us_data *us, unsigned char *data,
+							unsigned int data_len)
 {
 	pr_info("usb --- fill_inquiry_response\n");
 	if (data_len < 36) /* You lose. */
 		return;
 
 	if (data[0]&0x20) {
-		memset(data+8,0,28);
+		memset(data+8, 0, 28);
 	} else {
-		u16 bcdDevice = le16_to_cpu(us->pusb_dev->descriptor.bcdDevice);
+		u16 bcdDevice =
+			le16_to_cpu(us->pusb_dev->descriptor.bcdDevice);
 		memcpy(data+8, us->unusual_dev->vendorName,
 			strlen(us->unusual_dev->vendorName) > 8 ? 8 :
 			strlen(us->unusual_dev->vendorName));
@@ -148,7 +150,7 @@
 	usb_stor_set_xfer_buf(us, data, data_len, us->srb, TO_XFER_BUF);
 }
 
-static int usb_stor_control_thread(void * __us)
+static int usb_stor_control_thread(void *__us)
 {
 	struct us_data *us = (struct us_data *)__us;
 	struct Scsi_Host *host = us_to_host(us);
@@ -194,7 +196,8 @@
 			us->srb->result = DID_BAD_TARGET << 16;
 		} else if ((us->srb->cmnd[0] == INQUIRY)
 			   && (us->fflags & US_FL_FIX_INQUIRY)) {
-			unsigned char data_ptr[36] = {0x00, 0x80, 0x02, 0x02, 0x1F, 0x00, 0x00, 0x00};
+			unsigned char data_ptr[36] = {0x00, 0x80, 0x02, 0x02,
+						0x1F, 0x00, 0x00, 0x00};
 
 			fill_inquiry_response(us, data_ptr, 36);
 			us->srb->result = SAM_STAT_GOOD;
@@ -253,13 +256,15 @@
 	usb_set_intfdata(intf, us);
 
 	/* Allocate the device-related DMA-mapped buffers */
-	us->cr = usb_alloc_coherent(us->pusb_dev, sizeof(*us->cr), GFP_KERNEL, &us->cr_dma);
+	us->cr = usb_alloc_coherent(us->pusb_dev, sizeof(*us->cr), GFP_KERNEL,
+							&us->cr_dma);
 	if (!us->cr) {
 		pr_info("usb_ctrlrequest allocation failed\n");
 		return -ENOMEM;
 	}
 
-	us->iobuf = usb_alloc_coherent(us->pusb_dev, US_IOBUF_SIZE, GFP_KERNEL, &us->iobuf_dma);
+	us->iobuf = usb_alloc_coherent(us->pusb_dev, US_IOBUF_SIZE, GFP_KERNEL,
+							&us->iobuf_dma);
 	if (!us->iobuf) {
 		pr_info("I/O buffer allocation failed\n");
 		return -ENOMEM;
@@ -275,7 +280,8 @@
 static int get_device_info(struct us_data *us, const struct usb_device_id *id)
 {
 	struct usb_device *dev = us->pusb_dev;
-	struct usb_interface_descriptor *idesc = &us->pusb_intf->cur_altsetting->desc;
+	struct usb_interface_descriptor *idesc =
+					&us->pusb_intf->cur_altsetting->desc;
 
 	pr_info("usb --- get_device_info\n");
 
@@ -374,10 +380,13 @@
 	/* Calculate and store the pipe values */
 	us->send_ctrl_pipe = usb_sndctrlpipe(us->pusb_dev, 0);
 	us->recv_ctrl_pipe = usb_rcvctrlpipe(us->pusb_dev, 0);
-	us->send_bulk_pipe = usb_sndbulkpipe(us->pusb_dev, ep_out->bEndpointAddress & USB_ENDPOINT_NUMBER_MASK);
-	us->recv_bulk_pipe = usb_rcvbulkpipe(us->pusb_dev, ep_in->bEndpointAddress & USB_ENDPOINT_NUMBER_MASK);
+	us->send_bulk_pipe = usb_sndbulkpipe(us->pusb_dev,
+			ep_out->bEndpointAddress & USB_ENDPOINT_NUMBER_MASK);
+	us->recv_bulk_pipe = usb_rcvbulkpipe(us->pusb_dev,
+			ep_in->bEndpointAddress & USB_ENDPOINT_NUMBER_MASK);
 	if (ep_int) {
-		us->recv_intr_pipe = usb_rcvintpipe(us->pusb_dev, ep_int->bEndpointAddress & USB_ENDPOINT_NUMBER_MASK);
+		us->recv_intr_pipe = usb_rcvintpipe(us->pusb_dev,
+			ep_int->bEndpointAddress & USB_ENDPOINT_NUMBER_MASK);
 		us->ep_bInterval = ep_int->bInterval;
 	}
 	return 0;
@@ -433,10 +442,9 @@
 	kfree(us->sensebuf);
 
 	/* Free the device-related DMA-mapped buffers */
-	if (us->cr)
-		usb_free_coherent(us->pusb_dev, sizeof(*us->cr), us->cr, us->cr_dma);
-	if (us->iobuf)
-		usb_free_coherent(us->pusb_dev, US_IOBUF_SIZE, us->iobuf, us->iobuf_dma);
+	usb_free_coherent(us->pusb_dev, sizeof(*us->cr), us->cr, us->cr_dma);
+	usb_free_coherent(us->pusb_dev, US_IOBUF_SIZE, us->iobuf,
+			  us->iobuf_dma);
 
 	/* Remove our private data from the interface */
 	usb_set_intfdata(us->pusb_intf, NULL);
@@ -485,7 +493,7 @@
 	scsi_host_put(us_to_host(us));
 }
 
-static int usb_stor_scan_thread(void * __us)
+static int usb_stor_scan_thread(void *__us)
 {
 	struct us_data *us = (struct us_data *)__us;
 
@@ -515,7 +523,8 @@
 	complete_and_exit(&us->scanning_done, 0);
 }
 
-static int eucr_probe(struct usb_interface *intf, const struct usb_device_id *id)
+static int eucr_probe(struct usb_interface *intf,
+					const struct usb_device_id *id)
 {
 	struct Scsi_Host *host;
 	struct us_data *us;
@@ -525,7 +534,7 @@
 
 	pr_info("usb --- eucr_probe\n");
 
-      host = scsi_host_alloc(&usb_stor_host_template, sizeof(*us));
+	host = scsi_host_alloc(&usb_stor_host_template, sizeof(*us));
 	if (!host) {
 		pr_info("Unable to allocate the scsi host\n");
 		return -ENOMEM;
@@ -585,7 +594,7 @@
 	wake_up_process(th);
 
 	/* probe card type */
-	result = ENE_Read_BYTE(us, REG_CARD_STATUS, &MiscReg03);
+	result = ene_read_byte(us, REG_CARD_STATUS, &MiscReg03);
 	if (result != USB_STOR_XFER_GOOD) {
 		result = USB_STOR_TRANSPORT_ERROR;
 		quiesce_and_remove_host(us);
@@ -595,9 +604,9 @@
 	if (!(MiscReg03 & 0x02)) {
 		result = -ENODEV;
 		quiesce_and_remove_host(us);
-		pr_info("keucr: The driver only supports SM/MS card.\
-			To use SD card, \
-			please build driver/usb/storage/ums-eneub6250.ko\n");
+		pr_info("keucr: The driver only supports SM/MS card. "
+			"To use SD card, "
+			"please build driver/usb/storage/ums-eneub6250.ko\n");
 		goto BadDevice;
 	}
 
@@ -623,9 +632,9 @@
 static struct usb_driver usb_storage_driver = {
 	.name =		"eucr",
 	.probe =		eucr_probe,
-    	.suspend =	    eucr_suspend,
+	.suspend =	    eucr_suspend,
 	.resume =	    eucr_resume,
-    	.reset_resume =	eucr_reset_resume,
+	.reset_resume =	eucr_reset_resume,
 	.disconnect =	eucr_disconnect,
 	.pre_reset =	eucr_pre_reset,
 	.post_reset =	eucr_post_reset,

diff --git a/drivers/staging/keucr/usb.h b/drivers/staging/keucr/usb.h
index a5f7a16..d665af1 100644
--- a/drivers/staging/keucr/usb.h
+++ b/drivers/staging/keucr/usb.h

@@ -1,4 +1,4 @@
-// Driver for USB Mass Storage compliant devices
+/* Driver for USB Mass Storage compliant devices */
 
 #ifndef _USB_H_
 #define _USB_H_
@@ -19,26 +19,26 @@
  */
 
 struct us_unusual_dev {
-	const char* vendorName;
-	const char* productName;
+	const char *vendorName;
+	const char *productName;
 	__u8  useProtocol;
 	__u8  useTransport;
 	int (*initFunction)(struct us_data *);
 };
 
-//EnE HW Register
+/* EnE HW Register */
 #define REG_CARD_STATUS     0xFF83
 #define REG_HW_TRAP1        0xFF89
 
-// SRB Status. Refers /usr/include/wine/wine/wnaspi32.h & SCSI sense key
-#define SS_SUCCESS                  0x00      // No Sense
+/* SRB Status. Refers /usr/include/wine/wine/wnaspi32.h & SCSI sense key */
+#define SS_SUCCESS                  0x00      /* No Sense */
 #define SS_NOT_READY                0x02
 #define SS_MEDIUM_ERR               0x03
 #define SS_HW_ERR                   0x04
 #define SS_ILLEGAL_REQUEST          0x05
 #define SS_UNIT_ATTENTION           0x06
 
-//ENE Load FW Pattern
+/* ENE Load FW Pattern */
 #define SD_INIT1_PATTERN   1
 #define SD_INIT2_PATTERN   2
 #define SD_RW_PATTERN      3
@@ -51,39 +51,40 @@
 #define FDIR_WRITE        0
 #define FDIR_READ         1
 
-typedef struct _SD_STATUS {
-    BYTE    Insert:1;
-    BYTE    Ready:1;
-    BYTE    MediaChange:1;
-    BYTE    IsMMC:1;
-    BYTE    HiCapacity:1;
-    BYTE    HiSpeed:1;
-    BYTE    WtP:1;
-    BYTE    Reserved:1;
-} SD_STATUS, *PSD_STATUS;
+/*
+ * SD status flags kept in us_data.SD_Status: eight one-bit BYTE bit-fields.
+ * NOTE(review): individual flag meanings are inferred from their names only.
+ */
+struct keucr_sd_status {
+	BYTE    Insert:1;
+	BYTE    Ready:1;
+	BYTE    MediaChange:1;
+	BYTE    IsMMC:1;
+	BYTE    HiCapacity:1;
+	BYTE    HiSpeed:1;
+	BYTE    WtP:1;
+	BYTE    Reserved:1;
+};
 
-typedef struct _MS_STATUS {
-    BYTE    Insert:1;
-    BYTE    Ready:1;
-    BYTE    MediaChange:1;
-    BYTE    IsMSPro:1;
-    BYTE    IsMSPHG:1;
-    BYTE    Reserved1:1;
-    BYTE    WtP:1;
-    BYTE    Reserved2:1;
-} MS_STATUS, *PMS_STATUS;
+/*
+ * MS status flags kept in us_data.MS_Status: eight one-bit BYTE bit-fields.
+ * NOTE(review): individual flag meanings are inferred from their names only.
+ */
+struct keucr_ms_status {
+	BYTE    Insert:1;
+	BYTE    Ready:1;
+	BYTE    MediaChange:1;
+	BYTE    IsMSPro:1;
+	BYTE    IsMSPHG:1;
+	BYTE    Reserved1:1;
+	BYTE    WtP:1;
+	BYTE    Reserved2:1;
+};
 
-typedef struct _SM_STATUS {
-    BYTE    Insert:1;
-    BYTE    Ready:1;
-    BYTE    MediaChange:1;
-    BYTE    Reserved:3;
-    BYTE    WtP:1;
-    BYTE    IsMS:1;
-} SM_STATUS, *PSM_STATUS;
+/*
+ * SM status flags kept in us_data.SM_Status: five one-bit fields plus a
+ * three-bit reserved field, eight bits in total.
+ * NOTE(review): individual flag meanings are inferred from their names only.
+ */
+struct keucr_sm_status {
+	BYTE    Insert:1;
+	BYTE    Ready:1;
+	BYTE    MediaChange:1;
+	BYTE    Reserved:3;
+	BYTE    WtP:1;
+	BYTE    IsMS:1;
+};
 
-// SD Block Length
-#define SD_BLOCK_LEN                            9       // 2^9 = 512 Bytes, The HW maximum read/write data length
+/* SD Block Length */
+#define SD_BLOCK_LEN		9	/* 2^9 = 512 Bytes,
+				The HW maximum read/write data length */
 
 /* Dynamic bitflag definitions (us->dflags): used in set_bit() etc. */
 #define US_FLIDX_URB_ACTIVE	0	/* current_urb is in use    */
@@ -107,9 +108,9 @@
 #define US_IOBUF_SIZE		64	/* Size of the DMA-mapped I/O buffer */
 #define US_SENSE_SIZE		18	/* Size of the autosense data buffer */
 
-typedef int (*trans_cmnd)(struct scsi_cmnd *, struct us_data*);
-typedef int (*trans_reset)(struct us_data*);
-typedef void (*proto_cmnd)(struct scsi_cmnd*, struct us_data*);
+typedef int (*trans_cmnd)(struct scsi_cmnd *, struct us_data *);
+typedef int (*trans_reset)(struct us_data *);
+typedef void (*proto_cmnd)(struct scsi_cmnd *, struct us_data *);
 typedef void (*extra_data_destructor)(void *);	/* extra data destructor */
 typedef void (*pm_hook)(struct us_data *, int);	/* power management hook */
 
@@ -176,19 +177,19 @@
 #ifdef CONFIG_PM
 	pm_hook			suspend_resume_hook;
 #endif
-	// for 6250 code
-	SD_STATUS   SD_Status;
-	MS_STATUS   MS_Status;
-	SM_STATUS   SM_Status;
+	/* for 6250 code */
+	struct keucr_sd_status   SD_Status;
+	struct keucr_ms_status   MS_Status;
+	struct keucr_sm_status   SM_Status;
 
-	//----- SD Control Data ----------------
-	//SD_REGISTER SD_Regs;
+	/* ----- SD Control Data ---------------- */
+	/* SD_REGISTER SD_Regs; */
 	WORD        SD_Block_Mult;
 	BYTE        SD_READ_BL_LEN;
 	WORD        SD_C_SIZE;
 	BYTE        SD_C_SIZE_MULT;
 
-	// SD/MMC New spec.
+	/* SD/MMC New spec. */
 	BYTE        SD_SPEC_VER;
 	BYTE        SD_CSD_VER;
 	BYTE        SD20_HIGH_CAPACITY;
@@ -196,15 +197,15 @@
 	BYTE        MMC_SPEC_VER;
 	BYTE        MMC_BusWidth;
 	BYTE        MMC_HIGH_CAPACITY;
-	
-	//----- MS Control Data ----------------
+
+	/* ----- MS Control Data ---------------- */
 	BOOLEAN             MS_SWWP;
 	DWORD               MSP_TotalBlock;
 	/* MS_LibControl       MS_Lib; */
 	BOOLEAN             MS_IsRWPage;
 	WORD                MS_Model;
 
-	//----- SM Control Data ----------------
+	/* ----- SM Control Data ---------------- */
 	BYTE		SM_DeviceID;
 	BYTE		SM_CardID;
 
@@ -212,16 +213,18 @@
 	BYTE		BIN_FLAG;
 	DWORD		bl_num;
 	int		SrbStatus;
-	
-	//------Power Managerment ---------------
-	BOOLEAN         Power_IsResum;	
+
+	/* ------Power Managerment --------------- */
+	BOOLEAN         Power_IsResum;
 };
 
 /* Convert between us_data and the corresponding Scsi_Host */
-static inline struct Scsi_Host *us_to_host(struct us_data *us) {
+static inline struct Scsi_Host *us_to_host(struct us_data *us)
+{
 	return container_of((void *) us, struct Scsi_Host, hostdata);
 }
-static inline struct us_data *host_to_us(struct Scsi_Host *host) {
+static inline struct us_data *host_to_us(struct Scsi_Host *host)
+{
 	return (struct us_data *) host->hostdata;
 }
 

diff --git a/drivers/staging/line6/pcm.c b/drivers/staging/line6/pcm.c
index 02f77d7..4795f12 100644
--- a/drivers/staging/line6/pcm.c
+++ b/drivers/staging/line6/pcm.c

@@ -107,11 +107,15 @@
 
 int line6_pcm_acquire(struct snd_line6_pcm *line6pcm, int channels)
 {
-	unsigned long flags_old =
-	    __sync_fetch_and_or(&line6pcm->flags, channels);
-	unsigned long flags_new = flags_old | channels;
-	unsigned long flags_final = flags_old;
-	int err = 0;
+	unsigned long flags_old, flags_new, flags_final;
+	int err;
+
+	do {
+		flags_old = ACCESS_ONCE(line6pcm->flags);
+		flags_new = flags_old | channels;
+	} while (cmpxchg(&line6pcm->flags, flags_old, flags_new) != flags_old);
+
+	flags_final = flags_old;
 
 	line6pcm->prev_fbuf = NULL;
 
@@ -197,9 +201,12 @@
 
 int line6_pcm_release(struct snd_line6_pcm *line6pcm, int channels)
 {
-	unsigned long flags_old =
-	    __sync_fetch_and_and(&line6pcm->flags, ~channels);
-	unsigned long flags_new = flags_old & ~channels;
+	unsigned long flags_old, flags_new;
+
+	do {
+		flags_old = ACCESS_ONCE(line6pcm->flags);
+		flags_new = flags_old & ~channels;
+	} while (cmpxchg(&line6pcm->flags, flags_old, flags_new) != flags_old);
 
 	if (test_flags(flags_new, flags_old, LINE6_BITS_CAPTURE_STREAM))
 		line6_unlink_audio_in_urbs(line6pcm);

diff --git a/drivers/staging/lustre/Kconfig b/drivers/staging/lustre/Kconfig
new file mode 100644
index 0000000..a224d88
--- /dev/null
+++ b/drivers/staging/lustre/Kconfig

@@ -0,0 +1,3 @@
+source "drivers/staging/lustre/lustre/Kconfig"
+
+source "drivers/staging/lustre/lnet/Kconfig"

diff --git a/drivers/staging/lustre/Makefile b/drivers/staging/lustre/Makefile
new file mode 100644
index 0000000..2616289
--- /dev/null
+++ b/drivers/staging/lustre/Makefile

@@ -0,0 +1,4 @@
+subdir-ccflags-y := -I$(src)/include/
+
+obj-$(CONFIG_LUSTRE_FS)		+= lustre/
+obj-$(CONFIG_LNET)		+= lnet/

diff --git a/drivers/staging/lustre/TODO b/drivers/staging/lustre/TODO
new file mode 100644
index 0000000..22742d6
--- /dev/null
+++ b/drivers/staging/lustre/TODO

@@ -0,0 +1,13 @@
+* Fix the remaining coding style issues.
+* Remove dead code.
+* Separate client/server functionality. Functions only used by the server can
+  be removed from the client.
+* Clean up libcfs layer. Ideally we can remove include/linux/libcfs entirely.
+* Clean up CLIO layer. Lustre client readahead/writeback control needs to better
+  suit what the kernel provides.
+* Add documentation in Documentation/.
+* Other minor misc cleanups...
+
+Please send any patches to Greg Kroah-Hartman <greg@kroah.com>, Andreas Dilger
+<andreas.dilger@intel.com> and Peng Tao <tao.peng@emc.com>. CCing
+hpdd-discuss <hpdd-discuss@lists.01.org> would be great too.

diff --git a/drivers/staging/lustre/include/linux/libcfs/bitmap.h b/drivers/staging/lustre/include/linux/libcfs/bitmap.h
new file mode 100644
index 0000000..3f1c37b
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/libcfs/bitmap.h

@@ -0,0 +1,111 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+#ifndef _LIBCFS_BITMAP_H_
+#define _LIBCFS_BITMAP_H_
+
+
+/*
+ * Bitmap header immediately followed by its bit storage.
+ * CFS_BITMAP_SIZE() gives the number of bytes to allocate for both.
+ */
+typedef struct {
+	int	     size;	/* bit count; search bound for find_*_bit() */
+	unsigned long   data[0];	/* bit words stored after the header */
+} cfs_bitmap_t;
+
+/*
+ * Bytes needed for a cfs_bitmap_t that holds nbits bits: the header plus
+ * (nbits / BITS_PER_LONG) + 1 longs of storage.
+ * The argument is parenthesized so that an expression argument such as
+ * CFS_BITMAP_SIZE(a + b) divides the whole bit count, not just its last term.
+ */
+#define CFS_BITMAP_SIZE(nbits) \
+	((((nbits) / BITS_PER_LONG) + 1) * sizeof(long) + sizeof(cfs_bitmap_t))
+
+/*
+ * Allocate a bitmap able to hold size bits and record that bit count in it.
+ * Returns the new bitmap, or NULL when OBD_ALLOC() left the pointer NULL.
+ */
+static inline
+cfs_bitmap_t *CFS_ALLOCATE_BITMAP(int size)
+{
+	cfs_bitmap_t *bitmap;
+
+	OBD_ALLOC(bitmap, CFS_BITMAP_SIZE(size));
+	if (bitmap != NULL)
+		bitmap->size = size;
+
+	RETURN(bitmap);
+}
+
+/*
+ * Release a bitmap; the byte count handed to OBD_FREE() is recomputed from
+ * the bitmap's own size field. ptr is evaluated more than once, and is
+ * parenthesized before the member access so any pointer expression works.
+ */
+#define CFS_FREE_BITMAP(ptr)	OBD_FREE(ptr, CFS_BITMAP_SIZE((ptr)->size))
+
+/* Set bit number nbit in the bitmap's storage words via set_bit(). */
+static inline
+void cfs_bitmap_set(cfs_bitmap_t *bitmap, int nbit)
+{
+	set_bit(nbit, bitmap->data);
+}
+
+/*
+ * Clear bit number nbit. Implemented with test_and_clear_bit(); the previous
+ * bit value it returns is discarded.
+ */
+static inline
+void cfs_bitmap_clear(cfs_bitmap_t *bitmap, int nbit)
+{
+	test_and_clear_bit(nbit, bitmap->data);
+}
+
+/* Return the result of test_bit() for bit number nbit (non-zero when set). */
+static inline
+int cfs_bitmap_check(cfs_bitmap_t *bitmap, int nbit)
+{
+	return test_bit(nbit, bitmap->data);
+}
+
+/* Clear bit number nbit and return what test_and_clear_bit() reports for it. */
+static inline
+int cfs_bitmap_test_and_clear(cfs_bitmap_t *bitmap, int nbit)
+{
+	return test_and_clear_bit(nbit, bitmap->data);
+}
+
+/*
+ * Return 1 if the bitmap has no set bits, 0 if at least one bit is set.
+ * (find_first_bit() yields bitmap->size when it finds no set bit.)
+ */
+static inline
+int cfs_bitmap_check_empty(cfs_bitmap_t *bitmap)
+{
+	return find_first_bit(bitmap->data, bitmap->size) == bitmap->size;
+}
+
+/*
+ * Copy the contents of old into new. new must be at least as large (in bits)
+ * as old; this is asserted, not handled.
+ */
+static inline
+void cfs_bitmap_copy(cfs_bitmap_t *new, cfs_bitmap_t *old)
+{
+	int newsize;
+
+	LASSERT(new->size >= old->size);
+	/* the memcpy below overwrites the header too, so remember new's size */
+	newsize = new->size;
+	memcpy(new, old, CFS_BITMAP_SIZE(old->size));
+	/* restore the destination's own bit count */
+	new->size = newsize;
+}
+
+/*
+ * Iterate pos over the index of every set bit in bitmap, in ascending order.
+ * Both arguments are evaluated several times; every use of bitmap is
+ * parenthesized so that an arbitrary pointer expression can be passed.
+ */
+#define cfs_foreach_bit(bitmap, pos)					\
+	for ((pos) = find_first_bit((bitmap)->data, (bitmap)->size);	\
+	     (pos) < (bitmap)->size;					\
+	     (pos) = find_next_bit((bitmap)->data, (bitmap)->size, (pos) + 1))
+
+#endif

diff --git a/drivers/staging/lustre/include/linux/libcfs/curproc.h b/drivers/staging/lustre/include/linux/libcfs/curproc.h
new file mode 100644
index 0000000..90d7ce6
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/libcfs/curproc.h

@@ -0,0 +1,110 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/curproc.h
+ *
+ * Lustre curproc API declaration
+ *
+ * Author: Nikita Danilov <nikita@clusterfs.com>
+ */
+
+#ifndef __LIBCFS_CURPROC_H__
+#define __LIBCFS_CURPROC_H__
+
+/*
+ * Portable API to access common characteristics of "current" UNIX process.
+ *
+ * Implemented in portals/include/libcfs/<os>/
+ */
+int    cfs_curproc_groups_nr(void);
+int    current_is_in_group(gid_t group);
+void   cfs_curproc_groups_dump(gid_t *array, int size);
+
+/*
+ * Plus, platform-specific constant
+ *
+ * CFS_CURPROC_COMM_MAX,
+ *
+ * and opaque scalar type
+ *
+ * kernel_cap_t
+ */
+
+/* check if task is running in compat mode.*/
+int current_is_32bit(void);
+#define current_pid()		(current->pid)
+#define current_comm()		(current->comm)
+int cfs_get_environ(const char *key, char *value, int *val_len);
+
+typedef __u32 cfs_cap_t;
+
+/* Capability numbers; each one is a bit position in CFS_CAP_FS_MASK. */
+#define CFS_CAP_CHOWN			0
+#define CFS_CAP_DAC_OVERRIDE		1
+#define CFS_CAP_DAC_READ_SEARCH		2
+#define CFS_CAP_FOWNER			3
+#define CFS_CAP_FSETID			4
+#define CFS_CAP_LINUX_IMMUTABLE		9
+#define CFS_CAP_SYS_ADMIN		21
+#define CFS_CAP_SYS_BOOT		23
+#define CFS_CAP_SYS_RESOURCE		24
+
+/* Mask with one bit set for every capability number defined above. */
+#define CFS_CAP_FS_MASK				\
+	((1 << CFS_CAP_CHOWN)		|	\
+	 (1 << CFS_CAP_DAC_OVERRIDE)	|	\
+	 (1 << CFS_CAP_DAC_READ_SEARCH)	|	\
+	 (1 << CFS_CAP_FOWNER)		|	\
+	 (1 << CFS_CAP_FSETID)		|	\
+	 (1 << CFS_CAP_LINUX_IMMUTABLE)	|	\
+	 (1 << CFS_CAP_SYS_ADMIN)	|	\
+	 (1 << CFS_CAP_SYS_BOOT)	|	\
+	 (1 << CFS_CAP_SYS_RESOURCE))
+
+void cfs_cap_raise(cfs_cap_t cap);
+void cfs_cap_lower(cfs_cap_t cap);
+int cfs_cap_raised(cfs_cap_t cap);
+cfs_cap_t cfs_curproc_cap_pack(void);
+void cfs_curproc_cap_unpack(cfs_cap_t cap);
+int cfs_capable(cfs_cap_t cap);
+
+/* __LIBCFS_CURPROC_H__ */
+#endif
+/*
+ * Local variables:
+ * c-indentation-style: "K&R"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * fill-column: 80
+ * scroll-step: 1
+ * End:
+ */

diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs.h b/drivers/staging/lustre/include/linux/libcfs/libcfs.h
new file mode 100644
index 0000000..1ab1f2b
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/libcfs/libcfs.h

@@ -0,0 +1,234 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LIBCFS_LIBCFS_H__
+#define __LIBCFS_LIBCFS_H__
+
+#if !__GNUC__
+#define __attribute__(x)
+#endif
+
+#include <linux/libcfs/linux/libcfs.h>
+
+#include "curproc.h"
+
+#ifndef offsetof
+/*
+ * Fallback, used only when offsetof is not already defined: address of memb
+ * within a typ placed at address 0, converted to long.
+ */
+# define offsetof(typ,memb) ((long)(long_ptr_t)((char *)&(((typ *)0)->memb)))
+#endif
+
+#ifndef ARRAY_SIZE
+/* Fallback: element count of a true array (not of a pointer). */
+#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))
+#endif
+
+#if !defined(swap)
+/*
+ * Fallback, used only when swap is not already defined: exchange the values
+ * of two lvalues of the same type. Arguments are parenthesized, and the
+ * temporary has a name callers are unlikely to use, so swap(z, y) or
+ * swap(*p, *(p + 1)) behave as expected. Each argument is evaluated twice.
+ */
+#define swap(x, y) \
+	do { __typeof__(x) __swap_tmp = (x); (x) = (y); (y) = __swap_tmp; } while (0)
+#endif
+
+#if !defined(container_of)
+/*
+ * Fallback, used only when container_of is not already defined.
+ * Given a pointer @ptr to the field @member embedded into type (usually
+ * struct) @type, return pointer to the embedding instance of @type.
+ */
+#define container_of(ptr, type, member) \
+	((type *)((char *)(ptr)-(char *)(&((type *)0)->member)))
+#endif
+
+/*
+ * Return 1 when at most one bit of val is set, 0 otherwise.
+ * Note that val == 0 therefore also yields 1.
+ */
+static inline int __is_po2(unsigned long long val)
+{
+	return (val & (val - 1)) == 0;
+}
+
+/* Convenience wrapper: widen val to unsigned long long before testing it. */
+#define IS_PO2(val) __is_po2((unsigned long long)(val))
+
+/* Isolate the least-significant set bit of x; x is evaluated twice. */
+#define LOWEST_BIT_SET(x)       ((x) & ~((x) - 1))
+
+/*
+ * Lustre Error Checksum: XOR of the three least-significant hex digits
+ * (4-bit groups) of hexnum. Higher bits do not contribute.
+ */
+#define LERRCHKSUM(hexnum)			\
+	(((hexnum) & 0xf) ^			\
+	 (((hexnum) >> 4) & 0xf) ^		\
+	 (((hexnum) >> 8) & 0xf))
+
+
+/*
+ * Some (nomina odiosa sunt) platforms define NULL as naked 0. This confuses
+ * Lustre RETURN(NULL) macro.
+ */
+#if defined(NULL)
+#undef NULL
+#endif
+
+#define NULL ((void *)0)
+
+#define LUSTRE_SRV_LNET_PID      LUSTRE_LNET_PID
+
+
+#include <linux/list.h>
+
+#ifndef cfs_for_each_possible_cpu
+#  error cfs_for_each_possible_cpu is not supported by kernel!
+#endif
+
+/* libcfs tcpip */
+int libcfs_ipif_query(char *name, int *up, __u32 *ip, __u32 *mask);
+int libcfs_ipif_enumerate(char ***names);
+void libcfs_ipif_free_enumeration(char **names, int n);
+int libcfs_sock_listen(socket_t **sockp, __u32 ip, int port, int backlog);
+int libcfs_sock_accept(socket_t **newsockp, socket_t *sock);
+void libcfs_sock_abort_accept(socket_t *sock);
+int libcfs_sock_connect(socket_t **sockp, int *fatal,
+			__u32 local_ip, int local_port,
+			__u32 peer_ip, int peer_port);
+int libcfs_sock_setbuf(socket_t *socket, int txbufsize, int rxbufsize);
+int libcfs_sock_getbuf(socket_t *socket, int *txbufsize, int *rxbufsize);
+int libcfs_sock_getaddr(socket_t *socket, int remote, __u32 *ip, int *port);
+int libcfs_sock_write(socket_t *sock, void *buffer, int nob, int timeout);
+int libcfs_sock_read(socket_t *sock, void *buffer, int nob, int timeout);
+void libcfs_sock_release(socket_t *sock);
+
+/* libcfs watchdogs */
+struct lc_watchdog;
+
+/* Add a watchdog which fires after "time" milliseconds of delay.  You have to
+ * touch it once to enable it. */
+struct lc_watchdog *lc_watchdog_add(int time,
+				    void (*cb)(pid_t pid, void *),
+				    void *data);
+
+/* Enables a watchdog and resets its timer. */
+void lc_watchdog_touch(struct lc_watchdog *lcw, int timeout);
+/*
+ * Timeout for service svc: the larger of obd_timeout and the at_get()
+ * estimate (taken as 0 when AT_OFF is true), multiplied by the service's
+ * srv_watchdog_factor. svc is parenthesized so any pointer expression can
+ * be passed; it is evaluated more than once.
+ */
+#define CFS_GET_TIMEOUT(svc) (max_t(int, obd_timeout,			\
+			  AT_OFF ? 0 : at_get(&(svc)->srv_at_estimate)) * \
+			  (svc)->srv_watchdog_factor)
+
+/* Disable a watchdog; touch it to restart it. */
+void lc_watchdog_disable(struct lc_watchdog *lcw);
+
+/* Clean up the watchdog */
+void lc_watchdog_delete(struct lc_watchdog *lcw);
+
+/* Dump a debug log */
+void lc_watchdog_dumplog(pid_t pid, void *data);
+
+
+/* need both kernel and user-land acceptor */
+#define LNET_ACCEPTOR_MIN_RESERVED_PORT    512
+#define LNET_ACCEPTOR_MAX_RESERVED_PORT    1023
+
+/*
+ * libcfs pseudo device operations
+ *
+ * struct psdev_t and
+ * misc_register() and
+ * misc_deregister() are declared in
+ * libcfs/<os>/<os>-prim.h
+ *
+ * It's just draft now.
+ */
+
+/*
+ * Per-open state handed to the cfs_psdev_ops read/write/ioctl callbacks.
+ * NOTE(review): field usage is not visible in this header; 'off' is
+ * presumably a file offset - confirm against the implementation.
+ */
+struct cfs_psdev_file {
+	unsigned long   off;
+	void	    *private_data;
+	unsigned long   reserved1;
+	unsigned long   reserved2;
+};
+
+/*
+ * Callback table for a libcfs pseudo device. Each callback returns int;
+ * read, write and ioctl receive the struct cfs_psdev_file of the open file.
+ * NOTE(review): the meaning of the unsigned long / void * arguments of
+ * p_open and p_close is not visible here - confirm against the callers.
+ */
+struct cfs_psdev_ops {
+	int (*p_open)(unsigned long, void *);
+	int (*p_close)(unsigned long, void *);
+	int (*p_read)(struct cfs_psdev_file *, char *, unsigned long);
+	int (*p_write)(struct cfs_psdev_file *, char *, unsigned long);
+	int (*p_ioctl)(struct cfs_psdev_file *, unsigned long, void *);
+};
+
+/*
+ * Drop into debugger, if possible. Implementation is provided by platform.
+ */
+
+void cfs_enter_debugger(void);
+
+/*
+ * Defined by platform
+ */
+int unshare_fs_struct(void);
+sigset_t cfs_get_blocked_sigs(void);
+sigset_t cfs_block_allsigs(void);
+sigset_t cfs_block_sigs(unsigned long sigs);
+sigset_t cfs_block_sigsinv(unsigned long sigs);
+void cfs_restore_sigs(sigset_t);
+int cfs_signal_pending(void);
+void cfs_clear_sigpending(void);
+
+/*
+ * Random number handling
+ */
+
+/* returns a random 32-bit integer */
+unsigned int cfs_rand(void);
+/* seed the generator */
+void cfs_srand(unsigned int, unsigned int);
+void cfs_get_random_bytes(void *buf, int size);
+
+#include <linux/libcfs/libcfs_debug.h>
+#include <linux/libcfs/libcfs_cpu.h>
+#include <linux/libcfs/libcfs_private.h>
+#include <linux/libcfs/libcfs_ioctl.h>
+#include <linux/libcfs/libcfs_prim.h>
+#include <linux/libcfs/libcfs_time.h>
+#include <linux/libcfs/libcfs_string.h>
+#include <linux/libcfs/libcfs_kernelcomm.h>
+#include <linux/libcfs/libcfs_workitem.h>
+#include <linux/libcfs/libcfs_hash.h>
+#include <linux/libcfs/libcfs_heap.h>
+#include <linux/libcfs/libcfs_fail.h>
+#include <linux/libcfs/params_tree.h>
+#include <linux/libcfs/libcfs_crypto.h>
+
+/*
+ * Placed after the includes above because it needs "unlikely", which comes
+ * from libcfs_private.h.
+ *
+ * Step back shift bytes from ptr, unless ptr is NULL or an error pointer
+ * (IS_ERR), in which case ptr is handed back unchanged.
+ */
+static inline void *__container_of(void *ptr, unsigned long shift)
+{
+	if (unlikely(ptr == NULL || IS_ERR(ptr)))
+		return ptr;
+
+	return (char *)ptr - shift;
+}
+
+/* container_of() variant that passes NULL and error pointers through. */
+#define container_of0(ptr, type, member) \
+	((type *)__container_of((void *)(ptr), offsetof(type, member)))
+
+/*
+ * Reference a without doing anything: sizeof(a) - sizeof(a) is 0, so the
+ * empty loop body runs once and the statement has no other effect.
+ */
+#define SET_BUT_UNUSED(a) do { } while(sizeof(a) - sizeof(a))
+
+#define _LIBCFS_H
+
+#endif /* __LIBCFS_LIBCFS_H__ */

diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_cpu.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_cpu.h
new file mode 100644
index 0000000..6ae7415
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/libcfs/libcfs_cpu.h

@@ -0,0 +1,214 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/libcfs_cpu.h
+ *
+ * CPU partition
+ *   . CPU partition is virtual processing unit
+ *
+ *   . CPU partition can present 1-N cores, or 1-N NUMA nodes,
+ *     in other words, CPU partition is a processors pool.
+ *
+ * CPU Partition Table (CPT)
+ *   . a set of CPU partitions
+ *
+ *   . There are two modes for CPT: CFS_CPU_MODE_NUMA and CFS_CPU_MODE_SMP
+ *
+ *   . User can specify the total number of CPU partitions while creating a
+ *     CPT; CPU partition IDs always start from 0.
+ *
+ *     Example: if there are 8 cores on the system, while creating a CPT
+ *     with cpu_npartitions=4:
+ *	      core[0, 1] = partition[0], core[2, 3] = partition[1]
+ *	      core[4, 5] = partition[2], core[6, 7] = partition[3]
+ *
+ *	  cpu_npartitions=1:
+ *	      core[0, 1, ... 7] = partition[0]
+ *
+ *   . User can also specify CPU partitions by string pattern
+ *
+ *     Examples: cpu_partitions="0[0,1], 1[2,3]"
+ *	       cpu_partitions="N 0[0-3], 1[4-8]"
+ *
+ *     The first character "N" means following numbers are numa ID
+ *
+ *   . NUMA allocators, CPU affinity threads are built over CPU partitions,
+ *     instead of HW CPUs or HW nodes.
+ *
+ *   . By default, Lustre modules should refer to the global cfs_cpt_table,
+ *     instead of accessing HW CPUs directly, so concurrency of Lustre can be
+ *     configured by cpu_npartitions of the global cfs_cpt_table
+ *
+ *   . If cpu_npartitions=1(all CPUs in one pool), lustre should work the
+ *     same way as 2.2 or earlier versions
+ *
+ * Author: liang@whamcloud.com
+ */
+
+#ifndef __LIBCFS_CPU_H__
+#define __LIBCFS_CPU_H__
+
+#ifndef HAVE_LIBCFS_CPT
+
+/*
+ * Fallback definitions, compiled only when HAVE_LIBCFS_CPT is not defined.
+ * Here both mask types are a single unsigned long.
+ */
+typedef unsigned long		cpumask_t;
+typedef unsigned long		nodemask_t;
+
+struct cfs_cpt_table {
+	/* # of CPU partitions */
+	int			ctb_nparts;
+	/* cpu mask */
+	cpumask_t		ctb_mask;
+	/* node mask */
+	nodemask_t		ctb_nodemask;
+	/* version */
+	__u64			ctb_version;
+};
+
+#endif /* !HAVE_LIBCFS_CPT */
+
+/* any CPU partition */
+#define CFS_CPT_ANY		(-1)
+
+extern struct cfs_cpt_table	*cfs_cpt_table;
+
+/**
+ * destroy a CPU partition table
+ */
+void cfs_cpt_table_free(struct cfs_cpt_table *cptab);
+/**
+ * create a cfs_cpt_table with \a ncpt number of partitions
+ */
+struct cfs_cpt_table *cfs_cpt_table_alloc(unsigned int ncpt);
+/**
+ * print string information of cpt-table
+ */
+int cfs_cpt_table_print(struct cfs_cpt_table *cptab, char *buf, int len);
+/**
+ * return total number of CPU partitions in \a cptab
+ */
+int
+cfs_cpt_number(struct cfs_cpt_table *cptab);
+/**
+ * return number of HW cores or hyper-threads in a CPU partition \a cpt
+ */
+int cfs_cpt_weight(struct cfs_cpt_table *cptab, int cpt);
+/**
+ * is there any online CPU in CPU partition \a cpt
+ */
+int cfs_cpt_online(struct cfs_cpt_table *cptab, int cpt);
+/**
+ * return cpumask of CPU partition \a cpt
+ */
+cpumask_t *cfs_cpt_cpumask(struct cfs_cpt_table *cptab, int cpt);
+/**
+ * return nodemask of CPU partition \a cpt
+ */
+nodemask_t *cfs_cpt_nodemask(struct cfs_cpt_table *cptab, int cpt);
+/**
+ * shadow current HW processor ID to CPU-partition ID of \a cptab
+ */
+int cfs_cpt_current(struct cfs_cpt_table *cptab, int remap);
+/**
+ * shadow HW processor ID \a CPU to CPU-partition ID by \a cptab
+ */
+int cfs_cpt_of_cpu(struct cfs_cpt_table *cptab, int cpu);
+/**
+ * bind current thread on a CPU-partition \a cpt of \a cptab
+ */
+int cfs_cpt_bind(struct cfs_cpt_table *cptab, int cpt);
+/**
+ * add \a cpu to CPU partition \a cpt of \a cptab, return 1 for success,
+ * otherwise 0 is returned
+ */
+int cfs_cpt_set_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu);
+/**
+ * remove \a cpu from CPU partition \a cpt of \a cptab
+ */
+void cfs_cpt_unset_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu);
+/**
+ * add all cpus in \a mask to CPU partition \a cpt
+ * return 1 if successfully set all CPUs, otherwise return 0
+ */
+int cfs_cpt_set_cpumask(struct cfs_cpt_table *cptab,
+			int cpt, cpumask_t *mask);
+/**
+ * remove all cpus in \a mask from CPU partition \a cpt
+ */
+void cfs_cpt_unset_cpumask(struct cfs_cpt_table *cptab,
+			   int cpt, cpumask_t *mask);
+/**
+ * add all cpus in NUMA node \a node to CPU partition \a cpt
+ * return 1 if successfully set all CPUs, otherwise return 0
+ */
+int cfs_cpt_set_node(struct cfs_cpt_table *cptab, int cpt, int node);
+/**
+ * remove all cpus in NUMA node \a node from CPU partition \a cpt
+ */
+void cfs_cpt_unset_node(struct cfs_cpt_table *cptab, int cpt, int node);
+
+/**
+ * add all cpus in node mask \a mask to CPU partition \a cpt
+ * return 1 if successfully set all CPUs, otherwise return 0
+ */
+int cfs_cpt_set_nodemask(struct cfs_cpt_table *cptab,
+			 int cpt, nodemask_t *mask);
+/**
+ * remove all cpus in node mask \a mask from CPU partition \a cpt
+ */
+void cfs_cpt_unset_nodemask(struct cfs_cpt_table *cptab,
+			    int cpt, nodemask_t *mask);
+/**
+ * unset all cpus for CPU partition \a cpt
+ */
+void cfs_cpt_clear(struct cfs_cpt_table *cptab, int cpt);
+/**
+ * convert partition id \a cpt to numa node id, if there are more than one
+ * nodes in this partition, it might return a different node id each time.
+ */
+int cfs_cpt_spread_node(struct cfs_cpt_table *cptab, int cpt);
+
+/**
+ * iterate over all CPU partitions in \a cptab
+ *
+ * \a i runs from 0 up to, but not including, cfs_cpt_number(\a cptab), which
+ * is re-evaluated on every iteration. \a i is parenthesized so that any
+ * lvalue expression can be used as the loop variable.
+ */
+#define cfs_cpt_for_each(i, cptab)	\
+	for ((i) = 0; (i) < cfs_cpt_number(cptab); (i)++)
+
+#ifndef __read_mostly
+# define __read_mostly
+#endif
+
+#ifndef ____cacheline_aligned
+#define ____cacheline_aligned
+#endif
+
+int  cfs_cpu_init(void);
+void cfs_cpu_fini(void);
+
+#endif /* __LIBCFS_CPU_H__ */

diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_crypto.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_crypto.h
new file mode 100644
index 0000000..64ca62f
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/libcfs/libcfs_crypto.h

@@ -0,0 +1,201 @@
+/* GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see http://www.gnu.org/licenses
+ *
+ * Please  visit http://www.xyratex.com/contact if you need additional
+ * information or have any questions.
+ *
+ * GPL HEADER END
+ */
+
+/*
+ * Copyright 2012 Xyratex Technology Limited
+ */
+
+#ifndef _LIBCFS_CRYPTO_H
+#define _LIBCFS_CRYPTO_H
+
+/**
+ * Describes one hash algorithm known to libcfs.
+ */
+struct cfs_crypto_hash_type {
+	char		*cht_name;      /**< hash algorithm name, equal to
+					 * format name for crypto api */
+	unsigned int    cht_key;	/**< init key by default (valid for
+					 * 4 bytes context like crc32, adler) */
+	unsigned int    cht_size;       /**< hash digest size */
+};
+
+enum cfs_crypto_hash_alg {
+	CFS_HASH_ALG_NULL       = 0,
+	CFS_HASH_ALG_ADLER32,
+	CFS_HASH_ALG_CRC32,
+	CFS_HASH_ALG_MD5,
+	CFS_HASH_ALG_SHA1,
+	CFS_HASH_ALG_SHA256,
+	CFS_HASH_ALG_SHA384,
+	CFS_HASH_ALG_SHA512,
+	CFS_HASH_ALG_CRC32C,
+	CFS_HASH_ALG_MAX
+};
+
+/**
+ * Table of hash algorithm descriptors, indexed by enum cfs_crypto_hash_alg.
+ * Each entry is { cht_name, cht_key, cht_size }.
+ *
+ * NOTE(review): this is a static definition inside a header, so every
+ * translation unit that includes it gets its own copy of the table.
+ */
+static struct cfs_crypto_hash_type hash_types[] = {
+	[CFS_HASH_ALG_NULL]    = { "null",     0,      0 },
+	[CFS_HASH_ALG_ADLER32] = { "adler32",  1,      4 },
+	[CFS_HASH_ALG_CRC32]   = { "crc32",   ~0,      4 },
+	[CFS_HASH_ALG_CRC32C]  = { "crc32c",  ~0,      4 },
+	[CFS_HASH_ALG_MD5]     = { "md5",      0,     16 },
+	[CFS_HASH_ALG_SHA1]    = { "sha1",     0,     20 },
+	[CFS_HASH_ALG_SHA256]  = { "sha256",   0,     32 },
+	[CFS_HASH_ALG_SHA384]  = { "sha384",   0,     48 },
+	[CFS_HASH_ALG_SHA512]  = { "sha512",   0,     64 },
+};
+
+/**
+ * Look up the descriptor for a hash algorithm identifier.
+ * Returns NULL when \a hash_alg is not below CFS_HASH_ALG_MAX or when the
+ * corresponding hash_types[] slot has no name.
+ */
+static inline const struct cfs_crypto_hash_type *
+		    cfs_crypto_hash_type(unsigned char hash_alg)
+{
+	struct cfs_crypto_hash_type *entry;
+
+	if (hash_alg >= CFS_HASH_ALG_MAX)
+		return NULL;
+
+	entry = &hash_types[hash_alg];
+	return entry->cht_name ? entry : NULL;
+}
+
+/**
+ * Map a hash algorithm identifier to its name.
+ * Returns "unknown" when cfs_crypto_hash_type() finds no descriptor.
+ */
+static inline const char *cfs_crypto_hash_name(unsigned char hash_alg)
+{
+	const struct cfs_crypto_hash_type *type = cfs_crypto_hash_type(hash_alg);
+
+	return type ? type->cht_name : "unknown";
+}
+
+/**
+ * Map a hash algorithm identifier to its digest size (cht_size).
+ * Returns 0 when cfs_crypto_hash_type() finds no descriptor.
+ */
+static inline int cfs_crypto_hash_digestsize(unsigned char hash_alg)
+{
+	const struct cfs_crypto_hash_type *type = cfs_crypto_hash_type(hash_alg);
+
+	return type ? type->cht_size : 0;
+}
+
+/**
+ * Return hash identifier for valid hash algorithm name or 0xFF.
+ *
+ * Walks hash_types[] comparing each entry's cht_name against \a algname.
+ * A NULL \a algname, or a table slot without a name, never matches, so
+ * neither is handed to strcmp().
+ */
+static inline unsigned char cfs_crypto_hash_alg(const char *algname)
+{
+	unsigned char   i;
+
+	if (!algname)
+		return 0xFF;
+
+	for (i = 0; i < CFS_HASH_ALG_MAX; i++) {
+		const char *name = hash_types[i].cht_name;
+
+		if (name && !strcmp(name, algname))
+			return i;
+	}
+	return 0xFF;
+}
+
+/**     Calculate hash digest for buffer.
+ *      @param alg	    id of hash algorithm
+ *      @param buf	    buffer of data
+ *      @param buf_len	buffer len
+ *      @param key	    initial value for algorithm, if it is NULL,
+ *			    default initial value should be used.
+ *      @param key_len	len of initial value
+ *      @param hash	   [out] pointer to hash, if it is NULL, hash_len is
+ *			    set to valid digest size in bytes, retval -ENOSPC.
+ *      @param hash_len       [in,out] size of hash buffer
+ *      @returns	      status of operation
+ *      @retval -EINVAL       if buf, buf_len, hash_len or alg_id is invalid
+ *      @retval -ENODEV       if this algorithm is unsupported
+ *      @retval -ENOSPC       if pointer to hash is NULL, or hash_len less than
+ *			    digest size
+ *      @retval 0	     for success
+ *      @retval < 0	   other errors from lower layers.
+ */
+int cfs_crypto_hash_digest(unsigned char alg,
+			   const void *buf, unsigned int buf_len,
+			   unsigned char *key, unsigned int key_len,
+			   unsigned char *hash, unsigned int *hash_len);
+
+/* cfs crypto hash descriptor */
+struct cfs_crypto_hash_desc;
+
+/**     Allocate and initialize descriptor for hash algorithm.
+ *      @param alg	    algorithm id
+ *      @param key	    initial value for algorithm, if it is NULL,
+ *			    default initial value should be used.
+ *      @param key_len	len of initial value
+ *      @returns	      pointer to descriptor of hash instance
+ *      @retval ERR_PTR(error) when an error occurred.
+ */
+struct cfs_crypto_hash_desc*
+	cfs_crypto_hash_init(unsigned char alg,
+			     unsigned char *key, unsigned int key_len);
+
+/**    Update digest by part of data.
+ *     @param desc	      hash descriptor
+ *     @param page	      data page
+ *     @param offset	    data offset
+ *     @param len	       data len
+ *     @returns		 status of operation
+ *     @retval 0		for success.
+ */
+int cfs_crypto_hash_update_page(struct cfs_crypto_hash_desc *desc,
+				struct page *page, unsigned int offset,
+				unsigned int len);
+
+/**    Update digest by part of data.
+ *     @param desc	      hash descriptor
+ *     @param buf	       pointer to data buffer
+ *     @param buf_len	   size of data at buffer
+ *     @returns		 status of operation
+ *     @retval 0		for success.
+ */
+int cfs_crypto_hash_update(struct cfs_crypto_hash_desc *desc, const void *buf,
+			   unsigned int buf_len);
+
+/**    Finalize hash calculation, copy hash digest to buffer, destroy hash
+ *     descriptor.
+ *     @param desc	      hash descriptor
+ *     @param hash	      buffer pointer to store hash digest
+ *     @param hash_len	  pointer to hash buffer size, if NULL
+ *			      destroy hash descriptor
+ *     @returns		 status of operation
+ *     @retval -ENOSPC	  if hash is NULL, or *hash_len less than
+ *			      digest size
+ *     @retval 0		for success
+ *     @retval < 0	      other errors from lower layers.
+ */
+int cfs_crypto_hash_final(struct cfs_crypto_hash_desc *desc,
+			  unsigned char *hash, unsigned int *hash_len);
+/**
+ *      Register crypto hash algorithms
+ */
+int cfs_crypto_register(void);
+
+/**
+ *      Unregister
+ */
+void cfs_crypto_unregister(void);
+
+/**     Return hash speed in Mbytes per second for valid hash algorithm
+ *      identifier. If the test was unsuccessful, -1 is returned.
+ */
+int cfs_crypto_hash_speed(unsigned char hash_alg);
+#endif

diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_debug.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_debug.h
new file mode 100644
index 0000000..dd8ac2f
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/libcfs/libcfs_debug.h

@@ -0,0 +1,350 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/libcfs_debug.h
+ *
+ * Debug messages and assertions
+ *
+ */
+
+#ifndef __LIBCFS_DEBUG_H__
+#define __LIBCFS_DEBUG_H__
+
+/*
+ *  Debugging
+ */
+extern unsigned int libcfs_subsystem_debug;
+extern unsigned int libcfs_stack;
+extern unsigned int libcfs_debug;
+extern unsigned int libcfs_printk;
+extern unsigned int libcfs_console_ratelimit;
+extern unsigned int libcfs_watchdog_ratelimit;
+extern unsigned int libcfs_console_max_delay;
+extern unsigned int libcfs_console_min_delay;
+extern unsigned int libcfs_console_backoff;
+extern unsigned int libcfs_debug_binary;
+extern char libcfs_debug_file_path_arr[PATH_MAX];
+
+int libcfs_debug_mask2str(char *str, int size, int mask, int is_subsys);
+int libcfs_debug_str2mask(int *mask, const char *str, int is_subsys);
+
+/* Has there been an LBUG? */
+extern unsigned int libcfs_catastrophe;
+extern unsigned int libcfs_panic_on_lbug;
+
+/**
+ * Format for debug message headers
+ */
+struct ptldebug_header {
+	__u32 ph_len;
+	__u32 ph_flags;
+	__u32 ph_subsys;
+	__u32 ph_mask;
+	__u16 ph_cpu_id;
+	__u16 ph_type;
+	__u32 ph_sec;
+	__u64 ph_usec;
+	__u32 ph_stack;
+	__u32 ph_pid;
+	__u32 ph_extern_pid;
+	__u32 ph_line_num;
+} __attribute__((packed));
+
+
+#define PH_FLAG_FIRST_RECORD 1
+
+/* Debugging subsystems (32 bits, non-overlapping) */
+/* keep these in sync with lnet/utils/debug.c and lnet/libcfs/debug.c */
+#define S_UNDEFINED   0x00000001
+#define S_MDC	 0x00000002
+#define S_MDS	 0x00000004
+#define S_OSC	 0x00000008
+#define S_OST	 0x00000010
+#define S_CLASS       0x00000020
+#define S_LOG	 0x00000040
+#define S_LLITE       0x00000080
+#define S_RPC	 0x00000100
+#define S_MGMT	0x00000200
+#define S_LNET	0x00000400
+#define S_LND	 0x00000800 /* ALL LNDs */
+#define S_PINGER      0x00001000
+#define S_FILTER      0x00002000
+/* unused */
+#define S_ECHO	0x00008000
+#define S_LDLM	0x00010000
+#define S_LOV	 0x00020000
+#define S_LQUOTA      0x00040000
+#define S_OSD		0x00080000
+/* unused */
+/* unused */
+/* unused */
+#define S_LMV	 0x00800000 /* b_new_cmd */
+/* unused */
+#define S_SEC	 0x02000000 /* upcall cache */
+#define S_GSS	 0x04000000 /* b_new_cmd */
+/* unused */
+#define S_MGC	 0x10000000
+#define S_MGS	 0x20000000
+#define S_FID	 0x40000000 /* b_new_cmd */
+#define S_FLD	 0x80000000 /* b_new_cmd */
+/* keep these in sync with lnet/utils/debug.c and lnet/libcfs/debug.c */
+
+/* Debugging masks (32 bits, non-overlapping) */
+/* keep these in sync with lnet/utils/debug.c and lnet/libcfs/debug.c */
+#define D_TRACE       0x00000001 /* ENTRY/EXIT markers */
+#define D_INODE       0x00000002
+#define D_SUPER       0x00000004
+#define D_EXT2	0x00000008 /* anything from ext2_debug */
+#define D_MALLOC      0x00000010 /* print malloc, free information */
+#define D_CACHE       0x00000020 /* cache-related items */
+#define D_INFO	0x00000040 /* general information */
+#define D_IOCTL       0x00000080 /* ioctl related information */
+#define D_NETERROR    0x00000100 /* network errors */
+#define D_NET	 0x00000200 /* network communications */
+#define D_WARNING     0x00000400 /* CWARN(...) == CDEBUG (D_WARNING, ...) */
+#define D_BUFFS       0x00000800
+#define D_OTHER       0x00001000
+#define D_DENTRY      0x00002000
+#define D_NETTRACE    0x00004000
+#define D_PAGE	0x00008000 /* bulk page handling */
+#define D_DLMTRACE    0x00010000
+#define D_ERROR       0x00020000 /* CERROR(...) == CDEBUG (D_ERROR, ...) */
+#define D_EMERG       0x00040000 /* CEMERG(...) == CDEBUG (D_EMERG, ...) */
+#define D_HA	  0x00080000 /* recovery and failover */
+#define D_RPCTRACE    0x00100000 /* for distributed debugging */
+#define D_VFSTRACE    0x00200000
+#define D_READA       0x00400000 /* read-ahead */
+#define D_MMAP	0x00800000
+#define D_CONFIG      0x01000000
+#define D_CONSOLE     0x02000000
+#define D_QUOTA       0x04000000
+#define D_SEC	 0x08000000
+#define D_LFSCK	      0x10000000 /* For both OI scrub and LFSCK */
+/* keep these in sync with lnet/{utils,libcfs}/debug.c */
+
+#define D_HSM	 D_TRACE
+
+#define D_CANTMASK   (D_ERROR | D_EMERG | D_WARNING | D_CONSOLE)
+
+#ifndef DEBUG_SUBSYSTEM
+# define DEBUG_SUBSYSTEM S_UNDEFINED
+#endif
+
+#define CDEBUG_DEFAULT_MAX_DELAY (cfs_time_seconds(600))	 /* jiffies */
+#define CDEBUG_DEFAULT_MIN_DELAY ((cfs_time_seconds(1) + 1) / 2) /* jiffies */
+#define CDEBUG_DEFAULT_BACKOFF   2
+typedef struct {
+	cfs_time_t      cdls_next;
+	unsigned int    cdls_delay;
+	int	     cdls_count;
+} cfs_debug_limit_state_t;
+
+struct libcfs_debug_msg_data {
+	const char	       *msg_file;
+	const char	       *msg_fn;
+	int		      msg_subsys;
+	int		      msg_line;
+	int		      msg_mask;
+	cfs_debug_limit_state_t  *msg_cdls;
+};
+
+#define LIBCFS_DEBUG_MSG_DATA_INIT(data, mask, cdls)	\
+do {							\
+	(data)->msg_subsys = DEBUG_SUBSYSTEM;	       \
+	(data)->msg_file   = __FILE__;		      \
+	(data)->msg_fn     = __FUNCTION__;		  \
+	(data)->msg_line   = __LINE__;		      \
+	(data)->msg_cdls   = (cdls);			\
+	(data)->msg_mask   = (mask);			\
+} while (0)
+
+/*
+ * Declare a static struct libcfs_debug_msg_data named \a dataname whose
+ * subsystem, file, function, line and cdls fields are set by its
+ * initializer, then assign msg_mask in a separate statement.
+ *
+ * The expansion is a declaration followed by an assignment statement and
+ * already ends in a semicolon, so it is meant to be used on a line of its
+ * own at the start of a block.
+ */
+#define LIBCFS_DEBUG_MSG_DATA_DECL(dataname, mask, cdls)    \
+	static struct libcfs_debug_msg_data dataname = {    \
+	       .msg_subsys = DEBUG_SUBSYSTEM,	       \
+	       .msg_file   = __FILE__,		      \
+	       .msg_fn     = __FUNCTION__,		  \
+	       .msg_line   = __LINE__,		      \
+	       .msg_cdls   = (cdls)	 };	      \
+	dataname.msg_mask   = (mask);
+
+
+
+/**
+ * Decide whether a debug message should be logged.
+ *
+ * A \a mask containing any D_CANTMASK bit is always shown; otherwise the
+ * message is shown only if \a mask is enabled in libcfs_debug and
+ * \a subsystem is enabled in libcfs_subsystem_debug.
+ */
+static inline int cfs_cdebug_show(unsigned int mask, unsigned int subsystem)
+{
+	if (mask & D_CANTMASK)
+		return 1;
+
+	return (libcfs_debug & mask) && (libcfs_subsystem_debug & subsystem);
+}
+
+#define __CDEBUG(cdls, mask, format, ...)			       \
+do {								    \
+	static struct libcfs_debug_msg_data msgdata;		    \
+									\
+	CFS_CHECK_STACK(&msgdata, mask, cdls);			  \
+									\
+	if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) {		   \
+		LIBCFS_DEBUG_MSG_DATA_INIT(&msgdata, mask, cdls);       \
+		libcfs_debug_msg(&msgdata, format, ## __VA_ARGS__);     \
+	}							       \
+} while (0)
+
+#define CDEBUG(mask, format, ...) __CDEBUG(NULL, mask, format, ## __VA_ARGS__)
+
+#define CDEBUG_LIMIT(mask, format, ...)	 \
+do {					    \
+	static cfs_debug_limit_state_t cdls;    \
+						\
+	__CDEBUG(&cdls, mask, format, ## __VA_ARGS__);\
+} while (0)
+
+
+
+
+#define CWARN(format, ...)	  CDEBUG_LIMIT(D_WARNING, format, ## __VA_ARGS__)
+#define CERROR(format, ...)	 CDEBUG_LIMIT(D_ERROR, format, ## __VA_ARGS__)
+#define CNETERR(format, a...)       CDEBUG_LIMIT(D_NETERROR, format, ## a)
+#define CEMERG(format, ...)	 CDEBUG_LIMIT(D_EMERG, format, ## __VA_ARGS__)
+
+#define LCONSOLE(mask, format, ...) CDEBUG(D_CONSOLE | (mask), format, ## __VA_ARGS__)
+#define LCONSOLE_INFO(format, ...)  CDEBUG_LIMIT(D_CONSOLE, format, ## __VA_ARGS__)
+#define LCONSOLE_WARN(format, ...)  CDEBUG_LIMIT(D_CONSOLE | D_WARNING, format, ## __VA_ARGS__)
+#define LCONSOLE_ERROR_MSG(errnum, format, ...) CDEBUG_LIMIT(D_CONSOLE | D_ERROR, \
+			   "%x-%x: " format, errnum, LERRCHKSUM(errnum), ## __VA_ARGS__)
+#define LCONSOLE_ERROR(format, ...) LCONSOLE_ERROR_MSG(0x00, format, ## __VA_ARGS__)
+
+#define LCONSOLE_EMERG(format, ...) CDEBUG(D_CONSOLE | D_EMERG, format, ## __VA_ARGS__)
+
+
+void libcfs_log_goto(struct libcfs_debug_msg_data *, const char *, long_ptr_t);
+#define GOTO(label, rc)						 \
+do {								    \
+	if (cfs_cdebug_show(D_TRACE, DEBUG_SUBSYSTEM)) {		\
+		LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_TRACE, NULL);     \
+		libcfs_log_goto(&msgdata, #label, (long_ptr_t)(rc));    \
+	} else {							\
+		(void)(rc);					     \
+	}							       \
+	goto label;						     \
+} while (0)
+
+
+/*
+ * if rc == NULL, we need to code as RETURN((void *)NULL), otherwise
+ * there will be a warning in osx.
+ */
+#if defined(__GNUC__)
+
+long libcfs_log_return(struct libcfs_debug_msg_data *, long rc);
+#if BITS_PER_LONG > 32
+#define RETURN(rc)							\
+do {									\
+	EXIT_NESTING;							\
+	if (cfs_cdebug_show(D_TRACE, DEBUG_SUBSYSTEM)) {		\
+		LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_TRACE, NULL);	\
+		return (typeof(rc))libcfs_log_return(&msgdata,		\
+						     (long)(rc));	\
+	}								\
+									\
+	return (rc);							\
+} while (0)
+#else /* BITS_PER_LONG == 32 */
+/* We need an on-stack variable, because we cannot cast a 32-bit pointer
+ * directly to (long long) without generating a compiler warning/error, yet
+ * casting directly to (long) will truncate 64-bit return values. The log
+ * values will print as 32-bit values, but they always have been. LU-1436
+ */
+#define RETURN(rc)							\
+do {									\
+	EXIT_NESTING;							\
+	if (cfs_cdebug_show(D_TRACE, DEBUG_SUBSYSTEM)) {		\
+		typeof(rc) __rc = (rc);					\
+		LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_TRACE, NULL);	\
+		libcfs_log_return(&msgdata, (long_ptr_t)__rc);		\
+		return __rc;						\
+	}								\
+									\
+	return (rc);							\
+} while (0)
+#endif /* BITS_PER_LONG > 32 */
+
+#elif defined(_MSC_VER)
+#define RETURN(rc)						      \
+do {								    \
+	CDEBUG(D_TRACE, "Process leaving.\n");			  \
+	EXIT_NESTING;						   \
+	return (rc);						    \
+} while (0)
+#else
+# error "Unkown compiler"
+#endif /* __GNUC__ */
+
+#define ENTRY							   \
+ENTRY_NESTING;							  \
+do {								    \
+	CDEBUG(D_TRACE, "Process entered\n");			   \
+} while (0)
+
+#define EXIT							    \
+do {								    \
+	CDEBUG(D_TRACE, "Process leaving\n");			   \
+	EXIT_NESTING;						   \
+} while(0)
+
+#define RETURN_EXIT							\
+do {									\
+	EXIT;								\
+	return;								\
+} while (0)
+
+extern int libcfs_debug_msg(struct libcfs_debug_msg_data *msgdata,
+			    const char *format1, ...)
+	__attribute__ ((format (printf, 2, 3)));
+
+extern int libcfs_debug_vmsg2(struct libcfs_debug_msg_data *msgdata,
+			      const char *format1,
+			      va_list args, const char *format2, ...)
+	__attribute__ ((format (printf, 4, 5)));
+
+/* other external symbols that tracefile provides: */
+extern int cfs_trace_copyin_string(char *knl_buffer, int knl_buffer_nob,
+				   const char *usr_buffer, int usr_buffer_nob);
+extern int cfs_trace_copyout_string(char *usr_buffer, int usr_buffer_nob,
+				    const char *knl_buffer, char *append);
+
+#define LIBCFS_DEBUG_FILE_PATH_DEFAULT "/tmp/lustre-log"
+
+#endif	/* __LIBCFS_DEBUG_H__ */

diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_fail.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_fail.h
new file mode 100644
index 0000000..8393c27
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/libcfs/libcfs_fail.h

@@ -0,0 +1,170 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see http://www.gnu.org/licenses
+ *
+ * Please contact Oracle Corporation, Inc., 500 Oracle Parkway, Redwood Shores,
+ * CA 94065 USA or visit www.oracle.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Oracle Corporation, Inc.
+ */
+
+#ifndef _LIBCFS_FAIL_H
+#define _LIBCFS_FAIL_H
+
+extern unsigned long cfs_fail_loc;
+extern unsigned int cfs_fail_val;
+
+extern wait_queue_head_t cfs_race_waitq;
+extern int cfs_race_state;
+
+int __cfs_fail_check_set(__u32 id, __u32 value, int set);
+int __cfs_fail_timeout_set(__u32 id, __u32 value, int ms, int set);
+
+enum {
+	CFS_FAIL_LOC_NOSET      = 0,
+	CFS_FAIL_LOC_ORSET      = 1,
+	CFS_FAIL_LOC_RESET      = 2,
+	CFS_FAIL_LOC_VALUE      = 3
+};
+
+/* Failure injection control */
+#define CFS_FAIL_MASK_SYS    0x0000FF00
+#define CFS_FAIL_MASK_LOC   (0x000000FF | CFS_FAIL_MASK_SYS)
+
+#define CFS_FAILED_BIT       30
+/* CFS_FAILED is 0x40000000 */
+#define CFS_FAILED	  (1 << CFS_FAILED_BIT)
+
+#define CFS_FAIL_ONCE_BIT    31
+/* CFS_FAIL_ONCE is 0x80000000.  The constant is unsigned: shifting a signed
+ * 1 into bit 31 overflows a 32-bit int and would sign-extend when combined
+ * with a wider unsigned value. */
+#define CFS_FAIL_ONCE       (1U << CFS_FAIL_ONCE_BIT)
+
+/* The following flags aren't made to be combined */
+#define CFS_FAIL_SKIP	0x20000000 /* skip N times then fail */
+#define CFS_FAIL_SOME	0x10000000 /* only fail N times */
+#define CFS_FAIL_RAND	0x08000000 /* fail 1/N of the times */
+#define CFS_FAIL_USR1	0x04000000 /* user flag */
+
+#define CFS_FAIL_PRECHECK(id) (cfs_fail_loc &&				\
+			      (cfs_fail_loc & CFS_FAIL_MASK_LOC) ==	   \
+			      ((id) & CFS_FAIL_MASK_LOC))
+
+/*
+ * Check whether fail-injection point \a id fires.
+ *
+ * CFS_FAIL_PRECHECK() is evaluated first; only when it matches is
+ * __cfs_fail_check_set() called.  A non-zero result is reported through
+ * CDEBUG(D_INFO) when \a quiet is non-zero and through LCONSOLE_INFO
+ * otherwise, and is then returned.  Returns 0 when nothing fired.
+ */
+static inline int cfs_fail_check_set(__u32 id, __u32 value,
+				     int set, int quiet)
+{
+	int fired = 0;
+
+	if (unlikely(CFS_FAIL_PRECHECK(id)))
+		fired = __cfs_fail_check_set(id, value, set);
+
+	if (unlikely(fired)) {
+		if (!quiet) {
+			LCONSOLE_INFO("*** cfs_fail_loc=%x, val=%u***\n",
+				      id, value);
+		} else {
+			CDEBUG(D_INFO, "*** cfs_fail_loc=%x, val=%u***\n",
+			       id, value);
+		}
+	}
+
+	return fired;
+}
+
+/* If id hit cfs_fail_loc, return 1, otherwise return 0 */
+#define CFS_FAIL_CHECK(id) \
+	cfs_fail_check_set(id, 0, CFS_FAIL_LOC_NOSET, 0)
+#define CFS_FAIL_CHECK_QUIET(id) \
+	cfs_fail_check_set(id, 0, CFS_FAIL_LOC_NOSET, 1)
+
+/* If id hit cfs_fail_loc and cfs_fail_val == (-1 or value) return 1,
+ * otherwise return 0 */
+#define CFS_FAIL_CHECK_VALUE(id, value) \
+	cfs_fail_check_set(id, value, CFS_FAIL_LOC_VALUE, 0)
+#define CFS_FAIL_CHECK_VALUE_QUIET(id, value) \
+	cfs_fail_check_set(id, value, CFS_FAIL_LOC_VALUE, 1)
+
+/* If id hit cfs_fail_loc, cfs_fail_loc |= value and return 1,
+ * otherwise return 0 */
+#define CFS_FAIL_CHECK_ORSET(id, value) \
+	cfs_fail_check_set(id, value, CFS_FAIL_LOC_ORSET, 0)
+#define CFS_FAIL_CHECK_ORSET_QUIET(id, value) \
+	cfs_fail_check_set(id, value, CFS_FAIL_LOC_ORSET, 1)
+
+/* If id hit cfs_fail_loc, cfs_fail_loc = value and return 1,
+ * otherwise return 0 */
+#define CFS_FAIL_CHECK_RESET(id, value) \
+	cfs_fail_check_set(id, value, CFS_FAIL_LOC_RESET, 0)
+#define CFS_FAIL_CHECK_RESET_QUIET(id, value) \
+	cfs_fail_check_set(id, value, CFS_FAIL_LOC_RESET, 1)
+
+/*
+ * Forward to __cfs_fail_timeout_set() when \a id passes
+ * CFS_FAIL_PRECHECK(); otherwise return 0 without calling it.
+ */
+static inline int cfs_fail_timeout_set(__u32 id, __u32 value, int ms, int set)
+{
+	if (!unlikely(CFS_FAIL_PRECHECK(id)))
+		return 0;
+
+	return __cfs_fail_timeout_set(id, value, ms, set);
+}
+
+/* If id hit cfs_fail_loc, sleep for seconds or milliseconds.
+ * The seconds argument is parenthesized before scaling to milliseconds so
+ * that expressions such as (a + b) are converted as a whole. */
+#define CFS_FAIL_TIMEOUT(id, secs) \
+	cfs_fail_timeout_set(id, 0, (secs) * 1000, CFS_FAIL_LOC_NOSET)
+
+#define CFS_FAIL_TIMEOUT_MS(id, ms) \
+	cfs_fail_timeout_set(id, 0, ms, CFS_FAIL_LOC_NOSET)
+
+/* If id hit cfs_fail_loc, cfs_fail_loc |= value and
+ * sleep seconds or milliseconds */
+#define CFS_FAIL_TIMEOUT_ORSET(id, value, secs) \
+	cfs_fail_timeout_set(id, value, (secs) * 1000, CFS_FAIL_LOC_ORSET)
+
+#define CFS_FAIL_TIMEOUT_MS_ORSET(id, value, ms) \
+	cfs_fail_timeout_set(id, value, ms, CFS_FAIL_LOC_ORSET)
+
+/* The idea here is to synchronise two threads to force a race. The
+ * first thread that calls this with a matching fail_loc is put to
+ * sleep. The next thread that calls with the same fail_loc wakes up
+ * the first and continues.
+ *
+ * Nothing happens unless \a id passes CFS_FAIL_PRECHECK().  Which role a
+ * caller takes is decided by the return value of __cfs_fail_check_set():
+ * non-zero means "sleep", zero means "wake". */
+static inline void cfs_race(__u32 id)
+{
+
+	if (CFS_FAIL_PRECHECK(id)) {
+		if (unlikely(__cfs_fail_check_set(id, 0, CFS_FAIL_LOC_NOSET))) {
+			/* NOTE(review): rc is presumably assigned by
+			 * cfs_wait_event_interruptible() - confirm, since it
+			 * is printed below without any other initialization */
+			int rc;
+			/* reset the shared flag, then wait for it to change */
+			cfs_race_state = 0;
+			CERROR("cfs_race id %x sleeping\n", id);
+			cfs_wait_event_interruptible(cfs_race_waitq,
+						     cfs_race_state != 0, rc);
+			CERROR("cfs_fail_race id %x awake, rc=%d\n", id, rc);
+		} else {
+			CERROR("cfs_fail_race id %x waking\n", id);
+			/* set the flag the sleeper waits on, then wake it */
+			cfs_race_state = 1;
+			wake_up(&cfs_race_waitq);
+		}
+	}
+}
+#define CFS_RACE(id) cfs_race(id)
+
+#endif /* _LIBCFS_FAIL_H */

diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_hash.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_hash.h
new file mode 100644
index 0000000..f6361b3
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/libcfs/libcfs_hash.h

@@ -0,0 +1,851 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/libcfs_hash.h
+ *
+ * Hashing routines
+ *
+ */
+
+#ifndef __LIBCFS_HASH_H__
+#define __LIBCFS_HASH_H__
+/*
+ * Knuth recommends primes in approximately golden ratio to the maximum
+ * integer representable by a machine word for multiplicative hashing.
+ * Chuck Lever verified the effectiveness of this technique:
+ * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf
+ *
+ * These primes are chosen to be bit-sparse, that is operations on
+ * them can use shifts and additions instead of multiplications for
+ * machines where multiplications are slow.
+ */
+/* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */
+#define CFS_GOLDEN_RATIO_PRIME_32 0x9e370001UL
+/*  2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */
+#define CFS_GOLDEN_RATIO_PRIME_64 0x9e37fffffffc0001ULL
+
+/*
+ * Ideally we would use HAVE_HASH_LONG for this, but on linux we configure
+ * the linux kernel and user space at the same time, so we need to differentiate
+ * between them explicitly. If this is not needed on other architectures, then
+ * we'll need to move the functions to architecture-specific headers.
+ */
+
+#include <linux/hash.h>
+
+#define cfs_hash_long(val, bits)    hash_long(val, bits)
+
+/** disable debug */
+#define CFS_HASH_DEBUG_NONE	 0
+/** record hash depth and output to console when it's too deep,
+ *  computing overhead is low but consume more memory */
+#define CFS_HASH_DEBUG_1	    1
+/** expensive, check key validation */
+#define CFS_HASH_DEBUG_2	    2
+
+#define CFS_HASH_DEBUG_LEVEL	CFS_HASH_DEBUG_NONE
+
+struct cfs_hash_ops;
+struct cfs_hash_lock_ops;
+struct cfs_hash_hlist_ops;
+
+/**
+ * Storage for a single lock that is either a rwlock or a spinlock.
+ * The two members overlap, so only one flavour may be used for a given
+ * lock.  The helpers in this header touch \a spin only after asserting
+ * CFS_HASH_NO_BKTLOCK (table lock, cfs_hash_dec_and_lock()) or
+ * CFS_HASH_SPIN_BKTLOCK (bucket lock, cfs_hash_bd_dec_and_lock()).
+ */
+typedef union {
+	rwlock_t		rw;		/**< rwlock */
+	spinlock_t		spin;		/**< spinlock */
+} cfs_hash_lock_t;
+
+/**
+ * cfs_hash_bucket is a container of:
+ * - lock, counter ...
+ * - array of hash-head starting from hsb_head[0], hash-head can be one of
+ *   . cfs_hash_head_t
+ *   . cfs_hash_head_dep_t
+ *   . cfs_hash_dhead_t
+ *   . cfs_hash_dhead_dep_t
+ *   which depends on requirement of user
+ * - some extra bytes (caller can require it while creating hash)
+ *
+ * The hash-head array and the extra bytes are not covered by
+ * sizeof(cfs_hash_bucket_t); cfs_hash_bkt_size() gives the full size of
+ * one bucket and cfs_hash_bd_extra_get() locates the extra bytes.
+ */
+typedef struct cfs_hash_bucket {
+	cfs_hash_lock_t		hsb_lock;	/**< bucket lock */
+	__u32			hsb_count;	/**< current entries */
+	__u32			hsb_version;	/**< change version */
+	unsigned int		hsb_index;	/**< index of bucket */
+	int			hsb_depmax;	/**< max depth on bucket */
+	long			hsb_head[0];	/**< hash-head array */
+} cfs_hash_bucket_t;
+
+/**
+ * cfs_hash bucket descriptor, it's normally in stack of caller
+ *
+ * Identifies one hlist of a table: \a bd_bucket is the bucket and
+ * \a bd_offset the position of the hlist inside that bucket.  The flat
+ * hlist index is bd_offset | (hsb_index << hs_bkt_bits), see
+ * cfs_hash_bd_index_get() / cfs_hash_bd_index_set().
+ */
+typedef struct cfs_hash_bd {
+	cfs_hash_bucket_t	  *bd_bucket;      /**< address of bucket */
+	unsigned int		bd_offset;      /**< offset in bucket */
+} cfs_hash_bd_t;
+
+#define CFS_HASH_NAME_LEN	   16      /**< default name length */
+#define CFS_HASH_BIGNAME_LEN	64      /**< bigname for param tree */
+
+#define CFS_HASH_BKT_BITS	   3       /**< default bits of bucket */
+#define CFS_HASH_BITS_MAX	   30      /**< max bits of bucket */
+#define CFS_HASH_BITS_MIN	   CFS_HASH_BKT_BITS
+
+/**
+ * common hash attributes.
+ */
+enum cfs_hash_tag {
+	/**
+	 * don't need any lock, caller will protect operations with its
+	 * own lock. With this flag:
+	 *  . CFS_HASH_NO_BKTLOCK, CFS_HASH_RW_BKTLOCK, CFS_HASH_SPIN_BKTLOCK
+	 *    will be ignored.
+	 *  . Some functions will be disabled with this flag, i.e:
+	 *    cfs_hash_for_each_empty, cfs_hash_rehash
+	 */
+	CFS_HASH_NO_LOCK	= 1 << 0,
+	/** no bucket lock, use one spinlock to protect the whole hash */
+	CFS_HASH_NO_BKTLOCK     = 1 << 1,
+	/** rwlock to protect bucket */
+	CFS_HASH_RW_BKTLOCK     = 1 << 2,
+	/** spinlock to protect bucket */
+	CFS_HASH_SPIN_BKTLOCK   = 1 << 3,
+	/** always add new item to tail */
+	CFS_HASH_ADD_TAIL       = 1 << 4,
+	/** hash-table doesn't have refcount on item */
+	CFS_HASH_NO_ITEMREF     = 1 << 5,
+	/** big name for param-tree */
+	CFS_HASH_BIGNAME	= 1 << 6,
+	/** track global count */
+	CFS_HASH_COUNTER	= 1 << 7,
+	/** rehash item by new key */
+	CFS_HASH_REHASH_KEY     = 1 << 8,
+	/** Enable dynamic hash resizing */
+	CFS_HASH_REHASH	 = 1 << 9,
+	/** can shrink hash-size */
+	CFS_HASH_SHRINK	 = 1 << 10,
+	/** assert hash is empty on exit */
+	CFS_HASH_ASSERT_EMPTY   = 1 << 11,
+	/** record hlist depth */
+	CFS_HASH_DEPTH	  = 1 << 12,
+	/**
+	 * rehash is always scheduled in a different thread, so current
+	 * change on hash table is non-blocking
+	 */
+	CFS_HASH_NBLK_CHANGE    = 1 << 13,
+	/** NB, we typed hs_flags as  __u16, please change it
+	 * if you need to extend >=16 flags */
+};
+
+/** most used attributes */
+#define CFS_HASH_DEFAULT       (CFS_HASH_RW_BKTLOCK | \
+				CFS_HASH_COUNTER | CFS_HASH_REHASH)
+
+/**
+ * cfs_hash is a hash-table implementation for general purpose, it can support:
+ *    . two refcount modes
+ *      hash-table with & without refcount
+ *    . four lock modes
+ *      nolock, one-spinlock, rw-bucket-lock, spin-bucket-lock
+ *    . general operations
+ *      lookup, add(add_tail or add_head), delete
+ *    . rehash
+ *      grows or shrink
+ *    . iteration
+ *      locked iteration and unlocked iteration
+ *    . bigname
+ *      support long name hash
+ *    . debug
+ *      trace max searching depth
+ *
+ * Rehash:
+ * When the htable grows or shrinks, a separate task (cfs_hash_rehash_worker)
+ * is spawned to handle the rehash in the background, it's possible that other
+ * processes can concurrently perform additions, deletions, and lookups
+ * without being blocked on rehash completion, because rehash will release
+ * the global wrlock for each bucket.
+ *
+ * rehash and iteration can't run at the same time because it's too tricky
+ * to keep both of them safe and correct.
+ * As they are relatively rare operations:
+ *   . if iteration is in progress while we try to launch rehash, then
+ *     it just gives up; the iterator will launch rehash at the end.
+ *   . if rehash is in progress while we try to iterate the hash table,
+ *     then we just wait (it shouldn't take very long); anyway, nobody
+ *     should expect iteration of the whole hash-table to be non-blocking.
+ *
+ * During rehashing, a (key,object) pair may be in one of two buckets,
+ * depending on whether the worker task has yet to transfer the object
+ * to its new location in the table. Lookups and deletions need to search both
+ * locations; additions must take care to only insert into the new bucket.
+ */
+
+/**
+ * The hash table object.  The table name is stored inline in the trailing
+ * zero-length \a hs_name array, so the name bytes come on top of
+ * sizeof(cfs_hash_t).
+ */
+typedef struct cfs_hash {
+	/** serialize with rehash, or serialize all operations if
+	 * the hash-table has CFS_HASH_NO_BKTLOCK */
+	cfs_hash_lock_t	     hs_lock;
+	/** hash operations */
+	struct cfs_hash_ops	*hs_ops;
+	/** hash lock operations */
+	struct cfs_hash_lock_ops   *hs_lops;
+	/** hash list operations */
+	struct cfs_hash_hlist_ops  *hs_hops;
+	/** hash buckets-table */
+	cfs_hash_bucket_t	 **hs_buckets;
+	/** total number of items on this hash-table */
+	atomic_t		hs_count;
+	/** hash flags, see cfs_hash_tag for detail */
+	__u16		       hs_flags;
+	/** # of extra-bytes for bucket, for user saving extended attributes */
+	__u16		       hs_extra_bytes;
+	/** wants to iterate (tested by cfs_hash_is_iterating()) */
+	__u8			hs_iterating;
+	/** hash-table is dying (tested by cfs_hash_is_exiting()) */
+	__u8			hs_exiting;
+	/** current hash bits, the table has 1 << hs_cur_bits hlists */
+	__u8			hs_cur_bits;
+	/** min hash bits */
+	__u8			hs_min_bits;
+	/** max hash bits */
+	__u8			hs_max_bits;
+	/** bits for rehash, nonzero means a rehash is launched
+	 * (see cfs_hash_is_rehashing()) */
+	__u8			hs_rehash_bits;
+	/** bits for each bucket, a bucket has 1 << hs_bkt_bits hlists */
+	__u8			hs_bkt_bits;
+	/** resize min threshold, stored by __cfs_hash_set_theta() */
+	__u16		       hs_min_theta;
+	/** resize max threshold, stored by __cfs_hash_set_theta() */
+	__u16		       hs_max_theta;
+	/** resize count */
+	__u32		       hs_rehash_count;
+	/** # of iterators (caller of cfs_hash_for_each_*) */
+	__u32		       hs_iterators;
+	/** rehash workitem */
+	cfs_workitem_t	      hs_rehash_wi;
+	/** refcount on this hash table */
+	atomic_t		hs_refcount;
+	/** rehash buckets-table */
+	cfs_hash_bucket_t	 **hs_rehash_buckets;
+#if CFS_HASH_DEBUG_LEVEL >= CFS_HASH_DEBUG_1
+	/** serialize debug members */
+	spinlock_t			hs_dep_lock;
+	/** max depth */
+	unsigned int		hs_dep_max;
+	/** id of the deepest bucket */
+	unsigned int		hs_dep_bkt;
+	/** offset in the deepest bucket */
+	unsigned int		hs_dep_off;
+	/** bits when we found the max depth */
+	unsigned int		hs_dep_bits;
+	/** workitem to output max depth */
+	cfs_workitem_t	      hs_dep_wi;
+#endif
+	/** name of htable */
+	char			hs_name[0];
+} cfs_hash_t;
+
+/**
+ * Locking callbacks of a hash table.  The table-level pair is invoked by
+ * cfs_hash_lock() / cfs_hash_unlock() on cfs_hash::hs_lock, the bucket-level
+ * pair by cfs_hash_bd_lock() / cfs_hash_bd_unlock() on the bucket's hsb_lock.
+ * The callers' \a excl argument is forwarded unchanged as \a exclusive.
+ */
+typedef struct cfs_hash_lock_ops {
+	/** lock the hash table */
+	void    (*hs_lock)(cfs_hash_lock_t *lock, int exclusive);
+	/** unlock the hash table */
+	void    (*hs_unlock)(cfs_hash_lock_t *lock, int exclusive);
+	/** lock the hash bucket */
+	void    (*hs_bkt_lock)(cfs_hash_lock_t *lock, int exclusive);
+	/** unlock the hash bucket */
+	void    (*hs_bkt_unlock)(cfs_hash_lock_t *lock, int exclusive);
+} cfs_hash_lock_ops_t;
+
+/**
+ * Callbacks that hide which hash-head flavour a table uses.
+ * cfs_hash_bd_hhead() wraps \a hop_hhead, and cfs_hash_bkt_size() multiplies
+ * \a hop_hhead_size by the number of hlists per bucket.
+ */
+typedef struct cfs_hash_hlist_ops {
+	/** return hlist_head of hash-head of @bd */
+	struct hlist_head *(*hop_hhead)(cfs_hash_t *hs, cfs_hash_bd_t *bd);
+	/** return hash-head size */
+	int (*hop_hhead_size)(cfs_hash_t *hs);
+	/** add @hnode to hash-head of @bd */
+	int (*hop_hnode_add)(cfs_hash_t *hs,
+			     cfs_hash_bd_t *bd, struct hlist_node *hnode);
+	/** remove @hnode from hash-head of @bd */
+	int (*hop_hnode_del)(cfs_hash_t *hs,
+			     cfs_hash_bd_t *bd, struct hlist_node *hnode);
+} cfs_hash_hlist_ops_t;
+
+/**
+ * User-supplied callbacks of a hash table, reached through CFS_HOP() and
+ * the cfs_hash_*() inline wrappers in this header.  The wrappers tolerate a
+ * NULL \a hs_keycpy and \a hs_exit, assert that \a hs_put / \a hs_put_locked
+ * are set when they are invoked, and call the remaining ones unchecked.
+ */
+typedef struct cfs_hash_ops {
+	/** return hashed value from @key */
+	unsigned (*hs_hash)(cfs_hash_t *hs, const void *key, unsigned mask);
+	/** return key address of @hnode */
+	void *   (*hs_key)(struct hlist_node *hnode);
+	/** copy key from @hnode to @key (may be NULL) */
+	void     (*hs_keycpy)(struct hlist_node *hnode, void *key);
+	/**
+	 *  compare @key with key of @hnode
+	 *  returns 1 on a match
+	 */
+	int      (*hs_keycmp)(const void *key, struct hlist_node *hnode);
+	/** return object address of @hnode, i.e: container_of(...hnode) */
+	void *   (*hs_object)(struct hlist_node *hnode);
+	/** get refcount of item, always called with holding bucket-lock */
+	void     (*hs_get)(cfs_hash_t *hs, struct hlist_node *hnode);
+	/** release refcount of item */
+	void     (*hs_put)(cfs_hash_t *hs, struct hlist_node *hnode);
+	/** release refcount of item, always called with holding bucket-lock */
+	void     (*hs_put_locked)(cfs_hash_t *hs, struct hlist_node *hnode);
+	/** it's called before removing of @hnode (may be NULL) */
+	void     (*hs_exit)(cfs_hash_t *hs, struct hlist_node *hnode);
+} cfs_hash_ops_t;
+
+/** total number of buckets in @hs */
+#define CFS_HASH_NBKT(hs)       \
+	(1U << ((hs)->hs_cur_bits - (hs)->hs_bkt_bits))
+
+/** total number of buckets in @hs while rehashing */
+#define CFS_HASH_RH_NBKT(hs)    \
+	(1U << ((hs)->hs_rehash_bits - (hs)->hs_bkt_bits))
+
+/** number of hlists in each bucket */
+#define CFS_HASH_BKT_NHLIST(hs) (1U << (hs)->hs_bkt_bits)
+
+/** total number of hlist in @hs */
+#define CFS_HASH_NHLIST(hs)     (1U << (hs)->hs_cur_bits)
+
+/** total number of hlist in @hs while rehashing */
+#define CFS_HASH_RH_NHLIST(hs)  (1U << (hs)->hs_rehash_bits)
+
+/* Returns 1 if CFS_HASH_NO_LOCK is set (the caller serializes all
+ * operations on this hash-table itself), 0 otherwise. */
+static inline int
+cfs_hash_with_no_lock(cfs_hash_t *hs)
+{
+	return !!(hs->hs_flags & CFS_HASH_NO_LOCK);
+}
+
+/* Returns 1 if CFS_HASH_NO_BKTLOCK is set (no bucket lock, one single
+ * lock protects the whole hash-table), 0 otherwise. */
+static inline int
+cfs_hash_with_no_bktlock(cfs_hash_t *hs)
+{
+	return !!(hs->hs_flags & CFS_HASH_NO_BKTLOCK);
+}
+
+/* Returns 1 if CFS_HASH_RW_BKTLOCK is set (buckets are protected by a
+ * rwlock), 0 otherwise. */
+static inline int
+cfs_hash_with_rw_bktlock(cfs_hash_t *hs)
+{
+	return !!(hs->hs_flags & CFS_HASH_RW_BKTLOCK);
+}
+
+/* Returns 1 if CFS_HASH_SPIN_BKTLOCK is set (buckets are protected by a
+ * spinlock), 0 otherwise. */
+static inline int
+cfs_hash_with_spin_bktlock(cfs_hash_t *hs)
+{
+	return !!(hs->hs_flags & CFS_HASH_SPIN_BKTLOCK);
+}
+
+/* Returns 1 if CFS_HASH_ADD_TAIL is set (new items always go to the
+ * tail), 0 otherwise. */
+static inline int
+cfs_hash_with_add_tail(cfs_hash_t *hs)
+{
+	return !!(hs->hs_flags & CFS_HASH_ADD_TAIL);
+}
+
+/* Returns 1 if CFS_HASH_NO_ITEMREF is set, 0 otherwise.  With this flag
+ * the hash-table keeps no refcount on an item, and an item can't be
+ * removed from the hash unless its refcount is ZERO. */
+static inline int
+cfs_hash_with_no_itemref(cfs_hash_t *hs)
+{
+	return !!(hs->hs_flags & CFS_HASH_NO_ITEMREF);
+}
+
+/* Returns 1 if CFS_HASH_BIGNAME is set (big name for param-tree),
+ * 0 otherwise. */
+static inline int
+cfs_hash_with_bigname(cfs_hash_t *hs)
+{
+	return !!(hs->hs_flags & CFS_HASH_BIGNAME);
+}
+
+/* Returns 1 if CFS_HASH_COUNTER is set (the global item count is
+ * tracked), 0 otherwise. */
+static inline int
+cfs_hash_with_counter(cfs_hash_t *hs)
+{
+	return !!(hs->hs_flags & CFS_HASH_COUNTER);
+}
+
+/* Returns 1 if CFS_HASH_REHASH is set (dynamic hash resizing enabled),
+ * 0 otherwise. */
+static inline int
+cfs_hash_with_rehash(cfs_hash_t *hs)
+{
+	return !!(hs->hs_flags & CFS_HASH_REHASH);
+}
+
+/* Returns 1 if CFS_HASH_REHASH_KEY is set (items may be rehashed by a
+ * new key), 0 otherwise. */
+static inline int
+cfs_hash_with_rehash_key(cfs_hash_t *hs)
+{
+	return !!(hs->hs_flags & CFS_HASH_REHASH_KEY);
+}
+
+/* Returns 1 if CFS_HASH_SHRINK is set (the hash size may shrink),
+ * 0 otherwise. */
+static inline int
+cfs_hash_with_shrink(cfs_hash_t *hs)
+{
+	return !!(hs->hs_flags & CFS_HASH_SHRINK);
+}
+
+/* Returns 1 if CFS_HASH_ASSERT_EMPTY is set (the hash is asserted to be
+ * empty on exit), 0 otherwise. */
+static inline int
+cfs_hash_with_assert_empty(cfs_hash_t *hs)
+{
+	return !!(hs->hs_flags & CFS_HASH_ASSERT_EMPTY);
+}
+
+/* Returns 1 if CFS_HASH_DEPTH is set (hlist depth is recorded),
+ * 0 otherwise. */
+static inline int
+cfs_hash_with_depth(cfs_hash_t *hs)
+{
+	return !!(hs->hs_flags & CFS_HASH_DEPTH);
+}
+
+/* Returns 1 if CFS_HASH_NBLK_CHANGE is set (rehash is always scheduled
+ * in a different thread, so changes are non-blocking), 0 otherwise. */
+static inline int
+cfs_hash_with_nblk_change(cfs_hash_t *hs)
+{
+	return !!(hs->hs_flags & CFS_HASH_NBLK_CHANGE);
+}
+
+/* Returns the raw hs_exiting marker (cfs_hash_destroy is called). */
+static inline int
+cfs_hash_is_exiting(cfs_hash_t *hs)
+{
+	int exiting = hs->hs_exiting;
+
+	return exiting;
+}
+
+/* Returns 1 while hs_rehash_bits is nonzero (rehash is launched),
+ * 0 otherwise. */
+static inline int
+cfs_hash_is_rehashing(cfs_hash_t *hs)
+{
+	return !!hs->hs_rehash_bits;
+}
+
+/* Returns 1 if someone is calling cfs_hash_for_each_*: either the
+ * hs_iterating marker is set or the iterator count is nonzero. */
+static inline int
+cfs_hash_is_iterating(cfs_hash_t *hs)
+{
+	if (hs->hs_iterating)
+		return 1;
+
+	return hs->hs_iterators != 0;
+}
+
+/* Size in bytes of one bucket of @hs: the fixed part up to hsb_head[0],
+ * then one hash-head per hlist of the bucket, then the extra bytes. */
+static inline int
+cfs_hash_bkt_size(cfs_hash_t *hs)
+{
+	unsigned int heads = hs->hs_hops->hop_hhead_size(hs) *
+			     CFS_HASH_BKT_NHLIST(hs);
+
+	return offsetof(cfs_hash_bucket_t, hsb_head[0]) + heads +
+	       hs->hs_extra_bytes;
+}
+
+/* Select a callback of @hs by short name: CFS_HOP(hs, put) pastes "put"
+ * onto the "hs_" prefix and yields (hs)->hs_ops->hs_put. */
+#define CFS_HOP(hs, op)	   (hs)->hs_ops->hs_ ## op
+
+/* Hash @key through the table's hs_hash callback, passing @mask along. */
+static inline unsigned
+cfs_hash_id(cfs_hash_t *hs, const void *key, unsigned mask)
+{
+	return hs->hs_ops->hs_hash(hs, key, mask);
+}
+
+/* Key address of @hnode as reported by the table's hs_key callback. */
+static inline void *
+cfs_hash_key(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+	return hs->hs_ops->hs_key(hnode);
+}
+
+/* Copy the key of @hnode into @key via hs_keycpy; a table without that
+ * callback makes this a no-op. */
+static inline void
+cfs_hash_keycpy(cfs_hash_t *hs, struct hlist_node *hnode, void *key)
+{
+	if (hs->hs_ops->hs_keycpy == NULL)
+		return;
+
+	hs->hs_ops->hs_keycpy(hnode, key);
+}
+
+/**
+ * Compare @key with the key of @hnode through the hs_keycmp callback.
+ * Returns 1 on a match,
+ */
+static inline int
+cfs_hash_keycmp(cfs_hash_t *hs, const void *key, struct hlist_node *hnode)
+{
+	return hs->hs_ops->hs_keycmp(key, hnode);
+}
+
+/* Object address of @hnode as reported by the hs_object callback. */
+static inline void *
+cfs_hash_object(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+	return hs->hs_ops->hs_object(hnode);
+}
+
+/* Take a reference on the item behind @hnode through the hs_get callback.
+ * The ops table notes that hs_get is always called with the bucket lock
+ * held.  Nothing is returned: the callback is void, so it is called as a
+ * plain statement rather than through "return expr;". */
+static inline void
+cfs_hash_get(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+	CFS_HOP(hs, get)(hs, hnode);
+}
+
+/* Release a reference on the item behind @hnode through hs_put_locked,
+ * which must be provided (asserted).  The ops table notes that this
+ * variant is always called with the bucket lock held.  The callback is
+ * void, so it is called as a plain statement rather than "return expr;". */
+static inline void
+cfs_hash_put_locked(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+	LASSERT(CFS_HOP(hs, put_locked) != NULL);
+
+	CFS_HOP(hs, put_locked)(hs, hnode);
+}
+
+/* Release a reference on the item behind @hnode through the hs_put
+ * callback, which must be provided (asserted).  The callback is void, so
+ * it is called as a plain statement rather than "return expr;". */
+static inline void
+cfs_hash_put(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+	LASSERT(CFS_HOP(hs, put) != NULL);
+
+	CFS_HOP(hs, put)(hs, hnode);
+}
+
+/* Run the hs_exit callback for @hnode; a table without that callback
+ * makes this a no-op. */
+static inline void
+cfs_hash_exit(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+	if (!hs->hs_ops->hs_exit)
+		return;
+
+	hs->hs_ops->hs_exit(hs, hnode);
+}
+
+/* Take the table-level lock of @hs through its lock ops, forwarding @excl. */
+static inline void cfs_hash_lock(cfs_hash_t *hs, int excl)
+{
+	struct cfs_hash_lock_ops *lops = hs->hs_lops;
+
+	lops->hs_lock(&hs->hs_lock, excl);
+}
+
+/* Drop the table-level lock of @hs through its lock ops, forwarding @excl. */
+static inline void cfs_hash_unlock(cfs_hash_t *hs, int excl)
+{
+	struct cfs_hash_lock_ops *lops = hs->hs_lops;
+
+	lops->hs_unlock(&hs->hs_lock, excl);
+}
+
+/* atomic_dec_and_lock() on @condition against the table-level spinlock.
+ * Asserts that @hs was created with CFS_HASH_NO_BKTLOCK. */
+static inline int cfs_hash_dec_and_lock(cfs_hash_t *hs,
+					atomic_t *condition)
+{
+	spinlock_t *lock = &hs->hs_lock.spin;
+
+	LASSERT(cfs_hash_with_no_bktlock(hs));
+	return atomic_dec_and_lock(condition, lock);
+}
+
+/* Lock the bucket referenced by @bd through the lock ops of @hs,
+ * forwarding @excl. */
+static inline void cfs_hash_bd_lock(cfs_hash_t *hs,
+				    cfs_hash_bd_t *bd, int excl)
+{
+	cfs_hash_lock_t *lock = &bd->bd_bucket->hsb_lock;
+
+	hs->hs_lops->hs_bkt_lock(lock, excl);
+}
+
+/* Unlock the bucket referenced by @bd through the lock ops of @hs,
+ * forwarding @excl. */
+static inline void cfs_hash_bd_unlock(cfs_hash_t *hs,
+				      cfs_hash_bd_t *bd, int excl)
+{
+	cfs_hash_lock_t *lock = &bd->bd_bucket->hsb_lock;
+
+	hs->hs_lops->hs_bkt_unlock(lock, excl);
+}
+
+/**
+ * operations on cfs_hash bucket (bd: bucket descriptor),
+ * they are normally for hash-table without rehash
+ */
+void cfs_hash_bd_get(cfs_hash_t *hs, const void *key, cfs_hash_bd_t *bd);
+
+/* Fill @bd for @key with cfs_hash_bd_get(), then lock the bucket it
+ * references, forwarding @excl to the bucket-lock callback. */
+static inline void cfs_hash_bd_get_and_lock(cfs_hash_t *hs, const void *key,
+					    cfs_hash_bd_t *bd, int excl)
+{
+	cfs_hash_bd_get(hs, key, bd);
+	hs->hs_lops->hs_bkt_lock(&bd->bd_bucket->hsb_lock, excl);
+}
+
+/* Flat hlist index described by @bd: the bucket index shifted up by
+ * hs_bkt_bits, OR-ed with the offset inside the bucket. */
+static inline unsigned cfs_hash_bd_index_get(cfs_hash_t *hs, cfs_hash_bd_t *bd)
+{
+	unsigned int bkt = bd->bd_bucket->hsb_index;
+
+	return (bkt << hs->hs_bkt_bits) | bd->bd_offset;
+}
+
+/* Point @bd at the hlist with flat index @index: the high bits pick the
+ * bucket from hs_buckets, the low hs_bkt_bits bits are the offset. */
+static inline void cfs_hash_bd_index_set(cfs_hash_t *hs,
+					 unsigned index, cfs_hash_bd_t *bd)
+{
+	unsigned int bkt = index >> hs->hs_bkt_bits;
+	unsigned int off = index & (CFS_HASH_BKT_NHLIST(hs) - 1U);
+
+	bd->bd_bucket = hs->hs_buckets[bkt];
+	bd->bd_offset = off;
+}
+
+/* Address of the extra bytes of the bucket referenced by @bd: they are
+ * the last hs_extra_bytes bytes of the cfs_hash_bkt_size() area. */
+static inline void *
+cfs_hash_bd_extra_get(cfs_hash_t *hs, cfs_hash_bd_t *bd)
+{
+	int skip = cfs_hash_bkt_size(hs) - hs->hs_extra_bytes;
+
+	return (char *)bd->bd_bucket + skip;
+}
+
+/* Change version of the bucket referenced by @bd.
+ * need hold cfs_hash_bd_lock */
+static inline __u32
+cfs_hash_bd_version_get(cfs_hash_bd_t *bd)
+{
+	cfs_hash_bucket_t *bkt = bd->bd_bucket;
+
+	return bkt->hsb_version;
+}
+
+/* Number of current entries of the bucket referenced by @bd.
+ * need hold cfs_hash_bd_lock */
+static inline __u32
+cfs_hash_bd_count_get(cfs_hash_bd_t *bd)
+{
+	cfs_hash_bucket_t *bkt = bd->bd_bucket;
+
+	return bkt->hsb_count;
+}
+
+/* Max depth recorded on the bucket referenced by @bd. */
+static inline int
+cfs_hash_bd_depmax_get(cfs_hash_bd_t *bd)
+{
+	cfs_hash_bucket_t *bkt = bd->bd_bucket;
+
+	return bkt->hsb_depmax;
+}
+
+/*
+ * Order two bucket descriptors: by bucket index first, then by offset
+ * inside the bucket.  Returns 0 when both fields match, otherwise the
+ * unsigned difference of the first differing field converted to int.
+ */
+static inline int
+cfs_hash_bd_compare(cfs_hash_bd_t *bd1, cfs_hash_bd_t *bd2)
+{
+	unsigned int idx1 = bd1->bd_bucket->hsb_index;
+	unsigned int idx2 = bd2->bd_bucket->hsb_index;
+
+	if (idx1 != idx2)
+		return idx1 - idx2;
+
+	/* equal offsets subtract to 0, no separate equality test needed */
+	return bd1->bd_offset - bd2->bd_offset;
+}
+
+void cfs_hash_bd_add_locked(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+			    struct hlist_node *hnode);
+void cfs_hash_bd_del_locked(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+			    struct hlist_node *hnode);
+void cfs_hash_bd_move_locked(cfs_hash_t *hs, cfs_hash_bd_t *bd_old,
+			     cfs_hash_bd_t *bd_new, struct hlist_node *hnode);
+
+/* atomic_dec_and_lock() on @condition against the spinlock of the bucket
+ * referenced by @bd.  Asserts that @hs uses CFS_HASH_SPIN_BKTLOCK. */
+static inline int cfs_hash_bd_dec_and_lock(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+					   atomic_t *condition)
+{
+	spinlock_t *lock = &bd->bd_bucket->hsb_lock.spin;
+
+	LASSERT(cfs_hash_with_spin_bktlock(hs));
+	return atomic_dec_and_lock(condition, lock);
+}
+
+/* hlist_head selected by @bd, as returned by the hop_hhead callback. */
+static inline struct hlist_head *cfs_hash_bd_hhead(cfs_hash_t *hs,
+						  cfs_hash_bd_t *bd)
+{
+	struct cfs_hash_hlist_ops *hops = hs->hs_hops;
+
+	return hops->hop_hhead(hs, bd);
+}
+
+struct hlist_node *cfs_hash_bd_lookup_locked(cfs_hash_t *hs,
+					    cfs_hash_bd_t *bd, const void *key);
+struct hlist_node *cfs_hash_bd_peek_locked(cfs_hash_t *hs,
+					  cfs_hash_bd_t *bd, const void *key);
+struct hlist_node *cfs_hash_bd_findadd_locked(cfs_hash_t *hs,
+					     cfs_hash_bd_t *bd, const void *key,
+					     struct hlist_node *hnode,
+					     int insist_add);
+struct hlist_node *cfs_hash_bd_finddel_locked(cfs_hash_t *hs,
+					     cfs_hash_bd_t *bd, const void *key,
+					     struct hlist_node *hnode);
+
+/**
+ * operations on cfs_hash bucket (bd: bucket descriptor),
+ * they are safe for hash-table with rehash
+ */
+void cfs_hash_dual_bd_get(cfs_hash_t *hs, const void *key, cfs_hash_bd_t *bds);
+void cfs_hash_dual_bd_lock(cfs_hash_t *hs, cfs_hash_bd_t *bds, int excl);
+void cfs_hash_dual_bd_unlock(cfs_hash_t *hs, cfs_hash_bd_t *bds, int excl);
+
+/* Convenience wrapper: fill @bds for @key with cfs_hash_dual_bd_get(),
+ * then lock them with cfs_hash_dual_bd_lock(), forwarding @excl.
+ * cfs_hash_bucket_validate() in this header passes a two-element array as
+ * @bds to cfs_hash_dual_bd_get(). */
+static inline void cfs_hash_dual_bd_get_and_lock(cfs_hash_t *hs, const void *key,
+						 cfs_hash_bd_t *bds, int excl)
+{
+	cfs_hash_dual_bd_get(hs, key, bds);
+	cfs_hash_dual_bd_lock(hs, bds, excl);
+}
+
+struct hlist_node *cfs_hash_dual_bd_lookup_locked(cfs_hash_t *hs,
+						 cfs_hash_bd_t *bds,
+						 const void *key);
+struct hlist_node *cfs_hash_dual_bd_findadd_locked(cfs_hash_t *hs,
+						  cfs_hash_bd_t *bds,
+						  const void *key,
+						  struct hlist_node *hnode,
+						  int insist_add);
+struct hlist_node *cfs_hash_dual_bd_finddel_locked(cfs_hash_t *hs,
+						  cfs_hash_bd_t *bds,
+						  const void *key,
+						  struct hlist_node *hnode);
+
+/* Hash init/cleanup functions */
+cfs_hash_t *cfs_hash_create(char *name, unsigned cur_bits, unsigned max_bits,
+			    unsigned bkt_bits, unsigned extra_bytes,
+			    unsigned min_theta, unsigned max_theta,
+			    cfs_hash_ops_t *ops, unsigned flags);
+
+cfs_hash_t *cfs_hash_getref(cfs_hash_t *hs);
+void cfs_hash_putref(cfs_hash_t *hs);
+
+/* Hash addition functions */
+void cfs_hash_add(cfs_hash_t *hs, const void *key,
+		  struct hlist_node *hnode);
+int cfs_hash_add_unique(cfs_hash_t *hs, const void *key,
+			struct hlist_node *hnode);
+void *cfs_hash_findadd_unique(cfs_hash_t *hs, const void *key,
+			      struct hlist_node *hnode);
+
+/* Hash deletion functions */
+void *cfs_hash_del(cfs_hash_t *hs, const void *key, struct hlist_node *hnode);
+void *cfs_hash_del_key(cfs_hash_t *hs, const void *key);
+
+/* Hash lookup/for_each functions */
+#define CFS_HASH_LOOP_HOG       1024
+
+typedef int (*cfs_hash_for_each_cb_t)(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+				      struct hlist_node *node, void *data);
+void *cfs_hash_lookup(cfs_hash_t *hs, const void *key);
+void cfs_hash_for_each(cfs_hash_t *hs, cfs_hash_for_each_cb_t, void *data);
+void cfs_hash_for_each_safe(cfs_hash_t *hs, cfs_hash_for_each_cb_t, void *data);
+int  cfs_hash_for_each_nolock(cfs_hash_t *hs,
+			      cfs_hash_for_each_cb_t, void *data);
+int  cfs_hash_for_each_empty(cfs_hash_t *hs,
+			     cfs_hash_for_each_cb_t, void *data);
+void cfs_hash_for_each_key(cfs_hash_t *hs, const void *key,
+			   cfs_hash_for_each_cb_t, void *data);
+typedef int (*cfs_hash_cond_opt_cb_t)(void *obj, void *data);
+void cfs_hash_cond_del(cfs_hash_t *hs, cfs_hash_cond_opt_cb_t, void *data);
+
+void cfs_hash_hlist_for_each(cfs_hash_t *hs, unsigned hindex,
+			     cfs_hash_for_each_cb_t, void *data);
+int  cfs_hash_is_empty(cfs_hash_t *hs);
+__u64 cfs_hash_size_get(cfs_hash_t *hs);
+
+/*
+ * Rehash - Theta is calculated to be the average chained
+ * hash depth assuming a perfectly uniform hash function.
+ */
+void cfs_hash_rehash_cancel_locked(cfs_hash_t *hs);
+void cfs_hash_rehash_cancel(cfs_hash_t *hs);
+int  cfs_hash_rehash(cfs_hash_t *hs, int do_rehash);
+void cfs_hash_rehash_key(cfs_hash_t *hs, const void *old_key,
+			 void *new_key, struct hlist_node *hnode);
+
+/*
+ * Consistency checks.  The real implementations are compiled only when
+ * CFS_HASH_DEBUG_LEVEL is above CFS_HASH_DEBUG_1 (i.e. CFS_HASH_DEBUG_2);
+ * otherwise both helpers are empty inline functions.
+ */
+#if CFS_HASH_DEBUG_LEVEL > CFS_HASH_DEBUG_1
+/* Validate hnode references the correct key */
+static inline void
+cfs_hash_key_validate(cfs_hash_t *hs, const void *key,
+		      struct hlist_node *hnode)
+{
+	LASSERT(cfs_hash_keycmp(hs, key, hnode));
+}
+
+/* Validate hnode is in the correct bucket */
+static inline void
+cfs_hash_bucket_validate(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+			 struct hlist_node *hnode)
+{
+	cfs_hash_bd_t   bds[2];
+
+	/* look up the descriptors for the node's own key; the node passes
+	 * if either of them refers to the bucket of @bd */
+	cfs_hash_dual_bd_get(hs, cfs_hash_key(hs, hnode), bds);
+	LASSERT(bds[0].bd_bucket == bd->bd_bucket ||
+		bds[1].bd_bucket == bd->bd_bucket);
+}
+
+#else /* CFS_HASH_DEBUG_LEVEL > CFS_HASH_DEBUG_1 */
+
+/* no-op stub when key validation is compiled out */
+static inline void
+cfs_hash_key_validate(cfs_hash_t *hs, const void *key,
+		      struct hlist_node *hnode) {}
+
+/* no-op stub when bucket validation is compiled out */
+static inline void
+cfs_hash_bucket_validate(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+			 struct hlist_node *hnode) {}
+
+#endif /* CFS_HASH_DEBUG_LEVEL */
+
+#define CFS_HASH_THETA_BITS  10
+#define CFS_HASH_MIN_THETA  (1U << (CFS_HASH_THETA_BITS - 1))
+#define CFS_HASH_MAX_THETA  (1U << (CFS_HASH_THETA_BITS + 1))
+
+/* Integer component of the fixed-point value @theta, i.e. @theta with its
+ * low CFS_HASH_THETA_BITS fraction bits shifted out. */
+static inline int __cfs_hash_theta_int(int theta)
+{
+	int whole = theta >> CFS_HASH_THETA_BITS;
+
+	return whole;
+}
+
+/* Fractional component of the fixed-point value @theta expressed in
+ * thousandths (a value between 0 and 999). */
+static inline int __cfs_hash_theta_frac(int theta)
+{
+	int thousandths = (theta * 1000) >> CFS_HASH_THETA_BITS;
+	int whole = __cfs_hash_theta_int(theta) * 1000;
+
+	return thousandths - whole;
+}
+
+/*
+ * Current theta of @hs in fixed point: the item count scaled by
+ * 1 << CFS_HASH_THETA_BITS and divided by the number of hlists
+ * (1 << hs_cur_bits).
+ *
+ * The count is widened to 64 bits before scaling.  Shifting it left as a
+ * plain int overflows once the table holds 2^(31 - CFS_HASH_THETA_BITS)
+ * items, which would corrupt the value used for resize decisions.
+ * Assumes hs_count is never negative.
+ */
+static inline int __cfs_hash_theta(cfs_hash_t *hs)
+{
+	__u64 count = atomic_read(&hs->hs_count);
+
+	return (int)((count << CFS_HASH_THETA_BITS) >> hs->hs_cur_bits);
+}
+
+/* Store the resize thresholds of @hs.  @min must be below @max (asserted);
+ * both are truncated to __u16 on assignment. */
+static inline void __cfs_hash_set_theta(cfs_hash_t *hs, int min, int max)
+{
+	LASSERT(min < max);
+
+	hs->hs_max_theta = (__u16)max;
+	hs->hs_min_theta = (__u16)min;
+}
+
+/* Generic debug formatting routines mainly for proc handler */
+struct seq_file;
+int cfs_hash_debug_header(struct seq_file *m);
+int cfs_hash_debug_str(cfs_hash_t *hs, struct seq_file *m);
+
+/*
+ * Generic djb2 hash algorithm for character arrays.
+ *
+ * Starts from 5381 and folds in @size bytes of @key as hash * 33 + byte,
+ * then masks the result with @mask.  @key must not be NULL (asserted).
+ * Bytes are read as plain char, so how values >= 0x80 contribute follows
+ * the platform's char signedness.
+ */
+static inline unsigned
+cfs_hash_djb2_hash(const void *key, size_t size, unsigned mask)
+{
+	const char *bytes = key;
+	unsigned hash = 5381;
+	unsigned i;
+
+	LASSERT(key != NULL);
+
+	for (i = 0; i < size; i++)
+		hash = hash * 33 + bytes[i];
+
+	return hash & mask;
+}
+
+/*
+ * Generic u32 hash algorithm: multiply @key by CFS_GOLDEN_RATIO_PRIME_32
+ * and mask the product with @mask.
+ */
+static inline unsigned
+cfs_hash_u32_hash(const __u32 key, unsigned mask)
+{
+	unsigned long product = key * CFS_GOLDEN_RATIO_PRIME_32;
+
+	return product & mask;
+}
+
+/*
+ * Generic u64 hash algorithm: multiply @key by CFS_GOLDEN_RATIO_PRIME_64,
+ * truncate the product to unsigned and mask it with @mask.
+ */
+static inline unsigned
+cfs_hash_u64_hash(const __u64 key, unsigned mask)
+{
+	__u64 product = key * CFS_GOLDEN_RATIO_PRIME_64;
+
+	return (unsigned)product & mask;
+}
+
+/** iterate over all buckets in @bds (array of cfs_hash_bd_t)
+ *
+ * @i is the caller's loop variable and @n the number of array entries.
+ * The loop also ends at the first entry whose bd_bucket is NULL.
+ * @n and @i are parenthesized so expression arguments expand safely. */
+#define cfs_hash_for_each_bd(bds, n, i) \
+	for ((i) = 0; (i) < (n) && (bds)[i].bd_bucket != NULL; (i)++)
+
+/** iterate over all buckets of @hs
+ *
+ * @pos is the caller's loop variable; on each pass (bd)->bd_bucket is set
+ * to hs_buckets[pos].  The loop also ends at the first NULL bucket.
+ * @pos is parenthesized so expression arguments expand safely. */
+#define cfs_hash_for_each_bucket(hs, bd, pos)		   \
+	for ((pos) = 0;					   \
+	     (pos) < CFS_HASH_NBKT(hs) &&			 \
+	     ((bd)->bd_bucket = (hs)->hs_buckets[pos]) != NULL; (pos)++)
+
+/** iterate over all hlist of bucket @bd
+ *
+ * Uses (bd)->bd_offset as the loop counter and stores the head returned by
+ * cfs_hash_bd_hhead() in @hlist on each pass; the loop also ends if that
+ * head is NULL.  @hlist is parenthesized so expression arguments expand
+ * safely. */
+#define cfs_hash_bd_for_each_hlist(hs, bd, hlist)	       \
+	for ((bd)->bd_offset = 0;			       \
+	     (bd)->bd_offset < CFS_HASH_BKT_NHLIST(hs) &&       \
+	     ((hlist) = cfs_hash_bd_hhead(hs, bd)) != NULL;       \
+	     (bd)->bd_offset++)
+
+/* __LIBCFS_HASH_H__ */
+#endif

diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_heap.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_heap.h
new file mode 100644
index 0000000..bfa6d7b
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/libcfs/libcfs_heap.h

@@ -0,0 +1,200 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License version 2 for more details.  A copy is
+ * included in the COPYING file that accompanied this code.
+
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2011 Intel Corporation
+ */
+/*
+ * libcfs/include/libcfs/heap.h
+ *
+ * Author: Eric Barton	<eeb@whamcloud.com>
+ *	   Liang Zhen	<liang@whamcloud.com>
+ */
+
+#ifndef __LIBCFS_HEAP_H__
+#define __LIBCFS_HEAP_H__
+
+/** \defgroup heap Binary heap
+ *
+ * The binary heap is a scalable data structure created using a binary tree. It
+ * is capable of maintaining large sets of elements sorted usually by one or
+ * more element properties, but really based on anything that can be used as a
+ * binary predicate in order to determine the relevant ordering of any two nodes
+ * that belong to the set. There is no search operation, rather the intention is
+ * for the element of the lowest priority which will always be at the root of
+ * the tree (as this is an implementation of a min-heap) to be removed by users
+ * for consumption.
+ *
+ * Users of the heap should embed a \e cfs_binheap_node_t object instance on
+ * every object of the set that they wish the binary heap instance to handle,
+ * and (at a minimum) provide a cfs_binheap_ops_t::hop_compare() implementation
+ * which is used by the heap as the binary predicate during its internal sorting
+ * operations.
+ *
+ * The current implementation enforces no locking scheme, and so assumes the
+ * user caters for locking between calls to insert, delete and lookup
+ * operations. Since the only consumer for the data structure at this point
+ * are NRS policies, and these operate on a per-CPT basis, binary heap instances
+ * are tied to a specific CPT.
+ * @{
+ */
+
+/**
+ * Binary heap node.
+ *
+ * Objects of this type are embedded into objects of the ordered set that is to
+ * be maintained by a \e cfs_binheap_t instance.
+ */
+typedef struct {
+	/** Index into the binary tree */
+	unsigned int	chn_index;
+} cfs_binheap_node_t;
+
+#define CBH_SHIFT	9
+#define CBH_SIZE       (1 << CBH_SHIFT)		    /* # ptrs per level */
+#define CBH_MASK       (CBH_SIZE - 1)
+#define CBH_NOB	(CBH_SIZE * sizeof(cfs_binheap_node_t *))
+
+#define CBH_POISON	0xdeadbeef
+
+/**
+ * Binary heap flags.
+ */
+enum {
+	CBH_FLAG_ATOMIC_GROW	= 1,
+};
+
+struct cfs_binheap;
+
+/**
+ * Binary heap operations.
+ */
+typedef struct {
+	/**
+	 * Called right before inserting a node into the binary heap.
+	 *
+	 * Implementing this operation is optional.
+	 *
+	 * \param[in] h The heap
+	 * \param[in] e The node
+	 *
+	 * \retval 0 success
+	 * \retval != 0 error
+	 */
+	int		(*hop_enter)(struct cfs_binheap *h,
+				     cfs_binheap_node_t *e);
+	/**
+	 * Called right after removing a node from the binary heap.
+	 *
+	 * Implementing this operation is optional.
+	 *
+	 * \param[in] h The heap
+	 * \param[in] e The node
+	 */
+	void		(*hop_exit)(struct cfs_binheap *h,
+				    cfs_binheap_node_t *e);
+	/**
+	 * A binary predicate which is called during internal heap sorting
+	 * operations, and used in order to determine the relevant ordering of
+	 * two heap nodes.
+	 *
+	 * Implementing this operation is mandatory.
+	 *
+	 * \param[in] a The first heap node
+	 * \param[in] b The second heap node
+	 *
+	 * \retval 0 Node a > node b
+	 * \retval 1 Node a < node b
+	 *
+	 * \see cfs_binheap_bubble()
+	 * \see cfs_binheap_sink()
+	 */
+	int		(*hop_compare)(cfs_binheap_node_t *a,
+				       cfs_binheap_node_t *b);
+} cfs_binheap_ops_t;
+
+/**
+ * Binary heap object.
+ *
+ * Sorts elements of type \e cfs_binheap_node_t
+ */
+typedef struct cfs_binheap {
+	/** Triple indirect */
+	cfs_binheap_node_t  ****cbh_elements3;
+	/** double indirect */
+	cfs_binheap_node_t   ***cbh_elements2;
+	/** single indirect */
+	cfs_binheap_node_t    **cbh_elements1;
+	/** # elements referenced */
+	unsigned int		cbh_nelements;
+	/** high water mark */
+	unsigned int		cbh_hwm;
+	/** user flags */
+	unsigned int		cbh_flags;
+	/** operations table */
+	cfs_binheap_ops_t      *cbh_ops;
+	/** private data */
+	void		       *cbh_private;
+	/** associated CPT table */
+	struct cfs_cpt_table   *cbh_cptab;
+	/** associated CPT id of this cfs_binheap_t::cbh_cptab */
+	int			cbh_cptid;
+} cfs_binheap_t;
+
+void cfs_binheap_destroy(cfs_binheap_t *h);
+cfs_binheap_t *cfs_binheap_create(cfs_binheap_ops_t *ops, unsigned int flags,
+				  unsigned count, void *arg,
+				  struct cfs_cpt_table *cptab, int cptid);
+cfs_binheap_node_t *cfs_binheap_find(cfs_binheap_t *h, unsigned int idx);
+int cfs_binheap_insert(cfs_binheap_t *h, cfs_binheap_node_t *e);
+void cfs_binheap_remove(cfs_binheap_t *h, cfs_binheap_node_t *e);
+
+/** Number of elements currently referenced by heap \a h
+ * (cbh_nelements, an unsigned int, returned as int). */
+static inline int
+cfs_binheap_size(cfs_binheap_t *h)
+{
+	return h->cbh_nelements;
+}
+
+/** Returns 1 when heap \a h references no elements, 0 otherwise. */
+static inline int
+cfs_binheap_is_empty(cfs_binheap_t *h)
+{
+	return !h->cbh_nelements;
+}
+
+/** Node at index 0 of heap \a h, as returned by cfs_binheap_find().
+ * cfs_binheap_remove_root() treats a NULL result as "nothing to remove". */
+static inline cfs_binheap_node_t *
+cfs_binheap_root(cfs_binheap_t *h)
+{
+	return cfs_binheap_find(h, 0);
+}
+
+/** Look up the node at index 0 of heap \a h, remove it from the heap when
+ * one was found, and return it (NULL if the lookup found nothing). */
+static inline cfs_binheap_node_t *
+cfs_binheap_remove_root(cfs_binheap_t *h)
+{
+	cfs_binheap_node_t *root = cfs_binheap_find(h, 0);
+
+	if (root == NULL)
+		return NULL;
+
+	cfs_binheap_remove(h, root);
+	return root;
+}
+
+/** @} heap */
+
+#endif /* __LIBCFS_HEAP_H__ */

diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_ioctl.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_ioctl.h
new file mode 100644
index 0000000..5be3679
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/libcfs/libcfs_ioctl.h

@@ -0,0 +1,222 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/libcfs_ioctl.h
+ *
+ * Low-level ioctl data structures. Kernel ioctl functions declared here,
+ * and user space functions are in libcfsutil_ioctl.h.
+ *
+ */
+
+#ifndef __LIBCFS_IOCTL_H__
+#define __LIBCFS_IOCTL_H__
+
+
+#define LIBCFS_IOCTL_VERSION 0x0001000a
+
+/*
+ * ioctl request block.
+ *
+ * ioc_len must equal libcfs_ioctl_packlen(), i.e. sizeof(this struct) plus
+ * both inline buffer lengths after cfs_size_round() padding;
+ * libcfs_ioctl_is_invalid() rejects anything else.  LIBCFS_IOC_INIT() fills
+ * ioc_version with LIBCFS_IOCTL_VERSION and ioc_len with the struct size.
+ */
+struct libcfs_ioctl_data {
+	__u32 ioc_len;
+	__u32 ioc_version;
+
+	__u64 ioc_nid;
+	__u64 ioc_u64[1];
+
+	__u32 ioc_flags;
+	__u32 ioc_count;
+	__u32 ioc_net;
+	__u32 ioc_u32[7];
+
+	/* inline buffers: lengths of the data packed into ioc_bulk[] */
+	__u32 ioc_inllen1;
+	char *ioc_inlbuf1;
+	__u32 ioc_inllen2;
+	char *ioc_inlbuf2;
+
+	__u32 ioc_plen1; /* buffers in userspace */
+	char *ioc_pbuf1;
+	__u32 ioc_plen2; /* buffers in userspace */
+	char *ioc_pbuf2;
+
+	/*
+	 * Inline buffer 1 occupies ioc_bulk[0 .. ioc_inllen1 - 1]; inline
+	 * buffer 2 starts at cfs_size_round(ioc_inllen1).  Both are checked
+	 * for a terminating NUL by libcfs_ioctl_is_invalid().
+	 */
+	char ioc_bulk[0];
+};
+
+
+/*
+ * Common ioctl prefix: the same two leading fields, in the same order and
+ * with the same types, as struct libcfs_ioctl_data.
+ */
+struct libcfs_ioctl_hdr {
+	__u32 ioc_len;
+	__u32 ioc_version;
+};
+
+/* Debug ioctl payload: the common header followed by two unsigned values. */
+struct libcfs_debug_ioctl_data
+{
+	struct libcfs_ioctl_hdr hdr;
+	unsigned int subs;
+	unsigned int debug;
+};
+
+/*
+ * Zero an ioctl data object, then stamp it with LIBCFS_IOCTL_VERSION and
+ * with its own size.
+ *
+ * \a data must be an lvalue of struct type (not a pointer to one) and is
+ * evaluated several times.  Every use is parenthesised so that expressions
+ * such as "*p" or "arr[i]" expand correctly.
+ */
+#define LIBCFS_IOC_INIT(data)				\
+do {							\
+	memset(&(data), 0, sizeof(data));		\
+	(data).ioc_version = LIBCFS_IOCTL_VERSION;	\
+	(data).ioc_len = sizeof(data);			\
+} while (0)
+
+
+struct libcfs_ioctl_handler {
+	struct list_head item;
+	int (*handle_ioctl)(unsigned int cmd, struct libcfs_ioctl_data *data);
+};
+
+/*
+ * Define a struct libcfs_ioctl_handler named \a ident whose list linkage
+ * points at itself and whose callback is \a func.
+ *
+ * Designated initialisers bind each value to its field by name, so the
+ * macro stays correct even if the structure's members are reordered.
+ */
+#define DECLARE_IOCTL_HANDLER(ident, func)		\
+	struct libcfs_ioctl_handler ident = {		\
+		.item = LIST_HEAD_INIT(ident.item),	\
+		.handle_ioctl = func,			\
+	}
+
+
+/* FIXME check conflict with lustre_lib.h */
+#define LIBCFS_IOC_DEBUG_MASK	     _IOWR('f', 250, long)
+
+
+/* ioctls for manipulating snapshots 30- */
+/*
+ * IOC_LIBCFS_TYPE is the ioctl type character.  The command definitions
+ * below spell it out as the literal 'e', so the two must be kept in sync.
+ * IOC_LIBCFS_MIN_NR and IOC_LIBCFS_MAX_NR are the lowest and highest command
+ * numbers defined here (30 and 80).
+ */
+#define IOC_LIBCFS_TYPE		   'e'
+#define IOC_LIBCFS_MIN_NR		 30
+/* libcfs ioctls */
+#define IOC_LIBCFS_PANIC		   _IOWR('e', 30, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_CLEAR_DEBUG	     _IOWR('e', 31, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_MARK_DEBUG	      _IOWR('e', 32, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_LWT_CONTROL	     _IOWR('e', 33, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_LWT_SNAPSHOT	    _IOWR('e', 34, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_LWT_LOOKUP_STRING       _IOWR('e', 35, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_MEMHOG		  _IOWR('e', 36, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_PING_TEST	       _IOWR('e', 37, IOCTL_LIBCFS_TYPE)
+/* lnet ioctls */
+#define IOC_LIBCFS_GET_NI		  _IOWR('e', 50, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_FAIL_NID		_IOWR('e', 51, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_ADD_ROUTE	       _IOWR('e', 52, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_DEL_ROUTE	       _IOWR('e', 53, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_GET_ROUTE	       _IOWR('e', 54, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_NOTIFY_ROUTER	   _IOWR('e', 55, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_UNCONFIGURE	     _IOWR('e', 56, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_PORTALS_COMPATIBILITY   _IOWR('e', 57, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_LNET_DIST	       _IOWR('e', 58, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_CONFIGURE	       _IOWR('e', 59, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_TESTPROTOCOMPAT	 _IOWR('e', 60, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_PING		    _IOWR('e', 61, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_DEBUG_PEER	      _IOWR('e', 62, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_LNETST		  _IOWR('e', 63, IOCTL_LIBCFS_TYPE)
+/* lnd ioctls */
+#define IOC_LIBCFS_REGISTER_MYNID	  _IOWR('e', 70, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_CLOSE_CONNECTION	_IOWR('e', 71, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_PUSH_CONNECTION	 _IOWR('e', 72, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_GET_CONN		_IOWR('e', 73, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_DEL_PEER		_IOWR('e', 74, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_ADD_PEER		_IOWR('e', 75, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_GET_PEER		_IOWR('e', 76, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_GET_TXDESC	      _IOWR('e', 77, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_ADD_INTERFACE	   _IOWR('e', 78, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_DEL_INTERFACE	   _IOWR('e', 79, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_GET_INTERFACE	   _IOWR('e', 80, IOCTL_LIBCFS_TYPE)
+
+#define IOC_LIBCFS_MAX_NR			     80
+
+/*
+ * Total size of an ioctl request: the fixed structure plus both inline
+ * buffers, each padded by cfs_size_round().
+ *
+ * The sum is accumulated in an unsigned 32-bit value.  Two inline lengths at
+ * the 1 << 30 limit accepted by libcfs_ioctl_is_invalid() exceed INT_MAX
+ * once the structure size is added, and overflowing a signed int is
+ * undefined behaviour; unsigned arithmetic wraps instead.  The caller
+ * compares the result as __u32.
+ */
+static inline int libcfs_ioctl_packlen(struct libcfs_ioctl_data *data)
+{
+	__u32 len = sizeof(*data);
+
+	len += cfs_size_round(data->ioc_inllen1);
+	len += cfs_size_round(data->ioc_inllen2);
+	return len;
+}
+
+/*
+ * Sanity-check an ioctl request.
+ *
+ * Returns 1, after logging the reason with CERROR, when:
+ *  - ioc_len, ioc_inllen1 or ioc_inllen2 exceeds 1 << 30;
+ *  - an inline buffer pointer is set but its length is zero;
+ *  - a userspace buffer pointer and its length disagree (either way);
+ *  - ioc_len differs from libcfs_ioctl_packlen();
+ *  - a non-empty inline buffer does not end in a NUL byte.
+ * Returns 0 when every check passes.
+ */
+static inline int libcfs_ioctl_is_invalid(struct libcfs_ioctl_data *data)
+{
+	const __u32 limit = 1 << 30;
+	__u32 last;
+
+	/* length limits */
+	if (data->ioc_len > limit) {
+		CERROR("LIBCFS ioctl: ioc_len larger than 1<<30\n");
+		return 1;
+	}
+	if (data->ioc_inllen1 > limit) {
+		CERROR("LIBCFS ioctl: ioc_inllen1 larger than 1<<30\n");
+		return 1;
+	}
+	if (data->ioc_inllen2 > limit) {
+		CERROR("LIBCFS ioctl: ioc_inllen2 larger than 1<<30\n");
+		return 1;
+	}
+
+	/* pointer set, but nothing to point at */
+	if (!data->ioc_inllen1 && data->ioc_inlbuf1) {
+		CERROR("LIBCFS ioctl: inlbuf1 pointer but 0 length\n");
+		return 1;
+	}
+	if (!data->ioc_inllen2 && data->ioc_inlbuf2) {
+		CERROR("LIBCFS ioctl: inlbuf2 pointer but 0 length\n");
+		return 1;
+	}
+	if (!data->ioc_plen1 && data->ioc_pbuf1) {
+		CERROR("LIBCFS ioctl: pbuf1 pointer but 0 length\n");
+		return 1;
+	}
+	if (!data->ioc_plen2 && data->ioc_pbuf2) {
+		CERROR("LIBCFS ioctl: pbuf2 pointer but 0 length\n");
+		return 1;
+	}
+
+	/* length set, but no pointer */
+	if (!data->ioc_pbuf1 && data->ioc_plen1) {
+		CERROR("LIBCFS ioctl: plen1 nonzero but no pbuf1 pointer\n");
+		return 1;
+	}
+	if (!data->ioc_pbuf2 && data->ioc_plen2) {
+		CERROR("LIBCFS ioctl: plen2 nonzero but no pbuf2 pointer\n");
+		return 1;
+	}
+
+	/* the declared length must match what the inline lengths imply */
+	if (data->ioc_len != (__u32)libcfs_ioctl_packlen(data)) {
+		CERROR("LIBCFS ioctl: packlen != ioc_len\n");
+		return 1;
+	}
+
+	/* each non-empty inline buffer must end in NUL */
+	if (data->ioc_inllen1) {
+		last = data->ioc_inllen1 - 1;
+		if (data->ioc_bulk[last] != '\0') {
+			CERROR("LIBCFS ioctl: inlbuf1 not 0 terminated\n");
+			return 1;
+		}
+	}
+	if (data->ioc_inllen2) {
+		last = cfs_size_round(data->ioc_inllen1) +
+		       data->ioc_inllen2 - 1;
+		if (data->ioc_bulk[last] != '\0') {
+			CERROR("LIBCFS ioctl: inlbuf2 not 0 terminated\n");
+			return 1;
+		}
+	}
+	return 0;
+}
+
+
+extern int libcfs_register_ioctl(struct libcfs_ioctl_handler *hand);
+extern int libcfs_deregister_ioctl(struct libcfs_ioctl_handler *hand);
+extern int libcfs_ioctl_getdata(char *buf, char *end, void *arg);
+extern int libcfs_ioctl_popdata(void *arg, void *buf, int size);
+
+
+#endif /* __LIBCFS_IOCTL_H__ */

diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_kernelcomm.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_kernelcomm.h
new file mode 100644
index 0000000..596a15f
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/libcfs/libcfs_kernelcomm.h

@@ -0,0 +1,117 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Author: Nathan Rutman <nathan.rutman@sun.com>
+ *
+ * libcfs/include/libcfs/libcfs_kernelcomm.h
+ *
+ * Kernel <-> userspace communication routines.
+ * The definitions below are used in the kernel and userspace.
+ *
+ */
+
+#ifndef __LIBCFS_KERNELCOMM_H__
+#define __LIBCFS_KERNELCOMM_H__
+
+#ifndef __LIBCFS_LIBCFS_H__
+#error Do not #include this file directly. #include <linux/libcfs/libcfs.h> instead
+#endif
+
+
+/* KUC message header.
+ * All current and future KUC messages should use this header.
+ * To avoid having to include Lustre headers from libcfs, define this here.
+ */
+struct kuc_hdr {
+	__u16 kuc_magic;
+	__u8  kuc_transport;  /* Each new Lustre feature should use a different
+				 transport */
+	__u8  kuc_flags;
+	__u16 kuc_msgtype;    /* Message type or opcode, transport-specific */
+	__u16 kuc_msglen;     /* Including header */
+} __attribute__((aligned(sizeof(__u64))));
+
+#define KUC_MAGIC  0x191C /*Lustre9etLinC */
+#define KUC_FL_BLOCK 0x01   /* Wait for send */
+
+/* kuc_msgtype values are defined in each transport */
+enum kuc_transport_type {
+	KUC_TRANSPORT_GENERIC   = 1,
+	KUC_TRANSPORT_HSM       = 2,
+	KUC_TRANSPORT_CHANGELOG = 3,
+};
+
+enum kuc_generic_message_type {
+	KUC_MSG_SHUTDOWN = 1,
+};
+
+/* prototype for callback function on kuc groups */
+typedef int (*libcfs_kkuc_cb_t)(__u32 data, void *cb_arg);
+
+/* KUC Broadcast Groups. This determines which userspace process hears which
+ * messages.  Multiple transports may be used within a group, or multiple
+ * groups may use the same transport.  Broadcast
+ * groups need not be used if e.g. a UID is specified instead;
+ * use group 0 to signify unicast.
+ */
+#define KUC_GRP_HSM	   0x02
+#define KUC_GRP_MAX	   KUC_GRP_HSM
+
+/* Kernel methods */
+extern int libcfs_kkuc_msg_put(struct file *fp, void *payload);
+extern int libcfs_kkuc_group_put(int group, void *payload);
+extern int libcfs_kkuc_group_add(struct file *fp, int uid, int group,
+				 __u32 data);
+extern int libcfs_kkuc_group_rem(int uid, int group);
+extern int libcfs_kkuc_group_foreach(int group, libcfs_kkuc_cb_t cb_func,
+				     void *cb_arg);
+
+#define LK_FLG_STOP 0x01
+
+/* kernelcomm control structure, passed from userspace to kernel */
+typedef struct lustre_kernelcomm {
+	__u32 lk_wfd;
+	__u32 lk_rfd;
+	__u32 lk_uid;
+	__u32 lk_group;
+	__u32 lk_data;
+	__u32 lk_flags;
+} __attribute__((packed)) lustre_kernelcomm;
+
+/* Userspace methods */
+extern int libcfs_ukuc_start(lustre_kernelcomm *l, int groups);
+extern int libcfs_ukuc_stop(lustre_kernelcomm *l);
+extern int libcfs_ukuc_msg_get(lustre_kernelcomm *l, char *buf, int maxsize,
+			       int transport);
+
+#endif /* __LIBCFS_KERNELCOMM_H__ */

diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_prim.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_prim.h
new file mode 100644
index 0000000..9c40ed9
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/libcfs/libcfs_prim.h

@@ -0,0 +1,101 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/libcfs_prim.h
+ *
+ * General primitives.
+ *
+ */
+
+#ifndef __LIBCFS_PRIM_H__
+#define __LIBCFS_PRIM_H__
+
+#ifndef EXPORT_SYMBOL
+# define EXPORT_SYMBOL(s)
+#endif
+
+/*
+ * Schedule
+ */
+void cfs_pause(cfs_duration_t ticks);
+
+/*
+ * Timer
+ */
+typedef  void (cfs_timer_func_t)(ulong_ptr_t);
+void schedule_timeout_and_set_state(cfs_task_state_t, int64_t);
+
+void init_waitqueue_entry_current(wait_queue_t *link);
+int64_t waitq_timedwait(wait_queue_t *, cfs_task_state_t, int64_t);
+void waitq_wait(wait_queue_t *, cfs_task_state_t);
+void add_wait_queue_exclusive_head(wait_queue_head_t *, wait_queue_t *);
+
+void cfs_init_timer(timer_list_t *t);
+void cfs_timer_init(timer_list_t *t, cfs_timer_func_t *func, void *arg);
+void cfs_timer_done(timer_list_t *t);
+void cfs_timer_arm(timer_list_t *t, cfs_time_t deadline);
+void cfs_timer_disarm(timer_list_t *t);
+int  cfs_timer_is_armed(timer_list_t *t);
+cfs_time_t cfs_timer_deadline(timer_list_t *t);
+
+/*
+ * Memory
+ */
+#ifndef memory_pressure_get
+#define memory_pressure_get() (0)
+#endif
+#ifndef memory_pressure_set
+#define memory_pressure_set() do {} while (0)
+#endif
+#ifndef memory_pressure_clr
+#define memory_pressure_clr() do {} while (0)
+#endif
+
+/*
+ * Read the memory-pressure state and, if it was not set, set it.
+ * Returns the state as it was before the call, which can later be handed to
+ * cfs_memory_pressure_restore().
+ */
+static inline int cfs_memory_pressure_get_and_set(void)
+{
+	int was_set = memory_pressure_get();
+
+	if (was_set == 0)
+		memory_pressure_set();
+
+	return was_set;
+}
+
+/*
+ * Put the memory-pressure state back to \a old: set it when \a old is
+ * non-zero, clear it otherwise.
+ */
+static inline void cfs_memory_pressure_restore(int old)
+{
+	if (!old) {
+		memory_pressure_clr();
+		return;
+	}
+
+	memory_pressure_set();
+}
+#endif

diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_private.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_private.h
new file mode 100644
index 0000000..056caa4
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/libcfs/libcfs_private.h

@@ -0,0 +1,577 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/libcfs_private.h
+ *
+ * Various defines for libcfs.
+ *
+ */
+
+#ifndef __LIBCFS_PRIVATE_H__
+#define __LIBCFS_PRIVATE_H__
+
+/* XXX this layering violation is for nidstrings */
+#include <linux/lnet/types.h>
+
+#ifndef DEBUG_SUBSYSTEM
+# define DEBUG_SUBSYSTEM S_UNDEFINED
+#endif
+
+
+
+/*
+ * When this is on, LASSERT macro includes check for assignment used instead
+ * of equality check, but doesn't have unlikely(). Turn this on from time to
+ * time to make test-builds. This shouldn't be on for production release.
+ */
+#define LASSERT_CHECKED (0)
+
+
+/*
+ * Assert \a cond.  On failure, emit "ASSERTION( <text of cond> ) failed: "
+ * followed by the printf-style message \a fmt through libcfs_debug_msg() at
+ * D_EMERG level, then call lbug_with_loc(), which is declared noreturn.
+ * The text of \a cond is stringified, so it appears verbatim in the message.
+ */
+#define LASSERTF(cond, fmt, ...)					\
+do {									\
+	if (unlikely(!(cond))) {					\
+		LIBCFS_DEBUG_MSG_DATA_DECL(__msg_data, D_EMERG, NULL);	\
+		libcfs_debug_msg(&__msg_data,				\
+				 "ASSERTION( %s ) failed: " fmt, #cond,	\
+				 ## __VA_ARGS__);			\
+		lbug_with_loc(&__msg_data);				\
+	}								\
+} while (0)
+
+#define LASSERT(cond) LASSERTF(cond, "\n")
+
+#ifdef CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK
+/**
+ * This is for more expensive checks that one doesn't want to be enabled all
+ * the time. LINVRNT() has to be explicitly enabled by
+ * CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK option.
+ */
+# define LINVRNT(exp) LASSERT(exp)
+#else
+# define LINVRNT(exp) ((void)sizeof!!(exp))
+#endif
+
+#define KLASSERT(e) LASSERT(e)
+
+void lbug_with_loc(struct libcfs_debug_msg_data *) __attribute__((noreturn));
+
+/*
+ * Unconditional bug trap: declare message data at D_EMERG level and pass it
+ * to lbug_with_loc(), which is declared noreturn.
+ */
+#define LBUG()								\
+do {									\
+	LIBCFS_DEBUG_MSG_DATA_DECL(__msg_data, D_EMERG, NULL);		\
+	lbug_with_loc(&__msg_data);					\
+} while (0)
+
+extern atomic_t libcfs_kmemory;
+/*
+ * Memory
+ */
+
+# define libcfs_kmem_inc(ptr, size)		\
+do {						\
+	atomic_add(size, &libcfs_kmemory);	\
+} while (0)
+
+# define libcfs_kmem_dec(ptr, size)		\
+do {						\
+	atomic_sub(size, &libcfs_kmemory);	\
+} while (0)
+
+# define libcfs_kmem_read()			\
+	atomic_read(&libcfs_kmemory)
+
+
+#ifndef LIBCFS_VMALLOC_SIZE
+#define LIBCFS_VMALLOC_SIZE	(2 << PAGE_CACHE_SHIFT) /* 2 pages */
+#endif
+
+/*
+ * Pre-allocation sanity check: when called in interrupt context, only
+ * requests of at most LIBCFS_VMALLOC_SIZE bytes whose \a mask has GFP_ATOMIC
+ * bits set are allowed; anything else trips LASSERT.
+ *
+ * The "!= 0" applies to the mask test alone.  It used to be attached to the
+ * whole conjunction, which evaluated the same but read as something else.
+ */
+#define LIBCFS_ALLOC_PRE(size, mask)					\
+do {									\
+	LASSERT(!in_interrupt() ||					\
+		((size) <= LIBCFS_VMALLOC_SIZE &&			\
+		 ((mask) & GFP_ATOMIC) != 0));				\
+} while (0)
+
+/*
+ * Post-allocation bookkeeping for \a ptr, the result of a \a size byte
+ * request.
+ *
+ * Failure (NULL): log the allocation site and the running total with CERROR.
+ * Success: zero the whole buffer, add \a size to the libcfs_kmemory counter
+ * and trace the allocation at D_MALLOC level.
+ */
+#define LIBCFS_ALLOC_POST(ptr, size)					    \
+do {									    \
+	if (unlikely((ptr) == NULL)) {					    \
+		CERROR("LNET: out of memory at %s:%d (tried to alloc '"	    \
+		       #ptr "' = %d)\n", __FILE__, __LINE__, (int)(size));  \
+		CERROR("LNET: %d total bytes allocated by lnet\n",	    \
+		       libcfs_kmem_read());				    \
+	} else {							    \
+		memset((ptr), 0, (size));				    \
+		libcfs_kmem_inc((ptr), (size));				    \
+		CDEBUG(D_MALLOC, "alloc '" #ptr "': %d at %p (tot %d).\n",  \
+		       (int)(size), (ptr), libcfs_kmem_read());		    \
+	}								   \
+} while (0)
+
+/**
+ * allocate memory with GFP flags @mask
+ *
+ * Requests of at most LIBCFS_VMALLOC_SIZE bytes go to kmalloc() with @mask;
+ * larger requests go to vmalloc(), which is not passed @mask.  The result is
+ * checked, zeroed and accounted by LIBCFS_ALLOC_POST(); @ptr is left NULL on
+ * failure.  @size is evaluated several times.
+ */
+#define LIBCFS_ALLOC_GFP(ptr, size, mask)				    \
+do {									    \
+	LIBCFS_ALLOC_PRE((size), (mask));				    \
+	(ptr) = (size) <= LIBCFS_VMALLOC_SIZE ?				    \
+		kmalloc((size), (mask)) : vmalloc(size);	    \
+	LIBCFS_ALLOC_POST((ptr), (size));				    \
+} while (0)
+
+/**
+ * default allocator
+ */
+#define LIBCFS_ALLOC(ptr, size) \
+	LIBCFS_ALLOC_GFP(ptr, size, __GFP_IO)
+
+/**
+ * non-sleeping allocator
+ */
+#define LIBCFS_ALLOC_ATOMIC(ptr, size) \
+	LIBCFS_ALLOC_GFP(ptr, size, GFP_ATOMIC)
+
+/**
+ * allocate memory for specified CPU partition
+ *   \a cptab != NULL, \a cpt is CPU partition id of \a cptab
+ *   \a cptab == NULL, \a cpt is HW NUMA node id
+ *
+ * The node is chosen by cfs_cpt_spread_node(cptab, cpt).  Requests of at most
+ * LIBCFS_VMALLOC_SIZE bytes use kmalloc_node() with \a mask; larger ones use
+ * vmalloc_node(), which is not passed \a mask.  Pre- and post-processing are
+ * done by LIBCFS_ALLOC_PRE() and LIBCFS_ALLOC_POST().
+ */
+#define LIBCFS_CPT_ALLOC_GFP(ptr, cptab, cpt, size, mask)		    \
+do {									    \
+	LIBCFS_ALLOC_PRE((size), (mask));				    \
+	(ptr) = (size) <= LIBCFS_VMALLOC_SIZE ?				    \
+		kmalloc_node((size), (mask), cfs_cpt_spread_node(cptab, cpt)) :\
+		vmalloc_node(size, cfs_cpt_spread_node(cptab, cpt));	    \
+	LIBCFS_ALLOC_POST((ptr), (size));				    \
+} while (0)
+
+/** default numa allocator */
+#define LIBCFS_CPT_ALLOC(ptr, cptab, cpt, size)				    \
+	LIBCFS_CPT_ALLOC_GFP(ptr, cptab, cpt, size, __GFP_IO)
+
+/*
+ * Release memory obtained from one of the LIBCFS_ALLOC* macros.
+ *
+ * \a size is subtracted from the libcfs_kmemory counter and selects vfree()
+ * (size > LIBCFS_VMALLOC_SIZE) or kfree(), mirroring the kmalloc/vmalloc
+ * split made at allocation time, so it has to be the size that was
+ * allocated.  A NULL \a ptr is reported with CERROR and otherwise ignored.
+ *
+ * \a size is evaluated once into a local.  The local has a name callers will
+ * not use: with a plain "s", LIBCFS_FREE(p, s) expanded to "int s = (s);"
+ * and read an uninitialised variable.
+ */
+#define LIBCFS_FREE(ptr, size)						\
+do {									\
+	int __free_size = (size);					\
+	if (unlikely((ptr) == NULL)) {					\
+		CERROR("LIBCFS: free NULL '" #ptr "' (%d bytes) at "	\
+		       "%s:%d\n", __free_size, __FILE__, __LINE__);	\
+		break;							\
+	}								\
+	libcfs_kmem_dec((ptr), __free_size);				\
+	CDEBUG(D_MALLOC, "kfreed '" #ptr "': %d at %p (tot %d).\n",	\
+	       __free_size, (ptr), libcfs_kmem_read());			\
+	if (unlikely(__free_size > LIBCFS_VMALLOC_SIZE))		\
+		vfree(ptr);						\
+	else								\
+		kfree(ptr);						\
+} while (0)
+
+/******************************************************************************/
+
+/* htonl hack - either this, or compile with -O2. Stupid byteorder/generic.h */
+#if defined(__GNUC__) && (__GNUC__ >= 2) && !defined(__OPTIMIZE__)
+#define ___htonl(x) __cpu_to_be32(x)
+#define ___htons(x) __cpu_to_be16(x)
+#define ___ntohl(x) __be32_to_cpu(x)
+#define ___ntohs(x) __be16_to_cpu(x)
+#define htonl(x) ___htonl(x)
+#define ntohl(x) ___ntohl(x)
+#define htons(x) ___htons(x)
+#define ntohs(x) ___ntohs(x)
+#endif
+
+void libcfs_debug_dumpstack(task_t *tsk);
+void libcfs_run_upcall(char **argv);
+void libcfs_run_lbug_upcall(struct libcfs_debug_msg_data *);
+void libcfs_debug_dumplog(void);
+int libcfs_debug_init(unsigned long bufsize);
+int libcfs_debug_cleanup(void);
+int libcfs_debug_clear_buffer(void);
+int libcfs_debug_mark_buffer(const char *text);
+
+void libcfs_debug_set_level(unsigned int debug_level);
+
+
+/*
+ * allocate per-cpu-partition data, returned value is an array of pointers,
+ * variable can be indexed by CPU ID.
+ *	cptable != NULL: size of array is number of CPU partitions
+ *	cptable == NULL: size of array is number of HW cores
+ */
+void *cfs_percpt_alloc(struct cfs_cpt_table *cptab, unsigned int size);
+/*
+ * destroy per-cpu-partition variable
+ */
+void  cfs_percpt_free(void *vars);
+int   cfs_percpt_number(void *vars);
+void *cfs_percpt_current(void *vars);
+void *cfs_percpt_index(void *vars, int idx);
+
+/*
+ * Walk the pointer array \a vars: \a i is the running index and \a var
+ * receives each entry in turn.  The walk ends after cfs_percpt_number(vars)
+ * entries or at the first NULL entry, whichever comes first.
+ *
+ * \a i is parenthesised at every use so that an index expression such as
+ * "*p" is incremented as "(*p)++" rather than "*p++".
+ */
+#define cfs_percpt_for_each(var, i, vars)			\
+	for ((i) = 0; (i) < cfs_percpt_number(vars) &&		\
+		      ((var) = (vars)[(i)]) != NULL; (i)++)
+
+/*
+ * allocate a variable array, returned value is an array of pointers.
+ * Caller can specify length of array by count.
+ */
+void *cfs_array_alloc(int count, unsigned int size);
+void  cfs_array_free(void *vars);
+
+#define LASSERT_ATOMIC_ENABLED	  (1)
+
+#if LASSERT_ATOMIC_ENABLED
+
+/*
+ * In all of the checks below the comparison operands @v, @v1 and @v2 are
+ * parenthesised, so arguments such as "x & y" or "c ? a : b" compare as a
+ * whole instead of being split by operator precedence.
+ */
+
+/** assert value of @a is equal to @v */
+#define LASSERT_ATOMIC_EQ(a, v)					\
+do {								\
+	LASSERTF(atomic_read(a) == (v),				\
+		 "value: %d\n", atomic_read((a)));		\
+} while (0)
+
+/** assert value of @a is unequal to @v */
+#define LASSERT_ATOMIC_NE(a, v)					\
+do {								\
+	LASSERTF(atomic_read(a) != (v),				\
+		 "value: %d\n", atomic_read((a)));		\
+} while (0)
+
+/** assert value of @a is less than @v */
+#define LASSERT_ATOMIC_LT(a, v)					\
+do {								\
+	LASSERTF(atomic_read(a) < (v),				\
+		 "value: %d\n", atomic_read((a)));		\
+} while (0)
+
+/** assert value of @a is less than or equal to @v */
+#define LASSERT_ATOMIC_LE(a, v)					\
+do {								\
+	LASSERTF(atomic_read(a) <= (v),				\
+		 "value: %d\n", atomic_read((a)));		\
+} while (0)
+
+/** assert value of @a is greater than @v */
+#define LASSERT_ATOMIC_GT(a, v)					\
+do {								\
+	LASSERTF(atomic_read(a) > (v),				\
+		 "value: %d\n", atomic_read((a)));		\
+} while (0)
+
+/** assert value of @a is greater than or equal to @v */
+#define LASSERT_ATOMIC_GE(a, v)					\
+do {								\
+	LASSERTF(atomic_read(a) >= (v),				\
+		 "value: %d\n", atomic_read((a)));		\
+} while (0)
+
+/** assert value of @a is greater than @v1 and less than @v2 */
+#define LASSERT_ATOMIC_GT_LT(a, v1, v2)				\
+do {								\
+	int __v = atomic_read(a);				\
+	LASSERTF(__v > (v1) && __v < (v2), "value: %d\n", __v);	\
+} while (0)
+
+/** assert value of @a is greater than @v1 and less than or equal to @v2 */
+#define LASSERT_ATOMIC_GT_LE(a, v1, v2)				\
+do {								\
+	int __v = atomic_read(a);				\
+	LASSERTF(__v > (v1) && __v <= (v2), "value: %d\n", __v);	\
+} while (0)
+
+/** assert value of @a is greater than or equal to @v1 and less than @v2 */
+#define LASSERT_ATOMIC_GE_LT(a, v1, v2)				\
+do {								\
+	int __v = atomic_read(a);				\
+	LASSERTF(__v >= (v1) && __v < (v2), "value: %d\n", __v);	\
+} while (0)
+
+/**
+ * assert value of @a is greater than or equal to @v1 and less than or equal
+ * to @v2
+ */
+#define LASSERT_ATOMIC_GE_LE(a, v1, v2)				\
+do {								\
+	int __v = atomic_read(a);				\
+	LASSERTF(__v >= (v1) && __v <= (v2), "value: %d\n", __v);	\
+} while (0)
+
+#else /* !LASSERT_ATOMIC_ENABLED */
+
+#define LASSERT_ATOMIC_EQ(a, v)		 do {} while (0)
+#define LASSERT_ATOMIC_NE(a, v)		 do {} while (0)
+#define LASSERT_ATOMIC_LT(a, v)		 do {} while (0)
+#define LASSERT_ATOMIC_LE(a, v)		 do {} while (0)
+#define LASSERT_ATOMIC_GT(a, v)		 do {} while (0)
+#define LASSERT_ATOMIC_GE(a, v)		 do {} while (0)
+#define LASSERT_ATOMIC_GT_LT(a, v1, v2)	 do {} while (0)
+#define LASSERT_ATOMIC_GT_LE(a, v1, v2)	 do {} while (0)
+#define LASSERT_ATOMIC_GE_LT(a, v1, v2)	 do {} while (0)
+#define LASSERT_ATOMIC_GE_LE(a, v1, v2)	 do {} while (0)
+
+#endif /* LASSERT_ATOMIC_ENABLED */
+
+#define LASSERT_ATOMIC_ZERO(a)		  LASSERT_ATOMIC_EQ(a, 0)
+#define LASSERT_ATOMIC_POS(a)		   LASSERT_ATOMIC_GT(a, 0)
+
+/*
+ * Allocate / free one object of the type \a ptr points to.
+ *
+ * The expansions carry no trailing semicolon: the caller's own ";" then
+ * completes exactly one statement, so the macros can be used as the body of
+ * an unbraced if/else.
+ */
+#define CFS_ALLOC_PTR(ptr)      LIBCFS_ALLOC(ptr, sizeof(*(ptr)))
+#define CFS_FREE_PTR(ptr)       LIBCFS_FREE(ptr, sizeof(*(ptr)))
+
+/*
+ * percpu partition lock
+ *
+ * There are some use-cases like this in Lustre:
+ * . each CPU partition has it's own private data which is frequently changed,
+ *   and mostly by the local CPU partition.
+ * . all CPU partitions share some global data, these data are rarely changed.
+ *
+ * LNet is typical example.
+ * CPU partition lock is designed for this kind of use-cases:
+ * . each CPU partition has it's own private lock
+ * . change on private data just needs to take the private lock
+ * . read on shared data just needs to take _any_ of private locks
+ * . change on shared data needs to take _all_ private locks,
+ *   which is slow and should be really rare.
+ */
+
+enum {
+	CFS_PERCPT_LOCK_EX	= -1, /* negative */
+};
+
+
+struct cfs_percpt_lock {
+	/* cpu-partition-table for this lock */
+	struct cfs_cpt_table	*pcl_cptab;
+	/* exclusively locked */
+	unsigned int		pcl_locked;
+	/* private lock table */
+	spinlock_t		**pcl_locks;
+};
+
+/* return number of private locks */
+static inline int
+cfs_percpt_lock_num(struct cfs_percpt_lock *pcl)
+{
+	return cfs_cpt_number(pcl->pcl_cptab);
+}
+
+
+/*
+ * create a cpu-partition lock based on CPU partition table \a cptab;
+ * the number of private locks is reported by cfs_percpt_lock_num()
+ */
+struct cfs_percpt_lock *cfs_percpt_lock_alloc(struct cfs_cpt_table *cptab);
+/* destroy a cpu-partition lock */
+void cfs_percpt_lock_free(struct cfs_percpt_lock *pcl);
+
+/* lock private lock \a index of \a pcl */
+void cfs_percpt_lock(struct cfs_percpt_lock *pcl, int index);
+/* unlock private lock \a index of \a pcl */
+void cfs_percpt_unlock(struct cfs_percpt_lock *pcl, int index);
+/* create percpt (atomic) refcount based on @cptab */
+atomic_t **cfs_percpt_atomic_alloc(struct cfs_cpt_table *cptab, int val);
+/* destroy percpt refcount */
+void cfs_percpt_atomic_free(atomic_t **refs);
+/* return sum of all percpu refs */
+int cfs_percpt_atomic_summary(atomic_t **refs);
+
+
+/** Compile-time assertion.
+ *
+ * Check an invariant described by a constant expression at compile time by
+ * forcing a compiler error if it does not hold.  \a cond must be a constant
+ * expression as defined by the ISO C Standard:
+ *
+ *       6.8.4.2  The switch statement
+ *       ....
+ *       [#3] The expression of each case label shall be  an  integer
+ *       constant   expression  and  no  two  of  the  case  constant
+ *       expressions in the same switch statement shall have the same
+ *       value  after  conversion...
+ *
+ */
+#define CLASSERT(cond) do {switch(42) {case (cond): case 0: break;}} while (0)
+
+/* support decl needed both by kernel and liblustre */
+int	 libcfs_isknown_lnd(int type);
+char       *libcfs_lnd2modname(int type);
+char       *libcfs_lnd2str(int type);
+int	 libcfs_str2lnd(const char *str);
+char       *libcfs_net2str(__u32 net);
+char       *libcfs_nid2str(lnet_nid_t nid);
+__u32       libcfs_str2net(const char *str);
+lnet_nid_t  libcfs_str2nid(const char *str);
+int	 libcfs_str2anynid(lnet_nid_t *nid, const char *str);
+char       *libcfs_id2str(lnet_process_id_t id);
+void	cfs_free_nidlist(struct list_head *list);
+int	 cfs_parse_nidlist(char *str, int len, struct list_head *list);
+int	 cfs_match_nid(lnet_nid_t nid, struct list_head *list);
+
+/** \addtogroup lnet_addr
+ * @{ */
+/* how an LNET NID encodes net:address */
+/** extract the address part of an lnet_nid_t */
+#define LNET_NIDADDR(nid)      ((__u32)((nid) & 0xffffffff))
+/**
+ * extract the network part (upper 32 bits) of an lnet_nid_t; the mask is
+ * applied before the cast, matching LNET_NIDADDR
+ */
+#define LNET_NIDNET(nid)       ((__u32)(((nid) >> 32) & 0xffffffff))
+/** make an lnet_nid_t from a network part and an address part */
+#define LNET_MKNID(net,addr)   ((((__u64)(net))<<32)|((__u64)(addr)))
+/* how net encodes type:number */
+#define LNET_NETNUM(net)       ((net) & 0xffff)
+#define LNET_NETTYP(net)       (((net) >> 16) & 0xffff)
+#define LNET_MKNET(typ,num)    ((((__u32)(typ))<<16)|((__u32)(num)))
+/** @} lnet_addr */
+
+/* max value for numeric network address */
+#define MAX_NUMERIC_VALUE 0xffffffff
+
+/* implication */
+#define ergo(a, b) (!(a) || (b))
+/* logical equivalence */
+#define equi(a, b) (!!(a) == !!(b))
+
+#ifndef CFS_CURRENT_TIME
+# define CFS_CURRENT_TIME time(0)
+#endif
+
+/* --------------------------------------------------------------------
+ * Light-weight trace
+ * Support for temporary event tracing with minimal Heisenberg effect.
+ * All stuff about lwt are put in arch/kp30.h
+ * -------------------------------------------------------------------- */
+
+struct libcfs_device_userstate
+{
+	int	   ldu_memhog_pages;
+	struct page   *ldu_memhog_root_page;
+};
+
+/* what used to be in portals_lib.h */
+#ifndef MIN
+# define MIN(a,b) (((a)<(b)) ? (a): (b))
+#endif
+#ifndef MAX
+# define MAX(a,b) (((a)>(b)) ? (a): (b))
+#endif
+
+/*
+ * Yield \a ptr, or the empty string when \a ptr is NULL.  The whole
+ * expansion is parenthesised so it can be used inside larger expressions
+ * (indexing, dereference, arithmetic) without being split by precedence.
+ */
+#define MKSTR(ptr) ((ptr) ? (ptr) : "")
+
+/* Round \a val up to a multiple of 4. */
+static inline int cfs_size_round4(int val)
+{
+	const int low_bits = 3;
+
+	return (val + low_bits) & ~low_bits;
+}
+
+#ifndef HAVE_CFS_SIZE_ROUND
+/* Round \a val up to a multiple of 8. */
+static inline int cfs_size_round(int val)
+{
+	const int low_bits = 7;
+
+	return (val + low_bits) & ~low_bits;
+}
+#define HAVE_CFS_SIZE_ROUND
+#endif
+
+/* Round \a val up to a multiple of 16. */
+static inline int cfs_size_round16(int val)
+{
+	const int low_bits = 0xf;
+
+	return (val + low_bits) & ~low_bits;
+}
+
+/* Round \a val up to a multiple of 32. */
+static inline int cfs_size_round32(int val)
+{
+	const int low_bits = 0x1f;
+
+	return (val + low_bits) & ~low_bits;
+}
+
+/*
+ * Like rounding (val + 1) up to a multiple of 8, except that zero stays
+ * zero.
+ */
+static inline int cfs_size_round0(int val)
+{
+	return val ? (val + 8) & ~7 : 0;
+}
+
+/*
+ * Space needed to store the string \a fset including its terminating NUL,
+ * padded by cfs_size_round().
+ *
+ * The string is only read, so the parameter is const-qualified; callers
+ * passing a plain "char *" are unaffected.
+ */
+static inline size_t cfs_round_strlen(const char *fset)
+{
+	return (size_t)cfs_size_round((int)strlen(fset) + 1);
+}
+
+/* roundup \a val to power2 */
+static inline unsigned int cfs_power2_roundup(unsigned int val)
+{
+	/* already a power of 2: nothing to do */
+	if (val == LOWEST_BIT_SET(val))
+		return val;
+
+	/* drop LOWEST_BIT_SET(val) until val equals it ... */
+	do {
+		val &= ~LOWEST_BIT_SET(val);
+	} while (val != LOWEST_BIT_SET(val));
+
+	/* ...and round up */
+	return val << 1;
+}
+
+/*
+ * Packing helpers.  In all three, \a ptr must be a modifiable pointer
+ * lvalue; it is advanced past the bytes handled, padded by cfs_size_round().
+ * Arguments are evaluated more than once, and each use is parenthesised so
+ * that expression arguments such as "a + b" are not split by precedence.
+ */
+
+/* Copy \a len bytes from \a var to \a ptr (skipped if \a var is NULL). */
+#define LOGL(var, len, ptr)					\
+do {								\
+	if (var)						\
+		memcpy((char *)(ptr), (const char *)(var), (len));	\
+	(ptr) += cfs_size_round(len);				\
+} while (0)
+
+/* Copy \a len bytes from \a ptr to \a var (skipped if \a var is NULL). */
+#define LOGU(var, len, ptr)					\
+do {								\
+	if (var)						\
+		memcpy((char *)(var), (const char *)(ptr), (len));	\
+	(ptr) += cfs_size_round(len);				\
+} while (0)
+
+/*
+ * Copy \a len bytes from \a var to \a ptr and append a NUL byte; does
+ * nothing at all, including not advancing \a ptr, when \a len is zero.
+ */
+#define LOGL0(var, len, ptr)					\
+do {								\
+	if (!(len))						\
+		break;						\
+	memcpy((char *)(ptr), (const char *)(var), (len));	\
+	*((char *)(ptr) + (len)) = 0;				\
+	(ptr) += cfs_size_round((len) + 1);			\
+} while (0)
+
+/**
+ *  Lustre Network Driver types.
+ */
+enum {
+	/* Only add to these values (i.e. don't ever change or redefine them):
+	 * network addresses depend on them... */
+	QSWLND    = 1,
+	SOCKLND   = 2,
+	GMLND     = 3, /* obsolete, keep it so that libcfs_nid2str works */
+	PTLLND    = 4,
+	O2IBLND   = 5,
+	CIBLND    = 6,
+	OPENIBLND = 7,
+	IIBLND    = 8,
+	LOLND     = 9,
+	RALND     = 10,
+	VIBLND    = 11,
+	MXLND     = 12,
+	GNILND    = 13,
+};
+
+#endif

diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_string.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_string.h
new file mode 100644
index 0000000..a6bac9c
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/libcfs/libcfs_string.h

@@ -0,0 +1,137 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/libcfs_string.h
+ *
+ * Generic string manipulation functions.
+ *
+ * Author: Nathan Rutman <nathan.rutman@sun.com>
+ */
+
+#ifndef __LIBCFS_STRING_H__
+#define __LIBCFS_STRING_H__
+
+/* libcfs_string.c */
+/* NOTE(review): only declarations are visible here; behavioural details
+ * below that go beyond the original one-line comments are assumptions to
+ * be confirmed against libcfs_string.c. */
+/* string comparison ignoring case, over at most \a n characters
+ * (presumably strncasecmp()-like return convention) */
+int cfs_strncasecmp(const char *s1, const char *s2, size_t n);
+/* Convert a text string to a bitmask; \a bit2str maps a bit number to its
+ * name and \a oldmask is presumably updated in place */
+int cfs_str2mask(const char *str, const char *(*bit2str)(int bit),
+		 int *oldmask, int minmask, int allmask);
+
+/* Allocate space for and copy an existing string.
+ * Must free with kfree().
+ * \a flags: presumably the allocation flags - TODO confirm.
+ */
+char *cfs_strdup(const char *str, u_int32_t flags);
+
+/* safe vsnprintf (what "safe" guarantees is defined by the implementation) */
+int cfs_vsnprintf(char *buf, size_t size, const char *fmt, va_list args);
+
+/* safe snprintf; variadic counterpart of cfs_vsnprintf() */
+int cfs_snprintf(char *buf, size_t size, const char *fmt, ...);
+
+/* trim leading and trailing space characters */
+char *cfs_firststr(char *str, size_t size);
+
+/**
+ * Structure to represent NUL-less strings, i.e. strings that are not
+ * required to carry a terminating NUL character (the original comment
+ * said "NULL-less").
+ */
+struct cfs_lstr {
+	/* start of the characters */
+	char		*ls_str;
+	/* presumably the number of characters at ls_str - TODO confirm */
+	int		ls_len;
+};
+
+/*
+ * Structure to represent \<range_expr\> token of the syntax.
+ *
+ * NOTE(review): re_lo/re_hi/re_stride look like the bounds and step of a
+ * numeric range - confirm exact semantics (inclusive or exclusive) against
+ * cfs_range_expr_parse().
+ */
+struct cfs_range_expr {
+	/*
+	 * Link to cfs_expr_list::el_exprs.
+	 */
+	struct list_head	re_link;
+	__u32		re_lo;
+	__u32		re_hi;
+	__u32		re_stride;
+};
+
+/*
+ * A list of range expressions.
+ */
+struct cfs_expr_list {
+	/* linkage of this expression list into a containing list
+	 * (which list is not visible here) */
+	struct list_head	el_link;
+	/* head of the cfs_range_expr entries, chained via
+	 * cfs_range_expr::re_link */
+	struct list_head	el_exprs;
+};
+
+/*
+ * Return 1 if \a c is a space, tab, newline or carriage return,
+ * 0 for any other character.
+ */
+static inline int
+cfs_iswhite(char c)
+{
+	return c == ' ' || c == '\t' || c == '\n' || c == '\r';
+}
+
+/*
+ * Token, number and range-expression helpers.
+ * NOTE(review): only declarations are visible here; the per-function
+ * notes are inferred from names and signatures - confirm against the
+ * implementation.
+ */
+/* presumably strips surrounding whitespace from \a str */
+char *cfs_trimwhite(char *str);
+/* presumably splits the next \a delim separated token of \a next into \a res */
+int cfs_gettok(struct cfs_lstr *next, char delim, struct cfs_lstr *res);
+/* presumably parses \a nob bytes of \a str into \a num and checks it
+ * against [\a min, \a max] */
+int cfs_str2num_check(char *str, int nob, unsigned *num,
+		      unsigned min, unsigned max);
+/* parses \a src into a newly produced range expression returned via \a expr
+ * (allocation/ownership rules: TODO confirm) */
+int cfs_range_expr_parse(struct cfs_lstr *src, unsigned min, unsigned max,
+			 int single_tok, struct cfs_range_expr **expr);
+/* tests \a value against \a expr_list */
+int cfs_expr_list_match(__u32 value, struct cfs_expr_list *expr_list);
+/* returns an array via \a values; release it with
+ * cfs_expr_list_values_free() */
+int cfs_expr_list_values(struct cfs_expr_list *expr_list,
+			 int max, __u32 **values);
+/*
+ * Free a \a values array of \a num elements (the counterpart of
+ * cfs_expr_list_values()).  The size handed to LIBCFS_FREE() is
+ * num * sizeof(values[0]).
+ */
+static inline void
+cfs_expr_list_values_free(__u32 *values, int num)
+{
+	/* This array is allocated by LIBCFS_ALLOC(), so it shouldn't be freed
+	 * by OBD_FREE() if it's called by module other than libcfs & LNet,
+	 * otherwise we will see fake memory leak */
+	LIBCFS_FREE(values, num * sizeof(values[0]));
+}
+
+/*
+ * Expression-list lifetime helpers and IP-address matching.
+ * NOTE(review): semantics are inferred from the names and signatures
+ * only - confirm against the implementation.
+ */
+void cfs_expr_list_free(struct cfs_expr_list *expr_list);
+void cfs_expr_list_print(struct cfs_expr_list *expr_list);
+/* parses \a len bytes of \a str; the resulting list is returned via \a elpp */
+int cfs_expr_list_parse(char *str, int len, unsigned min, unsigned max,
+			struct cfs_expr_list **elpp);
+void cfs_expr_list_free_list(struct list_head *list);
+/* parse / match / free triple operating on a caller supplied list head */
+int cfs_ip_addr_parse(char *str, int len, struct list_head *list);
+int cfs_ip_addr_match(__u32 addr, struct list_head *list);
+void cfs_ip_addr_free(struct list_head *list);
+
+/* Route strtoul() calls in code including this header to simple_strtoul(). */
+#define	strtoul(str, endp, base)	simple_strtoul(str, endp, base)
+
+#endif

diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_time.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_time.h
new file mode 100644
index 0000000..4bdd771
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/libcfs/libcfs_time.h

@@ -0,0 +1,132 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/libcfs_time.h
+ *
+ * Time functions.
+ *
+ */
+
+#ifndef __LIBCFS_TIME_H__
+#define __LIBCFS_TIME_H__
+/*
+ * generic time manipulation functions.
+ */
+
+/* Return the time \a t advanced by the duration \a d. */
+static inline cfs_time_t cfs_time_add(cfs_time_t t, cfs_duration_t d)
+{
+	cfs_time_t sum = (cfs_time_t)(t + d);
+
+	return sum;
+}
+
+/*
+ * Return the duration t1 - t2.
+ *
+ * The difference is computed on the cfs_time_t operands and converted to
+ * cfs_duration_t, the declared return type (the cast previously named
+ * cfs_time_t, which did not match the return type).
+ */
+static inline cfs_duration_t cfs_time_sub(cfs_time_t t1, cfs_time_t t2)
+{
+	return (cfs_duration_t)(t1 - t2);
+}
+
+/*
+ * "t1 is after t2", expressed as cfs_time_before() with the arguments
+ * swapped.
+ */
+static inline int cfs_time_after(cfs_time_t t1, cfs_time_t t2)
+{
+	int is_after = cfs_time_before(t2, t1);
+
+	return is_after;
+}
+
+/*
+ * "t1 is after or equal to t2", expressed as cfs_time_beforeq() with the
+ * arguments swapped.
+ */
+static inline int cfs_time_aftereq(cfs_time_t t1, cfs_time_t t2)
+{
+	int is_aftereq = cfs_time_beforeq(t2, t1);
+
+	return is_aftereq;
+}
+
+
+/*
+ * Return the current time (cfs_time_current()) moved by \a seconds,
+ * converted to a duration with cfs_time_seconds().
+ */
+static inline cfs_time_t cfs_time_shift(int seconds)
+{
+	cfs_time_t now = cfs_time_current();
+
+	return cfs_time_add(now, cfs_time_seconds(seconds));
+}
+
+/*
+ * Compute \a large - \a small.
+ *
+ * The difference is formed as (seconds delta * ONE_MILLION + microseconds
+ * delta) and returned as a long.  If \a result is not NULL it is filled
+ * with that value split into tv_sec (quotient) and tv_usec (remainder)
+ * by ONE_MILLION.
+ *
+ * NOTE(review): everything is squeezed into a long, so large deltas may
+ * not fit where long is 32 bits wide - confirm callers only pass short
+ * intervals.
+ */
+static inline long cfs_timeval_sub(struct timeval *large, struct timeval *small,
+				   struct timeval *result)
+{
+	long delta = (long)((large->tv_sec - small->tv_sec) * ONE_MILLION +
+			    (large->tv_usec - small->tv_usec));
+
+	if (result == NULL)
+		return delta;
+
+	result->tv_sec = delta / ONE_MILLION;
+	result->tv_usec = delta % ONE_MILLION;
+	return delta;
+}
+
+/*
+ * Log an error if more than \a seconds have passed since \a now.
+ *
+ * \param now      time at which the measured operation started
+ * \param seconds  threshold; the message is emitted only when the current
+ *                 time is after \a now + \a seconds
+ * \param msg      text inserted after "slow" in the message
+ *
+ * The threshold used to be hard-coded to 15 and \a seconds was ignored;
+ * it is honoured now.  The clock is sampled once so that the test and
+ * the reported elapsed time agree.
+ */
+static inline void cfs_slow_warning(cfs_time_t now, int seconds, char *msg)
+{
+	cfs_time_t cur = cfs_time_current();
+
+	if (cfs_time_after(cur, cfs_time_add(now, cfs_time_seconds(seconds))))
+		CERROR("slow %s "CFS_TIME_T" sec\n", msg,
+		       cfs_duration_sec(cfs_time_sub(cur, now)));
+}
+
+/*
+ * CFS_RATELIMIT(seconds) - per-call-site rate limiter.
+ *
+ * Each expansion owns a static deadline.  The expression evaluates to 1
+ * when the current time is after that deadline, and the deadline is then
+ * re-armed to "now + seconds"; otherwise it evaluates to 0.
+ *
+ * Previously the deadline was re-armed only in the branch that yields 0,
+ * so once the deadline had passed the macro kept yielding 1 and never
+ * limited anything.  The deadline is also kept as cfs_time_t, the type
+ * cfs_time_shift() returns and cfs_time_after() takes.
+ */
+#define CFS_RATELIMIT(seconds)					\
+({								\
+	/*							\
+	 * XXX nikita: non-portable initializer			\
+	 */							\
+	static cfs_time_t __next_message = 0;			\
+	int result;						\
+								\
+	if (cfs_time_after(cfs_time_current(), __next_message)) { \
+		__next_message = cfs_time_shift(seconds);	\
+		result = 1;					\
+	} else {						\
+		result = 0;					\
+	}							\
+	result;							\
+})
+
+/*
+ * Fill \a tv with the current time, in the manner of the kernel's
+ * do_gettimeofday(): the time is read with cfs_fs_time_current() and
+ * converted into \a tv by cfs_fs_time_usec().
+ */
+static inline void cfs_fs_timeval(struct timeval *tv)
+{
+	cfs_fs_time_t now;
+
+	cfs_fs_time_current(&now);
+	cfs_fs_time_usec(&now, tv);
+}
+
+/*
+ * Return a valid time-out based on the user supplied one.  Currently the
+ * only check is that the time-out is not shorter than allowed: anything
+ * below CFS_TICK is raised to CFS_TICK.
+ */
+static inline cfs_duration_t cfs_timeout_cap(cfs_duration_t timeout)
+{
+	return (timeout < CFS_TICK) ? (cfs_duration_t)CFS_TICK : timeout;
+}
+
+#endif

diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_workitem.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_workitem.h
new file mode 100644
index 0000000..5cc64f3
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/libcfs/libcfs_workitem.h

@@ -0,0 +1,110 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/libcfs_workitem.h
+ *
+ * Author: Isaac Huang  <he.h.huang@oracle.com>
+ *	 Liang Zhen   <zhen.liang@sun.com>
+ *
+ * A workitem is deferred work with these semantics:
+ * - a workitem always runs in thread context.
+ * - a workitem can be concurrent with other workitems but is strictly
+ *   serialized with respect to itself.
+ * - no CPU affinity, a workitem does not necessarily run on the same CPU
+ *   that schedules it. However, this might change in the future.
+ * - if a workitem is scheduled again before it has a chance to run, it
+ *   runs only once.
+ * - if a workitem is scheduled while it runs, it runs again after it
+ *   completes; this ensures that events occurring while other events are
+ *   being processed receive due attention. This behavior also allows a
+ *   workitem to reschedule itself.
+ *
+ * Usage notes:
+ * - a workitem can sleep but it should be aware of how that sleep might
+ *   affect others.
+ * - a workitem runs inside a kernel thread so there's no user space to access.
+ * - do not use a workitem if the scheduling latency can't be tolerated.
+ *
+ * When wi_action returns non-zero, it means the workitem has either been
+ * freed or reused and workitem scheduler won't touch it any more.
+ */
+
+#ifndef __LIBCFS_WORKITEM_H__
+#define __LIBCFS_WORKITEM_H__
+
+/* Opaque workitem scheduler; only handled through pointers in this header. */
+struct cfs_wi_sched;
+
+/* Destroy a scheduler obtained from cfs_wi_sched_create(). */
+void cfs_wi_sched_destroy(struct cfs_wi_sched *);
+/*
+ * Create a scheduler; the new scheduler is returned through the last
+ * argument.
+ * NOTE(review): \a cptab / \a cpt look like a CPU partition table and a
+ * partition index, \a nthrs a thread count - confirm against the
+ * implementation.
+ */
+int cfs_wi_sched_create(char *name, struct cfs_cpt_table *cptab, int cpt,
+			int nthrs, struct cfs_wi_sched **);
+
+struct cfs_workitem;
+
+/*
+ * Workitem callback.  Per the notes at the top of this file, a non-zero
+ * return means the workitem has been freed or reused and the scheduler
+ * will not touch it any more.
+ */
+typedef int (*cfs_wi_action_t) (struct cfs_workitem *);
+/* One unit of deferred work; initialise with cfs_wi_init(). */
+typedef struct cfs_workitem {
+	/** chain on runq or rerunq */
+	struct list_head       wi_list;
+	/** working function */
+	cfs_wi_action_t  wi_action;
+	/** arg for working function */
+	void	    *wi_data;
+	/** in running */
+	unsigned short   wi_running:1;
+	/** scheduled */
+	unsigned short   wi_scheduled:1;
+} cfs_workitem_t;
+
+/*
+ * Prepare \a wi for use: remember the callback \a action and its argument
+ * \a data, clear the running/scheduled bits and initialise the list
+ * linkage.
+ */
+static inline void
+cfs_wi_init(cfs_workitem_t *wi, void *data, cfs_wi_action_t action)
+{
+	wi->wi_action    = action;
+	wi->wi_data      = data;
+	wi->wi_scheduled = 0;
+	wi->wi_running   = 0;
+
+	INIT_LIST_HEAD(&wi->wi_list);
+}
+
+/*
+ * Scheduling API for a workitem on a given scheduler.
+ * NOTE(review): exact return conventions of cfs_wi_deschedule() and the
+ * difference between deschedule and exit are not visible here - confirm
+ * against the implementation.
+ */
+void cfs_wi_schedule(struct cfs_wi_sched *sched, cfs_workitem_t *wi);
+int  cfs_wi_deschedule(struct cfs_wi_sched *sched, cfs_workitem_t *wi);
+void cfs_wi_exit(struct cfs_wi_sched *sched, cfs_workitem_t *wi);
+
+/* Subsystem-wide startup and shutdown of the workitem facility. */
+int  cfs_wi_startup(void);
+void cfs_wi_shutdown(void);
+
+/** # workitem scheduler loops before reschedule */
+#define CFS_WI_RESCHED    128
+
+#endif /* __LIBCFS_WORKITEM_H__ */

diff --git a/drivers/staging/lustre/include/linux/libcfs/linux/kp30.h b/drivers/staging/lustre/include/linux/libcfs/linux/kp30.h
new file mode 100644
index 0000000..4b7ae1c
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/libcfs/linux/kp30.h

@@ -0,0 +1,286 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LIBCFS_LINUX_KP30_H__
+#define __LIBCFS_LINUX_KP30_H__
+
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <linux/kmod.h>
+#include <linux/notifier.h>
+#include <linux/fs.h>
+#include <linux/miscdevice.h>
+#include <linux/vmalloc.h>
+#include <linux/time.h>
+#include <linux/slab.h>
+#include <linux/interrupt.h>
+#include <linux/highmem.h>
+#include <linux/module.h>
+#include <linux/version.h>
+#include <asm/atomic.h>
+#include <asm/uaccess.h>
+#include <linux/rwsem.h>
+#include <linux/proc_fs.h>
+#include <linux/file.h>
+#include <linux/smp.h>
+#include <linux/ctype.h>
+#include <linux/compiler.h>
+#ifdef HAVE_MM_INLINE
+# include <linux/mm_inline.h>
+#endif
+#include <linux/kallsyms.h>
+#include <linux/moduleparam.h>
+#include <linux/scatterlist.h>
+
+#include <linux/libcfs/linux/portals_compat25.h>
+
+
+/*
+ * Initialise the work item \a wq with callback \a cb via INIT_WORK().
+ * \a cb is cast to void * before being handed over, and \a cbdata is
+ * accepted but not used by the expansion.
+ */
+#define prepare_work(wq,cb,cbdata)					    \
+do {									  \
+	INIT_WORK((wq), (void *)(cb));					\
+} while (0)
+
+/* Recover the structure of \a type whose member \a field is pointed to by
+ * \a data (thin wrapper around container_of(), note the argument order). */
+#define cfs_get_work_data(type,field,data) container_of(data,type,field)
+
+
+/* The argument is ignored; this always expands to recalc_sigpending(). */
+#define our_recalc_sigpending(current) recalc_sigpending()
+/* NOTE(review): strpbrk() only locates the first delimiter; it does not
+ * tokenise the way ISO C strtok() does - confirm callers expect that. */
+#define strtok(a,b) strpbrk(a, b)
+/* Legacy type alias for struct work_struct. */
+#define work_struct_t      struct work_struct
+
+#ifdef CONFIG_SMP
+#else
+#endif
+
+
+/* Direct access to the count member of \a sem (a pointer). */
+#define SEM_COUNT(sem)	  ((sem)->count)
+
+
+/* ------------------------------------------------------------------- */
+
+/* Symbol (un)registration hooks; both expand to nothing in this build. */
+#define PORTAL_SYMBOL_REGISTER(x)
+#define PORTAL_SYMBOL_UNREGISTER(x)
+
+
+
+
+/******************************************************************************/
+/* Module parameter support */
+/*
+ * Declare module parameter \a name of module_param() type \a type with
+ * permissions \a perm, and attach the description \a desc.  The \a t
+ * argument is not used by the expansion.  The expansion ends without a
+ * semicolon, so the user supplies the one after MODULE_PARM_DESC().
+ */
+#define CFS_MODULE_PARM(name, t, type, perm, desc) \
+	module_param(name, type, perm);\
+	MODULE_PARM_DESC(name, desc)
+
+#define CFS_SYSFS_MODULE_PARM  1 /* module parameters accessible via sysfs */
+
+/******************************************************************************/
+
+#if (__GNUC__)
+/* Use the special GNU C __attribute__ hack to have the compiler check the
+ * printf style argument string against the actual argument count and
+ * types.
+ *
+ * NOTE(review): no __attribute__ declaration is left in this section; all
+ * it does now is warn about and remove a pre-existing printf macro.
+ */
+#ifdef printf
+# warning printf has been defined as a macro...
+# undef printf
+#endif
+
+#endif /* __GNUC__ */
+
+/* User-space style output calls are redirected to CDEBUG(D_OTHER, ...);
+ * the stream argument of fprintf() is dropped. */
+# define fprintf(a, format, b...) CDEBUG(D_OTHER, format , ## b)
+# define printf(format, b...) CDEBUG(D_OTHER, format , ## b)
+/* time(a) ignores its argument and expands to CURRENT_TIME. */
+# define time(a) CURRENT_TIME
+
+# define cfs_num_present_cpus()  num_present_cpus()
+
+/******************************************************************************/
+/* Light-weight trace
+ * Support for temporary event tracing with minimal Heisenberg effect. */
+/* Light-weight tracing is compiled out: the "#if LWT_SUPPORT" section
+ * further down is disabled while this is 0. */
+#define LWT_SUPPORT  0
+
+/* 16 << 20 = 16 MiB; presumably the trace buffer budget - TODO confirm. */
+#define LWT_MEMORY   (16<<20)
+
+#ifndef KLWT_SUPPORT
+#  if !defined(BITS_PER_LONG)
+#   error "BITS_PER_LONG not defined"
+#  endif
+
+/* kernel hasn't defined this? */
+/* One trace record: a timestamp, a source location string, the task and
+ * four caller supplied values (see LWT_EVENT for how they are filled).
+ * A padding long is appended when BITS_PER_LONG > 32. */
+typedef struct {
+	long long   lwte_when;
+	char       *lwte_where;
+	void       *lwte_task;
+	long	lwte_p1;
+	long	lwte_p2;
+	long	lwte_p3;
+	long	lwte_p4;
+# if BITS_PER_LONG > 32
+	long	lwte_pad;
+# endif
+} lwt_event_t;
+#endif /* !KLWT_SUPPORT */
+
+#if LWT_SUPPORT
+#  if !KLWT_SUPPORT
+
+/* One page of trace events; pages are chained through lwtp_list
+ * (LWT_EVENT moves on to lwtp_list.next when a page is full). */
+typedef struct _lwt_page {
+	struct list_head	       lwtp_list;
+	struct page	     *lwtp_page;
+	lwt_event_t	     *lwtp_events;
+} lwt_page_t;
+
+/* Per-CPU trace cursor: the page being filled and the next slot in it. */
+typedef struct {
+	int		lwtc_current_index;
+	lwt_page_t	*lwtc_current_page;
+} lwt_cpu_t;
+
+/* Global enable switch tested by LWT_EVENT, and the cursor array that
+ * LWT_EVENT indexes with smp_processor_id(). */
+extern int       lwt_enabled;
+extern lwt_cpu_t lwt_cpus[];
+
+/* Note that we _don't_ define LWT_EVENT at all if LWT_SUPPORT isn't set.
+ * This stuff is meant for finding specific problems; it never stays in
+ * production code... */
+
+/* Helpers that build the "file:line" location string and compute how many
+ * events fit into one page. */
+#define LWTSTR(n)       #n
+#define LWTWHERE(f,l)   f ":" LWTSTR(l)
+#define LWT_EVENTS_PER_PAGE (PAGE_CACHE_SIZE / sizeof (lwt_event_t))
+
+/*
+ * Record one trace event carrying the four values p1..p4.
+ *
+ * Does nothing unless lwt_enabled is set.  With local interrupts
+ * disabled it takes the next slot of the current CPU's page, moves the
+ * cursor on to the next page in the list when the page is full, and
+ * fills in the cycle counter, source location, current task and the
+ * four values (each cast to long).
+ */
+#define LWT_EVENT(p1, p2, p3, p4)				       \
+do {								    \
+	unsigned long    flags;					 \
+	lwt_cpu_t       *cpu;					   \
+	lwt_page_t      *p;					     \
+	lwt_event_t     *e;					     \
+									\
+	if (lwt_enabled) {					      \
+		local_irq_save (flags);				 \
+									\
+		cpu = &lwt_cpus[smp_processor_id()];		    \
+		p = cpu->lwtc_current_page;			     \
+		e = &p->lwtp_events[cpu->lwtc_current_index++];	 \
+									\
+		if (cpu->lwtc_current_index >= LWT_EVENTS_PER_PAGE) {   \
+			cpu->lwtc_current_page =			\
+				list_entry (p->lwtp_list.next,      \
+						lwt_page_t, lwtp_list); \
+			cpu->lwtc_current_index = 0;		    \
+		}						       \
+									\
+		e->lwte_when  = get_cycles();			   \
+		e->lwte_where = LWTWHERE(__FILE__,__LINE__);	    \
+		e->lwte_task  = current;				\
+		e->lwte_p1    = (long)(p1);			     \
+		e->lwte_p2    = (long)(p2);			     \
+		e->lwte_p3    = (long)(p3);			     \
+		e->lwte_p4    = (long)(p4);			     \
+									\
+		local_irq_restore (flags);			      \
+	}							       \
+} while (0)
+
+#endif /* !KLWT_SUPPORT */
+
+/*
+ * Light-weight trace control API (only declared when LWT_SUPPORT is set).
+ * NOTE(review): parameter semantics are not visible in this header -
+ * confirm against the implementation before relying on them.
+ */
+extern int  lwt_init (void);
+extern void lwt_fini (void);
+extern int  lwt_lookup_string (int *size, char *knlptr,
+			       char *usrptr, int usrsize);
+extern int  lwt_control (int enable, int clear);
+extern int  lwt_snapshot (cfs_cycles_t *now, int *ncpu, int *total_size,
+			  void *user_ptr, int user_size);
+#endif /* LWT_SUPPORT */
+
+/* ------------------------------------------------------------------ */
+
+/* Integer type used for libcfs ioctl values. */
+#define IOCTL_LIBCFS_TYPE long
+
+/* Fallback for Cygwin builds that do not provide BITS_PER_LONG. */
+#ifdef __CYGWIN__
+# ifndef BITS_PER_LONG
+#   define BITS_PER_LONG 64
+# endif
+#endif
+
+/*
+ * Poison patterns (repeated 0x5a bytes) for int, long and pointer values.
+ * NOTE(review): the 64-bit constant is narrowed by the casts; for int
+ * (and for long where long is 32 bits) this relies on the compiler's
+ * truncating conversion - confirm this is acceptable on 32-bit builds.
+ */
+# define LI_POISON ((int)0x5a5a5a5a5a5a5a5a)
+# define LL_POISON ((long)0x5a5a5a5a5a5a5a5a)
+# define LP_POISON ((void *)(long)0x5a5a5a5a5a5a5a5a)
+
+/* this is a bit chunky */
+
+/* _LWORDSIZE is defined here and #undef'ed at the end of this section;
+ * nothing in between uses it. */
+#define _LWORDSIZE BITS_PER_LONG
+
+/* printf-style conversion strings for 64-bit values (long long based). */
+# define LPU64 "%llu"
+# define LPD64 "%lld"
+# define LPX64 "%#llx"
+# define LPX64i "%llx"
+# define LPO64 "%#llo"
+/* length modifier only, not a complete conversion */
+# define LPF64 "L"
+
+/*
+ * long_ptr_t & ulong_ptr_t, same to "long" for gcc
+ */
+# define LPLU "%lu"
+# define LPLD "%ld"
+# define LPLX "%#lx"
+
+/*
+ * pid_t
+ */
+# define LPPID "%d"
+
+
+#undef _LWORDSIZE
+
+/* compat macros */
+
+
+/*
+ * Provide get_cpu()/put_cpu() only when get_cpu is not already a macro.
+ * With CONFIG_PREEMPT, get_cpu() disables preemption before reading the
+ * CPU id and put_cpu() re-enables it; without it, get_cpu() just reads
+ * the CPU id and put_cpu() expands to nothing.
+ */
+#ifndef get_cpu
+# ifdef CONFIG_PREEMPT
+#  define get_cpu()  ({ preempt_disable(); smp_processor_id(); })
+#  define put_cpu()  preempt_enable()
+# else
+#  define get_cpu()  smp_processor_id()
+#  define put_cpu()
+# endif
+#else
+#endif /* get_cpu & put_cpu */
+
+/* Both expand to nothing. */
+#define INIT_CTL_NAME(a)
+#define INIT_STRATEGY(a)
+
+#endif

diff --git a/drivers/staging/lustre/include/linux/libcfs/linux/libcfs.h b/drivers/staging/lustre/include/linux/libcfs/linux/libcfs.h
new file mode 100644
index 0000000..292a3ba
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/libcfs/linux/libcfs.h

@@ -0,0 +1,125 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LIBCFS_LINUX_LIBCFS_H__
+#define __LIBCFS_LINUX_LIBCFS_H__
+
+#ifndef __LIBCFS_LIBCFS_H__
+#error Do not #include this file directly. #include <linux/libcfs/libcfs.h> instead
+#endif
+
+
+
+#include <stdarg.h>
+#include <linux/libcfs/linux/linux-cpu.h>
+#include <linux/libcfs/linux/linux-time.h>
+#include <linux/libcfs/linux/linux-mem.h>
+#include <linux/libcfs/linux/linux-prim.h>
+#include <linux/libcfs/linux/linux-lock.h>
+#include <linux/libcfs/linux/linux-fs.h>
+#include <linux/libcfs/linux/linux-tcpip.h>
+#include <linux/libcfs/linux/linux-bitops.h>
+#include <linux/libcfs/linux/linux-types.h>
+#include <linux/libcfs/linux/kp30.h>
+
+#include <asm/types.h>
+#include <linux/types.h>
+#include <asm/timex.h>
+#include <linux/sched.h> /* THREAD_SIZE */
+#include <linux/rbtree.h>
+
+/* 1/32 of the kernel thread stack size. */
+#define LUSTRE_TRACE_SIZE (THREAD_SIZE >> 5)
+
+/*
+ * Stack usage tracking.
+ *
+ * CDEBUG_STACK() is THREAD_SIZE minus the offset of the current frame
+ * within its THREAD_SIZE-aligned region (the frame address comes from
+ * __builtin_dwarf_cfa() on ia64 and __builtin_frame_address(0)
+ * elsewhere).  On x86_64 it is the constant 0 and CFS_CHECK_STACK() is a
+ * no-op.
+ *
+ * __CHECK_STACK() fires when CDEBUG_STACK() exceeds libcfs_stack: it
+ * records the new maximum in libcfs_stack, logs it as a D_WARNING
+ * message, then stores the caller's \a mask and \a cdls in \a msgdata
+ * and dumps the stack.
+ */
+#if !defined(__x86_64__)
+# ifdef  __ia64__
+#  define CDEBUG_STACK() (THREAD_SIZE -				 \
+			  ((unsigned long)__builtin_dwarf_cfa() &       \
+			   (THREAD_SIZE - 1)))
+# else
+#  define CDEBUG_STACK() (THREAD_SIZE -				 \
+			  ((unsigned long)__builtin_frame_address(0) &  \
+			   (THREAD_SIZE - 1)))
+# endif /* __ia64__ */
+
+#define __CHECK_STACK(msgdata, mask, cdls)			      \
+do {								    \
+	if (unlikely(CDEBUG_STACK() > libcfs_stack)) {		  \
+		LIBCFS_DEBUG_MSG_DATA_INIT(msgdata, D_WARNING, NULL);   \
+		libcfs_stack = CDEBUG_STACK();			  \
+		libcfs_debug_msg(msgdata,			       \
+				 "maximum lustre stack %lu\n",	  \
+				 CDEBUG_STACK());		       \
+		(msgdata)->msg_mask = mask;			     \
+		(msgdata)->msg_cdls = cdls;			     \
+		dump_stack();					   \
+	      /*panic("LBUG");*/					\
+	}							       \
+} while (0)
+#define CFS_CHECK_STACK(msgdata, mask, cdls)  __CHECK_STACK(msgdata, mask, cdls)
+#else /* __x86_64__ */
+#define CFS_CHECK_STACK(msgdata, mask, cdls) do {} while(0)
+#define CDEBUG_STACK() (0L)
+#endif /* __x86_64__ */
+
+/* initial pid  */
+#define LUSTRE_LNET_PID	  12345
+
+/* Entry/exit nesting hooks: the two statement macros are empty and the
+ * nesting level is always reported as 0. */
+#define ENTRY_NESTING_SUPPORT (1)
+#define ENTRY_NESTING   do {;} while (0)
+#define EXIT_NESTING   do {;} while (0)
+#define __current_nesting_level() (0)
+
+/**
+ * Platform specific declarations for cfs_curproc API (libcfs/curproc.h)
+ *
+ * Implementation is in linux-curproc.c
+ *
+ * CFS_CURPROC_COMM_MAX is the size in bytes of the comm member of
+ * struct task_struct.
+ */
+#define CFS_CURPROC_COMM_MAX (sizeof ((struct task_struct *)0)->comm)
+
+#include <linux/capability.h>
+
+/* long integer with size equal to pointer */
+typedef unsigned long ulong_ptr_t;
+typedef long long_ptr_t;
+
+/* Define WITH_WATCHDOG unless it has already been defined. */
+#ifndef WITH_WATCHDOG
+#define WITH_WATCHDOG
+#endif
+
+
+
+
+#endif /* __LIBCFS_LINUX_LIBCFS_H__ */

diff --git a/drivers/staging/lustre/include/linux/libcfs/linux/linux-bitops.h b/drivers/staging/lustre/include/linux/libcfs/linux/linux-bitops.h
new file mode 100644
index 0000000..43936e3
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/libcfs/linux/linux-bitops.h

@@ -0,0 +1,38 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/linux/linux-bitops.h
+ */
+#include <linux/bitops.h>

diff --git a/drivers/staging/lustre/include/linux/libcfs/linux/linux-cpu.h b/drivers/staging/lustre/include/linux/libcfs/linux/linux-cpu.h
new file mode 100644
index 0000000..224371c
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/libcfs/linux/linux-cpu.h

@@ -0,0 +1,175 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/linux/linux-cpu.h
+ *
+ * Basic library routines.
+ *
+ * Author: liang@whamcloud.com
+ */
+
+#ifndef __LIBCFS_LINUX_CPU_H__
+#define __LIBCFS_LINUX_CPU_H__
+
+#ifndef __LIBCFS_LIBCFS_H__
+#error Do not #include this file directly. #include <linux/libcfs/libcfs.h> instead
+#endif
+
+
+#include <linux/cpu.h>
+#include <linux/cpuset.h>
+#include <linux/topology.h>
+#include <linux/version.h>
+
+
+#ifdef CONFIG_SMP
+
+#define HAVE_LIBCFS_CPT
+
+/**
+ * virtual processing unit: one CPU partition, described by the set of
+ * CPUs and the set of NUMA nodes that belong to it
+ */
+struct cfs_cpu_partition {
+	/* CPUs mask for this partition */
+	cpumask_t			*cpt_cpumask;
+	/* nodes mask for this partition */
+	nodemask_t			*cpt_nodemask;
+	/* spread rotor for NUMA allocator */
+	unsigned			cpt_spread_rotor;
+};
+
+/**
+ * descriptor for CPU partitions: an array of ctb_nparts partitions plus
+ * the table-wide CPU and node masks and a CPU-to-partition lookup
+ */
+struct cfs_cpt_table {
+	/* version, reserved for hotplug */
+	unsigned			ctb_version;
+	/* spread rotor for NUMA allocator */
+	unsigned			ctb_spread_rotor;
+	/* # of CPU partitions */
+	unsigned			ctb_nparts;
+	/* partitions tables */
+	struct cfs_cpu_partition	*ctb_parts;
+	/* shadow HW CPU to CPU partition ID */
+	int				*ctb_cpu2cpt;
+	/* all cpus in this partition table */
+	cpumask_t			*ctb_cpumask;
+	/* all nodes in this partition table */
+	nodemask_t			*ctb_nodemask;
+};
+
+/*
+ * CPU topology helpers.  The void functions return their result through
+ * \a mask; the *_nsiblings() variants return a count.
+ * NOTE(review): "core" vs "ht" presumably mean CPUs sharing a core group
+ * vs hyper-thread siblings - confirm against the implementation.
+ */
+void cfs_cpu_core_siblings(int cpu, cpumask_t *mask);
+void cfs_cpu_ht_siblings(int cpu, cpumask_t *mask);
+void cfs_node_to_cpumask(int node, cpumask_t *mask);
+int cfs_cpu_core_nsiblings(int cpu);
+int cfs_cpu_ht_nsiblings(int cpu);
+
+/**
+ * comment out definitions for compatible layer
+ * #define CFS_CPU_NR			  NR_CPUS
+ *
+ * typedef cpumask_t			   cfs_cpumask_t;
+ *
+ * #define cfs_cpu_current()		   smp_processor_id()
+ * #define cfs_cpu_online(i)		   cpu_online(i)
+ * #define cfs_cpu_online_num()		num_online_cpus()
+ * #define cfs_cpu_online_for_each(i)	  for_each_online_cpu(i)
+ * #define cfs_cpu_possible_num()	      num_possible_cpus()
+ * #define cfs_cpu_possible_for_each(i)	for_each_possible_cpu(i)
+ *
+ * #ifdef CONFIG_CPUMASK_SIZE
+ * #define cfs_cpu_mask_size()		 cpumask_size()
+ * #else
+ * #define cfs_cpu_mask_size()		 sizeof(cfs_cpumask_t)
+ * #endif
+ *
+ * #define cfs_cpu_mask_set(i, mask)	   cpu_set(i, mask)
+ * #define cfs_cpu_mask_unset(i, mask)	 cpu_clear(i, mask)
+ * #define cfs_cpu_mask_isset(i, mask)	 cpu_isset(i, mask)
+ * #define cfs_cpu_mask_clear(mask)	    cpus_clear(mask)
+ * #define cfs_cpu_mask_empty(mask)	    cpus_empty(mask)
+ * #define cfs_cpu_mask_weight(mask)	   cpus_weight(mask)
+ * #define cfs_cpu_mask_first(mask)	    first_cpu(mask)
+ * #define cfs_cpu_mask_any_online(mask)      (any_online_cpu(mask) != NR_CPUS)
+ * #define cfs_cpu_mask_for_each(i, mask)      for_each_cpu_mask(i, mask)
+ * #define cfs_cpu_mask_bind(t, mask)	  set_cpus_allowed(t, mask)
+ *
+ * #ifdef HAVE_CPUMASK_COPY
+ * #define cfs_cpu_mask_copy(dst, src)	 cpumask_copy(dst, src)
+ * #else
+ * #define cfs_cpu_mask_copy(dst, src)	 memcpy(dst, src, sizeof(*src))
+ * #endif
+ *
+ * static inline void
+ * cfs_cpu_mask_of_online(cfs_cpumask_t *mask)
+ * {
+ * cfs_cpu_mask_copy(mask, &cpu_online_map);
+ * }
+ *
+ * #ifdef CONFIG_NUMA
+ *
+ * #define CFS_NODE_NR			 MAX_NUMNODES
+ *
+ * typedef nodemask_t			  cfs_node_mask_t;
+ *
+ * #define cfs_node_of_cpu(cpu)		cpu_to_node(cpu)
+ * #define cfs_node_online(i)		  node_online(i)
+ * #define cfs_node_online_num()	       num_online_nodes()
+ * #define cfs_node_online_for_each(i)	 for_each_online_node(i)
+ * #define cfs_node_possible_num()	     num_possible_nodes()
+ * #define cfs_node_possible_for_each(i)       for_each_node(i)
+ *
+ * static inline void cfs_node_to_cpumask(int node, cfs_cpumask_t *mask)
+ * {
+ * #if defined(HAVE_NODE_TO_CPUMASK)
+ *      *mask = node_to_cpumask(node);
+ * #elif defined(HAVE_CPUMASK_OF_NODE)
+ *      cfs_cpu_mask_copy(mask, cpumask_of_node(node));
+ * #else
+ * # error "Needs node_to_cpumask or cpumask_of_node"
+ * #endif
+ * }
+ *
+ * #define cfs_node_mask_set(i, mask)	  node_set(i, mask)
+ * #define cfs_node_mask_unset(i, mask)	node_clear(i, mask)
+ * #define cfs_node_mask_isset(i, mask)	node_isset(i, mask)
+ * #define cfs_node_mask_clear(mask)	   nodes_reset(mask)
+ * #define cfs_node_mask_empty(mask)	   nodes_empty(mask)
+ * #define cfs_node_mask_weight(mask)	  nodes_weight(mask)
+ * #define cfs_node_mask_for_each(i, mask)     for_each_node_mask(i, mask)
+ * #define cfs_node_mask_copy(dst, src)	memcpy(dst, src, sizeof(*src))
+ *
+ * static inline void
+ * cfs_node_mask_of_online(cfs_node_mask_t *mask)
+ * {
+ *       cfs_node_mask_copy(mask, &node_online_map);
+ * }
+ *
+ * #endif
+ */
+
+#endif /* CONFIG_SMP */
+#endif /* __LIBCFS_LINUX_CPU_H__ */

diff --git a/drivers/staging/lustre/include/linux/libcfs/linux/linux-crypto.h b/drivers/staging/lustre/include/linux/libcfs/linux/linux-crypto.h
new file mode 100644
index 0000000..97c771cf
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/libcfs/linux/linux-crypto.h

@@ -0,0 +1,49 @@
+ /*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see http://www.gnu.org/licenses
+ *
+ * Please  visit http://www.xyratex.com/contact if you need additional
+ * information or have any questions.
+ *
+ * GPL HEADER END
+ */
+
+/*
+ * Copyright 2012 Xyratex Technology Limited
+ */
+
+/**
+ * Linux crypto hash specific functions.
+ */
+
+/**
+ * Functions for start/stop shash CRC32 algorithm.
+ */
+int cfs_crypto_crc32_register(void);
+void cfs_crypto_crc32_unregister(void);
+
+/**
+ * Functions for start/stop shash adler32 algorithm.
+ */
+int cfs_crypto_adler32_register(void);
+void cfs_crypto_adler32_unregister(void);
+
+/**
+ * Functions for start/stop shash crc32 pclmulqdq
+ */
+int cfs_crypto_crc32_pclmul_register(void);
+void cfs_crypto_crc32_pclmul_unregister(void);

diff --git a/drivers/staging/lustre/include/linux/libcfs/linux/linux-fs.h b/drivers/staging/lustre/include/linux/libcfs/linux/linux-fs.h
new file mode 100644
index 0000000..eebf138
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/libcfs/linux/linux-fs.h

@@ -0,0 +1,92 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/linux/linux-fs.h
+ *
+ * Basic library routines.
+ */
+
+#ifndef __LIBCFS_LINUX_CFS_FS_H__
+#define __LIBCFS_LINUX_CFS_FS_H__
+
+#ifndef __LIBCFS_LIBCFS_H__
+#error Do not #include this file directly. #include <linux/libcfs/libcfs.h> instead
+#endif
+
+
+#include <linux/fs.h>
+#include <linux/stat.h>
+#include <linux/mount.h>
+#include <linux/backing-dev.h>
+#include <linux/posix_acl_xattr.h>
+
+#define filp_size(f)					\
+	(i_size_read((f)->f_dentry->d_inode))
+#define filp_poff(f)					\
+	(&(f)->f_pos)
+
+# define do_fsync(fp, flag)				\
+	((fp)->f_op->fsync(fp, 0, LLONG_MAX, flag))
+
+#define filp_read(fp, buf, size, pos)			\
+	((fp)->f_op->read((fp), (buf), (size), pos))
+
+#define filp_write(fp, buf, size, pos)			\
+	((fp)->f_op->write((fp), (buf), (size), pos))
+
+#define filp_fsync(fp)					\
+	do_fsync(fp, 1)
+
+#define flock_type(fl)			((fl)->fl_type)
+#define flock_set_type(fl, type)	do { (fl)->fl_type = (type); } while (0)
+#define flock_pid(fl)			((fl)->fl_pid)
+#define flock_set_pid(fl, pid)		do { (fl)->fl_pid = (pid); } while (0)
+#define flock_start(fl)			((fl)->fl_start)
+#define flock_set_start(fl, st)		do { (fl)->fl_start = (st); } while (0)
+#define flock_end(fl)			((fl)->fl_end)
+#define flock_set_end(fl, end)		do { (fl)->fl_end = (end); } while (0)
+
+#ifndef IFSHIFT
+#define IFSHIFT			12
+#endif
+
+#ifndef IFTODT
+#define IFTODT(type)		(((type) & S_IFMT) >> IFSHIFT)
+#endif
+#ifndef DTTOIF
+#define DTTOIF(dirtype)		((dirtype) << IFSHIFT)
+#endif
+
+#endif

diff --git a/drivers/staging/lustre/include/linux/libcfs/linux/linux-lock.h b/drivers/staging/lustre/include/linux/libcfs/linux/linux-lock.h
new file mode 100644
index 0000000..6fbcbf3
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/libcfs/linux/linux-lock.h

@@ -0,0 +1,204 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/linux/linux-lock.h
+ *
+ * Basic library routines.
+ */
+
+#ifndef __LIBCFS_LINUX_CFS_LOCK_H__
+#define __LIBCFS_LINUX_CFS_LOCK_H__
+
+#ifndef __LIBCFS_LIBCFS_H__
+#error Do not #include this file directly. #include <linux/libcfs/libcfs.h> instead
+#endif
+
+
+#include <linux/mutex.h>
+
+/*
+ * IMPORTANT !!!!!!!!
+ *
+ * Lock declarations are not guaranteed to be initialized, although
+ * some of them are initialized in Linux. All locks declared by
+ * CFS_DECL_* should be initialized explicitly.
+ */
+
+/*
+ * spin_lock "implementation" (use Linux kernel's primitives)
+ *
+ * - spin_lock_init(x)
+ * - spin_lock(x)
+ * - spin_lock_bh(x)
+ * - spin_lock_bh_init(x)
+ * - spin_unlock(x)
+ * - spin_unlock_bh(x)
+ * - spin_trylock(x)
+ * - spin_is_locked(x)
+ *
+ * - spin_lock_irq(x)
+ * - spin_lock_irqsave(x, f)
+ * - spin_unlock_irqrestore(x, f)
+ * - read_lock_irqsave(lock, f)
+ * - write_lock_irqsave(lock, f)
+ * - write_unlock_irqrestore(lock, f)
+ */
+
+/*
+ * spinlock "implementation"
+ */
+
+
+
+
+/*
+ * rw_semaphore "implementation" (use Linux kernel's primitives)
+ *
+ * - sema_init(x)
+ * - init_rwsem(x)
+ * - down_read(x)
+ * - up_read(x)
+ * - down_write(x)
+ * - up_write(x)
+ */
+
+
+#define fini_rwsem(s)		do {} while (0)
+
+
+/*
+ * rwlock_t "implementation" (use Linux kernel's primitives)
+ *
+ * - rwlock_init(x)
+ * - read_lock(x)
+ * - read_unlock(x)
+ * - write_lock(x)
+ * - write_unlock(x)
+ * - write_lock_bh(x)
+ * - write_unlock_bh(x)
+ *
+ * - RW_LOCK_UNLOCKED
+ */
+
+
+#ifndef DEFINE_RWLOCK
+#define DEFINE_RWLOCK(lock)	rwlock_t lock = __RW_LOCK_UNLOCKED(lock)
+#endif
+
+/*
+ * completion "implementation" (use Linux kernel's primitives)
+ *
+ * - DECLARE_COMPLETION(work)
+ * - INIT_COMPLETION(c)
+ * - COMPLETION_INITIALIZER(work)
+ * - init_completion(c)
+ * - complete(c)
+ * - wait_for_completion(c)
+ * - wait_for_completion_interruptible(c)
+ * - fini_completion(c)
+ */
+#define fini_completion(c) do { } while (0)
+
+/*
+ * semaphore "implementation" (use Linux kernel's primitives)
+ * - DEFINE_SEMAPHORE(name)
+ * - sema_init(sem, val)
+ * - up(sem)
+ * - down(sem)
+ * - down_interruptible(sem)
+ * - down_trylock(sem)
+ */
+
+/*
+ * mutex "implementation" (use Linux kernel's primitives)
+ *
+ * - DEFINE_MUTEX(name)
+ * - mutex_init(x)
+ * - mutex_lock(x)
+ * - mutex_unlock(x)
+ * - mutex_trylock(x)
+ * - mutex_is_locked(x)
+ * - mutex_destroy(x)
+ */
+
+#ifndef lockdep_set_class
+
+/**************************************************************************
+ *
+ * Lockdep "implementation". Also see liblustre.h
+ *
+ **************************************************************************/
+
+struct lock_class_key {
+	;
+};
+
+#define lockdep_set_class(lock, key) \
+	do { (void)sizeof(lock); (void)sizeof(key); } while (0)
+/* This has to be a macro, so that `subclass' can be undefined in kernels
+ * that do not support lockdep. */
+
+
+static inline void lockdep_off(void)
+{
+}
+
+static inline void lockdep_on(void)
+{
+}
+#else
+
+#endif /* lockdep_set_class */
+
+#ifndef CONFIG_DEBUG_LOCK_ALLOC
+#ifndef mutex_lock_nested
+#define mutex_lock_nested(mutex, subclass) mutex_lock(mutex)
+#endif
+
+#ifndef spin_lock_nested
+#define spin_lock_nested(lock, subclass) spin_lock(lock)
+#endif
+
+#ifndef down_read_nested
+#define down_read_nested(lock, subclass) down_read(lock)
+#endif
+
+#ifndef down_write_nested
+#define down_write_nested(lock, subclass) down_write(lock)
+#endif
+#endif /* CONFIG_DEBUG_LOCK_ALLOC */
+
+
+#endif /* __LIBCFS_LINUX_CFS_LOCK_H__ */

diff --git a/drivers/staging/lustre/include/linux/libcfs/linux/linux-mem.h b/drivers/staging/lustre/include/linux/libcfs/linux/linux-mem.h
new file mode 100644
index 0000000..042a2bc
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/libcfs/linux/linux-mem.h

@@ -0,0 +1,120 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/linux/linux-mem.h
+ *
+ * Basic library routines.
+ */
+
+#ifndef __LIBCFS_LINUX_CFS_MEM_H__
+#define __LIBCFS_LINUX_CFS_MEM_H__
+
+#ifndef __LIBCFS_LIBCFS_H__
+#error Do not #include this file directly. #include <linux/libcfs/libcfs.h> instead
+#endif
+
+
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/memcontrol.h>
+#include <linux/mm_inline.h>
+
+#define CFS_PAGE_MASK		   (~((__u64)PAGE_CACHE_SIZE-1))
+#define page_index(p)       ((p)->index)
+
+#define memory_pressure_get() (current->flags & PF_MEMALLOC)
+#define memory_pressure_set() do { current->flags |= PF_MEMALLOC; } while (0)
+#define memory_pressure_clr() do { current->flags &= ~PF_MEMALLOC; } while (0)
+
+#if BITS_PER_LONG == 32
+/*
+ * Limit to lowmem on 32-bit systems: at most 3/4 of 1 GiB, expressed in
+ * pages. The shift is parenthesized because '*' and '/' bind tighter than
+ * '<<'; without the parentheses the shift count itself would be scaled.
+ */
+#define NUM_CACHEPAGES \
+	min(num_physpages, (1UL << (30 - PAGE_CACHE_SHIFT)) * 3 / 4)
+#else
+#define NUM_CACHEPAGES num_physpages
+#endif
+
+/*
+ * In Linux there is no way to determine whether current execution context is
+ * blockable.
+ */
+#define ALLOC_ATOMIC_TRY   GFP_ATOMIC
+
+#define DECL_MMSPACE		mm_segment_t __oldfs
+#define MMSPACE_OPEN \
+	do { __oldfs = get_fs(); set_fs(get_ds());} while(0)
+#define MMSPACE_CLOSE	       set_fs(__oldfs)
+
+/*
+ * Shrinker
+ */
+
+# define SHRINKER_ARGS(sc, nr_to_scan, gfp_mask)  \
+		       struct shrinker *shrinker, \
+		       struct shrink_control *sc
+# define shrink_param(sc, var) ((sc)->var)
+
+typedef int (*shrinker_t)(SHRINKER_ARGS(sc, nr_to_scan, gfp_mask));
+
+/*
+ * Allocate a struct shrinker, point it at func with the given seek cost
+ * and register it.
+ *
+ * The structure is zero-allocated so that fields not assigned here do not
+ * reach register_shrinker() holding uninitialized heap contents.
+ *
+ * Returns the registered shrinker, or NULL if the allocation fails. The
+ * result is released with remove_shrinker().
+ */
+static inline
+struct shrinker *set_shrinker(int seek, shrinker_t func)
+{
+	struct shrinker *s;
+
+	s = kzalloc(sizeof(*s), GFP_KERNEL);
+	if (s == NULL)
+		return NULL;
+
+	s->shrink = func;
+	s->seeks = seek;
+
+	register_shrinker(s);
+
+	return s;
+}
+
+/*
+ * Unregister and free a shrinker; a NULL shrinker is ignored.
+ */
+static inline
+void remove_shrinker(struct shrinker *shrinker)
+{
+	if (shrinker != NULL) {
+		unregister_shrinker(shrinker);
+		kfree(shrinker);
+	}
+}
+
+#endif /* __LIBCFS_LINUX_CFS_MEM_H__ */

diff --git a/drivers/staging/lustre/include/linux/libcfs/linux/linux-prim.h b/drivers/staging/lustre/include/linux/libcfs/linux/linux-prim.h
new file mode 100644
index 0000000..a4963a8
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/libcfs/linux/linux-prim.h

@@ -0,0 +1,241 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/linux/linux-prim.h
+ *
+ * Basic library routines.
+ */
+
+#ifndef __LIBCFS_LINUX_CFS_PRIM_H__
+#define __LIBCFS_LINUX_CFS_PRIM_H__
+
+#ifndef __LIBCFS_LIBCFS_H__
+#error Do not #include this file directly. #include <linux/libcfs/libcfs.h> instead
+#endif
+
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/version.h>
+#include <linux/proc_fs.h>
+#include <linux/mm.h>
+#include <linux/timer.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/kthread.h>
+#include <linux/random.h>
+
+#include <linux/miscdevice.h>
+#include <linux/libcfs/linux/portals_compat25.h>
+#include <asm/div64.h>
+
+#include <linux/libcfs/linux/linux-time.h>
+
+
+/*
+ * CPU
+ */
+#ifdef for_each_possible_cpu
+#define cfs_for_each_possible_cpu(cpu) for_each_possible_cpu(cpu)
+#elif defined(for_each_cpu)
+#define cfs_for_each_possible_cpu(cpu) for_each_cpu(cpu)
+#endif
+
+#ifdef NR_CPUS
+#else
+#define NR_CPUS     1
+#endif
+
+/*
+ * cache
+ */
+
+/*
+ * IRQs
+ */
+
+
+/*
+ * Pseudo device register
+ */
+typedef struct miscdevice		psdev_t;
+
+/*
+ * Sysctl register
+ */
+typedef struct ctl_table		ctl_table_t;
+typedef struct ctl_table_header		ctl_table_header_t;
+
+#define cfs_register_sysctl_table(t, a) register_sysctl_table(t)
+
+#define DECLARE_PROC_HANDLER(name)		      \
+static int					      \
+LL_PROC_PROTO(name)				     \
+{						       \
+	DECLARE_LL_PROC_PPOS_DECL;		      \
+							\
+	return proc_call_handler(table->data, write,    \
+				 ppos, buffer, lenp,    \
+				 __##name);	     \
+}
+
+/*
+ * Symbol register
+ */
+#define cfs_symbol_register(s, p)       do {} while(0)
+#define cfs_symbol_unregister(s)	do {} while(0)
+#define cfs_symbol_get(s)	       symbol_get(s)
+#define cfs_symbol_put(s)	       symbol_put(s)
+
+typedef struct module module_t;
+
+/*
+ * Proc file system APIs
+ */
+typedef struct proc_dir_entry	   proc_dir_entry_t;
+
+/*
+ * Wait Queue
+ */
+
+
+typedef long			    cfs_task_state_t;
+
+#define CFS_DECL_WAITQ(wq)		DECLARE_WAIT_QUEUE_HEAD(wq)
+
+/*
+ * Task struct
+ */
+typedef struct task_struct	      task_t;
+#define DECL_JOURNAL_DATA	   void *journal_info
+#define PUSH_JOURNAL		do {    \
+	journal_info = current->journal_info;   \
+	current->journal_info = NULL;	   \
+	} while(0)
+#define POP_JOURNAL		 do {    \
+	current->journal_info = journal_info;   \
+	} while(0)
+
+/* Module interfaces */
+#define cfs_module(name, version, init, fini) \
+	module_init(init);		    \
+	module_exit(fini)
+
+/*
+ * Signal
+ */
+
+/*
+ * Timer
+ */
+typedef struct timer_list timer_list_t;
+
+
+#ifndef wait_event_timeout /* Only for RHEL3 2.4.21 kernel */
+/*
+ * Fallback for kernels without wait_event_timeout(): sleep in
+ * TASK_UNINTERRUPTIBLE on wq until condition is true or more than timeout
+ * jiffies have elapsed. On timeout, ret is set to the number of jiffies by
+ * which the deadline was overshot; ret is not written when the condition
+ * is met.
+ */
+#define __wait_event_timeout(wq, condition, timeout, ret)	\
+do {							     \
+	if (!(condition)) {				      \
+		wait_queue_t __wait;			     \
+		unsigned long expire;			    \
+								 \
+		init_waitqueue_entry(&__wait, current);	  \
+		expire = (timeout) + jiffies;		    \
+		add_wait_queue(&(wq), &__wait);		  \
+		for (;;) {				       \
+			set_current_state(TASK_UNINTERRUPTIBLE); \
+			if (condition)			   \
+				break;			   \
+			/* time_after() stays correct across jiffies wrap */ \
+			if (time_after(jiffies, expire)) {	\
+				ret = jiffies - expire;	  \
+				break;			   \
+			}					\
+			schedule_timeout(timeout);	       \
+		}						\
+		current->state = TASK_RUNNING;		   \
+		remove_wait_queue(&(wq), &__wait);	       \
+	}							\
+} while (0)
+/*
+   retval == 0; condition met; we're good.
+   retval > 0; timed out.
+*/
+#define cfs_waitq_wait_event_timeout(wq, condition, timeout, ret)    \
+do {								 \
+	ret = 0;						     \
+	if (!(condition))					    \
+		__wait_event_timeout(wq, condition, timeout, ret);   \
+} while (0)
+#else
+#define cfs_waitq_wait_event_timeout(wq, condition, timeout, ret)    \
+	ret = wait_event_timeout(wq, condition, timeout)
+#endif
+
+#define cfs_waitq_wait_event_interruptible_timeout(wq, c, timeout, ret) \
+	ret = wait_event_interruptible_timeout(wq, c, timeout)
+
+/*
+ * atomic
+ */
+
+
+#define cfs_atomic_add_unless(atom, a, u)    atomic_add_unless(atom, a, u)
+#define cfs_atomic_cmpxchg(atom, old, nv)    atomic_cmpxchg(atom, old, nv)
+
+/*
+ * membar
+ */
+
+
+/*
+ * interrupt
+ */
+
+
+/*
+ * might_sleep
+ */
+
+/*
+ * group_info
+ */
+typedef struct group_info group_info_t;
+
+
+/*
+ * Random bytes
+ */
+#endif

diff --git a/drivers/staging/lustre/include/linux/libcfs/linux/linux-tcpip.h b/drivers/staging/lustre/include/linux/libcfs/linux/linux-tcpip.h
new file mode 100644
index 0000000..687f33f
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/libcfs/linux/linux-tcpip.h

@@ -0,0 +1,87 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/linux/linux-tcpip.h
+ *
+ * Basic library routines.
+ */
+
+#ifndef __LIBCFS_LINUX_CFS_TCP_H__
+#define __LIBCFS_LINUX_CFS_TCP_H__
+
+#ifndef __LIBCFS_LIBCFS_H__
+#error Do not #include this file directly. #include <linux/libcfs/libcfs.h> instead
+#endif
+
+
+#include <net/sock.h>
+
+#ifndef HIPQUAD
+/* XXX Should just kill all users */
+/*
+ * Expand an address lvalue into its four bytes, most significant byte of
+ * the in-memory value first. The argument is parenthesized for macro
+ * hygiene, and the big-endian variant is spelled out instead of aliasing
+ * NIPQUAD, which this header does not define.
+ */
+#if defined(__LITTLE_ENDIAN)
+#define HIPQUAD(addr) \
+	((unsigned char *)&(addr))[3], \
+	((unsigned char *)&(addr))[2], \
+	((unsigned char *)&(addr))[1], \
+	((unsigned char *)&(addr))[0]
+#elif defined(__BIG_ENDIAN)
+#define HIPQUAD(addr) \
+	((unsigned char *)&(addr))[0], \
+	((unsigned char *)&(addr))[1], \
+	((unsigned char *)&(addr))[2], \
+	((unsigned char *)&(addr))[3]
+#else
+#error "Please fix asm/byteorder.h"
+#endif /* __LITTLE_ENDIAN */
+#endif
+
+typedef struct socket   socket_t;
+
+#define SOCK_SNDBUF(so)	 ((so)->sk->sk_sndbuf)
+#define SOCK_TEST_NOSPACE(so)   test_bit(SOCK_NOSPACE, &(so)->flags)
+
+/* Return the sk_err field of the struct sock attached to sock. */
+static inline int
+cfs_sock_error(struct socket *sock)
+{
+	return sock->sk->sk_err;
+}
+
+/* Return the sk_wmem_queued field of the struct sock attached to sock. */
+static inline int
+cfs_sock_wmem_queued(struct socket *sock)
+{
+	return sock->sk->sk_wmem_queued;
+}
+
+#define cfs_sk_sleep(sk)	sk_sleep(sk)
+
+#define DEFAULT_NET	(&init_net)
+
+#endif

diff --git a/drivers/staging/lustre/include/linux/libcfs/linux/linux-time.h b/drivers/staging/lustre/include/linux/libcfs/linux/linux-time.h
new file mode 100644
index 0000000..4a48b91
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/libcfs/linux/linux-time.h

@@ -0,0 +1,275 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/linux/linux-time.h
+ *
+ * Implementation of portable time API for Linux (kernel and user-level).
+ *
+ * Author: Nikita Danilov <nikita@clusterfs.com>
+ */
+
+#ifndef __LIBCFS_LINUX_LINUX_TIME_H__
+#define __LIBCFS_LINUX_LINUX_TIME_H__
+
+#ifndef __LIBCFS_LIBCFS_H__
+#error Do not #include this file directly. #include <linux/libcfs/libcfs.h> instead
+#endif
+
+
+/* Portable time API */
+
+/*
+ * Platform provides three opaque data-types:
+ *
+ *  cfs_time_t	represents point in time. This is internal kernel
+ *		    time rather than "wall clock". This time bears no
+ *		    relation to gettimeofday().
+ *
+ *  cfs_duration_t    represents time interval with resolution of internal
+ *		    platform clock
+ *
+ *  cfs_fs_time_t     represents instance in world-visible time. This is
+ *		    used in file-system time-stamps
+ *
+ *  cfs_time_t     cfs_time_current(void);
+ *  cfs_time_t     cfs_time_add    (cfs_time_t, cfs_duration_t);
+ *  cfs_duration_t cfs_time_sub    (cfs_time_t, cfs_time_t);
+ *  int	    cfs_impl_time_before (cfs_time_t, cfs_time_t);
+ *  int	    cfs_impl_time_before_eq(cfs_time_t, cfs_time_t);
+ *
+ *  cfs_duration_t cfs_duration_build(int64_t);
+ *
+ *  time_t	 cfs_duration_sec (cfs_duration_t);
+ *  void	   cfs_duration_usec(cfs_duration_t, struct timeval *);
+ *  void	   cfs_duration_nsec(cfs_duration_t, struct timespec *);
+ *
+ *  void	   cfs_fs_time_current(cfs_fs_time_t *);
+ *  time_t	 cfs_fs_time_sec    (cfs_fs_time_t *);
+ *  void	   cfs_fs_time_usec   (cfs_fs_time_t *, struct timeval *);
+ *  void	   cfs_fs_time_nsec   (cfs_fs_time_t *, struct timespec *);
+ *  int	    cfs_fs_time_before (cfs_fs_time_t *, cfs_fs_time_t *);
+ *  int	    cfs_fs_time_beforeq(cfs_fs_time_t *, cfs_fs_time_t *);
+ *
+ *  CFS_TIME_FORMAT
+ *  CFS_DURATION_FORMAT
+ *
+ */
+
+#define ONE_BILLION ((u_int64_t)1000000000)
+#define ONE_MILLION 1000000
+
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/version.h>
+#include <linux/time.h>
+#include <asm/div64.h>
+
+#include <linux/libcfs/linux/portals_compat25.h>
+
+/*
+ * post 2.5 kernels.
+ */
+
+#include <linux/jiffies.h>
+
+typedef struct timespec cfs_fs_time_t;
+
+/*
+ * Convert t into the struct timeval v: seconds are copied and the
+ * nanosecond part is divided down to microseconds (truncating).
+ */
+static inline void cfs_fs_time_usec(cfs_fs_time_t *t, struct timeval *v)
+{
+	v->tv_sec  = t->tv_sec;
+	v->tv_usec = t->tv_nsec / 1000;
+}
+
+static inline void cfs_fs_time_nsec(cfs_fs_time_t *t, struct timespec *s)
+{
+	*s = *t;
+}
+
+/*
+ * Internal helper function used by cfs_fs_time_before*(): flatten t into a
+ * single nanosecond count (tv_sec * ONE_BILLION + tv_nsec) so that two
+ * timestamps can be ordered with one integer comparison.
+ */
+static inline unsigned long long __cfs_fs_time_flat(cfs_fs_time_t *t)
+{
+	return (unsigned long long)t->tv_sec * ONE_BILLION + t->tv_nsec;
+}
+
+
+/*
+ * Generic kernel stuff
+ */
+
+typedef unsigned long cfs_time_t;      /* jiffies */
+typedef long cfs_duration_t;
+typedef cycles_t cfs_cycles_t;
+
+/* Return the result of time_before(t1, t2) for two cfs_time_t values. */
+static inline int cfs_time_before(cfs_time_t t1, cfs_time_t t2)
+{
+	return time_before(t1, t2);
+}
+
+/* Return the result of time_before_eq(t1, t2) for two cfs_time_t values. */
+static inline int cfs_time_beforeq(cfs_time_t t1, cfs_time_t t2)
+{
+	return time_before_eq(t1, t2);
+}
+
+static inline cfs_time_t cfs_time_current(void)
+{
+	return jiffies;
+}
+
+static inline time_t cfs_time_current_sec(void)
+{
+	return get_seconds();
+}
+
+static inline void cfs_fs_time_current(cfs_fs_time_t *t)
+{
+	*t = CURRENT_TIME;
+}
+
+static inline time_t cfs_fs_time_sec(cfs_fs_time_t *t)
+{
+	return t->tv_sec;
+}
+
+static inline int cfs_fs_time_before(cfs_fs_time_t *t1, cfs_fs_time_t *t2)
+{
+	return __cfs_fs_time_flat(t1) <  __cfs_fs_time_flat(t2);
+}
+
+static inline int cfs_fs_time_beforeq(cfs_fs_time_t *t1, cfs_fs_time_t *t2)
+{
+	return __cfs_fs_time_flat(t1) <= __cfs_fs_time_flat(t2);
+}
+
+#if 0
+static inline cfs_duration_t cfs_duration_build(int64_t nano)
+{
+#if (BITS_PER_LONG == 32)
+	/* We cannot use do_div(t, ONE_BILLION), do_div can only process
+	 * 64 bits n and 32 bits base */
+	int64_t  t = nano * HZ;
+	do_div(t, 1000);
+	do_div(t, 1000000);
+	return (cfs_duration_t)t;
+#else
+	return (nano * HZ / ONE_BILLION);
+#endif
+}
+#endif
+
+/* Convert a number of seconds into a cfs_duration_t by scaling with HZ. */
+static inline cfs_duration_t cfs_time_seconds(int seconds)
+{
+	return ((cfs_duration_t)seconds) * HZ;
+}
+
+/* Convert a cfs_duration_t into whole seconds (integer division by HZ). */
+static inline time_t cfs_duration_sec(cfs_duration_t d)
+{
+	return d / HZ;
+}
+
+/*
+ * Split duration d into whole seconds (d / HZ) and the remainder scaled
+ * to microseconds, stored in s.
+ */
+static inline void cfs_duration_usec(cfs_duration_t d, struct timeval *s)
+{
+	cfs_duration_t rem;
+
+	s->tv_sec = d / HZ;
+	rem = d - (cfs_duration_t)s->tv_sec * HZ;
+#if (BITS_PER_LONG == 32) && (HZ > 4096)
+	{
+		/*
+		 * Widen before scaling: the product must be computed in
+		 * 64 bits, otherwise it is truncated to long before it is
+		 * ever stored in t.
+		 */
+		__u64 t = (__u64)rem * ONE_MILLION;
+
+		do_div(t, HZ);
+		s->tv_usec = t;
+	}
+#else
+	s->tv_usec = (rem * ONE_MILLION) / HZ;
+#endif
+}
+
+/*
+ * Split duration d into whole seconds (d / HZ) and the remainder scaled
+ * to nanoseconds, stored in s. On 32-bit builds the division goes through
+ * do_div().
+ */
+static inline void cfs_duration_nsec(cfs_duration_t d, struct timespec *s)
+{
+	s->tv_sec = d / HZ;
+#if (BITS_PER_LONG == 32)
+	{
+		__u64 t = (d - s->tv_sec * HZ) * ONE_BILLION;
+
+		do_div(t, HZ);
+		s->tv_nsec = t;
+	}
+#else
+	s->tv_nsec = ((d - s->tv_sec * HZ) * ONE_BILLION) / HZ;
+#endif
+}
+
+#define cfs_time_current_64 get_jiffies_64
+
+static inline __u64 cfs_time_add_64(__u64 t, __u64 d)
+{
+	return t + d;
+}
+
+/*
+ * Return the current 64-bit time (cfs_time_current_64()) advanced by the
+ * given number of seconds, converted with cfs_time_seconds().
+ */
+static inline __u64 cfs_time_shift_64(int seconds)
+{
+	return cfs_time_add_64(cfs_time_current_64(),
+			       cfs_time_seconds(seconds));
+}
+
+/*
+ * Return nonzero if 64-bit time value t1 is before t2.
+ *
+ * The difference is taken in unsigned arithmetic and only then interpreted
+ * as signed, so the subtraction itself cannot overflow a signed type.
+ */
+static inline int cfs_time_before_64(__u64 t1, __u64 t2)
+{
+	return (__s64)(t2 - t1) > 0;
+}
+
+/*
+ * Return nonzero if 64-bit time value t1 is before or equal to t2.
+ *
+ * The difference is taken in unsigned arithmetic and only then interpreted
+ * as signed, so the subtraction itself cannot overflow a signed type.
+ */
+static inline int cfs_time_beforeq_64(__u64 t1, __u64 t2)
+{
+	return (__s64)(t2 - t1) >= 0;
+}
+
+
+/*
+ * One jiffy
+ */
+#define CFS_TICK		(1)
+
+#define CFS_TIME_T	      "%lu"
+#define CFS_DURATION_T	  "%ld"
+
+
+#endif /* __LIBCFS_LINUX_LINUX_TIME_H__ */
+/*
+ * Local variables:
+ * c-indentation-style: "K&R"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * fill-column: 80
+ * scroll-step: 1
+ * End:
+ */

diff --git a/drivers/staging/lustre/include/linux/libcfs/linux/linux-types.h b/drivers/staging/lustre/include/linux/libcfs/linux/linux-types.h
new file mode 100644
index 0000000..1423949
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/libcfs/linux/linux-types.h

@@ -0,0 +1,36 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/linux/linux-types.h
+ */
+#include <linux/types.h>

diff --git a/drivers/staging/lustre/include/linux/libcfs/linux/portals_compat25.h b/drivers/staging/lustre/include/linux/libcfs/linux/portals_compat25.h
new file mode 100644
index 0000000..132a4be
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/libcfs/linux/portals_compat25.h

@@ -0,0 +1,114 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LIBCFS_LINUX_PORTALS_COMPAT_H__
+#define __LIBCFS_LINUX_PORTALS_COMPAT_H__
+
+/* XXX BUG 1511 -- remove this stanza and all callers when bug 1511 is resolved */
+#if defined(SPINLOCK_DEBUG) && SPINLOCK_DEBUG
+#  define SIGNAL_MASK_ASSERT() \
+   LASSERT(current->sighand->siglock.magic == SPINLOCK_MAGIC)
+#else /* SPINLOCK_DEBUG unset or 0: the assertion expands to nothing */
+# define SIGNAL_MASK_ASSERT()
+#endif
+/* XXX BUG 1511 -- end of the stanza to remove when bug 1511 is resolved */
+
+#define SIGNAL_MASK_LOCK(task, flags)	/* irqsave-lock (task)'s siglock */ \
+	spin_lock_irqsave(&(task)->sighand->siglock, flags)
+#define SIGNAL_MASK_UNLOCK(task, flags)	/* undo SIGNAL_MASK_LOCK */	\
+	spin_unlock_irqrestore(&(task)->sighand->siglock, flags)
+#define USERMODEHELPER(path, argv, envp) /* NOTE(review): confirm wait=1 */ \
+	call_usermodehelper(path, argv, envp, 1)
+#define clear_tsk_thread_flag(current, TIF_SIGPENDING)	  clear_tsk_thread_flag(current,       \
+							TIF_SIGPENDING) /* expands to a call of the same name with the same two arguments */
+# define smp_num_cpus	      num_online_cpus() /* alias for num_online_cpus() */
+
+#define cfs_wait_event_interruptible(wq, condition, ret)	       \
+	ret = wait_event_interruptible(wq, condition) /* bare assignment into ret */
+#define cfs_wait_event_interruptible_exclusive(wq, condition, ret)     \
+	ret = wait_event_interruptible_exclusive(wq, condition) /* bare assignment into ret */
+
+#define THREAD_NAME(comm, len, fmt, a...)			      \
+	snprintf(comm, len, fmt, ## a) /* formats into comm, at most len bytes */
+
+/* 2.6 alloc_page users can use page->lru */
+#define PAGE_LIST_ENTRY lru
+#define PAGE_LIST(page) ((page)->lru)
+
+#ifndef __user
+#define __user /* empty fallback when __user is not already a macro */
+#endif
+
+#ifndef __fls /* only detects __fls when it is a preprocessor macro */
+#define __cfs_fls fls /* NOTE(review): fls() and __fls() may number bits differently -- confirm callers */
+#else
+#define __cfs_fls __fls
+#endif
+
+/* proc_* wrappers: @filp is accepted but dropped; no trailing ';' so they work as expressions */
+#define ll_proc_dointvec(table, write, filp, buffer, lenp, ppos)	\
+	proc_dointvec(table, write, buffer, lenp, ppos)
+#define ll_proc_dolongvec(table, write, filp, buffer, lenp, ppos)	\
+	proc_doulongvec_minmax(table, write, buffer, lenp, ppos)
+#define ll_proc_dostring(table, write, filp, buffer, lenp, ppos)	\
+	proc_dostring(table, write, buffer, lenp, ppos)
+#define LL_PROC_PROTO(name)	/* declarator only; caller writes the return type */ \
+	name(ctl_table_t *table, int write,		      \
+	     void __user *buffer, size_t *lenp, loff_t *ppos)
+#define DECLARE_LL_PROC_PPOS_DECL
+
+/* helper for sysctl handlers (declaration only); note @handler takes pos and len by value */
+int proc_call_handler(void *data, int write,
+		      loff_t *ppos, void *buffer, size_t *lenp,
+		      int (*handler)(void *data, int write,
+				     loff_t pos, void *buffer, int len));
+/*
+ * CPU iteration: use for_each_possible_cpu when it is a macro, else for_each_cpu
+ */
+#ifdef for_each_possible_cpu
+#define cfs_for_each_possible_cpu(cpu) for_each_possible_cpu(cpu)
+#elif defined(for_each_cpu)
+#define cfs_for_each_possible_cpu(cpu) for_each_cpu(cpu)
+#endif /* neither macro defined: cfs_for_each_possible_cpu stays undefined */
+
+/* Fall back to a single CPU when NR_CPUS is not already defined. */
+#ifndef NR_CPUS
+#define NR_CPUS     1
+#endif
+
+#define cfs_register_sysctl_table(t, a) register_sysctl_table(t) /* second argument is ignored */
+
+#endif /* __LIBCFS_LINUX_PORTALS_COMPAT_H__ */

diff --git a/drivers/staging/lustre/include/linux/libcfs/lucache.h b/drivers/staging/lustre/include/linux/libcfs/lucache.h
new file mode 100644
index 0000000..7ae36fc
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/libcfs/lucache.h

@@ -0,0 +1,162 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LUCACHE_H
+#define _LUCACHE_H
+
+#include <linux/libcfs/libcfs.h>
+
+/** \defgroup ucache ucache
+ *
+ * @{
+ */
+
+/* State bits kept in an entry's ue_flags; UC_CACHE_IS_VALID() means no bit is set. */
+#define UC_CACHE_NEW	    0x01
+#define UC_CACHE_ACQUIRING      0x02
+#define UC_CACHE_INVALID	0x04
+#define UC_CACHE_EXPIRED	0x08
+/* Tests: yield the masked bit (non-zero when set), not a normalized 0/1. */
+#define UC_CACHE_IS_NEW(i)	  ((i)->ue_flags & UC_CACHE_NEW)
+#define UC_CACHE_IS_INVALID(i)      ((i)->ue_flags & UC_CACHE_INVALID)
+#define UC_CACHE_IS_ACQUIRING(i)    ((i)->ue_flags & UC_CACHE_ACQUIRING)
+#define UC_CACHE_IS_EXPIRED(i)      ((i)->ue_flags & UC_CACHE_EXPIRED)
+#define UC_CACHE_IS_VALID(i)	((i)->ue_flags == 0)
+/* Mutators: fully parenthesized so they bind correctly inside larger expressions. */
+#define UC_CACHE_SET_NEW(i)	 ((i)->ue_flags |= UC_CACHE_NEW)
+#define UC_CACHE_SET_INVALID(i)     ((i)->ue_flags |= UC_CACHE_INVALID)
+#define UC_CACHE_SET_ACQUIRING(i)   ((i)->ue_flags |= UC_CACHE_ACQUIRING)
+#define UC_CACHE_SET_EXPIRED(i)     ((i)->ue_flags |= UC_CACHE_EXPIRED)
+#define UC_CACHE_SET_VALID(i)       ((i)->ue_flags = 0)
+#define UC_CACHE_CLEAR_NEW(i)       ((i)->ue_flags &= ~UC_CACHE_NEW)
+#define UC_CACHE_CLEAR_ACQUIRING(i) ((i)->ue_flags &= ~UC_CACHE_ACQUIRING)
+#define UC_CACHE_CLEAR_INVALID(i)   ((i)->ue_flags &= ~UC_CACHE_INVALID)
+#define UC_CACHE_CLEAR_EXPIRED(i)   ((i)->ue_flags &= ~UC_CACHE_EXPIRED)
+
+struct upcall_cache_entry; /* forward declaration, needed by md_identity */
+/* NOTE(review): presumably a permission word associated with one NID -- confirm. */
+struct md_perm {
+	lnet_nid_t      mp_nid;
+	__u32	   mp_perm;
+};
+/* Identity payload embedded in upcall_cache_entry as u.identity. */
+struct md_identity {
+	struct upcall_cache_entry *mi_uc_entry;
+	uid_t		      mi_uid;
+	gid_t		      mi_gid;
+	group_info_t	  *mi_ginfo;
+	int			mi_nperms; /* presumably count of mi_perms -- confirm */
+	struct md_perm	    *mi_perms;
+};
+/* One cache entry; the UC_CACHE_* macros operate on its ue_flags field. */
+struct upcall_cache_entry {
+	struct list_head	      ue_hash;
+	__u64		   ue_key;
+	atomic_t	    ue_refcount;
+	int		     ue_flags;
+	wait_queue_head_t	     ue_waitq;
+	cfs_time_t	      ue_acquire_expire;
+	cfs_time_t	      ue_expire;
+	union {
+		struct md_identity     identity;
+	} u;
+};
+
+#define UC_CACHE_HASH_SIZE	(128) /* must stay a power of two for the mask below */
+#define UC_CACHE_HASH_INDEX(id)   ((id) & (UC_CACHE_HASH_SIZE - 1))
+#define UC_CACHE_UPCALL_MAXPATH   (1024UL) /* size of upcall_cache::uc_upcall */
+
+struct upcall_cache;
+/* Callback table; upcall_cache::uc_ops points at one of these. */
+struct upcall_cache_ops {
+	void	    (*init_entry)(struct upcall_cache_entry *, void *args);
+	void	    (*free_entry)(struct upcall_cache *,
+				      struct upcall_cache_entry *);
+	int	     (*upcall_compare)(struct upcall_cache *,
+					  struct upcall_cache_entry *,
+					  __u64 key, void *args);
+	int	     (*downcall_compare)(struct upcall_cache *,
+					    struct upcall_cache_entry *,
+					    __u64 key, void *args);
+	int	     (*do_upcall)(struct upcall_cache *,
+				     struct upcall_cache_entry *);
+	int	     (*parse_downcall)(struct upcall_cache *,
+					  struct upcall_cache_entry *, void *);
+};
+/* Cache instance: hash buckets, locks, upcall name/path and expiry settings. */
+struct upcall_cache {
+	struct list_head		uc_hashtable[UC_CACHE_HASH_SIZE];
+	spinlock_t		uc_lock;
+	rwlock_t		uc_upcall_rwlock;
+
+	char			uc_name[40];		/* for upcall */
+	char			uc_upcall[UC_CACHE_UPCALL_MAXPATH];
+	int			uc_acquire_expire;	/* seconds */
+	int			uc_entry_expire;	/* seconds */
+	struct upcall_cache_ops	*uc_ops;
+};
+
+struct upcall_cache_entry *upcall_cache_get_entry(struct upcall_cache *cache,
+						  __u64 key, void *args);
+void upcall_cache_put_entry(struct upcall_cache *cache,
+			    struct upcall_cache_entry *entry);
+int upcall_cache_downcall(struct upcall_cache *cache, __u32 err, __u64 key,
+			  void *args);
+void upcall_cache_flush_idle(struct upcall_cache *cache);
+void upcall_cache_flush_all(struct upcall_cache *cache);
+void upcall_cache_flush_one(struct upcall_cache *cache, __u64 key, void *args);
+struct upcall_cache *upcall_cache_init(const char *name, const char *upcall,
+				       struct upcall_cache_ops *ops);
+void upcall_cache_cleanup(struct upcall_cache *cache);
+/* Compiled out: alternative prototypes, some with different parameter lists. */
+#if 0
+struct upcall_cache_entry *upcall_cache_get_entry(struct upcall_cache *hash,
+						  __u64 key, __u32 primary,
+						  __u32 ngroups, __u32 *groups);
+void upcall_cache_put_entry(struct upcall_cache *hash,
+			    struct upcall_cache_entry *entry);
+int upcall_cache_downcall(struct upcall_cache *hash, __u32 err, __u64 key,
+			  __u32 primary, __u32 ngroups, __u32 *groups);
+void upcall_cache_flush_idle(struct upcall_cache *cache);
+void upcall_cache_flush_all(struct upcall_cache *cache);
+struct upcall_cache *upcall_cache_init(const char *name);
+void upcall_cache_cleanup(struct upcall_cache *hash);
+
+#endif /* 0 */
+
+/** @} ucache */
+
+#endif /* _LUCACHE_H */

diff --git a/drivers/staging/lustre/include/linux/libcfs/params_tree.h b/drivers/staging/lustre/include/linux/libcfs/params_tree.h
new file mode 100644
index 0000000..3f18a44
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/libcfs/params_tree.h

@@ -0,0 +1,166 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * API and structure definitions for params_tree.
+ *
+ * Author: LiuYing <emoly.liu@oracle.com>
+ */
+#ifndef __PARAMS_TREE_H__
+#define __PARAMS_TREE_H__
+
+#include <linux/libcfs/libcfs.h>
+
+#undef LPROCFS
+#if  defined(CONFIG_PROC_FS)
+# define LPROCFS /* set only when CONFIG_PROC_FS is defined */
+#endif
+
+#ifdef LPROCFS
+typedef struct file			     cfs_param_file_t; /* LPROCFS: cfs_* names alias kernel types */
+typedef struct inode			    cfs_inode_t;
+typedef struct proc_inode		       cfs_proc_inode_t;
+typedef struct seq_file			 cfs_seq_file_t;
+typedef struct seq_operations		   cfs_seq_ops_t;
+typedef struct file_operations		  cfs_param_file_ops_t;
+typedef module_t			   *cfs_param_module_t;
+typedef struct proc_dir_entry		   cfs_param_dentry_t;
+typedef struct poll_table_struct		cfs_poll_table_t;
+#define CFS_PARAM_MODULE			THIS_MODULE /* LPROCFS: thin wrappers over procfs/seq_file */
+#define cfs_file_private(file)		  ((file)->private_data)
+#define cfs_dentry_data(dentry)		 ((dentry)->data)
+#define cfs_proc_inode_pde(proc_inode)	  ((proc_inode)->pde)
+#define cfs_proc_inode(proc_inode)	      ((proc_inode)->vfs_inode)
+#define cfs_seq_read_common		     seq_read
+#define cfs_seq_lseek_common		    seq_lseek
+#define cfs_seq_private(seq)		    ((seq)->private)
+#define cfs_seq_printf(seq, format, ...)	seq_printf(seq, format,  \
+							   ## __VA_ARGS__)
+#define cfs_seq_release(inode, file)	    seq_release(inode, file)
+#define cfs_seq_puts(seq, s)		    seq_puts(seq, s)
+#define cfs_seq_putc(seq, s)		    seq_putc(seq, s)
+#define cfs_seq_read(file, buf, count, ppos, rc) ((rc) = seq_read(file, buf, \
+							    count, ppos))
+#define cfs_seq_open(file, ops, rc)	     ((rc) = seq_open(file, ops))
+
+#else /* !LPROCFS */
+
+typedef struct cfs_params_file { /* file stand-in used when LPROCFS is not defined */
+	void	   *param_private;
+	loff_t	  param_pos;
+	unsigned int    param_flags;
+} cfs_param_file_t;
+/* Inode stand-in: carries only a private pointer. */
+typedef struct cfs_param_inode {
+	void    *param_private;
+} cfs_inode_t;
+/* Dentry stand-in; cfs_dentry_data() reads param_data. */
+typedef struct cfs_param_dentry {
+	void *param_data;
+} cfs_param_dentry_t;
+/* Pairs a dentry pointer with an embedded inode; FAKE_PROC_I() maps the inode back. */
+typedef struct cfs_proc_inode {
+	cfs_param_dentry_t *param_pde;
+	cfs_inode_t	 param_inode;
+} cfs_proc_inode_t;
+/* seq_file stand-in; op points at a cfs_seq_operations table. */
+struct cfs_seq_operations;
+typedef struct cfs_seq_file {
+	char		      *buf;
+	size_t		     size;
+	size_t		     from;
+	size_t		     count;
+	loff_t		     index;
+	loff_t		     version;
+	struct mutex			lock;
+	struct cfs_seq_operations *op;
+	void		      *private;
+} cfs_seq_file_t;
+
+typedef struct cfs_seq_operations { /* iterator callbacks: start/stop/next/show */
+	void *(*start) (cfs_seq_file_t *m, loff_t *pos);
+	void  (*stop) (cfs_seq_file_t *m, void *v);
+	void *(*next) (cfs_seq_file_t *m, void *v, loff_t *pos);
+	int   (*show) (cfs_seq_file_t *m, void *v);
+} cfs_seq_ops_t;
+/* Without LPROCFS the module and poll-table handles are plain void pointers. */
+typedef void *cfs_param_module_t;
+typedef void *cfs_poll_table_t;
+/* NOTE(review): most hooks take struct file * but release takes cfs_param_file_t * -- confirm intent. */
+typedef struct cfs_param_file_ops {
+	cfs_param_module_t owner;
+	int (*open) (cfs_inode_t *, struct file *);
+	loff_t (*llseek)(struct file *, loff_t, int);
+	int (*release) (cfs_inode_t *, cfs_param_file_t *);
+	unsigned int (*poll) (struct file *, cfs_poll_table_t *);
+	ssize_t (*write) (struct file *, const char *, size_t, loff_t *);
+	ssize_t (*read)(struct file *, char *, size_t, loff_t *);
+} cfs_param_file_ops_t;
+typedef cfs_param_file_ops_t *cfs_lproc_filep_t;
+
+static inline cfs_proc_inode_t *FAKE_PROC_I(const cfs_inode_t *inode)
+{
+	return container_of(inode, cfs_proc_inode_t, param_inode); /* inode is the embedded param_inode member */
+}
+
+#define CFS_PARAM_MODULE			NULL /* !LPROCFS: accessors for the stand-in types */
+#define cfs_file_private(file)		  ((file)->param_private)
+#define cfs_dentry_data(dentry)		 ((dentry)->param_data)
+#define cfs_proc_inode(proc_inode)	      ((proc_inode)->param_inode)
+#define cfs_proc_inode_pde(proc_inode)	  ((proc_inode)->param_pde)
+#define cfs_seq_read_common		     NULL
+#define cfs_seq_lseek_common		    NULL
+#define cfs_seq_private(seq)		    ((seq)->private)
+#define cfs_seq_read(file, buf, count, ppos, rc) do {} while (0) /* NOTE(review): rc is left unset */
+#define cfs_seq_open(file, ops, rc)		     \
+do {	/* reuse or allocate file's cfs_seq_file_t, zero it, attach ops */ \
+	 cfs_seq_file_t *__sf = cfs_file_private(file);    \
+	 if (!__sf) {				      \
+		LIBCFS_ALLOC(__sf, sizeof(*__sf));	    \
+		if (!__sf) {			       \
+			(rc) = -ENOMEM;		   \
+			break;			  \
+		}				       \
+		cfs_file_private(file) = __sf;	     \
+	}					       \
+	memset(__sf, 0, sizeof(*__sf));		       \
+	__sf->op = (ops);			    \
+	(rc) = 0;				 \
+} while (0)
+
+#endif /* LPROCFS */
+
+/* XXX: params_tree APIs */
+
+#endif  /* __PARAMS_TREE_H__ */

diff --git a/drivers/staging/lustre/include/linux/lnet/api-support.h b/drivers/staging/lustre/include/linux/lnet/api-support.h
new file mode 100644
index 0000000..a8d91db
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/lnet/api-support.h

@@ -0,0 +1,44 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LNET_API_SUPPORT_H__
+#define __LNET_API_SUPPORT_H__
+
+#include <linux/lnet/linux/api-support.h>
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/types.h>
+#include <linux/lnet/lnet.h>
+
+#endif

diff --git a/drivers/staging/lustre/include/linux/lnet/api.h b/drivers/staging/lustre/include/linux/lnet/api.h
new file mode 100644
index 0000000..e8642e3
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/lnet/api.h

@@ -0,0 +1,220 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LNET_API_H__
+#define __LNET_API_H__
+
+/** \defgroup lnet LNet
+ *
+ * The Lustre Networking subsystem.
+ *
+ * LNet is an asynchronous message-passing API, which provides an unreliable
+ * connectionless service that can't guarantee any order. It supports OFA IB,
+ * TCP/IP, and Cray Portals, and routes between heterogeneous networks.
+ *
+ * LNet can run both in OS kernel space and in userspace as a library.
+ * @{
+ */
+
+#include <linux/lnet/types.h>
+
+/** \defgroup lnet_init_fini Initialization and cleanup
+ * The LNet must be properly initialized before any LNet calls can be made.
+ * @{ */
+int LNetInit(void);
+void LNetFini(void);
+
+int LNetNIInit(lnet_pid_t requested_pid);
+int LNetNIFini(void);
+/** @} lnet_init_fini */
+
+/** \defgroup lnet_addr LNet addressing and basic types
+ *
+ * Addressing scheme and basic data types of LNet.
+ *
+ * The LNet API is memory-oriented, so LNet must be able to address not only
+ * end-points but also memory regions within a process address space.
+ * An ::lnet_nid_t addresses an end-point. An ::lnet_pid_t identifies a process
+ * in a node. A portal represents an opening in the address space of a
+ * process. Match bits are criteria to identify a region of memory inside a
+ * portal, and offset specifies an offset within the memory region.
+ *
+ * LNet creates a table of portals for each process during initialization.
+ * This table has MAX_PORTALS entries and its size can't be dynamically
+ * changed. A portal stays empty until the owning process starts to add
+ * memory regions to it. A portal is sometimes called an index because
+ * it's an entry in the portals table of a process.
+ *
+ * \see LNetMEAttach
+ * @{ */
+int LNetGetId(unsigned int index, lnet_process_id_t *id);
+int LNetDist(lnet_nid_t nid, lnet_nid_t *srcnid, __u32 *order);
+void LNetSnprintHandle(char *str, int str_len, lnet_handle_any_t handle);
+
+/** @} lnet_addr */
+
+
+/** \defgroup lnet_me Match entries
+ *
+ * A match entry (abbreviated as ME) describes a set of criteria to accept
+ * incoming requests.
+ *
+ * A portal is essentially a match list plus a set of attributes. A match
+ * list is a chain of MEs. Each ME includes a pointer to a memory descriptor
+ * and a set of match criteria. The match criteria can be used to reject
+ * incoming requests based on process ID or the match bits provided in the
+ * request. MEs can be dynamically inserted into a match list by LNetMEAttach()
+ * and LNetMEInsert(), and removed from its list by LNetMEUnlink().
+ * @{ */
+int LNetMEAttach(unsigned int      portal,
+		 lnet_process_id_t match_id_in,
+		 __u64	     match_bits_in,
+		 __u64	     ignore_bits_in,
+		 lnet_unlink_t     unlink_in,
+		 lnet_ins_pos_t    pos_in,
+		 lnet_handle_me_t *handle_out);
+
+int LNetMEInsert(lnet_handle_me_t  current_in,
+		 lnet_process_id_t match_id_in,
+		 __u64	     match_bits_in,
+		 __u64	     ignore_bits_in,
+		 lnet_unlink_t     unlink_in,
+		 lnet_ins_pos_t    position_in,
+		 lnet_handle_me_t *handle_out);
+
+int LNetMEUnlink(lnet_handle_me_t current_in);
+/** @} lnet_me */
+
+/** \defgroup lnet_md Memory descriptors
+ *
+ * A memory descriptor contains information about a region of a user's
+ * memory (either in kernel or user space) and optionally points to an
+ * event queue where information about the operations performed on the
+ * memory descriptor is recorded. Memory descriptor is abbreviated as
+ * MD and can be used interchangeably with the memory region it describes.
+ *
+ * The LNet API provides two operations to create MDs: LNetMDAttach()
+ * and LNetMDBind(); one operation to unlink and release the resources
+ * associated with a MD: LNetMDUnlink().
+ * @{ */
+int LNetMDAttach(lnet_handle_me_t  current_in,
+		 lnet_md_t	 md_in,
+		 lnet_unlink_t     unlink_in,
+		 lnet_handle_md_t *handle_out);
+
+int LNetMDBind(lnet_md_t	 md_in,
+	       lnet_unlink_t     unlink_in,
+	       lnet_handle_md_t *handle_out);
+
+int LNetMDUnlink(lnet_handle_md_t md_in);
+/** @} lnet_md */
+
+/** \defgroup lnet_eq Events and event queues
+ *
+ * Event queues (abbreviated as EQ) are used to log operations performed on
+ * local MDs. In particular, they signal the completion of a data transmission
+ * into or out of a MD. They can also be used to hold acknowledgments for
+ * completed PUT operations and indicate when a MD has been unlinked. Multiple
+ * MDs can share a single EQ. An EQ may have an optional event handler
+ * associated with it. If an event handler exists, it will be run for each
+ * event that is deposited into the EQ.
+ *
+ * In addition to the lnet_handle_eq_t, the LNet API defines two types
+ * associated with events: The ::lnet_event_kind_t defines the kinds of events
+ * that can be stored in an EQ. The lnet_event_t defines a structure that
+ * holds the information about an event.
+ *
+ * There are five functions for dealing with EQs: LNetEQAlloc() is used to
+ * create an EQ and allocate the resources needed, while LNetEQFree()
+ * releases these resources and frees the EQ. LNetEQGet() retrieves the next
+ * event from an EQ, and LNetEQWait() can be used to block a process until
+ * an EQ has at least one event. LNetEQPoll() can be used to test or wait
+ * on multiple EQs.
+ * @{ */
+int LNetEQAlloc(unsigned int       count_in,
+		lnet_eq_handler_t  handler,
+		lnet_handle_eq_t  *handle_out);
+
+int LNetEQFree(lnet_handle_eq_t eventq_in);
+
+int LNetEQGet(lnet_handle_eq_t  eventq_in,
+	      lnet_event_t     *event_out);
+
+
+int LNetEQWait(lnet_handle_eq_t  eventq_in,
+	       lnet_event_t     *event_out);
+
+int LNetEQPoll(lnet_handle_eq_t *eventqs_in,
+	       int	       neq_in,
+	       int	       timeout_ms,
+	       lnet_event_t     *event_out,
+	       int	      *which_eq_out);
+/** @} lnet_eq */
+
+/** \defgroup lnet_data Data movement operations
+ *
+ * The LNet API provides two data movement operations: LNetPut()
+ * and LNetGet().
+ * @{ */
+int LNetPut(lnet_nid_t	self,
+	    lnet_handle_md_t  md_in,
+	    lnet_ack_req_t    ack_req_in,
+	    lnet_process_id_t target_in,
+	    unsigned int      portal_in,
+	    __u64	     match_bits_in,
+	    unsigned int      offset_in,
+	    __u64	     hdr_data_in);
+
+int LNetGet(lnet_nid_t	self,
+	    lnet_handle_md_t  md_in,
+	    lnet_process_id_t target_in,
+	    unsigned int      portal_in,
+	    __u64	     match_bits_in,
+	    unsigned int      offset_in);
+/** @} lnet_data */
+
+
+/** \defgroup lnet_misc Miscellaneous operations.
+ * Miscellaneous operations.
+ * @{ */
+
+int LNetSetLazyPortal(int portal);
+int LNetClearLazyPortal(int portal);
+int LNetCtl(unsigned int cmd, void *arg);
+int LNetSetAsync(lnet_process_id_t id, int nasync);
+
+/** @} lnet_misc */
+
+/** @} lnet */
+#endif

diff --git a/drivers/staging/lustre/include/linux/lnet/lib-lnet.h b/drivers/staging/lustre/include/linux/lnet/lib-lnet.h
new file mode 100644
index 0000000..59bff0b
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/lnet/lib-lnet.h

@@ -0,0 +1,874 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/include/lnet/lib-lnet.h
+ *
+ * Top level include for library side routines
+ */
+
+#ifndef __LNET_LIB_LNET_H__
+#define __LNET_LIB_LNET_H__
+
+#include <linux/lnet/linux/lib-lnet.h>
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/types.h>
+#include <linux/lnet/lnet.h>
+#include <linux/lnet/lib-types.h>
+
+extern lnet_t  the_lnet;			/* THE network */
+
+#if  defined(LNET_USE_LIB_FREELIST)
+/* 1 CPT, simplify implementation... */
+# define LNET_CPT_MAX_BITS      0
+
+#else /* KERNEL and no freelist */
+
+# if (BITS_PER_LONG == 32)
+/* 2 CPTs; allowing more CPTs might put us under memory pressure */
+#  define LNET_CPT_MAX_BITS     1
+
+# else /* 64-bit system */
+/*
+ * 256 CPTs for thousands of CPUs; allowing more CPTs might put us
+ * at risk of consuming all of lh_cookie.
+ */
+#  define LNET_CPT_MAX_BITS     8
+# endif /* BITS_PER_LONG == 32 */
+#endif
+
+/* max allowed CPT number */
+#define LNET_CPT_MAX	    (1 << LNET_CPT_MAX_BITS)
+
+#define LNET_CPT_NUMBER	 (the_lnet.ln_cpt_number)
+#define LNET_CPT_BITS	   (the_lnet.ln_cpt_bits)
+#define LNET_CPT_MASK	   ((1ULL << LNET_CPT_BITS) - 1)
+
+/** exclusive lock */
+#define LNET_LOCK_EX	    CFS_PERCPT_LOCK_EX
+
+/* Return non-zero iff both the interface cookie and the object cookie of
+ * @wh equal LNET_WIRE_HANDLE_COOKIE_NONE. */
+static inline int lnet_is_wire_handle_none(lnet_handle_wire_t *wh)
+{
+	if (wh->wh_interface_cookie != LNET_WIRE_HANDLE_COOKIE_NONE)
+		return 0;
+
+	return wh->wh_object_cookie == LNET_WIRE_HANDLE_COOKIE_NONE;
+}
+
+/* Return non-zero when @md can take no more operations: either its
+ * threshold has dropped to 0, or LNET_MD_MAX_SIZE is set and another
+ * md_max_size bytes past md_offset would exceed md_length. */
+static inline int lnet_md_exhausted(lnet_libmd_t *md)
+{
+	if (md->md_threshold == 0)
+		return 1;
+
+	if ((md->md_options & LNET_MD_MAX_SIZE) == 0)
+		return 0;
+
+	return md->md_offset + md->md_max_size > md->md_length;
+}
+
+/* Decide whether @md should be unlinked now.
+ *
+ * An MD is unlinkable only once its refcount is 0, and then if either:
+ *  - it has been flagged as a zombie (by auto unlink or LNetM[DE]Unlink;
+ *    in the latter case the MD may not be exhausted), or
+ *  - auto unlink is enabled and the MD is exhausted.
+ */
+static inline int lnet_md_unlinkable(lnet_libmd_t *md)
+{
+	int zombie      = (md->md_flags & LNET_MD_FLAG_ZOMBIE) != 0;
+	int auto_unlink = (md->md_flags & LNET_MD_FLAG_AUTO_UNLINK) != 0;
+
+	if (md->md_refcount != 0)
+		return 0;
+
+	return zombie || (auto_unlink && lnet_md_exhausted(md));
+}
+
+#define lnet_cpt_table()	(the_lnet.ln_cpt_table)
+#define lnet_cpt_current()	cfs_cpt_current(the_lnet.ln_cpt_table, 1)
+
+/* Extract the CPT index encoded in @cookie (the bits just above the
+ * LNET_COOKIE_TYPE_BITS type field, masked by LNET_CPT_MASK). */
+static inline int
+lnet_cpt_of_cookie(__u64 cookie)
+{
+	unsigned int cpt = (cookie >> LNET_COOKIE_TYPE_BITS) & LNET_CPT_MASK;
+
+	/* LNET_CPT_NUMBER doesn't have to be a power of 2, so an invalid
+	 * cookie can yield an out-of-range cpt; fold it back into range */
+	if (cpt >= LNET_CPT_NUMBER)
+		cpt %= LNET_CPT_NUMBER;
+
+	return cpt;
+}
+
+/* Acquire the_lnet.ln_res_lock for @cpt through cfs_percpt_lock(). */
+static inline void
+lnet_res_lock(int cpt)
+{
+	cfs_percpt_lock(the_lnet.ln_res_lock, cpt);
+}
+
+/* Release the_lnet.ln_res_lock for @cpt through cfs_percpt_unlock(). */
+static inline void
+lnet_res_unlock(int cpt)
+{
+	cfs_percpt_unlock(the_lnet.ln_res_lock, cpt);
+}
+
+/* Take the resource lock for the CPT reported by lnet_cpt_current() and
+ * return that CPT, so it can later be handed to lnet_res_unlock(). */
+static inline int
+lnet_res_lock_current(void)
+{
+	int cpt = lnet_cpt_current();
+
+	lnet_res_lock(cpt);
+	return cpt;
+}
+
+/* Acquire the_lnet.ln_net_lock for @cpt through cfs_percpt_lock().
+ * LNET_LOCK() in this header calls this with LNET_LOCK_EX. */
+static inline void
+lnet_net_lock(int cpt)
+{
+	cfs_percpt_lock(the_lnet.ln_net_lock, cpt);
+}
+
+/* Release the_lnet.ln_net_lock for @cpt through cfs_percpt_unlock(). */
+static inline void
+lnet_net_unlock(int cpt)
+{
+	cfs_percpt_unlock(the_lnet.ln_net_lock, cpt);
+}
+
+/* Take the network lock for the CPT reported by lnet_cpt_current() and
+ * return that CPT, so it can later be handed to lnet_net_unlock(). */
+static inline int
+lnet_net_lock_current(void)
+{
+	int cpt = lnet_cpt_current();
+
+	lnet_net_lock(cpt);
+	return cpt;
+}
+
+#define LNET_LOCK()		lnet_net_lock(LNET_LOCK_EX)
+#define LNET_UNLOCK()		lnet_net_unlock(LNET_LOCK_EX)
+
+
+#define lnet_ptl_lock(ptl)	spin_lock(&(ptl)->ptl_lock)
+#define lnet_ptl_unlock(ptl)	spin_unlock(&(ptl)->ptl_lock)
+#define lnet_eq_wait_lock()	spin_lock(&the_lnet.ln_eq_wait_lock)
+#define lnet_eq_wait_unlock()	spin_unlock(&the_lnet.ln_eq_wait_lock)
+#define lnet_ni_lock(ni)	spin_lock(&(ni)->ni_lock)
+#define lnet_ni_unlock(ni)	spin_unlock(&(ni)->ni_lock)
+#define LNET_MUTEX_LOCK(m)	mutex_lock(m)
+#define LNET_MUTEX_UNLOCK(m)	mutex_unlock(m)
+
+
+#define MAX_PORTALS     64
+
+/* these are only used by code with LNET_USE_LIB_FREELIST, but we still
+ * export them to !LNET_USE_LIB_FREELIST code for easy implementation */
+#define LNET_FL_MAX_MES		2048
+#define LNET_FL_MAX_MDS		2048
+#define LNET_FL_MAX_EQS		512
+#define LNET_FL_MAX_MSGS	2048    /* Outstanding messages */
+
+#ifdef LNET_USE_LIB_FREELIST
+
+int lnet_freelist_init(lnet_freelist_t *fl, int n, int size);
+void lnet_freelist_fini(lnet_freelist_t *fl);
+
+/* Pop the first object off freelist @fl and return a pointer to its
+ * fo_contents, or NULL if the list is empty.
+ * ALWAYS called with liblock held. */
+static inline void *
+lnet_freelist_alloc(lnet_freelist_t *fl)
+{
+	lnet_freeobj_t *head;
+
+	if (!list_empty(&fl->fl_list)) {
+		head = list_entry(fl->fl_list.next, lnet_freeobj_t, fo_list);
+		list_del(&head->fo_list);
+		return (void *)&head->fo_contents;
+	}
+
+	return NULL;
+}
+
+/* Return @obj (a pointer previously handed out by lnet_freelist_alloc(),
+ * i.e. the fo_contents of an lnet_freeobj_t) to freelist @fl.
+ * ALWAYS called with liblock held. */
+static inline void
+lnet_freelist_free(lnet_freelist_t *fl, void *obj)
+{
+	lnet_freeobj_t *fo;
+
+	/* step back from the contents to the enclosing free object */
+	fo = list_entry(obj, lnet_freeobj_t, fo_contents);
+	list_add(&fo->fo_list, &fl->fl_list);
+}
+
+
+/* Freelist variant: take an EQ descriptor from the EQ container's
+ * freelist.  Takes resource lock 0 itself, so it is NEVER called with
+ * the resource lock held.  Returns NULL when the freelist is empty. */
+static inline lnet_eq_t *
+lnet_eq_alloc(void)
+{
+	lnet_eq_t *new_eq;
+
+	LASSERT(LNET_CPT_NUMBER == 1);
+
+	lnet_res_lock(0);
+	new_eq = lnet_freelist_alloc(&the_lnet.ln_eq_container.rec_freelist);
+	lnet_res_unlock(0);
+
+	return new_eq;
+}
+
+/* Freelist variant: give @eq back to the EQ container's freelist.
+ * ALWAYS called with resource lock held. */
+static inline void
+lnet_eq_free_locked(lnet_eq_t *eq)
+{
+	LASSERT(LNET_CPT_NUMBER == 1);
+	lnet_freelist_free(&the_lnet.ln_eq_container.rec_freelist, eq);
+}
+
+/* Freelist variant: free @eq, taking and dropping resource lock 0 around
+ * lnet_eq_free_locked(). */
+static inline void
+lnet_eq_free(lnet_eq_t *eq)
+{
+	lnet_res_lock(0);
+	lnet_eq_free_locked(eq);
+	lnet_res_unlock(0);
+}
+
+/* Freelist variant: take an MD from the freelist of MD container 0 and
+ * initialise its md_list head.  @umd is not consulted in this variant.
+ * Takes resource lock 0 itself, so it is NEVER called with the resource
+ * lock held.  Returns NULL when the freelist is empty. */
+static inline lnet_libmd_t *
+lnet_md_alloc(lnet_md_t *umd)
+{
+	lnet_libmd_t *new_md;
+
+	LASSERT(LNET_CPT_NUMBER == 1);
+
+	lnet_res_lock(0);
+	new_md = lnet_freelist_alloc(&the_lnet.ln_md_containers[0]->rec_freelist);
+	lnet_res_unlock(0);
+
+	if (new_md == NULL)
+		return NULL;
+
+	INIT_LIST_HEAD(&new_md->md_list);
+	return new_md;
+}
+
+/* Freelist variant: give @md back to the freelist of MD container 0.
+ * ALWAYS called with resource lock held. */
+static inline void
+lnet_md_free_locked(lnet_libmd_t *md)
+{
+	LASSERT(LNET_CPT_NUMBER == 1);
+	lnet_freelist_free(&the_lnet.ln_md_containers[0]->rec_freelist, md);
+}
+
+/* Freelist variant: free @md, taking and dropping resource lock 0 around
+ * lnet_md_free_locked(). */
+static inline void
+lnet_md_free(lnet_libmd_t *md)
+{
+	lnet_res_lock(0);
+	lnet_md_free_locked(md);
+	lnet_res_unlock(0);
+}
+
+/* Freelist variant: take an ME from the freelist of ME container 0.
+ * Takes resource lock 0 itself, so it is NEVER called with the resource
+ * lock held.  Returns NULL when the freelist is empty. */
+static inline lnet_me_t *
+lnet_me_alloc(void)
+{
+	lnet_me_t *new_me;
+
+	LASSERT(LNET_CPT_NUMBER == 1);
+
+	lnet_res_lock(0);
+	new_me = lnet_freelist_alloc(&the_lnet.ln_me_containers[0]->rec_freelist);
+	lnet_res_unlock(0);
+
+	return new_me;
+}
+
+/* Freelist variant: give @me back to the freelist of ME container 0.
+ * ALWAYS called with resource lock held. */
+static inline void
+lnet_me_free_locked(lnet_me_t *me)
+{
+	LASSERT(LNET_CPT_NUMBER == 1);
+	lnet_freelist_free(&the_lnet.ln_me_containers[0]->rec_freelist, me);
+}
+
+/* Freelist variant: free @me, taking and dropping resource lock 0 around
+ * lnet_me_free_locked(). */
+static inline void
+lnet_me_free(lnet_me_t *me)
+{
+	lnet_res_lock(0);
+	lnet_me_free_locked(me);
+	lnet_res_unlock(0);
+}
+
+/* Freelist variant: take a message from the freelist of message
+ * container 0 and zero it.  Takes network lock 0 itself, so it is NEVER
+ * called with the network lock held.  Returns NULL when the freelist is
+ * empty. */
+static inline lnet_msg_t *
+lnet_msg_alloc(void)
+{
+	lnet_msg_t *new_msg;
+
+	LASSERT(LNET_CPT_NUMBER == 1);
+
+	lnet_net_lock(0);
+	new_msg = lnet_freelist_alloc(&the_lnet.ln_msg_containers[0]->msc_freelist);
+	lnet_net_unlock(0);
+
+	if (new_msg == NULL)
+		return NULL;
+
+	/* NULL pointers, clear flags etc */
+	memset(new_msg, 0, sizeof(*new_msg));
+	return new_msg;
+}
+
+/* Freelist variant: give @msg back to the freelist of message
+ * container 0.  The message must no longer be on the active list.
+ * ALWAYS called with network lock held. */
+static inline void
+lnet_msg_free_locked(lnet_msg_t *msg)
+{
+	LASSERT(LNET_CPT_NUMBER == 1);
+	LASSERT(!msg->msg_onactivelist);
+	lnet_freelist_free(&the_lnet.ln_msg_containers[0]->msc_freelist, msg);
+}
+
+/* Freelist variant: free @msg, taking and dropping network lock 0 around
+ * lnet_msg_free_locked(). */
+static inline void
+lnet_msg_free (lnet_msg_t *msg)
+{
+	lnet_net_lock(0);
+	lnet_msg_free_locked(msg);
+	lnet_net_unlock(0);
+}
+
+#else /* !LNET_USE_LIB_FREELIST */
+
+/* Non-freelist variant: allocate an EQ descriptor with LIBCFS_ALLOC().
+ * NEVER called with liblock held.  Returns whatever LIBCFS_ALLOC left in
+ * the pointer. */
+static inline lnet_eq_t *
+lnet_eq_alloc(void)
+{
+	lnet_eq_t *new_eq;
+
+	LIBCFS_ALLOC(new_eq, sizeof(*new_eq));
+
+	return new_eq;
+}
+
+/* Non-freelist variant: release @eq with LIBCFS_FREE(), passing the same
+ * sizeof(*eq) that lnet_eq_alloc() allocated.  lnet_eq_free_locked() is
+ * #defined to this function in the non-freelist build. */
+static inline void
+lnet_eq_free(lnet_eq_t *eq)
+{
+	/* ALWAYS called with resource lock held */
+	LIBCFS_FREE(eq, sizeof(*eq));
+}
+
+/* Non-freelist variant: allocate an MD sized for the fragments that
+ * @umd describes.
+ *
+ * The fragment count is umd->length when LNET_MD_KIOV or LNET_MD_IOVEC
+ * is set in umd->options, and 1 otherwise; only that many entries of the
+ * md_iov union are allocated.  On success md_options, md_niov and
+ * md_list are initialised.  NEVER called with liblock held.
+ */
+static inline lnet_libmd_t *
+lnet_md_alloc(lnet_md_t *umd)
+{
+	lnet_libmd_t *lmd;
+	unsigned int  nbytes;
+	unsigned int  nfrags;
+
+	if ((umd->options & LNET_MD_KIOV) != 0) {
+		nfrags = umd->length;
+		nbytes = offsetof(lnet_libmd_t, md_iov.kiov[nfrags]);
+	} else if ((umd->options & LNET_MD_IOVEC) != 0) {
+		nfrags = umd->length;
+		nbytes = offsetof(lnet_libmd_t, md_iov.iov[nfrags]);
+	} else {
+		nfrags = 1;
+		nbytes = offsetof(lnet_libmd_t, md_iov.iov[nfrags]);
+	}
+
+	LIBCFS_ALLOC(lmd, nbytes);
+	if (lmd == NULL)
+		return NULL;
+
+	/* Set here in case of early free: lnet_md_free() recomputes the
+	 * allocation size from md_options and md_niov */
+	lmd->md_options = umd->options;
+	lmd->md_niov = nfrags;
+	INIT_LIST_HEAD(&lmd->md_list);
+
+	return lmd;
+}
+
+/* Non-freelist variant: release @md with LIBCFS_FREE().  The size is
+ * recomputed from md_options (kiov vs. iov layout) and md_niov, mirroring
+ * the computation in lnet_md_alloc().
+ * ALWAYS called with resource lock held. */
+static inline void
+lnet_md_free(lnet_libmd_t *md)
+{
+	unsigned int nbytes;
+
+	nbytes = ((md->md_options & LNET_MD_KIOV) != 0) ?
+		 offsetof(lnet_libmd_t, md_iov.kiov[md->md_niov]) :
+		 offsetof(lnet_libmd_t, md_iov.iov[md->md_niov]);
+
+	LIBCFS_FREE(md, nbytes);
+}
+
+/* Non-freelist variant: allocate an ME with LIBCFS_ALLOC().
+ * NEVER called with liblock held.  Returns whatever LIBCFS_ALLOC left in
+ * the pointer. */
+static inline lnet_me_t *
+lnet_me_alloc(void)
+{
+	lnet_me_t *new_me;
+
+	LIBCFS_ALLOC(new_me, sizeof(*new_me));
+
+	return new_me;
+}
+
+/* Non-freelist variant: release @me with LIBCFS_FREE(), passing the same
+ * sizeof(*me) that lnet_me_alloc() allocated.  lnet_me_free_locked() is
+ * #defined to this function in the non-freelist build. */
+static inline void
+lnet_me_free(lnet_me_t *me)
+{
+	/* ALWAYS called with resource lock held */
+	LIBCFS_FREE(me, sizeof(*me));
+}
+
+/* Non-freelist variant: allocate a message with LIBCFS_ALLOC().
+ * NEVER called with liblock held. */
+static inline lnet_msg_t *
+lnet_msg_alloc(void)
+{
+	lnet_msg_t *new_msg;
+
+	/* no need to zero, LIBCFS_ALLOC does for us */
+	LIBCFS_ALLOC(new_msg, sizeof(*new_msg));
+
+	return new_msg;
+}
+
+/* Non-freelist variant: release @msg with LIBCFS_FREE().  Asserts that
+ * the message is no longer on the active list.  lnet_msg_free_locked() is
+ * #defined to this function in the non-freelist build. */
+static inline void
+lnet_msg_free(lnet_msg_t *msg)
+{
+	/* ALWAYS called with network lock held */
+	LASSERT(!msg->msg_onactivelist);
+	LIBCFS_FREE(msg, sizeof(*msg));
+}
+
+#define lnet_eq_free_locked(eq)		lnet_eq_free(eq)
+#define lnet_md_free_locked(md)		lnet_md_free(md)
+#define lnet_me_free_locked(me)		lnet_me_free(me)
+#define lnet_msg_free_locked(msg)	lnet_msg_free(msg)
+
+#endif /* LNET_USE_LIB_FREELIST */
+
+lnet_libhandle_t *lnet_res_lh_lookup(struct lnet_res_container *rec,
+				     __u64 cookie);
+void lnet_res_lh_initialize(struct lnet_res_container *rec,
+			    lnet_libhandle_t *lh);
+/* Invalidate handle @lh by removing it from its hash chain; lh_cookie is
+ * deliberately left untouched. */
+static inline void
+lnet_res_lh_invalidate(lnet_libhandle_t *lh)
+{
+	/* ALWAYS called with resource lock held */
+	/* NB: cookie is still useful, don't reset it */
+	list_del(&lh->lh_hash_chain);
+}
+
+/* Fill @handle from @eq: copy the EQ's cookie, or invalidate the handle
+ * when @eq is NULL. */
+static inline void
+lnet_eq2handle(lnet_handle_eq_t *handle, lnet_eq_t *eq)
+{
+	if (eq != NULL)
+		handle->cookie = eq->eq_lh.lh_cookie;
+	else
+		LNetInvalidateHandle(handle);
+}
+
+/* Look up the EQ that @handle's cookie refers to in the EQ container.
+ * Returns NULL when the lookup finds nothing.
+ * ALWAYS called with resource lock held. */
+static inline lnet_eq_t *
+lnet_handle2eq(lnet_handle_eq_t *handle)
+{
+	lnet_libhandle_t *found;
+
+	found = lnet_res_lh_lookup(&the_lnet.ln_eq_container, handle->cookie);
+
+	return found != NULL ? lh_entry(found, lnet_eq_t, eq_lh) : NULL;
+}
+
+/* Fill @handle with the cookie of @md.  Unlike lnet_eq2handle(), @md is
+ * dereferenced unconditionally. */
+static inline void
+lnet_md2handle (lnet_handle_md_t *handle, lnet_libmd_t *md)
+{
+	handle->cookie = md->md_lh.lh_cookie;
+}
+
+/* Look up the MD that @handle's cookie refers to.  The MD container is
+ * selected by the CPT encoded in the cookie.  Returns NULL when the
+ * lookup finds nothing.
+ * ALWAYS called with resource lock held. */
+static inline lnet_libmd_t *
+lnet_handle2md(lnet_handle_md_t *handle)
+{
+	lnet_libhandle_t *found;
+	int		  idx = lnet_cpt_of_cookie(handle->cookie);
+
+	found = lnet_res_lh_lookup(the_lnet.ln_md_containers[idx],
+				   handle->cookie);
+
+	return found != NULL ? lh_entry(found, lnet_libmd_t, md_lh) : NULL;
+}
+
+/* Look up the MD named by wire handle @wh.  Returns NULL when the
+ * handle's interface cookie differs from the_lnet.ln_interface_cookie or
+ * when the object cookie is not found; the MD container is selected by
+ * the CPT encoded in the object cookie.
+ * ALWAYS called with resource lock held. */
+static inline lnet_libmd_t *
+lnet_wire_handle2md(lnet_handle_wire_t *wh)
+{
+	lnet_libhandle_t *found;
+	int		  idx;
+
+	if (wh->wh_interface_cookie != the_lnet.ln_interface_cookie)
+		return NULL;
+
+	idx = lnet_cpt_of_cookie(wh->wh_object_cookie);
+	found = lnet_res_lh_lookup(the_lnet.ln_md_containers[idx],
+				   wh->wh_object_cookie);
+
+	return found != NULL ? lh_entry(found, lnet_libmd_t, md_lh) : NULL;
+}
+
+/* Fill @handle with the cookie of @me; @me is dereferenced
+ * unconditionally. */
+static inline void
+lnet_me2handle (lnet_handle_me_t *handle, lnet_me_t *me)
+{
+	handle->cookie = me->me_lh.lh_cookie;
+}
+
+/* Look up the ME that @handle's cookie refers to.  The ME container is
+ * selected by the CPT encoded in the cookie.  Returns NULL when the
+ * lookup finds nothing.
+ * ALWAYS called with resource lock held. */
+static inline lnet_me_t *
+lnet_handle2me(lnet_handle_me_t *handle)
+{
+	lnet_libhandle_t *found;
+	int		  idx = lnet_cpt_of_cookie(handle->cookie);
+
+	found = lnet_res_lh_lookup(the_lnet.ln_me_containers[idx],
+				   handle->cookie);
+
+	return found != NULL ? lh_entry(found, lnet_me_t, me_lh) : NULL;
+}
+
+/* Take another reference on @lp.  Asserts that the peer already holds at
+ * least one reference.  The counter is a plain increment, so callers are
+ * expected to hold the relevant lock (per the _locked suffix; which lock
+ * is not visible here). */
+static inline void
+lnet_peer_addref_locked(lnet_peer_t *lp)
+{
+	LASSERT (lp->lp_refcount > 0);
+	lp->lp_refcount++;
+}
+
+extern void lnet_destroy_peer_locked(lnet_peer_t *lp);
+
+/* Drop one reference on @lp; when the count reaches zero the peer is
+ * handed to lnet_destroy_peer_locked().  Asserts that at least one
+ * reference is held on entry. */
+static inline void
+lnet_peer_decref_locked(lnet_peer_t *lp)
+{
+	LASSERT(lp->lp_refcount > 0);
+
+	if (--lp->lp_refcount == 0)
+		lnet_destroy_peer_locked(lp);
+}
+
+/* Return non-zero when @lp has a non-zero lp_rtr_refcount. */
+static inline int
+lnet_isrouter(lnet_peer_t *lp)
+{
+	return lp->lp_rtr_refcount != 0;
+}
+
+/* Increment the reference counter that ni->ni_refs[cpt] points to.
+ * Asserts that @cpt is within [0, LNET_CPT_NUMBER) and that the counter
+ * is not negative. */
+static inline void
+lnet_ni_addref_locked(lnet_ni_t *ni, int cpt)
+{
+	LASSERT(cpt >= 0 && cpt < LNET_CPT_NUMBER);
+	LASSERT(*ni->ni_refs[cpt] >= 0);
+
+	(*ni->ni_refs[cpt])++;
+}
+
+/* Take a reference on @ni using counter slot 0, holding network lock 0
+ * around the update. */
+static inline void
+lnet_ni_addref(lnet_ni_t *ni)
+{
+	lnet_net_lock(0);
+	lnet_ni_addref_locked(ni, 0);
+	lnet_net_unlock(0);
+}
+
+/* Decrement the reference counter that ni->ni_refs[cpt] points to.
+ * Asserts that @cpt is within [0, LNET_CPT_NUMBER) and that the counter
+ * is positive.  Nothing is freed here when the counter reaches zero. */
+static inline void
+lnet_ni_decref_locked(lnet_ni_t *ni, int cpt)
+{
+	LASSERT(cpt >= 0 && cpt < LNET_CPT_NUMBER);
+	LASSERT(*ni->ni_refs[cpt] > 0);
+
+	(*ni->ni_refs[cpt])--;
+}
+
+/* Drop a reference on @ni using counter slot 0, holding network lock 0
+ * around the update. */
+static inline void
+lnet_ni_decref(lnet_ni_t *ni)
+{
+	lnet_net_lock(0);
+	lnet_ni_decref_locked(ni, 0);
+	lnet_net_unlock(0);
+}
+
+void lnet_ni_free(lnet_ni_t *ni);
+
+/* Hash @nid down to LNET_PEER_HASH_BITS bits with cfs_hash_long(). */
+static inline int
+lnet_nid2peerhash(lnet_nid_t nid)
+{
+	return cfs_hash_long(nid, LNET_PEER_HASH_BITS);
+}
+
+/* Return the bucket of the_lnet.ln_remote_nets_hash for @net.  The bucket
+ * index is (LNET_NETNUM(net) + LNET_NETTYP(net)) reduced to
+ * ln_remote_nets_hbits bits. */
+static inline struct list_head *
+lnet_net2rnethash(__u32 net)
+{
+	unsigned int mask = (1U << the_lnet.ln_remote_nets_hbits) - 1;
+
+	return &the_lnet.ln_remote_nets_hash[(LNET_NETNUM(net) +
+					      LNET_NETTYP(net)) & mask];
+}
+
+extern lnd_t the_lolnd;
+
+
+extern int lnet_cpt_of_nid_locked(lnet_nid_t nid);
+extern int lnet_cpt_of_nid(lnet_nid_t nid);
+extern lnet_ni_t *lnet_nid2ni_locked(lnet_nid_t nid, int cpt);
+extern lnet_ni_t *lnet_net2ni_locked(__u32 net, int cpt);
+extern lnet_ni_t *lnet_net2ni(__u32 net);
+
+int lnet_notify(lnet_ni_t *ni, lnet_nid_t peer, int alive, cfs_time_t when);
+void lnet_notify_locked(lnet_peer_t *lp, int notifylnd, int alive, cfs_time_t when);
+int lnet_add_route(__u32 net, unsigned int hops, lnet_nid_t gateway_nid);
+int lnet_check_routes(void);
+int lnet_del_route(__u32 net, lnet_nid_t gw_nid);
+void lnet_destroy_routes(void);
+int lnet_get_route(int idx, __u32 *net, __u32 *hops,
+		   lnet_nid_t *gateway, __u32 *alive);
+void lnet_proc_init(void);
+void lnet_proc_fini(void);
+int  lnet_rtrpools_alloc(int im_a_router);
+void lnet_rtrpools_free(void);
+lnet_remotenet_t *lnet_find_net_locked (__u32 net);
+
+int lnet_islocalnid(lnet_nid_t nid);
+int lnet_islocalnet(__u32 net);
+
+void lnet_msg_attach_md(lnet_msg_t *msg, lnet_libmd_t *md,
+			unsigned int offset, unsigned int mlen);
+void lnet_msg_detach_md(lnet_msg_t *msg, int status);
+void lnet_build_unlink_event(lnet_libmd_t *md, lnet_event_t *ev);
+void lnet_build_msg_event(lnet_msg_t *msg, lnet_event_kind_t ev_type);
+void lnet_msg_commit(lnet_msg_t *msg, int cpt);
+void lnet_msg_decommit(lnet_msg_t *msg, int cpt, int status);
+
+void lnet_eq_enqueue_event(lnet_eq_t *eq, lnet_event_t *ev);
+void lnet_prep_send(lnet_msg_t *msg, int type, lnet_process_id_t target,
+		    unsigned int offset, unsigned int len);
+int lnet_send(lnet_nid_t nid, lnet_msg_t *msg, lnet_nid_t rtr_nid);
+void lnet_return_tx_credits_locked(lnet_msg_t *msg);
+void lnet_return_rx_credits_locked(lnet_msg_t *msg);
+
+/* portals functions */
+/* portals attributes */
+/* Return 1 if LNET_PTL_LAZY is set in the portal's options, else 0. */
+static inline int
+lnet_ptl_is_lazy(lnet_portal_t *ptl)
+{
+	return (ptl->ptl_options & LNET_PTL_LAZY) != 0;
+}
+
+/* Return 1 if LNET_PTL_MATCH_UNIQUE is set in the portal's options,
+ * else 0. */
+static inline int
+lnet_ptl_is_unique(lnet_portal_t *ptl)
+{
+	return (ptl->ptl_options & LNET_PTL_MATCH_UNIQUE) != 0;
+}
+
+/* Return 1 if LNET_PTL_MATCH_WILDCARD is set in the portal's options,
+ * else 0. */
+static inline int
+lnet_ptl_is_wildcard(lnet_portal_t *ptl)
+{
+	return (ptl->ptl_options & LNET_PTL_MATCH_WILDCARD) != 0;
+}
+
+/* Set the option bit(s) @opt in ptl->ptl_options.  No locking is done
+ * here. */
+static inline void
+lnet_ptl_setopt(lnet_portal_t *ptl, int opt)
+{
+	ptl->ptl_options |= opt;
+}
+
+/* Clear the option bit(s) @opt in ptl->ptl_options.  No locking is done
+ * here. */
+static inline void
+lnet_ptl_unsetopt(lnet_portal_t *ptl, int opt)
+{
+	ptl->ptl_options &= ~opt;
+}
+
+/* match-table functions */
+struct list_head *lnet_mt_match_head(struct lnet_match_table *mtable,
+			       lnet_process_id_t id, __u64 mbits);
+struct lnet_match_table *lnet_mt_of_attach(unsigned int index,
+					   lnet_process_id_t id, __u64 mbits,
+					   __u64 ignore_bits,
+					   lnet_ins_pos_t pos);
+int lnet_mt_match_md(struct lnet_match_table *mtable,
+		     struct lnet_match_info *info, struct lnet_msg *msg);
+
+/* portals match/attach functions */
+void lnet_ptl_attach_md(lnet_me_t *me, lnet_libmd_t *md,
+			struct list_head *matches, struct list_head *drops);
+void lnet_ptl_detach_md(lnet_me_t *me, lnet_libmd_t *md);
+int lnet_ptl_match_md(struct lnet_match_info *info, struct lnet_msg *msg);
+
+/* initialize and finalize portals */
+int lnet_portals_create(void);
+void lnet_portals_destroy(void);
+
+/* message functions */
+int lnet_parse (lnet_ni_t *ni, lnet_hdr_t *hdr,
+		lnet_nid_t fromnid, void *private, int rdma_req);
+void lnet_recv(lnet_ni_t *ni, void *private, lnet_msg_t *msg, int delayed,
+	       unsigned int offset, unsigned int mlen, unsigned int rlen);
+lnet_msg_t *lnet_create_reply_msg (lnet_ni_t *ni, lnet_msg_t *get_msg);
+void lnet_set_reply_msg_len(lnet_ni_t *ni, lnet_msg_t *msg, unsigned int len);
+void lnet_finalize(lnet_ni_t *ni, lnet_msg_t *msg, int rc);
+void lnet_drop_delayed_msg_list(struct list_head *head, char *reason);
+void lnet_recv_delayed_msg_list(struct list_head *head);
+
+int lnet_msg_container_setup(struct lnet_msg_container *container, int cpt);
+void lnet_msg_container_cleanup(struct lnet_msg_container *container);
+void lnet_msg_containers_destroy(void);
+int lnet_msg_containers_create(void);
+
+char *lnet_msgtyp2str (int type);
+void lnet_print_hdr (lnet_hdr_t * hdr);
+int lnet_fail_nid(lnet_nid_t nid, unsigned int threshold);
+
+void lnet_counters_get(lnet_counters_t *counters);
+void lnet_counters_reset(void);
+
+unsigned int lnet_iov_nob (unsigned int niov, struct iovec *iov);
+int lnet_extract_iov (int dst_niov, struct iovec *dst,
+		      int src_niov, struct iovec *src,
+		      unsigned int offset, unsigned int len);
+
+unsigned int lnet_kiov_nob (unsigned int niov, lnet_kiov_t *iov);
+int lnet_extract_kiov (int dst_niov, lnet_kiov_t *dst,
+		      int src_niov, lnet_kiov_t *src,
+		      unsigned int offset, unsigned int len);
+
+void lnet_copy_iov2iov (unsigned int ndiov, struct iovec *diov,
+			unsigned int doffset,
+			unsigned int nsiov, struct iovec *siov,
+			unsigned int soffset, unsigned int nob);
+void lnet_copy_kiov2iov (unsigned int niov, struct iovec *iov,
+			 unsigned int iovoffset,
+			 unsigned int nkiov, lnet_kiov_t *kiov,
+			 unsigned int kiovoffset, unsigned int nob);
+void lnet_copy_iov2kiov (unsigned int nkiov, lnet_kiov_t *kiov,
+			 unsigned int kiovoffset,
+			 unsigned int niov, struct iovec *iov,
+			 unsigned int iovoffset, unsigned int nob);
+void lnet_copy_kiov2kiov (unsigned int ndkiov, lnet_kiov_t *dkiov,
+			  unsigned int doffset,
+			  unsigned int nskiov, lnet_kiov_t *skiov,
+			  unsigned int soffset, unsigned int nob);
+
+/* Wrap the flat buffer @dest (@dlen bytes) in a single-entry iovec and
+ * delegate to lnet_copy_iov2iov() with it as the destination; offsets
+ * and @nob are passed through unchanged. */
+static inline void
+lnet_copy_iov2flat(int dlen, void *dest, unsigned int doffset,
+		   unsigned int nsiov, struct iovec *siov, unsigned int soffset,
+		   unsigned int nob)
+{
+	struct iovec flat = {
+		.iov_base = dest,
+		.iov_len  = dlen,
+	};
+
+	lnet_copy_iov2iov(1, &flat, doffset, nsiov, siov, soffset, nob);
+}
+
+/* Wrap the flat buffer @dest (@dlen bytes) in a single-entry iovec and
+ * delegate to lnet_copy_kiov2iov() with it as the destination; offsets
+ * and @nob are passed through unchanged. */
+static inline void
+lnet_copy_kiov2flat(int dlen, void *dest, unsigned int doffset,
+		    unsigned int nsiov, lnet_kiov_t *skiov, unsigned int soffset,
+		    unsigned int nob)
+{
+	struct iovec flat = {
+		.iov_base = dest,
+		.iov_len  = dlen,
+	};
+
+	lnet_copy_kiov2iov(1, &flat, doffset, nsiov, skiov, soffset, nob);
+}
+
+/* Wrap the flat buffer @src (@slen bytes) in a single-entry iovec and
+ * delegate to lnet_copy_iov2iov() with it as the source; offsets and
+ * @nob are passed through unchanged. */
+static inline void
+lnet_copy_flat2iov(unsigned int ndiov, struct iovec *diov, unsigned int doffset,
+		   int slen, void *src, unsigned int soffset, unsigned int nob)
+{
+	struct iovec flat = {
+		.iov_base = src,
+		.iov_len  = slen,
+	};
+
+	lnet_copy_iov2iov(ndiov, diov, doffset, 1, &flat, soffset, nob);
+}
+
+/* Wrap the flat buffer @src (@slen bytes) in a single-entry iovec and
+ * delegate to lnet_copy_iov2kiov() with it as the source; offsets and
+ * @nob are passed through unchanged. */
+static inline void
+lnet_copy_flat2kiov(unsigned int ndiov, lnet_kiov_t *dkiov, unsigned int doffset,
+		    int slen, void *src, unsigned int soffset, unsigned int nob)
+{
+	struct iovec flat = {
+		.iov_base = src,
+		.iov_len  = slen,
+	};
+
+	lnet_copy_iov2kiov(ndiov, dkiov, doffset, 1, &flat, soffset, nob);
+}
+
+void lnet_me_unlink(lnet_me_t *me);
+
+void lnet_md_unlink(lnet_libmd_t *md);
+void lnet_md_deconstruct(lnet_libmd_t *lmd, lnet_md_t *umd);
+
+void lnet_register_lnd(lnd_t *lnd);
+void lnet_unregister_lnd(lnd_t *lnd);
+int lnet_set_ip_niaddr (lnet_ni_t *ni);
+
+int lnet_connect(socket_t **sockp, lnet_nid_t peer_nid,
+		 __u32 local_ip, __u32 peer_ip, int peer_port);
+void lnet_connect_console_error(int rc, lnet_nid_t peer_nid,
+				__u32 peer_ip, int port);
+/* acceptor queries (each declared exactly once) */
+int lnet_count_acceptor_nis(void);
+int lnet_acceptor_timeout(void);
+int lnet_acceptor_port(void);
+
+int lnet_acceptor_start(void);
+void lnet_acceptor_stop(void);
+
+void lnet_get_tunables(void);
+int lnet_peers_start_down(void);
+int lnet_peer_buffer_credits(lnet_ni_t *ni);
+
+int lnet_router_checker_start(void);
+void lnet_router_checker_stop(void);
+void lnet_swap_pinginfo(lnet_ping_info_t *info);
+
+int lnet_ping_target_init(void);
+void lnet_ping_target_fini(void);
+int lnet_ping(lnet_process_id_t id, int timeout_ms,
+	      lnet_process_id_t *ids, int n_ids);
+
+int lnet_parse_ip2nets (char **networksp, char *ip2nets);
+int lnet_parse_routes (char *route_str, int *im_a_router);
+int lnet_parse_networks (struct list_head *nilist, char *networks);
+
+int lnet_nid2peer_locked(lnet_peer_t **lpp, lnet_nid_t nid, int cpt);
+lnet_peer_t *lnet_find_peer_locked(struct lnet_peer_table *ptable,
+				   lnet_nid_t nid);
+void lnet_peer_tables_cleanup(void);
+void lnet_peer_tables_destroy(void);
+int lnet_peer_tables_create(void);
+void lnet_debug_peer(lnet_nid_t nid);
+
+
+#endif

diff --git a/drivers/staging/lustre/include/linux/lnet/lib-types.h b/drivers/staging/lustre/include/linux/lnet/lib-types.h
new file mode 100644
index 0000000..86428d4
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/lnet/lib-types.h

@@ -0,0 +1,765 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/include/lnet/lib-types.h
+ *
+ * Types used by the library side routines that do not need to be
+ * exposed to the user application
+ */
+
+#ifndef __LNET_LIB_TYPES_H__
+#define __LNET_LIB_TYPES_H__
+
+#include <linux/lnet/linux/lib-types.h>
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/list.h>
+#include <linux/lnet/types.h>
+
+#define WIRE_ATTR       __attribute__((packed))
+
+/* Packed version of lnet_process_id_t to transfer via network */
+typedef struct {
+	lnet_nid_t nid;   /* node id */
+	lnet_pid_t pid;   /* process id */
+} WIRE_ATTR lnet_process_id_packed_t;
+
+/* The wire handle's interface cookie only matches one network interface in
+ * one epoch (i.e. new cookie when the interface restarts or the node
+ * reboots).  The object cookie only matches one object on that interface
+ * during that object's lifetime (i.e. no cookie re-use).
+ * The struct is packed (WIRE_ATTR). */
+typedef struct {
+	__u64 wh_interface_cookie;	/* identifies interface + epoch */
+	__u64 wh_object_cookie;		/* identifies the object */
+} WIRE_ATTR lnet_handle_wire_t;
+
+/* Message types.  lnet_hdr_t::type is annotated as holding one of these
+ * values (stored there as a __u32).  Values are numbered from 0 in
+ * declaration order. */
+typedef enum {
+	LNET_MSG_ACK = 0,
+	LNET_MSG_PUT,
+	LNET_MSG_GET,
+	LNET_MSG_REPLY,
+	LNET_MSG_HELLO,
+} lnet_msg_type_t;
+
+/* The variant fields of the portals message header are aligned on an 8
+ * byte boundary in the message header.  Note that all types used in these
+ * wire structs MUST be fixed size and the smaller types are placed at the
+ * end. */
+/* Packed wire layout of the `ack` member of the lnet_hdr_t::msg union
+ * (presumably used when the header type is LNET_MSG_ACK). */
+typedef struct lnet_ack {
+	lnet_handle_wire_t  dst_wmd;
+	__u64	       match_bits;
+	__u32	       mlength;
+} WIRE_ATTR lnet_ack_t;
+
+/* Packed wire layout of the `put` member of the lnet_hdr_t::msg union
+ * (presumably used when the header type is LNET_MSG_PUT). */
+typedef struct lnet_put {
+	lnet_handle_wire_t  ack_wmd;
+	__u64	       match_bits;
+	__u64	       hdr_data;
+	__u32	       ptl_index;
+	__u32	       offset;
+} WIRE_ATTR lnet_put_t;
+
+/* Packed wire layout of the `get` member of the lnet_hdr_t::msg union
+ * (presumably used when the header type is LNET_MSG_GET). */
+typedef struct lnet_get {
+	lnet_handle_wire_t  return_wmd;
+	__u64	       match_bits;
+	__u32	       ptl_index;
+	__u32	       src_offset;
+	__u32	       sink_length;
+} WIRE_ATTR lnet_get_t;
+
+/* Packed wire layout of the `reply` member of the lnet_hdr_t::msg union
+ * (presumably used when the header type is LNET_MSG_REPLY). */
+typedef struct lnet_reply {
+	lnet_handle_wire_t  dst_wmd;
+} WIRE_ATTR lnet_reply_t;
+
+/* Packed wire layout of the `hello` member of the lnet_hdr_t::msg union
+ * (presumably used when the header type is LNET_MSG_HELLO). */
+typedef struct lnet_hello {
+	__u64	      incarnation;
+	__u32	      type;
+} WIRE_ATTR lnet_hello_t;
+
+/* Packed message header: common addressing/type/length fields followed
+ * by a union holding the per-message-type part. */
+typedef struct {
+	lnet_nid_t	  dest_nid;
+	lnet_nid_t	  src_nid;
+	lnet_pid_t	  dest_pid;
+	lnet_pid_t	  src_pid;
+	__u32	       type;	       /* lnet_msg_type_t */
+	__u32	       payload_length;     /* payload data to follow */
+	/*<------__u64 aligned------->*/
+	union {
+		lnet_ack_t   ack;
+		lnet_put_t   put;
+		lnet_get_t   get;
+		lnet_reply_t reply;
+		lnet_hello_t hello;
+	} msg;
+} WIRE_ATTR lnet_hdr_t;
+
+/* A HELLO message contains a magic number and protocol version
+ * code in the header's dest_nid, the peer's NID in the src_nid, and
+ * LNET_MSG_HELLO in the type field.  All other common fields are zero
+ * (including payload_length; i.e. no payload).
+ * This is for use by byte-stream LNDs (e.g. TCP/IP) to check the peer is
+ * running the same protocol and to find out its NID. These LNDs should
+ * exchange HELLO messages when a connection is first established.  Individual
+ * LNDs can put whatever else they fancy in lnet_hdr_t::msg.
+ */
+/* Packed magic number + protocol version pair.  The TCP values are
+ * LNET_PROTO_TCP_MAGIC and LNET_PROTO_TCP_VERSION_MAJOR/MINOR. */
+typedef struct {
+	__u32   magic;			  /* LNET_PROTO_TCP_MAGIC */
+	__u16   version_major;		  /* increment on incompatible change */
+	__u16   version_minor;		  /* increment on compatible change */
+} WIRE_ATTR lnet_magicversion_t;
+
+/* PROTO MAGIC for LNDs */
+#define LNET_PROTO_IB_MAGIC		 0x0be91b91
+#define LNET_PROTO_RA_MAGIC		 0x0be91b92
+#define LNET_PROTO_QSW_MAGIC		0x0be91b93
+#define LNET_PROTO_GNI_MAGIC		0xb00fbabe /* ask Kim */
+#define LNET_PROTO_TCP_MAGIC		0xeebc0ded
+#define LNET_PROTO_PTL_MAGIC		0x50746C4E /* 'PtlN' unique magic */
+#define LNET_PROTO_MX_MAGIC		 0x4d583130 /* 'MX10'! */
+#define LNET_PROTO_ACCEPTOR_MAGIC	   0xacce7100
+#define LNET_PROTO_PING_MAGIC	       0x70696E67 /* 'ping' */
+
+/* Placeholder for a future "unified" protocol across all LNDs */
+/* Current LNDs that receive a request with this magic will respond with a
+ * "stub" reply using their current protocol */
+#define LNET_PROTO_MAGIC		    0x45726963 /* ! */
+
+
+#define LNET_PROTO_TCP_VERSION_MAJOR	1
+#define LNET_PROTO_TCP_VERSION_MINOR	0
+
+/* Acceptor connection request (packed wire struct) */
+typedef struct {
+	/* NOTE(review): original comment named PTL_ACCEPTOR_PROTO_MAGIC;
+	 * presumably LNET_PROTO_ACCEPTOR_MAGIC is meant -- confirm */
+	__u32       acr_magic;		  /* PTL_ACCEPTOR_PROTO_MAGIC */
+	/* presumably LNET_PROTO_ACCEPTOR_VERSION -- confirm */
+	__u32       acr_version;		/* protocol version */
+	__u64       acr_nid;		    /* target NID */
+} WIRE_ATTR lnet_acceptor_connreq_t;
+
+#define LNET_PROTO_ACCEPTOR_VERSION       1
+
+/* forward refs */
+struct lnet_libmd;
+
+/* State kept for one message: list linkage, target/source identity,
+ * one-bit status flags, the peers involved, the attached MD, a
+ * description of the payload (iov or kiov), the event to deliver and the
+ * wire header. */
+typedef struct lnet_msg {
+	struct list_head	    msg_activelist;
+	struct list_head	    msg_list;	   /* Q for credits/MD */
+
+	lnet_process_id_t     msg_target;
+	/* where is it from, it's only for building event */
+	lnet_nid_t		msg_from;
+	__u32			msg_type;
+
+	/* committed for sending */
+	unsigned int		msg_tx_committed:1;
+	/* CPT # this message committed for sending */
+	unsigned int		msg_tx_cpt:15;
+	/* committed for receiving */
+	unsigned int		msg_rx_committed:1;
+	/* CPT # this message committed for receiving */
+	unsigned int		msg_rx_cpt:15;
+	/* queued for tx credit */
+	unsigned int		msg_tx_delayed:1;
+	/* queued for RX buffer */
+	unsigned int		msg_rx_delayed:1;
+	/* ready for pending on RX delay list */
+	unsigned int		msg_rx_ready_delay:1;
+
+	unsigned int	  msg_vmflush:1;      /* VM trying to free memory */
+	unsigned int	  msg_target_is_router:1; /* sending to a router */
+	unsigned int	  msg_routing:1;      /* being forwarded */
+	unsigned int	  msg_ack:1;	  /* ack on finalize (PUT) */
+	unsigned int	  msg_sending:1;      /* outgoing message */
+	unsigned int	  msg_receiving:1;    /* being received */
+	unsigned int	  msg_txcredit:1;     /* taken an NI send credit */
+	unsigned int	  msg_peertxcredit:1; /* taken a peer send credit */
+	unsigned int	  msg_rtrcredit:1;    /* taken a global router credit */
+	unsigned int	  msg_peerrtrcredit:1; /* taken a peer router credit */
+	unsigned int	  msg_onactivelist:1; /* on the activelist */
+
+	struct lnet_peer     *msg_txpeer;	 /* peer I'm sending to */
+	struct lnet_peer     *msg_rxpeer;	 /* peer I received from */
+
+	void		 *msg_private;
+	struct lnet_libmd    *msg_md;
+
+	unsigned int	  msg_len;
+	unsigned int	  msg_wanted;
+	unsigned int	  msg_offset;
+	unsigned int	  msg_niov;
+	struct iovec	 *msg_iov;
+	lnet_kiov_t	  *msg_kiov;
+
+	lnet_event_t	  msg_ev;
+	lnet_hdr_t	    msg_hdr;
+} lnet_msg_t;
+
+
+/* Handle bookkeeping embedded in EQ, ME and MD descriptors (eq_lh, me_lh,
+ * md_lh): a hash-chain link plus the cookie that user handles carry. */
+typedef struct lnet_libhandle {
+	struct list_head	    lh_hash_chain;	/* link in the lookup hash */
+	__u64		 lh_cookie;		/* value copied into handles */
+} lnet_libhandle_t;
+
+/* Map @ptr, which points at the @member field embedded in a @type, back
+ * to the enclosing @type object.  Uses offsetof() byte arithmetic rather
+ * than subtracting a pointer formed from address 0 and casting the
+ * resulting integer back to a pointer. */
+#define lh_entry(ptr, type, member) \
+	((type *)((char *)(ptr) - offsetof(type, member)))
+
+/* Library-side event queue descriptor; eq_lh carries the cookie that
+ * lnet_eq2handle() copies into user handles. */
+typedef struct lnet_eq {
+	struct list_head		eq_list;
+	lnet_libhandle_t	eq_lh;
+	lnet_seq_t		eq_enq_seq;
+	lnet_seq_t		eq_deq_seq;
+	unsigned int		eq_size;
+	lnet_eq_handler_t	eq_callback;
+	lnet_event_t		*eq_events;
+	int			**eq_refs;	/* percpt refcount for EQ */
+} lnet_eq_t;
+
+typedef struct lnet_me {
+	struct list_head	     me_list;
+	lnet_libhandle_t       me_lh;		/* handle: hash chain + cookie */
+	lnet_process_id_t      me_match_id;
+	unsigned int	   me_portal;
+	unsigned int	   me_pos;		/* hash offset in mt_mhash (lnet_match_table) */
+	__u64		  me_match_bits;
+	__u64		  me_ignore_bits;
+	lnet_unlink_t	  me_unlink;
+	struct lnet_libmd     *me_md;		/* presumably the MD attached to this ME (cf. lnet_libmd::md_me) - confirm */
+} lnet_me_t;
+
+typedef struct lnet_libmd {
+	struct list_head	    md_list;
+	lnet_libhandle_t      md_lh;
+	lnet_me_t	    *md_me;
+	char		 *md_start;
+	unsigned int	  md_offset;
+	unsigned int	  md_length;
+	unsigned int	  md_max_size;
+	int		   md_threshold;
+	int		   md_refcount;
+	unsigned int	  md_options;
+	unsigned int	  md_flags;
+	void		 *md_user_ptr;
+	lnet_eq_t	    *md_eq;
+	unsigned int	  md_niov;		/* # frags */
+	union {
+		struct iovec  iov[LNET_MAX_IOV];
+		lnet_kiov_t   kiov[LNET_MAX_IOV];
+	} md_iov;
+} lnet_libmd_t;
+
+#define LNET_MD_FLAG_ZOMBIE	   (1 << 0)
+#define LNET_MD_FLAG_AUTO_UNLINK      (1 << 1)
+
+#ifdef LNET_USE_LIB_FREELIST
+typedef struct
+{
+	void		  *fl_objs;	  /* single contiguous array of objects */
+	int		    fl_nobjs;	 /* the number of them */
+	int		    fl_objsize;       /* the size (including overhead) of each of them */
+	struct list_head	     fl_list;	  /* where they are enqueued */
+} lnet_freelist_t;
+
+typedef struct
+{
+	struct list_head	     fo_list;	     /* enqueue on fl_list */
+	void		  *fo_contents;	 /* aligned contents */
+} lnet_freeobj_t;
+#endif
+
+typedef struct {
+	/* info about peers we are trying to fail */
+	struct list_head	     tp_list;	     /* ln_test_peers */
+	lnet_nid_t	     tp_nid;	      /* matching nid */
+	unsigned int	   tp_threshold;	/* # failures to simulate */
+} lnet_test_peer_t;
+
+#define LNET_COOKIE_TYPE_MD    1
+#define LNET_COOKIE_TYPE_ME    2
+#define LNET_COOKIE_TYPE_EQ    3
+#define LNET_COOKIE_TYPE_BITS  2
+#define LNET_COOKIE_MASK	((1ULL << LNET_COOKIE_TYPE_BITS) - 1ULL)
+
+struct lnet_ni;				  /* forward ref */
+
+typedef struct lnet_lnd
+{
+	/* fields managed by portals */
+	struct list_head	    lnd_list;	     /* stash in the LND table */
+	int		   lnd_refcount;	 /* # active instances */
+
+	/* fields initialised by the LND */
+	unsigned int	  lnd_type;
+
+	int  (*lnd_startup) (struct lnet_ni *ni);
+	void (*lnd_shutdown) (struct lnet_ni *ni);
+	int  (*lnd_ctl)(struct lnet_ni *ni, unsigned int cmd, void *arg);
+
+	/* In data movement APIs below, payload buffers are described as a set
+	 * of 'niov' fragments which are...
+	 * EITHER
+	 *    in virtual memory (struct iovec *iov != NULL)
+	 * OR
+	 *    in pages (kernel only: lnet_kiov_t *kiov != NULL).
+	 * The LND may NOT overwrite these fragment descriptors.
+	 * An 'offset' may specify a byte offset within the set of
+	 * fragments to start from
+	 */
+
+	/* Start sending a preformatted message.  'private' is NULL for PUT and
+	 * GET messages; otherwise this is a response to an incoming message
+	 * and 'private' is the 'private' passed to lnet_parse().  Return
+	 * non-zero for immediate failure, otherwise complete later with
+	 * lnet_finalize() */
+	int (*lnd_send)(struct lnet_ni *ni, void *private, lnet_msg_t *msg);
+
+	/* Start receiving 'mlen' bytes of payload data, skipping the following
+	 * 'rlen' - 'mlen' bytes. 'private' is the 'private' passed to
+	 * lnet_parse().  Return non-zero for immediate failure, otherwise
+	 * complete later with lnet_finalize().  This also gives back a receive
+	 * credit if the LND does flow control. */
+	int (*lnd_recv)(struct lnet_ni *ni, void *private, lnet_msg_t *msg,
+			int delayed, unsigned int niov,
+			struct iovec *iov, lnet_kiov_t *kiov,
+			unsigned int offset, unsigned int mlen, unsigned int rlen);
+
+	/* lnet_parse() has had to delay processing of this message
+	 * (e.g. waiting for a forwarding buffer or send credits).  Give the
+	 * LND a chance to free urgently needed resources.  If called, return 0
+	 * for success and do NOT give back a receive credit; that has to wait
+	 * until lnd_recv() gets called.  On failure return < 0 and
+	 * release resources; lnd_recv() will not be called. */
+	int (*lnd_eager_recv)(struct lnet_ni *ni, void *private, lnet_msg_t *msg,
+			      void **new_privatep);
+
+	/* notification of peer health */
+	void (*lnd_notify)(struct lnet_ni *ni, lnet_nid_t peer, int alive);
+
+	/* query of peer aliveness */
+	void (*lnd_query)(struct lnet_ni *ni, lnet_nid_t peer, cfs_time_t *when);
+
+	/* accept a new connection */
+	int (*lnd_accept)(struct lnet_ni *ni, socket_t *sock);
+
+} lnd_t;
+
+#define LNET_NI_STATUS_UP      0x15aac0de
+#define LNET_NI_STATUS_DOWN    0xdeadface
+#define LNET_NI_STATUS_INVALID 0x00000000
+typedef struct {
+	lnet_nid_t ns_nid;
+	__u32      ns_status;
+	__u32      ns_unused;
+} WIRE_ATTR lnet_ni_status_t;
+
+struct lnet_tx_queue {
+	int			tq_credits;	/* # tx credits free */
+	int			tq_credits_min;	/* lowest it's been */
+	int			tq_credits_max;	/* total # tx credits */
+	struct list_head		tq_delayed;	/* delayed TXs */
+};
+
+#define LNET_MAX_INTERFACES   16
+
+typedef struct lnet_ni {
+	spinlock_t		ni_lock;
+	struct list_head		ni_list;	/* chain on ln_nis */
+	struct list_head		ni_cptlist;	/* chain on ln_nis_cpt */
+	int			ni_maxtxcredits; /* # tx credits  */
+	/* # per-peer send credits */
+	int			ni_peertxcredits;
+	/* # per-peer router buffer credits */
+	int			ni_peerrtrcredits;
+	/* seconds to consider peer dead */
+	int			ni_peertimeout;
+	int			ni_ncpts;	/* number of CPTs */
+	__u32			*ni_cpts;	/* bond NI on some CPTs */
+	lnet_nid_t		ni_nid;		/* interface's NID */
+	void			*ni_data;	/* instance-specific data */
+	lnd_t			*ni_lnd;	/* procedural interface */
+	struct lnet_tx_queue	**ni_tx_queues;	/* percpt TX queues */
+	int			**ni_refs;	/* percpt reference count */
+	long			ni_last_alive;	/* when I was last alive */
+	lnet_ni_status_t	*ni_status;	/* my health status */
+	/* equivalent interfaces to use */
+	char			*ni_interfaces[LNET_MAX_INTERFACES];
+} lnet_ni_t;
+
+#define LNET_PROTO_PING_MATCHBITS	0x8000000000000000LL
+
+/* NB: the values of these features are equal to LNET_PROTO_PING_VERSION_x
+ * of old LNet, so there shouldn't be any compatibility issue */
+#define LNET_PING_FEAT_INVAL		(0)		/* no feature */
+#define LNET_PING_FEAT_BASE		(1 << 0)	/* just a ping */
+#define LNET_PING_FEAT_NI_STATUS	(1 << 1)	/* return NI status */
+
+#define LNET_PING_FEAT_MASK		(LNET_PING_FEAT_BASE | \
+					 LNET_PING_FEAT_NI_STATUS)
+
+typedef struct {
+	__u32			pi_magic;
+	__u32			pi_features;
+	lnet_pid_t		pi_pid;
+	__u32			pi_nnis;
+	lnet_ni_status_t	pi_ni[0];
+} WIRE_ATTR lnet_ping_info_t;
+
+/* router checker data, per router */
+#define LNET_MAX_RTR_NIS   16
+#define LNET_PINGINFO_SIZE offsetof(lnet_ping_info_t, pi_ni[LNET_MAX_RTR_NIS])
+typedef struct {
+	/* chain on the_lnet.ln_zombie_rcd or ln_deathrow_rcd */
+	struct list_head		rcd_list;
+	lnet_handle_md_t	rcd_mdh;	/* ping buffer MD */
+	struct lnet_peer	*rcd_gateway;	/* reference to gateway */
+	lnet_ping_info_t	*rcd_pinginfo;	/* ping buffer */
+} lnet_rc_data_t;
+
+typedef struct lnet_peer {
+	struct list_head	lp_hashlist;	  /* chain on peer hash */
+	struct list_head	lp_txq;	       /* messages blocking for tx credits */
+	struct list_head	lp_rtrq;	      /* messages blocking for router credits */
+	struct list_head	lp_rtr_list;	  /* chain on router list */
+	int	       lp_txcredits;	 /* # tx credits available */
+	int	       lp_mintxcredits;      /* low water mark */
+	int	       lp_rtrcredits;	/* # router credits */
+	int	       lp_minrtrcredits;     /* low water mark */
+	unsigned int      lp_alive:1;	   /* alive/dead? */
+	unsigned int      lp_notify:1;	  /* notification outstanding? */
+	unsigned int      lp_notifylnd:1;       /* outstanding notification for LND? */
+	unsigned int      lp_notifying:1;       /* some thread is handling notification */
+	unsigned int      lp_ping_notsent;      /* SEND event outstanding from ping */
+	int	       lp_alive_count;       /* # times router went dead<->alive */
+	long	      lp_txqnob;	    /* bytes queued for sending */
+	cfs_time_t	lp_timestamp;	 /* time of last aliveness news */
+	cfs_time_t	lp_ping_timestamp;    /* time of last ping attempt */
+	cfs_time_t	lp_ping_deadline;     /* != 0 if ping reply expected */
+	cfs_time_t	lp_last_alive;	/* when I was last alive */
+	cfs_time_t	lp_last_query;	/* when lp_ni was queried last time */
+	lnet_ni_t	*lp_ni;		/* interface peer is on */
+	lnet_nid_t	lp_nid;	       /* peer's NID */
+	int	       lp_refcount;	  /* # refs */
+	int			lp_cpt;		/* CPT this peer attached on */
+	/* # refs from lnet_route_t::lr_gateway */
+	int			lp_rtr_refcount;
+	/* returned RC ping features */
+	unsigned int		lp_ping_feats;
+	struct list_head		lp_routes;	/* routers on this peer */
+	lnet_rc_data_t		*lp_rcd;	/* router checker state */
+} lnet_peer_t;
+
+
+/* peer hash size */
+#define LNET_PEER_HASH_BITS     9
+#define LNET_PEER_HASH_SIZE     (1 << LNET_PEER_HASH_BITS)
+
+/* peer hash table */
+struct lnet_peer_table {
+	int			pt_version;	/* /proc validity stamp */
+	int			pt_number;	/* # peers extant */
+	struct list_head		pt_deathrow;	/* zombie peers */
+	struct list_head		*pt_hash;	/* NID->peer hash */
+};
+
+/* peer aliveness is enabled only on routers for peers in a network where the
+ * lnet_ni_t::ni_peertimeout has been set to a positive value */
+#define lnet_peer_aliveness_enabled(lp) (the_lnet.ln_routing != 0 && \
+					 (lp)->lp_ni->ni_peertimeout > 0)
+
+typedef struct {
+	struct list_head		lr_list;	/* chain on net */
+	struct list_head		lr_gwlist;	/* chain on gateway */
+	lnet_peer_t		*lr_gateway;	/* router node */
+	__u32			lr_net;		/* remote network number */
+	int			lr_seq;		/* sequence for round-robin */
+	unsigned int		lr_downis;	/* number of down NIs */
+	unsigned int		lr_hops;	/* how far I am */
+} lnet_route_t;
+
+#define LNET_REMOTE_NETS_HASH_DEFAULT	(1U << 7)
+#define LNET_REMOTE_NETS_HASH_MAX	(1U << 16)
+#define LNET_REMOTE_NETS_HASH_SIZE	(1 << the_lnet.ln_remote_nets_hbits)
+
+typedef struct {
+	struct list_head	      lrn_list;       /* chain on ln_remote_nets_hash */
+	struct list_head	      lrn_routes;     /* routes to me */
+	__u32		   lrn_net;	/* my net number */
+} lnet_remotenet_t;
+
+typedef struct {
+	struct list_head rbp_bufs;	     /* my free buffer pool */
+	struct list_head rbp_msgs;	     /* messages blocking for a buffer */
+	int	rbp_npages;	   /* # pages in each buffer */
+	int	rbp_nbuffers;	 /* # buffers */
+	int	rbp_credits;	  /* # free buffers / blocked messages */
+	int	rbp_mincredits;       /* low water mark */
+} lnet_rtrbufpool_t;
+
+typedef struct {
+	struct list_head	     rb_list;	     /* chain on rbp_bufs */
+	lnet_rtrbufpool_t     *rb_pool;	     /* owning pool */
+	lnet_kiov_t	    rb_kiov[0];	  /* the buffer space */
+} lnet_rtrbuf_t;
+
+typedef struct {
+	__u32	msgs_alloc;
+	__u32	msgs_max;
+	__u32	errors;
+	__u32	send_count;
+	__u32	recv_count;
+	__u32	route_count;
+	__u32	drop_count;
+	__u64	send_length;
+	__u64	recv_length;
+	__u64	route_length;
+	__u64	drop_length;
+} WIRE_ATTR lnet_counters_t;
+
+#define LNET_PEER_HASHSIZE   503		/* prime! */
+
+#define LNET_NRBPOOLS	 3		 /* # different router buffer pools */
+
+enum {
+	/* Didn't match anything */
+	LNET_MATCHMD_NONE	= (1 << 0),
+	/* Matched OK */
+	LNET_MATCHMD_OK		= (1 << 1),
+	/* Must be discarded */
+	LNET_MATCHMD_DROP	= (1 << 2),
+	/* match and buffer is exhausted */
+	LNET_MATCHMD_EXHAUSTED  = (1 << 3),
+	/* match or drop */
+	LNET_MATCHMD_FINISH     = (LNET_MATCHMD_OK | LNET_MATCHMD_DROP),
+};
+
+/* Options for lnet_portal_t::ptl_options */
+#define LNET_PTL_LAZY	       (1 << 0)
+#define LNET_PTL_MATCH_UNIQUE       (1 << 1)    /* unique match, for RDMA */
+#define LNET_PTL_MATCH_WILDCARD     (1 << 2)    /* wildcard match, request portal */
+
+/* parameter for matching operations (GET, PUT) */
+struct lnet_match_info {
+	__u64			mi_mbits;
+	lnet_process_id_t	mi_id;
+	unsigned int		mi_opc;
+	unsigned int		mi_portal;
+	unsigned int		mi_rlength;
+	unsigned int		mi_roffset;
+};
+
+/* ME hash of RDMA portal */
+#define LNET_MT_HASH_BITS		8
+#define LNET_MT_HASH_SIZE		(1 << LNET_MT_HASH_BITS)
+#define LNET_MT_HASH_MASK		(LNET_MT_HASH_SIZE - 1)
+/* we allocate (LNET_MT_HASH_SIZE + 1) entries for lnet_match_table::mt_mhash,
+ * the last entry is reserved for MEs with ignore-bits */
+#define LNET_MT_HASH_IGNORE		LNET_MT_HASH_SIZE
+/* __u64 has 2^6 bits, so need 2^(LNET_MT_HASH_BITS - LNET_MT_BITS_U64) which
+ * is 4 __u64s as bit-map, and add an extra __u64 (only use one bit) for the
+ * ME-list with ignore-bits, which is mt_mhash[LNET_MT_HASH_IGNORE] */
+#define LNET_MT_BITS_U64		6	/* 2^6 bits */
+#define LNET_MT_EXHAUSTED_BITS		(LNET_MT_HASH_BITS - LNET_MT_BITS_U64)
+#define LNET_MT_EXHAUSTED_BMAP		((1 << LNET_MT_EXHAUSTED_BITS) + 1)
+
+/* portal match table */
+struct lnet_match_table {
+	/* reserved for upcoming patches, CPU partition ID */
+	unsigned int		mt_cpt;
+	unsigned int		mt_portal;      /* portal index */
+	/* match table is set as "enabled" if there's non-exhausted MD
+	 * attached on mt_mhash, it's only valid for wildcard portal */
+	unsigned int		mt_enabled;
+	/* bitmap to flag whether MEs on mt_mhash are exhausted or not */
+	__u64			mt_exhausted[LNET_MT_EXHAUSTED_BMAP];
+	struct list_head		*mt_mhash;      /* matching hash */
+};
+
+/* these are only useful for wildcard portal */
+/* Turn off message rotor for wildcard portals */
+#define	LNET_PTL_ROTOR_OFF	0
+/* round-robin dispatch all PUT messages for wildcard portals */
+#define	LNET_PTL_ROTOR_ON	1
+/* round-robin dispatch routed PUT message for wildcard portals */
+#define	LNET_PTL_ROTOR_RR_RT	2
+/* dispatch routed PUT message by hashing source NID for wildcard portals */
+#define	LNET_PTL_ROTOR_HASH_RT	3
+
+typedef struct lnet_portal {
+	spinlock_t		ptl_lock;
+	unsigned int		ptl_index;	/* portal ID, reserved */
+	/* flags on this portal (LNET_PTL_* options): lazy, unique... */
+	unsigned int		ptl_options;
+	/* list of messages which are stealing buffer */
+	struct list_head		ptl_msg_stealing;
+	/* messages blocking for MD */
+	struct list_head		ptl_msg_delayed;
+	/* Match table for each CPT */
+	struct lnet_match_table	**ptl_mtables;
+	/* spread rotor of incoming "PUT" */
+	int			ptl_rotor;
+	/* # active entries for this portal */
+	int		     ptl_mt_nmaps;
+	/* array of active entries' cpu-partition-id */
+	int		     ptl_mt_maps[0];
+} lnet_portal_t;
+
+#define LNET_LH_HASH_BITS	12
+#define LNET_LH_HASH_SIZE	(1ULL << LNET_LH_HASH_BITS)
+#define LNET_LH_HASH_MASK	(LNET_LH_HASH_SIZE - 1)
+
+/* resource container (ME, MD, EQ) */
+struct lnet_res_container {
+	unsigned int		rec_type;	/* container type */
+	__u64			rec_lh_cookie;	/* cookie generator */
+	struct list_head		rec_active;	/* active resource list */
+	struct list_head		*rec_lh_hash;	/* handle hash */
+#ifdef LNET_USE_LIB_FREELIST
+	lnet_freelist_t		rec_freelist;	/* freelist for resources */
+#endif
+};
+
+/* message container */
+struct lnet_msg_container {
+	int			msc_init;	/* initialized or not */
+	/* max # threads finalizing */
+	int			msc_nfinalizers;
+	/* msgs waiting to complete finalizing */
+	struct list_head		msc_finalizing;
+	struct list_head		msc_active;	/* active message list */
+	/* threads doing finalization */
+	void			**msc_finalizers;
+#ifdef LNET_USE_LIB_FREELIST
+	lnet_freelist_t		msc_freelist;	/* freelist for messages */
+#endif
+};
+
+/* Router Checker states */
+#define LNET_RC_STATE_SHUTDOWN		0	/* not started */
+#define LNET_RC_STATE_RUNNING		1	/* started up OK */
+#define LNET_RC_STATE_STOPPING		2	/* telling thread to stop */
+
+typedef struct
+{
+	/* CPU partition table of LNet */
+	struct cfs_cpt_table		*ln_cpt_table;
+	/* number of CPTs in ln_cpt_table */
+	unsigned int			ln_cpt_number;
+	unsigned int			ln_cpt_bits;
+
+	/* protect LNet resources (ME/MD/EQ) */
+	struct cfs_percpt_lock		*ln_res_lock;
+	/* # portals */
+	int				ln_nportals;
+	/* the vector of portals */
+	lnet_portal_t			**ln_portals;
+	/* percpt ME containers */
+	struct lnet_res_container	**ln_me_containers;
+	/* percpt MD container */
+	struct lnet_res_container	**ln_md_containers;
+
+	/* Event Queue container */
+	struct lnet_res_container	ln_eq_container;
+	wait_queue_head_t			ln_eq_waitq;
+	spinlock_t			ln_eq_wait_lock;
+	unsigned int			ln_remote_nets_hbits;
+
+	/* protect NI, peer table, credits, routers, rtrbuf... */
+	struct cfs_percpt_lock		*ln_net_lock;
+	/* percpt message containers for active/finalizing/freed message */
+	struct lnet_msg_container	**ln_msg_containers;
+	lnet_counters_t			**ln_counters;
+	struct lnet_peer_table		**ln_peer_tables;
+	/* failure simulation */
+	struct list_head			ln_test_peers;
+
+	struct list_head			ln_nis;		/* LND instances */
+	/* NIs bond on specific CPT(s) */
+	struct list_head			ln_nis_cpt;
+	/* dying LND instances */
+	struct list_head			ln_nis_zombie;
+	lnet_ni_t			*ln_loni;	/* the loopback NI */
+	/* NI to wait for events in */
+	lnet_ni_t			*ln_eq_waitni;
+
+	/* remote networks with routes to them */
+	struct list_head			*ln_remote_nets_hash;
+	/* validity stamp */
+	__u64				ln_remote_nets_version;
+	/* list of all known routers */
+	struct list_head			ln_routers;
+	/* validity stamp */
+	__u64				ln_routers_version;
+	/* percpt router buffer pools */
+	lnet_rtrbufpool_t		**ln_rtrpools;
+
+	lnet_handle_md_t		ln_ping_target_md;
+	lnet_handle_eq_t		ln_ping_target_eq;
+	lnet_ping_info_t		*ln_ping_info;
+
+	/* router checker startup/shutdown state */
+	int				ln_rc_state;
+	/* router checker's event queue */
+	lnet_handle_eq_t		ln_rc_eqh;
+	/* rcd still pending on net */
+	struct list_head			ln_rcd_deathrow;
+	/* rcd ready for free */
+	struct list_head			ln_rcd_zombie;
+	/* serialise startup/shutdown */
+	struct semaphore		ln_rc_signal;
+
+	struct mutex			ln_api_mutex;
+	struct mutex			ln_lnd_mutex;
+	int				ln_init;	/* LNetInit() called? */
+	/* Have I called LNetNIInit myself? */
+	int				ln_niinit_self;
+	/* LNetNIInit/LNetNIFini counter */
+	int				ln_refcount;
+	/* shutdown in progress */
+	int				ln_shutdown;
+
+	int				ln_routing;	/* am I a router? */
+	lnet_pid_t			ln_pid;		/* requested pid */
+	/* uniquely identifies this ni in this epoch */
+	__u64				ln_interface_cookie;
+	/* registered LNDs */
+	struct list_head			ln_lnds;
+
+	/* space for network names */
+	char				*ln_network_tokens;
+	int				ln_network_tokens_nob;
+	/* test protocol compatibility flags */
+	int				ln_testprotocompat;
+
+} lnet_t;
+
+#endif

diff --git a/drivers/staging/lustre/include/linux/lnet/linux/api-support.h b/drivers/staging/lustre/include/linux/lnet/linux/api-support.h
new file mode 100644
index 0000000..ca78a0a
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/lnet/linux/api-support.h

@@ -0,0 +1,43 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LINUX_API_SUPPORT_H__
+#define __LINUX_API_SUPPORT_H__
+
+#ifndef __LNET_API_SUPPORT_H__	/* refuse direct inclusion of this file */
+#error Do not #include this file directly. #include <linux/lnet/api-support.h> instead
+#endif /* __LNET_API_SUPPORT_H__ */
+
+
+#endif /* __LINUX_API_SUPPORT_H__ */

diff --git a/drivers/staging/lustre/include/linux/lnet/linux/lib-lnet.h b/drivers/staging/lustre/include/linux/lnet/linux/lib-lnet.h
new file mode 100644
index 0000000..d2c0a70
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/lnet/linux/lib-lnet.h

@@ -0,0 +1,72 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LNET_LINUX_LIB_LNET_H__
+#define __LNET_LINUX_LIB_LNET_H__
+
+#ifndef __LNET_LIB_LNET_H__
+#error Do not #include this file directly. #include <linux/lnet/lib-lnet.h> instead
+#endif
+
+# include <asm/page.h>
+# include <linux/string.h>
+# include <asm/io.h>
+# include <linux/libcfs/libcfs.h>
+
+static inline __u64
+lnet_page2phys (struct page *p)
+{
+	/* the sizeof() tests below are compile-time constants, so the
+	 * optimizer will elide the branches that cannot be taken */
+
+	if (sizeof(typeof(page_to_phys(p))) == 8) {
+		/* page_to_phys already yields a 64 bit physical address */
+		return page_to_phys(p);
+	}
+
+	if (sizeof(typeof(page_to_phys(p))) == 4) {
+		/* 32 bit physical address: convert through an unsigned type
+		 * so widening to 64 bits zero extends, never sign extends */
+		return (unsigned long)page_to_phys(p);
+	}
+
+	/* any other width is unexpected */
+	LBUG();
+	return 0;
+}
+
+
+#define LNET_ROUTER
+
+#endif /* __LNET_LINUX_LIB_LNET_H__ */

diff --git a/drivers/staging/lustre/include/linux/lnet/linux/lib-types.h b/drivers/staging/lustre/include/linux/lnet/linux/lib-types.h
new file mode 100644
index 0000000..669e8c0
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/lnet/linux/lib-types.h

@@ -0,0 +1,45 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LNET_LINUX_LIB_TYPES_H__
+#define __LNET_LINUX_LIB_TYPES_H__
+
+#ifndef __LNET_LIB_TYPES_H__
+#error Do not #include this file directly. #include <linux/lnet/lib-types.h> instead
+#endif
+
+# include <linux/uio.h>
+# include <linux/types.h>
+
+#endif

diff --git a/drivers/staging/lustre/include/linux/lnet/linux/lnet.h b/drivers/staging/lustre/include/linux/lnet/linux/lnet.h
new file mode 100644
index 0000000..1e888f1
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/lnet/linux/lnet.h

@@ -0,0 +1,56 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LNET_LINUX_LNET_H__
+#define __LNET_LINUX_LNET_H__
+
+#ifndef __LNET_H__
+#error Do not #include this file directly. #include <linux/lnet/lnet.h> instead
+#endif
+
+/*
+ * lnet.h
+ *
+ * User application interface file
+ */
+
+#include <linux/uio.h>
+#include <linux/types.h>
+
+#define cfs_tcp_sendpage(sk, page, offset, size, flags) \
+	tcp_sendpage(sk, page, offset, size, flags)
+
+#endif

diff --git a/drivers/staging/lustre/include/linux/lnet/lnet-sysctl.h b/drivers/staging/lustre/include/linux/lnet/lnet-sysctl.h
new file mode 100644
index 0000000..1bde44e
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/lnet/lnet-sysctl.h

@@ -0,0 +1,51 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LNET_SYSCTL_H__
+#define __LNET_SYSCTL_H__
+
+#if defined(CONFIG_SYSCTL)
+
+
+#define CTL_KRANAL      201
+#define CTL_O2IBLND     205
+#define CTL_PTLLND      206
+#define CTL_QSWNAL      207
+#define CTL_SOCKLND     208
+#define CTL_GNILND      210
+
+
+#endif
+
+#endif

diff --git a/drivers/staging/lustre/include/linux/lnet/lnet.h b/drivers/staging/lustre/include/linux/lnet/lnet.h
new file mode 100644
index 0000000..c532b15
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/lnet/lnet.h

@@ -0,0 +1,51 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LNET_H__
+#define __LNET_H__
+
+/*
+ * lnet.h
+ *
+ * User application interface file
+ */
+#include <linux/lnet/linux/lnet.h>
+
+#include <linux/lnet/types.h>
+#include <linux/lnet/api.h>
+
+#define LNET_NIDSTR_COUNT  1024    /* # of nidstrings */
+#define LNET_NIDSTR_SIZE   32      /* size of each one (see below for usage) */
+
+#endif

diff --git a/drivers/staging/lustre/include/linux/lnet/lnetctl.h b/drivers/staging/lustre/include/linux/lnet/lnetctl.h
new file mode 100644
index 0000000..b22daa2
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/lnet/lnetctl.h

@@ -0,0 +1,80 @@
+/*
+ *   This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * header for libptlctl.a
+ */
+#ifndef _PTLCTL_H_
+#define _PTLCTL_H_
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/types.h>
+
+#define LNET_DEV_ID 0
+#define LNET_DEV_PATH "/dev/lnet"
+#define LNET_DEV_MAJOR 10
+#define LNET_DEV_MINOR 240
+#define OBD_DEV_ID 1
+#define OBD_DEV_NAME "obd"
+#define OBD_DEV_PATH "/dev/" OBD_DEV_NAME
+#define OBD_DEV_MAJOR 10
+#define OBD_DEV_MINOR 241
+#define SMFS_DEV_ID  2
+#define SMFS_DEV_PATH "/dev/snapdev"
+#define SMFS_DEV_MAJOR 10
+#define SMFS_DEV_MINOR 242
+
+int ptl_initialize(int argc, char **argv);
+int jt_ptl_network(int argc, char **argv);
+int jt_ptl_list_nids(int argc, char **argv);
+int jt_ptl_which_nid(int argc, char **argv);
+int jt_ptl_print_interfaces(int argc, char **argv);
+int jt_ptl_add_interface(int argc, char **argv);
+int jt_ptl_del_interface(int argc, char **argv);
+int jt_ptl_print_peers (int argc, char **argv);
+int jt_ptl_add_peer (int argc, char **argv);
+int jt_ptl_del_peer (int argc, char **argv);
+int jt_ptl_print_connections (int argc, char **argv);
+int jt_ptl_disconnect(int argc, char **argv);
+int jt_ptl_push_connection(int argc, char **argv);
+int jt_ptl_print_active_txs(int argc, char **argv);
+int jt_ptl_ping(int argc, char **argv);
+int jt_ptl_mynid(int argc, char **argv);
+int jt_ptl_add_uuid(int argc, char **argv);
+int jt_ptl_add_uuid_old(int argc, char **argv); /* backwards compatibility  */
+int jt_ptl_close_uuid(int argc, char **argv);
+int jt_ptl_del_uuid(int argc, char **argv);
+int jt_ptl_add_route (int argc, char **argv);
+int jt_ptl_del_route (int argc, char **argv);
+int jt_ptl_notify_router (int argc, char **argv);
+int jt_ptl_print_routes (int argc, char **argv);
+int jt_ptl_fail_nid (int argc, char **argv);
+int jt_ptl_lwt(int argc, char **argv);
+int jt_ptl_testprotocompat(int argc, char **argv);
+int jt_ptl_memhog(int argc, char **argv);
+
+int dbg_initialize(int argc, char **argv);
+int jt_dbg_filter(int argc, char **argv);
+int jt_dbg_show(int argc, char **argv);
+int jt_dbg_list(int argc, char **argv);
+int jt_dbg_debug_kernel(int argc, char **argv);
+int jt_dbg_debug_daemon(int argc, char **argv);
+int jt_dbg_debug_file(int argc, char **argv);
+int jt_dbg_clear_debug_buf(int argc, char **argv);
+int jt_dbg_mark_debug_buf(int argc, char **argv);
+int jt_dbg_modules(int argc, char **argv);
+int jt_dbg_panic(int argc, char **argv);
+
+#endif

diff --git a/drivers/staging/lustre/include/linux/lnet/lnetst.h b/drivers/staging/lustre/include/linux/lnet/lnetst.h
new file mode 100644
index 0000000..d90f94e
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/lnet/lnetst.h

@@ -0,0 +1,491 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/include/lnet/lnetst.h
+ *
+ * Author: Liang Zhen <liangzhen@clusterfs.com>
+ */
+
+#ifndef __LNET_ST_H__
+#define __LNET_ST_H__
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/lnet.h>
+#include <linux/lnet/lib-types.h>
+
+#define LST_FEAT_NONE		(0)
+#define LST_FEAT_BULK_LEN	(1 << 0)	/* enable variable page size */
+
+#define LST_FEATS_EMPTY		(LST_FEAT_NONE)
+#define LST_FEATS_MASK		(LST_FEAT_NONE | LST_FEAT_BULK_LEN)
+
+#define LST_NAME_SIZE	   32	      /* max name buffer length */
+
+#define LSTIO_DEBUG	     0xC00	   /* debug */
+#define LSTIO_SESSION_NEW       0xC01	   /* create session */
+#define LSTIO_SESSION_END       0xC02	   /* end session */
+#define LSTIO_SESSION_INFO      0xC03	   /* query session */
+#define LSTIO_GROUP_ADD	 0xC10	   /* add group */
+#define LSTIO_GROUP_LIST	0xC11	   /* list all groups in session */
+#define LSTIO_GROUP_INFO	0xC12	   /* query detailed information of specified group */
+#define LSTIO_GROUP_DEL	 0xC13	   /* delete group */
+#define LSTIO_NODES_ADD	 0xC14	   /* add nodes to specified group */
+#define LSTIO_GROUP_UPDATE      0xC15	   /* update group */
+#define LSTIO_BATCH_ADD	 0xC20	   /* add batch */
+#define LSTIO_BATCH_START       0xC21	   /* start batch */
+#define LSTIO_BATCH_STOP	0xC22	   /* stop batch */
+#define LSTIO_BATCH_DEL	 0xC23	   /* delete batch */
+#define LSTIO_BATCH_LIST	0xC24	   /* show all batches in the session */
+#define LSTIO_BATCH_INFO	0xC25	   /* show detail of specified batch */
+#define LSTIO_TEST_ADD	  0xC26	   /* add test (to batch) */
+#define LSTIO_BATCH_QUERY       0xC27	   /* query batch status */
+#define LSTIO_STAT_QUERY	0xC30	   /* get stats */
+
+typedef struct {
+	lnet_nid_t	      ses_nid;		/* nid of console node */
+	__u64		   ses_stamp;	      /* time stamp */
+} lst_sid_t;					    /*** session id */
+
+extern lst_sid_t LST_INVALID_SID;
+
+typedef struct {
+	__u64		   bat_id;		 /* unique id in session */
+} lst_bid_t;					    /*** batch id (group of tests) */
+
+/* Status of test node */
+#define LST_NODE_ACTIVE	 0x1		     /* node in this session */
+#define LST_NODE_BUSY	   0x2		     /* node is taken by other session */
+#define LST_NODE_DOWN	   0x4		     /* node is down */
+#define LST_NODE_UNKNOWN	0x8		     /* node not in session */
+
+typedef struct {
+	lnet_process_id_t       nde_id;		 /* id of node */
+	int		     nde_state;	      /* state of node */
+} lstcon_node_ent_t;				    /*** node entry, for list_group command */
+
+typedef struct {
+	int		     nle_nnode;	      /* # of nodes */
+	int		     nle_nactive;	    /* # of active nodes */
+	int		     nle_nbusy;	      /* # of busy nodes */
+	int		     nle_ndown;	      /* # of down nodes */
+	int		     nle_nunknown;	   /* # of unknown nodes */
+} lstcon_ndlist_ent_t;				  /*** node_list entry, for list_batch command */
+
+typedef struct {
+	int		     tse_type;	       /* test type */
+	int		     tse_loop;	       /* loop count */
+	int		     tse_concur;	     /* concurrency of test */
+} lstcon_test_ent_t;				    /*** test summary entry, for list_batch command */
+
+typedef struct {
+	int		     bae_state;	      /* batch status */
+	int		     bae_timeout;	    /* batch timeout */
+	int		     bae_ntest;	      /* # of tests in the batch */
+} lstcon_batch_ent_t;				   /*** batch summary entry, for list_batch command */
+
+typedef struct {
+	lstcon_ndlist_ent_t     tbe_cli_nle;	    /* client (group) node_list entry */
+	lstcon_ndlist_ent_t     tbe_srv_nle;	    /* server (group) node_list entry */
+	union {
+		lstcon_test_ent_t  tbe_test;	    /* test entry */
+		lstcon_batch_ent_t tbe_batch;	   /* batch entry */
+	} u;
+} lstcon_test_batch_ent_t;			      /*** test/batch verbose information entry,
+							 *** for list_batch command */
+
+typedef struct {
+	struct list_head	      rpe_link;	       /* link chain */
+	lnet_process_id_t       rpe_peer;	       /* peer's id */
+	struct timeval	  rpe_stamp;	      /* time stamp of RPC */
+	int		     rpe_state;	      /* peer's state */
+	int		     rpe_rpc_errno;	  /* RPC errno */
+
+	lst_sid_t	       rpe_sid;		/* peer's session id */
+	int		     rpe_fwk_errno;	  /* framework errno */
+	int		     rpe_priv[4];	    /* private data */
+	char		    rpe_payload[0];	 /* private reply payload */
+} lstcon_rpc_ent_t;
+
+typedef struct {
+	int		     trs_rpc_stat[4];	/* RPCs stat (0: total, 1: succeeded, 2: failed, 3: reserved) */
+	int		     trs_rpc_errno;	  /* RPC errno */
+	int		     trs_fwk_stat[8];	/* framework stat */
+	int		     trs_fwk_errno;	  /* errno of the first remote error */
+	void		   *trs_fwk_private;	/* private framework stat */
+} lstcon_trans_stat_t;
+
+static inline int
+lstcon_rpc_stat_total(lstcon_trans_stat_t *stat, int inc)
+{
+	return inc ? ++stat->trs_rpc_stat[0] : stat->trs_rpc_stat[0];
+}
+
+static inline int
+lstcon_rpc_stat_success(lstcon_trans_stat_t *stat, int inc)
+{
+	return inc ? ++stat->trs_rpc_stat[1] : stat->trs_rpc_stat[1];
+}
+
+static inline int
+lstcon_rpc_stat_failure(lstcon_trans_stat_t *stat, int inc)
+{
+	return inc ? ++stat->trs_rpc_stat[2] : stat->trs_rpc_stat[2];
+}
+
+static inline int
+lstcon_sesop_stat_success(lstcon_trans_stat_t *stat, int inc)
+{
+	return inc ? ++stat->trs_fwk_stat[0] : stat->trs_fwk_stat[0];
+}
+
+static inline int
+lstcon_sesop_stat_failure(lstcon_trans_stat_t *stat, int inc)
+{
+	return inc ? ++stat->trs_fwk_stat[1] : stat->trs_fwk_stat[1];
+}
+
+static inline int
+lstcon_sesqry_stat_active(lstcon_trans_stat_t *stat, int inc)
+{
+	return inc ? ++stat->trs_fwk_stat[0] : stat->trs_fwk_stat[0];
+}
+
+static inline int
+lstcon_sesqry_stat_busy(lstcon_trans_stat_t *stat, int inc)
+{
+	return inc ? ++stat->trs_fwk_stat[1] : stat->trs_fwk_stat[1];
+}
+
+static inline int
+lstcon_sesqry_stat_unknown(lstcon_trans_stat_t *stat, int inc)
+{
+	return inc ? ++stat->trs_fwk_stat[2] : stat->trs_fwk_stat[2];
+}
+
+static inline int
+lstcon_tsbop_stat_success(lstcon_trans_stat_t *stat, int inc)
+{
+	return inc ? ++stat->trs_fwk_stat[0] : stat->trs_fwk_stat[0];
+}
+
+static inline int
+lstcon_tsbop_stat_failure(lstcon_trans_stat_t *stat, int inc)
+{
+	return inc ? ++stat->trs_fwk_stat[1] : stat->trs_fwk_stat[1];
+}
+
+static inline int
+lstcon_tsbqry_stat_idle(lstcon_trans_stat_t *stat, int inc)
+{
+	return inc ? ++stat->trs_fwk_stat[0] : stat->trs_fwk_stat[0];
+}
+
+static inline int
+lstcon_tsbqry_stat_run(lstcon_trans_stat_t *stat, int inc)
+{
+	return inc ? ++stat->trs_fwk_stat[1] : stat->trs_fwk_stat[1];
+}
+
+static inline int
+lstcon_tsbqry_stat_failure(lstcon_trans_stat_t *stat, int inc)
+{
+	return inc ? ++stat->trs_fwk_stat[2] : stat->trs_fwk_stat[2];
+}
+
+static inline int
+lstcon_statqry_stat_success(lstcon_trans_stat_t *stat, int inc)
+{
+	return inc ? ++stat->trs_fwk_stat[0] : stat->trs_fwk_stat[0];
+}
+
+static inline int
+lstcon_statqry_stat_failure(lstcon_trans_stat_t *stat, int inc)
+{
+	return inc ? ++stat->trs_fwk_stat[1] : stat->trs_fwk_stat[1];
+}
+
+/* create a session */
+typedef struct {
+	int		     lstio_ses_key;	  /* IN: local key */
+	int		     lstio_ses_timeout;      /* IN: session timeout */
+	int		     lstio_ses_force;	/* IN: force create ? */
+	/** IN: session features */
+	unsigned		lstio_ses_feats;
+	lst_sid_t	      *lstio_ses_idp;	  /* OUT: session id */
+	int		     lstio_ses_nmlen;	/* IN: name length */
+	char		   *lstio_ses_namep;	/* IN: session name */
+} lstio_session_new_args_t;
+
+/* query current session */
+typedef struct {
+	lst_sid_t	      *lstio_ses_idp;	  /* OUT: session id */
+	int		    *lstio_ses_keyp;	 /* OUT: local key */
+	/** OUT: session features */
+	unsigned	       *lstio_ses_featp;
+	lstcon_ndlist_ent_t    *lstio_ses_ndinfo;       /* OUT: */
+	int		     lstio_ses_nmlen;	/* IN: name length */
+	char		   *lstio_ses_namep;	/* OUT: session name */
+} lstio_session_info_args_t;
+
+/* delete a session */
+typedef struct {
+	int		     lstio_ses_key;	  /* IN: session key */
+} lstio_session_end_args_t;
+
+#define LST_OPC_SESSION	 1
+#define LST_OPC_GROUP	   2
+#define LST_OPC_NODES	   3
+#define LST_OPC_BATCHCLI	4
+#define LST_OPC_BATCHSRV	5
+
+typedef struct {
+	int		     lstio_dbg_key;	  /* IN: session key */
+	int		     lstio_dbg_type;	 /* IN: debug session|batch|group|nodes list */
+	int		     lstio_dbg_flags;	/* IN: reserved debug flags */
+	int		     lstio_dbg_timeout;      /* IN: timeout of debug */
+
+	int		     lstio_dbg_nmlen;	/* IN: len of name */
+	char		   *lstio_dbg_namep;	/* IN: name of group|batch */
+	int		     lstio_dbg_count;	/* IN: # of test nodes to debug */
+	lnet_process_id_t      *lstio_dbg_idsp;	 /* IN: id of test nodes */
+	struct list_head	     *lstio_dbg_resultp;      /* OUT: list head of result buffer */
+} lstio_debug_args_t;
+
+typedef struct {
+	int		     lstio_grp_key;	  /* IN: session key */
+	int		     lstio_grp_nmlen;	/* IN: name length */
+	char		   *lstio_grp_namep;	/* IN: group name */
+} lstio_group_add_args_t;
+
+typedef struct {
+	int		     lstio_grp_key;	  /* IN: session key */
+	int		     lstio_grp_nmlen;	/* IN: name length */
+	char		   *lstio_grp_namep;	/* IN: group name */
+} lstio_group_del_args_t;
+
+#define LST_GROUP_CLEAN	 1		       /* remove inactive nodes in the group */
+#define LST_GROUP_REFRESH       2		       /* refresh inactive nodes in the group */
+#define LST_GROUP_RMND	  3		       /* delete nodes from the group */
+
+typedef struct {
+	int		     lstio_grp_key;	  /* IN: session key */
+	int		     lstio_grp_opc;	  /* IN: OPC */
+	int		     lstio_grp_args;	 /* IN: arguments */
+	int		     lstio_grp_nmlen;	/* IN: name length */
+	char		   *lstio_grp_namep;	/* IN: group name */
+	int		     lstio_grp_count;	/* IN: # of nodes id */
+	lnet_process_id_t      *lstio_grp_idsp;	 /* IN: array of nodes */
+	struct list_head	     *lstio_grp_resultp;      /* OUT: list head of result buffer */
+} lstio_group_update_args_t;
+
+typedef struct {
+	int		     lstio_grp_key;	  /* IN: session key */
+	int		     lstio_grp_nmlen;	/* IN: name length */
+	char		   *lstio_grp_namep;	/* IN: group name */
+	int		     lstio_grp_count;	/* IN: # of nodes */
+	/** OUT: session features */
+	unsigned	       *lstio_grp_featp;
+	lnet_process_id_t      *lstio_grp_idsp;	 /* IN: nodes */
+	struct list_head	     *lstio_grp_resultp;      /* OUT: list head of result buffer */
+} lstio_group_nodes_args_t;
+
+typedef struct {
+	int		     lstio_grp_key;	  /* IN: session key */
+	int		     lstio_grp_idx;	  /* IN: group idx */
+	int		     lstio_grp_nmlen;	/* IN: name len */
+	char		   *lstio_grp_namep;	/* OUT: name */
+} lstio_group_list_args_t;
+
+typedef struct {
+	int		     lstio_grp_key;	  /* IN: session key */
+	int		     lstio_grp_nmlen;	/* IN: name len */
+	char		   *lstio_grp_namep;	/* IN: name */
+	lstcon_ndlist_ent_t    *lstio_grp_entp;	 /* OUT: description of group */
+
+	int		    *lstio_grp_idxp;	 /* IN/OUT: node index */
+	int		    *lstio_grp_ndentp;       /* IN/OUT: # of nodent */
+	lstcon_node_ent_t      *lstio_grp_dentsp;       /* OUT: nodent array */
+} lstio_group_info_args_t;
+
+#define LST_DEFAULT_BATCH       "batch"		 /* default batch name */
+
+typedef struct {
+	int		     lstio_bat_key;	  /* IN: session key */
+	int		     lstio_bat_nmlen;	/* IN: name length */
+	char		   *lstio_bat_namep;	/* IN: batch name */
+} lstio_batch_add_args_t;
+
+typedef struct {
+	int		     lstio_bat_key;	  /* IN: session key */
+	int		     lstio_bat_nmlen;	/* IN: name length */
+	char		   *lstio_bat_namep;	/* IN: batch name */
+} lstio_batch_del_args_t;
+
+typedef struct {
+	int		     lstio_bat_key;	  /* IN: session key */
+	int		     lstio_bat_timeout;      /* IN: timeout for the batch */
+	int		     lstio_bat_nmlen;	/* IN: name length */
+	char		   *lstio_bat_namep;	/* IN: batch name */
+	struct list_head	     *lstio_bat_resultp;      /* OUT: list head of result buffer */
+} lstio_batch_run_args_t;
+
+typedef struct {
+	int		     lstio_bat_key;	  /* IN: session key */
+	int		     lstio_bat_force;	/* IN: abort unfinished test RPC */
+	int		     lstio_bat_nmlen;	/* IN: name length */
+	char		   *lstio_bat_namep;	/* IN: batch name */
+	struct list_head	     *lstio_bat_resultp;      /* OUT: list head of result buffer */
+} lstio_batch_stop_args_t;
+
+typedef struct {
+	int		     lstio_bat_key;	  /* IN: session key */
+	int		     lstio_bat_testidx;      /* IN: test index */
+	int		     lstio_bat_client;       /* IN: is test client? */
+	int		     lstio_bat_timeout;      /* IN: timeout for waiting */
+	int		     lstio_bat_nmlen;	/* IN: name length */
+	char		   *lstio_bat_namep;	/* IN: batch name */
+	struct list_head	     *lstio_bat_resultp;      /* OUT: list head of result buffer */
+} lstio_batch_query_args_t;
+
+typedef struct {
+	int		     lstio_bat_key;	  /* IN: session key */
+	int		     lstio_bat_idx;	  /* IN: index */
+	int		     lstio_bat_nmlen;	/* IN: name length */
+	char		   *lstio_bat_namep;	/* IN: batch name */
+} lstio_batch_list_args_t;
+
+typedef struct {
+	int		     lstio_bat_key;	  /* IN: session key */
+	int		     lstio_bat_nmlen;	/* IN: name length */
+	char		   *lstio_bat_namep;	/* IN: name */
+	int		     lstio_bat_server;       /* IN: query server or not */
+	int		     lstio_bat_testidx;      /* IN: test index */
+	lstcon_test_batch_ent_t *lstio_bat_entp;	/* OUT: batch ent */
+
+	int		    *lstio_bat_idxp;	 /* IN/OUT: index of node */
+	int		    *lstio_bat_ndentp;       /* IN/OUT: # of nodent */
+	lstcon_node_ent_t      *lstio_bat_dentsp;       /* array of nodent */
+} lstio_batch_info_args_t;
+
+/* add stat in session */
+typedef struct {
+	int		     lstio_sta_key;	  /* IN: session key */
+	int		     lstio_sta_timeout;      /* IN: timeout for stat request */
+	int		     lstio_sta_nmlen;	/* IN: group name length */
+	char		   *lstio_sta_namep;	/* IN: group name */
+	int		     lstio_sta_count;	/* IN: # of pid */
+	lnet_process_id_t      *lstio_sta_idsp;	 /* IN: pid */
+	struct list_head	     *lstio_sta_resultp;      /* OUT: list head of result buffer */
+} lstio_stat_args_t;
+
+typedef enum {
+	LST_TEST_BULK   = 1,
+	LST_TEST_PING   = 2
+} lst_test_type_t;
+
+/* create a test in a batch */
+#define LST_MAX_CONCUR	  1024		    /* Max concurrency of test */
+
+typedef struct {
+	int		     lstio_tes_key;	  /* IN: session key */
+	int		     lstio_tes_bat_nmlen;    /* IN: batch name len */
+	char		   *lstio_tes_bat_name;     /* IN: batch name */
+	int		     lstio_tes_type;	 /* IN: test type */
+	int		     lstio_tes_oneside;      /* IN: one sided test */
+	int		     lstio_tes_loop;	 /* IN: loop count */
+	int		     lstio_tes_concur;       /* IN: concurrency */
+
+	int		     lstio_tes_dist;	 /* IN: node distribution in destination groups */
+	int		     lstio_tes_span;	 /* IN: node span in destination groups */
+	int		     lstio_tes_sgrp_nmlen;   /* IN: source group name length */
+	char		   *lstio_tes_sgrp_name;    /* IN: group name */
+	int		     lstio_tes_dgrp_nmlen;   /* IN: destination group name length */
+	char		   *lstio_tes_dgrp_name;    /* IN: group name */
+
+	int		     lstio_tes_param_len;    /* IN: param buffer len */
+	void		   *lstio_tes_param;	/* IN: parameter for specified test:
+							       lstio_bulk_param_t,
+							       lstio_ping_param_t,
+							       ... more */
+	int		    *lstio_tes_retp;	 /* OUT: private returned value */
+	struct list_head	     *lstio_tes_resultp;      /* OUT: list head of result buffer */
+} lstio_test_args_t;
+
+typedef enum {
+	LST_BRW_READ    = 1,
+	LST_BRW_WRITE   = 2
+} lst_brw_type_t;
+
+typedef enum {
+	LST_BRW_CHECK_NONE   = 1,
+	LST_BRW_CHECK_SIMPLE = 2,
+	LST_BRW_CHECK_FULL   = 3
+} lst_brw_flags_t;
+
+typedef struct {
+	int		     blk_opc;		/* bulk operation code */
+	int		     blk_size;	       /* size (bytes) */
+	int		     blk_time;	       /* time of running the test*/
+	int		     blk_flags;	      /* reserved flags */
+} lst_test_bulk_param_t;
+
+typedef struct {
+	int		     png_size;	       /* size of ping message */
+	int		     png_time;	       /* time */
+	int		     png_loop;	       /* loop */
+	int		     png_flags;	      /* reserved flags */
+} lst_test_ping_param_t;
+
+/* more tests */
+typedef struct {
+	__u32 errors;
+	__u32 rpcs_sent;
+	__u32 rpcs_rcvd;
+	__u32 rpcs_dropped;
+	__u32 rpcs_expired;
+	__u64 bulk_get;
+	__u64 bulk_put;
+} WIRE_ATTR srpc_counters_t;
+
+typedef struct {
+	/** milliseconds since current session started */
+	__u32 running_ms;
+	__u32 active_batches;
+	__u32 zombie_sessions;
+	__u32 brw_errors;
+	__u32 ping_errors;
+} WIRE_ATTR sfw_counters_t;
+
+#endif

diff --git a/drivers/staging/lustre/include/linux/lnet/ptllnd.h b/drivers/staging/lustre/include/linux/lnet/ptllnd.h
new file mode 100644
index 0000000..fc1ce8e
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/lnet/ptllnd.h

@@ -0,0 +1,94 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/include/lnet/ptllnd.h
+ *
+ * Author: PJ Kirner <pjkirner@clusterfs.com>
+ */
+
+/*
+ * The PTLLND was designed to support Portals with
+ * Lustre and non-lustre UNLINK semantics.
+ * However for now the two targets are Cray Portals
+ * on the XT3 and Lustre Portals (for testing) both
+ * have Lustre UNLINK semantics, so this is defined
+ * by default.
+ */
+#define LUSTRE_PORTALS_UNLINK_SEMANTICS
+
+
+#ifdef _USING_LUSTRE_PORTALS_
+
+/* NIDs are 64-bits on Lustre Portals */
+#define FMT_NID LPU64
+#define FMT_PID "%d"
+
+/* When using Lustre Portals, Lustre completion semantics are implicit */
+#define PTL_MD_LUSTRE_COMPLETION_SEMANTICS      0
+
+#else /* _USING_CRAY_PORTALS_ */
+
+/* NIDs are integers on Cray Portals */
+#define FMT_NID "%u"
+#define FMT_PID "%d"
+
+/* When using Cray Portals this is defined in the Cray Portals Header*/
+/*#define PTL_MD_LUSTRE_COMPLETION_SEMANTICS */
+
+/* Can compare handles directly on Cray Portals */
+#define PtlHandleIsEqual(a,b) ((a) == (b))
+
+/* Different error types on Cray Portals */
+#define ptl_err_t ptl_ni_fail_t
+
+/*
+ * The Cray Portals has no maximum number of IOVs.  The
+ * maximum is limited only by memory and size of the
+ * int parameters (2^31-1).
+ * Lustre only really requires the underlying
+ * implementation to support at least LNET_MAX_IOV,
+ * so for Cray portals we can safely just use that
+ * value here.
+ *
+ */
+#define PTL_MD_MAX_IOV	  LNET_MAX_IOV
+
+#endif
+
+#define FMT_PTLID "ptlid:"FMT_PID"-"FMT_NID
+
+/* Align incoming small request messages to an 8 byte boundary if this is
+ * supported to avoid alignment issues on some architectures */
+#ifndef PTL_MD_LOCAL_ALIGN8
+# define PTL_MD_LOCAL_ALIGN8 0
+#endif

diff --git a/drivers/staging/lustre/include/linux/lnet/ptllnd_wire.h b/drivers/staging/lustre/include/linux/lnet/ptllnd_wire.h
new file mode 100644
index 0000000..7d12b3a
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/lnet/ptllnd_wire.h

@@ -0,0 +1,124 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/include/lnet/ptllnd_wire.h
+ *
+ * Author: PJ Kirner <pjkirner@clusterfs.com>
+ */
+
+/* Minimum buffer size that any peer will post to receive ptllnd messages */
+#define PTLLND_MIN_BUFFER_SIZE  256
+
+/************************************************************************
+ * Tunable defaults that {u,k}lnds/ptllnd should have in common.
+ */
+
+#define PTLLND_PORTAL	   9	  /* The same portal PTLRPC used when talking to cray portals */
+#define PTLLND_PID	      9	  /* The Portals PID */
+#define PTLLND_PEERCREDITS      8	  /* concurrent sends to 1 peer */
+
+/* Default buffer size for kernel ptllnds (guaranteed eager) */
+#define PTLLND_MAX_KLND_MSG_SIZE 512
+
+/* Default buffer size for catamount ptllnds (not guaranteed eager) - large
+ * enough to avoid RDMA for anything sent while control is not in liblustre */
+#define PTLLND_MAX_ULND_MSG_SIZE 512
+
+
+/************************************************************************
+ * Portals LND Wire message format.
+ * These are sent in sender's byte order (i.e. receiver flips).
+ */
+
+#define PTL_RESERVED_MATCHBITS  0x100	/* below this value is reserved
+					 * above is for bulk data transfer */
+#define LNET_MSG_MATCHBITS       0      /* the value for the message channel */
+
+typedef struct
+{
+	lnet_hdr_t	kptlim_hdr;	     /* portals header */
+	char	      kptlim_payload[0];      /* piggy-backed payload */
+} WIRE_ATTR kptl_immediate_msg_t;
+
+typedef struct
+{
+	lnet_hdr_t	kptlrm_hdr;	     /* portals header */
+	__u64	     kptlrm_matchbits;       /* matchbits */
+} WIRE_ATTR kptl_rdma_msg_t;
+
+typedef struct
+{
+	__u64	     kptlhm_matchbits;       /* matchbits */
+	__u32	     kptlhm_max_msg_size;    /* max message size */
+} WIRE_ATTR kptl_hello_msg_t;
+
+typedef struct
+{
+	/* First 2 fields fixed FOR ALL TIME */
+	__u32	   ptlm_magic;     /* I'm a Portals LND message */
+	__u16	   ptlm_version;   /* this is my version number */
+	__u8	    ptlm_type;      /* the message type */
+	__u8	    ptlm_credits;   /* returned credits */
+	__u32	   ptlm_nob;       /* # bytes in whole message */
+	__u32	   ptlm_cksum;     /* checksum (0 == no checksum) */
+	__u64	   ptlm_srcnid;    /* sender's NID */
+	__u64	   ptlm_srcstamp;  /* sender's incarnation */
+	__u64	   ptlm_dstnid;    /* destination's NID */
+	__u64	   ptlm_dststamp;  /* destination's incarnation */
+	__u32	   ptlm_srcpid;    /* sender's PID */
+	__u32	   ptlm_dstpid;    /* destination's PID */
+
+	 union {
+		kptl_immediate_msg_t    immediate;
+		kptl_rdma_msg_t	 rdma;
+		kptl_hello_msg_t	hello;
+	} WIRE_ATTR ptlm_u;
+
+} kptl_msg_t;
+
+/* kptl_msg_t::ptlm_credits is only a __u8 */
+#define PTLLND_MSG_MAX_CREDITS ((typeof(((kptl_msg_t*) 0)->ptlm_credits)) -1)
+
+#define PTLLND_MSG_MAGIC		LNET_PROTO_PTL_MAGIC
+#define PTLLND_MSG_VERSION	      0x04
+
+#define PTLLND_RDMA_OK		  0x00
+#define PTLLND_RDMA_FAIL		0x01
+
+#define PTLLND_MSG_TYPE_INVALID	 0x00
+#define PTLLND_MSG_TYPE_PUT	     0x01
+#define PTLLND_MSG_TYPE_GET	     0x02
+#define PTLLND_MSG_TYPE_IMMEDIATE       0x03    /* No bulk data xfer*/
+#define PTLLND_MSG_TYPE_NOOP	    0x04
+#define PTLLND_MSG_TYPE_HELLO	   0x05
+#define PTLLND_MSG_TYPE_NAK	     0x06

diff --git a/drivers/staging/lustre/include/linux/lnet/socklnd.h b/drivers/staging/lustre/include/linux/lnet/socklnd.h
new file mode 100644
index 0000000..bacc749
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/lnet/socklnd.h

@@ -0,0 +1,103 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/include/lnet/socklnd.h
+ *
+ * #defines shared between socknal implementation and utilities
+ */
+#ifndef __LNET_LNET_SOCKLND_H__
+#define __LNET_LNET_SOCKLND_H__
+
+#include <linux/lnet/types.h>
+#include <linux/lnet/lib-types.h>
+
+#define SOCKLND_CONN_NONE     (-1)
+#define SOCKLND_CONN_ANY	0
+#define SOCKLND_CONN_CONTROL    1
+#define SOCKLND_CONN_BULK_IN    2
+#define SOCKLND_CONN_BULK_OUT   3
+#define SOCKLND_CONN_NTYPES     4
+
+#define SOCKLND_CONN_ACK	SOCKLND_CONN_BULK_IN
+
+typedef struct {
+	__u32		   kshm_magic;     /* magic number of socklnd message */
+	__u32		   kshm_version;   /* version of socklnd message */
+	lnet_nid_t	      kshm_src_nid;   /* sender's nid */
+	lnet_nid_t	      kshm_dst_nid;   /* destination nid */
+	lnet_pid_t	      kshm_src_pid;   /* sender's pid */
+	lnet_pid_t	      kshm_dst_pid;   /* destination pid */
+	__u64		   kshm_src_incarnation; /* sender's incarnation */
+	__u64		   kshm_dst_incarnation; /* destination's incarnation */
+	__u32		   kshm_ctype;     /* connection type */
+	__u32		   kshm_nips;      /* # IP addrs */
+	__u32		   kshm_ips[0];    /* IP addrs */
+} WIRE_ATTR ksock_hello_msg_t;
+
+typedef struct {
+	lnet_hdr_t	      ksnm_hdr;       /* lnet hdr */
+
+	/*
+	 * ksnm_payload is removed because of winnt compiler's limitation:
+	 * zero-sized array can only be placed at the tail of [nested]
+	 * structure definitions. lnet payload will be stored just after
+	 * the body of structure ksock_lnet_msg_t
+	 */
+} WIRE_ATTR ksock_lnet_msg_t;
+
+typedef struct {
+	__u32		   ksm_type;       /* type of socklnd message */
+	__u32		   ksm_csum;       /* checksum if != 0 */
+	__u64		   ksm_zc_cookies[2]; /* Zero-Copy request/ACK cookie */
+	union {
+		ksock_lnet_msg_t lnetmsg;       /* lnet message, it's empty if it's NOOP */
+	} WIRE_ATTR ksm_u;
+} WIRE_ATTR ksock_msg_t;
+
+static inline void
+socklnd_init_msg(ksock_msg_t *msg, int type)
+{
+	msg->ksm_csum	   = 0;
+	msg->ksm_type	   = type;
+	msg->ksm_zc_cookies[0]  = msg->ksm_zc_cookies[1]  = 0;
+}
+
+#define KSOCK_MSG_NOOP	  0xc0	    /* ksm_u empty */
+#define KSOCK_MSG_LNET	  0xc1	    /* lnet msg */
+
+/* We need to know this number to parse hello msg from ksocklnd in
+ * other LND (usocklnd, for example) */
+#define KSOCK_PROTO_V2	  2
+#define KSOCK_PROTO_V3	  3
+
+#endif

diff --git a/drivers/staging/lustre/include/linux/lnet/types.h b/drivers/staging/lustre/include/linux/lnet/types.h
new file mode 100644
index 0000000..4f63b7ac
--- /dev/null
+++ b/drivers/staging/lustre/include/linux/lnet/types.h

@@ -0,0 +1,503 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LNET_TYPES_H__
+#define __LNET_TYPES_H__
+
+/** \addtogroup lnet
+ * @{ */
+
+#include <linux/libcfs/libcfs.h>
+
+/** \addtogroup lnet_addr
+ * @{ */
+
+/** Portal reserved for LNet's own use.
+ * \see lustre/include/lustre/lustre_idl.h for Lustre portal assignments.
+ */
+#define LNET_RESERVED_PORTAL      0
+
+/**
+ * Address of an end-point in an LNet network.
+ *
+ * A node can have multiple end-points and hence multiple addresses.
+ * An LNet network can be a simple network (e.g. tcp0) or a network of
+ * LNet networks connected by LNet routers. Therefore an end-point address
+ * has two parts: network ID, and address within a network.
+ *
+ * \see LNET_NIDNET, LNET_NIDADDR, and LNET_MKNID.
+ */
+typedef __u64 lnet_nid_t;
+/**
+ * ID of a process in a node. Shortened as PID to distinguish from
+ * lnet_process_id_t, the global process ID.
+ */
+typedef __u32 lnet_pid_t;
+
+/** wildcard NID that matches any end-point address */
+#define LNET_NID_ANY      ((lnet_nid_t) -1)
+/** wildcard PID that matches any lnet_pid_t */
+#define LNET_PID_ANY      ((lnet_pid_t) -1)
+
+#define LNET_PID_RESERVED 0xf0000000 /* reserved bits in PID */
+#define LNET_PID_USERFLAG 0x80000000 /* set in userspace peers */
+
+#define LNET_TIME_FOREVER    (-1)
+
+/**
+ * Objects maintained by the LNet are accessed through handles. Handle types
+ * have names of the form lnet_handle_xx_t, where xx is one of the two letter
+ * object type codes ('eq' for event queue, 'md' for memory descriptor, and
+ * 'me' for match entry).
+ * Each type of object is given a unique handle type to enhance type checking.
+ * The type lnet_handle_any_t can be used when a generic handle is needed.
+ * Every handle value can be converted into a value of type lnet_handle_any_t
+ * without loss of information.
+ */
+typedef struct {
+	__u64	 cookie;
+} lnet_handle_any_t;
+
+typedef lnet_handle_any_t lnet_handle_eq_t;
+typedef lnet_handle_any_t lnet_handle_md_t;
+typedef lnet_handle_any_t lnet_handle_me_t;
+
+#define LNET_WIRE_HANDLE_COOKIE_NONE   (-1)
+
+/**
+ * Invalidate handle \a h.
+ */
+static inline void LNetInvalidateHandle(lnet_handle_any_t *h)
+{
+	h->cookie = LNET_WIRE_HANDLE_COOKIE_NONE;	/* (-1) stored in a __u64, i.e. all bits set */
+}
+
+/**
+ * Compare handles \a h1 and \a h2.
+ *
+ * \return 1 if handles are equal, 0 if otherwise.
+ */
+static inline int LNetHandleIsEqual(lnet_handle_any_t h1, lnet_handle_any_t h2)
+{
+	return h2.cookie == h1.cookie;	/* a handle is nothing but its cookie */
+}
+
+/**
+ * Check whether handle \a h is invalid.
+ *
+ * \return 1 if handle is invalid, 0 if valid.
+ */
+static inline int LNetHandleIsInvalid(lnet_handle_any_t h)
+{
+	return h.cookie == LNET_WIRE_HANDLE_COOKIE_NONE;	/* the value LNetInvalidateHandle() stores */
+}
+
+/**
+ * Global process ID.
+ */
+typedef struct {
+	/** node id */
+	lnet_nid_t nid;
+	/** process id */
+	lnet_pid_t pid;
+} lnet_process_id_t;
+/** @} lnet_addr */
+
+/** \addtogroup lnet_me
+ * @{ */
+
+/**
+ * Specifies whether the match entry or memory descriptor should be unlinked
+ * automatically (LNET_UNLINK) or not (LNET_RETAIN).
+ */
+typedef enum {
+	LNET_RETAIN = 0,
+	LNET_UNLINK
+} lnet_unlink_t;
+
+/**
+ * Values of the type lnet_ins_pos_t are used to control where a new match
+ * entry is inserted. The value LNET_INS_BEFORE is used to insert the new
+ * entry before the current entry or before the head of the list. The value
+ * LNET_INS_AFTER is used to insert the new entry after the current entry
+ * or after the last item in the list.
+ */
+typedef enum {
+	/** insert ME before current position or head of the list */
+	LNET_INS_BEFORE,
+	/** insert ME after current position or tail of the list */
+	LNET_INS_AFTER,
+	/** attach ME at tail of local CPU partition ME list */
+	LNET_INS_LOCAL
+} lnet_ins_pos_t;
+
+/** @} lnet_me */
+
+/** \addtogroup lnet_md
+ * @{ */
+
+/**
+ * Defines the visible parts of a memory descriptor. Values of this type
+ * are used to initialize memory descriptors.
+ */
+typedef struct {
+	/**
+	 * Specify the memory region associated with the memory descriptor.
+	 * If the options field has:
+	 * - LNET_MD_KIOV bit set: The start field points to the starting
+	 * address of an array of lnet_kiov_t and the length field specifies
+	 * the number of entries in the array. The length can't be bigger
+	 * than LNET_MAX_IOV. The lnet_kiov_t is used to describe page-based
+	 * fragments that are not necessarily mapped in virtual memory.
+	 * - LNET_MD_IOVEC bit set: The start field points to the starting
+	 * address of an array of struct iovec and the length field specifies
+	 * the number of entries in the array. The length can't be bigger
+	 * than LNET_MAX_IOV. The struct iovec is used to describe fragments
+	 * that have virtual addresses.
+	 * - Otherwise: The memory region is contiguous. The start field
+	 * specifies the starting address for the memory region and the
+	 * length field specifies its length.
+	 *
+	 * When the memory region is fragmented, all fragments but the first
+	 * one must start on page boundary, and all but the last must end on
+	 * page boundary.
+	 */
+	void	    *start;
+	unsigned int     length;
+	/**
+	 * Specifies the maximum number of operations that can be performed
+	 * on the memory descriptor. An operation is any action that could
+	 * possibly generate an event. In the usual case, the threshold value
+	 * is decremented for each operation on the MD. When the threshold
+	 * drops to zero, the MD becomes inactive and does not respond to
+	 * operations. A threshold value of LNET_MD_THRESH_INF indicates that
+	 * there is no bound on the number of operations that may be applied
+	 * to a MD.
+	 */
+	int	      threshold;
+	/**
+	 * Specifies the largest incoming request that the memory descriptor
+	 * should respond to. When the unused portion of a MD (length -
+	 * local offset) falls below this value, the MD becomes inactive and
+	 * does not respond to further operations. This value is only used
+	 * if the LNET_MD_MAX_SIZE option is set.
+	 */
+	int	      max_size;
+	/**
+	 * Specifies the behavior of the memory descriptor. A bitwise OR
+	 * of the following values can be used:
+	 * - LNET_MD_OP_PUT: The LNet PUT operation is allowed on this MD.
+	 * - LNET_MD_OP_GET: The LNet GET operation is allowed on this MD.
+	 * - LNET_MD_MANAGE_REMOTE: The offset used in accessing the memory
+	 *   region is provided by the incoming request. By default, the
+	 *   offset is maintained locally. When maintained locally, the
+	 *   offset is incremented by the length of the request so that
+	 *   the next operation (PUT or GET) will access the next part of
+	 *   the memory region. Note that only one offset variable exists
+	 *   per memory descriptor. If both PUT and GET operations are
+	 *   performed on a memory descriptor, the offset is updated each time.
+	 * - LNET_MD_TRUNCATE: The length provided in the incoming request can
+	 *   be reduced to match the memory available in the region (determined
+	 *   by subtracting the offset from the length of the memory region).
+	 *   By default, if the length in the incoming operation is greater
+	 *   than the amount of memory available, the operation is rejected.
+	 * - LNET_MD_ACK_DISABLE: An acknowledgment should not be sent for
+	 *   incoming PUT operations, even if requested. By default,
+	 *   acknowledgments are sent for PUT operations that request an
+	 *   acknowledgment. Acknowledgments are never sent for GET operations.
+	 *   The data sent in the REPLY serves as an implicit acknowledgment.
+	 * - LNET_MD_KIOV: The start and length fields specify an array of
+	 *   lnet_kiov_t.
+	 * - LNET_MD_IOVEC: The start and length fields specify an array of
+	 *   struct iovec.
+	 * - LNET_MD_MAX_SIZE: The max_size field is valid.
+	 *
+	 * Note:
+	 * - LNET_MD_KIOV or LNET_MD_IOVEC allows for a scatter/gather
+	 *   capability for memory descriptors. They can't be both set.
+	 * - When LNET_MD_MAX_SIZE is set, the total length of the memory
+	 *   region (i.e. sum of all fragment lengths) must not be less than
+	 *   \a max_size.
+	 */
+	unsigned int     options;
+	/**
+	 * A user-specified value that is associated with the memory
+	 * descriptor. The value does not need to be a pointer, but must fit
+	 * in the space used by a pointer. This value is recorded in events
+	 * associated with operations on this MD.
+	 */
+	void	    *user_ptr;
+	/**
+	 * A handle for the event queue used to log the operations performed on
+	 * the memory region. If this argument is a NULL handle (i.e. nullified
+	 * by LNetInvalidateHandle()), operations performed on this memory
+	 * descriptor are not logged.
+	 */
+	lnet_handle_eq_t eq_handle;
+} lnet_md_t;
+
+/* Max Transfer Unit (minimum supported everywhere).
+ * CAVEAT EMPTOR, with multinet (i.e. routers forwarding between networks)
+ * these limits are system wide and not interface-local. */
+#define LNET_MTU_BITS	20
+#define LNET_MTU	(1 << LNET_MTU_BITS)
+
+/** limit on the number of fragments in discontiguous MDs */
+#define LNET_MAX_IOV    256
+
+/* Max payload size */
+# define LNET_MAX_PAYLOAD	CONFIG_LNET_MAX_PAYLOAD
+# if (LNET_MAX_PAYLOAD < LNET_MTU)
+#  error "LNET_MAX_PAYLOAD too small - error in configure --with-max-payload-mb"
+# else
+#  if (LNET_MAX_PAYLOAD > (PAGE_SIZE * LNET_MAX_IOV))
+/*  PAGE_SIZE is a constant: check with cpp! */
+#   error "LNET_MAX_PAYLOAD too large - error in configure --with-max-payload-mb"
+#  endif
+# endif
+
+/**
+ * Options for the MD structure. See lnet_md_t::options.
+ */
+#define LNET_MD_OP_PUT	       (1 << 0)
+/** See lnet_md_t::options. */
+#define LNET_MD_OP_GET	       (1 << 1)
+/** See lnet_md_t::options. */
+#define LNET_MD_MANAGE_REMOTE	(1 << 2)
+/* unused			    (1 << 3) */
+/** See lnet_md_t::options. */
+#define LNET_MD_TRUNCATE	     (1 << 4)
+/** See lnet_md_t::options. */
+#define LNET_MD_ACK_DISABLE	  (1 << 5)
+/** See lnet_md_t::options. */
+#define LNET_MD_IOVEC		(1 << 6)
+/** See lnet_md_t::options. */
+#define LNET_MD_MAX_SIZE	     (1 << 7)
+/** See lnet_md_t::options. */
+#define LNET_MD_KIOV		 (1 << 8)
+
+/* For compatibility with Cray Portals */
+#define LNET_MD_PHYS			 0
+
+/** Infinite threshold on MD operations. See lnet_md_t::threshold */
+#define LNET_MD_THRESH_INF       (-1)
+
+/* NB lustre portals uses struct iovec internally! */
+typedef struct iovec lnet_md_iovec_t;
+
+/**
+ * A page-based fragment of a MD.
+ */
+typedef struct {
+	/** Pointer to the page where the fragment resides */
+	struct page      *kiov_page;
+	/** Length in bytes of the fragment */
+	unsigned int     kiov_len;
+	/**
+	 * Starting offset of the fragment within the page. Note that the
+	 * end of the fragment must not pass the end of the page; i.e.,
+	 * kiov_len + kiov_offset <= PAGE_CACHE_SIZE.
+	 */
+	unsigned int     kiov_offset;
+} lnet_kiov_t;
+/** @} lnet_md */
+
+/** \addtogroup lnet_eq
+ * @{ */
+
+/**
+ * Six types of events can be logged in an event queue.
+ */
+typedef enum {
+	/** An incoming GET operation has completed on the MD. */
+	LNET_EVENT_GET		= 1,
+	/**
+	 * An incoming PUT operation has completed on the MD. The
+	 * underlying layers will not alter the memory (on behalf of this
+	 * operation) once this event has been logged.
+	 */
+	LNET_EVENT_PUT,
+	/**
+	 * A REPLY operation has completed. This event is logged after the
+	 * data (if any) from the REPLY has been written into the MD.
+	 */
+	LNET_EVENT_REPLY,
+	/** An acknowledgment has been received. */
+	LNET_EVENT_ACK,
+	/**
+	 * An outgoing send (PUT or GET) operation has completed. This event
+	 * is logged after the entire buffer has been sent and it is safe for
+	 * the caller to reuse the buffer.
+	 *
+	 * Note:
+	 * - The LNET_EVENT_SEND doesn't guarantee message delivery. It can
+	 *   happen even when the message has not yet been put out on wire.
+	 * - It's unsafe to assume that in an outgoing GET operation
+	 *   the LNET_EVENT_SEND event would happen before the
+	 *   LNET_EVENT_REPLY event. The same holds for LNET_EVENT_SEND and
+	 *   LNET_EVENT_ACK events in an outgoing PUT operation.
+	 */
+	LNET_EVENT_SEND,
+	/**
+	 * A MD has been unlinked. Note that LNetMDUnlink() does not
+	 * necessarily trigger an LNET_EVENT_UNLINK event.
+	 * \see LNetMDUnlink
+	 */
+	LNET_EVENT_UNLINK,
+} lnet_event_kind_t;
+
+#define LNET_SEQ_BASETYPE       long
+typedef unsigned LNET_SEQ_BASETYPE lnet_seq_t;
+#define LNET_SEQ_GT(a,b)	(((signed LNET_SEQ_BASETYPE)((a) - (b))) > 0)
+
+/* XXX
+ * cygwin need the pragma line, not clear if it's needed in other places.
+ * checking!!!
+ */
+#ifdef __CYGWIN__
+#pragma pack(push, 4)
+#endif
+
+/**
+ * Information about an event on a MD.
+ */
+typedef struct {
+	/** The identifier (nid, pid) of the target. */
+	lnet_process_id_t   target;
+	/** The identifier (nid, pid) of the initiator. */
+	lnet_process_id_t   initiator;
+	/**
+	 * The NID of the immediate sender. If the request has been forwarded
+	 * by routers, this is the NID of the last hop; otherwise it's the
+	 * same as the initiator.
+	 */
+	lnet_nid_t	  sender;
+	/** Indicates the type of the event. */
+	lnet_event_kind_t   type;
+	/** The portal table index specified in the request */
+	unsigned int	pt_index;
+	/** A copy of the match bits specified in the request. */
+	__u64	       match_bits;
+	/** The length (in bytes) specified in the request. */
+	unsigned int	rlength;
+	/**
+	 * The length (in bytes) of the data that was manipulated by the
+	 * operation. For truncated operations, the manipulated length will be
+	 * the number of bytes specified by the MD (possibly with an offset,
+	 * see lnet_md_t). For all other operations, the manipulated length
+	 * will be the length of the requested operation, i.e. rlength.
+	 */
+	unsigned int	mlength;
+	/**
+	 * The handle to the MD associated with the event. The handle may be
+	 * invalid if the MD has been unlinked.
+	 */
+	lnet_handle_md_t    md_handle;
+	/**
+	 * A snapshot of the state of the MD immediately after the event has
+	 * been processed. In particular, the threshold field in md will
+	 * reflect the value of the threshold after the operation occurred.
+	 */
+	lnet_md_t	   md;
+	/**
+	 * 64 bits of out-of-band user data. Only valid for LNET_EVENT_PUT.
+	 * \see LNetPut
+	 */
+	__u64	       hdr_data;
+	/**
+	 * Indicates the completion status of the operation. It's 0 for
+	 * successful operations, otherwise it's an error code.
+	 */
+	int		 status;
+	/**
+	 * Indicates whether the MD has been unlinked. Note that:
+	 * - An event with unlinked set is the last event on the MD.
+	 * - This field is also set for an explicit LNET_EVENT_UNLINK event.
+	 * \see LNetMDUnlink
+	 */
+	int		 unlinked;
+	/**
+	 * The displacement (in bytes) into the memory region that the
+	 * operation used. The offset can be determined by the operation for
+	 * a remote managed MD or by the local MD.
+	 * \see lnet_md_t::options
+	 */
+	unsigned int	offset;
+	/**
+	 * The sequence number for this event. Sequence numbers are unique
+	 * to each event.
+	 */
+	volatile lnet_seq_t sequence;
+} lnet_event_t;
+#ifdef __CYGWIN__
+#pragma pack(pop)	/* undo the pack(push, 4) issued before lnet_event_t */
+#endif
+
+/**
+ * Event queue handler function type.
+ *
+ * The EQ handler runs for each event that is deposited into the EQ. The
+ * handler is supplied with a pointer to the event that triggered the
+ * handler invocation.
+ *
+ * The handler must not block, must be reentrant, and must not call any LNet
+ * API functions. It should return as quickly as possible.
+ */
+typedef void (*lnet_eq_handler_t)(lnet_event_t *event);
+#define LNET_EQ_HANDLER_NONE NULL
+/** @} lnet_eq */
+
+/** \addtogroup lnet_data
+ * @{ */
+
+/**
+ * Specify whether an acknowledgment should be sent by target when the PUT
+ * operation completes (i.e., when the data has been written to a MD of the
+ * target process).
+ *
+ * \see lnet_md_t::options for the discussion on LNET_MD_ACK_DISABLE by which
+ * acknowledgments can be disabled for a MD.
+ */
+typedef enum {
+	/** Request an acknowledgment */
+	LNET_ACK_REQ,
+	/** Request that no acknowledgment should be generated. */
+	LNET_NOACK_REQ
+} lnet_ack_req_t;
+/** @} lnet_data */
+
+/** @} lnet */
+#endif

diff --git a/drivers/staging/lustre/lnet/Kconfig b/drivers/staging/lustre/lnet/Kconfig
new file mode 100644
index 0000000..00850ee
--- /dev/null
+++ b/drivers/staging/lustre/lnet/Kconfig

@@ -0,0 +1,40 @@
+config LNET
+	tristate "Lustre networking subsystem"
+	depends on LUSTRE_FS
+
+config LNET_MAX_PAYLOAD
+	int "Lustre lnet max transfer payload (default 1MB)"
+	depends on LUSTRE_FS
+	default "1048576"
+	help
+	  This option defines the maximum size of payload in bytes that lnet
+	  can put into its transport.
+
+	  If unsure, use default.
+
+config LNET_SELFTEST
+	tristate "Lustre networking self testing"
+	depends on LNET
+	help
+	  Choose Y here if you want to do lnet self testing. To compile this
+	  as a module, choose M here: the module will be called lnet_selftest.
+
+	  To compile this as a kernel module, choose M here and it will be
+	  called lnet_selftest.
+
+	  If unsure, say N.
+
+	  See also http://wiki.lustre.org/
+
+config LNET_XPRT_IB
+	tristate "LNET infiniband support"
+	depends on LNET && INFINIBAND && INFINIBAND_ADDR_TRANS
+	default LNET && INFINIBAND
+	help
+	  This option allows the LNET users to use infiniband as an
+	  RDMA-enabled transport.
+
+	  To compile this as a kernel module, choose M here and it will be
+	  called ko2iblnd.
+
+	  If unsure, say N.

diff --git a/drivers/staging/lustre/lnet/Makefile b/drivers/staging/lustre/lnet/Makefile
new file mode 100644
index 0000000..374212b
--- /dev/null
+++ b/drivers/staging/lustre/lnet/Makefile

@@ -0,0 +1 @@
+obj-$(CONFIG_LNET) += klnds/ lnet/ selftest/

diff --git a/drivers/staging/lustre/lnet/klnds/Makefile b/drivers/staging/lustre/lnet/klnds/Makefile
new file mode 100644
index 0000000..c23e4f6
--- /dev/null
+++ b/drivers/staging/lustre/lnet/klnds/Makefile

@@ -0,0 +1 @@
+obj-$(CONFIG_LNET) += o2iblnd/  socklnd/

diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/Makefile b/drivers/staging/lustre/lnet/klnds/o2iblnd/Makefile
new file mode 100644
index 0000000..71b7d84
--- /dev/null
+++ b/drivers/staging/lustre/lnet/klnds/o2iblnd/Makefile

@@ -0,0 +1,5 @@
+obj-$(CONFIG_LNET_XPRT_IB) += ko2iblnd.o
+ko2iblnd-y := o2iblnd.o o2iblnd_cb.o o2iblnd_modparams.o
+
+
+ccflags-y := -I$(src)/../../include

diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c
new file mode 100644
index 0000000..29a9794
--- /dev/null
+++ b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c

@@ -0,0 +1,3259 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/klnds/o2iblnd/o2iblnd.c
+ *
+ * Author: Eric Barton <eric@bartonsoftware.com>
+ */
+
+#include "o2iblnd.h"
+#include <asm/div64.h>
+
+lnd_t the_o2iblnd = {	/* the kiblnd_* entry points for LND type O2IBLND */
+	.lnd_type       = O2IBLND,
+	.lnd_startup    = kiblnd_startup,
+	.lnd_shutdown   = kiblnd_shutdown,
+	.lnd_ctl	= kiblnd_ctl,
+	.lnd_query      = kiblnd_query,
+	.lnd_send       = kiblnd_send,
+	.lnd_recv       = kiblnd_recv,
+};
+
+kib_data_t	      kiblnd_data;
+
+__u32
+kiblnd_cksum (void *ptr, int nob)
+{
+	char  *byte = ptr;	/* NOTE(review): plain char - bytes >= 0x80 add per the ABI's char signedness */
+	char  *end  = byte + (nob > 0 ? nob : 0);
+	__u32  csum = 0;
+
+	for (; byte != end; byte++)	/* rotate left one bit, then add the byte */
+		csum = ((csum >> 31) | (csum << 1)) + *byte;
+	/* 0 is reserved to mean "no checksum", so never hand it back */
+	return (csum != 0) ? csum : 1;
+}
+
+/*
+ * Printable name of an IBLND_MSG_* type code, used when this file logs
+ * a malformed message.
+ *
+ * Any code not listed below maps to "???".
+ */
+static char *
+kiblnd_msgtype2str(int type)
+{
+	switch (type) {
+	case IBLND_MSG_CONNREQ:
+		return "CONNREQ";
+	case IBLND_MSG_CONNACK:
+		return "CONNACK";
+
+	case IBLND_MSG_NOOP:
+		return "NOOP";
+	case IBLND_MSG_IMMEDIATE:
+		return "IMMEDIATE";
+
+	case IBLND_MSG_PUT_REQ:
+		return "PUT_REQ";
+	case IBLND_MSG_PUT_NAK:
+		return "PUT_NAK";
+	case IBLND_MSG_PUT_ACK:
+		return "PUT_ACK";
+	case IBLND_MSG_PUT_DONE:
+		return "PUT_DONE";
+
+	case IBLND_MSG_GET_REQ:
+		return "GET_REQ";
+	case IBLND_MSG_GET_DONE:
+		return "GET_DONE";
+
+	default:
+		return "???";
+	}
+}
+
+/*
+ * Smallest acceptable size, in bytes, of a message of the given
+ * IBLND_MSG_* type: the common header plus that type's fixed part.
+ * Returns -1 for a type this switch does not know.
+ */
+static int
+kiblnd_msgtype2size(int type)
+{
+	const int hdr_size = offsetof(kib_msg_t, ibm_u);
+
+	switch (type) {
+	case IBLND_MSG_NOOP:
+		return hdr_size;
+	case IBLND_MSG_CONNREQ:
+	case IBLND_MSG_CONNACK:
+		return hdr_size + sizeof(kib_connparams_t);
+	case IBLND_MSG_IMMEDIATE:
+		/* everything up to, but not including, ibim_payload */
+		return offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0]);
+	case IBLND_MSG_PUT_REQ:
+		return hdr_size + sizeof(kib_putreq_msg_t);
+	case IBLND_MSG_PUT_ACK:
+		return hdr_size + sizeof(kib_putack_msg_t);
+	case IBLND_MSG_GET_REQ:
+		return hdr_size + sizeof(kib_get_msg_t);
+	case IBLND_MSG_PUT_NAK:
+	case IBLND_MSG_PUT_DONE:
+	case IBLND_MSG_GET_DONE:
+		return hdr_size + sizeof(kib_completion_msg_t);
+	default:
+		return -1;
+	}
+}
+
+/*
+ * Check (and byte-swap, if \a flip) the RDMA descriptor of a GET_REQ or
+ * PUT_ACK message. Returns 0 if it is sane, 1 on a bad nfrags or length.
+ */
+static int
+kiblnd_unpack_rd(kib_msg_t *msg, int flip)
+{
+	kib_rdma_desc_t   *rd;
+	int		nfrags;
+	int		min_nob;
+	int		i;
+
+	LASSERT (msg->ibm_type == IBLND_MSG_GET_REQ ||
+		 msg->ibm_type == IBLND_MSG_PUT_ACK);
+
+	if (msg->ibm_type == IBLND_MSG_GET_REQ)
+		rd = &msg->ibm_u.get.ibgm_rd;
+	else
+		rd = &msg->ibm_u.putack.ibpam_rd;
+
+	if (flip) {
+		__swab32s(&rd->rd_key);
+		__swab32s(&rd->rd_nfrags);
+	}
+
+	nfrags = rd->rd_nfrags;
+	if (nfrags <= 0 || nfrags > IBLND_MAX_RDMA_FRAGS) {
+		CERROR("Bad nfrags: %d, should be 0 < n <= %d\n",
+		       nfrags, IBLND_MAX_RDMA_FRAGS);
+		return 1;
+	}
+
+	min_nob = offsetof (kib_msg_t, ibm_u) +
+		  kiblnd_rd_msg_size(rd, msg->ibm_type, nfrags);
+	if (msg->ibm_nob < min_nob) {
+		CERROR("Short %s: %d(%d)\n",
+		       kiblnd_msgtype2str(msg->ibm_type), msg->ibm_nob, min_nob);
+		return 1;
+	}
+
+	for (i = 0; flip && i < nfrags; i++) {
+		__swab32s(&rd->rd_frags[i].rf_nob);
+		__swab64s(&rd->rd_frags[i].rf_addr);
+	}
+
+	return 0;
+}
+
+/*
+ * Fill in the header fields common to every message being sent and, when
+ * the kib_cksum tunable is set, checksum it. ibm_type and ibm_nob are NOT
+ * written here: the caller must have initialised them beforehand.
+ */
+void
+kiblnd_pack_msg (lnet_ni_t *ni, kib_msg_t *msg, int version,
+		 int credits, lnet_nid_t dstnid, __u64 dststamp)
+{
+	kib_net_t *net = ni->ni_data;
+
+	msg->ibm_magic    = IBLND_MSG_MAGIC;
+	msg->ibm_version  = version;
+	msg->ibm_credits  = credits;
+	msg->ibm_srcstamp = net->ibn_incarnation;
+	msg->ibm_srcnid   = ni->ni_nid;
+	msg->ibm_dststamp = dststamp;
+	msg->ibm_dstnid   = dstnid;
+
+	/* the checksum is computed over the message with ibm_cksum zero */
+	msg->ibm_cksum    = 0;
+	if (*kiblnd_tunables.kib_cksum)
+		msg->ibm_cksum = kiblnd_cksum(msg, msg->ibm_nob);
+}
+/* Validate a received message, byte-swapping it in place if needed. Returns 0 or -EPROTO. */
+int
+kiblnd_unpack_msg(kib_msg_t *msg, int nob)	/* nob: number of bytes actually received */
+{
+	const int hdr_size = offsetof(kib_msg_t, ibm_u);
+	__u32     msg_cksum;
+	__u16     version;
+	int       msg_nob;	/* length the sender put in ibm_nob, host order */
+	int       flip;	/* non-zero when the magic arrived byte-swapped */
+
+	/* 6 bytes == 32-bit magic + 16-bit version: the least we can identify */
+	if (nob < 6) {
+		CERROR("Short message: %d\n", nob);
+		return -EPROTO;
+	}
+
+	if (msg->ibm_magic == IBLND_MSG_MAGIC) {
+		flip = 0;
+	} else if (msg->ibm_magic == __swab32(IBLND_MSG_MAGIC)) {
+		flip = 1;
+	} else {
+		CERROR("Bad magic: %08x\n", msg->ibm_magic);
+		return -EPROTO;
+	}
+
+	version = flip ? __swab16(msg->ibm_version) : msg->ibm_version;
+	if (version != IBLND_MSG_VERSION &&
+	    version != IBLND_MSG_VERSION_1) {
+		CERROR("Bad version: %x\n", version);
+		return -EPROTO;
+	}
+
+	if (nob < hdr_size) {
+		CERROR("Short message: %d\n", nob);
+		return -EPROTO;
+	}
+
+	msg_nob = flip ? __swab32(msg->ibm_nob) : msg->ibm_nob;
+	if (msg_nob > nob) {
+		CERROR("Short message: got %d, wanted %d\n", nob, msg_nob);
+		return -EPROTO;
+	}
+
+	/* checksum must be computed with ibm_cksum zero and BEFORE anything
+	 * gets flipped; a received checksum of 0 skips the comparison */
+	msg_cksum = flip ? __swab32(msg->ibm_cksum) : msg->ibm_cksum;
+	msg->ibm_cksum = 0;
+	if (msg_cksum != 0 &&
+	    msg_cksum != kiblnd_cksum(msg, msg_nob)) {
+		CERROR("Bad checksum\n");	/* NB ibm_cksum is left zeroed on this path */
+		return -EPROTO;
+	}
+
+	msg->ibm_cksum = msg_cksum;	/* put it back, now in host order */
+
+	if (flip) {
+		/* leave magic unflipped as a clue to peer endianness */
+		msg->ibm_version = version;
+		CLASSERT (sizeof(msg->ibm_type) == 1);	/* single bytes: nothing to flip */
+		CLASSERT (sizeof(msg->ibm_credits) == 1);
+		msg->ibm_nob     = msg_nob;
+		__swab64s(&msg->ibm_srcnid);
+		__swab64s(&msg->ibm_srcstamp);
+		__swab64s(&msg->ibm_dstnid);
+		__swab64s(&msg->ibm_dststamp);
+	}
+
+	if (msg->ibm_srcnid == LNET_NID_ANY) {
+		CERROR("Bad src nid: %s\n", libcfs_nid2str(msg->ibm_srcnid));
+		return -EPROTO;
+	}
+
+	if (msg_nob < kiblnd_msgtype2size(msg->ibm_type)) {
+		CERROR("Short %s: %d(%d)\n", kiblnd_msgtype2str(msg->ibm_type),
+		       msg_nob, kiblnd_msgtype2size(msg->ibm_type));
+		return -EPROTO;
+	}
+
+	switch (msg->ibm_type) {
+	default:
+		CERROR("Unknown message type %x\n", msg->ibm_type);
+		return -EPROTO;
+
+	case IBLND_MSG_NOOP:
+	case IBLND_MSG_IMMEDIATE:
+	case IBLND_MSG_PUT_REQ:
+		break;	/* nothing type-specific is flipped here for these */
+
+	case IBLND_MSG_PUT_ACK:
+	case IBLND_MSG_GET_REQ:
+		if (kiblnd_unpack_rd(msg, flip))	/* non-zero: bad RDMA descriptor */
+			return -EPROTO;
+		break;
+
+	case IBLND_MSG_PUT_NAK:
+	case IBLND_MSG_PUT_DONE:
+	case IBLND_MSG_GET_DONE:
+		if (flip)
+			__swab32s(&msg->ibm_u.completion.ibcm_status);
+		break;
+
+	case IBLND_MSG_CONNREQ:
+	case IBLND_MSG_CONNACK:
+		if (flip) {
+			__swab16s(&msg->ibm_u.connparams.ibcp_queue_depth);
+			__swab16s(&msg->ibm_u.connparams.ibcp_max_frags);
+			__swab32s(&msg->ibm_u.connparams.ibcp_max_msg_size);
+		}
+		break;
+	}
+	return 0;
+}
+
+/*
+ * Allocate and initialise a peer for \a nid on \a ni, handing it back via
+ * \a peerp with one reference held for the caller. The new peer is not yet
+ * in the peer table. Returns 0, or -ENOMEM if the allocation fails.
+ */
+int
+kiblnd_create_peer(lnet_ni_t *ni, kib_peer_t **peerp, lnet_nid_t nid)
+{
+	kib_net_t	*net = ni->ni_data;
+	int		cpt = lnet_cpt_of_nid(nid);
+	kib_peer_t	*peer;
+	unsigned long   irq_flags;
+
+	LASSERT(net != NULL);
+	LASSERT(nid != LNET_NID_ANY);
+
+	LIBCFS_CPT_ALLOC(peer, lnet_cpt_table(), cpt, sizeof(*peer));
+	if (peer == NULL) {
+		CERROR("Cannot allocate peer\n");
+		return -ENOMEM;
+	}
+
+	memset(peer, 0, sizeof(*peer));	/* start from all-zero state */
+	peer->ibp_nid = nid;
+	peer->ibp_ni = ni;
+	peer->ibp_last_alive = 0;
+	peer->ibp_error = 0;
+	INIT_LIST_HEAD(&peer->ibp_tx_queue);
+	INIT_LIST_HEAD(&peer->ibp_conns);
+	INIT_LIST_HEAD(&peer->ibp_list);	/* i.e. not in the peer table yet */
+	atomic_set(&peer->ibp_refcount, 1);	/* the caller's reference */
+
+	write_lock_irqsave(&kiblnd_data.kib_global_lock, irq_flags);
+	/* always called with a ref on ni, which prevents ni being shutdown */
+	LASSERT (net->ibn_shutdown == 0);
+	/* npeers only grows with the global lock held */
+	atomic_inc(&net->ibn_npeers);
+	write_unlock_irqrestore(&kiblnd_data.kib_global_lock, irq_flags);
+
+	*peerp = peer;
+	return 0;
+}
+/* Free \a peer, whose refcount must already be zero, and drop it from its net's peer count. */
+void
+kiblnd_destroy_peer (kib_peer_t *peer)
+{
+	kib_net_t *net = peer->ibp_ni->ni_data;	/* fetched now: peer is freed before net is used below */
+
+	LASSERT (net != NULL);
+	LASSERT (atomic_read(&peer->ibp_refcount) == 0);
+	LASSERT (!kiblnd_peer_active(peer));
+	LASSERT (peer->ibp_connecting == 0);
+	LASSERT (peer->ibp_accepting == 0);
+	LASSERT (list_empty(&peer->ibp_conns));
+	LASSERT (list_empty(&peer->ibp_tx_queue));
+
+	LIBCFS_FREE(peer, sizeof(*peer));
+
+	/* NB a peer's connections keep a reference on their peer until
+	 * they are destroyed, so we can be assured that _all_ state to do
+	 * with this peer has been cleaned up when its refcount drops to
+	 * zero. */
+	atomic_dec(&net->ibn_npeers);	/* balances the atomic_inc in kiblnd_create_peer() */
+}
+
+/*
+ * Look \a nid up in its peer hash chain. Judging by the _locked suffix the
+ * caller is expected to hold kib_global_lock - TODO confirm. No reference
+ * is taken here: the caller must account for any it hands out.
+ * Returns the matching peer, or NULL if there is none.
+ */
+kib_peer_t *
+kiblnd_find_peer_locked (lnet_nid_t nid)
+{
+	struct list_head	*chain = kiblnd_nid2peerlist(nid);
+	kib_peer_t		*peer;
+
+	list_for_each_entry(peer, chain, ibp_list) {
+		/* every peer on a chain is connecting, accepting or connected */
+		LASSERT (peer->ibp_connecting > 0 || /* creating conns */
+			 peer->ibp_accepting > 0 ||
+			 !list_empty(&peer->ibp_conns));  /* active conn */
+
+		if (peer->ibp_nid == nid) {
+			CDEBUG(D_NET, "got peer [%p] -> %s (%d) version: %x\n",
+			       peer, libcfs_nid2str(nid),
+			       atomic_read(&peer->ibp_refcount),
+			       peer->ibp_version);
+			return peer;
+		}
+	}
+	return NULL;
+}
+
+/* Take connection-less \a peer off the peer table and drop the table's ref. */
+void
+kiblnd_unlink_peer_locked (kib_peer_t *peer)
+{
+	LASSERT (list_empty(&peer->ibp_conns));
+	LASSERT (kiblnd_peer_active(peer));
+
+	list_del_init(&peer->ibp_list);
+	kiblnd_peer_decref(peer);	/* lose peerlist's ref */
+}
+
+/*
+ * Report the \a index'th peer (counting from 0, in hash-table order) that
+ * belongs to \a ni: its NID is stored in \a nidp and its reference count
+ * in \a count.
+ * Returns 0 on success, or -ENOENT when \a ni has no such peer.
+ */
+int
+kiblnd_get_peer_info (lnet_ni_t *ni, int index,
+		      lnet_nid_t *nidp, int *count)
+{
+	kib_peer_t	*peer;
+	unsigned long	flags;
+	int		rc = -ENOENT;
+	int		i;
+
+	read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+	for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) {
+		list_for_each_entry(peer, &kiblnd_data.kib_peers[i], ibp_list) {
+			LASSERT (peer->ibp_connecting > 0 ||
+				 peer->ibp_accepting > 0 ||
+				 !list_empty(&peer->ibp_conns));
+
+			/* skip other NIs' peers, then count down to the target */
+			if (peer->ibp_ni != ni || index-- > 0)
+				continue;
+
+			*nidp = peer->ibp_nid;
+			*count = atomic_read(&peer->ibp_refcount);
+			rc = 0;
+			goto out;
+		}
+	}
+
+out:
+	read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+	return rc;
+}
+
+/*
+ * Remove \a peer: with no connections it is unlinked directly, otherwise
+ * every connection is closed, and closing the last one unlinks the peer.
+ * NB the peer is unlinked on return, and might even be freed if the peer
+ * table had the last ref on it.
+ */
+void
+kiblnd_del_peer_locked (kib_peer_t *peer)
+{
+	kib_conn_t	*conn;
+	kib_conn_t	*next;
+
+	if (list_empty(&peer->ibp_conns)) {
+		kiblnd_unlink_peer_locked(peer);
+		return;
+	}
+
+	list_for_each_entry_safe(conn, next, &peer->ibp_conns, ibc_list)
+		kiblnd_close_conn_locked(conn, 0);
+}
+
+/*
+ * Delete \a ni's peer \a nid, or every peer of \a ni if \a nid is LNET_NID_ANY.
+ * Any transmits still queued on them go to kiblnd_txlist_done() with -EIO
+ * after the lock is dropped. Returns 0 if a peer matched, else -ENOENT.
+ */
+int
+kiblnd_del_peer (lnet_ni_t *ni, lnet_nid_t nid)
+{
+	LIST_HEAD	(zombies);
+	kib_peer_t	*peer;
+	kib_peer_t	*next;
+	unsigned long	flags;
+	int		first;
+	int		last;
+	int		i;
+	int		rc = -ENOENT;
+
+	write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+	/* a specific NID lives in exactly one hash chain; ANY scans them all */
+	first = 0;
+	last = kiblnd_data.kib_peer_hash_size - 1;
+	if (nid != LNET_NID_ANY)
+		first = last = kiblnd_nid2peerlist(nid) - kiblnd_data.kib_peers;
+
+	for (i = first; i <= last; i++) {
+		list_for_each_entry_safe(peer, next,
+					 &kiblnd_data.kib_peers[i], ibp_list) {
+			LASSERT (peer->ibp_connecting > 0 ||
+				 peer->ibp_accepting > 0 ||
+				 !list_empty(&peer->ibp_conns));
+
+			if (peer->ibp_ni != ni)
+				continue;
+			if (nid != LNET_NID_ANY && peer->ibp_nid != nid)
+				continue;
+
+			if (!list_empty(&peer->ibp_tx_queue)) {
+				LASSERT (list_empty(&peer->ibp_conns));
+				list_splice_init(&peer->ibp_tx_queue,
+						 &zombies);
+			}
+
+			kiblnd_del_peer_locked(peer);
+			rc = 0;	 /* matched something */
+		}
+	}
+
+	write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+	kiblnd_txlist_done(ni, &zombies, -EIO);
+	return rc;
+}
+
+/*
+ * Return the index'th connection (0-based, in hash-table then per-peer
+ * list order) among all peers of 'ni', or NULL if there are not that
+ * many.  A reference is taken on the returned connection via
+ * kiblnd_conn_addref(); the caller has to drop it.
+ */
+kib_conn_t *
+kiblnd_get_conn_by_idx(lnet_ni_t *ni, int index)
+{
+	kib_conn_t	*found = NULL;
+	kib_conn_t	*conn;
+	kib_peer_t	*peer;
+	unsigned long	 irqflags;
+	int		 bucket;
+
+	read_lock_irqsave(&kiblnd_data.kib_global_lock, irqflags);
+
+	for (bucket = 0; bucket < kiblnd_data.kib_peer_hash_size; bucket++) {
+		list_for_each_entry(peer, &kiblnd_data.kib_peers[bucket],
+				    ibp_list) {
+			LASSERT(peer->ibp_connecting > 0 ||
+				peer->ibp_accepting > 0 ||
+				!list_empty(&peer->ibp_conns));
+
+			if (peer->ibp_ni != ni)
+				continue;
+
+			list_for_each_entry(conn, &peer->ibp_conns, ibc_list) {
+				if (index-- > 0)
+					continue;
+
+				kiblnd_conn_addref(conn);
+				found = conn;
+				goto out;
+			}
+		}
+	}
+
+out:
+	read_unlock_irqrestore(&kiblnd_data.kib_global_lock, irqflags);
+	return found;
+}
+
+/*
+ * Emit one D_CONSOLE debug line describing receive descriptor 'rx':
+ * its address, status, and the type/credits of its message buffer.
+ */
+void
+kiblnd_debug_rx(kib_rx_t *rx)
+{
+	kib_msg_t *msg = rx->rx_msg;
+
+	CDEBUG(D_CONSOLE, "      %p status %d msg_type %x cred %d\n",
+	       rx, rx->rx_status, msg->ibm_type, msg->ibm_credits);
+}
+
+/*
+ * Emit one D_CONSOLE debug line describing transmit descriptor 'tx'.
+ * Each of the two tx_lntmsg[] slots is shown as "!" when set and "-"
+ * when NULL.
+ */
+void
+kiblnd_debug_tx(kib_tx_t *tx)
+{
+	const char *lnt0 = (tx->tx_lntmsg[0] == NULL) ? "-" : "!";
+	const char *lnt1 = (tx->tx_lntmsg[1] == NULL) ? "-" : "!";
+	kib_msg_t  *msg  = tx->tx_msg;
+
+	CDEBUG(D_CONSOLE, "      %p snd %d q %d w %d rc %d dl %lx "
+	       "cookie "LPX64" msg %s%s type %x cred %d\n",
+	       tx, tx->tx_sending, tx->tx_queued, tx->tx_waiting,
+	       tx->tx_status, tx->tx_deadline, tx->tx_cookie,
+	       lnt0, lnt1,
+	       msg->ibm_type, msg->ibm_credits);
+}
+
+/*
+ * Dump the state of 'conn' at D_CONSOLE level: a summary, then every
+ * rx/tx on each of its queues, then all of its receive descriptors.
+ * The whole dump is done with conn->ibc_lock held.
+ */
+void
+kiblnd_debug_conn(kib_conn_t *conn)
+{
+	kib_rx_t	*rx;
+	kib_tx_t	*tx;
+	int		 n;
+
+	spin_lock(&conn->ibc_lock);
+
+	CDEBUG(D_CONSOLE, "conn[%d] %p [version %x] -> %s: \n",
+	       atomic_read(&conn->ibc_refcount), conn,
+	       conn->ibc_version, libcfs_nid2str(conn->ibc_peer->ibp_nid));
+	CDEBUG(D_CONSOLE, "   state %d nposted %d/%d cred %d o_cred %d r_cred %d\n",
+	       conn->ibc_state, conn->ibc_noops_posted,
+	       conn->ibc_nsends_posted, conn->ibc_credits,
+	       conn->ibc_outstanding_credits, conn->ibc_reserved_credits);
+	CDEBUG(D_CONSOLE, "   comms_err %d\n", conn->ibc_comms_error);
+
+	CDEBUG(D_CONSOLE, "   early_rxs:\n");
+	list_for_each_entry(rx, &conn->ibc_early_rxs, rx_list)
+		kiblnd_debug_rx(rx);
+
+	CDEBUG(D_CONSOLE, "   tx_noops:\n");
+	list_for_each_entry(tx, &conn->ibc_tx_noops, tx_list)
+		kiblnd_debug_tx(tx);
+
+	CDEBUG(D_CONSOLE, "   tx_queue_nocred:\n");
+	list_for_each_entry(tx, &conn->ibc_tx_queue_nocred, tx_list)
+		kiblnd_debug_tx(tx);
+
+	CDEBUG(D_CONSOLE, "   tx_queue_rsrvd:\n");
+	list_for_each_entry(tx, &conn->ibc_tx_queue_rsrvd, tx_list)
+		kiblnd_debug_tx(tx);
+
+	CDEBUG(D_CONSOLE, "   tx_queue:\n");
+	list_for_each_entry(tx, &conn->ibc_tx_queue, tx_list)
+		kiblnd_debug_tx(tx);
+
+	CDEBUG(D_CONSOLE, "   active_txs:\n");
+	list_for_each_entry(tx, &conn->ibc_active_txs, tx_list)
+		kiblnd_debug_tx(tx);
+
+	CDEBUG(D_CONSOLE, "   rxs:\n");
+	for (n = 0; n < IBLND_RX_MSGS(conn->ibc_version); n++)
+		kiblnd_debug_rx(&conn->ibc_rxs[n]);
+
+	spin_unlock(&conn->ibc_lock);
+}
+
+/*
+ * Convert an MTU given in bytes to the matching IB_MTU_* value.
+ * 0 is passed through unchanged; any size other than 256, 512, 1024,
+ * 2048 or 4096 yields -1.
+ */
+int
+kiblnd_translate_mtu(int value)
+{
+	int mtu;
+
+	switch (value) {
+	case 0:
+		mtu = 0;
+		break;
+	case 256:
+		mtu = IB_MTU_256;
+		break;
+	case 512:
+		mtu = IB_MTU_512;
+		break;
+	case 1024:
+		mtu = IB_MTU_1024;
+		break;
+	case 2048:
+		mtu = IB_MTU_2048;
+		break;
+	case 4096:
+		mtu = IB_MTU_4096;
+		break;
+	default:
+		mtu = -1;
+		break;
+	}
+
+	return mtu;
+}
+
+/*
+ * Apply the kib_ib_mtu tunable to the path record of 'cmid'.  Nothing
+ * is changed when there is no path record or when the tunable
+ * translates to 0.  The tunable must translate to a valid MTU
+ * (asserted).
+ */
+static void
+kiblnd_setup_mtu_locked(struct rdma_cm_id *cmid)
+{
+	int	ib_mtu;
+
+	/* XXX There is no path record for iWARP, set by netdev->change_mtu? */
+	if (cmid->route.path_rec != NULL) {
+		ib_mtu = kiblnd_translate_mtu(*kiblnd_tunables.kib_ib_mtu);
+		LASSERT(ib_mtu >= 0);
+
+		if (ib_mtu != 0)
+			cmid->route.path_rec->mtu = ib_mtu;
+	}
+}
+
+/*
+ * Choose a completion vector for the CQ of 'conn'.  Devices with at
+ * most one vector always get vector 0.  Otherwise the peer NID is
+ * hashed (modulo the CPU count of partition 'cpt') onto one CPU of that
+ * partition, and that CPU id modulo the vector count is returned.
+ */
+static int
+kiblnd_get_completion_vector(kib_conn_t *conn, int cpt)
+{
+	lnet_nid_t	 hash = conn->ibc_peer->ibp_nid;
+	int		 nvec = conn->ibc_cmid->device->num_comp_vectors;
+	cpumask_t	*cpus;
+	int		 skip;
+	int		 cpu;
+
+	if (nvec <= 1)
+		return 0;
+
+	cpus = cfs_cpt_cpumask(lnet_cpt_table(), cpt);
+
+	/* hash NID to CPU id in this partition...
+	 * do_div() yields the remainder, i.e. how many CPUs to skip */
+	skip = do_div(hash, cpus_weight(*cpus));
+	for_each_cpu_mask(cpu, *cpus) {
+		if (skip-- == 0)
+			return cpu % nvec;
+	}
+
+	/* the remainder is always below the CPU count, so not reached */
+	LBUG();
+	return 1;
+}
+
+/*
+ * Allocate and set up a connection to 'peer' on top of 'cmid'.
+ *
+ * Steps, in order: allocate the QP attributes and the conn on the CPT
+ * derived from the peer NID; under the global write lock check the
+ * device is not failing over and still matches cmid->device, take a ref
+ * on the HCA device and apply the MTU tunable; allocate and map the RX
+ * descriptors; create the CQ and request notification; create the QP;
+ * finally post all receive buffers.
+ *
+ * 'state' must be IBLND_CONN_ACTIVE_CONNECT or IBLND_CONN_PASSIVE_WAIT
+ * (asserted on success) and becomes the conn's state.
+ *
+ * Returns the new conn, or NULL on any failure.
+ */
+kib_conn_t *
+kiblnd_create_conn(kib_peer_t *peer, struct rdma_cm_id *cmid,
+		   int state, int version)
+{
+	/* CAVEAT EMPTOR:
+	 * If the new conn is created successfully it takes over the caller's
+	 * ref on 'peer'.  It also "owns" 'cmid' and destroys it when it itself
+	 * is destroyed.  On failure, the caller's ref on 'peer' remains and
+	 * she must dispose of 'cmid'.  (Actually I'd block forever if I tried
+	 * to destroy 'cmid' here since I'm called from the CM which still has
+	 * its ref on 'cmid'). */
+	rwlock_t		*glock = &kiblnd_data.kib_global_lock;
+	kib_net_t	      *net = peer->ibp_ni->ni_data;
+	kib_dev_t	      *dev;
+	struct ib_qp_init_attr *init_qp_attr;
+	struct kib_sched_info	*sched;
+	kib_conn_t		*conn;
+	struct ib_cq		*cq;
+	unsigned long		flags;
+	int			cpt;
+	int			rc;
+	int			i;
+
+	LASSERT(net != NULL);
+	LASSERT(!in_interrupt());
+
+	dev = net->ibn_dev;
+
+	/* everything for this conn is allocated on the peer NID's CPT */
+	cpt = lnet_cpt_of_nid(peer->ibp_nid);
+	sched = kiblnd_data.kib_scheds[cpt];
+
+	LASSERT(sched->ibs_nthreads > 0);
+
+	LIBCFS_CPT_ALLOC(init_qp_attr, lnet_cpt_table(), cpt,
+			 sizeof(*init_qp_attr));
+	if (init_qp_attr == NULL) {
+		CERROR("Can't allocate qp_attr for %s\n",
+		       libcfs_nid2str(peer->ibp_nid));
+		goto failed_0;
+	}
+
+	LIBCFS_CPT_ALLOC(conn, lnet_cpt_table(), cpt, sizeof(*conn));
+	if (conn == NULL) {
+		CERROR("Can't allocate connection for %s\n",
+		       libcfs_nid2str(peer->ibp_nid));
+		goto failed_1;
+	}
+
+	/* state stays IBLND_CONN_INIT until setup has fully succeeded;
+	 * kiblnd_destroy_conn() keys off it to leave 'peer'/'cmid' alone */
+	conn->ibc_state = IBLND_CONN_INIT;
+	conn->ibc_version = version;
+	conn->ibc_peer = peer;		  /* I take the caller's ref */
+	cmid->context = conn;		   /* for future CM callbacks */
+	conn->ibc_cmid = cmid;
+
+	INIT_LIST_HEAD(&conn->ibc_early_rxs);
+	INIT_LIST_HEAD(&conn->ibc_tx_noops);
+	INIT_LIST_HEAD(&conn->ibc_tx_queue);
+	INIT_LIST_HEAD(&conn->ibc_tx_queue_rsrvd);
+	INIT_LIST_HEAD(&conn->ibc_tx_queue_nocred);
+	INIT_LIST_HEAD(&conn->ibc_active_txs);
+	spin_lock_init(&conn->ibc_lock);
+
+	LIBCFS_CPT_ALLOC(conn->ibc_connvars, lnet_cpt_table(), cpt,
+			 sizeof(*conn->ibc_connvars));
+	if (conn->ibc_connvars == NULL) {
+		CERROR("Can't allocate in-progress connection state\n");
+		goto failed_2;
+	}
+
+	/* device checks and the hdev ref are done under the global lock */
+	write_lock_irqsave(glock, flags);
+	if (dev->ibd_failover) {
+		write_unlock_irqrestore(glock, flags);
+		CERROR("%s: failover in progress\n", dev->ibd_ifname);
+		goto failed_2;
+	}
+
+	if (dev->ibd_hdev->ibh_ibdev != cmid->device) {
+		/* wakeup failover thread and teardown connection */
+		if (kiblnd_dev_can_failover(dev)) {
+			list_add_tail(&dev->ibd_fail_list,
+				      &kiblnd_data.kib_failed_devs);
+			wake_up(&kiblnd_data.kib_failover_waitq);
+		}
+
+		write_unlock_irqrestore(glock, flags);
+		CERROR("cmid HCA(%s), kib_dev(%s) need failover\n",
+		       cmid->device->name, dev->ibd_ifname);
+		goto failed_2;
+	}
+
+	kiblnd_hdev_addref_locked(dev->ibd_hdev);
+	conn->ibc_hdev = dev->ibd_hdev;
+
+	kiblnd_setup_mtu_locked(cmid);
+
+	write_unlock_irqrestore(glock, flags);
+
+	LIBCFS_CPT_ALLOC(conn->ibc_rxs, lnet_cpt_table(), cpt,
+			 IBLND_RX_MSGS(version) * sizeof(kib_rx_t));
+	if (conn->ibc_rxs == NULL) {
+		CERROR("Cannot allocate RX buffers\n");
+		goto failed_2;
+	}
+
+	rc = kiblnd_alloc_pages(&conn->ibc_rx_pages, cpt,
+				IBLND_RX_MSG_PAGES(version));
+	if (rc != 0)
+		goto failed_2;
+
+	kiblnd_map_rx_descs(conn);
+
+	cq = ib_create_cq(cmid->device,
+			  kiblnd_cq_completion, kiblnd_cq_event, conn,
+			  IBLND_CQ_ENTRIES(version),
+			  kiblnd_get_completion_vector(conn, cpt));
+	if (IS_ERR(cq)) {
+		CERROR("Can't create CQ: %ld, cqe: %d\n",
+		       PTR_ERR(cq), IBLND_CQ_ENTRIES(version));
+		goto failed_2;
+	}
+
+	conn->ibc_cq = cq;
+
+	rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+	if (rc != 0) {
+		CERROR("Can't request completion notificiation: %d\n", rc);
+		goto failed_2;
+	}
+
+	/* one CQ serves both the send and the receive side of the QP */
+	init_qp_attr->event_handler = kiblnd_qp_event;
+	init_qp_attr->qp_context = conn;
+	init_qp_attr->cap.max_send_wr = IBLND_SEND_WRS(version);
+	init_qp_attr->cap.max_recv_wr = IBLND_RECV_WRS(version);
+	init_qp_attr->cap.max_send_sge = 1;
+	init_qp_attr->cap.max_recv_sge = 1;
+	init_qp_attr->sq_sig_type = IB_SIGNAL_REQ_WR;
+	init_qp_attr->qp_type = IB_QPT_RC;
+	init_qp_attr->send_cq = cq;
+	init_qp_attr->recv_cq = cq;
+
+	conn->ibc_sched = sched;
+
+	rc = rdma_create_qp(cmid, conn->ibc_hdev->ibh_pd, init_qp_attr);
+	if (rc != 0) {
+		CERROR("Can't create QP: %d, send_wr: %d, recv_wr: %d\n",
+		       rc, init_qp_attr->cap.max_send_wr,
+		       init_qp_attr->cap.max_recv_wr);
+		goto failed_2;
+	}
+
+	/* the attributes are not needed once the QP exists; from here on
+	 * failures must not go through failed_1/failed_2 */
+	LIBCFS_FREE(init_qp_attr, sizeof(*init_qp_attr));
+
+	/* 1 ref for caller and each rxmsg */
+	atomic_set(&conn->ibc_refcount, 1 + IBLND_RX_MSGS(version));
+	conn->ibc_nrx = IBLND_RX_MSGS(version);
+
+	/* post receives */
+	for (i = 0; i < IBLND_RX_MSGS(version); i++) {
+		rc = kiblnd_post_rx(&conn->ibc_rxs[i],
+				    IBLND_POSTRX_NO_CREDIT);
+		if (rc != 0) {
+			CERROR("Can't post rxmsg: %d\n", rc);
+
+			/* Make posted receives complete */
+			kiblnd_abort_receives(conn);
+
+			/* correct # of posted buffers
+			 * NB locking needed now I'm racing with completion */
+			spin_lock_irqsave(&sched->ibs_lock, flags);
+			conn->ibc_nrx -= IBLND_RX_MSGS(version) - i;
+			spin_unlock_irqrestore(&sched->ibs_lock, flags);
+
+			/* cmid will be destroyed by CM(ofed) after cm_callback
+			 * returned, so we can't refer it anymore
+			 * (by kiblnd_connd()->kiblnd_destroy_conn) */
+			rdma_destroy_qp(conn->ibc_cmid);
+			conn->ibc_cmid = NULL;
+
+			/* Drop my own and unused rxbuffer refcounts
+			 * (rx 0..i-1 were posted; this loop runs
+			 * IBLND_RX_MSGS(version) - i + 1 times: one per
+			 * rx never posted plus one for the caller's ref) */
+			while (i++ <= IBLND_RX_MSGS(version))
+				kiblnd_conn_decref(conn);
+
+			return NULL;
+		}
+	}
+
+	/* Init successful! */
+	LASSERT (state == IBLND_CONN_ACTIVE_CONNECT ||
+		 state == IBLND_CONN_PASSIVE_WAIT);
+	conn->ibc_state = state;
+
+	/* 1 more conn */
+	atomic_inc(&net->ibn_nconns);
+	return conn;
+
+	/* error unwinding: failed_2 tears down whatever the conn has
+	 * acquired so far, then falls through to free the QP attributes */
+ failed_2:
+	kiblnd_destroy_conn(conn);
+ failed_1:
+	LIBCFS_FREE(init_qp_attr, sizeof(*init_qp_attr));
+ failed_0:
+	return NULL;
+}
+
+/*
+ * Release everything owned by 'conn' and free it.
+ *
+ * The conn must be idle: zero refcount, all queues empty, nothing
+ * posted, and in state IBLND_CONN_INIT or IBLND_CONN_DISCONNECTED.
+ * A conn still in IBLND_CONN_INIT never took over the peer ref or the
+ * cmid (see CAVEAT EMPTOR in kiblnd_create_conn), so those are only
+ * released for other states.
+ */
+void
+kiblnd_destroy_conn(kib_conn_t *conn)
+{
+	struct rdma_cm_id *id = conn->ibc_cmid;
+	kib_peer_t	  *owner = conn->ibc_peer;
+
+	LASSERT(!in_interrupt());
+	LASSERT(atomic_read(&conn->ibc_refcount) == 0);
+	LASSERT(list_empty(&conn->ibc_early_rxs));
+	LASSERT(list_empty(&conn->ibc_tx_noops));
+	LASSERT(list_empty(&conn->ibc_tx_queue));
+	LASSERT(list_empty(&conn->ibc_tx_queue_rsrvd));
+	LASSERT(list_empty(&conn->ibc_tx_queue_nocred));
+	LASSERT(list_empty(&conn->ibc_active_txs));
+	LASSERT(conn->ibc_noops_posted == 0);
+	LASSERT(conn->ibc_nsends_posted == 0);
+
+	if (conn->ibc_state != IBLND_CONN_INIT) {
+		/* conn must be completely disengaged from the network */
+		if (conn->ibc_state != IBLND_CONN_DISCONNECTED)
+			LBUG();
+
+		/* connvars should have been freed already */
+		LASSERT(conn->ibc_connvars == NULL);
+	}
+
+	/* conn->ibc_cmid might be destroyed by CM already */
+	if (id != NULL && id->qp != NULL)
+		rdma_destroy_qp(id);
+
+	if (conn->ibc_cq != NULL) {
+		int rc = ib_destroy_cq(conn->ibc_cq);
+
+		if (rc != 0)
+			CWARN("Error destroying CQ: %d\n", rc);
+	}
+
+	if (conn->ibc_rx_pages != NULL)
+		kiblnd_unmap_rx_descs(conn);
+
+	if (conn->ibc_rxs != NULL)
+		LIBCFS_FREE(conn->ibc_rxs,
+			    IBLND_RX_MSGS(conn->ibc_version) * sizeof(kib_rx_t));
+
+	if (conn->ibc_connvars != NULL)
+		LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars));
+
+	if (conn->ibc_hdev != NULL)
+		kiblnd_hdev_decref(conn->ibc_hdev);
+
+	/* See CAVEAT EMPTOR above in kiblnd_create_conn */
+	if (conn->ibc_state != IBLND_CONN_INIT) {
+		kib_net_t *net = owner->ibp_ni->ni_data;
+
+		kiblnd_peer_decref(owner);
+		rdma_destroy_id(id);
+		atomic_dec(&net->ibn_nconns);
+	}
+
+	LIBCFS_FREE(conn, sizeof(*conn));
+}
+
+/*
+ * Close every connection of 'peer', passing 'why' as the reason.
+ * Returns the number of connections closed.
+ */
+int
+kiblnd_close_peer_conns_locked(kib_peer_t *peer, int why)
+{
+	kib_conn_t	*conn;
+	kib_conn_t	*next;
+	int		 nclosed = 0;
+
+	list_for_each_entry_safe(conn, next, &peer->ibp_conns, ibc_list) {
+		CDEBUG(D_NET, "Closing conn -> %s, "
+			      "version: %x, reason: %d\n",
+		       libcfs_nid2str(peer->ibp_nid),
+		       conn->ibc_version, why);
+
+		kiblnd_close_conn_locked(conn, why);
+		nclosed++;
+	}
+
+	return nclosed;
+}
+
+/*
+ * Close (with -ESTALE) every connection of 'peer' whose version or
+ * incarnation differs from the given pair.  Connections matching both
+ * are left alone.  Returns the number of connections closed.
+ */
+int
+kiblnd_close_stale_conns_locked(kib_peer_t *peer,
+				int version, __u64 incarnation)
+{
+	kib_conn_t	*conn;
+	kib_conn_t	*next;
+	int		 nclosed = 0;
+
+	list_for_each_entry_safe(conn, next, &peer->ibp_conns, ibc_list) {
+		int current_conn = conn->ibc_version == version &&
+				   conn->ibc_incarnation == incarnation;
+
+		if (current_conn)
+			continue;
+
+		CDEBUG(D_NET, "Closing stale conn -> %s version: %x, "
+			      "incarnation:"LPX64"(%x, "LPX64")\n",
+		       libcfs_nid2str(peer->ibp_nid),
+		       conn->ibc_version, conn->ibc_incarnation,
+		       version, incarnation);
+
+		kiblnd_close_conn_locked(conn, -ESTALE);
+		nclosed++;
+	}
+
+	return nclosed;
+}
+
+/*
+ * Close all connections to the peer 'nid' on 'ni', or to every peer of
+ * 'ni' when 'nid' is LNET_NID_ANY.
+ *
+ * Returns 0 for a wildcard request, or when at least one connection was
+ * closed; -ENOENT when a specific NID matched no connection.
+ */
+int
+kiblnd_close_matching_conns(lnet_ni_t *ni, lnet_nid_t nid)
+{
+	kib_peer_t	*peer;
+	kib_peer_t	*next;
+	unsigned long	 irqflags;
+	int		 first;
+	int		 last;
+	int		 bucket;
+	int		 nclosed = 0;
+
+	write_lock_irqsave(&kiblnd_data.kib_global_lock, irqflags);
+
+	if (nid == LNET_NID_ANY) {
+		first = 0;
+		last = kiblnd_data.kib_peer_hash_size - 1;
+	} else {
+		first = kiblnd_nid2peerlist(nid) - kiblnd_data.kib_peers;
+		last = first;
+	}
+
+	for (bucket = first; bucket <= last; bucket++) {
+		list_for_each_entry_safe(peer, next,
+					 &kiblnd_data.kib_peers[bucket],
+					 ibp_list) {
+			LASSERT(peer->ibp_connecting > 0 ||
+				peer->ibp_accepting > 0 ||
+				!list_empty(&peer->ibp_conns));
+
+			if (peer->ibp_ni != ni)
+				continue;
+
+			if (nid != LNET_NID_ANY && nid != peer->ibp_nid)
+				continue;
+
+			nclosed += kiblnd_close_peer_conns_locked(peer, 0);
+		}
+	}
+
+	write_unlock_irqrestore(&kiblnd_data.kib_global_lock, irqflags);
+
+	/* wildcards always succeed */
+	if (nid != LNET_NID_ANY && nclosed == 0)
+		return -ENOENT;
+
+	return 0;
+}
+
+/*
+ * ioctl-style control entry point for 'ni'.  'arg' is a
+ * struct libcfs_ioctl_data used for both input and output:
+ *
+ *   IOC_LIBCFS_GET_PEER         ioc_count = index in; NID and refcount out
+ *   IOC_LIBCFS_DEL_PEER         delete the peer(s) matching ioc_nid
+ *   IOC_LIBCFS_GET_CONN         ioc_count = index in; NID and path MTU out
+ *   IOC_LIBCFS_CLOSE_CONNECTION close the conns matching ioc_nid
+ *
+ * Returns the handler's result, or -EINVAL for an unknown command.
+ */
+int
+kiblnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg)
+{
+	struct libcfs_ioctl_data *data = arg;
+	int			  rc = -EINVAL;
+
+	switch (cmd) {
+	case IOC_LIBCFS_GET_PEER: {
+		lnet_nid_t	peer_nid = 0;
+		int		refs = 0;
+
+		rc = kiblnd_get_peer_info(ni, data->ioc_count,
+					  &peer_nid, &refs);
+		/* outputs are written back even when the lookup failed */
+		data->ioc_nid   = peer_nid;
+		data->ioc_count = refs;
+		break;
+	}
+
+	case IOC_LIBCFS_DEL_PEER:
+		rc = kiblnd_del_peer(ni, data->ioc_nid);
+		break;
+
+	case IOC_LIBCFS_GET_CONN: {
+		kib_conn_t *conn = kiblnd_get_conn_by_idx(ni, data->ioc_count);
+
+		if (conn == NULL) {
+			rc = -ENOENT;
+			break;
+		}
+		rc = 0;
+
+		LASSERT(conn->ibc_cmid != NULL);
+		data->ioc_nid = conn->ibc_peer->ibp_nid;
+		if (conn->ibc_cmid->route.path_rec != NULL)
+			data->ioc_u32[0] =
+			ib_mtu_enum_to_int(conn->ibc_cmid->route.path_rec->mtu);
+		else
+			data->ioc_u32[0] = 0; /* iWarp has no path MTU */
+
+		/* drop the ref taken by kiblnd_get_conn_by_idx() */
+		kiblnd_conn_decref(conn);
+		break;
+	}
+
+	case IOC_LIBCFS_CLOSE_CONNECTION:
+		rc = kiblnd_close_matching_conns(ni, data->ioc_nid);
+		break;
+
+	default:
+		break;
+	}
+
+	return rc;
+}
+
+/*
+ * Report when peer 'nid' was last known alive.  *when is only written
+ * if the peer exists and has a non-zero ibp_last_alive.  If the peer is
+ * not in the table at all, a connection attempt is started by launching
+ * a NULL tx to it.
+ */
+void
+kiblnd_query(lnet_ni_t *ni, lnet_nid_t nid, cfs_time_t *when)
+{
+	rwlock_t	*glock = &kiblnd_data.kib_global_lock;
+	cfs_time_t	 now = cfs_time_current();
+	cfs_time_t	 alive_at = 0;
+	kib_peer_t	*peer;
+	unsigned long	 irqflags;
+
+	read_lock_irqsave(glock, irqflags);
+
+	peer = kiblnd_find_peer_locked(nid);
+	if (peer != NULL) {
+		LASSERT(peer->ibp_connecting > 0 || /* creating conns */
+			peer->ibp_accepting > 0 ||
+			!list_empty(&peer->ibp_conns));  /* active conn */
+		alive_at = peer->ibp_last_alive;
+	}
+
+	read_unlock_irqrestore(glock, irqflags);
+
+	if (alive_at != 0)
+		*when = alive_at;
+
+	/* peer is not persistent in hash, trigger peer creation
+	 * and connection establishment with a NULL tx */
+	if (peer == NULL)
+		kiblnd_launch_tx(ni, NULL, nid);
+
+	/* 'peer' is only printed as a pointer value here, never followed */
+	CDEBUG(D_NET, "Peer %s %p, alive %ld secs ago\n",
+	       libcfs_nid2str(nid), peer,
+	       alive_at ? cfs_duration_sec(now - alive_at) : -1);
+}
+
+/*
+ * Free a page descriptor built by kiblnd_alloc_pages(): every non-NULL
+ * page it references, then the descriptor itself.  NULL slots are
+ * skipped, so a partially filled descriptor is fine.
+ */
+void
+kiblnd_free_pages(kib_pages_t *p)
+{
+	int		 npages = p->ibp_npages;
+	struct page	*pg;
+	int		 n;
+
+	for (n = 0; n < npages; n++) {
+		pg = p->ibp_pages[n];
+		if (pg != NULL)
+			__free_page(pg);
+	}
+
+	/* the descriptor was sized for npages trailing page pointers */
+	LIBCFS_FREE(p, offsetof(kib_pages_t, ibp_pages[npages]));
+}
+
+/*
+ * Allocate a descriptor holding 'npages' single pages for CPT 'cpt' and
+ * return it through *pp.
+ *
+ * Returns 0 on success.  On failure returns -ENOMEM, frees whatever was
+ * allocated so far and leaves *pp untouched.
+ */
+int
+kiblnd_alloc_pages(kib_pages_t **pp, int cpt, int npages)
+{
+	kib_pages_t	*desc;
+	int		 n;
+
+	LIBCFS_CPT_ALLOC(desc, lnet_cpt_table(), cpt,
+			 offsetof(kib_pages_t, ibp_pages[npages]));
+	if (desc == NULL) {
+		CERROR("Can't allocate descriptor for %d pages\n", npages);
+		return -ENOMEM;
+	}
+
+	/* all slots start out NULL so a partial failure can be unwound */
+	memset(desc, 0, offsetof(kib_pages_t, ibp_pages[npages]));
+	desc->ibp_npages = npages;
+
+	for (n = 0; n < npages; n++) {
+		desc->ibp_pages[n] = alloc_pages_node(
+				    cfs_cpt_spread_node(lnet_cpt_table(), cpt),
+				    __GFP_IO, 0);
+		if (desc->ibp_pages[n] != NULL)
+			continue;
+
+		CERROR("Can't allocate page %d of %d\n", n, npages);
+		kiblnd_free_pages(desc);
+		return -ENOMEM;
+	}
+
+	*pp = desc;
+	return 0;
+}
+
+/*
+ * Undo kiblnd_map_rx_descs(): DMA-unmap the message buffer of every
+ * receive descriptor of 'conn', then free the backing pages and clear
+ * conn->ibc_rx_pages.  No receive may still be posted (asserted).
+ */
+void
+kiblnd_unmap_rx_descs(kib_conn_t *conn)
+{
+	int	n;
+
+	LASSERT(conn->ibc_rxs != NULL);
+	LASSERT(conn->ibc_hdev != NULL);
+
+	for (n = 0; n < IBLND_RX_MSGS(conn->ibc_version); n++) {
+		kib_rx_t *rx = &conn->ibc_rxs[n];
+
+		LASSERT(rx->rx_nob >= 0); /* not posted */
+
+		kiblnd_dma_unmap_single(conn->ibc_hdev->ibh_ibdev,
+					KIBLND_UNMAP_ADDR(rx, rx_msgunmap,
+							  rx->rx_msgaddr),
+					IBLND_MSG_SIZE, DMA_FROM_DEVICE);
+	}
+
+	kiblnd_free_pages(conn->ibc_rx_pages);
+	conn->ibc_rx_pages = NULL;
+}
+
+/*
+ * Carve the pages in conn->ibc_rx_pages into IBLND_MSG_SIZE slots, one
+ * per receive descriptor, and DMA-map each slot for device-to-memory
+ * transfers.  A mapping failure trips an assertion.
+ */
+void
+kiblnd_map_rx_descs(kib_conn_t *conn)
+{
+	int	offset = 0;	/* byte offset inside the current page */
+	int	pgidx = 0;	/* index of the current page */
+	int	n;
+
+	for (n = 0; n < IBLND_RX_MSGS(conn->ibc_version); n++) {
+		struct page *pg = conn->ibc_rx_pages->ibp_pages[pgidx];
+		kib_rx_t    *rx = &conn->ibc_rxs[n];
+
+		rx->rx_conn = conn;
+		rx->rx_msg = (kib_msg_t *)(((char *)page_address(pg)) + offset);
+
+		rx->rx_msgaddr = kiblnd_dma_map_single(conn->ibc_hdev->ibh_ibdev,
+						       rx->rx_msg, IBLND_MSG_SIZE,
+						       DMA_FROM_DEVICE);
+		LASSERT(!kiblnd_dma_mapping_error(conn->ibc_hdev->ibh_ibdev,
+						  rx->rx_msgaddr));
+		KIBLND_UNMAP_ADDR_SET(rx, rx_msgunmap, rx->rx_msgaddr);
+
+		CDEBUG(D_NET,"rx %d: %p "LPX64"("LPX64")\n",
+		       n, rx->rx_msg, rx->rx_msgaddr,
+		       lnet_page2phys(pg) + offset);
+
+		offset += IBLND_MSG_SIZE;
+		LASSERT(offset <= PAGE_SIZE);
+
+		/* page exhausted: continue at the start of the next one */
+		if (offset == PAGE_SIZE) {
+			offset = 0;
+			pgidx++;
+			LASSERT(pgidx <= IBLND_RX_MSG_PAGES(conn->ibc_version));
+		}
+	}
+}
+
+/*
+ * DMA-unmap the message buffer of every tx descriptor in 'tpo' and drop
+ * the pool's reference on its HCA device.  Does nothing if the pool has
+ * no device.  The pool must have no tx allocated (asserted).
+ */
+static void
+kiblnd_unmap_tx_pool(kib_tx_pool_t *tpo)
+{
+	kib_hca_dev_t  *hdev = tpo->tpo_hdev;
+	int		n;
+
+	LASSERT(tpo->tpo_pool.po_allocated == 0);
+
+	if (hdev == NULL)
+		return;
+
+	for (n = 0; n < tpo->tpo_pool.po_size; n++) {
+		kib_tx_t *tx = &tpo->tpo_tx_descs[n];
+
+		kiblnd_dma_unmap_single(hdev->ibh_ibdev,
+					KIBLND_UNMAP_ADDR(tx, tx_msgunmap,
+							  tx->tx_msgaddr),
+					IBLND_MSG_SIZE, DMA_TO_DEVICE);
+	}
+
+	kiblnd_hdev_decref(hdev);
+	tpo->tpo_hdev = NULL;
+}
+
+/*
+ * Return the HCA device currently attached to 'dev', with a reference
+ * taken for the caller (kiblnd_hdev_addref_locked()).
+ *
+ * While dev->ibd_failover is set the device is in flux, so wait for the
+ * flag to clear, re-checking about every 1/100 second with the global
+ * lock dropped in between.  A debug message is logged on the first and
+ * then every 50th wait.
+ */
+static kib_hca_dev_t *
+kiblnd_current_hdev(kib_dev_t *dev)
+{
+	kib_hca_dev_t *hdev;
+	unsigned long  flags;
+	int	    i = 0;
+
+	read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+	while (dev->ibd_failover) {
+		read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+		if (i++ % 50 == 0)
+			CDEBUG(D_NET, "%s: Wait for failover\n",
+			       dev->ibd_ifname);
+
+		/* schedule_timeout() only sleeps when the task state has
+		 * been changed from TASK_RUNNING beforehand; without this
+		 * the loop would spin on the lock instead of waiting */
+		set_current_state(TASK_INTERRUPTIBLE);
+		schedule_timeout(cfs_time_seconds(1) / 100);
+
+		read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+	}
+
+	/* failover flag is clear and the lock is held: ibd_hdev is stable */
+	kiblnd_hdev_addref_locked(dev->ibd_hdev);
+	hdev = dev->ibd_hdev;
+
+	read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+	return hdev;
+}
+
+/*
+ * Bind pool 'tpo' to the current HCA device of its net, carve its pages
+ * into IBLND_MSG_SIZE slots (one per tx descriptor), DMA-map each slot
+ * for memory-to-device transfers and put every descriptor on the pool's
+ * free list.  A mapping failure trips an assertion.
+ */
+static void
+kiblnd_map_tx_pool(kib_tx_pool_t *tpo)
+{
+	kib_pool_t     *pool  = &tpo->tpo_pool;
+	kib_pages_t    *pages = tpo->tpo_tx_pages;
+	kib_net_t      *net   = pool->po_owner->ps_net;
+	int		offset = 0;	/* byte offset inside current page */
+	int		pgidx = 0;	/* index of the current page */
+	int		n;
+
+	LASSERT(net != NULL);
+
+	/* pre-mapped messages are not bigger than 1 page */
+	CLASSERT(IBLND_MSG_SIZE <= PAGE_SIZE);
+
+	/* No fancy arithmetic when we do the buffer calculations */
+	CLASSERT(PAGE_SIZE % IBLND_MSG_SIZE == 0);
+
+	tpo->tpo_hdev = kiblnd_current_hdev(net->ibn_dev);
+
+	for (n = 0; n < pool->po_size; n++) {
+		struct page *pg = pages->ibp_pages[pgidx];
+		kib_tx_t    *tx = &tpo->tpo_tx_descs[n];
+
+		tx->tx_msg = (kib_msg_t *)(((char *)page_address(pg)) +
+					   offset);
+
+		tx->tx_msgaddr = kiblnd_dma_map_single(
+			tpo->tpo_hdev->ibh_ibdev, tx->tx_msg,
+			IBLND_MSG_SIZE, DMA_TO_DEVICE);
+		LASSERT(!kiblnd_dma_mapping_error(tpo->tpo_hdev->ibh_ibdev,
+						  tx->tx_msgaddr));
+		KIBLND_UNMAP_ADDR_SET(tx, tx_msgunmap, tx->tx_msgaddr);
+
+		list_add(&tx->tx_list, &pool->po_free_list);
+
+		offset += IBLND_MSG_SIZE;
+		LASSERT(offset <= PAGE_SIZE);
+
+		/* page exhausted: continue at the start of the next one */
+		if (offset == PAGE_SIZE) {
+			offset = 0;
+			pgidx++;
+			LASSERT(pgidx <= pages->ibp_npages);
+		}
+	}
+}
+
+/*
+ * Find the memory region of 'hdev' covering [addr, addr + size).
+ *
+ * With a single MR that one is always returned.  Otherwise the MR index
+ * is addr >> ibh_mr_shift, and the range only qualifies if its first
+ * and last byte map to the same, valid index.  Returns NULL when no
+ * single MR covers the range.
+ */
+struct ib_mr *
+kiblnd_find_dma_mr(kib_hca_dev_t *hdev, __u64 addr, __u64 size)
+{
+	__u64	first;
+	__u64	last;
+
+	LASSERT(hdev->ibh_mrs[0] != NULL);
+
+	if (hdev->ibh_nmrs == 1)
+		return hdev->ibh_mrs[0];
+
+	first = addr >> hdev->ibh_mr_shift;
+	last  = (addr + size - 1) >> hdev->ibh_mr_shift;
+
+	if (first >= hdev->ibh_nmrs || first != last)
+		return NULL;
+
+	return hdev->ibh_mrs[first];
+}
+
+/*
+ * Find one memory region of 'hdev' that covers every fragment of 'rd'.
+ *
+ * Returns NULL when the map_on_demand tunable is positive and does not
+ * exceed the fragment count, when any fragment has no covering MR, when
+ * the fragments fall into different MRs, or when 'rd' has no fragments
+ * (and the device has more than one MR).
+ */
+struct ib_mr *
+kiblnd_find_rd_dma_mr(kib_hca_dev_t *hdev, kib_rdma_desc_t *rd)
+{
+	struct ib_mr	*common = NULL;
+	struct ib_mr	*mr;
+	int		 n;
+
+	LASSERT(hdev->ibh_mrs[0] != NULL);
+
+	if (*kiblnd_tunables.kib_map_on_demand > 0 &&
+	    *kiblnd_tunables.kib_map_on_demand <= rd->rd_nfrags)
+		return NULL;
+
+	if (hdev->ibh_nmrs == 1)
+		return hdev->ibh_mrs[0];
+
+	for (n = 0; n < rd->rd_nfrags; n++) {
+		mr = kiblnd_find_dma_mr(hdev,
+					rd->rd_frags[n].rf_addr,
+					rd->rd_frags[n].rf_nob);
+		if (mr == NULL)
+			return NULL;
+
+		if (common == NULL)
+			common = mr;
+		else if (common != mr)
+			return NULL;	/* can't be covered by a single MR */
+	}
+
+	return common;
+}
+
+/*
+ * Destroy FMR pool 'pool': the underlying IB FMR pool (if any), the
+ * reference on the HCA device (if any), then the descriptor itself.
+ * The pool must have no mappings outstanding (asserted).
+ */
+void
+kiblnd_destroy_fmr_pool(kib_fmr_pool_t *pool)
+{
+	LASSERT(pool->fpo_map_count == 0);
+
+	if (pool->fpo_fmr_pool != NULL)
+		ib_destroy_fmr_pool(pool->fpo_fmr_pool);
+
+	if (pool->fpo_hdev != NULL)
+		kiblnd_hdev_decref(pool->fpo_hdev);
+
+	LIBCFS_FREE(pool, sizeof(kib_fmr_pool_t));
+}
+
+/*
+ * Destroy every FMR pool linked on 'head', leaving the list empty.
+ */
+void
+kiblnd_destroy_fmr_pool_list(struct list_head *head)
+{
+	kib_fmr_pool_t *victim;
+
+	for (;;) {
+		if (list_empty(head))
+			break;
+
+		/* always take the first remaining pool */
+		victim = list_entry(head->next, kib_fmr_pool_t, fpo_list);
+		list_del(&victim->fpo_list);
+		kiblnd_destroy_fmr_pool(victim);
+	}
+}
+
+/*
+ * Per-partition FMR pool size: the kib_fmr_pool_size tunable divided by
+ * the number of CPU partitions, but never less than IBLND_FMR_POOL.
+ */
+static int kiblnd_fmr_pool_size(int ncpts)
+{
+	int per_cpt;
+
+	per_cpt = *kiblnd_tunables.kib_fmr_pool_size / ncpts;
+	return max(IBLND_FMR_POOL, per_cpt);
+}
+
+/*
+ * Per-partition FMR flush trigger: the kib_fmr_flush_trigger tunable
+ * divided by the number of CPU partitions, but never less than
+ * IBLND_FMR_POOL_FLUSH.
+ */
+static int kiblnd_fmr_flush_trigger(int ncpts)
+{
+	int per_cpt;
+
+	per_cpt = *kiblnd_tunables.kib_fmr_flush_trigger / ncpts;
+	return max(IBLND_FMR_POOL_FLUSH, per_cpt);
+}
+
+/*
+ * Create a new FMR pool for pool set 'fps' on the net's current HCA
+ * device and hand it back through *pp_fpo.  The new pool is not linked
+ * onto any list here.
+ *
+ * Returns 0 on success, -ENOMEM if the descriptor cannot be allocated,
+ * or the error reported by ib_create_fmr_pool() (everything is released
+ * again in that case and *pp_fpo is left untouched).
+ */
+int
+kiblnd_create_fmr_pool(kib_fmr_poolset_t *fps, kib_fmr_pool_t **pp_fpo)
+{
+	/* FMR pool for RDMA */
+	struct ib_fmr_pool_param pool_param = {
+		.max_pages_per_fmr = LNET_MAX_PAYLOAD/PAGE_SIZE,
+		.page_shift	   = PAGE_SHIFT,
+		.access		   = (IB_ACCESS_LOCAL_WRITE |
+				      IB_ACCESS_REMOTE_WRITE),
+		.pool_size	   = fps->fps_pool_size,
+		.dirty_watermark   = fps->fps_flush_trigger,
+		.flush_function	   = NULL,
+		.flush_arg	   = NULL,
+		.cache		   = !!*kiblnd_tunables.kib_fmr_cache};
+	kib_dev_t	*dev = fps->fps_net->ibn_dev;
+	kib_fmr_pool_t	*pool;
+	int		 rc;
+
+	LIBCFS_CPT_ALLOC(pool, lnet_cpt_table(), fps->fps_cpt, sizeof(*pool));
+	if (pool == NULL)
+		return -ENOMEM;
+
+	pool->fpo_hdev = kiblnd_current_hdev(dev);
+
+	pool->fpo_fmr_pool = ib_create_fmr_pool(pool->fpo_hdev->ibh_pd,
+						&pool_param);
+	if (IS_ERR(pool->fpo_fmr_pool)) {
+		rc = PTR_ERR(pool->fpo_fmr_pool);
+		CERROR("Failed to create FMR pool: %d\n", rc);
+
+		/* undo the hdev ref and the allocation */
+		kiblnd_hdev_decref(pool->fpo_hdev);
+		LIBCFS_FREE(pool, sizeof(kib_fmr_pool_t));
+		return rc;
+	}
+
+	pool->fpo_owner    = fps;
+	pool->fpo_deadline = cfs_time_shift(IBLND_POOL_DEADLINE);
+	*pp_fpo = pool;
+
+	return 0;
+}
+
+/*
+ * Mark every pool of 'fps' as failed and take it off the active list.
+ * Pools with no mappings outstanding are moved to 'zombies'; pools
+ * still in use are parked on the set's failed list.  A pool set that
+ * was never initialized (fps_net == NULL) is ignored.
+ */
+static void
+kiblnd_fail_fmr_poolset(kib_fmr_poolset_t *fps, struct list_head *zombies)
+{
+	kib_fmr_pool_t	 *pool;
+	struct list_head *dest;
+
+	if (fps->fps_net == NULL) /* initialized? */
+		return;
+
+	spin_lock(&fps->fps_lock);
+
+	while (!list_empty(&fps->fps_pool_list)) {
+		pool = list_entry(fps->fps_pool_list.next,
+				  kib_fmr_pool_t, fpo_list);
+		pool->fpo_failed = 1;
+
+		/* idle pools can be destroyed right away by the caller */
+		dest = (pool->fpo_map_count == 0) ?
+		       zombies : &fps->fps_failed_pool_list;
+		list_move(&pool->fpo_list, dest);
+	}
+
+	spin_unlock(&fps->fps_lock);
+}
+
+/*
+ * Destroy all pools of 'fps', failed ones first and then the active
+ * ones.  A pool set that was never initialized (fps_net == NULL) is
+ * ignored.
+ */
+static void
+kiblnd_fini_fmr_poolset(kib_fmr_poolset_t *fps)
+{
+	if (fps->fps_net == NULL) /* initialized? */
+		return;
+
+	kiblnd_destroy_fmr_pool_list(&fps->fps_failed_pool_list);
+	kiblnd_destroy_fmr_pool_list(&fps->fps_pool_list);
+}
+
+/*
+ * Initialize FMR pool set 'fps' for partition 'cpt' of 'net' and create
+ * its first pool.
+ *
+ * Returns 0 on success or the error from kiblnd_create_fmr_pool().  The
+ * set itself stays initialized (with empty lists) on failure.
+ */
+static int
+kiblnd_init_fmr_poolset(kib_fmr_poolset_t *fps, int cpt, kib_net_t *net,
+			int pool_size, int flush_trigger)
+{
+	kib_fmr_pool_t *first;
+	int		rc;
+
+	memset(fps, 0, sizeof(kib_fmr_poolset_t));
+
+	fps->fps_net		= net;
+	fps->fps_cpt		= cpt;
+	fps->fps_pool_size	= pool_size;
+	fps->fps_flush_trigger	= flush_trigger;
+	spin_lock_init(&fps->fps_lock);
+	INIT_LIST_HEAD(&fps->fps_pool_list);
+	INIT_LIST_HEAD(&fps->fps_failed_pool_list);
+
+	rc = kiblnd_create_fmr_pool(fps, &first);
+	if (rc != 0)
+		return rc;
+
+	list_add_tail(&first->fpo_list, &fps->fps_pool_list);
+	return 0;
+}
+
+/*
+ * Tell whether FMR pool 'fpo' may be reclaimed at time 'now': it must
+ * have no mappings outstanding, and either be marked failed or have
+ * reached its deadline.  Returns non-zero if so.
+ */
+static int
+kiblnd_fmr_pool_is_idle(kib_fmr_pool_t *fpo, cfs_time_t now)
+{
+	/* still in use */
+	if (fpo->fpo_map_count != 0)
+		return 0;
+
+	return fpo->fpo_failed ? 1 : cfs_time_aftereq(now, fpo->fpo_deadline);
+}
+
+/*
+ * Release the mapping described by 'fmr' and drop its count on the
+ * owning pool.  A non-zero 'status' additionally flushes the IB FMR
+ * pool.  Afterwards every idle pool of the set, except the first one
+ * on the list, is taken off the list and destroyed outside the lock.
+ */
+void
+kiblnd_fmr_pool_unmap(kib_fmr_t *fmr, int status)
+{
+	LIST_HEAD(zombies);
+	kib_fmr_pool_t	  *owner = fmr->fmr_pool;
+	kib_fmr_poolset_t *fps = owner->fpo_owner;
+	cfs_time_t	   now = cfs_time_current();
+	kib_fmr_pool_t	  *scan;
+	kib_fmr_pool_t	  *next;
+	int		   rc;
+
+	rc = ib_fmr_pool_unmap(fmr->fmr_pfmr);
+	LASSERT(rc == 0);
+
+	if (status != 0) {
+		rc = ib_flush_fmr_pool(owner->fpo_fmr_pool);
+		LASSERT(rc == 0);
+	}
+
+	fmr->fmr_pool = NULL;
+	fmr->fmr_pfmr = NULL;
+
+	spin_lock(&fps->fps_lock);
+	owner->fpo_map_count--;	/* decref the pool */
+
+	list_for_each_entry_safe(scan, next, &fps->fps_pool_list, fpo_list) {
+		/* the first pool is persistent */
+		if (fps->fps_pool_list.next == &scan->fpo_list)
+			continue;
+
+		if (!kiblnd_fmr_pool_is_idle(scan, now))
+			continue;
+
+		list_move(&scan->fpo_list, &zombies);
+		fps->fps_version++;
+	}
+	spin_unlock(&fps->fps_lock);
+
+	if (!list_empty(&zombies))
+		kiblnd_destroy_fmr_pool_list(&zombies);
+}
+
+/*
+ * Map 'npages' pages (addresses in 'pages', I/O address 'iov') through
+ * one of the FMR pools of 'fps' and record the result in 'fmr'.
+ *
+ * Each pool on the list is tried in turn.  When all of them report
+ * -EAGAIN a new pool is created and the search restarts; only one
+ * thread creates a pool at a time (fps_increasing), and after a failed
+ * creation no new attempt is made before fps_next_retry.
+ *
+ * Returns 0 on success, -EAGAIN if no pool has room and a pool creation
+ * failed recently, or any other error from ib_fmr_pool_map_phys().
+ */
+int
+kiblnd_fmr_pool_map(kib_fmr_poolset_t *fps, __u64 *pages, int npages,
+		    __u64 iov, kib_fmr_t *fmr)
+{
+	struct ib_pool_fmr *pfmr;
+	kib_fmr_pool_t     *fpo;
+	__u64	       version;
+	int		 rc;
+
+ again:
+	spin_lock(&fps->fps_lock);
+	/* remember the list version so a change made while the lock is
+	 * dropped below can be detected */
+	version = fps->fps_version;
+	list_for_each_entry(fpo, &fps->fps_pool_list, fpo_list) {
+		fpo->fpo_deadline = cfs_time_shift(IBLND_POOL_DEADLINE);
+		/* count the mapping before dropping the lock; a pool with
+		 * a non-zero map count is never considered idle */
+		fpo->fpo_map_count++;
+		spin_unlock(&fps->fps_lock);
+
+		pfmr = ib_fmr_pool_map_phys(fpo->fpo_fmr_pool,
+					    pages, npages, iov);
+		if (likely(!IS_ERR(pfmr))) {
+			fmr->fmr_pool = fpo;
+			fmr->fmr_pfmr = pfmr;
+			return 0;
+		}
+
+		/* mapping failed: give back the count taken above */
+		spin_lock(&fps->fps_lock);
+		fpo->fpo_map_count--;
+		if (PTR_ERR(pfmr) != -EAGAIN) {
+			spin_unlock(&fps->fps_lock);
+			return PTR_ERR(pfmr);
+		}
+
+		/* EAGAIN and ... */
+		/* ... the list changed meanwhile: restart the walk */
+		if (version != fps->fps_version) {
+			spin_unlock(&fps->fps_lock);
+			goto again;
+		}
+	}
+
+	/* every pool returned -EAGAIN; fps_lock is still held here */
+	if (fps->fps_increasing) {
+		spin_unlock(&fps->fps_lock);
+		CDEBUG(D_NET, "Another thread is allocating new "
+		       "FMR pool, waiting for her to complete\n");
+		schedule();
+		goto again;
+
+	}
+
+	if (cfs_time_before(cfs_time_current(), fps->fps_next_retry)) {
+		/* someone failed recently */
+		spin_unlock(&fps->fps_lock);
+		return -EAGAIN;
+	}
+
+	/* this thread creates the new pool; the flag keeps others out */
+	fps->fps_increasing = 1;
+	spin_unlock(&fps->fps_lock);
+
+	CDEBUG(D_NET, "Allocate new FMR pool\n");
+	rc = kiblnd_create_fmr_pool(fps, &fpo);
+	spin_lock(&fps->fps_lock);
+	fps->fps_increasing = 0;
+	if (rc == 0) {
+		fps->fps_version++;
+		list_add_tail(&fpo->fpo_list, &fps->fps_pool_list);
+	} else {
+		/* hold off further creation attempts for a while */
+		fps->fps_next_retry = cfs_time_shift(IBLND_POOL_RETRY);
+	}
+	spin_unlock(&fps->fps_lock);
+
+	/* retry: either the new pool is usable, or the retry deadline
+	 * set above makes the next pass return -EAGAIN */
+	goto again;
+}
+
+/*
+ * Final sanity checks for a generic pool that is being torn down: its
+ * free list must be empty and nothing may be allocated from it.  Only
+ * logs a debug message otherwise.
+ */
+static void
+kiblnd_fini_pool(kib_pool_t *pool)
+{
+	LASSERT(list_empty(&pool->po_free_list));
+	LASSERT(pool->po_allocated == 0);
+
+	CDEBUG(D_NET, "Finalize %s pool\n", pool->po_owner->ps_name);
+}
+
+/*
+ * Reset generic pool 'pool' and set it up as a member of pool set 'ps'
+ * with 'size' nodes, an empty free list and a fresh idle deadline.
+ */
+static void
+kiblnd_init_pool(kib_poolset_t *ps, kib_pool_t *pool, int size)
+{
+	CDEBUG(D_NET, "Initialize %s pool\n", ps->ps_name);
+
+	memset(pool, 0, sizeof(kib_pool_t));
+
+	pool->po_owner    = ps;
+	pool->po_size     = size;
+	pool->po_deadline = cfs_time_shift(IBLND_POOL_DEADLINE);
+	INIT_LIST_HEAD(&pool->po_free_list);
+}
+
+/*
+ * Destroy every pool linked on 'head' through the ps_pool_destroy
+ * callback of the pool set owning it, leaving the list empty.
+ */
+void
+kiblnd_destroy_pool_list(struct list_head *head)
+{
+	kib_pool_t *victim;
+
+	for (;;) {
+		if (list_empty(head))
+			break;
+
+		/* always take the first remaining pool */
+		victim = list_entry(head->next, kib_pool_t, po_list);
+		list_del(&victim->po_list);
+
+		LASSERT(victim->po_owner != NULL);
+		victim->po_owner->ps_pool_destroy(victim);
+	}
+}
+
+/*
+ * Mark every pool of 'ps' as failed and take it off the active list.
+ * Pools with nothing allocated are moved to 'zombies'; pools still in
+ * use are parked on the set's failed list.  A pool set that was never
+ * initialized (ps_net == NULL) is ignored.
+ */
+static void
+kiblnd_fail_poolset(kib_poolset_t *ps, struct list_head *zombies)
+{
+	kib_pool_t	 *pool;
+	struct list_head *dest;
+
+	if (ps->ps_net == NULL) /* initialized? */
+		return;
+
+	spin_lock(&ps->ps_lock);
+	while (!list_empty(&ps->ps_pool_list)) {
+		pool = list_entry(ps->ps_pool_list.next,
+				  kib_pool_t, po_list);
+		pool->po_failed = 1;
+
+		/* idle pools can be destroyed right away by the caller */
+		dest = (pool->po_allocated == 0) ?
+		       zombies : &ps->ps_failed_pool_list;
+		list_move(&pool->po_list, dest);
+	}
+	spin_unlock(&ps->ps_lock);
+}
+
+/*
+ * Destroy all pools of 'ps', failed ones first and then the active
+ * ones.  A pool set that was never initialized (ps_net == NULL) is
+ * ignored.
+ */
+static void
+kiblnd_fini_poolset(kib_poolset_t *ps)
+{
+	if (ps->ps_net == NULL) /* initialized? */
+		return;
+
+	kiblnd_destroy_pool_list(&ps->ps_failed_pool_list);
+	kiblnd_destroy_pool_list(&ps->ps_pool_list);
+}
+
+/*
+ * Initialize generic pool set 'ps' for partition 'cpt' of 'net' and
+ * create its first pool of 'size' nodes through 'po_create'.
+ *
+ * Returns 0 on success, -E2BIG if 'name' does not fit into ps_name, or
+ * the error returned by 'po_create'.
+ *
+ * The lock and both pool lists are set up before anything can fail:
+ * kiblnd_fini_poolset() and kiblnd_fail_poolset() treat a non-NULL
+ * ps_net as "initialized" and walk these lists, so they must be valid
+ * (empty) list heads on every return path that leaves ps_net set.
+ */
+static int
+kiblnd_init_poolset(kib_poolset_t *ps, int cpt,
+		    kib_net_t *net, char *name, int size,
+		    kib_ps_pool_create_t po_create,
+		    kib_ps_pool_destroy_t po_destroy,
+		    kib_ps_node_init_t nd_init,
+		    kib_ps_node_fini_t nd_fini)
+{
+	kib_pool_t	*pool;
+	int		rc;
+
+	memset(ps, 0, sizeof(kib_poolset_t));
+
+	spin_lock_init(&ps->ps_lock);
+	INIT_LIST_HEAD(&ps->ps_pool_list);
+	INIT_LIST_HEAD(&ps->ps_failed_pool_list);
+
+	ps->ps_cpt	    = cpt;
+	ps->ps_net	  = net;
+	ps->ps_pool_create  = po_create;
+	ps->ps_pool_destroy = po_destroy;
+	ps->ps_node_init    = nd_init;
+	ps->ps_node_fini    = nd_fini;
+	ps->ps_pool_size    = size;
+
+	/* strlcpy() returns the length of 'name'; anything that does not
+	 * fit including the terminating NUL was truncated */
+	if (strlcpy(ps->ps_name, name, sizeof(ps->ps_name))
+	    >= sizeof(ps->ps_name))
+		return -E2BIG;
+
+	rc = ps->ps_pool_create(ps, size, &pool);
+	if (rc == 0)
+		list_add(&pool->po_list, &ps->ps_pool_list);
+	else
+		CERROR("Failed to create the first pool for %s\n", ps->ps_name);
+
+	return rc;
+}
+
+/*
+ * Tell whether @pool may be reclaimed at time @now: it must have no
+ * outstanding nodes, and either be marked failed or be past its deadline.
+ */
+static int
+kiblnd_pool_is_idle(kib_pool_t *pool, cfs_time_t now)
+{
+	if (pool->po_allocated != 0) /* nodes still handed out */
+		return 0;
+
+	return pool->po_failed ? 1 : cfs_time_aftereq(now, pool->po_deadline);
+}
+
+/*
+ * Give @node back to @pool, then reap every idle pool of the owning
+ * poolset except the first one on ps_pool_list.  The reaped pools are
+ * destroyed after ps_lock has been dropped.
+ */
+void
+kiblnd_pool_free_node(kib_pool_t *pool, struct list_head *node)
+{
+	kib_poolset_t	*ps  = pool->po_owner;
+	cfs_time_t	 now = cfs_time_current();
+	kib_pool_t	*cur;
+	kib_pool_t	*nxt;
+	LIST_HEAD(zombies);
+
+	spin_lock(&ps->ps_lock);
+
+	if (ps->ps_node_fini != NULL)
+		ps->ps_node_fini(pool, node);
+
+	LASSERT(pool->po_allocated > 0);
+	list_add(node, &pool->po_free_list);
+	pool->po_allocated--;
+
+	list_for_each_entry_safe(cur, nxt, &ps->ps_pool_list, po_list) {
+		/* the pool at the head of the list is persistent */
+		if (&cur->po_list == ps->ps_pool_list.next)
+			continue;
+
+		if (!kiblnd_pool_is_idle(cur, now))
+			continue;
+
+		list_move(&cur->po_list, &zombies);
+	}
+	spin_unlock(&ps->ps_lock);
+
+	if (list_empty(&zombies))
+		return;
+
+	kiblnd_destroy_pool_list(&zombies);
+}
+
+/*
+ * Take one free node from poolset @ps.
+ *
+ * Scans the pools on ps_pool_list for a non-empty free list.  If none has
+ * a free node, one caller at a time creates an additional pool (guarded
+ * by ps_increasing) while other callers yield and retry.
+ *
+ * Returns the node, or NULL when every pool is exhausted and the current
+ * time is still before ps_next_retry (set after a failed pool creation).
+ */
+struct list_head *
+kiblnd_pool_alloc_node(kib_poolset_t *ps)
+{
+	struct list_head	    *node;
+	kib_pool_t	    *pool;
+	int		    rc;
+
+ again:
+	spin_lock(&ps->ps_lock);
+	list_for_each_entry(pool, &ps->ps_pool_list, po_list) {
+		if (list_empty(&pool->po_free_list))
+			continue;
+
+		/* using a pool pushes its idle deadline forward */
+		pool->po_allocated ++;
+		pool->po_deadline = cfs_time_shift(IBLND_POOL_DEADLINE);
+		node = pool->po_free_list.next;
+		list_del(node);
+
+		if (ps->ps_node_init != NULL) {
+			/* still hold the lock */
+			ps->ps_node_init(pool, node);
+		}
+		spin_unlock(&ps->ps_lock);
+		return node;
+	}
+
+	/* no available tx pool and ... */
+	if (ps->ps_increasing) {
+		/* another thread is allocating a new pool */
+		spin_unlock(&ps->ps_lock);
+		CDEBUG(D_NET, "Another thread is allocating new "
+		       "%s pool, waiting for her to complete\n",
+		       ps->ps_name);
+		schedule();
+		goto again;
+	}
+
+	if (cfs_time_before(cfs_time_current(), ps->ps_next_retry)) {
+		/* someone failed recently */
+		spin_unlock(&ps->ps_lock);
+		return NULL;
+	}
+
+	/* claim the right to grow the poolset, then create the new pool
+	 * with ps_lock dropped */
+	ps->ps_increasing = 1;
+	spin_unlock(&ps->ps_lock);
+
+	CDEBUG(D_NET, "%s pool exhausted, allocate new pool\n", ps->ps_name);
+
+	rc = ps->ps_pool_create(ps, ps->ps_pool_size, &pool);
+
+	spin_lock(&ps->ps_lock);
+	ps->ps_increasing = 0;
+	if (rc == 0) {
+		list_add_tail(&pool->po_list, &ps->ps_pool_list);
+	} else {
+		/* hold off further creation attempts until ps_next_retry */
+		ps->ps_next_retry = cfs_time_shift(IBLND_POOL_RETRY);
+		CERROR("Can't allocate new %s pool because out of memory\n",
+		       ps->ps_name);
+	}
+	spin_unlock(&ps->ps_lock);
+
+	/* rescan: picks up the new pool, or returns NULL via the retry check */
+	goto again;
+}
+
+/*
+ * Release a physical-MR descriptor: clear its MR pointer, return the
+ * descriptor to its pool, then deregister the MR it was holding (if any).
+ */
+void
+kiblnd_pmr_pool_unmap(kib_phys_mr_t *pmr)
+{
+	kib_pool_t	*pool  = &pmr->pmr_pool->ppo_pool;
+	struct ib_mr	*stale = pmr->pmr_mr;
+
+	pmr->pmr_mr = NULL;
+	kiblnd_pool_free_node(pool, &pmr->pmr_list);
+
+	if (stale == NULL)
+		return;
+
+	ib_dereg_mr(stale);
+}
+
+/*
+ * Register the fragments described by @rd as one physical MR on @hdev,
+ * using a descriptor taken from poolset @pps.
+ *
+ * On success stores the descriptor in *@pp_pmr and returns 0.  Returns
+ * -ENOMEM if no descriptor is available, -EAGAIN if the descriptor's
+ * pool belongs to a different HCA device, or the ib_reg_phys_mr() error.
+ */
+int
+kiblnd_pmr_pool_map(kib_pmr_poolset_t *pps, kib_hca_dev_t *hdev,
+		    kib_rdma_desc_t *rd, __u64 *iova, kib_phys_mr_t **pp_pmr)
+{
+	struct list_head	*node;
+	kib_phys_mr_t		*pmr;
+	int			 frag;
+	int			 rc;
+
+	node = kiblnd_pool_alloc_node(&pps->pps_poolset);
+	if (node == NULL) {
+		CERROR("Failed to allocate PMR descriptor\n");
+		return -ENOMEM;
+	}
+
+	pmr = container_of(node, kib_phys_mr_t, pmr_list);
+	if (pmr->pmr_pool->ppo_hdev != hdev) {
+		/* descriptor belongs to another device: give it back */
+		kiblnd_pool_free_node(&pmr->pmr_pool->ppo_pool, node);
+		return -EAGAIN;
+	}
+
+	for (frag = 0; frag < rd->rd_nfrags; frag++) {
+		pmr->pmr_ipb[frag].addr = rd->rd_frags[frag].rf_addr;
+		pmr->pmr_ipb[frag].size = rd->rd_frags[frag].rf_nob;
+	}
+
+	pmr->pmr_mr = ib_reg_phys_mr(hdev->ibh_pd,
+				     pmr->pmr_ipb, rd->rd_nfrags,
+				     IB_ACCESS_LOCAL_WRITE |
+				     IB_ACCESS_REMOTE_WRITE,
+				     iova);
+	if (IS_ERR(pmr->pmr_mr)) {
+		rc = PTR_ERR(pmr->pmr_mr);
+		CERROR("Failed ib_reg_phys_mr: %d\n", rc);
+
+		pmr->pmr_mr = NULL;
+		kiblnd_pool_free_node(&pmr->pmr_pool->ppo_pool, node);
+		return rc;
+	}
+
+	pmr->pmr_iova = *iova;
+	*pp_pmr = pmr;
+	return 0;
+}
+
+/*
+ * Free a PMR pool: every descriptor on its free list (with its
+ * physical-buffer array), the reference on the pool's HCA device if one
+ * was taken, and the pool itself.  The pool must have no outstanding
+ * descriptors.
+ */
+static void
+kiblnd_destroy_pmr_pool(kib_pool_t *pool)
+{
+	kib_pmr_pool_t	*ppo = container_of(pool, kib_pmr_pool_t, ppo_pool);
+	struct list_head *free_list = &pool->po_free_list;
+	kib_phys_mr_t	*pmr;
+
+	LASSERT(pool->po_allocated == 0);
+
+	while (!list_empty(free_list)) {
+		pmr = list_entry(free_list->next, kib_phys_mr_t, pmr_list);
+
+		LASSERT(pmr->pmr_mr == NULL);
+		list_del(&pmr->pmr_list);
+
+		if (pmr->pmr_ipb != NULL) {
+			LIBCFS_FREE(pmr->pmr_ipb,
+				    IBLND_MAX_RDMA_FRAGS *
+				    sizeof(struct ib_phys_buf));
+		}
+
+		LIBCFS_FREE(pmr, sizeof(kib_phys_mr_t));
+	}
+
+	kiblnd_fini_pool(pool);
+
+	if (ppo->ppo_hdev != NULL)
+		kiblnd_hdev_decref(ppo->ppo_hdev);
+
+	LIBCFS_FREE(ppo, sizeof(kib_pmr_pool_t));
+}
+
+/*
+ * Per-CPT PMR pool size: the kib_pmr_pool_size tunable split across
+ * @ncpts partitions, but never below IBLND_PMR_POOL.
+ */
+static inline int kiblnd_pmr_pool_size(int ncpts)
+{
+	int per_cpt;
+
+	per_cpt = *kiblnd_tunables.kib_pmr_pool_size / ncpts;
+	return max(IBLND_PMR_POOL, per_cpt);
+}
+
+/*
+ * Create a PMR pool of @size descriptors for poolset @ps.
+ *
+ * Each descriptor gets an array of IBLND_MAX_RDMA_FRAGS physical-buffer
+ * entries and is linked onto the pool's free list.  On success the pool
+ * takes the current HCA device of the net's device and is returned in
+ * *@pp_po.
+ *
+ * Returns 0 on success, -ENOMEM if any allocation fails (everything
+ * allocated so far is released).
+ */
+static int
+kiblnd_create_pmr_pool(kib_poolset_t *ps, int size, kib_pool_t **pp_po)
+{
+	struct kib_pmr_pool	*ppo;
+	struct kib_pool		*pool;
+	kib_phys_mr_t		*pmr;
+	int			i;
+
+	LIBCFS_CPT_ALLOC(ppo, lnet_cpt_table(),
+			 ps->ps_cpt, sizeof(kib_pmr_pool_t));
+	if (ppo == NULL) {
+		CERROR("Failed to allocate PMR pool\n");
+		return -ENOMEM;
+	}
+
+	pool = &ppo->ppo_pool;
+	kiblnd_init_pool(ps, pool, size);
+
+	for (i = 0; i < size; i++) {
+		LIBCFS_CPT_ALLOC(pmr, lnet_cpt_table(),
+				 ps->ps_cpt, sizeof(kib_phys_mr_t));
+		if (pmr == NULL)
+			break;
+
+		pmr->pmr_pool = ppo;
+		LIBCFS_CPT_ALLOC(pmr->pmr_ipb, lnet_cpt_table(), ps->ps_cpt,
+				 IBLND_MAX_RDMA_FRAGS * sizeof(*pmr->pmr_ipb));
+		if (pmr->pmr_ipb == NULL) {
+			/* pmr is not on the free list yet, so the pool
+			 * destructor below would never find it: free it
+			 * here instead of leaking it */
+			LIBCFS_FREE(pmr, sizeof(kib_phys_mr_t));
+			break;
+		}
+
+		list_add(&pmr->pmr_list, &pool->po_free_list);
+	}
+
+	if (i < size) {
+		/* releases every descriptor already on the free list */
+		ps->ps_pool_destroy(pool);
+		return -ENOMEM;
+	}
+
+	ppo->ppo_hdev = kiblnd_current_hdev(ps->ps_net->ibn_dev);
+	*pp_po = pool;
+	return 0;
+}
+
+/*
+ * Free a TX pool: unmap and free its message pages, release the
+ * per-descriptor arrays and the descriptor table, then free the pool
+ * itself.  The pool must have no outstanding descriptors.
+ */
+static void
+kiblnd_destroy_tx_pool(kib_pool_t *pool)
+{
+	kib_tx_pool_t	*tpo = container_of(pool, kib_tx_pool_t, tpo_pool);
+	kib_tx_t	*tx;
+	int		 idx;
+
+	LASSERT(pool->po_allocated == 0);
+
+	if (tpo->tpo_tx_pages != NULL) {
+		kiblnd_unmap_tx_pool(tpo);
+		kiblnd_free_pages(tpo->tpo_tx_pages);
+	}
+
+	if (tpo->tpo_tx_descs != NULL) {
+		for (idx = 0; idx < pool->po_size; idx++) {
+			tx = &tpo->tpo_tx_descs[idx];
+
+			list_del(&tx->tx_list);
+
+			if (tx->tx_pages != NULL) {
+				LIBCFS_FREE(tx->tx_pages,
+					    LNET_MAX_IOV *
+					    sizeof(*tx->tx_pages));
+			}
+			if (tx->tx_frags != NULL) {
+				LIBCFS_FREE(tx->tx_frags,
+					    IBLND_MAX_RDMA_FRAGS *
+					    sizeof(*tx->tx_frags));
+			}
+			if (tx->tx_wrq != NULL) {
+				LIBCFS_FREE(tx->tx_wrq,
+					    (1 + IBLND_MAX_RDMA_FRAGS) *
+					    sizeof(*tx->tx_wrq));
+			}
+			if (tx->tx_sge != NULL) {
+				LIBCFS_FREE(tx->tx_sge,
+					    (1 + IBLND_MAX_RDMA_FRAGS) *
+					    sizeof(*tx->tx_sge));
+			}
+			if (tx->tx_rd != NULL) {
+				LIBCFS_FREE(tx->tx_rd,
+					    offsetof(kib_rdma_desc_t,
+						     rd_frags[IBLND_MAX_RDMA_FRAGS]));
+			}
+		}
+
+		LIBCFS_FREE(tpo->tpo_tx_descs,
+			    pool->po_size * sizeof(kib_tx_t));
+	}
+
+	kiblnd_fini_pool(pool);
+	LIBCFS_FREE(tpo, sizeof(kib_tx_pool_t));
+}
+
+/*
+ * Per-CPT TX pool size: the kib_ntx tunable split across @ncpts
+ * partitions, but never below IBLND_TX_POOL.
+ */
+static int kiblnd_tx_pool_size(int ncpts)
+{
+	int per_cpt;
+
+	per_cpt = *kiblnd_tunables.kib_ntx / ncpts;
+	return max(IBLND_TX_POOL, per_cpt);
+}
+
+/*
+ * Create a TX pool of @size descriptors for poolset @ps.
+ *
+ * Allocates the message pages, the descriptor table and each descriptor's
+ * work-request, SGE, fragment and RDMA-descriptor arrays (plus a page
+ * array when the net uses FMR), then maps the pool with
+ * kiblnd_map_tx_pool() and returns it in *@pp_po.
+ *
+ * Returns 0 on success, -ENOMEM if any allocation fails (everything
+ * allocated so far is released).
+ */
+static int
+kiblnd_create_tx_pool(kib_poolset_t *ps, int size, kib_pool_t **pp_po)
+{
+	int	    i;
+	int	    npg;
+	kib_pool_t    *pool;
+	kib_tx_pool_t *tpo;
+
+	LIBCFS_CPT_ALLOC(tpo, lnet_cpt_table(), ps->ps_cpt, sizeof(*tpo));
+	if (tpo == NULL) {
+		CERROR("Failed to allocate TX pool\n");
+		return -ENOMEM;
+	}
+
+	pool = &tpo->tpo_pool;
+	kiblnd_init_pool(ps, pool, size);
+	tpo->tpo_tx_descs = NULL;
+	tpo->tpo_tx_pages = NULL;
+
+	npg = (size * IBLND_MSG_SIZE + PAGE_SIZE - 1) / PAGE_SIZE;
+	if (kiblnd_alloc_pages(&tpo->tpo_tx_pages, ps->ps_cpt, npg) != 0) {
+		CERROR("Can't allocate tx pages: %d\n", npg);
+		LIBCFS_FREE(tpo, sizeof(kib_tx_pool_t));
+		return -ENOMEM;
+	}
+
+	LIBCFS_CPT_ALLOC(tpo->tpo_tx_descs, lnet_cpt_table(), ps->ps_cpt,
+			 size * sizeof(kib_tx_t));
+	if (tpo->tpo_tx_descs == NULL) {
+		CERROR("Can't allocate %d tx descriptors\n", size);
+		ps->ps_pool_destroy(pool);
+		return -ENOMEM;
+	}
+
+	memset(tpo->tpo_tx_descs, 0, size * sizeof(kib_tx_t));
+
+	/* The pool destructor does list_del() on every descriptor's tx_list.
+	 * Self-link them all now so that teardown on a failure path below
+	 * does not dereference the zeroed list pointers.
+	 * NOTE(review): assumes kiblnd_map_tx_pool() links descriptors with
+	 * a plain list_add(), which does not care about prior node state. */
+	for (i = 0; i < size; i++)
+		INIT_LIST_HEAD(&tpo->tpo_tx_descs[i].tx_list);
+
+	for (i = 0; i < size; i++) {
+		kib_tx_t *tx = &tpo->tpo_tx_descs[i];
+
+		tx->tx_pool = tpo;
+		if (ps->ps_net->ibn_fmr_ps != NULL) {
+			LIBCFS_CPT_ALLOC(tx->tx_pages,
+					 lnet_cpt_table(), ps->ps_cpt,
+					 LNET_MAX_IOV * sizeof(*tx->tx_pages));
+			if (tx->tx_pages == NULL)
+				break;
+		}
+
+		LIBCFS_CPT_ALLOC(tx->tx_frags, lnet_cpt_table(), ps->ps_cpt,
+				 IBLND_MAX_RDMA_FRAGS * sizeof(*tx->tx_frags));
+		if (tx->tx_frags == NULL)
+			break;
+
+		sg_init_table(tx->tx_frags, IBLND_MAX_RDMA_FRAGS);
+
+		LIBCFS_CPT_ALLOC(tx->tx_wrq, lnet_cpt_table(), ps->ps_cpt,
+				 (1 + IBLND_MAX_RDMA_FRAGS) *
+				 sizeof(*tx->tx_wrq));
+		if (tx->tx_wrq == NULL)
+			break;
+
+		LIBCFS_CPT_ALLOC(tx->tx_sge, lnet_cpt_table(), ps->ps_cpt,
+				 (1 + IBLND_MAX_RDMA_FRAGS) *
+				 sizeof(*tx->tx_sge));
+		if (tx->tx_sge == NULL)
+			break;
+
+		LIBCFS_CPT_ALLOC(tx->tx_rd, lnet_cpt_table(), ps->ps_cpt,
+				 offsetof(kib_rdma_desc_t,
+					  rd_frags[IBLND_MAX_RDMA_FRAGS]));
+		if (tx->tx_rd == NULL)
+			break;
+	}
+
+	if (i == size) {
+		kiblnd_map_tx_pool(tpo);
+		*pp_po = pool;
+		return 0;
+	}
+
+	/* a per-descriptor allocation failed: tear down the partial pool */
+	ps->ps_pool_destroy(pool);
+	return -ENOMEM;
+}
+
+/*
+ * Per-node init hook for TX pools: stamp the descriptor behind @node
+ * with the poolset's next cookie value and advance the counter.
+ */
+static void
+kiblnd_tx_init(kib_pool_t *pool, struct list_head *node)
+{
+	kib_tx_t	 *tx = list_entry(node, kib_tx_t, tx_list);
+	kib_tx_poolset_t *tps;
+
+	tps = container_of(pool->po_owner, kib_tx_poolset_t, tps_poolset);
+
+	tx->tx_cookie = tps->tps_next_tx_cookie;
+	tps->tps_next_tx_cookie++;
+}
+
+/*
+ * Tear down the TX, FMR and PMR poolsets of @net on every CPT, then free
+ * the per-CPT poolset arrays and clear the pointers to them.
+ */
+void
+kiblnd_net_fini_pools(kib_net_t *net)
+{
+	int	cpt;
+
+	cfs_cpt_for_each(cpt, lnet_cpt_table()) {
+		if (net->ibn_tx_ps != NULL)
+			kiblnd_fini_poolset(&net->ibn_tx_ps[cpt]->tps_poolset);
+
+		if (net->ibn_fmr_ps != NULL)
+			kiblnd_fini_fmr_poolset(net->ibn_fmr_ps[cpt]);
+
+		if (net->ibn_pmr_ps != NULL)
+			kiblnd_fini_poolset(&net->ibn_pmr_ps[cpt]->pps_poolset);
+	}
+
+	if (net->ibn_tx_ps != NULL) {
+		cfs_percpt_free(net->ibn_tx_ps);
+		net->ibn_tx_ps = NULL;
+	}
+
+	if (net->ibn_fmr_ps != NULL) {
+		cfs_percpt_free(net->ibn_fmr_ps);
+		net->ibn_fmr_ps = NULL;
+	}
+
+	if (net->ibn_pmr_ps != NULL) {
+		cfs_percpt_free(net->ibn_pmr_ps);
+		net->ibn_pmr_ps = NULL;
+	}
+}
+
+/*
+ * Create the memory-registration and TX poolsets of @net.
+ *
+ * @cpts lists the @ncpts CPTs to initialize; when @cpts is NULL, CPTs
+ * 0..@ncpts-1 are used.  If map-on-demand is disabled and the HCA device
+ * has exactly one MR, only TX poolsets are created.  Otherwise FMR
+ * poolsets are tried first, with PMR poolsets as the fallback when the
+ * first FMR poolset reports -ENOSYS.
+ *
+ * Returns 0 on success; on failure all poolsets of @net are torn down
+ * and a negative errno is returned.
+ */
+int
+kiblnd_net_init_pools(kib_net_t *net, __u32 *cpts, int ncpts)
+{
+	unsigned long	flags;
+	int		cpt;
+	int		rc;
+	int		i;
+
+	read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+	if (*kiblnd_tunables.kib_map_on_demand == 0 &&
+	    net->ibn_dev->ibd_hdev->ibh_nmrs == 1) {
+		read_unlock_irqrestore(&kiblnd_data.kib_global_lock,
+					   flags);
+		/* no FMR/PMR poolsets in this configuration */
+		goto create_tx_pool;
+	}
+
+	read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+	if (*kiblnd_tunables.kib_fmr_pool_size <
+	    *kiblnd_tunables.kib_ntx / 4) {
+		CERROR("Can't set fmr pool size (%d) < ntx / 4(%d)\n",
+		       *kiblnd_tunables.kib_fmr_pool_size,
+		       *kiblnd_tunables.kib_ntx / 4);
+		rc = -EINVAL;
+		goto failed;
+	}
+
+	/* TX pool must be created later than FMR/PMR, see LU-2268
+	 * for details */
+	LASSERT(net->ibn_tx_ps == NULL);
+
+	/* premapping can fail if ibd_nmr > 1, so we always create
+	 * FMR/PMR pool and map-on-demand if premapping failed */
+
+	net->ibn_fmr_ps = cfs_percpt_alloc(lnet_cpt_table(),
+					   sizeof(kib_fmr_poolset_t));
+	if (net->ibn_fmr_ps == NULL) {
+		CERROR("Failed to allocate FMR pool array\n");
+		rc = -ENOMEM;
+		goto failed;
+	}
+
+	for (i = 0; i < ncpts; i++) {
+		cpt = (cpts == NULL) ? i : cpts[i];
+		rc = kiblnd_init_fmr_poolset(net->ibn_fmr_ps[cpt], cpt, net,
+					     kiblnd_fmr_pool_size(ncpts),
+					     kiblnd_fmr_flush_trigger(ncpts));
+		if (rc == -ENOSYS && i == 0) /* no FMR */
+			break; /* create PMR pool */
+
+		if (rc != 0) { /* a real error */
+			CERROR("Can't initialize FMR pool for CPT %d: %d\n",
+			       cpt, rc);
+			goto failed;
+		}
+	}
+
+	/* i > 0 means the loop was not cut short on the first CPT, i.e.
+	 * every FMR poolset was created */
+	if (i > 0) {
+		LASSERT(i == ncpts);
+		goto create_tx_pool;
+	}
+
+	/* FMR unavailable: drop the FMR array and fall back to PMR */
+	cfs_percpt_free(net->ibn_fmr_ps);
+	net->ibn_fmr_ps = NULL;
+
+	CWARN("Device does not support FMR, failing back to PMR\n");
+
+	if (*kiblnd_tunables.kib_pmr_pool_size <
+	    *kiblnd_tunables.kib_ntx / 4) {
+		CERROR("Can't set pmr pool size (%d) < ntx / 4(%d)\n",
+		       *kiblnd_tunables.kib_pmr_pool_size,
+		       *kiblnd_tunables.kib_ntx / 4);
+		rc = -EINVAL;
+		goto failed;
+	}
+
+	net->ibn_pmr_ps = cfs_percpt_alloc(lnet_cpt_table(),
+					   sizeof(kib_pmr_poolset_t));
+	if (net->ibn_pmr_ps == NULL) {
+		CERROR("Failed to allocate PMR pool array\n");
+		rc = -ENOMEM;
+		goto failed;
+	}
+
+	for (i = 0; i < ncpts; i++) {
+		cpt = (cpts == NULL) ? i : cpts[i];
+		rc = kiblnd_init_poolset(&net->ibn_pmr_ps[cpt]->pps_poolset,
+					 cpt, net, "PMR",
+					 kiblnd_pmr_pool_size(ncpts),
+					 kiblnd_create_pmr_pool,
+					 kiblnd_destroy_pmr_pool, NULL, NULL);
+		if (rc != 0) {
+			CERROR("Can't initialize PMR pool for CPT %d: %d\n",
+			       cpt, rc);
+			goto failed;
+		}
+	}
+
+ create_tx_pool:
+	net->ibn_tx_ps = cfs_percpt_alloc(lnet_cpt_table(),
+					  sizeof(kib_tx_poolset_t));
+	if (net->ibn_tx_ps == NULL) {
+		CERROR("Failed to allocate tx pool array\n");
+		rc = -ENOMEM;
+		goto failed;
+	}
+
+	for (i = 0; i < ncpts; i++) {
+		cpt = (cpts == NULL) ? i : cpts[i];
+		rc = kiblnd_init_poolset(&net->ibn_tx_ps[cpt]->tps_poolset,
+					 cpt, net, "TX",
+					 kiblnd_tx_pool_size(ncpts),
+					 kiblnd_create_tx_pool,
+					 kiblnd_destroy_tx_pool,
+					 kiblnd_tx_init, NULL);
+		if (rc != 0) {
+			CERROR("Can't initialize TX pool for CPT %d: %d\n",
+			       cpt, rc);
+			goto failed;
+		}
+	}
+
+	return 0;
+ failed:
+	/* releases whichever poolset arrays were set up above */
+	kiblnd_net_fini_pools(net);
+	LASSERT(rc != 0);
+	return rc;
+}
+
+/*
+ * Fill in the page geometry of @hdev from the native page size and derive
+ * ibh_mr_size/ibh_mr_shift from the device's reported max_mr_size.
+ *
+ * Returns 0 on success, -ENOMEM if the attribute buffer cannot be
+ * allocated, the ib_query_device() error, or -EINVAL when max_mr_size is
+ * neither 2^n nor 2^n - 1.
+ */
+static int
+kiblnd_hdev_get_attr(kib_hca_dev_t *hdev)
+{
+	struct ib_device_attr	*dev_attr;
+	int			 rc;
+
+	/* It's safe to assume a HCA can handle a page size
+	 * matching that of the native system */
+	hdev->ibh_page_shift = PAGE_SHIFT;
+	hdev->ibh_page_size  = 1 << PAGE_SHIFT;
+	hdev->ibh_page_mask  = ~((__u64)hdev->ibh_page_size - 1);
+
+	LIBCFS_ALLOC(dev_attr, sizeof(*dev_attr));
+	if (dev_attr == NULL) {
+		CERROR("Out of memory\n");
+		return -ENOMEM;
+	}
+
+	rc = ib_query_device(hdev->ibh_ibdev, dev_attr);
+	if (rc == 0)
+		hdev->ibh_mr_size = dev_attr->max_mr_size;
+
+	LIBCFS_FREE(dev_attr, sizeof(*dev_attr));
+
+	if (rc != 0) {
+		CERROR("Failed to query IB device: %d\n", rc);
+		return rc;
+	}
+
+	if (hdev->ibh_mr_size == ~0ULL) {
+		/* one MR can span the whole address space */
+		hdev->ibh_mr_shift = 64;
+		return 0;
+	}
+
+	hdev->ibh_mr_shift = 0;
+	while (hdev->ibh_mr_shift < 64) {
+		if (hdev->ibh_mr_size == (1ULL << hdev->ibh_mr_shift) ||
+		    hdev->ibh_mr_size == (1ULL << hdev->ibh_mr_shift) - 1)
+			return 0;
+
+		hdev->ibh_mr_shift++;
+	}
+
+	CERROR("Invalid mr size: "LPX64"\n", hdev->ibh_mr_size);
+	return -EINVAL;
+}
+
+/*
+ * Deregister the global MRs of @hdev, stopping at the first empty slot,
+ * then free the MR table and reset ibh_mrs/ibh_nmrs.
+ */
+void
+kiblnd_hdev_cleanup_mrs(kib_hca_dev_t *hdev)
+{
+	int	slot;
+
+	if (hdev->ibh_nmrs == 0 || hdev->ibh_mrs == NULL)
+		return;
+
+	for (slot = 0;
+	     slot < hdev->ibh_nmrs && hdev->ibh_mrs[slot] != NULL;
+	     slot++)
+		ib_dereg_mr(hdev->ibh_mrs[slot]);
+
+	LIBCFS_FREE(hdev->ibh_mrs, sizeof(*hdev->ibh_mrs) * hdev->ibh_nmrs);
+	hdev->ibh_nmrs = 0;
+	hdev->ibh_mrs  = NULL;
+}
+
+/*
+ * Release everything owned by @hdev, in order: global MRs, protection
+ * domain, listener cmid, and finally the structure itself.
+ */
+void
+kiblnd_hdev_destroy(kib_hca_dev_t *hdev)
+{
+	struct ib_pd		*pd;
+	struct rdma_cm_id	*cmid;
+
+	kiblnd_hdev_cleanup_mrs(hdev);
+
+	pd = hdev->ibh_pd;
+	if (pd != NULL)
+		ib_dealloc_pd(pd);
+
+	cmid = hdev->ibh_cmid;
+	if (cmid != NULL)
+		rdma_destroy_id(cmid);
+
+	LIBCFS_FREE(hdev, sizeof(*hdev));
+}
+
+/*
+ * Register the global memory region(s) of @hdev.
+ *
+ * When the device reports an unlimited MR size (ibh_mr_shift == 64) a
+ * single DMA MR is obtained with ib_get_dma_mr().  Otherwise an array of
+ * physical MRs of 2^ibh_mr_shift bytes each is registered to cover the
+ * range PAGE_OFFSET..high_memory.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+int
+kiblnd_hdev_setup_mrs(kib_hca_dev_t *hdev)
+{
+	struct ib_mr *mr;
+	int	   i;
+	int	   rc;
+	__u64	 mm_size;
+	__u64	 mr_size;
+	int	   acflags = IB_ACCESS_LOCAL_WRITE |
+				IB_ACCESS_REMOTE_WRITE;
+
+	rc = kiblnd_hdev_get_attr(hdev);
+	if (rc != 0)
+		return rc;
+
+	if (hdev->ibh_mr_shift == 64) {
+		LIBCFS_ALLOC(hdev->ibh_mrs, 1 * sizeof(*hdev->ibh_mrs));
+		if (hdev->ibh_mrs == NULL) {
+			CERROR("Failed to allocate MRs table\n");
+			return -ENOMEM;
+		}
+
+		/* slot stays NULL until the MR exists so that
+		 * kiblnd_hdev_cleanup_mrs() only frees the table on error */
+		hdev->ibh_mrs[0] = NULL;
+		hdev->ibh_nmrs   = 1;
+
+		mr = ib_get_dma_mr(hdev->ibh_pd, acflags);
+		if (IS_ERR(mr)) {
+			CERROR("Failed ib_get_dma_mr : %ld\n", PTR_ERR(mr));
+			kiblnd_hdev_cleanup_mrs(hdev);
+			return PTR_ERR(mr);
+		}
+
+		hdev->ibh_mrs[0] = mr;
+
+		goto out;
+	}
+
+	mr_size = (1ULL << hdev->ibh_mr_shift);
+	mm_size = (unsigned long)high_memory - PAGE_OFFSET;
+
+	/* number of MRs needed to cover mm_size, rounded up */
+	hdev->ibh_nmrs = (int)((mm_size + mr_size - 1) >> hdev->ibh_mr_shift);
+
+	if (hdev->ibh_mr_shift < 32 || hdev->ibh_nmrs > 1024) {
+		/* it's 4T..., assume we will re-code at that time */
+		CERROR("Can't support memory size: x"LPX64
+		       " with MR size: x"LPX64"\n", mm_size, mr_size);
+		return -EINVAL;
+	}
+
+	/* create an array of MRs to cover all memory */
+	LIBCFS_ALLOC(hdev->ibh_mrs, sizeof(*hdev->ibh_mrs) * hdev->ibh_nmrs);
+	if (hdev->ibh_mrs == NULL) {
+		CERROR("Failed to allocate MRs' table\n");
+		return -ENOMEM;
+	}
+
+	memset(hdev->ibh_mrs, 0, sizeof(*hdev->ibh_mrs) * hdev->ibh_nmrs);
+
+	for (i = 0; i < hdev->ibh_nmrs; i++) {
+		struct ib_phys_buf ipb;
+		__u64	      iova;
+
+		/* MR i starts at i * mr_size and uses the same value as
+		 * its I/O virtual address */
+		ipb.size = hdev->ibh_mr_size;
+		ipb.addr = i * mr_size;
+		iova     = ipb.addr;
+
+		mr = ib_reg_phys_mr(hdev->ibh_pd, &ipb, 1, acflags, &iova);
+		if (IS_ERR(mr)) {
+			CERROR("Failed ib_reg_phys_mr addr "LPX64
+			       " size "LPX64" : %ld\n",
+			       ipb.addr, ipb.size, PTR_ERR(mr));
+			/* deregisters the MRs stored so far, frees the table */
+			kiblnd_hdev_cleanup_mrs(hdev);
+			return PTR_ERR(mr);
+		}
+
+		LASSERT (iova == ipb.addr);
+
+		hdev->ibh_mrs[i] = mr;
+	}
+
+out:
+	/* stay quiet for the common single, unlimited-size MR case */
+	if (hdev->ibh_mr_size != ~0ULL || hdev->ibh_nmrs != 1)
+		LCONSOLE_INFO("Register global MR array, MR size: "
+			      LPX64", array size: %d\n",
+			      hdev->ibh_mr_size, hdev->ibh_nmrs);
+	return 0;
+}
+
+/*
+ * No-op CM event handler: ignores @cmid and @event and returns 0.  It is
+ * the callback given to the throw-away cmid created in
+ * kiblnd_dev_need_failover().
+ */
+static int
+kiblnd_dummy_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event)
+{       /* DUMMY */
+	return 0;
+}
+
+/*
+ * Decide whether @dev has to fail over to another HCA device.
+ *
+ * Returns 1 when failover is needed (device still initializing, listener
+ * gone, kib_dev_failover > 1, or the interface address now resolves to a
+ * different ib_device), 0 when the current device is still the right
+ * one, or the error from creating/resolving the probe cmid.
+ */
+static int
+kiblnd_dev_need_failover(kib_dev_t *dev)
+{
+	struct rdma_cm_id  *cmid;
+	struct sockaddr_in  srcaddr;
+	struct sockaddr_in  dstaddr;
+	int		 rc;
+
+	if (dev->ibd_hdev == NULL || /* initializing */
+	    dev->ibd_hdev->ibh_cmid == NULL || /* listener is dead */
+	    *kiblnd_tunables.kib_dev_failover > 1) /* debugging */
+		return 1;
+
+	/* XXX: it's UGLY, but I don't have better way to find
+	 * ib-bonding HCA failover because:
+	 *
+	 * a. no reliable CM event for HCA failover...
+	 * b. no OFED API to get ib_device for current net_device...
+	 *
+	 * We have only two choices at this point:
+	 *
+	 * a. rdma_bind_addr(), it will conflict with listener cmid
+	 * b. rdma_resolve_addr() to zero addr */
+	cmid = kiblnd_rdma_create_id(kiblnd_dummy_callback, dev, RDMA_PS_TCP,
+				     IB_QPT_RC);
+	if (IS_ERR(cmid)) {
+		rc = PTR_ERR(cmid);
+		CERROR("Failed to create cmid for failover: %d\n", rc);
+		return rc;
+	}
+
+	memset(&srcaddr, 0, sizeof(srcaddr));
+	srcaddr.sin_family      = AF_INET;
+	srcaddr.sin_addr.s_addr = (__force u32)htonl(dev->ibd_ifip);
+
+	memset(&dstaddr, 0, sizeof(dstaddr));
+	dstaddr.sin_family = AF_INET;
+	rc = rdma_resolve_addr(cmid, (struct sockaddr *)&srcaddr,
+			       (struct sockaddr *)&dstaddr, 1);
+	if (rc != 0 || cmid->device == NULL) {
+		CERROR("Failed to bind %s:%u.%u.%u.%u to device(%p): %d\n",
+		       dev->ibd_ifname, HIPQUAD(dev->ibd_ifip),
+		       cmid->device, rc);
+		rdma_destroy_id(cmid);
+		return rc;
+	}
+
+	/* failover is needed iff the address now sits on another HCA */
+	rc = (dev->ibd_hdev->ibh_ibdev != cmid->device);
+
+	/* the probe cmid is only needed for the comparison above; destroy
+	 * it on every path so it is not leaked when failover is needed */
+	rdma_destroy_id(cmid);
+
+	return rc;
+}
+
+/*
+ * (Re)bind @dev to the HCA device currently behind its IP interface.
+ *
+ * Used both for first-time initialization (dev->ibd_hdev == NULL) and
+ * for failover.  When kiblnd_dev_need_failover() says a switch is needed
+ * this closes the old listener, creates and binds a new cmid, builds a
+ * new kib_hca_dev_t (PD, listener, global MRs), installs it in @dev and
+ * fails all existing poolsets of the nets on @dev so they get rebuilt.
+ *
+ * Returns 0 on success or when no failover is needed, a negative errno
+ * otherwise.  dev->ibd_failed_failover counts consecutive failures.
+ */
+int
+kiblnd_dev_failover(kib_dev_t *dev)
+{
+	LIST_HEAD      (zombie_tpo);
+	LIST_HEAD      (zombie_ppo);
+	LIST_HEAD      (zombie_fpo);
+	struct rdma_cm_id  *cmid  = NULL;
+	kib_hca_dev_t      *hdev  = NULL;
+	kib_hca_dev_t      *old;
+	struct ib_pd       *pd;
+	kib_net_t	  *net;
+	struct sockaddr_in  addr;
+	unsigned long       flags;
+	int		 rc = 0;
+	int		    i;
+
+	LASSERT (*kiblnd_tunables.kib_dev_failover > 1 ||
+		 dev->ibd_can_failover ||
+		 dev->ibd_hdev == NULL);
+
+	rc = kiblnd_dev_need_failover(dev);
+	if (rc <= 0)
+		goto out;
+
+	if (dev->ibd_hdev != NULL &&
+	    dev->ibd_hdev->ibh_cmid != NULL) {
+		/* XXX it's not good to close old listener at here,
+		 * because we can fail to create new listener.
+		 * But we have to close it now, otherwise rdma_bind_addr
+		 * will return EADDRINUSE... How crap! */
+		write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+		cmid = dev->ibd_hdev->ibh_cmid;
+		/* make next schedule of kiblnd_dev_need_failover()
+		 * return 1 for me */
+		dev->ibd_hdev->ibh_cmid  = NULL;
+		write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+		rdma_destroy_id(cmid);
+	}
+
+	cmid = kiblnd_rdma_create_id(kiblnd_cm_callback, dev, RDMA_PS_TCP,
+				     IB_QPT_RC);
+	if (IS_ERR(cmid)) {
+		rc = PTR_ERR(cmid);
+		CERROR("Failed to create cmid for failover: %d\n", rc);
+		goto out;
+	}
+
+	memset(&addr, 0, sizeof(addr));
+	addr.sin_family      = AF_INET;
+	addr.sin_addr.s_addr = (__force u32)htonl(dev->ibd_ifip);
+	addr.sin_port	= htons(*kiblnd_tunables.kib_service);
+
+	/* Bind to failover device or port */
+	rc = rdma_bind_addr(cmid, (struct sockaddr *)&addr);
+	if (rc != 0 || cmid->device == NULL) {
+		/* a successful bind that attached no device is still a
+		 * failure: report it as one instead of returning 0 with
+		 * no HCA device installed */
+		if (rc == 0)
+			rc = -ENODEV;
+		CERROR("Failed to bind %s:%u.%u.%u.%u to device(%p): %d\n",
+		       dev->ibd_ifname, HIPQUAD(dev->ibd_ifip),
+		       cmid->device, rc);
+		rdma_destroy_id(cmid);
+		goto out;
+	}
+
+	LIBCFS_ALLOC(hdev, sizeof(*hdev));
+	if (hdev == NULL) {
+		CERROR("Failed to allocate kib_hca_dev\n");
+		rdma_destroy_id(cmid);
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	/* from here on hdev owns cmid; dropping the hdev reference at
+	 * "out" cleans up on failure */
+	atomic_set(&hdev->ibh_ref, 1);
+	hdev->ibh_dev   = dev;
+	hdev->ibh_cmid  = cmid;
+	hdev->ibh_ibdev = cmid->device;
+
+	pd = ib_alloc_pd(cmid->device);
+	if (IS_ERR(pd)) {
+		rc = PTR_ERR(pd);
+		CERROR("Can't allocate PD: %d\n", rc);
+		goto out;
+	}
+
+	hdev->ibh_pd = pd;
+
+	rc = rdma_listen(cmid, 0);
+	if (rc != 0) {
+		CERROR("Can't start new listener: %d\n", rc);
+		goto out;
+	}
+
+	rc = kiblnd_hdev_setup_mrs(hdev);
+	if (rc != 0) {
+		CERROR("Can't setup device: %d\n", rc);
+		goto out;
+	}
+
+	write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+	/* swap in the new device; hdev now points at the old one (or
+	 * NULL) so its reference is dropped at "out" */
+	old = dev->ibd_hdev;
+	dev->ibd_hdev = hdev; /* take over the refcount */
+	hdev = old;
+
+	list_for_each_entry(net, &dev->ibd_nets, ibn_list) {
+		cfs_cpt_for_each(i, lnet_cpt_table()) {
+			kiblnd_fail_poolset(&net->ibn_tx_ps[i]->tps_poolset,
+					    &zombie_tpo);
+
+			if (net->ibn_fmr_ps != NULL) {
+				kiblnd_fail_fmr_poolset(net->ibn_fmr_ps[i],
+							&zombie_fpo);
+
+			} else if (net->ibn_pmr_ps != NULL) {
+				kiblnd_fail_poolset(&net->ibn_pmr_ps[i]->
+						    pps_poolset, &zombie_ppo);
+			}
+		}
+	}
+
+	write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+ out:
+	/* idle pools collected above are destroyed outside the lock */
+	if (!list_empty(&zombie_tpo))
+		kiblnd_destroy_pool_list(&zombie_tpo);
+	if (!list_empty(&zombie_ppo))
+		kiblnd_destroy_pool_list(&zombie_ppo);
+	if (!list_empty(&zombie_fpo))
+		kiblnd_destroy_fmr_pool_list(&zombie_fpo);
+	if (hdev != NULL)
+		kiblnd_hdev_decref(hdev);
+
+	if (rc != 0)
+		dev->ibd_failed_failover++;
+	else
+		dev->ibd_failed_failover = 0;
+
+	return rc;
+}
+
+/*
+ * Unlink @dev from the device lists, drop its reference on the current
+ * HCA device (if any) and free it.  No net may still be attached.
+ */
+void
+kiblnd_destroy_dev (kib_dev_t *dev)
+{
+	kib_hca_dev_t *hdev;
+
+	LASSERT(dev->ibd_nnets == 0);
+	LASSERT(list_empty(&dev->ibd_nets));
+
+	list_del(&dev->ibd_fail_list);
+	list_del(&dev->ibd_list);
+
+	hdev = dev->ibd_hdev;
+	if (hdev != NULL)
+		kiblnd_hdev_decref(hdev);
+
+	LIBCFS_FREE(dev, sizeof(*dev));
+}
+
+/*
+ * Create and initialize a device object for IPoIB interface @ifname and
+ * append it to kiblnd_data.kib_devs.
+ *
+ * Returns the new device, or NULL if the name is too long, the interface
+ * cannot be queried or is down, memory is exhausted, or the initial
+ * kiblnd_dev_failover() fails.
+ */
+kib_dev_t *
+kiblnd_create_dev(char *ifname)
+{
+	struct net_device *netdev;
+	kib_dev_t	 *dev;
+	__u32	      netmask;
+	__u32	      ip;
+	int		up;
+	int		rc;
+
+	/* ibd_ifname is a fixed-size array inside kib_dev_t; reject names
+	 * that would overflow it instead of copying them blindly */
+	if (strlen(ifname) >= sizeof(dev->ibd_ifname)) {
+		CERROR("IPoIB interface name %s is too long\n", ifname);
+		return NULL;
+	}
+
+	rc = libcfs_ipif_query(ifname, &up, &ip, &netmask);
+	if (rc != 0) {
+		CERROR("Can't query IPoIB interface %s: %d\n",
+		       ifname, rc);
+		return NULL;
+	}
+
+	if (!up) {
+		CERROR("Can't query IPoIB interface %s: it's down\n", ifname);
+		return NULL;
+	}
+
+	LIBCFS_ALLOC(dev, sizeof(*dev));
+	if (dev == NULL)
+		return NULL;
+
+	memset(dev, 0, sizeof(*dev));
+	/* failover is only possible when the interface is a bonding master */
+	netdev = dev_get_by_name(&init_net, ifname);
+	if (netdev == NULL) {
+		dev->ibd_can_failover = 0;
+	} else {
+		dev->ibd_can_failover = !!(netdev->flags & IFF_MASTER);
+		dev_put(netdev);
+	}
+
+	INIT_LIST_HEAD(&dev->ibd_nets);
+	INIT_LIST_HEAD(&dev->ibd_list); /* not yet in kib_devs */
+	INIT_LIST_HEAD(&dev->ibd_fail_list);
+	dev->ibd_ifip = ip;
+	strlcpy(dev->ibd_ifname, ifname, sizeof(dev->ibd_ifname));
+
+	/* initialize the device */
+	rc = kiblnd_dev_failover(dev);
+	if (rc != 0) {
+		CERROR("Can't initialize device: %d\n", rc);
+		LIBCFS_FREE(dev, sizeof(*dev));
+		return NULL;
+	}
+
+	list_add_tail(&dev->ibd_list,
+			  &kiblnd_data.kib_devs);
+	return dev;
+}
+
+/*
+ * Tear down the module-wide o2iblnd state set up by
+ * kiblnd_base_startup(): stop all threads, free the peer hash table and
+ * the scheduler array, and drop the module reference.  All devices must
+ * already be gone.
+ */
+void
+kiblnd_base_shutdown(void)
+{
+	struct kib_sched_info	*sched;
+	int			i;
+
+	LASSERT (list_empty(&kiblnd_data.kib_devs));
+
+	CDEBUG(D_MALLOC, "before LND base cleanup: kmem %d\n",
+	       atomic_read(&libcfs_kmemory));
+
+	switch (kiblnd_data.kib_init) {
+	default:
+		LBUG();
+
+	case IBLND_INIT_ALL:
+	case IBLND_INIT_DATA:
+		LASSERT (kiblnd_data.kib_peers != NULL);
+		for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) {
+			LASSERT (list_empty(&kiblnd_data.kib_peers[i]));
+		}
+		LASSERT (list_empty(&kiblnd_data.kib_connd_zombies));
+		LASSERT (list_empty(&kiblnd_data.kib_connd_conns));
+
+		/* flag threads to terminate; wake and wait for them to die */
+		kiblnd_data.kib_shutdown = 1;
+
+		/* NB: we really want to stop scheduler threads net by net
+		 * instead of the whole module, this should be improved
+		 * with dynamic configuration LNet */
+		cfs_percpt_for_each(sched, i, kiblnd_data.kib_scheds)
+			wake_up_all(&sched->ibs_waitq);
+
+		wake_up_all(&kiblnd_data.kib_connd_waitq);
+		wake_up_all(&kiblnd_data.kib_failover_waitq);
+
+		/* poll once a second; i only throttles the log level:
+		 * D_WARNING when i is a power of two, D_NET otherwise */
+		i = 2;
+		while (atomic_read(&kiblnd_data.kib_nthreads) != 0) {
+			i++;
+			CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
+			       "Waiting for %d threads to terminate\n",
+			       atomic_read(&kiblnd_data.kib_nthreads));
+			cfs_pause(cfs_time_seconds(1));
+		}
+
+		/* fall through */
+
+	case IBLND_INIT_NOTHING:
+		break;
+	}
+
+	if (kiblnd_data.kib_peers != NULL) {
+		LIBCFS_FREE(kiblnd_data.kib_peers,
+			    sizeof(struct list_head) *
+			    kiblnd_data.kib_peer_hash_size);
+	}
+
+	if (kiblnd_data.kib_scheds != NULL)
+		cfs_percpt_free(kiblnd_data.kib_scheds);
+
+	CDEBUG(D_MALLOC, "after LND base cleanup: kmem %d\n",
+	       atomic_read(&libcfs_kmemory));
+
+	kiblnd_data.kib_init = IBLND_INIT_NOTHING;
+	/* pairs with try_module_get() in kiblnd_base_startup() */
+	module_put(THIS_MODULE);
+}
+
+/*
+ * Shut down the o2iblnd net attached to @ni: delete its peers and wait
+ * for them to disconnect, destroy its pools, detach it from its device
+ * (destroying the device when it was the last net on it) and free the
+ * net.  When no device is left afterwards the module-wide state is shut
+ * down too.
+ */
+void
+kiblnd_shutdown (lnet_ni_t *ni)
+{
+	kib_net_t	*net = ni->ni_data;
+	rwlock_t     *g_lock = &kiblnd_data.kib_global_lock;
+	int	       i;
+	unsigned long     flags;
+
+	LASSERT(kiblnd_data.kib_init == IBLND_INIT_ALL);
+
+	/* no net was attached to this ni: only the base may need cleanup */
+	if (net == NULL)
+		goto out;
+
+	CDEBUG(D_MALLOC, "before LND net cleanup: kmem %d\n",
+	       atomic_read(&libcfs_kmemory));
+
+	write_lock_irqsave(g_lock, flags);
+	net->ibn_shutdown = 1;
+	write_unlock_irqrestore(g_lock, flags);
+
+	switch (net->ibn_init) {
+	default:
+		LBUG();
+
+	case IBLND_INIT_ALL:
+		/* nuke all existing peers within this net */
+		kiblnd_del_peer(ni, LNET_NID_ANY);
+
+		/* Wait for all peer state to clean up */
+		/* i only throttles the log level: D_WARNING when it is a
+		 * power of two, D_NET otherwise */
+		i = 2;
+		while (atomic_read(&net->ibn_npeers) != 0) {
+			i++;
+			CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* 2**n? */
+			       "%s: waiting for %d peers to disconnect\n",
+			       libcfs_nid2str(ni->ni_nid),
+			       atomic_read(&net->ibn_npeers));
+			cfs_pause(cfs_time_seconds(1));
+		}
+
+		kiblnd_net_fini_pools(net);
+
+		write_lock_irqsave(g_lock, flags);
+		LASSERT(net->ibn_dev->ibd_nnets > 0);
+		net->ibn_dev->ibd_nnets--;
+		list_del(&net->ibn_list);
+		write_unlock_irqrestore(g_lock, flags);
+
+		/* fall through */
+
+	case IBLND_INIT_NOTHING:
+		LASSERT (atomic_read(&net->ibn_nconns) == 0);
+
+		/* the last net on a device takes the device down with it */
+		if (net->ibn_dev != NULL &&
+		    net->ibn_dev->ibd_nnets == 0)
+			kiblnd_destroy_dev(net->ibn_dev);
+
+		break;
+	}
+
+	CDEBUG(D_MALLOC, "after LND net cleanup: kmem %d\n",
+	       atomic_read(&libcfs_kmemory));
+
+	net->ibn_init = IBLND_INIT_NOTHING;
+	ni->ni_data = NULL;
+
+	LIBCFS_FREE(net, sizeof(*net));
+
+out:
+	if (list_empty(&kiblnd_data.kib_devs))
+		kiblnd_base_shutdown();
+	return;
+}
+
+/*
+ * Set up the module-wide o2iblnd state: global lock and lists, the peer
+ * hash table, per-CPT scheduler descriptors, the connd thread and, when
+ * kib_dev_failover is non-zero, the failover thread.
+ *
+ * Returns 0 on success.  Any failure undoes the partial setup through
+ * kiblnd_base_shutdown() and returns -ENETDOWN.
+ */
+int
+kiblnd_base_startup(void)
+{
+	struct kib_sched_info	*sched;
+	int			rc;
+	int			i;
+
+	LASSERT (kiblnd_data.kib_init == IBLND_INIT_NOTHING);
+
+	/* NOTE(review): the return value of try_module_get() is ignored,
+	 * yet kiblnd_base_shutdown() always calls module_put() */
+	try_module_get(THIS_MODULE);
+	memset(&kiblnd_data, 0, sizeof(kiblnd_data)); /* zero pointers, flags etc */
+
+	rwlock_init(&kiblnd_data.kib_global_lock);
+
+	INIT_LIST_HEAD(&kiblnd_data.kib_devs);
+	INIT_LIST_HEAD(&kiblnd_data.kib_failed_devs);
+
+	kiblnd_data.kib_peer_hash_size = IBLND_PEER_HASH_SIZE;
+	LIBCFS_ALLOC(kiblnd_data.kib_peers,
+		     sizeof(struct list_head) *
+			    kiblnd_data.kib_peer_hash_size);
+	if (kiblnd_data.kib_peers == NULL) {
+		goto failed;
+	}
+	for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++)
+		INIT_LIST_HEAD(&kiblnd_data.kib_peers[i]);
+
+	spin_lock_init(&kiblnd_data.kib_connd_lock);
+	INIT_LIST_HEAD(&kiblnd_data.kib_connd_conns);
+	INIT_LIST_HEAD(&kiblnd_data.kib_connd_zombies);
+	init_waitqueue_head(&kiblnd_data.kib_connd_waitq);
+	init_waitqueue_head(&kiblnd_data.kib_failover_waitq);
+
+	kiblnd_data.kib_scheds = cfs_percpt_alloc(lnet_cpt_table(),
+						  sizeof(*sched));
+	if (kiblnd_data.kib_scheds == NULL)
+		goto failed;
+
+	/* only the per-CPT thread limits are computed here; the scheduler
+	 * threads themselves are started by kiblnd_start_schedulers() */
+	cfs_percpt_for_each(sched, i, kiblnd_data.kib_scheds) {
+		int	nthrs;
+
+		spin_lock_init(&sched->ibs_lock);
+		INIT_LIST_HEAD(&sched->ibs_conns);
+		init_waitqueue_head(&sched->ibs_waitq);
+
+		nthrs = cfs_cpt_weight(lnet_cpt_table(), i);
+		if (*kiblnd_tunables.kib_nscheds > 0) {
+			nthrs = min(nthrs, *kiblnd_tunables.kib_nscheds);
+		} else {
+			/* max to half of CPUs, another half is reserved for
+			 * upper layer modules */
+			nthrs = min(max(IBLND_N_SCHED, nthrs >> 1), nthrs);
+		}
+
+		sched->ibs_nthreads_max = nthrs;
+		sched->ibs_cpt = i;
+	}
+
+	kiblnd_data.kib_error_qpa.qp_state = IB_QPS_ERR;
+
+	/* lists/ptrs/locks initialised */
+	kiblnd_data.kib_init = IBLND_INIT_DATA;
+	/*****************************************************/
+
+	rc = kiblnd_thread_start(kiblnd_connd, NULL, "kiblnd_connd");
+	if (rc != 0) {
+		CERROR("Can't spawn o2iblnd connd: %d\n", rc);
+		goto failed;
+	}
+
+	/* rc is still 0 here when the failover thread is not requested */
+	if (*kiblnd_tunables.kib_dev_failover != 0)
+		rc = kiblnd_thread_start(kiblnd_failover_thread, NULL,
+					 "kiblnd_failover");
+
+	if (rc != 0) {
+		CERROR("Can't spawn o2iblnd failover thread: %d\n", rc);
+		goto failed;
+	}
+
+	/* flag everything initialised */
+	kiblnd_data.kib_init = IBLND_INIT_ALL;
+	/*****************************************************/
+
+	return 0;
+
+ failed:
+	/* kib_init tells kiblnd_base_shutdown() how much there is to undo */
+	kiblnd_base_shutdown();
+	return -ENETDOWN;
+}
+
+/*
+ * Start scheduler threads for the CPT described by @sched.
+ *
+ * On the first call for a CPT (ibs_nthreads == 0) the initial batch is
+ * started: ibs_nthreads_max threads when the kib_nscheds tunable is set,
+ * otherwise a count derived from the CPT weight and capped at
+ * IBLND_N_SCHED_HIGH.  On later calls at most one more thread is added,
+ * and only while below ibs_nthreads_max.
+ *
+ * Returns 0 on success or the error from kiblnd_thread_start();
+ * ibs_nthreads is advanced by the number of threads actually started.
+ */
+int
+kiblnd_start_schedulers(struct kib_sched_info *sched)
+{
+	int	rc = 0;
+	int	nthrs;
+	int	i;
+
+	if (sched->ibs_nthreads == 0) {
+		if (*kiblnd_tunables.kib_nscheds > 0) {
+			nthrs = sched->ibs_nthreads_max;
+		} else {
+			nthrs = cfs_cpt_weight(lnet_cpt_table(),
+					       sched->ibs_cpt);
+			nthrs = min(max(IBLND_N_SCHED, nthrs >> 1), nthrs);
+			nthrs = min(IBLND_N_SCHED_HIGH, nthrs);
+		}
+	} else {
+		LASSERT(sched->ibs_nthreads <= sched->ibs_nthreads_max);
+		/* increase one thread if there is new interface */
+		nthrs = (sched->ibs_nthreads < sched->ibs_nthreads_max);
+	}
+
+	for (i = 0; i < nthrs; i++) {
+		long	id;
+		char	name[20];
+		id = KIB_THREAD_ID(sched->ibs_cpt, sched->ibs_nthreads + i);
+		snprintf(name, sizeof(name), "kiblnd_sd_%02ld_%02ld",
+			 KIB_THREAD_CPT(id), KIB_THREAD_TID(id));
+		rc = kiblnd_thread_start(kiblnd_scheduler, (void *)id, name);
+		if (rc == 0)
+			continue;
+
+		/* arguments follow the message: thread index first, then
+		 * the CPT that identifies the scheduler */
+		CERROR("Can't spawn thread %d for scheduler[%d]: %d\n",
+		       sched->ibs_nthreads + i, sched->ibs_cpt, rc);
+		break;
+	}
+
+	/* i is the number of threads that were started successfully */
+	sched->ibs_nthreads += i;
+	return rc;
+}
+
+/**
+ * Make sure scheduler threads exist on every CPT used by a device.
+ *
+ * \param dev		device the threads are started for (used for logging)
+ * \param newdev	non-zero if \a dev was not known before; in that case
+ *			kiblnd_start_schedulers() is called even for CPTs
+ *			that already have threads
+ * \param cpts		CPT ids to cover, or NULL to cover CPTs 0..ncpts-1
+ * \param ncpts		number of CPTs to walk
+ *
+ * \retval 0	success
+ * \retval rc	error from kiblnd_start_schedulers()
+ */
+int
+kiblnd_dev_start_threads(kib_dev_t *dev, int newdev, __u32 *cpts, int ncpts)
+{
+	int	idx;
+
+	for (idx = 0; idx < ncpts; idx++) {
+		struct kib_sched_info	*sched;
+		int			cpt;
+		int			rc;
+
+		cpt = (cpts != NULL) ? cpts[idx] : idx;
+		sched = kiblnd_data.kib_scheds[cpt];
+
+		/* an already known device only needs threads on idle CPTs */
+		if (newdev || sched->ibs_nthreads <= 0) {
+			rc = kiblnd_start_schedulers(sched);
+			if (rc != 0) {
+				CERROR("Failed to start scheduler threads for %s\n",
+				       dev->ibd_ifname);
+				return rc;
+			}
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * Look up a device on kiblnd_data.kib_devs by interface name.
+ *
+ * An exact name match is returned immediately.  Otherwise the first device
+ * whose name equals \a ifname once everything from the first ':' onwards is
+ * ignored on both sides is returned as an alias.
+ *
+ * \param ifname	interface name to look for; it is temporarily modified
+ *			in place (the ':' is replaced by NUL) and restored
+ *			before returning
+ *
+ * \retval device with the exact name, else the first alias, else NULL
+ */
+kib_dev_t *
+kiblnd_dev_search(char *ifname)
+{
+	kib_dev_t	*alias = NULL;
+	kib_dev_t	*dev;
+	char		*colon;
+	char		*colon2;
+
+	colon = strchr(ifname, ':');
+	list_for_each_entry(dev, &kiblnd_data.kib_devs, ibd_list) {
+		if (strcmp(&dev->ibd_ifname[0], ifname) == 0)
+			return dev;
+
+		/* keep only the first alias, but keep scanning for an exact match */
+		if (alias != NULL)
+			continue;
+
+		/* cut both names at their ':' so only the base names are compared */
+		colon2 = strchr(dev->ibd_ifname, ':');
+		if (colon != NULL)
+			*colon = 0;
+		if (colon2 != NULL)
+			*colon2 = 0;
+
+		if (strcmp(&dev->ibd_ifname[0], ifname) == 0)
+			alias = dev;
+
+		/* put the ':' characters back */
+		if (colon != NULL)
+			*colon = ':';
+		if (colon2 != NULL)
+			*colon2 = ':';
+	}
+	return alias;
+}
+
+/**
+ * Bring up one LNet network interface (NI) on the o2ib LND.
+ *
+ * Runs the one-time base startup if nothing has been initialised yet,
+ * allocates the per-net state, picks the IPoIB interface (from the NI's
+ * interface list or from the kib_default_ipif tunable), finds or creates the
+ * matching device, starts scheduler threads, initialises the NI pools and
+ * finally links the net onto the device.
+ *
+ * \param ni	NI to start; ni_data, ni_nid and the credit/timeout fields are
+ *		filled in here
+ *
+ * \retval 0		success
+ * \retval rc		error from kiblnd_base_startup()
+ * \retval -ENETDOWN	any later failure
+ */
+int
+kiblnd_startup (lnet_ni_t *ni)
+{
+	char		*ifname;
+	kib_dev_t	*ibdev = NULL;
+	kib_net_t	*net;
+	struct timeval	tv;
+	unsigned long	flags;
+	int		rc;
+	int		newdev;
+
+	LASSERT (ni->ni_lnd == &the_o2iblnd);
+
+	if (kiblnd_data.kib_init == IBLND_INIT_NOTHING) {
+		rc = kiblnd_base_startup();
+		if (rc != 0)
+			return rc;
+	}
+
+	LIBCFS_ALLOC(net, sizeof(*net));
+	ni->ni_data = net;
+	if (net == NULL)
+		goto net_failed; /* nothing to inspect: must not touch *net */
+
+	memset(net, 0, sizeof(*net));
+
+	/* incarnation is the current time of day in microseconds */
+	do_gettimeofday(&tv);
+	net->ibn_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
+
+	ni->ni_peertimeout    = *kiblnd_tunables.kib_peertimeout;
+	ni->ni_maxtxcredits   = *kiblnd_tunables.kib_credits;
+	ni->ni_peertxcredits  = *kiblnd_tunables.kib_peertxcredits;
+	ni->ni_peerrtrcredits = *kiblnd_tunables.kib_peerrtrcredits;
+
+	if (ni->ni_interfaces[0] != NULL) {
+		/* Use the IPoIB interface specified in 'networks=' */
+
+		CLASSERT (LNET_MAX_INTERFACES > 1);
+		if (ni->ni_interfaces[1] != NULL) {
+			CERROR("Multiple interfaces not supported\n");
+			goto failed;
+		}
+
+		ifname = ni->ni_interfaces[0];
+	} else {
+		ifname = *kiblnd_tunables.kib_default_ipif;
+	}
+
+	if (strlen(ifname) >= sizeof(ibdev->ibd_ifname)) {
+		CERROR("IPoIB interface name too long: %s\n", ifname);
+		goto failed;
+	}
+
+	ibdev = kiblnd_dev_search(ifname);
+
+	newdev = ibdev == NULL;
+	/* hmm...create kib_dev even for alias */
+	if (ibdev == NULL || strcmp(&ibdev->ibd_ifname[0], ifname) != 0)
+		ibdev = kiblnd_create_dev(ifname);
+
+	if (ibdev == NULL)
+		goto failed;
+
+	net->ibn_dev = ibdev;
+	/* keep the net number of the NID, replace the address by the IPoIB IP */
+	ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ibdev->ibd_ifip);
+
+	rc = kiblnd_dev_start_threads(ibdev, newdev,
+				      ni->ni_cpts, ni->ni_ncpts);
+	if (rc != 0)
+		goto failed;
+
+	rc = kiblnd_net_init_pools(net, ni->ni_cpts, ni->ni_ncpts);
+	if (rc != 0) {
+		CERROR("Failed to initialize NI pools: %d\n", rc);
+		goto failed;
+	}
+
+	write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+	ibdev->ibd_nnets++;
+	list_add_tail(&net->ibn_list, &ibdev->ibd_nets);
+	write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+	net->ibn_init = IBLND_INIT_ALL;
+
+	return 0;
+
+failed:
+	/* a device that was obtained but never attached to the net */
+	if (net->ibn_dev == NULL && ibdev != NULL)
+		kiblnd_destroy_dev(ibdev);
+
+net_failed:
+	kiblnd_shutdown(ni);
+
+	CDEBUG(D_NET, "kiblnd_startup failed\n");
+	return -ENETDOWN;
+}
+
+/**
+ * Module exit: unregister the o2ib LND from LNet, then release the tunables.
+ */
+void __exit
+kiblnd_module_fini (void)
+{
+	lnet_unregister_lnd(&the_o2iblnd);
+	kiblnd_tunables_fini();
+}
+
+/**
+ * Module entry point: set up the tunables and register the o2ib LND.
+ *
+ * \retval 0	success
+ * \retval rc	error from kiblnd_tunables_init(); the LND is not registered
+ */
+int __init
+kiblnd_module_init (void)
+{
+	int    rc;
+
+	/* a message, including a full RDMA descriptor, must fit in
+	 * IBLND_MSG_SIZE bytes */
+	CLASSERT (sizeof(kib_msg_t) <= IBLND_MSG_SIZE);
+	CLASSERT (offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[IBLND_MAX_RDMA_FRAGS])
+		  <= IBLND_MSG_SIZE);
+	CLASSERT (offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[IBLND_MAX_RDMA_FRAGS])
+		  <= IBLND_MSG_SIZE);
+
+	rc = kiblnd_tunables_init();
+	if (rc == 0)
+		lnet_register_lnd(&the_o2iblnd);
+
+	return rc;
+}
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Kernel OpenIB gen2 LND v2.00");
+MODULE_LICENSE("GPL");
+
+module_init(kiblnd_module_init);
+module_exit(kiblnd_module_fini);

diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h
new file mode 100644
index 0000000..e4626bf
--- /dev/null
+++ b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h

@@ -0,0 +1,1057 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/klnds/o2iblnd/o2iblnd.h
+ *
+ * Author: Eric Barton <eric@bartonsoftware.com>
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <linux/uio.h>
+
+#include <asm/uaccess.h>
+#include <asm/io.h>
+
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/list.h>
+#include <linux/kmod.h>
+#include <linux/sysctl.h>
+#include <linux/pci.h>
+
+#include <net/sock.h>
+#include <linux/in.h>
+
+#define DEBUG_SUBSYSTEM S_LND
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/lnet.h>
+#include <linux/lnet/lib-lnet.h>
+#include <linux/lnet/lnet-sysctl.h>
+
+#include <rdma/rdma_cm.h>
+#include <rdma/ib_cm.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_fmr_pool.h>
+
+#define IBLND_PEER_HASH_SIZE		101	/* # peer lists */
+/* # scheduler loops before reschedule */
+#define IBLND_RESCHED			100
+
+#define IBLND_N_SCHED			2
+#define IBLND_N_SCHED_HIGH		4
+
+typedef struct
+{
+	int	      *kib_dev_failover;     /* HCA failover */
+	unsigned int     *kib_service;	  /* IB service number */
+	int	      *kib_min_reconnect_interval; /* first failed connection retry... */
+	int	      *kib_max_reconnect_interval; /* ...exponentially increasing to this */
+	int	      *kib_cksum;	    /* checksum kib_msg_t? */
+	int	      *kib_timeout;	  /* comms timeout (seconds) */
+	int	      *kib_keepalive;	/* keepalive timeout (seconds) */
+	int	      *kib_ntx;	      /* # tx descs */
+	int	      *kib_credits;	  /* # concurrent sends */
+	int	      *kib_peertxcredits;    /* # concurrent sends to 1 peer */
+	int	      *kib_peerrtrcredits;   /* # per-peer router buffer credits */
+	int	      *kib_peercredits_hiw;  /* # when eagerly to return credits */
+	int	      *kib_peertimeout;      /* seconds to consider peer dead */
+	char	    **kib_default_ipif;     /* default IPoIB interface */
+	int	      *kib_retry_count;
+	int	      *kib_rnr_retry_count;
+	int	      *kib_concurrent_sends; /* send work queue sizing */
+	int		 *kib_ib_mtu;		/* IB MTU */
+	int	      *kib_map_on_demand;    /* map-on-demand if RD has more fragments
+						 * than this value, 0 disable map-on-demand */
+	int	      *kib_pmr_pool_size;    /* # physical MR in pool */
+	int	      *kib_fmr_pool_size;    /* # FMRs in pool */
+	int	      *kib_fmr_flush_trigger; /* When to trigger FMR flush */
+	int	      *kib_fmr_cache;	/* enable FMR pool cache? */
+#if defined(CONFIG_SYSCTL) && !CFS_SYSFS_MODULE_PARM
+	ctl_table_header_t *kib_sysctl;  /* sysctl interface */
+#endif
+	int	      *kib_require_priv_port;/* accept only privileged ports */
+	int	      *kib_use_priv_port;    /* use privileged port for active connect */
+	/* # threads on each CPT */
+	int		 *kib_nscheds;
+} kib_tunables_t;
+
+extern kib_tunables_t  kiblnd_tunables;
+
+#define IBLND_MSG_QUEUE_SIZE_V1      8	  /* V1 only : # messages/RDMAs in-flight */
+#define IBLND_CREDIT_HIGHWATER_V1    7	  /* V1 only : when eagerly to return credits */
+
+#define IBLND_CREDITS_DEFAULT	8	  /* default # of peer credits */
+#define IBLND_CREDITS_MAX	  ((typeof(((kib_msg_t*) 0)->ibm_credits)) - 1)  /* Max # of peer credits */
+
+#define IBLND_MSG_QUEUE_SIZE(v)    ((v) == IBLND_MSG_VERSION_1 ? \
+				     IBLND_MSG_QUEUE_SIZE_V1 :   \
+				     *kiblnd_tunables.kib_peertxcredits) /* # messages/RDMAs in-flight */
+#define IBLND_CREDITS_HIGHWATER(v) ((v) == IBLND_MSG_VERSION_1 ? \
+				     IBLND_CREDIT_HIGHWATER_V1 : \
+				     *kiblnd_tunables.kib_peercredits_hiw) /* when eagerly to return credits */
+
+#define kiblnd_rdma_create_id(cb, dev, ps, qpt) rdma_create_id(cb, dev, ps, qpt)
+
+/**
+ * Concurrent-sends value used for version 1 connections: the
+ * kib_concurrent_sends tunable limited to the range
+ * [IBLND_MSG_QUEUE_SIZE_V1 / 2, IBLND_MSG_QUEUE_SIZE_V1 * 2].
+ */
+static inline int
+kiblnd_concurrent_sends_v1(void)
+{
+	const int	upper = IBLND_MSG_QUEUE_SIZE_V1 * 2;
+	const int	lower = IBLND_MSG_QUEUE_SIZE_V1 / 2;
+
+	if (*kiblnd_tunables.kib_concurrent_sends > upper)
+		return upper;
+
+	if (*kiblnd_tunables.kib_concurrent_sends < lower)
+		return lower;
+
+	return *kiblnd_tunables.kib_concurrent_sends;
+}
+
+#define IBLND_CONCURRENT_SENDS(v)  ((v) == IBLND_MSG_VERSION_1 ? \
+				     kiblnd_concurrent_sends_v1() : \
+				     *kiblnd_tunables.kib_concurrent_sends)
+/* 2 OOB shall suffice for 1 keepalive and 1 returning credits */
+#define IBLND_OOB_CAPABLE(v)       ((v) != IBLND_MSG_VERSION_1)
+#define IBLND_OOB_MSGS(v)	   (IBLND_OOB_CAPABLE(v) ? 2 : 0)
+
+#define IBLND_MSG_SIZE	      (4<<10)		 /* max size of queued messages (inc hdr) */
+#define IBLND_MAX_RDMA_FRAGS	 LNET_MAX_IOV	   /* max # of fragments supported */
+#define IBLND_CFG_RDMA_FRAGS       (*kiblnd_tunables.kib_map_on_demand != 0 ? \
+				    *kiblnd_tunables.kib_map_on_demand :      \
+				     IBLND_MAX_RDMA_FRAGS)  /* max # of fragments configured by user */
+#define IBLND_RDMA_FRAGS(v)	((v) == IBLND_MSG_VERSION_1 ? \
+				     IBLND_MAX_RDMA_FRAGS : IBLND_CFG_RDMA_FRAGS)
+
+/************************/
+/* derived constants... */
+/* Pools (shared by connections on each CPT) */
+/* These pools can grow at runtime, so they do not need a very large initial size */
+#define IBLND_TX_POOL			256
+#define IBLND_PMR_POOL			256
+#define IBLND_FMR_POOL			256
+#define IBLND_FMR_POOL_FLUSH		192
+
+/* TX messages (shared by all connections) */
+#define IBLND_TX_MSGS()	    (*kiblnd_tunables.kib_ntx)
+
+/* RX messages (per connection) */
+#define IBLND_RX_MSGS(v)	    (IBLND_MSG_QUEUE_SIZE(v) * 2 + IBLND_OOB_MSGS(v))
+#define IBLND_RX_MSG_BYTES(v)       (IBLND_RX_MSGS(v) * IBLND_MSG_SIZE)
+#define IBLND_RX_MSG_PAGES(v)      ((IBLND_RX_MSG_BYTES(v) + PAGE_SIZE - 1) / PAGE_SIZE)
+
+/* WRs and CQEs (per connection) */
+#define IBLND_RECV_WRS(v)	    IBLND_RX_MSGS(v)
+#define IBLND_SEND_WRS(v)	  ((IBLND_RDMA_FRAGS(v) + 1) * IBLND_CONCURRENT_SENDS(v))
+#define IBLND_CQ_ENTRIES(v)	 (IBLND_RECV_WRS(v) + IBLND_SEND_WRS(v))
+
+struct kib_hca_dev;
+
+/* o2iblnd can run over aliased interface */
+#ifdef IFALIASZ
+#define KIB_IFNAME_SIZE	      IFALIASZ
+#else
+#define KIB_IFNAME_SIZE	      256
+#endif
+
+typedef struct
+{
+	struct list_head	   ibd_list;	  /* chain on kib_devs */
+	struct list_head	   ibd_fail_list;     /* chain on kib_failed_devs */
+	__u32		ibd_ifip;	  /* IPoIB interface IP */
+	/** IPoIB interface name */
+	char		 ibd_ifname[KIB_IFNAME_SIZE];
+	int		  ibd_nnets;	 /* # nets extant */
+
+	cfs_time_t	   ibd_next_failover;
+	int		  ibd_failed_failover; /* # failover failures */
+	unsigned int	 ibd_failover;      /* failover in progress */
+	unsigned int	 ibd_can_failover;  /* IPoIB interface is a bonding master */
+	struct list_head	   ibd_nets;
+	struct kib_hca_dev  *ibd_hdev;
+} kib_dev_t;
+
+typedef struct kib_hca_dev
+{
+	struct rdma_cm_id   *ibh_cmid;	  /* listener cmid */
+	struct ib_device    *ibh_ibdev;	 /* IB device */
+	int		  ibh_page_shift;    /* page shift of current HCA */
+	int		  ibh_page_size;     /* page size of current HCA */
+	__u64		ibh_page_mask;     /* page mask of current HCA */
+	int		  ibh_mr_shift;      /* bits shift of max MR size */
+	__u64		ibh_mr_size;       /* size of MR */
+	int		  ibh_nmrs;	  /* # of global MRs */
+	struct ib_mr       **ibh_mrs;	   /* global MR */
+	struct ib_pd	*ibh_pd;	    /* PD */
+	kib_dev_t	   *ibh_dev;	   /* owner */
+	atomic_t	 ibh_ref;	   /* refcount */
+} kib_hca_dev_t;
+
+/** # of seconds to keep pool alive */
+#define IBLND_POOL_DEADLINE     300
+/** # of seconds to retry if allocation failed */
+#define IBLND_POOL_RETRY	1
+
+/* A counted array of pages; the page pointers are stored inline after the
+ * count (zero-length trailing array). */
+typedef struct
+{
+	int		     ibp_npages;	     /* # pages */
+	struct page	    *ibp_pages[0];	   /* page array */
+} kib_pages_t;
+
+struct kib_pmr_pool;
+
+typedef struct {
+	struct list_head	      pmr_list;	       /* chain node */
+	struct ib_phys_buf     *pmr_ipb;		/* physical buffer */
+	struct ib_mr	   *pmr_mr;		 /* IB MR */
+	struct kib_pmr_pool    *pmr_pool;	       /* owner of this MR */
+	__u64		   pmr_iova;	       /* Virtual I/O address */
+	int		     pmr_refcount;	   /* reference count */
+} kib_phys_mr_t;
+
+struct kib_pool;
+struct kib_poolset;
+
+typedef int  (*kib_ps_pool_create_t)(struct kib_poolset *ps,
+				     int inc, struct kib_pool **pp_po);
+typedef void (*kib_ps_pool_destroy_t)(struct kib_pool *po);
+typedef void (*kib_ps_node_init_t)(struct kib_pool *po, struct list_head *node);
+typedef void (*kib_ps_node_fini_t)(struct kib_pool *po, struct list_head *node);
+
+struct kib_net;
+
+#define IBLND_POOL_NAME_LEN     32
+
+typedef struct kib_poolset
+{
+	spinlock_t		ps_lock;		/* serialize */
+	struct kib_net	 *ps_net;		 /* network it belongs to */
+	char		    ps_name[IBLND_POOL_NAME_LEN]; /* pool set name */
+	struct list_head	      ps_pool_list;	   /* list of pools */
+	struct list_head	      ps_failed_pool_list;    /* failed pool list */
+	cfs_time_t	      ps_next_retry;	  /* time stamp for retry if failed to allocate */
+	int		     ps_increasing;	  /* is allocating new pool */
+	int		     ps_pool_size;	   /* new pool size */
+	int			ps_cpt;			/* CPT id */
+
+	kib_ps_pool_create_t    ps_pool_create;	 /* create a new pool */
+	kib_ps_pool_destroy_t   ps_pool_destroy;	/* destroy a pool */
+	kib_ps_node_init_t      ps_node_init;	   /* initialize new allocated node */
+	kib_ps_node_fini_t      ps_node_fini;	   /* finalize node */
+} kib_poolset_t;
+
+typedef struct kib_pool
+{
+	struct list_head	      po_list;		/* chain on pool list */
+	struct list_head	      po_free_list;	   /* pre-allocated node */
+	kib_poolset_t	  *po_owner;	       /* pool_set of this pool */
+	cfs_time_t	      po_deadline;	    /* deadline of this pool */
+	int		     po_allocated;	   /* # of elements in use */
+	int		     po_failed;	      /* pool is created on failed HCA */
+	int		     po_size;		/* # of pre-allocated elements */
+} kib_pool_t;
+
+typedef struct {
+	kib_poolset_t	   tps_poolset;	    /* pool-set */
+	__u64		   tps_next_tx_cookie;     /* cookie of TX */
+} kib_tx_poolset_t;
+
+typedef struct {
+	kib_pool_t	      tpo_pool;	       /* pool */
+	struct kib_hca_dev     *tpo_hdev;	       /* device for this pool */
+	struct kib_tx	  *tpo_tx_descs;	   /* all the tx descriptors */
+	kib_pages_t	    *tpo_tx_pages;	   /* premapped tx msg pages */
+} kib_tx_pool_t;
+
+typedef struct {
+	kib_poolset_t	   pps_poolset;	    /* pool-set */
+} kib_pmr_poolset_t;
+
+typedef struct kib_pmr_pool {
+	struct kib_hca_dev     *ppo_hdev;	       /* device for this pool */
+	kib_pool_t	      ppo_pool;	       /* pool */
+} kib_pmr_pool_t;
+
+/* Set of FMR pools belonging to one network on one CPT */
+typedef struct
+{
+	spinlock_t		fps_lock;		/* serialize */
+	struct kib_net	 *fps_net;		/* IB network */
+	struct list_head	      fps_pool_list;	  /* FMR pool list */
+	struct list_head	      fps_failed_pool_list;   /* failed FMR pool list */
+	__u64		   fps_version;	    /* validity stamp */
+	int			fps_cpt;		/* CPT id */
+	/* NOTE(review): presumably the size and flush trigger used when a new
+	 * pool is created - confirm against the pool-create code */
+	int			fps_pool_size;
+	int			fps_flush_trigger;
+	/* is allocating new pool */
+	int			fps_increasing;
+	/* time stamp for retry if failed to allocate */
+	cfs_time_t		fps_next_retry;
+} kib_fmr_poolset_t;
+
+typedef struct
+{
+	struct list_head	      fpo_list;	       /* chain on pool list */
+	struct kib_hca_dev     *fpo_hdev;	       /* device for this pool */
+	kib_fmr_poolset_t      *fpo_owner;	      /* owner of this pool */
+	struct ib_fmr_pool     *fpo_fmr_pool;	   /* IB FMR pool */
+	cfs_time_t	      fpo_deadline;	   /* deadline of this pool */
+	int		     fpo_failed;	     /* fmr pool is failed */
+	int		     fpo_map_count;	  /* # of mapped FMR */
+} kib_fmr_pool_t;
+
+typedef struct {
+	struct ib_pool_fmr     *fmr_pfmr;	       /* IB pool fmr */
+	kib_fmr_pool_t	 *fmr_pool;	       /* pool of FMR */
+} kib_fmr_t;
+
+typedef struct kib_net
+{
+	struct list_head	   ibn_list;	  /* chain on kib_dev_t::ibd_nets */
+	__u64		ibn_incarnation;   /* my epoch */
+	int		  ibn_init;	  /* initialisation state */
+	int		  ibn_shutdown;      /* shutting down? */
+
+	atomic_t		ibn_npeers;	/* # peers extant */
+	atomic_t		ibn_nconns;	/* # connections extant */
+
+	kib_tx_poolset_t	**ibn_tx_ps;	/* tx pool-set */
+	kib_fmr_poolset_t	**ibn_fmr_ps;	/* fmr pool-set */
+	kib_pmr_poolset_t	**ibn_pmr_ps;	/* pmr pool-set */
+
+	kib_dev_t		*ibn_dev;	/* underlying IB device */
+} kib_net_t;
+
+/*
+ * A scheduler thread id packs two values into one integer: the CPT id in the
+ * bits above KIB_THREAD_SHIFT and the per-CPT thread index in the low
+ * KIB_THREAD_SHIFT bits.  KIB_THREAD_CPT() and KIB_THREAD_TID() extract the
+ * two halves again.
+ */
+#define KIB_THREAD_SHIFT		16
+#define KIB_THREAD_ID(cpt, tid)		((cpt) << KIB_THREAD_SHIFT | (tid))
+#define KIB_THREAD_CPT(id)		((id) >> KIB_THREAD_SHIFT)
+#define KIB_THREAD_TID(id)		((id) & ((1UL << KIB_THREAD_SHIFT) - 1))
+
+struct kib_sched_info {
+	/* serialise */
+	spinlock_t		ibs_lock;
+	/* schedulers sleep here */
+	wait_queue_head_t		ibs_waitq;
+	/* conns to check for rx completions */
+	struct list_head		ibs_conns;
+	/* number of scheduler threads */
+	int			ibs_nthreads;
+	/* max allowed scheduler threads */
+	int			ibs_nthreads_max;
+	int			ibs_cpt;	/* CPT id */
+};
+
+/* Global state of the o2ib LND (single instance: kiblnd_data) */
+typedef struct
+{
+	/* initialisation state: one of the IBLND_INIT_* values */
+	int			kib_init;	/* initialisation state */
+	int			kib_shutdown;	/* shut down? */
+	struct list_head		kib_devs;	/* IB devices extant */
+	/* list head of failed devices */
+	struct list_head		kib_failed_devs;
+	/* failover wait queue
+	 * NOTE(review): presumably the failover thread sleeps here, not the
+	 * schedulers (those have ibs_waitq) - confirm */
+	wait_queue_head_t		kib_failover_waitq;
+	atomic_t		kib_nthreads;	/* # live threads */
+	/* stabilize net/dev/peer/conn ops */
+	rwlock_t		kib_global_lock;
+	/* hash table of all my known peers */
+	struct list_head		*kib_peers;
+	/* size of kib_peers */
+	int			kib_peer_hash_size;
+	/* the connd task (serialisation assertions) */
+	void			*kib_connd;
+	/* connections to setup/teardown */
+	struct list_head		kib_connd_conns;
+	/* connections with zero refcount */
+	struct list_head		kib_connd_zombies;
+	/* connection daemon sleeps here */
+	wait_queue_head_t		kib_connd_waitq;
+	spinlock_t		kib_connd_lock;	/* serialise */
+	struct ib_qp_attr	kib_error_qpa;	/* QP->ERROR */
+	/* percpt data for schedulers */
+	struct kib_sched_info	**kib_scheds;
+} kib_data_t;
+
+#define IBLND_INIT_NOTHING	 0
+#define IBLND_INIT_DATA	    1
+#define IBLND_INIT_ALL	     2
+
+/************************************************************************
+ * IB Wire message format.
+ * These are sent in sender's byte order (i.e. receiver flips).
+ */
+
+typedef struct kib_connparams
+{
+	__u16	     ibcp_queue_depth;
+	__u16	     ibcp_max_frags;
+	__u32	     ibcp_max_msg_size;
+} WIRE_ATTR kib_connparams_t;
+
+typedef struct
+{
+	lnet_hdr_t	ibim_hdr;	     /* portals header */
+	char	      ibim_payload[0];      /* piggy-backed payload */
+} WIRE_ATTR kib_immediate_msg_t;
+
+typedef struct
+{
+	__u32	     rf_nob;	       /* # bytes this frag */
+	__u64	     rf_addr;	      /* CAVEAT EMPTOR: misaligned!! */
+} WIRE_ATTR kib_rdma_frag_t;
+
+typedef struct
+{
+	__u32	     rd_key;	       /* local/remote key */
+	__u32	     rd_nfrags;	    /* # fragments */
+	kib_rdma_frag_t   rd_frags[0];	  /* buffer frags */
+} WIRE_ATTR kib_rdma_desc_t;
+
+typedef struct
+{
+	lnet_hdr_t	ibprm_hdr;	    /* portals header */
+	__u64	     ibprm_cookie;	 /* opaque completion cookie */
+} WIRE_ATTR kib_putreq_msg_t;
+
+typedef struct
+{
+	__u64	     ibpam_src_cookie;     /* reflected completion cookie */
+	__u64	     ibpam_dst_cookie;     /* opaque completion cookie */
+	kib_rdma_desc_t   ibpam_rd;	     /* sender's sink buffer */
+} WIRE_ATTR kib_putack_msg_t;
+
+typedef struct
+{
+	lnet_hdr_t	ibgm_hdr;	     /* portals header */
+	__u64	     ibgm_cookie;	  /* opaque completion cookie */
+	kib_rdma_desc_t   ibgm_rd;	      /* rdma descriptor */
+} WIRE_ATTR kib_get_msg_t;
+
+typedef struct
+{
+	__u64	     ibcm_cookie;	  /* opaque completion cookie */
+	__s32	     ibcm_status;	  /* < 0 failure: >= 0 length */
+} WIRE_ATTR kib_completion_msg_t;
+
+typedef struct
+{
+	/* First 2 fields fixed FOR ALL TIME */
+	__u32	     ibm_magic;	    /* I'm an ibnal message */
+	__u16	     ibm_version;	  /* this is my version number */
+
+	__u8	      ibm_type;	     /* msg type */
+	__u8	      ibm_credits;	  /* returned credits */
+	__u32	     ibm_nob;	      /* # bytes in whole message */
+	__u32	     ibm_cksum;	    /* checksum (0 == no checksum) */
+	__u64	     ibm_srcnid;	   /* sender's NID */
+	__u64	     ibm_srcstamp;	 /* sender's incarnation */
+	__u64	     ibm_dstnid;	   /* destination's NID */
+	__u64	     ibm_dststamp;	 /* destination's incarnation */
+
+	union {
+		kib_connparams_t      connparams;
+		kib_immediate_msg_t   immediate;
+		kib_putreq_msg_t      putreq;
+		kib_putack_msg_t      putack;
+		kib_get_msg_t	 get;
+		kib_completion_msg_t  completion;
+	} WIRE_ATTR ibm_u;
+} WIRE_ATTR kib_msg_t;
+
+#define IBLND_MSG_MAGIC LNET_PROTO_IB_MAGIC	/* unique magic */
+
+#define IBLND_MSG_VERSION_1	 0x11
+#define IBLND_MSG_VERSION_2	 0x12
+#define IBLND_MSG_VERSION	   IBLND_MSG_VERSION_2
+
+#define IBLND_MSG_CONNREQ	   0xc0	/* connection request */
+#define IBLND_MSG_CONNACK	   0xc1	/* connection acknowledge */
+#define IBLND_MSG_NOOP	      0xd0	/* nothing (just credits) */
+#define IBLND_MSG_IMMEDIATE	 0xd1	/* immediate */
+#define IBLND_MSG_PUT_REQ	   0xd2	/* putreq (src->sink) */
+#define IBLND_MSG_PUT_NAK	   0xd3	/* completion (sink->src) */
+#define IBLND_MSG_PUT_ACK	   0xd4	/* putack (sink->src) */
+#define IBLND_MSG_PUT_DONE	  0xd5	/* completion (src->sink) */
+#define IBLND_MSG_GET_REQ	   0xd6	/* getreq (sink->src) */
+#define IBLND_MSG_GET_DONE	  0xd7	/* completion (src->sink: all OK) */
+
+typedef struct {
+	__u32	    ibr_magic;	     /* sender's magic */
+	__u16	    ibr_version;	   /* sender's version */
+	__u8	     ibr_why;	       /* reject reason */
+	__u8	     ibr_padding;	   /* padding */
+	__u64	    ibr_incarnation;       /* incarnation of peer */
+	kib_connparams_t ibr_cp;		/* connection parameters */
+} WIRE_ATTR kib_rej_t;
+
+/* connection rejection reasons */
+#define IBLND_REJECT_CONN_RACE       1	  /* You lost connection race */
+#define IBLND_REJECT_NO_RESOURCES    2	  /* Out of memory/conns etc */
+#define IBLND_REJECT_FATAL	   3	  /* Anything else */
+
+#define IBLND_REJECT_CONN_UNCOMPAT   4	  /* incompatible version peer */
+#define IBLND_REJECT_CONN_STALE      5	  /* stale peer */
+
+#define IBLND_REJECT_RDMA_FRAGS      6	  /* Fatal: peer's rdma frags can't match mine */
+#define IBLND_REJECT_MSG_QUEUE_SIZE  7	  /* Fatal: peer's msg queue size can't match mine */
+
+/***********************************************************************/
+
+typedef struct kib_rx			   /* receive message */
+{
+	struct list_head		rx_list;      /* queue for attention */
+	struct kib_conn	  *rx_conn;      /* owning conn */
+	int		       rx_nob;       /* # bytes received (-1 while posted) */
+	enum ib_wc_status	 rx_status;    /* completion status */
+	kib_msg_t		*rx_msg;       /* message buffer (host vaddr) */
+	__u64		     rx_msgaddr;   /* message buffer (I/O addr) */
+	DECLARE_PCI_UNMAP_ADDR   (rx_msgunmap); /* for dma_unmap_single() */
+	struct ib_recv_wr	 rx_wrq;       /* receive work item... */
+	struct ib_sge	     rx_sge;       /* ...and its memory */
+} kib_rx_t;
+
+#define IBLND_POSTRX_DONT_POST    0	     /* don't post */
+#define IBLND_POSTRX_NO_CREDIT    1	     /* post: no credits */
+#define IBLND_POSTRX_PEER_CREDIT  2	     /* post: give peer back 1 credit */
+#define IBLND_POSTRX_RSRVD_CREDIT 3	     /* post: give myself back 1 reserved credit */
+
+typedef struct kib_tx			   /* transmit message */
+{
+	struct list_head		tx_list;      /* queue on idle_txs ibc_tx_queue etc. */
+	kib_tx_pool_t	    *tx_pool;      /* pool I'm from */
+	struct kib_conn	  *tx_conn;      /* owning conn */
+	short		     tx_sending;   /* # tx callbacks outstanding */
+	short		     tx_queued;    /* queued for sending */
+	short		     tx_waiting;   /* waiting for peer */
+	int		       tx_status;    /* LNET completion status */
+	unsigned long	     tx_deadline;  /* completion deadline */
+	__u64		     tx_cookie;    /* completion cookie */
+	lnet_msg_t	       *tx_lntmsg[2]; /* lnet msgs to finalize on completion */
+	kib_msg_t		*tx_msg;       /* message buffer (host vaddr) */
+	__u64		     tx_msgaddr;   /* message buffer (I/O addr) */
+	DECLARE_PCI_UNMAP_ADDR   (tx_msgunmap); /* for dma_unmap_single() */
+	int		       tx_nwrq;      /* # send work items */
+	struct ib_send_wr	*tx_wrq;       /* send work items... */
+	struct ib_sge	    *tx_sge;       /* ...and their memory */
+	kib_rdma_desc_t	  *tx_rd;	/* rdma descriptor */
+	int		       tx_nfrags;    /* # entries in... */
+	struct scatterlist       *tx_frags;     /* dma_map_sg descriptor */
+	__u64		    *tx_pages;     /* rdma phys page addrs */
+	union {
+		kib_phys_mr_t      *pmr;	/* MR for physical buffer */
+		kib_fmr_t	   fmr;	/* FMR */
+	}			 tx_u;
+	int		       tx_dmadir;    /* dma direction */
+} kib_tx_t;
+
+typedef struct kib_connvars
+{
+	/* connection-in-progress variables */
+	kib_msg_t		 cv_msg;
+} kib_connvars_t;
+
+typedef struct kib_conn
+{
+	struct kib_sched_info *ibc_sched;	/* scheduler information */
+	struct kib_peer     *ibc_peer;	  /* owning peer */
+	kib_hca_dev_t       *ibc_hdev;	  /* HCA bound on */
+	struct list_head	   ibc_list;	  /* stash on peer's conn list */
+	struct list_head	   ibc_sched_list;    /* schedule for attention */
+	__u16		ibc_version;       /* version of connection */
+	__u64		ibc_incarnation;   /* which instance of the peer */
+	atomic_t	 ibc_refcount;      /* # users */
+	int		  ibc_state;	 /* what's happening */
+	int		  ibc_nsends_posted; /* # uncompleted sends */
+	int		  ibc_noops_posted;  /* # uncompleted NOOPs */
+	int		  ibc_credits;       /* # credits I have */
+	int		  ibc_outstanding_credits; /* # credits to return */
+	int		  ibc_reserved_credits;/* # ACK/DONE msg credits */
+	int		  ibc_comms_error;   /* set on comms error */
+	unsigned int	     ibc_nrx:16;	/* receive buffers owned */
+	unsigned int	     ibc_scheduled:1;   /* scheduled for attention */
+	unsigned int	     ibc_ready:1;       /* CQ callback fired */
+	/* time of last send */
+	unsigned long	ibc_last_send;
+	/** link chain for kiblnd_check_conns only */
+	struct list_head	   ibc_connd_list;
+	/** rxs completed before ESTABLISHED */
+	struct list_head	   ibc_early_rxs;
+	/** IBLND_MSG_NOOPs for IBLND_MSG_VERSION_1 */
+	struct list_head	   ibc_tx_noops;
+	struct list_head	   ibc_tx_queue;       /* sends that need a credit */
+	struct list_head	   ibc_tx_queue_nocred;/* sends that don't need a credit */
+	struct list_head	   ibc_tx_queue_rsrvd; /* sends that need to reserve an ACK/DONE msg */
+	struct list_head	   ibc_active_txs;     /* active tx awaiting completion */
+	spinlock_t	     ibc_lock;		 /* serialise */
+	kib_rx_t	    *ibc_rxs;	    /* the rx descs */
+	kib_pages_t	 *ibc_rx_pages;       /* premapped rx msg pages */
+
+	struct rdma_cm_id   *ibc_cmid;	   /* CM id */
+	struct ib_cq	*ibc_cq;	     /* completion queue */
+
+	kib_connvars_t      *ibc_connvars;       /* in-progress connection state */
+} kib_conn_t;
+
+#define IBLND_CONN_INIT	       0	 /* being initialised */
+#define IBLND_CONN_ACTIVE_CONNECT     1	 /* active sending req */
+#define IBLND_CONN_PASSIVE_WAIT       2	 /* passive waiting for rtu */
+#define IBLND_CONN_ESTABLISHED	3	 /* connection established */
+#define IBLND_CONN_CLOSING	    4	 /* being closed */
+#define IBLND_CONN_DISCONNECTED       5	 /* disconnected */
+
+typedef struct kib_peer
+{
+	struct list_head	   ibp_list;	   /* stash on global peer list */
+	lnet_nid_t	   ibp_nid;	    /* who's on the other end(s) */
+	lnet_ni_t	   *ibp_ni;	     /* LNet interface */
+	atomic_t	 ibp_refcount;       /* # users */
+	struct list_head	   ibp_conns;	  /* all active connections */
+	struct list_head	   ibp_tx_queue;       /* msgs waiting for a conn */
+	__u16		ibp_version;	/* version of peer */
+	__u64		ibp_incarnation;    /* incarnation of peer */
+	int		  ibp_connecting;     /* current active connection attempts */
+	int		  ibp_accepting;      /* current passive connection attempts */
+	int		  ibp_error;	  /* errno on closing this peer */
+	cfs_time_t	   ibp_last_alive;     /* when (in jiffies) I was last alive */
+} kib_peer_t;
+
+extern kib_data_t      kiblnd_data;
+
+extern void kiblnd_hdev_destroy(kib_hca_dev_t *hdev);
+
+/* Take one more reference on an HCA device; asserts that the caller already
+ * holds a reference (the count must be positive on entry). */
+static inline void
+kiblnd_hdev_addref_locked(kib_hca_dev_t *hdev)
+{
+	LASSERT (atomic_read(&hdev->ibh_ref) > 0);
+	atomic_inc(&hdev->ibh_ref);
+}
+
+/* Drop one reference on an HCA device and destroy it when the count
+ * reaches zero. */
+static inline void
+kiblnd_hdev_decref(kib_hca_dev_t *hdev)
+{
+	LASSERT (atomic_read(&hdev->ibh_ref) > 0);
+
+	if (!atomic_dec_and_test(&hdev->ibh_ref))
+		return;
+
+	/* that was the last reference */
+	kiblnd_hdev_destroy(hdev);
+}
+
+/*
+ * Decide whether failover may be scheduled for a device.
+ *
+ * Returns 0 when the device is already on a failed-device list or when the
+ * kib_dev_failover tunable is 0, 1 when the tunable is greater than 1
+ * (forced), and dev->ibd_can_failover otherwise.
+ */
+static inline int
+kiblnd_dev_can_failover(kib_dev_t *dev)
+{
+	/* already scheduled, or failover disabled */
+	if (!list_empty(&dev->ibd_fail_list) ||
+	    *kiblnd_tunables.kib_dev_failover == 0)
+		return 0;
+
+	/* a value above 1 forces failover regardless of the device */
+	return (*kiblnd_tunables.kib_dev_failover > 1) ?
+	       1 : dev->ibd_can_failover;
+}
+
+/* Take a reference on a connection; logs the count before the increment. */
+#define kiblnd_conn_addref(conn)				\
+do {							    \
+	CDEBUG(D_NET, "conn[%p] (%d)++\n",		      \
+	       (conn), atomic_read(&(conn)->ibc_refcount)); \
+	atomic_inc(&(conn)->ibc_refcount);		  \
+} while (0)
+
+/*
+ * Drop a reference on a connection.  When the last reference goes away the
+ * connection is not freed here: it is appended to kib_connd_zombies under
+ * kib_connd_lock and kib_connd_waitq is woken.
+ */
+#define kiblnd_conn_decref(conn)					\
+do {									\
+	unsigned long flags;						\
+									\
+	CDEBUG(D_NET, "conn[%p] (%d)--\n",				\
+	       (conn), atomic_read(&(conn)->ibc_refcount));		\
+	LASSERT_ATOMIC_POS(&(conn)->ibc_refcount);			\
+	if (atomic_dec_and_test(&(conn)->ibc_refcount)) {		\
+		spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags);	\
+		list_add_tail(&(conn)->ibc_list,			\
+				  &kiblnd_data.kib_connd_zombies);	\
+		wake_up(&kiblnd_data.kib_connd_waitq);		\
+		spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags);\
+	}								\
+} while (0)
+
+/* Take a reference on a peer; logs the peer NID and the count before the
+ * increment. */
+#define kiblnd_peer_addref(peer)				\
+do {							    \
+	CDEBUG(D_NET, "peer[%p] -> %s (%d)++\n",		\
+	       (peer), libcfs_nid2str((peer)->ibp_nid),	 \
+	       atomic_read (&(peer)->ibp_refcount));	\
+	atomic_inc(&(peer)->ibp_refcount);		  \
+} while (0)
+
+/* Drop a reference on a peer and call kiblnd_destroy_peer() when the count
+ * reaches zero. */
+#define kiblnd_peer_decref(peer)				\
+do {							    \
+	CDEBUG(D_NET, "peer[%p] -> %s (%d)--\n",		\
+	       (peer), libcfs_nid2str((peer)->ibp_nid),	 \
+	       atomic_read (&(peer)->ibp_refcount));	\
+	LASSERT_ATOMIC_POS(&(peer)->ibp_refcount);	      \
+	if (atomic_dec_and_test(&(peer)->ibp_refcount))     \
+		kiblnd_destroy_peer(peer);		      \
+} while (0)
+
+/*
+ * Pick the peer hash bucket for @nid: the NID truncated to unsigned int,
+ * modulo the table size.
+ */
+static inline struct list_head *kiblnd_nid2peerlist(lnet_nid_t nid)
+{
+	unsigned int bucket = (unsigned int)nid;
+
+	bucket %= kiblnd_data.kib_peer_hash_size;
+	return &kiblnd_data.kib_peers[bucket];
+}
+
+/* Non-zero while @peer is linked on a list via ibp_list (the peer table). */
+static inline int kiblnd_peer_active(kib_peer_t *peer)
+{
+	return !list_empty(&peer->ibp_list);
+}
+
+/*
+ * Return the first connection on @peer's ibp_conns list.  The list must
+ * not be empty (asserted).
+ */
+static inline kib_conn_t *kiblnd_get_conn_locked(kib_peer_t *peer)
+{
+	struct list_head *first;
+
+	LASSERT(!list_empty(&peer->ibp_conns));
+
+	first = peer->ibp_conns.next;
+	return list_entry(first, kib_conn_t, ibc_list);
+}
+
+/*
+ * Return 1 when the kib_keepalive tunable is positive and more than that
+ * many seconds (scaled by HZ) have passed since @conn's last send;
+ * otherwise 0.
+ */
+static inline int kiblnd_send_keepalive(kib_conn_t *conn)
+{
+	if (*kiblnd_tunables.kib_keepalive <= 0)
+		return 0;	/* keepalives disabled */
+
+	return cfs_time_after(jiffies, conn->ibc_last_send +
+			      *kiblnd_tunables.kib_keepalive * HZ) != 0;
+}
+
+/*
+ * Decide whether a NOOP message should be queued on @conn.
+ *
+ * A NOOP is only considered when enough credits are waiting to be returned
+ * (at or above the high-water mark) or a keepalive is due.  It is not needed
+ * if it can piggyback on a queued no-credit tx.  The remaining rules differ
+ * between OOB-capable protocol versions and the older one.
+ *
+ * Returns non-zero if a NOOP is needed, 0 otherwise.
+ */
+static inline int kiblnd_need_noop(kib_conn_t *conn)
+{
+	int ver = conn->ibc_version;
+
+	LASSERT(conn->ibc_state >= IBLND_CONN_ESTABLISHED);
+
+	/* nothing worth sending a NOOP for */
+	if (conn->ibc_outstanding_credits < IBLND_CREDITS_HIGHWATER(ver) &&
+	    !kiblnd_send_keepalive(conn))
+		return 0;
+
+	/* either version: NOOP can be piggybacked on a no-credit tx */
+	if (!list_empty(&conn->ibc_tx_queue_nocred))
+		return 0;
+
+	if (IBLND_OOB_CAPABLE(ver)) {
+		/* No tx to piggyback NOOP onto or no credit to send a tx */
+		return list_empty(&conn->ibc_tx_queue) ||
+		       conn->ibc_credits == 0;
+	}
+
+	/* NOOP already queued, or no credit at all */
+	if (!list_empty(&conn->ibc_tx_noops) || conn->ibc_credits == 0)
+		return 0;
+
+	/* last credit is reserved for giving credits back */
+	if (conn->ibc_credits == 1 && conn->ibc_outstanding_credits == 0)
+		return 0;
+
+	/* No tx to piggyback NOOP onto or no credit to send a tx */
+	return list_empty(&conn->ibc_tx_queue) || conn->ibc_credits == 1;
+}
+
+/*
+ * Modify @conn's QP state (IB_QP_STATE) using the attributes stored in
+ * kiblnd_data.kib_error_qpa.  The result of ib_modify_qp() is ignored.
+ */
+static inline void kiblnd_abort_receives(kib_conn_t *conn)
+{
+	ib_modify_qp(conn->ibc_cmid->qp, &kiblnd_data.kib_error_qpa,
+		     IB_QP_STATE);
+}
+
+/*
+ * Map one of @conn's tx list heads to a printable name.
+ *
+ * @q must be the address of one of the connection's own tx lists.  The
+ * ibc_tx_noops list is recognised as well: it is a tx queue of the
+ * connection like the others, and without an entry here any caller passing
+ * it would trip LBUG() instead of getting a name.
+ *
+ * Returns a static string; an unknown list head is a bug (LBUG()).
+ */
+static inline const char *
+kiblnd_queue2str (kib_conn_t *conn, struct list_head *q)
+{
+	if (q == &conn->ibc_tx_queue)
+		return "tx_queue";
+
+	if (q == &conn->ibc_tx_noops)
+		return "tx_noops";
+
+	if (q == &conn->ibc_tx_queue_rsrvd)
+		return "tx_queue_rsrvd";
+
+	if (q == &conn->ibc_tx_queue_nocred)
+		return "tx_queue_nocred";
+
+	if (q == &conn->ibc_active_txs)
+		return "active_txs";
+
+	LBUG();
+	return NULL;
+}
+
+/* CAVEAT EMPTOR: We rely on descriptor alignment to allow us to use the
+ * lowest bits of the work request id to stash the work item type.
+ * IBLND_WID_MASK covers the two low bits, so every descriptor pointer
+ * encoded by kiblnd_ptr2wreqid() must have those bits clear; the
+ * IBLND_WID_* values below are the type codes stored there. */
+
+#define IBLND_WID_TX    0
+#define IBLND_WID_RDMA  1
+#define IBLND_WID_RX    2
+#define IBLND_WID_MASK  3UL
+
+/*
+ * Build a work request id from descriptor pointer @ptr and work item
+ * @type by OR-ing the type into the pointer's low bits.  Both the pointer
+ * alignment and the type range are asserted.
+ */
+static inline __u64 kiblnd_ptr2wreqid(void *ptr, int type)
+{
+	unsigned long addr = (unsigned long)ptr;
+
+	LASSERT((addr & IBLND_WID_MASK) == 0);
+	LASSERT((type & ~IBLND_WID_MASK) == 0);
+
+	return (__u64)(addr | type);
+}
+
+/* Recover the descriptor pointer from @wreqid by masking off the type bits. */
+static inline void *kiblnd_wreqid2ptr(__u64 wreqid)
+{
+	unsigned long addr = (unsigned long)wreqid;
+
+	return (void *)(addr & ~IBLND_WID_MASK);
+}
+
+/* Extract the work item type stored in the low bits of @wreqid. */
+static inline int kiblnd_wreqid2type(__u64 wreqid)
+{
+	return (int)(wreqid & IBLND_WID_MASK);
+}
+
+/* Store @state in @conn->ibc_state, followed by a full memory barrier. */
+static inline void kiblnd_set_conn_state(kib_conn_t *conn, int state)
+{
+	conn->ibc_state = state;
+
+	mb();
+}
+
+/*
+ * Fill in the type and total byte count of @msg: the header up to the
+ * ibm_u union plus @body_nob bytes of body.
+ */
+static inline void kiblnd_init_msg(kib_msg_t *msg, int type, int body_nob)
+{
+	msg->ibm_nob = offsetof(kib_msg_t, ibm_u) + body_nob;
+	msg->ibm_type = type;
+}
+
+/* Sum the byte counts (rf_nob) of all rd_nfrags fragments described by @rd. */
+static inline int kiblnd_rd_size(kib_rdma_desc_t *rd)
+{
+	int total = 0;
+	int frag;
+
+	for (frag = 0; frag < rd->rd_nfrags; frag++)
+		total += rd->rd_frags[frag].rf_nob;
+
+	return total;
+}
+
+/* Address (rf_addr) of fragment @index of @rd. */
+static inline __u64 kiblnd_rd_frag_addr(kib_rdma_desc_t *rd, int index)
+{
+	return rd->rd_frags[index].rf_addr;
+}
+
+/* Byte count (rf_nob) of fragment @index of @rd. */
+static inline __u32 kiblnd_rd_frag_size(kib_rdma_desc_t *rd, int index)
+{
+	return rd->rd_frags[index].rf_nob;
+}
+
+/*
+ * Key for fragment @index of @rd.  The descriptor carries one key
+ * (rd_key) for all fragments, so @index is not consulted.
+ */
+static inline __u32 kiblnd_rd_frag_key(kib_rdma_desc_t *rd, int index)
+{
+	return rd->rd_key;
+}
+
+/*
+ * Consume @nob bytes from fragment @index of @rd.
+ *
+ * If the fragment holds more than @nob bytes, its address is advanced and
+ * its length reduced, and @index is returned unchanged.  Otherwise the
+ * fragment is left untouched and the next index is returned.
+ */
+static inline int
+kiblnd_rd_consume_frag(kib_rdma_desc_t *rd, int index, __u32 nob)
+{
+	if (nob >= rd->rd_frags[index].rf_nob)
+		return index + 1;	/* fragment used up */
+
+	rd->rd_frags[index].rf_addr += nob;
+	rd->rd_frags[index].rf_nob  -= nob;
+	return index;
+}
+
+/*
+ * Size in bytes of a GET_REQ or PUT_ACK message whose RDMA descriptor
+ * carries @n fragments, i.e. the offset of rd_frags[n] within the
+ * corresponding message structure.  Other message types are asserted
+ * against.  @rd itself is not consulted.
+ */
+static inline int kiblnd_rd_msg_size(kib_rdma_desc_t *rd, int msgtype, int n)
+{
+	LASSERT(msgtype == IBLND_MSG_GET_REQ ||
+		msgtype == IBLND_MSG_PUT_ACK);
+
+	if (msgtype == IBLND_MSG_GET_REQ)
+		return offsetof(kib_get_msg_t, ibgm_rd.rd_frags[n]);
+
+	return offsetof(kib_putack_msg_t, ibpam_rd.rd_frags[n]);
+}
+
+
+/* Forwards to ib_dma_mapping_error() for @dma_addr on @dev. */
+static inline __u64 kiblnd_dma_mapping_error(struct ib_device *dev,
+					     u64 dma_addr)
+{
+	return ib_dma_mapping_error(dev, dma_addr);
+}
+
+/* Forwards to ib_dma_map_single(): map @size bytes at @msg on @dev. */
+static inline __u64
+kiblnd_dma_map_single(struct ib_device *dev, void *msg, size_t size,
+		      enum dma_data_direction direction)
+{
+	return ib_dma_map_single(dev, msg, size, direction);
+}
+
+/* Forwards to ib_dma_unmap_single(): undo a kiblnd_dma_map_single(). */
+static inline void
+kiblnd_dma_unmap_single(struct ib_device *dev, __u64 addr, size_t size,
+			enum dma_data_direction direction)
+{
+	ib_dma_unmap_single(dev, addr, size, direction);
+}
+/* Unmap-address bookkeeping hooks.  In this build the "set" form expands
+ * to an empty statement and the "get" form simply yields the address (a)
+ * it is given; the (p) and (m) arguments are ignored by both. */
+#define KIBLND_UNMAP_ADDR_SET(p, m, a)  do {} while (0)
+#define KIBLND_UNMAP_ADDR(p, m, a)      (a)
+
+/*
+ * Forwards to ib_dma_map_sg(): map @nents scatterlist entries on @dev and
+ * return whatever that call returns.
+ */
+static inline int
+kiblnd_dma_map_sg(struct ib_device *dev, struct scatterlist *sg, int nents,
+		  enum dma_data_direction direction)
+{
+	return ib_dma_map_sg(dev, sg, nents, direction);
+}
+
+/* Forwards to ib_dma_unmap_sg(): undo a kiblnd_dma_map_sg(). */
+static inline void
+kiblnd_dma_unmap_sg(struct ib_device *dev, struct scatterlist *sg, int nents,
+		    enum dma_data_direction direction)
+{
+	ib_dma_unmap_sg(dev, sg, nents, direction);
+}
+
+/* Forwards to ib_sg_dma_address() for scatterlist entry @sg on @dev. */
+static inline __u64
+kiblnd_sg_dma_address(struct ib_device *dev, struct scatterlist *sg)
+{
+	return ib_sg_dma_address(dev, sg);
+}
+
+/* Forwards to ib_sg_dma_len() for scatterlist entry @sg on @dev. */
+static inline unsigned int
+kiblnd_sg_dma_len(struct ib_device *dev, struct scatterlist *sg)
+{
+	return ib_sg_dma_len(dev, sg);
+}
+
+/* XXX We use KIBLND_CONN_PARAM(e) as a writable buffer.  That is not
+ * strictly right because OFED 1.2 declares it const; to write through it
+ * callers have to add a (void *) cast to get past the "const".
+ * Both macros take a CM event (e) and yield its connection private data
+ * pointer and length respectively. */
+
+#define KIBLND_CONN_PARAM(e)	    ((e)->param.conn.private_data)
+#define KIBLND_CONN_PARAM_LEN(e)	((e)->param.conn.private_data_len)
+
+
+struct ib_mr *kiblnd_find_rd_dma_mr(kib_hca_dev_t *hdev,
+				    kib_rdma_desc_t *rd);
+struct ib_mr *kiblnd_find_dma_mr(kib_hca_dev_t *hdev,
+				 __u64 addr, __u64 size);
+void kiblnd_map_rx_descs(kib_conn_t *conn);
+void kiblnd_unmap_rx_descs(kib_conn_t *conn);
+int kiblnd_map_tx(lnet_ni_t *ni, kib_tx_t *tx,
+		  kib_rdma_desc_t *rd, int nfrags);
+void kiblnd_unmap_tx(lnet_ni_t *ni, kib_tx_t *tx);
+void kiblnd_pool_free_node(kib_pool_t *pool, struct list_head *node);
+struct list_head *kiblnd_pool_alloc_node(kib_poolset_t *ps);
+
+int  kiblnd_fmr_pool_map(kib_fmr_poolset_t *fps, __u64 *pages,
+			 int npages, __u64 iov, kib_fmr_t *fmr);
+void kiblnd_fmr_pool_unmap(kib_fmr_t *fmr, int status);
+
+int  kiblnd_pmr_pool_map(kib_pmr_poolset_t *pps, kib_hca_dev_t *hdev,
+			 kib_rdma_desc_t *rd, __u64 *iova, kib_phys_mr_t **pp_pmr);
+void kiblnd_pmr_pool_unmap(kib_phys_mr_t *pmr);
+
+int  kiblnd_startup (lnet_ni_t *ni);
+void kiblnd_shutdown (lnet_ni_t *ni);
+int  kiblnd_ctl (lnet_ni_t *ni, unsigned int cmd, void *arg);
+void kiblnd_query (struct lnet_ni *ni, lnet_nid_t nid, cfs_time_t *when);
+
+int  kiblnd_tunables_init(void);
+void kiblnd_tunables_fini(void);
+
+int  kiblnd_connd (void *arg);
+int  kiblnd_scheduler(void *arg);
+int  kiblnd_thread_start(int (*fn)(void *arg), void *arg, char *name);
+int  kiblnd_failover_thread (void *arg);
+
+int  kiblnd_alloc_pages(kib_pages_t **pp, int cpt, int npages);
+void kiblnd_free_pages (kib_pages_t *p);
+
+int  kiblnd_cm_callback(struct rdma_cm_id *cmid,
+			struct rdma_cm_event *event);
+int  kiblnd_translate_mtu(int value);
+
+int  kiblnd_dev_failover(kib_dev_t *dev);
+int  kiblnd_create_peer (lnet_ni_t *ni, kib_peer_t **peerp, lnet_nid_t nid);
+void kiblnd_destroy_peer (kib_peer_t *peer);
+void kiblnd_destroy_dev (kib_dev_t *dev);
+void kiblnd_unlink_peer_locked (kib_peer_t *peer);
+void kiblnd_peer_alive (kib_peer_t *peer);
+kib_peer_t *kiblnd_find_peer_locked (lnet_nid_t nid);
+void kiblnd_peer_connect_failed (kib_peer_t *peer, int active, int error);
+int  kiblnd_close_stale_conns_locked (kib_peer_t *peer,
+				      int version, __u64 incarnation);
+int  kiblnd_close_peer_conns_locked (kib_peer_t *peer, int why);
+
+void kiblnd_connreq_done(kib_conn_t *conn, int status);
+kib_conn_t *kiblnd_create_conn (kib_peer_t *peer, struct rdma_cm_id *cmid,
+				int state, int version);
+void kiblnd_destroy_conn (kib_conn_t *conn);
+void kiblnd_close_conn (kib_conn_t *conn, int error);
+void kiblnd_close_conn_locked (kib_conn_t *conn, int error);
+
+int  kiblnd_init_rdma (kib_conn_t *conn, kib_tx_t *tx, int type,
+		       int nob, kib_rdma_desc_t *dstrd, __u64 dstcookie);
+
+void kiblnd_launch_tx (lnet_ni_t *ni, kib_tx_t *tx, lnet_nid_t nid);
+void kiblnd_queue_tx_locked (kib_tx_t *tx, kib_conn_t *conn);
+void kiblnd_queue_tx (kib_tx_t *tx, kib_conn_t *conn);
+void kiblnd_init_tx_msg (lnet_ni_t *ni, kib_tx_t *tx, int type, int body_nob);
+void kiblnd_txlist_done (lnet_ni_t *ni, struct list_head *txlist,
+			 int status);
+void kiblnd_check_sends (kib_conn_t *conn);
+
+void kiblnd_qp_event(struct ib_event *event, void *arg);
+void kiblnd_cq_event(struct ib_event *event, void *arg);
+void kiblnd_cq_completion(struct ib_cq *cq, void *arg);
+
+void kiblnd_pack_msg (lnet_ni_t *ni, kib_msg_t *msg, int version,
+		      int credits, lnet_nid_t dstnid, __u64 dststamp);
+int  kiblnd_unpack_msg(kib_msg_t *msg, int nob);
+int  kiblnd_post_rx (kib_rx_t *rx, int credit);
+
+int  kiblnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg);
+int  kiblnd_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed,
+		 unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov,
+		 unsigned int offset, unsigned int mlen, unsigned int rlen);

diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c
new file mode 100644
index 0000000..cc62321
--- /dev/null
+++ b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c

@@ -0,0 +1,3529 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/klnds/o2iblnd/o2iblnd_cb.c
+ *
+ * Author: Eric Barton <eric@bartonsoftware.com>
+ */
+
+#include "o2iblnd.h"
+
+/*
+ * Retire a completed tx descriptor.
+ *
+ * Unmaps the tx, drops its connection reference (if any), resets it and
+ * returns it to its pool, and only then finalizes the (at most two) LNet
+ * messages that were attached to it, using the tx's final status.
+ * Must not be called from interrupt context, nor while the tx is queued,
+ * sending or waiting (all asserted).
+ */
+void kiblnd_tx_done(lnet_ni_t *ni, kib_tx_t *tx)
+{
+	kib_net_t	*net = ni->ni_data;
+	lnet_msg_t	*pending[2];
+	int		status;
+	int		n;
+
+	LASSERT(net != NULL);
+	LASSERT(!in_interrupt());
+	LASSERT(!tx->tx_queued);	/* mustn't be queued for sending */
+	LASSERT(tx->tx_sending == 0);	/* mustn't be awaiting sent callback */
+	LASSERT(!tx->tx_waiting);	/* mustn't be awaiting peer response */
+	LASSERT(tx->tx_pool != NULL);
+
+	kiblnd_unmap_tx(ni, tx);
+
+	/* detach the lnet msgs and remember the outcome before the reset */
+	for (n = 0; n < 2; n++) {
+		pending[n] = tx->tx_lntmsg[n];
+		tx->tx_lntmsg[n] = NULL;
+	}
+	status = tx->tx_status;
+
+	if (tx->tx_conn != NULL) {
+		LASSERT(ni == tx->tx_conn->ibc_peer->ibp_ni);
+
+		kiblnd_conn_decref(tx->tx_conn);
+		tx->tx_conn = NULL;
+	}
+
+	tx->tx_nwrq = 0;
+	tx->tx_status = 0;
+
+	kiblnd_pool_free_node(&tx->tx_pool->tpo_pool, &tx->tx_list);
+
+	/* delay finalize until my descs have been freed */
+	for (n = 0; n < 2; n++) {
+		if (pending[n] != NULL)
+			lnet_finalize(ni, pending[n], status);
+	}
+}
+
+/*
+ * Complete every tx on @txlist with @status: each one is unlinked, marked
+ * as no longer waiting and passed to kiblnd_tx_done().  The list is empty
+ * on return.
+ */
+void kiblnd_txlist_done(lnet_ni_t *ni, struct list_head *txlist, int status)
+{
+	while (!list_empty(txlist)) {
+		kib_tx_t *head = list_entry(txlist->next, kib_tx_t, tx_list);
+
+		list_del(&head->tx_list);
+
+		/* complete now */
+		head->tx_waiting = 0;
+		head->tx_status = status;
+		kiblnd_tx_done(ni, head);
+	}
+}
+
+/*
+ * Allocate an idle tx descriptor from the tx pool set selected by the CPT
+ * of @target.
+ *
+ * Returns NULL if the pool set has no node to give; otherwise a tx that is
+ * asserted to be in its pristine state.
+ */
+kib_tx_t *kiblnd_get_idle_tx(lnet_ni_t *ni, lnet_nid_t target)
+{
+	kib_net_t		*net = (kib_net_t *)ni->ni_data;
+	kib_tx_poolset_t	*tps;
+	struct list_head	*entry;
+	kib_tx_t		*tx;
+
+	tps = net->ibn_tx_ps[lnet_cpt_of_nid(target)];
+
+	entry = kiblnd_pool_alloc_node(&tps->tps_poolset);
+	if (entry == NULL)
+		return NULL;
+
+	tx = container_of(entry, kib_tx_t, tx_list);
+
+	/* a descriptor fresh from the pool must carry no stale state */
+	LASSERT(tx->tx_nwrq == 0);
+	LASSERT(!tx->tx_queued);
+	LASSERT(tx->tx_sending == 0);
+	LASSERT(!tx->tx_waiting);
+	LASSERT(tx->tx_status == 0);
+	LASSERT(tx->tx_conn == NULL);
+	LASSERT(tx->tx_lntmsg[0] == NULL);
+	LASSERT(tx->tx_lntmsg[1] == NULL);
+	LASSERT(tx->tx_u.pmr == NULL);
+	LASSERT(tx->tx_nfrags == 0);
+
+	return tx;
+}
+
+/*
+ * Stop using @rx: decrement its connection's count of rxs (ibc_nrx) under
+ * the scheduler's ibs_lock, then drop one reference on the connection.
+ */
+void kiblnd_drop_rx(kib_rx_t *rx)
+{
+	kib_conn_t		*conn = rx->rx_conn;
+	struct kib_sched_info	*sched = conn->ibc_sched;
+	unsigned long		irqflags;
+
+	spin_lock_irqsave(&sched->ibs_lock, irqflags);
+
+	LASSERT(conn->ibc_nrx > 0);
+	conn->ibc_nrx--;
+
+	spin_unlock_irqrestore(&sched->ibs_lock, irqflags);
+
+	kiblnd_conn_decref(conn);
+}
+/*
+ * Post (or re-post) the receive buffer of @rx on its connection's QP.
+ *
+ * @credit is one of IBLND_POSTRX_NO_CREDIT, IBLND_POSTRX_PEER_CREDIT or
+ * IBLND_POSTRX_RSRVD_CREDIT (asserted) and selects which credit counter,
+ * if any, is incremented once the buffer has been posted on an established
+ * connection.
+ *
+ * If the connection state is already past IBLND_CONN_ESTABLISHED the rx is
+ * dropped instead of posted and 0 is returned.  Before the connection is
+ * established the result of ib_post_recv() is returned as-is with no
+ * credit accounting.  On an established connection a post failure closes
+ * the connection, drops the rx and returns the error; on success the
+ * selected credit is counted under ibc_lock and kiblnd_check_sends() is
+ * called.
+ */
+int
+kiblnd_post_rx (kib_rx_t *rx, int credit)
+{
+	kib_conn_t	 *conn = rx->rx_conn;
+	kib_net_t	  *net = conn->ibc_peer->ibp_ni->ni_data;
+	struct ib_recv_wr  *bad_wrq = NULL;
+	struct ib_mr       *mr;
+	int		 rc;
+
+	LASSERT (net != NULL);
+	LASSERT (!in_interrupt());
+	LASSERT (credit == IBLND_POSTRX_NO_CREDIT ||
+		 credit == IBLND_POSTRX_PEER_CREDIT ||
+		 credit == IBLND_POSTRX_RSRVD_CREDIT);
+
+	mr = kiblnd_find_dma_mr(conn->ibc_hdev, rx->rx_msgaddr, IBLND_MSG_SIZE);
+	LASSERT (mr != NULL);
+
+	rx->rx_sge.lkey   = mr->lkey;
+	rx->rx_sge.addr   = rx->rx_msgaddr;
+	rx->rx_sge.length = IBLND_MSG_SIZE;
+
+	rx->rx_wrq.next = NULL;
+	rx->rx_wrq.sg_list = &rx->rx_sge;
+	rx->rx_wrq.num_sge = 1;
+	rx->rx_wrq.wr_id = kiblnd_ptr2wreqid(rx, IBLND_WID_RX);
+
+	LASSERT (conn->ibc_state >= IBLND_CONN_INIT);
+	LASSERT (rx->rx_nob >= 0);	      /* not posted */
+
+	if (conn->ibc_state > IBLND_CONN_ESTABLISHED) {
+		kiblnd_drop_rx(rx);	     /* No more posts for this rx */
+		return 0;
+	}
+
+	rx->rx_nob = -1;			/* flag posted */
+
+	rc = ib_post_recv(conn->ibc_cmid->qp, &rx->rx_wrq, &bad_wrq);
+	if (rc != 0) {
+		CERROR("Can't post rx for %s: %d, bad_wrq: %p\n",
+		       libcfs_nid2str(conn->ibc_peer->ibp_nid), rc, bad_wrq);
+		rx->rx_nob = 0;
+	}
+
+	if (conn->ibc_state < IBLND_CONN_ESTABLISHED) /* Initial post */
+		return rc;
+
+	if (rc != 0) {
+		kiblnd_close_conn(conn, rc);
+		kiblnd_drop_rx(rx);	     /* No more posts for this rx */
+		return rc;
+	}
+
+	if (credit == IBLND_POSTRX_NO_CREDIT)
+		return 0;
+
+	spin_lock(&conn->ibc_lock);
+	if (credit == IBLND_POSTRX_PEER_CREDIT)
+		conn->ibc_outstanding_credits++;
+	else
+		conn->ibc_reserved_credits++;
+	spin_unlock(&conn->ibc_lock);
+
+	kiblnd_check_sends(conn);
+	return 0;
+}
+
+/*
+ * Search @conn's active txs for one with the given @cookie that is waiting
+ * for a peer response and whose message type is @txtype.
+ *
+ * A cookie match that fails the waiting/type test is logged as a bad
+ * completion and the search continues.  Returns the matching tx, or NULL
+ * if there is none.
+ */
+kib_tx_t *
+kiblnd_find_waiting_tx_locked(kib_conn_t *conn, int txtype, __u64 cookie)
+{
+	struct list_head *pos;
+
+	list_for_each(pos, &conn->ibc_active_txs) {
+		kib_tx_t *tx = list_entry(pos, kib_tx_t, tx_list);
+
+		LASSERT(!tx->tx_queued);
+		LASSERT(tx->tx_sending != 0 || tx->tx_waiting);
+
+		if (tx->tx_cookie == cookie) {
+			if (tx->tx_waiting &&
+			    tx->tx_msg->ibm_type == txtype)
+				return tx;
+
+			CWARN("Bad completion: %swaiting, type %x (wanted %x)\n",
+			      tx->tx_waiting ? "" : "NOT ",
+			      tx->tx_msg->ibm_type, txtype);
+		}
+	}
+
+	return NULL;
+}
+
+/*
+ * Handle a completion message from the peer for the tx identified by
+ * @txtype and @cookie.
+ *
+ * An unmatched completion closes the connection with -EPROTO.  Otherwise
+ * a negative @status is recorded as the tx status (if none is recorded
+ * yet); for a successful GET_REQ, @status is passed on as the reply
+ * length.  The tx stops waiting, and if it is neither queued nor sending
+ * it is unlinked and completed.
+ */
+void
+kiblnd_handle_completion(kib_conn_t *conn, int txtype, int status, __u64 cookie)
+{
+	lnet_ni_t	*ni = conn->ibc_peer->ibp_ni;
+	kib_tx_t	*tx;
+	int		finished;
+
+	spin_lock(&conn->ibc_lock);
+
+	tx = kiblnd_find_waiting_tx_locked(conn, txtype, cookie);
+	if (tx == NULL) {
+		spin_unlock(&conn->ibc_lock);
+
+		CWARN("Unmatched completion type %x cookie "LPX64" from %s\n",
+		      txtype, cookie, libcfs_nid2str(conn->ibc_peer->ibp_nid));
+		kiblnd_close_conn(conn, -EPROTO);
+		return;
+	}
+
+	/* only act on @status while the tx has been successful so far */
+	if (tx->tx_status == 0 && status < 0)
+		tx->tx_status = status;		/* failed */
+	else if (tx->tx_status == 0 && txtype == IBLND_MSG_GET_REQ)
+		lnet_set_reply_msg_len(ni, tx->tx_lntmsg[1], status);
+
+	tx->tx_waiting = 0;
+
+	finished = !tx->tx_queued && (tx->tx_sending == 0);
+	if (finished)
+		list_del(&tx->tx_list);
+
+	spin_unlock(&conn->ibc_lock);
+
+	/* complete outside ibc_lock */
+	if (finished)
+		kiblnd_tx_done(ni, tx);
+}
+
+/*
+ * Queue a completion message of @type carrying @status and @cookie on
+ * @conn.  If no idle tx is available the failure is logged and nothing
+ * is sent.
+ */
+void
+kiblnd_send_completion(kib_conn_t *conn, int type, int status, __u64 cookie)
+{
+	lnet_ni_t	*ni = conn->ibc_peer->ibp_ni;
+	kib_tx_t	*tx;
+
+	tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid);
+	if (tx == NULL) {
+		CERROR("Can't get tx for completion %x for %s\n",
+		       type, libcfs_nid2str(conn->ibc_peer->ibp_nid));
+		return;
+	}
+
+	tx->tx_msg->ibm_u.completion.ibcm_cookie = cookie;
+	tx->tx_msg->ibm_u.completion.ibcm_status = status;
+	kiblnd_init_tx_msg(ni, tx, type, sizeof(kib_completion_msg_t));
+
+	kiblnd_queue_tx(tx, conn);
+}
+/*
+ * Process one received message on an established (or later) connection.
+ *
+ * First any credits carried by the message are added to the connection
+ * under ibc_lock; a credit count that would exceed the queue size for the
+ * protocol version closes the connection with -EPROTO.  The message is
+ * then dispatched on its type.  Each case picks a post_credit value that
+ * says how the rx buffer is to be re-posted afterwards:
+ * IBLND_POSTRX_DONT_POST is used when the message has been handed to
+ * lnet_parse() without error, so the rx is not re-posted from here.
+ * A negative rc from any case is treated as a protocol error and closes
+ * the connection.
+ */
+void
+kiblnd_handle_rx (kib_rx_t *rx)
+{
+	kib_msg_t    *msg = rx->rx_msg;
+	kib_conn_t   *conn = rx->rx_conn;
+	lnet_ni_t    *ni = conn->ibc_peer->ibp_ni;
+	int	   credits = msg->ibm_credits;
+	kib_tx_t     *tx;
+	int	   rc = 0;
+	int	   rc2;
+	int	   post_credit;
+
+	LASSERT (conn->ibc_state >= IBLND_CONN_ESTABLISHED);
+
+	CDEBUG (D_NET, "Received %x[%d] from %s\n",
+		msg->ibm_type, credits,
+		libcfs_nid2str(conn->ibc_peer->ibp_nid));
+
+	if (credits != 0) {
+		/* Have I received credits that will let me send? */
+		spin_lock(&conn->ibc_lock);
+
+		if (conn->ibc_credits + credits >
+		    IBLND_MSG_QUEUE_SIZE(conn->ibc_version)) {
+			rc2 = conn->ibc_credits;
+			spin_unlock(&conn->ibc_lock);
+
+			CERROR("Bad credits from %s: %d + %d > %d\n",
+			       libcfs_nid2str(conn->ibc_peer->ibp_nid),
+			       rc2, credits,
+			       IBLND_MSG_QUEUE_SIZE(conn->ibc_version));
+
+			kiblnd_close_conn(conn, -EPROTO);
+			kiblnd_post_rx(rx, IBLND_POSTRX_NO_CREDIT);
+			return;
+		}
+
+		conn->ibc_credits += credits;
+
+		/* This ensures the credit taken by NOOP can be returned */
+		if (msg->ibm_type == IBLND_MSG_NOOP &&
+		    !IBLND_OOB_CAPABLE(conn->ibc_version)) /* v1 only */
+			conn->ibc_outstanding_credits++;
+
+		spin_unlock(&conn->ibc_lock);
+		kiblnd_check_sends(conn);
+	}
+
+	switch (msg->ibm_type) {
+	default:
+		CERROR("Bad IBLND message type %x from %s\n",
+		       msg->ibm_type, libcfs_nid2str(conn->ibc_peer->ibp_nid));
+		post_credit = IBLND_POSTRX_NO_CREDIT;
+		rc = -EPROTO;
+		break;
+
+	case IBLND_MSG_NOOP:
+		if (IBLND_OOB_CAPABLE(conn->ibc_version)) {
+			post_credit = IBLND_POSTRX_NO_CREDIT;
+			break;
+		}
+
+		if (credits != 0) /* credit already posted */
+			post_credit = IBLND_POSTRX_NO_CREDIT;
+		else	      /* a keepalive NOOP */
+			post_credit = IBLND_POSTRX_PEER_CREDIT;
+		break;
+
+	case IBLND_MSG_IMMEDIATE:
+		post_credit = IBLND_POSTRX_DONT_POST;
+		rc = lnet_parse(ni, &msg->ibm_u.immediate.ibim_hdr,
+				msg->ibm_srcnid, rx, 0);
+		if (rc < 0)		     /* repost on error */
+			post_credit = IBLND_POSTRX_PEER_CREDIT;
+		break;
+
+	case IBLND_MSG_PUT_REQ:
+		post_credit = IBLND_POSTRX_DONT_POST;
+		rc = lnet_parse(ni, &msg->ibm_u.putreq.ibprm_hdr,
+				msg->ibm_srcnid, rx, 1);
+		if (rc < 0)		     /* repost on error */
+			post_credit = IBLND_POSTRX_PEER_CREDIT;
+		break;
+
+	case IBLND_MSG_PUT_NAK:
+		CWARN ("PUT_NACK from %s\n",
+		       libcfs_nid2str(conn->ibc_peer->ibp_nid));
+		post_credit = IBLND_POSTRX_RSRVD_CREDIT;
+		kiblnd_handle_completion(conn, IBLND_MSG_PUT_REQ,
+					 msg->ibm_u.completion.ibcm_status,
+					 msg->ibm_u.completion.ibcm_cookie);
+		break;
+
+	case IBLND_MSG_PUT_ACK:
+		post_credit = IBLND_POSTRX_RSRVD_CREDIT;
+
+		spin_lock(&conn->ibc_lock);
+		tx = kiblnd_find_waiting_tx_locked(conn, IBLND_MSG_PUT_REQ,
+					msg->ibm_u.putack.ibpam_src_cookie);
+		if (tx != NULL)
+			list_del(&tx->tx_list);
+		spin_unlock(&conn->ibc_lock);
+
+		if (tx == NULL) {
+			CERROR("Unmatched PUT_ACK from %s\n",
+			       libcfs_nid2str(conn->ibc_peer->ibp_nid));
+			rc = -EPROTO;
+			break;
+		}
+
+		LASSERT (tx->tx_waiting);
+		/* CAVEAT EMPTOR: I could be racing with tx_complete, but...
+		 * (a) I can overwrite tx_msg since my peer has received it!
+		 * (b) tx_waiting set tells tx_complete() it's not done. */
+
+		tx->tx_nwrq = 0;		/* overwrite PUT_REQ */
+
+		rc2 = kiblnd_init_rdma(conn, tx, IBLND_MSG_PUT_DONE,
+				       kiblnd_rd_size(&msg->ibm_u.putack.ibpam_rd),
+				       &msg->ibm_u.putack.ibpam_rd,
+				       msg->ibm_u.putack.ibpam_dst_cookie);
+		if (rc2 < 0)
+			CERROR("Can't setup rdma for PUT to %s: %d\n",
+			       libcfs_nid2str(conn->ibc_peer->ibp_nid), rc2);
+
+		spin_lock(&conn->ibc_lock);
+		tx->tx_waiting = 0;	/* clear waiting and queue atomically */
+		kiblnd_queue_tx_locked(tx, conn);
+		spin_unlock(&conn->ibc_lock);
+		break;
+
+	case IBLND_MSG_PUT_DONE:
+		post_credit = IBLND_POSTRX_PEER_CREDIT;
+		kiblnd_handle_completion(conn, IBLND_MSG_PUT_ACK,
+					 msg->ibm_u.completion.ibcm_status,
+					 msg->ibm_u.completion.ibcm_cookie);
+		break;
+
+	case IBLND_MSG_GET_REQ:
+		post_credit = IBLND_POSTRX_DONT_POST;
+		rc = lnet_parse(ni, &msg->ibm_u.get.ibgm_hdr,
+				msg->ibm_srcnid, rx, 1);
+		if (rc < 0)		     /* repost on error */
+			post_credit = IBLND_POSTRX_PEER_CREDIT;
+		break;
+
+	case IBLND_MSG_GET_DONE:
+		post_credit = IBLND_POSTRX_RSRVD_CREDIT;
+		kiblnd_handle_completion(conn, IBLND_MSG_GET_REQ,
+					 msg->ibm_u.completion.ibcm_status,
+					 msg->ibm_u.completion.ibcm_cookie);
+		break;
+	}
+
+	if (rc < 0)			     /* protocol error */
+		kiblnd_close_conn(conn, rc);
+
+	if (post_credit != IBLND_POSTRX_DONT_POST)
+		kiblnd_post_rx(rx, post_credit);
+}
+/*
+ * Completion handler for a previously posted receive.
+ *
+ * @status is the work completion status and @nob the number of bytes
+ * received.  The rx is marked as no longer posted.  If the connection is
+ * already past IBLND_CONN_ESTABLISHED the rx is simply dropped.  A failed
+ * completion, a message that does not unpack, or one whose source/
+ * destination NIDs or incarnation stamps do not match this connection
+ * closes the connection (with -EIO, or -ESTALE for the stamp/NID
+ * mismatch) and drops the rx.
+ *
+ * A good message that arrives before the connection is established is
+ * parked on ibc_early_rxs; the state is re-checked under the global write
+ * lock to close the race with connection establishment.  Otherwise the
+ * message goes to kiblnd_handle_rx().
+ */
+void
+kiblnd_rx_complete (kib_rx_t *rx, int status, int nob)
+{
+	kib_msg_t    *msg = rx->rx_msg;
+	kib_conn_t   *conn = rx->rx_conn;
+	lnet_ni_t    *ni = conn->ibc_peer->ibp_ni;
+	kib_net_t    *net = ni->ni_data;
+	int	   rc;
+	int	   err = -EIO;
+
+	LASSERT (net != NULL);
+	LASSERT (rx->rx_nob < 0);	       /* was posted */
+	rx->rx_nob = 0;			 /* isn't now */
+
+	if (conn->ibc_state > IBLND_CONN_ESTABLISHED)
+		goto ignore;
+
+	if (status != IB_WC_SUCCESS) {
+		CNETERR("Rx from %s failed: %d\n",
+			libcfs_nid2str(conn->ibc_peer->ibp_nid), status);
+		goto failed;
+	}
+
+	LASSERT (nob >= 0);
+	rx->rx_nob = nob;
+
+	rc = kiblnd_unpack_msg(msg, rx->rx_nob);
+	if (rc != 0) {
+		CERROR ("Error %d unpacking rx from %s\n",
+			rc, libcfs_nid2str(conn->ibc_peer->ibp_nid));
+		goto failed;
+	}
+
+	if (msg->ibm_srcnid != conn->ibc_peer->ibp_nid ||
+	    msg->ibm_dstnid != ni->ni_nid ||
+	    msg->ibm_srcstamp != conn->ibc_incarnation ||
+	    msg->ibm_dststamp != net->ibn_incarnation) {
+		CERROR ("Stale rx from %s\n",
+			libcfs_nid2str(conn->ibc_peer->ibp_nid));
+		err = -ESTALE;
+		goto failed;
+	}
+
+	/* set time last known alive */
+	kiblnd_peer_alive(conn->ibc_peer);
+
+	/* racing with connection establishment/teardown! */
+
+	if (conn->ibc_state < IBLND_CONN_ESTABLISHED) {
+		rwlock_t  *g_lock = &kiblnd_data.kib_global_lock;
+		unsigned long  flags;
+
+		write_lock_irqsave(g_lock, flags);
+		/* must check holding global lock to eliminate race */
+		if (conn->ibc_state < IBLND_CONN_ESTABLISHED) {
+			list_add_tail(&rx->rx_list, &conn->ibc_early_rxs);
+			write_unlock_irqrestore(g_lock, flags);
+			return;
+		}
+		write_unlock_irqrestore(g_lock, flags);
+	}
+	kiblnd_handle_rx(rx);
+	return;
+
+ failed:
+	CDEBUG(D_NET, "rx %p conn %p\n", rx, conn);
+	kiblnd_close_conn(conn, err);
+ ignore:
+	kiblnd_drop_rx(rx);		     /* Don't re-post rx. */
+}
+
+/*
+ * Translate kernel virtual address @vaddr to its struct page.
+ *
+ * Addresses in the vmalloc range go through vmalloc_to_page(); everything
+ * else goes through virt_to_page().  With CONFIG_HIGHMEM an address in the
+ * PKMAP window is treated as a bug.  The resulting page is asserted to be
+ * non-NULL.
+ */
+struct page *kiblnd_kvaddr_to_page(unsigned long vaddr)
+{
+	struct page *pg;
+
+	if (vaddr >= VMALLOC_START && vaddr < VMALLOC_END) {
+		pg = vmalloc_to_page((void *)vaddr);
+	} else {
+#ifdef CONFIG_HIGHMEM
+		if (vaddr >= PKMAP_BASE &&
+		    vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE)) {
+			/* No: highmem pages are only used for bulk (kiov)
+			 * I/O, never for addresses passed in here */
+			CERROR("find page for address in highmem\n");
+			LBUG();
+		}
+#endif
+		pg = virt_to_page(vaddr);
+	}
+
+	LASSERT(pg != NULL);
+	return pg;
+}
+
+/*
+ * Map the fragments of @rd through the FMR pool set of the tx's CPT.
+ *
+ * Builds the page address array in tx->tx_pages by walking each fragment
+ * in steps of the HCA page size, maps it with kiblnd_fmr_pool_map(), and
+ * on success collapses @rd to a single fragment of @nob bytes.
+ *
+ * Returns 0 on success or the error from kiblnd_fmr_pool_map().
+ */
+static int
+kiblnd_fmr_map_tx(kib_net_t *net, kib_tx_t *tx, kib_rdma_desc_t *rd, int nob)
+{
+	kib_hca_dev_t		*hdev;
+	kib_fmr_poolset_t	*fps;
+	__u64			*pages = tx->tx_pages;
+	int			npages = 0;
+	int			frag;
+	int			off;
+	int			rc;
+
+	LASSERT(tx->tx_pool != NULL);
+	LASSERT(tx->tx_pool->tpo_pool.po_owner != NULL);
+
+	hdev = tx->tx_pool->tpo_hdev;
+
+	for (frag = 0; frag < rd->rd_nfrags; frag++) {
+		for (off = 0; off < rd->rd_frags[frag].rf_nob;
+		     off += hdev->ibh_page_size) {
+			pages[npages] = (rd->rd_frags[frag].rf_addr &
+					 hdev->ibh_page_mask) + off;
+			npages++;
+		}
+	}
+
+	fps = net->ibn_fmr_ps[tx->tx_pool->tpo_pool.po_owner->ps_cpt];
+
+	rc = kiblnd_fmr_pool_map(fps, pages, npages, 0, &tx->tx_u.fmr);
+	if (rc != 0) {
+		CERROR ("Can't map %d pages: %d\n", npages, rc);
+		return rc;
+	}
+
+	/* If rd is not tx_rd, it's going to get sent to a peer, who will need
+	 * the rkey */
+	if (rd != tx->tx_rd)
+		rd->rd_key = tx->tx_u.fmr.fmr_pfmr->fmr->rkey;
+	else
+		rd->rd_key = tx->tx_u.fmr.fmr_pfmr->fmr->lkey;
+
+	/* the whole transfer is now one fragment at the in-page offset */
+	rd->rd_nfrags = 1;
+	rd->rd_frags[0].rf_addr &= ~hdev->ibh_page_mask;
+	rd->rd_frags[0].rf_nob   = nob;
+
+	return 0;
+}
+
+/*
+ * Map the fragments of @rd through the PMR pool set of the tx's CPT.
+ *
+ * On success @rd is collapsed to a single fragment of @nob bytes at the
+ * I/O virtual address produced by kiblnd_pmr_pool_map().
+ *
+ * Returns 0 on success or the error from kiblnd_pmr_pool_map().
+ */
+static int
+kiblnd_pmr_map_tx(kib_net_t *net, kib_tx_t *tx, kib_rdma_desc_t *rd, int nob)
+{
+	kib_hca_dev_t		*hdev;
+	kib_pmr_poolset_t	*pps;
+	__u64			iova;
+	int			rc;
+
+	LASSERT(tx->tx_pool != NULL);
+	LASSERT(tx->tx_pool->tpo_pool.po_owner != NULL);
+
+	hdev = tx->tx_pool->tpo_hdev;
+	pps = net->ibn_pmr_ps[tx->tx_pool->tpo_pool.po_owner->ps_cpt];
+
+	/* start from the first fragment's offset within its page */
+	iova = rd->rd_frags[0].rf_addr & ~hdev->ibh_page_mask;
+
+	rc = kiblnd_pmr_pool_map(pps, hdev, rd, &iova, &tx->tx_u.pmr);
+	if (rc != 0) {
+		CERROR("Failed to create MR by phybuf: %d\n", rc);
+		return rc;
+	}
+
+	/* If rd is not tx_rd, it's going to get sent to a peer, who will need
+	 * the rkey */
+	if (rd != tx->tx_rd)
+		rd->rd_key = tx->tx_u.pmr->pmr_mr->rkey;
+	else
+		rd->rd_key = tx->tx_u.pmr->pmr_mr->lkey;
+
+	rd->rd_frags[0].rf_addr = iova;
+	rd->rd_frags[0].rf_nob  = nob;
+	rd->rd_nfrags = 1;
+
+	return 0;
+}
+
+/*
+ * Release whatever mapping @tx holds: its FMR or PMR pool mapping (if the
+ * net uses that pool type and the tx has one), followed by the DMA mapping
+ * of its scatterlist if any fragments are mapped.
+ */
+void kiblnd_unmap_tx(lnet_ni_t *ni, kib_tx_t *tx)
+{
+	kib_net_t *net = ni->ni_data;
+
+	LASSERT(net != NULL);
+
+	if (net->ibn_fmr_ps != NULL && tx->tx_u.fmr.fmr_pfmr != NULL) {
+		kiblnd_fmr_pool_unmap(&tx->tx_u.fmr, tx->tx_status);
+		tx->tx_u.fmr.fmr_pfmr = NULL;
+	} else if (net->ibn_pmr_ps != NULL && tx->tx_u.pmr != NULL) {
+		kiblnd_pmr_pool_unmap(tx->tx_u.pmr);
+		tx->tx_u.pmr = NULL;
+	}
+
+	if (tx->tx_nfrags == 0)
+		return;		/* no scatterlist mapped */
+
+	kiblnd_dma_unmap_sg(tx->tx_pool->tpo_hdev->ibh_ibdev,
+			    tx->tx_frags, tx->tx_nfrags, tx->tx_dmadir);
+	tx->tx_nfrags = 0;
+}
+
+/*
+ * DMA-map the first @nfrags entries of tx->tx_frags and describe the
+ * result in @rd.
+ *
+ * The DMA direction depends on whether @rd is the tx's own descriptor.
+ * After mapping, a pre-mapped MR covering @rd is used if one exists;
+ * otherwise the net's FMR pool set, then its PMR pool set, is tried.
+ *
+ * Returns 0 on success, the error from the FMR/PMR mapping helper, or
+ * -EINVAL if the net has neither pool set.
+ */
+int
+kiblnd_map_tx(lnet_ni_t *ni, kib_tx_t *tx,
+	      kib_rdma_desc_t *rd, int nfrags)
+{
+	kib_net_t	*net = ni->ni_data;
+	kib_hca_dev_t	*hdev = tx->tx_pool->tpo_hdev;
+	struct ib_mr	*mr;
+	__u32		nob = 0;
+	int		frag;
+
+	/* If rd is not tx_rd, it's going to get sent to a peer and I'm the
+	 * RDMA sink */
+	if (rd != tx->tx_rd)
+		tx->tx_dmadir = DMA_FROM_DEVICE;
+	else
+		tx->tx_dmadir = DMA_TO_DEVICE;
+	tx->tx_nfrags = nfrags;
+
+	rd->rd_nfrags = kiblnd_dma_map_sg(hdev->ibh_ibdev, tx->tx_frags,
+					  tx->tx_nfrags, tx->tx_dmadir);
+
+	/* copy each mapped entry into rd and total up the bytes */
+	for (frag = 0; frag < rd->rd_nfrags; frag++) {
+		rd->rd_frags[frag].rf_nob =
+			kiblnd_sg_dma_len(hdev->ibh_ibdev,
+					  &tx->tx_frags[frag]);
+		rd->rd_frags[frag].rf_addr =
+			kiblnd_sg_dma_address(hdev->ibh_ibdev,
+					      &tx->tx_frags[frag]);
+		nob += rd->rd_frags[frag].rf_nob;
+	}
+
+	/* looking for pre-mapping MR */
+	mr = kiblnd_find_rd_dma_mr(hdev, rd);
+	if (mr != NULL) {
+		/* found pre-mapping MR */
+		rd->rd_key = (rd != tx->tx_rd) ? mr->rkey : mr->lkey;
+		return 0;
+	}
+
+	if (net->ibn_fmr_ps != NULL)
+		return kiblnd_fmr_map_tx(net, tx, rd, nob);
+
+	if (net->ibn_pmr_ps != NULL)
+		return kiblnd_pmr_map_tx(net, tx, rd, nob);
+
+	return -EINVAL;
+}
+
+
+/*
+ * Build tx->tx_frags from a kernel-virtual iovec and map it into @rd.
+ *
+ * Skips @offset bytes into @iov, then emits one scatterlist entry per
+ * piece, where a piece never crosses an iovec entry or a page boundary,
+ * until @nob bytes are covered.  The result is handed to kiblnd_map_tx().
+ *
+ * Returns the result of kiblnd_map_tx(), or -EFAULT if a page cannot be
+ * found for an address.
+ */
+int
+kiblnd_setup_rd_iov(lnet_ni_t *ni, kib_tx_t *tx, kib_rdma_desc_t *rd,
+		    unsigned int niov, struct iovec *iov, int offset, int nob)
+{
+	kib_net_t		*net = ni->ni_data;
+	struct scatterlist	*sg = tx->tx_frags;
+	struct page		*page;
+	unsigned long		vaddr;
+	int			page_offset;
+	int			fragnob;
+
+	LASSERT(nob > 0);
+	LASSERT(niov > 0);
+	LASSERT(net != NULL);
+
+	/* step over iovec entries that lie wholly before @offset */
+	while (offset >= iov->iov_len) {
+		offset -= iov->iov_len;
+		niov--;
+		iov++;
+		LASSERT(niov > 0);
+	}
+
+	do {
+		LASSERT(niov > 0);
+
+		vaddr = ((unsigned long)iov->iov_base) + offset;
+		page_offset = vaddr & (PAGE_SIZE - 1);
+
+		page = kiblnd_kvaddr_to_page(vaddr);
+		if (page == NULL) {
+			CERROR ("Can't find page\n");
+			return -EFAULT;
+		}
+
+		/* limited by the iovec entry, the bytes wanted, and the page */
+		fragnob = min((int)(iov->iov_len - offset), nob);
+		fragnob = min(fragnob, (int)PAGE_SIZE - page_offset);
+
+		sg_set_page(sg, page, fragnob, page_offset);
+		sg++;
+
+		offset += fragnob;
+		if (offset >= iov->iov_len) {
+			/* this iovec entry is used up */
+			offset = 0;
+			iov++;
+			niov--;
+		}
+		nob -= fragnob;
+	} while (nob > 0);
+
+	return kiblnd_map_tx(ni, tx, rd, sg - tx->tx_frags);
+}
+
+/*
+ * Build tx->tx_frags from a page-based kiov array and map it into @rd.
+ *
+ * Skips @offset bytes into @kiov, then emits one scatterlist entry per
+ * kiov entry until @nob bytes are covered, and hands the result to
+ * kiblnd_map_tx().
+ *
+ * Returns the result of kiblnd_map_tx().
+ */
+int
+kiblnd_setup_rd_kiov (lnet_ni_t *ni, kib_tx_t *tx, kib_rdma_desc_t *rd,
+		      int nkiov, lnet_kiov_t *kiov, int offset, int nob)
+{
+	kib_net_t		*net = ni->ni_data;
+	struct scatterlist	*sg = tx->tx_frags;
+	int			fragnob;
+
+	CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob);
+
+	LASSERT(nob > 0);
+	LASSERT(nkiov > 0);
+	LASSERT(net != NULL);
+
+	/* step over kiov entries that lie wholly before @offset */
+	while (offset >= kiov->kiov_len) {
+		offset -= kiov->kiov_len;
+		nkiov--;
+		kiov++;
+		LASSERT(nkiov > 0);
+	}
+
+	do {
+		LASSERT(nkiov > 0);
+
+		fragnob = min((int)(kiov->kiov_len - offset), nob);
+
+		sg_set_page(sg, kiov->kiov_page, fragnob,
+			    kiov->kiov_offset + offset);
+		sg++;
+
+		/* only the first entry starts part-way in */
+		offset = 0;
+		nob -= fragnob;
+		kiov++;
+		nkiov--;
+	} while (nob > 0);
+
+	return kiblnd_map_tx(ni, tx, rd, sg - tx->tx_frags);
+}
+
+/*
+ * Try to post a queued tx on 'conn'.  'credit' is 1 if sending the tx
+ * consumes a peer credit and 0 otherwise.
+ *
+ * The caller holds conn->ibc_lock.  The lock is dropped and re-taken
+ * around kiblnd_tx_done()/kiblnd_close_conn(), and is held again on
+ * return.
+ *
+ * Returns:
+ *   0        the tx was posted, or it was a NOOP that got dropped;
+ *   -EAGAIN  the tx could not be posted now and is still on its queue;
+ *   -EIO     posting failed; the connection has been asked to close.
+ */
+int
+kiblnd_post_tx_locked (kib_conn_t *conn, kib_tx_t *tx, int credit)
+{
+	kib_msg_t	 *msg = tx->tx_msg;
+	kib_peer_t	*peer = conn->ibc_peer;
+	int		ver = conn->ibc_version;
+	int		rc;
+	int		done;
+	struct ib_send_wr *bad_wrq;
+
+	LASSERT (tx->tx_queued);
+	/* We rely on this for QP sizing */
+	LASSERT (tx->tx_nwrq > 0);
+	LASSERT (tx->tx_nwrq <= 1 + IBLND_RDMA_FRAGS(ver));
+
+	LASSERT (credit == 0 || credit == 1);
+	LASSERT (conn->ibc_outstanding_credits >= 0);
+	LASSERT (conn->ibc_outstanding_credits <= IBLND_MSG_QUEUE_SIZE(ver));
+	LASSERT (conn->ibc_credits >= 0);
+	LASSERT (conn->ibc_credits <= IBLND_MSG_QUEUE_SIZE(ver));
+
+	/* The three -EAGAIN returns below leave the tx on its queue */
+	if (conn->ibc_nsends_posted == IBLND_CONCURRENT_SENDS(ver)) {
+		/* tx completions outstanding... */
+		CDEBUG(D_NET, "%s: posted enough\n",
+		       libcfs_nid2str(peer->ibp_nid));
+		return -EAGAIN;
+	}
+
+	if (credit != 0 && conn->ibc_credits == 0) {   /* no credits */
+		CDEBUG(D_NET, "%s: no credits\n",
+		       libcfs_nid2str(peer->ibp_nid));
+		return -EAGAIN;
+	}
+
+	if (credit != 0 && !IBLND_OOB_CAPABLE(ver) &&
+	    conn->ibc_credits == 1 &&   /* last credit reserved */
+	    msg->ibm_type != IBLND_MSG_NOOP) {      /* for NOOP */
+		CDEBUG(D_NET, "%s: not using last credit\n",
+		       libcfs_nid2str(peer->ibp_nid));
+		return -EAGAIN;
+	}
+
+	/* NB don't drop ibc_lock before bumping tx_sending */
+	list_del(&tx->tx_list);
+	tx->tx_queued = 0;
+
+	if (msg->ibm_type == IBLND_MSG_NOOP &&
+	    (!kiblnd_need_noop(conn) ||     /* redundant NOOP */
+	     (IBLND_OOB_CAPABLE(ver) && /* posted enough NOOP */
+	      conn->ibc_noops_posted == IBLND_OOB_MSGS(ver)))) {
+		/* OK to drop when posted enough NOOPs, since
+		 * kiblnd_check_sends will queue NOOP again when
+		 * posted NOOPs complete */
+		spin_unlock(&conn->ibc_lock);
+		kiblnd_tx_done(peer->ibp_ni, tx);
+		spin_lock(&conn->ibc_lock);
+		CDEBUG(D_NET, "%s(%d): redundant or enough NOOP\n",
+		       libcfs_nid2str(peer->ibp_nid),
+		       conn->ibc_noops_posted);
+		return 0;
+	}
+
+	/* the message carries the credits accumulated so far to the peer */
+	kiblnd_pack_msg(peer->ibp_ni, msg, ver, conn->ibc_outstanding_credits,
+			peer->ibp_nid, conn->ibc_incarnation);
+
+	/* account for the send up front; undone below if the post fails */
+	conn->ibc_credits -= credit;
+	conn->ibc_outstanding_credits = 0;
+	conn->ibc_nsends_posted++;
+	if (msg->ibm_type == IBLND_MSG_NOOP)
+		conn->ibc_noops_posted++;
+
+	/* CAVEAT EMPTOR!  This tx could be the PUT_DONE of an RDMA
+	 * PUT.  If so, it was first queued here as a PUT_REQ, sent and
+	 * stashed on ibc_active_txs, matched by an incoming PUT_ACK,
+	 * and then re-queued here.  It's (just) possible that
+	 * tx_sending is non-zero if we've not done the tx_complete()
+	 * from the first send; hence the ++ rather than = below. */
+	tx->tx_sending++;
+	list_add(&tx->tx_list, &conn->ibc_active_txs);
+
+	/* I'm still holding ibc_lock! */
+	if (conn->ibc_state != IBLND_CONN_ESTABLISHED) {
+		rc = -ECONNABORTED;
+	} else if (tx->tx_pool->tpo_pool.po_failed ||
+		 conn->ibc_hdev != tx->tx_pool->tpo_hdev) {
+		/* close_conn will launch failover */
+		rc = -ENETDOWN;
+	} else {
+		rc = ib_post_send(conn->ibc_cmid->qp,
+				  tx->tx_wrq, &bad_wrq);
+	}
+
+	conn->ibc_last_send = jiffies;
+
+	if (rc == 0)
+		return 0;
+
+	/* Post failed: roll back the accounting done above */
+
+	/* NB credits are transferred in the actual
+	 * message, which can only be the last work item */
+	conn->ibc_credits += credit;
+	conn->ibc_outstanding_credits += msg->ibm_credits;
+	conn->ibc_nsends_posted--;
+	if (msg->ibm_type == IBLND_MSG_NOOP)
+		conn->ibc_noops_posted--;
+
+	tx->tx_status = rc;
+	tx->tx_waiting = 0;
+	tx->tx_sending--;
+
+	/* only complete the tx here if no earlier send is still in flight */
+	done = (tx->tx_sending == 0);
+	if (done)
+		list_del(&tx->tx_list);
+
+	spin_unlock(&conn->ibc_lock);
+
+	if (conn->ibc_state == IBLND_CONN_ESTABLISHED)
+		CERROR("Error %d posting transmit to %s\n",
+		       rc, libcfs_nid2str(peer->ibp_nid));
+	else
+		CDEBUG(D_NET, "Error %d posting transmit to %s\n",
+		       rc, libcfs_nid2str(peer->ibp_nid));
+
+	kiblnd_close_conn(conn, rc);
+
+	if (done)
+		kiblnd_tx_done(peer->ibp_ni, tx);
+
+	/* re-take the lock so it is held again when we return */
+	spin_lock(&conn->ibc_lock);
+
+	return -EIO;
+}
+
+/*
+ * Post as many queued transmits on 'conn' as currently possible.
+ *
+ * Does nothing until the connection is established.  Otherwise it moves
+ * txs from the reserved queue to the main queue while reserved credits
+ * remain, queues a NOOP if kiblnd_need_noop() asks for one, and then posts
+ * txs in priority order (no-credit queue, NOOP queue, main queue) until
+ * all queues are empty or kiblnd_post_tx_locked() returns non-zero.
+ */
+void
+kiblnd_check_sends (kib_conn_t *conn)
+{
+	int	ver = conn->ibc_version;
+	lnet_ni_t *ni = conn->ibc_peer->ibp_ni;
+	kib_tx_t  *tx;
+
+	/* Don't send anything until after the connection is established */
+	if (conn->ibc_state < IBLND_CONN_ESTABLISHED) {
+		CDEBUG(D_NET, "%s too soon\n",
+		       libcfs_nid2str(conn->ibc_peer->ibp_nid));
+		return;
+	}
+
+	spin_lock(&conn->ibc_lock);
+
+	LASSERT (conn->ibc_nsends_posted <= IBLND_CONCURRENT_SENDS(ver));
+	LASSERT (!IBLND_OOB_CAPABLE(ver) ||
+		 conn->ibc_noops_posted <= IBLND_OOB_MSGS(ver));
+	LASSERT (conn->ibc_reserved_credits >= 0);
+
+	/* each reserved credit releases one tx from the reserved queue */
+	while (conn->ibc_reserved_credits > 0 &&
+	       !list_empty(&conn->ibc_tx_queue_rsrvd)) {
+		tx = list_entry(conn->ibc_tx_queue_rsrvd.next,
+				    kib_tx_t, tx_list);
+		list_del(&tx->tx_list);
+		list_add_tail(&tx->tx_list, &conn->ibc_tx_queue);
+		conn->ibc_reserved_credits--;
+	}
+
+	if (kiblnd_need_noop(conn)) {
+		/* ibc_lock is dropped while getting and initialising the tx */
+		spin_unlock(&conn->ibc_lock);
+
+		tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid);
+		if (tx != NULL)
+			kiblnd_init_tx_msg(ni, tx, IBLND_MSG_NOOP, 0);
+
+		spin_lock(&conn->ibc_lock);
+		if (tx != NULL)
+			kiblnd_queue_tx_locked(tx, conn);
+	}
+
+	kiblnd_conn_addref(conn); /* 1 ref for me.... (see b21911) */
+
+	for (;;) {
+		int credit;
+
+		/* pick the next tx: no-credit first, then NOOPs, then the rest */
+		if (!list_empty(&conn->ibc_tx_queue_nocred)) {
+			credit = 0;
+			tx = list_entry(conn->ibc_tx_queue_nocred.next,
+					    kib_tx_t, tx_list);
+		} else if (!list_empty(&conn->ibc_tx_noops)) {
+			LASSERT (!IBLND_OOB_CAPABLE(ver));
+			credit = 1;
+			tx = list_entry(conn->ibc_tx_noops.next,
+					kib_tx_t, tx_list);
+		} else if (!list_empty(&conn->ibc_tx_queue)) {
+			credit = 1;
+			tx = list_entry(conn->ibc_tx_queue.next,
+					    kib_tx_t, tx_list);
+		} else
+			break;
+
+		/* any non-zero result (-EAGAIN or -EIO) stops this round */
+		if (kiblnd_post_tx_locked(conn, tx, credit) != 0)
+			break;
+	}
+
+	spin_unlock(&conn->ibc_lock);
+
+	kiblnd_conn_decref(conn); /* ...until here */
+}
+
+/*
+ * Handle completion of a send work request belonging to 'tx'.
+ *
+ * 'status' is compared against IB_WC_SUCCESS; any other value closes the
+ * connection with -EIO and marks the tx failed.  When this was the last
+ * outstanding send and the tx is neither waiting for the peer nor
+ * re-queued, the tx is completed via kiblnd_tx_done().  Finally
+ * kiblnd_check_sends() is called to post further queued txs.
+ */
+void
+kiblnd_tx_complete (kib_tx_t *tx, int status)
+{
+	int	   failed = (status != IB_WC_SUCCESS);
+	kib_conn_t   *conn = tx->tx_conn;
+	int	   idle;
+
+	LASSERT (tx->tx_sending > 0);
+
+	if (failed) {
+		/* only log while the connection is still established */
+		if (conn->ibc_state == IBLND_CONN_ESTABLISHED)
+			CNETERR("Tx -> %s cookie "LPX64
+				" sending %d waiting %d: failed %d\n",
+				libcfs_nid2str(conn->ibc_peer->ibp_nid),
+				tx->tx_cookie, tx->tx_sending, tx->tx_waiting,
+				status);
+
+		kiblnd_close_conn(conn, -EIO);
+	} else {
+		kiblnd_peer_alive(conn->ibc_peer);
+	}
+
+	spin_lock(&conn->ibc_lock);
+
+	/* I could be racing with rdma completion.  Whoever makes 'tx' idle
+	 * gets to free it, which also drops its ref on 'conn'. */
+
+	tx->tx_sending--;
+	conn->ibc_nsends_posted--;
+	if (tx->tx_msg->ibm_type == IBLND_MSG_NOOP)
+		conn->ibc_noops_posted--;
+
+	if (failed) {
+		tx->tx_waiting = 0;	     /* don't wait for peer */
+		tx->tx_status = -EIO;
+	}
+
+	idle = (tx->tx_sending == 0) &&	 /* This is the final callback */
+	       !tx->tx_waiting &&	       /* Not waiting for peer */
+	       !tx->tx_queued;		  /* Not re-queued (PUT_DONE) */
+	if (idle)
+		list_del(&tx->tx_list);
+
+	/* taken before kiblnd_tx_done(), which may drop the tx's conn ref */
+	kiblnd_conn_addref(conn);	       /* 1 ref for me.... */
+
+	spin_unlock(&conn->ibc_lock);
+
+	if (idle)
+		kiblnd_tx_done(conn->ibc_peer->ibp_ni, tx);
+
+	kiblnd_check_sends(conn);
+
+	kiblnd_conn_decref(conn);	       /* ...until here */
+}
+
+/*
+ * Append the SEND work request carrying tx's message to tx's work list.
+ *
+ * Initialises the message header for 'type' with a body of 'body_nob'
+ * bytes, fills in the next free sge/work-request slot so that it sends
+ * the message buffer, and bumps tx->tx_nwrq.
+ */
+void
+kiblnd_init_tx_msg (lnet_ni_t *ni, kib_tx_t *tx, int type, int body_nob)
+{
+	int		   nob = offsetof (kib_msg_t, ibm_u) + body_nob;
+	kib_hca_dev_t	  *hdev = tx->tx_pool->tpo_hdev;
+	struct ib_send_wr *send_wr;
+	struct ib_sge	  *seg;
+	struct ib_mr	  *mr;
+
+	LASSERT (tx->tx_nwrq >= 0);
+	LASSERT (tx->tx_nwrq < IBLND_MAX_RDMA_FRAGS + 1);
+	LASSERT (nob <= IBLND_MSG_SIZE);
+
+	/* the message always occupies the next unused slot */
+	seg = &tx->tx_sge[tx->tx_nwrq];
+	send_wr = &tx->tx_wrq[tx->tx_nwrq];
+
+	kiblnd_init_msg(tx->tx_msg, type, body_nob);
+
+	mr = kiblnd_find_dma_mr(hdev, tx->tx_msgaddr, nob);
+	LASSERT (mr != NULL);
+
+	seg->addr   = tx->tx_msgaddr;
+	seg->length = nob;
+	seg->lkey   = mr->lkey;
+
+	memset(send_wr, 0, sizeof(*send_wr));
+
+	send_wr->wr_id      = kiblnd_ptr2wreqid(tx, IBLND_WID_TX);
+	send_wr->opcode     = IB_WR_SEND;
+	send_wr->send_flags = IB_SEND_SIGNALED;
+	send_wr->sg_list    = seg;
+	send_wr->num_sge    = 1;
+	send_wr->next       = NULL;
+
+	tx->tx_nwrq++;
+}
+
+/*
+ * Set up the RDMA writes that move 'resid' bytes from tx's source
+ * descriptor (tx->tx_rd) to the peer's sink descriptor 'dstrd', followed
+ * by a completion message of 'type' (GET_DONE or PUT_DONE) carrying
+ * 'dstcookie'.
+ *
+ * One RDMA_WRITE work request is built per overlapping source/sink
+ * fragment.  If the descriptors run out, or too many work requests would
+ * be needed, no RDMA is set up and only the completion message is
+ * prepared, carrying the negative error.
+ *
+ * Returns the original 'resid' on success or a negative errno.
+ */
+int
+kiblnd_init_rdma (kib_conn_t *conn, kib_tx_t *tx, int type,
+		  int resid, kib_rdma_desc_t *dstrd, __u64 dstcookie)
+{
+	kib_msg_t	*ibmsg = tx->tx_msg;
+	kib_rdma_desc_t	*srcrd = tx->tx_rd;
+	int		 rc = resid;
+	int		 srcidx = 0;
+	int		 dstidx = 0;
+
+	LASSERT (!in_interrupt());
+	LASSERT (tx->tx_nwrq == 0);
+	LASSERT (type == IBLND_MSG_GET_DONE ||
+		 type == IBLND_MSG_PUT_DONE);
+
+	while (resid > 0) {
+		struct ib_send_wr *wrq;
+		struct ib_sge	  *sge;
+		int		   wrknob;
+
+		if (srcidx >= srcrd->rd_nfrags) {
+			CERROR("Src buffer exhausted: %d frags\n", srcidx);
+			rc = -EPROTO;
+			break;
+		}
+
+		if (dstidx == dstrd->rd_nfrags) {
+			CERROR("Dst buffer exhausted: %d frags\n", dstidx);
+			rc = -EPROTO;
+			break;
+		}
+
+		if (tx->tx_nwrq == IBLND_RDMA_FRAGS(conn->ibc_version)) {
+			CERROR("RDMA too fragmented for %s (%d): "
+			       "%d/%d src %d/%d dst frags\n",
+			       libcfs_nid2str(conn->ibc_peer->ibp_nid),
+			       IBLND_RDMA_FRAGS(conn->ibc_version),
+			       srcidx, srcrd->rd_nfrags,
+			       dstidx, dstrd->rd_nfrags);
+			rc = -EMSGSIZE;
+			break;
+		}
+
+		/* move as much as both current fragments and resid allow */
+		wrknob = MIN(MIN(kiblnd_rd_frag_size(srcrd, srcidx),
+				 kiblnd_rd_frag_size(dstrd, dstidx)), resid);
+
+		sge = &tx->tx_sge[tx->tx_nwrq];
+		wrq = &tx->tx_wrq[tx->tx_nwrq];
+
+		sge->addr   = kiblnd_rd_frag_addr(srcrd, srcidx);
+		sge->lkey   = kiblnd_rd_frag_key(srcrd, srcidx);
+		sge->length = wrknob;
+
+		/* chain to the next slot; the final slot is filled by
+		 * kiblnd_init_tx_msg() below */
+		wrq->next       = wrq + 1;
+		wrq->wr_id      = kiblnd_ptr2wreqid(tx, IBLND_WID_RDMA);
+		wrq->sg_list    = sge;
+		wrq->num_sge    = 1;
+		wrq->opcode     = IB_WR_RDMA_WRITE;
+		wrq->send_flags = 0;
+
+		wrq->wr.rdma.remote_addr = kiblnd_rd_frag_addr(dstrd, dstidx);
+		wrq->wr.rdma.rkey	= kiblnd_rd_frag_key(dstrd, dstidx);
+
+		srcidx = kiblnd_rd_consume_frag(srcrd, srcidx, wrknob);
+		dstidx = kiblnd_rd_consume_frag(dstrd, dstidx, wrknob);
+
+		resid -= wrknob;
+		tx->tx_nwrq++;
+	}
+
+	if (rc < 0)			     /* no RDMA if completing with failure */
+		tx->tx_nwrq = 0;
+
+	ibmsg->ibm_u.completion.ibcm_status = rc;
+	ibmsg->ibm_u.completion.ibcm_cookie = dstcookie;
+	kiblnd_init_tx_msg(conn->ibc_peer->ibp_ni, tx,
+			   type, sizeof (kib_completion_msg_t));
+
+	return rc;
+}
+
+/*
+ * Put 'tx' on the send queue of 'conn' that matches its message type and
+ * set its deadline.  Nothing is posted here.
+ *
+ * A tx not yet attached to a connection takes a reference on 'conn'; a tx
+ * that is already attached must be a PUT_DONE for this same connection.
+ */
+void
+kiblnd_queue_tx_locked (kib_tx_t *tx, kib_conn_t *conn)
+{
+	struct list_head *queue;
+
+	LASSERT (tx->tx_nwrq > 0);	      /* work items set up */
+	LASSERT (!tx->tx_queued);	       /* not queued for sending already */
+	LASSERT (conn->ibc_state >= IBLND_CONN_ESTABLISHED);
+
+	tx->tx_queued = 1;
+	tx->tx_deadline = jiffies + (*kiblnd_tunables.kib_timeout * HZ);
+
+	if (tx->tx_conn != NULL) {
+		/* PUT_DONE first attached to conn as a PUT_REQ */
+		LASSERT (tx->tx_conn == conn);
+		LASSERT (tx->tx_msg->ibm_type == IBLND_MSG_PUT_DONE);
+	} else {
+		/* first time on a connection: the tx holds a conn ref */
+		kiblnd_conn_addref(conn);
+		tx->tx_conn = conn;
+		LASSERT (tx->tx_msg->ibm_type != IBLND_MSG_PUT_DONE);
+	}
+
+	switch (tx->tx_msg->ibm_type) {
+	default:
+		LBUG();
+
+	case IBLND_MSG_PUT_REQ:
+	case IBLND_MSG_GET_REQ:
+		queue = &conn->ibc_tx_queue_rsrvd;
+		break;
+
+	case IBLND_MSG_PUT_NAK:
+	case IBLND_MSG_PUT_ACK:
+	case IBLND_MSG_PUT_DONE:
+	case IBLND_MSG_GET_DONE:
+		queue = &conn->ibc_tx_queue_nocred;
+		break;
+
+	case IBLND_MSG_NOOP:
+		queue = IBLND_OOB_CAPABLE(conn->ibc_version) ?
+			&conn->ibc_tx_queue_nocred : &conn->ibc_tx_noops;
+		break;
+
+	case IBLND_MSG_IMMEDIATE:
+		queue = &conn->ibc_tx_queue;
+		break;
+	}
+
+	list_add_tail(&tx->tx_list, queue);
+}
+
+/*
+ * Queue 'tx' on 'conn' under conn->ibc_lock, then try to post queued txs
+ * by calling kiblnd_check_sends() with the lock released.
+ */
+void
+kiblnd_queue_tx (kib_tx_t *tx, kib_conn_t *conn)
+{
+	spin_lock(&conn->ibc_lock);
+	kiblnd_queue_tx_locked(tx, conn);
+	spin_unlock(&conn->ibc_lock);
+
+	kiblnd_check_sends(conn);
+}
+
+/*
+ * Start address resolution on 'cmid' with a privileged source port.
+ *
+ * Ports are tried downwards from PROT_SOCK - 1 to 1.  -EADDRINUSE and
+ * -EADDRNOTAVAIL move on to the next port; any other error is returned
+ * immediately.  Returns 0 once rdma_resolve_addr() accepts a port, or the
+ * last error if every port was refused.
+ */
+static int kiblnd_resolve_addr(struct rdma_cm_id *cmid,
+			       struct sockaddr_in *srcaddr,
+			       struct sockaddr_in *dstaddr,
+			       int timeout_ms)
+{
+	unsigned short port;
+	int rc;
+
+	/* allow the port to be reused */
+	rc = rdma_set_reuseaddr(cmid, 1);
+	if (rc != 0) {
+		CERROR("Unable to set reuse on cmid: %d\n", rc);
+		return rc;
+	}
+
+	/* look for a free privileged port */
+	for (port = PROT_SOCK-1; port > 0; port--) {
+		srcaddr->sin_port = htons(port);
+		rc = rdma_resolve_addr(cmid,
+				       (struct sockaddr *)srcaddr,
+				       (struct sockaddr *)dstaddr,
+				       timeout_ms);
+		if (rc == 0) {
+			CDEBUG(D_NET, "bound to port %hu\n", port);
+			return 0;
+		}
+
+		/* anything but "port unavailable" is fatal */
+		if (rc != -EADDRINUSE && rc != -EADDRNOTAVAIL)
+			return rc;
+
+		CDEBUG(D_NET, "bind to port %hu failed: %d\n",
+		       port, rc);
+	}
+
+	CERROR("Failed to bind to a free privileged port\n");
+	return rc;
+}
+
+/*
+ * Start an active connection attempt to 'peer'.
+ *
+ * Creates an RDMA CM id whose callback context is the peer, and starts
+ * address resolution from the local device IP to the peer's NID address
+ * and the configured service port.  This function only initiates the
+ * attempt; on any failure here it calls
+ * kiblnd_peer_connect_failed(peer, 1, rc).
+ */
+void
+kiblnd_connect_peer (kib_peer_t *peer)
+{
+	struct rdma_cm_id *cmid;
+	kib_dev_t	 *dev;
+	kib_net_t	 *net = peer->ibp_ni->ni_data;
+	struct sockaddr_in srcaddr;
+	struct sockaddr_in dstaddr;
+	int		rc;
+
+	LASSERT (net != NULL);
+	LASSERT (peer->ibp_connecting > 0);
+
+	cmid = kiblnd_rdma_create_id(kiblnd_cm_callback, peer, RDMA_PS_TCP,
+				     IB_QPT_RC);
+
+	if (IS_ERR(cmid)) {
+		CERROR("Can't create CMID for %s: %ld\n",
+		       libcfs_nid2str(peer->ibp_nid), PTR_ERR(cmid));
+		rc = PTR_ERR(cmid);
+		goto failed;
+	}
+
+	/* source: local device IP; the port is left 0 unless a privileged
+	 * port is chosen below */
+	dev = net->ibn_dev;
+	memset(&srcaddr, 0, sizeof(srcaddr));
+	srcaddr.sin_family = AF_INET;
+	srcaddr.sin_addr.s_addr = htonl(dev->ibd_ifip);
+
+	/* destination: address taken from the peer's NID, service port */
+	memset(&dstaddr, 0, sizeof(dstaddr));
+	dstaddr.sin_family = AF_INET;
+	dstaddr.sin_port = htons(*kiblnd_tunables.kib_service);
+	dstaddr.sin_addr.s_addr = htonl(LNET_NIDADDR(peer->ibp_nid));
+
+	kiblnd_peer_addref(peer);	       /* cmid's ref */
+
+	/* kib_timeout is multiplied by 1000 to give the timeout in ms */
+	if (*kiblnd_tunables.kib_use_priv_port) {
+		rc = kiblnd_resolve_addr(cmid, &srcaddr, &dstaddr,
+					 *kiblnd_tunables.kib_timeout * 1000);
+	} else {
+		rc = rdma_resolve_addr(cmid,
+				       (struct sockaddr *)&srcaddr,
+				       (struct sockaddr *)&dstaddr,
+				       *kiblnd_tunables.kib_timeout * 1000);
+	}
+	if (rc != 0) {
+		/* Can't initiate address resolution:  */
+		CERROR("Can't resolve addr for %s: %d\n",
+		       libcfs_nid2str(peer->ibp_nid), rc);
+		goto failed2;
+	}
+
+	LASSERT (cmid->device != NULL);
+	CDEBUG(D_NET, "%s: connection bound to %s:%u.%u.%u.%u:%s\n",
+	       libcfs_nid2str(peer->ibp_nid), dev->ibd_ifname,
+	       HIPQUAD(dev->ibd_ifip), cmid->device->name);
+
+	return;
+
+ failed2:
+	/* undo the cmid's peer ref and destroy the id before reporting */
+	kiblnd_peer_decref(peer);	       /* cmid's ref */
+	rdma_destroy_id(cmid);
+ failed:
+	kiblnd_peer_connect_failed(peer, 1, rc);
+}
+
+/*
+ * Send 'tx' (which may be NULL) to the peer identified by 'nid'.
+ *
+ * If the peer already has a connection the tx is queued on it.  If the
+ * peer exists but has no connection yet, the tx is parked on the peer's
+ * ibp_tx_queue.  If there is no peer, one is created, added to the peer
+ * table, and a connection attempt is started.  A NULL 'tx' performs the
+ * same peer/connection steps without queueing anything.
+ *
+ * If the peer cannot be created, a non-NULL tx is completed with
+ * -EHOSTUNREACH.
+ */
+void
+kiblnd_launch_tx (lnet_ni_t *ni, kib_tx_t *tx, lnet_nid_t nid)
+{
+	kib_peer_t	*peer;
+	kib_peer_t	*peer2;
+	kib_conn_t	*conn;
+	rwlock_t	*g_lock = &kiblnd_data.kib_global_lock;
+	unsigned long      flags;
+	int		rc;
+
+	/* If I get here, I've committed to send, so I complete the tx with
+	 * failure on any problems */
+
+	LASSERT (tx == NULL || tx->tx_conn == NULL); /* only set when assigned a conn */
+	LASSERT (tx == NULL || tx->tx_nwrq > 0);     /* work items have been set up */
+
+	/* First time, just use a read lock since I expect to find my peer
+	 * connected */
+	read_lock_irqsave(g_lock, flags);
+
+	peer = kiblnd_find_peer_locked(nid);
+	if (peer != NULL && !list_empty(&peer->ibp_conns)) {
+		/* Found a peer with an established connection */
+		conn = kiblnd_get_conn_locked(peer);
+		kiblnd_conn_addref(conn); /* 1 ref for me... */
+
+		read_unlock_irqrestore(g_lock, flags);
+
+		if (tx != NULL)
+			kiblnd_queue_tx(tx, conn);
+		kiblnd_conn_decref(conn); /* ...to here */
+		return;
+	}
+
+	/* NB the plain read_unlock()/write_lock() pair does not restore
+	 * 'flags'; the flags saved by read_lock_irqsave() above are restored
+	 * by the write_unlock_irqrestore() calls below */
+	read_unlock(g_lock);
+	/* Re-try with a write lock */
+	write_lock(g_lock);
+
+	peer = kiblnd_find_peer_locked(nid);
+	if (peer != NULL) {
+		if (list_empty(&peer->ibp_conns)) {
+			/* found a peer, but it's still connecting... */
+			LASSERT (peer->ibp_connecting != 0 ||
+				 peer->ibp_accepting != 0);
+			if (tx != NULL)
+				list_add_tail(&tx->tx_list,
+						  &peer->ibp_tx_queue);
+			write_unlock_irqrestore(g_lock, flags);
+		} else {
+			conn = kiblnd_get_conn_locked(peer);
+			kiblnd_conn_addref(conn); /* 1 ref for me... */
+
+			write_unlock_irqrestore(g_lock, flags);
+
+			if (tx != NULL)
+				kiblnd_queue_tx(tx, conn);
+			kiblnd_conn_decref(conn); /* ...to here */
+		}
+		return;
+	}
+
+	write_unlock_irqrestore(g_lock, flags);
+
+	/* Allocate a peer ready to add to the peer table and retry */
+	rc = kiblnd_create_peer(ni, &peer, nid);
+	if (rc != 0) {
+		CERROR("Can't create peer %s\n", libcfs_nid2str(nid));
+		if (tx != NULL) {
+			tx->tx_status = -EHOSTUNREACH;
+			tx->tx_waiting = 0;
+			kiblnd_tx_done(ni, tx);
+		}
+		return;
+	}
+
+	write_lock_irqsave(g_lock, flags);
+
+	/* the lock was dropped while creating the peer, so look the nid up
+	 * again in case another peer for it was added meanwhile */
+	peer2 = kiblnd_find_peer_locked(nid);
+	if (peer2 != NULL) {
+		if (list_empty(&peer2->ibp_conns)) {
+			/* found a peer, but it's still connecting... */
+			LASSERT (peer2->ibp_connecting != 0 ||
+				 peer2->ibp_accepting != 0);
+			if (tx != NULL)
+				list_add_tail(&tx->tx_list,
+						  &peer2->ibp_tx_queue);
+			write_unlock_irqrestore(g_lock, flags);
+		} else {
+			conn = kiblnd_get_conn_locked(peer2);
+			kiblnd_conn_addref(conn); /* 1 ref for me... */
+
+			write_unlock_irqrestore(g_lock, flags);
+
+			if (tx != NULL)
+				kiblnd_queue_tx(tx, conn);
+			kiblnd_conn_decref(conn); /* ...to here */
+		}
+
+		/* drop the ref on the peer we created but did not use */
+		kiblnd_peer_decref(peer);
+		return;
+	}
+
+	/* Brand new peer */
+	LASSERT (peer->ibp_connecting == 0);
+	peer->ibp_connecting = 1;
+
+	/* always called with a ref on ni, which prevents ni being shutdown */
+	LASSERT (((kib_net_t *)ni->ni_data)->ibn_shutdown == 0);
+
+	if (tx != NULL)
+		list_add_tail(&tx->tx_list, &peer->ibp_tx_queue);
+
+	/* extra ref taken as the peer goes onto the peer table list */
+	kiblnd_peer_addref(peer);
+	list_add_tail(&peer->ibp_list, kiblnd_nid2peerlist(nid));
+
+	write_unlock_irqrestore(g_lock, flags);
+
+	kiblnd_connect_peer(peer);
+	kiblnd_peer_decref(peer);
+}
+
+/*
+ * Send the LNet message 'lntmsg' on 'ni'.
+ *
+ * The message type selects the wire protocol:
+ *  - ACK, and any GET/PUT/REPLY small enough (or routed, for GET), is
+ *    sent as a single IMMEDIATE message with the payload copied inline;
+ *  - a large GET is sent as GET_REQ describing the local sink buffer;
+ *  - a large PUT/REPLY is sent as PUT_REQ with the payload mapped as the
+ *    RDMA source.
+ *
+ * Returns 0 once a tx has been handed to kiblnd_launch_tx(), -ENOMEM if
+ * no tx descriptor is available, or -EIO if the buffers or the reply
+ * message cannot be set up.
+ */
+int
+kiblnd_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg)
+{
+	lnet_hdr_t       *hdr = &lntmsg->msg_hdr;
+	int	       type = lntmsg->msg_type;
+	lnet_process_id_t target = lntmsg->msg_target;
+	int	       target_is_router = lntmsg->msg_target_is_router;
+	int	       routing = lntmsg->msg_routing;
+	unsigned int      payload_niov = lntmsg->msg_niov;
+	struct iovec     *payload_iov = lntmsg->msg_iov;
+	lnet_kiov_t      *payload_kiov = lntmsg->msg_kiov;
+	unsigned int      payload_offset = lntmsg->msg_offset;
+	unsigned int      payload_nob = lntmsg->msg_len;
+	kib_msg_t	*ibmsg;
+	kib_tx_t	 *tx;
+	int	       nob;
+	int	       rc;
+
+	/* NB 'private' is different depending on what we're sending.... */
+
+	CDEBUG(D_NET, "sending %d bytes in %d frags to %s\n",
+	       payload_nob, payload_niov, libcfs_id2str(target));
+
+	LASSERT (payload_nob == 0 || payload_niov > 0);
+	LASSERT (payload_niov <= LNET_MAX_IOV);
+
+	/* Thread context */
+	LASSERT (!in_interrupt());
+	/* payload is either all vaddrs or all pages */
+	LASSERT (!(payload_kiov != NULL && payload_iov != NULL));
+
+	/* a 'break' from this switch means "send as IMMEDIATE" below */
+	switch (type) {
+	default:
+		LBUG();
+		return (-EIO);
+
+	case LNET_MSG_ACK:
+		LASSERT (payload_nob == 0);
+		break;
+
+	case LNET_MSG_GET:
+		if (routing || target_is_router)
+			break;		  /* send IMMEDIATE */
+
+		/* is the REPLY message too small for RDMA? */
+		nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[lntmsg->msg_md->md_length]);
+		if (nob <= IBLND_MSG_SIZE)
+			break;		  /* send IMMEDIATE */
+
+		tx = kiblnd_get_idle_tx(ni, target.nid);
+		if (tx == NULL) {
+			CERROR("Can't allocate txd for GET to %s\n",
+			       libcfs_nid2str(target.nid));
+			return -ENOMEM;
+		}
+
+		ibmsg = tx->tx_msg;
+
+		/* describe the local MD as the sink for the peer's RDMA */
+		if ((lntmsg->msg_md->md_options & LNET_MD_KIOV) == 0)
+			rc = kiblnd_setup_rd_iov(ni, tx,
+						 &ibmsg->ibm_u.get.ibgm_rd,
+						 lntmsg->msg_md->md_niov,
+						 lntmsg->msg_md->md_iov.iov,
+						 0, lntmsg->msg_md->md_length);
+		else
+			rc = kiblnd_setup_rd_kiov(ni, tx,
+						  &ibmsg->ibm_u.get.ibgm_rd,
+						  lntmsg->msg_md->md_niov,
+						  lntmsg->msg_md->md_iov.kiov,
+						  0, lntmsg->msg_md->md_length);
+		if (rc != 0) {
+			CERROR("Can't setup GET sink for %s: %d\n",
+			       libcfs_nid2str(target.nid), rc);
+			kiblnd_tx_done(ni, tx);
+			return -EIO;
+		}
+
+		/* message body length depends on the number of fragments */
+		nob = offsetof(kib_get_msg_t, ibgm_rd.rd_frags[tx->tx_nfrags]);
+		ibmsg->ibm_u.get.ibgm_cookie = tx->tx_cookie;
+		ibmsg->ibm_u.get.ibgm_hdr = *hdr;
+
+		kiblnd_init_tx_msg(ni, tx, IBLND_MSG_GET_REQ, nob);
+
+		tx->tx_lntmsg[1] = lnet_create_reply_msg(ni, lntmsg);
+		if (tx->tx_lntmsg[1] == NULL) {
+			CERROR("Can't create reply for GET -> %s\n",
+			       libcfs_nid2str(target.nid));
+			kiblnd_tx_done(ni, tx);
+			return -EIO;
+		}
+
+		tx->tx_lntmsg[0] = lntmsg;      /* finalise lntmsg[0,1] on completion */
+		tx->tx_waiting = 1;	     /* waiting for GET_DONE */
+		kiblnd_launch_tx(ni, tx, target.nid);
+		return 0;
+
+	case LNET_MSG_REPLY:
+	case LNET_MSG_PUT:
+		/* Is the payload small enough not to need RDMA? */
+		nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
+		if (nob <= IBLND_MSG_SIZE)
+			break;		  /* send IMMEDIATE */
+
+		tx = kiblnd_get_idle_tx(ni, target.nid);
+		if (tx == NULL) {
+			CERROR("Can't allocate %s txd for %s\n",
+			       type == LNET_MSG_PUT ? "PUT" : "REPLY",
+			       libcfs_nid2str(target.nid));
+			return -ENOMEM;
+		}
+
+		/* map the payload as the RDMA source in tx->tx_rd */
+		if (payload_kiov == NULL)
+			rc = kiblnd_setup_rd_iov(ni, tx, tx->tx_rd,
+						 payload_niov, payload_iov,
+						 payload_offset, payload_nob);
+		else
+			rc = kiblnd_setup_rd_kiov(ni, tx, tx->tx_rd,
+						  payload_niov, payload_kiov,
+						  payload_offset, payload_nob);
+		if (rc != 0) {
+			CERROR("Can't setup PUT src for %s: %d\n",
+			       libcfs_nid2str(target.nid), rc);
+			kiblnd_tx_done(ni, tx);
+			return -EIO;
+		}
+
+		ibmsg = tx->tx_msg;
+		ibmsg->ibm_u.putreq.ibprm_hdr = *hdr;
+		ibmsg->ibm_u.putreq.ibprm_cookie = tx->tx_cookie;
+		kiblnd_init_tx_msg(ni, tx, IBLND_MSG_PUT_REQ, sizeof(kib_putreq_msg_t));
+
+		tx->tx_lntmsg[0] = lntmsg;      /* finalise lntmsg on completion */
+		tx->tx_waiting = 1;	     /* waiting for PUT_{ACK,NAK} */
+		kiblnd_launch_tx(ni, tx, target.nid);
+		return 0;
+	}
+
+	/* send IMMEDIATE */
+
+	LASSERT (offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob])
+		 <= IBLND_MSG_SIZE);
+
+	tx = kiblnd_get_idle_tx(ni, target.nid);
+	if (tx == NULL) {
+		CERROR ("Can't send %d to %s: tx descs exhausted\n",
+			type, libcfs_nid2str(target.nid));
+		return -ENOMEM;
+	}
+
+	ibmsg = tx->tx_msg;
+	ibmsg->ibm_u.immediate.ibim_hdr = *hdr;
+
+	/* copy the payload inline, straight after the immediate header */
+	if (payload_kiov != NULL)
+		lnet_copy_kiov2flat(IBLND_MSG_SIZE, ibmsg,
+				    offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
+				    payload_niov, payload_kiov,
+				    payload_offset, payload_nob);
+	else
+		lnet_copy_iov2flat(IBLND_MSG_SIZE, ibmsg,
+				   offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
+				   payload_niov, payload_iov,
+				   payload_offset, payload_nob);
+
+	nob = offsetof(kib_immediate_msg_t, ibim_payload[payload_nob]);
+	kiblnd_init_tx_msg(ni, tx, IBLND_MSG_IMMEDIATE, nob);
+
+	tx->tx_lntmsg[0] = lntmsg;	      /* finalise lntmsg on completion */
+	kiblnd_launch_tx(ni, tx, target.nid);
+	return 0;
+}
+
+/*
+ * Answer the GET_REQ held in 'rx' by RDMAing the payload of 'lntmsg' to
+ * the sink described in the request, followed by a GET_DONE.
+ *
+ * With a zero-length payload only the GET_DONE is sent and 'lntmsg' is
+ * finalised immediately; otherwise it is finalised when the tx
+ * completes.  On any setup failure 'lntmsg' is finalised with -EIO.
+ */
+void
+kiblnd_reply (lnet_ni_t *ni, kib_rx_t *rx, lnet_msg_t *lntmsg)
+{
+	lnet_process_id_t target = lntmsg->msg_target;
+	unsigned int	  payload_niov = lntmsg->msg_niov;
+	struct iovec	 *payload_iov = lntmsg->msg_iov;
+	lnet_kiov_t	 *payload_kiov = lntmsg->msg_kiov;
+	unsigned int	  payload_offset = lntmsg->msg_offset;
+	unsigned int	  payload_nob = lntmsg->msg_len;
+	kib_tx_t	 *tx;
+	int		  rc = 0;
+
+	tx = kiblnd_get_idle_tx(ni, rx->rx_conn->ibc_peer->ibp_nid);
+	if (tx == NULL) {
+		CERROR("Can't get tx for REPLY to %s\n",
+		       libcfs_nid2str(target.nid));
+		goto failed_0;
+	}
+
+	/* nothing to map when there is no payload */
+	if (payload_nob != 0) {
+		if (payload_kiov == NULL)
+			rc = kiblnd_setup_rd_iov(ni, tx, tx->tx_rd,
+						 payload_niov, payload_iov,
+						 payload_offset, payload_nob);
+		else
+			rc = kiblnd_setup_rd_kiov(ni, tx, tx->tx_rd,
+						  payload_niov, payload_kiov,
+						  payload_offset, payload_nob);
+	}
+
+	if (rc != 0) {
+		CERROR("Can't setup GET src for %s: %d\n",
+		       libcfs_nid2str(target.nid), rc);
+		goto failed_1;
+	}
+
+	rc = kiblnd_init_rdma(rx->rx_conn, tx,
+			      IBLND_MSG_GET_DONE, payload_nob,
+			      &rx->rx_msg->ibm_u.get.ibgm_rd,
+			      rx->rx_msg->ibm_u.get.ibgm_cookie);
+	if (rc < 0) {
+		CERROR("Can't setup rdma for GET from %s: %d\n",
+		       libcfs_nid2str(target.nid), rc);
+		goto failed_1;
+	}
+
+	if (payload_nob != 0) {
+		/* RDMA: lnet_finalize(lntmsg) when it
+		 * completes */
+		tx->tx_lntmsg[0] = lntmsg;
+	} else {
+		/* No RDMA: local completion may happen now! */
+		lnet_finalize(ni, lntmsg, 0);
+	}
+
+	kiblnd_queue_tx(tx, rx->rx_conn);
+	return;
+
+ failed_1:
+	kiblnd_tx_done(ni, tx);
+ failed_0:
+	lnet_finalize(ni, lntmsg, -EIO);
+}
+
+/*
+ * Deliver the received message held in 'private' (a kib_rx_t) into the
+ * buffers described by niov/iov/kiov, handling each wire message type:
+ *  - IMMEDIATE: copy 'mlen' bytes of inline payload and finalise;
+ *  - PUT_REQ:   reply PUT_NAK if nothing is wanted or the sink cannot be
+ *               set up, otherwise send PUT_ACK describing the sink;
+ *  - GET_REQ:   reply via kiblnd_reply() if matched, else send GET_DONE
+ *               with -ENODATA.
+ *
+ * The rx buffer is always reposted before returning.  Returns 0, or
+ * -EPROTO / -ENOMEM for the error cases noted below.
+ */
+int
+kiblnd_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed,
+	     unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov,
+	     unsigned int offset, unsigned int mlen, unsigned int rlen)
+{
+	kib_rx_t    *rx = private;
+	kib_msg_t   *rxmsg = rx->rx_msg;
+	kib_conn_t  *conn = rx->rx_conn;
+	kib_tx_t    *tx;
+	kib_msg_t   *txmsg;
+	int	  nob;
+	int	  post_credit = IBLND_POSTRX_PEER_CREDIT;
+	int	  rc = 0;
+
+	LASSERT (mlen <= rlen);
+	LASSERT (!in_interrupt());
+	/* Either all pages or all vaddrs */
+	LASSERT (!(kiov != NULL && iov != NULL));
+
+	switch (rxmsg->ibm_type) {
+	default:
+		LBUG();
+
+	case IBLND_MSG_IMMEDIATE:
+		/* the claimed payload length must fit in what was received */
+		nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[rlen]);
+		if (nob > rx->rx_nob) {
+			CERROR ("Immediate message from %s too big: %d(%d)\n",
+				libcfs_nid2str(rxmsg->ibm_u.immediate.ibim_hdr.src_nid),
+				nob, rx->rx_nob);
+			rc = -EPROTO;
+			break;
+		}
+
+		if (kiov != NULL)
+			lnet_copy_flat2kiov(niov, kiov, offset,
+					    IBLND_MSG_SIZE, rxmsg,
+					    offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
+					    mlen);
+		else
+			lnet_copy_flat2iov(niov, iov, offset,
+					   IBLND_MSG_SIZE, rxmsg,
+					   offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
+					   mlen);
+		lnet_finalize (ni, lntmsg, 0);
+		break;
+
+	case IBLND_MSG_PUT_REQ:
+		if (mlen == 0) {
+			/* no data wanted: finalise now and NAK the PUT */
+			lnet_finalize(ni, lntmsg, 0);
+			kiblnd_send_completion(rx->rx_conn, IBLND_MSG_PUT_NAK, 0,
+					       rxmsg->ibm_u.putreq.ibprm_cookie);
+			break;
+		}
+
+		tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid);
+		if (tx == NULL) {
+			CERROR("Can't allocate tx for %s\n",
+			       libcfs_nid2str(conn->ibc_peer->ibp_nid));
+			/* Not replying will break the connection */
+			rc = -ENOMEM;
+			break;
+		}
+
+		/* describe the local sink in the PUT_ACK's RDMA descriptor */
+		txmsg = tx->tx_msg;
+		if (kiov == NULL)
+			rc = kiblnd_setup_rd_iov(ni, tx,
+						 &txmsg->ibm_u.putack.ibpam_rd,
+						 niov, iov, offset, mlen);
+		else
+			rc = kiblnd_setup_rd_kiov(ni, tx,
+						  &txmsg->ibm_u.putack.ibpam_rd,
+						  niov, kiov, offset, mlen);
+		if (rc != 0) {
+			CERROR("Can't setup PUT sink for %s: %d\n",
+			       libcfs_nid2str(conn->ibc_peer->ibp_nid), rc);
+			kiblnd_tx_done(ni, tx);
+			/* tell peer it's over */
+			kiblnd_send_completion(rx->rx_conn, IBLND_MSG_PUT_NAK, rc,
+					       rxmsg->ibm_u.putreq.ibprm_cookie);
+			break;
+		}
+
+		nob = offsetof(kib_putack_msg_t, ibpam_rd.rd_frags[tx->tx_nfrags]);
+		txmsg->ibm_u.putack.ibpam_src_cookie = rxmsg->ibm_u.putreq.ibprm_cookie;
+		txmsg->ibm_u.putack.ibpam_dst_cookie = tx->tx_cookie;
+
+		kiblnd_init_tx_msg(ni, tx, IBLND_MSG_PUT_ACK, nob);
+
+		tx->tx_lntmsg[0] = lntmsg;      /* finalise lntmsg on completion */
+		tx->tx_waiting = 1;	     /* waiting for PUT_DONE */
+		kiblnd_queue_tx(tx, conn);
+
+		/* reposted buffer reserved for PUT_DONE */
+		post_credit = IBLND_POSTRX_NO_CREDIT;
+		break;
+
+	case IBLND_MSG_GET_REQ:
+		if (lntmsg != NULL) {
+			/* Optimized GET; RDMA lntmsg's payload */
+			kiblnd_reply(ni, rx, lntmsg);
+		} else {
+			/* GET didn't match anything */
+			kiblnd_send_completion(rx->rx_conn, IBLND_MSG_GET_DONE,
+					       -ENODATA,
+					       rxmsg->ibm_u.get.ibgm_cookie);
+		}
+		break;
+	}
+
+	/* every path out of the switch reposts the receive buffer */
+	kiblnd_post_rx(rx, post_credit);
+	return rc;
+}
+
+/*
+ * Start a kernel thread running fn(arg), named 'name'.
+ *
+ * Returns 0 on success, in which case kiblnd_data.kib_nthreads has been
+ * incremented, or the negative errno from kthread_run() on failure.
+ */
+int
+kiblnd_thread_start(int (*fn)(void *arg), void *arg, char *name)
+{
+	/* kthread_run() takes a printf-style name format: pass 'name' as
+	 * data so any '%' in it is never interpreted as a conversion */
+	task_t *task = kthread_run(fn, arg, "%s", name);
+
+	if (IS_ERR(task))
+		return PTR_ERR(task);
+
+	atomic_inc(&kiblnd_data.kib_nthreads);
+	return 0;
+}
+
+/*
+ * Drop one from the thread count kept in kiblnd_data.kib_nthreads; the
+ * matching increment is done by kiblnd_thread_start().
+ */
+void
+kiblnd_thread_fini (void)
+{
+	atomic_dec (&kiblnd_data.kib_nthreads);
+}
+
+/*
+ * Record the current time as the last moment 'peer' was known alive.
+ * No lock is taken; the store is followed by a full memory barrier.
+ */
+void
+kiblnd_peer_alive (kib_peer_t *peer)
+{
+	/* This is racy, but everyone's only writing cfs_time_current() */
+	peer->ibp_last_alive = cfs_time_current();
+	mb();
+}
+
+/*
+ * Tell LNet that 'peer' is down, if it has a pending error and no
+ * connections or connection attempts left.
+ *
+ * The pending error is cleared when it is picked up, and lnet_notify()
+ * is called outside the global lock with the peer's last-alive time.
+ */
+void
+kiblnd_peer_notify (kib_peer_t *peer)
+{
+	cfs_time_t    last_alive = 0;
+	unsigned long flags;
+	int	      error = 0;
+
+	read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+	if (peer->ibp_error != 0 &&
+	    peer->ibp_connecting == 0 &&
+	    peer->ibp_accepting == 0 &&
+	    list_empty(&peer->ibp_conns)) {
+		/* consume the error so it is only picked up once */
+		error = peer->ibp_error;
+		peer->ibp_error = 0;
+
+		last_alive = peer->ibp_last_alive;
+	}
+
+	read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+	if (error == 0)
+		return;
+
+	lnet_notify(peer->ibp_ni,
+		    peer->ibp_nid, 0, last_alive);
+}
+
+/*
+ * Begin closing 'conn'; 'error' is 0 for a normal shutdown or a negative
+ * errno.  Only an ESTABLISHED connection is acted on: it is removed from
+ * its peer, moved to the CLOSING state and handed to the connd thread.
+ * For any other state only the first error is recorded.
+ */
+void
+kiblnd_close_conn_locked (kib_conn_t *conn, int error)
+{
+	/* This just does the immediate housekeeping.  'error' is zero for a
+	 * normal shutdown which can happen only after the connection has been
+	 * established.  If the connection is established, schedule the
+	 * connection to be finished off by the connd.  Otherwise the connd is
+	 * already dealing with it (either to set it up or tear it down).
+	 * Caller holds kib_global_lock exclusively in irq context */
+	kib_peer_t       *peer = conn->ibc_peer;
+	kib_dev_t	*dev;
+	unsigned long     flags;
+
+	LASSERT (error != 0 || conn->ibc_state >= IBLND_CONN_ESTABLISHED);
+
+	/* remember only the first error seen on this connection */
+	if (error != 0 && conn->ibc_comms_error == 0)
+		conn->ibc_comms_error = error;
+
+	if (conn->ibc_state != IBLND_CONN_ESTABLISHED)
+		return; /* already being handled  */
+
+	/* quiet debug message for a clean close of an idle connection;
+	 * otherwise report which queues still hold txs */
+	if (error == 0 &&
+	    list_empty(&conn->ibc_tx_noops) &&
+	    list_empty(&conn->ibc_tx_queue) &&
+	    list_empty(&conn->ibc_tx_queue_rsrvd) &&
+	    list_empty(&conn->ibc_tx_queue_nocred) &&
+	    list_empty(&conn->ibc_active_txs)) {
+		CDEBUG(D_NET, "closing conn to %s\n",
+		       libcfs_nid2str(peer->ibp_nid));
+	} else {
+		CNETERR("Closing conn to %s: error %d%s%s%s%s%s\n",
+		       libcfs_nid2str(peer->ibp_nid), error,
+		       list_empty(&conn->ibc_tx_queue) ? "" : "(sending)",
+		       list_empty(&conn->ibc_tx_noops) ? "" : "(sending_noops)",
+		       list_empty(&conn->ibc_tx_queue_rsrvd) ? "" : "(sending_rsrvd)",
+		       list_empty(&conn->ibc_tx_queue_nocred) ? "" : "(sending_nocred)",
+		       list_empty(&conn->ibc_active_txs) ? "" : "(waiting)");
+	}
+
+	dev = ((kib_net_t *)peer->ibp_ni->ni_data)->ibn_dev;
+	list_del(&conn->ibc_list);
+	/* connd (see below) takes over ibc_list's ref */
+
+	if (list_empty (&peer->ibp_conns) &&    /* no more conns */
+	    kiblnd_peer_active(peer)) {	 /* still in peer table */
+		kiblnd_unlink_peer_locked(peer);
+
+		/* set/clear error on last conn */
+		peer->ibp_error = conn->ibc_comms_error;
+	}
+
+	kiblnd_set_conn_state(conn, IBLND_CONN_CLOSING);
+
+	/* on error, queue the device for the failover thread if
+	 * kiblnd_dev_can_failover() allows it */
+	if (error != 0 &&
+	    kiblnd_dev_can_failover(dev)) {
+		list_add_tail(&dev->ibd_fail_list,
+			      &kiblnd_data.kib_failed_devs);
+		wake_up(&kiblnd_data.kib_failover_waitq);
+	}
+
+	spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags);
+
+	/* hand the connection to connd and wake it */
+	list_add_tail(&conn->ibc_list, &kiblnd_data.kib_connd_conns);
+	wake_up(&kiblnd_data.kib_connd_waitq);
+
+	spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags);
+}
+
+/*
+ * Wrapper for kiblnd_close_conn_locked() that takes kib_global_lock for
+ * writing, with interrupts disabled, around the call.
+ */
+void
+kiblnd_close_conn(kib_conn_t *conn, int error)
+{
+	unsigned long flags;
+
+	write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+	kiblnd_close_conn_locked(conn, error);
+
+	write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+}
+
+/*
+ * Process every receive parked on conn->ibc_early_rxs.
+ *
+ * Each rx is unlinked under kib_global_lock, but kiblnd_handle_rx() is
+ * called with the lock released, so the list is re-checked from the head
+ * on every pass.
+ */
+void
+kiblnd_handle_early_rxs(kib_conn_t *conn)
+{
+	unsigned long  flags;
+	kib_rx_t      *early;
+
+	LASSERT(!in_interrupt());
+	LASSERT(conn->ibc_state >= IBLND_CONN_ESTABLISHED);
+
+	for (;;) {
+		write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+		if (list_empty(&conn->ibc_early_rxs)) {
+			write_unlock_irqrestore(&kiblnd_data.kib_global_lock,
+						flags);
+			break;
+		}
+
+		early = list_entry(conn->ibc_early_rxs.next,
+				   kib_rx_t, rx_list);
+		list_del(&early->rx_list);
+
+		write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+		kiblnd_handle_rx(early);
+	}
+}
+
+/*
+ * Fail every tx on the list 'txs' of 'conn' with -ECONNABORTED.
+ *
+ * All txs on the list are marked failed and stop waiting for the peer.
+ * Those with no send in flight are moved to a private list and completed
+ * once ibc_lock has been dropped; the others stay on the list.
+ */
+void
+kiblnd_abort_txs(kib_conn_t *conn, struct list_head *txs)
+{
+	LIST_HEAD	  (zombies);
+	struct list_head *pos;
+	struct list_head *next;
+	kib_tx_t	 *tx;
+	int		  active = (txs == &conn->ibc_active_txs);
+
+	spin_lock(&conn->ibc_lock);
+
+	list_for_each_safe (pos, next, txs) {
+		tx = list_entry (pos, kib_tx_t, tx_list);
+
+		if (!active) {
+			LASSERT (tx->tx_queued);
+		} else {
+			LASSERT (!tx->tx_queued);
+			LASSERT (tx->tx_waiting ||
+				 tx->tx_sending != 0);
+		}
+
+		tx->tx_status = -ECONNABORTED;
+		tx->tx_waiting = 0;
+
+		/* a send is still in flight: leave the tx on the list */
+		if (tx->tx_sending != 0)
+			continue;
+
+		tx->tx_queued = 0;
+		list_del (&tx->tx_list);
+		list_add (&tx->tx_list, &zombies);
+	}
+
+	spin_unlock(&conn->ibc_lock);
+
+	kiblnd_txlist_done(conn->ibc_peer->ibp_ni, &zombies, -ECONNABORTED);
+}
+
+/* Move 'conn' to IBLND_CONN_DISCONNECTED, abort its receives and all of its
+ * tx lists, then handle any rxs still parked on ibc_early_rxs.
+ * Must not be called from interrupt context. */
+void
+kiblnd_finalise_conn (kib_conn_t *conn)
+{
+	/* tx lists, in the order they are aborted */
+	struct list_head *queues[] = {
+		&conn->ibc_tx_noops,
+		&conn->ibc_tx_queue,
+		&conn->ibc_tx_queue_rsrvd,
+		&conn->ibc_tx_queue_nocred,
+		&conn->ibc_active_txs,
+	};
+	unsigned int      i;
+
+	LASSERT (!in_interrupt());
+	LASSERT (conn->ibc_state > IBLND_CONN_INIT);
+
+	kiblnd_set_conn_state(conn, IBLND_CONN_DISCONNECTED);
+
+	/* Aborting receives moves the QP to IB_QPS_ERR.  Only connections
+	 * that never became connected need this, since rdma_disconnect()
+	 * has the same effect for the others. */
+	kiblnd_abort_receives(conn);
+
+	/* Complete every tx desc that is not waiting for a send to complete.
+	 * NB the QP state change should make us safe from RDMA by now. */
+	for (i = 0; i < sizeof(queues) / sizeof(queues[0]); i++)
+		kiblnd_abort_txs(conn, queues[i]);
+
+	kiblnd_handle_early_rxs(conn);
+}
+
+/* Account for a failed connection attempt to 'peer'.
+ *
+ * 'active' selects which in-progress counter to drop (ibp_connecting when
+ * non-zero, ibp_accepting otherwise); 'error' must be non-zero.  If no other
+ * attempt is in progress and the peer has no connections, its blocked
+ * transmits are completed with -EHOSTUNREACH, the peer is unlinked if it is
+ * still active, and 'error' is recorded in ibp_error. */
+void
+kiblnd_peer_connect_failed (kib_peer_t *peer, int active, int error)
+{
+	LIST_HEAD(failed_txs);
+	unsigned long	irqflags;
+
+	LASSERT (error != 0);
+	LASSERT (!in_interrupt());
+
+	write_lock_irqsave(&kiblnd_data.kib_global_lock, irqflags);
+
+	if (!active) {
+		LASSERT (peer->ibp_accepting > 0);
+		peer->ibp_accepting--;
+	} else {
+		LASSERT (peer->ibp_connecting > 0);
+		peer->ibp_connecting--;
+	}
+
+	if (peer->ibp_connecting != 0 || peer->ibp_accepting != 0) {
+		/* some other connection attempt is still in flight */
+		write_unlock_irqrestore(&kiblnd_data.kib_global_lock,
+					irqflags);
+		return;
+	}
+
+	if (!list_empty(&peer->ibp_conns)) {
+		/* transmits can't be blocked while connections exist */
+		LASSERT (list_empty(&peer->ibp_tx_queue));
+	} else {
+		/* Steal the blocked transmits: link the local head into the
+		 * peer's queue, then unhook (and re-init) the peer's head */
+		list_add(&failed_txs, &peer->ibp_tx_queue);
+		list_del_init(&peer->ibp_tx_queue);
+
+		if (kiblnd_peer_active(peer))
+			kiblnd_unlink_peer_locked(peer);
+
+		peer->ibp_error = error;
+	}
+
+	write_unlock_irqrestore(&kiblnd_data.kib_global_lock, irqflags);
+
+	kiblnd_peer_notify(peer);
+
+	if (!list_empty(&failed_txs)) {
+		CNETERR("Deleting messages for %s: connection failed\n",
+			libcfs_nid2str(peer->ibp_nid));
+
+		kiblnd_txlist_done(peer->ibp_ni, &failed_txs, -EHOSTUNREACH);
+	}
+}
+
+/* Finish the connection handshake on 'conn'.
+ *
+ * status != 0: the attempt failed; the peer is told via
+ * kiblnd_peer_connect_failed() and the conn is finalised.
+ * status == 0: the conn becomes IBLND_CONN_ESTABLISHED, is linked onto the
+ * peer's conn list (taking a conn ref), and the peer's blocked transmits are
+ * queued on it -- unless the peer has been deleted or ibc_comms_error is
+ * already set, in which case the conn is closed again and the transmits are
+ * completed with -ECONNABORTED.
+ *
+ * Must not be called from interrupt context; conn must be in
+ * IBLND_CONN_ACTIVE_CONNECT or IBLND_CONN_PASSIVE_WAIT state. */
+void
+kiblnd_connreq_done(kib_conn_t *conn, int status)
+{
+	kib_peer_t	*peer = conn->ibc_peer;
+	kib_tx_t	  *tx;
+	struct list_head	 txs;
+	unsigned long      flags;
+	int		active;
+
+	/* which side started this attempt decides which counter is dropped */
+	active = (conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT);
+
+	CDEBUG(D_NET,"%s: active(%d), version(%x), status(%d)\n",
+	       libcfs_nid2str(peer->ibp_nid), active,
+	       conn->ibc_version, status);
+
+	LASSERT (!in_interrupt());
+	LASSERT ((conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT &&
+		  peer->ibp_connecting > 0) ||
+		 (conn->ibc_state == IBLND_CONN_PASSIVE_WAIT &&
+		  peer->ibp_accepting > 0));
+
+	/* the connreq/connack message buffer is not used past this point */
+	LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars));
+	conn->ibc_connvars = NULL;
+
+	if (status != 0) {
+		/* failed to establish connection */
+		kiblnd_peer_connect_failed(peer, active, status);
+		kiblnd_finalise_conn(conn);
+		return;
+	}
+
+	/* connection established */
+	write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+	conn->ibc_last_send = jiffies;
+	kiblnd_set_conn_state(conn, IBLND_CONN_ESTABLISHED);
+	kiblnd_peer_alive(peer);
+
+	/* Add conn to peer's list and nuke any dangling conns from a different
+	 * peer instance... */
+	kiblnd_conn_addref(conn);	       /* +1 ref for ibc_list */
+	list_add(&conn->ibc_list, &peer->ibp_conns);
+	if (active)
+		peer->ibp_connecting--;
+	else
+		peer->ibp_accepting--;
+
+	/* first connection to this peer: adopt conn's version/incarnation */
+	if (peer->ibp_version == 0) {
+		peer->ibp_version     = conn->ibc_version;
+		peer->ibp_incarnation = conn->ibc_incarnation;
+	}
+
+	/* peer identity changed: close conns that don't match the new one */
+	if (peer->ibp_version     != conn->ibc_version ||
+	    peer->ibp_incarnation != conn->ibc_incarnation) {
+		kiblnd_close_stale_conns_locked(peer, conn->ibc_version,
+						conn->ibc_incarnation);
+		peer->ibp_version     = conn->ibc_version;
+		peer->ibp_incarnation = conn->ibc_incarnation;
+	}
+
+	/* grab pending txs while I have the lock */
+	/* (link the on-stack head 'txs' into the queue, then unhook the
+	 * peer's own head: 'txs' now owns the entries, ibp_tx_queue is empty) */
+	list_add(&txs, &peer->ibp_tx_queue);
+	list_del_init(&peer->ibp_tx_queue);
+
+	if (!kiblnd_peer_active(peer) ||	/* peer has been deleted */
+	    conn->ibc_comms_error != 0) {       /* error has happened already */
+		lnet_ni_t *ni = peer->ibp_ni;
+
+		/* start to shut down connection */
+		kiblnd_close_conn_locked(conn, -ECONNABORTED);
+		write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+		/* complete the grabbed txs only after dropping the lock */
+		kiblnd_txlist_done(ni, &txs, -ECONNABORTED);
+
+		return;
+	}
+
+	write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+	/* Schedule blocked txs */
+	spin_lock(&conn->ibc_lock);
+	while (!list_empty(&txs)) {
+		tx = list_entry(txs.next, kib_tx_t, tx_list);
+		list_del(&tx->tx_list);
+
+		kiblnd_queue_tx_locked(tx, conn);
+	}
+	spin_unlock(&conn->ibc_lock);
+
+	kiblnd_check_sends(conn);
+
+	/* schedule blocked rxs */
+	kiblnd_handle_early_rxs(conn);
+}
+
+/* Send the reject message 'rej' on 'cmid' as private data of an
+ * rdma_reject().  A failure to send is only logged. */
+void
+kiblnd_reject(struct rdma_cm_id *cmid, kib_rej_t *rej)
+{
+	int err = rdma_reject(cmid, rej, sizeof(*rej));
+
+	if (err == 0)
+		return;
+
+	CWARN("Error %d sending reject\n", err);
+}
+
+/* Handle an incoming connection request on 'cmid'.
+ *
+ * 'priv'/'priv_nob' are the private data (a kib_msg_t connreq) carried by
+ * the request.  The request is validated (source port, magic/version,
+ * destination NID and device, incarnation stamp, queue depth, max frags,
+ * max message size), a peer is found or created, a passive conn is created
+ * and the request is accepted with a CONNACK as private data.
+ *
+ * Returns 0 once a conn owns 'cmid' (even if rdma_accept() then fails), or
+ * -ECONNREFUSED after sending a reject whose ibr_why describes the reason.
+ * Must not be called from interrupt context. */
+int
+kiblnd_passive_connect (struct rdma_cm_id *cmid, void *priv, int priv_nob)
+{
+	rwlock_t		*g_lock = &kiblnd_data.kib_global_lock;
+	kib_msg_t	     *reqmsg = priv;
+	kib_msg_t	     *ackmsg;
+	kib_dev_t	     *ibdev;
+	kib_peer_t	    *peer;
+	kib_peer_t	    *peer2;
+	kib_conn_t	    *conn;
+	lnet_ni_t	     *ni  = NULL;
+	kib_net_t	     *net = NULL;
+	lnet_nid_t	     nid;
+	struct rdma_conn_param cp;
+	kib_rej_t	      rej;
+	int		    version = IBLND_MSG_VERSION;
+	unsigned long	  flags;
+	int		    rc;
+	struct sockaddr_in    *peer_addr;
+	LASSERT (!in_interrupt());
+
+	/* cmid inherits 'context' from the corresponding listener id */
+	ibdev = (kib_dev_t *)cmid->context;
+	LASSERT (ibdev != NULL);
+
+	/* default reject: fatal, advertising my max message size */
+	memset(&rej, 0, sizeof(rej));
+	rej.ibr_magic		= IBLND_MSG_MAGIC;
+	rej.ibr_why		  = IBLND_REJECT_FATAL;
+	rej.ibr_cp.ibcp_max_msg_size = IBLND_MSG_SIZE;
+
+	peer_addr = (struct sockaddr_in *)&(cmid->route.addr.dst_addr);
+	if (*kiblnd_tunables.kib_require_priv_port &&
+	    ntohs(peer_addr->sin_port) >= PROT_SOCK) {
+		__u32 ip = ntohl(peer_addr->sin_addr.s_addr);
+		CERROR("Peer's port (%u.%u.%u.%u:%hu) is not privileged\n",
+		       HIPQUAD(ip), ntohs(peer_addr->sin_port));
+		goto failed;
+	}
+
+	/* need at least the header fields preceding ibm_type to look at
+	 * magic and version below */
+	if (priv_nob < offsetof(kib_msg_t, ibm_type)) {
+		CERROR("Short connection request\n");
+		goto failed;
+	}
+
+	/* Future protocol version compatibility support!  If the
+	 * o2iblnd-specific protocol changes, or when LNET unifies
+	 * protocols over all LNDs, the initial connection will
+	 * negotiate a protocol version.  I trap this here to avoid
+	 * console errors; the reject tells the peer which protocol I
+	 * speak. */
+	if (reqmsg->ibm_magic == LNET_PROTO_MAGIC ||
+	    reqmsg->ibm_magic == __swab32(LNET_PROTO_MAGIC))
+		goto failed;
+	if (reqmsg->ibm_magic == IBLND_MSG_MAGIC &&
+	    reqmsg->ibm_version != IBLND_MSG_VERSION &&
+	    reqmsg->ibm_version != IBLND_MSG_VERSION_1)
+		goto failed;
+	if (reqmsg->ibm_magic == __swab32(IBLND_MSG_MAGIC) &&
+	    reqmsg->ibm_version != __swab16(IBLND_MSG_VERSION) &&
+	    reqmsg->ibm_version != __swab16(IBLND_MSG_VERSION_1))
+		goto failed;
+
+	rc = kiblnd_unpack_msg(reqmsg, priv_nob);
+	if (rc != 0) {
+		CERROR("Can't parse connection request: %d\n", rc);
+		goto failed;
+	}
+
+	nid = reqmsg->ibm_srcnid;
+	/* a non-NULL ni is released again on every exit path below */
+	ni  = lnet_net2ni(LNET_NIDNET(reqmsg->ibm_dstnid));
+
+	if (ni != NULL) {
+		net = (kib_net_t *)ni->ni_data;
+		rej.ibr_incarnation = net->ibn_incarnation;
+	}
+
+	if (ni == NULL ||			 /* no matching net */
+	    ni->ni_nid != reqmsg->ibm_dstnid ||   /* right NET, wrong NID! */
+	    net->ibn_dev != ibdev) {	      /* wrong device */
+		CERROR("Can't accept %s on %s (%s:%d:%u.%u.%u.%u): "
+		       "bad dst nid %s\n", libcfs_nid2str(nid),
+		       ni == NULL ? "NA" : libcfs_nid2str(ni->ni_nid),
+		       ibdev->ibd_ifname, ibdev->ibd_nnets,
+		       HIPQUAD(ibdev->ibd_ifip),
+		       libcfs_nid2str(reqmsg->ibm_dstnid));
+
+		goto failed;
+	}
+
+	/* check time stamp as soon as possible */
+	if (reqmsg->ibm_dststamp != 0 &&
+	    reqmsg->ibm_dststamp != net->ibn_incarnation) {
+		CWARN("Stale connection request\n");
+		rej.ibr_why = IBLND_REJECT_CONN_STALE;
+		goto failed;
+	}
+
+	/* I can accept peer's version */
+	version = reqmsg->ibm_version;
+
+	if (reqmsg->ibm_type != IBLND_MSG_CONNREQ) {
+		CERROR("Unexpected connreq msg type: %x from %s\n",
+		       reqmsg->ibm_type, libcfs_nid2str(nid));
+		goto failed;
+	}
+
+	if (reqmsg->ibm_u.connparams.ibcp_queue_depth !=
+	    IBLND_MSG_QUEUE_SIZE(version)) {
+		CERROR("Can't accept %s: incompatible queue depth %d (%d wanted)\n",
+		       libcfs_nid2str(nid), reqmsg->ibm_u.connparams.ibcp_queue_depth,
+		       IBLND_MSG_QUEUE_SIZE(version));
+
+		if (version == IBLND_MSG_VERSION)
+			rej.ibr_why = IBLND_REJECT_MSG_QUEUE_SIZE;
+
+		goto failed;
+	}
+
+	if (reqmsg->ibm_u.connparams.ibcp_max_frags !=
+	    IBLND_RDMA_FRAGS(version)) {
+		CERROR("Can't accept %s(version %x): "
+		       "incompatible max_frags %d (%d wanted)\n",
+		       libcfs_nid2str(nid), version,
+		       reqmsg->ibm_u.connparams.ibcp_max_frags,
+		       IBLND_RDMA_FRAGS(version));
+
+		if (version == IBLND_MSG_VERSION)
+			rej.ibr_why = IBLND_REJECT_RDMA_FRAGS;
+
+		goto failed;
+
+	}
+
+	if (reqmsg->ibm_u.connparams.ibcp_max_msg_size > IBLND_MSG_SIZE) {
+		CERROR("Can't accept %s: message size %d too big (%d max)\n",
+		       libcfs_nid2str(nid),
+		       reqmsg->ibm_u.connparams.ibcp_max_msg_size,
+		       IBLND_MSG_SIZE);
+		goto failed;
+	}
+
+	/* assume 'nid' is a new peer; create  */
+	rc = kiblnd_create_peer(ni, &peer, nid);
+	if (rc != 0) {
+		CERROR("Can't create peer for %s\n", libcfs_nid2str(nid));
+		rej.ibr_why = IBLND_REJECT_NO_RESOURCES;
+		goto failed;
+	}
+
+	write_lock_irqsave(g_lock, flags);
+
+	peer2 = kiblnd_find_peer_locked(nid);
+	if (peer2 != NULL) {
+		if (peer2->ibp_version == 0) {
+			peer2->ibp_version     = version;
+			peer2->ibp_incarnation = reqmsg->ibm_srcstamp;
+		}
+
+		/* not the guy I've talked with */
+		if (peer2->ibp_incarnation != reqmsg->ibm_srcstamp ||
+		    peer2->ibp_version     != version) {
+			int old_version;
+
+			kiblnd_close_peer_conns_locked(peer2, -ESTALE);
+			/* I hold no ref on peer2, so take the value I want
+			 * to log while the lock still protects it */
+			old_version = peer2->ibp_version;
+			write_unlock_irqrestore(g_lock, flags);
+
+			CWARN("Conn stale %s [old ver: %x, new ver: %x]\n",
+			      libcfs_nid2str(nid), old_version, version);
+
+			kiblnd_peer_decref(peer);
+			rej.ibr_why = IBLND_REJECT_CONN_STALE;
+			goto failed;
+		}
+
+		/* tie-break connection race in favour of the higher NID */
+		if (peer2->ibp_connecting != 0 &&
+		    nid < ni->ni_nid) {
+			/* snapshot under the lock; peer2 is unreferenced */
+			lnet_nid_t peer2_nid = peer2->ibp_nid;
+
+			write_unlock_irqrestore(g_lock, flags);
+
+			CWARN("Conn race %s\n", libcfs_nid2str(peer2_nid));
+
+			kiblnd_peer_decref(peer);
+			rej.ibr_why = IBLND_REJECT_CONN_RACE;
+			goto failed;
+		}
+
+		/* use the existing peer; the ref taken here is mine */
+		peer2->ibp_accepting++;
+		kiblnd_peer_addref(peer2);
+
+		write_unlock_irqrestore(g_lock, flags);
+		/* drop the peer I created speculatively above */
+		kiblnd_peer_decref(peer);
+		peer = peer2;
+	} else {
+		/* Brand new peer */
+		LASSERT (peer->ibp_accepting == 0);
+		LASSERT (peer->ibp_version == 0 &&
+			 peer->ibp_incarnation == 0);
+
+		peer->ibp_accepting   = 1;
+		peer->ibp_version     = version;
+		peer->ibp_incarnation = reqmsg->ibm_srcstamp;
+
+		/* I have a ref on ni that prevents it being shutdown */
+		LASSERT (net->ibn_shutdown == 0);
+
+		/* extra ref taken before linking the peer into the table */
+		kiblnd_peer_addref(peer);
+		list_add_tail(&peer->ibp_list, kiblnd_nid2peerlist(nid));
+
+		write_unlock_irqrestore(g_lock, flags);
+	}
+
+	conn = kiblnd_create_conn(peer, cmid, IBLND_CONN_PASSIVE_WAIT, version);
+	if (conn == NULL) {
+		kiblnd_peer_connect_failed(peer, 0, -ENOMEM);
+		kiblnd_peer_decref(peer);
+		rej.ibr_why = IBLND_REJECT_NO_RESOURCES;
+		goto failed;
+	}
+
+	/* conn now "owns" cmid, so I return success from here on to ensure the
+	 * CM callback doesn't destroy cmid. */
+
+	conn->ibc_incarnation      = reqmsg->ibm_srcstamp;
+	conn->ibc_credits	  = IBLND_MSG_QUEUE_SIZE(version);
+	conn->ibc_reserved_credits = IBLND_MSG_QUEUE_SIZE(version);
+	LASSERT (conn->ibc_credits + conn->ibc_reserved_credits + IBLND_OOB_MSGS(version)
+		 <= IBLND_RX_MSGS(version));
+
+	/* build the CONNACK in the conn's connvars message buffer */
+	ackmsg = &conn->ibc_connvars->cv_msg;
+	memset(ackmsg, 0, sizeof(*ackmsg));
+
+	kiblnd_init_msg(ackmsg, IBLND_MSG_CONNACK,
+			sizeof(ackmsg->ibm_u.connparams));
+	ackmsg->ibm_u.connparams.ibcp_queue_depth  = IBLND_MSG_QUEUE_SIZE(version);
+	ackmsg->ibm_u.connparams.ibcp_max_msg_size = IBLND_MSG_SIZE;
+	ackmsg->ibm_u.connparams.ibcp_max_frags    = IBLND_RDMA_FRAGS(version);
+
+	kiblnd_pack_msg(ni, ackmsg, version, 0, nid, reqmsg->ibm_srcstamp);
+
+	memset(&cp, 0, sizeof(cp));
+	cp.private_data	= ackmsg;
+	cp.private_data_len    = ackmsg->ibm_nob;
+	cp.responder_resources = 0;	     /* No atomic ops or RDMA reads */
+	cp.initiator_depth     = 0;
+	cp.flow_control	= 1;
+	cp.retry_count	 = *kiblnd_tunables.kib_retry_count;
+	cp.rnr_retry_count     = *kiblnd_tunables.kib_rnr_retry_count;
+
+	CDEBUG(D_NET, "Accept %s\n", libcfs_nid2str(nid));
+
+	rc = rdma_accept(cmid, &cp);
+	if (rc != 0) {
+		CERROR("Can't accept %s: %d\n", libcfs_nid2str(nid), rc);
+		rej.ibr_version = version;
+		rej.ibr_why     = IBLND_REJECT_FATAL;
+
+		/* still return 0 below: conn owns cmid and tears it down */
+		kiblnd_reject(cmid, &rej);
+		kiblnd_connreq_done(conn, rc);
+		kiblnd_conn_decref(conn);
+	}
+
+	lnet_ni_decref(ni);
+	return 0;
+
+ failed:
+	if (ni != NULL)
+		lnet_ni_decref(ni);
+
+	/* tell the peer which version and parameters I would accept */
+	rej.ibr_version = version;
+	rej.ibr_cp.ibcp_queue_depth = IBLND_MSG_QUEUE_SIZE(version);
+	rej.ibr_cp.ibcp_max_frags   = IBLND_RDMA_FRAGS(version);
+	kiblnd_reject(cmid, &rej);
+
+	return -ECONNREFUSED;
+}
+
+/* After an active connection attempt on 'conn' was rejected, decide whether
+ * to try again and, if so, restart the connection to the peer using the
+ * protocol 'version' and 'incarnation' learned from the reject.
+ *
+ * 'why' is the reject reason (used for logging only) and 'cp' the peer's
+ * connection parameters from the reject, or NULL if it supplied none. */
+void
+kiblnd_reconnect (kib_conn_t *conn, int version,
+		  __u64 incarnation, int why, kib_connparams_t *cp)
+{
+	kib_peer_t	*peer = conn->ibc_peer;
+	char		*reason;
+	int		 retry;
+	unsigned long	 irqflags;
+
+	LASSERT (conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT);
+	LASSERT (peer->ibp_connecting > 0);     /* 'conn' at least */
+
+	write_lock_irqsave(&kiblnd_data.kib_global_lock, irqflags);
+
+	/* Retry only while the connection is still wanted and this is the
+	 * sole attempt (active or passive) in progress.
+	 * NB: an empty ibp_tx_queue does not rule out a retry when
+	 * ibp_version != version, because kiblnd_query() may have been
+	 * what initiated the connection */
+	retry = (!list_empty(&peer->ibp_tx_queue) ||
+		 peer->ibp_version != version) &&
+		peer->ibp_connecting == 1 &&
+		peer->ibp_accepting == 0;
+
+	if (retry) {
+		/* count the new attempt and adopt the peer's parameters */
+		peer->ibp_connecting++;
+
+		peer->ibp_version     = version;
+		peer->ibp_incarnation = incarnation;
+	}
+
+	write_unlock_irqrestore(&kiblnd_data.kib_global_lock, irqflags);
+
+	if (!retry)
+		return;
+
+	switch (why) {
+	case IBLND_REJECT_CONN_RACE:
+		reason = "conn race";
+		break;
+
+	case IBLND_REJECT_CONN_STALE:
+		reason = "stale";
+		break;
+
+	case IBLND_REJECT_CONN_UNCOMPAT:
+		reason = "version negotiation";
+		break;
+
+	default:
+		reason = "Unknown";
+		break;
+	}
+
+	CNETERR("%s: retrying (%s), %x, %x, "
+		"queue_dep: %d, max_frag: %d, msg_size: %d\n",
+		libcfs_nid2str(peer->ibp_nid),
+		reason, IBLND_MSG_VERSION, version,
+		cp != NULL? cp->ibcp_queue_depth :IBLND_MSG_QUEUE_SIZE(version),
+		cp != NULL? cp->ibcp_max_frags   : IBLND_RDMA_FRAGS(version),
+		cp != NULL? cp->ibcp_max_msg_size: IBLND_MSG_SIZE);
+
+	kiblnd_connect_peer(peer);
+}
+
+/* Handle rejection of the active connection attempt on 'conn'.
+ *
+ * 'reason' is the CM reject reason; 'priv'/'priv_nob' are the private data
+ * that came with the reject (a kib_rej_t for consumer-defined rejects, which
+ * is byte-swapped in place if the peer has the opposite endianness).
+ * Depending on the reason a reconnect may be started via kiblnd_reconnect();
+ * in every case the attempt on 'conn' is then completed with -ECONNREFUSED.
+ *
+ * Must not be called from interrupt context; conn must be in
+ * IBLND_CONN_ACTIVE_CONNECT state. */
+void
+kiblnd_rejected (kib_conn_t *conn, int reason, void *priv, int priv_nob)
+{
+	kib_peer_t    *peer = conn->ibc_peer;
+
+	LASSERT (!in_interrupt());
+	LASSERT (conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT);
+
+	switch (reason) {
+	case IB_CM_REJ_STALE_CONN:
+		kiblnd_reconnect(conn, IBLND_MSG_VERSION, 0,
+				 IBLND_REJECT_CONN_STALE, NULL);
+		break;
+
+	case IB_CM_REJ_INVALID_SERVICE_ID:
+		CNETERR("%s rejected: no listener at %d\n",
+			libcfs_nid2str(peer->ibp_nid),
+			*kiblnd_tunables.kib_service);
+		break;
+
+	case IB_CM_REJ_CONSUMER_DEFINED:
+		if (priv_nob >= offsetof(kib_rej_t, ibr_padding)) {
+			kib_rej_t	*rej	 = priv;
+			kib_connparams_t *cp	  = NULL;
+			int	       flip	= 0;
+			__u64	     incarnation = -1;
+
+			/* NB. default incarnation is -1 because:
+			 * a) V1 will ignore dst incarnation in connreq.
+			 * b) V2 will provide incarnation while rejecting me,
+			 *    -1 will be overwritten.
+			 *
+			 * if I try to connect to a V1 peer with V2 protocol,
+			 * it rejected me then upgrade to V2, I have no idea
+			 * about the upgrading and try to reconnect with V1,
+			 * in this case upgraded V2 can find out I'm trying to
+			 * talk to the old guy and reject me(incarnation is -1).
+			 */
+
+			if (rej->ibr_magic == __swab32(IBLND_MSG_MAGIC) ||
+			    rej->ibr_magic == __swab32(LNET_PROTO_MAGIC)) {
+				__swab32s(&rej->ibr_magic);
+				__swab16s(&rej->ibr_version);
+				flip = 1;
+			}
+
+			if (priv_nob >= sizeof(kib_rej_t) &&
+			    rej->ibr_version > IBLND_MSG_VERSION_1) {
+				/* priv_nob is always 148 in current version
+				 * of OFED, so we still need to check version.
+				 * (define of IB_CM_REJ_PRIVATE_DATA_SIZE) */
+				cp = &rej->ibr_cp;
+
+				if (flip) {
+					__swab64s(&rej->ibr_incarnation);
+					__swab16s(&cp->ibcp_queue_depth);
+					__swab16s(&cp->ibcp_max_frags);
+					__swab32s(&cp->ibcp_max_msg_size);
+				}
+
+				incarnation = rej->ibr_incarnation;
+			}
+
+			if (rej->ibr_magic != IBLND_MSG_MAGIC &&
+			    rej->ibr_magic != LNET_PROTO_MAGIC) {
+				CERROR("%s rejected: consumer defined fatal error\n",
+				       libcfs_nid2str(peer->ibp_nid));
+				break;
+			}
+
+			if (rej->ibr_version != IBLND_MSG_VERSION &&
+			    rej->ibr_version != IBLND_MSG_VERSION_1) {
+				CERROR("%s rejected: o2iblnd version %x error\n",
+				       libcfs_nid2str(peer->ibp_nid),
+				       rej->ibr_version);
+				break;
+			}
+
+			if (rej->ibr_why     == IBLND_REJECT_FATAL &&
+			    rej->ibr_version == IBLND_MSG_VERSION_1) {
+				CDEBUG(D_NET, "rejected by old version peer %s: %x\n",
+				       libcfs_nid2str(peer->ibp_nid), rej->ibr_version);
+
+				/* treat as a version mismatch so that the
+				 * switch below retries with the peer's version */
+				if (conn->ibc_version != IBLND_MSG_VERSION_1)
+					rej->ibr_why = IBLND_REJECT_CONN_UNCOMPAT;
+			}
+
+			switch (rej->ibr_why) {
+			case IBLND_REJECT_CONN_RACE:
+			case IBLND_REJECT_CONN_STALE:
+			case IBLND_REJECT_CONN_UNCOMPAT:
+				kiblnd_reconnect(conn, rej->ibr_version,
+						 incarnation, rej->ibr_why, cp);
+				break;
+
+			case IBLND_REJECT_MSG_QUEUE_SIZE:
+				/* cp is only set for a full-size reject from
+				 * a peer newer than V1; ibr_why comes off the
+				 * wire, so it can't be trusted to imply that */
+				if (cp == NULL) {
+					CERROR("%s rejected: incompatible message queue depth (peer sent no connparams), %d\n",
+					       libcfs_nid2str(peer->ibp_nid),
+					       IBLND_MSG_QUEUE_SIZE(conn->ibc_version));
+					break;
+				}
+				CERROR("%s rejected: incompatible message queue depth %d, %d\n",
+				       libcfs_nid2str(peer->ibp_nid), cp->ibcp_queue_depth,
+				       IBLND_MSG_QUEUE_SIZE(conn->ibc_version));
+				break;
+
+			case IBLND_REJECT_RDMA_FRAGS:
+				/* same as above: cp may legitimately be NULL */
+				if (cp == NULL) {
+					CERROR("%s rejected: incompatible # of RDMA fragments (peer sent no connparams), %d\n",
+					       libcfs_nid2str(peer->ibp_nid),
+					       IBLND_RDMA_FRAGS(conn->ibc_version));
+					break;
+				}
+				CERROR("%s rejected: incompatible # of RDMA fragments %d, %d\n",
+				       libcfs_nid2str(peer->ibp_nid), cp->ibcp_max_frags,
+				       IBLND_RDMA_FRAGS(conn->ibc_version));
+				break;
+
+			case IBLND_REJECT_NO_RESOURCES:
+				CERROR("%s rejected: o2iblnd no resources\n",
+				       libcfs_nid2str(peer->ibp_nid));
+				break;
+
+			case IBLND_REJECT_FATAL:
+				CERROR("%s rejected: o2iblnd fatal error\n",
+				       libcfs_nid2str(peer->ibp_nid));
+				break;
+
+			default:
+				CERROR("%s rejected: o2iblnd reason %d\n",
+				       libcfs_nid2str(peer->ibp_nid),
+				       rej->ibr_why);
+				break;
+			}
+			break;
+		}
+		/* private data too short to be a kib_rej_t */
+		/* fall through */
+	default:
+		CNETERR("%s rejected: reason %d, size %d\n",
+			libcfs_nid2str(peer->ibp_nid), reason, priv_nob);
+		break;
+	}
+
+	/* whatever happened above, this attempt is over */
+	kiblnd_connreq_done(conn, -ECONNREFUSED);
+}
+
+/* Validate the CONNACK received as private data ('priv', 'priv_nob') for the
+ * active connection attempt on 'conn'.
+ *
+ * On success the conn's incarnation and credits are set up and the
+ * connection request is completed.  On any validation failure the error is
+ * stored in ibc_comms_error before completing the request, see the NB
+ * below. */
+void
+kiblnd_check_connreply (kib_conn_t *conn, void *priv, int priv_nob)
+{
+	kib_peer_t    *peer = conn->ibc_peer;
+	lnet_ni_t     *ni   = peer->ibp_ni;
+	kib_net_t     *net  = ni->ni_data;
+	kib_msg_t     *msg  = priv;
+	int	       ver  = conn->ibc_version;
+	int	       rc   = kiblnd_unpack_msg(msg, priv_nob);
+	unsigned long  irqflags;
+
+	LASSERT (net != NULL);
+
+	/* protocol checks: the first one that fails is logged and sets rc */
+	if (rc != 0) {
+		CERROR("Can't unpack connack from %s: %d\n",
+		       libcfs_nid2str(peer->ibp_nid), rc);
+	} else if (msg->ibm_type != IBLND_MSG_CONNACK) {
+		CERROR("Unexpected message %d from %s\n",
+		       msg->ibm_type, libcfs_nid2str(peer->ibp_nid));
+		rc = -EPROTO;
+	} else if (ver != msg->ibm_version) {
+		CERROR("%s replied version %x is different with "
+		       "requested version %x\n",
+		       libcfs_nid2str(peer->ibp_nid), msg->ibm_version, ver);
+		rc = -EPROTO;
+	} else if (msg->ibm_u.connparams.ibcp_queue_depth !=
+		   IBLND_MSG_QUEUE_SIZE(ver)) {
+		CERROR("%s has incompatible queue depth %d(%d wanted)\n",
+		       libcfs_nid2str(peer->ibp_nid),
+		       msg->ibm_u.connparams.ibcp_queue_depth,
+		       IBLND_MSG_QUEUE_SIZE(ver));
+		rc = -EPROTO;
+	} else if (msg->ibm_u.connparams.ibcp_max_frags !=
+		   IBLND_RDMA_FRAGS(ver)) {
+		CERROR("%s has incompatible max_frags %d (%d wanted)\n",
+		       libcfs_nid2str(peer->ibp_nid),
+		       msg->ibm_u.connparams.ibcp_max_frags,
+		       IBLND_RDMA_FRAGS(ver));
+		rc = -EPROTO;
+	} else if (msg->ibm_u.connparams.ibcp_max_msg_size > IBLND_MSG_SIZE) {
+		CERROR("%s max message size %d too big (%d max)\n",
+		       libcfs_nid2str(peer->ibp_nid),
+		       msg->ibm_u.connparams.ibcp_max_msg_size,
+		       IBLND_MSG_SIZE);
+		rc = -EPROTO;
+	}
+
+	if (rc == 0) {
+		/* the reply must be addressed to this NI and incarnation */
+		read_lock_irqsave(&kiblnd_data.kib_global_lock, irqflags);
+		rc = (msg->ibm_dstnid == ni->ni_nid &&
+		      msg->ibm_dststamp == net->ibn_incarnation) ? 0 : -ESTALE;
+		read_unlock_irqrestore(&kiblnd_data.kib_global_lock, irqflags);
+
+		if (rc != 0)
+			CERROR("Bad connection reply from %s, rc = %d, "
+			       "version: %x max_frags: %d\n",
+			       libcfs_nid2str(peer->ibp_nid), rc,
+			       msg->ibm_version,
+			       msg->ibm_u.connparams.ibcp_max_frags);
+	}
+
+	if (rc != 0) {
+		/* NB My QP has already established itself, so anything that
+		 * went wrong above is reported through ibc_comms_error.
+		 * kiblnd_connreq_done(0) moves the conn state to ESTABLISHED,
+		 * but then immediately tears it down. */
+		LASSERT (rc != 0);
+		conn->ibc_comms_error = rc;
+		kiblnd_connreq_done(conn, 0);
+		return;
+	}
+
+	conn->ibc_incarnation      = msg->ibm_srcstamp;
+	conn->ibc_reserved_credits = IBLND_MSG_QUEUE_SIZE(ver);
+	conn->ibc_credits          = conn->ibc_reserved_credits;
+	LASSERT (conn->ibc_credits + conn->ibc_reserved_credits + IBLND_OOB_MSGS(ver)
+		 <= IBLND_RX_MSGS(ver));
+
+	kiblnd_connreq_done(conn, 0);
+}
+
+/* Start the active side of a connection on 'cmid', whose context is the
+ * peer to connect to: create a conn, build a CONNREQ message and issue
+ * rdma_connect() with it as private data.
+ *
+ * Returns -ENOMEM if no conn could be created, 0 otherwise (including when
+ * rdma_connect() fails, see the comment in the body). */
+int
+kiblnd_active_connect (struct rdma_cm_id *cmid)
+{
+	kib_peer_t		*peer = (kib_peer_t *)cmid->context;
+	kib_conn_t		*conn;
+	kib_msg_t		*msg;
+	struct rdma_conn_param	 cp;
+	int			 version;
+	__u64			 incarnation;
+	unsigned long		 irqflags;
+	int			 rc;
+
+	/* sample the peer's protocol parameters under the global lock */
+	read_lock_irqsave(&kiblnd_data.kib_global_lock, irqflags);
+
+	incarnation = peer->ibp_incarnation;
+	if (peer->ibp_version == 0)
+		version = IBLND_MSG_VERSION;	/* nothing recorded yet */
+	else
+		version = peer->ibp_version;
+
+	read_unlock_irqrestore(&kiblnd_data.kib_global_lock, irqflags);
+
+	conn = kiblnd_create_conn(peer, cmid, IBLND_CONN_ACTIVE_CONNECT, version);
+	if (conn == NULL) {
+		kiblnd_peer_connect_failed(peer, 1, -ENOMEM);
+		kiblnd_peer_decref(peer); /* lose cmid's ref */
+		return -ENOMEM;
+	}
+
+	/* From here on conn "owns" cmid, so success is returned no matter
+	 * what, to make sure the CM callback doesn't destroy cmid.  conn has
+	 * also taken over cmid's ref on peer */
+
+	msg = &conn->ibc_connvars->cv_msg;
+	memset(msg, 0, sizeof(*msg));
+
+	kiblnd_init_msg(msg, IBLND_MSG_CONNREQ, sizeof(msg->ibm_u.connparams));
+	msg->ibm_u.connparams.ibcp_queue_depth  = IBLND_MSG_QUEUE_SIZE(version);
+	msg->ibm_u.connparams.ibcp_max_frags    = IBLND_RDMA_FRAGS(version);
+	msg->ibm_u.connparams.ibcp_max_msg_size = IBLND_MSG_SIZE;
+
+	kiblnd_pack_msg(peer->ibp_ni, msg, version,
+			0, peer->ibp_nid, incarnation);
+
+	memset(&cp, 0, sizeof(cp));
+	cp.private_data        = msg;
+	cp.private_data_len    = msg->ibm_nob;
+	cp.responder_resources = 0;	     /* No atomic ops or RDMA reads */
+	cp.initiator_depth     = 0;
+	cp.flow_control        = 1;
+	cp.retry_count         = *kiblnd_tunables.kib_retry_count;
+	cp.rnr_retry_count     = *kiblnd_tunables.kib_rnr_retry_count;
+
+	LASSERT(cmid->context == (void *)conn);
+	LASSERT(conn->ibc_cmid == cmid);
+
+	rc = rdma_connect(cmid, &cp);
+	if (rc == 0)
+		return 0;
+
+	CERROR("Can't connect to %s: %d\n",
+	       libcfs_nid2str(peer->ibp_nid), rc);
+	kiblnd_connreq_done(conn, rc);
+	kiblnd_conn_decref(conn);
+
+	return 0;
+}
+
+/* RDMA connection manager event handler.
+ *
+ * Dispatches on event->event.  For the address/route resolution events
+ * cmid->context is the peer being connected to; for the later events it is
+ * the conn (kiblnd_active_connect() asserts that the context is the conn
+ * once one has been created).
+ *
+ * Returns 0 to keep 'cmid'; as noted on the individual cases, a non-zero
+ * return destroys cmid. */
+int
+kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event)
+{
+	kib_peer_t  *peer;
+	kib_conn_t  *conn;
+	int	  rc;
+
+	switch (event->event) {
+	default:
+		CERROR("Unexpected event: %d, status: %d\n",
+		       event->event, event->status);
+		/* NOTE(review): no break/return follows; presumably LBUG()
+		 * does not return -- confirm */
+		LBUG();
+
+	case RDMA_CM_EVENT_CONNECT_REQUEST:
+		/* destroy cmid on failure */
+		rc = kiblnd_passive_connect(cmid,
+					    (void *)KIBLND_CONN_PARAM(event),
+					    KIBLND_CONN_PARAM_LEN(event));
+		CDEBUG(D_NET, "connreq: %d\n", rc);
+		return rc;
+
+	case RDMA_CM_EVENT_ADDR_ERROR:
+		peer = (kib_peer_t *)cmid->context;
+		CNETERR("%s: ADDR ERROR %d\n",
+		       libcfs_nid2str(peer->ibp_nid), event->status);
+		kiblnd_peer_connect_failed(peer, 1, -EHOSTUNREACH);
+		kiblnd_peer_decref(peer);
+		return -EHOSTUNREACH;      /* rc != 0 destroys cmid */
+
+	case RDMA_CM_EVENT_ADDR_RESOLVED:
+		peer = (kib_peer_t *)cmid->context;
+
+		CDEBUG(D_NET,"%s Addr resolved: %d\n",
+		       libcfs_nid2str(peer->ibp_nid), event->status);
+
+		if (event->status != 0) {
+			CNETERR("Can't resolve address for %s: %d\n",
+				libcfs_nid2str(peer->ibp_nid), event->status);
+			rc = event->status;
+		} else {
+			/* next step: route resolution; kib_timeout is scaled
+			 * by 1000 for the call */
+			rc = rdma_resolve_route(
+				cmid, *kiblnd_tunables.kib_timeout * 1000);
+			if (rc == 0)
+				return 0;
+			/* Can't initiate route resolution */
+			CERROR("Can't resolve route for %s: %d\n",
+			       libcfs_nid2str(peer->ibp_nid), rc);
+		}
+		/* both failure branches end up here with rc != 0 */
+		kiblnd_peer_connect_failed(peer, 1, rc);
+		kiblnd_peer_decref(peer);
+		return rc;		      /* rc != 0 destroys cmid */
+
+	case RDMA_CM_EVENT_ROUTE_ERROR:
+		peer = (kib_peer_t *)cmid->context;
+		CNETERR("%s: ROUTE ERROR %d\n",
+			libcfs_nid2str(peer->ibp_nid), event->status);
+		kiblnd_peer_connect_failed(peer, 1, -EHOSTUNREACH);
+		kiblnd_peer_decref(peer);
+		return -EHOSTUNREACH;	   /* rc != 0 destroys cmid */
+
+	case RDMA_CM_EVENT_ROUTE_RESOLVED:
+		peer = (kib_peer_t *)cmid->context;
+		CDEBUG(D_NET,"%s Route resolved: %d\n",
+		       libcfs_nid2str(peer->ibp_nid), event->status);
+
+		/* route is known: create the conn and send the connreq */
+		if (event->status == 0)
+			return kiblnd_active_connect(cmid);
+
+		CNETERR("Can't resolve route for %s: %d\n",
+		       libcfs_nid2str(peer->ibp_nid), event->status);
+		kiblnd_peer_connect_failed(peer, 1, event->status);
+		kiblnd_peer_decref(peer);
+		return event->status;	   /* rc != 0 destroys cmid */
+
+	case RDMA_CM_EVENT_UNREACHABLE:
+		conn = (kib_conn_t *)cmid->context;
+		LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT ||
+			conn->ibc_state == IBLND_CONN_PASSIVE_WAIT);
+		CNETERR("%s: UNREACHABLE %d\n",
+		       libcfs_nid2str(conn->ibc_peer->ibp_nid), event->status);
+		kiblnd_connreq_done(conn, -ENETDOWN);
+		kiblnd_conn_decref(conn);
+		return 0;
+
+	case RDMA_CM_EVENT_CONNECT_ERROR:
+		conn = (kib_conn_t *)cmid->context;
+		LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT ||
+			conn->ibc_state == IBLND_CONN_PASSIVE_WAIT);
+		CNETERR("%s: CONNECT ERROR %d\n",
+			libcfs_nid2str(conn->ibc_peer->ibp_nid), event->status);
+		kiblnd_connreq_done(conn, -ENOTCONN);
+		kiblnd_conn_decref(conn);
+		return 0;
+
+	case RDMA_CM_EVENT_REJECTED:
+		conn = (kib_conn_t *)cmid->context;
+		switch (conn->ibc_state) {
+		default:
+			LBUG();
+
+		case IBLND_CONN_PASSIVE_WAIT:
+			CERROR ("%s: REJECTED %d\n",
+				libcfs_nid2str(conn->ibc_peer->ibp_nid),
+				event->status);
+			kiblnd_connreq_done(conn, -ECONNRESET);
+			break;
+
+		case IBLND_CONN_ACTIVE_CONNECT:
+			/* event->status carries the reject reason here */
+			kiblnd_rejected(conn, event->status,
+					(void *)KIBLND_CONN_PARAM(event),
+					KIBLND_CONN_PARAM_LEN(event));
+			break;
+		}
+		kiblnd_conn_decref(conn);
+		return 0;
+
+	case RDMA_CM_EVENT_ESTABLISHED:
+		conn = (kib_conn_t *)cmid->context;
+		switch (conn->ibc_state) {
+		default:
+			LBUG();
+
+		case IBLND_CONN_PASSIVE_WAIT:
+			CDEBUG(D_NET, "ESTABLISHED (passive): %s\n",
+			       libcfs_nid2str(conn->ibc_peer->ibp_nid));
+			kiblnd_connreq_done(conn, 0);
+			break;
+
+		case IBLND_CONN_ACTIVE_CONNECT:
+			CDEBUG(D_NET, "ESTABLISHED(active): %s\n",
+			       libcfs_nid2str(conn->ibc_peer->ibp_nid));
+			/* the event's private data is the peer's reply */
+			kiblnd_check_connreply(conn,
+					       (void *)KIBLND_CONN_PARAM(event),
+					       KIBLND_CONN_PARAM_LEN(event));
+			break;
+		}
+		/* net keeps its ref on conn! */
+		return 0;
+
+	case RDMA_CM_EVENT_TIMEWAIT_EXIT:
+		CDEBUG(D_NET, "Ignore TIMEWAIT_EXIT event\n");
+		return 0;
+	case RDMA_CM_EVENT_DISCONNECTED:
+		conn = (kib_conn_t *)cmid->context;
+		if (conn->ibc_state < IBLND_CONN_ESTABLISHED) {
+			/* disconnected before the handshake completed */
+			CERROR("%s DISCONNECTED\n",
+			       libcfs_nid2str(conn->ibc_peer->ibp_nid));
+			kiblnd_connreq_done(conn, -ECONNRESET);
+		} else {
+			kiblnd_close_conn(conn, 0);
+		}
+		kiblnd_conn_decref(conn);
+		cmid->context = NULL;
+		return 0;
+
+	case RDMA_CM_EVENT_DEVICE_REMOVAL:
+		LCONSOLE_ERROR_MSG(0x131,
+				   "Received notification of device removal\n"
+				   "Please shutdown LNET to allow this to proceed\n");
+		/* Can't remove network from underneath LNET for now, so I have
+		 * to ignore this */
+		return 0;
+
+	case RDMA_CM_EVENT_ADDR_CHANGE:
+		LCONSOLE_INFO("Physical link changed (eg hca/port)\n");
+		return 0;
+	}
+}
+
+/* Scan the tx list 'txs' of 'conn' for a descriptor whose deadline has
+ * passed.  Returns 1 (after logging it) at the first one found, 0 if there
+ * is none. */
+static int
+kiblnd_check_txs_locked(kib_conn_t *conn, struct list_head *txs)
+{
+	struct list_head	*pos;
+	kib_tx_t		*tx;
+
+	list_for_each(pos, txs) {
+		tx = list_entry(pos, kib_tx_t, tx_list);
+
+		/* sanity-check the tx against the list it is sitting on */
+		if (txs == &conn->ibc_active_txs) {
+			LASSERT(!tx->tx_queued);
+			LASSERT(tx->tx_waiting || tx->tx_sending != 0);
+		} else {
+			LASSERT(tx->tx_queued);
+		}
+
+		if (!cfs_time_aftereq(jiffies, tx->tx_deadline))
+			continue;	/* deadline not reached yet */
+
+		CERROR("Timed out tx: %s, %lu seconds\n",
+		       kiblnd_queue2str(conn, txs),
+		       cfs_duration_sec(jiffies - tx->tx_deadline));
+		return 1;
+	}
+
+	return 0;
+}
+
+/* Returns 1 if any tx list of 'conn' holds a timed-out tx, 0 otherwise.
+ * The lists are checked in a fixed order and the scan stops at the first
+ * hit. */
+static int
+kiblnd_conn_timed_out_locked(kib_conn_t *conn)
+{
+	struct list_head *queues[] = {
+		&conn->ibc_tx_queue,
+		&conn->ibc_tx_noops,
+		&conn->ibc_tx_queue_rsrvd,
+		&conn->ibc_tx_queue_nocred,
+		&conn->ibc_active_txs,
+	};
+	unsigned int      i;
+
+	for (i = 0; i < sizeof(queues) / sizeof(queues[0]); i++) {
+		if (kiblnd_check_txs_locked(conn, queues[i]))
+			return 1;
+	}
+
+	return 0;
+}
+
+/* Examine every connection of every peer hashed to kib_peers[idx].
+ * Connections with a timed-out tx are closed with -ETIMEDOUT; connections
+ * that only need a NOOP get kiblnd_check_sends() called on them.  Both
+ * actions are taken after the global lock has been released. */
+void
+kiblnd_check_conns (int idx)
+{
+	LIST_HEAD(closes);
+	LIST_HEAD(checksends);
+	struct list_head	*peers = &kiblnd_data.kib_peers[idx];
+	struct list_head	*ppos;
+	struct list_head	*cpos;
+	kib_peer_t		*peer;
+	kib_conn_t		*conn;
+	unsigned long		 irqflags;
+
+	/* NB. The expectation is that a pass over all the peers finds no
+	 * RDMAs to time out, so a shared lock is enough for looking */
+	read_lock_irqsave(&kiblnd_data.kib_global_lock, irqflags);
+
+	list_for_each(ppos, peers) {
+		peer = list_entry(ppos, kib_peer_t, ibp_list);
+
+		list_for_each(cpos, &peer->ibp_conns) {
+			int sendnoop;
+			int timedout;
+
+			conn = list_entry(cpos, kib_conn_t, ibc_list);
+
+			LASSERT (conn->ibc_state == IBLND_CONN_ESTABLISHED);
+
+			spin_lock(&conn->ibc_lock);
+
+			sendnoop = kiblnd_need_noop(conn);
+			timedout = kiblnd_conn_timed_out_locked(conn);
+
+			if (timedout || sendnoop) {
+				if (timedout) {
+					CERROR("Timed out RDMA with %s (%lu): "
+					       "c: %u, oc: %u, rc: %u\n",
+					       libcfs_nid2str(peer->ibp_nid),
+					       cfs_duration_sec(cfs_time_current() -
+								peer->ibp_last_alive),
+					       conn->ibc_credits,
+					       conn->ibc_outstanding_credits,
+					       conn->ibc_reserved_credits);
+					list_add(&conn->ibc_connd_list, &closes);
+				} else {
+					list_add(&conn->ibc_connd_list,
+						 &checksends);
+				}
+
+				/* the local list holds a ref on conn */
+				kiblnd_conn_addref(conn);
+			}
+
+			spin_unlock(&conn->ibc_lock);
+		}
+	}
+
+	read_unlock_irqrestore(&kiblnd_data.kib_global_lock, irqflags);
+
+	/* A timeout is handled by closing the whole connection: RDMA
+	 * activity is only known to have ceased once the QP has been
+	 * modified. */
+	while (!list_empty(&closes)) {
+		conn = list_entry(closes.next, kib_conn_t, ibc_connd_list);
+		list_del(&conn->ibc_connd_list);
+
+		kiblnd_close_conn(conn, -ETIMEDOUT);
+		kiblnd_conn_decref(conn);
+	}
+
+	/* There may be enough credits to return via a NOOP, but last time
+	 * round there were no non-blocking tx descs free to do it... */
+	while (!list_empty(&checksends)) {
+		conn = list_entry(checksends.next, kib_conn_t, ibc_connd_list);
+		list_del(&conn->ibc_connd_list);
+
+		kiblnd_check_sends(conn);
+		kiblnd_conn_decref(conn);
+	}
+}
+
+/* Disconnect a connection that is already in state IBLND_CONN_CLOSING:
+ * issue rdma_disconnect() on its CM id, finalise the connection and
+ * then notify about its peer via kiblnd_peer_notify().
+ *
+ * Asserts that it runs outside interrupt context and on the connd
+ * thread recorded in kiblnd_data.kib_connd. */
+void
+kiblnd_disconnect_conn (kib_conn_t *conn)
+{
+	LASSERT (!in_interrupt());
+	LASSERT (current == kiblnd_data.kib_connd);
+	LASSERT (conn->ibc_state == IBLND_CONN_CLOSING);
+
+	rdma_disconnect(conn->ibc_cmid);
+	kiblnd_finalise_conn(conn);
+
+	kiblnd_peer_notify(conn->ibc_peer);
+}
+
+/* Connection daemon thread body.
+ *
+ * Until kiblnd_data.kib_shutdown is set, each pass of the loop:
+ *  - destroys at most one connection from kib_connd_zombies,
+ *  - disconnects at most one connection from kib_connd_conns,
+ *  - once 'deadline' has passed, runs kiblnd_check_conns() over a chunk
+ *    of the peer hash table and pushes the deadline on by 'p' seconds.
+ * If none of these dropped kib_connd_lock, the thread sleeps on
+ * kib_connd_waitq for the time remaining until the deadline.
+ *
+ * 'arg' is unused.  Always returns 0 after calling kiblnd_thread_fini(). */
+int
+kiblnd_connd (void *arg)
+{
+	wait_queue_t     wait;
+	unsigned long      flags;
+	kib_conn_t	*conn;
+	int		timeout;
+	int		i;
+	int		dropped_lock;
+	int		peer_index = 0;
+	unsigned long      deadline = jiffies;
+
+	cfs_block_allsigs ();
+
+	init_waitqueue_entry_current (&wait);
+	/* kiblnd_disconnect_conn() asserts it runs on this thread */
+	kiblnd_data.kib_connd = current;
+
+	spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags);
+
+	while (!kiblnd_data.kib_shutdown) {
+
+		/* set whenever the lock is released, so the lists are
+		 * re-examined before sleeping */
+		dropped_lock = 0;
+
+		if (!list_empty (&kiblnd_data.kib_connd_zombies)) {
+			conn = list_entry(kiblnd_data. \
+					      kib_connd_zombies.next,
+					      kib_conn_t, ibc_list);
+			list_del(&conn->ibc_list);
+
+			spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock,
+					       flags);
+			dropped_lock = 1;
+
+			kiblnd_destroy_conn(conn);
+
+			spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags);
+		}
+
+		if (!list_empty(&kiblnd_data.kib_connd_conns)) {
+			conn = list_entry(kiblnd_data.kib_connd_conns.next,
+					      kib_conn_t, ibc_list);
+			list_del(&conn->ibc_list);
+
+			spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock,
+					       flags);
+			dropped_lock = 1;
+
+			kiblnd_disconnect_conn(conn);
+			/* NOTE(review): presumably drops the ref held by
+			 * the kib_connd_conns list -- confirm */
+			kiblnd_conn_decref(conn);
+
+			spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags);
+		}
+
+		/* careful with the jiffy wrap... */
+		timeout = (int)(deadline - jiffies);
+		if (timeout <= 0) {
+			const int n = 4;
+			const int p = 1;
+			int       chunk = kiblnd_data.kib_peer_hash_size;
+
+			spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags);
+			dropped_lock = 1;
+
+			/* Time to check for RDMA timeouts on a few more
+			 * peers: I do checks every 'p' seconds on a
+			 * proportion of the peer table and I need to check
+			 * every connection 'n' times within a timeout
+			 * interval, to ensure I detect a timeout on any
+			 * connection within (n+1)/n times the timeout
+			 * interval. */
+
+			/* for timeouts of n * p seconds or less the whole
+			 * table is checked on every pass */
+			if (*kiblnd_tunables.kib_timeout > n * p)
+				chunk = (chunk * n * p) /
+					*kiblnd_tunables.kib_timeout;
+			if (chunk == 0)
+				chunk = 1;
+
+			/* peer_index persists across passes and wraps
+			 * around the hash table */
+			for (i = 0; i < chunk; i++) {
+				kiblnd_check_conns(peer_index);
+				peer_index = (peer_index + 1) %
+					     kiblnd_data.kib_peer_hash_size;
+			}
+
+			deadline += p * HZ;
+			spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags);
+		}
+
+		if (dropped_lock)
+			continue;
+
+		/* Nothing to do for 'timeout'  */
+		set_current_state(TASK_INTERRUPTIBLE);
+		add_wait_queue(&kiblnd_data.kib_connd_waitq, &wait);
+		spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags);
+
+		waitq_timedwait(&wait, TASK_INTERRUPTIBLE, timeout);
+
+		set_current_state(TASK_RUNNING);
+		remove_wait_queue(&kiblnd_data.kib_connd_waitq, &wait);
+		spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags);
+	}
+
+	spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags);
+
+	kiblnd_thread_fini();
+	return 0;
+}
+
+/* Asynchronous QP event callback; 'arg' is the kib_conn_t the event
+ * refers to.  IB_EVENT_COMM_EST is only logged at debug level; every
+ * other event type is reported as an error.  No other action is taken. */
+void
+kiblnd_qp_event(struct ib_event *event, void *arg)
+{
+	kib_conn_t *conn = arg;
+
+	if (event->event == IB_EVENT_COMM_EST) {
+		CDEBUG(D_NET, "%s established\n",
+		       libcfs_nid2str(conn->ibc_peer->ibp_nid));
+		return;
+	}
+
+	/* anything else is unexpected: just report it */
+	CERROR("%s: Async QP event type %d\n",
+	       libcfs_nid2str(conn->ibc_peer->ibp_nid), event->event);
+}
+
+/* Dispatch one work completion according to the type encoded in its
+ * wr_id: RDMA completions are only logged, TX and RX completions are
+ * passed to kiblnd_tx_complete() / kiblnd_rx_complete() together with
+ * the pointer decoded from the wr_id. */
+void
+kiblnd_complete (struct ib_wc *wc)
+{
+	switch (kiblnd_wreqid2type(wc->wr_id)) {
+	default:
+		/* unknown wr_id type; NOTE(review): LBUG() is assumed not
+		 * to return, otherwise this falls into the RDMA case */
+		LBUG();
+
+	case IBLND_WID_RDMA:
+		/* We only get RDMA completion notification if it fails.  All
+		 * subsequent work items, including the final SEND will fail
+		 * too.  However we can't print out any more info about the
+		 * failing RDMA because 'tx' might be back on the idle list or
+		 * even reused already if we didn't manage to post all our work
+		 * items */
+		CNETERR("RDMA (tx: %p) failed: %d\n",
+			kiblnd_wreqid2ptr(wc->wr_id), wc->status);
+		return;
+
+	case IBLND_WID_TX:
+		kiblnd_tx_complete(kiblnd_wreqid2ptr(wc->wr_id), wc->status);
+		return;
+
+	case IBLND_WID_RX:
+		kiblnd_rx_complete(kiblnd_wreqid2ptr(wc->wr_id), wc->status,
+				   wc->byte_len);
+		return;
+	}
+}
+
+/* CQ completion callback; 'arg' is the kib_conn_t that owns 'cq'.
+ *
+ * Marks the connection ready and, if it is not already scheduled and
+ * still has receives or sends posted, takes a reference, queues it on
+ * its scheduler's ibs_conns list and wakes a waiting scheduler thread. */
+void
+kiblnd_cq_completion(struct ib_cq *cq, void *arg)
+{
+	/* NB I'm not allowed to schedule this conn once its refcount has
+	 * reached 0.  Since fundamentally I'm racing with scheduler threads
+	 * consuming my CQ I could be called after all completions have
+	 * occurred.  But in this case, ibc_nrx == 0 && ibc_nsends_posted == 0
+	 * and this CQ is about to be destroyed so I NOOP. */
+	kib_conn_t		*conn = (kib_conn_t *)arg;
+	struct kib_sched_info	*sched = conn->ibc_sched;
+	unsigned long		flags;
+
+	LASSERT(cq == conn->ibc_cq);
+
+	spin_lock_irqsave(&sched->ibs_lock, flags);
+
+	/* kiblnd_scheduler() tests this flag after polling the CQ to
+	 * decide whether the connection must be rescheduled */
+	conn->ibc_ready = 1;
+
+	if (!conn->ibc_scheduled &&
+	    (conn->ibc_nrx > 0 ||
+	     conn->ibc_nsends_posted > 0)) {
+		kiblnd_conn_addref(conn); /* +1 ref for sched_conns */
+		conn->ibc_scheduled = 1;
+		list_add_tail(&conn->ibc_sched_list, &sched->ibs_conns);
+
+		if (waitqueue_active(&sched->ibs_waitq))
+			wake_up(&sched->ibs_waitq);
+	}
+
+	spin_unlock_irqrestore(&sched->ibs_lock, flags);
+}
+
+/* Asynchronous CQ event callback; 'arg' is the kib_conn_t the CQ
+ * belongs to.  The event is only logged as an error together with the
+ * peer's NID. */
+void
+kiblnd_cq_event(struct ib_event *event, void *arg)
+{
+	kib_conn_t *conn = arg;
+
+	CERROR("%s: async CQ event type %d\n",
+	       libcfs_nid2str(conn->ibc_peer->ibp_nid), event->event);
+}
+
+/* Scheduler thread body.
+ *
+ * 'arg' carries the thread id as a long; KIB_THREAD_CPT(id) selects the
+ * kib_sched_info this thread serves.  Until kiblnd_data.kib_shutdown is
+ * set, the thread takes connections off sched->ibs_conns, polls one
+ * completion from the connection's CQ and hands it to kiblnd_complete().
+ * When the list is empty it sleeps on sched->ibs_waitq.
+ *
+ * Always returns 0 after calling kiblnd_thread_fini(). */
+int
+kiblnd_scheduler(void *arg)
+{
+	long			id = (long)arg;
+	struct kib_sched_info	*sched;
+	kib_conn_t		*conn;
+	wait_queue_t		wait;
+	unsigned long		flags;
+	struct ib_wc		wc;
+	int			did_something;
+	int			busy_loops = 0;
+	int			rc;
+
+	cfs_block_allsigs();
+
+	init_waitqueue_entry_current(&wait);
+
+	sched = kiblnd_data.kib_scheds[KIB_THREAD_CPT(id)];
+
+	/* a bind failure is only warned about; the thread keeps running */
+	rc = cfs_cpt_bind(lnet_cpt_table(), sched->ibs_cpt);
+	if (rc != 0) {
+		CWARN("Failed to bind on CPT %d, please verify whether "
+		      "all CPUs are healthy and reload modules if necessary, "
+		      "otherwise your system might under risk of low "
+		      "performance\n", sched->ibs_cpt);
+	}
+
+	spin_lock_irqsave(&sched->ibs_lock, flags);
+
+	while (!kiblnd_data.kib_shutdown) {
+		/* give up the CPU after IBLND_RESCHED consecutive busy
+		 * iterations */
+		if (busy_loops++ >= IBLND_RESCHED) {
+			spin_unlock_irqrestore(&sched->ibs_lock, flags);
+
+			cond_resched();
+			busy_loops = 0;
+
+			spin_lock_irqsave(&sched->ibs_lock, flags);
+		}
+
+		did_something = 0;
+
+		if (!list_empty(&sched->ibs_conns)) {
+			conn = list_entry(sched->ibs_conns.next,
+					      kib_conn_t, ibc_sched_list);
+			/* take over kib_sched_conns' ref on conn... */
+			LASSERT(conn->ibc_scheduled);
+			list_del(&conn->ibc_sched_list);
+			/* cleared before polling so that a completion
+			 * callback racing with the poll is noticed below */
+			conn->ibc_ready = 0;
+
+			spin_unlock_irqrestore(&sched->ibs_lock, flags);
+
+			rc = ib_poll_cq(conn->ibc_cq, 1, &wc);
+			if (rc == 0) {
+				/* CQ empty: re-arm notification, then poll
+				 * once more */
+				rc = ib_req_notify_cq(conn->ibc_cq,
+						      IB_CQ_NEXT_COMP);
+				if (rc < 0) {
+					CWARN("%s: ib_req_notify_cq failed: %d, "
+					      "closing connection\n",
+					      libcfs_nid2str(conn->ibc_peer->ibp_nid), rc);
+					kiblnd_close_conn(conn, -EIO);
+					kiblnd_conn_decref(conn);
+					spin_lock_irqsave(&sched->ibs_lock,
+							      flags);
+					continue;
+				}
+
+				rc = ib_poll_cq(conn->ibc_cq, 1, &wc);
+			}
+
+			if (rc < 0) {
+				CWARN("%s: ib_poll_cq failed: %d, "
+				      "closing connection\n",
+				      libcfs_nid2str(conn->ibc_peer->ibp_nid),
+				      rc);
+				kiblnd_close_conn(conn, -EIO);
+				kiblnd_conn_decref(conn);
+				spin_lock_irqsave(&sched->ibs_lock, flags);
+				continue;
+			}
+
+			spin_lock_irqsave(&sched->ibs_lock, flags);
+
+			if (rc != 0 || conn->ibc_ready) {
+				/* There may be another completion waiting; get
+				 * another scheduler to check while I handle
+				 * this one... */
+				/* +1 ref for sched_conns */
+				kiblnd_conn_addref(conn);
+				list_add_tail(&conn->ibc_sched_list,
+						  &sched->ibs_conns);
+				if (waitqueue_active(&sched->ibs_waitq))
+					wake_up(&sched->ibs_waitq);
+			} else {
+				conn->ibc_scheduled = 0;
+			}
+
+			if (rc != 0) {
+				/* handle the completion without holding the
+				 * scheduler lock */
+				spin_unlock_irqrestore(&sched->ibs_lock, flags);
+				kiblnd_complete(&wc);
+
+				spin_lock_irqsave(&sched->ibs_lock, flags);
+			}
+
+			kiblnd_conn_decref(conn); /* ...drop my ref from above */
+			did_something = 1;
+		}
+
+		if (did_something)
+			continue;
+
+		/* nothing queued: sleep until woken */
+		set_current_state(TASK_INTERRUPTIBLE);
+		add_wait_queue_exclusive(&sched->ibs_waitq, &wait);
+		spin_unlock_irqrestore(&sched->ibs_lock, flags);
+
+		waitq_wait(&wait, TASK_INTERRUPTIBLE);
+		busy_loops = 0;
+
+		remove_wait_queue(&sched->ibs_waitq, &wait);
+		set_current_state(TASK_RUNNING);
+		spin_lock_irqsave(&sched->ibs_lock, flags);
+	}
+
+	spin_unlock_irqrestore(&sched->ibs_lock, flags);
+
+	kiblnd_thread_fini();
+	return 0;
+}
+
+/* Device failover thread body; asserts that the dev_failover tunable is
+ * non-zero.
+ *
+ * Until kiblnd_data.kib_shutdown is set, the thread picks the first
+ * device on kib_failed_devs whose ibd_next_failover time has arrived
+ * and runs kiblnd_dev_failover() on it with the global lock released.
+ * With nothing due it sleeps for 1 second, or 10 seconds when the
+ * failed list is empty.  After a full 10 second sleep it queues every
+ * device for which kiblnd_dev_can_failover() is true.
+ *
+ * 'arg' is unused.  Always returns 0 after calling kiblnd_thread_fini(). */
+int
+kiblnd_failover_thread(void *arg)
+{
+	rwlock_t		*glock = &kiblnd_data.kib_global_lock;
+	kib_dev_t	 *dev;
+	wait_queue_t     wait;
+	unsigned long      flags;
+	int		rc;
+
+	LASSERT (*kiblnd_tunables.kib_dev_failover != 0);
+
+	cfs_block_allsigs ();
+
+	init_waitqueue_entry_current(&wait);
+	write_lock_irqsave(glock, flags);
+
+	while (!kiblnd_data.kib_shutdown) {
+		int     do_failover = 0;
+		int     long_sleep;
+
+		/* find the first failed device that is due for a retry;
+		 * 'dev' is only meaningful afterwards if do_failover is set */
+		list_for_each_entry(dev, &kiblnd_data.kib_failed_devs,
+				    ibd_fail_list) {
+			if (cfs_time_before(cfs_time_current(),
+					    dev->ibd_next_failover))
+				continue;
+			do_failover = 1;
+			break;
+		}
+
+		if (do_failover) {
+			list_del_init(&dev->ibd_fail_list);
+			/* flag stays set while the lock is released */
+			dev->ibd_failover = 1;
+			write_unlock_irqrestore(glock, flags);
+
+			rc = kiblnd_dev_failover(dev);
+
+			write_lock_irqsave(glock, flags);
+
+			LASSERT (dev->ibd_failover);
+			dev->ibd_failover = 0;
+			if (rc >= 0) { /* Device is OK or failover succeed */
+				dev->ibd_next_failover = cfs_time_shift(3);
+				continue;
+			}
+
+			/* failed to failover, retry later */
+			/* delay follows ibd_failed_failover, capped at 10 */
+			dev->ibd_next_failover =
+				cfs_time_shift(min(dev->ibd_failed_failover, 10));
+			if (kiblnd_dev_can_failover(dev)) {
+				list_add_tail(&dev->ibd_fail_list,
+					      &kiblnd_data.kib_failed_devs);
+			}
+
+			continue;
+		}
+
+		/* long sleep if no more pending failover */
+		long_sleep = list_empty(&kiblnd_data.kib_failed_devs);
+
+		set_current_state(TASK_INTERRUPTIBLE);
+		add_wait_queue(&kiblnd_data.kib_failover_waitq, &wait);
+		write_unlock_irqrestore(glock, flags);
+
+		rc = schedule_timeout(long_sleep ? cfs_time_seconds(10) :
+						   cfs_time_seconds(1));
+		set_current_state(TASK_RUNNING);
+		remove_wait_queue(&kiblnd_data.kib_failover_waitq, &wait);
+		write_lock_irqsave(glock, flags);
+
+		/* rc != 0 means the sleep ended before the timeout expired */
+		if (!long_sleep || rc != 0)
+			continue;
+
+		/* After a full long sleep, routinely check all active
+		 * devices.  This is needed because, with no active
+		 * connection on the dev and no local SEND, we could keep
+		 * listening on the wrong HCA forever after a bonding
+		 * failover. */
+		list_for_each_entry(dev, &kiblnd_data.kib_devs, ibd_list) {
+			if (kiblnd_dev_can_failover(dev)) {
+				list_add_tail(&dev->ibd_fail_list,
+					      &kiblnd_data.kib_failed_devs);
+			}
+		}
+	}
+
+	write_unlock_irqrestore(glock, flags);
+
+	kiblnd_thread_fini();
+	return 0;
+}

diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_modparams.c b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_modparams.c
new file mode 100644
index 0000000..e21028b
--- /dev/null
+++ b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_modparams.c

@@ -0,0 +1,493 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/klnds/o2iblnd/o2iblnd_modparams.c
+ *
+ * Author: Eric Barton <eric@bartonsoftware.com>
+ */
+
+#include "o2iblnd.h"
+
+/* Module parameters.  Each static below holds the default value and is
+ * exported with CFS_MODULE_PARM; mode 0444 parameters are read-only and
+ * mode 0644 parameters are also writable by the owner.  The variables
+ * are reached by the rest of the driver through kiblnd_tunables. */
+static int service = 987;
+CFS_MODULE_PARM(service, "i", int, 0444,
+		"service number (within RDMA_PS_TCP)");
+
+static int cksum = 0;
+CFS_MODULE_PARM(cksum, "i", int, 0644,
+		"set non-zero to enable message (not RDMA) checksums");
+
+static int timeout = 50;
+CFS_MODULE_PARM(timeout, "i", int, 0644,
+		"timeout (seconds)");
+
+/* Number of threads in each per-CPT scheduler pool.  If left at zero, a
+ * reasonable value is estimated from the number of CPUs. */
+static int nscheds;
+CFS_MODULE_PARM(nscheds, "i", int, 0444,
+		"number of threads in each scheduler pool");
+
+/* NB: this value is shared by all CPTs, it can grow at runtime */
+static int ntx = 512;
+CFS_MODULE_PARM(ntx, "i", int, 0444,
+		"# of message descriptors allocated for each pool");
+
+/* NB: this value is shared by all CPTs */
+static int credits = 256;
+CFS_MODULE_PARM(credits, "i", int, 0444,
+		"# concurrent sends");
+
+/* clamped by kiblnd_tunables_init() */
+static int peer_credits = 8;
+CFS_MODULE_PARM(peer_credits, "i", int, 0444,
+		"# concurrent sends to 1 peer");
+
+/* clamped relative to peer_credits by kiblnd_tunables_init() */
+static int peer_credits_hiw = 0;
+CFS_MODULE_PARM(peer_credits_hiw, "i", int, 0444,
+		"when eagerly to return credits");
+
+static int peer_buffer_credits = 0;
+CFS_MODULE_PARM(peer_buffer_credits, "i", int, 0444,
+		"# per-peer router buffer credits");
+
+static int peer_timeout = 180;
+CFS_MODULE_PARM(peer_timeout, "i", int, 0444,
+		"Seconds without aliveness news to declare peer dead (<=0 to disable)");
+
+static char *ipif_name = "ib0";
+CFS_MODULE_PARM(ipif_name, "s", charp, 0444,
+		"IPoIB interface name");
+
+static int retry_count = 5;
+CFS_MODULE_PARM(retry_count, "i", int, 0644,
+		"Retransmissions when no ACK received");
+
+static int rnr_retry_count = 6;
+CFS_MODULE_PARM(rnr_retry_count, "i", int, 0644,
+		"RNR retransmissions");
+
+static int keepalive = 100;
+CFS_MODULE_PARM(keepalive, "i", int, 0644,
+		"Idle time in seconds before sending a keepalive");
+
+/* validated with kiblnd_translate_mtu() in kiblnd_tunables_init() */
+static int ib_mtu = 0;
+CFS_MODULE_PARM(ib_mtu, "i", int, 0444,
+		"IB MTU 256/512/1024/2048/4096");
+
+/* 0 lets kiblnd_tunables_init() derive a value from peer_credits */
+static int concurrent_sends = 0;
+CFS_MODULE_PARM(concurrent_sends, "i", int, 0444,
+		"send work-queue sizing");
+
+/* out-of-range values are reset to 0 by kiblnd_tunables_init() */
+static int map_on_demand = 0;
+CFS_MODULE_PARM(map_on_demand, "i", int, 0444,
+		"map on demand");
+
+/* NB: this value is shared by all CPTs, it can grow at runtime */
+static int fmr_pool_size = 512;
+CFS_MODULE_PARM(fmr_pool_size, "i", int, 0444,
+		"size of fmr pool on each CPT (>= ntx / 4)");
+
+/* NB: this value is shared by all CPTs, it can grow at runtime */
+static int fmr_flush_trigger = 384;
+CFS_MODULE_PARM(fmr_flush_trigger, "i", int, 0444,
+		"# dirty FMRs that triggers pool flush");
+
+static int fmr_cache = 1;
+CFS_MODULE_PARM(fmr_cache, "i", int, 0444,
+		"non-zero to enable FMR caching");
+
+/* NB: this value is shared by all CPTs, it can grow at runtime */
+static int pmr_pool_size = 512;
+CFS_MODULE_PARM(pmr_pool_size, "i", int, 0444,
+		"size of MR cache pmr pool on each CPT");
+
+/*
+ * 0: disable failover
+ * 1: enable failover if necessary
+ * 2: force to failover (for debug)
+ *
+ * NOTE(review): the parameter description below calls values other than
+ * 0 and 1 "reserved", which does not match the list above -- confirm
+ * which is intended.
+ */
+static int dev_failover = 0;
+CFS_MODULE_PARM(dev_failover, "i", int, 0444,
+	       "HCA failover for bonding (0 off, 1 on, other values reserved)");
+
+
+static int require_privileged_port = 0;
+CFS_MODULE_PARM(require_privileged_port, "i", int, 0644,
+		"require privileged port when accepting connection");
+
+static int use_privileged_port = 1;
+CFS_MODULE_PARM(use_privileged_port, "i", int, 0644,
+		"use privileged port when initiating connection");
+
+/* Table of pointers to the module parameter variables defined in this
+ * file.  Code in this section reads and adjusts the tunables through
+ * these pointers, e.g. *kiblnd_tunables.kib_timeout. */
+kib_tunables_t kiblnd_tunables = {
+	.kib_dev_failover	   = &dev_failover,
+	.kib_service		= &service,
+	.kib_cksum		  = &cksum,
+	.kib_timeout		= &timeout,
+	.kib_keepalive	      = &keepalive,
+	.kib_ntx		    = &ntx,
+	.kib_credits		= &credits,
+	.kib_peertxcredits	  = &peer_credits,
+	.kib_peercredits_hiw	= &peer_credits_hiw,
+	.kib_peerrtrcredits	 = &peer_buffer_credits,
+	.kib_peertimeout	    = &peer_timeout,
+	.kib_default_ipif	   = &ipif_name,
+	.kib_retry_count	    = &retry_count,
+	.kib_rnr_retry_count	= &rnr_retry_count,
+	.kib_concurrent_sends       = &concurrent_sends,
+	.kib_ib_mtu		 = &ib_mtu,
+	.kib_map_on_demand	  = &map_on_demand,
+	.kib_fmr_pool_size	  = &fmr_pool_size,
+	.kib_fmr_flush_trigger      = &fmr_flush_trigger,
+	.kib_fmr_cache	      = &fmr_cache,
+	.kib_pmr_pool_size	  = &pmr_pool_size,
+	.kib_require_priv_port      = &require_privileged_port,
+	.kib_use_priv_port	    = &use_privileged_port,
+	.kib_nscheds		    = &nscheds
+};
+
+#if defined(CONFIG_SYSCTL) && !CFS_SYSFS_MODULE_PARM
+
+/* Fixed-size copy of the ipif_name string, filled in by
+ * kiblnd_sysctl_init() and exposed through the "ipif_name" sysctl. */
+static char ipif_basename_space[32];
+
+
+/* Identifiers used as .ctl_name for the entries of kiblnd_ctl_table;
+ * numbering starts at 1. */
+enum {
+	O2IBLND_SERVICE  = 1,
+	O2IBLND_CKSUM,
+	O2IBLND_TIMEOUT,
+	O2IBLND_NTX,
+	O2IBLND_CREDITS,
+	O2IBLND_PEER_TXCREDITS,
+	O2IBLND_PEER_CREDITS_HIW,
+	O2IBLND_PEER_RTRCREDITS,
+	O2IBLND_PEER_TIMEOUT,
+	O2IBLND_IPIF_BASENAME,
+	O2IBLND_RETRY_COUNT,
+	O2IBLND_RNR_RETRY_COUNT,
+	O2IBLND_KEEPALIVE,
+	O2IBLND_CONCURRENT_SENDS,
+	O2IBLND_IB_MTU,
+	O2IBLND_MAP_ON_DEMAND,
+	O2IBLND_FMR_POOL_SIZE,
+	O2IBLND_FMR_FLUSH_TRIGGER,
+	O2IBLND_FMR_CACHE,
+	O2IBLND_PMR_POOL_SIZE,
+	O2IBLND_DEV_FAILOVER
+};
+
+/* sysctl entries mirroring the module parameters: one entry per
+ * tunable, pointing at the same variable and using the same access mode
+ * as the corresponding CFS_MODULE_PARM.  Integer tunables use
+ * proc_dointvec; the interface name uses proc_dostring on its
+ * fixed-size copy.  The table is terminated by a zeroed entry. */
+static ctl_table_t kiblnd_ctl_table[] = {
+	{
+		.ctl_name = O2IBLND_SERVICE,
+		.procname = "service",
+		.data     = &service,
+		.maxlen   = sizeof(int),
+		.mode     = 0444,
+		.proc_handler = &proc_dointvec
+	},
+	{
+		.ctl_name = O2IBLND_CKSUM,
+		.procname = "cksum",
+		.data     = &cksum,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec
+	},
+	{
+		.ctl_name = O2IBLND_TIMEOUT,
+		.procname = "timeout",
+		.data     = &timeout,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec
+	},
+	{
+		.ctl_name = O2IBLND_NTX,
+		.procname = "ntx",
+		.data     = &ntx,
+		.maxlen   = sizeof(int),
+		.mode     = 0444,
+		.proc_handler = &proc_dointvec
+	},
+	{
+		.ctl_name = O2IBLND_CREDITS,
+		.procname = "credits",
+		.data     = &credits,
+		.maxlen   = sizeof(int),
+		.mode     = 0444,
+		.proc_handler = &proc_dointvec
+	},
+	{
+		.ctl_name = O2IBLND_PEER_TXCREDITS,
+		.procname = "peer_credits",
+		.data     = &peer_credits,
+		.maxlen   = sizeof(int),
+		.mode     = 0444,
+		.proc_handler = &proc_dointvec
+	},
+	{
+		.ctl_name = O2IBLND_PEER_CREDITS_HIW,
+		.procname = "peer_credits_hiw",
+		.data     = &peer_credits_hiw,
+		.maxlen   = sizeof(int),
+		.mode     = 0444,
+		.proc_handler = &proc_dointvec
+	},
+	{
+		.ctl_name = O2IBLND_PEER_RTRCREDITS,
+		.procname = "peer_buffer_credits",
+		.data     = &peer_buffer_credits,
+		.maxlen   = sizeof(int),
+		.mode     = 0444,
+		.proc_handler = &proc_dointvec
+	},
+	{
+		.ctl_name = O2IBLND_PEER_TIMEOUT,
+		.procname = "peer_timeout",
+		.data     = &peer_timeout,
+		.maxlen   = sizeof(int),
+		.mode     = 0444,
+		.proc_handler = &proc_dointvec
+	},
+	{
+		/* string entry: backed by the copy in ipif_basename_space */
+		.ctl_name = O2IBLND_IPIF_BASENAME,
+		.procname = "ipif_name",
+		.data     = ipif_basename_space,
+		.maxlen   = sizeof(ipif_basename_space),
+		.mode     = 0444,
+		.proc_handler = &proc_dostring
+	},
+	{
+		.ctl_name = O2IBLND_RETRY_COUNT,
+		.procname = "retry_count",
+		.data     = &retry_count,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec
+	},
+	{
+		.ctl_name = O2IBLND_RNR_RETRY_COUNT,
+		.procname = "rnr_retry_count",
+		.data     = &rnr_retry_count,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec
+	},
+	{
+		.ctl_name = O2IBLND_KEEPALIVE,
+		.procname = "keepalive",
+		.data     = &keepalive,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec
+	},
+	{
+		.ctl_name = O2IBLND_CONCURRENT_SENDS,
+		.procname = "concurrent_sends",
+		.data     = &concurrent_sends,
+		.maxlen   = sizeof(int),
+		.mode     = 0444,
+		.proc_handler = &proc_dointvec
+	},
+	{
+		.ctl_name = O2IBLND_IB_MTU,
+		.procname = "ib_mtu",
+		.data     = &ib_mtu,
+		.maxlen   = sizeof(int),
+		.mode     = 0444,
+		.proc_handler = &proc_dointvec
+	},
+	{
+		.ctl_name = O2IBLND_MAP_ON_DEMAND,
+		.procname = "map_on_demand",
+		.data     = &map_on_demand,
+		.maxlen   = sizeof(int),
+		.mode     = 0444,
+		.proc_handler = &proc_dointvec
+	},
+
+	{
+		.ctl_name = O2IBLND_FMR_POOL_SIZE,
+		.procname = "fmr_pool_size",
+		.data     = &fmr_pool_size,
+		.maxlen   = sizeof(int),
+		.mode     = 0444,
+		.proc_handler = &proc_dointvec
+	},
+	{
+		.ctl_name = O2IBLND_FMR_FLUSH_TRIGGER,
+		.procname = "fmr_flush_trigger",
+		.data     = &fmr_flush_trigger,
+		.maxlen   = sizeof(int),
+		.mode     = 0444,
+		.proc_handler = &proc_dointvec
+	},
+	{
+		.ctl_name = O2IBLND_FMR_CACHE,
+		.procname = "fmr_cache",
+		.data     = &fmr_cache,
+		.maxlen   = sizeof(int),
+		.mode     = 0444,
+		.proc_handler = &proc_dointvec
+	},
+	{
+		.ctl_name = O2IBLND_PMR_POOL_SIZE,
+		.procname = "pmr_pool_size",
+		.data     = &pmr_pool_size,
+		.maxlen   = sizeof(int),
+		.mode     = 0444,
+		.proc_handler = &proc_dointvec
+	},
+	{
+		.ctl_name = O2IBLND_DEV_FAILOVER,
+		.procname = "dev_failover",
+		.data     = &dev_failover,
+		.maxlen   = sizeof(int),
+		.mode     = 0444,
+		.proc_handler = &proc_dointvec
+	},
+	{0}
+};
+
+/* Top-level sysctl table: a single "o2iblnd" directory entry whose
+ * children are the entries of kiblnd_ctl_table.  This is the table
+ * passed to cfs_register_sysctl_table() by kiblnd_sysctl_init(). */
+static ctl_table_t kiblnd_top_ctl_table[] = {
+	{
+		.ctl_name = CTL_O2IBLND,
+		.procname = "o2iblnd",
+		.data     = NULL,
+		.maxlen   = 0,
+		.mode     = 0555,
+		.child    = kiblnd_ctl_table
+	},
+	{0}
+};
+
+/* Copy the string tunable 'str' into the fixed buffer 'space' of 'size'
+ * bytes.  The copy is truncated if 'str' does not fit and the result is
+ * always NUL-terminated.
+ *
+ * Nothing is written when 'size' is not positive: there is no room for
+ * even the terminator, and space[size-1] would lie outside the buffer. */
+void
+kiblnd_initstrtunable(char *space, char *str, int size)
+{
+	if (size <= 0)
+		return;
+
+	strncpy(space, str, size);
+	/* strncpy() does not terminate the buffer when 'str' fills it */
+	space[size-1] = 0;
+}
+
+/* Snapshot the ipif_name string into ipif_basename_space and register
+ * the o2iblnd sysctl tables.  The returned handle is stored in
+ * kiblnd_tunables.kib_sysctl; a registration failure is only warned
+ * about. */
+void
+kiblnd_sysctl_init (void)
+{
+	kiblnd_initstrtunable(ipif_basename_space, ipif_name,
+			      sizeof(ipif_basename_space));
+
+	kiblnd_tunables.kib_sysctl =
+		cfs_register_sysctl_table(kiblnd_top_ctl_table, 0);
+
+	if (kiblnd_tunables.kib_sysctl == NULL)
+		CWARN("Can't setup /proc tunables\n");
+}
+
+/* Unregister the sysctl tables registered by kiblnd_sysctl_init(); does
+ * nothing if no handle was stored in kiblnd_tunables.kib_sysctl. */
+void
+kiblnd_sysctl_fini (void)
+{
+	if (kiblnd_tunables.kib_sysctl != NULL)
+		unregister_sysctl_table(kiblnd_tunables.kib_sysctl);
+}
+
+#else
+
+/* No-op variant, built when the sysctl table above is compiled out
+ * (CONFIG_SYSCTL undefined or CFS_SYSFS_MODULE_PARM non-zero). */
+void
+kiblnd_sysctl_init (void)
+{
+}
+
+/* No-op counterpart of the stub above: nothing was registered. */
+void
+kiblnd_sysctl_fini (void)
+{
+}
+
+#endif
+
+/* Validate and normalise the module tunables, then set up the sysctl
+ * entries.
+ *
+ * ib_mtu must be accepted by kiblnd_translate_mtu().  The per-peer
+ * credits, credit high-water mark, map-on-demand and concurrent-sends
+ * values are clamped in place; the order of the steps matters because
+ * later clamps depend on earlier results.
+ *
+ * Returns 0 on success, -EINVAL if ib_mtu is invalid. */
+int
+kiblnd_tunables_init (void)
+{
+	int *mtu       = kiblnd_tunables.kib_ib_mtu;
+	int *txcredits = kiblnd_tunables.kib_peertxcredits;
+	int *hiw       = kiblnd_tunables.kib_peercredits_hiw;
+	int *ondemand  = kiblnd_tunables.kib_map_on_demand;
+	int *nsends    = kiblnd_tunables.kib_concurrent_sends;
+
+	if (kiblnd_translate_mtu(*mtu) < 0) {
+		CERROR("Invalid ib_mtu %d, expected 256/512/1024/2048/4096\n",
+		       *mtu);
+		return -EINVAL;
+	}
+
+	/* per-peer credits: at least the default, at most the maximum,
+	 * and never more than the global credits */
+	if (*txcredits < IBLND_CREDITS_DEFAULT)
+		*txcredits = IBLND_CREDITS_DEFAULT;
+
+	if (*txcredits > IBLND_CREDITS_MAX)
+		*txcredits = IBLND_CREDITS_MAX;
+
+	if (*txcredits > *kiblnd_tunables.kib_credits)
+		*txcredits = *kiblnd_tunables.kib_credits;
+
+	/* high-water mark: between half the peer credits and one less
+	 * than the peer credits */
+	if (*hiw < *txcredits / 2)
+		*hiw = *txcredits / 2;
+
+	if (*hiw >= *txcredits)
+		*hiw = *txcredits - 1;
+
+	/* out-of-range map-on-demand disables the feature */
+	if (*ondemand < 0 || *ondemand > IBLND_MAX_RDMA_FRAGS)
+		*ondemand = 0;
+
+	/* mapping a single fragment makes no sense; use two */
+	if (*ondemand == 1)
+		*ondemand = 2;
+
+	/* unset concurrent sends: derive from the peer credits, doubled
+	 * when map-on-demand is enabled with a small fragment threshold */
+	if (*nsends == 0) {
+		int small_map = *ondemand > 0 &&
+				*ondemand <= IBLND_MAX_RDMA_FRAGS / 8;
+
+		*nsends = small_map ? (*txcredits) * 2 : (*txcredits);
+	}
+
+	if (*nsends > *txcredits * 2)
+		*nsends = *txcredits * 2;
+
+	if (*nsends < *txcredits / 2)
+		*nsends = *txcredits / 2;
+
+	if (*nsends < *txcredits) {
+		CWARN("Concurrent sends %d is lower than message queue size: %d, "
+		      "performance may drop slightly.\n",
+		      *nsends, *txcredits);
+	}
+
+	kiblnd_sysctl_init();
+	return 0;
+}
+
+/* Counterpart of kiblnd_tunables_init(): tears down the sysctl entries
+ * via kiblnd_sysctl_fini(). */
+void
+kiblnd_tunables_fini (void)
+{
+	kiblnd_sysctl_fini();
+}

diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/Makefile b/drivers/staging/lustre/lnet/klnds/socklnd/Makefile
new file mode 100644
index 0000000..6494b2b
--- /dev/null
+++ b/drivers/staging/lustre/lnet/klnds/socklnd/Makefile

@@ -0,0 +1,7 @@
+# Build ksocklnd.o as selected by CONFIG_LNET.
+obj-$(CONFIG_LNET) += ksocklnd.o
+
+# Objects that make up ksocklnd.
+ksocklnd-y := socklnd.o socklnd_cb.o socklnd_proto.o socklnd_modparams.o socklnd_lib-linux.o
+
+
+
+# Add the include directory two levels above this one to the search path.
+ccflags-y := -I$(src)/../../include

diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c
new file mode 100644
index 0000000..c826bf9
--- /dev/null
+++ b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c

@@ -0,0 +1,2902 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/klnds/socklnd/socklnd.c
+ *
+ * Author: Zach Brown <zab@zabbo.net>
+ * Author: Peter J. Braam <braam@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
+ * Author: Eric Barton <eric@bartonsoftware.com>
+ */
+
+#include "socklnd.h"
+
+/* NOTE(review): presumably the LND descriptor handed to LNet; its
+ * initialisation is not visible in this part of the file -- confirm. */
+lnd_t		   the_ksocklnd;
+/* Module-wide socklnd state.  The functions below use its ksnd_global_lock,
+ * the ksnd_peers[] hash table (ksnd_peer_hash_size buckets),
+ * ksnd_sched_info[] and the ksnd_connd_* lock/queue/waitq members. */
+ksock_nal_data_t	ksocknal_data;
+
+/*
+ * Find the interface of 'ni' whose address equals 'ip'.
+ *
+ * Returns a pointer into the net's ksnn_interfaces[] table, or NULL when
+ * none of the first ksnn_ninterfaces entries carries that address.
+ */
+ksock_interface_t *
+ksocknal_ip2iface(lnet_ni_t *ni, __u32 ip)
+{
+	ksock_net_t	*net = ni->ni_data;
+	int		idx = 0;
+
+	while (idx < net->ksnn_ninterfaces) {
+		ksock_interface_t *candidate;
+
+		LASSERT(idx < LNET_MAX_INTERFACES);
+
+		candidate = &net->ksnn_interfaces[idx];
+		if (candidate->ksni_ipaddr == ip)
+			return candidate;
+
+		idx++;
+	}
+
+	return NULL;
+}
+
+/*
+ * Allocate a route to 'ipaddr':'port'.
+ *
+ * The route is returned with a refcount of 1 (owned by the caller), not
+ * attached to any peer, not bound to a local interface and with all of its
+ * state flags and counters cleared.  Returns NULL if the allocation fails.
+ */
+ksock_route_t *
+ksocknal_create_route (__u32 ipaddr, int port)
+{
+	ksock_route_t *route;
+
+	LIBCFS_ALLOC (route, sizeof (*route));
+	if (route == NULL)
+		return (NULL);
+
+	atomic_set (&route->ksnr_refcount, 1);
+	route->ksnr_peer = NULL;
+	route->ksnr_retry_interval = 0;	 /* OK to connect at any time */
+	route->ksnr_ipaddr = ipaddr;
+	route->ksnr_port = port;
+	/* 0 means "not bound locally yet": ksocknal_associate_route_conn_locked()
+	 * and ksocknal_del_route_locked() both test ksnr_myipaddr against 0, so
+	 * set it explicitly like every other field here instead of depending on
+	 * whatever the allocator left in it. */
+	route->ksnr_myipaddr = 0;
+	route->ksnr_scheduled = 0;
+	route->ksnr_connecting = 0;
+	route->ksnr_connected = 0;
+	route->ksnr_deleted = 0;
+	route->ksnr_conn_count = 0;
+	route->ksnr_share_count = 0;
+
+	return (route);
+}
+
+/*
+ * Free a route whose refcount has already dropped to zero, giving back the
+ * reference it held on its peer (if it was ever attached to one).
+ */
+void
+ksocknal_destroy_route(ksock_route_t *route)
+{
+	ksock_peer_t *owner;
+
+	LASSERT(atomic_read(&route->ksnr_refcount) == 0);
+
+	owner = route->ksnr_peer;
+	if (owner != NULL)
+		ksocknal_peer_decref(owner);
+
+	LIBCFS_FREE(route, sizeof(*route));
+}
+
+/*
+ * Allocate and initialise a peer for 'id' on 'ni'.
+ *
+ * On success the new peer, holding one reference for the caller, is stored
+ * in *peerp, the net's peer count is bumped and 0 is returned.  Returns
+ * -ENOMEM if the allocation fails and -ESHUTDOWN if the net is flagged as
+ * shutting down; *peerp is left untouched on failure.
+ */
+int
+ksocknal_create_peer(ksock_peer_t **peerp, lnet_ni_t *ni, lnet_process_id_t id)
+{
+	ksock_net_t	*net = ni->ni_data;
+	ksock_peer_t	*new_peer;
+	int		shutting_down;
+
+	LASSERT(id.nid != LNET_NID_ANY);
+	LASSERT(id.pid != LNET_PID_ANY);
+	LASSERT(!in_interrupt());
+
+	LIBCFS_ALLOC(new_peer, sizeof(*new_peer));
+	if (new_peer == NULL)
+		return -ENOMEM;
+
+	/* start from an all-zero peer: NULL pointers, cleared flags */
+	memset(new_peer, 0, sizeof(*new_peer));
+
+	new_peer->ksnp_ni = ni;
+	new_peer->ksnp_id = id;
+	atomic_set(&new_peer->ksnp_refcount, 1);	/* the caller's ref */
+	new_peer->ksnp_closing = 0;
+	new_peer->ksnp_accepting = 0;
+	new_peer->ksnp_proto = NULL;
+	new_peer->ksnp_last_alive = 0;
+	new_peer->ksnp_zc_next_cookie = SOCKNAL_KEEPALIVE_PING + 1;
+
+	INIT_LIST_HEAD(&new_peer->ksnp_conns);
+	INIT_LIST_HEAD(&new_peer->ksnp_routes);
+	INIT_LIST_HEAD(&new_peer->ksnp_tx_queue);
+	INIT_LIST_HEAD(&new_peer->ksnp_zc_req_list);
+	spin_lock_init(&new_peer->ksnp_lock);
+
+	/* only count the peer if the net is not shutting down */
+	spin_lock_bh(&net->ksnn_lock);
+	shutting_down = (net->ksnn_shutdown != 0);
+	if (!shutting_down)
+		net->ksnn_npeers++;
+	spin_unlock_bh(&net->ksnn_lock);
+
+	if (shutting_down) {
+		LIBCFS_FREE(new_peer, sizeof(*new_peer));
+		CERROR("Can't create peer: network shutdown\n");
+		return -ESHUTDOWN;
+	}
+
+	*peerp = new_peer;
+	return 0;
+}
+
+/*
+ * Free a peer whose refcount has reached zero and drop it from its net's
+ * peer count.  The peer must have no conns, routes, queued transmits or
+ * pending zero-copy requests left, and must not be accepting.
+ */
+void
+ksocknal_destroy_peer(ksock_peer_t *peer)
+{
+	/* fetched up front: 'peer' is freed before the counter is dropped */
+	ksock_net_t *net = peer->ksnp_ni->ni_data;
+
+	CDEBUG(D_NET, "peer %s %p deleted\n",
+	       libcfs_id2str(peer->ksnp_id), peer);
+
+	LASSERT(atomic_read(&peer->ksnp_refcount) == 0);
+	LASSERT(peer->ksnp_accepting == 0);
+	LASSERT(list_empty(&peer->ksnp_conns));
+	LASSERT(list_empty(&peer->ksnp_routes));
+	LASSERT(list_empty(&peer->ksnp_tx_queue));
+	LASSERT(list_empty(&peer->ksnp_zc_req_list));
+
+	LIBCFS_FREE(peer, sizeof(*peer));
+
+	/* Connections and routes each hold a reference on their peer until
+	 * they are destroyed, so a refcount of zero means every piece of
+	 * state tied to this peer is already gone. */
+	spin_lock_bh(&net->ksnn_lock);
+	net->ksnn_npeers--;
+	spin_unlock_bh(&net->ksnn_lock);
+}
+
+/*
+ * Search the hash bucket selected by id.nid for the peer of 'ni' whose
+ * nid and pid both match 'id'.
+ *
+ * Returns the peer WITHOUT taking a reference, or NULL if there is none.
+ * The "_locked" suffix suggests the caller holds ksnd_global_lock, as
+ * ksocknal_find_peer() and ksocknal_add_peer() do.
+ */
+ksock_peer_t *
+ksocknal_find_peer_locked(lnet_ni_t *ni, lnet_process_id_t id)
+{
+	struct list_head	*bucket = ksocknal_nid2peerlist(id.nid);
+	ksock_peer_t		*peer;
+
+	list_for_each_entry(peer, bucket, ksnp_list) {
+		LASSERT(!peer->ksnp_closing);
+
+		if (peer->ksnp_ni == ni &&
+		    peer->ksnp_id.nid == id.nid &&
+		    peer->ksnp_id.pid == id.pid) {
+			CDEBUG(D_NET, "got peer [%p] -> %s (%d)\n",
+			       peer, libcfs_id2str(id),
+			       atomic_read(&peer->ksnp_refcount));
+			return peer;
+		}
+	}
+
+	return NULL;
+}
+
+/*
+ * Locked wrapper around ksocknal_find_peer_locked(): looks 'id' up under
+ * the global read lock and, if a peer is found, returns it with an extra
+ * reference taken on behalf of the caller.  Returns NULL otherwise.
+ */
+ksock_peer_t *
+ksocknal_find_peer(lnet_ni_t *ni, lnet_process_id_t id)
+{
+	ksock_peer_t *found;
+
+	read_lock(&ksocknal_data.ksnd_global_lock);
+
+	found = ksocknal_find_peer_locked(ni, id);
+	if (found != NULL)
+		ksocknal_peer_addref(found);	/* caller's reference */
+
+	read_unlock(&ksocknal_data.ksnd_global_lock);
+
+	return found;
+}
+
+/*
+ * Take a peer that has no conns and no routes out of the peer table.
+ *
+ * Every interface listed in the peer's passive IPs loses one from its
+ * ksni_npeers count, the peer is marked closing, removed from its hash
+ * list, and the reference held by that list is dropped.
+ */
+void
+ksocknal_unlink_peer_locked(ksock_peer_t *peer)
+{
+	int n;
+
+	for (n = 0; n < peer->ksnp_n_passive_ips; n++) {
+		ksock_interface_t *iface;
+
+		LASSERT(n < LNET_MAX_INTERFACES);
+
+		/* passive IPs are always taken from the interface list,
+		 * so this lookup cannot fail */
+		iface = ksocknal_ip2iface(peer->ksnp_ni,
+					  peer->ksnp_passive_ips[n]);
+		LASSERT(iface != NULL);
+
+		CDEBUG(D_NET, "peer=%p iface=%p ksni_nroutes=%d\n",
+		       peer, iface, iface->ksni_nroutes);
+		iface->ksni_npeers--;
+	}
+
+	LASSERT(list_empty(&peer->ksnp_conns));
+	LASSERT(list_empty(&peer->ksnp_routes));
+	LASSERT(!peer->ksnp_closing);
+
+	peer->ksnp_closing = 1;
+	list_del(&peer->ksnp_list);
+
+	/* the peer list no longer references it */
+	ksocknal_peer_decref(peer);
+}
+
+/*
+ * Report the index'th peer-table entry belonging to 'ni'.
+ *
+ * Entries are enumerated in hash-table order.  A peer with neither passive
+ * IPs nor routes counts as one entry (all address fields reported as 0);
+ * otherwise each passive IP counts as one entry (only *myip set) followed
+ * by one entry per route (all fields taken from the route).
+ *
+ * Returns 0 with the output parameters filled in, or -ENOENT when 'index'
+ * is past the last entry (outputs untouched).
+ */
+int
+ksocknal_get_peer_info(lnet_ni_t *ni, int index,
+		       lnet_process_id_t *id, __u32 *myip, __u32 *peer_ip,
+		       int *port, int *conn_count, int *share_count)
+{
+	ksock_peer_t	*peer;
+	ksock_route_t	*route;
+	int		bucket;
+	int		n;
+	int		rc = -ENOENT;
+
+	read_lock(&ksocknal_data.ksnd_global_lock);
+
+	for (bucket = 0; bucket < ksocknal_data.ksnd_peer_hash_size; bucket++) {
+		list_for_each_entry(peer, &ksocknal_data.ksnd_peers[bucket],
+				    ksnp_list) {
+			if (peer->ksnp_ni != ni)
+				continue;
+
+			/* bare peer: a single entry without addresses */
+			if (peer->ksnp_n_passive_ips == 0 &&
+			    list_empty(&peer->ksnp_routes)) {
+				if (index-- > 0)
+					continue;
+
+				*id = peer->ksnp_id;
+				*myip = 0;
+				*peer_ip = 0;
+				*port = 0;
+				*conn_count = 0;
+				*share_count = 0;
+				rc = 0;
+				goto out;
+			}
+
+			/* one entry per passive IP */
+			for (n = 0; n < peer->ksnp_n_passive_ips; n++) {
+				if (index-- > 0)
+					continue;
+
+				*id = peer->ksnp_id;
+				*myip = peer->ksnp_passive_ips[n];
+				*peer_ip = 0;
+				*port = 0;
+				*conn_count = 0;
+				*share_count = 0;
+				rc = 0;
+				goto out;
+			}
+
+			/* one entry per route */
+			list_for_each_entry(route, &peer->ksnp_routes,
+					    ksnr_list) {
+				if (index-- > 0)
+					continue;
+
+				*id = peer->ksnp_id;
+				*myip = route->ksnr_myipaddr;
+				*peer_ip = route->ksnr_ipaddr;
+				*port = route->ksnr_port;
+				*conn_count = route->ksnr_conn_count;
+				*share_count = route->ksnr_share_count;
+				rc = 0;
+				goto out;
+			}
+		}
+	}
+out:
+	read_unlock(&ksocknal_data.ksnd_global_lock);
+	return rc;
+}
+
+/*
+ * Tie 'conn' to 'route'.
+ *
+ * The conn takes a reference on the route.  If the conn's local address
+ * differs from the one the route is bound to, the route is (re)bound to
+ * the conn's local address and the per-interface route counts are moved
+ * accordingly.  The route is then marked connected for the conn's type,
+ * its conn count is bumped and its retry interval is reset.
+ */
+void
+ksocknal_associate_route_conn_locked(ksock_route_t *route, ksock_conn_t *conn)
+{
+	ksock_peer_t		*peer = route->ksnr_peer;
+	int			conn_type = conn->ksnc_type;
+	ksock_interface_t	*iface;
+
+	conn->ksnc_route = route;
+	ksocknal_route_addref(route);
+
+	if (route->ksnr_myipaddr != conn->ksnc_myipaddr) {
+		if (route->ksnr_myipaddr != 0) {
+			CDEBUG(D_NET, "Rebinding %s %u.%u.%u.%u from "
+			       "%u.%u.%u.%u to %u.%u.%u.%u\n",
+			       libcfs_id2str(peer->ksnp_id),
+			       HIPQUAD(route->ksnr_ipaddr),
+			       HIPQUAD(route->ksnr_myipaddr),
+			       HIPQUAD(conn->ksnc_myipaddr));
+
+			/* the old interface loses this route */
+			iface = ksocknal_ip2iface(route->ksnr_peer->ksnp_ni,
+						  route->ksnr_myipaddr);
+			if (iface != NULL)
+				iface->ksni_nroutes--;
+		} else {
+			/* initial route: it had no local binding so far */
+			CDEBUG(D_NET, "Binding %s %u.%u.%u.%u to %u.%u.%u.%u\n",
+			       libcfs_id2str(peer->ksnp_id),
+			       HIPQUAD(route->ksnr_ipaddr),
+			       HIPQUAD(conn->ksnc_myipaddr));
+		}
+
+		/* the new interface gains it */
+		route->ksnr_myipaddr = conn->ksnc_myipaddr;
+		iface = ksocknal_ip2iface(route->ksnr_peer->ksnp_ni,
+					  route->ksnr_myipaddr);
+		if (iface != NULL)
+			iface->ksni_nroutes++;
+	}
+
+	route->ksnr_connected |= (1 << conn_type);
+	route->ksnr_conn_count++;
+
+	/* a connection succeeded, so the next attempt need not wait */
+	route->ksnr_retry_interval = 0;
+}
+
+/*
+ * Attach a fresh, idle route to 'peer'.
+ *
+ * The route must not be attached, scheduled, connecting or connected, and
+ * the peer must not already own a route to the same address (that is a
+ * fatal LBUG).  The route takes a reference on the peer, the caller's
+ * reference on the route passes to the peer's route list, and every
+ * existing conn of the peer to the route's address is associated with it.
+ */
+void
+ksocknal_add_route_locked(ksock_peer_t *peer, ksock_route_t *route)
+{
+	ksock_route_t	*other;
+	ksock_conn_t	*conn;
+
+	LASSERT(!peer->ksnp_closing);
+	LASSERT(route->ksnr_peer == NULL);
+	LASSERT(!route->ksnr_scheduled);
+	LASSERT(!route->ksnr_connecting);
+	LASSERT(route->ksnr_connected == 0);
+
+	/* the route's address must be unique within this peer */
+	list_for_each_entry(other, &peer->ksnp_routes, ksnr_list) {
+		if (other->ksnr_ipaddr != route->ksnr_ipaddr)
+			continue;
+
+		CERROR ("Duplicate route %s %u.%u.%u.%u\n",
+			libcfs_id2str(peer->ksnp_id),
+			HIPQUAD(route->ksnr_ipaddr));
+		LBUG();
+	}
+
+	route->ksnr_peer = peer;
+	ksocknal_peer_addref(peer);
+
+	/* my reference on 'route' now belongs to the peer's route list */
+	list_add_tail(&route->ksnr_list, &peer->ksnp_routes);
+
+	/* no early exit: several conns (typed routes) may share the address */
+	list_for_each_entry(conn, &peer->ksnp_conns, ksnc_list) {
+		if (conn->ksnc_ipaddr == route->ksnr_ipaddr)
+			ksocknal_associate_route_conn_locked(route, conn);
+	}
+}
+
+/*
+ * Delete 'route' from its peer.
+ *
+ * Closes every conn of the peer that was created through this route,
+ * removes the route from the count of the interface it is bound to, marks
+ * it deleted, unlinks it and drops the peer list's reference.  If that
+ * leaves the peer with neither routes nor conns, the peer is unlinked too.
+ */
+void
+ksocknal_del_route_locked(ksock_route_t *route)
+{
+	ksock_peer_t		*peer = route->ksnr_peer;
+	ksock_interface_t	*iface;
+	ksock_conn_t		*conn;
+	ksock_conn_t		*next_conn;
+
+	LASSERT(!route->ksnr_deleted);
+
+	/* conns that belong to this route go down with it */
+	list_for_each_entry_safe(conn, next_conn, &peer->ksnp_conns,
+				 ksnc_list) {
+		if (conn->ksnc_route == route)
+			ksocknal_close_conn_locked(conn, 0);
+	}
+
+	if (route->ksnr_myipaddr != 0) {
+		iface = ksocknal_ip2iface(route->ksnr_peer->ksnp_ni,
+					  route->ksnr_myipaddr);
+		if (iface != NULL)
+			iface->ksni_nroutes--;
+	}
+
+	route->ksnr_deleted = 1;
+	list_del(&route->ksnr_list);
+	ksocknal_route_decref(route);		/* the peer list's ref */
+
+	/* last route gone from a peer without conns: unlink the peer */
+	if (list_empty(&peer->ksnp_routes) && list_empty(&peer->ksnp_conns))
+		ksocknal_unlink_peer_locked(peer);
+}
+
+/*
+ * Add an explicit route to 'ipaddr':'port' for peer 'id' on 'ni'.
+ *
+ * A peer and a route are allocated up front; whichever of them turns out
+ * to exist already is released again.  Either the new route or the
+ * existing route to that address has its share count incremented.
+ *
+ * Returns 0 on success, -EINVAL for a wildcard nid/pid, -ENOMEM if the
+ * route cannot be allocated, or the error from ksocknal_create_peer().
+ */
+int
+ksocknal_add_peer(lnet_ni_t *ni, lnet_process_id_t id, __u32 ipaddr, int port)
+{
+	ksock_peer_t	*peer;
+	ksock_peer_t	*existing_peer;
+	ksock_route_t	*route;
+	ksock_route_t	*cur;
+	ksock_route_t	*existing_route = NULL;
+	int		rc;
+
+	if (id.nid == LNET_NID_ANY || id.pid == LNET_PID_ANY)
+		return -EINVAL;
+
+	/* allocate both objects before taking the lock */
+	rc = ksocknal_create_peer(&peer, ni, id);
+	if (rc != 0)
+		return rc;
+
+	route = ksocknal_create_route(ipaddr, port);
+	if (route == NULL) {
+		ksocknal_peer_decref(peer);
+		return -ENOMEM;
+	}
+
+	write_lock_bh(&ksocknal_data.ksnd_global_lock);
+
+	/* the caller holds a ref on ni, so shutdown cannot have begun */
+	LASSERT(((ksock_net_t *) ni->ni_data)->ksnn_shutdown == 0);
+
+	existing_peer = ksocknal_find_peer_locked(ni, id);
+	if (existing_peer == NULL) {
+		/* my reference on the new peer goes to the peer table */
+		list_add_tail(&peer->ksnp_list, ksocknal_nid2peerlist(id.nid));
+	} else {
+		ksocknal_peer_decref(peer);
+		peer = existing_peer;
+	}
+
+	list_for_each_entry(cur, &peer->ksnp_routes, ksnr_list) {
+		if (cur->ksnr_ipaddr == ipaddr) {
+			existing_route = cur;
+			break;
+		}
+	}
+
+	if (existing_route == NULL) {
+		ksocknal_add_route_locked(peer, route);
+		route->ksnr_share_count++;
+	} else {
+		ksocknal_route_decref(route);
+		existing_route->ksnr_share_count++;
+	}
+
+	write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+	return 0;
+}
+
+/*
+ * Delete the routes of 'peer' that lead to 'ip' (all routes when ip == 0).
+ *
+ * If no remaining route has a non-zero share count afterwards, the
+ * remaining routes and all conns of the peer are torn down as well.
+ */
+void
+ksocknal_del_peer_locked(ksock_peer_t *peer, __u32 ip)
+{
+	ksock_route_t	*route;
+	ksock_route_t	*next_route;
+	ksock_conn_t	*conn;
+	ksock_conn_t	*next_conn;
+	int		shares = 0;
+
+	LASSERT(!peer->ksnp_closing);
+
+	/* hold the peer so it cannot vanish while I work on it */
+	ksocknal_peer_addref(peer);
+
+	list_for_each_entry_safe(route, next_route, &peer->ksnp_routes,
+				 ksnr_list) {
+		if (ip != 0 && route->ksnr_ipaddr != ip)
+			continue;		/* not selected */
+
+		route->ksnr_share_count = 0;
+		/* takes the route's conns down too */
+		ksocknal_del_route_locked(route);
+	}
+
+	list_for_each_entry(route, &peer->ksnp_routes, ksnr_list)
+		shares += route->ksnr_share_count;
+
+	if (shares == 0) {
+		/* no explicit entries remain: drop everything else */
+		list_for_each_entry_safe(route, next_route, &peer->ksnp_routes,
+					 ksnr_list) {
+			/* only auto-created routes can be left here */
+			LASSERT(route->ksnr_share_count == 0);
+			ksocknal_del_route_locked(route);
+		}
+
+		list_for_each_entry_safe(conn, next_conn, &peer->ksnp_conns,
+					 ksnc_list)
+			ksocknal_close_conn_locked(conn, 0);
+	}
+
+	ksocknal_peer_decref(peer);
+	/* NB the peer unlinks itself once its last conn/route is removed */
+}
+
+/*
+ * Delete routes to 'ip' (0 = any) from every peer of 'ni' matching 'id';
+ * LNET_NID_ANY / LNET_PID_ANY act as wildcards.
+ *
+ * Transmits still queued on a peer that ends up closing are collected and
+ * completed through ksocknal_txlist_done() after the global lock has been
+ * released.  Returns 0 if at least one peer matched, -ENOENT otherwise.
+ */
+int
+ksocknal_del_peer(lnet_ni_t *ni, lnet_process_id_t id, __u32 ip)
+{
+	LIST_HEAD	(zombies);
+	ksock_peer_t	*peer;
+	ksock_peer_t	*next_peer;
+	int		first;
+	int		last;
+	int		bucket;
+	int		rc = -ENOENT;
+
+	write_lock_bh(&ksocknal_data.ksnd_global_lock);
+
+	if (id.nid == LNET_NID_ANY) {
+		/* wildcard nid: scan the whole hash table */
+		first = 0;
+		last = ksocknal_data.ksnd_peer_hash_size - 1;
+	} else {
+		/* a specific nid lives in exactly one bucket */
+		first = (int)(ksocknal_nid2peerlist(id.nid) -
+			      ksocknal_data.ksnd_peers);
+		last = first;
+	}
+
+	for (bucket = first; bucket <= last; bucket++) {
+		list_for_each_entry_safe(peer, next_peer,
+					 &ksocknal_data.ksnd_peers[bucket],
+					 ksnp_list) {
+			if (peer->ksnp_ni != ni)
+				continue;
+
+			if (id.nid != LNET_NID_ANY &&
+			    peer->ksnp_id.nid != id.nid)
+				continue;
+
+			if (id.pid != LNET_PID_ANY &&
+			    peer->ksnp_id.pid != id.pid)
+				continue;
+
+			/* pin the peer across the deletion */
+			ksocknal_peer_addref(peer);
+
+			ksocknal_del_peer_locked(peer, ip);
+
+			if (peer->ksnp_closing &&
+			    !list_empty(&peer->ksnp_tx_queue)) {
+				LASSERT(list_empty(&peer->ksnp_conns));
+				LASSERT(list_empty(&peer->ksnp_routes));
+
+				list_splice_init(&peer->ksnp_tx_queue,
+						 &zombies);
+			}
+
+			ksocknal_peer_decref(peer);	/* unpin */
+
+			rc = 0;				/* had a match */
+		}
+	}
+
+	write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+	ksocknal_txlist_done(ni, &zombies, 1);
+
+	return rc;
+}
+
+/*
+ * Return the index'th connection of 'ni', counting conns peer by peer in
+ * hash-table order.  The conn is returned with an extra reference taken
+ * for the caller; NULL is returned when 'index' is out of range.
+ */
+ksock_conn_t *
+ksocknal_get_conn_by_idx(lnet_ni_t *ni, int index)
+{
+	ksock_peer_t	*peer;
+	ksock_conn_t	*conn;
+	ksock_conn_t	*found = NULL;
+	int		bucket;
+
+	read_lock(&ksocknal_data.ksnd_global_lock);
+
+	for (bucket = 0; bucket < ksocknal_data.ksnd_peer_hash_size; bucket++) {
+		list_for_each_entry(peer, &ksocknal_data.ksnd_peers[bucket],
+				    ksnp_list) {
+			LASSERT(!peer->ksnp_closing);
+
+			if (peer->ksnp_ni != ni)
+				continue;
+
+			list_for_each_entry(conn, &peer->ksnp_conns,
+					    ksnc_list) {
+				if (index-- > 0)
+					continue;
+
+				ksocknal_conn_addref(conn);
+				found = conn;
+				goto out;
+			}
+		}
+	}
+out:
+	read_unlock(&ksocknal_data.ksnd_global_lock);
+	return found;
+}
+
+/*
+ * Pick the scheduler of CPU partition 'cpt' that currently serves the
+ * fewest connections; the lowest-numbered one wins a tie.
+ */
+ksock_sched_t *
+ksocknal_choose_scheduler_locked(unsigned int cpt)
+{
+	struct ksock_sched_info	*info = ksocknal_data.ksnd_sched_info[cpt];
+	ksock_sched_t		*least_busy;
+	int			n;
+
+	LASSERT(info->ksi_nthreads > 0);
+
+	least_busy = &info->ksi_scheds[0];
+
+	/*
+	 * NB: fine for now, but once LNet can be reconfigured dynamically
+	 * info->ksi_nthreads may change at runtime and this scan will need
+	 * to cope with that.
+	 */
+	for (n = 1; n < info->ksi_nthreads; n++) {
+		ksock_sched_t *candidate = &info->ksi_scheds[n];
+
+		if (candidate->kss_nconns < least_busy->kss_nconns)
+			least_busy = candidate;
+	}
+
+	return least_busy;
+}
+
+/*
+ * Copy the addresses of all interfaces of 'ni' into ipaddrs[].
+ *
+ * Returns the number of addresses stored.  Nothing is stored and 0 is
+ * returned when the net has fewer than two interfaces, since extra
+ * interfaces are only offered when there is more than one.
+ */
+int
+ksocknal_local_ipvec(lnet_ni_t *ni, __u32 *ipaddrs)
+{
+	ksock_net_t	*net = ni->ni_data;
+	int		nip;
+	int		n;
+	int		stored = 0;
+
+	read_lock(&ksocknal_data.ksnd_global_lock);
+
+	nip = net->ksnn_ninterfaces;
+	LASSERT(nip <= LNET_MAX_INTERFACES);
+
+	if (nip >= 2) {
+		for (n = 0; n < nip; n++) {
+			ipaddrs[n] = net->ksnn_interfaces[n].ksni_ipaddr;
+			LASSERT(ipaddrs[n] != 0);
+		}
+		stored = nip;
+	}
+
+	read_unlock(&ksocknal_data.ksnd_global_lock);
+	return stored;
+}
+
+/*
+ * Find the entry of ips[0..nips-1] that best matches 'iface'.
+ *
+ * Zero entries are skipped (callers zero an entry to mark it used).  An
+ * address inside the interface's subnet (per ksni_netmask) beats one
+ * outside it; among equals the one with the smallest XOR distance to the
+ * interface address wins, earlier entries winning exact ties.
+ *
+ * Returns the index of the best entry.  At least one entry must be
+ * non-zero, otherwise the final LASSERT fires.
+ */
+int
+ksocknal_match_peerip (ksock_interface_t *iface, __u32 *ips, int nips)
+{
+	int   best_netmatch = 0;
+	__u32 best_xor      = 0;
+	int   best	  = -1;
+	__u32 this_xor;
+	int   this_netmatch;
+	int   i;
+
+	for (i = 0; i < nips; i++) {
+		if (ips[i] == 0)
+			continue;
+
+		/* Keep the distance unsigned: held in a signed int, a
+		 * difference in the top address bit would turn negative and
+		 * compare as "closer" than any other candidate. */
+		this_xor = (ips[i] ^ iface->ksni_ipaddr);
+		this_netmatch = ((this_xor & iface->ksni_netmask) == 0) ? 1 : 0;
+
+		if (!(best < 0 ||
+		      best_netmatch < this_netmatch ||
+		      (best_netmatch == this_netmatch &&
+		       best_xor > this_xor)))
+			continue;
+
+		best = i;
+		best_netmatch = this_netmatch;
+		best_xor = this_xor;
+	}
+
+	LASSERT (best >= 0);
+	return (best);
+}
+
+/*
+ * Choose which local interfaces 'peer' is offered for passive connections,
+ * given the addresses the peer advertised.
+ *
+ * peer      - peer whose ksnp_passive_ips[] list is extended as needed
+ * peerips   - in: the peer's addresses (entries are zeroed as they get
+ *	       matched); out: overwritten with the chosen local addresses
+ * n_peerips - number of entries in peerips[]
+ *
+ * Returns the number of local addresses written back to peerips[]:
+ * MIN(n_peerips, number of local interfaces), or 0 when this net has fewer
+ * than two interfaces.
+ */
+int
+ksocknal_select_ips(ksock_peer_t *peer, __u32 *peerips, int n_peerips)
+{
+	rwlock_t		*global_lock = &ksocknal_data.ksnd_global_lock;
+	ksock_net_t	*net = peer->ksnp_ni->ni_data;
+	ksock_interface_t  *iface;
+	ksock_interface_t  *best_iface;
+	int		 n_ips;
+	int		 i;
+	int		 j;
+	int		 k;
+	__u32	       ip;
+	__u32	       xor;
+	int		 this_netmatch;
+	int		 best_netmatch;
+	int		 best_npeers;
+
+	/* CAVEAT EMPTOR: We do all our interface matching with an
+	 * exclusive hold of global lock with bottom halves disabled
+	 * (write_lock_bh).  We're only expecting to be dealing with small
+	 * numbers of interfaces, so the O(n**3)-ness shouldn't matter */
+
+	/* Also note that I'm not going to return more than n_peerips
+	 * interfaces, even if I have more myself */
+
+	write_lock_bh(global_lock);
+
+	LASSERT (n_peerips <= LNET_MAX_INTERFACES);
+	LASSERT (net->ksnn_ninterfaces <= LNET_MAX_INTERFACES);
+
+	/* Only match interfaces for additional connections
+	 * if I have > 1 interface */
+	n_ips = (net->ksnn_ninterfaces < 2) ? 0 :
+		MIN(n_peerips, net->ksnn_ninterfaces);
+
+	for (i = 0; peer->ksnp_n_passive_ips < n_ips; i++) {
+		/*	      ^ yes really... the loop ends once enough
+		 * passive IPs have been recorded, not when 'i' hits a bound */
+
+		/* If we have any new interfaces, first tick off all the
+		 * peer IPs that match old interfaces, then choose new
+		 * interfaces to match the remaining peer IPS.
+		 * We don't forget interfaces we've stopped using; we might
+		 * start using them again... */
+
+		if (i < peer->ksnp_n_passive_ips) {
+			/* Old interface. */
+			ip = peer->ksnp_passive_ips[i];
+			best_iface = ksocknal_ip2iface(peer->ksnp_ni, ip);
+
+			/* peer passive ips are kept up to date */
+			LASSERT(best_iface != NULL);
+		} else {
+			/* choose a new interface */
+			LASSERT (i == peer->ksnp_n_passive_ips);
+
+			best_iface = NULL;
+			best_netmatch = 0;
+			best_npeers = 0;
+
+			for (j = 0; j < net->ksnn_ninterfaces; j++) {
+				iface = &net->ksnn_interfaces[j];
+				ip = iface->ksni_ipaddr;
+
+				for (k = 0; k < peer->ksnp_n_passive_ips; k++)
+					if (peer->ksnp_passive_ips[k] == ip)
+						break;
+
+				if (k < peer->ksnp_n_passive_ips) /* using it already */
+					continue;
+
+				/* prefer an interface on the same subnet as
+				 * its best-matching peer IP, then the one
+				 * serving the fewest peers */
+				k = ksocknal_match_peerip(iface, peerips, n_peerips);
+				xor = (ip ^ peerips[k]);
+				this_netmatch = ((xor & iface->ksni_netmask) == 0) ? 1 : 0;
+
+				if (!(best_iface == NULL ||
+				      best_netmatch < this_netmatch ||
+				      (best_netmatch == this_netmatch &&
+				       best_npeers > iface->ksni_npeers)))
+					continue;
+
+				best_iface = iface;
+				best_netmatch = this_netmatch;
+				best_npeers = iface->ksni_npeers;
+			}
+
+			/* Check BEFORE the pointer is used: asserting only
+			 * after the dereference could never catch a failed
+			 * selection. */
+			LASSERT (best_iface != NULL);
+
+			best_iface->ksni_npeers++;
+			ip = best_iface->ksni_ipaddr;
+			peer->ksnp_passive_ips[i] = ip;
+			peer->ksnp_n_passive_ips = i+1;
+		}
+
+		/* mark the best matching peer IP used */
+		j = ksocknal_match_peerip(best_iface, peerips, n_peerips);
+		peerips[j] = 0;
+	}
+
+	/* Overwrite input peer IP addresses */
+	memcpy(peerips, peer->ksnp_passive_ips, n_ips * sizeof(*peerips));
+
+	write_unlock_bh(global_lock);
+
+	return (n_ips);
+}
+
+/*
+ * Create additional routes from this node to the addresses a peer
+ * advertised, binding each new route to a local interface.
+ *
+ * peer          - peer the routes are added to
+ * port          - port stored in every route created here
+ * peer_ipaddrs  - the peer's addresses
+ * npeer_ipaddrs - number of entries in peer_ipaddrs[]
+ *
+ * Does nothing when this net has fewer than two interfaces.  Addresses the
+ * peer already has a route to are skipped, as are addresses for which no
+ * local interface is left that the peer's routes do not use yet.  Gives
+ * up silently if a route cannot be allocated or the peer starts closing.
+ */
+void
+ksocknal_create_routes(ksock_peer_t *peer, int port,
+		       __u32 *peer_ipaddrs, int npeer_ipaddrs)
+{
+	ksock_route_t       *newroute = NULL;
+	rwlock_t		*global_lock = &ksocknal_data.ksnd_global_lock;
+	lnet_ni_t	   *ni = peer->ksnp_ni;
+	ksock_net_t	 *net = ni->ni_data;
+	struct list_head	  *rtmp;
+	ksock_route_t       *route;
+	ksock_interface_t   *iface;
+	ksock_interface_t   *best_iface;
+	int		  best_netmatch;
+	int		  this_netmatch;
+	int		  best_nroutes;
+	int		  i;
+	int		  j;
+
+	/* CAVEAT EMPTOR: We do all our interface matching with an
+	 * exclusive hold of global lock at IRQ priority.  We're only
+	 * expecting to be dealing with small numbers of interfaces, so the
+	 * O(n**3)-ness here shouldn't matter */
+	/* NOTE(review): the lock is actually taken with write_lock_bh() below,
+	 * i.e. with bottom halves disabled rather than IRQs. */
+
+	write_lock_bh(global_lock);
+
+	if (net->ksnn_ninterfaces < 2) {
+		/* Only create additional connections
+		 * if I have > 1 interface */
+		write_unlock_bh(global_lock);
+		return;
+	}
+
+	LASSERT (npeer_ipaddrs <= LNET_MAX_INTERFACES);
+
+	for (i = 0; i < npeer_ipaddrs; i++) {
+		if (newroute != NULL) {
+			/* a route allocated for an earlier address went
+			 * unused: recycle it for this address */
+			newroute->ksnr_ipaddr = peer_ipaddrs[i];
+		} else {
+			/* the global lock is released around the allocation
+			 * and retaken afterwards */
+			write_unlock_bh(global_lock);
+
+			newroute = ksocknal_create_route(peer_ipaddrs[i], port);
+			if (newroute == NULL)
+				return;
+
+			write_lock_bh(global_lock);
+		}
+
+		if (peer->ksnp_closing) {
+			/* peer got closed under me */
+			break;
+		}
+
+		/* Already got a route? */
+		/* 'route' ends up non-NULL only if the scan stopped on a match */
+		route = NULL;
+		list_for_each(rtmp, &peer->ksnp_routes) {
+			route = list_entry(rtmp, ksock_route_t, ksnr_list);
+
+			if (route->ksnr_ipaddr == newroute->ksnr_ipaddr)
+				break;
+
+			route = NULL;
+		}
+		if (route != NULL)
+			continue;
+
+		best_iface = NULL;
+		best_nroutes = 0;
+		best_netmatch = 0;
+
+		LASSERT (net->ksnn_ninterfaces <= LNET_MAX_INTERFACES);
+
+		/* Select interface to connect from */
+		for (j = 0; j < net->ksnn_ninterfaces; j++) {
+			iface = &net->ksnn_interfaces[j];
+
+			/* Using this interface already? */
+			/* 'route' is NULL on entry to every pass: it was NULL
+			 * when this loop started and is reset whenever the
+			 * scan below runs off the end of the list */
+			list_for_each(rtmp, &peer->ksnp_routes) {
+				route = list_entry(rtmp, ksock_route_t,
+						       ksnr_list);
+
+				if (route->ksnr_myipaddr == iface->ksni_ipaddr)
+					break;
+
+				route = NULL;
+			}
+			if (route != NULL)
+				continue;
+
+			/* 1 if the peer address lies in this interface's
+			 * subnet, else 0 */
+			this_netmatch = (((iface->ksni_ipaddr ^
+					   newroute->ksnr_ipaddr) &
+					   iface->ksni_netmask) == 0) ? 1 : 0;
+
+			/* keep the current best unless this interface is the
+			 * first candidate, has a better subnet match, or has
+			 * an equal match with fewer routes */
+			if (!(best_iface == NULL ||
+			      best_netmatch < this_netmatch ||
+			      (best_netmatch == this_netmatch &&
+			       best_nroutes > iface->ksni_nroutes)))
+				continue;
+
+			best_iface = iface;
+			best_netmatch = this_netmatch;
+			best_nroutes = iface->ksni_nroutes;
+		}
+
+		/* every interface is already used by one of the peer's routes */
+		if (best_iface == NULL)
+			continue;
+
+		newroute->ksnr_myipaddr = best_iface->ksni_ipaddr;
+		best_iface->ksni_nroutes++;
+
+		/* the peer's route list takes over the reference on newroute;
+		 * clearing the pointer makes the next pass allocate afresh */
+		ksocknal_add_route_locked(peer, newroute);
+		newroute = NULL;
+	}
+
+	write_unlock_bh(global_lock);
+	/* drop a route that was allocated but never attached */
+	if (newroute != NULL)
+		ksocknal_route_decref(newroute);
+}
+
+/*
+ * Queue an incoming socket for the connection daemon.
+ *
+ * A connection request holding a reference on 'ni' and the socket is
+ * appended to ksnd_connd_connreqs and the daemon's wait queue is woken.
+ * Returns 0 on success or -ENOMEM if the request cannot be allocated.
+ */
+int
+ksocknal_accept(lnet_ni_t *ni, socket_t *sock)
+{
+	ksock_connreq_t	*req;
+	__u32		peer_ip;
+	int		peer_port;
+	int		rc;
+
+	/* the address is only needed for the error message below */
+	rc = libcfs_sock_getaddr(sock, 1, &peer_ip, &peer_port);
+	LASSERT(rc == 0);			/* it worked before */
+
+	LIBCFS_ALLOC(req, sizeof(*req));
+	if (req == NULL) {
+		LCONSOLE_ERROR_MSG(0x12f, "Dropping connection request from "
+				   "%u.%u.%u.%u: memory exhausted\n",
+				   HIPQUAD(peer_ip));
+		return -ENOMEM;
+	}
+
+	lnet_ni_addref(ni);
+	req->ksncr_ni = ni;
+	req->ksncr_sock = sock;
+
+	spin_lock_bh(&ksocknal_data.ksnd_connd_lock);
+	list_add_tail(&req->ksncr_list, &ksocknal_data.ksnd_connd_connreqs);
+	wake_up(&ksocknal_data.ksnd_connd_waitq);
+	spin_unlock_bh(&ksocknal_data.ksnd_connd_lock);
+
+	return 0;
+}
+
+/*
+ * Return the ksnr_connecting value of the peer's route to 'ipaddr', or 0
+ * if the peer has no route to that address.
+ */
+int
+ksocknal_connecting(ksock_peer_t *peer, __u32 ipaddr)
+{
+	struct list_head *pos;
+
+	list_for_each(pos, &peer->ksnp_routes) {
+		ksock_route_t *route = list_entry(pos, ksock_route_t,
+						  ksnr_list);
+
+		if (route->ksnr_ipaddr == ipaddr)
+			return route->ksnr_connecting;
+	}
+
+	return 0;
+}
+
+int
+ksocknal_create_conn (lnet_ni_t *ni, ksock_route_t *route,
+		      socket_t *sock, int type)
+{
+	rwlock_t		*global_lock = &ksocknal_data.ksnd_global_lock;
+	LIST_HEAD     (zombies);
+	lnet_process_id_t  peerid;
+	struct list_head	*tmp;
+	__u64	      incarnation;
+	ksock_conn_t      *conn;
+	ksock_conn_t      *conn2;
+	ksock_peer_t      *peer = NULL;
+	ksock_peer_t      *peer2;
+	ksock_sched_t     *sched;
+	ksock_hello_msg_t *hello;
+	int		   cpt;
+	ksock_tx_t	*tx;
+	ksock_tx_t	*txtmp;
+	int		rc;
+	int		active;
+	char	      *warn = NULL;
+
+	active = (route != NULL);
+
+	LASSERT (active == (type != SOCKLND_CONN_NONE));
+
+	LIBCFS_ALLOC(conn, sizeof(*conn));
+	if (conn == NULL) {
+		rc = -ENOMEM;
+		goto failed_0;
+	}
+
+	memset (conn, 0, sizeof (*conn));
+
+	conn->ksnc_peer = NULL;
+	conn->ksnc_route = NULL;
+	conn->ksnc_sock = sock;
+	/* 2 ref, 1 for conn, another extra ref prevents socket
+	 * being closed before establishment of connection */
+	atomic_set (&conn->ksnc_sock_refcount, 2);
+	conn->ksnc_type = type;
+	ksocknal_lib_save_callback(sock, conn);
+	atomic_set (&conn->ksnc_conn_refcount, 1); /* 1 ref for me */
+
+	conn->ksnc_rx_ready = 0;
+	conn->ksnc_rx_scheduled = 0;
+
+	INIT_LIST_HEAD (&conn->ksnc_tx_queue);
+	conn->ksnc_tx_ready = 0;
+	conn->ksnc_tx_scheduled = 0;
+	conn->ksnc_tx_carrier = NULL;
+	atomic_set (&conn->ksnc_tx_nob, 0);
+
+	LIBCFS_ALLOC(hello, offsetof(ksock_hello_msg_t,
+				     kshm_ips[LNET_MAX_INTERFACES]));
+	if (hello == NULL) {
+		rc = -ENOMEM;
+		goto failed_1;
+	}
+
+	/* stash conn's local and remote addrs */
+	rc = ksocknal_lib_get_conn_addrs (conn);
+	if (rc != 0)
+		goto failed_1;
+
+	/* Find out/confirm peer's NID and connection type and get the
+	 * vector of interfaces she's willing to let me connect to.
+	 * Passive connections use the listener timeout since the peer sends
+	 * eagerly */
+
+	if (active) {
+		peer = route->ksnr_peer;
+		LASSERT(ni == peer->ksnp_ni);
+
+		/* Active connection sends HELLO eagerly */
+		hello->kshm_nips = ksocknal_local_ipvec(ni, hello->kshm_ips);
+		peerid = peer->ksnp_id;
+
+		write_lock_bh(global_lock);
+		conn->ksnc_proto = peer->ksnp_proto;
+		write_unlock_bh(global_lock);
+
+		if (conn->ksnc_proto == NULL) {
+			 conn->ksnc_proto = &ksocknal_protocol_v3x;
+#if SOCKNAL_VERSION_DEBUG
+			 if (*ksocknal_tunables.ksnd_protocol == 2)
+				 conn->ksnc_proto = &ksocknal_protocol_v2x;
+			 else if (*ksocknal_tunables.ksnd_protocol == 1)
+				 conn->ksnc_proto = &ksocknal_protocol_v1x;
+#endif
+		}
+
+		rc = ksocknal_send_hello (ni, conn, peerid.nid, hello);
+		if (rc != 0)
+			goto failed_1;
+	} else {
+		peerid.nid = LNET_NID_ANY;
+		peerid.pid = LNET_PID_ANY;
+
+		/* Passive, get protocol from peer */
+		conn->ksnc_proto = NULL;
+	}
+
+	rc = ksocknal_recv_hello (ni, conn, hello, &peerid, &incarnation);
+	if (rc < 0)
+		goto failed_1;
+
+	LASSERT (rc == 0 || active);
+	LASSERT (conn->ksnc_proto != NULL);
+	LASSERT (peerid.nid != LNET_NID_ANY);
+
+	cpt = lnet_cpt_of_nid(peerid.nid);
+
+	if (active) {
+		ksocknal_peer_addref(peer);
+		write_lock_bh(global_lock);
+	} else {
+		rc = ksocknal_create_peer(&peer, ni, peerid);
+		if (rc != 0)
+			goto failed_1;
+
+		write_lock_bh(global_lock);
+
+		/* called with a ref on ni, so shutdown can't have started */
+		LASSERT (((ksock_net_t *) ni->ni_data)->ksnn_shutdown == 0);
+
+		peer2 = ksocknal_find_peer_locked(ni, peerid);
+		if (peer2 == NULL) {
+			/* NB this puts an "empty" peer in the peer
+			 * table (which takes my ref) */
+			list_add_tail(&peer->ksnp_list,
+					  ksocknal_nid2peerlist(peerid.nid));
+		} else {
+			ksocknal_peer_decref(peer);
+			peer = peer2;
+		}
+
+		/* +1 ref for me */
+		ksocknal_peer_addref(peer);
+		peer->ksnp_accepting++;
+
+		/* Am I already connecting to this guy?  Resolve in
+		 * favour of higher NID... */
+		if (peerid.nid < ni->ni_nid &&
+		    ksocknal_connecting(peer, conn->ksnc_ipaddr)) {
+			rc = EALREADY;
+			warn = "connection race resolution";
+			goto failed_2;
+		}
+	}
+
+	if (peer->ksnp_closing ||
+	    (active && route->ksnr_deleted)) {
+		/* peer/route got closed under me */
+		rc = -ESTALE;
+		warn = "peer/route removed";
+		goto failed_2;
+	}
+
+	if (peer->ksnp_proto == NULL) {
+		/* Never connected before.
+		 * NB recv_hello may have returned EPROTO to signal my peer
+		 * wants a different protocol than the one I asked for.
+		 */
+		LASSERT (list_empty(&peer->ksnp_conns));
+
+		peer->ksnp_proto = conn->ksnc_proto;
+		peer->ksnp_incarnation = incarnation;
+	}
+
+	if (peer->ksnp_proto != conn->ksnc_proto ||
+	    peer->ksnp_incarnation != incarnation) {
+		/* Peer rebooted or I've got the wrong protocol version */
+		ksocknal_close_peer_conns_locked(peer, 0, 0);
+
+		peer->ksnp_proto = NULL;
+		rc = ESTALE;
+		warn = peer->ksnp_incarnation != incarnation ?
+		       "peer rebooted" :
+		       "wrong proto version";
+		goto failed_2;
+	}
+
+	switch (rc) {
+	default:
+		LBUG();
+	case 0:
+		break;
+	case EALREADY:
+		warn = "lost conn race";
+		goto failed_2;
+	case EPROTO:
+		warn = "retry with different protocol version";
+		goto failed_2;
+	}
+
+	/* Refuse to duplicate an existing connection, unless this is a
+	 * loopback connection */
+	if (conn->ksnc_ipaddr != conn->ksnc_myipaddr) {
+		list_for_each(tmp, &peer->ksnp_conns) {
+			conn2 = list_entry(tmp, ksock_conn_t, ksnc_list);
+
+			if (conn2->ksnc_ipaddr != conn->ksnc_ipaddr ||
+			    conn2->ksnc_myipaddr != conn->ksnc_myipaddr ||
+			    conn2->ksnc_type != conn->ksnc_type)
+				continue;
+
+			/* Reply on a passive connection attempt so the peer
+			 * realises we're connected. */
+			LASSERT (rc == 0);
+			if (!active)
+				rc = EALREADY;
+
+			warn = "duplicate";
+			goto failed_2;
+		}
+	}
+
+	/* If the connection created by this route didn't bind to the IP
+	 * address the route connected to, the connection/route matching
+	 * code below probably isn't going to work. */
+	if (active &&
+	    route->ksnr_ipaddr != conn->ksnc_ipaddr) {
+		CERROR("Route %s %u.%u.%u.%u connected to %u.%u.%u.%u\n",
+		       libcfs_id2str(peer->ksnp_id),
+		       HIPQUAD(route->ksnr_ipaddr),
+		       HIPQUAD(conn->ksnc_ipaddr));
+	}
+
+	/* Search for a route corresponding to the new connection and
+	 * create an association.  This allows incoming connections created
+	 * by routes in my peer to match my own route entries so I don't
+	 * continually create duplicate routes. */
+	list_for_each (tmp, &peer->ksnp_routes) {
+		route = list_entry(tmp, ksock_route_t, ksnr_list);
+
+		if (route->ksnr_ipaddr != conn->ksnc_ipaddr)
+			continue;
+
+		ksocknal_associate_route_conn_locked(route, conn);
+		break;
+	}
+
+	conn->ksnc_peer = peer;		 /* conn takes my ref on peer */
+	peer->ksnp_last_alive = cfs_time_current();
+	peer->ksnp_send_keepalive = 0;
+	peer->ksnp_error = 0;
+
+	sched = ksocknal_choose_scheduler_locked(cpt);
+	sched->kss_nconns++;
+	conn->ksnc_scheduler = sched;
+
+	conn->ksnc_tx_last_post = cfs_time_current();
+	/* Set the deadline for the outgoing HELLO to drain */
+	conn->ksnc_tx_bufnob = cfs_sock_wmem_queued(sock);
+	conn->ksnc_tx_deadline = cfs_time_shift(*ksocknal_tunables.ksnd_timeout);
+	mb();   /* order with adding to peer's conn list */
+
+	list_add (&conn->ksnc_list, &peer->ksnp_conns);
+	ksocknal_conn_addref(conn);
+
+	ksocknal_new_packet(conn, 0);
+
+	conn->ksnc_zc_capable = ksocknal_lib_zc_capable(conn);
+
+	/* Take packets blocking for this connection. */
+	list_for_each_entry_safe(tx, txtmp, &peer->ksnp_tx_queue, tx_list) {
+		if (conn->ksnc_proto->pro_match_tx(conn, tx, tx->tx_nonblk) == SOCKNAL_MATCH_NO)
+				continue;
+
+		list_del (&tx->tx_list);
+		ksocknal_queue_tx_locked (tx, conn);
+	}
+
+	write_unlock_bh(global_lock);
+
+	/* We've now got a new connection.  Any errors from here on are just
+	 * like "normal" comms errors and we close the connection normally.
+	 * NB (a) we still have to send the reply HELLO for passive
+	 *	connections,
+	 *    (b) normal I/O on the conn is blocked until I setup and call the
+	 *	socket callbacks.
+	 */
+
+	CDEBUG(D_NET, "New conn %s p %d.x %u.%u.%u.%u -> %u.%u.%u.%u/%d"
+	       " incarnation:"LPD64" sched[%d:%d]\n",
+	       libcfs_id2str(peerid), conn->ksnc_proto->pro_version,
+	       HIPQUAD(conn->ksnc_myipaddr), HIPQUAD(conn->ksnc_ipaddr),
+	       conn->ksnc_port, incarnation, cpt,
+	       (int)(sched - &sched->kss_info->ksi_scheds[0]));
+
+	if (active) {
+		/* additional routes after interface exchange? */
+		ksocknal_create_routes(peer, conn->ksnc_port,
+				       hello->kshm_ips, hello->kshm_nips);
+	} else {
+		hello->kshm_nips = ksocknal_select_ips(peer, hello->kshm_ips,
+						       hello->kshm_nips);
+		rc = ksocknal_send_hello(ni, conn, peerid.nid, hello);
+	}
+
+	LIBCFS_FREE(hello, offsetof(ksock_hello_msg_t,
+				    kshm_ips[LNET_MAX_INTERFACES]));
+
+	/* setup the socket AFTER I've received hello (it disables
+	 * SO_LINGER).  I might call back to the acceptor who may want
+	 * to send a protocol version response and then close the
+	 * socket; this ensures the socket only tears down after the
+	 * response has been sent. */
+	if (rc == 0)
+		rc = ksocknal_lib_setup_sock(sock);
+
+	write_lock_bh(global_lock);
+
+	/* NB my callbacks block while I hold ksnd_global_lock */
+	ksocknal_lib_set_callback(sock, conn);
+
+	if (!active)
+		peer->ksnp_accepting--;
+
+	write_unlock_bh(global_lock);
+
+	if (rc != 0) {
+		write_lock_bh(global_lock);
+		if (!conn->ksnc_closing) {
+			/* could be closed by another thread */
+			ksocknal_close_conn_locked(conn, rc);
+		}
+		write_unlock_bh(global_lock);
+	} else if (ksocknal_connsock_addref(conn) == 0) {
+		/* Allow I/O to proceed. */
+		ksocknal_read_callback(conn);
+		ksocknal_write_callback(conn);
+		ksocknal_connsock_decref(conn);
+	}
+
+	ksocknal_connsock_decref(conn);
+	ksocknal_conn_decref(conn);
+	return rc;
+
+ failed_2:
+	if (!peer->ksnp_closing &&
+	    list_empty (&peer->ksnp_conns) &&
+	    list_empty (&peer->ksnp_routes)) {
+		list_add(&zombies, &peer->ksnp_tx_queue);
+		list_del_init(&peer->ksnp_tx_queue);
+		ksocknal_unlink_peer_locked(peer);
+	}
+
+	write_unlock_bh(global_lock);
+
+	if (warn != NULL) {
+		if (rc < 0)
+			CERROR("Not creating conn %s type %d: %s\n",
+			       libcfs_id2str(peerid), conn->ksnc_type, warn);
+		else
+			CDEBUG(D_NET, "Not creating conn %s type %d: %s\n",
+			      libcfs_id2str(peerid), conn->ksnc_type, warn);
+	}
+
+	if (!active) {
+		if (rc > 0) {
+			/* Request retry by replying with CONN_NONE
+			 * ksnc_proto has been set already */
+			conn->ksnc_type = SOCKLND_CONN_NONE;
+			hello->kshm_nips = 0;
+			ksocknal_send_hello(ni, conn, peerid.nid, hello);
+		}
+
+		write_lock_bh(global_lock);
+		peer->ksnp_accepting--;
+		write_unlock_bh(global_lock);
+	}
+
+	ksocknal_txlist_done(ni, &zombies, 1);
+	ksocknal_peer_decref(peer);
+
+ failed_1:
+	if (hello != NULL)
+		LIBCFS_FREE(hello, offsetof(ksock_hello_msg_t,
+					    kshm_ips[LNET_MAX_INTERFACES]));
+
+	LIBCFS_FREE (conn, sizeof(*conn));
+
+ failed_0:
+	libcfs_sock_release(sock);
+	return rc;
+}
+
+void
+ksocknal_close_conn_locked (ksock_conn_t *conn, int error)
+{
+	/* This just does the immediate housekeeping, and queues the
+	 * connection for the reaper to terminate.
+	 * Caller holds ksnd_global_lock exclusively in irq context
+	 *
+	 * In order: flag the conn as closing, unlink it from its peer's
+	 * conn list, drop its route association (if any), and finally move
+	 * it onto ksnd_deathrow_conns and wake the reaper.  If this was the
+	 * peer's last conn, 'error' is stashed in peer->ksnp_error and the
+	 * peer's protocol is reset; a peer left with no routes is unlinked. */
+	ksock_peer_t      *peer = conn->ksnc_peer;
+	ksock_route_t     *route;
+	ksock_conn_t      *conn2;
+	struct list_head	*tmp;
+
+	LASSERT (peer->ksnp_error == 0);
+	LASSERT (!conn->ksnc_closing);
+	conn->ksnc_closing = 1;
+
+	/* ksnd_deathrow_conns takes over peer's ref */
+	list_del (&conn->ksnc_list);
+
+	route = conn->ksnc_route;
+	if (route != NULL) {
+		/* dissociate conn from route...
+		 * The route only stays marked as connected for this conn
+		 * type if another conn of the same type still uses it. */
+		LASSERT (!route->ksnr_deleted);
+		LASSERT ((route->ksnr_connected & (1 << conn->ksnc_type)) != 0);
+
+		conn2 = NULL;
+		list_for_each(tmp, &peer->ksnp_conns) {
+			conn2 = list_entry(tmp, ksock_conn_t, ksnc_list);
+
+			if (conn2->ksnc_route == route &&
+			    conn2->ksnc_type == conn->ksnc_type)
+				break;
+
+			conn2 = NULL; /* no match: don't leak a stale pointer out of the loop */
+		}
+		if (conn2 == NULL) /* no sibling of this type left on the route */
+			route->ksnr_connected &= ~(1 << conn->ksnc_type);
+
+		conn->ksnc_route = NULL;
+
+#if 0	   /* irrelevant with only eager routes */
+		/* make route least favourite */
+		list_del (&route->ksnr_list);
+		list_add_tail (&route->ksnr_list, &peer->ksnp_routes);
+#endif
+		ksocknal_route_decref(route);     /* drop conn's ref on route */
+	}
+
+	if (list_empty (&peer->ksnp_conns)) {
+		/* No more connections to this peer */
+
+		if (!list_empty(&peer->ksnp_tx_queue)) {
+			ksock_tx_t *tx;
+
+			LASSERT (conn->ksnc_proto == &ksocknal_protocol_v3x);
+
+			/* throw them to the last connection...,
+			 * these TXs will be sent to /dev/null by scheduler */
+			list_for_each_entry(tx, &peer->ksnp_tx_queue,
+						tx_list)
+				ksocknal_tx_prep(conn, tx);
+
+			/* the conn's tx queue is modified under its scheduler's lock */
+			spin_lock_bh(&conn->ksnc_scheduler->kss_lock);
+			list_splice_init(&peer->ksnp_tx_queue,
+					     &conn->ksnc_tx_queue);
+			spin_unlock_bh(&conn->ksnc_scheduler->kss_lock);
+		}
+
+		peer->ksnp_proto = NULL;	/* renegotiate protocol version */
+		peer->ksnp_error = error;       /* stash last conn close reason */
+
+		if (list_empty (&peer->ksnp_routes)) {
+			/* I've just closed last conn belonging to a
+			 * peer with no routes to it */
+			ksocknal_unlink_peer_locked (peer);
+		}
+	}
+
+	spin_lock_bh(&ksocknal_data.ksnd_reaper_lock);
+
+	list_add_tail(&conn->ksnc_list,
+			  &ksocknal_data.ksnd_deathrow_conns);
+	wake_up(&ksocknal_data.ksnd_reaper_waitq);
+
+	spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock);
+}
+
+void
+ksocknal_peer_failed (ksock_peer_t *peer)
+{
+	cfs_time_t	when = 0;
+	int		dead;
+
+	/* Called after a connection failure or comms error.  LNET is only
+	 * told the peer is dead when the peer is another kernel (its pid has
+	 * no LNET_PID_USERFLAG) and it has no connections, no accepts in
+	 * progress and no route that is currently connecting. */
+
+	read_lock(&ksocknal_data.ksnd_global_lock);
+
+	dead = (peer->ksnp_id.pid & LNET_PID_USERFLAG) == 0 &&
+	       list_empty(&peer->ksnp_conns) &&
+	       peer->ksnp_accepting == 0 &&
+	       ksocknal_find_connecting_route_locked(peer) == NULL;
+	if (dead)
+		when = peer->ksnp_last_alive;
+
+	read_unlock(&ksocknal_data.ksnd_global_lock);
+
+	/* notification happens after the global lock has been dropped */
+	if (!dead)
+		return;
+
+	lnet_notify (peer->ksnp_ni, peer->ksnp_id.nid, 0, when);
+}
+
+void
+ksocknal_finalize_zcreq(ksock_conn_t *conn)
+{
+	ksock_peer_t	*owner = conn->ksnc_peer;
+	ksock_tx_t	*cur;
+	ksock_tx_t	*nxt;
+	LIST_HEAD	(aborted);
+
+	/* Abort every zero-copy request of this conn that is still waiting
+	 * on the peer's ksnp_zc_req_list.
+	 * NB finalizing the TXs is safe because the socket is already gone,
+	 * which aborts all buffered data */
+	LASSERT (conn->ksnc_sock == NULL);
+
+	spin_lock(&owner->ksnp_lock);
+
+	list_for_each_entry_safe(cur, nxt, &owner->ksnp_zc_req_list, tx_zc_list) {
+		if (cur->tx_conn == conn) {
+			LASSERT (cur->tx_msg.ksm_zc_cookies[0] != 0);
+
+			cur->tx_msg.ksm_zc_cookies[0] = 0;
+			cur->tx_zc_aborted = 1; /* mark it as not-acked */
+			list_del(&cur->tx_zc_list);
+			list_add(&cur->tx_zc_list, &aborted);
+		}
+	}
+
+	spin_unlock(&owner->ksnp_lock);
+
+	/* the refs are dropped only after ksnp_lock has been released */
+	while (!list_empty(&aborted)) {
+		cur = list_entry(aborted.next, ksock_tx_t, tx_zc_list);
+
+		list_del(&cur->tx_zc_list);
+		ksocknal_tx_decref(cur);
+	}
+}
+
+void
+ksocknal_terminate_conn (ksock_conn_t *conn)
+{
+	/* This gets called by the reaper (guaranteed thread context) to
+	 * disengage the socket from its callbacks and close it.
+	 * ksnc_refcount will eventually hit zero, and then the reaper will
+	 * destroy it.
+	 *
+	 * The conn must already be flagged closing (asserted below).  If the
+	 * peer's last conn closed with an error, ksocknal_peer_failed() is
+	 * called once the global lock has been dropped. */
+	ksock_peer_t     *peer = conn->ksnc_peer;
+	ksock_sched_t    *sched = conn->ksnc_scheduler;
+	int	       failed = 0;
+
+	LASSERT(conn->ksnc_closing);
+
+	/* wake up the scheduler to "send" all remaining packets to /dev/null */
+	spin_lock_bh(&sched->kss_lock);
+
+	/* a closing conn is always ready to tx */
+	conn->ksnc_tx_ready = 1;
+
+	if (!conn->ksnc_tx_scheduled && /* not already on the scheduler's tx list */
+	    !list_empty(&conn->ksnc_tx_queue)){
+		list_add_tail (&conn->ksnc_tx_list,
+			       &sched->kss_tx_conns);
+		conn->ksnc_tx_scheduled = 1;
+		/* extra ref for scheduler */
+		ksocknal_conn_addref(conn);
+
+		wake_up (&sched->kss_waitq);
+	}
+
+	spin_unlock_bh(&sched->kss_lock);
+
+	/* serialise with callbacks */
+	write_lock_bh(&ksocknal_data.ksnd_global_lock);
+
+	ksocknal_lib_reset_callback(conn->ksnc_sock, conn);
+
+	/* OK, so this conn may not be completely disengaged from its
+	 * scheduler yet, but it _has_ committed to terminate... */
+	conn->ksnc_scheduler->kss_nconns--;
+
+	if (peer->ksnp_error != 0) {
+		/* peer's last conn closed in error */
+		LASSERT (list_empty (&peer->ksnp_conns));
+		failed = 1;
+		peer->ksnp_error = 0;     /* avoid multiple notifications */
+	}
+
+	write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+	if (failed) /* deferred until the write lock above was released */
+		ksocknal_peer_failed(peer);
+
+	/* The socket is closed on the final put; either here, or in
+	 * ksocknal_{send,recv}msg().  Since we set up the linger2 option
+	 * when the connection was established, this will close the socket
+	 * immediately, aborting anything buffered in it. Any hung
+	 * zero-copy transmits will therefore complete in finite time. */
+	ksocknal_connsock_decref(conn);
+}
+
+void
+ksocknal_queue_zombie_conn (ksock_conn_t *conn)
+{
+	/* Queue the conn for the reaper to destroy
+	 *
+	 * The conn's refcount must already be zero (asserted).  The conn is
+	 * appended to ksnd_zombie_conns and the reaper's waitq is woken,
+	 * both under ksnd_reaper_lock. */
+
+	LASSERT(atomic_read(&conn->ksnc_conn_refcount) == 0);
+	spin_lock_bh(&ksocknal_data.ksnd_reaper_lock);
+
+	list_add_tail(&conn->ksnc_list, &ksocknal_data.ksnd_zombie_conns);
+	wake_up(&ksocknal_data.ksnd_reaper_waitq);
+
+	spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock);
+}
+
+void
+ksocknal_destroy_conn (ksock_conn_t *conn)
+{
+	cfs_time_t      last_rcv;
+
+	/* Final coup-de-grace of the reaper
+	 *
+	 * Asserts that the conn is fully quiesced (both refcounts zero, no
+	 * socket, no route, not scheduled, empty tx queue), reports any
+	 * receive that was cut short, then drops the conn's ref on its peer
+	 * and frees the conn itself. */
+	CDEBUG (D_NET, "connection %p\n", conn);
+
+	LASSERT (atomic_read (&conn->ksnc_conn_refcount) == 0);
+	LASSERT (atomic_read (&conn->ksnc_sock_refcount) == 0);
+	LASSERT (conn->ksnc_sock == NULL);
+	LASSERT (conn->ksnc_route == NULL);
+	LASSERT (!conn->ksnc_tx_scheduled);
+	LASSERT (!conn->ksnc_rx_scheduled);
+	LASSERT (list_empty(&conn->ksnc_tx_queue));
+
+	/* complete current receive if any */
+	switch (conn->ksnc_rx_state) {
+	case SOCKNAL_RX_LNET_PAYLOAD: /* payload cut short: finalize the message with -EIO */
+		last_rcv = conn->ksnc_rx_deadline - /* deadline minus the timeout, used as the last receive time */
+			   cfs_time_seconds(*ksocknal_tunables.ksnd_timeout);
+		CERROR("Completing partial receive from %s[%d]"
+		       ", ip %d.%d.%d.%d:%d, with error, wanted: %d, left: %d, "
+		       "last alive is %ld secs ago\n",
+		       libcfs_id2str(conn->ksnc_peer->ksnp_id), conn->ksnc_type,
+		       HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port,
+		       conn->ksnc_rx_nob_wanted, conn->ksnc_rx_nob_left,
+		       cfs_duration_sec(cfs_time_sub(cfs_time_current(),
+					last_rcv)));
+		lnet_finalize (conn->ksnc_peer->ksnp_ni,
+			       conn->ksnc_cookie, -EIO);
+		break;
+	case SOCKNAL_RX_LNET_HEADER: /* header states only log, and only if a receive had started */
+		if (conn->ksnc_rx_started)
+			CERROR("Incomplete receive of lnet header from %s"
+			       ", ip %d.%d.%d.%d:%d, with error, protocol: %d.x.\n",
+			       libcfs_id2str(conn->ksnc_peer->ksnp_id),
+			       HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port,
+			       conn->ksnc_proto->pro_version);
+		break;
+	case SOCKNAL_RX_KSM_HEADER:
+		if (conn->ksnc_rx_started)
+			CERROR("Incomplete receive of ksock message from %s"
+			       ", ip %d.%d.%d.%d:%d, with error, protocol: %d.x.\n",
+			       libcfs_id2str(conn->ksnc_peer->ksnp_id),
+			       HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port,
+			       conn->ksnc_proto->pro_version);
+		break;
+	case SOCKNAL_RX_SLOP:
+		if (conn->ksnc_rx_started)
+			CERROR("Incomplete receive of slops from %s"
+			       ", ip %d.%d.%d.%d:%d, with error\n",
+			       libcfs_id2str(conn->ksnc_peer->ksnp_id),
+			       HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port);
+	       break;
+	default: /* any other rx state is treated as a bug */
+		LBUG ();
+		break;
+	}
+
+	ksocknal_peer_decref(conn->ksnc_peer);
+
+	LIBCFS_FREE (conn, sizeof (*conn));
+}
+
+int
+ksocknal_close_peer_conns_locked (ksock_peer_t *peer, __u32 ipaddr, int why)
+{
+	/* Close the peer's connections to 'ipaddr', or all of them when
+	 * 'ipaddr' is 0, passing 'why' as the close reason.
+	 * Returns the number of connections closed. */
+	struct list_head	*pos;
+	struct list_head	*safe;
+	ksock_conn_t		*victim;
+	int			 nclosed = 0;
+
+	/* closing a conn unlinks it from ksnp_conns, hence the safe walk */
+	list_for_each_safe (pos, safe, &peer->ksnp_conns) {
+		victim = list_entry (pos, ksock_conn_t, ksnc_list);
+
+		if (ipaddr != 0 && victim->ksnc_ipaddr != ipaddr)
+			continue;
+
+		nclosed++;
+		ksocknal_close_conn_locked (victim, why);
+	}
+
+	return nclosed;
+}
+
+int
+ksocknal_close_conn_and_siblings (ksock_conn_t *conn, int why)
+{
+	/* Close 'conn' together with every other connection its peer has to
+	 * the same remote IP address.  Returns how many were closed. */
+	ksock_peer_t	*owner = conn->ksnc_peer;
+	__u32		 remote_ip = conn->ksnc_ipaddr;
+	int		 nclosed;
+
+	write_lock_bh(&ksocknal_data.ksnd_global_lock);
+	nclosed = ksocknal_close_peer_conns_locked (owner, remote_ip, why);
+	write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+	return nclosed;
+}
+
+int
+ksocknal_close_matching_conns (lnet_process_id_t id, __u32 ipaddr)
+{
+	/* Close the connections to 'ipaddr' (0 == any address) of every
+	 * peer matching 'id'; LNET_NID_ANY / LNET_PID_ANY act as wildcards.
+	 * Returns 0 when any wildcard was given or something was closed,
+	 * -ENOENT otherwise. */
+	struct list_head	*pos;
+	struct list_head	*safe;
+	ksock_peer_t		*peer;
+	int			 first;
+	int			 last;
+	int			 bucket;
+	int			 nclosed = 0;
+
+	write_lock_bh(&ksocknal_data.ksnd_global_lock);
+
+	if (id.nid == LNET_NID_ANY) {
+		/* wildcard NID: scan the whole peer hash table */
+		first = 0;
+		last = ksocknal_data.ksnd_peer_hash_size - 1;
+	} else {
+		/* specific NID: only its own hash bucket can hold it */
+		first = (int)(ksocknal_nid2peerlist(id.nid) - ksocknal_data.ksnd_peers);
+		last = first;
+	}
+
+	for (bucket = first; bucket <= last; bucket++) {
+		list_for_each_safe (pos, safe,
+				    &ksocknal_data.ksnd_peers[bucket]) {
+			peer = list_entry (pos, ksock_peer_t, ksnp_list);
+
+			if (id.nid != LNET_NID_ANY &&
+			    id.nid != peer->ksnp_id.nid)
+				continue;
+
+			if (id.pid != LNET_PID_ANY &&
+			    id.pid != peer->ksnp_id.pid)
+				continue;
+
+			nclosed += ksocknal_close_peer_conns_locked (peer, ipaddr, 0);
+		}
+	}
+
+	write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+	/* wildcards always succeed */
+	if (id.nid == LNET_NID_ANY || id.pid == LNET_PID_ANY || ipaddr == 0)
+		return 0;
+
+	if (nclosed == 0)
+		return -ENOENT;
+
+	return 0;
+}
+
+void
+ksocknal_notify (lnet_ni_t *ni, lnet_nid_t gw_nid, int alive)
+{
+	/* The router is reporting a change in the state of gateway
+	 * 'gw_nid'.  'ni' is not used here. */
+	lnet_process_id_t  gw = {0};
+
+	gw.nid = gw_nid;
+	gw.pid = LNET_PID_ANY;
+
+	CDEBUG (D_NET, "gw %s %s\n", libcfs_nid2str(gw_nid),
+		alive ? "up" : "down");
+
+	/* Nothing to do for a live gateway.  We can only establish new
+	 * connections if we have autoroutes, and these connect on demand. */
+	if (alive)
+		return;
+
+	/* The gateway went down: close all open connections to it,
+	 * whatever the pid or remote address. */
+	ksocknal_close_matching_conns (gw, 0);
+}
+
+void
+ksocknal_query (lnet_ni_t *ni, lnet_nid_t nid, cfs_time_t *when)
+{	/* Report when the peer at 'nid' (pid LUSTRE_SRV_LNET_PID) was last
+	 * known to be alive and, unless the peer exists and has no
+	 * connectable route, add it and launch connections to it.
+	 * '*when' is only written if a non-zero last-alive time was found;
+	 * otherwise it is left untouched. */
+	int		connect = 1;
+	cfs_time_t	 last_alive = 0;
+	cfs_time_t	 now = cfs_time_current();
+	ksock_peer_t      *peer = NULL;
+	rwlock_t		*glock = &ksocknal_data.ksnd_global_lock;
+	lnet_process_id_t  id = {.nid = nid, .pid = LUSTRE_SRV_LNET_PID};
+
+	read_lock(glock);
+
+	peer = ksocknal_find_peer_locked(ni, id);
+	if (peer != NULL) {
+		struct list_head       *tmp;
+		ksock_conn_t     *conn;
+		int	       bufnob;
+
+		list_for_each (tmp, &peer->ksnp_conns) {
+			conn = list_entry(tmp, ksock_conn_t, ksnc_list);
+			bufnob = cfs_sock_wmem_queued(conn->ksnc_sock); /* bytes currently queued on the socket */
+
+			if (bufnob < conn->ksnc_tx_bufnob) { /* fewer than at the last sample */
+				/* something got ACKed */
+				conn->ksnc_tx_deadline =
+					cfs_time_shift(*ksocknal_tunables.ksnd_timeout);
+				peer->ksnp_last_alive = now;
+				conn->ksnc_tx_bufnob = bufnob;
+			}
+		}
+
+		last_alive = peer->ksnp_last_alive;
+		if (ksocknal_find_connectable_route_locked(peer) == NULL)
+			connect = 0; /* known peer with nothing left to connect */
+	}
+
+	read_unlock(glock);
+
+	if (last_alive != 0)
+		*when = last_alive;
+
+	/* NB 'peer' is only printed as a pointer here; the lock is dropped */
+	CDEBUG(D_NET, "Peer %s %p, alive %ld secs ago, connect %d\n",
+	       libcfs_nid2str(nid), peer,
+	       last_alive ? cfs_duration_sec(now - last_alive) : -1,
+	       connect);
+
+	if (!connect)
+		return;
+
+	ksocknal_add_peer(ni, id, LNET_NIDADDR(nid), lnet_acceptor_port()); /* return value is not checked */
+
+	write_lock_bh(glock);
+
+	peer = ksocknal_find_peer_locked(ni, id); /* looked up again: the lock was dropped above */
+	if (peer != NULL)
+		ksocknal_launch_all_connections_locked(peer);
+
+	write_unlock_bh(glock);
+	return;
+}
+
+void
+ksocknal_push_peer (ksock_peer_t *peer)
+{
+	/* Push each of the peer's connections in turn.  The global lock is
+	 * not held across ksocknal_lib_push_conn(), so every pass re-walks
+	 * the conn list to find the nth entry and pins it with a ref. */
+	struct list_head	*pos;
+	ksock_conn_t		*target;
+	int			 nth;
+	int			 seen;
+
+	for (nth = 0; ; nth++) {
+		read_lock(&ksocknal_data.ksnd_global_lock);
+
+		target = NULL;
+		seen = 0;
+
+		list_for_each (pos, &peer->ksnp_conns) {
+			if (seen++ != nth)
+				continue;
+
+			target = list_entry (pos, ksock_conn_t, ksnc_list);
+			ksocknal_conn_addref(target);
+			break;
+		}
+
+		read_unlock(&ksocknal_data.ksnd_global_lock);
+
+		/* ran off the end of the list: every conn has been pushed */
+		if (target == NULL)
+			return;
+
+		ksocknal_lib_push_conn (target);
+		ksocknal_conn_decref(target);
+	}
+}
+
+int
+ksocknal_push (lnet_ni_t *ni, lnet_process_id_t id)
+{
+	/* Push the connections of every peer matching 'id'; LNET_NID_ANY
+	 * and LNET_PID_ANY act as wildcards.  'ni' is not used to filter
+	 * peers.
+	 * Returns 0 if at least one peer matched, -ENOENT otherwise. */
+	ksock_peer_t      *peer;
+	ksock_peer_t      *cand;
+	struct list_head	*tmp;
+	int		index;
+	int		i;
+	int		j;
+	int		rc = -ENOENT;
+
+	for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) {
+		/* The global lock is dropped before each push, so every
+		 * pass rescans the bucket for its j'th matching peer and
+		 * pins it with a ref before unlocking. */
+		for (j = 0; ; j++) {
+			read_lock(&ksocknal_data.ksnd_global_lock);
+
+			index = 0;
+			peer = NULL;
+
+			list_for_each (tmp, &ksocknal_data.ksnd_peers[i]) {
+				cand = list_entry(tmp, ksock_peer_t,
+						  ksnp_list);
+
+				if (!((id.nid == LNET_NID_ANY ||
+				       id.nid == cand->ksnp_id.nid) &&
+				      (id.pid == LNET_PID_ANY ||
+				       id.pid == cand->ksnp_id.pid)))
+					continue;
+
+				/* 'peer' is only set together with its ref,
+				 * so a non-NULL 'peer' below is always one
+				 * that we hold a reference on */
+				if (index++ == j) {
+					peer = cand;
+					ksocknal_peer_addref(peer);
+					break;
+				}
+			}
+
+			read_unlock(&ksocknal_data.ksnd_global_lock);
+
+			/* fewer than j + 1 matches: this bucket is done */
+			if (peer == NULL)
+				break;
+
+			rc = 0;
+			ksocknal_push_peer (peer);
+			ksocknal_peer_decref(peer);
+		}
+	}
+
+	return (rc);
+}
+
+int
+ksocknal_add_interface(lnet_ni_t *ni, __u32 ipaddress, __u32 netmask)
+{
+	/* Register a local interface with this NI.
+	 * Returns 0 on success or when the address is already registered,
+	 * -EINVAL for a zero address or netmask, and -ENOSPC when the NI
+	 * already has LNET_MAX_INTERFACES interfaces. */
+	ksock_net_t		*net = ni->ni_data;
+	ksock_interface_t	*slot;
+	ksock_peer_t		*peer;
+	ksock_route_t		*route;
+	struct list_head	*ppos;
+	struct list_head	*rpos;
+	int			 bucket;
+	int			 k;
+	int			 rc = 0;
+
+	if (ipaddress == 0 || netmask == 0)
+		return -EINVAL;
+
+	write_lock_bh(&ksocknal_data.ksnd_global_lock);
+
+	/* silently ignore dups */
+	if (ksocknal_ip2iface(ni, ipaddress) != NULL)
+		goto out;
+
+	if (net->ksnn_ninterfaces == LNET_MAX_INTERFACES) {
+		rc = -ENOSPC;
+		goto out;
+	}
+
+	slot = &net->ksnn_interfaces[net->ksnn_ninterfaces++];
+
+	slot->ksni_ipaddr = ipaddress;
+	slot->ksni_netmask = netmask;
+	slot->ksni_nroutes = 0;
+	slot->ksni_npeers = 0;
+
+	/* Seed the usage counters from the peers that already refer to this
+	 * address, either as a passive IP or as a route's local address. */
+	for (bucket = 0; bucket < ksocknal_data.ksnd_peer_hash_size; bucket++) {
+		list_for_each(ppos, &ksocknal_data.ksnd_peers[bucket]) {
+			peer = list_entry(ppos, ksock_peer_t, ksnp_list);
+
+			for (k = 0; k < peer->ksnp_n_passive_ips; k++) {
+				if (peer->ksnp_passive_ips[k] == ipaddress)
+					slot->ksni_npeers++;
+			}
+
+			list_for_each(rpos, &peer->ksnp_routes) {
+				route = list_entry(rpos, ksock_route_t,
+						   ksnr_list);
+
+				if (route->ksnr_myipaddr == ipaddress)
+					slot->ksni_nroutes++;
+			}
+		}
+	}
+
+	/* NB only new connections will pay attention to the new interface! */
+
+ out:
+	write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+	return rc;
+}
+
+void
+ksocknal_peer_del_interface_locked(ksock_peer_t *peer, __u32 ipaddr)
+{
+	/* Strip every use of local address 'ipaddr' from this peer: its
+	 * passive IP entry, the routes bound to it and the connections
+	 * using it. */
+	struct list_head	*pos;
+	struct list_head	*safe;
+	ksock_route_t		*route;
+	ksock_conn_t		*conn;
+	int			 hit;
+	int			 k;
+
+	/* drop the first matching passive IP and close the gap it leaves */
+	for (hit = 0; hit < peer->ksnp_n_passive_ips; hit++) {
+		if (peer->ksnp_passive_ips[hit] != ipaddr)
+			continue;
+
+		for (k = hit; k + 1 < peer->ksnp_n_passive_ips; k++)
+			peer->ksnp_passive_ips[k] =
+				peer->ksnp_passive_ips[k + 1];
+		peer->ksnp_n_passive_ips--;
+		break;
+	}
+
+	list_for_each_safe(pos, safe, &peer->ksnp_routes) {
+		route = list_entry (pos, ksock_route_t, ksnr_list);
+
+		if (route->ksnr_myipaddr != ipaddr)
+			continue;
+
+		if (route->ksnr_share_count == 0) {
+			ksocknal_del_route_locked(route);
+			continue;
+		}
+
+		/* Manually created; keep, but unbind */
+		route->ksnr_myipaddr = 0;
+	}
+
+	list_for_each_safe(pos, safe, &peer->ksnp_conns) {
+		conn = list_entry(pos, ksock_conn_t, ksnc_list);
+
+		if (conn->ksnc_myipaddr != ipaddr)
+			continue;
+
+		ksocknal_close_conn_locked (conn, 0);
+	}
+}
+
+int
+ksocknal_del_interface(lnet_ni_t *ni, __u32 ipaddress)
+{
+	/* Remove the interface with address 'ipaddress' from this NI, or
+	 * every interface when 'ipaddress' is 0.  For each interface
+	 * removed, every peer on this NI is stripped of its uses of that
+	 * address.
+	 * Returns 0 if at least one interface was removed, else -ENOENT. */
+	ksock_net_t       *net = ni->ni_data;
+	int		rc = -ENOENT;
+	struct list_head	*tmp;
+	struct list_head	*nxt;
+	ksock_peer_t      *peer;
+	__u32	      this_ip;
+	int		i;
+	int		j;
+
+	write_lock_bh(&ksocknal_data.ksnd_global_lock);
+
+	i = 0;
+	while (i < net->ksnn_ninterfaces) {
+		this_ip = net->ksnn_interfaces[i].ksni_ipaddr;
+
+		if (!(ipaddress == 0 ||
+		      ipaddress == this_ip)) {
+			i++;
+			continue;
+		}
+
+		rc = 0;
+
+		/* close the gap left by the removed entry */
+		for (j = i+1; j < net->ksnn_ninterfaces; j++)
+			net->ksnn_interfaces[j-1] =
+				net->ksnn_interfaces[j];
+
+		net->ksnn_ninterfaces--;
+
+		for (j = 0; j < ksocknal_data.ksnd_peer_hash_size; j++) {
+			list_for_each_safe(tmp, nxt,
+					       &ksocknal_data.ksnd_peers[j]) {
+				peer = list_entry(tmp, ksock_peer_t,
+						      ksnp_list);
+
+				if (peer->ksnp_ni != ni)
+					continue;
+
+				ksocknal_peer_del_interface_locked(peer, this_ip);
+			}
+		}
+
+		/* NB 'i' is deliberately not advanced: slot i now holds the
+		 * entry that followed the removed one, and it must be
+		 * examined too or a wildcard delete would skip it */
+	}
+
+	write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+	return (rc);
+}
+
+int
+ksocknal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg)
+{	/* Dispatch control command 'cmd' for this NI.  'arg' is used as a
+	 * struct libcfs_ioctl_data carrying both the inputs and, for the
+	 * GET_* commands, the results.
+	 * Returns 0 or a negative errno from the handler; commands that are
+	 * not recognised return -EINVAL. */
+	lnet_process_id_t id = {0};
+	struct libcfs_ioctl_data *data = arg;
+	int rc;
+
+	switch(cmd) {
+	case IOC_LIBCFS_GET_INTERFACE: { /* ioc_count selects the interface by index */
+		ksock_net_t       *net = ni->ni_data;
+		ksock_interface_t *iface;
+
+		read_lock(&ksocknal_data.ksnd_global_lock);
+
+		if (data->ioc_count >= (__u32)net->ksnn_ninterfaces) {
+			rc = -ENOENT; /* index past the last interface */
+		} else {
+			rc = 0;
+			iface = &net->ksnn_interfaces[data->ioc_count];
+
+			data->ioc_u32[0] = iface->ksni_ipaddr;
+			data->ioc_u32[1] = iface->ksni_netmask;
+			data->ioc_u32[2] = iface->ksni_npeers;
+			data->ioc_u32[3] = iface->ksni_nroutes;
+		}
+
+		read_unlock(&ksocknal_data.ksnd_global_lock);
+		return rc;
+	}
+
+	case IOC_LIBCFS_ADD_INTERFACE:
+		return ksocknal_add_interface(ni,
+					      data->ioc_u32[0], /* IP address */
+					      data->ioc_u32[1]); /* net mask */
+
+	case IOC_LIBCFS_DEL_INTERFACE:
+		return ksocknal_del_interface(ni,
+					      data->ioc_u32[0]); /* IP address */
+
+	case IOC_LIBCFS_GET_PEER: { /* ioc_count is the lookup index on input */
+		__u32	    myip = 0;
+		__u32	    ip = 0;
+		int	      port = 0;
+		int	      conn_count = 0;
+		int	      share_count = 0;
+
+		rc = ksocknal_get_peer_info(ni, data->ioc_count,
+					    &id, &myip, &ip, &port,
+					    &conn_count,  &share_count);
+		if (rc != 0)
+			return rc;
+
+		data->ioc_nid    = id.nid;
+		data->ioc_count  = share_count; /* NB ioc_count is reused as an output */
+		data->ioc_u32[0] = ip;
+		data->ioc_u32[1] = port;
+		data->ioc_u32[2] = myip;
+		data->ioc_u32[3] = conn_count;
+		data->ioc_u32[4] = id.pid;
+		return 0;
+	}
+
+	case IOC_LIBCFS_ADD_PEER: /* peers are always added with the server pid */
+		id.nid = data->ioc_nid;
+		id.pid = LUSTRE_SRV_LNET_PID;
+		return ksocknal_add_peer (ni, id,
+					  data->ioc_u32[0], /* IP */
+					  data->ioc_u32[1]); /* port */
+
+	case IOC_LIBCFS_DEL_PEER: /* any pid on the given NID */
+		id.nid = data->ioc_nid;
+		id.pid = LNET_PID_ANY;
+		return ksocknal_del_peer (ni, id,
+					  data->ioc_u32[0]); /* IP */
+
+	case IOC_LIBCFS_GET_CONN: { /* ioc_count is the conn index on input */
+		int	   txmem;
+		int	   rxmem;
+		int	   nagle;
+		ksock_conn_t *conn = ksocknal_get_conn_by_idx (ni, data->ioc_count);
+
+		if (conn == NULL)
+			return -ENOENT;
+
+		ksocknal_lib_get_conn_tunables(conn, &txmem, &rxmem, &nagle);
+
+		data->ioc_count  = txmem; /* NB ioc_count is reused as an output */
+		data->ioc_nid    = conn->ksnc_peer->ksnp_id.nid;
+		data->ioc_flags  = nagle;
+		data->ioc_u32[0] = conn->ksnc_ipaddr;
+		data->ioc_u32[1] = conn->ksnc_port;
+		data->ioc_u32[2] = conn->ksnc_myipaddr;
+		data->ioc_u32[3] = conn->ksnc_type;
+		data->ioc_u32[4] = conn->ksnc_scheduler->kss_info->ksi_cpt;
+		data->ioc_u32[5] = rxmem;
+		data->ioc_u32[6] = conn->ksnc_peer->ksnp_id.pid;
+		ksocknal_conn_decref(conn); /* balances a ref presumably taken by the lookup - TODO confirm */
+		return 0;
+	}
+
+	case IOC_LIBCFS_CLOSE_CONNECTION: /* any pid; ioc_u32[0] is the remote IP (0 == all) */
+		id.nid = data->ioc_nid;
+		id.pid = LNET_PID_ANY;
+		return ksocknal_close_matching_conns (id,
+						      data->ioc_u32[0]);
+
+	case IOC_LIBCFS_REGISTER_MYNID:
+		/* Ignore if this is a noop */
+		if (data->ioc_nid == ni->ni_nid)
+			return 0;
+
+		CERROR("obsolete IOC_LIBCFS_REGISTER_MYNID: %s(%s)\n",
+		       libcfs_nid2str(data->ioc_nid),
+		       libcfs_nid2str(ni->ni_nid));
+		return -EINVAL;
+
+	case IOC_LIBCFS_PUSH_CONNECTION: /* any pid on the given NID */
+		id.nid = data->ioc_nid;
+		id.pid = LNET_PID_ANY;
+		return ksocknal_push(ni, id);
+
+	default:
+		return -EINVAL;
+	}
+	/* not reached */
+}
+
+void
+ksocknal_free_buffers (void)
+{
+	/* Release the module-wide allocations: the per-CPT scheduler
+	 * arrays, the peer hash table and any idle noop TX descriptors.
+	 * No TX may still be active (asserted). */
+	LIST_HEAD	(idle);
+	ksock_tx_t	*tx;
+
+	LASSERT (atomic_read(&ksocknal_data.ksnd_nactive_txs) == 0);
+
+	if (ksocknal_data.ksnd_sched_info != NULL) {
+		struct ksock_sched_info	*info;
+		int			cpt;
+
+		cfs_percpt_for_each(info, cpt, ksocknal_data.ksnd_sched_info) {
+			if (info->ksi_scheds == NULL)
+				continue;
+
+			LIBCFS_FREE(info->ksi_scheds,
+				    info->ksi_nthreads_max *
+				    sizeof(info->ksi_scheds[0]));
+		}
+		cfs_percpt_free(ksocknal_data.ksnd_sched_info);
+	}
+
+	LIBCFS_FREE (ksocknal_data.ksnd_peers,
+		     sizeof (struct list_head) *
+		     ksocknal_data.ksnd_peer_hash_size);
+
+	/* Detach the whole idle list under the lock, then free its entries
+	 * once the lock has been dropped. */
+	spin_lock(&ksocknal_data.ksnd_tx_lock);
+	list_splice_init(&ksocknal_data.ksnd_idle_noop_txs, &idle);
+	spin_unlock(&ksocknal_data.ksnd_tx_lock);
+
+	while (!list_empty(&idle)) {
+		tx = list_entry(idle.next, ksock_tx_t, tx_list);
+		list_del(&tx->tx_list);
+		LIBCFS_FREE(tx, tx->tx_desc_size);
+	}
+}
+
+void
+ksocknal_base_shutdown(void)
+{	/* Tear down the module-wide socknal state.  No nets may remain
+	 * (asserted).  In order: assert that every peer, conn and scheduler
+	 * list is empty, flag shutdown and wake all waiters, poll once a
+	 * second until ksnd_nthreads reaches zero, free the buffers and
+	 * reset ksnd_init.  Finally a module reference is dropped. */
+	struct ksock_sched_info *info;
+	ksock_sched_t		*sched;
+	int			i;
+	int			j;
+
+	CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n",
+	       atomic_read (&libcfs_kmemory));
+	LASSERT (ksocknal_data.ksnd_nnets == 0);
+
+	switch (ksocknal_data.ksnd_init) {
+	default: /* unknown init state; there is no break before the cases below */
+		LASSERT (0);
+
+	case SOCKNAL_INIT_ALL:
+	case SOCKNAL_INIT_DATA: /* both states share the same teardown */
+		LASSERT (ksocknal_data.ksnd_peers != NULL);
+		for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) {
+			LASSERT (list_empty (&ksocknal_data.ksnd_peers[i]));
+		}
+
+		LASSERT(list_empty(&ksocknal_data.ksnd_nets));
+		LASSERT (list_empty (&ksocknal_data.ksnd_enomem_conns));
+		LASSERT (list_empty (&ksocknal_data.ksnd_zombie_conns));
+		LASSERT (list_empty (&ksocknal_data.ksnd_connd_connreqs));
+		LASSERT (list_empty (&ksocknal_data.ksnd_connd_routes));
+
+		if (ksocknal_data.ksnd_sched_info != NULL) { /* may be NULL if startup failed early */
+			cfs_percpt_for_each(info, i,
+					    ksocknal_data.ksnd_sched_info) {
+				if (info->ksi_scheds == NULL)
+					continue;
+
+				for (j = 0; j < info->ksi_nthreads_max; j++) {
+
+					sched = &info->ksi_scheds[j];
+					LASSERT(list_empty(&sched->\
+							       kss_tx_conns));
+					LASSERT(list_empty(&sched->\
+							       kss_rx_conns));
+					LASSERT(list_empty(&sched-> \
+						  kss_zombie_noop_txs));
+					LASSERT(sched->kss_nconns == 0);
+				}
+			}
+		}
+
+		/* flag threads to terminate; wake and wait for them to die */
+		ksocknal_data.ksnd_shuttingdown = 1;
+		wake_up_all(&ksocknal_data.ksnd_connd_waitq);
+		wake_up_all(&ksocknal_data.ksnd_reaper_waitq);
+
+		if (ksocknal_data.ksnd_sched_info != NULL) {
+			cfs_percpt_for_each(info, i,
+					    ksocknal_data.ksnd_sched_info) {
+				if (info->ksi_scheds == NULL)
+					continue;
+
+				for (j = 0; j < info->ksi_nthreads_max; j++) {
+					sched = &info->ksi_scheds[j];
+					wake_up_all(&sched->kss_waitq);
+				}
+			}
+		}
+
+		i = 4; /* counter starts at 4, so the first power of 2 (D_WARNING) is 8, on the fourth pass */
+		read_lock(&ksocknal_data.ksnd_global_lock);
+		while (ksocknal_data.ksnd_nthreads != 0) {
+			i++;
+			CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
+			       "waiting for %d threads to terminate\n",
+				ksocknal_data.ksnd_nthreads);
+			read_unlock(&ksocknal_data.ksnd_global_lock); /* the lock is not held across the pause */
+			cfs_pause(cfs_time_seconds(1));
+			read_lock(&ksocknal_data.ksnd_global_lock);
+		}
+		read_unlock(&ksocknal_data.ksnd_global_lock);
+
+		ksocknal_free_buffers();
+
+		ksocknal_data.ksnd_init = SOCKNAL_INIT_NOTHING;
+		break;
+	}
+
+	CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n",
+	       atomic_read (&libcfs_kmemory));
+
+	module_put(THIS_MODULE);
+}
+
+__u64
+ksocknal_new_incarnation (void)
+{
+	/* The incarnation number is the current time of day expressed in
+	 * microseconds.  It is taken when this module loads and identifies
+	 * this particular instance of the socknal.  Hopefully we won't be
+	 * able to reboot more frequently than 1MHz for the foreseeable
+	 * future :) */
+	struct timeval	now;
+	__u64		usecs;
+
+	do_gettimeofday(&now);
+
+	usecs = (__u64)now.tv_sec;
+	usecs *= 1000000;
+	usecs += now.tv_usec;
+
+	return usecs;
+}
+
+int
+ksocknal_base_startup(void)
+{
+	struct ksock_sched_info	*info;
+	int			rc;
+	int			i;
+
+	LASSERT (ksocknal_data.ksnd_init == SOCKNAL_INIT_NOTHING);
+	LASSERT (ksocknal_data.ksnd_nnets == 0);
+
+	memset (&ksocknal_data, 0, sizeof (ksocknal_data)); /* zero pointers */
+
+	ksocknal_data.ksnd_peer_hash_size = SOCKNAL_PEER_HASH_SIZE;
+	LIBCFS_ALLOC (ksocknal_data.ksnd_peers,
+		      sizeof (struct list_head) *
+		      ksocknal_data.ksnd_peer_hash_size);
+	if (ksocknal_data.ksnd_peers == NULL)
+		return -ENOMEM;
+
+	for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++)
+		INIT_LIST_HEAD(&ksocknal_data.ksnd_peers[i]);
+
+	rwlock_init(&ksocknal_data.ksnd_global_lock);
+	INIT_LIST_HEAD(&ksocknal_data.ksnd_nets);
+
+	spin_lock_init(&ksocknal_data.ksnd_reaper_lock);
+	INIT_LIST_HEAD (&ksocknal_data.ksnd_enomem_conns);
+	INIT_LIST_HEAD (&ksocknal_data.ksnd_zombie_conns);
+	INIT_LIST_HEAD (&ksocknal_data.ksnd_deathrow_conns);
+	init_waitqueue_head(&ksocknal_data.ksnd_reaper_waitq);
+
+	spin_lock_init(&ksocknal_data.ksnd_connd_lock);
+	INIT_LIST_HEAD (&ksocknal_data.ksnd_connd_connreqs);
+	INIT_LIST_HEAD (&ksocknal_data.ksnd_connd_routes);
+	init_waitqueue_head(&ksocknal_data.ksnd_connd_waitq);
+
+	spin_lock_init(&ksocknal_data.ksnd_tx_lock);
+	INIT_LIST_HEAD (&ksocknal_data.ksnd_idle_noop_txs);
+
+	/* NB memset above zeros whole of ksocknal_data */
+
+	/* flag lists/ptrs/locks initialised */
+	ksocknal_data.ksnd_init = SOCKNAL_INIT_DATA;
+	try_module_get(THIS_MODULE);
+
+	ksocknal_data.ksnd_sched_info = cfs_percpt_alloc(lnet_cpt_table(),
+							 sizeof(*info));
+	if (ksocknal_data.ksnd_sched_info == NULL)
+		goto failed;
+
+	cfs_percpt_for_each(info, i, ksocknal_data.ksnd_sched_info) {
+		ksock_sched_t	*sched;
+		int		nthrs;
+
+		nthrs = cfs_cpt_weight(lnet_cpt_table(), i);
+		if (*ksocknal_tunables.ksnd_nscheds > 0) {
+			nthrs = min(nthrs, *ksocknal_tunables.ksnd_nscheds);
+		} else {
+			/* max to half of CPUs, assume another half should be
+			 * reserved for upper layer modules */
+			nthrs = min(max(SOCKNAL_NSCHEDS, nthrs >> 1), nthrs);
+		}
+
+		info->ksi_nthreads_max = nthrs;
+		info->ksi_cpt = i;
+
+		LIBCFS_CPT_ALLOC(info->ksi_scheds, lnet_cpt_table(), i,
+				 info->ksi_nthreads_max * sizeof(*sched));
+		if (info->ksi_scheds == NULL)
+			goto failed;
+
+		for (; nthrs > 0; nthrs--) {
+			sched = &info->ksi_scheds[nthrs - 1];
+
+			sched->kss_info = info;
+			spin_lock_init(&sched->kss_lock);
+			INIT_LIST_HEAD(&sched->kss_rx_conns);
+			INIT_LIST_HEAD(&sched->kss_tx_conns);
+			INIT_LIST_HEAD(&sched->kss_zombie_noop_txs);
+			init_waitqueue_head(&sched->kss_waitq);
+		}
+	}
+
+	ksocknal_data.ksnd_connd_starting	 = 0;
+	ksocknal_data.ksnd_connd_failed_stamp     = 0;
+	ksocknal_data.ksnd_connd_starting_stamp   = cfs_time_current_sec();
+	/* must have at least 2 connds to remain responsive to accepts while
+	 * connecting */
+	if (*ksocknal_tunables.ksnd_nconnds < SOCKNAL_CONND_RESV + 1)
+		*ksocknal_tunables.ksnd_nconnds = SOCKNAL_CONND_RESV + 1;
+
+	if (*ksocknal_tunables.ksnd_nconnds_max <
+	    *ksocknal_tunables.ksnd_nconnds) {
+		ksocknal_tunables.ksnd_nconnds_max =
+			ksocknal_tunables.ksnd_nconnds;
+	}
+
+	for (i = 0; i < *ksocknal_tunables.ksnd_nconnds; i++) {
+		char name[16];
+		spin_lock_bh(&ksocknal_data.ksnd_connd_lock);
+		ksocknal_data.ksnd_connd_starting++;
+		spin_unlock_bh(&ksocknal_data.ksnd_connd_lock);
+
+
+		snprintf(name, sizeof(name), "socknal_cd%02d", i);
+		rc = ksocknal_thread_start(ksocknal_connd,
+					   (void *)((ulong_ptr_t)i), name);
+		if (rc != 0) {
+			spin_lock_bh(&ksocknal_data.ksnd_connd_lock);
+			ksocknal_data.ksnd_connd_starting--;
+			spin_unlock_bh(&ksocknal_data.ksnd_connd_lock);
+			CERROR("Can't spawn socknal connd: %d\n", rc);
+			goto failed;
+		}
+	}
+
+	rc = ksocknal_thread_start(ksocknal_reaper, NULL, "socknal_reaper");
+	if (rc != 0) {
+		CERROR ("Can't spawn socknal reaper: %d\n", rc);
+		goto failed;
+	}
+
+	/* flag everything initialised */
+	ksocknal_data.ksnd_init = SOCKNAL_INIT_ALL;
+
+	return 0;
+
+ failed:
+	ksocknal_base_shutdown();
+	return -ENETDOWN;
+}
+
+/*
+ * Shutdown debugging aid: look through the peer hash table for a peer that
+ * still belongs to @ni and, if one is found, log its state followed by the
+ * state of each of its routes and connections.
+ *
+ * The scan and the dump both run with ksnd_global_lock held for reading.
+ */
+void
+ksocknal_debug_peerhash (lnet_ni_t *ni)
+{
+	ksock_peer_t	*peer = NULL;
+	struct list_head	*tmp;
+	int		i;
+
+	read_lock(&ksocknal_data.ksnd_global_lock);
+
+	for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) {
+		list_for_each (tmp, &ksocknal_data.ksnd_peers[i]) {
+			peer = list_entry (tmp, ksock_peer_t, ksnp_list);
+
+			if (peer->ksnp_ni == ni)
+				break;
+
+			peer = NULL;
+		}
+
+		/* the break above only leaves the bucket walk; stop scanning
+		 * the table as well, otherwise the next non-empty bucket
+		 * overwrites (and usually clears) the match */
+		if (peer != NULL)
+			break;
+	}
+
+	if (peer != NULL) {
+		ksock_route_t *route;
+		ksock_conn_t  *conn;
+
+		CWARN ("Active peer on shutdown: %s, ref %d, scnt %d, "
+		       "closing %d, accepting %d, err %d, zcookie "LPU64", "
+		       "txq %d, zc_req %d\n", libcfs_id2str(peer->ksnp_id),
+		       atomic_read(&peer->ksnp_refcount),
+		       peer->ksnp_sharecount, peer->ksnp_closing,
+		       peer->ksnp_accepting, peer->ksnp_error,
+		       peer->ksnp_zc_next_cookie,
+		       !list_empty(&peer->ksnp_tx_queue),
+		       !list_empty(&peer->ksnp_zc_req_list));
+
+		list_for_each (tmp, &peer->ksnp_routes) {
+			route = list_entry(tmp, ksock_route_t, ksnr_list);
+			CWARN ("Route: ref %d, schd %d, conn %d, cnted %d, "
+			       "del %d\n", atomic_read(&route->ksnr_refcount),
+			       route->ksnr_scheduled, route->ksnr_connecting,
+			       route->ksnr_connected, route->ksnr_deleted);
+		}
+
+		list_for_each (tmp, &peer->ksnp_conns) {
+			conn = list_entry(tmp, ksock_conn_t, ksnc_list);
+			CWARN ("Conn: ref %d, sref %d, t %d, c %d\n",
+			       atomic_read(&conn->ksnc_conn_refcount),
+			       atomic_read(&conn->ksnc_sock_refcount),
+			       conn->ksnc_type, conn->ksnc_closing);
+		}
+	}
+
+	read_unlock(&ksocknal_data.ksnd_global_lock);
+}
+
+/*
+ * Shut down the network instance attached to @ni.
+ *
+ * Marks the net as shutting down, deletes every peer, waits (polling once a
+ * second) until the net's peer count drops to zero, then unlinks and frees
+ * the net.  When the last net goes away the shared base state is torn down
+ * via ksocknal_base_shutdown().
+ */
+void
+ksocknal_shutdown (lnet_ni_t *ni)
+{
+	ksock_net_t      *net = ni->ni_data;
+	int	       i;
+	lnet_process_id_t anyid = {0};
+
+	/* wildcard id: matches every peer in ksocknal_del_peer() below */
+	anyid.nid =  LNET_NID_ANY;
+	anyid.pid =  LNET_PID_ANY;
+
+	LASSERT(ksocknal_data.ksnd_init == SOCKNAL_INIT_ALL);
+	LASSERT(ksocknal_data.ksnd_nnets > 0);
+
+	spin_lock_bh(&net->ksnn_lock);
+	net->ksnn_shutdown = 1;		 /* prevent new peers */
+	spin_unlock_bh(&net->ksnn_lock);
+
+	/* Delete all peers */
+	ksocknal_del_peer(ni, anyid, 0);
+
+	/* Wait for all peer state to clean up */
+	/* i counts polling passes; starting at 2 makes the first pass (i == 3)
+	 * log at D_NET and the second (i == 4) the first D_WARNING */
+	i = 2;
+	spin_lock_bh(&net->ksnn_lock);
+	while (net->ksnn_npeers != 0) {
+		/* the lock is dropped while logging and sleeping and retaken
+		 * before the loop condition is evaluated again */
+		spin_unlock_bh(&net->ksnn_lock);
+
+		i++;
+		CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
+		       "waiting for %d peers to disconnect\n",
+		       net->ksnn_npeers);
+		cfs_pause(cfs_time_seconds(1));
+
+		ksocknal_debug_peerhash(ni);
+
+		spin_lock_bh(&net->ksnn_lock);
+	}
+	spin_unlock_bh(&net->ksnn_lock);
+
+	/* with no peers left, no interface may still be referenced */
+	for (i = 0; i < net->ksnn_ninterfaces; i++) {
+		LASSERT (net->ksnn_interfaces[i].ksni_npeers == 0);
+		LASSERT (net->ksnn_interfaces[i].ksni_nroutes == 0);
+	}
+
+	list_del(&net->ksnn_list);
+	LIBCFS_FREE(net, sizeof(*net));
+
+	ksocknal_data.ksnd_nnets--;
+	if (ksocknal_data.ksnd_nnets == 0)
+		ksocknal_base_shutdown();
+}
+
+/*
+ * Fill @net->ksnn_interfaces[] from the interfaces reported by
+ * libcfs_ipif_enumerate().
+ *
+ * The loopback interface ("lo"), interfaces that cannot be queried and
+ * interfaces that are down are skipped; at most LNET_MAX_INTERFACES entries
+ * are recorded.
+ *
+ * Returns the number of interfaces recorded (0 if none was usable), or the
+ * result of libcfs_ipif_enumerate() when that is <= 0.
+ */
+int
+ksocknal_enumerate_interfaces(ksock_net_t *net)
+{
+	char      **names;
+	int	 i;
+	int	 j;
+	int	 rc;
+	int	 n;
+
+	n = libcfs_ipif_enumerate(&names);
+	if (n <= 0) {
+		CERROR("Can't enumerate interfaces: %d\n", n);
+		return n;
+	}
+
+	/* i walks the enumerated names, j the slots filled in @net */
+	for (i = j = 0; i < n; i++) {
+		int	up;
+		__u32      ip;
+		__u32      mask;
+
+		if (!strcmp(names[i], "lo")) /* skip the loopback IF */
+			continue;
+
+		rc = libcfs_ipif_query(names[i], &up, &ip, &mask);
+		if (rc != 0) {
+			CWARN("Can't get interface %s info: %d\n",
+			      names[i], rc);
+			continue;
+		}
+
+		if (!up) {
+			CWARN("Ignoring interface %s (down)\n",
+			      names[i]);
+			continue;
+		}
+
+		if (j == LNET_MAX_INTERFACES) {
+			CWARN("Ignoring interface %s (too many interfaces)\n",
+			      names[i]);
+			continue;
+		}
+
+		net->ksnn_interfaces[j].ksni_ipaddr = ip;
+		net->ksnn_interfaces[j].ksni_netmask = mask;
+		strncpy(&net->ksnn_interfaces[j].ksni_name[0],
+			names[i], IFNAMSIZ);
+		/* strncpy() leaves the buffer unterminated when the source
+		 * is IFNAMSIZ bytes or longer; the name is later handed to
+		 * strchr()/strcmp(), so force termination */
+		net->ksnn_interfaces[j].ksni_name[IFNAMSIZ - 1] = '\0';
+		j++;
+	}
+
+	libcfs_ipif_free_enumeration(names, n);
+
+	if (j == 0)
+		CERROR("Can't find any usable interfaces\n");
+
+	return j;
+}
+
+/*
+ * Count the interfaces of @net that no net on ksocknal_data.ksnd_nets
+ * already uses.
+ *
+ * Names are compared with any ":alias" suffix ignored: the ':' is
+ * overwritten with a NUL for the duration of the comparison and put back
+ * afterwards, so the stored names are unchanged on return.
+ */
+int
+ksocknal_search_new_ipif(ksock_net_t *net)
+{
+	int	nnew = 0;
+	int	idx;
+
+	for (idx = 0; idx < net->ksnn_ninterfaces; idx++) {
+		char		*name = net->ksnn_interfaces[idx].ksni_name;
+		char		*alias = strchr(name, ':');
+		ksock_net_t	*other;
+		int		matched = 0;
+
+		/* compare on the base device name only */
+		if (alias != NULL)
+			*alias = 0;
+
+		list_for_each_entry(other, &ksocknal_data.ksnd_nets,
+					ksnn_list) {
+			int	k = 0;
+
+			while (!matched && k < other->ksnn_ninterfaces) {
+				char *name2 =
+					other->ksnn_interfaces[k].ksni_name;
+				char *alias2 = strchr(name2, ':');
+
+				if (alias2 != NULL)
+					*alias2 = 0;
+
+				matched = (strcmp(name, name2) == 0);
+
+				if (alias2 != NULL)
+					*alias2 = ':';
+				k++;
+			}
+
+			if (matched)
+				break;
+		}
+
+		/* restore the alias separator removed above */
+		if (alias != NULL)
+			*alias = ':';
+
+		if (!matched)
+			nnew++;
+	}
+
+	return nnew;
+}
+
+/*
+ * Start scheduler threads for the pool described by @info.
+ *
+ * An empty pool gets its initial set of threads; a pool that is already
+ * running is grown by at most two threads.  Either way the total never
+ * exceeds info->ksi_nthreads_max.  info->ksi_nthreads is advanced by the
+ * number of threads actually started.
+ *
+ * Returns 0 on success or the error from ksocknal_thread_start().
+ */
+int
+ksocknal_start_schedulers(struct ksock_sched_info *info)
+{
+	int	nstart;
+	int	err = 0;
+	int	n;
+
+	if (info->ksi_nthreads != 0) {
+		LASSERT(info->ksi_nthreads <= info->ksi_nthreads_max);
+		/* pool already populated: add up to two more threads */
+		nstart = min(2, info->ksi_nthreads_max - info->ksi_nthreads);
+	} else {
+		if (*ksocknal_tunables.ksnd_nscheds > 0) {
+			nstart = info->ksi_nthreads_max;
+		} else {
+			nstart = cfs_cpt_weight(lnet_cpt_table(),
+						info->ksi_cpt);
+			nstart = min(max(SOCKNAL_NSCHEDS, nstart >> 1), nstart);
+			nstart = min(SOCKNAL_NSCHEDS_HIGH, nstart);
+		}
+		nstart = min(nstart, info->ksi_nthreads_max);
+	}
+
+	for (n = 0; n < nstart; n++) {
+		char		name[20];
+		long		id = KSOCK_THREAD_ID(info->ksi_cpt,
+						     info->ksi_nthreads + n);
+		ksock_sched_t	*sched = &info->ksi_scheds[KSOCK_THREAD_SID(id)];
+
+		snprintf(name, sizeof(name), "socknal_sd%02d_%02d",
+			 info->ksi_cpt, (int)(sched - &info->ksi_scheds[0]));
+
+		err = ksocknal_thread_start(ksocknal_scheduler,
+					    (void *)id, name);
+		if (err != 0) {
+			CERROR("Can't spawn thread %d for scheduler[%d]: %d\n",
+			       info->ksi_cpt, info->ksi_nthreads + n, err);
+			break;
+		}
+	}
+
+	/* n is the number of threads that did start */
+	info->ksi_nthreads += n;
+	return err;
+}
+
+/*
+ * Make sure scheduler threads exist for every CPT that @net uses.
+ *
+ * @cpts lists @ncpts CPT ids; a NULL @cpts means CPTs 0 .. @ncpts - 1.
+ * A pool is (re)started when it has no threads yet, or when @net brings an
+ * interface that no existing net uses.
+ *
+ * Returns 0, or the first error from ksocknal_start_schedulers().
+ */
+int
+ksocknal_net_start_threads(ksock_net_t *net, __u32 *cpts, int ncpts)
+{
+	int	has_new_if = ksocknal_search_new_ipif(net);
+	int	idx;
+
+	LASSERT(ncpts > 0 && ncpts <= cfs_cpt_number(lnet_cpt_table()));
+
+	for (idx = 0; idx < ncpts; idx++) {
+		struct ksock_sched_info	*info;
+		int	cpt;
+		int	rc;
+
+		if (cpts == NULL)
+			cpt = idx;
+		else
+			cpt = cpts[idx];
+
+		LASSERT(cpt < cfs_cpt_number(lnet_cpt_table()));
+		info = ksocknal_data.ksnd_sched_info[cpt];
+
+		if (has_new_if || info->ksi_nthreads <= 0) {
+			rc = ksocknal_start_schedulers(info);
+			if (rc != 0)
+				return rc;
+		}
+	}
+	return 0;
+}
+
+/*
+ * LND startup callback: bring up the socket network instance for @ni.
+ *
+ * Initialises the shared base state on first use, allocates the per-net
+ * state and stores it in ni->ni_data, copies the credit/timeout tunables
+ * into @ni, resolves the interfaces to use, starts scheduler threads and
+ * finally links the net on ksocknal_data.ksnd_nets.
+ *
+ * If @ni names no interfaces they are enumerated, but only the first one
+ * found is used.  Otherwise every named interface must be queryable and up.
+ *
+ * Returns 0 on success, the error from ksocknal_base_startup() if that
+ * fails, and -ENETDOWN for every other failure.
+ */
+int
+ksocknal_startup (lnet_ni_t *ni)
+{
+	ksock_net_t  *net;
+	int	   rc;
+	int	   i;
+
+	LASSERT (ni->ni_lnd == &the_ksocklnd);
+
+	if (ksocknal_data.ksnd_init == SOCKNAL_INIT_NOTHING) {
+		rc = ksocknal_base_startup();
+		if (rc != 0)
+			return rc;
+	}
+
+	LIBCFS_ALLOC(net, sizeof(*net));
+	if (net == NULL)
+		goto fail_0;
+
+	spin_lock_init(&net->ksnn_lock);
+	net->ksnn_incarnation = ksocknal_new_incarnation();
+	ni->ni_data = net;
+	ni->ni_peertimeout    = *ksocknal_tunables.ksnd_peertimeout;
+	ni->ni_maxtxcredits   = *ksocknal_tunables.ksnd_credits;
+	ni->ni_peertxcredits  = *ksocknal_tunables.ksnd_peertxcredits;
+	ni->ni_peerrtrcredits = *ksocknal_tunables.ksnd_peerrtrcredits;
+
+	if (ni->ni_interfaces[0] == NULL) {
+		rc = ksocknal_enumerate_interfaces(net);
+		if (rc <= 0)
+			goto fail_1;
+
+		/* only the first enumerated interface is used */
+		net->ksnn_ninterfaces = 1;
+	} else {
+		for (i = 0; i < LNET_MAX_INTERFACES; i++) {
+			int    up;
+
+			if (ni->ni_interfaces[i] == NULL)
+				break;
+
+			rc = libcfs_ipif_query(
+				ni->ni_interfaces[i], &up,
+				&net->ksnn_interfaces[i].ksni_ipaddr,
+				&net->ksnn_interfaces[i].ksni_netmask);
+
+			if (rc != 0) {
+				CERROR("Can't get interface %s info: %d\n",
+				       ni->ni_interfaces[i], rc);
+				goto fail_1;
+			}
+
+			if (!up) {
+				CERROR("Interface %s is down\n",
+				       ni->ni_interfaces[i]);
+				goto fail_1;
+			}
+
+			strncpy(&net->ksnn_interfaces[i].ksni_name[0],
+				ni->ni_interfaces[i], IFNAMSIZ);
+			/* strncpy() leaves the buffer unterminated when the
+			 * source is IFNAMSIZ bytes or longer; the name is
+			 * later scanned with strchr()/strcmp() */
+			net->ksnn_interfaces[i].ksni_name[IFNAMSIZ - 1] = '\0';
+		}
+		net->ksnn_ninterfaces = i;
+	}
+
+	/* call it before add it to ksocknal_data.ksnd_nets */
+	rc = ksocknal_net_start_threads(net, ni->ni_cpts, ni->ni_ncpts);
+	if (rc != 0)
+		goto fail_1;
+
+	/* the NID's address part is the first interface's IP address */
+	ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid),
+				net->ksnn_interfaces[0].ksni_ipaddr);
+	list_add(&net->ksnn_list, &ksocknal_data.ksnd_nets);
+
+	ksocknal_data.ksnd_nnets++;
+
+	return 0;
+
+ fail_1:
+	LIBCFS_FREE(net, sizeof(*net));
+ fail_0:
+	/* undo the base startup if no other net is using it */
+	if (ksocknal_data.ksnd_nnets == 0)
+		ksocknal_base_shutdown();
+
+	return -ENETDOWN;
+}
+
+
+/*
+ * Module exit: unregister the_ksocklnd from LNet, then release the
+ * tunables set up by ksocknal_module_init().
+ */
+void __exit
+ksocknal_module_fini (void)
+{
+	lnet_unregister_lnd(&the_ksocklnd);
+	ksocknal_tunables_fini();
+}
+
+/*
+ * Module entry point: fill in the_ksocklnd's type and operation callbacks,
+ * initialise the tunables and register the LND with LNet.
+ *
+ * Returns 0, or the non-zero result of ksocknal_tunables_init(); in that
+ * case the LND is not registered.
+ */
+int __init
+ksocknal_module_init (void)
+{
+	int    rc;
+
+	/* check ksnr_connected/connecting field large enough */
+	CLASSERT (SOCKLND_CONN_NTYPES <= 4);
+	CLASSERT (SOCKLND_CONN_ACK == SOCKLND_CONN_BULK_IN);
+
+	/* initialize the_ksocklnd */
+	the_ksocklnd.lnd_type     = SOCKLND;
+	the_ksocklnd.lnd_startup  = ksocknal_startup;
+	the_ksocklnd.lnd_shutdown = ksocknal_shutdown;
+	the_ksocklnd.lnd_ctl      = ksocknal_ctl;
+	the_ksocklnd.lnd_send     = ksocknal_send;
+	the_ksocklnd.lnd_recv     = ksocknal_recv;
+	the_ksocklnd.lnd_notify   = ksocknal_notify;
+	the_ksocklnd.lnd_query    = ksocknal_query;
+	the_ksocklnd.lnd_accept   = ksocknal_accept;
+
+	rc = ksocknal_tunables_init();
+	if (rc != 0)
+		return rc;
+
+	lnet_register_lnd(&the_ksocklnd);
+
+	return 0;
+}
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Kernel TCP Socket LND v3.0.0");
+MODULE_LICENSE("GPL");
+
+cfs_module(ksocknal, "3.0.0", ksocknal_module_init, ksocknal_module_fini);

diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.h b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.h
new file mode 100644
index 0000000..b483e0c
--- /dev/null
+++ b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.h

@@ -0,0 +1,602 @@
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ *
+ *   Author: Zach Brown <zab@zabbo.net>
+ *   Author: Peter J. Braam <braam@clusterfs.com>
+ *   Author: Phil Schwan <phil@clusterfs.com>
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#define DEBUG_PORTAL_ALLOC
+#define DEBUG_SUBSYSTEM S_LND
+
+#include "socklnd_lib-linux.h"
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/lnet.h>
+#include <linux/lnet/lib-lnet.h>
+#include <linux/lnet/socklnd.h>
+#include <linux/lnet/lnet-sysctl.h>
+
+#define SOCKNAL_PEER_HASH_SIZE  101	     /* # peer lists */
+#define SOCKNAL_RESCHED	 100	     /* # scheduler loops before reschedule */
+#define SOCKNAL_INSANITY_RECONN 5000	    /* connd is trying on reconn infinitely */
+#define SOCKNAL_ENOMEM_RETRY    CFS_TICK	/* jiffies between retries */
+
+#define SOCKNAL_SINGLE_FRAG_TX      0	   /* disable multi-fragment sends */
+#define SOCKNAL_SINGLE_FRAG_RX      0	   /* disable multi-fragment receives */
+
+#define SOCKNAL_VERSION_DEBUG       0	   /* enable protocol version debugging */
+
+/* risk kmap deadlock on multi-frag I/O (backs off to single-frag if disabled).
+ * no risk if we're not running on a CONFIG_HIGHMEM platform. */
+#ifdef CONFIG_HIGHMEM
+# define SOCKNAL_RISK_KMAP_DEADLOCK  0
+#else
+# define SOCKNAL_RISK_KMAP_DEADLOCK  1
+#endif
+
+struct ksock_sched_info;
+
+/*
+ * State of one scheduler: its lock, the queues of connections waiting to
+ * be read or written, the zombie noop tx list and the waitqueue it sleeps
+ * on.  Schedulers live in the ksi_scheds array of their owning
+ * struct ksock_sched_info (kss_info).
+ */
+typedef struct				  /* per scheduler state */
+{
+	spinlock_t		kss_lock;	/* serialise */
+	struct list_head		kss_rx_conns;	/* conn waiting to be read */
+	/* conn waiting to be written */
+	struct list_head		kss_tx_conns;
+	/* zombie noop tx list */
+	struct list_head		kss_zombie_noop_txs;
+	wait_queue_head_t		kss_waitq;	/* where scheduler sleeps */
+	/* # connections assigned to this scheduler */
+	int			kss_nconns;
+	struct ksock_sched_info	*kss_info;	/* owner of it */
+	/* NOTE(review): presumably per-scheduler scratch space for building
+	 * I/O vectors; the users are not visible in this header */
+	struct page		*kss_rx_scratch_pgs[LNET_MAX_IOV];
+	struct iovec		kss_scratch_iov[LNET_MAX_IOV];
+} ksock_sched_t;
+
+/*
+ * Scheduler pool of one CPT.  ksi_scheds holds ksi_nthreads_max entries;
+ * ksi_nthreads of them currently have a thread running.
+ */
+struct ksock_sched_info {
+	int			ksi_nthreads_max; /* max allowed threads */
+	int			ksi_nthreads;	/* number of threads */
+	int			ksi_cpt;	/* CPT id */
+	ksock_sched_t		*ksi_scheds;	/* array of schedulers */
+};
+
+/*
+ * A scheduler thread id packs the CPT number into the bits above
+ * KSOCK_CPT_SHIFT and the scheduler index (sid) into the low
+ * KSOCK_CPT_SHIFT bits.  KSOCK_THREAD_CPT() and KSOCK_THREAD_SID() take
+ * such an id apart again.
+ */
+#define KSOCK_CPT_SHIFT			16
+#define KSOCK_THREAD_ID(cpt, sid)	(((cpt) << KSOCK_CPT_SHIFT) | (sid))
+#define KSOCK_THREAD_CPT(id)		((id) >> KSOCK_CPT_SHIFT)
+#define KSOCK_THREAD_SID(id)		((id) & ((1UL << KSOCK_CPT_SHIFT) - 1))
+
+/*
+ * One local IP interface used by a net: its address, netmask and name,
+ * plus counts of the routes and peers currently using it.
+ */
+typedef struct				  /* in-use interface */
+{
+	__u32		ksni_ipaddr;		/* interface's IP address */
+	__u32		ksni_netmask;		/* interface's network mask */
+	int		ksni_nroutes;		/* # routes using (active) */
+	int		ksni_npeers;		/* # peers using (passive) */
+	char		ksni_name[IFNAMSIZ];	/* interface name */
+} ksock_interface_t;
+
+/*
+ * Tunables of the socket LND.  Every member is a pointer to the storage of
+ * the actual value, so users read them as e.g.
+ * *ksocknal_tunables.ksnd_nconnds.
+ */
+typedef struct
+{
+	/* "stuck" socket timeout (seconds) */
+	int	      *ksnd_timeout;
+	/* # scheduler threads in each pool while starting */
+	int		 *ksnd_nscheds;
+	int	      *ksnd_nconnds;	 /* # connection daemons */
+	int	      *ksnd_nconnds_max;     /* max # connection daemons */
+	int	      *ksnd_min_reconnectms; /* first connection retry after (ms)... */
+	int	      *ksnd_max_reconnectms; /* ...exponentially increasing to this */
+	int	      *ksnd_eager_ack;       /* make TCP ack eagerly? */
+	int	      *ksnd_typed_conns;     /* drive sockets by type? */
+	int	      *ksnd_min_bulk;	/* smallest "large" message */
+	int	      *ksnd_tx_buffer_size;  /* socket tx buffer size */
+	int	      *ksnd_rx_buffer_size;  /* socket rx buffer size */
+	int	      *ksnd_nagle;	   /* enable NAGLE? */
+	int	      *ksnd_round_robin;     /* round robin for multiple interfaces */
+	int	      *ksnd_keepalive;       /* # secs for sending keepalive NOOP */
+	int	      *ksnd_keepalive_idle;  /* # idle secs before 1st probe */
+	int	      *ksnd_keepalive_count; /* # probes */
+	int	      *ksnd_keepalive_intvl; /* time between probes */
+	int	      *ksnd_credits;	 /* # concurrent sends */
+	int	      *ksnd_peertxcredits;   /* # concurrent sends to 1 peer */
+	int	      *ksnd_peerrtrcredits;  /* # per-peer router buffer credits */
+	int	      *ksnd_peertimeout;     /* seconds to consider peer dead */
+	int	      *ksnd_enable_csum;     /* enable check sum */
+	int	      *ksnd_inject_csum_error; /* set non-zero to inject checksum error */
+	int	      *ksnd_nonblk_zcack;    /* always send zc-ack on non-blocking connection */
+	unsigned int     *ksnd_zc_min_payload;  /* minimum zero copy payload size */
+	int	      *ksnd_zc_recv;	 /* enable ZC receive (for Chelsio TOE) */
+	int	      *ksnd_zc_recv_min_nfrags; /* minimum # of fragments to enable ZC receive */
+#if defined(CONFIG_SYSCTL) && !CFS_SYSFS_MODULE_PARM
+	ctl_table_header_t *ksnd_sysctl;   /* sysctl interface */
+#endif
+} ksock_tunables_t;
+
+/*
+ * Per-network-instance state.  One of these hangs off each NI's ni_data
+ * and is chained on ksocknal_data.ksnd_nets through ksnn_list.
+ */
+typedef struct
+{
+	__u64		  ksnn_incarnation;	/* my epoch */
+	spinlock_t	  ksnn_lock;		/* serialise */
+	struct list_head	  ksnn_list;		/* chain on global list */
+	int		  ksnn_npeers;		/* # peers */
+	int		  ksnn_shutdown;	/* shutting down? */
+	int		  ksnn_ninterfaces;	/* IP interfaces */
+	ksock_interface_t ksnn_interfaces[LNET_MAX_INTERFACES];
+} ksock_net_t;
+
+/** connd timeout */
+#define SOCKNAL_CONND_TIMEOUT  120
+/** reserved thread for accepting & creating new connd */
+#define SOCKNAL_CONND_RESV     1
+
+/*
+ * Global state of the socket LND (the single instance is ksocknal_data).
+ * ksnd_init holds one of the SOCKNAL_INIT_* values and records how far
+ * initialisation has progressed.
+ */
+typedef struct
+{
+	int			ksnd_init;	/* initialisation state */
+	int			ksnd_nnets;	/* # networks set up */
+	struct list_head		ksnd_nets;	/* list of nets */
+	/* stabilize peer/conn ops */
+	rwlock_t		ksnd_global_lock;
+	/* hash table of all my known peers */
+	struct list_head		*ksnd_peers;
+	int			ksnd_peer_hash_size; /* size of ksnd_peers */
+
+	int			ksnd_nthreads;	/* # live threads */
+	int			ksnd_shuttingdown; /* tell threads to exit */
+	/* schedulers information */
+	struct ksock_sched_info	**ksnd_sched_info;
+
+	atomic_t      ksnd_nactive_txs;    /* #active txs */
+
+	struct list_head	ksnd_deathrow_conns; /* conns to close: reaper_lock*/
+	struct list_head	ksnd_zombie_conns;   /* conns to free: reaper_lock */
+	struct list_head	ksnd_enomem_conns;   /* conns to retry: reaper_lock*/
+	wait_queue_head_t       ksnd_reaper_waitq;   /* reaper sleeps here */
+	cfs_time_t	ksnd_reaper_waketime;/* when reaper will wake */
+	spinlock_t	  ksnd_reaper_lock;	/* serialise */
+
+	int	       ksnd_enomem_tx;      /* test ENOMEM sender */
+	int	       ksnd_stall_tx;       /* test sluggish sender */
+	int	       ksnd_stall_rx;       /* test sluggish receiver */
+
+	struct list_head	ksnd_connd_connreqs; /* incoming connection requests */
+	struct list_head	ksnd_connd_routes;   /* routes waiting to be connected */
+	wait_queue_head_t       ksnd_connd_waitq;    /* connds sleep here */
+	int	       ksnd_connd_connecting;/* # connds connecting */
+	/** time stamp of the last failed connecting attempt */
+	long	      ksnd_connd_failed_stamp;
+	/** # starting connd */
+	unsigned	  ksnd_connd_starting;
+	/** time stamp of the last starting connd */
+	long	      ksnd_connd_starting_stamp;
+	/** # running connd */
+	unsigned	  ksnd_connd_running;
+	spinlock_t	  ksnd_connd_lock;	/* serialise */
+
+	struct list_head	  ksnd_idle_noop_txs;	/* list head for freed noop tx */
+	spinlock_t	  ksnd_tx_lock;		/* serialise, g_lock unsafe */
+
+} ksock_nal_data_t;
+
+#define SOCKNAL_INIT_NOTHING    0
+#define SOCKNAL_INIT_DATA       1
+#define SOCKNAL_INIT_ALL	2
+
+/* A packet just assembled for transmission is represented by 1 or more
+ * struct iovec fragments (the first frag contains the portals header),
+ * followed by 0 or more lnet_kiov_t fragments.
+ *
+ * On the receive side, initially 1 struct iovec fragment is posted for
+ * receive (the header).  Once the header has been received, the payload is
+ * received into either struct iovec or lnet_kiov_t fragments, depending on
+ * what the header matched or whether the message needs forwarding. */
+
+struct ksock_conn;			      /* forward ref */
+struct ksock_peer;			      /* forward ref */
+struct ksock_route;			     /* forward ref */
+struct ksock_proto;			     /* forward ref */
+
+/*
+ * Descriptor of one packet to transmit.  The packet is described by
+ * tx_niov iovec fragments (tx_iov) followed by tx_nkiov page fragments
+ * (tx_kiov).  The fragment descriptors themselves are stored in the
+ * trailing tx_frags union, whose layout depends on whether the payload is
+ * paged or virtually addressed.
+ */
+typedef struct				  /* transmit packet */
+{
+	struct list_head     tx_list;	/* queue on conn for transmission etc */
+	struct list_head     tx_zc_list;     /* queue on peer for ZC request */
+	atomic_t   tx_refcount;    /* tx reference count */
+	int	    tx_nob;	 /* # packet bytes */
+	int	    tx_resid;       /* residual bytes */
+	int	    tx_niov;	/* # packet iovec frags */
+	struct iovec  *tx_iov;	 /* packet iovec frags */
+	int	    tx_nkiov;       /* # packet page frags */
+	unsigned short tx_zc_aborted;  /* aborted ZC request */
+	unsigned short tx_zc_capable:1; /* payload is large enough for ZC */
+	unsigned short tx_zc_checked:1; /* Have I checked if I should ZC? */
+	unsigned short tx_nonblk:1;    /* it's a non-blocking ACK */
+	lnet_kiov_t   *tx_kiov;	/* packet page frags */
+	struct ksock_conn  *tx_conn;	/* owning conn */
+	lnet_msg_t    *tx_lnetmsg;     /* lnet message for lnet_finalize() */
+	cfs_time_t     tx_deadline;    /* when (in jiffies) tx times out */
+	ksock_msg_t    tx_msg;	 /* socklnd message buffer */
+	int	    tx_desc_size;   /* size of this descriptor */
+	union {
+		struct {
+			struct iovec iov;       /* virt hdr */
+			lnet_kiov_t  kiov[0];   /* paged payload */
+		}		  paged;
+		struct {
+			struct iovec iov[1];    /* virt hdr + payload */
+		}		  virt;
+	}		       tx_frags;
+} ksock_tx_t;
+
+/* size of a tx descriptor up to, but not including, the first paged
+ * fragment, i.e. one that carries only the single header iovec */
+#define KSOCK_NOOP_TX_SIZE  ((int)offsetof(ksock_tx_t, tx_frags.paged.kiov[0]))
+
+/* network zero copy callback descriptor embedded in ksock_tx_t */
+
+/* space for the rx frag descriptors; we either read a single contiguous
+ * header, or up to LNET_MAX_IOV frags of payload of either type. */
+typedef union {
+	struct iovec     iov[LNET_MAX_IOV];	/* iovec frags */
+	lnet_kiov_t      kiov[LNET_MAX_IOV];	/* page frags */
+} ksock_rxiovspace_t;
+
+#define SOCKNAL_RX_KSM_HEADER   1	       /* reading ksock message header */
+#define SOCKNAL_RX_LNET_HEADER  2	       /* reading lnet message header */
+#define SOCKNAL_RX_PARSE	3	       /* Calling lnet_parse() */
+#define SOCKNAL_RX_PARSE_WAIT   4	       /* waiting to be told to read the body */
+#define SOCKNAL_RX_LNET_PAYLOAD 5	       /* reading lnet payload (to deliver here) */
+#define SOCKNAL_RX_SLOP	 6	       /* skipping body */
+
+/*
+ * One socket connection to a peer.  It carries two reference counts
+ * (ksnc_conn_refcount for the connection itself, ksnc_sock_refcount for
+ * its socket), identification of both ends, and separate receive
+ * ("reader") and transmit ("WRITER") state.
+ */
+typedef struct ksock_conn
+{
+	struct ksock_peer  *ksnc_peer;	 /* owning peer */
+	struct ksock_route *ksnc_route;	/* owning route */
+	struct list_head	  ksnc_list;	 /* stash on peer's conn list */
+	socket_t       *ksnc_sock;	 /* actual socket */
+	void	       *ksnc_saved_data_ready; /* socket's original data_ready() callback */
+	void	       *ksnc_saved_write_space; /* socket's original write_space() callback */
+	atomic_t	ksnc_conn_refcount; /* conn refcount */
+	atomic_t	ksnc_sock_refcount; /* sock refcount */
+	ksock_sched_t      *ksnc_scheduler;  /* who schedules this connection */
+	__u32	       ksnc_myipaddr;   /* my IP */
+	__u32	       ksnc_ipaddr;     /* peer's IP */
+	int		 ksnc_port;       /* peer's port */
+	signed int	  ksnc_type:3;     /* type of connection,
+					      * should be signed value */
+	unsigned int	    ksnc_closing:1;  /* being shut down */
+	unsigned int	    ksnc_flip:1;     /* flip or not, only for V2.x */
+	unsigned int	    ksnc_zc_capable:1; /* enable to ZC */
+	struct ksock_proto *ksnc_proto;      /* protocol for the connection */
+
+	/* reader */
+	struct list_head  ksnc_rx_list;     /* where I enq waiting input or a forwarding descriptor */
+	cfs_time_t	    ksnc_rx_deadline; /* when (in jiffies) receive times out */
+	__u8		  ksnc_rx_started;  /* started receiving a message */
+	__u8		  ksnc_rx_ready;    /* data ready to read */
+	__u8		  ksnc_rx_scheduled;/* being progressed */
+	__u8		  ksnc_rx_state;    /* what is being read */
+	int		   ksnc_rx_nob_left; /* # bytes to next hdr/body */
+	int		   ksnc_rx_nob_wanted; /* bytes actually wanted */
+	int		   ksnc_rx_niov;     /* # iovec frags */
+	struct iovec	 *ksnc_rx_iov;      /* the iovec frags */
+	int		   ksnc_rx_nkiov;    /* # page frags */
+	lnet_kiov_t	  *ksnc_rx_kiov;     /* the page frags */
+	ksock_rxiovspace_t    ksnc_rx_iov_space;/* space for frag descriptors */
+	__u32		 ksnc_rx_csum;     /* partial checksum for incoming data */
+	void		 *ksnc_cookie;      /* rx lnet_finalize passthru arg */
+	ksock_msg_t	   ksnc_msg;	 /* incoming message buffer:
+						 * V2.x message takes the
+						 * whole struct
+						 * V1.x message is a bare
+						 * lnet_hdr_t, it's stored in
+						 * ksnc_msg.ksm_u.lnetmsg */
+
+	/* WRITER */
+	struct list_head	    ksnc_tx_list;     /* where I enq waiting for output space */
+	struct list_head	    ksnc_tx_queue;    /* packets waiting to be sent */
+	ksock_tx_t	   *ksnc_tx_carrier;  /* next TX that can carry a LNet message or ZC-ACK */
+	cfs_time_t	    ksnc_tx_deadline; /* when (in jiffies) tx times out */
+	int		   ksnc_tx_bufnob;     /* send buffer marker */
+	atomic_t	  ksnc_tx_nob;	/* # bytes queued */
+	int		   ksnc_tx_ready;      /* write space */
+	int		   ksnc_tx_scheduled;  /* being progressed */
+	cfs_time_t	    ksnc_tx_last_post;  /* time stamp of the last posted TX */
+} ksock_conn_t;
+
+/*
+ * A route: one (local IP, remote IP, port) combination through which a
+ * peer can be reached, together with its reconnection timing and
+ * connection state.  Reference counted through ksnr_refcount.
+ */
+typedef struct ksock_route
+{
+	struct list_head	    ksnr_list;	/* chain on peer route list */
+	struct list_head	    ksnr_connd_list;  /* chain on ksnr_connd_routes */
+	struct ksock_peer    *ksnr_peer;	/* owning peer */
+	atomic_t	  ksnr_refcount;    /* # users */
+	cfs_time_t	    ksnr_timeout;     /* when (in jiffies) reconnection can happen next */
+	cfs_duration_t	ksnr_retry_interval; /* how long between retries */
+	__u32		 ksnr_myipaddr;    /* my IP */
+	__u32		 ksnr_ipaddr;      /* IP address to connect to */
+	int		   ksnr_port;	/* port to connect to */
+	unsigned int	  ksnr_scheduled:1; /* scheduled for attention */
+	unsigned int	  ksnr_connecting:1;/* connection establishment in progress */
+	/* NOTE(review): presumably a bitmask with one bit per SOCKLND_CONN_*
+	 * type (cf. ksocknal_route_mask()) - confirm against the users */
+	unsigned int	  ksnr_connected:4; /* connections established by type */
+	unsigned int	  ksnr_deleted:1;   /* been removed from peer? */
+	unsigned int	  ksnr_share_count; /* created explicitly? */
+	int		   ksnr_conn_count;  /* # conns established by this route */
+} ksock_route_t;
+
+#define SOCKNAL_KEEPALIVE_PING	  1       /* cookie for keepalive ping */
+
+/*
+ * A remote peer: its identity, the NI it belongs to, and the lists of its
+ * connections, routes, queued packets and outstanding zero-copy requests.
+ * Peers are hashed on ksocknal_data.ksnd_peers through ksnp_list and
+ * reference counted through ksnp_refcount.
+ */
+typedef struct ksock_peer
+{
+	struct list_head	    ksnp_list;	/* stash on global peer list */
+	cfs_time_t	    ksnp_last_alive;  /* when (in jiffies) I was last alive */
+	lnet_process_id_t     ksnp_id;       /* who's on the other end(s) */
+	atomic_t	  ksnp_refcount; /* # users */
+	int		   ksnp_sharecount;  /* lconf usage counter */
+	int		   ksnp_closing;  /* being closed */
+	int		   ksnp_accepting;/* # passive connections pending */
+	int		   ksnp_error;    /* errno on closing last conn */
+	__u64		 ksnp_zc_next_cookie;/* ZC completion cookie */
+	__u64		 ksnp_incarnation;   /* latest known peer incarnation */
+	struct ksock_proto   *ksnp_proto;    /* latest known peer protocol */
+	struct list_head	    ksnp_conns;    /* all active connections */
+	struct list_head	    ksnp_routes;   /* routes */
+	struct list_head	    ksnp_tx_queue; /* waiting packets */
+	spinlock_t	      ksnp_lock;	/* serialize, g_lock unsafe */
+	struct list_head	    ksnp_zc_req_list;   /* zero copy requests wait for ACK  */
+	cfs_time_t	    ksnp_send_keepalive; /* time to send keepalive */
+	lnet_ni_t	    *ksnp_ni;       /* which network */
+	int		   ksnp_n_passive_ips; /* # of... */
+	__u32		 ksnp_passive_ips[LNET_MAX_INTERFACES]; /* preferred local interfaces */
+} ksock_peer_t;
+
+/*
+ * An accepted socket waiting on ksocknal_data.ksnd_connd_connreqs,
+ * together with the NI chosen for it.
+ */
+typedef struct ksock_connreq
+{
+	struct list_head	    ksncr_list;     /* stash on ksnd_connd_connreqs */
+	lnet_ni_t	    *ksncr_ni;       /* chosen NI */
+	socket_t	 *ksncr_sock;     /* accepted socket */
+} ksock_connreq_t;
+
+extern ksock_nal_data_t ksocknal_data;
+extern ksock_tunables_t ksocknal_tunables;
+
+#define SOCKNAL_MATCH_NO	0	/* TX can't match type of connection */
+#define SOCKNAL_MATCH_YES       1	/* TX matches type of connection */
+#define SOCKNAL_MATCH_MAY       2	/* TX can be sent on the connection, but not preferred */
+
+/*
+ * Operations table of one wire-protocol version.  An instance exists per
+ * supported version (ksocknal_protocol_v1x/v2x/v3x); connections and peers
+ * point at the one they use.
+ */
+typedef struct ksock_proto
+{
+	int	   pro_version;					      /* version number of protocol */
+	int	 (*pro_send_hello)(ksock_conn_t *, ksock_hello_msg_t *);     /* handshake function */
+	int	 (*pro_recv_hello)(ksock_conn_t *, ksock_hello_msg_t *, int);/* handshake function */
+	void	(*pro_pack)(ksock_tx_t *);				  /* message pack */
+	void	(*pro_unpack)(ksock_msg_t *);			       /* message unpack */
+	ksock_tx_t *(*pro_queue_tx_msg)(ksock_conn_t *, ksock_tx_t *);	  /* queue tx on the connection */
+	int	 (*pro_queue_tx_zcack)(ksock_conn_t *, ksock_tx_t *, __u64); /* queue ZC ack on the connection */
+	int	 (*pro_handle_zcreq)(ksock_conn_t *, __u64, int);	    /* handle ZC request */
+	int	 (*pro_handle_zcack)(ksock_conn_t *, __u64, __u64);	  /* handle ZC ACK */
+	int	 (*pro_match_tx)(ksock_conn_t *, ksock_tx_t *, int);	 /* msg type matches the connection type:
+										 * return value:
+										 *   return MATCH_NO  : no
+										 *   return MATCH_YES : matching type
+										 *   return MATCH_MAY : can be backup */
+} ksock_proto_t;
+
+extern ksock_proto_t ksocknal_protocol_v1x;
+extern ksock_proto_t ksocknal_protocol_v2x;
+extern ksock_proto_t ksocknal_protocol_v3x;
+
+#define KSOCK_PROTO_V1_MAJOR    LNET_PROTO_TCP_VERSION_MAJOR
+#define KSOCK_PROTO_V1_MINOR    LNET_PROTO_TCP_VERSION_MINOR
+#define KSOCK_PROTO_V1	  KSOCK_PROTO_V1_MAJOR
+
+#ifndef CPU_MASK_NONE
+#define CPU_MASK_NONE   0UL
+#endif
+
+/*
+ * Bitmask of the connection types a route may carry: only
+ * SOCKLND_CONN_ANY when typed connections are disabled, otherwise the
+ * CONTROL, BULK_IN and BULK_OUT types.
+ */
+static inline int
+ksocknal_route_mask(void)
+{
+	int	mask;
+
+	if (*ksocknal_tunables.ksnd_typed_conns) {
+		mask = (1 << SOCKLND_CONN_CONTROL) |
+		       (1 << SOCKLND_CONN_BULK_IN) |
+		       (1 << SOCKLND_CONN_BULK_OUT);
+	} else {
+		mask = 1 << SOCKLND_CONN_ANY;
+	}
+
+	return mask;
+}
+
+/*
+ * Map @nid to its bucket in the peer hash table: the NID truncated to
+ * unsigned int, modulo the table size.
+ */
+static inline struct list_head *
+ksocknal_nid2peerlist (lnet_nid_t nid)
+{
+	unsigned int bucket = (unsigned int)nid;
+
+	bucket %= ksocknal_data.ksnd_peer_hash_size;
+
+	return &ksocknal_data.ksnd_peers[bucket];
+}
+
+/*
+ * Take an additional reference on @conn.  The count must already be
+ * positive (asserted), i.e. this cannot resurrect a dead connection.
+ */
+static inline void
+ksocknal_conn_addref (ksock_conn_t *conn)
+{
+	LASSERT (atomic_read(&conn->ksnc_conn_refcount) > 0);
+	atomic_inc(&conn->ksnc_conn_refcount);
+}
+
+extern void ksocknal_queue_zombie_conn (ksock_conn_t *conn);
+extern void ksocknal_finalize_zcreq(ksock_conn_t *conn);
+
+/*
+ * Drop a reference on @conn; when the last one goes the connection is
+ * handed to ksocknal_queue_zombie_conn().
+ */
+static inline void
+ksocknal_conn_decref (ksock_conn_t *conn)
+{
+	LASSERT (atomic_read(&conn->ksnc_conn_refcount) > 0);
+
+	if (!atomic_dec_and_test(&conn->ksnc_conn_refcount))
+		return;
+
+	/* that was the last reference */
+	ksocknal_queue_zombie_conn(conn);
+}
+
+/*
+ * Take a reference on @conn's socket unless the connection is closing.
+ * The check and the increment are done under ksnd_global_lock (read).
+ *
+ * Returns 0 when the reference was taken, -ESHUTDOWN when @conn is closing.
+ */
+static inline int
+ksocknal_connsock_addref (ksock_conn_t *conn)
+{
+	int   closing;
+
+	read_lock(&ksocknal_data.ksnd_global_lock);
+	closing = conn->ksnc_closing;
+	if (!closing) {
+		LASSERT(atomic_read(&conn->ksnc_sock_refcount) > 0);
+		atomic_inc(&conn->ksnc_sock_refcount);
+	}
+	read_unlock(&ksocknal_data.ksnd_global_lock);
+
+	return closing ? -ESHUTDOWN : 0;
+}
+
+/*
+ * Drop a reference on @conn's socket.  On the last reference the
+ * connection must already be closing (asserted); the socket is released,
+ * the pointer cleared and ksocknal_finalize_zcreq() is called.
+ */
+static inline void
+ksocknal_connsock_decref (ksock_conn_t *conn)
+{
+	LASSERT (atomic_read(&conn->ksnc_sock_refcount) > 0);
+
+	if (!atomic_dec_and_test(&conn->ksnc_sock_refcount))
+		return;
+
+	/* last socket reference */
+	LASSERT (conn->ksnc_closing);
+	libcfs_sock_release(conn->ksnc_sock);
+	conn->ksnc_sock = NULL;
+	ksocknal_finalize_zcreq(conn);
+}
+/* Take one more reference on @tx; asserts the count is already positive. */
+static inline void
+ksocknal_tx_addref (ksock_tx_t *tx)
+{
+	LASSERT (atomic_read(&tx->tx_refcount) > 0);
+	atomic_inc(&tx->tx_refcount);
+}
+
+extern void ksocknal_tx_prep (ksock_conn_t *, ksock_tx_t *tx);
+extern void ksocknal_tx_done (lnet_ni_t *ni, ksock_tx_t *tx);
+
+/*
+ * Drop one reference on @tx.  The last put completes the descriptor via
+ * ksocknal_tx_done() with a NULL ni.
+ */
+static inline void
+ksocknal_tx_decref (ksock_tx_t *tx)
+{
+	LASSERT (atomic_read(&tx->tx_refcount) > 0);
+
+	if (!atomic_dec_and_test(&tx->tx_refcount))
+		return;
+
+	ksocknal_tx_done(NULL, tx);
+}
+/* Take one more reference on @route; asserts the count is already positive. */
+static inline void
+ksocknal_route_addref (ksock_route_t *route)
+{
+	LASSERT (atomic_read(&route->ksnr_refcount) > 0);
+	atomic_inc(&route->ksnr_refcount);
+}
+
+extern void ksocknal_destroy_route (ksock_route_t *route);
+
+/* Drop one reference on @route; the last put destroys the route. */
+static inline void
+ksocknal_route_decref (ksock_route_t *route)
+{
+	LASSERT (atomic_read (&route->ksnr_refcount) > 0);
+
+	if (!atomic_dec_and_test(&route->ksnr_refcount))
+		return;
+
+	ksocknal_destroy_route(route);
+}
+/* Take one more reference on @peer; asserts the count is already positive. */
+static inline void
+ksocknal_peer_addref (ksock_peer_t *peer)
+{
+	LASSERT (atomic_read (&peer->ksnp_refcount) > 0);
+	atomic_inc(&peer->ksnp_refcount);
+}
+
+extern void ksocknal_destroy_peer (ksock_peer_t *peer);
+
+/* Drop one reference on @peer; the last put destroys the peer. */
+static inline void
+ksocknal_peer_decref (ksock_peer_t *peer)
+{
+	LASSERT (atomic_read (&peer->ksnp_refcount) > 0);
+
+	if (!atomic_dec_and_test(&peer->ksnp_refcount))
+		return;
+
+	ksocknal_destroy_peer(peer);
+}
+/* Entry points that take an lnet_ni_t: startup/shutdown, control, send/recv, accept. */
+int ksocknal_startup (lnet_ni_t *ni);
+void ksocknal_shutdown (lnet_ni_t *ni);
+int ksocknal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg);
+int ksocknal_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg);
+int ksocknal_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg,
+		  int delayed, unsigned int niov,
+		  struct iovec *iov, lnet_kiov_t *kiov,
+		  unsigned int offset, unsigned int mlen, unsigned int rlen);
+int ksocknal_accept(lnet_ni_t *ni, socket_t *sock);
+/* Peer and connection management. */
+extern int ksocknal_add_peer(lnet_ni_t *ni, lnet_process_id_t id, __u32 ip, int port);
+extern ksock_peer_t *ksocknal_find_peer_locked (lnet_ni_t *ni, lnet_process_id_t id);
+extern ksock_peer_t *ksocknal_find_peer (lnet_ni_t *ni, lnet_process_id_t id);
+extern void ksocknal_peer_failed (ksock_peer_t *peer);
+extern int ksocknal_create_conn (lnet_ni_t *ni, ksock_route_t *route,
+				 socket_t *sock, int type);
+extern void ksocknal_close_conn_locked (ksock_conn_t *conn, int why);
+extern void ksocknal_terminate_conn (ksock_conn_t *conn);
+extern void ksocknal_destroy_conn (ksock_conn_t *conn);
+extern int  ksocknal_close_peer_conns_locked (ksock_peer_t *peer,
+					      __u32 ipaddr, int why);
+extern int ksocknal_close_conn_and_siblings (ksock_conn_t *conn, int why);
+extern int ksocknal_close_matching_conns (lnet_process_id_t id, __u32 ipaddr);
+extern ksock_conn_t *ksocknal_find_conn_locked(ksock_peer_t *peer,
+					       ksock_tx_t *tx, int nonblk);
+/* Transmit descriptors, worker threads, route lookup and the HELLO exchange. */
+extern int  ksocknal_launch_packet(lnet_ni_t *ni, ksock_tx_t *tx,
+				   lnet_process_id_t id);
+extern ksock_tx_t *ksocknal_alloc_tx(int type, int size);
+extern void ksocknal_free_tx (ksock_tx_t *tx);
+extern ksock_tx_t *ksocknal_alloc_tx_noop(__u64 cookie, int nonblk);
+extern void ksocknal_next_tx_carrier(ksock_conn_t *conn);
+extern void ksocknal_queue_tx_locked (ksock_tx_t *tx, ksock_conn_t *conn);
+extern void ksocknal_txlist_done (lnet_ni_t *ni, struct list_head *txlist,
+				  int error);
+extern void ksocknal_notify (lnet_ni_t *ni, lnet_nid_t gw_nid, int alive);
+extern void ksocknal_query (struct lnet_ni *ni, lnet_nid_t nid, cfs_time_t *when);
+extern int ksocknal_thread_start(int (*fn)(void *arg), void *arg, char *name);
+extern void ksocknal_thread_fini (void);
+extern void ksocknal_launch_all_connections_locked (ksock_peer_t *peer);
+extern ksock_route_t *ksocknal_find_connectable_route_locked (ksock_peer_t *peer);
+extern ksock_route_t *ksocknal_find_connecting_route_locked (ksock_peer_t *peer);
+extern int ksocknal_new_packet (ksock_conn_t *conn, int skip);
+extern int ksocknal_scheduler (void *arg);
+extern int ksocknal_connd (void *arg);
+extern int ksocknal_reaper (void *arg);
+extern int ksocknal_send_hello (lnet_ni_t *ni, ksock_conn_t *conn,
+				lnet_nid_t peer_nid, ksock_hello_msg_t *hello);
+extern int ksocknal_recv_hello (lnet_ni_t *ni, ksock_conn_t *conn,
+				ksock_hello_msg_t *hello, lnet_process_id_t *id,
+				__u64 *incarnation);
+extern void ksocknal_read_callback(ksock_conn_t *conn);
+extern void ksocknal_write_callback(ksock_conn_t *conn);
+/* ksocknal_lib_* helpers.  NOTE(review): presumably the platform-specific socket layer - confirm. */
+extern int ksocknal_lib_zc_capable(ksock_conn_t *conn);
+extern void ksocknal_lib_save_callback(socket_t *sock, ksock_conn_t *conn);
+extern void ksocknal_lib_set_callback(socket_t *sock,  ksock_conn_t *conn);
+extern void ksocknal_lib_reset_callback(socket_t *sock, ksock_conn_t *conn);
+extern void ksocknal_lib_push_conn (ksock_conn_t *conn);
+extern int ksocknal_lib_get_conn_addrs (ksock_conn_t *conn);
+extern int ksocknal_lib_setup_sock (socket_t *so);
+extern int ksocknal_lib_send_iov (ksock_conn_t *conn, ksock_tx_t *tx);
+extern int ksocknal_lib_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx);
+extern void ksocknal_lib_eager_ack (ksock_conn_t *conn);
+extern int ksocknal_lib_recv_iov (ksock_conn_t *conn);
+extern int ksocknal_lib_recv_kiov (ksock_conn_t *conn);
+extern int ksocknal_lib_get_conn_tunables (ksock_conn_t *conn, int *txmem,
+					   int *rxmem, int *nagle);
+/* Tunables set-up and tear-down. */
+extern int ksocknal_tunables_init(void);
+extern void ksocknal_tunables_fini(void);
+extern int ksocknal_lib_tunables_init(void);
+extern void ksocknal_lib_tunables_fini(void);
+
+extern void ksocknal_lib_csum_tx(ksock_tx_t *tx);
+
+extern int ksocknal_lib_memory_pressure(ksock_conn_t *conn);
+extern int ksocknal_lib_bind_thread_to_cpu(int id);

diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c
new file mode 100644
index 0000000..ad5e241
--- /dev/null
+++ b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c

@@ -0,0 +1,2664 @@
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ *
+ *   Author: Zach Brown <zab@zabbo.net>
+ *   Author: Peter J. Braam <braam@clusterfs.com>
+ *   Author: Phil Schwan <phil@clusterfs.com>
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include "socklnd.h"
+
+/*
+ * Get a tx descriptor of @size bytes for a message of @type.
+ *
+ * Noop descriptors are recycled from ksnd_idle_noop_txs when one is
+ * available; everything else (or a noop when the idle list is empty) is
+ * freshly allocated.  Returns NULL if the allocation fails.  The returned
+ * descriptor holds one reference and is counted in ksnd_nactive_txs.
+ */
+ksock_tx_t *
+ksocknal_alloc_tx(int type, int size)
+{
+	struct list_head *idle = &ksocknal_data.ksnd_idle_noop_txs;
+	ksock_tx_t *tx = NULL;
+
+	if (type == KSOCK_MSG_NOOP) {
+		LASSERT(size == KSOCK_NOOP_TX_SIZE);
+
+		/* try to recycle a cached noop descriptor first */
+		spin_lock(&ksocknal_data.ksnd_tx_lock);
+		if (!list_empty(idle)) {
+			tx = list_entry(idle->next, ksock_tx_t, tx_list);
+			LASSERT(tx->tx_desc_size == size);
+			list_del(&tx->tx_list);
+		}
+		spin_unlock(&ksocknal_data.ksnd_tx_lock);
+	}
+
+	if (tx == NULL) {
+		LIBCFS_ALLOC(tx, size);
+		if (tx == NULL)
+			return NULL;
+	}
+
+	tx->tx_desc_size  = size;
+	tx->tx_zc_checked = 0;
+	tx->tx_zc_capable = 0;
+	tx->tx_zc_aborted = 0;
+	atomic_set(&tx->tx_refcount, 1);
+
+	atomic_inc(&ksocknal_data.ksnd_nactive_txs);
+
+	return tx;
+}
+
+/*
+ * Build a noop tx that carries @cookie in ksm_zc_cookies[1].
+ *
+ * @nonblk is stored in tx_nonblk.  Returns NULL (after logging) when no
+ * descriptor can be allocated.
+ */
+ksock_tx_t *
+ksocknal_alloc_tx_noop(__u64 cookie, int nonblk)
+{
+	ksock_tx_t *noop;
+
+	noop = ksocknal_alloc_tx(KSOCK_MSG_NOOP, KSOCK_NOOP_TX_SIZE);
+	if (noop == NULL) {
+		CERROR("Can't allocate noop tx desc\n");
+		return NULL;
+	}
+
+	/* not bound to a connection or an LNET message */
+	noop->tx_conn    = NULL;
+	noop->tx_lnetmsg = NULL;
+	noop->tx_nonblk  = nonblk;
+
+	/* a single mapped fragment, no pages */
+	noop->tx_iov     = noop->tx_frags.virt.iov;
+	noop->tx_niov    = 1;
+	noop->tx_kiov    = NULL;
+	noop->tx_nkiov   = 0;
+
+	socklnd_init_msg(&noop->tx_msg, KSOCK_MSG_NOOP);
+	noop->tx_msg.ksm_zc_cookies[1] = cookie;
+
+	return noop;
+}
+
+
+/*
+ * Release @tx and uncount it from ksnd_nactive_txs.  A noop descriptor
+ * (no LNET message, noop-sized) is parked on ksnd_idle_noop_txs for reuse;
+ * any other descriptor is freed.
+ */
+void
+ksocknal_free_tx (ksock_tx_t *tx)
+{
+	int is_noop = (tx->tx_lnetmsg == NULL &&
+		       tx->tx_desc_size == KSOCK_NOOP_TX_SIZE);
+
+	atomic_dec(&ksocknal_data.ksnd_nactive_txs);
+
+	if (!is_noop) {
+		LIBCFS_FREE(tx, tx->tx_desc_size);
+		return;
+	}
+
+	/* noop descriptors are cached rather than freed */
+	spin_lock(&ksocknal_data.ksnd_tx_lock);
+	list_add(&tx->tx_list, &ksocknal_data.ksnd_idle_noop_txs);
+	spin_unlock(&ksocknal_data.ksnd_tx_lock);
+}
+
+/*
+ * Send from the iov fragments of @tx and advance the fragment cursor
+ * (tx_iov/tx_niov) and tx_resid past whatever was sent.
+ *
+ * Returns the value of ksocknal_lib_send_iov(): the byte count sent, or a
+ * value <= 0 when nothing was sent (in which case @tx is left untouched).
+ */
+int
+ksocknal_send_iov (ksock_conn_t *conn, ksock_tx_t *tx)
+{
+	struct iovec  *frag = tx->tx_iov;
+	int    nob;
+	int    sent;
+
+	LASSERT (tx->tx_niov > 0);
+
+	/* ksocknal_lib_send_iov() must never touch tx->tx_iov itself */
+	sent = ksocknal_lib_send_iov(conn, tx);
+	if (sent <= 0)			    /* sent nothing? */
+		return sent;
+
+	nob = sent;
+	LASSERT (nob <= tx->tx_resid);
+	tx->tx_resid -= nob;
+
+	/* step over fully sent fragments; trim a partially sent one */
+	while (nob != 0) {
+		LASSERT (tx->tx_niov > 0);
+
+		if (nob < (int) frag->iov_len) {
+			frag->iov_base = (void *)((char *)frag->iov_base + nob);
+			frag->iov_len -= nob;
+			break;
+		}
+
+		nob -= frag->iov_len;
+		tx->tx_iov = ++frag;
+		tx->tx_niov--;
+	}
+
+	return sent;
+}
+
+/*
+ * Send from the page (kiov) fragments of @tx, which must have no iov
+ * fragments left, and advance tx_kiov/tx_nkiov and tx_resid past whatever
+ * was sent.
+ *
+ * Returns the value of ksocknal_lib_send_kiov(): the byte count sent, or a
+ * value <= 0 when nothing was sent (in which case @tx is left untouched).
+ */
+int
+ksocknal_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx)
+{
+	lnet_kiov_t    *page = tx->tx_kiov;
+	int     nob;
+	int     sent;
+
+	LASSERT (tx->tx_niov == 0);
+	LASSERT (tx->tx_nkiov > 0);
+
+	/* ksocknal_lib_send_kiov() must never touch tx->tx_kiov itself */
+	sent = ksocknal_lib_send_kiov(conn, tx);
+	if (sent <= 0)			    /* sent nothing? */
+		return sent;
+
+	nob = sent;
+	LASSERT (nob <= tx->tx_resid);
+	tx->tx_resid -= nob;
+
+	/* step over fully sent pages; trim a partially sent one */
+	while (nob != 0) {
+		LASSERT(tx->tx_nkiov > 0);
+
+		if (nob < (int)page->kiov_len) {
+			page->kiov_offset += nob;
+			page->kiov_len -= nob;
+			break;
+		}
+
+		nob -= (int)page->kiov_len;
+		tx->tx_kiov = ++page;
+		tx->tx_nkiov--;
+	}
+
+	return sent;
+}
+/*
+ * Push as much of @tx as the socket of @conn will take.
+ *
+ * Returns 0 once tx_resid reaches zero, -ESHUTDOWN when no socket reference
+ * can be taken because @conn is closing, -EAGAIN when a send attempt wrote
+ * nothing (a result of 0 is treated as -EAGAIN), -ENOMEM when that -EAGAIN
+ * coincides with ksocknal_lib_memory_pressure(), or any other negative
+ * result of the send helpers unchanged.
+ */
+int
+ksocknal_transmit (ksock_conn_t *conn, ksock_tx_t *tx)
+{
+	int      rc;
+	int      bufnob;
+
+	if (ksocknal_data.ksnd_stall_tx != 0) {
+		cfs_pause(cfs_time_seconds(ksocknal_data.ksnd_stall_tx));
+	}
+
+	LASSERT (tx->tx_resid != 0);
+
+	rc = ksocknal_connsock_addref(conn);
+	if (rc != 0) {
+		LASSERT (conn->ksnc_closing);
+		return (-ESHUTDOWN);
+	}
+
+	do {
+		if (ksocknal_data.ksnd_enomem_tx > 0) {
+			/* testing... */
+			ksocknal_data.ksnd_enomem_tx--;
+			rc = -EAGAIN;
+		} else if (tx->tx_niov != 0) {
+			rc = ksocknal_send_iov (conn, tx);
+		} else {
+			rc = ksocknal_send_kiov (conn, tx);
+		}
+
+		bufnob = cfs_sock_wmem_queued(conn->ksnc_sock);
+		if (rc > 0)		     /* sent something? */
+			conn->ksnc_tx_bufnob += rc; /* account it */
+
+		if (bufnob < conn->ksnc_tx_bufnob) {
+			/* allocated send buffer bytes < computed; infer
+			 * something got ACKed */
+			conn->ksnc_tx_deadline =
+				cfs_time_shift(*ksocknal_tunables.ksnd_timeout);
+			conn->ksnc_peer->ksnp_last_alive = cfs_time_current();
+			conn->ksnc_tx_bufnob = bufnob;
+			mb();
+		}
+
+		if (rc <= 0) { /* Didn't write anything? */
+
+			if (rc == 0) /* some stacks return 0 instead of -EAGAIN */
+				rc = -EAGAIN;
+
+			/* Check if EAGAIN is due to memory pressure */
+			if(rc == -EAGAIN && ksocknal_lib_memory_pressure(conn))
+				rc = -ENOMEM;
+
+			break;
+		}
+
+		/* socket's wmem_queued now includes 'rc' bytes */
+		atomic_sub (rc, &conn->ksnc_tx_nob);
+		rc = 0;
+
+	} while (tx->tx_resid != 0);
+
+	ksocknal_connsock_decref(conn);
+	return (rc);
+}
+
+/*
+ * Receive into the iov fragments of @conn and advance the receive cursor
+ * (ksnc_rx_iov/ksnc_rx_niov) and the wanted/left byte counts.
+ *
+ * Returns the value of ksocknal_lib_recv_iov() when it is <= 0 or when the
+ * received bytes ended exactly on a fragment boundary, and -EAGAIN when
+ * the last fragment touched was only partially filled.
+ */
+int
+ksocknal_recv_iov (ksock_conn_t *conn)
+{
+	struct iovec *frag = conn->ksnc_rx_iov;
+	int     nob;
+	int     got;
+
+	LASSERT (conn->ksnc_rx_niov > 0);
+
+	/* ksocknal_lib_recv_iov() must neither touch conn->ksnc_rx_iov
+	 * nor change the connection status */
+	got = ksocknal_lib_recv_iov(conn);
+	if (got <= 0)
+		return got;
+
+	/* received something... */
+	nob = got;
+
+	conn->ksnc_peer->ksnp_last_alive = cfs_time_current();
+	conn->ksnc_rx_deadline =
+		cfs_time_shift(*ksocknal_tunables.ksnd_timeout);
+	mb();		       /* order with setting rx_started */
+	conn->ksnc_rx_started = 1;
+
+	conn->ksnc_rx_nob_wanted -= nob;
+	conn->ksnc_rx_nob_left -= nob;
+
+	/* step over filled fragments; trim a partially filled one */
+	while (nob != 0) {
+		LASSERT (conn->ksnc_rx_niov > 0);
+
+		if (nob < (int)frag->iov_len) {
+			frag->iov_len -= nob;
+			frag->iov_base = (void *)((char *)frag->iov_base + nob);
+			return -EAGAIN;
+		}
+
+		nob -= frag->iov_len;
+		conn->ksnc_rx_iov = ++frag;
+		conn->ksnc_rx_niov--;
+	}
+
+	return got;
+}
+
+/*
+ * Receive into the page (kiov) fragments of @conn and advance the receive
+ * cursor (ksnc_rx_kiov/ksnc_rx_nkiov) and the wanted/left byte counts.
+ *
+ * Returns the value of ksocknal_lib_recv_kiov() when it is <= 0, 1 when
+ * the received bytes ended exactly on a page-fragment boundary, and
+ * -EAGAIN when the last fragment touched was only partially filled.
+ */
+int
+ksocknal_recv_kiov (ksock_conn_t *conn)
+{
+	lnet_kiov_t   *page = conn->ksnc_rx_kiov;
+	int     nob;
+	int     got;
+
+	LASSERT (conn->ksnc_rx_nkiov > 0);
+
+	/* ksocknal_lib_recv_kiov() must neither touch conn->ksnc_rx_kiov
+	 * nor change the connection status */
+	got = ksocknal_lib_recv_kiov(conn);
+	if (got <= 0)
+		return got;
+
+	/* received something... */
+	nob = got;
+
+	conn->ksnc_peer->ksnp_last_alive = cfs_time_current();
+	conn->ksnc_rx_deadline =
+		cfs_time_shift(*ksocknal_tunables.ksnd_timeout);
+	mb();		       /* order with setting rx_started */
+	conn->ksnc_rx_started = 1;
+
+	conn->ksnc_rx_nob_wanted -= nob;
+	conn->ksnc_rx_nob_left -= nob;
+
+	/* step over filled pages; trim a partially filled one */
+	while (nob != 0) {
+		LASSERT (conn->ksnc_rx_nkiov > 0);
+
+		if (nob < (int) page->kiov_len) {
+			page->kiov_offset += nob;
+			page->kiov_len -= nob;
+			return -EAGAIN;
+		}
+
+		nob -= page->kiov_len;
+		conn->ksnc_rx_kiov = ++page;
+		conn->ksnc_rx_nkiov--;
+	}
+
+	return 1;
+}
+/*
+ * Receive into the current rx fragments of @conn until ksnc_rx_nob_wanted
+ * drops to zero, a fragment is only partially filled, or the receive
+ * helper reports EOF or an error.
+ */
+int
+ksocknal_receive (ksock_conn_t *conn)
+{
+	/* Return 1 on success, 0 on EOF, < 0 on error.
+	 * Caller checks ksnc_rx_nob_wanted to determine
+	 * progress/completion. */
+	int     rc;
+	ENTRY;
+
+	if (ksocknal_data.ksnd_stall_rx != 0) {
+		cfs_pause(cfs_time_seconds (ksocknal_data.ksnd_stall_rx));
+	}
+
+	rc = ksocknal_connsock_addref(conn);
+	if (rc != 0) {
+		LASSERT (conn->ksnc_closing);
+		return (-ESHUTDOWN);
+	}
+
+	for (;;) {
+		if (conn->ksnc_rx_niov != 0)
+			rc = ksocknal_recv_iov (conn);
+		else
+			rc = ksocknal_recv_kiov (conn);
+
+		if (rc <= 0) {
+			/* error/EOF or partial receive */
+			if (rc == -EAGAIN) { /* the helpers return -EAGAIN for a partly filled fragment */
+				rc = 1;
+			} else if (rc == 0 && conn->ksnc_rx_started) {
+				/* EOF in the middle of a message */
+				rc = -EPROTO;
+			}
+			break;
+		}
+
+		/* Completed a fragment */
+
+		if (conn->ksnc_rx_nob_wanted == 0) {
+			rc = 1;
+			break;
+		}
+	}
+
+	ksocknal_connsock_decref(conn);
+	RETURN (rc);
+}
+
+/*
+ * Complete @tx: drop the reference it holds on its connection (if any),
+ * release the descriptor and, when the tx carries an LNET message,
+ * finalize that message.
+ *
+ * @ni may be NULL only when the tx is bound to a connection; the NI is
+ * then taken from the connection's peer.  The message is finalized with 0
+ * when everything was sent and zero-copy was not aborted, -EIO otherwise.
+ */
+void
+ksocknal_tx_done (lnet_ni_t *ni, ksock_tx_t *tx)
+{
+	lnet_msg_t  *lnetmsg = tx->tx_lnetmsg;
+	ksock_conn_t *conn = tx->tx_conn;
+	int	  rc = (tx->tx_resid == 0 && !tx->tx_zc_aborted) ? 0 : -EIO;
+	ENTRY;
+
+	LASSERT(ni != NULL || tx->tx_conn != NULL);
+
+	if (conn != NULL) {
+		/* Read the NI while the tx still pins conn: once our
+		 * reference is dropped the connection may be queued for
+		 * destruction and must not be dereferenced any more. */
+		if (ni == NULL)
+			ni = conn->ksnc_peer->ksnp_ni;
+
+		ksocknal_conn_decref(conn);
+	}
+
+	ksocknal_free_tx (tx);
+	if (lnetmsg != NULL) /* KSOCK_MSG_NOOP go without lnetmsg */
+		lnet_finalize (ni, lnetmsg, rc);
+
+	EXIT;
+}
+
+/*
+ * Complete every tx on @txlist through ksocknal_tx_done().  When @error is
+ * non-zero each dropped packet is logged first.  Every tx on the list must
+ * hold exactly one reference.
+ */
+void
+ksocknal_txlist_done (lnet_ni_t *ni, struct list_head *txlist, int error)
+{
+	while (!list_empty (txlist)) {
+		ksock_tx_t *tx = list_entry (txlist->next, ksock_tx_t, tx_list);
+
+		if (error) {
+			if (tx->tx_lnetmsg != NULL)
+				CNETERR("Deleting packet type %d len %d %s->%s\n",
+					le32_to_cpu (tx->tx_lnetmsg->msg_hdr.type),
+					le32_to_cpu (tx->tx_lnetmsg->msg_hdr.payload_length),
+					libcfs_nid2str(le64_to_cpu(tx->tx_lnetmsg->msg_hdr.src_nid)),
+					libcfs_nid2str(le64_to_cpu(tx->tx_lnetmsg->msg_hdr.dest_nid)));
+			else
+				CNETERR("Deleting noop packet\n");
+		}
+
+		list_del (&tx->tx_list);
+
+		LASSERT (atomic_read(&tx->tx_refcount) == 1);
+		ksocknal_tx_done (ni, tx);
+	}
+}
+/*
+ * Mark @tx as checked for zero-copy.  Unless its connection speaks the V1
+ * protocol or is not ZC-capable, also take an extra tx reference, assign
+ * the next cookie of the peer and queue @tx on the peer's ksnp_zc_req_list.
+ */
+static void
+ksocknal_check_zc_req(ksock_tx_t *tx)
+{
+	ksock_conn_t   *conn = tx->tx_conn;
+	ksock_peer_t   *peer = conn->ksnc_peer;
+
+	/* Set tx_msg.ksm_zc_cookies[0] to a unique non-zero cookie and add tx
+	 * to ksnp_zc_req_list if some fragment of this message should be sent
+	 * zero-copy.  Our peer will send an ACK containing this cookie when
+	 * she has received this message to tell us we can signal completion.
+	 * tx_msg.ksm_zc_cookies[0] remains non-zero while tx is on
+	 * ksnp_zc_req_list. */
+	LASSERT (tx->tx_msg.ksm_type != KSOCK_MSG_NOOP);
+	LASSERT (tx->tx_zc_capable);
+
+	tx->tx_zc_checked = 1;
+
+	if (conn->ksnc_proto == &ksocknal_protocol_v1x ||
+	    !conn->ksnc_zc_capable)
+		return;
+
+	/* assign cookie and queue tx to pending list, it will be released when
+	 * a matching ack is received. See ksocknal_handle_zcack() */
+
+	ksocknal_tx_addref(tx);
+
+	spin_lock(&peer->ksnp_lock);
+
+	/* ZC_REQ is going to be pinned to the peer */
+	tx->tx_deadline =
+		cfs_time_shift(*ksocknal_tunables.ksnd_timeout);
+
+	LASSERT (tx->tx_msg.ksm_zc_cookies[0] == 0);
+
+	tx->tx_msg.ksm_zc_cookies[0] = peer->ksnp_zc_next_cookie++;
+
+	if (peer->ksnp_zc_next_cookie == 0) /* counter wrapped: restart just above SOCKNAL_KEEPALIVE_PING */
+		peer->ksnp_zc_next_cookie = SOCKNAL_KEEPALIVE_PING + 1;
+
+	list_add_tail(&tx->tx_zc_list, &peer->ksnp_zc_req_list);
+
+	spin_unlock(&peer->ksnp_lock);
+}
+
+/*
+ * Undo ksocknal_check_zc_req() for @tx: clear the checked flag and, if the
+ * tx still carries a cookie, zero it, unlink the tx from the peer's ZC
+ * request list and drop one tx reference.
+ */
+static void
+ksocknal_uncheck_zc_req(ksock_tx_t *tx)
+{
+	ksock_peer_t   *peer = tx->tx_conn->ksnc_peer;
+	int		was_queued = 0;
+
+	LASSERT(tx->tx_msg.ksm_type != KSOCK_MSG_NOOP);
+	LASSERT(tx->tx_zc_capable);
+
+	tx->tx_zc_checked = 0;
+
+	spin_lock(&peer->ksnp_lock);
+	if (tx->tx_msg.ksm_zc_cookies[0] != 0) {
+		/* still waiting for an ACK: forget the cookie and unlink */
+		tx->tx_msg.ksm_zc_cookies[0] = 0;
+		list_del(&tx->tx_zc_list);
+		was_queued = 1;
+	}
+	spin_unlock(&peer->ksnp_lock);
+
+	/* the reference is dropped outside the peer lock */
+	if (was_queued)
+		ksocknal_tx_decref(tx);
+}
+/*
+ * Transmit @tx on @conn and act on the outcome.
+ *
+ * Returns 0 when the whole tx has been sent, -EAGAIN as passed through
+ * from ksocknal_transmit(), -ENOMEM after queueing @conn on
+ * ksnd_enomem_conns for a later retry, or another negative error after
+ * undoing any zero-copy request and closing @conn and its siblings.
+ */
+int
+ksocknal_process_transmit (ksock_conn_t *conn, ksock_tx_t *tx)
+{
+	int	    rc;
+
+	if (tx->tx_zc_capable && !tx->tx_zc_checked)
+		ksocknal_check_zc_req(tx);
+
+	rc = ksocknal_transmit (conn, tx);
+
+	CDEBUG (D_NET, "send(%d) %d\n", tx->tx_resid, rc);
+
+	if (tx->tx_resid == 0) {
+		/* Sent everything OK */
+		LASSERT (rc == 0);
+
+		return (0);
+	}
+
+	if (rc == -EAGAIN)
+		return (rc);
+
+	if (rc == -ENOMEM) {
+		static int counter;
+
+		counter++;   /* exponential backoff warnings */
+		if ((counter & (-counter)) == counter) /* true only when counter is a power of two */
+			CWARN("%u ENOMEM tx %p (%u allocated)\n",
+			      counter, conn, atomic_read(&libcfs_kmemory));
+
+		/* Queue on ksnd_enomem_conns for retry after a timeout */
+		spin_lock_bh(&ksocknal_data.ksnd_reaper_lock);
+
+		/* enomem list takes over scheduler's ref... */
+		LASSERT (conn->ksnc_tx_scheduled);
+		list_add_tail(&conn->ksnc_tx_list,
+				  &ksocknal_data.ksnd_enomem_conns);
+		if (!cfs_time_aftereq(cfs_time_add(cfs_time_current(),
+						   SOCKNAL_ENOMEM_RETRY),
+				   ksocknal_data.ksnd_reaper_waketime))
+			wake_up (&ksocknal_data.ksnd_reaper_waitq);
+
+		spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock);
+		return (rc);
+	}
+
+	/* Actual error */
+	LASSERT (rc < 0);
+
+	if (!conn->ksnc_closing) {
+		switch (rc) {
+		case -ECONNRESET:
+			LCONSOLE_WARN("Host %u.%u.%u.%u reset our connection "
+				      "while we were sending data; it may have "
+				      "rebooted.\n",
+				      HIPQUAD(conn->ksnc_ipaddr));
+			break;
+		default:
+			LCONSOLE_WARN("There was an unexpected network error "
+				      "while writing to %u.%u.%u.%u: %d.\n",
+				      HIPQUAD(conn->ksnc_ipaddr), rc);
+			break;
+		}
+		CDEBUG(D_NET, "[%p] Error %d on write to %s"
+		       " ip %d.%d.%d.%d:%d\n", conn, rc,
+		       libcfs_id2str(conn->ksnc_peer->ksnp_id),
+		       HIPQUAD(conn->ksnc_ipaddr),
+		       conn->ksnc_port);
+	}
+
+	if (tx->tx_zc_checked)
+		ksocknal_uncheck_zc_req(tx);
+
+	/* it's not an error if conn is being closed */
+	ksocknal_close_conn_and_siblings (conn,
+					  (conn->ksnc_closing) ? 0 : rc);
+
+	return (rc);
+}
+/*
+ * Hand @route to the connection daemon: mark it scheduled, take a route
+ * reference on connd's behalf, append it to ksnd_connd_routes and wake
+ * ksnd_connd_waitq.  The route must be neither scheduled nor connecting
+ * and must still lack at least one connection type.
+ */
+void
+ksocknal_launch_connection_locked (ksock_route_t *route)
+{
+
+	/* called holding write lock on ksnd_global_lock */
+
+	LASSERT (!route->ksnr_scheduled);
+	LASSERT (!route->ksnr_connecting);
+	LASSERT ((ksocknal_route_mask() & ~route->ksnr_connected) != 0);
+
+	route->ksnr_scheduled = 1;	      /* scheduling conn for connd */
+	ksocknal_route_addref(route);	   /* extra ref for connd */
+
+	spin_lock_bh(&ksocknal_data.ksnd_connd_lock);
+
+	list_add_tail(&route->ksnr_connd_list,
+			  &ksocknal_data.ksnd_connd_routes);
+	wake_up(&ksocknal_data.ksnd_connd_waitq);
+
+	spin_unlock_bh(&ksocknal_data.ksnd_connd_lock);
+}
+
+/*
+ * Launch a connection attempt on every route of @peer that
+ * ksocknal_find_connectable_route_locked() reports as connectable.
+ */
+void
+ksocknal_launch_all_connections_locked (ksock_peer_t *peer)
+{
+	ksock_route_t *route;
+
+	/* called holding write lock on ksnd_global_lock */
+	while ((route = ksocknal_find_connectable_route_locked(peer)) != NULL)
+		ksocknal_launch_connection_locked(route);
+}
+/*
+ * Choose the connection of @peer that should carry @tx.
+ *
+ * Each connection's pro_match_tx() classifies it as no match, a typed
+ * match or a fallback.  Within a class the connection with the fewest
+ * pending bytes (ksnc_tx_nob plus the socket's queued write memory) wins;
+ * on a tie, and only when ksnd_round_robin is set, the one posted to least
+ * recently wins.  A typed match is preferred over a fallback.
+ *
+ * Returns NULL when nothing matches; otherwise the chosen connection with
+ * ksnc_tx_last_post set to the current time.
+ */
+ksock_conn_t *
+ksocknal_find_conn_locked(ksock_peer_t *peer, ksock_tx_t *tx, int nonblk)
+{
+	struct list_head       *tmp;
+	ksock_conn_t     *conn;
+	ksock_conn_t     *typed = NULL;
+	ksock_conn_t     *fallback = NULL;
+	int	       tnob     = 0;
+	int	       fnob     = 0;
+
+	list_for_each (tmp, &peer->ksnp_conns) {
+		ksock_conn_t *c  = list_entry(tmp, ksock_conn_t, ksnc_list);
+		int	   nob = atomic_read(&c->ksnc_tx_nob) +
+				    cfs_sock_wmem_queued(c->ksnc_sock);
+		int	   rc;
+
+		LASSERT (!c->ksnc_closing);
+		LASSERT (c->ksnc_proto != NULL &&
+			 c->ksnc_proto->pro_match_tx != NULL);
+
+		rc = c->ksnc_proto->pro_match_tx(c, tx, nonblk);
+
+		switch (rc) {
+		default:
+			LBUG();
+		case SOCKNAL_MATCH_NO: /* protocol rejected the tx */
+			continue;
+
+		case SOCKNAL_MATCH_YES: /* typed connection */
+			if (typed == NULL || tnob > nob ||
+			    (tnob == nob && *ksocknal_tunables.ksnd_round_robin &&
+			     cfs_time_after(typed->ksnc_tx_last_post, c->ksnc_tx_last_post))) {
+				typed = c;
+				tnob  = nob;
+			}
+			break;
+
+		case SOCKNAL_MATCH_MAY: /* fallback connection */
+			if (fallback == NULL || fnob > nob ||
+			    (fnob == nob && *ksocknal_tunables.ksnd_round_robin &&
+			     cfs_time_after(fallback->ksnc_tx_last_post, c->ksnc_tx_last_post))) {
+				fallback = c;
+				fnob     = nob;
+			}
+			break;
+		}
+	}
+
+	/* prefer the typed selection */
+	conn = (typed != NULL) ? typed : fallback;
+
+	if (conn != NULL)
+		conn->ksnc_tx_last_post = cfs_time_current();
+
+	return conn;
+}
+/*
+ * Bind @tx to @conn: let the connection's protocol pack the message, add
+ * tx_nob to the connection's ksnc_tx_nob and take a connection reference
+ * on behalf of the tx.
+ */
+void
+ksocknal_tx_prep(ksock_conn_t *conn, ksock_tx_t *tx)
+{
+	conn->ksnc_proto->pro_pack(tx);
+
+	atomic_add (tx->tx_nob, &conn->ksnc_tx_nob);
+	ksocknal_conn_addref(conn); /* +1 ref for tx */
+	tx->tx_conn = conn;
+}
+/*
+ * Queue @tx on @conn and, if the connection is ready to send and not yet
+ * scheduled, put it on its scheduler's kss_tx_conns list and wake the
+ * scheduler.
+ *
+ * The tx is first bound to the connection with ksocknal_tx_prep(); the
+ * actual queueing is delegated to the protocol's pro_queue_tx_zcack()
+ * (noop ZC-ACK packets) or pro_queue_tx_msg() (everything else).  Any tx
+ * made redundant by piggybacking is moved to kss_zombie_noop_txs.
+ */
+void
+ksocknal_queue_tx_locked (ksock_tx_t *tx, ksock_conn_t *conn)
+{
+	ksock_sched_t *sched = conn->ksnc_scheduler;
+	ksock_msg_t   *msg = &tx->tx_msg;
+	ksock_tx_t    *ztx = NULL;
+	int	    bufnob = 0;
+
+	/* called holding global lock (read or irq-write) and caller may
+	 * not have dropped this lock between finding conn and calling me,
+	 * so we don't need the {get,put}connsock dance to deref
+	 * ksnc_sock... */
+	LASSERT(!conn->ksnc_closing);
+
+	CDEBUG (D_NET, "Sending to %s ip %d.%d.%d.%d:%d\n",
+		libcfs_id2str(conn->ksnc_peer->ksnp_id),
+		HIPQUAD(conn->ksnc_ipaddr),
+		conn->ksnc_port);
+
+	ksocknal_tx_prep(conn, tx);
+
+	/* Ensure the frags we've been given EXACTLY match the number of
+	 * bytes we want to send.  Many TCP/IP stacks disregard any total
+	 * size parameters passed to them and just look at the frags.
+	 *
+	 * We always expect at least 1 mapped fragment containing the
+	 * complete ksocknal message header. */
+	LASSERT (lnet_iov_nob (tx->tx_niov, tx->tx_iov) +
+		 lnet_kiov_nob(tx->tx_nkiov, tx->tx_kiov) ==
+		 (unsigned int)tx->tx_nob);
+	LASSERT (tx->tx_niov >= 1);
+	LASSERT (tx->tx_resid == tx->tx_nob);
+
+	CDEBUG (D_NET, "Packet %p type %d, nob %d niov %d nkiov %d\n",
+		tx, (tx->tx_lnetmsg != NULL) ? tx->tx_lnetmsg->msg_hdr.type:
+					       KSOCK_MSG_NOOP,
+		tx->tx_nob, tx->tx_niov, tx->tx_nkiov);
+
+	/*
+	 * FIXME: SOCK_WMEM_QUEUED and SOCK_ERROR could block in __DARWIN8__
+	 * but they're used inside spinlocks a lot.
+	 */
+	bufnob = cfs_sock_wmem_queued(conn->ksnc_sock);
+	spin_lock_bh(&sched->kss_lock);
+
+	if (list_empty(&conn->ksnc_tx_queue) && bufnob == 0) {
+		/* First packet starts the timeout */
+		conn->ksnc_tx_deadline =
+			cfs_time_shift(*ksocknal_tunables.ksnd_timeout);
+		if (conn->ksnc_tx_bufnob > 0) /* something got ACKed */
+			conn->ksnc_peer->ksnp_last_alive = cfs_time_current();
+		conn->ksnc_tx_bufnob = 0;
+		mb(); /* order with adding to tx_queue */
+	}
+
+	if (msg->ksm_type == KSOCK_MSG_NOOP) {
+		/* The packet is noop ZC ACK, try to piggyback the ack_cookie
+		 * on a normal packet so I don't need to send it */
+		LASSERT (msg->ksm_zc_cookies[1] != 0);
+		LASSERT (conn->ksnc_proto->pro_queue_tx_zcack != NULL);
+
+		if (conn->ksnc_proto->pro_queue_tx_zcack(conn, tx, 0))
+			ztx = tx; /* ZC ACK piggybacked on ztx release tx later */
+
+	} else {
+		/* It's a normal packet - can it piggback a noop zc-ack that
+		 * has been queued already? */
+		LASSERT (msg->ksm_zc_cookies[1] == 0);
+		LASSERT (conn->ksnc_proto->pro_queue_tx_msg != NULL);
+
+		ztx = conn->ksnc_proto->pro_queue_tx_msg(conn, tx);
+		/* ztx will be released later */
+	}
+
+	if (ztx != NULL) { /* ztx will not be sent: uncount its bytes and park it */
+		atomic_sub (ztx->tx_nob, &conn->ksnc_tx_nob);
+		list_add_tail(&ztx->tx_list, &sched->kss_zombie_noop_txs);
+	}
+
+	if (conn->ksnc_tx_ready &&      /* able to send */
+	    !conn->ksnc_tx_scheduled) { /* not scheduled to send */
+		/* +1 ref for scheduler */
+		ksocknal_conn_addref(conn);
+		list_add_tail (&conn->ksnc_tx_list,
+				   &sched->kss_tx_conns);
+		conn->ksnc_tx_scheduled = 1;
+		wake_up (&sched->kss_waitq);
+	}
+
+	spin_unlock_bh(&sched->kss_lock);
+}
+
+/*
+ * Return the first route of @peer that is not already scheduled, still
+ * lacks a connection type required by ksocknal_route_mask(), and is either
+ * on its first attempt (retry interval 0) or past its retry timeout.
+ * Returns NULL when no route qualifies.
+ */
+ksock_route_t *
+ksocknal_find_connectable_route_locked (ksock_peer_t *peer)
+{
+	cfs_time_t     now = cfs_time_current();
+	struct list_head    *tmp;
+	ksock_route_t *route;
+
+	list_for_each (tmp, &peer->ksnp_routes) {
+		route = list_entry (tmp, ksock_route_t, ksnr_list);
+
+		LASSERT (!route->ksnr_connecting || route->ksnr_scheduled);
+
+		if (route->ksnr_scheduled)      /* connections being established */
+			continue;
+
+		/* all route types connected ? */
+		if ((ksocknal_route_mask() & ~route->ksnr_connected) == 0)
+			continue;
+
+		if (!(route->ksnr_retry_interval == 0 || /* first attempt */
+		      cfs_time_aftereq(now, route->ksnr_timeout))) {
+			CDEBUG(D_NET,
+			       "Too soon to retry route %u.%u.%u.%u "
+			       "(cnted %d, interval %ld, %ld secs later)\n",
+			       HIPQUAD(route->ksnr_ipaddr),
+			       route->ksnr_connected,
+			       route->ksnr_retry_interval,
+			       cfs_duration_sec(route->ksnr_timeout - now));
+			continue;
+		}
+
+		return (route);
+	}
+
+	return (NULL);
+}
+
+/*
+ * Return the first route of @peer that is currently scheduled for a
+ * connection attempt, or NULL when there is none.
+ */
+ksock_route_t *
+ksocknal_find_connecting_route_locked (ksock_peer_t *peer)
+{
+	ksock_route_t     *route;
+
+	list_for_each_entry(route, &peer->ksnp_routes, ksnr_list) {
+		LASSERT (!route->ksnr_connecting || route->ksnr_scheduled);
+
+		if (route->ksnr_scheduled)
+			return route;
+	}
+
+	return NULL;
+}
+/*
+ * Queue @tx for delivery to @id, creating the peer and launching
+ * connections as needed.
+ *
+ * Returns 0 when @tx was queued, either on an existing connection or on
+ * the peer's ksnp_tx_queue while a connection is being established;
+ * -EHOSTUNREACH when @id is a userspace process with no known peer, when
+ * the peer still cannot be found after adding it, or when no route is
+ * usable; or the error returned by ksocknal_add_peer().
+ */
+int
+ksocknal_launch_packet (lnet_ni_t *ni, ksock_tx_t *tx, lnet_process_id_t id)
+{
+	ksock_peer_t     *peer;
+	ksock_conn_t     *conn;
+	rwlock_t     *g_lock;
+	int	       retry;
+	int	       rc;
+
+	LASSERT (tx->tx_conn == NULL);
+
+	g_lock = &ksocknal_data.ksnd_global_lock;
+
+	for (retry = 0;; retry = 1) { /* second pass only after ksocknal_add_peer() succeeded */
+		read_lock(g_lock);
+		peer = ksocknal_find_peer_locked(ni, id);
+		if (peer != NULL) {
+			if (ksocknal_find_connectable_route_locked(peer) == NULL) {
+				conn = ksocknal_find_conn_locked(peer, tx, tx->tx_nonblk);
+				if (conn != NULL) {
+					/* I've got no routes that need to be
+					 * connecting and I do have an actual
+					 * connection... */
+					ksocknal_queue_tx_locked (tx, conn);
+					read_unlock(g_lock);
+					return (0);
+				}
+			}
+		}
+
+		/* I'll need a write lock... */
+		read_unlock(g_lock);
+
+		write_lock_bh(g_lock);
+
+		peer = ksocknal_find_peer_locked(ni, id);
+		if (peer != NULL) /* leaves the loop with the write lock still held */
+			break;
+
+		write_unlock_bh(g_lock);
+
+		if ((id.pid & LNET_PID_USERFLAG) != 0) {
+			CERROR("Refusing to create a connection to "
+			       "userspace process %s\n", libcfs_id2str(id));
+			return -EHOSTUNREACH;
+		}
+
+		if (retry) {
+			CERROR("Can't find peer %s\n", libcfs_id2str(id));
+			return -EHOSTUNREACH;
+		}
+
+		rc = ksocknal_add_peer(ni, id,
+				       LNET_NIDADDR(id.nid),
+				       lnet_acceptor_port());
+		if (rc != 0) {
+			CERROR("Can't add peer %s: %d\n",
+			       libcfs_id2str(id), rc);
+			return rc;
+		}
+	}
+
+	ksocknal_launch_all_connections_locked(peer);
+
+	conn = ksocknal_find_conn_locked(peer, tx, tx->tx_nonblk);
+	if (conn != NULL) {
+		/* Connection exists; queue message on it */
+		ksocknal_queue_tx_locked (tx, conn);
+		write_unlock_bh(g_lock);
+		return (0);
+	}
+
+	if (peer->ksnp_accepting > 0 ||
+	    ksocknal_find_connecting_route_locked (peer) != NULL) {
+		/* the message is going to be pinned to the peer */
+		tx->tx_deadline =
+			cfs_time_shift(*ksocknal_tunables.ksnd_timeout);
+
+		/* Queue the message until a connection is established */
+		list_add_tail (&tx->tx_list, &peer->ksnp_tx_queue);
+		write_unlock_bh(g_lock);
+		return 0;
+	}
+
+	write_unlock_bh(g_lock);
+
+	/* NB Routes may be ignored if connections to them failed recently */
+	CNETERR("No usable routes to %s\n", libcfs_id2str(id));
+	return (-EHOSTUNREACH);
+}
+/*
+ * Build a tx descriptor for @lntmsg and launch it towards the message's
+ * target.
+ *
+ * The payload is described either by iovecs or by pages, never both; only
+ * paged payloads of at least ksnd_zc_min_payload bytes are marked
+ * zero-copy capable.  Returns 0 on success, -ENOMEM when no descriptor can
+ * be allocated, and -EIO (after freeing the descriptor) when
+ * ksocknal_launch_packet() fails for any reason.
+ */
+int
+ksocknal_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg)
+{
+	int	       mpflag = 0;
+	int	       type = lntmsg->msg_type;
+	lnet_process_id_t target = lntmsg->msg_target;
+	unsigned int      payload_niov = lntmsg->msg_niov;
+	struct iovec     *payload_iov = lntmsg->msg_iov;
+	lnet_kiov_t      *payload_kiov = lntmsg->msg_kiov;
+	unsigned int      payload_offset = lntmsg->msg_offset;
+	unsigned int      payload_nob = lntmsg->msg_len;
+	ksock_tx_t       *tx;
+	int	       desc_size;
+	int	       rc;
+
+	/* NB 'private' is different depending on what we're sending.
+	 * Just ignore it... */
+
+	CDEBUG(D_NET, "sending %u bytes in %d frags to %s\n",
+	       payload_nob, payload_niov, libcfs_id2str(target));
+
+	LASSERT (payload_nob == 0 || payload_niov > 0);
+	LASSERT (payload_niov <= LNET_MAX_IOV);
+	/* payload is either all vaddrs or all pages */
+	LASSERT (!(payload_kiov != NULL && payload_iov != NULL));
+	LASSERT (!in_interrupt ());
+
+	if (payload_iov != NULL)
+		desc_size = offsetof(ksock_tx_t,
+				     tx_frags.virt.iov[1 + payload_niov]);
+	else
+		desc_size = offsetof(ksock_tx_t,
+				     tx_frags.paged.kiov[payload_niov]);
+
+	if (lntmsg->msg_vmflush)
+		mpflag = cfs_memory_pressure_get_and_set();
+	tx = ksocknal_alloc_tx(KSOCK_MSG_LNET, desc_size);
+	if (tx == NULL) {
+		CERROR("Can't allocate tx desc type %d size %d\n",
+		       type, desc_size);
+		if (lntmsg->msg_vmflush)
+			cfs_memory_pressure_restore(mpflag);
+		return (-ENOMEM);
+	}
+
+	tx->tx_conn = NULL;		     /* set when assigned a conn */
+	tx->tx_lnetmsg = lntmsg;
+
+	if (payload_iov != NULL) {
+		tx->tx_kiov = NULL;
+		tx->tx_nkiov = 0;
+		tx->tx_iov = tx->tx_frags.virt.iov;
+		tx->tx_niov = 1 +
+			      lnet_extract_iov(payload_niov, &tx->tx_iov[1],
+					       payload_niov, payload_iov,
+					       payload_offset, payload_nob);
+	} else {
+		tx->tx_niov = 1;
+		tx->tx_iov = &tx->tx_frags.paged.iov;
+		tx->tx_kiov = tx->tx_frags.paged.kiov;
+		tx->tx_nkiov = lnet_extract_kiov(payload_niov, tx->tx_kiov,
+						 payload_niov, payload_kiov,
+						 payload_offset, payload_nob);
+
+		if (payload_nob >= *ksocknal_tunables.ksnd_zc_min_payload)
+			tx->tx_zc_capable = 1;
+	}
+
+	socklnd_init_msg(&tx->tx_msg, KSOCK_MSG_LNET);
+
+	/* The first fragment will be set later in pro_pack */
+	rc = ksocknal_launch_packet(ni, tx, target);
+	if (lntmsg->msg_vmflush)
+		cfs_memory_pressure_restore(mpflag);
+	if (rc == 0)
+		return (0);
+
+	ksocknal_free_tx(tx);
+	return (-EIO);
+}
+
+/*
+ * Start a kernel thread running @fn(@arg), named @name, and count it in
+ * ksnd_nthreads.
+ *
+ * Returns 0 on success or the error encoded in the pointer returned by
+ * kthread_run() on failure (the thread count is then left unchanged).
+ */
+int
+ksocknal_thread_start(int (*fn)(void *arg), void *arg, char *name)
+{
+	/* kthread_run() takes a printf-style name format: pass @name as
+	 * data so that a '%' in it can never be interpreted as a
+	 * conversion specifier */
+	task_t *task = kthread_run(fn, arg, "%s", name);
+
+	if (IS_ERR(task))
+		return PTR_ERR(task);
+
+	write_lock_bh(&ksocknal_data.ksnd_global_lock);
+	ksocknal_data.ksnd_nthreads++;
+	write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+	return 0;
+}
+
+/*
+ * Remove one thread from the ksnd_nthreads count; the counterpart of the
+ * increment done in ksocknal_thread_start().
+ */
+void
+ksocknal_thread_fini (void)
+{
+	write_lock_bh(&ksocknal_data.ksnd_global_lock);
+	ksocknal_data.ksnd_nthreads -= 1;
+	write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+}
+
+/*
+ * Set 'conn' up to receive whatever arrives next.
+ *
+ * nob_to_skip == 0: the conn is on a packet boundary.  The rx state and a
+ * single header iov are reset according to the protocol version and 1 is
+ * returned.
+ *
+ * nob_to_skip != 0: as many of those bytes as fit in the conn's rx iov
+ * space are queued for reading into a shared scratch buffer, and 0 is
+ * returned.  If bytes remain the caller gets here again later.
+ */
+int
+ksocknal_new_packet (ksock_conn_t *conn, int nob_to_skip)
+{
+	/* unwanted bytes are read into here and simply forgotten */
+	static char ksocknal_slop_buffer[4096];
+
+	struct iovec  *rxiov;
+	unsigned int   idx;
+	int	       chunk;
+	int	       queued;
+
+	LASSERT(conn->ksnc_proto != NULL);
+
+	/* Remind the socket to ack eagerly... */
+	if ((*ksocknal_tunables.ksnd_eager_ack & conn->ksnc_type) != 0)
+		ksocknal_lib_eager_ack(conn);
+
+	rxiov = (struct iovec *)&conn->ksnc_rx_iov_space;
+
+	if (nob_to_skip != 0) {
+		/* Set up to skip as much as possible now.  If there's more
+		 * left (ran out of iov entries) we'll get called again */
+		conn->ksnc_rx_state = SOCKNAL_RX_SLOP;
+		conn->ksnc_rx_nob_left = nob_to_skip;
+		conn->ksnc_rx_iov = rxiov;
+		queued = 0;
+		idx = 0;
+
+		do {
+			chunk = MIN (nob_to_skip, sizeof (ksocknal_slop_buffer));
+
+			rxiov[idx].iov_base = ksocknal_slop_buffer;
+			rxiov[idx].iov_len  = chunk;
+			idx++;
+			queued += chunk;
+			nob_to_skip -= chunk;
+
+		} while (nob_to_skip != 0 &&    /* mustn't overflow conn's rx iov */
+			 idx < sizeof(conn->ksnc_rx_iov_space) / sizeof (struct iovec));
+
+		conn->ksnc_rx_niov = idx;
+		conn->ksnc_rx_kiov = NULL;
+		conn->ksnc_rx_nkiov = 0;
+		conn->ksnc_rx_nob_wanted = queued;
+		return (0);
+	}
+
+	/* right at next packet boundary now */
+	conn->ksnc_rx_started = 0;
+	mb();			       /* racing with timeout thread */
+
+	switch (conn->ksnc_proto->pro_version) {
+	case KSOCK_PROTO_V1:
+		/* Receiving bare lnet_hdr_t */
+		conn->ksnc_rx_state = SOCKNAL_RX_LNET_HEADER;
+		conn->ksnc_rx_nob_wanted = sizeof(lnet_hdr_t);
+		conn->ksnc_rx_nob_left = sizeof(lnet_hdr_t);
+
+		conn->ksnc_rx_iov = rxiov;
+		rxiov[0].iov_base = (char *)&conn->ksnc_msg.ksm_u.lnetmsg;
+		rxiov[0].iov_len  = sizeof (lnet_hdr_t);
+		break;
+
+	case KSOCK_PROTO_V2:
+	case KSOCK_PROTO_V3:
+		/* Receiving the fixed part of a ksock_msg_t first */
+		conn->ksnc_rx_state = SOCKNAL_RX_KSM_HEADER;
+		conn->ksnc_rx_iov = rxiov;
+		rxiov[0].iov_base = (char *)&conn->ksnc_msg;
+
+		conn->ksnc_rx_nob_wanted = offsetof(ksock_msg_t, ksm_u);
+		conn->ksnc_rx_nob_left = offsetof(ksock_msg_t, ksm_u);
+		rxiov[0].iov_len  = offsetof(ksock_msg_t, ksm_u);
+		break;
+
+	default:
+		LBUG ();
+	}
+
+	conn->ksnc_rx_niov = 1;
+	conn->ksnc_rx_kiov = NULL;
+	conn->ksnc_rx_nkiov = 0;
+	conn->ksnc_rx_csum = ~0;
+	return (1);
+}
+
+/*
+ * Run the receive state machine of 'conn'.
+ *
+ * Bytes still wanted for the current state are read first.  Once nothing
+ * more is wanted, the handler for ksnc_rx_state runs and, in most cases,
+ * jumps back to 'again' to read the next unit.
+ *
+ * Returns:
+ *   0           a NOOP or a complete message was consumed and the conn is
+ *               back on a packet boundary, or lnet_parse() returned and
+ *               ksocknal_recv() has not yet moved the conn on to the
+ *               payload state
+ *   -EAGAIN     short read: more data is needed for the current unit
+ *   -ESHUTDOWN  the read returned 0 (EOF)
+ *   other < 0   read error, or the message failed validation
+ *
+ * On every negative return other than -EAGAIN the conn and its siblings
+ * have been closed.
+ */
+int
+ksocknal_process_receive (ksock_conn_t *conn)
+{
+	lnet_hdr_t	*lhdr;
+	lnet_process_id_t *id;
+	int		rc;
+
+	LASSERT (atomic_read(&conn->ksnc_conn_refcount) > 0);
+
+	/* NB: sched lock NOT held */
+	/* SOCKNAL_RX_LNET_HEADER is here for backward compatibility */
+	LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_KSM_HEADER ||
+		 conn->ksnc_rx_state == SOCKNAL_RX_LNET_PAYLOAD ||
+		 conn->ksnc_rx_state == SOCKNAL_RX_LNET_HEADER ||
+		 conn->ksnc_rx_state == SOCKNAL_RX_SLOP);
+ again:
+	/* first finish reading whatever the current state still wants */
+	if (conn->ksnc_rx_nob_wanted != 0) {
+		rc = ksocknal_receive(conn);
+
+		if (rc <= 0) {
+			LASSERT (rc != -EAGAIN);
+
+			if (rc == 0)
+				CDEBUG (D_NET, "[%p] EOF from %s"
+					" ip %d.%d.%d.%d:%d\n", conn,
+					libcfs_id2str(conn->ksnc_peer->ksnp_id),
+					HIPQUAD(conn->ksnc_ipaddr),
+					conn->ksnc_port);
+			else if (!conn->ksnc_closing)
+				CERROR ("[%p] Error %d on read from %s"
+					" ip %d.%d.%d.%d:%d\n",
+					conn, rc,
+					libcfs_id2str(conn->ksnc_peer->ksnp_id),
+					HIPQUAD(conn->ksnc_ipaddr),
+					conn->ksnc_port);
+
+			/* it's not an error if conn is being closed */
+			ksocknal_close_conn_and_siblings (conn,
+							  (conn->ksnc_closing) ? 0 : rc);
+			return (rc == 0 ? -ESHUTDOWN : rc);
+		}
+
+		if (conn->ksnc_rx_nob_wanted != 0) {
+			/* short read */
+			return (-EAGAIN);
+		}
+	}
+	/* everything wanted for this state has arrived: act on it */
+	switch (conn->ksnc_rx_state) {
+	case SOCKNAL_RX_KSM_HEADER:
+		/* ksnc_flip set: byte-swap the KSM header fields in place */
+		if (conn->ksnc_flip) {
+			__swab32s(&conn->ksnc_msg.ksm_type);
+			__swab32s(&conn->ksnc_msg.ksm_csum);
+			__swab64s(&conn->ksnc_msg.ksm_zc_cookies[0]);
+			__swab64s(&conn->ksnc_msg.ksm_zc_cookies[1]);
+		}
+
+		if (conn->ksnc_msg.ksm_type != KSOCK_MSG_NOOP &&
+		    conn->ksnc_msg.ksm_type != KSOCK_MSG_LNET) {
+			CERROR("%s: Unknown message type: %x\n",
+			       libcfs_id2str(conn->ksnc_peer->ksnp_id),
+			       conn->ksnc_msg.ksm_type);
+			ksocknal_new_packet(conn, 0);
+			ksocknal_close_conn_and_siblings(conn, -EPROTO);
+			return (-EPROTO);
+		}
+
+		if (conn->ksnc_msg.ksm_type == KSOCK_MSG_NOOP &&
+		    conn->ksnc_msg.ksm_csum != 0 &&     /* has checksum */
+		    conn->ksnc_msg.ksm_csum != conn->ksnc_rx_csum) {
+			/* NOOP Checksum error */
+			CERROR("%s: Checksum error, wire:0x%08X data:0x%08X\n",
+			       libcfs_id2str(conn->ksnc_peer->ksnp_id),
+			       conn->ksnc_msg.ksm_csum, conn->ksnc_rx_csum);
+			ksocknal_new_packet(conn, 0);
+			ksocknal_close_conn_and_siblings(conn, -EPROTO);
+			return (-EIO);
+		}
+
+		/* a non-zero second cookie is handed to the ZC-ACK handler */
+		if (conn->ksnc_msg.ksm_zc_cookies[1] != 0) {
+			__u64 cookie = 0;
+
+			LASSERT (conn->ksnc_proto != &ksocknal_protocol_v1x);
+
+			/* only a NOOP passes its first cookie along as well */
+			if (conn->ksnc_msg.ksm_type == KSOCK_MSG_NOOP)
+				cookie = conn->ksnc_msg.ksm_zc_cookies[0];
+
+			rc = conn->ksnc_proto->pro_handle_zcack(conn, cookie,
+					       conn->ksnc_msg.ksm_zc_cookies[1]);
+
+			if (rc != 0) {
+				CERROR("%s: Unknown ZC-ACK cookie: "LPU64", "LPU64"\n",
+				       libcfs_id2str(conn->ksnc_peer->ksnp_id),
+				       cookie, conn->ksnc_msg.ksm_zc_cookies[1]);
+				ksocknal_new_packet(conn, 0);
+				ksocknal_close_conn_and_siblings(conn, -EPROTO);
+				return (rc);
+			}
+		}
+
+		if (conn->ksnc_msg.ksm_type == KSOCK_MSG_NOOP) {
+			ksocknal_new_packet (conn, 0);
+			return 0;       /* NOOP is done and just return */
+		}
+
+		/* KSOCK_MSG_LNET: arrange to read the LNet message header
+		 * into the union part of ksnc_msg */
+		conn->ksnc_rx_state = SOCKNAL_RX_LNET_HEADER;
+		conn->ksnc_rx_nob_wanted = sizeof(ksock_lnet_msg_t);
+		conn->ksnc_rx_nob_left = sizeof(ksock_lnet_msg_t);
+
+		conn->ksnc_rx_iov = (struct iovec *)&conn->ksnc_rx_iov_space;
+		conn->ksnc_rx_iov[0].iov_base = (char *)&conn->ksnc_msg.ksm_u.lnetmsg;
+		conn->ksnc_rx_iov[0].iov_len  = sizeof(ksock_lnet_msg_t);
+
+		conn->ksnc_rx_niov = 1;
+		conn->ksnc_rx_kiov = NULL;
+		conn->ksnc_rx_nkiov = 0;
+
+		goto again;     /* read lnet header now */
+
+	case SOCKNAL_RX_LNET_HEADER:
+		/* unpack message header */
+		conn->ksnc_proto->pro_unpack(&conn->ksnc_msg);
+
+		if ((conn->ksnc_peer->ksnp_id.pid & LNET_PID_USERFLAG) != 0) {
+			/* Userspace peer */
+			lhdr = &conn->ksnc_msg.ksm_u.lnetmsg.ksnm_hdr;
+			id   = &conn->ksnc_peer->ksnp_id;
+
+			/* Substitute process ID assigned at connection time */
+			lhdr->src_pid = cpu_to_le32(id->pid);
+			lhdr->src_nid = cpu_to_le64(id->nid);
+		}
+
+		conn->ksnc_rx_state = SOCKNAL_RX_PARSE;
+		ksocknal_conn_addref(conn);     /* ++ref while parsing */
+
+		rc = lnet_parse(conn->ksnc_peer->ksnp_ni,
+				&conn->ksnc_msg.ksm_u.lnetmsg.ksnm_hdr,
+				conn->ksnc_peer->ksnp_id.nid, conn, 0);
+		if (rc < 0) {
+			/* I just received garbage: give up on this conn */
+			ksocknal_new_packet(conn, 0);
+			ksocknal_close_conn_and_siblings (conn, rc);
+			ksocknal_conn_decref(conn);
+			return (-EPROTO);
+		}
+
+		/* I'm racing with ksocknal_recv() */
+		LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_PARSE ||
+			 conn->ksnc_rx_state == SOCKNAL_RX_LNET_PAYLOAD);
+
+		/* still SOCKNAL_RX_PARSE: ksocknal_recv() has not run yet,
+		 * so there is nothing to read for now */
+		if (conn->ksnc_rx_state != SOCKNAL_RX_LNET_PAYLOAD)
+			return 0;
+
+		/* ksocknal_recv() got called */
+		goto again;
+
+	case SOCKNAL_RX_LNET_PAYLOAD:
+		/* payload all received */
+		rc = 0;
+
+		if (conn->ksnc_rx_nob_left == 0 &&   /* not truncating */
+		    conn->ksnc_msg.ksm_csum != 0 &&  /* has checksum */
+		    conn->ksnc_msg.ksm_csum != conn->ksnc_rx_csum) {
+			CERROR("%s: Checksum error, wire:0x%08X data:0x%08X\n",
+			       libcfs_id2str(conn->ksnc_peer->ksnp_id),
+			       conn->ksnc_msg.ksm_csum, conn->ksnc_rx_csum);
+			rc = -EIO;
+		}
+
+		/* a non-zero first cookie is handed to the ZC request
+		 * handler, but only if the checksum test did not fail */
+		if (rc == 0 && conn->ksnc_msg.ksm_zc_cookies[0] != 0) {
+			LASSERT(conn->ksnc_proto != &ksocknal_protocol_v1x);
+
+			lhdr = &conn->ksnc_msg.ksm_u.lnetmsg.ksnm_hdr;
+			id   = &conn->ksnc_peer->ksnp_id;
+
+			rc = conn->ksnc_proto->pro_handle_zcreq(conn,
+					conn->ksnc_msg.ksm_zc_cookies[0],
+					*ksocknal_tunables.ksnd_nonblk_zcack ||
+					le64_to_cpu(lhdr->src_nid) != id->nid);
+		}
+
+		/* complete the message saved in ksnc_cookie by ksocknal_recv() */
+		lnet_finalize(conn->ksnc_peer->ksnp_ni, conn->ksnc_cookie, rc);
+
+		if (rc != 0) {
+			ksocknal_new_packet(conn, 0);
+			ksocknal_close_conn_and_siblings (conn, rc);
+			return (-EPROTO);
+		}
+		/* Fall through */
+
+	case SOCKNAL_RX_SLOP:
+		/* starting new packet? */
+		if (ksocknal_new_packet (conn, conn->ksnc_rx_nob_left))
+			return 0;       /* come back later */
+		goto again;	     /* try to finish reading slop now */
+
+	default:
+		break;
+	}
+
+	/* Not Reached */
+	LBUG ();
+	return (-EINVAL);		       /* keep gcc happy */
+}
+
+/*
+ * Tell 'private' (a ksock_conn_t) where the payload of 'msg' is to land.
+ *
+ * mlen bytes are wanted out of the rlen bytes on the wire.  The
+ * destination is taken from 'iov' when it is given or when mlen is 0, and
+ * from 'kiov' otherwise.  The conn is then moved to
+ * SOCKNAL_RX_LNET_PAYLOAD under the scheduler lock; if the scheduler had
+ * already parked it in SOCKNAL_RX_PARSE_WAIT it is put back on the
+ * scheduler's rx list and the scheduler is woken.  One conn reference is
+ * dropped before returning.
+ *
+ * 'ni' and 'delayed' are not used here.  Always returns 0.
+ */
+int
+ksocknal_recv (lnet_ni_t *ni, void *private, lnet_msg_t *msg, int delayed,
+	       unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov,
+	       unsigned int offset, unsigned int mlen, unsigned int rlen)
+{
+	ksock_conn_t  *conn = (ksock_conn_t *)private;
+	ksock_sched_t *sched = conn->ksnc_scheduler;
+
+	LASSERT (mlen <= rlen);
+	LASSERT (niov <= LNET_MAX_IOV);
+
+	conn->ksnc_cookie = msg;
+	conn->ksnc_rx_nob_wanted = mlen;
+	conn->ksnc_rx_nob_left   = rlen;
+
+	if (mlen != 0 && iov == NULL) {
+		/* receive into pages */
+		conn->ksnc_rx_niov = 0;
+		conn->ksnc_rx_iov  = NULL;
+		conn->ksnc_rx_kiov = conn->ksnc_rx_iov_space.kiov;
+		conn->ksnc_rx_nkiov =
+			lnet_extract_kiov(LNET_MAX_IOV, conn->ksnc_rx_kiov,
+					  niov, kiov, offset, mlen);
+	} else {
+		/* receive into virtual addresses (or nothing at all) */
+		conn->ksnc_rx_nkiov = 0;
+		conn->ksnc_rx_kiov = NULL;
+		conn->ksnc_rx_iov = conn->ksnc_rx_iov_space.iov;
+		conn->ksnc_rx_niov =
+			lnet_extract_iov(LNET_MAX_IOV, conn->ksnc_rx_iov,
+					 niov, iov, offset, mlen);
+	}
+
+	/* the fragments just set up must add up to exactly mlen */
+	LASSERT (mlen ==
+		 lnet_iov_nob (conn->ksnc_rx_niov, conn->ksnc_rx_iov) +
+		 lnet_kiov_nob (conn->ksnc_rx_nkiov, conn->ksnc_rx_kiov));
+
+	LASSERT (conn->ksnc_rx_scheduled);
+
+	spin_lock_bh(&sched->kss_lock);
+
+	/* In SOCKNAL_RX_PARSE the scheduler hasn't noticed I'm parsing yet,
+	 * so there is nothing to requeue; only a parked conn needs it. */
+	if (conn->ksnc_rx_state == SOCKNAL_RX_PARSE_WAIT) {
+		list_add_tail(&conn->ksnc_rx_list, &sched->kss_rx_conns);
+		wake_up (&sched->kss_waitq);
+		LASSERT (conn->ksnc_rx_ready);
+	}
+
+	conn->ksnc_rx_state = SOCKNAL_RX_LNET_PAYLOAD;
+
+	spin_unlock_bh(&sched->kss_lock);
+	ksocknal_conn_decref(conn);
+	return 0;
+}
+
+/*
+ * Wait-condition helper for the scheduler: returns non-zero when 'sched'
+ * has nothing to do, i.e. shutdown has not been flagged and both its rx
+ * and tx conn lists are empty.  The test is made under kss_lock.
+ */
+static inline int
+ksocknal_sched_cansleep(ksock_sched_t *sched)
+{
+	int	   idle;
+
+	spin_lock_bh(&sched->kss_lock);
+
+	idle = !(ksocknal_data.ksnd_shuttingdown ||
+		 !list_empty(&sched->kss_rx_conns) ||
+		 !list_empty(&sched->kss_tx_conns));
+
+	spin_unlock_bh(&sched->kss_lock);
+	return idle;
+}
+
+/*
+ * Main loop of a socklnd scheduler thread.
+ *
+ * 'arg' carries the thread id, from which the CPT and the index of the
+ * scheduler within that CPT are extracted.  Until shutdown is flagged,
+ * each pass progresses at most one conn from kss_rx_conns and at most one
+ * from kss_tx_conns.  The thread then sleeps if nothing was done, or
+ * yields the CPU after SOCKNAL_RESCHED busy passes.
+ *
+ * kss_lock is held throughout, except around the calls that move data,
+ * the freeing of zombie noop txs, and the sleep/yield.  Always returns 0.
+ */
+int ksocknal_scheduler(void *arg)
+{
+	struct ksock_sched_info	*info;
+	ksock_sched_t		*sched;
+	ksock_conn_t		*conn;
+	ksock_tx_t		*tx;
+	int			rc;
+	int			nloops = 0;
+	long			id = (long)arg;
+
+	info = ksocknal_data.ksnd_sched_info[KSOCK_THREAD_CPT(id)];
+	sched = &info->ksi_scheds[KSOCK_THREAD_SID(id)];
+
+	cfs_block_allsigs();
+
+	/* a failure to bind is only reported; the thread runs regardless */
+	rc = cfs_cpt_bind(lnet_cpt_table(), info->ksi_cpt);
+	if (rc != 0) {
+		CERROR("Can't set CPT affinity to %d: %d\n",
+		       info->ksi_cpt, rc);
+	}
+
+	spin_lock_bh(&sched->kss_lock);
+
+	while (!ksocknal_data.ksnd_shuttingdown) {
+		int did_something = 0;
+
+		/* Ensure I progress everything semi-fairly */
+
+		if (!list_empty (&sched->kss_rx_conns)) {
+			conn = list_entry(sched->kss_rx_conns.next,
+					      ksock_conn_t, ksnc_rx_list);
+			list_del(&conn->ksnc_rx_list);
+
+			LASSERT(conn->ksnc_rx_scheduled);
+			LASSERT(conn->ksnc_rx_ready);
+
+			/* clear rx_ready in case receive isn't complete.
+			 * Do it BEFORE we call process_recv, since
+			 * data_ready can set it any time after we release
+			 * kss_lock. */
+			conn->ksnc_rx_ready = 0;
+			spin_unlock_bh(&sched->kss_lock);
+
+			rc = ksocknal_process_receive(conn);
+
+			spin_lock_bh(&sched->kss_lock);
+
+			/* I'm the only one that can clear this flag */
+			LASSERT(conn->ksnc_rx_scheduled);
+
+			/* Did process_receive get everything it wanted? */
+			if (rc == 0)
+				conn->ksnc_rx_ready = 1;
+
+			if (conn->ksnc_rx_state == SOCKNAL_RX_PARSE) {
+				/* Conn blocked waiting for ksocknal_recv()
+				 * I change its state (under lock) to signal
+				 * it can be rescheduled */
+				conn->ksnc_rx_state = SOCKNAL_RX_PARSE_WAIT;
+			} else if (conn->ksnc_rx_ready) {
+				/* reschedule for rx */
+				list_add_tail (&conn->ksnc_rx_list,
+						   &sched->kss_rx_conns);
+			} else {
+				conn->ksnc_rx_scheduled = 0;
+				/* drop my ref */
+				ksocknal_conn_decref(conn);
+			}
+
+			did_something = 1;
+		}
+
+		if (!list_empty (&sched->kss_tx_conns)) {
+			LIST_HEAD    (zlist);
+
+			/* move every zombie noop tx onto the private zlist:
+			 * link zlist's head into the chain, then unlink and
+			 * re-initialise the scheduler's own list head */
+			if (!list_empty(&sched->kss_zombie_noop_txs)) {
+				list_add(&zlist,
+					     &sched->kss_zombie_noop_txs);
+				list_del_init(&sched->kss_zombie_noop_txs);
+			}
+
+			conn = list_entry(sched->kss_tx_conns.next,
+					      ksock_conn_t, ksnc_tx_list);
+			list_del (&conn->ksnc_tx_list);
+
+			LASSERT(conn->ksnc_tx_scheduled);
+			LASSERT(conn->ksnc_tx_ready);
+			LASSERT(!list_empty(&conn->ksnc_tx_queue));
+
+			tx = list_entry(conn->ksnc_tx_queue.next,
+					    ksock_tx_t, tx_list);
+
+			/* tx is about to leave the queue: advance the conn's
+			 * carrier first if it currently points at tx */
+			if (conn->ksnc_tx_carrier == tx)
+				ksocknal_next_tx_carrier(conn);
+
+			/* dequeue now so empty list => more to send */
+			list_del(&tx->tx_list);
+
+			/* Clear tx_ready in case send isn't complete.  Do
+			 * it BEFORE we call process_transmit, since
+			 * write_space can set it any time after we release
+			 * kss_lock. */
+			conn->ksnc_tx_ready = 0;
+			spin_unlock_bh(&sched->kss_lock);
+
+			if (!list_empty(&zlist)) {
+				/* free zombie noop txs, it's fast because
+				 * noop txs are just put in freelist */
+				ksocknal_txlist_done(NULL, &zlist, 0);
+			}
+
+			rc = ksocknal_process_transmit(conn, tx);
+
+			if (rc == -ENOMEM || rc == -EAGAIN) {
+				/* Incomplete send: replace tx on HEAD of tx_queue */
+				spin_lock_bh(&sched->kss_lock);
+				list_add(&tx->tx_list,
+					     &conn->ksnc_tx_queue);
+			} else {
+				/* Complete send; tx -ref */
+				ksocknal_tx_decref(tx);
+
+				spin_lock_bh(&sched->kss_lock);
+				/* assume space for more */
+				conn->ksnc_tx_ready = 1;
+			}
+
+			if (rc == -ENOMEM) {
+				/* Do nothing; after a short timeout, this
+				 * conn will be reposted on kss_tx_conns. */
+			} else if (conn->ksnc_tx_ready &&
+				   !list_empty (&conn->ksnc_tx_queue)) {
+				/* reschedule for tx */
+				list_add_tail (&conn->ksnc_tx_list,
+						   &sched->kss_tx_conns);
+			} else {
+				conn->ksnc_tx_scheduled = 0;
+				/* drop my ref */
+				ksocknal_conn_decref(conn);
+			}
+
+			did_something = 1;
+		}
+		/* nloops only advances on passes that did some work, because
+		 * the left operand short-circuits the increment otherwise */
+		if (!did_something ||	   /* nothing to do */
+		    ++nloops == SOCKNAL_RESCHED) { /* hogging CPU? */
+			spin_unlock_bh(&sched->kss_lock);
+
+			nloops = 0;
+
+			if (!did_something) {   /* wait for something to do */
+				cfs_wait_event_interruptible_exclusive(
+					sched->kss_waitq,
+					!ksocknal_sched_cansleep(sched), rc);
+				LASSERT (rc == 0);
+			} else {
+				cond_resched();
+			}
+
+			spin_lock_bh(&sched->kss_lock);
+		}
+	}
+
+	spin_unlock_bh(&sched->kss_lock);
+	ksocknal_thread_fini();
+	return 0;
+}
+
+/*
+ * Mark 'conn' as ready for reading.  If its scheduler is not already
+ * progressing it, queue the conn on the scheduler's kss_rx_conns list,
+ * take a conn reference on the scheduler's behalf and wake the scheduler.
+ */
+void ksocknal_read_callback (ksock_conn_t *conn)
+{
+	ksock_sched_t *sched = conn->ksnc_scheduler;
+	ENTRY;
+
+	spin_lock_bh(&sched->kss_lock);
+
+	conn->ksnc_rx_ready = 1;
+
+	/* already being progressed: the scheduler will see rx_ready */
+	if (conn->ksnc_rx_scheduled)
+		goto out;
+
+	list_add_tail(&conn->ksnc_rx_list, &sched->kss_rx_conns);
+	conn->ksnc_rx_scheduled = 1;
+	/* extra ref for scheduler */
+	ksocknal_conn_addref(conn);
+
+	wake_up (&sched->kss_waitq);
+out:
+	spin_unlock_bh(&sched->kss_lock);
+
+	EXIT;
+}
+
+/*
+ * Mark 'conn' as ready for writing.  If its scheduler is not already
+ * progressing it and there are packets queued on the conn, queue the conn
+ * on the scheduler's kss_tx_conns list, take a conn reference on the
+ * scheduler's behalf and wake the scheduler.
+ */
+void ksocknal_write_callback (ksock_conn_t *conn)
+{
+	ksock_sched_t *sched = conn->ksnc_scheduler;
+	ENTRY;
+
+	spin_lock_bh(&sched->kss_lock);
+
+	conn->ksnc_tx_ready = 1;
+
+	/* nothing to do if it is being progressed or has nothing to send */
+	if (conn->ksnc_tx_scheduled || list_empty(&conn->ksnc_tx_queue))
+		goto out;
+
+	list_add_tail (&conn->ksnc_tx_list, &sched->kss_tx_conns);
+	conn->ksnc_tx_scheduled = 1;
+	/* extra ref for scheduler */
+	ksocknal_conn_addref(conn);
+
+	wake_up (&sched->kss_waitq);
+out:
+	spin_unlock_bh(&sched->kss_lock);
+
+	EXIT;
+}
+
+/*
+ * Work out which protocol the peer speaks from the magic and version
+ * words at the start of its HELLO.
+ *
+ * With LNET_PROTO_MAGIC (in either byte order) the version word selects
+ * the V2 or V3 protocol.  With LNET_PROTO_TCP_MAGIC the message is read as
+ * a lnet_magicversion_t and must carry the V1 major/minor numbers.
+ *
+ * Returns the matching protocol table, or NULL if none matches.
+ */
+ksock_proto_t *
+ksocknal_parse_proto_version (ksock_hello_msg_t *hello)
+{
+	lnet_magicversion_t *hmv;
+	__u32		     version;
+
+	if (hello->kshm_magic == LNET_PROTO_MAGIC)
+		version = hello->kshm_version;
+	else if (hello->kshm_magic == __swab32(LNET_PROTO_MAGIC))
+		version = __swab32(hello->kshm_version);
+	else
+		version = 0;
+
+	if (version == 0) {
+		/* no usable version word: only a V1 hello is acceptable */
+		if (hello->kshm_magic != le32_to_cpu(LNET_PROTO_TCP_MAGIC))
+			return NULL;
+
+		CLASSERT (sizeof (lnet_magicversion_t) ==
+			  offsetof (ksock_hello_msg_t, kshm_src_nid));
+
+		hmv = (lnet_magicversion_t *)hello;
+		if (hmv->version_major != cpu_to_le16 (KSOCK_PROTO_V1_MAJOR) ||
+		    hmv->version_minor != cpu_to_le16 (KSOCK_PROTO_V1_MINOR))
+			return NULL;
+
+		return &ksocknal_protocol_v1x;
+	}
+
+#if SOCKNAL_VERSION_DEBUG
+	/* debug builds can be told to refuse the newer protocols */
+	if (*ksocknal_tunables.ksnd_protocol == 1)
+		return NULL;
+
+	if (*ksocknal_tunables.ksnd_protocol == 2 &&
+	    version == KSOCK_PROTO_V3)
+		return NULL;
+#endif
+	switch (version) {
+	case KSOCK_PROTO_V2:
+		return &ksocknal_protocol_v2x;
+	case KSOCK_PROTO_V3:
+		return &ksocknal_protocol_v3x;
+	default:
+		return NULL;
+	}
+}
+
+/*
+ * Fill in the source/destination identity, incarnation and connection
+ * type fields of 'hello' and send it on 'conn' with the conn's protocol.
+ *
+ * CAVEAT EMPTOR: this byte flips 'ipaddrs'.
+ *
+ * Returns whatever the protocol's pro_send_hello() method returns.
+ */
+int
+ksocknal_send_hello (lnet_ni_t *ni, ksock_conn_t *conn,
+		     lnet_nid_t peer_nid, ksock_hello_msg_t *hello)
+{
+	ksock_net_t *net;
+
+	LASSERT (hello->kshm_nips <= LNET_MAX_INTERFACES);
+
+	/* rely on caller to hold a ref on socket so it wouldn't disappear */
+	LASSERT (conn->ksnc_proto != NULL);
+
+	net = (ksock_net_t *)ni->ni_data;
+
+	/* who I am and who I think I'm talking to */
+	hello->kshm_src_nid = ni->ni_nid;
+	hello->kshm_src_pid = the_lnet.ln_pid;
+	hello->kshm_dst_nid = peer_nid;
+
+	hello->kshm_ctype = conn->ksnc_type;
+	hello->kshm_src_incarnation = net->ksnn_incarnation;
+
+	return conn->ksnc_proto->pro_send_hello(conn, hello);
+}
+
+/*
+ * Map a connection type to the type seen from the other end of the
+ * connection: BULK_IN and BULK_OUT swap, ANY and CONTROL stay as they are,
+ * and anything else yields SOCKLND_CONN_NONE.
+ */
+int
+ksocknal_invert_type(int type)
+{
+	if (type == SOCKLND_CONN_ANY || type == SOCKLND_CONN_CONTROL)
+		return type;
+
+	if (type == SOCKLND_CONN_BULK_IN)
+		return SOCKLND_CONN_BULK_OUT;
+
+	if (type == SOCKLND_CONN_BULK_OUT)
+		return SOCKLND_CONN_BULK_IN;
+
+	return SOCKLND_CONN_NONE;
+}
+
+/*
+ * Read and validate the peer's HELLO on 'conn'.
+ *
+ * The conn is "active" (I initiated it) when ksnc_proto is already set on
+ * entry, and passive otherwise.  The magic and version words are read
+ * first to select the protocol, then the protocol's pro_recv_hello()
+ * method reads the rest.
+ *
+ * On a passive conn, *peerid and conn->ksnc_type are filled in from what
+ * the peer sent.  On an active conn, the peer's identity is checked
+ * against *peerid and its connection type against conn->ksnc_type.
+ * *incarnation is set from the HELLO once it has been read in full.
+ */
+int
+ksocknal_recv_hello (lnet_ni_t *ni, ksock_conn_t *conn,
+		     ksock_hello_msg_t *hello, lnet_process_id_t *peerid,
+		     __u64 *incarnation)
+{
+	/* Return < 0	fatal error
+	 *	0	  success
+	 *	EALREADY   lost connection race
+	 *	EPROTO     protocol version mismatch
+	 */
+	socket_t	*sock = conn->ksnc_sock;
+	int		  active = (conn->ksnc_proto != NULL);
+	int		  timeout;
+	int		  proto_match;
+	int		  rc;
+	ksock_proto_t       *proto;
+	lnet_process_id_t    recv_id;
+
+	/* socket type set on active connections - not set on passive */
+	LASSERT (!active == !(conn->ksnc_type != SOCKLND_CONN_NONE));
+
+	timeout = active ? *ksocknal_tunables.ksnd_timeout :
+			    lnet_acceptor_timeout();
+
+	rc = libcfs_sock_read(sock, &hello->kshm_magic, sizeof (hello->kshm_magic), timeout);
+	if (rc != 0) {
+		CERROR ("Error %d reading HELLO from %u.%u.%u.%u\n",
+			rc, HIPQUAD(conn->ksnc_ipaddr));
+		LASSERT (rc < 0);
+		return rc;
+	}
+
+	if (hello->kshm_magic != LNET_PROTO_MAGIC &&
+	    hello->kshm_magic != __swab32(LNET_PROTO_MAGIC) &&
+	    hello->kshm_magic != le32_to_cpu (LNET_PROTO_TCP_MAGIC)) {
+		/* Unexpected magic! */
+		CERROR ("Bad magic(1) %#08x (%#08x expected) from "
+			"%u.%u.%u.%u\n", __cpu_to_le32 (hello->kshm_magic),
+			LNET_PROTO_TCP_MAGIC,
+			HIPQUAD(conn->ksnc_ipaddr));
+		return -EPROTO;
+	}
+
+	rc = libcfs_sock_read(sock, &hello->kshm_version,
+			      sizeof(hello->kshm_version), timeout);
+	if (rc != 0) {
+		CERROR ("Error %d reading HELLO from %u.%u.%u.%u\n",
+			rc, HIPQUAD(conn->ksnc_ipaddr));
+		LASSERT (rc < 0);
+		return rc;
+	}
+
+	proto = ksocknal_parse_proto_version(hello);
+	if (proto == NULL) {
+		if (!active) {
+			/* unknown protocol from peer, tell peer my protocol */
+			conn->ksnc_proto = &ksocknal_protocol_v3x;
+#if SOCKNAL_VERSION_DEBUG
+			if (*ksocknal_tunables.ksnd_protocol == 2)
+				conn->ksnc_proto = &ksocknal_protocol_v2x;
+			else if (*ksocknal_tunables.ksnd_protocol == 1)
+				conn->ksnc_proto = &ksocknal_protocol_v1x;
+#endif
+			hello->kshm_nips = 0;
+			ksocknal_send_hello(ni, conn, ni->ni_nid, hello);
+		}
+
+		/* ksnc_proto is non-NULL here: it was set on entry for an
+		 * active conn and has just been set for a passive one */
+		CERROR ("Unknown protocol version (%d.x expected)"
+			" from %u.%u.%u.%u\n",
+			conn->ksnc_proto->pro_version,
+			HIPQUAD(conn->ksnc_ipaddr));
+
+		return -EPROTO;
+	}
+
+	/* remember whether the peer answered with the protocol I offered;
+	 * this separates "lost the race" from "renegotiate" further down */
+	proto_match = (conn->ksnc_proto == proto);
+	conn->ksnc_proto = proto;
+
+	/* receive the rest of hello message anyway */
+	rc = conn->ksnc_proto->pro_recv_hello(conn, hello, timeout);
+	if (rc != 0) {
+		CERROR("Error %d reading or checking hello from %u.%u.%u.%u\n",
+		       rc, HIPQUAD(conn->ksnc_ipaddr));
+		LASSERT (rc < 0);
+		return rc;
+	}
+
+	*incarnation = hello->kshm_src_incarnation;
+
+	if (hello->kshm_src_nid == LNET_NID_ANY) {
+		CERROR("Expecting a HELLO hdr with a NID, but got LNET_NID_ANY "
+		       "from %u.%u.%u.%u\n", HIPQUAD(conn->ksnc_ipaddr));
+		return -EPROTO;
+	}
+
+	if (!active &&
+	    conn->ksnc_port > LNET_ACCEPTOR_MAX_RESERVED_PORT) {
+		/* Userspace NAL assigns peer process ID from socket */
+		recv_id.pid = conn->ksnc_port | LNET_PID_USERFLAG;
+		recv_id.nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), conn->ksnc_ipaddr);
+	} else {
+		recv_id.nid = hello->kshm_src_nid;
+		recv_id.pid = hello->kshm_src_pid;
+	}
+
+	if (!active) {
+		*peerid = recv_id;
+
+		/* peer determines type */
+		conn->ksnc_type = ksocknal_invert_type(hello->kshm_ctype);
+		if (conn->ksnc_type == SOCKLND_CONN_NONE) {
+			CERROR ("Unexpected type %d from %s ip %u.%u.%u.%u\n",
+				hello->kshm_ctype, libcfs_id2str(*peerid),
+				HIPQUAD(conn->ksnc_ipaddr));
+			return -EPROTO;
+		}
+
+		return 0;
+	}
+
+	/* active conn: the peer must be who I meant to connect to */
+	if (peerid->pid != recv_id.pid ||
+	    peerid->nid != recv_id.nid) {
+		LCONSOLE_ERROR_MSG(0x130, "Connected successfully to %s on host"
+				   " %u.%u.%u.%u, but they claimed they were "
+				   "%s; please check your Lustre "
+				   "configuration.\n",
+				   libcfs_id2str(*peerid),
+				   HIPQUAD(conn->ksnc_ipaddr),
+				   libcfs_id2str(recv_id));
+		return -EPROTO;
+	}
+
+	if (hello->kshm_ctype == SOCKLND_CONN_NONE) {
+		/* Possible protocol mismatch or I lost the connection race */
+		return proto_match ? EALREADY : EPROTO;
+	}
+
+	if (ksocknal_invert_type(hello->kshm_ctype) != conn->ksnc_type) {
+		CERROR ("Mismatched types: me %d, %s ip %u.%u.%u.%u %d\n",
+			conn->ksnc_type, libcfs_id2str(*peerid),
+			HIPQUAD(conn->ksnc_ipaddr),
+			hello->kshm_ctype);
+		return -EPROTO;
+	}
+
+	return 0;
+}
+
+/*
+ * Try to establish every connection type still missing on 'route'.
+ *
+ * Each loop iteration picks one wanted-but-unconnected type, connects a
+ * socket and creates the conn for it.  The loop ends when nothing more is
+ * wanted, when the peer or route is going away, or when a retry is needed:
+ * the peer is connecting to me, or ksocknal_create_conn() returned a
+ * positive value.
+ *
+ * Returns non-zero if the route was re-queued for another attempt, and 0
+ * otherwise.  That includes the failure path, where the retry interval is
+ * backed off, the peer is notified via ksocknal_peer_failed(), and any
+ * blocked txs taken off the peer's queue are passed to
+ * ksocknal_txlist_done().
+ *
+ * The global lock is taken here and released around the blocking
+ * connect/create calls.
+ */
+int
+ksocknal_connect (ksock_route_t *route)
+{
+	LIST_HEAD    (zombies);
+	ksock_peer_t     *peer = route->ksnr_peer;
+	int	       type;
+	int	       wanted;
+	socket_t     *sock;
+	cfs_time_t	deadline;
+	int	       retry_later = 0;
+	int	       rc = 0;
+
+	/* one overall deadline covers every attempt made in this call */
+	deadline = cfs_time_add(cfs_time_current(),
+				cfs_time_seconds(*ksocknal_tunables.ksnd_timeout));
+
+	write_lock_bh(&ksocknal_data.ksnd_global_lock);
+
+	LASSERT (route->ksnr_scheduled);
+	LASSERT (!route->ksnr_connecting);
+
+	route->ksnr_connecting = 1;
+
+	for (;;) {
+		/* bitmask of connection types still to be established */
+		wanted = ksocknal_route_mask() & ~route->ksnr_connected;
+
+		/* stop connecting if peer/route got closed under me, or
+		 * route got connected while queued */
+		if (peer->ksnp_closing || route->ksnr_deleted ||
+		    wanted == 0) {
+			retry_later = 0;
+			break;
+		}
+
+		/* reschedule if peer is connecting to me */
+		if (peer->ksnp_accepting > 0) {
+			CDEBUG(D_NET,
+			       "peer %s(%d) already connecting to me, retry later.\n",
+			       libcfs_nid2str(peer->ksnp_id.nid), peer->ksnp_accepting);
+			retry_later = 1;
+		}
+
+		if (retry_later) /* needs reschedule */
+			break;
+
+		/* pick the next type, in the fixed order ANY, CONTROL,
+		 * BULK_IN, BULK_OUT */
+		if ((wanted & (1 << SOCKLND_CONN_ANY)) != 0) {
+			type = SOCKLND_CONN_ANY;
+		} else if ((wanted & (1 << SOCKLND_CONN_CONTROL)) != 0) {
+			type = SOCKLND_CONN_CONTROL;
+		} else if ((wanted & (1 << SOCKLND_CONN_BULK_IN)) != 0) {
+			type = SOCKLND_CONN_BULK_IN;
+		} else {
+			LASSERT ((wanted & (1 << SOCKLND_CONN_BULK_OUT)) != 0);
+			type = SOCKLND_CONN_BULK_OUT;
+		}
+
+		/* drop the lock across the calls below; note that every
+		 * 'goto failed' is taken with the lock NOT held */
+		write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+		if (cfs_time_aftereq(cfs_time_current(), deadline)) {
+			rc = -ETIMEDOUT;
+			lnet_connect_console_error(rc, peer->ksnp_id.nid,
+						   route->ksnr_ipaddr,
+						   route->ksnr_port);
+			goto failed;
+		}
+
+		rc = lnet_connect(&sock, peer->ksnp_id.nid,
+				  route->ksnr_myipaddr,
+				  route->ksnr_ipaddr, route->ksnr_port);
+		if (rc != 0)
+			goto failed;
+
+		rc = ksocknal_create_conn(peer->ksnp_ni, route, sock, type);
+		if (rc < 0) {
+			lnet_connect_console_error(rc, peer->ksnp_id.nid,
+						   route->ksnr_ipaddr,
+						   route->ksnr_port);
+			goto failed;
+		}
+
+		/* A +ve RC means I have to retry because I lost the connection
+		 * race or I have to renegotiate protocol version */
+		retry_later = (rc != 0);
+		if (retry_later)
+			CDEBUG(D_NET, "peer %s: conn race, retry later.\n",
+			       libcfs_nid2str(peer->ksnp_id.nid));
+
+		write_lock_bh(&ksocknal_data.ksnd_global_lock);
+	}
+
+	/* normal exit from the loop: the global lock is held here */
+	route->ksnr_scheduled = 0;
+	route->ksnr_connecting = 0;
+
+	if (retry_later) {
+		/* re-queue for attention; this frees me up to handle
+		 * the peer's incoming connection request */
+
+		if (rc == EALREADY ||
+		    (rc == 0 && peer->ksnp_accepting > 0)) {
+			/* We want to introduce a delay before next
+			 * attempt to connect if we lost conn race,
+			 * but the race is resolved quickly usually,
+			 * so min_reconnectms should be good heuristic */
+			route->ksnr_retry_interval =
+				cfs_time_seconds(*ksocknal_tunables.ksnd_min_reconnectms)/1000;
+			route->ksnr_timeout = cfs_time_add(cfs_time_current(),
+							   route->ksnr_retry_interval);
+		}
+
+		ksocknal_launch_connection_locked(route);
+	}
+
+	write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+	return retry_later;
+
+ failed:
+	write_lock_bh(&ksocknal_data.ksnd_global_lock);
+
+	route->ksnr_scheduled = 0;
+	route->ksnr_connecting = 0;
+
+	/* This is a retry rather than a new connection */
+	/* double the interval, then clamp it to the range
+	 * [min_reconnectms, max_reconnectms] */
+	route->ksnr_retry_interval *= 2;
+	route->ksnr_retry_interval =
+		MAX(route->ksnr_retry_interval,
+		    cfs_time_seconds(*ksocknal_tunables.ksnd_min_reconnectms)/1000);
+	route->ksnr_retry_interval =
+		MIN(route->ksnr_retry_interval,
+		    cfs_time_seconds(*ksocknal_tunables.ksnd_max_reconnectms)/1000);
+
+	LASSERT (route->ksnr_retry_interval != 0);
+	route->ksnr_timeout = cfs_time_add(cfs_time_current(),
+					   route->ksnr_retry_interval);
+
+	/* only give up on the blocked txs when the peer is not connecting
+	 * to me and no other route of this peer is still connecting */
+	if (!list_empty(&peer->ksnp_tx_queue) &&
+	    peer->ksnp_accepting == 0 &&
+	    ksocknal_find_connecting_route_locked(peer) == NULL) {
+		ksock_conn_t *conn;
+
+		/* ksnp_tx_queue is queued on a conn on successful
+		 * connection for V1.x and V2.x */
+		if (!list_empty (&peer->ksnp_conns)) {
+			conn = list_entry(peer->ksnp_conns.next,
+					      ksock_conn_t, ksnc_list);
+			LASSERT (conn->ksnc_proto == &ksocknal_protocol_v3x);
+		}
+
+		/* take all the blocked packets while I've got the lock and
+		 * complete below... */
+		list_splice_init(&peer->ksnp_tx_queue, &zombies);
+	}
+
+#if 0	   /* irrelevant with only eager routes */
+	if (!route->ksnr_deleted) {
+		/* make this route least-favourite for re-selection */
+		list_del(&route->ksnr_list);
+		list_add_tail(&route->ksnr_list, &peer->ksnp_routes);
+	}
+#endif
+	write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+	ksocknal_peer_failed(peer);
+	/* NOTE(review): the final argument 1 presumably marks the txs as
+	 * failed -- confirm against ksocknal_txlist_done() */
+	ksocknal_txlist_done(peer->ksnp_ni, &zombies, 1);
+	return 0;
+}
+
+/*
+ * Decide whether another connd thread is needed and, if so, try to start
+ * one.
+ *
+ * Expects ksnd_connd_lock to be held by the caller; the lock is released
+ * while the thread is being created and re-taken before returning.
+ *
+ * Returns 1 if the lock was dropped (i.e. a thread start was attempted,
+ * whether or not it succeeded), 0 if nothing was done.  @timeout may be
+ * shortened after a recent failed start so the caller does not keep
+ * retrying while resources are short.
+ */
+static int
+ksocknal_connd_check_start(long sec, long *timeout)
+{
+	char thread_name[16];
+	int err;
+	int nthreads = ksocknal_data.ksnd_connd_starting +
+		       ksocknal_data.ksnd_connd_running;
+
+	/* still in initializing */
+	if (unlikely(ksocknal_data.ksnd_init < SOCKNAL_INIT_ALL))
+		return 0;
+
+	/* can't create more connd, or still have enough
+	 * threads to handle more connecting */
+	if (nthreads >= *ksocknal_tunables.ksnd_nconnds_max ||
+	    nthreads > ksocknal_data.ksnd_connd_connecting + SOCKNAL_CONND_RESV)
+		return 0;
+
+	/* no pending connecting request */
+	if (list_empty(&ksocknal_data.ksnd_connd_routes))
+		return 0;
+
+	if (sec - ksocknal_data.ksnd_connd_failed_stamp <= 1) {
+		/* may run out of resource, retry later */
+		*timeout = cfs_time_seconds(1);
+		return 0;
+	}
+
+	/* serialize starting to avoid flood */
+	if (ksocknal_data.ksnd_connd_starting > 0)
+		return 0;
+
+	ksocknal_data.ksnd_connd_starting_stamp = sec;
+	ksocknal_data.ksnd_connd_starting++;
+	spin_unlock_bh(&ksocknal_data.ksnd_connd_lock);
+
+	/* NB: the current thread total is the next id */
+	snprintf(thread_name, sizeof(thread_name), "socknal_cd%02d", nthreads);
+	err = ksocknal_thread_start(ksocknal_connd, NULL, thread_name);
+
+	spin_lock_bh(&ksocknal_data.ksnd_connd_lock);
+	if (err != 0) {
+		/* we tried ... undo the accounting and note when it failed */
+		LASSERT(ksocknal_data.ksnd_connd_starting > 0);
+		ksocknal_data.ksnd_connd_starting--;
+		ksocknal_data.ksnd_connd_failed_stamp = cfs_time_current_sec();
+	}
+
+	return 1;
+}
+
+/*
+ * Decide whether the calling connd thread may exit.  Returns 1 when more
+ * threads are running than configured, none was started within the last
+ * SOCKNAL_CONND_TIMEOUT seconds, and enough threads would remain for the
+ * connections in progress plus the reserve; returns 0 otherwise.
+ * @timeout may be updated so that the caller comes back to re-check.
+ */
+static int
+ksocknal_connd_check_stop(long sec, long *timeout)
+{
+	int remaining;
+
+	if (unlikely(ksocknal_data.ksnd_init < SOCKNAL_INIT_ALL) ||
+					/* still in initializing */
+	    ksocknal_data.ksnd_connd_starting > 0 ||
+					/* in progress of starting new thread */
+	    ksocknal_data.ksnd_connd_running <=
+	    *ksocknal_tunables.ksnd_nconnds)	/* can't shrink */
+		return 0;
+
+	/* seconds left until the most recent thread start is old enough */
+	remaining = (int)(ksocknal_data.ksnd_connd_starting_stamp +
+			  SOCKNAL_CONND_TIMEOUT - sec);
+
+	if (remaining > 0) {
+		/* created a thread recently: look again once that expires */
+		*timeout = cfs_time_seconds(remaining);
+		return 0;
+	}
+
+	/* no creating in the past SOCKNAL_CONND_TIMEOUT seconds */
+	*timeout = cfs_time_seconds(SOCKNAL_CONND_TIMEOUT);
+
+	return ksocknal_data.ksnd_connd_running >
+	       ksocknal_data.ksnd_connd_connecting + SOCKNAL_CONND_RESV;
+}
+
+/* Scan the connd_routes queue for the first route that may be connected
+ * right now: one with no retry interval, or whose retry timeout has
+ * passed.  Returns that route, or NULL if none is ready yet; in passing,
+ * *timeout_p is lowered to the shortest wait seen among the routes that
+ * were skipped, so the caller knows when to come back. */
+static ksock_route_t *
+ksocknal_connd_get_route_locked(signed long *timeout_p)
+{
+	ksock_route_t *route;
+	cfs_time_t     now = cfs_time_current();
+
+	/* connd_routes can contain both pending and ordinary routes */
+	list_for_each_entry (route, &ksocknal_data.ksnd_connd_routes,
+				 ksnr_connd_list) {
+		int wait;
+
+		if (route->ksnr_retry_interval == 0 ||
+		    cfs_time_aftereq(now, route->ksnr_timeout))
+			return route;
+
+		/* not due yet: remember how long until it is */
+		wait = (int)(route->ksnr_timeout - now);
+
+		if (*timeout_p == MAX_SCHEDULE_TIMEOUT ||
+		    (int)*timeout_p > wait)
+			*timeout_p = wait;
+	}
+
+	return NULL;
+}
+
+int
+ksocknal_connd (void *arg)	/* connd thread body; @arg is not referenced */
+{
+	spinlock_t    *connd_lock = &ksocknal_data.ksnd_connd_lock;
+	ksock_connreq_t   *cr;
+	wait_queue_t     wait;
+	int		nloops = 0;	/* busy passes since the last cond_resched() or sleep */
+	int		cons_retry = 0;	/* run of ksocknal_connect() calls that returned non-zero */
+
+	cfs_block_allsigs ();
+
+	init_waitqueue_entry_current (&wait);
+
+	spin_lock_bh(connd_lock);
+
+	LASSERT(ksocknal_data.ksnd_connd_starting > 0);
+	ksocknal_data.ksnd_connd_starting--;	/* this thread is no longer "starting"... */
+	ksocknal_data.ksnd_connd_running++;	/* ...it is counted as running from here on */
+
+	while (!ksocknal_data.ksnd_shuttingdown) {
+		ksock_route_t *route = NULL;
+		long sec = cfs_time_current_sec();
+		long timeout = MAX_SCHEDULE_TIMEOUT;	/* may be shortened by the helpers below */
+		int  dropped_lock = 0;	/* set when this pass did work; such a pass never sleeps */
+
+		if (ksocknal_connd_check_stop(sec, &timeout)) {
+			/* this thread is surplus and leaves the loop;
+			 * wakeup another one to check stop */
+			wake_up(&ksocknal_data.ksnd_connd_waitq);
+			break;
+		}
+
+		if (ksocknal_connd_check_start(sec, &timeout)) {
+			/* created new thread */
+			dropped_lock = 1;
+		}
+
+		if (!list_empty(&ksocknal_data.ksnd_connd_connreqs)) {
+			/* Connection accepted by the listener: dequeue the
+			 * request under the lock, then create the connection
+			 * with the lock released */
+			cr = list_entry(ksocknal_data.ksnd_connd_connreqs. \
+					    next, ksock_connreq_t, ksncr_list);
+
+			list_del(&cr->ksncr_list);
+			spin_unlock_bh(connd_lock);
+			dropped_lock = 1;
+
+			ksocknal_create_conn(cr->ksncr_ni, NULL,
+					     cr->ksncr_sock, SOCKLND_CONN_NONE);
+			lnet_ni_decref(cr->ksncr_ni);
+			LIBCFS_FREE(cr, sizeof(*cr));
+
+			spin_lock_bh(connd_lock);
+		}
+
+		/* Only handle an outgoing connection request if there
+		 * is a thread left to handle incoming connections and
+		 * create new connd */
+		if (ksocknal_data.ksnd_connd_connecting + SOCKNAL_CONND_RESV <
+		    ksocknal_data.ksnd_connd_running) {
+			route = ksocknal_connd_get_route_locked(&timeout);
+		}
+		if (route != NULL) {
+			list_del (&route->ksnr_connd_list);
+			ksocknal_data.ksnd_connd_connecting++;	/* counted while the lock is dropped */
+			spin_unlock_bh(connd_lock);
+			dropped_lock = 1;
+
+			if (ksocknal_connect(route)) {
+				/* consecutive retry: warn once the run gets
+				 * longer than SOCKNAL_INSANITY_RECONN, then
+				 * start counting again */
+				if (cons_retry++ > SOCKNAL_INSANITY_RECONN) {
+					CWARN("massive consecutive "
+					      "re-connecting to %u.%u.%u.%u\n",
+					      HIPQUAD(route->ksnr_ipaddr));
+					cons_retry = 0;
+				}
+			} else {
+				cons_retry = 0;
+			}
+
+			ksocknal_route_decref(route);
+
+			spin_lock_bh(connd_lock);
+			ksocknal_data.ksnd_connd_connecting--;
+		}
+
+		if (dropped_lock) {
+			if (++nloops < SOCKNAL_RESCHED)	/* keep going without yielding for a while */
+				continue;
+			spin_unlock_bh(connd_lock);
+			nloops = 0;
+			cond_resched();
+			spin_lock_bh(connd_lock);
+			continue;
+		}
+
+		/* Nothing to do for 'timeout': queue on the waitq as an
+		 * exclusive waiter before releasing the lock, then sleep */
+		set_current_state(TASK_INTERRUPTIBLE);
+		add_wait_queue_exclusive(&ksocknal_data.ksnd_connd_waitq, &wait);
+		spin_unlock_bh(connd_lock);
+
+		nloops = 0;
+		waitq_timedwait(&wait, TASK_INTERRUPTIBLE, timeout);
+
+		set_current_state(TASK_RUNNING);
+		remove_wait_queue(&ksocknal_data.ksnd_connd_waitq, &wait);
+		spin_lock_bh(connd_lock);
+	}
+	ksocknal_data.ksnd_connd_running--;	/* still under connd_lock */
+	spin_unlock_bh(connd_lock);
+
+	ksocknal_thread_fini();
+	return 0;
+}
+
+/* Look for a connection of @peer that has failed or timed out.  Three
+ * conditions are checked per connection, in this order:
+ *   - the socket reports a pending error;
+ *   - an incoming message was started and its rx deadline has passed;
+ *   - data is queued on the connection or buffered in the socket and the
+ *     tx deadline has passed.
+ *
+ * Returns the first such connection with an extra reference taken (the
+ * caller drops it with ksocknal_conn_decref()), or NULL if none matched. */
+ksock_conn_t *
+ksocknal_find_timed_out_conn (ksock_peer_t *peer)
+{
+	/* We're called with a shared lock on ksnd_global_lock */
+	ksock_conn_t      *conn;
+	struct list_head	*ctmp;
+
+	list_for_each (ctmp, &peer->ksnp_conns) {
+		int     error;
+		conn = list_entry (ctmp, ksock_conn_t, ksnc_list);
+
+		/* Don't need the {get,put}connsock dance to deref ksnc_sock */
+		LASSERT (!conn->ksnc_closing);
+
+		/* SOCK_ERROR will reset error code of socket in
+		 * some platform (like Darwin8.x) */
+		error = cfs_sock_error(conn->ksnc_sock);
+		if (error != 0) {
+			ksocknal_conn_addref(conn);
+
+			switch (error) {
+			case ECONNRESET:
+				CNETERR("A connection with %s "
+					"(%u.%u.%u.%u:%d) was reset; "
+					"it may have rebooted.\n",
+					libcfs_id2str(peer->ksnp_id),
+					HIPQUAD(conn->ksnc_ipaddr),
+					conn->ksnc_port);
+				break;
+			case ETIMEDOUT:
+				CNETERR("A connection with %s "
+					"(%u.%u.%u.%u:%d) timed out; the "
+					"network or node may be down.\n",
+					libcfs_id2str(peer->ksnp_id),
+					HIPQUAD(conn->ksnc_ipaddr),
+					conn->ksnc_port);
+				break;
+			default:
+				/* the address is printed in parentheses like
+				 * in the other messages; the closing one was
+				 * missing here */
+				CNETERR("An unexpected network error %d "
+					"occurred with %s "
+					"(%u.%u.%u.%u:%d)\n", error,
+					libcfs_id2str(peer->ksnp_id),
+					HIPQUAD(conn->ksnc_ipaddr),
+					conn->ksnc_port);
+				break;
+			}
+
+			return conn;
+		}
+
+		if (conn->ksnc_rx_started &&
+		    cfs_time_aftereq(cfs_time_current(),
+				     conn->ksnc_rx_deadline)) {
+			/* Timed out incomplete incoming message */
+			ksocknal_conn_addref(conn);
+			CNETERR("Timeout receiving from %s (%u.%u.%u.%u:%d), "
+				"state %d wanted %d left %d\n",
+				libcfs_id2str(peer->ksnp_id),
+				HIPQUAD(conn->ksnc_ipaddr),
+				conn->ksnc_port,
+				conn->ksnc_rx_state,
+				conn->ksnc_rx_nob_wanted,
+				conn->ksnc_rx_nob_left);
+			return conn;
+		}
+
+		if ((!list_empty(&conn->ksnc_tx_queue) ||
+		     cfs_sock_wmem_queued(conn->ksnc_sock) != 0) &&
+		    cfs_time_aftereq(cfs_time_current(),
+				     conn->ksnc_tx_deadline)) {
+			/* Timed out messages queued for sending or
+			 * buffered in the socket's send buffer */
+			ksocknal_conn_addref(conn);
+			CNETERR("Timeout sending data to %s (%u.%u.%u.%u:%d) "
+				"the network or that node may be down.\n",
+				libcfs_id2str(peer->ksnp_id),
+				HIPQUAD(conn->ksnc_ipaddr),
+				conn->ksnc_port);
+			return conn;
+		}
+	}
+
+	return NULL;
+}
+
+/* Detach every tx at the head of @peer's queue whose deadline has passed
+ * and complete them via ksocknal_txlist_done() once the global lock has
+ * been released. */
+static inline void
+ksocknal_flush_stale_txs(ksock_peer_t *peer)
+{
+	LIST_HEAD      (expired);
+	ksock_tx_t	*head;
+
+	write_lock_bh(&ksocknal_data.ksnd_global_lock);
+
+	for (;;) {
+		if (list_empty(&peer->ksnp_tx_queue))
+			break;
+
+		head = list_entry(peer->ksnp_tx_queue.next,
+				  ksock_tx_t, tx_list);
+
+		/* stop at the first tx that has not reached its deadline */
+		if (!cfs_time_aftereq(cfs_time_current(), head->tx_deadline))
+			break;
+
+		list_del(&head->tx_list);
+		list_add_tail(&head->tx_list, &expired);
+	}
+
+	write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+	ksocknal_txlist_done(peer->ksnp_ni, &expired, 1);
+}
+
+/* Send a keepalive PING (a noop tx with cookie 1) to @peer if one is due.
+ *
+ * Entered and left with ksnd_global_lock read-held; the lock is released
+ * around the allocation and launch of the tx.
+ *
+ * Returns 0 if nothing had to be sent (the lock was never released),
+ * 1 if the PING was launched, -ENOMEM or -EIO on failure. */
+int
+ksocknal_send_keepalive_locked(ksock_peer_t *peer)
+{
+	ksock_conn_t   *conn;
+	ksock_tx_t     *noop;
+	int		rc;
+
+	/* no connection: last_alive will be updated by create_conn */
+	if (list_empty(&peer->ksnp_conns))
+		return 0;
+
+	/* only sent to peers speaking the V3.x protocol */
+	if (peer->ksnp_proto != &ksocknal_protocol_v3x)
+		return 0;
+
+	/* keepalive switched off */
+	if (*ksocknal_tunables.ksnd_keepalive <= 0)
+		return 0;
+
+	/* peer was seen alive within the keepalive interval */
+	if (cfs_time_before(cfs_time_current(),
+			    cfs_time_add(peer->ksnp_last_alive,
+					 cfs_time_seconds(*ksocknal_tunables.ksnd_keepalive))))
+		return 0;
+
+	/* still inside the back-off set by a previous attempt */
+	if (cfs_time_before(cfs_time_current(), peer->ksnp_send_keepalive))
+		return 0;
+
+	/* retry 10 secs later, so we wouldn't put pressure
+	 * on this peer if we failed to send keepalive this time */
+	peer->ksnp_send_keepalive = cfs_time_shift(10);
+
+	conn = ksocknal_find_conn_locked(peer, NULL, 1);
+	if (conn != NULL) {
+		ksock_sched_t *sched = conn->ksnc_scheduler;
+		int	       queued;
+
+		spin_lock_bh(&sched->kss_lock);
+		queued = !list_empty(&conn->ksnc_tx_queue);
+		spin_unlock_bh(&sched->kss_lock);
+
+		/* there is an queued ACK, don't need keepalive */
+		if (queued)
+			return 0;
+	}
+
+	read_unlock(&ksocknal_data.ksnd_global_lock);
+
+	/* cookie = 1 is reserved for keepalive PING */
+	noop = ksocknal_alloc_tx_noop(1, 1);
+	if (noop == NULL) {
+		rc = -ENOMEM;
+	} else if (ksocknal_launch_packet(peer->ksnp_ni, noop,
+					  peer->ksnp_id) == 0) {
+		rc = 1;
+	} else {
+		ksocknal_free_tx(noop);
+		rc = -EIO;
+	}
+
+	/* every path that released the lock takes it again before returning */
+	read_lock(&ksocknal_data.ksnd_global_lock);
+	return rc;
+}
+
+
+void
+ksocknal_check_peer_timeouts (int idx)	/* @idx selects the bucket of ksnd_peers[] to scan */
+{
+	struct list_head       *peers = &ksocknal_data.ksnd_peers[idx];
+	ksock_peer_t     *peer;
+	ksock_conn_t     *conn;
+	ksock_tx_t       *tx;
+
+ again:	/* the scan restarts from the head of the bucket whenever the lock was released */
+	/* NB. We expect to have a look at all the peers and not find any
+	 * connections to time out, so we just use a shared lock while we
+	 * take a look... */
+	read_lock(&ksocknal_data.ksnd_global_lock);
+
+	list_for_each_entry(peer, peers, ksnp_list) {
+		cfs_time_t  deadline = 0;	/* tx_deadline of the first ZC request on the list */
+		int	 resid = 0;	/* tx_resid of that request */
+		int	 n     = 0;	/* expired ZC requests on connections that are not closing */
+
+		if (ksocknal_send_keepalive_locked(peer) != 0) {	/* non-zero: the lock was dropped inside */
+			read_unlock(&ksocknal_data.ksnd_global_lock);
+			goto again;
+		}
+
+		conn = ksocknal_find_timed_out_conn (peer);	/* returned with a reference held */
+
+		if (conn != NULL) {
+			read_unlock(&ksocknal_data.ksnd_global_lock);
+
+			ksocknal_close_conn_and_siblings (conn, -ETIMEDOUT);
+
+			/* NB we won't find this one again, but we can't
+			 * just proceed with the next peer, since we dropped
+			 * ksnd_global_lock and it might be dead already! */
+			ksocknal_conn_decref(conn);
+			goto again;
+		}
+
+		/* we can't process stale txs right here because we're
+		 * holding only shared lock */
+		if (!list_empty (&peer->ksnp_tx_queue)) {
+			ksock_tx_t *tx =	/* NB shadows the function-scope 'tx' */
+				list_entry (peer->ksnp_tx_queue.next,
+						ksock_tx_t, tx_list);
+
+			if (cfs_time_aftereq(cfs_time_current(),
+					     tx->tx_deadline)) {
+
+				ksocknal_peer_addref(peer);	/* pin the peer across the unlocked flush */
+				read_unlock(&ksocknal_data.ksnd_global_lock);
+
+				ksocknal_flush_stale_txs(peer);
+
+				ksocknal_peer_decref(peer);
+				goto again;
+			}
+		}
+
+		if (list_empty(&peer->ksnp_zc_req_list))
+			continue;
+
+		spin_lock(&peer->ksnp_lock);
+		list_for_each_entry(tx, &peer->ksnp_zc_req_list, tx_zc_list) {
+			if (!cfs_time_aftereq(cfs_time_current(),
+					      tx->tx_deadline))
+				break;	/* counting stops at the first request that has not expired */
+			/* ignore the TX if connection is being closed */
+			if (tx->tx_conn->ksnc_closing)
+				continue;
+			n++;
+		}
+
+		if (n == 0) {
+			spin_unlock(&peer->ksnp_lock);
+			continue;
+		}
+
+		tx = list_entry(peer->ksnp_zc_req_list.next,
+				    ksock_tx_t, tx_zc_list);
+		deadline = tx->tx_deadline;	/* copied out because the locks are released before logging */
+		resid    = tx->tx_resid;
+		conn     = tx->tx_conn;
+		ksocknal_conn_addref(conn);
+
+		spin_unlock(&peer->ksnp_lock);
+		read_unlock(&ksocknal_data.ksnd_global_lock);
+
+		CERROR("Total %d stale ZC_REQs for peer %s detected; the "
+		       "oldest(%p) timed out %ld secs ago, "
+		       "resid: %d, wmem: %d\n",
+		       n, libcfs_nid2str(peer->ksnp_id.nid), tx,
+		       cfs_duration_sec(cfs_time_current() - deadline),
+		       resid, cfs_sock_wmem_queued(conn->ksnc_sock));
+
+		ksocknal_close_conn_and_siblings (conn, -ETIMEDOUT);
+		ksocknal_conn_decref(conn);
+		goto again;
+	}
+
+	read_unlock(&ksocknal_data.ksnd_global_lock);
+}
+
+int
+ksocknal_reaper (void *arg)	/* reaper thread body; @arg is not referenced */
+{
+	wait_queue_t     wait;
+	ksock_conn_t      *conn;
+	ksock_sched_t     *sched;
+	struct list_head	 enomem_conns;	/* private copy of ksnd_enomem_conns, handled without the lock */
+	int		nenomem_conns;	/* connections rescheduled in the current pass */
+	cfs_duration_t     timeout;
+	int		i;
+	int		peer_index = 0;	/* next bucket of the peer table to check */
+	cfs_time_t	 deadline = cfs_time_current();	/* when the next batch of timeout checks is due */
+
+	cfs_block_allsigs ();
+
+	INIT_LIST_HEAD(&enomem_conns);
+	init_waitqueue_entry_current (&wait);
+
+	spin_lock_bh(&ksocknal_data.ksnd_reaper_lock);
+
+	while (!ksocknal_data.ksnd_shuttingdown) {
+
+		if (!list_empty (&ksocknal_data.ksnd_deathrow_conns)) {
+			conn = list_entry (ksocknal_data. \
+					       ksnd_deathrow_conns.next,
+					       ksock_conn_t, ksnc_list);
+			list_del (&conn->ksnc_list);
+
+			spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock);
+
+			ksocknal_terminate_conn(conn);
+			ksocknal_conn_decref(conn);
+
+			spin_lock_bh(&ksocknal_data.ksnd_reaper_lock);
+			continue;	/* one connection per pass; re-test the lists */
+		}
+
+		if (!list_empty (&ksocknal_data.ksnd_zombie_conns)) {
+			conn = list_entry (ksocknal_data.ksnd_zombie_conns.\
+					       next, ksock_conn_t, ksnc_list);
+			list_del (&conn->ksnc_list);
+
+			spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock);
+
+			ksocknal_destroy_conn(conn);
+
+			spin_lock_bh(&ksocknal_data.ksnd_reaper_lock);
+			continue;
+		}
+
+		if (!list_empty (&ksocknal_data.ksnd_enomem_conns)) {
+			list_add(&enomem_conns,	/* link the local head into the global ring... */
+				     &ksocknal_data.ksnd_enomem_conns);
+			list_del_init(&ksocknal_data.ksnd_enomem_conns);	/* ...then unhook the global head: the entries now hang off enomem_conns */
+		}
+
+		spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock);
+
+		/* reschedule all the connections that stalled with ENOMEM:
+		 * each one is marked tx-ready, queued on its scheduler's
+		 * kss_tx_conns list and the scheduler is woken */
+		nenomem_conns = 0;
+		while (!list_empty (&enomem_conns)) {
+			conn = list_entry (enomem_conns.next,
+					       ksock_conn_t, ksnc_tx_list);
+			list_del (&conn->ksnc_tx_list);
+
+			sched = conn->ksnc_scheduler;
+
+			spin_lock_bh(&sched->kss_lock);
+
+			LASSERT(conn->ksnc_tx_scheduled);
+			conn->ksnc_tx_ready = 1;
+			list_add_tail(&conn->ksnc_tx_list,
+					  &sched->kss_tx_conns);
+			wake_up(&sched->kss_waitq);
+
+			spin_unlock_bh(&sched->kss_lock);
+			nenomem_conns++;
+		}
+
+		/* careful with the jiffy wrap: the comparison is made on the
+		 * difference, not on the two absolute times */
+		while ((timeout = cfs_time_sub(deadline,
+					       cfs_time_current())) <= 0) {
+			const int n = 4;
+			const int p = 1;
+			int       chunk = ksocknal_data.ksnd_peer_hash_size;
+
+			/* Time to check for timeouts on a few more peers: I do
+			 * checks every 'p' seconds on a proportion of the peer
+			 * table and I need to check every connection 'n' times
+			 * within a timeout interval, to ensure I detect a
+			 * timeout on any connection within (n+1)/n times the
+			 * timeout interval. */
+
+			if (*ksocknal_tunables.ksnd_timeout > n * p)
+				chunk = (chunk * n * p) /
+					*ksocknal_tunables.ksnd_timeout;
+			if (chunk == 0)	/* always check at least one bucket */
+				chunk = 1;
+
+			for (i = 0; i < chunk; i++) {
+				ksocknal_check_peer_timeouts (peer_index);
+				peer_index = (peer_index + 1) %
+					     ksocknal_data.ksnd_peer_hash_size;
+			}
+
+			deadline = cfs_time_add(deadline, cfs_time_seconds(p));
+		}
+
+		if (nenomem_conns != 0) {
+			/* Reduce my timeout if I rescheduled ENOMEM conns.
+			 * This also prevents me getting woken immediately
+			 * if any go back on my enomem list. */
+			timeout = SOCKNAL_ENOMEM_RETRY;
+		}
+		ksocknal_data.ksnd_reaper_waketime =
+			cfs_time_add(cfs_time_current(), timeout);
+
+		set_current_state (TASK_INTERRUPTIBLE);
+		add_wait_queue (&ksocknal_data.ksnd_reaper_waitq, &wait);
+
+		if (!ksocknal_data.ksnd_shuttingdown &&	/* re-tested after queueing on the waitq; sleep only if still idle */
+		    list_empty (&ksocknal_data.ksnd_deathrow_conns) &&
+		    list_empty (&ksocknal_data.ksnd_zombie_conns))
+			waitq_timedwait (&wait, TASK_INTERRUPTIBLE,
+					     timeout);
+
+		set_current_state (TASK_RUNNING);
+		remove_wait_queue (&ksocknal_data.ksnd_reaper_waitq, &wait);
+
+		spin_lock_bh(&ksocknal_data.ksnd_reaper_lock);
+	}
+
+	spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock);
+
+	ksocknal_thread_fini();
+	return 0;
+}

diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.c
new file mode 100644
index 0000000..3e08fe2
--- /dev/null
+++ b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.c

@@ -0,0 +1,1088 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#include "socklnd.h"
+
+# if defined(CONFIG_SYSCTL) && !CFS_SYSFS_MODULE_PARM
+
+
+enum {	/* .ctl_name identifiers used by the entries of ksocknal_ctl_table */
+	SOCKLND_TIMEOUT = 1,	/* numbering starts at 1 */
+	SOCKLND_CREDITS,
+	SOCKLND_PEER_TXCREDITS,
+	SOCKLND_PEER_RTRCREDITS,
+	SOCKLND_PEER_TIMEOUT,
+	SOCKLND_NCONNDS,
+	SOCKLND_RECONNECTS_MIN,
+	SOCKLND_RECONNECTS_MAX,
+	SOCKLND_EAGER_ACK,
+	SOCKLND_ZERO_COPY,
+	SOCKLND_TYPED,
+	SOCKLND_BULK_MIN,
+	SOCKLND_RX_BUFFER_SIZE,
+	SOCKLND_TX_BUFFER_SIZE,
+	SOCKLND_NAGLE,
+	SOCKLND_IRQ_AFFINITY,	/* no table entry in this file uses this id */
+	SOCKLND_ROUND_ROBIN,
+	SOCKLND_KEEPALIVE,
+	SOCKLND_KEEPALIVE_IDLE,
+	SOCKLND_KEEPALIVE_COUNT,
+	SOCKLND_KEEPALIVE_INTVL,
+	SOCKLND_BACKOFF_INIT,	/* no table entry in this file uses this id */
+	SOCKLND_BACKOFF_MAX,	/* no table entry in this file uses this id */
+	SOCKLND_PROTOCOL,
+	SOCKLND_ZERO_COPY_RECV,
+	SOCKLND_ZERO_COPY_RECV_MIN_NFRAGS
+};
+
+/* One sysctl entry per socklnd tunable; all of them are plain ints handled
+ * by proc_dointvec.  Mode 0644 entries are writable, 0444 ones are
+ * read-only.  Every procname in the table must be unique. */
+static ctl_table_t ksocknal_ctl_table[] = {
+	{
+		.ctl_name = SOCKLND_TIMEOUT,
+		.procname = "timeout",
+		.data     = &ksocknal_tunables.ksnd_timeout,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec,
+		.strategy = &sysctl_intvec,
+	},
+	{
+		.ctl_name = SOCKLND_CREDITS,
+		.procname = "credits",
+		.data     = &ksocknal_tunables.ksnd_credits,
+		.maxlen   = sizeof(int),
+		.mode     = 0444,
+		.proc_handler = &proc_dointvec,
+		.strategy = &sysctl_intvec,
+	},
+	{
+		.ctl_name = SOCKLND_PEER_TXCREDITS,
+		.procname = "peer_credits",
+		.data     = &ksocknal_tunables.ksnd_peertxcredits,
+		.maxlen   = sizeof(int),
+		.mode     = 0444,
+		.proc_handler = &proc_dointvec,
+		.strategy = &sysctl_intvec,
+	},
+	{
+		.ctl_name = SOCKLND_PEER_RTRCREDITS,
+		.procname = "peer_buffer_credits",
+		.data     = &ksocknal_tunables.ksnd_peerrtrcredits,
+		.maxlen   = sizeof(int),
+		.mode     = 0444,
+		.proc_handler = &proc_dointvec,
+		.strategy = &sysctl_intvec,
+	},
+	{
+		.ctl_name = SOCKLND_PEER_TIMEOUT,
+		.procname = "peer_timeout",
+		.data     = &ksocknal_tunables.ksnd_peertimeout,
+		.maxlen   = sizeof(int),
+		.mode     = 0444,
+		.proc_handler = &proc_dointvec,
+		.strategy = &sysctl_intvec,
+	},
+	{
+		.ctl_name = SOCKLND_NCONNDS,
+		.procname = "nconnds",
+		.data     = &ksocknal_tunables.ksnd_nconnds,
+		.maxlen   = sizeof(int),
+		.mode     = 0444,
+		.proc_handler = &proc_dointvec,
+		.strategy = &sysctl_intvec,
+	},
+	{
+		.ctl_name = SOCKLND_RECONNECTS_MIN,
+		.procname = "min_reconnectms",
+		.data     = &ksocknal_tunables.ksnd_min_reconnectms,
+		.maxlen   = sizeof(int),
+		.mode     = 0444,
+		.proc_handler = &proc_dointvec,
+		.strategy = &sysctl_intvec,
+	},
+	{
+		.ctl_name = SOCKLND_RECONNECTS_MAX,
+		.procname = "max_reconnectms",
+		.data     = &ksocknal_tunables.ksnd_max_reconnectms,
+		.maxlen   = sizeof(int),
+		.mode     = 0444,
+		.proc_handler = &proc_dointvec,
+		.strategy = &sysctl_intvec,
+	},
+	{
+		.ctl_name = SOCKLND_EAGER_ACK,
+		.procname = "eager_ack",
+		.data     = &ksocknal_tunables.ksnd_eager_ack,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec,
+		.strategy = &sysctl_intvec,
+	},
+	{
+		.ctl_name = SOCKLND_ZERO_COPY,
+		.procname = "zero_copy",
+		.data     = &ksocknal_tunables.ksnd_zc_min_payload,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec,
+		.strategy = &sysctl_intvec,
+	},
+	{
+		.ctl_name = SOCKLND_ZERO_COPY_RECV,
+		.procname = "zero_copy_recv",
+		.data     = &ksocknal_tunables.ksnd_zc_recv,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec,
+		.strategy = &sysctl_intvec,
+	},
+	{
+		/* needs its own name: it used to reuse "zero_copy_recv",
+		 * clashing with the entry above */
+		.ctl_name = SOCKLND_ZERO_COPY_RECV_MIN_NFRAGS,
+		.procname = "zero_copy_recv_min_nfrags",
+		.data     = &ksocknal_tunables.ksnd_zc_recv_min_nfrags,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec,
+		.strategy = &sysctl_intvec,
+	},
+	{
+		.ctl_name = SOCKLND_TYPED,
+		.procname = "typed",
+		.data     = &ksocknal_tunables.ksnd_typed_conns,
+		.maxlen   = sizeof(int),
+		.mode     = 0444,
+		.proc_handler = &proc_dointvec,
+		.strategy = &sysctl_intvec,
+	},
+	{
+		.ctl_name = SOCKLND_BULK_MIN,
+		.procname = "min_bulk",
+		.data     = &ksocknal_tunables.ksnd_min_bulk,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec,
+		.strategy = &sysctl_intvec,
+	},
+	{
+		.ctl_name = SOCKLND_RX_BUFFER_SIZE,
+		.procname = "rx_buffer_size",
+		.data     = &ksocknal_tunables.ksnd_rx_buffer_size,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec,
+		.strategy = &sysctl_intvec,
+	},
+	{
+		.ctl_name = SOCKLND_TX_BUFFER_SIZE,
+		.procname = "tx_buffer_size",
+		.data     = &ksocknal_tunables.ksnd_tx_buffer_size,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec,
+		.strategy = &sysctl_intvec,
+	},
+	{
+		.ctl_name = SOCKLND_NAGLE,
+		.procname = "nagle",
+		.data     = &ksocknal_tunables.ksnd_nagle,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec,
+		.strategy = &sysctl_intvec,
+	},
+	{
+		.ctl_name = SOCKLND_ROUND_ROBIN,
+		.procname = "round_robin",
+		.data     = &ksocknal_tunables.ksnd_round_robin,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec,
+		.strategy = &sysctl_intvec,
+	},
+	{
+		.ctl_name = SOCKLND_KEEPALIVE,
+		.procname = "keepalive",
+		.data     = &ksocknal_tunables.ksnd_keepalive,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec,
+		.strategy = &sysctl_intvec,
+	},
+	{
+		.ctl_name = SOCKLND_KEEPALIVE_IDLE,
+		.procname = "keepalive_idle",
+		.data     = &ksocknal_tunables.ksnd_keepalive_idle,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec,
+		.strategy = &sysctl_intvec,
+	},
+	{
+		.ctl_name = SOCKLND_KEEPALIVE_COUNT,
+		.procname = "keepalive_count",
+		.data     = &ksocknal_tunables.ksnd_keepalive_count,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec,
+		.strategy = &sysctl_intvec,
+	},
+	{
+		.ctl_name = SOCKLND_KEEPALIVE_INTVL,
+		.procname = "keepalive_intvl",
+		.data     = &ksocknal_tunables.ksnd_keepalive_intvl,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec,
+		.strategy = &sysctl_intvec,
+	},
+#if SOCKNAL_VERSION_DEBUG
+	{
+		.ctl_name = SOCKLND_PROTOCOL,
+		.procname = "protocol",
+		.data     = &ksocknal_tunables.ksnd_protocol,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec,
+		.strategy = &sysctl_intvec,
+	},
+#endif
+	{0}
+};
+
+
+ctl_table_t ksocknal_top_ctl_table[] = {	/* table handed to cfs_register_sysctl_table() by ksocknal_lib_tunables_init() */
+	{
+		.ctl_name = CTL_SOCKLND,
+		.procname = "socknal",
+		.data     = NULL,	/* no value of its own... */
+		.maxlen   = 0,
+		.mode     = 0555,
+		.child    = ksocknal_ctl_table	/* ...it only points at the table of tunables */
+	},
+	{ 0 }	/* zeroed last entry */
+};
+
+/* Validate the tunables and register the sysctl table.
+ *
+ * Returns -EINVAL if typed connections are disabled where they are
+ * mandatory, 0 otherwise.  A failed sysctl registration is only warned
+ * about and does not fail initialization. */
+int
+ksocknal_lib_tunables_init ()
+{
+	if (!*ksocknal_tunables.ksnd_typed_conns) {
+		int untyped_ok = 0;
+
+#if SOCKNAL_VERSION_DEBUG
+		/* protocols older than V3 are allowed to run untyped */
+		if (*ksocknal_tunables.ksnd_protocol < 3)
+			untyped_ok = 1;
+#endif
+		if (!untyped_ok) {
+			CERROR("Protocol V3.x MUST have typed connections\n");
+			return -EINVAL;
+		}
+	}
+
+	/* force zc_recv_min_nfrags up to at least 2, then cap it at
+	 * LNET_MAX_IOV */
+	if (*ksocknal_tunables.ksnd_zc_recv_min_nfrags < 2)
+		*ksocknal_tunables.ksnd_zc_recv_min_nfrags = 2;
+	if (*ksocknal_tunables.ksnd_zc_recv_min_nfrags > LNET_MAX_IOV)
+		*ksocknal_tunables.ksnd_zc_recv_min_nfrags = LNET_MAX_IOV;
+
+	ksocknal_tunables.ksnd_sysctl =
+		cfs_register_sysctl_table(ksocknal_top_ctl_table, 0);
+	if (ksocknal_tunables.ksnd_sysctl == NULL)
+		CWARN("Can't setup /proc tunables\n");
+
+	return 0;
+}
+
+/* Unregister the sysctl table, if ksocknal_lib_tunables_init() managed to
+ * register one. */
+void
+ksocknal_lib_tunables_fini ()
+{
+	if (ksocknal_tunables.ksnd_sysctl == NULL)
+		return;
+
+	unregister_sysctl_table(ksocknal_tunables.ksnd_sysctl);
+}
+#else
+int
+ksocknal_lib_tunables_init ()	/* variant used when the sysctl code above is compiled out */
+{
+	return 0;	/* nothing to validate or register */
+}
+
+void
+ksocknal_lib_tunables_fini ()	/* variant used when the sysctl code above is compiled out */
+{
+}
+#endif /* # if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM */
+
+/* Fill in @conn's peer address/port and local address from its socket.
+ * Returns 0 on success or the error from libcfs_sock_getaddr(). */
+int
+ksocknal_lib_get_conn_addrs (ksock_conn_t *conn)
+{
+	int rc;
+
+	/* peer side: IP and port */
+	rc = libcfs_sock_getaddr(conn->ksnc_sock, 1,
+				 &conn->ksnc_ipaddr, &conn->ksnc_port);
+
+	/* Didn't need the {get,put}connsock dance to deref ksnc_sock... */
+	LASSERT (!conn->ksnc_closing);
+
+	if (rc != 0) {
+		CERROR ("Error %d getting sock peer IP\n", rc);
+		return rc;
+	}
+
+	/* local side: IP only */
+	rc = libcfs_sock_getaddr(conn->ksnc_sock, 0,
+				 &conn->ksnc_myipaddr, NULL);
+	if (rc != 0)
+		CERROR ("Error %d getting sock local IP\n", rc);
+
+	return rc;
+}
+
+/* Tell whether zero-copy sends may be used on @conn: returns non-zero only
+ * for a non-V1.x connection whose socket route capabilities include both
+ * scatter/gather and a NETIF_F_ALL_CSUM bit. */
+int
+ksocknal_lib_zc_capable(ksock_conn_t *conn)
+{
+	int  caps = conn->ksnc_sock->sk->sk_route_caps;
+
+	/* never on V1.x connections */
+	if (conn->ksnc_proto == &ksocknal_protocol_v1x)
+		return 0;
+
+	/* the socket must support scatter/gather... */
+	if ((caps & NETIF_F_SG) == 0)
+		return 0;
+
+	/* ...and must not need software checksums */
+	return (caps & NETIF_F_ALL_CSUM) != 0;
+}
+
+int
+ksocknal_lib_send_iov (ksock_conn_t *conn, ksock_tx_t *tx)	/* returns whatever sock_sendmsg() returned */
+{
+	struct socket *sock = conn->ksnc_sock;
+	int	    nob;	/* total bytes described by the scratch iovs */
+	int	    rc;
+
+	if (*ksocknal_tunables.ksnd_enable_csum	&& /* checksum enabled */
+	    conn->ksnc_proto == &ksocknal_protocol_v2x && /* V2.x connection  */
+	    tx->tx_nob == tx->tx_resid		 && /* first sending    */
+	    tx->tx_msg.ksm_csum == 0)		     /* not checksummed  */
+		ksocknal_lib_csum_tx(tx);
+
+	/* NB we can't trust socket ops to either consume our iovs
+	 * or leave them alone, so a scratch copy of them is what gets
+	 * passed down. */
+
+	{
+#if SOCKNAL_SINGLE_FRAG_TX
+		struct iovec    scratch;
+		struct iovec   *scratchiov = &scratch;
+		unsigned int    niov = 1;	/* only the first fragment is sent per call */
+#else
+		struct iovec   *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
+		unsigned int    niov = tx->tx_niov;
+#endif
+		struct msghdr msg = {
+			.msg_name       = NULL,
+			.msg_namelen    = 0,
+			.msg_iov	= scratchiov,
+			.msg_iovlen     = niov,
+			.msg_control    = NULL,
+			.msg_controllen = 0,
+			.msg_flags      = MSG_DONTWAIT
+		};
+		mm_segment_t oldmm = get_fs();
+		int  i;
+
+		for (nob = i = 0; i < niov; i++) {
+			scratchiov[i] = tx->tx_iov[i];
+			nob += scratchiov[i].iov_len;
+		}
+
+		if (!list_empty(&conn->ksnc_tx_queue) ||	/* more txs are queued behind this one, or */
+		    nob < tx->tx_resid)	/* this call does not cover the rest of this tx */
+			msg.msg_flags |= MSG_MORE;
+
+		set_fs (KERNEL_DS);	/* the iovs hold kernel addresses */
+		rc = sock_sendmsg(sock, &msg, nob);
+		set_fs (oldmm);
+	}
+	return rc;
+}
+
+int
+ksocknal_lib_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx)	/* returns the result of the sendpage/sendmsg call */
+{
+	struct socket *sock = conn->ksnc_sock;
+	lnet_kiov_t   *kiov = tx->tx_kiov;
+	int	    rc;
+	int	    nob;	/* only set and used on the non-zero-copy path */
+
+	/* Not NOOP message */
+	LASSERT (tx->tx_lnetmsg != NULL);
+
+	/* NB we can't trust socket ops to either consume our iovs
+	 * or leave them alone. */
+	if (tx->tx_msg.ksm_zc_cookies[0] != 0) {
+		/* Zero copy is enabled: hand the first page fragment
+		 * straight to sendpage */
+		struct sock   *sk = sock->sk;
+		struct page   *page = kiov->kiov_page;
+		int	    offset = kiov->kiov_offset;
+		int	    fragsize = kiov->kiov_len;
+		int	    msgflg = MSG_DONTWAIT;
+
+		CDEBUG(D_NET, "page %p + offset %x for %d\n",
+			       page, offset, kiov->kiov_len);
+
+		if (!list_empty(&conn->ksnc_tx_queue) ||
+		    fragsize < tx->tx_resid)	/* this fragment is not the end of the tx */
+			msgflg |= MSG_MORE;
+
+		if (sk->sk_prot->sendpage != NULL) {
+			rc = sk->sk_prot->sendpage(sk, page,
+						   offset, fragsize, msgflg);
+		} else {
+			rc = cfs_tcp_sendpage(sk, page, offset, fragsize,
+					      msgflg);
+		}
+	} else {
+#if SOCKNAL_SINGLE_FRAG_TX || !SOCKNAL_RISK_KMAP_DEADLOCK
+		struct iovec  scratch;
+		struct iovec *scratchiov = &scratch;
+		unsigned int  niov = 1;	/* one page mapped per call */
+#else
+#ifdef CONFIG_HIGHMEM
+#warning "XXX risk of kmap deadlock on multiple frags..."
+#endif
+		struct iovec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
+		unsigned int  niov = tx->tx_nkiov;
+#endif
+		struct msghdr msg = {
+			.msg_name       = NULL,
+			.msg_namelen    = 0,
+			.msg_iov	= scratchiov,
+			.msg_iovlen     = niov,
+			.msg_control    = NULL,
+			.msg_controllen = 0,
+			.msg_flags      = MSG_DONTWAIT
+		};
+		mm_segment_t  oldmm = get_fs();
+		int	   i;
+
+		for (nob = i = 0; i < niov; i++) {
+			scratchiov[i].iov_base = kmap(kiov[i].kiov_page) +	/* paired with the kunmap() loop below */
+						 kiov[i].kiov_offset;
+			nob += scratchiov[i].iov_len = kiov[i].kiov_len;
+		}
+
+		if (!list_empty(&conn->ksnc_tx_queue) ||
+		    nob < tx->tx_resid)
+			msg.msg_flags |= MSG_MORE;
+
+		set_fs (KERNEL_DS);	/* the iovs hold kernel addresses */
+		rc = sock_sendmsg(sock, &msg, nob);
+		set_fs (oldmm);
+
+		for (i = 0; i < niov; i++)
+			kunmap(kiov[i].kiov_page);
+	}
+	return rc;
+}
+
+/* Turn on TCP_QUICKACK on @conn's socket.  The setsockopt() result is not
+ * checked. */
+void
+ksocknal_lib_eager_ack (ksock_conn_t *conn)
+{
+	struct socket *sock = conn->ksnc_sock;
+	mm_segment_t   saved_fs = get_fs();
+	int	       enable = 1;
+
+	/* Remind the socket to ACK eagerly.  If I don't, the socket might
+	 * think I'm about to send something it could piggy-back the ACK
+	 * on, introducing delay in completing zero-copy sends in my
+	 * peer. */
+
+	/* the option value lives in kernel memory */
+	set_fs(KERNEL_DS);
+	sock->ops->setsockopt (sock, SOL_TCP, TCP_QUICKACK,
+			       (char *)&enable, sizeof (enable));
+	set_fs(saved_fs);
+}
+
+int
+ksocknal_lib_recv_iov (ksock_conn_t *conn)	/* returns whatever sock_recvmsg() returned */
+{
+#if SOCKNAL_SINGLE_FRAG_RX
+	struct iovec  scratch;
+	struct iovec *scratchiov = &scratch;
+	unsigned int  niov = 1;	/* only the first fragment is filled per call */
+#else
+	struct iovec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
+	unsigned int  niov = conn->ksnc_rx_niov;
+#endif
+	struct iovec *iov = conn->ksnc_rx_iov;
+	struct msghdr msg = {
+		.msg_name       = NULL,
+		.msg_namelen    = 0,
+		.msg_iov	= scratchiov,
+		.msg_iovlen     = niov,
+		.msg_control    = NULL,
+		.msg_controllen = 0,
+		.msg_flags      = 0
+	};
+	mm_segment_t oldmm = get_fs();
+	int	  nob;	/* total bytes described by the scratch iovs */
+	int	  i;
+	int	  rc;
+	int	  fragnob;	/* bytes of the current fragment covered by this receive */
+	int	  sum;	/* received bytes not yet checksummed */
+	__u32	saved_csum;
+
+	/* NB we can't trust socket ops to either consume our iovs
+	 * or leave them alone. */
+	LASSERT (niov > 0);
+
+	for (nob = i = 0; i < niov; i++) {
+		scratchiov[i] = iov[i];
+		nob += scratchiov[i].iov_len;
+	}
+	LASSERT (nob <= conn->ksnc_rx_nob_wanted);
+
+	set_fs (KERNEL_DS);
+	rc = sock_recvmsg (conn->ksnc_sock, &msg, nob, MSG_DONTWAIT);
+	/* NB this is just a boolean..........................^ */
+	set_fs (oldmm);
+
+	saved_csum = 0;
+	if (conn->ksnc_proto == &ksocknal_protocol_v2x) {
+		saved_csum = conn->ksnc_msg.ksm_csum;
+		conn->ksnc_msg.ksm_csum = 0;	/* put back below when it was non-zero */
+	}
+
+	if (saved_csum != 0) {
+		/* accumulate checksum over the bytes just received, walking
+		 * the original (unmodified) iovs; the loop does not run when
+		 * rc <= 0 */
+		for (i = 0, sum = rc; sum > 0; i++, sum -= fragnob) {
+			LASSERT (i < niov);
+
+			fragnob = iov[i].iov_len;
+			if (fragnob > sum)
+				fragnob = sum;
+
+			conn->ksnc_rx_csum = ksocknal_csum(conn->ksnc_rx_csum,
+							   iov[i].iov_base, fragnob);
+		}
+		conn->ksnc_msg.ksm_csum = saved_csum;
+	}
+
+	return rc;
+}
+
+/* Undo a mapping made by ksocknal_lib_kiov_vmap(); a NULL @addr is
+ * ignored. */
+static void
+ksocknal_lib_kiov_vunmap(void *addr)
+{
+	if (addr != NULL)
+		vunmap(addr);
+}
+
+/* Try to map the pages of @kiov[0..@niov-1] into one virtually contiguous
+ * range so they can be received into through a single iovec.
+ *
+ * On success *@iov describes the whole range and the vmap() address is
+ * returned (release it with ksocknal_lib_kiov_vunmap()).  NULL is returned
+ * when ZC receive is disabled, @pages is NULL, there are too few
+ * fragments, the fragments do not join up at page boundaries, or vmap()
+ * fails. */
+static void *
+ksocknal_lib_kiov_vmap(lnet_kiov_t *kiov, int niov,
+		       struct iovec *iov, struct page **pages)
+{
+	void *mapped;
+	int   total = 0;
+	int   idx;
+
+	if (!*ksocknal_tunables.ksnd_zc_recv || pages == NULL)
+		return NULL;
+
+	LASSERT (niov <= LNET_MAX_IOV);
+
+	if (niov < 2)
+		return NULL;
+	if (niov < *ksocknal_tunables.ksnd_zc_recv_min_nfrags)
+		return NULL;
+
+	for (idx = 0; idx < niov; idx++) {
+		lnet_kiov_t *frag = &kiov[idx];
+
+		/* only the first fragment may start inside its page */
+		if (idx > 0 && frag->kiov_offset != 0)
+			return NULL;
+
+		/* only the last fragment may stop short of its page end */
+		if (idx < niov - 1 &&
+		    frag->kiov_offset + frag->kiov_len != PAGE_CACHE_SIZE)
+			return NULL;
+
+		pages[idx] = frag->kiov_page;
+		total += frag->kiov_len;
+	}
+
+	mapped = vmap(pages, niov, VM_MAP, PAGE_KERNEL);
+	if (mapped == NULL)
+		return NULL;
+
+	iov->iov_base = mapped + kiov[0].kiov_offset;
+	iov->iov_len = total;
+
+	return mapped;
+}
+
+int
+ksocknal_lib_recv_kiov (ksock_conn_t *conn)	/* returns whatever sock_recvmsg() returned */
+{
+#if SOCKNAL_SINGLE_FRAG_RX || !SOCKNAL_RISK_KMAP_DEADLOCK
+	struct iovec   scratch;
+	struct iovec  *scratchiov = &scratch;
+	struct page  **pages      = NULL;	/* NULL makes ksocknal_lib_kiov_vmap() return NULL */
+	unsigned int   niov       = 1;
+#else
+#ifdef CONFIG_HIGHMEM
+#warning "XXX risk of kmap deadlock on multiple frags..."
+#endif
+	struct iovec  *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
+	struct page  **pages      = conn->ksnc_scheduler->kss_rx_scratch_pgs;
+	unsigned int   niov       = conn->ksnc_rx_nkiov;
+#endif
+	lnet_kiov_t   *kiov = conn->ksnc_rx_kiov;
+	struct msghdr msg = {	/* .msg_iovlen is filled in below, once the mapping method is known */
+		.msg_name       = NULL,
+		.msg_namelen    = 0,
+		.msg_iov	= scratchiov,
+		.msg_control    = NULL,
+		.msg_controllen = 0,
+		.msg_flags      = 0
+	};
+	mm_segment_t oldmm = get_fs();
+	int	  nob;	/* total bytes the receive may fill */
+	int	  i;
+	int	  rc;
+	void	*base;
+	void	*addr;	/* vmap() address, or NULL when the pages were kmap()ed one by one */
+	int	  sum;	/* received bytes not yet checksummed */
+	int	  fragnob;
+
+	/* NB we can't trust socket ops to either consume our iovs
+	 * or leave them alone. */
+	if ((addr = ksocknal_lib_kiov_vmap(kiov, niov, scratchiov, pages)) != NULL) {
+		nob = scratchiov[0].iov_len;	/* one iovec covering all the fragments */
+		msg.msg_iovlen = 1;
+
+	} else {
+		for (nob = i = 0; i < niov; i++) {
+			nob += scratchiov[i].iov_len = kiov[i].kiov_len;
+			scratchiov[i].iov_base = kmap(kiov[i].kiov_page) +	/* unmapped at the end of the function */
+						 kiov[i].kiov_offset;
+		}
+		msg.msg_iovlen = niov;
+	}
+
+	LASSERT (nob <= conn->ksnc_rx_nob_wanted);
+
+	set_fs (KERNEL_DS);
+	rc = sock_recvmsg (conn->ksnc_sock, &msg, nob, MSG_DONTWAIT);
+	/* NB this is just a boolean.......................^ */
+	set_fs (oldmm);
+
+	if (conn->ksnc_msg.ksm_csum != 0) {
+		for (i = 0, sum = rc; sum > 0; i++, sum -= fragnob) {	/* does not run when rc <= 0 */
+			LASSERT (i < niov);
+
+			/* Dang! have to kmap again because I have nowhere to stash the
+			 * mapped address.  But by doing it while the page is still
+			 * mapped, the kernel just bumps the map count and returns me
+			 * the address it stashed. */
+			base = kmap(kiov[i].kiov_page) + kiov[i].kiov_offset;
+			fragnob = kiov[i].kiov_len;
+			if (fragnob > sum)
+				fragnob = sum;
+
+			conn->ksnc_rx_csum = ksocknal_csum(conn->ksnc_rx_csum,
+							   base, fragnob);
+
+			kunmap(kiov[i].kiov_page);
+		}
+	}
+
+	if (addr != NULL) {
+		ksocknal_lib_kiov_vunmap(addr);
+	} else {
+		for (i = 0; i < niov; i++)
+			kunmap(kiov[i].kiov_page);
+	}
+
+	return (rc);
+}
+
+/*
+ * Compute the checksum of an outgoing V2.x message and store it in the
+ * message header (tx->tx_msg.ksm_csum).
+ *
+ * The header (tx_iov[0]) is summed first with its checksum field zeroed,
+ * followed by the payload: the page fragments when tx_kiov is set, otherwise
+ * the remaining iovecs.  If the inject_csum_error tunable is non-zero the
+ * result is deliberately corrupted once and the tunable is cleared.
+ */
+void
+ksocknal_lib_csum_tx(ksock_tx_t *tx)
+{
+	__u32	crc;
+	int	n;
+
+	LASSERT(tx->tx_iov[0].iov_base == (void *)&tx->tx_msg);
+	LASSERT(tx->tx_conn != NULL);
+	LASSERT(tx->tx_conn->ksnc_proto == &ksocknal_protocol_v2x);
+
+	/* the checksum field itself is summed as zero */
+	tx->tx_msg.ksm_csum = 0;
+
+	crc = ksocknal_csum(~0, (void *)tx->tx_iov[0].iov_base,
+			    tx->tx_iov[0].iov_len);
+
+	if (tx->tx_kiov == NULL) {
+		/* payload is in the iovecs following the header */
+		for (n = 1; n < tx->tx_niov; n++)
+			crc = ksocknal_csum(crc, tx->tx_iov[n].iov_base,
+					    tx->tx_iov[n].iov_len);
+	} else {
+		/* payload is in pages: map each one just long enough to sum it */
+		for (n = 0; n < tx->tx_nkiov; n++) {
+			void *va = kmap(tx->tx_kiov[n].kiov_page) +
+				   tx->tx_kiov[n].kiov_offset;
+
+			crc = ksocknal_csum(crc, va, tx->tx_kiov[n].kiov_len);
+
+			kunmap(tx->tx_kiov[n].kiov_page);
+		}
+	}
+
+	if (*ksocknal_tunables.ksnd_inject_csum_error) {
+		/* single-shot fault injection */
+		crc++;
+		*ksocknal_tunables.ksnd_inject_csum_error = 0;
+	}
+
+	tx->tx_msg.ksm_csum = crc;
+}
+
+/*
+ * Report the socket buffer sizes and nagle setting of conn's socket.
+ *
+ * On success returns 0 with *txmem / *rxmem filled in by
+ * libcfs_sock_getbuf() and *nagle set to the inverse of TCP_NODELAY.
+ * On any failure all three outputs are zeroed; -ESHUTDOWN is returned when
+ * no reference on the connection's socket could be taken.
+ */
+int
+ksocknal_lib_get_conn_tunables (ksock_conn_t *conn, int *txmem, int *rxmem, int *nagle)
+{
+	struct socket *sock = conn->ksnc_sock;
+	mm_segment_t   saved_fs = get_fs();
+	int	       optlen;
+	int	       rc;
+
+	if (ksocknal_connsock_addref(conn) != 0) {
+		LASSERT(conn->ksnc_closing);
+		*txmem = *rxmem = *nagle = 0;
+		return -ESHUTDOWN;
+	}
+
+	rc = libcfs_sock_getbuf(sock, txmem, rxmem);
+	if (rc == 0) {
+		optlen = sizeof(*nagle);
+
+		/* option buffer lives in kernel space */
+		set_fs(KERNEL_DS);
+		rc = sock->ops->getsockopt(sock, SOL_TCP, TCP_NODELAY,
+					   (char *)nagle, &optlen);
+		set_fs(saved_fs);
+	}
+
+	ksocknal_connsock_decref(conn);
+
+	if (rc != 0) {
+		*txmem = *rxmem = *nagle = 0;
+		return rc;
+	}
+
+	/* TCP_NODELAY set means nagle is off, and vice versa */
+	*nagle = !*nagle;
+	return 0;
+}
+
+/*
+ * Apply socklnd's standard options to a socket.
+ *
+ * In order: GFP_NOFS allocation for the socket, linger off, TCP_LINGER2 set
+ * to -1, TCP_NODELAY (only when the 'nagle' tunable is zero), tx/rx buffer
+ * sizes, and keepalive.  Keepalive is enabled only when the idle, count and
+ * interval tunables are all positive; otherwise SO_KEEPALIVE is cleared and
+ * the TCP_KEEP* options are left alone.
+ *
+ * Returns 0 on success, or the error from the first option that could not
+ * be set (after logging it); later options are not attempted.
+ */
+int
+ksocknal_lib_setup_sock (struct socket *sock)
+{
+	mm_segment_t    oldmm = get_fs ();
+	int	     rc;
+	int	     option;
+	int	     keep_idle;
+	int	     keep_intvl;
+	int	     keep_count;
+	int	     do_keepalive;
+	struct linger   linger;
+
+	sock->sk->sk_allocation = GFP_NOFS;
+
+	/* Ensure this socket aborts active sends immediately when we close
+	 * it. */
+
+	linger.l_onoff = 0;
+	linger.l_linger = 0;
+
+	/* every option buffer below is a kernel address, hence the
+	 * set_fs(KERNEL_DS) / set_fs(oldmm) bracket around each call */
+	set_fs (KERNEL_DS);
+	rc = sock_setsockopt (sock, SOL_SOCKET, SO_LINGER,
+			      (char *)&linger, sizeof (linger));
+	set_fs (oldmm);
+	if (rc != 0) {
+		CERROR ("Can't set SO_LINGER: %d\n", rc);
+		return (rc);
+	}
+
+	option = -1;
+	set_fs (KERNEL_DS);
+	rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_LINGER2,
+				    (char *)&option, sizeof (option));
+	set_fs (oldmm);
+	if (rc != 0) {
+		/* name the option that actually failed (was "SO_LINGER2") */
+		CERROR ("Can't set TCP_LINGER2: %d\n", rc);
+		return (rc);
+	}
+
+	if (!*ksocknal_tunables.ksnd_nagle) {
+		option = 1;
+
+		set_fs (KERNEL_DS);
+		rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_NODELAY,
+					    (char *)&option, sizeof (option));
+		set_fs (oldmm);
+		if (rc != 0) {
+			CERROR ("Can't disable nagle: %d\n", rc);
+			return (rc);
+		}
+	}
+
+	rc = libcfs_sock_setbuf(sock,
+				*ksocknal_tunables.ksnd_tx_buffer_size,
+				*ksocknal_tunables.ksnd_rx_buffer_size);
+	if (rc != 0) {
+		CERROR ("Can't set buffer tx %d, rx %d buffers: %d\n",
+			*ksocknal_tunables.ksnd_tx_buffer_size,
+			*ksocknal_tunables.ksnd_rx_buffer_size, rc);
+		return (rc);
+	}
+
+/* TCP_BACKOFF_* sockopt tunables unsupported in stock kernels */
+
+	/* snapshot tunables: they are writable at runtime, so read each one
+	 * once and use the same values for the decision and the setsockopts */
+	keep_idle  = *ksocknal_tunables.ksnd_keepalive_idle;
+	keep_count = *ksocknal_tunables.ksnd_keepalive_count;
+	keep_intvl = *ksocknal_tunables.ksnd_keepalive_intvl;
+
+	do_keepalive = (keep_idle > 0 && keep_count > 0 && keep_intvl > 0);
+
+	option = (do_keepalive ? 1 : 0);
+	set_fs (KERNEL_DS);
+	rc = sock_setsockopt (sock, SOL_SOCKET, SO_KEEPALIVE,
+			      (char *)&option, sizeof (option));
+	set_fs (oldmm);
+	if (rc != 0) {
+		CERROR ("Can't set SO_KEEPALIVE: %d\n", rc);
+		return (rc);
+	}
+
+	if (!do_keepalive)
+		return (0);
+
+	set_fs (KERNEL_DS);
+	rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_KEEPIDLE,
+				    (char *)&keep_idle, sizeof (keep_idle));
+	set_fs (oldmm);
+	if (rc != 0) {
+		CERROR ("Can't set TCP_KEEPIDLE: %d\n", rc);
+		return (rc);
+	}
+
+	set_fs (KERNEL_DS);
+	rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_KEEPINTVL,
+				    (char *)&keep_intvl, sizeof (keep_intvl));
+	set_fs (oldmm);
+	if (rc != 0) {
+		CERROR ("Can't set TCP_KEEPINTVL: %d\n", rc);
+		return (rc);
+	}
+
+	set_fs (KERNEL_DS);
+	rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_KEEPCNT,
+				    (char *)&keep_count, sizeof (keep_count));
+	set_fs (oldmm);
+	if (rc != 0) {
+		CERROR ("Can't set TCP_KEEPCNT: %d\n", rc);
+		return (rc);
+	}
+
+	return (0);
+}
+
+/*
+ * Briefly force no-delay behaviour on conn's socket and then put the
+ * previous nonagle setting back.  (Presumably this flushes data the stack
+ * is holding back, as the function name suggests.)
+ *
+ * Does nothing if no reference on the connection's socket can be taken,
+ * i.e. the connection is being shut down.
+ */
+void
+ksocknal_lib_push_conn (ksock_conn_t *conn)
+{
+	mm_segment_t	 saved_fs;
+	struct tcp_sock *tp;
+	struct sock	*sk;
+	int		 saved_nonagle;
+	int		 on = 1;
+	int		 rc;
+
+	if (ksocknal_connsock_addref(conn) != 0)	/* being shut down */
+		return;
+
+	sk = conn->ksnc_sock->sk;
+	tp = tcp_sk(sk);
+
+	/* remember the current setting and switch nagle off */
+	lock_sock(sk);
+	saved_nonagle = tp->nonagle;
+	tp->nonagle = 1;
+	release_sock(sk);
+
+	saved_fs = get_fs();
+	set_fs(KERNEL_DS);
+
+	rc = sk->sk_prot->setsockopt(sk, SOL_TCP, TCP_NODELAY,
+				     (char *)&on, sizeof(on));
+	LASSERT(rc == 0);
+
+	set_fs(saved_fs);
+
+	/* restore whatever was configured before */
+	lock_sock(sk);
+	tp->nonagle = saved_nonagle;
+	release_sock(sk);
+
+	ksocknal_connsock_decref(conn);
+}
+
+extern void ksocknal_read_callback (ksock_conn_t *conn);
+extern void ksocknal_write_callback (ksock_conn_t *conn);
+/*
+ * socket call back in Linux
+ */
+/*
+ * sk_data_ready hook installed on socklnd sockets.
+ *
+ * Runs under a read lock on ksnd_global_lock so that it interleaves
+ * correctly with sockets being closed.  If the socket no longer has a conn
+ * attached, the call is forwarded to the callback currently installed on
+ * the socket; otherwise the conn's read callback is invoked.
+ */
+static void
+ksocknal_data_ready (struct sock *sk, int n)
+{
+	ksock_conn_t  *conn;
+	ENTRY;
+
+	LASSERT(!in_irq());
+	read_lock(&ksocknal_data.ksnd_global_lock);
+
+	conn = sk->sk_user_data;
+	if (conn != NULL) {
+		ksocknal_read_callback(conn);
+	} else {
+		/* raced with ksocknal_terminate_conn: our hook has already
+		 * been replaced, so hand over to the current one */
+		LASSERT(sk->sk_data_ready != &ksocknal_data_ready);
+		sk->sk_data_ready(sk, n);
+	}
+
+	read_unlock(&ksocknal_data.ksnd_global_lock);
+
+	EXIT;
+}
+
+/*
+ * sk_write_space hook installed on socklnd sockets.
+ *
+ * Runs under a read lock on ksnd_global_lock so that it interleaves
+ * correctly with sockets being closed.  If the socket no longer has a conn
+ * attached, the call is forwarded to the callback currently installed on
+ * the socket.  Otherwise, once the free send space reaches the socket's
+ * minimum, the conn's write callback is invoked and SOCK_NOSPACE is cleared.
+ */
+static void
+ksocknal_write_space (struct sock *sk)
+{
+	ksock_conn_t  *conn;
+	int	       avail;
+	int	       low_water;
+
+	LASSERT(!in_irq());
+	read_lock(&ksocknal_data.ksnd_global_lock);
+
+	conn = sk->sk_user_data;
+	avail = SOCKNAL_WSPACE(sk);
+	low_water = SOCKNAL_MIN_WSPACE(sk);
+
+	CDEBUG(D_NET, "sk %p wspace %d low water %d conn %p%s%s%s\n",
+	       sk, avail, low_water, conn,
+	       (conn == NULL) ? "" :
+			(conn->ksnc_tx_ready ? " ready" : " blocked"),
+	       (conn == NULL) ? "" :
+			(conn->ksnc_tx_scheduled ? " scheduled" : " idle"),
+	       (conn == NULL) ? "" :
+			(list_empty(&conn->ksnc_tx_queue) ? " empty" : " queued"));
+
+	if (conn == NULL) {
+		/* raced with ksocknal_terminate_conn: our hook has already
+		 * been replaced, so hand over to the current one */
+		LASSERT(sk->sk_write_space != &ksocknal_write_space);
+		sk->sk_write_space(sk);
+	} else if (avail >= low_water) {
+		/* got enough space */
+		ksocknal_write_callback(conn);
+
+		/* Clear SOCK_NOSPACE only _after_ ksocknal_write_callback;
+		 * the ENOMEM check in ksocknal_transmit relies on this
+		 * ordering to be race-free. */
+		clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+	}
+
+	read_unlock(&ksocknal_data.ksnd_global_lock);
+}
+
+/*
+ * Record the socket's current data_ready / write_space callbacks in conn so
+ * they can be put back later.
+ */
+void
+ksocknal_lib_save_callback(struct socket *sock, ksock_conn_t *conn)
+{
+	struct sock *sk = sock->sk;
+
+	conn->ksnc_saved_data_ready = sk->sk_data_ready;
+	conn->ksnc_saved_write_space = sk->sk_write_space;
+}
+
+/*
+ * Attach conn to the socket and install socklnd's data_ready / write_space
+ * callbacks on it.
+ */
+void
+ksocknal_lib_set_callback(struct socket *sock,  ksock_conn_t *conn)
+{
+	struct sock *sk = sock->sk;
+
+	sk->sk_user_data = conn;
+	sk->sk_data_ready = ksocknal_data_ready;
+	sk->sk_write_space = ksocknal_write_space;
+}
+
+/*
+ * Detach conn from the socket: put back the callbacks saved in conn and
+ * clear sk_user_data.
+ */
+void
+ksocknal_lib_reset_callback(struct socket *sock, ksock_conn_t *conn)
+{
+	struct sock *sk = sock->sk;
+
+	/* The saved callbacks _must_ be restored (a noop would not do):
+	 * the socket could survive past this module being unloaded!! */
+	sk->sk_data_ready = conn->ksnc_saved_data_ready;
+	sk->sk_write_space = conn->ksnc_saved_write_space;
+
+	/* A callback may already be running; callbacks hold a read lock on
+	 * ksnd_global_lock (to serialise with me) and NOOP once they see
+	 * sk_user_data == NULL. */
+	sk->sk_user_data = NULL;
+}
+
+/*
+ * Decide whether a stalled send on conn must be retried by the caller.
+ *
+ * SOCK_NOSPACE is set when the socket fills and cleared in the write_space
+ * callback (which also sets ksnc_tx_ready).  If both SOCK_NOSPACE and
+ * ksnc_tx_ready are zero, the socket was not filled and write_space will not
+ * reschedule the conn, so -ENOMEM is returned to make the caller retry after
+ * a timeout.  Otherwise 0 is returned.
+ *
+ * The check is made under the scheduler's kss_lock.
+ */
+int
+ksocknal_lib_memory_pressure(ksock_conn_t *conn)
+{
+	ksock_sched_t *sched = conn->ksnc_scheduler;
+	int	       rc;
+
+	spin_lock_bh(&sched->kss_lock);
+
+	if (SOCK_TEST_NOSPACE(conn->ksnc_sock) || conn->ksnc_tx_ready)
+		rc = 0;
+	else
+		rc = -ENOMEM;
+
+	spin_unlock_bh(&sched->kss_lock);
+
+	return rc;
+}

diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.h b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.h
new file mode 100644
index 0000000..3c13578
--- /dev/null
+++ b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.h

@@ -0,0 +1,91 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_PORTAL_ALLOC
+
+#ifndef __LINUX_SOCKNAL_LIB_H__
+#define __LINUX_SOCKNAL_LIB_H__
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/version.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <linux/uio.h>
+#include <linux/if.h>
+
+#include <asm/uaccess.h>
+#include <asm/irq.h>
+
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/list.h>
+#include <linux/kmod.h>
+#include <linux/sysctl.h>
+#include <asm/uaccess.h>
+#include <asm/div64.h>
+#include <linux/syscalls.h>
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/libcfs/linux/portals_compat25.h>
+
+#include <linux/crc32.h>
+/*
+ * Fold len bytes at p into the running message checksum crc and return the
+ * updated value.  The active implementation is crc32_le(); a simple
+ * byte-sum variant is kept below, compiled out.
+ */
+static inline __u32 ksocknal_csum(__u32 crc, unsigned char const *p, size_t len)
+{
+#if 0
+	for (; len > 0; len--, p++)
+		crc = ((crc + 0x100) & ~0xff) | ((crc + *p) & 0xff);
+	return crc;
+#else
+	return crc32_le(crc, p, len);
+#endif
+}
+
+#define SOCKNAL_WSPACE(sk)       sk_stream_wspace(sk)
+#define SOCKNAL_MIN_WSPACE(sk)   sk_stream_min_wspace(sk)
+
+/* assume one thread for each connection type */
+#define SOCKNAL_NSCHEDS		3
+#define SOCKNAL_NSCHEDS_HIGH	(SOCKNAL_NSCHEDS << 1)
+
+#endif

diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_modparams.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_modparams.c
new file mode 100644
index 0000000..8a474f6
--- /dev/null
+++ b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_modparams.c

@@ -0,0 +1,198 @@
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ *
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include "socklnd.h"
+
+/*
+ * Module parameters.  Each tunable is a file-scope variable registered via
+ * CFS_MODULE_PARM; the addresses are published through ksocknal_tunables by
+ * ksocknal_tunables_init().  Statics without an initialiser start at zero.
+ */
+
+/* timeouts and credits */
+static int sock_timeout = 50;
+CFS_MODULE_PARM(sock_timeout, "i", int, 0644,
+		"dead socket timeout (seconds)");
+
+static int credits = 256;
+CFS_MODULE_PARM(credits, "i", int, 0444,
+		"# concurrent sends");
+
+static int peer_credits = 8;
+CFS_MODULE_PARM(peer_credits, "i", int, 0444,
+		"# concurrent sends to 1 peer");
+
+static int peer_buffer_credits;
+CFS_MODULE_PARM(peer_buffer_credits, "i", int, 0444,
+		"# per-peer router buffer credits");
+
+static int peer_timeout = 180;
+CFS_MODULE_PARM(peer_timeout, "i", int, 0444,
+		"Seconds without aliveness news to declare peer dead (<=0 to disable)");
+
+/* thread pools */
+
+/* Number of daemons in each thread pool which is percpt,
+ * we will estimate reasonable value based on CPUs if it's not set. */
+static unsigned int nscheds;
+CFS_MODULE_PARM(nscheds, "i", int, 0444,
+		"# scheduler daemons in each pool while starting");
+
+static int nconnds = 4;
+CFS_MODULE_PARM(nconnds, "i", int, 0444,
+		"# connection daemons while starting");
+
+static int nconnds_max = 64;
+CFS_MODULE_PARM(nconnds_max, "i", int, 0444,
+		"max # connection daemons");
+
+/* connection establishment and typing */
+static int min_reconnectms = 1000;
+CFS_MODULE_PARM(min_reconnectms, "i", int, 0644,
+		"min connection retry interval (mS)");
+
+static int max_reconnectms = 60000;
+CFS_MODULE_PARM(max_reconnectms, "i", int, 0644,
+		"max connection retry interval (mS)");
+
+#define DEFAULT_EAGER_ACK 0
+static int eager_ack = DEFAULT_EAGER_ACK;
+CFS_MODULE_PARM(eager_ack, "i", int, 0644,
+		"send tcp ack packets eagerly");
+
+static int typed_conns = 1;
+CFS_MODULE_PARM(typed_conns, "i", int, 0444,
+		"use different sockets for bulk");
+
+static int min_bulk = (1<<10);
+CFS_MODULE_PARM(min_bulk, "i", int, 0644,
+		"smallest 'large' message");
+
+/* socket options */
+#define DEFAULT_BUFFER_SIZE 0
+static int tx_buffer_size = DEFAULT_BUFFER_SIZE;
+CFS_MODULE_PARM(tx_buffer_size, "i", int, 0644,
+		"socket tx buffer size (0 for system default)");
+
+static int rx_buffer_size = DEFAULT_BUFFER_SIZE;
+CFS_MODULE_PARM(rx_buffer_size, "i", int, 0644,
+		"socket rx buffer size (0 for system default)");
+
+static int nagle;
+CFS_MODULE_PARM(nagle, "i", int, 0644,
+		"enable NAGLE?");
+
+static int round_robin = 1;
+CFS_MODULE_PARM(round_robin, "i", int, 0644,
+		"Round robin for multiple interfaces");
+
+/* keepalive */
+static int keepalive = 30;
+CFS_MODULE_PARM(keepalive, "i", int, 0644,
+		"# seconds before send keepalive");
+
+static int keepalive_idle = 30;
+CFS_MODULE_PARM(keepalive_idle, "i", int, 0644,
+		"# idle seconds before probe");
+
+#define DEFAULT_KEEPALIVE_COUNT  5
+static int keepalive_count = DEFAULT_KEEPALIVE_COUNT;
+CFS_MODULE_PARM(keepalive_count, "i", int, 0644,
+		"# missed probes == dead");
+
+static int keepalive_intvl = 5;
+CFS_MODULE_PARM(keepalive_intvl, "i", int, 0644,
+		"seconds between probes");
+
+/* checksumming */
+static int enable_csum;
+CFS_MODULE_PARM(enable_csum, "i", int, 0644,
+		"enable check sum");
+
+static int inject_csum_error;
+CFS_MODULE_PARM(inject_csum_error, "i", int, 0644,
+		"set non-zero to inject a checksum error");
+
+/* zero copy */
+static int nonblk_zcack = 1;
+CFS_MODULE_PARM(nonblk_zcack, "i", int, 0644,
+		"always send ZC-ACK on non-blocking connection");
+
+static unsigned int zc_min_payload = (16 << 10);
+CFS_MODULE_PARM(zc_min_payload, "i", int, 0644,
+		"minimum payload size to zero copy");
+
+static unsigned int zc_recv;
+CFS_MODULE_PARM(zc_recv, "i", int, 0644,
+		"enable ZC recv for Chelsio driver");
+
+static unsigned int zc_recv_min_nfrags = 16;
+CFS_MODULE_PARM(zc_recv_min_nfrags, "i", int, 0644,
+		"minimum # of fragments to enable ZC recv");
+
+#if SOCKNAL_VERSION_DEBUG
+static int protocol = 3;
+CFS_MODULE_PARM(protocol, "i", int, 0644,
+		"protocol version");
+#endif
+
+/* pointers to all of the above; filled in by ksocknal_tunables_init() */
+ksock_tunables_t ksocknal_tunables;
+
+/*
+ * Point every member of ksocknal_tunables at its module parameter, clamp
+ * zc_min_payload to at least 2KB, then run the platform-specific tunables
+ * initialisation.
+ *
+ * Returns the result of ksocknal_lib_tunables_init().
+ */
+int ksocknal_tunables_init(void)
+{
+	/* initialize ksocknal_tunables structure */
+	ksocknal_tunables.ksnd_timeout	    = &sock_timeout;
+	ksocknal_tunables.ksnd_nscheds		  = &nscheds;
+	ksocknal_tunables.ksnd_nconnds	    = &nconnds;
+	ksocknal_tunables.ksnd_nconnds_max	= &nconnds_max;
+	ksocknal_tunables.ksnd_min_reconnectms    = &min_reconnectms;
+	ksocknal_tunables.ksnd_max_reconnectms    = &max_reconnectms;
+	ksocknal_tunables.ksnd_eager_ack	  = &eager_ack;
+	ksocknal_tunables.ksnd_typed_conns	= &typed_conns;
+	ksocknal_tunables.ksnd_min_bulk	   = &min_bulk;
+	ksocknal_tunables.ksnd_tx_buffer_size     = &tx_buffer_size;
+	ksocknal_tunables.ksnd_rx_buffer_size     = &rx_buffer_size;
+	ksocknal_tunables.ksnd_nagle	      = &nagle;
+	ksocknal_tunables.ksnd_round_robin	= &round_robin;
+	ksocknal_tunables.ksnd_keepalive	  = &keepalive;
+	ksocknal_tunables.ksnd_keepalive_idle     = &keepalive_idle;
+	ksocknal_tunables.ksnd_keepalive_count    = &keepalive_count;
+	ksocknal_tunables.ksnd_keepalive_intvl    = &keepalive_intvl;
+	ksocknal_tunables.ksnd_credits	    = &credits;
+	ksocknal_tunables.ksnd_peertxcredits      = &peer_credits;
+	ksocknal_tunables.ksnd_peerrtrcredits     = &peer_buffer_credits;
+	ksocknal_tunables.ksnd_peertimeout	= &peer_timeout;
+	ksocknal_tunables.ksnd_enable_csum	= &enable_csum;
+	ksocknal_tunables.ksnd_inject_csum_error  = &inject_csum_error;
+	ksocknal_tunables.ksnd_nonblk_zcack       = &nonblk_zcack;
+	ksocknal_tunables.ksnd_zc_min_payload     = &zc_min_payload;
+	ksocknal_tunables.ksnd_zc_recv	    = &zc_recv;
+	ksocknal_tunables.ksnd_zc_recv_min_nfrags = &zc_recv_min_nfrags;
+
+#if SOCKNAL_VERSION_DEBUG
+	ksocknal_tunables.ksnd_protocol	   = &protocol;
+#endif
+
+#if defined(CONFIG_SYSCTL) && !CFS_SYSFS_MODULE_PARM
+	ksocknal_tunables.ksnd_sysctl	     =  NULL;
+#endif
+
+	/* never zero-copy payloads smaller than 2KB, whatever was requested */
+	if (*ksocknal_tunables.ksnd_zc_min_payload < (2 << 10))
+		*ksocknal_tunables.ksnd_zc_min_payload = (2 << 10);
+
+	/* initialize platform-specific tunables */
+	return ksocknal_lib_tunables_init();
+}
+
+/* Tunables cleanup: delegates entirely to the platform-specific routine. */
+void
+ksocknal_tunables_fini(void)
+{
+	ksocknal_lib_tunables_fini();
+}

diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_proto.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_proto.c
new file mode 100644
index 0000000..ec57179
--- /dev/null
+++ b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_proto.c

@@ -0,0 +1,797 @@
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ *
+ *   Author: Zach Brown <zab@zabbo.net>
+ *   Author: Peter J. Braam <braam@clusterfs.com>
+ *   Author: Phil Schwan <phil@clusterfs.com>
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include "socklnd.h"
+
+/*
+ * Protocol entries :
+ *   pro_send_hello       : send hello message
+ *   pro_recv_hello       : receive hello message
+ *   pro_pack	     : pack message header
+ *   pro_unpack	   : unpack message header
+ *   pro_queue_tx_zcack() : Called holding BH lock: kss_lock
+ *			  return 1 if ACK is piggybacked, otherwise return 0
+ *   pro_queue_tx_msg()   : Called holding BH lock: kss_lock
+ *			  return the ACK tx that was piggybacked on my message, or NULL
+ *   pro_handle_zcreq()   : handler of incoming ZC-REQ
+ *   pro_handle_zcack()   : handler of incoming ZC-ACK
+ *   pro_match_tx()       : Called holding glock
+ */
+
+/*
+ * V1.x: append tx_msg to the tail of the conn's tx queue.
+ * Always returns NULL.
+ */
+static ksock_tx_t *
+ksocknal_queue_tx_msg_v1(ksock_conn_t *conn, ksock_tx_t *tx_msg)
+{
+	struct list_head *queue = &conn->ksnc_tx_queue;
+
+	list_add_tail(&tx_msg->tx_list, queue);
+	return NULL;
+}
+
+/*
+ * Advance conn->ksnc_tx_carrier (the tx that can carry a ZC-ACK or LNet
+ * message) to the tx queued after it, or to NULL if it was the last one
+ * on the queue.
+ *
+ * Called holding BH lock: conn->ksnc_scheduler->kss_lock.  The queue must
+ * be non-empty and a carrier must currently be set.
+ */
+void
+ksocknal_next_tx_carrier(ksock_conn_t *conn)
+{
+	ksock_tx_t *cur = conn->ksnc_tx_carrier;
+	ksock_tx_t *next;
+
+	LASSERT(!list_empty(&conn->ksnc_tx_queue));
+	LASSERT(cur != NULL);
+
+	if (cur->tx_list.next == &conn->ksnc_tx_queue) {
+		/* no more packets queued */
+		conn->ksnc_tx_carrier = NULL;
+		return;
+	}
+
+	next = list_entry(cur->tx_list.next, ksock_tx_t, tx_list);
+	conn->ksnc_tx_carrier = next;
+	LASSERT(next->tx_msg.ksm_type == cur->tx_msg.ksm_type);
+}
+
+/*
+ * V2.x: enqueue a ZC-ACK, or piggyback its cookie on a queued LNet message.
+ *
+ * The ACK is given either as a NOOP tx (tx_ack != NULL, whose
+ * ksm_zc_cookies[1] is then used) or as a bare cookie (tx_ack == NULL).
+ *
+ * . If there is no carrier, or the carrier is itself a NOOP, nothing can
+ *   piggyback the cookie: tx_ack (if any) is appended to the tx queue and
+ *   0 is returned.  tx_ack becomes the carrier if there was none.
+ * . Otherwise the carrier is an LNet message: the cookie is stored in it,
+ *   the carrier moves on to the next tx and 1 is returned.
+ */
+static int
+ksocknal_queue_tx_zcack_v2(ksock_conn_t *conn,
+			   ksock_tx_t *tx_ack, __u64 cookie)
+{
+	ksock_tx_t *carrier = conn->ksnc_tx_carrier;
+
+	LASSERT(tx_ack == NULL ||
+		tx_ack->tx_msg.ksm_type == KSOCK_MSG_NOOP);
+
+	if (carrier == NULL ||
+	    carrier->tx_msg.ksm_type == KSOCK_MSG_NOOP) {
+		/* nothing queued, or only a noop zc-ack which can't take
+		 * another cookie: the ACK has to travel on its own */
+		if (tx_ack != NULL) {
+			list_add_tail(&tx_ack->tx_list, &conn->ksnc_tx_queue);
+			if (carrier == NULL)
+				conn->ksnc_tx_carrier = tx_ack;
+		}
+		return 0;
+	}
+
+	LASSERT(carrier->tx_msg.ksm_type == KSOCK_MSG_LNET);
+	LASSERT(carrier->tx_msg.ksm_zc_cookies[1] == 0);
+
+	/* piggyback the zc-ack cookie */
+	carrier->tx_msg.ksm_zc_cookies[1] =
+		(tx_ack != NULL) ? tx_ack->tx_msg.ksm_zc_cookies[1] : cookie;
+
+	/* move on to the next TX which can carry cookie */
+	ksocknal_next_tx_carrier(conn);
+
+	return 1;
+}
+
+/*
+ * V2.x: enqueue the LNet message tx_msg.
+ *
+ * . If there is no carrier, or the carrier is an LNet message, tx_msg is
+ *   appended to the tx queue and NULL is returned.  tx_msg becomes the
+ *   carrier if there was none.
+ * . If the carrier is a NOOP (a queued ZC-ACK), tx_msg takes over its cookie
+ *   and its place in the queue; the NOOP tx is unlinked and returned.
+ */
+static ksock_tx_t *
+ksocknal_queue_tx_msg_v2(ksock_conn_t *conn, ksock_tx_t *tx_msg)
+{
+	ksock_tx_t *carrier = conn->ksnc_tx_carrier;
+
+	if (carrier == NULL ||
+	    carrier->tx_msg.ksm_type == KSOCK_MSG_LNET) {
+		/* nothing on queue, or nothing to carry */
+		list_add_tail(&tx_msg->tx_list, &conn->ksnc_tx_queue);
+		if (carrier == NULL)
+			conn->ksnc_tx_carrier = tx_msg;
+		return NULL;
+	}
+
+	LASSERT(carrier->tx_msg.ksm_type == KSOCK_MSG_NOOP);
+
+	/* There is a noop zc-ack that can be piggybacked */
+	tx_msg->tx_msg.ksm_zc_cookies[1] = carrier->tx_msg.ksm_zc_cookies[1];
+
+	/* advance the carrier before touching the list, while the noop is
+	 * still linked in its original position */
+	ksocknal_next_tx_carrier(conn);
+
+	/* put tx_msg where the noop zc-ack packet was */
+	list_add(&tx_msg->tx_list, &carrier->tx_list);
+	list_del(&carrier->tx_list);
+
+	return carrier;
+}
+
+/*
+ * V3.x: enqueue a ZC-ACK, or merge its cookie into the current carrier.
+ *
+ * The ACK is given either as a NOOP tx (tx_ack != NULL, whose
+ * ksm_zc_cookies[1] is then used) or as a bare cookie (tx_ack == NULL).
+ * Connections that are not of type SOCKLND_CONN_ACK are handled exactly as
+ * in V2.x.
+ *
+ * As used below, the carrier's two cookie slots hold one of:
+ *   [0] == 0          : a single cookie, in [1]
+ *   [0] >  [1]        : two separate cookies
+ *   [0] <  [1]        : an inclusive range of cookies
+ *
+ * Returns 1 if the cookie needs no tx of its own (merged into the carrier,
+ * a keepalive PING that can be ignored, or a duplicate), and 0 otherwise.
+ * In the 0 case tx_ack, if supplied, has been appended to the tx queue.
+ */
+static int
+ksocknal_queue_tx_zcack_v3(ksock_conn_t *conn,
+			   ksock_tx_t *tx_ack, __u64 cookie)
+{
+	ksock_tx_t *tx;
+
+	if (conn->ksnc_type != SOCKLND_CONN_ACK)
+		return ksocknal_queue_tx_zcack_v2(conn, tx_ack, cookie);
+
+	/* non-blocking ZC-ACK (to router) */
+	LASSERT (tx_ack == NULL ||
+		 tx_ack->tx_msg.ksm_type == KSOCK_MSG_NOOP);
+
+	if ((tx = conn->ksnc_tx_carrier) == NULL) {
+		/* nothing to merge with: tx_ack (if any) becomes the carrier */
+		if (tx_ack != NULL) {
+			list_add_tail(&tx_ack->tx_list,
+					  &conn->ksnc_tx_queue);
+			conn->ksnc_tx_carrier = tx_ack;
+		}
+		return 0;
+	}
+
+	/* conn->ksnc_tx_carrier != NULL */
+
+	if (tx_ack != NULL)
+		cookie = tx_ack->tx_msg.ksm_zc_cookies[1];
+
+	if (cookie == SOCKNAL_KEEPALIVE_PING) /* ignore keepalive PING */
+		return 1;
+
+	if (tx->tx_msg.ksm_zc_cookies[1] == SOCKNAL_KEEPALIVE_PING) {
+		/* replace the keepalive PING with a real ACK */
+		LASSERT (tx->tx_msg.ksm_zc_cookies[0] == 0);
+		tx->tx_msg.ksm_zc_cookies[1] = cookie;
+		return 1;
+	}
+
+	if (cookie == tx->tx_msg.ksm_zc_cookies[0] ||
+	    cookie == tx->tx_msg.ksm_zc_cookies[1]) {
+		CWARN("%s: duplicated ZC cookie: "LPU64"\n",
+		      libcfs_id2str(conn->ksnc_peer->ksnp_id), cookie);
+		return 1; /* XXX return error in the future */
+	}
+
+	if (tx->tx_msg.ksm_zc_cookies[0] == 0) {
+		/* NOOP tx has only one ZC-ACK cookie, can carry at least one more */
+		/* store the pair with the larger cookie in [0] */
+		if (tx->tx_msg.ksm_zc_cookies[1] > cookie) {
+			tx->tx_msg.ksm_zc_cookies[0] = tx->tx_msg.ksm_zc_cookies[1];
+			tx->tx_msg.ksm_zc_cookies[1] = cookie;
+		} else {
+			tx->tx_msg.ksm_zc_cookies[0] = cookie;
+		}
+
+		if (tx->tx_msg.ksm_zc_cookies[0] - tx->tx_msg.ksm_zc_cookies[1] > 2) {
+			/* not likely to carry more ACKs, skip it to simplify logic */
+			ksocknal_next_tx_carrier(conn);
+		}
+
+		return 1;
+	}
+
+	/* takes two or more cookies already */
+
+	if (tx->tx_msg.ksm_zc_cookies[0] > tx->tx_msg.ksm_zc_cookies[1]) {
+		/* tmp becomes the middle value of a 3-cookie range, if the
+		 * new cookie turns the pair into one; 0 means it does not */
+		__u64   tmp = 0;
+
+		/* two separated cookies: (a+2, a) or (a+1, a) */
+		LASSERT (tx->tx_msg.ksm_zc_cookies[0] -
+			 tx->tx_msg.ksm_zc_cookies[1] <= 2);
+
+		if (tx->tx_msg.ksm_zc_cookies[0] -
+		    tx->tx_msg.ksm_zc_cookies[1] == 2) {
+			/* (a+2, a): only a+1 fills the gap */
+			if (cookie == tx->tx_msg.ksm_zc_cookies[1] + 1)
+				tmp = cookie;
+		} else if (cookie == tx->tx_msg.ksm_zc_cookies[1] - 1) {
+			/* (a+1, a) extended downwards by a-1 */
+			tmp = tx->tx_msg.ksm_zc_cookies[1];
+		} else if (cookie == tx->tx_msg.ksm_zc_cookies[0] + 1) {
+			/* (a+1, a) extended upwards by a+2 */
+			tmp = tx->tx_msg.ksm_zc_cookies[0];
+		}
+
+		if (tmp != 0) {
+			/* range of cookies */
+			tx->tx_msg.ksm_zc_cookies[0] = tmp - 1;
+			tx->tx_msg.ksm_zc_cookies[1] = tmp + 1;
+			return 1;
+		}
+
+	} else {
+		/* ksm_zc_cookies[0] < ksm_zc_cookies[1], it is range of cookies */
+		if (cookie >= tx->tx_msg.ksm_zc_cookies[0] &&
+		    cookie <= tx->tx_msg.ksm_zc_cookies[1]) {
+			CWARN("%s: duplicated ZC cookie: "LPU64"\n",
+			      libcfs_id2str(conn->ksnc_peer->ksnp_id), cookie);
+			return 1; /* XXX: return error in the future */
+		}
+
+		/* grow the range by one at its upper end ... */
+		if (cookie == tx->tx_msg.ksm_zc_cookies[1] + 1) {
+			tx->tx_msg.ksm_zc_cookies[1] = cookie;
+			return 1;
+		}
+
+		/* ... or at its lower end */
+		if (cookie == tx->tx_msg.ksm_zc_cookies[0] - 1) {
+			tx->tx_msg.ksm_zc_cookies[0] = cookie;
+			return 1;
+		}
+	}
+
+	/* failed to piggyback ZC-ACK */
+	if (tx_ack != NULL) {
+		list_add_tail(&tx_ack->tx_list, &conn->ksnc_tx_queue);
+		/* the next tx can piggyback at least 1 ACK */
+		ksocknal_next_tx_carrier(conn);
+	}
+
+	return 0;
+}
+
+/*
+ * Decide how well tx fits conn, based on the connection type and the size
+ * of the message on the wire (V1.x / V2.x).
+ *
+ * A NULL tx, or a tx without an LNet message, is sized as a noop packet.
+ * Returns SOCKNAL_MATCH_YES or SOCKNAL_MATCH_MAY.  nonblk is not used here.
+ */
+static int
+ksocknal_match_tx(ksock_conn_t *conn, ksock_tx_t *tx, int nonblk)
+{
+	int is_noop = (tx == NULL || tx->tx_lnetmsg == NULL);
+	int nob;
+
+#if SOCKNAL_VERSION_DEBUG
+	if (!*ksocknal_tunables.ksnd_typed_conns)
+		return SOCKNAL_MATCH_YES;
+#endif
+
+	if (is_noop)
+		nob = offsetof(ksock_msg_t, ksm_u);
+	else if (conn->ksnc_proto == &ksocknal_protocol_v1x)
+		nob = tx->tx_lnetmsg->msg_len + sizeof(lnet_hdr_t);
+	else
+		nob = tx->tx_lnetmsg->msg_len + sizeof(ksock_msg_t);
+
+	/* default checking for typed connection */
+	switch (conn->ksnc_type) {
+	default:
+		CERROR("ksnc_type bad: %u\n", conn->ksnc_type);
+		LBUG();
+		/* fall through */
+	case SOCKLND_CONN_ANY:
+		return SOCKNAL_MATCH_YES;
+
+	case SOCKLND_CONN_BULK_IN:
+		return SOCKNAL_MATCH_MAY;
+
+	case SOCKLND_CONN_BULK_OUT:
+		/* bulk connections prefer 'large' messages */
+		return (nob < *ksocknal_tunables.ksnd_min_bulk) ?
+		       SOCKNAL_MATCH_MAY : SOCKNAL_MATCH_YES;
+
+	case SOCKLND_CONN_CONTROL:
+		/* control connections prefer small messages */
+		return (nob >= *ksocknal_tunables.ksnd_min_bulk) ?
+		       SOCKNAL_MATCH_MAY : SOCKNAL_MATCH_YES;
+	}
+}
+
+/*
+ * Decide how well tx fits conn under V3.x, based on the connection type,
+ * the size of the message on the wire and whether the send is non-blocking.
+ *
+ * A NULL tx, or a tx without an LNet message, is sized as a noop packet.
+ * Returns SOCKNAL_MATCH_YES, SOCKNAL_MATCH_MAY or SOCKNAL_MATCH_NO.
+ */
+static int
+ksocknal_match_tx_v3(ksock_conn_t *conn, ksock_tx_t *tx, int nonblk)
+{
+	int is_noop = (tx == NULL || tx->tx_lnetmsg == NULL);
+	int nob;
+
+	if (is_noop)
+		nob = offsetof(ksock_msg_t, ksm_u);
+	else
+		nob = tx->tx_lnetmsg->msg_len + sizeof(ksock_msg_t);
+
+	switch (conn->ksnc_type) {
+	default:
+		CERROR("ksnc_type bad: %u\n", conn->ksnc_type);
+		LBUG();
+		/* fall through */
+	case SOCKLND_CONN_ANY:
+		return SOCKNAL_MATCH_NO;
+
+	case SOCKLND_CONN_ACK:
+		/* non-blocking sends always fit; otherwise only noops may */
+		if (nonblk)
+			return SOCKNAL_MATCH_YES;
+		if (tx == NULL || tx->tx_lnetmsg == NULL)
+			return SOCKNAL_MATCH_MAY;
+		return SOCKNAL_MATCH_NO;
+
+	case SOCKLND_CONN_BULK_OUT:
+		if (nonblk)
+			return SOCKNAL_MATCH_NO;
+		if (nob < *ksocknal_tunables.ksnd_min_bulk)
+			return SOCKNAL_MATCH_MAY;
+		return SOCKNAL_MATCH_YES;
+
+	case SOCKLND_CONN_CONTROL:
+		if (nonblk)
+			return SOCKNAL_MATCH_NO;
+		if (nob >= *ksocknal_tunables.ksnd_min_bulk)
+			return SOCKNAL_MATCH_MAY;
+		return SOCKNAL_MATCH_YES;
+	}
+}
+
+/*
+ * (Sink) handle incoming ZC request from sender.
+ *
+ * First tries to piggyback the ACK cookie on a connection to the peer via
+ * that connection's pro_queue_tx_zcack method.  If no connection is found,
+ * or the cookie cannot be piggybacked, a NOOP tx carrying the cookie is
+ * allocated and launched instead.
+ *
+ * Returns 0 on success, -ENOMEM if the NOOP tx cannot be allocated, or the
+ * error from ksocknal_launch_packet().
+ */
+static int
+ksocknal_handle_zcreq(ksock_conn_t *c, __u64 cookie, int remote)
+{
+	ksock_peer_t *peer = c->ksnc_peer;
+	ksock_conn_t *ack_conn;
+	ksock_tx_t   *noop;
+	int	      piggybacked = 0;
+	int	      rc;
+
+	read_lock(&ksocknal_data.ksnd_global_lock);
+
+	ack_conn = ksocknal_find_conn_locked(peer, NULL, !!remote);
+	if (ack_conn != NULL) {
+		ksock_sched_t *sched = ack_conn->ksnc_scheduler;
+
+		LASSERT(ack_conn->ksnc_proto->pro_queue_tx_zcack != NULL);
+
+		spin_lock_bh(&sched->kss_lock);
+		piggybacked = ack_conn->ksnc_proto->pro_queue_tx_zcack(ack_conn,
+								       NULL,
+								       cookie);
+		spin_unlock_bh(&sched->kss_lock);
+	}
+
+	read_unlock(&ksocknal_data.ksnd_global_lock);
+
+	if (piggybacked)
+		return 0;
+
+	/* ACK connection is not ready, or can't piggyback the ACK */
+	noop = ksocknal_alloc_tx_noop(cookie, !!remote);
+	if (noop == NULL)
+		return -ENOMEM;
+
+	rc = ksocknal_launch_packet(peer->ksnp_ni, noop, peer->ksnp_id);
+	if (rc != 0)
+		ksocknal_free_tx(noop);
+
+	return rc;
+}
+
+/*
+ * (Sender) handle ZC_ACK from sink.
+ *
+ * cookie1 / cookie2 name the acknowledged zero-copy requests: a single
+ * cookie (cookie1 == 0), two separate cookies (cookie1 > cookie2) or an
+ * inclusive range (cookie1 <= cookie2).  Every matching tx on the peer's
+ * zc_req_list is unlinked under ksnp_lock and its reference is dropped
+ * after the lock is released.
+ *
+ * Returns 0 when exactly the expected number of requests was found,
+ * -EPROTO otherwise.  A V3.x keepalive PING is accepted without touching
+ * the list, provided it arrives as a single cookie.
+ */
+static int
+ksocknal_handle_zcack(ksock_conn_t *conn, __u64 cookie1, __u64 cookie2)
+{
+	ksock_peer_t *peer = conn->ksnc_peer;
+	ksock_tx_t   *tx;
+	ksock_tx_t   *next;
+	LIST_HEAD    (zlist);
+	int	      expected;
+
+	if (cookie1 == 0)
+		cookie1 = cookie2;
+
+	if (cookie1 > cookie2)
+		expected = 2;
+	else
+		expected = cookie2 - cookie1 + 1;
+
+	if (cookie2 == SOCKNAL_KEEPALIVE_PING &&
+	    conn->ksnc_proto == &ksocknal_protocol_v3x) {
+		/* keepalive PING for V3.x, just ignore it */
+		return expected == 1 ? 0 : -EPROTO;
+	}
+
+	spin_lock(&peer->ksnp_lock);
+
+	list_for_each_entry_safe(tx, next,
+				 &peer->ksnp_zc_req_list, tx_zc_list) {
+		__u64 c = tx->tx_msg.ksm_zc_cookies[0];
+
+		if (c != cookie1 && c != cookie2 &&
+		    !(cookie1 < c && c < cookie2))
+			continue;
+
+		/* acknowledged: move it onto the private list */
+		tx->tx_msg.ksm_zc_cookies[0] = 0;
+		list_del(&tx->tx_zc_list);
+		list_add(&tx->tx_zc_list, &zlist);
+
+		if (--expected == 0)
+			break;
+	}
+
+	spin_unlock(&peer->ksnp_lock);
+
+	/* drop the references outside the peer lock */
+	list_for_each_entry_safe(tx, next, &zlist, tx_zc_list) {
+		list_del(&tx->tx_zc_list);
+		ksocknal_tx_decref(tx);
+	}
+
+	return expected == 0 ? 0 : -EPROTO;
+}
+
+/* Send a HELLO in the V1.x wire format: a little-endian lnet_hdr_t whose
+ * leading bytes carry the magic and version, followed by the list of
+ * kshm_nips IP addresses (if any).
+ * Returns 0 on success, -ENOMEM if the header can't be allocated, or the
+ * non-zero result of libcfs_sock_write(). */
+static int
+ksocknal_send_hello_v1 (ksock_conn_t *conn, ksock_hello_msg_t *hello)
+{
+	socket_t	*sock = conn->ksnc_sock;
+	lnet_hdr_t	  *hdr;
+	lnet_magicversion_t *hmv;
+	int		  rc;
+	int		  i;
+
+	/* magic+version must exactly fill the bytes that precede src_nid */
+	CLASSERT(sizeof(lnet_magicversion_t) == offsetof(lnet_hdr_t, src_nid));
+
+	LIBCFS_ALLOC(hdr, sizeof(*hdr));
+	if (hdr == NULL) {
+		CERROR("Can't allocate lnet_hdr_t\n");
+		return -ENOMEM;
+	}
+
+	/* overlay magic/version on the header, starting at dest_nid */
+	hmv = (lnet_magicversion_t *)&hdr->dest_nid;
+
+	/* Re-organize V2.x message header to V1.x (lnet_hdr_t)
+	 * header and send out */
+	hmv->magic	 = cpu_to_le32 (LNET_PROTO_TCP_MAGIC);
+	hmv->version_major = cpu_to_le16 (KSOCK_PROTO_V1_MAJOR);
+	hmv->version_minor = cpu_to_le16 (KSOCK_PROTO_V1_MINOR);
+
+	if (the_lnet.ln_testprotocompat != 0) {
+		/* single-shot proto check */
+		LNET_LOCK();
+		/* bit 0: send a wrong major version once, then clear it */
+		if ((the_lnet.ln_testprotocompat & 1) != 0) {
+			hmv->version_major++;   /* just different! */
+			the_lnet.ln_testprotocompat &= ~1;
+		}
+		/* bit 1: send LNET_PROTO_MAGIC instead once, then clear it */
+		if ((the_lnet.ln_testprotocompat & 2) != 0) {
+			hmv->magic = LNET_PROTO_MAGIC;
+			the_lnet.ln_testprotocompat &= ~2;
+		}
+		LNET_UNLOCK();
+	}
+
+	hdr->src_nid	= cpu_to_le64 (hello->kshm_src_nid);
+	hdr->src_pid	= cpu_to_le32 (hello->kshm_src_pid);
+	hdr->type	   = cpu_to_le32 (LNET_MSG_HELLO);
+	hdr->payload_length = cpu_to_le32 (hello->kshm_nips * sizeof(__u32));
+	hdr->msg.hello.type = cpu_to_le32 (hello->kshm_ctype);
+	hdr->msg.hello.incarnation = cpu_to_le64 (hello->kshm_src_incarnation);
+
+	rc = libcfs_sock_write(sock, hdr, sizeof(*hdr),lnet_acceptor_timeout());
+
+	if (rc != 0) {
+		CNETERR("Error %d sending HELLO hdr to %u.%u.%u.%u/%d\n",
+			rc, HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port);
+		goto out;
+	}
+
+	/* no IPs to send: done, rc is 0 here */
+	if (hello->kshm_nips == 0)
+		goto out;
+
+	/* convert the IPs to little-endian in place; note that this
+	 * rewrites the caller's hello->kshm_ips[] */
+	for (i = 0; i < (int) hello->kshm_nips; i++) {
+		hello->kshm_ips[i] = __cpu_to_le32 (hello->kshm_ips[i]);
+	}
+
+	rc = libcfs_sock_write(sock, hello->kshm_ips,
+			       hello->kshm_nips * sizeof(__u32),
+			       lnet_acceptor_timeout());
+	if (rc != 0) {
+		CNETERR("Error %d sending HELLO payload (%d)"
+			" to %u.%u.%u.%u/%d\n", rc, hello->kshm_nips,
+			HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port);
+	}
+out:
+	/* the temporary header is freed on every path */
+	LIBCFS_FREE(hdr, sizeof(*hdr));
+
+	return rc;
+}
+
+/* Send a HELLO in the ksock_hello_msg_t format: the fixed part (everything
+ * before kshm_ips) followed, when kshm_nips is non-zero, by the IP list.
+ * Returns 0 on success or the non-zero result of libcfs_sock_write(). */
+static int
+ksocknal_send_hello_v2 (ksock_conn_t *conn, ksock_hello_msg_t *hello)
+{
+	socket_t	*sock = conn->ksnc_sock;
+	int		 rc;
+
+	hello->kshm_magic   = LNET_PROTO_MAGIC;
+	hello->kshm_version = conn->ksnc_proto->pro_version;
+
+	if (the_lnet.ln_testprotocompat != 0) {
+		/* single-shot proto check */
+		LNET_LOCK();
+		if ((the_lnet.ln_testprotocompat & 1) != 0) {
+			hello->kshm_version++;   /* just different! */
+			the_lnet.ln_testprotocompat &= ~1;
+		}
+		LNET_UNLOCK();
+	}
+
+	/* fixed-size part first */
+	rc = libcfs_sock_write(sock, hello,
+			       offsetof(ksock_hello_msg_t, kshm_ips),
+			       lnet_acceptor_timeout());
+	if (rc != 0) {
+		CNETERR("Error %d sending HELLO hdr to %u.%u.%u.%u/%d\n",
+			rc, HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port);
+		return rc;
+	}
+
+	/* then the IP list, if there is one */
+	if (hello->kshm_nips != 0) {
+		rc = libcfs_sock_write(sock, hello->kshm_ips,
+				       hello->kshm_nips * sizeof(__u32),
+				       lnet_acceptor_timeout());
+		if (rc != 0) {
+			CNETERR("Error %d sending HELLO payload (%d)"
+				" to %u.%u.%u.%u/%d\n", rc, hello->kshm_nips,
+				HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port);
+		}
+	}
+
+	return rc;
+}
+
+/* Receive the remainder of a V1.x HELLO and translate it into *hello.
+ * Only the header bytes from src_nid onwards are read here; the bytes
+ * before that (magic/version) have presumably been consumed by the
+ * caller already -- TODO confirm against the caller.
+ * Returns 0 on success, -ENOMEM if the header can't be allocated, -EPROTO
+ * for a malformed HELLO, or the negative result of libcfs_sock_read(). */
+static int
+ksocknal_recv_hello_v1(ksock_conn_t *conn, ksock_hello_msg_t *hello,int timeout)
+{
+	socket_t	*sock = conn->ksnc_sock;
+	lnet_hdr_t	  *hdr;
+	int		  rc;
+	int		  i;
+
+	LIBCFS_ALLOC(hdr, sizeof(*hdr));
+	if (hdr == NULL) {
+		CERROR("Can't allocate lnet_hdr_t\n");
+		return -ENOMEM;
+	}
+
+	/* read everything from src_nid to the end of the header */
+	rc = libcfs_sock_read(sock, &hdr->src_nid,
+			      sizeof (*hdr) - offsetof (lnet_hdr_t, src_nid),
+			      timeout);
+	if (rc != 0) {
+		CERROR ("Error %d reading rest of HELLO hdr from %u.%u.%u.%u\n",
+			rc, HIPQUAD(conn->ksnc_ipaddr));
+		LASSERT (rc < 0 && rc != -EALREADY);
+		goto out;
+	}
+
+	/* ...and check we got what we expected */
+	if (hdr->type != cpu_to_le32 (LNET_MSG_HELLO)) {
+		CERROR ("Expecting a HELLO hdr,"
+			" but got type %d from %u.%u.%u.%u\n",
+			le32_to_cpu (hdr->type),
+			HIPQUAD(conn->ksnc_ipaddr));
+		rc = -EPROTO;
+		goto out;
+	}
+
+	/* header fields are little-endian on the wire */
+	hello->kshm_src_nid	 = le64_to_cpu (hdr->src_nid);
+	hello->kshm_src_pid	 = le32_to_cpu (hdr->src_pid);
+	hello->kshm_src_incarnation = le64_to_cpu (hdr->msg.hello.incarnation);
+	hello->kshm_ctype	   = le32_to_cpu (hdr->msg.hello.type);
+	/* the payload is the IP list: one __u32 per interface */
+	hello->kshm_nips	    = le32_to_cpu (hdr->payload_length) /
+					 sizeof (__u32);
+
+	/* bound the count before reading that many IPs into kshm_ips[] */
+	if (hello->kshm_nips > LNET_MAX_INTERFACES) {
+		CERROR("Bad nips %d from ip %u.%u.%u.%u\n",
+		       hello->kshm_nips, HIPQUAD(conn->ksnc_ipaddr));
+		rc = -EPROTO;
+		goto out;
+	}
+
+	/* no IPs follow: done, rc is 0 here */
+	if (hello->kshm_nips == 0)
+		goto out;
+
+	rc = libcfs_sock_read(sock, hello->kshm_ips,
+			      hello->kshm_nips * sizeof(__u32), timeout);
+	if (rc != 0) {
+		CERROR ("Error %d reading IPs from ip %u.%u.%u.%u\n",
+			rc, HIPQUAD(conn->ksnc_ipaddr));
+		LASSERT (rc < 0 && rc != -EALREADY);
+		goto out;
+	}
+
+	/* convert each IP to host order and reject zero addresses */
+	for (i = 0; i < (int) hello->kshm_nips; i++) {
+		hello->kshm_ips[i] = __le32_to_cpu(hello->kshm_ips[i]);
+
+		if (hello->kshm_ips[i] == 0) {
+			CERROR("Zero IP[%d] from ip %u.%u.%u.%u\n",
+			       i, HIPQUAD(conn->ksnc_ipaddr));
+			rc = -EPROTO;
+			break;
+		}
+	}
+out:
+	/* the temporary header is freed on every path */
+	LIBCFS_FREE(hdr, sizeof(*hdr));
+
+	return rc;
+}
+
+/* Receive the remainder of a ksock_hello_msg_t HELLO, starting at
+ * kshm_src_nid, byte-swapping it when hello->kshm_magic shows that the
+ * peer has the opposite endianness.
+ * Returns 0 on success, -EPROTO for a malformed HELLO, or the negative
+ * result of libcfs_sock_read(). */
+static int
+ksocknal_recv_hello_v2 (ksock_conn_t *conn, ksock_hello_msg_t *hello, int timeout)
+{
+	socket_t	*sock = conn->ksnc_sock;
+	int		 rc;
+	int		 i;
+
+	/* a magic that isn't in our byte order means the peer is flipped */
+	conn->ksnc_flip = (hello->kshm_magic == LNET_PROTO_MAGIC) ? 0 : 1;
+
+	rc = libcfs_sock_read(sock, &hello->kshm_src_nid,
+			      offsetof(ksock_hello_msg_t, kshm_ips) -
+			      offsetof(ksock_hello_msg_t, kshm_src_nid),
+			      timeout);
+	if (rc != 0) {
+		CERROR ("Error %d reading HELLO from %u.%u.%u.%u\n",
+			rc, HIPQUAD(conn->ksnc_ipaddr));
+		LASSERT (rc < 0 && rc != -EALREADY);
+		return rc;
+	}
+
+	if (conn->ksnc_flip) {
+		__swab32s(&hello->kshm_src_pid);
+		__swab64s(&hello->kshm_src_nid);
+		__swab32s(&hello->kshm_dst_pid);
+		__swab64s(&hello->kshm_dst_nid);
+		__swab64s(&hello->kshm_src_incarnation);
+		__swab64s(&hello->kshm_dst_incarnation);
+		__swab32s(&hello->kshm_ctype);
+		__swab32s(&hello->kshm_nips);
+	}
+
+	/* bound the count before reading that many IPs into kshm_ips[] */
+	if (hello->kshm_nips > LNET_MAX_INTERFACES) {
+		CERROR("Bad nips %d from ip %u.%u.%u.%u\n",
+		       hello->kshm_nips, HIPQUAD(conn->ksnc_ipaddr));
+		return -EPROTO;
+	}
+
+	if (hello->kshm_nips == 0)
+		return 0;
+
+	rc = libcfs_sock_read(sock, hello->kshm_ips,
+			      hello->kshm_nips * sizeof(__u32), timeout);
+	if (rc != 0) {
+		CERROR ("Error %d reading IPs from ip %u.%u.%u.%u\n",
+			rc, HIPQUAD(conn->ksnc_ipaddr));
+		LASSERT (rc < 0 && rc != -EALREADY);
+		return rc;
+	}
+
+	for (i = 0; i < (int) hello->kshm_nips; i++) {
+		if (conn->ksnc_flip)
+			__swab32s(&hello->kshm_ips[i]);
+
+		if (hello->kshm_ips[i] != 0)
+			continue;
+
+		/* a zero address is never valid */
+		CERROR("Zero IP[%d] from ip %u.%u.%u.%u\n",
+		       i, HIPQUAD(conn->ksnc_ipaddr));
+		return -EPROTO;
+	}
+
+	return 0;
+}
+
+/* Prepare a tx for a V1.x connection: the first iov fragment is the bare
+ * lnet_hdr_t of the LNet message and the byte counts cover header plus
+ * payload. */
+static void
+ksocknal_pack_msg_v1(ksock_tx_t *tx)
+{
+	/* V1.x has no KSOCK_MSG_NOOP */
+	LASSERT(tx->tx_msg.ksm_type != KSOCK_MSG_NOOP);
+	LASSERT(tx->tx_lnetmsg != NULL);
+
+	tx->tx_iov[0].iov_len  = sizeof(lnet_hdr_t);
+	tx->tx_iov[0].iov_base = (void *)&tx->tx_lnetmsg->msg_hdr;
+
+	tx->tx_nob   = tx->tx_lnetmsg->msg_len + sizeof(lnet_hdr_t);
+	tx->tx_resid = tx->tx_nob;
+}
+
+/* Prepare a tx for a V2.x/V3.x connection: the first iov fragment is the
+ * ksock_msg_t itself, in full for an LNet message or truncated just
+ * before the embedded LNet header for a NOOP. */
+static void
+ksocknal_pack_msg_v2(ksock_tx_t *tx)
+{
+	tx->tx_iov[0].iov_base = (void *)&tx->tx_msg;
+
+	if (tx->tx_lnetmsg == NULL) {
+		/* NOOP: nothing is sent beyond the ksock_msg_t prefix */
+		LASSERT(tx->tx_msg.ksm_type == KSOCK_MSG_NOOP);
+
+		tx->tx_iov[0].iov_len = offsetof(ksock_msg_t, ksm_u.lnetmsg.ksnm_hdr);
+		tx->tx_nob = offsetof(ksock_msg_t,  ksm_u.lnetmsg.ksnm_hdr);
+		tx->tx_resid = tx->tx_nob;
+	} else {
+		/* LNet message: copy its header into the ksock_msg_t */
+		LASSERT(tx->tx_msg.ksm_type != KSOCK_MSG_NOOP);
+
+		tx->tx_msg.ksm_u.lnetmsg.ksnm_hdr = tx->tx_lnetmsg->msg_hdr;
+		tx->tx_iov[0].iov_len = sizeof(ksock_msg_t);
+		tx->tx_nob = sizeof(ksock_msg_t) + tx->tx_lnetmsg->msg_len;
+		tx->tx_resid = tx->tx_nob;
+	}
+	/* Don't checksum before start sending, because packet can be piggybacked with ACK */
+}
+
+/* Fill in the ksock_msg_t fields for a message received on a V1.x
+ * connection: always an LNet message, with no checksum and no ZC cookies. */
+static void
+ksocknal_unpack_msg_v1(ksock_msg_t *msg)
+{
+	msg->ksm_type = KSOCK_MSG_LNET;
+	msg->ksm_csum = 0;
+	msg->ksm_zc_cookies[1] = 0;
+	msg->ksm_zc_cookies[0] = 0;
+}
+
+/* V2.x/V3.x unpack hook: intentionally empty, the message is used as
+ * received. */
+static void
+ksocknal_unpack_msg_v2(ksock_msg_t *msg)
+{
+	/* Do nothing */
+}
+
+/* Operations table for V1.x connections.  V1.x has no zero-copy
+ * request/ACK hooks, so those entries are NULL. */
+ksock_proto_t  ksocknal_protocol_v1x =
+{
+	.pro_version		= KSOCK_PROTO_V1,
+	.pro_send_hello		= ksocknal_send_hello_v1,
+	.pro_recv_hello		= ksocknal_recv_hello_v1,
+	.pro_pack		= ksocknal_pack_msg_v1,
+	.pro_unpack		= ksocknal_unpack_msg_v1,
+	.pro_queue_tx_msg	= ksocknal_queue_tx_msg_v1,
+	.pro_queue_tx_zcack	= NULL,
+	.pro_handle_zcreq	= NULL,
+	.pro_handle_zcack	= NULL,
+	.pro_match_tx		= ksocknal_match_tx
+};
+
+/* Operations table for V2.x connections: ksock_hello_msg_t HELLO,
+ * ksock_msg_t framing and the zero-copy request/ACK handlers. */
+ksock_proto_t  ksocknal_protocol_v2x =
+{
+	.pro_version	    = KSOCK_PROTO_V2,
+	.pro_send_hello	 = ksocknal_send_hello_v2,
+	.pro_recv_hello	 = ksocknal_recv_hello_v2,
+	.pro_pack	       = ksocknal_pack_msg_v2,
+	.pro_unpack	     = ksocknal_unpack_msg_v2,
+	.pro_queue_tx_msg       = ksocknal_queue_tx_msg_v2,
+	.pro_queue_tx_zcack     = ksocknal_queue_tx_zcack_v2,
+	.pro_handle_zcreq       = ksocknal_handle_zcreq,
+	.pro_handle_zcack       = ksocknal_handle_zcack,
+	.pro_match_tx	   = ksocknal_match_tx
+};
+
+/* Operations table for V3.x connections.  It shares the V2.x HELLO,
+ * pack/unpack, message queueing and ZC handlers; only the ZC-ACK queueing
+ * (pro_queue_tx_zcack) and tx matching (pro_match_tx) hooks differ. */
+ksock_proto_t  ksocknal_protocol_v3x =
+{
+	.pro_version	    = KSOCK_PROTO_V3,
+	.pro_send_hello	 = ksocknal_send_hello_v2,
+	.pro_recv_hello	 = ksocknal_recv_hello_v2,
+	.pro_pack	       = ksocknal_pack_msg_v2,
+	.pro_unpack	     = ksocknal_unpack_msg_v2,
+	.pro_queue_tx_msg       = ksocknal_queue_tx_msg_v2,
+	.pro_queue_tx_zcack     = ksocknal_queue_tx_zcack_v3,
+	.pro_handle_zcreq       = ksocknal_handle_zcreq,
+	.pro_handle_zcack       = ksocknal_handle_zcack,
+	.pro_match_tx	   = ksocknal_match_tx_v3
+};

diff --git a/drivers/staging/lustre/lnet/lnet/Makefile b/drivers/staging/lustre/lnet/lnet/Makefile
new file mode 100644
index 0000000..1bd9ef7
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/Makefile

@@ -0,0 +1,8 @@
+# Build lnet.o according to CONFIG_LNET.
+obj-$(CONFIG_LNET) += lnet.o
+
+# Objects linked into lnet.o (link order is unchanged).
+lnet-y := api-errno.o api-ni.o config.o
+lnet-y += lib-me.o lib-msg.o lib-eq.o lib-md.o lib-ptl.o lib-move.o
+lnet-y += module.o lo.o router.o router_proc.o acceptor.o peer.o
+
+# Add ../include (relative to this directory) to the header search path.
+ccflags-y := -I$(src)/../include

diff --git a/drivers/staging/lustre/lnet/lnet/acceptor.c b/drivers/staging/lustre/lnet/lnet/acceptor.c
new file mode 100644
index 0000000..81ef28b
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/acceptor.c

@@ -0,0 +1,527 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include <linux/lnet/lib-lnet.h>
+
+
+/* Acceptor tunables; exposed as module parameters further down. */
+static int   accept_port    = 988;	/* port the acceptor listens on */
+static int   accept_backlog = 127;	/* listen backlog */
+static int   accept_timeout = 5;	/* socket I/O timeout (seconds) */
+
+/* State shared between the acceptor thread and its start/stop callers. */
+struct {
+	int			pta_shutdown;	/* thread start status, then stop request */
+	socket_t		*pta_sock;	/* listening socket; NULL when not running */
+	struct completion	pta_signal;	/* completed at thread startup and exit */
+} lnet_acceptor_state;
+
+/* Return the configured acceptor port (the accept_port tunable). */
+int
+lnet_acceptor_port(void)
+{
+	return accept_port;
+}
+
+/* Return non-zero when 'magic' equals 'constant' in either byte order. */
+static inline int
+lnet_accept_magic(__u32 magic, __u32 constant)
+{
+	if (magic == constant)
+		return 1;
+
+	/* same value seen from a peer of the opposite endianness */
+	return magic == __swab32(constant);
+}
+
+
+EXPORT_SYMBOL(lnet_acceptor_port);
+
+/* Which connections to accept; parsed by accept2secure() */
+static char *accept = "secure";
+
+/* Module parameters: only accept_timeout is declared writable (0644) */
+CFS_MODULE_PARM(accept, "s", charp, 0444,
+		"Accept connections (secure|all|none)");
+CFS_MODULE_PARM(accept_port, "i", int, 0444,
+		"Acceptor's port (same on all nodes)");
+CFS_MODULE_PARM(accept_backlog, "i", int, 0444,
+		"Acceptor's listen backlog");
+CFS_MODULE_PARM(accept_timeout, "i", int, 0644,
+		"Acceptor's timeout (seconds)");
+
+/* Alias of 'accept', set by lnet_acceptor_get_tunables() */
+static char *accept_type = NULL;
+
+/* Publish the 'accept' module parameter under the name 'accept_type'.
+ * Always returns 0. */
+int
+lnet_acceptor_get_tunables(void)
+{
+	/* Userland acceptor uses 'accept_type' instead of 'accept', due to
+	 * conflict with 'accept(2)', but kernel acceptor still uses 'accept'
+	 * for compatibility. Hence the trick. */
+	accept_type = accept;
+	return 0;
+}
+
+/* Return the acceptor socket timeout in seconds (the accept_timeout
+ * tunable). */
+int
+lnet_acceptor_timeout(void)
+{
+	return accept_timeout;
+}
+EXPORT_SYMBOL(lnet_acceptor_timeout);
+
+/* Report a failed outgoing connection to peer_nid at peer_ip:peer_port.
+ * 'rc' is the negative errno of the failure; refused, unreachable and
+ * timed-out connections are reported through CNETERR, everything else as
+ * a numbered console error. */
+void
+lnet_connect_console_error (int rc, lnet_nid_t peer_nid,
+			   __u32 peer_ip, int peer_port)
+{
+	switch (rc) {
+	/* "normal" errors */
+	case -ECONNREFUSED:
+		CNETERR("Connection to %s at host %u.%u.%u.%u on port %d was "
+			"refused: check that Lustre is running on that node.\n",
+			libcfs_nid2str(peer_nid),
+			HIPQUAD(peer_ip), peer_port);
+		break;
+	case -EHOSTUNREACH:
+	case -ENETUNREACH:
+		CNETERR("Connection to %s at host %u.%u.%u.%u "
+			"was unreachable: the network or that node may "
+			"be down, or Lustre may be misconfigured.\n",
+			libcfs_nid2str(peer_nid), HIPQUAD(peer_ip));
+		break;
+	case -ETIMEDOUT:
+		CNETERR("Connection to %s at host %u.%u.%u.%u on "
+			"port %d took too long: that node may be hung "
+			"or experiencing high load.\n",
+			libcfs_nid2str(peer_nid),
+			HIPQUAD(peer_ip), peer_port);
+		break;
+	/* the remaining cases are reported as numbered console errors */
+	case -ECONNRESET:
+		LCONSOLE_ERROR_MSG(0x11b, "Connection to %s at host %u.%u.%u.%u"
+				   " on port %d was reset: "
+				   "is it running a compatible version of "
+				   "Lustre and is %s one of its NIDs?\n",
+				   libcfs_nid2str(peer_nid),
+				   HIPQUAD(peer_ip), peer_port,
+				   libcfs_nid2str(peer_nid));
+		break;
+	case -EPROTO:
+		LCONSOLE_ERROR_MSG(0x11c, "Protocol error connecting to %s at "
+				   "host %u.%u.%u.%u on port %d: is it running "
+				   "a compatible version of Lustre?\n",
+				   libcfs_nid2str(peer_nid),
+				   HIPQUAD(peer_ip), peer_port);
+		break;
+	case -EADDRINUSE:
+		/* lnet_connect() returns this when every reserved port failed */
+		LCONSOLE_ERROR_MSG(0x11d, "No privileged ports available to "
+				   "connect to %s at host %u.%u.%u.%u on port "
+				   "%d\n", libcfs_nid2str(peer_nid),
+				   HIPQUAD(peer_ip), peer_port);
+		break;
+	default:
+		LCONSOLE_ERROR_MSG(0x11e, "Unexpected error %d connecting to %s"
+				   " at host %u.%u.%u.%u on port %d\n", rc,
+				   libcfs_nid2str(peer_nid),
+				   HIPQUAD(peer_ip), peer_port);
+		break;
+	}
+}
+EXPORT_SYMBOL(lnet_connect_console_error);
+
+/* Connect to peer_ip:peer_port from a reserved local port and send the
+ * acceptor connection request naming peer_nid.
+ * Local ports are tried from LNET_ACCEPTOR_MAX_RESERVED_PORT down to
+ * LNET_ACCEPTOR_MIN_RESERVED_PORT until one connects.
+ * Returns 0 with the connected socket in *sockp, or a negative errno
+ * after reporting it via lnet_connect_console_error(); -EADDRINUSE means
+ * no reserved port could be used. */
+int
+lnet_connect(socket_t **sockp, lnet_nid_t peer_nid,
+	    __u32 local_ip, __u32 peer_ip, int peer_port)
+{
+	lnet_acceptor_connreq_t cr;
+	socket_t	       *sock;
+	int			fatal;
+	int			port;
+	int			rc;
+
+	CLASSERT (sizeof(cr) <= 16);	    /* not too big to be on the stack */
+
+	/* Iterate through reserved ports. */
+	for (port = LNET_ACCEPTOR_MAX_RESERVED_PORT;
+	     port >= LNET_ACCEPTOR_MIN_RESERVED_PORT;
+	     --port) {
+		rc = libcfs_sock_connect(&sock, &fatal,
+					 local_ip, port,
+					 peer_ip, peer_port);
+		if (rc != 0 && fatal)
+			goto failed;
+		if (rc != 0)
+			continue;	/* non-fatal: try the next port */
+
+		CLASSERT (LNET_PROTO_ACCEPTOR_VERSION == 1);
+
+		cr.acr_magic   = LNET_PROTO_ACCEPTOR_MAGIC;
+		cr.acr_version = LNET_PROTO_ACCEPTOR_VERSION;
+		cr.acr_nid     = peer_nid;
+
+		if (the_lnet.ln_testprotocompat != 0) {
+			/* single-shot proto check */
+			lnet_net_lock(LNET_LOCK_EX);
+			if ((the_lnet.ln_testprotocompat & 4) != 0) {
+				cr.acr_version++;
+				the_lnet.ln_testprotocompat &= ~4;
+			}
+			if ((the_lnet.ln_testprotocompat & 8) != 0) {
+				cr.acr_magic = LNET_PROTO_MAGIC;
+				the_lnet.ln_testprotocompat &= ~8;
+			}
+			lnet_net_unlock(LNET_LOCK_EX);
+		}
+
+		rc = libcfs_sock_write(sock, &cr, sizeof(cr),
+				       accept_timeout);
+		if (rc != 0) {
+			/* connected but couldn't send the request */
+			libcfs_sock_release(sock);
+			goto failed;
+		}
+
+		*sockp = sock;
+		return 0;
+	}
+
+	/* every reserved port was tried without success */
+	rc = -EADDRINUSE;
+
+ failed:
+	lnet_connect_console_error(rc, peer_nid, peer_ip, peer_port);
+	return rc;
+}
+EXPORT_SYMBOL(lnet_connect);
+
+
+/* Below is the code common for both kernel and MT user-space */
+
+/*
+ * Handle a freshly accepted connection whose first 32-bit word ('magic')
+ * has already been read by the caller.
+ *
+ * The magic and version of the connection request are validated (in
+ * either byte order), the target NID is read, the matching NI is looked
+ * up and the socket is handed to that NI's lnd_accept() method.
+ *
+ * Returns 0 or whatever lnd_accept() returns on the success path, and
+ * otherwise:
+ *   -EPROTO  unknown magic or unsupported version (when the peer looks
+ *	      like a newer protocol, our magic+version is sent back first)
+ *   -EIO     the rest of the request could not be read
+ *   -EPERM   no NI has the requested NID, or its LND takes no connections
+ *
+ * The socket is not released here.
+ */
+int
+lnet_accept(socket_t *sock, __u32 magic)
+{
+	lnet_acceptor_connreq_t cr;
+	__u32		   peer_ip;
+	int		     peer_port;
+	int		     rc;
+	int		     flip;
+	lnet_ni_t	      *ni;
+	char		   *str;
+
+	/* compile-time check, as in lnet_connect() */
+	CLASSERT (sizeof(cr) <= 16);	     /* not too big for the stack */
+
+	rc = libcfs_sock_getaddr(sock, 1, &peer_ip, &peer_port);
+	LASSERT (rc == 0);		      /* we succeeded before */
+
+	if (!lnet_accept_magic(magic, LNET_PROTO_ACCEPTOR_MAGIC)) {
+
+		if (lnet_accept_magic(magic, LNET_PROTO_MAGIC)) {
+			/* future version compatibility!
+			 * When LNET unifies protocols over all LNDs, the first
+			 * thing sent will be a version query.  I send back
+			 * LNET_PROTO_ACCEPTOR_MAGIC to tell her I'm "old" */
+
+			memset (&cr, 0, sizeof(cr));
+			cr.acr_magic = LNET_PROTO_ACCEPTOR_MAGIC;
+			cr.acr_version = LNET_PROTO_ACCEPTOR_VERSION;
+			rc = libcfs_sock_write(sock, &cr, sizeof(cr),
+					       accept_timeout);
+
+			if (rc != 0)
+				CERROR("Error sending magic+version in response "
+				       "to LNET magic from %u.%u.%u.%u: %d\n",
+				       HIPQUAD(peer_ip), rc);
+			return -EPROTO;
+		}
+
+		/* name the protocol we think the peer is speaking */
+		if (magic == le32_to_cpu(LNET_PROTO_TCP_MAGIC))
+			str = "'old' socknal/tcpnal";
+		else if (lnet_accept_magic(magic, LNET_PROTO_RA_MAGIC))
+			str = "'old' ranal";
+		else
+			str = "unrecognised";
+
+		LCONSOLE_ERROR_MSG(0x11f, "Refusing connection from %u.%u.%u.%u"
+				   " magic %08x: %s acceptor protocol\n",
+				   HIPQUAD(peer_ip), magic, str);
+		return -EPROTO;
+	}
+
+	/* a magic that matched only after swabbing means the peer has the
+	 * opposite byte order, so every later field must be swabbed too */
+	flip = (magic != LNET_PROTO_ACCEPTOR_MAGIC);
+
+	rc = libcfs_sock_read(sock, &cr.acr_version,
+			      sizeof(cr.acr_version),
+			      accept_timeout);
+	if (rc != 0) {
+		CERROR("Error %d reading connection request version from "
+		       "%u.%u.%u.%u\n", rc, HIPQUAD(peer_ip));
+		return -EIO;
+	}
+
+	if (flip)
+		__swab32s(&cr.acr_version);
+
+	if (cr.acr_version != LNET_PROTO_ACCEPTOR_VERSION) {
+		/* future version compatibility!
+		 * An acceptor-specific protocol rev will first send a version
+		 * query.  I send back my current version to tell her I'm
+		 * "old". */
+		int peer_version = cr.acr_version;
+
+		memset (&cr, 0, sizeof(cr));
+		cr.acr_magic = LNET_PROTO_ACCEPTOR_MAGIC;
+		cr.acr_version = LNET_PROTO_ACCEPTOR_VERSION;
+
+		rc = libcfs_sock_write(sock, &cr, sizeof(cr),
+				       accept_timeout);
+
+		if (rc != 0)
+			CERROR("Error sending magic+version in response "
+			       "to version %d from %u.%u.%u.%u: %d\n",
+			       peer_version, HIPQUAD(peer_ip), rc);
+		return -EPROTO;
+	}
+
+	/* read the rest of the request, from acr_nid to the end */
+	rc = libcfs_sock_read(sock, &cr.acr_nid,
+			      sizeof(cr) -
+			      offsetof(lnet_acceptor_connreq_t, acr_nid),
+			      accept_timeout);
+	if (rc != 0) {
+		CERROR("Error %d reading connection request from "
+		       "%u.%u.%u.%u\n", rc, HIPQUAD(peer_ip));
+		return -EIO;
+	}
+
+	if (flip)
+		__swab64s(&cr.acr_nid);
+
+	/* a non-NULL NI is dropped with lnet_ni_decref() on every path */
+	ni = lnet_net2ni(LNET_NIDNET(cr.acr_nid));
+	if (ni == NULL ||	       /* no matching net */
+	    ni->ni_nid != cr.acr_nid) { /* right NET, wrong NID! */
+		if (ni != NULL)
+			lnet_ni_decref(ni);
+		LCONSOLE_ERROR_MSG(0x120, "Refusing connection from %u.%u.%u.%u"
+				   " for %s: No matching NI\n",
+				   HIPQUAD(peer_ip), libcfs_nid2str(cr.acr_nid));
+		return -EPERM;
+	}
+
+	if (ni->ni_lnd->lnd_accept == NULL) {
+		/* This catches a request for the loopback LND */
+		lnet_ni_decref(ni);
+		LCONSOLE_ERROR_MSG(0x121, "Refusing connection from %u.%u.%u.%u"
+				  " for %s: NI does not accept IP connections\n",
+				  HIPQUAD(peer_ip), libcfs_nid2str(cr.acr_nid));
+		return -EPERM;
+	}
+
+	CDEBUG(D_NET, "Accept %s from %u.%u.%u.%u\n",
+	       libcfs_nid2str(cr.acr_nid), HIPQUAD(peer_ip));
+
+	rc = ni->ni_lnd->lnd_accept(ni, sock);
+
+	lnet_ni_decref(ni);
+	return rc;
+}
+
+/* Acceptor thread body.  'arg' carries the "secure" flag as an integer.
+ * Opens the listening socket, reports the outcome to the starter through
+ * pta_shutdown/pta_signal, then accepts connections and passes each one
+ * to lnet_accept() until pta_shutdown is set.
+ * Returns the listen error if startup failed, 0 otherwise. */
+int
+lnet_acceptor(void *arg)
+{
+	socket_t  *newsock;
+	int	    rc;
+	__u32	  magic;
+	__u32	  peer_ip;
+	int	    peer_port;
+	int	    secure = (int)((long_ptr_t)arg);
+
+	LASSERT (lnet_acceptor_state.pta_sock == NULL);
+
+	cfs_block_allsigs();
+
+	rc = libcfs_sock_listen(&lnet_acceptor_state.pta_sock,
+				0, accept_port, accept_backlog);
+	if (rc != 0) {
+		if (rc == -EADDRINUSE)
+			LCONSOLE_ERROR_MSG(0x122, "Can't start acceptor on port"
+					   " %d: port already in use\n",
+					   accept_port);
+		else
+			LCONSOLE_ERROR_MSG(0x123, "Can't start acceptor on port "
+					   "%d: unexpected error %d\n",
+					   accept_port, rc);
+
+		lnet_acceptor_state.pta_sock = NULL;
+	} else {
+		LCONSOLE(0, "Accept %s, port %d\n", accept_type, accept_port);
+	}
+
+	/* set init status and unblock parent */
+	lnet_acceptor_state.pta_shutdown = rc;
+	complete(&lnet_acceptor_state.pta_signal);
+
+	if (rc != 0)
+		return rc;
+
+	while (!lnet_acceptor_state.pta_shutdown) {
+
+		rc = libcfs_sock_accept(&newsock, lnet_acceptor_state.pta_sock);
+		if (rc != 0) {
+			/* -EAGAIN is retried at once; anything else is
+			 * logged and followed by a one-second pause */
+			if (rc != -EAGAIN) {
+				CWARN("Accept error %d: pausing...\n", rc);
+				cfs_pause(cfs_time_seconds(1));
+			}
+			continue;
+		}
+
+		/* maybe we've been woken up by libcfs_sock_abort_accept() */
+		if (lnet_acceptor_state.pta_shutdown) {
+			libcfs_sock_release(newsock);
+			break;
+		}
+
+		rc = libcfs_sock_getaddr(newsock, 1, &peer_ip, &peer_port);
+		if (rc != 0) {
+			CERROR("Can't determine new connection's address\n");
+			goto failed;
+		}
+
+		/* in secure mode the peer must connect from a reserved port */
+		if (secure && peer_port > LNET_ACCEPTOR_MAX_RESERVED_PORT) {
+			CERROR("Refusing connection from %u.%u.%u.%u: "
+			       "insecure port %d\n",
+			       HIPQUAD(peer_ip), peer_port);
+			goto failed;
+		}
+
+		/* the first word of the request selects the protocol */
+		rc = libcfs_sock_read(newsock, &magic, sizeof(magic),
+				      accept_timeout);
+		if (rc != 0) {
+			CERROR("Error %d reading connection request from "
+			       "%u.%u.%u.%u\n", rc, HIPQUAD(peer_ip));
+			goto failed;
+		}
+
+		rc = lnet_accept(newsock, magic);
+		if (rc != 0)
+			goto failed;
+
+		/* accepted: newsock is not released here */
+		continue;
+
+	failed:
+		libcfs_sock_release(newsock);
+	}
+
+	libcfs_sock_release(lnet_acceptor_state.pta_sock);
+	lnet_acceptor_state.pta_sock = NULL;
+
+	CDEBUG(D_NET, "Acceptor stopping\n");
+
+	/* unblock lnet_acceptor_stop() */
+	complete(&lnet_acceptor_state.pta_signal);
+	return 0;
+}
+
+/* Translate the "accept" tunable string.
+ * Returns 1 and stores the secure flag in *sec for "secure" (1) or
+ * "all" (0), 0 for "none" (leaving *sec untouched), and -EINVAL, after a
+ * console error, for any other value. */
+static inline int
+accept2secure(const char *acc, long *sec)
+{
+	if (strcmp(acc, "secure") == 0) {
+		*sec = 1;
+		return 1;
+	}
+
+	if (strcmp(acc, "all") == 0) {
+		*sec = 0;
+		return 1;
+	}
+
+	if (strcmp(acc, "none") == 0)
+		return 0;
+
+	LCONSOLE_ERROR_MSG(0x124, "Can't parse 'accept=\"%s\"'\n",
+			   acc);
+	return -EINVAL;
+}
+
+/*
+ * Start the acceptor thread when the "accept" tunable asks for one and
+ * lnet_count_acceptor_nis() reports that it is needed.
+ *
+ * Returns 0 when the acceptor is running or is not required, and
+ * otherwise a negative errno: -EINVAL for an unparsable "accept" value,
+ * -ESRCH if the thread could not be created, -ENETDOWN if the thread
+ * failed to start listening.
+ *
+ * The completion is finalised on every path that returns without a
+ * running acceptor thread.
+ */
+int
+lnet_acceptor_start(void)
+{
+	int  rc;
+	long rc2;
+	long secure;
+
+	LASSERT (lnet_acceptor_state.pta_sock == NULL);
+
+	rc = lnet_acceptor_get_tunables();
+	if (rc != 0)
+		return rc;
+
+
+	init_completion(&lnet_acceptor_state.pta_signal);
+	rc = accept2secure(accept_type, &secure);
+	if (rc <= 0) {
+		/* accept=none (0) or a parse error (<0): nothing to start */
+		fini_completion(&lnet_acceptor_state.pta_signal);
+		return rc;
+	}
+
+	if (lnet_count_acceptor_nis() == 0) {  /* not required */
+		/* no thread will be started, so nobody else will ever
+		 * finalise the completion */
+		fini_completion(&lnet_acceptor_state.pta_signal);
+		return 0;
+	}
+
+	/* the secure flag travels to the thread in its argument pointer */
+	rc2 = PTR_ERR(kthread_run(lnet_acceptor,
+				  (void *)(ulong_ptr_t)secure,
+				  "acceptor_%03ld", secure));
+	if (IS_ERR_VALUE(rc2)) {
+		CERROR("Can't start acceptor thread: %ld\n", rc2);
+		fini_completion(&lnet_acceptor_state.pta_signal);
+
+		return -ESRCH;
+	}
+
+	/* wait for acceptor to startup */
+	wait_for_completion(&lnet_acceptor_state.pta_signal);
+
+	/* the thread stored its listen status in pta_shutdown */
+	if (!lnet_acceptor_state.pta_shutdown) {
+		/* started OK */
+		LASSERT(lnet_acceptor_state.pta_sock != NULL);
+		return 0;
+	}
+
+	LASSERT(lnet_acceptor_state.pta_sock == NULL);
+	fini_completion(&lnet_acceptor_state.pta_signal);
+
+	return -ENETDOWN;
+}
+
+/* Stop the acceptor thread, if it is running, and wait for it to exit.
+ * A NULL pta_sock is taken to mean that no acceptor is running. */
+void
+lnet_acceptor_stop(void)
+{
+	if (lnet_acceptor_state.pta_sock == NULL) /* not running */
+		return;
+
+	/* set the flag first, then kick the thread out of accept() */
+	lnet_acceptor_state.pta_shutdown = 1;
+	libcfs_sock_abort_accept(lnet_acceptor_state.pta_sock);
+
+	/* block until acceptor signals exit */
+	wait_for_completion(&lnet_acceptor_state.pta_signal);
+
+	fini_completion(&lnet_acceptor_state.pta_signal);
+}

diff --git a/drivers/staging/lustre/lnet/lnet/api-errno.c b/drivers/staging/lustre/lnet/lnet/api-errno.c
new file mode 100644
index 0000000..695b272
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/api-errno.c

@@ -0,0 +1,39 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/lnet/api-errno.c
+ *
+ * Instantiate the string table of errors
+ */
+
+/* If you change these, you must update the number table in portals/errno.h */

diff --git a/drivers/staging/lustre/lnet/lnet/api-ni.c b/drivers/staging/lustre/lnet/lnet/api-ni.c
new file mode 100644
index 0000000..e88bee3
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/api-ni.c

@@ -0,0 +1,1941 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include <linux/lnet/lib-lnet.h>
+#include <linux/log2.h>
+
+#define D_LNI D_CONSOLE
+
+/* The single global LNet instance; exported so other modules can reach it. */
+lnet_t      the_lnet;			   /* THE state of the network */
+EXPORT_SYMBOL(the_lnet);
+
+
+/* 'ip2nets' and 'networks' are mutually exclusive: lnet_get_networks()
+ * rejects the configuration when both are non-empty. */
+static char *ip2nets = "";
+CFS_MODULE_PARM(ip2nets, "s", charp, 0444,
+		"LNET network <- IP table");
+
+static char *networks = "";
+CFS_MODULE_PARM(networks, "s", charp, 0444,
+		"local networks");
+
+/* Returned verbatim by lnet_get_routes(). */
+static char *routes = "";
+CFS_MODULE_PARM(routes, "s", charp, 0444,
+		"routes to non-local networks");
+
+/* Sanitised in LNetInit(): non-positive values fall back to the default and
+ * values above LNET_REMOTE_NETS_HASH_MAX are clamped. */
+static int rnet_htable_size = LNET_REMOTE_NETS_HASH_DEFAULT;
+CFS_MODULE_PARM(rnet_htable_size, "i", int, 0444,
+		"size of remote network hash table");
+
+/* Accessor for the 'routes' module parameter string. */
+char *lnet_get_routes(void)
+{
+	return routes;
+}
+
+/*
+ * Select the network specification LNet should start with.
+ *
+ * Returns NULL when both 'networks' and 'ip2nets' are set or when parsing
+ * 'ip2nets' fails.  Otherwise returns the parsed ip2nets result, the
+ * 'networks' string, or the built-in default "tcp".
+ */
+char *lnet_get_networks(void)
+{
+	char	*parsed;
+	int	have_networks = (*networks != 0);
+	int	have_ip2nets = (*ip2nets != 0);
+
+	if (have_networks && have_ip2nets) {
+		LCONSOLE_ERROR_MSG(0x101, "Please specify EITHER 'networks' or "
+				   "'ip2nets' but not both at once\n");
+		return NULL;
+	}
+
+	if (have_ip2nets) {
+		if (lnet_parse_ip2nets(&parsed, ip2nets) != 0)
+			return NULL;
+		return parsed;
+	}
+
+	return have_networks ? networks : "tcp";
+}
+
+/* Initialise the statically embedded locks and wait queue of the_lnet. */
+void lnet_init_locks(void)
+{
+	/* event-queue wait lock and its wait queue */
+	spin_lock_init(&the_lnet.ln_eq_wait_lock);
+	init_waitqueue_head(&the_lnet.ln_eq_waitq);
+
+	/* LND-list mutex and API mutex */
+	mutex_init(&the_lnet.ln_lnd_mutex);
+	mutex_init(&the_lnet.ln_api_mutex);
+}
+
+/* Counterpart of lnet_init_locks(); there is currently nothing to release. */
+void lnet_fini_locks(void)
+{
+}
+
+
+/*
+ * Allocate the remote-nets hash table and initialise every bucket head.
+ *
+ * Returns 0 on success or -ENOMEM when the allocation fails.
+ */
+static int lnet_create_remote_nets_table(void)
+{
+	struct list_head	*hash;
+	int			i = 0;
+
+	LASSERT(the_lnet.ln_remote_nets_hash == NULL);
+	LASSERT(the_lnet.ln_remote_nets_hbits > 0);
+
+	LIBCFS_ALLOC(hash, LNET_REMOTE_NETS_HASH_SIZE * sizeof(*hash));
+	if (hash == NULL) {
+		CERROR("Failed to create remote nets hash table\n");
+		return -ENOMEM;
+	}
+
+	while (i < LNET_REMOTE_NETS_HASH_SIZE) {
+		INIT_LIST_HEAD(&hash[i]);
+		i++;
+	}
+
+	/* publish the table only once every bucket is usable */
+	the_lnet.ln_remote_nets_hash = hash;
+	return 0;
+}
+
+/*
+ * Free the remote-nets hash table, if one exists.  Every bucket is asserted
+ * to be empty before the memory is released.
+ */
+static void lnet_destroy_remote_nets_table(void)
+{
+	struct list_head	*hash;	/* only used for sizeof() */
+	int			i = 0;
+
+	if (the_lnet.ln_remote_nets_hash == NULL)
+		return;
+
+	while (i < LNET_REMOTE_NETS_HASH_SIZE) {
+		LASSERT(list_empty(&the_lnet.ln_remote_nets_hash[i]));
+		i++;
+	}
+
+	LIBCFS_FREE(the_lnet.ln_remote_nets_hash,
+		    LNET_REMOTE_NETS_HASH_SIZE * sizeof(*hash));
+	the_lnet.ln_remote_nets_hash = NULL;
+}
+
+/*
+ * Release the per-CPT resource and net locks (whichever were allocated) and
+ * then run lnet_fini_locks().  Safe to call after a partial
+ * lnet_create_locks().
+ */
+static void lnet_destroy_locks(void)
+{
+	/* resource lock */
+	if (the_lnet.ln_res_lock != NULL) {
+		cfs_percpt_lock_free(the_lnet.ln_res_lock);
+		the_lnet.ln_res_lock = NULL;
+	}
+
+	/* net lock */
+	if (the_lnet.ln_net_lock != NULL) {
+		cfs_percpt_lock_free(the_lnet.ln_net_lock);
+		the_lnet.ln_net_lock = NULL;
+	}
+
+	lnet_fini_locks();
+}
+
+/*
+ * Initialise the embedded locks and allocate the two per-CPT locks.
+ *
+ * Returns 0 on success.  If either allocation fails, everything obtained so
+ * far is released and -ENOMEM is returned.
+ */
+static int lnet_create_locks(void)
+{
+	lnet_init_locks();
+
+	the_lnet.ln_res_lock = cfs_percpt_lock_alloc(lnet_cpt_table());
+	if (the_lnet.ln_res_lock != NULL) {
+		the_lnet.ln_net_lock = cfs_percpt_lock_alloc(lnet_cpt_table());
+		if (the_lnet.ln_net_lock != NULL)
+			return 0;
+	}
+
+	/* one of the allocations failed: undo whatever succeeded */
+	lnet_destroy_locks();
+	return -ENOMEM;
+}
+
+/*
+ * Compile-time verification of the LNet wire protocol: constants, structure
+ * sizes, and field offsets/sizes must match the values listed here.
+ */
+void lnet_assert_wire_constants(void)
+{
+	/* Wire protocol assertions generated by 'wirecheck'
+	 * running on Linux robert.bartonsoftware.com 2.6.8-1.521
+	 * #1 Mon Aug 16 09:01:18 EDT 2004 i686 athlon i386 GNU/Linux
+	 * with gcc version 3.3.3 20040412 (Red Hat Linux 3.3.3-7) */
+
+	/* Protocol constants */
+	CLASSERT(LNET_PROTO_TCP_MAGIC == 0xeebc0ded);
+	CLASSERT(LNET_PROTO_TCP_VERSION_MAJOR == 1);
+	CLASSERT(LNET_PROTO_TCP_VERSION_MINOR == 0);
+	CLASSERT(LNET_MSG_ACK == 0);
+	CLASSERT(LNET_MSG_PUT == 1);
+	CLASSERT(LNET_MSG_GET == 2);
+	CLASSERT(LNET_MSG_REPLY == 3);
+	CLASSERT(LNET_MSG_HELLO == 4);
+
+	/* Layout of lnet_handle_wire_t */
+	CLASSERT((int)sizeof(lnet_handle_wire_t) == 16);
+	CLASSERT((int)offsetof(lnet_handle_wire_t, wh_interface_cookie) == 0);
+	CLASSERT((int)sizeof(((lnet_handle_wire_t *)0)->wh_interface_cookie) == 8);
+	CLASSERT((int)offsetof(lnet_handle_wire_t, wh_object_cookie) == 8);
+	CLASSERT((int)sizeof(((lnet_handle_wire_t *)0)->wh_object_cookie) == 8);
+
+	/* Layout of lnet_magicversion_t */
+	CLASSERT((int)sizeof(lnet_magicversion_t) == 8);
+	CLASSERT((int)offsetof(lnet_magicversion_t, magic) == 0);
+	CLASSERT((int)sizeof(((lnet_magicversion_t *)0)->magic) == 4);
+	CLASSERT((int)offsetof(lnet_magicversion_t, version_major) == 4);
+	CLASSERT((int)sizeof(((lnet_magicversion_t *)0)->version_major) == 2);
+	CLASSERT((int)offsetof(lnet_magicversion_t, version_minor) == 6);
+	CLASSERT((int)sizeof(((lnet_magicversion_t *)0)->version_minor) == 2);
+
+	/* Layout of lnet_hdr_t */
+	CLASSERT((int)sizeof(lnet_hdr_t) == 72);
+	CLASSERT((int)offsetof(lnet_hdr_t, dest_nid) == 0);
+	CLASSERT((int)sizeof(((lnet_hdr_t *)0)->dest_nid) == 8);
+	CLASSERT((int)offsetof(lnet_hdr_t, src_nid) == 8);
+	CLASSERT((int)sizeof(((lnet_hdr_t *)0)->src_nid) == 8);
+	CLASSERT((int)offsetof(lnet_hdr_t, dest_pid) == 16);
+	CLASSERT((int)sizeof(((lnet_hdr_t *)0)->dest_pid) == 4);
+	CLASSERT((int)offsetof(lnet_hdr_t, src_pid) == 20);
+	CLASSERT((int)sizeof(((lnet_hdr_t *)0)->src_pid) == 4);
+	CLASSERT((int)offsetof(lnet_hdr_t, type) == 24);
+	CLASSERT((int)sizeof(((lnet_hdr_t *)0)->type) == 4);
+	CLASSERT((int)offsetof(lnet_hdr_t, payload_length) == 28);
+	CLASSERT((int)sizeof(((lnet_hdr_t *)0)->payload_length) == 4);
+	CLASSERT((int)offsetof(lnet_hdr_t, msg) == 32);
+	CLASSERT((int)sizeof(((lnet_hdr_t *)0)->msg) == 40);
+
+	/* msg.ack */
+	CLASSERT((int)offsetof(lnet_hdr_t, msg.ack.dst_wmd) == 32);
+	CLASSERT((int)sizeof(((lnet_hdr_t *)0)->msg.ack.dst_wmd) == 16);
+	CLASSERT((int)offsetof(lnet_hdr_t, msg.ack.match_bits) == 48);
+	CLASSERT((int)sizeof(((lnet_hdr_t *)0)->msg.ack.match_bits) == 8);
+	CLASSERT((int)offsetof(lnet_hdr_t, msg.ack.mlength) == 56);
+	CLASSERT((int)sizeof(((lnet_hdr_t *)0)->msg.ack.mlength) == 4);
+
+	/* msg.put */
+	CLASSERT((int)offsetof(lnet_hdr_t, msg.put.ack_wmd) == 32);
+	CLASSERT((int)sizeof(((lnet_hdr_t *)0)->msg.put.ack_wmd) == 16);
+	CLASSERT((int)offsetof(lnet_hdr_t, msg.put.match_bits) == 48);
+	CLASSERT((int)sizeof(((lnet_hdr_t *)0)->msg.put.match_bits) == 8);
+	CLASSERT((int)offsetof(lnet_hdr_t, msg.put.hdr_data) == 56);
+	CLASSERT((int)sizeof(((lnet_hdr_t *)0)->msg.put.hdr_data) == 8);
+	CLASSERT((int)offsetof(lnet_hdr_t, msg.put.ptl_index) == 64);
+	CLASSERT((int)sizeof(((lnet_hdr_t *)0)->msg.put.ptl_index) == 4);
+	CLASSERT((int)offsetof(lnet_hdr_t, msg.put.offset) == 68);
+	CLASSERT((int)sizeof(((lnet_hdr_t *)0)->msg.put.offset) == 4);
+
+	/* msg.get */
+	CLASSERT((int)offsetof(lnet_hdr_t, msg.get.return_wmd) == 32);
+	CLASSERT((int)sizeof(((lnet_hdr_t *)0)->msg.get.return_wmd) == 16);
+	CLASSERT((int)offsetof(lnet_hdr_t, msg.get.match_bits) == 48);
+	CLASSERT((int)sizeof(((lnet_hdr_t *)0)->msg.get.match_bits) == 8);
+	CLASSERT((int)offsetof(lnet_hdr_t, msg.get.ptl_index) == 56);
+	CLASSERT((int)sizeof(((lnet_hdr_t *)0)->msg.get.ptl_index) == 4);
+	CLASSERT((int)offsetof(lnet_hdr_t, msg.get.src_offset) == 60);
+	CLASSERT((int)sizeof(((lnet_hdr_t *)0)->msg.get.src_offset) == 4);
+	CLASSERT((int)offsetof(lnet_hdr_t, msg.get.sink_length) == 64);
+	CLASSERT((int)sizeof(((lnet_hdr_t *)0)->msg.get.sink_length) == 4);
+
+	/* msg.reply */
+	CLASSERT((int)offsetof(lnet_hdr_t, msg.reply.dst_wmd) == 32);
+	CLASSERT((int)sizeof(((lnet_hdr_t *)0)->msg.reply.dst_wmd) == 16);
+
+	/* msg.hello */
+	CLASSERT((int)offsetof(lnet_hdr_t, msg.hello.incarnation) == 32);
+	CLASSERT((int)sizeof(((lnet_hdr_t *)0)->msg.hello.incarnation) == 8);
+	CLASSERT((int)offsetof(lnet_hdr_t, msg.hello.type) == 40);
+	CLASSERT((int)sizeof(((lnet_hdr_t *)0)->msg.hello.type) == 4);
+}
+
+/*
+ * Look up a registered LND by its type number.
+ *
+ * Walks the_lnet.ln_lnds and returns the first entry whose lnd_type equals
+ * \a type, or NULL if there is none.  The caller holds the LND mutex.
+ */
+lnd_t *lnet_find_lnd_by_type(int type)
+{
+	lnd_t	*lnd;
+
+	list_for_each_entry(lnd, &the_lnet.ln_lnds, lnd_list) {
+		if ((int)lnd->lnd_type == type)
+			return lnd;
+	}
+
+	return NULL;
+}
+
+/*
+ * Add \a lnd to the list of registered LNDs with a zero reference count.
+ *
+ * Asserts that LNet is initialised, that the LND type is known, and that no
+ * LND of the same type is already registered.  Serialised by ln_lnd_mutex.
+ */
+void lnet_register_lnd(lnd_t *lnd)
+{
+	LNET_MUTEX_LOCK(&the_lnet.ln_lnd_mutex);
+
+	LASSERT(the_lnet.ln_init);
+	LASSERT(libcfs_isknown_lnd(lnd->lnd_type));
+	LASSERT(lnet_find_lnd_by_type(lnd->lnd_type) == NULL);
+
+	list_add_tail(&lnd->lnd_list, &the_lnet.ln_lnds);
+	lnd->lnd_refcount = 0;
+
+	CDEBUG(D_NET, "%s LND registered\n", libcfs_lnd2str(lnd->lnd_type));
+
+	LNET_MUTEX_UNLOCK(&the_lnet.ln_lnd_mutex);
+}
+EXPORT_SYMBOL(lnet_register_lnd);
+
+/*
+ * Remove \a lnd from the list of registered LNDs.
+ *
+ * Asserts that LNet is initialised, that \a lnd is the registered LND for its
+ * type, and that nothing references it.  Serialised by ln_lnd_mutex.
+ */
+void lnet_unregister_lnd(lnd_t *lnd)
+{
+	LNET_MUTEX_LOCK(&the_lnet.ln_lnd_mutex);
+
+	LASSERT(the_lnet.ln_init);
+	LASSERT(lnet_find_lnd_by_type(lnd->lnd_type) == lnd);
+	LASSERT(lnd->lnd_refcount == 0);
+
+	list_del(&lnd->lnd_list);
+	CDEBUG(D_NET, "%s LND unregistered\n", libcfs_lnd2str(lnd->lnd_type));
+
+	LNET_MUTEX_UNLOCK(&the_lnet.ln_lnd_mutex);
+}
+
+/*
+ * Sum the per-CPT LNet counters into \a counters.
+ *
+ * \a counters is zeroed first; the per-CPT values are then accumulated while
+ * holding the net lock exclusively.
+ */
+void
+lnet_counters_get(lnet_counters_t *counters)
+{
+	lnet_counters_t *ctr;
+	int		i;
+
+	memset(counters, 0, sizeof(*counters));
+
+	lnet_net_lock(LNET_LOCK_EX);
+
+	cfs_percpt_for_each(ctr, i, the_lnet.ln_counters) {
+		counters->msgs_max     += ctr->msgs_max;
+		counters->msgs_alloc   += ctr->msgs_alloc;
+		counters->errors       += ctr->errors;
+		counters->send_count   += ctr->send_count;
+		counters->recv_count   += ctr->recv_count;
+		counters->route_count  += ctr->route_count;
+		/* drop_count was previously never summed, while drop_length
+		 * was added twice */
+		counters->drop_count   += ctr->drop_count;
+		counters->send_length  += ctr->send_length;
+		counters->recv_length  += ctr->recv_length;
+		counters->route_length += ctr->route_length;
+		counters->drop_length  += ctr->drop_length;
+	}
+
+	lnet_net_unlock(LNET_LOCK_EX);
+}
+EXPORT_SYMBOL(lnet_counters_get);
+
+/* Zero every per-CPT counter block while holding the net lock exclusively. */
+void lnet_counters_reset(void)
+{
+	lnet_counters_t	*ctr;
+	int		cpt;
+
+	lnet_net_lock(LNET_LOCK_EX);
+
+	cfs_percpt_for_each(ctr, cpt, the_lnet.ln_counters) {
+		memset(ctr, 0, sizeof(*ctr));
+	}
+
+	lnet_net_unlock(LNET_LOCK_EX);
+}
+EXPORT_SYMBOL(lnet_counters_reset);
+
+#ifdef LNET_USE_LIB_FREELIST
+
+/*
+ * Build a freelist of \a n objects, each with \a size bytes of payload.
+ *
+ * One contiguous buffer is allocated; every object is zeroed and linked onto
+ * fl->fl_list.  Returns 0 on success or -ENOMEM if the buffer cannot be
+ * allocated.
+ */
+int lnet_freelist_init(lnet_freelist_t *fl, int n, int size)
+{
+	char *space;
+
+	LASSERT(n > 0);
+
+	/* each object carries the freelist header ahead of its contents */
+	size += offsetof(lnet_freeobj_t, fo_contents);
+
+	LIBCFS_ALLOC(space, n * size);
+	if (space == NULL)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&fl->fl_list);
+	fl->fl_objs = space;
+	fl->fl_nobjs = n;
+	fl->fl_objsize = size;
+
+	do {
+		memset(space, 0, size);
+		list_add((struct list_head *)space, &fl->fl_list);
+		space += size;
+	} while (--n != 0);
+
+	return 0;
+}
+
+/*
+ * Tear down a freelist created by lnet_freelist_init().
+ *
+ * Does nothing for a freelist with no objects.  Otherwise asserts that every
+ * object is back on the list, frees the backing buffer and zeroes \a fl.
+ */
+void lnet_freelist_fini(lnet_freelist_t *fl)
+{
+	struct list_head	*el;
+	int			count = 0;
+
+	if (fl->fl_nobjs == 0)
+		return;
+
+	list_for_each(el, &fl->fl_list)
+		count++;
+
+	LASSERT(count == fl->fl_nobjs);
+
+	LIBCFS_FREE(fl->fl_objs, fl->fl_nobjs * fl->fl_objsize);
+	memset(fl, 0, sizeof(*fl));
+}
+
+#endif /* LNET_USE_LIB_FREELIST */
+
+/*
+ * Build the interface cookie from the current time of day, expressed in
+ * microseconds.
+ *
+ * NB the interface cookie in wire handles guards against delayed replies and
+ * ACKs appearing valid after reboot, so initialisation time is probably
+ * easily good enough.
+ */
+__u64 lnet_create_interface_cookie(void)
+{
+	struct timeval	now;
+
+	do_gettimeofday(&now);
+
+	return (__u64)now.tv_sec * 1000000 + now.tv_usec;
+}
+
+/*
+ * Map a resource cookie type to a short printable name.  An unknown type
+ * triggers LBUG(); should that ever return, "MD" is reported.
+ */
+static char *lnet_res_type2str(int type)
+{
+	switch (type) {
+	case LNET_COOKIE_TYPE_ME:
+		return "ME";
+	case LNET_COOKIE_TYPE_EQ:
+		return "EQ";
+	case LNET_COOKIE_TYPE_MD:
+		return "MD";
+	default:
+		LBUG();
+		return "MD";
+	}
+}
+
+/*
+ * Release everything owned by a resource container.
+ *
+ * A container whose rec_type is 0 was never set up and is left alone.
+ * Any EQs or MDs still on the active list are freed and reported; active
+ * MEs are not expected here and trigger LBUG().  The handle hash (and the
+ * freelist, when built in) is released and rec_type is reset to 0.
+ */
+void lnet_res_container_cleanup(struct lnet_res_container *rec)
+{
+	int	leaked = 0;
+
+	if (rec->rec_type == 0) /* not set yet, it's uninitialized */
+		return;
+
+	while (!list_empty(&rec->rec_active)) {
+		struct list_head *entry = rec->rec_active.next;
+
+		list_del_init(entry);
+
+		switch (rec->rec_type) {
+		case LNET_COOKIE_TYPE_EQ:
+			lnet_eq_free(list_entry(entry, lnet_eq_t, eq_list));
+			break;
+		case LNET_COOKIE_TYPE_MD:
+			lnet_md_free(list_entry(entry, lnet_libmd_t, md_list));
+			break;
+		default:
+			/* NB: Active MEs should be attached on portals */
+			LBUG();
+			break;
+		}
+		leaked++;
+	}
+
+	/* Users should unlink/free every MD/ME/EQ before LNet is finalized;
+	 * if someone didn't, the leftovers were recycled above, so complain */
+	if (leaked > 0)
+		CERROR("%d active elements on exit of %s container\n",
+		       leaked, lnet_res_type2str(rec->rec_type));
+
+#ifdef LNET_USE_LIB_FREELIST
+	lnet_freelist_fini(&rec->rec_freelist);
+#endif
+	if (rec->rec_lh_hash != NULL) {
+		LIBCFS_FREE(rec->rec_lh_hash,
+			    LNET_LH_HASH_SIZE * sizeof(rec->rec_lh_hash[0]));
+		rec->rec_lh_hash = NULL;
+	}
+
+	rec->rec_type = 0; /* mark it as finalized */
+}
+
+/*
+ * Set up a resource container of the given \a type for CPT \a cpt.
+ *
+ * Initialises the active list, seeds the handle cookie from \a cpt and
+ * \a type, and allocates the handle hash table.  \a objnum and \a objsz are
+ * only consumed when LNET_USE_LIB_FREELIST is defined.
+ *
+ * Returns 0 on success.  On failure the container is cleaned up again and a
+ * negative errno is returned.
+ */
+int lnet_res_container_setup(struct lnet_res_container *rec,
+			     int cpt, int type, int objnum, int objsz)
+{
+	int	rc = 0;
+	int	i;
+
+	LASSERT(rec->rec_type == 0);
+
+	INIT_LIST_HEAD(&rec->rec_active);
+	rec->rec_type = type;
+
+#ifdef LNET_USE_LIB_FREELIST
+	memset(&rec->rec_freelist, 0, sizeof(rec->rec_freelist));
+	rc = lnet_freelist_init(&rec->rec_freelist, objnum, objsz);
+	if (rc != 0)
+		goto failed;
+#endif
+	rec->rec_lh_cookie = (cpt << LNET_COOKIE_TYPE_BITS) | type;
+
+	/* Arbitrary choice of hash table size */
+	LIBCFS_CPT_ALLOC(rec->rec_lh_hash, lnet_cpt_table(), cpt,
+			 LNET_LH_HASH_SIZE * sizeof(rec->rec_lh_hash[0]));
+	if (rec->rec_lh_hash == NULL) {
+		rc = -ENOMEM;
+		goto failed;
+	}
+
+	i = 0;
+	while (i < LNET_LH_HASH_SIZE) {
+		INIT_LIST_HEAD(&rec->rec_lh_hash[i]);
+		i++;
+	}
+
+	return 0;
+
+failed:
+	CERROR("Failed to setup %s resource container\n",
+	       lnet_res_type2str(type));
+	lnet_res_container_cleanup(rec);
+	return rc;
+}
+
+/* Clean up each per-CPT container in \a recs, then free the array itself. */
+static void lnet_res_containers_destroy(struct lnet_res_container **recs)
+{
+	struct lnet_res_container	*container;
+	int				cpt;
+
+	cfs_percpt_for_each(container, cpt, recs) {
+		lnet_res_container_cleanup(container);
+	}
+
+	cfs_percpt_free(recs);
+}
+
+/*
+ * Allocate one resource container per CPT and set each of them up for
+ * \a type.
+ *
+ * Returns the per-CPT array, or NULL if the allocation or any single setup
+ * fails (in which case everything created so far is destroyed).
+ */
+static struct lnet_res_container **
+lnet_res_containers_create(int type, int objnum, int objsz)
+{
+	struct lnet_res_container	**recs;
+	struct lnet_res_container	*container;
+	int				cpt;
+
+	recs = cfs_percpt_alloc(lnet_cpt_table(),
+				sizeof(struct lnet_res_container));
+	if (recs == NULL) {
+		CERROR("Failed to allocate %s resource containers\n",
+		       lnet_res_type2str(type));
+		return NULL;
+	}
+
+	cfs_percpt_for_each(container, cpt, recs) {
+		if (lnet_res_container_setup(container, cpt, type,
+					     objnum, objsz) == 0)
+			continue;
+
+		lnet_res_containers_destroy(recs);
+		return NULL;
+	}
+
+	return recs;
+}
+
+/*
+ * Find the handle with the given \a cookie in container \a rec.
+ *
+ * Returns NULL if the type bits of the cookie do not match the container or
+ * if no handle with that cookie is hashed.  ALWAYS called with lnet_res_lock
+ * held.
+ */
+lnet_libhandle_t *
+lnet_res_lh_lookup(struct lnet_res_container *rec, __u64 cookie)
+{
+	struct list_head	*bucket;
+	lnet_libhandle_t	*handle;
+	unsigned int		hash;
+
+	if ((cookie & LNET_COOKIE_MASK) != rec->rec_type)
+		return NULL;
+
+	/* drop the type and CPT bits, then fold into the table */
+	hash = cookie >> (LNET_COOKIE_TYPE_BITS + LNET_CPT_BITS);
+	bucket = &rec->rec_lh_hash[hash & LNET_LH_HASH_MASK];
+
+	list_for_each_entry(handle, bucket, lh_hash_chain) {
+		if (handle->lh_cookie == cookie)
+			return handle;
+	}
+
+	return NULL;
+}
+
+/*
+ * Give \a lh the container's next cookie and hash it into the container.
+ *
+ * The container's cookie is advanced by one step above the type and CPT
+ * bits, so those low bits are left untouched.  ALWAYS called with
+ * lnet_res_lock held.
+ */
+void
+lnet_res_lh_initialize(struct lnet_res_container *rec, lnet_libhandle_t *lh)
+{
+	unsigned int	shift = LNET_COOKIE_TYPE_BITS + LNET_CPT_BITS;
+	unsigned int	bucket;
+
+	lh->lh_cookie = rec->rec_lh_cookie;
+	rec->rec_lh_cookie += 1 << shift;
+
+	bucket = (lh->lh_cookie >> shift) & LNET_LH_HASH_MASK;
+	list_add(&lh->lh_hash_chain, &rec->rec_lh_hash[bucket]);
+}
+
+
+int lnet_unprepare(void);
+
+/*
+ * Prepare to bring up the network: reset the global lists and allocate the
+ * remote-nets table, counters, peer tables, message containers, EQ/ME/MD
+ * resource containers and portals.
+ *
+ * \param requested_pid PID to record in the_lnet; must not carry
+ *			LNET_PID_USERFLAG.
+ *
+ * \return 0 on success.  On any failure everything set up so far is undone
+ * through lnet_unprepare() and a negative errno is returned.
+ */
+int
+lnet_prepare(lnet_pid_t requested_pid)
+{
+	struct lnet_res_container **recs;
+	int			  rc = 0;
+
+	LASSERT (the_lnet.ln_refcount == 0);
+
+	the_lnet.ln_routing = 0;
+
+	LASSERT ((requested_pid & LNET_PID_USERFLAG) == 0);
+	the_lnet.ln_pid = requested_pid;
+
+	INIT_LIST_HEAD(&the_lnet.ln_test_peers);
+	INIT_LIST_HEAD(&the_lnet.ln_nis);
+	INIT_LIST_HEAD(&the_lnet.ln_nis_cpt);
+	INIT_LIST_HEAD(&the_lnet.ln_nis_zombie);
+	INIT_LIST_HEAD(&the_lnet.ln_routers);
+
+	rc = lnet_create_remote_nets_table();
+	if (rc != 0)
+		goto failed;
+
+	the_lnet.ln_interface_cookie = lnet_create_interface_cookie();
+
+	the_lnet.ln_counters = cfs_percpt_alloc(lnet_cpt_table(),
+						sizeof(lnet_counters_t));
+	if (the_lnet.ln_counters == NULL) {
+		CERROR("Failed to allocate counters for LNet\n");
+		rc = -ENOMEM;
+		goto failed;
+	}
+
+	rc = lnet_peer_tables_create();
+	if (rc != 0)
+		goto failed;
+
+	rc = lnet_msg_containers_create();
+	if (rc != 0)
+		goto failed;
+
+	rc = lnet_res_container_setup(&the_lnet.ln_eq_container, 0,
+				      LNET_COOKIE_TYPE_EQ, LNET_FL_MAX_EQS,
+				      sizeof(lnet_eq_t));
+	if (rc != 0)
+		goto failed;
+
+	/* lnet_res_containers_create() only reports NULL on failure, so an
+	 * error code must be set here; otherwise rc would still be 0 and the
+	 * caller would see success after everything was torn down */
+	recs = lnet_res_containers_create(LNET_COOKIE_TYPE_ME, LNET_FL_MAX_MES,
+					  sizeof(lnet_me_t));
+	if (recs == NULL) {
+		rc = -ENOMEM;
+		goto failed;
+	}
+
+	the_lnet.ln_me_containers = recs;
+
+	recs = lnet_res_containers_create(LNET_COOKIE_TYPE_MD, LNET_FL_MAX_MDS,
+					  sizeof(lnet_libmd_t));
+	if (recs == NULL) {
+		rc = -ENOMEM;
+		goto failed;
+	}
+
+	the_lnet.ln_md_containers = recs;
+
+	rc = lnet_portals_create();
+	if (rc != 0) {
+		CERROR("Failed to create portals for LNet: %d\n", rc);
+		goto failed;
+	}
+
+	return 0;
+
+ failed:
+	lnet_unprepare();
+	return rc;
+}
+
+/*
+ * Undo lnet_prepare(): destroy portals, the MD/ME/EQ containers, message
+ * containers, peer tables, router pools, counters and the remote-nets table.
+ * Always returns 0.
+ *
+ * NB no LNET_LOCK since this is the last reference.  All LND instances have
+ * shut down already, so it is safe to unlink and free all descriptors, even
+ * those that appear committed to a network op (eg MD with non-zero pending
+ * count).
+ */
+int lnet_unprepare(void)
+{
+	lnet_fail_nid(LNET_NID_ANY, 0);
+
+	LASSERT(the_lnet.ln_refcount == 0);
+	LASSERT(list_empty(&the_lnet.ln_test_peers));
+	LASSERT(list_empty(&the_lnet.ln_nis));
+	LASSERT(list_empty(&the_lnet.ln_nis_cpt));
+	LASSERT(list_empty(&the_lnet.ln_nis_zombie));
+
+	lnet_portals_destroy();
+
+	/* resource containers: MD, then ME, then the single EQ container */
+	if (the_lnet.ln_md_containers != NULL) {
+		lnet_res_containers_destroy(the_lnet.ln_md_containers);
+		the_lnet.ln_md_containers = NULL;
+	}
+	if (the_lnet.ln_me_containers != NULL) {
+		lnet_res_containers_destroy(the_lnet.ln_me_containers);
+		the_lnet.ln_me_containers = NULL;
+	}
+	lnet_res_container_cleanup(&the_lnet.ln_eq_container);
+
+	lnet_msg_containers_destroy();
+	lnet_peer_tables_destroy();
+	lnet_rtrpools_free();
+
+	if (the_lnet.ln_counters != NULL) {
+		cfs_percpt_free(the_lnet.ln_counters);
+		the_lnet.ln_counters = NULL;
+	}
+
+	lnet_destroy_remote_nets_table();
+
+	return 0;
+}
+
+/*
+ * Find the first NI on the_lnet.ln_nis whose NID belongs to network \a net.
+ *
+ * On a match a reference is taken on the NI (for \a cpt) before it is
+ * returned; NULL is returned when no NI matches.  \a cpt must not be
+ * LNET_LOCK_EX.
+ */
+lnet_ni_t *
+lnet_net2ni_locked(__u32 net, int cpt)
+{
+	lnet_ni_t	*ni;
+
+	LASSERT(cpt != LNET_LOCK_EX);
+
+	list_for_each_entry(ni, &the_lnet.ln_nis, ni_list) {
+		if (LNET_NIDNET(ni->ni_nid) != net)
+			continue;
+
+		lnet_ni_addref_locked(ni, cpt);
+		return ni;
+	}
+
+	return NULL;
+}
+
+/*
+ * Locking wrapper around lnet_net2ni_locked(): takes net lock 0, performs
+ * the lookup for CPT 0 and releases the lock again.
+ */
+lnet_ni_t *lnet_net2ni(__u32 net)
+{
+	lnet_ni_t *found;
+
+	lnet_net_lock(0);
+	found = lnet_net2ni_locked(net, 0);
+	lnet_net_unlock(0);
+
+	return found;
+}
+EXPORT_SYMBOL(lnet_net2ni);
+
+/*
+ * Hash \a nid onto one of \a number slots (1 <= number <= LNET_CPT_NUMBER).
+ *
+ * With a single slot the answer is always 0.  Otherwise the LNET_CPT_BITS-bit
+ * hash is used directly when it is already in range, and folded with a
+ * modulo when it is not.
+ */
+static unsigned int lnet_nid_cpt_hash(lnet_nid_t nid, unsigned int number)
+{
+	__u64		key = nid;
+	unsigned int	val;
+
+	LASSERT(number >= 1 && number <= LNET_CPT_NUMBER);
+
+	if (number == 1)
+		return 0;
+
+	val = cfs_hash_long(key, LNET_CPT_BITS);
+
+	/* NB: LNET_CPT_NUMBER doesn't have to be a power of two */
+	if (val >= number)
+		return (unsigned int)(key + val + (val >> 1)) % number;
+
+	return val;
+}
+
+/*
+ * Work out which CPT handles \a nid.
+ *
+ * If an NI on ln_nis_cpt serves the NID's network, the CPT is picked from
+ * that NI's own CPT list; otherwise the NID is hashed over all CPTs.
+ * Must be called with lnet_net_lock held (any CPT is sufficient).
+ */
+int lnet_cpt_of_nid_locked(lnet_nid_t nid)
+{
+	struct lnet_ni *ni;
+
+	if (LNET_CPT_NUMBER == 1)
+		return 0; /* the only one */
+
+	/* iterating an empty list is a no-op, so no separate emptiness test */
+	list_for_each_entry(ni, &the_lnet.ln_nis_cpt, ni_cptlist) {
+		if (LNET_NIDNET(ni->ni_nid) == LNET_NIDNET(nid)) {
+			LASSERT(ni->ni_cpts != NULL);
+			return ni->ni_cpts[lnet_nid_cpt_hash(nid,
+							     ni->ni_ncpts)];
+		}
+	}
+
+	return lnet_nid_cpt_hash(nid, LNET_CPT_NUMBER);
+}
+
+/*
+ * Unlocked entry point for mapping \a nid to a CPT.
+ *
+ * The net lock is only taken when ln_nis_cpt is non-empty, i.e. when
+ * lnet_cpt_of_nid_locked() has a list to walk; otherwise the NID is hashed
+ * directly.
+ */
+int lnet_cpt_of_nid(lnet_nid_t nid)
+{
+	int	locked_cpt;
+	int	result;
+
+	if (LNET_CPT_NUMBER == 1)
+		return 0; /* the only one */
+
+	if (!list_empty(&the_lnet.ln_nis_cpt)) {
+		locked_cpt = lnet_net_lock_current();
+		result = lnet_cpt_of_nid_locked(nid);
+		lnet_net_unlock(locked_cpt);
+		return result;
+	}
+
+	return lnet_nid_cpt_hash(nid, LNET_CPT_NUMBER);
+}
+EXPORT_SYMBOL(lnet_cpt_of_nid);
+
+/*
+ * Return non-zero if some NI is attached to network \a net.  The reference
+ * taken by the lookup is dropped again before the lock is released.
+ */
+int lnet_islocalnet(__u32 net)
+{
+	struct lnet_ni	*ni;
+	int		cpt;
+	int		found;
+
+	cpt = lnet_net_lock_current();
+
+	ni = lnet_net2ni_locked(net, cpt);
+	found = (ni != NULL);
+	if (found)
+		lnet_ni_decref_locked(ni, cpt);
+
+	lnet_net_unlock(cpt);
+
+	return found;
+}
+
+/*
+ * Find the NI on the_lnet.ln_nis whose NID is exactly \a nid.
+ *
+ * On a match a reference is taken on the NI (for \a cpt) before it is
+ * returned; NULL is returned when no NI matches.  \a cpt must not be
+ * LNET_LOCK_EX.
+ */
+lnet_ni_t *
+lnet_nid2ni_locked(lnet_nid_t nid, int cpt)
+{
+	struct lnet_ni	*ni;
+
+	LASSERT(cpt != LNET_LOCK_EX);
+
+	list_for_each_entry(ni, &the_lnet.ln_nis, ni_list) {
+		if (ni->ni_nid != nid)
+			continue;
+
+		lnet_ni_addref_locked(ni, cpt);
+		return ni;
+	}
+
+	return NULL;
+}
+
+/*
+ * Return non-zero if \a nid is the NID of one of our NIs.  The reference
+ * taken by the lookup is dropped again before the lock is released.
+ */
+int lnet_islocalnid(lnet_nid_t nid)
+{
+	struct lnet_ni	*ni;
+	int		cpt;
+	int		found;
+
+	cpt = lnet_net_lock_current();
+
+	ni = lnet_nid2ni_locked(nid, cpt);
+	found = (ni != NULL);
+	if (found)
+		lnet_ni_decref_locked(ni, cpt);
+
+	lnet_net_unlock(cpt);
+
+	return found;
+}
+
+/*
+ * Return the number of NIs that need the acceptor, i.e. whose LND provides
+ * an lnd_accept method.
+ */
+int lnet_count_acceptor_nis(void)
+{
+	struct lnet_ni	*ni;
+	int		needed = 0;
+	int		cpt;
+
+	cpt = lnet_net_lock_current();
+
+	list_for_each_entry(ni, &the_lnet.ln_nis, ni_list) {
+		if (ni->ni_lnd->lnd_accept != NULL)
+			needed++;
+	}
+
+	lnet_net_unlock(cpt);
+
+	return needed;
+}
+
+/*
+ * Compute the TX credits for each per-CPT queue of \a ni.
+ *
+ * An NI bound to a single CPT gets all of ni_maxtxcredits.  Otherwise the
+ * credits are divided evenly across the NI's CPTs, raised to at least
+ * 8 * ni_peertxcredits, and finally capped at ni_maxtxcredits.
+ */
+static int lnet_ni_tq_credits(lnet_ni_t *ni)
+{
+	int	share;
+
+	LASSERT(ni->ni_ncpts >= 1);
+
+	if (ni->ni_ncpts == 1)
+		return ni->ni_maxtxcredits;
+
+	share = ni->ni_maxtxcredits / ni->ni_ncpts;
+
+	/* lower bound first, then the upper bound wins */
+	if (share < 8 * ni->ni_peertxcredits)
+		share = 8 * ni->ni_peertxcredits;
+	if (share > ni->ni_maxtxcredits)
+		share = ni->ni_maxtxcredits;
+
+	return share;
+}
+
+/*
+ * Shut down every LND network interface.
+ *
+ * All NIs are moved from ln_nis to ln_nis_zombie and the references held by
+ * the global lists and the cached eqwait/loopback pointers are dropped.
+ * Lazy portals and the peer tables are then cleared.  Finally each zombie NI
+ * is polled until all of its per-CPT reference counts are zero, at which
+ * point the LND's shutdown method is called and the NI is freed.
+ *
+ * NB called holding the global mutex.
+ */
+void
+lnet_shutdown_lndnis (void)
+{
+	int		i;
+	int		islo;
+	lnet_ni_t	 *ni;
+
+	/* All quiet on the API front */
+	LASSERT(!the_lnet.ln_shutdown);
+	LASSERT(the_lnet.ln_refcount == 0);
+	LASSERT(list_empty(&the_lnet.ln_nis_zombie));
+
+	lnet_net_lock(LNET_LOCK_EX);
+	the_lnet.ln_shutdown = 1;	/* flag shutdown */
+
+	/* Unlink NIs from the global table */
+	while (!list_empty(&the_lnet.ln_nis)) {
+		ni = list_entry(the_lnet.ln_nis.next,
+				    lnet_ni_t, ni_list);
+		/* move it to zombie list and nobody can find it anymore */
+		list_move(&ni->ni_list, &the_lnet.ln_nis_zombie);
+		lnet_ni_decref_locked(ni, 0);	/* drop ln_nis' ref */
+
+		if (!list_empty(&ni->ni_cptlist)) {
+			list_del_init(&ni->ni_cptlist);
+			lnet_ni_decref_locked(ni, 0);
+		}
+	}
+
+	/* Drop the cached eqwait NI. */
+	if (the_lnet.ln_eq_waitni != NULL) {
+		lnet_ni_decref_locked(the_lnet.ln_eq_waitni, 0);
+		the_lnet.ln_eq_waitni = NULL;
+	}
+
+	/* Drop the cached loopback NI. */
+	if (the_lnet.ln_loni != NULL) {
+		lnet_ni_decref_locked(the_lnet.ln_loni, 0);
+		the_lnet.ln_loni = NULL;
+	}
+
+	lnet_net_unlock(LNET_LOCK_EX);
+
+	/* Clear lazy portals and drop delayed messages which hold refs
+	 * on their lnet_msg_t::msg_rxpeer */
+	for (i = 0; i < the_lnet.ln_nportals; i++)
+		LNetClearLazyPortal(i);
+
+	/* Clear the peer table and wait for all peers to go (they hold refs on
+	 * their NIs) */
+	lnet_peer_tables_cleanup();
+
+	lnet_net_lock(LNET_LOCK_EX);
+	/* Now wait for the NIs I just nuked to become unreferenced on
+	 * ln_nis_zombie and shut them down in guaranteed thread context */
+	i = 2;
+	while (!list_empty(&the_lnet.ln_nis_zombie)) {
+		int	*ref;
+		int	j;
+
+		ni = list_entry(the_lnet.ln_nis_zombie.next,
+				    lnet_ni_t, ni_list);
+		list_del_init(&ni->ni_list);
+		cfs_percpt_for_each(ref, j, ni->ni_refs) {
+			if (*ref == 0)
+				continue;
+			/* still busy, add it back to zombie list */
+			list_add(&ni->ni_list, &the_lnet.ln_nis_zombie);
+			break;
+		}
+
+		/* This must be 'if', not 'while': nothing in this branch
+		 * unlinks the NI again, so looping here would never re-read
+		 * the reference counts and would spin forever.  'continue'
+		 * restarts the outer scan, which re-checks this NI. */
+		if (!list_empty(&ni->ni_list)) {
+			lnet_net_unlock(LNET_LOCK_EX);
+			++i;
+			/* only log when i is a power of two, so the warning
+			 * becomes progressively rarer */
+			if ((i & (-i)) == i) {
+				CDEBUG(D_WARNING,
+				       "Waiting for zombie LNI %s\n",
+				       libcfs_nid2str(ni->ni_nid));
+			}
+			cfs_pause(cfs_time_seconds(1));
+			lnet_net_lock(LNET_LOCK_EX);
+			continue;
+		}
+
+		ni->ni_lnd->lnd_refcount--;
+		lnet_net_unlock(LNET_LOCK_EX);
+
+		islo = ni->ni_lnd->lnd_type == LOLND;
+
+		LASSERT (!in_interrupt ());
+		(ni->ni_lnd->lnd_shutdown)(ni);
+
+		/* can't deref lnd anymore now; it might have unregistered
+		 * itself...  */
+
+		if (!islo)
+			CDEBUG(D_LNI, "Removed LNI %s\n",
+			       libcfs_nid2str(ni->ni_nid));
+
+		lnet_ni_free(ni);
+		lnet_net_lock(LNET_LOCK_EX);
+	}
+
+	the_lnet.ln_shutdown = 0;
+	lnet_net_unlock(LNET_LOCK_EX);
+
+	if (the_lnet.ln_network_tokens != NULL) {
+		LIBCFS_FREE(the_lnet.ln_network_tokens,
+			    the_lnet.ln_network_tokens_nob);
+		the_lnet.ln_network_tokens = NULL;
+	}
+}
+
+/*
+ * Parse the configured networks and start one NI per network.
+ *
+ * For each parsed NI the matching LND is looked up (requesting its module
+ * if it is not registered yet), the LND's startup method is called, and the
+ * NI is added to ln_nis (and to ln_nis_cpt when it has a CPT list).  The
+ * loopback NI is cached in the_lnet.ln_loni; every other NI has its per-CPT
+ * TX queues seeded with credits.
+ *
+ * \return 0 on success.  On any failure all NIs started so far are shut
+ * down, the remaining parsed NIs are freed and -ENETDOWN is returned.
+ */
+int
+lnet_startup_lndnis (void)
+{
+	lnd_t			*lnd;
+	struct lnet_ni		*ni;
+	struct lnet_tx_queue	*tq;
+	struct list_head		nilist;
+	int			i;
+	int		rc = 0;
+	int		lnd_type;
+	int		nicount = 0;
+	char	      *nets = lnet_get_networks();
+
+	INIT_LIST_HEAD(&nilist);
+
+	if (nets == NULL)
+		goto failed;
+
+	rc = lnet_parse_networks(&nilist, nets);
+	if (rc != 0)
+		goto failed;
+
+	while (!list_empty(&nilist)) {
+		ni = list_entry(nilist.next, lnet_ni_t, ni_list);
+		lnd_type = LNET_NETTYP(LNET_NIDNET(ni->ni_nid));
+
+		LASSERT (libcfs_isknown_lnd(lnd_type));
+
+		if (lnd_type == CIBLND    ||
+		    lnd_type == OPENIBLND ||
+		    lnd_type == IIBLND    ||
+		    lnd_type == VIBLND) {
+			CERROR("LND %s obsoleted\n",
+			       libcfs_lnd2str(lnd_type));
+			goto failed;
+		}
+
+		LNET_MUTEX_LOCK(&the_lnet.ln_lnd_mutex);
+		lnd = lnet_find_lnd_by_type(lnd_type);
+
+		if (lnd == NULL) {
+			/* drop the mutex while the module loads, then look
+			 * the LND up again */
+			LNET_MUTEX_UNLOCK(&the_lnet.ln_lnd_mutex);
+			rc = request_module("%s",
+						libcfs_lnd2modname(lnd_type));
+			LNET_MUTEX_LOCK(&the_lnet.ln_lnd_mutex);
+
+			lnd = lnet_find_lnd_by_type(lnd_type);
+			if (lnd == NULL) {
+				LNET_MUTEX_UNLOCK(&the_lnet.ln_lnd_mutex);
+				CERROR("Can't load LND %s, module %s, rc=%d\n",
+				       libcfs_lnd2str(lnd_type),
+				       libcfs_lnd2modname(lnd_type), rc);
+				goto failed;
+			}
+		}
+
+		lnet_net_lock(LNET_LOCK_EX);
+		lnd->lnd_refcount++;
+		lnet_net_unlock(LNET_LOCK_EX);
+
+		ni->ni_lnd = lnd;
+
+		rc = (lnd->lnd_startup)(ni);
+
+		LNET_MUTEX_UNLOCK(&the_lnet.ln_lnd_mutex);
+
+		if (rc != 0) {
+			LCONSOLE_ERROR_MSG(0x105, "Error %d starting up LNI %s"
+					   "\n",
+					   rc, libcfs_lnd2str(lnd->lnd_type));
+			lnet_net_lock(LNET_LOCK_EX);
+			lnd->lnd_refcount--;
+			lnet_net_unlock(LNET_LOCK_EX);
+			goto failed;
+		}
+
+		LASSERT (ni->ni_peertimeout <= 0 || lnd->lnd_query != NULL);
+
+		list_del(&ni->ni_list);
+
+		lnet_net_lock(LNET_LOCK_EX);
+		/* refcount for ln_nis */
+		lnet_ni_addref_locked(ni, 0);
+		list_add_tail(&ni->ni_list, &the_lnet.ln_nis);
+		if (ni->ni_cpts != NULL) {
+			/* refcount for ln_nis_cpt */
+			list_add_tail(&ni->ni_cptlist,
+					  &the_lnet.ln_nis_cpt);
+			lnet_ni_addref_locked(ni, 0);
+		}
+
+		lnet_net_unlock(LNET_LOCK_EX);
+
+		if (lnd->lnd_type == LOLND) {
+			/* extra reference for the cached loopback pointer */
+			lnet_ni_addref(ni);
+			LASSERT (the_lnet.ln_loni == NULL);
+			the_lnet.ln_loni = ni;
+			continue;
+		}
+
+		if (ni->ni_peertxcredits == 0 ||
+		    ni->ni_maxtxcredits == 0) {
+			/* say "per-peer " when the per-peer credits are the
+			 * ones missing (the selection used to be inverted) */
+			LCONSOLE_ERROR_MSG(0x107, "LNI %s has no %scredits\n",
+					   libcfs_lnd2str(lnd->lnd_type),
+					   ni->ni_peertxcredits == 0 ?
+					   "per-peer " : "");
+			goto failed;
+		}
+
+		cfs_percpt_for_each(tq, i, ni->ni_tx_queues) {
+			tq->tq_credits_min =
+			tq->tq_credits_max =
+			tq->tq_credits = lnet_ni_tq_credits(ni);
+		}
+
+		CDEBUG(D_LNI, "Added LNI %s [%d/%d/%d/%d]\n",
+		       libcfs_nid2str(ni->ni_nid), ni->ni_peertxcredits,
+		       lnet_ni_tq_credits(ni) * LNET_CPT_NUMBER,
+		       ni->ni_peerrtrcredits, ni->ni_peertimeout);
+
+		nicount++;
+	}
+
+	if (the_lnet.ln_eq_waitni != NULL && nicount > 1) {
+		lnd_type = the_lnet.ln_eq_waitni->ni_lnd->lnd_type;
+		LCONSOLE_ERROR_MSG(0x109, "LND %s can only run single-network"
+				   "\n",
+				   libcfs_lnd2str(lnd_type));
+		goto failed;
+	}
+
+	return 0;
+
+ failed:
+	/* stop whatever was started, then free NIs that never started */
+	lnet_shutdown_lndnis();
+
+	while (!list_empty(&nilist)) {
+		ni = list_entry(nilist.next, lnet_ni_t, ni_list);
+		list_del(&ni->ni_list);
+		lnet_ni_free(ni);
+	}
+
+	return -ENETDOWN;
+}
+
+/**
+ * Initialize LNet library.
+ *
+ * Userspace programs must call this themselves; in the kernel it is called
+ * automatically at module loading time. It must be called exactly once, and
+ * LNetFini() has to be called afterwards if and only if this call
+ * returned 0.
+ *
+ * \return 0 on success, and -ve on failures.
+ */
+int LNetInit(void)
+{
+	int	rc;
+
+	lnet_assert_wire_constants();
+	LASSERT(!the_lnet.ln_init);
+
+	memset(&the_lnet, 0, sizeof(the_lnet));
+
+	/* refer to global cfs_cpt_table for now */
+	the_lnet.ln_cpt_table = cfs_cpt_table;
+	the_lnet.ln_cpt_number = cfs_cpt_number(cfs_cpt_table);
+
+	LASSERT(the_lnet.ln_cpt_number > 0);
+	if (the_lnet.ln_cpt_number > LNET_CPT_MAX) {
+		/* we are under risk of consuming all lh_cookie */
+		CERROR("Can't have %d CPTs for LNet (max allowed is %d), "
+		       "please change setting of CPT-table and retry\n",
+		       the_lnet.ln_cpt_number, LNET_CPT_MAX);
+		return -1;
+	}
+
+	/* smallest number of bits able to index every CPT */
+	while ((1 << the_lnet.ln_cpt_bits) < the_lnet.ln_cpt_number)
+		the_lnet.ln_cpt_bits++;
+
+	rc = lnet_create_locks();
+	if (rc != 0) {
+		CERROR("Can't create LNet global locks: %d\n", rc);
+		return -1;
+	}
+
+	the_lnet.ln_refcount = 0;
+	the_lnet.ln_init = 1;
+	LNetInvalidateHandle(&the_lnet.ln_rc_eqh);
+	INIT_LIST_HEAD(&the_lnet.ln_lnds);
+	INIT_LIST_HEAD(&the_lnet.ln_rcd_zombie);
+	INIT_LIST_HEAD(&the_lnet.ln_rcd_deathrow);
+
+	/* Sanitise the requested remote-nets table size, then derive the
+	 * number of hash bits from it: one less than order_base_2() of the
+	 * size (better to under estimate than over so we don't waste
+	 * memory), but never fewer than 1. */
+	if (rnet_htable_size <= 0)
+		rnet_htable_size = LNET_REMOTE_NETS_HASH_DEFAULT;
+	else if (rnet_htable_size > LNET_REMOTE_NETS_HASH_MAX)
+		rnet_htable_size = LNET_REMOTE_NETS_HASH_MAX;
+
+	the_lnet.ln_remote_nets_hbits =
+		max_t(int, 1, order_base_2(rnet_htable_size) - 1);
+
+	/* All LNDs apart from the LOLND are in separate modules.  They
+	 * register themselves when their module loads, and unregister
+	 * themselves when their module is unloaded. */
+	lnet_register_lnd(&the_lolnd);
+
+	return 0;
+}
+EXPORT_SYMBOL(LNetInit);
+
+/**
+ * Finalize LNet library.
+ *
+ * Only userspace programs need to call this function, and they may call it
+ * at most once. Every LND still registered is unregistered and the global
+ * locks are destroyed.
+ *
+ * \pre LNetInit() called with success.
+ * \pre All LNet users called LNetNIFini() for matching LNetNIInit() calls.
+ */
+void LNetFini(void)
+{
+	LASSERT(the_lnet.ln_init);
+	LASSERT(the_lnet.ln_refcount == 0);
+
+	while (!list_empty(&the_lnet.ln_lnds)) {
+		lnd_t *lnd = list_entry(the_lnet.ln_lnds.next,
+					lnd_t, lnd_list);
+
+		lnet_unregister_lnd(lnd);
+	}
+
+	lnet_destroy_locks();
+
+	the_lnet.ln_init = 0;
+}
+EXPORT_SYMBOL(LNetFini);
+
+/**
+ * Set LNet PID and start LNet interfaces, routing, and forwarding.
+ *
+ * Userspace program should call this after a successful call to LNetInit().
+ * Users must call this function at least once before any other functions.
+ * For each successful call there must be a corresponding call to
+ * LNetNIFini(). For subsequent calls to LNetNIInit(), \a requested_pid is
+ * ignored.
+ *
+ * The PID used by LNet may be different from the one requested.
+ * See LNetGetId().
+ *
+ * \param requested_pid PID requested by the caller.
+ *
+ * \return >= 0 on success, and < 0 error code on failures.
+ */
+int
+LNetNIInit(lnet_pid_t requested_pid)
+{
+	int	 im_a_router = 0;
+	int	 rc;
+
+	/* the whole start-up sequence runs under the API mutex */
+	LNET_MUTEX_LOCK(&the_lnet.ln_api_mutex);
+
+	LASSERT (the_lnet.ln_init);
+	CDEBUG(D_OTHER, "refs %d\n", the_lnet.ln_refcount);
+
+	/* Already running: just take another reference.  NB the value
+	 * returned is the reference count _before_ the increment, so it is
+	 * > 0 here; \a requested_pid is not looked at on this path. */
+	if (the_lnet.ln_refcount > 0) {
+		rc = the_lnet.ln_refcount++;
+		goto out;
+	}
+
+	lnet_get_tunables();
+
+	if (requested_pid == LNET_PID_ANY) {
+		/* Don't instantiate LNET just for me */
+		rc = -ENETDOWN;
+		goto failed0;
+	}
+
+	/* Each step below has a matching undo step under the failedN labels;
+	 * a failure jumps to the label that unwinds everything done so far. */
+	rc = lnet_prepare(requested_pid);
+	if (rc != 0)
+		goto failed0;
+
+	rc = lnet_startup_lndnis();
+	if (rc != 0)
+		goto failed1;
+
+	/* im_a_router is filled in by the route parser and then handed to the
+	 * router buffer pool allocation */
+	rc = lnet_parse_routes(lnet_get_routes(), &im_a_router);
+	if (rc != 0)
+		goto failed2;
+
+	rc = lnet_check_routes();
+	if (rc != 0)
+		goto failed2;
+
+	rc = lnet_rtrpools_alloc(im_a_router);
+	if (rc != 0)
+		goto failed2;
+
+	rc = lnet_acceptor_start();
+	if (rc != 0)
+		goto failed2;
+
+	the_lnet.ln_refcount = 1;
+	/* Now I may use my own API functions... */
+
+	/* NB router checker needs the_lnet.ln_ping_info in
+	 * lnet_router_checker -> lnet_update_ni_status_locked */
+	rc = lnet_ping_target_init();
+	if (rc != 0)
+		goto failed3;
+
+	rc = lnet_router_checker_start();
+	if (rc != 0)
+		goto failed4;
+
+	/* success: rc is 0 from the last step */
+	lnet_proc_init();
+	goto out;
+
+	/* error unwind, in reverse order of the start-up steps above */
+ failed4:
+	lnet_ping_target_fini();
+ failed3:
+	/* drop the reference taken before lnet_ping_target_init() */
+	the_lnet.ln_refcount = 0;
+	lnet_acceptor_stop();
+ failed2:
+	lnet_destroy_routes();
+	lnet_shutdown_lndnis();
+ failed1:
+	lnet_unprepare();
+ failed0:
+	/* every failure path must carry a negative error code */
+	LASSERT (rc < 0);
+ out:
+	LNET_MUTEX_UNLOCK(&the_lnet.ln_api_mutex);
+	return rc;
+}
+EXPORT_SYMBOL(LNetNIInit);
+
+/**
+ * Stop LNet interfaces, routing, and forwarding.
+ *
+ * Users must call this function once for each successful call to LNetNIInit().
+ * Once the LNetNIFini() operation has been started, the results of pending
+ * API operations are undefined.
+ *
+ * \return always 0 for current implementation.
+ */
+int
+LNetNIFini(void)
+{
+	/* serialise against LNetNIInit() and other LNetNIFini() callers */
+	LNET_MUTEX_LOCK(&the_lnet.ln_api_mutex);
+
+	LASSERT (the_lnet.ln_init);
+	LASSERT (the_lnet.ln_refcount > 0);
+
+	if (the_lnet.ln_refcount != 1) {
+		/* not the last user: just drop one reference */
+		the_lnet.ln_refcount--;
+	} else {
+		/* last user: tear everything down, mirroring the start-up
+		 * order used by LNetNIInit() in reverse */
+		LASSERT (!the_lnet.ln_niinit_self);
+
+		lnet_proc_fini();
+		lnet_router_checker_stop();
+		lnet_ping_target_fini();
+
+		/* Teardown fns that use my own API functions BEFORE here */
+		the_lnet.ln_refcount = 0;
+
+		lnet_acceptor_stop();
+		lnet_destroy_routes();
+		lnet_shutdown_lndnis();
+		lnet_unprepare();
+	}
+
+	LNET_MUTEX_UNLOCK(&the_lnet.ln_api_mutex);
+	return 0;
+}
+EXPORT_SYMBOL(LNetNIFini);
+
+/**
+ * This is an ugly hack to export IOC_LIBCFS_DEBUG_PEER and
+ * IOC_LIBCFS_PORTALS_COMPATIBILITY commands to users, by tweaking the LNet
+ * internal ioctl handler.
+ *
+ * IOC_LIBCFS_PORTALS_COMPATIBILITY is now deprecated, don't use it.
+ *
+ * \param cmd IOC_LIBCFS_DEBUG_PEER to print debugging data about a peer.
+ * The data will be printed to system console. Don't use it excessively.
+ * \param arg A pointer to lnet_process_id_t, process ID of the peer.
+ *
+ * \return Always return 0 when called by users directly (i.e., not via ioctl).
+ */
+int
+LNetCtl(unsigned int cmd, void *arg)
+{
+	/* NB \a arg is interpreted as a libcfs_ioctl_data for every command
+	 * except IOC_LIBCFS_DEBUG_PEER, which re-reads it as a
+	 * lnet_process_id_t */
+	struct libcfs_ioctl_data *data = arg;
+	lnet_process_id_t	 id = {0};
+	lnet_ni_t		*ni;
+	int		       rc;
+
+	LASSERT (the_lnet.ln_init);
+	LASSERT (the_lnet.ln_refcount > 0);
+
+	switch (cmd) {
+	case IOC_LIBCFS_GET_NI:
+		/* ioc_count is the interface index; the NID found (or the
+		 * zero-initialised one on failure) is passed back in ioc_nid */
+		rc = LNetGetId(data->ioc_count, &id);
+		data->ioc_nid = id.nid;
+		return rc;
+
+	case IOC_LIBCFS_FAIL_NID:
+		return lnet_fail_nid(data->ioc_nid, data->ioc_count);
+
+	case IOC_LIBCFS_ADD_ROUTE:
+		/* re-validate the routing table only if the add succeeded */
+		rc = lnet_add_route(data->ioc_net, data->ioc_count,
+				    data->ioc_nid);
+		return (rc != 0) ? rc : lnet_check_routes();
+
+	case IOC_LIBCFS_DEL_ROUTE:
+		return lnet_del_route(data->ioc_net, data->ioc_nid);
+
+	case IOC_LIBCFS_GET_ROUTE:
+		/* ioc_count is both the input index and an output field */
+		return lnet_get_route(data->ioc_count,
+				      &data->ioc_net, &data->ioc_count,
+				      &data->ioc_nid, &data->ioc_flags);
+	case IOC_LIBCFS_NOTIFY_ROUTER:
+		/* ioc_u64[0] is a time in seconds; its distance from "now" is
+		 * converted by cfs_time_seconds() and subtracted from the
+		 * current cfs_time_current() value */
+		return lnet_notify(NULL, data->ioc_nid, data->ioc_flags,
+				   cfs_time_current() -
+				   cfs_time_seconds(cfs_time_current_sec() -
+						    (time_t)data->ioc_u64[0]));
+
+	case IOC_LIBCFS_PORTALS_COMPATIBILITY:
+		/* This can be removed once lustre stops calling it */
+		return 0;
+
+	case IOC_LIBCFS_LNET_DIST:
+		/* -EHOSTUNREACH is not treated as a failure of the ioctl: it
+		 * is reported to the caller through ioc_u32[0] instead */
+		rc = LNetDist(data->ioc_nid, &data->ioc_nid, &data->ioc_u32[1]);
+		if (rc < 0 && rc != -EHOSTUNREACH)
+			return rc;
+
+		data->ioc_u32[0] = rc;
+		return 0;
+
+	case IOC_LIBCFS_TESTPROTOCOMPAT:
+		/* updated under the exclusive net lock */
+		lnet_net_lock(LNET_LOCK_EX);
+		the_lnet.ln_testprotocompat = data->ioc_flags;
+		lnet_net_unlock(LNET_LOCK_EX);
+		return 0;
+
+	case IOC_LIBCFS_PING:
+		/* ioc_pbuf1/ioc_plen1 describe the caller's result array; the
+		 * byte length is converted to a number of IDs.  A non-negative
+		 * lnet_ping() result is handed back in ioc_count. */
+		id.nid = data->ioc_nid;
+		id.pid = data->ioc_u32[0];
+		rc = lnet_ping(id, data->ioc_u32[1], /* timeout */
+			       (lnet_process_id_t *)data->ioc_pbuf1,
+			       data->ioc_plen1/sizeof(lnet_process_id_t));
+		if (rc < 0)
+			return rc;
+		data->ioc_count = rc;
+		return 0;
+
+	case IOC_LIBCFS_DEBUG_PEER: {
+		/* CAVEAT EMPTOR: this one designed for calling directly; not
+		 * via an ioctl */
+		id = *((lnet_process_id_t *) arg);
+
+		lnet_debug_peer(id.nid);
+
+		/* also give the LND owning the peer's network a chance to
+		 * dump its own state; its return value is ignored */
+		ni = lnet_net2ni(LNET_NIDNET(id.nid));
+		if (ni == NULL) {
+			CDEBUG(D_WARNING, "No NI for %s\n", libcfs_id2str(id));
+		} else {
+			if (ni->ni_lnd->lnd_ctl == NULL) {
+				CDEBUG(D_WARNING, "No ctl for %s\n",
+				       libcfs_id2str(id));
+			} else {
+				(void)ni->ni_lnd->lnd_ctl(ni, cmd, arg);
+			}
+
+			/* balance the lookup done by lnet_net2ni() */
+			lnet_ni_decref(ni);
+		}
+		return 0;
+	}
+
+	default:
+		/* anything else is forwarded to the LND of network ioc_net;
+		 * -EINVAL if there is no such NI or it has no ctl handler */
+		ni = lnet_net2ni(data->ioc_net);
+		if (ni == NULL)
+			return -EINVAL;
+
+		if (ni->ni_lnd->lnd_ctl == NULL)
+			rc = -EINVAL;
+		else
+			rc = ni->ni_lnd->lnd_ctl(ni, cmd, arg);
+
+		lnet_ni_decref(ni);
+		return rc;
+	}
+	/* not reached */
+}
+EXPORT_SYMBOL(LNetCtl);
+
+/**
+ * Retrieve the lnet_process_id_t ID of the LNet interface at \a index. Note
+ * that all interfaces share the same PID, as requested by LNetNIInit().
+ *
+ * \param index Index of the interface to look up.
+ * \param id On successful return, this location will hold the
+ * lnet_process_id_t ID of the interface.
+ *
+ * \retval 0 If an interface exists at \a index.
+ * \retval -ENOENT If no interface has been found.
+ */
+int
+LNetGetId(unsigned int index, lnet_process_id_t *id)
+{
+	struct lnet_ni	*ni;
+	unsigned int	 to_skip = index;
+	int		 found = -ENOENT;
+	int		 cpt;
+
+	LASSERT(the_lnet.ln_init);
+	LASSERT(the_lnet.ln_refcount > 0);
+
+	/* the NI list is walked under the net lock of the current CPT */
+	cpt = lnet_net_lock_current();
+
+	list_for_each_entry(ni, &the_lnet.ln_nis, ni_list) {
+		/* step over the first \a index interfaces */
+		if (to_skip != 0) {
+			to_skip--;
+			continue;
+		}
+
+		/* every interface reports the one LNet-wide PID */
+		id->nid = ni->ni_nid;
+		id->pid = the_lnet.ln_pid;
+		found = 0;
+		break;
+	}
+
+	lnet_net_unlock(cpt);
+	return found;
+}
+EXPORT_SYMBOL(LNetGetId);
+
+/**
+ * Print a string representation of handle \a h into buffer \a str of
+ * \a len bytes.
+ */
+void
+LNetSnprintHandle(char *str, int len, lnet_handle_any_t h)
+{
+	/* only the handle's cookie is printed; LPX64 is the format macro
+	 * used for it (presumably 64-bit hex -- confirm against its
+	 * definition).  snprintf() bounds the output to \a len bytes. */
+	snprintf(str, len, LPX64, h.cookie);
+}
+EXPORT_SYMBOL(LNetSnprintHandle);
+
+/*
+ * Build the ping info block stored in the_lnet.ln_ping_info: a header plus
+ * one status slot per local NI.  Every slot starts out as
+ * LNET_NI_STATUS_UP and is hooked into its NI through ni_status.
+ *
+ * Returns 0 on success, -ENOMEM if the block cannot be allocated.
+ */
+static int
+lnet_create_ping_info(void)
+{
+	lnet_ping_info_t *pinfo;
+	lnet_process_id_t id;
+	lnet_ni_t	 *ni;
+	unsigned int	  infosz;
+	int		  nnis = 0;
+	int		  idx;
+	int		  rc;
+
+	/* count the interfaces: LNetGetId() says -ENOENT once the index
+	 * runs off the end of the NI list */
+	while ((rc = LNetGetId(nnis, &id)) != -ENOENT) {
+		LASSERT (rc == 0);
+		nnis++;
+	}
+
+	/* header + nnis status slots */
+	infosz = offsetof(lnet_ping_info_t, pi_ni[nnis]);
+	LIBCFS_ALLOC(pinfo, infosz);
+	if (pinfo == NULL) {
+		CERROR("Can't allocate ping info[%d]\n", nnis);
+		return -ENOMEM;
+	}
+
+	pinfo->pi_magic    = LNET_PROTO_PING_MAGIC;
+	pinfo->pi_features = LNET_PING_FEAT_NI_STATUS;
+	pinfo->pi_pid      = the_lnet.ln_pid;
+	pinfo->pi_nnis     = nnis;
+
+	for (idx = 0; idx < nnis; idx++) {
+		lnet_ni_status_t *slot = &pinfo->pi_ni[idx];
+
+		rc = LNetGetId(idx, &id);
+		LASSERT (rc == 0);
+
+		slot->ns_nid    = id.nid;
+		slot->ns_status = LNET_NI_STATUS_UP;
+
+		/* look the NI up again and point it at its status slot */
+		lnet_net_lock(0);
+
+		ni = lnet_nid2ni_locked(id.nid, 0);
+		LASSERT(ni != NULL);
+
+		lnet_ni_lock(ni);
+		LASSERT(ni->ni_status == NULL);
+		ni->ni_status = slot;
+		lnet_ni_unlock(ni);
+
+		/* drop the reference taken by the lookup */
+		lnet_ni_decref_locked(ni, 0);
+		lnet_net_unlock(0);
+	}
+
+	the_lnet.ln_ping_info = pinfo;
+	return 0;
+}
+
+/*
+ * Undo lnet_create_ping_info(): detach every NI from its status slot, then
+ * free the ping info block and clear the_lnet.ln_ping_info.
+ */
+static void
+lnet_destroy_ping_info(void)
+{
+	struct list_head *pos;
+
+	lnet_net_lock(0);
+
+	list_for_each(pos, &the_lnet.ln_nis) {
+		struct lnet_ni *ni = list_entry(pos, struct lnet_ni, ni_list);
+
+		/* the slot lives inside the block that is about to be freed */
+		lnet_ni_lock(ni);
+		ni->ni_status = NULL;
+		lnet_ni_unlock(ni);
+	}
+
+	lnet_net_unlock(0);
+
+	/* size must match the allocation: header + pi_nnis status slots */
+	LIBCFS_FREE(the_lnet.ln_ping_info,
+		    offsetof(lnet_ping_info_t,
+			     pi_ni[the_lnet.ln_ping_info->pi_nnis]));
+	the_lnet.ln_ping_info = NULL;
+}
+
+/*
+ * Set up the local ping target: create the ping info block, allocate an
+ * event queue, attach a match entry on LNET_RESERVED_PORTAL for the ping
+ * match bits and attach an MD exposing the ping info to GET requests.
+ *
+ * Returns 0 on success; on failure everything created so far is released
+ * again and the error of the failing step is returned.
+ */
+int
+lnet_ping_target_init(void)
+{
+	lnet_md_t	 md = {0};
+	lnet_handle_me_t  meh;
+	lnet_process_id_t id;
+	int	       rc;
+	int	       rc2;
+	int	       infosz;
+
+	rc = lnet_create_ping_info();
+	if (rc != 0)
+		return rc;
+
+	/* We can have a tiny EQ since we only need to see the unlink event on
+	 * teardown, which by definition is the last one! */
+	rc = LNetEQAlloc(2, LNET_EQ_HANDLER_NONE, &the_lnet.ln_ping_target_eq);
+	if (rc != 0) {
+		CERROR("Can't allocate ping EQ: %d\n", rc);
+		goto failed_0;
+	}
+
+	/* the match entry accepts any NID and any PID */
+	memset(&id, 0, sizeof(lnet_process_id_t));
+	id.nid = LNET_NID_ANY;
+	id.pid = LNET_PID_ANY;
+
+	rc = LNetMEAttach(LNET_RESERVED_PORTAL, id,
+			  LNET_PROTO_PING_MATCHBITS, 0,
+			  LNET_UNLINK, LNET_INS_AFTER,
+			  &meh);
+	if (rc != 0) {
+		CERROR("Can't create ping ME: %d\n", rc);
+		goto failed_1;
+	}
+
+	/* initialize md content */
+	/* the MD covers exactly the ping info block: header + pi_nnis slots */
+	infosz = offsetof(lnet_ping_info_t,
+			  pi_ni[the_lnet.ln_ping_info->pi_nnis]);
+	md.start     = the_lnet.ln_ping_info;
+	md.length    = infosz;
+	md.threshold = LNET_MD_THRESH_INF;
+	md.max_size  = 0;
+	md.options   = LNET_MD_OP_GET | LNET_MD_TRUNCATE |
+		       LNET_MD_MANAGE_REMOTE;
+	md.user_ptr  = NULL;
+	md.eq_handle = the_lnet.ln_ping_target_eq;
+
+	rc = LNetMDAttach(meh, md,
+			  LNET_RETAIN,
+			  &the_lnet.ln_ping_target_md);
+	if (rc != 0) {
+		CERROR("Can't attach ping MD: %d\n", rc);
+		goto failed_2;
+	}
+
+	return 0;
+
+	/* error unwind in reverse order; rc2 keeps the cleanup results from
+	 * clobbering the error being returned in rc */
+ failed_2:
+	rc2 = LNetMEUnlink(meh);
+	LASSERT (rc2 == 0);
+ failed_1:
+	rc2 = LNetEQFree(the_lnet.ln_ping_target_eq);
+	LASSERT (rc2 == 0);
+ failed_0:
+	lnet_destroy_ping_info();
+	return rc;
+}
+
+/*
+ * Tear down the ping target: start unlinking the ping MD, wait (with all
+ * signals blocked) until its unlink event shows up on the ping EQ, then
+ * free the EQ and the ping info block.
+ */
+void
+lnet_ping_target_fini(void)
+{
+	lnet_event_t	event;
+	int		which;
+	int		rc;
+	int		timeout_ms = 1000;
+	sigset_t	blocked = cfs_block_allsigs();
+
+	/* NB md could be busy; this just starts the unlink */
+	LNetMDUnlink(the_lnet.ln_ping_target_md);
+
+	do {
+		rc = LNetEQPoll(&the_lnet.ln_ping_target_eq, 1,
+				timeout_ms, &event, &which);
+
+		/* I expect overflow... */
+		LASSERT (rc >= 0 || rc == -EOVERFLOW);
+
+		if (rc == 0) {
+			/* timed out: provide a diagnostic and wait twice as
+			 * long next time round */
+			CWARN("Still waiting for ping MD to unlink\n");
+			timeout_ms *= 2;
+		}
+
+		/* on a timeout 'event' was not filled in, so it is only
+		 * inspected when an event was actually delivered */
+	} while (rc == 0 || !event.unlinked);
+
+	rc = LNetEQFree(the_lnet.ln_ping_target_eq);
+	LASSERT (rc == 0);
+	lnet_destroy_ping_info();
+	cfs_restore_sigs(blocked);
+}
+
+/*
+ * Ping peer \a id: issue a GET for its ping info on LNET_RESERVED_PORTAL,
+ * wait up to \a timeout_ms for the reply and copy up to \a n_ids process
+ * IDs (one per remote NI) to \a ids with copy_to_user().
+ *
+ * Returns the number of NIs the peer reported (pi_nnis) on success, or a
+ * negative error: -EINVAL for bad arguments, -ENOMEM, -EIO when no reply
+ * arrived, -EPROTO when the reply cannot be parsed, -EFAULT when the copy
+ * to \a ids fails, or the error from EQ/MD set-up.
+ */
+int
+lnet_ping (lnet_process_id_t id, int timeout_ms, lnet_process_id_t *ids, int n_ids)
+{
+	lnet_handle_eq_t     eqh;
+	lnet_handle_md_t     mdh;
+	lnet_event_t	 event;
+	lnet_md_t	    md = {0};
+	int		  which;
+	int		  unlinked = 0;
+	int		  replied = 0;
+	const int	    a_long_time = 60000; /* mS */
+	/* NB computed from the still-unvalidated n_ids; it is only used after
+	 * the range check below has passed */
+	int		  infosz = offsetof(lnet_ping_info_t, pi_ni[n_ids]);
+	lnet_ping_info_t    *info;
+	lnet_process_id_t    tmpid;
+	int		  i;
+	int		  nob;
+	int		  rc;
+	int		  rc2;
+	sigset_t	 blocked;
+
+	if (n_ids <= 0 ||
+	    id.nid == LNET_NID_ANY ||
+	    timeout_ms > 500000 ||	      /* arbitrary limit! */
+	    n_ids > 20)			 /* arbitrary limit! */
+		return -EINVAL;
+
+	/* no PID given: fall back to LUSTRE_SRV_LNET_PID */
+	if (id.pid == LNET_PID_ANY)
+		id.pid = LUSTRE_SRV_LNET_PID;
+
+	/* reply buffer: ping info header + n_ids status slots */
+	LIBCFS_ALLOC(info, infosz);
+	if (info == NULL)
+		return -ENOMEM;
+
+	/* NB 2 events max (including any unlink event) */
+	rc = LNetEQAlloc(2, LNET_EQ_HANDLER_NONE, &eqh);
+	if (rc != 0) {
+		CERROR("Can't allocate EQ: %d\n", rc);
+		goto out_0;
+	}
+
+	/* initialize md content */
+	md.start     = info;
+	md.length    = infosz;
+	md.threshold = 2; /*GET/REPLY*/
+	md.max_size  = 0;
+	md.options   = LNET_MD_TRUNCATE;
+	md.user_ptr  = NULL;
+	md.eq_handle = eqh;
+
+	rc = LNetMDBind(md, LNET_UNLINK, &mdh);
+	if (rc != 0) {
+		CERROR("Can't bind MD: %d\n", rc);
+		goto out_1;
+	}
+
+	rc = LNetGet(LNET_NID_ANY, mdh, id,
+		     LNET_RESERVED_PORTAL,
+		     LNET_PROTO_PING_MATCHBITS, 0);
+
+	if (rc != 0) {
+		/* Don't CERROR; this could be deliberate! */
+
+		rc2 = LNetMDUnlink(mdh);
+		LASSERT (rc2 == 0);
+
+		/* NB must wait for the UNLINK event below... */
+		unlinked = 1;
+		timeout_ms = a_long_time;
+	}
+
+	/* Event loop: runs until the event that unlinks the MD has been
+	 * seen, so that 'info' is no longer in use when it gets freed. */
+	do {
+		/* MUST block for unlink to complete */
+		if (unlinked)
+			blocked = cfs_block_allsigs();
+
+		rc2 = LNetEQPoll(&eqh, 1, timeout_ms, &event, &which);
+
+		if (unlinked)
+			cfs_restore_sigs(blocked);
+
+		/* 'event' is only meaningful when rc2 > 0 */
+		CDEBUG(D_NET, "poll %d(%d %d)%s\n", rc2,
+		       (rc2 <= 0) ? -1 : event.type,
+		       (rc2 <= 0) ? -1 : event.status,
+		       (rc2 > 0 && event.unlinked) ? " unlinked" : "");
+
+		LASSERT (rc2 != -EOVERFLOW);     /* can't miss anything */
+
+		if (rc2 <= 0 || event.status != 0) {
+			/* timeout or error */
+			/* record the first failure only, and never overwrite
+			 * the length of a reply that already arrived */
+			if (!replied && rc == 0)
+				rc = (rc2 < 0) ? rc2 :
+				     (rc2 == 0) ? -ETIMEDOUT :
+				     event.status;
+
+			if (!unlinked) {
+				/* Ensure completion in finite time... */
+				LNetMDUnlink(mdh);
+				/* No assertion (racing with network) */
+				unlinked = 1;
+				timeout_ms = a_long_time;
+			} else if (rc2 == 0) {
+				/* timed out waiting for unlink */
+				CWARN("ping %s: late network completion\n",
+				      libcfs_id2str(id));
+			}
+		} else if (event.type == LNET_EVENT_REPLY) {
+			/* rc now carries the number of bytes received */
+			replied = 1;
+			rc = event.mlength;
+		}
+
+	} while (rc2 <= 0 || !event.unlinked);
+
+	if (!replied) {
+		/* NB whatever error was recorded above is replaced by -EIO */
+		if (rc >= 0)
+			CWARN("%s: Unexpected rc >= 0 but no reply!\n",
+			      libcfs_id2str(id));
+		rc = -EIO;
+		goto out_1;
+	}
+
+	nob = rc;
+	LASSERT (nob >= 0 && nob <= infosz);
+
+	rc = -EPROTO;			   /* if I can't parse... */
+
+	if (nob < 8) {
+		/* can't check magic/version */
+		CERROR("%s: ping info too short %d\n",
+		       libcfs_id2str(id), nob);
+		goto out_1;
+	}
+
+	/* a byte-swapped magic means the peer has the opposite endianness */
+	if (info->pi_magic == __swab32(LNET_PROTO_PING_MAGIC)) {
+		lnet_swap_pinginfo(info);
+	} else if (info->pi_magic != LNET_PROTO_PING_MAGIC) {
+		CERROR("%s: Unexpected magic %08x\n",
+		       libcfs_id2str(id), info->pi_magic);
+		goto out_1;
+	}
+
+	if ((info->pi_features & LNET_PING_FEAT_NI_STATUS) == 0) {
+		CERROR("%s: ping w/o NI status: 0x%x\n",
+		       libcfs_id2str(id), info->pi_features);
+		goto out_1;
+	}
+
+	/* the reply must at least contain the full header... */
+	if (nob < offsetof(lnet_ping_info_t, pi_ni[0])) {
+		CERROR("%s: Short reply %d(%d min)\n", libcfs_id2str(id),
+		       nob, (int)offsetof(lnet_ping_info_t, pi_ni[0]));
+		goto out_1;
+	}
+
+	/* ...and a status slot for every ID that will be copied out; never
+	 * copy more than the peer actually has */
+	if (info->pi_nnis < n_ids)
+		n_ids = info->pi_nnis;
+
+	if (nob < offsetof(lnet_ping_info_t, pi_ni[n_ids])) {
+		CERROR("%s: Short reply %d(%d expected)\n", libcfs_id2str(id),
+		       nob, (int)offsetof(lnet_ping_info_t, pi_ni[n_ids]));
+		goto out_1;
+	}
+
+	rc = -EFAULT;			   /* If I SEGV... */
+
+	/* every ID pairs the peer's single PID with one of its NIDs */
+	for (i = 0; i < n_ids; i++) {
+		tmpid.pid = info->pi_pid;
+		tmpid.nid = info->pi_ni[i].ns_nid;
+		if (copy_to_user(&ids[i], &tmpid, sizeof(tmpid)))
+			goto out_1;
+	}
+	/* NB the peer's total NI count, which may exceed what was copied */
+	rc = info->pi_nnis;
+
+ out_1:
+	rc2 = LNetEQFree(eqh);
+	if (rc2 != 0)
+		CERROR("rc2 %d\n", rc2);
+	LASSERT (rc2 == 0);
+
+ out_0:
+	LIBCFS_FREE(info, infosz);
+	return rc;
+}

diff --git a/drivers/staging/lustre/lnet/lnet/config.c b/drivers/staging/lustre/lnet/lnet/config.c
new file mode 100644
index 0000000..28711e6
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/config.c

@@ -0,0 +1,1264 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include <linux/lnet/lib-lnet.h>
+
+/* One chunk of configuration text being parsed.  The text is stored inline
+ * after the header, so a buffer is allocated as
+ * offsetof(lnet_text_buf_t, ltb_text[len + 1]) bytes. */
+typedef struct {			    /* tmp struct for parsing routes */
+	struct list_head	 ltb_list;	/* stash on lists */
+	int		ltb_size;	/* allocated size */
+	char	       ltb_text[0];     /* text buffer */
+} lnet_text_buf_t;
+
+/* Bytes currently held in text buffers: bumped by lnet_new_text_buf(),
+ * dropped by lnet_free_text_buf(). */
+static int lnet_tbnob = 0;			/* track text buf allocation */
+/* cap on the total of all live text buffers */
+#define LNET_MAX_TEXTBUF_NOB     (64<<10)	/* bound allocation */
+/* cap on a single text buffer (also used to size static scratch buffers) */
+#define LNET_SINGLE_TEXTBUF_NOB  (4<<10)
+
+/*
+ * Report a syntax error in module parameter \a name: print the offending
+ * string \a str, then a marker line in which dots pad out to \a offset and
+ * dashes between two '|' cover the bad span of \a width characters.
+ */
+void
+lnet_syntax(char *name, char *str, int offset, int width)
+{
+	static char dots[LNET_SINGLE_TEXTBUF_NOB];
+	static char dashes[LNET_SINGLE_TEXTBUF_NOB];
+	int	    name_len = (int)strlen(name);
+	int	    dash_len = 0;
+
+	/* the two '|' delimiters account for one column of the span */
+	if (width >= 1)
+		dash_len = width - 1;
+
+	/* (re)fill the filler strings; "%.*s" below picks the prefix of
+	 * each that is needed */
+	memset(dashes, '-', sizeof(dashes));
+	dashes[sizeof(dashes)-1] = 0;
+	memset(dots, '.', sizeof(dots));
+	dots[sizeof(dots)-1] = 0;
+
+	LCONSOLE_ERROR_MSG(0x10f, "Error parsing '%s=\"%s\"'\n", name, str);
+	LCONSOLE_ERROR_MSG(0x110, "here...........%.*s..%.*s|%.*s|\n",
+			   name_len, dots, offset, dots,
+			   dash_len, dashes);
+}
+
+/* Return 1 if \a c separates commands (newline, carriage return or ';'),
+ * 0 otherwise. */
+int
+lnet_issep (char c)
+{
+	if (c == '\n' || c == '\r' || c == ';')
+		return 1;
+
+	return 0;
+}
+
+/* Return 1 if no NI on \a nilist belongs to network \a net, 0 if one
+ * already does. */
+int
+lnet_net_unique(__u32 net, struct list_head *nilist)
+{
+	lnet_ni_t	*ni;
+
+	list_for_each_entry(ni, nilist, ni_list) {
+		/* only the network part of the NID is compared */
+		if (LNET_NIDNET(ni->ni_nid) == net)
+			return 0;
+	}
+
+	return 1;
+}
+
+/*
+ * Free \a ni together with whichever of its sub-allocations exist.  Safe
+ * on a partially constructed NI (lnet_ni_alloc() uses it for its failure
+ * path); it does not unlink \a ni from any list.
+ */
+void
+lnet_ni_free(struct lnet_ni *ni)
+{
+	/* per-CPT reference counters */
+	if (ni->ni_refs != NULL)
+		cfs_percpt_free(ni->ni_refs);
+
+	/* per-CPT TX queues */
+	if (ni->ni_tx_queues != NULL)
+		cfs_percpt_free(ni->ni_tx_queues);
+
+	/* explicit CPT list; NULL when the NI is not bound to specific CPTs */
+	if (ni->ni_cpts != NULL)
+		cfs_expr_list_values_free(ni->ni_cpts, ni->ni_ncpts);
+
+	LIBCFS_FREE(ni, sizeof(*ni));
+}
+
+/*
+ * Allocate an NI for network \a net and append it to \a nilist.
+ *
+ * \param net	 network the NI belongs to; must not already be on \a nilist
+ * \param el	 CPT expression list to bind the NI to, or NULL for all CPTs
+ * \param nilist list the new NI is added to
+ *
+ * \return the new NI, or NULL on a duplicate network, allocation failure
+ * or an unusable CPT list (nothing is added to \a nilist in that case).
+ */
+lnet_ni_t *
+lnet_ni_alloc(__u32 net, struct cfs_expr_list *el, struct list_head *nilist)
+{
+	struct lnet_tx_queue	*tq;
+	struct lnet_ni		*ni;
+	int			rc;
+	int			i;
+
+	if (!lnet_net_unique(net, nilist)) {
+		LCONSOLE_ERROR_MSG(0x111, "Duplicate network specified: %s\n",
+				   libcfs_net2str(net));
+		return NULL;
+	}
+
+	LIBCFS_ALLOC(ni, sizeof(*ni));
+	if (ni == NULL) {
+		CERROR("Out of memory creating network %s\n",
+		       libcfs_net2str(net));
+		return NULL;
+	}
+
+	spin_lock_init(&ni->ni_lock);
+	INIT_LIST_HEAD(&ni->ni_cptlist);
+	/* one reference counter and one TX queue per CPT */
+	ni->ni_refs = cfs_percpt_alloc(lnet_cpt_table(),
+				       sizeof(*ni->ni_refs[0]));
+	if (ni->ni_refs == NULL)
+		goto failed;
+
+	ni->ni_tx_queues = cfs_percpt_alloc(lnet_cpt_table(),
+					    sizeof(*ni->ni_tx_queues[0]));
+	if (ni->ni_tx_queues == NULL)
+		goto failed;
+
+	cfs_percpt_for_each(tq, i, ni->ni_tx_queues)
+		INIT_LIST_HEAD(&tq->tq_delayed);
+
+	if (el == NULL) {
+		/* no CPT list given: the NI covers every CPT, which is
+		 * represented by a NULL ni_cpts */
+		ni->ni_cpts  = NULL;
+		ni->ni_ncpts = LNET_CPT_NUMBER;
+	} else {
+		/* rc is the number of CPT values produced */
+		rc = cfs_expr_list_values(el, LNET_CPT_NUMBER, &ni->ni_cpts);
+		if (rc <= 0) {
+			CERROR("Failed to set CPTs for NI %s: %d\n",
+			       libcfs_net2str(net), rc);
+			goto failed;
+		}
+
+		LASSERT(rc <= LNET_CPT_NUMBER);
+		if (rc == LNET_CPT_NUMBER) {
+			/* the list names every CPT: same as "no list" */
+			LIBCFS_FREE(ni->ni_cpts, rc * sizeof(ni->ni_cpts[0]));
+			ni->ni_cpts = NULL;
+		}
+
+		ni->ni_ncpts = rc;
+	}
+
+	/* LND will fill in the address part of the NID */
+	ni->ni_nid = LNET_MKNID(net, 0);
+	ni->ni_last_alive = cfs_time_current_sec();
+	list_add_tail(&ni->ni_list, nilist);
+	return ni;
+ failed:
+	/* lnet_ni_free() copes with the members that were never allocated */
+	lnet_ni_free(ni);
+	return NULL;
+}
+
+/*
+ * Parse the "networks" specification \a networks, a comma separated list
+ * of entries of the form net, net(iface[,iface...]) or
+ * net(iface...)[cpt-list], and append one NI per network to \a nilist.
+ * The loopback NI is always added first.
+ *
+ * The string is parsed in a private, writable copy that is remembered in
+ * the_lnet.ln_network_tokens; the interface names stored in ni_interfaces
+ * point into that copy.
+ *
+ * \retval 0	   success; \a nilist is non-empty
+ * \retval -ENOMEM the token copy could not be allocated
+ * \retval -EINVAL string too long, syntax error, or any other failure
+ *		   (in which case \a nilist is emptied and the copy freed)
+ */
+int
+lnet_parse_networks(struct list_head *nilist, char *networks)
+{
+	struct cfs_expr_list *el = NULL;
+	int		tokensize = strlen(networks) + 1;
+	char		*tokens;
+	char		*str;
+	char		*tmp;
+	struct lnet_ni	*ni;
+	__u32		net;
+	/* NB counted below but never read in this function */
+	int		nnets = 0;
+
+	if (strlen(networks) > LNET_SINGLE_TEXTBUF_NOB) {
+		/* _WAY_ conservative */
+		LCONSOLE_ERROR_MSG(0x112, "Can't parse networks: string too "
+				   "long\n");
+		return -EINVAL;
+	}
+
+	LIBCFS_ALLOC(tokens, tokensize);
+	if (tokens == NULL) {
+		CERROR("Can't allocate net tokens\n");
+		return -ENOMEM;
+	}
+
+	/* parsing writes NULs and blanks into the copy, never into
+	 * 'networks' itself */
+	the_lnet.ln_network_tokens = tokens;
+	the_lnet.ln_network_tokens_nob = tokensize;
+	memcpy (tokens, networks, tokensize);
+	/* 'str' is the parse cursor; 'tmp' marks where a syntax error is */
+	str = tmp = tokens;
+
+	/* Add in the loopback network */
+	ni = lnet_ni_alloc(LNET_MKNET(LOLND, 0), NULL, nilist);
+	if (ni == NULL)
+		goto failed;
+
+	/* one iteration per network entry */
+	while (str != NULL && *str != 0) {
+		char	*comma = strchr(str, ',');
+		char	*bracket = strchr(str, '(');
+		char	*square = strchr(str, '[');
+		char	*iface;
+		int	niface;
+		int	rc;
+
+		/* NB we don't check interface conflicts here; it's the LNDs
+		 * responsibility (if it cares at all) */
+
+		if (square != NULL && (comma == NULL || square < comma)) {
+			/* i.e: o2ib0(ib0)[1,2], number between square
+			 * brackets are CPTs this NI needs to be bound to */
+			if (bracket != NULL && bracket > square) {
+				tmp = square;
+				goto failed_syntax;
+			}
+
+			tmp = strchr(square, ']');
+			if (tmp == NULL) {
+				tmp = square;
+				goto failed_syntax;
+			}
+
+			rc = cfs_expr_list_parse(square, tmp - square + 1,
+						 0, LNET_CPT_NUMBER - 1, &el);
+			if (rc != 0) {
+				tmp = square;
+				goto failed_syntax;
+			}
+
+			/* blank out "[...]" so the rest of the parser only
+			 * sees whitespace there */
+			/* NOTE(review): 'comma' was located before this
+			 * blanking; when the first ',' of the entry sat
+			 * inside the brackets it still points into the
+			 * blanked span -- confirm that an entry with a CPT
+			 * list but no interface list parses as intended */
+			while (square <= tmp)
+				*square++ = ' ';
+		}
+
+		if (bracket == NULL ||
+		    (comma != NULL && comma < bracket)) {
+
+			/* no interface list specified */
+
+			if (comma != NULL)
+				*comma++ = 0;
+			net = libcfs_str2net(cfs_trimwhite(str));
+
+			if (net == LNET_NIDNET(LNET_NID_ANY)) {
+				LCONSOLE_ERROR_MSG(0x113, "Unrecognised network"
+						   " type\n");
+				tmp = str;
+				goto failed_syntax;
+			}
+
+			if (LNET_NETTYP(net) != LOLND && /* LO is implicit */
+			    lnet_ni_alloc(net, el, nilist) == NULL)
+				goto failed;
+
+			/* the CPT list only applies to this entry */
+			if (el != NULL) {
+				cfs_expr_list_free(el);
+				el = NULL;
+			}
+
+			/* NULL when this was the last entry: ends the loop */
+			str = comma;
+			continue;
+		}
+
+		/* entry has an interface list: terminate the net name at '(' */
+		*bracket = 0;
+		net = libcfs_str2net(cfs_trimwhite(str));
+		if (net == LNET_NIDNET(LNET_NID_ANY)) {
+			tmp = str;
+			goto failed_syntax;
+		}
+
+		nnets++;
+		ni = lnet_ni_alloc(net, el, nilist);
+		if (ni == NULL)
+			goto failed;
+
+		if (el != NULL) {
+			cfs_expr_list_free(el);
+			el = NULL;
+		}
+
+		niface = 0;
+		iface = bracket + 1;
+
+		bracket = strchr(iface, ')');
+		if (bracket == NULL) {
+			tmp = iface;
+			goto failed_syntax;
+		}
+
+		/* split "a,b,c" between the parentheses into interface names */
+		*bracket = 0;
+		do {
+			comma = strchr(iface, ',');
+			if (comma != NULL)
+				*comma++ = 0;
+
+			iface = cfs_trimwhite(iface);
+			if (*iface == 0) {
+				tmp = iface;
+				goto failed_syntax;
+			}
+
+			if (niface == LNET_MAX_INTERFACES) {
+				LCONSOLE_ERROR_MSG(0x115, "Too many interfaces "
+						   "for net %s\n",
+						   libcfs_net2str(net));
+				goto failed;
+			}
+
+			/* points into 'tokens', which outlives this function
+			 * on success */
+			ni->ni_interfaces[niface++] = iface;
+			iface = comma;
+		} while (iface != NULL);
+
+		/* only whitespace may follow ')' up to the next ',' or the
+		 * end of the string */
+		str = bracket + 1;
+		comma = strchr(bracket + 1, ',');
+		if (comma != NULL) {
+			*comma = 0;
+			str = cfs_trimwhite(str);
+			if (*str != 0) {
+				tmp = str;
+				goto failed_syntax;
+			}
+			str = comma + 1;
+			continue;
+		}
+
+		/* last entry: 'str' ends up at the terminating NUL, which
+		 * ends the loop */
+		str = cfs_trimwhite(str);
+		if (*str != 0) {
+			tmp = str;
+			goto failed_syntax;
+		}
+	}
+
+	LASSERT(!list_empty(nilist));
+	return 0;
+
+ failed_syntax:
+	/* show the original string; the offset comes from the copy */
+	lnet_syntax("networks", networks, (int)(tmp - tokens), strlen(tmp));
+ failed:
+	/* discard every NI created so far, including loopback */
+	while (!list_empty(nilist)) {
+		ni = list_entry(nilist->next, lnet_ni_t, ni_list);
+
+		list_del(&ni->ni_list);
+		lnet_ni_free(ni);
+	}
+
+	if (el != NULL)
+		cfs_expr_list_free(el);
+
+	LIBCFS_FREE(tokens, tokensize);
+	the_lnet.ln_network_tokens = NULL;
+
+	return -EINVAL;
+}
+
+/*
+ * Allocate a text buffer able to hold a string of \a str_len characters
+ * plus its terminating NUL, initialised to the empty string, and charge
+ * it to the global lnet_tbnob accounting.
+ *
+ * Returns NULL if the buffer would exceed LNET_SINGLE_TEXTBUF_NOB, if the
+ * running total would exceed LNET_MAX_TEXTBUF_NOB, or on allocation
+ * failure.
+ */
+lnet_text_buf_t *
+lnet_new_text_buf (int str_len)
+{
+	lnet_text_buf_t *ltb;
+	/* NB allocate space for the terminating 0 */
+	int		 nob = offsetof(lnet_text_buf_t, ltb_text[str_len + 1]);
+
+	if (nob > LNET_SINGLE_TEXTBUF_NOB) {
+		/* _way_ conservative for "route net gateway..." */
+		CERROR("text buffer too big\n");
+		return NULL;
+	}
+
+	if (lnet_tbnob + nob > LNET_MAX_TEXTBUF_NOB) {
+		CERROR("Too many text buffers\n");
+		return NULL;
+	}
+
+	LIBCFS_ALLOC(ltb, nob);
+	if (ltb != NULL) {
+		ltb->ltb_text[0] = 0;
+		/* remembered so the free path can uncharge the same amount */
+		ltb->ltb_size = nob;
+		lnet_tbnob += nob;
+	}
+
+	return ltb;
+}
+
+/* Free one text buffer and take its size off the lnet_tbnob accounting.
+ * The caller must already have unlinked it from any list. */
+void
+lnet_free_text_buf (lnet_text_buf_t *ltb)
+{
+	/* ltb_size is the byte count recorded at allocation time */
+	lnet_tbnob -= ltb->ltb_size;
+	LIBCFS_FREE(ltb, ltb->ltb_size);
+}
+
+/* Unlink and free every text buffer on \a tbs, leaving the list empty. */
+void
+lnet_free_text_bufs(struct list_head *tbs)
+{
+	lnet_text_buf_t  *ltb;
+	lnet_text_buf_t  *next;
+
+	/* the _safe iterator is needed because each entry is freed while
+	 * the list is being walked */
+	list_for_each_entry_safe(ltb, next, tbs, ltb_list) {
+		list_del(&ltb->ltb_list);
+		lnet_free_text_buf(ltb);
+	}
+}
+
+/* Debug helper: log the text of every buffer on \a tbs, followed by the
+ * current global text buffer byte count. */
+void
+lnet_print_text_bufs(struct list_head *tbs)
+{
+	lnet_text_buf_t   *ltb;
+
+	list_for_each_entry(ltb, tbs, ltb_list) {
+		CDEBUG(D_WARNING, "%s\n", ltb->ltb_text);
+	}
+
+	CDEBUG(D_WARNING, "%d allocated\n", lnet_tbnob);
+}
+
+/*
+ * Split \a str into separate commands at the separator characters
+ * recognised by lnet_issep(), dropping '#' comments (which run up to the
+ * next separator) and empty commands.  Each command is copied into its own
+ * text buffer with every whitespace character turned into a plain space.
+ *
+ * On success the new buffers are spliced in just ahead of \a tbs and 0 is
+ * returned; if a buffer cannot be allocated nothing is added and -1 is
+ * returned.
+ */
+int
+lnet_str2tbs_sep (struct list_head *tbs, char *str)
+{
+	struct list_head	pending;
+	lnet_text_buf_t  *ltb;
+	char	     *end;
+	int	       len;
+	int	       i;
+
+	INIT_LIST_HEAD(&pending);
+
+	for (;;) {
+		/* skip leading whitespace */
+		while (cfs_iswhite(*str))
+			str++;
+
+		/* the command ends at a separator, a comment or the NUL */
+		end = str;
+		while (*end != 0 && !lnet_issep(*end) && *end != '#')
+			end++;
+
+		len = (int)(end - str);
+		if (len > 0) {
+			ltb = lnet_new_text_buf(len);
+			if (ltb == NULL) {
+				lnet_free_text_bufs(&pending);
+				return -1;
+			}
+
+			/* copy, normalising whitespace to ' ' */
+			for (i = 0; i < len; i++)
+				ltb->ltb_text[i] = cfs_iswhite(str[i]) ?
+						   ' ' : str[i];
+
+			ltb->ltb_text[len] = 0;
+
+			list_add_tail(&ltb->ltb_list, &pending);
+		}
+
+		if (*end == '#') {
+			/* discard the comment up to the next separator */
+			for (end++; *end != 0 && !lnet_issep(*end); end++)
+				;
+		}
+
+		if (*end == 0)
+			break;
+
+		/* resume just past the separator */
+		str = end + 1;
+	}
+
+	list_splice(&pending, tbs->prev);
+	return 0;
+}
+
+/*
+ * Produce one expansion of a bracketed list: a new text buffer holding
+ * the part of \a str before '[' (\a sep1), then the \a itemlen bytes at
+ * \a item, then everything after ']' (\a sep2).  The buffer is appended to
+ * \a list.
+ *
+ * Returns 0 on success, -ENOMEM if no text buffer could be obtained.
+ */
+int
+lnet_expand1tb (struct list_head *list,
+	       char *str, char *sep1, char *sep2,
+	       char *item, int itemlen)
+{
+	char		*tail = sep2 + 1;
+	int		 head_len = (int)(sep1 - str);
+	int		 tail_len = strlen(tail);
+	lnet_text_buf_t *ltb;
+	char		*dst;
+
+	LASSERT (*sep1 == '[');
+	LASSERT (*sep2 == ']');
+
+	ltb = lnet_new_text_buf(head_len + itemlen + tail_len);
+	if (ltb == NULL)
+		return -ENOMEM;
+
+	/* head + item + tail, written back to back */
+	dst = ltb->ltb_text;
+	memcpy(dst, str, head_len);
+	dst += head_len;
+	memcpy(dst, item, itemlen);
+	dst += itemlen;
+	memcpy(dst, tail, tail_len);
+	dst += tail_len;
+	*dst = 0;
+
+	list_add_tail(&ltb->ltb_list, list);
+	return 0;
+}
+
+/*
+ * Expand the first "[...]" list in \a str.  The list is comma separated;
+ * each item is either a plain string, a numeric range "lo-hi", or a
+ * strided range "lo-hi/stride".  One text buffer is created per resulting
+ * value, with the bracketed list replaced by that value.
+ *
+ * \retval 0  \a str contains no '[': nothing to expand, nothing added
+ * \retval 1  the expansions were spliced in just ahead of \a tbs
+ * \retval -1 syntax error, bad range or allocation failure; nothing added
+ */
+int
+lnet_str2tbs_expand (struct list_head *tbs, char *str)
+{
+	char	      num[16];
+	struct list_head	pending;
+	char	     *sep;
+	char	     *sep2;
+	char	     *parsed;
+	char	     *enditem;
+	int	       lo;
+	int	       hi;
+	int	       stride;
+	int	       i;
+	int	       nob;
+	int	       scanned;
+
+	INIT_LIST_HEAD(&pending);
+
+	sep = strchr(str, '[');
+	if (sep == NULL)			/* nothing to expand */
+		return 0;
+
+	sep2 = strchr(sep, ']');
+	if (sep2 == NULL)
+		goto failed;
+
+	/* 'parsed' enters each iteration on the '[' or ',' that precedes the
+	 * next item */
+	for (parsed = sep; parsed < sep2; parsed = enditem) {
+
+		enditem = ++parsed;
+		while (enditem < sep2 && *enditem != ',')
+			enditem++;
+
+		if (enditem == parsed)		/* no empty items */
+			goto failed;
+
+		/* try "lo-hi/stride", then "lo-hi", else treat the item as
+		 * a literal string */
+		if (sscanf(parsed, "%d-%d/%d%n", &lo, &hi, &stride, &scanned) < 3) {
+
+			if (sscanf(parsed, "%d-%d%n", &lo, &hi, &scanned) < 2) {
+
+				/* simple string enumeration */
+				if (lnet_expand1tb(&pending, str, sep, sep2,
+						   parsed, (int)(enditem - parsed)) != 0)
+					goto failed;
+
+				continue;
+			}
+
+			stride = 1;
+		}
+
+		/* range expansion */
+
+		if (enditem != parsed + scanned) /* no trailing junk */
+			goto failed;
+
+		/* A zero stride must be rejected as well as a negative one:
+		 * it would divide by zero in the modulo below and could never
+		 * advance the expansion loop. */
+		if (hi < 0 || lo < 0 || stride <= 0 || hi < lo ||
+		    (hi - lo) % stride != 0)
+			goto failed;
+
+		for (i = lo; i <= hi; i += stride) {
+
+			snprintf(num, sizeof(num), "%d", i);
+			nob = strlen(num);
+			/* a completely full buffer may mean truncation */
+			if (nob + 1 == sizeof(num))
+				goto failed;
+
+			if (lnet_expand1tb(&pending, str, sep, sep2,
+					   num, nob) != 0)
+				goto failed;
+		}
+	}
+
+	list_splice(&pending, tbs->prev);
+	return 1;
+
+ failed:
+	lnet_free_text_bufs(&pending);
+	return -1;
+}
+
+/*
+ * Parse \a str as a hop count.  Returns non-zero and stores the value in
+ * \a hops only if the whole string is an unsigned number in the range
+ * 1..255; returns 0 otherwise (\a hops may still have been written).
+ */
+int
+lnet_parse_hops (char *str, unsigned int *hops)
+{
+	int     len = strlen(str);
+	int     consumed = len;
+
+	/* need one converted number... */
+	if (sscanf(str, "%u%n", hops, &consumed) < 1)
+		return 0;
+
+	/* ...that used up the entire token... */
+	if (consumed != len)
+		return 0;
+
+	/* ...and lies within the accepted range */
+	return (*hops > 0 && *hops < 256);
+}
+
+
+/*
+ * Parse one route command of the form
+ *	<net> [<hops>] <gateway> [<gateway>...]
+ * where the net and gateway tokens may contain "[...]" lists that are
+ * expanded first, and add a route to every resulting net via every
+ * resulting gateway.  A gateway that is one of our own NIDs is not added
+ * as a route; it sets *\a im_a_router instead.
+ *
+ * NB \a str is modified: tokens are NUL-terminated in place.
+ *
+ * Returns 0 on success, -1 on a syntax error, allocation failure or when
+ * a route cannot be added.
+ */
+int
+lnet_parse_route (char *str, int *im_a_router)
+{
+	/* static scratch buffer OK (single threaded) */
+	static char       cmd[LNET_SINGLE_TEXTBUF_NOB];
+
+	struct list_head	nets;
+	struct list_head	gateways;
+	struct list_head       *tmp1;
+	struct list_head       *tmp2;
+	__u32	     net;
+	lnet_nid_t	nid;
+	lnet_text_buf_t  *ltb;
+	int	       rc;
+	char	     *sep;
+	char	     *token = str;
+	int	       ntokens = 0;
+	int	       myrc = -1;
+	unsigned int      hops;
+	int	       got_hops = 0;
+
+	INIT_LIST_HEAD(&gateways);
+	INIT_LIST_HEAD(&nets);
+
+	/* save a copy of the string for error messages */
+	strncpy(cmd, str, sizeof(cmd) - 1);
+	cmd[sizeof(cmd) - 1] = 0;
+
+	/* tokenise on whitespace; 'sep' is the scan cursor and 'token' the
+	 * start of the token currently being handled */
+	sep = str;
+	for (;;) {
+		/* scan for token start */
+		while (cfs_iswhite(*sep))
+			sep++;
+		if (*sep == 0) {
+			/* end of command: need a net and a gateway, plus the
+			 * hop count token if one was given */
+			if (ntokens < (got_hops ? 3 : 2))
+				goto token_error;
+			break;
+		}
+
+		ntokens++;
+		token = sep++;
+
+		/* scan for token end */
+		while (*sep != 0 && !cfs_iswhite(*sep))
+			sep++;
+		if (*sep != 0)
+			*sep++ = 0;
+
+		/* tmp2 is the head of the list this token expands into */
+		if (ntokens == 1) {
+			tmp2 = &nets;		/* expanding nets */
+		} else if (ntokens == 2 &&
+			   lnet_parse_hops(token, &hops)) {
+			got_hops = 1;	   /* got a hop count */
+			continue;
+		} else {
+			tmp2 = &gateways;	/* expanding gateways */
+		}
+
+		ltb = lnet_new_text_buf(strlen(token));
+		if (ltb == NULL)
+			goto out;
+
+		strcpy(ltb->ltb_text, token);
+		tmp1 = &ltb->ltb_list;
+		list_add_tail(tmp1, tmp2);
+
+		/* Walk from the buffer just added to the end of the list.
+		 * A buffer containing "[...]" is replaced by its expansions,
+		 * which are inserted right after it and are therefore visited
+		 * (and expanded further if necessary) by later iterations. */
+		while (tmp1 != tmp2) {
+			ltb = list_entry(tmp1, lnet_text_buf_t, ltb_list);
+
+			rc = lnet_str2tbs_expand(tmp1->next, ltb->ltb_text);
+			if (rc < 0)
+				goto token_error;
+
+			/* step on before 'ltb' is possibly freed */
+			tmp1 = tmp1->next;
+
+			if (rc > 0) {		/* expanded! */
+				list_del(&ltb->ltb_list);
+				lnet_free_text_buf(ltb);
+				continue;
+			}
+
+			/* fully expanded: validate now so the loops below can
+			 * simply assert; loopback is never routable */
+			if (ntokens == 1) {
+				net = libcfs_str2net(ltb->ltb_text);
+				if (net == LNET_NIDNET(LNET_NID_ANY) ||
+				    LNET_NETTYP(net) == LOLND)
+					goto token_error;
+			} else {
+				nid = libcfs_str2nid(ltb->ltb_text);
+				if (nid == LNET_NID_ANY ||
+				    LNET_NETTYP(LNET_NIDNET(nid)) == LOLND)
+					goto token_error;
+			}
+		}
+	}
+
+	/* default hop count when none was given */
+	if (!got_hops)
+		hops = 1;
+
+	LASSERT (!list_empty(&nets));
+	LASSERT (!list_empty(&gateways));
+
+	/* add a route for every (net, gateway) combination */
+	list_for_each (tmp1, &nets) {
+		ltb = list_entry(tmp1, lnet_text_buf_t, ltb_list);
+		net = libcfs_str2net(ltb->ltb_text);
+		LASSERT (net != LNET_NIDNET(LNET_NID_ANY));
+
+		list_for_each (tmp2, &gateways) {
+			ltb = list_entry(tmp2, lnet_text_buf_t, ltb_list);
+			nid = libcfs_str2nid(ltb->ltb_text);
+			LASSERT (nid != LNET_NID_ANY);
+
+			/* the gateway is this node itself */
+			if (lnet_islocalnid(nid)) {
+				*im_a_router = 1;
+				continue;
+			}
+
+			rc = lnet_add_route (net, hops, nid);
+			if (rc != 0) {
+				CERROR("Can't create route "
+				       "to %s via %s\n",
+				       libcfs_net2str(net),
+				       libcfs_nid2str(nid));
+				goto out;
+			}
+		}
+	}
+
+	myrc = 0;
+	goto out;
+
+ token_error:
+	/* 'str' has been chopped up by now, so report against the saved
+	 * copy; the offset of the bad token is the same in both */
+	lnet_syntax("routes", cmd, (int)(token - str), strlen(token));
+ out:
+	lnet_free_text_bufs(&nets);
+	lnet_free_text_bufs(&gateways);
+	return myrc;
+}
+
+/*
+ * Parse every route entry on \a tbs, releasing each text buffer once it
+ * has been parsed.  On the first entry that fails to parse, all remaining
+ * buffers (including the failing one) are freed and -EINVAL is returned.
+ */
+int
+lnet_parse_route_tbs(struct list_head *tbs, int *im_a_router)
+{
+	for (;;) {
+		lnet_text_buf_t *entry;
+
+		if (list_empty(tbs))
+			return 0;
+
+		/* always work on the head of the list */
+		entry = list_entry(tbs->next, lnet_text_buf_t, ltb_list);
+		if (lnet_parse_route(entry->ltb_text, im_a_router) < 0)
+			break;
+
+		list_del(&entry->ltb_list);
+		lnet_free_text_buf(entry);
+	}
+
+	/* parse failure: drop whatever is still queued */
+	lnet_free_text_bufs(tbs);
+	return -EINVAL;
+}
+
+/*
+ * Parse the \a routes string.  \a *im_a_router is cleared first and is then
+ * passed on to the per-entry parser.
+ *
+ * Returns 0 on success, -EINVAL if the string can't be split into entries
+ * or an entry fails to parse.
+ */
+int
+lnet_parse_routes (char *routes, int *im_a_router)
+{
+	struct list_head	entries;
+	int			result;
+
+	INIT_LIST_HEAD(&entries);
+	*im_a_router = 0;
+
+	if (lnet_str2tbs_sep(&entries, routes) >= 0) {
+		result = lnet_parse_route_tbs(&entries, im_a_router);
+	} else {
+		CERROR("Error parsing routes\n");
+		result = -EINVAL;
+	}
+
+	/* NOTE(review): lnet_tbnob presumably counts outstanding text
+	 * buffer bytes, i.e. nothing may be left allocated -- confirm */
+	LASSERT (lnet_tbnob == 0);
+	return result;
+}
+
+/*
+ * Parse \a token (length \a len) as an IP address expression and test the
+ * \a nip addresses in \a ipaddrs against it.
+ *
+ * Returns the non-zero result of cfs_ip_addr_parse() if the token can't be
+ * parsed; otherwise the result of the first cfs_ip_addr_match() call that
+ * returned non-zero, or 0 if none did.
+ */
+int
+lnet_match_network_token(char *token, int len, __u32 *ipaddrs, int nip)
+{
+	LIST_HEAD(exprs);
+	int	matched = 0;
+	int	idx;
+	int	rc;
+
+	rc = cfs_ip_addr_parse(token, len, &exprs);
+	if (rc != 0)
+		return rc;
+
+	/* stop at the first address that matches */
+	for (idx = 0; idx < nip; idx++) {
+		matched = cfs_ip_addr_match(ipaddrs[idx], &exprs);
+		if (matched)
+			break;
+	}
+
+	cfs_ip_addr_free(&exprs);
+	return matched;
+}
+
+/*
+ * Match one ip2nets entry against the local IP addresses.
+ *
+ * The entry is a whitespace-separated list: the first word is the network
+ * spec, every following word is an IP pattern handed to
+ * lnet_match_network_token().
+ *
+ * Returns 1 if at least one pattern matched, in which case \a net_entry is
+ * overwritten with just the network spec; 0 if nothing matched; a negative
+ * value if a pattern failed to parse (reported via lnet_syntax()).
+ */
+int
+lnet_match_network_tokens(char *net_entry, __u32 *ipaddrs, int nip)
+{
+	static char tokens[LNET_SINGLE_TEXTBUF_NOB];
+
+	char *netspec = NULL;
+	char *cursor;
+	int   nwords = 0;
+	int   any_match = 0;
+
+	LASSERT (strlen(net_entry) < sizeof(tokens));
+
+	/* tokenise a private copy; net_entry is only rewritten on a match */
+	strcpy(tokens, net_entry);
+
+	cursor = tokens;
+	for (;;) {
+		char *word;
+		int   wordlen;
+		int   rc;
+
+		/* skip leading whitespace */
+		while (cfs_iswhite(*cursor))
+			cursor++;
+		if (*cursor == 0)
+			break;
+
+		/* find the end of the word and terminate it in place */
+		word = cursor;
+		cursor++;
+		while (*cursor != 0 && !cfs_iswhite(*cursor))
+			cursor++;
+		if (*cursor != 0) {
+			*cursor = 0;
+			cursor++;
+		}
+
+		nwords++;
+		if (nwords == 1) {
+			/* first word names the network(s) */
+			netspec = word;
+			continue;
+		}
+
+		wordlen = strlen(word);
+		rc = lnet_match_network_token(word, wordlen, ipaddrs, nip);
+		if (rc < 0) {
+			lnet_syntax("ip2nets", net_entry,
+				    (int)(word - tokens), wordlen);
+			return rc;
+		}
+
+		if (rc != 0)
+			any_match = 1;
+	}
+
+	if (!any_match)
+		return 0;
+
+	strcpy(net_entry, netspec);	/* replace with matched net */
+	return 1;
+}
+
+/*
+ * Convert a netspec of the form "net" or "net(...)" to a network number
+ * by passing only the part before any '(' to libcfs_str2net().
+ * \a netspec is modified temporarily but is restored before returning.
+ */
+__u32
+lnet_netspec2net(char *netspec)
+{
+	char	*paren;
+	__u32	 net;
+
+	paren = strchr(netspec, '(');
+	if (paren == NULL)
+		return libcfs_str2net(netspec);
+
+	/* cut the string at the bracket, convert, then put it back */
+	*paren = 0;
+	net = libcfs_str2net(netspec);
+	*paren = '(';
+
+	return net;
+}
+
+/*
+ * Split the comma-separated netspecs held in the single text buffer on
+ * \a nets into one text buffer per netspec; new buffers are appended to
+ * the tail of \a nets.
+ *
+ * A netspec may carry a bracketed list ("net(...)"); a comma inside the
+ * brackets does not separate netspecs.  Every netspec must convert with
+ * lnet_netspec2net() and must not name the same network as another entry
+ * on \a nets.
+ *
+ * \a source is only used for reporting syntax errors via lnet_syntax().
+ *
+ * Returns 0 on success, -EINVAL on a syntax error or duplicate network,
+ * -ENOMEM if a text buffer can't be allocated.  Buffers already on
+ * \a nets are left there on error.
+ */
+int
+lnet_splitnets(char *source, struct list_head *nets)
+{
+	int	       offset = 0;
+	int	       offset2;
+	int	       len;
+	lnet_text_buf_t  *tb;
+	lnet_text_buf_t  *tb2;
+	struct list_head       *t;
+	char	     *sep;
+	char	     *bracket;
+	__u32	     net;
+
+	LASSERT (!list_empty(nets));
+	LASSERT (nets->next == nets->prev);     /* single entry */
+
+	tb = list_entry(nets->next, lnet_text_buf_t, ltb_list);
+
+	/* each pass cuts the leading netspec off tb and moves the remainder
+	 * into a new buffer, which becomes tb for the next pass */
+	for (;;) {
+		sep = strchr(tb->ltb_text, ',');
+		bracket = strchr(tb->ltb_text, '(');
+
+		if (sep != NULL &&
+		    bracket != NULL &&
+		    bracket < sep) {
+			/* netspec lists interfaces... */
+
+			/* position/length of the bracketed part, for the
+			 * error message only */
+			offset2 = offset + (int)(bracket - tb->ltb_text);
+			len = strlen(bracket);
+
+			bracket = strchr(bracket + 1, ')');
+
+			/* the closing bracket must end the netspec */
+			if (bracket == NULL ||
+			    !(bracket[1] == ',' || bracket[1] == 0)) {
+				lnet_syntax("ip2nets", source, offset2, len);
+				return -EINVAL;
+			}
+
+			/* the real separator (if any) follows the ')' */
+			sep = (bracket[1] == 0) ? NULL : bracket + 1;
+		}
+
+		/* terminate this netspec; sep now points at the remainder */
+		if (sep != NULL)
+			*sep++ = 0;
+
+		net = lnet_netspec2net(tb->ltb_text);
+		if (net == LNET_NIDNET(LNET_NID_ANY)) {
+			lnet_syntax("ip2nets", source, offset,
+				    strlen(tb->ltb_text));
+			return -EINVAL;
+		}
+
+		/* compare against every other entry already split off */
+		list_for_each(t, nets) {
+			tb2 = list_entry(t, lnet_text_buf_t, ltb_list);
+
+			if (tb2 == tb)
+				continue;
+
+			if (net == lnet_netspec2net(tb2->ltb_text)) {
+				/* duplicate network */
+				lnet_syntax("ip2nets", source, offset,
+					    strlen(tb->ltb_text));
+				return -EINVAL;
+			}
+		}
+
+		/* no separator: that was the last netspec */
+		if (sep == NULL)
+			return 0;
+
+		/* keep offset relative to the start of the original text */
+		offset += (int)(sep - tb->ltb_text);
+		tb2 = lnet_new_text_buf(strlen(sep));
+		if (tb2 == NULL)
+			return -ENOMEM;
+
+		strcpy(tb2->ltb_text, sep);
+		list_add_tail(&tb2->ltb_list, nets);
+
+		tb = tb2;
+	}
+}
+
+/*
+ * Work out which networks in \a ip2nets apply to this node.
+ *
+ * Each ip2nets entry is matched against the \a nip local addresses in
+ * \a ipaddrs.  The networks of matching entries are appended, comma
+ * separated, to a function-static buffer; an entry is skipped entirely
+ * if any of its networks was already contributed by an earlier entry.
+ *
+ * On success \a *networksp points at that static buffer, so the result
+ * is overwritten by the next call.
+ *
+ * Returns the number of entries that contributed networks (possibly 0),
+ * or a negative value on error (-EINVAL if ip2nets can't be split,
+ * -E2BIG if the result doesn't fit, or an error from matching/splitting).
+ */
+int
+lnet_match_networks (char **networksp, char *ip2nets, __u32 *ipaddrs, int nip)
+{
+	static char	networks[LNET_SINGLE_TEXTBUF_NOB];
+	static char	source[LNET_SINGLE_TEXTBUF_NOB];
+
+	struct list_head	  raw_entries;
+	struct list_head	  matched_nets;
+	struct list_head	  current_nets;
+	struct list_head	 *t;
+	struct list_head	 *t2;
+	lnet_text_buf_t    *tb;
+	lnet_text_buf_t    *tb2;
+	__u32	       net1;
+	__u32	       net2;
+	int		 len;
+	int		 count;
+	int		 dup;
+	int		 rc;
+
+	INIT_LIST_HEAD(&raw_entries);
+	if (lnet_str2tbs_sep(&raw_entries, ip2nets) < 0) {
+		CERROR("Error parsing ip2nets\n");
+		LASSERT (lnet_tbnob == 0);
+		return -EINVAL;
+	}
+
+	INIT_LIST_HEAD(&matched_nets);
+	INIT_LIST_HEAD(&current_nets);
+	networks[0] = 0;
+	count = 0;
+	len = 0;
+	rc = 0;
+
+	while (!list_empty(&raw_entries)) {
+		tb = list_entry(raw_entries.next, lnet_text_buf_t,
+				    ltb_list);
+
+		/* keep the unmodified entry text for error messages */
+		strncpy(source, tb->ltb_text, sizeof(source)-1);
+		source[sizeof(source)-1] = 0;
+
+		/* replace ltb_text with the network(s) add on match */
+		rc = lnet_match_network_tokens(tb->ltb_text, ipaddrs, nip);
+		if (rc < 0)
+			break;	/* tb is still on raw_entries; freed at out */
+
+		list_del(&tb->ltb_list);
+
+		if (rc == 0) {		  /* no match */
+			lnet_free_text_buf(tb);
+			continue;
+		}
+
+		/* split into separate networks */
+		INIT_LIST_HEAD(&current_nets);
+		list_add(&tb->ltb_list, &current_nets);
+		rc = lnet_splitnets(source, &current_nets);
+		if (rc < 0)
+			break;	/* current_nets is freed at out */
+
+		/* does any network of this entry repeat one already matched? */
+		dup = 0;
+		list_for_each (t, &current_nets) {
+			tb = list_entry(t, lnet_text_buf_t, ltb_list);
+			net1 = lnet_netspec2net(tb->ltb_text);
+			LASSERT (net1 != LNET_NIDNET(LNET_NID_ANY));
+
+			list_for_each(t2, &matched_nets) {
+				tb2 = list_entry(t2, lnet_text_buf_t,
+						     ltb_list);
+				net2 = lnet_netspec2net(tb2->ltb_text);
+				LASSERT (net2 != LNET_NIDNET(LNET_NID_ANY));
+
+				if (net1 == net2) {
+					dup = 1;
+					break;
+				}
+			}
+
+			if (dup)
+				break;
+		}
+
+		/* earlier entries win: drop the whole entry on a duplicate */
+		if (dup) {
+			lnet_free_text_bufs(&current_nets);
+			continue;
+		}
+
+		/* move the entry's networks to matched_nets and append their
+		 * text to the result string */
+		list_for_each_safe(t, t2, &current_nets) {
+			tb = list_entry(t, lnet_text_buf_t, ltb_list);
+
+			list_del(&tb->ltb_list);
+			list_add_tail(&tb->ltb_list, &matched_nets);
+
+			len += snprintf(networks + len, sizeof(networks) - len,
+					"%s%s", (len == 0) ? "" : ",",
+					tb->ltb_text);
+
+			/* snprintf() reports the untruncated length, so this
+			 * detects a result that did not fit */
+			if (len >= sizeof(networks)) {
+				CERROR("Too many matched networks\n");
+				rc = -E2BIG;
+				goto out;
+			}
+		}
+
+		count++;
+	}
+
+ out:
+	/* release every text buffer still held on any of the lists */
+	lnet_free_text_bufs(&raw_entries);
+	lnet_free_text_bufs(&matched_nets);
+	lnet_free_text_bufs(&current_nets);
+	LASSERT (lnet_tbnob == 0);
+
+	if (rc < 0)
+		return rc;
+
+	*networksp = networks;
+	return count;
+}
+
+/*
+ * Free an array of \a nip IP addresses.  \a nip must be the entry count
+ * the array was allocated with, since LIBCFS_FREE() is given the size.
+ */
+void
+lnet_ipaddr_free_enumeration(__u32 *ipaddrs, int nip)
+{
+	LIBCFS_FREE(ipaddrs, nip * sizeof(*ipaddrs));
+}
+
+/*
+ * Collect the IP addresses of all local interfaces, other than "lo", that
+ * can be queried and are up.
+ *
+ * Returns the number of addresses found.  When that number is > 0,
+ * \a *ipaddrsp is set to an array of exactly that many entries, to be
+ * released with lnet_ipaddr_free_enumeration().  Returns 0 (with
+ * \a *ipaddrsp untouched) if no interface qualifies, or a negative value
+ * on enumeration or allocation failure.
+ */
+int
+lnet_ipaddr_enumerate (__u32 **ipaddrsp)
+{
+	char	**ifnames;
+	__u32	 *ipaddrs;
+	__u32	 *ipaddrs2;
+	__u32	  netmask;
+	int	  up;
+	int	  rc;
+	int	  idx;
+	int	  nip = 0;
+	int	  nif = libcfs_ipif_enumerate(&ifnames);
+
+	if (nif <= 0)
+		return nif;
+
+	/* room for the worst case: every interface is usable */
+	LIBCFS_ALLOC(ipaddrs, nif * sizeof(*ipaddrs));
+	if (ipaddrs == NULL) {
+		CERROR("Can't allocate ipaddrs[%d]\n", nif);
+		libcfs_ipif_free_enumeration(ifnames, nif);
+		return -ENOMEM;
+	}
+
+	for (idx = 0; idx < nif; idx++) {
+		if (strcmp(ifnames[idx], "lo") == 0)
+			continue;
+
+		/* the slot is only kept (nip advanced) if the query
+		 * succeeds and the interface is up */
+		rc = libcfs_ipif_query(ifnames[idx], &up,
+				       &ipaddrs[nip], &netmask);
+		if (rc != 0) {
+			CWARN("Can't query interface %s: %d\n",
+			      ifnames[idx], rc);
+		} else if (!up) {
+			CWARN("Ignoring interface %s: it's down\n",
+			      ifnames[idx]);
+		} else {
+			nip++;
+		}
+	}
+
+	libcfs_ipif_free_enumeration(ifnames, nif);
+
+	/* every interface was usable: hand over the array as it is */
+	if (nip == nif) {
+		*ipaddrsp = ipaddrs;
+		return nip;
+	}
+
+	/* otherwise return a copy sized to the addresses actually found */
+	if (nip > 0) {
+		LIBCFS_ALLOC(ipaddrs2, nip * sizeof(*ipaddrs2));
+		if (ipaddrs2 != NULL) {
+			memcpy(ipaddrs2, ipaddrs,
+			       nip * sizeof(*ipaddrs));
+			*ipaddrsp = ipaddrs2;
+		} else {
+			CERROR("Can't allocate ipaddrs[%d]\n", nip);
+			nip = -ENOMEM;
+		}
+	}
+
+	lnet_ipaddr_free_enumeration(ipaddrs, nif);
+	return nip;
+}
+
+/*
+ * Resolve \a ip2nets against the local IP interfaces.
+ *
+ * On success \a *networksp has been set by lnet_match_networks() and 0 is
+ * returned.  Returns -ENOENT if there are no local IP interfaces or none
+ * of them matches, or the negative error from enumeration/matching.
+ */
+int
+lnet_parse_ip2nets (char **networksp, char *ip2nets)
+{
+	__u32	*ipaddrs;
+	int	 nmatched;
+	int	 nip;
+
+	nip = lnet_ipaddr_enumerate(&ipaddrs);
+	if (nip < 0) {
+		LCONSOLE_ERROR_MSG(0x117, "Error %d enumerating local IP "
+				   "interfaces for ip2nets to match\n", nip);
+		return nip;
+	}
+	if (nip == 0) {
+		LCONSOLE_ERROR_MSG(0x118, "No local IP interfaces "
+				   "for ip2nets to match\n");
+		return -ENOENT;
+	}
+
+	/* the address array is only needed for the duration of the match */
+	nmatched = lnet_match_networks(networksp, ip2nets, ipaddrs, nip);
+	lnet_ipaddr_free_enumeration(ipaddrs, nip);
+
+	if (nmatched > 0)
+		return 0;
+
+	if (nmatched == 0) {
+		LCONSOLE_ERROR_MSG(0x11a, "ip2nets does not match "
+				   "any local IP interfaces\n");
+		return -ENOENT;
+	}
+
+	LCONSOLE_ERROR_MSG(0x119, "Error %d parsing ip2nets\n", nmatched);
+	return nmatched;
+}
+
+/*
+ * Set the address part of \a ni's NID from the IP address of a local
+ * interface.
+ *
+ * If \a ni names an interface (ni_interfaces[0]) that interface is used;
+ * otherwise the first interface other than "lo" that can be queried and
+ * is up is used.
+ *
+ * Returns 0 on success (ni->ni_nid updated), or:
+ *   -EPERM     more than one interface is configured, or the configured
+ *              interface can't be queried;
+ *   -ENETDOWN  the configured interface is down;
+ *   -ENOENT    no usable interface exists;
+ *   < 0        the error from libcfs_ipif_enumerate().
+ */
+int
+lnet_set_ip_niaddr (lnet_ni_t *ni)
+{
+	__u32  net = LNET_NIDNET(ni->ni_nid);
+	char **names;
+	int    n;
+	__u32  ip;
+	__u32  netmask;
+	int    up;
+	int    i;
+	int    rc;
+
+	/* Convenience for LNDs that use the IP address of a local interface as
+	 * the local address part of their NID */
+
+	if (ni->ni_interfaces[0] != NULL) {
+
+		CLASSERT (LNET_MAX_INTERFACES > 1);
+
+		if (ni->ni_interfaces[1] != NULL) {
+			CERROR("Net %s doesn't support multiple interfaces\n",
+			       libcfs_net2str(net));
+			return -EPERM;
+		}
+
+		rc = libcfs_ipif_query(ni->ni_interfaces[0],
+				       &up, &ip, &netmask);
+		if (rc != 0) {
+			CERROR("Net %s can't query interface %s: %d\n",
+			       libcfs_net2str(net), ni->ni_interfaces[0], rc);
+			return -EPERM;
+		}
+
+		if (!up) {
+			CERROR("Net %s can't use interface %s: it's down\n",
+			       libcfs_net2str(net), ni->ni_interfaces[0]);
+			return -ENETDOWN;
+		}
+
+		ni->ni_nid = LNET_MKNID(net, ip);
+		return 0;
+	}
+
+	n = libcfs_ipif_enumerate(&names);
+	if (n <= 0) {
+		CERROR("Net %s can't enumerate interfaces: %d\n",
+		       libcfs_net2str(net), n);
+		/* ni_nid has not been set, so this must not be reported as
+		 * success; "no interfaces at all" is treated like "no usable
+		 * interface" below */
+		return (n < 0) ? n : -ENOENT;
+	}
+
+	for (i = 0; i < n; i++) {
+		if (!strcmp(names[i], "lo")) /* skip the loopback IF */
+			continue;
+
+		rc = libcfs_ipif_query(names[i], &up, &ip, &netmask);
+
+		if (rc != 0) {
+			CWARN("Net %s can't query interface %s: %d\n",
+			      libcfs_net2str(net), names[i], rc);
+			continue;
+		}
+
+		if (!up) {
+			CWARN("Net %s ignoring interface %s (down)\n",
+			      libcfs_net2str(net), names[i]);
+			continue;
+		}
+
+		/* first usable interface wins */
+		libcfs_ipif_free_enumeration(names, n);
+		ni->ni_nid = LNET_MKNID(net, ip);
+		return 0;
+	}
+
+	CERROR("Net %s can't find any interfaces\n", libcfs_net2str(net));
+	libcfs_ipif_free_enumeration(names, n);
+	return -ENOENT;
+}
+EXPORT_SYMBOL(lnet_set_ip_niaddr);

diff --git a/drivers/staging/lustre/lnet/lnet/lib-eq.c b/drivers/staging/lustre/lnet/lnet/lib-eq.c
new file mode 100644
index 0000000..78297a7
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/lib-eq.c

@@ -0,0 +1,447 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/lnet/lib-eq.c
+ *
+ * Library level Event queue management routines
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include <linux/lnet/lib-lnet.h>
+
+/**
+ * Create an event queue that has room for \a count number of events.
+ *
+ * The event queue is circular and older events will be overwritten by new
+ * ones if they are not removed in time by the user using the functions
+ * LNetEQGet(), LNetEQWait(), or LNetEQPoll(). It is up to the user to
+ * determine the appropriate size of the event queue to prevent this loss
+ * of events. Note that when EQ handler is specified in \a callback, no
+ * event loss can happen, since the handler is run for each event deposited
+ * into the EQ.
+ *
+ * \param count The number of events to be stored in the event queue. It
+ * will be rounded up to the next power of two.
+ * \param callback A handler function that runs when an event is deposited
+ * into the EQ. The constant value LNET_EQ_HANDLER_NONE can be used to
+ * indicate that no event handler is desired.
+ * \param handle On successful return, this location will hold a handle for
+ * the newly created EQ.
+ *
+ * \retval 0       On success.
+ * \retval -EINVAL If a parameter is not valid.
+ * \retval -ENOMEM If memory for the EQ can't be allocated.
+ *
+ * \see lnet_eq_handler_t for the discussion on EQ handler semantics.
+ */
+int
+LNetEQAlloc(unsigned int count, lnet_eq_handler_t callback,
+	    lnet_handle_eq_t *handle)
+{
+	lnet_eq_t     *eq;
+
+	LASSERT (the_lnet.ln_init);
+	LASSERT (the_lnet.ln_refcount > 0);
+
+	/* We need count to be a power of 2 so that when eq_{enq,deq}_seq
+	 * overflow, they don't skip entries, so the queue has the same
+	 * apparent capacity at all times */
+
+	count = cfs_power2_roundup(count);
+
+	/* count is unsigned, so it is printed with %u */
+	if (callback != LNET_EQ_HANDLER_NONE && count != 0) {
+		CWARN("EQ callback is guaranteed to get every event, "
+		      "do you still want to set eqcount %u for polling "
+		      "event which will have locking overhead? "
+		      "Please contact with developer to confirm\n", count);
+	}
+
+	/* count can be 0 if only need callback, we can eliminate
+	 * overhead of enqueue event */
+	if (count == 0 && callback == LNET_EQ_HANDLER_NONE)
+		return -EINVAL;
+
+	eq = lnet_eq_alloc();
+	if (eq == NULL)
+		return -ENOMEM;
+
+	/* a callback-only EQ (count == 0) gets no event ring at all */
+	if (count != 0) {
+		LIBCFS_ALLOC(eq->eq_events, count * sizeof(lnet_event_t));
+		if (eq->eq_events == NULL)
+			goto failed;
+		/* NB allocator has set all event sequence numbers to 0,
+		 * so all them should be earlier than eq_deq_seq */
+	}
+
+	eq->eq_deq_seq = 1;
+	eq->eq_enq_seq = 1;
+	eq->eq_size = count;
+	eq->eq_callback = callback;
+
+	/* per-CPT reference counters */
+	eq->eq_refs = cfs_percpt_alloc(lnet_cpt_table(),
+				       sizeof(*eq->eq_refs[0]));
+	if (eq->eq_refs == NULL)
+		goto failed;
+
+	/* MUST hold both exclusive lnet_res_lock */
+	lnet_res_lock(LNET_LOCK_EX);
+	/* NB: hold lnet_eq_wait_lock for EQ link/unlink, so we can do
+	 * both EQ lookup and poll event with only lnet_eq_wait_lock */
+	lnet_eq_wait_lock();
+
+	lnet_res_lh_initialize(&the_lnet.ln_eq_container, &eq->eq_lh);
+	list_add(&eq->eq_list, &the_lnet.ln_eq_container.rec_active);
+
+	lnet_eq_wait_unlock();
+	lnet_res_unlock(LNET_LOCK_EX);
+
+	lnet_eq2handle(handle, eq);
+	return 0;
+
+failed:
+	/* NOTE(review): these checks assume lnet_eq_alloc() returns zeroed
+	 * memory, since eq_events/eq_refs may not have been assigned on
+	 * this path -- confirm */
+	if (eq->eq_events != NULL)
+		LIBCFS_FREE(eq->eq_events, count * sizeof(lnet_event_t));
+
+	if (eq->eq_refs != NULL)
+		cfs_percpt_free(eq->eq_refs);
+
+	lnet_eq_free(eq);
+	return -ENOMEM;
+}
+EXPORT_SYMBOL(LNetEQAlloc);
+
+/**
+ * Release the resources associated with an event queue if it's idle;
+ * otherwise do nothing and it's up to the user to try again.
+ *
+ * \param eqh A handle for the event queue to be released.
+ *
+ * \retval 0 If the EQ is not in use and freed.
+ * \retval -ENOENT If \a eqh does not point to a valid EQ.
+ * \retval -EBUSY  If the EQ is still in use by some MDs.
+ */
+int
+LNetEQFree(lnet_handle_eq_t eqh)
+{
+	struct lnet_eq	*eq;
+	lnet_event_t	*events = NULL;
+	int		**refs = NULL;
+	int		*ref;
+	int		rc = 0;
+	int		size = 0;
+	int		i;
+
+	LASSERT(the_lnet.ln_init);
+	LASSERT(the_lnet.ln_refcount > 0);
+
+	lnet_res_lock(LNET_LOCK_EX);
+	/* NB: hold lnet_eq_wait_lock for EQ link/unlink, so we can do
+	 * both EQ lookup and poll event with only lnet_eq_wait_lock */
+	lnet_eq_wait_lock();
+
+	eq = lnet_handle2eq(&eqh);
+	if (eq == NULL) {
+		rc = -ENOENT;
+		goto out;
+	}
+
+	/* the EQ is busy if any of its per-CPT reference counters is
+	 * non-zero */
+	cfs_percpt_for_each(ref, i, eq->eq_refs) {
+		LASSERT(*ref >= 0);
+		if (*ref == 0)
+			continue;
+
+		CDEBUG(D_NET, "Event equeue (%d: %d) busy on destroy.\n",
+		       i, *ref);
+		rc = -EBUSY;
+		goto out;
+	}
+
+	/* stash for free after lock dropped */
+	events	= eq->eq_events;
+	size	= eq->eq_size;
+	refs	= eq->eq_refs;
+
+	/* unlink the EQ so its handle can no longer be looked up, then
+	 * release the descriptor itself */
+	lnet_res_lh_invalidate(&eq->eq_lh);
+	list_del(&eq->eq_list);
+	lnet_eq_free_locked(eq);
+ out:
+	lnet_eq_wait_unlock();
+	lnet_res_unlock(LNET_LOCK_EX);
+
+	/* both stay NULL on the error paths, so nothing is freed there */
+	if (events != NULL)
+		LIBCFS_FREE(events, size * sizeof(lnet_event_t));
+	if (refs != NULL)
+		cfs_percpt_free(refs);
+
+	return rc;
+}
+EXPORT_SYMBOL(LNetEQFree);
+
+/*
+ * Deliver event \a ev to \a eq: store a copy in the EQ's event ring (if it
+ * has one) and run the EQ's callback (if it has one).
+ */
+void
+lnet_eq_enqueue_event(lnet_eq_t *eq, lnet_event_t *ev)
+{
+	/* MUST be called with the resource lock held but without
+	 * lnet_eq_wait_lock */
+	int index;
+
+	/* callback-only EQ: nothing to store, so lnet_eq_wait_lock is not
+	 * taken */
+	if (eq->eq_size == 0) {
+		LASSERT(eq->eq_callback != LNET_EQ_HANDLER_NONE);
+		eq->eq_callback(ev);
+		return;
+	}
+
+	lnet_eq_wait_lock();
+	ev->sequence = eq->eq_enq_seq++;
+
+	/* eq_size is a power of two, so masking the sequence number yields
+	 * the ring slot */
+	LASSERT(eq->eq_size == LOWEST_BIT_SET(eq->eq_size));
+	index = ev->sequence & (eq->eq_size - 1);
+
+	eq->eq_events[index] = *ev;
+
+	if (eq->eq_callback != LNET_EQ_HANDLER_NONE)
+		eq->eq_callback(ev);
+
+	/* Wake anyone waiting in LNetEQPoll() */
+	if (waitqueue_active(&the_lnet.ln_eq_waitq))
+		wake_up_all(&the_lnet.ln_eq_waitq);
+	lnet_eq_wait_unlock();
+}
+
+/*
+ * Copy the next unread event of \a eq into \a ev, if there is one.
+ *
+ * Must be called with lnet_eq_wait_lock held.
+ *
+ * Returns 0 if there is no new event (or \a eq has no event ring),
+ * 1 if \a ev holds the next event in sequence, or -EOVERFLOW if \a ev
+ * holds an event but at least one earlier event was overwritten before it
+ * was read.
+ */
+int
+lnet_eq_dequeue_event(lnet_eq_t *eq, lnet_event_t *ev)
+{
+	lnet_event_t	*new_event;
+	int		new_index;
+	int		rc;
+	ENTRY;
+
+	/* An EQ created with count 0 is callback-only: LNetEQAlloc() never
+	 * allocates eq_events for it, so there is no ring to index and
+	 * (eq_size - 1) is not a usable mask.  Such an EQ never has queued
+	 * events. */
+	if (eq->eq_size == 0)
+		RETURN(0);
+
+	new_index = eq->eq_deq_seq & (eq->eq_size - 1);
+	new_event = &eq->eq_events[new_index];
+
+	/* the slot still holds an event older than the one we expect */
+	if (LNET_SEQ_GT(eq->eq_deq_seq, new_event->sequence))
+		RETURN(0);
+
+	/* We've got a new event... */
+	*ev = *new_event;
+
+	CDEBUG(D_INFO, "event: %p, sequence: %lu, eq->size: %u\n",
+	       new_event, eq->eq_deq_seq, eq->eq_size);
+
+	/* ...but did it overwrite an event we've not seen yet? */
+	if (eq->eq_deq_seq == new_event->sequence) {
+		rc = 1;
+	} else {
+		/* don't complain with CERROR: some EQs are sized small
+		 * anyway; if it's important, the caller should complain */
+		CDEBUG(D_NET, "Event Queue Overflow: eq seq %lu ev seq %lu\n",
+		       eq->eq_deq_seq, new_event->sequence);
+		rc = -EOVERFLOW;
+	}
+
+	/* resume after the event just returned, skipping any lost ones */
+	eq->eq_deq_seq = new_event->sequence + 1;
+	RETURN(rc);
+}
+
+/**
+ * A nonblocking function that can be used to get the next event in an EQ.
+ * If an event handler is associated with the EQ, the handler will run before
+ * this function returns successfully. The event is removed from the queue.
+ *
+ * \param eventq A handle for the event queue.
+ * \param event On successful return (1 or -EOVERFLOW), this location will
+ * hold the next event in the EQ.
+ *
+ * \retval 0	  No pending event in the EQ.
+ * \retval 1	  Indicates success.
+ * \retval -ENOENT    If \a eventq does not point to a valid EQ.
+ * \retval -EOVERFLOW Indicates success (i.e., an event is returned) and that
+ * at least one event between this event and the last event obtained from the
+ * EQ has been dropped due to limited space in the EQ.
+ */
+int
+LNetEQGet (lnet_handle_eq_t eventq, lnet_event_t *event)
+{
+	int	index;	/* filled in by LNetEQPoll(); not used here */
+	int	rc;
+
+	/* a single EQ with a zero timeout: check once, never sleep */
+	rc = LNetEQPoll(&eventq, 1, 0, event, &index);
+	return rc;
+}
+EXPORT_SYMBOL(LNetEQGet);
+
+/**
+ * Block the calling process until there is an event in the EQ.
+ * If an event handler is associated with the EQ, the handler will run before
+ * this function returns successfully. This function returns the next event
+ * in the EQ and removes it from the EQ.
+ *
+ * \param eventq A handle for the event queue.
+ * \param event On successful return (1 or -EOVERFLOW), this location will
+ * hold the next event in the EQ.
+ *
+ * \retval 1	  Indicates success.
+ * \retval -ENOENT    If \a eventq does not point to a valid EQ.
+ * \retval -EOVERFLOW Indicates success (i.e., an event is returned) and that
+ * at least one event between this event and the last event obtained from the
+ * EQ has been dropped due to limited space in the EQ.
+ */
+int
+LNetEQWait (lnet_handle_eq_t eventq, lnet_event_t *event)
+{
+	int	index;	/* filled in by LNetEQPoll(); not used here */
+	int	rc;
+
+	/* a single EQ with no timeout: block until an event arrives */
+	rc = LNetEQPoll(&eventq, 1, LNET_TIME_FOREVER, event, &index);
+	return rc;
+}
+EXPORT_SYMBOL(LNetEQWait);
+
+
+/*
+ * Sleep on the_lnet.ln_eq_waitq for up to \a *timeout_ms milliseconds
+ * (indefinitely if negative).
+ *
+ * Called with lnet_eq_wait_lock held; the lock is dropped while sleeping
+ * and re-taken before returning.  \a *timeout_ms is updated to the time
+ * that remains (never below 0).
+ *
+ * Returns -1 if the timeout was already 0 (nothing was done), 0 if the
+ * timeout has now expired, 1 if the caller may wait again.
+ */
+static int
+lnet_eq_wait_locked(int *timeout_ms)
+{
+	int		tms = *timeout_ms;
+	int		wait;
+	wait_queue_t  wl;
+	cfs_time_t      now;
+
+	if (tms == 0)
+		return -1; /* don't want to wait and no new event */
+
+	/* queue ourselves before dropping the lock */
+	init_waitqueue_entry_current(&wl);
+	set_current_state(TASK_INTERRUPTIBLE);
+	add_wait_queue(&the_lnet.ln_eq_waitq, &wl);
+
+	lnet_eq_wait_unlock();
+
+	if (tms < 0) {
+		/* no timeout: tms stays negative, so wait ends up 1 */
+		waitq_wait(&wl, TASK_INTERRUPTIBLE);
+
+	} else {
+		struct timeval tv;
+
+		now = cfs_time_current();
+		waitq_timedwait(&wl, TASK_INTERRUPTIBLE,
+				    cfs_time_seconds(tms) / 1000);
+		/* charge the time actually spent against the timeout */
+		cfs_duration_usec(cfs_time_sub(cfs_time_current(), now), &tv);
+		tms -= (int)(tv.tv_sec * 1000 + tv.tv_usec / 1000);
+		if (tms < 0) /* no more wait but may have new event */
+			tms = 0;
+	}
+
+	wait = tms != 0; /* might need to call here again */
+	*timeout_ms = tms;
+
+	lnet_eq_wait_lock();
+	remove_wait_queue(&the_lnet.ln_eq_waitq, &wl);
+
+	return wait;
+}
+
+
+
+/**
+ * Block the calling process until there's an event from a set of EQs or
+ * timeout happens.
+ *
+ * If an event handler is associated with the EQ, the handler will run before
+ * this function returns successfully, in which case the corresponding event
+ * is consumed.
+ *
+ * LNetEQPoll() provides a timeout to allow applications to poll, block for a
+ * fixed period, or block indefinitely.
+ *
+ * \param eventqs,neq An array of EQ handles, and size of the array.
+ * \param timeout_ms Time in milliseconds to wait for an event to occur on
+ * one of the EQs. The constant LNET_TIME_FOREVER can be used to indicate an
+ * infinite timeout.
+ * \param event,which On successful return (1 or -EOVERFLOW), \a event will
+ * hold the next event in the EQs, and \a which will contain the index of the
+ * EQ from which the event was taken.
+ *
+ * \retval 0	  No pending event in the EQs after timeout.
+ * \retval 1	  Indicates success.
+ * \retval -EOVERFLOW Indicates success (i.e., an event is returned) and that
+ * at least one event between this event and the last event obtained from the
+ * EQ indicated by \a which has been dropped due to limited space in the EQ.
+ * \retval -ENOENT    If there's an invalid handle in \a eventqs.
+ */
+int
+LNetEQPoll(lnet_handle_eq_t *eventqs, int neq, int timeout_ms,
+	   lnet_event_t *event, int *which)
+{
+	int	wait = 1;
+	int	rc;
+	int	i;
+	ENTRY;
+
+	LASSERT (the_lnet.ln_init);
+	LASSERT (the_lnet.ln_refcount > 0);
+
+	if (neq < 1)
+		RETURN(-ENOENT);
+
+	lnet_eq_wait_lock();
+
+	for (;;) {
+		/* scan the EQs in array order; the first one with an event
+		 * (or a detected overflow) wins */
+		for (i = 0; i < neq; i++) {
+			/* handles are looked up again on every pass */
+			lnet_eq_t *eq = lnet_handle2eq(&eventqs[i]);
+
+			if (eq == NULL) {
+				lnet_eq_wait_unlock();
+				RETURN(-ENOENT);
+			}
+
+			rc = lnet_eq_dequeue_event(eq, event);
+			if (rc != 0) {
+				lnet_eq_wait_unlock();
+				*which = i;
+				RETURN(rc);
+			}
+		}
+
+		/* the timeout expired during the previous sleep and the
+		 * scan above was the final check */
+		if (wait == 0)
+			break;
+
+		/*
+		 * return value of lnet_eq_wait_locked:
+		 * -1 : did nothing and it's sure no new event
+		 *  1 : sleep inside and wait until new event
+		 *  0 : don't want to wait anymore, but might have new event
+		 *      so need to call dequeue again
+		 */
+		wait = lnet_eq_wait_locked(&timeout_ms);
+		if (wait < 0) /* no new event */
+			break;
+	}
+
+	lnet_eq_wait_unlock();
+	RETURN(0);
+}

diff --git a/drivers/staging/lustre/lnet/lnet/lib-md.c b/drivers/staging/lustre/lnet/lnet/lib-md.c
new file mode 100644
index 0000000..ae643f2
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/lib-md.c

@@ -0,0 +1,451 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/lnet/lib-md.c
+ *
+ * Memory Descriptor management routines
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/lnet/lib-lnet.h>
+
+/*
+ * Unlink \a md.  The first call marks the MD as a zombie, detaches it from
+ * its ME (if any) and invalidates its handle.  The MD itself is only
+ * released once md_refcount is zero; until then the call just returns.
+ *
+ * must be called with lnet_res_lock held
+ */
+void
+lnet_md_unlink(lnet_libmd_t *md)
+{
+	if ((md->md_flags & LNET_MD_FLAG_ZOMBIE) == 0) {
+		/* first unlink attempt... */
+		lnet_me_t *me = md->md_me;
+
+		md->md_flags |= LNET_MD_FLAG_ZOMBIE;
+
+		/* Disassociate from ME (if any), and unlink it if it was created
+		 * with LNET_UNLINK */
+		if (me != NULL) {
+			/* detach MD from portal */
+			lnet_ptl_detach_md(me, md);
+			if (me->me_unlink == LNET_UNLINK)
+				lnet_me_unlink(me);
+		}
+
+		/* ensure all future handle lookups fail */
+		lnet_res_lh_invalidate(&md->md_lh);
+	}
+
+	/* still referenced: leave the zombie in place for a later call */
+	if (md->md_refcount != 0) {
+		CDEBUG(D_NET, "Queueing unlink of md %p\n", md);
+		return;
+	}
+
+	CDEBUG(D_NET, "Unlinking md %p\n", md);
+
+	/* drop the per-CPT reference this MD holds on its EQ */
+	if (md->md_eq != NULL) {
+		int	cpt = lnet_cpt_of_cookie(md->md_lh.lh_cookie);
+
+		LASSERT(*md->md_eq->eq_refs[cpt] > 0);
+		(*md->md_eq->eq_refs[cpt])--;
+	}
+
+	LASSERT(!list_empty(&md->md_list));
+	list_del_init(&md->md_list);
+	lnet_md_free_locked(md);
+}
+
+/*
+ * Initialise the library MD \a lmd from the user-supplied descriptor
+ * \a umd.  \a unlink selects auto-unlink (LNET_UNLINK).
+ *
+ * Depending on umd->options the memory is described by an iovec array
+ * (LNET_MD_IOVEC), a kiov array (LNET_MD_KIOV) or a single contiguous
+ * buffer; for the two array forms umd->length is the number of fragments.
+ *
+ * Returns 0 on success, or -EINVAL if both array options are set, a
+ * fragment has an invalid length, or max_size is out of range when
+ * LNET_MD_MAX_SIZE is set.
+ */
+static int
+lnet_md_build(lnet_libmd_t *lmd, lnet_md_t *umd, int unlink)
+{
+	int	  i;
+	unsigned int niov;
+	int	  total_length = 0;
+
+	lmd->md_me = NULL;
+	lmd->md_start = umd->start;
+	lmd->md_offset = 0;
+	lmd->md_max_size = umd->max_size;
+	lmd->md_options = umd->options;
+	lmd->md_user_ptr = umd->user_ptr;
+	lmd->md_eq = NULL;
+	lmd->md_threshold = umd->threshold;
+	lmd->md_refcount = 0;
+	lmd->md_flags = (unlink == LNET_UNLINK) ? LNET_MD_FLAG_AUTO_UNLINK : 0;
+
+	if ((umd->options & LNET_MD_IOVEC) != 0) {
+
+		if ((umd->options & LNET_MD_KIOV) != 0) /* Can't specify both */
+			return -EINVAL;
+
+		/* NOTE(review): niov is not bounded here; this relies on the
+		 * caller having run lnet_md_validate() and sized lmd
+		 * accordingly -- confirm */
+		lmd->md_niov = niov = umd->length;
+		memcpy(lmd->md_iov.iov, umd->start,
+		       niov * sizeof (lmd->md_iov.iov[0]));
+
+		for (i = 0; i < (int)niov; i++) {
+			/* We take the base address on trust */
+			if (lmd->md_iov.iov[i].iov_len <= 0) /* invalid length */
+				return -EINVAL;
+
+			total_length += lmd->md_iov.iov[i].iov_len;
+		}
+
+		lmd->md_length = total_length;
+
+		if ((umd->options & LNET_MD_MAX_SIZE) != 0 && /* max size used */
+		    (umd->max_size < 0 ||
+		     umd->max_size > total_length)) /* illegal max_size */
+			return -EINVAL;
+
+	} else if ((umd->options & LNET_MD_KIOV) != 0) {
+		lmd->md_niov = niov = umd->length;
+		memcpy(lmd->md_iov.kiov, umd->start,
+		       niov * sizeof (lmd->md_iov.kiov[0]));
+
+		for (i = 0; i < (int)niov; i++) {
+			/* We take the page pointer on trust */
+			if (lmd->md_iov.kiov[i].kiov_offset +
+			    lmd->md_iov.kiov[i].kiov_len > PAGE_CACHE_SIZE )
+				return -EINVAL; /* invalid length */
+
+			total_length += lmd->md_iov.kiov[i].kiov_len;
+		}
+
+		lmd->md_length = total_length;
+
+		if ((umd->options & LNET_MD_MAX_SIZE) != 0 && /* max size used */
+		    (umd->max_size < 0 ||
+		     umd->max_size > total_length)) /* illegal max_size */
+			return -EINVAL;
+	} else {   /* contiguous */
+		/* a contiguous buffer is stored as a single iov fragment */
+		lmd->md_length = umd->length;
+		lmd->md_niov = niov = 1;
+		lmd->md_iov.iov[0].iov_base = umd->start;
+		lmd->md_iov.iov[0].iov_len = umd->length;
+
+		if ((umd->options & LNET_MD_MAX_SIZE) != 0 && /* max size used */
+		    (umd->max_size < 0 ||
+		     umd->max_size > (int)umd->length)) /* illegal max_size */
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * Activate \a md in the MD container of partition \a cpt: bind it to the
+ * EQ named by \a eq_handle (unless the handle is invalid, meaning "no EQ"),
+ * give it a handle and put it on the container's active list.
+ *
+ * Returns 0 on success, -ENOENT if \a eq_handle is valid-looking but does
+ * not resolve to an EQ.
+ *
+ * must be called with resource lock held
+ */
+static int
+lnet_md_link(lnet_libmd_t *md, lnet_handle_eq_t eq_handle, int cpt)
+{
+	struct lnet_res_container *container = the_lnet.ln_md_containers[cpt];
+
+	/* NB we are passed an allocated, but inactive md.
+	 * if we return success, caller may lnet_md_unlink() it.
+	 * otherwise caller may only lnet_md_free() it.
+	 */
+	/* This implementation doesn't know how to create START events or
+	 * disable END events.  Best to LASSERT our caller is compliant so
+	 * we find out quickly...  */
+	/*  TODO - reevaluate what should be here in light of
+	 * the removal of the start and end events
+	 * maybe there we shouldn't even allow LNET_EQ_NONE!)
+	 * LASSERT (eq == NULL);
+	 */
+	if (!LNetHandleIsInvalid(eq_handle)) {
+		md->md_eq = lnet_handle2eq(&eq_handle);
+
+		if (md->md_eq == NULL)
+			return -ENOENT;
+
+		/* take a reference on the EQ in this partition's counter */
+		(*md->md_eq->eq_refs[cpt])++;
+	}
+
+	lnet_res_lh_initialize(container, &md->md_lh);
+
+	LASSERT(list_empty(&md->md_list));
+	list_add(&md->md_list, &container->rec_active);
+
+	return 0;
+}
+
+/*
+ * Fill the user-visible descriptor \a umd from the library MD \a lmd.
+ *
+ * must be called with lnet_res_lock held
+ */
+void
+lnet_md_deconstruct(lnet_libmd_t *lmd, lnet_md_t *umd)
+{
+	int fragmented =
+		(lmd->md_options & (LNET_MD_IOVEC | LNET_MD_KIOV)) != 0;
+
+	/* NB the iov entries themselves are not copied out: for a
+	 * discontiguous MD the target only learns the original iov pointer
+	 * (in start) and how many entries it had (in length).
+	 */
+	umd->start = lmd->md_start;
+	if (fragmented)
+		umd->length = lmd->md_niov;
+	else
+		umd->length = lmd->md_length;
+
+	umd->options = lmd->md_options;
+	umd->threshold = lmd->md_threshold;
+	umd->max_size = lmd->md_max_size;
+	umd->user_ptr = lmd->md_user_ptr;
+	lnet_eq2handle(&umd->eq_handle, lmd->md_eq);
+}
+
+/*
+ * Sanity-check a user-supplied MD.
+ *
+ * Returns -EINVAL if a non-zero length comes with a NULL start pointer,
+ * or if an IOVEC/KIOV descriptor has more than LNET_MAX_IOV fragments;
+ * 0 otherwise.
+ */
+int
+lnet_md_validate(lnet_md_t *umd)
+{
+	int fragmented;
+
+	if (umd->length != 0 && umd->start == NULL) {
+		CERROR("MD start pointer can not be NULL with length %u\n",
+		       umd->length);
+		return -EINVAL;
+	}
+
+	/* for the array forms, length is the fragment count */
+	fragmented = (umd->options & (LNET_MD_KIOV | LNET_MD_IOVEC)) != 0;
+	if (fragmented && umd->length > LNET_MAX_IOV) {
+		CERROR("Invalid option: too many fragments %u, %d max\n",
+		       umd->length, LNET_MAX_IOV);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/**
+ * Create a memory descriptor and attach it to a ME
+ *
+ * \param meh A handle for a ME to associate the new MD with.
+ * \param umd Provides initial values for the user-visible parts of a MD.
+ * Other than its use for initialization, there is no linkage between this
+ * structure and the MD maintained by the LNet.
+ * \param unlink A flag to indicate whether the MD is automatically unlinked
+ * when it becomes inactive, either because the operation threshold drops to
+ * zero or because the available memory becomes less than \a umd.max_size.
+ * (Note that the check for unlinking a MD only occurs after the completion
+ * of a successful operation on the MD.) The value LNET_UNLINK enables auto
+ * unlinking; the value LNET_RETAIN disables it.
+ * \param handle On successful returns, a handle to the newly created MD is
+ * saved here. This handle can be used later in LNetMDUnlink().
+ *
+ * \retval 0       On success.
+ * \retval -EINVAL If \a umd is not valid, or if \a umd.options has neither
+ * LNET_MD_OP_GET nor LNET_MD_OP_PUT set.
+ * \retval -ENOMEM If new MD cannot be allocated.
+ * \retval -ENOENT Either \a meh or \a umd.eq_handle does not point to a
+ * valid object. Note that it's OK to supply a NULL \a umd.eq_handle by
+ * calling LNetInvalidateHandle() on it.
+ * \retval -EBUSY  If the ME pointed to by \a meh is already associated with
+ * a MD.
+ *
+ * A non-zero return value of lnet_md_build() is also passed back unchanged.
+ */
+int
+LNetMDAttach(lnet_handle_me_t meh, lnet_md_t umd,
+	     lnet_unlink_t unlink, lnet_handle_md_t *handle)
+{
+	LIST_HEAD		(matches);
+	LIST_HEAD		(drops);
+	struct lnet_me		*me;
+	struct lnet_libmd	*md;
+	int			cpt;
+	int			rc;
+
+	LASSERT (the_lnet.ln_init);
+	LASSERT (the_lnet.ln_refcount > 0);
+
+	if (lnet_md_validate(&umd) != 0)
+		return -EINVAL;
+
+	/* an MD attached to a ME must accept at least one of GET or PUT */
+	if ((umd.options & (LNET_MD_OP_GET | LNET_MD_OP_PUT)) == 0) {
+		CERROR("Invalid option: no MD_OP set\n");
+		return -EINVAL;
+	}
+
+	md = lnet_md_alloc(&umd);
+	if (md == NULL)
+		return -ENOMEM;
+
+	/* rc is only examined once the resource lock is held, so that every
+	 * failure leaves through 'failed', which frees the MD under the
+	 * lock with lnet_md_free_locked() */
+	rc = lnet_md_build(md, &umd, unlink);
+	cpt = lnet_cpt_of_cookie(meh.cookie);
+
+	lnet_res_lock(cpt);
+	if (rc != 0)
+		goto failed;
+
+	me = lnet_handle2me(&meh);
+	if (me == NULL)
+		rc = -ENOENT;
+	else if (me->me_md != NULL)
+		rc = -EBUSY;
+	else
+		rc = lnet_md_link(md, umd.eq_handle, cpt);
+
+	if (rc != 0)
+		goto failed;
+
+	/* attach this MD to portal of ME and check if it matches any
+	 * blocked msgs on this portal */
+	lnet_ptl_attach_md(me, md, &matches, &drops);
+
+	lnet_md2handle(handle, md);
+
+	lnet_res_unlock(cpt);
+
+	/* the two lists filled in by lnet_ptl_attach_md() are processed
+	 * only after the resource lock has been released */
+	lnet_drop_delayed_msg_list(&drops, "Bad match");
+	lnet_recv_delayed_msg_list(&matches);
+
+	return 0;
+
+ failed:
+	lnet_md_free_locked(md);
+
+	lnet_res_unlock(cpt);
+	return rc;
+}
+EXPORT_SYMBOL(LNetMDAttach);
+
+/**
+ * Create a "free floating" memory descriptor - a MD that is not associated
+ * with a ME. Such MDs are usually used in LNetPut() and LNetGet() operations.
+ *
+ * \param umd,unlink See the discussion for LNetMDAttach().
+ * \param handle On successful returns, a handle to the newly created MD is
+ * saved here. This handle can be used later in LNetMDUnlink(), LNetPut(),
+ * and LNetGet() operations.
+ *
+ * \retval 0       On success.
+ * \retval -EINVAL If \a umd is not valid, or if \a umd.options has
+ * LNET_MD_OP_GET or LNET_MD_OP_PUT set.
+ * \retval -ENOMEM If new MD cannot be allocated.
+ * \retval -ENOENT \a umd.eq_handle does not point to a valid EQ. Note that
+ * it's OK to supply a NULL \a umd.eq_handle by calling
+ * LNetInvalidateHandle() on it.
+ *
+ * A non-zero return value of lnet_md_build() is also passed back unchanged.
+ */
+int
+LNetMDBind(lnet_md_t umd, lnet_unlink_t unlink, lnet_handle_md_t *handle)
+{
+	lnet_libmd_t	*md;
+	int		cpt;
+	int		rc;
+
+	LASSERT (the_lnet.ln_init);
+	LASSERT (the_lnet.ln_refcount > 0);
+
+	if (lnet_md_validate(&umd) != 0)
+		return -EINVAL;
+
+	/* opposite of LNetMDAttach(): a bound MD must not carry the GET/PUT
+	 * option bits */
+	if ((umd.options & (LNET_MD_OP_GET | LNET_MD_OP_PUT)) != 0) {
+		CERROR("Invalid option: GET|PUT illegal on active MDs\n");
+		return -EINVAL;
+	}
+
+	md = lnet_md_alloc(&umd);
+	if (md == NULL)
+		return -ENOMEM;
+
+	rc = lnet_md_build(md, &umd, unlink);
+
+	/* take the resource lock before looking at rc so the failure path
+	 * can free the MD with lnet_md_free_locked(); the cpt used for
+	 * linking is the one returned by lnet_res_lock_current() */
+	cpt = lnet_res_lock_current();
+	if (rc != 0)
+		goto failed;
+
+	rc = lnet_md_link(md, umd.eq_handle, cpt);
+	if (rc != 0)
+		goto failed;
+
+	lnet_md2handle(handle, md);
+
+	lnet_res_unlock(cpt);
+	return 0;
+
+ failed:
+	lnet_md_free_locked(md);
+
+	lnet_res_unlock(cpt);
+	return rc;
+}
+EXPORT_SYMBOL(LNetMDBind);
+
+/**
+ * Unlink the memory descriptor from any ME it may be linked to and release
+ * the internal resources associated with it.
+ *
+ * This function does not free the memory region associated with the MD;
+ * i.e., the memory the user allocated for this MD. If the ME associated with
+ * this MD is not NULL and was created with auto unlink enabled, the ME is
+ * unlinked as well (see LNetMEAttach()).
+ *
+ * Explicitly unlinking a MD via this function call has the same behavior as
+ * a MD that has been automatically unlinked, except that no LNET_EVENT_UNLINK
+ * is generated in the latter case.
+ *
+ * An unlinked event can be reported in two ways:
+ * - If there's no pending operations on the MD, it's unlinked immediately
+ *   and an LNET_EVENT_UNLINK event is logged before this function returns.
+ * - Otherwise, the MD is only marked for deletion when this function
+ *   returns, and the unlinked event will be piggybacked on the event of
+ *   the completion of the last operation by setting the unlinked field of
+ *   the event. No dedicated LNET_EVENT_UNLINK event is generated.
+ *
+ * Note that in both cases the unlinked field of the event is always set; no
+ * more event will happen on the MD after such an event is logged.
+ *
+ * \param mdh A handle for the MD to be unlinked.
+ *
+ * \retval 0       On success.
+ * \retval -ENOENT If \a mdh does not point to a valid MD object.
+ */
+int
+LNetMDUnlink (lnet_handle_md_t mdh)
+{
+	lnet_event_t	ev;
+	lnet_libmd_t	*md;
+	int		cpt;
+
+	LASSERT(the_lnet.ln_init);
+	LASSERT(the_lnet.ln_refcount > 0);
+
+	/* the resource lock to take is derived from the handle cookie */
+	cpt = lnet_cpt_of_cookie(mdh.cookie);
+	lnet_res_lock(cpt);
+
+	md = lnet_handle2md(&mdh);
+	if (md == NULL) {
+		lnet_res_unlock(cpt);
+		return -ENOENT;
+	}
+
+	/* If the MD is busy, lnet_md_unlink just marks it for deletion, and
+	 * when the NAL is done, the completion event flags that the MD was
+	 * unlinked.  Otherwise, we enqueue an event now... */
+
+	/* an event is only queued when the MD has an EQ and its refcount
+	 * is zero */
+	if (md->md_eq != NULL &&
+	    md->md_refcount == 0) {
+		lnet_build_unlink_event(md, &ev);
+		lnet_eq_enqueue_event(md->md_eq, &ev);
+	}
+
+	lnet_md_unlink(md);
+
+	lnet_res_unlock(cpt);
+	return 0;
+}
+EXPORT_SYMBOL(LNetMDUnlink);

diff --git a/drivers/staging/lustre/lnet/lnet/lib-me.c b/drivers/staging/lustre/lnet/lnet/lib-me.c
new file mode 100644
index 0000000..0081075
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/lib-me.c

@@ -0,0 +1,297 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/lnet/lib-me.c
+ *
+ * Match Entry management routines
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/lnet/lib-lnet.h>
+
+/**
+ * Create and attach a match entry to the match list of \a portal. The new
+ * ME is empty, i.e. not associated with a memory descriptor. LNetMDAttach()
+ * can be used to attach a MD to an empty ME.
+ *
+ * \param portal The portal table index where the ME should be attached.
+ * \param match_id Specifies the match criteria for the process ID of
+ * the requester. The constants LNET_PID_ANY and LNET_NID_ANY can be
+ * used to wildcard either of the identifiers in the lnet_process_id_t
+ * structure.
+ * \param match_bits,ignore_bits Specify the match criteria to apply
+ * to the match bits in the incoming request. The ignore bits are used
+ * to mask out insignificant bits in the incoming match bits. The resulting
+ * bits are then compared to the ME's match bits to determine if the
+ * incoming request meets the match criteria.
+ * \param unlink Indicates whether the ME should be unlinked when the memory
+ * descriptor associated with it is unlinked (Note that the check for
+ * unlinking a ME only occurs when the memory descriptor is unlinked.).
+ * Valid values are LNET_RETAIN and LNET_UNLINK.
+ * \param pos Indicates whether the new ME should be prepended or
+ * appended to the match list. Allowed constants: LNET_INS_BEFORE,
+ * LNET_INS_AFTER. LNET_INS_LOCAL is queued at the tail like LNET_INS_AFTER.
+ * \param handle On successful returns, a handle to the newly created ME
+ * object is saved here. This handle can be used later in LNetMEInsert(),
+ * LNetMEUnlink(), or LNetMDAttach() functions.
+ *
+ * \retval 0       On success.
+ * \retval -EINVAL If \a portal is invalid.
+ * \retval -EPERM  If lnet_mt_of_attach() finds no match table for this
+ * request (can't match portal type).
+ * \retval -ENOMEM If new ME object cannot be allocated.
+ */
+int
+LNetMEAttach(unsigned int portal,
+	     lnet_process_id_t match_id,
+	     __u64 match_bits, __u64 ignore_bits,
+	     lnet_unlink_t unlink, lnet_ins_pos_t pos,
+	     lnet_handle_me_t *handle)
+{
+	struct lnet_match_table *mtable;
+	struct lnet_me		*me;
+	struct list_head		*head;
+
+	LASSERT(the_lnet.ln_init);
+	LASSERT(the_lnet.ln_refcount > 0);
+
+	/* A portal value that turns negative when cast to int would slip
+	 * through the signed upper-bound comparison and then be used as a
+	 * portal index, so reject it explicitly. */
+	if ((int)portal < 0 || (int)portal >= the_lnet.ln_nportals)
+		return -EINVAL;
+
+	mtable = lnet_mt_of_attach(portal, match_id,
+				   match_bits, ignore_bits, pos);
+	if (mtable == NULL) /* can't match portal type */
+		return -EPERM;
+
+	me = lnet_me_alloc();
+	if (me == NULL)
+		return -ENOMEM;
+
+	lnet_res_lock(mtable->mt_cpt);
+
+	me->me_portal = portal;
+	me->me_match_id = match_id;
+	me->me_match_bits = match_bits;
+	me->me_ignore_bits = ignore_bits;
+	me->me_unlink = unlink;
+	me->me_md = NULL;
+
+	lnet_res_lh_initialize(the_lnet.ln_me_containers[mtable->mt_cpt],
+			       &me->me_lh);
+
+	/* MEs with ignore bits all share the LNET_MT_HASH_IGNORE chain;
+	 * the others go on the chain chosen by lnet_mt_match_head() */
+	if (ignore_bits != 0)
+		head = &mtable->mt_mhash[LNET_MT_HASH_IGNORE];
+	else
+		head = lnet_mt_match_head(mtable, match_id, match_bits);
+
+	/* remember which hash chain (as an index into mt_mhash) holds us */
+	me->me_pos = head - &mtable->mt_mhash[0];
+	if (pos == LNET_INS_AFTER || pos == LNET_INS_LOCAL)
+		list_add_tail(&me->me_list, head);
+	else
+		list_add(&me->me_list, head);
+
+	lnet_me2handle(handle, me);
+
+	lnet_res_unlock(mtable->mt_cpt);
+	return 0;
+}
+EXPORT_SYMBOL(LNetMEAttach);
+
+/**
+ * Create a match entry and insert it before or after the ME pointed to by
+ * \a current_meh. The new ME is empty, i.e. not associated with a memory
+ * descriptor. LNetMDAttach() can be used to attach a MD to an empty ME.
+ *
+ * This function is identical to LNetMEAttach() except for the position
+ * where the new ME is inserted.
+ *
+ * \param current_meh A handle for a ME. The new ME will be inserted
+ * immediately before or immediately after this ME.
+ * \param match_id,match_bits,ignore_bits,unlink,pos,handle See the discussion
+ * for LNetMEAttach().
+ *
+ * \retval 0       On success.
+ * \retval -EPERM  If \a pos is LNET_INS_LOCAL, or if the portal of
+ * \a current_meh is a unique portal (lnet_ptl_is_unique()).
+ * \retval -ENOMEM If new ME object cannot be allocated.
+ * \retval -ENOENT If \a current_meh does not point to a valid match entry.
+ */
+int
+LNetMEInsert(lnet_handle_me_t current_meh,
+	     lnet_process_id_t match_id,
+	     __u64 match_bits, __u64 ignore_bits,
+	     lnet_unlink_t unlink, lnet_ins_pos_t pos,
+	     lnet_handle_me_t *handle)
+{
+	struct lnet_me		*current_me;
+	struct lnet_me		*new_me;
+	struct lnet_portal	*ptl;
+	int			cpt;
+
+	LASSERT(the_lnet.ln_init);
+	LASSERT(the_lnet.ln_refcount > 0);
+
+	if (pos == LNET_INS_LOCAL)
+		return -EPERM;
+
+	/* allocate before taking the lock; every failure below must free
+	 * new_me again while the lock is still held */
+	new_me = lnet_me_alloc();
+	if (new_me == NULL)
+		return -ENOMEM;
+
+	cpt = lnet_cpt_of_cookie(current_meh.cookie);
+
+	lnet_res_lock(cpt);
+
+	current_me = lnet_handle2me(&current_meh);
+	if (current_me == NULL) {
+		lnet_me_free_locked(new_me);
+
+		lnet_res_unlock(cpt);
+		return -ENOENT;
+	}
+
+	LASSERT(current_me->me_portal < the_lnet.ln_nportals);
+
+	ptl = the_lnet.ln_portals[current_me->me_portal];
+	if (lnet_ptl_is_unique(ptl)) {
+		/* it makes no sense to insert on a unique portal */
+		lnet_me_free_locked(new_me);
+		lnet_res_unlock(cpt);
+		return -EPERM;
+	}
+
+	/* the new ME inherits hash position and portal from its neighbour */
+	new_me->me_pos = current_me->me_pos;
+	new_me->me_portal = current_me->me_portal;
+	new_me->me_match_id = match_id;
+	new_me->me_match_bits = match_bits;
+	new_me->me_ignore_bits = ignore_bits;
+	new_me->me_unlink = unlink;
+	new_me->me_md = NULL;
+
+	lnet_res_lh_initialize(the_lnet.ln_me_containers[cpt], &new_me->me_lh);
+
+	/* list_add() places new_me right after current_me, list_add_tail()
+	 * right before it */
+	if (pos == LNET_INS_AFTER)
+		list_add(&new_me->me_list, &current_me->me_list);
+	else
+		list_add_tail(&new_me->me_list, &current_me->me_list);
+
+	lnet_me2handle(handle, new_me);
+
+	lnet_res_unlock(cpt);
+
+	return 0;
+}
+EXPORT_SYMBOL(LNetMEInsert);
+
+/**
+ * Unlink a match entry from its match list.
+ *
+ * This operation also releases any resources associated with the ME. If a
+ * memory descriptor is attached to the ME, then it will be unlinked as well
+ * and an unlink event will be generated. It is an error to use the ME handle
+ * after calling LNetMEUnlink().
+ *
+ * \param meh A handle for the ME to be unlinked.
+ *
+ * \retval 0       On success.
+ * \retval -ENOENT If \a meh does not point to a valid ME.
+ * \see LNetMDUnlink() for the discussion on delivering unlink event.
+ */
+int
+LNetMEUnlink(lnet_handle_me_t meh)
+{
+	lnet_me_t	*me;
+	lnet_libmd_t	*md;
+	lnet_event_t	ev;
+	int		cpt;
+
+	LASSERT(the_lnet.ln_init);
+	LASSERT(the_lnet.ln_refcount > 0);
+
+	/* the resource lock to take is derived from the handle cookie */
+	cpt = lnet_cpt_of_cookie(meh.cookie);
+	lnet_res_lock(cpt);
+
+	me = lnet_handle2me(&meh);
+	if (me == NULL) {
+		lnet_res_unlock(cpt);
+		return -ENOENT;
+	}
+
+	/* same rule as in LNetMDUnlink(): queue an unlink event only when
+	 * the attached MD has an EQ and a zero refcount */
+	md = me->me_md;
+	if (md != NULL &&
+	    md->md_eq != NULL &&
+	    md->md_refcount == 0) {
+		lnet_build_unlink_event(md, &ev);
+		lnet_eq_enqueue_event(md->md_eq, &ev);
+	}
+
+	/* unlinks the attached MD (if any) and frees the ME */
+	lnet_me_unlink(me);
+
+	lnet_res_unlock(cpt);
+	return 0;
+}
+EXPORT_SYMBOL(LNetMEUnlink);
+
+/*
+ * Remove \a me from its match list, unlink the MD attached to it (if any),
+ * invalidate its handle and free it.
+ *
+ * Call with lnet_res_lock held.
+ */
+void
+lnet_me_unlink(lnet_me_t *me)
+{
+	lnet_libmd_t	*attached = me->me_md;
+
+	list_del(&me->me_list);
+
+	if (attached != NULL) {
+		/* detach MD from portal of this ME */
+		lnet_ptl_detach_md(me, attached);
+		lnet_md_unlink(attached);
+	}
+
+	lnet_res_lh_invalidate(&me->me_lh);
+	lnet_me_free_locked(me);
+}
+
+#if 0
+/*
+ * Debug helper (compiled out): print the cookie, match/ignore bits, attached
+ * MD and list neighbours of \a me through CWARN.
+ */
+static void
+lib_me_dump(lnet_me_t *me)
+{
+	CWARN("Match Entry %p ("LPX64")\n", me,
+	      me->me_lh.lh_cookie);
+
+	/* the match/ignore bits are __u64: print them as long long so the
+	 * format matches the argument width on every architecture */
+	CWARN("\tMatch/Ignore\t= %016llx / %016llx\n",
+	      (unsigned long long)me->me_match_bits,
+	      (unsigned long long)me->me_ignore_bits);
+
+	/* the MD pointer lives in me_md; there is no 'md' member */
+	CWARN("\tMD\t= %p\n", me->me_md);
+	CWARN("\tprev\t= %p\n",
+	      list_entry(me->me_list.prev, lnet_me_t, me_list));
+	CWARN("\tnext\t= %p\n",
+	      list_entry(me->me_list.next, lnet_me_t, me_list));
+}
+#endif

diff --git a/drivers/staging/lustre/lnet/lnet/lib-move.c b/drivers/staging/lustre/lnet/lnet/lib-move.c
new file mode 100644
index 0000000..49b0f12
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/lib-move.c

@@ -0,0 +1,2441 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/lnet/lib-move.c
+ *
+ * Data movement routines
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/lnet/lib-lnet.h>
+
+/* Integer module parameter, default 1, registered through CFS_MODULE_PARM
+ * with 0444 and the description "Reserved".
+ * NOTE(review): no reader of this variable is visible in this part of the
+ * file - confirm where (or whether) it is consumed. */
+static int local_nid_dist_zero = 1;
+CFS_MODULE_PARM(local_nid_dist_zero, "i", int, 0444,
+		"Reserved");
+
+/*
+ * Add or remove entries on the test-peer list (the_lnet.ln_test_peers).
+ *
+ * \param nid       NID the entry applies to; LNET_NID_ANY removes every
+ *		    entry when \a threshold is 0.
+ * \param threshold Non-zero: append a new entry for \a nid with this
+ *		    threshold.  Zero: remove entries (see below).
+ *
+ * \retval 0       On success.
+ * \retval -ENOMEM If a new entry cannot be allocated.
+ */
+int
+lnet_fail_nid (lnet_nid_t nid, unsigned int threshold)
+{
+	lnet_test_peer_t  *tp;
+	struct list_head	*el;
+	struct list_head	*next;
+	struct list_head	 cull;
+
+	LASSERT (the_lnet.ln_init);
+
+	/* NB: use lnet_net_lock(0) to serialize operations on test peers */
+	if (threshold != 0) {
+		/* Adding a new entry */
+		LIBCFS_ALLOC(tp, sizeof(*tp));
+		if (tp == NULL)
+			return -ENOMEM;
+
+		tp->tp_nid = nid;
+		tp->tp_threshold = threshold;
+
+		lnet_net_lock(0);
+		list_add_tail(&tp->tp_list, &the_lnet.ln_test_peers);
+		lnet_net_unlock(0);
+		return 0;
+	}
+
+	/* removing entries */
+	INIT_LIST_HEAD(&cull);
+
+	lnet_net_lock(0);
+
+	/* move every entry to be removed onto the private 'cull' list while
+	 * the lock is held; they are freed after the lock is dropped */
+	list_for_each_safe (el, next, &the_lnet.ln_test_peers) {
+		tp = list_entry (el, lnet_test_peer_t, tp_list);
+
+		if (tp->tp_threshold == 0 ||    /* needs culling anyway */
+		    nid == LNET_NID_ANY ||       /* removing all entries */
+		    tp->tp_nid == nid)	  /* matched this one */
+		{
+			list_del (&tp->tp_list);
+			list_add (&tp->tp_list, &cull);
+		}
+	}
+
+	lnet_net_unlock(0);
+
+	while (!list_empty (&cull)) {
+		tp = list_entry (cull.next, lnet_test_peer_t, tp_list);
+
+		list_del (&tp->tp_list);
+		LIBCFS_FREE(tp, sizeof (*tp));
+	}
+	return 0;
+}
+
+/*
+ * Consult the test-peer list to decide whether a message to/from \a nid
+ * should be failed.
+ *
+ * The first entry with a non-zero threshold whose NID is LNET_NID_ANY or
+ * equals \a nid matches; its threshold is decremented unless it is
+ * LNET_MD_THRESH_INF.  Entries whose threshold is (or becomes) zero are
+ * removed and freed only when \a outgoing is non-zero.
+ *
+ * \retval 1 A test-peer entry matched.
+ * \retval 0 No entry matched.
+ */
+static int
+fail_peer (lnet_nid_t nid, int outgoing)
+{
+	lnet_test_peer_t *tp;
+	struct list_head       *el;
+	struct list_head       *next;
+	struct list_head	cull;
+	int	       fail = 0;
+
+	INIT_LIST_HEAD (&cull);
+
+	/* NB: use lnet_net_lock(0) to serialize operations on test peers */
+	lnet_net_lock(0);
+
+	list_for_each_safe (el, next, &the_lnet.ln_test_peers) {
+		tp = list_entry (el, lnet_test_peer_t, tp_list);
+
+		if (tp->tp_threshold == 0) {
+			/* zombie entry */
+			if (outgoing) {
+				/* only cull zombies on outgoing tests,
+				 * since we may be at interrupt priority on
+				 * incoming messages. */
+				list_del (&tp->tp_list);
+				list_add (&tp->tp_list, &cull);
+			}
+			continue;
+		}
+
+		if (tp->tp_nid == LNET_NID_ANY || /* fail every peer */
+		    nid == tp->tp_nid) {	/* fail this peer */
+			fail = 1;
+
+			if (tp->tp_threshold != LNET_MD_THRESH_INF) {
+				tp->tp_threshold--;
+				if (outgoing &&
+				    tp->tp_threshold == 0) {
+					/* see above */
+					list_del (&tp->tp_list);
+					list_add (&tp->tp_list, &cull);
+				}
+			}
+			/* only the first matching entry is consumed */
+			break;
+		}
+	}
+
+	lnet_net_unlock(0);
+
+	/* free the culled entries outside the lock */
+	while (!list_empty (&cull)) {
+		tp = list_entry (cull.next, lnet_test_peer_t, tp_list);
+		list_del (&tp->tp_list);
+
+		LIBCFS_FREE(tp, sizeof (*tp));
+	}
+
+	return (fail);
+}
+
+/*
+ * Return the sum of iov_len over the first \a niov entries of \a iov.
+ */
+unsigned int
+lnet_iov_nob (unsigned int niov, struct iovec *iov)
+{
+	unsigned int total = 0;
+	unsigned int i;
+
+	for (i = 0; i < niov; i++)
+		total += iov[i].iov_len;
+
+	return total;
+}
+EXPORT_SYMBOL(lnet_iov_nob);
+
+/*
+ * Copy \a nob bytes from the source iovec array \a siov, starting
+ * \a soffset bytes into it, to the destination iovec array \a diov,
+ * starting \a doffset bytes into it.
+ *
+ * Returns immediately when \a nob is 0.  Running out of entries on either
+ * side before \a nob bytes are copied trips an LASSERT.
+ */
+void
+lnet_copy_iov2iov (unsigned int ndiov, struct iovec *diov, unsigned int doffset,
+		   unsigned int nsiov, struct iovec *siov, unsigned int soffset,
+		   unsigned int nob)
+{
+	/* NB diov, siov are READ-ONLY */
+	unsigned int  this_nob;
+
+	if (nob == 0)
+		return;
+
+	/* skip complete frags before 'doffset' */
+	LASSERT (ndiov > 0);
+	while (doffset >= diov->iov_len) {
+		doffset -= diov->iov_len;
+		diov++;
+		ndiov--;
+		LASSERT (ndiov > 0);
+	}
+
+	/* skip complete frags before 'soffset' */
+	LASSERT (nsiov > 0);
+	while (soffset >= siov->iov_len) {
+		soffset -= siov->iov_len;
+		siov++;
+		nsiov--;
+		LASSERT (nsiov > 0);
+	}
+
+	do {
+		LASSERT (ndiov > 0);
+		LASSERT (nsiov > 0);
+		/* copy as much as fits in both current frags, capped by
+		 * what is still wanted */
+		this_nob = MIN(diov->iov_len - doffset,
+			       siov->iov_len - soffset);
+		this_nob = MIN(this_nob, nob);
+
+		memcpy ((char *)diov->iov_base + doffset,
+			(char *)siov->iov_base + soffset, this_nob);
+		nob -= this_nob;
+
+		/* stay in the destination frag if room is left, else
+		 * move on to the start of the next one */
+		if (diov->iov_len > doffset + this_nob) {
+			doffset += this_nob;
+		} else {
+			diov++;
+			ndiov--;
+			doffset = 0;
+		}
+
+		/* same for the source frag */
+		if (siov->iov_len > soffset + this_nob) {
+			soffset += this_nob;
+		} else {
+			siov++;
+			nsiov--;
+			soffset = 0;
+		}
+	} while (nob > 0);
+}
+EXPORT_SYMBOL(lnet_copy_iov2iov);
+
+/*
+ * Build in \a dst an iovec array describing the \a len bytes of \a src
+ * that start at byte \a offset, without copying any data.
+ *
+ * \param dst_niov Number of entries available in \a dst (asserted).
+ * \param src_niov Number of entries in \a src (asserted).
+ *
+ * \return Number of entries written to \a dst; 0 when \a len is 0.
+ */
+int
+lnet_extract_iov (int dst_niov, struct iovec *dst,
+		  int src_niov, struct iovec *src,
+		  unsigned int offset, unsigned int len)
+{
+	/* Initialise 'dst' to the subset of 'src' starting at 'offset',
+	 * for exactly 'len' bytes, and return the number of entries.
+	 * NB not destructive to 'src' */
+	unsigned int    frag_len;
+	unsigned int    niov;
+
+	if (len == 0)			   /* no data => */
+		return (0);		     /* no frags */
+
+	LASSERT (src_niov > 0);
+	while (offset >= src->iov_len) {      /* skip initial frags */
+		offset -= src->iov_len;
+		src_niov--;
+		src++;
+		LASSERT (src_niov > 0);
+	}
+
+	niov = 1;
+	for (;;) {
+		LASSERT (src_niov > 0);
+		LASSERT ((int)niov <= dst_niov);
+
+		/* 'offset' is only non-zero for the first extracted frag */
+		frag_len = src->iov_len - offset;
+		dst->iov_base = ((char *)src->iov_base) + offset;
+
+		if (len <= frag_len) {
+			/* last frag: may be shorter than the source frag */
+			dst->iov_len = len;
+			return (niov);
+		}
+
+		dst->iov_len = frag_len;
+
+		len -= frag_len;
+		dst++;
+		src++;
+		niov++;
+		src_niov--;
+		offset = 0;
+	}
+}
+EXPORT_SYMBOL(lnet_extract_iov);
+
+
+/*
+ * Return the sum of kiov_len over the first \a niov entries of \a kiov.
+ */
+unsigned int
+lnet_kiov_nob (unsigned int niov, lnet_kiov_t *kiov)
+{
+	unsigned int total = 0;
+	unsigned int i;
+
+	for (i = 0; i < niov; i++)
+		total += kiov[i].kiov_len;
+
+	return total;
+}
+EXPORT_SYMBOL(lnet_kiov_nob);
+
+/*
+ * Copy \a nob bytes from the page-fragment array \a siov, starting
+ * \a soffset bytes into it, to the page-fragment array \a diov, starting
+ * \a doffset bytes into it.
+ *
+ * Pages are mapped with kmap() while they are being copied and released
+ * with kunmap() when the fragment is used up or the copy ends.  Asserts
+ * that it is not called from interrupt context.  Returns immediately when
+ * \a nob is 0.
+ */
+void
+lnet_copy_kiov2kiov (unsigned int ndiov, lnet_kiov_t *diov, unsigned int doffset,
+		     unsigned int nsiov, lnet_kiov_t *siov, unsigned int soffset,
+		     unsigned int nob)
+{
+	/* NB diov, siov are READ-ONLY */
+	unsigned int    this_nob;
+	char	   *daddr = NULL;
+	char	   *saddr = NULL;
+
+	if (nob == 0)
+		return;
+
+	LASSERT (!in_interrupt ());
+
+	/* skip complete frags before 'doffset' */
+	LASSERT (ndiov > 0);
+	while (doffset >= diov->kiov_len) {
+		doffset -= diov->kiov_len;
+		diov++;
+		ndiov--;
+		LASSERT (ndiov > 0);
+	}
+
+	/* skip complete frags before 'soffset' */
+	LASSERT (nsiov > 0);
+	while (soffset >= siov->kiov_len) {
+		soffset -= siov->kiov_len;
+		siov++;
+		nsiov--;
+		LASSERT (nsiov > 0);
+	}
+
+	do {
+		LASSERT (ndiov > 0);
+		LASSERT (nsiov > 0);
+		this_nob = MIN(diov->kiov_len - doffset,
+			       siov->kiov_len - soffset);
+		this_nob = MIN(this_nob, nob);
+
+		/* a NULL address means the current page of that side is
+		 * not mapped yet */
+		if (daddr == NULL)
+			daddr = ((char *)kmap(diov->kiov_page)) +
+				diov->kiov_offset + doffset;
+		if (saddr == NULL)
+			saddr = ((char *)kmap(siov->kiov_page)) +
+				siov->kiov_offset + soffset;
+
+		/* Vanishing risk of kmap deadlock when mapping 2 pages.
+		 * However in practice at least one of the kiovs will be mapped
+		 * kernel pages and the map/unmap will be NOOPs */
+
+		memcpy (daddr, saddr, this_nob);
+		nob -= this_nob;
+
+		/* keep the mapping while the frag has room left, otherwise
+		 * unmap it and advance to the next frag */
+		if (diov->kiov_len > doffset + this_nob) {
+			daddr += this_nob;
+			doffset += this_nob;
+		} else {
+			kunmap(diov->kiov_page);
+			daddr = NULL;
+			diov++;
+			ndiov--;
+			doffset = 0;
+		}
+
+		if (siov->kiov_len > soffset + this_nob) {
+			saddr += this_nob;
+			soffset += this_nob;
+		} else {
+			kunmap(siov->kiov_page);
+			saddr = NULL;
+			siov++;
+			nsiov--;
+			soffset = 0;
+		}
+	} while (nob > 0);
+
+	/* release a mapping that is still held for a partly used frag */
+	if (daddr != NULL)
+		kunmap(diov->kiov_page);
+	if (saddr != NULL)
+		kunmap(siov->kiov_page);
+}
+EXPORT_SYMBOL(lnet_copy_kiov2kiov);
+
+/*
+ * Copy \a nob bytes from the page-fragment array \a kiov, starting
+ * \a kiovoffset bytes into it, to the iovec array \a iov, starting
+ * \a iovoffset bytes into it.
+ *
+ * Source pages are mapped with kmap() while in use.  Asserts that it is not
+ * called from interrupt context.  Returns immediately when \a nob is 0.
+ */
+void
+lnet_copy_kiov2iov (unsigned int niov, struct iovec *iov, unsigned int iovoffset,
+		    unsigned int nkiov, lnet_kiov_t *kiov, unsigned int kiovoffset,
+		    unsigned int nob)
+{
+	/* NB iov, kiov are READ-ONLY */
+	unsigned int    this_nob;
+	char	   *addr = NULL;
+
+	if (nob == 0)
+		return;
+
+	LASSERT (!in_interrupt ());
+
+	/* skip complete frags before 'iovoffset' */
+	LASSERT (niov > 0);
+	while (iovoffset >= iov->iov_len) {
+		iovoffset -= iov->iov_len;
+		iov++;
+		niov--;
+		LASSERT (niov > 0);
+	}
+
+	/* skip complete frags before 'kiovoffset' */
+	LASSERT (nkiov > 0);
+	while (kiovoffset >= kiov->kiov_len) {
+		kiovoffset -= kiov->kiov_len;
+		kiov++;
+		nkiov--;
+		LASSERT (nkiov > 0);
+	}
+
+	do {
+		LASSERT (niov > 0);
+		LASSERT (nkiov > 0);
+		this_nob = MIN(iov->iov_len - iovoffset,
+			       kiov->kiov_len - kiovoffset);
+		this_nob = MIN(this_nob, nob);
+
+		/* map the current source page if it is not mapped yet */
+		if (addr == NULL)
+			addr = ((char *)kmap(kiov->kiov_page)) +
+				kiov->kiov_offset + kiovoffset;
+
+		memcpy ((char *)iov->iov_base + iovoffset, addr, this_nob);
+		nob -= this_nob;
+
+		if (iov->iov_len > iovoffset + this_nob) {
+			iovoffset += this_nob;
+		} else {
+			iov++;
+			niov--;
+			iovoffset = 0;
+		}
+
+		/* keep the mapping while the source frag has data left,
+		 * otherwise unmap it and advance */
+		if (kiov->kiov_len > kiovoffset + this_nob) {
+			addr += this_nob;
+			kiovoffset += this_nob;
+		} else {
+			kunmap(kiov->kiov_page);
+			addr = NULL;
+			kiov++;
+			nkiov--;
+			kiovoffset = 0;
+		}
+
+	} while (nob > 0);
+
+	/* release a mapping that is still held for a partly used frag */
+	if (addr != NULL)
+		kunmap(kiov->kiov_page);
+}
+EXPORT_SYMBOL(lnet_copy_kiov2iov);
+
+/*
+ * Copy \a nob bytes from the iovec array \a iov, starting \a iovoffset
+ * bytes into it, to the page-fragment array \a kiov, starting
+ * \a kiovoffset bytes into it.
+ *
+ * Destination pages are mapped with kmap() while in use.  Asserts that it
+ * is not called from interrupt context.  Returns immediately when \a nob
+ * is 0.
+ */
+void
+lnet_copy_iov2kiov (unsigned int nkiov, lnet_kiov_t *kiov, unsigned int kiovoffset,
+		    unsigned int niov, struct iovec *iov, unsigned int iovoffset,
+		    unsigned int nob)
+{
+	/* NB kiov, iov are READ-ONLY */
+	unsigned int    this_nob;
+	char	   *addr = NULL;
+
+	if (nob == 0)
+		return;
+
+	LASSERT (!in_interrupt ());
+
+	/* skip complete frags before 'kiovoffset' */
+	LASSERT (nkiov > 0);
+	while (kiovoffset >= kiov->kiov_len) {
+		kiovoffset -= kiov->kiov_len;
+		kiov++;
+		nkiov--;
+		LASSERT (nkiov > 0);
+	}
+
+	/* skip complete frags before 'iovoffset' */
+	LASSERT (niov > 0);
+	while (iovoffset >= iov->iov_len) {
+		iovoffset -= iov->iov_len;
+		iov++;
+		niov--;
+		LASSERT (niov > 0);
+	}
+
+	do {
+		LASSERT (nkiov > 0);
+		LASSERT (niov > 0);
+		this_nob = MIN(kiov->kiov_len - kiovoffset,
+			       iov->iov_len - iovoffset);
+		this_nob = MIN(this_nob, nob);
+
+		/* map the current destination page if it is not mapped yet */
+		if (addr == NULL)
+			addr = ((char *)kmap(kiov->kiov_page)) +
+				kiov->kiov_offset + kiovoffset;
+
+		memcpy (addr, (char *)iov->iov_base + iovoffset, this_nob);
+		nob -= this_nob;
+
+		/* keep the mapping while the destination frag has room
+		 * left, otherwise unmap it and advance */
+		if (kiov->kiov_len > kiovoffset + this_nob) {
+			addr += this_nob;
+			kiovoffset += this_nob;
+		} else {
+			kunmap(kiov->kiov_page);
+			addr = NULL;
+			kiov++;
+			nkiov--;
+			kiovoffset = 0;
+		}
+
+		if (iov->iov_len > iovoffset + this_nob) {
+			iovoffset += this_nob;
+		} else {
+			iov++;
+			niov--;
+			iovoffset = 0;
+		}
+	} while (nob > 0);
+
+	/* release a mapping that is still held for a partly used frag */
+	if (addr != NULL)
+		kunmap(kiov->kiov_page);
+}
+EXPORT_SYMBOL(lnet_copy_iov2kiov);
+
+/*
+ * Build in \a dst a page-fragment array describing the \a len bytes of
+ * \a src that start at byte \a offset, without copying any data.
+ *
+ * \param dst_niov Number of entries available in \a dst (asserted).
+ * \param src_niov Number of entries in \a src (asserted).
+ *
+ * \return Number of entries written to \a dst; 0 when \a len is 0.
+ */
+int
+lnet_extract_kiov (int dst_niov, lnet_kiov_t *dst,
+		   int src_niov, lnet_kiov_t *src,
+		   unsigned int offset, unsigned int len)
+{
+	/* Initialise 'dst' to the subset of 'src' starting at 'offset',
+	 * for exactly 'len' bytes, and return the number of entries.
+	 * NB not destructive to 'src' */
+	unsigned int    frag_len;
+	unsigned int    niov;
+
+	if (len == 0)			   /* no data => */
+		return (0);		     /* no frags */
+
+	LASSERT (src_niov > 0);
+	while (offset >= src->kiov_len) {      /* skip initial frags */
+		offset -= src->kiov_len;
+		src_niov--;
+		src++;
+		LASSERT (src_niov > 0);
+	}
+
+	niov = 1;
+	for (;;) {
+		LASSERT (src_niov > 0);
+		LASSERT ((int)niov <= dst_niov);
+
+		/* 'offset' is only non-zero for the first extracted frag */
+		frag_len = src->kiov_len - offset;
+		dst->kiov_page = src->kiov_page;
+		dst->kiov_offset = src->kiov_offset + offset;
+
+		if (len <= frag_len) {
+			/* last frag: may be shorter than the source frag */
+			dst->kiov_len = len;
+			LASSERT (dst->kiov_offset + dst->kiov_len <= PAGE_CACHE_SIZE);
+			return (niov);
+		}
+
+		dst->kiov_len = frag_len;
+		/* an extracted frag must not extend past PAGE_CACHE_SIZE */
+		LASSERT (dst->kiov_offset + dst->kiov_len <= PAGE_CACHE_SIZE);
+
+		len -= frag_len;
+		dst++;
+		src++;
+		niov++;
+		src_niov--;
+		offset = 0;
+	}
+}
+EXPORT_SYMBOL(lnet_extract_kiov);
+
+/*
+ * Pass a receive to the lnd_recv method of \a ni's LND.
+ *
+ * \a msg may be NULL only when \a mlen is 0 (asserted).  When \a msg is
+ * given, its receive state is checked against \a offset, \a mlen and
+ * \a rlen, msg_receiving is cleared, and for a non-zero \a mlen the
+ * message's fragment array (exactly one of iov / kiov) is handed to the
+ * LND.  A negative return from lnd_recv is forwarded to lnet_finalize().
+ *
+ * Asserts that it is not called from interrupt context.
+ */
+void
+lnet_ni_recv(lnet_ni_t *ni, void *private, lnet_msg_t *msg, int delayed,
+	     unsigned int offset, unsigned int mlen, unsigned int rlen)
+{
+	unsigned int  niov = 0;
+	struct iovec *iov = NULL;
+	lnet_kiov_t  *kiov = NULL;
+	int	   rc;
+
+	LASSERT (!in_interrupt ());
+	LASSERT (mlen == 0 || msg != NULL);
+
+	if (msg != NULL) {
+		LASSERT(msg->msg_receiving);
+		LASSERT(!msg->msg_sending);
+		LASSERT(rlen == msg->msg_len);
+		LASSERT(mlen <= msg->msg_len);
+		LASSERT(msg->msg_offset == offset);
+		LASSERT(msg->msg_wanted == mlen);
+
+		msg->msg_receiving = 0;
+
+		if (mlen != 0) {
+			niov = msg->msg_niov;
+			iov  = msg->msg_iov;
+			kiov = msg->msg_kiov;
+
+			LASSERT (niov > 0);
+			/* exactly one of iov and kiov must be set */
+			LASSERT ((iov == NULL) != (kiov == NULL));
+		}
+	}
+
+	rc = (ni->ni_lnd->lnd_recv)(ni, private, msg, delayed,
+				    niov, iov, kiov, offset, mlen, rlen);
+	if (rc < 0)
+		lnet_finalize(ni, msg, rc);
+}
+
+/*
+ * Point \a msg at the fragment array of the MD it is bound to.
+ *
+ * The message must carry a payload, must not be a routed message, must
+ * have an MD and must not have a fragment array yet (all asserted).
+ */
+void
+lnet_setpayloadbuffer(lnet_msg_t *msg)
+{
+	lnet_libmd_t *md = msg->msg_md;
+
+	LASSERT (msg->msg_len > 0);
+	LASSERT (!msg->msg_routing);
+	LASSERT (md != NULL);
+	LASSERT (msg->msg_niov == 0);
+	LASSERT (msg->msg_iov == NULL);
+	LASSERT (msg->msg_kiov == NULL);
+
+	/* the MD stores either page fragments or a plain iovec array */
+	if ((md->md_options & LNET_MD_KIOV) == 0)
+		msg->msg_iov = md->md_iov.iov;
+	else
+		msg->msg_kiov = md->md_iov.kiov;
+
+	msg->msg_niov = md->md_niov;
+}
+
+/*
+ * Initialise \a msg for sending a message of \a type to \a target: record
+ * type, target, length and offset in the message, attach the MD's payload
+ * buffer when \a len is non-zero, and fill a zeroed wire header with the
+ * little-endian type, destination, source pid and payload length.
+ */
+void
+lnet_prep_send(lnet_msg_t *msg, int type, lnet_process_id_t target,
+	       unsigned int offset, unsigned int len)
+{
+	msg->msg_offset = offset;
+	msg->msg_len = len;
+	msg->msg_target = target;
+	msg->msg_type = type;
+
+	if (len != 0)
+		lnet_setpayloadbuffer(msg);
+
+	memset (&msg->msg_hdr, 0, sizeof (msg->msg_hdr));
+
+	/* src_nid will be set later */
+	msg->msg_hdr.src_pid = cpu_to_le32(the_lnet.ln_pid);
+	msg->msg_hdr.dest_pid = cpu_to_le32(target.pid);
+	msg->msg_hdr.dest_nid = cpu_to_le64(target.nid);
+	msg->msg_hdr.payload_length = cpu_to_le32(len);
+	msg->msg_hdr.type = cpu_to_le32(type);
+}
+
+/*
+ * Hand \a msg to the lnd_send method of \a ni's LND, passing the message's
+ * msg_private pointer; a negative return is forwarded to lnet_finalize().
+ *
+ * Asserts that it is not called from interrupt context and that, unless
+ * \a ni is on a LOLND network, both tx credit flags of \a msg are set.
+ */
+void
+lnet_ni_send(lnet_ni_t *ni, lnet_msg_t *msg)
+{
+	int	status;
+
+	LASSERT (!in_interrupt ());
+	LASSERT (LNET_NETTYP(LNET_NIDNET(ni->ni_nid)) == LOLND ||
+		 (msg->msg_txcredit && msg->msg_peertxcredit));
+
+	status = (ni->ni_lnd->lnd_send)(ni, msg->msg_private, msg);
+	if (status >= 0)
+		return;
+
+	lnet_finalize(ni, msg, status);
+}
+
+/*
+ * Call the lnd_eager_recv method of \a ni's LND for the message \a msg that
+ * is being received, and mark the message with msg_rx_ready_delay.
+ *
+ * The LND is given msg_private and the address of msg_private as its last
+ * argument.
+ *
+ * \return The LND's return value: 0, or a negative value on failure (a
+ * positive value trips the LASSERT).
+ */
+int
+lnet_ni_eager_recv(lnet_ni_t *ni, lnet_msg_t *msg)
+{
+	int	rc;
+
+	LASSERT(!msg->msg_sending);
+	LASSERT(msg->msg_receiving);
+	LASSERT(!msg->msg_rx_ready_delay);
+	LASSERT(ni->ni_lnd->lnd_eager_recv != NULL);
+
+	/* set before the call and left set even if the LND fails */
+	msg->msg_rx_ready_delay = 1;
+	rc = (ni->ni_lnd->lnd_eager_recv)(ni, msg->msg_private, msg,
+					  &msg->msg_private);
+	if (rc != 0) {
+		CERROR("recv from %s / send to %s aborted: "
+		       "eager_recv failed %d\n",
+		       libcfs_nid2str(msg->msg_rxpeer->lp_nid),
+		       libcfs_id2str(msg->msg_target), rc);
+		LASSERT(rc < 0); /* required by my callers */
+	}
+
+	return rc;
+}
+
+/*
+ * Ask the LND of \a ni (via lnd_query) when peer \a lp was last alive and
+ * record the answer in \a lp.
+ *
+ * Called with lnet_net_lock(lp->lp_cpt) held; the lock is dropped around
+ * the LND call and re-taken afterwards.
+ *
+ * NB: caller shall hold a ref on 'lp' as I'd drop lnet_net_lock
+ */
+void
+lnet_ni_query_locked(lnet_ni_t *ni, lnet_peer_t *lp)
+{
+	cfs_time_t last_alive = 0;
+
+	LASSERT(lnet_peer_aliveness_enabled(lp));
+	LASSERT(ni->ni_lnd->lnd_query != NULL);
+
+	lnet_net_unlock(lp->lp_cpt);
+	(ni->ni_lnd->lnd_query)(ni, lp->lp_nid, &last_alive);
+	lnet_net_lock(lp->lp_cpt);
+
+	/* the query time is recorded whether or not the LND had news */
+	lp->lp_last_query = cfs_time_current();
+
+	/* last_alive stays 0 when the LND did not write a timestamp */
+	if (last_alive != 0) /* NI has updated timestamp */
+		lp->lp_last_alive = last_alive;
+}
+
+/*
+ * Decide from the timestamps cached in \a lp whether the peer counts as
+ * alive at time \a now.
+ *
+ * \return 0 when a notification newer than (or as new as) lp_last_alive
+ * says the peer is down; otherwise non-zero iff
+ * lp_last_alive + ni_peertimeout seconds is after \a now.
+ *
+ * NB: always called with lnet_net_lock held
+ */
+static inline int
+lnet_peer_is_alive (lnet_peer_t *lp, cfs_time_t now)
+{
+	int	alive;
+	cfs_time_t deadline;
+
+	LASSERT (lnet_peer_aliveness_enabled(lp));
+
+	/* Trust lnet_notify() if it has more recent aliveness news, but
+	 * ignore the initial assumed death (see lnet_peers_start_down()).
+	 */
+	if (!lp->lp_alive && lp->lp_alive_count > 0 &&
+	    cfs_time_aftereq(lp->lp_timestamp, lp->lp_last_alive))
+		return 0;
+
+	/* alive while 'now' has not yet reached last_alive + peer timeout */
+	deadline = cfs_time_add(lp->lp_last_alive,
+				cfs_time_seconds(lp->lp_ni->ni_peertimeout));
+	alive = cfs_time_after(deadline, now);
+
+	/* Update obsolete lp_alive except for routers assumed to be dead
+	 * initially, because router checker would update aliveness in this
+	 * case, and moreover lp_last_alive at peer creation is assumed.
+	 */
+	if (alive && !lp->lp_alive &&
+	    !(lnet_isrouter(lp) && lp->lp_alive_count == 0))
+		lnet_notify_locked(lp, 0, 1, lp->lp_last_alive);
+
+	return alive;
+}
+
+
+/*
+ * Check whether peer @lp is alive, querying its NI when the cached state
+ * says it is dead and the last query is old enough.
+ * NB: returns 1 when alive, 0 when dead, negative when error;
+ *     may drop the lnet_net_lock
+ */
+int
+lnet_peer_alive_locked (lnet_peer_t *lp)
+{
+	/* minimum spacing, in seconds, between NI queries for one peer */
+	static const int	lnet_queryinterval = 1;
+	cfs_time_t		now = cfs_time_current();
+
+	if (!lnet_peer_aliveness_enabled(lp))
+		return -ENODEV;
+
+	if (lnet_peer_is_alive(lp, now))
+		return 1;
+
+	/* The peer looks dead.  Skip the NI query if one was already made
+	 * less than lnet_queryinterval seconds ago. */
+	if (lp->lp_last_query != 0) {
+		cfs_time_t next_query;
+
+		next_query = cfs_time_add(lp->lp_last_query,
+					  cfs_time_seconds(lnet_queryinterval));
+		if (cfs_time_before(now, next_query)) {
+			if (lp->lp_alive)
+				CWARN("Unexpected aliveness of peer %s: "
+				      "%d < %d (%d/%d)\n",
+				      libcfs_nid2str(lp->lp_nid),
+				      (int)now, (int)next_query,
+				      lnet_queryinterval,
+				      lp->lp_ni->ni_peertimeout);
+			return 0;
+		}
+	}
+
+	/* fetch the latest aliveness news from the NI, then look again */
+	lnet_ni_query_locked(lp->lp_ni, lp);
+
+	if (!lnet_peer_is_alive(lp, now)) {
+		lnet_notify_locked(lp, 0, 0, lp->lp_last_alive);
+		return 0;
+	}
+
+	return 1;
+}
+/*
+ * Charge @msg against the tx credits of its next-hop peer (msg_txpeer) and
+ * of the NI's tx queue for msg_tx_cpt, queueing the message when either
+ * runs out.  With @do_send set, a message holding both credits is passed to
+ * lnet_ni_send().  The body unlocks and re-locks
+ * lnet_net_lock(msg->msg_tx_cpt), so callers are expected to hold it.
+ */
+int
+lnet_post_send_locked(lnet_msg_t *msg, int do_send)
+{
+	/* lnet_send is going to lnet_net_unlock immediately after this,
+	 * so it sets do_send FALSE and I don't do the unlock/send/lock bit.
+	 * I return EAGAIN if msg blocked, EHOSTUNREACH if msg_txpeer
+	 * appears dead, and 0 if sent or OK to send.
+	 * NB these are positive values, not negative errnos. */
+	struct lnet_peer	*lp = msg->msg_txpeer;
+	struct lnet_ni		*ni = lp->lp_ni;
+	struct lnet_tx_queue	*tq;
+	int			cpt;
+
+	/* non-lnet_send() callers have checked before */
+	LASSERT(!do_send || msg->msg_tx_delayed);
+	LASSERT(!msg->msg_receiving);
+	LASSERT(msg->msg_tx_committed);
+
+	cpt = msg->msg_tx_cpt;
+	tq = ni->ni_tx_queues[cpt];
+
+	/* NB 'lp' is always the next hop.  The aliveness check is skipped
+	 * when the target pid has LNET_PID_USERFLAG set.  A dead peer is
+	 * counted as a drop; the msg is only finalized here when do_send is
+	 * set, otherwise the caller gets EHOSTUNREACH and deals with it. */
+	if ((msg->msg_target.pid & LNET_PID_USERFLAG) == 0 &&
+	    lnet_peer_alive_locked(lp) == 0) {
+		the_lnet.ln_counters[cpt]->drop_count++;
+		the_lnet.ln_counters[cpt]->drop_length += msg->msg_len;
+		lnet_net_unlock(cpt);
+
+		CNETERR("Dropping message for %s: peer not alive\n",
+			libcfs_id2str(msg->msg_target));
+		if (do_send)
+			lnet_finalize(ni, msg, -EHOSTUNREACH);
+
+		lnet_net_lock(cpt);
+		return EHOSTUNREACH;
+	}
+	/* Peer credit first: a negative lp_txcredits count and a non-empty
+	 * lp_txq always go together (asserted below). */
+	if (!msg->msg_peertxcredit) {
+		LASSERT ((lp->lp_txcredits < 0) ==
+			 !list_empty(&lp->lp_txq));
+
+		msg->msg_peertxcredit = 1;
+		lp->lp_txqnob += msg->msg_len + sizeof(lnet_hdr_t);
+		lp->lp_txcredits--;
+
+		if (lp->lp_txcredits < lp->lp_mintxcredits)
+			lp->lp_mintxcredits = lp->lp_txcredits;
+
+		if (lp->lp_txcredits < 0) {
+			msg->msg_tx_delayed = 1;
+			list_add_tail(&msg->msg_list, &lp->lp_txq);
+			return EAGAIN;
+		}
+	}
+	/* Then a credit from the NI's tx queue for this CPT, with the same
+	 * negative-count/non-empty-list pairing on tq_delayed. */
+	if (!msg->msg_txcredit) {
+		LASSERT((tq->tq_credits < 0) ==
+			!list_empty(&tq->tq_delayed));
+
+		msg->msg_txcredit = 1;
+		tq->tq_credits--;
+
+		if (tq->tq_credits < tq->tq_credits_min)
+			tq->tq_credits_min = tq->tq_credits;
+
+		if (tq->tq_credits < 0) {
+			msg->msg_tx_delayed = 1;
+			list_add_tail(&msg->msg_list, &tq->tq_delayed);
+			return EAGAIN;
+		}
+	}
+
+	if (do_send) {
+		lnet_net_unlock(cpt);
+		lnet_ni_send(ni, msg);
+		lnet_net_lock(cpt);
+	}
+	return 0;
+}
+
+
+/*
+ * Return the first router buffer pool of the message's receive CPT whose
+ * buffers (rbp_npages pages each) can hold msg->msg_len bytes.
+ */
+lnet_rtrbufpool_t *
+lnet_msg2bufpool(lnet_msg_t *msg)
+{
+	lnet_rtrbufpool_t	*pool;
+	int			cpt;
+
+	LASSERT(msg->msg_rx_committed);
+	LASSERT(msg->msg_len <= LNET_MTU);
+
+	cpt = msg->msg_rx_cpt;
+	pool = &the_lnet.ln_rtrpools[cpt][0];
+
+	for (;;) {
+		if (msg->msg_len <=
+		    (unsigned int)pool->rbp_npages * PAGE_CACHE_SIZE)
+			break;
+
+		/* too small: try the next pool, which must exist */
+		pool++;
+		LASSERT(pool < &the_lnet.ln_rtrpools[cpt][LNET_NRBPOOLS]);
+	}
+
+	return pool;
+}
+/*
+ * Charge a message that is being routed against the router credits of the
+ * peer it came from and of the matching router buffer pool, then attach a
+ * router buffer to it.  The message is queued when either credit count
+ * goes negative.  With @do_recv set the payload is received right away,
+ * with lnet_net_lock(msg->msg_rx_cpt) dropped around the receive.
+ */
+int
+lnet_post_routed_recv_locked (lnet_msg_t *msg, int do_recv)
+{
+	/* lnet_parse is going to lnet_net_unlock immediately after this, so it
+	 * sets do_recv FALSE and I don't do the unlock/recv/lock bit.  I
+	 * return EAGAIN if msg blocked and 0 if received or OK to receive.
+	 * NB EAGAIN is returned as a positive value. */
+	lnet_peer_t	 *lp = msg->msg_rxpeer;
+	lnet_rtrbufpool_t   *rbp;
+	lnet_rtrbuf_t       *rb;
+
+	LASSERT (msg->msg_iov == NULL);
+	LASSERT (msg->msg_kiov == NULL);
+	LASSERT (msg->msg_niov == 0);
+	LASSERT (msg->msg_routing);
+	LASSERT (msg->msg_receiving);
+	LASSERT (!msg->msg_sending);
+
+	/* non-lnet_parse callers only receive delayed messages */
+	LASSERT(!do_recv || msg->msg_rx_delayed);
+	/* per-peer router credit: negative count <=> msgs queued on lp_rtrq */
+	if (!msg->msg_peerrtrcredit) {
+		LASSERT ((lp->lp_rtrcredits < 0) ==
+			 !list_empty(&lp->lp_rtrq));
+
+		msg->msg_peerrtrcredit = 1;
+		lp->lp_rtrcredits--;
+		if (lp->lp_rtrcredits < lp->lp_minrtrcredits)
+			lp->lp_minrtrcredits = lp->lp_rtrcredits;
+
+		if (lp->lp_rtrcredits < 0) {
+			/* must have checked eager_recv before here */
+			LASSERT(msg->msg_rx_ready_delay);
+			msg->msg_rx_delayed = 1;
+			list_add_tail(&msg->msg_list, &lp->lp_rtrq);
+			return EAGAIN;
+		}
+	}
+
+	rbp = lnet_msg2bufpool(msg);
+	/* buffer pool credit: negative count <=> msgs queued on rbp_msgs */
+	if (!msg->msg_rtrcredit) {
+		LASSERT ((rbp->rbp_credits < 0) ==
+			 !list_empty(&rbp->rbp_msgs));
+
+		msg->msg_rtrcredit = 1;
+		rbp->rbp_credits--;
+		if (rbp->rbp_credits < rbp->rbp_mincredits)
+			rbp->rbp_mincredits = rbp->rbp_credits;
+
+		if (rbp->rbp_credits < 0) {
+			/* must have checked eager_recv before here */
+			LASSERT(msg->msg_rx_ready_delay);
+			msg->msg_rx_delayed = 1;
+			list_add_tail(&msg->msg_list, &rbp->rbp_msgs);
+			return EAGAIN;
+		}
+	}
+	/* both credits held: take the first free buffer off the pool */
+	LASSERT (!list_empty(&rbp->rbp_bufs));
+	rb = list_entry(rbp->rbp_bufs.next, lnet_rtrbuf_t, rb_list);
+	list_del(&rb->rb_list);
+
+	msg->msg_niov = rbp->rbp_npages;
+	msg->msg_kiov = &rb->rb_kiov[0];
+
+	if (do_recv) {
+		int cpt = msg->msg_rx_cpt;
+
+		lnet_net_unlock(cpt);
+		lnet_ni_recv(lp->lp_ni, msg->msg_private, msg, 1,
+			     0, msg->msg_len, msg->msg_len);
+		lnet_net_lock(cpt);
+	}
+	return 0;
+}
+/*
+ * Give back whatever tx credits @msg holds (NI tx queue credit and/or peer
+ * credit) and drop the message's reference on msg_txpeer.  When a returned
+ * credit leaves the count at or below zero, the message at the head of the
+ * corresponding delayed list is dequeued and re-posted with
+ * lnet_post_send_locked(msg2, 1), which may drop and re-take the net lock.
+ */
+void
+lnet_return_tx_credits_locked(lnet_msg_t *msg)
+{
+	lnet_peer_t	*txpeer = msg->msg_txpeer;
+	lnet_msg_t	*msg2;
+
+	if (msg->msg_txcredit) {
+		struct lnet_ni	     *ni = txpeer->lp_ni;
+		struct lnet_tx_queue *tq = ni->ni_tx_queues[msg->msg_tx_cpt];
+
+		/* give back NI txcredits */
+		msg->msg_txcredit = 0;
+
+		LASSERT((tq->tq_credits < 0) ==
+			!list_empty(&tq->tq_delayed));
+
+		tq->tq_credits++;
+		if (tq->tq_credits <= 0) {
+			msg2 = list_entry(tq->tq_delayed.next,
+					      lnet_msg_t, msg_list);
+			list_del(&msg2->msg_list);
+
+			LASSERT(msg2->msg_txpeer->lp_ni == ni);
+			LASSERT(msg2->msg_tx_delayed);
+
+			(void) lnet_post_send_locked(msg2, 1);
+		}
+	}
+
+	if (msg->msg_peertxcredit) {
+		/* give back peer txcredits, and take the bytes charged by
+		 * lnet_post_send_locked() back off lp_txqnob */
+		msg->msg_peertxcredit = 0;
+
+		LASSERT((txpeer->lp_txcredits < 0) ==
+			!list_empty(&txpeer->lp_txq));
+
+		txpeer->lp_txqnob -= msg->msg_len + sizeof(lnet_hdr_t);
+		LASSERT (txpeer->lp_txqnob >= 0);
+
+		txpeer->lp_txcredits++;
+		if (txpeer->lp_txcredits <= 0) {
+			msg2 = list_entry(txpeer->lp_txq.next,
+					      lnet_msg_t, msg_list);
+			list_del(&msg2->msg_list);
+
+			LASSERT(msg2->msg_txpeer == txpeer);
+			LASSERT(msg2->msg_tx_delayed);
+
+			(void) lnet_post_send_locked(msg2, 1);
+		}
+	}
+
+	if (txpeer != NULL) {
+		msg->msg_txpeer = NULL;
+		lnet_peer_decref_locked(txpeer);
+	}
+}
+/*
+ * Give back whatever router credits @msg holds (buffer pool credit, along
+ * with the router buffer itself, and/or peer router credit) and drop the
+ * message's reference on msg_rxpeer.  When a returned credit leaves the
+ * count at or below zero, the message at the head of the corresponding
+ * wait list is dequeued and re-posted with
+ * lnet_post_routed_recv_locked(msg2, 1).
+ */
+void
+lnet_return_rx_credits_locked(lnet_msg_t *msg)
+{
+	lnet_peer_t	*rxpeer = msg->msg_rxpeer;
+	lnet_msg_t	*msg2;
+
+	if (msg->msg_rtrcredit) {
+		/* give back global router credits */
+		lnet_rtrbuf_t     *rb;
+		lnet_rtrbufpool_t *rbp;
+
+		/* NB If a msg ever blocks for a buffer in rbp_msgs, it stays
+		 * there until it gets one allocated, or aborts the wait
+		 * itself */
+		LASSERT (msg->msg_kiov != NULL);
+		/* msg_kiov was set to &rb->rb_kiov[0] in
+		 * lnet_post_routed_recv_locked(); map it back to its buffer */
+		rb = list_entry(msg->msg_kiov, lnet_rtrbuf_t, rb_kiov[0]);
+		rbp = rb->rb_pool;
+		LASSERT (rbp == lnet_msg2bufpool(msg));
+
+		msg->msg_kiov = NULL;
+		msg->msg_rtrcredit = 0;
+
+		LASSERT((rbp->rbp_credits < 0) ==
+			!list_empty(&rbp->rbp_msgs));
+		LASSERT((rbp->rbp_credits > 0) ==
+			!list_empty(&rbp->rbp_bufs));
+
+		list_add(&rb->rb_list, &rbp->rbp_bufs);
+		rbp->rbp_credits++;
+		if (rbp->rbp_credits <= 0) {
+			msg2 = list_entry(rbp->rbp_msgs.next,
+					      lnet_msg_t, msg_list);
+			list_del(&msg2->msg_list);
+
+			(void) lnet_post_routed_recv_locked(msg2, 1);
+		}
+	}
+
+	if (msg->msg_peerrtrcredit) {
+		/* give back peer router credits */
+		msg->msg_peerrtrcredit = 0;
+
+		LASSERT((rxpeer->lp_rtrcredits < 0) ==
+			!list_empty(&rxpeer->lp_rtrq));
+
+		rxpeer->lp_rtrcredits++;
+		if (rxpeer->lp_rtrcredits <= 0) {
+			msg2 = list_entry(rxpeer->lp_rtrq.next,
+					      lnet_msg_t, msg_list);
+			list_del(&msg2->msg_list);
+
+			(void) lnet_post_routed_recv_locked(msg2, 1);
+		}
+	}
+	if (rxpeer != NULL) {
+		msg->msg_rxpeer = NULL;
+		lnet_peer_decref_locked(rxpeer);
+	}
+}
+
+/*
+ * Rank route @r1 against route @r2: returns 1 when @r1 should be preferred
+ * and -1 when it should not.  Criteria, in order: fewer hops, fewer bytes
+ * queued on the gateway (lp_txqnob), more gateway tx credits, and finally
+ * the routes' sequence numbers.
+ */
+static int
+lnet_compare_routes(lnet_route_t *r1, lnet_route_t *r2)
+{
+	lnet_peer_t *gw1 = r1->lr_gateway;
+	lnet_peer_t *gw2 = r2->lr_gateway;
+
+	if (r1->lr_hops != r2->lr_hops)
+		return (r1->lr_hops < r2->lr_hops) ? 1 : -1;
+
+	if (gw1->lp_txqnob != gw2->lp_txqnob)
+		return (gw1->lp_txqnob < gw2->lp_txqnob) ? 1 : -1;
+
+	if (gw1->lp_txcredits != gw2->lp_txcredits)
+		return (gw1->lp_txcredits > gw2->lp_txcredits) ? 1 : -1;
+
+	/* all else equal: decide on the sequence number difference */
+	return (r1->lr_seq - r2->lr_seq <= 0) ? 1 : -1;
+}
+/*
+ * Choose the gateway peer to use for @target from the routes of its remote
+ * network.  When @ni is non-NULL only gateways reached through that NI are
+ * considered.  Returns NULL when the network is unknown or no usable
+ * gateway is found.  No reference is taken on the returned peer here.
+ */
+static lnet_peer_t *
+lnet_find_route_locked(lnet_ni_t *ni, lnet_nid_t target, lnet_nid_t rtr_nid)
+{
+	lnet_remotenet_t	*rnet;
+	lnet_route_t		*rtr;
+	lnet_route_t		*rtr_best;
+	lnet_route_t		*rtr_last;
+	struct lnet_peer	*lp_best;
+	struct lnet_peer	*lp;
+	int			rc;
+
+	/* If @rtr_nid is not LNET_NID_ANY, return the gateway with
+	 * rtr_nid nid, otherwise find the best gateway I can use */
+
+	rnet = lnet_find_net_locked(LNET_NIDNET(target));
+	if (rnet == NULL)
+		return NULL;
+
+	lp_best = NULL;
+	rtr_best = rtr_last = NULL;
+	list_for_each_entry(rtr, &rnet->lrn_routes, lr_list) {
+		lp = rtr->lr_gateway;
+
+		if (!lp->lp_alive || /* gateway is down */
+		    ((lp->lp_ping_feats & LNET_PING_FEAT_NI_STATUS) != 0 &&
+		     rtr->lr_downis != 0)) /* NI to target is down */
+			continue;
+
+		if (ni != NULL && lp->lp_ni != ni)
+			continue;
+
+		if (lp->lp_nid == rtr_nid) /* it's pre-determined router */
+			return lp;
+
+		if (lp_best == NULL) {
+			rtr_best = rtr_last = rtr;
+			lp_best = lp;
+			continue;
+		}
+
+		/* no protection on below fields, but it's harmless;
+		 * rtr_last tracks the route with the highest lr_seq seen */
+		if (rtr_last->lr_seq - rtr->lr_seq < 0)
+			rtr_last = rtr;
+
+		rc = lnet_compare_routes(rtr, rtr_best);
+		if (rc < 0)
+			continue;
+
+		rtr_best = rtr;
+		lp_best = lp;
+	}
+
+	/* set sequence number on the best router to the latest sequence + 1
+	 * so we can round-robin all routers; it's racy and inaccurate but
+	 * harmless and functional  */
+	if (rtr_best != NULL)
+		rtr_best->lr_seq = rtr_last->lr_seq + 1;
+	return lp_best;
+}
+/*
+ * Choose the local NI and next-hop peer for @msg, commit the message to a
+ * CPT, take tx credits and hand the message to the LND.
+ *
+ * @src_nid: NID of the local NI to send from, or LNET_NID_ANY to have it
+ *           chosen from the destination network or route
+ * @msg:     message to send; msg->msg_target holds the destination
+ * @rtr_nid: NID of a pre-determined router, or LNET_NID_ANY
+ *
+ * Returns 0 when the message was passed to the LND or queued waiting for
+ * credits, -ESHUTDOWN when LNet is shutting down, -EINVAL when @src_nid is
+ * not local or cannot reach the destination network, -EHOSTUNREACH when no
+ * route exists or the next hop appears dead, or the non-zero result of
+ * lnet_nid2peer_locked().
+ */
+int
+lnet_send(lnet_nid_t src_nid, lnet_msg_t *msg, lnet_nid_t rtr_nid)
+{
+	lnet_nid_t		dst_nid = msg->msg_target.nid;
+	struct lnet_ni		*src_ni;
+	struct lnet_ni		*local_ni;
+	struct lnet_peer	*lp;
+	int			cpt;
+	int			cpt2;
+	int			rc;
+
+	/* NB: rtr_nid is set to LNET_NID_ANY for all current use-cases,
+	 * but we might want to use pre-determined router for ACK/REPLY
+	 * in the future */
+	/* NB: ni != NULL == interface pre-determined (ACK/REPLY) */
+	LASSERT (msg->msg_txpeer == NULL);
+	LASSERT (!msg->msg_sending);
+	LASSERT (!msg->msg_target_is_router);
+	LASSERT (!msg->msg_receiving);
+
+	msg->msg_sending = 1;
+
+	LASSERT(!msg->msg_tx_committed);
+	cpt = lnet_cpt_of_nid(rtr_nid == LNET_NID_ANY ? dst_nid : rtr_nid);
+ again:
+	lnet_net_lock(cpt);
+
+	if (the_lnet.ln_shutdown) {
+		lnet_net_unlock(cpt);
+		return -ESHUTDOWN;
+	}
+
+	if (src_nid == LNET_NID_ANY) {
+		src_ni = NULL;
+	} else {
+		src_ni = lnet_nid2ni_locked(src_nid, cpt);
+		if (src_ni == NULL) {
+			lnet_net_unlock(cpt);
+			LCONSOLE_WARN("Can't send to %s: src %s is not a "
+				      "local nid\n", libcfs_nid2str(dst_nid),
+				      libcfs_nid2str(src_nid));
+			return -EINVAL;
+		}
+		LASSERT (!msg->msg_routing);
+	}
+
+	/* Is this for someone on a local network? */
+	local_ni = lnet_net2ni_locked(LNET_NIDNET(dst_nid), cpt);
+
+	if (local_ni != NULL) {
+		if (src_ni == NULL) {
+			src_ni = local_ni;
+			src_nid = src_ni->ni_nid;
+		} else if (src_ni == local_ni) {
+			lnet_ni_decref_locked(local_ni, cpt);
+		} else {
+			lnet_ni_decref_locked(local_ni, cpt);
+			lnet_ni_decref_locked(src_ni, cpt);
+			lnet_net_unlock(cpt);
+			LCONSOLE_WARN("No route to %s via from %s\n",
+				      libcfs_nid2str(dst_nid),
+				      libcfs_nid2str(src_nid));
+			return -EINVAL;
+		}
+
+		LASSERT(src_nid != LNET_NID_ANY);
+		lnet_msg_commit(msg, cpt);
+
+		if (!msg->msg_routing)
+			msg->msg_hdr.src_nid = cpu_to_le64(src_nid);
+
+		if (src_ni == the_lnet.ln_loni) {
+			/* No send credit hassles with LOLND */
+			lnet_net_unlock(cpt);
+			lnet_ni_send(src_ni, msg);
+
+			lnet_net_lock(cpt);
+			lnet_ni_decref_locked(src_ni, cpt);
+			lnet_net_unlock(cpt);
+			return 0;
+		}
+
+		rc = lnet_nid2peer_locked(&lp, dst_nid, cpt);
+		/* lp has ref on src_ni; lose mine */
+		lnet_ni_decref_locked(src_ni, cpt);
+		if (rc != 0) {
+			lnet_net_unlock(cpt);
+			LCONSOLE_WARN("Error %d finding peer %s\n", rc,
+				      libcfs_nid2str(dst_nid));
+			/* ENOMEM or shutting down */
+			return rc;
+		}
+		LASSERT (lp->lp_ni == src_ni);
+	} else {
+		/* sending to a remote network */
+		lp = lnet_find_route_locked(src_ni, dst_nid, rtr_nid);
+		if (lp == NULL) {
+			if (src_ni != NULL)
+				lnet_ni_decref_locked(src_ni, cpt);
+			lnet_net_unlock(cpt);
+
+			LCONSOLE_WARN("No route to %s via %s "
+				      "(all routers down)\n",
+				      libcfs_id2str(msg->msg_target),
+				      libcfs_nid2str(src_nid));
+			return -EHOSTUNREACH;
+		}
+
+		/* rtr_nid is LNET_NID_ANY or NID of pre-determined router,
+		 * it's possible that rtr_nid isn't LNET_NID_ANY and lp isn't
+		 * pre-determined router, this can happen if router table
+		 * was changed when we release the lock.
+		 * If the chosen gateway maps to a different CPT, drop
+		 * everything taken so far and start over under that CPT's
+		 * lock with rtr_nid pinned to the gateway. */
+		if (rtr_nid != lp->lp_nid) {
+			cpt2 = lnet_cpt_of_nid_locked(lp->lp_nid);
+			if (cpt2 != cpt) {
+				if (src_ni != NULL)
+					lnet_ni_decref_locked(src_ni, cpt);
+				lnet_net_unlock(cpt);
+
+				rtr_nid = lp->lp_nid;
+				cpt = cpt2;
+				goto again;
+			}
+		}
+
+		CDEBUG(D_NET, "Best route to %s via %s for %s %d\n",
+		       libcfs_nid2str(dst_nid), libcfs_nid2str(lp->lp_nid),
+		       lnet_msgtyp2str(msg->msg_type), msg->msg_len);
+
+		if (src_ni == NULL) {
+			src_ni = lp->lp_ni;
+			src_nid = src_ni->ni_nid;
+		} else {
+			LASSERT (src_ni == lp->lp_ni);
+			lnet_ni_decref_locked(src_ni, cpt);
+		}
+
+		lnet_peer_addref_locked(lp);
+
+		LASSERT(src_nid != LNET_NID_ANY);
+		lnet_msg_commit(msg, cpt);
+
+		if (!msg->msg_routing) {
+			/* I'm the source and now I know which NI to send on */
+			msg->msg_hdr.src_nid = cpu_to_le64(src_nid);
+		}
+
+		msg->msg_target_is_router = 1;
+		msg->msg_target.nid = lp->lp_nid;
+		msg->msg_target.pid = LUSTRE_SRV_LNET_PID;
+	}
+
+	/* 'lp' is our best choice of peer */
+
+	LASSERT (!msg->msg_peertxcredit);
+	LASSERT (!msg->msg_txcredit);
+	LASSERT (msg->msg_txpeer == NULL);
+
+	msg->msg_txpeer = lp;		   /* msg takes my ref on lp */
+
+	rc = lnet_post_send_locked(msg, 0);
+	lnet_net_unlock(cpt);
+
+	/* NB lnet_post_send_locked() returns positive codes; EAGAIN means
+	 * the msg was queued for credits, which is success for my caller */
+	if (rc == EHOSTUNREACH)
+		return -EHOSTUNREACH;
+
+	if (rc == 0)
+		lnet_ni_send(src_ni, msg);
+
+	return 0;
+}
+
+/*
+ * Account a dropped message of @nob bytes in the counters of @cpt, then
+ * call lnet_ni_recv() on @ni with a NULL message and nothing wanted.
+ */
+static void
+lnet_drop_message(lnet_ni_t *ni, int cpt, void *private, unsigned int nob)
+{
+	/* the per-CPT counters are updated under that CPT's net lock */
+	lnet_net_lock(cpt);
+	the_lnet.ln_counters[cpt]->drop_count += 1;
+	the_lnet.ln_counters[cpt]->drop_length += nob;
+	lnet_net_unlock(cpt);
+
+	/* NOTE(review): presumably lets the LND discard the payload */
+	lnet_ni_recv(ni, private, NULL, 0, 0, 0, nob);
+}
+
+/*
+ * Receive the payload of a PUT that has an MD attached: set up the payload
+ * buffer, build the PUT event, note whether an ACK is owed and start the
+ * receive on @ni.
+ */
+static void
+lnet_recv_put(lnet_ni_t *ni, lnet_msg_t *msg)
+{
+	if (msg->msg_wanted != 0)
+		lnet_setpayloadbuffer(msg);
+
+	lnet_build_msg_event(msg, LNET_EVENT_PUT);
+
+	/* Must I ACK?  Only when the sender supplied an ack_wmd handle and
+	 * the MD does not disable ACKs.  If so the ack_wmd is taken out of
+	 * the header and put into the ACK during lnet_finalize() */
+	if (lnet_is_wire_handle_none(&msg->msg_hdr.msg.put.ack_wmd))
+		msg->msg_ack = 0;
+	else
+		msg->msg_ack =
+			(msg->msg_md->md_options & LNET_MD_ACK_DISABLE) == 0;
+
+	lnet_ni_recv(ni, msg->msg_private, msg, msg->msg_rx_delayed,
+		     msg->msg_offset, msg->msg_wanted,
+		     msg->msg_hdr.payload_length);
+}
+/*
+ * Handle an incoming PUT addressed to this node: convert the PUT header
+ * fields to host byte order, match the message against the portal and
+ * receive it when an MD matches.  Returns 0 when the message was received
+ * or left on a delayed list, and positive ENOENT when it must be dropped.
+ */
+static int
+lnet_parse_put(lnet_ni_t *ni, lnet_msg_t *msg)
+{
+	lnet_hdr_t		*hdr = &msg->msg_hdr;
+	struct lnet_match_info	info;
+	int			rc;
+
+	/* Convert put fields to host byte order */
+	hdr->msg.put.match_bits	= le64_to_cpu(hdr->msg.put.match_bits);
+	hdr->msg.put.ptl_index	= le32_to_cpu(hdr->msg.put.ptl_index);
+	hdr->msg.put.offset	= le32_to_cpu(hdr->msg.put.offset);
+
+	info.mi_id.nid	= hdr->src_nid;
+	info.mi_id.pid	= hdr->src_pid;
+	info.mi_opc	= LNET_MD_OP_PUT;
+	info.mi_portal	= hdr->msg.put.ptl_index;
+	info.mi_rlength	= hdr->payload_length;
+	info.mi_roffset	= hdr->msg.put.offset;
+	info.mi_mbits	= hdr->msg.put.match_bits;
+	/* without an eager_recv method the msg counts as delay-ready now */
+	msg->msg_rx_ready_delay = ni->ni_lnd->lnd_eager_recv == NULL;
+
+ again:
+	rc = lnet_ptl_match_md(&info, msg);
+	switch (rc) {
+	default:
+		LBUG();
+
+	case LNET_MATCHMD_OK:
+		lnet_recv_put(ni, msg);
+		return 0;
+
+	case LNET_MATCHMD_NONE:
+		if (msg->msg_rx_delayed) /* attached on delayed list */
+			return 0;
+
+		rc = lnet_ni_eager_recv(ni, msg);
+		if (rc == 0)
+			goto again;
+		/* fall through: eager receive failed, drop the PUT */
+
+	case LNET_MATCHMD_DROP:
+		CNETERR("Dropping PUT from %s portal %d match "LPU64
+			" offset %d length %d: %d\n",
+			libcfs_id2str(info.mi_id), info.mi_portal,
+			info.mi_mbits, info.mi_roffset, info.mi_rlength, rc);
+
+		return ENOENT;	/* +ve: OK but no match */
+	}
+}
+/*
+ * Handle an incoming GET addressed to this node: convert the GET header
+ * fields to host byte order, match the message against the portal and turn
+ * the same lnet_msg_t into the REPLY.  With @rdma_get set the REPLY is left
+ * to the LND's receive path; otherwise it is sent with lnet_send().
+ * Returns 0, or positive ENOENT when nothing matched and the GET must be
+ * dropped.
+ */
+static int
+lnet_parse_get(lnet_ni_t *ni, lnet_msg_t *msg, int rdma_get)
+{
+	struct lnet_match_info	info;
+	lnet_hdr_t		*hdr = &msg->msg_hdr;
+	lnet_handle_wire_t	reply_wmd;
+	int			rc;
+
+	/* Convert get fields to host byte order */
+	hdr->msg.get.match_bits	  = le64_to_cpu(hdr->msg.get.match_bits);
+	hdr->msg.get.ptl_index	  = le32_to_cpu(hdr->msg.get.ptl_index);
+	hdr->msg.get.sink_length  = le32_to_cpu(hdr->msg.get.sink_length);
+	hdr->msg.get.src_offset	  = le32_to_cpu(hdr->msg.get.src_offset);
+
+	info.mi_id.nid	= hdr->src_nid;
+	info.mi_id.pid	= hdr->src_pid;
+	info.mi_opc	= LNET_MD_OP_GET;
+	info.mi_portal	= hdr->msg.get.ptl_index;
+	info.mi_rlength	= hdr->msg.get.sink_length;
+	info.mi_roffset	= hdr->msg.get.src_offset;
+	info.mi_mbits	= hdr->msg.get.match_bits;
+
+	rc = lnet_ptl_match_md(&info, msg);
+	if (rc == LNET_MATCHMD_DROP) {
+		CNETERR("Dropping GET from %s portal %d match "LPU64
+			" offset %d length %d\n",
+			libcfs_id2str(info.mi_id), info.mi_portal,
+			info.mi_mbits, info.mi_roffset, info.mi_rlength);
+		return ENOENT;	/* +ve: OK but no match */
+	}
+
+	LASSERT(rc == LNET_MATCHMD_OK);
+
+	lnet_build_msg_event(msg, LNET_EVENT_GET);
+	/* save return_wmd first: lnet_prep_send() zeroes msg_hdr before
+	 * building the REPLY header */
+	reply_wmd = hdr->msg.get.return_wmd;
+
+	lnet_prep_send(msg, LNET_MSG_REPLY, info.mi_id,
+		       msg->msg_offset, msg->msg_wanted);
+
+	msg->msg_hdr.msg.reply.dst_wmd = reply_wmd;
+
+	if (rdma_get) {
+		/* The LND completes the REPLY from her recv procedure */
+		lnet_ni_recv(ni, msg->msg_private, msg, 0,
+			     msg->msg_offset, msg->msg_len, msg->msg_len);
+		return 0;
+	}
+
+	lnet_ni_recv(ni, msg->msg_private, NULL, 0, 0, 0, 0);
+	msg->msg_receiving = 0;
+
+	rc = lnet_send(ni->ni_nid, msg, LNET_NID_ANY);
+	if (rc < 0) {
+		/* didn't get as far as lnet_ni_send() */
+		CERROR("%s: Unable to send REPLY for GET from %s: %d\n",
+		       libcfs_nid2str(ni->ni_nid),
+		       libcfs_id2str(info.mi_id), rc);
+
+		lnet_finalize(ni, msg, rc);
+	}
+
+	return 0;
+}
+/*
+ * Handle an incoming REPLY addressed to this node: look up the MD named by
+ * the wire handle in the header, attach it to @msg and receive the payload
+ * into it.  Returns 0 when the receive was started, and positive ENOENT
+ * when the MD is invalid or inactive, or when the payload would overflow
+ * an MD that does not allow truncation.
+ */
+static int
+lnet_parse_reply(lnet_ni_t *ni, lnet_msg_t *msg)
+{
+	void	     *private = msg->msg_private;
+	lnet_hdr_t       *hdr = &msg->msg_hdr;
+	lnet_process_id_t src = {0};
+	lnet_libmd_t     *md;
+	int	       rlength;
+	int	       mlength;
+	int			cpt;
+
+	cpt = lnet_cpt_of_cookie(hdr->msg.reply.dst_wmd.wh_object_cookie);
+	lnet_res_lock(cpt);
+
+	src.nid = hdr->src_nid;
+	src.pid = hdr->src_pid;
+
+	/* NB handles only looked up by creator (no flips) */
+	md = lnet_wire_handle2md(&hdr->msg.reply.dst_wmd);
+	if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) {
+		CNETERR("%s: Dropping REPLY from %s for %s "
+			"MD "LPX64"."LPX64"\n",
+			libcfs_nid2str(ni->ni_nid), libcfs_id2str(src),
+			(md == NULL) ? "invalid" : "inactive",
+			hdr->msg.reply.dst_wmd.wh_interface_cookie,
+			hdr->msg.reply.dst_wmd.wh_object_cookie);
+		if (md != NULL && md->md_me != NULL)
+			CERROR("REPLY MD also attached to portal %d\n",
+			       md->md_me->me_portal);
+
+		lnet_res_unlock(cpt);
+		return ENOENT;		  /* +ve: OK but no match */
+	}
+
+	LASSERT (md->md_offset == 0);
+	/* receive at most md_length bytes of the rlength that were sent */
+	rlength = hdr->payload_length;
+	mlength = MIN(rlength, (int)md->md_length);
+
+	if (mlength < rlength &&
+	    (md->md_options & LNET_MD_TRUNCATE) == 0) {
+		CNETERR("%s: Dropping REPLY from %s length %d "
+			"for MD "LPX64" would overflow (%d)\n",
+			libcfs_nid2str(ni->ni_nid), libcfs_id2str(src),
+			rlength, hdr->msg.reply.dst_wmd.wh_object_cookie,
+			mlength);
+		lnet_res_unlock(cpt);
+		return ENOENT;	  /* +ve: OK but no match */
+	}
+
+	CDEBUG(D_NET, "%s: Reply from %s of length %d/%d into md "LPX64"\n",
+	       libcfs_nid2str(ni->ni_nid), libcfs_id2str(src),
+	       mlength, rlength, hdr->msg.reply.dst_wmd.wh_object_cookie);
+
+	lnet_msg_attach_md(msg, md, 0, mlength);
+
+	if (mlength != 0)
+		lnet_setpayloadbuffer(msg);
+
+	lnet_res_unlock(cpt);
+
+	lnet_build_msg_event(msg, LNET_EVENT_REPLY);
+
+	lnet_ni_recv(ni, private, msg, 0, 0, mlength, rlength);
+	return 0;
+}
+/*
+ * Handle an incoming ACK addressed to this node: convert the ACK header
+ * fields to host byte order, look up the MD named by the wire handle and
+ * attach it to @msg so an ACK event can be built.  Returns 0 when the ACK
+ * was accepted and positive ENOENT when the MD is invalid or inactive.
+ */
+static int
+lnet_parse_ack(lnet_ni_t *ni, lnet_msg_t *msg)
+{
+	lnet_hdr_t       *hdr = &msg->msg_hdr;
+	lnet_process_id_t src = {0};
+	lnet_libmd_t     *md;
+	int			cpt;
+
+	src.nid = hdr->src_nid;
+	src.pid = hdr->src_pid;
+
+	/* Convert ack fields to host byte order */
+	hdr->msg.ack.match_bits = le64_to_cpu(hdr->msg.ack.match_bits);
+	hdr->msg.ack.mlength = le32_to_cpu(hdr->msg.ack.mlength);
+
+	cpt = lnet_cpt_of_cookie(hdr->msg.ack.dst_wmd.wh_object_cookie);
+	lnet_res_lock(cpt);
+
+	/* NB handles only looked up by creator (no flips) */
+	md = lnet_wire_handle2md(&hdr->msg.ack.dst_wmd);
+	if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) {
+		/* Don't moan; this is expected */
+		CDEBUG(D_NET,
+		       "%s: Dropping ACK from %s to %s MD "LPX64"."LPX64"\n",
+		       libcfs_nid2str(ni->ni_nid), libcfs_id2str(src),
+		       (md == NULL) ? "invalid" : "inactive",
+		       hdr->msg.ack.dst_wmd.wh_interface_cookie,
+		       hdr->msg.ack.dst_wmd.wh_object_cookie);
+		if (md != NULL && md->md_me != NULL)
+			CERROR("Source MD also attached to portal %d\n",
+			       md->md_me->me_portal);
+
+		lnet_res_unlock(cpt);
+		return ENOENT;		  /* +ve! */
+	}
+
+	CDEBUG(D_NET, "%s: ACK from %s into md "LPX64"\n",
+	       libcfs_nid2str(ni->ni_nid), libcfs_id2str(src),
+	       hdr->msg.ack.dst_wmd.wh_object_cookie);
+
+	lnet_msg_attach_md(msg, md, 0, 0);
+
+	lnet_res_unlock(cpt);
+
+	lnet_build_msg_event(msg, LNET_EVENT_ACK);
+
+	lnet_ni_recv(ni, msg->msg_private, msg, 0, 0, 0, msg->msg_len);
+	return 0;
+}
+
+/*
+ * Prepare a message that has to be forwarded for receiving.  When the
+ * sending peer or the buffer pool has no router credit left, the message
+ * is first made ready to be delayed (through the LND's eager receive when
+ * it has one).  Returns the eager receive error if that failed, otherwise
+ * the result of lnet_post_routed_recv_locked(msg, 0).
+ */
+static int
+lnet_parse_forward_locked(lnet_ni_t *ni, lnet_msg_t *msg)
+{
+	int	out_of_credits;
+	int	rc;
+
+	out_of_credits = msg->msg_rxpeer->lp_rtrcredits <= 0 ||
+			 lnet_msg2bufpool(msg)->rbp_credits <= 0;
+
+	if (out_of_credits) {
+		if (ni->ni_lnd->lnd_eager_recv == NULL) {
+			msg->msg_rx_ready_delay = 1;
+		} else {
+			/* the eager receive runs without the net lock */
+			lnet_net_unlock(msg->msg_rx_cpt);
+			rc = lnet_ni_eager_recv(ni, msg);
+			lnet_net_lock(msg->msg_rx_cpt);
+
+			if (rc != 0)
+				return rc;
+		}
+	}
+
+	return lnet_post_routed_recv_locked(msg, 0);
+}
+
+/*
+ * Map an LNET_MSG_* message type to its printable name; any other value
+ * yields "<UNKNOWN>".
+ */
+char *
+lnet_msgtyp2str (int type)
+{
+	char	*name = "<UNKNOWN>";
+
+	switch (type) {
+	case LNET_MSG_ACK:
+		name = "ACK";
+		break;
+	case LNET_MSG_PUT:
+		name = "PUT";
+		break;
+	case LNET_MSG_GET:
+		name = "GET";
+		break;
+	case LNET_MSG_REPLY:
+		name = "REPLY";
+		break;
+	case LNET_MSG_HELLO:
+		name = "HELLO";
+		break;
+	default:
+		break;
+	}
+
+	return name;
+}
+EXPORT_SYMBOL(lnet_msgtyp2str);
+
+/*
+ * Dump the contents of @hdr to the log with CWARN: type, source and
+ * destination ids, then the fields specific to the message type.  Header
+ * fields are printed as stored; no byte order conversion is done here.
+ */
+void
+lnet_print_hdr(lnet_hdr_t * hdr)
+{
+	lnet_process_id_t from = {0};
+	lnet_process_id_t to = {0};
+
+	from.nid = hdr->src_nid;
+	from.pid = hdr->src_pid;
+	to.nid = hdr->dest_nid;
+	to.pid = hdr->dest_pid;
+
+	CWARN("P3 Header at %p of type %s\n", hdr,
+	      lnet_msgtyp2str (hdr->type));
+	CWARN("    From %s\n", libcfs_id2str(from));
+	CWARN("    To   %s\n", libcfs_id2str(to));
+
+	switch (hdr->type) {
+	case LNET_MSG_PUT:
+		CWARN("    Ptl index %d, ack md "LPX64"."LPX64", "
+		      "match bits "LPU64"\n",
+		      hdr->msg.put.ptl_index,
+		      hdr->msg.put.ack_wmd.wh_interface_cookie,
+		      hdr->msg.put.ack_wmd.wh_object_cookie,
+		      hdr->msg.put.match_bits);
+		CWARN("    Length %d, offset %d, hdr data "LPX64"\n",
+		      hdr->payload_length, hdr->msg.put.offset,
+		      hdr->msg.put.hdr_data);
+		break;
+
+	case LNET_MSG_GET:
+		CWARN("    Ptl index %d, return md "LPX64"."LPX64", "
+		      "match bits "LPU64"\n", hdr->msg.get.ptl_index,
+		      hdr->msg.get.return_wmd.wh_interface_cookie,
+		      hdr->msg.get.return_wmd.wh_object_cookie,
+		      hdr->msg.get.match_bits);
+		CWARN("    Length %d, src offset %d\n",
+		      hdr->msg.get.sink_length,
+		      hdr->msg.get.src_offset);
+		break;
+
+	case LNET_MSG_ACK:
+		CWARN("    dst md "LPX64"."LPX64", "
+		      "manipulated length %d\n",
+		      hdr->msg.ack.dst_wmd.wh_interface_cookie,
+		      hdr->msg.ack.dst_wmd.wh_object_cookie,
+		      hdr->msg.ack.mlength);
+		break;
+
+	case LNET_MSG_REPLY:
+		CWARN("    dst md "LPX64"."LPX64", "
+		      "length %d\n",
+		      hdr->msg.reply.dst_wmd.wh_interface_cookie,
+		      hdr->msg.reply.dst_wmd.wh_object_cookie,
+		      hdr->payload_length);
+		break;
+
+	default:
+		/* nothing type specific to print */
+		break;
+	}
+}
+/*
+ * Entry point for a message header received on @ni.  The header is
+ * validated, then the message is either set up for forwarding (destination
+ * is not this NI) or dispatched to the handler for its type.
+ *
+ * @ni:       NI the message arrived on
+ * @hdr:      header as received; its fields are converted with
+ *            le*_to_cpu() here
+ * @from_nid: NID of the immediate sender; used for the peer lookup and to
+ *            pick the CPT
+ * @private:  opaque value handed back to the LND through lnet_ni_recv()
+ * @rdma_req: non-zero when the LND flags an optimized GET; passed on to
+ *            lnet_parse_get() and rejected for a GET that is not for me
+ *
+ * Returns -EPROTO for a malformed header (bad type, payload length or
+ * destination NID).  In every other case 0 is returned and lnet_ni_recv()
+ * is called one way or another, even when the message is dropped.
+ */
+int
+lnet_parse(lnet_ni_t *ni, lnet_hdr_t *hdr, lnet_nid_t from_nid,
+	   void *private, int rdma_req)
+{
+	int		rc = 0;
+	int		cpt;
+	int		for_me;
+	struct lnet_msg	*msg;
+	lnet_pid_t     dest_pid;
+	lnet_nid_t     dest_nid;
+	lnet_nid_t     src_nid;
+	__u32	  payload_length;
+	__u32	  type;
+
+	LASSERT (!in_interrupt ());
+
+	type = le32_to_cpu(hdr->type);
+	src_nid = le64_to_cpu(hdr->src_nid);
+	dest_nid = le64_to_cpu(hdr->dest_nid);
+	dest_pid = le32_to_cpu(hdr->dest_pid);
+	payload_length = le32_to_cpu(hdr->payload_length);
+
+	for_me = (ni->ni_nid == dest_nid);
+	cpt = lnet_cpt_of_nid(from_nid);
+
+	switch (type) {
+	case LNET_MSG_ACK:
+	case LNET_MSG_GET:
+		if (payload_length > 0) {
+			CERROR("%s, src %s: bad %s payload %d (0 expected)\n",
+			       libcfs_nid2str(from_nid),
+			       libcfs_nid2str(src_nid),
+			       lnet_msgtyp2str(type), payload_length);
+			return -EPROTO;
+		}
+		break;
+
+	case LNET_MSG_PUT:
+	case LNET_MSG_REPLY:
+		if (payload_length > (__u32)(for_me ? LNET_MAX_PAYLOAD : LNET_MTU)) {
+			CERROR("%s, src %s: bad %s payload %d "
+			       "(%d max expected)\n",
+			       libcfs_nid2str(from_nid),
+			       libcfs_nid2str(src_nid),
+			       lnet_msgtyp2str(type),
+			       payload_length,
+			       for_me ? LNET_MAX_PAYLOAD : LNET_MTU);
+			return -EPROTO;
+		}
+		break;
+
+	default:
+		CERROR("%s, src %s: Bad message type 0x%x\n",
+		       libcfs_nid2str(from_nid),
+		       libcfs_nid2str(src_nid), type);
+		return -EPROTO;
+	}
+
+	if (the_lnet.ln_routing &&
+	    ni->ni_last_alive != cfs_time_current_sec()) {
+		lnet_ni_lock(ni);
+
+		/* NB: so far here is the only place to set NI status to "up" */
+		ni->ni_last_alive = cfs_time_current_sec();
+		if (ni->ni_status != NULL &&
+		    ni->ni_status->ns_status == LNET_NI_STATUS_DOWN)
+			ni->ni_status->ns_status = LNET_NI_STATUS_UP;
+		lnet_ni_unlock(ni);
+	}
+
+	/* Regard a bad destination NID as a protocol error.  Senders should
+	 * know what they're doing; if they don't they're misconfigured, buggy
+	 * or malicious so we chop them off at the knees :) */
+
+	if (!for_me) {
+		if (LNET_NIDNET(dest_nid) == LNET_NIDNET(ni->ni_nid)) {
+			/* should have gone direct */
+			CERROR ("%s, src %s: Bad dest nid %s "
+				"(should have been sent direct)\n",
+				libcfs_nid2str(from_nid),
+				libcfs_nid2str(src_nid),
+				libcfs_nid2str(dest_nid));
+			return -EPROTO;
+		}
+
+		if (lnet_islocalnid(dest_nid)) {
+			/* dest is another local NI; sender should have used
+			 * this node's NID on its own network */
+			CERROR ("%s, src %s: Bad dest nid %s "
+				"(it's my nid but on a different network)\n",
+				libcfs_nid2str(from_nid),
+				libcfs_nid2str(src_nid),
+				libcfs_nid2str(dest_nid));
+			return -EPROTO;
+		}
+
+		if (rdma_req && type == LNET_MSG_GET) {
+			CERROR ("%s, src %s: Bad optimized GET for %s "
+				"(final destination must be me)\n",
+				libcfs_nid2str(from_nid),
+				libcfs_nid2str(src_nid),
+				libcfs_nid2str(dest_nid));
+			return -EPROTO;
+		}
+
+		if (!the_lnet.ln_routing) {
+			CERROR ("%s, src %s: Dropping message for %s "
+				"(routing not enabled)\n",
+				libcfs_nid2str(from_nid),
+				libcfs_nid2str(src_nid),
+				libcfs_nid2str(dest_nid));
+			goto drop;
+		}
+	}
+
+	/* Message looks OK; we're not going to return an error, so we MUST
+	 * call back lnd_recv() come what may... */
+
+	if (!list_empty (&the_lnet.ln_test_peers) && /* normally we don't */
+	    fail_peer (src_nid, 0))	     /* shall we now? */
+	{
+		CERROR("%s, src %s: Dropping %s to simulate failure\n",
+		       libcfs_nid2str(from_nid), libcfs_nid2str(src_nid),
+		       lnet_msgtyp2str(type));
+		goto drop;
+	}
+
+	msg = lnet_msg_alloc();
+	if (msg == NULL) {
+		CERROR("%s, src %s: Dropping %s (out of memory)\n",
+		       libcfs_nid2str(from_nid), libcfs_nid2str(src_nid),
+		       lnet_msgtyp2str(type));
+		goto drop;
+	}
+
+	/* msg zeroed in lnet_msg_alloc; i.e. flags all clear, pointers NULL etc */
+
+	msg->msg_type = type;
+	msg->msg_private = private;
+	msg->msg_receiving = 1;
+	msg->msg_len = msg->msg_wanted = payload_length;
+	msg->msg_offset = 0;
+	msg->msg_hdr = *hdr;
+	/* for building message event */
+	msg->msg_from = from_nid;
+	if (!for_me) {
+		msg->msg_target.pid	= dest_pid;
+		msg->msg_target.nid	= dest_nid;
+		msg->msg_routing	= 1;
+
+	} else {
+		/* convert common msg->hdr fields to host byteorder; a msg
+		 * to be routed keeps its copied header as received */
+		msg->msg_hdr.type	= type;
+		msg->msg_hdr.src_nid	= src_nid;
+		msg->msg_hdr.src_pid	= le32_to_cpu(msg->msg_hdr.src_pid);
+		msg->msg_hdr.dest_nid	= dest_nid;
+		msg->msg_hdr.dest_pid	= dest_pid;
+		msg->msg_hdr.payload_length = payload_length;
+	}
+
+	lnet_net_lock(cpt);
+	rc = lnet_nid2peer_locked(&msg->msg_rxpeer, from_nid, cpt);
+	if (rc != 0) {
+		lnet_net_unlock(cpt);
+		CERROR("%s, src %s: Dropping %s "
+		       "(error %d looking up sender)\n",
+		       libcfs_nid2str(from_nid), libcfs_nid2str(src_nid),
+		       lnet_msgtyp2str(type), rc);
+		lnet_msg_free(msg);
+		goto drop;
+	}
+
+	lnet_msg_commit(msg, cpt);
+
+	if (!for_me) {
+		rc = lnet_parse_forward_locked(ni, msg);
+		lnet_net_unlock(cpt);
+
+		if (rc < 0)
+			goto free_drop;
+		if (rc == 0) {
+			lnet_ni_recv(ni, msg->msg_private, msg, 0,
+				     0, payload_length, payload_length);
+		}
+		return 0;
+	}
+
+	lnet_net_unlock(cpt);
+
+	switch (type) {
+	case LNET_MSG_ACK:
+		rc = lnet_parse_ack(ni, msg);
+		break;
+	case LNET_MSG_PUT:
+		rc = lnet_parse_put(ni, msg);
+		break;
+	case LNET_MSG_GET:
+		rc = lnet_parse_get(ni, msg, rdma_req);
+		break;
+	case LNET_MSG_REPLY:
+		rc = lnet_parse_reply(ni, msg);
+		break;
+	default:
+		LASSERT(0);
+		rc = -EPROTO;
+		goto free_drop;  /* prevent an unused label if !kernel */
+	}
+
+	if (rc == 0)
+		return 0;
+
+	LASSERT (rc == ENOENT);
+
+ free_drop:
+	LASSERT(msg->msg_md == NULL);
+	lnet_finalize(ni, msg, rc);
+
+ drop:
+	lnet_drop_message(ni, cpt, private, payload_length);
+	return 0;
+}
+EXPORT_SYMBOL(lnet_parse);
+
+/*
+ * Throw away every delayed PUT that is still queued on @head.
+ *
+ * Each message is unlinked from the list, reported through CWARN together
+ * with @reason, handed to lnet_drop_message() and finally completed with
+ * -ENOENT by lnet_finalize().
+ */
+void
+lnet_drop_delayed_msg_list(struct list_head *head, char *reason)
+{
+	for (;;) {
+		lnet_process_id_t	src = {0};
+		lnet_msg_t		*msg;
+		lnet_hdr_t		*hdr;
+
+		if (list_empty(head))
+			break;
+
+		msg = list_entry(head->next, lnet_msg_t, msg_list);
+		list_del(&msg->msg_list);
+
+		hdr = &msg->msg_hdr;
+		src.nid = hdr->src_nid;
+		src.pid = hdr->src_pid;
+
+		/* only unmatched, delayed PUTs with a known sender are
+		 * expected on this list */
+		LASSERT(msg->msg_md == NULL);
+		LASSERT(msg->msg_rx_delayed);
+		LASSERT(msg->msg_rxpeer != NULL);
+		LASSERT(msg->msg_hdr.type == LNET_MSG_PUT);
+
+		CWARN("Dropping delayed PUT from %s portal %d match "LPU64
+		      " offset %d length %d: %s\n",
+		      libcfs_id2str(src),
+		      hdr->msg.put.ptl_index,
+		      hdr->msg.put.match_bits,
+		      hdr->msg.put.offset,
+		      hdr->payload_length, reason);
+
+		/* NB msg's ref on msg_rxpeer can't be dropped until
+		 * lnet_drop_message() has been called, so msg itself is
+		 * kept around until that is done */
+		lnet_drop_message(msg->msg_rxpeer->lp_ni,
+				  msg->msg_rxpeer->lp_cpt,
+				  msg->msg_private, msg->msg_len);
+
+		/* NB: without an attached MD no event is generated, but an
+		 * error code is still passed so lnet_msg_decommit() can
+		 * skip counter operations and other checks */
+		lnet_finalize(msg->msg_rxpeer->lp_ni, msg, -ENOENT);
+	}
+}
+
+/*
+ * Resume every delayed PUT queued on @head.
+ *
+ * Each message is unlinked from the list, logged at D_NET level and passed
+ * to lnet_recv_put() together with the NI of the peer it was received from.
+ */
+void
+lnet_recv_delayed_msg_list(struct list_head *head)
+{
+	for (;;) {
+		lnet_process_id_t  src;
+		lnet_msg_t	  *msg;
+		lnet_hdr_t	  *hdr;
+
+		if (list_empty(head))
+			break;
+
+		msg = list_entry(head->next, lnet_msg_t, msg_list);
+		list_del(&msg->msg_list);
+
+		/* each msg holds a ref on its md, so the md can't go away
+		 * while it is being used here */
+
+		hdr = &msg->msg_hdr;
+		src.nid = hdr->src_nid;
+		src.pid = hdr->src_pid;
+
+		LASSERT(msg->msg_rx_delayed);
+		LASSERT(msg->msg_md != NULL);
+		LASSERT(msg->msg_rxpeer != NULL);
+		LASSERT(msg->msg_hdr.type == LNET_MSG_PUT);
+
+		CDEBUG(D_NET, "Resuming delayed PUT from %s portal %d "
+		       "match "LPU64" offset %d length %d.\n",
+			libcfs_id2str(src), hdr->msg.put.ptl_index,
+			hdr->msg.put.match_bits,
+			hdr->msg.put.offset,
+			hdr->payload_length);
+
+		lnet_recv_put(msg->msg_rxpeer->lp_ni, msg);
+	}
+}
+
+/**
+ * Initiate an asynchronous PUT operation.
+ *
+ * There are several events associated with a PUT: completion of the send on
+ * the initiator node (LNET_EVENT_SEND), and when the send completes
+ * successfully, the receipt of an acknowledgment (LNET_EVENT_ACK) indicating
+ * that the operation was accepted by the target. The event LNET_EVENT_PUT is
+ * used at the target node to indicate the completion of incoming data
+ * delivery.
+ *
+ * The local events will be logged in the EQ associated with the MD pointed to
+ * by \a mdh handle. Using a MD without an associated EQ results in these
+ * events being discarded. In this case, the caller must have another
+ * mechanism (e.g., a higher level protocol) for determining when it is safe
+ * to modify the memory region associated with the MD.
+ *
+ * Note that LNet does not guarantee the order of LNET_EVENT_SEND and
+ * LNET_EVENT_ACK, though intuitively ACK should happen after SEND.
+ *
+ * NB: the return value only reflects the checks made before the message is
+ * handed to lnet_send(). If lnet_send() itself fails, the error is passed to
+ * lnet_finalize() and 0 is still returned.
+ *
+ * \param self Indicates the NID of a local interface through which to send
+ * the PUT request. Use LNET_NID_ANY to let LNet choose one by itself.
+ * \param mdh A handle for the MD that describes the memory to be sent. The MD
+ * must be "free floating" (See LNetMDBind()).
+ * \param ack Controls whether an acknowledgment is requested.
+ * Acknowledgments are only sent when they are requested by the initiating
+ * process and the target MD enables them.
+ * \param target A process identifier for the target process.
+ * \param portal The index in the \a target's portal table.
+ * \param match_bits The match bits to use for MD selection at the target
+ * process.
+ * \param offset The offset into the target MD (only used when the target
+ * MD has the LNET_MD_MANAGE_REMOTE option set).
+ * \param hdr_data 64 bits of user data that can be included in the message
+ * header. This data is written to an event queue entry at the target if an
+ * EQ is present on the matching MD.
+ *
+ * \retval  0      Success, and only in this case events will be generated
+ * and logged to EQ (if it exists).
+ * \retval -EIO    Simulated failure.
+ * \retval -ENOMEM Memory allocation failure.
+ * \retval -ENOENT Invalid MD object.
+ *
+ * \see lnet_event_t::hdr_data and lnet_event_kind_t.
+ */
+int
+LNetPut(lnet_nid_t self, lnet_handle_md_t mdh, lnet_ack_req_t ack,
+	lnet_process_id_t target, unsigned int portal,
+	__u64 match_bits, unsigned int offset,
+	__u64 hdr_data)
+{
+	struct lnet_msg		*msg;
+	struct lnet_libmd	*md;
+	int			cpt;
+	int			rc;
+
+	LASSERT (the_lnet.ln_init);
+	LASSERT (the_lnet.ln_refcount > 0);
+
+	if (!list_empty (&the_lnet.ln_test_peers) && /* normally we don't */
+	    fail_peer (target.nid, 1))	  /* shall we now? */
+	{
+		CERROR("Dropping PUT to %s: simulated failure\n",
+		       libcfs_id2str(target));
+		return -EIO;
+	}
+
+	msg = lnet_msg_alloc();
+	if (msg == NULL) {
+		CERROR("Dropping PUT to %s: ENOMEM on lnet_msg_t\n",
+		       libcfs_id2str(target));
+		return -ENOMEM;
+	}
+	msg->msg_vmflush = !!memory_pressure_get();
+
+	/* the MD is looked up and attached under the res lock of the
+	 * partition derived from the handle cookie */
+	cpt = lnet_cpt_of_cookie(mdh.cookie);
+	lnet_res_lock(cpt);
+
+	md = lnet_handle2md(&mdh);
+	/* refuse a stale handle, an MD whose threshold is already 0, and an
+	 * MD that is attached to a match entry (md_me set) */
+	if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) {
+		CERROR("Dropping PUT ("LPU64":%d:%s): MD (%d) invalid\n",
+		       match_bits, portal, libcfs_id2str(target),
+		       md == NULL ? -1 : md->md_threshold);
+		if (md != NULL && md->md_me != NULL)
+			CERROR("Source MD also attached to portal %d\n",
+			       md->md_me->me_portal);
+		lnet_res_unlock(cpt);
+
+		lnet_msg_free(msg);
+		return -ENOENT;
+	}
+
+	CDEBUG(D_NET, "LNetPut -> %s\n", libcfs_id2str(target));
+
+	lnet_msg_attach_md(msg, md, 0, 0);
+
+	lnet_prep_send(msg, LNET_MSG_PUT, target, 0, md->md_length);
+
+	/* PUT-specific header fields are stored little-endian */
+	msg->msg_hdr.msg.put.match_bits = cpu_to_le64(match_bits);
+	msg->msg_hdr.msg.put.ptl_index = cpu_to_le32(portal);
+	msg->msg_hdr.msg.put.offset = cpu_to_le32(offset);
+	/* NOTE(review): hdr_data is stored without cpu_to_le64(), unlike the
+	 * fields above, while lnet_build_msg_event() applies le64_to_cpu()
+	 * to it for LNET_EVENT_SEND - confirm this is intended on
+	 * big-endian hosts */
+	msg->msg_hdr.msg.put.hdr_data = hdr_data;
+
+	/* NB handles only looked up by creator (no flips) */
+	if (ack == LNET_ACK_REQ) {
+		msg->msg_hdr.msg.put.ack_wmd.wh_interface_cookie =
+			the_lnet.ln_interface_cookie;
+		msg->msg_hdr.msg.put.ack_wmd.wh_object_cookie =
+			md->md_lh.lh_cookie;
+	} else {
+		/* no ACK requested: fill the ACK handle with the NONE cookie */
+		msg->msg_hdr.msg.put.ack_wmd.wh_interface_cookie =
+			LNET_WIRE_HANDLE_COOKIE_NONE;
+		msg->msg_hdr.msg.put.ack_wmd.wh_object_cookie =
+			LNET_WIRE_HANDLE_COOKIE_NONE;
+	}
+
+	lnet_res_unlock(cpt);
+
+	lnet_build_msg_event(msg, LNET_EVENT_SEND);
+
+	rc = lnet_send(self, msg, LNET_NID_ANY);
+	if (rc != 0) {
+		/* the failure is reported by finalizing the message with rc,
+		 * not through the return value */
+		CNETERR( "Error sending PUT to %s: %d\n",
+		       libcfs_id2str(target), rc);
+		lnet_finalize (NULL, msg, rc);
+	}
+
+	/* completion will be signalled by an event */
+	return 0;
+}
+EXPORT_SYMBOL(LNetPut);
+
+/*
+ * Create the message that stands in for the REPLY of an "optimized" GET.
+ *
+ * The LND can DMA direct to the GET md (i.e. no REPLY msg).  This
+ * returns a msg for the LND to pass to lnet_finalize() when the sink
+ * data has been received.
+ *
+ * CAVEAT EMPTOR: 'getmsg' is the original GET, which is freed when
+ * lnet_finalize() is called on it, so the LND must call this first.
+ *
+ * @ni is only used for log messages.  Returns the new message, already
+ * attached to the GET's MD, committed for receiving and carrying a prepared
+ * LNET_EVENT_REPLY event, or NULL if the message could not be allocated or
+ * the MD is inactive (md_threshold == 0); in the NULL case the drop counters
+ * of the peer's partition are updated.
+ */
+lnet_msg_t *
+lnet_create_reply_msg (lnet_ni_t *ni, lnet_msg_t *getmsg)
+{
+	struct lnet_msg		*msg = lnet_msg_alloc();
+	struct lnet_libmd	*getmd = getmsg->msg_md;
+	lnet_process_id_t	peer_id = getmsg->msg_target;
+	int			cpt;
+
+	LASSERT(!getmsg->msg_target_is_router);
+	LASSERT(!getmsg->msg_routing);
+
+	cpt = lnet_cpt_of_cookie(getmd->md_lh.lh_cookie);
+	lnet_res_lock(cpt);
+
+	LASSERT (getmd->md_refcount > 0);
+
+	if (msg == NULL) {
+		CERROR ("%s: Dropping REPLY from %s: can't allocate msg\n",
+			libcfs_nid2str(ni->ni_nid), libcfs_id2str(peer_id));
+		/* the drop path reuses 'cpt' for the net lock and never
+		 * releases the res lock, so it must be dropped here exactly
+		 * as on the inactive-MD path below */
+		lnet_res_unlock(cpt);
+		goto drop;
+	}
+
+	if (getmd->md_threshold == 0) {
+		CERROR ("%s: Dropping REPLY from %s for inactive MD %p\n",
+			libcfs_nid2str(ni->ni_nid), libcfs_id2str(peer_id),
+			getmd);
+		lnet_res_unlock(cpt);
+		goto drop;
+	}
+
+	LASSERT(getmd->md_offset == 0);
+
+	CDEBUG(D_NET, "%s: Reply from %s md %p\n",
+	       libcfs_nid2str(ni->ni_nid), libcfs_id2str(peer_id), getmd);
+
+	/* setup information for lnet_build_msg_event */
+	msg->msg_from = peer_id.nid;
+	msg->msg_type = LNET_MSG_GET; /* flag this msg as an "optimized" GET */
+	msg->msg_hdr.src_nid = peer_id.nid;
+	msg->msg_hdr.payload_length = getmd->md_length;
+	msg->msg_receiving = 1; /* required by lnet_msg_attach_md */
+
+	lnet_msg_attach_md(msg, getmd, getmd->md_offset, getmd->md_length);
+	lnet_res_unlock(cpt);
+
+	/* from here on 'cpt' names the partition of the peer's NID */
+	cpt = lnet_cpt_of_nid(peer_id.nid);
+
+	lnet_net_lock(cpt);
+	lnet_msg_commit(msg, cpt);
+	lnet_net_unlock(cpt);
+
+	lnet_build_msg_event(msg, LNET_EVENT_REPLY);
+
+	return msg;
+
+ drop:
+	/* entered with no lock held: account the dropped REPLY against the
+	 * peer's partition */
+	cpt = lnet_cpt_of_nid(peer_id.nid);
+
+	lnet_net_lock(cpt);
+	the_lnet.ln_counters[cpt]->drop_count++;
+	the_lnet.ln_counters[cpt]->drop_length += getmd->md_length;
+	lnet_net_unlock(cpt);
+
+	if (msg != NULL)
+		lnet_msg_free(msg);
+
+	return NULL;
+}
+EXPORT_SYMBOL(lnet_create_reply_msg);
+
+/*
+ * Set the final length of an elided REPLY.
+ *
+ * @reply must be a message flagged as an "optimized" GET (msg_type
+ * LNET_MSG_GET with an LNET_EVENT_REPLY event prepared); @len replaces the
+ * event's mlength and must not exceed the value already stored there.
+ * @ni is not used by this function.
+ */
+void
+lnet_set_reply_msg_len(lnet_ni_t *ni, lnet_msg_t *reply, unsigned int len)
+{
+	/* Set the REPLY length, now the RDMA that elides the REPLY message has
+	 * completed and I know it. */
+	LASSERT (reply != NULL);
+	LASSERT (reply->msg_type == LNET_MSG_GET);
+	LASSERT (reply->msg_ev.type == LNET_EVENT_REPLY);
+
+	/* NB I trusted my peer to RDMA.  If she tells me she's written beyond
+	 * the end of my buffer, I might as well be dead. */
+	LASSERT (len <= reply->msg_ev.mlength);
+
+	reply->msg_ev.mlength = len;
+}
+EXPORT_SYMBOL(lnet_set_reply_msg_len);
+
+/**
+ * Initiate an asynchronous GET operation.
+ *
+ * On the initiator node, an LNET_EVENT_SEND is logged when the GET request
+ * is sent, and an LNET_EVENT_REPLY is logged when the data returned from
+ * the target node in the REPLY has been written to local MD.
+ *
+ * On the target node, an LNET_EVENT_GET is logged when the GET request
+ * arrives and is accepted into a MD.
+ *
+ * NB: the return value only reflects the checks made before the message is
+ * handed to lnet_send(). If lnet_send() returns a negative value, the error
+ * is passed to lnet_finalize() and 0 is still returned.
+ *
+ * \param self,target,portal,match_bits,offset See the discussion in LNetPut().
+ * \param mdh A handle for the MD that describes the memory into which the
+ * requested data will be received. The MD must be "free floating" (See LNetMDBind()).
+ *
+ * \retval  0      Success, and only in this case events will be generated
+ * and logged to EQ (if it exists) of the MD.
+ * \retval -EIO    Simulated failure.
+ * \retval -ENOMEM Memory allocation failure.
+ * \retval -ENOENT Invalid MD object.
+ */
+int
+LNetGet(lnet_nid_t self, lnet_handle_md_t mdh,
+	lnet_process_id_t target, unsigned int portal,
+	__u64 match_bits, unsigned int offset)
+{
+	struct lnet_msg		*msg;
+	struct lnet_libmd	*md;
+	int			cpt;
+	int			rc;
+
+	LASSERT (the_lnet.ln_init);
+	LASSERT (the_lnet.ln_refcount > 0);
+
+	if (!list_empty (&the_lnet.ln_test_peers) && /* normally we don't */
+	    fail_peer (target.nid, 1))	  /* shall we now? */
+	{
+		CERROR("Dropping GET to %s: simulated failure\n",
+		       libcfs_id2str(target));
+		return -EIO;
+	}
+
+	msg = lnet_msg_alloc();
+	if (msg == NULL) {
+		CERROR("Dropping GET to %s: ENOMEM on lnet_msg_t\n",
+		       libcfs_id2str(target));
+		return -ENOMEM;
+	}
+
+	/* the MD is looked up and attached under the res lock of the
+	 * partition derived from the handle cookie */
+	cpt = lnet_cpt_of_cookie(mdh.cookie);
+	lnet_res_lock(cpt);
+
+	md = lnet_handle2md(&mdh);
+	/* refuse a stale handle, an MD whose threshold is already 0, and an
+	 * MD that is attached to a match entry (md_me set) */
+	if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) {
+		CERROR("Dropping GET ("LPU64":%d:%s): MD (%d) invalid\n",
+		       match_bits, portal, libcfs_id2str(target),
+		       md == NULL ? -1 : md->md_threshold);
+		if (md != NULL && md->md_me != NULL)
+			CERROR("REPLY MD also attached to portal %d\n",
+			       md->md_me->me_portal);
+
+		lnet_res_unlock(cpt);
+
+		lnet_msg_free(msg);
+
+		return -ENOENT;
+	}
+
+	CDEBUG(D_NET, "LNetGet -> %s\n", libcfs_id2str(target));
+
+	lnet_msg_attach_md(msg, md, 0, 0);
+
+	lnet_prep_send(msg, LNET_MSG_GET, target, 0, 0);
+
+	/* GET-specific header fields are stored little-endian; the sink
+	 * length advertised to the target is the length of the local MD */
+	msg->msg_hdr.msg.get.match_bits = cpu_to_le64(match_bits);
+	msg->msg_hdr.msg.get.ptl_index = cpu_to_le32(portal);
+	msg->msg_hdr.msg.get.src_offset = cpu_to_le32(offset);
+	msg->msg_hdr.msg.get.sink_length = cpu_to_le32(md->md_length);
+
+	/* NB handles only looked up by creator (no flips) */
+	msg->msg_hdr.msg.get.return_wmd.wh_interface_cookie =
+		the_lnet.ln_interface_cookie;
+	msg->msg_hdr.msg.get.return_wmd.wh_object_cookie =
+		md->md_lh.lh_cookie;
+
+	lnet_res_unlock(cpt);
+
+	lnet_build_msg_event(msg, LNET_EVENT_SEND);
+
+	rc = lnet_send(self, msg, LNET_NID_ANY);
+	if (rc < 0) {
+		/* the failure is reported by finalizing the message with rc,
+		 * not through the return value */
+		CNETERR( "Error sending GET to %s: %d\n",
+		       libcfs_id2str(target), rc);
+		lnet_finalize (NULL, msg, rc);
+	}
+
+	/* completion will be signalled by an event */
+	return 0;
+}
+EXPORT_SYMBOL(LNetGet);
+
+/**
+ * Work out how far away the node with NID \a dstnid is.
+ *
+ * Local interfaces are examined first: an exact NID match, then an interface
+ * on the same network as \a dstnid. If neither matches, the remote-net hash
+ * chain for the destination network is searched and the route with the
+ * fewest hops is used.
+ *
+ * \param dstnid Target NID.
+ * \param srcnidp If not NULL, receives the NID of the local interface used
+ * to reach \a dstnid.
+ * \param orderp If not NULL, receives the order of the route used to reach
+ * \a dstnid.
+ *
+ * \retval 0 If \a dstnid belongs to a local interface, and reserved option
+ * local_nid_dist_zero is set, which is the default.
+ * \retval positives Distance to target NID, i.e. number of hops plus one.
+ * \retval -EHOSTUNREACH If \a dstnid is not reachable.
+ */
+int
+LNetDist(lnet_nid_t dstnid, lnet_nid_t *srcnidp, __u32 *orderp)
+{
+	__u32			dstnet = LNET_NIDNET(dstnid);
+	__u32			order = 2;
+	int			dist = -EHOSTUNREACH;
+	struct list_head	*pos;
+	struct list_head	*bucket;
+	struct lnet_ni		*ni;
+	lnet_remotenet_t	*rnet;
+	lnet_route_t		*route;
+	lnet_route_t		*shortest;
+	int			hops;
+	int			cpt;
+
+	/* if !local_nid_dist_zero, a distance of 0 is never returned (when
+	 * lustre sees a distance of 0, it substitutes 0@lo), so order 0 is
+	 * kept free for 0@lo and order 1 for a local NID match; everything
+	 * else starts counting at 2 */
+
+	LASSERT (the_lnet.ln_init);
+	LASSERT (the_lnet.ln_refcount > 0);
+
+	cpt = lnet_net_lock_current();
+
+	/* first pass: local interfaces */
+	list_for_each (pos, &the_lnet.ln_nis) {
+		ni = list_entry(pos, lnet_ni_t, ni_list);
+
+		if (ni->ni_nid == dstnid) {
+			/* the target is one of my own NIDs */
+			if (srcnidp != NULL)
+				*srcnidp = dstnid;
+			if (orderp != NULL)
+				*orderp = (LNET_NETTYP(LNET_NIDNET(dstnid)) ==
+					   LOLND) ? 0 : 1;
+
+			dist = local_nid_dist_zero ? 0 : 1;
+			goto out;
+		}
+
+		if (LNET_NIDNET(ni->ni_nid) == dstnet) {
+			/* the target sits on a network I am attached to */
+			if (srcnidp != NULL)
+				*srcnidp = ni->ni_nid;
+			if (orderp != NULL)
+				*orderp = order;
+
+			dist = 1;
+			goto out;
+		}
+
+		order++;
+	}
+
+	/* second pass: remote networks hashed under the target's net */
+	bucket = lnet_net2rnethash(dstnet);
+	list_for_each(pos, bucket) {
+		rnet = list_entry(pos, lnet_remotenet_t, lrn_list);
+
+		if (rnet->lrn_net != dstnet) {
+			order++;
+			continue;
+		}
+
+		LASSERT (!list_empty(&rnet->lrn_routes));
+
+		/* keep the first route with the smallest hop count */
+		shortest = NULL;
+		list_for_each_entry(route, &rnet->lrn_routes, lr_list) {
+			if (shortest != NULL &&
+			    route->lr_hops >= shortest->lr_hops)
+				continue;
+			shortest = route;
+		}
+
+		LASSERT (shortest != NULL);
+		hops = shortest->lr_hops;
+		if (srcnidp != NULL)
+			*srcnidp = shortest->lr_gateway->lp_ni->ni_nid;
+		if (orderp != NULL)
+			*orderp = order;
+
+		dist = hops + 1;
+		goto out;
+	}
+
+ out:
+	lnet_net_unlock(cpt);
+	return dist;
+}
+EXPORT_SYMBOL(LNetDist);
+
+/**
+ * Declare how many asynchronous messages are expected from a target process.
+ *
+ * Only userspace callers get anything out of this call; in the kernel it
+ * does nothing and simply reports success.
+ *
+ * An asynchronous message is one that may arrive from a target while the
+ * userspace process is not waiting for IO to complete, e.g. an AST callback
+ * from a Lustre server. Announcing the expected number lets such messages be
+ * received eagerly while the user process is not running in LNet; otherwise
+ * network errors may occur.
+ *
+ * \param id Process ID of the target process.
+ * \param nasync Number of asynchronous messages expected from the target.
+ *
+ * \return 0 on success, and an error code otherwise.
+ */
+int
+LNetSetAsync(lnet_process_id_t id, int nasync)
+{
+	/* kernel build: both arguments are ignored */
+	return 0;
+}
+EXPORT_SYMBOL(LNetSetAsync);

diff --git a/drivers/staging/lustre/lnet/lnet/lib-msg.c b/drivers/staging/lustre/lnet/lnet/lib-msg.c
new file mode 100644
index 0000000..8f3a50b
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/lib-msg.c

@@ -0,0 +1,650 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/lnet/lib-msg.c
+ *
+ * Message decoding, parsing and finalizing routines
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/lnet/lib-lnet.h>
+
+/*
+ * Initialise @ev as the LNET_EVENT_UNLINK event for @md.
+ *
+ * The event is zeroed first, then marked as a successful (status 0) unlink
+ * and filled with the MD description and handle taken from @md.
+ */
+void
+lnet_build_unlink_event (lnet_libmd_t *md, lnet_event_t *ev)
+{
+	ENTRY;
+
+	/* start from an all-zero event */
+	memset(ev, 0, sizeof(lnet_event_t));
+
+	ev->type     = LNET_EVENT_UNLINK;
+	ev->unlinked = 1;
+	ev->status   = 0;
+
+	/* copy the MD description and its handle into the event */
+	lnet_md_deconstruct(md, &ev->md);
+	lnet_md2handle(&ev->md_handle, md);
+
+	EXIT;
+}
+
+/*
+ * Fill in msg->msg_ev for an event of kind @ev_type from the message header.
+ *
+ * For LNET_EVENT_SEND ("active" messages) the header fields are converted
+ * with le*_to_cpu() while being copied, the initiator NID and sender are set
+ * to LNET_NID_ANY and the initiator PID to the_lnet.ln_pid.  For every other
+ * kind ("passive" messages) the header fields are copied without conversion
+ * and mlength/offset come from msg_wanted/msg_offset.  An unknown @ev_type
+ * hits LBUG().  Routed messages must not get here (asserted).
+ *
+ * Don't need any lock, must be called after lnet_commit_md
+ * NOTE(review): no lnet_commit_md is visible here; this presumably refers to
+ * lnet_msg_attach_md() - confirm.
+ */
+void
+lnet_build_msg_event(lnet_msg_t *msg, lnet_event_kind_t ev_type)
+{
+	lnet_hdr_t	*hdr = &msg->msg_hdr;
+	lnet_event_t	*ev  = &msg->msg_ev;
+
+	LASSERT(!msg->msg_routing);
+
+	ev->type = ev_type;
+
+	if (ev_type == LNET_EVENT_SEND) {
+		/* event for active message */
+		ev->target.nid    = le64_to_cpu(hdr->dest_nid);
+		ev->target.pid    = le32_to_cpu(hdr->dest_pid);
+		ev->initiator.nid = LNET_NID_ANY;
+		ev->initiator.pid = the_lnet.ln_pid;
+		ev->sender	  = LNET_NID_ANY;
+
+	} else {
+		/* event for passive message */
+		ev->target.pid    = hdr->dest_pid;
+		ev->target.nid    = hdr->dest_nid;
+		ev->initiator.pid = hdr->src_pid;
+		ev->initiator.nid = hdr->src_nid;
+		ev->rlength       = hdr->payload_length;
+		ev->sender	  = msg->msg_from;
+		ev->mlength	  = msg->msg_wanted;
+		ev->offset	  = msg->msg_offset;
+	}
+
+	/* per-kind fields; every valid case returns from inside the switch */
+	switch (ev_type) {
+	default:
+		LBUG();
+
+	case LNET_EVENT_PUT: /* passive PUT */
+		ev->pt_index   = hdr->msg.put.ptl_index;
+		ev->match_bits = hdr->msg.put.match_bits;
+		ev->hdr_data   = hdr->msg.put.hdr_data;
+		return;
+
+	case LNET_EVENT_GET: /* passive GET */
+		ev->pt_index   = hdr->msg.get.ptl_index;
+		ev->match_bits = hdr->msg.get.match_bits;
+		ev->hdr_data   = 0;
+		return;
+
+	case LNET_EVENT_ACK: /* ACK */
+		ev->match_bits = hdr->msg.ack.match_bits;
+		ev->mlength    = hdr->msg.ack.mlength; /* overrides msg_wanted set above */
+		return;
+
+	case LNET_EVENT_REPLY: /* REPLY */
+		return;
+
+	case LNET_EVENT_SEND: /* active message */
+		if (msg->msg_type == LNET_MSG_PUT) {
+			ev->pt_index   = le32_to_cpu(hdr->msg.put.ptl_index);
+			ev->match_bits = le64_to_cpu(hdr->msg.put.match_bits);
+			ev->offset     = le32_to_cpu(hdr->msg.put.offset);
+			ev->mlength    =
+			ev->rlength    = le32_to_cpu(hdr->payload_length);
+			ev->hdr_data   = le64_to_cpu(hdr->msg.put.hdr_data);
+
+		} else {
+			/* the only other message type sent with a SEND event
+			 * is a GET */
+			LASSERT(msg->msg_type == LNET_MSG_GET);
+			ev->pt_index   = le32_to_cpu(hdr->msg.get.ptl_index);
+			ev->match_bits = le64_to_cpu(hdr->msg.get.match_bits);
+			ev->mlength    =
+			ev->rlength    = le32_to_cpu(hdr->msg.get.sink_length);
+			ev->offset     = le32_to_cpu(hdr->msg.get.src_offset);
+			ev->hdr_data   = 0;
+		}
+		return;
+	}
+}
+
+/*
+ * Commit @msg to the network on CPU partition @cpt.
+ *
+ * A message with msg_sending set is committed for sending (msg_tx_cpt and
+ * msg_tx_committed are set); any other message is committed for receiving
+ * (msg_rx_cpt and msg_rx_committed).  Unless the message was already put on
+ * the active list by an earlier receive commit, it is added to the
+ * partition's active list and counted in msgs_alloc / msgs_max.
+ *
+ * NOTE(review): the callers visible in this patch hold lnet_net_lock(cpt)
+ * around this call; presumably that is required - confirm.
+ */
+void
+lnet_msg_commit(lnet_msg_t *msg, int cpt)
+{
+	struct lnet_msg_container *container = the_lnet.ln_msg_containers[cpt];
+	lnet_counters_t		  *counters  = the_lnet.ln_counters[cpt];
+
+	/* routed message can be committed for both receiving and sending */
+	LASSERT(!msg->msg_tx_committed);
+
+	if (msg->msg_sending) {
+		LASSERT(!msg->msg_receiving);
+
+		msg->msg_tx_cpt = cpt;
+		msg->msg_tx_committed = 1;
+		if (msg->msg_rx_committed) { /* routed message REPLY */
+			/* already listed and counted by the receive commit */
+			LASSERT(msg->msg_onactivelist);
+			return;
+		}
+	} else {
+		LASSERT(!msg->msg_sending);
+		msg->msg_rx_cpt = cpt;
+		msg->msg_rx_committed = 1;
+	}
+
+	LASSERT(!msg->msg_onactivelist);
+	msg->msg_onactivelist = 1;
+	list_add(&msg->msg_activelist, &container->msc_active);
+
+	/* track the current and the peak number of committed messages */
+	counters->msgs_alloc++;
+	if (counters->msgs_alloc > counters->msgs_max)
+		counters->msgs_max = counters->msgs_alloc;
+}
+
+/*
+ * Undo the send-side commit of @msg.
+ *
+ * When @status is 0 the counters of partition msg_tx_cpt are updated
+ * according to the event type: route_count/route_length for a routed
+ * message (event type 0), otherwise send_count, plus send_length for a
+ * sent PUT.  In every case the tx credits are returned and
+ * msg_tx_committed is cleared.
+ */
+static void
+lnet_msg_decommit_tx(lnet_msg_t *msg, int status)
+{
+	lnet_counters_t	*counters;
+	lnet_event_t	*ev = &msg->msg_ev;
+
+	LASSERT(msg->msg_tx_committed);
+	if (status != 0)
+		goto out; /* failed message: skip all counter updates */
+
+	counters = the_lnet.ln_counters[msg->msg_tx_cpt];
+	switch (ev->type) {
+	default: /* routed message */
+		LASSERT(msg->msg_routing);
+		LASSERT(msg->msg_rx_committed);
+		LASSERT(ev->type == 0);
+
+		counters->route_length += msg->msg_len;
+		counters->route_count++;
+		goto out;
+
+	case LNET_EVENT_PUT:
+		/* should have been decommitted */
+		LASSERT(!msg->msg_rx_committed);
+		/* overwritten while sending ACK */
+		LASSERT(msg->msg_type == LNET_MSG_ACK);
+		msg->msg_type = LNET_MSG_PUT; /* fix type */
+		break;
+
+	case LNET_EVENT_SEND:
+		LASSERT(!msg->msg_rx_committed);
+		if (msg->msg_type == LNET_MSG_PUT)
+			counters->send_length += msg->msg_len;
+		break;
+
+	case LNET_EVENT_GET:
+		LASSERT(msg->msg_rx_committed);
+		/* overwritten while sending reply, we should never be
+		 * here for optimized GET */
+		LASSERT(msg->msg_type == LNET_MSG_REPLY);
+		msg->msg_type = LNET_MSG_GET; /* fix type */
+		break;
+	}
+
+	counters->send_count++;
+ out:
+	lnet_return_tx_credits_locked(msg);
+	msg->msg_tx_committed = 0;
+}
+
+/*
+ * Undo the receive-side commit of @msg.
+ *
+ * The message must no longer be committed for sending.  When @status is 0
+ * and the message is not a routed one (event type 0), recv_count of
+ * partition msg_rx_cpt is incremented; recv_length is increased by
+ * msg_wanted for PUT and REPLY events, and send_length by msg_wanted for a
+ * GET event.  In every case the rx credits are returned and
+ * msg_rx_committed is cleared.
+ */
+static void
+lnet_msg_decommit_rx(lnet_msg_t *msg, int status)
+{
+	lnet_counters_t	*counters;
+	lnet_event_t	*ev = &msg->msg_ev;
+
+	LASSERT(!msg->msg_tx_committed); /* decommitted or never committed */
+	LASSERT(msg->msg_rx_committed);
+
+	if (status != 0)
+		goto out; /* failed message: skip all counter updates */
+
+	counters = the_lnet.ln_counters[msg->msg_rx_cpt];
+	switch (ev->type) {
+	default:
+		/* no event was built: only legal for a routed message */
+		LASSERT(ev->type == 0);
+		LASSERT(msg->msg_routing);
+		goto out;
+
+	case LNET_EVENT_ACK:
+		LASSERT(msg->msg_type == LNET_MSG_ACK);
+		break;
+
+	case LNET_EVENT_GET:
+		/* type is "REPLY" if it's an optimized GET on passive side,
+		 * because optimized GET will never be committed for sending,
+		 * so message type wouldn't be changed back to "GET" by
+		 * lnet_msg_decommit_tx(), see details in lnet_parse_get() */
+		LASSERT(msg->msg_type == LNET_MSG_REPLY ||
+			msg->msg_type == LNET_MSG_GET);
+		counters->send_length += msg->msg_wanted;
+		break;
+
+	case LNET_EVENT_PUT:
+		LASSERT(msg->msg_type == LNET_MSG_PUT);
+		break;
+
+	case LNET_EVENT_REPLY:
+		/* type is "GET" if it's an optimized GET on active side,
+		 * see details in lnet_create_reply_msg() */
+		LASSERT(msg->msg_type == LNET_MSG_GET ||
+			msg->msg_type == LNET_MSG_REPLY);
+		break;
+	}
+
+	counters->recv_count++;
+	if (ev->type == LNET_EVENT_PUT || ev->type == LNET_EVENT_REPLY)
+		counters->recv_length += msg->msg_wanted;
+
+ out:
+	lnet_return_rx_credits_locked(msg);
+	msg->msg_rx_committed = 0;
+}
+
+/*
+ * Decommit @msg completely and take it off the active list.
+ *
+ * The send side is decommitted first, then the receive side.  @cpt is the
+ * partition whose net lock is held on entry: if the receive commit lives on
+ * a different partition, the lock of @cpt is dropped and the lock of
+ * msg_rx_cpt taken for the rest of the work, and the lock of @cpt is taken
+ * again before returning.  msgs_alloc is decremented on whichever partition
+ * is locked at that point.  @status is passed on to the tx/rx helpers.
+ */
+void
+lnet_msg_decommit(lnet_msg_t *msg, int cpt, int status)
+{
+	int	cpt2 = cpt; /* partition currently locked */
+
+	LASSERT(msg->msg_tx_committed || msg->msg_rx_committed);
+	LASSERT(msg->msg_onactivelist);
+
+	if (msg->msg_tx_committed) { /* always decommit for sending first */
+		LASSERT(cpt == msg->msg_tx_cpt);
+		lnet_msg_decommit_tx(msg, status);
+	}
+
+	if (msg->msg_rx_committed) {
+		/* forwarding msg committed for both receiving and sending */
+		if (cpt != msg->msg_rx_cpt) {
+			/* switch to the lock of the receiving partition */
+			lnet_net_unlock(cpt);
+			cpt2 = msg->msg_rx_cpt;
+			lnet_net_lock(cpt2);
+		}
+		lnet_msg_decommit_rx(msg, status);
+	}
+
+	list_del(&msg->msg_activelist);
+	msg->msg_onactivelist = 0;
+
+	the_lnet.ln_counters[cpt2]->msgs_alloc--;
+
+	if (cpt2 != cpt) {
+		/* hand the caller back the lock it came in with */
+		lnet_net_unlock(cpt2);
+		lnet_net_lock(cpt);
+	}
+}
+
+/*
+ * Attach @md to @msg.
+ *
+ * The MD is marked busy (its refcount is raised) and, unless its threshold
+ * is LNET_MD_THRESH_INF, the threshold is decremented.  Come what may, the
+ * lnet_msg "owns" the MD until a call to lnet_msg_detach_md or
+ * lnet_finalize() signals completion.  The MD handle and description are
+ * also copied into the message's event.
+ *
+ * NB: @offset and @mlen are only recorded for a message that is receiving.
+ */
+void
+lnet_msg_attach_md(lnet_msg_t *msg, lnet_libmd_t *md,
+		   unsigned int offset, unsigned int mlen)
+{
+	lnet_event_t	*ev = &msg->msg_ev;
+
+	LASSERT(!msg->msg_routing);
+
+	if (msg->msg_receiving) { /* committed for receiving */
+		msg->msg_wanted = mlen;
+		msg->msg_offset = offset;
+	}
+	msg->msg_md = md;
+
+	/* one more user of the MD, one less use left on a finite threshold */
+	md->md_refcount++;
+	if (md->md_threshold != LNET_MD_THRESH_INF) {
+		LASSERT(md->md_threshold > 0);
+		md->md_threshold--;
+	}
+
+	/* build umd in event */
+	lnet_md2handle(&ev->md_handle, md);
+	lnet_md_deconstruct(md, &ev->md);
+}
+
+/*
+ * Detach the MD from @msg, completing the message's use of it.
+ *
+ * The reference taken at attach time is dropped.  If the MD has an event
+ * queue, the message's event is enqueued there with @status and the
+ * "unlinked" flag; if the MD has become unlinkable it is unlinked.
+ */
+void
+lnet_msg_detach_md(lnet_msg_t *msg, int status)
+{
+	lnet_libmd_t	*md = msg->msg_md;
+	lnet_event_t	*ev = &msg->msg_ev;
+	int		unlinkable;
+
+	/* Now it's safe to drop my caller's ref */
+	md->md_refcount--;
+	LASSERT(md->md_refcount >= 0);
+
+	unlinkable = lnet_md_unlinkable(md);
+
+	if (md->md_eq != NULL) {
+		/* deliver the completion event before the MD may go away */
+		ev->unlinked = unlinkable;
+		ev->status   = status;
+		lnet_eq_enqueue_event(md->md_eq, ev);
+	}
+
+	if (unlinkable != 0)
+		lnet_md_unlink(md);
+
+	msg->msg_md = NULL;
+}
+
+/*
+ * Complete one message that was taken off a finalizing list.
+ *
+ * Entered and left with the net lock of @cpt held; the lock is dropped
+ * around the calls to lnet_send().
+ *
+ * Three cases:
+ * - a successfully completed PUT that wants an ACK is decommitted and
+ *   re-sent as the ACK;
+ * - a successfully received routed message that has not been sent yet is
+ *   forwarded;
+ * - anything else is decommitted with its event status and freed.
+ *
+ * Returns the result of lnet_send() in the first two cases and 0 in the
+ * last one.
+ */
+static int
+lnet_complete_msg_locked(lnet_msg_t *msg, int cpt)
+{
+	lnet_handle_wire_t ack_wmd;
+	int		rc;
+	int		status = msg->msg_ev.status;
+
+	LASSERT (msg->msg_onactivelist);
+
+	if (status == 0 && msg->msg_ack) {
+		/* Only send an ACK if the PUT completed successfully */
+
+		lnet_msg_decommit(msg, cpt, 0);
+
+		msg->msg_ack = 0;
+		lnet_net_unlock(cpt);
+
+		LASSERT(msg->msg_ev.type == LNET_EVENT_PUT);
+		LASSERT(!msg->msg_routing);
+
+		/* save the ACK handle before the header is reused for the
+		 * outgoing ACK */
+		ack_wmd = msg->msg_hdr.msg.put.ack_wmd;
+
+		lnet_prep_send(msg, LNET_MSG_ACK, msg->msg_ev.initiator, 0, 0);
+
+		msg->msg_hdr.msg.ack.dst_wmd = ack_wmd;
+		msg->msg_hdr.msg.ack.match_bits = msg->msg_ev.match_bits;
+		msg->msg_hdr.msg.ack.mlength = cpu_to_le32(msg->msg_ev.mlength);
+
+		/* NB: we probably want to use NID of msg::msg_from as 3rd
+		 * parameter (router NID) if it's routed message */
+		rc = lnet_send(msg->msg_ev.target.nid, msg, LNET_NID_ANY);
+
+		lnet_net_lock(cpt);
+		/*
+		 * NB: message is committed for sending, we should return
+		 * on success because LND will finalize this message later.
+		 *
+		 * Also, there is possibility that message is committed for
+		 * sending and also failed before delivering to LND,
+		 * e.g. ENOMEM, in that case we can't fall through either
+		 * because CPT for sending can be different from CPT for
+		 * receiving, so we should return back to lnet_finalize()
+		 * to make sure we are locking the correct partition.
+		 */
+		return rc;
+
+	} else if (status == 0 &&	/* OK so far */
+		   (msg->msg_routing && !msg->msg_sending)) {
+		/* not forwarded */
+		LASSERT(!msg->msg_receiving);	/* called back recv already */
+		lnet_net_unlock(cpt);
+
+		rc = lnet_send(LNET_NID_ANY, msg, LNET_NID_ANY);
+
+		lnet_net_lock(cpt);
+		/*
+		 * NB: message is committed for sending, we should return
+		 * on success because LND will finalize this message later.
+		 *
+		 * Also, there is possibility that message is committed for
+		 * sending and also failed before delivering to LND,
+		 * e.g. ENOMEM, in that case we can't fall through either:
+		 * - The rule is message must decommit for sending first if
+		 *   it's committed for both sending and receiving
+		 * - CPT for sending can be different from CPT for receiving,
+		 *   so we should return back to lnet_finalize() to make
+		 *   sure we are locking the correct partition.
+		 */
+		return rc;
+	}
+
+	/* nothing more to send: release the message for good */
+	lnet_msg_decommit(msg, cpt, status);
+	lnet_msg_free_locked(msg);
+	return 0;
+}
+
+/*
+ * Complete @msg with result @status.
+ *
+ * @ni is not referenced by this implementation.  A NULL @msg is ignored.
+ * Must not be called from interrupt context (asserted).
+ *
+ * The status is recorded in the message's event and any attached MD is
+ * detached under the res lock of its partition.  A message that was never
+ * committed is simply freed.  Otherwise the message is queued on the
+ * finalizing list of its partition (the send partition if it is committed
+ * for sending, else the receive partition) and, unless the current thread
+ * is already finalizing there or no finalizer slot is free, that list is
+ * drained through lnet_complete_msg_locked().  If that helper returns
+ * non-zero, the procedure restarts at "again" for the message it was
+ * working on.
+ */
+void
+lnet_finalize (lnet_ni_t *ni, lnet_msg_t *msg, int status)
+{
+	struct lnet_msg_container	*container;
+	int				my_slot;
+	int				cpt;
+	int				rc;
+	int				i;
+
+	LASSERT (!in_interrupt ());
+
+	if (msg == NULL)
+		return;
+#if 0
+	CDEBUG(D_WARNING, "%s msg->%s Flags:%s%s%s%s%s%s%s%s%s%s%s txp %s rxp %s\n",
+	       lnet_msgtyp2str(msg->msg_type), libcfs_id2str(msg->msg_target),
+	       msg->msg_target_is_router ? "t" : "",
+	       msg->msg_routing ? "X" : "",
+	       msg->msg_ack ? "A" : "",
+	       msg->msg_sending ? "S" : "",
+	       msg->msg_receiving ? "R" : "",
+	       msg->msg_delayed ? "d" : "",
+	       msg->msg_txcredit ? "C" : "",
+	       msg->msg_peertxcredit ? "c" : "",
+	       msg->msg_rtrcredit ? "F" : "",
+	       msg->msg_peerrtrcredit ? "f" : "",
+	       msg->msg_onactivelist ? "!" : "",
+	       msg->msg_txpeer == NULL ? "<none>" : libcfs_nid2str(msg->msg_txpeer->lp_nid),
+	       msg->msg_rxpeer == NULL ? "<none>" : libcfs_nid2str(msg->msg_rxpeer->lp_nid));
+#endif
+	msg->msg_ev.status = status;
+
+	if (msg->msg_md != NULL) {
+		cpt = lnet_cpt_of_cookie(msg->msg_md->md_lh.lh_cookie);
+
+		lnet_res_lock(cpt);
+		lnet_msg_detach_md(msg, status);
+		lnet_res_unlock(cpt);
+	}
+
+ again:
+	rc = 0;
+	if (!msg->msg_tx_committed && !msg->msg_rx_committed) {
+		/* not committed to network yet */
+		LASSERT(!msg->msg_onactivelist);
+		lnet_msg_free(msg);
+		return;
+	}
+
+	/*
+	 * NB: routed message can be committed for both receiving and sending,
+	 * we should finalize in LIFO order and keep counters correct.
+	 * (finalize sending first then finalize receiving)
+	 */
+	cpt = msg->msg_tx_committed ? msg->msg_tx_cpt : msg->msg_rx_cpt;
+	lnet_net_lock(cpt);
+
+	container = the_lnet.ln_msg_containers[cpt];
+	list_add_tail(&msg->msg_list, &container->msc_finalizing);
+
+	/* Recursion breaker.  Don't complete the message here if I am (or
+	 * enough other threads are) already completing messages */
+
+	/* look for this thread among the finalizers and remember the first
+	 * free slot on the way */
+	my_slot = -1;
+	for (i = 0; i < container->msc_nfinalizers; i++) {
+		if (container->msc_finalizers[i] == current)
+			break;
+
+		if (my_slot < 0 && container->msc_finalizers[i] == NULL)
+			my_slot = i;
+	}
+
+	/* already finalizing in this thread, or no free slot: leave the
+	 * message on the finalizing list and return */
+	if (i < container->msc_nfinalizers || my_slot < 0) {
+		lnet_net_unlock(cpt);
+		return;
+	}
+
+	container->msc_finalizers[my_slot] = current;
+
+	while (!list_empty(&container->msc_finalizing)) {
+		msg = list_entry(container->msc_finalizing.next,
+				     lnet_msg_t, msg_list);
+
+		list_del(&msg->msg_list);
+
+		/* NB drops and regains the lnet lock if it actually does
+		 * anything, so my finalizing friends can chomp along too */
+		rc = lnet_complete_msg_locked(msg, cpt);
+		if (rc != 0)
+			break;
+	}
+
+	container->msc_finalizers[my_slot] = NULL;
+	lnet_net_unlock(cpt);
+
+	/* msg is now the message whose completion returned non-zero */
+	if (rc != 0)
+		goto again;
+}
+EXPORT_SYMBOL(lnet_finalize);
+
+/*
+ * Release everything owned by @container.
+ *
+ * Does nothing for a container that was never set up.  Messages still on
+ * the active list are freed (and their number reported), the finalizer
+ * array is released and, when LNET_USE_LIB_FREELIST is defined, the
+ * freelist is torn down.
+ */
+void
+lnet_msg_container_cleanup(struct lnet_msg_container *container)
+{
+	struct list_head	*active = &container->msc_active;
+	lnet_msg_t		*msg;
+	int			leaked;
+
+	if (!container->msc_init)
+		return;
+
+	for (leaked = 0; !list_empty(active); leaked++) {
+		msg = list_entry(active->next, lnet_msg_t, msg_activelist);
+
+		LASSERT(msg->msg_onactivelist);
+		msg->msg_onactivelist = 0;
+		list_del(&msg->msg_activelist);
+		lnet_msg_free(msg);
+	}
+
+	if (leaked > 0)
+		CERROR("%d active msg on exit\n", leaked);
+
+	if (container->msc_finalizers != NULL) {
+		LIBCFS_FREE(container->msc_finalizers,
+			    container->msc_nfinalizers *
+			    sizeof(*container->msc_finalizers));
+		container->msc_finalizers = NULL;
+	}
+#ifdef LNET_USE_LIB_FREELIST
+	lnet_freelist_fini(&container->msc_freelist);
+#endif
+	container->msc_init = 0;
+}
+
+/*
+ * Prepare @container for use by CPU partition @cpt.
+ *
+ * Initialises the active and finalizing lists, the freelist (only when
+ * LNET_USE_LIB_FREELIST is defined) and the finalizer array, which gets
+ * cfs_cpt_weight() entries for the partition.  On any failure the container
+ * is cleaned up again.
+ *
+ * Returns 0 on success, the freelist error code, or -ENOMEM when the
+ * finalizer array cannot be allocated.
+ */
+int
+lnet_msg_container_setup(struct lnet_msg_container *container, int cpt)
+{
+	int	rc = 0;
+
+	container->msc_init = 1;
+
+	INIT_LIST_HEAD(&container->msc_finalizing);
+	INIT_LIST_HEAD(&container->msc_active);
+
+#ifdef LNET_USE_LIB_FREELIST
+	memset(&container->msc_freelist, 0, sizeof(lnet_freelist_t));
+
+	rc = lnet_freelist_init(&container->msc_freelist,
+				LNET_FL_MAX_MSGS, sizeof(lnet_msg_t));
+	if (rc != 0) {
+		CERROR("Failed to init freelist for message container\n");
+		goto failed;
+	}
+#endif
+	/* number of CPUs */
+	container->msc_nfinalizers = cfs_cpt_weight(lnet_cpt_table(), cpt);
+
+	LIBCFS_CPT_ALLOC(container->msc_finalizers, lnet_cpt_table(), cpt,
+			 container->msc_nfinalizers *
+			 sizeof(*container->msc_finalizers));
+
+	if (container->msc_finalizers == NULL) {
+		CERROR("Failed to allocate message finalizers\n");
+		rc = -ENOMEM;
+		goto failed;
+	}
+
+	return rc;
+
+ failed:
+	/* undo whatever was set up so far */
+	lnet_msg_container_cleanup(container);
+	return rc;
+}
+
+/*
+ * Clean up every per-partition message container and free the per-partition
+ * array itself.  Safe to call when the array was never allocated.
+ */
+void
+lnet_msg_containers_destroy(void)
+{
+	struct lnet_msg_container *container;
+	int     idx;
+
+	if (the_lnet.ln_msg_containers != NULL) {
+		cfs_percpt_for_each(container, idx,
+				    the_lnet.ln_msg_containers) {
+			lnet_msg_container_cleanup(container);
+		}
+
+		cfs_percpt_free(the_lnet.ln_msg_containers);
+		the_lnet.ln_msg_containers = NULL;
+	}
+}
+
+/*
+ * Allocate the per-partition message containers and set each one up.
+ *
+ * Returns 0 on success, -ENOMEM if the per-partition array cannot be
+ * allocated, or the error from lnet_msg_container_setup(); in the latter
+ * case everything created so far is destroyed again.
+ */
+int
+lnet_msg_containers_create(void)
+{
+	struct lnet_msg_container *container;
+	int	cpt;
+	int	rc;
+
+	the_lnet.ln_msg_containers =
+		cfs_percpt_alloc(lnet_cpt_table(),
+				 sizeof(struct lnet_msg_container));
+
+	if (the_lnet.ln_msg_containers == NULL) {
+		CERROR("Failed to allocate cpu-partition data for network\n");
+		return -ENOMEM;
+	}
+
+	cfs_percpt_for_each(container, cpt, the_lnet.ln_msg_containers) {
+		rc = lnet_msg_container_setup(container, cpt);
+		if (rc != 0)
+			goto failed;
+	}
+
+	return 0;
+
+ failed:
+	lnet_msg_containers_destroy();
+	return rc;
+}

diff --git a/drivers/staging/lustre/lnet/lnet/lib-ptl.c b/drivers/staging/lustre/lnet/lnet/lib-ptl.c
new file mode 100644
index 0000000..9b9e7d31
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/lib-ptl.c

@@ -0,0 +1,938 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/lnet/lib-ptl.c
+ *
+ * portal & match routines
+ *
+ * Author: liang@whamcloud.com
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/lnet/lib-lnet.h>
+
+/* NB: add /proc interfaces in upcoming patches */
+/* Module parameter consulted by lnet_mt_of_match() when it selects the
+ * match-table (cpu-partition) for a message arriving on a wildcard portal.
+ * It is compared against LNET_PTL_ROTOR_OFF, LNET_PTL_ROTOR_ON and
+ * LNET_PTL_ROTOR_HASH_RT; the latter is the default. */
+int	portal_rotor	= LNET_PTL_ROTOR_HASH_RT;
+CFS_MODULE_PARM(portal_rotor, "i", int, 0644,
+		"redirect PUTs to different cpu-partitions");
+
+/*
+ * Decide whether an ME with the given match id / ignore bits is allowed on
+ * portal @index, and fix the type of the portal if it has none yet.
+ *
+ * An ME counts as "unique" when it has no ignore bits and names a specific
+ * NID and PID; otherwise it is a "wildcard" ME.  A portal without a type
+ * adopts the type of the first ME checked here.
+ *
+ * Returns 1 if the ME type agrees with the portal type (or the portal type
+ * was just set from it), 0 if they conflict.
+ */
+static int
+lnet_ptl_match_type(unsigned int index, lnet_process_id_t match_id,
+		    __u64 mbits, __u64 ignore_bits)
+{
+	struct lnet_portal	*ptl = the_lnet.ln_portals[index];
+	int			me_unique;
+
+	me_unique = ignore_bits == 0 &&
+		    match_id.nid != LNET_NID_ANY &&
+		    match_id.pid != LNET_PID_ANY;
+
+	LASSERT(!lnet_ptl_is_unique(ptl) || !lnet_ptl_is_wildcard(ptl));
+
+	/* the type is normally set already, so look at it w/o lock first */
+	if (unlikely(!lnet_ptl_is_unique(ptl) && !lnet_ptl_is_wildcard(ptl))) {
+		lnet_ptl_lock(ptl);
+
+		/* look again now that the lock is held */
+		if (likely(!lnet_ptl_is_unique(ptl) &&
+			   !lnet_ptl_is_wildcard(ptl))) {
+			/* new portal, it takes the type of this ME */
+			if (me_unique)
+				lnet_ptl_setopt(ptl, LNET_PTL_MATCH_UNIQUE);
+			else
+				lnet_ptl_setopt(ptl, LNET_PTL_MATCH_WILDCARD);
+
+			lnet_ptl_unlock(ptl);
+			return 1;
+		}
+
+		/* somebody else set the type while we waited for the lock */
+		lnet_ptl_unlock(ptl);
+	}
+
+	/* typed portal: the ME has to be of the same type */
+	if (lnet_ptl_is_unique(ptl) && !me_unique)
+		return 0;
+
+	if (lnet_ptl_is_wildcard(ptl) && me_unique)
+		return 0;
+
+	return 1;
+}
+
+/*
+ * Mark the match-table of partition @cpt on wildcard portal @ptl as enabled
+ * and insert @cpt into ptl->ptl_mt_maps[], keeping that array sorted in
+ * ascending order.
+ */
+static void
+lnet_ptl_enable_mt(struct lnet_portal *ptl, int cpt)
+{
+	struct lnet_match_table	*mtable = ptl->ptl_mtables[cpt];
+	int			i;
+
+	/* with hold of both lnet_res_lock(cpt) and lnet_ptl_lock */
+	LASSERT(lnet_ptl_is_wildcard(ptl));
+
+	mtable->mt_enabled = 1;
+
+	/* append at the tail, then bubble it down to its sorted position */
+	ptl->ptl_mt_maps[ptl->ptl_mt_nmaps] = cpt;
+	for (i = ptl->ptl_mt_nmaps - 1; i >= 0; i--) {
+		/* @cpt must not be in the map already */
+		LASSERT(ptl->ptl_mt_maps[i] != cpt);
+		if (ptl->ptl_mt_maps[i] < cpt)
+			break;
+
+		/* swap to order */
+		ptl->ptl_mt_maps[i + 1] = ptl->ptl_mt_maps[i];
+		ptl->ptl_mt_maps[i] = cpt;
+	}
+
+	/* the new entry only counts once it is in place */
+	ptl->ptl_mt_nmaps++;
+}
+
+/*
+ * Mark the match-table of partition @cpt on wildcard portal @ptl as disabled
+ * and remove @cpt from ptl->ptl_mt_maps[].  Nothing is done when there is
+ * only one partition.
+ */
+static void
+lnet_ptl_disable_mt(struct lnet_portal *ptl, int cpt)
+{
+	struct lnet_match_table	*mtable = ptl->ptl_mtables[cpt];
+	int			i;
+
+	/* with hold of both lnet_res_lock(cpt) and lnet_ptl_lock */
+	LASSERT(lnet_ptl_is_wildcard(ptl));
+
+	if (LNET_CPT_NUMBER == 1)
+		return; /* never disable the only match-table */
+
+	mtable->mt_enabled = 0;
+
+	LASSERT(ptl->ptl_mt_nmaps > 0 &&
+		ptl->ptl_mt_nmaps <= LNET_CPT_NUMBER);
+
+	/* remove it from mt_maps */
+	ptl->ptl_mt_nmaps--;
+	/* every entry from @cpt onwards is replaced by its successor; this
+	 * relies on the ascending order kept by lnet_ptl_enable_mt() */
+	for (i = 0; i < ptl->ptl_mt_nmaps; i++) {
+		if (ptl->ptl_mt_maps[i] >= cpt) /* overwrite it */
+			ptl->ptl_mt_maps[i] = ptl->ptl_mt_maps[i + 1];
+	}
+}
+
+/*
+ * Try to match the incoming message described by @info against @md (and the
+ * ME it is attached to).
+ *
+ * Returns a combination of LNET_MATCHMD_* bits:
+ *  - LNET_MATCHMD_NONE (optionally with LNET_MATCHMD_EXHAUSTED) if the MD is
+ *    exhausted or operation, NID, PID or match bits do not match;
+ *  - LNET_MATCHMD_DROP if it matches but the payload does not fit and the MD
+ *    does not allow truncation;
+ *  - LNET_MATCHMD_OK if @msg was attached to @md, with
+ *    LNET_MATCHMD_EXHAUSTED added when this match exhausted the MD.
+ */
+static int
+lnet_try_match_md(lnet_libmd_t *md,
+		  struct lnet_match_info *info, struct lnet_msg *msg)
+{
+	/* ALWAYS called holding the lnet_res_lock, and can't lnet_res_unlock;
+	 * lnet_match_blocked_msg() relies on this to avoid races */
+	unsigned int	offset;
+	unsigned int	mlength;
+	lnet_me_t	*me = md->md_me;
+
+	/* MD exhausted */
+	if (lnet_md_exhausted(md))
+		return LNET_MATCHMD_NONE | LNET_MATCHMD_EXHAUSTED;
+
+	/* mismatched MD op */
+	if ((md->md_options & info->mi_opc) == 0)
+		return LNET_MATCHMD_NONE;
+
+	/* mismatched ME nid/pid? */
+	if (me->me_match_id.nid != LNET_NID_ANY &&
+	    me->me_match_id.nid != info->mi_id.nid)
+		return LNET_MATCHMD_NONE;
+
+	if (me->me_match_id.pid != LNET_PID_ANY &&
+	    me->me_match_id.pid != info->mi_id.pid)
+		return LNET_MATCHMD_NONE;
+
+	/* mismatched ME matchbits? */
+	if (((me->me_match_bits ^ info->mi_mbits) & ~me->me_ignore_bits) != 0)
+		return LNET_MATCHMD_NONE;
+
+	/* Hurrah! This _is_ a match; check it out... */
+
+	/* the offset comes from the MD itself unless the MD lets the remote
+	 * side choose it */
+	if ((md->md_options & LNET_MD_MANAGE_REMOTE) == 0)
+		offset = md->md_offset;
+	else
+		offset = info->mi_roffset;
+
+	/* space available for this message */
+	if ((md->md_options & LNET_MD_MAX_SIZE) != 0) {
+		mlength = md->md_max_size;
+		LASSERT(md->md_offset + mlength <= md->md_length);
+	} else {
+		mlength = md->md_length - offset;
+	}
+
+	if (info->mi_rlength <= mlength) {	/* fits in allowed space */
+		mlength = info->mi_rlength;
+	} else if ((md->md_options & LNET_MD_TRUNCATE) == 0) {
+		/* this packet _really_ is too big */
+		CERROR("Matching packet from %s, match "LPU64
+		       " length %d too big: %d left, %d allowed\n",
+		       libcfs_id2str(info->mi_id), info->mi_mbits,
+		       info->mi_rlength, md->md_length - offset, mlength);
+
+		return LNET_MATCHMD_DROP;
+	}
+	/* else: too big but LNET_MD_TRUNCATE is set, so only the allowed
+	 * mlength bytes are accepted */
+
+	/* Commit to this ME/MD */
+	CDEBUG(D_NET, "Incoming %s index %x from %s of "
+	       "length %d/%d into md "LPX64" [%d] + %d\n",
+	       (info->mi_opc == LNET_MD_OP_PUT) ? "put" : "get",
+	       info->mi_portal, libcfs_id2str(info->mi_id), mlength,
+	       info->mi_rlength, md->md_lh.lh_cookie, md->md_niov, offset);
+
+	lnet_msg_attach_md(msg, md, offset, mlength);
+	/* the next message lands right behind this one */
+	md->md_offset = offset + mlength;
+
+	if (!lnet_md_exhausted(md))
+		return LNET_MATCHMD_OK;
+
+	/* Auto-unlink NOW, so the ME gets unlinked if required.
+	 * We bumped md->md_refcount above so the MD just gets flagged
+	 * for unlink when it is finalized. */
+	if ((md->md_flags & LNET_MD_FLAG_AUTO_UNLINK) != 0)
+		lnet_md_unlink(md);
+
+	return LNET_MATCHMD_OK | LNET_MATCHMD_EXHAUSTED;
+}
+
+/*
+ * Return the match-table that is fixed by the portal type, or NULL if the
+ * caller has to choose one itself (wildcard portal with several
+ * cpu-partitions).
+ */
+static struct lnet_match_table *
+lnet_match2mt(struct lnet_portal *ptl, lnet_process_id_t id, __u64 mbits)
+{
+	/* a single partition means a single match-table */
+	if (LNET_CPT_NUMBER == 1)
+		return ptl->ptl_mtables[0];
+
+	/* no fixed match-table unless the portal is unique */
+	if (!lnet_ptl_is_unique(ptl))
+		return NULL;
+
+	/* unique portal: the match-table is the one hashed by NID */
+	return ptl->ptl_mtables[lnet_cpt_of_nid(id.nid)];
+}
+
+/*
+ * Find the match-table on portal @index that a new ME with the given match
+ * id, match bits, ignore bits and insert position should be attached to.
+ *
+ * Returns NULL if the ME type conflicts with the portal type, or if @pos is
+ * not one of LNET_INS_BEFORE, LNET_INS_AFTER and LNET_INS_LOCAL on a
+ * wildcard portal.
+ */
+struct lnet_match_table *
+lnet_mt_of_attach(unsigned int index, lnet_process_id_t id,
+		  __u64 mbits, __u64 ignore_bits, lnet_ins_pos_t pos)
+{
+	struct lnet_match_table	*fixed;
+	struct lnet_portal	*ptl;
+
+	/* NB: called w/o lock */
+	LASSERT(index < the_lnet.ln_nportals);
+
+	if (!lnet_ptl_match_type(index, id, mbits, ignore_bits))
+		return NULL;
+
+	ptl = the_lnet.ln_portals[index];
+
+	/* unique portal or only one match-table: nothing to choose */
+	fixed = lnet_match2mt(ptl, id, mbits);
+	if (fixed != NULL)
+		return fixed;
+
+	/* wildcard portal, the insert position tells who is posting */
+	if (pos == LNET_INS_LOCAL) {
+		/* posted by cpu-affinity thread */
+		return ptl->ptl_mtables[lnet_cpt_current()];
+	}
+
+	if (pos == LNET_INS_BEFORE || pos == LNET_INS_AFTER) {
+		/* posted by no affinity thread, always hash to specific
+		 * match-table to avoid buffer stealing which is heavy */
+		return ptl->ptl_mtables[ptl->ptl_index % LNET_CPT_NUMBER];
+	}
+
+	return NULL;
+}
+
+/*
+ * Pick the match-table an incoming message should be matched against.
+ *
+ * For a unique portal, or when there is only one cpu-partition, the table
+ * is fixed (see lnet_match2mt()).  For a wildcard portal the choice depends
+ * on the portal_rotor module parameter and on whether the message was
+ * routed (source and destination NIDs are on different networks):
+ *  - rotor off, or rotor not forced on and message not routed: use the
+ *    table of the current partition if it is enabled;
+ *  - otherwise use the partition hashed from the source NID
+ *    (LNET_PTL_ROTOR_HASH_RT and routed) or a round-robin partition;
+ *  - if the chosen table is disabled, fall back to one of the active
+ *    partitions listed in ptl_mt_maps[], if there is any.
+ */
+static struct lnet_match_table *
+lnet_mt_of_match(struct lnet_match_info *info, struct lnet_msg *msg)
+{
+	struct lnet_match_table	*mtable;
+	struct lnet_portal	*ptl;
+	/* nmaps and rotor are unsigned on purpose: ptl_rotor grows forever
+	 * and eventually wraps, and a negative value here would make the
+	 * modulo operations below produce a negative array index */
+	unsigned int		nmaps;
+	unsigned int		rotor;
+	int			routed;
+	int			cpt;
+
+	/* NB: called w/o lock */
+	LASSERT(info->mi_portal < the_lnet.ln_nportals);
+	ptl = the_lnet.ln_portals[info->mi_portal];
+
+	LASSERT(lnet_ptl_is_wildcard(ptl) || lnet_ptl_is_unique(ptl));
+
+	mtable = lnet_match2mt(ptl, info->mi_id, info->mi_mbits);
+	if (mtable != NULL)
+		return mtable;
+
+	/* it's a wildcard portal */
+	routed = LNET_NIDNET(msg->msg_hdr.src_nid) !=
+		 LNET_NIDNET(msg->msg_hdr.dest_nid);
+
+	if (portal_rotor == LNET_PTL_ROTOR_OFF ||
+	    (portal_rotor != LNET_PTL_ROTOR_ON && !routed)) {
+		cpt = lnet_cpt_current();
+		if (ptl->ptl_mtables[cpt]->mt_enabled)
+			return ptl->ptl_mtables[cpt];
+	}
+
+	rotor = ptl->ptl_rotor++; /* get round-robin factor */
+	if (portal_rotor == LNET_PTL_ROTOR_HASH_RT && routed)
+		cpt = lnet_cpt_of_nid(msg->msg_hdr.src_nid);
+	else
+		cpt = rotor % LNET_CPT_NUMBER;
+
+	if (!ptl->ptl_mtables[cpt]->mt_enabled) {
+		/* is there any active entry for this portal? */
+		nmaps = ptl->ptl_mt_nmaps;
+		/* map to an active mtable to avoid heavy "stealing" */
+		if (nmaps != 0) {
+			/* NB: there is possibility that ptl_mt_maps is being
+			 * changed because we are not under protection of
+			 * lnet_ptl_lock, but it shouldn't hurt anything */
+			cpt = ptl->ptl_mt_maps[rotor % nmaps];
+		}
+	}
+
+	return ptl->ptl_mtables[cpt];
+}
+
+/*
+ * Query the "exhausted" bitmap of @mtable.
+ *
+ * With @pos >= 0, return non-zero if hash slot @pos is marked exhausted.
+ * With @pos < 0, return 1 only if every bit of the bitmap is set.
+ * Always returns 0 for a match-table of a portal that is not wildcard.
+ */
+static int
+lnet_mt_test_exhausted(struct lnet_match_table *mtable, int pos)
+{
+	int	word;
+	int	bit;
+
+	if (!lnet_ptl_is_wildcard(the_lnet.ln_portals[mtable->mt_portal]))
+		return 0;
+
+	if (pos >= 0) {
+		LASSERT(pos <= LNET_MT_HASH_IGNORE);
+
+		/* mtable::mt_mhash[pos] is marked as exhausted or not */
+		word = pos >> LNET_MT_BITS_U64;
+		bit  = pos & ((1 << LNET_MT_BITS_U64) - 1);
+
+		return (mtable->mt_exhausted[word] & (1ULL << bit)) != 0;
+	}
+
+	/* check all bits: a single clear bit means "not exhausted" */
+	for (word = 0; word < LNET_MT_EXHAUSTED_BMAP; word++) {
+		if (mtable->mt_exhausted[word] != (__u64)(-1))
+			return 0;
+	}
+
+	return 1;
+}
+
+/*
+ * Set (@exhausted != 0) or clear (@exhausted == 0) the "exhausted" bit of
+ * hash slot @pos in @mtable.  Only valid on a wildcard portal.
+ */
+static void
+lnet_mt_set_exhausted(struct lnet_match_table *mtable, int pos, int exhausted)
+{
+	__u64	*word;
+	__u64	mask;
+
+	LASSERT(lnet_ptl_is_wildcard(the_lnet.ln_portals[mtable->mt_portal]));
+	LASSERT(pos <= LNET_MT_HASH_IGNORE);
+
+	/* locate the bit that stands for mtable::mt_mhash[pos] */
+	word = &mtable->mt_exhausted[pos >> LNET_MT_BITS_U64];
+	mask = 1ULL << (pos & ((1 << LNET_MT_BITS_U64) - 1));
+
+	if (exhausted)
+		*word |= mask;
+	else
+		*word &= ~mask;
+}
+
+/*
+ * Return the hash chain of @mtable that MEs matching @id / @mbits live on.
+ * A wildcard portal is indexed by the low match bits only; a unique portal
+ * is indexed by a hash of match bits, NID and PID.
+ */
+struct list_head *
+lnet_mt_match_head(struct lnet_match_table *mtable,
+		   lnet_process_id_t id, __u64 mbits)
+{
+	struct lnet_portal *ptl = the_lnet.ln_portals[mtable->mt_portal];
+	unsigned long	    hash;
+
+	if (lnet_ptl_is_wildcard(ptl))
+		return &mtable->mt_mhash[mbits & LNET_MT_HASH_MASK];
+
+	/* a portal that is not wildcard has to be unique */
+	LASSERT(lnet_ptl_is_unique(ptl));
+
+	hash = mbits + id.nid + id.pid;
+	hash = cfs_hash_long(hash, LNET_MT_HASH_BITS);
+
+	return &mtable->mt_mhash[hash];
+}
+
+/*
+ * Search @mtable for an MD matching the message described by @info and, if
+ * one is found, attach @msg to it (via lnet_try_match_md()).
+ *
+ * The chain of MEs with ignore bits is scanned first when it is not empty,
+ * then the chain selected by lnet_mt_match_head().
+ *
+ * Returns the result of the successful/dropping match without the
+ * LNET_MATCHMD_EXHAUSTED bit, or, when nothing finished the match,
+ * LNET_MATCHMD_DROP (GET, or portal is not lazy) or LNET_MATCHMD_NONE,
+ * OR-ed with LNET_MATCHMD_EXHAUSTED when the whole match-table of a
+ * wildcard portal turned out to be exhausted.
+ */
+int
+lnet_mt_match_md(struct lnet_match_table *mtable,
+		 struct lnet_match_info *info, struct lnet_msg *msg)
+{
+	struct list_head		*head;
+	lnet_me_t		*me;
+	lnet_me_t		*tmp;
+	int			exhausted = 0;
+	int			rc;
+
+	/* any ME with ignore bits? */
+	if (!list_empty(&mtable->mt_mhash[LNET_MT_HASH_IGNORE]))
+		head = &mtable->mt_mhash[LNET_MT_HASH_IGNORE];
+	else
+		head = lnet_mt_match_head(mtable, info->mi_id, info->mi_mbits);
+ again:
+	/* NB: only wildcard portal needs to return LNET_MATCHMD_EXHAUSTED */
+	/* start by assuming @head is exhausted; any ME/MD below that is not
+	 * exhausted clears this again */
+	if (lnet_ptl_is_wildcard(the_lnet.ln_portals[mtable->mt_portal]))
+		exhausted = LNET_MATCHMD_EXHAUSTED;
+
+	list_for_each_entry_safe(me, tmp, head, me_list) {
+		/* ME attached but MD not attached yet */
+		if (me->me_md == NULL)
+			continue;
+
+		LASSERT(me == me->me_md->md_me);
+
+		rc = lnet_try_match_md(me->me_md, info, msg);
+		if ((rc & LNET_MATCHMD_EXHAUSTED) == 0)
+			exhausted = 0; /* mlist is not empty */
+
+		if ((rc & LNET_MATCHMD_FINISH) != 0) {
+			/* don't return EXHAUSTED bit because we don't know
+			 * whether the mlist is empty or not */
+			return rc & ~LNET_MATCHMD_EXHAUSTED;
+		}
+	}
+
+	if (exhausted == LNET_MATCHMD_EXHAUSTED) { /* @head is exhausted */
+		lnet_mt_set_exhausted(mtable, head - mtable->mt_mhash, 1);
+		/* only report EXHAUSTED when every chain is exhausted */
+		if (!lnet_mt_test_exhausted(mtable, -1))
+			exhausted = 0;
+	}
+
+	if (exhausted == 0 && head == &mtable->mt_mhash[LNET_MT_HASH_IGNORE]) {
+		head = lnet_mt_match_head(mtable, info->mi_id, info->mi_mbits);
+		goto again; /* re-check MEs w/o ignore-bits */
+	}
+
+	/* no match: a GET, or any message on a portal that is not lazy, is
+	 * dropped; otherwise the caller may delay the message */
+	if (info->mi_opc == LNET_MD_OP_GET ||
+	    !lnet_ptl_is_lazy(the_lnet.ln_portals[info->mi_portal]))
+		return LNET_MATCHMD_DROP | exhausted;
+
+	return LNET_MATCHMD_NONE | exhausted;
+}
+
+/*
+ * Handle a message that arrives on a portal which has no type yet, i.e.
+ * before any buffer was posted on it.
+ *
+ * Returns 0 if the portal already has a type and regular matching should
+ * go on.  Otherwise returns LNET_MATCHMD_NONE for a lazy portal (the
+ * message is queued on ptl_msg_delayed if it is ready to be delayed) or
+ * LNET_MATCHMD_DROP for a portal that is not lazy.
+ */
+static int
+lnet_ptl_match_early(struct lnet_portal *ptl, struct lnet_msg *msg)
+{
+	int	rc = 0;
+
+	/* the portal normally has a type already, check it w/o lock */
+	if (likely(lnet_ptl_is_wildcard(ptl) || lnet_ptl_is_unique(ptl)))
+		return 0;
+
+	lnet_ptl_lock(ptl);
+
+	/* check it again with hold of lock */
+	if (!lnet_ptl_is_wildcard(ptl) && !lnet_ptl_is_unique(ptl)) {
+		/* still no type: simply delay or drop this message */
+		if (!lnet_ptl_is_lazy(ptl)) {
+			rc = LNET_MATCHMD_DROP;
+		} else {
+			if (msg->msg_rx_ready_delay) {
+				msg->msg_rx_delayed = 1;
+				list_add_tail(&msg->msg_list,
+					      &ptl->ptl_msg_delayed);
+			}
+			rc = LNET_MATCHMD_NONE;
+		}
+	}
+
+	lnet_ptl_unlock(ptl);
+	return rc;
+}
+
+/*
+ * Slow path for a message on a lazy wildcard portal that found no buffer on
+ * its own match-table: walk the match-tables of the other cpu-partitions
+ * and try to match ("steal") a buffer there.  If nothing can be stolen the
+ * message is queued on ptl_msg_delayed (lazy portal) or dropped.
+ *
+ * While it is being processed the message sits on ptl_msg_stealing, where
+ * lnet_ptl_attach_md() running in another thread may match it.
+ *
+ * Returns LNET_MATCHMD_* bits describing the outcome.
+ */
+static int
+lnet_ptl_match_delay(struct lnet_portal *ptl,
+		     struct lnet_match_info *info, struct lnet_msg *msg)
+{
+	int	first = ptl->ptl_mt_maps[0]; /* read w/o lock */
+	int	rc = 0;
+	int	i;
+
+	/* steal buffer from other CPTs, and delay it if nothing to steal,
+	 * this function is more expensive than a regular match, but we
+	 * don't expect it can happen a lot */
+	LASSERT(lnet_ptl_is_wildcard(ptl));
+
+	for (i = 0; i < LNET_CPT_NUMBER; i++) {
+		struct lnet_match_table *mtable;
+		int			cpt;
+
+		cpt = (first + i) % LNET_CPT_NUMBER;
+		mtable = ptl->ptl_mtables[cpt];
+		/* disabled tables are skipped, except on the first and the
+		 * last round which have list handling to do below */
+		if (i != 0 && i != LNET_CPT_NUMBER - 1 && !mtable->mt_enabled)
+			continue;
+
+		lnet_res_lock(cpt);
+		lnet_ptl_lock(ptl);
+
+		if (i == 0) { /* the first try, attach on stealing list */
+			list_add_tail(&msg->msg_list,
+					  &ptl->ptl_msg_stealing);
+		}
+
+		if (!list_empty(&msg->msg_list)) { /* on stealing list */
+			rc = lnet_mt_match_md(mtable, info, msg);
+
+			if ((rc & LNET_MATCHMD_EXHAUSTED) != 0 &&
+			    mtable->mt_enabled)
+				lnet_ptl_disable_mt(ptl, cpt);
+
+			if ((rc & LNET_MATCHMD_FINISH) != 0)
+				list_del_init(&msg->msg_list);
+
+		} else {
+			/* could be matched by lnet_ptl_attach_md()
+			 * which is called by another thread */
+			rc = msg->msg_md == NULL ?
+			     LNET_MATCHMD_DROP : LNET_MATCHMD_OK;
+		}
+
+		if (!list_empty(&msg->msg_list) && /* not matched yet */
+		    (i == LNET_CPT_NUMBER - 1 || /* the last CPT */
+		     ptl->ptl_mt_nmaps == 0 ||   /* no active CPT */
+		     (ptl->ptl_mt_nmaps == 1 &&  /* the only active CPT */
+		      ptl->ptl_mt_maps[0] == cpt))) {
+			/* nothing to steal, delay or drop */
+			list_del_init(&msg->msg_list);
+
+			if (lnet_ptl_is_lazy(ptl)) {
+				msg->msg_rx_delayed = 1;
+				list_add_tail(&msg->msg_list,
+						  &ptl->ptl_msg_delayed);
+				rc = LNET_MATCHMD_NONE;
+			} else {
+				rc = LNET_MATCHMD_DROP;
+			}
+		}
+
+		lnet_ptl_unlock(ptl);
+		lnet_res_unlock(cpt);
+
+		/* stop once the message is matched, dropped or delayed */
+		if ((rc & LNET_MATCHMD_FINISH) != 0 || msg->msg_rx_delayed)
+			break;
+	}
+
+	return rc;
+}
+
+/*
+ * Match an incoming message against the MDs posted on its portal.
+ *
+ * The portal index is validated, messages arriving on a portal without a
+ * type are handled by lnet_ptl_match_early(), then a match-table is chosen
+ * and searched under lnet_res_lock.  A message that matched nothing but is
+ * ready to be delayed is queued on ptl_msg_delayed (unique portal or single
+ * cpu-partition) or handed to lnet_ptl_match_delay().
+ *
+ * Returns LNET_MATCHMD_* bits; LNET_MATCHMD_EXHAUSTED is never returned to
+ * the caller.
+ */
+int
+lnet_ptl_match_md(struct lnet_match_info *info, struct lnet_msg *msg)
+{
+	struct lnet_match_table	*mtable;
+	struct lnet_portal	*ptl;
+	int			rc;
+
+	CDEBUG(D_NET, "Request from %s of length %d into portal %d "
+	       "MB="LPX64"\n", libcfs_id2str(info->mi_id),
+	       info->mi_rlength, info->mi_portal, info->mi_mbits);
+
+	if (info->mi_portal >= the_lnet.ln_nportals) {
+		/* the last valid index is ln_nportals - 1 */
+		CERROR("Invalid portal %d not in [0-%d]\n",
+		       info->mi_portal, the_lnet.ln_nportals - 1);
+		return LNET_MATCHMD_DROP;
+	}
+
+	ptl = the_lnet.ln_portals[info->mi_portal];
+	rc = lnet_ptl_match_early(ptl, msg);
+	if (rc != 0) /* early message was delayed or dropped */
+		return rc;
+
+	mtable = lnet_mt_of_match(info, msg);
+	lnet_res_lock(mtable->mt_cpt);
+
+	if (the_lnet.ln_shutdown) {
+		rc = LNET_MATCHMD_DROP;
+		goto out1;
+	}
+
+	rc = lnet_mt_match_md(mtable, info, msg);
+	if ((rc & LNET_MATCHMD_EXHAUSTED) != 0 && mtable->mt_enabled) {
+		lnet_ptl_lock(ptl);
+		lnet_ptl_disable_mt(ptl, mtable->mt_cpt);
+		lnet_ptl_unlock(ptl);
+	}
+
+	if ((rc & LNET_MATCHMD_FINISH) != 0)	/* matched or dropping */
+		goto out1;
+
+	if (!msg->msg_rx_ready_delay)
+		goto out1;
+
+	LASSERT(lnet_ptl_is_lazy(ptl));
+	LASSERT(!msg->msg_rx_delayed);
+
+	/* NB: we don't expect "delay" can happen a lot */
+	if (lnet_ptl_is_unique(ptl) || LNET_CPT_NUMBER == 1) {
+		/* no other match-table to steal from, just delay it */
+		lnet_ptl_lock(ptl);
+
+		msg->msg_rx_delayed = 1;
+		list_add_tail(&msg->msg_list, &ptl->ptl_msg_delayed);
+
+		lnet_ptl_unlock(ptl);
+		lnet_res_unlock(mtable->mt_cpt);
+
+	} else  {
+		/* lnet_ptl_match_delay() takes the res lock of each
+		 * partition itself, so release ours first */
+		lnet_res_unlock(mtable->mt_cpt);
+		rc = lnet_ptl_match_delay(ptl, info, msg);
+	}
+
+	if (msg->msg_rx_delayed) {
+		CDEBUG(D_NET,
+		       "Delaying %s from %s ptl %d MB "LPX64" off %d len %d\n",
+		       info->mi_opc == LNET_MD_OP_PUT ? "PUT" : "GET",
+		       libcfs_id2str(info->mi_id), info->mi_portal,
+		       info->mi_mbits, info->mi_roffset, info->mi_rlength);
+	}
+	goto out0;
+ out1:
+	lnet_res_unlock(mtable->mt_cpt);
+ out0:
+	/* EXHAUSTED bit is only meaningful for internal functions */
+	return rc & ~LNET_MATCHMD_EXHAUSTED;
+}
+
+/*
+ * Break the link between @me and @md; the two must currently point at each
+ * other.
+ */
+void
+lnet_ptl_detach_md(lnet_me_t *me, lnet_libmd_t *md)
+{
+	LASSERT(me->me_md == md && md->md_me == me);
+
+	md->md_me = NULL;
+	me->me_md = NULL;
+}
+
+/*
+ * Attach the brand new MD @md to @me and try to satisfy messages that are
+ * waiting on the portal with it.
+ *
+ * Messages on ptl_msg_stealing are tried first, then those on
+ * ptl_msg_delayed.  A matched message from the stealing list is only taken
+ * off that list (the stealing thread completes it); a message from the
+ * delayed list is moved to @matches when it matched, or to @drops when it
+ * has to be dropped.  On a wildcard portal, if the MD is not exhausted
+ * afterwards, the ME's hash slot is marked non-exhausted and the
+ * match-table is re-enabled if necessary.
+ *
+ * called with lnet_res_lock held
+ */
+void
+lnet_ptl_attach_md(lnet_me_t *me, lnet_libmd_t *md,
+		   struct list_head *matches, struct list_head *drops)
+{
+	struct lnet_portal	*ptl = the_lnet.ln_portals[me->me_portal];
+	struct lnet_match_table	*mtable;
+	struct list_head		*head;
+	lnet_msg_t		*tmp;
+	lnet_msg_t		*msg;
+	int			exhausted = 0;
+	int			cpt;
+
+	LASSERT(md->md_refcount == 0); /* a brand new MD */
+
+	me->me_md = md;
+	md->md_me = me;
+
+	cpt = lnet_cpt_of_cookie(md->md_lh.lh_cookie);
+	mtable = ptl->ptl_mtables[cpt];
+
+	/* fast path: nobody is waiting and the slot is not marked
+	 * exhausted, so there is nothing to update */
+	if (list_empty(&ptl->ptl_msg_stealing) &&
+	    list_empty(&ptl->ptl_msg_delayed) &&
+	    !lnet_mt_test_exhausted(mtable, me->me_pos))
+		return;
+
+	lnet_ptl_lock(ptl);
+	head = &ptl->ptl_msg_stealing;
+ again:
+	list_for_each_entry_safe(msg, tmp, head, msg_list) {
+		struct lnet_match_info	info;
+		lnet_hdr_t		*hdr;
+		int			rc;
+
+		LASSERT(msg->msg_rx_delayed || head == &ptl->ptl_msg_stealing);
+
+		/* rebuild the match info from the message header; it is
+		 * always filled in as a PUT here */
+		hdr   = &msg->msg_hdr;
+		info.mi_id.nid	= hdr->src_nid;
+		info.mi_id.pid	= hdr->src_pid;
+		info.mi_opc	= LNET_MD_OP_PUT;
+		info.mi_portal	= hdr->msg.put.ptl_index;
+		info.mi_rlength	= hdr->payload_length;
+		info.mi_roffset	= hdr->msg.put.offset;
+		info.mi_mbits	= hdr->msg.put.match_bits;
+
+		rc = lnet_try_match_md(md, &info, msg);
+
+		exhausted = (rc & LNET_MATCHMD_EXHAUSTED) != 0;
+		if ((rc & LNET_MATCHMD_NONE) != 0) {
+			/* no match; stop if the MD cannot take anything */
+			if (exhausted)
+				break;
+			continue;
+		}
+
+		/* Hurrah! This _is_ a match */
+		LASSERT((rc & LNET_MATCHMD_FINISH) != 0);
+		list_del_init(&msg->msg_list);
+
+		if (head == &ptl->ptl_msg_stealing) {
+			if (exhausted)
+				break;
+			/* stealing thread will handle the message */
+			continue;
+		}
+
+		if ((rc & LNET_MATCHMD_OK) != 0) {
+			list_add_tail(&msg->msg_list, matches);
+
+			CDEBUG(D_NET, "Resuming delayed PUT from %s portal %d "
+			       "match "LPU64" offset %d length %d.\n",
+			       libcfs_id2str(info.mi_id),
+			       info.mi_portal, info.mi_mbits,
+			       info.mi_roffset, info.mi_rlength);
+		} else {
+			list_add_tail(&msg->msg_list, drops);
+		}
+
+		if (exhausted)
+			break;
+	}
+
+	/* second pass over the delayed messages if the MD has room left */
+	if (!exhausted && head == &ptl->ptl_msg_stealing) {
+		head = &ptl->ptl_msg_delayed;
+		goto again;
+	}
+
+	/* the MD can still take messages: make it visible to matching */
+	if (lnet_ptl_is_wildcard(ptl) && !exhausted) {
+		lnet_mt_set_exhausted(mtable, me->me_pos, 0);
+		if (!mtable->mt_enabled)
+			lnet_ptl_enable_mt(ptl, cpt);
+	}
+
+	lnet_ptl_unlock(ptl);
+}
+
+/*
+ * Release everything lnet_ptl_setup() allocated for @ptl.  MEs that are
+ * still on a hash chain are reported and freed.  Safe to call on a portal
+ * that was never or only partially set up.
+ */
+void
+lnet_ptl_cleanup(struct lnet_portal *ptl)
+{
+	struct lnet_match_table	*mtable;
+	int			cpt;
+
+	if (ptl->ptl_mtables == NULL) /* uninitialized portal */
+		return;
+
+	LASSERT(list_empty(&ptl->ptl_msg_delayed));
+	LASSERT(list_empty(&ptl->ptl_msg_stealing));
+
+	cfs_percpt_for_each(mtable, cpt, ptl->ptl_mtables) {
+		struct list_head	*mhash = mtable->mt_mhash;
+		struct list_head	*chain;
+		lnet_me_t		*me;
+
+		if (mhash == NULL) /* uninitialized match-table */
+			continue;
+
+		/* cleanup ME on every chain, including the extra one that
+		 * is for MEs with ignore bits */
+		for (chain = mhash; chain <= &mhash[LNET_MT_HASH_SIZE];
+		     chain++) {
+			while (!list_empty(chain)) {
+				me = list_entry(chain->next,
+						lnet_me_t, me_list);
+				CERROR("Active ME %p on exit\n", me);
+				list_del(&me->me_list);
+				lnet_me_free(me);
+			}
+		}
+
+		LIBCFS_FREE(mhash, sizeof(*mhash) * (LNET_MT_HASH_SIZE + 1));
+	}
+
+	cfs_percpt_free(ptl->ptl_mtables);
+	ptl->ptl_mtables = NULL;
+}
+
+/*
+ * Initialize portal @ptl with index @index: allocate one match-table per
+ * cpu-partition, each with LNET_MT_HASH_SIZE + 1 empty hash chains and an
+ * "exhausted" bitmap with all bits set.
+ *
+ * Returns 0 on success or -ENOMEM, in which case the portal has been
+ * cleaned up again.
+ */
+int
+lnet_ptl_setup(struct lnet_portal *ptl, int index)
+{
+	struct lnet_match_table	*mtable;
+	struct list_head		*mhash;
+	int			cpt;
+	int			slot;
+
+	ptl->ptl_mtables = cfs_percpt_alloc(lnet_cpt_table(),
+					    sizeof(struct lnet_match_table));
+	if (ptl->ptl_mtables == NULL) {
+		CERROR("Failed to create match table for portal %d\n", index);
+		return -ENOMEM;
+	}
+
+	ptl->ptl_index = index;
+	INIT_LIST_HEAD(&ptl->ptl_msg_delayed);
+	INIT_LIST_HEAD(&ptl->ptl_msg_stealing);
+	spin_lock_init(&ptl->ptl_lock);
+
+	cfs_percpt_for_each(mtable, cpt, ptl->ptl_mtables) {
+		/* the extra entry is for MEs with ignore bits */
+		LIBCFS_CPT_ALLOC(mhash, lnet_cpt_table(), cpt,
+				 sizeof(*mhash) * (LNET_MT_HASH_SIZE + 1));
+		if (mhash == NULL) {
+			CERROR("Failed to create match hash for portal %d\n",
+			       index);
+			/* releases the match-tables set up so far as well */
+			lnet_ptl_cleanup(ptl);
+			return -ENOMEM;
+		}
+
+		mtable->mt_portal = index;
+		mtable->mt_cpt = cpt;
+
+		/* nothing is posted yet, so every chain starts exhausted */
+		memset(&mtable->mt_exhausted[0], -1,
+		       sizeof(mtable->mt_exhausted[0]) *
+		       LNET_MT_EXHAUSTED_BMAP);
+
+		mtable->mt_mhash = mhash;
+		for (slot = 0; slot <= LNET_MT_HASH_SIZE; slot++)
+			INIT_LIST_HEAD(&mhash[slot]);
+	}
+
+	return 0;
+}
+
+/*
+ * Clean up every portal and free the portal table.  Does nothing if the
+ * table was never allocated.
+ */
+void
+lnet_portals_destroy(void)
+{
+	int	idx;
+
+	if (the_lnet.ln_portals != NULL) {
+		for (idx = 0; idx < the_lnet.ln_nportals; idx++) {
+			lnet_ptl_cleanup(the_lnet.ln_portals[idx]);
+		}
+
+		cfs_array_free(the_lnet.ln_portals);
+		the_lnet.ln_portals = NULL;
+	}
+}
+
+/*
+ * Allocate the table of MAX_PORTALS portals and set each portal up.
+ *
+ * Returns 0 on success or -ENOMEM; on failure the table is destroyed again.
+ */
+int
+lnet_portals_create(void)
+{
+	int	ptl_size;
+	int	idx;
+
+	/* a portal is allocated up to the end of LNET_CPT_NUMBER entries
+	 * of its ptl_mt_maps[] array */
+	ptl_size = offsetof(struct lnet_portal, ptl_mt_maps[LNET_CPT_NUMBER]);
+
+	the_lnet.ln_nportals = MAX_PORTALS;
+	the_lnet.ln_portals = cfs_array_alloc(the_lnet.ln_nportals, ptl_size);
+	if (the_lnet.ln_portals == NULL) {
+		CERROR("Failed to allocate portals table\n");
+		return -ENOMEM;
+	}
+
+	for (idx = 0; idx < the_lnet.ln_nportals; idx++) {
+		if (lnet_ptl_setup(the_lnet.ln_portals[idx], idx) == 0)
+			continue;
+
+		lnet_portals_destroy();
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/**
+ * Turn on the lazy portal attribute. Use with caution!
+ *
+ * This portal attribute only affects incoming PUT requests to the portal,
+ * and is off by default. By default, if there's no matching MD for an
+ * incoming PUT request, it is simply dropped. With the lazy attribute on,
+ * such requests are queued indefinitely until either a matching MD is
+ * posted to the portal or the lazy attribute is turned off.
+ *
+ * It would prevent dropped requests, however it should be regarded as the
+ * last line of defense - i.e. users must keep a close watch on active
+ * buffers on a lazy portal and once it becomes too low post more buffers as
+ * soon as possible. This is because delayed requests usually have detrimental
+ * effects on underlying network connections. A few delayed requests often
+ * suffice to bring an underlying connection to a complete halt, due to flow
+ * control mechanisms.
+ *
+ * There's also a DOS attack risk. If users don't post match-all MDs on a
+ * lazy portal, a malicious peer can easily stop a service by sending some
+ * PUT requests with match bits that won't match any MD. A routed server is
+ * especially vulnerable since the connections to its neighbor routers are
+ * shared among all clients.
+ *
+ * \param portal Index of the portal to enable the lazy attribute on.
+ *
+ * \retval 0       On success.
+ * \retval -EINVAL If \a portal is not a valid index.
+ */
+int
+LNetSetLazyPortal(int portal)
+{
+	struct lnet_portal *ptl;
+
+	/* reject anything outside of the portal table */
+	if (!(portal >= 0 && portal < the_lnet.ln_nportals))
+		return -EINVAL;
+
+	ptl = the_lnet.ln_portals[portal];
+	CDEBUG(D_NET, "Setting portal %d lazy\n", portal);
+
+	/* the option is changed with the exclusive resource lock and the
+	 * portal lock both held */
+	lnet_res_lock(LNET_LOCK_EX);
+	lnet_ptl_lock(ptl);
+
+	lnet_ptl_setopt(ptl, LNET_PTL_LAZY);
+
+	lnet_ptl_unlock(ptl);
+	lnet_res_unlock(LNET_LOCK_EX);
+
+	return 0;
+}
+EXPORT_SYMBOL(LNetSetLazyPortal);
+
+/**
+ * Turn off the lazy portal attribute. Delayed requests on the portal,
+ * if any, will be all dropped when this function returns.
+ *
+ * \param portal Index of the portal to disable the lazy attribute on.
+ *
+ * \retval 0       On success.
+ * \retval -EINVAL If \a portal is not a valid index.
+ */
+int
+LNetClearLazyPortal(int portal)
+{
+	struct lnet_portal	*ptl;
+	LIST_HEAD		(zombies);
+	int			was_lazy;
+
+	if (!(portal >= 0 && portal < the_lnet.ln_nportals))
+		return -EINVAL;
+
+	ptl = the_lnet.ln_portals[portal];
+
+	lnet_res_lock(LNET_LOCK_EX);
+	lnet_ptl_lock(ptl);
+
+	was_lazy = lnet_ptl_is_lazy(ptl) ? 1 : 0;
+	if (was_lazy) {
+		if (the_lnet.ln_shutdown)
+			CWARN("Active lazy portal %d on exit\n", portal);
+		else
+			CDEBUG(D_NET, "clearing portal %d lazy\n", portal);
+
+		/* grab all the blocked messages atomically */
+		list_splice_init(&ptl->ptl_msg_delayed, &zombies);
+
+		lnet_ptl_unsetopt(ptl, LNET_PTL_LAZY);
+	}
+
+	lnet_ptl_unlock(ptl);
+	lnet_res_unlock(LNET_LOCK_EX);
+
+	/* the messages taken off the portal are dropped w/o any lock held;
+	 * a portal that was not lazy has nothing to drop */
+	if (was_lazy)
+		lnet_drop_delayed_msg_list(&zombies,
+					   "Clearing lazy portal attr");
+
+	return 0;
+}
+EXPORT_SYMBOL(LNetClearLazyPortal);

diff --git a/drivers/staging/lustre/lnet/lnet/lo.c b/drivers/staging/lustre/lnet/lnet/lo.c
new file mode 100644
index 0000000..670dae3
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/lo.c

@@ -0,0 +1,120 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include <linux/lnet/lib-lnet.h>
+
+/*
+ * Send on the loopback LND: the message is passed straight to
+ * lnet_parse() with this NI's own NID as the sender, and the message
+ * itself as the private data.  Returns what lnet_parse() returns.
+ */
+int
+lolnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg)
+{
+	int	rc;
+
+	LASSERT(!lntmsg->msg_routing);
+	LASSERT(!lntmsg->msg_target_is_router);
+
+	rc = lnet_parse(ni, &lntmsg->msg_hdr, ni->ni_nid, lntmsg, 0);
+
+	return rc;
+}
+
+/*
+ * Receive on the loopback LND.  @private is the message that was sent;
+ * unless the payload is being discarded (@lntmsg == NULL), @mlen bytes are
+ * copied from the sender's buffers into the receive buffers and the
+ * receiving message is finalized.  The sending message is always
+ * finalized.  Returns 0.
+ */
+int
+lolnd_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg,
+	   int delayed, unsigned int niov,
+	   struct iovec *iov, lnet_kiov_t *kiov,
+	   unsigned int offset, unsigned int mlen, unsigned int rlen)
+{
+	lnet_msg_t *sendmsg = private;
+
+	if (lntmsg != NULL) {		   /* not discarding */
+		int	src_is_iov = sendmsg->msg_iov != NULL;
+		int	dst_is_iov = iov != NULL;
+
+		/* pick the copy routine for the buffer kinds on both sides */
+		if (src_is_iov && dst_is_iov)
+			lnet_copy_iov2iov(niov, iov, offset,
+					  sendmsg->msg_niov,
+					  sendmsg->msg_iov,
+					  sendmsg->msg_offset, mlen);
+		else if (src_is_iov)
+			lnet_copy_iov2kiov(niov, kiov, offset,
+					   sendmsg->msg_niov,
+					   sendmsg->msg_iov,
+					   sendmsg->msg_offset, mlen);
+		else if (dst_is_iov)
+			lnet_copy_kiov2iov(niov, iov, offset,
+					   sendmsg->msg_niov,
+					   sendmsg->msg_kiov,
+					   sendmsg->msg_offset, mlen);
+		else
+			lnet_copy_kiov2kiov(niov, kiov, offset,
+					    sendmsg->msg_niov,
+					    sendmsg->msg_kiov,
+					    sendmsg->msg_offset, mlen);
+
+		lnet_finalize(ni, lntmsg, 0);
+	}
+
+	lnet_finalize(ni, sendmsg, 0);
+	return 0;
+}
+
+/* non-zero between lolnd_startup() and lolnd_shutdown() */
+static int lolnd_instanced;
+
+/*
+ * Shut the loopback LND down; it must have been started before.
+ */
+void
+lolnd_shutdown(lnet_ni_t *ni)
+{
+	CDEBUG(D_NET, "shutdown\n");
+	LASSERT(lolnd_instanced);
+
+	lolnd_instanced = 0;
+}
+
+/*
+ * Start the loopback LND on @ni, which must be using the_lolnd.  Only one
+ * instance may be started at a time.  Returns 0.
+ */
+int
+lolnd_startup(lnet_ni_t *ni)
+{
+	LASSERT(ni->ni_lnd == &the_lolnd);
+	LASSERT(!lolnd_instanced);
+
+	lolnd_instanced = 1;
+	return 0;
+}
+
+/* Descriptor of the loopback LND.  This is a positional initializer, so the
+ * entries have to stay in the order in which lnd_t declares its fields; the
+ * comment in front of each value names the field it is meant for. */
+lnd_t the_lolnd = {
+	/* .lnd_list       = */ {&the_lolnd.lnd_list, &the_lolnd.lnd_list},
+	/* .lnd_refcount   = */ 0,
+	/* .lnd_type       = */ LOLND,
+	/* .lnd_startup    = */ lolnd_startup,
+	/* .lnd_shutdown   = */ lolnd_shutdown,
+	/* .lnt_ctl	= */ NULL,
+	/* .lnd_send       = */ lolnd_send,
+	/* .lnd_recv       = */ lolnd_recv,
+	/* .lnd_eager_recv = */ NULL,
+	/* .lnd_notify     = */ NULL,
+	/* .lnd_accept     = */ NULL
+};

diff --git a/drivers/staging/lustre/lnet/lnet/module.c b/drivers/staging/lustre/lnet/lnet/module.c
new file mode 100644
index 0000000..c832385
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/module.c

@@ -0,0 +1,154 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include <linux/lnet/lib-lnet.h>
+
+/* When non-zero, init_lnet() starts a thread that runs lnet_configure() */
+static int config_on_load = 0;
+CFS_MODULE_PARM(config_on_load, "i", int, 0444,
+		"configure network at module load");
+
+/* Serializes lnet_configure() and lnet_unconfigure() */
+static struct mutex lnet_config_mutex;
+
+/*
+ * Bring the network up on behalf of this module, unless that was already
+ * done.  Returns 0 on success or the negative error from LNetNIInit().
+ *
+ * 'arg' is unused; it only exists so that this function can be run as a
+ * thread function.
+ */
+int
+lnet_configure(void *arg)
+{
+	int	err = 0;
+
+	LNET_MUTEX_LOCK(&lnet_config_mutex);
+
+	if (the_lnet.ln_niinit_self)
+		goto unlock;
+
+	err = LNetNIInit(LUSTRE_SRV_LNET_PID);
+	if (err < 0)
+		goto unlock;
+
+	/* remember that the reference taken above belongs to us */
+	the_lnet.ln_niinit_self = 1;
+	err = 0;
+ unlock:
+	LNET_MUTEX_UNLOCK(&lnet_config_mutex);
+	return err;
+}
+
+/*
+ * Drop the network reference taken by lnet_configure(), if any.
+ * Returns 0 if the network is no longer referenced afterwards, -EBUSY if
+ * the_lnet.ln_refcount is still non-zero.
+ */
+int
+lnet_unconfigure(void)
+{
+	int	busy;
+
+	LNET_MUTEX_LOCK(&lnet_config_mutex);
+
+	if (the_lnet.ln_niinit_self) {
+		the_lnet.ln_niinit_self = 0;
+		LNetNIFini();
+	}
+
+	/* sample the reference count under the API mutex */
+	LNET_MUTEX_LOCK(&the_lnet.ln_api_mutex);
+	busy = the_lnet.ln_refcount != 0;
+	LNET_MUTEX_UNLOCK(&the_lnet.ln_api_mutex);
+
+	LNET_MUTEX_UNLOCK(&lnet_config_mutex);
+
+	if (busy)
+		return -EBUSY;
+
+	return 0;
+}
+
+/*
+ * ioctl entry point of LNet.  IOC_LIBCFS_CONFIGURE and
+ * IOC_LIBCFS_UNCONFIGURE are handled here; every other command is passed
+ * to LNetCtl() while a reference on the network is held.
+ */
+int
+lnet_ioctl(unsigned int cmd, struct libcfs_ioctl_data *data)
+{
+	int	rc;
+
+	if (cmd == IOC_LIBCFS_CONFIGURE)
+		return lnet_configure(NULL);
+
+	if (cmd == IOC_LIBCFS_UNCONFIGURE)
+		return lnet_unconfigure();
+
+	/* Passing LNET_PID_ANY only gives me a ref if the net is up
+	 * already; I'll need it to ensure the net can't go down while
+	 * I'm called into it */
+	rc = LNetNIInit(LNET_PID_ANY);
+	if (rc < 0)
+		return rc;
+
+	rc = LNetCtl(cmd, data);
+	LNetNIFini();
+
+	return rc;
+}
+
+DECLARE_IOCTL_HANDLER(lnet_ioctl_handler, lnet_ioctl);
+
+/*
+ * Module initialization: set up the config mutex, initialize LNet and
+ * register the ioctl handler.  If the config_on_load parameter is set, the
+ * network is configured from a separate thread.
+ *
+ * Returns 0 on success or the error from LNetInit().
+ */
+int
+init_lnet(void)
+{
+	int	rc;
+	ENTRY;
+
+	mutex_init(&lnet_config_mutex);
+
+	rc = LNetInit();
+	if (rc != 0) {
+		CERROR("LNetInit: error %d\n", rc);
+		RETURN(rc);
+	}
+
+	rc = libcfs_register_ioctl(&lnet_ioctl_handler);
+	LASSERT(rc == 0);
+
+	if (!config_on_load) {
+		RETURN(0);
+	}
+
+	/* Have to schedule a separate thread to avoid deadlocking
+	 * in modload */
+	(void) kthread_run(lnet_configure, NULL, "lnet_initd");
+
+	RETURN(0);
+}
+
+/*
+ * Module cleanup: deregister the ioctl handler, then shut LNet down.
+ */
+void
+fini_lnet(void)
+{
+	int	rc = libcfs_deregister_ioctl(&lnet_ioctl_handler);
+
+	LASSERT(rc == 0);
+
+	LNetFini();
+}
+
+/* Module metadata */
+MODULE_AUTHOR("Peter J. Braam <braam@clusterfs.com>");
+MODULE_DESCRIPTION("Portals v3.1");
+MODULE_LICENSE("GPL");
+
+/* presumably registers init_lnet()/fini_lnet() as the load/unload hooks of
+ * the "lnet" module - confirm against the cfs_module() definition */
+cfs_module(lnet, "1.0.0", init_lnet, fini_lnet);

diff --git a/drivers/staging/lustre/lnet/lnet/peer.c b/drivers/staging/lustre/lnet/lnet/peer.c
new file mode 100644
index 0000000..2869776
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/peer.c

@@ -0,0 +1,337 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/lnet/peer.c
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/lnet/lib-lnet.h>
+
+/*
+ * Allocate the cpu-partition peer tables (the_lnet.ln_peer_tables) and
+ * give each table an empty deathrow list and a hash of
+ * LNET_PEER_HASH_SIZE empty buckets.
+ *
+ * Returns 0 on success or -ENOMEM.  If a hash allocation fails,
+ * whatever was set up so far is released with lnet_peer_tables_destroy().
+ */
+int
+lnet_peer_tables_create(void)
+{
+	struct lnet_peer_table	*ptable;
+	struct list_head		*hash;
+	int			i;
+	int			j;
+
+	the_lnet.ln_peer_tables = cfs_percpt_alloc(lnet_cpt_table(),
+						   sizeof(*ptable));
+	if (the_lnet.ln_peer_tables == NULL) {
+		CERROR("Failed to allocate cpu-partition peer tables\n");
+		return -ENOMEM;
+	}
+
+	cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) {
+		INIT_LIST_HEAD(&ptable->pt_deathrow);
+
+		LIBCFS_CPT_ALLOC(hash, lnet_cpt_table(), i,
+				 LNET_PEER_HASH_SIZE * sizeof(*hash));
+		if (hash == NULL) {
+			CERROR("Failed to create peer hash table\n");
+			lnet_peer_tables_destroy();
+			return -ENOMEM;
+		}
+
+		for (j = 0; j < LNET_PEER_HASH_SIZE; j++)
+			INIT_LIST_HEAD(&hash[j]);
+		/* publish the hash only once every bucket is initialised;
+		 * lnet_peer_tables_destroy() keys off this pointer */
+		ptable->pt_hash = hash; /* sign of initialization */
+	}
+
+	return 0;
+}
+
+/*
+ * Free every peer hash and then the peer tables themselves; a no-op if
+ * the tables were never allocated.
+ *
+ * Also used to unwind a partially completed lnet_peer_tables_create():
+ * the first table whose pt_hash is NULL marks where creation stopped.
+ * The deathrow list and all hash buckets of each initialised table are
+ * asserted to be empty.
+ */
+void
+lnet_peer_tables_destroy(void)
+{
+	struct lnet_peer_table	*ptable;
+	struct list_head		*hash;
+	int			i;
+	int			j;
+
+	if (the_lnet.ln_peer_tables == NULL)
+		return;
+
+	cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) {
+		hash = ptable->pt_hash;
+		if (hash == NULL) /* not initialized */
+			break;
+
+		LASSERT(list_empty(&ptable->pt_deathrow));
+
+		ptable->pt_hash = NULL;
+		for (j = 0; j < LNET_PEER_HASH_SIZE; j++)
+			LASSERT(list_empty(&hash[j]));
+
+		LIBCFS_FREE(hash, LNET_PEER_HASH_SIZE * sizeof(*hash));
+	}
+
+	cfs_percpt_free(the_lnet.ln_peer_tables);
+	the_lnet.ln_peer_tables = NULL;
+}
+
+/*
+ * Shutdown-time teardown of all peers; the_lnet.ln_shutdown must
+ * already be set.
+ *
+ * Pass 1 removes every peer from its hash bucket and drops the hash
+ * table's reference on it.
+ * Pass 2 waits for each table's pt_number to reach zero, pausing
+ * between polls, and then frees the peers parked on that table's
+ * deathrow list.
+ */
+void
+lnet_peer_tables_cleanup(void)
+{
+	struct lnet_peer_table	*ptable;
+	int			i;
+	int			j;
+
+	LASSERT(the_lnet.ln_shutdown);	/* i.e. no new peers */
+
+	cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) {
+		lnet_net_lock(i);
+
+		for (j = 0; j < LNET_PEER_HASH_SIZE; j++) {
+			struct list_head *peers = &ptable->pt_hash[j];
+
+			while (!list_empty(peers)) {
+				lnet_peer_t *lp = list_entry(peers->next,
+								 lnet_peer_t,
+								 lp_hashlist);
+				/* list_del_init: lnet_destroy_peer_locked()
+				 * asserts lp_hashlist is empty */
+				list_del_init(&lp->lp_hashlist);
+				/* lose hash table's ref */
+				lnet_peer_decref_locked(lp);
+			}
+		}
+
+		lnet_net_unlock(i);
+	}
+
+	cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) {
+		LIST_HEAD	(deathrow);
+		lnet_peer_t	*lp;
+
+		lnet_net_lock(i);
+
+		for (j = 3; ptable->pt_number != 0; j++) {
+			lnet_net_unlock(i);
+
+			/* warn only when j is a power of two, so the
+			 * message becomes progressively rarer */
+			if ((j & (j - 1)) == 0) {
+				CDEBUG(D_WARNING,
+				       "Waiting for %d peers on peer table\n",
+				       ptable->pt_number);
+			}
+			cfs_pause(cfs_time_seconds(1) / 2);
+			lnet_net_lock(i);
+		}
+		/* take the whole deathrow list private so the peers can be
+		 * freed without holding the lock */
+		list_splice_init(&ptable->pt_deathrow, &deathrow);
+
+		lnet_net_unlock(i);
+
+		while (!list_empty(&deathrow)) {
+			lp = list_entry(deathrow.next,
+					    lnet_peer_t, lp_hashlist);
+			list_del(&lp->lp_hashlist);
+			LIBCFS_FREE(lp, sizeof(*lp));
+		}
+	}
+}
+
+/*
+ * Retire a peer that is unreferenced, unhashed and has nothing queued
+ * (all asserted below).
+ *
+ * The peer is not freed here: it is uncounted from its table, its NI
+ * reference is dropped and it is parked on the table's deathrow list,
+ * from which lnet_nid2peer_locked() may recycle it and
+ * lnet_peer_tables_cleanup() eventually frees it.
+ *
+ * NOTE(review): presumably called with lnet_net_lock held for
+ * lp->lp_cpt, as the _locked suffix suggests -- confirm against callers.
+ */
+void
+lnet_destroy_peer_locked(lnet_peer_t *lp)
+{
+	struct lnet_peer_table *ptable;
+
+	LASSERT(lp->lp_refcount == 0);
+	LASSERT(lp->lp_rtr_refcount == 0);
+	LASSERT(list_empty(&lp->lp_txq));
+	LASSERT(list_empty(&lp->lp_hashlist));
+	LASSERT(lp->lp_txqnob == 0);
+
+	ptable = the_lnet.ln_peer_tables[lp->lp_cpt];
+	LASSERT(ptable->pt_number > 0);
+	ptable->pt_number--;
+
+	lnet_ni_decref_locked(lp->lp_ni, lp->lp_cpt);
+	lp->lp_ni = NULL;
+
+	/* lp_hashlist doubles as the deathrow linkage */
+	list_add(&lp->lp_hashlist, &ptable->pt_deathrow);
+}
+
+/*
+ * Search \a ptable for the peer with NID \a nid.
+ *
+ * Returns the peer with an extra reference taken for the caller, or
+ * NULL if no such peer is hashed.  Must not be called once
+ * the_lnet.ln_shutdown is set.
+ */
+lnet_peer_t *
+lnet_find_peer_locked(struct lnet_peer_table *ptable, lnet_nid_t nid)
+{
+	struct list_head	*bucket;
+	lnet_peer_t	*peer;
+
+	LASSERT(!the_lnet.ln_shutdown);
+
+	bucket = &ptable->pt_hash[lnet_nid2peerhash(nid)];
+	list_for_each_entry(peer, bucket, lp_hashlist) {
+		if (peer->lp_nid != nid)
+			continue;
+
+		/* match: hand back a referenced peer */
+		lnet_peer_addref_locked(peer);
+		return peer;
+	}
+
+	return NULL;
+}
+
+/*
+ * Find the peer for \a nid, creating (or recycling from deathrow) and
+ * hashing a new one if it does not exist yet.
+ *
+ * lpp	set to the peer, with a reference for the caller, on success;
+ *	set to NULL on entry and left NULL on failure
+ * nid	NID of the wanted peer
+ * cpt	index passed to lnet_net_lock()/lnet_net_unlock(); may be
+ *	LNET_LOCK_EX, in which case the peer's CPT is derived from nid
+ *
+ * The net lock for \a cpt is dropped while a new peer is allocated and
+ * initialised, and is held again on every return after that point.
+ *
+ * Returns 0 on success, -ESHUTDOWN if LNet is shutting down, -ENOMEM if
+ * a peer cannot be allocated, or -EHOSTUNREACH if no local NI serves
+ * the NID's network.
+ */
+int
+lnet_nid2peer_locked(lnet_peer_t **lpp, lnet_nid_t nid, int cpt)
+{
+	struct lnet_peer_table	*ptable;
+	lnet_peer_t		*lp = NULL;
+	lnet_peer_t		*lp2;
+	int			cpt2;
+	int			rc = 0;
+
+	*lpp = NULL;
+	if (the_lnet.ln_shutdown) /* it's shutting down */
+		return -ESHUTDOWN;
+
+	/* cpt can be LNET_LOCK_EX if it's called from router functions */
+	cpt2 = cpt != LNET_LOCK_EX ? cpt : lnet_cpt_of_nid_locked(nid);
+
+	ptable = the_lnet.ln_peer_tables[cpt2];
+	lp = lnet_find_peer_locked(ptable, nid);
+	if (lp != NULL) {
+		*lpp = lp;
+		return 0;
+	}
+
+	/* prefer recycling a retired peer over a fresh allocation */
+	if (!list_empty(&ptable->pt_deathrow)) {
+		lp = list_entry(ptable->pt_deathrow.next,
+				    lnet_peer_t, lp_hashlist);
+		list_del(&lp->lp_hashlist);
+	}
+
+	/*
+	 * take extra refcount in case another thread has shutdown LNet
+	 * and destroyed locks and peer-table before I finish the allocation
+	 */
+	ptable->pt_number++;
+	lnet_net_unlock(cpt);
+
+	if (lp != NULL)
+		memset(lp, 0, sizeof(*lp));
+	else
+		LIBCFS_CPT_ALLOC(lp, lnet_cpt_table(), cpt2, sizeof(*lp));
+
+	if (lp == NULL) {
+		rc = -ENOMEM;
+		lnet_net_lock(cpt);
+		goto out;
+	}
+
+	INIT_LIST_HEAD(&lp->lp_txq);
+	INIT_LIST_HEAD(&lp->lp_rtrq);
+	INIT_LIST_HEAD(&lp->lp_routes);
+
+	lp->lp_notify = 0;
+	lp->lp_notifylnd = 0;
+	lp->lp_notifying = 0;
+	lp->lp_alive_count = 0;
+	lp->lp_timestamp = 0;
+	lp->lp_alive = !lnet_peers_start_down(); /* 1 bit!! */
+	lp->lp_last_alive = cfs_time_current(); /* assumes alive */
+	lp->lp_last_query = 0; /* haven't asked NI yet */
+	lp->lp_ping_timestamp = 0;
+	lp->lp_ping_feats = LNET_PING_FEAT_INVAL;
+	lp->lp_nid = nid;
+	lp->lp_cpt = cpt2;
+	lp->lp_refcount = 2;	/* 1 for caller; 1 for hash */
+	lp->lp_rtr_refcount = 0;
+
+	lnet_net_lock(cpt);
+
+	/* the lock was dropped: re-check everything that may have changed */
+	if (the_lnet.ln_shutdown) {
+		rc = -ESHUTDOWN;
+		goto out;
+	}
+
+	/* someone else may have hashed this NID in the meantime; use
+	 * theirs (rc stays 0) and retire the peer prepared here */
+	lp2 = lnet_find_peer_locked(ptable, nid);
+	if (lp2 != NULL) {
+		*lpp = lp2;
+		goto out;
+	}
+
+	lp->lp_ni = lnet_net2ni_locked(LNET_NIDNET(nid), cpt2);
+	if (lp->lp_ni == NULL) {
+		rc = -EHOSTUNREACH;
+		goto out;
+	}
+
+	lp->lp_txcredits    =
+	lp->lp_mintxcredits = lp->lp_ni->ni_peertxcredits;
+	lp->lp_rtrcredits    =
+	lp->lp_minrtrcredits = lnet_peer_buffer_credits(lp->lp_ni);
+
+	list_add_tail(&lp->lp_hashlist,
+			  &ptable->pt_hash[lnet_nid2peerhash(nid)]);
+	ptable->pt_version++;
+	*lpp = lp;
+
+	/* NB pt_number stays incremented: it now counts the new peer */
+	return 0;
+out:
+	/* the unused peer goes (back) on deathrow and the provisional
+	 * pt_number increment is undone */
+	if (lp != NULL)
+		list_add(&lp->lp_hashlist, &ptable->pt_deathrow);
+	ptable->pt_number--;
+	return rc;
+}
+
+/*
+ * Log (at D_WARNING) one line describing the peer with NID \a nid:
+ * NID, refcount, aliveness, and its tx/router credit counters and
+ * queued byte count.
+ *
+ * NB the lookup goes through lnet_nid2peer_locked(), so a peer that
+ * does not exist yet is created by this call.
+ */
+void
+lnet_debug_peer(lnet_nid_t nid)
+{
+	char		*aliveness = "NA";
+	lnet_peer_t	*lp;
+	int		rc;
+	int		cpt;
+
+	cpt = lnet_cpt_of_nid(nid);
+	lnet_net_lock(cpt);
+
+	rc = lnet_nid2peer_locked(&lp, nid, cpt);
+	if (rc != 0) {
+		lnet_net_unlock(cpt);
+		CDEBUG(D_WARNING, "No peer %s\n", libcfs_nid2str(nid));
+		return;
+	}
+
+	/* aliveness stays "NA" unless the peer is a router or has
+	 * aliveness checking enabled */
+	if (lnet_isrouter(lp) || lnet_peer_aliveness_enabled(lp))
+		aliveness = lp->lp_alive ? "up" : "down";
+
+	CDEBUG(D_WARNING, "%-24s %4d %5s %5d %5d %5d %5d %5d %ld\n",
+	       libcfs_nid2str(lp->lp_nid), lp->lp_refcount,
+	       aliveness, lp->lp_ni->ni_peertxcredits,
+	       lp->lp_rtrcredits, lp->lp_minrtrcredits,
+	       lp->lp_txcredits, lp->lp_mintxcredits, lp->lp_txqnob);
+
+	/* drop the reference lnet_nid2peer_locked() took for us */
+	lnet_peer_decref_locked(lp);
+
+	lnet_net_unlock(cpt);
+}

diff --git a/drivers/staging/lustre/lnet/lnet/router.c b/drivers/staging/lustre/lnet/lnet/router.c
new file mode 100644
index 0000000..a326ce0
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/router.c

@@ -0,0 +1,1694 @@
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ *
+ *   This file is part of Portals
+ *   http://sourceforge.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include <linux/lnet/lib-lnet.h>
+
+#if  defined(LNET_ROUTER)
+
+/* Router buffer counts: each LNET_NRB_<size> value is four times the
+ * corresponding per-CPT minimum */
+#define LNET_NRB_TINY_MIN	512	/* min value for each CPT */
+#define LNET_NRB_TINY		(LNET_NRB_TINY_MIN * 4)
+#define LNET_NRB_SMALL_MIN	4096	/* min value for each CPT */
+#define LNET_NRB_SMALL		(LNET_NRB_SMALL_MIN * 4)
+#define LNET_NRB_LARGE_MIN	256	/* min value for each CPT */
+#define LNET_NRB_LARGE		(LNET_NRB_LARGE_MIN * 4)
+
+/* Module parameters that only exist when built with LNET_ROUTER */
+static char *forwarding = "";
+CFS_MODULE_PARM(forwarding, "s", charp, 0444,
+		"Explicitly enable/disable forwarding between networks");
+
+static int tiny_router_buffers;
+CFS_MODULE_PARM(tiny_router_buffers, "i", int, 0444,
+		"# of 0 payload messages to buffer in the router");
+static int small_router_buffers;
+CFS_MODULE_PARM(small_router_buffers, "i", int, 0444,
+		"# of small (1 page) messages to buffer in the router");
+static int large_router_buffers;
+CFS_MODULE_PARM(large_router_buffers, "i", int, 0444,
+		"# of large messages to buffer in the router");
+/* 0 means "not set"; see lnet_peer_buffer_credits() for the fallback */
+static int peer_buffer_credits = 0;
+CFS_MODULE_PARM(peer_buffer_credits, "i", int, 0444,
+		"# router buffer credits per peer");
+
+static int auto_down = 1;
+CFS_MODULE_PARM(auto_down, "i", int, 0444,
+		"Automatically mark peers down on comms error");
+
+/*
+ * Number of router buffer credits to give each peer of \a ni.
+ *
+ * Precedence: the NI's own ni_peerrtrcredits, then the
+ * peer_buffer_credits module parameter, then the NI's ni_peertxcredits;
+ * the first two are used only if positive.
+ */
+int
+lnet_peer_buffer_credits(lnet_ni_t *ni)
+{
+	int credits;
+
+	if (ni->ni_peerrtrcredits > 0) {
+		/* NI option overrides LNet default */
+		credits = ni->ni_peerrtrcredits;
+	} else if (peer_buffer_credits > 0) {
+		credits = peer_buffer_credits;
+	} else {
+		/* As an approximation, allow this peer the same number of
+		 * router buffers as it is allowed outstanding sends */
+		credits = ni->ni_peertxcredits;
+	}
+
+	return credits;
+}
+
+/* forward declaration (static, so the definition must appear later in
+ * this file) */
+static int lnet_router_checker(void *);
+#else
+
+/* Variant used when LNET_ROUTER is not defined: peers get no router
+ * buffer credits */
+int
+lnet_peer_buffer_credits(lnet_ni_t *ni)
+{
+	return 0;
+}
+
+#endif
+
+/* Router health-checking tunables.  The three intervals/timeouts are in
+ * seconds (see the parameter descriptions). */
+
+/* also determines the initial lp_alive of new peers, via
+ * lnet_peers_start_down() */
+static int check_routers_before_use = 0;
+CFS_MODULE_PARM(check_routers_before_use, "i", int, 0444,
+		"Assume routers are down and ping them before use");
+
+static int avoid_asym_router_failure = 1;
+CFS_MODULE_PARM(avoid_asym_router_failure, "i", int, 0644,
+		"Avoid asymmetrical router failures (0 to disable)");
+
+static int dead_router_check_interval = 60;
+CFS_MODULE_PARM(dead_router_check_interval, "i", int, 0644,
+		"Seconds between dead router health checks (<= 0 to disable)");
+
+static int live_router_check_interval = 60;
+CFS_MODULE_PARM(live_router_check_interval, "i", int, 0644,
+		"Seconds between live router health checks (<= 0 to disable)");
+
+static int router_ping_timeout = 50;
+CFS_MODULE_PARM(router_ping_timeout, "i", int, 0644,
+		"Seconds to wait for the reply to a router health query");
+
+/* Non-zero if new peers should start out marked down, i.e. if the
+ * check_routers_before_use module parameter is set */
+int
+lnet_peers_start_down(void)
+{
+	return check_routers_before_use;
+}
+
+/*
+ * Record an aliveness report about peer \a lp.
+ *
+ * lp		peer the report is about
+ * notifylnd	OR-ed into lp_notifylnd; lnet_ni_notify_locked() calls
+ *		the LND's notify hook only if this ends up non-zero
+ * alive	non-zero if the peer is reported alive
+ * when		time of the report; a report older than lp_timestamp is
+ *		discarded
+ *
+ * This only marks a notification as pending (lp_notify); it is
+ * delivered later by lnet_ni_notify_locked().
+ */
+void
+lnet_notify_locked(lnet_peer_t *lp, int notifylnd, int alive, cfs_time_t when)
+{
+	if (cfs_time_before(when, lp->lp_timestamp)) { /* out of date information */
+		CDEBUG(D_NET, "Out of date\n");
+		return;
+	}
+
+	lp->lp_timestamp = when;		/* update timestamp */
+	lp->lp_ping_deadline = 0;	       /* disable ping timeout */
+
+	/* lp_alive_count == 0 means no report has been recorded yet, so
+	 * the very first report always counts as news */
+	if (lp->lp_alive_count != 0 &&	  /* got old news */
+	    (!lp->lp_alive) == (!alive)) {      /* new date for old news */
+		CDEBUG(D_NET, "Old news\n");
+		return;
+	}
+
+	/* Flag that notification is outstanding */
+
+	lp->lp_alive_count++;
+	lp->lp_alive = !(!alive);	       /* 1 bit! */
+	lp->lp_notify = 1;
+	lp->lp_notifylnd |= notifylnd;
+	if (lp->lp_alive)
+		lp->lp_ping_feats = LNET_PING_FEAT_INVAL; /* reset */
+
+	CDEBUG(D_NET, "set %s %d\n", libcfs_nid2str(lp->lp_nid), alive);
+}
+
+/*
+ * Deliver the notifications that lnet_notify_locked() left pending on
+ * \a lp to the LND of \a ni.
+ *
+ * The net lock for lp->lp_cpt is dropped around each call into the
+ * LND's lnd_notify hook and retaken afterwards.
+ */
+void
+lnet_ni_notify_locked(lnet_ni_t *ni, lnet_peer_t *lp)
+{
+	int	alive;
+	int	notifylnd;
+
+	/* Notify only in 1 thread at any time to ensure ordered notification.
+	 * NB individual events can be missed; the only guarantee is that you
+	 * always get the most recent news */
+
+	if (lp->lp_notifying)
+		return;
+
+	lp->lp_notifying = 1;
+
+	while (lp->lp_notify) {
+		/* snapshot and clear the pending state before the lock is
+		 * dropped below */
+		alive     = lp->lp_alive;
+		notifylnd = lp->lp_notifylnd;
+
+		lp->lp_notifylnd = 0;
+		lp->lp_notify    = 0;
+
+		if (notifylnd && ni->ni_lnd->lnd_notify != NULL) {
+			lnet_net_unlock(lp->lp_cpt);
+
+			/* A new notification could happen now; I'll handle it
+			 * when control returns to me */
+
+			(ni->ni_lnd->lnd_notify)(ni, lp->lp_nid, alive);
+
+			lnet_net_lock(lp->lp_cpt);
+		}
+	}
+
+	lp->lp_notifying = 0;
+}
+
+
+/*
+ * Take a router reference on \a lp.  The first router reference also
+ * inserts the peer into the_lnet.ln_routers, which is kept sorted by
+ * ascending NID, and takes a peer reference on behalf of that list.
+ */
+static void
+lnet_rtr_addref_locked(lnet_peer_t *lp)
+{
+	LASSERT(lp->lp_refcount > 0);
+	LASSERT(lp->lp_rtr_refcount >= 0);
+
+	/* lnet_net_lock must be exclusively locked */
+	lp->lp_rtr_refcount++;
+	if (lp->lp_rtr_refcount == 1) {
+		struct list_head *pos;
+
+		/* a simple insertion sort */
+		/* walk backwards to the last router with a smaller NID; if
+		 * there is none, pos ends up at the list head */
+		list_for_each_prev(pos, &the_lnet.ln_routers) {
+			lnet_peer_t *rtr = list_entry(pos, lnet_peer_t,
+							  lp_rtr_list);
+
+			if (rtr->lp_nid < lp->lp_nid)
+				break;
+		}
+
+		list_add(&lp->lp_rtr_list, pos);
+		/* addref for the_lnet.ln_routers */
+		lnet_peer_addref_locked(lp);
+		the_lnet.ln_routers_version++;
+	}
+}
+
+/*
+ * Drop a router reference on \a lp.  When the last one goes, the peer
+ * stops being a router: its router-checker data (if any) is queued on
+ * the_lnet.ln_rcd_deathrow, it is removed from the_lnet.ln_routers and
+ * the list's peer reference is dropped.
+ */
+static void
+lnet_rtr_decref_locked(lnet_peer_t *lp)
+{
+	LASSERT(lp->lp_refcount > 0);
+	LASSERT(lp->lp_rtr_refcount > 0);
+
+	/* lnet_net_lock must be exclusively locked */
+	lp->lp_rtr_refcount--;
+	if (lp->lp_rtr_refcount == 0) {
+		/* every route through this gateway must already be gone */
+		LASSERT(list_empty(&lp->lp_routes));
+
+		if (lp->lp_rcd != NULL) {
+			list_add(&lp->lp_rcd->rcd_list,
+				     &the_lnet.ln_rcd_deathrow);
+			lp->lp_rcd = NULL;
+		}
+
+		list_del(&lp->lp_rtr_list);
+		/* decref for the_lnet.ln_routers */
+		lnet_peer_decref_locked(lp);
+		the_lnet.ln_routers_version++;
+	}
+}
+
+/*
+ * Look up the remote network descriptor for \a net in the remote-net
+ * hash.
+ *
+ * Returns the matching lnet_remotenet_t or NULL if the network is not
+ * known.  Must not be called once the_lnet.ln_shutdown is set.
+ */
+lnet_remotenet_t *
+lnet_find_net_locked (__u32 net)
+{
+	struct list_head		*bucket;
+	lnet_remotenet_t	*candidate;
+
+	LASSERT(!the_lnet.ln_shutdown);
+
+	bucket = lnet_net2rnethash(net);
+	list_for_each_entry(candidate, bucket, lrn_list) {
+		if (candidate->lrn_net == net)
+			return candidate;
+	}
+
+	return NULL;
+}
+
+/*
+ * Seed the libcfs random number generator once (later calls return
+ * immediately).  The seed mixes random bytes, the address and LND type
+ * of every non-loopback local NI, and the current time of day.
+ */
+static void lnet_shuffle_seed(void)
+{
+	static int seeded = 0;
+	int lnd_type, seed[2];
+	struct timeval now;
+	lnet_ni_t *ni;
+
+	if (seeded)
+		return;
+
+	cfs_get_random_bytes(seed, sizeof(seed));
+
+	/* Nodes with small feet have little entropy
+	 * the NID for this node gives the most entropy in the low bits */
+	list_for_each_entry(ni, &the_lnet.ln_nis, ni_list) {
+		lnd_type = LNET_NETTYP(LNET_NIDNET(ni->ni_nid));
+
+		/* the loopback NI contributes nothing */
+		if (lnd_type == LOLND)
+			continue;
+
+		seed[0] ^= (LNET_NIDADDR(ni->ni_nid) | lnd_type);
+	}
+
+	do_gettimeofday(&now);
+	cfs_srand(now.tv_sec ^ seed[0], now.tv_usec ^ seed[1]);
+	seeded = 1;
+}
+
+/*
+ * Link \a route into \a rnet at a randomly chosen position of its route
+ * list, link it onto its gateway's lp_routes list and take a router
+ * reference on the gateway.
+ */
+/* NB expects LNET_LOCK held */
+void
+lnet_add_route_to_rnet (lnet_remotenet_t *rnet, lnet_route_t *route)
+{
+	unsigned int      len = 0;
+	unsigned int      offset = 0;
+	struct list_head       *e;
+
+	lnet_shuffle_seed();
+
+	/* count the existing routes */
+	list_for_each (e, &rnet->lrn_routes) {
+		len++;
+	}
+
+	/* len+1 positions to add a new entry, also prevents division by 0 */
+	offset = cfs_rand() % (len + 1);
+	/* stop at the offset'th entry; if the loop runs to completion,
+	 * e is the list head itself */
+	list_for_each (e, &rnet->lrn_routes) {
+		if (offset == 0)
+			break;
+		offset--;
+	}
+	list_add(&route->lr_list, e);
+	list_add(&route->lr_gwlist, &route->lr_gateway->lp_routes);
+
+	the_lnet.ln_remote_nets_version++;
+	lnet_rtr_addref_locked(route->lr_gateway);
+}
+
+/*
+ * Add a route to remote network \a net via \a gateway.
+ *
+ * net		remote network the route leads to
+ * hops		hop count, must be in the range 1..255
+ * gateway	NID of the gateway; must not be on \a net itself
+ *
+ * A route to a local network, a route whose gateway is not on any
+ * local network, and a duplicate of an existing route are all silently
+ * ignored (return 0).
+ *
+ * Returns 0 on success (or if the entry was ignored), -EINVAL for bad
+ * arguments, -ENOMEM on allocation failure, or the error from
+ * lnet_nid2peer_locked().
+ */
+int
+lnet_add_route (__u32 net, unsigned int hops, lnet_nid_t gateway)
+{
+	struct list_head	  *e;
+	lnet_remotenet_t    *rnet;
+	lnet_remotenet_t    *rnet2;
+	lnet_route_t	*route;
+	lnet_ni_t	   *ni;
+	int		  add_route;
+	int		  rc;
+
+	CDEBUG(D_NET, "Add route: net %s hops %u gw %s\n",
+	       libcfs_net2str(net), hops, libcfs_nid2str(gateway));
+
+	if (gateway == LNET_NID_ANY ||
+	    LNET_NETTYP(LNET_NIDNET(gateway)) == LOLND ||
+	    net == LNET_NIDNET(LNET_NID_ANY) ||
+	    LNET_NETTYP(net) == LOLND ||
+	    LNET_NIDNET(gateway) == net ||
+	    hops < 1 || hops > 255)
+		return (-EINVAL);
+
+	if (lnet_islocalnet(net))	       /* it's a local network */
+		return 0;		       /* ignore the route entry */
+
+	/* Assume net, route, all new */
+	LIBCFS_ALLOC(route, sizeof(*route));
+	LIBCFS_ALLOC(rnet, sizeof(*rnet));
+	if (route == NULL || rnet == NULL) {
+		/* hops is unsigned: print it with %u */
+		CERROR("Out of memory creating route %s %u %s\n",
+		       libcfs_net2str(net), hops, libcfs_nid2str(gateway));
+		if (route != NULL)
+			LIBCFS_FREE(route, sizeof(*route));
+		if (rnet != NULL)
+			LIBCFS_FREE(rnet, sizeof(*rnet));
+		return -ENOMEM;
+	}
+
+	INIT_LIST_HEAD(&rnet->lrn_routes);
+	rnet->lrn_net = net;
+	route->lr_hops = hops;
+	route->lr_net = net;
+
+	lnet_net_lock(LNET_LOCK_EX);
+
+	/* on success this holds a reference on the gateway peer */
+	rc = lnet_nid2peer_locked(&route->lr_gateway, gateway, LNET_LOCK_EX);
+	if (rc != 0) {
+		lnet_net_unlock(LNET_LOCK_EX);
+
+		LIBCFS_FREE(route, sizeof(*route));
+		LIBCFS_FREE(rnet, sizeof(*rnet));
+
+		if (rc == -EHOSTUNREACH) { /* gateway is not on a local net */
+			return 0;	/* ignore the route entry */
+		} else {
+			CERROR("Error %d creating route %s %u %s\n", rc,
+			       libcfs_net2str(net), hops,
+			       libcfs_nid2str(gateway));
+		}
+		return rc;
+	}
+
+	LASSERT (!the_lnet.ln_shutdown);
+
+	rnet2 = lnet_find_net_locked(net);
+	if (rnet2 == NULL) {
+		/* new network */
+		list_add_tail(&rnet->lrn_list, lnet_net2rnethash(net));
+		rnet2 = rnet;
+	}
+
+	/* Search for a duplicate route (it's a NOOP if it is) */
+	add_route = 1;
+	list_for_each (e, &rnet2->lrn_routes) {
+		lnet_route_t *route2 = list_entry(e, lnet_route_t, lr_list);
+
+		if (route2->lr_gateway == route->lr_gateway) {
+			add_route = 0;
+			break;
+		}
+
+		/* our lookups must be true */
+		LASSERT (route2->lr_gateway->lp_nid != gateway);
+	}
+
+	if (add_route) {
+		lnet_peer_addref_locked(route->lr_gateway); /* +1 for notify */
+		lnet_add_route_to_rnet(rnet2, route);
+
+		ni = route->lr_gateway->lp_ni;
+		/* the LND hook is called without the net lock held */
+		lnet_net_unlock(LNET_LOCK_EX);
+
+		/* XXX Assume alive */
+		if (ni->ni_lnd->lnd_notify != NULL)
+			(ni->ni_lnd->lnd_notify)(ni, gateway, 1);
+
+		lnet_net_lock(LNET_LOCK_EX);
+	}
+
+	/* -1 for notify or !add_route */
+	lnet_peer_decref_locked(route->lr_gateway);
+	lnet_net_unlock(LNET_LOCK_EX);
+
+	/* free whichever of the speculative allocations went unused */
+	if (!add_route)
+		LIBCFS_FREE(route, sizeof(*route));
+
+	if (rnet != rnet2)
+		LIBCFS_FREE(rnet, sizeof(*rnet));
+
+	return 0;
+}
+
+/*
+ * Check that, for every remote network, the gateways of all its routes
+ * are reached through the same local NI as the gateway of the first
+ * route on its list.
+ *
+ * Returns 0 if so; otherwise logs the offending pair of gateways and
+ * returns -EINVAL.
+ */
+int
+lnet_check_routes(void)
+{
+	lnet_remotenet_t	*rnet;
+	lnet_route_t		*route;
+	lnet_route_t		*route2;
+	struct list_head		*e1;
+	struct list_head		*e2;
+	int			cpt;
+	struct list_head		*rn_list;
+	int			i;
+
+	cpt = lnet_net_lock_current();
+
+	for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++) {
+		rn_list = &the_lnet.ln_remote_nets_hash[i];
+		list_for_each(e1, rn_list) {
+			rnet = list_entry(e1, lnet_remotenet_t, lrn_list);
+
+			/* route2 remembers this net's first route; every
+			 * later route is compared against it */
+			route2 = NULL;
+			list_for_each(e2, &rnet->lrn_routes) {
+				lnet_nid_t	nid1;
+				lnet_nid_t	nid2;
+				int		net;
+
+				route = list_entry(e2, lnet_route_t,
+						       lr_list);
+
+				if (route2 == NULL) {
+					route2 = route;
+					continue;
+				}
+
+				if (route->lr_gateway->lp_ni ==
+				    route2->lr_gateway->lp_ni)
+					continue;
+
+				/* copy what the message needs before the
+				 * lock is dropped */
+				nid1 = route->lr_gateway->lp_nid;
+				nid2 = route2->lr_gateway->lp_nid;
+				net = rnet->lrn_net;
+
+				lnet_net_unlock(cpt);
+
+				CERROR("Routes to %s via %s and %s not "
+				       "supported\n",
+				       libcfs_net2str(net),
+				       libcfs_nid2str(nid1),
+				       libcfs_nid2str(nid2));
+				return -EINVAL;
+			}
+		}
+	}
+
+	lnet_net_unlock(cpt);
+	return 0;
+}
+
+/*
+ * Delete routes.
+ *
+ * net		remote network whose routes are to be deleted, or
+ *		LNET_NIDNET(LNET_NID_ANY) for all remote networks
+ * gw_nid	gateway whose routes are to be deleted, or LNET_NID_ANY
+ *		for all gateways
+ *
+ * Returns 0 if at least one route was deleted, -ENOENT otherwise.
+ */
+int
+lnet_del_route(__u32 net, lnet_nid_t gw_nid)
+{
+	struct lnet_peer	*gateway;
+	lnet_remotenet_t	*rnet;
+	lnet_route_t		*route;
+	struct list_head		*e1;
+	struct list_head		*e2;
+	int			rc = -ENOENT;
+	struct list_head		*rn_list;
+	int			idx = 0;
+
+	CDEBUG(D_NET, "Del route: net %s : gw %s\n",
+	       libcfs_net2str(net), libcfs_nid2str(gw_nid));
+
+	/* NB Caller may specify either all routes via the given gateway
+	 * or a specific route entry (actual NIDs) */
+
+	lnet_net_lock(LNET_LOCK_EX);
+	if (net == LNET_NIDNET(LNET_NID_ANY))
+		rn_list = &the_lnet.ln_remote_nets_hash[0];
+	else
+		rn_list = lnet_net2rnethash(net);
+
+	/* the scan of the current bucket restarts from here after each
+	 * deletion, because the lock is dropped while memory is freed */
+ again:
+	list_for_each(e1, rn_list) {
+		rnet = list_entry(e1, lnet_remotenet_t, lrn_list);
+
+		if (!(net == LNET_NIDNET(LNET_NID_ANY) ||
+			net == rnet->lrn_net))
+			continue;
+
+		list_for_each(e2, &rnet->lrn_routes) {
+			route = list_entry(e2, lnet_route_t, lr_list);
+
+			gateway = route->lr_gateway;
+			if (!(gw_nid == LNET_NID_ANY ||
+			      gw_nid == gateway->lp_nid))
+				continue;
+
+			list_del(&route->lr_list);
+			list_del(&route->lr_gwlist);
+			the_lnet.ln_remote_nets_version++;
+
+			/* unhash the remote net if that was its last route;
+			 * otherwise clear rnet so it is not freed below */
+			if (list_empty(&rnet->lrn_routes))
+				list_del(&rnet->lrn_list);
+			else
+				rnet = NULL;
+
+			lnet_rtr_decref_locked(gateway);
+			lnet_peer_decref_locked(gateway);
+
+			lnet_net_unlock(LNET_LOCK_EX);
+
+			LIBCFS_FREE(route, sizeof(*route));
+
+			if (rnet != NULL)
+				LIBCFS_FREE(rnet, sizeof(*rnet));
+
+			rc = 0;
+			lnet_net_lock(LNET_LOCK_EX);
+			goto again;
+		}
+	}
+
+	/* wildcard net: move on to the next hash bucket */
+	if (net == LNET_NIDNET(LNET_NID_ANY) &&
+	    ++idx < LNET_REMOTE_NETS_HASH_SIZE) {
+		rn_list = &the_lnet.ln_remote_nets_hash[idx];
+		goto again;
+	}
+	lnet_net_unlock(LNET_LOCK_EX);
+
+	return rc;
+}
+
+/* Delete every route: both the network and the gateway are wildcards */
+void
+lnet_destroy_routes (void)
+{
+	lnet_del_route(LNET_NIDNET(LNET_NID_ANY), LNET_NID_ANY);
+}
+
+/*
+ * Report the idx'th route, counting from 0 across all remote-net hash
+ * buckets in iteration order.
+ *
+ * idx		index of the wanted route
+ * net		out: remote network of the route
+ * hops		out: hop count of the route
+ * gateway	out: NID of the route's gateway
+ * alive	out: the gateway's lp_alive flag
+ *
+ * Returns 0 and fills the out parameters, or -ENOENT (outputs untouched)
+ * if there are not more than \a idx routes.
+ */
+int
+lnet_get_route(int idx, __u32 *net, __u32 *hops,
+	       lnet_nid_t *gateway, __u32 *alive)
+{
+	struct list_head		*e1;
+	struct list_head		*e2;
+	lnet_remotenet_t	*rnet;
+	lnet_route_t		*route;
+	int			cpt;
+	int			i;
+	struct list_head		*rn_list;
+
+	cpt = lnet_net_lock_current();
+
+	for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++) {
+		rn_list = &the_lnet.ln_remote_nets_hash[i];
+		list_for_each(e1, rn_list) {
+			rnet = list_entry(e1, lnet_remotenet_t, lrn_list);
+
+			list_for_each(e2, &rnet->lrn_routes) {
+				route = list_entry(e2, lnet_route_t,
+						       lr_list);
+
+				/* idx counts down one per route visited */
+				if (idx-- == 0) {
+					*net     = rnet->lrn_net;
+					*hops    = route->lr_hops;
+					*gateway = route->lr_gateway->lp_nid;
+					*alive   = route->lr_gateway->lp_alive;
+					lnet_net_unlock(cpt);
+					return 0;
+				}
+			}
+		}
+	}
+
+	lnet_net_unlock(cpt);
+	return -ENOENT;
+}
+
+/*
+ * Byte-swap a ping info buffer in place: the header fields, then the
+ * NID and status of each NI entry.  At most LNET_MAX_RTR_NIS entries
+ * are touched, whatever pi_nnis claims.
+ */
+void
+lnet_swap_pinginfo(lnet_ping_info_t *info)
+{
+	int	       idx;
+
+	__swab32s(&info->pi_magic);
+	__swab32s(&info->pi_features);
+	__swab32s(&info->pi_pid);
+	/* pi_nnis must be swapped before it bounds the loop below */
+	__swab32s(&info->pi_nnis);
+
+	for (idx = 0; idx < info->pi_nnis && idx < LNET_MAX_RTR_NIS; idx++) {
+		__swab64s(&info->pi_ni[idx].ns_nid);
+		__swab32s(&info->pi_ni[idx].ns_status);
+	}
+}
+
+/**
+ * parse router-checker pinginfo, record number of down NIs for remote
+ * networks on that router.
+ *
+ * Nothing is done if the gateway is currently marked down.  The reply
+ * is byte-swapped first if its magic shows opposite endianness.  A bad
+ * magic, an LNET_NID_ANY entry or an unknown NI status invalidates the
+ * gateway's lp_ping_feats and stops parsing.  Otherwise lr_downis is
+ * updated for every route through this gateway.
+ */
+static void
+lnet_parse_rc_info(lnet_rc_data_t *rcd)
+{
+	lnet_ping_info_t	*info = rcd->rcd_pinginfo;
+	struct lnet_peer	*gw   = rcd->rcd_gateway;
+	lnet_route_t		*rtr;
+
+	if (!gw->lp_alive)
+		return;
+
+	if (info->pi_magic == __swab32(LNET_PROTO_PING_MAGIC))
+		lnet_swap_pinginfo(info);
+
+	/* NB always racing with network! */
+	if (info->pi_magic != LNET_PROTO_PING_MAGIC) {
+		CDEBUG(D_NET, "%s: Unexpected magic %08x\n",
+		       libcfs_nid2str(gw->lp_nid), info->pi_magic);
+		gw->lp_ping_feats = LNET_PING_FEAT_INVAL;
+		return;
+	}
+
+	gw->lp_ping_feats = info->pi_features;
+	if ((gw->lp_ping_feats & LNET_PING_FEAT_MASK) == 0) {
+		CDEBUG(D_NET, "%s: Unexpected features 0x%x\n",
+		       libcfs_nid2str(gw->lp_nid), gw->lp_ping_feats);
+		return; /* nothing I can understand */
+	}
+
+	if ((gw->lp_ping_feats & LNET_PING_FEAT_NI_STATUS) == 0)
+		return; /* can't carry NI status info */
+
+	/* NB despite its name, rtr iterates over routes (lnet_route_t) */
+	list_for_each_entry(rtr, &gw->lp_routes, lr_gwlist) {
+		int	ptl_status = LNET_NI_STATUS_INVALID;
+		int	down = 0;
+		int	up = 0;
+		int	i;
+
+		for (i = 0; i < info->pi_nnis && i < LNET_MAX_RTR_NIS; i++) {
+			lnet_ni_status_t *stat = &info->pi_ni[i];
+			lnet_nid_t	 nid = stat->ns_nid;
+
+			if (nid == LNET_NID_ANY) {
+				CDEBUG(D_NET, "%s: unexpected LNET_NID_ANY\n",
+				       libcfs_nid2str(gw->lp_nid));
+				gw->lp_ping_feats = LNET_PING_FEAT_INVAL;
+				return;
+			}
+
+			/* the loopback NI is not counted */
+			if (LNET_NETTYP(LNET_NIDNET(nid)) == LOLND)
+				continue;
+
+			if (stat->ns_status == LNET_NI_STATUS_DOWN) {
+				/* non-ptl NIs are counted individually; ptl
+				 * NIs are tracked collectively in ptl_status
+				 * and an UP seen earlier is never overridden */
+				if (LNET_NETTYP(LNET_NIDNET(nid)) != PTLLND)
+					down++;
+				else if (ptl_status != LNET_NI_STATUS_UP)
+					ptl_status = LNET_NI_STATUS_DOWN;
+				continue;
+			}
+
+			if (stat->ns_status == LNET_NI_STATUS_UP) {
+				if (LNET_NIDNET(nid) == rtr->lr_net) {
+					up = 1;
+					break;
+				}
+				/* ptl NIs are considered down only when
+				 * they're all down */
+				if (LNET_NETTYP(LNET_NIDNET(nid)) == PTLLND)
+					ptl_status = LNET_NI_STATUS_UP;
+				continue;
+			}
+
+			CDEBUG(D_NET, "%s: Unexpected status 0x%x\n",
+			       libcfs_nid2str(gw->lp_nid), stat->ns_status);
+			gw->lp_ping_feats = LNET_PING_FEAT_INVAL;
+			return;
+		}
+
+		if (up) { /* ignore downed NIs if NI for dest network is up */
+			rtr->lr_downis = 0;
+			continue;
+		}
+		/* all ptl NIs together count as at most one down NI */
+		rtr->lr_downis = down + (ptl_status == LNET_NI_STATUS_DOWN);
+	}
+}
+
+/*
+ * Event handler for router-checker pings; event->md.user_ptr is the
+ * lnet_rc_data_t of the gateway that was pinged.
+ *
+ * An unlink event just invalidates the MD handle.  A successful SEND
+ * only clears lp_ping_notsent.  A failed SEND or any REPLY is recorded
+ * as an aliveness report (alive iff event->status == 0) and, when
+ * avoid_asym_router_failure is set, a successful reply's NI status is
+ * parsed.
+ */
+static void
+lnet_router_checker_event(lnet_event_t *event)
+{
+	lnet_rc_data_t		*rcd = event->md.user_ptr;
+	struct lnet_peer	*lp;
+
+	LASSERT(rcd != NULL);
+
+	if (event->unlinked) {
+		LNetInvalidateHandle(&rcd->rcd_mdh);
+		return;
+	}
+
+	LASSERT(event->type == LNET_EVENT_SEND ||
+		event->type == LNET_EVENT_REPLY);
+
+	lp = rcd->rcd_gateway;
+	LASSERT(lp != NULL);
+
+	 /* NB: it's called with holding lnet_res_lock, we have a few
+	  * places need to hold both locks at the same time, please take
+	  * care of lock ordering */
+	lnet_net_lock(lp->lp_cpt);
+	if (!lnet_isrouter(lp) || lp->lp_rcd != rcd) {
+		/* ignore if no longer a router or rcd is replaced */
+		goto out;
+	}
+
+	if (event->type == LNET_EVENT_SEND) {
+		lp->lp_ping_notsent = 0;
+		if (event->status == 0)
+			goto out;
+		/* a failed send falls through and is reported as "down" */
+	}
+
+	/* LNET_EVENT_REPLY */
+	/* A successful REPLY means the router is up.  If _any_ comms
+	 * to the router fail I assume it's down (this will happen if
+	 * we ping alive routers to try to detect router death before
+	 * apps get burned). */
+
+	lnet_notify_locked(lp, 1, (event->status == 0), cfs_time_current());
+	/* The router checker will wake up very shortly and do the
+	 * actual notification.
+	 * XXX If 'lp' stops being a router before then, it will still
+	 * have the notification pending!!! */
+
+	if (avoid_asym_router_failure && event->status == 0)
+		lnet_parse_rc_info(rcd);
+
+ out:
+	lnet_net_unlock(lp->lp_cpt);
+}
+
+/*
+ * Block until every router on the_lnet.ln_routers has a non-zero
+ * lp_alive_count, i.e. at least one aliveness report has been recorded
+ * for it.  The list is re-scanned after each pause.  The router checker
+ * must be running.
+ */
+void
+lnet_wait_known_routerstate(void)
+{
+	lnet_peer_t	 *rtr;
+	struct list_head	  *entry;
+	int		  all_known;
+
+	LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING);
+
+	for (;;) {
+		int	cpt = lnet_net_lock_current();
+
+		all_known = 1;
+		list_for_each (entry, &the_lnet.ln_routers) {
+			rtr = list_entry(entry, lnet_peer_t, lp_rtr_list);
+
+			if (rtr->lp_alive_count == 0) {
+				all_known = 0;
+				break;
+			}
+		}
+
+		lnet_net_unlock(cpt);
+
+		if (all_known)
+			return;
+
+		cfs_pause(cfs_time_seconds(1));
+	}
+}
+
+/*
+ * Mark as down every non-loopback local NI whose ni_last_alive is
+ * older than router_ping_timeout plus the larger of the two router
+ * check intervals.  Only valid when the_lnet.ln_routing is set.
+ */
+void
+lnet_update_ni_status_locked(void)
+{
+	lnet_ni_t	*ni;
+	long		now;
+	int		timeout;
+
+	LASSERT(the_lnet.ln_routing);
+
+	timeout = router_ping_timeout +
+		  MAX(live_router_check_interval, dead_router_check_interval);
+
+	now = cfs_time_current_sec();
+	list_for_each_entry(ni, &the_lnet.ln_nis, ni_list) {
+		if (ni->ni_lnd->lnd_type == LOLND)
+			continue;
+
+		/* cheap unlocked test first; most NIs are skipped here */
+		if (now < ni->ni_last_alive + timeout)
+			continue;
+
+		lnet_ni_lock(ni);
+		/* re-check with lock */
+		if (now < ni->ni_last_alive + timeout) {
+			lnet_ni_unlock(ni);
+			continue;
+		}
+
+		LASSERT(ni->ni_status != NULL);
+
+		if (ni->ni_status->ns_status != LNET_NI_STATUS_DOWN) {
+			CDEBUG(D_NET, "NI(%s:%d) status changed to down\n",
+			       libcfs_nid2str(ni->ni_nid), timeout);
+			/* NB: so far, this is the only place to set
+			 * NI status to "down" */
+			ni->ni_status->ns_status = LNET_NI_STATUS_DOWN;
+		}
+		lnet_ni_unlock(ni);
+	}
+}
+
+/*
+ * Free router-checker data.  \a rcd must be off every list and its MD
+ * handle must already be invalid.  Drops the reference held on the
+ * gateway peer (if one was set) and frees the ping buffer (if one was
+ * allocated) before freeing \a rcd itself.
+ *
+ * NB takes the net lock of the gateway's CPT itself, so that lock must
+ * not be held by the caller.
+ */
+void
+lnet_destroy_rc_data(lnet_rc_data_t *rcd)
+{
+	LASSERT(list_empty(&rcd->rcd_list));
+	/* detached from network */
+	LASSERT(LNetHandleIsInvalid(rcd->rcd_mdh));
+
+	if (rcd->rcd_gateway != NULL) {
+		int cpt = rcd->rcd_gateway->lp_cpt;
+
+		lnet_net_lock(cpt);
+		lnet_peer_decref_locked(rcd->rcd_gateway);
+		lnet_net_unlock(cpt);
+	}
+
+	if (rcd->rcd_pinginfo != NULL)
+		LIBCFS_FREE(rcd->rcd_pinginfo, LNET_PINGINFO_SIZE);
+
+	LIBCFS_FREE(rcd, sizeof(*rcd));
+}
+
+/*
+ * Create router-checker data (ping buffer plus bound MD) for
+ * \a gateway and attach it as gateway->lp_rcd.
+ *
+ * Entered and left with the net lock of gateway->lp_cpt held; the lock
+ * is dropped while memory is allocated and the MD is bound.
+ *
+ * Returns the new rcd on success.  On failure, or if the gateway
+ * stopped being a router or acquired an rcd while the lock was
+ * dropped, the new rcd is discarded and the gateway's current lp_rcd
+ * (possibly NULL) is returned instead.
+ */
+lnet_rc_data_t *
+lnet_create_rc_data_locked(lnet_peer_t *gateway)
+{
+	lnet_rc_data_t		*rcd = NULL;
+	lnet_ping_info_t	*pi;
+	int			rc;
+	int			i;
+
+	lnet_net_unlock(gateway->lp_cpt);
+
+	LIBCFS_ALLOC(rcd, sizeof(*rcd));
+	if (rcd == NULL)
+		goto out;
+
+	LNetInvalidateHandle(&rcd->rcd_mdh);
+	INIT_LIST_HEAD(&rcd->rcd_list);
+
+	LIBCFS_ALLOC(pi, LNET_PINGINFO_SIZE);
+	if (pi == NULL)
+		goto out;
+
+	/* start with every NI slot marked as unknown */
+	memset(pi, 0, LNET_PINGINFO_SIZE);
+	for (i = 0; i < LNET_MAX_RTR_NIS; i++) {
+		pi->pi_ni[i].ns_nid = LNET_NID_ANY;
+		pi->pi_ni[i].ns_status = LNET_NI_STATUS_INVALID;
+	}
+	rcd->rcd_pinginfo = pi;
+
+	LASSERT (!LNetHandleIsInvalid(the_lnet.ln_rc_eqh));
+	rc = LNetMDBind((lnet_md_t){.start     = pi,
+				    .user_ptr  = rcd,
+				    .length    = LNET_PINGINFO_SIZE,
+				    .threshold = LNET_MD_THRESH_INF,
+				    .options   = LNET_MD_TRUNCATE,
+				    .eq_handle = the_lnet.ln_rc_eqh},
+			LNET_UNLINK,
+			&rcd->rcd_mdh);
+	if (rc < 0) {
+		CERROR("Can't bind MD: %d\n", rc);
+		goto out;
+	}
+	LASSERT(rc == 0);
+
+	lnet_net_lock(gateway->lp_cpt);
+	/* router table changed or someone has created rcd for this gateway */
+	if (!lnet_isrouter(gateway) || gateway->lp_rcd != NULL) {
+		/* the "out" path expects the lock to be dropped */
+		lnet_net_unlock(gateway->lp_cpt);
+		goto out;
+	}
+
+	/* the rcd holds its own reference on the gateway; it is dropped
+	 * in lnet_destroy_rc_data() */
+	lnet_peer_addref_locked(gateway);
+	rcd->rcd_gateway = gateway;
+	gateway->lp_rcd = rcd;
+	gateway->lp_ping_notsent = 0;
+
+	return rcd;
+
+ out:
+	if (rcd != NULL) {
+		if (!LNetHandleIsInvalid(rcd->rcd_mdh)) {
+			rc = LNetMDUnlink(rcd->rcd_mdh);
+			LASSERT(rc == 0);
+		}
+		lnet_destroy_rc_data(rcd);
+	}
+
+	lnet_net_lock(gateway->lp_cpt);
+	return gateway->lp_rcd;
+}
+
+/*
+ * Health-check interval, in seconds, that applies to router \a rtr:
+ * live_router_check_interval if it is currently alive, otherwise
+ * dead_router_check_interval.  Negative settings are reported as 0.
+ */
+static int
+lnet_router_check_interval (lnet_peer_t *rtr)
+{
+	int interval;
+
+	if (rtr->lp_alive)
+		interval = live_router_check_interval;
+	else
+		interval = dead_router_check_interval;
+
+	return interval < 0 ? 0 : interval;
+}
+
+/*
+ * Run one router-checker pass for router \a rtr:
+ *  - record a "down" report if the outstanding ping has timed out;
+ *  - deliver any pending aliveness notifications;
+ *  - send a new ping if a check interval is configured, no ping is
+ *    waiting to be sent and the interval has elapsed since the last one.
+ *
+ * Entered and left with the net lock of rtr->lp_cpt held; the lock may
+ * be dropped and retaken in between.  A reference is held on \a rtr
+ * for the duration of the call and released on every return path.
+ */
+static void
+lnet_ping_router_locked (lnet_peer_t *rtr)
+{
+	lnet_rc_data_t *rcd = NULL;
+	cfs_time_t      now = cfs_time_current();
+	int	     secs;
+
+	/* keep rtr alive across the points where the lock is dropped */
+	lnet_peer_addref_locked(rtr);
+
+	if (rtr->lp_ping_deadline != 0 && /* ping timed out? */
+	    cfs_time_after(now, rtr->lp_ping_deadline))
+		lnet_notify_locked(rtr, 1, 0, now);
+
+	/* Run any outstanding notifications */
+	lnet_ni_notify_locked(rtr->lp_ni, rtr);
+
+	if (!lnet_isrouter(rtr) ||
+	    the_lnet.ln_rc_state != LNET_RC_STATE_RUNNING) {
+		/* router table changed or router checker is shutting down */
+		lnet_peer_decref_locked(rtr);
+		return;
+	}
+
+	rcd = rtr->lp_rcd != NULL ?
+	      rtr->lp_rcd : lnet_create_rc_data_locked(rtr);
+
+	if (rcd == NULL) {
+		/* no checker data could be set up; drop the reference
+		 * taken above so it is not leaked */
+		lnet_peer_decref_locked(rtr);
+		return;
+	}
+
+	secs = lnet_router_check_interval(rtr);
+
+	CDEBUG(D_NET,
+	       "rtr %s %d: deadline %lu ping_notsent %d alive %d "
+	       "alive_count %d lp_ping_timestamp %lu\n",
+	       libcfs_nid2str(rtr->lp_nid), secs,
+	       rtr->lp_ping_deadline, rtr->lp_ping_notsent,
+	       rtr->lp_alive, rtr->lp_alive_count, rtr->lp_ping_timestamp);
+
+	if (secs != 0 && !rtr->lp_ping_notsent &&
+	    cfs_time_after(now, cfs_time_add(rtr->lp_ping_timestamp,
+					     cfs_time_seconds(secs)))) {
+		int	       rc;
+		lnet_process_id_t id;
+		lnet_handle_md_t  mdh;
+
+		id.nid = rtr->lp_nid;
+		id.pid = LUSTRE_SRV_LNET_PID;
+		CDEBUG(D_NET, "Check: %s\n", libcfs_id2str(id));
+
+		rtr->lp_ping_notsent   = 1;
+		rtr->lp_ping_timestamp = now;
+
+		/* copy the handle while the lock is still held */
+		mdh = rcd->rcd_mdh;
+
+		/* a deadline that is already running is left alone */
+		if (rtr->lp_ping_deadline == 0) {
+			rtr->lp_ping_deadline =
+				cfs_time_shift(router_ping_timeout);
+		}
+
+		lnet_net_unlock(rtr->lp_cpt);
+
+		rc = LNetGet(LNET_NID_ANY, mdh, id, LNET_RESERVED_PORTAL,
+			     LNET_PROTO_PING_MATCHBITS, 0);
+
+		lnet_net_lock(rtr->lp_cpt);
+		if (rc != 0)
+			rtr->lp_ping_notsent = 0; /* no event pending */
+	}
+
+	lnet_peer_decref_locked(rtr);
+	return;
+}
+
+/*
+ * Start the router checker.
+ *
+ * Allocates the router checker event queue and spawns the
+ * "router_checker" thread.  When 'check_routers_before_use' is set this
+ * also waits until the state of the known routers has been determined.
+ *
+ * \retval 0        started, or nothing to check (not routing and both
+ *                  check intervals are <= 0)
+ * \retval -EINVAL  'check_routers_before_use' is set without a positive
+ *                  'dead_router_check_interval'
+ * \retval -ENOMEM  the event queue or the thread could not be created
+ */
+int
+lnet_router_checker_start(void)
+{
+	struct task_struct *task;
+	int	  rc;
+	int	  eqsz;
+
+	LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_SHUTDOWN);
+
+	if (check_routers_before_use &&
+	    dead_router_check_interval <= 0) {
+		LCONSOLE_ERROR_MSG(0x10a, "'dead_router_check_interval' must be"
+				   " set if 'check_routers_before_use' is set"
+				   "\n");
+		return -EINVAL;
+	}
+
+	if (!the_lnet.ln_routing &&
+	    live_router_check_interval <= 0 &&
+	    dead_router_check_interval <= 0)
+		return 0;
+
+	sema_init(&the_lnet.ln_rc_signal, 0);
+	/* EQ size doesn't matter; the callback is guaranteed to get every
+	 * event */
+	eqsz = 0;
+	rc = LNetEQAlloc(eqsz, lnet_router_checker_event,
+			 &the_lnet.ln_rc_eqh);
+	if (rc != 0) {
+		CERROR("Can't allocate EQ(%d): %d\n", eqsz, rc);
+		return -ENOMEM;
+	}
+
+	the_lnet.ln_rc_state = LNET_RC_STATE_RUNNING;
+	/* Keep the pointer and test it with IS_ERR(): squeezing PTR_ERR()
+	 * of a valid task pointer into an int and testing that with
+	 * IS_ERR_VALUE() can misreport success as failure on 64-bit. */
+	task = kthread_run(lnet_router_checker, NULL, "router_checker");
+	if (IS_ERR(task)) {
+		rc = PTR_ERR(task);
+		CERROR("Can't start router checker thread: %d\n", rc);
+		/* block until event callback signals exit */
+		/* NOTE(review): the only up() on ln_rc_signal visible in this
+		 * file is in the checker thread itself, which did not start
+		 * on this path - confirm this down() cannot block forever. */
+		down(&the_lnet.ln_rc_signal);
+		rc = LNetEQFree(the_lnet.ln_rc_eqh);
+		LASSERT(rc == 0);
+		the_lnet.ln_rc_state = LNET_RC_STATE_SHUTDOWN;
+		return -ENOMEM;
+	}
+
+	if (check_routers_before_use) {
+		/* Note that a helpful side-effect of pinging all known routers
+		 * at startup is that it makes them drop stale connections they
+		 * may have to a previous instance of me. */
+		lnet_wait_known_routerstate();
+	}
+
+	return 0;
+}
+
+/*
+ * Stop the router checker: flag it as STOPPING, wait on ln_rc_signal
+ * until the state has become SHUTDOWN, then free the checker's event
+ * queue.  Does nothing when the checker is already in SHUTDOWN state.
+ */
+void
+lnet_router_checker_stop (void)
+{
+	int rc;
+
+	if (the_lnet.ln_rc_state != LNET_RC_STATE_SHUTDOWN) {
+		LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING);
+		the_lnet.ln_rc_state = LNET_RC_STATE_STOPPING;
+
+		/* sleep here until the checker posts the semaphore */
+		down(&the_lnet.ln_rc_signal);
+		LASSERT(the_lnet.ln_rc_state == LNET_RC_STATE_SHUTDOWN);
+
+		rc = LNetEQFree(the_lnet.ln_rc_eqh);
+		LASSERT (rc == 0);
+	}
+}
+
+/*
+ * Unlink and free router-checker ping buffers (rc data) that are no
+ * longer needed.
+ *
+ * - If the checker is not RUNNING, every router's lp_rcd is detached and
+ *   queued on ln_rcd_deathrow first.
+ * - The MD of every rcd on ln_rcd_deathrow is unlinked (with the net lock
+ *   dropped), and those rcds are moved to ln_rcd_zombie.
+ * - Zombie rcds whose MD handle has become invalid are destroyed.
+ *
+ * \param wait_unlink  if non-zero, keep polling (pausing 1/4 second per
+ *		       pass) until ln_rcd_zombie is empty; if zero, make a
+ *		       single pass and return.
+ */
+static void
+lnet_prune_rc_data(int wait_unlink)
+{
+	lnet_rc_data_t		*rcd;
+	lnet_rc_data_t		*tmp;
+	lnet_peer_t		*lp;
+	struct list_head		head;
+	int			i = 2;
+
+	/* fast path, checked without the lock: nothing to prune */
+	if (likely(the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING &&
+		   list_empty(&the_lnet.ln_rcd_deathrow) &&
+		   list_empty(&the_lnet.ln_rcd_zombie)))
+		return;
+
+	INIT_LIST_HEAD(&head);
+
+	lnet_net_lock(LNET_LOCK_EX);
+
+	if (the_lnet.ln_rc_state != LNET_RC_STATE_RUNNING) {
+		/* router checker is stopping, prune all */
+		list_for_each_entry(lp, &the_lnet.ln_routers,
+					lp_rtr_list) {
+			if (lp->lp_rcd == NULL)
+				continue;
+
+			LASSERT(list_empty(&lp->lp_rcd->rcd_list));
+			list_add(&lp->lp_rcd->rcd_list,
+				     &the_lnet.ln_rcd_deathrow);
+			lp->lp_rcd = NULL;
+		}
+	}
+
+	/* unlink all RCDs on deathrow list */
+	list_splice_init(&the_lnet.ln_rcd_deathrow, &head);
+
+	if (!list_empty(&head)) {
+		/* LNetMDUnlink() is called with the net lock dropped; the
+		 * entries are safe to walk because they now sit on the
+		 * private list 'head' */
+		lnet_net_unlock(LNET_LOCK_EX);
+
+		list_for_each_entry(rcd, &head, rcd_list)
+			LNetMDUnlink(rcd->rcd_mdh);
+
+		lnet_net_lock(LNET_LOCK_EX);
+	}
+
+	/* everything just unlinked becomes a zombie; 'head' is empty again */
+	list_splice_init(&head, &the_lnet.ln_rcd_zombie);
+
+	/* release all zombie RCDs */
+	while (!list_empty(&the_lnet.ln_rcd_zombie)) {
+		/* collect the zombies whose MD handle is already invalid */
+		list_for_each_entry_safe(rcd, tmp, &the_lnet.ln_rcd_zombie,
+					     rcd_list) {
+			if (LNetHandleIsInvalid(rcd->rcd_mdh))
+				list_move(&rcd->rcd_list, &head);
+		}
+
+		/* only keep waiting while zombies remain */
+		wait_unlink = wait_unlink &&
+			      !list_empty(&the_lnet.ln_rcd_zombie);
+
+		lnet_net_unlock(LNET_LOCK_EX);
+
+		/* destroy the collected rcds without holding the lock */
+		while (!list_empty(&head)) {
+			rcd = list_entry(head.next,
+					     lnet_rc_data_t, rcd_list);
+			list_del_init(&rcd->rcd_list);
+			lnet_destroy_rc_data(rcd);
+		}
+
+		if (!wait_unlink)
+			return;
+
+		i++;
+		/* (i & -i) == i holds when i is a power of two, so the
+		 * warning is emitted at exponentially growing intervals */
+		CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET,
+		       "Waiting for rc buffers to unlink\n");
+		cfs_pause(cfs_time_seconds(1) / 4);
+
+		lnet_net_lock(LNET_LOCK_EX);
+	}
+
+	lnet_net_unlock(LNET_LOCK_EX);
+}
+
+
+#if  defined(LNET_ROUTER)
+
+/*
+ * Router checker thread body.
+ *
+ * While the checker state is RUNNING, once per second: walk ln_routers
+ * and call lnet_ping_router_locked() on each entry, refresh the NI status
+ * if this node is routing, and prune unused rc data without waiting.
+ * When the state leaves RUNNING, prune all rc data (waiting for the
+ * unlinks), set the state to SHUTDOWN and post ln_rc_signal.
+ *
+ * \param arg  unused
+ * \retval 0   always
+ */
+static int
+lnet_router_checker(void *arg)
+{
+	lnet_peer_t       *rtr;
+	struct list_head	*entry;
+
+	cfs_block_allsigs();
+
+	LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING);
+
+	while (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING) {
+		__u64	version;
+		int	cpt;
+		int	cpt2;
+
+		cpt = lnet_net_lock_current();
+rescan:
+		/* snapshot the list version so changes made while the lock
+		 * was dropped can be detected */
+		version = the_lnet.ln_routers_version;
+
+		list_for_each(entry, &the_lnet.ln_routers) {
+			rtr = list_entry(entry, lnet_peer_t, lp_rtr_list);
+
+			/* switch to the lock of the CPT this router's NID
+			 * maps to before touching it */
+			cpt2 = lnet_cpt_of_nid_locked(rtr->lp_nid);
+			if (cpt != cpt2) {
+				lnet_net_unlock(cpt);
+				cpt = cpt2;
+				lnet_net_lock(cpt);
+				/* the routers list has changed */
+				if (version != the_lnet.ln_routers_version)
+					goto rescan;
+			}
+
+			lnet_ping_router_locked(rtr);
+
+			/* NB dropped lock */
+			if (version != the_lnet.ln_routers_version) {
+				/* the routers list has changed */
+				goto rescan;
+			}
+		}
+
+		if (the_lnet.ln_routing)
+			lnet_update_ni_status_locked();
+
+		lnet_net_unlock(cpt);
+
+		lnet_prune_rc_data(0); /* don't wait for UNLINK */
+
+		/* Calling cfs_pause() here would always add 1 to the load
+		 * average, because the kernel counts the number of active
+		 * tasks as nr_running + nr_uninterruptible; hence the
+		 * interruptible sleep. */
+		schedule_timeout_and_set_state(TASK_INTERRUPTIBLE,
+						   cfs_time_seconds(1));
+	}
+
+	LASSERT(the_lnet.ln_rc_state == LNET_RC_STATE_STOPPING);
+
+	lnet_prune_rc_data(1); /* wait for UNLINK */
+
+	the_lnet.ln_rc_state = LNET_RC_STATE_SHUTDOWN;
+	up(&the_lnet.ln_rc_signal);
+	/* The unlink event callback will signal final completion */
+	return 0;
+}
+
+/*
+ * Free router buffer \a rb: release each of its \a npages pages (last
+ * page first), then the descriptor itself.
+ */
+void
+lnet_destroy_rtrbuf(lnet_rtrbuf_t *rb, int npages)
+{
+	int sz = offsetof(lnet_rtrbuf_t, rb_kiov[npages]);
+	int idx;
+
+	for (idx = npages - 1; idx >= 0; idx--)
+		__free_page(rb->rb_kiov[idx].kiov_page);
+
+	LIBCFS_FREE(rb, sz);
+}
+
+/*
+ * Allocate a router buffer for pool \a rbp on CPU partition \a cpt.
+ *
+ * The descriptor is sized for rbp->rbp_npages kiov entries and one zeroed
+ * page is allocated per entry.  On any allocation failure everything
+ * obtained so far is released.
+ *
+ * \retval new buffer on success, NULL on allocation failure
+ */
+lnet_rtrbuf_t *
+lnet_new_rtrbuf(lnet_rtrbufpool_t *rbp, int cpt)
+{
+	int	       npages = rbp->rbp_npages;
+	int	       sz = offsetof(lnet_rtrbuf_t, rb_kiov[npages]);
+	lnet_rtrbuf_t *rb;
+	int	       idx;
+
+	LIBCFS_CPT_ALLOC(rb, lnet_cpt_table(), cpt, sz);
+	if (rb == NULL)
+		return NULL;
+
+	rb->rb_pool = rbp;
+
+	for (idx = 0; idx < npages; idx++) {
+		struct page *pg;
+
+		pg = alloc_pages_node(
+				cfs_cpt_spread_node(lnet_cpt_table(), cpt),
+				__GFP_ZERO | GFP_IOFS, 0);
+		if (pg == NULL)
+			goto failed;
+
+		rb->rb_kiov[idx].kiov_page = pg;
+		rb->rb_kiov[idx].kiov_offset = 0;
+		rb->rb_kiov[idx].kiov_len = PAGE_CACHE_SIZE;
+	}
+
+	return rb;
+
+ failed:
+	/* give back the pages obtained so far, then the descriptor */
+	while (--idx >= 0)
+		__free_page(rb->rb_kiov[idx].kiov_page);
+
+	LIBCFS_FREE(rb, sz);
+	return NULL;
+}
+
+/*
+ * Destroy every buffer held by pool \a rbp and reset its buffer and
+ * credit counters to zero.  A pool with rbp_nbuffers == 0 is left alone.
+ * The pool must be idle: no queued messages and all credits returned.
+ */
+void
+lnet_rtrpool_free_bufs(lnet_rtrbufpool_t *rbp)
+{
+	int		nbuffers;
+	int		npages = rbp->rbp_npages;
+
+	/* not initialized or already freed */
+	if (rbp->rbp_nbuffers == 0)
+		return;
+
+	LASSERT (list_empty(&rbp->rbp_msgs));
+	LASSERT (rbp->rbp_credits == rbp->rbp_nbuffers);
+
+	for (nbuffers = 0; !list_empty(&rbp->rbp_bufs); nbuffers++) {
+		lnet_rtrbuf_t *rb;
+
+		LASSERT (rbp->rbp_credits > 0);
+
+		rb = list_entry(rbp->rbp_bufs.next,
+				lnet_rtrbuf_t, rb_list);
+		list_del(&rb->rb_list);
+		lnet_destroy_rtrbuf(rb, npages);
+	}
+
+	/* the number of buffers freed must match the bookkeeping */
+	LASSERT (rbp->rbp_nbuffers == nbuffers);
+	LASSERT (rbp->rbp_credits == nbuffers);
+
+	rbp->rbp_credits = 0;
+	rbp->rbp_nbuffers = 0;
+}
+
+/*
+ * Populate pool \a rbp with \a nbufs router buffers allocated on CPU
+ * partition \a cpt.  A pool that already holds buffers is left unchanged
+ * (it must already hold exactly \a nbufs).
+ *
+ * \retval 0        pool is populated
+ * \retval -ENOMEM  a buffer could not be allocated; buffers added before
+ *                  the failure stay in the pool
+ */
+int
+lnet_rtrpool_alloc_bufs(lnet_rtrbufpool_t *rbp, int nbufs, int cpt)
+{
+	int	       added = 0;
+
+	if (rbp->rbp_nbuffers != 0) {
+		LASSERT (rbp->rbp_nbuffers == nbufs);
+		return 0;
+	}
+
+	while (added < nbufs) {
+		lnet_rtrbuf_t *rb = lnet_new_rtrbuf(rbp, cpt);
+
+		if (rb == NULL) {
+			CERROR("Failed to allocate %d router bufs of %d pages\n",
+			       nbufs, rbp->rbp_npages);
+			return -ENOMEM;
+		}
+
+		rbp->rbp_nbuffers++;
+		rbp->rbp_credits++;
+		rbp->rbp_mincredits++;
+		list_add(&rb->rb_list, &rbp->rbp_bufs);
+
+		/* Buffers are never added while routing is active;
+		 * otherwise blocked messages would have to be scheduled
+		 * from here */
+		LASSERT (!the_lnet.ln_routing);
+
+		added++;
+	}
+
+	LASSERT (rbp->rbp_credits == nbufs);
+	return 0;
+}
+
+/*
+ * Initialise pool \a rbp for buffers of \a npages pages each: empty
+ * message and buffer lists, zero credits.  rbp_nbuffers is not written
+ * here.
+ */
+void
+lnet_rtrpool_init(lnet_rtrbufpool_t *rbp, int npages)
+{
+	rbp->rbp_npages = npages;
+	rbp->rbp_mincredits = 0;
+	rbp->rbp_credits = 0;
+
+	INIT_LIST_HEAD(&rbp->rbp_bufs);
+	INIT_LIST_HEAD(&rbp->rbp_msgs);
+}
+
+/*
+ * Free the buffers of the three router buffer pools of every CPU
+ * partition, then the per-partition pool array itself.  Safe to call
+ * when the pools were never allocated or have already been freed.
+ */
+void
+lnet_rtrpools_free(void)
+{
+	lnet_rtrbufpool_t *pools;
+	int		   cpt;
+
+	if (the_lnet.ln_rtrpools != NULL) {
+		cfs_percpt_for_each(pools, cpt, the_lnet.ln_rtrpools) {
+			lnet_rtrpool_free_bufs(&pools[0]);
+			lnet_rtrpool_free_bufs(&pools[1]);
+			lnet_rtrpool_free_bufs(&pools[2]);
+		}
+
+		cfs_percpt_free(the_lnet.ln_rtrpools);
+		the_lnet.ln_rtrpools = NULL;
+	}
+}
+
+/*
+ * Number of tiny router buffers to allocate per CPU partition.
+ *
+ * Uses the 'tiny_router_buffers' tunable when it is positive, otherwise
+ * LNET_NRB_TINY; the total is divided by LNET_CPT_NUMBER and never falls
+ * below LNET_NRB_TINY_MIN.  \a npages is not used.
+ *
+ * \retval -1 if the tunable is negative (an error is logged)
+ */
+static int
+lnet_nrb_tiny_calculate(int npages)
+{
+	int	total;
+
+	if (tiny_router_buffers < 0) {
+		LCONSOLE_ERROR_MSG(0x10c,
+				   "tiny_router_buffers=%d invalid when "
+				   "routing enabled\n", tiny_router_buffers);
+		return -1;
+	}
+
+	if (tiny_router_buffers > 0)
+		total = tiny_router_buffers;
+	else
+		total = LNET_NRB_TINY;
+
+	total /= LNET_CPT_NUMBER;
+	return max(total, LNET_NRB_TINY_MIN);
+}
+
+/*
+ * Number of small router buffers to allocate per CPU partition.
+ *
+ * Uses the 'small_router_buffers' tunable when it is positive, otherwise
+ * LNET_NRB_SMALL; the total is divided by LNET_CPT_NUMBER and never falls
+ * below LNET_NRB_SMALL_MIN.  \a npages is not used.
+ *
+ * \retval -1 if the tunable is negative (an error is logged)
+ */
+static int
+lnet_nrb_small_calculate(int npages)
+{
+	int	total;
+
+	if (small_router_buffers < 0) {
+		LCONSOLE_ERROR_MSG(0x10c,
+				   "small_router_buffers=%d invalid when "
+				   "routing enabled\n", small_router_buffers);
+		return -1;
+	}
+
+	if (small_router_buffers > 0)
+		total = small_router_buffers;
+	else
+		total = LNET_NRB_SMALL;
+
+	total /= LNET_CPT_NUMBER;
+	return max(total, LNET_NRB_SMALL_MIN);
+}
+
+/*
+ * Number of large router buffers to allocate per CPU partition.
+ *
+ * Uses the 'large_router_buffers' tunable when it is positive, otherwise
+ * LNET_NRB_LARGE; the total is divided by LNET_CPT_NUMBER and never falls
+ * below LNET_NRB_LARGE_MIN.  \a npages is not used.
+ *
+ * \retval -1 if the tunable is negative (an error is logged)
+ */
+static int
+lnet_nrb_large_calculate(int npages)
+{
+	int	total;
+
+	if (large_router_buffers < 0) {
+		LCONSOLE_ERROR_MSG(0x10c,
+				   "large_router_buffers=%d invalid when "
+				   "routing enabled\n", large_router_buffers);
+		return -1;
+	}
+
+	if (large_router_buffers > 0)
+		total = large_router_buffers;
+	else
+		total = LNET_NRB_LARGE;
+
+	total /= LNET_CPT_NUMBER;
+	return max(total, LNET_NRB_LARGE_MIN);
+}
+
+/*
+ * Allocate the router buffer pools and enable routing.
+ *
+ * Whether anything is allocated depends on the 'forwarding' tunable:
+ *  - ""          allocate only if \a im_a_router is non-zero
+ *  - "disabled"  never allocate
+ *  - "enabled"   always allocate
+ *  - other       rejected
+ *
+ * For every CPU partition three pools are set up: tiny (0 pages), small
+ * (1 page) and large (enough pages to hold LNET_MTU).  On success
+ * the_lnet.ln_routing is set under the exclusive net lock.  If a pool
+ * cannot be populated, everything allocated so far is freed.
+ *
+ * \retval 0        pools allocated, or forwarding is not wanted
+ * \retval -EINVAL  bad 'forwarding' value or negative buffer-count tunable
+ * \retval -ENOMEM  allocation failure
+ */
+int
+lnet_rtrpools_alloc(int im_a_router)
+{
+	lnet_rtrbufpool_t *rtrp;
+	int	large_pages = (LNET_MTU + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	int	small_pages = 1;
+	int	nrb_tiny;
+	int	nrb_small;
+	int	nrb_large;
+	int	rc;
+	int	i;
+
+	if (!strcmp(forwarding, "")) {
+		/* not set either way */
+		if (!im_a_router)
+			return 0;
+	} else if (!strcmp(forwarding, "disabled")) {
+		/* explicitly disabled */
+		return 0;
+	} else if (!strcmp(forwarding, "enabled")) {
+		/* explicitly enabled */
+	} else {
+		LCONSOLE_ERROR_MSG(0x10b, "'forwarding' not set to either "
+				   "'enabled' or 'disabled'\n");
+		return -EINVAL;
+	}
+
+	/* validate all tunables before allocating anything */
+	nrb_tiny = lnet_nrb_tiny_calculate(0);
+	if (nrb_tiny < 0)
+		return -EINVAL;
+
+	nrb_small = lnet_nrb_small_calculate(small_pages);
+	if (nrb_small < 0)
+		return -EINVAL;
+
+	nrb_large = lnet_nrb_large_calculate(large_pages);
+	if (nrb_large < 0)
+		return -EINVAL;
+
+	the_lnet.ln_rtrpools = cfs_percpt_alloc(lnet_cpt_table(),
+						LNET_NRBPOOLS *
+						sizeof(lnet_rtrbufpool_t));
+	if (the_lnet.ln_rtrpools == NULL) {
+		LCONSOLE_ERROR_MSG(0x10c,
+				   "Failed to initialize router buffer pool\n");
+		return -ENOMEM;
+	}
+
+	cfs_percpt_for_each(rtrp, i, the_lnet.ln_rtrpools) {
+		lnet_rtrpool_init(&rtrp[0], 0);
+		rc = lnet_rtrpool_alloc_bufs(&rtrp[0], nrb_tiny, i);
+		if (rc != 0)
+			goto failed;
+
+		lnet_rtrpool_init(&rtrp[1], small_pages);
+		rc = lnet_rtrpool_alloc_bufs(&rtrp[1], nrb_small, i);
+		if (rc != 0)
+			goto failed;
+
+		lnet_rtrpool_init(&rtrp[2], large_pages);
+		rc = lnet_rtrpool_alloc_bufs(&rtrp[2], nrb_large, i);
+		if (rc != 0)
+			goto failed;
+	}
+
+	lnet_net_lock(LNET_LOCK_EX);
+	the_lnet.ln_routing = 1;
+	lnet_net_unlock(LNET_LOCK_EX);
+
+	return 0;
+
+ failed:
+	lnet_rtrpools_free();
+	return rc;
+}
+
+/*
+ * Record a report that peer \a nid is alive or dead.
+ *
+ * \param ni     interface (LND) making the report, or NULL when the
+ *               report comes from userspace
+ * \param nid    peer being reported on
+ * \param alive  non-zero if the peer is up, zero if it is down
+ * \param when   time the state was observed; must not be in the future
+ *
+ * \retval 0           report handled, ignored because auto-down is
+ *                     disabled, or the peer is not known
+ * \retval -EINVAL     \a nid is on a different net than \a ni, or
+ *                     \a when lies in the future
+ * \retval -ESHUTDOWN  LNet is shutting down
+ */
+int
+lnet_notify(lnet_ni_t *ni, lnet_nid_t nid, int alive, cfs_time_t when)
+{
+	struct lnet_peer	*lp = NULL;
+	cfs_time_t		now = cfs_time_current();
+	int			cpt = lnet_cpt_of_nid(nid);
+
+	LASSERT (!in_interrupt ());
+
+	CDEBUG (D_NET, "%s notifying %s: %s\n",
+		(ni == NULL) ? "userspace" : libcfs_nid2str(ni->ni_nid),
+		libcfs_nid2str(nid),
+		alive ? "up" : "down");
+
+	if (ni != NULL &&
+	    LNET_NIDNET(ni->ni_nid) != LNET_NIDNET(nid)) {
+		CWARN ("Ignoring notification of %s %s by %s (different net)\n",
+			libcfs_nid2str(nid), alive ? "birth" : "death",
+			libcfs_nid2str(ni->ni_nid));
+		return -EINVAL;
+	}
+
+	/* can't do predictions... */
+	if (cfs_time_after(when, now)) {
+		CWARN ("Ignoring prediction from %s of %s %s "
+		       "%ld seconds in the future\n",
+		       (ni == NULL) ? "userspace" : libcfs_nid2str(ni->ni_nid),
+		       libcfs_nid2str(nid), alive ? "up" : "down",
+		       cfs_duration_sec(cfs_time_sub(when, now)));
+		return -EINVAL;
+	}
+
+	if (ni != NULL && !alive &&	     /* LND telling me she's down */
+	    !auto_down) {		       /* auto-down disabled */
+		CDEBUG(D_NET, "Auto-down disabled\n");
+		return 0;
+	}
+
+	lnet_net_lock(cpt);
+
+	if (the_lnet.ln_shutdown) {
+		lnet_net_unlock(cpt);
+		return -ESHUTDOWN;
+	}
+
+	lp = lnet_find_peer_locked(the_lnet.ln_peer_tables[cpt], nid);
+	if (lp == NULL) {
+		/* nid not found */
+		lnet_net_unlock(cpt);
+		CDEBUG(D_NET, "%s not found\n", libcfs_nid2str(nid));
+		return 0;
+	}
+
+	/* We can't fully trust the LND to report an exact last_alive time
+	 * when it notifies us about a dead peer. For example ksocklnd can
+	 * call us with when == _time_when_the_node_was_booted_ if
+	 * no connections were successfully established */
+	if (ni != NULL && !alive && when < lp->lp_last_alive)
+		when = lp->lp_last_alive;
+
+	/* second argument: non-zero when the report came from userspace */
+	lnet_notify_locked(lp, ni == NULL, alive, when);
+
+	lnet_ni_notify_locked(ni, lp);
+
+	/* presumably drops the reference obtained by lnet_find_peer_locked()
+	 * - confirm against that helper */
+	lnet_peer_decref_locked(lp);
+
+	lnet_net_unlock(cpt);
+	return 0;
+}
+EXPORT_SYMBOL(lnet_notify);
+
+/* LNET_ROUTER build: there is nothing to fetch here, so this is a no-op. */
+void
+lnet_get_tunables (void)
+{
+}
+
+#else
+
+/*
+ * Stub used when LNET_ROUTER is not defined: peer state notifications are
+ * not supported, so every call fails with -EOPNOTSUPP and the arguments
+ * are ignored.
+ */
+int
+lnet_notify (lnet_ni_t *ni, lnet_nid_t nid, int alive, cfs_time_t when)
+{
+	return -EOPNOTSUPP;
+}
+
+/*
+ * Polled router checker used when LNET_ROUTER is not defined.
+ *
+ * Each call that is not rate-limited: drains the router checker event
+ * queue, handing every event to lnet_router_checker_event(), then pings
+ * every known router via lnet_ping_router_locked().  If the checker is
+ * STOPPING it prunes all rc data and moves the state to SHUTDOWN instead
+ * of pinging.  Requires LNET_CPT_NUMBER == 1.
+ */
+void
+lnet_router_checker (void)
+{
+	static time_t last = 0;		/* time of the previous run */
+	static int    running = 0;	/* recursion guard */
+
+	time_t	    now = cfs_time_current_sec();
+	int	       interval = now - last;
+	int	       rc;
+	__u64	     version;
+	lnet_peer_t      *rtr;
+
+	/* It's no use calling me again so soon after the last run - all
+	 * intervals and timeouts are measured in seconds */
+	if (last != 0 && interval < 2)
+		return;
+
+	/* complain if the caller polled less often than the longer of the
+	 * two check intervals */
+	if (last != 0 &&
+	    interval > MAX(live_router_check_interval,
+			   dead_router_check_interval))
+		CNETERR("Checker(%d/%d) not called for %d seconds\n",
+			live_router_check_interval, dead_router_check_interval,
+			interval);
+
+	LASSERT(LNET_CPT_NUMBER == 1);
+
+	lnet_net_lock(0);
+	LASSERT(!running); /* recursion check */
+	running = 1;
+	lnet_net_unlock(0);
+
+	last = now;
+
+	if (the_lnet.ln_rc_state == LNET_RC_STATE_STOPPING)
+		lnet_prune_rc_data(0); /* unlink all rcd and nowait */
+
+	/* consume all pending events */
+	while (1) {
+		int	  i;
+		lnet_event_t ev;
+
+		/* NB ln_rc_eqh must be the 1st in 'eventqs' otherwise the
+		 * recursion breaker in LNetEQPoll would fail */
+		rc = LNetEQPoll(&the_lnet.ln_rc_eqh, 1, 0, &ev, &i);
+		if (rc == 0)   /* no event pending */
+			break;
+
+		/* NB a lost SENT prevents me from pinging a router again */
+		if (rc == -EOVERFLOW) {
+			CERROR("Dropped an event!!!\n");
+			abort();
+		}
+
+		LASSERT (rc == 1);
+
+		lnet_router_checker_event(&ev);
+	}
+
+	if (the_lnet.ln_rc_state == LNET_RC_STATE_STOPPING) {
+		lnet_prune_rc_data(1); /* release rcd */
+		the_lnet.ln_rc_state = LNET_RC_STATE_SHUTDOWN;
+		running = 0;
+		return;
+	}
+
+	LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING);
+
+	lnet_net_lock(0);
+
+	version = the_lnet.ln_routers_version;
+	list_for_each_entry (rtr, &the_lnet.ln_routers, lp_rtr_list) {
+		lnet_ping_router_locked(rtr);
+		/* the router list must not change during the walk */
+		LASSERT (version == the_lnet.ln_routers_version);
+	}
+
+	lnet_net_unlock(0);
+
+	running = 0; /* lock only needed for the recursion check */
+	return;
+}
+
+/*
+ * Load the router checker tunables from the environment
+ * (LNET_ROUTER_PING_TIMEOUT, LNET_LIVE_ROUTER_CHECK_INTERVAL,
+ * LNET_DEAD_ROUTER_CHECK_INTERVAL); variables that are not set leave the
+ * current value untouched.  Router checking before use is always turned
+ * on, and a non-positive dead router check interval is replaced by 30.
+ *
+ * NB lnet_peers_start_down depends on me,
+ * so must be called before any peer creation
+ */
+void
+lnet_get_tunables (void)
+{
+	char *val;
+
+	val = getenv("LNET_ROUTER_PING_TIMEOUT");
+	if (val != NULL)
+		router_ping_timeout = atoi(val);
+
+	val = getenv("LNET_LIVE_ROUTER_CHECK_INTERVAL");
+	if (val != NULL)
+		live_router_check_interval = atoi(val);
+
+	val = getenv("LNET_DEAD_ROUTER_CHECK_INTERVAL");
+	if (val != NULL)
+		dead_router_check_interval = atoi(val);
+
+	/* This replaces old lnd_notify mechanism */
+	check_routers_before_use = 1;
+	if (dead_router_check_interval <= 0)
+		dead_router_check_interval = 30;
+}
+
+/* Stub used when LNET_ROUTER is not defined: nothing to free. */
+void
+lnet_rtrpools_free(void)
+{
+}
+
+/*
+ * Stub used when LNET_ROUTER is not defined: allocates nothing, ignores
+ * its argument and always returns 0.
+ */
+int
+lnet_rtrpools_alloc(int im_a_arouter)
+{
+	return 0;
+}
+
+#endif

diff --git a/drivers/staging/lustre/lnet/lnet/router_proc.c b/drivers/staging/lustre/lnet/lnet/router_proc.c
new file mode 100644
index 0000000..3084b0c
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/router_proc.c

@@ -0,0 +1,950 @@
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ *
+ *   This file is part of Portals
+ *   http://sourceforge.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/lib-lnet.h>
+
+#if  defined(LNET_ROUTER)
+
+/* This is really lnet_proc.c. You might need to update sanity test 215
+ * if any file format is changed. */
+
+static ctl_table_header_t *lnet_table_header = NULL;
+
+/* sysctl identifiers for the LNet directory and its entries */
+#define CTL_LNET	 (0x100)
+enum {
+	PSDEV_LNET_STATS = 100,
+	PSDEV_LNET_ROUTES,
+	PSDEV_LNET_ROUTERS,
+	PSDEV_LNET_PEERS,
+	PSDEV_LNET_BUFFERS,
+	PSDEV_LNET_NIS,
+	PSDEV_LNET_PTL_ROTOR,
+};
+
+/*
+ * The read handlers below keep their iteration state in the file position
+ * (*ppos).  From the most significant bit downwards the position holds:
+ *
+ *   [ 1 unused sign bit | CPT | version | hash index | hash offset ]
+ *
+ * with LNET_PROC_CPT_BITS, LNET_PROC_VER_BITS, LNET_PROC_HASH_BITS and
+ * LNET_PROC_HOFF_BITS bits respectively.
+ */
+#define LNET_LOFFT_BITS		(sizeof(loff_t) * 8)
+/*
+ * NB: max allowed LNET_CPT_BITS is 8 on 64-bit system and 2 on 32-bit system
+ */
+#define LNET_PROC_CPT_BITS	(LNET_CPT_BITS + 1)
+/* change version, 16 bits or 8 bits */
+#define LNET_PROC_VER_BITS	MAX(((MIN(LNET_LOFFT_BITS, 64)) / 4), 8)
+
+#define LNET_PROC_HASH_BITS	LNET_PEER_HASH_BITS
+/*
+ * bits for peer hash offset
+ * NB: we don't use the highest bit of *ppos because it's signed
+ */
+#define LNET_PROC_HOFF_BITS	(LNET_LOFFT_BITS -       \
+				 LNET_PROC_CPT_BITS -    \
+				 LNET_PROC_VER_BITS -    \
+				 LNET_PROC_HASH_BITS - 1)
+/* bits for hash index + position */
+#define LNET_PROC_HPOS_BITS	(LNET_PROC_HASH_BITS + LNET_PROC_HOFF_BITS)
+/* bits for peer hash table + hash version */
+#define LNET_PROC_VPOS_BITS	(LNET_PROC_HPOS_BITS + LNET_PROC_VER_BITS)
+
+/* masks for each field, applied after shifting the field down to bit 0 */
+#define LNET_PROC_CPT_MASK	((1ULL << LNET_PROC_CPT_BITS) - 1)
+#define LNET_PROC_VER_MASK	((1ULL << LNET_PROC_VER_BITS) - 1)
+#define LNET_PROC_HASH_MASK	((1ULL << LNET_PROC_HASH_BITS) - 1)
+#define LNET_PROC_HOFF_MASK	((1ULL << LNET_PROC_HOFF_BITS) - 1)
+
+/* extract the CPT field from a position */
+#define LNET_PROC_CPT_GET(pos)				\
+	(int)(((pos) >> LNET_PROC_VPOS_BITS) & LNET_PROC_CPT_MASK)
+
+/* extract the version field from a position */
+#define LNET_PROC_VER_GET(pos)				\
+	(int)(((pos) >> LNET_PROC_HPOS_BITS) & LNET_PROC_VER_MASK)
+
+/* extract the hash index field from a position */
+#define LNET_PROC_HASH_GET(pos)				\
+	(int)(((pos) >> LNET_PROC_HOFF_BITS) & LNET_PROC_HASH_MASK)
+
+/* extract the hash offset field from a position */
+#define LNET_PROC_HOFF_GET(pos)				\
+	(int)((pos) & LNET_PROC_HOFF_MASK)
+
+/* build a position from its four fields; each one is masked to its width */
+#define LNET_PROC_POS_MAKE(cpt, ver, hash, off)		\
+	(((((loff_t)(cpt)) & LNET_PROC_CPT_MASK) << LNET_PROC_VPOS_BITS) |   \
+	((((loff_t)(ver)) & LNET_PROC_VER_MASK) << LNET_PROC_HPOS_BITS) |   \
+	((((loff_t)(hash)) & LNET_PROC_HASH_MASK) << LNET_PROC_HOFF_BITS) | \
+	((off) & LNET_PROC_HOFF_MASK))
+
+/* truncate a full version counter to the width stored in a position */
+#define LNET_PROC_VERSION(v)	((unsigned int)((v) & LNET_PROC_VER_MASK))
+
+/*
+ * Handler behind the LNet "stats" entry.
+ *
+ * A write resets the LNet counters.  A read formats the current counters
+ * as one line of space-separated numbers and copies the part starting at
+ * offset \a pos to \a buffer (at most \a nob bytes), followed by "\n".
+ *
+ * \retval 0        write handled, or \a pos is at/after the end of the text
+ * \retval -ENOMEM  a temporary buffer could not be allocated
+ * \retval other    whatever cfs_trace_copyout_string() returns
+ */
+static int __proc_lnet_stats(void *data, int write,
+			     loff_t pos, void *buffer, int nob)
+{
+	const int	 tmpsiz = 256; /* 7 %u and 4 LPU64 */
+	lnet_counters_t *ctrs;
+	char		*tmpstr;
+	int		 len;
+	int		 rc = 0;
+
+	if (write) {
+		lnet_counters_reset();
+		return 0;
+	}
+
+	/* read */
+
+	LIBCFS_ALLOC(ctrs, sizeof(*ctrs));
+	if (ctrs == NULL)
+		return -ENOMEM;
+
+	LIBCFS_ALLOC(tmpstr, tmpsiz);
+	if (tmpstr == NULL) {
+		rc = -ENOMEM;
+		goto out_ctrs;
+	}
+
+	lnet_counters_get(ctrs);
+
+	len = snprintf(tmpstr, tmpsiz,
+		       "%u %u %u %u %u %u %u "LPU64" "LPU64" "
+		       LPU64" "LPU64,
+		       ctrs->msgs_alloc, ctrs->msgs_max,
+		       ctrs->errors,
+		       ctrs->send_count, ctrs->recv_count,
+		       ctrs->route_count, ctrs->drop_count,
+		       ctrs->send_length, ctrs->recv_length,
+		       ctrs->route_length, ctrs->drop_length);
+
+	/* only copy out when the requested offset lies inside the text */
+	if (pos < min_t(int, len, strlen(tmpstr)))
+		rc = cfs_trace_copyout_string(buffer, nob,
+					      tmpstr + pos, "\n");
+
+	LIBCFS_FREE(tmpstr, tmpsiz);
+ out_ctrs:
+	LIBCFS_FREE(ctrs, sizeof(*ctrs));
+	return rc;
+}
+
+DECLARE_PROC_HANDLER(proc_lnet_stats);
+
+/*
+ * Read handler for the LNet "routes" entry.
+ *
+ * Each call emits at most one record and advances the offset field of
+ * *ppos by one:
+ *  - *ppos == 0: the two header lines; the current remote nets version is
+ *    stored in the position;
+ *  - otherwise: the route selected by the offset (offset 1 is the first
+ *    route), found by walking the remote nets hash under net lock 0.
+ *
+ * On success *lenp is set to the number of bytes produced (0 once the
+ * routes are exhausted).
+ *
+ * \retval 0        success
+ * \retval -ENOMEM  temporary buffer allocation failed
+ * \retval -ESTALE  the remote nets version changed since the first read
+ * \retval -EINVAL  the caller's buffer is smaller than the record
+ * \retval -EFAULT  copying to the user buffer failed
+ */
+int LL_PROC_PROTO(proc_lnet_routes)
+{
+	const int	tmpsiz = 256;
+	char		*tmpstr;
+	char		*s;
+	int		rc = 0;
+	int		len;
+	int		ver;
+	int		off;
+
+	DECLARE_LL_PROC_PPOS_DECL;
+
+	CLASSERT(sizeof(loff_t) >= 4);
+
+	off = LNET_PROC_HOFF_GET(*ppos);
+	ver = LNET_PROC_VER_GET(*ppos);
+
+	LASSERT (!write);
+
+	if (*lenp == 0)
+		return 0;
+
+	LIBCFS_ALLOC(tmpstr, tmpsiz);
+	if (tmpstr == NULL)
+		return -ENOMEM;
+
+	s = tmpstr; /* points to current position in tmpstr[] */
+
+	if (*ppos == 0) {
+		s += snprintf(s, tmpstr + tmpsiz - s, "Routing %s\n",
+			      the_lnet.ln_routing ? "enabled" : "disabled");
+		LASSERT (tmpstr + tmpsiz - s > 0);
+
+		s += snprintf(s, tmpstr + tmpsiz - s, "%-8s %4s %7s %s\n",
+			      "net", "hops", "state", "router");
+		LASSERT (tmpstr + tmpsiz - s > 0);
+
+		/* remember which version of the table this read started on */
+		lnet_net_lock(0);
+		ver = (unsigned int)the_lnet.ln_remote_nets_version;
+		lnet_net_unlock(0);
+		*ppos = LNET_PROC_POS_MAKE(0, ver, 0, off);
+	} else {
+		struct list_head		*n;
+		struct list_head		*r;
+		lnet_route_t		*route = NULL;
+		lnet_remotenet_t	*rnet  = NULL;
+		int			skip  = off - 1; /* routes to pass over */
+		struct list_head		*rn_list;
+		int			i;
+
+		lnet_net_lock(0);
+
+		if (ver != LNET_PROC_VERSION(the_lnet.ln_remote_nets_version)) {
+			lnet_net_unlock(0);
+			LIBCFS_FREE(tmpstr, tmpsiz);
+			return -ESTALE;
+		}
+
+		/* walk hash buckets -> remote nets -> routes until 'skip'
+		 * routes have been passed over */
+		for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE && route == NULL;
+		     i++) {
+			rn_list = &the_lnet.ln_remote_nets_hash[i];
+
+			n = rn_list->next;
+
+			while (n != rn_list && route == NULL) {
+				rnet = list_entry(n, lnet_remotenet_t,
+						      lrn_list);
+
+				r = rnet->lrn_routes.next;
+
+				while (r != &rnet->lrn_routes) {
+					lnet_route_t *re =
+						list_entry(r, lnet_route_t,
+							       lr_list);
+					if (skip == 0) {
+						route = re;
+						break;
+					}
+
+					skip--;
+					r = r->next;
+				}
+
+				n = n->next;
+			}
+		}
+
+		if (route != NULL) {
+			/* rnet still refers to the net owning 'route': both
+			 * outer loops stop as soon as route is set */
+			__u32	net   = rnet->lrn_net;
+			unsigned int hops  = route->lr_hops;
+			lnet_nid_t   nid   = route->lr_gateway->lp_nid;
+			int	  alive = route->lr_gateway->lp_alive;
+
+			s += snprintf(s, tmpstr + tmpsiz - s,
+				      "%-8s %4u %7s %s\n",
+				      libcfs_net2str(net), hops,
+				      alive ? "up" : "down",
+				      libcfs_nid2str(nid));
+			LASSERT(tmpstr + tmpsiz - s > 0);
+		}
+
+		lnet_net_unlock(0);
+	}
+
+	len = s - tmpstr;     /* how many bytes were written */
+
+	if (len > *lenp) {    /* linux-supplied buffer is too small */
+		rc = -EINVAL;
+	} else if (len > 0) { /* wrote something */
+		if (copy_to_user(buffer, tmpstr, len))
+			rc = -EFAULT;
+		else {
+			/* next read continues with the following record */
+			off += 1;
+			*ppos = LNET_PROC_POS_MAKE(0, ver, 0, off);
+		}
+	}
+
+	LIBCFS_FREE(tmpstr, tmpsiz);
+
+	if (rc == 0)
+		*lenp = len;
+
+	return rc;
+}
+
+/*
+ * Read handler for the LNet "routers" entry.
+ *
+ * Each call emits at most one record and advances the offset field of
+ * *ppos by one:
+ *  - *ppos == 0: the header line; the current routers version is stored
+ *    in the position;
+ *  - otherwise: the router selected by the offset (offset 1 is the first
+ *    entry of ln_routers), read under net lock 0.
+ *
+ * On success *lenp is set to the number of bytes produced (0 once the
+ * list is exhausted).
+ *
+ * \retval 0        success
+ * \retval -ENOMEM  temporary buffer allocation failed
+ * \retval -ESTALE  the routers version changed since the first read
+ * \retval -EINVAL  the caller's buffer is smaller than the record
+ * \retval -EFAULT  copying to the user buffer failed
+ */
+int LL_PROC_PROTO(proc_lnet_routers)
+{
+	int	rc = 0;
+	char      *tmpstr;
+	char      *s;
+	const int  tmpsiz = 256;
+	int	len;
+	int	ver;
+	int	off;
+
+	DECLARE_LL_PROC_PPOS_DECL;
+
+	off = LNET_PROC_HOFF_GET(*ppos);
+	ver = LNET_PROC_VER_GET(*ppos);
+
+	LASSERT (!write);
+
+	if (*lenp == 0)
+		return 0;
+
+	LIBCFS_ALLOC(tmpstr, tmpsiz);
+	if (tmpstr == NULL)
+		return -ENOMEM;
+
+	s = tmpstr; /* points to current position in tmpstr[] */
+
+	if (*ppos == 0) {
+		s += snprintf(s, tmpstr + tmpsiz - s,
+			      "%-4s %7s %9s %6s %12s %9s %8s %7s %s\n",
+			      "ref", "rtr_ref", "alive_cnt", "state",
+			      "last_ping", "ping_sent", "deadline",
+			      "down_ni", "router");
+		LASSERT(tmpstr + tmpsiz - s > 0);
+
+		/* remember which version of the list this read started on */
+		lnet_net_lock(0);
+		ver = (unsigned int)the_lnet.ln_routers_version;
+		lnet_net_unlock(0);
+		*ppos = LNET_PROC_POS_MAKE(0, ver, 0, off);
+	} else {
+		struct list_head		*r;
+		struct lnet_peer	*peer = NULL;
+		int			skip = off - 1; /* routers to pass over */
+
+		lnet_net_lock(0);
+
+		if (ver != LNET_PROC_VERSION(the_lnet.ln_routers_version)) {
+			lnet_net_unlock(0);
+
+			LIBCFS_FREE(tmpstr, tmpsiz);
+			return -ESTALE;
+		}
+
+		r = the_lnet.ln_routers.next;
+
+		while (r != &the_lnet.ln_routers) {
+			lnet_peer_t *lp = list_entry(r, lnet_peer_t,
+							 lp_rtr_list);
+
+			if (skip == 0) {
+				peer = lp;
+				break;
+			}
+
+			skip--;
+			r = r->next;
+		}
+
+		if (peer != NULL) {
+			lnet_nid_t nid = peer->lp_nid;
+			cfs_time_t now = cfs_time_current();
+			cfs_time_t deadline = peer->lp_ping_deadline;
+			int nrefs     = peer->lp_refcount;
+			int nrtrrefs  = peer->lp_rtr_refcount;
+			int alive_cnt = peer->lp_alive_count;
+			int alive     = peer->lp_alive;
+			int pingsent  = !peer->lp_ping_notsent;
+			int last_ping = cfs_duration_sec(cfs_time_sub(now,
+						     peer->lp_ping_timestamp));
+			int down_ni   = 0;
+			lnet_route_t *rtr;
+
+			if ((peer->lp_ping_feats &
+			     LNET_PING_FEAT_NI_STATUS) != 0) {
+				list_for_each_entry(rtr, &peer->lp_routes,
+							lr_gwlist) {
+					/* downis on any route should be the
+					 * number of downis on the gateway */
+					if (rtr->lr_downis != 0) {
+						down_ni = rtr->lr_downis;
+						break;
+					}
+				}
+			}
+
+			/* no ping deadline set: print "NA" in that column */
+			if (deadline == 0)
+				s += snprintf(s, tmpstr + tmpsiz - s,
+					      "%-4d %7d %9d %6s %12d %9d %8s %7d %s\n",
+					      nrefs, nrtrrefs, alive_cnt,
+					      alive ? "up" : "down", last_ping,
+					      pingsent, "NA", down_ni,
+					      libcfs_nid2str(nid));
+			else
+				s += snprintf(s, tmpstr + tmpsiz - s,
+					      "%-4d %7d %9d %6s %12d %9d %8lu %7d %s\n",
+					      nrefs, nrtrrefs, alive_cnt,
+					      alive ? "up" : "down", last_ping,
+					      pingsent,
+					      cfs_duration_sec(cfs_time_sub(deadline, now)),
+					      down_ni, libcfs_nid2str(nid));
+			LASSERT (tmpstr + tmpsiz - s > 0);
+		}
+
+		lnet_net_unlock(0);
+	}
+
+	len = s - tmpstr;     /* how many bytes were written */
+
+	if (len > *lenp) {    /* linux-supplied buffer is too small */
+		rc = -EINVAL;
+	} else if (len > 0) { /* wrote something */
+		if (copy_to_user(buffer, tmpstr, len))
+			rc = -EFAULT;
+		else {
+			/* next read continues with the following record */
+			off += 1;
+			*ppos = LNET_PROC_POS_MAKE(0, ver, 0, off);
+		}
+	}
+
+	LIBCFS_FREE(tmpstr, tmpsiz);
+
+	if (rc == 0)
+		*lenp = len;
+
+	return rc;
+}
+
+/*
+ * Read handler for the LNet "peers" entry.
+ *
+ * Each call emits at most one record.  *ppos encodes the CPT, the peer
+ * table version, the hash bucket and the 1-based offset inside that
+ * bucket of the next peer to report:
+ *  - *ppos == 0: the header line is emitted;
+ *  - otherwise: the selected peer is looked up under the net lock of its
+ *    CPT; empty buckets and exhausted CPTs are skipped.
+ *
+ * On success *lenp is set to the number of bytes produced (0 once all
+ * CPTs have been walked).
+ *
+ * \retval 0        success
+ * \retval -ENOMEM  temporary buffer allocation failed
+ * \retval -ESTALE  the peer table version changed while walking a bucket
+ * \retval -EINVAL  the caller's buffer is smaller than the record
+ * \retval -EFAULT  copying to the user buffer failed
+ */
+int LL_PROC_PROTO(proc_lnet_peers)
+{
+	const int		tmpsiz  = 256;
+	struct lnet_peer_table	*ptable;
+	char			*tmpstr;
+	char			*s;
+	int			cpt  = LNET_PROC_CPT_GET(*ppos);
+	int			ver  = LNET_PROC_VER_GET(*ppos);
+	int			hash = LNET_PROC_HASH_GET(*ppos);
+	int			hoff = LNET_PROC_HOFF_GET(*ppos);
+	int			rc = 0;
+	int			len;
+
+	CLASSERT(LNET_PROC_HASH_BITS >= LNET_PEER_HASH_BITS);
+	LASSERT(!write);
+
+	if (*lenp == 0)
+		return 0;
+
+	/* every CPT has been walked: report end of file */
+	if (cpt >= LNET_CPT_NUMBER) {
+		*lenp = 0;
+		return 0;
+	}
+
+	LIBCFS_ALLOC(tmpstr, tmpsiz);
+	if (tmpstr == NULL)
+		return -ENOMEM;
+
+	s = tmpstr; /* points to current position in tmpstr[] */
+
+	if (*ppos == 0) {
+		s += snprintf(s, tmpstr + tmpsiz - s,
+			      "%-24s %4s %5s %5s %5s %5s %5s %5s %5s %s\n",
+			      "nid", "refs", "state", "last", "max",
+			      "rtr", "min", "tx", "min", "queue");
+		LASSERT (tmpstr + tmpsiz - s > 0);
+
+		/* offsets are 1-based, so the first peer is at hoff == 1 */
+		hoff++;
+	} else {
+		struct lnet_peer	*peer;
+		struct list_head		*p;
+		int			skip;
+ again:
+		p = NULL;
+		peer = NULL;
+		skip = hoff - 1;
+
+		lnet_net_lock(cpt);
+		ptable = the_lnet.ln_peer_tables[cpt];
+		/* starting a new hash chain: pick up the current version */
+		if (hoff == 1)
+			ver = LNET_PROC_VERSION(ptable->pt_version);
+
+		if (ver != LNET_PROC_VERSION(ptable->pt_version)) {
+			lnet_net_unlock(cpt);
+			LIBCFS_FREE(tmpstr, tmpsiz);
+			return -ESTALE;
+		}
+
+		while (hash < LNET_PEER_HASH_SIZE) {
+			if (p == NULL)
+				p = ptable->pt_hash[hash].next;
+
+			while (p != &ptable->pt_hash[hash]) {
+				lnet_peer_t *lp = list_entry(p, lnet_peer_t,
+								 lp_hashlist);
+				if (skip == 0) {
+					peer = lp;
+
+					/* minor optimization: start from idx+1
+					 * on next iteration if we've just
+					 * drained lp_hashlist */
+					if (lp->lp_hashlist.next ==
+					    &ptable->pt_hash[hash]) {
+						hoff = 1;
+						hash++;
+					} else {
+						hoff++;
+					}
+
+					break;
+				}
+
+				skip--;
+				p = lp->lp_hashlist.next;
+			}
+
+			if (peer != NULL)
+				break;
+
+			/* nothing (more) in this bucket: try the next one */
+			p = NULL;
+			hoff = 1;
+			hash++;
+		}
+
+		if (peer != NULL) {
+			/* copy everything needed while the lock is held */
+			lnet_nid_t nid       = peer->lp_nid;
+			int	nrefs     = peer->lp_refcount;
+			int	lastalive = -1;
+			char      *aliveness = "NA";
+			int	maxcr     = peer->lp_ni->ni_peertxcredits;
+			int	txcr      = peer->lp_txcredits;
+			int	mintxcr   = peer->lp_mintxcredits;
+			int	rtrcr     = peer->lp_rtrcredits;
+			int	minrtrcr  = peer->lp_minrtrcredits;
+			int	txqnob    = peer->lp_txqnob;
+
+			if (lnet_isrouter(peer) ||
+			    lnet_peer_aliveness_enabled(peer))
+				aliveness = peer->lp_alive ? "up" : "down";
+
+			if (lnet_peer_aliveness_enabled(peer)) {
+				cfs_time_t     now = cfs_time_current();
+				cfs_duration_t delta;
+
+				delta = cfs_time_sub(now, peer->lp_last_alive);
+				lastalive = cfs_duration_sec(delta);
+
+				/* No need to mess up peers contents with
+				 * arbitrarily long integers - it suffices to
+				 * know that lastalive is more than 10000s old
+				 */
+				if (lastalive >= 10000)
+					lastalive = 9999;
+			}
+
+			lnet_net_unlock(cpt);
+
+			s += snprintf(s, tmpstr + tmpsiz - s,
+				      "%-24s %4d %5s %5d %5d %5d %5d %5d %5d %d\n",
+				      libcfs_nid2str(nid), nrefs, aliveness,
+				      lastalive, maxcr, rtrcr, minrtrcr, txcr,
+				      mintxcr, txqnob);
+			LASSERT (tmpstr + tmpsiz - s > 0);
+
+		} else { /* peer is NULL */
+			lnet_net_unlock(cpt);
+		}
+
+		/* this CPT's table is exhausted: move on to the next CPT,
+		 * immediately if nothing was found in this one */
+		if (hash == LNET_PEER_HASH_SIZE) {
+			cpt++;
+			hash = 0;
+			hoff = 1;
+			if (peer == NULL && cpt < LNET_CPT_NUMBER)
+				goto again;
+		}
+	}
+
+	len = s - tmpstr;     /* how many bytes were written */
+
+	if (len > *lenp) {    /* linux-supplied buffer is too small */
+		rc = -EINVAL;
+	} else if (len > 0) { /* wrote something */
+		if (copy_to_user(buffer, tmpstr, len))
+			rc = -EFAULT;
+		else
+			*ppos = LNET_PROC_POS_MAKE(cpt, ver, hash, hoff);
+	}
+
+	LIBCFS_FREE(tmpstr, tmpsiz);
+
+	if (rc == 0)
+		*lenp = len;
+
+	return rc;
+}
+
+/*
+ * Read handler for the "buffers" entry: formats one header line and then,
+ * when the_lnet.ln_rtrpools is set, one line per (pool index, CPT) pair
+ * showing that pool's rbp_npages, rbp_nbuffers, rbp_credits and
+ * rbp_mincredits.
+ *
+ * Writing is not supported (asserted).  Returns -ENOMEM if the scratch
+ * buffer cannot be allocated, 0 when pos is at or past the end of the
+ * formatted text, and otherwise whatever cfs_trace_copyout_string() returns
+ * for the text starting at offset pos.
+ */
+static int __proc_lnet_buffers(void *data, int write,
+			       loff_t pos, void *buffer, int nob)
+{
+	char	    *s;
+	char	    *tmpstr;
+	int		tmpsiz;
+	int		idx;
+	int		len;
+	int		rc;
+	int		i;
+
+	LASSERT(!write);
+
+	/* budget 64 bytes per output line (four %d fields each); the product
+	 * covers the header plus LNET_NRBPOOLS lines for every CPT */
+	tmpsiz = 64 * (LNET_NRBPOOLS + 1) * LNET_CPT_NUMBER;
+	LIBCFS_ALLOC(tmpstr, tmpsiz);
+	if (tmpstr == NULL)
+		return -ENOMEM;
+
+	s = tmpstr; /* points to current position in tmpstr[] */
+
+	s += snprintf(s, tmpstr + tmpsiz - s,
+		      "%5s %5s %7s %7s\n",
+		      "pages", "count", "credits", "min");
+	LASSERT (tmpstr + tmpsiz - s > 0);
+
+	if (the_lnet.ln_rtrpools == NULL)
+		goto out; /* I'm not a router */
+
+	for (idx = 0; idx < LNET_NRBPOOLS; idx++) {
+		lnet_rtrbufpool_t *rbp;
+
+		/* the exclusive net lock is taken once per pool index and held
+		 * while that index is printed for every CPT */
+		lnet_net_lock(LNET_LOCK_EX);
+		cfs_percpt_for_each(rbp, i, the_lnet.ln_rtrpools) {
+			s += snprintf(s, tmpstr + tmpsiz - s,
+				      "%5d %5d %7d %7d\n",
+				      rbp[idx].rbp_npages,
+				      rbp[idx].rbp_nbuffers,
+				      rbp[idx].rbp_credits,
+				      rbp[idx].rbp_mincredits);
+			LASSERT(tmpstr + tmpsiz - s > 0);
+		}
+		lnet_net_unlock(LNET_LOCK_EX);
+	}
+
+ out:
+	len = s - tmpstr;
+
+	/* nothing left to hand back once pos reaches the end of the text */
+	if (pos >= min_t(int, len, strlen(tmpstr)))
+		rc = 0;
+	else
+		rc = cfs_trace_copyout_string(buffer, nob,
+					      tmpstr + pos, NULL);
+
+	LIBCFS_FREE(tmpstr, tmpsiz);
+	return rc;
+}
+
+DECLARE_PROC_HANDLER(proc_lnet_buffers);
+
+/*
+ * Read handler for the "nis" entry.  *ppos is used as a record cursor
+ * rather than a byte offset: position 0 yields the column header, and
+ * position N (N >= 1) yields the rows for the (N-1)th entry of
+ * the_lnet.ln_nis.  Each successful call that produced output advances
+ * *ppos by one.
+ *
+ * NOTE(review): write, buffer, lenp and ppos are not declared in this block;
+ * presumably they come from LL_PROC_PROTO / DECLARE_LL_PROC_PPOS_DECL.
+ *
+ * Returns 0 on success (with *lenp set to the number of bytes produced),
+ * -ENOMEM if the scratch buffer cannot be allocated, -EINVAL if the output
+ * is larger than *lenp, or -EFAULT if copy_to_user() fails.
+ */
+int LL_PROC_PROTO(proc_lnet_nis)
+{
+	int	tmpsiz = 128 * LNET_CPT_NUMBER;
+	int	rc = 0;
+	char      *tmpstr;
+	char      *s;
+	int	len;
+
+	DECLARE_LL_PROC_PPOS_DECL;
+
+	LASSERT (!write);
+
+	if (*lenp == 0)
+		return 0;
+
+	LIBCFS_ALLOC(tmpstr, tmpsiz);
+	if (tmpstr == NULL)
+		return -ENOMEM;
+
+	s = tmpstr; /* points to current position in tmpstr[] */
+
+	if (*ppos == 0) {
+		s += snprintf(s, tmpstr + tmpsiz - s,
+			      "%-24s %6s %5s %4s %4s %4s %5s %5s %5s\n",
+			      "nid", "status", "alive", "refs", "peer",
+			      "rtr", "max", "tx", "min");
+		LASSERT (tmpstr + tmpsiz - s > 0);
+	} else {
+		struct list_head	*n;
+		lnet_ni_t	 *ni   = NULL;
+		int		skip = *ppos - 1;
+
+		/* net lock 0 is held for the whole lookup and formatting */
+		lnet_net_lock(0);
+
+		n = the_lnet.ln_nis.next;
+
+		/* walk the NI list until 'skip' entries have been passed */
+		while (n != &the_lnet.ln_nis) {
+			lnet_ni_t *a_ni = list_entry(n, lnet_ni_t, ni_list);
+
+			if (skip == 0) {
+				ni = a_ni;
+				break;
+			}
+
+			skip--;
+			n = n->next;
+		}
+
+		if (ni != NULL) {
+			struct lnet_tx_queue	*tq;
+			char	*stat;
+			long	now = cfs_time_current_sec();
+			int	last_alive = -1;
+			int	i;
+			int	j;
+
+			/* last_alive stays -1 unless routing is enabled */
+			if (the_lnet.ln_routing)
+				last_alive = now - ni->ni_last_alive;
+
+			/* @lo forever alive */
+			if (ni->ni_lnd->lnd_type == LOLND)
+				last_alive = 0;
+
+			lnet_ni_lock(ni);
+			LASSERT(ni->ni_status != NULL);
+			stat = (ni->ni_status->ns_status ==
+				LNET_NI_STATUS_UP) ? "up" : "down";
+			lnet_ni_unlock(ni);
+
+			/* we actually output credits information for
+			 * TX queue of each partition */
+			cfs_percpt_for_each(tq, i, ni->ni_tx_queues) {
+				/* look for partition i in ni_cpts[]; the loop
+				 * does not run at all when ni_cpts is NULL,
+				 * which leaves j at 0 */
+				for (j = 0; ni->ni_cpts != NULL &&
+				     j < ni->ni_ncpts; j++) {
+					if (i == ni->ni_cpts[j])
+						break;
+				}
+
+				/* ran off the end: partition i is not listed */
+				if (j == ni->ni_ncpts)
+					continue;
+
+				/* lock 0 is already held, so only take the
+				 * other partitions' locks here */
+				if (i != 0)
+					lnet_net_lock(i);
+
+				s += snprintf(s, tmpstr + tmpsiz - s,
+				      "%-24s %6s %5d %4d %4d %4d %5d %5d %5d\n",
+				      libcfs_nid2str(ni->ni_nid), stat,
+				      last_alive, *ni->ni_refs[i],
+				      ni->ni_peertxcredits,
+				      ni->ni_peerrtrcredits,
+				      tq->tq_credits_max,
+				      tq->tq_credits, tq->tq_credits_min);
+				if (i != 0)
+					lnet_net_unlock(i);
+			}
+			LASSERT(tmpstr + tmpsiz - s > 0);
+		}
+
+		lnet_net_unlock(0);
+	}
+
+	len = s - tmpstr;     /* how many bytes was written */
+
+	if (len > *lenp) {    /* linux-supplied buffer is too small */
+		rc = -EINVAL;
+	} else if (len > 0) { /* wrote something */
+		if (copy_to_user(buffer, tmpstr, len))
+			rc = -EFAULT;
+		else
+			*ppos += 1; /* next call reports the next record */
+	}
+
+	LIBCFS_FREE(tmpstr, tmpsiz);
+
+	if (rc == 0)
+		*lenp = len;
+
+	return rc;
+}
+
+/* One selectable setting of the portal_rotor tunable. */
+struct lnet_portal_rotors {
+	int	     pr_value;	/* value stored in / compared with portal_rotor */
+	const char      *pr_name;	/* name matched on write, printed on read */
+	const char	*pr_desc;	/* description printed on read */
+};
+
+/*
+ * Table of the accepted portal_rotor settings.  It is only ever read by the
+ * portal_rotor handler, so it is declared const.  The list is terminated by
+ * an entry whose pr_value is negative and whose pr_name is NULL; lookups
+ * rely on one or the other to stop.
+ */
+static const struct lnet_portal_rotors	portal_rotors[] = {
+	{
+		.pr_value = LNET_PTL_ROTOR_OFF,
+		.pr_name  = "OFF",
+		.pr_desc  = "Turn off message rotor for wildcard portals"
+	},
+	{
+		.pr_value = LNET_PTL_ROTOR_ON,
+		.pr_name  = "ON",
+		.pr_desc  = "round-robin dispatch all PUT messages for "
+			    "wildcard portals"
+	},
+	{
+		.pr_value = LNET_PTL_ROTOR_RR_RT,
+		.pr_name  = "RR_RT",
+		.pr_desc  = "round-robin dispatch routed PUT message for "
+			    "wildcard portals"
+	},
+	{
+		.pr_value = LNET_PTL_ROTOR_HASH_RT,
+		.pr_name  = "HASH_RT",
+		.pr_desc  = "dispatch routed PUT message by hashing source "
+			    "NID for wildcard portals"
+	},
+	{
+		/* sentinel: end of table */
+		.pr_value = -1,
+		.pr_name  = NULL,
+		.pr_desc  = NULL
+	},
+};
+
+extern int portal_rotor;
+
+/*
+ * Read/write handler for the "portal_rotor" entry.
+ *
+ * Read (write == 0): looks up the table entry matching the current
+ * portal_rotor value, formats its name and description, and copies the text
+ * from offset pos to the caller.  Returns 0 when pos is at or past the end
+ * of the text, otherwise the result of cfs_trace_copyout_string().
+ *
+ * Write: copies the caller's string in, trims surrounding whitespace and
+ * sets portal_rotor to the first table entry whose name matches.  Returns 0
+ * on a match, -EINVAL when nothing matches, or the negative value returned
+ * by cfs_trace_copyin_string().
+ *
+ * Returns -ENOMEM in either direction if the scratch buffer cannot be
+ * allocated.
+ */
+static int __proc_lnet_portal_rotor(void *data, int write,
+				    loff_t pos, void *buffer, int nob)
+{
+	const int	buf_len	= 128;
+	char		*buf;
+	char		*tmp;
+	int		rc;
+	int		i;
+
+	LIBCFS_ALLOC(buf, buf_len);
+	if (buf == NULL)
+		return -ENOMEM;
+
+	if (!write) {
+		lnet_res_lock(0);
+
+		/* stops either on a match or on the negative sentinel */
+		for (i = 0; portal_rotors[i].pr_value >= 0; i++) {
+			if (portal_rotors[i].pr_value == portal_rotor)
+				break;
+		}
+
+		/* the current value must be one of the table entries */
+		LASSERT(portal_rotors[i].pr_value == portal_rotor);
+		lnet_res_unlock(0);
+
+		rc = snprintf(buf, buf_len,
+			      "{\n\tportals: all\n"
+			      "\trotor: %s\n\tdescription: %s\n}",
+			      portal_rotors[i].pr_name,
+			      portal_rotors[i].pr_desc);
+
+		if (pos >= min_t(int, rc, buf_len)) {
+			rc = 0;
+		} else {
+			rc = cfs_trace_copyout_string(buffer, nob,
+					buf + pos, "\n");
+		}
+		goto out;
+	}
+
+	rc = cfs_trace_copyin_string(buf, buf_len, buffer, nob);
+	if (rc < 0)
+		goto out;
+
+	tmp = cfs_trimwhite(buf);
+
+	rc = -EINVAL;
+	lnet_res_lock(0);
+	for (i = 0; portal_rotors[i].pr_name != NULL; i++) {
+		/* the comparison length is the table name's length, so the
+		 * input only has to begin with that name */
+		if (cfs_strncasecmp(portal_rotors[i].pr_name, tmp,
+				    strlen(portal_rotors[i].pr_name)) == 0) {
+			portal_rotor = portal_rotors[i].pr_value;
+			rc = 0;
+			break;
+		}
+	}
+	lnet_res_unlock(0);
+out:
+	LIBCFS_FREE(buf, buf_len);
+	return rc;
+}
+DECLARE_PROC_HANDLER(proc_lnet_portal_rotor);
+
+/*
+ * Entries registered under the "lnet" directory (see top_table).  Each entry
+ * binds a name and mode to its handler; the list ends with an entry that
+ * only carries INIT_CTL_NAME(0).
+ */
+static ctl_table_t lnet_table[] = {
+	/*
+	 * NB No .strategy entries have been provided since sysctl(8) prefers
+	 * to go via /proc for portability.
+	 */
+	{
+		INIT_CTL_NAME(PSDEV_LNET_STATS)
+		.procname = "stats",
+		.mode     = 0644,
+		.proc_handler = &proc_lnet_stats,
+	},
+	{
+		INIT_CTL_NAME(PSDEV_LNET_ROUTES)
+		.procname = "routes",
+		.mode     = 0444,
+		.proc_handler = &proc_lnet_routes,
+	},
+	{
+		INIT_CTL_NAME(PSDEV_LNET_ROUTERS)
+		.procname = "routers",
+		.mode     = 0444,
+		.proc_handler = &proc_lnet_routers,
+	},
+	{
+		INIT_CTL_NAME(PSDEV_LNET_PEERS)
+		.procname = "peers",
+		.mode     = 0444,
+		.proc_handler = &proc_lnet_peers,
+	},
+	{
+		/* NOTE(review): same ctl name as the "peers" entry above;
+		 * looks like a copy/paste slip -- confirm whether a dedicated
+		 * identifier for "buffers" exists and should be used here */
+		INIT_CTL_NAME(PSDEV_LNET_PEERS)
+		.procname = "buffers",
+		.mode     = 0444,
+		.proc_handler = &proc_lnet_buffers,
+	},
+	{
+		INIT_CTL_NAME(PSDEV_LNET_NIS)
+		.procname = "nis",
+		.mode     = 0444,
+		.proc_handler = &proc_lnet_nis,
+	},
+	{
+		INIT_CTL_NAME(PSDEV_LNET_PTL_ROTOR)
+		.procname = "portal_rotor",
+		.mode     = 0644,
+		.proc_handler = &proc_lnet_portal_rotor,
+	},
+	{
+		INIT_CTL_NAME(0)
+	}
+};
+
+/*
+ * Root table handed to cfs_register_sysctl_table(): a single read-only
+ * "lnet" directory whose children are the entries of lnet_table.
+ */
+static ctl_table_t top_table[] = {
+	{
+		INIT_CTL_NAME(CTL_LNET)
+		.procname = "lnet",
+		.mode     = 0555,
+		.data     = NULL,
+		.maxlen   = 0,
+		.child    = lnet_table,
+	},
+	{
+		INIT_CTL_NAME(0)
+	}
+};
+
+/*
+ * Register the lnet sysctl tree.  Does nothing when CONFIG_SYSCTL is unset
+ * or when the table has already been registered.
+ */
+void
+lnet_proc_init(void)
+{
+#ifdef CONFIG_SYSCTL
+	if (lnet_table_header != NULL)
+		return;
+
+	lnet_table_header = cfs_register_sysctl_table(top_table, 0);
+#endif
+}
+
+/*
+ * Unregister the lnet sysctl tree if it is registered and forget the
+ * header.  Does nothing when CONFIG_SYSCTL is unset.
+ */
+void
+lnet_proc_fini(void)
+{
+#ifdef CONFIG_SYSCTL
+	if (lnet_table_header == NULL)
+		return;
+
+	unregister_sysctl_table(lnet_table_header);
+	lnet_table_header = NULL;
+#endif
+}
+
+#else
+
+/*
+ * Empty stand-in used by the #else branch of the conditional that guards
+ * the handlers above (the condition itself is outside this view).
+ */
+void
+lnet_proc_init(void)
+{
+}
+
+/* Empty counterpart of the stub above; there is nothing to tear down. */
+void
+lnet_proc_fini(void)
+{
+}
+
+#endif

diff --git a/drivers/staging/lustre/lnet/selftest/Makefile b/drivers/staging/lustre/lnet/selftest/Makefile
new file mode 100644
index 0000000..1e40aee
--- /dev/null
+++ b/drivers/staging/lustre/lnet/selftest/Makefile

@@ -0,0 +1,6 @@
+obj-$(CONFIG_LNET_SELFTEST) := lnet_selftest.o
+
+lnet_selftest-y := console.o conrpc.o conctl.o framework.o timer.o rpc.o \
+		   module.o ping_test.o brw_test.o
+
+ccflags-y := -I$(src)/../include

diff --git a/drivers/staging/lustre/lnet/selftest/brw_test.c b/drivers/staging/lustre/lnet/selftest/brw_test.c
new file mode 100644
index 0000000..3bb6fbe
--- /dev/null
+++ b/drivers/staging/lustre/lnet/selftest/brw_test.c

@@ -0,0 +1,499 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/brw_test.c
+ *
+ * Author: Isaac Huang <isaac@clusterfs.com>
+ */
+
+#include "selftest.h"
+
+/* Copied into brw_test_service.sv_wi_total by brw_init_test_service(). */
+static int brw_srv_workitems = SFW_TEST_WI_MAX;
+CFS_MODULE_PARM(brw_srv_workitems, "i", int, 0644, "# BRW server workitems");
+
+/* Remaining error budget; decremented by brw_inject_one_error(). */
+static int brw_inject_errors;
+CFS_MODULE_PARM(brw_inject_errors, "i", int, 0644,
+		"# data errors to inject randomly, zero by default");
+
+/*
+ * Free the bulk descriptor attached to each test unit of a client test
+ * instance and clear the unit's private pointer.  Units without a
+ * descriptor are left alone.
+ */
+static void
+brw_client_fini(sfw_test_instance_t *tsi)
+{
+	sfw_test_unit_t *unit;
+
+	LASSERT(tsi->tsi_is_client);
+
+	list_for_each_entry(unit, &tsi->tsi_units, tsu_list) {
+		srpc_bulk_t *bk = unit->tsu_private;
+
+		if (bk != NULL) {
+			srpc_free_bulk(bk);
+			unit->tsu_private = NULL;
+		}
+	}
+}
+
+/*
+ * Validate the bulk test parameters of a client test instance and
+ * allocate one bulk descriptor per test unit.
+ *
+ * The parameters are read from the v1 request when the session has
+ * LST_FEAT_BULK_LEN set, and from the v0 request otherwise.
+ *
+ * Returns 0 on success, -EINVAL for an out-of-range page count or an
+ * unknown opcode/check flag, and -ENOMEM if a descriptor cannot be
+ * allocated (descriptors allocated so far are released first).
+ */
+int
+brw_client_init(sfw_test_instance_t *tsi)
+{
+	sfw_session_t	*sn = tsi->tsi_batch->bat_session;
+	sfw_test_unit_t	*unit;
+	int		 opc;
+	int		 flags;
+	int		 len;
+	int		 npg;
+
+	LASSERT(sn != NULL);
+	LASSERT(tsi->tsi_is_client);
+
+	if ((sn->sn_features & LST_FEAT_BULK_LEN) != 0) {
+		test_bulk_req_v1_t *v1 = &tsi->tsi_u.bulk_v1;
+
+		/* I should never get this step if it's unknown feature
+		 * because make_session will reject unknown feature */
+		LASSERT((sn->sn_features & ~LST_FEATS_MASK) == 0);
+
+		opc   = v1->blk_opc;
+		flags = v1->blk_flags;
+		len   = v1->blk_len;
+		/* round the byte length up to whole pages */
+		npg   = (len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	} else {
+		test_bulk_req_t *v0 = &tsi->tsi_u.bulk_v0;
+
+		opc   = v0->blk_opc;
+		flags = v0->blk_flags;
+		npg   = v0->blk_npg;
+		/* NB: this is not going to work for variable page size,
+		 * but we have to keep it for compatibility */
+		len   = npg * PAGE_CACHE_SIZE;
+	}
+
+	if (npg <= 0 || npg > LNET_MAX_IOV)
+		return -EINVAL;
+
+	if (!(opc == LST_BRW_READ || opc == LST_BRW_WRITE))
+		return -EINVAL;
+
+	if (!(flags == LST_BRW_CHECK_NONE ||
+	      flags == LST_BRW_CHECK_FULL ||
+	      flags == LST_BRW_CHECK_SIMPLE))
+		return -EINVAL;
+
+	list_for_each_entry(unit, &tsi->tsi_units, tsu_list) {
+		srpc_bulk_t *bk;
+
+		bk = srpc_alloc_bulk(lnet_cpt_of_nid(unit->tsu_dest.nid),
+				     npg, len, opc == LST_BRW_READ);
+		if (bk == NULL) {
+			/* undo the allocations made for earlier units */
+			brw_client_fini(tsi);
+			return -ENOMEM;
+		}
+
+		unit->tsu_private = bk;
+	}
+
+	return 0;
+}
+
+/* Pattern written into buffers that are about to receive bulk data. */
+#define BRW_POISON      0xbeefbeefbeefbeefULL
+/* Pattern written into buffers that are sent and expected by the checker. */
+#define BRW_MAGIC       0xeeb0eeb1eeb2eeb3ULL
+/* Size in bytes of one pattern word. */
+#define BRW_MSIZE       sizeof(__u64)
+
+/*
+ * Decide whether to corrupt the next page.  Returns 0 when injection is
+ * disabled (brw_inject_errors <= 0) or when the current microsecond count
+ * is even; otherwise returns the current budget and decrements it.
+ */
+int
+brw_inject_one_error(void)
+{
+	struct timeval now;
+
+	if (brw_inject_errors > 0) {
+		do_gettimeofday(&now);
+
+		if ((now.tv_usec & 1) != 0)
+			return brw_inject_errors--;
+	}
+
+	return 0;
+}
+
+/*
+ * Stamp a page with 'magic' according to 'pattern':
+ *   LST_BRW_CHECK_NONE   - leave the page untouched;
+ *   LST_BRW_CHECK_SIMPLE - write the first and the last word of the page;
+ *   LST_BRW_CHECK_FULL   - write every word of the page.
+ * Any other pattern is a bug.  When 'magic' is BRW_MAGIC the value may be
+ * perturbed by brw_inject_one_error() before it is written.
+ */
+void
+brw_fill_page(struct page *pg, int pattern, __u64 magic)
+{
+	char *base = page_address(pg);
+	char *cur;
+
+	LASSERT(base != NULL);
+
+	if (pattern == LST_BRW_CHECK_NONE)
+		return;
+
+	if (magic == BRW_MAGIC)
+		magic += brw_inject_one_error();
+
+	if (pattern == LST_BRW_CHECK_SIMPLE) {
+		memcpy(base, &magic, BRW_MSIZE);
+		memcpy(base + PAGE_CACHE_SIZE - BRW_MSIZE, &magic, BRW_MSIZE);
+	} else if (pattern == LST_BRW_CHECK_FULL) {
+		for (cur = base; cur < base + PAGE_CACHE_SIZE;
+		     cur += BRW_MSIZE)
+			memcpy(cur, &magic, BRW_MSIZE);
+	} else {
+		LBUG();
+	}
+}
+
+/*
+ * Verify that a page carries 'magic' in the positions selected by
+ * 'pattern' (see brw_fill_page).  Returns 0 when the page is good or when
+ * the pattern is LST_BRW_CHECK_NONE, and 1 after logging the first
+ * mismatching word.  An unknown pattern is a bug.
+ */
+int
+brw_check_page(struct page *pg, int pattern, __u64 magic)
+{
+	char  *base = page_address(pg);
+	__u64  seen = 0; /* make compiler happy */
+
+	LASSERT(base != NULL);
+
+	if (pattern == LST_BRW_CHECK_NONE)
+		return 0;
+
+	if (pattern == LST_BRW_CHECK_SIMPLE) {
+		/* first word, then last word of the page */
+		seen = *(__u64 *)base;
+		if (seen == magic) {
+			seen = *(__u64 *)(base + PAGE_CACHE_SIZE - BRW_MSIZE);
+			if (seen == magic)
+				return 0;
+		}
+	} else if (pattern == LST_BRW_CHECK_FULL) {
+		__u64 *word = (__u64 *)base;
+		__u64 *end  = (__u64 *)(base + PAGE_CACHE_SIZE);
+
+		for (; word < end; word++) {
+			seen = *word;
+			if (seen != magic)
+				break;
+		}
+
+		if (word == end)
+			return 0;
+	} else {
+		LBUG();
+	}
+
+	CERROR ("Bad data in page %p: "LPX64", "LPX64" expected\n",
+		pg, seen, magic);
+	return 1;
+}
+
+/* Apply brw_fill_page() to every page of a bulk descriptor. */
+void
+brw_fill_bulk(srpc_bulk_t *bk, int pattern, __u64 magic)
+{
+	int idx;
+
+	for (idx = 0; idx < bk->bk_niov; idx++)
+		brw_fill_page(bk->bk_iovs[idx].kiov_page, pattern, magic);
+}
+
+/*
+ * Check every page of a bulk descriptor with brw_check_page().  Stops at
+ * the first bad page, logs it and returns 1; returns 0 when all pages pass.
+ */
+int
+brw_check_bulk(srpc_bulk_t *bk, int pattern, __u64 magic)
+{
+	int idx;
+
+	for (idx = 0; idx < bk->bk_niov; idx++) {
+		struct page *page = bk->bk_iovs[idx].kiov_page;
+
+		if (brw_check_page(page, pattern, magic) == 0)
+			continue;
+
+		CERROR ("Bulk page %p (%d/%d) is corrupted!\n",
+			page, idx, bk->bk_niov);
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Build the BRW request RPC for one test unit.
+ *
+ * The opcode, check flags, length and page count are taken from the test
+ * request stored in the instance (v0 or v1 layout depending on
+ * LST_FEAT_BULK_LEN).  After the RPC is created, the unit's pre-allocated
+ * bulk descriptor is copied into it, its pages are stamped with BRW_MAGIC
+ * for a write or BRW_POISON otherwise, and the request body is filled in.
+ *
+ * Returns 0 and stores the new RPC in *rpcpp, or the non-zero value
+ * returned by sfw_create_test_rpc().
+ */
+static int
+brw_client_prep_rpc (sfw_test_unit_t *tsu,
+		     lnet_process_id_t dest, srpc_client_rpc_t **rpcpp)
+{
+	srpc_bulk_t	 *bulk = tsu->tsu_private;
+	sfw_test_instance_t *tsi = tsu->tsu_instance;
+	sfw_session_t	    *sn = tsi->tsi_batch->bat_session;
+	srpc_client_rpc_t   *rpc;
+	srpc_brw_reqst_t    *req;
+	int		     flags;
+	int		     npg;
+	int		     len;
+	int		     opc;
+	int		     rc;
+
+	LASSERT(sn != NULL);
+	LASSERT(bulk != NULL);
+
+	if ((sn->sn_features & LST_FEAT_BULK_LEN) == 0) {
+		test_bulk_req_t *breq = &tsi->tsi_u.bulk_v0;
+
+		opc   = breq->blk_opc;
+		flags = breq->blk_flags;
+		npg   = breq->blk_npg;
+		len   = npg * PAGE_CACHE_SIZE;
+
+	} else {
+		test_bulk_req_v1_t  *breq = &tsi->tsi_u.bulk_v1;
+
+		/* I should never get this step if it's unknown feature
+		 * because make_session will reject unknown feature */
+		LASSERT((sn->sn_features & ~LST_FEATS_MASK) == 0);
+
+		opc   = breq->blk_opc;
+		flags = breq->blk_flags;
+		len   = breq->blk_len;
+		npg   = (len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	}
+
+	rc = sfw_create_test_rpc(tsu, dest, sn->sn_features, npg, len, &rpc);
+	if (rc != 0)
+		return rc;
+
+	/* copy the descriptor header together with its first npg iov slots */
+	memcpy(&rpc->crpc_bulk, bulk, offsetof(srpc_bulk_t, bk_iovs[npg]));
+	if (opc == LST_BRW_WRITE)
+		brw_fill_bulk(&rpc->crpc_bulk, flags, BRW_MAGIC);
+	else
+		brw_fill_bulk(&rpc->crpc_bulk, flags, BRW_POISON);
+
+	req = &rpc->crpc_reqstmsg.msg_body.brw_reqst;
+	req->brw_flags = flags;
+	req->brw_rw    = opc;
+	req->brw_len   = len;
+
+	*rpcpp = rpc;
+	return 0;
+}
+
+/*
+ * Completion handler for a client BRW RPC.
+ *
+ * Counts a session BRW error when the RPC itself failed (unless the test
+ * instance is stopping), when the reply carries a non-zero brw_status, or
+ * when the data received for a read does not match the expected pattern.
+ * In the last two cases rpc->crpc_status is set as well.
+ */
+static void
+brw_client_done_rpc (sfw_test_unit_t *tsu, srpc_client_rpc_t *rpc)
+{
+	__u64		magic = BRW_MAGIC;
+	sfw_test_instance_t *tsi = tsu->tsu_instance;
+	sfw_session_t       *sn = tsi->tsi_batch->bat_session;
+	srpc_msg_t	  *msg = &rpc->crpc_replymsg;
+	srpc_brw_reply_t    *reply = &msg->msg_body.brw_reply;
+	srpc_brw_reqst_t    *reqst = &rpc->crpc_reqstmsg.msg_body.brw_reqst;
+
+	LASSERT (sn != NULL);
+
+	if (rpc->crpc_status != 0) {
+		CERROR ("BRW RPC to %s failed with %d\n",
+			libcfs_id2str(rpc->crpc_dest), rpc->crpc_status);
+		if (!tsi->tsi_stopping) /* rpc could have been aborted */
+			atomic_inc(&sn->sn_brw_errors);
+		goto out;
+	}
+
+	/* reply magic differs from ours: byte-swap the expected pattern and
+	 * the status (presumably the peer has the opposite byte order) */
+	if (msg->msg_magic != SRPC_MSG_MAGIC) {
+		__swab64s(&magic);
+		__swab32s(&reply->brw_status);
+	}
+
+	CDEBUG (reply->brw_status ? D_WARNING : D_NET,
+		"BRW RPC to %s finished with brw_status: %d\n",
+		libcfs_id2str(rpc->crpc_dest), reply->brw_status);
+
+	if (reply->brw_status != 0) {
+		atomic_inc(&sn->sn_brw_errors);
+		/* the reply status is stored positive; negate it locally */
+		rpc->crpc_status = -(int)reply->brw_status;
+		goto out;
+	}
+
+	/* nothing was received for a write, so there is nothing to verify */
+	if (reqst->brw_rw == LST_BRW_WRITE) goto out;
+
+	if (brw_check_bulk(&rpc->crpc_bulk, reqst->brw_flags, magic) != 0) {
+		CERROR ("Bulk data from %s is corrupted!\n",
+			libcfs_id2str(rpc->crpc_dest));
+		atomic_inc(&sn->sn_brw_errors);
+		rpc->crpc_status = -EBADMSG;
+	}
+
+out:
+	return;
+}
+
+/*
+ * Server-side completion callback: log the outcome of the bulk transfer
+ * and release the pages of the RPC.  Does nothing when the RPC has no bulk
+ * descriptor.
+ */
+void
+brw_server_rpc_done(srpc_server_rpc_t *rpc)
+{
+	srpc_bulk_t *bk = rpc->srpc_bulk;
+
+	if (bk != NULL) {
+		if (rpc->srpc_status != 0) {
+			CERROR ("Bulk transfer %s %s has failed: %d\n",
+				bk->bk_sink ? "from" : "to",
+				libcfs_id2str(rpc->srpc_peer),
+				rpc->srpc_status);
+		} else {
+			CDEBUG (D_NET, "Transfered %d pages bulk data %s %s\n",
+				bk->bk_niov, bk->bk_sink ? "from" : "to",
+				libcfs_id2str(rpc->srpc_peer));
+		}
+
+		sfw_free_pages(rpc);
+	}
+}
+
+/*
+ * Called when the server-side bulk transfer has finished.
+ *
+ * Returns -EIO if the transfer failed.  Otherwise returns 0; for a write
+ * request the received pages are verified first and a mismatch is reported
+ * to the client through brw_status = EBADMSG in the reply.
+ */
+int
+brw_bulk_ready(srpc_server_rpc_t *rpc, int status)
+{
+	srpc_msg_t       *reqmsg;
+	srpc_brw_reqst_t *req;
+	__u64		  expect = BRW_MAGIC;
+
+	LASSERT (rpc->srpc_bulk != NULL);
+	LASSERT (rpc->srpc_reqstbuf != NULL);
+
+	reqmsg = &rpc->srpc_reqstbuf->buf_msg;
+	req    = &reqmsg->msg_body.brw_reqst;
+
+	if (status != 0) {
+		CERROR ("BRW bulk %s failed for RPC from %s: %d\n",
+			req->brw_rw == LST_BRW_READ ? "READ" : "WRITE",
+			libcfs_id2str(rpc->srpc_peer), status);
+		return -EIO;
+	}
+
+	if (req->brw_rw != LST_BRW_READ) {
+		/* request magic differs from ours: swap the pattern too */
+		if (reqmsg->msg_magic != SRPC_MSG_MAGIC)
+			__swab64s(&expect);
+
+		if (brw_check_bulk(rpc->srpc_bulk,
+				   req->brw_flags, expect) != 0) {
+			CERROR ("Bulk data from %s is corrupted!\n",
+				libcfs_id2str(rpc->srpc_peer));
+			rpc->srpc_replymsg.msg_body.brw_reply.brw_status =
+				EBADMSG;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Server-side handler for an incoming BRW request.
+ *
+ * Byte-swaps the request when its magic is the swapped form of
+ * SRPC_MSG_MAGIC, validates opcode, check flags, session features and
+ * length, allocates the pages for the bulk transfer and stamps them
+ * (BRW_MAGIC for a read, BRW_POISON for a write).
+ *
+ * Protocol-level rejections are reported to the peer through a positive
+ * errno in reply->brw_status while the function itself returns 0.  A
+ * non-zero return value is the error from sfw_alloc_pages().
+ */
+int
+brw_server_handle(struct srpc_server_rpc *rpc)
+{
+	struct srpc_service	*sv = rpc->srpc_scd->scd_svc;
+	srpc_msg_t       *replymsg = &rpc->srpc_replymsg;
+	srpc_msg_t       *reqstmsg = &rpc->srpc_reqstbuf->buf_msg;
+	srpc_brw_reply_t *reply = &replymsg->msg_body.brw_reply;
+	srpc_brw_reqst_t *reqst = &reqstmsg->msg_body.brw_reqst;
+	int		  npg;
+	int	       rc;
+
+	LASSERT (sv->sv_id == SRPC_SERVICE_BRW);
+
+	if (reqstmsg->msg_magic != SRPC_MSG_MAGIC) {
+		LASSERT (reqstmsg->msg_magic == __swab32(SRPC_MSG_MAGIC));
+
+		__swab32s(&reqst->brw_rw);
+		__swab32s(&reqst->brw_len);
+		__swab32s(&reqst->brw_flags);
+		__swab64s(&reqst->brw_rpyid);
+		__swab64s(&reqst->brw_bulkid);
+	}
+	LASSERT (reqstmsg->msg_type == (__u32)srpc_service2request(sv->sv_id));
+
+	reply->brw_status = 0;
+	rpc->srpc_done = brw_server_rpc_done;
+
+	if ((reqst->brw_rw != LST_BRW_READ && reqst->brw_rw != LST_BRW_WRITE) ||
+	    (reqst->brw_flags != LST_BRW_CHECK_NONE &&
+	     reqst->brw_flags != LST_BRW_CHECK_FULL &&
+	     reqst->brw_flags != LST_BRW_CHECK_SIMPLE)) {
+		reply->brw_status = EINVAL;
+		return 0;
+	}
+
+	/* unknown feature bits: tell the peer which ones we understand */
+	if ((reqstmsg->msg_ses_feats & ~LST_FEATS_MASK) != 0) {
+		replymsg->msg_ses_feats = LST_FEATS_MASK;
+		reply->brw_status = EPROTO;
+		return 0;
+	}
+
+	if ((reqstmsg->msg_ses_feats & LST_FEAT_BULK_LEN) == 0) {
+		/* compat with old version */
+		if ((reqst->brw_len & ~CFS_PAGE_MASK) != 0) {
+			reply->brw_status = EINVAL;
+			return 0;
+		}
+		npg = reqst->brw_len >> PAGE_CACHE_SHIFT;
+
+	} else {
+		npg = (reqst->brw_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	}
+
+	replymsg->msg_ses_feats = reqstmsg->msg_ses_feats;
+
+	/* npg <= 0 catches a length so large that the round-up above wrapped
+	 * (possible where unsigned long is 32 bits), which would otherwise
+	 * slip through with a zero page count and a huge length */
+	if (reqst->brw_len == 0 || npg <= 0 || npg > LNET_MAX_IOV) {
+		reply->brw_status = EINVAL;
+		return 0;
+	}
+
+	rc = sfw_alloc_pages(rpc, rpc->srpc_scd->scd_cpt, npg,
+			     reqst->brw_len,
+			     reqst->brw_rw == LST_BRW_WRITE);
+	if (rc != 0)
+		return rc;
+
+	if (reqst->brw_rw == LST_BRW_READ)
+		brw_fill_bulk(rpc->srpc_bulk, reqst->brw_flags, BRW_MAGIC);
+	else
+		brw_fill_bulk(rpc->srpc_bulk, reqst->brw_flags, BRW_POISON);
+
+	return 0;
+}
+
+/* Client-side operations of the BRW test; filled in at run time below. */
+sfw_test_client_ops_t brw_test_client;
+
+/* Populate brw_test_client with the BRW client callbacks of this file. */
+void brw_init_test_client(void)
+{
+	brw_test_client.tso_init       = brw_client_init;
+	brw_test_client.tso_fini       = brw_client_fini;
+	brw_test_client.tso_prep_rpc   = brw_client_prep_rpc;
+	brw_test_client.tso_done_rpc   = brw_client_done_rpc;
+}
+
+/* Server-side service descriptor of the BRW test; filled in below. */
+srpc_service_t brw_test_service;
+
+/*
+ * Populate brw_test_service with the BRW service id, name, callbacks and
+ * the work-item count taken from the brw_srv_workitems module parameter.
+ */
+void brw_init_test_service(void)
+{
+	srpc_service_t *svc = &brw_test_service;
+
+	svc->sv_id         = SRPC_SERVICE_BRW;
+	svc->sv_name       = "brw_test";
+	svc->sv_handler    = brw_server_handle;
+	svc->sv_bulk_ready = brw_bulk_ready;
+	svc->sv_wi_total   = brw_srv_workitems;
+}

diff --git a/drivers/staging/lustre/lnet/selftest/conctl.c b/drivers/staging/lustre/lnet/selftest/conctl.c
new file mode 100644
index 0000000..bce3d3b
--- /dev/null
+++ b/drivers/staging/lustre/lnet/selftest/conctl.c

@@ -0,0 +1,931 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/conctl.c
+ *
+ * IOC handle in kernel
+ *
+ * Author: Liang Zhen <liangzhen@clusterfs.com>
+ */
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/lib-lnet.h>
+#include <linux/lnet/lnetst.h>
+#include "console.h"
+
+/*
+ * Validate the arguments of a "new session" request, copy the session
+ * name in from user space and hand everything to lstcon_session_new().
+ *
+ * Returns -EINVAL for a missing output address, a zero key or a bad name,
+ * -ENOMEM / -EFAULT if the name cannot be allocated / copied, and otherwise
+ * the result of lstcon_session_new().
+ */
+int
+lst_session_new_ioctl(lstio_session_new_args_t *args)
+{
+	char *name;
+	int   rc;
+
+	/* address for output sid and a non-zero key are mandatory */
+	if (args->lstio_ses_idp == NULL || args->lstio_ses_key == 0)
+		return -EINVAL;
+
+	/* session name must be present and of a sane length */
+	if (args->lstio_ses_namep == NULL ||
+	    args->lstio_ses_nmlen <= 0 ||
+	    args->lstio_ses_nmlen > LST_NAME_SIZE)
+		return -EINVAL;
+
+	LIBCFS_ALLOC(name, args->lstio_ses_nmlen + 1);
+	if (name == NULL)
+		return -ENOMEM;
+
+	if (copy_from_user(name, args->lstio_ses_namep,
+			   args->lstio_ses_nmlen)) {
+		rc = -EFAULT;
+		goto out;
+	}
+
+	name[args->lstio_ses_nmlen] = 0;
+
+	rc = lstcon_session_new(name,
+				args->lstio_ses_key,
+				args->lstio_ses_feats,
+				args->lstio_ses_force,
+				args->lstio_ses_timeout,
+				args->lstio_ses_idp);
+out:
+	LIBCFS_FREE(name, args->lstio_ses_nmlen + 1);
+	return rc;
+}
+
+/*
+ * End the console session.  Returns -EACCES unless the caller's key
+ * matches the key of the current session.
+ */
+int
+lst_session_end_ioctl(lstio_session_end_args_t *args)
+{
+	if (args->lstio_ses_key == console_session.ses_key)
+		return lstcon_session_end();
+
+	return -EACCES;
+}
+
+/*
+ * Query the current session.  No key check is performed here.  Returns
+ * -EINVAL if any output address is missing or the name length is out of
+ * range, otherwise the result of lstcon_session_info().
+ */
+int
+lst_session_info_ioctl(lstio_session_info_args_t *args)
+{
+	/* every output address has to be supplied */
+	if (args->lstio_ses_idp    == NULL ||	/* output sid */
+	    args->lstio_ses_keyp   == NULL ||	/* output key */
+	    args->lstio_ses_featp  == NULL ||	/* output features */
+	    args->lstio_ses_ndinfo == NULL ||	/* output ndinfo */
+	    args->lstio_ses_namep  == NULL)	/* output name */
+		return -EINVAL;
+
+	if (args->lstio_ses_nmlen <= 0 ||
+	    args->lstio_ses_nmlen > LST_NAME_SIZE)
+		return -EINVAL;
+
+	return lstcon_session_info(args->lstio_ses_idp,
+				   args->lstio_ses_keyp,
+				   args->lstio_ses_featp,
+				   args->lstio_ses_ndinfo,
+				   args->lstio_ses_namep,
+				   args->lstio_ses_nmlen);
+}
+
+/*
+ * Dispatch a debug request to the session, a batch (client or server
+ * side), a group, or an explicit list of nodes, depending on
+ * args->lstio_dbg_type.
+ *
+ * Returns -EACCES on a key mismatch, -EINVAL for a missing result address,
+ * a bad name length, a missing name/ids for the selected type or an
+ * unknown type, -ENOMEM / -EFAULT if the optional name cannot be
+ * allocated / copied, and otherwise the result of the lstcon_*_debug()
+ * call that was selected.
+ */
+int
+lst_debug_ioctl(lstio_debug_args_t *args)
+{
+	char   *name   = NULL;
+	int     client = 1;
+	int     rc;
+
+	if (args->lstio_dbg_key != console_session.ses_key)
+		return -EACCES;
+
+	if (args->lstio_dbg_resultp == NULL)
+		return -EINVAL;
+
+	if (args->lstio_dbg_namep != NULL && /* name of batch/group */
+	    (args->lstio_dbg_nmlen <= 0 ||
+	     args->lstio_dbg_nmlen > LST_NAME_SIZE))
+		return -EINVAL;
+
+	/* the name is optional at this point; the types that need it check
+	 * for it in the switch below */
+	if (args->lstio_dbg_namep != NULL) {
+		LIBCFS_ALLOC(name, args->lstio_dbg_nmlen + 1);
+		if (name == NULL)
+			return -ENOMEM;
+
+		if (copy_from_user(name, args->lstio_dbg_namep,
+				       args->lstio_dbg_nmlen)) {
+			LIBCFS_FREE(name, args->lstio_dbg_nmlen + 1);
+
+			return -EFAULT;
+		}
+
+		name[args->lstio_dbg_nmlen] = 0;
+	}
+
+	/* default result for unknown types and for missing arguments */
+	rc = -EINVAL;
+
+	switch (args->lstio_dbg_type) {
+	case LST_OPC_SESSION:
+		rc = lstcon_session_debug(args->lstio_dbg_timeout,
+					  args->lstio_dbg_resultp);
+		break;
+
+	case LST_OPC_BATCHSRV:
+		client = 0;
+		/* fall through: shares the batch path with client == 0 */
+	case LST_OPC_BATCHCLI:
+		if (name == NULL)
+			goto out;
+
+		rc = lstcon_batch_debug(args->lstio_dbg_timeout,
+					name, client, args->lstio_dbg_resultp);
+		break;
+
+	case LST_OPC_GROUP:
+		if (name == NULL)
+			goto out;
+
+		rc = lstcon_group_debug(args->lstio_dbg_timeout,
+					name, args->lstio_dbg_resultp);
+		break;
+
+	case LST_OPC_NODES:
+		if (args->lstio_dbg_count <= 0 ||
+		    args->lstio_dbg_idsp == NULL)
+			goto out;
+
+		rc = lstcon_nodes_debug(args->lstio_dbg_timeout,
+					args->lstio_dbg_count,
+					args->lstio_dbg_idsp,
+					args->lstio_dbg_resultp);
+		break;
+
+	default:
+		break;
+	}
+
+out:
+	if (name != NULL)
+		LIBCFS_FREE(name, args->lstio_dbg_nmlen + 1);
+
+	return rc;
+}
+
+/*
+ * Copy a group name in from user space and pass it to lstcon_group_add().
+ *
+ * Returns -EACCES on a key mismatch, -EINVAL for a missing name or a name
+ * length outside (0, LST_NAME_SIZE], -ENOMEM / -EFAULT if the name cannot
+ * be allocated / copied, and otherwise the result of lstcon_group_add().
+ */
+int
+lst_group_add_ioctl(lstio_group_add_args_t *args)
+{
+	char	*name;
+	int	 rc;
+
+	if (args->lstio_grp_key != console_session.ses_key)
+		return -EACCES;
+
+	if (args->lstio_grp_namep == NULL ||
+	    args->lstio_grp_nmlen <= 0 ||
+	    args->lstio_grp_nmlen > LST_NAME_SIZE)
+		return -EINVAL;
+
+	/* one extra byte for the terminating NUL written below */
+	LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1);
+	if (name == NULL)
+		return -ENOMEM;
+
+	if (copy_from_user(name,
+			   args->lstio_grp_namep,
+			   args->lstio_grp_nmlen)) {
+		/* free with the same size that was allocated above */
+		LIBCFS_FREE(name, args->lstio_grp_nmlen + 1);
+		return -EFAULT;
+	}
+
+	name[args->lstio_grp_nmlen] = 0;
+
+	rc = lstcon_group_add(name);
+
+	LIBCFS_FREE(name, args->lstio_grp_nmlen + 1);
+
+	return rc;
+}
+
+/*
+ * Copy a group name in from user space and pass it to lstcon_group_del().
+ *
+ * Returns -EACCES on a key mismatch, -EINVAL for a missing name or a bad
+ * name length, -ENOMEM / -EFAULT if the name cannot be allocated / copied,
+ * and otherwise the result of lstcon_group_del().
+ */
+int
+lst_group_del_ioctl(lstio_group_del_args_t *args)
+{
+	char *name;
+	int   rc;
+
+	if (args->lstio_grp_key != console_session.ses_key)
+		return -EACCES;
+
+	if (args->lstio_grp_namep == NULL)
+		return -EINVAL;
+
+	if (args->lstio_grp_nmlen <= 0 ||
+	    args->lstio_grp_nmlen > LST_NAME_SIZE)
+		return -EINVAL;
+
+	LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1);
+	if (name == NULL)
+		return -ENOMEM;
+
+	if (copy_from_user(name, args->lstio_grp_namep,
+			   args->lstio_grp_nmlen) == 0) {
+		name[args->lstio_grp_nmlen] = 0;
+		rc = lstcon_group_del(name);
+	} else {
+		rc = -EFAULT;
+	}
+
+	LIBCFS_FREE(name, args->lstio_grp_nmlen + 1);
+
+	return rc;
+}
+
+/*
+ * Apply one of the group update operations (clean, refresh, remove nodes)
+ * selected by args->lstio_grp_opc to the named group.
+ *
+ * Returns -EACCES on a key mismatch, -EINVAL for a missing result address
+ * or name, a bad name length, an unknown opcode or missing node ids for
+ * LST_GROUP_RMND, -ENOMEM / -EFAULT if the name cannot be allocated /
+ * copied, and otherwise the result of the selected lstcon_* call.
+ */
+int
+lst_group_update_ioctl(lstio_group_update_args_t *args)
+{
+	char *name;
+	int   rc;
+
+	if (args->lstio_grp_key != console_session.ses_key)
+		return -EACCES;
+
+	if (args->lstio_grp_resultp == NULL ||
+	    args->lstio_grp_namep == NULL)
+		return -EINVAL;
+
+	if (args->lstio_grp_nmlen <= 0 ||
+	    args->lstio_grp_nmlen > LST_NAME_SIZE)
+		return -EINVAL;
+
+	LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1);
+	if (name == NULL)
+		return -ENOMEM;
+
+	if (copy_from_user(name, args->lstio_grp_namep,
+			   args->lstio_grp_nmlen)) {
+		rc = -EFAULT;
+		goto out;
+	}
+
+	name[args->lstio_grp_nmlen] = 0;
+
+	if (args->lstio_grp_opc == LST_GROUP_CLEAN) {
+		rc = lstcon_group_clean(name, args->lstio_grp_args);
+	} else if (args->lstio_grp_opc == LST_GROUP_REFRESH) {
+		rc = lstcon_group_refresh(name, args->lstio_grp_resultp);
+	} else if (args->lstio_grp_opc == LST_GROUP_RMND) {
+		/* removing nodes needs a non-empty id array */
+		if (args->lstio_grp_count > 0 &&
+		    args->lstio_grp_idsp != NULL)
+			rc = lstcon_nodes_remove(name, args->lstio_grp_count,
+						 args->lstio_grp_idsp,
+						 args->lstio_grp_resultp);
+		else
+			rc = -EINVAL;
+	} else {
+		rc = -EINVAL;
+	}
+
+out:
+	LIBCFS_FREE(name, args->lstio_grp_nmlen + 1);
+
+	return rc;
+}
+
+/*
+ * Add an array of nodes to the named group via lstcon_nodes_add() and, on
+ * success, copy the resulting feature mask back to user space.
+ *
+ * Returns -EACCES on a key mismatch, -EINVAL for missing/invalid
+ * arguments, -ENOMEM / -EFAULT if the name cannot be allocated / copied,
+ * and otherwise the result of lstcon_nodes_add().
+ */
+int
+lst_nodes_add_ioctl(lstio_group_nodes_args_t *args)
+{
+	unsigned  feats;
+	char	 *name;
+	int	  rc;
+
+	if (args->lstio_grp_key != console_session.ses_key)
+		return -EACCES;
+
+	/* array of ids and all output addresses are mandatory */
+	if (args->lstio_grp_idsp == NULL ||
+	    args->lstio_grp_count <= 0 ||
+	    args->lstio_grp_resultp == NULL ||
+	    args->lstio_grp_featp == NULL)
+		return -EINVAL;
+
+	if (args->lstio_grp_namep == NULL ||
+	    args->lstio_grp_nmlen <= 0 ||
+	    args->lstio_grp_nmlen > LST_NAME_SIZE)
+		return -EINVAL;
+
+	LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1);
+	if (name == NULL)
+		return -ENOMEM;
+
+	if (copy_from_user(name, args->lstio_grp_namep,
+			   args->lstio_grp_nmlen)) {
+		LIBCFS_FREE(name, args->lstio_grp_nmlen + 1);
+		return -EFAULT;
+	}
+
+	name[args->lstio_grp_nmlen] = 0;
+
+	rc = lstcon_nodes_add(name, args->lstio_grp_count,
+			      args->lstio_grp_idsp, &feats,
+			      args->lstio_grp_resultp);
+
+	LIBCFS_FREE(name, args->lstio_grp_nmlen + 1);
+
+	if (rc != 0)
+		return rc;
+
+	/* NOTE(review): a failed copy-out is reported as -EINVAL here while
+	 * the other handlers in this file use -EFAULT -- confirm intent */
+	if (copy_to_user(args->lstio_grp_featp, &feats, sizeof(feats)))
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Look up the name of the group at a given index via lstcon_group_list().
+ * Returns -EACCES on a key mismatch and -EINVAL for a negative index, a
+ * missing name buffer or a bad buffer length.
+ */
+int
+lst_group_list_ioctl(lstio_group_list_args_t *args)
+{
+	if (args->lstio_grp_key != console_session.ses_key)
+		return -EACCES;
+
+	if (args->lstio_grp_idx < 0 || args->lstio_grp_namep == NULL)
+		return -EINVAL;
+
+	if (args->lstio_grp_nmlen <= 0 ||
+	    args->lstio_grp_nmlen > LST_NAME_SIZE)
+		return -EINVAL;
+
+	return lstcon_group_list(args->lstio_grp_idx,
+				 args->lstio_grp_nmlen,
+				 args->lstio_grp_namep);
+}
+
+/*
+ * Query a group: fills the group entry and/or a window of node entries
+ * through lstcon_group_info().  When node entries are requested, the start
+ * index and entry count are read from user space before the call and the
+ * updated values are written back afterwards.
+ *
+ * Returns -EACCES on a key mismatch, -EINVAL for missing/invalid
+ * arguments, -ENOMEM if the name cannot be allocated, -EFAULT if any copy
+ * from or to user space fails, and otherwise the result of
+ * lstcon_group_info().
+ */
+int
+lst_group_info_ioctl(lstio_group_info_args_t *args)
+{
+	char	*name;
+	/* only loaded from user space when node entries are requested, but
+	 * always passed by address below, so give them defined values */
+	int	 ndent = 0;
+	int	 index = 0;
+	int	 rc;
+
+	if (args->lstio_grp_key != console_session.ses_key)
+		return -EACCES;
+
+	if (args->lstio_grp_namep == NULL ||
+	    args->lstio_grp_nmlen <= 0 ||
+	    args->lstio_grp_nmlen > LST_NAME_SIZE)
+		return -EINVAL;
+
+	if (args->lstio_grp_entp  == NULL && /* output: group entry */
+	    args->lstio_grp_dentsp == NULL)  /* output: node entry */
+		return -EINVAL;
+
+	if (args->lstio_grp_dentsp != NULL) { /* have node entry */
+		if (args->lstio_grp_idxp == NULL || /* node index */
+		    args->lstio_grp_ndentp == NULL) /* # of node entry */
+			return -EINVAL;
+
+		if (copy_from_user(&ndent, args->lstio_grp_ndentp,
+				   sizeof(ndent)) ||
+		    copy_from_user(&index, args->lstio_grp_idxp,
+				   sizeof(index)))
+			return -EFAULT;
+
+		if (ndent <= 0 || index < 0)
+			return -EINVAL;
+	}
+
+	LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1);
+	if (name == NULL)
+		return -ENOMEM;
+
+	if (copy_from_user(name,
+			   args->lstio_grp_namep,
+			   args->lstio_grp_nmlen)) {
+		LIBCFS_FREE(name, args->lstio_grp_nmlen + 1);
+		return -EFAULT;
+	}
+
+	name[args->lstio_grp_nmlen] = 0;
+
+	rc = lstcon_group_info(name, args->lstio_grp_entp,
+			       &index, &ndent, args->lstio_grp_dentsp);
+
+	LIBCFS_FREE(name, args->lstio_grp_nmlen + 1);
+
+	if (rc != 0)
+		return rc;
+
+	/* report a failed write-back instead of silently returning success */
+	if (args->lstio_grp_dentsp != NULL &&
+	    (copy_to_user(args->lstio_grp_idxp, &index, sizeof(index)) ||
+	     copy_to_user(args->lstio_grp_ndentp, &ndent, sizeof(ndent))))
+		return -EFAULT;
+
+	return 0;
+}
+
+/*
+ * LSTIO_BATCH_ADD handler: copy the batch name in from user space and
+ * create the batch with lstcon_batch_add().
+ *
+ * Returns -EACCES on a session key mismatch, -EINVAL for a NULL name or a
+ * name length outside 1..LST_NAME_SIZE, -ENOMEM / -EFAULT if the name
+ * cannot be allocated / copied, otherwise the lstcon_batch_add() result.
+ */
+int
+lst_batch_add_ioctl(lstio_batch_add_args_t *args)
+{
+	char	*batch;
+	int	 rc;
+
+	if (console_session.ses_key != args->lstio_bat_key)
+		return -EACCES;
+
+	if (!args->lstio_bat_namep)
+		return -EINVAL;
+
+	if (args->lstio_bat_nmlen <= 0 ||
+	    args->lstio_bat_nmlen > LST_NAME_SIZE)
+		return -EINVAL;
+
+	/* room for the name plus a terminating NUL */
+	LIBCFS_ALLOC(batch, args->lstio_bat_nmlen + 1);
+	if (!batch)
+		return -ENOMEM;
+
+	if (copy_from_user(batch, args->lstio_bat_namep,
+			   args->lstio_bat_nmlen)) {
+		rc = -EFAULT;
+		goto out_free;
+	}
+
+	batch[args->lstio_bat_nmlen] = 0;
+
+	rc = lstcon_batch_add(batch);
+out_free:
+	LIBCFS_FREE(batch, args->lstio_bat_nmlen + 1);
+
+	return rc;
+}
+
+/*
+ * LSTIO_BATCH_START handler: copy the batch name in from user space and
+ * start that batch with lstcon_batch_run(), passing the caller's timeout
+ * and result list through unchanged.
+ *
+ * Returns -EACCES on a session key mismatch, -EINVAL for a NULL name or a
+ * name length outside 1..LST_NAME_SIZE, -ENOMEM / -EFAULT if the name
+ * cannot be allocated / copied, otherwise the lstcon_batch_run() result.
+ */
+int
+lst_batch_run_ioctl(lstio_batch_run_args_t *args)
+{
+	char	*batch;
+	int	 rc;
+
+	if (console_session.ses_key != args->lstio_bat_key)
+		return -EACCES;
+
+	if (!args->lstio_bat_namep)
+		return -EINVAL;
+
+	if (args->lstio_bat_nmlen <= 0 ||
+	    args->lstio_bat_nmlen > LST_NAME_SIZE)
+		return -EINVAL;
+
+	/* room for the name plus a terminating NUL */
+	LIBCFS_ALLOC(batch, args->lstio_bat_nmlen + 1);
+	if (!batch)
+		return -ENOMEM;
+
+	if (copy_from_user(batch, args->lstio_bat_namep,
+			   args->lstio_bat_nmlen)) {
+		rc = -EFAULT;
+		goto out_free;
+	}
+
+	batch[args->lstio_bat_nmlen] = 0;
+
+	rc = lstcon_batch_run(batch, args->lstio_bat_timeout,
+			      args->lstio_bat_resultp);
+out_free:
+	LIBCFS_FREE(batch, args->lstio_bat_nmlen + 1);
+
+	return rc;
+}
+
+/*
+ * LSTIO_BATCH_STOP handler: copy the batch name in from user space and
+ * stop that batch with lstcon_batch_stop().
+ *
+ * Returns -EACCES on a session key mismatch, -EINVAL if the result list or
+ * the name is NULL or the name length is outside 1..LST_NAME_SIZE,
+ * -ENOMEM / -EFAULT if the name cannot be allocated / copied, otherwise
+ * the lstcon_batch_stop() result.
+ */
+int
+lst_batch_stop_ioctl(lstio_batch_stop_args_t *args)
+{
+	char	*batch;
+	int	 rc;
+
+	if (console_session.ses_key != args->lstio_bat_key)
+		return -EACCES;
+
+	if (!args->lstio_bat_resultp || !args->lstio_bat_namep)
+		return -EINVAL;
+
+	if (args->lstio_bat_nmlen <= 0 ||
+	    args->lstio_bat_nmlen > LST_NAME_SIZE)
+		return -EINVAL;
+
+	/* room for the name plus a terminating NUL */
+	LIBCFS_ALLOC(batch, args->lstio_bat_nmlen + 1);
+	if (!batch)
+		return -ENOMEM;
+
+	if (copy_from_user(batch, args->lstio_bat_namep,
+			   args->lstio_bat_nmlen)) {
+		rc = -EFAULT;
+		goto out_free;
+	}
+
+	batch[args->lstio_bat_nmlen] = 0;
+
+	rc = lstcon_batch_stop(batch, args->lstio_bat_force,
+			       args->lstio_bat_resultp);
+out_free:
+	LIBCFS_FREE(batch, args->lstio_bat_nmlen + 1);
+
+	return rc;
+}
+
+/*
+ * LSTIO_BATCH_QUERY handler: copy the batch name in from user space and
+ * query the batch (or one of its tests, selected by lstio_bat_testidx)
+ * with lstcon_test_batch_query().
+ *
+ * Returns -EACCES on a session key mismatch, -EINVAL if the result list or
+ * the name is NULL, the name length is outside 1..LST_NAME_SIZE or the
+ * test index is negative, -ENOMEM / -EFAULT if the name cannot be
+ * allocated / copied, otherwise the lstcon_test_batch_query() result.
+ */
+int
+lst_batch_query_ioctl(lstio_batch_query_args_t *args)
+{
+	char	*batch;
+	int	 rc;
+
+	if (console_session.ses_key != args->lstio_bat_key)
+		return -EACCES;
+
+	if (!args->lstio_bat_resultp || !args->lstio_bat_namep)
+		return -EINVAL;
+
+	if (args->lstio_bat_nmlen <= 0 ||
+	    args->lstio_bat_nmlen > LST_NAME_SIZE)
+		return -EINVAL;
+
+	if (args->lstio_bat_testidx < 0)
+		return -EINVAL;
+
+	/* room for the name plus a terminating NUL */
+	LIBCFS_ALLOC(batch, args->lstio_bat_nmlen + 1);
+	if (!batch)
+		return -ENOMEM;
+
+	if (copy_from_user(batch, args->lstio_bat_namep,
+			   args->lstio_bat_nmlen)) {
+		rc = -EFAULT;
+		goto out_free;
+	}
+
+	batch[args->lstio_bat_nmlen] = 0;
+
+	rc = lstcon_test_batch_query(batch,
+				     args->lstio_bat_testidx,
+				     args->lstio_bat_client,
+				     args->lstio_bat_timeout,
+				     args->lstio_bat_resultp);
+out_free:
+	LIBCFS_FREE(batch, args->lstio_bat_nmlen + 1);
+
+	return rc;
+}
+
+/*
+ * LSTIO_BATCH_LIST handler: validate the request and pass the batch index
+ * and the user-space name buffer on to lstcon_batch_list().
+ *
+ * Returns -EACCES on a session key mismatch, -EINVAL for a negative index,
+ * a NULL name buffer or a name length outside 1..LST_NAME_SIZE; otherwise
+ * the value returned by lstcon_batch_list().
+ */
+int
+lst_batch_list_ioctl(lstio_batch_list_args_t *args)
+{
+	int	rc;
+
+	if (console_session.ses_key != args->lstio_bat_key)
+		return -EACCES;
+
+	if (!args->lstio_bat_namep || args->lstio_bat_idx < 0)
+		return -EINVAL;
+
+	if (args->lstio_bat_nmlen <= 0 ||
+	    args->lstio_bat_nmlen > LST_NAME_SIZE)
+		return -EINVAL;
+
+	rc = lstcon_batch_list(args->lstio_bat_idx,
+			       args->lstio_bat_nmlen,
+			       args->lstio_bat_namep);
+	return rc;
+}
+
+/*
+ * LSTIO_BATCH_INFO handler: look a batch up by name and report its batch
+ * entry and/or a window of node entries through lstcon_batch_info().
+ *
+ * At least one of lstio_bat_entp and lstio_bat_dentsp must be supplied.
+ * When node entries are requested, the start index and entry count are
+ * read from user space first and the values left in them by
+ * lstcon_batch_info() are written back on success.
+ *
+ * Returns 0 on success, -EACCES on a session key mismatch, -EINVAL on bad
+ * arguments, -ENOMEM if the name buffer cannot be allocated, -EFAULT if a
+ * user copy fails, or the error returned by lstcon_batch_info().
+ */
+int
+lst_batch_info_ioctl(lstio_batch_info_args_t *args)
+{
+	char	*batch;
+	int	 index;
+	int	 ndent;
+	int	 rc;
+
+	if (console_session.ses_key != args->lstio_bat_key)
+		return -EACCES;
+
+	/* batch name */
+	if (!args->lstio_bat_namep ||
+	    args->lstio_bat_nmlen <= 0 ||
+	    args->lstio_bat_nmlen > LST_NAME_SIZE)
+		return -EINVAL;
+
+	/* need somewhere to put the batch entry or the node entries */
+	if (!args->lstio_bat_entp && !args->lstio_bat_dentsp)
+		return -EINVAL;
+
+	if (args->lstio_bat_dentsp) {
+		/* node entries wanted: index and count are in/out values */
+		if (!args->lstio_bat_idxp || !args->lstio_bat_ndentp)
+			return -EINVAL;
+
+		if (copy_from_user(&index, args->lstio_bat_idxp,
+				   sizeof(index)))
+			return -EFAULT;
+
+		if (copy_from_user(&ndent, args->lstio_bat_ndentp,
+				   sizeof(ndent)))
+			return -EFAULT;
+
+		if (index < 0 || ndent <= 0)
+			return -EINVAL;
+	}
+
+	/* room for the name plus a terminating NUL */
+	LIBCFS_ALLOC(batch, args->lstio_bat_nmlen + 1);
+	if (!batch)
+		return -ENOMEM;
+
+	if (copy_from_user(batch, args->lstio_bat_namep,
+			   args->lstio_bat_nmlen)) {
+		rc = -EFAULT;
+	} else {
+		batch[args->lstio_bat_nmlen] = 0;
+
+		rc = lstcon_batch_info(batch, args->lstio_bat_entp,
+				       args->lstio_bat_server,
+				       args->lstio_bat_testidx,
+				       &index, &ndent,
+				       args->lstio_bat_dentsp);
+	}
+
+	LIBCFS_FREE(batch, args->lstio_bat_nmlen + 1);
+
+	if (rc != 0)
+		return rc;
+
+	if (!args->lstio_bat_dentsp)
+		return rc;
+
+	/* report the updated index and count back to the caller */
+	if (copy_to_user(args->lstio_bat_idxp, &index, sizeof(index)) ||
+	    copy_to_user(args->lstio_bat_ndentp, &ndent, sizeof(ndent)))
+		rc = -EFAULT;
+
+	return rc;
+}
+
+/*
+ * LSTIO_STAT_QUERY handler: collect statistics either for an explicit list
+ * of node ids (lstio_sta_idsp / lstio_sta_count) or, when no id list is
+ * given, for the group named by lstio_sta_namep.
+ *
+ * The group name is only validated and copied in when it is actually used.
+ * Previously it was copied unconditionally, so a node-id query with a NULL
+ * name pointer (which the argument check allows) failed with -EFAULT.
+ *
+ * Returns -EACCES on a session key mismatch, -EINVAL on bad arguments,
+ * -ENOMEM / -EFAULT if the name cannot be allocated / copied, otherwise
+ * the result of lstcon_nodes_stat() or lstcon_group_stat().
+ */
+int
+lst_stat_query_ioctl(lstio_stat_args_t *args)
+{
+	int	     rc;
+	char	   *name;
+
+	/* TODO: not finished */
+	if (args->lstio_sta_key != console_session.ses_key)
+		return -EACCES;
+
+	if (args->lstio_sta_resultp == NULL ||
+	    (args->lstio_sta_namep  == NULL &&
+	     args->lstio_sta_idsp   == NULL))
+		return -EINVAL;
+
+	if (args->lstio_sta_idsp != NULL) {
+		/* explicit node list: the group name is not consulted */
+		if (args->lstio_sta_count <= 0)
+			return -EINVAL;
+
+		return lstcon_nodes_stat(args->lstio_sta_count,
+					 args->lstio_sta_idsp,
+					 args->lstio_sta_timeout,
+					 args->lstio_sta_resultp);
+	}
+
+	/* group query: lstio_sta_namep is known to be non-NULL here */
+	if (args->lstio_sta_nmlen <= 0 ||
+	    args->lstio_sta_nmlen > LST_NAME_SIZE)
+		return -EINVAL;
+
+	/* one extra byte so the copied name can be NUL-terminated */
+	LIBCFS_ALLOC(name, args->lstio_sta_nmlen + 1);
+	if (name == NULL)
+		return -ENOMEM;
+
+	if (copy_from_user(name, args->lstio_sta_namep,
+			       args->lstio_sta_nmlen)) {
+		LIBCFS_FREE(name, args->lstio_sta_nmlen + 1);
+		return -EFAULT;
+	}
+
+	/* terminate explicitly, as the other name-taking handlers do */
+	name[args->lstio_sta_nmlen] = 0;
+
+	rc = lstcon_group_stat(name, args->lstio_sta_timeout,
+			       args->lstio_sta_resultp);
+
+	LIBCFS_FREE(name, args->lstio_sta_nmlen + 1);
+
+	return rc;
+}
+
+/*
+ * LSTIO_TEST_ADD handler: copy the batch name, the source and destination
+ * group names and the optional test parameter block in from user space,
+ * then add the test with lstcon_test_add().
+ *
+ * The optional parameter block is only copied when the caller supplied
+ * one.  Previously copy_from_user() was also called with a NULL kernel
+ * destination and an unvalidated length when no parameter was given.
+ *
+ * If lstcon_test_add() reports a non-zero value through its 'ret' output,
+ * that value is copied to lstio_tes_retp and the ioctl result becomes 0
+ * (or -EFAULT if that copy fails).
+ *
+ * NOTE(review): unlike the sibling handlers, no session key is checked
+ * here - confirm whether that is intentional.
+ *
+ * Returns -EINVAL on bad arguments, -ENOMEM if a buffer cannot be
+ * allocated, -EFAULT if a user copy fails, otherwise as described above.
+ */
+int lst_test_add_ioctl(lstio_test_args_t *args)
+{
+	char	   *name;
+	char	   *srcgrp = NULL;
+	char	   *dstgrp = NULL;
+	void	   *param = NULL;
+	int	     ret = 0;
+	int	     rc = -ENOMEM;
+
+	if (args->lstio_tes_resultp == NULL ||
+	    args->lstio_tes_retp == NULL ||
+	    args->lstio_tes_bat_name == NULL || /* no specified batch */
+	    args->lstio_tes_bat_nmlen <= 0 ||
+	    args->lstio_tes_bat_nmlen > LST_NAME_SIZE ||
+	    args->lstio_tes_sgrp_name == NULL || /* no source group */
+	    args->lstio_tes_sgrp_nmlen <= 0 ||
+	    args->lstio_tes_sgrp_nmlen > LST_NAME_SIZE ||
+	    args->lstio_tes_dgrp_name == NULL || /* no target group */
+	    args->lstio_tes_dgrp_nmlen <= 0 ||
+	    args->lstio_tes_dgrp_nmlen > LST_NAME_SIZE)
+		return -EINVAL;
+
+	if (args->lstio_tes_loop == 0 || /* negative is infinite */
+	    args->lstio_tes_concur <= 0 ||
+	    args->lstio_tes_dist <= 0 ||
+	    args->lstio_tes_span <= 0)
+		return -EINVAL;
+
+	/* have parameter, check if parameter length is valid */
+	if (args->lstio_tes_param != NULL &&
+	    (args->lstio_tes_param_len <= 0 ||
+	     args->lstio_tes_param_len > PAGE_CACHE_SIZE - sizeof(lstcon_test_t)))
+		return -EINVAL;
+
+	/* each name buffer has one extra byte for the terminating NUL */
+	LIBCFS_ALLOC(name, args->lstio_tes_bat_nmlen + 1);
+	if (name == NULL)
+		return rc;
+
+	LIBCFS_ALLOC(srcgrp, args->lstio_tes_sgrp_nmlen + 1);
+	if (srcgrp == NULL)
+		goto out;
+
+	LIBCFS_ALLOC(dstgrp, args->lstio_tes_dgrp_nmlen + 1);
+	if (dstgrp == NULL)
+		goto out;
+
+	if (args->lstio_tes_param != NULL) {
+		LIBCFS_ALLOC(param, args->lstio_tes_param_len);
+		if (param == NULL)
+			goto out;
+	}
+
+	rc = -EFAULT;
+	if (copy_from_user(name,
+			      args->lstio_tes_bat_name,
+			      args->lstio_tes_bat_nmlen) ||
+	    copy_from_user(srcgrp,
+			      args->lstio_tes_sgrp_name,
+			      args->lstio_tes_sgrp_nmlen) ||
+	    copy_from_user(dstgrp,
+			      args->lstio_tes_dgrp_name,
+			      args->lstio_tes_dgrp_nmlen))
+		goto out;
+
+	/* param is only allocated when the caller supplied a parameter */
+	if (param != NULL &&
+	    copy_from_user(param, args->lstio_tes_param,
+			      args->lstio_tes_param_len))
+		goto out;
+
+	/* terminate explicitly, as the other name-taking handlers do */
+	name[args->lstio_tes_bat_nmlen] = 0;
+	srcgrp[args->lstio_tes_sgrp_nmlen] = 0;
+	dstgrp[args->lstio_tes_dgrp_nmlen] = 0;
+
+	rc = lstcon_test_add(name,
+			    args->lstio_tes_type,
+			    args->lstio_tes_loop,
+			    args->lstio_tes_concur,
+			    args->lstio_tes_dist, args->lstio_tes_span,
+			    srcgrp, dstgrp, param, args->lstio_tes_param_len,
+			    &ret, args->lstio_tes_resultp);
+
+	/* a non-zero 'ret' is reported through lstio_tes_retp, not rc */
+	if (ret != 0)
+		rc = (copy_to_user(args->lstio_tes_retp, &ret,
+				       sizeof(ret))) ? -EFAULT : 0;
+out:
+	if (name != NULL)
+		LIBCFS_FREE(name, args->lstio_tes_bat_nmlen + 1);
+
+	if (srcgrp != NULL)
+		LIBCFS_FREE(srcgrp, args->lstio_tes_sgrp_nmlen + 1);
+
+	if (dstgrp != NULL)
+		LIBCFS_FREE(dstgrp, args->lstio_tes_dgrp_nmlen + 1);
+
+	if (param != NULL)
+		LIBCFS_FREE(param, args->lstio_tes_param_len);
+
+	return rc;
+}
+
+/*
+ * Entry point for the IOC_LIBCFS_LNETST ioctl.
+ *
+ * The opcode is taken from data->ioc_u32[0] and the opcode-specific
+ * argument structure is copied in from data->ioc_pbuf1 (at most
+ * PAGE_CACHE_SIZE bytes).  With console_session.ses_mutex held the
+ * session timestamp is refreshed, an expired session is ended, and the
+ * request is dispatched to the matching lst_*_ioctl() handler.  After a
+ * dispatch the per-request transaction statistics are copied out to
+ * data->ioc_pbuf2.
+ *
+ * Returns -EINVAL for a wrong command, an oversized argument or an unknown
+ * opcode, -ENOMEM / -EFAULT if the argument cannot be allocated / copied,
+ * -ESHUTDOWN while the console is shutting down, -ESRCH if there is no
+ * active session (for anything but LSTIO_SESSION_NEW), -EFAULT if the
+ * statistics cannot be copied out, otherwise the handler's result.
+ */
+int
+lstcon_ioctl_entry(unsigned int cmd, struct libcfs_ioctl_data *data)
+{
+	int	 opc = data->ioc_u32[0];
+	char	*karg;
+	int	 rc;
+
+	if (cmd != IOC_LIBCFS_LNETST || data->ioc_plen1 > PAGE_CACHE_SIZE)
+		return -EINVAL;
+
+	LIBCFS_ALLOC(karg, data->ioc_plen1);
+	if (!karg)
+		return -ENOMEM;
+
+	/* copy in parameter */
+	if (copy_from_user(karg, data->ioc_pbuf1, data->ioc_plen1)) {
+		LIBCFS_FREE(karg, data->ioc_plen1);
+		return -EFAULT;
+	}
+
+	mutex_lock(&console_session.ses_mutex);
+
+	console_session.ses_laststamp = cfs_time_current_sec();
+
+	if (console_session.ses_shutdown) {
+		rc = -ESHUTDOWN;
+		goto out;
+	}
+
+	if (console_session.ses_expired)
+		lstcon_session_end();
+
+	if (console_session.ses_state == LST_SESSION_NONE &&
+	    opc != LSTIO_SESSION_NEW) {
+		CDEBUG(D_NET, "LST no active session\n");
+		rc = -ESRCH;
+		goto out;
+	}
+
+	/* start every request with clean transaction statistics */
+	memset(&console_session.ses_trans_stat, 0, sizeof(lstcon_trans_stat_t));
+
+	switch (opc) {
+	case LSTIO_SESSION_NEW:
+		rc = lst_session_new_ioctl((lstio_session_new_args_t *)karg);
+		break;
+	case LSTIO_SESSION_END:
+		rc = lst_session_end_ioctl((lstio_session_end_args_t *)karg);
+		break;
+	case LSTIO_SESSION_INFO:
+		rc = lst_session_info_ioctl((lstio_session_info_args_t *)karg);
+		break;
+	case LSTIO_DEBUG:
+		rc = lst_debug_ioctl((lstio_debug_args_t *)karg);
+		break;
+	case LSTIO_GROUP_ADD:
+		rc = lst_group_add_ioctl((lstio_group_add_args_t *)karg);
+		break;
+	case LSTIO_GROUP_DEL:
+		rc = lst_group_del_ioctl((lstio_group_del_args_t *)karg);
+		break;
+	case LSTIO_GROUP_UPDATE:
+		rc = lst_group_update_ioctl((lstio_group_update_args_t *)karg);
+		break;
+	case LSTIO_NODES_ADD:
+		rc = lst_nodes_add_ioctl((lstio_group_nodes_args_t *)karg);
+		break;
+	case LSTIO_GROUP_LIST:
+		rc = lst_group_list_ioctl((lstio_group_list_args_t *)karg);
+		break;
+	case LSTIO_GROUP_INFO:
+		rc = lst_group_info_ioctl((lstio_group_info_args_t *)karg);
+		break;
+	case LSTIO_BATCH_ADD:
+		rc = lst_batch_add_ioctl((lstio_batch_add_args_t *)karg);
+		break;
+	case LSTIO_BATCH_START:
+		rc = lst_batch_run_ioctl((lstio_batch_run_args_t *)karg);
+		break;
+	case LSTIO_BATCH_STOP:
+		rc = lst_batch_stop_ioctl((lstio_batch_stop_args_t *)karg);
+		break;
+	case LSTIO_BATCH_QUERY:
+		rc = lst_batch_query_ioctl((lstio_batch_query_args_t *)karg);
+		break;
+	case LSTIO_BATCH_LIST:
+		rc = lst_batch_list_ioctl((lstio_batch_list_args_t *)karg);
+		break;
+	case LSTIO_BATCH_INFO:
+		rc = lst_batch_info_ioctl((lstio_batch_info_args_t *)karg);
+		break;
+	case LSTIO_TEST_ADD:
+		rc = lst_test_add_ioctl((lstio_test_args_t *)karg);
+		break;
+	case LSTIO_STAT_QUERY:
+		rc = lst_stat_query_ioctl((lstio_stat_args_t *)karg);
+		break;
+	default:
+		rc = -EINVAL;
+		break;
+	}
+
+	/* a failed statistics copy-out overrides the handler's result */
+	if (copy_to_user(data->ioc_pbuf2, &console_session.ses_trans_stat,
+			 sizeof(lstcon_trans_stat_t)))
+		rc = -EFAULT;
+out:
+	mutex_unlock(&console_session.ses_mutex);
+
+	LIBCFS_FREE(karg, data->ioc_plen1);
+
+	return rc;
+}
+
+EXPORT_SYMBOL(lstcon_ioctl_entry);

diff --git a/drivers/staging/lustre/lnet/selftest/conrpc.c b/drivers/staging/lustre/lnet/selftest/conrpc.c
new file mode 100644
index 0000000..446de0e
--- /dev/null
+++ b/drivers/staging/lustre/lnet/selftest/conrpc.c

@@ -0,0 +1,1397 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/conrpc.c
+ *
+ * Console framework rpcs
+ *
+ * Author: Liang Zhen <liang@whamcloud.com>
+ */
+
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/lib-lnet.h>
+#include "timer.h"
+#include "conrpc.h"
+#include "console.h"
+
+void lstcon_rpc_stat_reply(lstcon_rpc_trans_t *, srpc_msg_t *,
+			   lstcon_node_t *, lstcon_trans_stat_t *);
+
+/*
+ * Completion callback for console RPCs; lstcon_rpc_init() registers it
+ * with sfw_create_rpc() and passes the lstcon_rpc_t as the private data.
+ *
+ * Under rpc->crpc_lock:
+ *  - an RPC whose crp_trans is NULL no longer belongs to a transaction and
+ *    is simply released with lstcon_rpc_put();
+ *  - otherwise the RPC is marked finished, its completion stamp and status
+ *    are recorded unless a stamp was already set, and the waiter on the
+ *    transaction's tas_waitq is woken when tas_remaining drops to zero.
+ */
+static void
+lstcon_rpc_done(srpc_client_rpc_t *rpc)
+{
+	lstcon_rpc_t *crpc = (lstcon_rpc_t *)rpc->crpc_priv;
+
+	LASSERT(crpc != NULL && rpc == crpc->crp_rpc);
+	LASSERT(crpc->crp_posted && !crpc->crp_finished);
+
+	spin_lock(&rpc->crpc_lock);
+
+	if (crpc->crp_trans == NULL) {
+		/* Orphan RPC is not in any transaction,
+		 * I'm just a poor body and nobody loves me */
+		spin_unlock(&rpc->crpc_lock);
+
+		/* release it */
+		lstcon_rpc_put(crpc);
+		return;
+	}
+
+	/* not an orphan RPC */
+	crpc->crp_finished = 1;
+
+	if (crpc->crp_stamp == 0) {
+		/* not aborted */
+		LASSERT (crpc->crp_status == 0);
+
+		/* record when the RPC completed and with which status */
+		crpc->crp_stamp  = cfs_time_current();
+		crpc->crp_status = rpc->crpc_status;
+	}
+
+	/* wakeup (transaction)thread if I'm the last RPC in the transaction */
+	if (atomic_dec_and_test(&crpc->crp_trans->tas_remaining))
+		wake_up(&crpc->crp_trans->tas_waitq);
+
+	spin_unlock(&rpc->crpc_lock);
+}
+
+/*
+ * Initialise a console RPC descriptor for node 'nd'.
+ *
+ * Creates the underlying client RPC with sfw_create_rpc(), using
+ * lstcon_rpc_done() as the completion callback and 'crpc' as its private
+ * data, resets all bookkeeping fields of 'crpc' and bumps the session's
+ * count of live RPCs.  'embedded' is stored in crp_embedded.
+ *
+ * Returns 0 on success or -ENOMEM if the client RPC cannot be created.
+ */
+int
+lstcon_rpc_init(lstcon_node_t *nd, int service, unsigned feats,
+		int bulk_npg, int bulk_len, int embedded, lstcon_rpc_t *crpc)
+{
+	crpc->crp_rpc = sfw_create_rpc(nd->nd_id, service, feats,
+				       bulk_npg, bulk_len,
+				       lstcon_rpc_done, (void *)crpc);
+	if (!crpc->crp_rpc)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&crpc->crp_link);
+
+	/* ownership */
+	crpc->crp_node     = nd;
+	crpc->crp_trans    = NULL;
+	crpc->crp_embedded = embedded;
+
+	/* progress and result state */
+	crpc->crp_posted   = 0;
+	crpc->crp_finished = 0;
+	crpc->crp_unpacked = 0;
+	crpc->crp_stamp    = 0;
+	crpc->crp_status   = 0;
+
+	atomic_inc(&console_session.ses_rpc_counter);
+
+	return 0;
+}
+
+/*
+ * Obtain and initialise a non-embedded console RPC for node 'nd'.
+ *
+ * A descriptor is taken from console_session.ses_rpc_freelist when one is
+ * available, otherwise a new one is allocated.  It is then set up with
+ * lstcon_rpc_init(); if that fails the descriptor is freed.
+ *
+ * On success stores the descriptor in *crpcpp and returns 0; returns
+ * -ENOMEM if no descriptor can be allocated, or the lstcon_rpc_init()
+ * error.
+ */
+int
+lstcon_rpc_prep(lstcon_node_t *nd, int service, unsigned feats,
+		int bulk_npg, int bulk_len, lstcon_rpc_t **crpcpp)
+{
+	lstcon_rpc_t	*fresh = NULL;
+	int		 err;
+
+	/* try to recycle a descriptor first */
+	spin_lock(&console_session.ses_rpc_lock);
+
+	if (!list_empty(&console_session.ses_rpc_freelist)) {
+		fresh = list_entry(console_session.ses_rpc_freelist.next,
+				   lstcon_rpc_t, crp_link);
+		list_del_init(&fresh->crp_link);
+	}
+
+	spin_unlock(&console_session.ses_rpc_lock);
+
+	if (!fresh) {
+		LIBCFS_ALLOC(fresh, sizeof(*fresh));
+		if (!fresh)
+			return -ENOMEM;
+	}
+
+	err = lstcon_rpc_init(nd, service, feats, bulk_npg, bulk_len, 0, fresh);
+	if (err != 0) {
+		LIBCFS_FREE(fresh, sizeof(*fresh));
+		return err;
+	}
+
+	*crpcpp = fresh;
+	return 0;
+}
+
+/*
+ * Release a console RPC that is no longer linked on any list.
+ *
+ * Frees every page still attached to the RPC's bulk descriptor and drops
+ * the reference on the underlying client RPC.  An embedded descriptor is
+ * wiped (keeping only its 'embedded' flag); any other descriptor is put
+ * on console_session.ses_rpc_freelist.  Finally the session's count of
+ * live RPCs is decremented.
+ */
+void
+lstcon_rpc_put(lstcon_rpc_t *crpc)
+{
+	srpc_bulk_t	*bulk = &crpc->crp_rpc->crpc_bulk;
+	int		 idx;
+
+	LASSERT(list_empty(&crpc->crp_link));
+
+	for (idx = 0; idx < bulk->bk_niov; idx++) {
+		if (bulk->bk_iovs[idx].kiov_page != NULL)
+			__free_page(bulk->bk_iovs[idx].kiov_page);
+	}
+
+	srpc_client_rpc_decref(crpc->crp_rpc);
+
+	if (!crpc->crp_embedded) {
+		/* stand-alone descriptor: keep it around for reuse */
+		spin_lock(&console_session.ses_rpc_lock);
+
+		list_add(&crpc->crp_link, &console_session.ses_rpc_freelist);
+
+		spin_unlock(&console_session.ses_rpc_lock);
+	} else {
+		/* embedded RPC, don't recycle it */
+		memset(crpc, 0, sizeof(*crpc));
+		crpc->crp_embedded = 1;
+	}
+
+	/* RPC is not alive now */
+	atomic_dec(&console_session.ses_rpc_counter);
+}
+
+/*
+ * Post one RPC of a transaction: account for it in the transaction's
+ * tas_remaining counter, mark it posted and hand it to sfw_post_rpc().
+ * The RPC must already belong to a transaction.
+ */
+void
+lstcon_rpc_post(lstcon_rpc_t *crpc)
+{
+	LASSERT(crpc->crp_trans != NULL);
+
+	atomic_inc(&crpc->crp_trans->tas_remaining);
+	crpc->crp_posted = 1;
+
+	sfw_post_rpc(crpc->crp_rpc);
+}
+
+/*
+ * Map a transaction opcode to a short name for debug messages.
+ * Opcodes that are not recognised yield "Unknown".
+ */
+static char *
+lstcon_rpc_trans_name(int transop)
+{
+	switch (transop) {
+	case LST_TRANS_SESNEW:
+		return "SESNEW";
+	case LST_TRANS_SESEND:
+		return "SESEND";
+	case LST_TRANS_SESQRY:
+		return "SESQRY";
+	case LST_TRANS_SESPING:
+		return "SESPING";
+	case LST_TRANS_TSBCLIADD:
+		return "TSBCLIADD";
+	case LST_TRANS_TSBSRVADD:
+		return "TSBSRVADD";
+	case LST_TRANS_TSBRUN:
+		return "TSBRUN";
+	case LST_TRANS_TSBSTOP:
+		return "TSBSTOP";
+	case LST_TRANS_TSBCLIQRY:
+		return "TSBCLIQRY";
+	case LST_TRANS_TSBSRVQRY:
+		return "TSBSRVQRY";
+	case LST_TRANS_STATQRY:
+		return "STATQRY";
+	default:
+		return "Unknown";
+	}
+}
+
+/*
+ * Create a new RPC transaction with opcode 'transop'.
+ *
+ * 'translist' is the owner's list of transactions, or NULL if the
+ * transaction has no owner.  If a transaction already queued on the owner
+ * list and the new one are both private (LST_TRANS_PRIVATE), the request
+ * is refused.  The new transaction is linked on the owner list through
+ * tas_olink and on console_session.ses_trans_list through tas_link, and
+ * takes a snapshot of the session features under ses_rpc_lock.
+ *
+ * The owner list is walked through tas_olink, the member it is linked
+ * with below.  It used to be walked through tas_link, which made the
+ * private-transaction check read from the wrong structure offset.
+ *
+ * Returns 0 and stores the transaction in *transpp on success, -EPERM if
+ * a conflicting private transaction exists, -ENOMEM on allocation failure.
+ */
+int
+lstcon_rpc_trans_prep(struct list_head *translist,
+		      int transop, lstcon_rpc_trans_t **transpp)
+{
+	lstcon_rpc_trans_t *trans;
+
+	if (translist != NULL) {
+		list_for_each_entry(trans, translist, tas_olink) {
+			/* Can't enqueue two private transaction on
+			 * the same object */
+			if ((trans->tas_opc & transop) == LST_TRANS_PRIVATE)
+				return -EPERM;
+		}
+	}
+
+	/* create a trans group */
+	LIBCFS_ALLOC(trans, sizeof(*trans));
+	if (trans == NULL)
+		return -ENOMEM;
+
+	trans->tas_opc = transop;
+
+	/* an ownerless transaction keeps an empty, self-linked tas_olink */
+	if (translist == NULL)
+		INIT_LIST_HEAD(&trans->tas_olink);
+	else
+		list_add_tail(&trans->tas_olink, translist);
+
+	list_add_tail(&trans->tas_link, &console_session.ses_trans_list);
+
+	INIT_LIST_HEAD(&trans->tas_rpcs_list);
+	atomic_set(&trans->tas_remaining, 0);
+	init_waitqueue_head(&trans->tas_waitq);
+
+	spin_lock(&console_session.ses_rpc_lock);
+	trans->tas_features = console_session.ses_features;
+	spin_unlock(&console_session.ses_rpc_lock);
+
+	*transpp = trans;
+	return 0;
+}
+
+/*
+ * Attach RPC 'crpc' to transaction 'trans': record the owning transaction
+ * and append the RPC to the transaction's RPC list.
+ */
+void
+lstcon_rpc_trans_addreq(lstcon_rpc_trans_t *trans, lstcon_rpc_t *crpc)
+{
+	crpc->crp_trans = trans;
+	list_add_tail(&crpc->crp_link, &trans->tas_rpcs_list);
+}
+
+/*
+ * Abort every RPC of a transaction that has been posted but has no
+ * completion stamp yet.
+ *
+ * 'error' is a negative errno; it is stored as the status of each aborted
+ * RPC.  RPCs that were never posted are stamped and given -EINTR; RPCs
+ * that already carry a stamp are left alone.
+ *
+ * When the abort is caused by a timeout (-ETIMEDOUT), the peer node is
+ * also marked LST_NODE_DOWN unless its state was updated after the abort
+ * stamp.  The check used to compare against positive ETIMEDOUT, which
+ * never matches the negative value lstcon_rpc_trans_postwait() passes in,
+ * so timed-out nodes were never marked down.
+ */
+void
+lstcon_rpc_trans_abort(lstcon_rpc_trans_t *trans, int error)
+{
+	srpc_client_rpc_t *rpc;
+	lstcon_rpc_t      *crpc;
+	lstcon_node_t     *nd;
+
+	list_for_each_entry (crpc, &trans->tas_rpcs_list, crp_link) {
+		rpc = crpc->crp_rpc;
+
+		spin_lock(&rpc->crpc_lock);
+
+		if (!crpc->crp_posted || /* not posted */
+		    crpc->crp_stamp != 0) { /* rpc done or aborted already */
+			if (crpc->crp_stamp == 0) {
+				crpc->crp_stamp = cfs_time_current();
+				crpc->crp_status = -EINTR;
+			}
+			spin_unlock(&rpc->crpc_lock);
+			continue;
+		}
+
+		crpc->crp_stamp  = cfs_time_current();
+		crpc->crp_status = error;
+
+		spin_unlock(&rpc->crpc_lock);
+
+		sfw_abort_rpc(rpc);
+
+		/* only a timeout says anything about the peer's health */
+		if (error != -ETIMEDOUT)
+			continue;
+
+		nd = crpc->crp_node;
+		/* don't overwrite a more recent state update */
+		if (cfs_time_after(nd->nd_stamp, crpc->crp_stamp))
+			continue;
+
+		nd->nd_stamp = crpc->crp_stamp;
+		nd->nd_state = LST_NODE_DOWN;
+	}
+}
+
+/*
+ * Wait condition for lstcon_rpc_trans_postwait().
+ *
+ * Returns 1 when the waiter should stop waiting: either the console is
+ * shutting down and the transaction is linked on an owner list, or no
+ * RPCs of the transaction remain outstanding.  Returns 0 otherwise.
+ */
+static int
+lstcon_rpc_trans_check(lstcon_rpc_trans_t *trans)
+{
+	/* Not an end session RPC */
+	if (console_session.ses_shutdown && !list_empty(&trans->tas_olink))
+		return 1;
+
+	if (atomic_read(&trans->tas_remaining) == 0)
+		return 1;
+
+	return 0;
+}
+
+/*
+ * Post all RPCs of a transaction and wait for them to complete.
+ *
+ * 'timeout' is in seconds and is raised to LST_TRANS_MIN_TIMEOUT if
+ * smaller.  console_session.ses_mutex is released while waiting and taken
+ * again before the result is processed, so the caller is expected to hold
+ * it on entry.
+ *
+ * If the wait fails, or RPCs are still outstanding afterwards, the
+ * transaction is aborted with the resulting error.  The transaction
+ * statistics are updated before returning.
+ *
+ * Returns 0 if the transaction has no RPCs or the wait condition became
+ * true in time, -EINTR if the wait was interrupted, -ETIMEDOUT on
+ * timeout, or -ESHUTDOWN if the console is shutting down.
+ */
+int
+lstcon_rpc_trans_postwait(lstcon_rpc_trans_t *trans, int timeout)
+{
+	lstcon_rpc_t  *crpc;
+	int	    rc;
+
+	if (list_empty(&trans->tas_rpcs_list))
+		return 0;
+
+	if (timeout < LST_TRANS_MIN_TIMEOUT)
+		timeout = LST_TRANS_MIN_TIMEOUT;
+
+	CDEBUG(D_NET, "Transaction %s started\n",
+	       lstcon_rpc_trans_name(trans->tas_opc));
+
+	/* post all requests */
+	list_for_each_entry (crpc, &trans->tas_rpcs_list, crp_link) {
+		LASSERT (!crpc->crp_posted);
+
+		lstcon_rpc_post(crpc);
+	}
+
+	/* don't hold the session mutex while sleeping */
+	mutex_unlock(&console_session.ses_mutex);
+
+	rc = wait_event_interruptible_timeout(trans->tas_waitq,
+					      lstcon_rpc_trans_check(trans),
+					      cfs_time_seconds(timeout));
+	/* positive: condition met; negative: interrupted; zero: timed out */
+	rc = (rc > 0) ? 0 : ((rc < 0) ? -EINTR : -ETIMEDOUT);
+
+	mutex_lock(&console_session.ses_mutex);
+
+	if (console_session.ses_shutdown)
+		rc = -ESHUTDOWN;
+
+	if (rc != 0 || atomic_read(&trans->tas_remaining) != 0) {
+		/* treat short timeout as canceled */
+		if (rc == -ETIMEDOUT && timeout < LST_TRANS_MIN_TIMEOUT * 2)
+			rc = -EINTR;
+
+		lstcon_rpc_trans_abort(trans, rc);
+	}
+
+	CDEBUG(D_NET, "Transaction %s stopped: %d\n",
+	       lstcon_rpc_trans_name(trans->tas_opc), rc);
+
+	lstcon_rpc_trans_stat(trans, lstcon_trans_stat());
+
+	return rc;
+}
+
+/*
+ * Fetch the reply of a completed (stamped) console RPC.
+ *
+ * If the RPC failed, *msgpp is set to NULL and the RPC status is returned.
+ * Otherwise *msgpp points at the reply message, which is unpacked on first
+ * access, and 0 is returned.  Unless the node's state stamp is newer than
+ * the RPC's stamp, the node's stamp and state are refreshed from the
+ * session id carried in the reply.
+ */
+int
+lstcon_rpc_get_reply(lstcon_rpc_t *crpc, srpc_msg_t **msgpp)
+{
+	srpc_client_rpc_t	*rpc = crpc->crp_rpc;
+	lstcon_node_t		*nd = crpc->crp_node;
+	srpc_generic_reply_t	*rep;
+	srpc_msg_t		*reply;
+
+	LASSERT(nd != NULL && rpc != NULL);
+	LASSERT(crpc->crp_stamp != 0);
+
+	if (crpc->crp_status != 0) {
+		*msgpp = NULL;
+		return crpc->crp_status;
+	}
+
+	reply = &rpc->crpc_replymsg;
+	*msgpp = reply;
+
+	/* unpack only once, however often the reply is looked at */
+	if (!crpc->crp_unpacked) {
+		sfw_unpack_message(reply);
+		crpc->crp_unpacked = 1;
+	}
+
+	/* the node state is already more recent than this reply */
+	if (cfs_time_after(nd->nd_stamp, crpc->crp_stamp))
+		return 0;
+
+	nd->nd_stamp = crpc->crp_stamp;
+	rep = &reply->msg_body.reply;
+
+	if (rep->sid.ses_nid == LNET_NID_ANY) {
+		nd->nd_state = LST_NODE_UNKNOWN;
+		return 0;
+	}
+
+	if (lstcon_session_match(rep->sid))
+		nd->nd_state = LST_NODE_ACTIVE;
+	else
+		nd->nd_state = LST_NODE_BUSY;
+
+	return 0;
+}
+
+/*
+ * Summarise the outcome of a transaction into 'stat'.
+ *
+ * 'stat' is cleared, then every RPC of the transaction is counted as a
+ * success or a failure.  The first RPC error seen is recorded (as a
+ * positive value) in trs_rpc_errno, and successful replies are passed to
+ * lstcon_rpc_stat_reply().  For a LST_TRANS_SESNEW transaction without a
+ * framework error, the result of lstcon_session_feats_check() becomes the
+ * framework error.
+ */
+void
+lstcon_rpc_trans_stat(lstcon_rpc_trans_t *trans, lstcon_trans_stat_t *stat)
+{
+	lstcon_rpc_t	*crpc;
+	srpc_msg_t	*reply;
+	int		 err;
+
+	LASSERT(stat != NULL);
+
+	memset(stat, 0, sizeof(*stat));
+
+	list_for_each_entry(crpc, &trans->tas_rpcs_list, crp_link) {
+		lstcon_rpc_stat_total(stat, 1);
+
+		LASSERT(crpc->crp_stamp != 0);
+
+		err = lstcon_rpc_get_reply(crpc, &reply);
+		if (err == 0) {
+			lstcon_rpc_stat_success(stat, 1);
+
+			lstcon_rpc_stat_reply(trans, reply,
+					      crpc->crp_node, stat);
+			continue;
+		}
+
+		lstcon_rpc_stat_failure(stat, 1);
+
+		/* remember only the first RPC error */
+		if (stat->trs_rpc_errno == 0)
+			stat->trs_rpc_errno = -err;
+	}
+
+	if (stat->trs_fwk_errno == 0 && trans->tas_opc == LST_TRANS_SESNEW) {
+		stat->trs_fwk_errno =
+		      lstcon_session_feats_check(trans->tas_features);
+	}
+
+	CDEBUG(D_NET, "transaction %s : success %d, failure %d, total %d, "
+		      "RPC error(%d), Framework error(%d)\n",
+	       lstcon_rpc_trans_name(trans->tas_opc),
+	       lstcon_rpc_stat_success(stat, 0),
+	       lstcon_rpc_stat_failure(stat, 0),
+	       lstcon_rpc_stat_total(stat, 0),
+	       stat->trs_rpc_errno, stat->trs_fwk_errno);
+}
+
+/*
+ * Copy the per-RPC results of a transaction out to user space.
+ *
+ * 'head_up' is the head of a user-space list of lstcon_rpc_ent_t entries
+ * linked through rpe_link.  The transaction's RPCs and the user list are
+ * walked in step; for each pair the peer id, the completion time relative
+ * to the session stamp, the node state and the RPC error are written to
+ * the entry.  For successful RPCs the reply's session id and status are
+ * written as well, and 'readent', if given, is called to fill in the
+ * remaining fields.
+ *
+ * Returns 0 when either list is exhausted, -EFAULT if a user copy fails,
+ * or the first non-zero value returned by 'readent'.
+ */
+int
+lstcon_rpc_trans_interpreter(lstcon_rpc_trans_t *trans,
+			     struct list_head *head_up,
+			     lstcon_rpc_readent_func_t readent)
+{
+	struct list_head	 link;
+	struct list_head	*cursor;
+	lstcon_rpc_ent_t	*ent;
+	srpc_generic_reply_t	*rep;
+	lstcon_rpc_t		*crpc;
+	srpc_msg_t		*msg;
+	lstcon_node_t		*nd;
+	cfs_duration_t		 dur;
+	struct timeval		 tv;
+	int			 error;
+
+	LASSERT(head_up != NULL);
+
+	cursor = head_up;
+
+	list_for_each_entry(crpc, &trans->tas_rpcs_list, crp_link) {
+		/* read the current user list node to find its successor */
+		if (copy_from_user(&link, cursor, sizeof(struct list_head)))
+			return -EFAULT;
+
+		/* back at the head: no more user entries to fill */
+		if (link.next == head_up)
+			return 0;
+
+		cursor = link.next;
+		ent = list_entry(cursor, lstcon_rpc_ent_t, rpe_link);
+
+		LASSERT(crpc->crp_stamp != 0);
+
+		error = lstcon_rpc_get_reply(crpc, &msg);
+		nd = crpc->crp_node;
+
+		dur = (cfs_duration_t)cfs_time_sub(crpc->crp_stamp,
+		      (cfs_time_t)console_session.ses_id.ses_stamp);
+		cfs_duration_usec(dur, &tv);
+
+		if (copy_to_user(&ent->rpe_peer, &nd->nd_id,
+				 sizeof(lnet_process_id_t)))
+			return -EFAULT;
+
+		if (copy_to_user(&ent->rpe_stamp, &tv, sizeof(tv)))
+			return -EFAULT;
+
+		if (copy_to_user(&ent->rpe_state, &nd->nd_state,
+				 sizeof(nd->nd_state)))
+			return -EFAULT;
+
+		if (copy_to_user(&ent->rpe_rpc_errno, &error, sizeof(error)))
+			return -EFAULT;
+
+		/* a failed RPC has no reply to report */
+		if (error != 0)
+			continue;
+
+		/* RPC is done */
+		rep = (srpc_generic_reply_t *)&msg->msg_body.reply;
+
+		if (copy_to_user(&ent->rpe_sid, &rep->sid, sizeof(lst_sid_t)))
+			return -EFAULT;
+
+		if (copy_to_user(&ent->rpe_fwk_errno, &rep->status,
+				 sizeof(rep->status)))
+			return -EFAULT;
+
+		if (readent == NULL)
+			continue;
+
+		error = readent(trans->tas_opc, msg, ent);
+		if (error != 0)
+			return error;
+	}
+
+	return 0;
+}
+
+/*
+ * Tear down a transaction and free it.
+ *
+ * RPCs that were never posted, or that have already finished, are
+ * unlinked and released with lstcon_rpc_put().  RPCs that are posted but
+ * not yet finished are detached from the transaction (crp_trans and
+ * crp_node cleared) and left for lstcon_rpc_done() to release as orphans;
+ * the transaction's tas_remaining count is dropped on their behalf.
+ * The transaction is then removed from the session list and from its
+ * owner list, if it is on one.
+ */
+void
+lstcon_rpc_trans_destroy(lstcon_rpc_trans_t *trans)
+{
+	srpc_client_rpc_t *rpc;
+	lstcon_rpc_t      *crpc;
+	lstcon_rpc_t      *tmp;
+	int		count = 0;
+
+	list_for_each_entry_safe(crpc, tmp, &trans->tas_rpcs_list,
+				 crp_link) {
+		rpc = crpc->crp_rpc;
+
+		spin_lock(&rpc->crpc_lock);
+
+		/* free it if not posted or finished already */
+		if (!crpc->crp_posted || crpc->crp_finished) {
+			spin_unlock(&rpc->crpc_lock);
+
+			list_del_init(&crpc->crp_link);
+			lstcon_rpc_put(crpc);
+
+			continue;
+		}
+
+		/* The completion callback may still be pending for an RPC
+		 * (even after LNetMDUnlink has been called) because of a huge
+		 * timeout on an inaccessible network.  Don't make the user
+		 * wait for such RPCs; just abandon them, they will be
+		 * recycled in the callback. */
+
+		LASSERT (crpc->crp_status != 0);
+
+		crpc->crp_node  = NULL;
+		crpc->crp_trans = NULL;
+		list_del_init(&crpc->crp_link);
+		count ++;
+
+		spin_unlock(&rpc->crpc_lock);
+
+		/* the orphan no longer counts against this transaction */
+		atomic_dec(&trans->tas_remaining);
+	}
+
+	LASSERT (atomic_read(&trans->tas_remaining) == 0);
+
+	list_del(&trans->tas_link);
+	/* tas_olink is self-linked (empty) when there is no owner list */
+	if (!list_empty(&trans->tas_olink))
+		list_del(&trans->tas_olink);
+
+	CDEBUG(D_NET, "Transaction %s destroyed with %d pending RPCs\n",
+	       lstcon_rpc_trans_name(trans->tas_opc), count);
+
+	LIBCFS_FREE(trans, sizeof(*trans));
+
+	return;
+}
+
+/*
+ * Prepare a session RPC for node 'nd'.
+ *
+ * 'transop' selects the request: LST_TRANS_SESNEW builds a "make session"
+ * request carrying the session id, force flag and session name;
+ * LST_TRANS_SESEND builds a "remove session" request carrying the session
+ * id.  Any other opcode is a bug (LBUG).
+ *
+ * The session name copy is bounded by the size of the destination field
+ * and always NUL-terminated.  It used to be bounded by the length of the
+ * source string, which neither limits the write to the destination nor
+ * terminates it.
+ *
+ * Returns 0 on success or the error returned by lstcon_rpc_prep().
+ */
+int
+lstcon_sesrpc_prep(lstcon_node_t *nd, int transop,
+		   unsigned feats, lstcon_rpc_t **crpc)
+{
+	srpc_mksn_reqst_t *msrq;
+	srpc_rmsn_reqst_t *rsrq;
+	int		rc;
+
+	switch (transop) {
+	case LST_TRANS_SESNEW:
+		rc = lstcon_rpc_prep(nd, SRPC_SERVICE_MAKE_SESSION,
+				     feats, 0, 0, crpc);
+		if (rc != 0)
+			return rc;
+
+		msrq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.mksn_reqst;
+		msrq->mksn_sid     = console_session.ses_id;
+		msrq->mksn_force   = console_session.ses_force;
+		/* assumes mksn_name is a char array, so sizeof is its size */
+		strncpy(msrq->mksn_name, console_session.ses_name,
+			sizeof(msrq->mksn_name) - 1);
+		msrq->mksn_name[sizeof(msrq->mksn_name) - 1] = '\0';
+		break;
+
+	case LST_TRANS_SESEND:
+		rc = lstcon_rpc_prep(nd, SRPC_SERVICE_REMOVE_SESSION,
+				     feats, 0, 0, crpc);
+		if (rc != 0)
+			return rc;
+
+		rsrq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.rmsn_reqst;
+		rsrq->rmsn_sid = console_session.ses_id;
+		break;
+
+	default:
+		LBUG();
+	}
+
+	return 0;
+}
+
+/*
+ * Prepare a debug RPC for node 'nd': the request carries the current
+ * session id and cleared flags.
+ *
+ * Returns 0 on success or the error returned by lstcon_rpc_prep().
+ */
+int
+lstcon_dbgrpc_prep(lstcon_node_t *nd, unsigned feats, lstcon_rpc_t **crpc)
+{
+	srpc_debug_reqst_t	*req;
+	int			 err;
+
+	err = lstcon_rpc_prep(nd, SRPC_SERVICE_DEBUG, feats, 0, 0, crpc);
+	if (err != 0)
+		return err;
+
+	req = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.dbg_reqst;
+	req->dbg_flags = 0;
+	req->dbg_sid   = console_session.ses_id;
+
+	return 0;
+}
+
+/*
+ * Prepare a batch RPC for node 'nd'.
+ *
+ * The request carries the session id plus the id and index taken from
+ * 'tsb'.  LST_TRANS_TSBRUN and LST_TRANS_TSBSTOP map to the run and stop
+ * batch opcodes; every other 'transop' becomes a query.  For run and stop
+ * 'tsb' must have index 0; it is then treated as a lstcon_batch_t and its
+ * bat_arg is sent along.
+ *
+ * Returns 0 on success or the error returned by lstcon_rpc_prep().
+ */
+int
+lstcon_batrpc_prep(lstcon_node_t *nd, int transop, unsigned feats,
+		   lstcon_tsb_hdr_t *tsb, lstcon_rpc_t **crpc)
+{
+	srpc_batch_reqst_t	*req;
+	lstcon_batch_t		*batch;
+	int			 err;
+
+	err = lstcon_rpc_prep(nd, SRPC_SERVICE_BATCH, feats, 0, 0, crpc);
+	if (err != 0)
+		return err;
+
+	req = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.bat_reqst;
+
+	req->bar_sid     = console_session.ses_id;
+	req->bar_bid     = tsb->tsb_id;
+	req->bar_testidx = tsb->tsb_index;
+
+	if (transop == LST_TRANS_TSBRUN) {
+		req->bar_opc = SRPC_BATCH_OPC_RUN;
+	} else if (transop == LST_TRANS_TSBSTOP) {
+		req->bar_opc = SRPC_BATCH_OPC_STOP;
+	} else {
+		/* queries carry no batch argument */
+		req->bar_opc = SRPC_BATCH_OPC_QUERY;
+		return 0;
+	}
+
+	LASSERT(tsb->tsb_index == 0);
+
+	batch = (lstcon_batch_t *)tsb;
+	req->bar_arg = batch->bat_arg;
+
+	return 0;
+}
+
+/*
+ * Prepare a statistics query RPC for node 'nd': the request carries the
+ * current session id and a zero type.
+ *
+ * Returns 0 on success or the error returned by lstcon_rpc_prep().
+ */
+int
+lstcon_statrpc_prep(lstcon_node_t *nd, unsigned feats, lstcon_rpc_t **crpc)
+{
+	srpc_stat_reqst_t	*req;
+	int			 err;
+
+	err = lstcon_rpc_prep(nd, SRPC_SERVICE_QUERY_STAT, feats, 0, 0, crpc);
+	if (err != 0)
+		return err;
+
+	req = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.stat_reqst;
+	req->str_type = 0; /* XXX remove it */
+	req->str_sid  = console_session.ses_id;
+
+	return 0;
+}
+
+/*
+ * Return the slot for packed process id number 'idx' in an array of
+ * 'nkiov' pages, each holding SFW_ID_PER_PAGE ids.  The page that 'idx'
+ * falls into must lie within the array.
+ */
+lnet_process_id_packed_t *
+lstcon_next_id(int idx, int nkiov, lnet_kiov_t *kiov)
+{
+	lnet_process_id_packed_t	*ids;
+	int				 page = idx / SFW_ID_PER_PAGE;
+
+	LASSERT(page < nkiov);
+
+	ids = (lnet_process_id_packed_t *)page_address(kiov[page].kiov_page);
+
+	return ids + (idx % SFW_ID_PER_PAGE);
+}
+
+/*
+ * Fill the bulk pages in 'kiov' with the ids of the destination nodes
+ * selected from group 'grp' for the test client with index 'idx'.
+ *
+ * The selection is a window of 'span' consecutive nodes of the group,
+ * starting at position ((idx / dist) * span) modulo the group size and
+ * wrapping around to the start of the group's node list if necessary.
+ * Ids are stored in selection order through lstcon_next_id().
+ *
+ * Returns 0 on success or -EINVAL if 'span' exceeds the group size.
+ */
+int
+lstcon_dstnodes_prep(lstcon_group_t *grp, int idx,
+		     int dist, int span, int nkiov, lnet_kiov_t *kiov)
+{
+	lnet_process_id_packed_t *pid;
+	lstcon_ndlink_t	  *ndl;
+	lstcon_node_t	    *nd;
+	int		       start;
+	int		       end;
+	int		       i = 0;
+
+	LASSERT (dist >= 1);
+	LASSERT (span >= 1);
+	LASSERT (grp->grp_nnode >= 1);
+
+	if (span > grp->grp_nnode)
+		return -EINVAL;
+
+	/* first and last position of the window; end < start means the
+	 * window wraps around the end of the node list */
+	start = ((idx / dist) * span) % grp->grp_nnode;
+	end   = ((idx / dist) * span + span - 1) % grp->grp_nnode;
+
+	/* first pass: skip to 'start', then copy up to 'end', or up to the
+	 * end of the list when the window wraps */
+	list_for_each_entry(ndl, &grp->grp_ndl_list, ndl_link) {
+		nd = ndl->ndl_node;
+		if (i < start) {
+			i ++;
+			continue;
+		}
+
+		if (i > (end >= start ? end: grp->grp_nnode))
+			break;
+
+		pid = lstcon_next_id((i - start), nkiov, kiov);
+		pid->nid = nd->nd_id.nid;
+		pid->pid = nd->nd_id.pid;
+		i++;
+	}
+
+	if (start <= end) /* done */
+		return 0;
+
+	/* second pass: the wrapped part of the window, taken from the head
+	 * of the list; 'i' keeps counting so (i - start) stays the output
+	 * slot */
+	list_for_each_entry(ndl, &grp->grp_ndl_list, ndl_link) {
+		if (i > grp->grp_nnode + end)
+			break;
+
+		nd = ndl->ndl_node;
+		pid = lstcon_next_id((i - start), nkiov, kiov);
+		pid->nid = nd->nd_id.nid;
+		pid->pid = nd->nd_id.pid;
+		i++;
+	}
+
+	return 0;
+}
+
+/*
+ * Fill the ping part of a test request from the user-supplied ping
+ * parameters (size and flags).  Always returns 0.
+ */
+int
+lstcon_pingrpc_prep(lst_test_ping_param_t *param, srpc_test_reqst_t *req)
+{
+	test_ping_req_t *ping = &req->tsr_u.ping;
+
+	ping->png_flags = param->png_flags;
+	ping->png_size  = param->png_size;
+	/* TODO dest */
+
+	return 0;
+}
+
+/*
+ * Fill the v0 bulk test request in @req from @param.  The v0 layout carries
+ * the transfer size as a page count, rounded up from param->blk_size.
+ */
+int
+lstcon_bulkrpc_v0_prep(lst_test_bulk_param_t *param, srpc_test_reqst_t *req)
+{
+	test_bulk_req_t *bulk = &req->tsr_u.bulk_v0;
+
+	bulk->blk_flags = param->blk_flags;
+	bulk->blk_opc   = param->blk_opc;
+	bulk->blk_npg   = (param->blk_size + PAGE_CACHE_SIZE - 1) /
+			  PAGE_CACHE_SIZE;
+
+	return 0;
+}
+
+/*
+ * Fill the v1 bulk test request in @req from @param.  The v1 layout carries
+ * the transfer size in blk_len as given by param->blk_size.
+ */
+int
+lstcon_bulkrpc_v1_prep(lst_test_bulk_param_t *param, srpc_test_reqst_t *req)
+{
+	test_bulk_req_v1_t *bulk = &req->tsr_u.bulk_v1;
+
+	bulk->blk_offset = 0; /* reserved */
+	bulk->blk_len    = param->blk_size;
+	bulk->blk_flags  = param->blk_flags;
+	bulk->blk_opc    = param->blk_opc;
+
+	return 0;
+}
+
+/*
+ * Build the SRPC_SERVICE_TEST request that adds @test to node @nd.
+ *
+ * For LST_TRANS_TSBSRVADD the request carries no destination list and its
+ * loop count is derived from the sizes of the source and destination groups.
+ * Otherwise (asserted to be LST_TRANS_TSBCLIADD) the request carries a bulk
+ * of pages, allocated here, that lstcon_dstnodes_prep() fills with
+ * test->tes_span destination IDs; test->tes_cliidx is advanced by one.
+ *
+ * Returns 0 with the new RPC in *crpc.  If lstcon_rpc_prep() fails its error
+ * is returned; if a page allocation or lstcon_dstnodes_prep() fails the RPC
+ * is dropped with lstcon_rpc_put() and the error is returned.
+ */
+int
+lstcon_testrpc_prep(lstcon_node_t *nd, int transop, unsigned feats,
+		    lstcon_test_t *test, lstcon_rpc_t **crpc)
+{
+	lstcon_group_t    *sgrp = test->tes_src_grp;
+	lstcon_group_t    *dgrp = test->tes_dst_grp;
+	srpc_test_reqst_t *trq;
+	srpc_bulk_t       *bulk;
+	int		i;
+	int		   npg = 0;
+	int		   nob = 0;
+	int		   rc  = 0;
+
+	/* only the client side needs bulk pages for the destination IDs:
+	 * whole pages without LST_FEAT_BULK_LEN, the exact array size with it */
+	if (transop == LST_TRANS_TSBCLIADD) {
+		npg = sfw_id_pages(test->tes_span);
+		nob = (feats & LST_FEAT_BULK_LEN) == 0 ?
+		      npg * PAGE_CACHE_SIZE :
+		      sizeof(lnet_process_id_packed_t) * test->tes_span;
+	}
+
+	rc = lstcon_rpc_prep(nd, SRPC_SERVICE_TEST, feats, npg, nob, crpc);
+	if (rc != 0)
+		return rc;
+
+	trq  = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.tes_reqst;
+
+	if (transop == LST_TRANS_TSBSRVADD) {
+		int ndist = (sgrp->grp_nnode + test->tes_dist - 1) / test->tes_dist;
+		int nspan = (dgrp->grp_nnode + test->tes_span - 1) / test->tes_span;
+		int nmax = (ndist + nspan - 1) / nspan;
+
+		trq->tsr_ndest = 0;
+		trq->tsr_loop  = nmax * test->tes_dist * test->tes_concur;
+
+	} else {
+		bulk = &(*crpc)->crp_rpc->crpc_bulk;
+
+		/* allocate one page per fragment; 'nob' tracks the bytes
+		 * still to be covered by the remaining fragments */
+		for (i = 0; i < npg; i++) {
+			int	len;
+
+			LASSERT(nob > 0);
+
+			len = (feats & LST_FEAT_BULK_LEN) == 0 ?
+			      PAGE_CACHE_SIZE : min_t(int, nob, PAGE_CACHE_SIZE);
+			nob -= len;
+
+			bulk->bk_iovs[i].kiov_offset = 0;
+			bulk->bk_iovs[i].kiov_len    = len;
+			bulk->bk_iovs[i].kiov_page   =
+				alloc_page(GFP_IOFS);
+
+			if (bulk->bk_iovs[i].kiov_page == NULL) {
+				lstcon_rpc_put(*crpc);
+				return -ENOMEM;
+			}
+		}
+
+		bulk->bk_sink = 0;
+
+		LASSERT (transop == LST_TRANS_TSBCLIADD);
+
+		/* pack this client's destination IDs into the bulk pages */
+		rc = lstcon_dstnodes_prep(test->tes_dst_grp,
+					  test->tes_cliidx++,
+					  test->tes_dist,
+					  test->tes_span,
+					  npg, &bulk->bk_iovs[0]);
+		if (rc != 0) {
+			lstcon_rpc_put(*crpc);
+			return rc;
+		}
+
+		trq->tsr_ndest = test->tes_span;
+		trq->tsr_loop  = test->tes_loop;
+	}
+
+	/* fields common to the client and server flavours of the request */
+	trq->tsr_sid	= console_session.ses_id;
+	trq->tsr_bid	= test->tes_hdr.tsb_id;
+	trq->tsr_concur     = test->tes_concur;
+	trq->tsr_is_client  = (transop == LST_TRANS_TSBCLIADD) ? 1 : 0;
+	trq->tsr_stop_onerr = !!test->tes_stop_onerr;
+
+	/* test-type specific payload */
+	switch (test->tes_type) {
+	case LST_TEST_PING:
+		trq->tsr_service = SRPC_SERVICE_PING;
+		rc = lstcon_pingrpc_prep((lst_test_ping_param_t *)
+					 &test->tes_param[0], trq);
+		break;
+
+	case LST_TEST_BULK:
+		trq->tsr_service = SRPC_SERVICE_BRW;
+		if ((feats & LST_FEAT_BULK_LEN) == 0) {
+			rc = lstcon_bulkrpc_v0_prep((lst_test_bulk_param_t *)
+						    &test->tes_param[0], trq);
+		} else {
+			rc = lstcon_bulkrpc_v1_prep((lst_test_bulk_param_t *)
+						    &test->tes_param[0], trq);
+		}
+
+		break;
+	default:
+		LBUG();
+		break;
+	}
+
+	return rc;
+}
+
+/*
+ * Check the reply to a "new session" request from node @nd.
+ *
+ * A successful reply that advertises feature bits outside LST_FEATS_MASK, or
+ * whose feature mask differs from the one already recorded on @trans, is
+ * turned into EPROTO (also stored back into the reply).  The first successful
+ * reply seen on @trans fixes the transaction's feature mask.  On success the
+ * node's session timeout is taken from the reply.
+ *
+ * Returns 0 on success, otherwise the (positive) status of the reply.
+ */
+int
+lstcon_sesnew_stat_reply(lstcon_rpc_trans_t *trans,
+			 lstcon_node_t *nd, srpc_msg_t *reply)
+{
+	srpc_mksn_reply_t *mksn = &reply->msg_body.mksn_reply;
+	int		   rc   = mksn->mksn_status;
+
+	if (rc == 0 && (reply->msg_ses_feats & ~LST_FEATS_MASK) != 0)
+		rc = mksn->mksn_status = EPROTO;
+
+	if (rc == EPROTO) {
+		CNETERR("session protocol error from %s: %u\n",
+			libcfs_nid2str(nd->nd_id.nid),
+			reply->msg_ses_feats);
+	}
+
+	if (rc != 0)
+		return rc;
+
+	/* the first good reply decides the features of this transaction */
+	if (!trans->tas_feats_updated) {
+		trans->tas_feats_updated = 1;
+		trans->tas_features = reply->msg_ses_feats;
+	}
+
+	if (reply->msg_ses_feats != trans->tas_features) {
+		CNETERR("Framework features %x from %s is different with "
+			"features on this transaction: %x\n",
+			 reply->msg_ses_feats, libcfs_nid2str(nd->nd_id.nid),
+			 trans->tas_features);
+		mksn->mksn_status = EPROTO;
+		return EPROTO;
+	}
+
+	/* session timeout on remote node */
+	nd->nd_timeout = mksn->mksn_timeout;
+
+	return 0;
+}
+
+/*
+ * Account the reply @msg received from node @nd in the counters of @stat,
+ * interpreting the reply body according to trans->tas_opc.
+ *
+ * Replies counted as success (or as a plain query result) return early.
+ * For failed replies the failure counter is bumped and the reply status is
+ * stored in stat->trs_fwk_errno, unless an earlier error is already there.
+ */
+void
+lstcon_rpc_stat_reply(lstcon_rpc_trans_t *trans, srpc_msg_t *msg,
+		      lstcon_node_t *nd, lstcon_trans_stat_t *stat)
+{
+	srpc_rmsn_reply_t  *rmsn_rep;
+	srpc_debug_reply_t *dbg_rep;
+	srpc_batch_reply_t *bat_rep;
+	srpc_test_reply_t  *test_rep;
+	srpc_stat_reply_t  *stat_rep;
+	int		 rc = 0;
+
+	switch (trans->tas_opc) {
+	case LST_TRANS_SESNEW:
+		rc = lstcon_sesnew_stat_reply(trans, nd, msg);
+		if (rc == 0) {
+			lstcon_sesop_stat_success(stat, 1);
+			return;
+		}
+
+		lstcon_sesop_stat_failure(stat, 1);
+		break;
+
+	case LST_TRANS_SESEND:
+		rmsn_rep = &msg->msg_body.rmsn_reply;
+		/* ESRCH is not an error for end session */
+		if (rmsn_rep->rmsn_status == 0 ||
+		    rmsn_rep->rmsn_status == ESRCH) {
+			lstcon_sesop_stat_success(stat, 1);
+			return;
+		}
+
+		lstcon_sesop_stat_failure(stat, 1);
+		rc = rmsn_rep->rmsn_status;
+		break;
+
+	case LST_TRANS_SESQRY:
+	case LST_TRANS_SESPING:
+		dbg_rep = &msg->msg_body.dbg_reply;
+
+		/* queries never record an error: the node is counted as
+		 * unknown, active (our session) or busy (another session) */
+		if (dbg_rep->dbg_status == ESRCH) {
+			lstcon_sesqry_stat_unknown(stat, 1);
+			return;
+		}
+
+		if (lstcon_session_match(dbg_rep->dbg_sid))
+			lstcon_sesqry_stat_active(stat, 1);
+		else
+			lstcon_sesqry_stat_busy(stat, 1);
+		return;
+
+	case LST_TRANS_TSBRUN:
+	case LST_TRANS_TSBSTOP:
+		bat_rep = &msg->msg_body.bat_reply;
+
+		if (bat_rep->bar_status == 0) {
+			lstcon_tsbop_stat_success(stat, 1);
+			return;
+		}
+
+		/* EPERM in reply to a stop request is counted as success */
+		if (bat_rep->bar_status == EPERM &&
+		    trans->tas_opc == LST_TRANS_TSBSTOP) {
+			lstcon_tsbop_stat_success(stat, 1);
+			return;
+		}
+
+		lstcon_tsbop_stat_failure(stat, 1);
+		rc = bat_rep->bar_status;
+		break;
+
+	case LST_TRANS_TSBCLIQRY:
+	case LST_TRANS_TSBSRVQRY:
+		bat_rep = &msg->msg_body.bat_reply;
+
+		/* the run/idle counter is bumped even if the status below
+		 * turns out to be a failure */
+		if (bat_rep->bar_active != 0)
+			lstcon_tsbqry_stat_run(stat, 1);
+		else
+			lstcon_tsbqry_stat_idle(stat, 1);
+
+		if (bat_rep->bar_status == 0)
+			return;
+
+		lstcon_tsbqry_stat_failure(stat, 1);
+		rc = bat_rep->bar_status;
+		break;
+
+	case LST_TRANS_TSBCLIADD:
+	case LST_TRANS_TSBSRVADD:
+		test_rep = &msg->msg_body.tes_reply;
+
+		if (test_rep->tsr_status == 0) {
+			lstcon_tsbop_stat_success(stat, 1);
+			return;
+		}
+
+		lstcon_tsbop_stat_failure(stat, 1);
+		rc = test_rep->tsr_status;
+		break;
+
+	case LST_TRANS_STATQRY:
+		stat_rep = &msg->msg_body.stat_reply;
+
+		if (stat_rep->str_status == 0) {
+			lstcon_statqry_stat_success(stat, 1);
+			return;
+		}
+
+		lstcon_statqry_stat_failure(stat, 1);
+		rc = stat_rep->str_status;
+		break;
+
+	default:
+		LBUG();
+	}
+
+	/* keep only the first error seen on this transaction */
+	if (stat->trs_fwk_errno == 0)
+		stat->trs_fwk_errno = rc;
+
+	return;
+}
+
+/*
+ * Create a transaction of type @transop on @translist and queue one RPC on
+ * it for every node of @ndlist that @condition accepts.
+ *
+ * @condition is called as condition(transop, node, arg): 0 skips the node,
+ * a negative value aborts with that error, anything else selects the node.
+ * A NULL @condition selects every node.  @arg is also handed to the RPC
+ * constructors that need it (the test for TSB*ADD, the batch header for the
+ * other TSB* operations).
+ *
+ * Returns 0 with the transaction in *transpp.  On any failure the
+ * transaction is destroyed and the error is returned.
+ */
+int
+lstcon_rpc_trans_ndlist(struct list_head *ndlist,
+			struct list_head *translist, int transop,
+			void *arg, lstcon_rpc_cond_func_t condition,
+			lstcon_rpc_trans_t **transpp)
+{
+	lstcon_rpc_trans_t *trans;
+	lstcon_ndlink_t    *ndl;
+	lstcon_node_t      *nd;
+	lstcon_rpc_t       *rpc;
+	unsigned	    feats;
+	int		 rc;
+
+	/* Creating session RPC for list of nodes */
+
+	rc = lstcon_rpc_trans_prep(translist, transop, &trans);
+	if (rc != 0) {
+		CERROR("Can't create transaction %d: %d\n", transop, rc);
+		return rc;
+	}
+
+	feats = trans->tas_features;
+	list_for_each_entry(ndl, ndlist, ndl_link) {
+		rc = condition == NULL ? 1 :
+		     condition(transop, ndl->ndl_node, arg);
+
+		/* rc == 0: node not selected; this also leaves rc at 0 when
+		 * the last node of the list is skipped */
+		if (rc == 0)
+			continue;
+
+		if (rc < 0) {
+			CDEBUG(D_NET, "Condition error while creating RPC "
+				      " for transaction %d: %d\n", transop, rc);
+			break;
+		}
+
+		nd = ndl->ndl_node;
+
+		/* pick the RPC constructor that matches the operation */
+		switch (transop) {
+		case LST_TRANS_SESNEW:
+		case LST_TRANS_SESEND:
+			rc = lstcon_sesrpc_prep(nd, transop, feats, &rpc);
+			break;
+		case LST_TRANS_SESQRY:
+		case LST_TRANS_SESPING:
+			rc = lstcon_dbgrpc_prep(nd, feats, &rpc);
+			break;
+		case LST_TRANS_TSBCLIADD:
+		case LST_TRANS_TSBSRVADD:
+			rc = lstcon_testrpc_prep(nd, transop, feats,
+						 (lstcon_test_t *)arg, &rpc);
+			break;
+		case LST_TRANS_TSBRUN:
+		case LST_TRANS_TSBSTOP:
+		case LST_TRANS_TSBCLIQRY:
+		case LST_TRANS_TSBSRVQRY:
+			rc = lstcon_batrpc_prep(nd, transop, feats,
+						(lstcon_tsb_hdr_t *)arg, &rpc);
+			break;
+		case LST_TRANS_STATQRY:
+			rc = lstcon_statrpc_prep(nd, feats, &rpc);
+			break;
+		default:
+			rc = -EINVAL;
+			break;
+		}
+
+		if (rc != 0) {
+			CERROR("Failed to create RPC for transaction %s: %d\n",
+			       lstcon_rpc_trans_name(transop), rc);
+			break;
+		}
+
+		lstcon_rpc_trans_addreq(trans, rpc);
+	}
+
+	if (rc == 0) {
+		*transpp = trans;
+		return 0;
+	}
+
+	/* the loop was aborted; the transaction is not handed out */
+	lstcon_rpc_trans_destroy(trans);
+
+	return rc;
+}
+
+/*
+ * Timer callback of the console pinger; @arg is the pinger's own timer.
+ *
+ * Runs under console_session.ses_mutex.  Does nothing if the session is
+ * shutting down or already expired.  If the console has been idle for longer
+ * than ses_timeout the session is marked expired, an "end session" RPC is
+ * posted to every active node and the timer is not re-armed.  Otherwise
+ * finished ping RPCs are recycled, a debug (ping) RPC is posted to each
+ * active node that is due for one, and the timer is re-armed
+ * LST_PING_INTERVAL seconds ahead.
+ */
+void
+lstcon_rpc_pinger(void *arg)
+{
+	stt_timer_t	*ptimer = (stt_timer_t *)arg;
+	lstcon_rpc_trans_t *trans;
+	lstcon_rpc_t       *crpc;
+	srpc_msg_t	 *rep;
+	srpc_debug_reqst_t *drq;
+	lstcon_ndlink_t    *ndl;
+	lstcon_node_t      *nd;
+	time_t	      intv;
+	int		 count = 0;
+	int		 rc;
+
+	/* RPC pinger is a special case of transaction,
+	 * it's called by timer at 8 seconds interval.
+	 */
+	mutex_lock(&console_session.ses_mutex);
+
+	if (console_session.ses_shutdown || console_session.ses_expired) {
+		mutex_unlock(&console_session.ses_mutex);
+		return;
+	}
+
+	/* ses_expired is known to be clear here (checked just above) */
+	if (!console_session.ses_expired &&
+	    cfs_time_current_sec() - console_session.ses_laststamp >
+	    (time_t)console_session.ses_timeout)
+		console_session.ses_expired = 1;
+
+	trans = console_session.ses_ping;
+
+	LASSERT (trans != NULL);
+
+	list_for_each_entry(ndl, &console_session.ses_ndl_list, ndl_link) {
+		nd = ndl->ndl_node;
+
+		if (console_session.ses_expired) {
+			/* idle console, end session on all nodes */
+			if (nd->nd_state != LST_NODE_ACTIVE)
+				continue;
+
+			rc = lstcon_sesrpc_prep(nd, LST_TRANS_SESEND,
+						trans->tas_features, &crpc);
+			if (rc != 0) {
+				CERROR("Out of memory\n");
+				break;
+			}
+
+			lstcon_rpc_trans_addreq(trans, crpc);
+			lstcon_rpc_post(crpc);
+
+			continue;
+		}
+
+		/* each node carries its own ping RPC, reused on every round */
+		crpc = &nd->nd_ping;
+
+		if (crpc->crp_rpc != NULL) {
+			LASSERT (crpc->crp_trans == trans);
+			LASSERT (!list_empty(&crpc->crp_link));
+
+			spin_lock(&crpc->crp_rpc->crpc_lock);
+
+			LASSERT(crpc->crp_posted);
+
+			if (!crpc->crp_finished) {
+				/* in flight */
+				spin_unlock(&crpc->crp_rpc->crpc_lock);
+				continue;
+			}
+
+			spin_unlock(&crpc->crp_rpc->crpc_lock);
+
+			/* the previous ping has finished: process its reply
+			 * and take it off the transaction so it can be reused */
+			lstcon_rpc_get_reply(crpc, &rep);
+
+			list_del_init(&crpc->crp_link);
+
+			lstcon_rpc_put(crpc);
+		}
+
+		if (nd->nd_state != LST_NODE_ACTIVE)
+			continue;
+
+		/* skip nodes whose nd_stamp is younger than half of their
+		 * session timeout */
+		intv = cfs_duration_sec(cfs_time_sub(cfs_time_current(),
+						     nd->nd_stamp));
+		if (intv < (time_t)nd->nd_timeout / 2)
+			continue;
+
+		rc = lstcon_rpc_init(nd, SRPC_SERVICE_DEBUG,
+				     trans->tas_features, 0, 0, 1, crpc);
+		if (rc != 0) {
+			CERROR("Out of memory\n");
+			break;
+		}
+
+		drq = &crpc->crp_rpc->crpc_reqstmsg.msg_body.dbg_reqst;
+
+		drq->dbg_sid   = console_session.ses_id;
+		drq->dbg_flags = 0;
+
+		lstcon_rpc_trans_addreq(trans, crpc);
+		lstcon_rpc_post(crpc);
+
+		count ++;
+	}
+
+	/* an expired session does not re-arm the ping timer */
+	if (console_session.ses_expired) {
+		mutex_unlock(&console_session.ses_mutex);
+		return;
+	}
+
+	CDEBUG(D_NET, "Ping %d nodes in session\n", count);
+
+	ptimer->stt_expires = (cfs_time_t)(cfs_time_current_sec() + LST_PING_INTERVAL);
+	stt_add_timer(ptimer);
+
+	mutex_unlock(&console_session.ses_mutex);
+}
+
+/*
+ * Create the ping transaction of the console session and arm the ping timer
+ * to fire LST_PING_INTERVAL seconds from now.
+ *
+ * Returns 0 on success or the error from lstcon_rpc_trans_prep().
+ */
+int
+lstcon_rpc_pinger_start(void)
+{
+	stt_timer_t *timer = &console_session.ses_ping_timer;
+	int	     rc;
+
+	LASSERT(list_empty(&console_session.ses_rpc_freelist));
+	LASSERT(atomic_read(&console_session.ses_rpc_counter) == 0);
+
+	rc = lstcon_rpc_trans_prep(NULL, LST_TRANS_SESPING,
+				   &console_session.ses_ping);
+	if (rc != 0) {
+		CERROR("Failed to create console pinger\n");
+		return rc;
+	}
+
+	timer->stt_expires = (cfs_time_t)(cfs_time_current_sec() +
+					  LST_PING_INTERVAL);
+	stt_add_timer(timer);
+
+	return 0;
+}
+
+/*
+ * Stop the console pinger: cancel its timer, abort the ping transaction
+ * with -ESHUTDOWN, destroy it and clear the transaction statistics.
+ * The session must already be flagged as shutting down.
+ */
+void
+lstcon_rpc_pinger_stop(void)
+{
+	lstcon_rpc_trans_t *ping;
+
+	LASSERT(console_session.ses_shutdown);
+
+	stt_del_timer(&console_session.ses_ping_timer);
+
+	ping = console_session.ses_ping;
+
+	lstcon_rpc_trans_abort(ping, -ESHUTDOWN);
+	lstcon_rpc_trans_stat(ping, lstcon_trans_stat());
+	lstcon_rpc_trans_destroy(ping);
+
+	memset(lstcon_trans_stat(), 0, sizeof(lstcon_trans_stat_t));
+
+	console_session.ses_ping = NULL;
+}
+
+/*
+ * Wait for all console transactions and RPCs to go away during session
+ * shutdown, then free the cached RPC descriptors on the session freelist.
+ *
+ * Called with console_session.ses_mutex held; the mutex is dropped while
+ * sleeping between polls of the transaction list and re-taken afterwards.
+ */
+void
+lstcon_rpc_cleanup_wait(void)
+{
+	lstcon_rpc_trans_t *trans;
+	lstcon_rpc_t       *crpc;
+	struct list_head	 *pacer;
+	struct list_head	  zlist;
+
+	/* Called with hold of global mutex */
+
+	LASSERT (console_session.ses_shutdown);
+
+	/* wake the waiters of every pending transaction, then poll once a
+	 * second until the transaction list has drained */
+	while (!list_empty(&console_session.ses_trans_list)) {
+		list_for_each(pacer, &console_session.ses_trans_list) {
+			trans = list_entry(pacer, lstcon_rpc_trans_t,
+					       tas_link);
+
+			CDEBUG(D_NET, "Session closed, wakeup transaction %s\n",
+			       lstcon_rpc_trans_name(trans->tas_opc));
+
+			wake_up(&trans->tas_waitq);
+		}
+
+		mutex_unlock(&console_session.ses_mutex);
+
+		CWARN("Session is shutting down, "
+		      "waiting for termination of transactions\n");
+		cfs_pause(cfs_time_seconds(1));
+
+		mutex_lock(&console_session.ses_mutex);
+	}
+
+	spin_lock(&console_session.ses_rpc_lock);
+
+	/* NOTE(review): lst_wait_until() is defined elsewhere; presumably it
+	 * drops and re-takes ses_rpc_lock while waiting -- confirm */
+	lst_wait_until((atomic_read(&console_session.ses_rpc_counter) == 0),
+		       console_session.ses_rpc_lock,
+		       "Network is not accessable or target is down, "
+		       "waiting for %d console RPCs to being recycled\n",
+		       atomic_read(&console_session.ses_rpc_counter));
+
+	/* move the whole freelist onto the local head 'zlist': link zlist in
+	 * right after the freelist head, then unlink (and reset) that head */
+	list_add(&zlist, &console_session.ses_rpc_freelist);
+	list_del_init(&console_session.ses_rpc_freelist);
+
+	spin_unlock(&console_session.ses_rpc_lock);
+
+	/* free the detached descriptors outside the spinlock */
+	while (!list_empty(&zlist)) {
+		crpc = list_entry(zlist.next, lstcon_rpc_t, crp_link);
+
+		list_del(&crpc->crp_link);
+		LIBCFS_FREE(crpc, sizeof(lstcon_rpc_t));
+	}
+}
+
+/*
+ * Initialise the console RPC state kept in console_session: the ping timer
+ * (which runs lstcon_rpc_pinger() with the timer itself as argument), the
+ * RPC freelist with its lock, and the RPC counter.  Always returns 0.
+ */
+int
+lstcon_rpc_module_init(void)
+{
+	stt_timer_t *timer = &console_session.ses_ping_timer;
+
+	INIT_LIST_HEAD(&timer->stt_list);
+	timer->stt_func = lstcon_rpc_pinger;
+	timer->stt_data = timer;
+
+	console_session.ses_ping = NULL;
+
+	spin_lock_init(&console_session.ses_rpc_lock);
+	atomic_set(&console_session.ses_rpc_counter, 0);
+	INIT_LIST_HEAD(&console_session.ses_rpc_freelist);
+
+	return 0;
+}
+
+/*
+ * Module teardown check: by now the RPC freelist must be empty and no
+ * console RPC may still be accounted for.
+ */
+void
+lstcon_rpc_module_fini(void)
+{
+	LASSERT(list_empty(&console_session.ses_rpc_freelist));
+	LASSERT(atomic_read(&console_session.ses_rpc_counter) == 0);
+}

diff --git a/drivers/staging/lustre/lnet/selftest/conrpc.h b/drivers/staging/lustre/lnet/selftest/conrpc.h
new file mode 100644
index 0000000..9aba24a
--- /dev/null
+++ b/drivers/staging/lustre/lnet/selftest/conrpc.h

@@ -0,0 +1,146 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * /lnet/selftest/conrpc.h
+ *
+ * Console rpc
+ *
+ * Author: Liang Zhen <liang@whamcloud.com>
+ */
+
+#ifndef __LST_CONRPC_H__
+#define __LST_CONRPC_H__
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/lnet.h>
+#include <linux/lnet/lib-types.h>
+#include <linux/lnet/lnetst.h>
+#include "rpc.h"
+#include "selftest.h"
+
+/* Console rpc and rpc transaction: upper and lower bound of the timeout */
+#define LST_TRANS_TIMEOUT       30
+#define LST_TRANS_MIN_TIMEOUT   3
+
+/* clamp t into [LST_TRANS_MIN_TIMEOUT, LST_TRANS_TIMEOUT] */
+#define LST_VALIDATE_TIMEOUT(t) MIN(MAX(t, LST_TRANS_MIN_TIMEOUT), LST_TRANS_TIMEOUT)
+
+/* period of the console pinger, in seconds */
+#define LST_PING_INTERVAL       8
+
+struct lstcon_rpc_trans;
+struct lstcon_tsb_hdr;
+struct lstcon_test;
+struct lstcon_node;
+
+typedef struct lstcon_rpc {
+	struct list_head	       crp_link;       /* link on a transaction or on the session freelist */
+	srpc_client_rpc_t       *crp_rpc;	/* underlying client rpc */
+	struct lstcon_node      *crp_node;       /* destination node */
+	struct lstcon_rpc_trans *crp_trans;     /* transaction this rpc belongs to */
+
+	unsigned int		 crp_posted:1;   /* rpc has been posted */
+	unsigned int		 crp_finished:1; /* rpc has finished */
+	unsigned int		 crp_unpacked:1; /* reply has been unpacked */
+	/** RPC is embedded in another structure and must not be freed by itself */
+	unsigned int		 crp_embedded:1;
+	int		      crp_status;     /* console rpc error status */
+	cfs_time_t	       crp_stamp;      /* time stamp of the reply */
+} lstcon_rpc_t;
+
+typedef struct lstcon_rpc_trans {
+	struct list_head	    tas_olink;     /* link on the owner's transaction list */
+	struct list_head	    tas_link;      /* link on the global (session) transaction list */
+	int		   tas_opc;       /* operation code, one of LST_TRANS_* */
+	/* non-zero once tas_features has been taken from a reply */
+	unsigned	      tas_feats_updated;
+	/* feature mask used for the RPCs of this transaction */
+	unsigned	      tas_features;
+	wait_queue_head_t	   tas_waitq;     /* wait queue head */
+	atomic_t	  tas_remaining; /* # of un-scheduled rpcs */
+	struct list_head	    tas_rpcs_list; /* queued requests */
+} lstcon_rpc_trans_t;
+
+/*
+ * Transaction operation codes, stored in lstcon_rpc_trans_t::tas_opc.
+ * NOTE(review): the meaning of the LST_TRANS_PRIVATE flag is not visible
+ * here; it is set on the session and batch operations but not on the
+ * query/ping/stat ones -- confirm against its users.
+ */
+#define LST_TRANS_PRIVATE       0x1000
+
+/* session operations */
+#define LST_TRANS_SESNEW	(LST_TRANS_PRIVATE | 0x01)
+#define LST_TRANS_SESEND	(LST_TRANS_PRIVATE | 0x02)
+#define LST_TRANS_SESQRY	0x03
+#define LST_TRANS_SESPING       0x04
+
+/* test batch operations */
+#define LST_TRANS_TSBCLIADD     (LST_TRANS_PRIVATE | 0x11)
+#define LST_TRANS_TSBSRVADD     (LST_TRANS_PRIVATE | 0x12)
+#define LST_TRANS_TSBRUN	(LST_TRANS_PRIVATE | 0x13)
+#define LST_TRANS_TSBSTOP       (LST_TRANS_PRIVATE | 0x14)
+#define LST_TRANS_TSBCLIQRY     0x15
+#define LST_TRANS_TSBSRVQRY     0x16
+
+/* statistics query */
+#define LST_TRANS_STATQRY       0x21
+
+/*
+ * Node filter for lstcon_rpc_trans_ndlist(), called as (transop, node, arg):
+ * 0 skips the node, a negative value aborts, anything else selects it.
+ */
+typedef int (*lstcon_rpc_cond_func_t)(int, struct lstcon_node *, void *);
+/* Per-reply callback, called as (transop, reply message, user-space entry) */
+typedef int (*lstcon_rpc_readent_func_t)(int, srpc_msg_t *, lstcon_rpc_ent_t *);
+
+/*
+ * Per-node RPC constructors.  @feats is the feature mask to use for the
+ * RPC; the parameter is named as in the definitions and at the call sites.
+ */
+int  lstcon_sesrpc_prep(struct lstcon_node *nd, int transop,
+			unsigned feats, lstcon_rpc_t **crpc);
+int  lstcon_dbgrpc_prep(struct lstcon_node *nd,
+			unsigned feats, lstcon_rpc_t **crpc);
+int  lstcon_batrpc_prep(struct lstcon_node *nd, int transop, unsigned feats,
+			struct lstcon_tsb_hdr *tsb, lstcon_rpc_t **crpc);
+int  lstcon_testrpc_prep(struct lstcon_node *nd, int transop, unsigned feats,
+			 struct lstcon_test *test, lstcon_rpc_t **crpc);
+int  lstcon_statrpc_prep(struct lstcon_node *nd, unsigned feats,
+			 lstcon_rpc_t **crpc);
+void lstcon_rpc_put(lstcon_rpc_t *crpc);
+
+/* transaction handling */
+int  lstcon_rpc_trans_prep(struct list_head *translist,
+			   int transop, lstcon_rpc_trans_t **transpp);
+int  lstcon_rpc_trans_ndlist(struct list_head *ndlist,
+			     struct list_head *translist, int transop,
+			     void *arg, lstcon_rpc_cond_func_t condition,
+			     lstcon_rpc_trans_t **transpp);
+void lstcon_rpc_trans_stat(lstcon_rpc_trans_t *trans,
+			   lstcon_trans_stat_t *stat);
+int  lstcon_rpc_trans_interpreter(lstcon_rpc_trans_t *trans,
+				  struct list_head *head_up,
+				  lstcon_rpc_readent_func_t readent);
+void lstcon_rpc_trans_abort(lstcon_rpc_trans_t *trans, int error);
+void lstcon_rpc_trans_destroy(lstcon_rpc_trans_t *trans);
+void lstcon_rpc_trans_addreq(lstcon_rpc_trans_t *trans, lstcon_rpc_t *req);
+int  lstcon_rpc_trans_postwait(lstcon_rpc_trans_t *trans, int timeout);
+
+/* pinger and module life cycle */
+int  lstcon_rpc_pinger_start(void);
+void lstcon_rpc_pinger_stop(void);
+void lstcon_rpc_cleanup_wait(void);
+int  lstcon_rpc_module_init(void);
+void lstcon_rpc_module_fini(void);
+
+
+#endif

diff --git a/drivers/staging/lustre/lnet/selftest/console.c b/drivers/staging/lustre/lnet/selftest/console.c
new file mode 100644
index 0000000..78e8d04
--- /dev/null
+++ b/drivers/staging/lustre/lnet/selftest/console.c

@@ -0,0 +1,2071 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/console.c
+ *
+ * Infrastructure of LST console
+ *
+ * Author: Liang Zhen <liangzhen@clusterfs.com>
+ */
+
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/lib-lnet.h>
+#include "console.h"
+#include "conrpc.h"
+
+/*
+ * Count node 'nd' in the summary 'p': bump the counter that matches the
+ * node's state (active, busy, down, otherwise unknown) and the total
+ * nle_nnode.  'nd' may be evaluated more than once.
+ */
+#define LST_NODE_STATE_COUNTER(nd, p)		   \
+do {						    \
+	if ((nd)->nd_state == LST_NODE_ACTIVE)	  \
+		(p)->nle_nactive ++;		    \
+	else if ((nd)->nd_state == LST_NODE_BUSY)       \
+		(p)->nle_nbusy ++;		      \
+	else if ((nd)->nd_state == LST_NODE_DOWN)       \
+		(p)->nle_ndown ++;		      \
+	else					    \
+		(p)->nle_nunknown ++;		   \
+	(p)->nle_nnode ++;			      \
+} while (0)
+
+/* the console session instance used throughout the console code */
+lstcon_session_t	console_session;
+
+/* Take one more reference on @nd, which must already be referenced. */
+void
+lstcon_node_get(lstcon_node_t *nd)
+{
+	LASSERT(nd->nd_ref >= 1);
+
+	nd->nd_ref += 1;
+}
+
+/*
+ * Look up the node with process id @id in the session-wide hash and return
+ * it, with one reference taken for the caller, in *ndpp.
+ *
+ * If the node does not exist and @create is set, a new node is allocated
+ * together with its session ndlink, starts out with a single reference
+ * (the caller's) in state LST_NODE_UNKNOWN, and is queued on the session
+ * hash and list.
+ *
+ * Returns 0, -ENOENT (not found and !@create) or -ENOMEM.
+ */
+static int
+lstcon_node_find(lnet_process_id_t id, lstcon_node_t **ndpp, int create)
+{
+	unsigned int	  bucket = LNET_NIDADDR(id.nid) % LST_GLOBAL_HASHSIZE;
+	struct list_head *chain  = &console_session.ses_ndl_hash[bucket];
+	lstcon_ndlink_t  *link;
+	lstcon_node_t	 *node;
+
+	LASSERT(id.nid != LNET_NID_ANY);
+
+	list_for_each_entry(link, chain, ndl_hlink) {
+		node = link->ndl_node;
+
+		if (node->nd_id.nid == id.nid && node->nd_id.pid == id.pid) {
+			lstcon_node_get(node);
+			*ndpp = node;
+			return 0;
+		}
+	}
+
+	if (!create)
+		return -ENOENT;
+
+	/* node and its session ndlink live in one allocation */
+	LIBCFS_ALLOC(*ndpp, sizeof(lstcon_node_t) + sizeof(lstcon_ndlink_t));
+	if (*ndpp == NULL)
+		return -ENOMEM;
+
+	node = *ndpp;
+	link = (lstcon_ndlink_t *)(node + 1);
+
+	link->ndl_node = node;
+
+	node->nd_ref	 = 1;
+	node->nd_id	 = id;
+	node->nd_stamp	 = cfs_time_current();
+	node->nd_state	 = LST_NODE_UNKNOWN;
+	node->nd_timeout = 0;
+	memset(&node->nd_ping, 0, sizeof(lstcon_rpc_t));
+
+	/* queued in global hash & list, no refcount is taken by
+	 * global hash & list, if caller release his refcount,
+	 * node will be released */
+	list_add_tail(&link->ndl_hlink, chain);
+	list_add_tail(&link->ndl_link, &console_session.ses_ndl_list);
+
+	return 0;
+}
+
+/*
+ * Drop one reference on @nd.  When the last reference goes away the node is
+ * unlinked from the session hash and list and freed together with the
+ * ndlink that was allocated right behind it.
+ */
+void
+lstcon_node_put(lstcon_node_t *nd)
+{
+	lstcon_ndlink_t *link;
+
+	LASSERT(nd->nd_ref > 0);
+
+	nd->nd_ref--;
+	if (nd->nd_ref > 0)
+		return;
+
+	/* the session ndlink sits directly after the node */
+	link = (lstcon_ndlink_t *)(nd + 1);
+
+	LASSERT(!list_empty(&link->ndl_link));
+	LASSERT(!list_empty(&link->ndl_hlink));
+
+	/* remove from session */
+	list_del(&link->ndl_link);
+	list_del(&link->ndl_hlink);
+
+	LIBCFS_FREE(nd, sizeof(lstcon_node_t) + sizeof(lstcon_ndlink_t));
+}
+
+/*
+ * Find the ndlink for process id @id in the node hash table @hash.
+ *
+ * With @create == 0 only the lookup is done.  Otherwise a missing ndlink is
+ * allocated and hashed; its node is looked up in the session, and created
+ * there as well only when @create == 1.  The new ndlink holds the node
+ * reference obtained from lstcon_node_find() and is not yet on any list.
+ *
+ * Returns 0 with the ndlink in *ndlpp, -EINVAL for LNET_NID_ANY, -ENOENT,
+ * -ENOMEM, or the error of lstcon_node_find().
+ */
+static int
+lstcon_ndlink_find(struct list_head *hash,
+		   lnet_process_id_t id, lstcon_ndlink_t **ndlpp, int create)
+{
+	struct list_head *chain;
+	lstcon_ndlink_t	 *link;
+	lstcon_node_t	 *node;
+	int		  rc;
+
+	chain = &hash[LNET_NIDADDR(id.nid) % LST_NODE_HASHSIZE];
+
+	if (id.nid == LNET_NID_ANY)
+		return -EINVAL;
+
+	/* search in hash */
+	list_for_each_entry(link, chain, ndl_hlink) {
+		if (link->ndl_node->nd_id.nid == id.nid &&
+		    link->ndl_node->nd_id.pid == id.pid) {
+			*ndlpp = link;
+			return 0;
+		}
+	}
+
+	if (create == 0)
+		return -ENOENT;
+
+	/* find or create in session hash */
+	rc = lstcon_node_find(id, &node, create == 1);
+	if (rc != 0)
+		return rc;
+
+	LIBCFS_ALLOC(link, sizeof(lstcon_ndlink_t));
+	if (link == NULL) {
+		lstcon_node_put(node);
+		return -ENOMEM;
+	}
+
+	link->ndl_node = node;
+	INIT_LIST_HEAD(&link->ndl_link);
+	list_add_tail(&link->ndl_hlink, chain);
+
+	*ndlpp = link;
+
+	return 0;
+}
+
+/*
+ * Free @ndl, which must already be off its node list but still hashed:
+ * unhash it, drop its node reference and release the memory.
+ */
+static void
+lstcon_ndlink_release(lstcon_ndlink_t *ndl)
+{
+	lstcon_node_t *node = ndl->ndl_node;
+
+	LASSERT(list_empty(&ndl->ndl_link));
+	LASSERT(!list_empty(&ndl->ndl_hlink));
+
+	/* delete from hash */
+	list_del(&ndl->ndl_hlink);
+	lstcon_node_put(node);
+
+	LIBCFS_FREE(ndl, sizeof(*ndl));
+}
+
+/*
+ * Allocate and initialise an empty node group.
+ *
+ * @name may be NULL for an anonymous group; otherwise it is copied into
+ * grp_name and has to fit there including the terminating NUL.
+ *
+ * Returns 0 with the new group (one reference, owned by the caller) in
+ * *grpp, -E2BIG if @name does not fit into grp_name, or -ENOMEM.
+ */
+static int
+lstcon_group_alloc(char *name, lstcon_group_t **grpp)
+{
+	const size_t	size = offsetof(lstcon_group_t,
+					grp_ndl_hash[LST_NODE_HASHSIZE]);
+	lstcon_group_t *grp = NULL;
+	int		i;
+
+	/* grp_name is a fixed-size array inside the group: refuse names that
+	 * would overrun it instead of copying them blindly */
+	if (name != NULL && strlen(name) >= sizeof(grp->grp_name))
+		return -E2BIG;
+
+	LIBCFS_ALLOC(grp, size);
+	if (grp == NULL)
+		return -ENOMEM;
+
+	memset(grp, 0, size);
+
+	grp->grp_ref = 1;
+	if (name != NULL)
+		strcpy(grp->grp_name, name); /* length checked above */
+
+	INIT_LIST_HEAD(&grp->grp_link);
+	INIT_LIST_HEAD(&grp->grp_ndl_list);
+	INIT_LIST_HEAD(&grp->grp_trans_list);
+
+	for (i = 0; i < LST_NODE_HASHSIZE; i++)
+		INIT_LIST_HEAD(&grp->grp_ndl_hash[i]);
+
+	*grpp = grp;
+
+	return 0;
+}
+
+/* Take one more reference on @grp. */
+static void
+lstcon_group_addref(lstcon_group_t *grp)
+{
+	grp->grp_ref += 1;
+}
+
+static void lstcon_group_ndlink_release(lstcon_group_t *, lstcon_ndlink_t *);
+
+/*
+ * Release every member of @grp whose node state has none of the bits in
+ * @keep set; with @keep == 0 the group is emptied.
+ */
+static void
+lstcon_group_drain(lstcon_group_t *grp, int keep)
+{
+	lstcon_ndlink_t *link;
+	lstcon_ndlink_t *next;
+
+	list_for_each_entry_safe(link, next, &grp->grp_ndl_list, ndl_link) {
+		if ((link->ndl_node->nd_state & keep) != 0)
+			continue;
+
+		lstcon_group_ndlink_release(grp, link);
+	}
+}
+
+/*
+ * Drop one reference on @grp.  With the last reference the group is taken
+ * off the list it may be linked on, all of its members are released and
+ * the group itself is freed.
+ */
+static void
+lstcon_group_decref(lstcon_group_t *grp)
+{
+	int bucket;
+
+	grp->grp_ref--;
+	if (grp->grp_ref > 0)
+		return;
+
+	if (!list_empty(&grp->grp_link))
+		list_del(&grp->grp_link);
+
+	lstcon_group_drain(grp, 0);
+
+	/* draining must have emptied every hash chain */
+	for (bucket = 0; bucket < LST_NODE_HASHSIZE; bucket++)
+		LASSERT(list_empty(&grp->grp_ndl_hash[bucket]));
+
+	LIBCFS_FREE(grp, offsetof(lstcon_group_t,
+				  grp_ndl_hash[LST_NODE_HASHSIZE]));
+}
+
+/*
+ * Look up the session group called @name (compared over at most
+ * LST_NAME_SIZE characters).  On success a reference is taken for the
+ * caller and the group is returned in *grpp.
+ *
+ * Returns 0 or -ENOENT.
+ */
+static int
+lstcon_group_find(char *name, lstcon_group_t **grpp)
+{
+	lstcon_group_t *cur;
+
+	list_for_each_entry(cur, &console_session.ses_grp_list, grp_link) {
+		if (strncmp(cur->grp_name, name, LST_NAME_SIZE) == 0) {
+			lstcon_group_addref(cur);  /* +1 ref for caller */
+			*grpp = cur;
+			return 0;
+		}
+	}
+
+	return -ENOENT;
+}
+
+/* Give back a group reference; simply forwards to lstcon_group_decref(). */
+static void
+lstcon_group_put(lstcon_group_t *grp)
+{
+	lstcon_group_decref(grp);
+}
+
+/*
+ * Find (or, depending on @create, create) the ndlink of @id in @grp.
+ * An ndlink that is not yet on a node list is appended to the member list
+ * of @grp and counted in grp_nnode.
+ *
+ * Returns 0 with the ndlink in *ndlpp, or the error of lstcon_ndlink_find().
+ */
+static int
+lstcon_group_ndlink_find(lstcon_group_t *grp, lnet_process_id_t id,
+			 lstcon_ndlink_t **ndlpp, int create)
+{
+	lstcon_ndlink_t *link;
+	int		 rc;
+
+	rc = lstcon_ndlink_find(&grp->grp_ndl_hash[0], id, ndlpp, create);
+	if (rc != 0)
+		return rc;
+
+	link = *ndlpp;
+	if (list_empty(&link->ndl_link)) {
+		/* freshly created: make it a member of the group */
+		list_add_tail(&link->ndl_link, &grp->grp_ndl_list);
+		grp->grp_nnode++;
+	}
+
+	return 0;
+}
+
+/* Remove member @ndl from @grp and free it. */
+static void
+lstcon_group_ndlink_release(lstcon_group_t *grp, lstcon_ndlink_t *ndl)
+{
+	/* lstcon_ndlink_release() expects the link to be off the list */
+	list_del_init(&ndl->ndl_link);
+	lstcon_ndlink_release(ndl);
+
+	grp->grp_nnode -= 1;
+}
+
+/*
+ * Move member @ndl from group @old to the tail of group @new, re-hashing it
+ * in @new and adjusting both member counts.
+ */
+static void
+lstcon_group_ndlink_move(lstcon_group_t *old,
+			 lstcon_group_t *new, lstcon_ndlink_t *ndl)
+{
+	unsigned int bucket;
+
+	bucket = LNET_NIDADDR(ndl->ndl_node->nd_id.nid) % LST_NODE_HASHSIZE;
+
+	list_move_tail(&ndl->ndl_hlink, &new->grp_ndl_hash[bucket]);
+	list_move_tail(&ndl->ndl_link, &new->grp_ndl_list);
+
+	old->grp_nnode--;
+	new->grp_nnode++;
+}
+
+/* Move every member of group @old, in order, to the tail of group @new. */
+static void
+lstcon_group_move(lstcon_group_t *old, lstcon_group_t *new)
+{
+	lstcon_ndlink_t *link;
+	lstcon_ndlink_t *next;
+
+	list_for_each_entry_safe(link, next, &old->grp_ndl_list, ndl_link)
+		lstcon_group_ndlink_move(old, new, link);
+}
+
+/*
+ * Node filter for session transactions (see lstcon_rpc_trans_ndlist()).
+ *
+ * LST_TRANS_SESNEW: skip nodes that are already active.
+ * LST_TRANS_SESEND: skip nodes that are not active and, when a group is
+ *                   passed in @arg, nodes that hold more than one reference.
+ * LST_TRANS_SESQRY: every node is selected.
+ *
+ * Returns 1 to select the node and 0 to skip it.
+ */
+int
+lstcon_sesrpc_condition(int transop, lstcon_node_t *nd, void *arg)
+{
+	switch (transop) {
+	case LST_TRANS_SESNEW:
+		return nd->nd_state != LST_NODE_ACTIVE;
+
+	case LST_TRANS_SESEND:
+		if (nd->nd_state != LST_NODE_ACTIVE)
+			return 0;
+
+		return !(arg != NULL && nd->nd_ref > 1);
+
+	case LST_TRANS_SESQRY:
+		return 1;
+
+	default:
+		LBUG();
+	}
+
+	return 1;
+}
+
+/*
+ * Copy the per-node result of a session transaction to user space.
+ *
+ * Only LST_TRANS_SESQRY has something to report: the timeout and the name
+ * from the debug reply are copied into @ent_up.  LST_TRANS_SESNEW and
+ * LST_TRANS_SESEND copy nothing.
+ *
+ * Returns 0 or -EFAULT if a copy to user space fails.
+ */
+int
+lstcon_sesrpc_readent(int transop, srpc_msg_t *msg,
+		      lstcon_rpc_ent_t *ent_up)
+{
+	srpc_debug_reply_t *dbg;
+
+	switch (transop) {
+	case LST_TRANS_SESNEW:
+	case LST_TRANS_SESEND:
+		break;
+
+	case LST_TRANS_SESQRY:
+		dbg = &msg->msg_body.dbg_reply;
+
+		if (copy_to_user(&ent_up->rpe_priv[0],
+				 &dbg->dbg_timeout, sizeof(int)))
+			return -EFAULT;
+
+		if (copy_to_user(&ent_up->rpe_payload[0],
+				 &dbg->dbg_name, LST_NAME_SIZE))
+			return -EFAULT;
+		break;
+
+	default:
+		LBUG();
+	}
+
+	return 0;
+}
+
+/*
+ * Add the @count nodes whose ids are stored in the user-space array @ids_up
+ * to @grp and start a session on them.
+ *
+ * Nodes that are not yet members are first collected in a temporary group;
+ * a LST_TRANS_SESNEW transaction is then run on that group, its per-node
+ * results are reported through @result_up and the feature mask of the
+ * transaction is stored in *featp.  Once the transaction has been run the
+ * collected nodes are moved into @grp regardless of the result.
+ *
+ * Returns the result of lstcon_rpc_trans_interpreter(), or a negative errno
+ * if the ids could not be read or the nodes/transaction could not be set up
+ * (in which case @grp is left unchanged).
+ */
+static int
+lstcon_group_nodes_add(lstcon_group_t *grp,
+		       int count, lnet_process_id_t *ids_up,
+		       unsigned *featp, struct list_head *result_up)
+{
+	lstcon_rpc_trans_t      *trans;
+	lstcon_ndlink_t	 *ndl;
+	lstcon_group_t	  *tmp;
+	lnet_process_id_t	id;
+	int		      i;
+	int		      rc;
+
+	rc = lstcon_group_alloc(NULL, &tmp);
+	if (rc != 0) {
+		CERROR("Out of memory\n");
+		return -ENOMEM;
+	}
+
+	for (i = 0 ; i < count; i++) {
+		if (copy_from_user(&id, &ids_up[i], sizeof(id))) {
+			rc = -EFAULT;
+			break;
+		}
+
+		/* skip if it's in this group already */
+		rc = lstcon_group_ndlink_find(grp, id, &ndl, 0);
+		if (rc == 0)
+			continue;
+
+		/* add to tmp group */
+		rc = lstcon_group_ndlink_find(tmp, id, &ndl, 1);
+		if (rc != 0) {
+			CERROR("Can't create ndlink, out of memory\n");
+			break;
+		}
+	}
+
+	if (rc != 0) {
+		/* dropping the last reference also releases the collected nodes */
+		lstcon_group_put(tmp);
+		return rc;
+	}
+
+	rc = lstcon_rpc_trans_ndlist(&tmp->grp_ndl_list,
+				     &tmp->grp_trans_list, LST_TRANS_SESNEW,
+				     tmp, lstcon_sesrpc_condition, &trans);
+	if (rc != 0) {
+		CERROR("Can't create transaction: %d\n", rc);
+		lstcon_group_put(tmp);
+		return rc;
+	}
+
+	/* post all RPCs */
+	lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT);
+
+	rc = lstcon_rpc_trans_interpreter(trans, result_up,
+					  lstcon_sesrpc_readent);
+	*featp = trans->tas_features;
+
+	/* destroy all RPCs */
+	lstcon_rpc_trans_destroy(trans);
+
+	/* hand the new members over to the real group, then drop tmp */
+	lstcon_group_move(tmp, grp);
+	lstcon_group_put(tmp);
+
+	return rc;
+}
+
+/*
+ * Remove the @count nodes whose ids are stored in the user-space array
+ * @ids_up from @grp and end the session on them.
+ *
+ * Matching members are moved to a temporary group on which a
+ * LST_TRANS_SESEND transaction is run; its per-node results are reported
+ * through @result_up.  Once the transaction has been run the nodes are
+ * released whatever the outcome.  If the ids cannot be read or the
+ * transaction cannot be created, the nodes are moved back into @grp.
+ *
+ * Returns the result of lstcon_rpc_trans_interpreter() or a negative errno.
+ */
+static int
+lstcon_group_nodes_remove(lstcon_group_t *grp,
+			  int count, lnet_process_id_t *ids_up,
+			  struct list_head *result_up)
+{
+	lstcon_rpc_trans_t *trans;
+	lstcon_group_t	   *victims;
+	lstcon_ndlink_t	   *link;
+	lnet_process_id_t   id;
+	int		    n;
+	int		    rc;
+
+	/* End session and remove node from the group */
+
+	if (lstcon_group_alloc(NULL, &victims) != 0) {
+		CERROR("Out of memory\n");
+		return -ENOMEM;
+	}
+
+	for (n = 0; n < count; n++) {
+		if (copy_from_user(&id, &ids_up[n], sizeof(id))) {
+			rc = -EFAULT;
+			goto rollback;
+		}
+
+		/* move node to tmp group */
+		if (lstcon_group_ndlink_find(grp, id, &link, 0) != 0)
+			continue;
+
+		lstcon_group_ndlink_move(grp, victims, link);
+	}
+
+	rc = lstcon_rpc_trans_ndlist(&victims->grp_ndl_list,
+				     &victims->grp_trans_list,
+				     LST_TRANS_SESEND, victims,
+				     lstcon_sesrpc_condition, &trans);
+	if (rc != 0) {
+		CERROR("Can't create transaction: %d\n", rc);
+		goto rollback;
+	}
+
+	lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT);
+
+	rc = lstcon_rpc_trans_interpreter(trans, result_up, NULL);
+
+	lstcon_rpc_trans_destroy(trans);
+	/* release nodes anyway, because we can't rollback status */
+	lstcon_group_put(victims);
+
+	return rc;
+
+rollback:
+	/* nothing was sent: give the nodes back to their group */
+	lstcon_group_move(victims, grp);
+	lstcon_group_put(victims);
+
+	return rc;
+}
+
+/*
+ * Create an empty group called @name and append it to the session's
+ * group list.
+ *
+ * Returns 0 on success, -EEXIST if lstcon_group_find() already finds a
+ * group with that name, -ENOMEM if lstcon_group_alloc() fails.
+ */
+int
+lstcon_group_add(char *name)
+{
+	lstcon_group_t *grp;
+
+	if (lstcon_group_find(name, &grp) == 0) {
+		/* name is taken; presumably this put balances the find */
+		lstcon_group_put(grp);
+		return -EEXIST;
+	}
+
+	if (lstcon_group_alloc(name, &grp) != 0) {
+		CERROR("Can't allocate descriptor for group %s\n", name);
+		return -ENOMEM;
+	}
+
+	list_add_tail(&grp->grp_link, &console_session.ses_grp_list);
+
+	return 0;
+}
+
+/*
+ * Add @count user-supplied node ids to the group called @name.
+ *
+ * The group must exist and must not have a grp_ref above 2, otherwise
+ * -EBUSY is returned.  The actual work is delegated to
+ * lstcon_group_nodes_add(), whose return value is passed through.
+ */
+int
+lstcon_nodes_add(char *name, int count, lnet_process_id_t *ids_up,
+		 unsigned *featp, struct list_head *result_up)
+{
+	lstcon_group_t *grp;
+	int             rc;
+
+	LASSERT (count > 0);
+	LASSERT (ids_up != NULL);
+
+	rc = lstcon_group_find(name, &grp);
+	if (rc != 0) {
+		CDEBUG(D_NET, "Can't find group %s\n", name);
+		return rc;
+	}
+
+	if (grp->grp_ref > 2) {
+		/* somebody else (another thread or a test) holds the group */
+		CDEBUG(D_NET, "Group %s is busy\n", name);
+		rc = -EBUSY;
+	} else {
+		rc = lstcon_group_nodes_add(grp, count, ids_up,
+					    featp, result_up);
+	}
+
+	lstcon_group_put(grp);
+
+	return rc;
+}
+
+/*
+ * Delete the group called @name: run an LST_TRANS_SESEND transaction
+ * over its nodes, then drop two references on the group.
+ *
+ * Returns the lstcon_group_find() error if the group is unknown, -EBUSY
+ * if grp_ref is above 2, the lstcon_rpc_trans_ndlist() error if the
+ * transaction can't be created, and 0 otherwise (RPC results are not
+ * inspected here).
+ */
+int
+lstcon_group_del(char *name)
+{
+	lstcon_rpc_trans_t *trans;
+	lstcon_group_t     *grp;
+	int                 rc;
+
+	rc = lstcon_group_find(name, &grp);
+	if (rc != 0) {
+		CDEBUG(D_NET, "Can't find group: %s\n", name);
+		return rc;
+	}
+
+	if (grp->grp_ref > 2) {
+		/* still referenced by other threads or by a test */
+		CDEBUG(D_NET, "Group %s is busy\n", name);
+		rc = -EBUSY;
+		goto out_put;
+	}
+
+	rc = lstcon_rpc_trans_ndlist(&grp->grp_ndl_list,
+				     &grp->grp_trans_list, LST_TRANS_SESEND,
+				     grp, lstcon_sesrpc_condition, &trans);
+	if (rc != 0) {
+		CERROR("Can't create transaction: %d\n", rc);
+		goto out_put;
+	}
+
+	lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT);
+	lstcon_rpc_trans_destroy(trans);
+
+	/*
+	 * Extra put on the success path only: the state on the nodes can't
+	 * be rolled back, so the group is destroyed regardless.
+	 */
+	lstcon_group_put(grp);
+
+out_put:
+	lstcon_group_put(grp);
+
+	return rc;
+}
+
+/*
+ * Drain nodes from the group called @name according to @args.
+ *
+ * The mask handed to lstcon_group_drain() is the set of all four node
+ * states (ACTIVE, BUSY, DOWN, UNKNOWN) with the bits in @args cleared.
+ * If the group's node list is empty afterwards, one more reference is
+ * dropped on the group.
+ *
+ * Returns the lstcon_group_find() error, -EBUSY when grp_ref is above 2,
+ * or 0.
+ */
+int
+lstcon_group_clean(char *name, int args)
+{
+	lstcon_group_t *grp = NULL;
+	int             drain_mask;
+	int             rc;
+
+	rc = lstcon_group_find(name, &grp);
+	if (rc != 0) {
+		CDEBUG(D_NET, "Can't find group %s\n", name);
+		return rc;
+	}
+
+	if (grp->grp_ref > 2) {
+		/* a test still refers to this group */
+		CDEBUG(D_NET, "Group %s is busy\n", name);
+		lstcon_group_put(grp);
+		return -EBUSY;
+	}
+
+	drain_mask = (LST_NODE_ACTIVE | LST_NODE_BUSY |
+		      LST_NODE_DOWN | LST_NODE_UNKNOWN) & ~args;
+	lstcon_group_drain(grp, drain_mask);
+
+	lstcon_group_put(grp);
+
+	/* nothing left in the group: release it as well */
+	if (list_empty(&grp->grp_ndl_list))
+		lstcon_group_put(grp);
+
+	return 0;
+}
+
+/*
+ * Remove @count user-supplied node ids from the group called @name.
+ *
+ * Delegates to lstcon_group_nodes_remove() and returns its result.  If
+ * the group's node list is empty afterwards, one more reference is
+ * dropped on the group.  Fails with the lstcon_group_find() error or
+ * with -EBUSY when grp_ref is above 2.
+ */
+int
+lstcon_nodes_remove(char *name, int count,
+		    lnet_process_id_t *ids_up, struct list_head *result_up)
+{
+	lstcon_group_t *grp = NULL;
+	int             rc;
+
+	rc = lstcon_group_find(name, &grp);
+	if (rc != 0) {
+		CDEBUG(D_NET, "Can't find group: %s\n", name);
+		return rc;
+	}
+
+	if (grp->grp_ref > 2) {
+		/* a test still refers to this group */
+		CDEBUG(D_NET, "Group %s is busy\n", name);
+		lstcon_group_put(grp);
+		return -EBUSY;
+	}
+
+	rc = lstcon_group_nodes_remove(grp, count, ids_up, result_up);
+
+	lstcon_group_put(grp);
+
+	/* nothing left in the group: release it as well */
+	if (list_empty(&grp->grp_ndl_list))
+		lstcon_group_put(grp);
+
+	return rc;
+}
+
+/*
+ * Run an LST_TRANS_SESNEW transaction over every node of the group
+ * called @name and report per-node results through @result_up.
+ *
+ * Returns the lstcon_group_find() error, -EBUSY when grp_ref is above 2,
+ * the lstcon_rpc_trans_ndlist() error, or the value of
+ * lstcon_rpc_trans_interpreter().
+ */
+int
+lstcon_group_refresh(char *name, struct list_head *result_up)
+{
+	lstcon_rpc_trans_t *trans;
+	lstcon_group_t     *grp;
+	int                 rc;
+
+	rc = lstcon_group_find(name, &grp);
+	if (rc != 0) {
+		CDEBUG(D_NET, "Can't find group: %s\n", name);
+		return rc;
+	}
+
+	if (grp->grp_ref > 2) {
+		/* a test still refers to this group */
+		CDEBUG(D_NET, "Group %s is busy\n", name);
+		rc = -EBUSY;
+		goto out_put;
+	}
+
+	/* invite the group's nodes into the session again */
+	rc = lstcon_rpc_trans_ndlist(&grp->grp_ndl_list,
+				     &grp->grp_trans_list, LST_TRANS_SESNEW,
+				     grp, lstcon_sesrpc_condition, &trans);
+	if (rc != 0) {
+		/* purely local failure, nothing was sent */
+		CDEBUG(D_NET, "Can't create transaction: %d\n", rc);
+		goto out_put;
+	}
+
+	lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT);
+	rc = lstcon_rpc_trans_interpreter(trans, result_up, NULL);
+	lstcon_rpc_trans_destroy(trans);
+
+out_put:
+	/* drop the reference held by this function */
+	lstcon_group_put(grp);
+
+	return rc;
+}
+
+/*
+ * Copy the name of the @index-th group on the session's group list
+ * (counting from 0) to the user buffer @name_up; @len bytes are copied.
+ *
+ * Returns 0 on success, -EFAULT if the copy fails, -ENOENT if the list
+ * has fewer than @index + 1 groups.
+ */
+int
+lstcon_group_list(int index, int len, char *name_up)
+{
+	lstcon_group_t *grp;
+
+	LASSERT (index >= 0);
+	LASSERT (name_up != NULL);
+
+	list_for_each_entry(grp, &console_session.ses_grp_list, grp_link) {
+		if (index-- != 0)
+			continue;
+
+		if (copy_to_user(name_up, grp->grp_name, len))
+			return -EFAULT;
+
+		return 0;
+	}
+
+	return -ENOENT;
+}
+
+/*
+ * Copy a window of node entries from the ndlink list @head to the user
+ * array @dents_up.
+ *
+ * On entry *index_p is the list position to start from and *count_p the
+ * maximum number of entries to copy.  On success *count_p is set to the
+ * number of entries copied and *index_p to the number of list entries
+ * visited.
+ *
+ * Returns 0 on success, -EFAULT if a user copy fails, -ENOENT if the
+ * list has no entry past the starting position.
+ */
+static int
+lstcon_nodes_getent(struct list_head *head, int *index_p,
+		    int *count_p, lstcon_node_ent_t *dents_up)
+{
+	lstcon_ndlink_t *link;
+	lstcon_node_t   *node;
+	int              copied = 0;
+	int              pos = 0;
+
+	LASSERT (index_p != NULL && count_p != NULL);
+	LASSERT (dents_up != NULL);
+	LASSERT (*index_p >= 0);
+	LASSERT (*count_p > 0);
+
+	list_for_each_entry(link, head, ndl_link) {
+		/* still in front of the requested start position */
+		if (pos++ < *index_p)
+			continue;
+
+		/* the caller's array is full */
+		if (copied >= *count_p)
+			break;
+
+		node = link->ndl_node;
+
+		if (copy_to_user(&dents_up[copied].nde_id,
+				 &node->nd_id, sizeof(node->nd_id)))
+			return -EFAULT;
+
+		if (copy_to_user(&dents_up[copied].nde_state,
+				 &node->nd_state, sizeof(node->nd_state)))
+			return -EFAULT;
+
+		copied++;
+	}
+
+	/* never got past the start position: it lies beyond the list */
+	if (pos <= *index_p)
+		return -ENOENT;
+
+	*count_p = copied;
+	*index_p = pos;
+
+	return 0;
+}
+
+/*
+ * Report information about the group called @name.
+ *
+ * Verbose query (@dents_up != NULL): copy individual node entries to
+ * @dents_up via lstcon_nodes_getent(), using @index_p / @count_p as the
+ * window description.
+ *
+ * Non-verbose query (@dents_up == NULL): count the group's nodes with
+ * LST_NODE_STATE_COUNTER() into a temporary lstcon_ndlist_ent_t and copy
+ * that summary to @gents_p.
+ *
+ * Returns 0 on success, the lstcon_group_find() error if the group is
+ * unknown, the lstcon_nodes_getent() result for verbose queries,
+ * -ENOMEM if the summary can't be allocated, or -EFAULT if copying the
+ * summary to user space fails.
+ */
+int
+lstcon_group_info(char *name, lstcon_ndlist_ent_t *gents_p,
+		  int *index_p, int *count_p, lstcon_node_ent_t *dents_up)
+{
+	lstcon_ndlist_ent_t *gentp;
+	lstcon_group_t      *grp;
+	lstcon_ndlink_t     *ndl;
+	int		  rc;
+
+	rc = lstcon_group_find(name, &grp);
+	if (rc != 0) {
+		CDEBUG(D_NET, "Can't find group %s\n", name);
+		return rc;
+	}
+
+	if (dents_up != NULL) {
+		/* verbose query */
+		rc = lstcon_nodes_getent(&grp->grp_ndl_list,
+					 index_p, count_p, dents_up);
+		goto out;
+	}
+
+	/* non-verbose query */
+	LIBCFS_ALLOC(gentp, sizeof(lstcon_ndlist_ent_t));
+	if (gentp == NULL) {
+		CERROR("Can't allocate ndlist_ent\n");
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	memset(gentp, 0, sizeof(lstcon_ndlist_ent_t));
+
+	list_for_each_entry(ndl, &grp->grp_ndl_list, ndl_link)
+		LST_NODE_STATE_COUNTER(ndl->ndl_node, gentp);
+
+	rc = copy_to_user(gents_p, gentp,
+			      sizeof(lstcon_ndlist_ent_t)) ? -EFAULT: 0;
+
+	LIBCFS_FREE(gentp, sizeof(lstcon_ndlist_ent_t));
+
+out:
+	lstcon_group_put(grp);
+
+	/* propagate a failed copy_to_user() instead of reporting success */
+	return rc;
+}
+
+/*
+ * Look up a batch by name on the session's batch list.
+ *
+ * Names are compared with strncmp() over at most LST_NAME_SIZE bytes.
+ * On a match *batpp is set and 0 is returned; otherwise -ENOENT is
+ * returned and *batpp is left untouched.
+ */
+int
+lstcon_batch_find(char *name, lstcon_batch_t **batpp)
+{
+	lstcon_batch_t *cur;
+
+	list_for_each_entry(cur, &console_session.ses_bat_list, bat_link) {
+		if (strncmp(cur->bat_name, name, LST_NAME_SIZE) != 0)
+			continue;
+
+		*batpp = cur;
+		return 0;
+	}
+
+	return -ENOENT;
+}
+
+/*
+ * Create an idle, empty batch called @name and append it to the
+ * session's batch list.
+ *
+ * The batch gets two hash tables of LST_NODE_HASHSIZE list heads each
+ * (client and server side) and a batch id taken from the session's id
+ * cookie.
+ *
+ * Returns 0 on success, -EEXIST if a batch with that name already
+ * exists, -ENOMEM if any allocation fails.
+ */
+int
+lstcon_batch_add(char *name)
+{
+	lstcon_batch_t   *bat;
+	int	       i;
+
+	if (lstcon_batch_find(name, &bat) == 0) {
+		CDEBUG(D_NET, "Batch %s already exists\n", name);
+		return -EEXIST;
+	}
+
+	LIBCFS_ALLOC(bat, sizeof(lstcon_batch_t));
+	if (bat == NULL) {
+		CERROR("Can't allocate descriptor for batch %s\n", name);
+		return -ENOMEM;
+	}
+
+	LIBCFS_ALLOC(bat->bat_cli_hash,
+		     sizeof(struct list_head) * LST_NODE_HASHSIZE);
+	if (bat->bat_cli_hash == NULL) {
+		CERROR("Can't allocate hash for batch %s\n", name);
+		goto out_free_bat;
+	}
+
+	LIBCFS_ALLOC(bat->bat_srv_hash,
+		     sizeof(struct list_head) * LST_NODE_HASHSIZE);
+	if (bat->bat_srv_hash == NULL) {
+		CERROR("Can't allocate hash for batch %s\n", name);
+		goto out_free_cli_hash;
+	}
+
+	/*
+	 * NOTE(review): unbounded copy; this relies on the caller having
+	 * limited the length of @name to what bat_name can hold - confirm.
+	 */
+	strcpy(bat->bat_name, name);
+	bat->bat_hdr.tsb_index = 0;
+	bat->bat_hdr.tsb_id.bat_id = ++console_session.ses_id_cookie;
+
+	bat->bat_ntest = 0;
+	bat->bat_state = LST_BATCH_IDLE;
+
+	INIT_LIST_HEAD(&bat->bat_cli_list);
+	INIT_LIST_HEAD(&bat->bat_srv_list);
+	INIT_LIST_HEAD(&bat->bat_test_list);
+	INIT_LIST_HEAD(&bat->bat_trans_list);
+
+	for (i = 0; i < LST_NODE_HASHSIZE; i++) {
+		INIT_LIST_HEAD(&bat->bat_cli_hash[i]);
+		INIT_LIST_HEAD(&bat->bat_srv_hash[i]);
+	}
+
+	list_add_tail(&bat->bat_link, &console_session.ses_bat_list);
+
+	return 0;
+
+out_free_cli_hash:
+	/* free with the same size that was allocated above */
+	LIBCFS_FREE(bat->bat_cli_hash,
+		    sizeof(struct list_head) * LST_NODE_HASHSIZE);
+out_free_bat:
+	LIBCFS_FREE(bat, sizeof(lstcon_batch_t));
+
+	return -ENOMEM;
+}
+
+/*
+ * Copy the name of the @index-th batch on the session's batch list
+ * (counting from 0) to the user buffer @name_up; @len bytes are copied.
+ *
+ * Returns 0 on success, -EFAULT if the copy fails, -ENOENT if the list
+ * has fewer than @index + 1 batches.
+ */
+int
+lstcon_batch_list(int index, int len, char *name_up)
+{
+	lstcon_batch_t *bat;
+
+	LASSERT (name_up != NULL);
+	LASSERT (index >= 0);
+
+	list_for_each_entry(bat, &console_session.ses_bat_list, bat_link) {
+		if (index-- != 0)
+			continue;
+
+		if (copy_to_user(name_up, bat->bat_name, len))
+			return -EFAULT;
+
+		return 0;
+	}
+
+	return -ENOENT;
+}
+
+/*
+ * Report information about the batch called @name, or about one of its
+ * tests when @testidx > 0 (tests are numbered from 1 in list order).
+ *
+ * The client/server node lists are the batch's own lists for a batch
+ * query, or the node lists of the test's source/destination groups for
+ * a test query.
+ *
+ * Verbose query (@dents_up != NULL): node entries of the server list
+ * (@server != 0) or client list are copied via lstcon_nodes_getent().
+ *
+ * Non-verbose query: a lstcon_test_batch_ent_t summary (batch or test
+ * attributes plus per-state node counters for both lists) is copied to
+ * @ent_up.
+ *
+ * Returns 0 on success, -ENOENT if the batch or test can't be found,
+ * -ENOMEM, -EFAULT, or the lstcon_nodes_getent() result.
+ */
+int
+lstcon_batch_info(char *name, lstcon_test_batch_ent_t *ent_up, int server,
+		  int testidx, int *index_p, int *ndent_p,
+		  lstcon_node_ent_t *dents_up)
+{
+	lstcon_test_batch_ent_t *entp;
+	struct list_head        *clilst;
+	struct list_head        *srvlst;
+	lstcon_test_t           *test = NULL;
+	lstcon_batch_t          *bat;
+	lstcon_ndlink_t         *ndl;
+	int                      rc;
+
+	if (lstcon_batch_find(name, &bat) != 0) {
+		CDEBUG(D_NET, "Can't find batch %s\n", name);
+		return -ENOENT;
+	}
+
+	if (testidx > 0) {
+		/* walk to the testidx-th test; indices start from 1 */
+		list_for_each_entry(test, &bat->bat_test_list, tes_link) {
+			if (testidx-- == 1)
+				break;
+		}
+
+		/* ran off the end of the list before counting down to 0 */
+		if (testidx > 0) {
+			CDEBUG(D_NET, "Can't find specified test in batch\n");
+			return -ENOENT;
+		}
+	}
+
+	if (test == NULL) {
+		clilst = &bat->bat_cli_list;
+		srvlst = &bat->bat_srv_list;
+	} else {
+		clilst = &test->tes_src_grp->grp_ndl_list;
+		srvlst = &test->tes_dst_grp->grp_ndl_list;
+	}
+
+	/* verbose query */
+	if (dents_up != NULL)
+		return lstcon_nodes_getent((server ? srvlst: clilst),
+					   index_p, ndent_p, dents_up);
+
+	/* non-verbose query */
+	LIBCFS_ALLOC(entp, sizeof(lstcon_test_batch_ent_t));
+	if (entp == NULL)
+		return -ENOMEM;
+
+	memset(entp, 0, sizeof(lstcon_test_batch_ent_t));
+
+	if (test != NULL) {
+		entp->u.tbe_test.tse_type   = test->tes_type;
+		entp->u.tbe_test.tse_loop   = test->tes_loop;
+		entp->u.tbe_test.tse_concur = test->tes_concur;
+	} else {
+		entp->u.tbe_batch.bae_ntest = bat->bat_ntest;
+		entp->u.tbe_batch.bae_state = bat->bat_state;
+	}
+
+	list_for_each_entry(ndl, clilst, ndl_link) {
+		LST_NODE_STATE_COUNTER(ndl->ndl_node, &entp->tbe_cli_nle);
+	}
+
+	list_for_each_entry(ndl, srvlst, ndl_link) {
+		LST_NODE_STATE_COUNTER(ndl->ndl_node, &entp->tbe_srv_nle);
+	}
+
+	if (copy_to_user(ent_up, entp, sizeof(lstcon_test_batch_ent_t)))
+		rc = -EFAULT;
+	else
+		rc = 0;
+
+	LIBCFS_FREE(entp, sizeof(lstcon_test_batch_ent_t));
+
+	return rc;
+}
+
+/*
+ * Condition callback handed to lstcon_rpc_trans_ndlist() for batch
+ * transactions.
+ *
+ * Returns -ENETDOWN for LST_TRANS_TSBRUN on a node that is not
+ * LST_NODE_ACTIVE, 0 for LST_TRANS_TSBSTOP on such a node, and 1 in
+ * every other case.  @arg is not used.
+ */
+int
+lstcon_batrpc_condition(int transop, lstcon_node_t *nd, void *arg)
+{
+	if (transop == LST_TRANS_TSBRUN &&
+	    nd->nd_state != LST_NODE_ACTIVE)
+		return -ENETDOWN;
+
+	if (transop == LST_TRANS_TSBSTOP &&
+	    nd->nd_state != LST_NODE_ACTIVE)
+		return 0;
+
+	/* active nodes, and the client/server query operations */
+	return 1;
+}
+
+/*
+ * Run the batch operation @transop over the client node list of @bat
+ * and report per-node results through @result_up.
+ *
+ * Returns the lstcon_rpc_trans_ndlist() error if the transaction can't
+ * be created, otherwise the value of lstcon_rpc_trans_interpreter().
+ */
+static int
+lstcon_batch_op(lstcon_batch_t *bat, int transop,
+		struct list_head *result_up)
+{
+	lstcon_rpc_trans_t *op;
+	int                 status;
+
+	status = lstcon_rpc_trans_ndlist(&bat->bat_cli_list,
+					 &bat->bat_trans_list, transop,
+					 bat, lstcon_batrpc_condition, &op);
+	if (status != 0) {
+		CERROR("Can't create transaction: %d\n", status);
+		return status;
+	}
+
+	lstcon_rpc_trans_postwait(op, LST_TRANS_TIMEOUT);
+	status = lstcon_rpc_trans_interpreter(op, result_up, NULL);
+	lstcon_rpc_trans_destroy(op);
+
+	return status;
+}
+
+/*
+ * Start the batch called @name.  @timeout is stored in bat_arg before
+ * the LST_TRANS_TSBRUN operation is issued.
+ *
+ * The batch is marked LST_BATCH_RUNNING when the transaction statistics
+ * report at least one successful start.  Returns -ENOENT for an unknown
+ * batch, otherwise the lstcon_batch_op() result.
+ */
+int
+lstcon_batch_run(char *name, int timeout, struct list_head *result_up)
+{
+	lstcon_batch_t *bat;
+	int             status;
+
+	if (lstcon_batch_find(name, &bat) != 0) {
+		CDEBUG(D_NET, "Can't find batch %s\n", name);
+		return -ENOENT;
+	}
+
+	bat->bat_arg = timeout;
+	status = lstcon_batch_op(bat, LST_TRANS_TSBRUN, result_up);
+
+	/* one node that started is enough to call the batch running */
+	if (lstcon_tsbop_stat_success(lstcon_trans_stat(), 0) != 0)
+		bat->bat_state = LST_BATCH_RUNNING;
+
+	return status;
+}
+
+/*
+ * Stop the batch called @name.  @force is stored in bat_arg before the
+ * LST_TRANS_TSBSTOP operation is issued.
+ *
+ * The batch is marked LST_BATCH_IDLE when the transaction statistics
+ * report no failed stop.  Returns -ENOENT for an unknown batch,
+ * otherwise the lstcon_batch_op() result.
+ */
+int
+lstcon_batch_stop(char *name, int force, struct list_head *result_up)
+{
+	lstcon_batch_t *bat;
+	int             status;
+
+	if (lstcon_batch_find(name, &bat) != 0) {
+		CDEBUG(D_NET, "Can't find batch %s\n", name);
+		return -ENOENT;
+	}
+
+	bat->bat_arg = force;
+	status = lstcon_batch_op(bat, LST_TRANS_TSBSTOP, result_up);
+
+	/* only idle once no stop request has failed */
+	if (lstcon_tsbop_stat_failure(lstcon_trans_stat(), 0) == 0)
+		bat->bat_state = LST_BATCH_IDLE;
+
+	return status;
+}
+
+/* Unlink every ndlink queued on @head and release it. */
+static void
+lstcon_batch_ndlinks_release(struct list_head *head)
+{
+	lstcon_ndlink_t *ndl;
+
+	while (!list_empty(head)) {
+		ndl = list_entry(head->next, lstcon_ndlink_t, ndl_link);
+		list_del_init(&ndl->ndl_link);
+
+		lstcon_ndlink_release(ndl);
+	}
+}
+
+/*
+ * Unlink @bat from the session's batch list and free it together with
+ * everything it owns: its tests (dropping a reference on each test's
+ * source and destination group), its client/server ndlinks and both
+ * hash tables.
+ *
+ * Asserts that no transaction is pending on the batch or its tests and
+ * that both hash tables are empty once the ndlinks are released.
+ */
+static void
+lstcon_batch_destroy(lstcon_batch_t *bat)
+{
+	lstcon_test_t *test;
+	int            i;
+
+	list_del(&bat->bat_link);
+
+	/* tear down the tests first */
+	while (!list_empty(&bat->bat_test_list)) {
+		test = list_entry(bat->bat_test_list.next,
+				  lstcon_test_t, tes_link);
+		LASSERT (list_empty(&test->tes_trans_list));
+
+		list_del(&test->tes_link);
+
+		lstcon_group_put(test->tes_src_grp);
+		lstcon_group_put(test->tes_dst_grp);
+
+		LIBCFS_FREE(test, offsetof(lstcon_test_t,
+					   tes_param[test->tes_paramlen]));
+	}
+
+	LASSERT (list_empty(&bat->bat_trans_list));
+
+	lstcon_batch_ndlinks_release(&bat->bat_cli_list);
+	lstcon_batch_ndlinks_release(&bat->bat_srv_list);
+
+	for (i = 0; i < LST_NODE_HASHSIZE; i++) {
+		LASSERT (list_empty(&bat->bat_cli_hash[i]));
+		LASSERT (list_empty(&bat->bat_srv_hash[i]));
+	}
+
+	LIBCFS_FREE(bat->bat_cli_hash,
+		    sizeof(struct list_head) * LST_NODE_HASHSIZE);
+	LIBCFS_FREE(bat->bat_srv_hash,
+		    sizeof(struct list_head) * LST_NODE_HASHSIZE);
+	LIBCFS_FREE(bat, sizeof(lstcon_batch_t));
+}
+
+/*
+ * Condition callback handed to lstcon_rpc_trans_ndlist() when a test is
+ * being added; @arg is the lstcon_test_t.
+ *
+ * Returns 0 for LST_TRANS_TSBSRVADD on a one-sided test, -ENETDOWN for
+ * a node that is not LST_NODE_ACTIVE, and -ENOMEM if no ndlink can be
+ * found or created in the batch's hash.  Otherwise the node's ndlink is
+ * queued on the batch's client or server list (if it is not on a list
+ * yet) and 1 is returned.
+ */
+int
+lstcon_testrpc_condition(int transop, lstcon_node_t *nd, void *arg)
+{
+	lstcon_test_t    *test = (lstcon_test_t *)arg;
+	lstcon_batch_t   *batch;
+	lstcon_ndlink_t  *link;
+	struct list_head *table;
+	struct list_head *queue;
+
+	LASSERT (test != NULL);
+
+	batch = test->tes_batch;
+	LASSERT (batch != NULL);
+
+	if (test->tes_oneside && transop == LST_TRANS_TSBSRVADD)
+		return 0;
+
+	if (nd->nd_state != LST_NODE_ACTIVE)
+		return -ENETDOWN;
+
+	if (transop != LST_TRANS_TSBCLIADD) {
+		LASSERT (transop == LST_TRANS_TSBSRVADD);
+
+		table = batch->bat_srv_hash;
+		queue = &batch->bat_srv_list;
+	} else {
+		table = batch->bat_cli_hash;
+		queue = &batch->bat_cli_list;
+	}
+
+	LASSERT (nd->nd_id.nid != LNET_NID_ANY);
+
+	if (lstcon_ndlink_find(table, nd->nd_id, &link, 1) != 0)
+		return -ENOMEM;
+
+	/* queue the link only if it is not on a list already */
+	if (list_empty(&link->ndl_link))
+		list_add_tail(&link->ndl_link, queue);
+
+	return 1;
+}
+
+/*
+ * Send the "add test" RPCs for @test in two passes: first
+ * LST_TRANS_TSBSRVADD over the destination group's nodes, then
+ * LST_TRANS_TSBCLIADD over the source group's nodes.
+ *
+ * Returns a non-zero error only when a transaction can't be created.
+ * If a pass finishes with an RPC or framework error recorded in
+ * lstcon_trans_stat(), the per-node results are copied to @result_up
+ * and the function returns early with rc still 0; callers have to look
+ * at lstcon_trans_stat() to notice that case.
+ */
+static int
+lstcon_test_nodes_add(lstcon_test_t *test, struct list_head *result_up)
+{
+	lstcon_rpc_trans_t     *trans;
+	lstcon_group_t	 *grp;
+	int		     transop;
+	int		     rc;
+
+	LASSERT (test->tes_src_grp != NULL);
+	LASSERT (test->tes_dst_grp != NULL);
+
+	/* first pass: requests to test servers */
+	transop = LST_TRANS_TSBSRVADD;
+	grp  = test->tes_dst_grp;
+again:
+	rc = lstcon_rpc_trans_ndlist(&grp->grp_ndl_list,
+				     &test->tes_trans_list, transop,
+				     test, lstcon_testrpc_condition, &trans);
+	if (rc != 0) {
+		CERROR("Can't create transaction: %d\n", rc);
+		return rc;
+	}
+
+	lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT);
+
+	if (lstcon_trans_stat()->trs_rpc_errno != 0 ||
+	    lstcon_trans_stat()->trs_fwk_errno != 0) {
+		lstcon_rpc_trans_interpreter(trans, result_up, NULL);
+
+		lstcon_rpc_trans_destroy(trans);
+		/* return if any error */
+		CDEBUG(D_NET, "Failed to add test %s, "
+			      "RPC error %d, framework error %d\n",
+		       transop == LST_TRANS_TSBCLIADD ? "client" : "server",
+		       lstcon_trans_stat()->trs_rpc_errno,
+		       lstcon_trans_stat()->trs_fwk_errno);
+
+		/* rc is 0 here: transaction creation itself succeeded */
+		return rc;
+	}
+
+	lstcon_rpc_trans_destroy(trans);
+
+	/* second pass done, both sides have been contacted */
+	if (transop == LST_TRANS_TSBCLIADD)
+		return rc;
+
+	/* second pass: switch to the source group and restart the index */
+	transop = LST_TRANS_TSBCLIADD;
+	grp = test->tes_src_grp;
+	test->tes_cliidx = 0;
+
+	/* requests to test clients */
+	goto again;
+}
+
+/*
+ * Add a test of type @type to the batch called @name, with @src_name as
+ * the source (client) group and @dst_name as the destination (server)
+ * group.
+ *
+ * @param/@paramlen carry the test-specific parameter blob, which is
+ * stored behind the test descriptor.  *retp is set to 1 when the
+ * destination group has grp_userland set.  Per-node RPC results are
+ * reported through @result_up.
+ *
+ * On success the test is linked into the batch and keeps the group
+ * references obtained from lstcon_group_find().  The test is linked
+ * even if some RPCs failed, so the user can inspect it.
+ *
+ * Returns 0 on success, the lstcon_batch_find()/lstcon_group_find()
+ * error for an unknown batch or group, -EINVAL if the batch is not
+ * idle, -ENOMEM if the descriptor can't be allocated, or the error of
+ * lstcon_test_nodes_add().
+ */
+int
+lstcon_test_add(char *name, int type, int loop, int concur,
+		int dist, int span, char *src_name, char * dst_name,
+		void *param, int paramlen, int *retp,
+		struct list_head *result_up)
+{
+	lstcon_group_t  *src_grp = NULL;
+	lstcon_group_t  *dst_grp = NULL;
+	lstcon_test_t   *test    = NULL;
+	lstcon_batch_t  *batch;
+	int	      rc;
+
+	rc = lstcon_batch_find(name, &batch);
+	if (rc != 0) {
+		CDEBUG(D_NET, "Can't find batch %s\n", name);
+		return rc;
+	}
+
+	if (batch->bat_state != LST_BATCH_IDLE) {
+		CDEBUG(D_NET, "Can't change running batch %s\n", name);
+		/* rc is 0 here; returning it would report a bogus success */
+		return -EINVAL;
+	}
+
+	rc = lstcon_group_find(src_name, &src_grp);
+	if (rc != 0) {
+		CDEBUG(D_NET, "Can't find group %s\n", src_name);
+		goto out;
+	}
+
+	rc = lstcon_group_find(dst_name, &dst_grp);
+	if (rc != 0) {
+		CDEBUG(D_NET, "Can't find group %s\n", dst_name);
+		goto out;
+	}
+
+	if (dst_grp->grp_userland)
+		*retp = 1;
+
+	/* descriptor and parameter blob share one allocation */
+	LIBCFS_ALLOC(test, offsetof(lstcon_test_t, tes_param[paramlen]));
+	if (!test) {
+		CERROR("Can't allocate test descriptor\n");
+		rc = -ENOMEM;
+
+		goto out;
+	}
+
+	memset(test, 0, offsetof(lstcon_test_t, tes_param[paramlen]));
+	test->tes_hdr.tsb_id    = batch->bat_hdr.tsb_id;
+	test->tes_batch	 = batch;
+	test->tes_type	  = type;
+	test->tes_oneside       = 0; /* TODO */
+	test->tes_loop	  = loop;
+	test->tes_concur	= concur;
+	test->tes_stop_onerr    = 1; /* TODO */
+	test->tes_span	  = span;
+	test->tes_dist	  = dist;
+	test->tes_cliidx	= 0; /* just used for creating RPC */
+	test->tes_src_grp       = src_grp;
+	test->tes_dst_grp       = dst_grp;
+	INIT_LIST_HEAD(&test->tes_trans_list);
+
+	/*
+	 * NOTE(review): with param == NULL and paramlen > 0, tes_paramlen
+	 * stays 0 while the allocation above used paramlen - confirm the
+	 * callers never pass that combination.
+	 */
+	if (param != NULL) {
+		test->tes_paramlen = paramlen;
+		memcpy(&test->tes_param[0], param, paramlen);
+	}
+
+	rc = lstcon_test_nodes_add(test, result_up);
+
+	if (rc != 0)
+		goto out;
+
+	if (lstcon_trans_stat()->trs_rpc_errno != 0 ||
+	    lstcon_trans_stat()->trs_fwk_errno != 0)
+		CDEBUG(D_NET, "Failed to add test %d to batch %s\n", type, name);
+
+	/* add to test list anyway, so user can check what's going on */
+	list_add_tail(&test->tes_link, &batch->bat_test_list);
+
+	batch->bat_ntest ++;
+	test->tes_hdr.tsb_index = batch->bat_ntest;
+
+	/* keep the group references so nobody can change the groups */
+	return rc;
+out:
+	if (test != NULL)
+		LIBCFS_FREE(test, offsetof(lstcon_test_t, tes_param[paramlen]));
+
+	if (dst_grp != NULL)
+		lstcon_group_put(dst_grp);
+
+	if (src_grp != NULL)
+		lstcon_group_put(src_grp);
+
+	return rc;
+}
+
+/*
+ * Find the test in @batch whose tes_hdr.tsb_index equals @idx.
+ *
+ * On a match *testpp is set and 0 is returned; otherwise -ENOENT is
+ * returned and *testpp is left untouched.
+ */
+int
+lstcon_test_find(lstcon_batch_t *batch, int idx, lstcon_test_t **testpp)
+{
+	lstcon_test_t *cur;
+
+	list_for_each_entry(cur, &batch->bat_test_list, tes_link) {
+		if (cur->tes_hdr.tsb_index != idx)
+			continue;
+
+		*testpp = cur;
+		return 0;
+	}
+
+	return -ENOENT;
+}
+
+/*
+ * Reply reader for batch query transactions: copies the bar_active
+ * field of the batch reply in @msg to rpe_priv[0] of the user entry
+ * @ent_up.
+ *
+ * Returns 0 on success or -EFAULT if the user copy fails.
+ */
+int
+lstcon_tsbrpc_readent(int transop, srpc_msg_t *msg,
+		      lstcon_rpc_ent_t *ent_up)
+{
+	srpc_batch_reply_t *reply = &msg->msg_body.bat_reply;
+	int                 failed;
+
+	LASSERT (transop == LST_TRANS_TSBCLIQRY ||
+		 transop == LST_TRANS_TSBSRVQRY);
+
+	/* positive errno, framework error code */
+	failed = copy_to_user(&ent_up->rpe_priv[0],
+			      &reply->bar_active,
+			      sizeof(reply->bar_active)) != 0;
+
+	return failed ? -EFAULT : 0;
+}
+
+/*
+ * Query the batch called @name (@testidx == 0) or one of its tests
+ * (@testidx != 0, matched against the test's tsb_index).
+ *
+ * The query is LST_TRANS_TSBCLIQRY when @client is non-zero and
+ * LST_TRANS_TSBSRVQRY otherwise; it runs over the batch's client list
+ * or the test's source group.  For a batch query, the batch is marked
+ * LST_BATCH_IDLE when the statistics show no RPC failure and no running
+ * test.
+ *
+ * Returns the lookup error for an unknown batch or test, the
+ * lstcon_rpc_trans_ndlist() error, or the value of
+ * lstcon_rpc_trans_interpreter().
+ */
+int
+lstcon_test_batch_query(char *name, int testidx, int client,
+			int timeout, struct list_head *result_up)
+{
+	lstcon_rpc_trans_t *query;
+	struct list_head   *translist;
+	struct list_head   *ndlist;
+	lstcon_tsb_hdr_t   *hdr;
+	lstcon_batch_t     *batch;
+	lstcon_test_t      *test = NULL;
+	int                 transop;
+	int                 rc;
+
+	rc = lstcon_batch_find(name, &batch);
+	if (rc != 0) {
+		CDEBUG(D_NET, "Can't find batch: %s\n", name);
+		return rc;
+	}
+
+	if (testidx != 0) {
+		/* only the given test is queried */
+		rc = lstcon_test_find(batch, testidx, &test);
+		if (rc != 0) {
+			CDEBUG(D_NET, "Can't find test: %d\n", testidx);
+			return rc;
+		}
+
+		translist = &test->tes_trans_list;
+		ndlist    = &test->tes_src_grp->grp_ndl_list;
+		hdr       = &test->tes_hdr;
+	} else {
+		translist = &batch->bat_trans_list;
+		ndlist    = &batch->bat_cli_list;
+		hdr       = &batch->bat_hdr;
+	}
+
+	if (client)
+		transop = LST_TRANS_TSBCLIQRY;
+	else
+		transop = LST_TRANS_TSBSRVQRY;
+
+	rc = lstcon_rpc_trans_ndlist(ndlist, translist, transop, hdr,
+				     lstcon_batrpc_condition, &query);
+	if (rc != 0) {
+		CERROR("Can't create transaction: %d\n", rc);
+		return rc;
+	}
+
+	lstcon_rpc_trans_postwait(query, timeout);
+
+	/* batch query with every RPC done and nothing active any more */
+	if (testidx == 0 &&
+	    lstcon_rpc_stat_failure(lstcon_trans_stat(), 0) == 0 &&
+	    lstcon_tsbqry_stat_run(lstcon_trans_stat(), 0) == 0)
+		batch->bat_state = LST_BATCH_IDLE;
+
+	rc = lstcon_rpc_trans_interpreter(query, result_up,
+					  lstcon_tsbrpc_readent);
+	lstcon_rpc_trans_destroy(query);
+
+	return rc;
+}
+
+/*
+ * Reply reader for statistics queries: copies the framework, RPC and
+ * LNet counters of the stat reply in @msg into the user entry's
+ * rpe_payload, packed back to back in that order.
+ *
+ * Nothing is copied when the reply's str_status is non-zero.  Returns 0
+ * on success (or when skipped) and -EFAULT if a user copy fails.
+ */
+int
+lstcon_statrpc_readent(int transop, srpc_msg_t *msg,
+		       lstcon_rpc_ent_t *ent_up)
+{
+	srpc_stat_reply_t *reply = &msg->msg_body.stat_reply;
+	sfw_counters_t    *fw_up;
+	srpc_counters_t   *rpc_up;
+	lnet_counters_t   *lnet_up;
+
+	if (reply->str_status != 0)
+		return 0;
+
+	/* each counter block starts where the previous one ends */
+	fw_up   = (sfw_counters_t *)&ent_up->rpe_payload[0];
+	rpc_up  = (srpc_counters_t *)(fw_up + 1);
+	lnet_up = (lnet_counters_t *)(rpc_up + 1);
+
+	if (copy_to_user(fw_up, &reply->str_fw, sizeof(*fw_up)))
+		return -EFAULT;
+
+	if (copy_to_user(rpc_up, &reply->str_rpc, sizeof(*rpc_up)))
+		return -EFAULT;
+
+	if (copy_to_user(lnet_up, &reply->str_lnet, sizeof(*lnet_up)))
+		return -EFAULT;
+
+	return 0;
+}
+
+/*
+ * Run an LST_TRANS_STATQRY transaction over the nodes on @ndlist and
+ * report the collected counters through @result_up.  The transaction is
+ * linked on a list head local to this function.
+ *
+ * Returns the lstcon_rpc_trans_ndlist() error if the transaction can't
+ * be created, otherwise the value of lstcon_rpc_trans_interpreter().
+ */
+int
+lstcon_ndlist_stat(struct list_head *ndlist,
+		   int timeout, struct list_head *result_up)
+{
+	struct list_head    translist;
+	lstcon_rpc_trans_t *query;
+	int                 status;
+
+	INIT_LIST_HEAD(&translist);
+
+	status = lstcon_rpc_trans_ndlist(ndlist, &translist,
+					 LST_TRANS_STATQRY, NULL, NULL,
+					 &query);
+	if (status != 0) {
+		CERROR("Can't create transaction: %d\n", status);
+		return status;
+	}
+
+	lstcon_rpc_trans_postwait(query, LST_VALIDATE_TIMEOUT(timeout));
+
+	status = lstcon_rpc_trans_interpreter(query, result_up,
+					      lstcon_statrpc_readent);
+	lstcon_rpc_trans_destroy(query);
+
+	return status;
+}
+
+/*
+ * Collect statistics from every node of the group called @grp_name.
+ *
+ * Returns the lstcon_group_find() error for an unknown group, otherwise
+ * the lstcon_ndlist_stat() result.
+ */
+int
+lstcon_group_stat(char *grp_name, int timeout, struct list_head *result_up)
+{
+	lstcon_group_t *grp;
+	int             status;
+
+	status = lstcon_group_find(grp_name, &grp);
+	if (status != 0) {
+		CDEBUG(D_NET, "Can't find group %s\n", grp_name);
+		return status;
+	}
+
+	status = lstcon_ndlist_stat(&grp->grp_ndl_list, timeout, result_up);
+	lstcon_group_put(grp);
+
+	return status;
+}
+
+/*
+ * Collect statistics from @count nodes whose ids are read from the user
+ * array @ids_up.
+ *
+ * The ids are gathered in a scratch group (lstcon_group_ndlink_find()
+ * is called with 2 as its last argument), the statistics query is run
+ * over that group, and the group is released again.
+ *
+ * Returns -ENOMEM if the scratch group can't be allocated, -EFAULT on a
+ * failed user copy, the lstcon_group_ndlink_find() error, or the
+ * lstcon_ndlist_stat() result.
+ */
+int
+lstcon_nodes_stat(int count, lnet_process_id_t *ids_up,
+		  int timeout, struct list_head *result_up)
+{
+	lstcon_ndlink_t   *link;
+	lstcon_group_t    *scratch;
+	lnet_process_id_t  pid;
+	int                idx;
+	int                rc;
+
+	rc = lstcon_group_alloc(NULL, &scratch);
+	if (rc != 0) {
+		CERROR("Out of memory\n");
+		return -ENOMEM;
+	}
+
+	for (idx = 0; idx < count; idx++) {
+		if (copy_from_user(&pid, &ids_up[idx], sizeof(pid))) {
+			rc = -EFAULT;
+			goto out_put;
+		}
+
+		/* collect the node in the scratch group */
+		rc = lstcon_group_ndlink_find(scratch, pid, &link, 2);
+		if (rc != 0) {
+			CDEBUG((rc == -ENOMEM) ? D_ERROR : D_NET,
+			       "Failed to find or create %s: %d\n",
+			       libcfs_id2str(pid), rc);
+			goto out_put;
+		}
+	}
+
+	rc = lstcon_ndlist_stat(&scratch->grp_ndl_list, timeout, result_up);
+
+out_put:
+	lstcon_group_put(scratch);
+
+	return rc;
+}
+
+/*
+ * Run an LST_TRANS_SESQRY transaction over the nodes on @ndlist and
+ * report the replies through @result_up, using lstcon_sesrpc_readent()
+ * as the reply reader.  @translist is passed to
+ * lstcon_rpc_trans_ndlist() unchanged.
+ *
+ * Returns the lstcon_rpc_trans_ndlist() error if the transaction can't
+ * be created, otherwise the value of lstcon_rpc_trans_interpreter().
+ */
+int
+lstcon_debug_ndlist(struct list_head *ndlist,
+		    struct list_head *translist,
+		    int timeout, struct list_head *result_up)
+{
+	lstcon_rpc_trans_t *query;
+	int                 status;
+
+	status = lstcon_rpc_trans_ndlist(ndlist, translist,
+					 LST_TRANS_SESQRY, NULL,
+					 lstcon_sesrpc_condition, &query);
+	if (status != 0) {
+		CERROR("Can't create transaction: %d\n", status);
+		return status;
+	}
+
+	lstcon_rpc_trans_postwait(query, LST_VALIDATE_TIMEOUT(timeout));
+
+	status = lstcon_rpc_trans_interpreter(query, result_up,
+					      lstcon_sesrpc_readent);
+	lstcon_rpc_trans_destroy(query);
+
+	return status;
+}
+
+/*
+ * Run the debug query over every node on the session's node list and
+ * return the lstcon_debug_ndlist() result.
+ */
+int
+lstcon_session_debug(int timeout, struct list_head *result_up)
+{
+	struct list_head *session_nodes = &console_session.ses_ndl_list;
+
+	return lstcon_debug_ndlist(session_nodes, NULL, timeout, result_up);
+}
+
+/*
+ * Run the debug query over the client list (@client != 0) or the server
+ * list of the batch called @name.
+ *
+ * Returns -ENOENT for an unknown batch, otherwise the
+ * lstcon_debug_ndlist() result.
+ */
+int
+lstcon_batch_debug(int timeout, char *name,
+		   int client, struct list_head *result_up)
+{
+	struct list_head *ndlist;
+	lstcon_batch_t   *bat;
+
+	if (lstcon_batch_find(name, &bat) != 0)
+		return -ENOENT;
+
+	if (client)
+		ndlist = &bat->bat_cli_list;
+	else
+		ndlist = &bat->bat_srv_list;
+
+	return lstcon_debug_ndlist(ndlist, NULL, timeout, result_up);
+}
+
+/*
+ * Run the debug query over every node of the group called @name.
+ *
+ * Returns -ENOENT for an unknown group, otherwise the
+ * lstcon_debug_ndlist() result.
+ */
+int
+lstcon_group_debug(int timeout, char *name,
+		   struct list_head *result_up)
+{
+	lstcon_group_t *grp;
+	int             status;
+
+	if (lstcon_group_find(name, &grp) != 0)
+		return -ENOENT;
+
+	status = lstcon_debug_ndlist(&grp->grp_ndl_list, NULL,
+				     timeout, result_up);
+	lstcon_group_put(grp);
+
+	return status;
+}
+
+/*
+ * Run the debug query over @count nodes whose ids are read from the
+ * user array @ids_up.
+ *
+ * The ids are gathered in a scratch group, the query is run over that
+ * group, and the group is released again.
+ *
+ * Returns the lstcon_group_alloc() error, -EFAULT on a failed user
+ * copy, the lstcon_group_ndlink_find() error, or the
+ * lstcon_debug_ndlist() result.
+ */
+int
+lstcon_nodes_debug(int timeout,
+		   int count, lnet_process_id_t *ids_up,
+		   struct list_head *result_up)
+{
+	lnet_process_id_t  pid;
+	lstcon_ndlink_t   *link;
+	lstcon_group_t    *scratch;
+	int                idx;
+	int                rc;
+
+	rc = lstcon_group_alloc(NULL, &scratch);
+	if (rc != 0) {
+		CDEBUG(D_NET, "Out of memory\n");
+		return rc;
+	}
+
+	for (idx = 0; idx < count; idx++) {
+		if (copy_from_user(&pid, &ids_up[idx], sizeof(pid))) {
+			rc = -EFAULT;
+			goto out_put;
+		}
+
+		/* collect the node in the scratch group */
+		rc = lstcon_group_ndlink_find(scratch, pid, &link, 1);
+		if (rc != 0) {
+			CERROR("Can't create node link\n");
+			goto out_put;
+		}
+	}
+
+	rc = lstcon_debug_ndlist(&scratch->grp_ndl_list, NULL,
+				 timeout, result_up);
+
+out_put:
+	lstcon_group_put(scratch);
+
+	return rc;
+}
+
+/*
+ * Return 1 if @sid has the same ses_nid and ses_stamp as the console
+ * session's id, 0 otherwise.
+ */
+int
+lstcon_session_match(lst_sid_t sid)
+{
+	if (console_session.ses_id.ses_nid != sid.ses_nid)
+		return 0;
+
+	if (console_session.ses_id.ses_stamp != sid.ses_stamp)
+		return 0;
+
+	return 1;
+}
+
+/*
+ * Fill @sid with a new session id: the nid obtained from
+ * LNetGetId(1, ...) plus the current time as the stamp.
+ *
+ * Must only be called while no session exists (asserted).
+ */
+static void
+lstcon_new_session_id(lst_sid_t *sid)
+{
+	lnet_process_id_t local;
+
+	LASSERT (console_session.ses_state == LST_SESSION_NONE);
+
+	LNetGetId(1, &local);
+
+	sid->ses_nid   = local.nid;
+	sid->ses_stamp = cfs_time_current();
+}
+
+extern srpc_service_t lstcon_acceptor_service;
+
+/*
+ * Create a new console session called @name.
+ *
+ * If a session already exists it is ended first when @force is set;
+ * without @force the call fails with -EEXIST.  @feats must not contain
+ * bits outside LST_FEATS_MASK (-EINVAL).  A @timeout <= 0 selects
+ * LST_CONSOLE_TIMEOUT.  The default batch is created and the RPC pinger
+ * started, then the new session id is copied to the user buffer
+ * @sid_up; if that copy fails the session is ended again and -EFAULT is
+ * returned.
+ *
+ * NOTE(review): when lstcon_batch_add() or lstcon_rpc_pinger_start()
+ * fails, the function returns with ses_state already set to
+ * LST_SESSION_ACTIVE - confirm that callers cope with that.
+ */
+int
+lstcon_session_new(char *name, int key, unsigned feats,
+		   int timeout, int force, lst_sid_t *sid_up)
+{
+	int     rc = 0;
+	int     i;
+
+	if (console_session.ses_state != LST_SESSION_NONE) {
+		/* session exists */
+		if (!force) {
+			CNETERR("Session %s already exists\n",
+				console_session.ses_name);
+			return -EEXIST;
+		}
+
+		rc = lstcon_session_end();
+
+		/* lstcon_session_end() only return local error */
+		if  (rc != 0)
+			return rc;
+	}
+
+	if ((feats & ~LST_FEATS_MASK) != 0) {
+		CNETERR("Unknown session features %x\n",
+			(feats & ~LST_FEATS_MASK));
+		return -EINVAL;
+	}
+
+	/* no node may be left over from a previous session */
+	for (i = 0; i < LST_GLOBAL_HASHSIZE; i++)
+		LASSERT(list_empty(&console_session.ses_ndl_hash[i]));
+
+	lstcon_new_session_id(&console_session.ses_id);
+
+	console_session.ses_key	    = key;
+	console_session.ses_state   = LST_SESSION_ACTIVE;
+	console_session.ses_force   = !!force;
+	console_session.ses_features = feats;
+	console_session.ses_feats_updated = 0;
+	console_session.ses_timeout = (timeout <= 0) ?
+				      LST_CONSOLE_TIMEOUT : timeout;
+	/*
+	 * NOTE(review): unbounded copy; relies on the caller limiting the
+	 * length of name to what ses_name can hold - confirm.
+	 */
+	strcpy(console_session.ses_name, name);
+
+	rc = lstcon_batch_add(LST_DEFAULT_BATCH);
+	if (rc != 0)
+		return rc;
+
+	rc = lstcon_rpc_pinger_start();
+	if (rc != 0) {
+		lstcon_batch_t *bat = NULL;
+
+		/* undo the default batch created just above */
+		lstcon_batch_find(LST_DEFAULT_BATCH, &bat);
+		lstcon_batch_destroy(bat);
+
+		return rc;
+	}
+
+	if (copy_to_user(sid_up, &console_session.ses_id,
+			     sizeof(lst_sid_t)) == 0)
+		return rc;
+
+	/* user never learned the session id, so tear the session down */
+	lstcon_session_end();
+
+	return -EFAULT;
+}
+
+/*
+ * Copy information about the active session to user space: the session
+ * id (@sid_up), key (@key_up), features (@featp), per-state node
+ * counters (@ndinfo_up) and @len bytes of the session name (@name_up).
+ *
+ * Returns -ESRCH if no session is active, -ENOMEM if the temporary
+ * counter block can't be allocated, -EFAULT if any user copy fails
+ * (later copies are then skipped), 0 on success.
+ */
+int
+lstcon_session_info(lst_sid_t *sid_up, int *key_up, unsigned *featp,
+		    lstcon_ndlist_ent_t *ndinfo_up, char *name_up, int len)
+{
+	lstcon_ndlist_ent_t *entp;
+	lstcon_ndlink_t     *ndl;
+	int                  rc;
+
+	if (console_session.ses_state != LST_SESSION_ACTIVE)
+		return -ESRCH;
+
+	LIBCFS_ALLOC(entp, sizeof(*entp));
+	if (entp == NULL)
+		return -ENOMEM;
+
+	memset(entp, 0, sizeof(*entp));
+
+	list_for_each_entry(ndl, &console_session.ses_ndl_list, ndl_link) {
+		LST_NODE_STATE_COUNTER(ndl->ndl_node, entp);
+	}
+
+	/* assume failure until every copy below has gone through */
+	rc = -EFAULT;
+
+	if (copy_to_user(sid_up, &console_session.ses_id,
+			 sizeof(lst_sid_t)))
+		goto out_free;
+
+	if (copy_to_user(key_up, &console_session.ses_key,
+			 sizeof(*key_up)))
+		goto out_free;
+
+	if (copy_to_user(featp, &console_session.ses_features,
+			 sizeof(*featp)))
+		goto out_free;
+
+	if (copy_to_user(ndinfo_up, entp, sizeof(*entp)))
+		goto out_free;
+
+	if (copy_to_user(name_up, console_session.ses_name, len))
+		goto out_free;
+
+	rc = 0;
+
+out_free:
+	LIBCFS_FREE(entp, sizeof(*entp));
+
+	return rc;
+}
+
+/*
+ * End the active console session.
+ *
+ * An LST_TRANS_SESEND transaction is run over every node of the
+ * session, the pinger is stopped, and after lstcon_rpc_cleanup_wait()
+ * the session fields are reset and all batches and groups are
+ * destroyed.
+ *
+ * Returns non-zero only if the transaction can't be created, in which
+ * case the session is left untouched; RPC results are not inspected.
+ * Must only be called while a session is active (asserted).
+ */
+int
+lstcon_session_end()
+{
+	lstcon_rpc_trans_t *trans;
+	lstcon_group_t     *grp;
+	lstcon_batch_t     *bat;
+	int		 rc = 0;
+
+	LASSERT (console_session.ses_state == LST_SESSION_ACTIVE);
+
+	rc = lstcon_rpc_trans_ndlist(&console_session.ses_ndl_list,
+				     NULL, LST_TRANS_SESEND, NULL,
+				     lstcon_sesrpc_condition, &trans);
+	if (rc != 0) {
+		CERROR("Can't create transaction: %d\n", rc);
+		return rc;
+	}
+
+	/* flag stays set until the teardown below has finished */
+	console_session.ses_shutdown = 1;
+
+	lstcon_rpc_pinger_stop();
+
+	lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT);
+
+	lstcon_rpc_trans_destroy(trans);
+	/* the user can do nothing even if an RPC failed, so go on */
+
+	/* waiting for orphan rpcs to die */
+	lstcon_rpc_cleanup_wait();
+
+	console_session.ses_id    = LST_INVALID_SID;
+	console_session.ses_state = LST_SESSION_NONE;
+	console_session.ses_key   = 0;
+	console_session.ses_force = 0;
+	console_session.ses_feats_updated = 0;
+
+	/* destroy all batches */
+	while (!list_empty(&console_session.ses_bat_list)) {
+		bat = list_entry(console_session.ses_bat_list.next,
+				     lstcon_batch_t, bat_link);
+
+		lstcon_batch_destroy(bat);
+	}
+
+	/* destroy all groups */
+	while (!list_empty(&console_session.ses_grp_list)) {
+		grp = list_entry(console_session.ses_grp_list.next,
+				     lstcon_group_t, grp_link);
+		/* with the batches gone, only one reference may remain */
+		LASSERT (grp->grp_ref == 1);
+
+		lstcon_group_put(grp);
+	}
+
+	/* all nodes should be released */
+	LASSERT (list_empty(&console_session.ses_ndl_list));
+
+	console_session.ses_shutdown = 0;
+	console_session.ses_expired  = 0;
+
+	return rc;
+}
+
+/*
+ * Check the feature bits @feats reported by a remote node against the
+ * session.
+ *
+ * Bits outside LST_FEATS_MASK are rejected.  Under ses_rpc_lock, the
+ * first call after ses_feats_updated was cleared stores @feats as the
+ * session features; any call then fails if @feats differs from the
+ * stored session features.
+ *
+ * Returns 0 if the features are acceptable, -EPROTO otherwise.
+ */
+int
+lstcon_session_feats_check(unsigned feats)
+{
+	int status = 0;
+
+	if ((feats & ~LST_FEATS_MASK) != 0) {
+		CERROR("Can't support these features: %x\n",
+		       (feats & ~LST_FEATS_MASK));
+		return -EPROTO;
+	}
+
+	spin_lock(&console_session.ses_rpc_lock);
+
+	/* first reporter decides the session features */
+	if (!console_session.ses_feats_updated) {
+		console_session.ses_feats_updated = 1;
+		console_session.ses_features = feats;
+	}
+
+	if (console_session.ses_features != feats)
+		status = -EPROTO;
+
+	spin_unlock(&console_session.ses_rpc_lock);
+
+	if (status == 0)
+		return 0;
+
+	CERROR("remote features %x do not match with "
+	       "session features %x of console\n",
+	       feats, console_session.ses_features);
+
+	return status;
+}
+
+/*
+ * Handler of the "join session" service: a remote node asks to join the
+ * console session as a member of the group named in the request.
+ *
+ * Runs with ses_mutex held.  The outcome is reported to the peer in
+ * jrep->join_status as a positive errno value (ESRCH: no session,
+ * EPROTO: feature mismatch, EBUSY: wrong session or group in use,
+ * EEXIST: node already in the group, 0: joined).  The group is created
+ * on demand.  On success the peer's node is marked LST_NODE_ACTIVE and
+ * the group is flagged grp_userland.
+ *
+ * The function's own return value is non-zero only for local failures
+ * of lstcon_group_alloc() / lstcon_group_ndlink_find().
+ */
+static int
+lstcon_acceptor_handle (srpc_server_rpc_t *rpc)
+{
+	srpc_msg_t	*rep  = &rpc->srpc_replymsg;
+	srpc_msg_t	*req  = &rpc->srpc_reqstbuf->buf_msg;
+	srpc_join_reqst_t *jreq = &req->msg_body.join_reqst;
+	srpc_join_reply_t *jrep = &rep->msg_body.join_reply;
+	lstcon_group_t    *grp  = NULL;
+	lstcon_ndlink_t   *ndl;
+	int		rc   = 0;
+
+	sfw_unpack_message(req);
+
+	mutex_lock(&console_session.ses_mutex);
+
+	jrep->join_sid = console_session.ses_id;
+
+	/* no session to join */
+	if (console_session.ses_id.ses_nid == LNET_NID_ANY) {
+		jrep->join_status = ESRCH;
+		goto out;
+	}
+
+	if (lstcon_session_feats_check(req->msg_ses_feats) != 0) {
+		jrep->join_status = EPROTO;
+		goto out;
+	}
+
+	/* the peer named a specific session and it is not this one */
+	if (jreq->join_sid.ses_nid != LNET_NID_ANY &&
+	     !lstcon_session_match(jreq->join_sid)) {
+		jrep->join_status = EBUSY;
+		goto out;
+	}
+
+	if (lstcon_group_find(jreq->join_group, &grp) != 0) {
+		/* unknown group: create it and link it into the session */
+		rc = lstcon_group_alloc(jreq->join_group, &grp);
+		if (rc != 0) {
+			CERROR("Out of memory\n");
+			goto out;
+		}
+
+		list_add_tail(&grp->grp_link,
+				  &console_session.ses_grp_list);
+		/* extra reference, dropped again at "out" below */
+		lstcon_group_addref(grp);
+	}
+
+	if (grp->grp_ref > 2) {
+		/* group is in use */
+		jrep->join_status = EBUSY;
+		goto out;
+	}
+
+	/* lookup only (last argument 0): is the peer a member already? */
+	rc = lstcon_group_ndlink_find(grp, rpc->srpc_peer, &ndl, 0);
+	if (rc == 0) {
+		jrep->join_status = EEXIST;
+		goto out;
+	}
+
+	/* same lookup with last argument 1, which adds the peer */
+	rc = lstcon_group_ndlink_find(grp, rpc->srpc_peer, &ndl, 1);
+	if (rc != 0) {
+		CERROR("Out of memory\n");
+		goto out;
+	}
+
+	ndl->ndl_node->nd_state   = LST_NODE_ACTIVE;
+	ndl->ndl_node->nd_timeout = console_session.ses_timeout;
+
+	if (grp->grp_userland == 0)
+		grp->grp_userland = 1;
+
+	strcpy(jrep->join_session, console_session.ses_name);
+	jrep->join_timeout = console_session.ses_timeout;
+	jrep->join_status  = 0;
+
+out:
+	/* the reply always carries the console's session features */
+	rep->msg_ses_feats = console_session.ses_features;
+	if (grp != NULL)
+		lstcon_group_put(grp);
+
+	mutex_unlock(&console_session.ses_mutex);
+
+	return rc;
+}
+
+/* service descriptor of the console's "join session" acceptor */
+srpc_service_t lstcon_acceptor_service;
+
+/*
+ * Fill in the acceptor service descriptor (id, name, handler and work-item
+ * total); the service itself is added by the caller.
+ */
+void lstcon_init_acceptor_service(void)
+{
+	srpc_service_t *svc = &lstcon_acceptor_service;
+
+	svc->sv_id       = SRPC_SERVICE_JOIN;
+	svc->sv_name     = "join session";
+	svc->sv_wi_total = SFW_FRWK_WI_MAX;
+	svc->sv_handler  = lstcon_acceptor_handle;
+}
+
+extern int lstcon_ioctl_entry(unsigned int cmd, struct libcfs_ioctl_data *data);
+/* lstcon_ioctl_handler is (de)registered by lstcon_console_init()/_fini() */
+DECLARE_IOCTL_HANDLER(lstcon_ioctl_handler, lstcon_ioctl_entry);
+
+/*
+ * Initialize the console: reset console_session, allocate the global node
+ * hash, add the "join session" acceptor service with its buffers and
+ * register the ioctl handler.
+ *
+ * Returns 0 on success.  On failure everything set up so far is undone and
+ * a negative errno is returned (-ENOMEM for the hash or the service
+ * buffers, otherwise the error of the failing call).
+ */
+int
+lstcon_console_init(void)
+{
+	int     rc;
+	int     i;
+
+	memset(&console_session, 0, sizeof(console_session));
+
+	console_session.ses_id		    = LST_INVALID_SID;
+	console_session.ses_state	    = LST_SESSION_NONE;
+	console_session.ses_timeout	    = 0;
+	console_session.ses_force	    = 0;
+	console_session.ses_expired	    = 0;
+	console_session.ses_feats_updated   = 0;
+	console_session.ses_features	    = LST_FEATS_MASK;
+	console_session.ses_laststamp	    = cfs_time_current_sec();
+
+	mutex_init(&console_session.ses_mutex);
+
+	INIT_LIST_HEAD(&console_session.ses_trans_list);
+	INIT_LIST_HEAD(&console_session.ses_bat_list);
+	INIT_LIST_HEAD(&console_session.ses_grp_list);
+	INIT_LIST_HEAD(&console_session.ses_ndl_list);
+
+	LIBCFS_ALLOC(console_session.ses_ndl_hash,
+		     sizeof(struct list_head) * LST_GLOBAL_HASHSIZE);
+	if (console_session.ses_ndl_hash == NULL)
+		return -ENOMEM;
+
+	for (i = 0; i < LST_GLOBAL_HASHSIZE; i++) {
+		INIT_LIST_HEAD(&console_session.ses_ndl_hash[i]);
+	}
+
+	/* initialize acceptor service table */
+	lstcon_init_acceptor_service();
+
+	rc = srpc_add_service(&lstcon_acceptor_service);
+	LASSERT (rc != -EBUSY);
+	if (rc != 0)
+		goto fail_hash;
+
+	rc = srpc_service_add_buffers(&lstcon_acceptor_service,
+				      lstcon_acceptor_service.sv_wi_total);
+	if (rc != 0) {
+		rc = -ENOMEM;
+		goto fail_service;
+	}
+
+	rc = libcfs_register_ioctl(&lstcon_ioctl_handler);
+	if (rc != 0)
+		goto fail_service;
+
+	lstcon_rpc_module_init();
+	return 0;
+
+fail_service:
+	/* the service was added: stop it, release the hash, then wait */
+	srpc_shutdown_service(&lstcon_acceptor_service);
+	srpc_remove_service(&lstcon_acceptor_service);
+
+	LIBCFS_FREE(console_session.ses_ndl_hash,
+		    sizeof(struct list_head) * LST_GLOBAL_HASHSIZE);
+
+	srpc_wait_service_shutdown(&lstcon_acceptor_service);
+
+	return rc;
+
+fail_hash:
+	/* the service was never added: only the hash needs releasing */
+	LIBCFS_FREE(console_session.ses_ndl_hash,
+		    sizeof(struct list_head) * LST_GLOBAL_HASHSIZE);
+	return rc;
+}
+
+/*
+ * Tear down the console: deregister the ioctl handler, stop and remove the
+ * acceptor service, end a still-active session, shut down the console RPC
+ * module and release the global node hash.
+ *
+ * Always returns 0.
+ */
+int
+lstcon_console_fini(void)
+{
+	int     i;
+
+	libcfs_deregister_ioctl(&lstcon_ioctl_handler);
+
+	mutex_lock(&console_session.ses_mutex);
+
+	srpc_shutdown_service(&lstcon_acceptor_service);
+	srpc_remove_service(&lstcon_acceptor_service);
+
+	if (console_session.ses_state != LST_SESSION_NONE)
+		lstcon_session_end();
+
+	lstcon_rpc_module_fini();
+
+	mutex_unlock(&console_session.ses_mutex);
+
+	/* nothing may be left behind once the session has ended */
+	LASSERT (list_empty(&console_session.ses_ndl_list));
+	LASSERT (list_empty(&console_session.ses_grp_list));
+	LASSERT (list_empty(&console_session.ses_bat_list));
+	LASSERT (list_empty(&console_session.ses_trans_list));
+
+	/* ses_ndl_hash holds LST_GLOBAL_HASHSIZE buckets (see
+	 * lstcon_console_init()), so check every bucket that is freed */
+	for (i = 0; i < LST_GLOBAL_HASHSIZE; i++) {
+		LASSERT (list_empty(&console_session.ses_ndl_hash[i]));
+	}
+
+	LIBCFS_FREE(console_session.ses_ndl_hash,
+		    sizeof(struct list_head) * LST_GLOBAL_HASHSIZE);
+
+	srpc_wait_service_shutdown(&lstcon_acceptor_service);
+
+	return 0;
+}

diff --git a/drivers/staging/lustre/lnet/selftest/console.h b/drivers/staging/lustre/lnet/selftest/console.h
new file mode 100644
index 0000000..e61b266
--- /dev/null
+++ b/drivers/staging/lustre/lnet/selftest/console.h

@@ -0,0 +1,232 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/console.h
+ *
+ * kernel structure for LST console
+ *
+ * Author: Liang Zhen <liangzhen@clusterfs.com>
+ */
+
+#ifndef __LST_CONSOLE_H__
+#define __LST_CONSOLE_H__
+
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/lnet.h>
+#include <linux/lnet/lib-types.h>
+#include <linux/lnet/lnetst.h>
+#include "selftest.h"
+#include "conrpc.h"
+
+typedef struct lstcon_node {
+	lnet_process_id_t    nd_id;	  /* id of the node */
+	int		  nd_ref;	 /* reference count */
+	int		  nd_state;       /* state of the node, e.g. LST_NODE_ACTIVE */
+	int		  nd_timeout;     /* session timeout (set from ses_timeout on join) */
+	cfs_time_t	   nd_stamp;       /* timestamp of last replied RPC */
+	struct lstcon_rpc    nd_ping;	/* ping RPC embedded in the node */
+} lstcon_node_t;				/*** node descriptor */
+
+typedef struct {
+	struct list_head	   ndl_link;       /* chain on a list of node links */
+	struct list_head	   ndl_hlink;      /* chain on a hash bucket */
+	lstcon_node_t       *ndl_node;       /* pointer to the linked node */
+} lstcon_ndlink_t;			      /*** node link descriptor */
+
+typedef struct {
+	struct list_head	   grp_link;       /* chain on global group list (ses_grp_list) */
+	int		  grp_ref;	/* reference count */
+	int		  grp_userland;   /* has userland nodes (set when a node joins via the acceptor) */
+	int		  grp_nnode;      /* # of nodes */
+	char		 grp_name[LST_NAME_SIZE]; /* group name */
+
+	struct list_head	   grp_trans_list; /* transaction list */
+	struct list_head	   grp_ndl_list;   /* nodes list */
+	struct list_head	   grp_ndl_hash[0];/* hash table for nodes (trailing array, sized at allocation) */
+} lstcon_group_t;		    /*** (alias of nodes) group descriptor */
+
+#define LST_BATCH_IDLE	  0xB0	    /* idle batch (presumably a bat_state value - confirm) */
+#define LST_BATCH_RUNNING       0xB1	    /* running batch (presumably a bat_state value - confirm) */
+
+typedef struct lstcon_tsb_hdr {
+	lst_bid_t	       tsb_id;	 /* batch ID */
+	int		     tsb_index;      /* test index */
+} lstcon_tsb_hdr_t;	/*** header embedded in batches (bat_hdr) and tests (tes_hdr) */
+
+typedef struct {
+	lstcon_tsb_hdr_t	bat_hdr;	/* test_batch header */
+	struct list_head	      bat_link;       /* chain on session's batches list */
+	int		     bat_ntest;      /* # of tests */
+	int		     bat_state;      /* state of the batch */
+	int		     bat_arg;	/* parameter for run|stop: timeout for run, force for stop */
+	char		    bat_name[LST_NAME_SIZE]; /* name of batch */
+
+	struct list_head	      bat_test_list;  /* list head of tests (lstcon_test_t) */
+	struct list_head	      bat_trans_list; /* list head of transactions */
+	struct list_head	      bat_cli_list;   /* list head of client nodes (lstcon_node_t) */
+	struct list_head	     *bat_cli_hash;   /* hash table of client nodes */
+	struct list_head	      bat_srv_list;   /* list head of server nodes */
+	struct list_head	     *bat_srv_hash;   /* hash table of server nodes */
+} lstcon_batch_t;			     /*** (tests) batch descriptor */
+
+typedef struct lstcon_test {
+	lstcon_tsb_hdr_t      tes_hdr;	/* test batch header */
+	struct list_head	    tes_link;       /* chain on batch's tests list */
+	lstcon_batch_t       *tes_batch;      /* pointer to the owning batch */
+
+	int		   tes_type;       /* type of the test, e.g. bulk, ping */
+	int		   tes_stop_onerr; /* stop on error */
+	int		   tes_oneside;    /* one-sided test */
+	int		   tes_concur;     /* concurrency */
+	int		   tes_loop;       /* loop count */
+	int		   tes_dist;       /* nodes distribution of target group */
+	int		   tes_span;       /* nodes span of target group */
+	int		   tes_cliidx;     /* client index, used when creating RPCs */
+
+	struct list_head  tes_trans_list; /* transaction list */
+	lstcon_group_t       *tes_src_grp;    /* group that runs the test */
+	lstcon_group_t       *tes_dst_grp;    /* target group */
+
+	int		   tes_paramlen;   /* test parameter length */
+	char		  tes_param[0];   /* test parameter (trailing array of tes_paramlen bytes, presumably) */
+} lstcon_test_t;				/*** a single test descriptor */
+
+#define LST_GLOBAL_HASHSIZE     503	     /* global nodes hash table size (buckets of ses_ndl_hash) */
+#define LST_NODE_HASHSIZE       239	     /* node hash table (for batch or group); modulus of lstcon_id2hash() */
+
+#define LST_SESSION_NONE	0x0	     /* no session */
+#define LST_SESSION_ACTIVE      0x1	     /* working session */
+
+#define LST_CONSOLE_TIMEOUT     300	     /* default console timeout (presumably seconds - confirm) */
+
+typedef struct {
+	struct mutex		ses_mutex;      /* only 1 thread in session */
+	lst_sid_t	       ses_id;	 /* global session id */
+	int		     ses_key;	/* local session key */
+	int		     ses_state;      /* state of session (LST_SESSION_*) */
+	int		     ses_timeout;    /* timeout in seconds */
+	time_t		  ses_laststamp;  /* last operation stamp (seconds) */
+	/** test features of the session */
+	unsigned		ses_features;
+	/** features are synced with remote test nodes */
+	unsigned		ses_feats_updated:1;
+	/** force creating */
+	unsigned		ses_force:1;
+	/** session is shutting down */
+	unsigned		ses_shutdown:1;
+	/** console has timed out */
+	unsigned		ses_expired:1;
+	__u64		   ses_id_cookie;  /* batch id cookie */
+	char		    ses_name[LST_NAME_SIZE];  /* session name */
+	lstcon_rpc_trans_t     *ses_ping;       /* session pinger */
+	stt_timer_t	     ses_ping_timer; /* timer for pinger */
+	lstcon_trans_stat_t     ses_trans_stat; /* transaction stats */
+
+	struct list_head	      ses_trans_list; /* global list of transactions */
+	struct list_head	      ses_grp_list;   /* global list of groups */
+	struct list_head	      ses_bat_list;   /* global list of batches */
+	struct list_head	      ses_ndl_list;   /* global list of nodes */
+	struct list_head	     *ses_ndl_hash;   /* hash table of nodes (LST_GLOBAL_HASHSIZE buckets) */
+
+	spinlock_t	  ses_rpc_lock;   /* serialize; also guards ses_features updates */
+	atomic_t	    ses_rpc_counter;/* # of initialized RPCs */
+	struct list_head	      ses_rpc_freelist; /* idle console rpc */
+} lstcon_session_t;			     /*** session descriptor */
+
+extern lstcon_session_t	 console_session;
+
+/* Accessor for the transaction statistics kept in the console session. */
+static inline lstcon_trans_stat_t *
+lstcon_trans_stat(void)
+{
+	lstcon_session_t *ses = &console_session;
+
+	return &ses->ses_trans_stat;
+}
+
+/*
+ * Map a process id to its bucket in 'hash': the bucket index is the
+ * address part of the NID modulo LST_NODE_HASHSIZE.
+ */
+static inline struct list_head *
+lstcon_id2hash (lnet_process_id_t id, struct list_head *hash)
+{
+	unsigned int bucket;
+
+	bucket = LNET_NIDADDR(id.nid) % LST_NODE_HASHSIZE;
+	return hash + bucket;
+}
+
+extern int lstcon_session_match(lst_sid_t sid);
+extern int lstcon_session_new(char *name, int key, unsigned version,
+			      int timeout, int flags, lst_sid_t *sid_up);
+extern int lstcon_session_info(lst_sid_t *sid_up, int *key, unsigned *verp,
+			       lstcon_ndlist_ent_t *entp, char *name_up, int len);
+extern int lstcon_session_end(void);	/* also called by lstcon_console_fini() for an active session */
+extern int lstcon_session_debug(int timeout, struct list_head *result_up);
+extern int lstcon_session_feats_check(unsigned feats);	/* 0 if accepted, -EPROTO otherwise */
+extern int lstcon_batch_debug(int timeout, char *name,
+			      int client, struct list_head *result_up);
+extern int lstcon_group_debug(int timeout, char *name,
+			      struct list_head *result_up);
+extern int lstcon_nodes_debug(int timeout, int nnd, lnet_process_id_t *nds_up,
+			      struct list_head *result_up);
+extern int lstcon_group_add(char *name);
+extern int lstcon_group_del(char *name);
+extern int lstcon_group_clean(char *name, int args);
+extern int lstcon_group_refresh(char *name, struct list_head *result_up);
+extern int lstcon_nodes_add(char *name, int nnd, lnet_process_id_t *nds_up,
+			    unsigned *featp, struct list_head *result_up);
+extern int lstcon_nodes_remove(char *name, int nnd, lnet_process_id_t *nds_up,
+			       struct list_head *result_up);
+extern int lstcon_group_info(char *name, lstcon_ndlist_ent_t *gent_up,
+			     int *index_p, int *ndent_p, lstcon_node_ent_t *ndents_up);
+extern int lstcon_group_list(int idx, int len, char *name_up);
+extern int lstcon_batch_add(char *name);
+extern int lstcon_batch_run(char *name, int timeout,
+			    struct list_head *result_up);
+extern int lstcon_batch_stop(char *name, int force,
+			     struct list_head *result_up);
+extern int lstcon_test_batch_query(char *name, int testidx,
+				   int client, int timeout,
+				   struct list_head *result_up);
+extern int lstcon_batch_del(char *name);
+extern int lstcon_batch_list(int idx, int namelen, char *name_up);
+extern int lstcon_batch_info(char *name, lstcon_test_batch_ent_t *ent_up,
+			     int server, int testidx, int *index_p,
+			     int *ndent_p, lstcon_node_ent_t *dents_up);
+extern int lstcon_group_stat(char *grp_name, int timeout,
+			     struct list_head *result_up);
+extern int lstcon_nodes_stat(int count, lnet_process_id_t *ids_up,
+			     int timeout, struct list_head *result_up);
+extern int lstcon_test_add(char *name, int type, int loop, int concur,
+			   int dist, int span, char *src_name, char * dst_name,
+			   void *param, int paramlen, int *retp,
+			   struct list_head *result_up);
+
+#endif

diff --git a/drivers/staging/lustre/lnet/selftest/framework.c b/drivers/staging/lustre/lnet/selftest/framework.c
new file mode 100644
index 0000000..483c785
--- /dev/null
+++ b/drivers/staging/lustre/lnet/selftest/framework.c

@@ -0,0 +1,1814 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/framework.c
+ *
+ * Author: Isaac Huang <isaac@clusterfs.com>
+ * Author: Liang Zhen  <liangzhen@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include "selftest.h"
+
+lst_sid_t LST_INVALID_SID = {LNET_NID_ANY, -1};	/* sid reported when there is no session */
+
+static int session_timeout = 100;	/* copied into sn_timeout by sfw_init_session() */
+CFS_MODULE_PARM(session_timeout, "i", int, 0444,
+		"test session timeout in seconds (100 by default, 0 == never)");
+
+static int rpc_timeout = 64;
+CFS_MODULE_PARM(rpc_timeout, "i", int, 0644,
+		"rpc timeout in seconds (64 by default, 0 == never)");
+
+#define sfw_unpack_id(id)	/* swab nid and pid of a process id in place */ \
+do {				    \
+	__swab64s(&(id).nid);	   \
+	__swab32s(&(id).pid);	   \
+} while (0)
+
+#define sfw_unpack_sid(sid)	/* swab both fields of a session id in place */ \
+do {				    \
+	__swab64s(&(sid).ses_nid);      \
+	__swab64s(&(sid).ses_stamp);    \
+} while (0)
+
+#define sfw_unpack_fw_counters(fc)	/* swab the framework counters in place */ \
+do {				      \
+	__swab32s(&(fc).running_ms);      \
+	__swab32s(&(fc).active_batches);  \
+	__swab32s(&(fc).zombie_sessions); \
+	__swab32s(&(fc).brw_errors);      \
+	__swab32s(&(fc).ping_errors);     \
+} while (0)
+
+#define sfw_unpack_rpc_counters(rc)	/* swab the RPC counters in place */ \
+do {				    \
+	__swab32s(&(rc).errors);	\
+	__swab32s(&(rc).rpcs_sent);     \
+	__swab32s(&(rc).rpcs_rcvd);     \
+	__swab32s(&(rc).rpcs_dropped);  \
+	__swab32s(&(rc).rpcs_expired);  \
+	__swab64s(&(rc).bulk_get);      \
+	__swab64s(&(rc).bulk_put);      \
+} while (0)
+
+#define sfw_unpack_lnet_counters(lc)	/* swab the LNet counters in place */ \
+do {				    \
+	__swab32s(&(lc).errors);	\
+	__swab32s(&(lc).msgs_max);      \
+	__swab32s(&(lc).msgs_alloc);    \
+	__swab32s(&(lc).send_count);    \
+	__swab32s(&(lc).recv_count);    \
+	__swab32s(&(lc).drop_count);    \
+	__swab32s(&(lc).route_count);   \
+	__swab64s(&(lc).send_length);   \
+	__swab64s(&(lc).recv_length);   \
+	__swab64s(&(lc).drop_length);   \
+	__swab64s(&(lc).route_length);  \
+} while (0)
+
+#define sfw_test_active(t)      (atomic_read(&(t)->tsi_nactive) != 0)	/* true while tsi_nactive is non-zero */
+#define sfw_batch_active(b)     (atomic_read(&(b)->bat_nactive) != 0)	/* true while bat_nactive is non-zero */
+
+struct smoketest_framework {
+	struct list_head	 fw_zombie_rpcs;     /* finished client RPCs to be recycled */
+	struct list_head	 fw_zombie_sessions; /* stopping sessions */
+	struct list_head	 fw_tests;	   /* registered test cases (sfw_test_case_t) */
+	atomic_t       fw_nzombies;	/* # zombie sessions */
+	spinlock_t	   fw_lock;		/* serialise */
+	sfw_session_t	  *fw_session;		/* _the_ session, NULL when there is none */
+	int		   fw_shuttingdown;	/* shutdown in progress */
+	srpc_server_rpc_t *fw_active_srpc;	/* running RPC */
+} sfw_data;	/* the single framework-wide state object */
+
+/* forward declarations: both are called by sfw_deactivate_session() before being defined */
+int sfw_stop_batch (sfw_batch_t *tsb, int force);
+void sfw_destroy_session (sfw_session_t *sn);
+
+/*
+ * Look up the registered test case whose server-side service has the
+ * given id.  The id must lie above the framework service ids and not
+ * exceed SRPC_SERVICE_MAX_ID.
+ *
+ * Returns the test case, or NULL if none is registered for that id.
+ */
+static inline sfw_test_case_t *
+sfw_find_test_case(int id)
+{
+	sfw_test_case_t *found = NULL;
+	sfw_test_case_t *cur;
+
+	LASSERT (id <= SRPC_SERVICE_MAX_ID);
+	LASSERT (id > SRPC_FRAMEWORK_SERVICE_MAX_ID);
+
+	list_for_each_entry (cur, &sfw_data.fw_tests, tsc_list) {
+		if (cur->tsc_srv_service->sv_id != id)
+			continue;
+
+		found = cur;
+		break;
+	}
+
+	return found;
+}
+
+/*
+ * Register a test case made of a server-side service and its client ops
+ * by appending it to sfw_data.fw_tests.
+ *
+ * Returns 0 on success, -EEXIST if a test with the same service id is
+ * already registered, -ENOMEM if the descriptor cannot be allocated.
+ */
+static int
+sfw_register_test (srpc_service_t *service, sfw_test_client_ops_t *cliops)
+{
+	sfw_test_case_t *tsc;
+	int		 duplicate;
+
+	duplicate = (sfw_find_test_case(service->sv_id) != NULL);
+	if (duplicate) {
+		CERROR ("Failed to register test %s (%d)\n",
+			service->sv_name, service->sv_id);
+		return -EEXIST;
+	}
+
+	LIBCFS_ALLOC(tsc, sizeof(*tsc));
+	if (tsc == NULL)
+		return -ENOMEM;
+
+	memset(tsc, 0, sizeof(*tsc));
+	tsc->tsc_srv_service = service;
+	tsc->tsc_cli_ops     = cliops;
+	list_add_tail(&tsc->tsc_list, &sfw_data.fw_tests);
+
+	return 0;
+}
+
+/*
+ * Arm the expiry timer of the current session.  Nothing is done when
+ * there is no session or when its timeout is 0.  The timer must not
+ * already be active and the framework must not be shutting down.
+ */
+void
+sfw_add_session_timer (void)
+{
+	sfw_session_t *sn = sfw_data.fw_session;
+	stt_timer_t   *timer;
+
+	LASSERT (!sfw_data.fw_shuttingdown);
+
+	if (sn == NULL || sn->sn_timeout == 0)
+		return;
+
+	LASSERT (!sn->sn_timer_active);
+
+	/* only form the member address once sn is known to be non-NULL */
+	timer = &sn->sn_timer;
+
+	sn->sn_timer_active = 1;
+	timer->stt_expires = cfs_time_add(sn->sn_timeout,
+					  cfs_time_current_sec());
+	stt_add_timer(timer);
+	return;
+}
+
+/*
+ * Disarm the expiry timer of the current session.
+ *
+ * Returns 0 when there is no session, no active timer, or the timer was
+ * removed; returns EBUSY (positive) when the timer could not be removed
+ * because sfw_session_expired() is racing with the caller.
+ */
+int
+sfw_del_session_timer (void)
+{
+	sfw_session_t *sn = sfw_data.fw_session;
+
+	if (sn == NULL)
+		return 0;
+
+	if (!sn->sn_timer_active)
+		return 0;
+
+	LASSERT (sn->sn_timeout != 0);
+
+	if (!stt_del_timer(&sn->sn_timer))
+		return EBUSY; /* racing with sfw_session_expired() */
+
+	/* timer defused */
+	sn->sn_timer_active = 0;
+	return 0;
+}
+
+/*
+ * Retire the current session: move it to the zombie list, abort the
+ * services of all registered tests, force-stop its active batches and,
+ * if none was active, destroy it right away.
+ *
+ * Called with sfw_data.fw_lock held; the lock is dropped and re-taken
+ * around srpc_abort_service() and sfw_destroy_session(), and is held
+ * again on return.
+ */
+static void
+sfw_deactivate_session (void)
+{
+	sfw_session_t   *sn = sfw_data.fw_session;
+	sfw_test_case_t *tsc;
+	sfw_batch_t     *tsb;
+	int		 busy = 0;
+
+	if (sn == NULL)
+		return;
+
+	LASSERT (!sn->sn_timer_active);
+
+	/* the session becomes a zombie before the lock is released */
+	sfw_data.fw_session = NULL;
+	atomic_inc(&sfw_data.fw_nzombies);
+	list_add(&sn->sn_list, &sfw_data.fw_zombie_sessions);
+
+	spin_unlock(&sfw_data.fw_lock);
+
+	list_for_each_entry(tsc, &sfw_data.fw_tests, tsc_list)
+		srpc_abort_service(tsc->tsc_srv_service);
+
+	spin_lock(&sfw_data.fw_lock);
+
+	list_for_each_entry (tsb, &sn->sn_batches, bat_list) {
+		if (!sfw_batch_active(tsb))
+			continue;
+
+		busy = 1;
+		sfw_stop_batch(tsb, 1);
+	}
+
+	if (busy)
+		return;   /* wait for active batches to stop */
+
+	list_del_init(&sn->sn_list);
+	spin_unlock(&sfw_data.fw_lock);
+
+	sfw_destroy_session(sn);
+
+	spin_lock(&sfw_data.fw_lock);
+}
+
+
+/*
+ * Timer callback (installed as stt_func by sfw_init_session()): the
+ * session passed in 'data' has expired.  Logs a warning, marks the timer
+ * inactive and deactivates the session under sfw_data.fw_lock.
+ */
+void
+sfw_session_expired (void *data)
+{
+	sfw_session_t *sn = data;
+
+	spin_lock(&sfw_data.fw_lock);
+
+	/* only the armed timer of the current session may fire */
+	LASSERT (sn->sn_timer_active);
+	LASSERT (sn == sfw_data.fw_session);
+
+	CWARN ("Session expired! sid: %s-"LPU64", name: %s\n",
+	       libcfs_nid2str(sn->sn_id.ses_nid),
+	       sn->sn_id.ses_stamp,
+	       &sn->sn_name[0]);
+
+	sn->sn_timer_active = 0;
+	sfw_deactivate_session();
+
+	spin_unlock(&sfw_data.fw_lock);
+}
+
+/*
+ * Initialize a freshly allocated session: zero it, set up its lists and
+ * counters, record id, features and name, take the timeout from the
+ * session_timeout module parameter and prepare (but do not arm) the
+ * expiry timer.  The caller owns the initial reference.
+ */
+static inline void
+sfw_init_session(sfw_session_t *sn, lst_sid_t sid,
+		 unsigned features, const char *name)
+{
+	stt_timer_t *timer;
+
+	memset(sn, 0, sizeof(*sn));
+
+	INIT_LIST_HEAD(&sn->sn_list);
+	INIT_LIST_HEAD(&sn->sn_batches);
+
+	atomic_set(&sn->sn_refcount, 1);	/* +1 for caller */
+	atomic_set(&sn->sn_brw_errors, 0);
+	atomic_set(&sn->sn_ping_errors, 0);
+
+	strlcpy(&sn->sn_name[0], name, sizeof(sn->sn_name));
+
+	sn->sn_id	    = sid;
+	sn->sn_features	    = features;
+	sn->sn_timeout      = session_timeout;
+	sn->sn_started      = cfs_time_current();
+	sn->sn_timer_active = 0;
+
+	/* expiry is delivered to sfw_session_expired() with sn as argument */
+	timer = &sn->sn_timer;
+	timer->stt_func = sfw_session_expired;
+	timer->stt_data = sn;
+	INIT_LIST_HEAD(&timer->stt_list);
+}
+
+/*
+ * Completion handler for incoming framework RPCs: logs the outcome and
+ * frees the bulk pages if the RPC carried a bulk descriptor.
+ */
+void
+sfw_server_rpc_done(struct srpc_server_rpc *rpc)
+{
+	struct srpc_service	*sv	= rpc->srpc_scd->scd_svc;
+	int			status	= rpc->srpc_status;
+
+	CDEBUG (D_NET,
+		"Incoming framework RPC done: "
+		"service %s, peer %s, status %s:%d\n",
+		sv->sv_name, libcfs_id2str(rpc->srpc_peer),
+		swi_state2str(rpc->srpc_wi.swi_state),
+		status);
+
+	if (rpc->srpc_bulk == NULL)
+		return;
+
+	sfw_free_pages(rpc);
+}
+
+/*
+ * Finalizer for outgoing framework RPCs: the RPC must be idle (no bulk
+ * iovs, unlinked, zero refcount); it is logged and then parked on
+ * sfw_data.fw_zombie_rpcs under fw_lock.
+ */
+void
+sfw_client_rpc_fini (srpc_client_rpc_t *rpc)
+{
+	LASSERT (rpc->crpc_bulk.bk_niov == 0);
+	LASSERT (list_empty(&rpc->crpc_list));
+	LASSERT (atomic_read(&rpc->crpc_refcount) == 0);
+
+	CDEBUG (D_NET,
+		"Outgoing framework RPC done: "
+		"service %d, peer %s, status %s:%d:%d\n",
+		rpc->crpc_service,
+		libcfs_id2str(rpc->crpc_dest),
+		swi_state2str(rpc->crpc_wi.swi_state),
+		rpc->crpc_aborted,
+		rpc->crpc_status);
+
+	spin_lock(&sfw_data.fw_lock);
+
+	/* my callers must finish all RPCs before shutting me down */
+	LASSERT(!sfw_data.fw_shuttingdown);
+
+	list_add(&rpc->crpc_list, &sfw_data.fw_zombie_rpcs);
+
+	spin_unlock(&sfw_data.fw_lock);
+}
+
+/*
+ * Find the batch with id 'bid' in the current session (which must
+ * exist).  Returns the batch, or NULL if the session has no such batch.
+ */
+sfw_batch_t *
+sfw_find_batch (lst_bid_t bid)
+{
+	sfw_session_t *sn = sfw_data.fw_session;
+	sfw_batch_t   *found = NULL;
+	sfw_batch_t   *cur;
+
+	LASSERT (sn != NULL);
+
+	list_for_each_entry (cur, &sn->sn_batches, bat_list) {
+		if (cur->bat_id.bat_id != bid.bat_id)
+			continue;
+
+		found = cur;
+		break;
+	}
+
+	return found;
+}
+
+/*
+ * Return the batch with id 'bid' of the current session, creating an
+ * empty, inactive one and appending it to the session when it does not
+ * exist yet.  Returns NULL only if the allocation fails.
+ */
+sfw_batch_t *
+sfw_bid2batch (lst_bid_t bid)
+{
+	sfw_session_t *sn = sfw_data.fw_session;
+	sfw_batch_t   *bat;
+
+	LASSERT (sn != NULL);
+
+	bat = sfw_find_batch(bid);
+	if (bat == NULL) {
+		LIBCFS_ALLOC(bat, sizeof(sfw_batch_t));
+		if (bat == NULL)
+			return NULL;
+
+		bat->bat_id       = bid;
+		bat->bat_session  = sn;
+		bat->bat_error    = 0;
+		atomic_set(&bat->bat_nactive, 0);
+		INIT_LIST_HEAD(&bat->bat_tests);
+
+		list_add_tail(&bat->bat_list, &sn->sn_batches);
+	}
+
+	return bat;
+}
+
+/*
+ * Fill a statistics reply for the current session.
+ *
+ * reply->str_sid always carries the current session id (LST_INVALID_SID
+ * if there is none).  reply->str_status is EINVAL when the request names
+ * no session, ESRCH when it does not name the current session, and 0
+ * when the LNet, RPC and framework counters were filled in.
+ *
+ * Always returns 0.
+ */
+int
+sfw_get_stats (srpc_stat_reqst_t *request, srpc_stat_reply_t *reply)
+{
+	sfw_session_t  *sn = sfw_data.fw_session;
+	sfw_counters_t *cnt = &reply->str_fw;
+	sfw_batch_t    *bat;
+	struct timeval  tv;
+
+	if (sn == NULL)
+		reply->str_sid = LST_INVALID_SID;
+	else
+		reply->str_sid = sn->sn_id;
+
+	if (request->str_sid.ses_nid == LNET_NID_ANY) {
+		reply->str_status = EINVAL;
+		return 0;
+	}
+
+	if (sn == NULL || !sfw_sid_equal(request->str_sid, sn->sn_id)) {
+		reply->str_status = ESRCH;
+		return 0;
+	}
+
+	lnet_counters_get(&reply->str_lnet);
+	srpc_get_counters(&reply->str_rpc);
+
+	/* send over the msecs since the session was started;
+	 * with 32 bits to send, this is ~49 days */
+	cfs_duration_usec(cfs_time_sub(cfs_time_current(),
+				       sn->sn_started), &tv);
+
+	cnt->running_ms      = (__u32)(tv.tv_sec * 1000 + tv.tv_usec / 1000);
+	cnt->zombie_sessions = atomic_read(&sfw_data.fw_nzombies);
+	cnt->brw_errors      = atomic_read(&sn->sn_brw_errors);
+	cnt->ping_errors     = atomic_read(&sn->sn_ping_errors);
+
+	/* count the batches that still have active tests */
+	cnt->active_batches = 0;
+	list_for_each_entry (bat, &sn->sn_batches, bat_list) {
+		if (atomic_read(&bat->bat_nactive) <= 0)
+			continue;
+
+		cnt->active_batches++;
+	}
+
+	reply->str_status = 0;
+	return 0;
+}
+
+/*
+ * Handle a "make session" request.
+ *
+ * reply->mksn_status is EINVAL if the request carries no session id, 0
+ * if the request names the current session (one more reference is taken)
+ * or a new session was installed, EBUSY if another session exists and
+ * the request is not forced (its name is copied into the reply), and
+ * EPROTO if the request asks for features outside LST_FEATS_MASK.
+ *
+ * Returns 0 when a reply was filled in, -E2BIG when the existing session
+ * name does not fit into the reply, -ENOMEM when the new session cannot
+ * be allocated.
+ */
+int
+sfw_make_session(srpc_mksn_reqst_t *request, srpc_mksn_reply_t *reply)
+{
+	srpc_msg_t    *msg = container_of(request, srpc_msg_t,
+					  msg_body.mksn_reqst);
+	sfw_session_t *sn = sfw_data.fw_session;
+
+	if (request->mksn_sid.ses_nid == LNET_NID_ANY) {
+		if (sn == NULL)
+			reply->mksn_sid = LST_INVALID_SID;
+		else
+			reply->mksn_sid = sn->sn_id;
+
+		reply->mksn_status = EINVAL;
+		return 0;
+	}
+
+	if (sn != NULL) {
+		reply->mksn_status  = 0;
+		reply->mksn_sid     = sn->sn_id;
+		reply->mksn_timeout = sn->sn_timeout;
+
+		/* same session asked for again: just take a reference */
+		if (sfw_sid_equal(request->mksn_sid, sn->sn_id)) {
+			atomic_inc(&sn->sn_refcount);
+			return 0;
+		}
+
+		if (!request->mksn_force) {
+			int cplen;
+
+			reply->mksn_status = EBUSY;
+			cplen = strlcpy(&reply->mksn_name[0], &sn->sn_name[0],
+					sizeof(reply->mksn_name));
+			return (cplen >= sizeof(reply->mksn_name)) ? -E2BIG : 0;
+		}
+	}
+
+	/* reject the request if it requires unknown features
+	 * NB: old version will always accept all features because it's not
+	 * aware of srpc_msg_t::msg_ses_feats, it's a defect but it's also
+	 * harmless because it will return zero feature to console, and it's
+	 * console's responsibility to make sure all nodes in a session have
+	 * same feature mask. */
+	if ((msg->msg_ses_feats & ~LST_FEATS_MASK) != 0) {
+		reply->mksn_status = EPROTO;
+		return 0;
+	}
+
+	/* brand new or create by force */
+	LIBCFS_ALLOC(sn, sizeof(sfw_session_t));
+	if (sn == NULL) {
+		CERROR ("Dropping RPC (mksn) under memory pressure.\n");
+		return -ENOMEM;
+	}
+
+	sfw_init_session(sn, request->mksn_sid,
+			 msg->msg_ses_feats, &request->mksn_name[0]);
+
+	/* retire whatever session is current, then install the new one */
+	spin_lock(&sfw_data.fw_lock);
+
+	sfw_deactivate_session();
+	LASSERT(sfw_data.fw_session == NULL);
+	sfw_data.fw_session = sn;
+
+	spin_unlock(&sfw_data.fw_lock);
+
+	reply->mksn_timeout = sn->sn_timeout;
+	reply->mksn_sid     = sn->sn_id;
+	reply->mksn_status  = 0;
+	return 0;
+}
+
+/*
+ * Handle a "remove session" request.
+ *
+ * reply->rmsn_status is EINVAL if the request carries no session id,
+ * ESRCH if there is no session, EBUSY if the request names a session
+ * other than the current one, and 0 otherwise.  The session is only
+ * deactivated when the last reference is dropped; reply->rmsn_sid is
+ * then LST_INVALID_SID.
+ *
+ * Always returns 0.
+ */
+int
+sfw_remove_session (srpc_rmsn_reqst_t *request, srpc_rmsn_reply_t *reply)
+{
+	sfw_session_t *sn = sfw_data.fw_session;
+
+	if (sn == NULL)
+		reply->rmsn_sid = LST_INVALID_SID;
+	else
+		reply->rmsn_sid = sn->sn_id;
+
+	if (request->rmsn_sid.ses_nid == LNET_NID_ANY) {
+		reply->rmsn_status = EINVAL;
+		return 0;
+	}
+
+	if (sn == NULL) {
+		reply->rmsn_status = ESRCH;
+		return 0;
+	}
+
+	if (!sfw_sid_equal(request->rmsn_sid, sn->sn_id)) {
+		reply->rmsn_status = EBUSY;
+		return 0;
+	}
+
+	/* other holders remain: report success, keep the session */
+	if (!atomic_dec_and_test(&sn->sn_refcount)) {
+		reply->rmsn_status = 0;
+		return 0;
+	}
+
+	spin_lock(&sfw_data.fw_lock);
+	sfw_deactivate_session();
+	spin_unlock(&sfw_data.fw_lock);
+
+	reply->rmsn_sid    = LST_INVALID_SID;
+	reply->rmsn_status = 0;
+	LASSERT(sfw_data.fw_session == NULL);
+	return 0;
+}
+
+/*
+ * Handle a "debug session" request: report id, timeout and name of the
+ * current session, or ESRCH with LST_INVALID_SID when there is none.
+ *
+ * Returns 0, or -E2BIG when the session name does not fit into the reply.
+ */
+int
+sfw_debug_session (srpc_debug_reqst_t *request, srpc_debug_reply_t *reply)
+{
+	sfw_session_t *sn = sfw_data.fw_session;
+
+	if (sn == NULL) {
+		reply->dbg_sid    = LST_INVALID_SID;
+		reply->dbg_status = ESRCH;
+		return 0;
+	}
+
+	reply->dbg_status  = 0;
+	reply->dbg_sid     = sn->sn_id;
+	reply->dbg_timeout = sn->sn_timeout;
+
+	if (strlcpy(reply->dbg_name, &sn->sn_name[0], sizeof(reply->dbg_name))
+	    < sizeof(reply->dbg_name))
+		return 0;
+
+	return -E2BIG;
+}
+
+/*
+ * Finalizer for test RPCs: return the (unlinked) RPC to the free list of
+ * the test instance it belongs to.
+ */
+void
+sfw_test_rpc_fini (srpc_client_rpc_t *rpc)
+{
+	sfw_test_unit_t     *unit = rpc->crpc_priv;
+	sfw_test_instance_t *inst = unit->tsu_instance;
+
+	/* Called with hold of tsi->tsi_lock */
+	LASSERT (list_empty(&rpc->crpc_list));
+
+	list_add(&rpc->crpc_list, &inst->tsi_free_rpcs);
+}
+
+/*
+ * Number of service buffers to reserve for a test instance: the smaller
+ * of the service's work-item total and the test's loop count, divided by
+ * the service's sv_ncpts, plus SFW_TEST_WI_EXTRA, and never less than
+ * SFW_TEST_WI_MIN.
+ *
+ * The test case for tsi->tsi_service must be registered.
+ */
+static inline int
+sfw_test_buffers(sfw_test_instance_t *tsi)
+{
+	struct sfw_test_case	*tsc = sfw_find_test_case(tsi->tsi_service);
+	struct srpc_service	*svc;
+	int			nbuf;
+
+	/* sfw_find_test_case() returns NULL for an unregistered service;
+	 * assert like sfw_load_test()/sfw_unload_test() do instead of
+	 * dereferencing the result blindly */
+	LASSERT(tsc != NULL);
+	svc = tsc->tsc_srv_service;
+
+	nbuf = min(svc->sv_wi_total, tsi->tsi_loop) / svc->sv_ncpts;
+	return max(SFW_TEST_WI_MIN, nbuf + SFW_TEST_WI_EXTRA);
+}
+
+/*
+ * Prepare a test instance for running.  A client instance only gets its
+ * ops from the registered test case; a server instance reserves request
+ * buffers on the test's service.
+ *
+ * Returns 0 on success, -ENOMEM if the buffers cannot be reserved.
+ */
+int
+sfw_load_test(struct sfw_test_instance *tsi)
+{
+	struct sfw_test_case	*tsc;
+	struct srpc_service	*svc;
+	int			nbuf;
+	int			rc;
+
+	LASSERT(tsi != NULL);
+	tsc = sfw_find_test_case(tsi->tsi_service);
+	/* checked before sfw_test_buffers(), which repeats this lookup and
+	 * dereferences its result */
+	LASSERT(tsc != NULL);
+	svc = tsc->tsc_srv_service;
+
+	if (tsi->tsi_is_client) {
+		tsi->tsi_ops = tsc->tsc_cli_ops;
+		return 0;
+	}
+
+	/* the buffer count is only needed on the server side */
+	nbuf = sfw_test_buffers(tsi);
+
+	rc = srpc_service_add_buffers(svc, nbuf);
+	if (rc != 0) {
+		CWARN("Failed to reserve enough buffers: "
+		      "service %s, %d needed: %d\n", svc->sv_name, nbuf, rc);
+		/* NB: this error handler is not strictly correct, because
+		 * it may release more buffers than already allocated,
+		 * but it doesn't matter because request portal should
+		 * be lazy portal and will grow buffers if necessary. */
+		srpc_service_remove_buffers(svc, nbuf);
+		return -ENOMEM;
+	}
+
+	CDEBUG(D_NET, "Reserved %d buffers for test %s\n",
+	       nbuf * (srpc_serv_is_framework(svc) ?
+		       1 : cfs_cpt_number(cfs_cpt_table)), svc->sv_name);
+	return 0;
+}
+
+/*
+ * Undo sfw_load_test() for a server-side test instance by removing the
+ * buffers it reserved; nothing is done for a client instance.
+ */
+void
+sfw_unload_test(struct sfw_test_instance *tsi)
+{
+	struct sfw_test_case *tsc = sfw_find_test_case(tsi->tsi_service);
+
+	LASSERT(tsc != NULL);
+
+	if (!tsi->tsi_is_client) {
+		/* shrink buffers, because request portal is lazy portal
+		 * which can grow buffers at runtime so we may leave
+		 * some buffers behind, but never mind... */
+		srpc_service_remove_buffers(tsc->tsc_srv_service,
+					    sfw_test_buffers(tsi));
+	}
+}
+
+/*
+ * Free a test instance.  A client instance is first finalized through
+ * its tso_fini op and must be idle; its test units and cached free RPCs
+ * are then released.  In all cases the instance is unloaded and freed.
+ */
+void
+sfw_destroy_test_instance (sfw_test_instance_t *tsi)
+{
+	srpc_client_rpc_t *rpc;
+	sfw_test_unit_t   *tsu;
+
+	if (tsi->tsi_is_client) {
+		tsi->tsi_ops->tso_fini(tsi);
+
+		LASSERT (!tsi->tsi_stopping);
+		LASSERT (list_empty(&tsi->tsi_active_rpcs));
+		LASSERT (!sfw_test_active(tsi));
+
+		/* release every test unit */
+		while (!list_empty(&tsi->tsi_units)) {
+			tsu = list_entry(tsi->tsi_units.next,
+					 sfw_test_unit_t, tsu_list);
+			list_del(&tsu->tsu_list);
+			LIBCFS_FREE(tsu, sizeof(*tsu));
+		}
+
+		/* release every RPC parked on the free list */
+		while (!list_empty(&tsi->tsi_free_rpcs)) {
+			rpc = list_entry(tsi->tsi_free_rpcs.next,
+					 srpc_client_rpc_t, crpc_list);
+			list_del(&rpc->crpc_list);
+			LIBCFS_FREE(rpc, srpc_client_rpc_size(rpc));
+		}
+	}
+
+	sfw_unload_test(tsi);
+	LIBCFS_FREE(tsi, sizeof(*tsi));
+}
+
+/*
+ * Destroy every test instance queued on an inactive batch that is no
+ * longer linked into a session, then free the batch itself.
+ */
+void
+sfw_destroy_batch (sfw_batch_t *tsb)
+{
+	sfw_test_instance_t *tsi;
+	sfw_test_instance_t *next;
+
+	LASSERT (!sfw_batch_active(tsb));
+	LASSERT (list_empty(&tsb->bat_list));
+
+	/* each instance is unlinked before it is destroyed, so iterate
+	 * with the removal-safe walker */
+	list_for_each_entry_safe (tsi, next, &tsb->bat_tests, tsi_list) {
+		list_del_init(&tsi->tsi_list);
+		sfw_destroy_test_instance(tsi);
+	}
+
+	LIBCFS_FREE(tsb, sizeof(sfw_batch_t));
+}
+
+/*
+ * Free a session that is neither the current session nor linked on any
+ * list: destroy all of its batches, free the session, and drop the
+ * framework's zombie-session count.
+ */
+void
+sfw_destroy_session (sfw_session_t *sn)
+{
+	sfw_batch_t *batch;
+	sfw_batch_t *next;
+
+	LASSERT (list_empty(&sn->sn_list));
+	LASSERT (sn != sfw_data.fw_session);
+
+	/* sfw_destroy_batch() requires the batch to be unlinked first */
+	list_for_each_entry_safe (batch, next, &sn->sn_batches, bat_list) {
+		list_del_init(&batch->bat_list);
+		sfw_destroy_batch(batch);
+	}
+
+	LIBCFS_FREE(sn, sizeof(*sn));
+	atomic_dec(&sfw_data.fw_nzombies);
+}
+
+/*
+ * Byte-swap the test-specific parameters of a client add-test request
+ * whose magic shows it was sent in the opposite byte order.  A message
+ * whose magic already equals SRPC_MSG_MAGIC is left untouched.  Only the
+ * BRW and PING services are handled; any other service is a bug.
+ */
+void
+sfw_unpack_addtest_req(srpc_msg_t *msg)
+{
+	srpc_test_reqst_t *req = &msg->msg_body.tes_reqst;
+
+	LASSERT (msg->msg_type == SRPC_MSG_TEST_REQST);
+	LASSERT (req->tsr_is_client);
+
+	if (msg->msg_magic == SRPC_MSG_MAGIC)
+		return; /* no flipping needed */
+
+	LASSERT (msg->msg_magic == __swab32(SRPC_MSG_MAGIC));
+
+	if (req->tsr_service == SRPC_SERVICE_BRW) {
+		if ((msg->msg_ses_feats & LST_FEAT_BULK_LEN) != 0) {
+			/* LST_FEAT_BULK_LEN set: bulk_v1 layout */
+			test_bulk_req_v1_t *v1 = &req->tsr_u.bulk_v1;
+
+			__swab16s(&v1->blk_opc);
+			__swab16s(&v1->blk_flags);
+			__swab32s(&v1->blk_offset);
+			__swab32s(&v1->blk_len);
+		} else {
+			/* feature not set: bulk_v0 layout */
+			test_bulk_req_t *v0 = &req->tsr_u.bulk_v0;
+
+			__swab32s(&v0->blk_opc);
+			__swab32s(&v0->blk_npg);
+			__swab32s(&v0->blk_flags);
+		}
+	} else if (req->tsr_service == SRPC_SERVICE_PING) {
+		test_ping_req_t *png = &req->tsr_u.ping;
+
+		__swab32s(&png->png_size);
+		__swab32s(&png->png_flags);
+	} else {
+		LBUG ();
+	}
+}
+
+/*
+ * Create a test instance from the add-test request carried by rpc and
+ * queue it on batch tsb.
+ *
+ * A server-side instance is only loaded (sfw_load_test()) and queued.
+ * A client-side instance additionally reads the packed destination IDs
+ * from the RPC's bulk pages, creates tsi_concur test units for every
+ * destination, and finally calls the test's tso_init hook.
+ *
+ * Returns 0 on success, -ENOMEM if an allocation fails, or the non-zero
+ * result of sfw_load_test() / tso_init().
+ */
+int
+sfw_add_test_instance (sfw_batch_t *tsb, srpc_server_rpc_t *rpc)
+{
+	srpc_msg_t	  *msg = &rpc->srpc_reqstbuf->buf_msg;
+	srpc_test_reqst_t   *req = &msg->msg_body.tes_reqst;
+	srpc_bulk_t	 *bk = rpc->srpc_bulk;
+	int		  ndest = req->tsr_ndest;
+	sfw_test_unit_t     *tsu;
+	sfw_test_instance_t *tsi;
+	int		  i;
+	int		  rc;
+
+	LIBCFS_ALLOC(tsi, sizeof(*tsi));
+	if (tsi == NULL) {
+		CERROR ("Can't allocate test instance for batch: "LPU64"\n",
+			tsb->bat_id.bat_id);
+		return -ENOMEM;
+	}
+
+	memset(tsi, 0, sizeof(*tsi));
+	spin_lock_init(&tsi->tsi_lock);
+	atomic_set(&tsi->tsi_nactive, 0);
+	INIT_LIST_HEAD(&tsi->tsi_units);
+	INIT_LIST_HEAD(&tsi->tsi_free_rpcs);
+	INIT_LIST_HEAD(&tsi->tsi_active_rpcs);
+
+	/* copy the generic test parameters out of the request */
+	tsi->tsi_stopping      = 0;
+	tsi->tsi_batch	 = tsb;
+	tsi->tsi_loop	  = req->tsr_loop;
+	tsi->tsi_concur	= req->tsr_concur;
+	tsi->tsi_service       = req->tsr_service;
+	tsi->tsi_is_client     = !!(req->tsr_is_client);
+	tsi->tsi_stoptsu_onerr = !!(req->tsr_stop_onerr);
+
+	rc = sfw_load_test(tsi);
+	if (rc != 0) {
+		/* nothing else hangs off tsi yet, so free it directly */
+		LIBCFS_FREE(tsi, sizeof(*tsi));
+		return rc;
+	}
+
+	LASSERT (!sfw_batch_active(tsb));
+
+	if (!tsi->tsi_is_client) {
+		/* it's test server, just add it to tsb */
+		list_add_tail(&tsi->tsi_list, &tsb->bat_tests);
+		return 0;
+	}
+
+	/* client: the bulk must be large enough to hold ndest packed IDs */
+	LASSERT (bk != NULL);
+	LASSERT (bk->bk_niov * SFW_ID_PER_PAGE >= (unsigned int)ndest);
+	LASSERT((unsigned int)bk->bk_len >=
+		sizeof(lnet_process_id_packed_t) * ndest);
+
+	/* byte-swap the test-specific parameters if needed, then keep a copy */
+	sfw_unpack_addtest_req(msg);
+	memcpy(&tsi->tsi_u, &req->tsr_u, sizeof(tsi->tsi_u));
+
+	for (i = 0; i < ndest; i++) {
+		lnet_process_id_packed_t *dests;
+		lnet_process_id_packed_t  id;
+		int		       j;
+
+		/* destination i lives in bulk page i / SFW_ID_PER_PAGE,
+		 * slot i % SFW_ID_PER_PAGE */
+		dests = page_address(bk->bk_iovs[i / SFW_ID_PER_PAGE].kiov_page);
+		LASSERT (dests != NULL);  /* my pages are within KVM always */
+		id = dests[i % SFW_ID_PER_PAGE];
+		if (msg->msg_magic != SRPC_MSG_MAGIC)
+			sfw_unpack_id(id);
+
+		/* one test unit per concurrent stream to this destination */
+		for (j = 0; j < tsi->tsi_concur; j++) {
+			LIBCFS_ALLOC(tsu, sizeof(sfw_test_unit_t));
+			if (tsu == NULL) {
+				rc = -ENOMEM;
+				CERROR ("Can't allocate tsu for %d\n",
+					tsi->tsi_service);
+				goto error;
+			}
+
+			tsu->tsu_dest.nid = id.nid;
+			tsu->tsu_dest.pid = id.pid;
+			tsu->tsu_instance = tsi;
+			tsu->tsu_private  = NULL;
+			list_add_tail(&tsu->tsu_list, &tsi->tsi_units);
+		}
+	}
+
+	rc = tsi->tsi_ops->tso_init(tsi);
+	if (rc == 0) {
+		list_add_tail(&tsi->tsi_list, &tsb->bat_tests);
+		return 0;
+	}
+
+error:
+	/* Frees the units queued so far, unloads the test and frees tsi.
+	 * NOTE(review): sfw_destroy_test_instance() calls tso_fini() for a
+	 * client instance even when tso_init() was never reached or has
+	 * failed - confirm every tso_fini implementation tolerates that. */
+	LASSERT (rc != 0);
+	sfw_destroy_test_instance(tsi);
+	return rc;
+}
+
+/*
+ * Account for one finished test unit.
+ *
+ * When the last active unit of the instance finishes, the instance leaves
+ * the stopping state and the batch's active count is dropped.  If that was
+ * the batch's last active instance and the owning session is no longer the
+ * current one (a zombie) with no other active batch, the session is
+ * unlinked from its list and destroyed here.
+ */
+static void
+sfw_test_unit_done (sfw_test_unit_t *tsu)
+{
+	sfw_test_instance_t *tsi = tsu->tsu_instance;
+	sfw_batch_t	 *tsb = tsi->tsi_batch;
+	sfw_session_t       *sn = tsb->bat_session;
+
+	LASSERT (sfw_test_active(tsi));
+
+	/* other units of this instance are still running */
+	if (!atomic_dec_and_test(&tsi->tsi_nactive))
+		return;
+
+	/* the test instance is done */
+	spin_lock(&tsi->tsi_lock);
+
+	tsi->tsi_stopping = 0;
+
+	spin_unlock(&tsi->tsi_lock);
+
+	spin_lock(&sfw_data.fw_lock);
+
+	if (!atomic_dec_and_test(&tsb->bat_nactive) ||/* tsb still active */
+	    sn == sfw_data.fw_session) {		  /* sn also active */
+		spin_unlock(&sfw_data.fw_lock);
+		return;
+	}
+
+	LASSERT (!list_empty(&sn->sn_list)); /* I'm a zombie! */
+
+	/* the zombie session can only go once all its batches are idle;
+	 * note that tsb is reused as the loop cursor from here on */
+	list_for_each_entry (tsb, &sn->sn_batches, bat_list) {
+		if (sfw_batch_active(tsb)) {
+			spin_unlock(&sfw_data.fw_lock);
+			return;
+		}
+	}
+
+	/* unlink under the lock, destroy after dropping it */
+	list_del_init(&sn->sn_list);
+	spin_unlock(&sfw_data.fw_lock);
+
+	sfw_destroy_session(sn);
+	return;
+}
+
+/*
+ * Completion callback of a test RPC.  Lets the test process the result,
+ * takes the RPC off the instance's active list, drops the poster's
+ * reference, and then either reschedules the test unit for another round
+ * or reports the unit as done.
+ */
+void
+sfw_test_rpc_done (srpc_client_rpc_t *rpc)
+{
+	sfw_test_unit_t     *tsu = rpc->crpc_priv;
+	sfw_test_instance_t *tsi = tsu->tsu_instance;
+	int		  finished;
+
+	tsi->tsi_ops->tso_done_rpc(tsu, rpc);
+
+	spin_lock(&tsi->tsi_lock);
+
+	LASSERT (sfw_test_active(tsi));
+	LASSERT (!list_empty(&rpc->crpc_list));
+
+	list_del_init(&rpc->crpc_list);
+
+	/* The unit stops when the batch is stopping, its loop count is
+	 * exhausted, or the RPC failed and the test stops on error.  This
+	 * is decided before the reference on the RPC is dropped below. */
+	finished = tsi->tsi_stopping ||
+		   tsu->tsu_loop == 0 ||
+		   (rpc->crpc_status != 0 && tsi->tsi_stoptsu_onerr);
+
+	/* dec ref for poster */
+	srpc_client_rpc_decref(rpc);
+
+	spin_unlock(&tsi->tsi_lock);
+
+	if (finished) {
+		sfw_test_unit_done(tsu);
+	} else {
+		swi_schedule_workitem(&tsu->tsu_worker);
+	}
+}
+
+/*
+ * Obtain a client RPC for a test unit, reusing one from the instance's
+ * free list when available and allocating a new one otherwise.
+ *
+ * On success the RPC, stamped with the given session features, is stored
+ * in *rpcpp and 0 is returned.  Returns -ENOMEM if a new RPC cannot be
+ * created; *rpcpp is not written in that case.
+ */
+int
+sfw_create_test_rpc(sfw_test_unit_t *tsu, lnet_process_id_t peer,
+		    unsigned features, int nblk, int blklen,
+		    srpc_client_rpc_t **rpcpp)
+{
+	sfw_test_instance_t *tsi = tsu->tsu_instance;
+	srpc_client_rpc_t   *rpc = NULL;
+
+	spin_lock(&tsi->tsi_lock);
+
+	LASSERT (sfw_test_active(tsi));
+
+	/* try to recycle a cached request first */
+	if (!list_empty(&tsi->tsi_free_rpcs)) {
+		rpc = list_entry(tsi->tsi_free_rpcs.next,
+				 srpc_client_rpc_t, crpc_list);
+		LASSERT (nblk == rpc->crpc_bulk.bk_niov);
+		list_del_init(&rpc->crpc_list);
+	}
+
+	spin_unlock(&tsi->tsi_lock);
+
+	if (rpc != NULL) {
+		/* recycled: just re-initialize it in place */
+		srpc_init_client_rpc(rpc, peer, tsi->tsi_service, nblk,
+				     blklen, sfw_test_rpc_done,
+				     sfw_test_rpc_fini, tsu);
+	} else {
+		rpc = srpc_create_client_rpc(peer, tsi->tsi_service, nblk,
+					     blklen, sfw_test_rpc_done,
+					     sfw_test_rpc_fini, tsu);
+		if (rpc == NULL) {
+			CERROR("Can't create rpc for test %d\n",
+			       tsi->tsi_service);
+			return -ENOMEM;
+		}
+	}
+
+	rpc->crpc_reqstmsg.msg_ses_feats = features;
+	*rpcpp = rpc;
+
+	return 0;
+}
+
+/*
+ * Work-item handler of a test unit: prepare one test RPC and post it.
+ *
+ * Returns 0 after an RPC has been posted.  If the RPC cannot be prepared,
+ * or the instance is stopping, the work item is shut down with
+ * swi_exit_workitem(), the unit is reported done and 1 is returned.
+ * NOTE(review): the meaning of the 0/1 return value is defined by the
+ * work-item scheduler, which is not visible here - confirm.
+ */
+int
+sfw_run_test (swi_workitem_t *wi)
+{
+	sfw_test_unit_t     *tsu = wi->swi_workitem.wi_data;
+	sfw_test_instance_t *tsi = tsu->tsu_instance;
+	srpc_client_rpc_t   *rpc = NULL;
+
+	LASSERT (wi == &tsu->tsu_worker);
+
+	if (tsi->tsi_ops->tso_prep_rpc(tsu, tsu->tsu_dest, &rpc) != 0) {
+		LASSERT (rpc == NULL);
+		goto test_done;
+	}
+
+	LASSERT (rpc != NULL);
+
+	spin_lock(&tsi->tsi_lock);
+
+	if (tsi->tsi_stopping) {
+		/* do not post; park the prepared RPC on the free list */
+		list_add(&rpc->crpc_list, &tsi->tsi_free_rpcs);
+		spin_unlock(&tsi->tsi_lock);
+		goto test_done;
+	}
+
+	/* consume one iteration; a non-positive tsu_loop is never
+	 * decremented here */
+	if (tsu->tsu_loop > 0)
+		tsu->tsu_loop--;
+
+	list_add_tail(&rpc->crpc_list, &tsi->tsi_active_rpcs);
+	spin_unlock(&tsi->tsi_lock);
+
+	rpc->crpc_timeout = rpc_timeout;
+
+	spin_lock(&rpc->crpc_lock);
+	srpc_post_rpc(rpc);
+	spin_unlock(&rpc->crpc_lock);
+	return 0;
+
+test_done:
+	/*
+	 * No one can schedule me now since:
+	 * - previous RPC, if any, has done and
+	 * - no new RPC is initiated.
+	 * - my batch is still active; no one can run it again now.
+	 * Cancel pending schedules and prevent future schedule attempts:
+	 */
+	swi_exit_workitem(wi);
+	sfw_test_unit_done(tsu);
+	return 1;
+}
+
+/*
+ * Start a batch: for every client test instance, mark the instance and the
+ * batch active and schedule one work item per test unit on the test
+ * scheduler selected by the CPT of the unit's destination NID.  A batch
+ * that is already active is left alone.  Always returns 0.
+ */
+int
+sfw_run_batch (sfw_batch_t *tsb)
+{
+	sfw_test_unit_t     *tsu;
+	sfw_test_instance_t *tsi;
+
+	if (sfw_batch_active(tsb)) {
+		CDEBUG(D_NET, "Batch already active: "LPU64" (%d)\n",
+		       tsb->bat_id.bat_id, atomic_read(&tsb->bat_nactive));
+		return 0;
+	}
+
+	list_for_each_entry (tsi, &tsb->bat_tests, tsi_list) {
+		/* server instances have nothing to run */
+		if (!tsi->tsi_is_client)
+			continue;
+
+		LASSERT (!tsi->tsi_stopping);
+		LASSERT (!sfw_test_active(tsi));
+
+		atomic_inc(&tsb->bat_nactive);
+
+		list_for_each_entry (tsu, &tsi->tsi_units, tsu_list) {
+			struct cfs_wi_sched *sched;
+
+			atomic_inc(&tsi->tsi_nactive);
+			tsu->tsu_loop = tsi->tsi_loop;
+
+			sched = lst_sched_test[
+					lnet_cpt_of_nid(tsu->tsu_dest.nid)];
+			swi_init_workitem(&tsu->tsu_worker, tsu,
+					  sfw_run_test, sched);
+			swi_schedule_workitem(&tsu->tsu_worker);
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Ask every active client test instance of a batch to stop.
+ *
+ * Each such instance is flagged as stopping.  With a non-zero force, the
+ * RPCs currently on the instance's active list are additionally aborted
+ * with -EINTR.  An inactive batch is left alone.  Always returns 0.
+ */
+int
+sfw_stop_batch (sfw_batch_t *tsb, int force)
+{
+	sfw_test_instance_t *tsi;
+	srpc_client_rpc_t   *rpc;
+
+	if (!sfw_batch_active(tsb)) {
+		CDEBUG(D_NET, "Batch "LPU64" inactive\n", tsb->bat_id.bat_id);
+		return 0;
+	}
+
+	list_for_each_entry (tsi, &tsb->bat_tests, tsi_list) {
+		spin_lock(&tsi->tsi_lock);
+
+		/* skip servers, idle instances and those already stopping */
+		if (!tsi->tsi_is_client ||
+		    !sfw_test_active(tsi) || tsi->tsi_stopping) {
+			spin_unlock(&tsi->tsi_lock);
+			continue;
+		}
+
+		tsi->tsi_stopping = 1;
+
+		/* without force, in-flight RPCs are left to complete */
+		if (!force) {
+			spin_unlock(&tsi->tsi_lock);
+			continue;
+		}
+
+		/* abort launched rpcs in the test */
+		list_for_each_entry(rpc, &tsi->tsi_active_rpcs, crpc_list) {
+			/* crpc_lock is taken while tsi_lock is held */
+			spin_lock(&rpc->crpc_lock);
+
+			srpc_abort_rpc(rpc, -EINTR);
+
+			spin_unlock(&rpc->crpc_lock);
+		}
+
+		spin_unlock(&tsi->tsi_lock);
+	}
+
+	return 0;
+}
+
+/*
+ * Report activity of a batch or of one of its test instances.
+ *
+ * testidx 0 stores the batch's active count in reply->bar_active; a
+ * positive testidx selects the testidx-th instance (1-based) on the
+ * batch's test list and stores that instance's active count.
+ *
+ * Returns 0 on success, -EINVAL for a negative index, -ENOENT when the
+ * batch has fewer than testidx instances.
+ */
+int
+sfw_query_batch (sfw_batch_t *tsb, int testidx, srpc_batch_reply_t *reply)
+{
+	sfw_test_instance_t *tsi;
+	int		     pos = 0;
+
+	if (testidx < 0)
+		return -EINVAL;
+
+	if (testidx == 0) {
+		reply->bar_active = atomic_read(&tsb->bat_nactive);
+		return 0;
+	}
+
+	list_for_each_entry (tsi, &tsb->bat_tests, tsi_list) {
+		pos++;
+		if (pos == testidx) {
+			reply->bar_active = atomic_read(&tsi->tsi_nactive);
+			return 0;
+		}
+	}
+
+	return -ENOENT;
+}
+
+/*
+ * Release the bulk descriptor attached to a server RPC and clear the
+ * RPC's pointer to it.
+ */
+void
+sfw_free_pages (srpc_server_rpc_t *rpc)
+{
+	srpc_bulk_t *bulk = rpc->srpc_bulk;
+
+	srpc_free_bulk(bulk);
+	rpc->srpc_bulk = NULL;
+}
+
+/*
+ * Attach a newly allocated bulk descriptor of npages pages to a server
+ * RPC that has none yet.  cpt, len and sink are passed through to
+ * srpc_alloc_bulk().  Returns 0 on success, -ENOMEM on allocation failure.
+ */
+int
+sfw_alloc_pages(struct srpc_server_rpc *rpc, int cpt, int npages, int len,
+		int sink)
+{
+	LASSERT(rpc->srpc_bulk == NULL);
+	LASSERT(npages > 0 && npages <= LNET_MAX_IOV);
+
+	rpc->srpc_bulk = srpc_alloc_bulk(cpt, npages, len, sink);
+
+	return (rpc->srpc_bulk != NULL) ? 0 : -ENOMEM;
+}
+
+/*
+ * Handle an add-test request.
+ *
+ * Protocol-level failures are reported to the peer as a positive errno in
+ * reply->tsr_status with a return value of 0.  A negative return value is
+ * used only when no batch could be obtained (-ENOMEM) or when allocating
+ * the bulk pages fails.
+ *
+ * A client test is handled in two passes: the first call (no bulk attached
+ * yet) only allocates pages for the destination ID list and returns; the
+ * request is processed again, via sfw_bulk_ready(), once that bulk is in.
+ */
+int
+sfw_add_test (srpc_server_rpc_t *rpc)
+{
+	sfw_session_t     *sn = sfw_data.fw_session;
+	srpc_test_reply_t *reply = &rpc->srpc_replymsg.msg_body.tes_reply;
+	srpc_test_reqst_t *request;
+	int		rc;
+	sfw_batch_t       *bat;
+
+	request = &rpc->srpc_reqstbuf->buf_msg.msg_body.tes_reqst;
+	reply->tsr_sid = (sn == NULL) ? LST_INVALID_SID : sn->sn_id;
+
+	/* reject malformed requests: zero loop/concurrency, wildcard session
+	 * NID, too many or (for a client) no destinations, excessive
+	 * concurrency, or a service id outside the test-service range */
+	if (request->tsr_loop == 0 ||
+	    request->tsr_concur == 0 ||
+	    request->tsr_sid.ses_nid == LNET_NID_ANY ||
+	    request->tsr_ndest > SFW_MAX_NDESTS ||
+	    (request->tsr_is_client && request->tsr_ndest == 0) ||
+	    request->tsr_concur > SFW_MAX_CONCUR ||
+	    request->tsr_service > SRPC_SERVICE_MAX_ID ||
+	    request->tsr_service <= SRPC_FRAMEWORK_SERVICE_MAX_ID) {
+		reply->tsr_status = EINVAL;
+		return 0;
+	}
+
+	/* the request must target the current session and a known test */
+	if (sn == NULL || !sfw_sid_equal(request->tsr_sid, sn->sn_id) ||
+	    sfw_find_test_case(request->tsr_service) == NULL) {
+		reply->tsr_status = ENOENT;
+		return 0;
+	}
+
+	bat = sfw_bid2batch(request->tsr_bid);
+	if (bat == NULL) {
+		CERROR ("Dropping RPC (%s) from %s under memory pressure.\n",
+			rpc->srpc_scd->scd_svc->sv_name,
+			libcfs_id2str(rpc->srpc_peer));
+		return -ENOMEM;
+	}
+
+	/* tests cannot be added to a running batch */
+	if (sfw_batch_active(bat)) {
+		reply->tsr_status = EBUSY;
+		return 0;
+	}
+
+	if (request->tsr_is_client && rpc->srpc_bulk == NULL) {
+		/* rpc will be resumed later in sfw_bulk_ready */
+		int	npg = sfw_id_pages(request->tsr_ndest);
+		int	len;
+
+		/* without LST_FEAT_BULK_LEN the bulk length is whole pages;
+		 * with it, exactly the size of the packed ID array */
+		if ((sn->sn_features & LST_FEAT_BULK_LEN) == 0) {
+			len = npg * PAGE_CACHE_SIZE;
+
+		} else  {
+			len = sizeof(lnet_process_id_packed_t) *
+			      request->tsr_ndest;
+		}
+
+		return sfw_alloc_pages(rpc, CFS_CPT_ANY, npg, len, 1);
+	}
+
+	rc = sfw_add_test_instance(bat, rpc);
+	CDEBUG (rc == 0 ? D_NET : D_WARNING,
+		"%s test: sv %d %s, loop %d, concur %d, ndest %d\n",
+		rc == 0 ? "Added" : "Failed to add", request->tsr_service,
+		request->tsr_is_client ? "client" : "server",
+		request->tsr_loop, request->tsr_concur, request->tsr_ndest);
+
+	/* the reply carries a positive errno */
+	reply->tsr_status = (rc < 0) ? -rc : rc;
+	return 0;
+}
+
+/*
+ * Handle a batch control request (run, stop or query).
+ *
+ * Outcomes are reported to the peer as a positive errno in
+ * reply->bar_status with a return value of 0: ESRCH when the request does
+ * not match the current session, ENOENT when the batch is unknown,
+ * otherwise the result of the operation.  An unknown opcode returns
+ * -EINVAL so that the request is dropped.
+ */
+int
+sfw_control_batch (srpc_batch_reqst_t *request, srpc_batch_reply_t *reply)
+{
+	sfw_session_t *sn = sfw_data.fw_session;
+	sfw_batch_t   *bat;
+	int	    rc;
+
+	if (sn == NULL) {
+		reply->bar_sid = LST_INVALID_SID;
+		reply->bar_status = ESRCH;
+		return 0;
+	}
+
+	reply->bar_sid = sn->sn_id;
+
+	if (!sfw_sid_equal(request->bar_sid, sn->sn_id)) {
+		reply->bar_status = ESRCH;
+		return 0;
+	}
+
+	bat = sfw_find_batch(request->bar_bid);
+	if (bat == NULL) {
+		reply->bar_status = ENOENT;
+		return 0;
+	}
+
+	switch (request->bar_opc) {
+	case SRPC_BATCH_OPC_RUN:
+		rc = sfw_run_batch(bat);
+		break;
+
+	case SRPC_BATCH_OPC_STOP:
+		rc = sfw_stop_batch(bat, request->bar_arg);
+		break;
+
+	case SRPC_BATCH_OPC_QUERY:
+		rc = sfw_query_batch(bat, request->bar_testidx, reply);
+		break;
+
+	default:
+		return -EINVAL; /* drop it */
+	}
+
+	/* the reply carries a positive errno */
+	if (rc < 0)
+		rc = -rc;
+	reply->bar_status = rc;
+
+	return 0;
+}
+
+/*
+ * Handler for all framework services (sv_id up to
+ * SRPC_FRAMEWORK_SERVICE_MAX_ID).
+ *
+ * Only one framework RPC is processed at a time: the RPC is recorded in
+ * sfw_data.fw_active_srpc and the session expiry timer is removed while it
+ * is being handled, then re-armed afterwards unless the framework is
+ * shutting down.
+ *
+ * Returns -ESHUTDOWN while shutting down, -EAGAIN when the session timer
+ * could not be removed, otherwise the result of the per-service handler.
+ * Session feature mismatches are reported to the peer as EPROTO in the
+ * reply with a return value of 0.
+ */
+int
+sfw_handle_server_rpc(struct srpc_server_rpc *rpc)
+{
+	struct srpc_service	*sv = rpc->srpc_scd->scd_svc;
+	srpc_msg_t     *reply	= &rpc->srpc_replymsg;
+	srpc_msg_t     *request	= &rpc->srpc_reqstbuf->buf_msg;
+	unsigned	features = LST_FEATS_MASK;
+	int		rc = 0;
+
+	LASSERT(sfw_data.fw_active_srpc == NULL);
+	LASSERT(sv->sv_id <= SRPC_FRAMEWORK_SERVICE_MAX_ID);
+
+	spin_lock(&sfw_data.fw_lock);
+
+	if (sfw_data.fw_shuttingdown) {
+		spin_unlock(&sfw_data.fw_lock);
+		return -ESHUTDOWN;
+	}
+
+	/* Remove timer to avoid racing with it or expiring active session */
+	if (sfw_del_session_timer() != 0) {
+		/* message is newline-terminated like every other log line */
+		CERROR("Dropping RPC (%s) from %s: racing with expiry timer.\n",
+		       sv->sv_name, libcfs_id2str(rpc->srpc_peer));
+		spin_unlock(&sfw_data.fw_lock);
+		return -EAGAIN;
+	}
+
+	sfw_data.fw_active_srpc = rpc;
+	spin_unlock(&sfw_data.fw_lock);
+
+	sfw_unpack_message(request);
+	LASSERT(request->msg_type == srpc_service2request(sv->sv_id));
+
+	/* rpc module should have checked this */
+	LASSERT(request->msg_version == SRPC_MSG_VERSION);
+
+	if (sv->sv_id != SRPC_SERVICE_MAKE_SESSION &&
+	    sv->sv_id != SRPC_SERVICE_DEBUG) {
+		sfw_session_t *sn = sfw_data.fw_session;
+
+		/* requests bound to a session must carry its features */
+		if (sn != NULL &&
+		    sn->sn_features != request->msg_ses_feats) {
+			CNETERR("Features of framework RPC don't match "
+				"features of current session: %x/%x\n",
+				request->msg_ses_feats, sn->sn_features);
+			reply->msg_body.reply.status = EPROTO;
+			reply->msg_body.reply.sid    = sn->sn_id;
+			goto out;
+		}
+
+	} else if ((request->msg_ses_feats & ~LST_FEATS_MASK) != 0) {
+		/* NB: at this point, old version will ignore features and
+		 * create new session anyway, so console should be able
+		 * to handle this */
+		reply->msg_body.reply.status = EPROTO;
+		goto out;
+	}
+
+	/* dispatch on the service id; any id not listed below is a bug */
+	switch(sv->sv_id) {
+	default:
+		LBUG ();
+	case SRPC_SERVICE_TEST:
+		rc = sfw_add_test(rpc);
+		break;
+
+	case SRPC_SERVICE_BATCH:
+		rc = sfw_control_batch(&request->msg_body.bat_reqst,
+				       &reply->msg_body.bat_reply);
+		break;
+
+	case SRPC_SERVICE_QUERY_STAT:
+		rc = sfw_get_stats(&request->msg_body.stat_reqst,
+				   &reply->msg_body.stat_reply);
+		break;
+
+	case SRPC_SERVICE_DEBUG:
+		rc = sfw_debug_session(&request->msg_body.dbg_reqst,
+				       &reply->msg_body.dbg_reply);
+		break;
+
+	case SRPC_SERVICE_MAKE_SESSION:
+		rc = sfw_make_session(&request->msg_body.mksn_reqst,
+				      &reply->msg_body.mksn_reply);
+		break;
+
+	case SRPC_SERVICE_REMOVE_SESSION:
+		rc = sfw_remove_session(&request->msg_body.rmsn_reqst,
+					&reply->msg_body.rmsn_reply);
+		break;
+	}
+
+	/* advertise the features of whatever session is current now; the
+	 * EPROTO paths above skip this and reply with LST_FEATS_MASK */
+	if (sfw_data.fw_session != NULL)
+		features = sfw_data.fw_session->sn_features;
+ out:
+	reply->msg_ses_feats = features;
+	rpc->srpc_done = sfw_server_rpc_done;
+	spin_lock(&sfw_data.fw_lock);
+
+	if (!sfw_data.fw_shuttingdown)
+		sfw_add_session_timer();
+
+	sfw_data.fw_active_srpc = NULL;
+	spin_unlock(&sfw_data.fw_lock);
+	return rc;
+}
+
+/*
+ * Bulk-ready callback of the test service: the destination ID list of a
+ * client add-test request has been transferred (or the transfer failed),
+ * so processing of the request is resumed through sfw_add_test().
+ *
+ * As in sfw_handle_server_rpc(), the RPC is recorded as the active
+ * framework RPC and the session timer is removed while it is handled.
+ *
+ * Returns -EIO if the bulk transfer failed, -ESHUTDOWN while shutting
+ * down, -EAGAIN when the session timer could not be removed, otherwise
+ * the result of sfw_add_test().
+ */
+int
+sfw_bulk_ready(struct srpc_server_rpc *rpc, int status)
+{
+	struct srpc_service	*sv = rpc->srpc_scd->scd_svc;
+	int			rc;
+
+	LASSERT(rpc->srpc_bulk != NULL);
+	LASSERT(sv->sv_id == SRPC_SERVICE_TEST);
+	LASSERT(sfw_data.fw_active_srpc == NULL);
+	LASSERT(rpc->srpc_reqstbuf->buf_msg.msg_body.tes_reqst.tsr_is_client);
+
+	spin_lock(&sfw_data.fw_lock);
+
+	if (status != 0) {
+		CERROR("Bulk transfer failed for RPC: "
+		       "service %s, peer %s, status %d\n",
+		       sv->sv_name, libcfs_id2str(rpc->srpc_peer), status);
+		spin_unlock(&sfw_data.fw_lock);
+		return -EIO;
+	}
+
+	if (sfw_data.fw_shuttingdown) {
+		spin_unlock(&sfw_data.fw_lock);
+		return -ESHUTDOWN;
+	}
+
+	/* Remove timer to avoid racing with it or expiring active session */
+	if (sfw_del_session_timer() != 0) {
+		/* message is newline-terminated like every other log line */
+		CERROR("Dropping RPC (%s) from %s: racing with expiry timer\n",
+		       sv->sv_name, libcfs_id2str(rpc->srpc_peer));
+		spin_unlock(&sfw_data.fw_lock);
+		return -EAGAIN;
+	}
+
+	sfw_data.fw_active_srpc = rpc;
+	spin_unlock(&sfw_data.fw_lock);
+
+	rc = sfw_add_test(rpc);
+
+	spin_lock(&sfw_data.fw_lock);
+
+	if (!sfw_data.fw_shuttingdown)
+		sfw_add_session_timer();
+
+	sfw_data.fw_active_srpc = NULL;
+	spin_unlock(&sfw_data.fw_lock);
+	return rc;
+}
+
+/*
+ * Create a client RPC for a framework service.
+ *
+ * A request without bulk (nbulkiov == 0) reuses an RPC from the
+ * framework's zombie RPC list when one is available; otherwise a new RPC
+ * is allocated.  RPCs without bulk get sfw_client_rpc_fini as their fini
+ * callback, RPCs with bulk get none.
+ *
+ * Returns the RPC, stamped with the given session features, or NULL if
+ * allocation fails.  Must not be called while the framework is shutting
+ * down.
+ */
+srpc_client_rpc_t *
+sfw_create_rpc(lnet_process_id_t peer, int service,
+	       unsigned features, int nbulkiov, int bulklen,
+	       void (*done)(srpc_client_rpc_t *), void *priv)
+{
+	srpc_client_rpc_t *rpc = NULL;
+
+	spin_lock(&sfw_data.fw_lock);
+
+	LASSERT (!sfw_data.fw_shuttingdown);
+	LASSERT (service <= SRPC_FRAMEWORK_SERVICE_MAX_ID);
+
+	/* only bulk-less RPCs are recycled from the zombie list */
+	if (nbulkiov == 0 && !list_empty(&sfw_data.fw_zombie_rpcs)) {
+		rpc = list_entry(sfw_data.fw_zombie_rpcs.next,
+				     srpc_client_rpc_t, crpc_list);
+		list_del(&rpc->crpc_list);
+
+		srpc_init_client_rpc(rpc, peer, service, 0, 0,
+				     done, sfw_client_rpc_fini, priv);
+	}
+
+	spin_unlock(&sfw_data.fw_lock);
+
+	/* nothing to recycle: allocate outside the lock */
+	if (rpc == NULL) {
+		rpc = srpc_create_client_rpc(peer, service,
+					     nbulkiov, bulklen, done,
+					     nbulkiov != 0 ?  NULL :
+					     sfw_client_rpc_fini,
+					     priv);
+	}
+
+	if (rpc != NULL) /* "session" is concept in framework */
+		rpc->crpc_reqstmsg.msg_ses_feats = features;
+
+	return rpc;
+}
+
+/*
+ * Byte-swap the body of a framework message that arrived in the opposite
+ * byte order, according to its message type.  A message whose magic
+ * already equals SRPC_MSG_MAGIC is left untouched; an unknown message type
+ * is a bug.
+ */
+void
+sfw_unpack_message (srpc_msg_t *msg)
+{
+	if (msg->msg_magic == SRPC_MSG_MAGIC)
+		return; /* no flipping needed */
+
+	/* srpc module should guarantee I wouldn't get crap */
+	LASSERT (msg->msg_magic == __swab32(SRPC_MSG_MAGIC));
+
+	switch (msg->msg_type) {
+	case SRPC_MSG_STAT_REQST: {
+		srpc_stat_reqst_t *req = &msg->msg_body.stat_reqst;
+
+		__swab32s(&req->str_type);
+		__swab64s(&req->str_rpyid);
+		sfw_unpack_sid(req->str_sid);
+		break;
+	}
+
+	case SRPC_MSG_STAT_REPLY: {
+		srpc_stat_reply_t *rep = &msg->msg_body.stat_reply;
+
+		__swab32s(&rep->str_status);
+		sfw_unpack_sid(rep->str_sid);
+		sfw_unpack_fw_counters(rep->str_fw);
+		sfw_unpack_rpc_counters(rep->str_rpc);
+		sfw_unpack_lnet_counters(rep->str_lnet);
+		break;
+	}
+
+	case SRPC_MSG_MKSN_REQST: {
+		srpc_mksn_reqst_t *req = &msg->msg_body.mksn_reqst;
+
+		__swab64s(&req->mksn_rpyid);
+		__swab32s(&req->mksn_force);
+		sfw_unpack_sid(req->mksn_sid);
+		break;
+	}
+
+	case SRPC_MSG_MKSN_REPLY: {
+		srpc_mksn_reply_t *rep = &msg->msg_body.mksn_reply;
+
+		__swab32s(&rep->mksn_status);
+		__swab32s(&rep->mksn_timeout);
+		sfw_unpack_sid(rep->mksn_sid);
+		break;
+	}
+
+	case SRPC_MSG_RMSN_REQST: {
+		srpc_rmsn_reqst_t *req = &msg->msg_body.rmsn_reqst;
+
+		__swab64s(&req->rmsn_rpyid);
+		sfw_unpack_sid(req->rmsn_sid);
+		break;
+	}
+
+	case SRPC_MSG_RMSN_REPLY: {
+		srpc_rmsn_reply_t *rep = &msg->msg_body.rmsn_reply;
+
+		__swab32s(&rep->rmsn_status);
+		sfw_unpack_sid(rep->rmsn_sid);
+		break;
+	}
+
+	case SRPC_MSG_DEBUG_REQST: {
+		srpc_debug_reqst_t *req = &msg->msg_body.dbg_reqst;
+
+		__swab64s(&req->dbg_rpyid);
+		__swab32s(&req->dbg_flags);
+		sfw_unpack_sid(req->dbg_sid);
+		break;
+	}
+
+	case SRPC_MSG_DEBUG_REPLY: {
+		srpc_debug_reply_t *rep = &msg->msg_body.dbg_reply;
+
+		__swab32s(&rep->dbg_nbatch);
+		__swab32s(&rep->dbg_timeout);
+		sfw_unpack_sid(rep->dbg_sid);
+		break;
+	}
+
+	case SRPC_MSG_BATCH_REQST: {
+		srpc_batch_reqst_t *req = &msg->msg_body.bat_reqst;
+
+		__swab32s(&req->bar_opc);
+		__swab64s(&req->bar_rpyid);
+		__swab32s(&req->bar_testidx);
+		__swab32s(&req->bar_arg);
+		sfw_unpack_sid(req->bar_sid);
+		__swab64s(&req->bar_bid.bat_id);
+		break;
+	}
+
+	case SRPC_MSG_BATCH_REPLY: {
+		srpc_batch_reply_t *rep = &msg->msg_body.bat_reply;
+
+		__swab32s(&rep->bar_status);
+		sfw_unpack_sid(rep->bar_sid);
+		break;
+	}
+
+	case SRPC_MSG_TEST_REQST: {
+		srpc_test_reqst_t *req = &msg->msg_body.tes_reqst;
+
+		__swab64s(&req->tsr_rpyid);
+		__swab64s(&req->tsr_bulkid);
+		__swab32s(&req->tsr_loop);
+		__swab32s(&req->tsr_ndest);
+		__swab32s(&req->tsr_concur);
+		__swab32s(&req->tsr_service);
+		sfw_unpack_sid(req->tsr_sid);
+		__swab64s(&req->tsr_bid.bat_id);
+		break;
+	}
+
+	case SRPC_MSG_TEST_REPLY: {
+		srpc_test_reply_t *rep = &msg->msg_body.tes_reply;
+
+		__swab32s(&rep->tsr_status);
+		sfw_unpack_sid(rep->tsr_sid);
+		break;
+	}
+
+	case SRPC_MSG_JOIN_REQST: {
+		srpc_join_reqst_t *req = &msg->msg_body.join_reqst;
+
+		__swab64s(&req->join_rpyid);
+		sfw_unpack_sid(req->join_sid);
+		break;
+	}
+
+	case SRPC_MSG_JOIN_REPLY: {
+		srpc_join_reply_t *rep = &msg->msg_body.join_reply;
+
+		__swab32s(&rep->join_status);
+		__swab32s(&rep->join_timeout);
+		sfw_unpack_sid(rep->join_sid);
+		break;
+	}
+
+	default:
+		LBUG ();
+	}
+}
+
+/*
+ * Abort a framework client RPC with -EINTR.  The caller must hold a
+ * reference on the RPC; the RPC's own lock is taken around the abort.
+ */
+void
+sfw_abort_rpc (srpc_client_rpc_t *rpc)
+{
+	LASSERT(atomic_read(&rpc->crpc_refcount) > 0);
+	LASSERT(rpc->crpc_service <= SRPC_FRAMEWORK_SERVICE_MAX_ID);
+
+	spin_lock(&rpc->crpc_lock);
+
+	srpc_abort_rpc(rpc, -EINTR);
+
+	spin_unlock(&rpc->crpc_lock);
+}
+
+/*
+ * Post a framework client RPC using the module-wide rpc_timeout.  The RPC
+ * must be fresh (neither closed nor aborted), must not be linked on any
+ * list, and the framework must not be shutting down.
+ */
+void
+sfw_post_rpc (srpc_client_rpc_t *rpc)
+{
+	spin_lock(&rpc->crpc_lock);
+
+	/* sanity checks are made with the RPC lock held */
+	LASSERT (!rpc->crpc_closed);
+	LASSERT (!rpc->crpc_aborted);
+	LASSERT (list_empty(&rpc->crpc_list));
+	LASSERT (!sfw_data.fw_shuttingdown);
+
+	rpc->crpc_timeout = rpc_timeout;
+	srpc_post_rpc(rpc);
+
+	spin_unlock(&rpc->crpc_lock);
+}
+
+/*
+ * Table of framework services, terminated by an entry whose sv_name is
+ * NULL.  Only the id and name are set here; every other member starts out
+ * zeroed and the handler, bulk-ready callback and work-item count are
+ * filled in by sfw_startup().
+ */
+static srpc_service_t sfw_services[] =
+{
+	{
+		.sv_id   = SRPC_SERVICE_DEBUG,
+		.sv_name = "debug",
+	},
+	{
+		.sv_id   = SRPC_SERVICE_QUERY_STAT,
+		.sv_name = "query stats",
+	},
+	{
+		.sv_id   = SRPC_SERVICE_MAKE_SESSION,
+		.sv_name = "make session",
+	},
+	{
+		.sv_id   = SRPC_SERVICE_REMOVE_SESSION,
+		.sv_name = "remove session",
+	},
+	{
+		.sv_id   = SRPC_SERVICE_BATCH,
+		.sv_name = "batch service",
+	},
+	{
+		.sv_id   = SRPC_SERVICE_TEST,
+		.sv_name = "test service",
+	},
+	{
+		/* terminator */
+		.sv_id   = 0,
+		.sv_name = NULL,
+	}
+};
+
+/* PING test: client ops, server service and their init hooks; defined
+ * outside this file and registered by sfw_startup() */
+extern sfw_test_client_ops_t ping_test_client;
+extern srpc_service_t	ping_test_service;
+extern void ping_init_test_client(void);
+extern void ping_init_test_service(void);
+
+/* BRW test: client ops, server service and their init hooks; defined
+ * outside this file and registered by sfw_startup() */
+extern sfw_test_client_ops_t brw_test_client;
+extern srpc_service_t	brw_test_service;
+extern void brw_init_test_client(void);
+extern void brw_init_test_service(void);
+
+
+/*
+ * Start the selftest framework: validate the timeout parameters, reset
+ * the global framework state, register the BRW and PING tests, then add
+ * every test service and every framework service to the RPC layer and
+ * reserve request buffers for the framework services.
+ *
+ * Returns 0 on success.  Returns -EINVAL for a negative session_timeout
+ * or rpc_timeout.  If adding a service or reserving buffers fails, the
+ * framework is shut down again via sfw_shutdown() before the error is
+ * returned, so the caller must not call sfw_shutdown() after a failure.
+ */
+int
+sfw_startup (void)
+{
+	int	      i;
+	int	      rc;
+	int	      error;
+	srpc_service_t  *sv;
+	sfw_test_case_t *tsc;
+
+
+	if (session_timeout < 0) {
+		CERROR ("Session timeout must be non-negative: %d\n",
+			session_timeout);
+		return -EINVAL;
+	}
+
+	if (rpc_timeout < 0) {
+		CERROR ("RPC timeout must be non-negative: %d\n",
+			rpc_timeout);
+		return -EINVAL;
+	}
+
+	/* zero timeouts are accepted but worth a warning */
+	if (session_timeout == 0)
+		CWARN ("Zero session_timeout specified "
+		       "- test sessions never expire.\n");
+
+	if (rpc_timeout == 0)
+		CWARN ("Zero rpc_timeout specified "
+		       "- test RPC never expire.\n");
+
+	memset(&sfw_data, 0, sizeof(struct smoketest_framework));
+
+	sfw_data.fw_session     = NULL;
+	sfw_data.fw_active_srpc = NULL;
+	spin_lock_init(&sfw_data.fw_lock);
+	atomic_set(&sfw_data.fw_nzombies, 0);
+	INIT_LIST_HEAD(&sfw_data.fw_tests);
+	INIT_LIST_HEAD(&sfw_data.fw_zombie_rpcs);
+	INIT_LIST_HEAD(&sfw_data.fw_zombie_sessions);
+
+	/* register the built-in tests; failure here is treated as a bug */
+	brw_init_test_client();
+	brw_init_test_service();
+	rc = sfw_register_test(&brw_test_service, &brw_test_client);
+	LASSERT (rc == 0);
+
+	ping_init_test_client();
+	ping_init_test_service();
+	rc = sfw_register_test(&ping_test_service, &ping_test_client);
+	LASSERT (rc == 0);
+
+	/* add the service of every registered test; keep going on failure
+	 * and remember the last error */
+	error = 0;
+	list_for_each_entry (tsc, &sfw_data.fw_tests, tsc_list) {
+		sv = tsc->tsc_srv_service;
+
+		rc = srpc_add_service(sv);
+		LASSERT (rc != -EBUSY);
+		if (rc != 0) {
+			CWARN ("Failed to add %s service: %d\n",
+			       sv->sv_name, rc);
+			error = rc;
+		}
+	}
+
+	/* add the framework services; the table ends at a NULL sv_name */
+	for (i = 0; ; i++) {
+		sv = &sfw_services[i];
+		if (sv->sv_name == NULL) break;
+
+		sv->sv_bulk_ready = NULL;
+		sv->sv_handler    = sfw_handle_server_rpc;
+		sv->sv_wi_total   = SFW_FRWK_WI_MAX;
+		/* only the test service receives bulk data */
+		if (sv->sv_id == SRPC_SERVICE_TEST)
+			sv->sv_bulk_ready = sfw_bulk_ready;
+
+		rc = srpc_add_service(sv);
+		LASSERT (rc != -EBUSY);
+		if (rc != 0) {
+			CWARN ("Failed to add %s service: %d\n",
+			       sv->sv_name, rc);
+			error = rc;
+		}
+
+		/* about to sfw_shutdown, no need to add buffer */
+		if (error) continue;
+
+		rc = srpc_service_add_buffers(sv, sv->sv_wi_total);
+		if (rc != 0) {
+			CWARN("Failed to reserve enough buffers: "
+			      "service %s, %d needed: %d\n",
+			      sv->sv_name, sv->sv_wi_total, rc);
+			error = -ENOMEM;
+		}
+	}
+
+	/* undo everything on any failure */
+	if (error != 0)
+		sfw_shutdown();
+	return error;
+}
+
+/*
+ * Shut the selftest framework down.
+ *
+ * Marks the framework as shutting down, waits for the active framework
+ * RPC (if any) to finish, retires the current session and waits until all
+ * zombie sessions are gone.  Then all framework and test services are
+ * shut down and removed, cached zombie RPCs are freed, and once every
+ * service has completed its shutdown the registered test cases are freed.
+ */
+void
+sfw_shutdown (void)
+{
+	srpc_service_t	*sv;
+	sfw_test_case_t	*tsc;
+	int		 i;
+
+	spin_lock(&sfw_data.fw_lock);
+
+	/* from now on the RPC handlers refuse work with -ESHUTDOWN */
+	sfw_data.fw_shuttingdown = 1;
+	lst_wait_until(sfw_data.fw_active_srpc == NULL, sfw_data.fw_lock,
+		       "waiting for active RPC to finish.\n");
+
+	/* a non-zero result means the timer could not be removed; wait for
+	 * it to fire and clear the session */
+	if (sfw_del_session_timer() != 0)
+		lst_wait_until(sfw_data.fw_session == NULL, sfw_data.fw_lock,
+			       "waiting for session timer to explode.\n");
+
+	sfw_deactivate_session();
+	lst_wait_until(atomic_read(&sfw_data.fw_nzombies) == 0,
+		       sfw_data.fw_lock,
+		       "waiting for %d zombie sessions to die.\n",
+		       atomic_read(&sfw_data.fw_nzombies));
+
+	spin_unlock(&sfw_data.fw_lock);
+
+	/* start shutdown of the framework services ... */
+	for (i = 0; ; i++) {
+		sv = &sfw_services[i];
+		if (sv->sv_name == NULL)
+			break;
+
+		srpc_shutdown_service(sv);
+		srpc_remove_service(sv);
+	}
+
+	/* ... and of the test services */
+	list_for_each_entry (tsc, &sfw_data.fw_tests, tsc_list) {
+		sv = tsc->tsc_srv_service;
+		srpc_shutdown_service(sv);
+		srpc_remove_service(sv);
+	}
+
+	/* free the client RPCs cached for reuse */
+	while (!list_empty(&sfw_data.fw_zombie_rpcs)) {
+		srpc_client_rpc_t *rpc;
+
+		rpc = list_entry(sfw_data.fw_zombie_rpcs.next,
+				     srpc_client_rpc_t, crpc_list);
+		list_del(&rpc->crpc_list);
+
+		LIBCFS_FREE(rpc, srpc_client_rpc_size(rpc));
+	}
+
+	/* wait for the framework services to finish shutting down */
+	for (i = 0; ; i++) {
+		sv = &sfw_services[i];
+		if (sv->sv_name == NULL)
+			break;
+
+		srpc_wait_service_shutdown(sv);
+	}
+
+	/* wait for each test service, then free its test case */
+	while (!list_empty(&sfw_data.fw_tests)) {
+		tsc = list_entry(sfw_data.fw_tests.next,
+				     sfw_test_case_t, tsc_list);
+
+		srpc_wait_service_shutdown(tsc->tsc_srv_service);
+
+		list_del(&tsc->tsc_list);
+		LIBCFS_FREE(tsc, sizeof(*tsc));
+	}
+
+	return;
+}

diff --git a/drivers/staging/lustre/lnet/selftest/module.c b/drivers/staging/lustre/lnet/selftest/module.c
new file mode 100644
index 0000000..5257e56
--- /dev/null
+++ b/drivers/staging/lustre/lnet/selftest/module.c

@@ -0,0 +1,169 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include "selftest.h"
+
+/*
+ * Initialization milestones, in the order lnet_selftest_init() reaches
+ * them.  lnet_selftest_fini() switches on the last milestone reached and
+ * undoes that step and every earlier one.
+ */
+enum {
+	LST_INIT_NONE		= 0,	/* nothing initialized yet */
+	LST_INIT_WI_SERIAL,		/* serial WI scheduler created */
+	LST_INIT_WI_TEST,		/* lst_sched_test array allocated */
+	LST_INIT_RPC,			/* srpc_startup() succeeded */
+	LST_INIT_FW,			/* sfw_startup() succeeded */
+	LST_INIT_CONSOLE		/* lstcon_console_init() succeeded */
+};
+
+/* console start/stop hooks, defined outside this file */
+extern int lstcon_console_init(void);
+extern int lstcon_console_fini(void);
+
+/* last initialization milestone reached (one of LST_INIT_*) */
+static int lst_init_step = LST_INIT_NONE;
+
+/* single work-item scheduler created with CFS_CPT_ANY */
+struct cfs_wi_sched *lst_sched_serial;
+/* array of per-CPT work-item schedulers, one entry per CPT of
+ * lnet_cpt_table() */
+struct cfs_wi_sched **lst_sched_test;
+
+/*
+ * Tear down whatever lnet_selftest_init() managed to set up.  The switch
+ * enters at the last milestone recorded in lst_init_step and deliberately
+ * falls through every earlier one, undoing the steps in reverse order.
+ */
+void
+lnet_selftest_fini(void)
+{
+	int	i;
+
+	switch (lst_init_step) {
+		case LST_INIT_CONSOLE:
+			lstcon_console_fini();
+			/* fall through */
+		case LST_INIT_FW:
+			sfw_shutdown();
+			/* fall through */
+		case LST_INIT_RPC:
+			srpc_shutdown();
+			/* fall through */
+		case LST_INIT_WI_TEST:
+			/* entries are NULL for schedulers that were never
+			 * created (init failed part-way through) */
+			for (i = 0;
+			     i < cfs_cpt_number(lnet_cpt_table()); i++) {
+				if (lst_sched_test[i] == NULL)
+					continue;
+				cfs_wi_sched_destroy(lst_sched_test[i]);
+			}
+			LIBCFS_FREE(lst_sched_test,
+				    sizeof(lst_sched_test[0]) *
+				    cfs_cpt_number(lnet_cpt_table()));
+			lst_sched_test = NULL;
+
+			/* fall through */
+		case LST_INIT_WI_SERIAL:
+			cfs_wi_sched_destroy(lst_sched_serial);
+			lst_sched_serial = NULL;
+			/* fall through */
+		case LST_INIT_NONE:
+			break;
+		default:
+			LBUG();
+	}
+	return;
+}
+
+/*
+ * Compile-time checks (CLASSERT) pinning the size of several RPC message
+ * structures and the offset of two add-test request fields.
+ * NOTE(review): presumably these guard the on-wire layout against
+ * accidental changes - confirm against the protocol definition.
+ */
+void
+lnet_selftest_structure_assertion(void)
+{
+	CLASSERT(sizeof(srpc_msg_t) == 160);
+	CLASSERT(sizeof(srpc_test_reqst_t) == 70);
+	CLASSERT(offsetof(srpc_msg_t, msg_body.tes_reqst.tsr_concur) == 72);
+	CLASSERT(offsetof(srpc_msg_t, msg_body.tes_reqst.tsr_ndest) == 78);
+	CLASSERT(sizeof(srpc_stat_reply_t) == 136);
+	CLASSERT(sizeof(srpc_stat_reqst_t) == 28);
+}
+
+/*
+ * Module initialization: create the serial work-item scheduler and one
+ * test scheduler per CPT, then start the RPC layer, the test framework and
+ * the console, recording each completed milestone in lst_init_step.
+ *
+ * Returns 0 on success.  On any failure everything set up so far is torn
+ * down through lnet_selftest_fini() and a negative errno is returned.
+ */
+int
+lnet_selftest_init(void)
+{
+	int	nscheds;
+	int	rc;
+	int	i;
+
+	rc = cfs_wi_sched_create("lst_s", lnet_cpt_table(), CFS_CPT_ANY,
+				 1, &lst_sched_serial);
+	if (rc != 0) {
+		CERROR("Failed to create serial WI scheduler for LST\n");
+		return rc;
+	}
+	lst_init_step = LST_INIT_WI_SERIAL;
+
+	nscheds = cfs_cpt_number(lnet_cpt_table());
+	LIBCFS_ALLOC(lst_sched_test, sizeof(lst_sched_test[0]) * nscheds);
+	if (lst_sched_test == NULL) {
+		/* rc is still 0 from the call above; without an explicit
+		 * error code the failed init would be reported as success */
+		rc = -ENOMEM;
+		goto error;
+	}
+
+	lst_init_step = LST_INIT_WI_TEST;
+	for (i = 0; i < nscheds; i++) {
+		int nthrs = cfs_cpt_weight(lnet_cpt_table(), i);
+
+		/* reserve at least one CPU for LND */
+		nthrs = max(nthrs - 1, 1);
+		rc = cfs_wi_sched_create("lst_t", lnet_cpt_table(), i,
+					 nthrs, &lst_sched_test[i]);
+		if (rc != 0) {
+			CERROR("Failed to create CPT affinity WI scheduler "
+			       "%d for LST\n", i);
+			goto error;
+		}
+	}
+
+	rc = srpc_startup();
+	if (rc != 0) {
+		CERROR("LST can't startup rpc\n");
+		goto error;
+	}
+	lst_init_step = LST_INIT_RPC;
+
+	/* on failure sfw_startup() has already shut itself down, so the
+	 * milestone is only advanced on success */
+	rc = sfw_startup();
+	if (rc != 0) {
+		CERROR("LST can't startup framework\n");
+		goto error;
+	}
+	lst_init_step = LST_INIT_FW;
+
+	rc = lstcon_console_init();
+	if (rc != 0) {
+		CERROR("LST can't startup console\n");
+		goto error;
+	}
+	lst_init_step = LST_INIT_CONSOLE;
+	return 0;
+error:
+	/* undoes every milestone reached so far */
+	lnet_selftest_fini();
+	return rc;
+}
+
+
+MODULE_DESCRIPTION("LNet Selftest");
+MODULE_LICENSE("GPL");
+
+/* NOTE(review): presumably registers lnet_selftest_init/lnet_selftest_fini
+ * as the module's load and unload entry points - confirm against the
+ * cfs_module() definition */
+cfs_module(lnet, "0.9.0", lnet_selftest_init, lnet_selftest_fini);

diff --git a/drivers/staging/lustre/lnet/selftest/ping_test.c b/drivers/staging/lustre/lnet/selftest/ping_test.c
new file mode 100644
index 0000000..f0f9194
--- /dev/null
+++ b/drivers/staging/lustre/lnet/selftest/ping_test.c

@@ -0,0 +1,229 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/ping_test.c
+ *
+ * Test client & Server
+ *
+ * Author: Liang Zhen <liangzhen@clusterfs.com>
+ */
+
+#include "selftest.h"
+
+#define LST_PING_TEST_MAGIC     0xbabeface
+
+int ping_srv_workitems = SFW_TEST_WI_MAX;
+CFS_MODULE_PARM(ping_srv_workitems, "i", int, 0644, "# PING server workitems");
+
+/*
+ * State shared by all ping client test units: a sequence counter and the
+ * lock that serialises access to it.  Both are (re)initialised by
+ * ping_client_init() and consumed by ping_client_prep_rpc().
+ */
+typedef struct {
+	spinlock_t	pnd_lock;	/* serialize */
+	int		pnd_counter;	/* sequence counter */
+} lst_ping_data_t;
+
+/* single file-scope instance of the ping client state */
+static lst_ping_data_t  lst_ping_data;
+
+/*
+ * Client-side setup of a ping test instance.
+ *
+ * Resets the shared ping sequence counter to zero and initialises the
+ * lock that guards it.  The instance must be a client and its session
+ * must only carry features covered by LST_FEATS_MASK.  Always returns 0.
+ */
+static int
+ping_client_init(sfw_test_instance_t *tsi)
+{
+	sfw_session_t *sn = tsi->tsi_batch->bat_session;
+
+	LASSERT(tsi->tsi_is_client);
+	LASSERT(sn != NULL && (sn->sn_features & ~LST_FEATS_MASK) == 0);
+
+	lst_ping_data.pnd_counter = 0;
+	spin_lock_init(&lst_ping_data.pnd_lock);
+
+	return 0;
+}
+
+/*
+ * Client-side teardown of a ping test instance: report how many pings
+ * failed during the session, or log success when none did.
+ */
+static void
+ping_client_fini(sfw_test_instance_t *tsi)
+{
+	sfw_session_t *sn = tsi->tsi_batch->bat_session;
+	int            nfailed;
+
+	LASSERT(sn != NULL);
+	LASSERT(tsi->tsi_is_client);
+
+	nfailed = atomic_read(&sn->sn_ping_errors);
+	if (nfailed == 0) {
+		CDEBUG(D_NET, "Ping test finished OK.\n");
+		return;
+	}
+
+	CWARN("%d pings have failed.\n", nfailed);
+}
+
+/*
+ * Build one ping request for test unit @tsu addressed to @dest.
+ *
+ * The RPC is created through sfw_create_test_rpc() and returned in @rpc.
+ * The request body is then filled with the ping magic, the next sequence
+ * number (taken under lst_ping_data.pnd_lock) and the current time.
+ *
+ * Returns 0 on success, or the error from sfw_create_test_rpc().
+ */
+static int
+ping_client_prep_rpc(sfw_test_unit_t *tsu,
+		     lnet_process_id_t dest, srpc_client_rpc_t **rpc)
+{
+	sfw_test_instance_t *tsi = tsu->tsu_instance;
+	sfw_session_t       *sn  = tsi->tsi_batch->bat_session;
+	srpc_ping_reqst_t   *ping;
+	struct timeval       now;
+	int		     rc;
+
+	LASSERT(sn != NULL);
+	LASSERT((sn->sn_features & ~LST_FEATS_MASK) == 0);
+
+	rc = sfw_create_test_rpc(tsu, dest, sn->sn_features, 0, 0, rpc);
+	if (rc != 0)
+		return rc;
+
+	ping = &(*rpc)->crpc_reqstmsg.msg_body.ping_reqst;
+	ping->pnr_magic = LST_PING_TEST_MAGIC;
+
+	/* sequence numbers are handed out under the shared lock */
+	spin_lock(&lst_ping_data.pnd_lock);
+	ping->pnr_seq = lst_ping_data.pnd_counter++;
+	spin_unlock(&lst_ping_data.pnd_lock);
+
+	/* stamp the request so the reply handler can compute the delay */
+	cfs_fs_timeval(&now);
+	ping->pnr_time_sec  = now.tv_sec;
+	ping->pnr_time_usec = now.tv_usec;
+
+	return 0;
+}
+
+/*
+ * Completion handler for a ping RPC on the client.
+ *
+ * A failed RPC is counted in the session's sn_ping_errors unless the
+ * test instance is stopping.  Otherwise the reply is byte-swapped when
+ * its message magic is not in host order, then its ping magic and
+ * sequence number are validated against the request; a mismatch sets
+ * crpc_status to -EBADMSG and counts an error.  On success the elapsed
+ * time since the request was stamped is logged.
+ */
+static void
+ping_client_done_rpc(sfw_test_unit_t *tsu, srpc_client_rpc_t *rpc)
+{
+	sfw_test_instance_t *tsi   = tsu->tsu_instance;
+	sfw_session_t       *sn    = tsi->tsi_batch->bat_session;
+	srpc_ping_reqst_t   *reqst = &rpc->crpc_reqstmsg.msg_body.ping_reqst;
+	srpc_ping_reply_t   *reply = &rpc->crpc_replymsg.msg_body.ping_reply;
+	struct timeval       now;
+
+	LASSERT(sn != NULL);
+
+	if (rpc->crpc_status != 0) {
+		/* rpc could have been aborted while the test is stopping */
+		if (!tsi->tsi_stopping)
+			atomic_inc(&sn->sn_ping_errors);
+		CERROR("Unable to ping %s (%d): %d\n",
+		       libcfs_id2str(rpc->crpc_dest),
+		       reqst->pnr_seq, rpc->crpc_status);
+		return;
+	}
+
+	/* reply arrived with foreign byte order: fix up the ping fields */
+	if (rpc->crpc_replymsg.msg_magic != SRPC_MSG_MAGIC) {
+		__swab32s(&reply->pnr_seq);
+		__swab32s(&reply->pnr_magic);
+		__swab32s(&reply->pnr_status);
+	}
+
+	if (reply->pnr_magic != LST_PING_TEST_MAGIC) {
+		rpc->crpc_status = -EBADMSG;
+		atomic_inc(&sn->sn_ping_errors);
+		CERROR("Bad magic %u from %s, %u expected.\n",
+		       reply->pnr_magic, libcfs_id2str(rpc->crpc_dest),
+		       LST_PING_TEST_MAGIC);
+		return;
+	}
+
+	if (reply->pnr_seq != reqst->pnr_seq) {
+		rpc->crpc_status = -EBADMSG;
+		atomic_inc(&sn->sn_ping_errors);
+		CERROR("Bad seq %u from %s, %u expected.\n",
+		       reply->pnr_seq, libcfs_id2str(rpc->crpc_dest),
+		       reqst->pnr_seq);
+		return;
+	}
+
+	cfs_fs_timeval(&now);
+	CDEBUG(D_NET, "%d reply in %u usec\n", reply->pnr_seq,
+	       (unsigned)((now.tv_sec - (unsigned)reqst->pnr_time_sec) * 1000000
+			  + (now.tv_usec - reqst->pnr_time_usec)));
+}
+
+/*
+ * Server-side handler of the ping service.
+ *
+ * Byte-swaps the request body when the message magic shows the sender
+ * uses the opposite byte order, validates the ping magic and echoes the
+ * sequence number back in the reply.  If the request carries session
+ * features outside LST_FEATS_MASK, the reply advertises LST_FEATS_MASK
+ * and a pnr_status of EPROTO.
+ *
+ * Returns 0 when a reply should be sent, -EINVAL on a bad ping magic.
+ */
+static int
+ping_server_handle(struct srpc_server_rpc *rpc)
+{
+	struct srpc_service *sv       = rpc->srpc_scd->scd_svc;
+	srpc_msg_t          *reqstmsg = &rpc->srpc_reqstbuf->buf_msg;
+	srpc_msg_t          *replymsg = &rpc->srpc_replymsg;
+	srpc_ping_reqst_t   *req      = &reqstmsg->msg_body.ping_reqst;
+	srpc_ping_reply_t   *rep      = &replymsg->msg_body.ping_reply;
+
+	LASSERT(sv->sv_id == SRPC_SERVICE_PING);
+
+	if (reqstmsg->msg_magic != SRPC_MSG_MAGIC) {
+		LASSERT(reqstmsg->msg_magic == __swab32(SRPC_MSG_MAGIC));
+
+		__swab32s(&req->pnr_seq);
+		__swab32s(&req->pnr_magic);
+		__swab64s(&req->pnr_time_sec);
+		__swab64s(&req->pnr_time_usec);
+	}
+	LASSERT(reqstmsg->msg_type == srpc_service2request(sv->sv_id));
+
+	if (req->pnr_magic != LST_PING_TEST_MAGIC) {
+		CERROR("Unexpect magic %08x from %s\n",
+		       req->pnr_magic, libcfs_id2str(rpc->srpc_peer));
+		return -EINVAL;
+	}
+
+	rep->pnr_magic = LST_PING_TEST_MAGIC;
+	rep->pnr_seq   = req->pnr_seq;
+
+	if ((reqstmsg->msg_ses_feats & ~LST_FEATS_MASK) == 0) {
+		replymsg->msg_ses_feats = reqstmsg->msg_ses_feats;
+
+		CDEBUG(D_NET, "Get ping %d from %s\n",
+		       req->pnr_seq, libcfs_id2str(rpc->srpc_peer));
+	} else {
+		/* unknown features requested: tell the peer what we support */
+		replymsg->msg_ses_feats = LST_FEATS_MASK;
+		rep->pnr_status = EPROTO;
+	}
+
+	return 0;
+}
+
+/* client operations table of the ping test, filled in at run time */
+sfw_test_client_ops_t ping_test_client;
+
+/* Populate ping_test_client with the ping client callbacks. */
+void ping_init_test_client(void)
+{
+	sfw_test_client_ops_t *ops = &ping_test_client;
+
+	ops->tso_init     = ping_client_init;
+	ops->tso_fini     = ping_client_fini;
+	ops->tso_prep_rpc = ping_client_prep_rpc;
+	ops->tso_done_rpc = ping_client_done_rpc;
+}
+
+/* service descriptor of the ping test, filled in at run time */
+srpc_service_t ping_test_service;
+
+/*
+ * Populate ping_test_service: service id, name, request handler and the
+ * workitem budget taken from the ping_srv_workitems module parameter.
+ */
+void ping_init_test_service(void)
+{
+	srpc_service_t *svc = &ping_test_service;
+
+	svc->sv_id       = SRPC_SERVICE_PING;
+	svc->sv_name     = "ping_test";
+	svc->sv_handler  = ping_server_handle;
+	svc->sv_wi_total = ping_srv_workitems;
+}

diff --git a/drivers/staging/lustre/lnet/selftest/rpc.c b/drivers/staging/lustre/lnet/selftest/rpc.c
new file mode 100644
index 0000000..bc1f38b
--- /dev/null
+++ b/drivers/staging/lustre/lnet/selftest/rpc.c

@@ -0,0 +1,1666 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/rpc.c
+ *
+ * Author: Isaac Huang <isaac@clusterfs.com>
+ *
+ * 2012-05-13: Liang Zhen <liang@whamcloud.com>
+ * - percpt data for service to improve smp performance
+ * - code cleanup
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include "selftest.h"
+
+/*
+ * State of the selftest RPC layer, kept in srpc_data.rpc_state.
+ * srpc_add_service() asserts that the state is SRPC_STATE_RUNNING
+ * before it registers a service.
+ */
+typedef enum {
+	SRPC_STATE_NONE,
+	SRPC_STATE_NI_INIT,
+	SRPC_STATE_EQ_INIT,
+	SRPC_STATE_RUNNING,
+	SRPC_STATE_STOPPING,
+} srpc_state_t;
+
+/*
+ * Global state of the selftest RPC layer (single instance: srpc_data).
+ * In this file rpc_glock is taken around accesses to rpc_services,
+ * rpc_counters and rpc_matchbits.
+ */
+struct smoketest_rpc {
+	spinlock_t	 rpc_glock;	/* global lock */
+	/* registered services, indexed by service id */
+	srpc_service_t	*rpc_services[SRPC_SERVICE_MAX_ID + 1];
+	lnet_handle_eq_t rpc_lnet_eq;	/* _the_ LNet event queue */
+	srpc_state_t	 rpc_state;
+	/* counters read/written via srpc_get_counters()/srpc_set_counters() */
+	srpc_counters_t	 rpc_counters;
+	__u64		 rpc_matchbits;	/* matchbits counter */
+} srpc_data;
+
+/*
+ * Map a service id to the LNet portal its requests are posted on:
+ * ids below SRPC_FRAMEWORK_SERVICE_MAX_ID use the framework request
+ * portal, all others use the regular request portal.
+ */
+static inline int
+srpc_serv_portal(int svc_id)
+{
+	if (svc_id < SRPC_FRAMEWORK_SERVICE_MAX_ID)
+		return SRPC_FRAMEWORK_REQUEST_PORTAL;
+
+	return SRPC_REQUEST_PORTAL;
+}
+
+/* forward ref's */
+int srpc_handle_rpc (swi_workitem_t *wi);
+
+/* Copy a snapshot of the global RPC counters into @cnt under rpc_glock. */
+void
+srpc_get_counters(srpc_counters_t *cnt)
+{
+	spin_lock(&srpc_data.rpc_glock);
+	*cnt = srpc_data.rpc_counters;
+	spin_unlock(&srpc_data.rpc_glock);
+}
+
+/* Overwrite the global RPC counters with *@cnt under rpc_glock. */
+void
+srpc_set_counters(const srpc_counters_t *cnt)
+{
+	spin_lock(&srpc_data.rpc_glock);
+	srpc_data.rpc_counters = *cnt;
+	spin_unlock(&srpc_data.rpc_glock);
+}
+
+/*
+ * Install page @pg as fragment @i of bulk descriptor @bk.
+ *
+ * The fragment starts at offset 0 of the page and covers @nob bytes,
+ * capped at PAGE_CACHE_SIZE.  Returns the number of bytes the fragment
+ * actually covers.
+ */
+int
+srpc_add_bulk_page(srpc_bulk_t *bk, struct page *pg, int i, int nob)
+{
+	/* a fragment never spans more than one page */
+	if (nob > (int)PAGE_CACHE_SIZE)
+		nob = (int)PAGE_CACHE_SIZE;
+
+	LASSERT(nob > 0);
+	LASSERT(i >= 0 && i < bk->bk_niov);
+
+	bk->bk_iovs[i].kiov_page   = pg;
+	bk->bk_iovs[i].kiov_len    = nob;
+	bk->bk_iovs[i].kiov_offset = 0;
+
+	return nob;
+}
+
+/*
+ * Release a bulk descriptor and its pages.
+ *
+ * Pages are freed in order until the first fragment without a page is
+ * met, so a partially populated descriptor is handled as well.
+ */
+void
+srpc_free_bulk(srpc_bulk_t *bk)
+{
+	int i;
+
+	LASSERT(bk != NULL);
+
+	for (i = 0; i < bk->bk_niov && bk->bk_iovs[i].kiov_page != NULL; i++)
+		__free_page(bk->bk_iovs[i].kiov_page);
+
+	LIBCFS_FREE(bk, offsetof(srpc_bulk_t, bk_iovs[bk->bk_niov]));
+}
+
+/*
+ * Allocate a bulk descriptor with @bulk_npg single-page fragments that
+ * together describe @bulk_len bytes.
+ *
+ * @cpt       CPU partition used for the descriptor and page allocations
+ * @bulk_npg  number of pages, 1..LNET_MAX_IOV
+ * @bulk_len  total number of bytes to describe
+ * @sink      stored in bk_sink
+ *
+ * Returns the new descriptor, or NULL when the descriptor or any page
+ * cannot be allocated (anything already allocated is released).
+ */
+srpc_bulk_t *
+srpc_alloc_bulk(int cpt, unsigned bulk_npg, unsigned bulk_len, int sink)
+{
+	srpc_bulk_t  *bk;
+	int	      i;
+
+	LASSERT(bulk_npg > 0 && bulk_npg <= LNET_MAX_IOV);
+
+	LIBCFS_CPT_ALLOC(bk, lnet_cpt_table(), cpt,
+			 offsetof(srpc_bulk_t, bk_iovs[bulk_npg]));
+	if (bk == NULL) {
+		CERROR("Can't allocate descriptor for %u pages\n", bulk_npg);
+		return NULL;
+	}
+
+	/* zeroed page pointers let srpc_free_bulk() stop at the first
+	 * fragment that was never populated */
+	memset(bk, 0, offsetof(srpc_bulk_t, bk_iovs[bulk_npg]));
+	bk->bk_sink   = sink;
+	bk->bk_len    = bulk_len;
+	bk->bk_niov   = bulk_npg;
+
+	for (i = 0; i < bulk_npg; i++) {
+		struct page *pg;
+		int	    nob;
+
+		pg = alloc_pages_node(cfs_cpt_spread_node(lnet_cpt_table(), cpt),
+				      GFP_IOFS, 0);
+		if (pg == NULL) {
+			CERROR("Can't allocate page %d of %u\n", i, bulk_npg);
+			srpc_free_bulk(bk);
+			return NULL;
+		}
+
+		/* each page takes at most one page worth of the payload */
+		nob = srpc_add_bulk_page(bk, pg, i, bulk_len);
+		bulk_len -= nob;
+	}
+
+	return bk;
+}
+
+/*
+ * Hand out the next matchbits value: returns the current value of the
+ * global counter and advances it by one, all under rpc_glock.
+ */
+static inline __u64
+srpc_next_id(void)
+{
+	__u64 next;
+
+	spin_lock(&srpc_data.rpc_glock);
+	next = srpc_data.rpc_matchbits;
+	srpc_data.rpc_matchbits = next + 1;
+	spin_unlock(&srpc_data.rpc_glock);
+
+	return next;
+}
+
+/*
+ * (Re)initialise server RPC descriptor @rpc to serve the request held
+ * in @buffer on behalf of per-CPT service data @scd.
+ *
+ * The descriptor is zeroed, its workitem is bound to srpc_handle_rpc()
+ * on the serial scheduler for framework services or on the scheduler of
+ * scd's CPT otherwise, and peer/self are copied from the buffer.
+ */
+void
+srpc_init_server_rpc(struct srpc_server_rpc *rpc,
+		     struct srpc_service_cd *scd,
+		     struct srpc_buffer *buffer)
+{
+	memset(rpc, 0, sizeof(*rpc));
+
+	if (srpc_serv_is_framework(scd->scd_svc))
+		swi_init_workitem(&rpc->srpc_wi, rpc, srpc_handle_rpc,
+				  lst_sched_serial);
+	else
+		swi_init_workitem(&rpc->srpc_wi, rpc, srpc_handle_rpc,
+				  lst_sched_test[scd->scd_cpt]);
+
+	/* no event expected now */
+	rpc->srpc_ev.ev_fired = 1;
+
+	rpc->srpc_scd      = scd;
+	rpc->srpc_reqstbuf = buffer;
+	rpc->srpc_self     = buffer->buf_self;
+	rpc->srpc_peer     = buffer->buf_peer;
+
+	LNetInvalidateHandle(&rpc->srpc_replymdh);
+}
+
+/*
+ * Free everything srpc_service_init() set up for @svc.
+ *
+ * For every CPT, all buffers on the posted list and then on the blocked
+ * list are freed, followed by all RPC descriptors on the free list; no
+ * RPC may still be active.  Finally the per-CPT array itself is freed.
+ * Does nothing when the per-CPT data was never allocated.
+ */
+static void
+srpc_service_fini(struct srpc_service *svc)
+{
+	struct srpc_service_cd	*scd;
+	struct srpc_server_rpc	*rpc;
+	struct srpc_buffer	*buf;
+	struct list_head	*q;
+	int			i;
+
+	if (svc->sv_cpt_data == NULL)
+		return;
+
+	cfs_percpt_for_each(scd, i, svc->sv_cpt_data) {
+		/* posted buffers go first, blocked ones afterwards */
+		while (!list_empty(&scd->scd_buf_posted) ||
+		       !list_empty(&scd->scd_buf_blocked)) {
+			q = list_empty(&scd->scd_buf_posted) ?
+			    &scd->scd_buf_blocked : &scd->scd_buf_posted;
+
+			buf = list_entry(q->next, struct srpc_buffer,
+					 buf_list);
+			list_del(&buf->buf_list);
+			LIBCFS_FREE(buf, sizeof(*buf));
+		}
+
+		LASSERT(list_empty(&scd->scd_rpc_active));
+
+		while (!list_empty(&scd->scd_rpc_free)) {
+			rpc = list_entry(scd->scd_rpc_free.next,
+					 struct srpc_server_rpc, srpc_list);
+			list_del(&rpc->srpc_list);
+			LIBCFS_FREE(rpc, sizeof(*rpc));
+		}
+	}
+
+	cfs_percpt_free(svc->sv_cpt_data);
+	svc->sv_cpt_data = NULL;
+}
+
+/*
+ * Number of RPC descriptors to preallocate per CPT for @svc: the
+ * service's workitem budget divided by its CPT count, but never less
+ * than the minimum for its class (framework or test service).
+ */
+static int
+srpc_service_nrpcs(struct srpc_service *svc)
+{
+	int nrpcs = svc->sv_wi_total / svc->sv_ncpts;
+
+	if (srpc_serv_is_framework(svc))
+		return max(nrpcs, SFW_FRWK_WI_MIN);
+
+	return max(nrpcs, SFW_TEST_WI_MIN);
+}
+
+int srpc_add_buffer(struct swi_workitem *wi);
+
+/*
+ * Allocate and initialise the per-CPT data of service @svc.
+ *
+ * Every per-CPT structure gets its lock, lists, request event and
+ * buffer-posting workitem set up.  RPC descriptors are preallocated on
+ * each CPT for test services, but only on CPT 0 for framework services.
+ *
+ * Returns 0 on success or -ENOMEM; on failure everything allocated so
+ * far has been released through srpc_service_fini().
+ */
+static int
+srpc_service_init(struct srpc_service *svc)
+{
+	struct srpc_service_cd	*scd;
+	struct srpc_server_rpc	*rpc;
+	int			nrpcs;
+	int			i;
+	int			j;
+
+	svc->sv_shuttingdown = 0;
+
+	svc->sv_cpt_data = cfs_percpt_alloc(lnet_cpt_table(),
+					    sizeof(struct srpc_service_cd));
+	if (svc->sv_cpt_data == NULL)
+		return -ENOMEM;
+
+	/* sv_ncpts must be set before srpc_service_nrpcs() divides by it */
+	svc->sv_ncpts = srpc_serv_is_framework(svc) ?
+			1 : cfs_cpt_number(lnet_cpt_table());
+	nrpcs = srpc_service_nrpcs(svc);
+
+	cfs_percpt_for_each(scd, i, svc->sv_cpt_data) {
+		scd->scd_cpt = i;
+		scd->scd_svc = svc;
+		spin_lock_init(&scd->scd_lock);
+		INIT_LIST_HEAD(&scd->scd_rpc_free);
+		INIT_LIST_HEAD(&scd->scd_rpc_active);
+		INIT_LIST_HEAD(&scd->scd_buf_posted);
+		INIT_LIST_HEAD(&scd->scd_buf_blocked);
+
+		scd->scd_ev.ev_data = scd;
+		scd->scd_ev.ev_type = SRPC_REQUEST_RCVD;
+
+		/* NB: don't use lst_sched_serial for adding buffer,
+		 * see details in srpc_service_add_buffers() */
+		swi_init_workitem(&scd->scd_buf_wi, scd,
+				  srpc_add_buffer, lst_sched_test[i]);
+
+		if (i != 0 && srpc_serv_is_framework(svc)) {
+			/* NB: framework service only needs srpc_service_cd for
+			 * one partition, but we allocate for all to make
+			 * it easier to implement, it will waste a little
+			 * memory but nobody should care about this */
+			continue;
+		}
+
+		for (j = 0; j < nrpcs; j++) {
+			LIBCFS_CPT_ALLOC(rpc, lnet_cpt_table(),
+					 i, sizeof(*rpc));
+			if (rpc == NULL) {
+				/* frees the descriptors already queued on
+				 * scd_rpc_free as well as sv_cpt_data */
+				srpc_service_fini(svc);
+				return -ENOMEM;
+			}
+			list_add(&rpc->srpc_list, &scd->scd_rpc_free);
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Register service @sv under its service id.
+ *
+ * Returns 0 on success, -ENOMEM when the per-CPT data of the service
+ * cannot be set up, or -EBUSY when another service already occupies the
+ * id (the freshly initialised per-CPT data is released again).
+ */
+int
+srpc_add_service(struct srpc_service *sv)
+{
+	int id = sv->sv_id;
+	int busy;
+
+	LASSERT(0 <= id && id <= SRPC_SERVICE_MAX_ID);
+
+	if (srpc_service_init(sv) != 0)
+		return -ENOMEM;
+
+	spin_lock(&srpc_data.rpc_glock);
+
+	LASSERT(srpc_data.rpc_state == SRPC_STATE_RUNNING);
+
+	/* claim the slot only if nobody holds it */
+	busy = srpc_data.rpc_services[id] != NULL;
+	if (!busy)
+		srpc_data.rpc_services[id] = sv;
+
+	spin_unlock(&srpc_data.rpc_glock);
+
+	if (busy) {
+		srpc_service_fini(sv);
+		return -EBUSY;
+	}
+
+	CDEBUG(D_NET, "Adding service: id %d, name %s\n", id, sv->sv_name);
+	return 0;
+}
+
+/*
+ * Unregister service @sv from the global service table.
+ *
+ * Returns 0 on success, or -ENOENT when the slot of sv's id does not
+ * currently hold @sv.
+ */
+int
+srpc_remove_service(srpc_service_t *sv)
+{
+	int id = sv->sv_id;
+	int rc = -ENOENT;
+
+	spin_lock(&srpc_data.rpc_glock);
+
+	if (srpc_data.rpc_services[id] == sv) {
+		srpc_data.rpc_services[id] = NULL;
+		rc = 0;
+	}
+
+	spin_unlock(&srpc_data.rpc_glock);
+	return rc;
+}
+
+/*
+ * Post a passive (target side) buffer: attach a match entry for
+ * @peer/@matchbits on @portal, then attach to it an MD describing
+ * @buf/@len with the given @options.
+ *
+ * The MD has a threshold of 1, delivers events to the selftest event
+ * queue and carries @ev as its user pointer.  @local selects
+ * LNET_INS_LOCAL instead of LNET_INS_AFTER as ME insert position.
+ * The MD handle is returned in @mdh.
+ *
+ * Returns 0 on success or -ENOMEM; any other LNet failure trips an
+ * assertion.
+ */
+int
+srpc_post_passive_rdma(int portal, int local, __u64 matchbits, void *buf,
+		       int len, int options, lnet_process_id_t peer,
+		       lnet_handle_md_t *mdh, srpc_event_t *ev)
+{
+	int		 rc;
+	lnet_md_t	 md;
+	lnet_handle_me_t meh;
+
+	rc = LNetMEAttach(portal, peer, matchbits, 0, LNET_UNLINK,
+			  local ? LNET_INS_LOCAL : LNET_INS_AFTER, &meh);
+	if (rc != 0) {
+		CERROR ("LNetMEAttach failed: %d\n", rc);
+		LASSERT (rc == -ENOMEM);
+		return -ENOMEM;
+	}
+
+	md.threshold = 1;
+	md.user_ptr  = ev;
+	md.start     = buf;
+	md.length    = len;
+	md.options   = options;
+	md.eq_handle = srpc_data.rpc_lnet_eq;
+
+	rc = LNetMDAttach(meh, md, LNET_UNLINK, mdh);
+	if (rc != 0) {
+		CERROR ("LNetMDAttach failed: %d\n", rc);
+		LASSERT (rc == -ENOMEM);
+
+		/* the ME has no MD attached, so it must be removed by hand */
+		rc = LNetMEUnlink(meh);
+		LASSERT (rc == 0);
+		return -ENOMEM;
+	}
+
+	CDEBUG (D_NET,
+		"Posted passive RDMA: peer %s, portal %d, matchbits "LPX64"\n",
+		libcfs_id2str(peer), portal, matchbits);
+	return 0;
+}
+
+/*
+ * Start an active (initiator side) transfer: bind an MD describing
+ * @buf/@len and issue an LNetPut or LNetGet to @peer on
+ * @portal/@matchbits, depending on whether @options carries
+ * LNET_MD_OP_PUT or LNET_MD_OP_GET.
+ *
+ * The MD threshold is 2 for a GET and 1 for a PUT.  The MD handle is
+ * returned in @mdh and events carry @ev as user pointer.
+ *
+ * Returns -ENOMEM when the MD cannot be bound.  Returns 0 otherwise,
+ * even if the PUT/GET itself failed: the MD is unlinked in that case and
+ * the failure is reported through the resulting unlink event.
+ */
+int
+srpc_post_active_rdma(int portal, __u64 matchbits, void *buf, int len,
+		      int options, lnet_process_id_t peer, lnet_nid_t self,
+		      lnet_handle_md_t *mdh, srpc_event_t *ev)
+{
+	int       rc;
+	lnet_md_t md;
+
+	md.user_ptr  = ev;
+	md.start     = buf;
+	md.length    = len;
+	md.eq_handle = srpc_data.rpc_lnet_eq;
+	md.threshold = ((options & LNET_MD_OP_GET) != 0) ? 2 : 1;
+	/* PUT/GET only select the operation below; strip them from the MD */
+	md.options   = options & ~(LNET_MD_OP_PUT | LNET_MD_OP_GET);
+
+	rc = LNetMDBind(md, LNET_UNLINK, mdh);
+	if (rc != 0) {
+		CERROR ("LNetMDBind failed: %d\n", rc);
+		LASSERT (rc == -ENOMEM);
+		return -ENOMEM;
+	}
+
+	/* this is kind of an abuse of the LNET_MD_OP_{PUT,GET} options.
+	 * they're only meaningful for MDs attached to an ME (i.e. passive
+	 * buffers... */
+	if ((options & LNET_MD_OP_PUT) != 0) {
+		rc = LNetPut(self, *mdh, LNET_NOACK_REQ, peer,
+			     portal, matchbits, 0, 0);
+	} else {
+		LASSERT ((options & LNET_MD_OP_GET) != 0);
+
+		rc = LNetGet(self, *mdh, peer, portal, matchbits, 0);
+	}
+
+	if (rc != 0) {
+		CERROR ("LNet%s(%s, %d, "LPD64") failed: %d\n",
+			((options & LNET_MD_OP_PUT) != 0) ? "Put" : "Get",
+			libcfs_id2str(peer), portal, matchbits, rc);
+
+		/* The forthcoming unlink event will complete this operation
+		 * with failure, so fall through and return success here.
+		 */
+		rc = LNetMDUnlink(*mdh);
+		LASSERT (rc == 0);
+	} else {
+		CDEBUG (D_NET,
+			"Posted active RDMA: peer %s, portal %u, matchbits "LPX64"\n",
+			libcfs_id2str(peer), portal, matchbits);
+	}
+	return 0;
+}
+
+/*
+ * Send the request in @buf/@len to @service on @peer: a PUT to the
+ * service's request portal with the service id used as matchbits.
+ * Returns the result of srpc_post_active_rdma().
+ */
+int
+srpc_post_active_rqtbuf(lnet_process_id_t peer, int service, void *buf,
+			int len, lnet_handle_md_t *mdh, srpc_event_t *ev)
+{
+	int portal = srpc_serv_portal(service);
+
+	return srpc_post_active_rdma(portal, service, buf, len,
+				     LNET_MD_OP_PUT, peer, LNET_NID_ANY,
+				     mdh, ev);
+}
+
+/*
+ * Post @buf/@len as a request buffer of @service: a passive PUT target
+ * on the service's request portal that accepts any peer, with the
+ * service id used as matchbits.  Returns the result of
+ * srpc_post_passive_rdma().
+ */
+int
+srpc_post_passive_rqtbuf(int service, int local, void *buf, int len,
+			 lnet_handle_md_t *mdh, srpc_event_t *ev)
+{
+	lnet_process_id_t anyone = {0};
+	int               portal = srpc_serv_portal(service);
+
+	/* wildcard: accept requests from every nid and pid */
+	anyone.nid = LNET_NID_ANY;
+	anyone.pid = LNET_PID_ANY;
+
+	return srpc_post_passive_rdma(portal, local, service, buf, len,
+				      LNET_MD_OP_PUT, anyone, mdh, ev);
+}
+
+/*
+ * Post request buffer @buf for the service owning @scd.
+ *
+ * Locking: entered with scd->scd_lock held; the lock is dropped while
+ * LNet is called and while memory is freed, and is held again on every
+ * return path.
+ *
+ * Returns 0 when the buffer was posted.  If the service turned out to
+ * be shutting down, the MD is unlinked again but 0 is still returned.
+ * On a posting failure the error is returned and @buf is freed, unless
+ * the service is shutting down, in which case @buf is left on
+ * scd_buf_posted.
+ */
+int
+srpc_service_post_buffer(struct srpc_service_cd *scd, struct srpc_buffer *buf)
+{
+	struct srpc_service	*sv = scd->scd_svc;
+	struct srpc_msg		*msg = &buf->buf_msg;
+	int			rc;
+
+	LNetInvalidateHandle(&buf->buf_mdh);
+	list_add(&buf->buf_list, &scd->scd_buf_posted);
+	scd->scd_buf_nposted++;
+	spin_unlock(&scd->scd_lock);
+
+	rc = srpc_post_passive_rqtbuf(sv->sv_id,
+				      !srpc_serv_is_framework(sv),
+				      msg, sizeof(*msg), &buf->buf_mdh,
+				      &scd->scd_ev);
+
+	/* At this point, a RPC (new or delayed) may have arrived in
+	 * msg and its event handler has been called. So we must add
+	 * buf to scd_buf_posted _before_ dropping scd_lock */
+
+	spin_lock(&scd->scd_lock);
+
+	if (rc == 0) {
+		if (!sv->sv_shuttingdown)
+			return 0;
+
+		spin_unlock(&scd->scd_lock);
+		/* srpc_shutdown_service might have tried to unlink me
+		 * when my buf_mdh was still invalid */
+		LNetMDUnlink(buf->buf_mdh);
+		spin_lock(&scd->scd_lock);
+		return 0;
+	}
+
+	/* posting failed: undo the accounting done above */
+	scd->scd_buf_nposted--;
+	if (sv->sv_shuttingdown)
+		return rc; /* don't allow to change scd_buf_posted */
+
+	list_del(&buf->buf_list);
+	spin_unlock(&scd->scd_lock);
+
+	LIBCFS_FREE(buf, sizeof(*buf));
+
+	spin_lock(&scd->scd_lock);
+	return rc;
+}
+
+/*
+ * Workitem handler that posts request buffers for one CPT of a service.
+ *
+ * While scd_buf_adjust is positive and the service is not shutting
+ * down, one buffer is allocated and posted per iteration;
+ * scd_buf_posting counts buffers in flight through this function and
+ * scd_buf_total counts the ones posted successfully.
+ *
+ * Always returns 0.  A failure is recorded in scd_buf_err and
+ * scd_buf_err_stamp.
+ */
+int
+srpc_add_buffer(struct swi_workitem *wi)
+{
+	struct srpc_service_cd	*scd = wi->swi_workitem.wi_data;
+	struct srpc_buffer	*buf;
+	int			rc = 0;
+
+	/* it's called by workitem scheduler threads, these threads
+	 * should have been set CPT affinity, so buffers will be posted
+	 * on CPT local list of Portal */
+	spin_lock(&scd->scd_lock);
+
+	while (scd->scd_buf_adjust > 0 &&
+	       !scd->scd_svc->sv_shuttingdown) {
+		scd->scd_buf_adjust--; /* consume it */
+		scd->scd_buf_posting++;
+
+		/* allocate without holding the spinlock */
+		spin_unlock(&scd->scd_lock);
+
+		LIBCFS_ALLOC(buf, sizeof(*buf));
+		if (buf == NULL) {
+			CERROR("Failed to add new buf to service: %s\n",
+			       scd->scd_svc->sv_name);
+			spin_lock(&scd->scd_lock);
+			rc = -ENOMEM;
+			break;
+		}
+
+		spin_lock(&scd->scd_lock);
+		/* recheck: shutdown may have started while unlocked */
+		if (scd->scd_svc->sv_shuttingdown) {
+			spin_unlock(&scd->scd_lock);
+			LIBCFS_FREE(buf, sizeof(*buf));
+
+			spin_lock(&scd->scd_lock);
+			rc = -ESHUTDOWN;
+			break;
+		}
+
+		rc = srpc_service_post_buffer(scd, buf);
+		if (rc != 0)
+			break; /* buf has been freed inside, unless the
+				* service is shutting down, in which case it
+				* stays on scd_buf_posted */
+
+		LASSERT(scd->scd_buf_posting > 0);
+		scd->scd_buf_posting--;
+		scd->scd_buf_total++;
+		scd->scd_buf_low = MAX(2, scd->scd_buf_total / 4);
+	}
+
+	if (rc != 0) {
+		scd->scd_buf_err_stamp = cfs_time_current_sec();
+		scd->scd_buf_err = rc;
+
+		/* the buffer that failed is no longer in flight */
+		LASSERT(scd->scd_buf_posting > 0);
+		scd->scd_buf_posting--;
+	}
+
+	spin_unlock(&scd->scd_lock);
+	return 0;
+}
+
+/*
+ * Post @nbuffer request buffers on each CPT of service @sv (only on the
+ * first CPT for a framework service) and wait for the posting to finish.
+ *
+ * The posting itself is done by the per-CPT workitem srpc_add_buffer();
+ * this function resets the per-CPT error/posting state, schedules the
+ * workitem and then waits on every CPT until either an error has been
+ * recorded or nothing is left to post.
+ *
+ * Returns 0 on success, otherwise the first per-CPT error encountered.
+ */
+int
+srpc_service_add_buffers(struct srpc_service *sv, int nbuffer)
+{
+	struct srpc_service_cd	*scd;
+	int			rc = 0;
+	int			i;
+
+	LASSERTF(nbuffer > 0, "nbuffer must be positive: %d\n", nbuffer);
+
+	cfs_percpt_for_each(scd, i, sv->sv_cpt_data) {
+		spin_lock(&scd->scd_lock);
+
+		scd->scd_buf_err = 0;
+		scd->scd_buf_err_stamp = 0;
+		scd->scd_buf_posting = 0;
+		scd->scd_buf_adjust = nbuffer;
+		/* start to post buffers */
+		swi_schedule_workitem(&scd->scd_buf_wi);
+		spin_unlock(&scd->scd_lock);
+
+		/* framework service only post buffer for one partition  */
+		if (srpc_serv_is_framework(sv))
+			break;
+	}
+
+	cfs_percpt_for_each(scd, i, sv->sv_cpt_data) {
+		spin_lock(&scd->scd_lock);
+		/*
+		 * NB: srpc_service_add_buffers() can be called inside
+		 * thread context of lst_sched_serial, and we don't normally
+		 * allow to sleep inside thread context of WI scheduler
+		 * because it will block current scheduler thread from doing
+		 * anything else, even worse, it could deadlock if it's
+		 * waiting on result from another WI of the same scheduler.
+		 * However, it's safe at here because scd_buf_wi is scheduled
+		 * by thread in a different WI scheduler (lst_sched_test),
+		 * so we don't have any risk of deadlock, though this could
+		 * block all WIs pending on lst_sched_serial for a moment
+		 * which is not good but not fatal.
+		 */
+		lst_wait_until(scd->scd_buf_err != 0 ||
+			       (scd->scd_buf_adjust == 0 &&
+				scd->scd_buf_posting == 0),
+			       scd->scd_lock, "waiting for adding buffer\n");
+
+		/* remember only the first error seen across CPTs */
+		if (scd->scd_buf_err != 0 && rc == 0)
+			rc = scd->scd_buf_err;
+
+		spin_unlock(&scd->scd_lock);
+	}
+
+	return rc;
+}
+
+/*
+ * Ask every CPT of service @sv to give up @nbuffer request buffers.
+ *
+ * Nothing is freed here: scd_buf_adjust is lowered by @nbuffer, capped
+ * at the number of buffers the CPT currently has posted or is posting.
+ * The service must not be shutting down.
+ */
+void
+srpc_service_remove_buffers(struct srpc_service *sv, int nbuffer)
+{
+	struct srpc_service_cd	*scd;
+	int			 nremove;
+	int			 i;
+
+	LASSERT(!sv->sv_shuttingdown);
+
+	cfs_percpt_for_each(scd, i, sv->sv_cpt_data) {
+		spin_lock(&scd->scd_lock);
+
+		/* cannot remove more than what exists on this CPT */
+		nremove = scd->scd_buf_total + scd->scd_buf_posting;
+		if (nremove > nbuffer)
+			nremove = nbuffer;
+		scd->scd_buf_adjust -= nremove;
+
+		spin_unlock(&scd->scd_lock);
+	}
+}
+
+/*
+ * Try to complete the shutdown of service @sv; srpc_shutdown_service()
+ * must have been called already.
+ *
+ * For each CPT the buffer-posting workitem must be descheduled, no
+ * posted buffer may remain and no RPC may still be active.  As soon as
+ * one CPT is not ready yet the function gives up (with every lock
+ * released) so the caller can retry later.
+ *
+ * Returns 1 if sv has finished and its per-CPT data was freed,
+ * otherwise 0.
+ */
+int
+srpc_finish_service(struct srpc_service *sv)
+{
+	struct srpc_service_cd	*scd;
+	struct srpc_server_rpc	*rpc;
+	int			i;
+
+	LASSERT(sv->sv_shuttingdown); /* srpc_shutdown_service called */
+
+	cfs_percpt_for_each(scd, i, sv->sv_cpt_data) {
+		spin_lock(&scd->scd_lock);
+		if (!swi_deschedule_workitem(&scd->scd_buf_wi)) {
+			/* must not return with scd_lock still held */
+			spin_unlock(&scd->scd_lock);
+			return 0;
+		}
+
+		if (scd->scd_buf_nposted > 0) {
+			CDEBUG(D_NET, "waiting for %d posted buffers to unlink\n",
+			       scd->scd_buf_nposted);
+			spin_unlock(&scd->scd_lock);
+			return 0;
+		}
+
+		if (list_empty(&scd->scd_rpc_active)) {
+			spin_unlock(&scd->scd_lock);
+			continue;
+		}
+
+		/* report the first RPC that is still active on this CPT */
+		rpc = list_entry(scd->scd_rpc_active.next,
+				     struct srpc_server_rpc, srpc_list);
+		CNETERR("Active RPC %p on shutdown: sv %s, peer %s, "
+			"wi %s scheduled %d running %d, "
+			"ev fired %d type %d status %d lnet %d\n",
+			rpc, sv->sv_name, libcfs_id2str(rpc->srpc_peer),
+			swi_state2str(rpc->srpc_wi.swi_state),
+			rpc->srpc_wi.swi_workitem.wi_scheduled,
+			rpc->srpc_wi.swi_workitem.wi_running,
+			rpc->srpc_ev.ev_fired, rpc->srpc_ev.ev_type,
+			rpc->srpc_ev.ev_status, rpc->srpc_ev.ev_lnet);
+		spin_unlock(&scd->scd_lock);
+		return 0;
+	}
+
+	/* no lock needed from now on */
+	srpc_service_fini(sv);
+	return 1;
+}
+
+/*
+ * Recycle request buffer @buf once its RPC is done.
+ *
+ * Called with scd->scd_lock held; the lock may be dropped and retaken
+ * inside (while posting or freeing) but is held again on return.
+ *
+ * If the service is running and no buffer removal is pending
+ * (scd_buf_adjust >= 0), the buffer is posted again.  Otherwise it is
+ * freed and the per-CPT buffer accounting is updated.
+ */
+void
+srpc_service_recycle_buffer(struct srpc_service_cd *scd, srpc_buffer_t *buf)
+{
+	if (!scd->scd_svc->sv_shuttingdown && scd->scd_buf_adjust >= 0) {
+		if (srpc_service_post_buffer(scd, buf) != 0) {
+			CWARN("Failed to post %s buffer\n",
+			      scd->scd_svc->sv_name);
+		}
+		return;
+	}
+
+	/* service is shutting down, or we want to recycle some buffers */
+	scd->scd_buf_total--;
+
+	if (scd->scd_buf_adjust < 0) {
+		/* this buffer satisfies one pending removal request */
+		scd->scd_buf_adjust++;
+		if (scd->scd_buf_adjust < 0 &&
+		    scd->scd_buf_total == 0 && scd->scd_buf_posting == 0) {
+			/* more removals requested than buffers exist */
+			CDEBUG(D_INFO,
+			       "Try to recyle %d buffers but nothing left\n",
+			       scd->scd_buf_adjust);
+			scd->scd_buf_adjust = 0;
+		}
+	}
+
+	/* free without holding the spinlock */
+	spin_unlock(&scd->scd_lock);
+	LIBCFS_FREE(buf, sizeof(*buf));
+	spin_lock(&scd->scd_lock);
+}
+
+/*
+ * Abort all RPCs currently active on service @sv: on every CPT each
+ * active RPC is flagged as aborted and its workitem is scheduled so
+ * that the handler notices the flag.
+ */
+void
+srpc_abort_service(struct srpc_service *sv)
+{
+	struct srpc_service_cd	*scd;
+	struct srpc_server_rpc	*rpc;
+	int			i;
+
+	CDEBUG(D_NET, "Aborting service: id %d, name %s\n",
+	       sv->sv_id, sv->sv_name);
+
+	cfs_percpt_for_each(scd, i, sv->sv_cpt_data) {
+		spin_lock(&scd->scd_lock);
+
+		/* schedule in-flight RPCs to notice the abort, NB:
+		 * racing with incoming RPCs; complete fix should make test
+		 * RPCs carry session ID in its headers */
+		list_for_each_entry(rpc, &scd->scd_rpc_active, srpc_list) {
+			rpc->srpc_aborted = 1;
+			swi_schedule_workitem(&rpc->srpc_wi);
+		}
+
+		spin_unlock(&scd->scd_lock);
+	}
+}
+
+/*
+ * Begin shutting down service @sv.
+ *
+ * sv_shuttingdown is set while the locks of all CPTs are held.  Then,
+ * per CPT, every active RPC is scheduled so it notices the shutdown and
+ * the MD of every posted request buffer is unlinked.  Completion is
+ * checked separately by srpc_finish_service().
+ */
+void
+srpc_shutdown_service(srpc_service_t *sv)
+{
+	struct srpc_service_cd	*scd;
+	struct srpc_server_rpc	*rpc;
+	srpc_buffer_t		*buf;
+	int			i;
+
+	CDEBUG(D_NET, "Shutting down service: id %d, name %s\n",
+	       sv->sv_id, sv->sv_name);
+
+	/* take the lock of every CPT so the flag flips for all at once */
+	cfs_percpt_for_each(scd, i, sv->sv_cpt_data)
+		spin_lock(&scd->scd_lock);
+
+	sv->sv_shuttingdown = 1; /* i.e. no new active RPC */
+
+	cfs_percpt_for_each(scd, i, sv->sv_cpt_data)
+		spin_unlock(&scd->scd_lock);
+
+	cfs_percpt_for_each(scd, i, sv->sv_cpt_data) {
+		spin_lock(&scd->scd_lock);
+
+		/* schedule in-flight RPCs to notice the shutdown */
+		list_for_each_entry(rpc, &scd->scd_rpc_active, srpc_list)
+			swi_schedule_workitem(&rpc->srpc_wi);
+
+		spin_unlock(&scd->scd_lock);
+
+		/* OK to traverse scd_buf_posted without lock, since no one
+		 * touches scd_buf_posted now */
+		list_for_each_entry(buf, &scd->scd_buf_posted, buf_list)
+			LNetMDUnlink(buf->buf_mdh);
+	}
+}
+
+/*
+ * Send the request message of client RPC @rpc to its destination.
+ *
+ * The request event is armed as SRPC_REQUEST_SENT before posting.
+ * Returns 0 when the request was posted, or -ENOMEM, in which case the
+ * event is marked fired because no event will arrive.
+ */
+int
+srpc_send_request(srpc_client_rpc_t *rpc)
+{
+	srpc_event_t *ev = &rpc->crpc_reqstev;
+	int           rc;
+
+	ev->ev_type  = SRPC_REQUEST_SENT;
+	ev->ev_data  = rpc;
+	ev->ev_fired = 0;
+
+	rc = srpc_post_active_rqtbuf(rpc->crpc_dest, rpc->crpc_service,
+				     &rpc->crpc_reqstmsg, sizeof(srpc_msg_t),
+				     &rpc->crpc_reqstmdh, ev);
+	if (rc == 0)
+		return 0;
+
+	LASSERT(rc == -ENOMEM);
+	ev->ev_fired = 1;  /* no more event expected */
+	return rc;
+}
+
+/*
+ * Post the buffer that will receive the reply of client RPC @rpc.
+ *
+ * A fresh matchbits value is stored in the request's rpyid field and
+ * the reply message buffer is posted under it on SRPC_RDMA_PORTAL as a
+ * PUT target for the destination peer.  The reply event is armed as
+ * SRPC_REPLY_RCVD.
+ *
+ * Returns 0 on success, or -ENOMEM, in which case the event is marked
+ * fired because no event will arrive.
+ */
+int
+srpc_prepare_reply(srpc_client_rpc_t *rpc)
+{
+	srpc_event_t *ev    = &rpc->crpc_replyev;
+	__u64        *rpyid = &rpc->crpc_reqstmsg.msg_body.reqst.rpyid;
+	int           rc;
+
+	ev->ev_type  = SRPC_REPLY_RCVD;
+	ev->ev_data  = rpc;
+	ev->ev_fired = 0;
+
+	*rpyid = srpc_next_id();
+
+	rc = srpc_post_passive_rdma(SRPC_RDMA_PORTAL, 0, *rpyid,
+				    &rpc->crpc_replymsg, sizeof(srpc_msg_t),
+				    LNET_MD_OP_PUT, rpc->crpc_dest,
+				    &rpc->crpc_replymdh, ev);
+	if (rc == 0)
+		return 0;
+
+	LASSERT(rc == -ENOMEM);
+	ev->ev_fired = 1;  /* no more event expected */
+	return rc;
+}
+
+/*
+ * Post the bulk buffer of client RPC @rpc, if it has one.
+ *
+ * With no fragments there is nothing to do and 0 is returned.
+ * Otherwise a fresh matchbits value is stored in the request's bulkid
+ * field and the page vector is posted under it on SRPC_RDMA_PORTAL, as
+ * a PUT target when the client is the sink and as a GET source
+ * otherwise.  The bulk event is armed as SRPC_BULK_REQ_RCVD.
+ *
+ * Returns 0 on success, or -ENOMEM, in which case the event is marked
+ * fired because no event will arrive.
+ */
+int
+srpc_prepare_bulk(srpc_client_rpc_t *rpc)
+{
+	srpc_bulk_t  *bk     = &rpc->crpc_bulk;
+	srpc_event_t *ev     = &rpc->crpc_bulkev;
+	__u64        *bulkid = &rpc->crpc_reqstmsg.msg_body.reqst.bulkid;
+	int           opt;
+	int           rc;
+
+	LASSERT(bk->bk_niov <= LNET_MAX_IOV);
+
+	/* nothing to do */
+	if (bk->bk_niov == 0)
+		return 0;
+
+	if (bk->bk_sink)
+		opt = LNET_MD_OP_PUT | LNET_MD_KIOV;
+	else
+		opt = LNET_MD_OP_GET | LNET_MD_KIOV;
+
+	ev->ev_type  = SRPC_BULK_REQ_RCVD;
+	ev->ev_data  = rpc;
+	ev->ev_fired = 0;
+
+	*bulkid = srpc_next_id();
+
+	rc = srpc_post_passive_rdma(SRPC_RDMA_PORTAL, 0, *bulkid,
+				    &bk->bk_iovs[0], bk->bk_niov, opt,
+				    rpc->crpc_dest, &bk->bk_mdh, ev);
+	if (rc == 0)
+		return 0;
+
+	LASSERT(rc == -ENOMEM);
+	ev->ev_fired = 1;  /* no more event expected */
+	return rc;
+}
+
+/*
+ * Start the bulk transfer of server RPC @rpc.
+ *
+ * The server GETs the data from the peer when its bulk descriptor is a
+ * sink and PUTs it to the peer otherwise, addressing the bulkid carried
+ * in the request on SRPC_RDMA_PORTAL.  The RPC event is armed with the
+ * matching type (SRPC_BULK_GET_RPLD or SRPC_BULK_PUT_SENT).
+ *
+ * Returns the result of srpc_post_active_rdma(); on failure the event is
+ * marked fired because no event will arrive.
+ */
+int
+srpc_do_bulk(srpc_server_rpc_t *rpc)
+{
+	srpc_event_t *ev     = &rpc->srpc_ev;
+	srpc_bulk_t  *bk     = rpc->srpc_bulk;
+	__u64         bulkid = rpc->srpc_reqstbuf->buf_msg.msg_body.reqst.bulkid;
+	int           opt;
+	int           rc;
+
+	LASSERT(bk != NULL);
+
+	ev->ev_fired = 0;
+	ev->ev_data  = rpc;
+
+	if (bk->bk_sink) {
+		opt = LNET_MD_OP_GET | LNET_MD_KIOV;
+		ev->ev_type = SRPC_BULK_GET_RPLD;
+	} else {
+		opt = LNET_MD_OP_PUT | LNET_MD_KIOV;
+		ev->ev_type = SRPC_BULK_PUT_SENT;
+	}
+
+	rc = srpc_post_active_rdma(SRPC_RDMA_PORTAL, bulkid,
+				   &bk->bk_iovs[0], bk->bk_niov, opt,
+				   rpc->srpc_peer, rpc->srpc_self,
+				   &bk->bk_mdh, ev);
+	if (rc != 0)
+		ev->ev_fired = 1;  /* no more event expected */
+
+	return rc;
+}
+
+/*
+ * Finish server RPC @rpc with @status; only called from srpc_handle_rpc.
+ *
+ * Records the status, bumps the dropped-RPC counter on failure, runs
+ * the RPC's done callback if it has one, recycles the request buffer
+ * and takes the RPC off the active list.  The descriptor is then either
+ * reused at once for a buffer waiting on scd_buf_blocked, or put back on
+ * the free list.
+ */
+void
+srpc_server_rpc_done(srpc_server_rpc_t *rpc, int status)
+{
+	struct srpc_service_cd	*scd = rpc->srpc_scd;
+	struct srpc_service	*sv  = scd->scd_svc;
+	srpc_buffer_t		*buffer;
+
+	LASSERT (status != 0 || rpc->srpc_wi.swi_state == SWI_STATE_DONE);
+
+	rpc->srpc_status = status;
+
+	CDEBUG_LIMIT (status == 0 ? D_NET : D_NETERROR,
+		"Server RPC %p done: service %s, peer %s, status %s:%d\n",
+		rpc, sv->sv_name, libcfs_id2str(rpc->srpc_peer),
+		swi_state2str(rpc->srpc_wi.swi_state), status);
+
+	if (status != 0) {
+		spin_lock(&srpc_data.rpc_glock);
+		srpc_data.rpc_counters.rpcs_dropped++;
+		spin_unlock(&srpc_data.rpc_glock);
+	}
+
+	/* the done callback is expected to have released the bulk */
+	if (rpc->srpc_done != NULL)
+		(*rpc->srpc_done) (rpc);
+	LASSERT(rpc->srpc_bulk == NULL);
+
+	spin_lock(&scd->scd_lock);
+
+	if (rpc->srpc_reqstbuf != NULL) {
+		/* NB might drop scd_lock in srpc_service_recycle_buffer, but
+		 * sv won't go away for scd_rpc_active must not be empty */
+		srpc_service_recycle_buffer(scd, rpc->srpc_reqstbuf);
+		rpc->srpc_reqstbuf = NULL;
+	}
+
+	list_del(&rpc->srpc_list); /* from scd->scd_rpc_active */
+
+	/*
+	 * No one can schedule me now since:
+	 * - I'm not on scd_rpc_active.
+	 * - all LNet events have been fired.
+	 * Cancel pending schedules and prevent future schedule attempts:
+	 */
+	LASSERT(rpc->srpc_ev.ev_fired);
+	swi_exit_workitem(&rpc->srpc_wi);
+
+	if (!sv->sv_shuttingdown && !list_empty(&scd->scd_buf_blocked)) {
+		/* a request is waiting for a descriptor: serve it with
+		 * this one right away */
+		buffer = list_entry(scd->scd_buf_blocked.next,
+					srpc_buffer_t, buf_list);
+		list_del(&buffer->buf_list);
+
+		srpc_init_server_rpc(rpc, scd, buffer);
+		list_add_tail(&rpc->srpc_list, &scd->scd_rpc_active);
+		swi_schedule_workitem(&rpc->srpc_wi);
+	} else {
+		list_add(&rpc->srpc_list, &scd->scd_rpc_free);
+	}
+
+	spin_unlock(&scd->scd_lock);
+	return;
+}
+
+/*
+ * Handles an incoming RPC: drives one server-side RPC through the states
+ * SWI_STATE_NEWBORN -> SWI_STATE_BULK_STARTED -> SWI_STATE_REPLY_SUBMITTED
+ * -> SWI_STATE_DONE, keyed on wi->swi_state.
+ *
+ * Returns 1 once srpc_server_rpc_done() has been called for the RPC, and
+ * 0 while the RPC still has to wait (for the bulk event, for the reply
+ * event, or - on the shutdown/abort path - for its pending event to fire).
+ */
+int
+srpc_handle_rpc(swi_workitem_t *wi)
+{
+	struct srpc_server_rpc	*rpc = wi->swi_workitem.wi_data;
+	struct srpc_service_cd	*scd = rpc->srpc_scd;
+	struct srpc_service	*sv = scd->scd_svc;
+	srpc_event_t		*ev = &rpc->srpc_ev;
+	int			rc = 0;
+
+	LASSERT(wi == &rpc->srpc_wi);
+
+	spin_lock(&scd->scd_lock);
+
+	if (sv->sv_shuttingdown || rpc->srpc_aborted) {
+		/* scd_lock only covers the flag check; unlinking happens
+		 * after it has been dropped */
+		spin_unlock(&scd->scd_lock);
+
+		if (rpc->srpc_bulk != NULL)
+			LNetMDUnlink(rpc->srpc_bulk->bk_mdh);
+		LNetMDUnlink(rpc->srpc_replymdh);
+
+		if (ev->ev_fired) { /* no more event, OK to finish */
+			srpc_server_rpc_done(rpc, -ESHUTDOWN);
+			return 1;
+		}
+		return 0;
+	}
+
+	spin_unlock(&scd->scd_lock);
+
+	switch (wi->swi_state) {
+	default:
+		LBUG ();
+	case SWI_STATE_NEWBORN: {
+		srpc_msg_t	   *msg;
+		srpc_generic_reply_t *reply;
+
+		msg = &rpc->srpc_reqstbuf->buf_msg;
+		reply = &rpc->srpc_replymsg.msg_body.reply;
+
+		/* srpc_lnet_ev_handler zeroes msg_magic to flag a request
+		 * that must be dropped */
+		if (msg->msg_magic == 0) {
+			/* moaned already in srpc_lnet_ev_handler */
+			/* NB positive errno here, unlike the negative
+			 * -ESHUTDOWN used above */
+			srpc_server_rpc_done(rpc, EBADMSG);
+			return 1;
+		}
+
+		srpc_unpack_msg_hdr(msg);
+		if (msg->msg_version != SRPC_MSG_VERSION) {
+			CWARN("Version mismatch: %u, %u expected, from %s\n",
+			      msg->msg_version, SRPC_MSG_VERSION,
+			      libcfs_id2str(rpc->srpc_peer));
+			reply->status = EPROTO;
+			/* drop through and send reply */
+		} else {
+			reply->status = 0;
+			rc = (*sv->sv_handler)(rpc);
+			/* a handler that reports an error in the reply must
+			 * not have set up a bulk transfer */
+			LASSERT(reply->status == 0 || !rpc->srpc_bulk);
+			if (rc != 0) {
+				srpc_server_rpc_done(rpc, rc);
+				return 1;
+			}
+		}
+
+		wi->swi_state = SWI_STATE_BULK_STARTED;
+
+		if (rpc->srpc_bulk != NULL) {
+			rc = srpc_do_bulk(rpc);
+			if (rc == 0)
+				return 0; /* wait for bulk */
+
+			/* bulk could not be started: record the failure as
+			 * the event status and handle it in the next state */
+			LASSERT (ev->ev_fired);
+			ev->ev_status = rc;
+		}
+	}
+	/* fall through */
+	case SWI_STATE_BULK_STARTED:
+		LASSERT (rpc->srpc_bulk == NULL || ev->ev_fired);
+
+		if (rpc->srpc_bulk != NULL) {
+			rc = ev->ev_status;
+
+			/* the optional sv_bulk_ready hook may replace the
+			 * bulk status */
+			if (sv->sv_bulk_ready != NULL)
+				rc = (*sv->sv_bulk_ready) (rpc, rc);
+
+			if (rc != 0) {
+				srpc_server_rpc_done(rpc, rc);
+				return 1;
+			}
+		}
+
+		wi->swi_state = SWI_STATE_REPLY_SUBMITTED;
+		rc = srpc_send_reply(rpc);
+		if (rc == 0)
+			return 0; /* wait for reply */
+		srpc_server_rpc_done(rpc, rc);
+		return 1;
+
+	case SWI_STATE_REPLY_SUBMITTED:
+		/* dump the RPC and event before the assertion below fires */
+		if (!ev->ev_fired) {
+			CERROR("RPC %p: bulk %p, service %d\n",
+			       rpc, rpc->srpc_bulk, sv->sv_id);
+			CERROR("Event: status %d, type %d, lnet %d\n",
+			       ev->ev_status, ev->ev_type, ev->ev_lnet);
+			LASSERT (ev->ev_fired);
+		}
+
+		wi->swi_state = SWI_STATE_DONE;
+		srpc_server_rpc_done(rpc, ev->ev_status);
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Expiry callback of a client RPC timer; srpc_add_client_rpc_timer()
+ * installs it as stt_func with the RPC itself as stt_data.
+ *
+ * Logs the expiry, zeroes crpc_timeout and aborts the RPC with -ETIMEDOUT
+ * under crpc_lock, then bumps the global rpcs_expired counter.
+ */
+void
+srpc_client_rpc_expired (void *data)
+{
+	srpc_client_rpc_t *crpc = (srpc_client_rpc_t *)data;
+
+	CWARN ("Client RPC expired: service %d, peer %s, timeout %d.\n",
+	       crpc->crpc_service, libcfs_id2str(crpc->crpc_dest),
+	       crpc->crpc_timeout);
+
+	/* srpc_del_client_rpc_timer() polls crpc_timeout under this lock
+	 * to learn that the handler has run */
+	spin_lock(&crpc->crpc_lock);
+	crpc->crpc_timeout = 0;
+	srpc_abort_rpc(crpc, -ETIMEDOUT);
+	spin_unlock(&crpc->crpc_lock);
+
+	spin_lock(&srpc_data.rpc_glock);
+	srpc_data.rpc_counters.rpcs_expired += 1;
+	spin_unlock(&srpc_data.rpc_glock);
+}
+
+/*
+ * Queue the expiry timer of a client RPC.  Nothing is queued when
+ * crpc_timeout is 0; otherwise the timer is set to expire crpc_timeout
+ * past cfs_time_current_sec() and to run srpc_client_rpc_expired() on
+ * the RPC.
+ */
+inline void
+srpc_add_client_rpc_timer (srpc_client_rpc_t *rpc)
+{
+	stt_timer_t *tm;
+
+	if (rpc->crpc_timeout == 0)
+		return;
+
+	tm = &rpc->crpc_timer;
+
+	INIT_LIST_HEAD(&tm->stt_list);
+	tm->stt_func    = srpc_client_rpc_expired;
+	tm->stt_data    = rpc;
+	tm->stt_expires = cfs_time_add(rpc->crpc_timeout,
+				       cfs_time_current_sec());
+	stt_add_timer(tm);
+}
+
+/*
+ * Stop the expiry timer of a client RPC.
+ *
+ * Called with rpc->crpc_lock held; the lock is dropped and retaken while
+ * waiting for an expiry handler that has already been detonated.
+ *
+ * Upon exit the RPC expiry timer is not queued and the handler is not
+ * running on any CPU. */
+void
+srpc_del_client_rpc_timer (srpc_client_rpc_t *rpc)
+{
+	/* done if the timer was never planted or has already exploded,
+	 * or if it can be defused right now */
+	if (rpc->crpc_timeout == 0 || stt_del_timer(&rpc->crpc_timer))
+		return;
+
+	/* timer detonated: the handler zeroes crpc_timeout under
+	 * crpc_lock, so yield the lock until that has happened */
+	for (;;) {
+		if (rpc->crpc_timeout == 0)
+			break;
+
+		spin_unlock(&rpc->crpc_lock);
+		schedule();
+		spin_lock(&rpc->crpc_lock);
+	}
+}
+
+/*
+ * Finish a client RPC: mark it closed, record @status as its final status
+ * unless one was already set, stop its timer, retire its workitem and then
+ * invoke the crpc_done callback with crpc_lock released.
+ *
+ * A @status of 0 is only legal once the workitem reached SWI_STATE_DONE.
+ */
+void
+srpc_client_rpc_done (srpc_client_rpc_t *rpc, int status)
+{
+	swi_workitem_t *wi = &rpc->crpc_wi;
+
+	LASSERT(status != 0 || wi->swi_state == SWI_STATE_DONE);
+
+	spin_lock(&rpc->crpc_lock);
+
+	/* the first non-zero status recorded wins */
+	if (rpc->crpc_status == 0)
+		rpc->crpc_status = status;
+	rpc->crpc_closed = 1;
+
+	srpc_del_client_rpc_timer(rpc);
+
+	CDEBUG_LIMIT ((status == 0) ? D_NET : D_NETERROR,
+		"Client RPC done: service %d, peer %s, status %s:%d:%d\n",
+		rpc->crpc_service, libcfs_id2str(rpc->crpc_dest),
+		swi_state2str(wi->swi_state), rpc->crpc_aborted, status);
+
+	/*
+	 * Nothing can schedule this RPC any more:
+	 * - its timer has been defused,
+	 * - all of its LNet events have fired,
+	 * - crpc_closed stops srpc_abort_rpc from scheduling it.
+	 * So cancel pending schedules and block future attempts.
+	 */
+	LASSERT (!srpc_event_pending(rpc));
+	swi_exit_workitem(wi);
+
+	spin_unlock(&rpc->crpc_lock);
+
+	rpc->crpc_done(rpc);
+}
+
+/*
+ * Sends an outgoing RPC: drives one client-side RPC through the states
+ * SWI_STATE_NEWBORN -> SWI_STATE_REQUEST_SUBMITTED -> SWI_STATE_REQUEST_SENT
+ * -> SWI_STATE_REPLY_RECEIVED -> SWI_STATE_DONE, keyed on wi->swi_state.
+ *
+ * Returns 1 once srpc_client_rpc_done() has been called for the RPC, and
+ * 0 while it still has to wait for LNet events.  A non-zero rc picked up
+ * on the way aborts the RPC; an aborted RPC has its MDs unlinked and is
+ * finished with -EINTR as soon as no event is pending.
+ */
+int
+srpc_send_rpc (swi_workitem_t *wi)
+{
+	int		rc = 0;
+	srpc_client_rpc_t *rpc;
+	srpc_msg_t	*reply;
+	int		do_bulk;
+
+	LASSERT(wi != NULL);
+
+	rpc = wi->swi_workitem.wi_data;
+
+	LASSERT (rpc != NULL);
+	LASSERT (wi == &rpc->crpc_wi);
+
+	reply = &rpc->crpc_replymsg;
+	do_bulk = rpc->crpc_bulk.bk_niov > 0;
+
+	spin_lock(&rpc->crpc_lock);
+
+	if (rpc->crpc_aborted) {
+		spin_unlock(&rpc->crpc_lock);
+		goto abort;
+	}
+
+	spin_unlock(&rpc->crpc_lock);
+
+	switch (wi->swi_state) {
+	default:
+		LBUG ();
+	case SWI_STATE_NEWBORN:
+		LASSERT (!srpc_event_pending(rpc));
+
+		rc = srpc_prepare_reply(rpc);
+		if (rc != 0) {
+			srpc_client_rpc_done(rpc, rc);
+			return 1;
+		}
+
+		rc = srpc_prepare_bulk(rpc);
+		if (rc != 0) break;
+
+		wi->swi_state = SWI_STATE_REQUEST_SUBMITTED;
+		rc = srpc_send_request(rpc);
+		break;
+
+	case SWI_STATE_REQUEST_SUBMITTED:
+		/* CAVEAT EMPTOR: rqtev, rpyev, and bulkev may come in any
+		 * order; however, they're processed in a strict order:
+		 * rqt, rpy, and bulk. */
+		if (!rpc->crpc_reqstev.ev_fired) break;
+
+		rc = rpc->crpc_reqstev.ev_status;
+		if (rc != 0) break;
+
+		wi->swi_state = SWI_STATE_REQUEST_SENT;
+		/* perhaps more events, fall thru */
+	case SWI_STATE_REQUEST_SENT: {
+		srpc_msg_type_t type = srpc_service2reply(rpc->crpc_service);
+
+		if (!rpc->crpc_replyev.ev_fired) break;
+
+		rc = rpc->crpc_replyev.ev_status;
+		if (rc != 0) break;
+
+		srpc_unpack_msg_hdr(reply);
+		if (reply->msg_type != type ||
+		    (reply->msg_magic != SRPC_MSG_MAGIC &&
+		     reply->msg_magic != __swab32(SRPC_MSG_MAGIC))) {
+			/* SRPC_MSG_MAGIC is an unsigned constant: print it
+			 * with %u so it is comparable with the received
+			 * magic instead of showing up as a negative int */
+			CWARN ("Bad message from %s: type %u (%d expected),"
+			       " magic %u (%u expected).\n",
+			       libcfs_id2str(rpc->crpc_dest),
+			       reply->msg_type, type,
+			       reply->msg_magic, SRPC_MSG_MAGIC);
+			rc = -EBADMSG;
+			break;
+		}
+
+		if (do_bulk && reply->msg_body.reply.status != 0) {
+			CWARN ("Remote error %d at %s, unlink bulk buffer in "
+			       "case peer didn't initiate bulk transfer\n",
+			       reply->msg_body.reply.status,
+			       libcfs_id2str(rpc->crpc_dest));
+			LNetMDUnlink(rpc->crpc_bulk.bk_mdh);
+		}
+
+		wi->swi_state = SWI_STATE_REPLY_RECEIVED;
+	}
+	/* fall through */
+	case SWI_STATE_REPLY_RECEIVED:
+		if (do_bulk && !rpc->crpc_bulkev.ev_fired) break;
+
+		rc = do_bulk ? rpc->crpc_bulkev.ev_status : 0;
+
+		/* Bulk buffer was unlinked due to remote error. Clear error
+		 * since reply buffer still contains valid data.
+		 * NB rpc->crpc_done shouldn't look into bulk data in case of
+		 * remote error. */
+		if (do_bulk && rpc->crpc_bulkev.ev_lnet == LNET_EVENT_UNLINK &&
+		    rpc->crpc_status == 0 && reply->msg_body.reply.status != 0)
+			rc = 0;
+
+		wi->swi_state = SWI_STATE_DONE;
+		srpc_client_rpc_done(rpc, rc);
+		return 1;
+	}
+
+	/* any failure collected above turns into an abort of the RPC */
+	if (rc != 0) {
+		spin_lock(&rpc->crpc_lock);
+		srpc_abort_rpc(rpc, rc);
+		spin_unlock(&rpc->crpc_lock);
+	}
+
+abort:
+	if (rpc->crpc_aborted) {
+		LNetMDUnlink(rpc->crpc_reqstmdh);
+		LNetMDUnlink(rpc->crpc_replymdh);
+		LNetMDUnlink(rpc->crpc_bulk.bk_mdh);
+
+		/* finish only once every outstanding event has fired;
+		 * otherwise return 0 and wait to be scheduled again */
+		if (!srpc_event_pending(rpc)) {
+			srpc_client_rpc_done(rpc, -EINTR);
+			return 1;
+		}
+	}
+	return 0;
+}
+
+/*
+ * Allocate and initialise a client RPC with room for @nbulkiov bulk
+ * iovec entries after the fixed part of the structure.
+ *
+ * Returns the new RPC, or NULL when the allocation fails.  All other
+ * arguments are handed unchanged to srpc_init_client_rpc().
+ */
+srpc_client_rpc_t *
+srpc_create_client_rpc (lnet_process_id_t peer, int service,
+			int nbulkiov, int bulklen,
+			void (*rpc_done)(srpc_client_rpc_t *),
+			void (*rpc_fini)(srpc_client_rpc_t *), void *priv)
+{
+	/* size of the struct up to the end of the last bulk iovec used */
+	const size_t	   alloc_size = offsetof(srpc_client_rpc_t,
+						 crpc_bulk.bk_iovs[nbulkiov]);
+	srpc_client_rpc_t *rpc;
+
+	LIBCFS_ALLOC(rpc, alloc_size);
+	if (rpc != NULL)
+		srpc_init_client_rpc(rpc, peer, service, nbulkiov,
+				     bulklen, rpc_done, rpc_fini, priv);
+
+	return rpc;
+}
+
+/*
+ * Abort a client RPC with the non-zero reason @why: mark it aborted,
+ * record @why as its status and schedule its workitem.  Does nothing if
+ * the RPC is already aborted or already closed.
+ *
+ * Called with rpc->crpc_lock held.
+ */
+void
+srpc_abort_rpc (srpc_client_rpc_t *rpc, int why)
+{
+	LASSERT (why != 0);
+
+	if (rpc->crpc_aborted)	/* already aborted */
+		return;
+	if (rpc->crpc_closed)	/* callback imminent */
+		return;
+
+	CDEBUG (D_NET,
+		"Aborting RPC: service %d, peer %s, state %s, why %d\n",
+		rpc->crpc_service, libcfs_id2str(rpc->crpc_dest),
+		swi_state2str(rpc->crpc_wi.swi_state), why);
+
+	rpc->crpc_status  = why;
+	rpc->crpc_aborted = 1;
+	swi_schedule_workitem(&rpc->crpc_wi);
+}
+
+/*
+ * Post a client RPC: arm its expiry timer and schedule its workitem.
+ * The RPC must not be aborted and the RPC layer must be in
+ * SRPC_STATE_RUNNING.
+ *
+ * Called with rpc->crpc_lock held.
+ */
+void
+srpc_post_rpc (srpc_client_rpc_t *rpc)
+{
+	LASSERT (!rpc->crpc_aborted);
+	LASSERT (srpc_data.rpc_state == SRPC_STATE_RUNNING);
+
+	CDEBUG (D_NET, "Posting RPC: peer %s, service %d, timeout %d\n",
+		libcfs_id2str(rpc->crpc_dest), rpc->crpc_service,
+		rpc->crpc_timeout);
+
+	/* timer first, then hand the RPC to the workitem scheduler */
+	srpc_add_client_rpc_timer(rpc);
+	swi_schedule_workitem(&rpc->crpc_wi);
+}
+
+
+/*
+ * Send the reply message of a server RPC to its peer.
+ *
+ * For a non-framework service that is not shutting down, the request
+ * buffer is reposted first and detached from the RPC.  Returns the result
+ * of srpc_post_active_rdma(); on failure the RPC's event is marked fired
+ * because no event will follow.
+ */
+int
+srpc_send_reply(struct srpc_server_rpc *rpc)
+{
+	struct srpc_service_cd	*scd = rpc->srpc_scd;
+	struct srpc_service	*sv = scd->scd_svc;
+	struct srpc_buffer	*buffer = rpc->srpc_reqstbuf;
+	struct srpc_msg		*reply = &rpc->srpc_replymsg;
+	srpc_event_t		*rpcev = &rpc->srpc_ev;
+	__u64			matchbits;
+	int			rc;
+
+	LASSERT(buffer != NULL);
+	/* read the reply matchbits before the buffer may be reposted */
+	matchbits = buffer->buf_msg.msg_body.reqst.rpyid;
+
+	spin_lock(&scd->scd_lock);
+
+	if (!sv->sv_shuttingdown && !srpc_serv_is_framework(sv)) {
+		/* Repost buffer before replying since test client
+		 * might send me another RPC once it gets the reply */
+		if (srpc_service_post_buffer(scd, buffer) != 0)
+			CWARN("Failed to repost %s buffer\n", sv->sv_name);
+		rpc->srpc_reqstbuf = NULL;
+	}
+
+	spin_unlock(&scd->scd_lock);
+
+	reply->msg_magic   = SRPC_MSG_MAGIC;
+	reply->msg_version = SRPC_MSG_VERSION;
+	reply->msg_type    = srpc_service2reply(sv->sv_id);
+
+	rpcev->ev_fired = 0;
+	rpcev->ev_data  = rpc;
+	rpcev->ev_type  = SRPC_REPLY_SENT;
+
+	rc = srpc_post_active_rdma(SRPC_RDMA_PORTAL, matchbits, reply,
+				   sizeof(*reply), LNET_MD_OP_PUT,
+				   rpc->srpc_peer, rpc->srpc_self,
+				   &rpc->srpc_replymdh, rpcev);
+	if (rc == 0)
+		return 0;
+
+	rpcev->ev_fired = 1;  /* no more event expected */
+	return rc;
+}
+
+/*
+ * LNet event callback of the selftest RPC layer (passed to LNetEQAlloc()
+ * by srpc_startup()).  ev->md.user_ptr is the srpc_event_t the event
+ * belongs to; its ev_type selects the handling:
+ *  - SRPC_REQUEST_SENT, SRPC_REPLY_RCVD, SRPC_BULK_REQ_RCVD: client RPC
+ *    events; mark the event fired and schedule the client workitem.
+ *  - SRPC_REQUEST_RCVD: a posted service buffer received a request (or
+ *    was unlinked); bind it to a free server RPC or park it on
+ *    scd_buf_blocked.
+ *  - SRPC_BULK_GET_RPLD, SRPC_BULK_PUT_SENT, SRPC_REPLY_SENT: server RPC
+ *    events; mark the event fired and schedule the server workitem.
+ *
+ * when in kernel always called with LNET_LOCK() held, and in thread context
+ */
+void
+srpc_lnet_ev_handler(lnet_event_t *ev)
+{
+	struct srpc_service_cd	*scd;
+	srpc_event_t      *rpcev = ev->md.user_ptr;
+	srpc_client_rpc_t *crpc;
+	srpc_server_rpc_t *srpc;
+	srpc_buffer_t     *buffer;
+	srpc_service_t    *sv;
+	srpc_msg_t	*msg;
+	srpc_msg_type_t    type;
+
+	LASSERT (!in_interrupt());
+
+	/* every failed LNet event is counted, whatever its type */
+	if (ev->status != 0) {
+		spin_lock(&srpc_data.rpc_glock);
+		srpc_data.rpc_counters.errors++;
+		spin_unlock(&srpc_data.rpc_glock);
+	}
+
+	rpcev->ev_lnet = ev->type;
+
+	switch (rpcev->ev_type) {
+	default:
+		CERROR("Unknown event: status %d, type %d, lnet %d\n",
+		       rpcev->ev_status, rpcev->ev_type, rpcev->ev_lnet);
+		LBUG ();
+	case SRPC_REQUEST_SENT:
+		if (ev->status == 0 && ev->type != LNET_EVENT_UNLINK) {
+			spin_lock(&srpc_data.rpc_glock);
+			srpc_data.rpc_counters.rpcs_sent++;
+			spin_unlock(&srpc_data.rpc_glock);
+		}
+		/* fall through */
+	case SRPC_REPLY_RCVD:
+	case SRPC_BULK_REQ_RCVD:
+		crpc = rpcev->ev_data;
+
+		/* the event must be one of the three embedded in the RPC */
+		if (rpcev != &crpc->crpc_reqstev &&
+		    rpcev != &crpc->crpc_replyev &&
+		    rpcev != &crpc->crpc_bulkev) {
+			CERROR("rpcev %p, crpc %p, reqstev %p, replyev %p, bulkev %p\n",
+			       rpcev, crpc, &crpc->crpc_reqstev,
+			       &crpc->crpc_replyev, &crpc->crpc_bulkev);
+			CERROR("Bad event: status %d, type %d, lnet %d\n",
+			       rpcev->ev_status, rpcev->ev_type, rpcev->ev_lnet);
+			LBUG ();
+		}
+
+		spin_lock(&crpc->crpc_lock);
+
+		LASSERT(rpcev->ev_fired == 0);
+		rpcev->ev_fired  = 1;
+		/* an unlinked MD is reported to the RPC as -EINTR */
+		rpcev->ev_status = (ev->type == LNET_EVENT_UNLINK) ?
+						-EINTR : ev->status;
+		swi_schedule_workitem(&crpc->crpc_wi);
+
+		spin_unlock(&crpc->crpc_lock);
+		break;
+
+	case SRPC_REQUEST_RCVD:
+		scd = rpcev->ev_data;
+		sv = scd->scd_svc;
+
+		LASSERT(rpcev == &scd->scd_ev);
+
+		spin_lock(&scd->scd_lock);
+
+		LASSERT (ev->unlinked);
+		LASSERT (ev->type == LNET_EVENT_PUT ||
+			 ev->type == LNET_EVENT_UNLINK);
+		LASSERT (ev->type != LNET_EVENT_UNLINK ||
+			 sv->sv_shuttingdown);
+
+		/* the MD starts at the buf_msg member of the posted buffer */
+		buffer = container_of(ev->md.start, srpc_buffer_t, buf_msg);
+		buffer->buf_peer = ev->initiator;
+		buffer->buf_self = ev->target.nid;
+
+		LASSERT(scd->scd_buf_nposted > 0);
+		scd->scd_buf_nposted--;
+
+		if (sv->sv_shuttingdown) {
+			/* Leave buffer on scd->scd_buf_posted since
+			 * srpc_finish_service needs to traverse it. */
+			spin_unlock(&scd->scd_lock);
+			break;
+		}
+
+		if (scd->scd_buf_err_stamp != 0 &&
+		    scd->scd_buf_err_stamp < cfs_time_current_sec()) {
+			/* re-enable adding buffer */
+			scd->scd_buf_err_stamp = 0;
+			scd->scd_buf_err = 0;
+		}
+
+		/* running low on posted buffers: ask the buffer workitem
+		 * for more, unless an adjustment is already requested */
+		if (scd->scd_buf_err == 0 && /* adding buffer is enabled */
+		    scd->scd_buf_adjust == 0 &&
+		    scd->scd_buf_nposted < scd->scd_buf_low) {
+			scd->scd_buf_adjust = MAX(scd->scd_buf_total / 2,
+						  SFW_TEST_WI_MIN);
+			swi_schedule_workitem(&scd->scd_buf_wi);
+		}
+
+		list_del(&buffer->buf_list); /* from scd->scd_buf_posted */
+		msg = &buffer->buf_msg;
+		type = srpc_service2request(sv->sv_id);
+
+		/* type and magic are accepted in either byte order */
+		if (ev->status != 0 || ev->mlength != sizeof(*msg) ||
+		    (msg->msg_type != type &&
+		     msg->msg_type != __swab32(type)) ||
+		    (msg->msg_magic != SRPC_MSG_MAGIC &&
+		     msg->msg_magic != __swab32(SRPC_MSG_MAGIC))) {
+			CERROR ("Dropping RPC (%s) from %s: "
+				"status %d mlength %d type %u magic %u.\n",
+				sv->sv_name, libcfs_id2str(ev->initiator),
+				ev->status, ev->mlength,
+				msg->msg_type, msg->msg_magic);
+
+			/* NB can't call srpc_service_recycle_buffer here since
+			 * it may call LNetM[DE]Attach. The invalid magic tells
+			 * srpc_handle_rpc to drop this RPC */
+			msg->msg_magic = 0;
+		}
+
+		/* serve the request with a free server RPC if there is one,
+		 * else queue the buffer until an RPC is released */
+		if (!list_empty(&scd->scd_rpc_free)) {
+			srpc = list_entry(scd->scd_rpc_free.next,
+					      struct srpc_server_rpc,
+					      srpc_list);
+			list_del(&srpc->srpc_list);
+
+			srpc_init_server_rpc(srpc, scd, buffer);
+			list_add_tail(&srpc->srpc_list,
+					  &scd->scd_rpc_active);
+			swi_schedule_workitem(&srpc->srpc_wi);
+		} else {
+			list_add_tail(&buffer->buf_list,
+					  &scd->scd_buf_blocked);
+		}
+
+		spin_unlock(&scd->scd_lock);
+
+		spin_lock(&srpc_data.rpc_glock);
+		srpc_data.rpc_counters.rpcs_rcvd++;
+		spin_unlock(&srpc_data.rpc_glock);
+		break;
+
+	case SRPC_BULK_GET_RPLD:
+		LASSERT (ev->type == LNET_EVENT_SEND ||
+			 ev->type == LNET_EVENT_REPLY ||
+			 ev->type == LNET_EVENT_UNLINK);
+
+		if (!ev->unlinked)
+			break; /* wait for final event */
+
+		/* fall through */
+	case SRPC_BULK_PUT_SENT:
+		/* account the bytes moved by a successful bulk transfer */
+		if (ev->status == 0 && ev->type != LNET_EVENT_UNLINK) {
+			spin_lock(&srpc_data.rpc_glock);
+
+			if (rpcev->ev_type == SRPC_BULK_GET_RPLD)
+				srpc_data.rpc_counters.bulk_get += ev->mlength;
+			else
+				srpc_data.rpc_counters.bulk_put += ev->mlength;
+
+			spin_unlock(&srpc_data.rpc_glock);
+		}
+		/* fall through */
+	case SRPC_REPLY_SENT:
+		srpc = rpcev->ev_data;
+		scd  = srpc->srpc_scd;
+
+		LASSERT(rpcev == &srpc->srpc_ev);
+
+		spin_lock(&scd->scd_lock);
+
+		rpcev->ev_fired  = 1;
+		/* an unlinked MD is reported to the RPC as -EINTR */
+		rpcev->ev_status = (ev->type == LNET_EVENT_UNLINK) ?
+				   -EINTR : ev->status;
+		swi_schedule_workitem(&srpc->srpc_wi);
+
+		spin_unlock(&scd->scd_lock);
+		break;
+	}
+}
+
+
+/*
+ * Bring up the selftest RPC layer: reset srpc_data, initialise LNet,
+ * allocate the event queue served by srpc_lnet_ev_handler(), make the two
+ * request portals lazy and start the timer service.
+ *
+ * srpc_data.rpc_state records how far initialisation got.  Failures after
+ * LNetNIInit() are undone through srpc_shutdown().  Returns 0 on success,
+ * otherwise the error of the step that failed.
+ */
+int
+srpc_startup (void)
+{
+	int rc;
+
+	memset(&srpc_data, 0, sizeof(struct smoketest_rpc));
+	spin_lock_init(&srpc_data.rpc_glock);
+
+	/* 1 second pause to avoid timestamp reuse */
+	cfs_pause(cfs_time_seconds(1));
+	srpc_data.rpc_matchbits = ((__u64) cfs_time_current_sec()) << 48;
+
+	srpc_data.rpc_state = SRPC_STATE_NONE;
+
+	rc = LNetNIInit(LUSTRE_SRV_LNET_PID);
+	if (rc < 0) {
+		/* nothing to undo yet */
+		CERROR ("LNetNIInit() has failed: %d\n", rc);
+		return rc;
+	}
+
+	srpc_data.rpc_state = SRPC_STATE_NI_INIT;
+
+	LNetInvalidateHandle(&srpc_data.rpc_lnet_eq);
+	rc = LNetEQAlloc(0, srpc_lnet_ev_handler, &srpc_data.rpc_lnet_eq);
+	if (rc != 0) {
+		CERROR("LNetEQAlloc() has failed: %d\n", rc);
+		srpc_shutdown();
+		return rc;
+	}
+
+	rc = LNetSetLazyPortal(SRPC_FRAMEWORK_REQUEST_PORTAL);
+	LASSERT(rc == 0);
+	rc = LNetSetLazyPortal(SRPC_REQUEST_PORTAL);
+	LASSERT(rc == 0);
+
+	srpc_data.rpc_state = SRPC_STATE_EQ_INIT;
+
+	rc = stt_startup();
+	if (rc != 0) {
+		srpc_shutdown();
+		return rc;
+	}
+
+	srpc_data.rpc_state = SRPC_STATE_RUNNING;
+	return 0;
+}
+
+/*
+ * Tear down the selftest RPC layer.  Teardown starts at the stage recorded
+ * in srpc_data.rpc_state and deliberately falls through the earlier
+ * stages, so it undoes exactly what srpc_startup() had completed:
+ *  - SRPC_STATE_RUNNING: check that no service is registered, stop timers
+ *  - SRPC_STATE_EQ_INIT: clear both lazy portals and free the event queue
+ *  - SRPC_STATE_NI_INIT: finalise LNet
+ */
+void
+srpc_shutdown (void)
+{
+	int i;
+	int rc;
+	int state;
+
+	state = srpc_data.rpc_state;
+	srpc_data.rpc_state = SRPC_STATE_STOPPING;
+
+	switch (state) {
+	default:
+		LBUG ();
+	case SRPC_STATE_RUNNING:
+		spin_lock(&srpc_data.rpc_glock);
+
+		for (i = 0; i <= SRPC_SERVICE_MAX_ID; i++) {
+			srpc_service_t *sv = srpc_data.rpc_services[i];
+
+			LASSERTF (sv == NULL,
+				  "service not empty: id %d, name %s\n",
+				  i, sv->sv_name);
+		}
+
+		spin_unlock(&srpc_data.rpc_glock);
+
+		stt_shutdown();
+
+		/* fall through */
+	case SRPC_STATE_EQ_INIT:
+		/* check each portal on its own: the first result used to
+		 * be overwritten by the second call without being looked at */
+		rc = LNetClearLazyPortal(SRPC_FRAMEWORK_REQUEST_PORTAL);
+		LASSERT (rc == 0);
+		rc = LNetClearLazyPortal(SRPC_REQUEST_PORTAL);
+		LASSERT (rc == 0);
+		rc = LNetEQFree(srpc_data.rpc_lnet_eq);
+		LASSERT (rc == 0); /* the EQ should have no user by now */
+
+		/* fall through */
+	case SRPC_STATE_NI_INIT:
+		LNetNIFini();
+	}
+}

diff --git a/drivers/staging/lustre/lnet/selftest/rpc.h b/drivers/staging/lustre/lnet/selftest/rpc.h
new file mode 100644
index 0000000..b905d49
--- /dev/null
+++ b/drivers/staging/lustre/lnet/selftest/rpc.h

@@ -0,0 +1,302 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __SELFTEST_RPC_H__
+#define __SELFTEST_RPC_H__
+
+#include <linux/lnet/lnetst.h>
+
+/*
+ * LST wired structures
+ *
+ * XXX: *REPLY == *REQST + 1
+ *
+ * Message types come in request/reply pairs: every request has an even
+ * value and its reply is the next (odd) value, as the note above requires.
+ * Any new type has to keep that pairing.
+ */
+typedef enum {
+	SRPC_MSG_MKSN_REQST     = 0,
+	SRPC_MSG_MKSN_REPLY     = 1,
+	SRPC_MSG_RMSN_REQST     = 2,
+	SRPC_MSG_RMSN_REPLY     = 3,
+	SRPC_MSG_BATCH_REQST    = 4,
+	SRPC_MSG_BATCH_REPLY    = 5,
+	SRPC_MSG_STAT_REQST     = 6,
+	SRPC_MSG_STAT_REPLY     = 7,
+	SRPC_MSG_TEST_REQST     = 8,
+	SRPC_MSG_TEST_REPLY     = 9,
+	SRPC_MSG_DEBUG_REQST    = 10,
+	SRPC_MSG_DEBUG_REPLY    = 11,
+	SRPC_MSG_BRW_REQST      = 12,
+	SRPC_MSG_BRW_REPLY      = 13,
+	SRPC_MSG_PING_REQST     = 14,
+	SRPC_MSG_PING_REPLY     = 15,
+	SRPC_MSG_JOIN_REQST     = 16,
+	SRPC_MSG_JOIN_REPLY     = 17,
+} srpc_msg_type_t;
+
+
+/* CAVEAT EMPTOR:
+ * All srpc_*_reqst_t's 1st field must be matchbits of reply buffer,
+ * and 2nd field matchbits of bulk buffer if any.
+ *
+ * All srpc_*_reply_t's 1st field must be a __u32 status, and 2nd field
+ * session id if needed.
+ */
+/* Common leading fields of a request body, per the rule above. */
+typedef struct {
+	__u64			rpyid;		/* reply buffer matchbits */
+	__u64			bulkid;		/* bulk buffer matchbits */
+} WIRE_ATTR srpc_generic_reqst_t;
+
+/* Common leading fields of a reply body, per the rule above. */
+typedef struct {
+	__u32		   status;	/* reply status */
+	lst_sid_t	       sid;	/* session id */
+} WIRE_ATTR srpc_generic_reply_t;
+
+/* FRAMEWORK RPCs */
+/* Body of a make-session request (SRPC_MSG_MKSN_REQST). */
+typedef struct {
+	__u64			mksn_rpyid;      /* reply buffer matchbits */
+	lst_sid_t	       mksn_sid;	/* session id */
+	__u32			mksn_force;      /* use brute force */
+	char			mksn_name[LST_NAME_SIZE];
+} WIRE_ATTR srpc_mksn_reqst_t;			/* make session request */
+
+/* Body of a make-session reply (SRPC_MSG_MKSN_REPLY). */
+typedef struct {
+	__u32		   mksn_status;      /* session status */
+	lst_sid_t	       mksn_sid;	 /* session id */
+	__u32		   mksn_timeout;     /* session timeout */
+	char			mksn_name[LST_NAME_SIZE];
+} WIRE_ATTR srpc_mksn_reply_t; /* make session reply */
+
+/* Body of a remove-session request (SRPC_MSG_RMSN_REQST). */
+typedef struct {
+	__u64			rmsn_rpyid;      /* reply buffer matchbits */
+	lst_sid_t		rmsn_sid;	/* session id */
+} WIRE_ATTR srpc_rmsn_reqst_t; /* remove session request */
+
+/* Body of a remove-session reply (SRPC_MSG_RMSN_REPLY). */
+typedef struct {
+	__u32			rmsn_status;	/* reply status */
+	lst_sid_t		rmsn_sid;	/* session id */
+} WIRE_ATTR srpc_rmsn_reply_t; /* remove session reply */
+
+/* Body of a join request (SRPC_MSG_JOIN_REQST). */
+typedef struct {
+	__u64			join_rpyid;     /* reply buffer matchbits */
+	lst_sid_t	       join_sid;       /* session id to join */
+	char		    join_group[LST_NAME_SIZE]; /* group name */
+} WIRE_ATTR srpc_join_reqst_t;
+
+/* Body of a join reply (SRPC_MSG_JOIN_REPLY). */
+typedef struct {
+	__u32		   join_status;    /* returned status */
+	lst_sid_t	       join_sid;       /* session id */
+	__u32			join_timeout;   /* # seconds' inactivity to expire */
+	char		    join_session[LST_NAME_SIZE]; /* session name */
+} WIRE_ATTR srpc_join_reply_t;
+
+/* Body of a debug request (SRPC_MSG_DEBUG_REQST). */
+typedef struct {
+	__u64		   dbg_rpyid;      /* reply buffer matchbits */
+	lst_sid_t	       dbg_sid;	/* session id */
+	__u32		   dbg_flags;      /* bitmap of debug */
+} WIRE_ATTR srpc_debug_reqst_t;
+
+/* Body of a debug reply (SRPC_MSG_DEBUG_REPLY). */
+typedef struct {
+	__u32		   dbg_status;     /* returned code */
+	lst_sid_t	       dbg_sid;	/* session id */
+	__u32		   dbg_timeout;    /* session timeout */
+	__u32		   dbg_nbatch;     /* # of batches in the node */
+	char		    dbg_name[LST_NAME_SIZE]; /* session name */
+} WIRE_ATTR srpc_debug_reply_t;
+
+/* Batch operation codes; presumably the values carried in bar_opc of
+ * srpc_batch_reqst_t - confirm against the batch handler. */
+#define SRPC_BATCH_OPC_RUN      1
+#define SRPC_BATCH_OPC_STOP     2
+#define SRPC_BATCH_OPC_QUERY    3
+
+/* Body of a batch request (SRPC_MSG_BATCH_REQST). */
+typedef struct {
+	__u64		   bar_rpyid;      /* reply buffer matchbits */
+	lst_sid_t	       bar_sid;	/* session id */
+	lst_bid_t	       bar_bid;	/* batch id */
+	__u32		   bar_opc;	/* create/start/stop batch */
+	__u32		   bar_testidx;    /* index of test */
+	__u32		   bar_arg;	/* parameters */
+} WIRE_ATTR srpc_batch_reqst_t;
+
+/* Body of a batch reply (SRPC_MSG_BATCH_REPLY). */
+typedef struct {
+	__u32		   bar_status;     /* status of request */
+	lst_sid_t	       bar_sid;	/* session id */
+	__u32		   bar_active;     /* # of active tests in batch/test */
+	__u32		   bar_time;       /* remained time */
+} WIRE_ATTR srpc_batch_reply_t;
+
+/* Body of a statistics request (SRPC_MSG_STAT_REQST). */
+typedef struct {
+	__u64		   str_rpyid;      /* reply buffer matchbits */
+	lst_sid_t	       str_sid;	/* session id */
+	__u32		   str_type;       /* type of stat */
+} WIRE_ATTR srpc_stat_reqst_t;
+
+/* Body of a statistics reply (SRPC_MSG_STAT_REPLY): status and session
+ * id followed by three counter blocks. */
+typedef struct {
+	__u32		   str_status;	/* reply status */
+	lst_sid_t	       str_sid;	/* session id */
+	sfw_counters_t	  str_fw;	/* sfw_counters_t block */
+	srpc_counters_t	 str_rpc;	/* srpc_counters_t block */
+	lnet_counters_t	 str_lnet;	/* lnet_counters_t block */
+} WIRE_ATTR srpc_stat_reply_t;
+
+/* Bulk test parameters; member bulk_v0 of the tsr_u union in
+ * srpc_test_reqst_t. */
+typedef struct {
+	__u32		   blk_opc;	/* bulk operation code */
+	__u32		   blk_npg;	/* # of pages */
+	__u32		   blk_flags;      /* reserved flags */
+} WIRE_ATTR test_bulk_req_t;
+
+/* Bulk test parameters; member bulk_v1 of the tsr_u union in
+ * srpc_test_reqst_t. */
+typedef struct {
+	/** bulk operation code */
+	__u16			blk_opc;
+	/** data check flags */
+	__u16			blk_flags;
+	/** data length */
+	__u32			blk_len;
+	/** reserved: offset */
+	__u32		   blk_offset;
+} WIRE_ATTR test_bulk_req_v1_t;
+
+/* Ping test parameters; member ping of the tsr_u union in
+ * srpc_test_reqst_t. */
+typedef struct {
+	__u32			png_size;       /* size of ping message */
+	__u32			png_flags;      /* reserved flags */
+} WIRE_ATTR test_ping_req_t;
+
+/* Body of a test request (SRPC_MSG_TEST_REQST): common test settings
+ * followed by a union of per-test parameters. */
+typedef struct {
+	__u64			tsr_rpyid;      /* reply buffer matchbits */
+	__u64			tsr_bulkid;     /* bulk buffer matchbits */
+	lst_sid_t		tsr_sid;	/* session id */
+	lst_bid_t		tsr_bid;	/* batch id */
+	__u32			tsr_service;    /* test type: bulk|ping|... */
+	/* test client loop count or # server buffers needed */
+	__u32			tsr_loop;
+	__u32			tsr_concur;     /* concurrency of test */
+	__u8			tsr_is_client;  /* is test client or not */
+	__u8			tsr_stop_onerr; /* stop on error */
+	__u32			tsr_ndest;      /* # of dest nodes */
+
+	/* per-test parameters; which member applies is not visible here -
+	 * presumably selected by tsr_service, TODO confirm */
+	union {
+		test_ping_req_t		ping;
+		test_bulk_req_t		bulk_v0;
+		test_bulk_req_v1_t	bulk_v1;
+	}		tsr_u;
+} WIRE_ATTR srpc_test_reqst_t;
+
+/* Body of a test reply (SRPC_MSG_TEST_REPLY). */
+typedef struct {
+	__u32			tsr_status;     /* returned code */
+	lst_sid_t		tsr_sid;	/* session id */
+} WIRE_ATTR srpc_test_reply_t;
+
+/* TEST RPCs */
+/* Body of a ping request (SRPC_MSG_PING_REQST). */
+typedef struct {
+	__u64		   pnr_rpyid;	/* reply buffer matchbits */
+	__u32		   pnr_magic;
+	__u32		   pnr_seq;
+	__u64		   pnr_time_sec;
+	__u64		   pnr_time_usec;
+} WIRE_ATTR srpc_ping_reqst_t;
+
+/* Body of a ping reply (SRPC_MSG_PING_REPLY). */
+typedef struct {
+	__u32		   pnr_status;	/* reply status */
+	__u32		   pnr_magic;
+	__u32		   pnr_seq;
+} WIRE_ATTR srpc_ping_reply_t;
+
+/* Body of a bulk read/write request (SRPC_MSG_BRW_REQST). */
+typedef struct {
+	__u64		   brw_rpyid;      /* reply buffer matchbits */
+	__u64		   brw_bulkid;     /* bulk buffer matchbits */
+	__u32		   brw_rw;	 /* read or write */
+	__u32		   brw_len;	/* bulk data len */
+	__u32		   brw_flags;      /* bulk data patterns */
+} WIRE_ATTR srpc_brw_reqst_t; /* bulk r/w request */
+
+/* Body of a bulk read/write reply (SRPC_MSG_BRW_REPLY). */
+typedef struct {
+	__u32		   brw_status;	/* reply status */
+} WIRE_ATTR srpc_brw_reply_t; /* bulk r/w reply */
+
+/* Magic and version stamped into every message header (msg_magic and
+ * msg_version of srpc_msg_t). */
+#define SRPC_MSG_MAGIC		  0xeeb0f00d
+#define SRPC_MSG_VERSION		1
+
+/*
+ * Wire message: a fixed header followed by a union holding the body of
+ * whichever request or reply msg_type names.
+ */
+typedef struct srpc_msg {
+	/** magic number */
+	__u32	msg_magic;
+	/** message version number */
+	__u32	msg_version;
+	/** type of message body: srpc_msg_type_t */
+	__u32	msg_type;
+	__u32	msg_reserved0;
+	__u32	msg_reserved1;
+	/** test session features */
+	__u32	msg_ses_feats;
+	union {
+		/* generic views of the common leading fields */
+		srpc_generic_reqst_t reqst;
+		srpc_generic_reply_t reply;
+
+		srpc_mksn_reqst_t    mksn_reqst;
+		srpc_mksn_reply_t    mksn_reply;
+		srpc_rmsn_reqst_t    rmsn_reqst;
+		srpc_rmsn_reply_t    rmsn_reply;
+		srpc_debug_reqst_t   dbg_reqst;
+		srpc_debug_reply_t   dbg_reply;
+		srpc_batch_reqst_t   bat_reqst;
+		srpc_batch_reply_t   bat_reply;
+		srpc_stat_reqst_t    stat_reqst;
+		srpc_stat_reply_t    stat_reply;
+		srpc_test_reqst_t    tes_reqst;
+		srpc_test_reply_t    tes_reply;
+		srpc_join_reqst_t    join_reqst;
+		srpc_join_reply_t    join_reply;
+
+		srpc_ping_reqst_t    ping_reqst;
+		srpc_ping_reply_t    ping_reply;
+		srpc_brw_reqst_t     brw_reqst;
+		srpc_brw_reply_t     brw_reply;
+	}     msg_body;
+} WIRE_ATTR srpc_msg_t;
+
+/*
+ * Byte-swap the header of a received message in place when its magic
+ * differs from SRPC_MSG_MAGIC.  msg_magic itself is left untouched, and
+ * so is the message body.
+ */
+static inline void
+srpc_unpack_msg_hdr(srpc_msg_t *msg)
+{
+	if (msg->msg_magic != SRPC_MSG_MAGIC) {
+		/* The magic number is deliberately not swapped: it is
+		   needed later to decide whether the body must be
+		   swapped as well. */
+		__swab32s(&msg->msg_version);
+		__swab32s(&msg->msg_type);
+		__swab32s(&msg->msg_reserved0);
+		__swab32s(&msg->msg_reserved1);
+		__swab32s(&msg->msg_ses_feats);
+	}
+}
+
+#endif /* __SELFTEST_RPC_H__ */

diff --git a/drivers/staging/lustre/lnet/selftest/selftest.h b/drivers/staging/lustre/lnet/selftest/selftest.h
new file mode 100644
index 0000000..8053b05
--- /dev/null
+++ b/drivers/staging/lustre/lnet/selftest/selftest.h

@@ -0,0 +1,611 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ * copy of GPLv2].
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/selftest.h
+ *
+ * Author: Isaac Huang <isaac@clusterfs.com>
+ */
+#ifndef __SELFTEST_SELFTEST_H__
+#define __SELFTEST_SELFTEST_H__
+
+#define LNET_ONLY
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/lnet.h>
+#include <linux/lnet/lib-lnet.h>
+#include <linux/lnet/lib-types.h>
+#include <linux/lnet/lnetst.h>
+
+#include "rpc.h"
+#include "timer.h"
+
+#ifndef MADE_WITHOUT_COMPROMISE
+#define MADE_WITHOUT_COMPROMISE
+#endif
+
+
+#define SWI_STATE_NEWBORN		  0	/* swi_workitem_t::swi_state values; NEWBORN is set by swi_init_workitem() */
+#define SWI_STATE_REPLY_SUBMITTED	  1
+#define SWI_STATE_REPLY_SENT	       2
+#define SWI_STATE_REQUEST_SUBMITTED	3
+#define SWI_STATE_REQUEST_SENT	     4
+#define SWI_STATE_REPLY_RECEIVED	   5
+#define SWI_STATE_BULK_STARTED	     6
+#define SWI_STATE_DONE		     10	/* all states are named by swi_state2str() */
+
+/* forward refs */
+struct srpc_service;
+struct srpc_service_cd;
+struct sfw_test_unit;
+struct sfw_test_instance;
+
+/* services below SRPC_FRAMEWORK_SERVICE_MAX_ID are framework
+ * services, e.g. create/modify session; srpc_serv_is_framework()
+ * tests exactly sv_id < SRPC_FRAMEWORK_SERVICE_MAX_ID.
+ */
+#define SRPC_SERVICE_DEBUG	      0
+#define SRPC_SERVICE_MAKE_SESSION       1
+#define SRPC_SERVICE_REMOVE_SESSION     2
+#define SRPC_SERVICE_BATCH	      3
+#define SRPC_SERVICE_TEST	       4
+#define SRPC_SERVICE_QUERY_STAT	 5
+#define SRPC_SERVICE_JOIN	       6
+#define SRPC_FRAMEWORK_SERVICE_MAX_ID   10
+/* other services start from SRPC_FRAMEWORK_SERVICE_MAX_ID+1 */
+#define SRPC_SERVICE_BRW		11
+#define SRPC_SERVICE_PING	       12
+#define SRPC_SERVICE_MAX_ID	     12	/* largest service id defined in this list */
+
+#define SRPC_REQUEST_PORTAL	     50	/* NOTE(review): presumably for non-framework (test) requests - confirm */
+/* a lazy portal for framework RPC requests */
+#define SRPC_FRAMEWORK_REQUEST_PORTAL   51
+/* all reply/bulk RDMAs go to this portal */
+#define SRPC_RDMA_PORTAL		52
+
+/*
+ * Translate a service id (SRPC_SERVICE_*) into the message type used for
+ * requests to that service. An id that is not listed here is a caller
+ * bug and is reported through LBUG().
+ */
+static inline srpc_msg_type_t
+srpc_service2request (int service)
+{
+	switch (service) {
+	case SRPC_SERVICE_DEBUG:
+		return SRPC_MSG_DEBUG_REQST;
+	case SRPC_SERVICE_MAKE_SESSION:
+		return SRPC_MSG_MKSN_REQST;
+	case SRPC_SERVICE_REMOVE_SESSION:
+		return SRPC_MSG_RMSN_REQST;
+	case SRPC_SERVICE_BATCH:
+		return SRPC_MSG_BATCH_REQST;
+	case SRPC_SERVICE_TEST:
+		return SRPC_MSG_TEST_REQST;
+	case SRPC_SERVICE_QUERY_STAT:
+		return SRPC_MSG_STAT_REQST;
+	case SRPC_SERVICE_JOIN:
+		return SRPC_MSG_JOIN_REQST;
+	case SRPC_SERVICE_BRW:
+		return SRPC_MSG_BRW_REQST;
+	case SRPC_SERVICE_PING:
+		return SRPC_MSG_PING_REQST;
+	default:
+		break;
+	}
+
+	/* unknown service id */
+	LBUG ();
+	/* only reached if LBUG() were to return; same value the debug
+	 * service maps to */
+	return SRPC_MSG_DEBUG_REQST;
+}
+
+/*
+ * Message type used for replies of \a service, computed as the request
+ * type returned by srpc_service2request() plus one.
+ */
+static inline srpc_msg_type_t
+srpc_service2reply (int service)
+{
+	srpc_msg_type_t reqst_type = srpc_service2request(service);
+
+	return reqst_type + 1;
+}
+
+typedef enum {
+	SRPC_BULK_REQ_RCVD   = 1, /* passive bulk request(PUT sink/GET source) received */
+	SRPC_BULK_PUT_SENT   = 2, /* active bulk PUT sent (source) */
+	SRPC_BULK_GET_RPLD   = 3, /* active bulk GET replied (sink) */
+	SRPC_REPLY_RCVD      = 4, /* incoming reply received */
+	SRPC_REPLY_SENT      = 5, /* outgoing reply sent */
+	SRPC_REQUEST_RCVD    = 6, /* incoming request received */
+	SRPC_REQUEST_SENT    = 7, /* outgoing request sent */
+} srpc_event_type_t; /* kind of RPC event, kept in srpc_event_t::ev_type */
+
+/* RPC event: one per request, reply or bulk transfer of an RPC */
+typedef struct {
+	srpc_event_type_t ev_type;   /* what's up */
+	lnet_event_kind_t ev_lnet;   /* LNet event type */
+	int	       ev_fired;  /* LNet event fired? 0 means still pending, see srpc_event_pending() */
+	int	       ev_status; /* LNet event status */
+	void	     *ev_data;   /* owning server/client RPC */
+} srpc_event_t;
+
+typedef struct {
+	int	      bk_len;  /* len of bulk data */
+	lnet_handle_md_t bk_mdh; /* MD handle; invalidated by srpc_init_client_rpc() */
+	int	      bk_sink; /* sink/source */
+	int	      bk_niov; /* # iov in bk_iovs */
+	lnet_kiov_t      bk_iovs[0]; /* bk_niov entries stored right behind the header */
+} srpc_bulk_t; /* bulk descriptor */
+
+/* message buffer descriptor */
+typedef struct srpc_buffer {
+	struct list_head	   buf_list; /* chain on srpc_service::*_msgq */
+	srpc_msg_t	   buf_msg;  /* message storage owned by this buffer */
+	lnet_handle_md_t     buf_mdh;  /* NOTE(review): presumably the MD the buffer is posted with - confirm */
+	lnet_nid_t	   buf_self; /* NOTE(review): presumably the local NID the message arrived on - confirm */
+	lnet_process_id_t    buf_peer; /* NOTE(review): presumably the sender of the message - confirm */
+} srpc_buffer_t;
+
+struct swi_workitem;
+typedef int (*swi_action_t) (struct swi_workitem *); /* handler run for a scheduled swi_workitem */
+
+typedef struct swi_workitem {
+	struct cfs_wi_sched	*swi_sched;	/* scheduler handed to cfs_wi_schedule()/cfs_wi_exit()/cfs_wi_deschedule() */
+	cfs_workitem_t       swi_workitem;	/* embedded libcfs workitem; swi_wi_action() maps it back with container_of() */
+	swi_action_t	 swi_action;	/* called by swi_wi_action() with this swi_workitem */
+	int		  swi_state;	/* one of SWI_STATE_* */
+} swi_workitem_t;
+
+/* server-side state of a RPC */
+typedef struct srpc_server_rpc {
+	/* chain on srpc_service::*_rpcq */
+	struct list_head		srpc_list;
+	struct srpc_service_cd *srpc_scd;	/* per-partition service data this RPC is attached to */
+	swi_workitem_t       srpc_wi;	/* workitem that drives this RPC */
+	srpc_event_t	 srpc_ev;      /* bulk/reply event */
+	lnet_nid_t	   srpc_self;
+	lnet_process_id_t    srpc_peer;
+	srpc_msg_t	   srpc_replymsg;	/* reply message storage */
+	lnet_handle_md_t     srpc_replymdh;	/* MD handle for the reply */
+	srpc_buffer_t       *srpc_reqstbuf;	/* buffer holding the request */
+	srpc_bulk_t	 *srpc_bulk;	/* NOTE(review): presumably NULL when the RPC moves no bulk data - confirm */
+
+	unsigned int	 srpc_aborted; /* being given up */
+	int		  srpc_status;
+	void	       (*srpc_done)(struct srpc_server_rpc *);	/* NOTE(review): presumably a completion callback - confirm where it is invoked */
+} srpc_server_rpc_t;
+
+/* client-side state of a RPC; set up by srpc_init_client_rpc() */
+typedef struct srpc_client_rpc {
+	struct list_head		crpc_list;	/* chain on user's lists */
+	spinlock_t		crpc_lock;	/* serialize */
+	int		  crpc_service;	/* SRPC_SERVICE_* id of the target service */
+	atomic_t	 crpc_refcount;	/* starts at 1; srpc_client_rpc_decref() destroys the RPC at 0 */
+	int		  crpc_timeout; /* # seconds to wait for reply */
+	stt_timer_t	  crpc_timer;
+	swi_workitem_t       crpc_wi;	/* workitem running srpc_send_rpc() */
+	lnet_process_id_t    crpc_dest;	/* peer the RPC is sent to */
+
+	void	       (*crpc_done)(struct srpc_client_rpc *);
+	void	       (*crpc_fini)(struct srpc_client_rpc *);	/* if NULL, srpc_destroy_client_rpc() frees the RPC itself */
+	int		  crpc_status;    /* completion status */
+	void		*crpc_priv;      /* caller data */
+
+	/* state flags */
+	unsigned int	 crpc_aborted:1; /* being given up */
+	unsigned int	 crpc_closed:1;  /* completed */
+
+	/* RPC events */
+	srpc_event_t	 crpc_bulkev;    /* bulk event */
+	srpc_event_t	 crpc_reqstev;   /* request event */
+	srpc_event_t	 crpc_replyev;   /* reply event */
+
+	/* bulk, request(reqst), and reply exchanged on wire */
+	srpc_msg_t	   crpc_reqstmsg;
+	srpc_msg_t	   crpc_replymsg;
+	lnet_handle_md_t     crpc_reqstmdh;
+	lnet_handle_md_t     crpc_replymdh;
+	srpc_bulk_t	  crpc_bulk;	/* last member: its bk_iovs[] entries lie past the end of this struct, see srpc_client_rpc_size() */
+} srpc_client_rpc_t;
+
+#define srpc_client_rpc_size(rpc)				       \
+offsetof(srpc_client_rpc_t, crpc_bulk.bk_iovs[(rpc)->crpc_bulk.bk_niov])	/* bytes occupied by rpc including its bk_niov trailing iov entries */
+
+#define srpc_client_rpc_addref(rpc)				     \
+do {								    \
+	CDEBUG(D_NET, "RPC[%p] -> %s (%d)++\n",			 \
+	       (rpc), libcfs_id2str((rpc)->crpc_dest),		  \
+	       atomic_read(&(rpc)->crpc_refcount));		 \
+	LASSERT(atomic_read(&(rpc)->crpc_refcount) > 0);	    \
+	atomic_inc(&(rpc)->crpc_refcount);			  \
+} while (0)	/* take one more reference; the count must already be positive. rpc is evaluated several times */
+
+#define srpc_client_rpc_decref(rpc)				     \
+do {								    \
+	CDEBUG(D_NET, "RPC[%p] -> %s (%d)--\n",			 \
+	       (rpc), libcfs_id2str((rpc)->crpc_dest),		  \
+	       atomic_read(&(rpc)->crpc_refcount));		 \
+	LASSERT(atomic_read(&(rpc)->crpc_refcount) > 0);	    \
+	if (atomic_dec_and_test(&(rpc)->crpc_refcount))	     \
+		srpc_destroy_client_rpc(rpc);			   \
+} while (0)	/* drop one reference; dropping the last one calls srpc_destroy_client_rpc(). rpc is evaluated several times */
+
+#define srpc_event_pending(rpc)   ((rpc)->crpc_bulkev.ev_fired == 0 ||  \
+				   (rpc)->crpc_reqstev.ev_fired == 0 || \
+				   (rpc)->crpc_replyev.ev_fired == 0)	/* true while any of the three client RPC events has not fired */
+
+/* CPU partition data of srpc service; reached from the service
+ * through srpc_service::sv_cpt_data */
+struct srpc_service_cd {
+	/** serialize */
+	spinlock_t		scd_lock;
+	/** backref to service */
+	struct srpc_service	*scd_svc;
+	/** event buffer */
+	srpc_event_t		scd_ev;
+	/** free RPC descriptors */
+	struct list_head		scd_rpc_free;
+	/** in-flight RPCs */
+	struct list_head		scd_rpc_active;
+	/** workitem for posting buffer */
+	swi_workitem_t		scd_buf_wi;
+	/** CPT id */
+	int			scd_cpt;
+	/** error code for scd_buf_wi */
+	int			scd_buf_err;
+	/** timestamp for scd_buf_err */
+	unsigned long	   scd_buf_err_stamp;
+	/** total # request buffers */
+	int			scd_buf_total;
+	/** # posted request buffers */
+	int			scd_buf_nposted;
+	/** in progress of buffer posting */
+	int			scd_buf_posting;
+	/** allocate more buffers if scd_buf_nposted < scd_buf_low */
+	int			scd_buf_low;
+	/** # buffers still to add (NOTE(review): presumably positive) or
+	 *  remove (presumably negative) - confirm sign convention */
+	int			scd_buf_adjust;
+	/** posted message buffers */
+	struct list_head		scd_buf_posted;
+	/** blocked for RPC descriptor */
+	struct list_head		scd_buf_blocked;
+};
+
+/* number of server workitems (mini-thread) for testing service:
+ * lower and upper bound */
+#define SFW_TEST_WI_MIN		256
+#define SFW_TEST_WI_MAX		2048
+/* extra buffers for tolerating buggy peers, or unbalanced number
+ * of peers between partitions */
+#define SFW_TEST_WI_EXTRA	64
+
+/* number of server workitems (mini-thread) for framework service:
+ * lower and upper bound */
+#define SFW_FRWK_WI_MIN		16
+#define SFW_FRWK_WI_MAX		256
+
+typedef struct srpc_service {
+	int			sv_id;		/* service id */
+	const char		*sv_name;	/* human readable name */
+	int			sv_wi_total;	/* total server workitems */
+	int			sv_shuttingdown;	/* non-zero once shutdown began; asserted by srpc_wait_service_shutdown() */
+	int			sv_ncpts;	/* NOTE(review): presumably # entries in sv_cpt_data - confirm */
+	/* percpt data for srpc_service */
+	struct srpc_service_cd	**sv_cpt_data;
+	/* Service callbacks:
+	 * - sv_handler: process incoming RPC request
+	 * - sv_bulk_ready: notify bulk data
+	 */
+	int	      (*sv_handler) (srpc_server_rpc_t *);
+	int	      (*sv_bulk_ready) (srpc_server_rpc_t *, int);
+} srpc_service_t;
+
+typedef struct {
+	struct list_head	sn_list;    /* chain on fw_zombie_sessions */
+	lst_sid_t	 sn_id;      /* unique identifier */
+	unsigned int      sn_timeout; /* # seconds' inactivity to expire */
+	int	       sn_timer_active; /* NOTE(review): presumably non-zero while sn_timer is queued - confirm */
+	unsigned int	  sn_features; /* NOTE(review): presumably the session features carried in srpc_msg::msg_ses_feats - confirm */
+	stt_timer_t       sn_timer;   /* NOTE(review): presumably the inactivity timer for sn_timeout - confirm */
+	struct list_head	sn_batches; /* list of batches */
+	char	      sn_name[LST_NAME_SIZE]; /* session name */
+	atomic_t      sn_refcount;
+	atomic_t      sn_brw_errors;  /* NOTE(review): presumably # failed bulk r/w tests - confirm */
+	atomic_t      sn_ping_errors; /* NOTE(review): presumably # failed ping tests - confirm */
+	cfs_time_t	sn_started; /* NOTE(review): presumably the session creation time - confirm */
+} sfw_session_t;
+
+#define sfw_sid_equal(sid0, sid1)     ((sid0).ses_nid == (sid1).ses_nid && \
+				       (sid0).ses_stamp == (sid1).ses_stamp)	/* two session ids match when both nid and stamp are equal */
+
+typedef struct {
+	struct list_head	bat_list;      /* chain on sn_batches */
+	lst_bid_t	 bat_id;	/* batch id */
+	int	       bat_error;     /* error code of batch */
+	sfw_session_t    *bat_session;   /* batch's session */
+	atomic_t      bat_nactive;   /* # of active tests */
+	struct list_head	bat_tests;     /* test instances */
+} sfw_batch_t; /* a batch of tests within a session */
+
+typedef struct {
+	int  (*tso_init)(struct sfw_test_instance *tsi); /* initialize test client */
+	void (*tso_fini)(struct sfw_test_instance *tsi); /* finalize test client */
+	int  (*tso_prep_rpc)(struct sfw_test_unit *tsu,
+			     lnet_process_id_t dest,
+			     srpc_client_rpc_t **rpc);   /* prepare a test rpc */
+	void (*tso_done_rpc)(struct sfw_test_unit *tsu,
+			     srpc_client_rpc_t *rpc);    /* a test rpc is done */
+} sfw_test_client_ops_t;
+
+typedef struct sfw_test_instance {
+	struct list_head	      tsi_list;	 /* chain on batch */
+	int		     tsi_service;      /* test type */
+	sfw_batch_t	    *tsi_batch;	/* batch */
+	sfw_test_client_ops_t  *tsi_ops;	  /* test client operations */
+
+	/* public parameter for all test units */
+	unsigned int		tsi_is_client:1;     /* is test client */
+	unsigned int		tsi_stoptsu_onerr:1; /* stop tsu on error */
+	int		     tsi_concur;	  /* concurrency */
+	int		     tsi_loop;	    /* loop count */
+
+	/* status of test instance */
+	spinlock_t		tsi_lock;	  /* serialize */
+	unsigned int		tsi_stopping:1;   /* test is stopping */
+	atomic_t	    tsi_nactive;      /* # of active test unit */
+	struct list_head	      tsi_units;	/* test units */
+	struct list_head	      tsi_free_rpcs;    /* free rpcs */
+	struct list_head	      tsi_active_rpcs;  /* active rpcs */
+
+	union {	/* test-specific parameters; NOTE(review): presumably selected by tsi_service - confirm */
+		test_ping_req_t		ping;	  /* ping parameter */
+		test_bulk_req_t		bulk_v0;  /* bulk parameter */
+		test_bulk_req_v1_t	bulk_v1;  /* bulk v1 parameter */
+	} tsi_u;
+} sfw_test_instance_t;
+
+/* XXX: trailing (PAGE_CACHE_SIZE % sizeof(lnet_process_id_packed_t))
+ * bytes at the end of pages are not used */
+#define SFW_MAX_CONCUR     LST_MAX_CONCUR
+#define SFW_ID_PER_PAGE    (PAGE_CACHE_SIZE / sizeof(lnet_process_id_packed_t))	/* whole packed ids that fit in one page */
+#define SFW_MAX_NDESTS     (LNET_MAX_IOV * SFW_ID_PER_PAGE)	/* ids that fit in LNET_MAX_IOV pages */
+#define sfw_id_pages(n)    (((n) + SFW_ID_PER_PAGE - 1) / SFW_ID_PER_PAGE)	/* pages needed for n ids, rounded up */
+
+typedef struct sfw_test_unit {
+	struct list_head	    tsu_list;	 /* chain on lst_test_instance (NOTE(review): presumably sfw_test_instance_t::tsi_units - confirm) */
+	lnet_process_id_t     tsu_dest;	 /* id of dest node */
+	int		   tsu_loop;	 /* loop count of the test */
+	sfw_test_instance_t  *tsu_instance;     /* pointer to test instance */
+	void		 *tsu_private;      /* private data */
+	swi_workitem_t	tsu_worker;       /* workitem of the test unit */
+} sfw_test_unit_t;
+
+typedef struct sfw_test_case {
+	struct list_head	      tsc_list;	 /* chain on fw_tests */
+	srpc_service_t	 *tsc_srv_service;  /* test service */
+	sfw_test_client_ops_t  *tsc_cli_ops;      /* ops of test client */
+} sfw_test_case_t; /* pairs the server-side service of a test with its client ops */
+
+srpc_client_rpc_t *
+sfw_create_rpc(lnet_process_id_t peer, int service,
+	       unsigned features, int nbulkiov, int bulklen,
+	       void (*done) (srpc_client_rpc_t *), void *priv);
+int sfw_create_test_rpc(sfw_test_unit_t *tsu,
+			lnet_process_id_t peer, unsigned features,
+			int nblk, int blklen, srpc_client_rpc_t **rpc);
+void sfw_abort_rpc(srpc_client_rpc_t *rpc);
+void sfw_post_rpc(srpc_client_rpc_t *rpc);
+void sfw_client_rpc_done(srpc_client_rpc_t *rpc);
+void sfw_unpack_message(srpc_msg_t *msg);
+void sfw_free_pages(srpc_server_rpc_t *rpc);
+void sfw_add_bulk_page(srpc_bulk_t *bk, struct page *pg, int i);
+int sfw_alloc_pages(srpc_server_rpc_t *rpc, int cpt, int npages, int len,
+		    int sink);
+int sfw_make_session (srpc_mksn_reqst_t *request, srpc_mksn_reply_t *reply);
+
+srpc_client_rpc_t *
+srpc_create_client_rpc(lnet_process_id_t peer, int service,
+		       int nbulkiov, int bulklen,
+		       void (*rpc_done)(srpc_client_rpc_t *),
+		       void (*rpc_fini)(srpc_client_rpc_t *), void *priv);
+void srpc_post_rpc(srpc_client_rpc_t *rpc);
+void srpc_abort_rpc(srpc_client_rpc_t *rpc, int why);
+void srpc_free_bulk(srpc_bulk_t *bk);
+srpc_bulk_t *srpc_alloc_bulk(int cpt, unsigned bulk_npg, unsigned bulk_len,
+			     int sink);
+int srpc_send_rpc(swi_workitem_t *wi);	/* workitem action installed by srpc_init_client_rpc() */
+int srpc_send_reply(srpc_server_rpc_t *rpc);
+int srpc_add_service(srpc_service_t *sv);
+int srpc_remove_service(srpc_service_t *sv);
+void srpc_shutdown_service(srpc_service_t *sv);
+void srpc_abort_service(srpc_service_t *sv);
+int srpc_finish_service(srpc_service_t *sv);	/* polled by srpc_wait_service_shutdown() until it returns non-zero */
+int srpc_service_add_buffers(srpc_service_t *sv, int nbuffer);
+void srpc_service_remove_buffers(srpc_service_t *sv, int nbuffer);
+void srpc_get_counters(srpc_counters_t *cnt);
+void srpc_set_counters(const srpc_counters_t *cnt);
+
+extern struct cfs_wi_sched *lst_sched_serial;
+extern struct cfs_wi_sched **lst_sched_test;	/* indexed by lnet_cpt_of_nid() in srpc_init_client_rpc() */
+
+/*
+ * Non-zero when \a svc is a framework service, i.e. its id is below
+ * SRPC_FRAMEWORK_SERVICE_MAX_ID.
+ */
+static inline int
+srpc_serv_is_framework(struct srpc_service *svc)
+{
+	int id = svc->sv_id;
+
+	return id < SRPC_FRAMEWORK_SERVICE_MAX_ID;
+}
+
+/*
+ * libcfs workitem callback: recover the swi_workitem_t that embeds \a wi
+ * and run its swi_action on it, returning whatever the action returns.
+ */
+static inline int
+swi_wi_action(cfs_workitem_t *wi)
+{
+	swi_workitem_t *owner;
+
+	owner = container_of(wi, swi_workitem_t, swi_workitem);
+	return (*owner->swi_action)(owner);
+}
+
+/*
+ * Initialise \a swi in state SWI_STATE_NEWBORN: remember the handler
+ * \a action and the scheduler \a sched, and set up the embedded libcfs
+ * workitem with \a data so that it dispatches through swi_wi_action().
+ */
+static inline void
+swi_init_workitem(swi_workitem_t *swi, void *data,
+		  swi_action_t action, struct cfs_wi_sched *sched)
+{
+	swi->swi_state  = SWI_STATE_NEWBORN;
+	swi->swi_action = action;
+	swi->swi_sched  = sched;
+
+	cfs_wi_init(&swi->swi_workitem, data, swi_wi_action);
+}
+
+/* Hand the embedded workitem of \a wi to the scheduler stored in it. */
+static inline void
+swi_schedule_workitem(swi_workitem_t *wi)
+{
+	struct cfs_wi_sched *sched = wi->swi_sched;
+
+	cfs_wi_schedule(sched, &wi->swi_workitem);
+}
+
+/* Call cfs_wi_exit() for the embedded workitem of \a swi on its scheduler. */
+static inline void
+swi_exit_workitem(swi_workitem_t *swi)
+{
+	struct cfs_wi_sched *sched = swi->swi_sched;
+
+	cfs_wi_exit(sched, &swi->swi_workitem);
+}
+
+/*
+ * Call cfs_wi_deschedule() for the embedded workitem of \a swi on its
+ * scheduler and pass its return value through.
+ */
+static inline int
+swi_deschedule_workitem(swi_workitem_t *swi)
+{
+	struct cfs_wi_sched *sched = swi->swi_sched;
+
+	return cfs_wi_deschedule(sched, &swi->swi_workitem);
+}
+
+
+int sfw_startup(void);
+int srpc_startup(void);
+void sfw_shutdown(void);
+void srpc_shutdown(void);
+
+/*
+ * Dispose of a client RPC whose reference count has reached zero and
+ * which has no event outstanding. If the RPC carries a crpc_fini
+ * callback, disposal is delegated to it; otherwise the descriptor,
+ * including its trailing bulk iov entries, is freed here.
+ */
+static inline void
+srpc_destroy_client_rpc (srpc_client_rpc_t *rpc)
+{
+	LASSERT (rpc != NULL);
+	LASSERT (!srpc_event_pending(rpc));
+	LASSERT (atomic_read(&rpc->crpc_refcount) == 0);
+
+	if (rpc->crpc_fini != NULL) {
+		(*rpc->crpc_fini) (rpc);
+		return;
+	}
+
+	LIBCFS_FREE(rpc, srpc_client_rpc_size(rpc));
+}
+
+/*
+ * Prepare a caller-provided client RPC for \a service addressed to \a peer.
+ *
+ * The descriptor, together with its \a nbulkiov trailing bulk iov
+ * entries, is zeroed first. The RPC then gets one reference (the
+ * caller's), a workitem that runs srpc_send_rpc() on the test scheduler
+ * selected from the peer's NID, invalidated MD handles, and a request
+ * header carrying magic, version and request type. All three events are
+ * marked as fired, because none is expected before anything is posted.
+ */
+static inline void
+srpc_init_client_rpc (srpc_client_rpc_t *rpc, lnet_process_id_t peer,
+		      int service, int nbulkiov, int bulklen,
+		      void (*rpc_done)(srpc_client_rpc_t *),
+		      void (*rpc_fini)(srpc_client_rpc_t *), void *priv)
+{
+	srpc_msg_t  *reqst = &rpc->crpc_reqstmsg;
+	srpc_bulk_t *bulk  = &rpc->crpc_bulk;
+
+	LASSERT (nbulkiov <= LNET_MAX_IOV);
+
+	/* clear the fixed part plus the nbulkiov iov entries behind it */
+	memset(rpc, 0, offsetof(srpc_client_rpc_t,
+				crpc_bulk.bk_iovs[nbulkiov]));
+
+	INIT_LIST_HEAD(&rpc->crpc_list);
+	swi_init_workitem(&rpc->crpc_wi, rpc, srpc_send_rpc,
+			  lst_sched_test[lnet_cpt_of_nid(peer.nid)]);
+	spin_lock_init(&rpc->crpc_lock);
+	atomic_set(&rpc->crpc_refcount, 1); /* the caller's reference */
+
+	rpc->crpc_service = service;
+	rpc->crpc_dest    = peer;
+	rpc->crpc_done    = rpc_done;
+	rpc->crpc_fini    = rpc_fini;
+	rpc->crpc_priv    = priv;
+
+	bulk->bk_niov = nbulkiov;
+	bulk->bk_len  = bulklen;
+
+	LNetInvalidateHandle(&rpc->crpc_reqstmdh);
+	LNetInvalidateHandle(&rpc->crpc_replymdh);
+	LNetInvalidateHandle(&bulk->bk_mdh);
+
+	/* nothing has been posted, so no event is outstanding */
+	rpc->crpc_replyev.ev_fired = 1;
+	rpc->crpc_reqstev.ev_fired = 1;
+	rpc->crpc_bulkev.ev_fired  = 1;
+
+	reqst->msg_magic   = SRPC_MSG_MAGIC;
+	reqst->msg_version = SRPC_MSG_VERSION;
+	reqst->msg_type    = srpc_service2request(service);
+}
+
+/*
+ * Name of a workitem state: returns the spelling of the matching
+ * SWI_STATE_* constant. A value that is none of them is reported
+ * through LBUG().
+ */
+static inline const char *
+swi_state2str (int state)
+{
+#define STATE2STR(x) case x: return #x
+	switch(state) {
+		STATE2STR(SWI_STATE_NEWBORN);
+		STATE2STR(SWI_STATE_REPLY_SUBMITTED);
+		STATE2STR(SWI_STATE_REPLY_SENT);
+		STATE2STR(SWI_STATE_REQUEST_SUBMITTED);
+		STATE2STR(SWI_STATE_REQUEST_SENT);
+		STATE2STR(SWI_STATE_REPLY_RECEIVED);
+		STATE2STR(SWI_STATE_BULK_STARTED);
+		STATE2STR(SWI_STATE_DONE);
+		default:
+			break;
+	}
+#undef STATE2STR
+
+	/* not a known state */
+	LBUG();
+	/* only reached if LBUG() were to return */
+	return "SWI_STATE_NEWBORN";
+}
+
+#define UNUSED(x)       ( (void)(x) )	/* evaluate x and discard the result */
+
+
+#define selftest_wait_events()	cfs_pause(cfs_time_seconds(1) / 10)	/* pause for a tenth of cfs_time_seconds(1) */
+
+
+#define lst_wait_until(cond, lock, fmt, ...)				\
+do {									\
+	int __I = 2;							\
+	while (!(cond)) {						\
+		CDEBUG(IS_PO2(++__I) ? D_WARNING : D_NET,		\
+		       fmt, ## __VA_ARGS__);				\
+		spin_unlock(&(lock));					\
+									\
+		selftest_wait_events();					\
+									\
+		spin_lock(&(lock));					\
+	}								\
+} while (0)	/* poll cond with lock held: lock is dropped only around each pause and is held again on exit; the message is a warning when the pass counter is a power of two */
+
+/*
+ * Block until srpc_finish_service() reports that \a sv has finished
+ * shutting down, pausing between polls. The service must already be
+ * marked as shutting down. The progress message is logged as a warning
+ * whenever the poll counter is a power of two, at D_NET otherwise.
+ */
+static inline void
+srpc_wait_service_shutdown(srpc_service_t *sv)
+{
+	int i;
+
+	LASSERT(sv->sv_shuttingdown);
+
+	/* the counter is 3 on the first pass */
+	for (i = 3; srpc_finish_service(sv) == 0; i++) {
+		CDEBUG (((i & -i) == i) ? D_WARNING : D_NET,
+			"Waiting for %s service to shutdown...\n",
+			sv->sv_name);
+		selftest_wait_events();
+	}
+}
+
+#endif /* __SELFTEST_SELFTEST_H__ */

diff --git a/drivers/staging/lustre/lnet/selftest/timer.c b/drivers/staging/lustre/lnet/selftest/timer.c
new file mode 100644
index 0000000..2c07855
--- /dev/null
+++ b/drivers/staging/lustre/lnet/selftest/timer.c

@@ -0,0 +1,253 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/timer.c
+ *
+ * Author: Isaac Huang <isaac@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include "selftest.h"
+
+
+/*
+ * Timers are implemented as a sorted queue of expiry times. The queue
+ * is slotted, with each slot holding timers which expire in a
+ * 2**STTIMER_MINPOLL (8) second period. The timers in each slot are
+ * sorted by increasing expiry time. The number of slots is 2**7 (128),
+ * to cover a time period of 1024 seconds into the future before wrapping.
+ */
+#define STTIMER_MINPOLL	3   /* log2 min poll interval (8 s) */
+#define STTIMER_SLOTTIME       (1 << STTIMER_MINPOLL)	/* seconds covered by one slot */
+#define STTIMER_SLOTTIMEMASK   (~(STTIMER_SLOTTIME - 1))	/* rounds a time down to the start of its slot */
+#define STTIMER_NSLOTS	       (1 << 7)	/* # slots in stt_data.stt_hash */
+#define STTIMER_SLOT(t)	       (&stt_data.stt_hash[(((t) >> STTIMER_MINPOLL) & \
+						    (STTIMER_NSLOTS - 1))])	/* list head of the slot that time t falls into */
+
+struct st_timer_data {
+	spinlock_t	 stt_lock;	/* taken around changes to stt_hash and stt_nthreads */
+	/* start time of the slot processed previously */
+	cfs_time_t       stt_prev_slot;
+	struct list_head       stt_hash[STTIMER_NSLOTS];	/* timer slots, see STTIMER_SLOT() */
+	int	      stt_shuttingdown;	/* set by stt_shutdown(); stops stt_timer_main() */
+	wait_queue_head_t      stt_waitq;	/* the timer thread sleeps here between polls */
+	int	      stt_nthreads;	/* # timer threads accounted as running */
+} stt_data;
+
+/*
+ * Queue \a timer in the slot selected by its expiry time.
+ *
+ * The timer must have a callback, must not be queued already and must
+ * expire in the future. Each slot is kept sorted by increasing expiry
+ * time: the slot is scanned from its tail and the timer is linked in
+ * right after the last entry that does not expire later than it.
+ */
+void
+stt_add_timer(stt_timer_t *timer)
+{
+	struct list_head *slot;
+	struct list_head *cur;
+
+	spin_lock(&stt_data.stt_lock);
+
+	LASSERT (stt_data.stt_nthreads > 0);
+	LASSERT (!stt_data.stt_shuttingdown);
+	LASSERT (timer->stt_func != NULL);
+	LASSERT (list_empty(&timer->stt_list));
+	LASSERT (cfs_time_after(timer->stt_expires, cfs_time_current_sec()));
+
+	slot = STTIMER_SLOT(timer->stt_expires);
+
+	/* walk backwards until an entry that expires no later is found */
+	for (cur = slot->prev; cur != slot; cur = cur->prev) {
+		stt_timer_t *queued = list_entry(cur, stt_timer_t, stt_list);
+
+		if (cfs_time_aftereq(timer->stt_expires, queued->stt_expires))
+			break;
+	}
+	list_add(&timer->stt_list, cur);
+
+	spin_unlock(&stt_data.stt_lock);
+}
+
+/*
+ * Remove \a timer from its slot if it is still queued.
+ *
+ * Returns 1 when a pending timer was deactivated and 0 when the timer
+ * was not queued (the same convention as del_timer()).
+ *
+ * CAVEAT EMPTOR:
+ * A return of 0 does not mean the callback is finished: timer->stt_func
+ * may be running on another CPU at that very moment.
+ */
+int
+stt_del_timer (stt_timer_t *timer)
+{
+	int was_pending;
+
+	spin_lock(&stt_data.stt_lock);
+
+	LASSERT (stt_data.stt_nthreads > 0);
+	LASSERT (!stt_data.stt_shuttingdown);
+
+	was_pending = !list_empty(&timer->stt_list);
+	if (was_pending)
+		list_del_init(&timer->stt_list);
+
+	spin_unlock(&stt_data.stt_lock);
+	return was_pending;
+}
+
+/*
+ * Fire the timers at the head of \a slot whose expiry time is not after
+ * \a now, stopping at the first one that expires later.
+ *
+ * Called with stt_data.stt_lock held. The lock is released around each
+ * callback and taken again afterwards, so it is held on return.
+ * Returns the number of callbacks that were run.
+ */
+int
+stt_expire_list (struct list_head *slot, cfs_time_t now)
+{
+	int nfired;
+
+	for (nfired = 0; !list_empty(slot); nfired++) {
+		stt_timer_t *head = list_entry(slot->next, stt_timer_t,
+					       stt_list);
+
+		if (cfs_time_after(head->stt_expires, now))
+			break;
+
+		list_del_init(&head->stt_list);
+
+		/* the callback runs without the lock */
+		spin_unlock(&stt_data.stt_lock);
+		(*head->stt_func) (head->stt_data);
+		spin_lock(&stt_data.stt_lock);
+	}
+
+	return nfired;
+}
+
+/*
+ * Expire timers in every slot from the one containing the current time
+ * back to the slot starting at \a *last, then store the start of the
+ * current slot in \a *last. Returns the number of timers fired.
+ */
+int
+stt_check_timers (cfs_time_t *last)
+{
+	cfs_time_t now = cfs_time_current_sec();
+	cfs_time_t slot_start;
+	int	   nfired = 0;
+
+	spin_lock(&stt_data.stt_lock);
+
+	/* newest slot first, stepping back one slot at a time */
+	for (slot_start = now & STTIMER_SLOTTIMEMASK;
+	     cfs_time_aftereq(slot_start, *last);
+	     slot_start = cfs_time_sub(slot_start, STTIMER_SLOTTIME))
+		nfired += stt_expire_list(STTIMER_SLOT(slot_start), now);
+
+	*last = now & STTIMER_SLOTTIMEMASK;
+	spin_unlock(&stt_data.stt_lock);
+	return nfired;
+}
+
+
+/*
+ * Body of the timer thread: until shutdown is flagged, expire due timers
+ * and then sleep on stt_waitq for at most one slot period. On exit the
+ * thread removes itself from stt_nthreads. Always returns 0.
+ */
+int
+stt_timer_main (void *arg)
+{
+	int rc = 0;
+	UNUSED(arg);
+
+	SET_BUT_UNUSED(rc);
+
+	cfs_block_allsigs();
+
+	for (;;) {
+		if (stt_data.stt_shuttingdown)
+			break;
+
+		stt_check_timers(&stt_data.stt_prev_slot);
+
+		/* woken early only when shutdown is flagged */
+		rc = wait_event_timeout(stt_data.stt_waitq,
+					stt_data.stt_shuttingdown,
+					cfs_time_seconds(STTIMER_SLOTTIME));
+	}
+
+	spin_lock(&stt_data.stt_lock);
+	stt_data.stt_nthreads--;
+	spin_unlock(&stt_data.stt_lock);
+	return 0;
+}
+
+/*
+ * Spawn the "st_timer" kernel thread running stt_timer_main() and count
+ * it in stt_nthreads. Returns 0 on success or the error encoded in the
+ * pointer returned by kthread_run().
+ */
+int
+stt_start_timer_thread (void)
+{
+	task_t *thread;
+	int	rc = 0;
+
+	LASSERT(!stt_data.stt_shuttingdown);
+
+	thread = kthread_run(stt_timer_main, NULL, "st_timer");
+	if (!IS_ERR(thread)) {
+		spin_lock(&stt_data.stt_lock);
+		stt_data.stt_nthreads++;
+		spin_unlock(&stt_data.stt_lock);
+	} else {
+		rc = PTR_ERR(thread);
+	}
+
+	return rc;
+}
+
+
+/*
+ * Initialise the timer module: reset the shutdown flag, set the
+ * previously processed slot to the current one, initialise the lock,
+ * every slot list and the wait queue, then start the timer thread.
+ * Returns 0 on success or the error from stt_start_timer_thread().
+ */
+int
+stt_startup (void)
+{
+	struct list_head *slot = stt_data.stt_hash;
+	struct list_head *end  = stt_data.stt_hash + STTIMER_NSLOTS;
+	int		  rc;
+
+	stt_data.stt_shuttingdown = 0;
+	stt_data.stt_prev_slot = cfs_time_current_sec() & STTIMER_SLOTTIMEMASK;
+
+	spin_lock_init(&stt_data.stt_lock);
+	while (slot != end)
+		INIT_LIST_HEAD(slot++);
+
+	stt_data.stt_nthreads = 0;
+	init_waitqueue_head(&stt_data.stt_waitq);
+
+	rc = stt_start_timer_thread();
+	if (rc == 0)
+		return 0;
+
+	CERROR ("Can't spawn timer thread: %d\n", rc);
+	return rc;
+}
+
+/*
+ * Stop the timer module. No timer may be queued any more. The shutdown
+ * flag is raised, sleepers on stt_waitq are woken, and the caller then
+ * waits until stt_nthreads has dropped to zero.
+ */
+void
+stt_shutdown (void)
+{
+	int i = 0;
+
+	spin_lock(&stt_data.stt_lock);
+
+	/* every slot has to be empty by now */
+	while (i < STTIMER_NSLOTS) {
+		LASSERT (list_empty(&stt_data.stt_hash[i]));
+		i++;
+	}
+
+	stt_data.stt_shuttingdown = 1;
+	wake_up(&stt_data.stt_waitq);
+
+	/* drops and re-takes stt_lock around each pause */
+	lst_wait_until(stt_data.stt_nthreads == 0, stt_data.stt_lock,
+		       "waiting for %d threads to terminate\n",
+		       stt_data.stt_nthreads);
+
+	spin_unlock(&stt_data.stt_lock);
+}

diff --git a/drivers/staging/lustre/lnet/selftest/timer.h b/drivers/staging/lustre/lnet/selftest/timer.h
new file mode 100644
index 0000000..56dbfe5
--- /dev/null
+++ b/drivers/staging/lustre/lnet/selftest/timer.h

@@ -0,0 +1,53 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/timer.h
+ *
+ * Author: Isaac Huang <isaac@clusterfs.com>
+ */
+#ifndef __SELFTEST_TIMER_H__
+#define __SELFTEST_TIMER_H__
+
+typedef struct {
+	struct list_head	stt_list;	/* link in a timer slot; empty while the timer is not queued */
+	cfs_time_t	stt_expires;	/* expiry time, compared against cfs_time_current_sec() */
+	void	    (*stt_func) (void *);	/* callback run when the timer expires */
+	void	     *stt_data;	/* argument passed to stt_func */
+} stt_timer_t;
+
+void stt_add_timer (stt_timer_t *timer);	/* queue a timer that is not queued yet and expires in the future */
+int stt_del_timer (stt_timer_t *timer);	/* returns 1 if a pending timer was removed, 0 if it was not queued */
+int stt_startup (void);	/* returns 0, or the error from starting the timer thread */
+void stt_shutdown (void);	/* waits until the timer thread count reaches zero */
+
+#endif /* __SELFTEST_TIMER_H__ */

diff --git a/drivers/staging/lustre/lustre/Kconfig b/drivers/staging/lustre/lustre/Kconfig
new file mode 100644
index 0000000..e0eb830
--- /dev/null
+++ b/drivers/staging/lustre/lustre/Kconfig

@@ -0,0 +1,51 @@
+config LUSTRE_FS
+	tristate "Lustre file system client support"
+	depends on STAGING && INET && BROKEN
+	select LNET
+	select CRYPTO
+	select CRYPTO_CRC32
+	select CRYPTO_CRC32_PCLMUL if X86
+	select CRYPTO_CRC32C
+	select CRYPTO_MD5
+	select CRYPTO_SHA1
+	select CRYPTO_SHA256
+	select CRYPTO_SHA512
+	help
+	  This option enables Lustre file system client support. Choose Y
+	  here if you want to access a Lustre file system cluster. To compile
+	  this file system support as a module, choose M here: the module will
+	  be called lustre.
+
+	  To mount Lustre file systems, you also need to install the user space
+	  mount.lustre and other user space commands which can be found in the
+	  lustre-client package, available from
+	  http://downloads.whamcloud.com/public/lustre/
+
+	  Lustre file system is the most popular cluster file system in high
+	  performance computing. Source code of both kernel space and user space
+	  Lustre components can also be found at
+	  http://git.whamcloud.com/?p=fs/lustre-release.git;a=summary
+
+	  If unsure, say N.
+
+	  See also http://wiki.lustre.org/
+
+config LUSTRE_OBD_MAX_IOCTL_BUFFER
+	int "Lustre obd max ioctl buffer bytes (default 8KB)"
+	depends on LUSTRE_FS
+	default 8192
+	help
+	  This option defines the maximum size of buffer in bytes that user space
+	  applications can pass to Lustre kernel module through ioctl interface.
+
+	  If unsure, use default.
+
+# Off by default. A bool default must be spelled y or n: any other word is
+# parsed as a reference to a config symbol.
+config LUSTRE_DEBUG_EXPENSIVE_CHECK
+	bool "Enable Lustre DEBUG checks"
+	depends on LUSTRE_FS
+	default n
+	help
+	  This option is mainly for debug purpose. It enables Lustre code to do
+	  expensive checks that may have a performance impact.
+
+	  Use with caution. If unsure, say N.

diff --git a/drivers/staging/lustre/lustre/Makefile b/drivers/staging/lustre/lustre/Makefile
new file mode 100644
index 0000000..3fb94fc
--- /dev/null
+++ b/drivers/staging/lustre/lustre/Makefile

@@ -0,0 +1,2 @@
+obj-$(CONFIG_LUSTRE_FS) := fid/ lvfs/ obdclass/ ptlrpc/ obdecho/ mgc/ lov/ \
+			   osc/ mdc/ lmv/ llite/ fld/ libcfs/

diff --git a/drivers/staging/lustre/lustre/fid/Makefile b/drivers/staging/lustre/lustre/fid/Makefile
new file mode 100644
index 0000000..b8d6d21
--- /dev/null
+++ b/drivers/staging/lustre/lustre/fid/Makefile

@@ -0,0 +1,5 @@
+obj-$(CONFIG_LUSTRE_FS) += fid.o
+fid-y := fid_handler.o fid_store.o fid_request.o lproc_fid.o fid_lib.o
+
+
+ccflags-y := -I$(src)/../include

diff --git a/drivers/staging/lustre/lustre/fid/fid_handler.c b/drivers/staging/lustre/lustre/fid/fid_handler.c
new file mode 100644
index 0000000..bbbb3cf
--- /dev/null
+++ b/drivers/staging/lustre/lustre/fid/fid_handler.c

@@ -0,0 +1,661 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/fid/fid_handler.c
+ *
+ * Lustre Sequence Manager
+ *
+ * Author: Yury Umanets <umka@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_FID
+
+# include <linux/libcfs/libcfs.h>
+# include <linux/module.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <dt_object.h>
+#include <md_object.h>
+#include <obd_support.h>
+#include <lustre_req_layout.h>
+#include <lustre_fid.h>
+#include "fid_internal.h"
+
+/*
+ * Allocate and initialize the client-side sequence manager of \a obd.
+ *
+ * Allocates obd->u.cli.cl_seq, builds a temporary "cli-<obd_name>" prefix
+ * and hands both, together with \a exp and \a type, to seq_client_init().
+ *
+ * \retval 0 on success
+ * \retval -ENOMEM if either allocation fails
+ * \retval the non-zero value returned by seq_client_init() otherwise
+ *
+ * On every failure path cl_seq is NULL when the function returns.
+ */
+int client_fid_init(struct obd_device *obd,
+		    struct obd_export *exp, enum lu_cli_type type)
+{
+	struct client_obd *cli = &obd->u.cli;
+	char *prefix;
+	int rc;
+	ENTRY;
+
+	OBD_ALLOC_PTR(cli->cl_seq);
+	if (cli->cl_seq == NULL)
+		RETURN(-ENOMEM);
+
+	/* 4 bytes for "cli-" plus one more, presumably for the NUL. */
+	OBD_ALLOC(prefix, MAX_OBD_NAME + 5);
+	if (prefix == NULL)
+		GOTO(out_free_seq, rc = -ENOMEM);
+
+	snprintf(prefix, MAX_OBD_NAME + 5, "cli-%s", obd->obd_name);
+
+	/* Init client side sequence-manager */
+	rc = seq_client_init(cli->cl_seq, exp, type, prefix, NULL);
+	/* The prefix buffer is released whether or not the init succeeded. */
+	OBD_FREE(prefix, MAX_OBD_NAME + 5);
+	if (rc)
+		GOTO(out_free_seq, rc);
+
+	RETURN(rc);
+out_free_seq:
+	/* Undo the cl_seq allocation so no stale pointer is left behind. */
+	OBD_FREE_PTR(cli->cl_seq);
+	cli->cl_seq = NULL;
+	return rc;
+}
+EXPORT_SYMBOL(client_fid_init);
+
+/*
+ * Tear down the client sequence manager hanging off \a obd.
+ *
+ * Finalizes and frees obd->u.cli.cl_seq and resets the pointer; does
+ * nothing when cl_seq is already NULL. Always returns 0.
+ */
+int client_fid_fini(struct obd_device *obd)
+{
+	struct client_obd *client = &obd->u.cli;
+	ENTRY;
+
+	/* Nothing was set up (or it was already torn down). */
+	if (client->cl_seq == NULL)
+		RETURN(0);
+
+	seq_client_fini(client->cl_seq);
+	OBD_FREE_PTR(client->cl_seq);
+	client->cl_seq = NULL;
+
+	RETURN(0);
+}
+EXPORT_SYMBOL(client_fid_fini);
+
+static void seq_server_proc_fini(struct lu_server_seq *seq);
+
+/*
+ * Assigns client to sequence controller node, or detaches it.
+ *
+ * \param seq  sequence server whose controller client is changed
+ * \param cli  client to attach; NULL detaches the current client
+ * \param env  not used by this function
+ *
+ * \retval 0        client attached or detached
+ * \retval -EEXIST  a client is already attached and \a cli is not NULL
+ */
+int seq_server_set_cli(struct lu_server_seq *seq,
+		       struct lu_client_seq *cli,
+		       const struct lu_env *env)
+{
+	int rc = 0;
+	ENTRY;
+
+	/*
+	 * Ask client for new range, assign that range to ->seq_space and write
+	 * seq state to backing store should be atomic.
+	 */
+	mutex_lock(&seq->lss_mutex);
+
+	if (cli == NULL) {
+		/*
+		 * Detach request: cli is NULL on this path, so it must not
+		 * be dereferenced for the debug message.
+		 */
+		CDEBUG(D_INFO, "%s: Detached sequence client\n",
+		       seq->lss_name);
+		seq->lss_cli = NULL;
+		GOTO(out_up, rc = 0);
+	}
+
+	if (seq->lss_cli != NULL) {
+		CDEBUG(D_HA, "%s: Sequence controller is already "
+		       "assigned\n", seq->lss_name);
+		GOTO(out_up, rc = -EEXIST);
+	}
+
+	CDEBUG(D_INFO, "%s: Attached sequence controller %s\n",
+	       seq->lss_name, cli->lcs_name);
+
+	seq->lss_cli = cli;
+	/* Tag the client's space with this site's node id. */
+	cli->lcs_space.lsr_index = seq->lss_site->ss_node_id;
+	EXIT;
+out_up:
+	mutex_unlock(&seq->lss_mutex);
+	return rc;
+}
+EXPORT_SYMBOL(seq_server_set_cli);
+/*
+ * Allocate up to \a width units of sequence from range \a from into \a to.
+ *
+ * The amount handed out is capped by the space left in \a from. \a to
+ * receives [old from->lsr_start, old from->lsr_start + amount) and the
+ * start of \a from is advanced past it.
+ */
+static inline void range_alloc(struct lu_seq_range *to,
+			       struct lu_seq_range *from,
+			       __u64 width)
+{
+	__u64 granted = min(range_space(from), width);
+	__u64 first = from->lsr_start;
+
+	to->lsr_start = first;
+	to->lsr_end = first + granted;
+	from->lsr_start = first + granted;
+}
+
+/**
+ * On the controller node, allocate a new super sequence for a regular
+ * sequence server. As the super sequence controller, this node is supposed
+ * to maintain fld and update the index.
+ * \a out range always has the correct mds node number of the requester.
+ *
+ * \retval -ENOSPC  lss_space is exhausted
+ * \retval otherwise the result of seq_store_update()
+ */
+static int __seq_server_alloc_super(struct lu_server_seq *seq,
+				    struct lu_seq_range *out,
+				    const struct lu_env *env)
+{
+	struct lu_seq_range *avail = &seq->lss_space;
+	int rc;
+	ENTRY;
+
+	LASSERT(range_is_sane(avail));
+
+	if (range_is_exhausted(avail)) {
+		CERROR("%s: Sequences space is exhausted\n",
+		       seq->lss_name);
+		RETURN(-ENOSPC);
+	}
+
+	/* Hand out up to lss_width sequences, then update the store. */
+	range_alloc(out, avail, seq->lss_width);
+
+	rc = seq_store_update(env, seq, out, 1 /* sync */);
+
+	LCONSOLE_INFO("%s: super-sequence allocation rc = %d " DRANGE"\n",
+		      seq->lss_name, rc, PRANGE(out));
+
+	RETURN(rc);
+}
+
+/*
+ * Locked wrapper around __seq_server_alloc_super(): takes lss_mutex for
+ * the duration of the allocation and returns the helper's result.
+ */
+int seq_server_alloc_super(struct lu_server_seq *seq,
+			   struct lu_seq_range *out,
+			   const struct lu_env *env)
+{
+	int rc;
+	ENTRY;
+
+	mutex_lock(&seq->lss_mutex);
+	rc = __seq_server_alloc_super(seq, out, env);
+	mutex_unlock(&seq->lss_mutex);
+
+	RETURN(rc);
+}
+
+/*
+ * Carve the initial low-water and high-water sets, each at most
+ * lss_set_width wide, out of lss_space, then update the sequence store
+ * (sync argument 1). Returns the result of seq_store_update().
+ */
+static int __seq_set_init(const struct lu_env *env,
+			    struct lu_server_seq *seq)
+{
+	struct lu_seq_range *avail = &seq->lss_space;
+
+	range_alloc(&seq->lss_lowater_set, avail, seq->lss_set_width);
+	range_alloc(&seq->lss_hiwater_set, avail, seq->lss_set_width);
+
+	return seq_store_update(env, seq, NULL, 1);
+}
+
+/*
+ * This function implements the new seq allocation algorithm using async
+ * updates to the seq file on disk. Ref bug 18857 for details.
+ * The following variables keep track of this process:
+ *
+ * lss_space; - available lss_space
+ * lss_lowater_set; - lu_seq_range for all seqs before barrier, i.e. safe to use
+ * lss_hiwater_set; - lu_seq_range after barrier, i.e. allocated but may be
+ *		    not yet committed
+ *
+ * When lss_lowater_set reaches the end it is replaced with the hiwater one
+ * and a write operation is initiated to allocate a new hiwater range.
+ * If the last seq write operation is still not committed, the current
+ * operation is flagged as a sync write op.
+ *
+ * Returns 0 when no store update was needed, otherwise the result of
+ * seq_store_update(). \a out is filled only when the result is 0.
+ */
+static int range_alloc_set(const struct lu_env *env,
+			    struct lu_seq_range *out,
+			    struct lu_server_seq *seq)
+{
+	struct lu_seq_range *space = &seq->lss_space;
+	struct lu_seq_range *loset = &seq->lss_lowater_set;
+	struct lu_seq_range *hiset = &seq->lss_hiwater_set;
+	int rc = 0;
+
+	/*
+	 * First use: set up both sets.
+	 * NOTE(review): the return value of __seq_set_init() is ignored.
+	 */
+	if (range_is_zero(loset))
+		__seq_set_init(env, seq);
+
+	if (OBD_FAIL_CHECK(OBD_FAIL_SEQ_ALLOC)) /* exhaust set */
+		loset->lsr_start = loset->lsr_end;
+
+	if (range_is_exhausted(loset)) {
+		/* reached high water mark. */
+		struct lu_device *dev = seq->lss_site->ss_lu->ls_top_dev;
+		int obd_num_clients = dev->ld_obd->obd_num_exports;
+		__u64 set_sz;
+
+		/* calculate new seq width based on number of clients */
+		set_sz = max(seq->lss_set_width,
+			     obd_num_clients * seq->lss_width);
+		/* never ask for more than what is left in lss_space */
+		set_sz = min(range_space(space), set_sz);
+
+		/* Switch to hiwater range now */
+		*loset = *hiset;
+		/* allocate new hiwater range */
+		range_alloc(hiset, space, set_sz);
+
+		/* update ondisk seq with new *space */
+		rc = seq_store_update(env, seq, NULL, seq->lss_need_sync);
+	}
+
+	LASSERTF(!range_is_exhausted(loset) || range_is_sane(loset),
+		 DRANGE"\n", PRANGE(loset));
+
+	/* Only hand out a range when the store update (if any) succeeded. */
+	if (rc == 0)
+		range_alloc(out, loset, seq->lss_width);
+
+	/* NOTE(review): RETURN() is used here without a matching ENTRY. */
+	RETURN(rc);
+}
+
+/*
+ * Allocate a meta-sequence range for \a out from the server's space.
+ *
+ * When lss_space is exhausted, a new super-sequence is first requested
+ * through the attached controller client and copied into lss_space.
+ *
+ * \retval -ENODEV  space is exhausted and no controller client is attached
+ * \retval otherwise the error from seq_client_alloc_super() or the result
+ *	   of range_alloc_set()
+ */
+static int __seq_server_alloc_meta(struct lu_server_seq *seq,
+				   struct lu_seq_range *out,
+				   const struct lu_env *env)
+{
+	struct lu_seq_range *avail = &seq->lss_space;
+	int rc;
+
+	ENTRY;
+
+	LASSERT(range_is_sane(avail));
+
+	/* Check if available space ends and allocate new super seq */
+	if (range_is_exhausted(avail)) {
+		if (seq->lss_cli == NULL) {
+			CERROR("%s: No sequence controller is attached.\n",
+			       seq->lss_name);
+			RETURN(-ENODEV);
+		}
+
+		rc = seq_client_alloc_super(seq->lss_cli, env);
+		if (rc) {
+			CERROR("%s: Can't allocate super-sequence, rc %d\n",
+			       seq->lss_name, rc);
+			RETURN(rc);
+		}
+
+		/* Saving new range to allocation space. */
+		*avail = seq->lss_cli->lcs_space;
+		LASSERT(range_is_sane(avail));
+	}
+
+	rc = range_alloc_set(env, out, seq);
+	if (rc) {
+		CERROR("%s: Allocated meta-sequence failed: rc = %d\n",
+			seq->lss_name, rc);
+		RETURN(rc);
+	}
+
+	CDEBUG(D_INFO, "%s: Allocated meta-sequence " DRANGE"\n",
+		seq->lss_name, PRANGE(out));
+
+	RETURN(rc);
+}
+
+/*
+ * Locked wrapper around __seq_server_alloc_meta(): takes lss_mutex for
+ * the duration of the allocation and returns the helper's result.
+ */
+int seq_server_alloc_meta(struct lu_server_seq *seq,
+			  struct lu_seq_range *out,
+			  const struct lu_env *env)
+{
+	int rc;
+	ENTRY;
+
+	mutex_lock(&seq->lss_mutex);
+	rc = __seq_server_alloc_meta(seq, out, env);
+	mutex_unlock(&seq->lss_mutex);
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(seq_server_alloc_meta);
+
+/*
+ * Dispatch a sequence request by opcode.
+ *
+ * SEQ_ALLOC_META is served by the site's server sequence manager and
+ * SEQ_ALLOC_SUPER by its control sequence manager.
+ *
+ * \retval -EINVAL  unknown \a opc, or the required manager is not set up
+ * \retval otherwise the result of the selected allocation function
+ */
+static int seq_server_handle(struct lu_site *site,
+			     const struct lu_env *env,
+			     __u32 opc, struct lu_seq_range *out)
+{
+	struct seq_server_site *ss;
+	int rc;
+	ENTRY;
+
+	ss = lu_site2seq(site);
+
+	if (opc == SEQ_ALLOC_META) {
+		if (ss->ss_server_seq == NULL) {
+			CERROR("Sequence server is not "
+			       "initialized\n");
+			RETURN(-EINVAL);
+		}
+		rc = seq_server_alloc_meta(ss->ss_server_seq, out, env);
+	} else if (opc == SEQ_ALLOC_SUPER) {
+		if (ss->ss_control_seq == NULL) {
+			CERROR("Sequence controller is not "
+			       "initialized\n");
+			RETURN(-EINVAL);
+		}
+		rc = seq_server_alloc_super(ss->ss_control_seq, out, env);
+	} else {
+		rc = -EINVAL;
+	}
+
+	RETURN(rc);
+}
+
+/*
+ * Unpack a sequence query from \a req, run it and fill in the reply range.
+ *
+ * The reply is packed first; the opcode and the client-supplied range are
+ * then read from the request capsule, and the index and flags of the
+ * client range are copied into the reply range before the request is
+ * dispatched through seq_server_handle().
+ *
+ * \retval err_serious(rc)      packing the reply failed
+ * \retval err_serious(-EPROTO) the opcode, the reply range or the client
+ *				range is missing from the capsule
+ * \retval otherwise the result of seq_server_handle()
+ */
+static int seq_req_handle(struct ptlrpc_request *req,
+			  const struct lu_env *env,
+			  struct seq_thread_info *info)
+{
+	struct lu_seq_range *out, *tmp;
+	struct lu_site *site;
+	int rc;
+	__u32 *opc;
+	ENTRY;
+
+	LASSERT(!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY));
+	site = req->rq_export->exp_obd->obd_lu_dev->ld_site;
+	LASSERT(site != NULL);
+
+	rc = req_capsule_server_pack(info->sti_pill);
+	if (rc)
+		RETURN(err_serious(rc));
+
+	opc = req_capsule_client_get(info->sti_pill, &RMF_SEQ_OPC);
+	if (opc != NULL) {
+		out = req_capsule_server_get(info->sti_pill, &RMF_SEQ_RANGE);
+		if (out == NULL)
+			RETURN(err_serious(-EPROTO));
+
+		/*
+		 * The client range comes from the request; treat a missing
+		 * field like the other capsule lookups instead of
+		 * dereferencing a NULL pointer below.
+		 */
+		tmp = req_capsule_client_get(info->sti_pill, &RMF_SEQ_RANGE);
+		if (tmp == NULL)
+			RETURN(err_serious(-EPROTO));
+
+		/* seq client passed mdt id, we need to pass that using out
+		 * range parameter */
+
+		out->lsr_index = tmp->lsr_index;
+		out->lsr_flags = tmp->lsr_flags;
+		rc = seq_server_handle(site, env, *opc, out);
+	} else
+		rc = err_serious(-EPROTO);
+
+	RETURN(rc);
+}
+
+/* context key constructor/destructor: seq_key_init, seq_key_fini */
+LU_KEY_INIT_FINI(seq, struct seq_thread_info);
+
+/* context key: seq_thread_key */
+LU_CONTEXT_KEY_DEFINE(seq, LCT_MD_THREAD | LCT_DT_THREAD);
+
+/*
+ * Point \a info at the request's capsule and prepare that capsule for
+ * server-side handling of the RQF_SEQ_QUERY format.
+ */
+static void seq_thread_info_init(struct ptlrpc_request *req,
+				 struct seq_thread_info *info)
+{
+	struct req_capsule *pill = &req->rq_pill;
+
+	info->sti_pill = pill;
+	/* Init request capsule */
+	req_capsule_init(pill, req, RCL_SERVER);
+	req_capsule_set(pill, &RQF_SEQ_QUERY);
+}
+
+/* Finalize the request capsule referenced by \a info. */
+static void seq_thread_info_fini(struct seq_thread_info *info)
+{
+	req_capsule_fini(info->sti_pill);
+}
+
+/*
+ * Handle one sequence request.
+ *
+ * Uses the environment of the service thread running \a req, fetches the
+ * per-thread seq_thread_info from it, runs seq_req_handle() and clears the
+ * transaction number in the reply message.
+ *
+ * Returns the result of seq_req_handle().
+ */
+int seq_handle(struct ptlrpc_request *req)
+{
+	const struct lu_env *env;
+	struct seq_thread_info *info;
+	int rc;
+
+	env = req->rq_svc_thread->t_env;
+	LASSERT(env != NULL);
+
+	/* Per-thread scratch area registered under seq_thread_key. */
+	info = lu_context_key_get(&env->le_ctx, &seq_thread_key);
+	LASSERT(info != NULL);
+
+	seq_thread_info_init(req, info);
+	rc = seq_req_handle(req, env, info);
+	/* XXX: we don't need replay but MDT assign transno in any case,
+	 * remove it manually before reply*/
+	lustre_msg_set_transno(req->rq_repmsg, 0);
+	seq_thread_info_fini(info);
+
+	return rc;
+}
+EXPORT_SYMBOL(seq_handle);
+
+/*
+ * Entry point for handling SEQ RPCs called from MDT: forwards the request
+ * held by the capsule in \a info to seq_handle() and returns its result.
+ */
+int seq_query(struct com_thread_info *info)
+{
+	return seq_handle(info->cti_pill->rc_req);
+}
+EXPORT_SYMBOL(seq_query);
+
+
+#ifdef LPROCFS
+/*
+ * Register a proc directory named after the sequence server under
+ * seq_type_proc_dir and add the seq_server_proc_list entries to it.
+ *
+ * Returns 0 on success, the PTR_ERR() of a failed registration, or the
+ * error from lprocfs_add_vars() (after undoing the registration).
+ */
+static int seq_server_proc_init(struct lu_server_seq *seq)
+{
+	int rc;
+	ENTRY;
+
+	seq->lss_proc_dir = lprocfs_register(seq->lss_name,
+					     seq_type_proc_dir,
+					     NULL, NULL);
+	if (IS_ERR(seq->lss_proc_dir)) {
+		/* lss_proc_dir keeps the error pointer; the fini below copes. */
+		rc = PTR_ERR(seq->lss_proc_dir);
+		RETURN(rc);
+	}
+
+	rc = lprocfs_add_vars(seq->lss_proc_dir,
+			      seq_server_proc_list, seq);
+	if (rc) {
+		CERROR("%s: Can't init sequence manager "
+		       "proc, rc %d\n", seq->lss_name, rc);
+		GOTO(out_cleanup, rc);
+	}
+
+	RETURN(0);
+
+out_cleanup:
+	seq_server_proc_fini(seq);
+	return rc;
+}
+
+/*
+ * Remove the proc directory of the sequence server, if any, and reset
+ * lss_proc_dir to NULL. An error pointer left behind by a failed
+ * registration is only cleared, not removed.
+ */
+static void seq_server_proc_fini(struct lu_server_seq *seq)
+{
+	ENTRY;
+	if (seq->lss_proc_dir != NULL) {
+		if (!IS_ERR(seq->lss_proc_dir))
+			lprocfs_remove(&seq->lss_proc_dir);
+		seq->lss_proc_dir = NULL;
+	}
+	EXIT;
+}
+#else
+/* Without LPROCFS there is nothing to register: always succeeds. */
+static int seq_server_proc_init(struct lu_server_seq *seq)
+{
+	return 0;
+}
+
+/* Without LPROCFS there is nothing to remove. */
+static void seq_server_proc_fini(struct lu_server_seq *seq)
+{
+	return;
+}
+#endif
+
+
+/*
+ * Initialize a sequence server or sequence controller.
+ *
+ * \param seq     manager to initialize
+ * \param dev     device passed to seq_store_init()
+ * \param prefix  name suffix; the manager is named "srv-<prefix>" or
+ *		  "ctl-<prefix>"
+ * \param type    LUSTRE_SEQ_SERVER selects server behaviour, anything else
+ *		  is treated as a controller
+ * \param ss      site the manager belongs to
+ * \param env     environment passed to the store functions
+ *
+ * Resets all ranges, sets up the backing store and reads the saved space
+ * from it; when the store holds no data (-ENODATA) the space is set to a
+ * default range and written back. Finally the proc entries are created.
+ *
+ * Returns 0 on success. On failure seq_server_fini() is called before the
+ * error is returned.
+ */
+int seq_server_init(struct lu_server_seq *seq,
+		    struct dt_device *dev,
+		    const char *prefix,
+		    enum lu_mgr_type type,
+		    struct seq_server_site *ss,
+		    const struct lu_env *env)
+{
+	int rc, is_srv = (type == LUSTRE_SEQ_SERVER);
+	ENTRY;
+
+	LASSERT(dev != NULL);
+	LASSERT(prefix != NULL);
+	LASSERT(ss != NULL);
+	LASSERT(ss->ss_lu != NULL);
+
+	seq->lss_cli = NULL;
+	seq->lss_type = type;
+	seq->lss_site = ss;
+	range_init(&seq->lss_space);
+
+	range_init(&seq->lss_lowater_set);
+	range_init(&seq->lss_hiwater_set);
+	seq->lss_set_width = LUSTRE_SEQ_BATCH_WIDTH;
+
+	mutex_init(&seq->lss_mutex);
+
+	/* Servers hand out meta-sequences, controllers super-sequences. */
+	seq->lss_width = is_srv ?
+		LUSTRE_SEQ_META_WIDTH : LUSTRE_SEQ_SUPER_WIDTH;
+
+	snprintf(seq->lss_name, sizeof(seq->lss_name),
+		 "%s-%s", (is_srv ? "srv" : "ctl"), prefix);
+
+	rc = seq_store_init(seq, env, dev);
+	if (rc)
+		GOTO(out, rc);
+	/* Request backing store for saved sequence info. */
+	rc = seq_store_read(seq, env);
+	if (rc == -ENODATA) {
+
+		/* Nothing is read, init by default value. */
+		seq->lss_space = is_srv ?
+			LUSTRE_SEQ_ZERO_RANGE:
+			LUSTRE_SEQ_SPACE_RANGE;
+
+		LASSERT(ss != NULL);
+		seq->lss_space.lsr_index = ss->ss_node_id;
+		LCONSOLE_INFO("%s: No data found "
+			      "on store. Initialize space\n",
+			      seq->lss_name);
+
+		/*
+		 * NOTE(review): a failure here is only logged; rc is
+		 * overwritten by seq_server_proc_init() below.
+		 */
+		rc = seq_store_update(env, seq, NULL, 0);
+		if (rc) {
+			CERROR("%s: Can't write space data, "
+			       "rc %d\n", seq->lss_name, rc);
+		}
+	} else if (rc) {
+		CERROR("%s: Can't read space data, rc %d\n",
+		       seq->lss_name, rc);
+		GOTO(out, rc);
+	}
+
+	/* A server may start with a zero space; a controller may not. */
+	if (is_srv) {
+		LASSERT(range_is_sane(&seq->lss_space));
+	} else {
+		LASSERT(!range_is_zero(&seq->lss_space) &&
+			range_is_sane(&seq->lss_space));
+	}
+
+	rc  = seq_server_proc_init(seq);
+	if (rc)
+		GOTO(out, rc);
+
+	EXIT;
+out:
+	if (rc)
+		seq_server_fini(seq, env);
+	return rc;
+}
+EXPORT_SYMBOL(seq_server_init);
+
+/*
+ * Finalize a sequence server: remove its proc entries, then finalize its
+ * backing store. Also used by seq_server_init() on its error path.
+ */
+void seq_server_fini(struct lu_server_seq *seq,
+		     const struct lu_env *env)
+{
+	ENTRY;
+
+	seq_server_proc_fini(seq);
+	seq_store_fini(seq, env);
+
+	EXIT;
+}
+EXPORT_SYMBOL(seq_server_fini);
+
+/*
+ * Finalize and free every sequence manager attached to the site \a ss:
+ * the server sequence, the control sequence and the client sequence, in
+ * that order. Each pointer is reset to NULL after it is freed.
+ *
+ * A NULL \a ss is accepted. Always returns 0.
+ *
+ * NOTE(review): RETURN() is used here without a matching ENTRY.
+ */
+int seq_site_fini(const struct lu_env *env, struct seq_server_site *ss)
+{
+	if (ss == NULL)
+		RETURN(0);
+
+	if (ss->ss_server_seq) {
+		seq_server_fini(ss->ss_server_seq, env);
+		OBD_FREE_PTR(ss->ss_server_seq);
+		ss->ss_server_seq = NULL;
+	}
+
+	if (ss->ss_control_seq) {
+		seq_server_fini(ss->ss_control_seq, env);
+		OBD_FREE_PTR(ss->ss_control_seq);
+		ss->ss_control_seq = NULL;
+	}
+
+	if (ss->ss_client_seq) {
+		seq_client_fini(ss->ss_client_seq);
+		OBD_FREE_PTR(ss->ss_client_seq);
+		ss->ss_client_seq = NULL;
+	}
+
+	RETURN(0);
+}
+EXPORT_SYMBOL(seq_site_fini);
+
+proc_dir_entry_t *seq_type_proc_dir = NULL;
+
+/*
+ * Module init: register the LUSTRE_SEQ_NAME proc directory under
+ * proc_lustre_root and register the seq thread context key.
+ *
+ * Returns 0 on success or the PTR_ERR() of a failed proc registration.
+ */
+static int __init fid_mod_init(void)
+{
+	seq_type_proc_dir = lprocfs_register(LUSTRE_SEQ_NAME,
+					     proc_lustre_root,
+					     NULL, NULL);
+	if (IS_ERR(seq_type_proc_dir))
+		return PTR_ERR(seq_type_proc_dir);
+
+	LU_CONTEXT_KEY_INIT(&seq_thread_key);
+	/* NOTE(review): any result of the registration is not checked. */
+	lu_context_key_register(&seq_thread_key);
+	return 0;
+}
+
+/*
+ * Module exit: unregister the seq thread context key and remove the proc
+ * directory if it was registered successfully.
+ */
+static void __exit fid_mod_exit(void)
+{
+	lu_context_key_degister(&seq_thread_key);
+	if (seq_type_proc_dir != NULL && !IS_ERR(seq_type_proc_dir)) {
+		lprocfs_remove(&seq_type_proc_dir);
+		seq_type_proc_dir = NULL;
+	}
+}
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Lustre FID Module");
+MODULE_LICENSE("GPL");
+
+cfs_module(fid, "0.1.0", fid_mod_init, fid_mod_exit);

diff --git a/drivers/staging/lustre/lustre/fid/fid_internal.h b/drivers/staging/lustre/lustre/fid/fid_internal.h
new file mode 100644
index 0000000..407a743
--- /dev/null
+++ b/drivers/staging/lustre/lustre/fid/fid_internal.h

@@ -0,0 +1,84 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/fid/fid_internal.h
+ *
+ * Author: Yury Umanets <umka@clusterfs.com>
+ */
+#ifndef __FID_INTERNAL_H
+#define __FID_INTERNAL_H
+
+#include <lustre/lustre_idl.h>
+#include <dt_object.h>
+
+#include <linux/libcfs/libcfs.h>
+
+struct seq_thread_info {
+	struct req_capsule     *sti_pill;
+	struct lu_seq_range     sti_space;
+	struct lu_buf	   sti_buf;
+};
+
+enum {
+	SEQ_TXN_STORE_CREDITS = 20
+};
+
+extern struct lu_context_key seq_thread_key;
+
+int seq_client_alloc_super(struct lu_client_seq *seq,
+			   const struct lu_env *env);
+/* Store API functions. */
+int seq_store_init(struct lu_server_seq *seq,
+		   const struct lu_env *env,
+		   struct dt_device *dt);
+
+void seq_store_fini(struct lu_server_seq *seq,
+		    const struct lu_env *env);
+
+int seq_store_read(struct lu_server_seq *seq,
+		   const struct lu_env *env);
+
+int seq_store_update(const struct lu_env *env, struct lu_server_seq *seq,
+		     struct lu_seq_range *out, int sync);
+
+#ifdef LPROCFS
+extern struct lprocfs_vars seq_server_proc_list[];
+extern struct lprocfs_vars seq_client_proc_list[];
+#endif
+
+
+extern proc_dir_entry_t *seq_type_proc_dir;
+
+#endif /* __FID_INTERNAL_H */

diff --git a/drivers/staging/lustre/lustre/fid/fid_lib.c b/drivers/staging/lustre/lustre/fid/fid_lib.c
new file mode 100644
index 0000000..eaff51a
--- /dev/null
+++ b/drivers/staging/lustre/lustre/fid/fid_lib.c

@@ -0,0 +1,97 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/fid/fid_lib.c
+ *
+ * Miscellaneous fid functions.
+ *
+ * Author: Nikita Danilov <nikita@clusterfs.com>
+ * Author: Yury Umanets <umka@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_FID
+
+# include <linux/libcfs/libcfs.h>
+# include <linux/module.h>
+
+#include <obd.h>
+#include <lu_object.h>
+#include <lustre_fid.h>
+
+/**
+ * A cluster-wide range from which fid-sequences are granted to servers and
+ * then clients.
+ *
+ * Fid namespace:
+ * <pre>
+ * Normal FID:	seq:64 [2^33,2^64-1]      oid:32	  ver:32
+ * IGIF      :	0:32, ino:32	      gen:32	  0:32
+ * IDIF      :	0:31, 1:1, ost-index:16,  objd:48	 0:32
+ * </pre>
+ *
+ * The first 0x400 sequences of normal FID are reserved for special purpose.
+ * FID_SEQ_START + 1 is for local file id generation.
+ * FID_SEQ_START + 2 is for .lustre directory and its objects
+ */
+const struct lu_seq_range LUSTRE_SEQ_SPACE_RANGE = {
+	FID_SEQ_NORMAL,
+	(__u64)~0ULL
+};
+EXPORT_SYMBOL(LUSTRE_SEQ_SPACE_RANGE);
+
+/* Zero range, used for init and other purposes. */
+const struct lu_seq_range LUSTRE_SEQ_ZERO_RANGE = {
+	0,
+	0
+};
+EXPORT_SYMBOL(LUSTRE_SEQ_ZERO_RANGE);
+
+/* Lustre Big Fs Lock fid. */
+const struct lu_fid LUSTRE_BFL_FID = { .f_seq = FID_SEQ_SPECIAL,
+				       .f_oid = FID_OID_SPECIAL_BFL,
+				       .f_ver = 0x0000000000000000 };
+EXPORT_SYMBOL(LUSTRE_BFL_FID);
+
+/** Special fid for ".lustre" directory */
+const struct lu_fid LU_DOT_LUSTRE_FID = { .f_seq = FID_SEQ_DOT_LUSTRE,
+					  .f_oid = FID_OID_DOT_LUSTRE,
+					  .f_ver = 0x0000000000000000 };
+EXPORT_SYMBOL(LU_DOT_LUSTRE_FID);
+
+/** Special fid for "fid" special object in .lustre */
+const struct lu_fid LU_OBF_FID = { .f_seq = FID_SEQ_DOT_LUSTRE,
+				   .f_oid = FID_OID_DOT_LUSTRE_OBF,
+				   .f_ver = 0x0000000000000000 };
+EXPORT_SYMBOL(LU_OBF_FID);

diff --git a/drivers/staging/lustre/lustre/fid/fid_request.c b/drivers/staging/lustre/lustre/fid/fid_request.c
new file mode 100644
index 0000000..fcaaca7
--- /dev/null
+++ b/drivers/staging/lustre/lustre/fid/fid_request.c

@@ -0,0 +1,522 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/fid/fid_request.c
+ *
+ * Lustre Sequence Manager
+ *
+ * Author: Yury Umanets <umka@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_FID
+
+# include <linux/libcfs/libcfs.h>
+# include <linux/module.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <dt_object.h>
+#include <md_object.h>
+#include <obd_support.h>
+#include <lustre_req_layout.h>
+#include <lustre_fid.h>
+/* mdc RPC locks */
+#include <lustre_mdc.h>
+#include "fid_internal.h"
+
+/*
+ * Send a sequence allocation RPC and store the granted range in \a output.
+ *
+ * \param seq      client sequence manager; its export, type and space
+ *		   index are used to build the request
+ * \param output   receives the range returned by the server
+ * \param opc      operation code placed in the request; SEQ_ALLOC_SUPER
+ *		   selects the controller portal, any other value the
+ *		   metadata or data portal depending on the client type
+ * \param opcname  operation name used in the debug message
+ *
+ * \retval 0        a sane, non-exhausted range was received
+ * \retval -ENOMEM  the request could not be allocated
+ * \retval -EPROTO  the reply carries no sequence range
+ * \retval -EINVAL  the received range is not sane or is exhausted
+ * \retval otherwise the error returned by ptlrpc_queue_wait()
+ *
+ * Note that \a output is overwritten with the received range even when
+ * that range is then rejected with -EINVAL.
+ */
+static int seq_client_rpc(struct lu_client_seq *seq,
+			  struct lu_seq_range *output, __u32 opc,
+			  const char *opcname)
+{
+	struct obd_export     *exp = seq->lcs_exp;
+	struct ptlrpc_request *req;
+	struct lu_seq_range   *out, *in;
+	__u32		 *op;
+	unsigned int	   debug_mask;
+	int		    rc;
+	ENTRY;
+
+	req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), &RQF_SEQ_QUERY,
+					LUSTRE_MDS_VERSION, SEQ_QUERY);
+	if (req == NULL)
+		RETURN(-ENOMEM);
+
+	/* Init operation code */
+	op = req_capsule_client_get(&req->rq_pill, &RMF_SEQ_OPC);
+	*op = opc;
+
+	/* Zero out input range, this is not recovery yet. */
+	in = req_capsule_client_get(&req->rq_pill, &RMF_SEQ_RANGE);
+	range_init(in);
+
+	ptlrpc_request_set_replen(req);
+
+	/* Tell the server which index and target type the range is for. */
+	in->lsr_index = seq->lcs_space.lsr_index;
+	if (seq->lcs_type == LUSTRE_SEQ_METADATA)
+		fld_range_set_mdt(in);
+	else
+		fld_range_set_ost(in);
+
+	if (opc == SEQ_ALLOC_SUPER) {
+		req->rq_request_portal = SEQ_CONTROLLER_PORTAL;
+		req->rq_reply_portal = MDC_REPLY_PORTAL;
+		/* During allocating super sequence for data object,
+		 * the current thread might hold the export of MDT0(MDT0
+		 * precreating objects on this OST), and it will send the
+		 * request to MDT0 here, so we can not keep resending the
+		 * request here, otherwise if MDT0 is failed(umounted),
+		 * it can not release the export of MDT0 */
+		if (seq->lcs_type == LUSTRE_SEQ_DATA)
+			req->rq_no_delay = req->rq_no_resend = 1;
+		debug_mask = D_CONSOLE;
+	} else {
+		if (seq->lcs_type == LUSTRE_SEQ_METADATA)
+			req->rq_request_portal = SEQ_METADATA_PORTAL;
+		else
+			req->rq_request_portal = SEQ_DATA_PORTAL;
+		debug_mask = D_INFO;
+	}
+
+	ptlrpc_at_set_req_timeout(req);
+
+	/* Metadata clients send the request under the mdc RPC lock. */
+	if (seq->lcs_type == LUSTRE_SEQ_METADATA)
+		mdc_get_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL);
+	rc = ptlrpc_queue_wait(req);
+	if (seq->lcs_type == LUSTRE_SEQ_METADATA)
+		mdc_put_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL);
+	if (rc)
+		GOTO(out_req, rc);
+
+	/*
+	 * A reply without the range field must not be dereferenced; the
+	 * server side treats a missing field the same way.
+	 */
+	out = req_capsule_server_get(&req->rq_pill, &RMF_SEQ_RANGE);
+	if (out == NULL)
+		GOTO(out_req, rc = -EPROTO);
+	*output = *out;
+
+	if (!range_is_sane(output)) {
+		CERROR("%s: Invalid range received from server: "
+		       DRANGE"\n", seq->lcs_name, PRANGE(output));
+		GOTO(out_req, rc = -EINVAL);
+	}
+
+	/*
+	 * NOTE(review): the trailing ']' after DRANGE in the two messages
+	 * below differs from the message above; confirm against the DRANGE
+	 * definition whether it is a stray character.
+	 */
+	if (range_is_exhausted(output)) {
+		CERROR("%s: Range received from server is exhausted: "
+		       DRANGE"]\n", seq->lcs_name, PRANGE(output));
+		GOTO(out_req, rc = -EINVAL);
+	}
+
+	CDEBUG_LIMIT(debug_mask, "%s: Allocated %s-sequence "DRANGE"]\n",
+		     seq->lcs_name, opcname, PRANGE(output));
+
+	EXIT;
+out_req:
+	ptlrpc_req_finished(req);
+	return rc;
+}
+
+/*
+ * Request sequence-controller node to allocate new super-sequence.
+ *
+ * With a local server (lcs_srv) the range is allocated directly into
+ * lcs_space; otherwise an RPC is sent over the export. The whole
+ * operation runs under lcs_mutex.
+ *
+ * \retval -EINPROGRESS  there is neither a local server nor an export yet
+ * \retval otherwise the result of the allocation call
+ */
+int seq_client_alloc_super(struct lu_client_seq *seq,
+			   const struct lu_env *env)
+{
+	int rc;
+	ENTRY;
+
+	mutex_lock(&seq->lcs_mutex);
+
+	if (seq->lcs_srv) {
+		LASSERT(env != NULL);
+		rc = seq_server_alloc_super(seq->lcs_srv, &seq->lcs_space,
+					    env);
+	} else if (seq->lcs_exp != NULL) {
+		rc = seq_client_rpc(seq, &seq->lcs_space,
+				    SEQ_ALLOC_SUPER, "super");
+	} else {
+		/* The connection to seq controller has not been setup
+		 * (lcs_exp == NULL) */
+		rc = -EINPROGRESS;
+	}
+
+	mutex_unlock(&seq->lcs_mutex);
+	RETURN(rc);
+}
+
+/*
+ * Request sequence-controller node to allocate new meta-sequence.
+ *
+ * With a local server (lcs_srv) the range is allocated directly;
+ * otherwise the RPC is repeated for as long as it fails with
+ * -EINPROGRESS or -EAGAIN. Returns the final result.
+ */
+static int seq_client_alloc_meta(const struct lu_env *env,
+				 struct lu_client_seq *seq)
+{
+	int rc;
+	ENTRY;
+
+	if (seq->lcs_srv) {
+		LASSERT(env != NULL);
+		rc = seq_server_alloc_meta(seq->lcs_srv, &seq->lcs_space, env);
+		RETURN(rc);
+	}
+
+	for (;;) {
+		/* If meta server return -EINPROGRESS or EAGAIN,
+		 * it means meta server might not be ready to
+		 * allocate super sequence from sequence controller
+		 * (MDT0)yet */
+		rc = seq_client_rpc(seq, &seq->lcs_space,
+				    SEQ_ALLOC_META, "meta");
+		if (rc != -EINPROGRESS && rc != -EAGAIN)
+			break;
+	}
+
+	RETURN(rc);
+}
+
+/*
+ * Allocate new sequence for client.
+ *
+ * Takes the first sequence number of lcs_space, refilling the space with
+ * a new meta-sequence first when it is exhausted. The number is stored in
+ * \a seqnr.
+ *
+ * Returns 0 on success or the error from seq_client_alloc_meta().
+ */
+static int seq_client_alloc_seq(const struct lu_env *env,
+				struct lu_client_seq *seq, seqno_t *seqnr)
+{
+	struct lu_seq_range *space = &seq->lcs_space;
+	int rc = 0;
+	ENTRY;
+
+	LASSERT(range_is_sane(space));
+
+	if (range_is_exhausted(space)) {
+		rc = seq_client_alloc_meta(env, seq);
+		if (rc) {
+			CERROR("%s: Can't allocate new meta-sequence,"
+			       "rc %d\n", seq->lcs_name, rc);
+			RETURN(rc);
+		}
+
+		CDEBUG(D_INFO, "%s: New range - "DRANGE"\n",
+		       seq->lcs_name, PRANGE(space));
+	}
+
+	LASSERT(!range_is_exhausted(space));
+	/* Consume the first sequence of the range. */
+	*seqnr = space->lsr_start;
+	space->lsr_start += 1;
+
+	CDEBUG(D_INFO, "%s: Allocated sequence ["LPX64"]\n", seq->lcs_name,
+	       *seqnr);
+
+	RETURN(rc);
+}
+
+/*
+ * Try to become the single updater of \a seq.
+ *
+ * The callers in this file invoke it with lcs_mutex held.
+ *
+ * \retval 0        lcs_update was claimed; lcs_mutex has been RELEASED
+ * \retval -EAGAIN  another update was in progress; the caller was put to
+ *		    sleep on lcs_waitq and lcs_mutex is HELD again on
+ *		    return, so the caller can retry
+ */
+static int seq_fid_alloc_prep(struct lu_client_seq *seq,
+			      wait_queue_t *link)
+{
+	if (seq->lcs_update) {
+		/* Queue up before dropping the mutex, then sleep. */
+		add_wait_queue(&seq->lcs_waitq, link);
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		mutex_unlock(&seq->lcs_mutex);
+
+		waitq_wait(link, TASK_UNINTERRUPTIBLE);
+
+		mutex_lock(&seq->lcs_mutex);
+		remove_wait_queue(&seq->lcs_waitq, link);
+		set_current_state(TASK_RUNNING);
+		return -EAGAIN;
+	}
+	++seq->lcs_update;
+	mutex_unlock(&seq->lcs_mutex);
+	return 0;
+}
+
+/*
+ * Counterpart of a successful seq_fid_alloc_prep(): re-takes lcs_mutex,
+ * drops the lcs_update claim and wakes up waiters on lcs_waitq.
+ * lcs_mutex is left HELD; the caller unlocks it.
+ */
+static void seq_fid_alloc_fini(struct lu_client_seq *seq)
+{
+	LASSERT(seq->lcs_update == 1);
+	mutex_lock(&seq->lcs_mutex);
+	--seq->lcs_update;
+	wake_up(&seq->lcs_waitq);
+}
+
+/**
+ * Allocate the whole seq to the caller.
+ *
+ * Waits until no other update is in progress, allocates a new sequence
+ * number into \a seqnr and marks the client's current fid as fully used,
+ * so that the sequence is not used for further fid allocation.
+ *
+ * Returns 0 on success or the error from seq_client_alloc_seq().
+ **/
+int seq_client_get_seq(const struct lu_env *env,
+		       struct lu_client_seq *seq, seqno_t *seqnr)
+{
+	wait_queue_t link;
+	int rc;
+
+	LASSERT(seqnr != NULL);
+	mutex_lock(&seq->lcs_mutex);
+	init_waitqueue_entry_current(&link);
+
+	/* Retry until the update claim is ours; the mutex is then dropped. */
+	while (1) {
+		rc = seq_fid_alloc_prep(seq, &link);
+		if (rc == 0)
+			break;
+	}
+
+	rc = seq_client_alloc_seq(env, seq, seqnr);
+	if (rc) {
+		CERROR("%s: Can't allocate new sequence, "
+		       "rc %d\n", seq->lcs_name, rc);
+		/* fini re-takes the mutex, so unlock afterwards. */
+		seq_fid_alloc_fini(seq);
+		mutex_unlock(&seq->lcs_mutex);
+		return rc;
+	}
+
+	CDEBUG(D_INFO, "%s: allocate sequence "
+	       "[0x%16.16"LPF64"x]\n", seq->lcs_name, *seqnr);
+
+	/* Since the caller require the whole seq,
+	 * so marked this seq to be used */
+	if (seq->lcs_type == LUSTRE_SEQ_METADATA)
+		seq->lcs_fid.f_oid = LUSTRE_METADATA_SEQ_MAX_WIDTH;
+	else
+		seq->lcs_fid.f_oid = LUSTRE_DATA_SEQ_MAX_WIDTH;
+
+	seq->lcs_fid.f_seq = *seqnr;
+	seq->lcs_fid.f_ver = 0;
+	/*
+	 * Release the update claim and wake up waiters; the mutex taken by
+	 * seq_fid_alloc_fini() is dropped right after.
+	 */
+	seq_fid_alloc_fini(seq);
+	mutex_unlock(&seq->lcs_mutex);
+
+	return rc;
+}
+EXPORT_SYMBOL(seq_client_get_seq);
+
+/*
+ * Allocate new fid on passed client @seq and save it to @fid.
+ *
+ * While the current sequence still has object ids left (f_oid below
+ * lcs_width) the last fid is simply bumped. Otherwise a new sequence is
+ * allocated and the fid restarts at LUSTRE_FID_INIT_OID in it.
+ *
+ * \retval 0  fid allocated from the current sequence
+ * \retval 1  fid allocated after switching to a new sequence
+ * \retval the non-zero result of seq_client_alloc_seq() on failure,
+ *	   in which case @fid is not written
+ */
+int seq_client_alloc_fid(const struct lu_env *env,
+			 struct lu_client_seq *seq, struct lu_fid *fid)
+{
+	wait_queue_t link;
+	int rc;
+	ENTRY;
+
+	LASSERT(seq != NULL);
+	LASSERT(fid != NULL);
+
+	init_waitqueue_entry_current(&link);
+	mutex_lock(&seq->lcs_mutex);
+
+	/* Fault injection: pretend the current sequence is used up. */
+	if (OBD_FAIL_CHECK(OBD_FAIL_SEQ_EXHAUST))
+		seq->lcs_fid.f_oid = seq->lcs_width;
+
+	while (1) {
+		seqno_t seqnr;
+
+		if (!fid_is_zero(&seq->lcs_fid) &&
+		    fid_oid(&seq->lcs_fid) < seq->lcs_width) {
+			/* Just bump last allocated fid and return to caller. */
+			seq->lcs_fid.f_oid += 1;
+			rc = 0;
+			break;
+		}
+
+		/*
+		 * Non-zero means another update was in progress and we
+		 * slept; re-check the fid with the mutex held again.
+		 */
+		rc = seq_fid_alloc_prep(seq, &link);
+		if (rc)
+			continue;
+
+		/* The mutex is not held while the sequence is allocated. */
+		rc = seq_client_alloc_seq(env, seq, &seqnr);
+		if (rc) {
+			CERROR("%s: Can't allocate new sequence, "
+			       "rc %d\n", seq->lcs_name, rc);
+			seq_fid_alloc_fini(seq);
+			mutex_unlock(&seq->lcs_mutex);
+			RETURN(rc);
+		}
+
+		CDEBUG(D_INFO, "%s: Switch to sequence "
+		       "[0x%16.16"LPF64"x]\n", seq->lcs_name, seqnr);
+
+		seq->lcs_fid.f_oid = LUSTRE_FID_INIT_OID;
+		seq->lcs_fid.f_seq = seqnr;
+		seq->lcs_fid.f_ver = 0;
+
+		/*
+		 * Inform caller that sequence switch is performed to allow it
+		 * to setup FLD for it.
+		 */
+		rc = 1;
+
+		/* Re-takes the mutex, which is dropped after the copy below. */
+		seq_fid_alloc_fini(seq);
+		break;
+	}
+
+	*fid = seq->lcs_fid;
+	mutex_unlock(&seq->lcs_mutex);
+
+	CDEBUG(D_INFO, "%s: Allocated FID "DFID"\n", seq->lcs_name,  PFID(fid));
+	RETURN(rc);
+}
+EXPORT_SYMBOL(seq_client_alloc_fid);
+
+/*
+ * Finish the current sequence due to disconnect.
+ * See mdc_import_event()
+ *
+ * Waits until no sequence update is in flight (lcs_update is zero), then
+ * zeroes the cached FID and re-initializes the cached sequence range.
+ */
+void seq_client_flush(struct lu_client_seq *seq)
+{
+	wait_queue_t link;
+
+	LASSERT(seq != NULL);
+	init_waitqueue_entry_current(&link);
+	mutex_lock(&seq->lcs_mutex);
+
+	/* lcs_mutex is dropped while sleeping and re-taken before lcs_update
+	 * is tested again. */
+	while (seq->lcs_update) {
+		add_wait_queue(&seq->lcs_waitq, &link);
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		mutex_unlock(&seq->lcs_mutex);
+
+		waitq_wait(&link, TASK_UNINTERRUPTIBLE);
+
+		mutex_lock(&seq->lcs_mutex);
+		remove_wait_queue(&seq->lcs_waitq, &link);
+		set_current_state(TASK_RUNNING);
+	}
+
+	fid_zero(&seq->lcs_fid);
+	/**
+	 * This index should not be used for sequence range allocation.
+	 * Set it to -1 as a debugging check.
+	 *
+	 * NOTE(review): range_init() runs right after this store; if it
+	 * resets lsr_index the -1 marker is lost -- confirm against the
+	 * range_init() definition.
+	 */
+
+	seq->lcs_space.lsr_index = -1;
+
+	range_init(&seq->lcs_space);
+	mutex_unlock(&seq->lcs_mutex);
+}
+EXPORT_SYMBOL(seq_client_flush);
+
+static void seq_client_proc_fini(struct lu_client_seq *seq);
+
+#ifdef LPROCFS
+/*
+ * Register the procfs directory of @seq below seq_type_proc_dir and populate
+ * it with the entries of seq_client_proc_list.
+ *
+ * Returns 0 on success, or the error from lprocfs_register() or
+ * lprocfs_add_vars(); in the latter case seq_client_proc_fini() is called
+ * before returning.
+ */
+static int seq_client_proc_init(struct lu_client_seq *seq)
+{
+	int rc;
+	ENTRY;
+
+	seq->lcs_proc_dir = lprocfs_register(seq->lcs_name, seq_type_proc_dir,
+					     NULL, NULL);
+	if (IS_ERR(seq->lcs_proc_dir)) {
+		rc = PTR_ERR(seq->lcs_proc_dir);
+		CERROR("%s: LProcFS failed in seq-init\n",
+		       seq->lcs_name);
+		RETURN(rc);
+	}
+
+	rc = lprocfs_add_vars(seq->lcs_proc_dir, seq_client_proc_list, seq);
+	if (rc != 0) {
+		CERROR("%s: Can't init sequence manager "
+		       "proc, rc %d\n", seq->lcs_name, rc);
+		GOTO(out_cleanup, rc);
+	}
+
+	RETURN(0);
+
+out_cleanup:
+	/* undo the directory registration done above */
+	seq_client_proc_fini(seq);
+	return rc;
+}
+
+/*
+ * Remove the procfs directory of @seq, if one was registered, and clear the
+ * pointer.  An ERR_PTR left behind by a failed registration is cleared
+ * without being passed to lprocfs_remove().
+ */
+static void seq_client_proc_fini(struct lu_client_seq *seq)
+{
+	ENTRY;
+	if (seq->lcs_proc_dir != NULL && !IS_ERR(seq->lcs_proc_dir))
+		lprocfs_remove(&seq->lcs_proc_dir);
+	seq->lcs_proc_dir = NULL;
+	EXIT;
+}
+#else
+/* Without LPROCFS there is nothing to register; always report success. */
+static int seq_client_proc_init(struct lu_client_seq *seq)
+{
+	return 0;
+}
+
+/* Without LPROCFS there is nothing to tear down. */
+static void seq_client_proc_fini(struct lu_client_seq *seq)
+{
+}
+#endif
+
+/*
+ * Initialize the client sequence manager @seq.
+ *
+ * Records @srv and @type, picks the sequence width for the client type,
+ * flushes any cached FID/range state, takes a reference on @exp when one is
+ * given, names the client "cli-<prefix>" and registers its procfs entries.
+ *
+ * Returns 0 on success or the error from seq_client_proc_init(), in which
+ * case seq_client_fini() has already been called on @seq.
+ */
+int seq_client_init(struct lu_client_seq *seq,
+		    struct obd_export *exp,
+		    enum lu_cli_type type,
+		    const char *prefix,
+		    struct lu_server_seq *srv)
+{
+	int rc;
+	ENTRY;
+
+	LASSERT(seq != NULL);
+	LASSERT(prefix != NULL);
+
+	seq->lcs_type = type;
+	seq->lcs_srv = srv;
+	seq->lcs_width = (type == LUSTRE_SEQ_METADATA) ?
+			 LUSTRE_METADATA_SEQ_MAX_WIDTH :
+			 LUSTRE_DATA_SEQ_MAX_WIDTH;
+
+	/* seq_client_flush() below takes lcs_mutex and tests lcs_waitq. */
+	mutex_init(&seq->lcs_mutex);
+	init_waitqueue_head(&seq->lcs_waitq);
+
+	/* Make sure that things are clear before work is started. */
+	seq_client_flush(seq);
+
+	if (exp != NULL) {
+		seq->lcs_exp = class_export_get(exp);
+	} else if (type == LUSTRE_SEQ_METADATA) {
+		/* a metadata client without an export needs a local server */
+		LASSERT(seq->lcs_srv != NULL);
+	}
+
+	snprintf(seq->lcs_name, sizeof(seq->lcs_name),
+		 "cli-%s", prefix);
+
+	rc = seq_client_proc_init(seq);
+	if (rc != 0)
+		seq_client_fini(seq);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(seq_client_init);
+
+/*
+ * Tear down the client sequence manager @seq: remove its procfs entries,
+ * drop the export reference if one is held and forget the server pointer.
+ */
+void seq_client_fini(struct lu_client_seq *seq)
+{
+	struct obd_export *exp;
+	ENTRY;
+
+	seq_client_proc_fini(seq);
+
+	exp = seq->lcs_exp;
+	if (exp != NULL) {
+		seq->lcs_exp = NULL;
+		class_export_put(exp);
+	}
+
+	seq->lcs_srv = NULL;
+	EXIT;
+}
+EXPORT_SYMBOL(seq_client_fini);

diff --git a/drivers/staging/lustre/lustre/fid/fid_store.c b/drivers/staging/lustre/lustre/fid/fid_store.c
new file mode 100644
index 0000000..a90e6e3
--- /dev/null
+++ b/drivers/staging/lustre/lustre/fid/fid_store.c

@@ -0,0 +1,259 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2013, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/fid/fid_store.c
+ *
+ * Lustre Sequence Manager
+ *
+ * Author: Yury Umanets <umka@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_FID
+
+# include <linux/libcfs/libcfs.h>
+# include <linux/module.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <dt_object.h>
+#include <md_object.h>
+#include <obd_support.h>
+#include <lustre_req_layout.h>
+#include <lustre_fid.h>
+#include "fid_internal.h"
+
+
+/*
+ * Point the per-thread lu_buf in @info at the per-thread sti_space range and
+ * return it, ready to be handed to a read or write of the sequence object.
+ */
+static struct lu_buf *seq_store_buf(struct seq_thread_info *info)
+{
+	struct lu_buf *buf = &info->sti_buf;
+
+	buf->lb_len = sizeof(info->sti_space);
+	buf->lb_buf = &info->sti_space;
+	return buf;
+}
+
+/* Commit-callback context allocated by seq_update_cb_add(). */
+struct seq_update_callback {
+	/* embedded callback; seq_update_cb() recovers the container from it */
+	struct dt_txn_commit_cb suc_cb;
+	/* sequence server whose lss_need_sync is cleared by the callback */
+	struct lu_server_seq   *suc_seq;
+};
+
+/*
+ * Commit callback registered by seq_update_cb_add(): clear lss_need_sync on
+ * the sequence server the callback was created for and free the callback
+ * context.  @env, @th and @err are not used.
+ */
+void seq_update_cb(struct lu_env *env, struct thandle *th,
+		   struct dt_txn_commit_cb *cb, int err)
+{
+	struct seq_update_callback *ccb =
+		container_of0(cb, struct seq_update_callback, suc_cb);
+
+	LASSERT(ccb->suc_seq != NULL);
+
+	ccb->suc_seq->lss_need_sync = 0;
+	OBD_FREE_PTR(ccb);
+}
+
+/*
+ * Attach a commit callback to transaction @th that clears
+ * @seq->lss_need_sync; lss_need_sync is set to 1 here.
+ *
+ * Returns 0 on success, -ENOMEM if the callback context cannot be allocated,
+ * or the error from dt_trans_cb_add() (the context is freed in that case;
+ * lss_need_sync stays set).
+ */
+int seq_update_cb_add(struct thandle *th, struct lu_server_seq *seq)
+{
+	struct seq_update_callback *ccb;
+	struct dt_txn_commit_cb	   *dcb;
+	int			   rc;
+
+	OBD_ALLOC_PTR(ccb);
+	if (!ccb)
+		return -ENOMEM;
+
+	seq->lss_need_sync = 1;
+	ccb->suc_seq	   = seq;
+
+	dcb = &ccb->suc_cb;
+	INIT_LIST_HEAD(&dcb->dcb_linkage);
+	dcb->dcb_func = seq_update_cb;
+	/* strncpy() may leave the name unterminated; terminate explicitly */
+	strncpy(dcb->dcb_name, "seq_update_cb", MAX_COMMIT_CB_STR_LEN);
+	dcb->dcb_name[MAX_COMMIT_CB_STR_LEN - 1] = '\0';
+
+	rc = dt_trans_cb_add(th, dcb);
+	if (rc != 0)
+		OBD_FREE_PTR(ccb);
+	return rc;
+}
+
+/*
+ * Write the current sequence space of @seq (seq->lss_space) to its backing
+ * object and, when @out is not NULL, also create the FLD entry for @out, all
+ * within one local transaction.
+ *
+ * When @sync is zero a commit callback is added instead of forcing a
+ * synchronous transaction; if the callback cannot be added the transaction
+ * is made synchronous anyway.
+ *
+ * Returns 0 on success or the first error hit while creating, declaring,
+ * starting or executing the transaction.  The result of dt_trans_stop() is
+ * not checked.
+ *
+ * This function implies that caller takes care about locking.
+ */
+int seq_store_update(const struct lu_env *env, struct lu_server_seq *seq,
+		     struct lu_seq_range *out, int sync)
+{
+	struct dt_device *dt_dev = lu2dt_dev(seq->lss_obj->do_lu.lo_dev);
+	struct seq_thread_info *info;
+	struct thandle *th;
+	loff_t pos = 0;
+	int rc;
+
+	info = lu_context_key_get(&env->le_ctx, &seq_thread_key);
+	LASSERT(info != NULL);
+
+	th = dt_trans_create(env, dt_dev);
+	if (IS_ERR(th))
+		RETURN(PTR_ERR(th));
+
+	/* Declare everything the transaction will do before starting it. */
+	rc = dt_declare_record_write(env, seq->lss_obj,
+				     sizeof(struct lu_seq_range), 0, th);
+	if (rc)
+		GOTO(exit, rc);
+
+	if (out != NULL) {
+		rc = fld_declare_server_create(env,
+					       seq->lss_site->ss_server_fld,
+					       out, th);
+		if (rc)
+			GOTO(exit, rc);
+	}
+
+	rc = dt_trans_start_local(env, dt_dev, th);
+	if (rc)
+		GOTO(exit, rc);
+
+	/* Store ranges in le format. */
+	range_cpu_to_le(&info->sti_space, &seq->lss_space);
+
+	/* The record is written at offset 0 of the sequence object. */
+	rc = dt_record_write(env, seq->lss_obj, seq_store_buf(info), &pos, th);
+	if (rc) {
+		CERROR("%s: Can't write space data, rc %d\n",
+		       seq->lss_name, rc);
+		GOTO(exit, rc);
+	} else if (out != NULL) {
+		rc = fld_server_create(env, seq->lss_site->ss_server_fld, out,
+				       th);
+		if (rc) {
+			CERROR("%s: Can't Update fld database, rc %d\n",
+				seq->lss_name, rc);
+			GOTO(exit, rc);
+		}
+	}
+	/* next sequence update will need sync until this update is committed
+	 * in case of sync operation this is not needed obviously */
+	if (!sync)
+		/* if callback can't be added then sync always */
+		sync = !!seq_update_cb_add(th, seq);
+
+	th->th_sync |= sync;
+exit:
+	/* The transaction handle is stopped on success and on every error. */
+	dt_trans_stop(env, dt_dev, th);
+	return rc;
+}
+
+/*
+ * Read the stored sequence space from the backing object of @seq into
+ * seq->lss_space, converting it from little-endian to CPU byte order.
+ *
+ * Returns 0 on success, -ENODATA if nothing was read, -EIO on a short read,
+ * or the negative value returned by the read operation.
+ *
+ * This function implies that caller takes care about locking or locking is
+ * not needed (init time).
+ */
+int seq_store_read(struct lu_server_seq *seq,
+		   const struct lu_env *env)
+{
+	struct seq_thread_info *info;
+	loff_t pos = 0;
+	int want;
+	int rc;
+	ENTRY;
+
+	info = lu_context_key_get(&env->le_ctx, &seq_thread_key);
+	LASSERT(info != NULL);
+
+	want = (int)sizeof(info->sti_space);
+	rc = seq->lss_obj->do_body_ops->dbo_read(env, seq->lss_obj,
+						 seq_store_buf(info),
+						 &pos, BYPASS_CAPA);
+
+	if (rc == want) {
+		/* complete record: publish it in CPU byte order */
+		range_le_to_cpu(&seq->lss_space, &info->sti_space);
+		CDEBUG(D_INFO, "%s: Space - "DRANGE"\n",
+		       seq->lss_name, PRANGE(&seq->lss_space));
+		RETURN(0);
+	}
+
+	if (rc == 0)
+		RETURN(-ENODATA);
+
+	if (rc > 0) {
+		CERROR("%s: Read only %d bytes of %d\n", seq->lss_name,
+		       rc, want);
+		RETURN(-EIO);
+	}
+
+	/* negative: pass the read error through unchanged */
+	RETURN(rc);
+}
+
+/*
+ * Find or create the on-disk object backing @seq on device @dt and remember
+ * it in seq->lss_obj.  The object FID and the name used in the error message
+ * depend on whether @seq is a sequence server or a sequence controller.
+ *
+ * Returns 0 on success or the error encoded in the pointer returned by
+ * dt_find_or_create().
+ */
+int seq_store_init(struct lu_server_seq *seq,
+		   const struct lu_env *env,
+		   struct dt_device *dt)
+{
+	struct dt_object *dt_obj;
+	struct lu_fid fid;
+	struct lu_attr attr;
+	struct dt_object_format dof;
+	const char *name;
+	int rc;
+	ENTRY;
+
+	if (seq->lss_type == LUSTRE_SEQ_SERVER) {
+		name = LUSTRE_SEQ_SRV_NAME;
+		lu_local_obj_fid(&fid, FID_SEQ_SRV_OID);
+	} else {
+		name = LUSTRE_SEQ_CTL_NAME;
+		lu_local_obj_fid(&fid, FID_SEQ_CTL_OID);
+	}
+
+	/* regular file, mode 0666; only la_mode is marked valid */
+	memset(&attr, 0, sizeof(attr));
+	attr.la_valid = LA_MODE;
+	attr.la_mode = S_IFREG | 0666;
+	dof.dof_type = DFT_REGULAR;
+
+	dt_obj = dt_find_or_create(env, dt, &fid, &dof, &attr);
+	if (IS_ERR(dt_obj)) {
+		rc = PTR_ERR(dt_obj);
+		CERROR("%s: Can't find \"%s\" obj %d\n",
+		       seq->lss_name, name, rc);
+		RETURN(rc);
+	}
+
+	seq->lss_obj = dt_obj;
+	RETURN(0);
+}
+
+/*
+ * Release the backing object of @seq, if any, and clear seq->lss_obj.  An
+ * ERR_PTR value is cleared without being passed to lu_object_put().
+ */
+void seq_store_fini(struct lu_server_seq *seq,
+		    const struct lu_env *env)
+{
+	ENTRY;
+
+	if (seq->lss_obj != NULL && !IS_ERR(seq->lss_obj))
+		lu_object_put(env, &seq->lss_obj->do_lu);
+	seq->lss_obj = NULL;
+
+	EXIT;
+}

diff --git a/drivers/staging/lustre/lustre/fid/lproc_fid.c b/drivers/staging/lustre/lustre/fid/lproc_fid.c
new file mode 100644
index 0000000..af817a8
--- /dev/null
+++ b/drivers/staging/lustre/lustre/fid/lproc_fid.c

@@ -0,0 +1,222 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/fid/lproc_fid.c
+ *
+ * Lustre Sequence Manager
+ *
+ * Author: Yury Umanets <umka@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_FID
+
+# include <linux/libcfs/libcfs.h>
+# include <linux/module.h>
+# include <linux/uaccess.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <dt_object.h>
+#include <md_object.h>
+#include <obd_support.h>
+#include <lustre_req_layout.h>
+#include <lustre_fid.h>
+#include "fid_internal.h"
+
+#ifdef LPROCFS
+/*
+ * Parse a sequence range of the form "[<start> - <end>]" (hexadecimal)
+ * written from user space and store it in @range.  Only lsr_start and
+ * lsr_end of @range are changed.
+ *
+ * Note: this function is only used for testing, it is not safe for
+ * production use.
+ *
+ * Returns 0 on success, -EINVAL if the input is too long, cannot be parsed,
+ * or describes an insane or zero range, and -EFAULT if the user buffer
+ * cannot be read.
+ */
+static int
+lprocfs_fid_write_common(const char *buffer, unsigned long count,
+			 struct lu_seq_range *range)
+{
+	struct lu_seq_range tmp;
+	/* ample room for "[0x<16 hex digits> - 0x<16 hex digits>]\n" */
+	char kernbuf[64];
+	int rc;
+	ENTRY;
+
+	LASSERT(range != NULL);
+
+	/* @buffer is handed over unchanged from the ->write() handler, i.e.
+	 * it is a user-space address: copy it into kernel memory before
+	 * parsing instead of dereferencing it directly. */
+	if (count >= sizeof(kernbuf))
+		RETURN(-EINVAL);
+	if (copy_from_user(kernbuf, buffer, count))
+		RETURN(-EFAULT);
+	kernbuf[count] = '\0';
+
+	/* Start from the current range so that the fields sscanf() does not
+	 * fill in (index, flags) are preserved rather than overwritten with
+	 * uninitialized stack contents. */
+	tmp = *range;
+
+	rc = sscanf(kernbuf, "[%llx - %llx]\n",
+		    (long long unsigned *)&tmp.lsr_start,
+		    (long long unsigned *)&tmp.lsr_end);
+	if (rc != 2 || !range_is_sane(&tmp) || range_is_zero(&tmp))
+		RETURN(-EINVAL);
+	*range = tmp;
+	RETURN(0);
+}
+
+/* Client side procfs stuff */
+
+/*
+ * ->write() handler of the "space" entry: replace the cached sequence range
+ * of the client with the range parsed from @buffer.
+ *
+ * Returns @count when the range was accepted, otherwise the error from
+ * lprocfs_fid_write_common() so that a rejected write is reported to the
+ * writer instead of being silently counted as successful.
+ */
+static ssize_t
+lprocfs_fid_space_seq_write(struct file *file, const char *buffer,
+			    size_t count, loff_t *off)
+{
+	struct lu_client_seq *seq = ((struct seq_file *)file->private_data)->private;
+	int rc;
+	ENTRY;
+
+	LASSERT(seq != NULL);
+
+	mutex_lock(&seq->lcs_mutex);
+	rc = lprocfs_fid_write_common(buffer, count, &seq->lcs_space);
+
+	if (rc == 0) {
+		CDEBUG(D_INFO, "%s: Space: "DRANGE"\n",
+		       seq->lcs_name, PRANGE(&seq->lcs_space));
+	}
+
+	mutex_unlock(&seq->lcs_mutex);
+
+	if (rc != 0)
+		RETURN(rc);
+
+	RETURN(count);
+}
+
+/*
+ * ->show() handler of the "space" entry: print the cached sequence range of
+ * the client under lcs_mutex.  Returns the result of seq_printf().
+ */
+static int
+lprocfs_fid_space_seq_show(struct seq_file *m, void *unused)
+{
+	struct lu_client_seq *seq = m->private;
+	int rc;
+	ENTRY;
+
+	LASSERT(seq != NULL);
+
+	mutex_lock(&seq->lcs_mutex);
+	rc = seq_printf(m, "["LPX64" - "LPX64"]:%x:%s\n",
+			PRANGE(&seq->lcs_space));
+	mutex_unlock(&seq->lcs_mutex);
+
+	RETURN(rc);
+}
+
+/*
+ * ->write() handler of the "width" entry: set the sequence width of the
+ * client.  The value must be positive and no larger than the maximum width
+ * for the client type; other values are ignored without an error.
+ *
+ * Returns the error from lprocfs_write_helper() if the input cannot be
+ * parsed, otherwise @count.
+ */
+static ssize_t
+lprocfs_fid_width_seq_write(struct file *file, const char *buffer,
+			    size_t count, loff_t *off)
+{
+	struct seq_file *m = file->private_data;
+	struct lu_client_seq *seq = m->private;
+	__u64  max;
+	int rc, val;
+	ENTRY;
+
+	LASSERT(seq != NULL);
+
+	rc = lprocfs_write_helper(buffer, count, &val);
+	if (rc)
+		RETURN(rc);
+
+	mutex_lock(&seq->lcs_mutex);
+	max = (seq->lcs_type == LUSTRE_SEQ_DATA) ?
+	      LUSTRE_DATA_SEQ_MAX_WIDTH : LUSTRE_METADATA_SEQ_MAX_WIDTH;
+
+	/* rc is known to be 0 here, so the new width is always logged */
+	if (val > 0 && val <= max) {
+		seq->lcs_width = val;
+
+		CDEBUG(D_INFO, "%s: Sequence size: "LPU64"\n",
+		       seq->lcs_name, seq->lcs_width);
+	}
+
+	mutex_unlock(&seq->lcs_mutex);
+
+	RETURN(count);
+}
+
+/*
+ * ->show() handler of the "width" entry: print the sequence width of the
+ * client under lcs_mutex.  Returns the result of seq_printf().
+ */
+static int
+lprocfs_fid_width_seq_show(struct seq_file *m, void *unused)
+{
+	struct lu_client_seq *seq = m->private;
+	int rc;
+	ENTRY;
+
+	LASSERT(seq != NULL);
+
+	mutex_lock(&seq->lcs_mutex);
+	rc = seq_printf(m, LPU64"\n", seq->lcs_width);
+	mutex_unlock(&seq->lcs_mutex);
+
+	RETURN(rc);
+}
+
+/*
+ * ->show() handler of the "fid" entry: print the cached (last allocated) FID
+ * of the client under lcs_mutex.  Returns the result of seq_printf().
+ */
+static int
+lprocfs_fid_fid_seq_show(struct seq_file *m, void *unused)
+{
+	struct lu_client_seq *seq = m->private;
+	int rc;
+	ENTRY;
+
+	LASSERT(seq != NULL);
+
+	mutex_lock(&seq->lcs_mutex);
+	rc = seq_printf(m, DFID"\n", PFID(&seq->lcs_fid));
+	mutex_unlock(&seq->lcs_mutex);
+
+	RETURN(rc);
+}
+
+/*
+ * ->show() handler of the "server" entry: print the target UUID of the
+ * export when the client has one, otherwise the name of the local sequence
+ * server.  lcs_mutex is not taken.  Returns the result of seq_printf().
+ */
+static int
+lprocfs_fid_server_seq_show(struct seq_file *m, void *unused)
+{
+	struct lu_client_seq *seq = m->private;
+	struct client_obd *cli;
+	int rc;
+	ENTRY;
+
+	LASSERT(seq != NULL);
+
+	if (seq->lcs_exp == NULL) {
+		rc = seq_printf(m, "%s\n", seq->lcs_srv->lss_name);
+		RETURN(rc);
+	}
+
+	cli = &seq->lcs_exp->exp_obd->u.cli;
+	rc = seq_printf(m, "%s\n", cli->cl_target_uuid.uuid);
+	RETURN(rc);
+}
+
+/*
+ * No server-side proc entries.  The list still carries the { NULL } sentinel
+ * used by seq_client_proc_list in this file, so that code walking it until
+ * the terminating entry does not run off a zero-length array.
+ */
+struct lprocfs_vars seq_server_proc_list[] = {
+	{ NULL }
+};
+
+/*
+ * NOTE(review): these presumably generate the lprocfs_fid_*_fops objects
+ * referenced by seq_client_proc_list from the *_seq_show (and, for the
+ * non-RO variant, *_seq_write) handlers defined in this file -- confirm
+ * against the LPROC_SEQ_FOPS / LPROC_SEQ_FOPS_RO definitions.
+ */
+LPROC_SEQ_FOPS(lprocfs_fid_space);
+LPROC_SEQ_FOPS(lprocfs_fid_width);
+LPROC_SEQ_FOPS_RO(lprocfs_fid_server);
+LPROC_SEQ_FOPS_RO(lprocfs_fid_fid);
+
+/*
+ * Proc entries of a client sequence manager, installed by
+ * seq_client_proc_init() through lprocfs_add_vars().  The list is terminated
+ * by a { NULL } entry.
+ */
+struct lprocfs_vars seq_client_proc_list[] = {
+	{ "space", &lprocfs_fid_space_fops },
+	{ "width", &lprocfs_fid_width_fops },
+	{ "server", &lprocfs_fid_server_fops },
+	{ "fid", &lprocfs_fid_fid_fops },
+	{ NULL }
+};
+#endif

diff --git a/drivers/staging/lustre/lustre/fld/Makefile b/drivers/staging/lustre/lustre/fld/Makefile
new file mode 100644
index 0000000..e7f2881
--- /dev/null
+++ b/drivers/staging/lustre/lustre/fld/Makefile

@@ -0,0 +1,5 @@
+obj-$(CONFIG_LUSTRE_FS) += fld.o
+fld-y := fld_handler.o fld_request.o fld_cache.o fld_index.o lproc_fld.o
+
+
+ccflags-y := -I$(src)/../include

diff --git a/drivers/staging/lustre/lustre/fld/fld_cache.c b/drivers/staging/lustre/lustre/fld/fld_cache.c
new file mode 100644
index 0000000..347f2ae
--- /dev/null
+++ b/drivers/staging/lustre/lustre/fld/fld_cache.c

@@ -0,0 +1,566 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, 2013, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/fld/fld_cache.c
+ *
+ * FLD (Fids Location Database)
+ *
+ * Author: Pravin Shelar <pravin.shelar@sun.com>
+ * Author: Yury Umanets <umka@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_FLD
+
+# include <linux/libcfs/libcfs.h>
+# include <linux/module.h>
+# include <linux/jbd.h>
+# include <asm/div64.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <lustre_ver.h>
+#include <obd_support.h>
+#include <lprocfs_status.h>
+
+#include <dt_object.h>
+#include <md_object.h>
+#include <lustre_req_layout.h>
+#include <lustre_fld.h>
+#include "fld_internal.h"
+
+/**
+ * Allocate and initialize an empty FLD cache called \a name.
+ *
+ * \a cache_size and \a cache_threshold are stored as the limits used by
+ * fld_cache_shrink(); \a cache_threshold must be smaller than \a cache_size.
+ *
+ * \retval the new cache, or ERR_PTR(-ENOMEM) if it cannot be allocated.
+ */
+struct fld_cache *fld_cache_init(const char *name,
+				 int cache_size, int cache_threshold)
+{
+	struct fld_cache *cache;
+	ENTRY;
+
+	LASSERT(name != NULL);
+	LASSERT(cache_threshold < cache_size);
+
+	OBD_ALLOC_PTR(cache);
+	if (!cache)
+		RETURN(ERR_PTR(-ENOMEM));
+
+	strlcpy(cache->fci_name, name,
+		sizeof(cache->fci_name));
+	rwlock_init(&cache->fci_lock);
+
+	/* both the range-ordered list and the LRU list start out empty */
+	INIT_LIST_HEAD(&cache->fci_lru);
+	INIT_LIST_HEAD(&cache->fci_entries_head);
+	cache->fci_cache_count = 0;
+
+	cache->fci_threshold = cache_threshold;
+	cache->fci_cache_size = cache_size;
+
+	/* Init fld cache info. */
+	memset(&cache->fci_stat, 0, sizeof(cache->fci_stat));
+
+	CDEBUG(D_INFO, "%s: FLD cache - Size: %d, Threshold: %d\n",
+	       cache->fci_name, cache_size, cache_threshold);
+
+	RETURN(cache);
+}
+
+/**
+ * Destroy the FLD cache: flush all entries, log the lookup statistics and
+ * free the cache structure.
+ */
+void fld_cache_fini(struct fld_cache *cache)
+{
+	__u64 pct = 0;
+	ENTRY;
+
+	LASSERT(cache != NULL);
+	fld_cache_flush(cache);
+
+	/* hit percentage = cached lookups * 100 / total lookups */
+	if (cache->fci_stat.fst_count > 0) {
+		pct = cache->fci_stat.fst_cache * 100;
+		do_div(pct, cache->fci_stat.fst_count);
+	}
+
+	CDEBUG(D_INFO, "FLD cache statistics (%s):\n", cache->fci_name);
+	CDEBUG(D_INFO, "  Total reqs: "LPU64"\n", cache->fci_stat.fst_count);
+	CDEBUG(D_INFO, "  Cache reqs: "LPU64"\n", cache->fci_stat.fst_cache);
+	CDEBUG(D_INFO, "  Cache hits: "LPU64"%%\n", pct);
+
+	OBD_FREE_PTR(cache);
+
+	EXIT;
+}
+
+/**
+ * Unlink \a node from both the LRU list and the range-ordered entry list of
+ * \a cache, decrement the entry count and free the node.
+ */
+void fld_cache_entry_delete(struct fld_cache *cache,
+			    struct fld_cache_entry *node)
+{
+	list_del(&node->fce_lru);
+	list_del(&node->fce_list);
+	cache->fci_cache_count--;
+	OBD_FREE_PTR(node);
+}
+
+/**
+ * Fix up the entry list by comparing every entry with the NEXT entry in
+ * list order: adjacent ranges with the same index are merged, overlapping
+ * ranges are merged or trimmed, and exact duplicates are dropped.  Entries
+ * whose flags differ are never combined.
+ *
+ * Whenever an overlap was resolved the scan restarts from the list head.
+ */
+static void fld_fix_new_list(struct fld_cache *cache)
+{
+	struct fld_cache_entry *f_curr;
+	struct fld_cache_entry *f_next;
+	struct lu_seq_range *c_range;
+	struct lu_seq_range *n_range;
+	struct list_head *head = &cache->fci_entries_head;
+	ENTRY;
+
+restart_fixup:
+
+	list_for_each_entry_safe(f_curr, f_next, head, fce_list) {
+		c_range = &f_curr->fce_range;
+		/* only an address is formed here; n_range is not dereferenced
+		 * until f_next is known to be a real entry (check below) */
+		n_range = &f_next->fce_range;
+
+		LASSERT(range_is_sane(c_range));
+		/* f_curr is the last entry: nothing left to compare with */
+		if (&f_next->fce_list == head)
+			break;
+
+		if (c_range->lsr_flags != n_range->lsr_flags)
+			continue;
+
+		LASSERTF(c_range->lsr_start <= n_range->lsr_start,
+			 "cur lsr_start "DRANGE" next lsr_start "DRANGE"\n",
+			 PRANGE(c_range), PRANGE(n_range));
+
+		/* check merge possibility with next range */
+		if (c_range->lsr_end == n_range->lsr_start) {
+			if (c_range->lsr_index != n_range->lsr_index)
+				continue;
+			/* same index and adjacent: grow next, drop current */
+			n_range->lsr_start = c_range->lsr_start;
+			fld_cache_entry_delete(cache, f_curr);
+			continue;
+		}
+
+		/* check if current range overlaps with next range. */
+		if (n_range->lsr_start < c_range->lsr_end) {
+			if (c_range->lsr_index == n_range->lsr_index) {
+				/* same index: next becomes the union */
+				n_range->lsr_start = c_range->lsr_start;
+				n_range->lsr_end = max(c_range->lsr_end,
+						       n_range->lsr_end);
+				fld_cache_entry_delete(cache, f_curr);
+			} else {
+				if (n_range->lsr_end <= c_range->lsr_end) {
+					/* next ends within current: current
+					 * replaces it */
+					*n_range = *c_range;
+					fld_cache_entry_delete(cache, f_curr);
+				} else
+					/* next sticks out: trim its start to
+					 * the end of current */
+					n_range->lsr_start = c_range->lsr_end;
+			}
+
+			/* we could have overlap over next
+			 * range too. better restart. */
+			goto restart_fixup;
+		}
+
+		/* kill duplicates */
+		if (c_range->lsr_start == n_range->lsr_start &&
+		    c_range->lsr_end == n_range->lsr_end)
+			fld_cache_entry_delete(cache, f_curr);
+	}
+
+	EXIT;
+}
+
+/**
+ * Link \a f_new into the cache: right after \a pos in the range-ordered
+ * entry list and at the front of the LRU list.  The entry count is bumped
+ * and the entry list is then fixed up for merges and overlaps.
+ */
+static inline void fld_cache_entry_add(struct fld_cache *cache,
+				       struct fld_cache_entry *f_new,
+				       struct list_head *pos)
+{
+	list_add(&f_new->fce_lru, &cache->fci_lru);
+	list_add(&f_new->fce_list, pos);
+	cache->fci_cache_count++;
+
+	fld_fix_new_list(cache);
+}
+
+/**
+ * Shrink the cache if it has reached its size limit.
+ *
+ * Nothing happens while the entry count is below fci_cache_size.  Otherwise
+ * entries are removed from the tail of the LRU list until the count plus
+ * fci_threshold no longer exceeds fci_cache_size, or the LRU list is empty.
+ *
+ * \retval 0 always.
+ */
+static int fld_cache_shrink(struct fld_cache *cache)
+{
+	struct fld_cache_entry *victim;
+	struct list_head *pos;
+	int removed = 0;
+	ENTRY;
+
+	LASSERT(cache != NULL);
+
+	if (cache->fci_cache_count < cache->fci_cache_size)
+		RETURN(0);
+
+	pos = cache->fci_lru.prev;
+	while (pos != &cache->fci_lru &&
+	       cache->fci_cache_count + cache->fci_threshold >
+	       cache->fci_cache_size) {
+		victim = list_entry(pos, struct fld_cache_entry, fce_lru);
+		/* step backwards before the victim is unlinked and freed */
+		pos = pos->prev;
+		fld_cache_entry_delete(cache, victim);
+		removed++;
+	}
+
+	CDEBUG(D_INFO, "%s: FLD cache - Shrunk by "
+	       "%d entries\n", cache->fci_name, removed);
+
+	RETURN(0);
+}
+
+/**
+ * Kill all fld cache entries.
+ *
+ * Done by setting the size limit to zero and shrinking, which evicts every
+ * entry on the LRU list.  Note that fci_cache_size is left at 0 afterwards.
+ */
+void fld_cache_flush(struct fld_cache *cache)
+{
+	ENTRY;
+
+	write_lock(&cache->fci_lock);
+	/* with a zero limit fld_cache_shrink() removes every entry */
+	cache->fci_cache_size = 0;
+	fld_cache_shrink(cache);
+	write_unlock(&cache->fci_lock);
+
+	EXIT;
+}
+
+/**
+ * Punch a hole in an existing range: \a f_curr is split around the range of
+ * \a f_new, giving three entries
+ *
+ *	f_curr [c_start, new_start), f_new, fldt [new_end, c_end)
+ *
+ * where fldt is a newly allocated entry that inherits index and flags from
+ * \a f_curr.  If fldt cannot be allocated, \a f_new is freed and the list is
+ * left untouched.
+ */
+
+void fld_cache_punch_hole(struct fld_cache *cache,
+			  struct fld_cache_entry *f_curr,
+			  struct fld_cache_entry *f_new)
+{
+	const struct lu_seq_range *range = &f_new->fce_range;
+	const seqno_t new_start  = range->lsr_start;
+	const seqno_t new_end  = range->lsr_end;
+	struct fld_cache_entry *fldt;
+
+	ENTRY;
+	OBD_ALLOC_GFP(fldt, sizeof *fldt, GFP_ATOMIC);
+	if (!fldt) {
+		OBD_FREE_PTR(f_new);
+		EXIT;
+		/* overlap is not allowed, so dont mess up list. */
+		return;
+	}
+	/*  break f_curr RANGE into three RANGES:
+	 *	f_curr, f_new , fldt
+	 */
+
+	/* f_new = *range */
+
+	/* fldt: the tail of the old f_curr range.  Copy the whole range
+	 * first so that lsr_flags, which the insert and fixup code compare,
+	 * is inherited from f_curr together with lsr_end and lsr_index. */
+	fldt->fce_range = f_curr->fce_range;
+	fldt->fce_range.lsr_start = new_end;
+
+	/* f_curr */
+	f_curr->fce_range.lsr_end = new_start;
+
+	/* add these two entries to list */
+	fld_cache_entry_add(cache, f_new, &f_curr->fce_list);
+	fld_cache_entry_add(cache, fldt, &f_new->fce_list);
+
+	/* no need to fixup */
+	EXIT;
+}
+
+/**
+ * Handle range overlap in fld cache: \a f_new overlaps the existing entry
+ * \a f_curr.
+ *
+ * Ownership of \a f_new is taken: it is either linked into the cache or
+ * freed.  With the same index the two ranges are merged into \a f_curr.
+ * With different indices the new range wins, and \a f_curr is replaced,
+ * split or trimmed depending on how the ranges overlap.
+ */
+static void fld_cache_overlap_handle(struct fld_cache *cache,
+				struct fld_cache_entry *f_curr,
+				struct fld_cache_entry *f_new)
+{
+	const struct lu_seq_range *range = &f_new->fce_range;
+	const seqno_t new_start  = range->lsr_start;
+	const seqno_t new_end  = range->lsr_end;
+	const mdsno_t mdt = range->lsr_index;
+
+	/* this is overlap case, these case are checking overlapping with
+	 * prev range only. fixup will handle overlaping with next range. */
+
+	if (f_curr->fce_range.lsr_index == mdt) {
+		f_curr->fce_range.lsr_start = min(f_curr->fce_range.lsr_start,
+						  new_start);
+
+		f_curr->fce_range.lsr_end = max(f_curr->fce_range.lsr_end,
+						new_end);
+
+		OBD_FREE_PTR(f_new);
+		fld_fix_new_list(cache);
+
+	} else if (new_start <= f_curr->fce_range.lsr_start &&
+			f_curr->fce_range.lsr_end <= new_end) {
+		/* case 1: new range completely overshadowed existing range.
+		 *	 e.g. whole range migrated. update fld cache entry */
+
+		f_curr->fce_range = *range;
+		OBD_FREE_PTR(f_new);
+		fld_fix_new_list(cache);
+
+	} else if (f_curr->fce_range.lsr_start < new_start &&
+			new_end < f_curr->fce_range.lsr_end) {
+		/* case 2: new range fit within existing range. */
+
+		fld_cache_punch_hole(cache, f_curr, f_new);
+
+	} else if (new_start <= f_curr->fce_range.lsr_start) {
+		/* case 3: overlap:
+		 *	 [new_start [c_start  new_end)  c_end)
+		 *
+		 * case 1 is excluded, so new_end < c_end holds here.
+		 */
+
+		LASSERT(new_end <= f_curr->fce_range.lsr_end);
+
+		f_curr->fce_range.lsr_start = new_end;
+		fld_cache_entry_add(cache, f_new, f_curr->fce_list.prev);
+
+	} else {
+		/* case 4: overlap:
+		 *	 [c_start [new_start c_end) new_end)
+		 *
+		 * c_start < new_start holds here and case 2 is excluded, so
+		 * c_end <= new_end.  This includes new_end == c_end (e.g.
+		 * [5, 10) inserted over [0, 10)), which must not be treated
+		 * as case 3.
+		 */
+
+		LASSERT(f_curr->fce_range.lsr_end <= new_end);
+
+		f_curr->fce_range.lsr_end = new_start;
+		fld_cache_entry_add(cache, f_new, &f_curr->fce_list);
+	}
+}
+
+/**
+ * Allocate a cache entry holding a copy of \a range, which must be sane.
+ *
+ * \retval the new entry (not yet linked into any cache), or
+ *	   ERR_PTR(-ENOMEM) if it cannot be allocated.
+ */
+struct fld_cache_entry
+*fld_cache_entry_create(const struct lu_seq_range *range)
+{
+	struct fld_cache_entry *entry;
+
+	LASSERT(range_is_sane(range));
+
+	OBD_ALLOC_PTR(entry);
+	if (entry == NULL)
+		RETURN(ERR_PTR(-ENOMEM));
+
+	entry->fce_range = *range;
+	RETURN(entry);
+}
+
+/**
+ * Insert FLD entry in FLD cache.
+ *
+ * This function handles all cases of merging and breaking up of
+ * ranges.
+ *
+ * No lock is taken here.  \a f_new is either linked into the cache or
+ * consumed by the overlap handling.
+ *
+ * \retval 0 always.
+ */
+int fld_cache_insert_nolock(struct fld_cache *cache,
+			    struct fld_cache_entry *f_new)
+{
+	struct fld_cache_entry *f_curr;
+	struct fld_cache_entry *n;
+	struct list_head *head;
+	struct list_head *prev = NULL;
+	const seqno_t new_start  = f_new->fce_range.lsr_start;
+	const seqno_t new_end  = f_new->fce_range.lsr_end;
+	__u32 new_flags  = f_new->fce_range.lsr_flags;
+	ENTRY;
+
+	/*
+	 * Duplicate entries are eliminated in insert op.
+	 * So we don't need to search new entry before starting
+	 * insertion loop.
+	 */
+
+	if (!cache->fci_no_shrink)
+		fld_cache_shrink(cache);
+
+	head = &cache->fci_entries_head;
+
+	list_for_each_entry_safe(f_curr, n, head, fce_list) {
+		/* stop at the first entry that lies entirely after the new
+		 * range (or touches its end with different flags); the new
+		 * entry goes in front of it */
+		if (new_end < f_curr->fce_range.lsr_start ||
+		   (new_end == f_curr->fce_range.lsr_start &&
+		    new_flags != f_curr->fce_range.lsr_flags))
+			break;
+
+		/* remember the last entry passed: insertion point */
+		prev = &f_curr->fce_list;
+		/* check if this range is to left of new range. */
+		if (new_start < f_curr->fce_range.lsr_end &&
+		    new_flags == f_curr->fce_range.lsr_flags) {
+			/* the overlap handler takes over f_new */
+			fld_cache_overlap_handle(cache, f_curr, f_new);
+			goto out;
+		}
+	}
+
+	/* nothing precedes the new range: insert at the list head */
+	if (prev == NULL)
+		prev = head;
+
+	CDEBUG(D_INFO, "insert range "DRANGE"\n", PRANGE(&f_new->fce_range));
+	/* Add new entry to cache and lru list. */
+	fld_cache_entry_add(cache, f_new, prev);
+out:
+	RETURN(0);
+}
+
+/**
+ * Create a cache entry for \a range and insert it into \a cache under the
+ * cache write lock.
+ *
+ * \retval 0 on success, the allocation error from fld_cache_entry_create(),
+ *	   or the error from fld_cache_insert_nolock() (the entry is freed
+ *	   in that case).
+ */
+int fld_cache_insert(struct fld_cache *cache,
+		     const struct lu_seq_range *range)
+{
+	struct fld_cache_entry *entry = fld_cache_entry_create(range);
+	int rc;
+
+	if (IS_ERR(entry))
+		RETURN(PTR_ERR(entry));
+
+	write_lock(&cache->fci_lock);
+	rc = fld_cache_insert_nolock(cache, entry);
+	write_unlock(&cache->fci_lock);
+
+	if (rc != 0)
+		OBD_FREE_PTR(entry);
+
+	RETURN(rc);
+}
+
+/**
+ * Delete the first cache entry matching \a range; no lock is taken here.
+ *
+ * An entry matches when its start equals the start of \a range, or when both
+ * its end and its flags equal those of \a range.
+ */
+void fld_cache_delete_nolock(struct fld_cache *cache,
+		      const struct lu_seq_range *range)
+{
+	struct fld_cache_entry *flde;
+	struct fld_cache_entry *tmp;
+	struct list_head *head;
+
+	head = &cache->fci_entries_head;
+	list_for_each_entry_safe(flde, tmp, head, fce_list) {
+		/* NOTE(review): a matching start alone is enough here (the
+		 * flags are only compared together with the end) -- confirm
+		 * that "||" rather than "&&" is intended. */
+		if (range->lsr_start == flde->fce_range.lsr_start ||
+		   (range->lsr_end == flde->fce_range.lsr_end &&
+		    range->lsr_flags == flde->fce_range.lsr_flags)) {
+			fld_cache_entry_delete(cache, flde);
+			break;
+		}
+	}
+}
+
+/**
+ * Delete FLD entry in FLD cache.
+ *
+ * Locked wrapper: takes the cache write lock around
+ * fld_cache_delete_nolock().
+ */
+void fld_cache_delete(struct fld_cache *cache,
+		      const struct lu_seq_range *range)
+{
+	write_lock(&cache->fci_lock);
+	fld_cache_delete_nolock(cache, range);
+	write_unlock(&cache->fci_lock);
+}
+
+/**
+ * Find the first cache entry matching \a range; no lock is taken here.
+ *
+ * An entry matches when its start equals the start of \a range, or when both
+ * its end and its flags equal those of \a range.
+ *
+ * \retval the matching entry, or NULL if there is none.
+ */
+struct fld_cache_entry
+*fld_cache_entry_lookup_nolock(struct fld_cache *cache,
+			      struct lu_seq_range *range)
+{
+	struct fld_cache_entry *flde;
+	struct fld_cache_entry *got = NULL;
+	const struct lu_seq_range *cur;
+
+	list_for_each_entry(flde, &cache->fci_entries_head, fce_list) {
+		cur = &flde->fce_range;
+
+		if (range->lsr_start != cur->lsr_start &&
+		    (range->lsr_end != cur->lsr_end ||
+		     range->lsr_flags != cur->lsr_flags))
+			continue;
+
+		got = flde;
+		break;
+	}
+
+	RETURN(got);
+}
+
+/**
+ * Look up the cache entry matching \a range under the cache read lock.
+ *
+ * \retval the matching entry, or NULL if there is none.
+ *
+ * NOTE(review): the entry pointer is returned after the read lock has been
+ * dropped; presumably callers must make sure the entry cannot be deleted
+ * concurrently -- confirm against the callers.
+ */
+struct fld_cache_entry
+*fld_cache_entry_lookup(struct fld_cache *cache, struct lu_seq_range *range)
+{
+	struct fld_cache_entry *got = NULL;
+	ENTRY;
+
+	read_lock(&cache->fci_lock);
+	got = fld_cache_entry_lookup_nolock(cache, range);
+	read_unlock(&cache->fci_lock);
+	RETURN(got);
+}
+
+/**
+ * Look up the range containing sequence \a seq in the fld cache.
+ *
+ * On a hit the range is copied to \a range and the hit counter is bumped.
+ * On a miss \a range may still be overwritten: when the scan stops at an
+ * entry starting after \a seq, the preceding entry (if any) is copied out.
+ * The total lookup counter is bumped on every call.
+ *
+ * \retval 0 on a hit, -ENOENT otherwise.
+ */
+int fld_cache_lookup(struct fld_cache *cache,
+		     const seqno_t seq, struct lu_seq_range *range)
+{
+	struct fld_cache_entry *flde;
+	struct fld_cache_entry *prev = NULL;
+	struct list_head *head = &cache->fci_entries_head;
+	int rc = -ENOENT;
+	ENTRY;
+
+	read_lock(&cache->fci_lock);
+
+	cache->fci_stat.fst_count++;
+	list_for_each_entry(flde, head, fce_list) {
+		/* entries are ordered by start: no later entry can match */
+		if (seq < flde->fce_range.lsr_start) {
+			if (prev != NULL)
+				*range = prev->fce_range;
+			break;
+		}
+
+		if (range_within(&flde->fce_range, seq)) {
+			*range = flde->fce_range;
+			cache->fci_stat.fst_cache++;
+			rc = 0;
+			break;
+		}
+
+		prev = flde;
+	}
+
+	read_unlock(&cache->fci_lock);
+	RETURN(rc);
+}

diff --git a/drivers/staging/lustre/lustre/fld/fld_handler.c b/drivers/staging/lustre/lustre/fld/fld_handler.c
new file mode 100644
index 0000000..d2707ae
--- /dev/null
+++ b/drivers/staging/lustre/lustre/fld/fld_handler.c

@@ -0,0 +1,447 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2013, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/fld/fld_handler.c
+ *
+ * FLD (Fids Location Database)
+ *
+ * Author: Yury Umanets <umka@clusterfs.com>
+ * Author: WangDi <wangdi@clusterfs.com>
+ * Author: Pravin Shelar <pravin.shelar@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_FLD
+
+# include <linux/libcfs/libcfs.h>
+# include <linux/module.h>
+# include <linux/jbd.h>
+# include <asm/div64.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <lustre_ver.h>
+#include <obd_support.h>
+#include <lprocfs_status.h>
+
+#include <md_object.h>
+#include <lustre_fid.h>
+#include <lustre_req_layout.h>
+#include "fld_internal.h"
+#include <lustre_fid.h>
+
+
+/* context key constructor/destructor: fld_key_init, fld_key_fini */
+LU_KEY_INIT_FINI(fld, struct fld_thread_info);
+
+/* context key: fld_thread_key */
+LU_CONTEXT_KEY_DEFINE(fld, LCT_MD_THREAD | LCT_DT_THREAD | LCT_MG_THREAD);
+
+proc_dir_entry_t *fld_type_proc_dir = NULL;
+
+/**
+ * Module init: register the FLD proc directory and the per-thread context
+ * key.
+ *
+ * \retval 0    success
+ * \retval -ve  error from lprocfs_register() or lu_context_key_register();
+ *              the proc directory is removed again in the latter case
+ */
+static int __init fld_mod_init(void)
+{
+	int rc;
+
+	fld_type_proc_dir = lprocfs_register(LUSTRE_FLD_NAME,
+					     proc_lustre_root,
+					     NULL, NULL);
+	if (IS_ERR(fld_type_proc_dir))
+		return PTR_ERR(fld_type_proc_dir);
+
+	LU_CONTEXT_KEY_INIT(&fld_thread_key);
+	rc = lu_context_key_register(&fld_thread_key);
+	if (rc != 0 && fld_type_proc_dir != NULL) {
+		/* The module exit handler does not run after a failed init,
+		 * so undo the proc registration here. */
+		lprocfs_remove(&fld_type_proc_dir);
+		fld_type_proc_dir = NULL;
+	}
+	return rc;
+}
+
+/**
+ * Module exit: drop the context key, then remove the FLD proc directory if
+ * one was successfully registered.
+ */
+static void __exit fld_mod_exit(void)
+{
+	lu_context_key_degister(&fld_thread_key);
+
+	/* Nothing to remove when registration never produced a directory. */
+	if (fld_type_proc_dir == NULL || IS_ERR(fld_type_proc_dir))
+		return;
+
+	lprocfs_remove(&fld_type_proc_dir);
+	fld_type_proc_dir = NULL;
+}
+
+/**
+ * Declare the FLD index update for \a range in transaction \a th.
+ *
+ * Thin exported wrapper around fld_declare_index_create().
+ *
+ * \retval result of fld_declare_index_create()
+ */
+int fld_declare_server_create(const struct lu_env *env,
+			      struct lu_server_fld *fld,
+			      struct lu_seq_range *range,
+			      struct thandle *th)
+{
+	int rc;
+	ENTRY;	/* keeps the RETURN() below paired, as elsewhere in this file */
+
+	rc = fld_declare_index_create(env, fld, range, th);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(fld_declare_server_create);
+
+/**
+ * Insert FLD index entry and update FLD cache.
+ *
+ * This function is called from the sequence allocator when a super-sequence
+ * is granted to a server.  fld->lsf_lock is held across the call to
+ * fld_index_create(), which asserts that the mutex is locked.
+ *
+ * \retval result of fld_index_create()
+ */
+int fld_server_create(const struct lu_env *env, struct lu_server_fld *fld,
+		      struct lu_seq_range *range, struct thandle *th)
+{
+	int rc;
+	ENTRY;	/* keeps the RETURN() below paired, as elsewhere in this file */
+
+	mutex_lock(&fld->lsf_lock);
+	rc = fld_index_create(env, fld, range, th);
+	mutex_unlock(&fld->lsf_lock);
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(fld_server_create);
+
+/**
+ *  Lookup mds by seq, returns a range for given seq.
+ *
+ *  If that entry is not cached in fld cache, request is sent to super
+ *  sequence controller node (MDT0). All other MDT[1...N] and client
+ *  cache fld entries, but this cache is not persistent.
+ *
+ *  \param seq    sequence to resolve
+ *  \param range  in: its range type is compared with the cached entry
+ *                (unless it is "any"); out: the range found
+ *
+ *  \retval 0     \a range holds the range containing \a seq
+ *  \retval -EIO  cached range type does not match the requested one, or
+ *                the sequence is missing on a node that has the index
+ *  \retval -ve   error returned by fld_client_rpc()
+ */
+int fld_server_lookup(const struct lu_env *env, struct lu_server_fld *fld,
+		      seqno_t seq, struct lu_seq_range *range)
+{
+	struct lu_seq_range *erange;
+	struct fld_thread_info *info;
+	int rc;
+	ENTRY;
+
+	info = lu_context_key_get(&env->le_ctx, &fld_thread_key);
+	LASSERT(info != NULL);
+	erange = &info->fti_lrange;
+
+	/* Lookup it in the cache. */
+	rc = fld_cache_lookup(fld->lsf_cache, seq, erange);
+	if (rc == 0) {
+		if (unlikely(fld_range_type(erange) != fld_range_type(range) &&
+			     !fld_range_is_any(range))) {
+			/* Note the trailing space: the two literals are
+			 * concatenated by the compiler. */
+			CERROR("%s: FLD cache range "DRANGE" does not match "
+			       "requested flag %x: rc = %d\n", fld->lsf_name,
+			       PRANGE(erange), range->lsr_flags, -EIO);
+			RETURN(-EIO);
+		}
+		*range = *erange;
+		RETURN(0);
+	}
+
+	if (fld->lsf_obj) {
+		/* On server side, all entries should be in cache.
+		 * If we can not find it in cache, just return error */
+		CERROR("%s: Cannot find sequence "LPX64": rc = %d\n",
+			fld->lsf_name, seq, -EIO);
+		RETURN(-EIO);
+	} else {
+		LASSERT(fld->lsf_control_exp);
+		/* send request to mdt0 i.e. super seq. controller.
+		 * This is temporary solution, long term solution is fld
+		 * replication on all mdt servers.
+		 */
+		range->lsr_start = seq;
+		rc = fld_client_rpc(fld->lsf_control_exp,
+				    range, FLD_LOOKUP);
+		/* The result of the cache insert is not checked; the lookup
+		 * result is returned regardless. */
+		if (rc == 0)
+			fld_cache_insert(fld->lsf_cache, range);
+	}
+	RETURN(rc);
+}
+EXPORT_SYMBOL(fld_server_lookup);
+
+/**
+ * All MDT server handle fld lookup operation. But only MDT0 has fld index.
+ * if entry is not found in cache we need to forward lookup request to MDT0
+ *
+ * \param opc    operation code; only FLD_LOOKUP is handled
+ * \param range  in: lsr_start is the sequence to look up; out: lookup result
+ * \param info   not used by this function
+ *
+ * \retval result of fld_server_lookup() for FLD_LOOKUP
+ * \retval -EINVAL for any other \a opc
+ */
+static int fld_server_handle(struct lu_server_fld *fld,
+			     const struct lu_env *env,
+			     __u32 opc, struct lu_seq_range *range,
+			     struct fld_thread_info *info)
+{
+	int rc;
+	ENTRY;
+
+	switch (opc) {
+	case FLD_LOOKUP:
+		rc = fld_server_lookup(env, fld, range->lsr_start, range);
+		break;
+	default:
+		rc = -EINVAL;
+		break;
+	}
+
+	/* opc is unsigned, and the opening parenthesis is now closed. */
+	CDEBUG(D_INFO, "%s: FLD req handle: error %d (opc: %u, range: "
+	       DRANGE")\n", fld->lsf_name, rc, opc, PRANGE(range));
+
+	RETURN(rc);
+}
+
+/**
+ * Decode one FLD query from \a req, run it and leave the resulting range in
+ * the server-side RMF_FLD_MDFLD buffer.
+ *
+ * \retval result of fld_server_handle() once the request is decoded
+ * \retval err_serious(...) when the reply cannot be packed or a request
+ *         field is missing
+ */
+static int fld_req_handle(struct ptlrpc_request *req,
+			  struct fld_thread_info *info)
+{
+	struct obd_export *exp = req->rq_export;
+	struct lu_site *site = exp->exp_obd->obd_lu_dev->ld_site;
+	struct lu_seq_range *req_range;
+	struct lu_seq_range *rep_range;
+	__u32 *opc;
+	int rc;
+	ENTRY;
+
+	rc = req_capsule_server_pack(info->fti_pill);
+	if (rc)
+		RETURN(err_serious(rc));
+
+	opc = req_capsule_client_get(info->fti_pill, &RMF_FLD_OPC);
+	if (opc == NULL)
+		RETURN(err_serious(-EPROTO));
+
+	req_range = req_capsule_client_get(info->fti_pill, &RMF_FLD_MDFLD);
+	if (req_range == NULL)
+		RETURN(err_serious(-EPROTO));
+
+	rep_range = req_capsule_server_get(info->fti_pill, &RMF_FLD_MDFLD);
+	if (rep_range == NULL)
+		RETURN(err_serious(-EPROTO));
+
+	/* The reply starts as a copy of the request and is updated in place
+	 * by the handler. */
+	*rep_range = *req_range;
+
+	/* For old 2.0 client, the 'lsr_flags' is uninitialized.
+	 * Set it as 'LU_SEQ_RANGE_MDT' by default. */
+	if ((exp_connect_flags(exp) & (OBD_CONNECT_64BITHASH |
+				       OBD_CONNECT_MDS_MDS |
+				       OBD_CONNECT_LIGHTWEIGHT)) == 0 &&
+	    !exp->exp_libclient)
+		fld_range_set_mdt(rep_range);
+
+	rc = fld_server_handle(lu_site2seq(site)->ss_server_fld,
+			       req->rq_svc_thread->t_env,
+			       *opc, rep_range, info);
+	RETURN(rc);
+}
+
+/**
+ * Point \a info at the request's capsule and set the capsule up as a
+ * server-side RQF_FLD_QUERY capsule.
+ */
+static void fld_thread_info_init(struct ptlrpc_request *req,
+				 struct fld_thread_info *info)
+{
+	struct req_capsule *pill = &req->rq_pill;
+
+	info->fti_pill = pill;
+	/* Init request capsule. */
+	req_capsule_init(pill, req, RCL_SERVER);
+	req_capsule_set(pill, &RQF_FLD_QUERY);
+}
+
+/**
+ * Finalize the request capsule that fld_thread_info_init() attached to
+ * \a info.
+ */
+static void fld_thread_info_fini(struct fld_thread_info *info)
+{
+	struct req_capsule *pill = info->fti_pill;
+
+	req_capsule_fini(pill);
+}
+
+/**
+ * Handle one FLD request using the per-thread fld_thread_info of the
+ * service thread that owns \a req.
+ *
+ * \retval result of fld_req_handle()
+ */
+static int fld_handle(struct ptlrpc_request *req)
+{
+	const struct lu_env *env = req->rq_svc_thread->t_env;
+	struct fld_thread_info *info;
+	int rc;
+
+	LASSERT(env != NULL);
+
+	info = lu_context_key_get(&env->le_ctx, &fld_thread_key);
+	LASSERT(info != NULL);
+
+	/* init / handle / fini always run as a triple, whatever rc is. */
+	fld_thread_info_init(req, info);
+	rc = fld_req_handle(req, info);
+	fld_thread_info_fini(info);
+
+	return rc;
+}
+
+/*
+ * Entry point for handling FLD RPCs called from MDT: forwards the request
+ * attached to the caller's capsule to fld_handle().
+ */
+int fld_query(struct com_thread_info *info)
+{
+	struct ptlrpc_request *req = info->cti_pill->rc_req;
+
+	return fld_handle(req);
+}
+EXPORT_SYMBOL(fld_query);
+
+/*
+ * Returns true, if fid is local to this server node.
+ *
+ * WARNING: this function is *not* guaranteed to return false if fid is
+ * remote: it makes an educated conservative guess only.
+ *
+ * fid_is_local() is supposed to be used in assertion checks only.
+ *
+ * Only the client FLD cache is consulted.  When the site has no client FLD,
+ * or the sequence is not cached, the answer is 1.
+ */
+int fid_is_local(const struct lu_env *env,
+		 struct lu_site *site, const struct lu_fid *fid)
+{
+	int result;
+	struct seq_server_site *ss_site;
+	struct lu_seq_range *range;
+	struct fld_thread_info *info;
+	ENTRY;
+
+	info = lu_context_key_get(&env->le_ctx, &fld_thread_key);
+	range = &info->fti_lrange;
+
+	result = 1; /* conservatively assume fid is local */
+	ss_site = lu_site2seq(site);
+	if (ss_site->ss_client_fld != NULL) {
+		int rc;
+
+		rc = fld_cache_lookup(ss_site->ss_client_fld->lcf_cache,
+				      fid_seq(fid), range);
+		if (rc == 0)
+			result = (range->lsr_index == ss_site->ss_node_id);
+	}
+	/* RETURN() rather than a bare return, to pair with ENTRY above. */
+	RETURN(result);
+}
+EXPORT_SYMBOL(fid_is_local);
+
+static void fld_server_proc_fini(struct lu_server_fld *fld);
+
+#ifdef LPROCFS
+/**
+ * Register the per-server proc directory under fld_type_proc_dir and create
+ * the "fldb" seq file inside it.
+ *
+ * \retval 0    success
+ * \retval -ve  registration failed (fld->lsf_proc_dir is left holding the
+ *              error pointer), or the seq file could not be created (the
+ *              directory is removed and fld->lsf_proc_dir reset to NULL)
+ */
+static int fld_server_proc_init(struct lu_server_fld *fld)
+{
+	int rc;
+	ENTRY;
+
+	fld->lsf_proc_dir = lprocfs_register(fld->lsf_name,
+					     fld_type_proc_dir,
+					     fld_server_proc_list, fld);
+	if (IS_ERR(fld->lsf_proc_dir)) {
+		rc = PTR_ERR(fld->lsf_proc_dir);
+		RETURN(rc);
+	}
+
+	rc = lprocfs_seq_create(fld->lsf_proc_dir, "fldb", 0444,
+				&fld_proc_seq_fops, fld);
+	if (rc == 0)
+		RETURN(rc);
+
+	/* Could not add the seq file: take the directory down again. */
+	lprocfs_remove(&fld->lsf_proc_dir);
+	fld->lsf_proc_dir = NULL;
+	RETURN(rc);
+}
+
+/**
+ * Remove the per-server proc directory, if one is registered, and reset
+ * fld->lsf_proc_dir to NULL.  An error pointer left behind by a failed
+ * registration is only cleared, never passed to lprocfs_remove().
+ */
+static void fld_server_proc_fini(struct lu_server_fld *fld)
+{
+	ENTRY;
+	if (fld->lsf_proc_dir != NULL && !IS_ERR(fld->lsf_proc_dir))
+		lprocfs_remove(&fld->lsf_proc_dir);
+	fld->lsf_proc_dir = NULL;
+	EXIT;
+}
+#else
+/* Built without LPROCFS: there is nothing to register, always succeeds. */
+static int fld_server_proc_init(struct lu_server_fld *fld)
+{
+	return 0;
+}
+
+/* Built without LPROCFS: nothing was registered, so nothing to undo. */
+static void fld_server_proc_fini(struct lu_server_fld *fld)
+{
+}
+#endif
+
+/**
+ * Set up a server-side FLD instance: name, lock, cache, optional on-disk
+ * index and proc entries.
+ *
+ * \param dt           device handed to fld_index_init()
+ * \param prefix       used to build the instance name "srv-<prefix>"
+ * \param mds_node_id  the index is only initialized when this is 0 ...
+ * \param type         ... and this is LU_SEQ_RANGE_MDT
+ *
+ * On any failure fld_server_fini() is called to undo what was set up.
+ *
+ * \retval 0    success
+ * \retval -ve  error from cache, index or proc initialization
+ */
+int fld_server_init(const struct lu_env *env, struct lu_server_fld *fld,
+		    struct dt_device *dt, const char *prefix, int mds_node_id,
+		    int type)
+{
+	int cache_size, cache_threshold;
+	int rc;
+	ENTRY;
+
+	/* fld_server_fini() runs on every error path below and inspects
+	 * these pointers; give them a defined value before anything can
+	 * fail, in case the caller did not hand in a zeroed structure. */
+	fld->lsf_proc_dir = NULL;
+	fld->lsf_obj = NULL;
+	fld->lsf_control_exp = NULL;
+
+	snprintf(fld->lsf_name, sizeof(fld->lsf_name),
+		 "srv-%s", prefix);
+
+	cache_size = FLD_SERVER_CACHE_SIZE /
+		sizeof(struct fld_cache_entry);
+
+	cache_threshold = cache_size *
+		FLD_SERVER_CACHE_THRESHOLD / 100;
+
+	mutex_init(&fld->lsf_lock);
+	fld->lsf_cache = fld_cache_init(fld->lsf_name,
+					cache_size, cache_threshold);
+	if (IS_ERR(fld->lsf_cache)) {
+		rc = PTR_ERR(fld->lsf_cache);
+		fld->lsf_cache = NULL;
+		GOTO(out, rc);
+	}
+
+	/* Otherwise lsf_obj stays NULL, as set above. */
+	if (!mds_node_id && type == LU_SEQ_RANGE_MDT) {
+		rc = fld_index_init(env, fld, dt);
+		if (rc)
+			GOTO(out, rc);
+	}
+
+	rc = fld_server_proc_init(fld);
+	if (rc)
+		GOTO(out, rc);
+
+	GOTO(out, rc);
+
+out:
+	if (rc)
+		fld_server_fini(env, fld);
+	return rc;
+}
+EXPORT_SYMBOL(fld_server_init);
+
+/**
+ * Tear down a server-side FLD instance: proc entries first, then the index
+ * object, then the cache.  fld->lsf_cache is NULL afterwards.
+ */
+void fld_server_fini(const struct lu_env *env, struct lu_server_fld *fld)
+{
+	ENTRY;
+
+	fld_server_proc_fini(fld);
+	fld_index_fini(env, fld);
+
+	/* Only a real cache pointer is finalized; NULL or an error pointer
+	 * is simply cleared. */
+	if (fld->lsf_cache != NULL && !IS_ERR(fld->lsf_cache))
+		fld_cache_fini(fld->lsf_cache);
+	fld->lsf_cache = NULL;
+
+	EXIT;
+}
+EXPORT_SYMBOL(fld_server_fini);
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Lustre FLD");
+MODULE_LICENSE("GPL");
+
+/* NOTE(review): the first argument is "mdd" although this is the FLD
+ * module -- looks like a copy/paste leftover; confirm whether cfs_module()
+ * uses the name before changing it. */
+cfs_module(mdd, "0.1.0", fld_mod_init, fld_mod_exit);

diff --git a/drivers/staging/lustre/lustre/fld/fld_index.c b/drivers/staging/lustre/lustre/fld/fld_index.c
new file mode 100644
index 0000000..ec68a54
--- /dev/null
+++ b/drivers/staging/lustre/lustre/fld/fld_index.c

@@ -0,0 +1,426 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2013, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/fld/fld_index.c
+ *
+ * Author: WangDi <wangdi@clusterfs.com>
+ * Author: Yury Umanets <umka@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_FLD
+
+# include <linux/libcfs/libcfs.h>
+# include <linux/module.h>
+# include <linux/jbd.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <lustre_ver.h>
+#include <obd_support.h>
+#include <lprocfs_status.h>
+
+#include <dt_object.h>
+#include <md_object.h>
+#include <lustre_mdc.h>
+#include <lustre_fid.h>
+#include <lustre_fld.h>
+#include "fld_internal.h"
+
+/* Name of the FLD index; fld_index_init() uses it in its error messages. */
+const char fld_index_name[] = "fld";
+
+/*
+ * Special range for IGIF sequences, FID_SEQ_IGIF up to FID_SEQ_IGIF_MAX,
+ * located on index 0 and flagged as an MDT range.  Inserted by
+ * fld_insert_special_entries().
+ * NOTE(review): lsr_end is one past the last sequence, which presumably
+ * means the end is exclusive -- confirm against range_within().
+ */
+static const struct lu_seq_range IGIF_FLD_RANGE = {
+	.lsr_start = FID_SEQ_IGIF,
+	.lsr_end   = FID_SEQ_IGIF_MAX + 1,
+	.lsr_index = 0,
+	.lsr_flags = LU_SEQ_RANGE_MDT
+};
+
+/*
+ * Special single-sequence range for FID_SEQ_DOT_LUSTRE, located on index 0
+ * and flagged as an MDT range.  Inserted by fld_insert_special_entries().
+ */
+static const struct lu_seq_range DOT_LUSTRE_FLD_RANGE = {
+	.lsr_start = FID_SEQ_DOT_LUSTRE,
+	.lsr_end   = FID_SEQ_DOT_LUSTRE + 1,
+	.lsr_index = 0,
+	.lsr_flags = LU_SEQ_RANGE_MDT
+};
+
+/*
+ * Special single-sequence range for FID_SEQ_ROOT, located on index 0 and
+ * flagged as an MDT range.  Inserted by fld_insert_special_entries().
+ */
+static const struct lu_seq_range ROOT_FLD_RANGE = {
+	.lsr_start = FID_SEQ_ROOT,
+	.lsr_end   = FID_SEQ_ROOT + 1,
+	.lsr_index = 0,
+	.lsr_flags = LU_SEQ_RANGE_MDT
+};
+
+/*
+ * Layout of the FLD index as passed to dt_find_or_create() and
+ * do_index_try(): fixed-size keys of sizeof(seqno_t) and fixed-size records
+ * of sizeof(struct lu_seq_range).
+ * NOTE(review): the meaning of dif_ptrsize = 4 is not visible here --
+ * confirm against the dt_index_features definition.
+ */
+const struct dt_index_features fld_index_features = {
+	.dif_flags       = DT_IND_UPDATE,
+	.dif_keysize_min = sizeof(seqno_t),
+	.dif_keysize_max = sizeof(seqno_t),
+	.dif_recsize_min = sizeof(struct lu_seq_range),
+	.dif_recsize_max = sizeof(struct lu_seq_range),
+	.dif_ptrsize     = 4
+};
+
+extern struct lu_context_key fld_thread_key;
+
+/**
+ * Declare, in transaction \a th, the index updates that inserting
+ * \a new_range requires: an insert, preceded by a delete when the new range
+ * is merged with its left neighbour.
+ *
+ * Uses the per-thread scratch ranges fti_lrange (lookup result) and
+ * fti_irange (big-endian record that is declared).
+ *
+ * \retval 0        declarations succeeded
+ * \retval -EEXIST  a range containing new_range->lsr_start already exists
+ * \retval -ve      other lookup or declaration error
+ */
+int fld_declare_index_create(const struct lu_env *env,
+			     struct lu_server_fld *fld,
+			     const struct lu_seq_range *new_range,
+			     struct thandle *th)
+{
+	struct lu_seq_range	*tmp;
+	struct lu_seq_range	*range;
+	struct fld_thread_info	*info;
+	int			rc = 0;
+
+	ENTRY;
+
+	info = lu_context_key_get(&env->le_ctx, &fld_thread_key);
+	range = &info->fti_lrange;
+	tmp = &info->fti_irange;
+	/* Start from a zeroed range so a lookup that writes nothing leaves
+	 * lsr_end == 0, which the merge check below rejects. */
+	memset(range, 0, sizeof(*range));
+
+	rc = fld_index_lookup(env, fld, new_range->lsr_start, range);
+	if (rc == 0) {
+		/* In case of duplicate entry, the location must be same */
+		LASSERT((range_compare_loc(new_range, range) == 0));
+		GOTO(out, rc = -EEXIST);
+	}
+
+	if (rc != -ENOENT) {
+		CERROR("%s: lookup range "DRANGE" error: rc = %d\n",
+			fld->lsf_name, PRANGE(range), rc);
+		GOTO(out, rc);
+	}
+
+	/* Check for merge case, since the fld entry can only be incremental,
+	 * so we will only check whether it can be merged from the left.
+	 * NOTE(review): fld_index_lookup() only fills \a range when it
+	 * returns 0, so after the memset above this condition looks
+	 * unreachable on the -ENOENT path -- confirm. */
+	if (new_range->lsr_start == range->lsr_end && range->lsr_end != 0 &&
+	    range_compare_loc(new_range, range) == 0) {
+		/* The key is the big-endian lsr_start of the old record. */
+		range_cpu_to_be(tmp, range);
+		rc = dt_declare_delete(env, fld->lsf_obj,
+				       (struct dt_key *)&tmp->lsr_start, th);
+		if (rc) {
+			CERROR("%s: declare record "DRANGE" failed: rc = %d\n",
+			       fld->lsf_name, PRANGE(range), rc);
+			GOTO(out, rc);
+		}
+		/* Merged record: new range extended to the old start. */
+		memcpy(tmp, new_range, sizeof(*new_range));
+		tmp->lsr_start = range->lsr_start;
+	} else {
+		memcpy(tmp, new_range, sizeof(*new_range));
+	}
+
+	/* Convert in place; key and record both point into tmp. */
+	range_cpu_to_be(tmp, tmp);
+	rc = dt_declare_insert(env, fld->lsf_obj, (struct dt_rec *)tmp,
+			       (struct dt_key *)&tmp->lsr_start, th);
+out:
+	RETURN(rc);
+}
+
+/**
+ * insert range in fld store.
+ *
+ *      \param  new_range  range to be inserted
+ *      \param  th         transaction for this operation as it could be a
+ *		         compound transaction.
+ *
+ *      \retval  0        success
+ *      \retval  -EEXIST  a range containing new_range->lsr_start exists
+ *      \retval  -ve      other error
+ *
+ * The caller must hold fld->lsf_lock (asserted below).
+ *
+ * The whole fld index insertion is protected by seq->lss_mutex (see
+ * seq_server_alloc_super), i.e. only one thread will access fldb each
+ * time, so we do not need to worry about the fld file and cache being
+ * changed between declare and create.
+ * Because the fld entry can only be incremental, we will only check
+ * whether it can be merged from the left.
+ **/
+int fld_index_create(const struct lu_env *env, struct lu_server_fld *fld,
+		     const struct lu_seq_range *new_range, struct thandle *th)
+{
+	struct lu_seq_range	*range;
+	struct lu_seq_range	*tmp;
+	struct fld_thread_info	*info;
+	int			rc = 0;
+	int			deleted = 0;
+	struct fld_cache_entry	*flde;
+	ENTRY;
+
+	info = lu_context_key_get(&env->le_ctx, &fld_thread_key);
+
+	LASSERT(mutex_is_locked(&fld->lsf_lock));
+
+	/* fti_lrange receives the lookup result, fti_irange is the
+	 * big-endian record written to the index. */
+	range = &info->fti_lrange;
+	memset(range, 0, sizeof(*range));
+	tmp = &info->fti_irange;
+	rc = fld_index_lookup(env, fld, new_range->lsr_start, range);
+	if (rc != -ENOENT) {
+		/* Found (0) becomes -EEXIST; other errors pass through. */
+		rc = rc == 0 ? -EEXIST : rc;
+		GOTO(out, rc);
+	}
+
+	/* Merge from the left when the looked-up range ends exactly where
+	 * the new one starts and both have the same location.
+	 * NOTE(review): fld_index_lookup() only fills \a range when it
+	 * returns 0, so after the memset above this condition looks
+	 * unreachable on the -ENOENT path -- confirm. */
+	if (new_range->lsr_start == range->lsr_end && range->lsr_end != 0 &&
+	    range_compare_loc(new_range, range) == 0) {
+		range_cpu_to_be(tmp, range);
+		rc = dt_delete(env, fld->lsf_obj,
+			       (struct dt_key *)&tmp->lsr_start, th,
+				BYPASS_CAPA);
+		if (rc != 0)
+			GOTO(out, rc);
+		memcpy(tmp, new_range, sizeof(*new_range));
+		tmp->lsr_start = range->lsr_start;
+		deleted = 1;
+	} else {
+		memcpy(tmp, new_range, sizeof(*new_range));
+	}
+
+	/* Convert in place; key and record both point into tmp. */
+	range_cpu_to_be(tmp, tmp);
+	rc = dt_insert(env, fld->lsf_obj, (struct dt_rec *)tmp,
+		       (struct dt_key *)&tmp->lsr_start, th, BYPASS_CAPA, 1);
+	if (rc != 0) {
+		CERROR("%s: insert range "DRANGE" failed: rc = %d\n",
+		       fld->lsf_name, PRANGE(new_range), rc);
+		GOTO(out, rc);
+	}
+
+	/* NOTE(review): the cache entry is built from new_range, not from
+	 * the merged record in tmp -- confirm this is intended. */
+	flde = fld_cache_entry_create(new_range);
+	if (IS_ERR(flde))
+		GOTO(out, rc = PTR_ERR(flde));
+
+	/* Delete and insert happen under one write lock. */
+	write_lock(&fld->lsf_cache->fci_lock);
+	if (deleted)
+		fld_cache_delete_nolock(fld->lsf_cache, new_range);
+	rc = fld_cache_insert_nolock(fld->lsf_cache, flde);
+	write_unlock(&fld->lsf_cache->fci_lock);
+	/* On failure the entry is freed here. */
+	if (rc)
+		OBD_FREE_PTR(flde);
+out:
+	RETURN(rc);
+}
+
+/**
+ * Look up the range containing \a seq.  The lookup is served from
+ * fld->lsf_cache; the per-thread fti_rec is used as scratch space.
+ *
+ * \param  seq     seq for lookup.
+ * \param  range   result of lookup; written only when the cache lookup
+ *                 itself succeeds, otherwise left as the caller passed it.
+ *
+ * \retval  0        found, \a range is the matched range;
+ * \retval -ENOENT   no cached range contains \a seq;
+ * \retval  -ve      other error returned by fld_cache_lookup();
+ */
+int fld_index_lookup(const struct lu_env *env, struct lu_server_fld *fld,
+		     seqno_t seq, struct lu_seq_range *range)
+{
+	struct fld_thread_info  *info;
+	struct lu_seq_range     *cached;
+	int rc;
+
+	ENTRY;
+
+	info = lu_context_key_get(&env->le_ctx, &fld_thread_key);
+	cached = &info->fti_rec;
+
+	rc = fld_cache_lookup(fld->lsf_cache, seq, cached);
+	if (rc == 0) {
+		*range = *cached;
+		/* Re-check that the copied range really covers seq. */
+		rc = range_within(range, seq) ? 0 : -ENOENT;
+	}
+
+	CDEBUG(D_INFO, "%s: lookup seq = "LPX64" range : "DRANGE" rc = %d\n",
+	       fld->lsf_name, seq, PRANGE(range), rc);
+
+	RETURN(rc);
+}
+
+/**
+ * Insert \a range into the FLD index inside its own local transaction.
+ *
+ * An already existing range (-EEXIST from either the declare or the create
+ * step) is treated as success.  fld_index_create() asserts that
+ * fld->lsf_lock is held, so the caller must take it.
+ *
+ * \retval 0    inserted, or already present
+ * \retval -ve  transaction, declare or create error
+ */
+int fld_insert_entry(const struct lu_env *env,
+		     struct lu_server_fld *fld,
+		     const struct lu_seq_range *range)
+{
+	struct dt_device *dev;
+	struct thandle *th;
+	int rc;
+	ENTRY;
+
+	/* Device that backs the index object; used for the whole
+	 * transaction. */
+	dev = lu2dt_dev(fld->lsf_obj->do_lu.lo_dev);
+
+	th = dt_trans_create(env, dev);
+	if (IS_ERR(th))
+		RETURN(PTR_ERR(th));
+
+	rc = fld_declare_index_create(env, fld, range, th);
+	if (rc == -EEXIST)
+		GOTO(out, rc = 0);
+	if (rc != 0)
+		GOTO(out, rc);
+
+	rc = dt_trans_start_local(env, dev, th);
+	if (rc)
+		GOTO(out, rc);
+
+	rc = fld_index_create(env, fld, range, th);
+	if (rc == -EEXIST)
+		rc = 0;
+out:
+	/* The transaction is stopped on every path once it was created. */
+	dt_trans_stop(env, dev, th);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(fld_insert_entry);
+
+/**
+ * Insert the three built-in ranges (IGIF, .lustre, ROOT) through
+ * fld_insert_entry(), stopping at the first failure.
+ *
+ * \retval 0    all three ranges are present
+ * \retval -ve  error from the first insert that failed
+ */
+static int fld_insert_special_entries(const struct lu_env *env,
+				      struct lu_server_fld *fld)
+{
+	int rc;
+	ENTRY;	/* keeps the RETURN()s below paired, as elsewhere in this file */
+
+	rc = fld_insert_entry(env, fld, &IGIF_FLD_RANGE);
+	if (rc != 0)
+		RETURN(rc);
+
+	rc = fld_insert_entry(env, fld, &DOT_LUSTRE_FLD_RANGE);
+	if (rc != 0)
+		RETURN(rc);
+
+	rc = fld_insert_entry(env, fld, &ROOT_FLD_RANGE);
+
+	RETURN(rc);
+}
+
+/**
+ * Obtain the FLD index object on \a dt through dt_find_or_create(), load
+ * every record of the index into fld->lsf_cache and make sure the special
+ * ranges are present.
+ *
+ * On success fld->lsf_obj references the index object.  On failure the
+ * object reference is dropped and fld->lsf_obj is reset to NULL.
+ *
+ * \retval 0    success
+ * \retval -ve  allocation, object, iterator, cache or insert error
+ */
+int fld_index_init(const struct lu_env *env, struct lu_server_fld *fld,
+		   struct dt_device *dt)
+{
+	struct dt_object	*dt_obj = NULL;
+	struct lu_fid		fid;
+	struct lu_attr		*attr = NULL;
+	struct lu_seq_range	*range = NULL;
+	struct fld_thread_info	*info;
+	struct dt_object_format	dof;
+	struct dt_it		*it;
+	const struct dt_it_ops	*iops;
+	int			rc;
+	ENTRY;
+
+	info = lu_context_key_get(&env->le_ctx, &fld_thread_key);
+	LASSERT(info != NULL);
+
+	lu_local_obj_fid(&fid, FLD_INDEX_OID);
+	OBD_ALLOC_PTR(attr);
+	if (attr == NULL)
+		RETURN(-ENOMEM);
+
+	memset(attr, 0, sizeof(*attr));
+	attr->la_valid = LA_MODE;
+	attr->la_mode = S_IFREG | 0666;
+	dof.dof_type = DFT_INDEX;
+	dof.u.dof_idx.di_feat = &fld_index_features;
+
+	dt_obj = dt_find_or_create(env, dt, &fid, &dof, attr);
+	if (IS_ERR(dt_obj)) {
+		rc = PTR_ERR(dt_obj);
+		CERROR("%s: Can't find \"%s\" obj %d\n", fld->lsf_name,
+			fld_index_name, rc);
+		dt_obj = NULL;
+		GOTO(out, rc);
+	}
+
+	fld->lsf_obj = dt_obj;
+	rc = dt_obj->do_ops->do_index_try(env, dt_obj, &fld_index_features);
+	if (rc != 0) {
+		CERROR("%s: File \"%s\" is not an index: rc = %d!\n",
+		       fld->lsf_name, fld_index_name, rc);
+		GOTO(out, rc);
+	}
+
+	range = &info->fti_rec;
+	/* Load fld entry to cache */
+	iops = &dt_obj->do_index_ops->dio_it;
+	it = iops->init(env, dt_obj, 0, NULL);
+	if (IS_ERR(it))
+		GOTO(out, rc = PTR_ERR(it));
+
+	rc = iops->load(env, it, 0);
+	if (rc < 0)
+		GOTO(out_it_fini, rc);
+
+	if (rc > 0) {
+		/* Load FLD entry into server cache */
+		do {
+			rc = iops->rec(env, it, (struct dt_rec *)range, 0);
+			if (rc != 0)
+				GOTO(out_it_put, rc);
+			LASSERT(range != NULL);
+			/* Records are stored big-endian on disk. */
+			range_be_to_cpu(range, range);
+			rc = fld_cache_insert(fld->lsf_cache, range);
+			if (rc != 0)
+				GOTO(out_it_put, rc);
+			rc = iops->next(env, it);
+		} while (rc == 0);
+
+		/* A negative value from ->next() is an iteration error, not
+		 * the end of the index.  Bail out here, otherwise the
+		 * special-entry insert below would overwrite rc and the
+		 * failure would be lost. */
+		if (rc < 0)
+			GOTO(out_it_put, rc);
+	}
+
+	/* Note: fld_insert_entry will detect whether these
+	 * special entries already exist inside FLDB */
+	mutex_lock(&fld->lsf_lock);
+	rc = fld_insert_special_entries(env, fld);
+	mutex_unlock(&fld->lsf_lock);
+	if (rc != 0) {
+		CERROR("%s: insert special entries failed!: rc = %d\n",
+		       fld->lsf_name, rc);
+		GOTO(out_it_put, rc);
+	}
+
+out_it_put:
+	iops->put(env, it);
+out_it_fini:
+	iops->fini(env, it);
+out:
+	if (attr != NULL)
+		OBD_FREE_PTR(attr);
+
+	if (rc != 0) {
+		if (dt_obj != NULL)
+			lu_object_put(env, &dt_obj->do_lu);
+		fld->lsf_obj = NULL;
+	}
+	RETURN(rc);
+}
+
+/**
+ * Drop the reference on the FLD index object, if there is one, and reset
+ * fld->lsf_obj to NULL.  An error pointer is only cleared, never put.
+ */
+void fld_index_fini(const struct lu_env *env, struct lu_server_fld *fld)
+{
+	ENTRY;
+	if (fld->lsf_obj != NULL && !IS_ERR(fld->lsf_obj))
+		lu_object_put(env, &fld->lsf_obj->do_lu);
+	fld->lsf_obj = NULL;
+	EXIT;
+}

diff --git a/drivers/staging/lustre/lustre/fld/fld_internal.h b/drivers/staging/lustre/lustre/fld/fld_internal.h
new file mode 100644
index 0000000..9fa9e01
--- /dev/null
+++ b/drivers/staging/lustre/lustre/fld/fld_internal.h

@@ -0,0 +1,223 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, 2013, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/fld/fld_internal.h
+ *
+ * Author: Yury Umanets <umka@clusterfs.com>
+ * Author: Tom WangDi <wangdi@clusterfs.com>
+ */
+#ifndef __FLD_INTERNAL_H
+#define __FLD_INTERNAL_H
+
+#include <lustre/lustre_idl.h>
+#include <dt_object.h>
+
+#include <linux/libcfs/libcfs.h>
+#include <lustre_req_layout.h>
+#include <lustre_fld.h>
+
+/* FLD state bits.
+ * NOTE(review): no user of these flags is visible in this part of the
+ * patch -- confirm where they are consumed. */
+enum {
+	LUSTRE_FLD_INIT = 1 << 0,
+	LUSTRE_FLD_RUN  = 1 << 1
+};
+
+/* Cache statistics, embedded in struct fld_cache as fci_stat. */
+struct fld_stats {
+	/* incremented on every fld_cache_lookup() call */
+	__u64   fst_count;
+	/* incremented when fld_cache_lookup() finds the sequence */
+	__u64   fst_cache;
+	/* NOTE(review): not touched by the code visible here */
+	__u64   fst_inflight;
+};
+
+/* Callback types stored in struct lu_fld_hash; both take a client FLD and
+ * a 64-bit value (presumably the sequence -- confirm against the
+ * implementations). */
+typedef int (*fld_hash_func_t) (struct lu_client_fld *, __u64);
+
+typedef struct lu_fld_target *
+(*fld_scan_func_t) (struct lu_client_fld *, __u64);
+
+/* One named pair of hash/scan callbacks; instances live in fld_hash[]. */
+struct lu_fld_hash {
+	const char	      *fh_name;
+	fld_hash_func_t	  fh_hash_func;
+	fld_scan_func_t	  fh_scan_func;
+};
+
+/* One cached sequence range. */
+struct fld_cache_entry {
+	/* list linkage for the LRU list (presumably fld_cache::fci_lru) */
+	struct list_head	       fce_lru;
+	/* linkage on fld_cache::fci_entries_head; the lookup functions
+	 * walk the cache through this member */
+	struct list_head	       fce_list;
+	/**
+	 * fld cache entries are sorted on range->lsr_start field. */
+	struct lu_seq_range      fce_range;
+};
+
+/* A cache of sequence ranges, kept as a list of fld_cache_entry. */
+struct fld_cache {
+	/**
+	 * Cache guard.  The lookup functions take it for reading while they
+	 * walk fci_entries_head; fld_index_create() takes it for writing
+	 * around cache delete/insert.
+	 * NOTE(review): the original comment said it "protects fci_hash",
+	 * but this structure has no such member.
+	 */
+	rwlock_t		 fci_lock;
+
+	/**
+	 * Cache shrink threshold */
+	int		      fci_threshold;
+
+	/**
+	 * Preferred number of cached entries */
+	int		      fci_cache_size;
+
+	/**
+	 * Current number of cached entries. Protected by \a fci_lock */
+	int		      fci_cache_count;
+
+	/**
+	 * LRU list fld entries. */
+	struct list_head	       fci_lru;
+
+	/**
+	 * sorted fld entries. */
+	struct list_head	       fci_entries_head;
+
+	/**
+	 * Cache statistics. */
+	struct fld_stats	 fci_stat;
+
+	/**
+	 * Cache name used for debug and messages. */
+	char		     fci_name[80];
+	/**
+	 * NOTE(review): presumably disables cache shrinking when set; the
+	 * code using it is not visible here -- confirm. */
+	unsigned int		 fci_no_shrink:1;
+};
+
+/* FLD operation codes.  Of these, fld_server_handle() only handles
+ * FLD_LOOKUP; any other opcode gets -EINVAL there. */
+enum fld_op {
+	FLD_CREATE = 0,
+	FLD_DELETE = 1,
+	FLD_LOOKUP = 2
+};
+
+/* Cache memory budgets in bytes.  fld_server_init() divides the server
+ * value by sizeof(struct fld_cache_entry) to get an entry count. */
+enum {
+	/* 4M of FLD cache will not hurt server a lot. */
+	FLD_SERVER_CACHE_SIZE      = (4 * 0x100000),
+
+	/* 1M of FLD cache will not hurt client a lot. */
+	FLD_CLIENT_CACHE_SIZE      = (1 * 0x100000)
+};
+
+/* Cache thresholds, expressed as a percentage of the cache size. */
+enum {
+	/* Cache threshold is 10 percent of size. */
+	FLD_SERVER_CACHE_THRESHOLD = 10,
+
+	/* Cache threshold is 10 percent of size. */
+	FLD_CLIENT_CACHE_THRESHOLD = 10
+};
+
+extern struct lu_fld_hash fld_hash[];
+
+
+/* Per-thread scratch data, obtained with
+ * lu_context_key_get(&env->le_ctx, &fld_thread_key). */
+struct fld_thread_info {
+	/* capsule of the request being handled; set by
+	 * fld_thread_info_init() */
+	struct req_capsule *fti_pill;
+	/* NOTE(review): not used by the code visible here */
+	__u64	       fti_key;
+	/* scratch record for fld_index_lookup() and fld_index_init() */
+	struct lu_seq_range fti_rec;
+	/* lookup result in fld_server_lookup(), fid_is_local() and the
+	 * index declare/create functions */
+	struct lu_seq_range fti_lrange;
+	/* big-endian record built by the index declare/create functions */
+	struct lu_seq_range fti_irange;
+};
+
+extern struct lu_context_key fld_thread_key;
+
+int fld_index_init(const struct lu_env *env, struct lu_server_fld *fld,
+		   struct dt_device *dt);
+
+void fld_index_fini(const struct lu_env *env, struct lu_server_fld *fld);
+
+int fld_declare_index_create(const struct lu_env *env,
+			     struct lu_server_fld *fld,
+			     const struct lu_seq_range *new,
+			     struct thandle *th);
+
+int fld_index_create(const struct lu_env *env, struct lu_server_fld *fld,
+		     const struct lu_seq_range *new, struct thandle *th);
+
+int fld_index_lookup(const struct lu_env *env, struct lu_server_fld *fld,
+		     seqno_t seq, struct lu_seq_range *range);
+
+int fld_client_rpc(struct obd_export *exp,
+		   struct lu_seq_range *range, __u32 fld_op);
+
+#ifdef LPROCFS
+extern struct lprocfs_vars fld_server_proc_list[];
+extern struct lprocfs_vars fld_client_proc_list[];
+#endif
+
+
+struct fld_cache *fld_cache_init(const char *name,
+				 int cache_size, int cache_threshold);
+
+void fld_cache_fini(struct fld_cache *cache);
+
+void fld_cache_flush(struct fld_cache *cache);
+
+int fld_cache_insert(struct fld_cache *cache,
+		     const struct lu_seq_range *range);
+
+struct fld_cache_entry
+*fld_cache_entry_create(const struct lu_seq_range *range);
+
+int fld_cache_insert_nolock(struct fld_cache *cache,
+			    struct fld_cache_entry *f_new);
+void fld_cache_delete(struct fld_cache *cache,
+		      const struct lu_seq_range *range);
+void fld_cache_delete_nolock(struct fld_cache *cache,
+			     const struct lu_seq_range *range);
+int fld_cache_lookup(struct fld_cache *cache,
+		     const seqno_t seq, struct lu_seq_range *range);
+
+struct fld_cache_entry*
+fld_cache_entry_lookup(struct fld_cache *cache, struct lu_seq_range *range);
+void fld_cache_entry_delete(struct fld_cache *cache,
+			    struct fld_cache_entry *node);
+void fld_dump_cache_entries(struct fld_cache *cache);
+
+struct fld_cache_entry
+*fld_cache_entry_lookup_nolock(struct fld_cache *cache,
+			      struct lu_seq_range *range);
+int fld_write_range(const struct lu_env *env, struct dt_object *dt,
+		    const struct lu_seq_range *range, struct thandle *th);
+
+/* Name of an FLD target: the server FLD's name when the target has one,
+ * otherwise the obd name of the target's export. */
+static inline const char *
+fld_target_name(struct lu_fld_target *tar)
+{
+	if (tar->ft_srv == NULL)
+		return (const char *)tar->ft_exp->exp_obd->obd_name;
+
+	return tar->ft_srv->lsf_name;
+}
+
+extern proc_dir_entry_t *fld_type_proc_dir;
+extern struct file_operations fld_proc_seq_fops;
+#endif /* __FLD_INTERNAL_H */

diff --git a/drivers/staging/lustre/lustre/fld/fld_request.c b/drivers/staging/lustre/lustre/fld/fld_request.c
new file mode 100644
index 0000000..e9f0739
--- /dev/null
+++ b/drivers/staging/lustre/lustre/fld/fld_request.c

@@ -0,0 +1,519 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2013, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/fld/fld_request.c
+ *
+ * FLD (Fids Location Database)
+ *
+ * Author: Yury Umanets <umka@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_FLD
+
+# include <linux/libcfs/libcfs.h>
+# include <linux/module.h>
+# include <linux/jbd.h>
+# include <asm/div64.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <lustre_ver.h>
+#include <obd_support.h>
+#include <lprocfs_status.h>
+
+#include <dt_object.h>
+#include <md_object.h>
+#include <lustre_req_layout.h>
+#include <lustre_fld.h>
+#include <lustre_mdc.h>
+#include "fld_internal.h"
+
+/* TODO: the next three functions are copies of the flow-control code in
+ * mdc_lib.c and should become common code. The same goes for the mdc RPC lock */
+/*
+ * Wait condition used by fld_enter_request(): returns non-zero once \a mcw
+ * is no longer linked on a list. The check is made under cl_loi_list_lock.
+ */
+static int fld_req_avail(struct client_obd *cli, struct mdc_cache_waiter *mcw)
+{
+	int rc;
+	ENTRY;
+
+	client_obd_list_lock(&cli->cl_loi_list_lock);
+	/* fld_exit_request() unlinks the waiter with list_del_init() when it
+	 * hands over a slot, so an "empty" entry means we may proceed. */
+	rc = list_empty(&mcw->mcw_entry);
+	client_obd_list_unlock(&cli->cl_loi_list_lock);
+	RETURN(rc);
+}
+
+/*
+ * Account for one more RPC in flight on \a cli. If the client is already at
+ * cl_max_rpcs_in_flight, queue on cl_cache_waiters and sleep until
+ * fld_req_avail() reports that the waiter entry has been unlinked.
+ */
+static void fld_enter_request(struct client_obd *cli)
+{
+	struct l_wait_info lwi = { 0 };
+	struct mdc_cache_waiter waiter;
+
+	client_obd_list_lock(&cli->cl_loi_list_lock);
+	if (cli->cl_r_in_flight < cli->cl_max_rpcs_in_flight) {
+		/* A slot is free: take it and go. */
+		cli->cl_r_in_flight++;
+		client_obd_list_unlock(&cli->cl_loi_list_lock);
+		return;
+	}
+
+	/* No slot: the in-flight counter is bumped on our behalf by
+	 * fld_exit_request() when it wakes us. */
+	list_add_tail(&waiter.mcw_entry, &cli->cl_cache_waiters);
+	init_waitqueue_head(&waiter.mcw_waitq);
+	client_obd_list_unlock(&cli->cl_loi_list_lock);
+	l_wait_event(waiter.mcw_waitq, fld_req_avail(cli, &waiter), &lwi);
+}
+
+/*
+ * Release one RPC slot on \a cli and hand free slots to queued waiters,
+ * in queue order, until the in-flight limit is reached again.
+ */
+static void fld_exit_request(struct client_obd *cli)
+{
+	struct mdc_cache_waiter *waiter, *next;
+
+	client_obd_list_lock(&cli->cl_loi_list_lock);
+	cli->cl_r_in_flight--;
+	list_for_each_entry_safe(waiter, next, &cli->cl_cache_waiters,
+				 mcw_entry) {
+		/* No free request slots anymore */
+		if (cli->cl_r_in_flight >= cli->cl_max_rpcs_in_flight)
+			break;
+
+		/* Unlink first so the waiter's condition becomes true, and
+		 * charge the slot to it before waking it up. */
+		list_del_init(&waiter->mcw_entry);
+		cli->cl_r_in_flight++;
+		wake_up(&waiter->mcw_waitq);
+	}
+	client_obd_list_unlock(&cli->cl_loi_list_lock);
+}
+
+/*
+ * "RRB" hash: map a sequence number to a target index by taking it modulo
+ * the number of registered targets.
+ */
+static int fld_rrb_hash(struct lu_client_fld *fld,
+			seqno_t seq)
+{
+	int idx;
+
+	LASSERT(fld->lcf_count > 0);
+	/* do_div() divides seq in place and evaluates to the remainder */
+	idx = do_div(seq, fld->lcf_count);
+	return idx;
+}
+
+/*
+ * Find the target whose index matches the RRB hash of \a seq. If no target
+ * matches, dump the whole target list and LBUG().
+ */
+static struct lu_fld_target *
+fld_rrb_scan(struct lu_client_fld *fld, seqno_t seq)
+{
+	struct lu_fld_target *tgt;
+	int idx;
+	ENTRY;
+
+	/* Because almost all of special sequence located in MDT0,
+	 * it should go to index 0 directly, instead of calculating
+	 * hash again, and also if other MDTs is not being connected,
+	 * the fld lookup requests(for seq on MDT0) should not be
+	 * blocked because of other MDTs */
+	idx = fid_seq_is_norm(seq) ? fld_rrb_hash(fld, seq) : 0;
+
+	list_for_each_entry(tgt, &fld->lcf_targets, ft_chain) {
+		if (tgt->ft_idx == idx)
+			RETURN(tgt);
+	}
+
+	CERROR("%s: Can't find target by hash %d (seq "LPX64"). "
+	       "Targets (%d):\n", fld->lcf_name, idx, seq,
+	       fld->lcf_count);
+
+	/* Log every known target to help diagnose the mismatch. */
+	list_for_each_entry(tgt, &fld->lcf_targets, ft_chain) {
+		const char *srv_name = "<null>";
+		const char *exp_name = "<null>";
+
+		if (tgt->ft_srv != NULL)
+			srv_name = tgt->ft_srv->lsf_name;
+		if (tgt->ft_exp != NULL)
+			exp_name = (char *)tgt->ft_exp->exp_obd->obd_uuid.uuid;
+
+		CERROR("  exp: 0x%p (%s), srv: 0x%p (%s), idx: "LPU64"\n",
+		       tgt->ft_exp, exp_name, tgt->ft_srv,
+		       srv_name, tgt->ft_idx);
+	}
+
+	/*
+	 * If target is not found, there is logical error anyway, so here is
+	 * LBUG() to catch this situation.
+	 */
+	LBUG();
+	RETURN(NULL);
+}
+
+/*
+ * Table of hash methods a client FLD can use to pick a target. The "hash"
+ * argument of fld_client_init() is an index into this table. The final,
+ * zeroed entry (fh_name == NULL) terminates name-based walks of the table.
+ */
+struct lu_fld_hash fld_hash[] = {
+	{
+		.fh_name = "RRB",
+		.fh_hash_func = fld_rrb_hash,
+		.fh_scan_func = fld_rrb_scan
+	},
+	{
+		0,
+	}
+};
+
+/*
+ * Pick the target responsible for \a seq using the client's current hash
+ * method. The scan runs under lcf_lock.
+ */
+static struct lu_fld_target *
+fld_client_get_target(struct lu_client_fld *fld, seqno_t seq)
+{
+	struct lu_fld_target *found;
+	ENTRY;
+
+	LASSERT(fld->lcf_hash != NULL);
+
+	spin_lock(&fld->lcf_lock);
+	found = fld->lcf_hash->fh_scan_func(fld, seq);
+	spin_unlock(&fld->lcf_lock);
+
+	if (found == NULL)
+		RETURN(found);
+
+	CDEBUG(D_INFO, "%s: Found target (idx "LPU64
+	       ") by seq "LPX64"\n", fld->lcf_name,
+	       found->ft_idx, seq);
+
+	RETURN(found);
+}
+
+/*
+ * Add export to FLD. This is usually done by CMM and LMV as they are main users
+ * of FLD module.
+ *
+ * A private copy of \a tar is linked on fld->lcf_targets; when the target
+ * carries an export, a reference on it is taken with class_export_get().
+ *
+ * \retval 0	   target added, or the request was skipped because the FLD
+ *		   is no longer in LUSTRE_FLD_INIT state
+ * \retval -ENOMEM allocation failure
+ * \retval -EEXIST a target with the same index is already registered
+ */
+int fld_client_add_target(struct lu_client_fld *fld,
+			  struct lu_fld_target *tar)
+{
+	const char *name;
+	struct lu_fld_target *target, *tmp;
+	ENTRY;
+
+	LASSERT(tar != NULL);
+	/* fld_target_name() dereferences ft_exp when ft_srv is NULL, so the
+	 * "at least one of them is set" check has to come first. */
+	LASSERT(tar->ft_srv != NULL || tar->ft_exp != NULL);
+	name = fld_target_name(tar);
+	LASSERT(name != NULL);
+
+	if (fld->lcf_flags != LUSTRE_FLD_INIT) {
+		CERROR("%s: Attempt to add target %s (idx "LPU64") "
+		       "on fly - skip it\n", fld->lcf_name, name,
+		       tar->ft_idx);
+		RETURN(0);
+	} else {
+		CDEBUG(D_INFO, "%s: Adding target %s (idx "
+		       LPU64")\n", fld->lcf_name, name, tar->ft_idx);
+	}
+
+	/* Allocate before taking the spinlock. */
+	OBD_ALLOC_PTR(target);
+	if (target == NULL)
+		RETURN(-ENOMEM);
+
+	spin_lock(&fld->lcf_lock);
+	list_for_each_entry(tmp, &fld->lcf_targets, ft_chain) {
+		if (tmp->ft_idx == tar->ft_idx) {
+			/* Report while still holding the lock: once it is
+			 * dropped, tmp may be removed and freed by
+			 * fld_client_del_target(). */
+			CERROR("Target %s exists in FLD and known as %s:#"LPU64"\n",
+			       name, fld_target_name(tmp), tmp->ft_idx);
+			spin_unlock(&fld->lcf_lock);
+			OBD_FREE_PTR(target);
+			RETURN(-EEXIST);
+		}
+	}
+
+	target->ft_exp = tar->ft_exp;
+	if (target->ft_exp != NULL)
+		class_export_get(target->ft_exp);
+	target->ft_srv = tar->ft_srv;
+	target->ft_idx = tar->ft_idx;
+
+	list_add_tail(&target->ft_chain,
+			  &fld->lcf_targets);
+
+	fld->lcf_count++;
+	spin_unlock(&fld->lcf_lock);
+
+	RETURN(0);
+}
+EXPORT_SYMBOL(fld_client_add_target);
+
+/*
+ * Remove the target with index \a idx from the FLD.
+ *
+ * The entry is unlinked under lcf_lock; the export reference is dropped and
+ * the entry is freed after the lock has been released.
+ *
+ * \retval 0	   target found and removed
+ * \retval -ENOENT no target with that index
+ */
+int fld_client_del_target(struct lu_client_fld *fld, __u64 idx)
+{
+	struct lu_fld_target *pos, *victim = NULL;
+	ENTRY;
+
+	spin_lock(&fld->lcf_lock);
+	list_for_each_entry(pos, &fld->lcf_targets, ft_chain) {
+		if (pos->ft_idx != idx)
+			continue;
+
+		fld->lcf_count--;
+		list_del(&pos->ft_chain);
+		victim = pos;
+		break;
+	}
+	spin_unlock(&fld->lcf_lock);
+
+	if (victim == NULL)
+		RETURN(-ENOENT);
+
+	if (victim->ft_exp != NULL)
+		class_export_put(victim->ft_exp);
+
+	OBD_FREE_PTR(victim);
+	RETURN(0);
+}
+EXPORT_SYMBOL(fld_client_del_target);
+
+#ifdef LPROCFS
+/*
+ * Create the per-client proc directory under fld_type_proc_dir and populate
+ * it from fld_client_proc_list. Returns 0 or a negative errno. If adding
+ * the entries fails, the directory is removed again through
+ * fld_client_proc_fini().
+ */
+static int fld_client_proc_init(struct lu_client_fld *fld)
+{
+	int rc;
+	ENTRY;
+
+	fld->lcf_proc_dir = lprocfs_register(fld->lcf_name,
+					     fld_type_proc_dir,
+					     NULL, NULL);
+
+	if (IS_ERR(fld->lcf_proc_dir)) {
+		CERROR("%s: LProcFS failed in fld-init\n",
+		       fld->lcf_name);
+		/* lcf_proc_dir keeps the error pointer here;
+		 * fld_client_proc_fini() checks IS_ERR() before removing. */
+		rc = PTR_ERR(fld->lcf_proc_dir);
+		RETURN(rc);
+	}
+
+	rc = lprocfs_add_vars(fld->lcf_proc_dir,
+			      fld_client_proc_list, fld);
+	if (rc) {
+		CERROR("%s: Can't init FLD proc, rc %d\n",
+		       fld->lcf_name, rc);
+		GOTO(out_cleanup, rc);
+	}
+
+	RETURN(0);
+
+out_cleanup:
+	fld_client_proc_fini(fld);
+	return rc;
+}
+
+/*
+ * Remove the client's proc directory, if one was registered, and reset
+ * lcf_proc_dir to NULL. An error-pointer value is cleared without calling
+ * lprocfs_remove().
+ */
+void fld_client_proc_fini(struct lu_client_fld *fld)
+{
+	ENTRY;
+	if (fld->lcf_proc_dir) {
+		if (!IS_ERR(fld->lcf_proc_dir))
+			lprocfs_remove(&fld->lcf_proc_dir);
+		fld->lcf_proc_dir = NULL;
+	}
+	EXIT;
+}
+#else
+/* Without LPROCFS there is nothing to register: always succeeds. */
+static int fld_client_proc_init(struct lu_client_fld *fld)
+{
+	return 0;
+}
+
+/* Without LPROCFS there is nothing to tear down. */
+void fld_client_proc_fini(struct lu_client_fld *fld)
+{
+	return;
+}
+#endif
+
+EXPORT_SYMBOL(fld_client_proc_fini);
+
+/*
+ * Check that \a hash selects a usable entry of fld_hash[].
+ *
+ * The table ends with a zeroed sentinel whose function pointers are NULL,
+ * so an index is only sane if it is in bounds and names a real method.
+ */
+static inline int hash_is_sane(int hash)
+{
+	return (hash >= 0 && hash < ARRAY_SIZE(fld_hash) &&
+		fld_hash[hash].fh_name != NULL);
+}
+
+/*
+ * Initialize a client FLD: name it "cli-<prefix>", select the hash method
+ * fld_hash[hash], create the lookup cache and register the proc entries.
+ *
+ * Returns 0 on success, -EINVAL for a bad \a hash, or the error from cache
+ * or proc setup. On a cache/proc failure fld_client_fini() is called
+ * before returning.
+ */
+int fld_client_init(struct lu_client_fld *fld,
+		    const char *prefix, int hash)
+{
+	int cache_size, cache_threshold;
+	int rc;
+	ENTRY;
+
+	LASSERT(fld != NULL);
+
+	snprintf(fld->lcf_name, sizeof(fld->lcf_name),
+		 "cli-%s", prefix);
+
+	if (!hash_is_sane(hash)) {
+		CERROR("%s: Wrong hash function %#x\n",
+		       fld->lcf_name, hash);
+		RETURN(-EINVAL);
+	}
+
+	fld->lcf_count = 0;
+	spin_lock_init(&fld->lcf_lock);
+	fld->lcf_hash = &fld_hash[hash];
+	fld->lcf_flags = LUSTRE_FLD_INIT;
+	INIT_LIST_HEAD(&fld->lcf_targets);
+
+	/* FLD_CLIENT_CACHE_SIZE divided by the entry size gives the number
+	 * of cache entries */
+	cache_size = FLD_CLIENT_CACHE_SIZE /
+		sizeof(struct fld_cache_entry);
+
+	/* the threshold is FLD_CLIENT_CACHE_THRESHOLD percent of that */
+	cache_threshold = cache_size *
+		FLD_CLIENT_CACHE_THRESHOLD / 100;
+
+	fld->lcf_cache = fld_cache_init(fld->lcf_name,
+					cache_size, cache_threshold);
+	if (IS_ERR(fld->lcf_cache)) {
+		rc = PTR_ERR(fld->lcf_cache);
+		/* do not leave an error pointer behind for fld_client_fini() */
+		fld->lcf_cache = NULL;
+		GOTO(out, rc);
+	}
+
+	rc = fld_client_proc_init(fld);
+	if (rc)
+		GOTO(out, rc);
+	EXIT;
+out:
+	/* reached both on success (rc == 0) and on failure */
+	if (rc)
+		fld_client_fini(fld);
+	else
+		CDEBUG(D_INFO, "%s: Using \"%s\" hash\n",
+		       fld->lcf_name, fld->lcf_hash->fh_name);
+	return rc;
+}
+EXPORT_SYMBOL(fld_client_init);
+
+/*
+ * Tear down a client FLD: drop every registered target (releasing its
+ * export reference, if any) and destroy the lookup cache.
+ */
+void fld_client_fini(struct lu_client_fld *fld)
+{
+	struct lu_fld_target *tgt;
+	ENTRY;
+
+	spin_lock(&fld->lcf_lock);
+	while (!list_empty(&fld->lcf_targets)) {
+		tgt = list_entry(fld->lcf_targets.next,
+				 struct lu_fld_target, ft_chain);
+		fld->lcf_count--;
+		list_del(&tgt->ft_chain);
+		if (tgt->ft_exp != NULL)
+			class_export_put(tgt->ft_exp);
+		OBD_FREE_PTR(tgt);
+	}
+	spin_unlock(&fld->lcf_lock);
+
+	if (fld->lcf_cache != NULL) {
+		/* an error pointer is only cleared, never passed on */
+		if (!IS_ERR(fld->lcf_cache))
+			fld_cache_fini(fld->lcf_cache);
+		fld->lcf_cache = NULL;
+	}
+
+	EXIT;
+}
+EXPORT_SYMBOL(fld_client_fini);
+
+/*
+ * Send an FLD_QUERY request carrying \a fld_op and \a range over \a exp and
+ * copy the range from the reply back into \a range.
+ *
+ * Returns 0 on success, -ENOMEM if the request cannot be allocated, the
+ * error from ptlrpc_queue_wait(), or -EFAULT if the reply has no range.
+ */
+int fld_client_rpc(struct obd_export *exp,
+		   struct lu_seq_range *range, __u32 fld_op)
+{
+	struct ptlrpc_request *req;
+	struct lu_seq_range   *prange;
+	__u32		 *op;
+	int		    rc;
+	struct obd_import     *imp;
+	ENTRY;
+
+	LASSERT(exp != NULL);
+
+	imp = class_exp2cliimp(exp);
+	req = ptlrpc_request_alloc_pack(imp, &RQF_FLD_QUERY, LUSTRE_MDS_VERSION,
+					FLD_QUERY);
+	if (req == NULL)
+		RETURN(-ENOMEM);
+
+	/* NOTE(review): the two request buffers below are used without a
+	 * NULL check - presumably they always exist for RQF_FLD_QUERY. */
+	op = req_capsule_client_get(&req->rq_pill, &RMF_FLD_OPC);
+	*op = fld_op;
+
+	prange = req_capsule_client_get(&req->rq_pill, &RMF_FLD_MDFLD);
+	*prange = *range;
+
+	ptlrpc_request_set_replen(req);
+	req->rq_request_portal = FLD_REQUEST_PORTAL;
+	ptlrpc_at_set_req_timeout(req);
+
+	/* lookups on an MDS-MDS connection may be sent during replay */
+	if (fld_op == FLD_LOOKUP &&
+	    imp->imp_connect_flags_orig & OBD_CONNECT_MDS_MDS)
+		req->rq_allow_replay = 1;
+
+	/* everything except FLD_LOOKUP is serialized by the mdc RPC lock */
+	if (fld_op != FLD_LOOKUP)
+		mdc_get_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL);
+	/* throttle against cl_max_rpcs_in_flight around the actual send */
+	fld_enter_request(&exp->exp_obd->u.cli);
+	rc = ptlrpc_queue_wait(req);
+	fld_exit_request(&exp->exp_obd->u.cli);
+	if (fld_op != FLD_LOOKUP)
+		mdc_put_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL);
+	if (rc)
+		GOTO(out_req, rc);
+
+	prange = req_capsule_server_get(&req->rq_pill, &RMF_FLD_MDFLD);
+	if (prange == NULL)
+		GOTO(out_req, rc = -EFAULT);
+	*range = *prange;
+	EXIT;
+out_req:
+	/* the request is released on both success and error paths */
+	ptlrpc_req_finished(req);
+	return rc;
+}
+
+/*
+ * Resolve sequence \a seq to an MDS index, stored in \a mds.
+ *
+ * The client cache is consulted first. On a miss the responsible target is
+ * asked, either through the local FLD server (which requires \a env) or by
+ * RPC over the target's export, and a successful answer is cached.
+ *
+ * Returns 0 on success or the error from the server lookup / RPC.
+ */
+int fld_client_lookup(struct lu_client_fld *fld, seqno_t seq, mdsno_t *mds,
+		      __u32 flags, const struct lu_env *env)
+{
+	struct lu_seq_range range = { 0 };
+	struct lu_fld_target *tgt;
+	int rc;
+	ENTRY;
+
+	fld->lcf_flags |= LUSTRE_FLD_RUN;
+
+	/* Fast path: answer from the cache */
+	if (fld_cache_lookup(fld->lcf_cache, seq, &range) == 0) {
+		*mds = range.lsr_index;
+		RETURN(0);
+	}
+
+	/* Can not find it in the cache */
+	tgt = fld_client_get_target(fld, seq);
+	LASSERT(tgt != NULL);
+
+	CDEBUG(D_INFO, "%s: Lookup fld entry (seq: "LPX64") on "
+	       "target %s (idx "LPU64")\n", fld->lcf_name, seq,
+	       fld_target_name(tgt), tgt->ft_idx);
+
+	range.lsr_start = seq;
+	fld_range_set_type(&range, flags);
+	if (tgt->ft_srv == NULL) {
+		rc = fld_client_rpc(tgt->ft_exp, &range, FLD_LOOKUP);
+	} else {
+		LASSERT(env != NULL);
+		rc = fld_server_lookup(env, tgt->ft_srv, seq, &range);
+	}
+
+	if (rc != 0)
+		RETURN(rc);
+
+	*mds = range.lsr_index;
+
+	fld_cache_insert(fld->lcf_cache, &range);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(fld_client_lookup);
+
+/* Flush the client's lookup cache by calling fld_cache_flush() on it. */
+void fld_client_flush(struct lu_client_fld *fld)
+{
+	fld_cache_flush(fld->lcf_cache);
+}
+EXPORT_SYMBOL(fld_client_flush);

diff --git a/drivers/staging/lustre/lustre/fld/lproc_fld.c b/drivers/staging/lustre/lustre/fld/lproc_fld.c
new file mode 100644
index 0000000..c1bd803
--- /dev/null
+++ b/drivers/staging/lustre/lustre/fld/lproc_fld.c

@@ -0,0 +1,373 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, 2013, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/fld/lproc_fld.c
+ *
+ * FLD (FIDs Location Database)
+ *
+ * Author: Yury Umanets <umka@clusterfs.com>
+ *	Di Wang <di.wang@whamcloud.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_FLD
+
+# include <linux/libcfs/libcfs.h>
+# include <linux/module.h>
+# include <linux/uaccess.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <dt_object.h>
+#include <md_object.h>
+#include <obd_support.h>
+#include <lustre_req_layout.h>
+#include <lustre_fld.h>
+#include <lustre_fid.h>
+#include "fld_internal.h"
+
+#ifdef LPROCFS
+/* seq_file show handler for "targets": one target name per line. */
+static int
+fld_proc_targets_seq_show(struct seq_file *m, void *unused)
+{
+	struct lu_client_fld *fld = m->private;
+	struct lu_fld_target *tgt;
+	ENTRY;
+
+	LASSERT(fld != NULL);
+
+	/* the target list is walked under lcf_lock */
+	spin_lock(&fld->lcf_lock);
+	list_for_each_entry(tgt, &fld->lcf_targets, ft_chain) {
+		seq_printf(m, "%s\n", fld_target_name(tgt));
+	}
+	spin_unlock(&fld->lcf_lock);
+
+	RETURN(0);
+}
+
+/* seq_file show handler for "hash": name of the hash method in use. */
+static int
+fld_proc_hash_seq_show(struct seq_file *m, void *unused)
+{
+	struct lu_client_fld *fld = m->private;
+	const char *hash_name;
+	ENTRY;
+
+	LASSERT(fld != NULL);
+
+	/* lcf_hash can be switched by a write to this file, under lcf_lock */
+	spin_lock(&fld->lcf_lock);
+	hash_name = fld->lcf_hash->fh_name;
+	seq_printf(m, "%s\n", hash_name);
+	spin_unlock(&fld->lcf_lock);
+
+	RETURN(0);
+}
+
+/*
+ * Write handler for "hash": switch the client to the hash method whose name
+ * exactly matches the written bytes (same length, no terminator).
+ *
+ * \a buffer points to user space, so it is copied into a kernel buffer
+ * before being compared.
+ *
+ * \retval count	 data consumed; an unknown name leaves the hash
+ *			 unchanged
+ * \retval -ENAMETOOLONG more bytes written than the name buffer can hold
+ * \retval -EFAULT	 the user buffer could not be read
+ */
+static ssize_t
+fld_proc_hash_seq_write(struct file *file, const char __user *buffer,
+			size_t count, loff_t *off)
+{
+	struct lu_client_fld *fld = ((struct seq_file *)file->private_data)->private;
+	struct lu_fld_hash *hash = NULL;
+	char fh_name[8];
+	int i;
+	ENTRY;
+
+	LASSERT(fld != NULL);
+
+	if (count > sizeof(fh_name))
+		RETURN(-ENAMETOOLONG);
+
+	if (copy_from_user(fh_name, buffer, count) != 0)
+		RETURN(-EFAULT);
+
+	for (i = 0; fld_hash[i].fh_name != NULL; i++) {
+		/* lengths must agree, so strncmp() is an exact comparison */
+		if (count != strlen(fld_hash[i].fh_name))
+			continue;
+
+		if (!strncmp(fld_hash[i].fh_name, fh_name, count)) {
+			hash = &fld_hash[i];
+			break;
+		}
+	}
+
+	if (hash != NULL) {
+		spin_lock(&fld->lcf_lock);
+		fld->lcf_hash = hash;
+		spin_unlock(&fld->lcf_lock);
+
+		CDEBUG(D_INFO, "%s: Changed hash to \"%s\"\n",
+		       fld->lcf_name, hash->fh_name);
+	}
+
+	RETURN(count);
+}
+
+/*
+ * Write handler for "cache_flush": any write flushes the client lookup
+ * cache. The written data itself is not looked at.
+ */
+static ssize_t
+fld_proc_cache_flush_write(struct file *file, const char __user *buffer,
+			       size_t count, loff_t *pos)
+{
+	struct lu_client_fld *client = file->private_data;
+	ENTRY;
+
+	LASSERT(client != NULL);
+
+	fld_cache_flush(client->lcf_cache);
+	CDEBUG(D_INFO, "%s: Lookup cache is flushed\n", client->lcf_name);
+
+	RETURN(count);
+}
+
+/* Open handler for "cache_flush": stash the proc entry's data pointer in
+ * file->private_data, where the write handler picks it up. */
+static int fld_proc_cache_flush_open(struct inode *inode, struct file *file)
+{
+	file->private_data = PDE_DATA(inode);
+	return 0;
+}
+
+/* Release handler for "cache_flush": only clears file->private_data. */
+static int fld_proc_cache_flush_release(struct inode *inode, struct file *file)
+{
+	file->private_data = NULL;
+	return 0;
+}
+
+/* File operations of the "cache_flush" proc entry; no read method is set. */
+struct file_operations fld_proc_cache_flush_fops = {
+	.owner		= THIS_MODULE,
+	.open		= fld_proc_cache_flush_open,
+	.write		= fld_proc_cache_flush_write,
+	.release	= fld_proc_cache_flush_release,
+};
+
+/* Per-open state of the FLDB seq_file, allocated in fldb_seq_open(). */
+struct fld_seq_param {
+	struct lu_env		fsp_env;	/* env used for all iterator calls */
+	struct dt_it		*fsp_it;	/* iterator over fsp_fld->lsf_obj */
+	struct lu_server_fld	*fsp_fld;	/* server FLD being dumped */
+	unsigned int		fsp_stop:1;	/* set once iteration has ended */
+};
+
+/*
+ * seq_file ->start: position the index iterator at *pos and update *pos
+ * from the key the iterator now points at. Returns NULL when there is no
+ * per-open state or iteration has already finished.
+ */
+static void *fldb_seq_start(struct seq_file *p, loff_t *pos)
+{
+	struct fld_seq_param    *param = p->private;
+	struct lu_server_fld    *fld;
+	struct dt_object	*obj;
+	const struct dt_it_ops  *iops;
+
+	if (param == NULL || param->fsp_stop)
+		return NULL;
+
+	fld = param->fsp_fld;
+	obj = fld->lsf_obj;
+	LASSERT(obj != NULL);
+	iops = &obj->do_index_ops->dio_it;
+
+	/* NOTE(review): the result of load() is ignored - confirm that key()
+	 * is still valid when load() fails or finds nothing. */
+	iops->load(&param->fsp_env, param->fsp_it, *pos);
+
+	/* the key is read as a big-endian 64-bit value */
+	*pos = be64_to_cpu(*(__u64 *)iops->key(&param->fsp_env, param->fsp_it));
+	return param;
+}
+
+/*
+ * seq_file ->stop: call the iterator's put() method for this pass. Nothing
+ * to do when the file was opened without per-open state.
+ */
+static void fldb_seq_stop(struct seq_file *p, void *v)
+{
+	struct fld_seq_param	*param = p->private;
+	const struct dt_it_ops	*iops;
+	struct dt_object	*obj;
+
+	if (param == NULL)
+		return;
+
+	obj = param->fsp_fld->lsf_obj;
+	LASSERT(obj != NULL);
+
+	iops = &obj->do_index_ops->dio_it;
+	iops->put(&param->fsp_env, param->fsp_it);
+}
+
+/*
+ * seq_file ->next: advance the index iterator and update *pos from the new
+ * key. A positive result from next() ends the iteration and sets fsp_stop.
+ */
+static void *fldb_seq_next(struct seq_file *p, void *v, loff_t *pos)
+{
+	struct fld_seq_param    *param = p->private;
+	struct lu_server_fld	*fld;
+	struct dt_object	*obj;
+	const struct dt_it_ops	*iops;
+	int			rc;
+
+	if (param == NULL || param->fsp_stop)
+		return NULL;
+
+	fld = param->fsp_fld;
+	obj = fld->lsf_obj;
+	LASSERT(obj != NULL);
+	iops = &obj->do_index_ops->dio_it;
+
+	/* NOTE(review): only rc > 0 is handled; a negative rc falls through
+	 * to key() below - confirm this is intended. */
+	rc = iops->next(&param->fsp_env, param->fsp_it);
+	if (rc > 0) {
+		param->fsp_stop = 1;
+		return NULL;
+	}
+
+	/* the key is read as a big-endian 64-bit value */
+	*pos = be64_to_cpu(*(__u64 *)iops->key(&param->fsp_env, param->fsp_it));
+	return param;
+}
+
+/*
+ * seq_file ->show: read the record under the iterator and print it as a
+ * range. Records whose lsr_start is 0 are skipped silently.
+ *
+ * Returns 0, the error from reading the record, or the seq_printf() result.
+ */
+static int fldb_seq_show(struct seq_file *p, void *v)
+{
+	struct fld_seq_param	*param = p->private;
+	struct lu_server_fld	*fld;
+	struct dt_object	*obj;
+	const struct dt_it_ops	*iops;
+	struct fld_thread_info	*info;
+	struct lu_seq_range	*rec;
+	int			rc;
+
+	if (param == NULL || param->fsp_stop)
+		return 0;
+
+	fld = param->fsp_fld;
+	obj = fld->lsf_obj;
+	LASSERT(obj != NULL);
+	iops = &obj->do_index_ops->dio_it;
+
+	/* the record buffer lives in the env's fld_thread_key context data */
+	info = lu_context_key_get(&param->fsp_env.le_ctx,
+				  &fld_thread_key);
+	rec = &info->fti_rec;
+
+	rc = iops->rec(&param->fsp_env, param->fsp_it,
+		       (struct dt_rec *)rec, 0);
+	if (rc != 0) {
+		CERROR("%s:read record error: rc %d\n",
+		       fld->lsf_name, rc);
+		return rc;
+	}
+
+	if (rec->lsr_start == 0)
+		return rc;
+
+	/* convert the record in place before printing it */
+	range_be_to_cpu(rec, rec);
+	return seq_printf(p, DRANGE"\n", PRANGE(rec));
+}
+
+/* seq_file iterator callbacks for the FLDB dump, passed to seq_open() in
+ * fldb_seq_open(). */
+struct seq_operations fldb_sops = {
+	.start = fldb_seq_start,
+	.stop = fldb_seq_stop,
+	.next = fldb_seq_next,
+	.show = fldb_seq_show,
+};
+
+/*
+ * Open handler for the FLDB dump file.
+ *
+ * Opens the seq_file and, when the server FLD has a backing index object,
+ * allocates the per-open state (env + index iterator) and stores it in
+ * seq->private. Without a backing object the file opens with a NULL
+ * private pointer and reads as empty.
+ *
+ * Returns 0 on success or a negative errno. On failure everything set up
+ * so far is undone, including the seq_file itself: ->release is not called
+ * for a failed open, so it would otherwise be leaked.
+ */
+static int fldb_seq_open(struct inode *inode, struct file *file)
+{
+	struct seq_file		*seq;
+	struct lu_server_fld    *fld = (struct lu_server_fld *)PDE_DATA(inode);
+	struct dt_object	*obj;
+	const struct dt_it_ops  *iops;
+	struct fld_seq_param    *param = NULL;
+	int			env_init = 0;
+	int			seq_opened = 0;
+	int			rc;
+
+	rc = seq_open(file, &fldb_sops);
+	if (rc)
+		GOTO(out, rc);
+	seq_opened = 1;
+
+	obj = fld->lsf_obj;
+	if (obj == NULL) {
+		seq = file->private_data;
+		seq->private = NULL;
+		return 0;
+	}
+
+	OBD_ALLOC_PTR(param);
+	if (param == NULL)
+		GOTO(out, rc = -ENOMEM);
+
+	rc = lu_env_init(&param->fsp_env, LCT_MD_THREAD);
+	if (rc != 0)
+		GOTO(out, rc);
+
+	env_init = 1;
+	iops = &obj->do_index_ops->dio_it;
+	param->fsp_it = iops->init(&param->fsp_env, obj, 0, NULL);
+	if (IS_ERR(param->fsp_it))
+		GOTO(out, rc = PTR_ERR(param->fsp_it));
+
+	param->fsp_fld = fld;
+	param->fsp_stop = 0;
+
+	/* publish the state only once it is fully set up */
+	seq = file->private_data;
+	seq->private = param;
+out:
+	if (rc != 0) {
+		if (env_init == 1)
+			lu_env_fini(&param->fsp_env);
+		if (param != NULL)
+			OBD_FREE_PTR(param);
+		/* undo seq_open() */
+		if (seq_opened)
+			seq_release(inode, file);
+	}
+	return rc;
+}
+
+/*
+ * Release handler for the FLDB dump file: destroy the iterator and env set
+ * up by fldb_seq_open(), free the per-open state, then release the
+ * seq_file. Always returns 0.
+ */
+static int fldb_seq_release(struct inode *inode, struct file *file)
+{
+	struct seq_file		*seq = file->private_data;
+	struct fld_seq_param	*param = seq->private;
+	const struct dt_it_ops	*iops;
+	struct dt_object	*obj;
+
+	/* param is NULL when the file was opened without a backing object */
+	if (param != NULL) {
+		obj = param->fsp_fld->lsf_obj;
+		LASSERT(obj != NULL);
+		iops = &obj->do_index_ops->dio_it;
+
+		LASSERT(iops != NULL);
+		LASSERT(param->fsp_it != NULL);
+		iops->fini(&param->fsp_env, param->fsp_it);
+		lu_env_fini(&param->fsp_env);
+		OBD_FREE_PTR(param);
+	}
+
+	lprocfs_seq_release(inode, file);
+
+	return 0;
+}
+
+/* Server-side proc entries: currently none, only the NULL terminator. */
+struct lprocfs_vars fld_server_proc_list[] = {
+	{ NULL }};
+
+/* Generate fld_proc_targets_fops and fld_proc_hash_fops (used in the table
+ * below) from the corresponding fld_proc_*_seq_show/_seq_write handlers. */
+LPROC_SEQ_FOPS_RO(fld_proc_targets);
+LPROC_SEQ_FOPS(fld_proc_hash);
+
+/* Client-side proc entries, NULL-terminated; registered by passing this
+ * table to lprocfs_add_vars(). */
+struct lprocfs_vars fld_client_proc_list[] = {
+	{ "targets", &fld_proc_targets_fops },
+	{ "hash", &fld_proc_hash_fops },
+	{ "cache_flush", &fld_proc_cache_flush_fops },
+	{ NULL }};
+
+/* File operations of the read-only FLDB dump file: fldb_seq_open() and
+ * fldb_seq_release() around the generic seq_read(). */
+struct file_operations fld_proc_seq_fops = {
+	.owner   = THIS_MODULE,
+	.open    = fldb_seq_open,
+	.read    = seq_read,
+	.release = fldb_seq_release,
+};
+
+#endif

diff --git a/drivers/staging/lustre/lustre/include/cl_object.h b/drivers/staging/lustre/lustre/include/cl_object.h
new file mode 100644
index 0000000..4bb6880
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/cl_object.h

@@ -0,0 +1,3279 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+#ifndef _LUSTRE_CL_OBJECT_H
+#define _LUSTRE_CL_OBJECT_H
+
+/** \defgroup clio clio
+ *
+ * Client objects implement io operations and cache pages.
+ *
+ * Examples: lov and osc are implementations of cl interface.
+ *
+ * Big Theory Statement.
+ *
+ * Layered objects.
+ *
+ * Client implementation is based on the following data-types:
+ *
+ *   - cl_object
+ *
+ *   - cl_page
+ *
+ *   - cl_lock     represents an extent lock on an object.
+ *
+ *   - cl_io       represents high-level i/o activity such as whole read/write
+ *		 system call, or write-out of pages from under the lock being
+ *		 canceled. cl_io has sub-ios that can be stopped and resumed
+ *		 independently, thus achieving high degree of transfer
+ *		 parallelism. Single cl_io can be advanced forward by
+ *		 the multiple threads (although in the most usual case of
+ *		 read/write system call it is associated with the single user
+ *		 thread, that issued the system call).
+ *
+ *   - cl_req      represents a collection of pages for a transfer. cl_req is
+ *		 constructed by req-forming engine that tries to saturate
+ *		 transport with large and continuous transfers.
+ *
+ * Terminology
+ *
+ *     - to avoid confusion high-level I/O operation like read or write system
+ *     call is referred to as "an io", whereas low-level I/O operation, like
+ *     RPC, is referred to as "a transfer"
+ *
+ *     - "generic code" means generic (not file system specific) code in the
+ *     hosting environment. "cl-code" means code (mostly in cl_*.c files) that
+ *     is not layer specific.
+ *
+ * Locking.
+ *
+ *  - i_mutex
+ *      - PG_locked
+ *	  - cl_object_header::coh_page_guard
+ *	  - cl_object_header::coh_lock_guard
+ *	  - lu_site::ls_guard
+ *
+ * See the top comment in cl_object.c for the description of overall locking and
+ * reference-counting design.
+ *
+ * See comments below for the description of i/o, page, and dlm-locking
+ * design.
+ *
+ * @{
+ */
+
+/*
+ * super-class definitions.
+ */
+#include <lu_object.h>
+#include <lvfs.h>
+#	include <linux/mutex.h>
+#	include <linux/radix-tree.h>
+
+struct inode;
+
+struct cl_device;
+struct cl_device_operations;
+
+struct cl_object;
+struct cl_object_page_operations;
+struct cl_object_lock_operations;
+
+struct cl_page;
+struct cl_page_slice;
+struct cl_lock;
+struct cl_lock_slice;
+
+struct cl_lock_operations;
+struct cl_page_operations;
+
+struct cl_io;
+struct cl_io_slice;
+
+struct cl_req;
+struct cl_req_slice;
+
+/**
+ * Operations for each data device in the client stack.
+ *
+ * \see vvp_cl_ops, lov_cl_ops, lovsub_cl_ops, osc_cl_ops
+ */
+struct cl_device_operations {
+	/**
+	 * Initialize cl_req. This method is called top-to-bottom on all
+	 * devices in the stack to give them a chance to allocate
+	 * layer-private data, and to attach that data to the cl_req by
+	 * calling cl_req_slice_add().
+	 *
+	 * \see osc_req_init(), lov_req_init(), lovsub_req_init()
+	 * \see ccc_req_init()
+	 */
+	int (*cdo_req_init)(const struct lu_env *env, struct cl_device *dev,
+			    struct cl_req *req);
+};
+
+/**
+ * Device in the client stack: a generic lu_device plus the client-specific
+ * operation vector.
+ *
+ * \see ccc_device, lov_device, lovsub_device, osc_device
+ */
+struct cl_device {
+	/** Super-class: the embedded generic device. */
+	struct lu_device		   cd_lu_dev;
+	/** Per-layer operation vector. */
+	const struct cl_device_operations *cd_ops;
+};
+
+/** \addtogroup cl_object cl_object
+ * @{ */
+/**
+ * "Data attributes" of cl_object. Data attributes can be updated
+ * independently for a sub-object, and top-object's attributes are calculated
+ * from sub-objects' ones.
+ *
+ * \see cl_attr_valid for the bits that select individual fields.
+ */
+struct cl_attr {
+	/** Object size, in bytes */
+	loff_t cat_size;
+	/**
+	 * Known minimum size (KMS), in bytes.
+	 *
+	 * This is only valid when at least one DLM lock is held.
+	 */
+	loff_t cat_kms;
+	/** Modification time. Measured in seconds since epoch. */
+	time_t cat_mtime;
+	/** Access time. Measured in seconds since epoch. */
+	time_t cat_atime;
+	/** Change time. Measured in seconds since epoch. */
+	time_t cat_ctime;
+	/**
+	 * Blocks allocated to this cl_object on the server file system.
+	 *
+	 * \todo XXX An interface for block size is needed.
+	 */
+	__u64  cat_blocks;
+	/**
+	 * User identifier for quota purposes.
+	 */
+	uid_t  cat_uid;
+	/**
+	 * Group identifier for quota purposes.
+	 */
+	gid_t  cat_gid;
+};
+
+/**
+ * Fields in cl_attr that are being set. One bit per cl_attr field.
+ */
+enum cl_attr_valid {
+	CAT_SIZE   = 1 << 0,
+	CAT_KMS    = 1 << 1,
+	CAT_MTIME  = 1 << 3,	/* note: bit 2 is not assigned */
+	CAT_ATIME  = 1 << 4,
+	CAT_CTIME  = 1 << 5,
+	CAT_BLOCKS = 1 << 6,
+	CAT_UID    = 1 << 7,
+	CAT_GID    = 1 << 8
+};
+
+/**
+ * Sub-class of lu_object with methods common for objects on the client
+ * stacks.
+ *
+ * cl_object: represents a regular file system object, both a file and a
+ *    stripe. cl_object is based on lu_object: it is identified by a fid,
+ *    layered, cached, hashed, and lrued. Important distinction with the server
+ *    side, where md_object and dt_object are used, is that cl_object "fans out"
+ *    at the lov/sns level: depending on the file layout, single file is
+ *    represented as a set of "sub-objects" (stripes). At the implementation
+ *    level, struct lov_object contains an array of cl_objects. Each sub-object
+ *    is a full-fledged cl_object, having its fid, living in the lru and hash
+ *    table.
+ *
+ *    This leads to the next important difference with the server side: on the
+ *    client, it's quite usual to have objects with the different sequence of
+ *    layers. For example, typical top-object is composed of the following
+ *    layers:
+ *
+ *	- vvp
+ *	- lov
+ *
+ *    whereas its sub-objects are composed of
+ *
+ *	- lovsub
+ *	- osc
+ *
+ *    layers. Here "lovsub" is a mostly dummy layer, whose purpose is to keep
+ *    track of the object-subobject relationship.
+ *
+ *    Sub-objects are not cached independently: when top-object is about to
+ *    be discarded from the memory, all its sub-objects are torn-down and
+ *    destroyed too.
+ *
+ * \see ccc_object, lov_object, lovsub_object, osc_object
+ */
+struct cl_object {
+	/** Super-class: the embedded generic layered object. */
+	struct lu_object		   co_lu;
+	/** Per-object-layer operations. */
+	const struct cl_object_operations *co_ops;
+	/** Offset of this layer's page slice in the cl_page buffer. */
+	int				   co_slice_off;
+};
+
+/**
+ * Description of the client object configuration. This is used for the
+ * creation of a new client object that is identified by more state than
+ * just its fid.
+ */
+struct cl_object_conf {
+	/** Super-class. */
+	struct lu_object_conf     coc_lu;
+	union {
+		/**
+		 * Object layout. This is consumed by lov.
+		 */
+		struct lustre_md *coc_md;
+		/**
+		 * Description of particular stripe location in the
+		 * cluster. This is consumed by osc.
+		 */
+		struct lov_oinfo *coc_oinfo;
+	} u;
+	/**
+	 * VFS inode. This is consumed by vvp.
+	 */
+	struct inode	     *coc_inode;
+	/**
+	 * Layout lock handle.
+	 */
+	struct ldlm_lock	 *coc_lock;
+	/**
+	 * Operation to handle layout, one of the OBJECT_CONF_XYZ values.
+	 */
+	int			  coc_opc;
+};
+
+/** Layout operations, stored in cl_object_conf::coc_opc. */
+enum {
+	/** configure layout, set up a new stripe, must be called while
+	 * holding layout lock. */
+	OBJECT_CONF_SET = 0,
+	/** invalidate the current stripe configuration due to losing
+	 * layout lock. */
+	OBJECT_CONF_INVALIDATE = 1,
+	/** wait for old layout to go away so that new layout can be
+	 * set up. */
+	OBJECT_CONF_WAIT = 2
+};
+
+/**
+ * Operations implemented for each cl object layer.
+ *
+ * \see vvp_ops, lov_ops, lovsub_ops, osc_ops
+ */
+struct cl_object_operations {
+	/**
+	 * Initialize page slice for this layer. Called top-to-bottom through
+	 * every object layer when a new cl_page is instantiated. Layer
+	 * keeping private per-page data, or requiring its own page operations
+	 * vector should allocate these data here, and attach them to the page
+	 * by calling cl_page_slice_add(). \a vmpage is locked (in the VM
+	 * sense). Optional.
+	 *
+	 * \retval NULL success.
+	 * \retval ERR_PTR(errno) failure code.
+	 * \retval valid-pointer pointer to already existing referenced page
+	 *	 to be used instead of newly created.
+	 * NOTE(review): these pointer-style values do not match the int
+	 * return type below; presumably 0 / -errno is meant -- confirm.
+	 */
+	int  (*coo_page_init)(const struct lu_env *env, struct cl_object *obj,
+				struct cl_page *page, struct page *vmpage);
+	/**
+	 * Initialize lock slice for this layer. Called top-to-bottom through
+	 * every object layer when a new cl_lock is instantiated. Layer
+	 * keeping private per-lock data, or requiring its own lock operations
+	 * vector should allocate these data here, and attach them to the lock
+	 * by calling cl_lock_slice_add(). Mandatory.
+	 */
+	int  (*coo_lock_init)(const struct lu_env *env,
+			      struct cl_object *obj, struct cl_lock *lock,
+			      const struct cl_io *io);
+	/**
+	 * Initialize io state for a given layer.
+	 *
+	 * Called top-to-bottom once per io existence to initialize io
+	 * state. If a layer wants to keep some state for this type of io, it
+	 * has to embed struct cl_io_slice in lu_env::le_ses, and register
+	 * the slice with cl_io_slice_add(). It is guaranteed that all threads
+	 * participating in this io share the same session.
+	 */
+	int  (*coo_io_init)(const struct lu_env *env,
+			    struct cl_object *obj, struct cl_io *io);
+	/**
+	 * Fill portion of \a attr that this layer controls. This method is
+	 * called top-to-bottom through all object layers.
+	 *
+	 * \pre cl_object_header::coh_attr_guard of the top-object is locked.
+	 *
+	 * \return   0: to continue
+	 * \return +ve: to stop iterating through layers (but 0 is returned
+	 * from enclosing cl_object_attr_get())
+	 * \return -ve: to signal error
+	 */
+	int (*coo_attr_get)(const struct lu_env *env, struct cl_object *obj,
+			    struct cl_attr *attr);
+	/**
+	 * Update attributes.
+	 *
+	 * \a valid is a bitmask composed from enum #cl_attr_valid,
+	 * indicating which attributes are to be set.
+	 *
+	 * \pre cl_object_header::coh_attr_guard of the top-object is locked.
+	 *
+	 * \return the same convention as for
+	 * cl_object_operations::coo_attr_get() is used.
+	 */
+	int (*coo_attr_set)(const struct lu_env *env, struct cl_object *obj,
+			    const struct cl_attr *attr, unsigned valid);
+	/**
+	 * Update object configuration. Called top-to-bottom to modify object
+	 * configuration.
+	 *
+	 * XXX error conditions and handling.
+	 */
+	int (*coo_conf_set)(const struct lu_env *env, struct cl_object *obj,
+			    const struct cl_object_conf *conf);
+	/**
+	 * Glimpse ast. Executed when glimpse ast arrives for a lock on this
+	 * object. Layers are supposed to fill parts of \a lvb that will be
+	 * shipped to the glimpse originator as a glimpse result.
+	 *
+	 * \see ccc_object_glimpse(), lovsub_object_glimpse(),
+	 * \see osc_object_glimpse()
+	 */
+	int (*coo_glimpse)(const struct lu_env *env,
+			   const struct cl_object *obj, struct ost_lvb *lvb);
+};
+
+/**
+ * Extended header for client object.
+ */
+struct cl_object_header {
+	/** Standard lu_object_header. cl_object::co_lu::lo_header points
+	 * here. */
+	struct lu_object_header  coh_lu;
+	/** \name locks
+	 * \todo XXX move locks below to the separate cache-lines, they are
+	 * mostly useless otherwise.
+	 */
+	/** @{ */
+	/** Lock protecting page tree. */
+	spinlock_t		 coh_page_guard;
+	/** Lock protecting lock list (coh_locks). */
+	spinlock_t		 coh_lock_guard;
+	/** @} locks */
+	/** Radix tree of cl_page's, cached for this object. */
+	struct radix_tree_root   coh_tree;
+	/** Number of pages in radix tree. */
+	unsigned long	    coh_pages;
+	/** List of cl_lock's granted for this object. */
+	struct list_head	       coh_locks;
+
+	/**
+	 * Parent object. It is assumed that an object has a well-defined
+	 * parent, but not a well-defined child (there may be multiple
+	 * sub-objects, for the same top-object). cl_object_header::coh_parent
+	 * field allows certain code to be written generically, without
+	 * limiting possible cl_object layouts unduly.
+	 */
+	struct cl_object_header *coh_parent;
+	/**
+	 * Protects consistency between cl_attr of parent object and
+	 * attributes of sub-objects, that the former is calculated ("merged")
+	 * from.
+	 *
+	 * \todo XXX this can be read/write lock if needed.
+	 */
+	spinlock_t		 coh_attr_guard;
+	/**
+	 * Size of cl_page plus its page slices.
+	 */
+	unsigned short		 coh_page_bufsize;
+	/**
+	 * Number of objects above this one: 0 for a top-object, 1 for its
+	 * sub-object, etc.
+	 */
+	unsigned char		 coh_nesting;
+};
+
+/**
+ * Helper macro: iterate over all layers of the object \a obj, assigning each
+ * layer in turn, top-to-bottom, to \a slice.
+ */
+#define cl_object_for_each(slice, obj)				      \
+	list_for_each_entry((slice),				    \
+				&(obj)->co_lu.lo_header->loh_layers,	\
+				co_lu.lo_linkage)
+/**
+ * Helper macro: iterate over all layers of the object \a obj, assigning each
+ * layer in turn, bottom-to-top, to \a slice.
+ */
+#define cl_object_for_each_reverse(slice, obj)			       \
+	list_for_each_entry_reverse((slice),			     \
+					&(obj)->co_lu.lo_header->loh_layers, \
+					co_lu.lo_linkage)
+/** @} cl_object */
+
+#ifndef pgoff_t /* tests for a macro only; a typedef is not detected */
+#define pgoff_t unsigned long
+#endif /* pgoff_t */
+
+#define CL_PAGE_EOF ((pgoff_t)~0ull) /* all bits set in pgoff_t */
+
+/** \addtogroup cl_page cl_page
+ * @{ */
+
+/** \struct cl_page
+ * Layered client page.
+ *
+ * cl_page: represents a portion of a file, cached in the memory. All pages
+ *    of the given file are of the same size, and are kept in the radix tree
+ *    hanging off the cl_object. cl_page doesn't fan out, but as sub-objects
+ *    of the top-level file object are first class cl_objects, they have their
+ *    own radix trees of pages and hence page is implemented as a sequence of
+ *    struct cl_page's, linked into a doubly-linked list through
+ *    cl_page::cp_parent and cl_page::cp_child pointers, each residing in the
+ *    corresponding radix tree at the corresponding logical offset.
+ *
+ * cl_page is associated with a VM page of the hosting environment (struct
+ *    page in the Linux kernel, for example). It is assumed that this
+ *    association is implemented by one of the cl_page layers (top layer in
+ *    the current design) that
+ *
+ *	- intercepts per-VM-page call-backs made by the environment (e.g.,
+ *	  memory pressure),
+ *
+ *	- translates state (page flag bits) and locking between lustre and
+ *	  environment.
+ *
+ *    The association between cl_page and struct page is immutable and
+ *    established when cl_page is created.
+ *
+ * cl_page can be "owned" by a particular cl_io (see below), guaranteeing
+ *    this io an exclusive access to this page w.r.t. other io attempts and
+ *    various events changing page state (such as transfer completion, or
+ *    eviction of the page from the memory). Note, that in general cl_io
+ *    cannot be identified with a particular thread, and page ownership is not
+ *    exactly equal to the current thread holding a lock on the page. Layer
+ *    implementing association between cl_page and struct page has to implement
+ *    ownership on top of available synchronization mechanisms.
+ *
+ *    While lustre client maintains the notion of page ownership by io,
+ *    hosting MM/VM usually has its own page concurrency control
+ *    mechanisms. For example, in Linux, page access is synchronized by the
+ *    per-page PG_locked bit-lock, and generic kernel code (generic_file_*())
+ *    takes care to acquire and release such locks as necessary around the
+ *    calls to the file system methods (->readpage(), ->prepare_write(),
+ *    ->commit_write(), etc.). This leads to the situation when there are two
+ *    different ways to own a page in the client:
+ *
+ *	- client code explicitly and voluntarily owns the page (cl_page_own());
+ *
+ *	- VM locks a page and then calls the client, that has "to assume"
+ *	  the ownership from the VM (cl_page_assume()).
+ *
+ *    Dual methods to release ownership are cl_page_disown() and
+ *    cl_page_unassume().
+ *
+ * cl_page is reference counted (cl_page::cp_ref). When reference counter
+ *    drops to 0, the page is returned to the cache, unless it is in
+ *    cl_page_state::CPS_FREEING state, in which case it is immediately
+ *    destroyed.
+ *
+ *    The general logic guaranteeing the absence of "existential races" for
+ *    pages is the following:
+ *
+ *	- there are fixed known ways for a thread to obtain a new reference
+ *	  to a page:
+ *
+ *	    - by doing a lookup in the cl_object radix tree, protected by the
+ *	      spin-lock;
+ *
+ *	    - by starting from VM-locked struct page and following some
+ *	      hosting environment method (e.g., following ->private pointer in
+ *	      the case of Linux kernel), see cl_vmpage_page();
+ *
+ *	- when the page enters cl_page_state::CPS_FREEING state, all these
+ *	  ways are severed with the proper synchronization
+ *	  (cl_page_delete());
+ *
+ *	- entry into cl_page_state::CPS_FREEING is serialized by the VM page
+ *	  lock;
+ *
+ *	- no new references to the page in cl_page_state::CPS_FREEING state
+ *	  are allowed (checked in cl_page_get()).
+ *
+ *    Together this guarantees that when the last reference to a
+ *    cl_page_state::CPS_FREEING page is released, it is safe to destroy the
+ *    page, as at that point no references to it exist and no new ones can
+ *    be acquired.
+ *
+ * cl_page is a state machine. States are enumerated in enum
+ *    cl_page_state. Possible state transitions are enumerated in
+ *    cl_page_state_set(). State transition process (i.e., actual changing of
+ *    cl_page::cp_state field) is protected by the lock on the underlying VM
+ *    page.
+ *
+ * Linux Kernel implementation.
+ *
+ *    Binding between cl_page and the kernel's struct page is
+ *    implemented in the vvp layer. cl_page is attached to the
+ *    ->private pointer of the struct page, together with the setting of
+ *    PG_private bit in page->flags, and acquiring additional reference on the
+ *    struct page (much like struct buffer_head, or any similar file system
+ *    private data structures).
+ *
+ *    PG_locked lock is used to implement both ownership and transfer
+ *    synchronization, that is, page is VM-locked in CPS_{OWNED,PAGE{IN,OUT}}
+ *    states. No additional references are acquired for the duration of the
+ *    transfer.
+ *
+ * \warning *THIS IS NOT* the behavior expected by the Linux kernel, where
+ *	  write-out is "protected" by the special PG_writeback bit.
+ */
+
+/**
+ * States of cl_page. cl_page.c assumes particular order here.
+ *
+ * The page state machine is rather crude, as it doesn't recognize finer page
+ * states like "dirty" or "up to date". This is because such states are not
+ * always well defined for the whole stack (see, for example, the
+ * implementation of the read-ahead, that hides page up-to-dateness to track
+ * cache hits accurately). Such sub-states are maintained by the layers that
+ * are interested in them.
+ */
+enum cl_page_state {
+	/**
+	 * Page is in the cache, un-owned. Page leaves cached state in the
+	 * following cases:
+	 *
+	 *     - [cl_page_state::CPS_OWNED] io comes across the page and
+	 *     owns it;
+	 *
+	 *     - [cl_page_state::CPS_PAGEOUT] page is dirty, the
+	 *     req-formation engine decides that it wants to include this page
+	 *     into a cl_req being constructed, and yanks it from the cache;
+	 *
+	 *     - [cl_page_state::CPS_FREEING] VM callback is executed to
+	 *     evict the page from memory;
+	 *
+	 * \invariant cl_page::cp_owner == NULL && cl_page::cp_req == NULL
+	 */
+	CPS_CACHED,
+	/**
+	 * Page is exclusively owned by some cl_io. Page may end up in this
+	 * state as a result of
+	 *
+	 *     - io creating new page and immediately owning it;
+	 *
+	 *     - [cl_page_state::CPS_CACHED] io finding existing cached page
+	 *     and owning it;
+	 *
+	 *     - [cl_page_state::CPS_OWNED] io finding existing owned page
+	 *     and waiting for owner to release the page;
+	 *
+	 * Page leaves owned state in the following cases:
+	 *
+	 *     - [cl_page_state::CPS_CACHED] io decides to leave the page in
+	 *     the cache, doing nothing;
+	 *
+	 *     - [cl_page_state::CPS_PAGEIN] io starts read transfer for
+	 *     this page;
+	 *
+	 *     - [cl_page_state::CPS_PAGEOUT] io starts immediate write
+	 *     transfer for this page;
+	 *
+	 *     - [cl_page_state::CPS_FREEING] io decides to destroy this
+	 *     page (e.g., as part of truncate or extent lock cancellation).
+	 *
+	 * \invariant cl_page::cp_owner != NULL && cl_page::cp_req == NULL
+	 */
+	CPS_OWNED,
+	/**
+	 * Page is being written out, as a part of a transfer. This state is
+	 * entered when req-formation logic decided that it wants this page to
+	 * be sent through the wire _now_. Specifically, it means that once
+	 * this state is achieved, transfer completion handler (with either
+	 * success or failure indication) is guaranteed to be executed against
+	 * this page independently of any locks and any scheduling decisions
+	 * made by the hosting environment (that effectively means that the
+	 * page is never put into cl_page_state::CPS_PAGEOUT state "in
+	 * advance". This property is mentioned, because it is important when
+	 * reasoning about possible dead-locks in the system). The page can
+	 * enter this state as a result of
+	 *
+	 *     - [cl_page_state::CPS_OWNED] an io requesting an immediate
+	 *     write-out of this page, or
+	 *
+	 *     - [cl_page_state::CPS_CACHED] req-forming engine deciding
+	 *     that it has enough dirty pages cached to issue a "good"
+	 *     transfer.
+	 *
+	 * The page leaves cl_page_state::CPS_PAGEOUT state when the transfer
+	 * is completed---it is moved into cl_page_state::CPS_CACHED state.
+	 *
+	 * Underlying VM page is locked for the duration of transfer.
+	 *
+	 * \invariant: cl_page::cp_owner == NULL && cl_page::cp_req != NULL
+	 */
+	CPS_PAGEOUT,
+	/**
+	 * Page is being read in, as a part of a transfer. This is quite
+	 * similar to the cl_page_state::CPS_PAGEOUT state, except that
+	 * read-in is always "immediate"---there is no such thing as a sudden
+	 * construction of read cl_req from cached, presumably not up to date,
+	 * pages.
+	 *
+	 * Underlying VM page is locked for the duration of transfer.
+	 *
+	 * \invariant: cl_page::cp_owner == NULL && cl_page::cp_req != NULL
+	 */
+	CPS_PAGEIN,
+	/**
+	 * Page is being destroyed. This state is entered when client decides
+	 * that page has to be deleted from its host object, as, e.g., a part
+	 * of truncate.
+	 *
+	 * Once this state is reached, there is no way to escape it.
+	 *
+	 * \invariant: cl_page::cp_owner == NULL && cl_page::cp_req == NULL
+	 */
+	CPS_FREEING,
+	CPS_NR
+};
+
+enum cl_page_type {
+	/** Host page, the page is from the host inode which the cl_page
+	 * belongs to. */
+	CPT_CACHEABLE = 1,
+
+	/** Transient page, the transient cl_page is used to bind a cl_page
+	 *  to a vmpage that does not belong to the same object as the cl_page.
+	 *  It is used in DirectIO, lockless IO and liblustre. */
+	CPT_TRANSIENT,
+};
+
+/**
+ * Flags maintained for every cl_page (stored in cl_page::cp_flags).
+ */
+enum cl_page_flags {
+	/**
+	 * Set when page-in completes. Used for debugging (a read completes
+	 * at most once for a page).
+	 */
+	CPF_READ_COMPLETED = 1 << 0
+};
+
+/**
+ * Fields are protected by the lock on struct page, except for atomics and
+ * immutables.
+ *
+ * \invariant Data type invariants are in cl_page_invariant(). Basically:
+ * cl_page::cp_parent and cl_page::cp_child are a well-formed doubly-linked
+ * list, consistent with the parent/child pointers in the cl_page::cp_obj and
+ * cl_page::cp_owner (when set).
+ */
+struct cl_page {
+	/** Reference counter. */
+	atomic_t	     cp_ref;
+	/** An object this page is a part of. Immutable after creation. */
+	struct cl_object	*cp_obj;
+	/** Logical page index within the object. Immutable after creation. */
+	pgoff_t		  cp_index;
+	/** List of slices. Immutable after creation. */
+	struct list_head	       cp_layers;
+	/** Parent page, NULL for top-level page. Immutable after creation. */
+	struct cl_page	  *cp_parent;
+	/** Lower-layer page. NULL for bottommost page. Immutable after
+	 * creation. */
+	struct cl_page	  *cp_child;
+	/**
+	 * Page state. This field is const to avoid accidental update, it is
+	 * modified only internally within cl_page.c. Protected by a VM lock.
+	 */
+	const enum cl_page_state cp_state;
+	/** Linkage of pages within group. Protected by cl_page::cp_mutex. */
+	struct list_head		cp_batch;
+	/** Mutex serializing membership of a page in a batch. */
+	struct mutex		cp_mutex;
+	/** Linkage of pages within cl_req. */
+	struct list_head	       cp_flight;
+	/** Transfer error. */
+	int		      cp_error;
+
+	/**
+	 * Page type. Only CPT_TRANSIENT is used so far. Immutable after
+	 * creation.
+	 */
+	enum cl_page_type	cp_type;
+
+	/**
+	 * Owning IO in cl_page_state::CPS_OWNED state. Sub-page can be owned
+	 * by sub-io. Protected by a VM lock.
+	 */
+	struct cl_io	    *cp_owner;
+	/**
+	 * Debug information: the task owning the page.
+	 */
+	task_t	      *cp_task;
+	/**
+	 * Owning IO request in cl_page_state::CPS_PAGEOUT and
+	 * cl_page_state::CPS_PAGEIN states. This field is maintained only in
+	 * the top-level pages. Protected by a VM lock.
+	 */
+	struct cl_req	   *cp_req;
+	/** List of references to this page, for debugging. */
+	struct lu_ref	    cp_reference;
+	/** Link to an object, for debugging. */
+	struct lu_ref_link      *cp_obj_ref;
+	/** Link to a queue, for debugging. */
+	struct lu_ref_link      *cp_queue_ref;
+	/** Per-page flags from enum cl_page_flags. Protected by a VM lock. */
+	unsigned		 cp_flags;
+	/** Assigned if doing a sync_io */
+	struct cl_sync_io       *cp_sync_io;
+};
+
+/**
+ * Per-layer part of cl_page; slices are linked into cl_page::cp_layers.
+ *
+ * \see ccc_page, lov_page, osc_page
+ */
+struct cl_page_slice {
+	struct cl_page		  *cpl_page;
+	/**
+	 * Object slice corresponding to this page slice. Immutable after
+	 * creation.
+	 */
+	struct cl_object		*cpl_obj;
+	const struct cl_page_operations *cpl_ops;
+	/** Linkage into cl_page::cp_layers. Immutable after creation. */
+	struct list_head		       cpl_linkage;
+};
+
+/**
+ * Lock mode. For the client extent locks.
+ *
+ * \warning: cl_lock_mode_match() assumes particular ordering here.
+ * \ingroup cl_lock
+ */
+enum cl_lock_mode {
+	/**
+	 * Mode of a lock that protects no data, and exists only as a
+	 * placeholder. This is used for `glimpse' requests. A phantom lock
+	 * might get promoted to a real lock at some point.
+	 */
+	CLM_PHANTOM,
+	CLM_READ,
+	CLM_WRITE,
+	CLM_GROUP
+};
+
+/**
+ * Requested transfer type.
+ * \ingroup cl_req
+ */
+enum cl_req_type {
+	CRT_READ,
+	CRT_WRITE,
+	CRT_NR /* count of types above; sizes cl_page_operations::io[] */
+};
+
+/**
+ * Per-layer page operations.
+ *
+ * Methods taking an \a io argument are for the activity happening in the
+ * context of given \a io. Page is assumed to be owned by that io, except for
+ * the obvious cases (like cl_page_operations::cpo_own()).
+ *
+ * \see vvp_page_ops, lov_page_ops, osc_page_ops
+ */
+struct cl_page_operations {
+	/**
+	 * cl_page<->struct page methods. Only one layer in the stack has to
+	 * implement these. Current code assumes that this functionality is
+	 * provided by the topmost layer, see cl_page_disown0() as an example.
+	 */
+
+	/**
+	 * \return the underlying VM page. Optional.
+	 */
+	struct page *(*cpo_vmpage)(const struct lu_env *env,
+				  const struct cl_page_slice *slice);
+	/**
+	 * Called when \a io acquires this page into the exclusive
+	 * ownership. When this method returns, it is guaranteed that the page
+	 * is not owned by other io, and no transfer is going on against
+	 * it. Optional.
+	 *
+	 * \see cl_page_own()
+	 * \see vvp_page_own(), lov_page_own()
+	 */
+	int  (*cpo_own)(const struct lu_env *env,
+			const struct cl_page_slice *slice,
+			struct cl_io *io, int nonblock);
+	/** Called when ownership is yielded. Optional.
+	 *
+	 * \see cl_page_disown()
+	 * \see vvp_page_disown()
+	 */
+	void (*cpo_disown)(const struct lu_env *env,
+			   const struct cl_page_slice *slice, struct cl_io *io);
+	/**
+	 * Called for a page that is already "owned" by \a io from VM point of
+	 * view. Optional.
+	 *
+	 * \see cl_page_assume()
+	 * \see vvp_page_assume(), lov_page_assume()
+	 */
+	void (*cpo_assume)(const struct lu_env *env,
+			   const struct cl_page_slice *slice, struct cl_io *io);
+	/** Dual to cl_page_operations::cpo_assume(). Optional. Called
+	 * bottom-to-top when IO releases a page without actually unlocking
+	 * it.
+	 *
+	 * \see cl_page_unassume()
+	 * \see vvp_page_unassume()
+	 */
+	void (*cpo_unassume)(const struct lu_env *env,
+			     const struct cl_page_slice *slice,
+			     struct cl_io *io);
+	/**
+	 * Announces whether the page contains valid data or not by \a uptodate.
+	 *
+	 * \see cl_page_export()
+	 * \see vvp_page_export()
+	 */
+	void  (*cpo_export)(const struct lu_env *env,
+			    const struct cl_page_slice *slice, int uptodate);
+	/**
+	 * Unmaps page from the user space (if it is mapped).
+	 *
+	 * \see cl_page_unmap()
+	 * \see vvp_page_unmap()
+	 */
+	int (*cpo_unmap)(const struct lu_env *env,
+			 const struct cl_page_slice *slice, struct cl_io *io);
+	/**
+	 * Checks whether underlying VM page is locked (in the suitable
+	 * sense). Used for assertions.
+	 *
+	 * \retval    -EBUSY: page is protected by a lock of a given mode;
+	 * \retval  -ENODATA: page is not protected by a lock;
+	 * \retval	 0: this layer cannot decide. (Should never happen.)
+	 */
+	int (*cpo_is_vmlocked)(const struct lu_env *env,
+			       const struct cl_page_slice *slice);
+	/**
+	 * Page destruction.
+	 */
+
+	/**
+	 * Called when page is truncated from the object. Optional.
+	 *
+	 * \see cl_page_discard()
+	 * \see vvp_page_discard(), osc_page_discard()
+	 */
+	void (*cpo_discard)(const struct lu_env *env,
+			    const struct cl_page_slice *slice,
+			    struct cl_io *io);
+	/**
+	 * Called when page is removed from the cache, and is about to be
+	 * destroyed. Optional.
+	 *
+	 * \see cl_page_delete()
+	 * \see vvp_page_delete(), osc_page_delete()
+	 */
+	void (*cpo_delete)(const struct lu_env *env,
+			   const struct cl_page_slice *slice);
+	/** Destructor. Frees resources and slice itself. */
+	void (*cpo_fini)(const struct lu_env *env,
+			 struct cl_page_slice *slice);
+
+	/**
+	 * Checks whether the page is protected by a cl_lock. This is a
+	 * per-layer method, because certain layers have ways to check for the
+	 * lock much more efficiently than through the generic locks scan, or
+	 * implement locking mechanisms separate from cl_lock, e.g.,
+	 * LL_FILE_GROUP_LOCKED in vvp. If \a pending is true, check for locks
+	 * being canceled, or scheduled for cancellation as soon as the last
+	 * user goes away, too.
+	 *
+	 * \retval    -EBUSY: page is protected by a lock of a given mode;
+	 * \retval  -ENODATA: page is not protected by a lock;
+	 * \retval	 0: this layer cannot decide.
+	 *
+	 * \see cl_page_is_under_lock()
+	 */
+	int (*cpo_is_under_lock)(const struct lu_env *env,
+				 const struct cl_page_slice *slice,
+				 struct cl_io *io);
+
+	/**
+	 * Optional debugging helper. Prints given page slice.
+	 *
+	 * \see cl_page_print()
+	 */
+	int (*cpo_print)(const struct lu_env *env,
+			 const struct cl_page_slice *slice,
+			 void *cookie, lu_printer_t p);
+	/**
+	 * \name transfer
+	 *
+	 * Transfer methods. See comment on cl_req for a description of
+	 * transfer formation and life-cycle.
+	 *
+	 * @{
+	 */
+	/**
+	 * Request type dependent vector of operations.
+	 *
+	 * Transfer operations depend on transfer mode (cl_req_type). To avoid
+	 * passing transfer mode to each and every of these methods, and to
+	 * avoid branching on request type inside of the methods, separate
+	 * methods for cl_req_type::CRT_READ and cl_req_type::CRT_WRITE are
+	 * provided. That is, method invocation usually looks like
+	 *
+	 *	 slice->cpl_ops->io[req->crq_type].cpo_method(env, slice, ...);
+	 */
+	struct {
+		/**
+		 * Called when a page is submitted for a transfer as a part of
+		 * cl_page_list.
+		 *
+		 * \return    0	 : page is eligible for submission;
+		 * \return    -EALREADY : skip this page;
+		 * \return    -ve       : error.
+		 *
+		 * \see cl_page_prep()
+		 */
+		int  (*cpo_prep)(const struct lu_env *env,
+				 const struct cl_page_slice *slice,
+				 struct cl_io *io);
+		/**
+		 * Completion handler. This is guaranteed to be eventually
+		 * fired after cl_page_operations::cpo_prep() or
+		 * cl_page_operations::cpo_make_ready() call.
+		 *
+		 * This method can be called in a non-blocking context. It is
+		 * guaranteed however, that the page involved and its object
+		 * are pinned in memory (and, hence, calling cl_page_put() is
+		 * safe).
+		 *
+		 * \see cl_page_completion()
+		 */
+		void (*cpo_completion)(const struct lu_env *env,
+				       const struct cl_page_slice *slice,
+				       int ioret);
+		/**
+		 * Called when cached page is about to be added to the
+		 * cl_req as a part of req formation.
+		 *
+		 * \return    0       : proceed with this page;
+		 * \return    -EAGAIN : skip this page;
+		 * \return    -ve     : error.
+		 *
+		 * \see cl_page_make_ready()
+		 */
+		int  (*cpo_make_ready)(const struct lu_env *env,
+				       const struct cl_page_slice *slice);
+		/**
+		 * Announce that this page is to be written out
+		 * opportunistically, that is, page is dirty, it is not
+		 * necessary to start write-out transfer right now, but
+		 * eventually page has to be written out.
+		 *
+		 * Main caller of this is the write path (see
+		 * vvp_io_commit_write()), using this method to build a
+		 * "transfer cache" from which large transfers are then
+		 * constructed by the req-formation engine.
+		 *
+		 * \todo XXX it would make sense to add page-age tracking
+		 * semantics here, and to oblige the req-formation engine to
+		 * send the page out before it gets too old.
+		 *
+		 * \see cl_page_cache_add()
+		 */
+		int  (*cpo_cache_add)(const struct lu_env *env,
+				      const struct cl_page_slice *slice,
+				      struct cl_io *io);
+	} io[CRT_NR];
+	/**
+	 * Tell transfer engine that only [from, to] part of a page should be
+	 * transmitted.
+	 *
+	 * This is used for immediate transfers.
+	 *
+	 * \todo XXX this is not very good interface. It would be much better
+	 * if all transfer parameters were supplied as arguments to
+	 * cl_io_operations::cio_submit() call, but it is not clear how to do
+	 * this for page queues.
+	 *
+	 * \see cl_page_clip()
+	 */
+	void (*cpo_clip)(const struct lu_env *env,
+			 const struct cl_page_slice *slice,
+			 int from, int to);
+	/**
+	 * \pre  the page was queued for transferring.
+	 * \post page is removed from client's pending list, or -EBUSY
+	 *       is returned if it has already been in transferring.
+	 *
+	 * This is one of the few page operations which:
+	 * 0. is called from top level;
+	 * 1. doesn't have vmpage locked;
+	 * 2. every layer should synchronize execution of its ->cpo_cancel()
+	 *    with completion handlers. Osc uses client obd lock for this
+	 *    purpose. Since there is no vvp_page_cancel() or
+	 *    lov_page_cancel(), cpo_cancel is de facto protected by client lock.
+	 *
+	 * \see osc_page_cancel().
+	 */
+	int (*cpo_cancel)(const struct lu_env *env,
+			  const struct cl_page_slice *slice);
+	/**
+	 * Write out a page by kernel. This is only called by ll_writepage
+	 * right now.
+	 *
+	 * \see cl_page_flush()
+	 */
+	int (*cpo_flush)(const struct lu_env *env,
+			 const struct cl_page_slice *slice,
+			 struct cl_io *io);
+	/** @} transfer */
+};
+
+/**
+ * Helper macro: logs \a page in detail, then the message, if \a mask is shown.
+ */
+#define CL_PAGE_DEBUG(mask, env, page, format, ...)		     \
+do {								    \
+	LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL);		\
+									\
+	if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) {		   \
+		cl_page_print(env, &msgdata, lu_cdebug_printer, page);  \
+		CDEBUG(mask, format , ## __VA_ARGS__);		  \
+	}							       \
+} while (0)
+
+/**
+ * Helper macro: logs a short \a page header, then the message, if \a mask is shown.
+ */
+#define CL_PAGE_HEADER(mask, env, page, format, ...)			  \
+do {									  \
+	LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL);		      \
+									      \
+	if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) {			 \
+		cl_page_header_print(env, &msgdata, lu_cdebug_printer, page); \
+		CDEBUG(mask, format , ## __VA_ARGS__);			\
+	}								     \
+} while (0)
+
+static inline int __page_in_use(const struct cl_page *page, int refc)
+{
+	/* CPT_CACHEABLE pages are allowed one reference beyond \a refc. */
+	const int limit = (page->cp_type == CPT_CACHEABLE) ? refc + 1 : refc;
+	LASSERT(atomic_read(&page->cp_ref) > 0);
+	return atomic_read(&page->cp_ref) > limit;
+}
+#define cl_page_in_use(pg)       __page_in_use(pg, 1)
+#define cl_page_in_use_noref(pg) __page_in_use(pg, 0)
+
+/** @} cl_page */
+
+/** \addtogroup cl_lock cl_lock
+ * @{ */
+/** \struct cl_lock
+ *
+ * Extent locking on the client.
+ *
+ * LAYERING
+ *
+ * The locking model of the new client code is built around
+ *
+ *	struct cl_lock
+ *
+ * data-type representing an extent lock on a regular file. cl_lock is a
+ * layered object (much like cl_object and cl_page), it consists of a header
+ * (struct cl_lock) and a list of layers (struct cl_lock_slice), linked to
+ * cl_lock::cll_layers list through cl_lock_slice::cls_linkage.
+ *
+ * All locks for a given object are linked into cl_object_header::coh_locks
+ * list (protected by cl_object_header::coh_lock_guard spin-lock) through
+ * cl_lock::cll_linkage. Currently this list is not sorted in any way. We
+ * could sort it by starting lock offset, or use an altogether different
+ * data structure, like a tree.
+ *
+ * Typical cl_lock consists of the two layers:
+ *
+ *     - vvp_lock (vvp specific data), and
+ *     - lov_lock (lov specific data).
+ *
+ * lov_lock contains an array of sub-locks. Each of these sub-locks is a
+ * normal cl_lock: it has a header (struct cl_lock) and a list of layers:
+ *
+ *     - lovsub_lock, and
+ *     - osc_lock
+ *
+ * Each sub-lock is associated with a cl_object (representing stripe
+ * sub-object or the file with which top-level cl_lock is associated), and is
+ * linked into that cl_object::coh_locks. In this respect cl_lock is similar to
+ * cl_object (that at lov layer also fans out into multiple sub-objects), and
+ * is different from cl_page, that doesn't fan out (there is usually exactly
+ * one osc_page for every vvp_page). We shall call vvp-lov portion of the lock
+ * a "top-lock" and its lovsub-osc portion a "sub-lock".
+ *
+ * LIFE CYCLE
+ *
+ * cl_lock is reference counted. When reference counter drops to 0, lock is
+ * placed in the cache, except when lock is in CLS_FREEING state. CLS_FREEING
+ * lock is destroyed when last reference is released. Referencing between
+ * top-lock and its sub-locks is described in the lov documentation module.
+ *
+ * STATE MACHINE
+ *
+ * Also, cl_lock is a state machine. This requires some clarification. One of
+ * the goals of client IO re-write was to make IO path non-blocking, or at
+ * least to make it easier to make it non-blocking in the future. Here
+ * `non-blocking' means that when a system call (read, write, truncate)
+ * reaches a situation where it has to wait for a communication with the
+ * server, it should --instead of waiting-- remember its current state and
+ * switch to some other work.  E.g., instead of waiting for a lock enqueue,
+ * client should proceed doing IO on the next stripe, etc. Obviously this is
+ * rather radical redesign, and it is not planned to be fully implemented at
+ * this time; instead we are putting some infrastructure in place that would
+ * make it easier to do asynchronous non-blocking IO in the
+ * future. Specifically, where old locking code goes to sleep (waiting for
+ * enqueue, for example), new code returns cl_lock_transition::CLO_WAIT. When
+ * enqueue reply comes, its completion handler signals that lock state-machine
+ * is ready to transit to the next state. There is some generic code in
+ * cl_lock.c that sleeps, waiting for these signals. As a result, for users of
+ * this cl_lock.c code, it looks like locking is done in normal blocking
+ * fashion, and at the same time it is possible to switch to the non-blocking
+ * locking (simply by returning cl_lock_transition::CLO_WAIT from cl_lock.c
+ * functions).
+ *
+ * For a description of state machine states and transitions see enum
+ * cl_lock_state.
+ *
+ * There are two ways to restrict a set of states which lock might move to:
+ *
+ *     - placing a "hold" on a lock guarantees that lock will not be moved
+ *       into cl_lock_state::CLS_FREEING state until hold is released. Hold
+ *       can be only acquired on a lock that is not in
+ *       cl_lock_state::CLS_FREEING. All holds on a lock are counted in
+ *       cl_lock::cll_holds. Hold protects lock from cancellation and
+ *       destruction. Requests to cancel and destroy a lock on hold will be
+ *       recorded, but only honored when last hold on a lock is released;
+ *
+ *     - placing a "user" on a lock guarantees that lock will not leave
+ *       cl_lock_state::CLS_NEW, cl_lock_state::CLS_QUEUING,
+ *       cl_lock_state::CLS_ENQUEUED and cl_lock_state::CLS_HELD set of
+ *       states, once it enters this set. That is, if a user is added onto a
+ *       lock in a state not from this set, it doesn't immediately enforce
+ *       lock to move to this set, but once lock enters this set it will
+ *       remain there until all users are removed. Lock users are counted in
+ *       cl_lock::cll_users.
+ *
+ *       User is used to assure that lock is not canceled or destroyed while
+ *       it is being enqueued, or actively used by some IO.
+ *
+ *       Currently, a user always comes with a hold (cl_lock_invariant()
+ *       checks that a number of holds is not less than a number of users).
+ *
+ * CONCURRENCY
+ *
+ * This is how lock state-machine operates. struct cl_lock contains a mutex
+ * cl_lock::cll_guard that protects struct fields.
+ *
+ *     - mutex is taken, and cl_lock::cll_state is examined.
+ *
+ *     - for every state there are possible target states where lock can move
+ *       into. They are tried in order. Attempts to move into next state are
+ *       done by _try() functions in cl_lock.c:cl_{enqueue,unuse,wait}_try().
+ *
+ *     - if the transition can be performed immediately, state is changed,
+ *       and mutex is released.
+ *
+ *     - if the transition requires blocking, _try() function returns
+ *       cl_lock_transition::CLO_WAIT. Caller unlocks mutex and goes to
+ *       sleep, waiting for possibility of lock state change. It is woken
+ *       up when some event occurs, that makes lock state change possible
+ *       (e.g., the reception of the reply from the server), and repeats
+ *       the loop.
+ *
+ * Top-lock and sub-lock have separate mutexes and the latter has to be taken
+ * first to avoid dead-lock.
+ *
+ * To see an example of interaction of all these issues, take a look at the
+ * lov_cl.c:lov_lock_enqueue() function. It is called as a part of
+ * cl_enqueue_try(), and tries to advance top-lock to ENQUEUED state, by
+ * advancing state-machines of its sub-locks (lov_lock_enqueue_one()). Note
+ * also, that it uses trylock to grab sub-lock mutex to avoid dead-lock. It
+ * also has to handle CEF_ASYNC enqueue, when sub-locks enqueues have to be
+ * done in parallel, rather than one after another (this is used for glimpse
+ * locks, that cannot dead-lock).
+ *
+ * INTERFACE AND USAGE
+ *
+ * struct cl_lock_operations provide a number of call-backs that are invoked
+ * when events of interest occur. Layers can intercept and handle glimpse,
+ * blocking, cancel ASTs and a reception of the reply from the server.
+ *
+ * One important difference with the old client locking model is that new
+ * client has a representation for the top-lock, whereas in the old code only
+ * sub-locks existed as real data structures and file-level locks were
+ * represented by "request sets" that were created and destroyed on each and
+ * every lock creation.
+ *
+ * Top-locks are cached, and can be found in the cache by the system calls. It
+ * is possible that top-lock is in cache, but some of its sub-locks were
+ * canceled and destroyed. In that case top-lock has to be enqueued again
+ * before it can be used.
+ *
+ * Overall process of the locking during IO operation is as follows:
+ *
+ *     - once parameters for IO are setup in cl_io, cl_io_operations::cio_lock()
+ *       is called on each layer. Responsibility of this method is to add locks,
+ *       needed by a given layer into cl_io.ci_lockset.
+ *
+ *     - once locks for all layers were collected, they are sorted to avoid
+ *       dead-locks (cl_io_locks_sort()), and enqueued.
+ *
+ *     - when all locks are acquired, IO is performed;
+ *
+ *     - locks are released into cache.
+ *
+ * Striping introduces major additional complexity into locking. The
+ * fundamental problem is that it is generally unsafe to actively use (hold)
+ * two locks on the different OST servers at the same time, as this introduces
+ * inter-server dependency and can lead to cascading evictions.
+ *
+ * Basic solution is to sub-divide large read/write IOs into smaller pieces so
+ * that no multi-stripe locks are taken (note that this design abandons POSIX
+ * read/write semantics). Such pieces ideally can be executed concurrently. At
+ * the same time, certain types of IO cannot be sub-divided, without
+ * sacrificing correctness. This includes:
+ *
+ *  - O_APPEND write, where [0, EOF] lock has to be taken, to guarantee
+ *  atomicity;
+ *
+ *  - ftruncate(fd, offset), where [offset, EOF] lock has to be taken.
+ *
+ * Also, in the case of read(fd, buf, count) or write(fd, buf, count), where
+ * buf is a part of memory mapped Lustre file, a lock or locks protecting buf
+ * has to be held together with the usual lock on [offset, offset + count].
+ *
+ * As multi-stripe locks have to be allowed, it makes sense to cache them, so
+ * that, for example, a sequence of O_APPEND writes can proceed quickly
+ * without going down to the individual stripes to do lock matching. On the
+ * other hand, multi-stripe locks shouldn't be used by normal read/write
+ * calls. To achieve this, every layer can implement ->clo_fits_into() method,
+ * that is called by lock matching code (cl_lock_lookup()), and that can be
+ * used to selectively disable matching of certain locks for certain IOs. For
+ * example, lov layer implements lov_lock_fits_into() that allows multi-stripe
+ * locks to be matched only for truncates and O_APPEND writes.
+ *
+ * Interaction with DLM
+ *
+ * In the expected setup, cl_lock is ultimately backed up by a collection of
+ * DLM locks (struct ldlm_lock). Association between cl_lock and DLM lock is
+ * implemented in osc layer, that also matches DLM events (ASTs, cancellation,
+ * etc.) into cl_lock_operation calls. See struct osc_lock for a more detailed
+ * description of interaction with DLM.
+ */
+
+/**
+ * Lock description: the object a lock is granted for, the inclusive page
+ * extent [cld_start, cld_end] it protects, its mode, group ID and enqueue
+ * flags.
+ */
+struct cl_lock_descr {
+	/** Object this lock is granted for. */
+	struct cl_object *cld_obj;
+	/** Index of the first page protected by this lock. */
+	pgoff_t	   cld_start;
+	/** Index of the last page (inclusive) protected by this lock. */
+	pgoff_t	   cld_end;
+	/** Group ID, for group locks. */
+	__u64	     cld_gid;
+	/** Lock mode. */
+	enum cl_lock_mode cld_mode;
+	/**
+	 * Flags used to enqueue the lock. A combination of bit-flags from
+	 * enum cl_enq_flags.
+	 */
+	__u32	     cld_enq_flags;
+};
+
+#define DDESCR "%s(%d):[%lu, %lu]" /* format for PDESCR(): mode-name(mode):[start, end] */
+#define PDESCR(descr)						   \
+	cl_lock_mode_name((descr)->cld_mode), (descr)->cld_mode,	\
+	(descr)->cld_start, (descr)->cld_end
+
+const char *cl_lock_mode_name(const enum cl_lock_mode mode); /* string fed to DDESCR's %s */
+
+/**
+ * Lock state-machine states.
+ *
+ * \htmlonly
+ * <pre>
+ *
+ * Possible state transitions:
+ *
+ *	      +------------------>NEW
+ *	      |		    |
+ *	      |		    | cl_enqueue_try()
+ *	      |		    |
+ *	      |    cl_unuse_try()  V
+ *	      |  +--------------QUEUING (*)
+ *	      |  |		 |
+ *	      |  |		 | cl_enqueue_try()
+ *	      |  |		 |
+ *	      |  | cl_unuse_try()  V
+ *    sub-lock  |  +-------------ENQUEUED (*)
+ *    canceled  |  |		 |
+ *	      |  |		 | cl_wait_try()
+ *	      |  |		 |
+ *	      |  |		(R)
+ *	      |  |		 |
+ *	      |  |		 V
+ *	      |  |		HELD<---------+
+ *	      |  |		 |	    |
+ *	      |  |		 |	    | cl_use_try()
+ *	      |  |  cl_unuse_try() |	    |
+ *	      |  |		 |	    |
+ *	      |  |		 V	 ---+
+ *	      |  +------------>INTRANSIT (D) <--+
+ *	      |		    |	    |
+ *	      |     cl_unuse_try() |	    | cached lock found
+ *	      |		    |	    | cl_use_try()
+ *	      |		    |	    |
+ *	      |		    V	    |
+ *	      +------------------CACHED---------+
+ *				   |
+ *				  (C)
+ *				   |
+ *				   V
+ *				FREEING
+ *
+ * Legend:
+ *
+ *	 In states marked with (*) transition to the same state (i.e., a loop
+ *	 in the diagram) is possible.
+ *
+ *	 (R) is the point where Receive call-back is invoked: it allows layers
+ *	 to handle arrival of lock reply.
+ *
+ *	 (C) is the point where Cancellation call-back is invoked.
+ *
+ *	 (D) is the transit state which means the lock is changing.
+ *
+ *	 Transition to FREEING state is possible from any other state in the
+ *	 diagram in case of unrecoverable error.
+ * </pre>
+ * \endhtmlonly
+ *
+ * These states are for individual cl_lock object. Top-lock and its sub-locks
+ * can be in the different states. Another way to say this is that we have
+ * nested state-machines.
+ *
+ * Separate QUEUING and ENQUEUED states are needed to support non-blocking
+ * operation for locks with multiple sub-locks. Imagine lock on a file F, that
+ * intersects 3 stripes S0, S1, and S2. To enqueue F client has to send
+ * enqueue to S0, wait for its completion, then send enqueue for S1, wait for
+ * its completion and at last enqueue lock for S2, and wait for its
+ * completion. In that case, top-lock is in QUEUING state while S0, S1 are
+ * handled, and is in ENQUEUED state after enqueue to S2 has been sent (note
+ * that in this case, sub-locks move from state to state, and top-lock remains
+ * in the same state).
+ */
+enum cl_lock_state {
+	/**
+	 * Lock that hasn't been enqueued yet.
+	 */
+	CLS_NEW,
+	/**
+	 * Enqueue is in progress, blocking for some intermediate interaction
+	 * with the other side.
+	 */
+	CLS_QUEUING,
+	/**
+	 * Lock is fully enqueued, waiting for server to reply when it is
+	 * granted.
+	 */
+	CLS_ENQUEUED,
+	/**
+	 * Lock granted, actively used by some IO.
+	 */
+	CLS_HELD,
+	/**
+	 * Transitional state marking that the lock is in the middle of being
+	 * used or unused.
+	 * We need this state because the lock may have several sublocks,
+	 * so it's impossible to have an atomic way to bring all sublocks
+	 * into CLS_HELD state at use case, or all sublocks to CLS_CACHED
+	 * at unuse case.
+	 * If a thread is referring to a lock, and it sees the lock is in this
+	 * state, it must wait for the lock.
+	 * See state diagram for details.
+	 */
+	CLS_INTRANSIT,
+	/**
+	 * Lock granted, not used.
+	 */
+	CLS_CACHED,
+	/**
+	 * Lock is being destroyed.
+	 */
+	CLS_FREEING,
+	CLS_NR /* number of states listed above; last enumerator */
+};
+
+enum cl_lock_flags { /* bit flags kept in cl_lock::cll_flags */
+	/**
+	 * Lock has been cancelled. This flag is never cleared once set (by
+	 * cl_lock_cancel0()).
+	 */
+	CLF_CANCELLED  = 1 << 0,
+	/** Cancellation is pending for this lock. */
+	CLF_CANCELPEND = 1 << 1,
+	/** Destruction is pending for this lock. */
+	CLF_DOOMED     = 1 << 2,
+	/** From enqueue RPC reply upcall. */
+	CLF_FROM_UPCALL= 1 << 3,
+};
+
+/**
+ * Lock closure.
+ *
+ * Lock closure is a collection of locks (both top-locks and sub-locks) that
+ * might be updated as a result of an operation on a certain lock (which lock
+ * this is a closure of).
+ *
+ * Closures are needed to guarantee dead-lock freedom in the presence of
+ *
+ *     - nested state-machines (top-lock state-machine composed of sub-lock
+ *       state-machines), and
+ *
+ *     - shared sub-locks.
+ *
+ * Specifically, many operations, such as lock enqueue, wait, unlock,
+ * etc. start from a top-lock, and then operate on the sub-locks of this
+ * top-lock, holding a top-lock mutex. When sub-lock state changes as a result
+ * of such operation, this change has to be propagated to all top-locks that
+ * share this sub-lock. Obviously, no natural lock ordering (e.g.,
+ * top-to-bottom or bottom-to-top) captures this scenario, so try-locking has
+ * to be used. Lock closure systematizes this try-and-repeat logic.
+ */
+struct cl_lock_closure {
+	/**
+	 * Lock that is mutexed when closure construction is started. When
+	 * closure is in `wait' mode (cl_lock_closure::clc_wait), mutex on
+	 * origin is released before waiting.
+	 */
+	struct cl_lock   *clc_origin;
+	/**
+	 * List of enclosed locks, so far. Locks are linked here through
+	 * cl_lock::cll_inclosure.
+	 */
+	struct list_head	clc_list;
+	/**
+	 * True iff closure is in a `wait' mode. This determines what
+	 * cl_lock_enclosure() does when a lock L to be added to the closure
+	 * is currently mutexed by some other thread.
+	 *
+	 * If cl_lock_closure::clc_wait is not set, then closure construction
+	 * fails with CLO_REPEAT immediately.
+	 *
+	 * In wait mode, cl_lock_enclosure() waits until next attempt to build
+	 * a closure might succeed. To this end it releases an origin mutex
+	 * (cl_lock_closure::clc_origin), that has to be the only lock mutex
+	 * owned by the current thread, and then waits on L mutex (by grabbing
+	 * it and immediately releasing), before returning CLO_REPEAT to the
+	 * caller.
+	 */
+	int	       clc_wait;
+	/** Number of locks in the closure. */
+	int	       clc_nr;
+};
+
+/**
+ * Layered client lock.
+ *
+ * Per-layer state lives in cl_lock_slice objects, linked into
+ * cl_lock::cll_layers through cl_lock_slice::cls_linkage.
+ */
+struct cl_lock {
+	/** Reference counter. */
+	atomic_t	  cll_ref;
+	/** List of slices. Immutable after creation. */
+	struct list_head	    cll_layers;
+	/**
+	 * Linkage into cl_lock::cll_descr::cld_obj::coh_locks list. Protected
+	 * by cl_lock::cll_descr::cld_obj::coh_lock_guard.
+	 */
+	struct list_head	    cll_linkage;
+	/**
+	 * Parameters of this lock. Protected by
+	 * cl_lock::cll_descr::cld_obj::coh_lock_guard nested within
+	 * cl_lock::cll_guard. Modified only on lock creation and in
+	 * cl_lock_modify().
+	 */
+	struct cl_lock_descr  cll_descr;
+	/** Protected by cl_lock::cll_guard. */
+	enum cl_lock_state    cll_state;
+	/** Wait-queue that is signaled on state changes. */
+	wait_queue_head_t	   cll_wq;
+	/**
+	 * Recursive lock, most fields in cl_lock{} are protected by this.
+	 *
+	 * Locking rules: this mutex is never held across network
+	 * communication, except when lock is being canceled.
+	 *
+	 * Lock ordering: a mutex of a sub-lock is taken first, then a mutex
+	 * on a top-lock. Other direction is implemented through a
+	 * try-lock-repeat loop. Mutexes of unrelated locks can be taken only
+	 * by try-locking.
+	 *
+	 * NOTE(review): cll_guarder and cll_depth below presumably record the
+	 * task owning this mutex and its recursion depth -- confirm against
+	 * the mutex helpers in cl_lock.c.
+	 *
+	 * \see osc_lock_enqueue_wait(), lov_lock_cancel(), lov_sublock_wait().
+	 */
+	struct mutex		cll_guard;
+	task_t	   *cll_guarder;
+	int		   cll_depth;
+
+	/**
+	 * The owner for INTRANSIT state.
+	 */
+	task_t	   *cll_intransit_owner;
+	int		   cll_error; /* set when a state transition method fails */
+	/**
+	 * Number of holds on a lock. A hold prevents a lock from being
+	 * canceled and destroyed. Protected by cl_lock::cll_guard.
+	 *
+	 * \see cl_lock_hold(), cl_lock_unhold(), cl_lock_release()
+	 */
+	int		   cll_holds;
+	 /**
+	  * Number of lock users. Valid in cl_lock_state::CLS_HELD state
+	  * only. Lock user pins lock in CLS_HELD state. Protected by
+	  * cl_lock::cll_guard.
+	  *
+	  * \see cl_wait(), cl_unuse().
+	  */
+	int		   cll_users;
+	/**
+	 * Flag bit-mask. Values from enum cl_lock_flags. Updates are
+	 * protected by cl_lock::cll_guard.
+	 */
+	unsigned long	 cll_flags;
+	/**
+	 * A linkage into a list of locks in a closure.
+	 *
+	 * \see cl_lock_closure
+	 */
+	struct list_head	    cll_inclosure;
+	/**
+	 * Conflicting lock at queuing time.
+	 */
+	struct cl_lock       *cll_conflict;
+	/**
+	 * A list of references to this lock, for debugging.
+	 */
+	struct lu_ref	 cll_reference;
+	/**
+	 * A list of holds on this lock, for debugging.
+	 */
+	struct lu_ref	 cll_holders;
+	/**
+	 * A reference for cl_lock::cll_descr::cld_obj. For debugging.
+	 */
+	struct lu_ref_link   *cll_obj_ref;
+#ifdef CONFIG_LOCKDEP
+	/* "dep_map" name is assumed by lockdep.h macros. */
+	struct lockdep_map    dep_map;
+#endif
+};
+
+/**
+ * Per-layer part of cl_lock. Slices of one lock are chained into
+ * cl_lock::cll_layers through cl_lock_slice::cls_linkage.
+ *
+ * \see ccc_lock, lov_lock, lovsub_lock, osc_lock
+ */
+struct cl_lock_slice {
+	struct cl_lock		  *cls_lock;
+	/** Object slice corresponding to this lock slice. Immutable after
+	 * creation. */
+	struct cl_object		*cls_obj;
+	const struct cl_lock_operations *cls_ops;
+	/** Linkage into cl_lock::cll_layers. Immutable after creation. */
+	struct list_head		       cls_linkage;
+};
+
+/**
+ * Possible (non-error) return values of ->clo_{enqueue,wait,unuse}().
+ *
+ * NOTE: lov_subresult() depends on ordering here.
+ */
+enum cl_lock_transition {
+	/** Operation cannot be completed immediately. Wait for state change. */
+	CLO_WAIT	= 1,
+	/** Operation had to release lock mutex, restart. */
+	CLO_REPEAT      = 2,
+	/** Lower layer re-enqueued. */
+	CLO_REENQUEUED  = 3,
+};
+
+/**
+ * Per-layer lock operations.
+ *
+ * \see vvp_lock_ops, lov_lock_ops, lovsub_lock_ops, osc_lock_ops
+ */
+struct cl_lock_operations {
+	/**
+	 * \name statemachine
+	 *
+	 * State machine transitions. These methods are called to transfer
+	 * lock from one state to another, as described in the commentary
+	 * above enum #cl_lock_state.
+	 *
+	 * \retval 0	  this layer has nothing more to do before
+	 *		       transition to the target state happens;
+	 *
+	 * \retval CLO_REPEAT method had to release and re-acquire cl_lock
+	 *		    mutex, repeat invocation of transition method
+	 *		    across all layers;
+	 *
+	 * \retval CLO_WAIT   this layer cannot move to the target state
+	 *		    immediately, as it has to wait for certain event
+	 *		    (e.g., the communication with the server). It
+	 *		    is guaranteed, that when the state transfer
+	 *		    becomes possible, cl_lock::cll_wq wait-queue
+	 *		    is signaled. Caller can wait for this event by
+	 *		    calling cl_lock_state_wait();
+	 *
+	 * \retval -ve	failure, abort state transition, move the lock
+	 *		    into cl_lock_state::CLS_FREEING state, and set
+	 *		    cl_lock::cll_error.
+	 *
+	 * Once all layers voted to agree to transition (by returning 0), lock
+	 * is moved into corresponding target state. All state transition
+	 * methods are optional.
+	 */
+	/** @{ */
+	/**
+	 * Attempts to enqueue the lock. Called top-to-bottom.
+	 *
+	 * \see ccc_lock_enqueue(), lov_lock_enqueue(), lovsub_lock_enqueue(),
+	 * \see osc_lock_enqueue()
+	 */
+	int  (*clo_enqueue)(const struct lu_env *env,
+			    const struct cl_lock_slice *slice,
+			    struct cl_io *io, __u32 enqflags);
+	/**
+	 * Attempts to wait for enqueue result. Called top-to-bottom.
+	 *
+	 * \see ccc_lock_wait(), lov_lock_wait(), osc_lock_wait()
+	 */
+	int  (*clo_wait)(const struct lu_env *env,
+			 const struct cl_lock_slice *slice);
+	/**
+	 * Attempts to unlock the lock. Called bottom-to-top. In addition to
+	 * usual return values of lock state-machine methods, this can return
+	 * -ESTALE to indicate that lock cannot be returned to the cache, and
+	 * has to be re-initialized.
+	 * unuse is a one-shot operation, so it must NOT return CLO_WAIT.
+	 *
+	 * \see ccc_lock_unuse(), lov_lock_unuse(), osc_lock_unuse()
+	 */
+	int  (*clo_unuse)(const struct lu_env *env,
+			  const struct cl_lock_slice *slice);
+	/**
+	 * Notifies layer that a cached lock is starting to be used.
+	 *
+	 * \pre lock->cll_state == CLS_CACHED
+	 *
+	 * \see lov_lock_use(), osc_lock_use()
+	 */
+	int  (*clo_use)(const struct lu_env *env,
+			const struct cl_lock_slice *slice);
+	/** @} statemachine */
+	/**
+	 * A method invoked when lock state is changed (as a result of state
+	 * transition). This is used, for example, to track when the state of
+	 * a sub-lock changes, to propagate this change to the corresponding
+	 * top-lock. Optional.
+	 *
+	 * \see lovsub_lock_state()
+	 */
+	void (*clo_state)(const struct lu_env *env,
+			  const struct cl_lock_slice *slice,
+			  enum cl_lock_state st);
+	/**
+	 * Returns true, iff given lock is suitable for the given io, idea
+	 * being, that there are certain "unsafe" locks, e.g., ones acquired
+	 * for O_APPEND writes, that we don't want to re-use for a normal
+	 * write, to avoid the danger of cascading evictions. Optional. Runs
+	 * under cl_object_header::coh_lock_guard.
+	 *
+	 * XXX this should take more information about lock needed by
+	 * io. Probably lock description or something similar.
+	 *
+	 * \see lov_lock_fits_into()
+	 */
+	int (*clo_fits_into)(const struct lu_env *env,
+			     const struct cl_lock_slice *slice,
+			     const struct cl_lock_descr *need,
+			     const struct cl_io *io);
+	/**
+	 * \name ast
+	 * Asynchronous System Traps. All of them are optional, all are
+	 * executed bottom-to-top.
+	 */
+	/** @{ */
+
+	/**
+	 * Cancellation callback. Cancel a lock voluntarily, or under
+	 * the request of server.
+	 */
+	void (*clo_cancel)(const struct lu_env *env,
+			   const struct cl_lock_slice *slice);
+	/**
+	 * Lock weighting ast. Executed to estimate how precious this lock
+	 * is. The sum of results across all layers is used to determine
+	 * whether lock is worth keeping in cache given present memory usage.
+	 *
+	 * \see osc_lock_weigh(), vvp_lock_weigh(), lovsub_lock_weigh().
+	 */
+	unsigned long (*clo_weigh)(const struct lu_env *env,
+				   const struct cl_lock_slice *slice);
+	/** @} ast */
+
+	/**
+	 * \see lovsub_lock_closure()
+	 */
+	int (*clo_closure)(const struct lu_env *env,
+			   const struct cl_lock_slice *slice,
+			   struct cl_lock_closure *closure);
+	/**
+	 * Executed bottom-to-top when lock description changes (e.g., as a
+	 * result of server granting more generous lock than was requested).
+	 *
+	 * \see lovsub_lock_modify()
+	 */
+	int (*clo_modify)(const struct lu_env *env,
+			  const struct cl_lock_slice *slice,
+			  const struct cl_lock_descr *updated);
+	/**
+	 * Notifies layers (bottom-to-top) that lock is going to be
+	 * destroyed. Responsibility of layers is to prevent new references on
+	 * this lock from being acquired once this method returns.
+	 *
+	 * This can be called multiple times due to the races.
+	 *
+	 * \see cl_lock_delete()
+	 * \see osc_lock_delete(), lovsub_lock_delete()
+	 */
+	void (*clo_delete)(const struct lu_env *env,
+			   const struct cl_lock_slice *slice);
+	/**
+	 * Destructor. Frees resources and the slice.
+	 *
+	 * \see ccc_lock_fini(), lov_lock_fini(), lovsub_lock_fini(),
+	 * \see osc_lock_fini()
+	 */
+	void (*clo_fini)(const struct lu_env *env, struct cl_lock_slice *slice);
+	/**
+	 * Optional debugging helper. Prints given lock slice.
+	 */
+	int (*clo_print)(const struct lu_env *env,
+			 void *cookie, lu_printer_t p,
+			 const struct cl_lock_slice *slice);
+};
+
+#define CL_LOCK_DEBUG(mask, env, lock, format, ...)		     \
+do {								    \
+	LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL);		\
+	/* Dump lock and message only when cfs_cdebug_show() allows mask. */ \
+	if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) {		   \
+		cl_lock_print(env, &msgdata, lu_cdebug_printer, lock);  \
+		CDEBUG(mask, format , ## __VA_ARGS__);		  \
+	}							       \
+} while (0)
+
+#define CL_LOCK_ASSERT(expr, env, lock) do {			    \
+	if (likely(expr))					       \
+		break;						  \
+	/* expr is false: log lock state and the expression, then LBUG(). */ \
+	CL_LOCK_DEBUG(D_ERROR, env, lock, "failed at %s.\n", #expr);    \
+	LBUG();							 \
+} while (0)
+
+/** @} cl_lock */
+
+/** \addtogroup cl_page_list cl_page_list
+ * Page list used to perform collective operations on a group of pages.
+ *
+ * Pages are added to the list one by one. cl_page_list acquires a reference
+ * for every page in it. Page list is used to perform collective operations on
+ * pages:
+ *
+ *     - submit pages for an immediate transfer,
+ *
+ *     - own pages on behalf of certain io (waiting for each page in turn),
+ *
+ *     - discard pages.
+ *
+ * When list is finalized, it releases references on all pages it still has.
+ *
+ * \todo XXX concurrency control.
+ *
+ * @{
+ */
+struct cl_page_list {
+	unsigned	     pl_nr; /* presumably number of pages on pl_pages -- TODO confirm */
+	struct list_head	   pl_pages; /* the pages on this list */
+	task_t	  *pl_owner; /* NOTE(review): owning task; its use is not shown here */
+};
+
+/**
+ * A 2-queue of pages. A convenience data-type for a common use case: a
+ * 2-queue contains an incoming page list and an outgoing page list.
+ */
+struct cl_2queue {
+	struct cl_page_list c2_qin; /* incoming page list */
+	struct cl_page_list c2_qout; /* outgoing page list */
+};
+
+/** @} cl_page_list */
+
+/** \addtogroup cl_io cl_io
+ * @{ */
+/** \struct cl_io
+ * I/O
+ *
+ * cl_io represents a high level I/O activity like
+ * read(2)/write(2)/truncate(2) system call, or cancellation of an extent
+ * lock.
+ *
+ * cl_io is a layered object, much like cl_{object,page,lock} but with one
+ * important distinction. We want to minimize number of calls to the allocator
+ * in the fast path, e.g., in the case of read(2) when everything is cached:
+ * client already owns the lock over region being read, and data are cached
+ * due to read-ahead. To avoid allocation of cl_io layers in such situations,
+ * per-layer io state is stored in the session, associated with the io, see
+ * struct {vvp,lov,osc}_io for example. Sessions allocation is amortized
+ * by using free-lists, see cl_env_get().
+ *
+ * There is a small predefined number of possible io types, enumerated in enum
+ * cl_io_type.
+ *
+ * cl_io is a state machine, that can be advanced concurrently by the multiple
+ * threads. It is up to these threads to control the concurrency and,
+ * specifically, to detect when io is done, and its state can be safely
+ * released.
+ *
+ * For read/write io overall execution plan is as follows:
+ *
+ *     (0) initialize io state through all layers;
+ *
+ *     (1) loop: prepare chunk of work to do
+ *
+ *     (2) call all layers to collect locks they need to process current chunk
+ *
+ *     (3) sort all locks to avoid dead-locks, and acquire them
+ *
+ *     (4) process the chunk: call per-page methods
+ *	 (cl_io_operations::cio_read_page() for read,
+ *	 cl_io_operations::cio_prepare_write(),
+ *	 cl_io_operations::cio_commit_write() for write)
+ *
+ *     (5) release locks
+ *
+ *     (6) repeat loop.
+ *
+ * To implement the "parallel IO mode", lov layer creates sub-io's (lazily to
+ * address allocation efficiency issues mentioned above), and returns with the
+ * special error condition from per-page method when current sub-io has to
+ * block. This causes io loop to be repeated, and lov switches to the next
+ * sub-io in its cl_io_operations::cio_iter_init() implementation.
+ */
+
+/** IO types */
+enum cl_io_type {
+	/** read system call */
+	CIT_READ,
+	/** write system call */
+	CIT_WRITE,
+	/** truncate, utime system calls */
+	CIT_SETATTR,
+	/**
+	 * page fault handling
+	 */
+	CIT_FAULT,
+	/**
+	 * fsync system call handling.
+	 * Writes out a range of a file.
+	 */
+	CIT_FSYNC,
+	/**
+	 * Miscellaneous io. This is used for occasional io activity that
+	 * doesn't fit into other types. Currently this is used for:
+	 *
+	 *     - cancellation of an extent lock. This io exists as a context
+	 *     to write dirty pages from under the lock being canceled back
+	 *     to the server;
+	 *
+	 *     - VM induced page write-out. An io context for writing page out
+	 *     for memory cleansing;
+	 *
+	 *     - glimpse. An io context to acquire glimpse lock.
+	 *
+	 *     - grouplock. An io context to acquire group lock.
+	 *
+	 * CIT_MISC io is used simply as a context in which locks and pages
+	 * are manipulated. Such io has no internal "process", that is,
+	 * cl_io_loop() is never called for it.
+	 */
+	CIT_MISC,
+	CIT_OP_NR /* number of io types; sizes cl_io_operations::op[] */
+};
+
+/**
+ * States of cl_io state machine.
+ *
+ * NOTE(review): the states appear to follow the steps of the read/write
+ * execution plan in the cl_io description (initialize, start iteration,
+ * lock, do IO, unlock, end iteration) -- confirm against cl_io.c.
+ */
+enum cl_io_state {
+	/** Not initialized. */
+	CIS_ZERO,
+	/** Initialized. */
+	CIS_INIT,
+	/** IO iteration started. */
+	CIS_IT_STARTED,
+	/** Locks taken. */
+	CIS_LOCKED,
+	/** Actual IO is in progress. */
+	CIS_IO_GOING,
+	/** IO for the current iteration finished. */
+	CIS_IO_FINISHED,
+	/** Locks released. */
+	CIS_UNLOCKED,
+	/** Iteration completed. */
+	CIS_IT_ENDED,
+	/** cl_io finalized. */
+	CIS_FINI
+};
+
+/**
+ * IO state private for a layer.
+ *
+ * This is usually embedded into layer session data, rather than allocated
+ * dynamically.
+ *
+ * \see vvp_io, lov_io, osc_io, ccc_io
+ */
+struct cl_io_slice {
+	struct cl_io		  *cis_io;
+	/** Object slice corresponding to this io slice. Immutable after
+	 * creation. */
+	struct cl_object	      *cis_obj;
+	/** Io operations of this layer. Immutable after creation. */
+	const struct cl_io_operations *cis_iop;
+	/**
+	 * Linkage into a list of all slices for a given cl_io, hanging off
+	 * cl_io::ci_layers. Immutable after creation.
+	 */
+	struct list_head		     cis_linkage;
+};
+
+
+/**
+ * Per-layer io operations.
+ * \see vvp_io_ops, lov_io_ops, lovsub_io_ops, osc_io_ops
+ */
+struct cl_io_operations {
+	/**
+	 * Vector of io state transition methods for every io type.
+	 *
+	 * \see cl_page_operations::io
+	 */
+	struct {
+		/**
+		 * Prepare io iteration at a given layer.
+		 *
+		 * Called top-to-bottom at the beginning of each iteration of
+		 * "io loop" (if it makes sense for this type of io). Here
+		 * layer selects what work it will do during this iteration.
+		 *
+		 * \see cl_io_operations::cio_iter_fini()
+		 */
+		int (*cio_iter_init) (const struct lu_env *env,
+				      const struct cl_io_slice *slice);
+		/**
+		 * Finalize io iteration.
+		 *
+		 * Called bottom-to-top at the end of each iteration of "io
+		 * loop". Here layers can decide whether IO has to be
+		 * continued.
+		 *
+		 * \see cl_io_operations::cio_iter_init()
+		 */
+		void (*cio_iter_fini) (const struct lu_env *env,
+				       const struct cl_io_slice *slice);
+		/**
+		 * Collect locks for the current iteration of io.
+		 *
+		 * Called top-to-bottom to collect all locks necessary for
+		 * this iteration. This method shouldn't actually enqueue
+		 * anything, instead it should post a lock through
+		 * cl_io_lock_add(). Once all locks are collected, they are
+		 * sorted and enqueued in the proper order.
+		 */
+		int  (*cio_lock) (const struct lu_env *env,
+				  const struct cl_io_slice *slice);
+		/**
+		 * Finalize unlocking.
+		 *
+		 * Called bottom-to-top to finish layer specific unlocking
+		 * functionality, after generic code released all locks
+		 * acquired by cl_io_operations::cio_lock().
+		 */
+		void  (*cio_unlock)(const struct lu_env *env,
+				    const struct cl_io_slice *slice);
+		/**
+		 * Start io iteration.
+		 *
+		 * Once all locks are acquired, called top-to-bottom to
+		 * commence actual IO. In the current implementation,
+		 * top-level vvp_io_{read,write}_start() does all the work
+		 * synchronously by calling generic_file_*(), so other layers
+		 * are called when everything is done.
+		 */
+		int  (*cio_start)(const struct lu_env *env,
+				  const struct cl_io_slice *slice);
+		/**
+		 * Called top-to-bottom at the end of io loop. Here layer
+		 * might wait for an unfinished asynchronous io.
+		 */
+		void (*cio_end)  (const struct lu_env *env,
+				  const struct cl_io_slice *slice);
+		/**
+		 * Called bottom-to-top to notify layers that read/write IO
+		 * iteration finished, with \a nob bytes transferred.
+		 */
+		void (*cio_advance)(const struct lu_env *env,
+				    const struct cl_io_slice *slice,
+				    size_t nob);
+		/**
+		 * Called once per io, bottom-to-top to release io resources.
+		 */
+		void (*cio_fini) (const struct lu_env *env,
+				  const struct cl_io_slice *slice);
+	} op[CIT_OP_NR];
+	struct {
+		/**
+		 * Submit pages from \a queue->c2_qin for IO, and move
+		 * successfully submitted pages into \a queue->c2_qout. Return
+		 * non-zero if failed to submit even a single page. If
+		 * submission failed after some pages were moved into \a
+		 * queue->c2_qout, completion callback with non-zero ioret is
+		 * executed on them.
+		 */
+		int  (*cio_submit)(const struct lu_env *env,
+				   const struct cl_io_slice *slice,
+				   enum cl_req_type crt,
+				   struct cl_2queue *queue);
+	} req_op[CRT_NR];
+	/**
+	 * Read missing page.
+	 *
+	 * Called by a top-level cl_io_operations::op[CIT_READ]::cio_start()
+	 * method, when it hits not-up-to-date page in the range. Optional.
+	 *
+	 * \pre io->ci_type == CIT_READ
+	 */
+	int (*cio_read_page)(const struct lu_env *env,
+			     const struct cl_io_slice *slice,
+			     const struct cl_page_slice *page);
+	/**
+	 * Prepare write of a \a page. Called bottom-to-top by a top-level
+	 * cl_io_operations::op[CIT_WRITE]::cio_start() to prepare page for
+	 * receiving data from user-level buffer.
+	 *
+	 * \pre io->ci_type == CIT_WRITE
+	 *
+	 * \see vvp_io_prepare_write(), lov_io_prepare_write(),
+	 * osc_io_prepare_write().
+	 */
+	int (*cio_prepare_write)(const struct lu_env *env,
+				 const struct cl_io_slice *slice,
+				 const struct cl_page_slice *page,
+				 unsigned from, unsigned to);
+	/**
+	 * Per-page write method listed next to cio_prepare_write() in the
+	 * cl_io execution plan. NOTE(review): the exact commit semantics are
+	 * not described here -- see the implementations below.
+	 *
+	 * \pre io->ci_type == CIT_WRITE
+	 *
+	 * \see vvp_io_commit_write(), lov_io_commit_write(),
+	 * osc_io_commit_write().
+	 */
+	int (*cio_commit_write)(const struct lu_env *env,
+				const struct cl_io_slice *slice,
+				const struct cl_page_slice *page,
+				unsigned from, unsigned to);
+	/**
+	 * Optional debugging helper. Print given io slice.
+	 */
+	int (*cio_print)(const struct lu_env *env, void *cookie,
+			 lu_printer_t p, const struct cl_io_slice *slice);
+};
+
+/**
+ * Flags to lock enqueue procedure. Every flag occupies its own bit and
+ * CEF_MASK covers all of them.
+ * \ingroup cl_lock
+ */
+enum cl_enq_flags {
+	/**
+	 * Instruct the server not to block if a conflicting lock is found;
+	 * -EWOULDBLOCK is returned immediately instead.
+	 */
+	CEF_NONBLOCK     = 0x00000001,
+	/**
+	 * Take the lock asynchronously (out of order), as it cannot
+	 * deadlock. This is for LDLM_FL_HAS_INTENT locks used for glimpsing.
+	 */
+	CEF_ASYNC	= 0x00000002,
+	/**
+	 * Tell the server to instruct (through a flag in the blocking ast) the
+	 * owner of the conflicting lock that it can drop dirty pages
+	 * protected by this lock, without sending them to the server.
+	 */
+	CEF_DISCARD_DATA = 0x00000004,
+	/**
+	 * Tell the sub layers that it must be a `real' lock. This is used for
+	 * mmapped-buffer locks and glimpse locks that must never be converted
+	 * into lockless mode.
+	 *
+	 * \see vvp_mmap_locks(), cl_glimpse_lock().
+	 */
+	CEF_MUST	 = 0x00000008,
+	/**
+	 * Tell the sub layers to never request a `real' lock. This flag is
+	 * not used currently.
+	 *
+	 * cl_io::ci_lockreq and CEF_{MUST,NEVER} flags specify lockless
+	 * conversion policy: ci_lockreq describes generic information of lock
+	 * requirement for this IO, especially for locks which belong to the
+	 * object doing IO; however, the lock itself may have precise
+	 * requirements that are described by the enqueue flags.
+	 */
+	CEF_NEVER	= 0x00000010,
+	/**
+	 * For async glimpse lock.
+	 */
+	CEF_AGL	  = 0x00000020,
+	/**
+	 * Mask of enq_flags: the bitwise OR of all the flags above.
+	 */
+	CEF_MASK	 = 0x0000003f,
+};
+
+/**
+ * Link between lock and io. Intermediate structure is needed, because the
+ * same lock can be part of multiple io's simultaneously.
+ *
+ * cill_descr is a lock description and cill_lock a pointer to a lock.
+ * NOTE(review): presumably cill_descr describes the lock the io needs and
+ * cill_lock is the lock obtained for it; when each is filled in is not
+ * visible in this header - confirm against cl_io_lock_add().
+ */
+struct cl_io_lock_link {
+	/** linkage into one of cl_lockset lists. */
+	struct list_head	   cill_linkage;
+	struct cl_lock_descr cill_descr;
+	struct cl_lock      *cill_lock;
+	/** optional destructor; receives the environment and this link. */
+	void	       (*cill_fini)(const struct lu_env *env,
+					struct cl_io_lock_link *link);
+};
+
+/**
+ * Lock-set represents a collection of locks that an io needs at a
+ * time. Generally speaking, the client tries to avoid holding multiple locks
+ * when possible, because
+ *
+ *      - holding extent locks over multiple ost's introduces the danger of
+ *        "cascading timeouts";
+ *
+ *      - holding multiple locks over the same ost is still dead-lock prone,
+ *        see comment in osc_lock_enqueue(),
+ *
+ * but there are certain situations where this is unavoidable:
+ *
+ *      - O_APPEND writes have to take [0, EOF] lock for correctness;
+ *
+ *      - truncate has to take [new-size, EOF] lock for correctness;
+ *
+ *      - SNS has to take locks across full stripe for correctness;
+ *
+ *      - in the case when a user level buffer, supplied to
+ *        {read,write}(file0), is a part of a memory mapped lustre file, the
+ *        client has to take dlm locks on file0 and on all files that back up
+ *        the buffer (or the part of the buffer that is being processed in
+ *        the current chunk; in any case, there are situations where at least
+ *        2 locks are necessary).
+ *
+ * In such cases we at least try to take locks in the same consistent
+ * order. To this end, all locks are first collected, then sorted, and then
+ * enqueued.
+ *
+ * Entries are linked into these lists through
+ * cl_io_lock_link::cill_linkage.
+ */
+struct cl_lockset {
+	/** locks to be acquired. */
+	struct list_head  cls_todo;
+	/** locks currently being processed. */
+	struct list_head  cls_curr;
+	/** locks acquired. */
+	struct list_head  cls_done;
+};
+
+/**
+ * Lock requirements (demand) for IO, kept in cl_io::ci_lockreq. The natural
+ * name would be cl_io_lock_req, but 'req' is always read as 'request' :-)
+ */
+enum cl_io_lock_dmd {
+	/** Always lock data (e.g., O_APPEND). */
+	CILR_MANDATORY = 0,
+	/** Layers are free to decide between local and global locking. */
+	CILR_MAYBE,
+	/** Never lock: there is no cache (e.g., liblustre). */
+	CILR_NEVER
+};
+
+enum cl_fsync_mode {	/* mode of a fsync io, see cl_fsync_io::fi_mode */
+	/** start writeback, do not wait for it to finish */
+	CL_FSYNC_NONE  = 0,
+	/** start writeback and wait for it to finish */
+	CL_FSYNC_LOCAL = 1,
+	/** discard all dirty pages in a specific file range */
+	CL_FSYNC_DISCARD = 2,
+	/**
+	 * start writeback and make sure the data have reached storage before
+	 * returning. OST_SYNC RPC must be issued and finished
+	 */
+	CL_FSYNC_ALL   = 3
+};
+
+struct cl_io_rw_common {	/* state common to read and write io; embedded
+				 * as cl_rd_io::rd, cl_wr_io::wr and
+				 * cl_io::u::ci_rw */
+	loff_t      crw_pos;	/* NOTE(review): presumably the file offset,
+				 * cf. pos of cl_io_rw_init() - confirm */
+	size_t      crw_count;	/* NOTE(review): presumably the byte count,
+				 * cf. count of cl_io_rw_init() - confirm */
+	int	 crw_nonblock;	/* NOTE(review): looks like a boolean for
+				 * non-blocking io - confirm */
+};
+
+
+/**
+ * State for io.
+ *
+ * cl_io is shared by all threads participating in this IO (in the current
+ * implementation only one thread advances IO, but parallel IO design and
+ * concurrent copy_*_user() require multiple threads acting on the same IO).
+ * It is up to these threads to serialize their activities, including updates
+ * to mutable cl_io fields.
+ *
+ * The union \a u holds the state specific to one kind of io (read, write,
+ * setattr, fault, fsync). NOTE(review): which member is valid presumably
+ * follows from ci_type; the exact mapping is not visible here - confirm.
+ */
+struct cl_io {
+	/** type of this IO. Immutable after creation. */
+	enum cl_io_type		ci_type;
+	/** current state of cl_io state machine. */
+	enum cl_io_state	       ci_state;
+	/** main object this io is against. Immutable after creation. */
+	struct cl_object	      *ci_obj;
+	/**
+	 * Upper layer io, of which this io is a part. Immutable after
+	 * creation.
+	 */
+	struct cl_io		  *ci_parent;
+	/** List of slices. Immutable after creation. */
+	struct list_head		     ci_layers;
+	/** list of locks (to be) acquired by this io. */
+	struct cl_lockset	      ci_lockset;
+	/** lock requirements; this is just helper info for sublayers. */
+	enum cl_io_lock_dmd	    ci_lockreq;
+	union {
+		struct cl_rd_io {
+			struct cl_io_rw_common rd;
+		} ci_rd;
+		struct cl_wr_io {
+			struct cl_io_rw_common wr;
+			int		    wr_append;
+			int		    wr_sync;
+		} ci_wr;
+		struct cl_io_rw_common ci_rw;
+		struct cl_setattr_io {
+			struct ost_lvb   sa_attr;
+			unsigned int     sa_valid;
+			struct obd_capa *sa_capa;
+		} ci_setattr;
+		struct cl_fault_io {
+			/** page index within file. */
+			pgoff_t	 ft_index;
+			/** number of valid bytes on the faulted page. */
+			int	     ft_nob;
+			/** writable page? for nopage() only */
+			int	     ft_writable;
+			/** page of an executable? */
+			int	     ft_executable;
+			/** page_mkwrite() */
+			int	     ft_mkwrite;
+			/** resulting page */
+			struct cl_page *ft_page;
+		} ci_fault;
+		struct cl_fsync_io {
+			loff_t	     fi_start;
+			loff_t	     fi_end;
+			struct obd_capa   *fi_capa;
+			/** file system level fid */
+			struct lu_fid     *fi_fid;
+			enum cl_fsync_mode fi_mode;
+			/* how many pages were written/discarded */
+			unsigned int       fi_nr_written;
+		} ci_fsync;
+	} u;
+	struct cl_2queue     ci_queue;
+	size_t	       ci_nob;
+	int		  ci_result;
+	unsigned int	 ci_continue:1,
+	/**
+	 * This io has held grouplock, to inform sublayers that
+	 * don't do lockless i/o.
+	 */
+			     ci_no_srvlock:1,
+	/**
+	 * The whole IO needs to be restarted because the layout has been
+	 * changed.
+	 */
+			     ci_need_restart:1,
+	/**
+	 * Do not refresh layout - the IO issuer knows that the layout won't
+	 * change (page operations, layout change causes all pages to be
+	 * discarded), or it doesn't matter if it changes (sync).
+	 */
+			     ci_ignore_layout:1,
+	/**
+	 * Check if layout changed after the IO finishes. Mainly for HSM
+	 * requirement. If IO occurs on open files, it doesn't need to
+	 * verify layout because HSM won't release open files.
+	 * Right now, only two operations need to verify layout: glimpse
+	 * and setattr.
+	 */
+			     ci_verify_layout:1;
+	/**
+	 * Number of pages owned by this IO. For invariant checking.
+	 */
+	unsigned	     ci_owned_nr;
+};
+
+/** @} cl_io */
+
+/** \addtogroup cl_req cl_req
+ * @{ */
+/** \struct cl_req
+ * Transfer.
+ *
+ * There are two possible modes of transfer initiation on the client:
+ *
+ *     - immediate transfer: this is started when a high level io wants a page
+ *       or a collection of pages to be transferred right away. Examples:
+ *       read-ahead, synchronous read in the case of non-page aligned write,
+ *       page write-out as a part of extent lock cancellation, page write-out
+ *       as a part of memory cleansing. Immediate transfer can be both
+ *       cl_req_type::CRT_READ and cl_req_type::CRT_WRITE;
+ *
+ *     - opportunistic transfer (cl_req_type::CRT_WRITE only), that happens
+ *       when io wants to transfer a page to the server some time later, when
+ *       it can be done efficiently. Example: pages dirtied by the write(2)
+ *       path.
+ *
+ * In any case, transfer takes place in the form of a cl_req, which is a
+ * representation for a network RPC.
+ *
+ * Pages queued for an opportunistic transfer are cached until it is decided
+ * that efficient RPC can be composed of them. This decision is made by "a
+ * req-formation engine", currently implemented as a part of osc
+ * layer. Req-formation depends on many factors: the size of the resulting
+ * RPC, whether or not multi-object RPCs are supported by the server,
+ * max-rpc-in-flight limitations, size of the dirty cache, etc.
+ *
+ * For the immediate transfer io submits a cl_page_list, that req-formation
+ * engine slices into cl_req's, possibly adding cached pages to some of
+ * the resulting req's.
+ *
+ * Whenever a page from cl_page_list is added to a newly constructed req, its
+ * cl_page_operations::cpo_prep() layer methods are called. At that moment,
+ * page state is atomically changed from cl_page_state::CPS_OWNED to
+ * cl_page_state::CPS_PAGEOUT or cl_page_state::CPS_PAGEIN, cl_page::cp_owner
+ * is zeroed, and cl_page::cp_req is set to the
+ * req. cl_page_operations::cpo_prep() method at the particular layer might
+ * return -EALREADY to indicate that it does not need to submit this page
+ * at all. This is possible, for example, if a page submitted for read
+ * became up-to-date in the meantime or, for write, if the page does not
+ * have its dirty bit set. \see cl_io_submit_rw()
+ *
+ * Whenever a cached page is added to a newly constructed req, its
+ * cl_page_operations::cpo_make_ready() layer methods are called. At that
+ * moment, page state is atomically changed from cl_page_state::CPS_CACHED to
+ * cl_page_state::CPS_PAGEOUT, and cl_page::cp_req is set to
+ * req. cl_page_operations::cpo_make_ready() method at the particular layer
+ * might return -EAGAIN to indicate that this page is not eligible for the
+ * transfer right now.
+ *
+ * FUTURE
+ *
+ * The plan is to divide transfers into "priority bands" (indicated when
+ * submitting cl_page_list, and queuing a page for the opportunistic transfer)
+ * and allow gluing of cached pages to immediate transfers only within a
+ * single band. This would make high priority transfers (like lock
+ * cancellation or memory pressure induced write-out) really high priority.
+ *
+ */
+
+/**
+ * Per-transfer attributes, filled in by
+ * cl_req_operations::cro_attr_set().
+ */
+struct cl_req_attr {
+	/** Generic attributes for the server consumption. */
+	struct obdo	*cra_oa;
+	/** Capability. */
+	struct obd_capa	*cra_capa;
+	/** Job id, stored in a buffer of JOBSTATS_JOBID_SIZE chars. */
+	char		 cra_jobid[JOBSTATS_JOBID_SIZE];
+};
+
+/**
+ * Transfer request operations definable at every layer.
+ *
+ * Concurrency: transfer formation engine synchronizes calls to all transfer
+ * methods.
+ */
+struct cl_req_operations {
+	/**
+	 * Invoked top-to-bottom by cl_req_prep() when transfer formation is
+	 * complete (all pages are added).
+	 *
+	 * \see osc_req_prep()
+	 */
+	int  (*cro_prep)(const struct lu_env *env,
+			 const struct cl_req_slice *slice);
+	/**
+	 * Called top-to-bottom to fill in \a attr fields. This is called
+	 * twice with different flags, see bug 10150 and osc_build_req().
+	 *
+	 * \param obj an object from cl_req whose attributes are to be set in
+	 *	    \a attr.
+	 *
+	 * \param attr per-transfer attributes to fill. NOTE(review): the
+	 *	    struct obdo that receives the attributes is presumably
+	 *	    attr->cra_oa - confirm.
+	 *
+	 * \param flags obdo fields to be filled.
+	 */
+	void (*cro_attr_set)(const struct lu_env *env,
+			     const struct cl_req_slice *slice,
+			     const struct cl_object *obj,
+			     struct cl_req_attr *attr, obd_valid flags);
+	/**
+	 * Called top-to-bottom from cl_req_completion() to notify layers that
+	 * transfer completed, with \a ioret as the result of the transfer.
+	 * Has to free all state allocated by
+	 * cl_device_operations::cdo_req_init().
+	 */
+	void (*cro_completion)(const struct lu_env *env,
+			       const struct cl_req_slice *slice, int ioret);
+};
+
+/**
+ * A per-object state that (potentially multi-object) transfer request keeps.
+ * cl_req::crq_o points to an array of these.
+ */
+struct cl_req_obj {
+	/** object itself */
+	struct cl_object   *ro_obj;
+	/** reference to cl_req_obj::ro_obj. For debugging. */
+	struct lu_ref_link *ro_obj_ref;
+	/* something else? Number of pages for a given object? */
+};
+
+/**
+ * Transfer request.
+ *
+ * Transfer requests are not reference counted, because IO sub-system owns
+ * them exclusively and knows when to free them.
+ *
+ * Life cycle.
+ *
+ * cl_req is created by cl_req_alloc() that calls
+ * cl_device_operations::cdo_req_init() device methods to allocate per-req
+ * state in every layer.
+ *
+ * Then pages are added (cl_req_page_add()), req keeps track of all objects it
+ * contains pages for.
+ *
+ * Once all pages were collected, cl_page_operations::cpo_prep() method is
+ * called top-to-bottom. At that point layers can modify req, let it pass, or
+ * deny it completely. This is to support things like SNS that have transfer
+ * ordering requirements invisible to the individual req-formation engine.
+ * NOTE(review): cl_req_operations::cro_prep() is the method documented as
+ * running "when transfer formation is complete"; the method named in this
+ * paragraph may have been meant to be cro_prep() - confirm.
+ *
+ * On transfer completion (or transfer timeout, or failure to initiate the
+ * transfer of an allocated req), cl_req_operations::cro_completion() method
+ * is called, after execution of cl_page_operations::cpo_completion() of all
+ * req's pages.
+ */
+struct cl_req {
+	enum cl_req_type      crq_type;
+	/** A list of pages being transferred */
+	struct list_head	    crq_pages;
+	/** Number of pages in cl_req::crq_pages */
+	unsigned	      crq_nrpages;
+	/** An array of objects whose pages are in ->crq_pages */
+	struct cl_req_obj    *crq_o;
+	/** Number of elements in cl_req::crq_o[] */
+	unsigned	      crq_nrobjs;
+	struct list_head	    crq_layers;
+};
+
+/**
+ * Per-layer state for request.
+ *
+ * The fields mirror the arguments of cl_req_slice_add(): the request, the
+ * device of the layer and the operation vector of the layer.
+ * NOTE(review): that cl_req_slice_add() is what fills them in, and that
+ * crs_linkage links the slice into cl_req::crq_layers, is inferred from the
+ * declarations only - confirm against the implementation.
+ */
+struct cl_req_slice {
+	struct cl_req    *crs_req;
+	struct cl_device *crs_dev;
+	struct list_head	crs_linkage;
+	const struct cl_req_operations *crs_ops;
+};
+
+/** @} cl_req */
+
+enum cache_stats_item {	/* cache_stats::cs_stats[] has one slot per item */
+	/** how many cache lookups were performed */
+	CS_lookup = 0,
+	/** how many times cache lookup resulted in a hit */
+	CS_hit,
+	/** how many entities are in the cache right now */
+	CS_total,
+	/**
+	 * how many entities in the cache are actively used (and cannot be
+	 * evicted) right now
+	 */
+	CS_busy,
+	/** how many entities were created at all */
+	CS_create,
+	CS_NR	/* number of items above; sizes cache_stats::cs_stats[] */
+};
+
+#define CS_NAMES { "lookup", "hit", "total", "busy", "create" } /* one name per cache_stats_item, in enum order */
+
+/**
+ * Stats for a generic cache (similar to inode, lu_object, etc. caches):
+ * a name plus one atomic counter per cache_stats_item (CS_NR in total).
+ */
+struct cache_stats {
+	const char    *cs_name;
+	atomic_t   cs_stats[CS_NR];
+};
+
+/** These are not exported so far */
+void cache_stats_init (struct cache_stats *cs, const char *name);
+
+/**
+ * Client-side site. This represents a particular client stack. "Global"
+ * variables should (directly or indirectly) be added here to allow multiple
+ * clients to co-exist in a single address space.
+ *
+ * The generic lu_site is embedded as cs_lu; lu2cl_site() converts a pointer
+ * to it back into the enclosing cl_site.
+ */
+struct cl_site {
+	struct lu_site	cs_lu;
+	/**
+	 * Statistical counters. Atomics do not scale, something better like
+	 * per-cpu counters is needed.
+	 *
+	 * These are exported as /proc/fs/lustre/llite/.../site
+	 *
+	 * When interpreting keep in mind that both sub-locks (and sub-pages)
+	 * and top-locks (and top-pages) are accounted here.
+	 *
+	 * cs_pages_state[] and cs_locks_state[] hold CPS_NR and CLS_NR
+	 * counters respectively. NOTE(review): presumably one counter per
+	 * page state and per lock state - confirm.
+	 */
+	struct cache_stats    cs_pages;
+	struct cache_stats    cs_locks;
+	atomic_t	  cs_pages_state[CPS_NR];
+	atomic_t	  cs_locks_state[CLS_NR];
+};
+
+int  cl_site_init (struct cl_site *s, struct cl_device *top);
+void cl_site_fini (struct cl_site *s);
+void cl_stack_fini(const struct lu_env *env, struct cl_device *cl);
+
+/**
+ * Output client site statistical counters into a buffer. Suitable for
+ * ll_rd_*()-style functions.
+ */
+int cl_site_stats_print(const struct cl_site *site, struct seq_file *m);
+
+/**
+ * \name helpers
+ *
+ * Type conversion and accessory functions.
+ */
+/** @{ */
+
+/**
+ * Maps a generic lu_site onto the cl_site that embeds it as
+ * cl_site::cs_lu.
+ */
+static inline struct cl_site *lu2cl_site(const struct lu_site *site)
+{
+	struct cl_site *cl_site;
+
+	cl_site = container_of(site, struct cl_site, cs_lu);
+	return cl_site;
+}
+
+/**
+ * Tests the LU_DEVICE_CL tag in the type of device \a d.
+ *
+ * \retval non-zero d->ld_type->ldt_tags has LU_DEVICE_CL set
+ * \retval 0        otherwise
+ */
+static inline int lu_device_is_cl(const struct lu_device *d)
+{
+	int cl_tag = d->ld_type->ldt_tags & LU_DEVICE_CL;
+
+	return cl_tag;
+}
+
+/**
+ * Converts a generic lu_device into the cl_device that embeds it as
+ * cl_device::cd_lu_dev.
+ *
+ * Asserts that \a d is NULL, an error pointer, or a device for which
+ * lu_device_is_cl() holds.
+ * NOTE(review): container_of0() presumably hands NULL and error pointers
+ * back unchanged - confirm against its definition.
+ */
+static inline struct cl_device *lu2cl_dev(const struct lu_device *d)
+{
+	struct cl_device *cl_dev;
+
+	LASSERT(d == NULL || IS_ERR(d) || lu_device_is_cl(d));
+	cl_dev = container_of0(d, struct cl_device, cd_lu_dev);
+	return cl_dev;
+}
+
+/**
+ * Returns the generic lu_device embedded in \a d (cl_device::cd_lu_dev).
+ */
+static inline struct lu_device *cl2lu_dev(struct cl_device *d)
+{
+	struct lu_device *lu_dev = &d->cd_lu_dev;
+
+	return lu_dev;
+}
+
+/**
+ * Converts a generic lu_object into the cl_object that embeds it as
+ * cl_object::co_lu.
+ *
+ * Asserts that \a o is NULL, an error pointer, or an object whose device
+ * satisfies lu_device_is_cl().
+ * NOTE(review): container_of0() presumably hands NULL and error pointers
+ * back unchanged - confirm against its definition.
+ */
+static inline struct cl_object *lu2cl(const struct lu_object *o)
+{
+	struct cl_object *cl_obj;
+
+	LASSERT(o == NULL || IS_ERR(o) || lu_device_is_cl(o->lo_dev));
+	cl_obj = container_of0(o, struct cl_object, co_lu);
+	return cl_obj;
+}
+
+/**
+ * Converts a generic lu_object_conf into the cl_object_conf that embeds it
+ * as cl_object_conf::coc_lu.
+ */
+static inline const struct cl_object_conf *
+lu2cl_conf(const struct lu_object_conf *conf)
+{
+	const struct cl_object_conf *cl_conf;
+
+	cl_conf = container_of0(conf, struct cl_object_conf, coc_lu);
+	return cl_conf;
+}
+
+/**
+ * Returns the cl_object corresponding to lu_object_next() of the lu_object
+ * embedded in \a obj, or NULL when \a obj itself is NULL.
+ */
+static inline struct cl_object *cl_object_next(const struct cl_object *obj)
+{
+	if (obj == NULL)
+		return NULL;
+
+	return lu2cl(lu_object_next(&obj->co_lu));
+}
+
+/**
+ * Returns the cl_device that embeds the lu_device \a o points at
+ * (o->co_lu.lo_dev).
+ *
+ * \a o must be a valid object pointer: o->co_lu.lo_dev is read
+ * unconditionally below, so NULL and error pointers are rejected by the
+ * assertion rather than being dereferenced.
+ */
+static inline struct cl_device *cl_object_device(const struct cl_object *o)
+{
+	LASSERT(o != NULL && !IS_ERR(o) && lu_device_is_cl(o->co_lu.lo_dev));
+	return container_of0(o->co_lu.lo_dev, struct cl_device, cd_lu_dev);
+}
+
+/**
+ * Converts a generic lu_object_header into the cl_object_header that embeds
+ * it as cl_object_header::coh_lu.
+ */
+static inline struct cl_object_header *luh2coh(const struct lu_object_header *h)
+{
+	struct cl_object_header *coh;
+
+	coh = container_of0(h, struct cl_object_header, coh_lu);
+	return coh;
+}
+
+/**
+ * Returns the cl_site of \a obj: the ld_site of the object's device,
+ * converted with lu2cl_site().
+ */
+static inline struct cl_site *cl_object_site(const struct cl_object *obj)
+{
+	struct cl_site *site;
+
+	site = lu2cl_site(obj->co_lu.lo_dev->ld_site);
+	return site;
+}
+
+/**
+ * Returns the cl_object_header of \a obj: the lo_header of the embedded
+ * lu_object, converted with luh2coh().
+ */
+static inline
+struct cl_object_header *cl_object_header(const struct cl_object *obj)
+{
+	struct cl_object_header *hdr;
+
+	hdr = luh2coh(obj->co_lu.lo_header);
+	return hdr;
+}
+
+/**
+ * Initializes the lu_device embedded in \a d with type \a t.
+ *
+ * \return the result of lu_device_init().
+ */
+static inline int cl_device_init(struct cl_device *d, struct lu_device_type *t)
+{
+	int rc;
+
+	rc = lu_device_init(&d->cd_lu_dev, t);
+	return rc;
+}
+
+/**
+ * Finalizes the lu_device embedded in \a d by calling lu_device_fini().
+ */
+static inline void cl_device_fini(struct cl_device *d)
+{
+	struct lu_device *lu_dev = &d->cd_lu_dev;
+
+	lu_device_fini(lu_dev);
+}
+
+void cl_page_slice_add(struct cl_page *page, struct cl_page_slice *slice,
+		       struct cl_object *obj,
+		       const struct cl_page_operations *ops);
+void cl_lock_slice_add(struct cl_lock *lock, struct cl_lock_slice *slice,
+		       struct cl_object *obj,
+		       const struct cl_lock_operations *ops);
+void cl_io_slice_add(struct cl_io *io, struct cl_io_slice *slice,
+		     struct cl_object *obj, const struct cl_io_operations *ops);
+void cl_req_slice_add(struct cl_req *req, struct cl_req_slice *slice,
+		      struct cl_device *dev,
+		      const struct cl_req_operations *ops);
+/** @} helpers */
+
+/** \defgroup cl_object cl_object
+ * @{ */
+struct cl_object *cl_object_top (struct cl_object *o);
+struct cl_object *cl_object_find(const struct lu_env *env, struct cl_device *cd,
+				 const struct lu_fid *fid,
+				 const struct cl_object_conf *c);
+
+int  cl_object_header_init(struct cl_object_header *h);
+void cl_object_header_fini(struct cl_object_header *h);
+void cl_object_put	(const struct lu_env *env, struct cl_object *o);
+void cl_object_get	(struct cl_object *o);
+void cl_object_attr_lock  (struct cl_object *o);
+void cl_object_attr_unlock(struct cl_object *o);
+int  cl_object_attr_get   (const struct lu_env *env, struct cl_object *obj,
+			   struct cl_attr *attr);
+int  cl_object_attr_set   (const struct lu_env *env, struct cl_object *obj,
+			   const struct cl_attr *attr, unsigned valid);
+int  cl_object_glimpse    (const struct lu_env *env, struct cl_object *obj,
+			   struct ost_lvb *lvb);
+int  cl_conf_set	  (const struct lu_env *env, struct cl_object *obj,
+			   const struct cl_object_conf *conf);
+void cl_object_prune      (const struct lu_env *env, struct cl_object *obj);
+void cl_object_kill       (const struct lu_env *env, struct cl_object *obj);
+int  cl_object_has_locks  (struct cl_object *obj);
+
+/**
+ * Tells whether \a o0 and \a o1 are slices of the same object, i.e. whether
+ * they share one cl_object_header.
+ *
+ * \retval 1 the headers are identical
+ * \retval 0 the headers differ
+ */
+static inline int cl_object_same(struct cl_object *o0, struct cl_object *o1)
+{
+	if (cl_object_header(o0) != cl_object_header(o1))
+		return 0;
+
+	return 1;
+}
+
+/**
+ * Records the current coh_page_bufsize of the object header as the slice
+ * offset of \a clob, then grows coh_page_bufsize by \a size rounded up to a
+ * multiple of 8.
+ */
+static inline void cl_object_page_init(struct cl_object *clob, int size)
+{
+	struct cl_object_header *hdr = cl_object_header(clob);
+
+	clob->co_slice_off = hdr->coh_page_bufsize;
+	hdr->coh_page_bufsize += ALIGN(size, 8);
+}
+
+/**
+ * Returns the address located clob->co_slice_off bytes past the start of
+ * \a page.
+ */
+static inline void *cl_object_page_slice(struct cl_object *clob,
+					 struct cl_page *page)
+{
+	char *base = (char *)page;
+
+	return (void *)(base + clob->co_slice_off);
+}
+
+/** @} cl_object */
+
+/** \defgroup cl_page cl_page
+ * @{ */
+enum {	/* NOTE(review): presumably result codes used with
+	 * cl_page_gang_lookup() and its callback, judging by the CLP_GANG_
+	 * prefix - confirm against the implementation. */
+	CLP_GANG_OKAY = 0,
+	CLP_GANG_RESCHED,
+	CLP_GANG_AGAIN,
+	CLP_GANG_ABORT
+};
+
+/* callback of cl_page_gang_lookup() */
+typedef int   (*cl_page_gang_cb_t)  (const struct lu_env *, struct cl_io *,
+				     struct cl_page *, void *);
+int	     cl_page_gang_lookup (const struct lu_env *env,
+				     struct cl_object *obj,
+				     struct cl_io *io,
+				     pgoff_t start, pgoff_t end,
+				     cl_page_gang_cb_t cb, void *cbdata);
+struct cl_page *cl_page_lookup      (struct cl_object_header *hdr,
+				     pgoff_t index);
+struct cl_page *cl_page_find	(const struct lu_env *env,
+				     struct cl_object *obj,
+				     pgoff_t idx, struct page *vmpage,
+				     enum cl_page_type type);
+struct cl_page *cl_page_find_sub    (const struct lu_env *env,
+				     struct cl_object *obj,
+				     pgoff_t idx, struct page *vmpage,
+				     struct cl_page *parent);
+void	    cl_page_get	 (struct cl_page *page);
+void	    cl_page_put	 (const struct lu_env *env,
+				     struct cl_page *page);
+void	    cl_page_print       (const struct lu_env *env, void *cookie,
+				     lu_printer_t printer,
+				     const struct cl_page *pg);
+void	    cl_page_header_print(const struct lu_env *env, void *cookie,
+				     lu_printer_t printer,
+				     const struct cl_page *pg);
+struct page     *cl_page_vmpage      (const struct lu_env *env,
+				     struct cl_page *page);
+struct cl_page *cl_vmpage_page      (struct page *vmpage, struct cl_object *obj);
+struct cl_page *cl_page_top	 (struct cl_page *page);
+
+const struct cl_page_slice *cl_page_at(const struct cl_page *page,
+				       const struct lu_device_type *dtype);
+
+/**
+ * \name ownership
+ *
+ * Functions dealing with the ownership of page by io.
+ */
+/** @{ */
+
+int  cl_page_own	(const struct lu_env *env,
+			 struct cl_io *io, struct cl_page *page);
+int  cl_page_own_try    (const struct lu_env *env,
+			 struct cl_io *io, struct cl_page *page);
+void cl_page_assume     (const struct lu_env *env,
+			 struct cl_io *io, struct cl_page *page);
+void cl_page_unassume   (const struct lu_env *env,
+			 struct cl_io *io, struct cl_page *pg);
+void cl_page_disown     (const struct lu_env *env,
+			 struct cl_io *io, struct cl_page *page);
+int  cl_page_is_owned   (const struct cl_page *pg, const struct cl_io *io);
+
+/** @} ownership */
+
+/**
+ * \name transfer
+ *
+ * Functions dealing with the preparation of a page for a transfer, and
+ * tracking transfer state.
+ */
+/** @{ */
+int  cl_page_prep       (const struct lu_env *env, struct cl_io *io,
+			 struct cl_page *pg, enum cl_req_type crt);
+void cl_page_completion (const struct lu_env *env,
+			 struct cl_page *pg, enum cl_req_type crt, int ioret);
+int  cl_page_make_ready (const struct lu_env *env, struct cl_page *pg,
+			 enum cl_req_type crt);
+int  cl_page_cache_add  (const struct lu_env *env, struct cl_io *io,
+			 struct cl_page *pg, enum cl_req_type crt);
+void cl_page_clip       (const struct lu_env *env, struct cl_page *pg,
+			 int from, int to);
+int  cl_page_cancel     (const struct lu_env *env, struct cl_page *page);
+int  cl_page_flush      (const struct lu_env *env, struct cl_io *io,
+			 struct cl_page *pg);
+
+/** @} transfer */
+
+
+/**
+ * \name helper routines
+ * Functions to discard, delete and export a cl_page.
+ */
+/** @{ */
+void    cl_page_discard      (const struct lu_env *env, struct cl_io *io,
+			      struct cl_page *pg);
+void    cl_page_delete       (const struct lu_env *env, struct cl_page *pg);
+int     cl_page_unmap	(const struct lu_env *env, struct cl_io *io,
+			      struct cl_page *pg);
+int     cl_page_is_vmlocked  (const struct lu_env *env,
+			      const struct cl_page *pg);
+void    cl_page_export       (const struct lu_env *env,
+			      struct cl_page *pg, int uptodate);
+int     cl_page_is_under_lock(const struct lu_env *env, struct cl_io *io,
+			      struct cl_page *page);
+loff_t  cl_offset	    (const struct cl_object *obj, pgoff_t idx);
+pgoff_t cl_index	     (const struct cl_object *obj, loff_t offset);
+int     cl_page_size	 (const struct cl_object *obj);
+int     cl_pages_prune       (const struct lu_env *env, struct cl_object *obj);
+
+void cl_lock_print      (const struct lu_env *env, void *cookie,
+			 lu_printer_t printer, const struct cl_lock *lock);
+void cl_lock_descr_print(const struct lu_env *env, void *cookie,
+			 lu_printer_t printer,
+			 const struct cl_lock_descr *descr);
+/** @} helper routines */
+
+/** @} cl_page */
+
+/** \defgroup cl_lock cl_lock
+ * @{ */
+
+struct cl_lock *cl_lock_hold(const struct lu_env *env, const struct cl_io *io,
+			     const struct cl_lock_descr *need,
+			     const char *scope, const void *source);
+struct cl_lock *cl_lock_peek(const struct lu_env *env, const struct cl_io *io,
+			     const struct cl_lock_descr *need,
+			     const char *scope, const void *source);
+struct cl_lock *cl_lock_request(const struct lu_env *env, struct cl_io *io,
+				const struct cl_lock_descr *need,
+				const char *scope, const void *source);
+struct cl_lock *cl_lock_at_pgoff(const struct lu_env *env,
+				 struct cl_object *obj, pgoff_t index,
+				 struct cl_lock *except, int pending,
+				 int canceld);
+/**
+ * Page-based wrapper around cl_lock_at_pgoff(): forwards all arguments,
+ * using page->cp_index as the index.
+ *
+ * Asserts that \a obj and the object of \a page share one
+ * cl_object_header.
+ *
+ * \return whatever cl_lock_at_pgoff() returns.
+ */
+static inline struct cl_lock *cl_lock_at_page(const struct lu_env *env,
+					      struct cl_object *obj,
+					      struct cl_page *page,
+					      struct cl_lock *except,
+					      int pending, int canceld)
+{
+	struct cl_lock *lock;
+
+	LASSERT(cl_object_header(obj) == cl_object_header(page->cp_obj));
+	lock = cl_lock_at_pgoff(env, obj, page->cp_index, except,
+				pending, canceld);
+	return lock;
+}
+
+const struct cl_lock_slice *cl_lock_at(const struct cl_lock *lock,
+				       const struct lu_device_type *dtype);
+
+void  cl_lock_get       (struct cl_lock *lock);
+void  cl_lock_get_trust (struct cl_lock *lock);
+void  cl_lock_put       (const struct lu_env *env, struct cl_lock *lock);
+void  cl_lock_hold_add  (const struct lu_env *env, struct cl_lock *lock,
+			 const char *scope, const void *source);
+void cl_lock_hold_release(const struct lu_env *env, struct cl_lock *lock,
+			  const char *scope, const void *source);
+void  cl_lock_unhold    (const struct lu_env *env, struct cl_lock *lock,
+			 const char *scope, const void *source);
+void  cl_lock_release   (const struct lu_env *env, struct cl_lock *lock,
+			 const char *scope, const void *source);
+void  cl_lock_user_add  (const struct lu_env *env, struct cl_lock *lock);
+void  cl_lock_user_del  (const struct lu_env *env, struct cl_lock *lock);
+
+enum cl_lock_state cl_lock_intransit(const struct lu_env *env,
+				     struct cl_lock *lock);
+void cl_lock_extransit(const struct lu_env *env, struct cl_lock *lock,
+		       enum cl_lock_state state);
+int cl_lock_is_intransit(struct cl_lock *lock);
+
+int cl_lock_enqueue_wait(const struct lu_env *env, struct cl_lock *lock,
+			 int keep_mutex);
+
+/** \name statemachine statemachine
+ * Interface to lock state machine consists of 3 parts:
+ *
+ *     - "try" functions that attempt to effect a state transition. If state
+ *     transition is not possible right now (e.g., if it has to wait for some
+ *     asynchronous event to occur), these functions return
+ *     cl_lock_transition::CLO_WAIT.
+ *
+ *     - "non-try" functions that implement synchronous blocking interface on
+ *     top of non-blocking "try" functions. These functions repeatedly call
+ *     corresponding "try" versions, and if state transition is not possible
+ *     immediately, wait for lock state change.
+ *
+ *     - methods from cl_lock_operations, called by "try" functions. Lock can
+ *     be advanced to the target state only when all layers voted that they
+ *     are ready for this transition. "Try" functions call methods under lock
+ *     mutex. If a layer had to release a mutex, it re-acquires it and returns
+ *     cl_lock_transition::CLO_REPEAT, causing "try" function to call all
+ *     layers again.
+ *
+ * TRY              NON-TRY      METHOD                              FINAL STATE
+ *
+ * cl_enqueue_try() cl_enqueue() cl_lock_operations::clo_enqueue() CLS_ENQUEUED
+ *
+ * cl_wait_try()    cl_wait()    cl_lock_operations::clo_wait()    CLS_HELD
+ *
+ * cl_unuse_try()   cl_unuse()   cl_lock_operations::clo_unuse()   CLS_CACHED
+ *
+ * cl_use_try()     NONE	 cl_lock_operations::clo_use()     CLS_HELD
+ *
+ * @{ */
+
+int   cl_enqueue    (const struct lu_env *env, struct cl_lock *lock,
+		     struct cl_io *io, __u32 flags);
+int   cl_wait       (const struct lu_env *env, struct cl_lock *lock);
+void  cl_unuse      (const struct lu_env *env, struct cl_lock *lock);
+int   cl_enqueue_try(const struct lu_env *env, struct cl_lock *lock,
+		     struct cl_io *io, __u32 flags);
+int   cl_unuse_try  (const struct lu_env *env, struct cl_lock *lock);
+int   cl_wait_try   (const struct lu_env *env, struct cl_lock *lock);
+int   cl_use_try    (const struct lu_env *env, struct cl_lock *lock, int atomic);
+
+/** @} statemachine */
+
+void cl_lock_signal      (const struct lu_env *env, struct cl_lock *lock);
+int  cl_lock_state_wait  (const struct lu_env *env, struct cl_lock *lock);
+void cl_lock_state_set   (const struct lu_env *env, struct cl_lock *lock,
+			  enum cl_lock_state state);
+int  cl_queue_match      (const struct list_head *queue,
+			  const struct cl_lock_descr *need);
+
+void cl_lock_mutex_get  (const struct lu_env *env, struct cl_lock *lock);
+int  cl_lock_mutex_try  (const struct lu_env *env, struct cl_lock *lock);
+void cl_lock_mutex_put  (const struct lu_env *env, struct cl_lock *lock);
+int  cl_lock_is_mutexed (struct cl_lock *lock);
+int  cl_lock_nr_mutexed (const struct lu_env *env);
+int  cl_lock_discard_pages(const struct lu_env *env, struct cl_lock *lock);
+int  cl_lock_ext_match  (const struct cl_lock_descr *has,
+			 const struct cl_lock_descr *need);
+int  cl_lock_descr_match(const struct cl_lock_descr *has,
+			 const struct cl_lock_descr *need);
+int  cl_lock_mode_match (enum cl_lock_mode has, enum cl_lock_mode need);
+int  cl_lock_modify     (const struct lu_env *env, struct cl_lock *lock,
+			 const struct cl_lock_descr *desc);
+
+void cl_lock_closure_init (const struct lu_env *env,
+			   struct cl_lock_closure *closure,
+			   struct cl_lock *origin, int wait);
+void cl_lock_closure_fini (struct cl_lock_closure *closure);
+int  cl_lock_closure_build(const struct lu_env *env, struct cl_lock *lock,
+			   struct cl_lock_closure *closure);
+void cl_lock_disclosure   (const struct lu_env *env,
+			   struct cl_lock_closure *closure);
+int  cl_lock_enclosure    (const struct lu_env *env, struct cl_lock *lock,
+			   struct cl_lock_closure *closure);
+
+void cl_lock_cancel(const struct lu_env *env, struct cl_lock *lock);
+void cl_lock_delete(const struct lu_env *env, struct cl_lock *lock);
+void cl_lock_error (const struct lu_env *env, struct cl_lock *lock, int error);
+void cl_locks_prune(const struct lu_env *env, struct cl_object *obj, int wait);
+
+unsigned long cl_lock_weigh(const struct lu_env *env, struct cl_lock *lock);
+
+/** @} cl_lock */
+
+/** \defgroup cl_io cl_io
+ * @{ */
+
+int cl_io_init(const struct lu_env *env, struct cl_io *io,
+	       enum cl_io_type iot, struct cl_object *obj);
+int cl_io_sub_init(const struct lu_env *env, struct cl_io *io,
+		   enum cl_io_type iot, struct cl_object *obj);
+int cl_io_rw_init(const struct lu_env *env, struct cl_io *io,
+		  enum cl_io_type iot, loff_t pos, size_t count);
+int cl_io_loop(const struct lu_env *env, struct cl_io *io);
+
+void cl_io_fini(const struct lu_env *env, struct cl_io *io);
+int cl_io_iter_init(const struct lu_env *env, struct cl_io *io);
+void cl_io_iter_fini(const struct lu_env *env, struct cl_io *io);
+int cl_io_lock(const struct lu_env *env, struct cl_io *io);
+void cl_io_unlock(const struct lu_env *env, struct cl_io *io);
+int cl_io_start(const struct lu_env *env, struct cl_io *io);
+void cl_io_end(const struct lu_env *env, struct cl_io *io);
+int cl_io_lock_add(const struct lu_env *env, struct cl_io *io,
+		   struct cl_io_lock_link *link);
+int cl_io_lock_alloc_add(const struct lu_env *env, struct cl_io *io,
+			 struct cl_lock_descr *descr);
+int cl_io_read_page(const struct lu_env *env, struct cl_io *io,
+		    struct cl_page *page);
+int cl_io_prepare_write(const struct lu_env *env, struct cl_io *io,
+			struct cl_page *page, unsigned from, unsigned to);
+int cl_io_commit_write(const struct lu_env *env, struct cl_io *io,
+		       struct cl_page *page, unsigned from, unsigned to);
+int cl_io_submit_rw(const struct lu_env *env, struct cl_io *io,
+		    enum cl_req_type iot, struct cl_2queue *queue);
+int cl_io_submit_sync(const struct lu_env *env, struct cl_io *io,
+		      enum cl_req_type iot, struct cl_2queue *queue,
+		      long timeout);
+void cl_io_rw_advance(const struct lu_env *env, struct cl_io *io,
+		      size_t nob);
+int cl_io_cancel(const struct lu_env *env, struct cl_io *io,
+		 struct cl_page_list *queue);
+int cl_io_is_going(const struct lu_env *env);
+
+/**
+ * Check whether \a io is an O_APPEND write(2).
+ *
+ * \retval 1 \a io has type CIT_WRITE and its wr_append flag is non-zero
+ * \retval 0 otherwise
+ */
+static inline int cl_io_is_append(const struct cl_io *io)
+{
+	if (io->ci_type != CIT_WRITE)
+		return 0;
+
+	return !!io->u.ci_wr.wr_append;
+}
+
+/**
+ * Check whether \a io is a write with the wr_sync flag set.
+ *
+ * \retval 1 \a io has type CIT_WRITE and its wr_sync flag is non-zero
+ * \retval 0 otherwise
+ */
+static inline int cl_io_is_sync_write(const struct cl_io *io)
+{
+	if (io->ci_type != CIT_WRITE)
+		return 0;
+
+	return !!io->u.ci_wr.wr_sync;
+}
+
+/**
+ * Check whether \a io is a fault io with the ft_mkwrite flag set.
+ *
+ * \retval 1 \a io has type CIT_FAULT and its ft_mkwrite flag is non-zero
+ * \retval 0 otherwise
+ */
+static inline int cl_io_is_mkwrite(const struct cl_io *io)
+{
+	if (io->ci_type != CIT_FAULT)
+		return 0;
+
+	return !!io->u.ci_fault.ft_mkwrite;
+}
+
+/**
+ * Check whether \a io is a truncate(2), i.e. a CIT_SETATTR io whose
+ * sa_valid mask has ATTR_SIZE set.
+ *
+ * \retval 1 \a io is a size-changing setattr
+ * \retval 0 otherwise
+ */
+static inline int cl_io_is_trunc(const struct cl_io *io)
+{
+	if (io->ci_type != CIT_SETATTR)
+		return 0;
+
+	return (io->u.ci_setattr.sa_valid & ATTR_SIZE) != 0;
+}
+
+struct cl_io *cl_io_top(struct cl_io *io);
+
+void cl_io_print(const struct lu_env *env, void *cookie,
+		 lu_printer_t printer, const struct cl_io *io);
+
+/*
+ * Zero every byte of *foo_io that follows its leading member \a base.
+ * \a base is asserted (CLASSERT) to sit at offset 0 of the structure;
+ * \a foo_io is evaluated exactly once.
+ */
+#define CL_IO_SLICE_CLEAN(foo_io, base)					\
+do {									\
+	typeof(foo_io) __foo_io = (foo_io);				\
+									\
+	CLASSERT(offsetof(typeof(*__foo_io), base) == 0);		\
+	memset(&__foo_io->base + 1, 0,					\
+	       sizeof(*__foo_io) - sizeof(__foo_io->base));		\
+} while (0)
+
+/** @} cl_io */
+
+/** \defgroup cl_page_list cl_page_list
+ * @{ */
+
+/**
+ * Return the page linked at the tail of \a plist.
+ *
+ * The list must not be empty: pl_nr > 0 is asserted.
+ */
+static inline struct cl_page *cl_page_list_last(struct cl_page_list *plist)
+{
+	struct list_head *tail;
+
+	LASSERT(plist->pl_nr > 0);
+	tail = plist->pl_pages.prev;
+	return list_entry(tail, struct cl_page, cp_batch);
+}
+
+/**
+ * Walk every page linked into \a list through its cp_batch member;
+ * \a page is the loop cursor.
+ */
+#define cl_page_list_for_each(page, list)				\
+	list_for_each_entry((page), &(list)->pl_pages, cp_batch)
+
+/**
+ * Walk every page linked into \a list through its cp_batch member, taking
+ * possible removals into account; \a temp is the scratch cursor handed to
+ * list_for_each_entry_safe().
+ */
+#define cl_page_list_for_each_safe(page, temp, list)			\
+	list_for_each_entry_safe((page), (temp), &(list)->pl_pages, cp_batch)
+
+void cl_page_list_init(struct cl_page_list *plist);
+void cl_page_list_add(struct cl_page_list *plist, struct cl_page *page);
+void cl_page_list_move(struct cl_page_list *dst, struct cl_page_list *src,
+		       struct cl_page *page);
+void cl_page_list_splice(struct cl_page_list *list,
+			 struct cl_page_list *head);
+void cl_page_list_del(const struct lu_env *env,
+		      struct cl_page_list *plist, struct cl_page *page);
+void cl_page_list_disown(const struct lu_env *env,
+			 struct cl_io *io, struct cl_page_list *plist);
+int cl_page_list_own(const struct lu_env *env,
+		     struct cl_io *io, struct cl_page_list *plist);
+void cl_page_list_assume(const struct lu_env *env,
+			 struct cl_io *io, struct cl_page_list *plist);
+void cl_page_list_discard(const struct lu_env *env,
+			  struct cl_io *io, struct cl_page_list *plist);
+int cl_page_list_unmap(const struct lu_env *env,
+		       struct cl_io *io, struct cl_page_list *plist);
+void cl_page_list_fini(const struct lu_env *env, struct cl_page_list *plist);
+
+void cl_2queue_init(struct cl_2queue *queue);
+void cl_2queue_add(struct cl_2queue *queue, struct cl_page *page);
+void cl_2queue_disown(const struct lu_env *env,
+		      struct cl_io *io, struct cl_2queue *queue);
+void cl_2queue_assume(const struct lu_env *env,
+		      struct cl_io *io, struct cl_2queue *queue);
+void cl_2queue_discard(const struct lu_env *env,
+		       struct cl_io *io, struct cl_2queue *queue);
+void cl_2queue_fini(const struct lu_env *env, struct cl_2queue *queue);
+void cl_2queue_init_page(struct cl_2queue *queue, struct cl_page *page);
+
+/** @} cl_page_list */
+
+/** \defgroup cl_req cl_req
+ * @{ */
+struct cl_req *cl_req_alloc(const struct lu_env *env, struct cl_page *page,
+			    enum cl_req_type crt, int nr_objects);
+
+void cl_req_page_add(const struct lu_env *env, struct cl_req *req,
+		     struct cl_page *page);
+void cl_req_page_done(const struct lu_env *env, struct cl_page *page);
+int cl_req_prep(const struct lu_env *env, struct cl_req *req);
+void cl_req_attr_set(const struct lu_env *env, struct cl_req *req,
+		     struct cl_req_attr *attr, obd_valid flags);
+void cl_req_completion(const struct lu_env *env, struct cl_req *req, int ioret);
+
+/** \defgroup cl_sync_io cl_sync_io
+ * @{ */
+
+/**
+ * Anchor for synchronous transfer. This is allocated on the stack by the
+ * thread doing the synchronous transfer, and a pointer to this structure is
+ * set up in every page submitted for transfer. The transfer completion
+ * routine updates the anchor and wakes up the waiting thread when the
+ * transfer is complete.
+ */
+struct cl_sync_io {
+	/** number of pages yet to be transferred. */
+	atomic_t		csi_sync_nr;
+	/** error code. */
+	int			csi_sync_rc;
+	/** barrier guarding the destruction of this structure */
+	atomic_t		csi_barrier;
+	/** wait queue to be woken up when transfer is complete. */
+	wait_queue_head_t		csi_waitq;
+};
+
+void cl_sync_io_init(struct cl_sync_io *anchor, int nrpages);
+int cl_sync_io_wait(const struct lu_env *env, struct cl_io *io,
+		    struct cl_page_list *queue, struct cl_sync_io *anchor,
+		    long timeout);
+void cl_sync_io_note(struct cl_sync_io *anchor, int ioret);
+
+/** @} cl_sync_io */
+
+/** @} cl_req */
+
+/** \defgroup cl_env cl_env
+ *
+ * lu_env handling for a client.
+ *
+ * lu_env is an environment within which lustre code executes. Its major part
+ * is lu_context---a fast memory allocation mechanism that is used to conserve
+ * precious kernel stack space. Originally lu_env was designed for a server,
+ * where
+ *
+ *     - there is a (mostly) fixed number of threads, and
+ *
+ *     - call chains have no non-lustre portions inserted between lustre code.
+ *
+ * On a client both these assumptions fail, because every user thread can
+ * potentially execute lustre code as part of a system call, and lustre calls
+ * into VFS or MM that call back into lustre.
+ *
+ * To deal with that, cl_env wrapper functions implement the following
+ * optimizations:
+ *
+ *     - allocation and destruction of environment is amortized by caching no
+ *     longer used environments instead of destroying them;
+ *
+ *     - there is a notion of "current" environment, attached to the kernel
+ *     data structure representing current thread. Top-level lustre code
+ *     allocates an environment and makes it current, then calls into
+ *     non-lustre code, that in turn calls lustre back. Low-level lustre
+ *     code thus called can fetch environment created by the top-level code
+ *     and reuse it, avoiding additional environment allocation.
+ *       Right now, three interfaces can attach the cl_env to running thread:
+ *       - cl_env_get
+ *       - cl_env_implant
+ *       - cl_env_reexit (cl_env_reenter must have been called beforehand)
+ *
+ * \see lu_env, lu_context, lu_context_key
+ * @{ */
+
+struct cl_env_nest {	/* state handed to cl_env_nested_get()/cl_env_nested_put() */
+	int   cen_refcheck;	/* presumably the refcheck of cl_env_get() - confirm */
+	void *cen_cookie;	/* presumably the cl_env_reenter() cookie - confirm */
+};
+
+struct lu_env *cl_env_peek(int *refcheck);
+struct lu_env *cl_env_get(int *refcheck);
+struct lu_env *cl_env_alloc(int *refcheck, __u32 tags);
+struct lu_env *cl_env_nested_get(struct cl_env_nest *nest);
+void cl_env_put(struct lu_env *env, int *refcheck);
+void cl_env_nested_put(struct cl_env_nest *nest, struct lu_env *env);
+void *cl_env_reenter(void);
+void cl_env_reexit(void *cookie);
+void cl_env_implant(struct lu_env *env, int *refcheck);
+void cl_env_unplant(struct lu_env *env, int *refcheck);
+
+/** @} cl_env */
+
+/*
+ * Misc
+ */
+void cl_attr2lvb(struct ost_lvb *lvb, const struct cl_attr *attr);
+void cl_lvb2attr(struct cl_attr *attr, const struct ost_lvb *lvb);
+
+struct cl_device *cl_type_setup(const struct lu_env *env, struct lu_site *site,
+				struct lu_device_type *ldt,
+				struct lu_device *next);
+/** @} clio */
+
+int cl_global_init(void);
+void cl_global_fini(void);
+
+#endif /* _LINUX_CL_OBJECT_H */

diff --git a/drivers/staging/lustre/lustre/include/dt_object.h b/drivers/staging/lustre/lustre/include/dt_object.h
new file mode 100644
index 0000000..e116bb2
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/dt_object.h

@@ -0,0 +1,1498 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LUSTRE_DT_OBJECT_H
+#define __LUSTRE_DT_OBJECT_H
+
+/** \defgroup dt dt
+ * Sub-class of lu_object with methods common for "data" objects in OST stack.
+ *
+ * Data objects behave like regular files: you can read/write them, get and
+ * set their attributes. Implementation of dt interface is supposed to
+ * implement some form of garbage collection, normally reference counting
+ * (nlink) based one.
+ *
+ * Examples: osd (lustre/osd) is an implementation of dt interface.
+ * @{
+ */
+
+
+/*
+ * super-class definitions.
+ */
+#include <lu_object.h>
+
+#include <linux/libcfs/libcfs.h>
+
+struct seq_file;
+struct proc_dir_entry;
+struct lustre_cfg;
+
+struct thandle;
+struct dt_device;
+struct dt_object;
+struct dt_index_features;
+struct niobuf_local;
+struct niobuf_remote;
+struct ldlm_enqueue_info;
+
+/* Mount option bits; this is the type of dt_device_param::ddp_mntopts. */
+typedef enum {
+	MNTOPT_USERXATTR	= 0x00000001,
+	MNTOPT_ACL		= 0x00000002,
+} mntopt_t;
+
+/**
+ * Device configuration data, as handed to
+ * dt_device_operations::dt_conf_get().
+ */
+struct dt_device_param {
+	unsigned		ddp_max_name_len;
+	unsigned		ddp_max_nlink;
+	unsigned		ddp_block_shift;
+	mntopt_t		ddp_mntopts;
+	unsigned		ddp_max_ea_size;
+	/* XXX: old code can retrieve mnt -bzzz */
+	void			*ddp_mnt;
+	int			ddp_mount_type;
+	unsigned long long	ddp_maxbytes;
+	/* percentage of available space to reserve for grant error margin */
+	int			ddp_grant_reserved;
+	/* per-inode space consumption */
+	short			ddp_inodespace;
+	/*
+	 * per-fragment grant overhead to be used by client for grant
+	 * calculation
+	 */
+	int			ddp_grant_frag;
+};
+
+/**
+ * Per-transaction commit callback function. It receives the environment,
+ * the transaction handle, the callback descriptor itself and an error code.
+ */
+struct dt_txn_commit_cb;
+typedef void (*dt_cb_t)(struct lu_env *env, struct thandle *th,
+			struct dt_txn_commit_cb *cb, int err);
+#define TRANS_COMMIT_CB_MAGIC	0xa0a00a0a
+#define MAX_COMMIT_CB_STR_LEN	32
+
+/**
+ * Special per-transaction callback for cases when just a commit callback
+ * is needed and per-device callbacks are not convenient to use.
+ *
+ * A descriptor is attached to a transaction through
+ * dt_device_operations::dt_trans_cb_add(). dcb_name holds at most
+ * MAX_COMMIT_CB_STR_LEN bytes.
+ */
+struct dt_txn_commit_cb {
+	struct list_head	dcb_linkage;
+	dt_cb_t			dcb_func;
+	__u32			dcb_magic;
+	char			dcb_name[MAX_COMMIT_CB_STR_LEN];
+};
+
+/**
+ * Operations on dt device. A pointer to this vector is kept in
+ * dt_device::dd_ops.
+ */
+struct dt_device_operations {
+	/**
+	 * Return device-wide statistics.
+	 */
+	int   (*dt_statfs)(const struct lu_env *env,
+			   struct dt_device *dev, struct obd_statfs *osfs);
+	/**
+	 * Create a transaction on \a dev and return its handle.
+	 */
+	struct thandle *(*dt_trans_create)(const struct lu_env *env,
+					   struct dt_device *dev);
+	/**
+	 * Start transaction \a th on \a dev.
+	 */
+	int   (*dt_trans_start)(const struct lu_env *env,
+				struct dt_device *dev, struct thandle *th);
+	/**
+	 * Finish previously started transaction.
+	 */
+	int   (*dt_trans_stop)(const struct lu_env *env,
+			       struct thandle *th);
+	/**
+	 * Add commit callback to the transaction.
+	 */
+	int   (*dt_trans_cb_add)(struct thandle *th,
+				 struct dt_txn_commit_cb *dcb);
+	/**
+	 * Return fid of root index object.
+	 */
+	int   (*dt_root_get)(const struct lu_env *env,
+			     struct dt_device *dev, struct lu_fid *f);
+	/**
+	 * Return device configuration data.
+	 */
+	void  (*dt_conf_get)(const struct lu_env *env,
+			     const struct dt_device *dev,
+			     struct dt_device_param *param);
+	/**
+	 * Handling device state, mostly for tests.
+	 */
+	int   (*dt_sync)(const struct lu_env *env, struct dt_device *dev);
+	int   (*dt_ro)(const struct lu_env *env, struct dt_device *dev);
+	/**
+	 * Start a transaction commit asynchronously
+	 *
+	 * \param env environment
+	 * \param dev dt_device to start commit on
+	 *
+	 * \return 0 success, negative value if error
+	 */
+	 int   (*dt_commit_async)(const struct lu_env *env,
+				  struct dt_device *dev);
+	/**
+	 * Initialize capability context.
+	 */
+	int   (*dt_init_capa_ctxt)(const struct lu_env *env,
+				   struct dt_device *dev,
+				   int mode, unsigned long timeout,
+				   __u32 alg, struct lustre_capa_key *keys);
+};
+
+struct dt_index_features {	/* requirements probed by do_index_try() */
+	/** required feature flags from enum dt_index_flags */
+	__u32 dif_flags;
+	/** minimal required key size */
+	size_t dif_keysize_min;
+	/** maximal required key size, 0 if no limit */
+	size_t dif_keysize_max;
+	/** minimal required record size */
+	size_t dif_recsize_min;
+	/** maximal required record size, 0 if no limit */
+	size_t dif_recsize_max;
+	/** pointer size for record */
+	size_t dif_ptrsize;
+};
+
+enum dt_index_flags {	/* bit values for dt_index_features::dif_flags */
+	/** index supports variable sized keys */
+	DT_IND_VARKEY = 1 << 0,
+	/** index supports variable sized records */
+	DT_IND_VARREC = 1 << 1,
+	/** index can be modified */
+	DT_IND_UPDATE = 1 << 2,
+	/** index supports records with non-unique (duplicate) keys */
+	DT_IND_NONUNQ = 1 << 3,
+	/**
+	 * index supports fixed-size keys sorted in natural numerical order
+	 * and is able to return left-side value if no exact value found
+	 */
+	DT_IND_RANGE = 1 << 4,
+};
+
+/**
+ * Features, required from index to support file system directories (mapping
+ * names to fids).
+ */
+extern const struct dt_index_features dt_directory_features;
+extern const struct dt_index_features dt_otable_features;
+extern const struct dt_index_features dt_lfsck_features;
+
+/* index features supported by the accounting objects */
+extern const struct dt_index_features dt_acct_features;
+
+/* index features supported by the quota global indexes */
+extern const struct dt_index_features dt_quota_glb_features;
+
+/* index features supported by the quota slave indexes */
+extern const struct dt_index_features dt_quota_slv_features;
+
+/**
+ * General purpose dt allocation hint.
+ *
+ * It currently carries the parent object and a mode value; the hint is
+ * initialised through dt_object_operations::do_ah_init().
+ * It can contain any allocation hint in the future.
+ */
+struct dt_allocation_hint {
+	struct dt_object	*dah_parent;
+	__u32			dah_mode;
+};
+
+/**
+ * Object type specifier, stored in dt_object_format::dof_type.
+ */
+
+enum dt_format_type {
+	DFT_REGULAR,
+	DFT_DIR,
+	/** for mknod */
+	DFT_NODE,
+	/** for special index */
+	DFT_INDEX,
+	/** for symbolic link */
+	DFT_SYM,
+};
+
+/**
+ * Object format specifier: a type tag plus a union of per-type parameters.
+ * The dof_dir and dof_node members are empty structures.
+ */
+struct dt_object_format {
+	/** type for dt object */
+	enum dt_format_type dof_type;
+	union {
+		struct dof_regular {
+			int striped;
+		} dof_reg;
+		struct dof_dir {
+		} dof_dir;
+		struct dof_node {
+		} dof_node;
+		/**
+		 * a special index needs its features as a parameter in
+		 * order to be created
+		 */
+		struct dof_index {
+			const struct dt_index_features *di_feat;
+		} dof_idx;
+	} u;
+};
+
+enum dt_format_type dt_mode_to_dft(__u32 mode);
+
+typedef __u64 dt_obj_version_t;
+
+/**
+ * Per-dt-object operations. A pointer to this vector is kept in
+ * dt_object::do_ops. Most modifying operations come as a pair: a
+ * do_declare_*() method and the method performing the operation, both
+ * taking the transaction handle.
+ */
+struct dt_object_operations {
+	void  (*do_read_lock)(const struct lu_env *env,
+			      struct dt_object *dt, unsigned role);
+	void  (*do_write_lock)(const struct lu_env *env,
+			       struct dt_object *dt, unsigned role);
+	void  (*do_read_unlock)(const struct lu_env *env,
+				struct dt_object *dt);
+	void  (*do_write_unlock)(const struct lu_env *env,
+				 struct dt_object *dt);
+	int  (*do_write_locked)(const struct lu_env *env,
+				struct dt_object *dt);
+	/**
+	 * Note: following ->do_{x,}attr_{set,get}() operations are very
+	 * similar to ->moo_{x,}attr_{set,get}() operations in struct
+	 * md_object_operations (see md_object.h). These operations are not in
+	 * lu_object_operations, because ->do_{x,}attr_set() versions take
+	 * transaction handle as an argument (this transaction is started by
+	 * caller). We might factor ->do_{x,}attr_get() into
+	 * lu_object_operations, but that would break existing symmetry.
+	 */
+
+	/**
+	 * Return standard attributes.
+	 *
+	 * precondition: lu_object_exists(&dt->do_lu);
+	 */
+	int   (*do_attr_get)(const struct lu_env *env,
+			     struct dt_object *dt, struct lu_attr *attr,
+			     struct lustre_capa *capa);
+	/**
+	 * Set standard attributes.
+	 *
+	 * precondition: dt_object_exists(dt);
+	 */
+	int   (*do_declare_attr_set)(const struct lu_env *env,
+				     struct dt_object *dt,
+				     const struct lu_attr *attr,
+				     struct thandle *handle);
+	int   (*do_attr_set)(const struct lu_env *env,
+			     struct dt_object *dt,
+			     const struct lu_attr *attr,
+			     struct thandle *handle,
+			     struct lustre_capa *capa);
+	/**
+	 * Return a value of an extended attribute.
+	 *
+	 * precondition: dt_object_exists(dt);
+	 */
+	int   (*do_xattr_get)(const struct lu_env *env, struct dt_object *dt,
+			      struct lu_buf *buf, const char *name,
+			      struct lustre_capa *capa);
+	/**
+	 * Set value of an extended attribute.
+	 *
+	 * \a fl - flags from enum lu_xattr_flags
+	 *
+	 * precondition: dt_object_exists(dt);
+	 */
+	int   (*do_declare_xattr_set)(const struct lu_env *env,
+				      struct dt_object *dt,
+				      const struct lu_buf *buf,
+				      const char *name, int fl,
+				      struct thandle *handle);
+	int   (*do_xattr_set)(const struct lu_env *env,
+			      struct dt_object *dt, const struct lu_buf *buf,
+			      const char *name, int fl, struct thandle *handle,
+			      struct lustre_capa *capa);
+	/**
+	 * Delete existing extended attribute.
+	 *
+	 * precondition: dt_object_exists(dt);
+	 */
+	int   (*do_declare_xattr_del)(const struct lu_env *env,
+				      struct dt_object *dt,
+				      const char *name, struct thandle *handle);
+	int   (*do_xattr_del)(const struct lu_env *env,
+			      struct dt_object *dt,
+			      const char *name, struct thandle *handle,
+			      struct lustre_capa *capa);
+	/**
+	 * Place list of existing extended attributes into \a buf.
+	 *
+	 * precondition: dt_object_exists(dt);
+	 */
+	int   (*do_xattr_list)(const struct lu_env *env,
+			       struct dt_object *dt, struct lu_buf *buf,
+			       struct lustre_capa *capa);
+	/**
+	 * Init allocation hint using parent object and child mode.
+	 * (1) The \a parent might be NULL if this is a partial creation for
+	 *     remote object.
+	 * (2) The type of child is in \a child_mode.
+	 * (3) The result hint is stored in \a ah;
+	 */
+	void  (*do_ah_init)(const struct lu_env *env,
+			    struct dt_allocation_hint *ah,
+			    struct dt_object *parent,
+			    struct dt_object *child,
+			    umode_t child_mode);
+	/**
+	 * Create new object on this device.
+	 *
+	 * precondition: !dt_object_exists(dt);
+	 * postcondition: ergo(result == 0, dt_object_exists(dt));
+	 */
+	int   (*do_declare_create)(const struct lu_env *env,
+				   struct dt_object *dt,
+				   struct lu_attr *attr,
+				   struct dt_allocation_hint *hint,
+				   struct dt_object_format *dof,
+				   struct thandle *th);
+	int   (*do_create)(const struct lu_env *env, struct dt_object *dt,
+			   struct lu_attr *attr,
+			   struct dt_allocation_hint *hint,
+			   struct dt_object_format *dof,
+			   struct thandle *th);
+
+	/**
+	 * Destroy object on this device.
+	 *
+	 * precondition: dt_object_exists(dt);
+	 * postcondition: ergo(result == 0, !dt_object_exists(dt));
+	 * NOTE(review): conditions are the inverse of those on do_create --
+	 * confirm against an implementation.
+	 */
+	int   (*do_declare_destroy)(const struct lu_env *env,
+				    struct dt_object *dt,
+				    struct thandle *th);
+	int   (*do_destroy)(const struct lu_env *env, struct dt_object *dt,
+			    struct thandle *th);
+
+	/**
+	 * Announce that this object is going to be used as an index. This
+	 * operation checks that the object supports indexing operations and
+	 * installs appropriate dt_index_operations vector on success.
+	 *
+	 * Also probes for features. Operation is successful if all required
+	 * features are supported.
+	 */
+	int   (*do_index_try)(const struct lu_env *env,
+			      struct dt_object *dt,
+			      const struct dt_index_features *feat);
+	/**
+	 * Add nlink of the object
+	 * precondition: dt_object_exists(dt);
+	 */
+	int   (*do_declare_ref_add)(const struct lu_env *env,
+				    struct dt_object *dt, struct thandle *th);
+	int   (*do_ref_add)(const struct lu_env *env,
+			    struct dt_object *dt, struct thandle *th);
+	/**
+	 * Del nlink of the object
+	 * precondition: dt_object_exists(dt);
+	 */
+	int   (*do_declare_ref_del)(const struct lu_env *env,
+				    struct dt_object *dt, struct thandle *th);
+	int   (*do_ref_del)(const struct lu_env *env,
+			    struct dt_object *dt, struct thandle *th);
+
+	struct obd_capa *(*do_capa_get)(const struct lu_env *env,
+					struct dt_object *dt,
+					struct lustre_capa *old,
+					__u64 opc);
+	int (*do_object_sync)(const struct lu_env *, struct dt_object *);
+	/**
+	 * Get object info of next level. Currently, only get inode from osd.
+	 * This is only used by quota b=16542
+	 * precondition: dt_object_exists(dt);
+	 */
+	int (*do_data_get)(const struct lu_env *env, struct dt_object *dt,
+			   void **data);
+
+	/**
+	 * Lock object.
+	 */
+	int (*do_object_lock)(const struct lu_env *env, struct dt_object *dt,
+			      struct lustre_handle *lh,
+			      struct ldlm_enqueue_info *einfo,
+			      void *policy);
+};
+
+/**
+ * Per-dt-object operations on "file body". A pointer to this vector is kept
+ * in dt_object::do_body_ops.
+ */
+struct dt_body_operations {
+	/**
+	 * precondition: dt_object_exists(dt);
+	 */
+	ssize_t (*dbo_read)(const struct lu_env *env, struct dt_object *dt,
+			    struct lu_buf *buf, loff_t *pos,
+			    struct lustre_capa *capa);
+	/**
+	 * precondition: dt_object_exists(dt);
+	 */
+	ssize_t (*dbo_declare_write)(const struct lu_env *env,
+				     struct dt_object *dt,
+				     const loff_t size, loff_t pos,
+				     struct thandle *handle);
+	ssize_t (*dbo_write)(const struct lu_env *env, struct dt_object *dt,
+			     const struct lu_buf *buf, loff_t *pos,
+			     struct thandle *handle, struct lustre_capa *capa,
+			     int ignore_quota);
+	/*
+	 * methods for zero-copy IO
+	 */
+
+	/*
+	 * precondition: dt_object_exists(dt);
+	 * returns:
+	 * < 0 - error code
+	 * = 0 - illegal
+	 * > 0 - number of local buffers prepared
+	 */
+	int (*dbo_bufs_get)(const struct lu_env *env, struct dt_object *dt,
+			    loff_t pos, ssize_t len, struct niobuf_local *lb,
+			    int rw, struct lustre_capa *capa);
+	/*
+	 * precondition: dt_object_exists(dt);
+	 */
+	int (*dbo_bufs_put)(const struct lu_env *env, struct dt_object *dt,
+			    struct niobuf_local *lb, int nr);
+	/*
+	 * precondition: dt_object_exists(dt);
+	 */
+	int (*dbo_write_prep)(const struct lu_env *env, struct dt_object *dt,
+			      struct niobuf_local *lb, int nr);
+	/*
+	 * precondition: dt_object_exists(dt);
+	 */
+	int (*dbo_declare_write_commit)(const struct lu_env *env,
+					struct dt_object *dt,
+					struct niobuf_local *,
+					int, struct thandle *);
+	/*
+	 * precondition: dt_object_exists(dt);
+	 */
+	int (*dbo_write_commit)(const struct lu_env *env, struct dt_object *dt,
+				struct niobuf_local *, int, struct thandle *);
+	/*
+	 * precondition: dt_object_exists(dt);
+	 */
+	int (*dbo_read_prep)(const struct lu_env *env, struct dt_object *dt,
+			     struct niobuf_local *lnb, int nr);
+	int (*dbo_fiemap_get)(const struct lu_env *env, struct dt_object *dt,
+			      struct ll_user_fiemap *fm);
+	/**
+	 * Punch object's content
+	 * precondition: regular object, not index
+	 */
+	int   (*dbo_declare_punch)(const struct lu_env *, struct dt_object *,
+				  __u64, __u64, struct thandle *th);
+	int   (*dbo_punch)(const struct lu_env *env, struct dt_object *dt,
+			  __u64 start, __u64 end, struct thandle *th,
+			  struct lustre_capa *capa);
+};
+
+/**
+ * Incomplete type of index record.
+ */
+struct dt_rec;
+
+/**
+ * Incomplete type of index key.
+ */
+struct dt_key;
+
+/**
+ * Incomplete type of dt iterator.
+ */
+struct dt_it;
+
+/**
+ * Per-dt-object operations on object as index. A pointer to this vector is
+ * kept in dt_object::do_index_ops.
+ */
+struct dt_index_operations {
+	/**
+	 * precondition: dt_object_exists(dt);
+	 */
+	int (*dio_lookup)(const struct lu_env *env, struct dt_object *dt,
+			  struct dt_rec *rec, const struct dt_key *key,
+			  struct lustre_capa *capa);
+	/**
+	 * precondition: dt_object_exists(dt);
+	 */
+	int (*dio_declare_insert)(const struct lu_env *env,
+				  struct dt_object *dt,
+				  const struct dt_rec *rec,
+				  const struct dt_key *key,
+				  struct thandle *handle);
+	int (*dio_insert)(const struct lu_env *env, struct dt_object *dt,
+			  const struct dt_rec *rec, const struct dt_key *key,
+			  struct thandle *handle, struct lustre_capa *capa,
+			  int ignore_quota);
+	/**
+	 * precondition: dt_object_exists(dt);
+	 */
+	int (*dio_declare_delete)(const struct lu_env *env,
+				  struct dt_object *dt,
+				  const struct dt_key *key,
+				  struct thandle *handle);
+	int (*dio_delete)(const struct lu_env *env, struct dt_object *dt,
+			  const struct dt_key *key, struct thandle *handle,
+			  struct lustre_capa *capa);
+	/**
+	 * Iterator interface, embedded below as the dio_it member.
+	 */
+	struct dt_it_ops {
+		/**
+		 * Allocate and initialize new iterator.
+		 *
+		 * precondition: dt_object_exists(dt);
+		 */
+		struct dt_it *(*init)(const struct lu_env *env,
+				      struct dt_object *dt,
+				      __u32 attr,
+				      struct lustre_capa *capa);
+		void	  (*fini)(const struct lu_env *env,
+				      struct dt_it *di);
+		int	    (*get)(const struct lu_env *env,
+				      struct dt_it *di,
+				      const struct dt_key *key);
+		void	   (*put)(const struct lu_env *env,
+				      struct dt_it *di);
+		int	   (*next)(const struct lu_env *env,
+				      struct dt_it *di);
+		struct dt_key *(*key)(const struct lu_env *env,
+				      const struct dt_it *di);
+		int       (*key_size)(const struct lu_env *env,
+				      const struct dt_it *di);
+		int	    (*rec)(const struct lu_env *env,
+				      const struct dt_it *di,
+				      struct dt_rec *rec,
+				      __u32 attr);
+		__u64	(*store)(const struct lu_env *env,
+				      const struct dt_it *di);
+		int	   (*load)(const struct lu_env *env,
+				      const struct dt_it *di, __u64 hash);
+		int	(*key_rec)(const struct lu_env *env,
+				      const struct dt_it *di, void* key_rec);
+	} dio_it;
+};
+
+enum dt_otable_it_valid {
+	DOIV_ERROR_HANDLE	= 0x0001,
+};
+
+enum dt_otable_it_flags {
+	/* Exit on failure. */
+	DOIF_FAILOUT	= 0x0001,
+
+	/* Reset iteration position to the device beginning. */
+	DOIF_RESET	= 0x0002,
+
+	/* An upper-layer component uses the iteration. */
+	DOIF_OUTUSED	= 0x0004,
+};
+
+/* otable based iteration needs to use the common DT iteration APIs.
+ * To initialize the iteration, it needs to call dio_it::init() first.
+ * Here is how the otable based iteration should prepare arguments to
+ * call dt_it_ops::init().
+ *
+ * For otable based iteration, the 32-bits 'attr' for dt_it_ops::init()
+ * is composed of two parts:
+ * low 16-bits is for valid bits, high 16-bits is for flags bits. */
+#define DT_OTABLE_IT_FLAGS_SHIFT	16
+#define DT_OTABLE_IT_FLAGS_MASK 	0xffff0000
+
+struct dt_device {
+	struct lu_device		   dd_lu_dev;
+	const struct dt_device_operations *dd_ops;
+
+	/**
+	 * List of dt_txn_callback (see below). This is not protected in any
+	 * way, because callbacks are supposed to be added/deleted only during
+	 * single-threaded start-up and shut-down procedures.
+	 */
+	struct list_head			 dd_txn_callbacks;
+};
+
+int  dt_device_init(struct dt_device *dev, struct lu_device_type *t);
+void dt_device_fini(struct dt_device *dev);
+
+/**
+ * Check whether \a d is a dt device, i.e. whether its device type carries
+ * the LU_DEVICE_DT tag. The tag test is only the consequent of the ergo()
+ * whose antecedent is d != NULL.
+ */
+static inline int lu_device_is_dt(const struct lu_device *d)
+{
+	return ergo(d != NULL,
+		    (d->ld_type->ldt_tags & LU_DEVICE_DT) != 0);
+}
+
+/**
+ * Convert \a l into the dt_device that embeds it as dd_lu_dev.
+ * Asserts lu_device_is_dt(l).
+ */
+static inline struct dt_device *lu2dt_dev(struct lu_device *l)
+{
+	struct dt_device *dev;
+
+	LASSERT(lu_device_is_dt(l));
+	dev = container_of0(l, struct dt_device, dd_lu_dev);
+	return dev;
+}
+
+struct dt_object {
+	struct lu_object		   do_lu; /* embedded lu_object, see lu2dt() */
+	const struct dt_object_operations *do_ops;
+	const struct dt_body_operations   *do_body_ops;
+	const struct dt_index_operations  *do_index_ops;
+};
+
+/*
+ * In-core representation of per-device local object OID storage
+ */
+struct local_oid_storage {
+	/* all initialized llog systems on this node are linked by this */
+	struct list_head	  los_list;
+
+	/* number of handle references held on this los */
+	atomic_t	  los_refcount;
+	struct dt_device *los_dev;
+	struct dt_object *los_obj;
+
+	/* data used to generate new fids */
+	struct mutex	 los_id_lock;
+	__u64		  los_seq;
+	__u32		  los_last_oid;
+};
+
+/**
+ * Convert \a l into the dt_object that embeds it as do_lu.
+ *
+ * Asserts that \a l is NULL, an IS_ERR() value, or an object whose device
+ * passes lu_device_is_dt().
+ */
+static inline struct dt_object *lu2dt(struct lu_object *l)
+{
+	struct dt_object *obj;
+
+	LASSERT(l == NULL || IS_ERR(l) || lu_device_is_dt(l->lo_dev));
+	obj = container_of0(l, struct dt_object, do_lu);
+	return obj;
+}
+
+int  dt_object_init(struct dt_object *obj,
+		    struct lu_object_header *h, struct lu_device *d);
+
+void dt_object_fini(struct dt_object *obj);
+
+static inline int dt_object_exists(const struct dt_object *dt)
+{
+	return lu_object_exists(&dt->do_lu);
+}
+
+static inline int dt_object_remote(const struct dt_object *dt)
+{
+	return lu_object_remote(&dt->do_lu);
+}
+
+static inline struct dt_object *lu2dt_obj(struct lu_object *o)
+{
+	LASSERT(ergo(o != NULL, lu_device_is_dt(o->lo_dev)));
+	return container_of0(o, struct dt_object, do_lu);
+}
+
+/**
+ * This is the general purpose transaction handle.
+ * 1. Transaction Life Cycle
+ *      This transaction handle is allocated upon starting a new transaction,
+ *      and deallocated after this transaction is committed.
+ * 2. Transaction Nesting
+ *      We do _NOT_ support nested transaction. So, every thread should only
+ *      have one active transaction, and a transaction only belongs to one
+ *      thread. Due to this, transaction handle need no reference count.
+ * 3. Transaction & dt_object locking
+ *      dt_object locks should be taken inside transaction.
+ * 4. Transaction & RPC
+ *      No RPC request should be issued inside transaction.
+ */
+struct thandle {
+	/** the dt device on which the transactions are executed */
+	struct dt_device *th_dev;
+
+	/** context for this transaction, tag is LCT_TX_HANDLE */
+	struct lu_context th_ctx;
+
+	/** additional tags (layers can add in declare) */
+	__u32	     th_tags;
+
+	/** the last operation result in this transaction.
+	 * this value is used in recovery */
+	__s32	     th_result;
+
+	/** whether we need sync commit */
+	unsigned int		th_sync:1;
+
+	/* local transaction, no need to inform other layers */
+	unsigned int		th_local:1;
+
+	/* In DNE, one transaction can be disassembled into
+	 * updates on several different MDTs, and these updates
+	 * will be attached to th_remote_update_list per target.
+	 * Only a single thread will access the list, so no lock is needed.
+	 */
+	struct list_head		th_remote_update_list;
+	struct update_request	*th_current_request;
+};
+
+/**
+ * Transaction call-backs.
+ *
+ * These are invoked by osd (or underlying transaction engine) when
+ * transaction changes state.
+ *
+ * Call-backs are used by upper layers to modify transaction parameters and to
+ * perform some actions on each transaction state transition. Typical
+ * example is mdt registering call-back to write into last-received file
+ * before each transaction commit.
+ */
+struct dt_txn_callback {
+	int (*dtc_txn_start)(const struct lu_env *env,
+			     struct thandle *txn, void *cookie);
+	int (*dtc_txn_stop)(const struct lu_env *env,
+			    struct thandle *txn, void *cookie);
+	void (*dtc_txn_commit)(struct thandle *txn, void *cookie);
+	void		*dtc_cookie;
+	__u32		dtc_tag;
+	struct list_head	   dtc_linkage;
+};
+
+void dt_txn_callback_add(struct dt_device *dev, struct dt_txn_callback *cb);
+void dt_txn_callback_del(struct dt_device *dev, struct dt_txn_callback *cb);
+
+int dt_txn_hook_start(const struct lu_env *env,
+		      struct dt_device *dev, struct thandle *txn);
+int dt_txn_hook_stop(const struct lu_env *env, struct thandle *txn);
+void dt_txn_hook_commit(struct thandle *txn);
+
+int dt_try_as_dir(const struct lu_env *env, struct dt_object *obj);
+
+/**
+ * Callback function used for parsing path.
+ * \see llo_store_resolve
+ */
+typedef int (*dt_entry_func_t)(const struct lu_env *env,
+			    const char *name,
+			    void *pvt);
+
+#define DT_MAX_PATH 1024
+
+int dt_path_parser(const struct lu_env *env,
+		   char *local, dt_entry_func_t entry_func,
+		   void *data);
+
+struct dt_object *
+dt_store_resolve(const struct lu_env *env, struct dt_device *dt,
+		 const char *path, struct lu_fid *fid);
+
+struct dt_object *dt_store_open(const struct lu_env *env,
+				struct dt_device *dt,
+				const char *dirname,
+				const char *filename,
+				struct lu_fid *fid);
+
+struct dt_object *dt_find_or_create(const struct lu_env *env,
+				    struct dt_device *dt,
+				    const struct lu_fid *fid,
+				    struct dt_object_format *dof,
+				    struct lu_attr *attr);
+
+struct dt_object *dt_locate_at(const struct lu_env *env,
+			       struct dt_device *dev,
+			       const struct lu_fid *fid,
+			       struct lu_device *top_dev);
+/**
+ * Convenience wrapper around dt_locate_at() that passes the top device of
+ * the site \a dev belongs to as the top_dev argument.
+ */
+static inline struct dt_object *
+dt_locate(const struct lu_env *env, struct dt_device *dev,
+	  const struct lu_fid *fid)
+{
+	struct lu_device *top = dev->dd_lu_dev.ld_site->ls_top_dev;
+
+	return dt_locate_at(env, dev, fid, top);
+}
+
+
+int local_oid_storage_init(const struct lu_env *env, struct dt_device *dev,
+			   const struct lu_fid *first_fid,
+			   struct local_oid_storage **los);
+void local_oid_storage_fini(const struct lu_env *env,
+			    struct local_oid_storage *los);
+int local_object_fid_generate(const struct lu_env *env,
+			      struct local_oid_storage *los,
+			      struct lu_fid *fid);
+int local_object_declare_create(const struct lu_env *env,
+				struct local_oid_storage *los,
+				struct dt_object *o,
+				struct lu_attr *attr,
+				struct dt_object_format *dof,
+				struct thandle *th);
+int local_object_create(const struct lu_env *env,
+			struct local_oid_storage *los,
+			struct dt_object *o,
+			struct lu_attr *attr, struct dt_object_format *dof,
+			struct thandle *th);
+struct dt_object *local_file_find_or_create(const struct lu_env *env,
+					    struct local_oid_storage *los,
+					    struct dt_object *parent,
+					    const char *name, __u32 mode);
+struct dt_object *local_file_find_or_create_with_fid(const struct lu_env *env,
+						     struct dt_device *dt,
+						     const struct lu_fid *fid,
+						     struct dt_object *parent,
+						     const char *name,
+						     __u32 mode);
+struct dt_object *
+local_index_find_or_create(const struct lu_env *env,
+			   struct local_oid_storage *los,
+			   struct dt_object *parent,
+			   const char *name, __u32 mode,
+			   const struct dt_index_features *ft);
+struct dt_object *
+local_index_find_or_create_with_fid(const struct lu_env *env,
+				    struct dt_device *dt,
+				    const struct lu_fid *fid,
+				    struct dt_object *parent,
+				    const char *name, __u32 mode,
+				    const struct dt_index_features *ft);
+int local_object_unlink(const struct lu_env *env, struct dt_device *dt,
+			struct dt_object *parent, const char *name);
+
+/**
+ * Forward a lock request to the object's do_object_lock method.
+ * All arguments are passed through unchanged; the object, its operation
+ * table and the method itself are asserted to be non-NULL first.
+ */
+static inline int dt_object_lock(const struct lu_env *env,
+				 struct dt_object *o, struct lustre_handle *lh,
+				 struct ldlm_enqueue_info *einfo,
+				 void *policy)
+{
+	const struct dt_object_operations *ops;
+
+	LASSERT(o);
+	LASSERT(o->do_ops);
+	LASSERT(o->do_ops->do_object_lock);
+
+	ops = o->do_ops;
+	return ops->do_object_lock(env, o, lh, einfo, policy);
+}
+
+int dt_lookup_dir(const struct lu_env *env, struct dt_object *dir,
+		  const char *name, struct lu_fid *fid);
+
+/**
+ * Forward to the object's do_object_sync method after asserting that the
+ * object, its operation table and the method are all non-NULL.
+ */
+static inline int dt_object_sync(const struct lu_env *env,
+				 struct dt_object *o)
+{
+	const struct dt_object_operations *ops;
+
+	LASSERT(o);
+	LASSERT(o->do_ops);
+	LASSERT(o->do_ops->do_object_sync);
+
+	ops = o->do_ops;
+	return ops->do_object_sync(env, o);
+}
+
+int dt_declare_version_set(const struct lu_env *env, struct dt_object *o,
+			   struct thandle *th);
+void dt_version_set(const struct lu_env *env, struct dt_object *o,
+		    dt_obj_version_t version, struct thandle *th);
+dt_obj_version_t dt_version_get(const struct lu_env *env, struct dt_object *o);
+
+
+int dt_read(const struct lu_env *env, struct dt_object *dt,
+	    struct lu_buf *buf, loff_t *pos);
+int dt_record_read(const struct lu_env *env, struct dt_object *dt,
+		   struct lu_buf *buf, loff_t *pos);
+int dt_record_write(const struct lu_env *env, struct dt_object *dt,
+		    const struct lu_buf *buf, loff_t *pos, struct thandle *th);
+typedef int (*dt_index_page_build_t)(const struct lu_env *env,
+				     union lu_page *lp, int nob,
+				     const struct dt_it_ops *iops,
+				     struct dt_it *it, __u32 attr, void *arg);
+int dt_index_walk(const struct lu_env *env, struct dt_object *obj,
+		  const struct lu_rdpg *rdpg, dt_index_page_build_t filler,
+		  void *arg);
+int dt_index_read(const struct lu_env *env, struct dt_device *dev,
+		  struct idx_info *ii, const struct lu_rdpg *rdpg);
+
+static inline struct thandle *dt_trans_create(const struct lu_env *env,
+					      struct dt_device *d)
+{
+	LASSERT(d->dd_ops->dt_trans_create);
+	return d->dd_ops->dt_trans_create(env, d);
+}
+
+static inline int dt_trans_start(const struct lu_env *env,
+				 struct dt_device *d, struct thandle *th)
+{
+	LASSERT(d->dd_ops->dt_trans_start);
+	return d->dd_ops->dt_trans_start(env, d, th);
+}
+
+/* for this transaction hooks shouldn't be called */
+static inline int dt_trans_start_local(const struct lu_env *env,
+				       struct dt_device *d, struct thandle *th)
+{
+	const struct dt_device_operations *ops = d->dd_ops;
+
+	LASSERT(d->dd_ops->dt_trans_start);
+	/* flag the handle as local before handing it to dt_trans_start */
+	th->th_local = 1;
+	return ops->dt_trans_start(env, d, th);
+}
+
+static inline int dt_trans_stop(const struct lu_env *env,
+				struct dt_device *d, struct thandle *th)
+{
+	LASSERT(d->dd_ops->dt_trans_stop);
+	return d->dd_ops->dt_trans_stop(env, th);
+}
+
+/**
+ * Stamp \a dcb with TRANS_COMMIT_CB_MAGIC and register it through the
+ * dt_trans_cb_add method of the device the transaction runs on.
+ */
+static inline int dt_trans_cb_add(struct thandle *th,
+				  struct dt_txn_commit_cb *dcb)
+{
+	struct dt_device *dev = th->th_dev;
+
+	LASSERT(th->th_dev->dd_ops->dt_trans_cb_add);
+	dcb->dcb_magic = TRANS_COMMIT_CB_MAGIC;
+	return dev->dd_ops->dt_trans_cb_add(th, dcb);
+}
+/** @} dt */
+
+
+/**
+ * Declare a record write of \a size bytes at \a pos by forwarding to the
+ * object's dbo_declare_write body method. The object, the transaction
+ * handle, the body operation table and the method are asserted first.
+ */
+static inline int dt_declare_record_write(const struct lu_env *env,
+					  struct dt_object *dt,
+					  int size, loff_t pos,
+					  struct thandle *th)
+{
+	LASSERTF(dt != NULL, "dt is NULL when we want to write record\n");
+	LASSERT(th != NULL);
+	LASSERT(dt->do_body_ops);
+	LASSERT(dt->do_body_ops->dbo_declare_write);
+
+	return dt->do_body_ops->dbo_declare_write(env, dt, size, pos, th);
+}
+
+static inline int dt_declare_create(const struct lu_env *env,
+				    struct dt_object *dt,
+				    struct lu_attr *attr,
+				    struct dt_allocation_hint *hint,
+				    struct dt_object_format *dof,
+				    struct thandle *th)
+{
+	LASSERT(dt);
+	LASSERT(dt->do_ops);
+	LASSERT(dt->do_ops->do_declare_create);
+	return dt->do_ops->do_declare_create(env, dt, attr, hint, dof, th);
+}
+
+static inline int dt_create(const struct lu_env *env,
+				    struct dt_object *dt,
+				    struct lu_attr *attr,
+				    struct dt_allocation_hint *hint,
+				    struct dt_object_format *dof,
+				    struct thandle *th)
+{
+	LASSERT(dt);
+	LASSERT(dt->do_ops);
+	LASSERT(dt->do_ops->do_create);
+	return dt->do_ops->do_create(env, dt, attr, hint, dof, th);
+}
+
+static inline int dt_declare_destroy(const struct lu_env *env,
+				     struct dt_object *dt,
+				     struct thandle *th)
+{
+	LASSERT(dt);
+	LASSERT(dt->do_ops);
+	LASSERT(dt->do_ops->do_declare_destroy);
+	return dt->do_ops->do_declare_destroy(env, dt, th);
+}
+
+static inline int dt_destroy(const struct lu_env *env,
+			     struct dt_object *dt,
+			     struct thandle *th)
+{
+	LASSERT(dt);
+	LASSERT(dt->do_ops);
+	LASSERT(dt->do_ops->do_destroy);
+	return dt->do_ops->do_destroy(env, dt, th);
+}
+
+static inline void dt_read_lock(const struct lu_env *env,
+				struct dt_object *dt,
+				unsigned role)
+{
+	LASSERT(dt);
+	LASSERT(dt->do_ops);
+	LASSERT(dt->do_ops->do_read_lock);
+	dt->do_ops->do_read_lock(env, dt, role);
+}
+
+static inline void dt_write_lock(const struct lu_env *env,
+				struct dt_object *dt,
+				unsigned role)
+{
+	LASSERT(dt);
+	LASSERT(dt->do_ops);
+	LASSERT(dt->do_ops->do_write_lock);
+	dt->do_ops->do_write_lock(env, dt, role);
+}
+
+static inline void dt_read_unlock(const struct lu_env *env,
+				struct dt_object *dt)
+{
+	LASSERT(dt);
+	LASSERT(dt->do_ops);
+	LASSERT(dt->do_ops->do_read_unlock);
+	dt->do_ops->do_read_unlock(env, dt);
+}
+
+static inline void dt_write_unlock(const struct lu_env *env,
+				struct dt_object *dt)
+{
+	LASSERT(dt);
+	LASSERT(dt->do_ops);
+	LASSERT(dt->do_ops->do_write_unlock);
+	dt->do_ops->do_write_unlock(env, dt);
+}
+
+static inline int dt_write_locked(const struct lu_env *env,
+				  struct dt_object *dt)
+{
+	LASSERT(dt);
+	LASSERT(dt->do_ops);
+	LASSERT(dt->do_ops->do_write_locked);
+	return dt->do_ops->do_write_locked(env, dt);
+}
+
+static inline int dt_attr_get(const struct lu_env *env, struct dt_object *dt,
+			      struct lu_attr *la, void *arg)
+{
+	LASSERT(dt);
+	LASSERT(dt->do_ops);
+	LASSERT(dt->do_ops->do_attr_get);
+	return dt->do_ops->do_attr_get(env, dt, la, arg);
+}
+
+static inline int dt_declare_attr_set(const struct lu_env *env,
+				      struct dt_object *dt,
+				      const struct lu_attr *la,
+				      struct thandle *th)
+{
+	LASSERT(dt);
+	LASSERT(dt->do_ops);
+	LASSERT(dt->do_ops->do_declare_attr_set);
+	return dt->do_ops->do_declare_attr_set(env, dt, la, th);
+}
+
+static inline int dt_attr_set(const struct lu_env *env, struct dt_object *dt,
+			      const struct lu_attr *la, struct thandle *th,
+			      struct lustre_capa *capa)
+{
+	LASSERT(dt);
+	LASSERT(dt->do_ops);
+	LASSERT(dt->do_ops->do_attr_set);
+	return dt->do_ops->do_attr_set(env, dt, la, th, capa);
+}
+
+static inline int dt_declare_ref_add(const struct lu_env *env,
+				     struct dt_object *dt, struct thandle *th)
+{
+	LASSERT(dt);
+	LASSERT(dt->do_ops);
+	LASSERT(dt->do_ops->do_declare_ref_add);
+	return dt->do_ops->do_declare_ref_add(env, dt, th);
+}
+
+static inline int dt_ref_add(const struct lu_env *env,
+			     struct dt_object *dt, struct thandle *th)
+{
+	LASSERT(dt);
+	LASSERT(dt->do_ops);
+	LASSERT(dt->do_ops->do_ref_add);
+	return dt->do_ops->do_ref_add(env, dt, th);
+}
+
+static inline int dt_declare_ref_del(const struct lu_env *env,
+				     struct dt_object *dt, struct thandle *th)
+{
+	LASSERT(dt);
+	LASSERT(dt->do_ops);
+	LASSERT(dt->do_ops->do_declare_ref_del);
+	return dt->do_ops->do_declare_ref_del(env, dt, th);
+}
+
+static inline int dt_ref_del(const struct lu_env *env,
+			     struct dt_object *dt, struct thandle *th)
+{
+	LASSERT(dt);
+	LASSERT(dt->do_ops);
+	LASSERT(dt->do_ops->do_ref_del);
+	return dt->do_ops->do_ref_del(env, dt, th);
+}
+
+/**
+ * Forward to the object's do_capa_get method.
+ *
+ * \param env  execution environment, passed through to the method
+ * \param dt   object to operate on; asserted non-NULL with do_ops set
+ * \param old  passed through unchanged to do_capa_get
+ * \param opc  passed through unchanged to do_capa_get
+ *
+ * \return whatever do_capa_get returns
+ */
+static inline struct obd_capa *dt_capa_get(const struct lu_env *env,
+					   struct dt_object *dt,
+					   struct lustre_capa *old, __u64 opc)
+{
+	LASSERT(dt);
+	LASSERT(dt->do_ops);
+	/* assert the method that is actually invoked below */
+	LASSERT(dt->do_ops->do_capa_get);
+	return dt->do_ops->do_capa_get(env, dt, old, opc);
+}
+
+static inline int dt_bufs_get(const struct lu_env *env, struct dt_object *d,
+			      struct niobuf_remote *rnb,
+			      struct niobuf_local *lnb, int rw,
+			      struct lustre_capa *capa)
+{
+	LASSERT(d);
+	LASSERT(d->do_body_ops);
+	LASSERT(d->do_body_ops->dbo_bufs_get);
+	return d->do_body_ops->dbo_bufs_get(env, d, rnb->offset,
+					    rnb->len, lnb, rw, capa);
+}
+
+static inline int dt_bufs_put(const struct lu_env *env, struct dt_object *d,
+			      struct niobuf_local *lnb, int n)
+{
+	LASSERT(d);
+	LASSERT(d->do_body_ops);
+	LASSERT(d->do_body_ops->dbo_bufs_put);
+	return d->do_body_ops->dbo_bufs_put(env, d, lnb, n);
+}
+
+static inline int dt_write_prep(const struct lu_env *env, struct dt_object *d,
+				struct niobuf_local *lnb, int n)
+{
+	LASSERT(d);
+	LASSERT(d->do_body_ops);
+	LASSERT(d->do_body_ops->dbo_write_prep);
+	return d->do_body_ops->dbo_write_prep(env, d, lnb, n);
+}
+
+/**
+ * Declare a write commit by forwarding to the object's
+ * dbo_declare_write_commit body method.
+ *
+ * \param env  execution environment, passed through to the method
+ * \param d    object to operate on; asserted non-NULL
+ * \param lnb  passed through unchanged to the method
+ * \param n    passed through unchanged to the method
+ * \param th   transaction handle; asserted non-NULL
+ *
+ * \return whatever dbo_declare_write_commit returns
+ */
+static inline int dt_declare_write_commit(const struct lu_env *env,
+					  struct dt_object *d,
+					  struct niobuf_local *lnb,
+					  int n, struct thandle *th)
+{
+	LASSERTF(d != NULL, "dt is NULL when we want to declare write\n");
+	LASSERT(th != NULL);
+	/* check the operation table and method like the sibling wrappers */
+	LASSERT(d->do_body_ops);
+	LASSERT(d->do_body_ops->dbo_declare_write_commit);
+	return d->do_body_ops->dbo_declare_write_commit(env, d, lnb, n, th);
+}
+
+
+static inline int dt_write_commit(const struct lu_env *env,
+				  struct dt_object *d, struct niobuf_local *lnb,
+				  int n, struct thandle *th)
+{
+	LASSERT(d);
+	LASSERT(d->do_body_ops);
+	LASSERT(d->do_body_ops->dbo_write_commit);
+	return d->do_body_ops->dbo_write_commit(env, d, lnb, n, th);
+}
+
+static inline int dt_read_prep(const struct lu_env *env, struct dt_object *d,
+			       struct niobuf_local *lnb, int n)
+{
+	LASSERT(d);
+	LASSERT(d->do_body_ops);
+	LASSERT(d->do_body_ops->dbo_read_prep);
+	return d->do_body_ops->dbo_read_prep(env, d, lnb, n);
+}
+
+static inline int dt_declare_punch(const struct lu_env *env,
+				   struct dt_object *dt, __u64 start,
+				   __u64 end, struct thandle *th)
+{
+	LASSERT(dt);
+	LASSERT(dt->do_body_ops);
+	LASSERT(dt->do_body_ops->dbo_declare_punch);
+	return dt->do_body_ops->dbo_declare_punch(env, dt, start, end, th);
+}
+
+static inline int dt_punch(const struct lu_env *env, struct dt_object *dt,
+			   __u64 start, __u64 end, struct thandle *th,
+			   struct lustre_capa *capa)
+{
+	LASSERT(dt);
+	LASSERT(dt->do_body_ops);
+	LASSERT(dt->do_body_ops->dbo_punch);
+	return dt->do_body_ops->dbo_punch(env, dt, start, end, th, capa);
+}
+
+/**
+ * Forward to the object's dbo_fiemap_get body method.
+ *
+ * \return -EPROTO if the object has no body operations, -EOPNOTSUPP if
+ *	   the method is not provided, otherwise the method's result
+ */
+static inline int dt_fiemap_get(const struct lu_env *env, struct dt_object *d,
+				struct ll_user_fiemap *fm)
+{
+	const struct dt_body_operations *body_ops;
+
+	LASSERT(d);
+	body_ops = d->do_body_ops;
+	if (body_ops == NULL)
+		return -EPROTO;
+	if (body_ops->dbo_fiemap_get == NULL)
+		return -EOPNOTSUPP;
+	return body_ops->dbo_fiemap_get(env, d, fm);
+}
+
+static inline int dt_statfs(const struct lu_env *env, struct dt_device *dev,
+			    struct obd_statfs *osfs)
+{
+	LASSERT(dev);
+	LASSERT(dev->dd_ops);
+	LASSERT(dev->dd_ops->dt_statfs);
+	return dev->dd_ops->dt_statfs(env, dev, osfs);
+}
+
+static inline int dt_root_get(const struct lu_env *env, struct dt_device *dev,
+			      struct lu_fid *f)
+{
+	LASSERT(dev);
+	LASSERT(dev->dd_ops);
+	LASSERT(dev->dd_ops->dt_root_get);
+	return dev->dd_ops->dt_root_get(env, dev, f);
+}
+
+/**
+ * Forward to the device's dt_conf_get method.
+ *
+ * \param env    execution environment, passed through to the method
+ * \param dev    device to query; asserted non-NULL with dd_ops set
+ * \param param  passed through unchanged to the method
+ */
+static inline void dt_conf_get(const struct lu_env *env,
+			       const struct dt_device *dev,
+			       struct dt_device_param *param)
+{
+	LASSERT(dev);
+	LASSERT(dev->dd_ops);
+	LASSERT(dev->dd_ops->dt_conf_get);
+	/* void function: call the method without 'return <expr>' */
+	dev->dd_ops->dt_conf_get(env, dev, param);
+}
+
+static inline int dt_sync(const struct lu_env *env, struct dt_device *dev)
+{
+	LASSERT(dev);
+	LASSERT(dev->dd_ops);
+	LASSERT(dev->dd_ops->dt_sync);
+	return dev->dd_ops->dt_sync(env, dev);
+}
+
+static inline int dt_ro(const struct lu_env *env, struct dt_device *dev)
+{
+	LASSERT(dev);
+	LASSERT(dev->dd_ops);
+	LASSERT(dev->dd_ops->dt_ro);
+	return dev->dd_ops->dt_ro(env, dev);
+}
+
+static inline int dt_declare_insert(const struct lu_env *env,
+				    struct dt_object *dt,
+				    const struct dt_rec *rec,
+				    const struct dt_key *key,
+				    struct thandle *th)
+{
+	LASSERT(dt);
+	LASSERT(dt->do_index_ops);
+	LASSERT(dt->do_index_ops->dio_declare_insert);
+	return dt->do_index_ops->dio_declare_insert(env, dt, rec, key, th);
+}
+
+static inline int dt_insert(const struct lu_env *env,
+				    struct dt_object *dt,
+				    const struct dt_rec *rec,
+				    const struct dt_key *key,
+				    struct thandle *th,
+				    struct lustre_capa *capa,
+				    int noquota)
+{
+	LASSERT(dt);
+	LASSERT(dt->do_index_ops);
+	LASSERT(dt->do_index_ops->dio_insert);
+	return dt->do_index_ops->dio_insert(env, dt, rec, key, th,
+					    capa, noquota);
+}
+
+static inline int dt_declare_xattr_del(const struct lu_env *env,
+				       struct dt_object *dt,
+				       const char *name,
+				       struct thandle *th)
+{
+	LASSERT(dt);
+	LASSERT(dt->do_ops);
+	LASSERT(dt->do_ops->do_declare_xattr_del);
+	return dt->do_ops->do_declare_xattr_del(env, dt, name, th);
+}
+
+static inline int dt_xattr_del(const struct lu_env *env,
+			       struct dt_object *dt, const char *name,
+			       struct thandle *th,
+			       struct lustre_capa *capa)
+{
+	LASSERT(dt);
+	LASSERT(dt->do_ops);
+	LASSERT(dt->do_ops->do_xattr_del);
+	return dt->do_ops->do_xattr_del(env, dt, name, th, capa);
+}
+
+static inline int dt_declare_xattr_set(const struct lu_env *env,
+				      struct dt_object *dt,
+				      const struct lu_buf *buf,
+				      const char *name, int fl,
+				      struct thandle *th)
+{
+	LASSERT(dt);
+	LASSERT(dt->do_ops);
+	LASSERT(dt->do_ops->do_declare_xattr_set);
+	return dt->do_ops->do_declare_xattr_set(env, dt, buf, name, fl, th);
+}
+
+static inline int dt_xattr_set(const struct lu_env *env,
+			      struct dt_object *dt, const struct lu_buf *buf,
+			      const char *name, int fl, struct thandle *th,
+			      struct lustre_capa *capa)
+{
+	LASSERT(dt);
+	LASSERT(dt->do_ops);
+	LASSERT(dt->do_ops->do_xattr_set);
+	return dt->do_ops->do_xattr_set(env, dt, buf, name, fl, th, capa);
+}
+
+static inline int dt_xattr_get(const struct lu_env *env,
+			      struct dt_object *dt, struct lu_buf *buf,
+			      const char *name, struct lustre_capa *capa)
+{
+	LASSERT(dt);
+	LASSERT(dt->do_ops);
+	LASSERT(dt->do_ops->do_xattr_get);
+	return dt->do_ops->do_xattr_get(env, dt, buf, name, capa);
+}
+
+static inline int dt_xattr_list(const struct lu_env *env,
+			       struct dt_object *dt, struct lu_buf *buf,
+			       struct lustre_capa *capa)
+{
+	LASSERT(dt);
+	LASSERT(dt->do_ops);
+	LASSERT(dt->do_ops->do_xattr_list);
+	return dt->do_ops->do_xattr_list(env, dt, buf, capa);
+}
+
+static inline int dt_declare_delete(const struct lu_env *env,
+				    struct dt_object *dt,
+				    const struct dt_key *key,
+				    struct thandle *th)
+{
+	LASSERT(dt);
+	LASSERT(dt->do_index_ops);
+	LASSERT(dt->do_index_ops->dio_declare_delete);
+	return dt->do_index_ops->dio_declare_delete(env, dt, key, th);
+}
+
+static inline int dt_delete(const struct lu_env *env,
+			    struct dt_object *dt,
+			    const struct dt_key *key,
+			    struct thandle *th,
+			    struct lustre_capa *capa)
+{
+	LASSERT(dt);
+	LASSERT(dt->do_index_ops);
+	LASSERT(dt->do_index_ops->dio_delete);
+	return dt->do_index_ops->dio_delete(env, dt, key, th, capa);
+}
+
+static inline int dt_commit_async(const struct lu_env *env,
+				  struct dt_device *dev)
+{
+	LASSERT(dev);
+	LASSERT(dev->dd_ops);
+	LASSERT(dev->dd_ops->dt_commit_async);
+	return dev->dd_ops->dt_commit_async(env, dev);
+}
+
+static inline int dt_init_capa_ctxt(const struct lu_env *env,
+				    struct dt_device *dev,
+				    int mode, unsigned long timeout,
+				    __u32 alg, struct lustre_capa_key *keys)
+{
+	LASSERT(dev);
+	LASSERT(dev->dd_ops);
+	LASSERT(dev->dd_ops->dt_init_capa_ctxt);
+	return dev->dd_ops->dt_init_capa_ctxt(env, dev, mode,
+					      timeout, alg, keys);
+}
+
+/**
+ * Look up \a key through the object's dio_lookup index method and
+ * normalize the result.
+ *
+ * \return 0 if dio_lookup returned a positive value, -ENOENT if it
+ *	   returned 0, otherwise the negative value it returned
+ */
+static inline int dt_lookup(const struct lu_env *env,
+			    struct dt_object *dt,
+			    struct dt_rec *rec,
+			    const struct dt_key *key,
+			    struct lustre_capa *capa)
+{
+	int rc;
+
+	LASSERT(dt);
+	LASSERT(dt->do_index_ops);
+	LASSERT(dt->do_index_ops->dio_lookup);
+
+	rc = dt->do_index_ops->dio_lookup(env, dt, rec, key, capa);
+	if (rc == 0)
+		return -ENOENT;
+	return rc > 0 ? 0 : rc;
+}
+
+#define LU221_BAD_TIME (0x80000000U + 24 * 3600)
+
+struct dt_find_hint {
+	struct lu_fid	*dfh_fid;
+	struct dt_device     *dfh_dt;
+	struct dt_object     *dfh_o;
+};
+
+struct dt_thread_info {
+	char		     dti_buf[DT_MAX_PATH];
+	struct dt_find_hint      dti_dfh;
+	struct lu_attr	   dti_attr;
+	struct lu_fid	    dti_fid;
+	struct dt_object_format  dti_dof;
+	struct lustre_mdt_attrs  dti_lma;
+	struct lu_buf	    dti_lb;
+	loff_t		   dti_off;
+};
+
+extern struct lu_context_key dt_key;
+
+/**
+ * Return the dt_thread_info stored under dt_key in the context of \a env.
+ * The value is asserted to be non-NULL.
+ */
+static inline struct dt_thread_info *dt_info(const struct lu_env *env)
+{
+	struct dt_thread_info *dti = lu_context_key_get(&env->le_ctx, &dt_key);
+
+	LASSERT(dti);
+	return dti;
+}
+
+int dt_global_init(void);
+void dt_global_fini(void);
+
+# ifdef LPROCFS
+int lprocfs_dt_rd_blksize(char *page, char **start, off_t off,
+			  int count, int *eof, void *data);
+int lprocfs_dt_rd_kbytestotal(char *page, char **start, off_t off,
+			      int count, int *eof, void *data);
+int lprocfs_dt_rd_kbytesfree(char *page, char **start, off_t off,
+			     int count, int *eof, void *data);
+int lprocfs_dt_rd_kbytesavail(char *page, char **start, off_t off,
+			      int count, int *eof, void *data);
+int lprocfs_dt_rd_filestotal(char *page, char **start, off_t off,
+			     int count, int *eof, void *data);
+int lprocfs_dt_rd_filesfree(char *page, char **start, off_t off,
+			    int count, int *eof, void *data);
+# endif /* LPROCFS */
+
+#endif /* __LUSTRE_DT_OBJECT_H */

diff --git a/drivers/staging/lustre/lustre/include/interval_tree.h b/drivers/staging/lustre/lustre/include/interval_tree.h
new file mode 100644
index 0000000..dfdb8aa
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/interval_tree.h

@@ -0,0 +1,124 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/interval_tree.h
+ *
+ * Author: Huang Wei <huangwei@clusterfs.com>
+ * Author: Jay Xiong <jinshan.xiong@sun.com>
+ */
+
+#ifndef _INTERVAL_H__
+#define _INTERVAL_H__
+
+#include <linux/libcfs/libcfs.h>   /* LASSERT. */
+
+struct interval_node {
+	struct interval_node   *in_left;
+	struct interval_node   *in_right;
+	struct interval_node   *in_parent;
+	unsigned		in_color:1,
+				in_intree:1, /** set if the node is in tree */
+				in_res1:30;
+	__u8		    in_res2[4];  /** tags, 8-bytes aligned */
+	__u64		   in_max_high;
+	struct interval_node_extent {
+		__u64 start;
+		__u64 end;
+	} in_extent;
+};
+
+enum interval_iter {
+	INTERVAL_ITER_CONT = 1,
+	INTERVAL_ITER_STOP = 2
+};
+
+/* Return 1 if the in_intree flag of @node is set, 0 otherwise. */
+static inline int interval_is_intree(struct interval_node *node)
+{
+	/* in_intree is a one-bit unsigned field, so non-zero means 1 */
+	return node->in_intree != 0;
+}
+
+/* Return the start of the extent stored in @node. */
+static inline __u64 interval_low(struct interval_node *node)
+{
+	const struct interval_node_extent *ext = &node->in_extent;
+
+	return ext->start;
+}
+
+/* Return the end of the extent stored in @node. */
+static inline __u64 interval_high(struct interval_node *node)
+{
+	const struct interval_node_extent *ext = &node->in_extent;
+
+	return ext->end;
+}
+
+/*
+ * Store the extent [start, end] in @node and set in_max_high to @end.
+ * @start must not be greater than @end.
+ */
+static inline void interval_set(struct interval_node *node,
+				__u64 start, __u64 end)
+{
+	LASSERT(start <= end);
+
+	node->in_max_high = end;
+	node->in_extent.end = end;
+	node->in_extent.start = start;
+}
+
+/* Rules to write an interval callback.
+ *  - the callback returns INTERVAL_ITER_STOP when it thinks the iteration
+ *    should be stopped. It will then cause the iteration function to return
+ *    immediately with return value INTERVAL_ITER_STOP.
+ *  - callbacks for interval_iterate and interval_iterate_reverse: every
+ *    node in the tree will be passed as @node when the callback is called.
+ *  - callback for interval_search: only overlapping nodes will be passed as
+ *    @node when the callback is called.
+ */
+typedef enum interval_iter (*interval_callback_t)(struct interval_node *node,
+						  void *args);
+
+struct interval_node *interval_insert(struct interval_node *node,
+				      struct interval_node **root);
+void interval_erase(struct interval_node *node, struct interval_node **root);
+
+/* Search the extents in the tree and call @func for each overlapped
+ * extents. */
+enum interval_iter interval_search(struct interval_node *root,
+				   struct interval_node_extent *ex,
+				   interval_callback_t func, void *data);
+
+/* Iterate every node in the tree - by reverse order or regular order. */
+enum interval_iter interval_iterate(struct interval_node *root,
+				    interval_callback_t func, void *data);
+enum interval_iter interval_iterate_reverse(struct interval_node *root,
+				    interval_callback_t func,void *data);
+
+void interval_expand(struct interval_node *root,
+		     struct interval_node_extent *ext,
+		     struct interval_node_extent *limiter);
+int interval_is_overlapped(struct interval_node *root,
+			   struct interval_node_extent *ex);
+struct interval_node *interval_find(struct interval_node *root,
+				    struct interval_node_extent *ex);
+#endif

diff --git a/drivers/staging/lustre/lustre/include/ioctl.h b/drivers/staging/lustre/lustre/include/ioctl.h
new file mode 100644
index 0000000..227c261
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/ioctl.h

@@ -0,0 +1,106 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _IOWR
+
+/* On i386 and x86_64, _ASM_I386_IOCTL_H is defined by the kernel's ioctl.h,
+ * and on newer kernels this header is shared as _ASM_GENERIC_IOCTL_H.
+ *
+ * We can avoid any problems with the kernel header being included again by
+ * defining _ASM_I386_IOCTL_H here so that a later occurrence of <asm/ioctl.h>
+ * does not include the kernel's ioctl.h after this one. b=14746 */
+#define _ASM_I386_IOCTL_H
+#define _ASM_GENERIC_IOCTL_H
+
+/* ioctl command encoding: 32 bits total, command in lower 16 bits,
+ * size of the parameter structure in the lower 14 bits of the
+ * upper 16 bits.
+ * The size of the parameter structure is encoded in the ioctl request.
+ * The highest 2 bits are reserved for indicating the ``access mode''.
+ * NOTE: This limits the max parameter size to 16kB -1 !
+ */
+
+/*
+ * The following is for compatibility across the various Linux
+ * platforms.  The i386 ioctl numbering scheme doesn't really enforce
+ * a type field.  De facto, however, the top 8 bits of the lower 16
+ * bits are indeed used as a type field, so we might just as well make
+ * this explicit here.  Please be sure to use the decoding macros
+ * below from now on.
+ */
+#define _IOC_NRBITS     8
+#define _IOC_TYPEBITS   8
+#define _IOC_SIZEBITS   14
+#define _IOC_DIRBITS    2
+
+#define _IOC_NRMASK     ((1 << _IOC_NRBITS)-1)
+#define _IOC_TYPEMASK   ((1 << _IOC_TYPEBITS)-1)
+#define _IOC_SIZEMASK   ((1 << _IOC_SIZEBITS)-1)
+#define _IOC_DIRMASK    ((1 << _IOC_DIRBITS)-1)
+
+#define _IOC_NRSHIFT    0
+#define _IOC_TYPESHIFT  (_IOC_NRSHIFT+_IOC_NRBITS)
+#define _IOC_SIZESHIFT  (_IOC_TYPESHIFT+_IOC_TYPEBITS)
+#define _IOC_DIRSHIFT   (_IOC_SIZESHIFT+_IOC_SIZEBITS)
+
+/*
+ * Direction bits.
+ */
+#define _IOC_NONE       0U
+#define _IOC_WRITE      1U
+#define _IOC_READ       2U
+
+/* Combine direction, type, number and size fields into one ioctl number. */
+#define _IOC(dir,type,nr,size)			\
+	(((dir)  << _IOC_DIRSHIFT)  |		\
+	 ((type) << _IOC_TYPESHIFT) |		\
+	 ((nr)   << _IOC_NRSHIFT)   |		\
+	 ((size) << _IOC_SIZESHIFT))
+
+/* used to create numbers */
+#define _IO(type,nr)	    _IOC(_IOC_NONE,(type),(nr),0)
+#define _IOR(type,nr,size)      _IOC(_IOC_READ,(type),(nr),sizeof(size))
+#define _IOW(type,nr,size)      _IOC(_IOC_WRITE,(type),(nr),sizeof(size))
+#define _IOWR(type,nr,size)     _IOC(_IOC_READ|_IOC_WRITE,(type),(nr),sizeof(size))
+
+/* used to decode ioctl numbers.. */
+#define _IOC_DIR(nr)	    (((nr) >> _IOC_DIRSHIFT) & _IOC_DIRMASK)
+#define _IOC_TYPE(nr)	   (((nr) >> _IOC_TYPESHIFT) & _IOC_TYPEMASK)
+#define _IOC_NR(nr)	     (((nr) >> _IOC_NRSHIFT) & _IOC_NRMASK)
+#define _IOC_SIZE(nr)	   (((nr) >> _IOC_SIZESHIFT) & _IOC_SIZEMASK)
+
+/* ...and for the drivers/sound files... */
+
+#define IOC_IN	  (_IOC_WRITE << _IOC_DIRSHIFT)
+#define IOC_OUT	 (_IOC_READ << _IOC_DIRSHIFT)
+#define IOC_INOUT       ((_IOC_WRITE|_IOC_READ) << _IOC_DIRSHIFT)
+#define IOCSIZE_MASK    (_IOC_SIZEMASK << _IOC_SIZESHIFT)
+#define IOCSIZE_SHIFT   (_IOC_SIZESHIFT)
+
+#endif /* _IOWR */

diff --git a/drivers/staging/lustre/lustre/include/lclient.h b/drivers/staging/lustre/lustre/include/lclient.h
new file mode 100644
index 0000000..9d4011f
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lclient.h

@@ -0,0 +1,437 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Definitions shared between vvp and liblustre, and other clients in the
+ * future.
+ *
+ *   Author: Oleg Drokin <oleg.drokin@sun.com>
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#ifndef LCLIENT_H
+#define LCLIENT_H
+
+blkcnt_t dirty_cnt(struct inode *inode);
+
+int cl_glimpse_size0(struct inode *inode, int agl);
+int cl_glimpse_lock(const struct lu_env *env, struct cl_io *io,
+		    struct inode *inode, struct cl_object *clob, int agl);
+
+static inline int cl_glimpse_size(struct inode *inode)
+{
+	return cl_glimpse_size0(inode, 0);	/* agl argument = 0 */
+}
+
+static inline int cl_agl(struct inode *inode)
+{
+	return cl_glimpse_size0(inode, 1);	/* agl argument = 1 */
+}
+
+/**
+ * Locking policy for setattr.
+ */
+enum ccc_setattr_lock_type {
+	/** Locking is done by server */
+	SETATTR_NOLOCK,
+	/** Extent lock is enqueued */
+	SETATTR_EXTENT_LOCK,
+	/** Existing local extent lock is used */
+	SETATTR_MATCH_LOCK
+};
+
+
+/**
+ * IO state private to vvp or slp layers.
+ */
+struct ccc_io {
+	/** super class */
+	struct cl_io_slice     cui_cl;
+	struct cl_io_lock_link cui_link;
+	/**
+	 * I/O vector information to or from which read/write is going.
+	 */
+	struct iovec *cui_iov;
+	unsigned long cui_nrsegs;
+	/**
+	 * Total iov count for the remaining IO.
+	 */
+	unsigned long cui_tot_nrsegs;
+	/**
+	 * Old length for iov that was truncated partially.
+	 */
+	size_t cui_iov_olen;
+	/**
+	 * Total size of the remaining IO.
+	 */
+	size_t cui_tot_count;
+
+	union {
+		struct {
+			enum ccc_setattr_lock_type cui_local_lock;
+		} setattr;
+	} u;
+	/**
+	 * True iff io is processing glimpse right now.
+	 */
+	int		  cui_glimpse;
+	/**
+	 * Layout version when this IO is initialized
+	 */
+	__u32		cui_layout_gen;
+	/**
+	 * File descriptor against which IO is done.
+	 */
+	struct ll_file_data *cui_fd;
+	struct kiocb *cui_iocb;
+};
+
+/**
+ * True, if \a io is a normal io, False for other (sendfile, splice*).
+ * Must be implemented in arch-specific code.
+ */
+int cl_is_normalio(const struct lu_env *env, const struct cl_io *io);
+
+extern struct lu_context_key ccc_key;
+extern struct lu_context_key ccc_session_key;
+
+struct ccc_thread_info {	/* looked up via ccc_key, see ccc_env_info() */
+	struct cl_lock_descr cti_descr;
+	struct cl_io	 cti_io;	/* zeroed and returned by ccc_env_thread_io() */
+	struct cl_attr       cti_attr;	/* zeroed and returned by ccc_env_thread_attr() */
+};
+
+static inline struct ccc_thread_info *ccc_env_info(const struct lu_env *env)
+{
+	struct ccc_thread_info *info =
+		lu_context_key_get(&env->le_ctx, &ccc_key);
+
+	LASSERT(info != NULL);	/* the ccc_key value must be present */
+	return info;
+}
+
+static inline struct cl_attr *ccc_env_thread_attr(const struct lu_env *env)
+{
+	struct ccc_thread_info *info = ccc_env_info(env);
+	memset(&info->cti_attr, 0, sizeof(info->cti_attr));
+	return &info->cti_attr;	/* zeroed on every call */
+}
+
+static inline struct cl_io *ccc_env_thread_io(const struct lu_env *env)
+{
+	struct ccc_thread_info *info = ccc_env_info(env);
+	memset(&info->cti_io, 0, sizeof(info->cti_io));
+	return &info->cti_io;	/* zeroed on every call */
+}
+
+struct ccc_session {	/* looked up via ccc_session_key, see ccc_env_session() */
+	struct ccc_io cs_ios;	/* returned by ccc_env_io() */
+};
+
+static inline struct ccc_session *ccc_env_session(const struct lu_env *env)
+{
+	struct ccc_session *ses =
+		lu_context_key_get(env->le_ses, &ccc_session_key);
+
+	LASSERT(ses != NULL);	/* the ccc_session_key value must be present */
+	return ses;
+}
+
+static inline struct ccc_io *ccc_env_io(const struct lu_env *env)
+{
+	return &ccc_env_session(env)->cs_ios;	/* IO state embedded in the session */
+}
+
+/**
+ * ccc-private object state.
+ */
+struct ccc_object {
+	struct cl_object_header cob_header;
+	struct cl_object	cob_cl;
+	struct inode	   *cob_inode;
+
+	/**
+	 * A list of dirty pages pending IO in the cache. Used by
+	 * SOM. Protected by ll_inode_info::lli_lock.
+	 *
+	 * \see ccc_page::cpg_pending_linkage
+	 */
+	struct list_head	     cob_pending_list;
+
+	/**
+	 * Access to this counter is protected by inode->i_sem. Since the
+	 * lifetime of transient pages must be covered by the inode sem,
+	 * no additional lock needs to be held.
+	 */
+	int		     cob_transient_pages;
+	/**
+	 * Number of outstanding mmaps on this file.
+	 *
+	 * \see ll_vm_open(), ll_vm_close().
+	 */
+	atomic_t	    cob_mmap_cnt;
+
+	/**
+	 * Various flags.
+	 *
+	 * cob_discard_page_warned: when pages belonging to this object are
+	 * discarded because the client was evicted, some debug info is
+	 * printed. This flag is set while processing the first discarded
+	 * page, to avoid flooding the log for every further discarded page.
+	 *
+	 * \see ll_dirty_page_discard_warn.
+	 */
+	unsigned int		cob_discard_page_warned:1;
+};
+
+/**
+ * ccc-private page state.
+ */
+struct ccc_page {
+	struct cl_page_slice cpg_cl;	/* cl2ccc_page() maps this back via container_of */
+	int		  cpg_defer_uptodate;
+	int		  cpg_ra_used;
+	int		  cpg_write_queued;
+	/**
+	 * Non-empty iff this page is already counted in
+	 * ccc_object::cob_pending_list. Only used as a flag: never iterated,
+	 * only checked with list_empty(), but a list helps debugging.
+	 * NOTE(review): documented as protected by ccc_object::cob_pending_guard,
+	 * which struct ccc_object above does not declare -- confirm the lock.
+	 */
+	struct list_head	   cpg_pending_linkage;
+	/** VM page */
+	struct page	  *cpg_page;
+};
+
+static inline struct ccc_page *cl2ccc_page(const struct cl_page_slice *slice)
+{
+	return container_of(slice, struct ccc_page, cpg_cl);	/* slice is the cpg_cl member */
+}
+
+struct cl_page    *ccc_vmpage_page_transient(struct page *vmpage);
+
+struct ccc_device {
+	struct cl_device    cdv_cl;
+	struct super_block *cdv_sb;
+	struct cl_device   *cdv_next;
+};
+
+struct ccc_lock {
+	struct cl_lock_slice clk_cl;
+};
+
+struct ccc_req {
+	struct cl_req_slice  crq_cl;
+};
+
+void *ccc_key_init	(const struct lu_context *ctx,
+			   struct lu_context_key *key);
+void  ccc_key_fini	(const struct lu_context *ctx,
+			   struct lu_context_key *key, void *data);
+void *ccc_session_key_init(const struct lu_context *ctx,
+			   struct lu_context_key *key);
+void  ccc_session_key_fini(const struct lu_context *ctx,
+			   struct lu_context_key *key, void *data);
+
+int	      ccc_device_init  (const struct lu_env *env,
+				   struct lu_device *d,
+				   const char *name, struct lu_device *next);
+struct lu_device *ccc_device_fini (const struct lu_env *env,
+				   struct lu_device *d);
+struct lu_device *ccc_device_alloc(const struct lu_env *env,
+				   struct lu_device_type *t,
+				   struct lustre_cfg *cfg,
+				   const struct lu_device_operations *luops,
+				   const struct cl_device_operations *clops);
+struct lu_device *ccc_device_free (const struct lu_env *env,
+				   struct lu_device *d);
+struct lu_object *ccc_object_alloc(const struct lu_env *env,
+				   const struct lu_object_header *hdr,
+				   struct lu_device *dev,
+				   const struct cl_object_operations *clops,
+				   const struct lu_object_operations *luops);
+
+int ccc_req_init(const struct lu_env *env, struct cl_device *dev,
+		 struct cl_req *req);
+void ccc_umount(const struct lu_env *env, struct cl_device *dev);
+int ccc_global_init(struct lu_device_type *device_type);
+void ccc_global_fini(struct lu_device_type *device_type);
+int ccc_object_init0(const struct lu_env *env,struct ccc_object *vob,
+		     const struct cl_object_conf *conf);
+int ccc_object_init(const struct lu_env *env, struct lu_object *obj,
+		    const struct lu_object_conf *conf);
+void ccc_object_free(const struct lu_env *env, struct lu_object *obj);
+int ccc_lock_init(const struct lu_env *env, struct cl_object *obj,
+		  struct cl_lock *lock, const struct cl_io *io,
+		  const struct cl_lock_operations *lkops);
+int ccc_attr_set(const struct lu_env *env, struct cl_object *obj,
+		 const struct cl_attr *attr, unsigned valid);
+int ccc_object_glimpse(const struct lu_env *env,
+		       const struct cl_object *obj, struct ost_lvb *lvb);
+int ccc_conf_set(const struct lu_env *env, struct cl_object *obj,
+		 const struct cl_object_conf *conf);
+struct page *ccc_page_vmpage(const struct lu_env *env,
+			    const struct cl_page_slice *slice);
+int ccc_page_is_under_lock(const struct lu_env *env,
+			   const struct cl_page_slice *slice, struct cl_io *io);
+int ccc_fail(const struct lu_env *env, const struct cl_page_slice *slice);
+void ccc_transient_page_verify(const struct cl_page *page);
+int  ccc_transient_page_own(const struct lu_env *env,
+			    const struct cl_page_slice *slice,
+			    struct cl_io *io, int nonblock);
+void ccc_transient_page_assume(const struct lu_env *env,
+			       const struct cl_page_slice *slice,
+			       struct cl_io *io);
+void ccc_transient_page_unassume(const struct lu_env *env,
+				 const struct cl_page_slice *slice,
+				 struct cl_io *io);
+void ccc_transient_page_disown(const struct lu_env *env,
+			       const struct cl_page_slice *slice,
+			       struct cl_io *io);
+void ccc_transient_page_discard(const struct lu_env *env,
+				const struct cl_page_slice *slice,
+				struct cl_io *io);
+int ccc_transient_page_prep(const struct lu_env *env,
+			    const struct cl_page_slice *slice,
+			    struct cl_io *io);
+void ccc_lock_delete(const struct lu_env *env,
+		     const struct cl_lock_slice *slice);
+void ccc_lock_fini(const struct lu_env *env,struct cl_lock_slice *slice);
+int ccc_lock_enqueue(const struct lu_env *env,const struct cl_lock_slice *slice,
+		     struct cl_io *io, __u32 enqflags);
+int ccc_lock_unuse(const struct lu_env *env,const struct cl_lock_slice *slice);
+int ccc_lock_wait(const struct lu_env *env,const struct cl_lock_slice *slice);
+int ccc_lock_fits_into(const struct lu_env *env,
+		       const struct cl_lock_slice *slice,
+		       const struct cl_lock_descr *need,
+		       const struct cl_io *io);
+void ccc_lock_state(const struct lu_env *env,
+		    const struct cl_lock_slice *slice,
+		    enum cl_lock_state state);
+
+void ccc_io_fini(const struct lu_env *env, const struct cl_io_slice *ios);
+int ccc_io_one_lock_index(const struct lu_env *env, struct cl_io *io,
+			  __u32 enqflags, enum cl_lock_mode mode,
+			  pgoff_t start, pgoff_t end);
+int ccc_io_one_lock(const struct lu_env *env, struct cl_io *io,
+		    __u32 enqflags, enum cl_lock_mode mode,
+		    loff_t start, loff_t end);
+void ccc_io_end(const struct lu_env *env, const struct cl_io_slice *ios);
+void ccc_io_advance(const struct lu_env *env, const struct cl_io_slice *ios,
+		    size_t nob);
+void ccc_io_update_iov(const struct lu_env *env, struct ccc_io *cio,
+		       struct cl_io *io);
+int ccc_prep_size(const struct lu_env *env, struct cl_object *obj,
+		  struct cl_io *io, loff_t start, size_t count, int *exceed);
+void ccc_req_completion(const struct lu_env *env,
+			const struct cl_req_slice *slice, int ioret);
+void ccc_req_attr_set(const struct lu_env *env,const struct cl_req_slice *slice,
+		      const struct cl_object *obj,
+		      struct cl_req_attr *oa, obd_valid flags);
+
+struct lu_device   *ccc2lu_dev      (struct ccc_device *vdv);
+struct lu_object   *ccc2lu	  (struct ccc_object *vob);
+struct ccc_device  *lu2ccc_dev      (const struct lu_device *d);
+struct ccc_device  *cl2ccc_dev      (const struct cl_device *d);
+struct ccc_object  *lu2ccc	  (const struct lu_object *obj);
+struct ccc_object  *cl2ccc	  (const struct cl_object *obj);
+struct ccc_lock    *cl2ccc_lock     (const struct cl_lock_slice *slice);
+struct ccc_io      *cl2ccc_io       (const struct lu_env *env,
+				     const struct cl_io_slice *slice);
+struct ccc_req     *cl2ccc_req      (const struct cl_req_slice *slice);
+struct page	 *cl2vm_page      (const struct cl_page_slice *slice);
+struct inode       *ccc_object_inode(const struct cl_object *obj);
+struct ccc_object  *cl_inode2ccc    (struct inode *inode);
+
+int cl_setattr_ost(struct inode *inode, const struct iattr *attr,
+		   struct obd_capa *capa);
+
+struct cl_page *ccc_vmpage_page_transient(struct page *vmpage);
+int ccc_object_invariant(const struct cl_object *obj);
+int cl_file_inode_init(struct inode *inode, struct lustre_md *md);
+void cl_inode_fini(struct inode *inode);
+int cl_local_size(struct inode *inode);
+
+__u16 ll_dirent_type_get(struct lu_dirent *ent);
+__u64 cl_fid_build_ino(const struct lu_fid *fid, int api32);
+__u32 cl_fid_build_gen(const struct lu_fid *fid);
+
+# define CLOBINVRNT(env, clob, expr)				    \
+	((void)sizeof(env), (void)sizeof(clob), (void)sizeof !!(expr))
+
+int cl_init_ea_size(struct obd_export *md_exp, struct obd_export *dt_exp);
+int cl_ocd_update(struct obd_device *host,
+		  struct obd_device *watched,
+		  enum obd_notify_event ev, void *owner, void *data);
+
+struct ccc_grouplock {
+	struct lu_env   *cg_env;
+	struct cl_io    *cg_io;
+	struct cl_lock  *cg_lock;
+	unsigned long    cg_gid;
+};
+
+int  cl_get_grouplock(struct cl_object *obj, unsigned long gid, int nonblock,
+		      struct ccc_grouplock *cg);
+void cl_put_grouplock(struct ccc_grouplock *cg);
+
+/**
+ * New interfaces to get and put lov_stripe_md from lov layer. This violates
+ * layering because lov_stripe_md is supposed to be a private data in lov.
+ *
+ * NB: If you find you have to use these interfaces for your new code, please
+ * think about it again. These interfaces may be removed in the future for
+ * better layering. */
+struct lov_stripe_md *lov_lsm_get(struct cl_object *clobj);
+void lov_lsm_put(struct cl_object *clobj, struct lov_stripe_md *lsm);
+int lov_read_and_clear_async_rc(struct cl_object *clob);
+
+struct lov_stripe_md *ccc_inode_lsm_get(struct inode *inode);
+void ccc_inode_lsm_put(struct inode *inode, struct lov_stripe_md *lsm);
+
+/**
+ * Data structure managing a client's cached clean pages. An LRU of
+ * pages is maintained, along with other statistics.
+ */
+struct cl_client_cache {
+	atomic_t	ccc_users;    /* # of users (OSCs) of this data */
+	struct list_head	ccc_lru;      /* LRU list of cached clean pages */
+	spinlock_t	ccc_lru_lock; /* lock for list */
+	atomic_t	ccc_lru_left; /* # of LRU entries available */
+	unsigned long	ccc_lru_max;  /* Max # of LRU entries possible */
+	unsigned int	ccc_lru_shrinkers; /* # of threads reclaiming */
+};
+
+#endif /*LCLIENT_H */

diff --git a/drivers/staging/lustre/lustre/include/linux/lprocfs_status.h b/drivers/staging/lustre/lustre/include/linux/lprocfs_status.h
new file mode 100644
index 0000000..5866922
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/linux/lprocfs_status.h

@@ -0,0 +1,58 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/linux/lprocfs_status.h
+ *
+ * Top level header file for LProc SNMP
+ *
+ * Author: Hariharan Thantry thantry@users.sourceforge.net
+ */
+#ifndef _LINUX_LPROCFS_SNMP_H
+#define _LINUX_LPROCFS_SNMP_H
+
+#ifndef _LPROCFS_SNMP_H
+#error Do not #include this file directly. #include <lprocfs_status.h> instead
+#endif
+
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/version.h>
+#include <linux/smp.h>
+#include <linux/rwsem.h>
+#include <linux/libcfs/libcfs.h>
+#include <linux/statfs.h>
+
+
+#endif /* _LINUX_LPROCFS_SNMP_H */

diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_acl.h b/drivers/staging/lustre/lustre/include/linux/lustre_acl.h
new file mode 100644
index 0000000..ff4fc4f
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/linux/lustre_acl.h

@@ -0,0 +1,66 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/linux/lustre_acl.h
+ *
+ * MDS data structures.
+ * See also lustre_idl.h for wire formats of requests.
+ */
+
+#ifndef _LUSTRE_LINUX_ACL_H
+#define _LUSTRE_LINUX_ACL_H
+
+#ifndef	_LUSTRE_ACL_H	/* only the generic lustre_acl.h may pull this file in */
+#error	Do not #include this file directly. #include <lustre_acl.h> instead
+#endif
+
+# include <linux/fs.h>
+# include <linux/dcache.h>
+# ifdef CONFIG_FS_POSIX_ACL
+#  include <linux/posix_acl_xattr.h>
+#  define LUSTRE_POSIX_ACL_MAX_ENTRIES	32
+#  define LUSTRE_POSIX_ACL_MAX_SIZE					\
+	(sizeof(posix_acl_xattr_header) +				\
+	 LUSTRE_POSIX_ACL_MAX_ENTRIES * sizeof(posix_acl_xattr_entry))
+# endif /* CONFIG_FS_POSIX_ACL */
+# include <linux/lustre_intent.h>
+# include <linux/xattr.h> /* XATTR_{REPLACE,CREATE} */
+
+#ifndef LUSTRE_POSIX_ACL_MAX_SIZE
+# define LUSTRE_POSIX_ACL_MAX_SIZE   0
+#endif
+
+#endif /* _LUSTRE_LINUX_ACL_H */

diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_common.h b/drivers/staging/lustre/lustre/include/linux/lustre_common.h
new file mode 100644
index 0000000..d1783a3
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/linux/lustre_common.h

@@ -0,0 +1,22 @@
+#ifndef LUSTRE_COMMON_H
+#define LUSTRE_COMMON_H
+
+#include <linux/sched.h>
+
+/* Replace the current task's supplementary groups with an empty group list. */
+static inline int cfs_cleanup_group_info(void)
+{
+	struct group_info *ginfo = groups_alloc(0);
+	int rc;
+
+	if (!ginfo)
+		return -ENOMEM;
+	/* set_current_groups() can fail; propagate that instead of returning 0 */
+	rc = set_current_groups(ginfo);
+	put_group_info(ginfo);	/* drop our reference from groups_alloc() */
+	return rc;
+}
+
+#define ll_inode_blksize(a)		(1<<(a)->i_blkbits)
+
+#endif

diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_compat25.h b/drivers/staging/lustre/lustre/include/linux/lustre_compat25.h
new file mode 100644
index 0000000..dff0468
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/linux/lustre_compat25.h

@@ -0,0 +1,349 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LINUX_COMPAT25_H
+#define _LINUX_COMPAT25_H
+
+#include <linux/fs_struct.h>
+#include <linux/namei.h>
+#include <linux/libcfs/linux/portals_compat25.h>
+
+#include <linux/lustre_patchless_compat.h>
+
+# define LOCK_FS_STRUCT(fs)	spin_lock(&(fs)->lock)
+# define UNLOCK_FS_STRUCT(fs)	spin_unlock(&(fs)->lock)
+
+static inline void ll_set_fs_pwd(struct fs_struct *fs, struct vfsmount *mnt,
+				 struct dentry *dentry)
+{
+	struct path new_pwd = { .mnt = mnt, .dentry = dentry };
+	struct path prev;
+
+	/* Swap in the new pwd under fs->lock, taking its reference first. */
+	LOCK_FS_STRUCT(fs);
+	prev = fs->pwd;
+	path_get(&new_pwd);
+	fs->pwd = new_pwd;
+	UNLOCK_FS_STRUCT(fs);
+
+	/* Release the previous pwd outside the lock, if one was set. */
+	if (prev.dentry)
+		path_put(&prev);
+}
+
+
+/*
+ * set ATTR_BLOCKS to a high value to avoid any risk of collision with other
+ * ATTR_* attributes (see bug 13828)
+ */
+#define ATTR_BLOCKS    (1 << 27)
+
+#define current_ngroups current_cred()->group_info->ngroups
+#define current_groups current_cred()->group_info->small_block
+
+/*
+ * OBD need working random driver, thus all our
+ * initialization routines must be called after device
+ * driver initialization
+ */
+#ifndef MODULE
+#undef module_init
+#define module_init(a)     late_initcall(a)
+#endif
+
+
+#define LTIME_S(time)		   ((time).tv_sec)	/* still an lvalue; arg may be any expression */
+
+#define ll_permission(inode,mask,nd)    inode_permission(inode,mask)
+
+# define ll_generic_permission(inode, mask, flags, check_acl) \
+	 generic_permission(inode, mask)
+
+#define ll_blkdev_put(a, b) blkdev_put(a, b)
+
+#define ll_dentry_open(a,b,c)	dentry_open(a,b,c)
+
+#define ll_vfs_symlink(dir, dentry, mnt, path, mode) \
+		       vfs_symlink(dir, dentry, path)
+
+
+#define ll_generic_file_llseek_size(file, offset, origin, maxbytes, eof) \
+		generic_file_llseek_size(file, offset, origin, maxbytes, eof)
+
+/* inode_dio_wait(i) use as-is for write lock */
+# define inode_dio_write_done(i)	do {} while (0) /* for write unlock */
+# define inode_dio_read(i)		atomic_inc(&(i)->i_dio_count)
+/* inode_dio_done(i) use as-is for read unlock */
+
+#define TREE_READ_LOCK_IRQ(mapping)	spin_lock_irq(&(mapping)->tree_lock)
+#define TREE_READ_UNLOCK_IRQ(mapping)	spin_unlock_irq(&(mapping)->tree_lock)
+
+static inline
+int ll_unregister_blkdev(unsigned int dev, const char *name)
+{
+	unregister_blkdev(dev, name);
+	return 0;	/* always reports success */
+}
+
+#define ll_invalidate_bdev(a,b)	 invalidate_bdev((a))
+
+#ifndef FS_HAS_FIEMAP
+#define FS_HAS_FIEMAP			(0)
+#endif
+
+
+
+/* add a lustre compatible layer for crypto API */
+#include <linux/crypto.h>
+#define ll_crypto_hash	  crypto_hash
+#define ll_crypto_cipher	crypto_blkcipher
+#define ll_crypto_alloc_hash(name, type, mask)  crypto_alloc_hash(name, type, mask)
+#define ll_crypto_hash_setkey(tfm, key, keylen) crypto_hash_setkey(tfm, key, keylen)
+#define ll_crypto_hash_init(desc)	       crypto_hash_init(desc)
+#define ll_crypto_hash_update(desc, sl, bytes)  crypto_hash_update(desc, sl, bytes)
+#define ll_crypto_hash_final(desc, out)	 crypto_hash_final(desc, out)
+#define ll_crypto_blkcipher_setkey(tfm, key, keylen) \
+		crypto_blkcipher_setkey(tfm, key, keylen)
+#define ll_crypto_blkcipher_set_iv(tfm, src, len) \
+		crypto_blkcipher_set_iv(tfm, src, len)
+#define ll_crypto_blkcipher_get_iv(tfm, dst, len) \
+		crypto_blkcipher_get_iv(tfm, dst, len)
+#define ll_crypto_blkcipher_encrypt(desc, dst, src, bytes) \
+		crypto_blkcipher_encrypt(desc, dst, src, bytes)
+#define ll_crypto_blkcipher_decrypt(desc, dst, src, bytes) \
+		crypto_blkcipher_decrypt(desc, dst, src, bytes)
+#define ll_crypto_blkcipher_encrypt_iv(desc, dst, src, bytes) \
+		crypto_blkcipher_encrypt_iv(desc, dst, src, bytes)
+#define ll_crypto_blkcipher_decrypt_iv(desc, dst, src, bytes) \
+		crypto_blkcipher_decrypt_iv(desc, dst, src, bytes)
+
+static inline
+struct ll_crypto_cipher *ll_crypto_alloc_blkcipher(const char *name,
+						   u32 type, u32 mask)
+{
+	struct ll_crypto_cipher *tfm = crypto_alloc_blkcipher(name, type, mask);
+
+	return tfm ? tfm : ERR_PTR(-ENOMEM);	/* NULL becomes -ENOMEM */
+}
+
+static inline int ll_crypto_hmac(struct ll_crypto_hash *tfm,
+				 u8 *key, unsigned int *keylen,
+				 struct scatterlist *sg,
+				 unsigned int size, u8 *result)
+{
+	struct hash_desc hdesc = { .tfm = tfm, .flags = 0 };
+	int err = crypto_hash_setkey(tfm, key, *keylen);
+
+	/* Key the hash first; only when that succeeds is the digest of
+	 * @size bytes from @sg computed into @result. */
+	if (err) {
+		CERROR("failed to hash setkey: %d\n", err);
+		return err;
+	}
+	return crypto_hash_digest(&hdesc, sg, size, result);
+}
+static inline	/* reads max_keysize from the cra_blkcipher of tfm's algorithm */
+unsigned int ll_crypto_tfm_alg_max_keysize(struct crypto_blkcipher *tfm)
+{
+	return crypto_blkcipher_tfm(tfm)->__crt_alg->cra_blkcipher.max_keysize;
+}
+static inline	/* reads min_keysize from the cra_blkcipher of tfm's algorithm */
+unsigned int ll_crypto_tfm_alg_min_keysize(struct crypto_blkcipher *tfm)
+{
+	return crypto_blkcipher_tfm(tfm)->__crt_alg->cra_blkcipher.min_keysize;
+}
+
+#define ll_crypto_hash_blocksize(tfm)       crypto_hash_blocksize(tfm)
+#define ll_crypto_hash_digestsize(tfm)      crypto_hash_digestsize(tfm)
+#define ll_crypto_blkcipher_ivsize(tfm)     crypto_blkcipher_ivsize(tfm)
+#define ll_crypto_blkcipher_blocksize(tfm)  crypto_blkcipher_blocksize(tfm)
+#define ll_crypto_free_hash(tfm)	    crypto_free_hash(tfm)
+#define ll_crypto_free_blkcipher(tfm)       crypto_free_blkcipher(tfm)
+
+#define ll_vfs_rmdir(dir,entry,mnt)	     vfs_rmdir(dir,entry)
+#define ll_vfs_mkdir(inode,dir,mnt,mode)	vfs_mkdir(inode,dir,mode)
+#define ll_vfs_link(old,mnt,dir,new,mnt1)       vfs_link(old,dir,new)
+#define ll_vfs_unlink(inode,entry,mnt)	  vfs_unlink(inode,entry)
+#define ll_vfs_mknod(dir,entry,mnt,mode,dev)    vfs_mknod(dir,entry,mode,dev)
+#define ll_security_inode_unlink(dir,entry,mnt) security_inode_unlink(dir,entry)
+#define ll_vfs_rename(old,old_dir,mnt,new,new_dir,mnt1) \
+		vfs_rename(old,old_dir,new,new_dir)
+
+#ifdef for_each_possible_cpu
+#define cfs_for_each_possible_cpu(cpu) for_each_possible_cpu(cpu)
+#elif defined(for_each_cpu)
+#define cfs_for_each_possible_cpu(cpu) for_each_cpu(cpu)
+#endif
+
+#define cfs_bio_io_error(a,b)   bio_io_error((a))
+#define cfs_bio_endio(a,b,c)    bio_endio((a),(c))
+
+#define cfs_fs_pwd(fs)       ((fs)->pwd.dentry)
+#define cfs_fs_mnt(fs)       ((fs)->pwd.mnt)
+#define cfs_path_put(nd)     path_put(&(nd)->path)
+
+
+#ifndef SLAB_DESTROY_BY_RCU
+#define SLAB_DESTROY_BY_RCU 0
+#endif
+
+
+
+/* Resolve @name and pass it to sb->s_qcop->quota_on() with @off and @ver;
+ * -ENOSYS if the op is missing.  @remount is unused. */
+static inline int
+ll_quota_on(struct super_block *sb, int off, int ver, char *name, int remount)
+{
+	struct path path;
+	int rc;
+
+	if (!sb->s_qcop->quota_on)
+		return -ENOSYS;
+
+	/* kern_path() returns 0 on success; only a non-zero rc is an error */
+	rc = kern_path(name, LOOKUP_FOLLOW, &path);
+	if (rc)
+		return rc;
+
+	rc = sb->s_qcop->quota_on(sb, off, ver, &path);
+	path_put(&path);	/* drop the reference taken by kern_path() */
+	return rc;
+}
+
+/* Call sb->s_qcop->quota_off(sb, off); -ENOSYS if the op is not provided. */
+static inline int ll_quota_off(struct super_block *sb, int off, int remount)
+{
+	/* @remount is accepted but not used */
+	if (!sb->s_qcop->quota_off)
+		return -ENOSYS;
+
+	return sb->s_qcop->quota_off(sb, off);
+}
+
+
+# define ll_vfs_dq_init	     dquot_initialize
+# define ll_vfs_dq_drop	     dquot_drop
+# define ll_vfs_dq_transfer	 dquot_transfer
+# define ll_vfs_dq_off(sb, remount) dquot_suspend(sb, -1)
+
+
+
+
+
+#define queue_max_phys_segments(rq)       queue_max_segments(rq)
+#define queue_max_hw_segments(rq)	 queue_max_segments(rq)
+
+#define ll_kmap_atomic(a, b)	kmap_atomic(a)
+#define ll_kunmap_atomic(a, b)	kunmap_atomic(a)
+
+
+#define ll_d_hlist_node hlist_node
+#define ll_d_hlist_empty(list) hlist_empty(list)
+#define ll_d_hlist_entry(ptr, type, name) hlist_entry(ptr.first, type, name)
+#define ll_d_hlist_for_each(tmp, i_dentry) hlist_for_each(tmp, i_dentry)
+#define ll_d_hlist_for_each_entry(dentry, p, i_dentry, alias) \
+	p = NULL; hlist_for_each_entry(dentry, i_dentry, alias)
+
+
+#define bio_hw_segments(q, bio) 0
+
+
+#define ll_pagevec_init(pv, cold)       do {} while (0)
+#define ll_pagevec_add(pv, pg)	  (0)
+#define ll_pagevec_lru_add_file(pv)     do {} while (0)
+
+
+#ifndef QUOTA_OK
+# define QUOTA_OK 0
+#endif
+#ifndef NO_QUOTA
+# define NO_QUOTA (-EDQUOT)
+#endif
+
+#ifndef SEEK_DATA
+#define SEEK_DATA      3       /* seek to the next data */
+#endif
+#ifndef SEEK_HOLE
+#define SEEK_HOLE      4       /* seek to the next hole */
+#endif
+
+#ifndef FMODE_UNSIGNED_OFFSET
+#define FMODE_UNSIGNED_OFFSET	((__force fmode_t)0x2000)
+#endif
+
+#if !defined(_ASM_GENERIC_BITOPS_EXT2_NON_ATOMIC_H_) && !defined(ext2_set_bit)
+# define ext2_set_bit	     __test_and_set_bit_le
+# define ext2_clear_bit	   __test_and_clear_bit_le
+# define ext2_test_bit	    test_bit_le
+# define ext2_find_first_zero_bit find_first_zero_bit_le
+# define ext2_find_next_zero_bit  find_next_zero_bit_le
+#endif
+
+#ifdef ATTR_TIMES_SET
+# define TIMES_SET_FLAGS (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)
+#else
+# define TIMES_SET_FLAGS (ATTR_MTIME_SET | ATTR_ATIME_SET)
+#endif
+
+
+
+/*
+ * After 3.1, the kernel's nameidata.intent.open.flags differs from
+ * lustre's lookup_intent.it_flags: the lower bits of lustre's it_flags
+ * are FMODE_xxx values, while the kernel does not translate the lower
+ * bits of nameidata.intent.open.flags to FMODE_xxx.
+ */
+#include <linux/version.h>
+static inline int ll_namei_to_lookup_intent_flag(int flag)
+{
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 1, 0)
+	flag = (flag & ~O_ACCMODE) | OPEN_FMODE(flag);	/* replace the O_ACCMODE bits with OPEN_FMODE(flag) */
+#endif
+	return flag;	/* unchanged when built for kernels older than 3.1 */
+}
+
+# define ll_mrf_ret void
+# define LL_MRF_RETURN(rc)
+
+#include <linux/fs.h>
+
+# define ll_umode_t	umode_t
+
+#include <linux/dcache.h>
+
+# define ll_dirty_inode(inode, flag)	(inode)->i_sb->s_op->dirty_inode((inode), flag)
+
+#endif /* _LINUX_COMPAT25_H */

diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_debug.h b/drivers/staging/lustre/lustre/include/linux/lustre_debug.h
new file mode 100644
index 0000000..11deac7
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/linux/lustre_debug.h

@@ -0,0 +1,47 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LINUX_LUSTRE_DEBUG_H
+#define _LINUX_LUSTRE_DEBUG_H
+
+#ifndef _LUSTRE_DEBUG_H
+#error Do not #include this file directly. #include <lprocfs_status.h> instead
+#endif
+
+/*
+ * Emit a CDEBUG message at @mask describing @page: its address, mapping,
+ * index, flags, reference count and private word, followed by the
+ * caller-supplied @fmt and its arguments.
+ *
+ * @page is parenthesized wherever it is dereferenced so that any pointer
+ * expression can be passed.  It is evaluated several times, so it must not
+ * have side effects.
+ */
+#define LL_CDEBUG_PAGE(mask, page, fmt, arg...)			       \
+	CDEBUG(mask, "page %p map %p index %lu flags %lx count %u priv %0lx: "\
+	       fmt, (page), (page)->mapping, (page)->index,		      \
+	       (long)(page)->flags, page_count(page), page_private(page),     \
+	       ## arg)
+
+#endif

diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_dlm.h b/drivers/staging/lustre/lustre/include/linux/lustre_dlm.h
new file mode 100644
index 0000000..207df03
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/linux/lustre_dlm.h

@@ -0,0 +1,46 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LINUX_LUSTRE_DLM_H__
+#define _LINUX_LUSTRE_DLM_H__
+
+#ifndef _LUSTRE_DLM_H__
+#error Do not #include this file directly. #include <lprocfs_status.h> instead
+#endif
+
+# include <linux/proc_fs.h>
+#  include <asm/processor.h>
+#  include <linux/bit_spinlock.h>
+
+#endif

diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_fsfilt.h b/drivers/staging/lustre/lustre/include/linux/lustre_fsfilt.h
new file mode 100644
index 0000000..6c72609
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/linux/lustre_fsfilt.h

@@ -0,0 +1,181 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/linux/lustre_fsfilt.h
+ *
+ * Filesystem interface helper.
+ */
+
+#ifndef _LINUX_LUSTRE_FSFILT_H
+#define _LINUX_LUSTRE_FSFILT_H
+
+#ifndef _LUSTRE_FSFILT_H
+#error Do not #include this file directly. #include <lustre_fsfilt.h> instead
+#endif
+
+
+#include <obd.h>
+#include <obd_class.h>
+
+typedef void (*fsfilt_cb_t)(struct obd_device *obd, __u64 last_rcvd,
+			    void *data, int error);	/* callback pointer type; NOTE(review): not used elsewhere in this header - confirm its users */
+
+struct fsfilt_operations {	/* backend hooks; the inline wrappers below call them through obd->obd_fsops */
+	struct list_head fs_list;	/* NOTE(review): presumably links registered tables - confirm in fsfilt_register_ops() */
+	module_t *fs_owner;
+	char   *fs_type;	/* NOTE(review): presumably the name looked up by fsfilt_get_ops(type) - confirm */
+	char   *(* fs_getlabel)(struct super_block *sb);	/* optional: fsfilt_get_label() tests it for NULL */
+	void   *(* fs_start)(struct inode *inode, int op, void *desc_private,
+			     int logs);	/* called by fsfilt_start_log(); its return value is the handle */
+	int     (* fs_commit)(struct inode *inode, void *handle,int force_sync);	/* called by fsfilt_commit() */
+	int     (* fs_map_inode_pages)(struct inode *inode, struct page **page,
+				       int pages, unsigned long *blocks,
+				       int create, struct mutex *sem);	/* called by fsfilt_map_inode_pages() */
+	int     (* fs_write_record)(struct file *, void *, int size, loff_t *,
+				    int force_sync);	/* called by fsfilt_write_record(); size is int here */
+	int     (* fs_read_record)(struct file *, void *, int size, loff_t *);	/* called by fsfilt_read_record(); size is int here */
+	int     (* fs_setup)(struct super_block *sb);	/* optional: fsfilt_setup() returns 0 when it is NULL */
+};
+
+extern int fsfilt_register_ops(struct fsfilt_operations *fs_ops);	/* these four are only declared here; definitions live outside this header */
+extern void fsfilt_unregister_ops(struct fsfilt_operations *fs_ops);
+extern struct fsfilt_operations *fsfilt_get_ops(const char *type);
+extern void fsfilt_put_ops(struct fsfilt_operations *fs_ops);	/* NOTE(review): presumably releases what fsfilt_get_ops() returned - confirm */
+
+/*
+ * Return the label of @sb as reported by the backend's fs_getlabel hook.
+ *
+ * Returns NULL when the backend provides no fs_getlabel hook, when the hook
+ * itself returns NULL, or when the label is the empty string.  The hook is
+ * invoked exactly once, so the string that was checked is the string that
+ * is returned.
+ */
+static inline char *fsfilt_get_label(struct obd_device *obd,
+				     struct super_block *sb)
+{
+	char *label;
+
+	if (obd->obd_fsops->fs_getlabel == NULL)
+		return NULL;
+
+	label = obd->obd_fsops->fs_getlabel(sb);
+	/* guard against a NULL label before looking at its first byte */
+	if (label == NULL || label[0] == '\0')
+		return NULL;
+
+	return label;
+}
+
+#define FSFILT_OP_UNLINK		1	/* NOTE(review): presumably an op code for fsfilt_start_log() - confirm */
+#define FSFILT_OP_CANCEL_UNLINK	 10	/* NOTE(review): presumably an op code for fsfilt_start_log() - confirm */
+
+/*
+ * Report how long an operation that began at jiffies value @start has been
+ * running, escalating the log level with the elapsed time:
+ *   before start + 15s           : nothing is logged
+ *   before start + 30s           : CDEBUG at D_VFSTRACE
+ *   before start + DISK_TIMEOUT s: CWARN
+ *   otherwise                    : CERROR
+ * @obd supplies obd_name for the message and @msg names the operation.
+ *
+ * Every argument is parenthesized so that arbitrary expressions may be
+ * passed.  Arguments are still evaluated more than once, so they must not
+ * have side effects.
+ */
+#define __fsfilt_check_slow(obd, start, msg)				\
+do {									\
+	if (cfs_time_before(jiffies, (start) + 15 * HZ))		\
+		break;							\
+	else if (cfs_time_before(jiffies, (start) + 30 * HZ))		\
+		CDEBUG(D_VFSTRACE, "%s: slow %s %lus\n",		\
+		       (obd)->obd_name, (msg),				\
+		       (jiffies - (start)) / HZ);			\
+	else if (cfs_time_before(jiffies, (start) + DISK_TIMEOUT * HZ))	\
+		CWARN("%s: slow %s %lus\n", (obd)->obd_name, (msg),	\
+		      (jiffies - (start)) / HZ);			\
+	else								\
+		CERROR("%s: slow %s %lus\n", (obd)->obd_name, (msg),	\
+		       (jiffies - (start)) / HZ);			\
+} while (0)
+
+#define fsfilt_check_slow(obd, start, msg)	      \
+do {						    \
+	__fsfilt_check_slow(obd, start, msg);	   \
+	start = jiffies;				\
+} while (0)	/* reports via __fsfilt_check_slow(), then resets start to jiffies; start must be an lvalue */
+
+/*
+ * Start a backend transaction for @op on @inode through the fs_start hook.
+ *
+ * When @oti is given, its oti_handle is passed to the backend as the parent
+ * handle.  If @oti had no handle yet, the newly started one is stored in it.
+ * If it already had one, the backend is expected to hand back that same
+ * handle; a different one is logged with CERROR and followed by LBUG().
+ *
+ * Returns whatever fs_start returned.  The elapsed time is checked by
+ * fsfilt_check_slow().
+ */
+static inline void *fsfilt_start_log(struct obd_device *obd,
+				     struct inode *inode, int op,
+				     struct obd_trans_info *oti, int logs)
+{
+	unsigned long started = jiffies;
+	void *parent = NULL;
+	void *handle;
+
+	if (oti != NULL)
+		parent = oti->oti_handle;
+
+	handle = obd->obd_fsops->fs_start(inode, op, parent, logs);
+	CDEBUG(D_INFO, "started handle %p (%p)\n", handle, parent);
+
+	if (oti != NULL && parent == NULL) {
+		/* first transaction for this oti: remember its handle */
+		oti->oti_handle = handle;
+	} else if (oti != NULL && handle != parent) {
+		CERROR("mismatch: parent %p, handle %p, oti %p\n",
+		       parent, handle, oti);
+		LBUG();
+	}
+
+	fsfilt_check_slow(obd, started, "journal start");
+	return handle;
+}
+
+/*
+ * Commit transaction @handle for @inode through the backend's fs_commit
+ * hook and return the hook's result.  @force_sync is passed through
+ * unchanged.
+ *
+ * The elapsed time is checked by fsfilt_check_slow() and reported as
+ * "journal commit", so that slow commits can be told apart from the slow
+ * starts reported by fsfilt_start_log().
+ */
+static inline int fsfilt_commit(struct obd_device *obd, struct inode *inode,
+				void *handle, int force_sync)
+{
+	unsigned long now = jiffies;
+	int rc = obd->obd_fsops->fs_commit(inode, handle, force_sync);
+	CDEBUG(D_INFO, "committing handle %p\n", handle);
+
+	fsfilt_check_slow(obd, now, "journal commit");
+
+	return rc;
+}
+
+/*
+ * Forward a block-mapping request to the backend's fs_map_inode_pages hook
+ * with all arguments unchanged, and return the hook's result.
+ */
+static inline int fsfilt_map_inode_pages(struct obd_device *obd,
+					 struct inode *inode,
+					 struct page **page, int pages,
+					 unsigned long *blocks,
+					 int create, struct mutex *mutex)
+{
+	int rc;
+
+	rc = obd->obd_fsops->fs_map_inode_pages(inode, page, pages, blocks,
+						create, mutex);
+	return rc;
+}
+
+/*
+ * Read a record through the backend's fs_read_record hook and return the
+ * hook's result.  Note that @size is a loff_t here while the hook declares
+ * its size parameter as int, so the value is converted at the call.
+ */
+static inline int fsfilt_read_record(struct obd_device *obd, struct file *file,
+				     void *buf, loff_t size, loff_t *offs)
+{
+	int rc = obd->obd_fsops->fs_read_record(file, buf, size, offs);
+
+	return rc;
+}
+
+/*
+ * Write a record through the backend's fs_write_record hook and return the
+ * hook's result.  @force_sync is passed through unchanged.  Note that @size
+ * is a loff_t here while the hook declares its size parameter as int, so
+ * the value is converted at the call.
+ */
+static inline int fsfilt_write_record(struct obd_device *obd, struct file *file,
+				      void *buf, loff_t size, loff_t *offs,
+				      int force_sync)
+{
+	int rc = obd->obd_fsops->fs_write_record(file, buf, size, offs,
+						 force_sync);
+
+	return rc;
+}
+
+/*
+ * Run the backend's optional fs_setup hook on @fs.
+ * Returns the hook's result, or 0 when the backend has no such hook.
+ */
+static inline int fsfilt_setup(struct obd_device *obd, struct super_block *fs)
+{
+	if (obd->obd_fsops->fs_setup == NULL)
+		return 0;
+
+	return obd->obd_fsops->fs_setup(fs);
+}
+
+
+
+
+#endif

diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_handles.h b/drivers/staging/lustre/lustre/include/linux/lustre_handles.h
new file mode 100644
index 0000000..ecf1840
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/linux/lustre_handles.h

@@ -0,0 +1,53 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LINUX_LUSTRE_HANDLES_H_
+#define __LINUX_LUSTRE_HANDLES_H_
+
+#ifndef __LUSTRE_HANDLES_H_
+#error Do not #include this file directly. #include <lustre_handles.h> instead
+#endif
+
+#include <asm/types.h>
+#include <asm/atomic.h>
+#include <linux/list.h>
+#include <linux/version.h>
+#include <linux/spinlock.h>
+#include <linux/types.h>
+
+#include <linux/rcupdate.h> /* for rcu_head{} */
+typedef struct rcu_head cfs_rcu_head_t;	/* alias for struct rcu_head */
+
+
+#endif

diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_intent.h b/drivers/staging/lustre/lustre/include/linux/lustre_intent.h
new file mode 100644
index 0000000..b10ddfa
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/linux/lustre_intent.h

@@ -0,0 +1,62 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef LUSTRE_INTENT_H
+#define LUSTRE_INTENT_H
+
+/* intent IT_XXX are defined in lustre/include/obd.h */
+struct lustre_intent_data {	/* embedded in struct lookup_intent as member d.lustre */
+	int		it_disposition;
+	int		it_status;
+	__u64		it_lock_handle;
+	__u64		it_lock_bits;
+	int		it_lock_mode;
+	int		it_remote_lock_mode;
+	__u64	   it_remote_lock_handle;
+	void	   *it_data;	/* opaque pointer; NOTE(review): pointee type is not visible here */
+	unsigned int    it_lock_set:1;	/* single-bit flag */
+};
+
+struct lookup_intent {
+	int     it_op;	/* NOTE(review): presumably one of the IT_XXX values from obd.h - confirm */
+	int     it_flags;	/* lower bits hold FMODE_xxx, see ll_namei_to_lookup_intent_flag() */
+	int     it_create_mode;
+	union {
+		struct lustre_intent_data lustre;
+	} d;	/* union with a single member */
+};
+
+#endif

diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_lib.h b/drivers/staging/lustre/lustre/include/linux/lustre_lib.h
new file mode 100644
index 0000000..b2f755a
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/linux/lustre_lib.h

@@ -0,0 +1,87 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/linux/lustre_lib.h
+ *
+ * Basic Lustre library routines.
+ */
+
+#ifndef _LINUX_LUSTRE_LIB_H
+#define _LINUX_LUSTRE_LIB_H
+
+#ifndef _LUSTRE_LIB_H
+#error Do not #include this file directly. #include <lustre_lib.h> instead
+#endif
+
+# include <linux/rwsem.h>
+# include <linux/sched.h>
+# include <linux/signal.h>
+# include <linux/types.h>
+# include <linux/lustre_compat25.h>
+# include <linux/lustre_common.h>
+
+#ifndef LP_POISON	/* all three poison values are skipped if LP_POISON already exists */
+#if BITS_PER_LONG > 32
+# define LI_POISON ((int)0x5a5a5a5a5a5a5a5a)	/* NOTE(review): 64-bit constant narrowed by the int cast - confirm intended */
+# define LL_POISON ((long)0x5a5a5a5a5a5a5a5a)
+# define LP_POISON ((void *)(long)0x5a5a5a5a5a5a5a5a)
+#else	/* BITS_PER_LONG <= 32 */
+# define LI_POISON ((int)0x5a5a5a5a)
+# define LL_POISON ((long)0x5a5a5a5a)
+# define LP_POISON ((void *)(long)0x5a5a5a5a)
+#endif
+#endif
+
+/* This macro is only for compatibility reasons with older Linux Lustre user
+ * tools. New ioctls should NOT use this macro as the ioctl "size". Instead
+ * the ioctl should get a "size" argument which is the actual data type used
+ * by the ioctl, to ensure the ioctl interface is versioned correctly. */
+#define OBD_IOC_DATA_TYPE	       long
+
+#define LUSTRE_FATAL_SIGS (sigmask(SIGKILL) | sigmask(SIGINT) |		\
+			   sigmask(SIGTERM) | sigmask(SIGQUIT) |	       \
+			   sigmask(SIGALRM))	/* OR of the sigmask() bits for KILL, INT, TERM, QUIT and ALRM */
+
+/*
+ * Fill @lvb from @inode: the size comes from i_size_read(), the block count
+ * from i_blocks, and the modify/access/change times are taken through
+ * LTIME_S().
+ */
+static inline void inode_init_lvb(struct inode *inode, struct ost_lvb *lvb)
+{
+	/* timestamps */
+	lvb->lvb_atime = LTIME_S(inode->i_atime);
+	lvb->lvb_mtime = LTIME_S(inode->i_mtime);
+	lvb->lvb_ctime = LTIME_S(inode->i_ctime);
+
+	/* space usage */
+	lvb->lvb_blocks = inode->i_blocks;
+	lvb->lvb_size = i_size_read(inode);
+}
+
+#endif /* _LINUX_LUSTRE_LIB_H */

diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_lite.h b/drivers/staging/lustre/lustre/include/linux/lustre_lite.h
new file mode 100644
index 0000000..c95dff9
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/linux/lustre_lite.h

@@ -0,0 +1,100 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LINUX_LL_H
+#define _LINUX_LL_H
+
+#ifndef _LL_H
+#error Do not #include this file directly. #include <lustre_lite.h> instead
+#endif
+
+
+#include <linux/version.h>
+
+#include <asm/statfs.h>
+
+#include <linux/fs.h>
+#include <linux/dcache.h>
+#include <linux/proc_fs.h>
+
+#include <obd_class.h>
+#include <lustre_net.h>
+#include <lustre_ha.h>
+
+#include <linux/rbtree.h>
+#include <linux/lustre_compat25.h>
+#include <linux/lustre_common.h>
+#include <linux/pagemap.h>
+
+/* lprocfs.c */
+enum {	/* consecutive indices starting at 0 */
+	 LPROC_LL_DIRTY_HITS = 0,
+	 LPROC_LL_DIRTY_MISSES,
+	 LPROC_LL_READ_BYTES,
+	 LPROC_LL_WRITE_BYTES,
+	 LPROC_LL_BRW_READ,
+	 LPROC_LL_BRW_WRITE,
+	 LPROC_LL_OSC_READ,
+	 LPROC_LL_OSC_WRITE,
+	 LPROC_LL_IOCTL,
+	 LPROC_LL_OPEN,
+	 LPROC_LL_RELEASE,
+	 LPROC_LL_MAP,
+	 LPROC_LL_LLSEEK,
+	 LPROC_LL_FSYNC,
+	 LPROC_LL_READDIR,
+	 LPROC_LL_SETATTR,
+	 LPROC_LL_TRUNC,
+	 LPROC_LL_FLOCK,
+	 LPROC_LL_GETATTR,
+	 LPROC_LL_CREATE,
+	 LPROC_LL_LINK,
+	 LPROC_LL_UNLINK,
+	 LPROC_LL_SYMLINK,
+	 LPROC_LL_MKDIR,
+	 LPROC_LL_RMDIR,
+	 LPROC_LL_MKNOD,
+	 LPROC_LL_RENAME,
+	 LPROC_LL_STAFS,	/* NOTE(review): presumably the statfs counter; the spelling is part of the identifier */
+	 LPROC_LL_ALLOC_INODE,
+	 LPROC_LL_SETXATTR,
+	 LPROC_LL_GETXATTR,
+	 LPROC_LL_LISTXATTR,
+	 LPROC_LL_REMOVEXATTR,
+	 LPROC_LL_INODE_PERM,
+	 LPROC_LL_FILE_OPCODES	/* last entry: its value equals the number of entries above it */
+};
+
+
+#endif

diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_log.h b/drivers/staging/lustre/lustre/include/linux/lustre_log.h
new file mode 100644
index 0000000..e9c8e56
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/linux/lustre_log.h

@@ -0,0 +1,57 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/linux/lustre_log.h
+ *
+ * Generic infrastructure for managing a collection of logs.
+ * These logs are used for:
+ *  - orphan recovery: OST adds record on create
+ *  - mtime/size consistency: the OST adds a record on first write
+ *  - open/unlinked objects: OST adds a record on destroy
+ *
+ *  - mds unlink log: the MDS adds an entry upon delete
+ *
+ *  - raid1 replication log between OST's
+ *  - MDS replication logs
+ */
+
+#ifndef _LINUX_LUSTRE_LOG_H
+#define _LINUX_LUSTRE_LOG_H
+
+#ifndef _LUSTRE_LOG_H
+#error Do not #include this file directly. #include <lustre_log.h> instead
+#endif
+
+#define LUSTRE_LOG_SERVER	/* valueless flag macro; NOTE(review): its consumers are outside this header */
+
+#endif

diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_net.h b/drivers/staging/lustre/lustre/include/linux/lustre_net.h
new file mode 100644
index 0000000..2d7c425
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/linux/lustre_net.h

@@ -0,0 +1,50 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LINUX_LUSTRE_NET_H
+#define _LINUX_LUSTRE_NET_H
+
+#ifndef _LUSTRE_NET_H
+#error Do not #include this file directly. #include <lustre_net.h> instead
+#endif
+
+#include <linux/version.h>
+#include <linux/workqueue.h>
+
+/* XXX Liang: should be moved to other header instead of here */
+#ifndef WITH_GROUP_INFO
+#define WITH_GROUP_INFO
+#endif
+
+#endif

diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h b/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h
new file mode 100644
index 0000000..f050808
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h

@@ -0,0 +1,85 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef LUSTRE_PATCHLESS_COMPAT_H
+#define LUSTRE_PATCHLESS_COMPAT_H
+
+#include <linux/fs.h>
+
+#include <linux/list.h>
+#include <linux/mm.h>
+#include <linux/hash.h>
+
+
+#define ll_delete_from_page_cache(page) delete_from_page_cache(page)	/* plain alias for delete_from_page_cache() */
+
+/*
+ * Drop @page from the page cache of @mapping.
+ *
+ * Nothing is done if the page does not belong to @mapping.  Otherwise, in
+ * this order: a page with PagePrivate set is passed to the mapping's
+ * invalidatepage op with offset 0, cancel_dirty_page() is called for
+ * PAGE_SIZE, the mapped-to-disk flag is cleared, and the page is removed
+ * with ll_delete_from_page_cache().
+ */
+static inline void
+truncate_complete_page(struct address_space *mapping, struct page *page)
+{
+	if (page->mapping == mapping) {
+		if (PagePrivate(page))
+			page->mapping->a_ops->invalidatepage(page, 0);
+
+		cancel_dirty_page(page, PAGE_SIZE);
+		ClearPageMappedToDisk(page);
+		ll_delete_from_page_cache(page);
+	}
+}
+
+#  define d_refcount(d)		 ((d)->d_count)	/* reads the d_count member of d */
+
+#ifdef ATTR_OPEN
+# define ATTR_FROM_OPEN ATTR_OPEN	/* ATTR_OPEN exists: ATTR_FROM_OPEN becomes an alias for it */
+#else
+# ifndef ATTR_FROM_OPEN
+#  define ATTR_FROM_OPEN 0	/* neither name exists: the flag becomes 0 */
+# endif
+#endif /* ATTR_OPEN */
+
+#ifndef ATTR_RAW
+#define ATTR_RAW 0	/* fallback of 0, used only if ATTR_RAW is not already defined */
+#endif
+
+#ifndef ATTR_CTIME_SET
+/*
+ * set ATTR_CTIME_SET to a high value to avoid any risk of collision with other
+ * ATTR_* attributes (see bug 13828)
+ */
+#define ATTR_CTIME_SET (1 << 28)
+#endif
+
+#endif /* LUSTRE_PATCHLESS_COMPAT_H */

diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_quota.h b/drivers/staging/lustre/lustre/include/linux/lustre_quota.h
new file mode 100644
index 0000000..421866b
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/linux/lustre_quota.h

@@ -0,0 +1,47 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LINUX_LUSTRE_QUOTA_H
+#define _LINUX_LUSTRE_QUOTA_H
+
+#ifndef _LUSTRE_QUOTA_H
+#error Do not #include this file directly. #include <lustre_quota.h> instead
+#endif
+
+#include <linux/version.h>
+#include <linux/fs.h>
+#include <linux/quota.h>
+#include <linux/quotaops.h>
+
+#endif /* _LINUX_LUSTRE_QUOTA_H */

diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_user.h b/drivers/staging/lustre/lustre/include/linux/lustre_user.h
new file mode 100644
index 0000000..ebaf929
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/linux/lustre_user.h

@@ -0,0 +1,67 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/linux/lustre_user.h
+ *
+ * Lustre public user-space interface definitions.
+ */
+
+#ifndef _LINUX_LUSTRE_USER_H
+#define _LINUX_LUSTRE_USER_H
+
+# include <linux/version.h>
+# include <linux/quota.h>
+
+/*
+ * asm-x86_64/processor.h on some SLES 9 distros seems to use
+ * kernel-only typedefs.  fortunately skipping it altogether is ok
+ * (for now).
+ */
+#define __ASM_X86_64_PROCESSOR_H
+
+#include <linux/string.h>
+
+#if defined(__x86_64__) || defined(__ia64__) || defined(__ppc64__) || \
+    defined(__craynv) || defined (__mips64__) || defined(__powerpc64__)
+typedef struct stat     lstat_t;	/* listed architectures: struct stat with lstat */
+#define lstat_f	 lstat
+#define HAVE_LOV_USER_MDS_DATA	/* defined identically in both branches */
+#else
+typedef struct stat64   lstat_t;	/* all other targets: struct stat64 with lstat64 */
+#define lstat_f	 lstat64
+#define HAVE_LOV_USER_MDS_DATA
+#endif
+
+#endif /* _LINUX_LUSTRE_USER_H */

diff --git a/drivers/staging/lustre/lustre/include/linux/lvfs.h b/drivers/staging/lustre/lustre/include/linux/lvfs.h
new file mode 100644
index 0000000..b4db6cb
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/linux/lvfs.h

@@ -0,0 +1,134 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/linux/lvfs.h
+ *
+ * lustre VFS/process permission interface
+ */
+
+#ifndef __LINUX_LVFS_H__
+#define __LINUX_LVFS_H__
+
+#ifndef __LVFS_H__
+#error Do not #include this file directly. #include <lvfs.h> instead
+#endif
+
+#include <linux/lustre_compat25.h>
+#include <linux/lustre_common.h>
+#include <linux/lvfs_linux.h>
+
+#define LLOG_LVFS
+
+/* simple.c */
+
+struct lvfs_ucred {
+	__u32		   luc_uid;
+	__u32		   luc_gid;
+	__u32		   luc_fsuid;
+	__u32		   luc_fsgid;
+	kernel_cap_t	luc_cap;
+	__u32		   luc_umask;
+	struct group_info      *luc_ginfo;
+	struct md_identity     *luc_identity;
+};
+
+struct lvfs_callback_ops {
+	struct dentry *(*l_fid2dentry)(__u64 id_ino, __u32 gen, __u64 gr, void *data);
+};
+
+#define OBD_RUN_CTXT_MAGIC      0xC0FFEEAA
+#define OBD_CTXT_DEBUG	  /* development-only debugging */
+struct lvfs_run_ctxt {
+	struct vfsmount	 *pwdmnt;
+	struct dentry	   *pwd;
+	mm_segment_t	     fs;
+	struct lvfs_ucred	luc;
+	int		      ngroups;
+	struct lvfs_callback_ops cb_ops;
+	struct group_info       *group_info;
+	struct dt_device	*dt;
+#ifdef OBD_CTXT_DEBUG
+	__u32		    magic;
+#endif
+};
+
+#ifdef OBD_CTXT_DEBUG	/* both variants expand to exactly one statement */
+#define OBD_SET_CTXT_MAGIC(ctxt) \
+	do { (ctxt)->magic = OBD_RUN_CTXT_MAGIC; } while (0)
+#else /* !OBD_CTXT_DEBUG */
+#define OBD_SET_CTXT_MAGIC(ctxt) do {} while (0)
+#endif /* OBD_CTXT_DEBUG */
+
+int lustre_rename(struct dentry *dir, struct vfsmount *mnt, char *oldname,
+		  char *newname);
+
+static inline void l_dput(struct dentry *de)
+{
+	if (!de || IS_ERR(de))
+		return;
+	/* NOTE(review): shrink_dcache_parent(de) call is disabled; confirm */
+	LASSERT(d_refcount(de) > 0);
+	dput(de);
+}
+
+/* We need to hold the inode semaphore over the dcache lookup itself, or we
+ * run the risk of entering the filesystem lookup path concurrently on SMP
+ * systems, and instantiating two inodes for the same entry.  We still
+ * protect against concurrent addition/removal races with the DLM locking.
+ */
+static inline struct dentry *ll_lookup_one_len(const char *fid_name,
+					       struct dentry *dparent,
+					       int fid_namelen)
+{
+	struct dentry *found;
+
+	mutex_lock(&dparent->d_inode->i_mutex);
+	found = lookup_one_len(fid_name, dparent, fid_namelen);
+	mutex_unlock(&dparent->d_inode->i_mutex);
+
+	/* errors and negative dentries go back to the caller untouched */
+	if (IS_ERR(found) || found->d_inode == NULL)
+		return found;
+	if (!is_bad_inode(found->d_inode))
+		return found;
+
+	CERROR("bad inode returned %lu/%u\n",
+	       found->d_inode->i_ino, found->d_inode->i_generation);
+	dput(found);
+	return ERR_PTR(-ENOENT);
+}
+
+
+#endif

diff --git a/drivers/staging/lustre/lustre/include/linux/lvfs_linux.h b/drivers/staging/lustre/lustre/include/linux/lvfs_linux.h
new file mode 100644
index 0000000..140a60f
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/linux/lvfs_linux.h

@@ -0,0 +1,66 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LVFS_LINUX_H__
+#define __LVFS_LINUX_H__
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/sched.h>
+
+#include <lvfs.h>
+
+#define l_file file
+#define l_dentry dentry
+
+#define l_filp_open filp_open
+
+struct lvfs_run_ctxt;
+struct l_file *l_dentry_open(struct lvfs_run_ctxt *, struct l_dentry *,
+			     int flags);
+
+struct l_linux_dirent {
+	struct list_head      lld_list;
+	ino_t	   lld_ino;
+	unsigned long   lld_off;
+	char	    lld_name[LL_FID_NAMELEN];
+};
+struct l_readdir_callback {
+	struct l_linux_dirent *lrc_dirent;
+	struct list_head	    *lrc_list;
+};
+
+#endif /*  __LVFS_LINUX_H__ */

diff --git a/drivers/staging/lustre/lustre/include/linux/obd.h b/drivers/staging/lustre/lustre/include/linux/obd.h
new file mode 100644
index 0000000..2c36c0d
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/linux/obd.h

@@ -0,0 +1,128 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LINUX_OBD_H
+#define __LINUX_OBD_H
+
+#ifndef __OBD_H
+#error Do not #include this file directly. #include <obd.h> instead
+#endif
+
+#include <obd_support.h>
+
+# include <linux/fs.h>
+# include <linux/list.h>
+# include <linux/sched.h>  /* for struct task_struct, for current.h */
+# include <linux/proc_fs.h>
+# include <linux/mount.h>
+# include <linux/lustre_intent.h>
+
+struct ll_iattr {
+	struct iattr	iattr;
+	unsigned int	ia_attr_flags;
+};
+
+#define CLIENT_OBD_LIST_LOCK_DEBUG 1
+
+typedef struct {
+	spinlock_t		lock;
+
+	unsigned long       time;
+	struct task_struct *task;
+	const char	 *func;
+	int		 line;
+} client_obd_lock_t;
+
+static inline void __client_obd_list_lock(client_obd_lock_t *lock,
+					  const char *func, int line)
+{
+	unsigned long cur = jiffies;
+	while (1) {
+		if (spin_trylock(&lock->lock)) {
+			LASSERT(lock->task == NULL);
+			lock->task = current;
+			lock->func = func;
+			lock->line = line;
+			lock->time = jiffies;
+			break;
+		}
+
+		if ((jiffies - cur > 5 * HZ) &&
+		    (jiffies - lock->time > 5 * HZ)) {
+			struct task_struct *task = lock->task;
+
+			if (task == NULL)
+				continue;
+
+			LCONSOLE_WARN("%s:%d: lock %p was acquired"
+				      " by <%s:%d:%s:%d> for %lu seconds.\n",
+				      current->comm, current->pid,
+				      lock, task->comm, task->pid,
+				      lock->func, lock->line,
+				      (jiffies - lock->time) / HZ);
+			LCONSOLE_WARN("====== for process holding the "
+				      "lock =====\n");
+			libcfs_debug_dumpstack(task);
+			LCONSOLE_WARN("====== for current process =====\n");
+			libcfs_debug_dumpstack(NULL);
+			LCONSOLE_WARN("====== end =======\n");
+			cfs_pause(1000 * HZ);
+		}
+		cpu_relax();
+	}
+}
+
+#define client_obd_list_lock(lock) \
+	__client_obd_list_lock(lock, __FUNCTION__, __LINE__)
+
+static inline void client_obd_list_unlock(client_obd_lock_t *lock)
+{
+	LASSERT(lock->task != NULL);
+	lock->task = NULL;
+	lock->time = jiffies;
+	spin_unlock(&lock->lock);
+}
+
+static inline void client_obd_list_lock_init(client_obd_lock_t *lock)
+{
+	spin_lock_init(&lock->lock);
+	lock->task = NULL;	/* lock path LASSERTs that no owner is recorded */
+}
+
+static inline void client_obd_list_lock_done(client_obd_lock_t *lock)
+{}
+
+#endif /* __LINUX_OBD_H */

diff --git a/drivers/staging/lustre/lustre/include/linux/obd_class.h b/drivers/staging/lustre/lustre/include/linux/obd_class.h
new file mode 100644
index 0000000..021ead6
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/linux/obd_class.h

@@ -0,0 +1,58 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LINUX_CLASS_OBD_H
+#define __LINUX_CLASS_OBD_H
+
+#ifndef __CLASS_OBD_H
+#error Do not #include this file directly. #include <obd_class.h> instead
+#endif
+
+#include <asm/uaccess.h>
+#include <linux/types.h>
+#include <linux/fs.h>
+#include <linux/time.h>
+#include <linux/timer.h>
+
+/* obdo.c */
+void obdo_from_la(struct obdo *dst, struct lu_attr *la, __u64 valid);
+void la_from_obdo(struct lu_attr *la, struct obdo *dst, obd_flag valid);
+void obdo_refresh_inode(struct inode *dst, struct obdo *src, obd_flag valid);
+void obdo_to_inode(struct inode *dst, struct obdo *src, obd_flag valid);
+#define ll_inode_flags(inode)	 (inode->i_flags)
+
+
+#endif /* __LINUX_CLASS_OBD_H */

diff --git a/drivers/staging/lustre/lustre/include/linux/obd_support.h b/drivers/staging/lustre/lustre/include/linux/obd_support.h
new file mode 100644
index 0000000..9166503
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/linux/obd_support.h

@@ -0,0 +1,63 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LINUX_OBD_SUPPORT
+#define _LINUX_OBD_SUPPORT
+
+#ifndef _OBD_SUPPORT
+#error Do not #include this file directly. #include <obd_support.h> instead
+#endif
+
+#ifdef CONFIG_X86
+#include <asm/cpufeature.h>
+#endif
+#include <asm/processor.h>
+#include <linux/seq_file.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/swap.h>
+#include <linux/lustre_compat25.h>
+#include <linux/lustre_common.h>
+#include <linux/libcfs/libcfs.h>
+#include <lustre/lustre_idl.h>
+
+
+# include <linux/types.h>
+# include <linux/blkdev.h>
+# include <lvfs.h>
+
+#endif

diff --git a/drivers/staging/lustre/lustre/include/lprocfs_status.h b/drivers/staging/lustre/lustre/include/lprocfs_status.h
new file mode 100644
index 0000000..e770d02
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lprocfs_status.h

@@ -0,0 +1,1043 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lprocfs_status.h
+ *
+ * Top level header file for LProc SNMP
+ *
+ * Author: Hariharan Thantry thantry@users.sourceforge.net
+ */
+#ifndef _LPROCFS_SNMP_H
+#define _LPROCFS_SNMP_H
+
+#include <linux/lprocfs_status.h>
+#include <lustre/lustre_idl.h>
+#include <linux/libcfs/params_tree.h>
+
+struct lprocfs_vars {
+	const char		*name;
+	struct file_operations	*fops;
+	void			*data;
+	/**
+	 * /proc file mode.
+	 */
+	mode_t			proc_mode;
+};
+
+struct lprocfs_static_vars {
+	struct lprocfs_vars *module_vars;
+	struct lprocfs_vars *obd_vars;
+};
+
+/* if we find more consumers this could be generalized */
+#define OBD_HIST_MAX 32
+struct obd_histogram {
+	spinlock_t	oh_lock;
+	unsigned long	oh_buckets[OBD_HIST_MAX];
+};
+
+enum {
+	BRW_R_PAGES = 0,
+	BRW_W_PAGES,
+	BRW_R_RPC_HIST,
+	BRW_W_RPC_HIST,
+	BRW_R_IO_TIME,
+	BRW_W_IO_TIME,
+	BRW_R_DISCONT_PAGES,
+	BRW_W_DISCONT_PAGES,
+	BRW_R_DISCONT_BLOCKS,
+	BRW_W_DISCONT_BLOCKS,
+	BRW_R_DISK_IOSIZE,
+	BRW_W_DISK_IOSIZE,
+	BRW_R_DIO_FRAGS,
+	BRW_W_DIO_FRAGS,
+	BRW_LAST,
+};
+
+struct brw_stats {
+	struct obd_histogram hist[BRW_LAST];
+};
+
+enum {
+	RENAME_SAMEDIR_SIZE = 0,
+	RENAME_CROSSDIR_SRC_SIZE,
+	RENAME_CROSSDIR_TGT_SIZE,
+	RENAME_LAST,
+};
+
+struct rename_stats {
+	struct obd_histogram hist[RENAME_LAST];
+};
+
+/* An lprocfs counter can be configured using the enum bit masks below.
+ *
+ * LPROCFS_CNTR_EXTERNALLOCK indicates that an external lock already
+ * protects this counter from concurrent updates. If not specified,
+ * lprocfs uses an internal per-counter lock variable. External locks are
+ * not used to protect counter increments, but are used to protect
+ * counter readout and resets.
+ *
+ * LPROCFS_CNTR_AVGMINMAX indicates multi-valued counter samples
+ * (i.e. counter can be incremented by more than "1"). When specified,
+ * the counter maintains min, max and sum in addition to a simple
+ * invocation count. This allows averages to be computed.
+ * If not specified, the counter is an increment-by-1 counter.
+ * min, max, sum, etc. are not maintained.
+ *
+ * LPROCFS_CNTR_STDDEV indicates that the counter should track sum of
+ * squares (for multi-valued counter samples only). This allows
+ * external computation of standard deviation, but involves a 64-bit
+ * multiply per counter increment.
+ */
+
+enum {
+	LPROCFS_CNTR_EXTERNALLOCK = 0x0001,
+	LPROCFS_CNTR_AVGMINMAX    = 0x0002,
+	LPROCFS_CNTR_STDDEV       = 0x0004,
+
+	/* counter data type */
+	LPROCFS_TYPE_REGS	 = 0x0100,
+	LPROCFS_TYPE_BYTES	= 0x0200,
+	LPROCFS_TYPE_PAGES	= 0x0400,
+	LPROCFS_TYPE_CYCLE	= 0x0800,
+};
+
+#define LC_MIN_INIT ((~(__u64)0) >> 1)
+
+struct lprocfs_counter_header {
+	unsigned int		lc_config;
+	const char		*lc_name;   /* must be static */
+	const char		*lc_units;  /* must be static */
+};
+
+struct lprocfs_counter {
+	__s64	lc_count;
+	__s64	lc_min;
+	__s64	lc_max;
+	__s64	lc_sumsquare;
+	/*
+	 * Every counter has lc_array_sum[0], while lc_array_sum[1] is only
+	 * for irq context counter, i.e. stats with
+	 * LPROCFS_STATS_FLAG_IRQ_SAFE flag, its counter need
+	 * lc_array_sum[1]
+	 */
+	__s64	lc_array_sum[1];
+};
+#define lc_sum		lc_array_sum[0]
+#define lc_sum_irq	lc_array_sum[1]
+
+struct lprocfs_percpu {
+#ifndef __GNUC__
+	__s64			pad;
+#endif
+	struct lprocfs_counter lp_cntr[0];
+};
+
+#define LPROCFS_GET_NUM_CPU 0x0001
+#define LPROCFS_GET_SMP_ID  0x0002
+
+enum lprocfs_stats_flags {
+	LPROCFS_STATS_FLAG_NONE     = 0x0000, /* per cpu counter */
+	LPROCFS_STATS_FLAG_NOPERCPU = 0x0001, /* stats have no percpu
+					       * area and need locking */
+	LPROCFS_STATS_FLAG_IRQ_SAFE = 0x0002, /* alloc need irq safe */
+};
+
+enum lprocfs_fields_flags {
+	LPROCFS_FIELDS_FLAGS_CONFIG     = 0x0001,
+	LPROCFS_FIELDS_FLAGS_SUM	= 0x0002,
+	LPROCFS_FIELDS_FLAGS_MIN	= 0x0003,
+	LPROCFS_FIELDS_FLAGS_MAX	= 0x0004,
+	LPROCFS_FIELDS_FLAGS_AVG	= 0x0005,
+	LPROCFS_FIELDS_FLAGS_SUMSQUARE  = 0x0006,
+	LPROCFS_FIELDS_FLAGS_COUNT      = 0x0007,
+};
+
+struct lprocfs_stats {
+	/* # of counters */
+	unsigned short			ls_num;
+	/* 1 + the biggest cpu # whose ls_percpu slot has been allocated */
+	unsigned short			ls_biggest_alloc_num;
+	enum lprocfs_stats_flags	ls_flags;
+	/* Lock used when there are no percpu stats areas; For percpu stats,
+	 * it is used to protect ls_biggest_alloc_num change */
+	spinlock_t			ls_lock;
+
+	/* has ls_num of counter headers */
+	struct lprocfs_counter_header	*ls_cnt_header;
+	struct lprocfs_percpu		*ls_percpu[0];
+};
+
+#define OPC_RANGE(seg) (seg ## _LAST_OPC - seg ## _FIRST_OPC)
+
+/* Pack all opcodes down into a single monotonically increasing index */
+static inline int opcode_offset(__u32 opc) {
+	if (opc < OST_LAST_OPC) {
+		 /* OST opcode */
+		return (opc - OST_FIRST_OPC);
+	} else if (opc < MDS_LAST_OPC) {
+		/* MDS opcode */
+		return (opc - MDS_FIRST_OPC +
+			OPC_RANGE(OST));
+	} else if (opc < LDLM_LAST_OPC) {
+		/* LDLM Opcode */
+		return (opc - LDLM_FIRST_OPC +
+			OPC_RANGE(MDS) +
+			OPC_RANGE(OST));
+	} else if (opc < MGS_LAST_OPC) {
+		/* MGS Opcode */
+		return (opc - MGS_FIRST_OPC +
+			OPC_RANGE(LDLM) +
+			OPC_RANGE(MDS) +
+			OPC_RANGE(OST));
+	} else if (opc < OBD_LAST_OPC) {
+		/* OBD Ping */
+		return (opc - OBD_FIRST_OPC +
+			OPC_RANGE(MGS) +
+			OPC_RANGE(LDLM) +
+			OPC_RANGE(MDS) +
+			OPC_RANGE(OST));
+	} else if (opc < LLOG_LAST_OPC) {
+		/* LLOG Opcode */
+		return (opc - LLOG_FIRST_OPC +
+			OPC_RANGE(OBD) +
+			OPC_RANGE(MGS) +
+			OPC_RANGE(LDLM) +
+			OPC_RANGE(MDS) +
+			OPC_RANGE(OST));
+	} else if (opc < QUOTA_LAST_OPC) {
+		/* LQUOTA Opcode */
+		return (opc - QUOTA_FIRST_OPC +
+			OPC_RANGE(LLOG) +
+			OPC_RANGE(OBD) +
+			OPC_RANGE(MGS) +
+			OPC_RANGE(LDLM) +
+			OPC_RANGE(MDS) +
+			OPC_RANGE(OST));
+	} else if (opc < SEQ_LAST_OPC) {
+		/* SEQ opcode */
+		return (opc - SEQ_FIRST_OPC +
+			OPC_RANGE(QUOTA) +
+			OPC_RANGE(LLOG) +
+			OPC_RANGE(OBD) +
+			OPC_RANGE(MGS) +
+			OPC_RANGE(LDLM) +
+			OPC_RANGE(MDS) +
+			OPC_RANGE(OST));
+	} else if (opc < SEC_LAST_OPC) {
+		/* SEC opcode */
+		return (opc - SEC_FIRST_OPC +
+			OPC_RANGE(SEQ) +
+			OPC_RANGE(QUOTA) +
+			OPC_RANGE(LLOG) +
+			OPC_RANGE(OBD) +
+			OPC_RANGE(MGS) +
+			OPC_RANGE(LDLM) +
+			OPC_RANGE(MDS) +
+			OPC_RANGE(OST));
+	} else if (opc < FLD_LAST_OPC) {
+		/* FLD opcode */
+		 return (opc - FLD_FIRST_OPC +
+			OPC_RANGE(SEC) +
+			OPC_RANGE(SEQ) +
+			OPC_RANGE(QUOTA) +
+			OPC_RANGE(LLOG) +
+			OPC_RANGE(OBD) +
+			OPC_RANGE(MGS) +
+			OPC_RANGE(LDLM) +
+			OPC_RANGE(MDS) +
+			OPC_RANGE(OST));
+	} else if (opc < UPDATE_LAST_OPC) {
+		/* update opcode */
+		return (opc - UPDATE_FIRST_OPC +
+			OPC_RANGE(FLD) +
+			OPC_RANGE(SEC) +
+			OPC_RANGE(SEQ) +
+			OPC_RANGE(QUOTA) +
+			OPC_RANGE(LLOG) +
+			OPC_RANGE(OBD) +
+			OPC_RANGE(MGS) +
+			OPC_RANGE(LDLM) +
+			OPC_RANGE(MDS) +
+			OPC_RANGE(OST));
+	} else {
+		/* Unknown Opcode */
+		return -1;
+	}
+}
+
+/* Total opcode slots: sums the same segments, in order, as opcode_offset(). */
+#define LUSTRE_MAX_OPCODES (OPC_RANGE(OST)  + \
+			    OPC_RANGE(MDS)  + \
+			    OPC_RANGE(LDLM) + \
+			    OPC_RANGE(MGS)  + \
+			    OPC_RANGE(OBD)  + \
+			    OPC_RANGE(LLOG) + \
+			    OPC_RANGE(QUOTA) + \
+			    OPC_RANGE(SEQ)  + \
+			    OPC_RANGE(SEC)  + \
+			    OPC_RANGE(FLD)  + \
+			    OPC_RANGE(UPDATE))
+
+#define EXTRA_MAX_OPCODES ((PTLRPC_LAST_CNTR - PTLRPC_FIRST_CNTR)  + \
+			    OPC_RANGE(EXTRA))
+
+enum {
+	PTLRPC_REQWAIT_CNTR = 0,
+	PTLRPC_REQQDEPTH_CNTR,
+	PTLRPC_REQACTIVE_CNTR,
+	PTLRPC_TIMEOUT,
+	PTLRPC_REQBUF_AVAIL_CNTR,
+	PTLRPC_LAST_CNTR
+};
+
+#define PTLRPC_FIRST_CNTR PTLRPC_REQWAIT_CNTR
+
+enum {
+	LDLM_GLIMPSE_ENQUEUE = 0,
+	LDLM_PLAIN_ENQUEUE,
+	LDLM_EXTENT_ENQUEUE,
+	LDLM_FLOCK_ENQUEUE,
+	LDLM_IBITS_ENQUEUE,
+	MDS_REINT_SETATTR,
+	MDS_REINT_CREATE,
+	MDS_REINT_LINK,
+	MDS_REINT_UNLINK,
+	MDS_REINT_RENAME,
+	MDS_REINT_OPEN,
+	MDS_REINT_SETXATTR,
+	BRW_READ_BYTES,
+	BRW_WRITE_BYTES,
+	EXTRA_LAST_OPC
+};
+
+#define EXTRA_FIRST_OPC LDLM_GLIMPSE_ENQUEUE
+/* class_obd.c */
+extern proc_dir_entry_t *proc_lustre_root;
+
+struct obd_device;
+struct obd_histogram;
+
+/* Days / hours / mins / seconds format */
+struct dhms {
+	int d,h,m,s;
+};
+static inline void s2dhms(struct dhms *ts, time_t secs)
+{
+	time_t rem = secs % 86400;	/* seconds left after whole days */
+
+	ts->d = secs / 86400;
+	ts->h = rem / 3600;
+	ts->m = (rem % 3600) / 60;
+	ts->s = (rem % 3600) % 60;
+}
+#define DHMS_FMT "%dd%dh%02dm%02ds"
+#define DHMS_VARS(x) (x)->d, (x)->h, (x)->m, (x)->s
+
+#define JOBSTATS_JOBID_VAR_MAX_LEN	20
+#define JOBSTATS_DISABLE		"disable"
+#define JOBSTATS_PROCNAME_UID		"procname_uid"
+
+typedef void (*cntr_init_callback)(struct lprocfs_stats *stats);
+
+struct obd_job_stats {
+	cfs_hash_t	*ojs_hash;
+	struct list_head	 ojs_list;
+	rwlock_t       ojs_lock; /* presumably protects ojs_list; confirm */
+	cntr_init_callback ojs_cntr_init_fn;
+	int		ojs_cntr_num;
+	int		ojs_cleanup_interval;
+	time_t		   ojs_last_cleanup;
+};
+
+#ifdef LPROCFS
+
+extern int lprocfs_stats_alloc_one(struct lprocfs_stats *stats,
+				   unsigned int cpuid);
+/*
+ * \return value
+ *      < 0     : on error (only possible for opc as LPROCFS_GET_SMP_ID)
+ */
+static inline int lprocfs_stats_lock(struct lprocfs_stats *stats, int opc,
+				     unsigned long *flags)
+{
+	int		rc = 0;
+
+	switch (opc) {
+	default:
+		LBUG();
+
+	case LPROCFS_GET_SMP_ID:
+		if (stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) {
+			if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE)
+				spin_lock_irqsave(&stats->ls_lock, *flags);
+			else
+				spin_lock(&stats->ls_lock);
+			return 0;
+		} else {
+			unsigned int cpuid = get_cpu();
+
+			if (unlikely(stats->ls_percpu[cpuid] == NULL)) {
+				rc = lprocfs_stats_alloc_one(stats, cpuid);
+				if (rc < 0) {
+					put_cpu();
+					return rc;
+				}
+			}
+			return cpuid;
+		}
+
+	case LPROCFS_GET_NUM_CPU:
+		if (stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) {
+			if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE)
+				spin_lock_irqsave(&stats->ls_lock, *flags);
+			else
+				spin_lock(&stats->ls_lock);
+			return 1;
+		} else {
+			return stats->ls_biggest_alloc_num;
+		}
+	}
+}
+
+static inline void lprocfs_stats_unlock(struct lprocfs_stats *stats, int opc,
+					unsigned long *flags)
+{
+	switch (opc) {
+	default:
+		LBUG();
+
+	case LPROCFS_GET_SMP_ID:
+		if (stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) {
+			if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) {
+				spin_unlock_irqrestore(&stats->ls_lock,
+							   *flags);
+			} else {
+				spin_unlock(&stats->ls_lock);
+			}
+		} else {
+			put_cpu();
+		}
+		return;
+
+	case LPROCFS_GET_NUM_CPU:
+		if (stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) {
+			if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) {
+				spin_unlock_irqrestore(&stats->ls_lock,
+							   *flags);
+			} else {
+				spin_unlock(&stats->ls_lock);
+			}
+		}
+		return;
+	}
+}
+
+static inline unsigned int
+lprocfs_stats_counter_size(struct lprocfs_stats *stats)
+{
+	unsigned int percpusize;
+
+	percpusize = offsetof(struct lprocfs_percpu, lp_cntr[stats->ls_num]);
+
+	/* irq safe stats need lc_array_sum[1] */
+	if ((stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0)
+		percpusize += stats->ls_num * sizeof(__s64);
+
+	if ((stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) == 0)
+		percpusize = L1_CACHE_ALIGN(percpusize);
+
+	return percpusize;
+}
+
+static inline struct lprocfs_counter *
+lprocfs_stats_counter_get(struct lprocfs_stats *stats, unsigned int cpuid,
+			  int index)
+{
+	struct lprocfs_counter *cntr;
+
+	cntr = &stats->ls_percpu[cpuid]->lp_cntr[index];
+
+	if ((stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0)
+		cntr = (void *)cntr + index * sizeof(__s64);
+
+	return cntr;
+}
+
+/* Two optimized LPROCFS counter increment functions are provided:
+ *     lprocfs_counter_incr(stats, idx) - optimized for by-one counters
+ *     lprocfs_counter_add(stats, idx, amount) - use for multi-valued counters
+ * Counter data layout allows config flag, counter lock and the
+ * count itself to reside within a single cache line.
+ */
+
+extern void lprocfs_counter_add(struct lprocfs_stats *stats, int idx,
+				long amount);
+extern void lprocfs_counter_sub(struct lprocfs_stats *stats, int idx,
+				long amount);
+
+#define lprocfs_counter_incr(stats, idx) \
+	lprocfs_counter_add(stats, idx, 1)
+#define lprocfs_counter_decr(stats, idx) \
+	lprocfs_counter_sub(stats, idx, 1)
+
+extern __s64 lprocfs_read_helper(struct lprocfs_counter *lc,
+				 struct lprocfs_counter_header *header,
+				 enum lprocfs_stats_flags flags,
+				 enum lprocfs_fields_flags field);
+static inline __u64 lprocfs_stats_collector(struct lprocfs_stats *stats,
+					    int idx,
+					    enum lprocfs_fields_flags field)
+{
+	struct lprocfs_counter *cntr;
+	unsigned long		irq_flags = 0;
+	unsigned int		ncpus;
+	unsigned int		cpu;
+	__u64			total = 0;
+
+	LASSERT(stats != NULL);
+
+	/* sum the requested field over every per-cpu slot that exists */
+	ncpus = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU, &irq_flags);
+	for (cpu = 0; cpu < ncpus; cpu++) {
+		if (stats->ls_percpu[cpu] == NULL)
+			continue;
+		cntr = lprocfs_stats_counter_get(stats, cpu, idx);
+		total += lprocfs_read_helper(cntr, &stats->ls_cnt_header[idx],
+					     stats->ls_flags, field);
+	}
+	lprocfs_stats_unlock(stats, LPROCFS_GET_NUM_CPU, &irq_flags);
+	return total;
+}
+
+extern struct lprocfs_stats *
+lprocfs_alloc_stats(unsigned int num, enum lprocfs_stats_flags flags);
+extern void lprocfs_clear_stats(struct lprocfs_stats *stats);
+extern void lprocfs_free_stats(struct lprocfs_stats **stats);
+extern void lprocfs_init_ops_stats(int num_private_stats,
+				   struct lprocfs_stats *stats);
+extern void lprocfs_init_mps_stats(int num_private_stats,
+				   struct lprocfs_stats *stats);
+extern void lprocfs_init_ldlm_stats(struct lprocfs_stats *ldlm_stats);
+extern int lprocfs_alloc_obd_stats(struct obd_device *obddev,
+				   unsigned int num_private_stats);
+extern int lprocfs_alloc_md_stats(struct obd_device *obddev,
+				  unsigned int num_private_stats);
+extern void lprocfs_counter_init(struct lprocfs_stats *stats, int index,
+				 unsigned conf, const char *name,
+				 const char *units);
+extern void lprocfs_free_obd_stats(struct obd_device *obddev);
+extern void lprocfs_free_md_stats(struct obd_device *obddev);
+struct obd_export;
+struct nid_stat;
+extern int lprocfs_add_clear_entry(struct obd_device * obd,
+				   proc_dir_entry_t *entry);
+extern int lprocfs_exp_setup(struct obd_export *exp,
+			     lnet_nid_t *peer_nid, int *newnid);
+extern int lprocfs_exp_cleanup(struct obd_export *exp);
+extern proc_dir_entry_t *lprocfs_add_simple(struct proc_dir_entry *root,
+						char *name,
+						void *data,
+						struct file_operations *fops);
+extern struct proc_dir_entry *
+lprocfs_add_symlink(const char *name, struct proc_dir_entry *parent,
+		    const char *format, ...);
+extern void lprocfs_free_per_client_stats(struct obd_device *obd);
+extern int
+lprocfs_nid_stats_clear_write(struct file *file, const char *buffer,
+			      unsigned long count, void *data);
+extern int lprocfs_nid_stats_clear_read(struct seq_file *m, void *data);
+
+extern int lprocfs_register_stats(proc_dir_entry_t *root, const char *name,
+				  struct lprocfs_stats *stats);
+
+/* lprocfs_status.c */
+extern int lprocfs_add_vars(proc_dir_entry_t *root,
+			    struct lprocfs_vars *var,
+			    void *data);
+
+extern proc_dir_entry_t *lprocfs_register(const char *name,
+					      proc_dir_entry_t *parent,
+					      struct lprocfs_vars *list,
+					      void *data);
+
+extern void lprocfs_remove(proc_dir_entry_t **root);
+extern void lprocfs_remove_proc_entry(const char *name,
+				      struct proc_dir_entry *parent);
+
+extern int lprocfs_obd_setup(struct obd_device *obd, struct lprocfs_vars *list);
+extern int lprocfs_obd_cleanup(struct obd_device *obd);
+
+extern int lprocfs_seq_create(proc_dir_entry_t *parent, const char *name,
+			      mode_t mode,
+			      const struct file_operations *seq_fops,
+			      void *data);
+extern int lprocfs_obd_seq_create(struct obd_device *dev, const char *name,
+				  mode_t mode,
+				  const struct file_operations *seq_fops,
+				  void *data);
+
+/* Generic callbacks */
+
+extern int lprocfs_rd_u64(struct seq_file *m, void *data);
+extern int lprocfs_rd_atomic(struct seq_file *m, void *data);
+extern int lprocfs_wr_atomic(struct file *file, const char *buffer,
+			     unsigned long count, void *data);
+extern int lprocfs_rd_uint(struct seq_file *m, void *data);
+extern int lprocfs_wr_uint(struct file *file, const char *buffer,
+			   unsigned long count, void *data);
+extern int lprocfs_rd_uuid(struct seq_file *m, void *data);
+extern int lprocfs_rd_name(struct seq_file *m, void *data);
+extern int lprocfs_rd_server_uuid(struct seq_file *m, void *data);
+extern int lprocfs_rd_conn_uuid(struct seq_file *m, void *data);
+extern int lprocfs_rd_import(struct seq_file *m, void *data);
+extern int lprocfs_rd_state(struct seq_file *m, void *data);
+extern int lprocfs_rd_connect_flags(struct seq_file *m, void *data);
+extern int lprocfs_rd_num_exports(struct seq_file *m, void *data);
+extern int lprocfs_rd_numrefs(struct seq_file *m, void *data);
+
+struct adaptive_timeout;
+extern int lprocfs_at_hist_helper(struct seq_file *m,
+				  struct adaptive_timeout *at);
+extern int lprocfs_rd_timeouts(struct seq_file *m, void *data);
+extern int lprocfs_wr_timeouts(struct file *file, const char *buffer,
+			       unsigned long count, void *data);
+extern int lprocfs_wr_evict_client(struct file *file, const char *buffer,
+			    size_t count, loff_t *off);
+extern int lprocfs_wr_ping(struct file *file, const char *buffer,
+			   size_t count, loff_t *off);
+extern int lprocfs_wr_import(struct file *file, const char *buffer,
+		      size_t count, loff_t *off);
+extern int lprocfs_rd_pinger_recov(struct seq_file *m, void *n);
+extern int lprocfs_wr_pinger_recov(struct file *file, const char *buffer,
+				   size_t count, loff_t *off);
+
+/* Statfs helpers */
+extern int lprocfs_rd_blksize(struct seq_file *m, void *data);
+extern int lprocfs_rd_kbytestotal(struct seq_file *m, void *data);
+extern int lprocfs_rd_kbytesfree(struct seq_file *m, void *data);
+extern int lprocfs_rd_kbytesavail(struct seq_file *m, void *data);
+extern int lprocfs_rd_filestotal(struct seq_file *m, void *data);
+extern int lprocfs_rd_filesfree(struct seq_file *m, void *data);
+
+extern int lprocfs_write_helper(const char *buffer, unsigned long count,
+				int *val);
+extern int lprocfs_write_frac_helper(const char *buffer, unsigned long count,
+				     int *val, int mult);
+extern int lprocfs_seq_read_frac_helper(struct seq_file *m, long val, int mult);
+extern int lprocfs_read_frac_helper(char *buffer, unsigned long count,
+				    long val, int mult);
+extern int lprocfs_write_u64_helper(const char *buffer, unsigned long count,
+				    __u64 *val);
+extern int lprocfs_write_frac_u64_helper(const char *buffer,
+					 unsigned long count,
+					 __u64 *val, int mult);
+char *lprocfs_find_named_value(const char *buffer, const char *name,
+				unsigned long *count);
+void lprocfs_oh_tally(struct obd_histogram *oh, unsigned int value);
+void lprocfs_oh_tally_log2(struct obd_histogram *oh, unsigned int value);
+void lprocfs_oh_clear(struct obd_histogram *oh);
+unsigned long lprocfs_oh_sum(struct obd_histogram *oh);
+
+void lprocfs_stats_collect(struct lprocfs_stats *stats, int idx,
+			   struct lprocfs_counter *cnt);
+
+extern int lprocfs_single_release(cfs_inode_t *, struct file *);
+extern int lprocfs_seq_release(cfs_inode_t *, struct file *);
+
+/*
+ * You must use these macros when you want to refer to the import in a
+ * client obd_device for a lprocfs entry.
+ *
+ * LPROCFS_CLIMP_CHECK() calls down_read() on (obd)->u.cli.cl_sem and, when
+ * cl_import is NULL, calls up_read() again and makes the *calling function*
+ * return -ENODEV.  It can therefore only be used in functions that return
+ * an integer error code.  On the non-returning path the semaphore is left
+ * held and must be released with LPROCFS_CLIMP_EXIT().
+ *
+ * Neither macro ends in a semicolon: the caller writes it, so both behave
+ * as single statements inside unbraced if/else bodies.
+ */
+#define LPROCFS_CLIMP_CHECK(obd) do {			\
+	typecheck(struct obd_device *, obd);		\
+	down_read(&(obd)->u.cli.cl_sem);		\
+	if ((obd)->u.cli.cl_import == NULL) {		\
+		up_read(&(obd)->u.cli.cl_sem);		\
+		return -ENODEV;				\
+	}						\
+} while (0)
+#define LPROCFS_CLIMP_EXIT(obd)				\
+	up_read(&(obd)->u.cli.cl_sem)
+
+
+/* Write the name##_seq_show function, then call LPROC_SEQ_FOPS_RO for
+  read-only proc entries; for a read-write proc entry, also define the
+  name##_seq_write function and call LPROC_SEQ_FOPS instead. Finally,
+  call lprocfs_obd_seq_create(obd, filename, 0444, &name##_fops, data); */
+#define __LPROC_SEQ_FOPS(name, custom_seq_write)			\
+static int name##_single_open(cfs_inode_t *inode, struct file *file)	\
+{									\
+	return single_open(file, name##_seq_show, PDE_DATA(inode));	\
+}									\
+struct file_operations name##_fops = {				     \
+	.owner   = THIS_MODULE,					    \
+	.open    = name##_single_open,				     \
+	.read    = seq_read,					       \
+	.write   = custom_seq_write,				       \
+	.llseek  = seq_lseek,					      \
+	.release = lprocfs_single_release,				 \
+}
+
+#define LPROC_SEQ_FOPS_RO(name)	 __LPROC_SEQ_FOPS(name, NULL)
+#define LPROC_SEQ_FOPS(name)	    __LPROC_SEQ_FOPS(name, name##_seq_write)
+
+#define LPROC_SEQ_FOPS_RO_TYPE(name, type)				\
+	static int name##_##type##_seq_show(struct seq_file *m, void *v)\
+	{								\
+		return lprocfs_rd_##type(m, m->private);		\
+	}								\
+	LPROC_SEQ_FOPS_RO(name##_##type)
+
+#define LPROC_SEQ_FOPS_RW_TYPE(name, type)				\
+	static int name##_##type##_seq_show(struct seq_file *m, void *v)\
+	{								\
+		return lprocfs_rd_##type(m, m->private);		\
+	}								\
+	static ssize_t name##_##type##_seq_write(struct file *file,	\
+			const char *buffer, size_t count, loff_t *off)	\
+	{								\
+		struct seq_file *seq = file->private_data;		\
+		return lprocfs_wr_##type(file, buffer,			\
+					 count, seq->private);		\
+	}								\
+	LPROC_SEQ_FOPS(name##_##type);
+
+#define LPROC_SEQ_FOPS_WR_ONLY(name, type)				\
+	static ssize_t name##_##type##_write(struct file *file,		\
+			const char *buffer, size_t count, loff_t *off)	\
+	{								\
+		return lprocfs_wr_##type(file, buffer, count, off);	\
+	}								\
+	static int name##_##type##_open(cfs_inode_t *inode, struct file *file) \
+	{								\
+		return single_open(file, NULL, PDE_DATA(inode));	\
+	}								\
+	struct file_operations name##_##type##_fops = {			\
+		.open	= name##_##type##_open,				\
+		.write	= name##_##type##_write,			\
+		.release = lprocfs_single_release,			\
+	};
+
+/* lprocfs_jobstats.c */
+int lprocfs_job_stats_log(struct obd_device *obd, char *jobid,
+			  int event, long amount);
+void lprocfs_job_stats_fini(struct obd_device *obd);
+int lprocfs_job_stats_init(struct obd_device *obd, int cntr_num,
+			   cntr_init_callback fn);
+int lprocfs_rd_job_interval(struct seq_file *m, void *data);
+int lprocfs_wr_job_interval(struct file *file, const char *buffer,
+			    unsigned long count, void *data);
+
+/* lproc_ptlrpc.c */
+struct ptlrpc_request;
+extern void target_print_req(void *seq_file, struct ptlrpc_request *req);
+
+/* lproc_status.c */
+int lprocfs_obd_rd_max_pages_per_rpc(struct seq_file *m, void *data);
+int lprocfs_obd_wr_max_pages_per_rpc(struct file *file, const char *buffer,
+				     size_t count, loff_t *off);
+
+/* all quota proc functions */
+extern int lprocfs_quota_rd_bunit(char *page, char **start,
+				  loff_t off, int count,
+				  int *eof, void *data);
+extern int lprocfs_quota_wr_bunit(struct file *file, const char *buffer,
+				  unsigned long count, void *data);
+extern int lprocfs_quota_rd_btune(char *page, char **start,
+				  loff_t off, int count,
+				  int *eof, void *data);
+extern int lprocfs_quota_wr_btune(struct file *file, const char *buffer,
+				  unsigned long count, void *data);
+extern int lprocfs_quota_rd_iunit(char *page, char **start,
+				  loff_t off, int count,
+				  int *eof, void *data);
+extern int lprocfs_quota_wr_iunit(struct file *file, const char *buffer,
+				  unsigned long count, void *data);
+extern int lprocfs_quota_rd_itune(char *page, char **start,
+				  loff_t off, int count,
+				  int *eof, void *data);
+extern int lprocfs_quota_wr_itune(struct file *file, const char *buffer,
+				  unsigned long count, void *data);
+extern int lprocfs_quota_rd_type(char *page, char **start, loff_t off, int count,
+				 int *eof, void *data);
+extern int lprocfs_quota_wr_type(struct file *file, const char *buffer,
+				 unsigned long count, void *data);
+extern int lprocfs_quota_rd_switch_seconds(char *page, char **start, loff_t off,
+					   int count, int *eof, void *data);
+extern int lprocfs_quota_wr_switch_seconds(struct file *file,
+					   const char *buffer,
+					   unsigned long count, void *data);
+extern int lprocfs_quota_rd_sync_blk(char *page, char **start, loff_t off,
+				     int count, int *eof, void *data);
+extern int lprocfs_quota_wr_sync_blk(struct file *file, const char *buffer,
+				     unsigned long count, void *data);
+extern int lprocfs_quota_rd_switch_qs(char *page, char **start, loff_t off,
+				      int count, int *eof, void *data);
+extern int lprocfs_quota_wr_switch_qs(struct file *file,
+				      const char *buffer,
+				      unsigned long count, void *data);
+extern int lprocfs_quota_rd_boundary_factor(char *page, char **start, loff_t off,
+					    int count, int *eof, void *data);
+extern int lprocfs_quota_wr_boundary_factor(struct file *file,
+					    const char *buffer,
+					    unsigned long count, void *data);
+extern int lprocfs_quota_rd_least_bunit(char *page, char **start, loff_t off,
+					int count, int *eof, void *data);
+extern int lprocfs_quota_wr_least_bunit(struct file *file,
+					const char *buffer,
+					unsigned long count, void *data);
+extern int lprocfs_quota_rd_least_iunit(char *page, char **start, loff_t off,
+					int count, int *eof, void *data);
+extern int lprocfs_quota_wr_least_iunit(struct file *file,
+					const char *buffer,
+					unsigned long count, void *data);
+extern int lprocfs_quota_rd_qs_factor(char *page, char **start, loff_t off,
+				      int count, int *eof, void *data);
+extern int lprocfs_quota_wr_qs_factor(struct file *file,
+				      const char *buffer,
+				      unsigned long count, void *data);
+
+
+
+#else
+/* LPROCFS is not defined */
+
+#define proc_lustre_root NULL
+
+static inline void lprocfs_counter_add(struct lprocfs_stats *stats,
+				       int index, long amount)
+{ return; }
+static inline void lprocfs_counter_incr(struct lprocfs_stats *stats,
+					int index)
+{ return; }
+static inline void lprocfs_counter_sub(struct lprocfs_stats *stats,
+				       int index, long amount)
+{ return; }
+static inline void lprocfs_counter_decr(struct lprocfs_stats *stats,
+					int index)
+{ return; }
+static inline void lprocfs_counter_init(struct lprocfs_stats *stats,
+					int index, unsigned conf,
+					const char *name, const char *units)
+{ return; }
+
+static inline __u64 lc_read_helper(struct lprocfs_counter *lc,
+				   enum lprocfs_fields_flags field)
+{ return 0; }
+
+/* NB: we return !NULL to satisfy error checker */
+static inline struct lprocfs_stats *
+lprocfs_alloc_stats(unsigned int num, enum lprocfs_stats_flags flags)
+{ return (struct lprocfs_stats *)1; }
+static inline void lprocfs_clear_stats(struct lprocfs_stats *stats)
+{ return; }
+static inline void lprocfs_free_stats(struct lprocfs_stats **stats)
+{ return; }
+static inline int lprocfs_register_stats(proc_dir_entry_t *root,
+					 const char *name,
+					 struct lprocfs_stats *stats)
+{ return 0; }
+static inline void lprocfs_init_ops_stats(int num_private_stats,
+					  struct lprocfs_stats *stats)
+{ return; }
+static inline void lprocfs_init_mps_stats(int num_private_stats,
+					  struct lprocfs_stats *stats)
+{ return; }
+static inline void lprocfs_init_ldlm_stats(struct lprocfs_stats *ldlm_stats)
+{ return; }
+static inline int lprocfs_alloc_obd_stats(struct obd_device *obddev,
+					  unsigned int num_private_stats)
+{ return 0; }
+static inline int lprocfs_alloc_md_stats(struct obd_device *obddev,
+					 unsigned int num_private_stats)
+{ return 0; }
+static inline void lprocfs_free_obd_stats(struct obd_device *obddev)
+{ return; }
+static inline void lprocfs_free_md_stats(struct obd_device *obddev)
+{ return; }
+
+struct obd_export;
+static inline int lprocfs_add_clear_entry(struct obd_export *exp)
+{ return 0; }
+static inline int lprocfs_exp_setup(struct obd_export *exp,lnet_nid_t *peer_nid,
+				    int *newnid)
+{ return 0; }
+static inline int lprocfs_exp_cleanup(struct obd_export *exp)
+{ return 0; }
+/* Stub used when LPROCFS is not defined: does nothing and returns NULL. */
+static inline proc_dir_entry_t *
+lprocfs_add_simple(struct proc_dir_entry *root, char *name,
+		   void *data, struct file_operations *fops)
+{ return NULL; }
+static inline struct proc_dir_entry *
+lprocfs_add_symlink(const char *name, struct proc_dir_entry *parent,
+		    const char *format, ...)
+{return NULL; }
+static inline void lprocfs_free_per_client_stats(struct obd_device *obd)
+{ return; }
+static inline
+int lprocfs_nid_stats_clear_write(struct file *file, const char *buffer,
+				  unsigned long count, void *data)
+{return count;}
+static inline
+int lprocfs_nid_stats_clear_read(struct seq_file *m, void *data)
+{ return 0; }
+
+static inline proc_dir_entry_t *
+lprocfs_register(const char *name, proc_dir_entry_t *parent,
+		 struct lprocfs_vars *list, void *data)
+{ return NULL; }
+static inline int lprocfs_add_vars(proc_dir_entry_t *root,
+				   struct lprocfs_vars *var,
+				   void *data)
+{ return 0; }
+static inline void lprocfs_remove(proc_dir_entry_t **root)
+{ return; }
+static inline void lprocfs_remove_proc_entry(const char *name,
+					     struct proc_dir_entry *parent)
+{ return; }
+static inline int lprocfs_obd_setup(struct obd_device *dev,
+				    struct lprocfs_vars *list)
+{ return 0; }
+static inline int lprocfs_obd_cleanup(struct obd_device *dev)
+{ return 0; }
+static inline int lprocfs_rd_u64(struct seq_file *m, void *data)
+{ return 0; }
+static inline int lprocfs_rd_uuid(struct seq_file *m, void *data)
+{ return 0; }
+static inline int lprocfs_rd_name(struct seq_file *m, void *data)
+{ return 0; }
+static inline int lprocfs_rd_server_uuid(struct seq_file *m, void *data)
+{ return 0; }
+static inline int lprocfs_rd_conn_uuid(struct seq_file *m, void *data)
+{ return 0; }
+static inline int lprocfs_rd_import(struct seq_file *m, void *data)
+{ return 0; }
+static inline int lprocfs_rd_pinger_recov(struct seq_file *m, void *n)
+{ return 0; }
+static inline int lprocfs_rd_state(struct seq_file *m, void *data)
+{ return 0; }
+static inline int lprocfs_rd_connect_flags(struct seq_file *m, void *data)
+{ return 0; }
+static inline int lprocfs_rd_num_exports(struct seq_file *m, void *data)
+{ return 0; }
+/*
+ * Stub used when LPROCFS is not defined.  It must be "static inline" like
+ * the other stubs in this header: an "extern inline" definition in a header
+ * produces one external definition per including translation unit under
+ * C99 inline rules, and no out-of-line definition at all under gnu89 rules.
+ */
+static inline int lprocfs_rd_numrefs(struct seq_file *m, void *data)
+{ return 0; }
+struct adaptive_timeout;
+static inline int lprocfs_at_hist_helper(struct seq_file *m,
+					 struct adaptive_timeout *at)
+{ return 0; }
+static inline int lprocfs_rd_timeouts(struct seq_file *m, void *data)
+{ return 0; }
+static inline int lprocfs_wr_timeouts(struct file *file,
+				      const char *buffer,
+				      unsigned long count, void *data)
+{ return 0; }
+static inline int lprocfs_wr_evict_client(struct file *file, const char *buffer,
+				    size_t count, loff_t *off)
+{ return 0; }
+static inline int lprocfs_wr_ping(struct file *file, const char *buffer,
+			   size_t count, loff_t *off)
+{ return 0; }
+static inline int lprocfs_wr_import(struct file *file, const char *buffer,
+			      size_t count, loff_t *off)
+{ return 0; }
+static inline int lprocfs_wr_pinger_recov(struct file *file, const char *buffer,
+					size_t count, loff_t *off)
+{ return 0; }
+
+/* Statfs helpers */
+static inline
+int lprocfs_rd_blksize(struct seq_file *m, void *data)
+{ return 0; }
+static inline
+int lprocfs_rd_kbytestotal(struct seq_file *m, void *data)
+{ return 0; }
+static inline
+int lprocfs_rd_kbytesfree(struct seq_file *m, void *data)
+{ return 0; }
+static inline
+int lprocfs_rd_kbytesavail(struct seq_file *m, void *data)
+{ return 0; }
+static inline
+int lprocfs_rd_filestotal(struct seq_file *m, void *data)
+{ return 0; }
+static inline
+int lprocfs_rd_filesfree(struct seq_file *m, void *data)
+{ return 0; }
+static inline
+void lprocfs_oh_tally(struct obd_histogram *oh, unsigned int value)
+{ return; }
+static inline
+void lprocfs_oh_tally_log2(struct obd_histogram *oh, unsigned int value)
+{ return; }
+static inline
+void lprocfs_oh_clear(struct obd_histogram *oh)
+{ return; }
+static inline
+unsigned long lprocfs_oh_sum(struct obd_histogram *oh)
+{ return 0; }
+static inline
+void lprocfs_stats_collect(struct lprocfs_stats *stats, int idx,
+			   struct lprocfs_counter *cnt)
+{ return; }
+static inline
+__u64 lprocfs_stats_collector(struct lprocfs_stats *stats, int idx,
+			       enum lprocfs_fields_flags field)
+{ return (__u64)0; }
+
+#define LPROC_SEQ_FOPS_RO(name)
+#define LPROC_SEQ_FOPS(name)
+#define LPROC_SEQ_FOPS_RO_TYPE(name, type)
+#define LPROC_SEQ_FOPS_RW_TYPE(name, type)
+#define LPROC_SEQ_FOPS_WR_ONLY(name, type)
+
+/* lprocfs_jobstats.c */
+static inline
+int lprocfs_job_stats_log(struct obd_device *obd, char *jobid, int event,
+			  long amount)
+{ return 0; }
+static inline
+void lprocfs_job_stats_fini(struct obd_device *obd)
+{ return; }
+static inline
+int lprocfs_job_stats_init(struct obd_device *obd, int cntr_num,
+			   cntr_init_callback fn)
+{ return 0; }
+
+
+/* lproc_ptlrpc.c */
+#define target_print_req NULL
+
+#endif /* LPROCFS */
+
+#endif /* LPROCFS_SNMP_H */

diff --git a/drivers/staging/lustre/lustre/include/lu_object.h b/drivers/staging/lustre/lustre/include/lu_object.h
new file mode 100644
index 0000000..d40ad81
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lu_object.h

@@ -0,0 +1,1346 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LUSTRE_LU_OBJECT_H
+#define __LUSTRE_LU_OBJECT_H
+
+#include <stdarg.h>
+#include <linux/libcfs/libcfs.h>
+#include <lustre/lustre_idl.h>
+#include <lu_ref.h>
+
+struct seq_file;
+struct proc_dir_entry;
+struct lustre_cfg;
+struct lprocfs_stats;
+
+/** \defgroup lu lu
+ * lu_* data-types represent server-side entities shared by data and meta-data
+ * stacks.
+ *
+ * Design goals:
+ *
+ * -# support for layering.
+ *
+ *     Server side object is split into layers, one per device in the
+ *     corresponding device stack. Individual layer is represented by struct
+ *     lu_object. Compound layered object --- by struct lu_object_header. Most
+ *     interface functions take lu_object as an argument and operate on the
+ *     whole compound object. This decision was made due to the following
+ *     reasons:
+ *
+ *	- it's envisaged that lu_object will be used much more often than
+ *	lu_object_header;
+ *
+ *	- we want lower (non-top) layers to be able to initiate operations
+ *	on the whole object.
+ *
+ *     Generic code supports layering more complex than simple stacking, e.g.,
+ *     it is possible that at some layer object "spawns" multiple sub-objects
+ *     on the lower layer.
+ *
+ * -# fid-based identification.
+ *
+ *     Compound object is uniquely identified by its fid. Objects are indexed
+ *     by their fids (hash table is used for index).
+ *
+ * -# caching and life-cycle management.
+ *
+ *     Object's life-time is controlled by reference counting. When reference
+ *     count drops to 0, object is returned to cache. Cached objects still
+ *     retain their identity (i.e., fid), and can be recovered from cache.
+ *
+ *     Objects are kept in the global LRU list, and lu_site_purge() function
+ *     can be used to reclaim given number of unused objects from the tail of
+ *     the LRU.
+ *
+ * -# avoiding recursion.
+ *
+ *     Generic code tries to replace recursion through layers by iterations
+ *     where possible. Additionally, to reduce stack consumption, data are,
+ *     when practically possible, allocated through the lu_context_key
+ *     interface rather than on the stack.
+ * @{
+ */
+
+struct lu_site;
+struct lu_object;
+struct lu_device;
+struct lu_object_header;
+struct lu_context;
+struct lu_env;
+
+/**
+ * Operations common for data and meta-data devices.
+ */
+struct lu_device_operations {
+	/**
+	 * Allocate object for the given device (without lower-layer
+	 * parts). This is called by lu_object_operations::loo_object_init()
+	 * from the parent layer, and should setup at least lu_object::lo_dev
+	 * and lu_object::lo_ops fields of resulting lu_object.
+	 *
+	 * Object creation protocol.
+	 *
+	 * Due to design goal of avoiding recursion, object creation (see
+	 * lu_object_alloc()) is somewhat involved:
+	 *
+	 *  - first, lu_device_operations::ldo_object_alloc() method of the
+	 *  top-level device in the stack is called. It should allocate top
+	 *  level object (including lu_object_header), but without any
+	 *  lower-layer sub-object(s).
+	 *
+	 *  - then lu_object_alloc() sets fid in the header of newly created
+	 *  object.
+	 *
+	 *  - then lu_object_operations::loo_object_init() is called. It has
+	 *  to allocate lower-layer object(s). To do this,
+	 *  lu_object_operations::loo_object_init() calls ldo_object_alloc()
+	 *  of the lower-layer device(s).
+	 *
+	 *  - for all new objects allocated by
+	 *  lu_object_operations::loo_object_init() (and inserted into object
+	 *  stack), lu_object_operations::loo_object_init() is called again
+	 *  repeatedly, until no new objects are created.
+	 *
+	 * \post ergo(!IS_ERR(result), result->lo_dev == d &&
+	 *			     result->lo_ops != NULL);
+	 */
+	struct lu_object *(*ldo_object_alloc)(const struct lu_env *env,
+					      const struct lu_object_header *h,
+					      struct lu_device *d);
+	/**
+	 * process config specific for device.
+	 */
+	int (*ldo_process_config)(const struct lu_env *env,
+				  struct lu_device *, struct lustre_cfg *);
+	int (*ldo_recovery_complete)(const struct lu_env *,
+				     struct lu_device *);
+
+	/**
+	 * Initialize local objects for device. This method is called after the
+	 * layer has been initialized (after LCFG_SETUP stage) and before it
+	 * starts serving user requests.
+	 */
+
+	int (*ldo_prepare)(const struct lu_env *,
+			   struct lu_device *parent,
+			   struct lu_device *dev);
+
+};
+
+/**
+ * For lu_object_conf flags
+ */
+typedef enum {
+	/* This is a new object to be allocated, or the file
+	 * corresponding to the object does not exist. */
+	LOC_F_NEW	= 0x00000001,
+} loc_flags_t;
+
+/**
+ * Object configuration, describing particulars of the object being created.
+ * On the server this is not used, as server objects are fully identified by
+ * fid. On the client, the configuration contains struct lustre_md.
+ */
+struct lu_object_conf {
+	/**
+	 * Some hints for obj find and alloc.
+	 */
+	loc_flags_t     loc_flags;
+};
+
+/**
+ * Type of "printer" function used by lu_object_operations::loo_object_print()
+ * method.
+ *
+ * Printer function is needed to provide some flexibility in (semi-)debugging
+ * output: possible implementations: printk, CDEBUG, sysfs/seq_file
+ */
+typedef int (*lu_printer_t)(const struct lu_env *env,
+			    void *cookie, const char *format, ...)
+	__attribute__ ((format (printf, 3, 4)));
+
+/**
+ * Operations specific for particular lu_object.
+ */
+struct lu_object_operations {
+
+	/**
+	 * Allocate lower-layer parts of the object by calling
+	 * lu_device_operations::ldo_object_alloc() of the corresponding
+	 * underlying device.
+	 *
+	 * This method is called once for each object inserted into object
+	 * stack. It is the responsibility of this method to insert the
+	 * lower-layer object(s) it creates into appropriate places of the
+	 * object stack.
+	 */
+	int (*loo_object_init)(const struct lu_env *env,
+			       struct lu_object *o,
+			       const struct lu_object_conf *conf);
+	/**
+	 * Called (in top-to-bottom order) during object allocation after all
+	 * layers were allocated and initialized. Can be used to perform
+	 * initialization depending on lower layers.
+	 */
+	int (*loo_object_start)(const struct lu_env *env,
+				struct lu_object *o);
+	/**
+	 * Called before lu_object_operations::loo_object_free() to signal
+	 * that object is being destroyed. Dual to
+	 * lu_object_operations::loo_object_init().
+	 */
+	void (*loo_object_delete)(const struct lu_env *env,
+				  struct lu_object *o);
+	/**
+	 * Dual to lu_device_operations::ldo_object_alloc(). Called when
+	 * object is removed from memory.
+	 */
+	void (*loo_object_free)(const struct lu_env *env,
+				struct lu_object *o);
+	/**
+	 * Called when last active reference to the object is released (and
+	 * object returns to the cache). This method is optional.
+	 */
+	void (*loo_object_release)(const struct lu_env *env,
+				   struct lu_object *o);
+	/**
+	 * Optional debugging helper. Print given object.
+	 */
+	int (*loo_object_print)(const struct lu_env *env, void *cookie,
+				lu_printer_t p, const struct lu_object *o);
+	/**
+	 * Optional debugging method. Returns true iff object is internally
+	 * consistent.
+	 */
+	int (*loo_object_invariant)(const struct lu_object *o);
+};
+
+/**
+ * Type of lu_device.
+ */
+struct lu_device_type;
+
+/**
+ * Device: a layer in the server side abstraction stacking.
+ */
+struct lu_device {
+	/**
+	 * reference count. This is incremented, in particular, on each object
+	 * created at this layer.
+	 *
+	 * \todo XXX which means that atomic_t is probably too small.
+	 */
+	atomic_t		       ld_ref;
+	/**
+	 * Pointer to device type. Never modified once set.
+	 */
+	struct lu_device_type       *ld_type;
+	/**
+	 * Operation vector for this device.
+	 */
+	const struct lu_device_operations *ld_ops;
+	/**
+	 * Stack this device belongs to.
+	 */
+	struct lu_site		    *ld_site;
+	struct proc_dir_entry	     *ld_proc_entry;
+
+	/** \todo XXX: temporary back pointer into obd. */
+	struct obd_device		 *ld_obd;
+	/**
+	 * A list of references to this object, for debugging.
+	 */
+	struct lu_ref		      ld_reference;
+	/**
+	 * Link the device to the site.
+	 **/
+	struct list_head			 ld_linkage;
+};
+
+struct lu_device_type_operations;
+
+/**
+ * Tag bits for device type. They are used to distinguish certain groups of
+ * device types.
+ */
+enum lu_device_tag {
+	/** this is meta-data device */
+	LU_DEVICE_MD = (1 << 0),
+	/** this is data device */
+	LU_DEVICE_DT = (1 << 1),
+	/** data device in the client stack */
+	LU_DEVICE_CL = (1 << 2)
+};
+
+/**
+ * Type of device.
+ */
+struct lu_device_type {
+	/**
+	 * Tag bits. Taken from enum lu_device_tag. Never modified once set.
+	 */
+	__u32				   ldt_tags;
+	/**
+	 * Name of this class. Unique system-wide. Never modified once set.
+	 */
+	char				   *ldt_name;
+	/**
+	 * Operations for this type.
+	 */
+	const struct lu_device_type_operations *ldt_ops;
+	/**
+	 * \todo XXX: temporary pointer to associated obd_type.
+	 */
+	struct obd_type			*ldt_obd_type;
+	/**
+	 * \todo XXX: temporary: context tags used by obd_*() calls.
+	 */
+	__u32				   ldt_ctx_tags;
+	/**
+	 * Number of existing device type instances.
+	 */
+	unsigned				ldt_device_nr;
+	/**
+	 * Linkage into a global list of all device types.
+	 *
+	 * \see lu_device_types.
+	 */
+	struct list_head			      ldt_linkage;
+};
+
+/**
+ * Operations on a device type.
+ */
+struct lu_device_type_operations {
+	/**
+	 * Allocate new device.
+	 */
+	struct lu_device *(*ldto_device_alloc)(const struct lu_env *env,
+					       struct lu_device_type *t,
+					       struct lustre_cfg *lcfg);
+	/**
+	 * Free device. Dual to
+	 * lu_device_type_operations::ldto_device_alloc(). Returns pointer to
+	 * the next device in the stack.
+	 */
+	struct lu_device *(*ldto_device_free)(const struct lu_env *,
+					      struct lu_device *);
+
+	/**
+	 * Initialize the devices after allocation
+	 */
+	int  (*ldto_device_init)(const struct lu_env *env,
+				 struct lu_device *, const char *,
+				 struct lu_device *);
+	/**
+	 * Finalize device. Dual to
+	 * lu_device_type_operations::ldto_device_init(). Returns pointer to
+	 * the next device in the stack.
+	 */
+	struct lu_device *(*ldto_device_fini)(const struct lu_env *env,
+					      struct lu_device *);
+	/**
+	 * Initialize device type. This is called on module load.
+	 */
+	int  (*ldto_init)(struct lu_device_type *t);
+	/**
+	 * Finalize device type. Dual to
+	 * lu_device_type_operations::ldto_init(). Called on module unload.
+	 */
+	void (*ldto_fini)(struct lu_device_type *t);
+	/**
+	 * Called when the first device is created.
+	 */
+	void (*ldto_start)(struct lu_device_type *t);
+	/**
+	 * Called when number of devices drops to 0.
+	 */
+	void (*ldto_stop)(struct lu_device_type *t);
+};
+
+/**
+ * Check the LU_DEVICE_MD tag of a device's type.
+ *
+ * The result is ergo(d != NULL, <LU_DEVICE_MD bit set in
+ * d->ld_type->ldt_tags>); d->ld_type itself is not checked for NULL.
+ * NOTE(review): ergo() is defined outside this header; if it is logical
+ * implication, a NULL \a d yields true -- confirm against its definition.
+ */
+static inline int lu_device_is_md(const struct lu_device *d)
+{
+	return ergo(d != NULL, d->ld_type->ldt_tags & LU_DEVICE_MD);
+}
+
+/**
+ * Flags for the object layers.
+ */
+enum lu_object_flags {
+	/**
+	 * This flag is set if lu_object_operations::loo_object_init() has
+	 * been called for this layer. Used by lu_object_alloc().
+	 */
+	LU_OBJECT_ALLOCATED = (1 << 0)
+};
+
+/**
+ * Common object attributes.
+ */
+struct lu_attr {
+	/** size in bytes */
+	__u64	  la_size;
+	/** modification time in seconds since Epoch */
+	obd_time       la_mtime;
+	/** access time in seconds since Epoch */
+	obd_time       la_atime;
+	/** change time in seconds since Epoch */
+	obd_time       la_ctime;
+	/** 512-byte blocks allocated to object */
+	__u64	  la_blocks;
+	/** permission bits and file type */
+	__u32	  la_mode;
+	/** owner id */
+	__u32	  la_uid;
+	/** group id */
+	__u32	  la_gid;
+	/** object flags */
+	__u32	  la_flags;
+	/** number of persistent references to this object */
+	__u32	  la_nlink;
+	/** blk bits of the object*/
+	__u32	  la_blkbits;
+	/** blk size of the object*/
+	__u32	  la_blksize;
+	/** real device */
+	__u32	  la_rdev;
+	/**
+	 * valid bits
+	 *
+	 * \see enum la_valid
+	 */
+	__u64	  la_valid;
+};
+
+/** Bit-mask of valid attributes */
+enum la_valid {
+	LA_ATIME = 1 << 0,
+	LA_MTIME = 1 << 1,
+	LA_CTIME = 1 << 2,
+	LA_SIZE  = 1 << 3,
+	LA_MODE  = 1 << 4,
+	LA_UID   = 1 << 5,
+	LA_GID   = 1 << 6,
+	LA_BLOCKS = 1 << 7,
+	LA_TYPE   = 1 << 8,
+	LA_FLAGS  = 1 << 9,
+	LA_NLINK  = 1 << 10,
+	LA_RDEV   = 1 << 11,
+	LA_BLKSIZE = 1 << 12,
+	LA_KILL_SUID = 1 << 13,
+	LA_KILL_SGID = 1 << 14,
+};
+
+/**
+ * Layer in the layered object.
+ */
+struct lu_object {
+	/**
+	 * Header for this object.
+	 */
+	struct lu_object_header	   *lo_header;
+	/**
+	 * Device for this layer.
+	 */
+	struct lu_device		  *lo_dev;
+	/**
+	 * Operations for this object.
+	 */
+	const struct lu_object_operations *lo_ops;
+	/**
+	 * Linkage into list of all layers.
+	 */
+	struct list_head			 lo_linkage;
+	/**
+	 * Depth. Top level layer depth is 0.
+	 */
+	int				lo_depth;
+	/**
+	 * Flags from enum lu_object_flags.
+	 */
+	__u32					lo_flags;
+	/**
+	 * Link to the device, for debugging.
+	 */
+	struct lu_ref_link		*lo_dev_ref;
+};
+
+enum lu_object_header_flags {
+	/**
+	 * Don't keep this object in cache. Object will be destroyed as soon
+	 * as last reference to it is released. This flag cannot be cleared
+	 * once set.
+	 */
+	LU_OBJECT_HEARD_BANSHEE = 0,
+	/**
+	 * Marks that this object has already been taken out of cache.
+	 */
+	LU_OBJECT_UNHASHED = 1
+};
+
+enum lu_object_header_attr {
+	LOHA_EXISTS   = 1 << 0,
+	LOHA_REMOTE   = 1 << 1,
+	/**
+	 * UNIX file type is stored in S_IFMT bits (constants below are octal).
+	 */
+	LOHA_FT_START = 001 << 12, /**< S_IFIFO */
+	LOHA_FT_END   = 017 << 12, /**< S_IFMT */
+};
+
+/**
+ * "Compound" object, consisting of multiple layers.
+ *
+ * Compound object with given fid is unique with given lu_site.
+ *
+ * Note, that object does *not* necessarily correspond to the real object in
+ * the persistent storage: object is an anchor for locking and method calling,
+ * so it is created for things like not-yet-existing child created by mkdir or
+ * create calls. lu_object_operations::loo_exists() can be used to check
+ * whether object is backed by persistent storage entity.
+ */
+struct lu_object_header {
+	/**
+	 * Flags from enum lu_object_header_flags; set and checked atomically.
+	 */
+	unsigned long	  loh_flags;
+	/**
+	 * Object reference count. Protected by lu_site::ls_guard.
+	 * NOTE(review): struct lu_site has no ls_guard member; stale name?
+	 */
+	atomic_t	   loh_ref;
+	/**
+	 * Fid, uniquely identifying this object.
+	 */
+	struct lu_fid	  loh_fid;
+	/**
+	 * Common object attributes, cached for efficiency. From enum
+	 * lu_object_header_attr.
+	 */
+	__u32		  loh_attr;
+	/**
+	 * Linkage into per-site hash table. Protected by lu_site::ls_guard.
+	 */
+	struct hlist_node       loh_hash;
+	/**
+	 * Linkage into per-site LRU list. Protected by lu_site::ls_guard.
+	 */
+	struct list_head	     loh_lru;
+	/**
+	 * Linkage into list of layers. Never modified once set (except lately
+	 * during object destruction). No locking is necessary.
+	 */
+	struct list_head	     loh_layers;
+	/**
+	 * A list of references to this object, for debugging.
+	 */
+	struct lu_ref	  loh_reference;
+};
+
+struct fld;
+
+struct lu_site_bkt_data {
+	/** number of busy objects in this bucket */
+	long		      lsb_busy;
+	/**
+	 * LRU list, updated on each access to object. Protected by
+	 * bucket lock of lu_site::ls_obj_hash.
+	 *
+	 * "Cold" end of LRU is lu_site::ls_lru.next. Accessed objects are
+	 * moved to the lu_site::ls_lru.prev (this is due to the non-existence
+	 * of list_for_each_entry_safe_reverse()).
+	 *
+	 * NOTE(review): lu_site has no ls_lru; presumably this means lsb_lru.
+	 */
+	struct list_head		lsb_lru;
+	/**
+	 * Wait-queue signaled when an object in this site is ultimately
+	 * destroyed (lu_object_free()). It is used by lu_object_find() to
+	 * wait before re-trying when object in the process of destruction is
+	 * found in the hash table.
+	 *
+	 * \see htable_lookup().
+	 */
+	wait_queue_head_t	       lsb_marche_funebre;
+};
+
+enum {
+	LU_SS_CREATED	 = 0,
+	LU_SS_CACHE_HIT,
+	LU_SS_CACHE_MISS,
+	LU_SS_CACHE_RACE,
+	LU_SS_CACHE_DEATH_RACE,
+	LU_SS_LRU_PURGED,
+	LU_SS_LAST_STAT /* equals the number of counters listed above */
+};
+
+/**
+ * lu_site is a "compartment" within which objects are unique, and LRU
+ * discipline is maintained.
+ *
+ * lu_site exists so that multiple layered stacks can co-exist in the same
+ * address space.
+ *
+ * lu_site has the same relation to lu_device as lu_object_header to
+ * lu_object.
+ */
+struct lu_site {
+	/**
+	 * objects hash table
+	 */
+	cfs_hash_t	       *ls_obj_hash;
+	/**
+	 * index of bucket on hash table while purging
+	 */
+	int		       ls_purge_start;
+	/**
+	 * Top-level device for this stack.
+	 */
+	struct lu_device	 *ls_top_dev;
+	/**
+	 * Bottom-level device for this stack
+	 */
+	struct lu_device	*ls_bottom_dev;
+	/**
+	 * Linkage into global list of sites.
+	 */
+	struct list_head		ls_linkage;
+	/**
+	 * List of lu devices for this site, protected
+	 * by ls_ld_lock.
+	 */
+	struct list_head		ls_ld_linkage;
+	spinlock_t		ls_ld_lock;
+
+	/**
+	 * lu_site stats
+	 */
+	struct lprocfs_stats	*ls_stats;
+	/**
+	 * XXX: a hack! fld has to find md_site via site, remove when possible
+	 */
+	struct seq_server_site	*ld_seq_site;
+};
+
+static inline struct lu_site_bkt_data *
+lu_site_bkt_from_fid(struct lu_site *site, struct lu_fid *fid)
+{
+	cfs_hash_bd_t bucket; /* presumably filled in by cfs_hash_bd_get() */
+
+	cfs_hash_bd_get(site->ls_obj_hash, fid, &bucket);
+	return cfs_hash_bd_extra_get(site->ls_obj_hash, &bucket);
+}
+
+/** \name ctors
+ * Constructors/destructors.
+ * @{
+ */
+
+int  lu_site_init	 (struct lu_site *s, struct lu_device *d);
+void lu_site_fini	 (struct lu_site *s);
+int  lu_site_init_finish  (struct lu_site *s);
+void lu_stack_fini	(const struct lu_env *env, struct lu_device *top);
+void lu_device_get	(struct lu_device *d);
+void lu_device_put	(struct lu_device *d);
+int  lu_device_init       (struct lu_device *d, struct lu_device_type *t);
+void lu_device_fini       (struct lu_device *d);
+int  lu_object_header_init(struct lu_object_header *h);
+void lu_object_header_fini(struct lu_object_header *h);
+int  lu_object_init       (struct lu_object *o,
+			   struct lu_object_header *h, struct lu_device *d);
+void lu_object_fini       (struct lu_object *o);
+void lu_object_add_top    (struct lu_object_header *h, struct lu_object *o);
+void lu_object_add	(struct lu_object *before, struct lu_object *o);
+
+void lu_dev_add_linkage(struct lu_site *s, struct lu_device *d);
+void lu_dev_del_linkage(struct lu_site *s, struct lu_device *d);
+
+/**
+ * Helpers to initialize and finalize device types.
+ */
+
+int  lu_device_type_init(struct lu_device_type *ldt);
+void lu_device_type_fini(struct lu_device_type *ldt);
+void lu_types_stop(void);
+
+/** @} ctors */
+
+/** \name caching
+ * Caching and reference counting.
+ * @{
+ */
+
+/**
+ * Acquire additional reference to the given object. The reference count
+ * is asserted to be positive already, so this cannot be used to obtain the
+ * initial reference; use lu_object_find() for that.
+ */
+static inline void lu_object_get(struct lu_object *o)
+{
+	LASSERT(atomic_read(&o->lo_header->loh_ref) > 0);
+	atomic_inc(&o->lo_header->loh_ref);
+}
+
+/**
+ * Return true if object will not be cached after last reference to it is
+ * released, i.e. if LU_OBJECT_HEARD_BANSHEE is set in \a h->loh_flags.
+ */
+static inline int lu_object_is_dying(const struct lu_object_header *h)
+{
+	return test_bit(LU_OBJECT_HEARD_BANSHEE, &h->loh_flags);
+}
+
+void lu_object_put(const struct lu_env *env, struct lu_object *o);
+void lu_object_put_nocache(const struct lu_env *env, struct lu_object *o);
+void lu_object_unhash(const struct lu_env *env, struct lu_object *o);
+
+int lu_site_purge(const struct lu_env *env, struct lu_site *s, int nr);
+
+void lu_site_print(const struct lu_env *env, struct lu_site *s, void *cookie,
+		   lu_printer_t printer);
+struct lu_object *lu_object_find(const struct lu_env *env,
+				 struct lu_device *dev, const struct lu_fid *f,
+				 const struct lu_object_conf *conf);
+struct lu_object *lu_object_find_at(const struct lu_env *env,
+				    struct lu_device *dev,
+				    const struct lu_fid *f,
+				    const struct lu_object_conf *conf);
+struct lu_object *lu_object_find_slice(const struct lu_env *env,
+				       struct lu_device *dev,
+				       const struct lu_fid *f,
+				       const struct lu_object_conf *conf);
+/** @} caching */
+
+/** \name helpers
+ * Helpers.
+ * @{
+ */
+
+/**
+ * First (topmost) sub-object of given compound object; asserts it has layers
+ */
+static inline struct lu_object *lu_object_top(struct lu_object_header *h)
+{
+	LASSERT(!list_empty(&h->loh_layers));
+	return container_of0(h->loh_layers.next, struct lu_object, lo_linkage);
+}
+
+/**
+ * Next sub-object in the layering
+ */
+static inline struct lu_object *lu_object_next(const struct lu_object *o)
+{
+	return container_of0(o->lo_linkage.next, struct lu_object, lo_linkage);
+}
+
+/**
+ * Pointer to the fid of this object.
+ */
+static inline const struct lu_fid *lu_object_fid(const struct lu_object *o)
+{
+	return &o->lo_header->loh_fid;
+}
+
+/**
+ * Return the device operations vector (ld_ops) of the device of \a o.
+ */
+static inline const struct lu_device_operations *
+lu_object_ops(const struct lu_object *o)
+{
+	return o->lo_dev->ld_ops;
+}
+
+/**
+ * Given a compound object, find its slice, corresponding to the device type
+ * \a dtype.
+ */
+struct lu_object *lu_object_locate(struct lu_object_header *h,
+				   const struct lu_device_type *dtype);
+
+/**
+ * Printer function emitting messages through libcfs_debug_msg().
+ */
+int lu_cdebug_printer(const struct lu_env *env,
+		      void *cookie, const char *format, ...);
+
+/**
+ * When cfs_cdebug_show() accepts \a mask: print object, then the message.
+ */
+#define LU_OBJECT_DEBUG(mask, env, object, format, ...)		   \
+do {								      \
+	LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL);		  \
+									  \
+	if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) {		     \
+		lu_object_print(env, &msgdata, lu_cdebug_printer, object);\
+		CDEBUG(mask, format , ## __VA_ARGS__);		    \
+	}								 \
+} while (0)
+
+/**
+ * Like LU_OBJECT_DEBUG(), but prints only (object)->lo_header and a newline.
+ */
+#define LU_OBJECT_HEADER(mask, env, object, format, ...)		\
+do {								    \
+	LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL);		\
+									\
+	if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) {		   \
+		lu_object_header_print(env, &msgdata, lu_cdebug_printer,\
+				       (object)->lo_header);	    \
+		lu_cdebug_printer(env, &msgdata, "\n");		 \
+		CDEBUG(mask, format , ## __VA_ARGS__);		  \
+	}							       \
+} while (0)
+
+void lu_object_print       (const struct lu_env *env, void *cookie,
+			    lu_printer_t printer, const struct lu_object *o);
+void lu_object_header_print(const struct lu_env *env, void *cookie,
+			    lu_printer_t printer,
+			    const struct lu_object_header *hdr);
+
+/**
+ * Check object consistency.
+ */
+int lu_object_invariant(const struct lu_object *o);
+
+
+/**
+ * Check whether object exists, no matter on local or remote storage.
+ * Note: LOHA_EXISTS will be set once someone has created the object,
+ * and it does not need to be committed to storage.
+ */
+#define lu_object_exists(o) ((o)->lo_header->loh_attr & LOHA_EXISTS)
+
+/**
+ * Check whether object is on the remote storage (LOHA_REMOTE is set).
+ */
+#define lu_object_remote(o) unlikely((o)->lo_header->loh_attr & LOHA_REMOTE)
+
+static inline int lu_object_assert_exists(const struct lu_object *o)
+{
+	return lu_object_exists(o);
+}
+
+static inline int lu_object_assert_not_exists(const struct lu_object *o)
+{
+	return !lu_object_exists(o);
+}
+
+/**
+ * Cached attributes (loh_attr) of \a o; asserts that the object exists.
+ */
+static inline __u32 lu_object_attr(const struct lu_object *o)
+{
+	LASSERT(lu_object_exists(o) != 0);
+	return o->lo_header->loh_attr;
+}
+
+static inline struct lu_ref_link *lu_object_ref_add(struct lu_object *o,
+						    const char *scope,
+						    const void *source)
+{
+	return lu_ref_add(&o->lo_header->loh_reference, scope, source);
+}
+
+static inline void lu_object_ref_del(struct lu_object *o,
+				     const char *scope, const void *source)
+{
+	lu_ref_del(&o->lo_header->loh_reference, scope, source);
+}
+
+static inline void lu_object_ref_del_at(struct lu_object *o,
+					struct lu_ref_link *link,
+					const char *scope, const void *source)
+{
+	lu_ref_del_at(&o->lo_header->loh_reference, link, scope, source);
+}
+
+/** input params, should be filled out by mdt */
+struct lu_rdpg {
+	/** hash */
+	__u64		   rp_hash;
+	/** count in bytes */
+	unsigned int	    rp_count;
+	/** number of pages */
+	unsigned int	    rp_npages;
+	/** requested attr */
+	__u32		   rp_attrs;
+	/** pointers to pages */
+	struct page	   **rp_pages;
+};
+
+enum lu_xattr_flags {
+	LU_XATTR_REPLACE = (1 << 0),
+	LU_XATTR_CREATE  = (1 << 1)
+};
+
+/** @} helpers */
+
+/** \name lu_context
+ * @{ */
+
+/** For lu_context health-checks */
+enum lu_context_state {
+	LCS_INITIALIZED = 1,
+	LCS_ENTERED,
+	LCS_LEFT,
+	LCS_FINALIZED
+};
+
+/**
+ * lu_context. Execution context for lu_object methods. Currently associated
+ * with thread.
+ *
+ * All lu_object methods, except device and device type methods (called during
+ * system initialization and shutdown) are executed "within" some
+ * lu_context. This means, that pointer to some "current" lu_context is passed
+ * as an argument to all methods.
+ *
+ * All service ptlrpc threads create lu_context as part of their
+ * initialization. It is possible to create "stand-alone" context for other
+ * execution environments (like system calls).
+ *
+ * lu_object methods mainly use lu_context through lu_context_key interface
+ * that allows each layer to associate arbitrary pieces of data with each
+ * context (see pthread_key_create(3) for similar interface).
+ *
+ * On a client, lu_context is bound to a thread, see cl_env_get().
+ *
+ * \see lu_context_key
+ */
+struct lu_context {
+	/**
+	 * lu_context is used on the client side too. Yet we don't want to
+	 * allocate values of server-side keys for the client contexts and
+	 * vice versa.
+	 *
+	 * To achieve this, a set of tags is introduced. Contexts and keys are
+	 * marked with tags. Key values are created only for context whose set
+	 * of tags has non-empty intersection with one for key. Tags are taken
+	 * from enum lu_context_tag.
+	 */
+	__u32		  lc_tags;
+	enum lu_context_state  lc_state;
+	/**
+	 * Pointer to the home service thread. NULL for other execution
+	 * contexts.
+	 */
+	struct ptlrpc_thread  *lc_thread;
+	/**
+	 * Pointer to an array with key values. Internal implementation
+	 * detail.
+	 */
+	void		 **lc_value;
+	/**
+	 * Linkage into a list of all remembered contexts. Only
+	 * `non-transient' contexts, i.e., ones created for service threads
+	 * are placed here.
+	 */
+	struct list_head	     lc_remember;
+	/**
+	 * Version counter used to skip calls to lu_context_refill() when no
+	 * keys were registered.
+	 */
+	unsigned	       lc_version;
+	/**
+	 * Debugging cookie.
+	 */
+	unsigned	       lc_cookie;
+};
+
+/**
+ * lu_context_key interface. Similar to pthread_key.
+ */
+
+enum lu_context_tag {
+	/**
+	 * Thread on md server
+	 */
+	LCT_MD_THREAD = 1 << 0,
+	/**
+	 * Thread on dt server
+	 */
+	LCT_DT_THREAD = 1 << 1,
+	/**
+	 * Context for transaction handle
+	 */
+	LCT_TX_HANDLE = 1 << 2,
+	/**
+	 * Thread on client
+	 */
+	LCT_CL_THREAD = 1 << 3,
+	/**
+	 * A per-request session on a server, and a per-system-call session on
+	 * a client.
+	 */
+	LCT_SESSION   = 1 << 4,
+	/**
+	 * A per-request data on OSP device
+	 */
+	LCT_OSP_THREAD = 1 << 5,
+	/**
+	 * MGS device thread
+	 */
+	LCT_MG_THREAD = 1 << 6,
+	/**
+	 * Context for local operations
+	 */
+	LCT_LOCAL = 1 << 7,
+	/**
+	 * Set when at least one of keys, having values in this context has
+	 * non-NULL lu_context_key::lct_exit() method. This is used to
+	 * optimize lu_context_exit() call.
+	 */
+	LCT_HAS_EXIT  = 1 << 28,
+	/**
+	 * Don't add references for modules creating key values in that context.
+	 * This is only for contexts used internally by lu_object framework.
+	 */
+	LCT_NOREF     = 1 << 29,
+	/**
+	 * Key is being prepared for retiring, don't create new values for it.
+	 */
+	LCT_QUIESCENT = 1 << 30,
+	/**
+	 * Context should be remembered.
+	 */
+	LCT_REMEMBER  = (int)(1U << 31), /* 1 << 31 would overflow int */
+	/**
+	 * Contexts usable in cache shrinker thread.
+	 */
+	LCT_SHRINKER  = LCT_MD_THREAD|LCT_DT_THREAD|LCT_CL_THREAD|LCT_NOREF
+};
+
+/**
+ * Key. Represents per-context value slot.
+ *
+ * Keys are usually registered when module owning the key is initialized, and
+ * de-registered when module is unloaded. Once key is registered, all new
+ * contexts with matching tags, will get key value. "Old" contexts, already
+ * initialized at the time of key registration, can be forced to get key value
+ * by calling lu_context_refill().
+ *
+ * Every key value is counted in lu_context_key::lct_used and acquires a
+ * reference on an owning module. This means, that all key values have to be
+ * destroyed before module can be unloaded. This is usually achieved by
+ * stopping threads started by the module, that created contexts in their
+ * entry functions. Situation is complicated by the threads shared by multiple
+ * modules, like ptlrpcd daemon on a client. To work around this problem,
+ * contexts, created in such threads, are `remembered' (see
+ * LCT_REMEMBER)---i.e., added into a global list. When module is preparing
+ * for unloading it does the following:
+ *
+ *     - marks its keys as `quiescent' (lu_context_tag::LCT_QUIESCENT)
+ *       preventing new key values from being allocated in the new contexts,
+ *       and
+ *
+ *     - scans a list of remembered contexts, destroying values of module
+ *       keys, thus releasing references to the module.
+ *
+ * This is done by lu_context_key_quiesce(). If module is re-activated
+ * before key has been de-registered, lu_context_key_revive() call clears
+ * `quiescent' marker.
+ *
+ * lu_context code doesn't provide any internal synchronization for these
+ * activities---it's assumed that startup (including threads start-up) and
+ * shutdown are serialized by some external means.
+ *
+ * \see lu_context
+ */
+struct lu_context_key {
+	/**
+	 * Set of tags for which values of this key are to be instantiated.
+	 */
+	__u32 lct_tags;
+	/**
+	 * Value constructor. This is called when new value is created for a
+	 * context. Returns pointer to new value or error pointer.
+	 */
+	void  *(*lct_init)(const struct lu_context *ctx,
+			   struct lu_context_key *key);
+	/**
+	 * Value destructor. Called when context with previously allocated
+	 * value of this slot is destroyed. \a data is a value that was returned
+	 * by a matching call to lu_context_key::lct_init().
+	 */
+	void   (*lct_fini)(const struct lu_context *ctx,
+			   struct lu_context_key *key, void *data);
+	/**
+	 * Optional method called on lu_context_exit() for all allocated
+	 * keys. Can be used by debugging code checking that locks are
+	 * released, etc.
+	 */
+	void   (*lct_exit)(const struct lu_context *ctx,
+			   struct lu_context_key *key, void *data);
+	/**
+	 * Internal implementation detail: index within lu_context::lc_value[]
+	 * reserved for this key.
+	 */
+	int      lct_index;
+	/**
+	 * Internal implementation detail: number of values created for this
+	 * key.
+	 */
+	atomic_t lct_used;
+	/**
+	 * Internal implementation detail: module for this key.
+	 */
+	module_t *lct_owner;
+	/**
+	 * References to this key. For debugging.
+	 */
+	struct lu_ref  lct_reference;
+};
+
+#define LU_KEY_INIT(mod, type)				    \
+	static void* mod##_key_init(const struct lu_context *ctx, \
+				    struct lu_context_key *key)   \
+	{							 \
+		type *value;				      \
+								  \
+		CLASSERT(PAGE_CACHE_SIZE >= sizeof (*value));       \
+								  \
+		OBD_ALLOC_PTR(value);			     \
+		if (value == NULL)				\
+			value = ERR_PTR(-ENOMEM);		 \
+								  \
+		return value;				     \
+	}							 \
+	struct __##mod##__dummy_init {;} /* semicolon catcher */
+
+#define LU_KEY_FINI(mod, type)					      \
+	static void mod##_key_fini(const struct lu_context *ctx,	    \
+				    struct lu_context_key *key, void* data) \
+	{								   \
+		type *info = data;					  \
+									    \
+		OBD_FREE_PTR(info);					 \
+	}								   \
+	struct __##mod##__dummy_fini {;} /* semicolon catcher */
+
+#define LU_KEY_INIT_FINI(mod, type)   \
+	LU_KEY_INIT(mod,type);	\
+	LU_KEY_FINI(mod,type)
+
+#define LU_CONTEXT_KEY_DEFINE(mod, tags)		\
+	struct lu_context_key mod##_thread_key = {      \
+		.lct_tags = tags,		       \
+		.lct_init = mod##_key_init,	     \
+		.lct_fini = mod##_key_fini	      \
+	}
+
+#define LU_CONTEXT_KEY_INIT(key)			\
+do {						    \
+	(key)->lct_owner = THIS_MODULE;		 \
+} while (0)
+
+int   lu_context_key_register(struct lu_context_key *key);
+void  lu_context_key_degister(struct lu_context_key *key);
+void *lu_context_key_get     (const struct lu_context *ctx,
+			       const struct lu_context_key *key);
+void  lu_context_key_quiesce (struct lu_context_key *key);
+void  lu_context_key_revive  (struct lu_context_key *key);
+
+
+/*
+ * LU_KEY_INIT_GENERIC() has to be a macro to correctly determine an owning
+ * module: it sets lct_owner = THIS_MODULE on each key of a NULL-ended list.
+ */
+
+#define LU_KEY_INIT_GENERIC(mod)					\
+	static void mod##_key_init_generic(struct lu_context_key *k, ...) \
+	{							       \
+		struct lu_context_key *key = k;			 \
+		va_list args;					   \
+									\
+		va_start(args, k);				      \
+		do {						    \
+			LU_CONTEXT_KEY_INIT(key);		       \
+			key = va_arg(args, struct lu_context_key *);    \
+		} while (key != NULL);				  \
+		va_end(args);					   \
+	}
+
+#define LU_TYPE_INIT(mod, ...)					  \
+	LU_KEY_INIT_GENERIC(mod)					\
+	static int mod##_type_init(struct lu_device_type *t)	    \
+	{							       \
+		mod##_key_init_generic(__VA_ARGS__, NULL);	      \
+		return lu_context_key_register_many(__VA_ARGS__, NULL); \
+	}							       \
+	struct __##mod##_dummy_type_init {;}
+
+#define LU_TYPE_FINI(mod, ...)					  \
+	static void mod##_type_fini(struct lu_device_type *t)	   \
+	{							       \
+		lu_context_key_degister_many(__VA_ARGS__, NULL);	\
+	}							       \
+	struct __##mod##_dummy_type_fini {;}
+
+#define LU_TYPE_START(mod, ...)				 \
+	static void mod##_type_start(struct lu_device_type *t)  \
+	{						       \
+		lu_context_key_revive_many(__VA_ARGS__, NULL);  \
+	}						       \
+	struct __##mod##_dummy_type_start {;}
+
+#define LU_TYPE_STOP(mod, ...)				  \
+	static void mod##_type_stop(struct lu_device_type *t)   \
+	{						       \
+		lu_context_key_quiesce_many(__VA_ARGS__, NULL); \
+	}						       \
+	struct __##mod##_dummy_type_stop {;}
+
+
+
+#define LU_TYPE_INIT_FINI(mod, ...)	     \
+	LU_TYPE_INIT(mod, __VA_ARGS__);	 \
+	LU_TYPE_FINI(mod, __VA_ARGS__);	 \
+	LU_TYPE_START(mod, __VA_ARGS__);	\
+	LU_TYPE_STOP(mod, __VA_ARGS__)
+
+int   lu_context_init  (struct lu_context *ctx, __u32 tags);
+void  lu_context_fini  (struct lu_context *ctx);
+void  lu_context_enter (struct lu_context *ctx);
+void  lu_context_exit  (struct lu_context *ctx);
+int   lu_context_refill(struct lu_context *ctx);
+
+/*
+ * Helper functions to operate on multiple keys. These are used by the default
+ * device type operations, defined by LU_TYPE_INIT_FINI().
+ */
+
+int  lu_context_key_register_many(struct lu_context_key *k, ...);
+void lu_context_key_degister_many(struct lu_context_key *k, ...);
+void lu_context_key_revive_many  (struct lu_context_key *k, ...);
+void lu_context_key_quiesce_many (struct lu_context_key *k, ...);
+
+/*
+ * update/clear ctx/ses tags.
+ */
+void lu_context_tags_update(__u32 tags);
+void lu_context_tags_clear(__u32 tags);
+void lu_session_tags_update(__u32 tags);
+void lu_session_tags_clear(__u32 tags);
+
+/**
+ * Environment.
+ */
+struct lu_env {
+	/**
+	 * "Local" context, used to store data instead of stack.
+	 */
+	struct lu_context  le_ctx;
+	/**
+	 * "Session" context for per-request data.
+	 */
+	struct lu_context *le_ses;
+};
+
+int  lu_env_init  (struct lu_env *env, __u32 tags);
+void lu_env_fini  (struct lu_env *env);
+int  lu_env_refill(struct lu_env *env);
+int  lu_env_refill_by_tags(struct lu_env *env, __u32 ctags, __u32 stags);
+
+/** @} lu_context */
+
+/**
+ * Output site statistical counters into a buffer. Suitable for
+ * ll_rd_*()-style functions.
+ */
+int lu_site_stats_print(const struct lu_site *s, struct seq_file *m);
+
+/**
+ * Common name structure to be passed around for various name related methods.
+ */
+struct lu_name {
+	const char    *ln_name;
+	int	    ln_namelen;
+};
+
+/**
+ * Common buffer structure to be passed around for various xattr_{s,g}et()
+ * methods.
+ */
+struct lu_buf {
+	void   *lb_buf;
+	ssize_t lb_len;
+};
+
+#define DLUBUF "(%p %zd)" /* lu_buf::lb_len is ssize_t, hence %zd */
+#define PLUBUF(buf) (buf)->lb_buf, (buf)->lb_len
+/**
+ * One-time initializers, called at obdclass module initialization, not
+ * exported.
+ */
+
+/**
+ * Initialization of global lu_* data.
+ */
+int lu_global_init(void);
+
+/**
+ * Dual to lu_global_init().
+ */
+void lu_global_fini(void);
+
+struct lu_kmem_descr {
+	struct kmem_cache **ckd_cache;
+	const char       *ckd_name;
+	const size_t      ckd_size;
+};
+
+int  lu_kmem_init(struct lu_kmem_descr *caches);
+void lu_kmem_fini(struct lu_kmem_descr *caches);
+
+void lu_object_assign_fid(const struct lu_env *env, struct lu_object *o,
+			  const struct lu_fid *fid);
+struct lu_object *lu_object_anon(const struct lu_env *env,
+				 struct lu_device *dev,
+				 const struct lu_object_conf *conf);
+
+/** null buffer */
+extern struct lu_buf LU_BUF_NULL;
+
+void lu_buf_free(struct lu_buf *buf);
+void lu_buf_alloc(struct lu_buf *buf, int size);
+void lu_buf_realloc(struct lu_buf *buf, int size);
+
+int lu_buf_check_and_grow(struct lu_buf *buf, int len);
+struct lu_buf *lu_buf_check_and_alloc(struct lu_buf *buf, int len);
+
+/** @} lu */
+#endif /* __LUSTRE_LU_OBJECT_H */

diff --git a/drivers/staging/lustre/lustre/include/lu_ref.h b/drivers/staging/lustre/lustre/include/lu_ref.h
new file mode 100644
index 0000000..624c19b
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lu_ref.h

@@ -0,0 +1,170 @@
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#ifndef __LUSTRE_LU_REF_H
+#define __LUSTRE_LU_REF_H
+
+#include <linux/list.h>
+
+/** \defgroup lu_ref lu_ref
+ *
+ * An interface to track references between objects. Mostly for debugging.
+ *
+ * Suppose there is a reference counted data-structure struct foo. To track
+ * who acquired references to instance of struct foo, add lu_ref field to it:
+ *
+ * \code
+ *	 struct foo {
+ *		 atomic_t      foo_refcount;
+ *		 struct lu_ref foo_reference;
+ *		 ...
+ *	 };
+ * \endcode
+ *
+ * foo::foo_reference has to be initialized by calling
+ * lu_ref_init(). Typically there will be functions or macros to increment and
+ * decrement foo::foo_refcount, let's say they are foo_get(struct foo *foo)
+ * and foo_put(struct foo *foo), respectively.
+ *
+ * Whenever foo_get() is called to acquire a reference on a foo, lu_ref_add()
+ * has to be called to insert into foo::foo_reference a record, describing
+ * acquired reference. Dually, lu_ref_del() removes matching record. Typical
+ * usages are:
+ *
+ * \code
+ *	struct bar *bar;
+ *
+ *	// bar owns a reference to foo.
+ *	bar->bar_foo = foo_get(foo);
+ *	lu_ref_add(&foo->foo_reference, "bar", bar);
+ *
+ *	...
+ *
+ *	// reference from bar to foo is released.
+ *	lu_ref_del(&foo->foo_reference, "bar", bar);
+ *	foo_put(bar->bar_foo);
+ *
+ *
+ *	// current thread acquired a temporary reference to foo.
+ *	foo_get(foo);
+ *	lu_ref_add(&foo->foo_reference, __FUNCTION__, current);
+ *
+ *	...
+ *
+ *	// temporary reference is released.
+ *	lu_ref_del(&foo->foo_reference, __FUNCTION__, current);
+ *	foo_put(foo);
+ * \endcode
+ *
+ * \e Et \e cetera. Often it makes sense to include lu_ref_add() and
+ * lu_ref_del() calls into foo_get() and foo_put(). When an instance of struct
+ * foo is destroyed, lu_ref_fini() has to be called that checks that no
+ * pending references remain. lu_ref_print() can be used to dump a list of
+ * pending references, while hunting down a leak.
+ *
+ * For objects to which a large number of references can be acquired,
+ * lu_ref_del() can become cpu consuming, as it has to scan the list of
+ * references. To work around this, remember result of lu_ref_add() (usually
+ * in the same place where pointer to struct foo is stored), and use
+ * lu_ref_del_at():
+ *
+ * \code
+ *	// There is a large number of bar's for a single foo.
+ *	bar->bar_foo     = foo_get(foo);
+ *	bar->bar_foo_ref = lu_ref_add(&foo->foo_reference, "bar", bar);
+ *
+ *	...
+ *
+ *	// reference from bar to foo is released.
+ *	lu_ref_del_at(&foo->foo_reference, bar->bar_foo_ref, "bar", bar);
+ *	foo_put(bar->bar_foo);
+ * \endcode
+ *
+ * lu_ref interface degrades gracefully in case of memory shortages.
+ *
+ * @{
+ */
+
+/* In this header lu_ref is empty and every lu_ref_*() call is a no-op stub. */
+struct lu_ref  {};
+
+static inline void lu_ref_init(struct lu_ref *ref)
+{
+}
+
+static inline void lu_ref_fini(struct lu_ref *ref)
+{
+}
+
+static inline struct lu_ref_link *lu_ref_add(struct lu_ref *ref,
+					     const char *scope,
+					     const void *source)
+{
+	return NULL;
+}
+
+static inline struct lu_ref_link *lu_ref_add_atomic(struct lu_ref *ref,
+						    const char *scope,
+						    const void *source)
+{
+	return NULL;
+}
+
+static inline void lu_ref_del(struct lu_ref *ref, const char *scope,
+			      const void *source)
+{
+}
+
+static inline void lu_ref_set_at(struct lu_ref *ref, struct lu_ref_link *link,
+				 const char *scope, const void *source0,
+				 const void *source1)
+{
+}
+
+static inline void lu_ref_del_at(struct lu_ref *ref, struct lu_ref_link *link,
+				 const char *scope, const void *source)
+{
+}
+
+static inline int lu_ref_global_init(void)
+{
+	return 0;
+}
+
+static inline void lu_ref_global_fini(void)
+{
+}
+
+static inline void lu_ref_print(const struct lu_ref *ref)
+{
+}
+
+static inline void lu_ref_print_all(void)
+{
+}
+
+/** @} lu_ref */
+
+#endif /* __LUSTRE_LU_REF_H */

diff --git a/drivers/staging/lustre/lustre/include/lu_target.h b/drivers/staging/lustre/lustre/include/lu_target.h
new file mode 100644
index 0000000..8d48cf4
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lu_target.h

@@ -0,0 +1,91 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LUSTRE_LU_TARGET_H
+#define _LUSTRE_LU_TARGET_H
+
+#include <dt_object.h>
+#include <lustre_disk.h>
+
+struct lu_target {
+	struct obd_device       *lut_obd;
+	struct dt_device	*lut_bottom;
+	/** last_rcvd file */
+	struct dt_object	*lut_last_rcvd;
+	/** transaction callbacks */
+	struct dt_txn_callback   lut_txn_cb;
+	/** server data in last_rcvd file */
+	struct lr_server_data    lut_lsd;
+	/** Server last transaction number */
+	__u64		    lut_last_transno;
+	/** Lock protecting last transaction number */
+	spinlock_t		 lut_translock;
+	/** Lock protecting client bitmap */
+	spinlock_t		 lut_client_bitmap_lock;
+	/** Bitmap of known clients */
+	unsigned long	   *lut_client_bitmap;
+};
+
+typedef void (*tgt_cb_t)(struct lu_target *lut, __u64 transno,
+			 void *data, int err);
+struct tgt_commit_cb {
+	tgt_cb_t  tgt_cb_func;
+	void     *tgt_cb_data;
+};
+
+void tgt_boot_epoch_update(struct lu_target *lut);
+int tgt_last_commit_cb_add(struct thandle *th, struct lu_target *lut,
+			   struct obd_export *exp, __u64 transno);
+int tgt_new_client_cb_add(struct thandle *th, struct obd_export *exp);
+int tgt_init(const struct lu_env *env, struct lu_target *lut,
+	     struct obd_device *obd, struct dt_device *dt);
+void tgt_fini(const struct lu_env *env, struct lu_target *lut);
+int tgt_client_alloc(struct obd_export *exp);
+void tgt_client_free(struct obd_export *exp);
+int tgt_client_del(const struct lu_env *env, struct obd_export *exp);
+int tgt_client_add(const struct lu_env *env, struct obd_export *exp, int);
+int tgt_client_new(const struct lu_env *env, struct obd_export *exp);
+int tgt_client_data_read(const struct lu_env *env, struct lu_target *tg,
+			 struct lsd_client_data *lcd, loff_t *off, int index);
+int tgt_client_data_write(const struct lu_env *env, struct lu_target *tg,
+			  struct lsd_client_data *lcd, loff_t *off, struct thandle *th);
+int tgt_server_data_read(const struct lu_env *env, struct lu_target *tg);
+int tgt_server_data_write(const struct lu_env *env, struct lu_target *tg,
+			  struct thandle *th);
+int tgt_server_data_update(const struct lu_env *env, struct lu_target *tg, int sync);
+int tgt_truncate_last_rcvd(const struct lu_env *env, struct lu_target *tg, loff_t off);
+
+#endif /* _LUSTRE_LU_TARGET_H */

diff --git a/drivers/staging/lustre/lustre/include/lustre/libiam.h b/drivers/staging/lustre/lustre/include/lustre/libiam.h
new file mode 100644
index 0000000..e8e0b08
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre/libiam.h

@@ -0,0 +1,145 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre/libiam.h
+ *
+ * iam user level library
+ *
+ * Author: Wang Di <wangdi@clusterfs.com>
+ * Author: Nikita Danilov <nikita@clusterfs.com>
+ * Author: Fan Yong <fanyong@clusterfs.com>
+ */
+
+/*
+ *  lustre/libiam.h
+ */
+
+#ifndef __IAM_ULIB_H__
+#define __IAM_ULIB_H__
+
+/** \defgroup libiam libiam
+ *
+ * @{
+ */
+
+/* Size in bytes of iam_uapi_info::iui_fmt_name. */
+#define DX_FMT_NAME_LEN 16
+
+enum iam_fmt_t {
+	FMT_LFIX,	/* presumably the fixed-length format -- confirm */
+	FMT_LVAR	/* presumably the variable-length format -- confirm */
+};
+/*
+ * Descriptor passed to iam_open() and to every iam_* operation on the
+ * opened file.  NOTE(review): field meanings below are read off the names
+ * and the iam_creat() parameters; units are not stated here -- confirm.
+ */
+struct iam_uapi_info {
+	__u16 iui_keysize;	/* key size */
+	__u16 iui_recsize;	/* record size */
+	__u16 iui_ptrsize;	/* pointer size */
+	__u16 iui_height;	/* presumably tree height -- confirm */
+	char  iui_fmt_name[DX_FMT_NAME_LEN];	/* format name buffer */
+};
+
+/*
+ * Create an iam file, but do NOT open it.
+ * Returns 0 on success, -1 otherwise.
+ */
+int iam_creat(char *filename, enum iam_fmt_t fmt,
+	      int blocksize, int keysize, int recsize, int ptrsize);
+
+/*
+ * Open an iam file, but do NOT create it if the file doesn't exist.
+ * Use iam_creat() to create the file before calling iam_open().
+ * Returns the file id (fd) on success, -1 otherwise.
+ */
+int iam_open(char *filename, struct iam_uapi_info *ua);
+
+/*
+ * Close a file opened by iam_open().
+ */
+int iam_close(int fd);
+
+/*
+ * The file must have been opened with iam_open() before using this function.
+ */
+int iam_insert(int fd, struct iam_uapi_info *ua,
+	       int key_need_convert, char *keybuf,
+	       int rec_need_convert, char *recbuf);
+
+/*
+ * The file must have been opened with iam_open() before using this function.
+ */
+int iam_lookup(int fd, struct iam_uapi_info *ua,
+	       int key_need_convert, char *key_buf,
+	       int *keysize, char *save_key,
+	       int rec_need_convert, char *rec_buf,
+	       int *recsize, char *save_rec);
+
+/*
+ * The file must have been opened with iam_open() before using this function.
+ */
+int iam_delete(int fd, struct iam_uapi_info *ua,
+	       int key_need_convert, char *keybuf,
+	       int rec_need_convert, char *recbuf);
+
+/*
+ * The file must have been opened with iam_open() before using this function.
+ */
+int iam_it_start(int fd, struct iam_uapi_info *ua,
+		 int key_need_convert, char *key_buf,
+		 int *keysize, char *save_key,
+		 int rec_need_convert, char *rec_buf,
+		 int *recsize, char *save_rec);
+
+/*
+ * The file must have been opened with iam_open() before using this function.
+ */
+int iam_it_next(int fd, struct iam_uapi_info *ua,
+		int key_need_convert, char *key_buf,
+		int *keysize, char *save_key,
+		int rec_need_convert, char *rec_buf,
+		int *recsize, char *save_rec);
+
+/*
+ * The file must have been opened with iam_open() before using this function.
+ */
+int iam_it_stop(int fd, struct iam_uapi_info *ua,
+		int key_need_convert, char *keybuf,
+		int rec_need_convert, char *recbuf);
+
+/*
+ * Change the mode of an iam file.
+ */
+int iam_polymorph(char *filename, unsigned long mode);
+
+/** @} libiam */
+
+#endif

diff --git a/drivers/staging/lustre/lustre/include/lustre/liblustreapi.h b/drivers/staging/lustre/lustre/include/lustre/liblustreapi.h
new file mode 100644
index 0000000..707eb74
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre/liblustreapi.h

@@ -0,0 +1,43 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+/*
+ * NOTE: This file is DEPRECATED!  Please include lustreapi.h directly
+ * instead of this file.  This file will be removed from a future version
+ * of lustre!
+ */
+
+#ifndef _LIBLUSTREAPI_H_
+#define _LIBLUSTREAPI_H_
+
+#include <lustre/lustreapi.h>
+#warning "Including liblustreapi.h is deprecated. Include lustreapi.h directly."
+
+#endif

diff --git a/drivers/staging/lustre/lustre/include/lustre/ll_fiemap.h b/drivers/staging/lustre/lustre/include/lustre/ll_fiemap.h
new file mode 100644
index 0000000..ad253c6
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre/ll_fiemap.h

@@ -0,0 +1,121 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre/ll_fiemap.h
+ *
+ * FIEMAP data structures and flags. This header file will be used until
+ * fiemap.h is available in the upstream kernel.
+ *
+ * Author: Kalpak Shah <kalpak.shah@sun.com>
+ * Author: Andreas Dilger <adilger@sun.com>
+ */
+
+#ifndef _LUSTRE_FIEMAP_H
+#define _LUSTRE_FIEMAP_H
+
+
+/* One mapped extent, as stored in ll_user_fiemap::fm_extents. */
+struct ll_fiemap_extent {
+	__u64 fe_logical;  /* logical offset in bytes for the start of
+			    * the extent from the beginning of the file */
+	__u64 fe_physical; /* physical offset in bytes for the start
+			    * of the extent from the beginning of the disk */
+	__u64 fe_length;   /* length in bytes for this extent */
+	__u64 fe_reserved64[2];
+	__u32 fe_flags;    /* FIEMAP_EXTENT_* flags for this extent */
+	__u32 fe_device;   /* device number for this extent */
+	__u32 fe_reserved[2];
+};
+/*
+ * FIEMAP request/reply header followed by fm_extent_count extent slots.
+ * fiemap_count_to_size() gives the total byte size for a given slot count.
+ */
+struct ll_user_fiemap {
+	__u64 fm_start;  /* logical offset (inclusive) at
+			  * which to start mapping (in) */
+	__u64 fm_length; /* logical length of mapping which
+			  * userspace wants (in) */
+	__u32 fm_flags;  /* FIEMAP_FLAG_* flags for request (in/out) */
+	__u32 fm_mapped_extents;/* number of extents that were mapped (out) */
+	__u32 fm_extent_count;  /* size of fm_extents array (in) */
+	__u32 fm_reserved;
+	struct ll_fiemap_extent fm_extents[0]; /* array of mapped extents (out) */
+};
+
+#define FIEMAP_MAX_OFFSET      (~0ULL)
+
+#define FIEMAP_FLAG_SYNC	 0x00000001 /* sync file data before map */
+#define FIEMAP_FLAG_XATTR	0x00000002 /* map extended attribute tree */
+
+#define FIEMAP_EXTENT_LAST	      0x00000001 /* Last extent in file. */
+#define FIEMAP_EXTENT_UNKNOWN	   0x00000002 /* Data location unknown. */
+#define FIEMAP_EXTENT_DELALLOC	  0x00000004 /* Location still pending.
+						    * Sets EXTENT_UNKNOWN. */
+#define FIEMAP_EXTENT_ENCODED	   0x00000008 /* Data can not be read
+						    * while fs is unmounted */
+#define FIEMAP_EXTENT_DATA_ENCRYPTED    0x00000080 /* Data is encrypted by fs.
+						    * Sets EXTENT_NO_DIRECT. */
+#define FIEMAP_EXTENT_NOT_ALIGNED       0x00000100 /* Extent offsets may not be
+						    * block aligned. */
+#define FIEMAP_EXTENT_DATA_INLINE       0x00000200 /* Data mixed with metadata.
+						    * Sets EXTENT_NOT_ALIGNED.*/
+#define FIEMAP_EXTENT_DATA_TAIL	 0x00000400 /* Multiple files in block.
+						    * Sets EXTENT_NOT_ALIGNED.*/
+#define FIEMAP_EXTENT_UNWRITTEN	 0x00000800 /* Space allocated, but
+						    * no data (i.e. zero). */
+#define FIEMAP_EXTENT_MERGED	    0x00001000 /* File does not natively
+						    * support extents. Result
+						    * merged for efficiency. */
+
+
+/**
+ * Bytes needed for a struct ll_user_fiemap header followed by
+ * \a extent_count extent records.
+ */
+static inline size_t fiemap_count_to_size(size_t extent_count)
+{
+	size_t extent_bytes = extent_count * sizeof(struct ll_fiemap_extent);
+
+	return sizeof(struct ll_user_fiemap) + extent_bytes;
+}
+
+/**
+ * Number of whole extent records that fit in a buffer of \a array_size
+ * bytes after the struct ll_user_fiemap header.
+ *
+ * \retval 0 when the buffer cannot even hold the header
+ */
+static inline unsigned fiemap_size_to_count(size_t array_size)
+{
+	/* Without this guard the unsigned subtraction below wraps around
+	 * and reports a huge extent count for an undersized buffer. */
+	if (array_size < sizeof(struct ll_user_fiemap))
+		return 0;
+
+	return (array_size - sizeof(struct ll_user_fiemap)) /
+	       sizeof(struct ll_fiemap_extent);
+}
+
+#define FIEMAP_FLAG_DEVICE_ORDER 0x40000000 /* return device ordered mapping */
+
+#ifdef FIEMAP_FLAGS_COMPAT
+#undef FIEMAP_FLAGS_COMPAT
+#endif
+
+/* Lustre specific flags - use a high bit, don't conflict with upstream flag */
+#define FIEMAP_EXTENT_NO_DIRECT	 0x40000000 /* Data mapping undefined */
+#define FIEMAP_EXTENT_NET	       0x80000000 /* Data stored remotely.
+						    * Sets NO_DIRECT flag */
+
+#endif /* _LUSTRE_FIEMAP_H */

diff --git a/drivers/staging/lustre/lustre/include/lustre/lustre_build_version.h b/drivers/staging/lustre/lustre/include/lustre/lustre_build_version.h
new file mode 100644
index 0000000..93a3d7d
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre/lustre_build_version.h

@@ -0,0 +1,2 @@
+#define BUILD_VERSION "v2_3_64_0-g6e62c21-CHANGED-3.9.0"
+#define LUSTRE_RELEASE 3.9.0_g6e62c21

diff --git a/drivers/staging/lustre/lustre/include/lustre/lustre_idl.h b/drivers/staging/lustre/lustre/include/lustre/lustre_idl.h
new file mode 100644
index 0000000..8825460
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre/lustre_idl.h

@@ -0,0 +1,3653 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre/lustre_idl.h
+ *
+ * Lustre wire protocol definitions.
+ */
+
+/** \defgroup lustreidl lustreidl
+ *
+ * Lustre wire protocol definitions.
+ *
+ * ALL structs passing over the wire should be declared here.  Structs
+ * that are used in interfaces with userspace should go in lustre_user.h.
+ *
+ * All structs being declared here should be built from simple fixed-size
+ * types (__u8, __u16, __u32, __u64) or be built from other types or
+ * structs also declared in this file.  Similarly, all flags and magic
+ * values in those structs should also be declared here.  This ensures
+ * that the Lustre wire protocol is not influenced by external dependencies.
+ *
+ * The only other acceptable items in this file are VERY SIMPLE accessor
+ * functions to avoid callers grubbing inside the structures, and the
+ * prototypes of the swabber functions for each struct.  Nothing that
+ * depends on external functions or definitions should be in here.
+ *
+ * Structs must be properly aligned to put 64-bit values on an 8-byte
+ * boundary.  Any structs being added here must also be added to
+ * utils/wirecheck.c and "make newwiretest" run to regenerate the
+ * utils/wiretest.c sources.  This allows us to verify that wire structs
+ * have the proper alignment/size on all architectures.
+ *
+ * DO NOT CHANGE any of the structs, flags, values declared here and used
+ * in released Lustre versions.  Some structs may have padding fields that
+ * can be used.  Some structs might allow addition at the end (verify this
+ * in the code to ensure that new/old clients that see this larger struct
+ * do not fail, otherwise you need to implement protocol compatibility).
+ *
+ * We assume all nodes are either little-endian or big-endian, and we
+ * always send messages in the sender's native format.  The receiver
+ * detects the message format by checking the 'magic' field of the message
+ * (see lustre_msg_swabbed() below).
+ *
+ * Each wire type has corresponding 'lustre_swab_xxxtypexxx()' routines,
+ * implemented either here, inline (trivial implementations) or in
+ * ptlrpc/pack_generic.c.  These 'swabbers' convert the type from "other"
+ * endian, in-place in the message buffer.
+ *
+ * A swabber takes a single pointer argument.  The caller must already have
+ * verified that the length of the message buffer >= sizeof (type).
+ *
+ * For variable length types, a second 'lustre_swab_v_xxxtypexxx()' routine
+ * may be defined that swabs just the variable part, after the caller has
+ * verified that the message buffer is large enough.
+ *
+ * @{
+ */
+
+#ifndef _LUSTRE_IDL_H_
+#define _LUSTRE_IDL_H_
+
+#if !defined(LASSERT) && !defined(LPU64)
+#include <linux/libcfs/libcfs.h> /* for LASSERT, LPUX64, etc */
+#endif
+
+/* Defn's shared with user-space. */
+#include <lustre/lustre_user.h>
+
+/*
+ *  GENERAL STUFF
+ */
+/* FOO_REQUEST_PORTAL is for incoming requests on the FOO
+ * FOO_REPLY_PORTAL   is for incoming replies on the FOO
+ * FOO_BULK_PORTAL    is for incoming bulk on the FOO
+ */
+
+#define CONNMGR_REQUEST_PORTAL	  1
+#define CONNMGR_REPLY_PORTAL	    2
+//#define OSC_REQUEST_PORTAL	    3
+#define OSC_REPLY_PORTAL		4
+//#define OSC_BULK_PORTAL	       5
+#define OST_IO_PORTAL		   6
+#define OST_CREATE_PORTAL	       7
+#define OST_BULK_PORTAL		 8
+//#define MDC_REQUEST_PORTAL	    9
+#define MDC_REPLY_PORTAL	       10
+//#define MDC_BULK_PORTAL	      11
+#define MDS_REQUEST_PORTAL	     12
+//#define MDS_REPLY_PORTAL	     13
+#define MDS_BULK_PORTAL		14
+#define LDLM_CB_REQUEST_PORTAL	 15
+#define LDLM_CB_REPLY_PORTAL	   16
+#define LDLM_CANCEL_REQUEST_PORTAL     17
+#define LDLM_CANCEL_REPLY_PORTAL       18
+//#define PTLBD_REQUEST_PORTAL	   19
+//#define PTLBD_REPLY_PORTAL	     20
+//#define PTLBD_BULK_PORTAL	      21
+#define MDS_SETATTR_PORTAL	     22
+#define MDS_READPAGE_PORTAL	    23
+#define MDS_MDS_PORTAL		 24
+
+#define MGC_REPLY_PORTAL	       25
+#define MGS_REQUEST_PORTAL	     26
+#define MGS_REPLY_PORTAL	       27
+#define OST_REQUEST_PORTAL	     28
+#define FLD_REQUEST_PORTAL	     29
+#define SEQ_METADATA_PORTAL	    30
+#define SEQ_DATA_PORTAL		31
+#define SEQ_CONTROLLER_PORTAL	  32
+#define MGS_BULK_PORTAL		33
+
+/* Portal 63 is reserved for the Cray Inc DVS - nic@cray.com, roe@cray.com, n8851@cray.com */
+
+/* packet types */
+#define PTL_RPC_MSG_REQUEST 4711
+#define PTL_RPC_MSG_ERR     4712
+#define PTL_RPC_MSG_REPLY   4713
+
+/* DON'T use swabbed values of MAGIC as magic! */
+#define LUSTRE_MSG_MAGIC_V1 0x0BD00BD0
+#define LUSTRE_MSG_MAGIC_V2 0x0BD00BD3
+
+#define LUSTRE_MSG_MAGIC_V1_SWABBED 0xD00BD00B
+#define LUSTRE_MSG_MAGIC_V2_SWABBED 0xD30BD00B
+
+#define LUSTRE_MSG_MAGIC LUSTRE_MSG_MAGIC_V2
+
+#define PTLRPC_MSG_VERSION  0x00000003
+#define LUSTRE_VERSION_MASK 0xffff0000
+#define LUSTRE_OBD_VERSION  0x00010000
+#define LUSTRE_MDS_VERSION  0x00020000
+#define LUSTRE_OST_VERSION  0x00030000
+#define LUSTRE_DLM_VERSION  0x00040000
+#define LUSTRE_LOG_VERSION  0x00050000
+#define LUSTRE_MGS_VERSION  0x00060000
+/* Fixed-width scalar aliases; obd_time is the only signed one. */
+typedef __u32 mdsno_t;
+typedef __u64 seqno_t;
+typedef __u64 obd_id;
+typedef __u64 obd_seq;
+typedef __s64 obd_time;
+typedef __u64 obd_size;
+typedef __u64 obd_off;
+typedef __u64 obd_blocks;
+typedef __u64 obd_valid;
+typedef __u32 obd_blksize;
+typedef __u32 obd_mode;
+typedef __u32 obd_uid;
+typedef __u32 obd_gid;
+typedef __u32 obd_flag;
+typedef __u32 obd_count;
+
+/**
+ * Describes a range of sequences: lsr_start is included but lsr_end is
+ * not in the range.
+ * The same structure is used in the fld module, where the lsr_index field
+ * holds the mdt id of the home mdt.
+ */
+struct lu_seq_range {
+	__u64 lsr_start;	/* first sequence of the range (inclusive) */
+	__u64 lsr_end;		/* end of the range (exclusive) */
+	__u32 lsr_index;	/* in fld: mdt id of the home mdt */
+	__u32 lsr_flags;	/* range type bits, see fld_range_type() */
+};
+/* Range type values kept in lu_seq_range::lsr_flags. */
+#define LU_SEQ_RANGE_MDT	0x0
+#define LU_SEQ_RANGE_OST	0x1
+#define LU_SEQ_RANGE_ANY	0x3
+
+#define LU_SEQ_RANGE_MASK	0x3	/* bits of lsr_flags that hold the type */
+
+/** Type bits (LU_SEQ_RANGE_*) stored in the flags of \a range. */
+static inline unsigned fld_range_type(const struct lu_seq_range *range)
+{
+	unsigned type_bits = range->lsr_flags & LU_SEQ_RANGE_MASK;
+
+	return type_bits;
+}
+
+/** Non-zero when \a range is typed LU_SEQ_RANGE_OST. */
+static inline int fld_range_is_ost(const struct lu_seq_range *range)
+{
+	unsigned type = fld_range_type(range);
+
+	return type == LU_SEQ_RANGE_OST;
+}
+
+/** Non-zero when \a range is typed LU_SEQ_RANGE_MDT. */
+static inline int fld_range_is_mdt(const struct lu_seq_range *range)
+{
+	unsigned type = fld_range_type(range);
+
+	return type == LU_SEQ_RANGE_MDT;
+}
+
+/**
+ * Non-zero when \a range is typed LU_SEQ_RANGE_ANY.
+ *
+ * The ANY type is only used when the fld client sends a query without
+ * knowing whether the sequence belongs to an MDT or an OST; either type
+ * returned by the lookup is then acceptable.
+ */
+static inline unsigned fld_range_is_any(const struct lu_seq_range *range)
+{
+	unsigned type = fld_range_type(range);
+
+	return type == LU_SEQ_RANGE_ANY;
+}
+
+/**
+ * OR the type bits \a flags into the flags of \a range.
+ *
+ * Asserts that \a flags has no bit outside LU_SEQ_RANGE_MASK.  Bits that
+ * are already set in lsr_flags stay set: a previously stored type is not
+ * cleared first.
+ */
+static inline void fld_range_set_type(struct lu_seq_range *range,
+				      unsigned flags)
+{
+	LASSERT(!(flags & ~LU_SEQ_RANGE_MASK));
+	range->lsr_flags = range->lsr_flags | flags;
+}
+
+/** Apply LU_SEQ_RANGE_MDT to \a range through fld_range_set_type(). */
+static inline void fld_range_set_mdt(struct lu_seq_range *range)
+{
+	const unsigned type = LU_SEQ_RANGE_MDT;
+
+	fld_range_set_type(range, type);
+}
+
+/** Apply LU_SEQ_RANGE_OST to \a range through fld_range_set_type(). */
+static inline void fld_range_set_ost(struct lu_seq_range *range)
+{
+	const unsigned type = LU_SEQ_RANGE_OST;
+
+	fld_range_set_type(range, type);
+}
+
+/** Apply LU_SEQ_RANGE_ANY to \a range through fld_range_set_type(). */
+static inline void fld_range_set_any(struct lu_seq_range *range)
+{
+	const unsigned type = LU_SEQ_RANGE_ANY;
+
+	fld_range_set_type(range, type);
+}
+
+/**
+ * Width of \a range: lsr_end minus lsr_start.
+ */
+static inline __u64 range_space(const struct lu_seq_range *range)
+{
+	__u64 first = range->lsr_start;
+	__u64 limit = range->lsr_end;
+
+	return limit - first;
+}
+
+/**
+ * Initialize \a range to zero.
+ *
+ * Every field is cleared, including lsr_flags, so a reused range does not
+ * keep a stale type.
+ */
+static inline void range_init(struct lu_seq_range *range)
+{
+	memset(range, 0, sizeof(*range));
+}
+
+/**
+ * Check whether sequence \a s lies inside \a range, i.e. at or above
+ * lsr_start and strictly below lsr_end.
+ */
+static inline int range_within(const struct lu_seq_range *range,
+			       __u64 s)
+{
+	if (s < range->lsr_start)
+		return 0;
+
+	return s < range->lsr_end;
+}
+
+/** Non-zero when lsr_end is not below lsr_start. */
+static inline int range_is_sane(const struct lu_seq_range *range)
+{
+	return !(range->lsr_end < range->lsr_start);
+}
+
+/** Non-zero when both lsr_start and lsr_end are 0. */
+static inline int range_is_zero(const struct lu_seq_range *range)
+{
+	if (range->lsr_start != 0)
+		return 0;
+
+	return range->lsr_end == 0;
+}
+
+/** Non-zero when \a range has no width left (range_space() is 0). */
+static inline int range_is_exhausted(const struct lu_seq_range *range)
+{
+	__u64 width = range_space(range);
+
+	return width == 0;
+}
+
+/**
+ * Compare the location of two ranges.
+ *
+ * \retval 0 when \a r1 and \a r2 have the same lsr_index and lsr_flags
+ * \retval 1 otherwise
+ */
+static inline int range_compare_loc(const struct lu_seq_range *r1,
+				    const struct lu_seq_range *r2)
+{
+	if (r1->lsr_index != r2->lsr_index)
+		return 1;
+
+	return r1->lsr_flags != r2->lsr_flags;
+}
+/*
+ * printf-style format (DRANGE) and matching argument list (PRANGE) for a
+ * struct lu_seq_range pointer.  PRANGE evaluates \a range four times, and
+ * labels every type other than LU_SEQ_RANGE_MDT, including
+ * LU_SEQ_RANGE_ANY, as "ost".
+ */
+#define DRANGE "[%#16.16"LPF64"x-%#16.16"LPF64"x):%x:%s"
+
+#define PRANGE(range)		\
+	(range)->lsr_start,	\
+	(range)->lsr_end,	\
+	(range)->lsr_index,	\
+	fld_range_is_mdt(range) ? "mdt" : "ost"
+
+
+/** \defgroup lu_fid lu_fid
+ * @{ */
+
+/**
+ * Flags for lustre_mdt_attrs::lma_compat and lustre_mdt_attrs::lma_incompat.
+ * Deprecated since HSM and SOM attributes are now stored in separate on-disk
+ * xattr.
+ */
+enum lma_compat {
+	LMAC_HSM = 0x00000001,
+	LMAC_SOM = 0x00000002,
+};
+
+/**
+ * Masks for all features that should be supported by a Lustre version to
+ * access a specific file.
+ * This information is stored in lustre_mdt_attrs::lma_incompat.
+ */
+enum lma_incompat {
+	LMAI_RELEASED = 0x0000001, /* file is released */
+	LMAI_AGENT = 0x00000002, /* agent inode */
+	LMAI_REMOTE_PARENT = 0x00000004, /* the parent of the object
+					    is on the remote MDT */
+};
+#define LMA_INCOMPAT_SUPP	(LMAI_AGENT | LMAI_REMOTE_PARENT) /* LMAI_RELEASED is not included */
+
+extern void lustre_lma_swab(struct lustre_mdt_attrs *lma);
+extern void lustre_lma_init(struct lustre_mdt_attrs *lma,
+			    const struct lu_fid *fid, __u32 incompat);
+/**
+ * SOM on-disk attributes stored in a separate xattr.
+ * Layout: two __u32 fields followed by four __u64 fields.
+ */
+struct som_attrs {
+	/** Bitfield for supported data in this structure. For future use. */
+	__u32	som_compat;
+
+	/** Incompat feature list. The supported feature mask is available in
+	 * SOM_INCOMPAT_SUPP */
+	__u32	som_incompat;
+
+	/** IO epoch the SOM attributes belong to */
+	__u64	som_ioepoch;
+	/** total file size in objects */
+	__u64	som_size;
+	/** total fs blocks in objects */
+	__u64	som_blocks;
+	/** mds mount id the size is valid for */
+	__u64	som_mountid;
+};
+extern void lustre_som_swab(struct som_attrs *attrs);
+
+#define SOM_INCOMPAT_SUPP 0x0
+
+/**
+ * HSM on-disk attributes stored in a separate xattr.
+ * Layout: two __u32 fields followed by two __u64 fields.
+ */
+struct hsm_attrs {
+	/** Bitfield for supported data in this structure. For future use. */
+	__u32	hsm_compat;
+
+	/** HSM flags, see hsm_flags enum below */
+	__u32	hsm_flags;
+	/** backend archive id associated with the file */
+	__u64	hsm_arch_id;
+	/** version associated with the last archiving, if any */
+	__u64	hsm_arch_ver;
+};
+extern void lustre_hsm_swab(struct hsm_attrs *attrs);
+
+/**
+ * fid constants
+ */
+enum {
+	/** LASTID file has zero OID */
+	LUSTRE_FID_LASTID_OID = 0UL,
+	/** initial fid id value */
+	LUSTRE_FID_INIT_OID  = 1UL
+};
+
+/** Sequence part (f_seq) of \a fid. */
+static inline __u64 fid_seq(const struct lu_fid *fid)
+{
+	__u64 seq = fid->f_seq;
+
+	return seq;
+}
+
+/** Object id part (f_oid) of \a fid. */
+static inline __u32 fid_oid(const struct lu_fid *fid)
+{
+	__u32 oid = fid->f_oid;
+
+	return oid;
+}
+
+/** Version part (f_ver) of \a fid. */
+static inline __u32 fid_ver(const struct lu_fid *fid)
+{
+	__u32 ver = fid->f_ver;
+
+	return ver;
+}
+
+/** Clear every byte of \a fid. */
+static inline void fid_zero(struct lu_fid *fid)
+{
+	const size_t nbytes = sizeof(*fid);
+
+	memset(fid, 0, nbytes);
+}
+
+/**
+ * Pack the version of \a fid into the upper 32 bits and its object id
+ * into the lower 32 bits of one 64-bit value.
+ */
+static inline obd_id fid_ver_oid(const struct lu_fid *fid)
+{
+	__u64 upper = (__u64)fid_ver(fid) << 32;
+	__u64 lower = fid_oid(fid);
+
+	return upper | lower;
+}
+
+/**
+ * Note that reserved SEQ numbers below 12 will conflict with ldiskfs
+ * inodes in the IGIF namespace, so these reserved SEQ numbers can be
+ * used for other purposes and not risk collisions with existing inodes.
+ *
+ * NOTE(review): the sentence above reads as if a "not" is missing; values
+ * below FID_SEQ_IGIF (12) lie outside [FID_SEQ_IGIF, FID_SEQ_IGIF_MAX] --
+ * confirm the intended wording.
+ *
+ * Different FID Format
+ * http://arch.lustre.org/index.php?title=Interoperability_fids_zfs#NEW.0
+ */
+enum fid_seq {
+	FID_SEQ_OST_MDT0	= 0,
+	FID_SEQ_LLOG		= 1, /* unnamed llogs */
+	FID_SEQ_ECHO		= 2,
+	FID_SEQ_OST_MDT1	= 3,
+	FID_SEQ_OST_MAX		= 9, /* Max MDT count before OST_on_FID */
+	FID_SEQ_LLOG_NAME	= 10, /* named llogs */
+	FID_SEQ_RSVD		= 11,
+	FID_SEQ_IGIF		= 12,
+	FID_SEQ_IGIF_MAX	= 0x0ffffffffULL,
+	FID_SEQ_IDIF		= 0x100000000ULL,
+	FID_SEQ_IDIF_MAX	= 0x1ffffffffULL,
+	/* Normal FID sequence starts from this value, i.e. 1<<33 */
+	FID_SEQ_START		= 0x200000000ULL,
+	/* sequence for local pre-defined FIDs listed in local_oid */
+	FID_SEQ_LOCAL_FILE	= 0x200000001ULL,
+	FID_SEQ_DOT_LUSTRE	= 0x200000002ULL,
+	/* sequence is used for local named objects FIDs generated
+	 * by local_object_storage library */
+	FID_SEQ_LOCAL_NAME	= 0x200000003ULL,
+	/* Because current FLD will only cache the fid sequence, instead
+	 * of oid on the client side, if the FID needs to be exposed to
+	 * clients sides, it needs to make sure all of fids under one
+	 * sequence will be located in one MDT. */
+	FID_SEQ_SPECIAL		= 0x200000004ULL,
+	FID_SEQ_QUOTA		= 0x200000005ULL,
+	FID_SEQ_QUOTA_GLB	= 0x200000006ULL,
+	FID_SEQ_ROOT		= 0x200000007ULL,  /* Located on MDT0 */
+	FID_SEQ_NORMAL		= 0x200000400ULL,
+	FID_SEQ_LOV_DEFAULT	= 0xffffffffffffffffULL
+};
+/*
+ * Object id widths: OBIF_* describes a 32-bit id, IDIF_* a 48-bit id.
+ * *_MAX_OID is one past the largest id of that width and *_OID_MASK
+ * selects the bits of such an id.
+ */
+#define OBIF_OID_MAX_BITS	   32
+#define OBIF_MAX_OID		(1ULL << OBIF_OID_MAX_BITS)
+#define OBIF_OID_MASK	       ((1ULL << OBIF_OID_MAX_BITS) - 1)
+#define IDIF_OID_MAX_BITS	   48
+#define IDIF_MAX_OID		(1ULL << IDIF_OID_MAX_BITS)
+#define IDIF_OID_MASK	       ((1ULL << IDIF_OID_MAX_BITS) - 1)
+
+/** OID for FID_SEQ_SPECIAL */
+enum special_oid {
+	/* Big Filesystem Lock to serialize rename operations */
+	FID_OID_SPECIAL_BFL     = 1UL,
+};
+
+/** OID for FID_SEQ_DOT_LUSTRE */
+enum dot_lustre_oid {
+	FID_OID_DOT_LUSTRE  = 1UL,
+	FID_OID_DOT_LUSTRE_OBF = 2UL,
+};
+
+/** Non-zero when \a seq equals FID_SEQ_OST_MDT0. */
+static inline int fid_seq_is_mdt0(obd_seq seq)
+{
+	return seq == FID_SEQ_OST_MDT0;
+}
+
+/**
+ * Non-zero when \a seq is FID_SEQ_OST_MDT0 or at least FID_SEQ_NORMAL.
+ */
+static inline int fid_seq_is_mdt(const __u64 seq)
+{
+	return seq == FID_SEQ_OST_MDT0 || seq >= FID_SEQ_NORMAL;
+}
+
+/** Non-zero when \a seq equals FID_SEQ_ECHO. */
+static inline int fid_seq_is_echo(obd_seq seq)
+{
+	return seq == FID_SEQ_ECHO;
+}
+
+/** Non-zero when the sequence of \a fid is FID_SEQ_ECHO. */
+static inline int fid_is_echo(const struct lu_fid *fid)
+{
+	__u64 seq = fid_seq(fid);
+
+	return fid_seq_is_echo(seq);
+}
+
+/** Non-zero when \a seq equals FID_SEQ_LLOG. */
+static inline int fid_seq_is_llog(obd_seq seq)
+{
+	return seq == FID_SEQ_LLOG;
+}
+
+/**
+ * Non-zero when \a fid has the llog sequence and a non-zero object id.
+ * The file with OID == 0 is not an llog; it holds the last oid.
+ */
+static inline int fid_is_llog(const struct lu_fid *fid)
+{
+	if (!fid_seq_is_llog(fid_seq(fid)))
+		return 0;
+
+	return fid_oid(fid) > 0;
+}
+
+/**
+ * Non-zero when \a seq is above FID_SEQ_OST_MDT0 and not above
+ * FID_SEQ_RSVD.
+ */
+static inline int fid_seq_is_rsvd(const __u64 seq)
+{
+	return (seq > FID_SEQ_OST_MDT0 && seq <= FID_SEQ_RSVD);
+}
+
+/** Non-zero when \a seq equals FID_SEQ_SPECIAL. */
+static inline int fid_seq_is_special(const __u64 seq)
+{
+	return seq == FID_SEQ_SPECIAL;
+}
+
+/**
+ * Non-zero when \a seq is FID_SEQ_LOCAL_FILE or FID_SEQ_LOCAL_NAME.
+ */
+static inline int fid_seq_is_local_file(const __u64 seq)
+{
+	return seq == FID_SEQ_LOCAL_FILE ||
+	       seq == FID_SEQ_LOCAL_NAME;
+}
+
+/** Non-zero when \a seq equals FID_SEQ_ROOT. */
+static inline int fid_seq_is_root(const __u64 seq)
+{
+	return (seq == FID_SEQ_ROOT);
+}
+
+/** Non-zero when \a seq equals FID_SEQ_DOT_LUSTRE. */
+static inline int fid_seq_is_dot(const __u64 seq)
+{
+	return (seq == FID_SEQ_DOT_LUSTRE);
+}
+
+/** Non-zero when \a seq equals FID_SEQ_LOV_DEFAULT. */
+static inline int fid_seq_is_default(const __u64 seq)
+{
+	return (seq == FID_SEQ_LOV_DEFAULT);
+}
+
+/** Non-zero when the sequence of \a fid is FID_SEQ_OST_MDT0. */
+static inline int fid_is_mdt0(const struct lu_fid *fid)
+{
+	__u64 seq = fid_seq(fid);
+
+	return fid_seq_is_mdt0(seq);
+}
+
+/** Fill \a fid with sequence FID_SEQ_ROOT, object id 1 and version 0. */
+static inline void lu_root_fid(struct lu_fid *fid)
+{
+	fid->f_ver = 0;
+	fid->f_oid = 1;
+	fid->f_seq = FID_SEQ_ROOT;
+}
+
+/**
+ * Check whether a sequence lies in the IGIF range.
+ * \param seq the sequence to be tested.
+ * \return true if FID_SEQ_IGIF <= \a seq <= FID_SEQ_IGIF_MAX; otherwise false.
+ */
+static inline int fid_seq_is_igif(const __u64 seq)
+{
+	if (seq < FID_SEQ_IGIF)
+		return 0;
+
+	return seq <= FID_SEQ_IGIF_MAX;
+}
+
+/** Non-zero when the sequence of \a fid lies in the IGIF range. */
+static inline int fid_is_igif(const struct lu_fid *fid)
+{
+	__u64 seq = fid_seq(fid);
+
+	return fid_seq_is_igif(seq);
+}
+
+/**
+ * Check whether a sequence lies in the IDIF range.
+ * \param seq the sequence to be tested.
+ * \return true if FID_SEQ_IDIF <= \a seq <= FID_SEQ_IDIF_MAX; otherwise false.
+ */
+static inline int fid_seq_is_idif(const __u64 seq)
+{
+	if (seq < FID_SEQ_IDIF)
+		return 0;
+
+	return seq <= FID_SEQ_IDIF_MAX;
+}
+
+/** Non-zero when the sequence of \a fid lies in the IDIF range. */
+static inline int fid_is_idif(const struct lu_fid *fid)
+{
+	__u64 seq = fid_seq(fid);
+
+	return fid_seq_is_idif(seq);
+}
+
+/** Non-zero when the sequence of \a fid is one of the local-file ones. */
+static inline int fid_is_local_file(const struct lu_fid *fid)
+{
+	__u64 seq = fid_seq(fid);
+
+	return fid_seq_is_local_file(seq);
+}
+
+/** Non-zero when \a seq is at least FID_SEQ_NORMAL. */
+static inline int fid_seq_is_norm(const __u64 seq)
+{
+	return !(seq < FID_SEQ_NORMAL);
+}
+
+/** Non-zero when the sequence of \a fid is at least FID_SEQ_NORMAL. */
+static inline int fid_is_norm(const struct lu_fid *fid)
+{
+	__u64 seq = fid_seq(fid);
+
+	return fid_seq_is_norm(seq);
+}
+
+/**
+ * Convert an OST objid into an IDIF FID SEQ number: FID_SEQ_IDIF, with
+ * \a ost_idx shifted left by 16 and bits 32..47 of \a id in the low
+ * 16 bits.  The shift is done in 32 bits, as \a ost_idx is a __u32.
+ */
+static inline obd_seq fid_idif_seq(obd_id id, __u32 ost_idx)
+{
+	__u32 idx_bits = ost_idx << 16;
+	__u64 id_bits = (id >> 32) & 0xffff;
+
+	return FID_SEQ_IDIF | idx_bits | id_bits;
+}
+
+/**
+ * Convert a packed IDIF FID into an OST objid: \a ver goes to bits 48 and
+ * up, the low 16 bits of \a seq to bits 32..47, and \a oid to the low
+ * 32 bits.
+ */
+static inline obd_id fid_idif_id(obd_seq seq, __u32 oid, __u32 ver)
+{
+	__u64 ver_bits = (__u64)ver << 48;
+	__u64 seq_bits = (seq & 0xffff) << 32;
+
+	return ver_bits | seq_bits | oid;
+}
+
+/**
+ * Extract the ost index from an IDIF FID: bits 16..31 of its sequence.
+ * Asserts that \a fid is an IDIF.
+ */
+static inline __u32 fid_idif_ost_idx(const struct lu_fid *fid)
+{
+	__u64 idx_bits;
+
+	LASSERT(fid_is_idif(fid));
+	idx_bits = (fid_seq(fid) >> 16) & 0xffff;
+
+	return idx_bits;
+}
+
+/**
+ * Extract the OST sequence (group) from a wire ost_id (id/seq) pair.
+ *
+ * Returns FID_SEQ_OST_MDT0 when oi.oi_seq is FID_SEQ_OST_MDT0 or when the
+ * oi_fid view is an IDIF, FID_SEQ_LOV_DEFAULT when oi.oi_seq is the
+ * default sequence, and the sequence of the oi_fid view otherwise.
+ */
+static inline obd_seq ostid_seq(const struct ost_id *ostid)
+{
+	const struct lu_fid *fid = &ostid->oi_fid;
+
+	if (fid_seq_is_mdt0(ostid->oi.oi_seq))
+		return FID_SEQ_OST_MDT0;
+	if (fid_seq_is_default(ostid->oi.oi_seq))
+		return FID_SEQ_LOV_DEFAULT;
+
+	return fid_is_idif(fid) ? FID_SEQ_OST_MDT0 : fid_seq(fid);
+}
+
+/*
+ * extract OST objid from a wire ost_id (id/seq) pair
+ *
+ * Returns oi.oi_id masked with IDIF_OID_MASK when ostid_seq() reports
+ * FID_SEQ_OST_MDT0, otherwise the object id of the oi_fid view.
+ *
+ * NOTE(review): ostid_seq() already reports FID_SEQ_OST_MDT0 for an IDIF
+ * oi_fid, so the first test appears to catch IDIF ids before the
+ * fid_is_idif() branch can run -- confirm against struct ost_id's layout.
+ */
+static inline obd_id ostid_id(const struct ost_id *ostid)
+{
+	if (fid_seq_is_mdt0(ostid_seq(ostid)))
+		return ostid->oi.oi_id & IDIF_OID_MASK;
+
+	if (fid_is_idif(&ostid->oi_fid))
+		return fid_idif_id(fid_seq(&ostid->oi_fid),
+				   fid_oid(&ostid->oi_fid), 0);
+
+	return fid_oid(&ostid->oi_fid);
+}
+
+/**
+ * Store sequence \a seq in \a oi.
+ *
+ * FID_SEQ_OST_MDT0 and FID_SEQ_LOV_DEFAULT go to oi.oi_seq; any other
+ * value goes to oi_fid.f_seq.
+ */
+static inline void ostid_set_seq(struct ost_id *oi, __u64 seq)
+{
+	if (fid_seq_is_mdt0(seq) || fid_seq_is_default(seq)) {
+		oi->oi.oi_seq = seq;
+		return;
+	}
+
+	oi->oi_fid.f_seq = seq;
+	/* An all-zero f_oid + f_ver would make ostid_seq() treat this as
+	 * an old ostid (oi_seq == 0), so seed f_oid with the initial
+	 * object id in that case. */
+	if (oi->oi_fid.f_oid != 0 || oi->oi_fid.f_ver != 0)
+		return;
+	oi->oi_fid.f_oid = LUSTRE_FID_INIT_OID;
+}
+
+/** Set the sequence of \a oi to FID_SEQ_OST_MDT0 via ostid_set_seq(). */
+static inline void ostid_set_seq_mdt0(struct ost_id *oi)
+{
+	const __u64 seq = FID_SEQ_OST_MDT0;
+
+	ostid_set_seq(oi, seq);
+}
+
+/** Set the sequence of \a oi to FID_SEQ_ECHO via ostid_set_seq(). */
+static inline void ostid_set_seq_echo(struct ost_id *oi)
+{
+	const __u64 seq = FID_SEQ_ECHO;
+
+	ostid_set_seq(oi, seq);
+}
+
+/** Set the sequence of \a oi to FID_SEQ_LLOG via ostid_set_seq(). */
+static inline void ostid_set_seq_llog(struct ost_id *oi)
+{
+	const __u64 seq = FID_SEQ_LLOG;
+
+	ostid_set_seq(oi, seq);
+}
+
+/**
+ * Store object id \a oid in \a oi.
+ *
+ * The sequence must be set before the id, because it selects where the id
+ * is stored: oi.oi_id (ids below IDIF_MAX_OID) when ostid_seq() reports
+ * FID_SEQ_OST_MDT0, oi_fid.f_oid (ids below OBIF_MAX_OID) otherwise.
+ * An out-of-range \a oid is reported with CERROR and \a oi is left
+ * unchanged.
+ */
+static inline void ostid_set_id(struct ost_id *oi, __u64 oid)
+{
+	if (fid_seq_is_mdt0(ostid_seq(oi))) {
+		if (oid >= IDIF_MAX_OID) {
+			CERROR("Bad "LPU64" to set "DOSTID"\n",
+				oid, POSTID(oi));
+			return;
+		}
+		oi->oi.oi_id = oid;
+	} else {
+		/* OBIF_MAX_OID itself (1 << 32) does not fit in 32 bits
+		 * and would be stored as 0, so it is rejected as well. */
+		if (oid >= OBIF_MAX_OID) {
+			CERROR("Bad "LPU64" to set "DOSTID"\n",
+				oid, POSTID(oi));
+			return;
+		}
+		oi->oi_fid.f_oid = oid;
+	}
+}
+
+/**
+ * Increment the object id of \a oi by one.
+ *
+ * For FID_SEQ_OST_MDT0 the id lives in oi.oi_id and must stay below
+ * IDIF_MAX_OID; an increment that would reach that limit is reported with
+ * CERROR and skipped.  Otherwise oi_fid.f_oid is incremented.
+ * NOTE(review): the f_oid branch has no overflow check -- confirm whether
+ * wrap-around is acceptable there.
+ */
+static inline void ostid_inc_id(struct ost_id *oi)
+{
+	if (fid_seq_is_mdt0(ostid_seq(oi))) {
+		/* ostid_id() never exceeds IDIF_OID_MASK, so the limit is
+		 * hit exactly when id + 1 equals IDIF_MAX_OID. */
+		if (unlikely(ostid_id(oi) + 1 >= IDIF_MAX_OID)) {
+			CERROR("Bad inc "DOSTID"\n", POSTID(oi));
+			return;
+		}
+		oi->oi.oi_id++;
+	} else {
+		oi->oi_fid.f_oid++;
+	}
+}
+
+/**
+ * Decrement the object id of \a oi by one: oi.oi_id when ostid_seq()
+ * reports FID_SEQ_OST_MDT0, oi_fid.f_oid otherwise.  No range check.
+ */
+static inline void ostid_dec_id(struct ost_id *oi)
+{
+	if (!fid_seq_is_mdt0(ostid_seq(oi))) {
+		oi->oi_fid.f_oid--;
+		return;
+	}
+
+	oi->oi.oi_id--;
+}
+
+/**
+ * Unpack an OST object id/seq (group) into a FID.  This is needed for
+ * converting all obdo, lmm, lsm, etc. 64-bit id/seq pairs into proper
+ * FIDs.  Note that if an id/seq is already in FID/IDIF format it will
+ * be passed through unchanged.  Only legacy OST objects in "group 0"
+ * will be mapped into the IDIF namespace so that they can fit into the
+ * struct lu_fid fields without loss.  For reference see:
+ * http://arch.lustre.org/index.php?title=Interoperability_fids_zfs
+ *
+ * \retval 0      \a fid has been filled in
+ * \retval -EBADF \a ost_idx is above 0xffff, the id is out of range, or
+ *                a passed-through FID has a non-zero f_ver
+ *
+ * NOTE(review): ostid_id() masks MDT0 ids with IDIF_OID_MASK, so the
+ * ">= IDIF_MAX_OID" test and the ">> 48" version bits look like they can
+ * never be non-zero -- confirm.
+ * NOTE(review): the f_ver check in the pass-through branch reuses the
+ * "bad MDT0 id" message text although that branch is not the MDT0 case.
+ */
+static inline int ostid_to_fid(struct lu_fid *fid, struct ost_id *ostid,
+			       __u32 ost_idx)
+{
+	if (ost_idx > 0xffff) {
+		CERROR("bad ost_idx, "DOSTID" ost_idx:%u\n", POSTID(ostid),
+		       ost_idx);
+		return -EBADF;
+	}
+
+	if (fid_seq_is_mdt0(ostid_seq(ostid))) {
+		/* This is a "legacy" (old 1.x/2.early) OST object in "group 0"
+		 * that we map into the IDIF namespace.  It allows up to 2^48
+		 * objects per OST, as this is the object namespace that has
+		 * been in production for years.  This can handle create rates
+		 * of 1M objects/s/OST for 9 years, or combinations thereof. */
+		if (ostid_id(ostid) >= IDIF_MAX_OID) {
+			 CERROR("bad MDT0 id, "DOSTID" ost_idx:%u\n",
+				POSTID(ostid), ost_idx);
+			 return -EBADF;
+		}
+		fid->f_seq = fid_idif_seq(ostid_id(ostid), ost_idx);
+		/* truncate to 32 bits by assignment */
+		fid->f_oid = ostid_id(ostid);
+		/* in theory, not currently used */
+		fid->f_ver = ostid_id(ostid) >> 48;
+	} else /* if (fid_seq_is_idif(seq) || fid_seq_is_norm(seq)) */ {
+	       /* This is either an IDIF object, which identifies objects across
+		* all OSTs, or a regular FID.  The IDIF namespace maps legacy
+		* OST objects into the FID namespace.  In both cases, we just
+		* pass the FID through, no conversion needed. */
+		if (ostid->oi_fid.f_ver != 0) {
+			CERROR("bad MDT0 id, "DOSTID" ost_idx:%u\n",
+				POSTID(ostid), ost_idx);
+			return -EBADF;
+		}
+		*fid = ostid->oi_fid;
+	}
+
+	return 0;
+}
+
+/**
+ * Pack any OST FID into an ostid (id/seq) for the wire/disk.
+ *
+ * \retval 0      on success
+ * \retval -EBADF if \a fid has an IGIF sequence
+ */
+static inline int fid_to_ostid(const struct lu_fid *fid, struct ost_id *ostid)
+{
+	if (unlikely(fid_seq_is_igif(fid->f_seq))) {
+		CERROR("bad IGIF, "DFID"\n", PFID(fid));
+		return -EBADF;
+	}
+
+	if (!fid_is_idif(fid)) {
+		/* not an IDIF: the FID is stored as-is */
+		ostid->oi_fid = *fid;
+		return 0;
+	}
+
+	/* IDIF: store as MDT0 sequence plus the id recovered from the FID */
+	ostid_set_seq_mdt0(ostid);
+	ostid_set_id(ostid, fid_idif_id(fid_seq(fid), fid_oid(fid),
+					fid_ver(fid)));
+	return 0;
+}
+
+/* Check whether the fid is for LAST_ID, i.e. whether its object id is 0.
+ * Returns non-zero if so, 0 otherwise. */
+static inline int fid_is_last_id(const struct lu_fid *fid)
+{
+	return (fid_oid(fid) == 0);
+}
+
+/**
+ * Get inode number from a igif.
+ * The inode number is taken from the sequence field of the fid.
+ * \param fid a igif to get inode number from.
+ * \return inode number for the igif.
+ */
+static inline ino_t lu_igif_ino(const struct lu_fid *fid)
+{
+	return fid_seq(fid);
+}
+
+extern void lustre_swab_ost_id(struct ost_id *oid);
+
+/**
+ * Get inode generation from a igif.
+ * The generation is taken from the object id field of the fid.
+ * \param fid a igif to get inode generation from.
+ * \return inode generation for the igif.
+ */
+static inline __u32 lu_igif_gen(const struct lu_fid *fid)
+{
+	return fid_oid(fid);
+}
+
+/**
+ * Build igif from the inode number/generation.
+ * \param fid fid to fill in: f_seq = \a ino, f_oid = \a gen, f_ver = 0.
+ * \param ino inode number.
+ * \param gen inode generation.
+ */
+static inline void lu_igif_build(struct lu_fid *fid, __u32 ino, __u32 gen)
+{
+	fid->f_seq = ino;
+	fid->f_oid = gen;
+	fid->f_ver = 0;
+}
+
+/*
+ * Fids are transmitted across network (in the sender byte-ordering),
+ * and stored on disk in big-endian order.
+ */
+/* Copy \a src to \a dst, converting every field from CPU to little-endian. */
+static inline void fid_cpu_to_le(struct lu_fid *dst, const struct lu_fid *src)
+{
+	/* the three converted fields must account for the whole struct */
+	CLASSERT(sizeof(*src) ==
+		 sizeof(fid_seq(src)) + sizeof(fid_oid(src)) +
+		 sizeof(fid_ver(src)));
+
+	dst->f_seq = cpu_to_le64(fid_seq(src));
+	dst->f_oid = cpu_to_le32(fid_oid(src));
+	dst->f_ver = cpu_to_le32(fid_ver(src));
+}
+
+/* Copy \a src to \a dst, converting every field from little-endian to CPU. */
+static inline void fid_le_to_cpu(struct lu_fid *dst, const struct lu_fid *src)
+{
+	/* the three converted fields must account for the whole struct */
+	CLASSERT(sizeof(*src) ==
+		 sizeof(fid_seq(src)) + sizeof(fid_oid(src)) +
+		 sizeof(fid_ver(src)));
+
+	dst->f_seq = le64_to_cpu(fid_seq(src));
+	dst->f_oid = le32_to_cpu(fid_oid(src));
+	dst->f_ver = le32_to_cpu(fid_ver(src));
+}
+
+/* Copy \a src to \a dst, converting every field from CPU to big-endian. */
+static inline void fid_cpu_to_be(struct lu_fid *dst, const struct lu_fid *src)
+{
+	/* the three converted fields must account for the whole struct */
+	CLASSERT(sizeof(*src) ==
+		 sizeof(fid_seq(src)) + sizeof(fid_oid(src)) +
+		 sizeof(fid_ver(src)));
+
+	dst->f_seq = cpu_to_be64(fid_seq(src));
+	dst->f_oid = cpu_to_be32(fid_oid(src));
+	dst->f_ver = cpu_to_be32(fid_ver(src));
+}
+
+/* Copy \a src to \a dst, converting every field from big-endian to CPU. */
+static inline void fid_be_to_cpu(struct lu_fid *dst, const struct lu_fid *src)
+{
+	/* the three converted fields must account for the whole struct */
+	CLASSERT(sizeof(*src) ==
+		 sizeof(fid_seq(src)) + sizeof(fid_oid(src)) +
+		 sizeof(fid_ver(src)));
+
+	dst->f_seq = be64_to_cpu(fid_seq(src));
+	dst->f_oid = be32_to_cpu(fid_oid(src));
+	dst->f_ver = be32_to_cpu(fid_ver(src));
+}
+
+/**
+ * Sanity-check a fid.
+ *
+ * \retval 1 if \a fid is non-NULL and either has a sequence of at least
+ *	     FID_SEQ_START with version 0, or is an IGIF, an IDIF, or has a
+ *	     reserved sequence
+ * \retval 0 otherwise
+ */
+static inline int fid_is_sane(const struct lu_fid *fid)
+{
+	if (fid == NULL)
+		return 0;
+
+	if (fid_seq(fid) >= FID_SEQ_START && fid_ver(fid) == 0)
+		return 1;
+
+	return fid_is_igif(fid) || fid_is_idif(fid) ||
+	       fid_seq_is_rsvd(fid_seq(fid));
+}
+
+/* Return 1 if both the sequence and the object id of \a fid are zero,
+ * 0 otherwise.  The version field is not examined. */
+static inline int fid_is_zero(const struct lu_fid *fid)
+{
+	if (fid_seq(fid) != 0)
+		return 0;
+
+	return fid_oid(fid) == 0;
+}
+
+extern void lustre_swab_lu_fid(struct lu_fid *fid);
+extern void lustre_swab_lu_seq_range(struct lu_seq_range *range);
+
+/* Return 1 if \a f0 and \a f1 are bytewise identical, 0 otherwise. */
+static inline int lu_fid_eq(const struct lu_fid *f0, const struct lu_fid *f1)
+{
+	/* memcmp() is only valid if the struct has no alignment padding. */
+	CLASSERT(sizeof(*f0) ==
+		 sizeof(f0->f_seq) + sizeof(f0->f_oid) + sizeof(f0->f_ver));
+
+	return memcmp(f0, f1, sizeof(*f0)) == 0;
+}
+
+/*
+ * Three-way comparison: evaluates to 0 if val0 == val1, +1 if val0 > val1,
+ * and -1 otherwise.  Each argument is evaluated exactly once (GNU statement
+ * expression with typeof temporaries).
+ */
+#define __diff_normalize(val0, val1)			    \
+({							      \
+	typeof(val0) __val0 = (val0);			   \
+	typeof(val1) __val1 = (val1);			   \
+								\
+	(__val0 == __val1 ? 0 : __val0 > __val1 ? +1 : -1);     \
+})
+
+/**
+ * Order two fids by sequence, then object id, then version.
+ *
+ * \retval -1, 0 or +1 when \a f0 sorts before, equal to or after \a f1
+ */
+static inline int lu_fid_cmp(const struct lu_fid *f0,
+			     const struct lu_fid *f1)
+{
+	int rc;
+
+	rc = __diff_normalize(fid_seq(f0), fid_seq(f1));
+	if (rc != 0)
+		return rc;
+
+	rc = __diff_normalize(fid_oid(f0), fid_oid(f1));
+	if (rc != 0)
+		return rc;
+
+	return __diff_normalize(fid_ver(f0), fid_ver(f1));
+}
+
+/**
+ * Convert an ost_id from CPU to little-endian byte order.
+ *
+ * Note the argument order: source first, destination second.
+ */
+static inline void ostid_cpu_to_le(struct ost_id *src_oi,
+				   struct ost_id *dst_oi)
+{
+	if (!fid_seq_is_mdt0(ostid_seq(src_oi))) {
+		/* FID layout */
+		fid_cpu_to_le(&dst_oi->oi_fid, &src_oi->oi_fid);
+		return;
+	}
+
+	/* id/seq layout */
+	dst_oi->oi.oi_id = cpu_to_le64(src_oi->oi.oi_id);
+	dst_oi->oi.oi_seq = cpu_to_le64(src_oi->oi.oi_seq);
+}
+
+/**
+ * Convert an ost_id from little-endian to CPU byte order.
+ *
+ * Note the argument order: source first, destination second.
+ * NOTE(review): the layout test reads \a src_oi before it is byte-swapped;
+ * confirm this is intended on big-endian hosts.
+ */
+static inline void ostid_le_to_cpu(struct ost_id *src_oi,
+				   struct ost_id *dst_oi)
+{
+	if (!fid_seq_is_mdt0(ostid_seq(src_oi))) {
+		/* FID layout */
+		fid_le_to_cpu(&dst_oi->oi_fid, &src_oi->oi_fid);
+		return;
+	}
+
+	/* id/seq layout */
+	dst_oi->oi.oi_id = le64_to_cpu(src_oi->oi.oi_id);
+	dst_oi->oi.oi_seq = le64_to_cpu(src_oi->oi.oi_seq);
+}
+
+/** @} lu_fid */
+
+/** \defgroup lu_dir lu_dir
+ * @{ */
+
+/**
+ * Enumeration of possible directory entry attributes.
+ *
+ * Attributes follow directory entry header in the order they appear in this
+ * enumeration.
+ */
+enum lu_dirent_attrs {
+	LUDA_FID		= 0x0001,
+	LUDA_TYPE		= 0x0002,
+	LUDA_64BITHASH		= 0x0004,
+
+	/* The following attrs are used for MDT internal only,
+	 * not visible to client */
+
+	/* Verify the dirent consistency */
+	LUDA_VERIFY		= 0x8000,
+	/* Only check but not repair the dirent inconsistency */
+	LUDA_VERIFY_DRYRUN	= 0x4000,
+	/* The dirent has been repaired, or to be repaired (dryrun). */
+	LUDA_REPAIR		= 0x2000,
+	/* The system is upgraded, has been or to be repaired (dryrun). */
+	LUDA_UPGRADE		= 0x1000,
+	/* Ignore this record, go to next directly. */
+	LUDA_IGNORE		= 0x0800,
+};
+
+/* Union of the MDT-internal bits of enum lu_dirent_attrs
+ * (LUDA_VERIFY | LUDA_VERIFY_DRYRUN | LUDA_REPAIR | LUDA_UPGRADE |
+ * LUDA_IGNORE). */
+#define LU_DIRENT_ATTRS_MASK	0xf800
+
+/**
+ * Layout of readdir pages, as transmitted on wire.
+ *
+ * The accessors in this file read lde_reclen, lde_namelen and lde_attrs
+ * through le16_to_cpu()/le32_to_cpu(), i.e. as little-endian values.
+ */
+struct lu_dirent {
+	/** valid if LUDA_FID is set. */
+	struct lu_fid lde_fid;
+	/** a unique entry identifier: a hash or an offset. */
+	__u64	 lde_hash;
+	/** total record length, including all attributes. */
+	__u16	 lde_reclen;
+	/** name length */
+	__u16	 lde_namelen;
+	/** optional variable size attributes following this entry.
+	 *  taken from enum lu_dirent_attrs.
+	 */
+	__u32	 lde_attrs;
+	/** name is followed by the attributes indicated in ->lde_attrs, in
+	 *  their natural order. After the last attribute, padding bytes are
+	 *  added to make ->lde_reclen a multiple of 8.
+	 */
+	char	  lde_name[0];
+};
+
+/*
+ * Definitions of optional directory entry attributes formats.
+ *
+ * Individual attributes do not have their length encoded in a generic way. It
+ * is assumed that consumer of an attribute knows its format. This means that
+ * it is impossible to skip over an unknown attribute, except by skipping over all
+ * remaining attributes (by using ->lde_reclen), which is not too
+ * constraining, because new server versions will append new attributes at
+ * the end of an entry.
+ */
+
+/**
+ * Fid directory attribute: a fid of an object referenced by the entry. This
+ * will be almost always requested by the client and supplied by the server.
+ *
+ * Aligned to 8 bytes.
+ */
+/* To have compatibility with 1.8, lets have fid in lu_dirent struct. */
+
+/**
+ * File type.
+ *
+ * Aligned to 2 bytes.  lu_dirent_calc_size() accounts for this attribute
+ * when LUDA_TYPE is set.
+ */
+struct luda_type {
+	__u16 lt_type;
+};
+
+/**
+ * Header of a directory page, followed by the directory entries.
+ */
+struct lu_dirpage {
+	__u64	    ldp_hash_start;
+	/** see LDF_COLLIDE for its relation to the last entry's lde_hash */
+	__u64	    ldp_hash_end;
+	/** enum lu_dirpage_flags; read via le32_to_cpu() in
+	 *  lu_dirent_start() */
+	__u32	    ldp_flags;
+	__u32	    ldp_pad0;
+	/** first entry; only meaningful when LDF_EMPTY is not set */
+	struct lu_dirent ldp_entries[0];
+};
+
+/** Bit flags stored in lu_dirpage::ldp_flags. */
+enum lu_dirpage_flags {
+	/**
+	 * dirpage contains no entry.
+	 */
+	LDF_EMPTY   = 1 << 0,
+	/**
+	 * last entry's lde_hash equals ldp_hash_end.
+	 */
+	LDF_COLLIDE = 1 << 1
+};
+
+/**
+ * Return the first entry of directory page \a dp, or NULL if the page is
+ * flagged LDF_EMPTY.
+ */
+static inline struct lu_dirent *lu_dirent_start(struct lu_dirpage *dp)
+{
+	if (!(le32_to_cpu(dp->ldp_flags) & LDF_EMPTY))
+		return dp->ldp_entries;
+
+	return NULL;
+}
+
+/**
+ * Return the entry located lde_reclen bytes after \a ent, or NULL if
+ * lde_reclen is zero.
+ */
+static inline struct lu_dirent *lu_dirent_next(struct lu_dirent *ent)
+{
+	if (le16_to_cpu(ent->lde_reclen) == 0)
+		return NULL;
+
+	return ((void *)ent) + le16_to_cpu(ent->lde_reclen);
+}
+
+/**
+ * Compute the record size of a directory entry whose name is \a namelen
+ * bytes long and which carries the attributes in \a attr.
+ *
+ * With LUDA_TYPE set, the name is padded to the alignment of struct
+ * luda_type and that structure is appended.  The result is always rounded
+ * up to a multiple of 8.
+ */
+static inline int lu_dirent_calc_size(int namelen, __u16 attr)
+{
+	int size;
+
+	if (!(attr & LUDA_TYPE)) {
+		size = sizeof(struct lu_dirent) + namelen;
+	} else {
+		const unsigned align = sizeof(struct luda_type) - 1;
+
+		size = (sizeof(struct lu_dirent) + namelen + align) & ~align;
+		size += sizeof(struct luda_type);
+	}
+
+	return (size + 7) & ~7;
+}
+
+/**
+ * Return the size of \a ent: its lde_reclen when that is non-zero,
+ * otherwise the size computed from its name length and attributes.
+ */
+static inline int lu_dirent_size(struct lu_dirent *ent)
+{
+	if (le16_to_cpu(ent->lde_reclen) != 0)
+		return le16_to_cpu(ent->lde_reclen);
+
+	return lu_dirent_calc_size(le16_to_cpu(ent->lde_namelen),
+				   le32_to_cpu(ent->lde_attrs));
+}
+
+/* NOTE(review): presumably the hash/offset value marking the end of a
+ * directory - confirm against the users of this constant. */
+#define MDS_DIR_END_OFF 0xfffffffffffffffeULL
+
+/**
+ * MDS_READPAGE page size
+ *
+ * This is the directory page size packed in MDS_READPAGE RPC.
+ * It's different than PAGE_CACHE_SIZE because the client needs to
+ * access the struct lu_dirpage header packed at the beginning of
+ * the "page", and without a fixed size there would be no way to find
+ * the lu_dirpage header if client and server PAGE_CACHE_SIZE differ.
+ */
+#define LU_PAGE_SHIFT 12
+#define LU_PAGE_SIZE  (1UL << LU_PAGE_SHIFT)
+#define LU_PAGE_MASK  (~(LU_PAGE_SIZE - 1))
+
+/* number of LU_PAGE_SIZE pages in one PAGE_CACHE_SIZE page */
+#define LU_PAGE_COUNT (1 << (PAGE_CACHE_SHIFT - LU_PAGE_SHIFT))
+
+/** @} lu_dir */
+
+/* Opaque 64-bit handle.  A cookie of 0 means "not in use" (see
+ * lustre_handle_is_used()). */
+struct lustre_handle {
+	__u64 cookie;
+};
+/* NOTE(review): presumably a poison value for handles that must no longer
+ * be used - confirm against its users. */
+#define DEAD_HANDLE_MAGIC 0xdeadbeefcafebabeULL
+
+/**
+ * Tell whether a handle is in use.
+ *
+ * \param lh handle to test; only read, hence const
+ * \retval 1 if the cookie is non-zero
+ * \retval 0 if the cookie is zero
+ */
+static inline int lustre_handle_is_used(const struct lustre_handle *lh)
+{
+	return lh->cookie != 0ull;
+}
+
+/* Return 1 if \a lh1 and \a lh2 carry the same cookie, 0 otherwise. */
+static inline int lustre_handle_equal(const struct lustre_handle *lh1,
+				      const struct lustre_handle *lh2)
+{
+	return lh1->cookie == lh2->cookie;
+}
+
+/**
+ * Copy the cookie of \a src into \a tgt.
+ *
+ * \param tgt handle to overwrite
+ * \param src handle to copy from; only read, hence const
+ */
+static inline void lustre_handle_copy(struct lustre_handle *tgt,
+				      const struct lustre_handle *src)
+{
+	tgt->cookie = src->cookie;
+}
+
+/* flags for lm_flags (struct lustre_msg_v2); each is a distinct bit */
+#define MSGHDR_AT_SUPPORT	       0x1
+#define MSGHDR_CKSUM_INCOMPAT18	 0x2
+
+/* "struct lustre_msg" is an alias for the v2 message header */
+#define lustre_msg lustre_msg_v2
+/* we depend on this structure to be 8-byte aligned */
+/* this type is only endian-adjusted in lustre_unpack_msg() */
+struct lustre_msg_v2 {
+	__u32 lm_bufcount;
+	__u32 lm_secflvr;
+	__u32 lm_magic;
+	__u32 lm_repsize;
+	__u32 lm_cksum;
+	__u32 lm_flags;		/* MSGHDR_* flags */
+	__u32 lm_padding_2;
+	__u32 lm_padding_3;
+	__u32 lm_buflens[0];	/* variable-length trailing array */
+};
+
+/* without gss, ptlrpc_body is put at the first buffer. */
+#define PTLRPC_NUM_VERSIONS     4
+#define JOBSTATS_JOBID_SIZE     32  /* 32 bytes string */
+/* Same field layout as struct ptlrpc_body_v2, with pb_jobid appended. */
+struct ptlrpc_body_v3 {
+	struct lustre_handle pb_handle;
+	__u32 pb_type;
+	__u32 pb_version;
+	__u32 pb_opc;
+	__u32 pb_status;
+	__u64 pb_last_xid;
+	__u64 pb_last_seen;
+	__u64 pb_last_committed;
+	__u64 pb_transno;
+	__u32 pb_flags;
+	__u32 pb_op_flags;
+	__u32 pb_conn_cnt;
+	__u32 pb_timeout;  /* for req, the deadline, for rep, the service est */
+	__u32 pb_service_time; /* for rep, actual service time */
+	__u32 pb_limit;
+	__u64 pb_slv;
+	/* VBR: pre-versions */
+	__u64 pb_pre_versions[PTLRPC_NUM_VERSIONS];
+	/* padding for future needs */
+	__u64 pb_padding[4];
+	char  pb_jobid[JOBSTATS_JOBID_SIZE];
+};
+/* "struct ptlrpc_body" is an alias for the v3 layout */
+#define ptlrpc_body     ptlrpc_body_v3
+
+/* Same field layout as struct ptlrpc_body_v3 without the trailing
+ * pb_jobid. */
+struct ptlrpc_body_v2 {
+	struct lustre_handle pb_handle;
+	__u32 pb_type;
+	__u32 pb_version;
+	__u32 pb_opc;
+	__u32 pb_status;
+	__u64 pb_last_xid;
+	__u64 pb_last_seen;
+	__u64 pb_last_committed;
+	__u64 pb_transno;
+	__u32 pb_flags;
+	__u32 pb_op_flags;
+	__u32 pb_conn_cnt;
+	__u32 pb_timeout;  /* for req, the deadline, for rep, the service est */
+	__u32 pb_service_time; /* for rep, actual service time, also used for
+				  net_latency of req */
+	__u32 pb_limit;
+	__u64 pb_slv;
+	/* VBR: pre-versions */
+	__u64 pb_pre_versions[PTLRPC_NUM_VERSIONS];
+	/* padding for future needs */
+	__u64 pb_padding[4];
+};
+
+extern void lustre_swab_ptlrpc_body(struct ptlrpc_body *pb);
+
+/* message body offset for lustre_msg_v2 */
+/* The *_OFF values below are buffer indexes within a message, not byte
+ * offsets - NOTE(review): inferred from the small consecutive values,
+ * confirm against the users. */
+/* ptlrpc body offset in all request/reply messages */
+#define MSG_PTLRPC_BODY_OFF	     0
+
+/* normal request/reply message record offset */
+#define REQ_REC_OFF		     1
+#define REPLY_REC_OFF		   1
+
+/* ldlm request message body offset */
+#define DLM_LOCKREQ_OFF		 1 /* lockreq offset */
+#define DLM_REQ_REC_OFF		 2 /* normal dlm request record offset */
+
+/* ldlm intent lock message body offset */
+#define DLM_INTENT_IT_OFF	       2 /* intent lock it offset */
+#define DLM_INTENT_REC_OFF	      3 /* intent lock record offset */
+
+/* ldlm reply message body offset */
+#define DLM_LOCKREPLY_OFF	       1 /* lockrep offset */
+#define DLM_REPLY_REC_OFF	       2 /* reply record offset */
+
+/** only use in req->rq_{req,rep}_swab_mask */
+#define MSG_PTLRPC_HEADER_OFF	   31
+
+/* Flags that are operation-specific go in the top 16 bits. */
+#define MSG_OP_FLAG_MASK   0xffff0000
+#define MSG_OP_FLAG_SHIFT  16
+
+/* Flags that apply to all requests are in the bottom 16 bits */
+#define MSG_GEN_FLAG_MASK     0x0000ffff
+#define MSG_LAST_REPLAY	   0x0001
+#define MSG_RESENT		0x0002
+#define MSG_REPLAY		0x0004
+/* #define MSG_AT_SUPPORT	 0x0008
+ * This was used in early prototypes of adaptive timeouts, and while there
+ * shouldn't be any users of that code there also isn't a need for using this
+ * bit. Defer usage until at least 1.10 to avoid potential conflict. */
+#define MSG_DELAY_REPLAY	  0x0010
+#define MSG_VERSION_REPLAY	0x0020
+#define MSG_REQ_REPLAY_DONE       0x0040
+#define MSG_LOCK_REPLAY_DONE      0x0080
+
+/*
+ * Flags for all connect opcodes (MDS_CONNECT, OST_CONNECT).
+ * Each flag is a distinct bit; 0x8 (formerly MSG_CONNECT_PEER) is
+ * commented out below and not assigned.
+ */
+
+#define MSG_CONNECT_RECOVERING  0x00000001
+#define MSG_CONNECT_RECONNECT   0x00000002
+#define MSG_CONNECT_REPLAYABLE  0x00000004
+//#define MSG_CONNECT_PEER	0x8
+#define MSG_CONNECT_LIBCLIENT   0x00000010
+#define MSG_CONNECT_INITIAL     0x00000020
+#define MSG_CONNECT_ASYNC       0x00000040
+#define MSG_CONNECT_NEXT_VER    0x00000080 /* use next version of lustre_msg */
+#define MSG_CONNECT_TRANSNO     0x00000100 /* report transno */
+
+/* Connect flags: one bit each in the 64-bit
+ * obd_connect_data::ocd_connect_flags (test with OCD_HAS_FLAG()). */
+#define OBD_CONNECT_RDONLY		0x1ULL /*client has read-only access*/
+#define OBD_CONNECT_INDEX		 0x2ULL /*connect specific LOV idx */
+#define OBD_CONNECT_MDS		   0x4ULL /*connect from MDT to OST */
+#define OBD_CONNECT_GRANT		 0x8ULL /*OSC gets grant at connect */
+#define OBD_CONNECT_SRVLOCK	      0x10ULL /*server takes locks for cli */
+#define OBD_CONNECT_VERSION	      0x20ULL /*Lustre versions in ocd */
+#define OBD_CONNECT_REQPORTAL	    0x40ULL /*Separate non-IO req portal */
+#define OBD_CONNECT_ACL		  0x80ULL /*access control lists */
+#define OBD_CONNECT_XATTR	       0x100ULL /*client use extended attr */
+#define OBD_CONNECT_CROW		0x200ULL /*MDS+OST create obj on write*/
+#define OBD_CONNECT_TRUNCLOCK	   0x400ULL /*locks on server for punch */
+#define OBD_CONNECT_TRANSNO	     0x800ULL /*replay sends init transno */
+#define OBD_CONNECT_IBITS	      0x1000ULL /*support for inodebits locks*/
+#define OBD_CONNECT_JOIN	       0x2000ULL /*files can be concatenated.
+						  *We do not support JOIN FILE
+						  *anymore, reserve this flags
+						  *just for preventing such bit
+						  *to be reused.*/
+#define OBD_CONNECT_ATTRFID	    0x4000ULL /*Server can GetAttr By Fid*/
+#define OBD_CONNECT_NODEVOH	    0x8000ULL /*No open hndl on specl nodes*/
+#define OBD_CONNECT_RMT_CLIENT	0x10000ULL /*Remote client */
+#define OBD_CONNECT_RMT_CLIENT_FORCE  0x20000ULL /*Remote client by force */
+#define OBD_CONNECT_BRW_SIZE	  0x40000ULL /*Max bytes per rpc */
+#define OBD_CONNECT_QUOTA64	   0x80000ULL /*Not used since 2.4 */
+#define OBD_CONNECT_MDS_CAPA	 0x100000ULL /*MDS capability */
+#define OBD_CONNECT_OSS_CAPA	 0x200000ULL /*OSS capability */
+#define OBD_CONNECT_CANCELSET	0x400000ULL /*Early batched cancels. */
+#define OBD_CONNECT_SOM	      0x800000ULL /*Size on MDS */
+#define OBD_CONNECT_AT	      0x1000000ULL /*client uses AT */
+#define OBD_CONNECT_LRU_RESIZE      0x2000000ULL /*LRU resize feature. */
+#define OBD_CONNECT_MDS_MDS	 0x4000000ULL /*MDS-MDS connection */
+#define OBD_CONNECT_REAL	    0x8000000ULL /*real connection */
+#define OBD_CONNECT_CHANGE_QS      0x10000000ULL /*Not used since 2.4 */
+#define OBD_CONNECT_CKSUM	  0x20000000ULL /*support several cksum algos*/
+#define OBD_CONNECT_FID	    0x40000000ULL /*FID is supported by server */
+#define OBD_CONNECT_VBR	    0x80000000ULL /*version based recovery */
+#define OBD_CONNECT_LOV_V3	0x100000000ULL /*client supports LOV v3 EA */
+#define OBD_CONNECT_GRANT_SHRINK  0x200000000ULL /* support grant shrink */
+#define OBD_CONNECT_SKIP_ORPHAN   0x400000000ULL /* don't reuse orphan objids */
+#define OBD_CONNECT_MAX_EASIZE    0x800000000ULL /* preserved for large EA */
+#define OBD_CONNECT_FULL20       0x1000000000ULL /* it is 2.0 client */
+#define OBD_CONNECT_LAYOUTLOCK   0x2000000000ULL /* client uses layout lock */
+#define OBD_CONNECT_64BITHASH    0x4000000000ULL /* client supports 64-bits
+						  * directory hash */
+#define OBD_CONNECT_MAXBYTES     0x8000000000ULL /* max stripe size */
+#define OBD_CONNECT_IMP_RECOV   0x10000000000ULL /* imp recovery support */
+#define OBD_CONNECT_JOBSTATS    0x20000000000ULL /* jobid in ptlrpc_body */
+#define OBD_CONNECT_UMASK       0x40000000000ULL /* create uses client umask */
+#define OBD_CONNECT_EINPROGRESS 0x80000000000ULL /* client handles -EINPROGRESS
+						  * RPC error properly */
+#define OBD_CONNECT_GRANT_PARAM 0x100000000000ULL/* extra grant params used for
+						  * finer space reservation */
+#define OBD_CONNECT_FLOCK_OWNER 0x200000000000ULL /* for the fixed 1.8
+						   * policy and 2.x server */
+#define OBD_CONNECT_LVB_TYPE	0x400000000000ULL /* variable type of LVB */
+#define OBD_CONNECT_NANOSEC_TIME 0x800000000000ULL /* nanosecond timestamps */
+#define OBD_CONNECT_LIGHTWEIGHT 0x1000000000000ULL/* lightweight connection */
+#define OBD_CONNECT_SHORTIO     0x2000000000000ULL/* short io */
+#define OBD_CONNECT_PINGLESS	0x4000000000000ULL/* pings not required */
+/* XXX README XXX:
+ * Please DO NOT add flag values here before first ensuring that this same
+ * flag value is not in use on some other branch.  Please clear any such
+ * changes with senior engineers before starting to use a new flag.  Then,
+ * submit a small patch against EVERY branch that ONLY adds the new flag,
+ * updates obd_connect_names[] for lprocfs_rd_connect_flags(), adds the
+ * flag to check_obd_connect_data(), and updates wiretests accordingly, so it
+ * can be approved and landed easily to reserve the flag for future use. */
+
+/* The MNE_SWAB flag is overloading the MDS_MDS bit only for the MGS
+ * connection.  It is a temporary bug fix for Imperative Recovery interop
+ * between 2.2 and 2.3 x86/ppc nodes, and can be removed when interop for
+ * 2.2 clients/servers is no longer needed.  LU-1252/LU-1644. */
+#define OBD_CONNECT_MNE_SWAB		 OBD_CONNECT_MDS_MDS
+
+/* Evaluates to 1 if OBD_CONNECT_<flg> is set in (ocd)->ocd_connect_flags,
+ * 0 otherwise.  \a flg is the flag name without the OBD_CONNECT_ prefix. */
+#define OCD_HAS_FLAG(ocd, flg)  \
+	(!!((ocd)->ocd_connect_flags & OBD_CONNECT_##flg))
+
+
+/* alias used in the *_CONNECT_SUPPORTED masks below */
+#define LRU_RESIZE_CONNECT_FLAG OBD_CONNECT_LRU_RESIZE
+
+/* Masks of OBD_CONNECT_* flags, one per target type.
+ * NOTE(review): judging by the names these are the flags each target type
+ * accepts at connect time - confirm against the users. */
+#define MDT_CONNECT_SUPPORTED  (OBD_CONNECT_RDONLY | OBD_CONNECT_VERSION | \
+				OBD_CONNECT_ACL | OBD_CONNECT_XATTR | \
+				OBD_CONNECT_IBITS | \
+				OBD_CONNECT_NODEVOH | OBD_CONNECT_ATTRFID | \
+				OBD_CONNECT_CANCELSET | OBD_CONNECT_AT | \
+				OBD_CONNECT_RMT_CLIENT | \
+				OBD_CONNECT_RMT_CLIENT_FORCE | \
+				OBD_CONNECT_BRW_SIZE | OBD_CONNECT_MDS_CAPA | \
+				OBD_CONNECT_OSS_CAPA | OBD_CONNECT_MDS_MDS | \
+				OBD_CONNECT_FID | LRU_RESIZE_CONNECT_FLAG | \
+				OBD_CONNECT_VBR | OBD_CONNECT_LOV_V3 | \
+				OBD_CONNECT_SOM | OBD_CONNECT_FULL20 | \
+				OBD_CONNECT_64BITHASH | OBD_CONNECT_JOBSTATS | \
+				OBD_CONNECT_EINPROGRESS | \
+				OBD_CONNECT_LIGHTWEIGHT | OBD_CONNECT_UMASK | \
+				OBD_CONNECT_LVB_TYPE | OBD_CONNECT_LAYOUTLOCK |\
+				OBD_CONNECT_PINGLESS)
+#define OST_CONNECT_SUPPORTED  (OBD_CONNECT_SRVLOCK | OBD_CONNECT_GRANT | \
+				OBD_CONNECT_REQPORTAL | OBD_CONNECT_VERSION | \
+				OBD_CONNECT_TRUNCLOCK | OBD_CONNECT_INDEX | \
+				OBD_CONNECT_BRW_SIZE | OBD_CONNECT_OSS_CAPA | \
+				OBD_CONNECT_CANCELSET | OBD_CONNECT_AT | \
+				LRU_RESIZE_CONNECT_FLAG | OBD_CONNECT_CKSUM | \
+				OBD_CONNECT_RMT_CLIENT | \
+				OBD_CONNECT_RMT_CLIENT_FORCE | OBD_CONNECT_VBR | \
+				OBD_CONNECT_MDS | OBD_CONNECT_SKIP_ORPHAN | \
+				OBD_CONNECT_GRANT_SHRINK | OBD_CONNECT_FULL20 | \
+				OBD_CONNECT_64BITHASH | OBD_CONNECT_MAXBYTES | \
+				OBD_CONNECT_MAX_EASIZE | \
+				OBD_CONNECT_EINPROGRESS | \
+				OBD_CONNECT_JOBSTATS | \
+				OBD_CONNECT_LIGHTWEIGHT | OBD_CONNECT_LVB_TYPE|\
+				OBD_CONNECT_LAYOUTLOCK | OBD_CONNECT_FID | \
+				OBD_CONNECT_PINGLESS)
+#define ECHO_CONNECT_SUPPORTED (0)
+#define MGS_CONNECT_SUPPORTED  (OBD_CONNECT_VERSION | OBD_CONNECT_AT | \
+				OBD_CONNECT_FULL20 | OBD_CONNECT_IMP_RECOV | \
+				OBD_CONNECT_MNE_SWAB | OBD_CONNECT_PINGLESS)
+
+/* Features required for this version of the client to work with server */
+#define CLIENT_CONNECT_MDT_REQD (OBD_CONNECT_IBITS | OBD_CONNECT_FID | \
+				 OBD_CONNECT_FULL20)
+
+/* Pack a four-part version into one integer: major in bits 24 and up,
+ * minor in bits 16-23, patch in bits 8-15 and fix in bits 0-7.  The
+ * accessors below extract each part again as an 8-bit value. */
+#define OBD_OCD_VERSION(major,minor,patch,fix) (((major)<<24) + ((minor)<<16) +\
+						((patch)<<8) + (fix))
+#define OBD_OCD_VERSION_MAJOR(version) ((int)((version)>>24)&255)
+#define OBD_OCD_VERSION_MINOR(version) ((int)((version)>>16)&255)
+#define OBD_OCD_VERSION_PATCH(version) ((int)((version)>>8)&255)
+#define OBD_OCD_VERSION_FIX(version)   ((int)(version)&255)
+
+/* This structure is used for both request and reply.
+ *
+ * If we eventually have separate connect data for different types, which we
+ * almost certainly will, then perhaps we stick a union in here.
+ *
+ * These are the same fields as the leading part of struct obd_connect_data,
+ * which additionally carries trailing padding. */
+struct obd_connect_data_v1 {
+	__u64 ocd_connect_flags; /* OBD_CONNECT_* per above */
+	__u32 ocd_version;	 /* lustre release version number */
+	__u32 ocd_grant;	 /* initial cache grant amount (bytes) */
+	__u32 ocd_index;	 /* LOV index to connect to */
+	__u32 ocd_brw_size;	 /* Maximum BRW size in bytes, must be 2^n */
+	__u64 ocd_ibits_known;   /* inode bits this client understands */
+	__u8  ocd_blocksize;     /* log2 of the backend filesystem blocksize */
+	__u8  ocd_inodespace;    /* log2 of the per-inode space consumption */
+	__u16 ocd_grant_extent;  /* per-extent grant overhead, in 1K blocks */
+	__u32 ocd_unused;	/* also fix lustre_swab_connect */
+	__u64 ocd_transno;       /* first transno from client to be replayed */
+	__u32 ocd_group;	 /* MDS group on OST */
+	__u32 ocd_cksum_types;   /* supported checksum algorithms */
+	__u32 ocd_max_easize;    /* How big LOV EA can be on MDS */
+	__u32 ocd_instance;      /* also fix lustre_swab_connect */
+	__u64 ocd_maxbytes;      /* Maximum stripe size in bytes */
+};
+
+/* Connect data: the fields of struct obd_connect_data_v1 followed by
+ * fifteen 64-bit padding words reserved since 2.1.0. */
+struct obd_connect_data {
+	__u64 ocd_connect_flags; /* OBD_CONNECT_* per above */
+	__u32 ocd_version;	 /* lustre release version number */
+	__u32 ocd_grant;	 /* initial cache grant amount (bytes) */
+	__u32 ocd_index;	 /* LOV index to connect to */
+	__u32 ocd_brw_size;	 /* Maximum BRW size in bytes */
+	__u64 ocd_ibits_known;   /* inode bits this client understands */
+	__u8  ocd_blocksize;     /* log2 of the backend filesystem blocksize */
+	__u8  ocd_inodespace;    /* log2 of the per-inode space consumption */
+	__u16 ocd_grant_extent;  /* per-extent grant overhead, in 1K blocks */
+	__u32 ocd_unused;	/* also fix lustre_swab_connect */
+	__u64 ocd_transno;       /* first transno from client to be replayed */
+	__u32 ocd_group;	 /* MDS group on OST */
+	__u32 ocd_cksum_types;   /* supported checksum algorithms */
+	__u32 ocd_max_easize;    /* How big LOV EA can be on MDS */
+	__u32 ocd_instance;      /* instance # of this target */
+	__u64 ocd_maxbytes;      /* Maximum stripe size in bytes */
+	/* Fields after ocd_maxbytes are only accessible by the receiver
+	 * if the corresponding flag in ocd_connect_flags is set. Accessing
+	 * any field after ocd_maxbytes on the receiver without a valid flag
+	 * may result in out-of-bound memory access and kernel oops. */
+	__u64 padding1;	  /* added 2.1.0. also fix lustre_swab_connect */
+	__u64 padding2;	  /* added 2.1.0. also fix lustre_swab_connect */
+	__u64 padding3;	  /* added 2.1.0. also fix lustre_swab_connect */
+	__u64 padding4;	  /* added 2.1.0. also fix lustre_swab_connect */
+	__u64 padding5;	  /* added 2.1.0. also fix lustre_swab_connect */
+	__u64 padding6;	  /* added 2.1.0. also fix lustre_swab_connect */
+	__u64 padding7;	  /* added 2.1.0. also fix lustre_swab_connect */
+	__u64 padding8;	  /* added 2.1.0. also fix lustre_swab_connect */
+	__u64 padding9;	  /* added 2.1.0. also fix lustre_swab_connect */
+	__u64 paddingA;	  /* added 2.1.0. also fix lustre_swab_connect */
+	__u64 paddingB;	  /* added 2.1.0. also fix lustre_swab_connect */
+	__u64 paddingC;	  /* added 2.1.0. also fix lustre_swab_connect */
+	__u64 paddingD;	  /* added 2.1.0. also fix lustre_swab_connect */
+	__u64 paddingE;	  /* added 2.1.0. also fix lustre_swab_connect */
+	__u64 paddingF;	  /* added 2.1.0. also fix lustre_swab_connect */
+};
+/* XXX README XXX:
+ * Please DO NOT use any fields here before first ensuring that this same
+ * field is not in use on some other branch.  Please clear any such changes
+ * with senior engineers before starting to use a new field.  Then, submit
+ * a small patch against EVERY branch that ONLY adds the new field along with
+ * the matching OBD_CONNECT flag, so that can be approved and landed easily to
+ * reserve the flag for future use. */
+
+
+extern void lustre_swab_connect(struct obd_connect_data *ocd);
+
+/*
+ * Supported checksum algorithms. Up to 32 checksum types are supported.
+ * (32-bit mask stored in obd_connect_data::ocd_cksum_types)
+ * Each algorithm is a distinct bit so that several can be OR-ed together.
+ * Please update DECLARE_CKSUM_NAME/OBD_CKSUM_ALL in obd.h when adding a new
+ * algorithm and also the OBD_FL_CKSUM* flags.
+ */
+typedef enum {
+	OBD_CKSUM_CRC32 = 0x00000001,
+	OBD_CKSUM_ADLER = 0x00000002,
+	OBD_CKSUM_CRC32C= 0x00000004,
+} cksum_type_t;
+
+/*
+ *   OST requests: OBDO & OBD request records
+ */
+
+/* opcodes; values 14 and 15 are not assigned in this enumeration.
+ * OST_LAST_OPC is one past the highest opcode. */
+typedef enum {
+	OST_REPLY      =  0,       /* reply ? */
+	OST_GETATTR    =  1,
+	OST_SETATTR    =  2,
+	OST_READ       =  3,
+	OST_WRITE      =  4,
+	OST_CREATE     =  5,
+	OST_DESTROY    =  6,
+	OST_GET_INFO   =  7,
+	OST_CONNECT    =  8,
+	OST_DISCONNECT =  9,
+	OST_PUNCH      = 10,
+	OST_OPEN       = 11,
+	OST_CLOSE      = 12,
+	OST_STATFS     = 13,
+	OST_SYNC       = 16,
+	OST_SET_INFO   = 17,
+	OST_QUOTACHECK = 18,
+	OST_QUOTACTL   = 19,
+	OST_QUOTA_ADJUST_QUNIT = 20, /* not used since 2.4 */
+	OST_LAST_OPC
+} ost_cmd_t;
+#define OST_FIRST_OPC  OST_REPLY
+
+/* Bit flags carried in o_flags (see the per-flag comments).  Bit
+ * 0x00000080 is not assigned here. */
+enum obdo_flags {
+	OBD_FL_INLINEDATA   = 0x00000001,
+	OBD_FL_OBDMDEXISTS  = 0x00000002,
+	OBD_FL_DELORPHAN    = 0x00000004, /* if set in o_flags delete orphans */
+	OBD_FL_NORPC	= 0x00000008, /* set in o_flags do in OSC not OST */
+	OBD_FL_IDONLY       = 0x00000010, /* set in o_flags only adjust obj id*/
+	OBD_FL_RECREATE_OBJS= 0x00000020, /* recreate missing obj */
+	OBD_FL_DEBUG_CHECK  = 0x00000040, /* echo client/server debug check */
+	OBD_FL_NO_USRQUOTA  = 0x00000100, /* the object's owner is over quota */
+	OBD_FL_NO_GRPQUOTA  = 0x00000200, /* the object's group is over quota */
+	OBD_FL_CREATE_CROW  = 0x00000400, /* object should be create on write */
+	OBD_FL_SRVLOCK      = 0x00000800, /* delegate DLM locking to server */
+	OBD_FL_CKSUM_CRC32  = 0x00001000, /* CRC32 checksum type */
+	OBD_FL_CKSUM_ADLER  = 0x00002000, /* ADLER checksum type */
+	OBD_FL_CKSUM_CRC32C = 0x00004000, /* CRC32C checksum type */
+	OBD_FL_CKSUM_RSVD2  = 0x00008000, /* for future cksum types */
+	OBD_FL_CKSUM_RSVD3  = 0x00010000, /* for future cksum types */
+	OBD_FL_SHRINK_GRANT = 0x00020000, /* object shrink the grant */
+	OBD_FL_MMAP	 = 0x00040000, /* object is mmapped on the client.
+					   * XXX: obsoleted - reserved for old
+					   * clients prior than 2.2 */
+	OBD_FL_RECOV_RESEND = 0x00080000, /* recoverable resent */
+	OBD_FL_NOSPC_BLK    = 0x00100000, /* no more block space on OST */
+
+	/* Note that while these checksum values are currently separate bits,
+	 * in 2.x we can actually allow all values from 1-31 if we wanted. */
+	OBD_FL_CKSUM_ALL    = OBD_FL_CKSUM_CRC32 | OBD_FL_CKSUM_ADLER |
+			      OBD_FL_CKSUM_CRC32C,
+
+	/* mask for local-only flag, which won't be sent over network */
+	OBD_FL_LOCAL_MASK   = 0xF0000000,
+};
+
+/* Magic numbers identifying the LOV EA format (stored in lmm_magic);
+ * LOV_MAGIC is the default and equals LOV_MAGIC_V1. */
+#define LOV_MAGIC_V1      0x0BD10BD0
+#define LOV_MAGIC	 LOV_MAGIC_V1
+#define LOV_MAGIC_JOIN_V1 0x0BD20BD0
+#define LOV_MAGIC_V3      0x0BD30BD0
+
+/*
+ * magic for fully defined striping
+ * the idea is that we should have different magics for striping "hints"
+ * (struct lov_user_md_v[13]) and defined ready-to-use striping (struct
+ * lov_mds_md_v[13]). at the moment the magics are used in wire protocol,
+ * we can't just change it w/o long way preparation, but we still need a
+ * mechanism to allow LOD to differentiate hint versus ready striping.
+ * so, at the moment we do a trick: MDT knows what to expect from request
+ * depending on the case (replay uses ready striping, non-replay req uses
+ * hints), so MDT replaces magic with appropriate one and now LOD can
+ * easily understand what's inside -bzzz
+ */
+#define LOV_MAGIC_V1_DEF  0x0CD10BD0
+#define LOV_MAGIC_V3_DEF  0x0CD30BD0
+
+/* striping patterns (stored in lmm_pattern) */
+#define LOV_PATTERN_RAID0 0x001   /* stripes are used round-robin */
+#define LOV_PATTERN_RAID1 0x002   /* stripes are mirrors of each other */
+#define LOV_PATTERN_FIRST 0x100   /* first stripe is not in round-robin */
+#define LOV_PATTERN_CMOBD 0x200
+
+/* "struct lov_ost_data" is an alias for the v1 layout */
+#define lov_ost_data lov_ost_data_v1
+struct lov_ost_data_v1 {	  /* per-stripe data structure (little-endian)*/
+	struct ost_id l_ost_oi;	  /* OST object ID */
+	__u32 l_ost_gen;	  /* generation of this l_ost_idx */
+	__u32 l_ost_idx;	  /* OST index in LOV (lov_tgt_desc->tgts) */
+};
+
+/* "struct lov_mds_md" is an alias for the v1 layout */
+#define lov_mds_md lov_mds_md_v1
+struct lov_mds_md_v1 {	    /* LOV EA mds/wire data (little-endian) */
+	__u32 lmm_magic;	  /* magic number = LOV_MAGIC_V1 */
+	__u32 lmm_pattern;	/* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */
+	struct ost_id	lmm_oi;	  /* LOV object ID */
+	__u32 lmm_stripe_size;    /* size of stripe in bytes */
+	/* lmm_stripe_count used to be __u32 */
+	__u16 lmm_stripe_count;   /* num stripes in use for this object */
+	__u16 lmm_layout_gen;     /* layout generation number */
+	/* variable-length trailing array, one element per stripe */
+	struct lov_ost_data_v1 lmm_objects[0]; /* per-stripe data */
+};
+
+/**
+ * Sigh, because pre-2.4 uses
+ * struct lov_mds_md_v1 {
+ *	........
+ *	__u64 lmm_object_id;
+ *	__u64 lmm_object_seq;
+ *      ......
+ *      }
+ * to identify the LOV(MDT) object, and lmm_object_seq will be a
+ * normal FID sequence, which makes it hard to fold this conversion
+ * into ostid_to_fid().  So we do the lmm_oi/FID conversion separately.
+ *
+ * We can tell the lmm_oi by this way,
+ * 1.8: lmm_object_id = {inode}, lmm_object_gr = 0
+ * 2.1: lmm_object_id = {oid < 128k}, lmm_object_seq = FID_SEQ_NORMAL
+ * 2.4: lmm_oi.f_seq = FID_SEQ_NORMAL, lmm_oi.f_oid = {oid < 128k},
+ *      lmm_oi.f_ver = 0
+ *
+ * But currently lmm_oi/lsm_oi does not have any "real" usage
+ * except for printing some information, and the user can always
+ * get the real FID from LMA. Besides, checking these multiple cases
+ * would make swab more complicated. So we will keep using id/seq for lmm_oi.
+ */
+
+/*
+ * Fill the object identifier @oi from @fid: the value of fid_seq(@fid) is
+ * stored in oi_seq and the value of fid_oid(@fid) in oi_id.
+ */
+static inline void fid_to_lmm_oi(const struct lu_fid *fid,
+				 struct ost_id *oi)
+{
+	oi->oi.oi_seq = fid_seq(fid);
+	oi->oi.oi_id = fid_oid(fid);
+}
+
+/* Overwrite the sequence part (oi_seq) of @oi with @seq; oi_id is untouched. */
+static inline void lmm_oi_set_seq(struct ost_id *oi, __u64 seq)
+{
+	oi->oi.oi_seq = seq;
+}
+
+/*
+ * Return the object-ID part (oi_id) of @oi.
+ * @oi is only read, so it is taken as a pointer to const; callers holding a
+ * non-const pointer are unaffected.
+ */
+static inline __u64 lmm_oi_id(const struct ost_id *oi)
+{
+	return oi->oi.oi_id;
+}
+
+/*
+ * Return the sequence part (oi_seq) of @oi.
+ * @oi is only read, so it is taken as a pointer to const; callers holding a
+ * non-const pointer are unaffected.
+ */
+static inline __u64 lmm_oi_seq(const struct ost_id *oi)
+{
+	return oi->oi.oi_seq;
+}
+
+/*
+ * Convert both 64-bit fields (oi_id, oi_seq) of @src_oi with le64_to_cpu()
+ * and store the results in @dst_oi.
+ * @src_oi is only read and is therefore const-qualified.  Each field is read
+ * before the matching field is written, so @dst_oi may be the same object as
+ * @src_oi for an in-place conversion.
+ */
+static inline void lmm_oi_le_to_cpu(struct ost_id *dst_oi,
+				    const struct ost_id *src_oi)
+{
+	dst_oi->oi.oi_id = le64_to_cpu(src_oi->oi.oi_id);
+	dst_oi->oi.oi_seq = le64_to_cpu(src_oi->oi.oi_seq);
+}
+
+/*
+ * Convert both 64-bit fields (oi_id, oi_seq) of @src_oi with cpu_to_le64()
+ * and store the results in @dst_oi.
+ * @src_oi is only read and is therefore const-qualified.  Each field is read
+ * before the matching field is written, so @dst_oi may be the same object as
+ * @src_oi for an in-place conversion.
+ */
+static inline void lmm_oi_cpu_to_le(struct ost_id *dst_oi,
+				    const struct ost_id *src_oi)
+{
+	dst_oi->oi.oi_id = cpu_to_le64(src_oi->oi.oi_id);
+	dst_oi->oi.oi_seq = cpu_to_le64(src_oi->oi.oi_seq);
+}
+
+/* extern void lustre_swab_lov_mds_md(struct lov_mds_md *llm); */
+
+/* Size in bytes of a struct lov_mds_md header followed by four (MAX_MD_SIZE)
+ * or one (MIN_MD_SIZE) struct lov_ost_data per-stripe entries. */
+#define MAX_MD_SIZE (sizeof(struct lov_mds_md) + 4 * sizeof(struct lov_ost_data))
+#define MIN_MD_SIZE (sizeof(struct lov_mds_md) + 1 * sizeof(struct lov_ost_data))
+
+#define XATTR_NAME_ACL_ACCESS   "system.posix_acl_access"
+#define XATTR_NAME_ACL_DEFAULT  "system.posix_acl_default"
+#define XATTR_USER_PREFIX       "user."
+#define XATTR_TRUSTED_PREFIX    "trusted."
+#define XATTR_SECURITY_PREFIX   "security."
+#define XATTR_LUSTRE_PREFIX     "lustre."
+
+#define XATTR_NAME_LOV	  "trusted.lov"
+#define XATTR_NAME_LMA	  "trusted.lma"
+#define XATTR_NAME_LMV	  "trusted.lmv"
+#define XATTR_NAME_LINK	 "trusted.link"
+#define XATTR_NAME_FID	  "trusted.fid"
+#define XATTR_NAME_VERSION      "trusted.version"
+#define XATTR_NAME_SOM		"trusted.som"
+#define XATTR_NAME_HSM		"trusted.hsm"
+#define XATTR_NAME_LFSCK_NAMESPACE "trusted.lfsck_namespace"
+
+struct lov_mds_md_v3 {	    /* LOV EA mds/wire data (little-endian) */
+	__u32 lmm_magic;	  /* magic number = LOV_MAGIC_V3 */
+	__u32 lmm_pattern;	/* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */
+	struct ost_id	lmm_oi;	  /* LOV object ID */
+	__u32 lmm_stripe_size;    /* size of stripe in bytes */
+	/* lmm_stripe_count used to be __u32 */
+	__u16 lmm_stripe_count;   /* num stripes in use for this object */
+	__u16 lmm_layout_gen;     /* layout generation number */
+	char  lmm_pool_name[LOV_MAXPOOLNAME]; /* must be 32bit aligned */
+	struct lov_ost_data_v1 lmm_objects[0]; /* per-stripe data */
+};
+
+#define OBD_MD_FLID	(0x00000001ULL) /* object ID */
+#define OBD_MD_FLATIME     (0x00000002ULL) /* access time */
+#define OBD_MD_FLMTIME     (0x00000004ULL) /* data modification time */
+#define OBD_MD_FLCTIME     (0x00000008ULL) /* change time */
+#define OBD_MD_FLSIZE      (0x00000010ULL) /* size */
+#define OBD_MD_FLBLOCKS    (0x00000020ULL) /* allocated blocks count */
+#define OBD_MD_FLBLKSZ     (0x00000040ULL) /* block size */
+#define OBD_MD_FLMODE      (0x00000080ULL) /* access bits (mode & ~S_IFMT) */
+#define OBD_MD_FLTYPE      (0x00000100ULL) /* object type (mode & S_IFMT) */
+#define OBD_MD_FLUID       (0x00000200ULL) /* user ID */
+#define OBD_MD_FLGID       (0x00000400ULL) /* group ID */
+#define OBD_MD_FLFLAGS     (0x00000800ULL) /* flags word */
+#define OBD_MD_FLNLINK     (0x00002000ULL) /* link count */
+#define OBD_MD_FLGENER     (0x00004000ULL) /* generation number */
+/*#define OBD_MD_FLINLINE    (0x00008000ULL)  inline data. used until 1.6.5 */
+#define OBD_MD_FLRDEV      (0x00010000ULL) /* device number */
+#define OBD_MD_FLEASIZE    (0x00020000ULL) /* extended attribute data */
+#define OBD_MD_LINKNAME    (0x00040000ULL) /* symbolic link target */
+#define OBD_MD_FLHANDLE    (0x00080000ULL) /* file/lock handle */
+#define OBD_MD_FLCKSUM     (0x00100000ULL) /* bulk data checksum */
+#define OBD_MD_FLQOS       (0x00200000ULL) /* quality of service stats */
+/*#define OBD_MD_FLOSCOPQ    (0x00400000ULL) osc opaque data, never used */
+#define OBD_MD_FLCOOKIE    (0x00800000ULL) /* log cancellation cookie */
+#define OBD_MD_FLGROUP     (0x01000000ULL) /* group */
+#define OBD_MD_FLFID       (0x02000000ULL) /* ->ost write inline fid */
+#define OBD_MD_FLEPOCH     (0x04000000ULL) /* ->ost write with ioepoch */
+					   /* ->mds if epoch opens or closes */
+#define OBD_MD_FLGRANT     (0x08000000ULL) /* ost preallocation space grant */
+#define OBD_MD_FLDIREA     (0x10000000ULL) /* dir's extended attribute data */
+#define OBD_MD_FLUSRQUOTA  (0x20000000ULL) /* over quota flags sent from ost */
+#define OBD_MD_FLGRPQUOTA  (0x40000000ULL) /* over quota flags sent from ost */
+#define OBD_MD_FLMODEASIZE (0x80000000ULL) /* EA size will be changed */
+
+#define OBD_MD_MDS	 (0x0000000100000000ULL) /* where an inode lives on */
+#define OBD_MD_REINT       (0x0000000200000000ULL) /* reintegrate oa */
+#define OBD_MD_MEA	 (0x0000000400000000ULL) /* CMD split EA  */
+
+/* OBD_MD_MDTIDX was used to get the MDT index, but it has never been used
+ * over the wire, and it has been obsolete since 2.3 */
+/* #define OBD_MD_MDTIDX      (0x0000000800000000ULL) */
+
+#define OBD_MD_FLXATTR       (0x0000001000000000ULL) /* xattr */
+#define OBD_MD_FLXATTRLS     (0x0000002000000000ULL) /* xattr list */
+#define OBD_MD_FLXATTRRM     (0x0000004000000000ULL) /* xattr remove */
+#define OBD_MD_FLACL	 (0x0000008000000000ULL) /* ACL */
+#define OBD_MD_FLRMTPERM     (0x0000010000000000ULL) /* remote permission */
+#define OBD_MD_FLMDSCAPA     (0x0000020000000000ULL) /* MDS capability */
+#define OBD_MD_FLOSSCAPA     (0x0000040000000000ULL) /* OSS capability */
+#define OBD_MD_FLCKSPLIT     (0x0000080000000000ULL) /* Check split on server */
+#define OBD_MD_FLCROSSREF    (0x0000100000000000ULL) /* Cross-ref case */
+#define OBD_MD_FLGETATTRLOCK (0x0000200000000000ULL) /* Get IOEpoch attributes
+						      * under lock */
+#define OBD_MD_FLOBJCOUNT    (0x0000400000000000ULL) /* for multiple destroy */
+
+#define OBD_MD_FLRMTLSETFACL (0x0001000000000000ULL) /* lfs lsetfacl case */
+#define OBD_MD_FLRMTLGETFACL (0x0002000000000000ULL) /* lfs lgetfacl case */
+#define OBD_MD_FLRMTRSETFACL (0x0004000000000000ULL) /* lfs rsetfacl case */
+#define OBD_MD_FLRMTRGETFACL (0x0008000000000000ULL) /* lfs rgetfacl case */
+
+#define OBD_MD_FLDATAVERSION (0x0010000000000000ULL) /* iversion sum */
+
+#define OBD_MD_FLGETATTR (OBD_MD_FLID    | OBD_MD_FLATIME | OBD_MD_FLMTIME | \
+			  OBD_MD_FLCTIME | OBD_MD_FLSIZE  | OBD_MD_FLBLKSZ | \
+			  OBD_MD_FLMODE  | OBD_MD_FLTYPE  | OBD_MD_FLUID   | \
+			  OBD_MD_FLGID   | OBD_MD_FLFLAGS | OBD_MD_FLNLINK | \
+			  OBD_MD_FLGENER | OBD_MD_FLRDEV  | OBD_MD_FLGROUP)
+
+/* don't forget obdo_fid which is way down at the bottom so it can
+ * come after the definition of llog_cookie */
+
+enum hss_valid {
+	HSS_SETMASK	= 0x01,
+	HSS_CLEARMASK	= 0x02,
+	HSS_ARCHIVE_ID	= 0x04,
+};
+
+struct hsm_state_set {
+	__u32	hss_valid;
+	__u32	hss_archive_id;
+	__u64	hss_setmask;
+	__u64	hss_clearmask;
+};
+
+extern void lustre_swab_hsm_user_state(struct hsm_user_state *hus);
+extern void lustre_swab_hsm_state_set(struct hsm_state_set *hss);
+
+extern void lustre_swab_obd_statfs (struct obd_statfs *os);
+
+/* ost_body.data values for OST_BRW */
+
+#define OBD_BRW_READ	    0x01
+#define OBD_BRW_WRITE	   0x02
+#define OBD_BRW_RWMASK	  (OBD_BRW_READ | OBD_BRW_WRITE)
+#define OBD_BRW_SYNC	    0x08 /* this page is a part of synchronous
+				      * transfer and is not accounted in
+				      * the grant. */
+#define OBD_BRW_CHECK	   0x10
+#define OBD_BRW_FROM_GRANT      0x20 /* the osc manages this under llite */
+#define OBD_BRW_GRANTED	 0x40 /* the ost manages this */
+#define OBD_BRW_NOCACHE	 0x80 /* this page is a part of non-cached IO */
+#define OBD_BRW_NOQUOTA	0x100
+#define OBD_BRW_SRVLOCK	0x200 /* Client holds no lock over this page */
+#define OBD_BRW_ASYNC	  0x400 /* Server may delay commit to disk */
+#define OBD_BRW_MEMALLOC       0x800 /* Client runs in the "kswapd" context */
+#define OBD_BRW_OVER_USRQUOTA 0x1000 /* Running out of user quota */
+#define OBD_BRW_OVER_GRPQUOTA 0x2000 /* Running out of group quota */
+
+#define OBD_OBJECT_EOF 0xffffffffffffffffULL
+
+#define OST_MIN_PRECREATE 32
+#define OST_MAX_PRECREATE 20000
+
+struct obd_ioobj {
+	struct ost_id	ioo_oid;	/* object ID, if multi-obj BRW */
+	__u32		ioo_max_brw;	/* low 16 bits were o_mode before 2.4,
+					 * now (PTLRPC_BULK_OPS_COUNT - 1) in
+					 * high 16 bits in 2.4 and later */
+	__u32		ioo_bufcnt;	/* number of niobufs for this object */
+};
+
+/*
+ * Accessors for the ioo_max_brw member of the object passed as @ioo.
+ * The value is stored minus one in the bits above IOOBJ_MAX_BRW_BITS:
+ * ioobj_max_brw_get() shifts those bits down and adds one back, and
+ * ioobj_max_brw_set() assigns the whole member, so the low
+ * IOOBJ_MAX_BRW_BITS bits (the ones IOOBJ_TYPE_MASK selects) end up zero.
+ */
+#define IOOBJ_MAX_BRW_BITS	16
+#define IOOBJ_TYPE_MASK		((1U << IOOBJ_MAX_BRW_BITS) - 1)
+#define ioobj_max_brw_get(ioo)	(((ioo)->ioo_max_brw >> IOOBJ_MAX_BRW_BITS) + 1)
+#define ioobj_max_brw_set(ioo, num)					\
+do { (ioo)->ioo_max_brw = ((num) - 1) << IOOBJ_MAX_BRW_BITS; } while (0)
+
+extern void lustre_swab_obd_ioobj (struct obd_ioobj *ioo);
+
+/* multiple of 8 bytes => can array */
+struct niobuf_remote {
+	__u64 offset;
+	__u32 len;
+	__u32 flags;
+};
+
+extern void lustre_swab_niobuf_remote (struct niobuf_remote *nbr);
+
+/* lock value block communicated between the filter and llite */
+
+/*
+ * Helpers that carry a (negative) return code inside a 64-bit "blocks" value.
+ *
+ * OST_LVB_ERR_INIT is needed because the return code in rc is
+ * negative, i.e. because ((MASK + rc) & MASK) != MASK.
+ *
+ * OST_LVB_IS_ERR(blocks)      - true when the upper 32 bits equal the marker
+ * OST_LVB_SET_ERR(blocks, rc) - store OST_LVB_ERR_INIT + rc into blocks
+ * OST_LVB_GET_ERR(blocks)     - recover rc as an int
+ *
+ * Every macro argument is parenthesized so that expression arguments
+ * (e.g. "a | b" or "cond ? x : y") expand with the intended precedence.
+ */
+#define OST_LVB_ERR_INIT 0xffbadbad80000000ULL
+#define OST_LVB_ERR_MASK 0xffbadbad00000000ULL
+#define OST_LVB_IS_ERR(blocks)					  \
+	(((blocks) & OST_LVB_ERR_MASK) == OST_LVB_ERR_MASK)
+#define OST_LVB_SET_ERR(blocks, rc)				     \
+	do { (blocks) = OST_LVB_ERR_INIT + (rc); } while (0)
+#define OST_LVB_GET_ERR(blocks)    ((int)((blocks) - OST_LVB_ERR_INIT))
+
+struct ost_lvb_v1 {
+	__u64		lvb_size;
+	obd_time	lvb_mtime;
+	obd_time	lvb_atime;
+	obd_time	lvb_ctime;
+	__u64		lvb_blocks;
+};
+
+extern void lustre_swab_ost_lvb_v1(struct ost_lvb_v1 *lvb);
+
+struct ost_lvb {
+	__u64		lvb_size;
+	obd_time	lvb_mtime;
+	obd_time	lvb_atime;
+	obd_time	lvb_ctime;
+	__u64		lvb_blocks;
+	__u32		lvb_mtime_ns;
+	__u32		lvb_atime_ns;
+	__u32		lvb_ctime_ns;
+	__u32		lvb_padding;
+};
+
+extern void lustre_swab_ost_lvb(struct ost_lvb *lvb);
+
+/*
+ *   lquota data structures
+ */
+
+/* log2 of the quota block size; only defined here if not already defined */
+#ifndef QUOTABLOCK_BITS
+#define QUOTABLOCK_BITS 10
+#endif
+
+/* quota block size: 1 << QUOTABLOCK_BITS (1024 with the default above) */
+#ifndef QUOTABLOCK_SIZE
+#define QUOTABLOCK_SIZE (1 << QUOTABLOCK_BITS)
+#endif
+
+/* convert x to a count of quota blocks, rounding up to a whole block */
+#ifndef toqb
+#define toqb(x) (((x) + QUOTABLOCK_SIZE - 1) >> QUOTABLOCK_BITS)
+#endif
+
+/* The lquota_id structure is a union of all the possible identifier types that
+ * can be used with quota, this includes:
+ * - 64-bit user ID
+ * - 64-bit group ID
+ * - a FID which can be used for per-directory quota in the future */
+union lquota_id {
+	struct lu_fid	qid_fid; /* FID for per-directory quota */
+	__u64		qid_uid; /* user identifier */
+	__u64		qid_gid; /* group identifier */
+};
+
+/* quotactl management */
+struct obd_quotactl {
+	__u32			qc_cmd;
+	__u32			qc_type; /* see Q_* flag below */
+	__u32			qc_id;
+	__u32			qc_stat;
+	struct obd_dqinfo	qc_dqinfo;
+	struct obd_dqblk	qc_dqblk;
+};
+
+extern void lustre_swab_obd_quotactl(struct obd_quotactl *q);
+
+#define Q_QUOTACHECK	0x800100 /* deprecated as of 2.4 */
+#define Q_INITQUOTA	0x800101 /* deprecated as of 2.4  */
+#define Q_GETOINFO	0x800102 /* get obd quota info */
+#define Q_GETOQUOTA	0x800103 /* get obd quotas */
+#define Q_FINVALIDATE	0x800104 /* deprecated as of 2.4 */
+
+/*
+ * Q_COPY - copy one member from *in to *out by plain assignment.
+ * The expansion is fully parenthesized so that it behaves as a single
+ * expression wherever it is used.
+ */
+#define Q_COPY(out, in, member) ((out)->member = (in)->member)
+
+/*
+ * QCTL_COPY - copy the members qc_cmd, qc_type, qc_id, qc_stat, qc_dqinfo
+ * and qc_dqblk from *in to *out.
+ * @out and @in are each evaluated once per member, so pass expressions
+ * without side effects.
+ */
+#define QCTL_COPY(out, in)		\
+do {					\
+	Q_COPY(out, in, qc_cmd);	\
+	Q_COPY(out, in, qc_type);	\
+	Q_COPY(out, in, qc_id);		\
+	Q_COPY(out, in, qc_stat);	\
+	Q_COPY(out, in, qc_dqinfo);	\
+	Q_COPY(out, in, qc_dqblk);	\
+} while (0)
+
+/* Body of quota request used for quota acquire/release RPCs between quota
+ * master (aka QMT) and slaves (aka QSD). */
+struct quota_body {
+	struct lu_fid	qb_fid;     /* FID of global index packing the pool ID
+				      * and type (data or metadata) as well as
+				      * the quota type (user or group). */
+	union lquota_id	qb_id;      /* uid or gid or directory FID */
+	__u32		qb_flags;   /* see below */
+	__u32		qb_padding;
+	__u64		qb_count;   /* acquire/release count (kbytes/inodes) */
+	__u64		qb_usage;   /* current slave usage (kbytes/inodes) */
+	__u64		qb_slv_ver; /* slave index file version */
+	struct lustre_handle	qb_lockh;     /* per-ID lock handle */
+	struct lustre_handle	qb_glb_lockh; /* global lock handle */
+	__u64		qb_padding1[4];
+};
+
+/* When the quota_body is used in the reply of quota global intent
+ * lock (IT_QUOTA_CONN) reply, qb_fid contains slave index file FID. */
+#define qb_slv_fid	qb_fid
+/* qb_usage is the current qunit (in kbytes/inodes) when quota_body is used in
+ * quota reply */
+#define qb_qunit	qb_usage
+
+#define QUOTA_DQACQ_FL_ACQ	0x1  /* acquire quota */
+#define QUOTA_DQACQ_FL_PREACQ	0x2  /* pre-acquire */
+#define QUOTA_DQACQ_FL_REL	0x4  /* release quota */
+#define QUOTA_DQACQ_FL_REPORT	0x8  /* report usage */
+
+extern void lustre_swab_quota_body(struct quota_body *b);
+
+/* Quota types currently supported */
+enum {
+	LQUOTA_TYPE_USR	= 0x00, /* maps to USRQUOTA */
+	LQUOTA_TYPE_GRP	= 0x01, /* maps to GRPQUOTA */
+	LQUOTA_TYPE_MAX
+};
+
+/* There are 2 different resource types on which a quota limit can be enforced:
+ * - inodes on the MDTs
+ * - blocks on the OSTs */
+enum {
+	LQUOTA_RES_MD		= 0x01, /* skip 0 to avoid null oid in FID */
+	LQUOTA_RES_DT		= 0x02,
+	LQUOTA_LAST_RES,		/* implicit value: LQUOTA_RES_DT + 1 */
+	LQUOTA_FIRST_RES	= LQUOTA_RES_MD
+};
+/* NOTE(review): LQUOTA_LAST_RES is one past LQUOTA_RES_DT, so this expression
+ * evaluates to 3 although only two resource types are listed above -- confirm
+ * that the extra slot is intended before relying on it as an exact count. */
+#define LQUOTA_NR_RES (LQUOTA_LAST_RES - LQUOTA_FIRST_RES + 1)
+
+/*
+ * Space accounting support
+ * Format of an accounting record, providing disk usage information for a given
+ * user or group
+ */
+struct lquota_acct_rec { /* 16 bytes */
+	__u64 bspace;  /* current space in use */
+	__u64 ispace;  /* current # inodes in use */
+};
+
+/*
+ * Global quota index support
+ * Format of a global record, providing global quota settings for a given quota
+ * identifier
+ */
+struct lquota_glb_rec { /* 32 bytes */
+	__u64 qbr_hardlimit; /* quota hard limit, in #inodes or kbytes */
+	__u64 qbr_softlimit; /* quota soft limit, in #inodes or kbytes */
+	__u64 qbr_time;      /* grace time, in seconds */
+	__u64 qbr_granted;   /* how much is granted to slaves, in #inodes or
+			      * kbytes */
+};
+
+/*
+ * Slave index support
+ * Format of a slave record, recording how much space is granted to a given
+ * slave
+ */
+struct lquota_slv_rec { /* 8 bytes */
+	__u64 qsr_granted; /* space granted to the slave for the key=ID,
+			    * in #inodes or kbytes */
+};
+
+/* Data structures associated with the quota locks */
+
+/* Glimpse descriptor used for the index & per-ID quota locks */
+struct ldlm_gl_lquota_desc {
+	union lquota_id	gl_id;    /* quota ID subject to the glimpse */
+	__u64		gl_flags; /* see LQUOTA_FL* below */
+	__u64		gl_ver;   /* new index version */
+	__u64		gl_hardlimit; /* new hardlimit or qunit value */
+	__u64		gl_softlimit; /* new softlimit */
+	__u64		gl_time;
+	__u64		gl_pad2;
+};
+#define gl_qunit	gl_hardlimit /* current qunit value used when
+				      * glimpsing per-ID quota locks */
+
+/* quota glimpse flags */
+#define LQUOTA_FL_EDQUOT 0x1 /* user/group out of quota space on QMT */
+
+/* LVB used with quota (global and per-ID) locks */
+struct lquota_lvb {
+	__u64	lvb_flags;	/* see LQUOTA_FL* above */
+	__u64	lvb_id_may_rel; /* space that might be released later */
+	__u64	lvb_id_rel;     /* space released by the slave for this ID */
+	__u64	lvb_id_qunit;   /* current qunit value */
+	__u64	lvb_pad1;
+};
+
+extern void lustre_swab_lquota_lvb(struct lquota_lvb *lvb);
+
+/* LVB used with global quota lock */
+#define lvb_glb_ver  lvb_id_may_rel /* current version of the global index */
+
+/* op codes */
+typedef enum {
+	QUOTA_DQACQ	= 601,
+	QUOTA_DQREL	= 602,
+	QUOTA_LAST_OPC
+} quota_cmd_t;
+#define QUOTA_FIRST_OPC	QUOTA_DQACQ
+
+/*
+ *   MDS REQ RECORDS
+ */
+
+/* opcodes */
+typedef enum {
+	MDS_GETATTR		= 33,
+	MDS_GETATTR_NAME	= 34,
+	MDS_CLOSE		= 35,
+	MDS_REINT		= 36,
+	MDS_READPAGE		= 37,
+	MDS_CONNECT		= 38,
+	MDS_DISCONNECT		= 39,
+	MDS_GETSTATUS		= 40,
+	MDS_STATFS		= 41,
+	MDS_PIN			= 42,
+	MDS_UNPIN		= 43,
+	MDS_SYNC		= 44,
+	MDS_DONE_WRITING	= 45,
+	MDS_SET_INFO		= 46,
+	MDS_QUOTACHECK		= 47,
+	MDS_QUOTACTL		= 48,
+	MDS_GETXATTR		= 49,
+	MDS_SETXATTR		= 50, /* obsolete, now it's MDS_REINT op */
+	MDS_WRITEPAGE		= 51,
+	MDS_IS_SUBDIR		= 52,
+	MDS_GET_INFO		= 53,
+	MDS_HSM_STATE_GET	= 54,
+	MDS_HSM_STATE_SET	= 55,
+	MDS_HSM_ACTION		= 56,
+	MDS_HSM_PROGRESS	= 57,
+	MDS_HSM_REQUEST		= 58,
+	MDS_HSM_CT_REGISTER	= 59,
+	MDS_HSM_CT_UNREGISTER	= 60,
+	MDS_SWAP_LAYOUTS	= 61,
+	MDS_LAST_OPC
+} mds_cmd_t;
+
+#define MDS_FIRST_OPC    MDS_GETATTR
+
+
+/* opcodes for object update */
+typedef enum {
+	UPDATE_OBJ	= 1000,
+	UPDATE_LAST_OPC
+} update_cmd_t;
+
+#define UPDATE_FIRST_OPC    UPDATE_OBJ
+
+/*
+ * REINT_* operation codes (typedef'd as mds_reint_t / mdt_reint_t).
+ * Do not exceed 63
+ */
+
+typedef enum {
+	REINT_SETATTR  = 1,
+	REINT_CREATE   = 2,
+	REINT_LINK     = 3,
+	REINT_UNLINK   = 4,
+	REINT_RENAME   = 5,
+	REINT_OPEN     = 6,
+	REINT_SETXATTR = 7,
+	REINT_RMENTRY  = 8,
+/*      REINT_WRITE    = 9, (disabled entry, kept for reference) */
+	REINT_MAX	/* implicit value 9: one past REINT_RMENTRY */
+} mds_reint_t, mdt_reint_t;
+
+extern void lustre_swab_generic_32s (__u32 *val);
+
+/* the disposition of the intent outlines what was executed */
+#define DISP_IT_EXECD	0x00000001
+#define DISP_LOOKUP_EXECD    0x00000002
+#define DISP_LOOKUP_NEG      0x00000004
+#define DISP_LOOKUP_POS      0x00000008
+#define DISP_OPEN_CREATE     0x00000010
+#define DISP_OPEN_OPEN       0x00000020
+#define DISP_ENQ_COMPLETE    0x00400000
+#define DISP_ENQ_OPEN_REF    0x00800000
+#define DISP_ENQ_CREATE_REF  0x01000000
+#define DISP_OPEN_LOCK       0x02000000
+
+/* INODE LOCK PARTS */
+/* Each MDS_INODELOCK_* value below is a distinct single bit. */
+#define MDS_INODELOCK_LOOKUP 0x000001       /* dentry, mode, owner, group */
+#define MDS_INODELOCK_UPDATE 0x000002       /* size, links, timestamps */
+#define MDS_INODELOCK_OPEN   0x000004       /* For opened files */
+#define MDS_INODELOCK_LAYOUT 0x000008       /* for layout */
+#define MDS_INODELOCK_PERM   0x000010       /* for permission */
+
+/* Bit index of the highest bit defined above (0x10 == 1 << 4); must be kept
+ * in step when another MDS_INODELOCK_* bit is added. */
+#define MDS_INODELOCK_MAXSHIFT 4
+/* This FULL lock is useful to take on unlink sort of operations */
+/* Mask with bits 0..MDS_INODELOCK_MAXSHIFT set, i.e. 0x1f here. */
+#define MDS_INODELOCK_FULL ((1<<(MDS_INODELOCK_MAXSHIFT+1))-1)
+
+extern void lustre_swab_ll_fid (struct ll_fid *fid);
+
+/* NOTE: until Lustre 1.8.7/2.1.1 the fid_ver() was packed into name[2],
+ * but was moved into name[1] along with the OID to avoid consuming the
+ * name[2,3] fields that need to be used for the quota id (also a FID). */
+enum {
+	LUSTRE_RES_ID_SEQ_OFF = 0,
+	LUSTRE_RES_ID_VER_OID_OFF = 1,
+	LUSTRE_RES_ID_WAS_VER_OFF = 2, /* see note above */
+	LUSTRE_RES_ID_QUOTA_SEQ_OFF = 2,
+	LUSTRE_RES_ID_QUOTA_VER_OID_OFF = 3,
+	LUSTRE_RES_ID_HSH_OFF = 3
+};
+
+#define MDS_STATUS_CONN 1
+#define MDS_STATUS_LOV 2
+
+/* mdt_thread_info.mti_flags. */
+enum md_op_flags {
+	/* The flag indicates Size-on-MDS attributes are changed. */
+	MF_SOM_CHANGE	   = (1 << 0),
+	/* Flags indicates an epoch opens or closes. */
+	MF_EPOCH_OPEN	   = (1 << 1),
+	MF_EPOCH_CLOSE	  = (1 << 2),
+	MF_MDC_CANCEL_FID1      = (1 << 3),
+	MF_MDC_CANCEL_FID2      = (1 << 4),
+	MF_MDC_CANCEL_FID3      = (1 << 5),
+	MF_MDC_CANCEL_FID4      = (1 << 6),
+	/* There is a pending attribute update. */
+	MF_SOM_AU	       = (1 << 7),
+	/* Cancel OST locks while getattr OST attributes. */
+	MF_GETATTR_LOCK	 = (1 << 8),
+	MF_GET_MDT_IDX	  = (1 << 9),
+};
+
+#define MF_SOM_LOCAL_FLAGS (MF_SOM_CHANGE | MF_EPOCH_OPEN | MF_EPOCH_CLOSE)
+
+#define LUSTRE_BFLAG_UNCOMMITTED_WRITES   0x1
+
+/* these should be identical to their EXT4_*_FL counterparts, they are
+ * redefined here only to avoid dragging in fs/ext4/ext4.h */
+#define LUSTRE_SYNC_FL	 0x00000008 /* Synchronous updates */
+#define LUSTRE_IMMUTABLE_FL    0x00000010 /* Immutable file */
+#define LUSTRE_APPEND_FL       0x00000020 /* writes to file may only append */
+#define LUSTRE_NOATIME_FL      0x00000080 /* do not update atime */
+#define LUSTRE_DIRSYNC_FL      0x00010000 /* dirsync behaviour (dir only) */
+
+/*
+ * Translate wire LUSTRE_*_FL bits in @flags into the client-local VFS S_*
+ * bits used for the client inode i_flags, and return the resulting mask.
+ *
+ * The LUSTRE_*_FL values are the Lustre wire protocol equivalents of the
+ * LDISKFS_*_FL values stored on disk, while the S_* flags are kernel-internal
+ * values that change between kernel versions.  These flags are set/cleared
+ * via FSFILT_IOC_{GET,SET}_FLAGS.  See b=16526 for a full history.
+ *
+ * Only SYNC, NOATIME, APPEND, IMMUTABLE and (when S_DIRSYNC is defined)
+ * DIRSYNC are translated; every other bit of @flags is ignored.
+ */
+static inline int ll_ext_to_inode_flags(int flags)
+{
+	int iflags = 0;
+
+	if (flags & LUSTRE_SYNC_FL)
+		iflags |= S_SYNC;
+	if (flags & LUSTRE_NOATIME_FL)
+		iflags |= S_NOATIME;
+	if (flags & LUSTRE_APPEND_FL)
+		iflags |= S_APPEND;
+#if defined(S_DIRSYNC)
+	if (flags & LUSTRE_DIRSYNC_FL)
+		iflags |= S_DIRSYNC;
+#endif
+	if (flags & LUSTRE_IMMUTABLE_FL)
+		iflags |= S_IMMUTABLE;
+
+	return iflags;
+}
+
+/*
+ * Translate VFS S_* bits in @iflags into the corresponding LUSTRE_*_FL bits
+ * and return the resulting mask.
+ *
+ * Only SYNC, NOATIME, APPEND, IMMUTABLE and (when S_DIRSYNC is defined)
+ * DIRSYNC are translated; every other bit of @iflags is ignored.
+ */
+static inline int ll_inode_to_ext_flags(int iflags)
+{
+	int flags = 0;
+
+	if (iflags & S_SYNC)
+		flags |= LUSTRE_SYNC_FL;
+	if (iflags & S_NOATIME)
+		flags |= LUSTRE_NOATIME_FL;
+	if (iflags & S_APPEND)
+		flags |= LUSTRE_APPEND_FL;
+#if defined(S_DIRSYNC)
+	if (iflags & S_DIRSYNC)
+		flags |= LUSTRE_DIRSYNC_FL;
+#endif
+	if (iflags & S_IMMUTABLE)
+		flags |= LUSTRE_IMMUTABLE_FL;
+
+	return flags;
+}
+
+struct mdt_body {
+	struct lu_fid  fid1;
+	struct lu_fid  fid2;
+	struct lustre_handle handle;
+	__u64	  valid;
+	__u64	  size;   /* Offset, in the case of MDS_READPAGE */
+       obd_time	mtime;
+       obd_time	atime;
+       obd_time	ctime;
+	__u64	  blocks; /* XID, in the case of MDS_READPAGE */
+	__u64	  ioepoch;
+	__u64	       unused1; /* was "ino" until 2.4.0 */
+	__u32	  fsuid;
+	__u32	  fsgid;
+	__u32	  capability;
+	__u32	  mode;
+	__u32	  uid;
+	__u32	  gid;
+	__u32	  flags; /* from vfs for pin/unpin, LUSTRE_BFLAG close */
+	__u32	  rdev;
+	__u32	  nlink; /* #bytes to read in the case of MDS_READPAGE */
+	__u32	       unused2; /* was "generation" until 2.4.0 */
+	__u32	  suppgid;
+	__u32	  eadatasize;
+	__u32	  aclsize;
+	__u32	  max_mdsize;
+	__u32	  max_cookiesize;
+	__u32	  uid_h; /* high 32-bits of uid, for FUID */
+	__u32	  gid_h; /* high 32-bits of gid, for FUID */
+	__u32	  padding_5; /* also fix lustre_swab_mdt_body */
+	__u64	  padding_6;
+	__u64	  padding_7;
+	__u64	  padding_8;
+	__u64	  padding_9;
+	__u64	  padding_10;
+}; /* 216 */
+
+extern void lustre_swab_mdt_body (struct mdt_body *b);
+
+struct mdt_ioepoch {
+	struct lustre_handle handle;
+	__u64  ioepoch;
+	__u32  flags;
+	__u32  padding;
+};
+
+extern void lustre_swab_mdt_ioepoch (struct mdt_ioepoch *b);
+
+/* permissions for md_perm.mp_perm */
+enum {
+	CFS_SETUID_PERM = 0x01,
+	CFS_SETGID_PERM = 0x02,
+	CFS_SETGRP_PERM = 0x04,
+	CFS_RMTACL_PERM = 0x08,
+	CFS_RMTOWN_PERM = 0x10
+};
+
+/* inode access permission for remote user, the inode info are omitted,
+ * for client knows them. */
+struct mdt_remote_perm {
+	__u32	   rp_uid;
+	__u32	   rp_gid;
+	__u32	   rp_fsuid;
+	__u32	   rp_fsuid_h;
+	__u32	   rp_fsgid;
+	__u32	   rp_fsgid_h;
+	__u32	   rp_access_perm; /* MAY_READ/WRITE/EXEC */
+	__u32	   rp_padding;
+};
+
+extern void lustre_swab_mdt_remote_perm(struct mdt_remote_perm *p);
+
+struct mdt_rec_setattr {
+	__u32	   sa_opcode;
+	__u32	   sa_cap;
+	__u32	   sa_fsuid;
+	__u32	   sa_fsuid_h;
+	__u32	   sa_fsgid;
+	__u32	   sa_fsgid_h;
+	__u32	   sa_suppgid;
+	__u32	   sa_suppgid_h;
+	__u32	   sa_padding_1;
+	__u32	   sa_padding_1_h;
+	struct lu_fid   sa_fid;
+	__u64	   sa_valid;
+	__u32	   sa_uid;
+	__u32	   sa_gid;
+	__u64	   sa_size;
+	__u64	   sa_blocks;
+	obd_time	sa_mtime;
+	obd_time	sa_atime;
+	obd_time	sa_ctime;
+	__u32	   sa_attr_flags;
+	__u32	   sa_mode;
+	__u32	   sa_bias;      /* some operation flags */
+	__u32	   sa_padding_3;
+	__u32	   sa_padding_4;
+	__u32	   sa_padding_5;
+};
+
+extern void lustre_swab_mdt_rec_setattr (struct mdt_rec_setattr *sa);
+
+/*
+ * Attribute flags used in mdt_rec_setattr::sa_valid.
+ * The kernel's #defines for ATTR_* should not be used over the network
+ * since the client and MDS may run different kernels (see bug 13828)
+ * Therefore, we should only use MDS_ATTR_* attributes for sa_valid.
+ */
+#define MDS_ATTR_MODE	  0x1ULL /* = 1 */
+#define MDS_ATTR_UID	   0x2ULL /* = 2 */
+#define MDS_ATTR_GID	   0x4ULL /* = 4 */
+#define MDS_ATTR_SIZE	  0x8ULL /* = 8 */
+#define MDS_ATTR_ATIME	0x10ULL /* = 16 */
+#define MDS_ATTR_MTIME	0x20ULL /* = 32 */
+#define MDS_ATTR_CTIME	0x40ULL /* = 64 */
+#define MDS_ATTR_ATIME_SET    0x80ULL /* = 128 */
+#define MDS_ATTR_MTIME_SET   0x100ULL /* = 256 */
+#define MDS_ATTR_FORCE       0x200ULL /* = 512, Not a change, but a change it */
+#define MDS_ATTR_ATTR_FLAG   0x400ULL /* = 1024 */
+#define MDS_ATTR_KILL_SUID   0x800ULL /* = 2048 */
+#define MDS_ATTR_KILL_SGID  0x1000ULL /* = 4096 */
+#define MDS_ATTR_CTIME_SET  0x2000ULL /* = 8192 */
+#define MDS_ATTR_FROM_OPEN  0x4000ULL /* = 16384, called from open path, ie O_TRUNC */
+#define MDS_ATTR_BLOCKS     0x8000ULL /* = 32768 */
+
+#ifndef FMODE_READ
+#define FMODE_READ	       00000001
+#define FMODE_WRITE	      00000002
+#endif
+
+#define MDS_FMODE_CLOSED	 00000000
+#define MDS_FMODE_EXEC	   00000004
+/* IO Epoch is opened on a closed file. */
+#define MDS_FMODE_EPOCH	  01000000
+/* IO Epoch is opened on a file truncate. */
+#define MDS_FMODE_TRUNC	  02000000
+/* Size-on-MDS Attribute Update is pending. */
+#define MDS_FMODE_SOM	    04000000
+
+#define MDS_OPEN_CREATED	 00000010
+#define MDS_OPEN_CROSS	   00000020
+
+#define MDS_OPEN_CREAT	   00000100
+#define MDS_OPEN_EXCL	    00000200
+#define MDS_OPEN_TRUNC	   00001000
+#define MDS_OPEN_APPEND	  00002000
+#define MDS_OPEN_SYNC	    00010000
+#define MDS_OPEN_DIRECTORY       00200000
+
+#define MDS_OPEN_BY_FID		040000000 /* open_by_fid for known object */
+#define MDS_OPEN_DELAY_CREATE  0100000000 /* delay initial object create */
+#define MDS_OPEN_OWNEROVERRIDE 0200000000 /* NFSD rw-reopen ro file for owner */
+#define MDS_OPEN_JOIN_FILE     0400000000 /* open for join file.
+					   * We do not support JOIN FILE
+					   * anymore, reserve this flags
+					   * just for preventing such bit
+					   * to be reused. */
+
+#define MDS_OPEN_LOCK	 04000000000 /* This open requires open lock */
+#define MDS_OPEN_HAS_EA      010000000000 /* specify object create pattern */
+#define MDS_OPEN_HAS_OBJS    020000000000 /* Just set the EA the obj exist */
+#define MDS_OPEN_NORESTORE  0100000000000ULL /* Do not restore file at open */
+#define MDS_OPEN_NEWSTRIPE  0200000000000ULL /* New stripe needed (restripe or
+					      * hsm restore) */
+#define MDS_OPEN_VOLATILE   0400000000000ULL /* File is volatile = created
+						unlinked */
+
+/* permission for create non-directory file */
+#define MAY_CREATE      (1 << 7)
+/* permission for create directory file */
+#define MAY_LINK	(1 << 8)
+/* permission for delete from the directory */
+#define MAY_UNLINK      (1 << 9)
+/* source's permission for rename */
+#define MAY_RENAME_SRC  (1 << 10)
+/* target's permission for rename */
+#define MAY_RENAME_TAR  (1 << 11)
+/* part (parent's) VTX permission check */
+#define MAY_VTX_PART    (1 << 12)
+/* full VTX permission check */
+#define MAY_VTX_FULL    (1 << 13)
+/* lfs rgetfacl permission check */
+#define MAY_RGETFACL    (1 << 14)
+
+enum {
+	MDS_CHECK_SPLIT		= 1 << 0,
+	MDS_CROSS_REF		= 1 << 1,
+	MDS_VTX_BYPASS		= 1 << 2,
+	MDS_PERM_BYPASS		= 1 << 3,
+	MDS_SOM			= 1 << 4,
+	MDS_QUOTA_IGNORE	= 1 << 5,
+	MDS_CLOSE_CLEANUP	= 1 << 6,
+	MDS_KEEP_ORPHAN		= 1 << 7,
+	MDS_RECOV_OPEN		= 1 << 8,
+	MDS_DATA_MODIFIED	= 1 << 9,
+	MDS_CREATE_VOLATILE	= 1 << 10,
+	MDS_OWNEROVERRIDE	= 1 << 11,
+};
+
+/* instance of mdt_reint_rec */
+struct mdt_rec_create {
+	__u32	   cr_opcode;
+	__u32	   cr_cap;
+	__u32	   cr_fsuid;
+	__u32	   cr_fsuid_h;
+	__u32	   cr_fsgid;
+	__u32	   cr_fsgid_h;
+	__u32	   cr_suppgid1;
+	__u32	   cr_suppgid1_h;
+	__u32	   cr_suppgid2;
+	__u32	   cr_suppgid2_h;
+	struct lu_fid   cr_fid1;
+	struct lu_fid   cr_fid2;
+	struct lustre_handle cr_old_handle; /* handle in case of open replay */
+	obd_time	cr_time;
+	__u64	   cr_rdev;
+	__u64	   cr_ioepoch;
+	__u64	   cr_padding_1;   /* rr_blocks */
+	__u32	   cr_mode;
+	__u32	   cr_bias;
+	/* use of helpers set/get_mrc_cr_flags() is needed to access
+	 * 64 bits cr_flags [cr_flags_l, cr_flags_h], this is done to
+	 * extend cr_flags size without breaking 1.8 compat */
+	__u32	   cr_flags_l;     /* for use with open, low  32 bits  */
+	__u32	   cr_flags_h;     /* for use with open, high 32 bits */
+	__u32	   cr_umask;       /* umask for create */
+	__u32	   cr_padding_4;   /* rr_padding_4 */
+};
+
+/*
+ * Store the 64-bit @flags into @mrc, split across two 32-bit members:
+ * the upper 32 bits go to cr_flags_h and the lower 32 bits to cr_flags_l.
+ */
+static inline void set_mrc_cr_flags(struct mdt_rec_create *mrc, __u64 flags)
+{
+	mrc->cr_flags_h = (__u32)(flags >> 32);
+	mrc->cr_flags_l = (__u32)(flags & 0xFFFFFFFFULL);
+}
+
+/*
+ * Return the 64-bit flags value held in @mrc, rebuilt from cr_flags_l
+ * (bits 0-31) and cr_flags_h (bits 32-63).
+ * @mrc is only read, so it is taken as a pointer to const; callers holding a
+ * non-const pointer are unaffected.
+ */
+static inline __u64 get_mrc_cr_flags(const struct mdt_rec_create *mrc)
+{
+	return ((__u64)(mrc->cr_flags_l) | ((__u64)mrc->cr_flags_h << 32));
+}
+
+/* instance of mdt_reint_rec */
+struct mdt_rec_link {
+	__u32	   lk_opcode;
+	__u32	   lk_cap;
+	__u32	   lk_fsuid;
+	__u32	   lk_fsuid_h;
+	__u32	   lk_fsgid;
+	__u32	   lk_fsgid_h;
+	__u32	   lk_suppgid1;
+	__u32	   lk_suppgid1_h;
+	__u32	   lk_suppgid2;
+	__u32	   lk_suppgid2_h;
+	struct lu_fid   lk_fid1;
+	struct lu_fid   lk_fid2;
+	obd_time	lk_time;
+	__u64	   lk_padding_1;   /* rr_atime */
+	__u64	   lk_padding_2;   /* rr_ctime */
+	__u64	   lk_padding_3;   /* rr_size */
+	__u64	   lk_padding_4;   /* rr_blocks */
+	__u32	   lk_bias;
+	__u32	   lk_padding_5;   /* rr_mode */
+	__u32	   lk_padding_6;   /* rr_flags */
+	__u32	   lk_padding_7;   /* rr_padding_2 */
+	__u32	   lk_padding_8;   /* rr_padding_3 */
+	__u32	   lk_padding_9;   /* rr_padding_4 */
+};
+
+/* instance of mdt_reint_rec */
+struct mdt_rec_unlink {
+	__u32	   ul_opcode;
+	__u32	   ul_cap;
+	__u32	   ul_fsuid;
+	__u32	   ul_fsuid_h;
+	__u32	   ul_fsgid;
+	__u32	   ul_fsgid_h;
+	__u32	   ul_suppgid1;
+	__u32	   ul_suppgid1_h;
+	__u32	   ul_suppgid2;
+	__u32	   ul_suppgid2_h;
+	struct lu_fid   ul_fid1;
+	struct lu_fid   ul_fid2;
+	obd_time	ul_time;
+	__u64	   ul_padding_2;   /* rr_atime */
+	__u64	   ul_padding_3;   /* rr_ctime */
+	__u64	   ul_padding_4;   /* rr_size */
+	__u64	   ul_padding_5;   /* rr_blocks */
+	__u32	   ul_bias;
+	__u32	   ul_mode;
+	__u32	   ul_padding_6;   /* rr_flags */
+	__u32	   ul_padding_7;   /* rr_padding_2 */
+	__u32	   ul_padding_8;   /* rr_padding_3 */
+	__u32	   ul_padding_9;   /* rr_padding_4 */
+};
+
+/* instance of mdt_reint_rec */
+struct mdt_rec_rename {
+	__u32	   rn_opcode;
+	__u32	   rn_cap;
+	__u32	   rn_fsuid;
+	__u32	   rn_fsuid_h;
+	__u32	   rn_fsgid;
+	__u32	   rn_fsgid_h;
+	__u32	   rn_suppgid1;
+	__u32	   rn_suppgid1_h;
+	__u32	   rn_suppgid2;
+	__u32	   rn_suppgid2_h;
+	struct lu_fid   rn_fid1;
+	struct lu_fid   rn_fid2;
+	obd_time	rn_time;
+	__u64	   rn_padding_1;   /* rr_atime */
+	__u64	   rn_padding_2;   /* rr_ctime */
+	__u64	   rn_padding_3;   /* rr_size */
+	__u64	   rn_padding_4;   /* rr_blocks */
+	__u32	   rn_bias;	/* some operation flags */
+	__u32	   rn_mode;	/* cross-ref rename has mode */
+	__u32	   rn_padding_5;   /* rr_flags */
+	__u32	   rn_padding_6;   /* rr_padding_2 */
+	__u32	   rn_padding_7;   /* rr_padding_3 */
+	__u32	   rn_padding_8;   /* rr_padding_4 */
+};
+
+/*
+ * Instance of the mdt_rec_reint template (struct mdt_rec_reint).
+ * The trailing comment on each padding field names the mdt_rec_reint
+ * member found at the same position in the template.
+ */
+struct mdt_rec_setxattr {
+	__u32	   sx_opcode;
+	__u32	   sx_cap;
+	__u32	   sx_fsuid;
+	__u32	   sx_fsuid_h;
+	__u32	   sx_fsgid;
+	__u32	   sx_fsgid_h;
+	__u32	   sx_suppgid1;
+	__u32	   sx_suppgid1_h;
+	__u32	   sx_suppgid2;
+	__u32	   sx_suppgid2_h;
+	struct lu_fid   sx_fid;
+	__u64	   sx_padding_1;   /* These three are rr_fid2 */
+	__u32	   sx_padding_2;
+	__u32	   sx_padding_3;
+	__u64	   sx_valid;
+	obd_time	sx_time;
+	__u64	   sx_padding_5;   /* rr_ctime */
+	__u64	   sx_padding_6;   /* rr_size */
+	__u64	   sx_padding_7;   /* rr_blocks */
+	__u32	   sx_size;
+	__u32	   sx_flags;
+	__u32	   sx_padding_8;   /* rr_flags */
+	__u32	   sx_padding_9;   /* rr_flags_h */
+	__u32	   sx_padding_10;  /* rr_umask */
+	__u32	   sx_padding_11;  /* rr_padding_4 */
+};
+
+/*
+ * mdt_rec_reint is the template for all mdt_rec_xxx structures
+ * (e.g. mdt_rec_rename, mdt_rec_setxattr).
+ * Do NOT change the size of the various members, otherwise the values
+ * will be broken in lustre_swab_mdt_rec_reint().
+ *
+ * If you add new members to another mdt_rec_xxx structure and need to use
+ * the rr_padding_x fields, then update lustre_swab_mdt_rec_reint() as well.
+ */
+struct mdt_rec_reint {
+	__u32	   rr_opcode;
+	__u32	   rr_cap;
+	__u32	   rr_fsuid;
+	__u32	   rr_fsuid_h;
+	__u32	   rr_fsgid;
+	__u32	   rr_fsgid_h;
+	__u32	   rr_suppgid1;
+	__u32	   rr_suppgid1_h;
+	__u32	   rr_suppgid2;
+	__u32	   rr_suppgid2_h;
+	struct lu_fid   rr_fid1;
+	struct lu_fid   rr_fid2;
+	obd_time	rr_mtime;
+	obd_time	rr_atime;
+	obd_time	rr_ctime;
+	__u64	   rr_size;
+	__u64	   rr_blocks;
+	__u32	   rr_bias;
+	__u32	   rr_mode;
+	__u32	   rr_flags;
+	__u32	   rr_flags_h;
+	__u32	   rr_umask;
+	__u32	   rr_padding_4; /* also fix lustre_swab_mdt_rec_reint */
+};
+
+extern void lustre_swab_mdt_rec_reint(struct mdt_rec_reint *rr);
+
+struct lmv_desc {
+	__u32 ld_tgt_count;		/* how many MDS's */
+	__u32 ld_active_tgt_count;	 /* how many active */
+	__u32 ld_default_stripe_count;     /* how many objects are used */
+	__u32 ld_pattern;		  /* default MEA_MAGIC_* */
+	__u64 ld_default_hash_size;
+	__u64 ld_padding_1;		/* also fix lustre_swab_lmv_desc */
+	__u32 ld_padding_2;		/* also fix lustre_swab_lmv_desc */
+	__u32 ld_qos_maxage;	       /* in second */
+	__u32 ld_padding_3;		/* also fix lustre_swab_lmv_desc */
+	__u32 ld_padding_4;		/* also fix lustre_swab_lmv_desc */
+	struct obd_uuid ld_uuid;
+};
+
+extern void lustre_swab_lmv_desc (struct lmv_desc *ld);
+
+/* TODO: lmv_stripe_md should contain mds capabilities for all slave fids */
+struct lmv_stripe_md {
+	__u32	 mea_magic;
+	__u32	 mea_count;
+	__u32	 mea_master;
+	__u32	 mea_padding;
+	char	  mea_pool_name[LOV_MAXPOOLNAME];
+	struct lu_fid mea_ids[0];
+};
+
+extern void lustre_swab_lmv_stripe_md(struct lmv_stripe_md *mea);
+
+/* lmv structures */
+#define MEA_MAGIC_LAST_CHAR      0xb2221ca1
+#define MEA_MAGIC_ALL_CHARS      0xb222a11c
+#define MEA_MAGIC_HASH_SEGMENT   0xb222a11b
+
+#define MAX_HASH_SIZE_32	 0x7fffffffUL
+#define MAX_HASH_SIZE	    0x7fffffffffffffffULL
+#define MAX_HASH_HIGHEST_BIT     0x1000000000000000ULL
+
+enum fld_rpc_opc {
+	FLD_QUERY		       = 900,
+	FLD_LAST_OPC,
+	FLD_FIRST_OPC		   = FLD_QUERY
+};
+
+enum seq_rpc_opc {
+	SEQ_QUERY		       = 700,
+	SEQ_LAST_OPC,
+	SEQ_FIRST_OPC		   = SEQ_QUERY
+};
+
+enum seq_op {
+	SEQ_ALLOC_SUPER = 0,
+	SEQ_ALLOC_META = 1
+};
+
+/*
+ *  LOV data structures
+ */
+
+#define LOV_MAX_UUID_BUFFER_SIZE  8192
+/* The size of the buffer the lov/mdc reserves for the
+ * array of UUIDs returned by the MDS.  With the current
+ * protocol, this will limit the max number of OSTs per LOV */
+
+#define LOV_DESC_MAGIC 0xB0CCDE5C
+
+/* LOV settings descriptor (should only contain static info) */
+struct lov_desc {
+	__u32 ld_tgt_count;		/* how many OBD's */
+	__u32 ld_active_tgt_count;	 /* how many active */
+	__u32 ld_default_stripe_count;     /* how many objects are used */
+	__u32 ld_pattern;		  /* default PATTERN_RAID0 */
+	__u64 ld_default_stripe_size;      /* in bytes */
+	__u64 ld_default_stripe_offset;    /* in bytes */
+	__u32 ld_padding_0;		/* unused */
+	__u32 ld_qos_maxage;	       /* in second */
+	__u32 ld_padding_1;		/* also fix lustre_swab_lov_desc */
+	__u32 ld_padding_2;		/* also fix lustre_swab_lov_desc */
+	struct obd_uuid ld_uuid;
+};
+
+#define ld_magic ld_active_tgt_count       /* for swabbing from llogs */
+
+extern void lustre_swab_lov_desc (struct lov_desc *ld);
+
+/*
+ *   LDLM requests:
+ */
+/* opcodes -- MUST be distinct from OST/MDS opcodes */
+typedef enum {
+	LDLM_ENQUEUE     = 101,
+	LDLM_CONVERT     = 102,
+	LDLM_CANCEL      = 103,
+	LDLM_BL_CALLBACK = 104,
+	LDLM_CP_CALLBACK = 105,
+	LDLM_GL_CALLBACK = 106,
+	LDLM_SET_INFO    = 107,
+	LDLM_LAST_OPC
+} ldlm_cmd_t;
+#define LDLM_FIRST_OPC LDLM_ENQUEUE
+
+#define RES_NAME_SIZE 4
+struct ldlm_res_id {
+	__u64 name[RES_NAME_SIZE];
+};
+
+extern void lustre_swab_ldlm_res_id (struct ldlm_res_id *id);
+
+/**
+ * Compare two resource identifiers byte for byte.
+ *
+ * \retval 1 if \a res0 and \a res1 hold identical contents
+ * \retval 0 otherwise
+ */
+static inline int ldlm_res_eq(const struct ldlm_res_id *res0,
+			      const struct ldlm_res_id *res1)
+{
+	return memcmp(res0, res1, sizeof(struct ldlm_res_id)) == 0;
+}
+
+/* lock types */
+typedef enum {
+	LCK_MINMODE = 0,
+	LCK_EX      = 1,
+	LCK_PW      = 2,
+	LCK_PR      = 4,
+	LCK_CW      = 8,
+	LCK_CR      = 16,
+	LCK_NL      = 32,
+	LCK_GROUP   = 64,
+	LCK_COS     = 128,
+	LCK_MAXMODE
+} ldlm_mode_t;
+
+#define LCK_MODE_NUM    8
+
+typedef enum {
+	LDLM_PLAIN     = 10,
+	LDLM_EXTENT    = 11,
+	LDLM_FLOCK     = 12,
+	LDLM_IBITS     = 13,
+	LDLM_MAX_TYPE
+} ldlm_type_t;
+
+#define LDLM_MIN_TYPE LDLM_PLAIN
+
+struct ldlm_extent {
+	__u64 start;
+	__u64 end;
+	__u64 gid;
+};
+
+/**
+ * Check whether two extents intersect; both bounds are treated as inclusive.
+ *
+ * Neither extent is modified, hence the const qualifiers.
+ *
+ * \retval 1 if [ex1->start, ex1->end] and [ex2->start, ex2->end] share at
+ *	     least one offset
+ * \retval 0 otherwise
+ */
+static inline int ldlm_extent_overlap(const struct ldlm_extent *ex1,
+				      const struct ldlm_extent *ex2)
+{
+	return (ex1->start <= ex2->end) && (ex2->start <= ex1->end);
+}
+
+/**
+ * Check if @ex1 contains @ex2, i.e. @ex2 lies entirely within @ex1
+ * (bounds inclusive). Neither extent is modified.
+ *
+ * \retval 1 if ex1->start <= ex2->start and ex1->end >= ex2->end
+ * \retval 0 otherwise
+ */
+static inline int ldlm_extent_contain(const struct ldlm_extent *ex1,
+				      const struct ldlm_extent *ex2)
+{
+	return (ex1->start <= ex2->start) && (ex1->end >= ex2->end);
+}
+
+struct ldlm_inodebits {
+	__u64 bits;
+};
+
+struct ldlm_flock_wire {
+	__u64 lfw_start;
+	__u64 lfw_end;
+	__u64 lfw_owner;
+	__u32 lfw_padding;
+	__u32 lfw_pid;
+};
+
+/* it's important that the fields of the ldlm_extent structure match
+ * the first fields of the ldlm_flock structure because there is only
+ * one ldlm_swab routine to process the ldlm_policy_data_t union. if
+ * this ever changes we will need to swab the union differently based
+ * on the resource type. */
+
+typedef union {
+	struct ldlm_extent l_extent;
+	struct ldlm_flock_wire l_flock;
+	struct ldlm_inodebits l_inodebits;
+} ldlm_wire_policy_data_t;
+
+extern void lustre_swab_ldlm_policy_data (ldlm_wire_policy_data_t *d);
+
+union ldlm_gl_desc {
+	struct ldlm_gl_lquota_desc	lquota_desc;
+};
+
+extern void lustre_swab_gl_desc(union ldlm_gl_desc *);
+
+struct ldlm_intent {
+	__u64 opc;
+};
+
+extern void lustre_swab_ldlm_intent (struct ldlm_intent *i);
+
+struct ldlm_resource_desc {
+	ldlm_type_t lr_type;
+	__u32 lr_padding;       /* also fix lustre_swab_ldlm_resource_desc */
+	struct ldlm_res_id lr_name;
+};
+
+extern void lustre_swab_ldlm_resource_desc (struct ldlm_resource_desc *r);
+
+struct ldlm_lock_desc {
+	struct ldlm_resource_desc l_resource;
+	ldlm_mode_t l_req_mode;
+	ldlm_mode_t l_granted_mode;
+	ldlm_wire_policy_data_t l_policy_data;
+};
+
+extern void lustre_swab_ldlm_lock_desc (struct ldlm_lock_desc *l);
+
+#define LDLM_LOCKREQ_HANDLES 2
+#define LDLM_ENQUEUE_CANCEL_OFF 1
+
+struct ldlm_request {
+	__u32 lock_flags;
+	__u32 lock_count;
+	struct ldlm_lock_desc lock_desc;
+	struct lustre_handle lock_handle[LDLM_LOCKREQ_HANDLES];
+};
+
+extern void lustre_swab_ldlm_request (struct ldlm_request *rq);
+
+/*
+ * Size in bytes of an ldlm_request buffer able to carry @count lock handles.
+ *
+ * struct ldlm_request embeds LDLM_LOCKREQ_HANDLES handles. If @type is
+ * LDLM_ENQUEUE, 1 slot is already occupied, 1 is available. Otherwise, 2 are
+ * available. Handles beyond the available slots are accounted for as extra
+ * struct lustre_handle entries appended after the request.
+ *
+ * Both arguments are parenthesized in the expansion so that arbitrary
+ * expressions can be passed safely. Note that @count is evaluated more than
+ * once, so it must not have side effects.
+ */
+#define ldlm_request_bufsize(count, type)				\
+({									\
+	int _avail = LDLM_LOCKREQ_HANDLES;				\
+	_avail -= ((type) == LDLM_ENQUEUE ? LDLM_ENQUEUE_CANCEL_OFF : 0); \
+	sizeof(struct ldlm_request) +					\
+	((count) > _avail ? (count) - _avail : 0) *			\
+	sizeof(struct lustre_handle);					\
+})
+
+struct ldlm_reply {
+	__u32 lock_flags;
+	__u32 lock_padding;     /* also fix lustre_swab_ldlm_reply */
+	struct ldlm_lock_desc lock_desc;
+	struct lustre_handle lock_handle;
+	__u64  lock_policy_res1;
+	__u64  lock_policy_res2;
+};
+
+extern void lustre_swab_ldlm_reply (struct ldlm_reply *r);
+
+#define ldlm_flags_to_wire(flags)    ((__u32)(flags))
+#define ldlm_flags_from_wire(flags)  ((__u64)(flags))
+
+/*
+ * Opcodes for mountconf (mgs and mgc)
+ */
+typedef enum {
+	MGS_CONNECT = 250,
+	MGS_DISCONNECT,
+	MGS_EXCEPTION,	 /* node died, etc. */
+	MGS_TARGET_REG,	/* whenever target starts up */
+	MGS_TARGET_DEL,
+	MGS_SET_INFO,
+	MGS_CONFIG_READ,
+	MGS_LAST_OPC
+} mgs_cmd_t;
+#define MGS_FIRST_OPC MGS_CONNECT
+
+#define MGS_PARAM_MAXLEN 1024
+#define KEY_SET_INFO "set_info"
+
+struct mgs_send_param {
+	char	     mgs_param[MGS_PARAM_MAXLEN];
+};
+
+/* We pass this info to the MGS so it can write config logs */
+#define MTI_NAME_MAXLEN  64
+#define MTI_PARAM_MAXLEN 4096
+#define MTI_NIDS_MAX     32
+struct mgs_target_info {
+	__u32	    mti_lustre_ver;
+	__u32	    mti_stripe_index;
+	__u32	    mti_config_ver;
+	__u32	    mti_flags;
+	__u32	    mti_nid_count;
+	__u32	    mti_instance; /* Running instance of target */
+	char	     mti_fsname[MTI_NAME_MAXLEN];
+	char	     mti_svname[MTI_NAME_MAXLEN];
+	char	     mti_uuid[sizeof(struct obd_uuid)];
+	__u64	    mti_nids[MTI_NIDS_MAX];     /* host nids (lnet_nid_t)*/
+	char	     mti_params[MTI_PARAM_MAXLEN];
+};
+extern void lustre_swab_mgs_target_info(struct mgs_target_info *oinfo);
+
+struct mgs_nidtbl_entry {
+	__u64	   mne_version;    /* table version of this entry */
+	__u32	   mne_instance;   /* target instance # */
+	__u32	   mne_index;      /* target index */
+	__u32	   mne_length;     /* length of this entry - by bytes */
+	__u8	    mne_type;       /* target type LDD_F_SV_TYPE_OST/MDT */
+	__u8	    mne_nid_type;   /* type of nid(mbz). for ipv6. */
+	__u8	    mne_nid_size;   /* size of each NID, by bytes */
+	__u8	    mne_nid_count;  /* # of NIDs in buffer */
+	union {
+		lnet_nid_t nids[0];     /* variable size buffer for NIDs. */
+	} u;
+};
+extern void lustre_swab_mgs_nidtbl_entry(struct mgs_nidtbl_entry *oinfo);
+
+struct mgs_config_body {
+	char     mcb_name[MTI_NAME_MAXLEN]; /* logname */
+	__u64    mcb_offset;    /* next index of config log to request */
+	__u16    mcb_type;      /* type of log: CONFIG_T_[CONFIG|RECOVER] */
+	__u8     mcb_reserved;
+	__u8     mcb_bits;      /* bits unit size of config log */
+	__u32    mcb_units;     /* # of units for bulk transfer */
+};
+extern void lustre_swab_mgs_config_body(struct mgs_config_body *body);
+
+struct mgs_config_res {
+	__u64    mcr_offset;    /* index of last config log */
+	__u64    mcr_size;      /* size of the log */
+};
+extern void lustre_swab_mgs_config_res(struct mgs_config_res *body);
+
+/* Config marker flags (in config log) */
+#define CM_START       0x01
+#define CM_END	 0x02
+#define CM_SKIP	0x04
+#define CM_UPGRADE146  0x08
+#define CM_EXCLUDE     0x10
+#define CM_START_SKIP (CM_START | CM_SKIP)
+
+struct cfg_marker {
+	__u32	     cm_step;       /* aka config version */
+	__u32	     cm_flags;
+	__u32	     cm_vers;       /* lustre release version number */
+	__u32	     cm_padding;    /* 64 bit align */
+	obd_time	  cm_createtime; /*when this record was first created */
+	obd_time	  cm_canceltime; /*when this record is no longer valid*/
+	char	      cm_tgtname[MTI_NAME_MAXLEN];
+	char	      cm_comment[MTI_NAME_MAXLEN];
+};
+
+extern void lustre_swab_cfg_marker(struct cfg_marker *marker,
+				   int swab, int size);
+
+/*
+ * Opcodes for multiple servers.
+ */
+
+typedef enum {
+	OBD_PING = 400,
+	OBD_LOG_CANCEL,
+	OBD_QC_CALLBACK,
+	OBD_IDX_READ,
+	OBD_LAST_OPC
+} obd_cmd_t;
+#define OBD_FIRST_OPC OBD_PING
+
+/* catalog of log objects */
+
+/** Identifier for a single log object */
+struct llog_logid {
+	struct ost_id		lgl_oi;
+	__u32		   lgl_ogen;
+} __attribute__((packed));
+
+/** Records written to the CATALOGS list */
+#define CATLIST "CATALOGS"
+struct llog_catid {
+	struct llog_logid       lci_logid;
+	__u32		   lci_padding1;
+	__u32		   lci_padding2;
+	__u32		   lci_padding3;
+} __attribute__((packed));
+
+/* Log data record types - there is no specific reason that these need to
+ * be related to the RPC opcodes, but no reason not to (may be handy later?)
+ */
+#define LLOG_OP_MAGIC 0x10600000
+#define LLOG_OP_MASK  0xfff00000
+
+typedef enum {
+	LLOG_PAD_MAGIC		= LLOG_OP_MAGIC | 0x00000,
+	OST_SZ_REC		= LLOG_OP_MAGIC | 0x00f00,
+	/* OST_RAID1_REC	= LLOG_OP_MAGIC | 0x01000, never used */
+	MDS_UNLINK_REC		= LLOG_OP_MAGIC | 0x10000 | (MDS_REINT << 8) |
+				  REINT_UNLINK, /* obsolete after 2.5.0 */
+	MDS_UNLINK64_REC	= LLOG_OP_MAGIC | 0x90000 | (MDS_REINT << 8) |
+				  REINT_UNLINK,
+	/* MDS_SETATTR_REC	= LLOG_OP_MAGIC | 0x12401, obsolete 1.8.0 */
+	MDS_SETATTR64_REC	= LLOG_OP_MAGIC | 0x90000 | (MDS_REINT << 8) |
+				  REINT_SETATTR,
+	OBD_CFG_REC		= LLOG_OP_MAGIC | 0x20000,
+	/* PTL_CFG_REC		= LLOG_OP_MAGIC | 0x30000, obsolete 1.4.0 */
+	LLOG_GEN_REC		= LLOG_OP_MAGIC | 0x40000,
+	/* LLOG_JOIN_REC	= LLOG_OP_MAGIC | 0x50000, obsolete  1.8.0 */
+	CHANGELOG_REC		= LLOG_OP_MAGIC | 0x60000,
+	CHANGELOG_USER_REC	= LLOG_OP_MAGIC | 0x70000,
+	LLOG_HDR_MAGIC		= LLOG_OP_MAGIC | 0x45539,
+	LLOG_LOGID_MAGIC	= LLOG_OP_MAGIC | 0x4553b,
+} llog_op_type;
+
+#define LLOG_REC_HDR_NEEDS_SWABBING(r) \
+	(((r)->lrh_type & __swab32(LLOG_OP_MASK)) == __swab32(LLOG_OP_MAGIC))
+
+/** Log record header - stored in little endian order.
+ * Each record must start with this struct, end with a llog_rec_tail,
+ * and be a multiple of 256 bits in size.
+ */
+struct llog_rec_hdr {
+	__u32	lrh_len;
+	__u32	lrh_index;
+	__u32	lrh_type;
+	__u32	lrh_id;
+};
+
+struct llog_rec_tail {
+	__u32	lrt_len;
+	__u32	lrt_index;
+};
+
+/*
+ * REC_DATA(ptr): address of the data that follows just after the
+ * llog_rec_hdr located at @ptr.
+ * REC_DATA_LEN(rec): lrh_len of @rec minus the sizes of the record header
+ * and of the record tail.
+ *
+ * The macro arguments are parenthesized so that expressions such as
+ * REC_DATA(p + 1) apply the cast and member access to the whole argument.
+ */
+#define REC_DATA(ptr)						\
+	((void *)((char *)(ptr) + sizeof(struct llog_rec_hdr)))
+
+#define REC_DATA_LEN(rec)					\
+	((rec)->lrh_len - sizeof(struct llog_rec_hdr) -		\
+	 sizeof(struct llog_rec_tail))
+
+struct llog_logid_rec {
+	struct llog_rec_hdr	lid_hdr;
+	struct llog_logid	lid_id;
+	__u32			lid_padding1;
+	__u64			lid_padding2;
+	__u64			lid_padding3;
+	struct llog_rec_tail	lid_tail;
+} __attribute__((packed));
+
+struct llog_unlink_rec {
+	struct llog_rec_hdr	lur_hdr;
+	obd_id			lur_oid;
+	obd_count		lur_oseq;
+	obd_count		lur_count;
+	struct llog_rec_tail	lur_tail;
+} __attribute__((packed));
+
+struct llog_unlink64_rec {
+	struct llog_rec_hdr	lur_hdr;
+	struct lu_fid		lur_fid;
+	obd_count		lur_count; /* to destroy the lost precreated */
+	__u32			lur_padding1;
+	__u64			lur_padding2;
+	__u64			lur_padding3;
+	struct llog_rec_tail    lur_tail;
+} __attribute__((packed));
+
+struct llog_setattr64_rec {
+	struct llog_rec_hdr	lsr_hdr;
+	struct ost_id		lsr_oi;
+	__u32			lsr_uid;
+	__u32			lsr_uid_h;
+	__u32			lsr_gid;
+	__u32			lsr_gid_h;
+	__u64			lsr_padding;
+	struct llog_rec_tail    lsr_tail;
+} __attribute__((packed));
+
+struct llog_size_change_rec {
+	struct llog_rec_hdr	lsc_hdr;
+	struct ll_fid		lsc_fid;
+	__u32			lsc_ioepoch;
+	__u32			lsc_padding1;
+	__u64			lsc_padding2;
+	__u64			lsc_padding3;
+	struct llog_rec_tail	lsc_tail;
+} __attribute__((packed));
+
+#define CHANGELOG_MAGIC 0xca103000
+
+/** \a changelog_rec_type's that can't be masked */
+#define CHANGELOG_MINMASK (1 << CL_MARK)
+/** bits covering all \a changelog_rec_type's */
+#define CHANGELOG_ALLMASK 0XFFFFFFFF
+/**
+ * default \a changelog_rec_type mask: every bit of CHANGELOG_ALLMASK except
+ * CL_ATIME and CL_CLOSE. The expansion is fully parenthesized so that it can
+ * be combined with other operators (e.g. ==, |, ~) without precedence
+ * surprises.
+ */
+#define CHANGELOG_DEFMASK \
+	(CHANGELOG_ALLMASK & ~(1 << CL_ATIME | 1 << CL_CLOSE))
+
+/* changelog llog name, needed by client replicators */
+#define CHANGELOG_CATALOG "changelog_catalog"
+
+struct changelog_setinfo {
+	__u64 cs_recno;
+	__u32 cs_id;
+} __attribute__((packed));
+
+/** changelog record: a changelog_rec wrapped in an llog header and tail */
+struct llog_changelog_rec {
+	struct llog_rec_hdr  cr_hdr;
+	struct changelog_rec cr;
+	struct llog_rec_tail cr_tail; /**< for_sizeof_only */
+} __attribute__((packed));
+
+/** changelog record carrying a changelog_ext_rec instead of a changelog_rec */
+struct llog_changelog_ext_rec {
+	struct llog_rec_hdr      cr_hdr;
+	struct changelog_ext_rec cr;
+	struct llog_rec_tail     cr_tail; /**< for_sizeof_only */
+} __attribute__((packed));
+
+#define CHANGELOG_USER_PREFIX "cl"
+
+struct llog_changelog_user_rec {
+	struct llog_rec_hdr   cur_hdr;
+	__u32		 cur_id;
+	__u32		 cur_padding;
+	__u64		 cur_endrec;
+	struct llog_rec_tail  cur_tail;
+} __attribute__((packed));
+
+/* Old llog gen for compatibility */
+struct llog_gen {
+	__u64 mnt_cnt;
+	__u64 conn_cnt;
+} __attribute__((packed));
+
+struct llog_gen_rec {
+	struct llog_rec_hdr	lgr_hdr;
+	struct llog_gen		lgr_gen;
+	__u64			padding1;
+	__u64			padding2;
+	__u64			padding3;
+	struct llog_rec_tail	lgr_tail;
+};
+
+/* On-disk header structure of each log object, stored in little endian order */
+#define LLOG_CHUNK_SIZE	 8192
+#define LLOG_HEADER_SIZE	(96)
+#define LLOG_BITMAP_BYTES       (LLOG_CHUNK_SIZE - LLOG_HEADER_SIZE)
+
+#define LLOG_MIN_REC_SIZE       (24) /* round(llog_rec_hdr + llog_rec_tail) */
+
+/* flags for the logs */
+enum llog_flag {
+	LLOG_F_ZAP_WHEN_EMPTY	= 0x1,
+	LLOG_F_IS_CAT		= 0x2,
+	LLOG_F_IS_PLAIN		= 0x4,
+};
+
+struct llog_log_hdr {
+	struct llog_rec_hdr     llh_hdr;
+	obd_time		llh_timestamp;
+	__u32		   llh_count;
+	__u32		   llh_bitmap_offset;
+	__u32		   llh_size;
+	__u32		   llh_flags;
+	__u32		   llh_cat_idx;
+	/* for a catalog the first plain slot is next to it */
+	struct obd_uuid	 llh_tgtuuid;
+	__u32		   llh_reserved[LLOG_HEADER_SIZE/sizeof(__u32) - 23];
+	__u32		   llh_bitmap[LLOG_BITMAP_BYTES/sizeof(__u32)];
+	struct llog_rec_tail    llh_tail;
+} __attribute__((packed));
+
+/*
+ * Number of bits in the bitmap of log header @llh: the record length minus
+ * the bitmap offset and the size of the tail, times 8.
+ * @llh is parenthesized so that any pointer expression can be passed; it is
+ * evaluated more than once and must not have side effects.
+ */
+#define LLOG_BITMAP_SIZE(llh)  ((__u32)(((llh)->llh_hdr.lrh_len -	\
+					 (llh)->llh_bitmap_offset -	\
+					 sizeof((llh)->llh_tail)) * 8))
+
+/** log cookies are used to reference a specific log file and a record therein */
+struct llog_cookie {
+	struct llog_logid       lgc_lgl;
+	__u32		   lgc_subsys;
+	__u32		   lgc_index;
+	__u32		   lgc_padding;
+} __attribute__((packed));
+
+/** llog protocol */
+enum llogd_rpc_ops {
+	LLOG_ORIGIN_HANDLE_CREATE       = 501,
+	LLOG_ORIGIN_HANDLE_NEXT_BLOCK   = 502,
+	LLOG_ORIGIN_HANDLE_READ_HEADER  = 503,
+	LLOG_ORIGIN_HANDLE_WRITE_REC    = 504,
+	LLOG_ORIGIN_HANDLE_CLOSE	= 505,
+	LLOG_ORIGIN_CONNECT	     = 506,
+	LLOG_CATINFO			= 507,  /* deprecated */
+	LLOG_ORIGIN_HANDLE_PREV_BLOCK   = 508,
+	LLOG_ORIGIN_HANDLE_DESTROY      = 509,  /* for destroy llog object*/
+	LLOG_LAST_OPC,
+	LLOG_FIRST_OPC		  = LLOG_ORIGIN_HANDLE_CREATE
+};
+
+struct llogd_body {
+	struct llog_logid  lgd_logid;
+	__u32 lgd_ctxt_idx;
+	__u32 lgd_llh_flags;
+	__u32 lgd_index;
+	__u32 lgd_saved_index;
+	__u32 lgd_len;
+	__u64 lgd_cur_offset;
+} __attribute__((packed));
+
+struct llogd_conn_body {
+	struct llog_gen	 lgdc_gen;
+	struct llog_logid       lgdc_logid;
+	__u32		   lgdc_ctxt_idx;
+} __attribute__((packed));
+
+/* Note: 64-bit types are 64-bit aligned in structure */
+struct obdo {
+	obd_valid	       o_valid;	/* hot fields in this obdo */
+	struct ost_id	   o_oi;
+	obd_id		  o_parent_seq;
+	obd_size		o_size;	 /* o_size-o_blocks == ost_lvb */
+	obd_time		o_mtime;
+	obd_time		o_atime;
+	obd_time		o_ctime;
+	obd_blocks	      o_blocks;       /* brw: cli sent cached bytes */
+	obd_size		o_grant;
+
+	/* 32-bit fields start here: keep an even number of them via padding */
+	obd_blksize	     o_blksize;      /* optimal IO blocksize */
+	obd_mode		o_mode;	 /* brw: cli sent cache remain */
+	obd_uid		 o_uid;
+	obd_gid		 o_gid;
+	obd_flag		o_flags;
+	obd_count	       o_nlink;	/* brw: checksum */
+	obd_count	       o_parent_oid;
+	obd_count		o_misc;		/* brw: o_dropped */
+
+	__u64		   o_ioepoch;      /* epoch in ost writes */
+	__u32		   o_stripe_idx;   /* holds stripe idx */
+	__u32		   o_parent_ver;
+	struct lustre_handle    o_handle;       /* brw: lock handle to prolong
+						 * locks */
+	struct llog_cookie      o_lcookie;      /* destroy: unlink cookie from
+						 * MDS */
+	__u32			o_uid_h;
+	__u32			o_gid_h;
+
+	__u64			o_data_version; /* getattr: sum of iversion for
+						 * each stripe.
+						 * brw: grant space consumed on
+						 * the client for the write */
+	__u64			o_padding_4;
+	__u64			o_padding_5;
+	__u64			o_padding_6;
+};
+
+#define o_dirty   o_blocks
+#define o_undirty o_mode
+#define o_dropped o_misc
+#define o_cksum   o_nlink
+#define o_grant_used o_data_version
+
+/*
+ * Prepare the wire obdo @wobdo from the local obdo @lobdo.
+ *
+ * @lobdo is copied verbatim into @wobdo, then the OBD_FL_LOCAL_MASK bits are
+ * stripped from the copy. When connect data @ocd is supplied, does not have
+ * OBD_CONNECT_FID set, and the object sequence is an echo sequence, the
+ * id/seq pair of the wire object id is filled in from the local FID.
+ */
+static inline void lustre_set_wire_obdo(struct obd_connect_data *ocd,
+					struct obdo *wobdo, struct obdo *lobdo)
+{
+	memcpy(wobdo, lobdo, sizeof(*wobdo));
+	wobdo->o_flags = wobdo->o_flags & ~OBD_FL_LOCAL_MASK;
+
+	if (ocd != NULL &&
+	    unlikely(!(ocd->ocd_connect_flags & OBD_CONNECT_FID)) &&
+	    fid_seq_is_echo(ostid_seq(&lobdo->o_oi))) {
+		/* Currently OBD_FL_OSTID will only be used when a 2.4 echo
+		 * client communicates with a pre-2.4 server */
+		wobdo->o_oi.oi.oi_id = fid_oid(&lobdo->o_oi.oi_fid);
+		wobdo->o_oi.oi.oi_seq = fid_seq(&lobdo->o_oi.oi_fid);
+	}
+}
+
+/*
+ * Fill the local obdo @lobdo from the wire obdo @wobdo.
+ *
+ * Any OBD_FL_LOCAL_MASK bits set in @lobdo (when it has OBD_MD_FLFLAGS
+ * valid) are saved before the copy and put back afterwards. @wobdo is
+ * asserted not to carry any OBD_FL_LOCAL_MASK bits. When connect data @ocd
+ * is NULL, no object-id to FID conversion is attempted.
+ */
+static inline void lustre_get_wire_obdo(struct obd_connect_data *ocd,
+					struct obdo *lobdo, struct obdo *wobdo)
+{
+	obd_flag local_flags = 0;
+
+	/* remember the local-only flags: the memcpy below overwrites them */
+	if (lobdo->o_valid & OBD_MD_FLFLAGS)
+		 local_flags = lobdo->o_flags & OBD_FL_LOCAL_MASK;
+
+	LASSERT(!(wobdo->o_flags & OBD_FL_LOCAL_MASK));
+
+	memcpy(lobdo, wobdo, sizeof(*lobdo));
+	if (local_flags != 0) {
+		/* restore the saved local flags on top of the wire flags */
+		lobdo->o_valid |= OBD_MD_FLFLAGS;
+		lobdo->o_flags &= ~OBD_FL_LOCAL_MASK;
+		lobdo->o_flags |= local_flags;
+	}
+	if (ocd == NULL)
+		return;
+
+	if (unlikely(!(ocd->ocd_connect_flags & OBD_CONNECT_FID)) &&
+	    fid_seq_is_echo(wobdo->o_oi.oi.oi_seq)) {
+		/* OBD_CONNECT_FID is not set and the wire sequence is an echo
+		 * sequence: rebuild the local FID from the wire oi_seq/oi_id
+		 * (the reverse of what lustre_set_wire_obdo() does) */
+		lobdo->o_oi.oi_fid.f_seq = wobdo->o_oi.oi.oi_seq;
+		lobdo->o_oi.oi_fid.f_oid = wobdo->o_oi.oi.oi_id;
+		lobdo->o_oi.oi_fid.f_ver = 0;
+	}
+}
+
+extern void lustre_swab_obdo (struct obdo *o);
+
+/* request structure for OST's */
+struct ost_body {
+	struct  obdo oa;
+};
+
+/* Key for FIEMAP to be used in get_info calls */
+struct ll_fiemap_info_key {
+	char    name[8];
+	struct  obdo oa;
+	struct  ll_user_fiemap fiemap;
+};
+
+extern void lustre_swab_ost_body (struct ost_body *b);
+extern void lustre_swab_ost_last_id(obd_id *id);
+extern void lustre_swab_fiemap(struct ll_user_fiemap *fiemap);
+
+extern void lustre_swab_lov_user_md_v1(struct lov_user_md_v1 *lum);
+extern void lustre_swab_lov_user_md_v3(struct lov_user_md_v3 *lum);
+extern void lustre_swab_lov_user_md_objects(struct lov_user_ost_data *lod,
+					    int stripe_count);
+extern void lustre_swab_lov_mds_md(struct lov_mds_md *lmm);
+
+/* llog_swab.c */
+extern void lustre_swab_llogd_body (struct llogd_body *d);
+extern void lustre_swab_llog_hdr (struct llog_log_hdr *h);
+extern void lustre_swab_llogd_conn_body (struct llogd_conn_body *d);
+extern void lustre_swab_llog_rec(struct llog_rec_hdr *rec);
+extern void lustre_swab_llog_id(struct llog_logid *lid);
+
+struct lustre_cfg;
+extern void lustre_swab_lustre_cfg(struct lustre_cfg *lcfg);
+
+/* Functions for dumping PTLRPC fields */
+void dump_rniobuf(struct niobuf_remote *rnb);
+void dump_ioo(struct obd_ioobj *nb);
+void dump_obdo(struct obdo *oa);
+void dump_ost_body(struct ost_body *ob);
+void dump_rcs(__u32 *rc);
+
+#define IDX_INFO_MAGIC 0x3D37CC37
+
+/* Index file transfer through the network. The server serializes the index into
+ * a byte stream which is sent to the client via a bulk transfer */
+struct idx_info {
+	__u32		ii_magic;
+
+	/* reply: see idx_info_flags below */
+	__u32		ii_flags;
+
+	/* request & reply: number of lu_idxpage (to be) transferred */
+	__u16		ii_count;
+	__u16		ii_pad0;
+
+	/* request: requested attributes passed down to the iterator API */
+	__u32		ii_attrs;
+
+	/* request & reply: index file identifier (FID) */
+	struct lu_fid	ii_fid;
+
+	/* reply: version of the index file before starting to walk the index.
+	 * Please note that the version can be modified at any time during the
+	 * transfer */
+	__u64		ii_version;
+
+	/* request: hash to start with:
+	 * reply: hash of the first entry of the first lu_idxpage and hash
+	 *	of the entry to read next if any */
+	__u64		ii_hash_start;
+	__u64		ii_hash_end;
+
+	/* reply: size of keys in lu_idxpages, minimal one if II_FL_VARKEY is
+	 * set */
+	__u16		ii_keysize;
+
+	/* reply: size of records in lu_idxpages, minimal one if II_FL_VARREC
+	 * is set */
+	__u16		ii_recsize;
+
+	__u32		ii_pad1;
+	__u64		ii_pad2;
+	__u64		ii_pad3;
+};
+extern void lustre_swab_idx_info(struct idx_info *ii);
+
+#define II_END_OFF	MDS_DIR_END_OFF /* all entries have been read */
+
+/* List of flags used in idx_info::ii_flags */
+enum idx_info_flags {
+	II_FL_NOHASH	= 1 << 0, /* client doesn't care about hash value */
+	II_FL_VARKEY	= 1 << 1, /* keys can be of variable size */
+	II_FL_VARREC	= 1 << 2, /* records can be of variable size */
+	II_FL_NONUNQ	= 1 << 3, /* index supports non-unique keys */
+};
+
+#define LIP_MAGIC 0x8A6D6B6C
+
+/* 4KB (= LU_PAGE_SIZE) container gathering key/record pairs */
+struct lu_idxpage {
+	/* 16-byte header */
+	__u32	lip_magic;
+	__u16	lip_flags;
+	__u16	lip_nr;   /* number of entries in the container */
+	__u64	lip_pad0; /* additional padding for future use */
+
+	/* key/record pairs are stored in the remaining 4080 bytes.
+	 * depending upon the flags in idx_info::ii_flags, each key/record
+	 * pair might be preceded by:
+	 * - a hash value
+	 * - the key size (II_FL_VARKEY is set)
+	 * - the record size (II_FL_VARREC is set)
+	 *
+	 * For the time being, we only support fixed-size key & record. */
+	char	lip_entries[0];
+};
+extern void lustre_swab_lip_header(struct lu_idxpage *lip);
+
+#define LIP_HDR_SIZE (offsetof(struct lu_idxpage, lip_entries))
+
+/* Gather all possible type associated with a 4KB container */
+union lu_page {
+	struct lu_dirpage	lp_dir; /* for MDS_READPAGE */
+	struct lu_idxpage	lp_idx; /* for OBD_IDX_READ */
+	char			lp_array[LU_PAGE_SIZE];
+};
+
+/* security opcodes */
+typedef enum {
+	SEC_CTX_INIT	    = 801,
+	SEC_CTX_INIT_CONT       = 802,
+	SEC_CTX_FINI	    = 803,
+	SEC_LAST_OPC,
+	SEC_FIRST_OPC	   = SEC_CTX_INIT
+} sec_cmd_t;
+
+/*
+ * capa related definitions
+ */
+#define CAPA_HMAC_MAX_LEN       64
+#define CAPA_HMAC_KEY_MAX_LEN   56
+
+/* NB take care when changing the sequence of elements of this struct,
+ * because the offset info is used in find_capa() */
+struct lustre_capa {
+	struct lu_fid   lc_fid;	 /** fid */
+	__u64	   lc_opc;	 /** operations allowed */
+	__u64	   lc_uid;	 /** file owner */
+	__u64	   lc_gid;	 /** file group */
+	__u32	   lc_flags;       /** HMAC algorithm & flags */
+	__u32	   lc_keyid;       /** key# used for the capability */
+	__u32	   lc_timeout;     /** capa timeout value (sec) */
+	__u32	   lc_expiry;      /** expiry time (sec) */
+	__u8	    lc_hmac[CAPA_HMAC_MAX_LEN];   /** HMAC */
+} __attribute__((packed));
+
+extern void lustre_swab_lustre_capa(struct lustre_capa *c);
+
+/** lustre_capa::lc_opc */
+enum {
+	CAPA_OPC_BODY_WRITE   = 1<<0,  /**< write object data */
+	CAPA_OPC_BODY_READ    = 1<<1,  /**< read object data */
+	CAPA_OPC_INDEX_LOOKUP = 1<<2,  /**< lookup object fid */
+	CAPA_OPC_INDEX_INSERT = 1<<3,  /**< insert object fid */
+	CAPA_OPC_INDEX_DELETE = 1<<4,  /**< delete object fid */
+	CAPA_OPC_OSS_WRITE    = 1<<5,  /**< write oss object data */
+	CAPA_OPC_OSS_READ     = 1<<6,  /**< read oss object data */
+	CAPA_OPC_OSS_TRUNC    = 1<<7,  /**< truncate oss object */
+	CAPA_OPC_OSS_DESTROY  = 1<<8,  /**< destroy oss object */
+	CAPA_OPC_META_WRITE   = 1<<9,  /**< write object meta data */
+	CAPA_OPC_META_READ    = 1<<10, /**< read object meta data */
+};
+
+#define CAPA_OPC_OSS_RW (CAPA_OPC_OSS_READ | CAPA_OPC_OSS_WRITE)
+#define CAPA_OPC_MDS_ONLY						   \
+	(CAPA_OPC_BODY_WRITE | CAPA_OPC_BODY_READ | CAPA_OPC_INDEX_LOOKUP | \
+	 CAPA_OPC_INDEX_INSERT | CAPA_OPC_INDEX_DELETE)
+#define CAPA_OPC_OSS_ONLY						   \
+	(CAPA_OPC_OSS_WRITE | CAPA_OPC_OSS_READ | CAPA_OPC_OSS_TRUNC |      \
+	 CAPA_OPC_OSS_DESTROY)
+#define CAPA_OPC_MDS_DEFAULT ~CAPA_OPC_OSS_ONLY
+#define CAPA_OPC_OSS_DEFAULT ~(CAPA_OPC_MDS_ONLY | CAPA_OPC_OSS_ONLY)
+
+/* MDS capability covers object capability for operations of body r/w
+ * (dir readpage/sendpage), index lookup/insert/delete and meta data r/w,
+ * while OSS capability only covers object capability for operations of
+ * oss data(file content) r/w/truncate.
+ */
+/**
+ * Test whether capability \a c has CAPA_OPC_INDEX_LOOKUP set in lc_opc.
+ * The capability is only read, hence the const qualifier.
+ *
+ * \retval 1 if the bit is set
+ * \retval 0 if it is clear
+ */
+static inline int capa_for_mds(const struct lustre_capa *c)
+{
+	return (c->lc_opc & CAPA_OPC_INDEX_LOOKUP) != 0;
+}
+
+/**
+ * Test whether capability \a c has CAPA_OPC_INDEX_LOOKUP clear in lc_opc.
+ * The capability is only read, hence the const qualifier.
+ *
+ * \retval 1 if the bit is clear
+ * \retval 0 if it is set
+ */
+static inline int capa_for_oss(const struct lustre_capa *c)
+{
+	return (c->lc_opc & CAPA_OPC_INDEX_LOOKUP) == 0;
+}
+
+/* lustre_capa::lc_hmac_alg */
+enum {
+	CAPA_HMAC_ALG_SHA1 = 1, /**< sha1 algorithm */
+	CAPA_HMAC_ALG_MAX,
+};
+
+#define CAPA_FL_MASK	    0x00ffffff
+#define CAPA_HMAC_ALG_MASK      0xff000000
+
+struct lustre_capa_key {
+	__u64   lk_seq;       /**< mds# */
+	__u32   lk_keyid;     /**< key# */
+	__u32   lk_padding;
+	__u8    lk_key[CAPA_HMAC_KEY_MAX_LEN];    /**< key */
+} __attribute__((packed));
+
+extern void lustre_swab_lustre_capa_key(struct lustre_capa_key *k);
+
+/** The link ea holds 1 \a link_ea_entry for each hardlink */
+#define LINK_EA_MAGIC 0x11EAF1DFUL
+struct link_ea_header {
+	__u32 leh_magic;
+	__u32 leh_reccount;
+	__u64 leh_len;      /* total size */
+	/* future use */
+	__u32 padding1;
+	__u32 padding2;
+};
+
+/** Hardlink data is name and parent fid.
+ * Stored in this crazy struct for maximum packing and endian-neutrality
+ */
+struct link_ea_entry {
+	/** __u16 stored big-endian, unaligned */
+	unsigned char      lee_reclen[2];
+	unsigned char      lee_parent_fid[sizeof(struct lu_fid)];
+	char	       lee_name[0];
+}__attribute__((packed));
+
+/** fid2path request/reply structure */
+struct getinfo_fid2path {
+	struct lu_fid   gf_fid;
+	__u64	   gf_recno;
+	__u32	   gf_linkno;
+	__u32	   gf_pathlen;
+	char	    gf_path[0];
+} __attribute__((packed));
+
+void lustre_swab_fid2path (struct getinfo_fid2path *gf);
+
+enum {
+	LAYOUT_INTENT_ACCESS    = 0,
+	LAYOUT_INTENT_READ      = 1,
+	LAYOUT_INTENT_WRITE     = 2,
+	LAYOUT_INTENT_GLIMPSE   = 3,
+	LAYOUT_INTENT_TRUNC     = 4,
+	LAYOUT_INTENT_RELEASE   = 5,
+	LAYOUT_INTENT_RESTORE   = 6
+};
+
+/* enqueue layout lock with intent */
+struct layout_intent {
+	__u32 li_opc; /* intent operation for enqueue, read, write etc */
+	__u32 li_flags;
+	__u64 li_start;
+	__u64 li_end;
+};
+
+void lustre_swab_layout_intent(struct layout_intent *li);
+
+/**
+ * On the wire version of hsm_progress structure.
+ *
+ * Contains the userspace hsm_progress and some internal fields.
+ */
+struct hsm_progress_kernel {
+	/* Field taken from struct hsm_progress */
+	lustre_fid		hpk_fid;
+	__u64			hpk_cookie;
+	struct hsm_extent	hpk_extent;
+	__u16			hpk_flags;
+	__u16			hpk_errval; /* positive val */
+	__u32			hpk_padding1;
+	/* Additional fields */
+	__u64			hpk_data_version;
+	__u64			hpk_padding2;
+} __attribute__((packed));
+
+/* lustre_swab_* declarations for the HSM structures (one per structure) */
+extern void lustre_swab_hsm_user_state(struct hsm_user_state *hus);
+extern void lustre_swab_hsm_current_action(struct hsm_current_action *action);
+extern void lustre_swab_hsm_progress_kernel(struct hsm_progress_kernel *hpk);
+extern void lustre_swab_hsm_user_item(struct hsm_user_item *hui);
+extern void lustre_swab_hsm_request(struct hsm_request *hr);
+
+/**
+ * These are the object update opcodes under UPDATE_OBJ, which is currently
+ * being used by cross-ref operations between MDTs.
+ *
+ * During a cross-ref operation, the master MDT, to which the client sends
+ * the request, will disassemble the operation into object updates; the OSP
+ * will then send these updates to the remote MDT to be executed.
+ *
+ *   Update request format
+ *   magic:  UPDATE_BUFFER_MAGIC_V1
+ *   Count:  How many updates in the req.
+ *   bufs[0] : following are packets of object.
+ *   update[0]:
+ *		type: object_update_op, the op code of update
+ *		fid: The object fid of the update.
+ *		lens/bufs: other parameters of the update.
+ *   update[1]:
+ *		type: object_update_op, the op code of update
+ *		fid: The object fid of the update.
+ *		lens/bufs: other parameters of the update.
+ *   ..........
+ *   update[7]:	type: object_update_op, the op code of update
+ *		fid: The object fid of the update.
+ *		lens/bufs: other parameters of the update.
+ *   Currently at most 8 updates per object update request.
+ *
+ *******************************************************************
+ *   update reply format:
+ *
+ *   ur_version: UPDATE_REPLY_V1
+ *   ur_count:   The count of the reply, which is usually equal
+ *		 to the number of updates in the request.
+ *   ur_lens:    The reply lengths of each object update.
+ *
+ *   replies:    1st update reply  [4bytes_ret: other body]
+ *		 2nd update reply  [4bytes_ret: other body]
+ *		 .....
+ *		 nth update reply  [4bytes_ret: other body]
+ *
+ *   For each reply of the update, the format would be
+ *	 result(4 bytes):Other stuff
+ */
+
+#define UPDATE_MAX_OPS		10
+#define UPDATE_BUFFER_MAGIC_V1	0xBDDE0001
+#define UPDATE_BUFFER_MAGIC	UPDATE_BUFFER_MAGIC_V1
+#define UPDATE_BUF_COUNT	8
+enum object_update_op {
+	OBJ_CREATE		= 1,
+	OBJ_DESTROY		= 2,
+	OBJ_REF_ADD		= 3,
+	OBJ_REF_DEL		= 4,
+	OBJ_ATTR_SET		= 5,
+	OBJ_ATTR_GET		= 6,
+	OBJ_XATTR_SET		= 7,
+	OBJ_XATTR_GET		= 8,
+	OBJ_INDEX_LOOKUP	= 9,
+	OBJ_INDEX_INSERT	= 10,
+	OBJ_INDEX_DELETE	= 11,
+	OBJ_LAST
+};
+
+struct update {
+	__u32		u_type;
+	__u32		u_batchid;
+	struct lu_fid	u_fid;
+	__u32		u_lens[UPDATE_BUF_COUNT];
+	__u32		u_bufs[0];
+};
+
+struct update_buf {
+	__u32	ub_magic;
+	__u32	ub_count;
+	__u32	ub_bufs[0];
+};
+
+#define UPDATE_REPLY_V1		0x00BD0001
+struct update_reply {
+	__u32	ur_version;
+	__u32	ur_count;
+	__u32	ur_lens[0];
+};
+
+void lustre_swab_update_buf(struct update_buf *ub);
+void lustre_swab_update_reply_buf(struct update_reply *ur);
+
+/** layout swap request structure
+ * fid1 and fid2 are in mdt_body
+ */
+struct mdc_swap_layouts {
+	__u64	   msl_flags;
+} __packed;
+
+void lustre_swab_swap_layouts(struct mdc_swap_layouts *msl);
+
+#endif
+/** @} lustreidl */

diff --git a/drivers/staging/lustre/lustre/include/lustre/lustre_lfsck_user.h b/drivers/staging/lustre/lustre/include/lustre/lustre_lfsck_user.h
new file mode 100644
index 0000000..1c87a61
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre/lustre_lfsck_user.h

@@ -0,0 +1,95 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License version 2 for more details.  A copy is
+ * included in the COPYING file that accompanied this code.
+
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * lustre/include/lustre/lustre_lfsck_user.h
+ *
+ * Lustre LFSCK userspace interfaces.
+ *
+ * Author: Fan Yong <yong.fan@whamcloud.com>
+ */
+
+#ifndef _LUSTRE_LFSCK_USER_H
+# define _LUSTRE_LFSCK_USER_H
+
+enum lfsck_param_flags {
+	/* Reset LFSCK iterator position to the device beginning. */
+	LPF_RESET       = 0x0001,
+
+	/* Exit when fail. */
+	LPF_FAILOUT     = 0x0002,
+
+	/* Dryrun mode, only check without modification */
+	LPF_DRYRUN      = 0x0004,
+};
+
+enum lfsck_type {
+	/* For MDT-OST consistency check/repair. */
+	LT_LAYOUT	= 0x0001,
+
+	/* For MDT-MDT consistency check/repair. */
+	LT_DNE		= 0x0002,
+
+	/* For FID-in-dirent and linkEA consistency check/repair. */
+	LT_NAMESPACE	= 0x0004,
+};
+
+#define LFSCK_VERSION_V1	1
+#define LFSCK_VERSION_V2	2
+
+#define LFSCK_TYPES_ALL		((__u16)(~0))
+#define LFSCK_TYPES_DEF		((__u16)0)
+#define LFSCK_TYPES_SUPPORTED	LT_NAMESPACE
+
+#define LFSCK_SPEED_NO_LIMIT	0
+#define LFSCK_SPEED_LIMIT_DEF	LFSCK_SPEED_NO_LIMIT
+
+enum lfsck_start_valid {
+	LSV_SPEED_LIMIT		= 0x00000001,
+	LSV_ERROR_HANDLE	= 0x00000002,
+	LSV_DRYRUN		= 0x00000004,
+};
+
+/* Arguments for starting lfsck. */
+struct lfsck_start {
+	/* Which arguments are valid, see 'enum lfsck_start_valid'. */
+	__u32   ls_valid;
+
+	/* How many items can be scanned at most per second. */
+	__u32   ls_speed_limit;
+
+	/* For compatibility between user space tools and kernel service. */
+	__u16   ls_version;
+
+	/* Which LFSCK components to be (have been) started. */
+	__u16   ls_active;
+
+	/* Flags for the LFSCK, see 'enum lfsck_param_flags'. */
+	__u16   ls_flags;
+
+	/* Padding for 64-bit alignment. */
+	__u16   ls_padding;
+};
+
+#endif /* _LUSTRE_LFSCK_USER_H */

diff --git a/drivers/staging/lustre/lustre/include/lustre/lustre_user.h b/drivers/staging/lustre/lustre/include/lustre/lustre_user.h
new file mode 100644
index 0000000..7e9f575
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre/lustre_user.h

@@ -0,0 +1,1145 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2010, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre/lustre_user.h
+ *
+ * Lustre public user-space interface definitions.
+ */
+
+#ifndef _LUSTRE_USER_H
+#define _LUSTRE_USER_H
+
+/** \defgroup lustreuser lustreuser
+ *
+ * @{
+ */
+
+#include <lustre/ll_fiemap.h>
+#include <linux/lustre_user.h>
+
+/* for statfs() */
+#define LL_SUPER_MAGIC 0x0BD00BD0
+
+#ifndef FSFILT_IOC_GETFLAGS
+#define FSFILT_IOC_GETFLAGS	       _IOR('f', 1, long)
+#define FSFILT_IOC_SETFLAGS	       _IOW('f', 2, long)
+#define FSFILT_IOC_GETVERSION	     _IOR('f', 3, long)
+#define FSFILT_IOC_SETVERSION	     _IOW('f', 4, long)
+#define FSFILT_IOC_GETVERSION_OLD	 _IOR('v', 1, long)
+#define FSFILT_IOC_SETVERSION_OLD	 _IOW('v', 2, long)
+#define FSFILT_IOC_FIEMAP		 _IOWR('f', 11, struct ll_user_fiemap)
+#endif
+
+/* FIEMAP flags supported by Lustre */
+#define LUSTRE_FIEMAP_FLAGS_COMPAT (FIEMAP_FLAG_SYNC | FIEMAP_FLAG_DEVICE_ORDER)
+
+enum obd_statfs_state {
+	OS_STATE_DEGRADED       = 0x00000001, /**< RAID degraded/rebuilding */
+	OS_STATE_READONLY       = 0x00000002, /**< filesystem is read-only */
+	OS_STATE_RDONLY_1       = 0x00000004, /**< obsolete 1.6, was EROFS=30 */
+	OS_STATE_RDONLY_2       = 0x00000008, /**< obsolete 1.6, was EROFS=30 */
+	OS_STATE_RDONLY_3       = 0x00000010, /**< obsolete 1.6, was EROFS=30 */
+};
+
+struct obd_statfs {
+	__u64	   os_type;
+	__u64	   os_blocks;
+	__u64	   os_bfree;
+	__u64	   os_bavail;
+	__u64	   os_files;
+	__u64	   os_ffree;
+	__u8	    os_fsid[40];
+	__u32	   os_bsize;
+	__u32	   os_namelen;
+	__u64	   os_maxbytes;
+	__u32	   os_state;       /**< obd_statfs_state OS_STATE_* flag */
+	__u32	   os_fprecreated;	/* objs available now to the caller */
+					/* used in QoS code to find preferred
+					 * OSTs */
+	__u32	   os_spare2;
+	__u32	   os_spare3;
+	__u32	   os_spare4;
+	__u32	   os_spare5;
+	__u32	   os_spare6;
+	__u32	   os_spare7;
+	__u32	   os_spare8;
+	__u32	   os_spare9;
+};
+
+/**
+ * File IDentifier.
+ *
+ * FID is a cluster-wide unique identifier of a file or an object (stripe).
+ * FIDs are never reused.
+ **/
+struct lu_fid {
+       /**
+	* FID sequence. Sequence is a unit of migration: all files (objects)
+	* with FIDs from a given sequence are stored on the same server.
+	* Lustre should support 2^64 objects, so even if each sequence
+	* has only a single object we can still enumerate 2^64 objects.
+	**/
+	__u64 f_seq;
+	/* FID number within sequence. */
+	__u32 f_oid;
+	/**
+	 * FID version, used to distinguish different versions (in the sense
+	 * of snapshots, etc.) of the same file system object. Not currently
+	 * used.
+	 **/
+	__u32 f_ver;
+};
+
+struct filter_fid {
+	struct lu_fid	ff_parent;  /* ff_parent.f_ver == file stripe number */
+};
+
+/* keep this one for compatibility */
+struct filter_fid_old {
+	struct lu_fid	ff_parent;
+	__u64		ff_objid;
+	__u64		ff_seq;
+};
+
+/* Userspace should treat lu_fid as opaque, and only use the following methods
+ * to print or parse them.  Other functions (e.g. compare, swab) could be moved
+ * here from lustre_idl.h if needed. */
+typedef struct lu_fid lustre_fid;
+
+/**
+ * The following struct holds object attributes kept in the inode's EA.
+ * Introduced in the 2.0 release (please see b15993 for details).
+ * Added to all objects since Lustre 2.4, as it contains the self FID.
+ */
+struct lustre_mdt_attrs {
+	/**
+	 * Bitfield for supported data in this structure. From enum lma_compat.
+	 * lma_self_fid and lma_flags are always available.
+	 */
+	__u32   lma_compat;
+	/**
+	 * Per-file incompat feature list. Lustre version should support all
+	 * flags set in this field. The supported feature mask is available in
+	 * LMA_INCOMPAT_SUPP.
+	 */
+	__u32   lma_incompat;
+	/** FID of this inode */
+	struct lu_fid  lma_self_fid;
+};
+
+/**
+ * Prior to 2.4, the LMA structure also included SOM attributes, which have
+ * since been moved to a dedicated xattr.
+ * lma_flags was also removed because of lma_compat/incompat fields.
+ */
+#define LMA_OLD_SIZE (sizeof(struct lustre_mdt_attrs) + 5 * sizeof(__u64))
+
+/**
+ * OST object IDentifier.
+ */
+struct ost_id {
+	union {
+		struct ostid {
+			__u64	oi_id;
+			__u64	oi_seq;
+		} oi;
+		struct lu_fid oi_fid;
+	};
+};
+
+#define DOSTID LPX64":"LPU64
+#define POSTID(oi) ostid_seq(oi), ostid_id(oi)
+
+/*
+ * The ioctl naming rules:
+ * LL_*     - works on the currently opened filehandle instead of parent dir
+ * *_OBD_*  - gets data for both OSC or MDC (LOV, LMV indirectly)
+ * *_MDC_*  - gets/sets data related to MDC
+ * *_LOV_*  - gets/sets data related to OSC/LOV
+ * *FILE*   - called on parent dir and passes in a filename
+ * *STRIPE* - set/get lov_user_md
+ * *INFO    - set/get lov_user_mds_data
+ */
+/* see <lustre_lib.h> for ioctl numbers 101-150 */
+#define LL_IOC_GETFLAGS		 _IOR ('f', 151, long)
+#define LL_IOC_SETFLAGS		 _IOW ('f', 152, long)
+#define LL_IOC_CLRFLAGS		 _IOW ('f', 153, long)
+/* LL_IOC_LOV_SETSTRIPE: See also OBD_IOC_LOV_SETSTRIPE */
+#define LL_IOC_LOV_SETSTRIPE	    _IOW ('f', 154, long)
+/* LL_IOC_LOV_GETSTRIPE: See also OBD_IOC_LOV_GETSTRIPE */
+#define LL_IOC_LOV_GETSTRIPE	    _IOW ('f', 155, long)
+/* LL_IOC_LOV_SETEA: See also OBD_IOC_LOV_SETEA */
+#define LL_IOC_LOV_SETEA		_IOW ('f', 156, long)
+#define LL_IOC_RECREATE_OBJ	     _IOW ('f', 157, long)
+#define LL_IOC_RECREATE_FID	     _IOW ('f', 157, struct lu_fid)
+#define LL_IOC_GROUP_LOCK	       _IOW ('f', 158, long)
+#define LL_IOC_GROUP_UNLOCK	     _IOW ('f', 159, long)
+/* LL_IOC_QUOTACHECK: See also OBD_IOC_QUOTACHECK */
+#define LL_IOC_QUOTACHECK	       _IOW ('f', 160, int)
+/* LL_IOC_POLL_QUOTACHECK: See also OBD_IOC_POLL_QUOTACHECK */
+#define LL_IOC_POLL_QUOTACHECK	  _IOR ('f', 161, struct if_quotacheck *)
+/* LL_IOC_QUOTACTL: See also OBD_IOC_QUOTACTL */
+#define LL_IOC_QUOTACTL		 _IOWR('f', 162, struct if_quotactl)
+#define IOC_OBD_STATFS		  _IOWR('f', 164, struct obd_statfs *)
+#define IOC_LOV_GETINFO		 _IOWR('f', 165, struct lov_user_mds_data *)
+#define LL_IOC_FLUSHCTX		 _IOW ('f', 166, long)
+#define LL_IOC_RMTACL		   _IOW ('f', 167, long)
+#define LL_IOC_GETOBDCOUNT	      _IOR ('f', 168, long)
+#define LL_IOC_LLOOP_ATTACH	     _IOWR('f', 169, long)
+#define LL_IOC_LLOOP_DETACH	     _IOWR('f', 170, long)
+#define LL_IOC_LLOOP_INFO	       _IOWR('f', 171, struct lu_fid)
+#define LL_IOC_LLOOP_DETACH_BYDEV       _IOWR('f', 172, long)
+#define LL_IOC_PATH2FID		 _IOR ('f', 173, long)
+#define LL_IOC_GET_CONNECT_FLAGS	_IOWR('f', 174, __u64 *)
+#define LL_IOC_GET_MDTIDX	       _IOR ('f', 175, int)
+
+/* see <lustre_lib.h> for ioctl numbers 177-210 */
+
+#define LL_IOC_HSM_STATE_GET		_IOR('f', 211, struct hsm_user_state)
+#define LL_IOC_HSM_STATE_SET		_IOW('f', 212, struct hsm_state_set)
+#define LL_IOC_HSM_CT_START		_IOW('f', 213, struct lustre_kernelcomm)
+#define LL_IOC_HSM_COPY_START		_IOW('f', 214, struct hsm_copy *)
+#define LL_IOC_HSM_COPY_END		_IOW('f', 215, struct hsm_copy *)
+#define LL_IOC_HSM_PROGRESS		_IOW('f', 216, struct hsm_user_request)
+#define LL_IOC_HSM_REQUEST		_IOW('f', 217, struct hsm_user_request)
+#define LL_IOC_DATA_VERSION		_IOR('f', 218, struct ioc_data_version)
+#define LL_IOC_LOV_SWAP_LAYOUTS		_IOW('f', 219, \
+						struct lustre_swap_layouts)
+#define LL_IOC_HSM_ACTION		_IOR('f', 220, \
+						struct hsm_current_action)
+/* see <lustre_lib.h> for ioctl numbers 221-232 */
+
+#define LL_IOC_LMV_SETSTRIPE	    _IOWR('f', 240, struct lmv_user_md)
+#define LL_IOC_LMV_GETSTRIPE	    _IOWR('f', 241, struct lmv_user_md)
+#define LL_IOC_REMOVE_ENTRY	    _IOWR('f', 242, __u64)
+
+#define LL_STATFS_LMV	   1
+#define LL_STATFS_LOV	   2
+#define LL_STATFS_NODELAY	4
+
+#define IOC_MDC_TYPE	    'i'
+#define IOC_MDC_LOOKUP	  _IOWR(IOC_MDC_TYPE, 20, struct obd_device *)
+#define IOC_MDC_GETFILESTRIPE   _IOWR(IOC_MDC_TYPE, 21, struct lov_user_md *)
+#define IOC_MDC_GETFILEINFO     _IOWR(IOC_MDC_TYPE, 22, struct lov_user_mds_data *)
+#define LL_IOC_MDC_GETINFO      _IOWR(IOC_MDC_TYPE, 23, struct lov_user_mds_data *)
+
+/* Keep these for backward compatibility. */
+#define LL_IOC_OBD_STATFS       IOC_OBD_STATFS
+#define IOC_MDC_GETSTRIPE       IOC_MDC_GETFILESTRIPE
+
+
+#define MAX_OBD_NAME 128 /* If this changes, a NEW ioctl must be added */
+
+/* Hopefully O_LOV_DELAY_CREATE does not conflict with standard O_xxx flags.
+ * Previously it was defined as 0100000000 and conflicts with FMODE_NONOTIFY
+ * which was added since kernel 2.6.36, so we redefine it as 020000000.
+ * To be compatible with old version's statically linked binary, finally we
+ * define it as (020000000 | 0100000000).
+ * */
+#define O_LOV_DELAY_CREATE      0120000000
+
+#define LL_FILE_IGNORE_LOCK     0x00000001
+#define LL_FILE_GROUP_LOCKED    0x00000002
+#define LL_FILE_READAHEA	0x00000004
+#define LL_FILE_LOCKED_DIRECTIO 0x00000008 /* client-side locks with dio */
+#define LL_FILE_LOCKLESS_IO     0x00000010 /* server-side locks with cio */
+#define LL_FILE_RMTACL	  0x00000020
+
+#define LOV_USER_MAGIC_V1 0x0BD10BD0
+#define LOV_USER_MAGIC    LOV_USER_MAGIC_V1
+#define LOV_USER_MAGIC_JOIN_V1 0x0BD20BD0
+#define LOV_USER_MAGIC_V3 0x0BD30BD0
+
+#define LMV_MAGIC_V1      0x0CD10CD0    /*normal stripe lmv magic */
+#define LMV_USER_MAGIC    0x0CD20CD0    /*default lmv magic*/
+
+#define LOV_PATTERN_RAID0 0x001
+#define LOV_PATTERN_RAID1 0x002
+#define LOV_PATTERN_FIRST 0x100
+
+#define LOV_MAXPOOLNAME 16
+#define LOV_POOLNAMEF "%.16s"
+
+#define LOV_MIN_STRIPE_BITS 16   /* maximum PAGE_SIZE (ia64), power of 2 */
+#define LOV_MIN_STRIPE_SIZE (1 << LOV_MIN_STRIPE_BITS)
+#define LOV_MAX_STRIPE_COUNT_OLD 160
+/* This calculation is crafted so that input of 4096 will result in 160
+ * which in turn is equal to old maximal stripe count.
+ * XXX: In fact this is too simplified for now; it also needs the
+ * ea_type argument to clearly know how much space each stripe consumes.
+ *
+ * The limit of 12 pages is somewhat arbitrary, but is a reasonably large
+ * allocation that is sufficient for the current generation of systems.
+ *
+ * (max buffer size - lov+rpc header) / sizeof(struct lov_ost_data_v1) */
+#define LOV_MAX_STRIPE_COUNT 2000  /* ((12 * 4096 - 256) / 24) */
+#define LOV_ALL_STRIPES       0xffff /* only valid for directories */
+#define LOV_V1_INSANE_STRIPE_COUNT 65532 /* maximum stripe count bz13933 */
+
+#define lov_user_ost_data lov_user_ost_data_v1
+struct lov_user_ost_data_v1 {     /* per-stripe data structure */
+	struct ost_id l_ost_oi;	  /* OST object ID */
+	__u32 l_ost_gen;	  /* generation of this OST index */
+	__u32 l_ost_idx;	  /* OST index in LOV */
+} __attribute__((packed));
+
+#define lov_user_md lov_user_md_v1
+struct lov_user_md_v1 {	   /* LOV EA user data (host-endian) */
+	__u32 lmm_magic;	  /* magic number = LOV_USER_MAGIC_V1 */
+	__u32 lmm_pattern;	/* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */
+	struct ost_id lmm_oi;	  /* LOV object ID */
+	__u32 lmm_stripe_size;    /* size of stripe in bytes */
+	__u16 lmm_stripe_count;   /* num stripes in use for this object */
+	union {
+		__u16 lmm_stripe_offset;  /* starting stripe offset in
+					   * lmm_objects, use when writing */
+		__u16 lmm_layout_gen;     /* layout generation number
+					   * used when reading */
+	};
+	struct lov_user_ost_data_v1 lmm_objects[0]; /* per-stripe data */
+} __attribute__((packed,  __may_alias__));
+
+struct lov_user_md_v3 {	   /* LOV EA user data (host-endian) */
+	__u32 lmm_magic;	  /* magic number = LOV_USER_MAGIC_V3 */
+	__u32 lmm_pattern;	/* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */
+	struct ost_id lmm_oi;	  /* LOV object ID */
+	__u32 lmm_stripe_size;    /* size of stripe in bytes */
+	__u16 lmm_stripe_count;   /* num stripes in use for this object */
+	union {
+		__u16 lmm_stripe_offset;  /* starting stripe offset in
+					   * lmm_objects, use when writing */
+		__u16 lmm_layout_gen;     /* layout generation number
+					   * used when reading */
+	};
+	char  lmm_pool_name[LOV_MAXPOOLNAME]; /* pool name */
+	struct lov_user_ost_data_v1 lmm_objects[0]; /* per-stripe data */
+} __attribute__((packed));
+
+/* Compile with -D_LARGEFILE64_SOURCE or -D_GNU_SOURCE (or #define) to
+ * use this.  It is unsafe to #define those values in this header as it
+ * is possible the application has already #included <sys/stat.h>. */
+#ifdef HAVE_LOV_USER_MDS_DATA
+#define lov_user_mds_data lov_user_mds_data_v1
+struct lov_user_mds_data_v1 {
+	lstat_t lmd_st;		 /* MDS stat struct */
+	struct lov_user_md_v1 lmd_lmm;  /* LOV EA V1 user data */
+} __attribute__((packed));
+
+struct lov_user_mds_data_v3 {
+	lstat_t lmd_st;		 /* MDS stat struct */
+	struct lov_user_md_v3 lmd_lmm;  /* LOV EA V3 user data */
+} __attribute__((packed));
+#endif
+
+/* keep this to be the same size as lov_user_ost_data_v1 */
+struct lmv_user_mds_data {
+	struct lu_fid	lum_fid;
+	__u32		lum_padding;
+	__u32		lum_mds;
+};
+
+/* lum_type */
+enum {
+	LMV_STRIPE_TYPE = 0,
+	LMV_DEFAULT_TYPE = 1,
+};
+
+#define lmv_user_md lmv_user_md_v1
+struct lmv_user_md_v1 {
+	__u32	lum_magic;	 /* must be the first field */
+	__u32	lum_stripe_count;  /* dirstripe count */
+	__u32	lum_stripe_offset; /* MDT idx for default dirstripe */
+	__u32	lum_hash_type;     /* Dir stripe policy */
+	__u32	lum_type;	  /* LMV type: default or normal */
+	__u32	lum_padding1;
+	__u32	lum_padding2;
+	__u32	lum_padding3;
+	char	lum_pool_name[LOV_MAXPOOLNAME];
+	struct	lmv_user_mds_data  lum_objects[0];
+};
+
+static inline int lmv_user_md_size(int stripes, int lmm_magic)
+{
+	return sizeof(struct lmv_user_md) +
+		      stripes * sizeof(struct lmv_user_mds_data);
+}
+
+extern void lustre_swab_lmv_user_md(struct lmv_user_md *lum);
+
+struct ll_recreate_obj {
+	__u64 lrc_id;
+	__u32 lrc_ost_idx;
+};
+
+struct ll_fid {
+	__u64 id;	 /* holds object id */
+	__u32 generation; /* holds object generation */
+	__u32 f_type;     /* holds object type or stripe idx when passing it to
+			   * OST for saving into EA. */
+};
+
+#define UUID_MAX	40
+struct obd_uuid {
+	char uuid[UUID_MAX];
+};
+
+static inline int obd_uuid_equals(const struct obd_uuid *u1,
+				  const struct obd_uuid *u2)
+{
+	return strcmp((char *)u1->uuid, (char *)u2->uuid) == 0;
+}
+
+static inline int obd_uuid_empty(struct obd_uuid *uuid)
+{
+	return uuid->uuid[0] == '\0';
+}
+
+static inline void obd_str2uuid(struct obd_uuid *uuid, const char *tmp)
+{
+	strncpy((char *)uuid->uuid, tmp, sizeof(*uuid));
+	uuid->uuid[sizeof(*uuid) - 1] = '\0';
+}
+
+/* For printf's only, make sure uuid is terminated */
+static inline char *obd_uuid2str(struct obd_uuid *uuid)
+{
+	if (uuid->uuid[sizeof(*uuid) - 1] != '\0') {
+		/* Obviously not safe, but for printfs, no real harm done...
+		   we're always null-terminated, even in a race. */
+		static char temp[sizeof(*uuid)];
+		memcpy(temp, uuid->uuid, sizeof(*uuid) - 1);
+		temp[sizeof(*uuid) - 1] = '\0';
+		return temp;
+	}
+	return (char *)(uuid->uuid);
+}
+
+/* Extract fsname from uuid (or target name) of a target
+   e.g. (myfs-OST0007_UUID -> myfs)
+   see also deuuidify. */
+static inline void obd_uuid2fsname(char *buf, char *uuid, int buflen)
+{
+	char *p;
+
+	strncpy(buf, uuid, buflen - 1);
+	buf[buflen - 1] = '\0';
+	p = strrchr(buf, '-');
+	if (p)
+	   *p = '\0';
+}
+
+/* printf display format
+   e.g. printf("file FID is "DFID"\n", PFID(fid)); */
+#define DFID_NOBRACE LPX64":0x%x:0x%x"
+#define DFID "["DFID_NOBRACE"]"
+#define PFID(fid)     \
+	(fid)->f_seq, \
+	(fid)->f_oid, \
+	(fid)->f_ver
+
+/* scanf input parse format -- strip '[' first.
+   e.g. sscanf(fidstr, SFID, RFID(&fid)); */
+/* #define SFID "0x"LPX64i":0x"LPSZX":0x"LPSZX""
+liblustreapi.c:2893: warning: format '%lx' expects type 'long unsigned int *', but argument 4 has type 'unsigned int *'
+liblustreapi.c:2893: warning: format '%lx' expects type 'long unsigned int *', but argument 5 has type 'unsigned int *'
+*/
+#define SFID "0x"LPX64i":0x%x:0x%x"
+#define RFID(fid)     \
+	&((fid)->f_seq), \
+	&((fid)->f_oid), \
+	&((fid)->f_ver)
+
+
+/********* Quotas **********/
+
+/* these must be explicitly translated into linux Q_* in ll_dir_ioctl */
+#define LUSTRE_Q_QUOTAON    0x800002     /* turn quotas on */
+#define LUSTRE_Q_QUOTAOFF   0x800003     /* turn quotas off */
+#define LUSTRE_Q_GETINFO    0x800005     /* get information about quota files */
+#define LUSTRE_Q_SETINFO    0x800006     /* set information about quota files */
+#define LUSTRE_Q_GETQUOTA   0x800007     /* get user quota structure */
+#define LUSTRE_Q_SETQUOTA   0x800008     /* set user quota structure */
+/* lustre-specific control commands */
+#define LUSTRE_Q_INVALIDATE  0x80000b     /* invalidate quota data */
+#define LUSTRE_Q_FINVALIDATE 0x80000c     /* invalidate filter quota data */
+
+#define UGQUOTA 2       /* set both USRQUOTA and GRPQUOTA */
+
+struct if_quotacheck {
+	char		    obd_type[16];
+	struct obd_uuid	 obd_uuid;
+};
+
+#define IDENTITY_DOWNCALL_MAGIC 0x6d6dd629
+
+/* permission */
+#define N_PERMS_MAX      64
+
+struct perm_downcall_data {
+	__u64 pdd_nid;
+	__u32 pdd_perm;
+	__u32 pdd_padding;
+};
+
+struct identity_downcall_data {
+	__u32			    idd_magic;
+	__u32			    idd_err;
+	__u32			    idd_uid;
+	__u32			    idd_gid;
+	__u32			    idd_nperms;
+	__u32			    idd_ngroups;
+	struct perm_downcall_data idd_perms[N_PERMS_MAX];
+	__u32			    idd_groups[0];
+};
+
+/* for non-mapped uid/gid */
+#define NOBODY_UID      99
+#define NOBODY_GID      99
+
+#define INVALID_ID      (-1)
+
+enum {
+	RMT_LSETFACL    = 1,
+	RMT_LGETFACL    = 2,
+	RMT_RSETFACL    = 3,
+	RMT_RGETFACL    = 4
+};
+
+#ifdef NEED_QUOTA_DEFS
+#ifndef QIF_BLIMITS
+#define QIF_BLIMITS     1
+#define QIF_SPACE       2
+#define QIF_ILIMITS     4
+#define QIF_INODES      8
+#define QIF_BTIME       16
+#define QIF_ITIME       32
+#define QIF_LIMITS      (QIF_BLIMITS | QIF_ILIMITS)
+#define QIF_USAGE       (QIF_SPACE | QIF_INODES)
+#define QIF_TIMES       (QIF_BTIME | QIF_ITIME)
+#define QIF_ALL	 (QIF_LIMITS | QIF_USAGE | QIF_TIMES)
+#endif /* QIF_BLIMITS */
+
+#endif /* NEED_QUOTA_DEFS */
+
+/* lustre volatile file support
+ * file name header: ".^L^S^T^R:VOLATILE"
+ */
+#define LUSTRE_VOLATILE_HDR	".\x0c\x13\x14\x12:VOLATILE"
+#define LUSTRE_VOLATILE_HDR_LEN	14
+/* hdr + MDT index */
+#define LUSTRE_VOLATILE_IDX	LUSTRE_VOLATILE_HDR":%.4X:"
+
+typedef enum lustre_quota_version {
+	LUSTRE_QUOTA_V2 = 1
+} lustre_quota_version_t;
+
+/* XXX: same as if_dqinfo struct in kernel */
+struct obd_dqinfo {
+	__u64 dqi_bgrace;
+	__u64 dqi_igrace;
+	__u32 dqi_flags;
+	__u32 dqi_valid;
+};
+
+/* XXX: same as if_dqblk struct in kernel, plus one padding */
+struct obd_dqblk {
+	__u64 dqb_bhardlimit;
+	__u64 dqb_bsoftlimit;
+	__u64 dqb_curspace;
+	__u64 dqb_ihardlimit;
+	__u64 dqb_isoftlimit;
+	__u64 dqb_curinodes;
+	__u64 dqb_btime;
+	__u64 dqb_itime;
+	__u32 dqb_valid;
+	__u32 dqb_padding;
+};
+
+enum {
+	QC_GENERAL      = 0,
+	QC_MDTIDX       = 1,
+	QC_OSTIDX       = 2,
+	QC_UUID	 = 3
+};
+
+struct if_quotactl {
+	__u32		   qc_cmd;
+	__u32		   qc_type;
+	__u32		   qc_id;
+	__u32		   qc_stat;
+	__u32		   qc_valid;
+	__u32		   qc_idx;
+	struct obd_dqinfo       qc_dqinfo;
+	struct obd_dqblk	qc_dqblk;
+	char		    obd_type[16];
+	struct obd_uuid	 obd_uuid;
+};
+
+/* swap layout flags */
+#define	SWAP_LAYOUTS_CHECK_DV1		(1 << 0)
+#define	SWAP_LAYOUTS_CHECK_DV2		(1 << 1)
+#define	SWAP_LAYOUTS_KEEP_MTIME		(1 << 2)
+#define	SWAP_LAYOUTS_KEEP_ATIME		(1 << 3)
+struct lustre_swap_layouts {
+	__u64	sl_flags;
+	__u32	sl_fd;
+	__u32	sl_gid;
+	__u64	sl_dv1;
+	__u64	sl_dv2;
+};
+
+
+/********* Changelogs **********/
+/** Changelog record types */
+enum changelog_rec_type {
+	CL_MARK     = 0,
+	CL_CREATE   = 1,  /* namespace */
+	CL_MKDIR    = 2,  /* namespace */
+	CL_HARDLINK = 3,  /* namespace */
+	CL_SOFTLINK = 4,  /* namespace */
+	CL_MKNOD    = 5,  /* namespace */
+	CL_UNLINK   = 6,  /* namespace */
+	CL_RMDIR    = 7,  /* namespace */
+	CL_RENAME   = 8,  /* namespace */
+	CL_EXT      = 9,  /* namespace extended record (2nd half of rename) */
+	CL_OPEN     = 10, /* not currently used */
+	CL_CLOSE    = 11, /* may be written to log only with mtime change */
+	CL_LAYOUT   = 12, /* file layout/striping modified */
+	CL_TRUNC    = 13,
+	CL_SETATTR  = 14,
+	CL_XATTR    = 15,
+	CL_HSM      = 16, /* HSM specific events, see flags */
+	CL_MTIME    = 17, /* Precedence: setattr > mtime > ctime > atime */
+	CL_CTIME    = 18,
+	CL_ATIME    = 19,
+	CL_LAST
+};
+
+static inline const char *changelog_type2str(int type) {
+	static const char *changelog_str[] = {
+		"MARK",  "CREAT", "MKDIR", "HLINK", "SLINK", "MKNOD", "UNLNK",
+		"RMDIR", "RENME", "RNMTO", "OPEN",  "CLOSE", "LYOUT", "TRUNC",
+		"SATTR", "XATTR", "HSM",   "MTIME", "CTIME", "ATIME",
+	};
+
+	if (type >= 0 && type < CL_LAST)
+		return changelog_str[type];
+	return NULL;
+}
+
+/* per-record flags */
+#define CLF_VERSION     0x1000
+#define CLF_EXT_VERSION 0x2000
+#define CLF_FLAGSHIFT   12
+#define CLF_FLAGMASK    ((1U << CLF_FLAGSHIFT) - 1)
+#define CLF_VERMASK     (~CLF_FLAGMASK)
+/* Anything under the flagmask may be per-type (if desired) */
+/* Flags for unlink */
+#define CLF_UNLINK_LAST       0x0001 /* Unlink of last hardlink */
+#define CLF_UNLINK_HSM_EXISTS 0x0002 /* File has something in HSM */
+				     /* HSM cleaning needed */
+/* Flags for rename */
+#define CLF_RENAME_LAST       0x0001 /* rename unlink last hardlink of target */
+
+/* Flags for HSM */
+/* 12b used (from high weight to low weight):
+ * 2b for flags
+ * 3b for event
+ * 7b for error code
+ */
+#define CLF_HSM_ERR_L	0 /* HSM return code, 7 bits */
+#define CLF_HSM_ERR_H	6
+#define CLF_HSM_EVENT_L      7 /* HSM event, 3 bits, see enum hsm_event */
+#define CLF_HSM_EVENT_H      9
+#define CLF_HSM_FLAG_L      10 /* HSM flags, 2 bits, 1 used, 1 spare */
+#define CLF_HSM_FLAG_H      11
+#define CLF_HSM_SPARE_L     12 /* 4 spare bits */
+#define CLF_HSM_SPARE_H     15
+#define CLF_HSM_LAST	15
+
+/* Remove bits higher than _h, then extract the value
+ * between _h and _l by shifting lower weight to bit 0. */
+#define CLF_GET_BITS(_b, _h, _l) ((((_b) << (CLF_HSM_LAST - (_h))) & 0xFFFF) \
+				   >> (CLF_HSM_LAST - (_h) + (_l)))
+
+#define CLF_HSM_SUCCESS      0x00
+#define CLF_HSM_MAXERROR     0x7E
+#define CLF_HSM_ERROVERFLOW  0x7F
+
+#define CLF_HSM_DIRTY	1 /* file is dirty after HSM request end */
+
+/* 3 bits field => 8 values allowed */
+enum hsm_event {
+	HE_ARCHIVE      = 0,
+	HE_RESTORE      = 1,
+	HE_CANCEL       = 2,
+	HE_RELEASE      = 3,
+	HE_REMOVE       = 4,
+	HE_STATE	= 5,
+	HE_SPARE1       = 6,
+	HE_SPARE2       = 7,
+};
+
+static inline enum hsm_event hsm_get_cl_event(__u16 flags)
+{
+	return CLF_GET_BITS(flags, CLF_HSM_EVENT_H, CLF_HSM_EVENT_L);
+}
+
+static inline void hsm_set_cl_event(int *flags, enum hsm_event he)
+{
+	*flags |= (he << CLF_HSM_EVENT_L);
+}
+
+static inline __u16 hsm_get_cl_flags(int flags)
+{
+	return CLF_GET_BITS(flags, CLF_HSM_FLAG_H, CLF_HSM_FLAG_L);
+}
+
+static inline void hsm_set_cl_flags(int *flags, int bits)
+{
+	*flags |= (bits << CLF_HSM_FLAG_L);
+}
+
+static inline int hsm_get_cl_error(int flags)
+{
+	return CLF_GET_BITS(flags, CLF_HSM_ERR_H, CLF_HSM_ERR_L);
+}
+
+static inline void hsm_set_cl_error(int *flags, int error)
+{
+	*flags |= (error << CLF_HSM_ERR_L);
+}
+
+#define CR_MAXSIZE cfs_size_round(2*NAME_MAX + 1 + sizeof(struct changelog_rec))
+
+struct changelog_rec {
+	__u16		 cr_namelen;
+	__u16		 cr_flags; /**< (flags&CLF_FLAGMASK)|CLF_VERSION */
+	__u32		 cr_type;  /**< \a changelog_rec_type */
+	__u64		 cr_index; /**< changelog record number */
+	__u64		 cr_prev;  /**< last index for this target fid */
+	__u64		 cr_time;
+	union {
+		lustre_fid    cr_tfid;	/**< target fid */
+		__u32	 cr_markerflags; /**< CL_MARK flags */
+	};
+	lustre_fid	    cr_pfid;	/**< parent fid */
+	char		  cr_name[0];     /**< last element */
+} __attribute__((packed));
+
+/* changelog_ext_rec is 2*sizeof(lu_fid) bigger than changelog_rec, to save
+ * space, only rename uses changelog_ext_rec, while others use changelog_rec to
+ * store records.
+ */
+struct changelog_ext_rec {
+	__u16			cr_namelen;
+	__u16			cr_flags; /**< (flags & CLF_FLAGMASK) |
+						CLF_EXT_VERSION */
+	__u32			cr_type;  /**< \a changelog_rec_type */
+	__u64			cr_index; /**< changelog record number */
+	__u64			cr_prev;  /**< last index for this target fid */
+	__u64			cr_time;
+	union {
+		lustre_fid	cr_tfid;	/**< target fid */
+		__u32		cr_markerflags; /**< CL_MARK flags */
+	};
+	lustre_fid		cr_pfid;	/**< target parent fid */
+	lustre_fid		cr_sfid;	/**< source fid, or zero */
+	lustre_fid		cr_spfid;       /**< source parent fid, or zero */
+	char			cr_name[0];     /**< last element */
+} __attribute__((packed));
+
+#define CHANGELOG_REC_EXTENDED(rec) \
+	(((rec)->cr_flags & CLF_VERMASK) == CLF_EXT_VERSION)
+
+static inline int changelog_rec_size(struct changelog_rec *rec)
+{
+	return CHANGELOG_REC_EXTENDED(rec) ? sizeof(struct changelog_ext_rec):
+					     sizeof(*rec);
+}
+
+static inline char *changelog_rec_name(struct changelog_rec *rec)
+{
+	return CHANGELOG_REC_EXTENDED(rec) ?
+		((struct changelog_ext_rec *)rec)->cr_name: rec->cr_name;
+}
+
+static inline int changelog_rec_snamelen(struct changelog_ext_rec *rec)
+{
+	return rec->cr_namelen - strlen(rec->cr_name) - 1;
+}
+
+static inline char *changelog_rec_sname(struct changelog_ext_rec *rec)
+{
+	return rec->cr_name + strlen(rec->cr_name) + 1;
+}
+
+struct ioc_changelog {
+	__u64 icc_recno;
+	__u32 icc_mdtindex;
+	__u32 icc_id;
+	__u32 icc_flags;
+};
+
+enum changelog_message_type {
+	CL_RECORD = 10, /* message is a changelog_rec */
+	CL_EOF    = 11, /* at end of current changelog */
+};
+
+/********* Misc **********/
+
+struct ioc_data_version {
+	__u64 idv_version;
+	__u64 idv_flags;     /* See LL_DV_xxx */
+};
+#define LL_DV_NOFLUSH 0x01   /* Do not take READ EXTENT LOCK before sampling
+				version. Dirty caches are left unchanged. */
+
+#ifndef offsetof
+# define offsetof(typ,memb)     ((unsigned long)((char *)&(((typ *)0)->memb)))
+#endif
+
+#define dot_lustre_name ".lustre"
+
+
+/********* HSM **********/
+
+/** HSM per-file state
+ * See HSM_FLAGS below.
+ */
+enum hsm_states {
+	HS_EXISTS	= 0x00000001,
+	HS_DIRTY	= 0x00000002,
+	HS_RELEASED	= 0x00000004,
+	HS_ARCHIVED	= 0x00000008,
+	HS_NORELEASE	= 0x00000010,
+	HS_NOARCHIVE	= 0x00000020,
+	HS_LOST		= 0x00000040,
+};
+
+/* HSM user-settable flags. */
+#define HSM_USER_MASK   (HS_NORELEASE | HS_NOARCHIVE | HS_DIRTY)
+
+/* Other HSM flags. */
+#define HSM_STATUS_MASK (HS_EXISTS | HS_LOST | HS_RELEASED | HS_ARCHIVED)
+
+/*
+ * All HSM-related possible flags that could be applied to a file.
+ * This should be kept in sync with hsm_states.
+ */
+#define HSM_FLAGS_MASK  (HSM_USER_MASK | HSM_STATUS_MASK)
+
+/**
+ * HSM request progress state
+ */
+enum hsm_progress_states {
+	HPS_WAITING	= 1,
+	HPS_RUNNING	= 2,
+	HPS_DONE	= 3,
+};
+#define HPS_NONE	0
+
+static inline char *hsm_progress_state2name(enum hsm_progress_states s)
+{
+	switch  (s) {
+	case HPS_WAITING:	return "waiting";
+	case HPS_RUNNING:	return "running";
+	case HPS_DONE:		return "done";
+	default:		return "unknown";
+	}
+}
+
+struct hsm_extent {
+	__u64 offset;
+	__u64 length;
+} __attribute__((packed));
+
+/**
+ * Current HSM states of a Lustre file.
+ *
+ * This structure is mainly meant to be sent to user space. It describes the
+ * current HSM flags and in-progress action.
+ */
+struct hsm_user_state {
+	/** Current HSM states, from enum hsm_states. */
+	__u32			hus_states;
+	__u32			hus_archive_id;
+	/**  The current undergoing action, if there is one */
+	__u32			hus_in_progress_state;
+	__u32			hus_in_progress_action;
+	struct hsm_extent	hus_in_progress_location;
+	char			hus_extended_info[];
+};
+
+struct hsm_state_set_ioc {
+	struct lu_fid	hssi_fid;
+	__u64		hssi_setmask;
+	__u64		hssi_clearmask;
+};
+
+/*
+ * This structure describes the current in-progress action for a file.
+ * It is returned to user space and sent over the wire.
+ */
+struct hsm_current_action {
+	/**  The current undergoing action, if there is one */
+	/* state is one of hsm_progress_states */
+	__u32			hca_state;
+	/* action is one of hsm_user_action */
+	__u32			hca_action;
+	struct hsm_extent	hca_location;
+};
+
+/***** HSM user requests ******/
+/* User-generated (lfs/ioctl) request types.
+ * The values are not contiguous: HUA_NONE is 1, the other actions are 10-14. */
+enum hsm_user_action {
+	HUA_NONE    =  1, /* no action (noop) */
+	HUA_ARCHIVE = 10, /* copy to hsm */
+	HUA_RESTORE = 11, /* prestage */
+	HUA_RELEASE = 12, /* drop ost objects */
+	HUA_REMOVE  = 13, /* remove from archive */
+	HUA_CANCEL  = 14  /* cancel a request */
+};
+
+static inline char *hsm_user_action2name(enum hsm_user_action  a)
+{
+	/*
+	 * Translate a user request type into its upper-case label; values
+	 * outside the enum yield "UNKNOWN".
+	 */
+	if (a == HUA_NONE)
+		return "NOOP";
+	if (a == HUA_ARCHIVE)
+		return "ARCHIVE";
+	if (a == HUA_RESTORE)
+		return "RESTORE";
+	if (a == HUA_RELEASE)
+		return "RELEASE";
+	if (a == HUA_REMOVE)
+		return "REMOVE";
+	if (a == HUA_CANCEL)
+		return "CANCEL";
+	return "UNKNOWN";
+}
+
+/*
+ * List of hr_flags (bit field)
+ */
+#define HSM_FORCE_ACTION 0x0001
+/* used by CT, cannot be set by user */
+#define HSM_GHOST_COPY   0x0002
+
+/**
+ * Contains all the fixed part of struct hsm_user_request.
+ *
+ * hur_data() and hur_len() use hr_itemcount and hr_data_len to locate and
+ * size the variable part that follows this header.
+ */
+struct hsm_request {
+	__u32 hr_action;	/* enum hsm_user_action */
+	__u32 hr_archive_id;	/* archive id, used only with HUA_ARCHIVE */
+	__u64 hr_flags;		/* request flags */
+	__u32 hr_itemcount;	/* item count in hur_user_item vector */
+	__u32 hr_data_len;	/* bytes added by hur_len() after the item vector */
+};
+
+struct hsm_user_item {	/* one entry of the hur_user_item vector; packed */
+       lustre_fid	hui_fid;
+       struct hsm_extent hui_extent;
+} __attribute__((packed));
+
+struct hsm_user_request {	/* fixed header followed by a variable-length tail */
+	struct hsm_request	hur_request;	/* the fixed part */
+	struct hsm_user_item	hur_user_item[0];	/* hr_itemcount entries */
+	/* extra data blob at end of struct (after all
+	 * hur_user_items), only use helpers to access it
+	 * (hur_data() returns its address, hur_len() includes its size)
+	 */
+} __attribute__((packed));
+
+/**
+ * Locate the extra data blob of an HSM user request: it begins right after
+ * the last of the hr_itemcount entries of hur_user_item[].
+ */
+static inline void *hur_data(struct hsm_user_request *hur)
+{
+	struct hsm_user_item *items = hur->hur_user_item;
+
+	return items + hur->hur_request.hr_itemcount;
+}
+
+/**
+ * Size in bytes of an hsm_user_request as currently filled in: the fixed
+ * header, hr_itemcount item slots and hr_data_len bytes of trailing data.
+ */
+static inline int hur_len(struct hsm_user_request *hur)
+{
+	return offsetof(struct hsm_user_request, hur_user_item) +
+		hur->hur_request.hr_itemcount * sizeof(hur->hur_user_item[0]) +
+		hur->hur_request.hr_data_len;
+}
+
+/****** HSM RPCs to copytool *****/
+/* Message types the copytool may receive */
+enum hsm_message_type {
+	HMT_ACTION_LIST = 100, /* message is a hsm_action_list */
+};
+
+/* Actions the copytool may be instructed to take for a given action_item.
+ * Carried in the hai_action field of struct hsm_action_item as a __u32. */
+enum hsm_copytool_action {
+	HSMA_NONE    = 10, /* no action */
+	HSMA_ARCHIVE = 20, /* arbitrary offset */
+	HSMA_RESTORE = 21,
+	HSMA_REMOVE  = 22,
+	HSMA_CANCEL  = 23
+};
+
+static inline char *hsm_copytool_action2name(enum hsm_copytool_action  a)
+{
+	/*
+	 * Translate a copytool action into its upper-case label; values
+	 * outside the enum yield "UNKNOWN".
+	 */
+	if (a == HSMA_NONE)
+		return "NOOP";
+	if (a == HSMA_ARCHIVE)
+		return "ARCHIVE";
+	if (a == HSMA_RESTORE)
+		return "RESTORE";
+	if (a == HSMA_REMOVE)
+		return "REMOVE";
+	if (a == HSMA_CANCEL)
+		return "CANCEL";
+	return "UNKNOWN";
+}
+
+/* Copytool item action description.
+ *
+ * hai_len counts the fixed fields plus the hai_data bytes:
+ * hai_dump_data_field() derives the data length as hai_len minus the size
+ * of this structure, and hai_next() steps over cfs_size_round(hai_len)
+ * bytes to reach the following item. */
+struct hsm_action_item {
+	__u32      hai_len;     /* valid size of this struct */
+	__u32      hai_action;  /* hsm_copytool_action, but use known size */
+	lustre_fid hai_fid;     /* Lustre FID to operated on */
+	lustre_fid hai_dfid;    /* fid used for data access */
+	struct hsm_extent hai_extent;  /* byte range to operate on */
+	__u64      hai_cookie;  /* action cookie from coordinator */
+	__u64      hai_gid;     /* grouplock id */
+	char       hai_data[0]; /* variable length */
+} __attribute__((packed));
+
+/*
+ * Helper which prints, in hexadecimal, the first bytes of the opaque
+ * hai_data field of an action item.
+ *
+ * Two hex digits are emitted per data byte, for as many whole bytes as fit
+ * in the buffer together with the terminating NUL, so nothing is ever
+ * written beyond \a len bytes.  If \a len is not positive the buffer is
+ * left untouched.
+ *
+ * \param hai [IN] record to print
+ * \param buffer [OUT] output buffer
+ * \param len [IN] max buffer len
+ * \retval buffer
+ */
+static inline char *hai_dump_data_field(struct hsm_action_item *hai,
+					char *buffer, int len)
+{
+	int i, sz, data_len;
+	char *ptr;
+
+	/* Not even room for the terminating NUL: do not touch the buffer. */
+	if (len <= 0)
+		return buffer;
+
+	ptr = buffer;
+	sz = len;
+	/* Number of opaque bytes that follow the fixed part of the item. */
+	data_len = hai->hai_len - sizeof(*hai);
+	/*
+	 * Each byte needs two characters plus room for the final NUL.
+	 * snprintf() returns the length the output would have had without
+	 * truncation, so advancing by its return value when fewer than three
+	 * bytes remain would move ptr past the end of the buffer.
+	 */
+	for (i = 0; (i < data_len) && (sz > 2); i++) {
+		snprintf(ptr, sz, "%.2X", (unsigned char)hai->hai_data[i]);
+		ptr += 2;
+		sz -= 2;
+	}
+	*ptr = '\0';
+	return buffer;
+}
+
+/* Copytool action list */
+#define HAL_VERSION 1
+#define HAL_MAXSIZE LNET_MTU /* bytes, used in userspace only */
+struct hsm_action_list {
+	__u32 hal_version;	/* presumably HAL_VERSION -- confirm */
+	__u32 hal_count;       /* number of hai's to follow */
+	__u64 hal_compound_id; /* returned by coordinator */
+	__u64 hal_flags;
+	__u32 hal_archive_id; /* which archive backend */
+	__u32 padding1;
+	char  hal_fsname[0];   /* null-terminated */
+	/* struct hsm_action_item[hal_count] follows, aligned on 8-byte
+	   boundaries. See hai_zero (first item) and hai_next (following
+	   items); hal_size adds up the whole list. */
+} __attribute__((packed));
+
+#ifndef HAVE_CFS_SIZE_ROUND
+/* Round val up to the next multiple of 8; aligned values are unchanged. */
+static inline int cfs_size_round (int val)
+{
+	const int align_mask = 0x7;
+
+	return (val + align_mask) & ~align_mask;
+}
+#define HAVE_CFS_SIZE_ROUND
+#endif
+
+/*
+ * Return pointer to first hai in action list.
+ *
+ * The items start after hal_fsname, which is stored with its terminating
+ * NUL and then padded to an 8-byte boundary.  The NUL has to be counted
+ * before rounding: otherwise a name whose length is a multiple of 8
+ * (including the empty name) would place the first item on top of the
+ * terminator.  For every other name length the result is unchanged.
+ */
+static inline struct hsm_action_item * hai_zero(struct hsm_action_list *hal)
+{
+	int name_len = cfs_size_round(strlen(hal->hal_fsname) + 1);
+
+	return (struct hsm_action_item *)(hal->hal_fsname + name_len);
+}
+/* Return pointer to next hai: items are hai_len bytes padded to 8 bytes */
+static inline struct hsm_action_item * hai_next(struct hsm_action_item *hai)
+{
+	return (struct hsm_action_item *)((char *)hai +
+					  cfs_size_round(hai->hai_len));
+}
+
+/*
+ * Return size of an hsm_action_list: the fixed header, the padded
+ * NUL-terminated fsname (same rule as hai_zero) and every padded item.
+ */
+static inline int hal_size(struct hsm_action_list *hal)
+{
+	__u32 i;	/* same type as hal_count, avoids a signed/unsigned compare */
+	int sz;
+	struct hsm_action_item *hai;
+
+	sz = sizeof(*hal) + cfs_size_round(strlen(hal->hal_fsname) + 1);
+	hai = hai_zero(hal);
+	for (i = 0; i < hal->hal_count; i++) {
+		sz += cfs_size_round(hai->hai_len);
+		hai = hai_next(hai);
+	}
+	return sz;
+}
+
+/* Copytool progress reporting */
+#define HP_FLAG_COMPLETED 0x01
+#define HP_FLAG_RETRY     0x02
+
+struct hsm_progress {
+	lustre_fid		hp_fid;
+	__u64			hp_cookie;
+	struct hsm_extent	hp_extent;
+	__u16			hp_flags;	/* presumably HP_FLAG_* bits -- confirm */
+	__u16			hp_errval; /* positive val */
+	__u32			padding;
+};
+
+/**
+ * Used by copytools during any HSM request they handle.
+ * This structure is initialized by llapi_hsm_copy_start(),
+ * which is a helper over the ioctl() interface.
+ * It stores Lustre data that is for internal use only.
+ */
+struct hsm_copy {
+	__u64			hc_data_version;
+	__u16			hc_flags;
+	__u16			hc_errval; /* positive val */
+	__u32			padding;
+	struct hsm_action_item	hc_hai;	/* last member: hsm_action_item ends in a variable-length array */
+};
+
+/** @} lustreuser */
+
+#endif /* _LUSTRE_USER_H */

diff --git a/drivers/staging/lustre/lustre/include/lustre/lustreapi.h b/drivers/staging/lustre/lustre/include/lustre/lustreapi.h
new file mode 100644
index 0000000..63da665
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre/lustreapi.h

@@ -0,0 +1,310 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Whamcloud, Inc.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LUSTREAPI_H_
+#define _LUSTREAPI_H_
+
+/** \defgroup llapi llapi
+ *
+ * @{
+ */
+
+#include <lustre/lustre_user.h>
+
+typedef void (*llapi_cb_t)(char *obd_type_name, char *obd_name, char *obd_uuid, void *args);
+
+/* lustreapi message severity level.
+ * Levels run from 0 (off) to 6 (debug); LLAPI_MSG_MAX is one past the
+ * highest level. */
+enum llapi_message_level {
+	LLAPI_MSG_OFF    = 0,
+	LLAPI_MSG_FATAL  = 1,
+	LLAPI_MSG_ERROR  = 2,
+	LLAPI_MSG_WARN   = 3,
+	LLAPI_MSG_NORMAL = 4,
+	LLAPI_MSG_INFO   = 5,
+	LLAPI_MSG_DEBUG  = 6,
+	LLAPI_MSG_MAX
+};
+
+/* the bottom three bits reserved for llapi_message_level */
+#define LLAPI_MSG_MASK	  0x00000007
+#define LLAPI_MSG_NO_ERRNO      0x00000010	/* OR-ed into the level by llapi_err_noerrno() */
+
+extern void llapi_msg_set_level(int level);
+extern void llapi_error(int level, int rc, char *fmt, ...);
+#define llapi_err_noerrno(level, fmt, a...)			     \
+	llapi_error((level) | LLAPI_MSG_NO_ERRNO, 0, fmt, ## a)
+extern void llapi_printf(int level, char *fmt, ...);
+extern int llapi_file_create(const char *name, unsigned long long stripe_size,
+			     int stripe_offset, int stripe_count,
+			     int stripe_pattern);
+extern int llapi_file_open(const char *name, int flags, int mode,
+			   unsigned long long stripe_size, int stripe_offset,
+			   int stripe_count, int stripe_pattern);
+extern int llapi_file_create_pool(const char *name,
+				  unsigned long long stripe_size,
+				  int stripe_offset, int stripe_count,
+				  int stripe_pattern, char *pool_name);
+extern int llapi_file_open_pool(const char *name, int flags, int mode,
+				unsigned long long stripe_size,
+				int stripe_offset, int stripe_count,
+				int stripe_pattern, char *pool_name);
+extern int llapi_poollist(const char *name);
+extern int llapi_get_poollist(const char *name, char **poollist, int list_size,
+			      char *buffer, int buffer_size);
+extern int llapi_get_poolmembers(const char *poolname, char **members,
+				 int list_size, char *buffer, int buffer_size);
+extern int llapi_file_get_stripe(const char *path, struct lov_user_md *lum);
+#define HAVE_LLAPI_FILE_LOOKUP
+extern int llapi_file_lookup(int dirfd, const char *name);
+
+#define VERBOSE_COUNT      0x1
+#define VERBOSE_SIZE       0x2
+#define VERBOSE_OFFSET     0x4
+#define VERBOSE_POOL       0x8
+#define VERBOSE_DETAIL     0x10
+#define VERBOSE_OBJID      0x20
+#define VERBOSE_GENERATION 0x40
+#define VERBOSE_MDTINDEX   0x80
+#define VERBOSE_ALL	(VERBOSE_COUNT | VERBOSE_SIZE | VERBOSE_OFFSET | \
+			    VERBOSE_POOL | VERBOSE_OBJID | VERBOSE_GENERATION)
+
+/*
+ * Parameter block taken by llapi_find(), llapi_getstripe() and
+ * llapi_ostlist().
+ */
+struct find_param {
+	unsigned int maxdepth;
+	time_t  atime;
+	time_t  mtime;
+	time_t  ctime;
+	int     asign;  /* cannot be bitfields due to using pointers to */
+	int     csign;  /* access them during argument parsing. */
+	int     msign;
+	int     type;
+	/*
+	 * These need to be signed values.  Whether a plain "int" bit-field is
+	 * signed is implementation-defined, so say "signed" explicitly.
+	 */
+	signed int   size_sign:2,
+			stripesize_sign:2,
+			stripecount_sign:2;
+	unsigned long long size;
+	unsigned long long size_units;
+	uid_t uid;
+	gid_t gid;
+
+	unsigned long   zeroend:1,
+			recursive:1,
+			exclude_pattern:1,
+			exclude_type:1,
+			exclude_obd:1,
+			exclude_mdt:1,
+			exclude_gid:1,
+			exclude_uid:1,
+			check_gid:1,	    /* group ID */
+			check_uid:1,	    /* user ID */
+			check_pool:1,	   /* LOV pool name */
+			check_size:1,	   /* file size */
+			exclude_pool:1,
+			exclude_size:1,
+			exclude_atime:1,
+			exclude_mtime:1,
+			exclude_ctime:1,
+			get_lmv:1,	      /* get MDT list from LMV */
+			raw:1,		  /* do not fill in defaults */
+			check_stripesize:1,     /* LOV stripe size */
+			exclude_stripesize:1,
+			check_stripecount:1,    /* LOV stripe count */
+			exclude_stripecount:1;
+
+	int     verbose;
+	int     quiet;
+
+	/* regular expression */
+	char   *pattern;
+
+	char   *print_fmt;
+
+	struct  obd_uuid       *obduuid;
+	int		     num_obds;
+	int		     num_alloc_obds;
+	int		     obdindex;
+	int		    *obdindexes;
+
+	struct  obd_uuid       *mdtuuid;
+	int		     num_mdts;
+	int		     num_alloc_mdts;
+	int		     mdtindex;
+	int		    *mdtindexes;
+	int		     file_mdtindex;
+
+	int	lumlen;
+	struct  lov_user_mds_data *lmd;
+
+	/* NUL-terminated pool name, at most LOV_MAXPOOLNAME characters */
+	char poolname[LOV_MAXPOOLNAME + 1];
+
+	int			fp_lmv_count;
+	struct lmv_user_md	*fp_lmv_md;
+
+	unsigned long long stripesize;
+	unsigned long long stripesize_units;
+	unsigned long long stripecount;
+
+	/* In-process parameters. */
+	unsigned long   got_uuids:1,
+			obds_printed:1,
+			have_fileinfo:1;	/* file attrs and LOV xattr */
+	unsigned int    depth;
+	dev_t	   st_dev;
+};
+
+extern int llapi_ostlist(char *path, struct find_param *param);
+extern int llapi_uuid_match(char *real_uuid, char *search_uuid);
+extern int llapi_getstripe(char *path, struct find_param *param);
+extern int llapi_find(char *path, struct find_param *param);
+
+extern int llapi_file_fget_mdtidx(int fd, int *mdtidx);
+extern int llapi_dir_create_pool(const char *name, int flags, int stripe_offset,
+				 int stripe_count, int stripe_pattern,
+				 char *poolname);
+int llapi_direntry_remove(char *dname);
+extern int llapi_obd_statfs(char *path, __u32 type, __u32 index,
+		     struct obd_statfs *stat_buf,
+		     struct obd_uuid *uuid_buf);
+extern int llapi_ping(char *obd_type, char *obd_name);
+extern int llapi_target_check(int num_types, char **obd_types, char *dir);
+extern int llapi_file_get_lov_uuid(const char *path, struct obd_uuid *lov_uuid);
+extern int llapi_file_get_lmv_uuid(const char *path, struct obd_uuid *lmv_uuid);
+extern int llapi_file_fget_lov_uuid(int fd, struct obd_uuid *lov_uuid);
+extern int llapi_lov_get_uuids(int fd, struct obd_uuid *uuidp, int *ost_count);
+extern int llapi_lmv_get_uuids(int fd, struct obd_uuid *uuidp, int *mdt_count);
+extern int llapi_is_lustre_mnttype(const char *type);
+extern int llapi_search_ost(char *fsname, char *poolname, char *ostname);
+extern int llapi_get_obd_count(char *mnt, int *count, int is_mdt);
+extern int parse_size(char *optarg, unsigned long long *size,
+		      unsigned long long *size_units, int bytes_spec);
+extern int llapi_search_mounts(const char *pathname, int index,
+			       char *mntdir, char *fsname);
+extern int llapi_search_fsname(const char *pathname, char *fsname);
+extern int llapi_getname(const char *path, char *buf, size_t size);
+
+extern void llapi_ping_target(char *obd_type, char *obd_name,
+			      char *obd_uuid, void *args);
+
+extern int llapi_search_rootpath(char *pathname, const char *fsname);
+
+struct mntent;
+#define HAVE_LLAPI_IS_LUSTRE_MNT
+extern int llapi_is_lustre_mnt(struct mntent *mnt);
+extern int llapi_quotachown(char *path, int flag);
+extern int llapi_quotacheck(char *mnt, int check_type);
+extern int llapi_poll_quotacheck(char *mnt, struct if_quotacheck *qchk);
+extern int llapi_quotactl(char *mnt, struct if_quotactl *qctl);
+extern int llapi_target_iterate(int type_num, char **obd_type, void *args,
+				llapi_cb_t cb);
+extern int llapi_get_connect_flags(const char *mnt, __u64 *flags);
+extern int llapi_lsetfacl(int argc, char *argv[]);
+extern int llapi_lgetfacl(int argc, char *argv[]);
+extern int llapi_rsetfacl(int argc, char *argv[]);
+extern int llapi_rgetfacl(int argc, char *argv[]);
+extern int llapi_cp(int argc, char *argv[]);
+extern int llapi_ls(int argc, char *argv[]);
+extern int llapi_fid2path(const char *device, const char *fidstr, char *path,
+			  int pathlen, long long *recno, int *linkno);
+extern int llapi_path2fid(const char *path, lustre_fid *fid);
+extern int llapi_fd2fid(const int fd, lustre_fid *fid);
+
+extern int llapi_get_version(char *buffer, int buffer_size, char **version);
+extern int llapi_get_data_version(int fd, __u64 *data_version, __u64 flags);
+extern int llapi_hsm_state_get(const char *path, struct hsm_user_state *hus);
+extern int llapi_hsm_state_set(const char *path, __u64 setmask, __u64 clearmask,
+			       __u32 archive_id);
+
+extern int llapi_create_volatile_idx(char *directory, int idx, int mode);
+/* Same as llapi_create_volatile_idx() with the index argument fixed to -1. */
+static inline int llapi_create_volatile(char *directory, int mode)
+{
+	const int idx = -1;
+
+	return llapi_create_volatile_idx(directory, idx, mode);
+}
+
+
+extern int llapi_fswap_layouts(const int fd1, const int fd2,
+			       __u64 dv1, __u64 dv2, __u64 flags);
+extern int llapi_swap_layouts(const char *path1, const char *path2,
+			      __u64 dv1, __u64 dv2, __u64 flags);
+
+/* Changelog interface.  priv is private state, managed internally
+   by these functions */
+#define CHANGELOG_FLAG_FOLLOW 0x01   /* Not yet implemented */
+#define CHANGELOG_FLAG_BLOCK  0x02   /* Blocking IO makes sense in case of
+   slow user parsing of the records, but it also prevents us from cleaning
+   up if the records are not consumed. */
+
+/* Records received are in extended format now, though most of them are still
+ * written on disk in changelog_rec format (to save space and time); they are
+ * converted to extended format in the lustre api to ease changelog analysis. */
+#define HAVE_CHANGELOG_EXTEND_REC 1
+
+extern int llapi_changelog_start(void **priv, int flags, const char *mdtname,
+				 long long startrec);
+extern int llapi_changelog_fini(void **priv);
+extern int llapi_changelog_recv(void *priv, struct changelog_ext_rec **rech);
+extern int llapi_changelog_free(struct changelog_ext_rec **rech);
+/* Allow records up to endrec to be destroyed; requires registered id. */
+extern int llapi_changelog_clear(const char *mdtname, const char *idstr,
+				 long long endrec);
+
+/* HSM copytool interface.
+ * priv is private state, managed internally by these functions
+ */
+struct hsm_copytool_private;
+extern int llapi_hsm_copytool_start(struct hsm_copytool_private **priv,
+				    char *fsname, int flags,
+				    int archive_count, int *archives);
+extern int llapi_hsm_copytool_fini(struct hsm_copytool_private **priv);
+extern int llapi_hsm_copytool_recv(struct hsm_copytool_private *priv,
+				   struct hsm_action_list **hal, int *msgsize);
+extern int llapi_hsm_copytool_free(struct hsm_action_list **hal);
+extern int llapi_hsm_copy_start(char *mnt, struct hsm_copy *copy,
+				const struct hsm_action_item *hai);
+extern int llapi_hsm_copy_end(char *mnt, struct hsm_copy *copy,
+			      const struct hsm_progress *hp);
+extern int llapi_hsm_progress(char *mnt, struct hsm_progress *hp);
+extern int llapi_hsm_import(const char *dst, int archive, struct stat *st,
+			    unsigned long long stripe_size, int stripe_offset,
+			    int stripe_count, int stripe_pattern,
+			    char *pool_name, lustre_fid *newfid);
+
+/* HSM user interface */
+extern struct hsm_user_request *llapi_hsm_user_request_alloc(int itemcount,
+							     int data_len);
+extern int llapi_hsm_request(char *mnt, struct hsm_user_request *request);
+extern int llapi_hsm_current_action(const char *path,
+				    struct hsm_current_action *hca);
+/** @} llapi */
+
+#endif

diff --git a/drivers/staging/lustre/lustre/include/lustre_acl.h b/drivers/staging/lustre/lustre/include/lustre_acl.h
new file mode 100644
index 0000000..5cfb87b
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_acl.h

@@ -0,0 +1,42 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre_acl.h
+ */
+
+#ifndef _LUSTRE_ACL_H
+#define _LUSTRE_ACL_H
+
+#include <linux/lustre_acl.h>
+
+#endif

diff --git a/drivers/staging/lustre/lustre/include/lustre_capa.h b/drivers/staging/lustre/lustre/include/lustre_capa.h
new file mode 100644
index 0000000..d77bffc
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_capa.h

@@ -0,0 +1,305 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre_capa.h
+ *
+ * Author: Lai Siyao <lsy@clusterfs.com>
+ */
+
+#ifndef __LINUX_CAPA_H_
+#define __LINUX_CAPA_H_
+
+/** \defgroup capa capa
+ *
+ * @{
+ */
+
+/*
+ * capability
+ */
+#include <linux/crypto.h>
+#include <lustre/lustre_idl.h>
+
+#define CAPA_TIMEOUT 1800		/* sec, == 30 min */
+#define CAPA_KEY_TIMEOUT (24 * 60 * 60)  /* sec, == 1 days */
+
+struct capa_hmac_alg {	/* one entry of a table built with DEF_CAPA_HMAC_ALG() */
+	const char     *ha_name;
+	int	     ha_len;
+	int	     ha_keylen;
+};
+
+#define DEF_CAPA_HMAC_ALG(name, type, len, keylen)      \
+[CAPA_HMAC_ALG_ ## type] = {			    \
+	.ha_name	 = name,			\
+	.ha_len	  = len,			 \
+	.ha_keylen       = keylen,		      \
+}	/* designated array initializer for slot CAPA_HMAC_ALG_<type> */
+
+struct client_capa {	/* obd_capa.u member initialised for CAPA_SITE_CLIENT */
+	struct inode	     *inode;
+	struct list_head		lli_list;     /* link to lli_oss_capas */
+};
+
+struct target_capa {	/* obd_capa.u member initialised for the other site */
+	struct hlist_node	  c_hash;       /* link to capa hash */
+};
+
+struct obd_capa {
+	struct list_head		c_list;       /* link to capa_list */
+
+	struct lustre_capa	c_capa;       /* capa */
+	atomic_t	      c_refc;       /* ref count */
+	cfs_time_t		c_expiry;     /* jiffies */
+	spinlock_t		c_lock;	/* protect capa content */
+	int			c_site;	/* CAPA_SITE_CLIENT or CAPA_SITE_SERVER, set by alloc_capa() */
+
+	union {
+		struct client_capa	cli;
+		struct target_capa	tgt;
+	} u;	/* alloc_capa() initialises cli when c_site is CAPA_SITE_CLIENT, tgt otherwise */
+};
+
+enum {
+	CAPA_SITE_CLIENT = 0,
+	CAPA_SITE_SERVER,
+	CAPA_SITE_MAX	/* count of sites; alloc_capa() rejects it as a site */
+};
+
+static inline struct lu_fid *capa_fid(struct lustre_capa *capa)
+{	/* the accessors below each expose one field of lustre_capa / lustre_capa_key */
+	return &capa->lc_fid;	/* address of the embedded FID, not a copy */
+}
+
+static inline __u64 capa_opc(struct lustre_capa *capa)
+{
+	return capa->lc_opc;
+}
+
+static inline __u64 capa_uid(struct lustre_capa *capa)
+{
+	return capa->lc_uid;
+}
+
+static inline __u64 capa_gid(struct lustre_capa *capa)
+{
+	return capa->lc_gid;
+}
+
+static inline __u32 capa_flags(struct lustre_capa *capa)
+{
+	return capa->lc_flags & 0xffffff;	/* low 24 bits of lc_flags */
+}
+
+static inline __u32 capa_alg(struct lustre_capa *capa)
+{
+	return (capa->lc_flags >> 24);	/* the bits of lc_flags above the 24 flag bits */
+}
+
+static inline __u32 capa_keyid(struct lustre_capa *capa)
+{
+	return capa->lc_keyid;
+}
+
+static inline __u64 capa_key_seq(struct lustre_capa_key *key)
+{
+	return key->lk_seq;
+}
+
+static inline __u32 capa_key_keyid(struct lustre_capa_key *key)
+{
+	return key->lk_keyid;
+}
+
+static inline __u32 capa_timeout(struct lustre_capa *capa)
+{
+	return capa->lc_timeout;
+}
+
+static inline __u32 capa_expiry(struct lustre_capa *capa)
+{
+	return capa->lc_expiry;
+}
+
+void _debug_capa(struct lustre_capa *, struct libcfs_debug_msg_data *,
+		 const char *fmt, ... );
+#define DEBUG_CAPA(level, capa, fmt, args...)				  \
+do {									   \
+	if (((level) & D_CANTMASK) != 0 ||				     \
+	    ((libcfs_debug & (level)) != 0 &&				  \
+	     (libcfs_subsystem_debug & DEBUG_SUBSYSTEM) != 0)) {	       \
+		LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, NULL);	      \
+		_debug_capa((capa), &msgdata, fmt, ##args);		    \
+	}								      \
+} while (0)	/* calls _debug_capa() only if level has a D_CANTMASK bit, or both the debug and subsystem masks enable it */
+
+#define DEBUG_CAPA_KEY(level, k, fmt, args...)				 \
+do {									   \
+CDEBUG(level, fmt " capability key@%p seq "LPU64" keyid %u\n",		 \
+       ##args, k, capa_key_seq(k), capa_key_keyid(k));			 \
+} while (0)	/* caller's fmt/args are printed first, then the key pointer, sequence and key id */
+
+typedef int (* renew_capa_cb_t)(struct obd_capa *, struct lustre_capa *);
+
+/* obdclass/capa.c */
+extern struct list_head capa_list[];
+extern spinlock_t capa_lock;
+extern int capa_count[];
+extern struct kmem_cache *capa_cachep;
+
+struct hlist_head *init_capa_hash(void);
+void cleanup_capa_hash(struct hlist_head *hash);
+
+struct obd_capa *capa_add(struct hlist_head *hash,
+			  struct lustre_capa *capa);
+struct obd_capa *capa_lookup(struct hlist_head *hash,
+			     struct lustre_capa *capa, int alive);
+
+int capa_hmac(__u8 *hmac, struct lustre_capa *capa, __u8 *key);
+int capa_encrypt_id(__u32 *d, __u32 *s, __u8 *key, int keylen);
+int capa_decrypt_id(__u32 *d, __u32 *s, __u8 *key, int keylen);
+void capa_cpy(void *dst, struct obd_capa *ocapa);
+/*
+ * Allocate an obd_capa from capa_cachep for the given site and return it
+ * holding one reference.  Yields ERR_PTR(-EINVAL) for a site other than
+ * CAPA_SITE_CLIENT / CAPA_SITE_SERVER and ERR_PTR(-ENOMEM) when the
+ * allocation fails.
+ */
+static inline struct obd_capa *alloc_capa(int site)
+{
+	struct obd_capa *ocapa;
+
+	if (unlikely(!(site == CAPA_SITE_CLIENT || site == CAPA_SITE_SERVER)))
+		return ERR_PTR(-EINVAL);
+
+	OBD_SLAB_ALLOC_PTR(ocapa, capa_cachep);
+	if (unlikely(ocapa == NULL))
+		return ERR_PTR(-ENOMEM);
+
+	ocapa->c_site = site;
+	atomic_set(&ocapa->c_refc, 1);
+	spin_lock_init(&ocapa->c_lock);
+	INIT_LIST_HEAD(&ocapa->c_list);
+
+	/* Initialise whichever member of the union matches the site. */
+	if (site == CAPA_SITE_CLIENT)
+		INIT_LIST_HEAD(&ocapa->u.cli.lli_list);
+	else
+		INIT_HLIST_NODE(&ocapa->u.tgt.c_hash);
+
+	return ocapa;
+}
+
+/* Take one more reference on ocapa and hand it back; NULL passes through. */
+static inline struct obd_capa *capa_get(struct obd_capa *ocapa)
+{
+	if (ocapa != NULL)
+		atomic_inc(&ocapa->c_refc);
+
+	return ocapa;
+}
+
+static inline void capa_put(struct obd_capa *ocapa)	/* drop one reference; frees on the last one */
+{
+	if (!ocapa)	/* NULL is accepted and ignored */
+		return;
+
+	if (atomic_read(&ocapa->c_refc) == 0) {	/* put with no reference left: log and LBUG() */
+		DEBUG_CAPA(D_ERROR, &ocapa->c_capa, "refc is 0 for");
+		LBUG();
+	}
+
+	if (atomic_dec_and_test(&ocapa->c_refc)) {	/* count reached zero */
+		LASSERT(list_empty(&ocapa->c_list));	/* must already be off c_list */
+		if (ocapa->c_site == CAPA_SITE_CLIENT) {
+			LASSERT(list_empty(&ocapa->u.cli.lli_list));
+		} else {
+			struct hlist_node *hnode;
+
+			hnode = &ocapa->u.tgt.c_hash;
+			LASSERT(!hnode->next && !hnode->pprev);	/* must already be unhashed */
+		}
+		OBD_SLAB_FREE(ocapa, capa_cachep, sizeof(*ocapa));	/* back to the slab alloc_capa() used */
+	}
+}
+
+/*
+ * Convert open(2) flags: the O_ACCMODE field is incremented by one unless
+ * that would wrap it to zero, and O_TRUNC additionally sets bit 1.
+ */
+static inline int open_flags_to_accmode(int flags)
+{
+	int mode = ((flags + 1) & O_ACCMODE) ? flags + 1 : flags;
+
+	return (mode & O_TRUNC) ? (mode | 2) : mode;
+}
+
+/* Pick the OSS opcode for an open mode: write if FMODE_WRITE is set, else read. */
+static inline __u64 capa_open_opc(int mode)
+{
+	if (mode & FMODE_WRITE)
+		return CAPA_OPC_OSS_WRITE;
+
+	return CAPA_OPC_OSS_READ;
+}
+
+static inline void set_capa_expiry(struct obd_capa *ocapa)	/* derive c_expiry from c_capa.lc_expiry */
+{
+	cfs_time_t expiry = cfs_time_sub((cfs_time_t)ocapa->c_capa.lc_expiry,
+					 cfs_time_current_sec());	/* lc_expiry minus the current time in seconds */
+	ocapa->c_expiry = cfs_time_add(cfs_time_current(),
+				       cfs_time_seconds(expiry));	/* presumably converts the remainder to jiffies -- confirm */
+}
+
+static inline int capa_is_expired_sec(struct lustre_capa *capa)
+{	/* non-zero when lc_expiry minus the current time in seconds is <= 0 */
+	return (capa->lc_expiry - cfs_time_current_sec() <= 0); /*
+	 * NOTE(review): capa_expiry() returns lc_expiry as __u32; if this
+	 * subtraction ends up being carried out in an unsigned type the
+	 * result can never be negative and "<= 0" degenerates to "== 0" --
+	 * confirm the type returned by cfs_time_current_sec().
+	 */
+}
+
+static inline int capa_is_expired(struct obd_capa *ocapa)
+{	/* compares c_expiry (set by set_capa_expiry()) with the current time */
+	return cfs_time_beforeq(ocapa->c_expiry, cfs_time_current());	/* presumably true when c_expiry <= now -- confirm */
+}
+
+/* Non-zero when every bit set in opc is also set in the capability's opc. */
+static inline int capa_opc_supported(struct lustre_capa *capa, __u64 opc)
+{
+	__u64 granted = capa_opc(capa);
+
+	return (opc & ~granted) == 0;
+}
+
+struct filter_capa_key {
+	struct list_head	      k_list;
+	struct lustre_capa_key  k_key;
+};
+
+enum {
+	LC_ID_NONE      = 0,
+	LC_ID_PLAIN     = 1,
+	LC_ID_CONVERT   = 2
+};
+
+/*
+ * Sentinel capability pointer encoding -ENOENT.  The whole expansion is
+ * parenthesized so the cast stays bound to ERR_PTR() whatever operator
+ * surrounds the macro at the point of use.
+ */
+#define BYPASS_CAPA ((struct lustre_capa *)ERR_PTR(-ENOENT))
+
+/** @} capa */
+
+#endif /* __LINUX_CAPA_H_ */

diff --git a/drivers/staging/lustre/lustre/include/lustre_cfg.h b/drivers/staging/lustre/lustre/include/lustre_cfg.h
new file mode 100644
index 0000000..f12429f
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_cfg.h

@@ -0,0 +1,299 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LUSTRE_CFG_H
+#define _LUSTRE_CFG_H
+
+/** \defgroup cfg cfg
+ *
+ * @{
+ */
+
+/*
+ * 1cf6
+ * lcfG
+ */
+#define LUSTRE_CFG_VERSION 0x1cf60001
+#define LUSTRE_CFG_MAX_BUFCOUNT 8
+
+#define LCFG_HDR_SIZE(count) \
+    cfs_size_round(offsetof (struct lustre_cfg, lcfg_buflens[(count)]))
+
+/** If the LCFG_REQUIRED bit is set in a configuration command,
+ * then the client is required to understand this parameter
+ * in order to mount the filesystem. If it does not understand
+ * a REQUIRED command the client mount will fail. */
+#define LCFG_REQUIRED	 0x0001000
+
+/*
+ * Configuration command codes.
+ * NOTE(review): presumably the values carried in lustre_cfg::lcfg_command
+ * (lustre_cfg_new() stores its "cmd" argument there) -- confirm.
+ */
+enum lcfg_command_type {
+	LCFG_ATTACH	     = 0x00cf001, /**< create a new obd instance */
+	LCFG_DETACH	     = 0x00cf002, /**< destroy obd instance */
+	LCFG_SETUP	      = 0x00cf003, /**< call type-specific setup */
+	LCFG_CLEANUP	    = 0x00cf004, /**< call type-specific cleanup */
+	LCFG_ADD_UUID	   = 0x00cf005, /**< add a nid to a niduuid */
+	LCFG_DEL_UUID	   = 0x00cf006, /**< remove a nid from a niduuid */
+	LCFG_MOUNTOPT	   = 0x00cf007, /**< create a profile (mdc, osc) */
+	LCFG_DEL_MOUNTOPT       = 0x00cf008, /**< destroy a profile */
+	LCFG_SET_TIMEOUT	= 0x00cf009, /**< set obd_timeout */
+	LCFG_SET_UPCALL	 = 0x00cf00a, /**< deprecated */
+	LCFG_ADD_CONN	   = 0x00cf00b, /**< add a failover niduuid to an obd */
+	LCFG_DEL_CONN	   = 0x00cf00c, /**< remove a failover niduuid */
+	LCFG_LOV_ADD_OBD	= 0x00cf00d, /**< add an osc to a lov */
+	LCFG_LOV_DEL_OBD	= 0x00cf00e, /**< remove an osc from a lov */
+	LCFG_PARAM	      = 0x00cf00f, /**< set a proc parameter */
+	LCFG_MARKER	     = 0x00cf010, /**< metadata about next cfg rec */
+	LCFG_LOG_START	  = 0x00ce011, /**< mgc only, process a cfg log */
+	LCFG_LOG_END	    = 0x00ce012, /**< stop processing updates */
+	LCFG_LOV_ADD_INA	= 0x00ce013, /**< like LOV_ADD_OBD, inactive */
+	LCFG_ADD_MDC	    = 0x00cf014, /**< add an mdc to a lmv */
+	LCFG_DEL_MDC	    = 0x00cf015, /**< remove an mdc from a lmv */
+	LCFG_SPTLRPC_CONF       = 0x00ce016, /**< security */
+	LCFG_POOL_NEW	   = 0x00ce020, /**< create an ost pool name */
+	LCFG_POOL_ADD	   = 0x00ce021, /**< add an ost to a pool */
+	LCFG_POOL_REM	   = 0x00ce022, /**< remove an ost from a pool */
+	LCFG_POOL_DEL	   = 0x00ce023, /**< destroy an ost pool name */
+	LCFG_SET_LDLM_TIMEOUT   = 0x00ce030, /**< set ldlm_timeout */
+	LCFG_PRE_CLEANUP	= 0x00cf031, /**< call type-specific
+					      * pre-cleanup */
+};
+
+/*
+ * Unpacked view of a configuration record: up to LUSTRE_CFG_MAX_BUFCOUNT
+ * buffer pointers with their lengths.  Filled by lustre_cfg_bufs_set() or
+ * lustre_cfg_bufs_init() and consumed by lustre_cfg_new().
+ */
+struct lustre_cfg_bufs {
+	void    *lcfg_buf[LUSTRE_CFG_MAX_BUFCOUNT];	/* buffer pointers */
+	__u32    lcfg_buflen[LUSTRE_CFG_MAX_BUFCOUNT];	/* length of each */
+	__u32    lcfg_bufcount;	/* highest index in use, plus one */
+};
+
+/*
+ * Header of a packed configuration record.  lcfg_buflens[] holds
+ * lcfg_bufcount lengths; the buffers themselves start at
+ * LCFG_HDR_SIZE(lcfg_bufcount) and each occupies cfs_size_round() of its
+ * length (see lustre_cfg_buf()).
+ */
+struct lustre_cfg {
+	__u32 lcfg_version;	/* LUSTRE_CFG_VERSION, see sanity check */
+	__u32 lcfg_command;	/* set from "cmd" in lustre_cfg_new() */
+
+	__u32 lcfg_num;
+	__u32 lcfg_flags;
+	__u64 lcfg_nid;
+	__u32 lcfg_nal;		/* not used any more */
+
+	__u32 lcfg_bufcount;	/* number of entries in lcfg_buflens[] */
+	__u32 lcfg_buflens[0];	/* per-buffer lengths, data follows */
+};
+
+/*
+ * Record type tags.
+ * NOTE(review): where these tags are stored is not visible in this
+ * header -- confirm against the config-log code.
+ */
+enum cfg_record_type {
+	PORTALS_CFG_TYPE = 1,
+	LUSTRE_CFG_TYPE = 123,
+};
+
+/*
+ * Length of buffer \a idx in \a lcfg, or 0 when \a idx is not below
+ * lcfg_bufcount.  Both arguments are evaluated more than once, so they
+ * must be free of side effects.
+ */
+#define LUSTRE_CFG_BUFLEN(lcfg, idx)	    \
+	((lcfg)->lcfg_bufcount <= (idx)	 \
+	 ? 0				    \
+	 : (lcfg)->lcfg_buflens[(idx)])
+
+/**
+ * Record buffer \a buf of \a buflen bytes in slot \a index of \a bufs.
+ *
+ * The call is silently ignored when \a bufs is NULL or \a index is not
+ * below LUSTRE_CFG_MAX_BUFCOUNT.  lcfg_bufcount grows as needed so that
+ * it always covers the highest slot written.
+ */
+static inline void lustre_cfg_bufs_set(struct lustre_cfg_bufs *bufs,
+				       __u32		   index,
+				       void		   *buf,
+				       __u32		   buflen)
+{
+	if (index >= LUSTRE_CFG_MAX_BUFCOUNT || bufs == NULL)
+		return;
+
+	bufs->lcfg_buf[index]    = buf;
+	bufs->lcfg_buflen[index] = buflen;
+
+	/* Extend the used range when writing past the current end. */
+	if (index >= bufs->lcfg_bufcount)
+		bufs->lcfg_bufcount = index + 1;
+}
+
+/**
+ * Record the NUL-terminated string \a str in slot \a index of \a bufs.
+ *
+ * The stored length includes the terminating NUL; a NULL \a str is
+ * recorded with length 0.
+ */
+static inline void lustre_cfg_bufs_set_string(struct lustre_cfg_bufs *bufs,
+					      __u32 index,
+					      char *str)
+{
+	__u32 len = 0;
+
+	if (str != NULL)
+		len = strlen(str) + 1;
+
+	lustre_cfg_bufs_set(bufs, index, str, len);
+}
+
+/**
+ * Zero \a bufs and, when \a name is non-NULL, store it as string
+ * buffer 0.
+ */
+static inline void lustre_cfg_bufs_reset(struct lustre_cfg_bufs *bufs, char *name)
+{
+	memset(bufs, 0, sizeof(*bufs));
+
+	if (name == NULL)
+		return;
+
+	lustre_cfg_bufs_set_string(bufs, 0, name);
+}
+
+/**
+ * Locate buffer \a index inside the packed record \a lcfg.
+ *
+ * Buffers start after the size-rounded header and each one occupies
+ * cfs_size_round() of its recorded length.
+ *
+ * \retval pointer to the start of the buffer
+ * \retval NULL when \a index is not below lcfg_bufcount
+ */
+static inline void *lustre_cfg_buf(struct lustre_cfg *lcfg, int index)
+{
+	int offset;
+	int i = 0;
+
+	LASSERT(lcfg != NULL);
+	LASSERT(index >= 0);
+
+	if (index >= (int)lcfg->lcfg_bufcount)
+		return NULL;
+
+	/* Skip the header, then every buffer that precedes \a index. */
+	offset = LCFG_HDR_SIZE(lcfg->lcfg_bufcount);
+	while (i < index) {
+		offset += cfs_size_round(lcfg->lcfg_buflens[i]);
+		i++;
+	}
+
+	return (char *)lcfg + offset;
+}
+
+/**
+ * Fill \a bufs with the buffer pointers and lengths found in the packed
+ * record \a lcfg.
+ *
+ * \a bufs only has room for LUSTRE_CFG_MAX_BUFCOUNT entries, so a larger
+ * lcfg_bufcount is clamped to that limit instead of writing past the end
+ * of the lcfg_buf[] / lcfg_buflen[] arrays.
+ */
+static inline void lustre_cfg_bufs_init(struct lustre_cfg_bufs *bufs,
+					struct lustre_cfg *lcfg)
+{
+	__u32 count = lcfg->lcfg_bufcount;
+	__u32 i;
+
+	if (count > LUSTRE_CFG_MAX_BUFCOUNT)
+		count = LUSTRE_CFG_MAX_BUFCOUNT;
+
+	bufs->lcfg_bufcount = count;
+	for (i = 0; i < count; i++) {
+		bufs->lcfg_buflen[i] = lcfg->lcfg_buflens[i];
+		bufs->lcfg_buf[i] = lustre_cfg_buf(lcfg, i);
+	}
+}
+
+/**
+ * Return buffer \a index of \a lcfg as a NUL-terminated string.
+ *
+ * The buffer is modified in place if it is not already terminated: the
+ * terminator goes into the first padding byte when the rounded size
+ * leaves one, otherwise it overwrites the last data byte.  A warning is
+ * logged when a non-NUL byte is overwritten.
+ *
+ * \retval pointer to the string inside \a lcfg
+ * \retval NULL when \a index is out of range or the buffer is empty
+ */
+static inline char *lustre_cfg_string(struct lustre_cfg *lcfg, int index)
+{
+	char *s;
+
+	/*
+	 * Validate the index before touching lcfg_buflens[]: only the first
+	 * lcfg_bufcount entries of that array exist.
+	 */
+	if (index < 0 || (__u32)index >= lcfg->lcfg_bufcount)
+		return NULL;
+
+	if (lcfg->lcfg_buflens[index] == 0)
+		return NULL;
+
+	s = lustre_cfg_buf(lcfg, index);
+	if (s == NULL)
+		return NULL;
+
+	/*
+	 * make sure it's NULL terminated, even if this kills a char
+	 * of data.  Try to use the padding first though.
+	 */
+	if (s[lcfg->lcfg_buflens[index] - 1] != '\0') {
+		int last = min((int)lcfg->lcfg_buflens[index],
+			       cfs_size_round(lcfg->lcfg_buflens[index]) - 1);
+		char lost = s[last];
+		s[last] = '\0';
+		if (lost != '\0') {
+			CWARN("Truncated buf %d to '%s' (lost '%c'...)\n",
+			      index, s, lost);
+		}
+	}
+	return s;
+}
+
+/**
+ * Compute the packed size of a configuration record with \a bufcount
+ * buffers whose lengths are given in \a buflens.
+ *
+ * The result is the size-rounded header plus each buffer length rounded
+ * with cfs_size_round(), with the total rounded once more.
+ */
+static inline int lustre_cfg_len(__u32 bufcount, __u32 *buflens)
+{
+	__u32 idx = 0;
+	int total;
+	ENTRY;
+
+	total = LCFG_HDR_SIZE(bufcount);
+	while (idx < bufcount) {
+		total += cfs_size_round(buflens[idx]);
+		idx++;
+	}
+
+	RETURN(cfs_size_round(total));
+}
+
+
+#include <obd_support.h>
+
+/*
+ * Build a packed configuration record for command \a cmd from \a bufs.
+ *
+ * The record is allocated with OBD_ALLOC() at the size reported by
+ * lustre_cfg_len().  Only lcfg_version, lcfg_command, lcfg_bufcount and
+ * lcfg_buflens[] are assigned here.
+ *
+ * Returns the new record, or ERR_PTR(-ENOMEM) when allocation fails.
+ * lustre_cfg_free() recomputes the same size when releasing it.
+ */
+static inline struct lustre_cfg *lustre_cfg_new(int cmd,
+						struct lustre_cfg_bufs *bufs)
+{
+	struct lustre_cfg *lcfg;
+	char *ptr;
+	int i;
+
+	ENTRY;
+
+	OBD_ALLOC(lcfg, lustre_cfg_len(bufs->lcfg_bufcount,
+				       bufs->lcfg_buflen));
+	if (!lcfg)
+		RETURN(ERR_PTR(-ENOMEM));
+
+	lcfg->lcfg_version = LUSTRE_CFG_VERSION;
+	lcfg->lcfg_command = cmd;
+	lcfg->lcfg_bufcount = bufs->lcfg_bufcount;
+
+	/* Buffer data starts right after the size-rounded header. */
+	ptr = (char *)lcfg + LCFG_HDR_SIZE(lcfg->lcfg_bufcount);
+	for (i = 0; i < lcfg->lcfg_bufcount; i++) {
+		lcfg->lcfg_buflens[i] = bufs->lcfg_buflen[i];
+		/*
+		 * NOTE(review): LOGL() is defined elsewhere; presumably it
+		 * copies the buffer to ptr and advances ptr by the rounded
+		 * length -- confirm.
+		 */
+		LOGL((char *)bufs->lcfg_buf[i], bufs->lcfg_buflen[i], ptr);
+	}
+	RETURN(lcfg);
+}
+
+/**
+ * Release a record obtained from lustre_cfg_new().  The size handed to
+ * OBD_FREE() is recomputed from the record's own bufcount and buflens.
+ */
+static inline void lustre_cfg_free(struct lustre_cfg *lcfg)
+{
+	int size = lustre_cfg_len(lcfg->lcfg_bufcount, lcfg->lcfg_buflens);
+
+	OBD_FREE(lcfg, size);
+	EXIT;
+}
+
+/**
+ * Validate a packed configuration record held in \a buf of \a len bytes.
+ *
+ * \retval 0       the record header and buffer lengths fit within \a len
+ * \retval -EINVAL NULL buffer, short header, wrong version, bufcount not
+ *                 below LUSTRE_CFG_MAX_BUFCOUNT, or total length over \a len
+ */
+static inline int lustre_cfg_sanity_check(void *buf, int len)
+{
+	struct lustre_cfg *lcfg = buf;
+	ENTRY;
+
+	if (lcfg == NULL)
+		RETURN(-EINVAL);
+
+	/* The fixed part must be present before any field is read. */
+	if (len < LCFG_HDR_SIZE(0) ||
+	    lcfg->lcfg_version != LUSTRE_CFG_VERSION ||
+	    lcfg->lcfg_bufcount >= LUSTRE_CFG_MAX_BUFCOUNT)
+		RETURN(-EINVAL);
+
+	/*
+	 * The buflens array must fit first; only then is it safe to sum the
+	 * lengths and check that all buffer data lies inside \a len.
+	 */
+	if (len < LCFG_HDR_SIZE(lcfg->lcfg_bufcount) ||
+	    len < lustre_cfg_len(lcfg->lcfg_bufcount, lcfg->lcfg_buflens))
+		RETURN(-EINVAL);
+
+	RETURN(0);
+}
+
+#include <lustre/lustre_user.h>
+
+#ifndef INVALID_UID
+#define INVALID_UID     (-1)
+#endif
+
+/** @} cfg */
+
+#endif // _LUSTRE_CFG_H

diff --git a/drivers/staging/lustre/lustre/include/lustre_debug.h b/drivers/staging/lustre/lustre/include/lustre_debug.h
new file mode 100644
index 0000000..3d9e446
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_debug.h

@@ -0,0 +1,76 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LUSTRE_DEBUG_H
+#define _LUSTRE_DEBUG_H
+
+/** \defgroup debug debug
+ *
+ * @{
+ */
+
+#include <lustre_net.h>
+#include <obd.h>
+
+#include <linux/lustre_debug.h>
+
+/* Upper bound, in MB, used by the two sanity-check macros below. */
+#define ASSERT_MAX_SIZE_MB 60000ULL
+
+/*
+ * If page index \a index exceeds ASSERT_MAX_SIZE_MB worth of pages, log
+ * an error, set libcfs_debug to ~0UL and execute statement \a OP.
+ * \a index is parenthesized in the expansion but is still evaluated
+ * twice on the failure path, so it must have no side effects.
+ */
+#define ASSERT_PAGE_INDEX(index, OP)				    \
+do { if ((index) > ASSERT_MAX_SIZE_MB << (20 - PAGE_CACHE_SHIFT)) {     \
+	CERROR("bad page index %lu > %llu\n", (index),		    \
+	       ASSERT_MAX_SIZE_MB << (20 - PAGE_CACHE_SHIFT));	    \
+	libcfs_debug = ~0UL;					    \
+	OP;							     \
+}} while (0)
+
+/*
+ * Same check for a byte offset: \a offset above ASSERT_MAX_SIZE_MB << 20
+ * logs an error, sets libcfs_debug to ~0UL and executes \a OP.
+ * \a offset is evaluated twice on the failure path.
+ */
+#define ASSERT_FILE_OFFSET(offset, OP)				  \
+do { if ((offset) > ASSERT_MAX_SIZE_MB << 20) {			 \
+	CERROR("bad file offset %llu > %llu\n", (offset),	       \
+	       ASSERT_MAX_SIZE_MB << 20);			       \
+	libcfs_debug = ~0UL;					    \
+	OP;							     \
+}} while (0)
+
+/* lib/debug.c */
+void dump_lniobuf(struct niobuf_local *lnb);
+int dump_req(struct ptlrpc_request *req);
+void dump_lsm(int level, struct lov_stripe_md *lsm);
+int block_debug_setup(void *addr, int len, __u64 off, __u64 id);
+int block_debug_check(char *who, void *addr, int len, __u64 off, __u64 id);
+
+/** @} debug */
+
+#endif

diff --git a/drivers/staging/lustre/lustre/include/lustre_disk.h b/drivers/staging/lustre/lustre/include/lustre_disk.h
new file mode 100644
index 0000000..8db6086
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_disk.h

@@ -0,0 +1,543 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre_disk.h
+ *
+ * Lustre disk format definitions.
+ *
+ * Author: Nathan Rutman <nathan@clusterfs.com>
+ */
+
+#ifndef _LUSTRE_DISK_H
+#define _LUSTRE_DISK_H
+
+/** \defgroup disk disk
+ *
+ * @{
+ */
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/types.h>
+
+/****************** on-disk files *********************/
+
+#define MDT_LOGS_DIR      "LOGS"  /* COMPAT_146 */
+#define MOUNT_CONFIGS_DIR "CONFIGS"
+#define CONFIGS_FILE      "mountdata"
+/** Persistent mount data are stored on the disk in this file. */
+#define MOUNT_DATA_FILE    MOUNT_CONFIGS_DIR"/"CONFIGS_FILE
+#define LAST_RCVD	 "last_rcvd"
+#define LOV_OBJID	 "lov_objid"
+#define LOV_OBJSEQ		"lov_objseq"
+#define HEALTH_CHECK      "health_check"
+#define CAPA_KEYS	 "capa_keys"
+#define CHANGELOG_USERS   "changelog_users"
+#define MGS_NIDTBL_DIR    "NIDTBL_VERSIONS"
+#define QMT_DIR	   "quota_master"
+#define QSD_DIR	   "quota_slave"
+
+/****************** persistent mount data *********************/
+
+#define LDD_F_SV_TYPE_MDT   0x0001
+#define LDD_F_SV_TYPE_OST   0x0002
+#define LDD_F_SV_TYPE_MGS   0x0004
+#define LDD_F_SV_TYPE_MASK (LDD_F_SV_TYPE_MDT  | \
+			    LDD_F_SV_TYPE_OST  | \
+			    LDD_F_SV_TYPE_MGS)
+#define LDD_F_SV_ALL	0x0008
+/** need an index assignment */
+#define LDD_F_NEED_INDEX    0x0010
+/** never registered */
+#define LDD_F_VIRGIN	0x0020
+/** update the config logs for this server */
+#define LDD_F_UPDATE	0x0040
+/** rewrite the LDD */
+#define LDD_F_REWRITE_LDD   0x0080
+/** regenerate config logs for this fs or server */
+#define LDD_F_WRITECONF     0x0100
+/** COMPAT_14 */
+#define LDD_F_UPGRADE14     0x0200
+/** process as lctl conf_param */
+#define LDD_F_PARAM	 0x0400
+/** all nodes are specified as service nodes */
+#define LDD_F_NO_PRIMNODE   0x1000
+/** IR enable flag */
+#define LDD_F_IR_CAPABLE    0x2000
+/** the MGS refused to register the target. */
+#define LDD_F_ERROR	 0x4000
+
+/* opc for target register */
+#define LDD_F_OPC_REG   0x10000000
+#define LDD_F_OPC_UNREG 0x20000000
+#define LDD_F_OPC_READY 0x40000000
+#define LDD_F_OPC_MASK  0xf0000000
+
+#define LDD_F_ONDISK_MASK  (LDD_F_SV_TYPE_MASK)
+
+#define LDD_F_MASK	  0xFFFF
+
+/*
+ * Backing filesystem type of a target, stored in
+ * lustre_disk_data::ldd_mount_type.  The values index the string tables
+ * in mt_str() and mt_type(); LDD_MT_LAST is one past the last valid type.
+ */
+enum ldd_mount_type {
+	LDD_MT_EXT3 = 0,
+	LDD_MT_LDISKFS,
+	LDD_MT_SMFS,
+	LDD_MT_REISERFS,
+	LDD_MT_LDISKFS2,
+	LDD_MT_ZFS,
+	LDD_MT_LAST
+};
+
+/**
+ * Map a mount type to its filesystem name.
+ *
+ * \a mt may come from on-disk data (see MT_STR()), so it is range-checked
+ * against the table before being used as an index.
+ *
+ * \retval the filesystem name for a known \a mt
+ * \retval "unknown" when \a mt has no table entry
+ */
+static inline char *mt_str(enum ldd_mount_type mt)
+{
+	static char *mount_type_string[] = {
+		"ext3",
+		"ldiskfs",
+		"smfs",
+		"reiserfs",
+		"ldiskfs2",
+		"zfs",
+	};
+
+	/* The unsigned cast also rejects negative enum values. */
+	if ((unsigned int)mt >=
+	    sizeof(mount_type_string) / sizeof(mount_type_string[0]))
+		return "unknown";
+
+	return mount_type_string[mt];
+}
+
+/**
+ * Map a mount type to its "osd-*" type name.
+ *
+ * \a mt is range-checked against the table before being used as an
+ * index, so an invalid value cannot read past the end of the array.
+ *
+ * \retval the "osd-*" name for a known \a mt
+ * \retval "unknown" when \a mt has no table entry
+ */
+static inline char *mt_type(enum ldd_mount_type mt)
+{
+	static char *mount_type_string[] = {
+		"osd-ldiskfs",
+		"osd-ldiskfs",
+		"osd-smfs",
+		"osd-reiserfs",
+		"osd-ldiskfs",
+		"osd-zfs",
+	};
+
+	/* The unsigned cast also rejects negative enum values. */
+	if ((unsigned int)mt >=
+	    sizeof(mount_type_string) / sizeof(mount_type_string[0]))
+		return "unknown";
+
+	return mount_type_string[mt];
+}
+
+#define LDD_INCOMPAT_SUPP 0
+#define LDD_ROCOMPAT_SUPP 0
+
+#define LDD_MAGIC 0x1dd00001
+
+/*
+ * On-disk configuration file. In host-endian order.
+ *
+ * The numeric markers in front of the trailing members give the byte
+ * offset at which each member starts (the fixed fields before
+ * ldd_userdata add up to 200 bytes).
+ */
+struct lustre_disk_data {
+	__u32      ldd_magic;
+	__u32      ldd_feature_compat;  /* compatible feature flags */
+	__u32      ldd_feature_rocompat;/* read-only compatible feature flags */
+	__u32      ldd_feature_incompat;/* incompatible feature flags */
+
+	__u32      ldd_config_ver;      /* config rewrite count - not used */
+	__u32      ldd_flags;	   /* LDD_SV_TYPE */
+	__u32      ldd_svindex;	 /* server index (0001), must match
+					   svname */
+	__u32      ldd_mount_type;      /* target fs type LDD_MT_* */
+	char       ldd_fsname[64];      /* filesystem this server is part of,
+					   MTI_NAME_MAXLEN */
+	char       ldd_svname[64];      /* this server's name (lustre-mdt0001)*/
+	__u8       ldd_uuid[40];	/* server UUID (COMPAT_146) */
+
+/*200*/ char       ldd_userdata[1024 - 200]; /* arbitrary user string */
+/*1024*/__u8       ldd_padding[4096 - 1024];
+/*4096*/char       ldd_mount_opts[4096]; /* target fs mount opts */
+/*8192*/char       ldd_params[4096];     /* key=value pairs */
+};
+
+
+#define IS_MDT(data)    ((data)->lsi_flags & LDD_F_SV_TYPE_MDT)
+#define IS_OST(data)    ((data)->lsi_flags & LDD_F_SV_TYPE_OST)
+#define IS_MGS(data)    ((data)->lsi_flags & LDD_F_SV_TYPE_MGS)
+#define IS_SERVER(data) ((data)->lsi_flags & (LDD_F_SV_TYPE_MGS | \
+			 LDD_F_SV_TYPE_MDT | LDD_F_SV_TYPE_OST))
+#define MT_STR(data)    mt_str((data)->ldd_mount_type)
+
+/*
+ * Make the mdt/ost server obd name based on the filesystem name.
+ *
+ * For an MDT or OST the name is "<fs, at most 8 chars><sep><MDT|OST><index
+ * as 4 hex digits>", where <sep> is ':' with LDD_F_VIRGIN, '=' with
+ * LDD_F_WRITECONF and '-' otherwise; \a name is left untouched when
+ * LDD_F_SV_ALL is set.  For an MGS the name is "MGS".
+ *
+ * Returns 0 on success and 1 (after logging) for an unknown server type.
+ * \a name is written with sprintf(), so the caller must supply a buffer
+ * large enough for the result.
+ */
+static inline int server_make_name(__u32 flags, __u16 index, char *fs,
+				   char *name)
+{
+	if (flags & (LDD_F_SV_TYPE_MDT | LDD_F_SV_TYPE_OST)) {
+		const char *kind;
+		char sep;
+
+		if (flags & LDD_F_SV_ALL)
+			return 0;
+
+		if (flags & LDD_F_VIRGIN)
+			sep = ':';
+		else if (flags & LDD_F_WRITECONF)
+			sep = '=';
+		else
+			sep = '-';
+
+		kind = (flags & LDD_F_SV_TYPE_MDT) ? "MDT" : "OST";
+		sprintf(name, "%.8s%c%s%04x", fs, sep, kind, index);
+		return 0;
+	}
+
+	if (flags & LDD_F_SV_TYPE_MGS) {
+		sprintf(name, "MGS");
+		return 0;
+	}
+
+	CERROR("unknown server type %#x\n", flags);
+	return 1;
+}
+
+/****************** mount command *********************/
+
+/* The lmd is only used internally by Lustre; mount simply passes
+   everything as string options */
+
+#define LMD_MAGIC    0xbdacbd03
+
+/*
+ * Mount options gleaned from the mount command - no persistent info here.
+ * lmd_flags holds LMD_FLG_* bits.
+ */
+struct lustre_mount_data {
+	__u32      lmd_magic;
+	__u32      lmd_flags;	 /* lustre mount flags */
+	int	lmd_mgs_failnodes; /* mgs failover node count */
+	int	lmd_exclude_count;
+	int	lmd_recovery_time_soft;
+	int	lmd_recovery_time_hard;
+	char      *lmd_dev;	   /* device name */
+	char      *lmd_profile;       /* client only */
+	char      *lmd_mgssec;	/* sptlrpc flavor to mgs */
+	char      *lmd_opts;	  /* lustre mount options (as opposed to
+					 _device_ mount options) */
+	char      *lmd_params;	/* lustre params */
+	__u32     *lmd_exclude;       /* array of OSTs to ignore */
+	char	*lmd_mgs;	   /* MGS nid */
+	char	*lmd_osd_type;      /* OSD type */
+};
+
+#define LMD_FLG_SERVER       0x0001  /* Mounting a server */
+#define LMD_FLG_CLIENT       0x0002  /* Mounting a client */
+#define LMD_FLG_ABORT_RECOV  0x0008  /* Abort recovery */
+#define LMD_FLG_NOSVC	0x0010  /* Only start MGS/MGC for servers,
+					no other services */
+#define LMD_FLG_NOMGS	0x0020  /* Only start target for servers, reusing
+					existing MGS services */
+#define LMD_FLG_WRITECONF    0x0040  /* Rewrite config log */
+#define LMD_FLG_NOIR	 0x0080  /* NO imperative recovery */
+#define LMD_FLG_NOSCRUB	     0x0100  /* Do not trigger scrub automatically */
+#define LMD_FLG_MGS	     0x0200  /* Also start MGS along with server */
+#define LMD_FLG_IAM	     0x0400  /* IAM dir */
+#define LMD_FLG_NO_PRIMNODE  0x0800  /* all nodes are service nodes */
+#define LMD_FLG_VIRGIN	     0x1000  /* the service registers first time */
+#define LMD_FLG_UPDATE	     0x2000  /* update parameters */
+
+#define lmd_is_client(x) ((x)->lmd_flags & LMD_FLG_CLIENT)
+
+
+/****************** last_rcvd file *********************/
+
+/** version recovery epoch */
+#define LR_EPOCH_BITS   32
+#define lr_epoch(a) ((a) >> LR_EPOCH_BITS)
+#define LR_EXPIRE_INTERVALS 16 /**< number of intervals to track transno */
+#define ENOENT_VERSION 1 /** 'virtual' version of non-existent object */
+
+#define LR_SERVER_SIZE   512
+#define LR_CLIENT_START 8192
+#define LR_CLIENT_SIZE   128
+#if LR_CLIENT_START < LR_SERVER_SIZE
+#error "Can't have LR_CLIENT_START < LR_SERVER_SIZE"
+#endif
+
+/*
+ * This limit is arbitrary (131072 clients on x86), but it is convenient to use
+ * 2^n * PAGE_CACHE_SIZE * 8 for the number of bits that fit an order-n allocation.
+ * If we need more than 131072 clients (order-2 allocation on x86) then this
+ * should become an array of single-page pointers that are allocated on demand.
+ */
+#if (128 * 1024UL) > (PAGE_CACHE_SIZE * 8)
+#define LR_MAX_CLIENTS (128 * 1024UL)
+#else
+#define LR_MAX_CLIENTS (PAGE_CACHE_SIZE * 8)
+#endif
+
+/** COMPAT_146: this is an OST (temporary) */
+#define OBD_COMPAT_OST	  0x00000002
+/** COMPAT_146: this is an MDT (temporary) */
+#define OBD_COMPAT_MDT	  0x00000004
+/** 2.0 server, interop flag to show server version is changed */
+#define OBD_COMPAT_20	   0x00000008
+
+/** MDS handles LOV_OBJID file */
+#define OBD_ROCOMPAT_LOVOBJID   0x00000001
+
+/** OST handles group subdirs */
+#define OBD_INCOMPAT_GROUPS     0x00000001
+/** this is an OST */
+#define OBD_INCOMPAT_OST	0x00000002
+/** this is an MDT */
+#define OBD_INCOMPAT_MDT	0x00000004
+/** common last_rcvd format */
+#define OBD_INCOMPAT_COMMON_LR  0x00000008
+/** FID is enabled */
+#define OBD_INCOMPAT_FID	0x00000010
+/** Size-on-MDS is enabled */
+#define OBD_INCOMPAT_SOM	0x00000020
+/** filesystem using iam format to store directory entries */
+#define OBD_INCOMPAT_IAM_DIR    0x00000040
+/** LMA attribute contains per-inode incompatible flags */
+#define OBD_INCOMPAT_LMA	0x00000080
+/** lmm_stripe_count has been shrunk from __u32 to __u16 and the remaining 16
+ * bits are now used to store a generation. Once we start changing the layout
+ * and bumping the generation, old versions expecting a 32-bit lmm_stripe_count
+ * will be confused by interpreting stripe_count | gen << 16 as the actual
+ * stripe count */
+#define OBD_INCOMPAT_LMM_VER    0x00000100
+/** multiple OI files for MDT */
+#define OBD_INCOMPAT_MULTI_OI   0x00000200
+
+/*
+ * Data stored per server at the head of the last_rcvd file.  In le32 order.
+ * This should be common to filter_internal.h, lustre_mds.h
+ *
+ * The named fields occupy 288 bytes; lsd_padding fills the structure out
+ * to LR_SERVER_SIZE.  lsd_le_to_cpu() and lsd_cpu_to_le() convert every
+ * multi-byte field, so they must be updated whenever a field is added.
+ */
+struct lr_server_data {
+	__u8  lsd_uuid[40];	/* server UUID */
+	__u64 lsd_last_transno;    /* last completed transaction ID */
+	__u64 lsd_compat14;	/* reserved - compat with old last_rcvd */
+	__u64 lsd_mount_count;     /* incarnation number */
+	__u32 lsd_feature_compat;  /* compatible feature flags */
+	__u32 lsd_feature_rocompat;/* read-only compatible feature flags */
+	__u32 lsd_feature_incompat;/* incompatible feature flags */
+	__u32 lsd_server_size;     /* size of server data area */
+	__u32 lsd_client_start;    /* start of per-client data area */
+	__u16 lsd_client_size;     /* size of per-client data area */
+	__u16 lsd_subdir_count;    /* number of subdirectories for objects */
+	__u64 lsd_catalog_oid;     /* recovery catalog object id */
+	__u32 lsd_catalog_ogen;    /* recovery catalog inode generation */
+	__u8  lsd_peeruuid[40];    /* UUID of MDS associated with this OST */
+	__u32 lsd_osd_index;       /* index number of OST in LOV */
+	__u32 lsd_padding1;	/* was lsd_mdt_index, unused in 2.4.0 */
+	__u32 lsd_start_epoch;     /* VBR: start epoch from last boot */
+	/** transaction values since lsd_trans_table_time */
+	__u64 lsd_trans_table[LR_EXPIRE_INTERVALS];
+	/** start point of transno table below */
+	__u32 lsd_trans_table_time; /* time of first slot in table above */
+	__u32 lsd_expire_intervals; /* LR_EXPIRE_INTERVALS */
+	__u8  lsd_padding[LR_SERVER_SIZE - 288];
+};
+
+/*
+ * Data stored per client in the last_rcvd file.  In le32 order.
+ *
+ * The named fields occupy 128 bytes, so with LR_CLIENT_SIZE at 128 the
+ * lcd_padding array has zero length.  lcd_le_to_cpu() and lcd_cpu_to_le()
+ * convert every multi-byte field.
+ */
+struct lsd_client_data {
+	__u8  lcd_uuid[40];      /* client UUID */
+	__u64 lcd_last_transno; /* last completed transaction ID */
+	__u64 lcd_last_xid;     /* xid for the last transaction */
+	__u32 lcd_last_result;  /* result from last RPC */
+	__u32 lcd_last_data;    /* per-op data (disposition for open &c.) */
+	/* for MDS_CLOSE requests */
+	__u64 lcd_last_close_transno; /* last completed transaction ID */
+	__u64 lcd_last_close_xid;     /* xid for the last transaction */
+	__u32 lcd_last_close_result;  /* result from last RPC */
+	__u32 lcd_last_close_data;    /* per-op data */
+	/* VBR: last versions */
+	__u64 lcd_pre_versions[4];
+	__u32 lcd_last_epoch;
+	/** orphans handling for delayed export rely on that */
+	__u32 lcd_first_epoch;
+	__u8  lcd_padding[LR_CLIENT_SIZE - 128];
+};
+
+/*
+ * bug20354: the lcd_uuid for export of clients may be wrong
+ *
+ * If lcd_uuid has no NUL within its array, force-terminate it in place
+ * (overwriting the last byte) and report the bad record on the console.
+ * \a obd_name and \a index are used only in that message.
+ */
+static inline void check_lcd(char *obd_name, int index,
+			     struct lsd_client_data *lcd)
+{
+	size_t length = sizeof(lcd->lcd_uuid);
+
+	if (strnlen((char *)lcd->lcd_uuid, length) == length) {
+		lcd->lcd_uuid[length - 1] = '\0';
+
+		/* Adjacent literals are concatenated: keep the space. */
+		LCONSOLE_ERROR("the client UUID (%s) on %s for exports "
+			       "stored in last_rcvd(index = %d) is bad!\n",
+			       lcd->lcd_uuid, obd_name, index);
+	}
+}
+
+/*
+ * last_rcvd handling
+ *
+ * Convert the little-endian server record \a buf into CPU byte order,
+ * storing the result in \a lsd.  Byte arrays are copied verbatim;
+ * lsd_padding is not touched.
+ */
+static inline void lsd_le_to_cpu(struct lr_server_data *buf,
+				 struct lr_server_data *lsd)
+{
+	int slot;
+
+	/* byte arrays carry no byte order */
+	memcpy(lsd->lsd_uuid, buf->lsd_uuid, sizeof(lsd->lsd_uuid));
+	memcpy(lsd->lsd_peeruuid, buf->lsd_peeruuid, sizeof(lsd->lsd_peeruuid));
+
+	/* 64-bit fields */
+	lsd->lsd_last_transno = le64_to_cpu(buf->lsd_last_transno);
+	lsd->lsd_compat14 = le64_to_cpu(buf->lsd_compat14);
+	lsd->lsd_mount_count = le64_to_cpu(buf->lsd_mount_count);
+	lsd->lsd_catalog_oid = le64_to_cpu(buf->lsd_catalog_oid);
+	for (slot = 0; slot < LR_EXPIRE_INTERVALS; slot++)
+		lsd->lsd_trans_table[slot] =
+			le64_to_cpu(buf->lsd_trans_table[slot]);
+
+	/* 32-bit fields */
+	lsd->lsd_feature_compat = le32_to_cpu(buf->lsd_feature_compat);
+	lsd->lsd_feature_rocompat = le32_to_cpu(buf->lsd_feature_rocompat);
+	lsd->lsd_feature_incompat = le32_to_cpu(buf->lsd_feature_incompat);
+	lsd->lsd_server_size = le32_to_cpu(buf->lsd_server_size);
+	lsd->lsd_client_start = le32_to_cpu(buf->lsd_client_start);
+	lsd->lsd_catalog_ogen = le32_to_cpu(buf->lsd_catalog_ogen);
+	lsd->lsd_osd_index = le32_to_cpu(buf->lsd_osd_index);
+	lsd->lsd_padding1 = le32_to_cpu(buf->lsd_padding1);
+	lsd->lsd_start_epoch = le32_to_cpu(buf->lsd_start_epoch);
+	lsd->lsd_trans_table_time = le32_to_cpu(buf->lsd_trans_table_time);
+	lsd->lsd_expire_intervals = le32_to_cpu(buf->lsd_expire_intervals);
+
+	/* 16-bit fields */
+	lsd->lsd_client_size = le16_to_cpu(buf->lsd_client_size);
+	lsd->lsd_subdir_count = le16_to_cpu(buf->lsd_subdir_count);
+}
+
+/*
+ * Convert the CPU-order server record \a lsd into little-endian form,
+ * storing the result in \a buf.  Byte arrays are copied verbatim;
+ * lsd_padding is not touched.
+ */
+static inline void lsd_cpu_to_le(struct lr_server_data *lsd,
+				 struct lr_server_data *buf)
+{
+	int slot;
+
+	/* byte arrays carry no byte order */
+	memcpy(buf->lsd_uuid, lsd->lsd_uuid, sizeof(buf->lsd_uuid));
+	memcpy(buf->lsd_peeruuid, lsd->lsd_peeruuid, sizeof(buf->lsd_peeruuid));
+
+	/* 64-bit fields */
+	buf->lsd_last_transno = cpu_to_le64(lsd->lsd_last_transno);
+	buf->lsd_compat14 = cpu_to_le64(lsd->lsd_compat14);
+	buf->lsd_mount_count = cpu_to_le64(lsd->lsd_mount_count);
+	buf->lsd_catalog_oid = cpu_to_le64(lsd->lsd_catalog_oid);
+	for (slot = 0; slot < LR_EXPIRE_INTERVALS; slot++)
+		buf->lsd_trans_table[slot] =
+			cpu_to_le64(lsd->lsd_trans_table[slot]);
+
+	/* 32-bit fields */
+	buf->lsd_feature_compat = cpu_to_le32(lsd->lsd_feature_compat);
+	buf->lsd_feature_rocompat = cpu_to_le32(lsd->lsd_feature_rocompat);
+	buf->lsd_feature_incompat = cpu_to_le32(lsd->lsd_feature_incompat);
+	buf->lsd_server_size = cpu_to_le32(lsd->lsd_server_size);
+	buf->lsd_client_start = cpu_to_le32(lsd->lsd_client_start);
+	buf->lsd_catalog_ogen = cpu_to_le32(lsd->lsd_catalog_ogen);
+	buf->lsd_osd_index = cpu_to_le32(lsd->lsd_osd_index);
+	buf->lsd_padding1 = cpu_to_le32(lsd->lsd_padding1);
+	buf->lsd_start_epoch = cpu_to_le32(lsd->lsd_start_epoch);
+	buf->lsd_trans_table_time = cpu_to_le32(lsd->lsd_trans_table_time);
+	buf->lsd_expire_intervals = cpu_to_le32(lsd->lsd_expire_intervals);
+
+	/* 16-bit fields */
+	buf->lsd_client_size = cpu_to_le16(lsd->lsd_client_size);
+	buf->lsd_subdir_count = cpu_to_le16(lsd->lsd_subdir_count);
+}
+
+/*
+ * Convert the little-endian client record \a buf into CPU byte order,
+ * storing the result in \a lcd.  lcd_uuid is copied verbatim.
+ */
+static inline void lcd_le_to_cpu(struct lsd_client_data *buf,
+				 struct lsd_client_data *lcd)
+{
+	const int nr_versions = sizeof(lcd->lcd_pre_versions) /
+				sizeof(lcd->lcd_pre_versions[0]);
+	int v;
+
+	memcpy(lcd->lcd_uuid, buf->lcd_uuid, sizeof(lcd->lcd_uuid));
+
+	/* 64-bit fields */
+	lcd->lcd_last_transno = le64_to_cpu(buf->lcd_last_transno);
+	lcd->lcd_last_xid = le64_to_cpu(buf->lcd_last_xid);
+	lcd->lcd_last_close_transno = le64_to_cpu(buf->lcd_last_close_transno);
+	lcd->lcd_last_close_xid = le64_to_cpu(buf->lcd_last_close_xid);
+	for (v = 0; v < nr_versions; v++)
+		lcd->lcd_pre_versions[v] =
+			le64_to_cpu(buf->lcd_pre_versions[v]);
+
+	/* 32-bit fields */
+	lcd->lcd_last_result = le32_to_cpu(buf->lcd_last_result);
+	lcd->lcd_last_data = le32_to_cpu(buf->lcd_last_data);
+	lcd->lcd_last_close_result = le32_to_cpu(buf->lcd_last_close_result);
+	lcd->lcd_last_close_data = le32_to_cpu(buf->lcd_last_close_data);
+	lcd->lcd_last_epoch = le32_to_cpu(buf->lcd_last_epoch);
+	lcd->lcd_first_epoch = le32_to_cpu(buf->lcd_first_epoch);
+}
+
+/*
+ * Convert the CPU-order client record \a lcd into little-endian form,
+ * storing the result in \a buf.  lcd_uuid is copied verbatim.
+ */
+static inline void lcd_cpu_to_le(struct lsd_client_data *lcd,
+				 struct lsd_client_data *buf)
+{
+	const int nr_versions = sizeof(lcd->lcd_pre_versions) /
+				sizeof(lcd->lcd_pre_versions[0]);
+	int v;
+
+	memcpy(buf->lcd_uuid, lcd->lcd_uuid, sizeof(lcd->lcd_uuid));
+
+	/* 64-bit fields */
+	buf->lcd_last_transno = cpu_to_le64(lcd->lcd_last_transno);
+	buf->lcd_last_xid = cpu_to_le64(lcd->lcd_last_xid);
+	buf->lcd_last_close_transno = cpu_to_le64(lcd->lcd_last_close_transno);
+	buf->lcd_last_close_xid = cpu_to_le64(lcd->lcd_last_close_xid);
+	for (v = 0; v < nr_versions; v++)
+		buf->lcd_pre_versions[v] =
+			cpu_to_le64(lcd->lcd_pre_versions[v]);
+
+	/* 32-bit fields */
+	buf->lcd_last_result = cpu_to_le32(lcd->lcd_last_result);
+	buf->lcd_last_data = cpu_to_le32(lcd->lcd_last_data);
+	buf->lcd_last_close_result = cpu_to_le32(lcd->lcd_last_close_result);
+	buf->lcd_last_close_data = cpu_to_le32(lcd->lcd_last_close_data);
+	buf->lcd_last_epoch = cpu_to_le32(lcd->lcd_last_epoch);
+	buf->lcd_first_epoch = cpu_to_le32(lcd->lcd_first_epoch);
+}
+
+/*
+ * Return the larger of the client's last transaction number and its last
+ * close transaction number.
+ */
+static inline __u64 lcd_last_transno(struct lsd_client_data *lcd)
+{
+	__u64 regular = lcd->lcd_last_transno;
+	__u64 closing = lcd->lcd_last_close_transno;
+
+	if (regular > closing)
+		return regular;
+
+	return closing;
+}
+
+/*
+ * Return the larger of the client's last xid and its last close xid.
+ */
+static inline __u64 lcd_last_xid(struct lsd_client_data *lcd)
+{
+	__u64 regular = lcd->lcd_last_xid;
+	__u64 closing = lcd->lcd_last_close_xid;
+
+	if (regular > closing)
+		return regular;
+
+	return closing;
+}
+
+/****************** superblock additional info *********************/
+
+struct ll_sb_info;
+
+/*
+ * Lustre-private superblock data, reached through sb->s_fs_info (see
+ * s2lsi()).  lsi_flags is tested for the LDD_F_SV_TYPE_* bits by the
+ * IS_MDT()/IS_OST()/IS_MGS()/IS_SERVER() macros.
+ */
+struct lustre_sb_info {
+	int		       lsi_flags;
+	struct obd_device	*lsi_mgc;     /* mgc obd */
+	struct lustre_mount_data *lsi_lmd;     /* mount command info */
+	struct ll_sb_info	*lsi_llsbi;   /* add'l client sbi info */
+	struct dt_device	 *lsi_dt_dev;  /* dt device to access disk fs*/
+	struct vfsmount	  *lsi_srv_mnt; /* the one server mount */
+	atomic_t	      lsi_mounts;  /* references to the srv_mnt */
+	char			  lsi_svname[MTI_NAME_MAXLEN];
+	char			  lsi_osd_obdname[64];
+	char			  lsi_osd_uuid[64];
+	struct obd_export	 *lsi_osd_exp;
+	char			  lsi_osd_type[16];
+	char			  lsi_fstype[16];
+	struct backing_dev_info   lsi_bdi;     /* each client mountpoint needs
+						  own backing_dev_info */
+};
+
+#define LSI_UMOUNT_FAILOVER	      0x00200000
+#define LSI_BDI_INITIALIZED	      0x00400000
+
+#define     s2lsi(sb)	((struct lustre_sb_info *)((sb)->s_fs_info))
+#define     s2lsi_nocast(sb) ((sb)->s_fs_info)
+
+#define     get_profile_name(sb)   (s2lsi(sb)->lsi_lmd->lmd_profile)
+#define	    get_mount_flags(sb)	   (s2lsi(sb)->lsi_lmd->lmd_flags)
+#define	    get_mntdev_name(sb)	   (s2lsi(sb)->lsi_lmd->lmd_dev)
+
+
+/****************** mount lookup info *********************/
+
+/*
+ * One entry of the mount lookup list: a name together with its
+ * super_block and vfsmount, chained through lmi_list_chain.
+ */
+struct lustre_mount_info {
+	char		 *lmi_name;
+	struct super_block   *lmi_sb;
+	struct vfsmount      *lmi_mnt;
+	struct list_head	    lmi_list_chain;
+};
+
+/****************** prototypes *********************/
+
+/* obd_mount.c */
+int server_name2fsname(const char *svname, char *fsname, const char **endptr);
+int server_name2index(const char *svname, __u32 *idx, const char **endptr);
+int server_name2svname(const char *label, char *svname, const char **endptr,
+		       size_t svsize);
+
+int lustre_put_lsi(struct super_block *sb);
+int lustre_start_simple(char *obdname, char *type, char *uuid,
+			char *s1, char *s2, char *s3, char *s4);
+int lustre_start_mgc(struct super_block *sb);
+void lustre_register_client_fill_super(int (*cfs)(struct super_block *sb,
+						  struct vfsmount *mnt));
+void lustre_register_kill_super_cb(void (*cfs)(struct super_block *sb));
+int lustre_common_put_super(struct super_block *sb);
+
+
+int mgc_fsname2resid(char *fsname, struct ldlm_res_id *res_id, int type);
+
+/** @} disk */
+
+#endif // _LUSTRE_DISK_H

diff --git a/drivers/staging/lustre/lustre/include/lustre_dlm.h b/drivers/staging/lustre/lustre/include/lustre_dlm.h
new file mode 100644
index 0000000..317f928
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_dlm.h

@@ -0,0 +1,1671 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2010, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+/** \defgroup LDLM Lustre Distributed Lock Manager
+ *
+ * Lustre DLM is based on VAX DLM.
+ * Its two main roles are:
+ *   - To provide locking assuring consistency of data on all Lustre nodes.
+ *   - To allow clients to cache state protected by a lock by holding the
+ *     lock until a conflicting lock is requested or it is expired by the LRU.
+ *
+ * @{
+ */
+
+#ifndef _LUSTRE_DLM_H__
+#define _LUSTRE_DLM_H__
+
+#include <linux/lustre_dlm.h>
+
+#include <lustre_lib.h>
+#include <lustre_net.h>
+#include <lustre_import.h>
+#include <lustre_handles.h>
+#include <interval_tree.h> /* for interval_node{}, ldlm_extent */
+#include <lu_ref.h>
+
+struct obd_ops;
+struct obd_device;
+
+#define OBD_LDLM_DEVICENAME  "ldlm"
+
+#define LDLM_DEFAULT_LRU_SIZE (100 * num_online_cpus())
+#define LDLM_DEFAULT_MAX_ALIVE (cfs_time_seconds(36000))
+#define LDLM_CTIME_AGE_LIMIT (10)
+#define LDLM_DEFAULT_PARALLEL_AST_LIMIT 1024
+
+/**
+ * LDLM non-error return states
+ */
+typedef enum {
+	ELDLM_OK = 0,
+
+	ELDLM_LOCK_CHANGED = 300,
+	ELDLM_LOCK_ABORTED = 301,
+	ELDLM_LOCK_REPLACED = 302,
+	ELDLM_NO_LOCK_DATA = 303,
+	ELDLM_LOCK_WOULDBLOCK = 304,
+
+	ELDLM_NAMESPACE_EXISTS = 400,
+	ELDLM_BAD_NAMESPACE    = 401
+} ldlm_error_t;
+
+/**
+ * LDLM namespace type.
+ * The "client" type is actually an indication that this is a narrow local view
+ * into complete namespace on the server. Such namespaces cannot make any
+ * decisions about lack of conflicts or do any autonomous lock granting without
+ * first speaking to a server.
+ */
+typedef enum {
+	LDLM_NAMESPACE_SERVER = 1 << 0,
+	LDLM_NAMESPACE_CLIENT = 1 << 1
+} ldlm_side_t;
+
+/**
+ * Declaration of flags sent through the wire.
+ **/
+#define LDLM_FL_LOCK_CHANGED   0x000001 /* extent, mode, or resource changed */
+
+/**
+ * If the server returns one of these flags, then the lock was put on that list.
+ * If the client sends one of these flags (during recovery ONLY!), it wants the
+ * lock added to the specified list, no questions asked.
+ */
+#define LDLM_FL_BLOCK_GRANTED  0x000002
+#define LDLM_FL_BLOCK_CONV     0x000004
+#define LDLM_FL_BLOCK_WAIT     0x000008
+
+/* Used to be LDLM_FL_CBPENDING 0x000010 moved to non-wire flags */
+
+#define LDLM_FL_AST_SENT       0x000020 /* blocking or cancel packet was
+					 * queued for sending. */
+/* Used to be LDLM_FL_WAIT_NOREPROC 0x000040   moved to non-wire flags */
+/* Used to be LDLM_FL_CANCEL	0x000080   moved to non-wire flags */
+
+/**
+ * Lock is being replayed.  This could probably be implied by the fact that one
+ * of BLOCK_{GRANTED,CONV,WAIT} is set, but that is pretty dangerous.
+ */
+#define LDLM_FL_REPLAY	 0x000100
+
+#define LDLM_FL_INTENT_ONLY    0x000200 /* Don't grant lock, just do intent. */
+
+/* Used to be LDLM_FL_LOCAL_ONLY 0x000400  moved to non-wire flags */
+/* Used to be LDLM_FL_FAILED     0x000800  moved to non-wire flags */
+
+#define LDLM_FL_HAS_INTENT     0x001000 /* lock request has intent */
+
+/* Used to be LDLM_FL_CANCELING  0x002000  moved to non-wire flags */
+/* Used to be LDLM_FL_LOCAL      0x004000  moved to non-wire flags */
+
+#define LDLM_FL_DISCARD_DATA   0x010000 /* discard (no writeback) on cancel */
+
+#define LDLM_FL_NO_TIMEOUT     0x020000 /* Blocked by group lock - wait
+					 * indefinitely */
+
+/** file & record locking */
+#define LDLM_FL_BLOCK_NOWAIT   0x040000 /* Server told not to wait if blocked.
+					 * For AGL, OST will not send glimpse
+					 * callback. */
+#define LDLM_FL_TEST_LOCK      0x080000 // return blocking lock
+
+/* Used to be LDLM_FL_LVB_READY  0x100000 moved to non-wire flags */
+/* Used to be LDLM_FL_KMS_IGNORE 0x200000 moved to non-wire flags */
+/* Used to be LDLM_FL_NO_LRU     0x400000 moved to non-wire flags */
+
+/* Immediately cancel such locks when they block some other locks. Send
+ * cancel notification to original lock holder, but expect no reply. This is
+ * for clients (like liblustre) that cannot be expected to reliably respond
+ * to blocking AST. */
+#define LDLM_FL_CANCEL_ON_BLOCK 0x800000
+
+/* Flags inherited from parent lock when doing intents. */
+#define LDLM_INHERIT_FLAGS     (LDLM_FL_CANCEL_ON_BLOCK)
+
+/* Used to be LDLM_FL_CP_REQD	0x1000000 moved to non-wire flags */
+/* Used to be LDLM_FL_CLEANED	0x2000000 moved to non-wire flags */
+/* Used to be LDLM_FL_ATOMIC_CB      0x4000000 moved to non-wire flags */
+/* Used to be LDLM_FL_BL_AST	 0x10000000 moved to non-wire flags */
+/* Used to be LDLM_FL_BL_DONE	0x20000000 moved to non-wire flags */
+
+/* measure lock contention and return -EUSERS if locking contention is high */
+#define LDLM_FL_DENY_ON_CONTENTION 0x40000000
+
+/* These are flags that are mapped into the flags and ASTs of blocking locks */
+#define LDLM_AST_DISCARD_DATA  0x80000000 /* Add FL_DISCARD to blocking ASTs */
+
+/* Flags sent in AST lock_flags to be mapped into the receiving lock. */
+#define LDLM_AST_FLAGS	 (LDLM_FL_DISCARD_DATA)
+
+/*
+ * --------------------------------------------------------------------------
+ * NOTE! Starting from this point, that is, LDLM_FL_* flags with values above
+ * 0x80000000 will not be sent over the wire.
+ * --------------------------------------------------------------------------
+ */
+
+/**
+ * Declaration of flags not sent through the wire.
+ **/
+
+/**
+ * Used for marking lock as a target for -EINTR while cp_ast sleep
+ * emulation + race with upcoming bl_ast.
+ */
+#define LDLM_FL_FAIL_LOC       0x100000000ULL
+
+/**
+ * Used while processing the unused list to know that we have already
+ * handled this lock and decided to skip it.
+ */
+#define LDLM_FL_SKIPPED	0x200000000ULL
+/* this lock is being destroyed */
+#define LDLM_FL_CBPENDING      0x400000000ULL
+/* not a real flag, not saved in lock */
+#define LDLM_FL_WAIT_NOREPROC  0x800000000ULL
+/* cancellation callback already run */
+#define LDLM_FL_CANCEL	 0x1000000000ULL
+#define LDLM_FL_LOCAL_ONLY     0x2000000000ULL
+/* don't run the cancel callback under ldlm_cli_cancel_unused */
+#define LDLM_FL_FAILED	 0x4000000000ULL
+/* lock cancel has already been sent */
+#define LDLM_FL_CANCELING      0x8000000000ULL
+/* local lock (ie, no srv/cli split) */
+#define LDLM_FL_LOCAL	  0x10000000000ULL
+/* XXX FIXME: This is being added to b_size as a low-risk fix to the fact that
+ * the LVB filling happens _after_ the lock has been granted, so another thread
+ * can match it before the LVB has been updated.  As a dirty hack, we set
+ * LDLM_FL_LVB_READY only after we've done the LVB poop.
+ * this is only needed on LOV/OSC now, where LVB is actually used and callers
+ * must set it in input flags.
+ *
+ * The proper fix is to do the granting inside of the completion AST, which can
+ * be replaced with a LVB-aware wrapping function for OSC locks.  That change is
+ * pretty high-risk, though, and would need a lot more testing. */
+#define LDLM_FL_LVB_READY      0x20000000000ULL
+/* A lock contributes to the known minimum size (KMS) calculation until it has
+ * finished the part of its cancelation that performs write back on its dirty
+ * pages.  It can remain on the granted list during this whole time.  Threads
+ * racing to update the KMS after performing their writeback need to know to
+ * exclude each other's locks from the calculation as they walk the granted
+ * list. */
+#define LDLM_FL_KMS_IGNORE     0x40000000000ULL
+/* completion AST to be executed */
+#define LDLM_FL_CP_REQD	0x80000000000ULL
+/* cleanup_resource has already handled the lock */
+#define LDLM_FL_CLEANED	0x100000000000ULL
+/* optimization hint: LDLM can run blocking callback from current context
+ * w/o involving separate thread, in order to decrease context-switch rate */
+#define LDLM_FL_ATOMIC_CB      0x200000000000ULL
+
+/* It may happen that a client initiates two operations, e.g. unlink and
+ * mkdir, such that the server sends a blocking AST for conflicting
+ * locks to this client for the first operation, whereas the second
+ * operation has canceled this lock and is waiting for rpc_lock which is
+ * taken by the first operation. LDLM_FL_BL_AST is set by
+ * ldlm_callback_handler() in the lock to prevent the Early Lock Cancel
+ * (ELC) code from cancelling it.
+ *
+ * LDLM_FL_BL_DONE is to be set by ldlm_cancel_callback() when lock
+ * cache is dropped to let ldlm_callback_handler() return EINVAL to the
+ * server. It is used when ELC RPC is already prepared and is waiting
+ * for rpc_lock, too late to send a separate CANCEL RPC. */
+#define LDLM_FL_BL_AST	  0x400000000000ULL
+#define LDLM_FL_BL_DONE	 0x800000000000ULL
+/* Don't put lock into the LRU list, so that it is not canceled due to aging.
+ * Used by MGC locks, they are cancelled only at unmount or by callback. */
+#define LDLM_FL_NO_LRU		0x1000000000000ULL
+
+/**
+ * The blocking callback is overloaded to perform two functions.  These flags
+ * indicate which operation should be performed.
+ */
+#define LDLM_CB_BLOCKING    1
+#define LDLM_CB_CANCELING   2
+
+/**
+ * \name Lock Compatibility Matrix.
+ *
+ * A lock has both a type (extent, flock, inode bits, or plain) and a mode.
+ * Lock types are described in their respective implementation files:
+ * ldlm_{extent,flock,inodebits,plain}.c.
+ *
+ * There are six lock modes along with a compatibility matrix to indicate if
+ * two locks are compatible.
+ *
+ * - EX: Exclusive mode. Before a new file is created, MDS requests EX lock
+ *   on the parent.
+ * - PW: Protective Write (normal write) mode. When a client requests a write
+ *   lock from an OST, a lock with PW mode will be issued.
+ * - PR: Protective Read (normal read) mode. When a client requests a read from
+ *   an OST, a lock with PR mode will be issued. Also, if the client opens a
+ *   file for execution, it is granted a lock with PR mode.
+ * - CW: Concurrent Write mode. The type of lock that the MDS grants if a client
+ *   requests a write lock during a file open operation.
+ * - CR: Concurrent Read mode. When a client performs a path lookup, MDS grants
+ *   an inodebit lock with the CR mode on the intermediate path component.
+ * - NL: Null mode.
+ *
+ * <PRE>
+ *       NL  CR  CW  PR  PW  EX
+ *  NL    1   1   1   1   1   1
+ *  CR    1   1   1   1   1   0
+ *  CW    1   1   1   0   0   0
+ *  PR    1   1   0   1   0   0
+ *  PW    1   1   0   0   0   0
+ *  EX    1   0   0   0   0   0
+ * </PRE>
+ */
+/** @{ */
+#define LCK_COMPAT_EX  LCK_NL
+#define LCK_COMPAT_PW  (LCK_COMPAT_EX | LCK_CR)
+#define LCK_COMPAT_PR  (LCK_COMPAT_PW | LCK_PR)
+#define LCK_COMPAT_CW  (LCK_COMPAT_PW | LCK_CW)
+#define LCK_COMPAT_CR  (LCK_COMPAT_CW | LCK_PR | LCK_PW)
+#define LCK_COMPAT_NL  (LCK_COMPAT_CR | LCK_EX | LCK_GROUP)
+#define LCK_COMPAT_GROUP  (LCK_GROUP | LCK_NL)
+#define LCK_COMPAT_COS (LCK_COS)
+/** @} Lock Compatibility Matrix */
+
+extern ldlm_mode_t lck_compat_array[];
+
+static inline void lockmode_verify(ldlm_mode_t mode)
+{
+       LASSERT(mode > LCK_MINMODE && mode < LCK_MAXMODE); /* exclusive bounds */
+}
+
+static inline int lockmode_compat(ldlm_mode_t exist_mode, ldlm_mode_t new_mode)
+{
+	return new_mode & lck_compat_array[exist_mode];
+}
+
+/*
+ *
+ * cluster name spaces
+ *
+ */
+
+#define DLM_OST_NAMESPACE 1
+#define DLM_MDS_NAMESPACE 2
+
+/* XXX
+   - do we just separate this by security domains and use a prefix for
+     multiple namespaces in the same domain?
+   -
+*/
+
+/**
+ * Locking rules for LDLM:
+ *
+ * lr_lock
+ *
+ * lr_lock
+ *     waiting_locks_spinlock
+ *
+ * lr_lock
+ *     led_lock
+ *
+ * lr_lock
+ *     ns_lock
+ *
+ * lr_lvb_mutex
+ *     lr_lock
+ *
+ */
+
+struct ldlm_pool;
+struct ldlm_lock;
+struct ldlm_resource;
+struct ldlm_namespace;
+
+/**
+ * Operations on LDLM pools.
+ * An LDLM pool is a pool of locks in the namespace without any implicitly
+ * specified limits. Locks in the pool are organized in LRU order.
+ * Local memory pressure or server instructions (e.g. mempressure on server)
+ * can trigger freeing of locks from the pool.
+ * NOTE(review): po_setup is undocumented; presumably sets the pool limit.
+ */
+struct ldlm_pool_ops {
+	/** Recalculate pool \a pl usage */
+	int (*po_recalc)(struct ldlm_pool *pl);
+	/** Cancel at least \a nr locks from pool \a pl */
+	int (*po_shrink)(struct ldlm_pool *pl, int nr,
+			 unsigned int gfp_mask);
+	int (*po_setup)(struct ldlm_pool *pl, int limit);
+};
+
+/** One second for pools thread check interval. Each pool has own period. */
+#define LDLM_POOLS_THREAD_PERIOD (1)
+
+/** ~6% margin for modest pools. See ldlm_pool.c for details. */
+#define LDLM_POOLS_MODEST_MARGIN_SHIFT (4)
+
+/** Default recalc period for server side pools in sec. */
+#define LDLM_POOL_SRV_DEF_RECALC_PERIOD (1)
+
+/** Default recalc period for client side pools in sec. */
+#define LDLM_POOL_CLI_DEF_RECALC_PERIOD (10)
+
+/**
+ * LDLM pool structure to track granted locks.
+ * For purposes of determining when to release locks on e.g. memory pressure.
+ * This feature is commonly referred to as lru_resize.
+ */
+struct ldlm_pool {
+	/** Pool proc directory. */
+	proc_dir_entry_t	*pl_proc_dir;
+	/** Pool name, must be long enough to hold compound proc entry name. */
+	char			pl_name[100];
+	/** Lock for protecting SLV/CLV updates. */
+	spinlock_t		pl_lock;
+	/** Number of allowed locks in pool, on both client and server side. */
+	atomic_t		pl_limit;
+	/** Number of granted locks in pool. */
+	atomic_t		pl_granted;
+	/** Grant rate per T. */
+	atomic_t		pl_grant_rate;
+	/** Cancel rate per T. */
+	atomic_t		pl_cancel_rate;
+	/** Server lock volume (SLV). Protected by pl_lock. */
+	__u64			pl_server_lock_volume;
+	/** Current biggest client lock volume. Protected by pl_lock. */
+	__u64			pl_client_lock_volume;
+	/** Lock volume factor. SLV on client is calculated as follows:
+	 *  server_slv * lock_volume_factor. */
+	atomic_t		pl_lock_volume_factor;
+	/** Time when last SLV from server was obtained. */
+	time_t			pl_recalc_time;
+	/** Recalculation period for pool. */
+	time_t			pl_recalc_period;
+	/** Recalculation and shrink operations. */
+	struct ldlm_pool_ops	*pl_ops;
+	/** Number of planned locks for next period. */
+	int			pl_grant_plan;
+	/** Pool statistics. */
+	struct lprocfs_stats	*pl_stats;
+};
+
+typedef int (*ldlm_res_policy)(struct ldlm_namespace *, struct ldlm_lock **,
+			       void *req_cookie, ldlm_mode_t mode, __u64 flags,
+			       void *data);
+
+typedef int (*ldlm_cancel_for_recovery)(struct ldlm_lock *lock);
+
+/**
+ * LVB operations.
+ * LVB is Lock Value Block. This is a special opaque (to LDLM) value that could
+ * be associated with an LDLM lock and transferred from client to server and
+ * back.
+ *
+ * Currently LVBs are used by:
+ *  - OSC-OST code to maintain current object size/times
+ *  - layout lock code to return the layout when the layout lock is granted
+ */
+struct ldlm_valblock_ops {
+	int (*lvbo_init)(struct ldlm_resource *res);
+	int (*lvbo_update)(struct ldlm_resource *res,
+			   struct ptlrpc_request *r,
+			   int increase);
+	int (*lvbo_free)(struct ldlm_resource *res);
+	/* Return size of LVB data so appropriate RPC size can be reserved */
+	int (*lvbo_size)(struct ldlm_lock *lock);
+	/* Called to fill in LVB data to RPC buffer @buf of size @buflen */
+	int (*lvbo_fill)(struct ldlm_lock *lock, void *buf, int buflen);
+};
+
+/**
+ * LDLM pools related, type of lock pool in the namespace.
+ * Greedy means release cached locks aggressively
+ */
+typedef enum {
+	LDLM_NAMESPACE_GREEDY = 1 << 0,
+	LDLM_NAMESPACE_MODEST = 1 << 1
+} ldlm_appetite_t;
+
+/**
+ * Default values for the "max_nolock_size", "contention_time" and
+ * "contended_locks" namespace tunables.
+ */
+#define NS_DEFAULT_MAX_NOLOCK_BYTES 0
+#define NS_DEFAULT_CONTENTION_SECONDS 2
+#define NS_DEFAULT_CONTENDED_LOCKS 32
+
+struct ldlm_ns_bucket {
+	/** back pointer to namespace */
+	struct ldlm_namespace      *nsb_namespace;
+	/**
+	 * Estimated lock callback time.  Used by adaptive timeout code to
+	 * avoid spurious client evictions due to unresponsiveness when in
+	 * fact the network or overall system load is at fault
+	 */
+	struct adaptive_timeout     nsb_at_estimate;
+};
+
+enum {
+	/** LDLM namespace lock stats */
+	LDLM_NSS_LOCKS	  = 0,
+	LDLM_NSS_LAST
+};
+
+typedef enum {
+	/** invalid type */
+	LDLM_NS_TYPE_UNKNOWN    = 0,
+	/** mdc namespace */
+	LDLM_NS_TYPE_MDC,
+	/** mds namespace */
+	LDLM_NS_TYPE_MDT,
+	/** osc namespace */
+	LDLM_NS_TYPE_OSC,
+	/** ost namespace */
+	LDLM_NS_TYPE_OST,
+	/** mgc namespace */
+	LDLM_NS_TYPE_MGC,
+	/** mgs namespace */
+	LDLM_NS_TYPE_MGT,
+} ldlm_ns_type_t;
+
+/**
+ * LDLM Namespace.
+ *
+ * Namespace serves to contain locks related to a particular service.
+ * There are two kinds of namespaces:
+ * - Server namespace has knowledge of all locks and is therefore authoritative
+ *   to make decisions like what locks could be granted and what conflicts
+ *   exist during new lock enqueue.
+ * - Client namespace only has limited knowledge about locks in the namespace,
+ *   only seeing locks held by the client.
+ *
+ * Every Lustre service has one server namespace present on the server serving
+ * that service. Every client connected to the service has a client namespace
+ * for it.
+ * Every lock obtained by client in that namespace is actually represented by
+ * two in-memory locks. One on the server and one on the client. The locks are
+ * linked by a special cookie by which one node can tell to the other which lock
+ * it actually means during communications. Such locks are called remote locks.
+ * The locks held by server only without any reference to a client are called
+ * local locks.
+ */
+struct ldlm_namespace {
+	/** Backward link to OBD, required for LDLM pool to store new SLV. */
+	struct obd_device	*ns_obd;
+
+	/** Flag indicating if namespace is on client instead of server */
+	ldlm_side_t		ns_client;
+
+	/** Resource hash table for namespace. */
+	cfs_hash_t		*ns_rs_hash;
+
+	/** Serializing lock; l_lru linkage in ldlm_lock is protected by it. */
+	spinlock_t		ns_lock;
+
+	/** big refcount (by bucket) */
+	atomic_t		ns_bref;
+
+	/**
+	 * Namespace connect flags supported by server (may be changed via
+	 * /proc, LRU resize may be disabled/enabled).
+	 */
+	__u64			ns_connect_flags;
+
+	/** Client side original connect flags supported by server. */
+	__u64			ns_orig_connect_flags;
+
+	/** Namespace proc dir entry. */
+	struct proc_dir_entry	*ns_proc_dir_entry;
+
+	/**
+	 * Position in global namespace list linking all namespaces on
+	 * the node.
+	 */
+	struct list_head		ns_list_chain;
+
+	/**
+	 * List of unused locks for this namespace. This list is also called
+	 * LRU lock list.
+	 * Unused locks are locks with zero reader/writer reference counts.
+	 * This list is only used on clients for lock caching purposes.
+	 * When we want to release some locks voluntarily or if server wants
+	 * us to release some locks due to e.g. memory pressure, we take locks
+	 * to release from the head of this list.
+	 * Locks are linked via l_lru field in \see struct ldlm_lock.
+	 */
+	struct list_head		ns_unused_list;
+	/** Number of locks in the LRU list above */
+	int			ns_nr_unused;
+
+	/**
+	 * Maximum number of locks permitted in the LRU. If 0, means locks
+	 * are managed by pools and there is no preset limit, rather it is all
+	 * controlled by available memory on this client and on server.
+	 */
+	unsigned int		ns_max_unused;
+	/** Maximum allowed age (last used time) for locks in the LRU */
+	unsigned int		ns_max_age;
+	/**
+	 * Server only: number of times we evicted clients due to lack of reply
+	 * to ASTs.
+	 */
+	unsigned int		ns_timeouts;
+	/**
+	 * Number of seconds since the file change time after which the
+	 * MDT will return an UPDATE lock along with a LOOKUP lock.
+	 * This allows the client to start caching negative dentries
+	 * for a directory and may save an RPC for a later stat.
+	 */
+	unsigned int		ns_ctime_age_limit;
+
+	/**
+	 * Used to rate-limit ldlm_namespace_dump calls.
+	 * \see ldlm_namespace_dump. Increased by 10 seconds every time
+	 * it is called.
+	 */
+	cfs_time_t		ns_next_dump;
+
+	/** "policy" function that does actual lock conflict determination */
+	ldlm_res_policy		ns_policy;
+
+	/**
+	 * LVB operations for this namespace.
+	 * \see struct ldlm_valblock_ops
+	 */
+	struct ldlm_valblock_ops *ns_lvbo;
+
+	/**
+	 * Used by filter code to store pointer to OBD of the service.
+	 * Should be dropped in favor of \a ns_obd
+	 */
+	void			*ns_lvbp;
+
+	/**
+	 * Wait queue used by __ldlm_namespace_free. Gets woken up every time
+	 * a resource is removed.
+	 */
+	wait_queue_head_t		ns_waitq;
+	/** LDLM pool structure for this namespace */
+	struct ldlm_pool	ns_pool;
+	/** Definition of how eagerly unused locks will be released from LRU */
+	ldlm_appetite_t		ns_appetite;
+
+	/**
+	 * If more than \a ns_contended_locks are found, the resource is
+	 * considered to be contended. Lock enqueues might specify that no
+	 * contended locks should be granted
+	 */
+	unsigned		ns_contended_locks;
+
+	/**
+	 * The resources in this namespace remember contended state during
+	 * \a ns_contention_time, in seconds.
+	 */
+	unsigned		ns_contention_time;
+
+	/**
+	 * Limit size of contended extent locks, in bytes.
+	 * If an extent lock is requested for more than this many bytes and
+	 * caller instructs us not to grant contended locks, we would disregard
+	 * such a request.
+	 */
+	unsigned		ns_max_nolock_size;
+
+	/** Limit of parallel AST RPC count. */
+	unsigned		ns_max_parallel_ast;
+
+	/** Callback to cancel locks before replaying them during recovery. */
+	ldlm_cancel_for_recovery ns_cancel_for_recovery;
+
+	/** LDLM lock stats */
+	struct lprocfs_stats	*ns_stats;
+
+	/**
+	 * Flag to indicate namespace is being freed. Used to determine if
+	 * recalculation of LDLM pool statistics should be skipped.
+	 */
+	unsigned		ns_stopping:1;
+};
+
+/**
+ * Returns 1 if \a ns is a client namespace, 0 otherwise (side is asserted).
+ */
+static inline int ns_is_client(struct ldlm_namespace *ns)
+{
+	LASSERT(ns != NULL);
+	LASSERT(!(ns->ns_client & ~(LDLM_NAMESPACE_CLIENT |
+				    LDLM_NAMESPACE_SERVER)));
+	LASSERT(ns->ns_client == LDLM_NAMESPACE_CLIENT ||
+		ns->ns_client == LDLM_NAMESPACE_SERVER);
+	return ns->ns_client == LDLM_NAMESPACE_CLIENT;
+}
+
+/**
+ * Returns 1 if \a ns is a server namespace, 0 otherwise (side is asserted).
+ */
+static inline int ns_is_server(struct ldlm_namespace *ns)
+{
+	LASSERT(ns != NULL);
+	LASSERT(!(ns->ns_client & ~(LDLM_NAMESPACE_CLIENT |
+				    LDLM_NAMESPACE_SERVER)));
+	LASSERT(ns->ns_client == LDLM_NAMESPACE_CLIENT ||
+		ns->ns_client == LDLM_NAMESPACE_SERVER);
+	return ns->ns_client == LDLM_NAMESPACE_SERVER;
+}
+
+/**
+ * Tests OBD_CONNECT_CANCELSET (early lock cancel, ELC) on \a ns: 1 or 0.
+ */
+static inline int ns_connect_cancelset(struct ldlm_namespace *ns)
+{
+	LASSERT(ns != NULL);
+	return (ns->ns_connect_flags & OBD_CONNECT_CANCELSET) != 0;
+}
+
+/**
+ * Tests OBD_CONNECT_LRU_RESIZE (lru_resize support) on \a ns: 1 or 0.
+ */
+static inline int ns_connect_lru_resize(struct ldlm_namespace *ns)
+{
+	LASSERT(ns != NULL);
+	return (ns->ns_connect_flags & OBD_CONNECT_LRU_RESIZE) != 0;
+}
+
+static inline void ns_register_cancel(struct ldlm_namespace *ns,
+				      ldlm_cancel_for_recovery arg)
+{
+	LASSERT(ns != NULL);
+	ns->ns_cancel_for_recovery = arg; /* overwrites any earlier callback */
+}
+
+struct ldlm_lock;
+
+/** Type for blocking callback function of a lock. */
+typedef int (*ldlm_blocking_callback)(struct ldlm_lock *lock,
+				      struct ldlm_lock_desc *new, void *data,
+				      int flag);
+/** Type for completion callback function of a lock. */
+typedef int (*ldlm_completion_callback)(struct ldlm_lock *lock, __u64 flags,
+					void *data);
+/** Type for glimpse callback function of a lock. */
+typedef int (*ldlm_glimpse_callback)(struct ldlm_lock *lock, void *data);
+/** Type for weight callback function of a lock. */
+typedef unsigned long (*ldlm_weigh_callback)(struct ldlm_lock *lock);
+
+/** Work list for sending GL ASTs to multiple locks. */
+struct ldlm_glimpse_work {
+	struct ldlm_lock	*gl_lock; /* lock to glimpse */
+	struct list_head		 gl_list; /* linkage to other gl work structs */
+	__u32			 gl_flags;/* see LDLM_GL_WORK_* below */
+	union ldlm_gl_desc	*gl_desc; /* glimpse descriptor to be packed in
+					   * glimpse callback request */
+};
+
+/** The ldlm_glimpse_work is allocated on the stack and should not be freed. */
+#define LDLM_GL_WORK_NOFREE 0x1
+
+/** Interval node data for each LDLM_EXTENT lock. */
+struct ldlm_interval {
+	struct interval_node	li_node;  /* node for tree management */
+	struct list_head		li_group; /* the locks which have the same
+					   * policy - group of the policy */
+};
+#define to_ldlm_interval(n) container_of(n, struct ldlm_interval, li_node)
+
+/**
+ * Interval tree for extent locks.
+ * The interval tree must be accessed under the resource lock.
+ * Interval trees are used for granted extent locks to speed up conflicts
+ * lookup. See ldlm/interval_tree.c for more details.
+ */
+struct ldlm_interval_tree {
+	/** Tree size. */
+	int			lit_size;
+	ldlm_mode_t		lit_mode;  /* lock mode */
+	struct interval_node	*lit_root; /* actual ldlm_interval */
+};
+
+/** Whether to track references to exports by LDLM locks. */
+#define LUSTRE_TRACKS_LOCK_EXP_REFS (0)
+
+/** Cancel flags. */
+typedef enum {
+	LCF_ASYNC      = 0x1, /* Cancel locks asynchronously. */
+	LCF_LOCAL      = 0x2, /* Cancel locks locally, not notifying server */
+	LCF_BL_AST     = 0x4, /* Cancel locks marked as LDLM_FL_BL_AST
+			       * in the same RPC */
+} ldlm_cancel_flags_t;
+
+struct ldlm_flock {
+	__u64 start;
+	__u64 end;
+	__u64 owner;
+	__u64 blocking_owner;
+	struct obd_export *blocking_export;
+	/* Protected by the hash lock */
+	__u32 blocking_refs;
+	__u32 pid;
+};
+
+typedef union {
+	struct ldlm_extent l_extent;
+	struct ldlm_flock l_flock;
+	struct ldlm_inodebits l_inodebits;
+} ldlm_policy_data_t;
+
+void ldlm_convert_policy_to_wire(ldlm_type_t type,
+				 const ldlm_policy_data_t *lpolicy,
+				 ldlm_wire_policy_data_t *wpolicy);
+void ldlm_convert_policy_to_local(struct obd_export *exp, ldlm_type_t type,
+				  const ldlm_wire_policy_data_t *wpolicy,
+				  ldlm_policy_data_t *lpolicy);
+
+enum lvb_type {
+	LVB_T_NONE	= 0,
+	LVB_T_OST	= 1,
+	LVB_T_LQUOTA	= 2,
+	LVB_T_LAYOUT	= 3,
+};
+
+/**
+ * LDLM lock structure
+ *
+ * Represents a single LDLM lock and its state in memory. Each lock is
+ * associated with a single ldlm_resource, the object which is being
+ * locked. There may be multiple ldlm_locks on a single resource,
+ * depending on the lock type and whether the locks are conflicting or
+ * not.
+ */
+struct ldlm_lock {
+	/**
+	 * Local lock handle.
+	 * When remote side wants to tell us about a lock, they address
+	 * it by this opaque handle.  The handle does not hold a
+	 * reference on the ldlm_lock, so it can be safely passed to
+	 * other threads or nodes. When the lock needs to be accessed
+	 * from the handle, it is looked up again in the lock table, and
+	 * may no longer exist.
+	 *
+	 * Must be first in the structure.
+	 */
+	struct portals_handle	l_handle;
+	/**
+	 * Lock reference count.
+	 * This is how many users have pointers to actual structure, so that
+	 * we do not accidentally free lock structure that is in use.
+	 */
+	atomic_t		l_refc;
+	/**
+	 * Internal spinlock protects l_resource.  We should hold this lock
+	 * first before taking res_lock.
+	 */
+	spinlock_t		l_lock;
+	/**
+	 * Pointer to actual resource this lock is in.
+	 * ldlm_lock_change_resource() can change this.
+	 */
+	struct ldlm_resource	*l_resource;
+	/**
+	 * List item for client side LRU list.
+	 * Protected by ns_lock in struct ldlm_namespace.
+	 */
+	struct list_head		l_lru;
+	/**
+	 * Linkage to resource's lock queues according to current lock state.
+	 * (could be granted, waiting or converting)
+	 * Protected by lr_lock in struct ldlm_resource.
+	 */
+	struct list_head		l_res_link;
+	/**
+	 * Tree node for ldlm_extent.
+	 */
+	struct ldlm_interval	*l_tree_node;
+	/**
+	 * Per export hash of locks.
+	 * Protected by per-bucket exp->exp_lock_hash locks.
+	 */
+	struct hlist_node	l_exp_hash;
+	/**
+	 * Per export hash of flock locks.
+	 * Protected by per-bucket exp->exp_flock_hash locks.
+	 */
+	struct hlist_node	l_exp_flock_hash;
+	/**
+	 * Requested mode.
+	 * Protected by lr_lock.
+	 */
+	ldlm_mode_t		l_req_mode;
+	/**
+	 * Granted mode, also protected by lr_lock.
+	 */
+	ldlm_mode_t		l_granted_mode;
+	/** Lock completion handler pointer. Called when lock is granted. */
+	ldlm_completion_callback l_completion_ast;
+	/**
+	 * Lock blocking AST handler pointer.
+	 * It plays two roles:
+	 * - as a notification of an attempt to queue a conflicting lock (once)
+	 * - as a notification when the lock is being cancelled.
+	 *
+	 * As such it's typically called twice: once for the initial conflict
+	 * and then once more when the last user went away and the lock is
+	 * cancelled (could happen recursively).
+	 */
+	ldlm_blocking_callback	l_blocking_ast;
+	/**
+	 * Lock glimpse handler.
+	 * Glimpse handler is used to obtain LVB updates from a client by
+	 * server
+	 */
+	ldlm_glimpse_callback	l_glimpse_ast;
+
+	/** XXX apparently unused "weight" handler. To be removed? */
+	ldlm_weigh_callback	l_weigh_ast;
+
+	/**
+	 * Lock export.
+	 * This is a pointer to actual client export for locks that were granted
+	 * to clients. Used server-side.
+	 */
+	struct obd_export	*l_export;
+	/**
+	 * Lock connection export.
+	 * Pointer to server export on a client.
+	 */
+	struct obd_export	*l_conn_export;
+
+	/**
+	 * Remote lock handle.
+	 * If the lock is remote, this is the handle of the other side lock
+	 * (l_handle)
+	 */
+	struct lustre_handle	l_remote_handle;
+
+	/**
+	 * Representation of private data specific for a lock type.
+	 * Examples are: extent range for extent lock or bitmask for ibits locks
+	 */
+	ldlm_policy_data_t	l_policy_data;
+
+	/**
+	 * Lock state flags.
+	 * Like whenever we receive any blocking requests for this lock, etc.
+	 * Protected by lr_lock.
+	 */
+	__u64			l_flags;
+	/**
+	 * Lock r/w usage counters.
+	 * Protected by lr_lock.
+	 */
+	__u32			l_readers;
+	__u32			l_writers;
+	/**
+	 * If the lock is granted, a process sleeps on this waitq to learn when
+	 * it's no longer in use.  If the lock is not granted, a process sleeps
+	 * on this waitq to learn when it becomes granted.
+	 */
+	wait_queue_head_t		l_waitq;
+
+	/**
+	 * Seconds. It will be updated if there is any activity related to
+	 * the lock, e.g. enqueue the lock or send blocking AST.
+	 */
+	cfs_time_t		l_last_activity;
+
+	/**
+	 * Time last used by e.g. being matched by lock match.
+	 * Jiffies. Should be converted to time if needed.
+	 */
+	cfs_time_t		l_last_used;
+
+	/** Originally requested extent for the extent lock. */
+	struct ldlm_extent	l_req_extent;
+
+	unsigned int		l_failed:1,
+	/**
+	 * Set for locks that were removed from class hash table and will be
+	 * destroyed when last reference to them is released. Set by
+	 * ldlm_lock_destroy_internal().
+	 *
+	 * Protected by lock and resource locks.
+	 */
+				l_destroyed:1,
+	/*
+	 * it's set in lock_res_and_lock() and unset in unlock_res_and_lock().
+	 *
+	 * NB: compared with check_res_locked(), checking this bit is cheaper.
+	 * Also, spin_is_locked() is deprecated for kernel code; one reason is
+	 * because it works only for SMP so user needs to add extra macros like
+	 * LASSERT_SPIN_LOCKED for uniprocessor kernels.
+	 */
+				l_res_locked:1,
+	/*
+	 * It's set once we call ldlm_add_waiting_lock_res_locked()
+	 * to start the lock-timeout timer and it will never be reset.
+	 *
+	 * Protected by lock_res_and_lock().
+	 */
+				l_waited:1,
+	/** Flag whether this is a server namespace lock. */
+				l_ns_srv:1;
+
+	/*
+	 * Client-side-only members.
+	 */
+
+	enum lvb_type	      l_lvb_type;
+
+	/**
+	 * Temporary storage for a LVB received during an enqueue operation.
+	 */
+	__u32			l_lvb_len;
+	void			*l_lvb_data;
+
+	/** Private storage for lock user. Opaque to LDLM. */
+	void			*l_ast_data;
+
+	/*
+	 * Server-side-only members.
+	 */
+
+	/**
+	 * Connection cookie for the client originating the operation.
+	 * Used by Commit on Share (COS) code. Currently only used for
+	 * inodebits locks on MDS.
+	 */
+	__u64			l_client_cookie;
+
+	/**
+	 * List item for locks waiting for cancellation from clients.
+	 * The lists this could be linked into are:
+	 * waiting_locks_list (protected by waiting_locks_spinlock),
+	 * then if the lock timed out, it is moved to
+	 * expired_lock_thread.elt_expired_locks for further processing.
+	 * Protected by elt_lock.
+	 */
+	struct list_head		l_pending_chain;
+
+	/**
+	 * Set when lock is sent a blocking AST. Time in seconds when timeout
+	 * is reached and client holding this lock could be evicted.
+	 * This timeout could be further extended by e.g. certain IO activity
+	 * under this lock.
+	 * \see ost_rw_prolong_locks
+	 */
+	cfs_time_t		l_callback_timeout;
+
+	/** Local PID of process which created this lock. */
+	__u32			l_pid;
+
+	/**
+	 * Number of times blocking AST was sent for this lock.
+	 * This is for debugging. Valid values are 0 and 1, if there is an
+	 * attempt to send blocking AST more than once, an assertion would be
+	 * hit. \see ldlm_work_bl_ast_lock
+	 */
+	int			l_bl_ast_run;
+	/** List item ldlm_add_ast_work_item() for case of blocking ASTs. */
+	struct list_head		l_bl_ast;
+	/** List item ldlm_add_ast_work_item() for case of completion ASTs. */
+	struct list_head		l_cp_ast;
+	/** For ldlm_add_ast_work_item() for "revoke" AST used in COS. */
+	struct list_head		l_rk_ast;
+
+	/**
+	 * Pointer to a conflicting lock that caused blocking AST to be sent
+	 * for this lock
+	 */
+	struct ldlm_lock	*l_blocking_lock;
+
+	/**
+	 * Protected by lr_lock, linkages to "skip lists".
+	 * For more explanations of skip lists see ldlm/ldlm_inodebits.c
+	 */
+	struct list_head		l_sl_mode;
+	struct list_head		l_sl_policy;
+
+	/** Reference tracking structure to debug leaked locks. */
+	struct lu_ref		l_reference;
+#if LUSTRE_TRACKS_LOCK_EXP_REFS
+	/* Debugging stuff for bug 20498, for tracking export references. */
+	/** number of export references taken */
+	int			l_exp_refs_nr;
+	/** link all locks referencing one export */
+	struct list_head		l_exp_refs_link;
+	/** referenced export object */
+	struct obd_export	*l_exp_refs_target;
+#endif
+	/**
+	 * export blocking dlm lock list, protected by
+	 * l_export->exp_bl_list_lock.
+	 * Lock order of waiting_locks_spinlock, exp_bl_list_lock and res lock
+	 * is: res lock -> exp_bl_list_lock -> waiting_locks_spinlock.
+	 */
+	struct list_head		l_exp_list;
+};
+
+/**
+ * LDLM resource description.
+ * Basically, resource is a representation for a single object.
+ * Object has a name which is currently 4 64-bit integers. LDLM user is
+ * responsible for creation of a mapping between objects it wants to be
+ * protected and resource names.
+ *
+ * A resource can only hold locks of a single lock type, though there may be
+ * multiple ldlm_locks on a single resource, depending on the lock type and
+ * whether the locks are conflicting or not.
+ *
+ * lr_ns_bucket ties the resource to its namespace: ldlm_res_to_ns() follows
+ * it to nsb_namespace. lr_lock is taken and dropped through lock_res(),
+ * lock_res_nested() and unlock_res().
+ */
+struct ldlm_resource {
+	struct ldlm_ns_bucket	*lr_ns_bucket;
+
+	/**
+	 * List item for list in namespace hash.
+	 * protected by ns_lock
+	 */
+	struct hlist_node	lr_hash;
+
+	/** Spinlock to protect locks under this resource. */
+	spinlock_t		lr_lock;
+
+	/**
+	 * protected by lr_lock
+	 * @{ */
+	/** List of locks in granted state */
+	struct list_head		lr_granted;
+	/** List of locks waiting to change their granted mode (converted) */
+	struct list_head		lr_converting;
+	/**
+	 * List of locks that could not be granted due to conflicts and
+	 * that are waiting for conflicts to go away */
+	struct list_head		lr_waiting;
+	/** @} */
+
+	/* XXX No longer needed? Remove ASAP */
+	ldlm_mode_t		lr_most_restr;
+
+	/** Type of locks this resource can hold. Only one type per resource. */
+	ldlm_type_t		lr_type; /* LDLM_{PLAIN,EXTENT,FLOCK,IBITS} */
+
+	/** Resource name */
+	struct ldlm_res_id	lr_name;
+	/** Reference count for this resource */
+	atomic_t		lr_refcount;
+
+	/**
+	 * Interval trees (only for extent locks) for all modes of this resource
+	 */
+	struct ldlm_interval_tree lr_itree[LCK_MODE_NUM];
+
+	/**
+	 * Server-side-only lock value block elements.
+	 * To serialize lvbo_init.
+	 */
+	struct mutex		lr_lvb_mutex;
+	int			lr_lvb_len;
+	/** protected by lr_lock */
+	void			*lr_lvb_data;
+
+	/** When the resource was considered as contended. */
+	cfs_time_t		lr_contention_time;
+	/** List of references to this resource. For debugging. */
+	struct lu_ref		lr_reference;
+
+	struct inode		*lr_lvb_inode; /* NOTE(review): undocumented */
+};
+
+/** Tell whether \a lock is an inodebits lock that has the layout bit set. */
+static inline bool ldlm_has_layout(struct ldlm_lock *lock)
+{
+	if (lock->l_resource->lr_type != LDLM_IBITS)
+		return false;
+
+	return (lock->l_policy_data.l_inodebits.bits &
+		MDS_INODELOCK_LAYOUT) != 0;
+}
+
+/** Name of namespace \a ns, read from its resource hash (hs_name). */
+static inline char *ldlm_ns_name(struct ldlm_namespace *ns)
+{
+	char *name = ns->ns_rs_hash->hs_name;
+
+	return name;
+}
+
+/** Namespace that resource \a res belongs to, reached through its bucket. */
+static inline struct ldlm_namespace *ldlm_res_to_ns(struct ldlm_resource *res)
+{
+	struct ldlm_ns_bucket *bucket = res->lr_ns_bucket;
+
+	return bucket->nsb_namespace;
+}
+
+/** Namespace of the resource that \a lock is attached to. */
+static inline struct ldlm_namespace *ldlm_lock_to_ns(struct ldlm_lock *lock)
+{
+	struct ldlm_resource *res = lock->l_resource;
+
+	return ldlm_res_to_ns(res);
+}
+
+/** Name of the namespace that \a lock belongs to. */
+static inline char *ldlm_lock_to_ns_name(struct ldlm_lock *lock)
+{
+	struct ldlm_namespace *ns = ldlm_lock_to_ns(lock);
+
+	return ldlm_ns_name(ns);
+}
+
+/** Address of nsb_at_estimate in the namespace bucket of \a lock's resource. */
+static inline struct adaptive_timeout *
+ldlm_lock_to_ns_at(struct ldlm_lock *lock)
+{
+	struct ldlm_ns_bucket *bucket = lock->l_resource->lr_ns_bucket;
+
+	return &bucket->nsb_at_estimate;
+}
+
+/**
+ * Call the namespace's lvbo_init method for \a res.
+ * Returns the method's result, or 0 when the namespace has no LVB operations
+ * or no lvbo_init method.
+ */
+static inline int ldlm_lvbo_init(struct ldlm_resource *res)
+{
+	struct ldlm_namespace *ns = ldlm_res_to_ns(res);
+
+	if (ns->ns_lvbo == NULL || ns->ns_lvbo->lvbo_init == NULL)
+		return 0;
+
+	return ns->ns_lvbo->lvbo_init(res);
+}
+
+/**
+ * Call the namespace's lvbo_size method for \a lock.
+ * Returns the method's result, or 0 when the namespace has no LVB operations
+ * or no lvbo_size method.
+ */
+static inline int ldlm_lvbo_size(struct ldlm_lock *lock)
+{
+	struct ldlm_namespace *ns = ldlm_lock_to_ns(lock);
+
+	if (ns->ns_lvbo == NULL || ns->ns_lvbo->lvbo_size == NULL)
+		return 0;
+
+	return ns->ns_lvbo->lvbo_size(lock);
+}
+
+/**
+ * Call the namespace's lvbo_fill method for \a lock with buffer \a buf of
+ * \a len bytes. Returns 0 when the namespace has no LVB operations; when it
+ * has them, a missing lvbo_fill method trips an assertion.
+ */
+static inline int ldlm_lvbo_fill(struct ldlm_lock *lock, void *buf, int len)
+{
+	struct ldlm_namespace *ns = ldlm_lock_to_ns(lock);
+
+	if (ns->ns_lvbo == NULL)
+		return 0;
+
+	LASSERT(ns->ns_lvbo->lvbo_fill != NULL);
+	return ns->ns_lvbo->lvbo_fill(lock, buf, len);
+}
+
+/**
+ * AST work item.
+ * NOTE(review): field semantics are not documented in this header; presumably
+ * one item is queued per AST to send (see the ldlm_add_ast_work_item()
+ * references in struct ldlm_lock) - confirm against ldlm_lock.c.
+ */
+struct ldlm_ast_work {
+	struct ldlm_lock      *w_lock;
+	int		    w_blocking;
+	struct ldlm_lock_desc  w_desc;
+	struct list_head	     w_list;
+	int		    w_flags;
+	void		  *w_data;
+	int		    w_datalen;
+};
+
+/**
+ * Common ldlm_enqueue parameters.
+ * Passed to ldlm_cli_enqueue() as \a einfo. The callbacks are stored as
+ * untyped pointers.
+ */
+struct ldlm_enqueue_info {
+	__u32 ei_type;   /**< Type of the lock being enqueued. */
+	__u32 ei_mode;   /**< Mode of the lock being enqueued. */
+	void *ei_cb_bl;  /**< blocking lock callback */
+	void *ei_cb_cp;  /**< lock completion callback */
+	void *ei_cb_gl;  /**< lock glimpse callback */
+	void *ei_cb_wg;  /**< lock weigh callback */
+	void *ei_cbdata; /**< Data to be passed into callbacks. */
+};
+
+extern struct obd_ops ldlm_obd_ops;
+
+extern char *ldlm_lockname[];
+extern char *ldlm_typename[];
+extern char *ldlm_it2str(int it);
+
+/**
+ * CDEBUG wrapper with the mask preset to D_DLMTRACE, for ldlm-related
+ * messages that have no lock to print alongside. The message is prefixed
+ * with "### " and terminated with a newline.
+ */
+#define LDLM_DEBUG_NOLOCK(format, ...)				\
+	CDEBUG(D_DLMTRACE, "### " format "\n", ##__VA_ARGS__)
+
+/**
+ * Support function for lock information printing into debug logs.
+ * _ldlm_lock_debug() is called only when \a mask intersects D_CANTMASK, or
+ * when \a mask is enabled in libcfs_debug and DEBUG_SUBSYSTEM is enabled in
+ * libcfs_subsystem_debug.
+ * \see LDLM_DEBUG
+ */
+#define ldlm_lock_debug(msgdata, mask, cdls, lock, fmt, ...) do {	\
+	CFS_CHECK_STACK(msgdata, mask, cdls);				\
+									\
+	if (((mask) & D_CANTMASK) != 0 ||				\
+	    ((libcfs_debug & (mask)) != 0 &&				\
+	     (libcfs_subsystem_debug & DEBUG_SUBSYSTEM) != 0))		\
+		_ldlm_lock_debug(lock, msgdata, fmt, ##__VA_ARGS__);	\
+} while (0)
+
+void _ldlm_lock_debug(struct ldlm_lock *lock,
+		      struct libcfs_debug_msg_data *data,
+		      const char *fmt, ...)
+	__attribute__ ((format (printf, 3, 4)));
+
+/**
+ * Rate-limited version of lock printing function.
+ * Each expansion site owns a static cfs_debug_limit_state_t that is handed
+ * to ldlm_lock_debug(); the message is prefixed with "### ".
+ */
+#define LDLM_DEBUG_LIMIT(mask, lock, fmt, ...) do {			\
+	static cfs_debug_limit_state_t _ldlm_cdls;			\
+	LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, &_ldlm_cdls);		\
+	ldlm_lock_debug(&msgdata, mask, &_ldlm_cdls, lock,		\
+			"### " fmt, ##__VA_ARGS__);			\
+} while (0)
+
+/* Rate-limited lock messages at D_ERROR and D_WARNING level respectively. */
+#define LDLM_ERROR(lock, fmt, ...)				\
+	LDLM_DEBUG_LIMIT(D_ERROR, lock, fmt, ##__VA_ARGS__)
+#define LDLM_WARN(lock, fmt, ...)				\
+	LDLM_DEBUG_LIMIT(D_WARNING, lock, fmt, ##__VA_ARGS__)
+
+/**
+ * Non-rate-limited lock printing function for debugging purposes.
+ * With a non-NULL \a lock the message goes through ldlm_lock_debug() at
+ * D_DLMTRACE level; with a NULL \a lock it is printed via LDLM_DEBUG_NOLOCK()
+ * with a "no dlm lock: " prefix.
+ * \a lock is parenthesized in the NULL test so that expression arguments
+ * (e.g. a conditional expression) are compared as a whole. Note that \a lock
+ * is evaluated more than once.
+ */
+#define LDLM_DEBUG(lock, fmt, a...)   do {				  \
+	if (likely((lock) != NULL)) {					    \
+		LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_DLMTRACE, NULL);      \
+		ldlm_lock_debug(&msgdata, D_DLMTRACE, NULL, lock,	    \
+				"### " fmt , ##a);			    \
+	} else {							    \
+		LDLM_DEBUG_NOLOCK("no dlm lock: " fmt, ##a);		    \
+	}								    \
+} while (0)
+
+typedef int (*ldlm_processing_policy)(struct ldlm_lock *lock, __u64 *flags,
+				      int first_enq, ldlm_error_t *err,
+				      struct list_head *work_list);
+
+/**
+ * Return values for lock iterators.
+ * Also used during deciding of lock grants and cancellations.
+ */
+#define LDLM_ITER_CONTINUE 1 /* keep iterating */
+#define LDLM_ITER_STOP     2 /* stop iterating */
+
+typedef int (*ldlm_iterator_t)(struct ldlm_lock *, void *);
+typedef int (*ldlm_res_iterator_t)(struct ldlm_resource *, void *);
+
+/** \defgroup ldlm_iterator Lock iterators
+ *
+ * LDLM provides for a way to iterate through every lock on a resource or
+ * namespace or every resource in a namespace.
+ * @{ */
+int ldlm_resource_foreach(struct ldlm_resource *res, ldlm_iterator_t iter,
+			  void *closure);
+void ldlm_namespace_foreach(struct ldlm_namespace *ns, ldlm_iterator_t iter,
+			    void *closure);
+int ldlm_resource_iterate(struct ldlm_namespace *, const struct ldlm_res_id *,
+			  ldlm_iterator_t iter, void *data);
+/** @} ldlm_iterator */
+
+int ldlm_replay_locks(struct obd_import *imp);
+
+/* ldlm_flock.c */
+int ldlm_flock_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data);
+
+/* ldlm_extent.c */
+__u64 ldlm_extent_shift_kms(struct ldlm_lock *lock, __u64 old_kms);
+
+/**
+ * Bundle of the four lock callbacks (completion, blocking, glimpse, weigh).
+ * ldlm_handle_enqueue0() takes a pointer to one of these as \a cbs.
+ */
+struct ldlm_callback_suite {
+	ldlm_completion_callback lcs_completion;
+	ldlm_blocking_callback   lcs_blocking;
+	ldlm_glimpse_callback    lcs_glimpse;
+	ldlm_weigh_callback      lcs_weigh;
+};
+
+/* ldlm_lockd.c */
+int ldlm_del_waiting_lock(struct ldlm_lock *lock);
+int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, int timeout);
+int ldlm_get_ref(void);
+void ldlm_put_ref(void);
+int ldlm_init_export(struct obd_export *exp);
+void ldlm_destroy_export(struct obd_export *exp);
+struct ldlm_lock *ldlm_request_lock(struct ptlrpc_request *req);
+
+/* ldlm_lock.c */
+void ldlm_register_intent(struct ldlm_namespace *ns, ldlm_res_policy arg);
+void ldlm_lock2handle(const struct ldlm_lock *lock,
+		      struct lustre_handle *lockh);
+struct ldlm_lock *__ldlm_handle2lock(const struct lustre_handle *, __u64 flags);
+void ldlm_cancel_callback(struct ldlm_lock *);
+int ldlm_lock_remove_from_lru(struct ldlm_lock *);
+int ldlm_lock_set_data(struct lustre_handle *, void *);
+
+/**
+ * Obtain a lock reference by its handle.
+ * Thin wrapper around __ldlm_handle2lock() with no flags set.
+ */
+static inline struct ldlm_lock *ldlm_handle2lock(const struct lustre_handle *h)
+{
+	struct ldlm_lock *lock = __ldlm_handle2lock(h, 0);
+
+	return lock;
+}
+
+/*
+ * Drop the "handle" lu_ref entry recorded for the current task on \a lock.
+ * \a lock is parenthesized so that expression arguments bind correctly
+ * before the member access.
+ */
+#define LDLM_LOCK_REF_DEL(lock) \
+	lu_ref_del(&(lock)->l_reference, "handle", current)
+
+/**
+ * Look up a lock by handle \a h with \a flags and, when one is found, drop
+ * the "handle" lu_ref entry via LDLM_LOCK_REF_DEL() before returning it.
+ * Returns NULL when __ldlm_handle2lock() finds no lock.
+ */
+static inline struct ldlm_lock *
+ldlm_handle2lock_long(const struct lustre_handle *h, __u64 flags)
+{
+	struct ldlm_lock *found = __ldlm_handle2lock(h, flags);
+
+	if (found == NULL)
+		return NULL;
+
+	LDLM_LOCK_REF_DEL(found);
+	return found;
+}
+
+/**
+ * Invoke the namespace's lvbo_update method for resource \a res, passing
+ * request \a r and \a increase through.
+ * Returns the method's result, or 0 when the namespace has no LVB operations
+ * or no lvbo_update method.
+ */
+static inline int ldlm_res_lvbo_update(struct ldlm_resource *res,
+				       struct ptlrpc_request *r, int increase)
+{
+	struct ldlm_namespace *ns = ldlm_res_to_ns(res);
+
+	if (!ns->ns_lvbo || !ns->ns_lvbo->lvbo_update)
+		return 0;
+
+	return ns->ns_lvbo->lvbo_update(res, r, increase);
+}
+
+int ldlm_error2errno(ldlm_error_t error);
+ldlm_error_t ldlm_errno2error(int err_no); /* don't call it `errno': this
+					    * confuses user-space. */
+#if LUSTRE_TRACKS_LOCK_EXP_REFS
+void ldlm_dump_export_locks(struct obd_export *exp);
+#endif
+
+/**
+ * Release a temporary lock reference obtained by ldlm_handle2lock() or
+ * __ldlm_handle2lock(): drops the "handle" lu_ref entry, then the lock
+ * reference itself.
+ */
+#define LDLM_LOCK_PUT(lock)					\
+do {								\
+	LDLM_LOCK_REF_DEL(lock);				\
+	ldlm_lock_put(lock);					\
+} while (0)
+
+/**
+ * Release a lock reference obtained by some other means (see
+ * LDLM_LOCK_PUT()); no lu_ref entry is removed here.
+ */
+#define LDLM_LOCK_RELEASE(lock)					\
+do {								\
+	ldlm_lock_put(lock);					\
+} while (0)
+
+/**
+ * Take a reference on \a lock via ldlm_lock_get() and evaluate to \a lock.
+ * The argument is captured in a temporary so it is evaluated exactly once;
+ * passing an expression with side effects is therefore safe.
+ */
+#define LDLM_LOCK_GET(lock)					\
+({								\
+	__typeof__(lock) _ldlm_get_tmp = (lock);		\
+								\
+	ldlm_lock_get(_ldlm_get_tmp);				\
+	_ldlm_get_tmp;						\
+})
+
+/*
+ * Unlink up to \a count locks from list \a head (linked through \a member)
+ * and release one reference on each. Asserts afterwards that the counter is
+ * not positive, i.e. that the list did not run out before \a count locks
+ * were processed.
+ */
+#define ldlm_lock_list_put(head, member, count)			\
+({								\
+	struct ldlm_lock *_pos, *_tmp;				\
+	int _remaining = count;					\
+	list_for_each_entry_safe(_pos, _tmp, head, member) {	\
+		if (_remaining-- == 0)				\
+			break;					\
+		list_del_init(&_pos->member);			\
+		LDLM_LOCK_RELEASE(_pos);			\
+	}							\
+	LASSERT(_remaining <= 0);				\
+})
+
+struct ldlm_lock *ldlm_lock_get(struct ldlm_lock *lock);
+void ldlm_lock_put(struct ldlm_lock *lock);
+void ldlm_lock_destroy(struct ldlm_lock *lock);
+void ldlm_lock2desc(struct ldlm_lock *lock, struct ldlm_lock_desc *desc);
+void ldlm_lock_addref(struct lustre_handle *lockh, __u32 mode);
+int  ldlm_lock_addref_try(struct lustre_handle *lockh, __u32 mode);
+void ldlm_lock_decref(struct lustre_handle *lockh, __u32 mode);
+void ldlm_lock_decref_and_cancel(struct lustre_handle *lockh, __u32 mode);
+void ldlm_lock_fail_match_locked(struct ldlm_lock *lock);
+void ldlm_lock_fail_match(struct ldlm_lock *lock);
+void ldlm_lock_allow_match(struct ldlm_lock *lock);
+void ldlm_lock_allow_match_locked(struct ldlm_lock *lock);
+ldlm_mode_t ldlm_lock_match(struct ldlm_namespace *ns, __u64 flags,
+			    const struct ldlm_res_id *, ldlm_type_t type,
+			    ldlm_policy_data_t *, ldlm_mode_t mode,
+			    struct lustre_handle *, int unref);
+ldlm_mode_t ldlm_revalidate_lock_handle(struct lustre_handle *lockh,
+					__u64 *bits);
+struct ldlm_resource *ldlm_lock_convert(struct ldlm_lock *lock, int new_mode,
+					__u32 *flags);
+void ldlm_lock_downgrade(struct ldlm_lock *lock, int new_mode);
+void ldlm_lock_cancel(struct ldlm_lock *lock);
+void ldlm_reprocess_all(struct ldlm_resource *res);
+void ldlm_reprocess_all_ns(struct ldlm_namespace *ns);
+void ldlm_lock_dump_handle(int level, struct lustre_handle *);
+void ldlm_unlink_lock_skiplist(struct ldlm_lock *req);
+
+/* resource.c */
+struct ldlm_namespace *
+ldlm_namespace_new(struct obd_device *obd, char *name,
+		   ldlm_side_t client, ldlm_appetite_t apt,
+		   ldlm_ns_type_t ns_type);
+int ldlm_namespace_cleanup(struct ldlm_namespace *ns, __u64 flags);
+void ldlm_namespace_free(struct ldlm_namespace *ns,
+			 struct obd_import *imp, int force);
+void ldlm_namespace_register(struct ldlm_namespace *ns, ldlm_side_t client);
+void ldlm_namespace_unregister(struct ldlm_namespace *ns, ldlm_side_t client);
+void ldlm_namespace_move_locked(struct ldlm_namespace *ns, ldlm_side_t client);
+struct ldlm_namespace *ldlm_namespace_first_locked(ldlm_side_t client);
+void ldlm_namespace_get(struct ldlm_namespace *ns);
+void ldlm_namespace_put(struct ldlm_namespace *ns);
+int ldlm_proc_setup(void);
+#ifdef LPROCFS
+void ldlm_proc_cleanup(void);
+#else
+static inline void ldlm_proc_cleanup(void) {}
+#endif
+
+/* resource.c - internal */
+struct ldlm_resource *ldlm_resource_get(struct ldlm_namespace *ns,
+					struct ldlm_resource *parent,
+					const struct ldlm_res_id *,
+					ldlm_type_t type, int create);
+struct ldlm_resource *ldlm_resource_getref(struct ldlm_resource *res);
+int ldlm_resource_putref(struct ldlm_resource *res);
+void ldlm_resource_add_lock(struct ldlm_resource *res,
+			    struct list_head *head,
+			    struct ldlm_lock *lock);
+void ldlm_resource_unlink_lock(struct ldlm_lock *lock);
+void ldlm_res2desc(struct ldlm_resource *res, struct ldlm_resource_desc *desc);
+void ldlm_dump_all_namespaces(ldlm_side_t client, int level);
+void ldlm_namespace_dump(int level, struct ldlm_namespace *);
+void ldlm_resource_dump(int level, struct ldlm_resource *);
+int ldlm_lock_change_resource(struct ldlm_namespace *, struct ldlm_lock *,
+			      const struct ldlm_res_id *);
+
+/*
+ * Record a debugging reference on \a res for the current task, tagged with
+ * the calling function's name (standard __func__ rather than the gcc-only
+ * __FUNCTION__ spelling).
+ */
+#define LDLM_RESOURCE_ADDREF(res) do {				  \
+	lu_ref_add_atomic(&(res)->lr_reference, __func__, current);  \
+} while (0)
+
+/*
+ * Remove the debugging reference on \a res that was tagged with the calling
+ * function's name for the current task (standard __func__ rather than the
+ * gcc-only __FUNCTION__ spelling).
+ */
+#define LDLM_RESOURCE_DELREF(res) do {				  \
+	lu_ref_del(&(res)->lr_reference, __func__, current);  \
+} while (0)
+
+/* ldlm_request.c */
+int ldlm_expired_completion_wait(void *data);
+/** \defgroup ldlm_local_ast Default AST handlers for local locks
+ * These AST handlers are typically used for server-side local locks and are
+ * also used by client-side lock handlers to perform minimum level base
+ * processing.
+ * @{ */
+int ldlm_blocking_ast_nocheck(struct ldlm_lock *lock);
+int ldlm_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
+		      void *data, int flag);
+int ldlm_glimpse_ast(struct ldlm_lock *lock, void *reqp);
+int ldlm_completion_ast_async(struct ldlm_lock *lock, __u64 flags, void *data);
+int ldlm_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data);
+/** @} ldlm_local_ast */
+
+/** \defgroup ldlm_cli_api API to operate on locks from actual LDLM users.
+ * These are typically used by client and server (*_local versions)
+ * to obtain and release locks.
+ * @{ */
+int ldlm_cli_enqueue(struct obd_export *exp, struct ptlrpc_request **reqp,
+		     struct ldlm_enqueue_info *einfo,
+		     const struct ldlm_res_id *res_id,
+		     ldlm_policy_data_t const *policy, __u64 *flags,
+		     void *lvb, __u32 lvb_len, enum lvb_type lvb_type,
+		     struct lustre_handle *lockh, int async);
+int ldlm_prep_enqueue_req(struct obd_export *exp,
+			  struct ptlrpc_request *req,
+			  struct list_head *cancels,
+			  int count);
+int ldlm_prep_elc_req(struct obd_export *exp,
+		      struct ptlrpc_request *req,
+		      int version, int opc, int canceloff,
+		      struct list_head *cancels, int count);
+
+struct ptlrpc_request *ldlm_enqueue_pack(struct obd_export *exp, int lvb_len);
+int ldlm_handle_enqueue0(struct ldlm_namespace *ns, struct ptlrpc_request *req,
+			 const struct ldlm_request *dlm_req,
+			 const struct ldlm_callback_suite *cbs);
+int ldlm_cli_enqueue_fini(struct obd_export *exp, struct ptlrpc_request *req,
+			  ldlm_type_t type, __u8 with_policy, ldlm_mode_t mode,
+			  __u64 *flags, void *lvb, __u32 lvb_len,
+			  struct lustre_handle *lockh, int rc);
+int ldlm_cli_enqueue_local(struct ldlm_namespace *ns,
+			   const struct ldlm_res_id *res_id,
+			   ldlm_type_t type, ldlm_policy_data_t *policy,
+			   ldlm_mode_t mode, __u64 *flags,
+			   ldlm_blocking_callback blocking,
+			   ldlm_completion_callback completion,
+			   ldlm_glimpse_callback glimpse,
+			   void *data, __u32 lvb_len, enum lvb_type lvb_type,
+			   const __u64 *client_cookie,
+			   struct lustre_handle *lockh);
+int ldlm_server_ast(struct lustre_handle *lockh, struct ldlm_lock_desc *new,
+		    void *data, __u32 data_len);
+int ldlm_cli_convert(struct lustre_handle *, int new_mode, __u32 *flags);
+int ldlm_cli_update_pool(struct ptlrpc_request *req);
+int ldlm_cli_cancel(struct lustre_handle *lockh,
+		    ldlm_cancel_flags_t cancel_flags);
+int ldlm_cli_cancel_unused(struct ldlm_namespace *, const struct ldlm_res_id *,
+			   ldlm_cancel_flags_t flags, void *opaque);
+int ldlm_cli_cancel_unused_resource(struct ldlm_namespace *ns,
+				    const struct ldlm_res_id *res_id,
+				    ldlm_policy_data_t *policy,
+				    ldlm_mode_t mode,
+				    ldlm_cancel_flags_t flags,
+				    void *opaque);
+int ldlm_cli_cancel_req(struct obd_export *exp, struct list_head *head,
+			int count, ldlm_cancel_flags_t flags);
+int ldlm_cancel_resource_local(struct ldlm_resource *res,
+			       struct list_head *cancels,
+			       ldlm_policy_data_t *policy,
+			       ldlm_mode_t mode, int lock_flags,
+			       ldlm_cancel_flags_t cancel_flags, void *opaque);
+int ldlm_cli_cancel_list_local(struct list_head *cancels, int count,
+			       ldlm_cancel_flags_t flags);
+int ldlm_cli_cancel_list(struct list_head *head, int count,
+			 struct ptlrpc_request *req, ldlm_cancel_flags_t flags);
+/** @} ldlm_cli_api */
+
+/* mds/handler.c */
+/* This has to be here because recursive inclusion sucks. */
+int intent_disposition(struct ldlm_reply *rep, int flag);
+void intent_set_disposition(struct ldlm_reply *rep, int flag);
+
+
+/*
+ * ioctls for trying requests.
+ * All commands share the IOC_LDLM_TYPE magic; command numbers run from
+ * IOC_LDLM_MIN_NR to IOC_LDLM_MAX_NR.
+ */
+#define IOC_LDLM_TYPE		   'f'
+#define IOC_LDLM_MIN_NR		 40
+
+#define IOC_LDLM_TEST		   _IOWR(IOC_LDLM_TYPE, 40, long)
+#define IOC_LDLM_DUMP		   _IOWR(IOC_LDLM_TYPE, 41, long)
+#define IOC_LDLM_REGRESS_START	  _IOWR(IOC_LDLM_TYPE, 42, long)
+#define IOC_LDLM_REGRESS_STOP	   _IOWR(IOC_LDLM_TYPE, 43, long)
+#define IOC_LDLM_MAX_NR		 43
+
+/**
+ * "Modes" of acquiring lock_res, necessary to tell lockdep that taking more
+ * than one lock_res is dead-lock safe.
+ * lock_res_nested() passes the value to spin_lock_nested() as its second
+ * argument.
+ */
+enum lock_res_type {
+	LRT_NORMAL,
+	LRT_NEW
+};
+
+/** Take the spinlock (lr_lock) of resource \a res. */
+static inline void lock_res(struct ldlm_resource *res)
+{
+	spinlock_t *lr_lock = &res->lr_lock;
+
+	spin_lock(lr_lock);
+}
+
+/**
+ * Take the spinlock of resource \a res through spin_lock_nested(), passing
+ * \a mode so that lockdep can be told the nesting is safe.
+ */
+static inline void lock_res_nested(struct ldlm_resource *res,
+				   enum lock_res_type mode)
+{
+	spinlock_t *lr_lock = &res->lr_lock;
+
+	spin_lock_nested(lr_lock, mode);
+}
+
+/** Drop the spinlock (lr_lock) of resource \a res. */
+static inline void unlock_res(struct ldlm_resource *res)
+{
+	spinlock_t *lr_lock = &res->lr_lock;
+
+	spin_unlock(lr_lock);
+}
+
+/**
+ * Check if resource is already locked, assert if not.
+ *
+ * assert_spin_locked() is used instead of LASSERT(spin_is_locked()):
+ * spin_is_locked() only gives a meaningful answer on SMP (see the
+ * l_res_locked note in struct ldlm_lock), so on a uniprocessor build without
+ * spinlock debugging the old assertion would trip even with the lock held.
+ * assert_spin_locked() performs no check on such builds.
+ */
+static inline void check_res_locked(struct ldlm_resource *res)
+{
+	assert_spin_locked(&res->lr_lock);
+}
+
+struct ldlm_resource * lock_res_and_lock(struct ldlm_lock *lock);
+void unlock_res_and_lock(struct ldlm_lock *lock);
+
+/* ldlm_pool.c */
+/** \defgroup ldlm_pools Various LDLM pool related functions
+ * These are not used outside of ldlm.
+ * @{
+ */
+void ldlm_pools_recalc(ldlm_side_t client);
+int ldlm_pools_init(void);
+void ldlm_pools_fini(void);
+
+int ldlm_pool_init(struct ldlm_pool *pl, struct ldlm_namespace *ns,
+		   int idx, ldlm_side_t client);
+int ldlm_pool_shrink(struct ldlm_pool *pl, int nr,
+		     unsigned int gfp_mask);
+void ldlm_pool_fini(struct ldlm_pool *pl);
+int ldlm_pool_setup(struct ldlm_pool *pl, int limit);
+int ldlm_pool_recalc(struct ldlm_pool *pl);
+__u32 ldlm_pool_get_lvf(struct ldlm_pool *pl);
+__u64 ldlm_pool_get_slv(struct ldlm_pool *pl);
+__u64 ldlm_pool_get_clv(struct ldlm_pool *pl);
+__u32 ldlm_pool_get_limit(struct ldlm_pool *pl);
+void ldlm_pool_set_slv(struct ldlm_pool *pl, __u64 slv);
+void ldlm_pool_set_clv(struct ldlm_pool *pl, __u64 clv);
+void ldlm_pool_set_limit(struct ldlm_pool *pl, __u32 limit);
+void ldlm_pool_add(struct ldlm_pool *pl, struct ldlm_lock *lock);
+void ldlm_pool_del(struct ldlm_pool *pl, struct ldlm_lock *lock);
+/** @} */
+
+#endif
+/** @} LDLM */

diff --git a/drivers/staging/lustre/lustre/include/lustre_eacl.h b/drivers/staging/lustre/lustre/include/lustre_eacl.h
new file mode 100644
index 0000000..b94f76a
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_eacl.h

@@ -0,0 +1,95 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lustre/include/lustre_eacl.h
+ *
+ * Extended ACL (eacl) xattr structures and conversion helpers.
+ * See also lustre_idl.h for wire formats of requests.
+ */
+
+#ifndef _LUSTRE_EACL_H
+#define _LUSTRE_EACL_H
+
+/** \defgroup eacl eacl
+ *
+ * @{
+ */
+
+#ifdef CONFIG_FS_POSIX_ACL
+
+#include <linux/posix_acl_xattr.h>
+
+typedef struct {
+	__u16		   e_tag;
+	__u16		   e_perm;
+	__u32		   e_id;
+	__u32		   e_stat;
+} ext_acl_xattr_entry; /* element type of ext_acl_xattr_header.a_entries */
+
+typedef struct {
+	__u32		   a_count; /* presumably number of a_entries; confirm */
+	ext_acl_xattr_entry     a_entries[0]; /* zero-length trailing array */
+} ext_acl_xattr_header;
+
+/* Bytes taken by one <prefix>_header followed by \a count <prefix>_entry. */
+#define CFS_ACL_XATTR_SIZE(count, prefix) \
+	((count) * sizeof(prefix ## _entry) + sizeof(prefix ## _header))
+
+/* Number of whole <prefix>_entry in \a size bytes once the header is removed. */
+#define CFS_ACL_XATTR_COUNT(size, prefix) \
+	(((size) - sizeof(prefix ## _header)) / sizeof(prefix ## _entry))
+
+
+extern ext_acl_xattr_header *
+lustre_posix_acl_xattr_2ext(posix_acl_xattr_header *header, int size);
+extern int
+lustre_posix_acl_xattr_filter(posix_acl_xattr_header *header, int size,
+			      posix_acl_xattr_header **out);
+extern void
+lustre_posix_acl_xattr_free(posix_acl_xattr_header *header, int size);
+extern void
+lustre_ext_acl_xattr_free(ext_acl_xattr_header *header);
+extern int
+lustre_acl_xattr_merge2posix(posix_acl_xattr_header *posix_header, int size,
+			     ext_acl_xattr_header *ext_header,
+			     posix_acl_xattr_header **out);
+extern ext_acl_xattr_header *
+lustre_acl_xattr_merge2ext(posix_acl_xattr_header *posix_header, int size,
+			   ext_acl_xattr_header *ext_header);
+
+#endif /* CONFIG_FS_POSIX_ACL */
+
+/** @} eacl */
+
+#endif

diff --git a/drivers/staging/lustre/lustre/include/lustre_export.h b/drivers/staging/lustre/lustre/include/lustre_export.h
new file mode 100644
index 0000000..d61c020
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_export.h

@@ -0,0 +1,389 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+/** \defgroup obd_export PortalRPC export definitions
+ *
+ * @{
+ */
+
+#ifndef __EXPORT_H
+#define __EXPORT_H
+
+/** \defgroup export export
+ *
+ * @{
+ */
+
+#include <lprocfs_status.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_dlm.h>
+
+struct mds_client_data;
+struct mdt_client_data;
+struct mds_idmap_table;
+struct mdt_idmap_table;
+
+/**
+ * Target-specific export data
+ */
+struct tg_export_data {
+	/** Protects ted_lcd below */
+	struct mutex		ted_lcd_lock;
+	/** Per-client data for each export */
+	struct lsd_client_data	*ted_lcd;
+	/** Offset of record in last_rcvd file */
+	loff_t			ted_lr_off;
+	/** Client index in last_rcvd file */
+	int			ted_lr_idx;
+};
+
+/**
+ * MDT-specific export data
+ */
+struct mdt_export_data {
+	struct tg_export_data	med_ted;
+	/** List of all files opened by client on this MDT */
+	struct list_head		med_open_head;
+	spinlock_t		med_open_lock; /* med_open_head, mfd_list */
+	/** Bitmask of all ibit locks this MDT understands */
+	__u64			med_ibits_known;
+	struct mutex		med_idmap_mutex;
+	struct lustre_idmap_table *med_idmap;
+};
+
+struct ec_export_data { /* echo client */
+	struct list_head eced_locks;
+};
+
+/* In-memory access to client data from OST struct */
+/** Filter (oss-side) specific import data */
+struct filter_export_data {
+	struct tg_export_data	fed_ted;
+	spinlock_t		fed_lock;	/**< protects fed_mod_list */
+	long		       fed_dirty;    /* in bytes */
+	long		       fed_grant;    /* in bytes */
+	struct list_head		 fed_mod_list; /* files being modified */
+	int			fed_mod_count;/* items in fed_mod_list */
+	long		       fed_pending;  /* bytes just being written */
+	__u32		      fed_group;
+	__u8		       fed_pagesize; /* log2 of client page size */
+};
+
+struct mgs_export_data {
+	struct list_head		med_clients;	/* mgc fs client via this exp */
+	spinlock_t		med_lock;	/* protect med_clients */
+};
+
+/**
+ * per-NID statistics structure.
+ * It tracks access patterns to this export on a per-client-NID basis
+ */
+struct nid_stat {
+	lnet_nid_t	       nid;
+	struct hlist_node	 nid_hash;
+	struct list_head	       nid_list;
+	struct obd_device       *nid_obd;
+	struct proc_dir_entry   *nid_proc;
+	struct lprocfs_stats    *nid_stats;
+	struct lprocfs_stats    *nid_ldlm_stats;
+	atomic_t	     nid_exp_ref_count; /* for obd_nid_stats_hash
+							   exp_nid_stats */
+};
+
+/* Take a reference on nidstat by incrementing nid_exp_ref_count. */
+#define nidstat_getref(nidstat)					\
+	do {							\
+		atomic_inc(&(nidstat)->nid_exp_ref_count);	\
+	} while (0)
+
+/*
+ * Drop a reference on nidstat by decrementing nid_exp_ref_count, then
+ * assert that the counter read back afterwards is not negative.
+ */
+#define nidstat_putref(nidstat)						\
+	do {								\
+		atomic_dec(&(nidstat)->nid_exp_ref_count);		\
+		LASSERTF(atomic_read(&(nidstat)->nid_exp_ref_count) >= 0, \
+			 "stat %p nid_exp_ref_count < 0\n", nidstat);	\
+	} while (0)
+
+enum obd_option {
+	OBD_OPT_FORCE =	 0x0001,
+	OBD_OPT_FAILOVER =      0x0002,
+	OBD_OPT_ABORT_RECOV =   0x0004,
+};
+
+/**
+ * Export structure. Represents target-side of connection in portals.
+ * Also used in Lustre to connect between layers on the same node when
+ * there is no network-connection in-between.
+ * For every connected client there is an export structure on the server
+ * attached to the same obd device.
+ */
+struct obd_export {
+	/**
+	 * Export handle, its id is provided to client on connect
+	 * Subsequent client RPCs contain this handle id to identify
+	 * what export they are talking to.
+	 */
+	struct portals_handle     exp_handle;
+	atomic_t	      exp_refcount;
+	/**
+	 * Set of counters below is to track where export references are
+	 * kept. The exp_rpc_count is used for reconnect handling also,
+	 * the cb_count and locks_count are for debug purposes only for now.
+	 * The sum of them should be less than exp_refcount by 3
+	 */
+	atomic_t	      exp_rpc_count; /* RPC references */
+	atomic_t	      exp_cb_count; /* Commit callback references */
+	/** Number of queued replay requests to be processed */
+	atomic_t		  exp_replay_count;
+	atomic_t	      exp_locks_count; /** Lock references */
+#if LUSTRE_TRACKS_LOCK_EXP_REFS
+	struct list_head		exp_locks_list;
+	spinlock_t		  exp_locks_list_guard;
+#endif
+	/** UUID of client connected to this export */
+	struct obd_uuid	   exp_client_uuid;
+	/** To link all exports on an obd device */
+	struct list_head		exp_obd_chain;
+	struct hlist_node	  exp_uuid_hash; /** uuid-export hash*/
+	struct hlist_node	  exp_nid_hash; /** nid-export hash */
+	/**
+	 * All exports eligible for ping evictor are linked into a list
+	 * through this field in "most time since last request on this export"
+	 * order
+	 * protected by obd_dev_lock
+	 */
+	struct list_head		exp_obd_chain_timed;
+	/** Obd device of this export */
+	struct obd_device	*exp_obd;
+	/**
+	 * "reverse" import to send requests (e.g. from ldlm) back to client
+	 * exp_lock protects its change
+	 */
+	struct obd_import	*exp_imp_reverse;
+	struct nid_stat	  *exp_nid_stats;
+	struct lprocfs_stats     *exp_md_stats;
+	/** Active connection */
+	struct ptlrpc_connection *exp_connection;
+	/** Connection count value from last successful reconnect rpc */
+	__u32		     exp_conn_cnt;
+	/** Hash list of all ldlm locks granted on this export */
+	cfs_hash_t	       *exp_lock_hash;
+	/**
+	 * Hash list for Posix lock deadlock detection, added with
+	 * ldlm_lock::l_exp_flock_hash.
+	 */
+	cfs_hash_t	       *exp_flock_hash;
+	struct list_head		exp_outstanding_replies;
+	struct list_head		exp_uncommitted_replies;
+	spinlock_t		  exp_uncommitted_replies_lock;
+	/** Last committed transno for this export */
+	__u64		     exp_last_committed;
+	/** When was last request received */
+	cfs_time_t		exp_last_request_time;
+	/** On replay all requests waiting for replay are linked here */
+	struct list_head		exp_req_replay_queue;
+	/**
+	 * protects exp_flags, exp_outstanding_replies and the change
+	 * of exp_imp_reverse
+	 */
+	spinlock_t		  exp_lock;
+	/** Compatibility flags for this export are embedded into
+	 *  exp_connect_data */
+	struct obd_connect_data   exp_connect_data;
+	enum obd_option	   exp_flags;
+	unsigned long	     exp_failed:1,
+				  exp_in_recovery:1,
+				  exp_disconnected:1,
+				  exp_connecting:1,
+				  /** VBR: export missed recovery */
+				  exp_delayed:1,
+				  /** VBR: failed version checking */
+				  exp_vbr_failed:1,
+				  exp_req_replay_needed:1,
+				  exp_lock_replay_needed:1,
+				  exp_need_sync:1,
+				  exp_flvr_changed:1,
+				  exp_flvr_adapt:1,
+				  exp_libclient:1, /* liblustre client? */
+				  /* client timed out and tried to reconnect,
+				   * but couldn't because of active rpcs */
+				  exp_abort_active_req:1,
+				  /* if to swap nidtbl entries for 2.2 clients.
+				   * Only used by the MGS to fix LU-1644. */
+				  exp_need_mne_swab:1;
+	/* also protected by exp_lock */
+	enum lustre_sec_part      exp_sp_peer;
+	struct sptlrpc_flavor     exp_flvr;	     /* current */
+	struct sptlrpc_flavor     exp_flvr_old[2];      /* about-to-expire */
+	cfs_time_t		exp_flvr_expire[2];   /* seconds */
+
+	/** protects exp_hp_rpcs */
+	spinlock_t		  exp_rpc_lock;
+	struct list_head		  exp_hp_rpcs;	/* (potential) HP RPCs */
+
+	/** blocking dlm lock list, protected by exp_bl_list_lock */
+	struct list_head		exp_bl_list;
+	spinlock_t		  exp_bl_list_lock;
+
+	/** Target specific data */
+	union {
+		struct tg_export_data     eu_target_data;
+		struct mdt_export_data    eu_mdt_data;
+		struct filter_export_data eu_filter_data;
+		struct ec_export_data     eu_ec_data;
+		struct mgs_export_data    eu_mgs_data;
+	} u;
+};
+
+#define exp_target_data u.eu_target_data
+#define exp_mdt_data    u.eu_mdt_data
+#define exp_filter_data u.eu_filter_data
+#define exp_ec_data     u.eu_ec_data
+
+/* Return the address of ocd_connect_flags inside exp's exp_connect_data. */
+static inline __u64 *exp_connect_flags_ptr(struct obd_export *exp)
+{
+	struct obd_connect_data *ocd = &exp->exp_connect_data;
+
+	return &ocd->ocd_connect_flags;
+}
+
+/* Return the current value of exp's connect flags. */
+static inline __u64 exp_connect_flags(struct obd_export *exp)
+{
+	__u64 *flags = exp_connect_flags_ptr(exp);
+
+	return *flags;
+}
+
+/*
+ * Return ocd_brw_size from exp's connect data when OBD_CONNECT_BRW_SIZE is
+ * set in the connect flags, and ONE_MB_BRW_SIZE otherwise.
+ */
+static inline int exp_max_brw_size(struct obd_export *exp)
+{
+	LASSERT(exp != NULL);
+	if (!(exp_connect_flags(exp) & OBD_CONNECT_BRW_SIZE))
+		return ONE_MB_BRW_SIZE;
+
+	return exp->exp_connect_data.ocd_brw_size;
+}
+
+/* Non-zero when exp_max_brw_size(exp) is larger than ONE_MB_BRW_SIZE. */
+static inline int exp_connect_multibulk(struct obd_export *exp)
+{
+	int max_brw = exp_max_brw_size(exp);
+
+	return max_brw > ONE_MB_BRW_SIZE;
+}
+
+/*
+ * Report whether exp_last_request_time + age is before the value returned
+ * by cfs_time_current_sec().  Asserts that exp_delayed is set.
+ */
+static inline int exp_expired(struct obd_export *exp, cfs_duration_t age)
+{
+	int expired;
+
+	LASSERT(exp->exp_delayed);
+	expired = cfs_time_before(cfs_time_add(exp->exp_last_request_time, age),
+				  cfs_time_current_sec());
+	return expired;
+}
+
+/* 1 if OBD_CONNECT_CANCELSET is set in exp's connect flags, 0 otherwise. */
+static inline int exp_connect_cancelset(struct obd_export *exp)
+{
+	LASSERT(exp != NULL);
+	return (exp_connect_flags(exp) & OBD_CONNECT_CANCELSET) != 0;
+}
+
+/* 1 if OBD_CONNECT_LRU_RESIZE is set in exp's connect flags, 0 otherwise. */
+static inline int exp_connect_lru_resize(struct obd_export *exp)
+{
+	LASSERT(exp != NULL);
+	return (exp_connect_flags(exp) & OBD_CONNECT_LRU_RESIZE) != 0;
+}
+
+/* 1 if OBD_CONNECT_RMT_CLIENT is set in exp's connect flags, 0 otherwise. */
+static inline int exp_connect_rmtclient(struct obd_export *exp)
+{
+	LASSERT(exp != NULL);
+	return (exp_connect_flags(exp) & OBD_CONNECT_RMT_CLIENT) != 0;
+}
+
+/*
+ * 1 if OBD_CONNECT_RMT_CLIENT is set in the connect flags of the import
+ * returned by class_exp2cliimp(exp), 0 otherwise.
+ */
+static inline int client_is_remote(struct obd_export *exp)
+{
+	struct obd_import *imp = class_exp2cliimp(exp);
+	__u64 flags = imp->imp_connect_data.ocd_connect_flags;
+
+	return (flags & OBD_CONNECT_RMT_CLIENT) != 0;
+}
+
+/*
+ * 1 if OBD_CONNECT_VBR is set in exp's connect flags, 0 otherwise.
+ * Asserts that exp is non-NULL and that exp_connection is set.
+ */
+static inline int exp_connect_vbr(struct obd_export *exp)
+{
+	LASSERT(exp != NULL);
+	LASSERT(exp->exp_connection);
+	return (exp_connect_flags(exp) & OBD_CONNECT_VBR) != 0;
+}
+
+/* 1 if OBD_CONNECT_SOM is set in exp's connect flags, 0 otherwise. */
+static inline int exp_connect_som(struct obd_export *exp)
+{
+	LASSERT(exp != NULL);
+	return (exp_connect_flags(exp) & OBD_CONNECT_SOM) != 0;
+}
+
+/* 1 if OBD_CONNECT_UMASK is set in exp's connect flags, 0 otherwise. */
+static inline int exp_connect_umask(struct obd_export *exp)
+{
+	return (exp_connect_flags(exp) & OBD_CONNECT_UMASK) != 0;
+}
+
+/* 1 if OBD_CONNECT_LRU_RESIZE is set in imp's connect flags, 0 otherwise. */
+static inline int imp_connect_lru_resize(struct obd_import *imp)
+{
+	LASSERT(imp != NULL);
+	return (imp->imp_connect_data.ocd_connect_flags &
+		OBD_CONNECT_LRU_RESIZE) != 0;
+}
+
+/* 1 if OBD_CONNECT_LAYOUTLOCK is set in exp's connect flags, 0 otherwise. */
+static inline int exp_connect_layout(struct obd_export *exp)
+{
+	return (exp_connect_flags(exp) & OBD_CONNECT_LAYOUTLOCK) != 0;
+}
+
+/* true if OBD_CONNECT_LVB_TYPE is set in exp's connect flags. */
+static inline bool exp_connect_lvb_type(struct obd_export *exp)
+{
+	LASSERT(exp != NULL);
+	return (exp_connect_flags(exp) & OBD_CONNECT_LVB_TYPE) != 0;
+}
+
+/* true if OBD_CONNECT_LVB_TYPE is set in imp's connect flags. */
+static inline bool imp_connect_lvb_type(struct obd_import *imp)
+{
+	LASSERT(imp != NULL);
+	return (imp->imp_connect_data.ocd_connect_flags &
+		OBD_CONNECT_LVB_TYPE) != 0;
+}
+
+extern struct obd_export *class_conn2export(struct lustre_handle *conn);
+extern struct obd_device *class_conn2obd(struct lustre_handle *conn);
+
+/** @} export */
+
+#endif /* __EXPORT_H */
+/** @} obd_export */

diff --git a/drivers/staging/lustre/lustre/include/lustre_fid.h b/drivers/staging/lustre/lustre/include/lustre_fid.h
new file mode 100644
index 0000000..7d20cba
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_fid.h

@@ -0,0 +1,762 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre_fid.h
+ *
+ * Author: Yury Umanets <umka@clusterfs.com>
+ */
+
+#ifndef __LINUX_FID_H
+#define __LINUX_FID_H
+
+/** \defgroup fid fid
+ *
+ * @{
+ *
+ * http://wiki.lustre.org/index.php/Architecture_-_Interoperability_fids_zfs
+ * describes the FID namespace and interoperability requirements for FIDs.
+ * The important parts of that document are included here for reference.
+ *
+ * FID
+ *   File IDentifier generated by client from range allocated by the SEQuence
+ *   service and stored in struct lu_fid. The FID is composed of three parts:
+ *   SEQuence, ObjectID, and VERsion.  The SEQ component is a filesystem
+ *   unique 64-bit integer, and only one client is ever assigned any SEQ value.
+ *   The first 0x400 FID_SEQ_NORMAL [2^33, 2^33 + 0x400] values are reserved
+ *   for system use.  The OID component is a 32-bit value generated by the
+ *   client on a per-SEQ basis to allow creating many unique FIDs without
+ *   communication with the server.  The VER component is a 32-bit value that
+ *   distinguishes between different FID instantiations, such as snapshots or
+ *   separate subtrees within the filesystem.  FIDs with the same VER field
+ *   are considered part of the same namespace.
+ *
+ * OLD filesystems are those upgraded from Lustre 1.x that predate FIDs, and
+ *   MDTs use 32-bit ldiskfs internal inode/generation numbers (IGIFs), while
+ *   OSTs use 64-bit Lustre object IDs and generation numbers.
+ *
+ * NEW filesystems are those formatted since the introduction of FIDs.
+ *
+ * IGIF
+ *   Inode and Generation In FID, a surrogate FID used to globally identify
+ *   an existing object on OLD formatted MDT file system. This would only be
+ *   used on MDT0 in a DNE filesystem, because there cannot be more than one
+ *   MDT in an OLD formatted filesystem. Belongs to sequence in [12, 2^32 - 1]
+ *   range, where inode number is stored in SEQ, and inode generation is in OID.
+ *   NOTE: This assumes no more than 2^32-1 inodes exist in the MDT filesystem,
+ *   which is the maximum possible for an ldiskfs backend.  It also assumes
+ *   that the reserved ext3/ext4/ldiskfs inode numbers [0-11] are never visible
+ *   to clients, which has always been true.
+ *
+ * IDIF
+ *   object ID In FID, a surrogate FID used to globally identify an existing
+ *   OST object on OLD formatted OST file system. Belongs to a sequence in
+ *   [2^32, 2^33 - 1]. Sequence number is calculated as:
+ *
+ *      1 << 32 | (ost_index << 16) | ((objid >> 32) & 0xffff)
+ *
+ *   that is, SEQ consists of 16-bit OST index, and higher 16 bits of object
+ *   ID. The generation of unique SEQ values per OST allows the IDIF FIDs to
+ *   be identified in the FLD correctly. The OID field is calculated as:
+ *
+ *      objid & 0xffffffff
+ *
+ *   that is, it consists of lower 32 bits of object ID.  For objects within
+ *   the IDIF range, object ID extraction will be:
+ *
+ *      o_id = (fid->f_seq & 0x7fff) << 16 | fid->f_oid;
+ *      o_seq = 0;  // formerly group number
+ *
+ *   NOTE: This assumes that no more than 2^48-1 objects have ever been created
+ *   on any OST, and that no more than 65535 OSTs are in use.  Both are very
+ *   reasonable assumptions, i.e. an IDIF can uniquely map all objects assuming
+ *   a maximum creation rate of 1M objects per second for a maximum of 9 years,
+ *   or combinations thereof.
+ *
+ * OST_MDT0
+ *   Surrogate FID used to identify an existing object on OLD formatted OST
+ *   filesystem. Belongs to the reserved SEQuence 0, and is used prior to
+ *   the introduction of FID-on-OST, at which point IDIF will be used to
+ *   identify objects as residing on a specific OST.
+ *
+ * LLOG
+ *   For Lustre Log objects the object sequence 1 is used. This is compatible
+ *   with both OLD and NEW namespaces, as this SEQ number is in the
+ *   ext3/ldiskfs reserved inode range and does not conflict with IGIF
+ *   sequence numbers.
+ *
+ * ECHO
+ *   For testing OST IO performance the object sequence 2 is used. This is
+ *   compatible with both OLD and NEW namespaces, as this SEQ number is in
+ *   the ext3/ldiskfs reserved inode range and does not conflict with IGIF
+ *   sequence numbers.
+ *
+ * OST_MDT1 .. OST_MAX
+ *   For testing with multiple MDTs the object sequence 3 through 9 is used,
+ *   allowing direct mapping of MDTs 1 through 7 respectively, for a total
+ *   of 8 MDTs including OST_MDT0. This matches the legacy CMD project "group"
+ *   mappings. However, this SEQ range is only for testing prior to any
+ *   production DNE release, as the objects in this range conflict across all
+ *   OSTs, as the OST index is not part of the FID.  For production DNE usage,
+ *   OST objects created by MDT1+ will use FID_SEQ_NORMAL FIDs.
+ *
+ * DLM OST objid to IDIF mapping
+ *   For compatibility with existing OLD OST network protocol structures, the
+ *   FID must map onto the o_id and o_seq in a manner that ensures existing
+ *   objects are identified consistently for IO, as well as onto the LDLM
+ *   namespace to ensure that there is only a single resource name for any
+ *   object in the DLM.  The OLD OST object DLM resource mapping is:
+ *
+ *      resource[] = {o_id, o_seq, 0, 0}; // o_seq == 0 for production releases
+ *
+ *   The NEW OST object DLM resource mapping is the same for both MDT and OST:
+ *
+ *      resource[] = {SEQ, OID, VER, HASH};
+ *
+ *  NOTE: for mapping IDIF values to DLM resource names the o_id may be
+ *  larger than the 2^33 reserved sequence numbers for IDIF, so it is possible
+ *  for the o_id numbers to overlap FID SEQ numbers in the resource. However,
+ *  in all production releases the OLD o_seq field is always zero, and all
+ *  valid FID OID values are non-zero, so the lock resources will not collide.
+ *  Even so, the MDT and OST resources are also in different LDLM namespaces.
+ */
+
+#include <linux/libcfs/libcfs.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_req_layout.h>
+#include <lustre_mdt.h>
+#include <obd.h>
+
+
+struct lu_site;
+struct lu_context;
+
+/* Whole sequences space range and zero range definitions */
+extern const struct lu_seq_range LUSTRE_SEQ_SPACE_RANGE;
+extern const struct lu_seq_range LUSTRE_SEQ_ZERO_RANGE;
+extern const struct lu_fid LUSTRE_BFL_FID;
+extern const struct lu_fid LU_OBF_FID;
+extern const struct lu_fid LU_DOT_LUSTRE_FID;
+
+enum {
+	/*
+	 * This is how many metadata FIDs may be allocated in one sequence(128k)
+	 */
+	LUSTRE_METADATA_SEQ_MAX_WIDTH = 0x0000000000020000ULL,
+
+	/*
+	 * This is how many data FIDs could be allocated in one sequence(4B - 1)
+	 */
+	LUSTRE_DATA_SEQ_MAX_WIDTH = 0x00000000FFFFFFFFULL,
+
+	/*
+	 * How many sequences to allocate to a client at once.
+	 */
+	LUSTRE_SEQ_META_WIDTH = 0x0000000000000001ULL,
+
+	/*
+	 * seq allocation pool size.
+	 */
+	LUSTRE_SEQ_BATCH_WIDTH = LUSTRE_SEQ_META_WIDTH * 1000,
+
+	/*
+	 * This is how many sequences may be in one super-sequence allocated to
+	 * MDTs.
+	 */
+	LUSTRE_SEQ_SUPER_WIDTH = ((1ULL << 30ULL) * LUSTRE_SEQ_META_WIDTH)
+};
+
+enum {
+	/** 2^6 FIDs for OI containers */
+	OSD_OI_FID_OID_BITS     = 6,
+	/** reserve enough FIDs in case we want more in the future */
+	OSD_OI_FID_OID_BITS_MAX = 10,
+};
+
+/** special OID for local objects */
+enum local_oid {
+	/** \see fld_mod_init */
+	FLD_INDEX_OID		= 3UL,
+	/** \see fid_mod_init */
+	FID_SEQ_CTL_OID		= 4UL,
+	FID_SEQ_SRV_OID		= 5UL,
+	/** \see mdd_mod_init */
+	MDD_ROOT_INDEX_OID	= 6UL, /* deprecated in 2.4 */
+	MDD_ORPHAN_OID		= 7UL, /* deprecated in 2.4 */
+	MDD_LOV_OBJ_OID		= 8UL,
+	MDD_CAPA_KEYS_OID	= 9UL,
+	/** \see mdt_mod_init */
+	LAST_RECV_OID		= 11UL,
+	OSD_FS_ROOT_OID		= 13UL,
+	ACCT_USER_OID		= 15UL,
+	ACCT_GROUP_OID		= 16UL,
+	LFSCK_BOOKMARK_OID	= 17UL,
+	OTABLE_IT_OID		= 18UL,
+	/* These two definitions are obsolete
+	 * OFD_GROUP0_LAST_OID     = 20UL,
+	 * OFD_GROUP4K_LAST_OID    = 20UL+4096,
+	 */
+	OFD_LAST_GROUP_OID	= 4117UL,
+	LLOG_CATALOGS_OID	= 4118UL,
+	MGS_CONFIGS_OID		= 4119UL,
+	OFD_HEALTH_CHECK_OID	= 4120UL,
+	MDD_LOV_OBJ_OSEQ	= 4121UL,
+	LFSCK_NAMESPACE_OID     = 4122UL,
+	REMOTE_PARENT_DIR_OID	= 4123UL,
+};
+
+/* Set fid to sequence FID_SEQ_LOCAL_FILE, object id oid and version 0. */
+static inline void lu_local_obj_fid(struct lu_fid *fid, __u32 oid)
+{
+	fid->f_ver = 0;
+	fid->f_oid = oid;
+	fid->f_seq = FID_SEQ_LOCAL_FILE;
+}
+
+/* Set fid to sequence FID_SEQ_LOCAL_NAME, object id oid and version 0. */
+static inline void lu_local_name_obj_fid(struct lu_fid *fid, __u32 oid)
+{
+	fid->f_ver = 0;
+	fid->f_oid = oid;
+	fid->f_seq = FID_SEQ_LOCAL_NAME;
+}
+
+/*
+ * For new FS (>= 2.4), the root FID will be changed to
+ * [FID_SEQ_ROOT:1:0], for existing FS, (upgraded to 2.4),
+ * the root FID will still be IGIF.
+ *
+ * This helper only matches the new form: sequence FID_SEQ_ROOT with
+ * object id 1.
+ */
+static inline int fid_is_root(const struct lu_fid *fid)
+{
+	int is_root = fid_seq(fid) == FID_SEQ_ROOT && fid_oid(fid) == 1;
+
+	return unlikely(is_root);
+}
+
+/* Non-zero when fid is [FID_SEQ_DOT_LUSTRE : FID_OID_DOT_LUSTRE]. */
+static inline int fid_is_dot_lustre(const struct lu_fid *fid)
+{
+	int match = fid_seq(fid) == FID_SEQ_DOT_LUSTRE &&
+		    fid_oid(fid) == FID_OID_DOT_LUSTRE;
+
+	return unlikely(match);
+}
+
+/* Non-zero when fid is [FID_SEQ_DOT_LUSTRE : FID_OID_DOT_LUSTRE_OBF]. */
+static inline int fid_is_obf(const struct lu_fid *fid)
+{
+	int match = fid_seq(fid) == FID_SEQ_DOT_LUSTRE &&
+		    fid_oid(fid) == FID_OID_DOT_LUSTRE_OBF;
+
+	return unlikely(match);
+}
+
+/* Non-zero when fid is [FID_SEQ_LOCAL_FILE : OTABLE_IT_OID]. */
+static inline int fid_is_otable_it(const struct lu_fid *fid)
+{
+	int match = fid_seq(fid) == FID_SEQ_LOCAL_FILE &&
+		    fid_oid(fid) == OTABLE_IT_OID;
+
+	return unlikely(match);
+}
+
+/*
+ * Non-zero when fid is in FID_SEQ_LOCAL_FILE and its object id is
+ * ACCT_USER_OID or ACCT_GROUP_OID.
+ */
+static inline int fid_is_acct(const struct lu_fid *fid)
+{
+	if (fid_seq(fid) != FID_SEQ_LOCAL_FILE)
+		return 0;
+
+	return fid_oid(fid) == ACCT_USER_OID ||
+	       fid_oid(fid) == ACCT_GROUP_OID;
+}
+
+/* Non-zero when fid's sequence is FID_SEQ_QUOTA or FID_SEQ_QUOTA_GLB. */
+static inline int fid_is_quota(const struct lu_fid *fid)
+{
+	const __u64 seq = fid_seq(fid);
+
+	return seq == FID_SEQ_QUOTA || seq == FID_SEQ_QUOTA_GLB;
+}
+
+/*
+ * Non-zero when fid is not a last-id FID and its sequence is a normal or
+ * IGIF one, or when fid is the root or the .lustre FID.
+ */
+static inline int fid_is_namespace_visible(const struct lu_fid *fid)
+{
+	const __u64 seq = fid_seq(fid);
+
+	/* Here, we cannot distinguish whether the normal FID is for OST
+	 * object or not. It is caller's duty to check more if needed. */
+	if (!fid_is_last_id(fid) &&
+	    (fid_seq_is_norm(seq) || fid_seq_is_igif(seq)))
+		return 1;
+
+	return fid_is_root(fid) || fid_is_dot_lustre(fid);
+}
+
+/* Non-zero when seq is an IGIF, normal, root or dot sequence. */
+static inline int fid_seq_in_fldb(__u64 seq)
+{
+	if (fid_seq_is_igif(seq) || fid_seq_is_norm(seq))
+		return 1;
+
+	return fid_seq_is_root(seq) || fid_seq_is_dot(seq);
+}
+
+/*
+ * Fill fid with object id 0 and version 0.  The sequence is
+ * fid_idif_seq(0, 0) when seq is the MDT0 sequence; otherwise seq itself,
+ * which is asserted to be a normal, echo or IDIF sequence.
+ */
+static inline void lu_last_id_fid(struct lu_fid *fid, __u64 seq)
+{
+	if (fid_seq_is_mdt0(seq)) {
+		seq = fid_idif_seq(0, 0);
+	} else {
+		LASSERTF(fid_seq_is_norm(seq) || fid_seq_is_echo(seq) ||
+			 fid_seq_is_idif(seq), LPX64"\n", seq);
+	}
+	fid->f_seq = seq;
+	fid->f_oid = 0;
+	fid->f_ver = 0;
+}
+
+enum lu_mgr_type {
+	LUSTRE_SEQ_SERVER,
+	LUSTRE_SEQ_CONTROLLER
+};
+
+struct lu_server_seq;
+
+/* Client sequence manager interface. */
+struct lu_client_seq {
+	/* Sequence-controller export. */
+	struct obd_export      *lcs_exp;
+	struct mutex		lcs_mutex;
+
+	/*
+	 * Range of sequences allowed for allocation. When using lu_client_seq on
+	 * clients, this contains meta-sequence range. And for servers this
+	 * contains super-sequence range.
+	 */
+	struct lu_seq_range	 lcs_space;
+
+	/* Seq related proc */
+	proc_dir_entry_t   *lcs_proc_dir;
+
+	/* This holds last allocated fid in last obtained seq */
+	struct lu_fid	   lcs_fid;
+
+	/* LUSTRE_SEQ_METADATA or LUSTRE_SEQ_DATA */
+	enum lu_cli_type	lcs_type;
+
+	/*
+	 * Service uuid, passed from MDT + seq name to form unique seq name to
+	 * use it with procfs.
+	 */
+	char		    lcs_name[80];
+
+	/*
+	 * Sequence width, that is how many objects may be allocated in one
+	 * sequence. Default value for it is LUSTRE_SEQ_MAX_WIDTH.
+	 */
+	__u64		   lcs_width;
+
+	/* Seq-server for direct talking */
+	struct lu_server_seq   *lcs_srv;
+
+	/* wait queue for fid allocation and update indicator */
+	wait_queue_head_t	     lcs_waitq;
+	int		     lcs_update;
+};
+
+/* server sequence manager interface */
+struct lu_server_seq {
+	/* Available sequences space */
+	struct lu_seq_range	 lss_space;
+
+	/* keeps highwater in lsr_end for seq allocation algorithm */
+	struct lu_seq_range	 lss_lowater_set;
+	struct lu_seq_range	 lss_hiwater_set;
+
+	/*
+	 * Device for server side seq manager needs (saving sequences to backing
+	 * store).
+	 */
+	struct dt_device       *lss_dev;
+
+	/* /seq file object device */
+	struct dt_object       *lss_obj;
+
+	/* Seq related proc */
+	proc_dir_entry_t   *lss_proc_dir;
+
+	/* LUSTRE_SEQ_SERVER or LUSTRE_SEQ_CONTROLLER */
+	enum lu_mgr_type       lss_type;
+
+	/* Client interface to request controller */
+	struct lu_client_seq   *lss_cli;
+
+	/* Mutex for protecting allocation */
+	struct mutex		lss_mutex;
+
+	/*
+	 * Service uuid, passed from MDT + seq name to form unique seq name to
+	 * use it with procfs.
+	 */
+	char		    lss_name[80];
+
+	/*
+	 * Allocation chunks for super and meta sequences. Default values are
+	 * LUSTRE_SEQ_SUPER_WIDTH and LUSTRE_SEQ_META_WIDTH.
+	 */
+	__u64		   lss_width;
+
+	/*
+	 * minimum lss_alloc_set size that should be allocated from
+	 * lss_space
+	 */
+	__u64		   lss_set_width;
+
+	/* sync is needed for update operation */
+	__u32		   lss_need_sync;
+
+	/**
+	 * Pointer to site object, required to access site fld.
+	 */
+	struct seq_server_site  *lss_site;
+};
+
+int seq_query(struct com_thread_info *info);
+int seq_handle(struct ptlrpc_request *req);
+
+/* Server methods */
+int seq_server_init(struct lu_server_seq *seq,
+		    struct dt_device *dev,
+		    const char *prefix,
+		    enum lu_mgr_type type,
+		    struct seq_server_site *ss,
+		    const struct lu_env *env);
+
+void seq_server_fini(struct lu_server_seq *seq,
+		     const struct lu_env *env);
+
+int seq_server_alloc_super(struct lu_server_seq *seq,
+			   struct lu_seq_range *out,
+			   const struct lu_env *env);
+
+int seq_server_alloc_meta(struct lu_server_seq *seq,
+			  struct lu_seq_range *out,
+			  const struct lu_env *env);
+
+int seq_server_set_cli(struct lu_server_seq *seq,
+		       struct lu_client_seq *cli,
+		       const struct lu_env *env);
+
+/* Client methods */
+int seq_client_init(struct lu_client_seq *seq,
+		    struct obd_export *exp,
+		    enum lu_cli_type type,
+		    const char *prefix,
+		    struct lu_server_seq *srv);
+
+void seq_client_fini(struct lu_client_seq *seq);
+
+void seq_client_flush(struct lu_client_seq *seq);
+
+int seq_client_alloc_fid(const struct lu_env *env, struct lu_client_seq *seq,
+			 struct lu_fid *fid);
+int seq_client_get_seq(const struct lu_env *env, struct lu_client_seq *seq,
+		       seqno_t *seqnr);
+int seq_site_fini(const struct lu_env *env, struct seq_server_site *ss);
+/* Fids common stuff */
+int fid_is_local(const struct lu_env *env,
+		 struct lu_site *site, const struct lu_fid *fid);
+
+int client_fid_init(struct obd_device *obd, struct obd_export *exp,
+		    enum lu_cli_type type);
+int client_fid_fini(struct obd_device *obd);
+
+/* fid locking */
+
+struct ldlm_namespace;
+
+/*
+ * Build (DLM) resource name from FID.
+ *
+ * NOTE: until Lustre 1.8.7/2.1.1 the fid_ver() was packed into name[2],
+ * but was moved into name[1] along with the OID to avoid consuming the
+ * remaining name[2,3] fields that need to be used for the quota identifier.
+ *
+ * The whole of name is zeroed before the SEQ and VER/OID slots are filled
+ * in.  Returns name.
+ */
+static inline struct ldlm_res_id *
+fid_build_reg_res_name(const struct lu_fid *f, struct ldlm_res_id *name)
+{
+	memset(name, 0, sizeof(*name));
+	name->name[LUSTRE_RES_ID_VER_OID_OFF] = fid_ver_oid(f);
+	name->name[LUSTRE_RES_ID_SEQ_OFF] = fid_seq(f);
+	return name;
+}
+
+/*
+ * Build (DLM) resource identifier from global quota FID and quota ID:
+ * the regular resource name of glb_fid, plus the sequence and ver/oid of
+ * qid->qid_fid in the two quota slots.  Returns res.
+ */
+static inline struct ldlm_res_id *
+fid_build_quota_resid(const struct lu_fid *glb_fid, union lquota_id *qid,
+		      struct ldlm_res_id *res)
+{
+	const struct lu_fid *qfid = &qid->qid_fid;
+
+	fid_build_reg_res_name(glb_fid, res);
+	res->name[LUSTRE_RES_ID_QUOTA_SEQ_OFF] = fid_seq(qfid);
+	res->name[LUSTRE_RES_ID_QUOTA_VER_OID_OFF] = fid_ver_oid(qfid);
+	return res;
+}
+
+/*
+ * Extract global FID and quota ID from resource name.
+ *
+ * For both FIDs the low 32 bits of the ver/oid slot become f_oid and the
+ * high 32 bits become f_ver.
+ */
+static inline void fid_extract_quota_resid(struct ldlm_res_id *res,
+					   struct lu_fid *glb_fid,
+					   union lquota_id *qid)
+{
+	struct lu_fid *qfid = &qid->qid_fid;
+
+	glb_fid->f_seq = res->name[LUSTRE_RES_ID_SEQ_OFF];
+	glb_fid->f_oid = (__u32)res->name[LUSTRE_RES_ID_VER_OID_OFF];
+	glb_fid->f_ver = (__u32)(res->name[LUSTRE_RES_ID_VER_OID_OFF] >> 32);
+
+	qfid->f_seq = res->name[LUSTRE_RES_ID_QUOTA_SEQ_OFF];
+	qfid->f_oid = (__u32)res->name[LUSTRE_RES_ID_QUOTA_VER_OID_OFF];
+	qfid->f_ver = (__u32)(res->name[LUSTRE_RES_ID_QUOTA_VER_OID_OFF] >> 32);
+}
+
+/*
+ * Return true if resource is for object identified by fid, i.e. both the
+ * SEQ slot and the VER/OID slot of name match f.
+ */
+static inline int fid_res_name_eq(const struct lu_fid *f,
+				  const struct ldlm_res_id *name)
+{
+	if (name->name[LUSTRE_RES_ID_SEQ_OFF] != fid_seq(f))
+		return 0;
+
+	return name->name[LUSTRE_RES_ID_VER_OID_OFF] == fid_ver_oid(f);
+}
+
+/*
+ * Reverse function of fid_build_reg_res_name(): rebuild f from the SEQ
+ * slot and from the low (oid) and high (ver) 32 bits of the VER/OID slot,
+ * then assert that the result matches name.
+ */
+static inline void fid_build_from_res_name(struct lu_fid *f,
+					   const struct ldlm_res_id *name)
+{
+	fid_zero(f);
+	f->f_ver = name->name[LUSTRE_RES_ID_VER_OID_OFF] >> 32;
+	f->f_oid = name->name[LUSTRE_RES_ID_VER_OID_OFF] & 0xffffffff;
+	f->f_seq = name->name[LUSTRE_RES_ID_SEQ_OFF];
+	LASSERT(fid_res_name_eq(f, name));
+}
+
+/*
+ * Build the regular resource name of f in name, then store hash in the
+ * LUSTRE_RES_ID_HSH_OFF slot.  Returns name.
+ */
+static inline struct ldlm_res_id *
+fid_build_pdo_res_name(const struct lu_fid *f, unsigned int hash,
+		       struct ldlm_res_id *name)
+{
+	fid_build_reg_res_name(f, name);
+	name->name[LUSTRE_RES_ID_HSH_OFF] = hash;
+
+	return name;
+}
+
+/**
+ * Build DLM resource name from object id & seq, which will be removed
+ * finally, when we replace ost_id with FID in data stack.
+ *
+ * Currently, resid from the old client, whose res[0] = object_id,
+ * res[1] = object_seq, is just the opposite of the Metadata
+ * resid, where res[0] = fid->f_seq, res[1] = fid->f_oid.
+ * To unify the resid identification, we will reverse the data
+ * resid to keep it the same as the Metadata resid, i.e.
+ *
+ * For resid from the old client,
+ *    res[0] = objid,  res[1] = 0, still keep the original order,
+ *    for compatibility.
+ *
+ * For new resid
+ *    res will be built from normal FID directly, i.e. res[0] = f_seq,
+ *    res[1] = f_oid + f_ver.
+ */
+static inline void ostid_build_res_name(struct ost_id *oi,
+					struct ldlm_res_id *name)
+{
+	memset(name, 0, sizeof(*name));
+	if (!fid_seq_is_mdt0(ostid_seq(oi))) {
+		/* new resid: oi is reinterpreted as a FID */
+		fid_build_reg_res_name((struct lu_fid *)oi, name);
+		return;
+	}
+
+	/* old resid: object id first, then the sequence */
+	name->name[LUSTRE_RES_ID_SEQ_OFF] = ostid_id(oi);
+	name->name[LUSTRE_RES_ID_VER_OID_OFF] = ostid_seq(oi);
+}
+
+/*
+ * Fill oi from a resource name.  When the SEQ slot holds the MDT0
+ * sequence the name is treated as an old resid (id in the SEQ slot, seq
+ * in the VER/OID slot); otherwise oi is rebuilt as a FID.
+ *
+ * NOTE(review): ost_fid_from_resid() makes the same old/new decision on
+ * the VER/OID slot rather than the SEQ slot - confirm which slot is the
+ * intended one here.
+ */
+static inline void ostid_res_name_to_id(struct ost_id *oi,
+					struct ldlm_res_id *name)
+{
+	if (!fid_seq_is_mdt0(name->name[LUSTRE_RES_ID_SEQ_OFF])) {
+		/* new resid */
+		fid_build_from_res_name((struct lu_fid *)oi, name);
+		return;
+	}
+
+	/* old resid */
+	ostid_set_seq(oi, name->name[LUSTRE_RES_ID_VER_OID_OFF]);
+	ostid_set_id(oi, name->name[LUSTRE_RES_ID_SEQ_OFF]);
+}
+
+/**
+ * Return true if the resource is for the object identified by this id & group.
+ *
+ * For an MDT0 sequence the object id is compared against the SEQ slot and
+ * the sequence against the VER_OID slot; for any other sequence the two
+ * comparisons are swapped.
+ */
+static inline int ostid_res_name_eq(struct ost_id *oi,
+				    struct ldlm_res_id *name)
+{
+	/* Note: it is just a trick here to save some effort, probably the
+	 * correct way would be to turn them into FIDs and compare */
+	if (!fid_seq_is_mdt0(ostid_seq(oi)))
+		return name->name[LUSTRE_RES_ID_SEQ_OFF] == ostid_seq(oi) &&
+		       name->name[LUSTRE_RES_ID_VER_OID_OFF] == ostid_id(oi);
+
+	return name->name[LUSTRE_RES_ID_SEQ_OFF] == ostid_id(oi) &&
+	       name->name[LUSTRE_RES_ID_VER_OID_OFF] == ostid_seq(oi);
+}
+
+/*
+ * The same as osc_build_res_name().
+ *
+ * FIDs for which fid_is_mdt0() or fid_is_idif() holds are first converted to
+ * an ost_id and named with ostid_build_res_name(); every other FID goes
+ * straight to fid_build_reg_res_name().  If fid_to_ostid() fails, \a resname
+ * is left untouched.
+ */
+static inline void ost_fid_build_resid(const struct lu_fid *fid,
+				       struct ldlm_res_id *resname)
+{
+	struct ost_id oi;
+
+	if (!fid_is_mdt0(fid) && !fid_is_idif(fid)) {
+		fid_build_reg_res_name(fid, resname);
+		return;
+	}
+
+	oi.oi.oi_id = 0; /* gcc 4.7.2 complains otherwise */
+	if (fid_to_ostid(fid, &oi) != 0)
+		return;
+
+	ostid_build_res_name(&oi, resname);
+}
+
+/*
+ * Rebuild \a fid from the resource name \a name.
+ *
+ * When the LUSTRE_RES_ID_VER_OID_OFF slot holds an MDT0 sequence the name is
+ * an old resid: it is unpacked into a temporary ost_id and converted with
+ * ostid_to_fid() (whose return value is not checked here).  Otherwise the
+ * FID is rebuilt directly with fid_build_from_res_name().
+ */
+static inline void ost_fid_from_resid(struct lu_fid *fid,
+				      const struct ldlm_res_id *name)
+{
+	struct ost_id oi;
+
+	if (!fid_seq_is_mdt0(name->name[LUSTRE_RES_ID_VER_OID_OFF])) {
+		/* new resid */
+		fid_build_from_res_name(fid, name);
+		return;
+	}
+
+	/* old resid */
+	ostid_set_seq(&oi, name->name[LUSTRE_RES_ID_VER_OID_OFF]);
+	ostid_set_id(&oi, name->name[LUSTRE_RES_ID_SEQ_OFF]);
+	ostid_to_fid(fid, &oi, 0);
+}
+
+/**
+ * Flatten 128-bit FID values into a 64-bit value for use as an inode number.
+ * For non-IGIF FIDs this starts just over 2^32, and continues without
+ * conflict until 2^64, at which point we wrap the high 24 bits of the SEQ
+ * into the range where there may not be many OID values in use, to minimize
+ * the risk of conflict.
+ *
+ * Suppose LUSTRE_SEQ_MAX_WIDTH is less than (1 << 24), which is currently
+ * true; then the time between re-used inode numbers is very long - 2^40 SEQ
+ * numbers, or about 2^40 client mounts, if clients create less than 2^24
+ * files/mount.
+ *
+ * IGIF FIDs return lu_igif_ino() directly.  A flattened value of zero is
+ * replaced by the object id.
+ */
+static inline __u64 fid_flatten(const struct lu_fid *fid)
+{
+	__u64 seq;
+	__u64 ino;
+
+	if (fid_is_igif(fid)) {
+		ino = lu_igif_ino(fid);
+		RETURN(ino);
+	}
+
+	seq = fid_seq(fid);
+
+	/* SEQ shifted up by 24 bits ... */
+	ino = seq << 24;
+	/* ... SEQ bits 40..63 folded back into bits 16..39 ... */
+	ino += (seq >> 24) & 0xffffff0000ULL;
+	/* ... plus the object id */
+	ino += fid_oid(fid);
+
+	RETURN(ino ? ino : fid_oid(fid));
+}
+
+/*
+ * Hash a FID into \a bits bits by passing its fid_flatten() value to
+ * cfs_hash_long().
+ */
+static inline __u32 fid_hash(const struct lu_fid *f, int bits)
+{
+	/* fid_flatten() does not look at the version, so all objects with the
+	 * same id and different versions will belong to the same collision
+	 * list. */
+	return cfs_hash_long(fid_flatten(f), bits);
+}
+
+/**
+ * Map a FID to a 32-bit value for use as an inode number on 32-bit systems.
+ *
+ * IGIF FIDs return lu_igif_ino() (stored in a __u32).  Other FIDs mix bits of
+ * (sequence - FID_SEQ_START) with bits of the object id; if the mix comes out
+ * as zero the object id is returned instead.
+ */
+static inline __u32 fid_flatten32(const struct lu_fid *fid)
+{
+	__u32 ino;
+	__u64 seq;
+
+	if (fid_is_igif(fid)) {
+		ino = lu_igif_ino(fid);
+		RETURN(ino);
+	}
+
+	seq = fid_seq(fid) - FID_SEQ_START;
+
+	/* Map the high bits of the OID into higher bits of the inode number so
+	 * that inodes generated at about the same time have a reduced chance
+	 * of collisions. This will give a period of 2^12 = 1024 unique clients
+	 * (from SEQ) and up to min(LUSTRE_SEQ_MAX_WIDTH, 2^20) = 128k objects
+	 * (from OID), or up to 128M inodes without collisions for new files.
+	 *
+	 * NOTE(review): 2^12 is 4096, not 1024, so the figures above do not
+	 * agree with each other -- confirm the intended period against the
+	 * shifts and masks below. */
+	ino = ((seq & 0x000fffffULL) << 12) + ((seq >> 8) & 0xfffff000) +
+	       (seq >> (64 - (40-8)) & 0xffffff00) +
+	       (fid_oid(fid) & 0xff000fff) + ((fid_oid(fid) & 0x00fff000) << 8);
+
+	RETURN(ino ? ino : fid_oid(fid));
+}
+
+/*
+ * Difference between two FIDs that share a sequence (asserted).
+ *
+ * If both are IDIF FIDs their fid_idif_id() values are subtracted, otherwise
+ * their object ids.
+ *
+ * NOTE(review): the result is returned as int, so a difference that does not
+ * fit is truncated -- presumably callers only compare nearby FIDs; confirm.
+ */
+static inline int lu_fid_diff(struct lu_fid *fid1, struct lu_fid *fid2)
+{
+	LASSERTF(fid_seq(fid1) == fid_seq(fid2), "fid1:"DFID", fid2:"DFID"\n",
+		 PFID(fid1), PFID(fid2));
+
+	if (!fid_is_idif(fid1) || !fid_is_idif(fid2))
+		return fid_oid(fid1) - fid_oid(fid2);
+
+	return fid_idif_id(fid1->f_seq, fid1->f_oid, fid1->f_ver) -
+	       fid_idif_id(fid2->f_seq, fid2->f_oid, fid2->f_ver);
+}
+
+#define LUSTRE_SEQ_SRV_NAME "seq_srv"
+#define LUSTRE_SEQ_CTL_NAME "seq_ctl"
+
+/* Range common stuff */
+/* Copy every field of \a src into \a dst, converting CPU -> little-endian. */
+static inline void range_cpu_to_le(struct lu_seq_range *dst, const struct lu_seq_range *src)
+{
+	dst->lsr_start = cpu_to_le64(src->lsr_start);
+	dst->lsr_end = cpu_to_le64(src->lsr_end);
+	dst->lsr_index = cpu_to_le32(src->lsr_index);
+	dst->lsr_flags = cpu_to_le32(src->lsr_flags);
+}
+
+/* Copy every field of \a src into \a dst, converting little-endian -> CPU. */
+static inline void range_le_to_cpu(struct lu_seq_range *dst, const struct lu_seq_range *src)
+{
+	dst->lsr_start = le64_to_cpu(src->lsr_start);
+	dst->lsr_end = le64_to_cpu(src->lsr_end);
+	dst->lsr_index = le32_to_cpu(src->lsr_index);
+	dst->lsr_flags = le32_to_cpu(src->lsr_flags);
+}
+
+/* Copy every field of \a src into \a dst, converting CPU -> big-endian. */
+static inline void range_cpu_to_be(struct lu_seq_range *dst, const struct lu_seq_range *src)
+{
+	dst->lsr_start = cpu_to_be64(src->lsr_start);
+	dst->lsr_end = cpu_to_be64(src->lsr_end);
+	dst->lsr_index = cpu_to_be32(src->lsr_index);
+	dst->lsr_flags = cpu_to_be32(src->lsr_flags);
+}
+
+/* Copy every field of \a src into \a dst, converting big-endian -> CPU. */
+static inline void range_be_to_cpu(struct lu_seq_range *dst, const struct lu_seq_range *src)
+{
+	dst->lsr_start = be64_to_cpu(src->lsr_start);
+	dst->lsr_end = be64_to_cpu(src->lsr_end);
+	dst->lsr_index = be32_to_cpu(src->lsr_index);
+	dst->lsr_flags = be32_to_cpu(src->lsr_flags);
+}
+
+/** @} fid */
+
+#endif /* __LINUX_FID_H */

diff --git a/drivers/staging/lustre/lustre/include/lustre_fld.h b/drivers/staging/lustre/lustre/include/lustre_fld.h
new file mode 100644
index 0000000..11e034a
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_fld.h

@@ -0,0 +1,202 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2013, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LINUX_FLD_H
+#define __LINUX_FLD_H
+
+/** \defgroup fld fld
+ *
+ * @{
+ */
+
+#include <lustre/lustre_idl.h>
+#include <lustre_mdt.h>
+#include <dt_object.h>
+
+#include <linux/libcfs/libcfs.h>
+
+struct lu_client_fld;
+struct lu_server_fld;
+struct lu_fld_hash;
+struct fld_cache;
+
+extern const struct dt_index_features fld_index_features;
+extern const char fld_index_name[];
+
+/*
+ * FLD (Fid Location Database) interface.
+ *
+ * Client FLD hash selectors.
+ * NOTE(review): presumably these are the values accepted by the \a hash
+ * argument of fld_client_init() -- confirm against the implementation.
+ */
+enum {
+	LUSTRE_CLI_FLD_HASH_DHT = 0,
+	LUSTRE_CLI_FLD_HASH_RRB
+};
+
+
+/**
+ * One FLD target, as passed to fld_client_add_target().
+ *
+ * NOTE(review): ft_chain presumably links the target on
+ * lu_client_fld::lcf_targets, and ft_idx presumably is the index that
+ * fld_client_del_target() matches on -- confirm against the implementation.
+ */
+struct lu_fld_target {
+	struct list_head	       ft_chain;
+	struct obd_export       *ft_exp;
+	struct lu_server_fld    *ft_srv;
+	__u64		    ft_idx;
+};
+
+/**
+ * Server-side FLD state, set up by fld_server_init() and torn down by
+ * fld_server_fini().
+ */
+struct lu_server_fld {
+	/**
+	 * Fld dir proc entry. */
+	proc_dir_entry_t    *lsf_proc_dir;
+
+	/**
+	 * /fld file object device */
+	struct dt_object	*lsf_obj;
+
+	/**
+	 * super sequence controller export, needed to forward fld
+	 * lookup request. */
+	struct obd_export       *lsf_control_exp;
+
+	/**
+	 * FLD cache.
+	 * NOTE(review): the original comment read "Client FLD cache", the
+	 * same wording as lu_client_fld::lcf_cache -- presumably this is the
+	 * server's own cache; confirm. */
+	struct fld_cache	*lsf_cache;
+
+	/**
+	 * Protect index modifications */
+	struct mutex		lsf_lock;
+
+	/**
+	 * Fld service name in form "fld-srv-lustre-MDTXXX" */
+	char		     lsf_name[80];
+
+};
+
+/**
+ * Client-side FLD state, set up by fld_client_init() and torn down by
+ * fld_client_fini().
+ */
+struct lu_client_fld {
+	/**
+	 * Client side proc entry. */
+	proc_dir_entry_t    *lcf_proc_dir;
+
+	/**
+	 * List of exports client FLD knows about. */
+	struct list_head	       lcf_targets;
+
+	/**
+	 * Current hash to be used to choose an export. */
+	struct lu_fld_hash      *lcf_hash;
+
+	/**
+	 * Exports count. */
+	int		      lcf_count;
+
+	/**
+	 * Lock protecting exports list and fld_hash. */
+	spinlock_t		 lcf_lock;
+
+	/**
+	 * Client FLD cache. */
+	struct fld_cache	*lcf_cache;
+
+	/**
+	 * Client fld proc entry name. */
+	char		     lcf_name[80];
+
+	const struct lu_context *lcf_ctx;
+
+	int		      lcf_flags;
+};
+
+/**
+ * number of blocks to reserve for particular operations. Should be function of
+ * ... something. Stub for now.
+ */
+enum {
+	/* one insert operation can involve two delete and one insert */
+	FLD_TXN_INDEX_INSERT_CREDITS  = 60,
+	FLD_TXN_INDEX_DELETE_CREDITS  = 20,
+};
+
+int fld_query(struct com_thread_info *info);
+
+/* Server methods */
+int fld_server_init(const struct lu_env *env, struct lu_server_fld *fld,
+		    struct dt_device *dt, const char *prefix, int mds_node_id,
+		    int type);
+
+void fld_server_fini(const struct lu_env *env, struct lu_server_fld *fld);
+
+int fld_declare_server_create(const struct lu_env *env,
+			      struct lu_server_fld *fld,
+			      struct lu_seq_range *new,
+			      struct thandle *th);
+
+int fld_server_create(const struct lu_env *env,
+		      struct lu_server_fld *fld,
+		      struct lu_seq_range *add_range,
+		      struct thandle *th);
+
+int fld_insert_entry(const struct lu_env *env,
+		     struct lu_server_fld *fld,
+		     const struct lu_seq_range *range);
+
+int fld_server_lookup(const struct lu_env *env, struct lu_server_fld *fld,
+		      seqno_t seq, struct lu_seq_range *range);
+
+/* Client methods */
+int fld_client_init(struct lu_client_fld *fld,
+		    const char *prefix, int hash);
+
+void fld_client_fini(struct lu_client_fld *fld);
+
+void fld_client_flush(struct lu_client_fld *fld);
+
+int fld_client_lookup(struct lu_client_fld *fld, seqno_t seq, mdsno_t *mds,
+		      __u32 flags, const struct lu_env *env);
+
+int fld_client_create(struct lu_client_fld *fld,
+		      struct lu_seq_range *range,
+		      const struct lu_env *env);
+
+int fld_client_delete(struct lu_client_fld *fld,
+		      seqno_t seq,
+		      const struct lu_env *env);
+
+int fld_client_add_target(struct lu_client_fld *fld,
+			  struct lu_fld_target *tar);
+
+int fld_client_del_target(struct lu_client_fld *fld,
+			  __u64 idx);
+
+void fld_client_proc_fini(struct lu_client_fld *fld);
+
+/** @} fld */
+
+#endif

diff --git a/drivers/staging/lustre/lustre/include/lustre_fsfilt.h b/drivers/staging/lustre/lustre/include/lustre_fsfilt.h
new file mode 100644
index 0000000..9dcc332
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_fsfilt.h

@@ -0,0 +1,48 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre_fsfilt.h
+ *
+ * Filesystem interface helper.
+ */
+
+#ifndef _LUSTRE_FSFILT_H
+#define _LUSTRE_FSFILT_H
+
+#include <linux/lustre_fsfilt.h>
+
+#define LU221_BAD_TIME (0x80000000U + 24 * 3600)
+
+#endif

diff --git a/drivers/staging/lustre/lustre/include/lustre_ha.h b/drivers/staging/lustre/lustre/include/lustre_ha.h
new file mode 100644
index 0000000..105f6d6
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_ha.h

@@ -0,0 +1,67 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LUSTRE_HA_H
+#define _LUSTRE_HA_H
+
+/** \defgroup ha ha
+ *
+ * @{
+ */
+
+struct obd_import;
+struct obd_export;
+struct obd_device;
+struct ptlrpc_request;
+
+
+int ptlrpc_replay(struct obd_import *imp);
+int ptlrpc_resend(struct obd_import *imp);
+void ptlrpc_free_committed(struct obd_import *imp);
+void ptlrpc_wake_delayed(struct obd_import *imp);
+int ptlrpc_recover_import(struct obd_import *imp, char *new_uuid, int async);
+int ptlrpc_set_import_active(struct obd_import *imp, int active);
+void ptlrpc_activate_import(struct obd_import *imp);
+void ptlrpc_deactivate_import(struct obd_import *imp);
+void ptlrpc_invalidate_import(struct obd_import *imp);
+void ptlrpc_fail_import(struct obd_import *imp, __u32 conn_cnt);
+int ptlrpc_check_suspend(void);
+void ptlrpc_activate_timeouts(struct obd_import *imp);
+void ptlrpc_deactivate_timeouts(struct obd_import *imp);
+
+/** @} ha */
+
+#endif

diff --git a/drivers/staging/lustre/lustre/include/lustre_handles.h b/drivers/staging/lustre/lustre/include/lustre_handles.h
new file mode 100644
index 0000000..fcd40f3
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_handles.h

@@ -0,0 +1,93 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LUSTRE_HANDLES_H_
+#define __LUSTRE_HANDLES_H_
+
+/** \defgroup handles handles
+ *
+ * @{
+ */
+
+#include <linux/lustre_handles.h>
+
+#include <linux/libcfs/libcfs.h>
+
+
+/**
+ * Callbacks attached to a portals_handle (see portals_handle::h_ops).
+ *
+ * NOTE(review): judging by the names, hop_addref takes a reference on the
+ * object and hop_free releases an object of \a size bytes -- confirm against
+ * handles.c.
+ */
+struct portals_handle_ops {
+	void (*hop_addref)(void *object);
+	void (*hop_free)(void *object, int size);
+};
+
+/* These handles are most easily used by having them appear at the very top of
+ * whatever object that you want to make handles for.  ie:
+ *
+ * struct ldlm_lock {
+ *	 struct portals_handle handle;
+ *	 ...
+ * };
+ *
+ * Now you're able to assign the results of cookie2handle directly to an
+ * ldlm_lock.  If it's not at the top, you'll want to use container_of()
+ * to compute the start of the structure based on the handle field. */
+struct portals_handle {
+	struct list_head			h_link;
+	__u64				h_cookie;
+	struct portals_handle_ops	*h_ops;
+
+	/* newly added fields to handle the RCU issue. -jxiong */
+	cfs_rcu_head_t			h_rcu;
+	spinlock_t			h_lock;
+	unsigned int			h_size:31;
+	unsigned int			h_in:1;
+};
+/* Recover the enclosing portals_handle from a pointer to its h_rcu member. */
+#define RCU2HANDLE(rcu)    container_of(rcu, struct portals_handle, h_rcu)
+
+/* handles.c */
+
+/* Add a handle to the hash table */
+void class_handle_hash(struct portals_handle *,
+		       struct portals_handle_ops *ops);
+void class_handle_unhash(struct portals_handle *);
+void class_handle_hash_back(struct portals_handle *);
+void *class_handle2object(__u64 cookie);
+void class_handle_free_cb(cfs_rcu_head_t *);
+int class_handle_init(void);
+void class_handle_cleanup(void);
+
+/** @} handles */
+
+#endif

diff --git a/drivers/staging/lustre/lustre/include/lustre_idmap.h b/drivers/staging/lustre/lustre/include/lustre_idmap.h
new file mode 100644
index 0000000..084bdd6
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_idmap.h

@@ -0,0 +1,104 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lustre/include/lustre_idmap.h
+ *
+ * MDS data structures.
+ * See also lustre_idl.h for wire formats of requests.
+ */
+
+#ifndef _LUSTRE_IDMAP_H
+#define _LUSTRE_IDMAP_H
+
+/** \defgroup idmap idmap
+ *
+ * @{
+ */
+
+#include <linux/libcfs/libcfs.h>
+
+/* Number of gid_t entries that fit in one PAGE_CACHE_SIZE block. */
+#define CFS_NGROUPS_PER_BLOCK   ((int)(PAGE_CACHE_SIZE / sizeof(gid_t)))
+
+/* Entry \a i of \a gi, whose entries are stored in blocks of
+ * CFS_NGROUPS_PER_BLOCK.  Note that \a i is evaluated twice. */
+#define CFS_GROUP_AT(gi, i) \
+	((gi)->blocks[(i) / CFS_NGROUPS_PER_BLOCK][(i) % CFS_NGROUPS_PER_BLOCK])
+
+enum {
+	CFS_IC_NOTHING     = 0,    /* convert nothing */
+	CFS_IC_ALL	 = 1,    /* convert all items */
+	CFS_IC_MAPPED      = 2,    /* convert mapped uid/gid */
+	CFS_IC_UNMAPPED    = 3     /* convert unmapped uid/gid */
+};
+
+#define  CFS_IDMAP_NOTFOUND     (-1)
+
+#define CFS_IDMAP_HASHSIZE      32
+
+/**
+ * Index into the first dimension of lustre_idmap_table::lit_idmaps.
+ * CFS_IDMAP_N_HASHES must stay last: it is the number of hash tables.
+ *
+ * NOTE(review): RMT/LCL presumably mean "keyed by remote id" / "keyed by
+ * local id" -- confirm against the idmap implementation.
+ */
+enum lustre_idmap_idx {
+	RMT_UIDMAP_IDX,
+	LCL_UIDMAP_IDX,
+	RMT_GIDMAP_IDX,
+	LCL_GIDMAP_IDX,
+	CFS_IDMAP_N_HASHES
+};
+
+/**
+ * Id mapping table: one hash table of CFS_IDMAP_HASHSIZE list heads for each
+ * lustre_idmap_idx value.
+ *
+ * NOTE(review): lit_lock presumably protects lit_idmaps -- confirm.
+ */
+struct lustre_idmap_table {
+	spinlock_t	lit_lock;
+	struct list_head	lit_idmaps[CFS_IDMAP_N_HASHES][CFS_IDMAP_HASHSIZE];
+};
+
+struct lu_ucred;
+
+extern void lustre_groups_from_list(group_info_t *ginfo, gid_t *glist);
+extern void lustre_groups_sort(group_info_t *group_info);
+extern int lustre_in_group_p(struct lu_ucred *mu, gid_t grp);
+
+extern int lustre_idmap_add(struct lustre_idmap_table *t,
+			    uid_t ruid, uid_t luid,
+			    gid_t rgid, gid_t lgid);
+extern int lustre_idmap_del(struct lustre_idmap_table *t,
+			    uid_t ruid, uid_t luid,
+			    gid_t rgid, gid_t lgid);
+extern int lustre_idmap_lookup_uid(struct lu_ucred *mu,
+				   struct lustre_idmap_table *t,
+				   int reverse, uid_t uid);
+extern int lustre_idmap_lookup_gid(struct lu_ucred *mu,
+				   struct lustre_idmap_table *t,
+				   int reverse, gid_t gid);
+extern struct lustre_idmap_table *lustre_idmap_init(void);
+extern void lustre_idmap_fini(struct lustre_idmap_table *t);
+
+/** @} idmap */
+
+#endif

diff --git a/drivers/staging/lustre/lustre/include/lustre_import.h b/drivers/staging/lustre/lustre/include/lustre_import.h
new file mode 100644
index 0000000..3a5dd6a
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_import.h

@@ -0,0 +1,367 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+/** \defgroup obd_import PtlRPC import definitions
+ * Imports are client-side representation of remote obd target.
+ *
+ * @{
+ */
+
+#ifndef __IMPORT_H
+#define __IMPORT_H
+
+/** \defgroup import import
+ *
+ * @{
+ */
+
+#include <lustre_handles.h>
+#include <lustre/lustre_idl.h>
+
+
+/**
+ * Adaptive Timeout stuff
+ *
+ * @{
+ */
+#define D_ADAPTTO D_OTHER
+#define AT_BINS 4		  /* "bin" means "N seconds of history" */
+#define AT_FLG_NOHIST 0x1	  /* use last reported value only */
+
+/**
+ * State of one adaptive timeout: the current value, the worst value seen and
+ * AT_BINS bins of history.  Initialised by at_init().
+ */
+struct adaptive_timeout {
+	time_t		at_binstart;	 /* bin start time */
+	unsigned int	at_hist[AT_BINS];    /* timeout history bins */
+	unsigned int	at_flags;
+	unsigned int	at_current;	  /* current timeout value */
+	unsigned int	at_worst_ever;       /* worst-ever timeout value */
+	time_t		at_worst_time;       /* worst-ever timeout timestamp */
+	spinlock_t	at_lock;
+};
+
+/**
+ * An array of request lists together with per-entry and total request
+ * counts and the earliest deadline among the requests.
+ */
+struct ptlrpc_at_array {
+	struct list_head       *paa_reqs_array; /** array to hold requests */
+	__u32	     paa_size;       /** the size of array */
+	__u32	     paa_count;      /** the total count of reqs */
+	time_t	    paa_deadline;   /** the earliest deadline of reqs */
+	__u32	    *paa_reqs_count; /** the count of reqs in each entry */
+};
+
+/* Number of slots in imp_at::iat_portal and imp_at::iat_service_estimate. */
+#define IMP_AT_MAX_PORTALS 8
+/**
+ * Adaptive timeout data kept per import (see obd_import::imp_at): one
+ * network latency estimate plus IMP_AT_MAX_PORTALS service estimates.
+ *
+ * NOTE(review): iat_portal[i] presumably names the portal that
+ * iat_service_estimate[i] belongs to (see import_at_get_index()) -- confirm.
+ */
+struct imp_at {
+	int		     iat_portal[IMP_AT_MAX_PORTALS];
+	struct adaptive_timeout iat_net_latency;
+	struct adaptive_timeout iat_service_estimate[IMP_AT_MAX_PORTALS];
+};
+
+
+/** @} */
+
+/**
+ * Possible import states.
+ *
+ * The numeric values are used as indices into the name table of
+ * ptlrpc_import_state_name(), so the two must be kept in sync.
+ */
+enum lustre_imp_state {
+	LUSTRE_IMP_CLOSED     = 1,
+	LUSTRE_IMP_NEW	= 2,
+	LUSTRE_IMP_DISCON     = 3,
+	LUSTRE_IMP_CONNECTING = 4,
+	LUSTRE_IMP_REPLAY     = 5,
+	LUSTRE_IMP_REPLAY_LOCKS = 6,
+	LUSTRE_IMP_REPLAY_WAIT  = 7,
+	LUSTRE_IMP_RECOVER    = 8,
+	LUSTRE_IMP_FULL       = 9,
+	LUSTRE_IMP_EVICTED    = 10,
+};
+
+/**
+ * Returns the text string representation of numeric import state \a state.
+ *
+ * Slot 0 (no enum constant has that value) is "<UNKNOWN>".  \a state is
+ * asserted not to exceed LUSTRE_IMP_EVICTED.
+ */
+static inline char * ptlrpc_import_state_name(enum lustre_imp_state state)
+{
+	static char* import_state_names[] = {
+		[0]			  = "<UNKNOWN>",
+		[LUSTRE_IMP_CLOSED]	  = "CLOSED",
+		[LUSTRE_IMP_NEW]	  = "NEW",
+		[LUSTRE_IMP_DISCON]	  = "DISCONN",
+		[LUSTRE_IMP_CONNECTING]	  = "CONNECTING",
+		[LUSTRE_IMP_REPLAY]	  = "REPLAY",
+		[LUSTRE_IMP_REPLAY_LOCKS] = "REPLAY_LOCKS",
+		[LUSTRE_IMP_REPLAY_WAIT]  = "REPLAY_WAIT",
+		[LUSTRE_IMP_RECOVER]	  = "RECOVER",
+		[LUSTRE_IMP_FULL]	  = "FULL",
+		[LUSTRE_IMP_EVICTED]	  = "EVICTED",
+	};
+
+	LASSERT (state <= LUSTRE_IMP_EVICTED);
+	return import_state_names[state];
+}
+
+/**
+ * List of import event types
+ */
+enum obd_import_event {
+	IMP_EVENT_DISCON     = 0x808001,
+	IMP_EVENT_INACTIVE   = 0x808002,
+	IMP_EVENT_INVALIDATE = 0x808003,
+	IMP_EVENT_ACTIVE     = 0x808004,
+	IMP_EVENT_OCD	= 0x808005,
+	IMP_EVENT_DEACTIVATE = 0x808006,
+	IMP_EVENT_ACTIVATE   = 0x808007,
+};
+
+/**
+ * Definition of import connection structure
+ */
+struct obd_import_conn {
+	/** Item for linking connections together */
+	struct list_head		oic_item;
+	/** Pointer to actual PortalRPC connection */
+	struct ptlrpc_connection *oic_conn;
+	/** uuid of remote side */
+	struct obd_uuid	   oic_uuid;
+	/**
+	 * Time (64 bit jiffies) of last connection attempt on this connection
+	 */
+	__u64		     oic_last_attempt;
+};
+
+/* state history */
+/* Number of entries in obd_import::imp_state_hist. */
+#define IMP_STATE_HIST_LEN 16
+/* One entry of the import state history: a state and a timestamp. */
+struct import_state_hist {
+	enum lustre_imp_state ish_state;
+	time_t		ish_time;
+};
+
+/**
+ * Definition of PortalRPC import structure.
+ * Imports represent the client-side view of a remote target.
+ */
+struct obd_import {
+	/** Local handle (== id) for this import. */
+	struct portals_handle     imp_handle;
+	/** Reference counter */
+	atomic_t	      imp_refcount;
+	struct lustre_handle      imp_dlm_handle; /* client's ldlm export */
+	/** Currently active connection */
+	struct ptlrpc_connection *imp_connection;
+	/** PortalRPC client structure for this import */
+	struct ptlrpc_client     *imp_client;
+	/** List element for linking into pinger chain */
+	struct list_head		imp_pinger_chain;
+	/** List element for linking into chain for destruction */
+	struct list_head		imp_zombie_chain;
+
+	/**
+	 * Lists of requests that are retained for replay, waiting for a reply,
+	 * or waiting for recovery to complete, respectively.
+	 * @{
+	 */
+	struct list_head		imp_replay_list;
+	struct list_head		imp_sending_list;
+	struct list_head		imp_delayed_list;
+	/** @} */
+
+	/** obd device for this import */
+	struct obd_device	*imp_obd;
+
+	/**
+	 * some security-related fields
+	 * @{
+	 */
+	struct ptlrpc_sec	*imp_sec;
+	struct mutex		  imp_sec_mutex;
+	cfs_time_t		imp_sec_expire;
+	/** @} */
+
+	/** Wait queue for those who need to wait for recovery completion */
+	wait_queue_head_t	       imp_recovery_waitq;
+
+	/** Number of requests currently in-flight */
+	atomic_t	      imp_inflight;
+	/** Number of requests currently unregistering */
+	atomic_t	      imp_unregistering;
+	/** Number of replay requests inflight */
+	atomic_t	      imp_replay_inflight;
+	/** Number of currently happening import invalidations */
+	atomic_t	      imp_inval_count;
+	/** Number of request timeouts */
+	atomic_t	      imp_timeouts;
+	/** Current import state */
+	enum lustre_imp_state     imp_state;
+	/** History of import states */
+	struct import_state_hist  imp_state_hist[IMP_STATE_HIST_LEN];
+	int		       imp_state_hist_idx;
+	/** Current import generation. Incremented on every reconnect */
+	int		       imp_generation;
+	/** Incremented every time we send reconnection request */
+	__u32		     imp_conn_cnt;
+       /**
+	* \see ptlrpc_free_committed remembers imp_generation value here
+	* after a check to save on unnecessary replay list iterations
+	*/
+	int		       imp_last_generation_checked;
+	/** Last transno we replayed */
+	__u64		     imp_last_replay_transno;
+	/** Last transno committed on remote side */
+	__u64		     imp_peer_committed_transno;
+	/**
+	 * \see ptlrpc_free_committed remembers last_transno since its last
+	 * check here and if last_transno did not change since last run of
+	 * ptlrpc_free_committed and import generation is the same, we can
+	 * skip looking for requests to remove from replay list as optimisation
+	 */
+	__u64		     imp_last_transno_checked;
+	/**
+	 * Remote export handle. This is how remote side knows what export
+	 * we are talking to. Filled from response to connect request
+	 */
+	struct lustre_handle      imp_remote_handle;
+	/** When to perform next ping. time in jiffies. */
+	cfs_time_t		imp_next_ping;
+	/** When we last successfully connected. time in 64bit jiffies */
+	__u64		     imp_last_success_conn;
+
+	/** List of all possible connection for import. */
+	struct list_head		imp_conn_list;
+	/**
+	 * Current connection. \a imp_connection is imp_conn_current->oic_conn
+	 */
+	struct obd_import_conn   *imp_conn_current;
+
+	/** Protects flags, level, generation, conn_cnt, *_list */
+	spinlock_t		  imp_lock;
+
+	/* flags */
+	unsigned long	     imp_no_timeout:1, /* timeouts are disabled */
+				  imp_invalid:1,    /* evicted */
+				  /* administratively disabled */
+				  imp_deactive:1,
+				  /* try to recover the import */
+				  imp_replayable:1,
+				  /* don't run recovery (timeout instead) */
+				  imp_dlm_fake:1,
+				  /* use 1/2 timeout on MDS' OSCs */
+				  imp_server_timeout:1,
+				  /* VBR: imp in delayed recovery */
+				  imp_delayed_recovery:1,
+				  /* VBR: if gap was found then no lock replays
+				   */
+				  imp_no_lock_replay:1,
+				  /* recovery by versions was failed */
+				  imp_vbr_failed:1,
+				  /* force an immediate ping */
+				  imp_force_verify:1,
+				  /* force a scheduled ping */
+				  imp_force_next_verify:1,
+				  /* pingable */
+				  imp_pingable:1,
+				  /* resend for replay */
+				  imp_resend_replay:1,
+				  /* disable normal recovery, for test only. */
+				  imp_no_pinger_recover:1,
+				  /* need IR MNE swab */
+				  imp_need_mne_swab:1,
+				  /* import must be reconnected instead of
+				   * choosing a new connection */
+				  imp_force_reconnect:1,
+				  /* import has tried to connect with server */
+				  imp_connect_tried:1;
+	__u32		     imp_connect_op;
+	struct obd_connect_data   imp_connect_data;
+	__u64		     imp_connect_flags_orig;
+	int		       imp_connect_error;
+
+	__u32		     imp_msg_magic;
+	__u32		     imp_msghdr_flags;       /* adjusted based on server capability */
+
+	struct ptlrpc_request_pool *imp_rq_pool;	  /* emergency request pool */
+
+	struct imp_at	     imp_at;		 /* adaptive timeout data */
+	time_t		    imp_last_reply_time;    /* for health check */
+};
+
+typedef void (*obd_import_callback)(struct obd_import *imp, void *closure,
+				    int event, void *event_arg, void *cb_data);
+
+/**
+ * Structure for an import observer.
+ * It is possible to register an "observer" on an import, and every time
+ * something happens to the import (like connect/evict/disconnect)
+ * the observer will get its callback called with the event type.
+ */
+struct obd_import_observer {
+	struct list_head	   oio_chain;
+	obd_import_callback  oio_cb;
+	void		*oio_cb_data;
+};
+
+void class_observe_import(struct obd_import *imp, obd_import_callback cb,
+			  void *cb_data);
+void class_unobserve_import(struct obd_import *imp, obd_import_callback cb,
+			    void *cb_data);
+void class_notify_import_observers(struct obd_import *imp, int event,
+				   void *event_arg);
+
+/* import.c */
+/*
+ * Turn a service estimate into a timeout: 125% of \a val plus an arbitrary
+ * 5 second minimum.
+ */
+static inline unsigned int at_est2timeout(unsigned int val)
+{
+	unsigned int quarter = val >> 2;
+
+	return val + quarter + 5;
+}
+
+/*
+ * Restore the estimate from a timeout: e = 4/5 * (t - 5), i.e. 4t/5 - 4.
+ * 4t/5 is clamped to at least 5, so the result is never below 1.
+ * \a val must be non-zero (asserted).
+ */
+static inline unsigned int at_timeout2est(unsigned int val)
+{
+	unsigned int est;
+
+	LASSERT(val);
+
+	est = (val << 2) / 5;
+	if (est < 5U)
+		est = 5U;
+
+	return est - 4;
+}
+
+/*
+ * Set both the current and the worst-ever timeout to \a val and stamp the
+ * worst-ever time with cfs_time_current_sec().  No lock is taken here.
+ */
+static inline void at_reset(struct adaptive_timeout *at, int val)
+{
+	at->at_worst_time = cfs_time_current_sec();
+	at->at_worst_ever = val;
+	at->at_current = val;
+}
+/*
+ * Initialise \a at from scratch: zero the whole structure, set up its lock,
+ * record \a flags and reset the timeout values to \a val via at_reset().
+ */
+static inline void at_init(struct adaptive_timeout *at, int val, int flags)
+{
+	memset(at, 0, sizeof(*at));
+	at->at_flags = flags;
+	spin_lock_init(&at->at_lock);
+	at_reset(at, val);
+}
+extern unsigned int at_min;
+/* Current timeout value of \a at, but never less than the global at_min. */
+static inline int at_get(struct adaptive_timeout *at)
+{
+	if (at->at_current > at_min)
+		return at->at_current;
+
+	return at_min;
+}
+int at_measured(struct adaptive_timeout *at, unsigned int val);
+int import_at_get_index(struct obd_import *imp, int portal);
+extern unsigned int at_max;
+#define AT_OFF (at_max == 0)
+
+/* genops.c */
+struct obd_export;
+extern struct obd_import *class_exp2cliimp(struct obd_export *);
+extern struct obd_import *class_conn2cliimp(struct lustre_handle *);
+
+/** @} import */
+
+#endif /* __IMPORT_H */
+
+/** @} obd_import */

diff --git a/drivers/staging/lustre/lustre/include/lustre_lib.h b/drivers/staging/lustre/lustre/include/lustre_lib.h
new file mode 100644
index 0000000..bdfc539
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_lib.h

@@ -0,0 +1,667 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre_lib.h
+ *
+ * Basic Lustre library routines.
+ */
+
+#ifndef _LUSTRE_LIB_H
+#define _LUSTRE_LIB_H
+
+/** \defgroup lib lib
+ *
+ * @{
+ */
+
+#include <linux/libcfs/libcfs.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_ver.h>
+#include <lustre_cfg.h>
+#include <linux/lustre_lib.h>
+
+/* target.c */
+struct ptlrpc_request;
+struct obd_export;
+struct lu_target;
+struct l_wait_info;
+#include <lustre_ha.h>
+#include <lustre_net.h>
+#include <lvfs.h>
+
+
+int target_pack_pool_reply(struct ptlrpc_request *req);
+int do_set_info_async(struct obd_import *imp,
+		      int opcode, int version,
+		      obd_count keylen, void *key,
+		      obd_count vallen, void *val,
+		      struct ptlrpc_request_set *set);
+
+#define OBD_RECOVERY_MAX_TIME (obd_timeout * 18) /* b13079 */
+#define OBD_MAX_IOCTL_BUFFER CONFIG_LUSTRE_OBD_MAX_IOCTL_BUFFER
+
+void target_send_reply(struct ptlrpc_request *req, int rc, int fail_id);
+
+/* client.c */
+
+int client_sanobd_setup(struct obd_device *obddev, struct lustre_cfg* lcfg);
+struct client_obd *client_conn2cli(struct lustre_handle *conn);
+
+struct md_open_data;
+struct obd_client_handle {
+	struct lustre_handle  och_fh;
+	struct lu_fid	 och_fid;
+	struct md_open_data  *och_mod;
+	__u32 och_magic;
+	int och_flags;
+};
+#define OBD_CLIENT_HANDLE_MAGIC 0xd15ea5ed
+
+/* statfs_pack.c */
+void statfs_pack(struct obd_statfs *osfs, struct kstatfs *sfs);
+void statfs_unpack(struct kstatfs *sfs, struct obd_statfs *osfs);
+
+/* l_lock.c */
+struct lustre_lock {
+	int			l_depth;
+	task_t		*l_owner;
+	struct semaphore	l_sem;
+	spinlock_t		l_spin;
+};
+
+void l_lock_init(struct lustre_lock *);
+void l_lock(struct lustre_lock *);
+void l_unlock(struct lustre_lock *);
+int l_has_lock(struct lustre_lock *);
+
+/*
+ * For md echo client
+ */
+enum md_echo_cmd {
+	ECHO_MD_CREATE       = 1, /* Open/Create file on MDT */
+	ECHO_MD_MKDIR	= 2, /* Mkdir on MDT */
+	ECHO_MD_DESTROY      = 3, /* Unlink file on MDT */
+	ECHO_MD_RMDIR	= 4, /* Rmdir on MDT */
+	ECHO_MD_LOOKUP       = 5, /* Lookup on MDT */
+	ECHO_MD_GETATTR      = 6, /* Getattr on MDT */
+	ECHO_MD_SETATTR      = 7, /* Setattr on MDT */
+	ECHO_MD_ALLOC_FID    = 8, /* Get FIDs from MDT */
+};
+
+/*
+ *   OBD IOCTLS
+ */
+#define OBD_IOCTL_VERSION 0x00010004
+
+struct obd_ioctl_data {
+	__u32 ioc_len;
+	__u32 ioc_version;
+
+	union {
+		__u64 ioc_cookie;
+		__u64 ioc_u64_1;
+	};
+	union {
+		__u32 ioc_conn1;
+		__u32 ioc_u32_1;
+	};
+	union {
+		__u32 ioc_conn2;
+		__u32 ioc_u32_2;
+	};
+
+	struct obdo ioc_obdo1;
+	struct obdo ioc_obdo2;
+
+	obd_size ioc_count;
+	obd_off  ioc_offset;
+	__u32    ioc_dev;
+	__u32    ioc_command;
+
+	__u64 ioc_nid;
+	__u32 ioc_nal;
+	__u32 ioc_type;
+
+	/* buffers the kernel will treat as user pointers */
+	__u32  ioc_plen1;
+	char  *ioc_pbuf1;
+	__u32  ioc_plen2;
+	char  *ioc_pbuf2;
+
+	/* inline buffers for various arguments */
+	__u32  ioc_inllen1;
+	char  *ioc_inlbuf1;
+	__u32  ioc_inllen2;
+	char  *ioc_inlbuf2;
+	__u32  ioc_inllen3;
+	char  *ioc_inlbuf3;
+	__u32  ioc_inllen4;
+	char  *ioc_inlbuf4;
+
+	char    ioc_bulk[0];
+};
+
+struct obd_ioctl_hdr {
+	__u32 ioc_len;
+	__u32 ioc_version;
+};
+
+static inline int obd_ioctl_packlen(struct obd_ioctl_data *data)
+{
+	/* fixed-size descriptor plus the four inline buffers, each rounded */
+	return cfs_size_round(sizeof(struct obd_ioctl_data)) +
+	       cfs_size_round(data->ioc_inllen1) +
+	       cfs_size_round(data->ioc_inllen2) +
+	       cfs_size_round(data->ioc_inllen3) +
+	       cfs_size_round(data->ioc_inllen4);
+}
+
+/* Validate lengths and pointers in @data: returns 1 (after CERROR) or 0. */
+static inline int obd_ioctl_is_invalid(struct obd_ioctl_data *data)
+{
+	if (data->ioc_len > (1<<30)) {
+		CERROR("OBD ioctl: ioc_len larger than 1<<30\n");
+		return 1;
+	}
+	if (data->ioc_inllen1 > (1<<30)) {
+		CERROR("OBD ioctl: ioc_inllen1 larger than 1<<30\n");
+		return 1;
+	}
+	if (data->ioc_inllen2 > (1<<30)) {
+		CERROR("OBD ioctl: ioc_inllen2 larger than 1<<30\n");
+		return 1;
+	}
+	if (data->ioc_inllen3 > (1<<30)) {
+		CERROR("OBD ioctl: ioc_inllen3 larger than 1<<30\n");
+		return 1;
+	}
+	if (data->ioc_inllen4 > (1<<30)) {
+		CERROR("OBD ioctl: ioc_inllen4 larger than 1<<30\n");
+		return 1;
+	}
+	if (data->ioc_inlbuf1 && !data->ioc_inllen1) {
+		CERROR("OBD ioctl: inlbuf1 pointer but 0 length\n");
+		return 1;
+	}
+	if (data->ioc_inlbuf2 && !data->ioc_inllen2) {
+		CERROR("OBD ioctl: inlbuf2 pointer but 0 length\n");
+		return 1;
+	}
+	if (data->ioc_inlbuf3 && !data->ioc_inllen3) {
+		CERROR("OBD ioctl: inlbuf3 pointer but 0 length\n");
+		return 1;
+	}
+	if (data->ioc_inlbuf4 && !data->ioc_inllen4) {
+		CERROR("OBD ioctl: inlbuf4 pointer but 0 length\n");
+		return 1;
+	}
+	if (data->ioc_pbuf1 && !data->ioc_plen1) {
+		CERROR("OBD ioctl: pbuf1 pointer but 0 length\n");
+		return 1;
+	}
+	if (data->ioc_pbuf2 && !data->ioc_plen2) {
+		CERROR("OBD ioctl: pbuf2 pointer but 0 length\n");
+		return 1;
+	}
+	if (data->ioc_plen1 && !data->ioc_pbuf1) {
+		CERROR("OBD ioctl: plen1 set but NULL pointer\n");
+		return 1;
+	}
+	if (data->ioc_plen2 && !data->ioc_pbuf2) {
+		CERROR("OBD ioctl: plen2 set but NULL pointer\n");
+		return 1;
+	}
+	if (obd_ioctl_packlen(data) > data->ioc_len) {
+		CERROR("OBD ioctl: packlen exceeds ioc_len (%d > %d)\n",
+		       obd_ioctl_packlen(data), data->ioc_len);
+		return 1;
+	}
+	return 0;
+}
+
+
+#include <obd_support.h>
+
+/* function defined in lustre/obdclass/<platform>/<platform>-module.c */
+int obd_ioctl_getdata(char **buf, int *len, void *arg);
+int obd_ioctl_popdata(void *arg, void *data, int len);
+
+static inline void obd_ioctl_freedata(char *buf, int len)
+{
+	ENTRY;
+
+	/* hand the buffer and its length to OBD_FREE_LARGE */
+	OBD_FREE_LARGE(buf, len);
+	EXIT;
+}
+
+/*
+ * BSD ioctl description:
+ * #define IOC_V1       _IOR(g, n1, long)
+ * #define IOC_V2       _IOW(g, n2, long)
+ *
+ * ioctl(f, IOC_V1, arg);
+ * arg will be treated as a long value,
+ *
+ * ioctl(f, IOC_V2, arg)
+ * arg will be treated as a pointer, bsd will call
+ * copyin(buf, arg, sizeof(long))
+ *
+ * To make BSD ioctl handle the argument correctly and simply,
+ * we change _IOR to _IOWR so BSD will copyin obd_ioctl_data
+ * for us. Does this change affect Linux?  (XXX Liang)
+ */
+#define OBD_IOC_CREATE		 _IOWR('f', 101, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_DESTROY		_IOW ('f', 104, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_PREALLOCATE	    _IOWR('f', 105, OBD_IOC_DATA_TYPE)
+
+#define OBD_IOC_SETATTR		_IOW ('f', 107, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_GETATTR		_IOWR ('f', 108, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_READ		   _IOWR('f', 109, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_WRITE		  _IOWR('f', 110, OBD_IOC_DATA_TYPE)
+
+
+#define OBD_IOC_STATFS		 _IOWR('f', 113, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_SYNC		   _IOW ('f', 114, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_READ2		  _IOWR('f', 115, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_FORMAT		 _IOWR('f', 116, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_PARTITION	      _IOWR('f', 117, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_COPY		   _IOWR('f', 120, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_MIGR		   _IOWR('f', 121, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_PUNCH		  _IOWR('f', 122, OBD_IOC_DATA_TYPE)
+
+#define OBD_IOC_MODULE_DEBUG	   _IOWR('f', 124, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_BRW_READ	       _IOWR('f', 125, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_BRW_WRITE	      _IOWR('f', 126, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_NAME2DEV	       _IOWR('f', 127, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_UUID2DEV	       _IOWR('f', 130, OBD_IOC_DATA_TYPE)
+
+#define OBD_IOC_GETNAME		_IOWR('f', 131, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_GETMDNAME	      _IOR('f', 131, char[MAX_OBD_NAME])
+#define OBD_IOC_GETDTNAME	       OBD_IOC_GETNAME
+
+#define OBD_IOC_LOV_GET_CONFIG	 _IOWR('f', 132, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_CLIENT_RECOVER	 _IOW ('f', 133, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_PING_TARGET	    _IOW ('f', 136, OBD_IOC_DATA_TYPE)
+
+#define OBD_IOC_DEC_FS_USE_COUNT       _IO  ('f', 139      )
+#define OBD_IOC_NO_TRANSNO	     _IOW ('f', 140, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_SET_READONLY	   _IOW ('f', 141, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_ABORT_RECOVERY	 _IOR ('f', 142, OBD_IOC_DATA_TYPE)
+
+#define OBD_IOC_ROOT_SQUASH	    _IOWR('f', 143, OBD_IOC_DATA_TYPE)
+
+#define OBD_GET_VERSION		_IOWR ('f', 144, OBD_IOC_DATA_TYPE)
+
+#define OBD_IOC_GSS_SUPPORT	    _IOWR('f', 145, OBD_IOC_DATA_TYPE)
+
+#define OBD_IOC_CLOSE_UUID	     _IOWR ('f', 147, OBD_IOC_DATA_TYPE)
+
+#define OBD_IOC_CHANGELOG_SEND	 _IOW ('f', 148, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_GETDEVICE	      _IOWR ('f', 149, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_FID2PATH	       _IOWR ('f', 150, OBD_IOC_DATA_TYPE)
+/* see also <lustre/lustre_user.h> for ioctls 151-153 */
+/* OBD_IOC_LOV_SETSTRIPE: See also LL_IOC_LOV_SETSTRIPE */
+#define OBD_IOC_LOV_SETSTRIPE	  _IOW ('f', 154, OBD_IOC_DATA_TYPE)
+/* OBD_IOC_LOV_GETSTRIPE: See also LL_IOC_LOV_GETSTRIPE */
+#define OBD_IOC_LOV_GETSTRIPE	  _IOW ('f', 155, OBD_IOC_DATA_TYPE)
+/* OBD_IOC_LOV_SETEA: See also LL_IOC_LOV_SETEA */
+#define OBD_IOC_LOV_SETEA	      _IOW ('f', 156, OBD_IOC_DATA_TYPE)
+/* see <lustre/lustre_user.h> for ioctls 157-159 */
+/* OBD_IOC_QUOTACHECK: See also LL_IOC_QUOTACHECK */
+#define OBD_IOC_QUOTACHECK	     _IOW ('f', 160, int)
+/* OBD_IOC_POLL_QUOTACHECK: See also LL_IOC_POLL_QUOTACHECK */
+#define OBD_IOC_POLL_QUOTACHECK	_IOR ('f', 161, struct if_quotacheck *)
+/* OBD_IOC_QUOTACTL: See also LL_IOC_QUOTACTL */
+#define OBD_IOC_QUOTACTL	       _IOWR('f', 162, struct if_quotactl)
+/* see  also <lustre/lustre_user.h> for ioctls 163-176 */
+#define OBD_IOC_CHANGELOG_REG	  _IOW ('f', 177, struct obd_ioctl_data)
+#define OBD_IOC_CHANGELOG_DEREG	_IOW ('f', 178, struct obd_ioctl_data)
+#define OBD_IOC_CHANGELOG_CLEAR	_IOW ('f', 179, struct obd_ioctl_data)
+#define OBD_IOC_RECORD		 _IOWR('f', 180, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_ENDRECORD	      _IOWR('f', 181, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_PARSE		  _IOWR('f', 182, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_DORECORD	       _IOWR('f', 183, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_PROCESS_CFG	    _IOWR('f', 184, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_DUMP_LOG	       _IOWR('f', 185, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_CLEAR_LOG	      _IOWR('f', 186, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_PARAM		  _IOW ('f', 187, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_POOL		   _IOWR('f', 188, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_REPLACE_NIDS	   _IOWR('f', 189, OBD_IOC_DATA_TYPE)
+
+#define OBD_IOC_CATLOGLIST	     _IOWR('f', 190, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_LLOG_INFO	      _IOWR('f', 191, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_LLOG_PRINT	     _IOWR('f', 192, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_LLOG_CANCEL	    _IOWR('f', 193, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_LLOG_REMOVE	    _IOWR('f', 194, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_LLOG_CHECK	     _IOWR('f', 195, OBD_IOC_DATA_TYPE)
+/* OBD_IOC_LLOG_CATINFO is deprecated */
+#define OBD_IOC_LLOG_CATINFO	   _IOWR('f', 196, OBD_IOC_DATA_TYPE)
+
+#define ECHO_IOC_GET_STRIPE	    _IOWR('f', 200, OBD_IOC_DATA_TYPE)
+#define ECHO_IOC_SET_STRIPE	    _IOWR('f', 201, OBD_IOC_DATA_TYPE)
+#define ECHO_IOC_ENQUEUE	       _IOWR('f', 202, OBD_IOC_DATA_TYPE)
+#define ECHO_IOC_CANCEL		_IOWR('f', 203, OBD_IOC_DATA_TYPE)
+
+#define OBD_IOC_GET_OBJ_VERSION	_IOR('f', 210, OBD_IOC_DATA_TYPE)
+
+/* <lustre/lustre_user.h> defines ioctl number 218-219 */
+#define OBD_IOC_GET_MNTOPT	     _IOW('f', 220, mntopt_t)
+
+#define OBD_IOC_ECHO_MD		_IOR('f', 221, struct obd_ioctl_data)
+#define OBD_IOC_ECHO_ALLOC_SEQ	 _IOWR('f', 222, struct obd_ioctl_data)
+
+#define OBD_IOC_START_LFSCK	       _IOWR('f', 230, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_STOP_LFSCK	       _IOW('f', 231, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_PAUSE_LFSCK	       _IOW('f', 232, OBD_IOC_DATA_TYPE)
+
+/* XXX _IOWR('f', 250, long) has been defined in
+ * libcfs/include/libcfs/libcfs_private.h for debug, don't use it
+ */
+
+/* Until such time as we get_info the per-stripe maximum from the OST,
+ * we define this to be 2T - 4k, which is the ext3 maxbytes. */
+#define LUSTRE_STRIPE_MAXBYTES 0x1fffffff000ULL
+
+/* True when (size, count, offset) is the "remove LOV EA from disk" triple */
+#define LOVEA_DELETE_VALUES(size, count, offset)			\
+	((size) == 0 && (count) == 0 && (offset) == (typeof(offset))(-1))
+
+/* #define POISON_BULK 0 */
+
+/*
+ * l_wait_event is a flexible sleeping function, permitting simple caller
+ * configuration of interrupt and timeout sensitivity along with actions to
+ * be performed in the event of either exception.
+ *
+ * The first form of usage looks like this:
+ *
+ * struct l_wait_info lwi = LWI_TIMEOUT_INTR(timeout, timeout_handler,
+ *					   intr_handler, callback_data);
+ * rc = l_wait_event(waitq, condition, &lwi);
+ *
+ * l_wait_event() makes the current process wait on 'waitq' until 'condition'
+ * is TRUE or a "killable" signal (SIGTERM, SIGKILL, SIGINT) is pending.  It
+ * returns 0 to signify 'condition' is TRUE, but if a signal wakes it before
+ * 'condition' becomes true, it optionally calls the specified 'intr_handler'
+ * if not NULL, and returns -EINTR.
+ *
+ * If a non-zero timeout is specified, signals are ignored until the timeout
+ * has expired.  At this time, if 'timeout_handler' is not NULL it is called.
+ * If it returns FALSE l_wait_event() continues to wait as described above with
+ * signals enabled.  Otherwise it returns -ETIMEDOUT.
+ *
+ * LWI_INTR(intr_handler, callback_data) is shorthand for
+ * LWI_TIMEOUT_INTR(0, NULL, intr_handler, callback_data)
+ *
+ * The second form of usage looks like this:
+ *
+ * struct l_wait_info lwi = LWI_TIMEOUT(timeout, timeout_handler);
+ * rc = l_wait_event(waitq, condition, &lwi);
+ *
+ * This form is the same as the first except that it COMPLETELY IGNORES
+ * SIGNALS.  The caller must therefore beware that if 'timeout' is zero, or if
+ * 'timeout_handler' is not NULL and returns FALSE, then the ONLY thing that
+ * can unblock the current process is 'condition' becoming TRUE.
+ *
+ * Another form of usage is:
+ * struct l_wait_info lwi = LWI_TIMEOUT_INTERVAL(timeout, interval,
+ *					       timeout_handler);
+ * rc = l_wait_event(waitq, condition, &lwi);
+ * This is the same as previous case, but condition is checked once every
+ * 'interval' jiffies (if non-zero).
+ *
+ * Subtle synchronization point: this macro does *not* necessarily take
+ * wait-queue spin-lock before returning, and, hence, following idiom is safe
+ * ONLY when caller provides some external locking:
+ *
+ *	     Thread1			    Thread2
+ *
+ *   l_wait_event(&obj->wq, ....);				       (1)
+ *
+ *				    wake_up(&obj->wq):		 (2)
+ *					 spin_lock(&q->lock);	  (2.1)
+ *					 __wake_up_common(q, ...);     (2.2)
+ *					 spin_unlock(&q->lock, flags); (2.3)
+ *
+ *   OBD_FREE_PTR(obj);						  (3)
+ *
+ * As l_wait_event() may "short-cut" execution and return without taking
+ * wait-queue spin-lock, some additional synchronization is necessary to
+ * guarantee that step (3) can begin only after (2.3) finishes.
+ *
+ * XXX nikita: some ptlrpc daemon threads have races of that sort.
+ *
+ */
+static inline int back_to_sleep(void *arg)
+{
+	return 0;	/* as lwi_on_timeout: 0 means keep waiting */
+}
+
+#define LWI_ON_SIGNAL_NOOP ((void (*)(void *))(-1))
+
+struct l_wait_info {
+	cfs_duration_t lwi_timeout;	/* total wait budget; 0 = no timeout */
+	cfs_duration_t lwi_interval;	/* if non-zero, caps each timed sleep */
+	int	    lwi_allow_intr;	/* signals may interrupt despite timeout */
+	int  (*lwi_on_timeout)(void *);	/* NULL or non-zero return => -ETIMEDOUT */
+	void (*lwi_on_signal)(void *);	/* run before returning -EINTR */
+	void  *lwi_cb_data;		/* argument passed to both callbacks */
+};
+
+/* NB: LWI_TIMEOUT ignores signals completely */
+#define LWI_TIMEOUT(time, cb, data)	     \
+((struct l_wait_info) {			 \
+	.lwi_timeout    = time,		 \
+	.lwi_on_timeout = cb,		   \
+	.lwi_cb_data    = data,		 \
+	.lwi_interval   = 0,		    \
+	.lwi_allow_intr = 0		     \
+})
+
+#define LWI_TIMEOUT_INTERVAL(time, interval, cb, data)  \
+((struct l_wait_info) {				 \
+	.lwi_timeout    = time,			 \
+	.lwi_on_timeout = cb,			   \
+	.lwi_cb_data    = data,			 \
+	.lwi_interval   = interval,		     \
+	.lwi_allow_intr = 0			     \
+})
+
+#define LWI_TIMEOUT_INTR(time, time_cb, sig_cb, data)   \
+((struct l_wait_info) {				 \
+	.lwi_timeout    = time,			 \
+	.lwi_on_timeout = time_cb,		      \
+	.lwi_on_signal  = sig_cb,		       \
+	.lwi_cb_data    = data,			 \
+	.lwi_interval   = 0,			    \
+	.lwi_allow_intr = 0			     \
+})
+
+#define LWI_TIMEOUT_INTR_ALL(time, time_cb, sig_cb, data)       \
+((struct l_wait_info) {					 \
+	.lwi_timeout    = time,				 \
+	.lwi_on_timeout = time_cb,			      \
+	.lwi_on_signal  = sig_cb,			       \
+	.lwi_cb_data    = data,				 \
+	.lwi_interval   = 0,				    \
+	.lwi_allow_intr = 1				     \
+})
+
+#define LWI_INTR(cb, data)  LWI_TIMEOUT_INTR(0, NULL, cb, data)
+
+
+/*
+ * Wait for @condition to become true, but no longer than the timeout given by
+ * @info.  Sets @ret to 0, -ETIMEDOUT or -EINTR; @l_add_wait queues the waiter.
+ */
+#define __l_wait_event(wq, condition, info, ret, l_add_wait)		   \
+do {									   \
+	wait_queue_t __wait;						 \
+	cfs_duration_t __timeout = info->lwi_timeout;			  \
+	sigset_t   __blocked;					      \
+	int   __allow_intr = info->lwi_allow_intr;			     \
+									       \
+	ret = 0;							       \
+	if (condition)							 \
+		break;							 \
+									       \
+	init_waitqueue_entry_current(&__wait);					    \
+	l_add_wait(&wq, &__wait);					      \
+									       \
+	/* Block all signals (just the non-fatal ones if no timeout). */       \
+	if (info->lwi_on_signal != NULL && (__timeout == 0 || __allow_intr))   \
+		__blocked = cfs_block_sigsinv(LUSTRE_FATAL_SIGS);	      \
+	else								   \
+		__blocked = cfs_block_sigsinv(0);			      \
+									       \
+	for (;;) {							     \
+		unsigned       __wstate;				       \
+									       \
+		__wstate = info->lwi_on_signal != NULL &&		      \
+			   (__timeout == 0 || __allow_intr) ?		  \
+			TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE;	       \
+									       \
+		set_current_state(TASK_INTERRUPTIBLE);		 \
+									       \
+		if (condition)						 \
+			break;						 \
+									       \
+		if (__timeout == 0) {					  \
+			waitq_wait(&__wait, __wstate);		     \
+		} else {						       \
+			cfs_duration_t interval = info->lwi_interval?	  \
+					     min_t(cfs_duration_t,	     \
+						 info->lwi_interval,__timeout):\
+					     __timeout;			\
+			cfs_duration_t remaining = waitq_timedwait(&__wait,\
+						   __wstate,		   \
+						   interval);		  \
+			__timeout = cfs_time_sub(__timeout,		    \
+					    cfs_time_sub(interval, remaining));\
+			if (__timeout == 0) {				  \
+				if (info->lwi_on_timeout == NULL ||	    \
+				    info->lwi_on_timeout(info->lwi_cb_data)) { \
+					ret = -ETIMEDOUT;		      \
+					break;				 \
+				}					      \
+				/* Take signals after the timeout expires. */  \
+				if (info->lwi_on_signal != NULL)	       \
+				    (void)cfs_block_sigsinv(LUSTRE_FATAL_SIGS);\
+			}						      \
+		}							      \
+									       \
+		if (condition)						 \
+			break;						 \
+		if (cfs_signal_pending()) {				    \
+			if (info->lwi_on_signal != NULL &&		     \
+			    (__timeout == 0 || __allow_intr)) {		\
+				if (info->lwi_on_signal != LWI_ON_SIGNAL_NOOP) \
+					info->lwi_on_signal(info->lwi_cb_data);\
+				ret = -EINTR;				  \
+				break;					 \
+			}						      \
+			/* We have to do this here because some signals */     \
+			/* are not blockable - ie from strace(1).       */     \
+			/* In these cases we want to schedule_timeout() */     \
+			/* again, because we don't want that to return  */     \
+			/* -EINTR when the RPC actually succeeded.      */     \
+			/* the recalc_sigpending() below will deliver the */     \
+			/* signal properly.			     */     \
+			cfs_clear_sigpending();				\
+		}							      \
+	}								      \
+									       \
+	cfs_restore_sigs(__blocked);					   \
+									       \
+	set_current_state(TASK_RUNNING);			       \
+	remove_wait_queue(&wq, &__wait);					   \
+} while (0)
+
+
+
+#define l_wait_event(wq, condition, info)		       \
+({							      \
+	int		 __ret;			      \
+	struct l_wait_info *__info = (info);		    \
+								\
+	__l_wait_event(wq, condition, __info,		   \
+		       __ret, add_wait_queue);		   \
+	__ret;						  \
+})
+
+#define l_wait_event_exclusive(wq, condition, info)	     \
+({							      \
+	int		 __ret;			      \
+	struct l_wait_info *__info = (info);		    \
+								\
+	__l_wait_event(wq, condition, __info,		   \
+		       __ret, add_wait_queue_exclusive);	 \
+	__ret;						  \
+})
+
+#define l_wait_event_exclusive_head(wq, condition, info)	\
+({							      \
+	int		 __ret;			      \
+	struct l_wait_info *__info = (info);		    \
+								\
+	__l_wait_event(wq, condition, __info,		   \
+		       __ret, add_wait_queue_exclusive_head);    \
+	__ret;						  \
+})
+
+#define l_wait_condition(wq, condition)			 \
+({							      \
+	struct l_wait_info lwi = { 0 };			 \
+	l_wait_event(wq, condition, &lwi);		      \
+})
+
+#define l_wait_condition_exclusive(wq, condition)	       \
+({							      \
+	struct l_wait_info lwi = { 0 };			 \
+	l_wait_event_exclusive(wq, condition, &lwi);	    \
+})
+
+#define l_wait_condition_exclusive_head(wq, condition)	  \
+({							      \
+	struct l_wait_info lwi = { 0 };			 \
+	l_wait_event_exclusive_head(wq, condition, &lwi);       \
+})
+
+#define LIBLUSTRE_CLIENT (0)
+
+/** @} lib */
+
+#endif /* _LUSTRE_LIB_H */

diff --git a/drivers/staging/lustre/lustre/include/lustre_linkea.h b/drivers/staging/lustre/lustre/include/lustre_linkea.h
new file mode 100644
index 0000000..5790be9
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_linkea.h

@@ -0,0 +1,57 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2013, Intel Corporation.
+ * Use is subject to license terms.
+ *
+ * Author: di wang <di.wang@intel.com>
+ */
+
+struct linkea_data {
+	/**
+	 * Buffer to keep link EA body.
+	 */
+	struct lu_buf		*ld_buf;
+	/**
+	 * The matched header, entry and its length in the EA
+	 */
+	struct link_ea_header	*ld_leh;
+	struct link_ea_entry	*ld_lee;
+	int			ld_reclen;
+};
+
+int linkea_data_new(struct linkea_data *ldata, struct lu_buf *buf);
+int linkea_init(struct linkea_data *ldata);
+void linkea_entry_unpack(const struct link_ea_entry *lee, int *reclen,
+			 struct lu_name *lname, struct lu_fid *pfid);
+int linkea_add_buf(struct linkea_data *ldata, const struct lu_name *lname,
+		   const struct lu_fid *pfid);
+void linkea_del_buf(struct linkea_data *ldata, const struct lu_name *lname);
+int linkea_links_find(struct linkea_data *ldata, const struct lu_name *lname,
+		      const struct lu_fid  *pfid);
+/* Entry after the current one: ld_lee advanced by ld_reclen bytes. */
+#define LINKEA_NEXT_ENTRY(ldata)	\
+	((struct link_ea_entry *)((char *)(ldata).ld_lee + (ldata).ld_reclen))
+/* First entry: the address just past the header that ld_leh points to. */
+#define LINKEA_FIRST_ENTRY(ldata)	\
+	((struct link_ea_entry *)((ldata).ld_leh + 1))

diff --git a/drivers/staging/lustre/lustre/include/lustre_lite.h b/drivers/staging/lustre/lustre/include/lustre_lite.h
new file mode 100644
index 0000000..25f8bfa
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_lite.h

@@ -0,0 +1,147 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LL_H
+#define _LL_H
+
+/** \defgroup lite lite
+ *
+ * @{
+ */
+
+#include <linux/lustre_lite.h>
+
+#include <obd_class.h>
+#include <obd_ost.h>
+#include <lustre_net.h>
+#include <lustre_mds.h>
+#include <lustre_ha.h>
+
+/* 4UL * 1024 * 1024 */
+#define LL_MAX_BLKSIZE_BITS     (22)
+#define LL_MAX_BLKSIZE	  (1UL<<LL_MAX_BLKSIZE_BITS)
+
+#include <lustre/lustre_user.h>
+
+
+struct lustre_rw_params {	/* filled by lustre_build_lock_params() */
+	int		lrp_lock_mode;	/* LCK_PR, LCK_PW or LCK_NL */
+	ldlm_policy_data_t lrp_policy;	/* l_extent holds the locked range */
+	obd_flag	   lrp_brw_flags;	/* 0 or OBD_BRW_SRVLOCK */
+	int		lrp_ast_flags;	/* LDLM_FL_BLOCK_NOWAIT or 0 */
+};
+
+/*
+ * XXX nikita: this function lives in the header because it is used by both
+ * llite kernel module and liblustre library, and there is no (?) better place
+ * to put it in.
+ */
+static inline void lustre_build_lock_params(int cmd, unsigned long open_flags,
+					    __u64 connect_flags,
+					    loff_t pos, ssize_t len,
+					    struct lustre_rw_params *params)
+{
+	/* Reads lock in PR mode, any other command in PW mode. */
+	if (cmd == OBD_BRW_READ)
+		params->lrp_lock_mode = LCK_PR;
+	else
+		params->lrp_lock_mode = LCK_PW;
+	params->lrp_brw_flags = 0;
+
+	if (cmd == OBD_BRW_WRITE && (open_flags & O_APPEND)) {
+		/* for now O_APPEND always takes local locks up to EOF */
+		params->lrp_policy.l_extent.start = 0;
+		params->lrp_policy.l_extent.end   = OBD_OBJECT_EOF;
+	} else {
+		/* lock exactly the byte range [pos, pos + len - 1] */
+		params->lrp_policy.l_extent.start = pos;
+		params->lrp_policy.l_extent.end = pos + len - 1;
+		if (LIBLUSTRE_CLIENT &&
+		    (connect_flags & OBD_CONNECT_SRVLOCK)) {
+			/* liblustre: OST-side locking for these I/Os */
+			params->lrp_lock_mode = LCK_NL;
+			params->lrp_brw_flags = OBD_BRW_SRVLOCK;
+		}
+		/* Otherwise nothing special for the kernel.  In the future
+		 * llite may use OST-side locks for small writes into highly
+		 * contended files. */
+	}
+	params->lrp_ast_flags = (open_flags & O_NONBLOCK) ?
+		LDLM_FL_BLOCK_NOWAIT : 0;
+}
+
+/*
+ * This is embedded into liblustre and llite super-blocks to keep track of
+ * connect flags (capabilities) supported by all imports given mount is
+ * connected to.
+ */
+struct lustre_client_ocd {
+	/*
+	 * This is conjunction of connect_flags across all imports (LOVs) this
+	 * mount is connected to. This field is updated by cl_ocd_update()
+	 * under ->lco_lock.
+	 */
+	__u64	      lco_flags;
+	struct mutex	   lco_lock;
+	struct obd_export *lco_md_exp;
+	struct obd_export *lco_dt_exp;
+};
+
+/*
+ * Chain of hash overflow pages.
+ */
+struct ll_dir_chain {
+	/* XXX something. Later */
+};
+
+static inline void ll_dir_chain_init(struct ll_dir_chain *chain)
+{
+}
+
+static inline void ll_dir_chain_fini(struct ll_dir_chain *chain)
+{
+}
+
+static inline unsigned long hash_x_index(__u64 hash, int hash64)
+{
+	/* With 32-bit longs a 64-bit hash contributes only its upper half. */
+	__u64 key = (BITS_PER_LONG == 32 && hash64) ? hash >> 32 : hash;
+	return ~0UL - key;
+}
+
+/** @} lite */
+
+#endif

diff --git a/drivers/staging/lustre/lustre/include/lustre_log.h b/drivers/staging/lustre/lustre/include/lustre_log.h
new file mode 100644
index 0000000..714ab37
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_log.h

@@ -0,0 +1,576 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre_log.h
+ *
+ * Generic infrastructure for managing a collection of logs.
+ * These logs are used for:
+ *
+ * - orphan recovery: OST adds record on create
+ * - mtime/size consistency: the OST adds a record on first write
+ * - open/unlinked objects: OST adds a record on destroy
+ *
+ * - mds unlink log: the MDS adds an entry upon delete
+ *
+ * - raid1 replication log between OST's
+ * - MDS replication logs
+ */
+
+#ifndef _LUSTRE_LOG_H
+#define _LUSTRE_LOG_H
+
+/** \defgroup log log
+ *
+ * @{
+ */
+
+#include <linux/lustre_log.h>
+
+#include <obd_class.h>
+#include <obd_ost.h>
+#include <lustre/lustre_idl.h>
+#include <dt_object.h>
+
+/*
+ * Format the string "LOGS/<name>" into logname. logname must be a char
+ * array (not a pointer) because its capacity is taken with sizeof().
+ * The macro expands to the snprintf() call itself, so it evaluates to
+ * snprintf()'s return value; truncation is not checked here.
+ */
+#define LOG_NAME_LIMIT(logname, name)		   \
+	snprintf(logname, sizeof(logname), "LOGS/%s", name)
+/* NOTE(review): presumably a private "log is empty" status code -- the
+ * users are not visible in this header, confirm before relying on it */
+#define LLOG_EEMPTY 4711
+
+/*
+ * Mode argument of llog_open() and of the lop_open() backend method.
+ * NOTE(review): judging by the names, LLOG_OPEN_EXISTS opens a log that is
+ * expected to exist and LLOG_OPEN_NEW one that is about to be created --
+ * confirm against the llog_open() implementation.
+ */
+enum llog_open_param {
+	LLOG_OPEN_EXISTS	= 0x0000,
+	LLOG_OPEN_NEW		= 0x0001,
+};
+
+/*
+ * Per-handle data of a plain (non-catalog) log; stored in the 'u' union of
+ * struct llog_handle.
+ */
+struct plain_handle_data {
+	/* NOTE(review): presumably links this log into the owning catalog's
+	 * chd_head list -- confirm */
+	struct list_head	  phd_entry;
+	/* handle of the catalog this log belongs to */
+	struct llog_handle *phd_cat_handle;
+	struct llog_cookie  phd_cookie; /* cookie of this log in its cat */
+};
+
+/*
+ * Per-handle data of a catalog log; stored in the 'u' union of
+ * struct llog_handle.
+ */
+struct cat_handle_data {
+	/* NOTE(review): presumably the list of plain log handles opened
+	 * through this catalog (see phd_entry) -- confirm */
+	struct list_head	      chd_head;
+	struct llog_handle     *chd_current_log; /* currently open log */
+	struct llog_handle	*chd_next_log; /* llog to be used next */
+};
+
+/**
+ * Translate log id \a id into FID \a fid.
+ *
+ * A non-zero lgl_ogen (inode generation) marks a logid written by a pre-OSD
+ * (~< 2.3.51) MDS; for compatibility such ids are turned into an IGIF built
+ * from the object id and the generation. Otherwise the sequence and object
+ * id are copied over and the FID version is set to 0.
+ */
+static inline void logid_to_fid(struct llog_logid *id, struct lu_fid *fid)
+{
+	if (id->lgl_ogen != 0) {
+		/* old-style logid: identified by inode number + generation */
+		lu_igif_build(fid, id->lgl_oi.oi.oi_id, id->lgl_ogen);
+		return;
+	}
+
+	fid->f_seq = id->lgl_oi.oi.oi_seq;
+	fid->f_oid = id->lgl_oi.oi.oi_id;
+	fid->f_ver = 0;
+}
+
+/**
+ * Fill log id \a id from FID \a fid.
+ *
+ * The generation is always set to 0, which is what logid_to_fid() uses to
+ * recognise a FID-based (non-IGIF) log id. The FID version is not stored.
+ */
+static inline void fid_to_logid(struct lu_fid *fid, struct llog_logid *id)
+{
+	id->lgl_ogen = 0;
+	id->lgl_oi.oi.oi_id = fid->f_oid;
+	id->lgl_oi.oi.oi_seq = fid->f_seq;
+}
+
+/* Store \a id as the object id of \a log_id; no other field is modified. */
+static inline void logid_set_id(struct llog_logid *log_id, __u64 id)
+{
+	log_id->lgl_oi.oi.oi_id = id;
+}
+
+/* Return the object id stored in \a log_id (counterpart of logid_set_id()). */
+static inline __u64 logid_id(struct llog_logid *log_id)
+{
+	return log_id->lgl_oi.oi.oi_id;
+}
+
+struct llog_handle;
+
+/* llog.c  -  general API */
+int llog_init_handle(const struct lu_env *env, struct llog_handle *handle,
+		     int flags, struct obd_uuid *uuid);
+int llog_copy_handler(const struct lu_env *env, struct llog_handle *llh,
+		      struct llog_rec_hdr *rec, void *data);
+int llog_process(const struct lu_env *env, struct llog_handle *loghandle,
+		 llog_cb_t cb, void *data, void *catdata);
+int llog_process_or_fork(const struct lu_env *env,
+			 struct llog_handle *loghandle,
+			 llog_cb_t cb, void *data, void *catdata, bool fork);
+int llog_reverse_process(const struct lu_env *env,
+			 struct llog_handle *loghandle, llog_cb_t cb,
+			 void *data, void *catdata);
+int llog_cancel_rec(const struct lu_env *env, struct llog_handle *loghandle,
+		    int index);
+int llog_open(const struct lu_env *env, struct llog_ctxt *ctxt,
+	      struct llog_handle **lgh, struct llog_logid *logid,
+	      char *name, enum llog_open_param open_param);
+int llog_close(const struct lu_env *env, struct llog_handle *cathandle);
+int llog_get_size(struct llog_handle *loghandle);
+
+/* llog_process flags */
+#define LLOG_FLAG_NODEAMON 0x0001
+
+/* llog_cat.c - catalog api */
+/*
+ * Arguments bundled together for processing a catalog: the per-record
+ * callback, its opaque argument and the position to start from.
+ */
+struct llog_process_data {
+	/**
+	 * Any useful data needed while processing catalog. This is
+	 * passed later to process callback.
+	 */
+	void		*lpd_data;
+	/**
+	 * Catalog process callback function, called for each record
+	 * in catalog.
+	 */
+	llog_cb_t	    lpd_cb;
+	/**
+	 * Start processing the catalog from startcat/startidx
+	 */
+	int		  lpd_startcat;
+	int		  lpd_startidx;
+};
+
+/*
+ * Index range remembered while a log is being scanned.
+ */
+struct llog_process_cat_data {
+	/**
+	 * Temporarily stored first_idx while scanning log.
+	 */
+	int		  lpcd_first_idx;
+	/**
+	 * Temporarily stored last_idx while scanning log.
+	 */
+	int		  lpcd_last_idx;
+};
+
+int llog_cat_close(const struct lu_env *env, struct llog_handle *cathandle);
+int llog_cat_add_rec(const struct lu_env *env, struct llog_handle *cathandle,
+		     struct llog_rec_hdr *rec, struct llog_cookie *reccookie,
+		     void *buf, struct thandle *th);
+int llog_cat_declare_add_rec(const struct lu_env *env,
+			     struct llog_handle *cathandle,
+			     struct llog_rec_hdr *rec, struct thandle *th);
+int llog_cat_add(const struct lu_env *env, struct llog_handle *cathandle,
+		 struct llog_rec_hdr *rec, struct llog_cookie *reccookie,
+		 void *buf);
+int llog_cat_cancel_records(const struct lu_env *env,
+			    struct llog_handle *cathandle, int count,
+			    struct llog_cookie *cookies);
+int llog_cat_process_or_fork(const struct lu_env *env,
+			     struct llog_handle *cat_llh, llog_cb_t cb,
+			     void *data, int startcat, int startidx, bool fork);
+int llog_cat_process(const struct lu_env *env, struct llog_handle *cat_llh,
+		     llog_cb_t cb, void *data, int startcat, int startidx);
+int llog_cat_reverse_process(const struct lu_env *env,
+			     struct llog_handle *cat_llh, llog_cb_t cb,
+			     void *data);
+int llog_cat_init_and_process(const struct lu_env *env,
+			      struct llog_handle *llh);
+
+/* llog_obd.c */
+int llog_setup(const struct lu_env *env, struct obd_device *obd,
+	       struct obd_llog_group *olg, int index,
+	       struct obd_device *disk_obd, struct llog_operations *op);
+int __llog_ctxt_put(const struct lu_env *env, struct llog_ctxt *ctxt);
+int llog_cleanup(const struct lu_env *env, struct llog_ctxt *);
+int llog_sync(struct llog_ctxt *ctxt, struct obd_export *exp, int flags);
+int llog_obd_add(const struct lu_env *env, struct llog_ctxt *ctxt,
+		 struct llog_rec_hdr *rec, struct lov_stripe_md *lsm,
+		 struct llog_cookie *logcookies, int numcookies);
+int llog_cancel(const struct lu_env *env, struct llog_ctxt *ctxt,
+		struct lov_stripe_md *lsm, int count,
+		struct llog_cookie *cookies, int flags);
+
+int obd_llog_init(struct obd_device *obd, struct obd_llog_group *olg,
+		  struct obd_device *disk_obd, int *idx);
+
+int obd_llog_finish(struct obd_device *obd, int count);
+
+/* llog_ioctl.c */
+int llog_ioctl(const struct lu_env *env, struct llog_ctxt *ctxt, int cmd,
+	       struct obd_ioctl_data *data);
+
+/* llog_net.c */
+int llog_initiator_connect(struct llog_ctxt *ctxt);
+
+/*
+ * Table of methods implemented by an llog backend. This header declares two
+ * instances, llog_lvfs_ops and llog_osd_ops.
+ *
+ * A method pointer may be NULL: the inline wrappers in this header
+ * (llog_destroy(), llog_next_block(), llog_prev_block(), llog_connect())
+ * return -EOPNOTSUPP in that case instead of calling through it.
+ */
+struct llog_operations {
+	/* called by llog_destroy() */
+	int (*lop_destroy)(const struct lu_env *env,
+			   struct llog_handle *handle);
+	/* called by llog_next_block() */
+	int (*lop_next_block)(const struct lu_env *env, struct llog_handle *h,
+			      int *curr_idx, int next_idx, __u64 *offset,
+			      void *buf, int len);
+	/* called by llog_prev_block() */
+	int (*lop_prev_block)(const struct lu_env *env, struct llog_handle *h,
+			      int prev_idx, void *buf, int len);
+	int (*lop_read_header)(const struct lu_env *env,
+			       struct llog_handle *handle);
+	int (*lop_setup)(const struct lu_env *env, struct obd_device *obd,
+			 struct obd_llog_group *olg, int ctxt_idx,
+			 struct obd_device *disk_obd);
+	int (*lop_sync)(struct llog_ctxt *ctxt, struct obd_export *exp,
+			int flags);
+	int (*lop_cleanup)(const struct lu_env *env, struct llog_ctxt *ctxt);
+	int (*lop_cancel)(const struct lu_env *env, struct llog_ctxt *ctxt,
+			  struct lov_stripe_md *lsm, int count,
+			  struct llog_cookie *cookies, int flags);
+	/* called by llog_connect() */
+	int (*lop_connect)(struct llog_ctxt *ctxt, struct llog_logid *logid,
+			   struct llog_gen *gen, struct obd_uuid *uuid);
+	/**
+	 * Any llog file must be opened first using llog_open().  Llog can be
+	 * opened by name, by logid, or with neither; in the last case a new
+	 * logid will be generated.
+	 */
+	int (*lop_open)(const struct lu_env *env, struct llog_handle *lgh,
+			struct llog_logid *logid, char *name,
+			enum llog_open_param);
+	/**
+	 * Opened llog may not exist and this must be checked where needed using
+	 * the llog_exist() call.
+	 */
+	int (*lop_exist)(struct llog_handle *lgh);
+	/**
+	 * Close llog file and calls llog_free_handle() implicitly.
+	 * Any opened llog must be closed by llog_close() call.
+	 */
+	int (*lop_close)(const struct lu_env *env, struct llog_handle *handle);
+	/**
+	 * Create new llog file. The llog must be opened.
+	 * Must be used only for local llog operations.
+	 */
+	int (*lop_declare_create)(const struct lu_env *env,
+				  struct llog_handle *handle,
+				  struct thandle *th);
+	int (*lop_create)(const struct lu_env *env, struct llog_handle *handle,
+			  struct thandle *th);
+	/**
+	 * Write a new record to the llog. It usually appends records but can
+	 * edit existing records too.
+	 */
+	int (*lop_declare_write_rec)(const struct lu_env *env,
+				     struct llog_handle *lgh,
+				     struct llog_rec_hdr *rec,
+				     int idx, struct thandle *th);
+	int (*lop_write_rec)(const struct lu_env *env,
+			     struct llog_handle *loghandle,
+			     struct llog_rec_hdr *rec,
+			     struct llog_cookie *cookie, int cookiecount,
+			     void *buf, int idx, struct thandle *th);
+	/**
+	 * Add new record in llog catalog. Does the same as llog_write_rec()
+	 * but using llog catalog.
+	 */
+	int (*lop_declare_add)(const struct lu_env *env,
+			       struct llog_handle *lgh,
+			       struct llog_rec_hdr *rec, struct thandle *th);
+	int (*lop_add)(const struct lu_env *env, struct llog_handle *lgh,
+		       struct llog_rec_hdr *rec, struct llog_cookie *cookie,
+		       void *buf, struct thandle *th);
+	/* Old llog_add version, used in MDS-LOV-OSC now and will be gone with
+	 * the LOD/OSP replacement */
+	int (*lop_obd_add)(const struct lu_env *env, struct llog_ctxt *ctxt,
+			   struct llog_rec_hdr *rec, struct lov_stripe_md *lsm,
+			   struct llog_cookie *logcookies, int numcookies);
+};
+
+/* In-memory descriptor for a log object or log catalog */
+struct llog_handle {
+	struct rw_semaphore	 lgh_lock;
+	spinlock_t		 lgh_hdr_lock; /* protect lgh_hdr data */
+	struct llog_logid	 lgh_id; /* id of this log */
+	struct llog_log_hdr	*lgh_hdr;
+	struct file		*lgh_file;
+	struct dt_object	*lgh_obj;
+	int			 lgh_last_idx;
+	int			 lgh_cur_idx; /* used during llog_process */
+	__u64			 lgh_cur_offset; /* used during llog_process */
+	struct llog_ctxt	*lgh_ctxt;
+	/* NOTE(review): presumably phd is valid for a plain log and chd for
+	 * a catalog, matching the two kinds named above -- confirm */
+	union {
+		struct plain_handle_data	 phd;
+		struct cat_handle_data		 chd;
+	} u;
+	char			*lgh_name;
+	void			*private_data;
+	/* method table handed out by llog_handle2ops() */
+	struct llog_operations	*lgh_logops;
+	atomic_t		 lgh_refcount;
+};
+
+/* llog_lvfs.c */
+extern struct llog_operations llog_lvfs_ops;
+
+/* llog_osd.c */
+extern struct llog_operations llog_osd_ops;
+int llog_osd_get_cat_list(const struct lu_env *env, struct dt_device *d,
+			  int idx, int count,
+			  struct llog_catid *idarray);
+int llog_osd_put_cat_list(const struct lu_env *env, struct dt_device *d,
+			  int idx, int count,
+			  struct llog_catid *idarray);
+
+/* Bits for llog_ctxt::loc_flags */
+#define LLOG_CTXT_FLAG_UNINITIALIZED     0x00000001
+#define LLOG_CTXT_FLAG_STOP		 0x00000002
+
+/*
+ * An llog context. Contexts are kept per index in the olg_ctxts[] array of
+ * an obd_llog_group (see llog_group_set_ctxt()/llog_group_get_ctxt()) and
+ * are reference counted through llog_ctxt_get()/llog_ctxt_put().
+ */
+struct llog_ctxt {
+	int		      loc_idx; /* my index in the obd array of ctxt's */
+	struct obd_device       *loc_obd; /* points back to the containing obd*/
+	struct obd_llog_group   *loc_olg; /* group containing that ctxt */
+	struct obd_export       *loc_exp; /* parent "disk" export (e.g. MDS) */
+	struct obd_import       *loc_imp; /* to use in RPC's: can be backward
+					     pointing import */
+	/* method table handed out by llog_obd2ops() */
+	struct llog_operations  *loc_logops;
+	struct llog_handle      *loc_handle;
+	struct mutex		 loc_mutex; /* protect loc_imp */
+	/* reference count, see llog_ctxt_get() and llog_ctxt_put() */
+	atomic_t	     loc_refcount;
+	long		     loc_flags; /* flags, see above defines */
+	struct dt_object	*loc_dir;
+};
+
+#define LLOG_PROC_BREAK 0x0001
+#define LLOG_DEL_RECORD 0x0002
+
+/**
+ * Look up the method table of llog context \a ctxt.
+ *
+ * \param ctxt context to query, may be NULL
+ * \param lop  receives ctxt->loc_logops; left untouched when \a ctxt is NULL
+ *
+ * \retval 0           \a *lop points to a method table
+ * \retval -ENOTCONN   \a ctxt is NULL
+ * \retval -EOPNOTSUPP the context has no method table (\a *lop is set to NULL)
+ */
+static inline int llog_obd2ops(struct llog_ctxt *ctxt,
+			       struct llog_operations **lop)
+{
+	struct llog_operations *ops;
+
+	if (ctxt == NULL)
+		return -ENOTCONN;
+
+	ops = ctxt->loc_logops;
+	*lop = ops;
+
+	return ops == NULL ? -EOPNOTSUPP : 0;
+}
+
+/**
+ * Look up the method table of log handle \a loghandle.
+ *
+ * \param loghandle handle to query, may be NULL
+ * \param lop       receives loghandle->lgh_logops on success only
+ *
+ * \retval 0       \a *lop points to a method table
+ * \retval -EINVAL \a loghandle is NULL or has no method table
+ */
+static inline int llog_handle2ops(struct llog_handle *loghandle,
+				  struct llog_operations **lop)
+{
+	if (loghandle == NULL)
+		return -EINVAL;
+	if (loghandle->lgh_logops == NULL)
+		return -EINVAL;
+
+	*lop = loghandle->lgh_logops;
+	return 0;
+}
+
+/* Return \a len rounded by cfs_size_round(); the rounding rule itself is
+ * defined by that helper, not here. */
+static inline int llog_data_len(int len)
+{
+	return cfs_size_round(len);
+}
+
+/**
+ * Take a reference on \a ctxt and return it.
+ *
+ * Unlike llog_ctxt_put(), \a ctxt must not be NULL: it is dereferenced
+ * unconditionally.
+ */
+static inline struct llog_ctxt *llog_ctxt_get(struct llog_ctxt *ctxt)
+{
+	atomic_inc(&ctxt->loc_refcount);
+	/* the count is re-read for the message only, separately from the
+	 * increment above */
+	CDEBUG(D_INFO, "GETting ctxt %p : new refcount %d\n", ctxt,
+	       atomic_read(&ctxt->loc_refcount));
+	return ctxt;
+}
+
+/**
+ * Drop a reference on \a ctxt; a NULL \a ctxt is silently ignored.
+ *
+ * The reference count is asserted to be sane before the actual release is
+ * delegated to __llog_ctxt_put(), which is called with a NULL environment
+ * and whose return value is not propagated.
+ */
+static inline void llog_ctxt_put(struct llog_ctxt *ctxt)
+{
+	if (ctxt != NULL) {
+		LASSERT_ATOMIC_GT_LT(&ctxt->loc_refcount, 0, LI_POISON);
+		CDEBUG(D_INFO, "PUTting ctxt %p : new refcount %d\n", ctxt,
+		       atomic_read(&ctxt->loc_refcount) - 1);
+		__llog_ctxt_put(NULL, ctxt);
+	}
+}
+
+/**
+ * Prepare llog group \a olg for use: record \a group as its sequence and
+ * initialize its wait queue, spinlock and catalog-processing mutex.
+ * The olg_ctxts[] slots are not touched here.
+ */
+static inline void llog_group_init(struct obd_llog_group *olg, int group)
+{
+	olg->olg_seq = group;
+
+	mutex_init(&olg->olg_cat_processing);
+	spin_lock_init(&olg->olg_lock);
+	init_waitqueue_head(&olg->olg_waitq);
+}
+
+/**
+ * Install \a ctxt in slot \a index of llog group \a olg.
+ *
+ * The slot is checked and filled under olg_lock. \a index must be within
+ * [0, LLOG_MAX_CTXTS), which is asserted.
+ *
+ * \retval 0       the context was installed
+ * \retval -EEXIST the slot is already occupied; it is left unchanged
+ */
+static inline int llog_group_set_ctxt(struct obd_llog_group *olg,
+				      struct llog_ctxt *ctxt, int index)
+{
+	int rc = 0;
+
+	LASSERT(index >= 0 && index < LLOG_MAX_CTXTS);
+
+	spin_lock(&olg->olg_lock);
+	if (olg->olg_ctxts[index] == NULL)
+		olg->olg_ctxts[index] = ctxt;
+	else
+		rc = -EEXIST;
+	spin_unlock(&olg->olg_lock);
+
+	return rc;
+}
+
+/**
+ * Fetch the context in slot \a index of llog group \a olg.
+ *
+ * The lookup and the reference grab both happen under olg_lock. \a index
+ * must be within [0, LLOG_MAX_CTXTS), which is asserted.
+ *
+ * \retval the context with an extra reference taken via llog_ctxt_get()
+ * \retval NULL if the slot is empty
+ */
+static inline struct llog_ctxt *llog_group_get_ctxt(struct obd_llog_group *olg,
+						    int index)
+{
+	struct llog_ctxt *found = NULL;
+
+	LASSERT(index >= 0 && index < LLOG_MAX_CTXTS);
+
+	spin_lock(&olg->olg_lock);
+	if (olg->olg_ctxts[index] != NULL)
+		found = llog_ctxt_get(olg->olg_ctxts[index]);
+	spin_unlock(&olg->olg_lock);
+
+	return found;
+}
+
+/**
+ * Empty slot \a index of llog group \a olg under olg_lock.
+ *
+ * Only the slot pointer is reset; no reference is dropped here on the
+ * context that was stored in it.
+ */
+static inline void llog_group_clear_ctxt(struct obd_llog_group *olg, int index)
+{
+	LASSERT(index >= 0 && index < LLOG_MAX_CTXTS);
+	spin_lock(&olg->olg_lock);
+	olg->olg_ctxts[index] = NULL;
+	spin_unlock(&olg->olg_lock);
+}
+
+/**
+ * Shorthand for llog_group_get_ctxt() on the llog group embedded in \a obd
+ * (obd->obd_olg): returns the referenced context in slot \a index or NULL.
+ */
+static inline struct llog_ctxt *llog_get_context(struct obd_device *obd,
+						 int index)
+{
+	return llog_group_get_ctxt(&obd->obd_olg, index);
+}
+
+/**
+ * Tell whether slot \a index of llog group \a olg is empty.
+ *
+ * This is a plain peek: olg_lock is not taken and \a index is not range
+ * checked, so the answer is only a snapshot.
+ *
+ * \retval 1 the slot holds no context
+ * \retval 0 the slot is occupied
+ */
+static inline int llog_group_ctxt_null(struct obd_llog_group *olg, int index)
+{
+	struct llog_ctxt *slot = olg->olg_ctxts[index];
+
+	return slot == NULL;
+}
+
+/**
+ * Shorthand for llog_group_ctxt_null() on the llog group embedded in
+ * \a obd: non-zero if slot \a index holds no context.
+ */
+static inline int llog_ctxt_null(struct obd_device *obd, int index)
+{
+	return llog_group_ctxt_null(&obd->obd_olg, index);
+}
+
+/**
+ * Destroy the llog behind \a handle through the backend's lop_destroy().
+ *
+ * \retval -EINVAL     no method table could be obtained for \a handle
+ * \retval -EOPNOTSUPP the backend does not implement lop_destroy()
+ * \retval otherwise   whatever lop_destroy() returned
+ */
+static inline int llog_destroy(const struct lu_env *env,
+			       struct llog_handle *handle)
+{
+	struct llog_operations *ops;
+	int result;
+
+	ENTRY;
+
+	result = llog_handle2ops(handle, &ops);
+	if (result == 0) {
+		if (ops->lop_destroy != NULL)
+			result = ops->lop_destroy(env, handle);
+		else
+			result = -EOPNOTSUPP;
+	}
+
+	RETURN(result);
+}
+
+/**
+ * Forward a "next block" request to the backend's lop_next_block().
+ *
+ * All arguments are handed to the backend unchanged.
+ *
+ * \retval -EINVAL     no method table could be obtained for \a loghandle
+ * \retval -EOPNOTSUPP the backend does not implement lop_next_block()
+ * \retval otherwise   whatever lop_next_block() returned
+ */
+static inline int llog_next_block(const struct lu_env *env,
+				  struct llog_handle *loghandle, int *cur_idx,
+				  int next_idx, __u64 *cur_offset, void *buf,
+				  int len)
+{
+	struct llog_operations *ops;
+	int result;
+
+	ENTRY;
+
+	result = llog_handle2ops(loghandle, &ops);
+	if (result == 0) {
+		if (ops->lop_next_block != NULL)
+			result = ops->lop_next_block(env, loghandle, cur_idx,
+						     next_idx, cur_offset,
+						     buf, len);
+		else
+			result = -EOPNOTSUPP;
+	}
+
+	RETURN(result);
+}
+
+/**
+ * Forward a "previous block" request to the backend's lop_prev_block().
+ *
+ * All arguments are handed to the backend unchanged.
+ *
+ * \retval -EINVAL     no method table could be obtained for \a loghandle
+ * \retval -EOPNOTSUPP the backend does not implement lop_prev_block()
+ * \retval otherwise   whatever lop_prev_block() returned
+ */
+static inline int llog_prev_block(const struct lu_env *env,
+				  struct llog_handle *loghandle,
+				  int prev_idx, void *buf, int len)
+{
+	struct llog_operations *ops;
+	int result;
+
+	ENTRY;
+
+	result = llog_handle2ops(loghandle, &ops);
+	if (result == 0) {
+		if (ops->lop_prev_block != NULL)
+			result = ops->lop_prev_block(env, loghandle, prev_idx,
+						     buf, len);
+		else
+			result = -EOPNOTSUPP;
+	}
+
+	RETURN(result);
+}
+
+/**
+ * Forward a connect request for context \a ctxt to the backend's
+ * lop_connect().
+ *
+ * \retval -ENOTCONN   \a ctxt is NULL
+ * \retval -EOPNOTSUPP the context has no method table, or the table has no
+ *                     lop_connect() method
+ * \retval otherwise   whatever lop_connect() returned
+ */
+static inline int llog_connect(struct llog_ctxt *ctxt,
+			       struct llog_logid *logid, struct llog_gen *gen,
+			       struct obd_uuid *uuid)
+{
+	struct llog_operations *ops;
+	int result;
+
+	ENTRY;
+
+	result = llog_obd2ops(ctxt, &ops);
+	if (result == 0) {
+		if (ops->lop_connect != NULL)
+			result = ops->lop_connect(ctxt, logid, gen, uuid);
+		else
+			result = -EOPNOTSUPP;
+	}
+
+	RETURN(result);
+}
+
+/* llog.c */
+int llog_exist(struct llog_handle *loghandle);
+int llog_declare_create(const struct lu_env *env,
+			struct llog_handle *loghandle, struct thandle *th);
+int llog_create(const struct lu_env *env, struct llog_handle *handle,
+		struct thandle *th);
+int llog_declare_write_rec(const struct lu_env *env,
+			   struct llog_handle *handle,
+			   struct llog_rec_hdr *rec, int idx,
+			   struct thandle *th);
+int llog_write_rec(const struct lu_env *env, struct llog_handle *handle,
+		   struct llog_rec_hdr *rec, struct llog_cookie *logcookies,
+		   int numcookies, void *buf, int idx, struct thandle *th);
+int llog_add(const struct lu_env *env, struct llog_handle *lgh,
+	     struct llog_rec_hdr *rec, struct llog_cookie *logcookies,
+	     void *buf, struct thandle *th);
+int llog_declare_add(const struct lu_env *env, struct llog_handle *lgh,
+		     struct llog_rec_hdr *rec, struct thandle *th);
+int lustre_process_log(struct super_block *sb, char *logname,
+		       struct config_llog_instance *cfg);
+int lustre_end_log(struct super_block *sb, char *logname,
+		   struct config_llog_instance *cfg);
+int llog_open_create(const struct lu_env *env, struct llog_ctxt *ctxt,
+		     struct llog_handle **res, struct llog_logid *logid,
+		     char *name);
+int llog_erase(const struct lu_env *env, struct llog_ctxt *ctxt,
+	       struct llog_logid *logid, char *name);
+int llog_write(const struct lu_env *env, struct llog_handle *loghandle,
+	       struct llog_rec_hdr *rec, struct llog_cookie *reccookie,
+	       int cookiecount, void *buf, int idx);
+
+/** @} log */
+
+#endif

diff --git a/drivers/staging/lustre/lustre/include/lustre_mdc.h b/drivers/staging/lustre/lustre/include/lustre_mdc.h
new file mode 100644
index 0000000..fb1561a
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_mdc.h

@@ -0,0 +1,176 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre_mdc.h
+ *
+ * MDS data structures.
+ * See also lustre_idl.h for wire formats of requests.
+ */
+
+#ifndef _LUSTRE_MDC_H
+#define _LUSTRE_MDC_H
+
+/** \defgroup mdc mdc
+ *
+ * @{
+ */
+
+# include <linux/fs.h>
+# include <linux/dcache.h>
+# ifdef CONFIG_FS_POSIX_ACL
+#  include <linux/posix_acl_xattr.h>
+# endif /* CONFIG_FS_POSIX_ACL */
+# include <linux/lustre_intent.h>
+#include <lustre_handles.h>
+#include <linux/libcfs/libcfs.h>
+#include <obd_class.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_lib.h>
+#include <lustre_dlm.h>
+#include <lustre_export.h>
+
+struct ptlrpc_client;
+struct obd_export;
+struct ptlrpc_request;
+struct obd_device;
+
+/*
+ * Lock serializing MDC requests; taken with mdc_get_rpc_lock() and released
+ * with mdc_put_rpc_lock().
+ */
+struct mdc_rpc_lock {
+	/* held from mdc_get_rpc_lock() until mdc_put_rpc_lock() for a
+	 * regular request; only taken briefly for "fake" requests */
+	struct mutex		rpcl_mutex;
+	/* intent of the request holding the lock, NULL when idle, or
+	 * MDC_FAKE_RPCL_IT while fake requests are outstanding */
+	struct lookup_intent	*rpcl_it;
+	/* number of outstanding fake requests */
+	int			rpcl_fakes;
+};
+
+/* sentinel stored in rpcl_it while OBD_FAIL_MDC_RPCS_SEM fake requests are
+ * in flight; never dereferenced in this header */
+#define MDC_FAKE_RPCL_IT ((void *)0x2c0012bfUL)
+
+/**
+ * Initialize \a lck to the idle state: mutex unlocked, no owning intent and
+ * no outstanding fake requests.
+ *
+ * rpcl_fakes is reset explicitly because mdc_get_rpc_lock() increments it
+ * and mdc_put_rpc_lock() asserts on it, so it must not start out with
+ * whatever the memory happened to contain.
+ */
+static inline void mdc_init_rpc_lock(struct mdc_rpc_lock *lck)
+{
+	mutex_init(&lck->rpcl_mutex);
+	lck->rpcl_it = NULL;
+	lck->rpcl_fakes = 0;
+}
+
+/**
+ * Take the MDC RPC lock on behalf of intent \a it.
+ *
+ * GETATTR and LOOKUP intents do not take the lock at all. For every other
+ * request (including a NULL \a it) the function returns with rpcl_mutex
+ * held and rpcl_it set to \a it; mdc_put_rpc_lock() releases it.
+ *
+ * When the OBD_FAIL_MDC_RPCS_SEM fail check fires, the request is counted
+ * as a "fake" one instead: rpcl_it becomes MDC_FAKE_RPCL_IT, rpcl_fakes is
+ * incremented and the mutex is dropped again before returning.
+ */
+static inline void mdc_get_rpc_lock(struct mdc_rpc_lock *lck,
+				    struct lookup_intent *it)
+{
+	ENTRY;
+
+	if (it != NULL && (it->it_op == IT_GETATTR || it->it_op == IT_LOOKUP))
+		return;
+
+	/* This would normally block until the existing request finishes.
+	 * If fail_loc is set it will block until the regular request is
+	 * done, then set rpcl_it to MDC_FAKE_RPCL_IT.  Once that is set
+	 * it will only be cleared when all fake requests are finished.
+	 * Only when all fake requests are finished can normal requests
+	 * be sent, to ensure they are recoverable again. */
+ again:
+	mutex_lock(&lck->rpcl_mutex);
+
+	if (CFS_FAIL_CHECK_QUIET(OBD_FAIL_MDC_RPCS_SEM)) {
+		lck->rpcl_it = MDC_FAKE_RPCL_IT;
+		lck->rpcl_fakes++;
+		mutex_unlock(&lck->rpcl_mutex);
+		return;
+	}
+
+	/* This will only happen when the CFS_FAIL_CHECK() was
+	 * just turned off but there are still requests in progress.
+	 * Wait until they finish.  It doesn't need to be efficient
+	 * in this extremely rare case, just have low overhead in
+	 * the common case when it isn't true. */
+	if (unlikely(lck->rpcl_it == MDC_FAKE_RPCL_IT)) {
+		mutex_unlock(&lck->rpcl_mutex);
+		/* A bare schedule_timeout() does not sleep unless the task
+		 * state was changed first, which turned this retry into a
+		 * busy loop; use the variant that sets the state itself. */
+		schedule_timeout_uninterruptible(cfs_time_seconds(1) / 4);
+		goto again;
+	}
+
+	LASSERT(lck->rpcl_it == NULL);
+	lck->rpcl_it = it;
+}
+
+/**
+ * Release the MDC RPC lock taken by mdc_get_rpc_lock() for intent \a it.
+ *
+ * For a regular request the caller still holds rpcl_mutex: the owner is
+ * checked, cleared and the mutex dropped. For a fake request
+ * (rpcl_it == MDC_FAKE_RPCL_IT) the mutex is taken here just long enough to
+ * decrement rpcl_fakes, clearing rpcl_it once the last fake one is gone.
+ */
+static inline void mdc_put_rpc_lock(struct mdc_rpc_lock *lck,
+				    struct lookup_intent *it)
+{
+	/* GETATTR and LOOKUP intents never took the lock in the first place */
+	if (it == NULL || (it->it_op != IT_GETATTR && it->it_op != IT_LOOKUP)) {
+		if (lck->rpcl_it != MDC_FAKE_RPCL_IT) {
+			LASSERTF(it == lck->rpcl_it, "%p != %p\n", it,
+				 lck->rpcl_it);
+			lck->rpcl_it = NULL;
+		} else { /* OBD_FAIL_MDC_RPCS_SEM */
+			mutex_lock(&lck->rpcl_mutex);
+
+			LASSERTF(lck->rpcl_fakes > 0, "%d\n", lck->rpcl_fakes);
+			lck->rpcl_fakes--;
+
+			if (lck->rpcl_fakes == 0)
+				lck->rpcl_it = NULL;
+		}
+
+		mutex_unlock(&lck->rpcl_mutex);
+	}
+
+	EXIT;
+}
+
+/**
+ * Raise the client's cached maximum MDS EA and cookie sizes from a reply.
+ *
+ * Nothing happens unless \a body has OBD_MD_FLMODEASIZE set in its valid
+ * mask. The cached values only ever grow: each one is replaced when the
+ * value carried by \a body is larger.
+ */
+static inline void mdc_update_max_ea_from_body(struct obd_export *exp,
+					       struct mdt_body *body)
+{
+	if (!(body->valid & OBD_MD_FLMODEASIZE))
+		return;
+
+	if (body->max_mdsize > exp->exp_obd->u.cli.cl_max_mds_easize)
+		exp->exp_obd->u.cli.cl_max_mds_easize = body->max_mdsize;
+
+	if (body->max_cookiesize > exp->exp_obd->u.cli.cl_max_mds_cookiesize)
+		exp->exp_obd->u.cli.cl_max_mds_cookiesize =
+						body->max_cookiesize;
+}
+
+
+/*
+ * A waiter: a list linkage plus the wait queue it sleeps on.
+ * NOTE(review): the list it is queued on and the wake-up condition are not
+ * visible in this header -- see the users in the mdc code.
+ */
+struct mdc_cache_waiter {
+	struct list_head	      mcw_entry;
+	wait_queue_head_t	     mcw_waitq;
+};
+
+/* mdc/mdc_locks.c */
+int it_disposition(struct lookup_intent *it, int flag);
+void it_clear_disposition(struct lookup_intent *it, int flag);
+void it_set_disposition(struct lookup_intent *it, int flag);
+int it_open_error(int phase, struct lookup_intent *it);
+
+/** @} mdc */
+
+#endif

diff --git a/drivers/staging/lustre/lustre/include/lustre_mds.h b/drivers/staging/lustre/lustre/include/lustre_mds.h
new file mode 100644
index 0000000..b386f87
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_mds.h

@@ -0,0 +1,81 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre_mds.h
+ *
+ * MDS data structures.
+ * See also lustre_idl.h for wire formats of requests.
+ */
+
+#ifndef _LUSTRE_MDS_H
+#define _LUSTRE_MDS_H
+
+/** \defgroup mds mds
+ *
+ * @{
+ */
+
+#include <lustre_handles.h>
+#include <linux/libcfs/libcfs.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_lib.h>
+#include <lustre_dlm.h>
+#include <lustre_export.h>
+
+/* Pairs an obd UUID with a group number. */
+struct mds_group_info {
+	struct obd_uuid *uuid;
+	int group;
+};
+
+/* Pairs an obd UUID with a capability key. */
+struct mds_capa_info {
+	struct obd_uuid	*uuid;
+	struct lustre_capa_key *capa;
+};
+
+#define MDD_OBD_NAME     "mdd_obd"
+#define MDD_OBD_UUID     "mdd_obd_uuid"
+
+/**
+ * Decide from open \a flags whether objects should be created.
+ *
+ * \retval 1 FMODE_WRITE is set and MDS_OPEN_DELAY_CREATE is not
+ * \retval 0 otherwise
+ */
+static inline int md_should_create(__u64 flags)
+{
+	if (flags & MDS_OPEN_DELAY_CREATE)
+		return 0;
+
+	return (flags & FMODE_WRITE) != 0;
+}
+
+/* these are local flags, used only on the client, private */
+#define M_CHECK_STALE	   0200000000
+
+/** @} mds */
+
+#endif

diff --git a/drivers/staging/lustre/lustre/include/lustre_mdt.h b/drivers/staging/lustre/lustre/include/lustre_mdt.h
new file mode 100644
index 0000000..dba26a6
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_mdt.h

@@ -0,0 +1,84 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LINUX_MDT_H
+#define __LINUX_MDT_H
+
+/** \defgroup mdt mdt
+ *
+ * @{
+ */
+
+#include <lustre/lustre_idl.h>
+#include <lustre_req_layout.h>
+#include <md_object.h>
+#include <dt_object.h>
+#include <linux/libcfs/libcfs.h>
+
+/*
+ * Common thread info for mdt, seq and fld
+ */
+struct com_thread_info {
+	/*
+	 * Request capsule used with the req-layout interface.
+	 */
+	struct req_capsule *cti_pill;
+};
+
+/*
+ * Marker bit (0x1000) that err_serious() ORs into the magnitude of a
+ * negative error code; is_serious() tests it and clear_serious() strips it.
+ */
+enum {
+	ESERIOUS = 0x0001000
+};
+
+/**
+ * Tag negative error code \a rc as "serious".
+ *
+ * \a rc must be negative and its magnitude below ESERIOUS; both are
+ * asserted. The result is again negative, with the ESERIOUS bit set in its
+ * magnitude.
+ */
+static inline int err_serious(int rc)
+{
+	int tagged;
+
+	LASSERT(rc < 0);
+	LASSERT(-rc < ESERIOUS);
+
+	tagged = -rc | ESERIOUS;
+	return -tagged;
+}
+
+/**
+ * Strip the ESERIOUS bit from the magnitude of a negative \a rc.
+ * Zero and positive values are returned unchanged.
+ */
+static inline int clear_serious(int rc)
+{
+	return rc < 0 ? -(-rc & ~ESERIOUS) : rc;
+}
+
+/**
+ * Tell whether \a rc is a negative error code tagged by err_serious().
+ *
+ * \retval 1 \a rc is negative and has ESERIOUS set in its magnitude
+ * \retval 0 otherwise
+ */
+static inline int is_serious(int rc)
+{
+	if (rc >= 0)
+		return 0;
+
+	return (-rc & ESERIOUS) != 0;
+}
+
+/** @} mdt */
+
+#endif

diff --git a/drivers/staging/lustre/lustre/include/lustre_net.h b/drivers/staging/lustre/lustre/include/lustre_net.h
new file mode 100644
index 0000000..293dd90
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_net.h

@@ -0,0 +1,3451 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2010, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+/** \defgroup PtlRPC Portal RPC and networking module.
+ *
+ * PortalRPC is the layer used by rest of lustre code to achieve network
+ * communications: establish connections with corresponding export and import
+ * states, listen for a service, send and receive RPCs.
+ * PortalRPC also includes base recovery framework: packet resending and
+ * replaying, reconnections, pinger.
+ *
+ * PortalRPC utilizes LNet as its transport layer.
+ *
+ * @{
+ */
+
+
+#ifndef _LUSTRE_NET_H
+#define _LUSTRE_NET_H
+
+/** \defgroup net net
+ *
+ * @{
+ */
+
+#include <linux/lustre_net.h>
+
+#include <linux/libcfs/libcfs.h>
+// #include <obd.h>
+#include <linux/lnet/lnet.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_ha.h>
+#include <lustre_sec.h>
+#include <lustre_import.h>
+#include <lprocfs_status.h>
+#include <lu_object.h>
+#include <lustre_req_layout.h>
+
+#include <obd_support.h>
+#include <lustre_ver.h>
+
+/* MD flags we _always_ use */
+#define PTLRPC_MD_OPTIONS  0
+
+/**
+ * Max # of bulk operations in one request.
+ * In order for the client and server to properly negotiate the maximum
+ * possible transfer size, PTLRPC_BULK_OPS_COUNT must be a power-of-two
+ * value.  The client is free to limit the actual RPC size for any bulk
+ * transfer via cl_max_pages_per_rpc to some non-power-of-two value. */
+#define PTLRPC_BULK_OPS_BITS	2
+#define PTLRPC_BULK_OPS_COUNT	(1U << PTLRPC_BULK_OPS_BITS)
+/**
+ * PTLRPC_BULK_OPS_MASK is for the convenience of the client only, and
+ * should not be used on the server at all.  Otherwise, it imposes a
+ * protocol limitation on the maximum RPC size that can be used by any
+ * RPC sent to that server in the future.  Instead, the server should
+ * use the negotiated per-client ocd_brw_size to determine the bulk
+ * RPC count. */
+#define PTLRPC_BULK_OPS_MASK	(~((__u64)PTLRPC_BULK_OPS_COUNT - 1))
+
+/**
+ * Define maxima for bulk I/O.
+ *
+ * A single PTLRPC BRW request is sent via up to PTLRPC_BULK_OPS_COUNT
+ * of LNET_MTU sized RDMA transfers.  Clients and servers negotiate the
+ * currently supported maximum between peers at connect via ocd_brw_size.
+ */
+#define PTLRPC_MAX_BRW_BITS	(LNET_MTU_BITS + PTLRPC_BULK_OPS_BITS)
+#define PTLRPC_MAX_BRW_SIZE	(1 << PTLRPC_MAX_BRW_BITS)
+#define PTLRPC_MAX_BRW_PAGES	(PTLRPC_MAX_BRW_SIZE >> PAGE_CACHE_SHIFT)
+
+#define ONE_MB_BRW_SIZE		(1 << LNET_MTU_BITS)
+#define MD_MAX_BRW_SIZE		(1 << LNET_MTU_BITS)
+#define MD_MAX_BRW_PAGES	(MD_MAX_BRW_SIZE >> PAGE_CACHE_SHIFT)
+#define DT_MAX_BRW_SIZE		PTLRPC_MAX_BRW_SIZE
+#define DT_MAX_BRW_PAGES	(DT_MAX_BRW_SIZE >> PAGE_CACHE_SHIFT)
+#define OFD_MAX_BRW_SIZE	(1 << LNET_MTU_BITS)
+
+/* When PAGE_SIZE is a constant, we can check our arithmetic here with cpp! */
+# if ((PTLRPC_MAX_BRW_PAGES & (PTLRPC_MAX_BRW_PAGES - 1)) != 0)
+#  error "PTLRPC_MAX_BRW_PAGES isn't a power of two"
+# endif
+# if (PTLRPC_MAX_BRW_SIZE != (PTLRPC_MAX_BRW_PAGES * PAGE_CACHE_SIZE))
+#  error "PTLRPC_MAX_BRW_SIZE isn't PTLRPC_MAX_BRW_PAGES * PAGE_CACHE_SIZE"
+# endif
+# if (PTLRPC_MAX_BRW_SIZE > LNET_MTU * PTLRPC_BULK_OPS_COUNT)
+#  error "PTLRPC_MAX_BRW_SIZE too big"
+# endif
+# if (PTLRPC_MAX_BRW_PAGES > LNET_MAX_IOV * PTLRPC_BULK_OPS_COUNT)
+#  error "PTLRPC_MAX_BRW_PAGES too big"
+# endif
+
+#define PTLRPC_NTHRS_INIT	2
+
+/**
+ * Buffer Constants
+ *
+ * Constants determine how memory is used to buffer incoming service requests.
+ *
+ * ?_NBUFS	      # buffers to allocate when growing the pool
+ * ?_BUFSIZE	    # bytes in a single request buffer
+ * ?_MAXREQSIZE	 # maximum request service will receive
+ *
+ * When fewer than ?_NBUFS/2 buffers are posted for receive, another chunk
+ * of ?_NBUFS is added to the pool.
+ *
+ * Messages larger than ?_MAXREQSIZE are dropped.  Request buffers are
+ * considered full when less than ?_MAXREQSIZE is left in them.
+ */
+/**
+ * Thread Constants
+ *
+ * Constants determine how threads are created for ptlrpc service.
+ *
+ * ?_NTHRS_INIT		# threads to create for each service partition on
+ *			  initializing. If it's non-affinity service and
+ *			  there is only one partition, it's the overall #
+ *			  threads for the service while initializing.
+ * ?_NTHRS_BASE		# threads should be created at least for each
+ *			  ptlrpc partition to keep the service healthy.
+ *			  It's the low-water mark of threads upper-limit
+ *			  for each partition.
+ * ?_THR_FACTOR	 # threads can be added on threads upper-limit for
+ *			  each CPU core. This factor is only for reference,
+ *			  we might decrease value of factor if number of cores
+ *			  per CPT is above a limit.
+ * ?_NTHRS_MAX		# overall threads can be created for a service,
+ *			  it's a soft limit because if service is running
+ *			  on machine with hundreds of cores and tens of
+ *			  CPU partitions, we need to guarantee each partition
+ *			  has ?_NTHRS_BASE threads, which means total threads
+ *			  will be ?_NTHRS_BASE * number_of_cpts which can
+ *			  exceed ?_NTHRS_MAX.
+ *
+ * Examples
+ *
+ * #define MDS_NTHRS_INIT	2
+ * #define MDS_NTHRS_BASE	64
+ * #define MDS_NTHRS_FACTOR	8
+ * #define MDS_NTHRS_MAX	1024
+ *
+ * Example 1):
+ * ---------------------------------------------------------------------
+ * Server(A) has 16 cores, user configured it to 4 partitions so each
+ * partition has 4 cores, then actual number of service threads on each
+ * partition is:
+ *     MDS_NTHRS_BASE(64) + cores(4) * MDS_NTHRS_FACTOR(8) = 96
+ *
+ * Total number of threads for the service is:
+ *     96 * partitions(4) = 384
+ *
+ * Example 2):
+ * ---------------------------------------------------------------------
+ * Server(B) has 32 cores, user configured it to 4 partitions so each
+ * partition has 8 cores, then actual number of service threads on each
+ * partition is:
+ *     MDS_NTHRS_BASE(64) + cores(8) * MDS_NTHRS_FACTOR(8) = 128
+ *
+ * Total number of threads for the service is:
+ *     128 * partitions(4) = 512
+ *
+ * Example 3):
+ * ---------------------------------------------------------------------
+ * Server(C) has 96 cores, user configured it to 8 partitions so each
+ * partition has 12 cores, then actual number of service threads on each
+ * partition is:
+ *     MDS_NTHRS_BASE(64) + cores(12) * MDS_NTHRS_FACTOR(8) = 160
+ *
+ * Total number of threads for the service is:
+ *     160 * partitions(8) = 1280
+ *
+ * However, it's above the soft limit MDS_NTHRS_MAX, so we choose this number
+ * as upper limit of threads number for each partition:
+ *     MDS_NTHRS_MAX(1024) / partitions(8) = 128
+ *
+ * Example 4):
+ * ---------------------------------------------------------------------
+ * Server(D) has a thousand cores and user configured it to 32 partitions
+ *     MDS_NTHRS_BASE(64) * 32 = 2048
+ *
+ * which is already above soft limit MDS_NTHRS_MAX(1024), but we still need
+ * to guarantee that each partition has at least MDS_NTHRS_BASE(64) threads
+ * to keep service healthy, so total number of threads will just be 2048.
+ *
+ * NB: we don't suggest to choose server with that many cores because backend
+ *     filesystem itself, buffer cache, or underlying network stack might
+ *     have some SMP scalability issues at that large scale.
+ *
+ *     If user already has a fat machine with hundreds or thousands of cores,
+ *     there are two choices for configuration:
+ *     a) create CPU table from subset of all CPUs and run Lustre on
+ *	top of this subset
+ *     b) bind service threads on a few partitions, see modparameters of
+ *	MDS and OSS for details
+ *
+ * NB: these calculations (and examples below) are simplified to help
+ *     understanding, the real implementation is a little more complex,
+ *     please see ptlrpc_server_nthreads_check() for details.
+ *
+ */
+
+ /*
+  * LDLM threads constants:
+  *
+  * Given 8 as factor and 24 as base threads number
+  *
+  * example 1)
+  * On 4-core machine we will have 24 + 8 * 4 = 56 threads.
+  *
+  * example 2)
+  * On 8-core machine with 2 partitions we will have 24 + 4 * 8 = 56
+  * threads for each partition and total threads number will be 112.
+  *
+  * example 3)
+  * On 64-core machine with 8 partitions we will need LDLM_NTHRS_BASE(24)
+  * threads for each partition to keep service healthy, so total threads
+  * number should be 24 * 8 = 192.
+  *
+  * So with these constants, threads number will be at the similar level
+  * of old versions, unless target machine has over a hundred cores
+  */
+#define LDLM_THR_FACTOR		8
+#define LDLM_NTHRS_INIT		PTLRPC_NTHRS_INIT
+#define LDLM_NTHRS_BASE		24
+#define LDLM_NTHRS_MAX		(num_online_cpus() == 1 ? 64 : 128)
+
+#define LDLM_BL_THREADS   LDLM_NTHRS_AUTO_INIT
+#define LDLM_CLIENT_NBUFS 1
+#define LDLM_SERVER_NBUFS 64
+#define LDLM_BUFSIZE      (8 * 1024)
+#define LDLM_MAXREQSIZE   (5 * 1024)
+#define LDLM_MAXREPSIZE   (1024)
+
+ /*
+  * MDS threads constants:
+  *
+  * Please see examples in "Thread Constants", MDS threads number will be at
+  * the comparable level of old versions, unless the server has many cores.
+  */
+#ifndef MDS_MAX_THREADS
+#define MDS_MAX_THREADS		1024
+#define MDS_MAX_OTHR_THREADS	256
+
+#else /* MDS_MAX_THREADS */
+#if MDS_MAX_THREADS < PTLRPC_NTHRS_INIT
+#undef MDS_MAX_THREADS
+#define MDS_MAX_THREADS	PTLRPC_NTHRS_INIT
+#endif
+#define MDS_MAX_OTHR_THREADS	max(PTLRPC_NTHRS_INIT, MDS_MAX_THREADS / 2)
+#endif
+
+/* default service */
+#define MDS_THR_FACTOR		8
+#define MDS_NTHRS_INIT		PTLRPC_NTHRS_INIT
+#define MDS_NTHRS_MAX		MDS_MAX_THREADS
+#define MDS_NTHRS_BASE		min(64, MDS_NTHRS_MAX)
+
+/* read-page service */
+#define MDS_RDPG_THR_FACTOR	4
+#define MDS_RDPG_NTHRS_INIT	PTLRPC_NTHRS_INIT
+#define MDS_RDPG_NTHRS_MAX	MDS_MAX_OTHR_THREADS
+#define MDS_RDPG_NTHRS_BASE	min(48, MDS_RDPG_NTHRS_MAX)
+
+/* these should be removed when we remove setattr service in the future */
+#define MDS_SETA_THR_FACTOR	4
+#define MDS_SETA_NTHRS_INIT	PTLRPC_NTHRS_INIT
+#define MDS_SETA_NTHRS_MAX	MDS_MAX_OTHR_THREADS
+#define MDS_SETA_NTHRS_BASE	min(48, MDS_SETA_NTHRS_MAX)
+
+/* non-affinity threads */
+#define MDS_OTHR_NTHRS_INIT	PTLRPC_NTHRS_INIT
+#define MDS_OTHR_NTHRS_MAX	MDS_MAX_OTHR_THREADS
+
+#define MDS_NBUFS		64
+
+/**
+ * Assume file name length = FNAME_MAX = 256 (true for ext3).
+ *	  path name length = PATH_MAX = 4096
+ *	  LOV MD size max  = EA_MAX = 24 * 2000
+ *		(NB: 24 is size of lov_ost_data)
+ *	  LOV LOGCOOKIE size max = 32 * 2000
+ *		(NB: 32 is size of llog_cookie)
+ * symlink:  FNAME_MAX + PATH_MAX  <- largest
+ * link:     FNAME_MAX + PATH_MAX  (mds_rec_link < mds_rec_create)
+ * rename:   FNAME_MAX + FNAME_MAX
+ * open:     FNAME_MAX + EA_MAX
+ *
+ * MDS_MAXREQSIZE ~= 4736 bytes =
+ * lustre_msg + ldlm_request + mdt_body + mds_rec_create + FNAME_MAX + PATH_MAX
+ * MDS_MAXREPSIZE ~= 8300 bytes = lustre_msg + llog_header
+ *
+ * Realistic size is about 512 bytes (20 character name + 128 char symlink),
+ * except in the open case where there are a large number of OSTs in a LOV.
+ */
+#define MDS_MAXREQSIZE		(5 * 1024)	/* >= 4736 */
+#define MDS_MAXREPSIZE		(9 * 1024)	/* >= 8300 */
+
+/**
+ * MDS incoming request with LOV EA
+ * 24 = sizeof(struct lov_ost_data), i.e: replay of opencreate
+ */
+#define MDS_LOV_MAXREQSIZE	max(MDS_MAXREQSIZE, \
+				    362 + LOV_MAX_STRIPE_COUNT * 24)
+/**
+ * MDS outgoing reply with LOV EA
+ *
+ * NB: max reply size Lustre 2.4+ client can get from old MDS is:
+ * LOV_MAX_STRIPE_COUNT * (llog_cookie + lov_ost_data) + extra bytes
+ *
+ * but 2.4 or later MDS will never send reply with llog_cookie to any
+ * version client. This macro is defined for server side reply buffer size.
+ */
+#define MDS_LOV_MAXREPSIZE	MDS_LOV_MAXREQSIZE
+
+/**
+ * This is the size of a maximum REINT_SETXATTR request:
+ *
+ *   lustre_msg		 56 (32 + 4 x 5 + 4)
+ *   ptlrpc_body	184
+ *   mdt_rec_setxattr	136
+ *   lustre_capa	120
+ *   name		256 (XATTR_NAME_MAX)
+ *   value	      65536 (XATTR_SIZE_MAX)
+ */
+#define MDS_EA_MAXREQSIZE	66288
+
+/**
+ * These are the maximum request and reply sizes (rounded up to 1 KB
+ * boundaries) for the "regular" MDS_REQUEST_PORTAL and MDS_REPLY_PORTAL.
+ */
+#define MDS_REG_MAXREQSIZE	(((max(MDS_EA_MAXREQSIZE, \
+				       MDS_LOV_MAXREQSIZE) + 1023) >> 10) << 10)
+#define MDS_REG_MAXREPSIZE	MDS_REG_MAXREQSIZE
+
+/**
+ * The update request includes all of updates from the create, which might
+ * include linkea (4K maximum), together with other updates, we set it to 9K:
+ * lustre_msg + ptlrpc_body + UPDATE_BUF_SIZE (8K)
+ */
+#define MDS_OUT_MAXREQSIZE	(9 * 1024)
+#define MDS_OUT_MAXREPSIZE	MDS_MAXREPSIZE
+
+/** MDS_BUFSIZE = max_reqsize (w/o LOV EA) + max sptlrpc payload size */
+#define MDS_BUFSIZE		max(MDS_MAXREQSIZE + SPTLRPC_MAX_PAYLOAD, \
+				    8 * 1024)
+
+/**
+ * MDS_REG_BUFSIZE should at least be MDS_REG_MAXREQSIZE + SPTLRPC_MAX_PAYLOAD.
+ * However, we need to allocate a much larger buffer for it because LNet
+ * requires each MD(rqbd) has at least MDS_REG_MAXREQSIZE bytes left to avoid
+ * dropping of maximum-sized incoming request.  So if MDS_REG_BUFSIZE is only a
+ * little larger than MDS_REG_MAXREQSIZE, then it can only fit in one request
+ * even though there are about MDS_REG_MAXREQSIZE bytes left in a rqbd, and memory
+ * utilization is very low.
+ *
+ * In the meanwhile, size of rqbd can't be too large, because rqbd can't be
+ * reused until all requests fit in it have been processed and released,
+ * which means one long blocked request can prevent the rqbd be reused.
+ * Now we set request buffer size to 160 KB, so even each rqbd is unlinked
+ * from LNet with unused 65 KB, buffer utilization will be about 59%.
+ * Please check LU-2432 for details.
+ */
+#define MDS_REG_BUFSIZE		max(MDS_REG_MAXREQSIZE + SPTLRPC_MAX_PAYLOAD, \
+				    160 * 1024)
+
+/**
+ * MDS_OUT_BUFSIZE = max_out_reqsize + max sptlrpc payload (~1K) which is
+ * about 10K, for the same reason as MDS_REG_BUFSIZE, we also give some
+ * extra bytes to each request buffer to improve buffer utilization rate.
+ */
+#define MDS_OUT_BUFSIZE		max(MDS_OUT_MAXREQSIZE + SPTLRPC_MAX_PAYLOAD, \
+				    24 * 1024)
+
+/** FLD_MAXREQSIZE == lustre_msg + __u32 padding + ptlrpc_body + opc */
+#define FLD_MAXREQSIZE  (160)
+
+/** FLD_MAXREPSIZE == lustre_msg + ptlrpc_body */
+#define FLD_MAXREPSIZE  (152)
+#define FLD_BUFSIZE	(1 << 12)
+
+/**
+ * SEQ_MAXREQSIZE == lustre_msg + __u32 padding + ptlrpc_body + opc + lu_range +
+ * __u32 padding */
+#define SEQ_MAXREQSIZE  (160)
+
+/** SEQ_MAXREPSIZE == lustre_msg + ptlrpc_body + lu_range */
+#define SEQ_MAXREPSIZE  (152)
+#define SEQ_BUFSIZE	(1 << 12)
+
+/** MGS threads must be >= 3, see bug 22458 comment #28 */
+#define MGS_NTHRS_INIT	(PTLRPC_NTHRS_INIT + 1)
+#define MGS_NTHRS_MAX	32
+
+#define MGS_NBUFS       64
+#define MGS_BUFSIZE     (8 * 1024)
+#define MGS_MAXREQSIZE  (7 * 1024)
+#define MGS_MAXREPSIZE  (9 * 1024)
+
+ /*
+  * OSS threads constants:
+  *
+  * Given 8 as factor and 64 as base threads number
+  *
+  * example 1):
+  * On 8-core server configured to 2 partitions, we will have
+  * 64 + 8 * 4 = 96 threads for each partition, 192 total threads.
+  *
+  * example 2):
+  * On 32-core machine configured to 4 partitions, we will have
+  * 64 + 8 * 8 = 128 threads for each partition, so total threads number
+  * will be 128 * 4 = 512.
+  *
+  * example 3):
+  * On 64-core machine configured to 4 partitions, we will have
+  * 64 + 16 * 8 = 192 threads for each partition, so total threads number
+  * will be 192 * 4 = 768 which is above limit OSS_NTHRS_MAX(512), so we
+  * cut off the value to OSS_NTHRS_MAX(512) / 4 which is 128 threads
+  * for each partition.
+  *
+  * So we can see that with these constants, threads number will be at the
+  * similar level of old versions, unless the server has many cores.
+  */
+ /* depress threads factor for VM with small memory size */
+#define OSS_THR_FACTOR		min_t(int, 8, \
+				NUM_CACHEPAGES >> (28 - PAGE_CACHE_SHIFT))
+#define OSS_NTHRS_INIT		(PTLRPC_NTHRS_INIT + 1)
+#define OSS_NTHRS_BASE		64
+#define OSS_NTHRS_MAX		512
+
+/* threads for handling "create" request */
+#define OSS_CR_THR_FACTOR	1
+#define OSS_CR_NTHRS_INIT	PTLRPC_NTHRS_INIT
+#define OSS_CR_NTHRS_BASE	8
+#define OSS_CR_NTHRS_MAX	64
+
+/**
+ * OST_IO_MAXREQSIZE ~=
+ *	lustre_msg + ptlrpc_body + obdo + obd_ioobj +
+ *	DT_MAX_BRW_PAGES * niobuf_remote
+ *
+ * - single object with 16 pages is 512 bytes
+ * - OST_IO_MAXREQSIZE must be at least 1 page of cookies plus some spillover
+ * - Must be a multiple of 1024
+ * - actual size is about 18K
+ */
+#define _OST_MAXREQSIZE_SUM (sizeof(struct lustre_msg) + \
+			     sizeof(struct ptlrpc_body) + \
+			     sizeof(struct obdo) + \
+			     sizeof(struct obd_ioobj) + \
+			     sizeof(struct niobuf_remote) * DT_MAX_BRW_PAGES)
+/**
+ * FIEMAP request can be 4K+ for now
+ */
+#define OST_MAXREQSIZE		(5 * 1024)
+#define OST_IO_MAXREQSIZE	max_t(int, OST_MAXREQSIZE, \
+				(((_OST_MAXREQSIZE_SUM - 1) | (1024 - 1)) + 1))
+
+#define OST_MAXREPSIZE		(9 * 1024)
+#define OST_IO_MAXREPSIZE	OST_MAXREPSIZE
+
+#define OST_NBUFS		64
+/** OST_BUFSIZE = max_reqsize + max sptlrpc payload size */
+#define OST_BUFSIZE		max_t(int, OST_MAXREQSIZE + 1024, 16 * 1024)
+/**
+ * OST_IO_MAXREQSIZE is 18K, giving extra 46K can increase buffer utilization
+ * rate of request buffer, please check comment of MDS_REG_BUFSIZE for details.
+ */
+#define OST_IO_BUFSIZE		max_t(int, OST_IO_MAXREQSIZE + 1024, 64 * 1024)
+
+/* Macro to hide a typecast; (req) is parenthesized to accept any expression. */
+#define ptlrpc_req_async_args(req) ((void *)&(req)->rq_async_args)
+
+/**
+ * Structure to define a single portal connection.
+ */
+struct ptlrpc_connection {
+	/** linkage for connections hash table */
+	struct hlist_node	c_hash;
+	/** Our own lnet nid for this connection */
+	lnet_nid_t	      c_self;
+	/** Remote side nid for this connection */
+	lnet_process_id_t       c_peer;
+	/** UUID of the other side */
+	struct obd_uuid	 c_remote_uuid;
+	/** reference counter for this connection */
+	atomic_t	    c_refcount;
+};
+
+/** Client definition for PortalRPC */
+struct ptlrpc_client {
+	/** What lnet portal does this client send messages to by default */
+	__u32		   cli_request_portal;
+	/** What portal do we expect replies on */
+	__u32		   cli_reply_portal;
+	/** Name of the client */
+	char		   *cli_name;
+};
+
+/** state flags of requests */
+/* XXX only ones left are those used by the bulk descs as well! */
+#define PTL_RPC_FL_INTR      (1 << 0)  /* reply wait was interrupted by user */
+#define PTL_RPC_FL_TIMEOUT   (1 << 7)  /* request timed out waiting for reply */
+
+#define REQ_MAX_ACK_LOCKS 8
+
+union ptlrpc_async_args {
+	/**
+	 * Scratchpad for passing args to completion interpreter. Users
+	 * cast to the struct of their choosing, and CLASSERT that this is
+	 * big enough.  For _tons_ of context, OBD_ALLOC a struct and store
+	 * a pointer to it here.  The pointer_arg ensures this struct is at
+	 * least big enough for that.
+	 */
+	void      *pointer_arg[11];
+	__u64      space[7];
+};
+
+struct ptlrpc_request_set;
+typedef int (*set_interpreter_func)(struct ptlrpc_request_set *, void *, int);
+typedef int (*set_producer_func)(struct ptlrpc_request_set *, void *);
+
+/**
+ * Definition of request set structure.
+ * Request set is a list of requests (not necessarily to the same target) that
+ * once populated with RPCs could be sent in parallel.
+ * There are two kinds of request sets. General purpose and with dedicated
+ * serving thread. Example of the latter is ptlrpcd set.
+ * For general purpose sets once request set started sending it is impossible
+ * to add new requests to such set.
+ * Provides a way to call "completion callbacks" when all requests in the set
+ * returned.
+ */
+struct ptlrpc_request_set {
+	atomic_t	  set_refcount;
+	/** number of in queue requests */
+	atomic_t	  set_new_count;
+	/** number of uncompleted requests */
+	atomic_t	  set_remaining;
+	/** wait queue to wait on for request events */
+	wait_queue_head_t	   set_waitq;
+	wait_queue_head_t	  *set_wakeup_ptr;
+	/** List of requests in the set */
+	struct list_head	    set_requests;
+	/**
+	 * List of completion callbacks to be called when the set is completed
+	 * This is only used if \a set_interpret is NULL.
+	 * Links struct ptlrpc_set_cbdata.
+	 */
+	struct list_head	    set_cblist;
+	/** Completion callback, if only one. */
+	set_interpreter_func  set_interpret;
+	/** opaq argument passed to completion \a set_interpret callback. */
+	void		 *set_arg;
+	/**
+	 * Lock for \a set_new_requests manipulations
+	 * locked so that any old caller can communicate requests to
+	 * the set holder who can then fold them into the lock-free set
+	 */
+	spinlock_t		set_new_req_lock;
+	/** List of new yet unsent requests. Only used with ptlrpcd now. */
+	struct list_head	    set_new_requests;
+
+	/** rq_status of requests that have been freed already */
+	int		   set_rc;
+	/** Additional fields used by the flow control extension */
+	/** Maximum number of RPCs in flight */
+	int		   set_max_inflight;
+	/** Callback function used to generate RPCs */
+	set_producer_func     set_producer;
+	/** opaq argument passed to the producer callback */
+	void		 *set_producer_arg;
+};
+
+/**
+ * Description of a single ptlrpc_set callback
+ */
+struct ptlrpc_set_cbdata {
+	/** List linkage item */
+	struct list_head	      psc_item;
+	/** Pointer to interpreting function */
+	set_interpreter_func    psc_interpret;
+	/** Opaq argument to pass to the callback */
+	void		   *psc_data;
+};
+
+struct ptlrpc_bulk_desc;
+struct ptlrpc_service_part;
+struct ptlrpc_service;
+
+/**
+ * ptlrpc callback & work item stuff
+ */
+struct ptlrpc_cb_id {
+	void   (*cbid_fn)(lnet_event_t *ev);     /* specific callback fn */
+	void    *cbid_arg;		      /* additional arg */
+};
+
+/** Maximum number of locks to fit into reply state */
+#define RS_MAX_LOCKS 8
+#define RS_DEBUG     0
+
+/**
+ * Structure to define reply state on the server
+ * Reply state holds various reply message information. Also for "difficult"
+ * replies (rep-ack case) we store the state after sending reply and wait
+ * for the client to acknowledge the reception. In these cases locks could be
+ * added to the state for replay/failover consistency guarantees.
+ */
+struct ptlrpc_reply_state {
+	/** Callback description */
+	struct ptlrpc_cb_id    rs_cb_id;
+	/** Linkage for list of all reply states in a system */
+	struct list_head	     rs_list;
+	/** Linkage for list of all reply states on same export */
+	struct list_head	     rs_exp_list;
+	/** Linkage for list of all reply states for same obd */
+	struct list_head	     rs_obd_list;
+#if RS_DEBUG
+	struct list_head	     rs_debug_list;
+#endif
+	/** A spinlock to protect the reply state flags */
+	spinlock_t		rs_lock;
+	/** Reply state flags */
+	unsigned long	  rs_difficult:1;     /* ACK/commit stuff */
+	unsigned long	  rs_no_ack:1;    /* no ACK, even for
+						  difficult requests */
+	unsigned long	  rs_scheduled:1;     /* being handled? */
+	unsigned long	  rs_scheduled_ever:1;/* any schedule attempts? */
+	unsigned long	  rs_handled:1;  /* been handled yet? */
+	unsigned long	  rs_on_net:1;   /* reply_out_callback pending? */
+	unsigned long	  rs_prealloc:1; /* rs from prealloc list */
+	unsigned long	  rs_committed:1;/* the transaction was committed
+						 and the rs was dispatched
+						 by ptlrpc_commit_replies */
+	/** Size of the state */
+	int		    rs_size;
+	/** opcode */
+	__u32		  rs_opc;
+	/** Transaction number */
+	__u64		  rs_transno;
+	/** xid */
+	__u64		  rs_xid;
+	struct obd_export     *rs_export;
+	struct ptlrpc_service_part *rs_svcpt;
+	/** Lnet metadata handle for the reply */
+	lnet_handle_md_t       rs_md_h;
+	atomic_t	   rs_refcount;
+
+	/** Context for the service thread */
+	struct ptlrpc_svc_ctx *rs_svc_ctx;
+	/** Reply buffer (actually sent to the client), encoded if needed */
+	struct lustre_msg     *rs_repbuf;       /* wrapper */
+	/** Size of the reply buffer */
+	int		    rs_repbuf_len;   /* wrapper buf length */
+	/** Size of the reply message */
+	int		    rs_repdata_len;  /* wrapper msg length */
+	/**
+	 * Actual reply message. Its content is encrypted (if needed) to
+	 * produce reply buffer for actual sending. In simple case
+	 * of no network encryption we just set \a rs_repbuf to \a rs_msg
+	 */
+	struct lustre_msg     *rs_msg;	  /* reply message */
+
+	/** Number of locks awaiting client ACK */
+	int		    rs_nlocks;
+	/** Handles of locks awaiting client reply ACK */
+	struct lustre_handle   rs_locks[RS_MAX_LOCKS];
+	/** Lock modes of locks in \a rs_locks */
+	ldlm_mode_t	    rs_modes[RS_MAX_LOCKS];
+};
+
+struct ptlrpc_thread;
+
+/** RPC stages */
+enum rq_phase {
+	RQ_PHASE_NEW	    = 0xebc0de00,
+	RQ_PHASE_RPC	    = 0xebc0de01,
+	RQ_PHASE_BULK	   = 0xebc0de02,
+	RQ_PHASE_INTERPRET      = 0xebc0de03,
+	RQ_PHASE_COMPLETE       = 0xebc0de04,
+	RQ_PHASE_UNREGISTERING  = 0xebc0de05,
+	RQ_PHASE_UNDEFINED      = 0xebc0de06
+};
+
+/** Type of request interpreter call-back */
+typedef int (*ptlrpc_interpterer_t)(const struct lu_env *env,
+				    struct ptlrpc_request *req,
+				    void *arg, int rc);
+
+/**
+ * Definition of request pool structure.
+ * The pool is used to store empty preallocated requests for the case
+ * when we would actually need to send something without performing
+ * any allocations (to avoid e.g. OOM).
+ */
+struct ptlrpc_request_pool {
+	/** Locks the list */
+	spinlock_t prp_lock;
+	/** list of ptlrpc_request structs */
+	struct list_head prp_req_list;
+	/** Maximum message size that would fit into a request from this pool */
+	int prp_rq_size;
+	/** Function to allocate more requests for this pool */
+	void (*prp_populate)(struct ptlrpc_request_pool *, int);
+};
+
+struct lu_context;
+struct lu_env;
+
+struct ldlm_lock;
+
+/**
+ * \defgroup nrs Network Request Scheduler
+ * @{
+ */
+struct ptlrpc_nrs_policy;
+struct ptlrpc_nrs_resource;
+struct ptlrpc_nrs_request;
+
+/**
+ * NRS control operations.
+ *
+ * These are common for all policies.
+ */
+enum ptlrpc_nrs_ctl {
+	/**
+	 * Not a valid opcode.
+	 */
+	PTLRPC_NRS_CTL_INVALID,
+	/**
+	 * Activate the policy.
+	 */
+	PTLRPC_NRS_CTL_START,
+	/**
+	 * Reserved for multiple primary policies, which may be a possibility
+	 * in the future.
+	 */
+	PTLRPC_NRS_CTL_STOP,
+	/**
+	 * Policies can start using opcodes from this value and onwards for
+	 * their own purposes; the assigned value itself is arbitrary.
+	 */
+	PTLRPC_NRS_CTL_1ST_POL_SPEC = 0x20,
+};
+
+/**
+ * ORR policy operations
+ */
+enum nrs_ctl_orr {
+	NRS_CTL_ORR_RD_QUANTUM = PTLRPC_NRS_CTL_1ST_POL_SPEC,
+	NRS_CTL_ORR_WR_QUANTUM,
+	NRS_CTL_ORR_RD_OFF_TYPE,
+	NRS_CTL_ORR_WR_OFF_TYPE,
+	NRS_CTL_ORR_RD_SUPP_REQ,
+	NRS_CTL_ORR_WR_SUPP_REQ,
+};
+
+/**
+ * NRS policy operations.
+ *
+ * These determine the behaviour of a policy, and are called in response to
+ * NRS core events.
+ */
+struct ptlrpc_nrs_pol_ops {
+	/**
+	 * Called during policy registration; this operation is optional.
+	 *
+	 * \param[in,out] policy The policy being initialized
+	 */
+	int	(*op_policy_init) (struct ptlrpc_nrs_policy *policy);
+	/**
+	 * Called during policy unregistration; this operation is optional.
+	 *
+	 * \param[in,out] policy The policy being unregistered/finalized
+	 */
+	void	(*op_policy_fini) (struct ptlrpc_nrs_policy *policy);
+	/**
+	 * Called when activating a policy via lprocfs; policies allocate and
+	 * initialize their resources here; this operation is optional.
+	 *
+	 * \param[in,out] policy The policy being started
+	 *
+	 * \see nrs_policy_start_locked()
+	 */
+	int	(*op_policy_start) (struct ptlrpc_nrs_policy *policy);
+	/**
+	 * Called when deactivating a policy via lprocfs; policies deallocate
+	 * their resources here; this operation is optional
+	 *
+	 * \param[in,out] policy The policy being stopped
+	 *
+	 * \see nrs_policy_stop0()
+	 */
+	void	(*op_policy_stop) (struct ptlrpc_nrs_policy *policy);
+	/**
+	 * Used for policy-specific operations; i.e. not generic ones like
+	 * \e PTLRPC_NRS_CTL_START and \e PTLRPC_NRS_CTL_GET_INFO; analogous
+	 * to an ioctl; this operation is optional.
+	 *
+	 * \param[in,out]	 policy The policy carrying out operation \a opc
+	 * \param[in]	  opc	 The command operation being carried out
+	 * \param[in,out] arg	 An generic buffer for communication between the
+	 *			 user and the control operation
+	 *
+	 * \retval -ve error
+	 * \retval   0 success
+	 *
+	 * \see ptlrpc_nrs_policy_control()
+	 */
+	int	(*op_policy_ctl) (struct ptlrpc_nrs_policy *policy,
+				  enum ptlrpc_nrs_ctl opc, void *arg);
+
+	/**
+	 * Called when obtaining references to the resources of the resource
+	 * hierarchy for a request that has arrived for handling at the PTLRPC
+	 * service. Policies should return -ve for requests they do not wish
+	 * to handle. This operation is mandatory.
+	 *
+	 * \param[in,out] policy  The policy we're getting resources for.
+	 * \param[in,out] nrq	  The request we are getting resources for.
+	 * \param[in]	  parent  The parent resource of the resource being
+	 *			  requested; set to NULL if none.
+	 * \param[out]	  resp	  The resource is to be returned here; the
+	 *			  fallback policy in an NRS head should
+	 *			  \e always return a non-NULL pointer value.
+	 * \param[in]  moving_req When set, signifies that this is an attempt
+	 *			  to obtain resources for a request being moved
+	 *			  to the high-priority NRS head by
+	 *			  ldlm_lock_reorder_req().
+	 *			  This implies two things:
+	 *			  1. We are under obd_export::exp_rpc_lock and
+	 *			  so should not sleep.
+	 *			  2. We should not perform non-idempotent or can
+	 *			  skip performing idempotent operations that
+	 *			  were carried out when resources were first
+	 *			  taken for the request when it was initialized
+	 *			  in ptlrpc_nrs_req_initialize().
+	 *
+	 * \retval 0, +ve The level of the returned resource in the resource
+	 *		  hierarchy; currently only 0 (for a non-leaf resource)
+	 *		  and 1 (for a leaf resource) are supported by the
+	 *		  framework.
+	 * \retval -ve	  error
+	 *
+	 * \see ptlrpc_nrs_req_initialize()
+	 * \see ptlrpc_nrs_hpreq_add_nolock()
+	 * \see ptlrpc_nrs_req_hp_move()
+	 */
+	int	(*op_res_get) (struct ptlrpc_nrs_policy *policy,
+			       struct ptlrpc_nrs_request *nrq,
+			       const struct ptlrpc_nrs_resource *parent,
+			       struct ptlrpc_nrs_resource **resp,
+			       bool moving_req);
+	/**
+	 * Called when releasing references taken for resources in the resource
+	 * hierarchy for the request; this operation is optional.
+	 *
+	 * \param[in,out] policy The policy the resource belongs to
+	 * \param[in] res	 The resource to be freed
+	 *
+	 * \see ptlrpc_nrs_req_finalize()
+	 * \see ptlrpc_nrs_hpreq_add_nolock()
+	 * \see ptlrpc_nrs_req_hp_move()
+	 */
+	void	(*op_res_put) (struct ptlrpc_nrs_policy *policy,
+			       const struct ptlrpc_nrs_resource *res);
+
+	/**
+	 * Obtains a request for handling from the policy, and optionally
+	 * removes the request from the policy; this operation is mandatory.
+	 *
+	 * \param[in,out] policy The policy to poll
+	 * \param[in]	  peek	 When set, signifies that we just want to
+	 *			 examine the request, and not handle it, so the
+	 *			 request is not removed from the policy.
+	 * \param[in]	  force	 When set, it will force a policy to return a
+	 *			 request if it has one queued.
+	 *
+	 * \retval NULL No request available for handling
+	 * \retval valid-pointer The request polled for handling
+	 *
+	 * \see ptlrpc_nrs_req_get_nolock()
+	 */
+	struct ptlrpc_nrs_request *
+		(*op_req_get) (struct ptlrpc_nrs_policy *policy, bool peek,
+			       bool force);
+	/**
+	 * Called when attempting to add a request to a policy for later
+	 * handling; this operation is mandatory.
+	 *
+	 * \param[in,out] policy  The policy on which to enqueue \a nrq
+	 * \param[in,out] nrq The request to enqueue
+	 *
+	 * \retval 0	success
+	 * \retval != 0	error
+	 *
+	 * \see ptlrpc_nrs_req_add_nolock()
+	 */
+	int	(*op_req_enqueue) (struct ptlrpc_nrs_policy *policy,
+				   struct ptlrpc_nrs_request *nrq);
+	/**
+	 * Removes a request from the policy's set of pending requests. Normally
+	 * called after a request has been polled successfully from the policy
+	 * for handling; this operation is mandatory.
+	 *
+	 * \param[in,out] policy The policy the request \a nrq belongs to
+	 * \param[in,out] nrq    The request to dequeue
+	 *
+	 * \see ptlrpc_nrs_req_del_nolock()
+	 */
+	void	(*op_req_dequeue) (struct ptlrpc_nrs_policy *policy,
+				   struct ptlrpc_nrs_request *nrq);
+	/**
+	 * Called after the request has been carried out. Could be used for
+	 * job/resource control; this operation is optional.
+	 *
+	 * \param[in,out] policy The policy which is stopping to handle request
+	 *			 \a nrq
+	 * \param[in,out] nrq	 The request
+	 *
+	 * \pre spin_is_locked(&svcpt->scp_req_lock)
+	 *
+	 * \see ptlrpc_nrs_req_stop_nolock()
+	 */
+	void	(*op_req_stop) (struct ptlrpc_nrs_policy *policy,
+				struct ptlrpc_nrs_request *nrq);
+	/**
+	 * Registers the policy's lprocfs interface with a PTLRPC service.
+	 *
+	 * \param[in] svc The service
+	 *
+	 * \retval 0	success
+	 * \retval != 0	error
+	 */
+	int	(*op_lprocfs_init) (struct ptlrpc_service *svc);
+	/**
+	 * Unregisters the policy's lprocfs interface with a PTLRPC service.
+	 *
+	 * In cases of failed policy registration in
+	 * \e ptlrpc_nrs_policy_register(), this function may be called for a
+	 * service which has not registered the policy successfully, so
+	 * implementations of this method should make sure their operations are
+	 * safe in such cases.
+	 *
+	 * \param[in] svc The service
+	 */
+	void	(*op_lprocfs_fini) (struct ptlrpc_service *svc);
+};
+
+/**
+ * Policy flags
+ *
+ * Each flag occupies its own bit, so several flags may be OR-ed together
+ * into a bitmask such as ptlrpc_nrs_pol_conf::nc_flags.
+ */
+enum nrs_policy_flags {
+	/**
+	 * Fallback policy; use this flag on only a single supported policy
+	 * per service. The flag cannot be used on policies that use
+	 * \e PTLRPC_NRS_FL_REG_EXTERN.
+	 */
+	PTLRPC_NRS_FL_FALLBACK		= (1 << 0),
+	/**
+	 * Start policy immediately after registering.
+	 */
+	PTLRPC_NRS_FL_REG_START		= (1 << 1),
+	/**
+	 * This is a policy registering from a module different to the one NRS
+	 * core ships in (currently ptlrpc).
+	 */
+	PTLRPC_NRS_FL_REG_EXTERN	= (1 << 2),
+};
+
+/**
+ * NRS queue type.
+ *
+ * Denotes whether an NRS instance is for handling normal or high-priority
+ * RPCs, or whether an operation pertains to one or both of the NRS instances
+ * in a service.
+ *
+ * The regular and high-priority values are distinct bits, and
+ * PTLRPC_NRS_QUEUE_BOTH is their bitwise OR.
+ */
+enum ptlrpc_nrs_queue_type {
+	PTLRPC_NRS_QUEUE_REG	= (1 << 0),
+	PTLRPC_NRS_QUEUE_HP	= (1 << 1),
+	PTLRPC_NRS_QUEUE_BOTH	= (PTLRPC_NRS_QUEUE_REG | PTLRPC_NRS_QUEUE_HP)
+};
+
+/**
+ * NRS head
+ *
+ * A PTLRPC service has at least one NRS head instance for handling normal
+ * priority RPCs, and may optionally have a second NRS head instance for
+ * handling high-priority RPCs. Each NRS head maintains a list of available
+ * policies, of which one and only one policy is acting as the fallback policy,
+ * and optionally a different policy may be acting as the primary policy. For
+ * all RPCs handled by this NRS head instance, NRS core will first attempt to
+ * enqueue the RPC using the primary policy (if any). The fallback policy is
+ * used in the following cases:
+ * - when there was no primary policy in the
+ *   ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED state at the time the request
+ *   was initialized.
+ * - when the primary policy that was at the
+ *   ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED state at the time the
+ *   RPC was initialized, indicated that it did not wish, or for some other
+ *   reason was not able, to handle the request, by returning a non-valid NRS
+ *   resource reference.
+ * - when the primary policy that was at the
+ *   ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED state at the time the
+ *   RPC was initialized, fails later during the request enqueueing stage.
+ *
+ * \see nrs_resource_get_safe()
+ * \see nrs_request_enqueue()
+ */
+struct ptlrpc_nrs {
+	spinlock_t			nrs_lock;
+	/** XXX Possibly replace svcpt->scp_req_lock with another lock here. */
+	/**
+	 * List of registered policies
+	 */
+	struct list_head			nrs_policy_list;
+	/**
+	 * List of policies with queued requests. Policies that have any
+	 * outstanding requests are queued here, and this list is queried
+	 * in a round-robin manner from NRS core when obtaining a request
+	 * for handling. This ensures that requests from policies that at some
+	 * point transition away from the
+	 * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED state are drained.
+	 */
+	struct list_head			nrs_policy_queued;
+	/**
+	 * Service partition for this NRS head
+	 */
+	struct ptlrpc_service_part     *nrs_svcpt;
+	/**
+	 * Primary policy, which is the preferred policy for handling RPCs
+	 */
+	struct ptlrpc_nrs_policy       *nrs_policy_primary;
+	/**
+	 * Fallback policy, which is the backup policy for handling RPCs
+	 */
+	struct ptlrpc_nrs_policy       *nrs_policy_fallback;
+	/**
+	 * This NRS head handles either HP or regular requests
+	 */
+	enum ptlrpc_nrs_queue_type	nrs_queue_type;
+	/**
+	 * # queued requests from all policies in this NRS head
+	 */
+	unsigned long			nrs_req_queued;
+	/**
+	 * # scheduled requests from all policies in this NRS head
+	 */
+	unsigned long			nrs_req_started;
+	/**
+	 * # policies on this NRS
+	 */
+	unsigned			nrs_num_pols;
+	/**
+	 * This NRS head is in progress of starting a policy
+	 */
+	unsigned			nrs_policy_starting:1;
+	/**
+	 * In progress of shutting down the whole NRS head; used during
+	 * unregistration
+	 */
+	unsigned			nrs_stopping:1;
+};
+
+#define NRS_POL_NAME_MAX		16
+
+struct ptlrpc_nrs_pol_desc;
+
+/**
+ * Service compatibility predicate; this determines whether a policy is adequate
+ * for handling RPCs of a particular PTLRPC service.
+ *
+ * XXX: This should give the same result during policy registration and
+ * unregistration, and for all partitions of a service; so the result should not
+ * depend on temporal service or other properties that may influence the
+ * result.
+ *
+ * \param[in] svc  The service to test the policy against
+ * \param[in] desc The descriptor of the policy being tested
+ *
+ * NOTE(review): presumably returns true when the policy is compatible with
+ * the service -- confirm against the predicate implementations.
+ */
+typedef bool (*nrs_pol_desc_compat_t) (const struct ptlrpc_service *svc,
+				       const struct ptlrpc_nrs_pol_desc *desc);
+
+struct ptlrpc_nrs_pol_conf {
+	/**
+	 * Human-readable policy name; holds at most NRS_POL_NAME_MAX bytes
+	 */
+	char				   nc_name[NRS_POL_NAME_MAX];
+	/**
+	 * NRS operations for this policy
+	 */
+	const struct ptlrpc_nrs_pol_ops	  *nc_ops;
+	/**
+	 * Service compatibility predicate
+	 */
+	nrs_pol_desc_compat_t		   nc_compat;
+	/**
+	 * Set for policies that support a single ptlrpc service, i.e. ones that
+	 * have \a nc_compat set to nrs_policy_compat_one(). The variable value
+	 * depicts the name of the single service that such policies are
+	 * compatible with.
+	 */
+	const char			  *nc_compat_svc_name;
+	/**
+	 * Owner module for this policy descriptor; policies registering from a
+	 * different module to the one the NRS framework is held within
+	 * (currently ptlrpc), should set this field to THIS_MODULE.
+	 */
+	module_t			  *nc_owner;
+	/**
+	 * Policy registration flags; a bitmask of \e nrs_policy_flags
+	 */
+	unsigned			   nc_flags;
+};
+
+/**
+ * NRS policy registering descriptor
+ *
+ * Is used to hold a description of a policy that can be passed to NRS core in
+ * order to register the policy with NRS heads in different PTLRPC services.
+ */
+struct ptlrpc_nrs_pol_desc {
+	/**
+	 * Human-readable policy name; holds at most NRS_POL_NAME_MAX bytes
+	 */
+	char					pd_name[NRS_POL_NAME_MAX];
+	/**
+	 * Link into nrs_core::nrs_policies
+	 */
+	struct list_head				pd_list;
+	/**
+	 * NRS operations for this policy
+	 */
+	const struct ptlrpc_nrs_pol_ops	       *pd_ops;
+	/**
+	 * Service compatibility predicate
+	 */
+	nrs_pol_desc_compat_t			pd_compat;
+	/**
+	 * Set for policies that are compatible with only one PTLRPC service.
+	 *
+	 * \see ptlrpc_nrs_pol_conf::nc_compat_svc_name
+	 */
+	const char			       *pd_compat_svc_name;
+	/**
+	 * Owner module for this policy descriptor.
+	 *
+	 * We need to hold a reference to the module whenever we might make use
+	 * of any of the module's contents, i.e.
+	 * - If one or more instances of the policy are at a state where they
+	 *   might be handling a request, i.e.
+	 *   ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED or
+	 *   ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING as we will have to
+	 *   call into the policy's ptlrpc_nrs_pol_ops() handlers. A reference
+	 *   is taken on the module when
+	 *   \e ptlrpc_nrs_pol_desc::pd_refs becomes 1, and released when it
+	 *   becomes 0, so that we hold only one reference to the module maximum
+	 *   at any time.
+	 *
+	 * We do not need to hold a reference to the module, even though we
+	 * might use code and data from the module, in the following cases:
+	 * - During external policy registration, because this should happen in
+	 *   the module's init() function, in which case the module is safe from
+	 *   removal because a reference is being held on the module by the
+	 *   kernel, and iirc kmod (and I guess module-init-tools also) will
+	 *   serialize any racing processes properly anyway.
+	 * - During external policy unregistration, because this should happen
+	 *   in a module's exit() function, and any attempts to start a policy
+	 *   instance would need to take a reference on the module, and this is
+	 *   not possible once we have reached the point where the exit()
+	 *   handler is called.
+	 * - During service registration and unregistration, as service setup
+	 *   and cleanup, and policy registration, unregistration and policy
+	 *   instance starting, are serialized by \e nrs_core::nrs_mutex, so
+	 *   as long as users adhere to the convention of registering policies
+	 *   in init() and unregistering them in module exit() functions, there
+	 *   should not be a race between these operations.
+	 * - During any policy-specific lprocfs operations, because a reference
+	 *   is held by the kernel on a proc entry that has been entered by a
+	 *   syscall, so as long as proc entries are removed during
+	 *   unregistration time, then unregistration and lprocfs operations
+	 *   will be properly serialized.
+	 */
+	module_t			       *pd_owner;
+	/**
+	 * Bitmask of \e nrs_policy_flags
+	 */
+	unsigned				pd_flags;
+	/**
+	 * # of references on this descriptor
+	 */
+	atomic_t				pd_refs;
+};
+
+/**
+ * NRS policy state
+ *
+ * Policies transition from one state to the other during their lifetime
+ */
+enum ptlrpc_nrs_pol_state {
+	/**
+	 * Not a valid policy state.
+	 */
+	NRS_POL_STATE_INVALID,
+	/**
+	 * Policies are at this state either at the start of their life, or
+	 * transition here when the user selects a different policy to act
+	 * as the primary one.
+	 */
+	NRS_POL_STATE_STOPPED,
+	/**
+	 * Policy is in progress of stopping
+	 */
+	NRS_POL_STATE_STOPPING,
+	/**
+	 * Policy is in progress of starting
+	 */
+	NRS_POL_STATE_STARTING,
+	/**
+	 * A policy is in this state in two cases:
+	 * - it is the fallback policy, which is always in this state.
+	 * - it has been activated by the user; i.e. it is the primary policy.
+	 */
+	NRS_POL_STATE_STARTED,
+};
+
+/**
+ * NRS policy information
+ *
+ * Used for obtaining information about the status of a policy via lprocfs
+ */
+struct ptlrpc_nrs_pol_info {
+	/**
+	 * Policy name; holds at most NRS_POL_NAME_MAX bytes
+	 */
+	char				pi_name[NRS_POL_NAME_MAX];
+	/**
+	 * Current policy state
+	 */
+	enum ptlrpc_nrs_pol_state	pi_state;
+	/**
+	 * # RPCs enqueued for later dispatching by the policy
+	 */
+	long				pi_req_queued;
+	/**
+	 * # RPCs started for dispatch by the policy
+	 */
+	long				pi_req_started;
+	/**
+	 * Is this a fallback policy?
+	 */
+	unsigned			pi_fallback:1;
+};
+
+/**
+ * NRS policy
+ *
+ * There is one instance of this for each policy in each NRS head of each
+ * PTLRPC service partition.
+ */
+struct ptlrpc_nrs_policy {
+	/**
+	 * Linkage into the NRS head's list of policies,
+	 * ptlrpc_nrs::nrs_policy_list
+	 */
+	struct list_head			pol_list;
+	/**
+	 * Linkage into the NRS head's list of policies with enqueued
+	 * requests, ptlrpc_nrs::nrs_policy_queued
+	 */
+	struct list_head			pol_list_queued;
+	/**
+	 * Current state of this policy
+	 */
+	enum ptlrpc_nrs_pol_state	pol_state;
+	/**
+	 * Bitmask of nrs_policy_flags
+	 */
+	unsigned			pol_flags;
+	/**
+	 * # RPCs enqueued for later dispatching by the policy
+	 */
+	long				pol_req_queued;
+	/**
+	 * # RPCs started for dispatch by the policy
+	 */
+	long				pol_req_started;
+	/**
+	 * Usage reference count taken on the policy instance
+	 */
+	long				pol_ref;
+	/**
+	 * The NRS head this policy has been created at
+	 */
+	struct ptlrpc_nrs	       *pol_nrs;
+	/**
+	 * Private policy data; varies by policy type
+	 */
+	void			       *pol_private;
+	/**
+	 * Policy descriptor for this policy instance.
+	 */
+	struct ptlrpc_nrs_pol_desc     *pol_desc;
+};
+
+/**
+ * NRS resource
+ *
+ * Resources are embedded into two types of NRS entities:
+ * - Inside NRS policies, in the policy's private data in
+ *   ptlrpc_nrs_policy::pol_private
+ * - In objects that act as prime-level scheduling entities in different NRS
+ *   policies; e.g. on a policy that performs round robin or similar order
+ *   scheduling across client NIDs, there would be one NRS resource per unique
+ *   client NID. On a policy which performs round robin scheduling across
+ *   backend filesystem objects, there would be one resource associated with
+ *   each of the backend filesystem objects partaking in the scheduling
+ *   performed by the policy.
+ *
+ * NRS resources share a parent-child relationship, in which resources embedded
+ * in policy instances are the parent entities, with all scheduling entities
+ * a policy schedules across being the children, thus forming a simple resource
+ * hierarchy. This hierarchy may be extended with one or more levels in the
+ * future if the ability to have more than one primary policy is added.
+ *
+ * Upon request initialization, references to the then active NRS policies are
+ * taken and used to later handle the dispatching of the request with one of
+ * these policies.
+ *
+ * \see nrs_resource_get_safe()
+ * \see ptlrpc_nrs_req_add()
+ * \see ptlrpc_nrs_request::nr_res_ptrs
+ */
+struct ptlrpc_nrs_resource {
+	/**
+	 * This NRS resource's parent; is NULL for resources embedded in NRS
+	 * policy instances; i.e. those are top-level ones.
+	 */
+	struct ptlrpc_nrs_resource     *res_parent;
+	/**
+	 * The policy associated with this resource.
+	 */
+	struct ptlrpc_nrs_policy       *res_policy;
+};
+
+enum {
+	NRS_RES_FALLBACK,	/* NOTE(review): presumably fallback policy slot */
+	NRS_RES_PRIMARY,	/* NOTE(review): presumably primary policy slot */
+	NRS_RES_MAX		/* sizes ptlrpc_nrs_request::nr_res_ptrs[] */
+};
+
+/**
+ * \name fifo
+ *
+ * FIFO policy
+ *
+ * This policy is a logical wrapper around previous, non-NRS functionality.
+ * It dispatches RPCs in the same order as they arrive from the network. This
+ * policy is currently used as the fallback policy, and the only enabled policy
+ * on all NRS heads of all PTLRPC service partitions.
+ * @{
+ */
+
+/**
+ * Private data structure for the FIFO policy
+ *
+ * NOTE(review): presumably reached through ptlrpc_nrs_policy::pol_private;
+ * confirm against the FIFO policy implementation.
+ */
+struct nrs_fifo_head {
+	/**
+	 * Resource object for policy instance.
+	 */
+	struct ptlrpc_nrs_resource	fh_res;
+	/**
+	 * List of queued requests.
+	 */
+	struct list_head			fh_list;
+	/**
+	 * For debugging purposes.
+	 */
+	__u64				fh_sequence;
+};
+
+struct nrs_fifo_req {
+	struct list_head		fr_list;	/* NOTE(review): presumably links into fh_list */
+	__u64			fr_sequence;	/* NOTE(review): presumably from fh_sequence */
+};
+
+/** @} fifo */
+
+/**
+ * \name CRR-N
+ *
+ * CRR-N, Client Round Robin over NIDs
+ * @{
+ */
+
+/**
+ * Private data structure for the CRR-N NRS policy
+ *
+ * Embeds an NRS resource (\a cn_res) and holds pointers to a binary heap
+ * (\a cn_binheap) and a hash (\a cn_cli_hash).
+ * NOTE(review): presumably the heap orders queued requests and the hash
+ * indexes nrs_crrn_client objects -- confirm against the CRR-N policy code.
+ */
+struct nrs_crrn_net {
+	struct ptlrpc_nrs_resource	cn_res;
+	cfs_binheap_t		       *cn_binheap;
+	cfs_hash_t		       *cn_cli_hash;
+	/**
+	 * Used when a new scheduling round commences, in order to synchronize
+	 * all clients with the new round number.
+	 */
+	__u64				cn_round;
+	/**
+	 * Determines the relevant ordering amongst request batches within a
+	 * scheduling round.
+	 */
+	__u64				cn_sequence;
+	/**
+	 * Round Robin quantum; the maximum number of RPCs that each request
+	 * batch for each client can have in a scheduling round.
+	 */
+	__u16				cn_quantum;
+};
+
+/**
+ * Object representing a client in CRR-N, as identified by its NID
+ *
+ * Embeds an NRS resource (\a cc_res), a hash list node (\a cc_hnode), the
+ * client's NID (\a cc_nid) and a reference count (\a cc_ref).
+ * NOTE(review): \a cc_hnode presumably links the client into
+ * nrs_crrn_net::cn_cli_hash -- confirm against the CRR-N policy code.
+ */
+struct nrs_crrn_client {
+	struct ptlrpc_nrs_resource	cc_res;
+	struct hlist_node		cc_hnode;
+	lnet_nid_t			cc_nid;
+	/**
+	 * The round number against which this client is currently scheduling
+	 * requests.
+	 */
+	__u64				cc_round;
+	/**
+	 * The sequence number used for requests scheduled by this client during
+	 * the current round number.
+	 */
+	__u64				cc_sequence;
+	atomic_t			cc_ref;
+	/**
+	 * Round Robin quantum; the maximum number of RPCs the client is allowed
+	 * to schedule in a single batch of each round.
+	 */
+	__u16				cc_quantum;
+	/**
+	 * # of pending requests for this client, on all existing rounds
+	 */
+	__u16				cc_active;
+};
+
+/**
+ * CRR-N NRS request definition
+ *
+ * Embedded in ptlrpc_nrs_request as the \a crr member of the \a nr_u union.
+ */
+struct nrs_crrn_req {
+	/**
+	 * Round number for this request; shared with all other requests in the
+	 * same batch.
+	 */
+	__u64			cr_round;
+	/**
+	 * Sequence number for this request; shared with all other requests in
+	 * the same batch.
+	 */
+	__u64			cr_sequence;
+};
+
+/**
+ * CRR-N policy operations.
+ *
+ * Numbering starts at PTLRPC_NRS_CTL_1ST_POL_SPEC and continues
+ * consecutively from there.
+ */
+enum nrs_ctl_crr {
+	/**
+	 * Read the RR quantum size of a CRR-N policy.
+	 */
+	NRS_CTL_CRRN_RD_QUANTUM = PTLRPC_NRS_CTL_1ST_POL_SPEC,
+	/**
+	 * Write the RR quantum size of a CRR-N policy.
+	 */
+	NRS_CTL_CRRN_WR_QUANTUM,
+};
+
+/** @} CRR-N */
+
+/**
+ * \name ORR/TRR
+ *
+ * ORR/TRR (Object-based Round Robin/Target-based Round Robin) NRS policies
+ * @{
+ */
+
+/**
+ * Lower and upper byte offsets of a brw RPC
+ *
+ * Whether the offsets are logical or physical is recorded by
+ * nrs_orr_req::or_logical_set and nrs_orr_req::or_physical_set.
+ */
+struct nrs_orr_req_range {
+	__u64		or_start;
+	__u64		or_end;
+};
+
+/**
+ * RPC types supported by the ORR/TRR policies
+ *
+ * Read and write are distinct bits; NOS_OST_RW is their bitwise OR.
+ */
+enum nrs_orr_supp {
+	NOS_OST_READ  = (1 << 0),
+	NOS_OST_WRITE = (1 << 1),
+	NOS_OST_RW    = (NOS_OST_READ | NOS_OST_WRITE),
+	/**
+	 * Default value for policies; currently reads only.
+	 */
+	NOS_DFLT      = NOS_OST_READ
+};
+
+/**
+ * As unique keys for grouping RPCs together, we use the object's OST FID for
+ * the ORR policy, and the OST index for the TRR policy.
+ *
+ * XXX: We waste some space for TRR policy instances by using a union, but it
+ *	allows us to consolidate some of the code between ORR and TRR, and
+ *	these policies will probably eventually merge into one anyway.
+ */
+struct nrs_orr_key {
+	union {
+		/** object FID for ORR */
+		struct lu_fid	ok_fid;
+		/** OST index for TRR */
+		__u32		ok_idx;
+	};
+};
+
+/**
+ * The largest base string for unique hash/slab object names is
+ * "nrs_orr_reg_", whose sizeof is 13 bytes (12 characters plus the
+ * terminating NUL). We add 3 to this to be used for the CPT
+ * id number, so this _should_ be more than enough for the maximum number of
+ * CPTs on any system. If it does happen that this statement is incorrect,
+ * nrs_orr_genobjname() will inevitably yield a non-unique name and cause
+ * kmem_cache_create() to complain (on Linux), so the erroneous situation
+ * will hopefully not go unnoticed.
+ */
+#define NRS_ORR_OBJ_NAME_MAX	(sizeof("nrs_orr_reg_") + 3)
+
+/**
+ * Private data structure for the ORR and TRR NRS policies
+ */
+struct nrs_orr_data {
+	struct ptlrpc_nrs_resource	od_res;
+	cfs_binheap_t		       *od_binheap;
+	cfs_hash_t		       *od_obj_hash;
+	struct kmem_cache		       *od_cache;
+	/**
+	 * Used when a new scheduling round commences, in order to synchronize
+	 * all object or OST batches with the new round number.
+	 */
+	__u64				od_round;
+	/**
+	 * Determines the relevant ordering amongst request batches within a
+	 * scheduling round.
+	 */
+	__u64				od_sequence;
+	/**
+	 * RPC types that are currently supported.
+	 */
+	enum nrs_orr_supp		od_supp;
+	/**
+	 * Round Robin quantum; the maximum number of RPCs that each request
+	 * batch for each object or OST can have in a scheduling round.
+	 */
+	__u16				od_quantum;
+	/**
+	 * Whether to use physical disk offsets or logical file offsets.
+	 */
+	bool				od_physical;
+	/**
+	 * XXX: We need to provide a persistently allocated string to hold
+	 * unique object names for this policy, since in currently supported
+	 * versions of Linux by Lustre, kmem_cache_create() just sets a pointer
+	 * to the name string provided. kstrdup() is used in the version of
+	 * kmem_cache_create() in current Linux mainline, so we may be able to
+	 * remove this in the future.
+	 */
+	char				od_objname[NRS_ORR_OBJ_NAME_MAX];
+};
+
+/**
+ * Represents a backend-fs object or OST in the ORR and TRR policies
+ * respectively
+ *
+ * Embeds an NRS resource (\a oo_res), a hash list node (\a oo_hnode) and a
+ * reference count (\a oo_ref).
+ * NOTE(review): \a oo_hnode presumably links the object into
+ * nrs_orr_data::od_obj_hash -- confirm against the ORR/TRR policy code.
+ */
+struct nrs_orr_object {
+	struct ptlrpc_nrs_resource	oo_res;
+	struct hlist_node		oo_hnode;
+	/**
+	 * The round number against which requests are being scheduled for this
+	 * object or OST
+	 */
+	__u64				oo_round;
+	/**
+	 * The sequence number used for requests scheduled for this object or
+	 * OST during the current round number.
+	 */
+	__u64				oo_sequence;
+	/**
+	 * The key of the object or OST for which this structure instance is
+	 * scheduling RPCs
+	 */
+	struct nrs_orr_key		oo_key;
+	atomic_t			oo_ref;
+	/**
+	 * Round Robin quantum; the maximum number of RPCs that are allowed to
+	 * be scheduled for the object or OST in a single batch of each round.
+	 */
+	__u16				oo_quantum;
+	/**
+	 * # of pending requests for this object or OST, on all existing rounds
+	 */
+	__u16				oo_active;
+};
+
+/**
+ * ORR/TRR NRS request definition
+ *
+ * Embedded in ptlrpc_nrs_request as the \a orr member of the \a nr_u union;
+ * ORR and TRR share this definition.
+ */
+struct nrs_orr_req {
+	/**
+	 * The offset range this request covers
+	 */
+	struct nrs_orr_req_range	or_range;
+	/**
+	 * Round number for this request; shared with all other requests in the
+	 * same batch.
+	 */
+	__u64				or_round;
+	/**
+	 * Sequence number for this request; shared with all other requests in
+	 * the same batch.
+	 */
+	__u64				or_sequence;
+	/**
+	 * For debugging purposes.
+	 */
+	struct nrs_orr_key		or_key;
+	/**
+	 * An ORR policy instance has filled in request information while
+	 * enqueueing the request on the service partition's regular NRS head.
+	 */
+	unsigned int			or_orr_set:1;
+	/**
+	 * A TRR policy instance has filled in request information while
+	 * enqueueing the request on the service partition's regular NRS head.
+	 */
+	unsigned int			or_trr_set:1;
+	/**
+	 * Request offset ranges have been filled in with logical offset
+	 * values.
+	 */
+	unsigned int			or_logical_set:1;
+	/**
+	 * Request offset ranges have been filled in with physical offset
+	 * values.
+	 */
+	unsigned int			or_physical_set:1;
+};
+
+/** @} ORR/TRR */
+
+/**
+ * NRS request
+ *
+ * Instances of this object exist embedded within ptlrpc_request; the main
+ * purpose of this object is to hold references to the request's resources
+ * for the lifetime of the request, and to hold properties that policies
+ * use for determining the request's scheduling priority.
+ */
+struct ptlrpc_nrs_request {
+	/**
+	 * The request's resource hierarchy.
+	 */
+	struct ptlrpc_nrs_resource     *nr_res_ptrs[NRS_RES_MAX];
+	/**
+	 * Index into ptlrpc_nrs_request::nr_res_ptrs of the resource of the
+	 * policy that was used to enqueue the request.
+	 *
+	 * \see nrs_request_enqueue()
+	 */
+	unsigned			nr_res_idx;
+	unsigned			nr_initialized:1;
+	unsigned			nr_enqueued:1;
+	unsigned			nr_started:1;
+	unsigned			nr_finalized:1;
+	cfs_binheap_node_t		nr_node;
+
+	/**
+	 * Policy-specific fields, used for determining a request's scheduling
+	 * priority, and other supporting functionality.
+	 */
+	union {
+		/**
+		 * Fields for the FIFO policy
+		 */
+		struct nrs_fifo_req	fifo;
+		/**
+		 * CRR-N request definition
+		 */
+		struct nrs_crrn_req	crr;
+		/** ORR and TRR share the same request definition */
+		struct nrs_orr_req	orr;
+	} nr_u;
+	/**
+	 * Externally-registering policies may want to use this to allocate
+	 * their own request properties.
+	 */
+	void			       *ext;
+};
+
+/** @} nrs */
+
+/**
+ * Basic request prioritization operations structure.
+ * The whole idea is centered around locks and RPCs that might affect locks.
+ * When a lock is contended we try to give priority to RPCs that might lead
+ * to fastest release of that lock.
+ * Currently implemented only for OSTs, in a way that gives all IO and
+ * truncate RPCs that are coming from a locked region where a lock is
+ * contended priority over other requests.
+ */
+struct ptlrpc_hpreq_ops {
+	/**
+	 * Check if the lock handle of the given lock is the same as
+	 * taken from the request.
+	 */
+	int  (*hpreq_lock_match)(struct ptlrpc_request *, struct ldlm_lock *);
+	/**
+	 * Check if the request is a high priority one.
+	 */
+	int  (*hpreq_check)(struct ptlrpc_request *);
+	/**
+	 * Called after the request has been handled.
+	 */
+	void (*hpreq_fini)(struct ptlrpc_request *);
+};
+
+/**
+ * Represents remote procedure call.
+ *
+ * This is a staple structure used by everybody wanting to send a request
+ * in Lustre.
+ */
+struct ptlrpc_request {
+	/* Request type: one of PTL_RPC_MSG_* */
+	int rq_type;
+	/** Result of request processing */
+	int rq_status;
+	/**
+	 * Linkage item through which this request is included into
+	 * sending/delayed lists on client and into rqbd list on server
+	 */
+	struct list_head rq_list;
+	/**
+	 * Server side list of incoming unserved requests sorted by arrival
+	 * time.  Traversed from time to time to notice about to expire
+	 * requests and sent back "early replies" to clients to let them
+	 * know server is alive and well, just very busy to service their
+	 * requests in time
+	 */
+	struct list_head rq_timed_list;
+	/** server-side history, used for debugging purposes. */
+	struct list_head rq_history_list;
+	/** server-side per-export list */
+	struct list_head rq_exp_list;
+	/** server-side hp handlers */
+	struct ptlrpc_hpreq_ops *rq_ops;
+
+	/** initial thread servicing this request */
+	struct ptlrpc_thread *rq_svc_thread;
+
+	/** history sequence # */
+	__u64 rq_history_seq;
+	/** \addtogroup  nrs
+	 * @{
+	 */
+	/** stub for NRS request */
+	struct ptlrpc_nrs_request rq_nrq;
+	/** @} nrs */
+	/** the index of service's srv_at_array into which request is linked */
+	time_t rq_at_index;
+	/** Lock to protect request flags and some other important bits, like
+	 * rq_list
+	 */
+	spinlock_t rq_lock;
+	/** client-side flags are serialized by rq_lock */
+	unsigned int rq_intr:1, rq_replied:1, rq_err:1,
+		rq_timedout:1, rq_resend:1, rq_restart:1,
+		/**
+		 * when ->rq_replay is set, request is kept by the client even
+		 * after server commits corresponding transaction. This is
+		 * used for operations that require sequence of multiple
+		 * requests to be replayed. The only example currently is file
+		 * open/close. When last request in such a sequence is
+		 * committed, ->rq_replay is cleared on all requests in the
+		 * sequence.
+		 */
+		rq_replay:1,
+		rq_no_resend:1, rq_waiting:1, rq_receiving_reply:1,
+		rq_no_delay:1, rq_net_err:1, rq_wait_ctx:1,
+		rq_early:1, rq_must_unlink:1,
+		rq_memalloc:1,      /* req originated from "kswapd" */
+		/* server-side flags */
+		rq_packed_final:1,  /* packed final reply */
+		rq_hp:1,	    /* high priority RPC */
+		rq_at_linked:1,     /* link into service's srv_at_array */
+		rq_reply_truncate:1,
+		rq_committed:1,
+		/* whether the "rq_set" is a valid one */
+		rq_invalid_rqset:1,
+		rq_generation_set:1,
+		/* do not resend request on -EINPROGRESS */
+		rq_no_retry_einprogress:1,
+		/* allow the req to be sent if the import is in recovery
+		 * status */
+		rq_allow_replay:1;
+
+	unsigned int rq_nr_resend;
+
+	enum rq_phase rq_phase; /* one of RQ_PHASE_* */
+	enum rq_phase rq_next_phase; /* one of RQ_PHASE_* to be used next */
+	atomic_t rq_refcount;/* client-side refcount for SENT race,
+				    server-side refcount for multiple replies */
+
+	/** Portal to which this request would be sent */
+	short rq_request_portal;  /* XXX FIXME bug 249 */
+	/** Portal where to wait for reply and where reply would be sent */
+	short rq_reply_portal;    /* XXX FIXME bug 249 */
+
+	/**
+	 * client-side:
+	 * !rq_truncate : # reply bytes actually received,
+	 *  rq_truncate : required repbuf_len for resend
+	 */
+	int rq_nob_received;
+	/** Request length */
+	int rq_reqlen;
+	/** Reply length */
+	int rq_replen;
+	/** Request message - what client sent */
+	struct lustre_msg *rq_reqmsg;
+	/** Reply message - server response */
+	struct lustre_msg *rq_repmsg;
+	/** Transaction number */
+	__u64 rq_transno;
+	/** xid */
+	__u64 rq_xid;
+	/**
+	 * List item for the replay list. Not yet committed requests get linked
+	 * there.
+	 * Also see \a rq_replay comment above.
+	 */
+	struct list_head rq_replay_list;
+
+	/**
+	 * security and encryption data
+	 * @{ */
+	struct ptlrpc_cli_ctx   *rq_cli_ctx;     /**< client's half ctx */
+	struct ptlrpc_svc_ctx   *rq_svc_ctx;     /**< server's half ctx */
+	struct list_head	       rq_ctx_chain;   /**< link to waited ctx */
+
+	struct sptlrpc_flavor    rq_flvr;	/**< for client & server */
+	enum lustre_sec_part     rq_sp_from;
+
+	/* client/server security flags */
+	unsigned int
+				 rq_ctx_init:1,      /* context initiation */
+				 rq_ctx_fini:1,      /* context destroy */
+				 rq_bulk_read:1,     /* request bulk read */
+				 rq_bulk_write:1,    /* request bulk write */
+				 /* server authentication flags */
+				 rq_auth_gss:1,      /* authenticated by gss */
+				 rq_auth_remote:1,   /* authed as remote user */
+				 rq_auth_usr_root:1, /* authed as root */
+				 rq_auth_usr_mdt:1,  /* authed as mdt */
+				 rq_auth_usr_ost:1,  /* authed as ost */
+				 /* security tfm flags */
+				 rq_pack_udesc:1,
+				 rq_pack_bulk:1,
+				 /* doesn't expect reply FIXME */
+				 rq_no_reply:1,
+				 rq_pill_init:1;     /* pill initialized */
+
+	uid_t		    rq_auth_uid;	/* authed uid */
+	uid_t		    rq_auth_mapped_uid; /* authed uid mapped to */
+
+	/* (server side), pointed directly into req buffer */
+	struct ptlrpc_user_desc *rq_user_desc;
+
+	/* various buffer pointers */
+	struct lustre_msg       *rq_reqbuf;      /* req wrapper */
+	char		    *rq_repbuf;      /* rep buffer */
+	struct lustre_msg       *rq_repdata;     /* rep wrapper msg */
+	struct lustre_msg       *rq_clrbuf;      /* only in priv mode */
+	int		      rq_reqbuf_len;  /* req wrapper buf len */
+	int		      rq_reqdata_len; /* req wrapper msg len */
+	int		      rq_repbuf_len;  /* rep buffer len */
+	int		      rq_repdata_len; /* rep wrapper msg len */
+	int		      rq_clrbuf_len;  /* only in priv mode */
+	int		      rq_clrdata_len; /* only in priv mode */
+
+	/** early replies go to offset 0, regular replies go after that */
+	unsigned int	     rq_reply_off;
+
+	/** @} */
+
+	/** Fields that help to see if request and reply were swabbed or not */
+	__u32 rq_req_swab_mask;
+	__u32 rq_rep_swab_mask;
+
+	/** What was import generation when this request was sent */
+	int rq_import_generation;
+	enum lustre_imp_state rq_send_state;
+
+	/** how many early replies (for stats) */
+	int rq_early_count;
+
+	/** client+server request */
+	lnet_handle_md_t     rq_req_md_h;
+	struct ptlrpc_cb_id  rq_req_cbid;
+	/** optional time limit for send attempts */
+	cfs_duration_t       rq_delay_limit;
+	/** time request was first queued */
+	cfs_time_t	   rq_queued_time;
+
+	/* server-side... */
+	/** request arrival time */
+	struct timeval       rq_arrival_time;
+	/** separated reply state */
+	struct ptlrpc_reply_state *rq_reply_state;
+	/** incoming request buffer */
+	struct ptlrpc_request_buffer_desc *rq_rqbd;
+
+	/** client-only incoming reply */
+	lnet_handle_md_t     rq_reply_md_h;
+	wait_queue_head_t	  rq_reply_waitq;
+	struct ptlrpc_cb_id  rq_reply_cbid;
+
+	/** our LNet NID */
+	lnet_nid_t	   rq_self;
+	/** Peer description (the other side) */
+	lnet_process_id_t    rq_peer;
+	/** Server-side, export on which request was received */
+	struct obd_export   *rq_export;
+	/** Client side, import where request is being sent */
+	struct obd_import   *rq_import;
+
+	/** Replay callback, called after request is replayed at recovery */
+	void (*rq_replay_cb)(struct ptlrpc_request *);
+	/**
+	 * Commit callback, called when request is committed and about to be
+	 * freed.
+	 */
+	void (*rq_commit_cb)(struct ptlrpc_request *);
+	/** Opaque data for replay and commit callbacks. */
+	void  *rq_cb_data;
+
+	/** For bulk requests on client only: bulk descriptor */
+	struct ptlrpc_bulk_desc *rq_bulk;
+
+	/** client outgoing req */
+	/**
+	 * when request/reply sent (secs), or time when request should be sent
+	 */
+	time_t rq_sent;
+	/** time for request really sent out */
+	time_t rq_real_sent;
+
+	/** when request must finish. volatile
+	 * so that servers' early reply updates to the deadline aren't
+	 * kept in per-cpu cache */
+	volatile time_t rq_deadline;
+	/** when req reply unlink must finish. */
+	time_t rq_reply_deadline;
+	/** when req bulk unlink must finish. */
+	time_t rq_bulk_deadline;
+	/**
+	 * service time estimate (secs)
+	 * If the request is not served by this time, it is marked as timed out.
+	 */
+	int    rq_timeout;
+
+	/** Multi-rpc bits */
+	/** Per-request waitq introduced by bug 21938 for recovery waiting */
+	wait_queue_head_t rq_set_waitq;
+	/** Link item for request set lists */
+	struct list_head  rq_set_chain;
+	/** Link back to the request set */
+	struct ptlrpc_request_set *rq_set;
+	/** Async completion handler, called when reply is received */
+	ptlrpc_interpterer_t rq_interpret_reply;
+	/** Async completion context */
+	union ptlrpc_async_args rq_async_args;
+
+	/** Pool if request is from preallocated list */
+	struct ptlrpc_request_pool *rq_pool;
+
+	struct lu_context	   rq_session;
+	struct lu_context	   rq_recov_session;
+
+	/** request format description */
+	struct req_capsule	  rq_pill;
+};
+
+/**
+ * Run the async completion handler of \a req, if one is installed.
+ *
+ * When ptlrpc_request::rq_interpret_reply is set, it is called with \a env,
+ * \a req, the address of the request's rq_async_args and \a rc; its result
+ * is stored in ptlrpc_request::rq_status and returned. Without a handler,
+ * \a rc is returned unchanged and rq_status is not touched.
+ */
+static inline int ptlrpc_req_interpret(const struct lu_env *env,
+				       struct ptlrpc_request *req, int rc)
+{
+	/* no handler installed: hand the caller's rc straight back */
+	if (req->rq_interpret_reply == NULL)
+		return rc;
+
+	req->rq_status = req->rq_interpret_reply(env, req, &req->rq_async_args,
+						 rc);
+	return req->rq_status;
+}
+
+/** \addtogroup  nrs
+ * @{
+ */
+int ptlrpc_nrs_policy_register(struct ptlrpc_nrs_pol_conf *conf);
+int ptlrpc_nrs_policy_unregister(struct ptlrpc_nrs_pol_conf *conf);
+void ptlrpc_nrs_req_hp_move(struct ptlrpc_request *req);
+void nrs_policy_get_info_locked(struct ptlrpc_nrs_policy *policy,
+				struct ptlrpc_nrs_pol_info *info);
+
+/*
+ * Tell whether \a req can be moved from the regular NRS head to the
+ * high-priority NRS head (of the same PTLRPC service partition), if any.
+ *
+ * For a reliable result, this should be checked under svcpt->scp_req lock.
+ */
+static inline bool ptlrpc_nrs_req_can_move(struct ptlrpc_request *req)
+{
+	struct ptlrpc_nrs_request *nrq = &req->rq_nrq;
+
+	/**
+	 * LU-898: ptlrpc_nrs_request::nr_enqueued makes sure the request has
+	 * been enqueued first, and ptlrpc_nrs_request::nr_started makes sure
+	 * it has not been scheduled yet (analogous to the previous (non-NRS)
+	 * checking of !list_empty(&ptlrpc_request::rq_list)).
+	 */
+	if (!nrq->nr_enqueued)
+		return false;
+	if (nrq->nr_started)
+		return false;
+
+	/* a request that is already high-priority has nowhere to move to */
+	return !req->rq_hp;
+}
+/** @} nrs */
+
+/**
+ * Test whether the request buffer at offset \a index was already swabbed.
+ *
+ * \param[in] req    request whose rq_req_swab_mask is examined
+ * \param[in] index  buffer offset; asserted to be smaller than the number
+ *		     of bits in rq_req_swab_mask
+ *
+ * \retval 0	     the buffer has not been swabbed
+ * \retval non-zero  the buffer was swabbed (the raw mask bit is returned,
+ *		     which is not necessarily 1)
+ */
+static inline int lustre_req_swabbed(struct ptlrpc_request *req, int index)
+{
+	LASSERT(index < sizeof(req->rq_req_swab_mask) * 8);
+	/* unsigned constant: shifting a signed 1 into bit 31 is undefined */
+	return req->rq_req_swab_mask & (1U << index);
+}
+
+/**
+ * Test whether the reply buffer at offset \a index was already swabbed.
+ *
+ * \param[in] req    request whose rq_rep_swab_mask is examined
+ * \param[in] index  buffer offset; asserted to be smaller than the number
+ *		     of bits in rq_rep_swab_mask
+ *
+ * \retval 0	     the buffer has not been swabbed
+ * \retval non-zero  the buffer was swabbed (the raw mask bit is returned,
+ *		     which is not necessarily 1)
+ */
+static inline int lustre_rep_swabbed(struct ptlrpc_request *req, int index)
+{
+	LASSERT(index < sizeof(req->rq_rep_swab_mask) * 8);
+	/* unsigned constant: shifting a signed 1 into bit 31 is undefined */
+	return req->rq_rep_swab_mask & (1U << index);
+}
+
+/**
+ * Tell whether the request needs to be swabbed into local cpu byteorder.
+ *
+ * The answer is the swab state recorded for the request buffer at
+ * MSG_PTLRPC_HEADER_OFF: non-zero if that buffer was swabbed, 0 otherwise.
+ */
+static inline int ptlrpc_req_need_swab(struct ptlrpc_request *req)
+{
+	int hdr_swabbed = lustre_req_swabbed(req, MSG_PTLRPC_HEADER_OFF);
+
+	return hdr_swabbed;
+}
+
+/**
+ * Tell whether the reply needs to be swabbed into local cpu byteorder.
+ *
+ * The answer is the swab state recorded for the reply buffer at
+ * MSG_PTLRPC_HEADER_OFF: non-zero if that buffer was swabbed, 0 otherwise.
+ */
+static inline int ptlrpc_rep_need_swab(struct ptlrpc_request *req)
+{
+	int hdr_swabbed = lustre_rep_swabbed(req, MSG_PTLRPC_HEADER_OFF);
+
+	return hdr_swabbed;
+}
+
+/**
+ * Record that the request buffer at offset \a index has been swabbed.
+ *
+ * \param[in,out] req	request whose rq_req_swab_mask is updated
+ * \param[in]	  index	buffer offset; asserted to be smaller than the number
+ *			of bits in rq_req_swab_mask and not to be marked yet
+ */
+static inline void lustre_set_req_swabbed(struct ptlrpc_request *req, int index)
+{
+	LASSERT(index < sizeof(req->rq_req_swab_mask) * 8);
+	/* 1U keeps the shift defined for the top bit (index 31) */
+	LASSERT((req->rq_req_swab_mask & (1U << index)) == 0);
+	req->rq_req_swab_mask |= 1U << index;
+}
+
+/**
+ * Record that the reply buffer at offset \a index has been swabbed.
+ *
+ * \param[in,out] req	request whose rq_rep_swab_mask is updated
+ * \param[in]	  index	buffer offset; asserted to be smaller than the number
+ *			of bits in rq_rep_swab_mask and not to be marked yet
+ */
+static inline void lustre_set_rep_swabbed(struct ptlrpc_request *req, int index)
+{
+	LASSERT(index < sizeof(req->rq_rep_swab_mask) * 8);
+	/* 1U keeps the shift defined for the top bit (index 31) */
+	LASSERT((req->rq_rep_swab_mask & (1U << index)) == 0);
+	req->rq_rep_swab_mask |= 1U << index;
+}
+
+/**
+ * Map the request phase value \a phase to a short text description.
+ * Values without a dedicated name yield "?Phase?".
+ */
+static inline const char *
+ptlrpc_phase2str(enum rq_phase phase)
+{
+	const char *name;
+
+	switch (phase) {
+	case RQ_PHASE_NEW:
+		name = "New";
+		break;
+	case RQ_PHASE_RPC:
+		name = "Rpc";
+		break;
+	case RQ_PHASE_BULK:
+		name = "Bulk";
+		break;
+	case RQ_PHASE_INTERPRET:
+		name = "Interpret";
+		break;
+	case RQ_PHASE_COMPLETE:
+		name = "Complete";
+		break;
+	case RQ_PHASE_UNREGISTERING:
+		name = "Unregistering";
+		break;
+	default:
+		name = "?Phase?";
+		break;
+	}
+
+	return name;
+}
+
+/**
+ * Convert the current phase of the request \a req (its rq_phase field) into
+ * a text string description, using ptlrpc_phase2str().
+ */
+static inline const char *
+ptlrpc_rqphase2str(struct ptlrpc_request *req)
+{
+	enum rq_phase phase = req->rq_phase;
+
+	return ptlrpc_phase2str(phase);
+}
+
+/**
+ * Debugging functions and helpers to print request structure into debug log
+ * @{
+ */
+/*
+ * Spare the preprocessor, spoil the bugs.
+ *
+ * Expands to \a str when \a field is non-zero and to "" otherwise. Both
+ * arguments are parenthesized so that any expression can be passed without
+ * precedence surprises.
+ */
+#define FLAG(field, str) ((field) ? (str) : "")
+
+/**
+ * Convert bit flags into a string.
+ *
+ * Expands to a comma-separated argument list: the phase name of \a req
+ * (ptlrpc_rqphase2str()) followed by one FLAG() string per request flag,
+ * i.e. the flag letter when the flag is set and "" when it is not. Intended
+ * to supply the arguments for REQ_FLAGS_FMT.
+ *
+ * \a req is evaluated once per flag, so it must not have side effects.
+ */
+#define DEBUG_REQ_FLAGS(req)						\
+	ptlrpc_rqphase2str(req),					\
+	FLAG((req)->rq_intr, "I"), FLAG((req)->rq_replied, "R"),	\
+	FLAG((req)->rq_err, "E"),					\
+	FLAG((req)->rq_timedout, "X") /* eXpired */,			\
+	FLAG((req)->rq_resend, "S"),					\
+	FLAG((req)->rq_restart, "T"), FLAG((req)->rq_replay, "P"),	\
+	FLAG((req)->rq_no_resend, "N"),					\
+	FLAG((req)->rq_waiting, "W"),					\
+	FLAG((req)->rq_wait_ctx, "C"), FLAG((req)->rq_hp, "H"),		\
+	FLAG((req)->rq_committed, "M")
+
+#define REQ_FLAGS_FMT "%s:%s%s%s%s%s%s%s%s%s%s%s%s"
+
+void _debug_req(struct ptlrpc_request *req,
+		struct libcfs_debug_msg_data *data, const char *fmt, ...)
+	__attribute__ ((format (printf, 3, 4)));
+
+/**
+ * Helper that decides if we need to print request according to current debug
+ * level settings.
+ *
+ * After CFS_CHECK_STACK(), the request is handed to _debug_req() when
+ * \a mask contains any D_CANTMASK bit, or when \a mask is enabled in
+ * libcfs_debug and DEBUG_SUBSYSTEM is enabled in libcfs_subsystem_debug.
+ *
+ * \a mask is expanded several times, so it must not have side effects.
+ */
+#define debug_req(msgdata, mask, cdls, req, fmt, a...)			\
+do {									\
+	CFS_CHECK_STACK(msgdata, mask, cdls);				\
+									\
+	if (((mask) & D_CANTMASK) != 0 ||				\
+	    ((libcfs_debug & (mask)) != 0 &&				\
+	     (libcfs_subsystem_debug & DEBUG_SUBSYSTEM) != 0))		\
+		_debug_req((req), msgdata, fmt, ##a);			\
+} while (0)
+
+/**
+ * This is the debug print function you need to use to print request structure
+ * content into lustre debug log.
+ * For most callers (level is a constant) this is resolved at compile time.
+ *
+ * When \a level contains D_ERROR or D_WARNING, a function-static
+ * cfs_debug_limit_state_t is passed along with the message data; otherwise
+ * NULL is passed in its place. In both cases \a fmt is wrapped as
+ * "@@@ " fmt " " before being handed to debug_req().
+ */
+#define DEBUG_REQ(level, req, fmt, args...)				   \
+do {									  \
+	if ((level) & (D_ERROR | D_WARNING)) {				\
+		static cfs_debug_limit_state_t cdls;			  \
+		LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, &cdls);	    \
+		debug_req(&msgdata, level, &cdls, req, "@@@ "fmt" ", ## args);\
+	} else {							      \
+		LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, NULL);	     \
+		debug_req(&msgdata, level, NULL, req, "@@@ "fmt" ", ## args); \
+	}								     \
+} while (0)
+/** @} */
+
+/**
+ * Structure that defines a single page of a bulk transfer
+ */
+struct ptlrpc_bulk_page {
+	/** Linkage to list of pages in a bulk */
+	struct list_head       bp_link;
+	/**
+	 * Number of bytes in a page to transfer starting from \a bp_pageoffset
+	 */
+	int	      bp_buflen;
+	/** offset within a page */
+	int	      bp_pageoffset;
+	/** The page itself */
+	struct page     *bp_page;
+};
+
+/*
+ * Bulk transfer roles: one value for each {put,get} x {source,sink}
+ * combination.
+ * NOTE(review): presumably these are the values stored in
+ * ptlrpc_bulk_desc::bd_type (a 2-bit field) - confirm against the users.
+ */
+#define BULK_GET_SOURCE   0
+#define BULK_PUT_SINK     1
+#define BULK_GET_SINK     2
+#define BULK_PUT_SOURCE   3
+
+/**
+ * Definition of bulk descriptor.
+ * Bulks are special "two phase" RPCs where the initial request message
+ * is sent first and is followed by a transfer (or receipt) of a large
+ * amount of data to be settled into pages referenced from the bulk descriptors.
+ * Bulks transfers (the actual data following the small requests) are done
+ * on separate LNet portals.
+ * In lustre we use bulk transfers for READ and WRITE transfers from/to OSTs.
+ *  Another user is readpage for MDT.
+ */
+struct ptlrpc_bulk_desc {
+	/** completed with failure */
+	unsigned long bd_failure:1;
+	/** {put,get}{source,sink} */
+	unsigned long bd_type:2;
+	/** client side */
+	unsigned long bd_registered:1;
+	/** For serialization with callback */
+	spinlock_t bd_lock;
+	/** Import generation when request for this bulk was sent */
+	int bd_import_generation;
+	/** LNet portal for this bulk */
+	__u32 bd_portal;
+	/** Server side - export this bulk created for */
+	struct obd_export *bd_export;
+	/** Client side - import this bulk was sent on */
+	struct obd_import *bd_import;
+	/** Back pointer to the request */
+	struct ptlrpc_request *bd_req;
+	wait_queue_head_t	    bd_waitq;	/* server side only WQ */
+	int		    bd_iov_count;    /* # entries in bd_iov */
+	int		    bd_max_iov;      /* allocated size of bd_iov */
+	int		    bd_nob;	  /* # bytes covered */
+	int		    bd_nob_transferred; /* # bytes GOT/PUT */
+
+	__u64		  bd_last_xid;
+
+	struct ptlrpc_cb_id    bd_cbid;	 /* network callback info */
+	lnet_nid_t	     bd_sender;       /* stash event::sender */
+	int			bd_md_count;	/* # valid entries in bd_mds */
+	int			bd_md_max_brw;	/* max entries in bd_mds */
+	/** array of associated MDs */
+	lnet_handle_md_t	bd_mds[PTLRPC_BULK_OPS_COUNT];
+
+	/*
+	 * encrypt iov, size is either 0 or bd_iov_count.
+	 */
+	lnet_kiov_t	   *bd_enc_iov;
+
+	lnet_kiov_t	    bd_iov[0];
+};
+
+/*
+ * Bit flags kept in ptlrpc_thread::t_flags. They are tested through the
+ * thread_is_*() helpers and changed through thread_set_flags(),
+ * thread_add_flags(), thread_clear_flags() and
+ * thread_test_and_clear_flags().
+ */
+enum {
+	SVC_STOPPED     = 1 << 0,
+	SVC_STOPPING    = 1 << 1,
+	SVC_STARTING    = 1 << 2,
+	SVC_RUNNING     = 1 << 3,
+	SVC_EVENT       = 1 << 4,
+	SVC_SIGNAL      = 1 << 5,
+};
+
+#define PTLRPC_THR_NAME_LEN		32
+/**
+ * Definition of server service thread structure
+ */
+struct ptlrpc_thread {
+	/**
+	 * List of active threads in svc->srv_threads
+	 */
+	struct list_head t_link;
+	/**
+	 * thread-private data (preallocated memory)
+	 */
+	void *t_data;
+	/**
+	 * SVC_* state bits; 0 is what thread_is_init() tests for
+	 */
+	__u32 t_flags;
+	/**
+	 * service thread index, from ptlrpc_start_threads
+	 */
+	unsigned int t_id;
+	/**
+	 * service thread pid
+	 */
+	pid_t t_pid;
+	/**
+	 * put watchdog in the structure per thread b=14840
+	 */
+	struct lc_watchdog *t_watchdog;
+	/**
+	 * the svc this thread belonged to b=18582
+	 */
+	struct ptlrpc_service_part	*t_svcpt;
+	wait_queue_head_t			t_ctl_waitq;
+	struct lu_env			*t_env;
+	/**
+	 * thread name buffer, PTLRPC_THR_NAME_LEN bytes
+	 */
+	char				t_name[PTLRPC_THR_NAME_LEN];
+};
+
+/** 1 if no flag at all is set in \a thread's t_flags, 0 otherwise. */
+static inline int thread_is_init(struct ptlrpc_thread *thread)
+{
+	return !thread->t_flags;
+}
+
+/** 1 if SVC_STOPPED is set in \a thread's t_flags, 0 otherwise. */
+static inline int thread_is_stopped(struct ptlrpc_thread *thread)
+{
+	return (thread->t_flags & SVC_STOPPED) != 0;
+}
+
+/** 1 if SVC_STOPPING is set in \a thread's t_flags, 0 otherwise. */
+static inline int thread_is_stopping(struct ptlrpc_thread *thread)
+{
+	return (thread->t_flags & SVC_STOPPING) != 0;
+}
+
+/** 1 if SVC_STARTING is set in \a thread's t_flags, 0 otherwise. */
+static inline int thread_is_starting(struct ptlrpc_thread *thread)
+{
+	return (thread->t_flags & SVC_STARTING) != 0;
+}
+
+/** 1 if SVC_RUNNING is set in \a thread's t_flags, 0 otherwise. */
+static inline int thread_is_running(struct ptlrpc_thread *thread)
+{
+	return (thread->t_flags & SVC_RUNNING) != 0;
+}
+
+/** 1 if SVC_EVENT is set in \a thread's t_flags, 0 otherwise. */
+static inline int thread_is_event(struct ptlrpc_thread *thread)
+{
+	return (thread->t_flags & SVC_EVENT) != 0;
+}
+
+/** 1 if SVC_SIGNAL is set in \a thread's t_flags, 0 otherwise. */
+static inline int thread_is_signal(struct ptlrpc_thread *thread)
+{
+	return (thread->t_flags & SVC_SIGNAL) != 0;
+}
+
+/** Clear every bit of \a flags in \a thread's t_flags; other bits are kept. */
+static inline void thread_clear_flags(struct ptlrpc_thread *thread, __u32 flags)
+{
+	thread->t_flags = thread->t_flags & ~flags;
+}
+
+/**
+ * Overwrite \a thread's t_flags with \a flags. Bits that were set before
+ * and are not part of \a flags are dropped (contrast with
+ * thread_add_flags(), which ORs the new bits in).
+ */
+static inline void thread_set_flags(struct ptlrpc_thread *thread, __u32 flags)
+{
+	thread->t_flags = flags;
+}
+
+/** Set every bit of \a flags in \a thread's t_flags; other bits are kept. */
+static inline void thread_add_flags(struct ptlrpc_thread *thread, __u32 flags)
+{
+	thread->t_flags = thread->t_flags | flags;
+}
+
+/**
+ * Clear the bits of \a flags in \a thread's t_flags and report whether any
+ * of them had been set.
+ *
+ * \retval 1 at least one bit of \a flags was set; all of them are now clear
+ * \retval 0 none of the bits was set; t_flags is left unchanged
+ */
+static inline int thread_test_and_clear_flags(struct ptlrpc_thread *thread,
+					      __u32 flags)
+{
+	if (!(thread->t_flags & flags))
+		return 0;
+
+	thread->t_flags &= ~flags;
+	return 1;
+}
+
+/**
+ * Request buffer descriptor structure.
+ * This is a structure that contains one posted request buffer for service.
+ * Once data lands in a buffer, the event callback creates the actual request
+ * and wakes one of the service threads to process the new incoming request.
+ * More than one request can fit into the buffer.
+ */
+struct ptlrpc_request_buffer_desc {
+	/** Link item for rqbds on a service */
+	struct list_head	     rqbd_list;
+	/** History of requests for this buffer */
+	struct list_head	     rqbd_reqs;
+	/** Back pointer to service for which this buffer is registered */
+	struct ptlrpc_service_part *rqbd_svcpt;
+	/** LNet descriptor */
+	lnet_handle_md_t       rqbd_md_h;
+	int		    rqbd_refcount;
+	/** The buffer itself */
+	char		  *rqbd_buffer;
+	struct ptlrpc_cb_id    rqbd_cbid;
+	/**
+	 * This "embedded" request structure is only used for the
+	 * last request to fit into the buffer
+	 */
+	struct ptlrpc_request  rqbd_req;
+};
+
+typedef int  (*svc_handler_t)(struct ptlrpc_request *req);
+
+struct ptlrpc_service_ops {
+	/**
+	 * if non-NULL called during thread creation (ptlrpc_start_thread())
+	 * to initialize service specific per-thread state.
+	 */
+	int		(*so_thr_init)(struct ptlrpc_thread *thr);
+	/**
+	 * if non-NULL called during thread shutdown (ptlrpc_main()) to
+	 * destruct state created by ->so_thr_init().
+	 */
+	void		(*so_thr_done)(struct ptlrpc_thread *thr);
+	/**
+	 * Handler function for incoming requests for this service
+	 */
+	int		(*so_req_handler)(struct ptlrpc_request *req);
+	/**
+	 * function to determine priority of the request, it's called
+	 * on every new request
+	 */
+	int		(*so_hpreq_handler)(struct ptlrpc_request *);
+	/**
+	 * service-specific print fn
+	 */
+	void		(*so_req_printer)(void *, struct ptlrpc_request *);
+};
+
+#ifndef __cfs_cacheline_aligned
+/* NB: defined here to reduce patch dependencies */
+# define __cfs_cacheline_aligned
+#endif
+
+/**
+ * How many high priority requests to serve before serving one normal
+ * priority request
+ */
+#define PTLRPC_SVC_HP_RATIO 10
+
+/**
+ * Definition of PortalRPC service.
+ * The service is listening on a particular portal (like tcp port)
+ * and perform actions for a specific server like IO service for OST
+ * or general metadata service for MDS.
+ */
+struct ptlrpc_service {
+	/** serialize /proc operations */
+	spinlock_t			srv_lock;
+	/** most often accessed fields */
+	/** chain thru all services */
+	struct list_head		      srv_list;
+	/** service operations table */
+	struct ptlrpc_service_ops	srv_ops;
+	/** only statically allocated strings here; we don't clean them */
+	char			   *srv_name;
+	/** only statically allocated strings here; we don't clean them */
+	char			   *srv_thread_name;
+	/** service thread list */
+	struct list_head		      srv_threads;
+	/** threads # should be created for each partition on initializing */
+	int				srv_nthrs_cpt_init;
+	/** limit of threads number for each partition */
+	int				srv_nthrs_cpt_limit;
+	/** Root of /proc dir tree for this service */
+	proc_dir_entry_t	   *srv_procroot;
+	/** Pointer to statistic data for this service */
+	struct lprocfs_stats	   *srv_stats;
+	/** # hp per lp reqs to handle */
+	int			     srv_hpreq_ratio;
+	/** biggest request to receive */
+	int			     srv_max_req_size;
+	/** biggest reply to send */
+	int			     srv_max_reply_size;
+	/** size of individual buffers */
+	int			     srv_buf_size;
+	/** # buffers to allocate in 1 group */
+	int			     srv_nbuf_per_group;
+	/** Local portal on which to receive requests */
+	__u32			   srv_req_portal;
+	/** Portal on the client to send replies to */
+	__u32			   srv_rep_portal;
+	/**
+	 * Tags for lu_context associated with this thread, see struct
+	 * lu_context.
+	 */
+	__u32			   srv_ctx_tags;
+	/** soft watchdog timeout multiplier */
+	int			     srv_watchdog_factor;
+	/** under unregister_service */
+	unsigned			srv_is_stopping:1;
+
+	/** max # request buffers in history per partition */
+	int				srv_hist_nrqbds_cpt_max;
+	/** number of CPTs this service bound on */
+	int				srv_ncpts;
+	/** CPTs array this service bound on */
+	__u32				*srv_cpts;
+	/** 2^srv_cpt_bits >= cfs_cpt_number(srv_cptable) */
+	int				srv_cpt_bits;
+	/** CPT table this service is running over */
+	struct cfs_cpt_table		*srv_cptable;
+	/**
+	 * partition data for ptlrpc service
+	 */
+	struct ptlrpc_service_part	*srv_parts[0];
+};
+
+/**
+ * Definition of PortalRPC service partition data.
+ * Although a service only has one instance of it right now, we
+ * will have multiple instances very soon (instance per CPT).
+ *
+ * it has four locks:
+ * \a scp_lock
+ *    serialize operations on rqbd and requests waiting for preprocess
+ * \a scp_req_lock
+ *    serialize operations active requests sent to this portal
+ * \a scp_at_lock
+ *    serialize adaptive timeout stuff
+ * \a scp_rep_lock
+ *    serialize operations on RS list (reply states)
+ *
+ * We don't have any use-case to take two or more locks at the same time
+ * for now, so there is no lock order issue.
+ */
+struct ptlrpc_service_part {
+	/** back reference to owner */
+	struct ptlrpc_service		*scp_service __cfs_cacheline_aligned;
+	/* CPT id, reserved */
+	int				scp_cpt;
+	/** always increasing number */
+	int				scp_thr_nextid;
+	/** # of starting threads */
+	int				scp_nthrs_starting;
+	/** # of stopping threads, reserved for shrinking threads */
+	int				scp_nthrs_stopping;
+	/** # running threads */
+	int				scp_nthrs_running;
+	/** service threads list */
+	struct list_head			scp_threads;
+
+	/**
+	 * serialize the following fields, used for protecting
+	 * rqbd list and incoming requests waiting for preprocess,
+	 * threads starting & stopping are also protected by this lock.
+	 */
+	spinlock_t			scp_lock  __cfs_cacheline_aligned;
+	/** total # req buffer descs allocated */
+	int				scp_nrqbds_total;
+	/** # posted request buffers for receiving */
+	int				scp_nrqbds_posted;
+	/** in progress of allocating rqbd */
+	int				scp_rqbd_allocating;
+	/** # incoming reqs */
+	int				scp_nreqs_incoming;
+	/** request buffers to be reposted */
+	struct list_head			scp_rqbd_idle;
+	/** req buffers receiving */
+	struct list_head			scp_rqbd_posted;
+	/** incoming reqs */
+	struct list_head			scp_req_incoming;
+	/** timeout before re-posting reqs, in tick */
+	cfs_duration_t			scp_rqbd_timeout;
+	/**
+	 * all threads sleep on this. This wait-queue is signalled when new
+	 * incoming request arrives and when difficult reply has to be handled.
+	 */
+	wait_queue_head_t			scp_waitq;
+
+	/** request history */
+	struct list_head			scp_hist_reqs;
+	/** request buffer history */
+	struct list_head			scp_hist_rqbds;
+	/** # request buffers in history */
+	int				scp_hist_nrqbds;
+	/** sequence number for request */
+	__u64				scp_hist_seq;
+	/** highest seq culled from history */
+	__u64				scp_hist_seq_culled;
+
+	/**
+	 * serialize the following fields, used for processing requests
+	 * sent to this portal
+	 */
+	spinlock_t			scp_req_lock __cfs_cacheline_aligned;
+	/** # reqs in either of the NRS heads below */
+	/** # reqs being served */
+	int				scp_nreqs_active;
+	/** # HPreqs being served */
+	int				scp_nhreqs_active;
+	/** # hp requests handled */
+	int				scp_hreq_count;
+
+	/** NRS head for regular requests */
+	struct ptlrpc_nrs		scp_nrs_reg;
+	/** NRS head for HP requests; this is only valid for services that can
+	 *  handle HP requests */
+	struct ptlrpc_nrs	       *scp_nrs_hp;
+
+	/** AT stuff */
+	/** @{ */
+	/**
+	 * serialize the following fields, used for changes on
+	 * adaptive timeout
+	 */
+	spinlock_t			scp_at_lock __cfs_cacheline_aligned;
+	/** estimated rpc service time */
+	struct adaptive_timeout		scp_at_estimate;
+	/** reqs waiting for replies */
+	struct ptlrpc_at_array		scp_at_array;
+	/** early reply timer */
+	timer_list_t			scp_at_timer;
+	/** debug */
+	cfs_time_t			scp_at_checktime;
+	/** check early replies */
+	unsigned			scp_at_check;
+	/** @} */
+
+	/**
+	 * serialize the following fields, used for processing
+	 * replies for this portal
+	 */
+	spinlock_t			scp_rep_lock __cfs_cacheline_aligned;
+	/** all the active replies */
+	struct list_head			scp_rep_active;
+	/** List of free reply_states */
+	struct list_head			scp_rep_idle;
+	/** waitq to run, when adding stuff to srv_free_rs_list */
+	wait_queue_head_t			scp_rep_waitq;
+	/** # 'difficult' replies */
+	atomic_t			scp_nreps_difficult;
+};
+
+/**
+ * Iterate over the partitions of the service \a svc.
+ *
+ * \a i is used as the loop counter and \a part receives srv_parts[i] on
+ * each pass. The loop ends after srv_ncpts entries or at the first NULL
+ * entry of srv_parts, whichever comes first.
+ *
+ * \a i and \a svc are expanded several times, so they must not have side
+ * effects; \a i is parenthesized so that any lvalue expression works.
+ */
+#define ptlrpc_service_for_each_part(part, i, svc)			\
+	for ((i) = 0;							\
+	     (i) < (svc)->srv_ncpts &&					\
+	     (svc)->srv_parts != NULL &&				\
+	     ((part) = (svc)->srv_parts[(i)]) != NULL; (i)++)
+
+/**
+ * Declaration of ptlrpcd control structure
+ */
+struct ptlrpcd_ctl {
+	/**
+	 * Ptlrpc thread control flags (LIOD_START, LIOD_STOP, LIOD_FORCE)
+	 */
+	unsigned long			pc_flags;
+	/**
+	 * Thread lock protecting structure fields.
+	 */
+	spinlock_t			pc_lock;
+	/**
+	 * Start completion.
+	 */
+	struct completion		pc_starting;
+	/**
+	 * Stop completion.
+	 */
+	struct completion		pc_finishing;
+	/**
+	 * Thread requests set.
+	 */
+	struct ptlrpc_request_set  *pc_set;
+	/**
+	 * Thread name used in cfs_daemonize()
+	 */
+	char			pc_name[16];
+	/**
+	 * Environment for request interpreters to run in.
+	 */
+	struct lu_env	       pc_env;
+	/**
+	 * Index of ptlrpcd thread in the array.
+	 */
+	int			 pc_index;
+	/**
+	 * Number of the ptlrpcd's partners.
+	 */
+	int			 pc_npartners;
+	/**
+	 * Pointer to the array of partners' ptlrpcd_ctl structure.
+	 */
+	struct ptlrpcd_ctl	**pc_partners;
+	/**
+	 * Record the partner index to be processed next.
+	 */
+	int			 pc_cursor;
+};
+
+/* Bits for pc_flags */
+enum ptlrpcd_ctl_flags {
+	/**
+	 * Ptlrpc thread start flag.
+	 */
+	LIOD_START       = 1 << 0,
+	/**
+	 * Ptlrpc thread stop flag.
+	 */
+	LIOD_STOP	= 1 << 1,
+	/**
+	 * Ptlrpc thread force flag (only stop force so far).
+	 * This will cause aborting any inflight rpcs handled
+	 * by thread if LIOD_STOP is specified.
+	 */
+	LIOD_FORCE       = 1 << 2,
+	/**
+	 * This is a recovery ptlrpc thread.
+	 */
+	LIOD_RECOVERY    = 1 << 3,
+	/**
+	 * The ptlrpcd is bound to some CPU core.
+	 */
+	LIOD_BIND	= 1 << 4,
+};
+
+/**
+ * \addtogroup nrs
+ * @{
+ *
+ * Service compatibility function; the policy is compatible with all services.
+ *
+ * \param[in] svc  The service the policy is attempting to register with.
+ * \param[in] desc The policy descriptor
+ *
+ * \retval true The policy is compatible with the service
+ *
+ * \see ptlrpc_nrs_pol_desc::pd_compat()
+ */
+static inline bool nrs_policy_compat_all(const struct ptlrpc_service *svc,
+					 const struct ptlrpc_nrs_pol_desc *desc)
+{
+	/* neither svc nor desc is inspected: every service is accepted */
+	return true;
+}
+
+/**
+ * Service compatibility function for policies that are tied to exactly one
+ * service. The service is identified by its human-readable name: \a svc is
+ * accepted only when ptlrpc_service::srv_name equals
+ * ptlrpc_nrs_pol_desc::pd_compat_svc_name, which is asserted to be set.
+ *
+ * \param[in] svc  The service the policy is attempting to register with.
+ * \param[in] desc The policy descriptor
+ *
+ * \retval false The policy is not compatible with the service
+ * \retval true	 The policy is compatible with the service
+ *
+ * \see ptlrpc_nrs_pol_desc::pd_compat()
+ */
+static inline bool nrs_policy_compat_one(const struct ptlrpc_service *svc,
+					 const struct ptlrpc_nrs_pol_desc *desc)
+{
+	int diff;
+
+	LASSERT(desc->pd_compat_svc_name != NULL);
+	diff = strcmp(svc->srv_name, desc->pd_compat_svc_name);
+
+	return diff == 0;
+}
+
+/** @} nrs */
+
+/* ptlrpc/events.c */
+extern lnet_handle_eq_t ptlrpc_eq_h;
+extern int ptlrpc_uuid_to_peer(struct obd_uuid *uuid,
+			       lnet_process_id_t *peer, lnet_nid_t *self);
+/**
+ * These callbacks are invoked by LNet when something happened to
+ * underlying buffer
+ * @{
+ */
+extern void request_out_callback(lnet_event_t *ev);
+extern void reply_in_callback(lnet_event_t *ev);
+extern void client_bulk_callback(lnet_event_t *ev);
+extern void request_in_callback(lnet_event_t *ev);
+extern void reply_out_callback(lnet_event_t *ev);
+/** @} */
+
+/* ptlrpc/connection.c */
+struct ptlrpc_connection *ptlrpc_connection_get(lnet_process_id_t peer,
+						lnet_nid_t self,
+						struct obd_uuid *uuid);
+int ptlrpc_connection_put(struct ptlrpc_connection *c);
+struct ptlrpc_connection *ptlrpc_connection_addref(struct ptlrpc_connection *);
+int ptlrpc_connection_init(void);
+void ptlrpc_connection_fini(void);
+extern lnet_pid_t ptl_get_pid(void);
+
+/* ptlrpc/niobuf.c */
+/**
+ * Actual interfacing with LNet to put/get/register/unregister stuff
+ * @{
+ */
+
+int ptlrpc_register_bulk(struct ptlrpc_request *req);
+int ptlrpc_unregister_bulk(struct ptlrpc_request *req, int async);
+
+/**
+ * Report whether the bulk transfer attached to \a req is still active.
+ *
+ * \retval 1	      the OBD_FAIL_PTLRPC_LONG_BULK_UNLINK fail check fired and
+ *		      rq_bulk_deadline is still in the future
+ * \retval 0	      the request has no bulk descriptor
+ * \retval bd_md_count of the bulk descriptor otherwise, sampled under
+ *		      bd_lock (0 when no MD is left)
+ */
+static inline int ptlrpc_client_bulk_active(struct ptlrpc_request *req)
+{
+	struct ptlrpc_bulk_desc *bulk;
+	int			 md_count;
+
+	LASSERT(req != NULL);
+	bulk = req->rq_bulk;
+
+	/* fault injection: keep reporting "active" until the bulk deadline */
+	if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK) &&
+	    cfs_time_current_sec() < req->rq_bulk_deadline)
+		return 1;
+
+	if (bulk == NULL)
+		return 0;
+
+	spin_lock(&bulk->bd_lock);
+	md_count = bulk->bd_md_count;
+	spin_unlock(&bulk->bd_lock);
+
+	return md_count;
+}
+
+#define PTLRPC_REPLY_MAYBE_DIFFICULT 0x01
+#define PTLRPC_REPLY_EARLY	   0x02
+int ptlrpc_send_reply(struct ptlrpc_request *req, int flags);
+int ptlrpc_reply(struct ptlrpc_request *req);
+int ptlrpc_send_error(struct ptlrpc_request *req, int difficult);
+int ptlrpc_error(struct ptlrpc_request *req);
+void ptlrpc_resend_req(struct ptlrpc_request *request);
+int ptlrpc_at_get_net_latency(struct ptlrpc_request *req);
+int ptl_send_rpc(struct ptlrpc_request *request, int noreply);
+int ptlrpc_register_rqbd(struct ptlrpc_request_buffer_desc *rqbd);
+/** @} */
+
+/* ptlrpc/client.c */
+/**
+ * Client-side portals API. Everything to send requests, receive replies,
+ * request queues, request management, etc.
+ * @{
+ */
+void ptlrpc_init_client(int req_portal, int rep_portal, char *name,
+			struct ptlrpc_client *);
+void ptlrpc_cleanup_client(struct obd_import *imp);
+struct ptlrpc_connection *ptlrpc_uuid_to_connection(struct obd_uuid *uuid);
+
+int ptlrpc_queue_wait(struct ptlrpc_request *req);
+int ptlrpc_replay_req(struct ptlrpc_request *req);
+int ptlrpc_unregister_reply(struct ptlrpc_request *req, int async);
+void ptlrpc_restart_req(struct ptlrpc_request *req);
+void ptlrpc_abort_inflight(struct obd_import *imp);
+void ptlrpc_cleanup_imp(struct obd_import *imp);
+void ptlrpc_abort_set(struct ptlrpc_request_set *set);
+
+struct ptlrpc_request_set *ptlrpc_prep_set(void);
+struct ptlrpc_request_set *ptlrpc_prep_fcset(int max, set_producer_func func,
+					     void *arg);
+int ptlrpc_set_add_cb(struct ptlrpc_request_set *set,
+		      set_interpreter_func fn, void *data);
+int ptlrpc_set_next_timeout(struct ptlrpc_request_set *);
+int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set);
+int ptlrpc_set_wait(struct ptlrpc_request_set *);
+int ptlrpc_expired_set(void *data);
+void ptlrpc_interrupted_set(void *data);
+void ptlrpc_mark_interrupted(struct ptlrpc_request *req);
+void ptlrpc_set_destroy(struct ptlrpc_request_set *);
+void ptlrpc_set_add_req(struct ptlrpc_request_set *, struct ptlrpc_request *);
+void ptlrpc_set_add_new_req(struct ptlrpcd_ctl *pc,
+			    struct ptlrpc_request *req);
+
+void ptlrpc_free_rq_pool(struct ptlrpc_request_pool *pool);
+void ptlrpc_add_rqs_to_pool(struct ptlrpc_request_pool *pool, int num_rq);
+
+struct ptlrpc_request_pool *
+ptlrpc_init_rq_pool(int, int,
+		    void (*populate_pool)(struct ptlrpc_request_pool *, int));
+
+void ptlrpc_at_set_req_timeout(struct ptlrpc_request *req);
+struct ptlrpc_request *ptlrpc_request_alloc(struct obd_import *imp,
+					    const struct req_format *format);
+struct ptlrpc_request *ptlrpc_request_alloc_pool(struct obd_import *imp,
+					    struct ptlrpc_request_pool *,
+					    const struct req_format *format);
+void ptlrpc_request_free(struct ptlrpc_request *request);
+int ptlrpc_request_pack(struct ptlrpc_request *request,
+			__u32 version, int opcode);
+struct ptlrpc_request *ptlrpc_request_alloc_pack(struct obd_import *imp,
+						const struct req_format *format,
+						__u32 version, int opcode);
+int ptlrpc_request_bufs_pack(struct ptlrpc_request *request,
+			     __u32 version, int opcode, char **bufs,
+			     struct ptlrpc_cli_ctx *ctx);
+struct ptlrpc_request *ptlrpc_prep_req(struct obd_import *imp, __u32 version,
+				       int opcode, int count, __u32 *lengths,
+				       char **bufs);
+struct ptlrpc_request *ptlrpc_prep_req_pool(struct obd_import *imp,
+					     __u32 version, int opcode,
+					    int count, __u32 *lengths, char **bufs,
+					    struct ptlrpc_request_pool *pool);
+void ptlrpc_req_finished(struct ptlrpc_request *request);
+void ptlrpc_req_finished_with_imp_lock(struct ptlrpc_request *request);
+struct ptlrpc_request *ptlrpc_request_addref(struct ptlrpc_request *req);
+struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_imp(struct ptlrpc_request *req,
+					      unsigned npages, unsigned max_brw,
+					      unsigned type, unsigned portal);
+void __ptlrpc_free_bulk(struct ptlrpc_bulk_desc *bulk, int pin);
+/** Free \a bulk through __ptlrpc_free_bulk() with its pin argument set to 1. */
+static inline void ptlrpc_free_bulk_pin(struct ptlrpc_bulk_desc *bulk)
+{
+	const int pin = 1;
+
+	__ptlrpc_free_bulk(bulk, pin);
+}
+/** Free \a bulk through __ptlrpc_free_bulk() with its pin argument set to 0. */
+static inline void ptlrpc_free_bulk_nopin(struct ptlrpc_bulk_desc *bulk)
+{
+	const int pin = 0;
+
+	__ptlrpc_free_bulk(bulk, pin);
+}
+void __ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc,
+			     struct page *page, int pageoffset, int len, int);
+/**
+ * Hand \a page, \a pageoffset and \a len for \a desc on to
+ * __ptlrpc_prep_bulk_page() with its last argument set to 1.
+ * NOTE(review): going by this wrapper's name the last argument is a "pin"
+ * switch - confirm against the definition of __ptlrpc_prep_bulk_page().
+ */
+static inline void ptlrpc_prep_bulk_page_pin(struct ptlrpc_bulk_desc *desc,
+					     struct page *page, int pageoffset,
+					     int len)
+{
+	const int pin = 1;
+
+	__ptlrpc_prep_bulk_page(desc, page, pageoffset, len, pin);
+}
+
+/**
+ * Hand \a page, \a pageoffset and \a len for \a desc on to
+ * __ptlrpc_prep_bulk_page() with its last argument set to 0.
+ * NOTE(review): going by this wrapper's name the last argument is a "pin"
+ * switch - confirm against the definition of __ptlrpc_prep_bulk_page().
+ */
+static inline void ptlrpc_prep_bulk_page_nopin(struct ptlrpc_bulk_desc *desc,
+					       struct page *page, int pageoffset,
+					       int len)
+{
+	const int pin = 0;
+
+	__ptlrpc_prep_bulk_page(desc, page, pageoffset, len, pin);
+}
+
+void ptlrpc_retain_replayable_request(struct ptlrpc_request *req,
+				      struct obd_import *imp);
+__u64 ptlrpc_next_xid(void);
+__u64 ptlrpc_sample_next_xid(void);
+__u64 ptlrpc_req_xid(struct ptlrpc_request *request);
+
+/* Set of routines to run a function in ptlrpcd context */
+void *ptlrpcd_alloc_work(struct obd_import *imp,
+			 int (*cb)(const struct lu_env *, void *), void *data);
+void ptlrpcd_destroy_work(void *handler);
+int ptlrpcd_queue_work(void *handler);
+
+/** @} */
+struct ptlrpc_service_buf_conf {
+	/* nbufs is buffers # to allocate when growing the pool */
+	unsigned int			bc_nbufs;
+	/* buffer size to post */
+	unsigned int			bc_buf_size;
+	/* portal to listen for requests on */
+	unsigned int			bc_req_portal;
+	/* portal of where to send replies to */
+	unsigned int			bc_rep_portal;
+	/* maximum request size to be accepted for this service */
+	unsigned int			bc_req_max_size;
+	/* maximum reply size this service can ever send */
+	unsigned int			bc_rep_max_size;
+};
+
+struct ptlrpc_service_thr_conf {
+	/* thread name: should be 8 characters or fewer - 6 will be added on */
+	char				*tc_thr_name;
+	/* factor by which the number of threads increases for each CPU */
+	unsigned int			tc_thr_factor;
+	/* number of service threads to start on each partition at init time */
+	unsigned int			tc_nthrs_init;
+	/*
+	 * low-water mark of the per-partition upper limit on running threads;
+	 * service availability may be impacted if the number of threads is
+	 * lower than this value. It can be ZERO if the service doesn't require
+	 * CPU affinity or there is only one partition.
+	 */
+	unsigned int			tc_nthrs_base;
+	/* "soft" limit on the total number of threads */
+	unsigned int			tc_nthrs_max;
+	/* user-specified number of threads; it will be validated against the
+	 * other members of this structure. */
+	unsigned int			tc_nthrs_user;
+	/* set NUMA node affinity for service threads */
+	unsigned int			tc_cpu_affinity;
+	/* tags for the lu_context associated with a service thread */
+	__u32				tc_ctx_tags;
+};
+
+struct ptlrpc_service_cpt_conf {
+	struct cfs_cpt_table		*cc_cptable;
+	/* string pattern describing the CPTs (CPU partitions) for a service */
+	char				*cc_pattern;
+};
+
+struct ptlrpc_service_conf {
+	/* service name */
+	char				*psc_name;
+	/* soft watchdog timeout multiplier to print stuck service traces */
+	unsigned int			psc_watchdog_factor;
+	/* buffer information */
+	struct ptlrpc_service_buf_conf	psc_buf;
+	/* thread information */
+	struct ptlrpc_service_thr_conf	psc_thr;
+	/* CPU partition information */
+	struct ptlrpc_service_cpt_conf	psc_cpt;
+	/* function table */
+	struct ptlrpc_service_ops	psc_ops;
+};
+
+/* ptlrpc/service.c */
+/**
+ * Server-side services API. Register/unregister service, request state
+ * management, service thread management
+ *
+ * @{
+ */
+void ptlrpc_save_lock(struct ptlrpc_request *req,
+		      struct lustre_handle *lock, int mode, int no_ack);
+void ptlrpc_commit_replies(struct obd_export *exp);
+void ptlrpc_dispatch_difficult_reply(struct ptlrpc_reply_state *rs);
+void ptlrpc_schedule_difficult_reply(struct ptlrpc_reply_state *rs);
+int ptlrpc_hpreq_handler(struct ptlrpc_request *req);
+struct ptlrpc_service *ptlrpc_register_service(
+				struct ptlrpc_service_conf *conf,
+				struct proc_dir_entry *proc_entry);
+void ptlrpc_stop_all_threads(struct ptlrpc_service *svc);
+
+int ptlrpc_start_threads(struct ptlrpc_service *svc);
+int ptlrpc_unregister_service(struct ptlrpc_service *service);
+int liblustre_check_services(void *arg);
+void ptlrpc_daemonize(char *name);
+int ptlrpc_service_health_check(struct ptlrpc_service *);
+void ptlrpc_server_drop_request(struct ptlrpc_request *req);
+void ptlrpc_request_change_export(struct ptlrpc_request *req,
+				  struct obd_export *export);
+
+int ptlrpc_hr_init(void);
+void ptlrpc_hr_fini(void);
+
+/** @} */
+
+/* ptlrpc/import.c */
+/**
+ * Import API
+ * @{
+ */
+int ptlrpc_connect_import(struct obd_import *imp);
+int ptlrpc_init_import(struct obd_import *imp);
+int ptlrpc_disconnect_import(struct obd_import *imp, int noclose);
+int ptlrpc_import_recovery_state_machine(struct obd_import *imp);
+void deuuidify(char *uuid, const char *prefix, char **uuid_start,
+	       int *uuid_len);
+
+/* ptlrpc/pack_generic.c */
+int ptlrpc_reconnect_import(struct obd_import *imp);
+/** @} */
+
+/**
+ * ptlrpc msg buffer and swab interface
+ *
+ * @{
+ */
+int ptlrpc_buf_need_swab(struct ptlrpc_request *req, const int inout,
+			 int index);
+void ptlrpc_buf_set_swabbed(struct ptlrpc_request *req, const int inout,
+				int index);
+int ptlrpc_unpack_rep_msg(struct ptlrpc_request *req, int len);
+int ptlrpc_unpack_req_msg(struct ptlrpc_request *req, int len);
+
+int lustre_msg_check_version(struct lustre_msg *msg, __u32 version);
+void lustre_init_msg_v2(struct lustre_msg_v2 *msg, int count, __u32 *lens,
+			char **bufs);
+int lustre_pack_request(struct ptlrpc_request *, __u32 magic, int count,
+			__u32 *lens, char **bufs);
+int lustre_pack_reply(struct ptlrpc_request *, int count, __u32 *lens,
+		      char **bufs);
+int lustre_pack_reply_v2(struct ptlrpc_request *req, int count,
+			 __u32 *lens, char **bufs, int flags);
+#define LPRFL_EARLY_REPLY 1
+int lustre_pack_reply_flags(struct ptlrpc_request *, int count, __u32 *lens,
+			    char **bufs, int flags);
+int lustre_shrink_msg(struct lustre_msg *msg, int segment,
+		      unsigned int newlen, int move_data);
+void lustre_free_reply_state(struct ptlrpc_reply_state *rs);
+int __lustre_unpack_msg(struct lustre_msg *m, int len);
+int lustre_msg_hdr_size(__u32 magic, int count);
+int lustre_msg_size(__u32 magic, int count, __u32 *lengths);
+int lustre_msg_size_v2(int count, __u32 *lengths);
+int lustre_packed_msg_size(struct lustre_msg *msg);
+int lustre_msg_early_size(void);
+void *lustre_msg_buf_v2(struct lustre_msg_v2 *m, int n, int min_size);
+void *lustre_msg_buf(struct lustre_msg *m, int n, int minlen);
+int lustre_msg_buflen(struct lustre_msg *m, int n);
+void lustre_msg_set_buflen(struct lustre_msg *m, int n, int len);
+int lustre_msg_bufcount(struct lustre_msg *m);
+char *lustre_msg_string(struct lustre_msg *m, int n, int max_len);
+__u32 lustre_msghdr_get_flags(struct lustre_msg *msg);
+void lustre_msghdr_set_flags(struct lustre_msg *msg, __u32 flags);
+__u32 lustre_msg_get_flags(struct lustre_msg *msg);
+void lustre_msg_add_flags(struct lustre_msg *msg, int flags);
+void lustre_msg_set_flags(struct lustre_msg *msg, int flags);
+void lustre_msg_clear_flags(struct lustre_msg *msg, int flags);
+__u32 lustre_msg_get_op_flags(struct lustre_msg *msg);
+void lustre_msg_add_op_flags(struct lustre_msg *msg, int flags);
+void lustre_msg_set_op_flags(struct lustre_msg *msg, int flags);
+struct lustre_handle *lustre_msg_get_handle(struct lustre_msg *msg);
+__u32 lustre_msg_get_type(struct lustre_msg *msg);
+__u32 lustre_msg_get_version(struct lustre_msg *msg);
+void lustre_msg_add_version(struct lustre_msg *msg, int version);
+__u32 lustre_msg_get_opc(struct lustre_msg *msg);
+__u64 lustre_msg_get_last_xid(struct lustre_msg *msg);
+__u64 lustre_msg_get_last_committed(struct lustre_msg *msg);
+__u64 *lustre_msg_get_versions(struct lustre_msg *msg);
+__u64 lustre_msg_get_transno(struct lustre_msg *msg);
+__u64 lustre_msg_get_slv(struct lustre_msg *msg);
+__u32 lustre_msg_get_limit(struct lustre_msg *msg);
+void lustre_msg_set_slv(struct lustre_msg *msg, __u64 slv);
+void lustre_msg_set_limit(struct lustre_msg *msg, __u64 limit);
+int lustre_msg_get_status(struct lustre_msg *msg);
+__u32 lustre_msg_get_conn_cnt(struct lustre_msg *msg);
+int lustre_msg_is_v1(struct lustre_msg *msg);
+__u32 lustre_msg_get_magic(struct lustre_msg *msg);
+__u32 lustre_msg_get_timeout(struct lustre_msg *msg);
+__u32 lustre_msg_get_service_time(struct lustre_msg *msg);
+char *lustre_msg_get_jobid(struct lustre_msg *msg);
+__u32 lustre_msg_get_cksum(struct lustre_msg *msg);
+#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 7, 50, 0)
+__u32 lustre_msg_calc_cksum(struct lustre_msg *msg, int compat18);
+#else
+# warning "remove checksum compatibility support for b1_8"
+__u32 lustre_msg_calc_cksum(struct lustre_msg *msg);
+#endif
+void lustre_msg_set_handle(struct lustre_msg *msg,struct lustre_handle *handle);
+void lustre_msg_set_type(struct lustre_msg *msg, __u32 type);
+void lustre_msg_set_opc(struct lustre_msg *msg, __u32 opc);
+void lustre_msg_set_last_xid(struct lustre_msg *msg, __u64 last_xid);
+void lustre_msg_set_last_committed(struct lustre_msg *msg,__u64 last_committed);
+void lustre_msg_set_versions(struct lustre_msg *msg, __u64 *versions);
+void lustre_msg_set_transno(struct lustre_msg *msg, __u64 transno);
+void lustre_msg_set_status(struct lustre_msg *msg, __u32 status);
+void lustre_msg_set_conn_cnt(struct lustre_msg *msg, __u32 conn_cnt);
+void ptlrpc_req_set_repsize(struct ptlrpc_request *req, int count, __u32 *sizes);
+void ptlrpc_request_set_replen(struct ptlrpc_request *req);
+void lustre_msg_set_timeout(struct lustre_msg *msg, __u32 timeout);
+void lustre_msg_set_service_time(struct lustre_msg *msg, __u32 service_time);
+void lustre_msg_set_jobid(struct lustre_msg *msg, char *jobid);
+void lustre_msg_set_cksum(struct lustre_msg *msg, __u32 cksum);
+
+/** Run lustre_shrink_msg() on the reply message of \a req; store rq_replen */
+static inline void lustre_shrink_reply(struct ptlrpc_request *req, int segment,
+				       unsigned int newlen, int move_data)
+{
+	LASSERT(req->rq_reply_state);
+	LASSERT(req->rq_repmsg);
+	req->rq_replen = lustre_shrink_msg(req->rq_repmsg, segment,
+					   newlen, move_data);
+}
+/** @} */
+
+/** Change request phase of \a req to \a new_phase (no-op if already there) */
+static inline void
+ptlrpc_rqphase_move(struct ptlrpc_request *req, enum rq_phase new_phase)
+{
+	if (req->rq_phase == new_phase)
+		return;
+	/* entering UNREGISTERING: save the current phase, count it on the import */
+	if (new_phase == RQ_PHASE_UNREGISTERING) {
+		req->rq_next_phase = req->rq_phase;
+		if (req->rq_import)
+			atomic_inc(&req->rq_import->imp_unregistering);
+	}
+	/* leaving UNREGISTERING: undo the import's imp_unregistering count */
+	if (req->rq_phase == RQ_PHASE_UNREGISTERING) {
+		if (req->rq_import)
+			atomic_dec(&req->rq_import->imp_unregistering);
+	}
+
+	DEBUG_REQ(D_INFO, req, "move req \"%s\" -> \"%s\"",
+		  ptlrpc_rqphase2str(req), ptlrpc_phase2str(new_phase));
+
+	req->rq_phase = new_phase;
+}
+
+/**
+ * Returns true if request \a req got an early reply. Reports false while the
+ * LONG_REPL_UNLINK fail check fires and rq_reply_deadline is still ahead. */
+static inline int
+ptlrpc_client_early(struct ptlrpc_request *req)
+{
+	int held = OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) &&
+		   req->rq_reply_deadline > cfs_time_current_sec();
+
+	return held ? 0 : req->rq_early;
+}
+
+/**
+ * Returns true if we got a real reply from the server for \a req. Reports false
+ * while the LONG_REPL_UNLINK fail check fires before rq_reply_deadline. */
+static inline int
+ptlrpc_client_replied(struct ptlrpc_request *req)
+{
+	int held = OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) &&
+		   req->rq_reply_deadline > cfs_time_current_sec();
+
+	return held ? 0 : req->rq_replied;
+}
+
+/** Returns true if \a req is receiving its reply, or the fail check holds it */
+static inline int
+ptlrpc_client_recv(struct ptlrpc_request *req)
+{
+	int held = OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) &&
+		   req->rq_reply_deadline > cfs_time_current_sec();
+
+	return held ? 1 : req->rq_receiving_reply;
+}
+
+/**
+ * Returns true while \a req is receiving a reply or has rq_must_unlink set
+ * (both read under rq_lock), or while the LONG_REPL_UNLINK fail check holds. */
+static inline int
+ptlrpc_client_recv_or_unlink(struct ptlrpc_request *req)
+{
+	int rc = 1;
+
+	spin_lock(&req->rq_lock);
+	if (!(OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) &&
+	      req->rq_reply_deadline > cfs_time_current_sec()))
+		rc = req->rq_receiving_reply || req->rq_must_unlink;
+	spin_unlock(&req->rq_lock);
+	return rc;
+}
+
+/** Wake the set's set_waitq if \a req has an rq_set, else its rq_reply_waitq */
+static inline void ptlrpc_client_wake_req(struct ptlrpc_request *req)
+{
+	if (req->rq_set != NULL)
+		wake_up(&req->rq_set->set_waitq);
+	else
+		wake_up(&req->rq_reply_waitq);
+}
+
+/** Take a reference on \a rs; rs_refcount must already be positive (asserted) */
+static inline void ptlrpc_rs_addref(struct ptlrpc_reply_state *rs)
+{
+	LASSERT(atomic_read(&rs->rs_refcount) > 0);
+	atomic_inc(&rs->rs_refcount);
+}
+
+/** Drop a reference on \a rs; free the reply state when the count reaches 0 */
+static inline void ptlrpc_rs_decref(struct ptlrpc_reply_state *rs)
+{
+	LASSERT(atomic_read(&rs->rs_refcount) > 0);
+	if (atomic_dec_and_test(&rs->rs_refcount))
+		lustre_free_reply_state(rs);
+}
+
+/* Drop the reply state of \a req, if any. Should only be called once per req */
+static inline void ptlrpc_req_drop_rs(struct ptlrpc_request *req)
+{
+	if (req->rq_reply_state == NULL)
+		return; /* shouldn't occur */
+	ptlrpc_rs_decref(req->rq_reply_state);	/* release this req's reference */
+	req->rq_reply_state = NULL;
+	req->rq_repmsg = NULL;	/* reply message pointer is cleared as well */
+}
+
+static inline __u32 lustre_request_magic(struct ptlrpc_request *req)
+{
+	return lustre_msg_get_magic(req->rq_reqmsg);	/* magic of rq_reqmsg */
+}
+
+/** Return lm_repsize of the request message of \a req (V2 magic only) */
+static inline int ptlrpc_req_get_repsize(struct ptlrpc_request *req)
+{
+	if (req->rq_reqmsg->lm_magic == LUSTRE_MSG_MAGIC_V2)
+		return req->rq_reqmsg->lm_repsize;
+
+	/* any other magic is a bug: assert, then fail with -EFAULT */
+	LASSERTF(0, "incorrect message magic: %08x\n",
+		 req->rq_reqmsg->lm_magic);
+	return -EFAULT;
+}
+
+/**
+ * Returns 1 once rq_queued_time plus rq_delay_limit (via cfs_time_seconds())
+ * lies before the current time; always 0 when rq_delay_limit is zero. */
+static inline int ptlrpc_send_limit_expired(struct ptlrpc_request *req)
+{
+	return req->rq_delay_limit != 0 &&
+	       cfs_time_before(cfs_time_add(req->rq_queued_time,
+				cfs_time_seconds(req->rq_delay_limit)),
+			       cfs_time_current());
+}
+
+static inline int ptlrpc_no_resend(struct ptlrpc_request *req)
+{
+	if (!req->rq_no_resend && ptlrpc_send_limit_expired(req)) {
+		spin_lock(&req->rq_lock);	/* the flag is written under rq_lock */
+		req->rq_no_resend = 1;	/* set once the send limit has expired */
+		spin_unlock(&req->rq_lock);
+	}
+	return req->rq_no_resend;	/* NOTE(review): read without rq_lock */
+}
+
+/** srv_watchdog_factor times the larger of the AT estimate and obd_timeout */
+static inline int ptlrpc_server_get_timeout(struct ptlrpc_service_part *svcpt)
+{
+	int estimate = AT_OFF ? 0 : at_get(&svcpt->scp_at_estimate);
+
+	return svcpt->scp_service->srv_watchdog_factor *
+	       max_t(int, estimate, obd_timeout);
+}
+
+/** Return the service reached through rq_rqbd->rqbd_svcpt of \a req */
+static inline struct ptlrpc_service *ptlrpc_req2svc(struct ptlrpc_request *req)
+{
+	LASSERT(req->rq_rqbd != NULL);
+	return req->rq_rqbd->rqbd_svcpt->scp_service;
+}
+
+/* ldlm/ldlm_lib.c */
+/**
+ * Target client logic
+ * @{
+ */
+int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg);
+int client_obd_cleanup(struct obd_device *obddev);
+int client_connect_import(const struct lu_env *env,
+			  struct obd_export **exp, struct obd_device *obd,
+			  struct obd_uuid *cluuid, struct obd_connect_data *,
+			  void *localdata);
+int client_disconnect_export(struct obd_export *exp);
+int client_import_add_conn(struct obd_import *imp, struct obd_uuid *uuid,
+			   int priority);
+int client_import_del_conn(struct obd_import *imp, struct obd_uuid *uuid);
+int client_import_find_conn(struct obd_import *imp, lnet_nid_t peer,
+			    struct obd_uuid *uuid);
+int import_set_conn_priority(struct obd_import *imp, struct obd_uuid *uuid);
+void client_destroy_import(struct obd_import *imp);
+/** @} */
+
+
+/* ptlrpc/pinger.c */
+/**
+ * Pinger API (client side only)
+ * @{
+ */
+enum timeout_event {
+	TIMEOUT_GRANT = 1
+};
+struct timeout_item;
+typedef int (*timeout_cb_t)(struct timeout_item *, void *);
+int ptlrpc_pinger_add_import(struct obd_import *imp);
+int ptlrpc_pinger_del_import(struct obd_import *imp);
+int ptlrpc_add_timeout_client(int time, enum timeout_event event,
+			      timeout_cb_t cb, void *data,
+			      struct list_head *obd_list);
+int ptlrpc_del_timeout_client(struct list_head *obd_list,
+			      enum timeout_event event);
+struct ptlrpc_request * ptlrpc_prep_ping(struct obd_import *imp);
+int ptlrpc_obd_ping(struct obd_device *obd);
+cfs_time_t ptlrpc_suspend_wakeup_time(void);
+void ping_evictor_start(void);
+void ping_evictor_stop(void);
+int ptlrpc_check_and_wait_suspend(struct ptlrpc_request *req);
+void ptlrpc_pinger_ir_up(void);
+void ptlrpc_pinger_ir_down(void);
+/** @} */
+int ptlrpc_pinger_suppress_pings(void);
+
+/* ptlrpc daemon bind policy */
+typedef enum {
+	/* all ptlrpcd threads are in free mode */
+	PDB_POLICY_NONE	  = 1,
+	/* all ptlrpcd threads are in bound mode */
+	PDB_POLICY_FULL	  = 2,
+	/* <free1 bound1> <free2 bound2> ... <freeN boundN> */
+	PDB_POLICY_PAIR	  = 3,
+	/* <free1 bound1> <bound1 free2> ... <freeN boundN> <boundN free1>,
+	 * means each ptlrpcd[X] has two partners: thread[X-1] and thread[X+1].
+	 * If the kernel supports NUMA, ptlrpcd threads are bound and
+	 * grouped by NUMA node */
+	PDB_POLICY_NEIGHBOR      = 4,
+} pdb_policy_t;
+
+/* ptlrpc daemon load policy
+ * It is the caller's duty to specify how to push the async RPC into some
+ * ptlrpcd queue, but this is not enforced: it is affected by
+ * "ptlrpcd_bind_policy". With "PDB_POLICY_FULL" the RPC is processed by the
+ * selected ptlrpcd; otherwise it is processed by the selected ptlrpcd or its
+ * partner, whichever is scheduled first, to accelerate the RPC processing. */
+typedef enum {
+	/* on the same CPU core as the caller */
+	PDL_POLICY_SAME	 = 1,
+	/* within the same CPU partition, but not the same core as the caller */
+	PDL_POLICY_LOCAL	= 2,
+	/* round-robin on all CPU cores, but not the same core as the caller */
+	PDL_POLICY_ROUND	= 3,
+	/* the specified CPU core is preferred, but not enforced */
+	PDL_POLICY_PREFERRED    = 4,
+} pdl_policy_t;
+
+/* ptlrpc/ptlrpcd.c */
+void ptlrpcd_stop(struct ptlrpcd_ctl *pc, int force);
+void ptlrpcd_free(struct ptlrpcd_ctl *pc);
+void ptlrpcd_wake(struct ptlrpc_request *req);
+void ptlrpcd_add_req(struct ptlrpc_request *req, pdl_policy_t policy, int idx);
+void ptlrpcd_add_rqset(struct ptlrpc_request_set *set);
+int ptlrpcd_addref(void);
+void ptlrpcd_decref(void);
+
+/* ptlrpc/lproc_ptlrpc.c */
+/**
+ * procfs output related functions
+ * @{
+ */
+const char* ll_opcode2str(__u32 opcode);
+#ifdef LPROCFS
+void ptlrpc_lprocfs_register_obd(struct obd_device *obd);
+void ptlrpc_lprocfs_unregister_obd(struct obd_device *obd);
+void ptlrpc_lprocfs_brw(struct ptlrpc_request *req, int bytes);
+#else
+static inline void ptlrpc_lprocfs_register_obd(struct obd_device *obd) {}
+static inline void ptlrpc_lprocfs_unregister_obd(struct obd_device *obd) {}
+static inline void ptlrpc_lprocfs_brw(struct ptlrpc_request *req, int bytes) {}
+#endif
+/** @} */
+
+/* ptlrpc/llog_server.c */
+int llog_origin_handle_open(struct ptlrpc_request *req);
+int llog_origin_handle_destroy(struct ptlrpc_request *req);
+int llog_origin_handle_prev_block(struct ptlrpc_request *req);
+int llog_origin_handle_next_block(struct ptlrpc_request *req);
+int llog_origin_handle_read_header(struct ptlrpc_request *req);
+int llog_origin_handle_close(struct ptlrpc_request *req);
+int llog_origin_handle_cancel(struct ptlrpc_request *req);
+
+/* ptlrpc/llog_client.c */
+extern struct llog_operations llog_client_ops;
+
+/** @} net */
+
+#endif
+/** @} PtlRPC */

diff --git a/drivers/staging/lustre/lustre/include/lustre_param.h b/drivers/staging/lustre/lustre/include/lustre_param.h
new file mode 100644
index 0000000..ed65468
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_param.h

@@ -0,0 +1,121 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre_param.h
+ *
+ * User-settable parameter keys
+ *
+ * Author: Nathan Rutman <nathan@clusterfs.com>
+ */
+
+#ifndef _LUSTRE_PARAM_H
+#define _LUSTRE_PARAM_H
+
+/** \defgroup param param
+ *
+ * @{
+ */
+
+/* For interoperability: pairs an old parameter string with a new one */
+struct cfg_interop_param {
+	char *old_param;	/* presumably the legacy parameter name -- confirm */
+	char *new_param;	/* presumably its replacement -- confirm */
+};
+
+/* obd_config.c */
+int class_find_param(char *buf, char *key, char **valp);
+struct cfg_interop_param *class_find_old_param(const char *param,
+					       struct cfg_interop_param *ptr);
+int class_get_next_param(char **params, char *copy);
+int class_match_param(char *buf, char *key, char **valp);
+int class_parse_nid(char *buf, lnet_nid_t *nid, char **endh);
+int class_parse_nid_quiet(char *buf, lnet_nid_t *nid, char **endh);
+int class_parse_net(char *buf, __u32 *net, char **endh);
+int class_match_nid(char *buf, char *key, lnet_nid_t nid);
+int class_match_net(char *buf, char *key, __u32 net);
+/* obd_mount.c */
+int do_lcfg(char *cfgname, lnet_nid_t nid, int cmd,
+	    char *s1, char *s2, char *s3, char *s4);
+
+
+
+/****************** User-settable parameter keys *********************/
+/* e.g.
+	tunefs.lustre --param="failover.node=192.168.0.13@tcp0" /dev/sda
+	lctl conf_param testfs-OST0000 failover.node=3@elan,192.168.0.3@tcp0
+		    ... testfs-MDT0000.lov.stripesize=4M
+		    ... testfs-OST0000.ost.client_cache_seconds=15
+		    ... testfs.sys.timeout=<secs>
+		    ... testfs.llite.max_read_ahead_mb=16
+*/
+
+/* System global or special params not handled in obd's proc
+ * See mgs_write_log_sys()
+ */
+#define PARAM_TIMEOUT	      "timeout="	  /* global */
+#define PARAM_LDLM_TIMEOUT	 "ldlm_timeout="     /* global */
+#define PARAM_AT_MIN	       "at_min="	   /* global */
+#define PARAM_AT_MAX	       "at_max="	   /* global */
+#define PARAM_AT_EXTRA	     "at_extra="	 /* global */
+#define PARAM_AT_EARLY_MARGIN      "at_early_margin="  /* global */
+#define PARAM_AT_HISTORY	   "at_history="       /* global */
+#define PARAM_JOBID_VAR		   "jobid_var="	       /* global */
+#define PARAM_MGSNODE	      "mgsnode="	  /* only at mounttime */
+#define PARAM_FAILNODE	     "failover.node="    /* add failover nid */
+#define PARAM_FAILMODE	     "failover.mode="    /* initial mount only */
+#define PARAM_ACTIVE	       "active="	   /* activate/deactivate */
+#define PARAM_NETWORK	      "network="	  /* bind on nid */
+#define PARAM_ID_UPCALL		"identity_upcall="  /* identity upcall */
+
+/* Prefixes for parameters handled by obd's proc methods (XXX_process_config) */
+#define PARAM_OST		  "ost."
+#define PARAM_OSC		  "osc."
+#define PARAM_MDT		  "mdt."
+#define PARAM_MDD		  "mdd."
+#define PARAM_MDC		  "mdc."
+#define PARAM_LLITE		"llite."
+#define PARAM_LOV		  "lov."
+#define PARAM_LOD		"lod."
+#define PARAM_OSP		"osp."
+#define PARAM_SYS		  "sys."	      /* global */
+#define PARAM_SRPC		 "srpc."
+#define PARAM_SRPC_FLVR	    "srpc.flavor."
+#define PARAM_SRPC_UDESC	   "srpc.udesc.cli2mdt"
+#define PARAM_SEC		  "security."
+#define PARAM_QUOTA		"quota."	    /* global */
+
+/** @} param */
+
+#endif /* _LUSTRE_PARAM_H */

diff --git a/drivers/staging/lustre/lustre/include/lustre_quota.h b/drivers/staging/lustre/lustre/include/lustre_quota.h
new file mode 100644
index 0000000..1c3041f
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_quota.h

@@ -0,0 +1,239 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ * Use is subject to license terms.
+ */
+
+#ifndef _LUSTRE_QUOTA_H
+#define _LUSTRE_QUOTA_H
+
+/** \defgroup quota quota
+ * @{
+ */
+
+#include <linux/lustre_quota.h>
+
+#include <dt_object.h>
+#include <lustre_fid.h>
+#include <lustre_dlm.h>
+
+#ifndef MAX_IQ_TIME
+#define MAX_IQ_TIME  604800     /* (7*24*60*60) 1 week */
+#endif
+
+#ifndef MAX_DQ_TIME
+#define MAX_DQ_TIME  604800     /* (7*24*60*60) 1 week */
+#endif
+
+struct lquota_id_info;
+struct lquota_trans;
+
+/* Gather all quota record types in a union that can be used to read any record
+ * from disk. All fields of these records must be 64-bit aligned, otherwise the
+ * OSD layer may swab them incorrectly. */
+union lquota_rec {
+	struct lquota_glb_rec	lqr_glb_rec;
+	struct lquota_slv_rec	lqr_slv_rec;
+	struct lquota_acct_rec	lqr_acct_rec;
+};
+
+/* Index features supported by the global index objects
+ * Only used for migration purpose and should be removed once on-disk migration
+ * is no longer needed */
+extern struct dt_index_features dt_quota_iusr_features;
+extern struct dt_index_features dt_quota_busr_features;
+extern struct dt_index_features dt_quota_igrp_features;
+extern struct dt_index_features dt_quota_bgrp_features;
+
+/* Name used in the configuration logs to identify the default metadata pool
+ * (composed of all the MDTs, with pool ID 0) and the default data pool (all
+ * the OSTs, with pool ID 0 too). */
+#define QUOTA_METAPOOL_NAME   "mdt="
+#define QUOTA_DATAPOOL_NAME   "ost="
+
+/*
+ * Quota Master Target support
+ */
+
+/* Request handlers for quota master operations.
+ * This is used by the MDT to pass quota/lock requests to the quota master
+ * target. This won't be needed any more once the QMT is a real target and
+ * does not rely any more on the MDT service threads and namespace. */
+struct qmt_handlers {
+	/* Handle quotactl request from client. */
+	int (*qmth_quotactl)(const struct lu_env *, struct lu_device *,
+			     struct obd_quotactl *);
+
+	/* Handle dqacq/dqrel request from slave. */
+	int (*qmth_dqacq)(const struct lu_env *, struct lu_device *,
+			  struct ptlrpc_request *);
+
+	/* LDLM intent policy associated with quota locks */
+	int (*qmth_intent_policy)(const struct lu_env *, struct lu_device *,
+				  struct ptlrpc_request *, struct ldlm_lock **,
+				  int);
+
+	/* Initialize LVB of ldlm resource associated with quota objects */
+	int (*qmth_lvbo_init)(struct lu_device *, struct ldlm_resource *);
+
+	/* Update LVB of ldlm resource associated with quota objects */
+	int (*qmth_lvbo_update)(struct lu_device *, struct ldlm_resource *,
+				struct ptlrpc_request *, int);
+
+	/* Return size of LVB to be packed in ldlm message */
+	int (*qmth_lvbo_size)(struct lu_device *, struct ldlm_lock *);
+
+	/* Fill request buffer with lvb */
+	int (*qmth_lvbo_fill)(struct lu_device *, struct ldlm_lock *, void *,
+			      int);
+
+	/* Free lvb associated with ldlm resource */
+	int (*qmth_lvbo_free)(struct lu_device *, struct ldlm_resource *);
+};
+
+/* actual handlers are defined in lustre/quota/qmt_handler.c */
+extern struct qmt_handlers qmt_hdls;
+
+/*
+ * Quota enforcement support on slaves
+ */
+
+struct qsd_instance;
+
+/* The quota slave feature is implemented under the form of a library.
+ * The API is the following:
+ *
+ * - qsd_init(): the user (mostly the OSD layer) should first allocate a qsd
+ *	       instance via qsd_init(). This creates all required structures
+ *	       to manage quota enforcement for this target and performs all
+ *	       low-level initialization which does not involve any lustre
+ *	       object. qsd_init() should typically be called when the OSD
+ *	       is being set up.
+ *
+ * - qsd_prepare(): This sets up on-disk objects associated with the quota slave
+ *		  feature and initiates the quota reintegration procedure if
+ *		  needed. qsd_prepare() should typically be called when
+ *		  ->ldo_prepare is invoked.
+ *
+ * - qsd_start(): a qsd instance should be started once recovery is completed
+ *		(i.e. when ->ldo_recovery_complete is called). This is used
+ *		to notify the qsd layer that quota should now be enforced
+ *		again via the qsd_op_begin/end functions. The last step of the
+ *		reintegration procedure (namely usage reconciliation) will be
+ *		completed during start.
+ *
+ * - qsd_fini(): is used to release a qsd_instance structure allocated with
+ *	       qsd_init(). This releases all quota slave objects and frees the
+ *	       structures associated with the qsd_instance.
+ *
+ * - qsd_op_begin(): is used to enforce quota, it must be called in the
+ *		   declaration of each operation. qsd_op_end() should then be
+ *		   invoked later once all operations have been completed in
+ *		   order to release/adjust the quota space.
+ *		   Running qsd_op_begin() before qsd_start() isn't fatal and
+ *		   will return success.
+ *		   Once qsd_start() has been run, qsd_op_begin() will block
+ *		   until the reintegration procedure is completed.
+ *
+ * - qsd_op_end(): performs the post operation quota processing. This must be
+ *		 called after the operation transaction stopped.
+ *		 While qsd_op_begin() must be invoked each time a new
+ *		 operation is declared, qsd_op_end() should be called only
+ *		 once for the whole transaction.
+ *
+ * - qsd_op_adjust(): triggers pre-acquire/release if necessary.
+ *
+ * Below are the function prototypes to be used by OSD layer to manage quota
+ * enforcement. Arguments are documented where each function is defined.  */
+
+struct qsd_instance *qsd_init(const struct lu_env *, char *, struct dt_device *,
+			      proc_dir_entry_t *);
+int qsd_prepare(const struct lu_env *, struct qsd_instance *);
+int qsd_start(const struct lu_env *, struct qsd_instance *);
+void qsd_fini(const struct lu_env *, struct qsd_instance *);
+int qsd_op_begin(const struct lu_env *, struct qsd_instance *,
+		 struct lquota_trans *, struct lquota_id_info *, int *);
+void qsd_op_end(const struct lu_env *, struct qsd_instance *,
+		struct lquota_trans *);
+void qsd_op_adjust(const struct lu_env *, struct qsd_instance *,
+		   union lquota_id *, int);
+/* This is exported for the ldiskfs quota migration only,
+ * see convert_quota_file() */
+int lquota_disk_write_glb(const struct lu_env *, struct dt_object *,
+			  __u64, struct lquota_glb_rec *);
+
+/*
+ * Quota information attached to a transaction
+ */
+
+struct lquota_entry;
+
+struct lquota_id_info {
+	/* quota identifier */
+	union lquota_id		 lqi_id;
+
+	/* USRQUOTA or GRPQUOTA for now, could be expanded for
+	 * directory quota or other types later.  */
+	int			 lqi_type;
+
+	/* inodes or kbytes to be consumed or released, it could
+	 * be negative when releasing space.  */
+	long long		 lqi_space;
+
+	/* quota slave entry structure associated with this ID */
+	struct lquota_entry	*lqi_qentry;
+
+	/* whether we are reporting blocks or inodes */
+	bool			 lqi_is_blk;
+};
+
+/* Since we enforce only inode quota in meta pool (MDTs), and block quota in
+ * data pool (OSTs), there are at most 4 quota ids being enforced in a single
+ * transaction, which is chown transaction:
+ * original uid and gid, new uid and gid.
+ *
+ * This value might need to be revised when directory quota is added.  */
+#define QUOTA_MAX_TRANSIDS    4
+
+/* all qids involved in a single transaction */
+struct lquota_trans {
+	unsigned short		lqt_id_cnt;
+	struct lquota_id_info	lqt_ids[QUOTA_MAX_TRANSIDS];
+};
+
+/* flags for quota local enforcement */
+#define QUOTA_FL_OVER_USRQUOTA  0x01
+#define QUOTA_FL_OVER_GRPQUOTA  0x02
+#define QUOTA_FL_SYNC	   0x04
+/* True if the name of \a res carries a quota FID sequence (res is used twice) */
+#define IS_LQUOTA_RES(res)						\
+	((res)->lr_name.name[LUSTRE_RES_ID_SEQ_OFF] == FID_SEQ_QUOTA ||	\
+	 (res)->lr_name.name[LUSTRE_RES_ID_SEQ_OFF] == FID_SEQ_QUOTA_GLB)
+
+/* helper function used by MDT & OFD to retrieve quota accounting information
+ * on slave */
+int lquotactl_slv(const struct lu_env *, struct dt_device *,
+		  struct obd_quotactl *);
+/** @} quota */
+#endif /* _LUSTRE_QUOTA_H */

diff --git a/drivers/staging/lustre/lustre/include/lustre_req_layout.h b/drivers/staging/lustre/lustre/include/lustre_req_layout.h
new file mode 100644
index 0000000..f4d3820
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_req_layout.h

@@ -0,0 +1,334 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre_req_layout.h
+ *
+ * Lustre Metadata Target (mdt) request handler
+ *
+ * Author: Nikita Danilov <nikita@clusterfs.com>
+ */
+
+#ifndef _LUSTRE_REQ_LAYOUT_H__
+#define _LUSTRE_REQ_LAYOUT_H__
+
+/** \defgroup req_layout req_layout
+ *
+ * @{
+ */
+
+struct req_msg_field;
+struct req_format;
+struct req_capsule;
+
+struct ptlrpc_request;
+
+enum req_location {
+	RCL_CLIENT,
+	RCL_SERVER,
+	RCL_NR
+};
+
+/* Maximal number of fields (buffers) in a request message. */
+#define REQ_MAX_FIELD_NR  9
+
+struct req_capsule {
+	struct ptlrpc_request   *rc_req;
+	const struct req_format *rc_fmt;
+	enum req_location	rc_loc;
+	__u32		    rc_area[RCL_NR][REQ_MAX_FIELD_NR];
+};
+
+#if !defined(__REQ_LAYOUT_USER__)
+
+/* struct ptlrpc_request, lustre_msg* */
+#include <lustre_net.h>
+
+void req_capsule_init(struct req_capsule *pill, struct ptlrpc_request *req,
+		      enum req_location location);
+void req_capsule_fini(struct req_capsule *pill);
+
+void req_capsule_set(struct req_capsule *pill, const struct req_format *fmt);
+void req_capsule_client_dump(struct req_capsule *pill);
+void req_capsule_server_dump(struct req_capsule *pill);
+void req_capsule_init_area(struct req_capsule *pill);
+int req_capsule_filled_sizes(struct req_capsule *pill, enum req_location loc);
+int  req_capsule_server_pack(struct req_capsule *pill);
+
+void *req_capsule_client_get(struct req_capsule *pill,
+			     const struct req_msg_field *field);
+void *req_capsule_client_swab_get(struct req_capsule *pill,
+				  const struct req_msg_field *field,
+				  void *swabber);
+void *req_capsule_client_sized_get(struct req_capsule *pill,
+				   const struct req_msg_field *field,
+				   int len);
+void *req_capsule_server_get(struct req_capsule *pill,
+			     const struct req_msg_field *field);
+void *req_capsule_server_sized_get(struct req_capsule *pill,
+				   const struct req_msg_field *field,
+				   int len);
+void *req_capsule_server_swab_get(struct req_capsule *pill,
+				  const struct req_msg_field *field,
+				  void *swabber);
+void *req_capsule_server_sized_swab_get(struct req_capsule *pill,
+					const struct req_msg_field *field,
+					int len, void *swabber);
+const void *req_capsule_other_get(struct req_capsule *pill,
+				  const struct req_msg_field *field);
+
+void req_capsule_set_size(struct req_capsule *pill,
+			  const struct req_msg_field *field,
+			  enum req_location loc, int size);
+int req_capsule_get_size(const struct req_capsule *pill,
+			  const struct req_msg_field *field,
+			  enum req_location loc);
+int req_capsule_msg_size(struct req_capsule *pill, enum req_location loc);
+int req_capsule_fmt_size(__u32 magic, const struct req_format *fmt,
+			 enum req_location loc);
+void req_capsule_extend(struct req_capsule *pill, const struct req_format *fmt);
+
+int req_capsule_has_field(const struct req_capsule *pill,
+			  const struct req_msg_field *field,
+			  enum req_location loc);
+int req_capsule_field_present(const struct req_capsule *pill,
+			      const struct req_msg_field *field,
+			      enum req_location loc);
+void req_capsule_shrink(struct req_capsule *pill,
+			const struct req_msg_field *field,
+			unsigned int newlen,
+			enum req_location loc);
+int req_capsule_server_grow(struct req_capsule *pill,
+			    const struct req_msg_field *field,
+			    unsigned int newlen);
+int  req_layout_init(void);
+void req_layout_fini(void);
+
+/* __REQ_LAYOUT_USER__ */
+#endif
+
+extern struct req_format RQF_OBD_PING;
+extern struct req_format RQF_OBD_SET_INFO;
+extern struct req_format RQF_SEC_CTX;
+extern struct req_format RQF_OBD_IDX_READ;
+/* MGS req_format */
+extern struct req_format RQF_MGS_TARGET_REG;
+extern struct req_format RQF_MGS_SET_INFO;
+extern struct req_format RQF_MGS_CONFIG_READ;
+/* fid/fld req_format */
+extern struct req_format RQF_SEQ_QUERY;
+extern struct req_format RQF_FLD_QUERY;
+/* MDS req_format */
+extern struct req_format RQF_MDS_CONNECT;
+extern struct req_format RQF_MDS_DISCONNECT;
+extern struct req_format RQF_MDS_STATFS;
+extern struct req_format RQF_MDS_GETSTATUS;
+extern struct req_format RQF_MDS_SYNC;
+extern struct req_format RQF_MDS_GETXATTR;
+extern struct req_format RQF_MDS_GETATTR;
+extern struct req_format RQF_UPDATE_OBJ;
+
+/*
+ * This is format of direct (non-intent) MDS_GETATTR_NAME request.
+ */
+extern struct req_format RQF_MDS_GETATTR_NAME;
+extern struct req_format RQF_MDS_CLOSE;
+extern struct req_format RQF_MDS_PIN;
+extern struct req_format RQF_MDS_UNPIN;
+extern struct req_format RQF_MDS_CONNECT;
+extern struct req_format RQF_MDS_DISCONNECT;
+extern struct req_format RQF_MDS_GET_INFO;
+extern struct req_format RQF_MDS_READPAGE;
+extern struct req_format RQF_MDS_WRITEPAGE;
+extern struct req_format RQF_MDS_IS_SUBDIR;
+extern struct req_format RQF_MDS_DONE_WRITING;
+extern struct req_format RQF_MDS_REINT;
+extern struct req_format RQF_MDS_REINT_CREATE;
+extern struct req_format RQF_MDS_REINT_CREATE_RMT_ACL;
+extern struct req_format RQF_MDS_REINT_CREATE_SLAVE;
+extern struct req_format RQF_MDS_REINT_CREATE_SYM;
+extern struct req_format RQF_MDS_REINT_OPEN;
+extern struct req_format RQF_MDS_REINT_UNLINK;
+extern struct req_format RQF_MDS_REINT_LINK;
+extern struct req_format RQF_MDS_REINT_RENAME;
+extern struct req_format RQF_MDS_REINT_SETATTR;
+extern struct req_format RQF_MDS_REINT_SETXATTR;
+extern struct req_format RQF_MDS_QUOTACHECK;
+extern struct req_format RQF_MDS_QUOTACTL;
+extern struct req_format RQF_QC_CALLBACK;
+extern struct req_format RQF_QUOTA_DQACQ;
+extern struct req_format RQF_MDS_SWAP_LAYOUTS;
+/* MDS hsm formats */
+extern struct req_format RQF_MDS_HSM_STATE_GET;
+extern struct req_format RQF_MDS_HSM_STATE_SET;
+extern struct req_format RQF_MDS_HSM_ACTION;
+extern struct req_format RQF_MDS_HSM_PROGRESS;
+extern struct req_format RQF_MDS_HSM_CT_REGISTER;
+extern struct req_format RQF_MDS_HSM_CT_UNREGISTER;
+extern struct req_format RQF_MDS_HSM_REQUEST;
+/* OST req_format */
+extern struct req_format RQF_OST_CONNECT;
+extern struct req_format RQF_OST_DISCONNECT;
+extern struct req_format RQF_OST_QUOTACHECK;
+extern struct req_format RQF_OST_QUOTACTL;
+extern struct req_format RQF_OST_GETATTR;
+extern struct req_format RQF_OST_SETATTR;
+extern struct req_format RQF_OST_CREATE;
+extern struct req_format RQF_OST_PUNCH;
+extern struct req_format RQF_OST_SYNC;
+extern struct req_format RQF_OST_DESTROY;
+extern struct req_format RQF_OST_BRW_READ;
+extern struct req_format RQF_OST_BRW_WRITE;
+extern struct req_format RQF_OST_STATFS;
+extern struct req_format RQF_OST_SET_GRANT_INFO;
+extern struct req_format RQF_OST_GET_INFO_GENERIC;
+extern struct req_format RQF_OST_GET_INFO_LAST_ID;
+extern struct req_format RQF_OST_GET_INFO_LAST_FID;
+extern struct req_format RQF_OST_SET_INFO_LAST_FID;
+extern struct req_format RQF_OST_GET_INFO_FIEMAP;
+
+/* LDLM req_format */
+extern struct req_format RQF_LDLM_ENQUEUE;
+extern struct req_format RQF_LDLM_ENQUEUE_LVB;
+extern struct req_format RQF_LDLM_CONVERT;
+extern struct req_format RQF_LDLM_INTENT;
+extern struct req_format RQF_LDLM_INTENT_BASIC;
+extern struct req_format RQF_LDLM_INTENT_LAYOUT;
+extern struct req_format RQF_LDLM_INTENT_GETATTR;
+extern struct req_format RQF_LDLM_INTENT_OPEN;
+extern struct req_format RQF_LDLM_INTENT_CREATE;
+extern struct req_format RQF_LDLM_INTENT_UNLINK;
+extern struct req_format RQF_LDLM_INTENT_QUOTA;
+extern struct req_format RQF_LDLM_CANCEL;
+extern struct req_format RQF_LDLM_CALLBACK;
+extern struct req_format RQF_LDLM_CP_CALLBACK;
+extern struct req_format RQF_LDLM_BL_CALLBACK;
+extern struct req_format RQF_LDLM_GL_CALLBACK;
+extern struct req_format RQF_LDLM_GL_DESC_CALLBACK;
+/* LOG req_format */
+extern struct req_format RQF_LOG_CANCEL;
+extern struct req_format RQF_LLOG_ORIGIN_HANDLE_CREATE;
+extern struct req_format RQF_LLOG_ORIGIN_HANDLE_DESTROY;
+extern struct req_format RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK;
+extern struct req_format RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK;
+extern struct req_format RQF_LLOG_ORIGIN_HANDLE_READ_HEADER;
+extern struct req_format RQF_LLOG_ORIGIN_CONNECT;
+
+extern struct req_msg_field RMF_GENERIC_DATA;
+extern struct req_msg_field RMF_PTLRPC_BODY;
+extern struct req_msg_field RMF_MDT_BODY;
+extern struct req_msg_field RMF_MDT_EPOCH;
+extern struct req_msg_field RMF_OBD_STATFS;
+extern struct req_msg_field RMF_NAME;
+extern struct req_msg_field RMF_SYMTGT;
+extern struct req_msg_field RMF_TGTUUID;
+extern struct req_msg_field RMF_CLUUID;
+extern struct req_msg_field RMF_SETINFO_VAL;
+extern struct req_msg_field RMF_SETINFO_KEY;
+extern struct req_msg_field RMF_GETINFO_VAL;
+extern struct req_msg_field RMF_GETINFO_VALLEN;
+extern struct req_msg_field RMF_GETINFO_KEY;
+extern struct req_msg_field RMF_IDX_INFO;
+
+/*
+ * connection handle received in MDS_CONNECT request.
+ */
+extern struct req_msg_field RMF_CONN;
+extern struct req_msg_field RMF_CONNECT_DATA;
+extern struct req_msg_field RMF_DLM_REQ;
+extern struct req_msg_field RMF_DLM_REP;
+extern struct req_msg_field RMF_DLM_LVB;
+extern struct req_msg_field RMF_DLM_GL_DESC;
+extern struct req_msg_field RMF_LDLM_INTENT;
+extern struct req_msg_field RMF_LAYOUT_INTENT;
+extern struct req_msg_field RMF_MDT_MD;
+extern struct req_msg_field RMF_REC_REINT;
+extern struct req_msg_field RMF_EADATA;
+extern struct req_msg_field RMF_ACL;
+extern struct req_msg_field RMF_LOGCOOKIES;
+extern struct req_msg_field RMF_CAPA1;
+extern struct req_msg_field RMF_CAPA2;
+extern struct req_msg_field RMF_OBD_QUOTACHECK;
+extern struct req_msg_field RMF_OBD_QUOTACTL;
+extern struct req_msg_field RMF_QUOTA_BODY;
+extern struct req_msg_field RMF_STRING;
+extern struct req_msg_field RMF_SWAP_LAYOUTS;
+extern struct req_msg_field RMF_MDS_HSM_PROGRESS;
+extern struct req_msg_field RMF_MDS_HSM_REQUEST;
+extern struct req_msg_field RMF_MDS_HSM_USER_ITEM;
+extern struct req_msg_field RMF_MDS_HSM_ARCHIVE;
+extern struct req_msg_field RMF_HSM_USER_STATE;
+extern struct req_msg_field RMF_HSM_STATE_SET;
+extern struct req_msg_field RMF_MDS_HSM_CURRENT_ACTION;
+extern struct req_msg_field RMF_MDS_HSM_REQUEST;
+
+/* seq-mgr fields */
+extern struct req_msg_field RMF_SEQ_OPC;
+extern struct req_msg_field RMF_SEQ_RANGE;
+extern struct req_msg_field RMF_FID_SPACE;
+
+/* FLD fields */
+extern struct req_msg_field RMF_FLD_OPC;
+extern struct req_msg_field RMF_FLD_MDFLD;
+
+extern struct req_msg_field RMF_LLOGD_BODY;
+extern struct req_msg_field RMF_LLOG_LOG_HDR;
+extern struct req_msg_field RMF_LLOGD_CONN_BODY;
+
+extern struct req_msg_field RMF_MGS_TARGET_INFO;
+extern struct req_msg_field RMF_MGS_SEND_PARAM;
+
+extern struct req_msg_field RMF_OST_BODY;
+extern struct req_msg_field RMF_OBD_IOOBJ;
+extern struct req_msg_field RMF_OBD_ID;
+extern struct req_msg_field RMF_FID;
+extern struct req_msg_field RMF_NIOBUF_REMOTE;
+extern struct req_msg_field RMF_RCS;
+extern struct req_msg_field RMF_FIEMAP_KEY;
+extern struct req_msg_field RMF_FIEMAP_VAL;
+extern struct req_msg_field RMF_OST_ID;
+
+/* MGS config read message format */
+extern struct req_msg_field RMF_MGS_CONFIG_BODY;
+extern struct req_msg_field RMF_MGS_CONFIG_RES;
+
+/* generic uint32 */
+extern struct req_msg_field RMF_U32;
+
+/* OBJ update format */
+extern struct req_msg_field RMF_UPDATE;
+extern struct req_msg_field RMF_UPDATE_REPLY;
+/** @} req_layout */
+
+#endif /* _LUSTRE_REQ_LAYOUT_H__ */

diff --git a/drivers/staging/lustre/lustre/include/lustre_sec.h b/drivers/staging/lustre/lustre/include/lustre_sec.h
new file mode 100644
index 0000000..9e0908e
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_sec.h

@@ -0,0 +1,1145 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LUSTRE_SEC_H_
+#define _LUSTRE_SEC_H_
+
+/** \defgroup sptlrpc sptlrpc
+ *
+ * @{
+ */
+
+/*
+ * to avoid include
+ */
+struct obd_import;
+struct obd_export;
+struct ptlrpc_request;
+struct ptlrpc_reply_state;
+struct ptlrpc_bulk_desc;
+struct brw_page;
+/* Linux specific */
+struct key;
+struct seq_file;
+
+/*
+ * forward declaration
+ */
+struct ptlrpc_sec_policy;
+struct ptlrpc_sec_cops;
+struct ptlrpc_sec_sops;
+struct ptlrpc_sec;
+struct ptlrpc_svc_ctx;
+struct ptlrpc_cli_ctx;
+struct ptlrpc_ctx_ops;
+
+/**
+ * \addtogroup flavor flavor
+ *
+ * RPC flavor is represented by a 32-bit integer. Currently the high 12 bits
+ * are unused, must be set to 0 for future expansion.
+ * <pre>
+ * ------------------------------------------------------------------------
+ * | 4b (bulk svc) | 4b (bulk type) | 4b (svc) | 4b (mech)  | 4b (policy) |
+ * ------------------------------------------------------------------------
+ * </pre>
+ *
+ * @{
+ */
+
+/*
+ * flavor constants
+ */
+enum sptlrpc_policy {
+	SPTLRPC_POLICY_NULL	     = 0,
+	SPTLRPC_POLICY_PLAIN	    = 1,
+	SPTLRPC_POLICY_GSS	      = 2,
+	SPTLRPC_POLICY_MAX,
+};
+
+enum sptlrpc_mech_null {
+	SPTLRPC_MECH_NULL	       = 0,
+	SPTLRPC_MECH_NULL_MAX,
+};
+
+enum sptlrpc_mech_plain {
+	SPTLRPC_MECH_PLAIN	      = 0,
+	SPTLRPC_MECH_PLAIN_MAX,
+};
+
+enum sptlrpc_mech_gss {
+	SPTLRPC_MECH_GSS_NULL	   = 0,
+	SPTLRPC_MECH_GSS_KRB5	   = 1,
+	SPTLRPC_MECH_GSS_MAX,
+};
+
+enum sptlrpc_service_type {
+	SPTLRPC_SVC_NULL		= 0,    /**< no security */
+	SPTLRPC_SVC_AUTH		= 1,    /**< authentication only */
+	SPTLRPC_SVC_INTG		= 2,    /**< integrity */
+	SPTLRPC_SVC_PRIV		= 3,    /**< privacy */
+	SPTLRPC_SVC_MAX,
+};
+
+enum sptlrpc_bulk_type {
+	SPTLRPC_BULK_DEFAULT	    = 0,    /**< follow rpc flavor */
+	SPTLRPC_BULK_HASH	       = 1,    /**< hash integrity */
+	SPTLRPC_BULK_MAX,
+};
+
+enum sptlrpc_bulk_service {
+	SPTLRPC_BULK_SVC_NULL	   = 0,    /**< no security */
+	SPTLRPC_BULK_SVC_AUTH	   = 1,    /**< authentication only */
+	SPTLRPC_BULK_SVC_INTG	   = 2,    /**< integrity */
+	SPTLRPC_BULK_SVC_PRIV	   = 3,    /**< privacy */
+	SPTLRPC_BULK_SVC_MAX,
+};
+
+/*
+ * compose/extract macros
+ */
+#define FLVR_POLICY_OFFSET	      (0)
+#define FLVR_MECH_OFFSET		(4)
+#define FLVR_SVC_OFFSET		 (8)
+#define FLVR_BULK_TYPE_OFFSET	   (12)
+#define FLVR_BULK_SVC_OFFSET	    (16)
+
+#define MAKE_FLVR(policy, mech, svc, btype, bsvc)		       \
+	(((__u32)(policy) << FLVR_POLICY_OFFSET) |		      \
+	 ((__u32)(mech) << FLVR_MECH_OFFSET) |			  \
+	 ((__u32)(svc) << FLVR_SVC_OFFSET) |			    \
+	 ((__u32)(btype) << FLVR_BULK_TYPE_OFFSET) |		    \
+	 ((__u32)(bsvc) << FLVR_BULK_SVC_OFFSET))
+
+/*
+ * extraction
+ */
+#define SPTLRPC_FLVR_POLICY(flavor)				     \
+	((((__u32)(flavor)) >> FLVR_POLICY_OFFSET) & 0xF)
+#define SPTLRPC_FLVR_MECH(flavor)				       \
+	((((__u32)(flavor)) >> FLVR_MECH_OFFSET) & 0xF)
+#define SPTLRPC_FLVR_SVC(flavor)					\
+	((((__u32)(flavor)) >> FLVR_SVC_OFFSET) & 0xF)
+#define SPTLRPC_FLVR_BULK_TYPE(flavor)				  \
+	((((__u32)(flavor)) >> FLVR_BULK_TYPE_OFFSET) & 0xF)
+#define SPTLRPC_FLVR_BULK_SVC(flavor)				   \
+	((((__u32)(flavor)) >> FLVR_BULK_SVC_OFFSET) & 0xF)
+
+#define SPTLRPC_FLVR_BASE(flavor)				       \
+	((((__u32)(flavor)) >> FLVR_POLICY_OFFSET) & 0xFFF)
+#define SPTLRPC_FLVR_BASE_SUB(flavor)				   \
+	((((__u32)(flavor)) >> FLVR_MECH_OFFSET) & 0xFF)
+
+/*
+ * gss subflavors
+ */
+#define MAKE_BASE_SUBFLVR(mech, svc)				    \
+	((__u32)(mech) |						\
+	 ((__u32)(svc) << (FLVR_SVC_OFFSET - FLVR_MECH_OFFSET)))
+
+#define SPTLRPC_SUBFLVR_KRB5N					   \
+	MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_NULL)
+#define SPTLRPC_SUBFLVR_KRB5A					   \
+	MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_AUTH)
+#define SPTLRPC_SUBFLVR_KRB5I					   \
+	MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_INTG)
+#define SPTLRPC_SUBFLVR_KRB5P					   \
+	MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_PRIV)
+
+/*
+ * "end user" flavors
+ */
+#define SPTLRPC_FLVR_NULL			       \
+	MAKE_FLVR(SPTLRPC_POLICY_NULL,		  \
+		  SPTLRPC_MECH_NULL,		    \
+		  SPTLRPC_SVC_NULL,		     \
+		  SPTLRPC_BULK_DEFAULT,		 \
+		  SPTLRPC_BULK_SVC_NULL)
+#define SPTLRPC_FLVR_PLAIN			      \
+	MAKE_FLVR(SPTLRPC_POLICY_PLAIN,		 \
+		  SPTLRPC_MECH_PLAIN,		   \
+		  SPTLRPC_SVC_NULL,		     \
+		  SPTLRPC_BULK_HASH,		    \
+		  SPTLRPC_BULK_SVC_INTG)
+#define SPTLRPC_FLVR_KRB5N			      \
+	MAKE_FLVR(SPTLRPC_POLICY_GSS,		   \
+		  SPTLRPC_MECH_GSS_KRB5,		\
+		  SPTLRPC_SVC_NULL,		     \
+		  SPTLRPC_BULK_DEFAULT,		 \
+		  SPTLRPC_BULK_SVC_NULL)
+#define SPTLRPC_FLVR_KRB5A			      \
+	MAKE_FLVR(SPTLRPC_POLICY_GSS,		   \
+		  SPTLRPC_MECH_GSS_KRB5,		\
+		  SPTLRPC_SVC_AUTH,		     \
+		  SPTLRPC_BULK_DEFAULT,		 \
+		  SPTLRPC_BULK_SVC_NULL)
+#define SPTLRPC_FLVR_KRB5I			      \
+	MAKE_FLVR(SPTLRPC_POLICY_GSS,		   \
+		  SPTLRPC_MECH_GSS_KRB5,		\
+		  SPTLRPC_SVC_INTG,		     \
+		  SPTLRPC_BULK_DEFAULT,		 \
+		  SPTLRPC_BULK_SVC_INTG)
+#define SPTLRPC_FLVR_KRB5P			      \
+	MAKE_FLVR(SPTLRPC_POLICY_GSS,		   \
+		  SPTLRPC_MECH_GSS_KRB5,		\
+		  SPTLRPC_SVC_PRIV,		     \
+		  SPTLRPC_BULK_DEFAULT,		 \
+		  SPTLRPC_BULK_SVC_PRIV)
+
+#define SPTLRPC_FLVR_DEFAULT	    SPTLRPC_FLVR_NULL
+
+#define SPTLRPC_FLVR_INVALID	    ((__u32) 0xFFFFFFFF)
+#define SPTLRPC_FLVR_ANY		((__u32) 0xFFF00000)
+
+/**
+ * extract the useful part from wire flavor
+ */
+#define WIRE_FLVR(wflvr)		(((__u32) (wflvr)) & 0x000FFFFF)
+
+/** @} flavor */
+
+/* Rebuild \a *flvr from its own fields, with the RPC svc field set to \a svc. */
+static inline void flvr_set_svc(__u32 *flvr, __u32 svc)
+{
+	__u32 old = *flvr;
+
+	LASSERT(svc < SPTLRPC_SVC_MAX);
+	*flvr = MAKE_FLVR(SPTLRPC_FLVR_POLICY(old), SPTLRPC_FLVR_MECH(old), svc,
+			  SPTLRPC_FLVR_BULK_TYPE(old), SPTLRPC_FLVR_BULK_SVC(old));
+}
+
+/* Rebuild \a *flvr from its own fields, with the bulk svc field set to \a svc. */
+static inline void flvr_set_bulk_svc(__u32 *flvr, __u32 svc)
+{
+	__u32 old = *flvr;
+
+	LASSERT(svc < SPTLRPC_BULK_SVC_MAX);
+	*flvr = MAKE_FLVR(SPTLRPC_FLVR_POLICY(old), SPTLRPC_FLVR_MECH(old),
+			  SPTLRPC_FLVR_SVC(old), SPTLRPC_FLVR_BULK_TYPE(old), svc);
+}
+
+struct bulk_spec_hash {
+	__u8    hash_alg;
+};
+
+/**
+ * Full description of flavors being used on a ptlrpc connection, include
+ * both regular RPC and bulk transfer parts.
+ */
+struct sptlrpc_flavor {
+	/**
+	 * wire flavor, should be renamed to sf_wire.
+	 */
+	__u32   sf_rpc;
+	/**
+	 * general flags of PTLRPC_SEC_FL_*
+	 */
+	__u32   sf_flags;
+	/**
+	 * rpc flavor specification
+	 */
+	union {
+		/* nothing for now */
+	} u_rpc;
+	/**
+	 * bulk flavor specification
+	 */
+	union {
+		struct bulk_spec_hash hash;
+	} u_bulk;
+};
+
+/**
+ * Identifies which part of Lustre generated an RPC. It is encoded into
+ * RPC requests and checked by the ptlrpc service.
+ */
+enum lustre_sec_part {
+	LUSTRE_SP_CLI	   = 0,
+	LUSTRE_SP_MDT,
+	LUSTRE_SP_OST,
+	LUSTRE_SP_MGC,
+	LUSTRE_SP_MGS,
+	LUSTRE_SP_ANY	   = 0xFF
+};
+
+const char *sptlrpc_part2name(enum lustre_sec_part sp);
+enum lustre_sec_part sptlrpc_target_sec_part(struct obd_device *obd);
+
+/**
+ * A rule specifies a flavor to be used by a ptlrpc connection between
+ * two Lustre parts.
+ */
+struct sptlrpc_rule {
+	__u32		   sr_netid;   /* LNET network ID */
+	__u8		    sr_from;    /* sec_part */
+	__u8		    sr_to;      /* sec_part */
+	__u16		   sr_padding;
+	struct sptlrpc_flavor   sr_flvr;
+};
+
+/**
+ * A set of rules in memory.
+ *
+ * Rules are generated and stored on MGS, and propagated to MDT, OST,
+ * and client when needed.
+ */
+struct sptlrpc_rule_set {
+	int		     srs_nslot;
+	int		     srs_nrule;
+	struct sptlrpc_rule    *srs_rules;
+};
+
+int sptlrpc_parse_flavor(const char *str, struct sptlrpc_flavor *flvr);
+int sptlrpc_flavor_has_bulk(struct sptlrpc_flavor *flvr);
+
+static inline void sptlrpc_rule_set_init(struct sptlrpc_rule_set *set)
+{
+	memset(set, 0, sizeof(*set));
+}
+
+void sptlrpc_rule_set_free(struct sptlrpc_rule_set *set);
+int  sptlrpc_rule_set_expand(struct sptlrpc_rule_set *set);
+int  sptlrpc_rule_set_merge(struct sptlrpc_rule_set *set,
+			    struct sptlrpc_rule *rule);
+int sptlrpc_rule_set_choose(struct sptlrpc_rule_set *rset,
+			    enum lustre_sec_part from,
+			    enum lustre_sec_part to,
+			    lnet_nid_t nid,
+			    struct sptlrpc_flavor *sf);
+void sptlrpc_rule_set_dump(struct sptlrpc_rule_set *set);
+
+int  sptlrpc_process_config(struct lustre_cfg *lcfg);
+void sptlrpc_conf_log_start(const char *logname);
+void sptlrpc_conf_log_stop(const char *logname);
+void sptlrpc_conf_log_update_begin(const char *logname);
+void sptlrpc_conf_log_update_end(const char *logname);
+void sptlrpc_conf_client_adapt(struct obd_device *obd);
+int  sptlrpc_conf_target_get_rules(struct obd_device *obd,
+				   struct sptlrpc_rule_set *rset,
+				   int initial);
+void sptlrpc_target_choose_flavor(struct sptlrpc_rule_set *rset,
+				  enum lustre_sec_part from,
+				  lnet_nid_t nid,
+				  struct sptlrpc_flavor *flavor);
+
+/* The maximum length of security payload. 1024 is enough for Kerberos 5,
+ * and is expected, though not guaranteed, to be enough for future mechanisms.
+ * Only used by pre-allocated request/reply pool.
+ */
+#define SPTLRPC_MAX_PAYLOAD     (1024)
+
+
+struct vfs_cred {
+	uint32_t	vc_uid;
+	uint32_t	vc_gid;
+};
+
+struct ptlrpc_ctx_ops {
+	/**
+	 * To determine whether it's suitable to use the \a ctx for \a vcred.
+	 */
+	int     (*match)       (struct ptlrpc_cli_ctx *ctx,
+				struct vfs_cred *vcred);
+
+	/**
+	 * To bring the \a ctx uptodate.
+	 */
+	int     (*refresh)     (struct ptlrpc_cli_ctx *ctx);
+
+	/**
+	 * Validate the \a ctx.
+	 */
+	int     (*validate)    (struct ptlrpc_cli_ctx *ctx);
+
+	/**
+	 * Force the \a ctx to die.
+	 */
+	void    (*die)	 (struct ptlrpc_cli_ctx *ctx,
+				int grace);
+	int     (*display)     (struct ptlrpc_cli_ctx *ctx,
+				char *buf, int bufsize);
+
+	/**
+	 * Sign the request message using \a ctx.
+	 *
+	 * \pre req->rq_reqmsg points to request message.
+	 * \pre req->rq_reqlen is the request message length.
+	 * \post req->rq_reqbuf points to request message with signature.
+	 * \post req->rq_reqdata_len is set to the final request message size.
+	 *
+	 * \see null_ctx_sign(), plain_ctx_sign(), gss_cli_ctx_sign().
+	 */
+	int     (*sign)	(struct ptlrpc_cli_ctx *ctx,
+				struct ptlrpc_request *req);
+
+	/**
+	 * Verify the reply message using \a ctx.
+	 *
+	 * \pre req->rq_repdata points to reply message with signature.
+	 * \pre req->rq_repdata_len is the total reply message length.
+	 * \post req->rq_repmsg points to reply message without signature.
+	 * \post req->rq_replen is the reply message length.
+	 *
+	 * \see null_ctx_verify(), plain_ctx_verify(), gss_cli_ctx_verify().
+	 */
+	int     (*verify)      (struct ptlrpc_cli_ctx *ctx,
+				struct ptlrpc_request *req);
+
+	/**
+	 * Encrypt the request message using \a ctx.
+	 *
+	 * \pre req->rq_reqmsg points to request message in clear text.
+	 * \pre req->rq_reqlen is the request message length.
+	 * \post req->rq_reqbuf points to request message.
+	 * \post req->rq_reqdata_len is set to the final request message size.
+	 *
+	 * \see gss_cli_ctx_seal().
+	 */
+	int     (*seal)	(struct ptlrpc_cli_ctx *ctx,
+				struct ptlrpc_request *req);
+
+	/**
+	 * Decrypt the reply message using \a ctx.
+	 *
+	 * \pre req->rq_repdata points to encrypted reply message.
+	 * \pre req->rq_repdata_len is the total cipher text length.
+	 * \post req->rq_repmsg points to reply message in clear text.
+	 * \post req->rq_replen is the reply message length in clear text.
+	 *
+	 * \see gss_cli_ctx_unseal().
+	 */
+	int     (*unseal)      (struct ptlrpc_cli_ctx *ctx,
+				struct ptlrpc_request *req);
+
+	/**
+	 * Wrap bulk request data. This is called before wrapping RPC
+	 * request message.
+	 *
+	 * \pre bulk buffer is described by desc->bd_iov and
+	 * desc->bd_iov_count. note for read it's just buffer, no data
+	 * need to be sent;  for write it contains data in clear text.
+	 * \post when necessary, ptlrpc_bulk_sec_desc was properly prepared
+	 * (usually inside of RPC request message).
+	 * - encryption: cipher text bulk buffer is described by
+	 *   desc->bd_enc_iov and desc->bd_iov_count (currently assume iov
+	 *   count remains the same).
+	 * - otherwise: bulk buffer is still desc->bd_iov and
+	 *   desc->bd_iov_count.
+	 *
+	 * \return 0: success.
+	 * \return -ve: error code.
+	 *
+	 * \see plain_cli_wrap_bulk(), gss_cli_ctx_wrap_bulk().
+	 */
+	int     (*wrap_bulk)   (struct ptlrpc_cli_ctx *ctx,
+				struct ptlrpc_request *req,
+				struct ptlrpc_bulk_desc *desc);
+
+	/**
+	 * Unwrap bulk reply data. This is called after unwrapping RPC
+	 * reply message.
+	 *
+	 * \pre bulk buffer is described by desc->bd_iov/desc->bd_enc_iov and
+	 * desc->bd_iov_count, according to wrap_bulk().
+	 * \post final bulk data in clear text is placed in buffer described
+	 * by desc->bd_iov and desc->bd_iov_count.
+	 * \return +ve nob of actual bulk data in clear text.
+	 * \return -ve error code.
+	 *
+	 * \see plain_cli_unwrap_bulk(), gss_cli_ctx_unwrap_bulk().
+	 */
+	int     (*unwrap_bulk) (struct ptlrpc_cli_ctx *ctx,
+				struct ptlrpc_request *req,
+				struct ptlrpc_bulk_desc *desc);
+};
+
+#define PTLRPC_CTX_NEW_BIT	     (0)  /* newly created */
+#define PTLRPC_CTX_UPTODATE_BIT	(1)  /* uptodate */
+#define PTLRPC_CTX_DEAD_BIT	    (2)  /* mark expired gracefully */
+#define PTLRPC_CTX_ERROR_BIT	   (3)  /* fatal error (refresh, etc.) */
+#define PTLRPC_CTX_CACHED_BIT	  (8)  /* in ctx cache (hash etc.) */
+#define PTLRPC_CTX_ETERNAL_BIT	 (9)  /* always valid */
+
+#define PTLRPC_CTX_NEW		 (1 << PTLRPC_CTX_NEW_BIT)
+#define PTLRPC_CTX_UPTODATE	    (1 << PTLRPC_CTX_UPTODATE_BIT)
+#define PTLRPC_CTX_DEAD		(1 << PTLRPC_CTX_DEAD_BIT)
+#define PTLRPC_CTX_ERROR	       (1 << PTLRPC_CTX_ERROR_BIT)
+#define PTLRPC_CTX_CACHED	      (1 << PTLRPC_CTX_CACHED_BIT)
+#define PTLRPC_CTX_ETERNAL	     (1 << PTLRPC_CTX_ETERNAL_BIT)
+/* NOTE(review): ORs PTLRPC_CTX_NEW_BIT (0), so NEW is not masked; confirm. */
+#define PTLRPC_CTX_STATUS_MASK	 (PTLRPC_CTX_NEW_BIT    |       \
+					PTLRPC_CTX_UPTODATE   |       \
+					PTLRPC_CTX_DEAD       |       \
+					PTLRPC_CTX_ERROR)
+
+struct ptlrpc_cli_ctx {
+	struct hlist_node	cc_cache;      /* linked into ctx cache */
+	atomic_t	    cc_refcount;
+	struct ptlrpc_sec      *cc_sec;
+	struct ptlrpc_ctx_ops  *cc_ops;
+	cfs_time_t	      cc_expire;     /* in seconds */
+	unsigned int	    cc_early_expire:1;
+	unsigned long	   cc_flags;
+	struct vfs_cred	 cc_vcred;
+	spinlock_t		cc_lock;
+	struct list_head	      cc_req_list;   /* waiting reqs linked here */
+	struct list_head	      cc_gc_chain;   /* linked to gc chain */
+};
+
+/**
+ * client side policy operation vector.
+ */
+struct ptlrpc_sec_cops {
+	/**
+	 * Given an \a imp, create and initialize a ptlrpc_sec structure.
+	 * \param ctx service context:
+	 * - regular import: \a ctx should be NULL;
+	 * - reverse import: \a ctx is obtained from incoming request.
+	 * \param flavor specify what flavor to use.
+	 *
+	 * When necessary, policy module is responsible for taking reference
+	 * on the import.
+	 *
+	 * \see null_create_sec(), plain_create_sec(), gss_sec_create_kr().
+	 */
+	struct ptlrpc_sec *     (*create_sec)  (struct obd_import *imp,
+						struct ptlrpc_svc_ctx *ctx,
+						struct sptlrpc_flavor *flavor);
+
+	/**
+	 * Destructor of ptlrpc_sec. When called, refcount has been dropped
+	 * to 0 and all contexts have been destroyed.
+	 *
+	 * \see null_destroy_sec(), plain_destroy_sec(), gss_sec_destroy_kr().
+	 */
+	void		    (*destroy_sec) (struct ptlrpc_sec *sec);
+
+	/**
+	 * Notify that this ptlrpc_sec is going to die. Optionally, policy
+	 * module is supposed to set sec->ps_dying and whatever necessary
+	 * actions.
+	 *
+	 * \see plain_kill_sec(), gss_sec_kill().
+	 */
+	void		    (*kill_sec)    (struct ptlrpc_sec *sec);
+
+	/**
+	 * Given \a vcred, lookup and/or create its context. The policy module
+	 * is supposed to maintain its own context cache.
+	 * XXX currently \a create and \a remove_dead is always 1, perhaps
+	 * should be removed completely.
+	 *
+	 * \see null_lookup_ctx(), plain_lookup_ctx(), gss_sec_lookup_ctx_kr().
+	 */
+	struct ptlrpc_cli_ctx * (*lookup_ctx)  (struct ptlrpc_sec *sec,
+						struct vfs_cred *vcred,
+						int create,
+						int remove_dead);
+
+	/**
+	 * Called when the reference of \a ctx dropped to 0. The policy module
+	 * is supposed to destroy this context or whatever else according to
+	 * its cache maintenance mechanism.
+	 *
+	 * \param sync if zero, we shouldn't wait for the context being
+	 * destroyed completely.
+	 *
+	 * \see plain_release_ctx(), gss_sec_release_ctx_kr().
+	 */
+	void		    (*release_ctx) (struct ptlrpc_sec *sec,
+						struct ptlrpc_cli_ctx *ctx,
+						int sync);
+
+	/**
+	 * Flush the context cache.
+	 *
+	 * \param uid context of which user, -1 means all contexts.
+	 * \param grace if zero, the PTLRPC_CTX_UPTODATE_BIT of affected
+	 * contexts should be cleared immediately.
+	 * \param force if zero, only idle contexts will be flushed.
+	 *
+	 * \see plain_flush_ctx_cache(), gss_sec_flush_ctx_cache_kr().
+	 */
+	int		     (*flush_ctx_cache)
+					       (struct ptlrpc_sec *sec,
+						uid_t uid,
+						int grace,
+						int force);
+
+	/**
+	 * Called periodically by garbage collector to remove dead contexts
+	 * from cache.
+	 *
+	 * \see gss_sec_gc_ctx_kr().
+	 */
+	void		    (*gc_ctx)      (struct ptlrpc_sec *sec);
+
+	/**
+	 * Given a context \a ctx, install a corresponding reverse service
+	 * context on client side.
+	 * XXX currently it's only used by GSS module, maybe we should remove
+	 * this from general API.
+	 */
+	int		     (*install_rctx)(struct obd_import *imp,
+						struct ptlrpc_sec *sec,
+						struct ptlrpc_cli_ctx *ctx);
+
+	/**
+	 * To allocate request buffer for \a req.
+	 *
+	 * \pre req->rq_reqmsg == NULL.
+	 * \pre req->rq_reqbuf == NULL, otherwise it must be pre-allocated,
+	 * we are not supposed to free it.
+	 * \post if success, req->rq_reqmsg points to a buffer with size
+	 * at least \a lustre_msg_size.
+	 *
+	 * \see null_alloc_reqbuf(), plain_alloc_reqbuf(), gss_alloc_reqbuf().
+	 */
+	int		     (*alloc_reqbuf)(struct ptlrpc_sec *sec,
+						struct ptlrpc_request *req,
+						int lustre_msg_size);
+
+	/**
+	 * To free request buffer for \a req.
+	 *
+	 * \pre req->rq_reqbuf != NULL.
+	 *
+	 * \see null_free_reqbuf(), plain_free_reqbuf(), gss_free_reqbuf().
+	 */
+	void		    (*free_reqbuf) (struct ptlrpc_sec *sec,
+						struct ptlrpc_request *req);
+
+	/**
+	 * To allocate reply buffer for \a req.
+	 *
+	 * \pre req->rq_repbuf == NULL.
+	 * \post if success, req->rq_repbuf points to a buffer with size
+	 * req->rq_repbuf_len, the size should be large enough to receive
+	 * reply which be transformed from \a lustre_msg_size of clear text.
+	 *
+	 * \see null_alloc_repbuf(), plain_alloc_repbuf(), gss_alloc_repbuf().
+	 */
+	int		     (*alloc_repbuf)(struct ptlrpc_sec *sec,
+						struct ptlrpc_request *req,
+						int lustre_msg_size);
+
+	/**
+	 * To free reply buffer for \a req.
+	 *
+	 * \pre req->rq_repbuf != NULL.
+	 * \post req->rq_repbuf == NULL.
+	 * \post req->rq_repbuf_len == 0.
+	 *
+	 * \see null_free_repbuf(), plain_free_repbuf(), gss_free_repbuf().
+	 */
+	void		    (*free_repbuf) (struct ptlrpc_sec *sec,
+						struct ptlrpc_request *req);
+
+	/**
+	 * To expand the request buffer of \a req, thus the \a segment in
+	 * the request message pointed by req->rq_reqmsg can accommodate
+	 * at least \a newsize of data.
+	 *
+	 * \pre req->rq_reqmsg->lm_buflens[segment] < newsize.
+	 *
+	 * \see null_enlarge_reqbuf(), plain_enlarge_reqbuf(),
+	 * gss_enlarge_reqbuf().
+	 */
+	int		     (*enlarge_reqbuf)
+					       (struct ptlrpc_sec *sec,
+						struct ptlrpc_request *req,
+						int segment, int newsize);
+	/*
+	 * misc
+	 */
+	int		     (*display)     (struct ptlrpc_sec *sec,
+						struct seq_file *seq);
+};
+
+/**
+ * server side policy operation vector.
+ */
+struct ptlrpc_sec_sops {
+	/**
+	 * Verify an incoming request.
+	 *
+	 * \pre request message is pointed by req->rq_reqbuf, size is
+	 * req->rq_reqdata_len; and the message has been unpacked to
+	 * host byte order.
+	 *
+	 * \retval SECSVC_OK success, req->rq_reqmsg point to request message
+	 * in clear text, size is req->rq_reqlen; req->rq_svc_ctx is set;
+	 * req->rq_sp_from is decoded from request.
+	 * \retval SECSVC_COMPLETE success, the request has been fully
+	 * processed, and reply message has been prepared; req->rq_sp_from is
+	 * decoded from request.
+	 * \retval SECSVC_DROP failed, this request should be dropped.
+	 *
+	 * \see null_accept(), plain_accept(), gss_svc_accept_kr().
+	 */
+	int		     (*accept)      (struct ptlrpc_request *req);
+
+	/**
+	 * Perform security transformation upon reply message.
+	 *
+	 * \pre reply message is pointed by req->rq_reply_state->rs_msg, size
+	 * is req->rq_replen.
+	 * \post req->rs_repdata_len is the final message size.
+	 * \post req->rq_reply_off is set.
+	 *
+	 * \see null_authorize(), plain_authorize(), gss_svc_authorize().
+	 */
+	int		     (*authorize)   (struct ptlrpc_request *req);
+
+	/**
+	 * Invalidate server context \a ctx.
+	 *
+	 * \see gss_svc_invalidate_ctx().
+	 */
+	void		    (*invalidate_ctx)
+					       (struct ptlrpc_svc_ctx *ctx);
+
+	/**
+	 * Allocate a ptlrpc_reply_state.
+	 *
+	 * \param msgsize size of the reply message in clear text.
+	 * \pre if req->rq_reply_state != NULL, then it's pre-allocated, we
+	 * should simply use it; otherwise we are responsible for allocating
+	 * a new one.
+	 * \post req->rq_reply_state != NULL;
+	 * \post req->rq_reply_state->rs_msg != NULL;
+	 *
+	 * \see null_alloc_rs(), plain_alloc_rs(), gss_svc_alloc_rs().
+	 */
+	int		     (*alloc_rs)    (struct ptlrpc_request *req,
+						int msgsize);
+
+	/**
+	 * Free the ptlrpc_reply_state \a rs.
+	 */
+	void		    (*free_rs)     (struct ptlrpc_reply_state *rs);
+
+	/**
+	 * Release the server context \a ctx.
+	 *
+	 * \see gss_svc_free_ctx().
+	 */
+	void		    (*free_ctx)    (struct ptlrpc_svc_ctx *ctx);
+
+	/**
+	 * Install a reverse context based on the server context \a ctx.
+	 *
+	 * \see gss_svc_install_rctx_kr().
+	 */
+	int		     (*install_rctx)(struct obd_import *imp,
+						struct ptlrpc_svc_ctx *ctx);
+
+	/**
+	 * Prepare buffer for incoming bulk write.
+	 *
+	 * \pre desc->bd_iov and desc->bd_iov_count describe the buffer
+	 * intended to receive the write.
+	 *
+	 * \see gss_svc_prep_bulk().
+	 */
+	int		     (*prep_bulk)   (struct ptlrpc_request *req,
+						struct ptlrpc_bulk_desc *desc);
+
+	/**
+	 * Unwrap the bulk write data.
+	 *
+	 * \see plain_svc_unwrap_bulk(), gss_svc_unwrap_bulk().
+	 */
+	int		     (*unwrap_bulk) (struct ptlrpc_request *req,
+						struct ptlrpc_bulk_desc *desc);
+
+	/**
+	 * Wrap the bulk read data.
+	 *
+	 * \see plain_svc_wrap_bulk(), gss_svc_wrap_bulk().
+	 */
+	int		     (*wrap_bulk)   (struct ptlrpc_request *req,
+						struct ptlrpc_bulk_desc *desc);
+};
+
+/**
+ * Descriptor of one security policy: a policy number plus the client-side
+ * and server-side operation vectors.
+ *
+ * sptlrpc_policy_get() and sptlrpc_policy_put() call __module_get() and
+ * module_put() on sp_owner.
+ */
+struct ptlrpc_sec_policy {
+	module_t		   *sp_owner;
+	char			   *sp_name;
+	__u16			   sp_policy; /* policy number */
+	struct ptlrpc_sec_cops	 *sp_cops;   /* client ops */
+	struct ptlrpc_sec_sops	 *sp_sops;   /* server ops */
+};
+
+/*
+ * Security flag bits, one bit each. sec_is_reverse() and sec_is_rootonly()
+ * test the first two against ps_flvr.sf_flags of a ptlrpc_sec.
+ */
+#define PTLRPC_SEC_FL_REVERSE	   0x0001 /* reverse sec */
+#define PTLRPC_SEC_FL_ROOTONLY	  0x0002 /* treat everyone as root */
+#define PTLRPC_SEC_FL_UDESC	     0x0004 /* ship udesc */
+#define PTLRPC_SEC_FL_BULK	      0x0008 /* intensive bulk i/o expected */
+#define PTLRPC_SEC_FL_PAG	       0x0010 /* PAG mode */
+
+/**
+ * The ptlrpc_sec represents the client side ptlrpc security facilities,
+ * each obd_import (both regular and reverse import) must associate with
+ * a ptlrpc_sec.
+ *
+ * \see sptlrpc_import_sec_adapt().
+ */
+struct ptlrpc_sec {
+	/** policy descriptor providing the operation vectors */
+	struct ptlrpc_sec_policy       *ps_policy;
+	atomic_t		    ps_refcount;
+	/** statistic only */
+	atomic_t		    ps_nctx;
+	/** unique identifier */
+	int			     ps_id;
+	/** flavor; its sf_flags carries the PTLRPC_SEC_FL_* bits */
+	struct sptlrpc_flavor	   ps_flvr;
+	enum lustre_sec_part	    ps_part;
+	/** after set, no more new context will be created */
+	unsigned int		    ps_dying:1;
+	/** owning import */
+	struct obd_import	      *ps_import;
+	spinlock_t			ps_lock;
+
+	/*
+	 * garbage collection
+	 */
+	struct list_head		      ps_gc_list;
+	cfs_time_t		      ps_gc_interval; /* in seconds */
+	cfs_time_t		      ps_gc_next;     /* in seconds */
+};
+
+/**
+ * Test the PTLRPC_SEC_FL_REVERSE bit in the flavor flags of \a sec.
+ *
+ * \retval 0 if the bit is clear, the masked (non-zero) flag value otherwise.
+ */
+static inline int sec_is_reverse(struct ptlrpc_sec *sec)
+{
+	const struct sptlrpc_flavor *flvr = &sec->ps_flvr;
+
+	return flvr->sf_flags & PTLRPC_SEC_FL_REVERSE;
+}
+
+/**
+ * Test the PTLRPC_SEC_FL_ROOTONLY bit in the flavor flags of \a sec.
+ *
+ * \retval 0 if the bit is clear, the masked (non-zero) flag value otherwise.
+ */
+static inline int sec_is_rootonly(struct ptlrpc_sec *sec)
+{
+	const struct sptlrpc_flavor *flvr = &sec->ps_flvr;
+
+	return flvr->sf_flags & PTLRPC_SEC_FL_ROOTONLY;
+}
+
+
+/*
+ * Server-side security context. The server operation vector
+ * (struct ptlrpc_sec_sops) receives it in invalidate_ctx(), free_ctx() and
+ * install_rctx().
+ */
+struct ptlrpc_svc_ctx {
+	atomic_t		    sc_refcount;
+	struct ptlrpc_sec_policy       *sc_policy;
+};
+
+/*
+ * user identity descriptor
+ */
+#define LUSTRE_MAX_GROUPS	       (128)
+
+/*
+ * Fixed header followed by a variable number of __u32 group entries.
+ * pud_groups is a zero-length trailing array, so sizeof() covers the header
+ * only; sptlrpc_user_desc_size() adds the space for the group entries.
+ * NOTE(review): presumably pud_ngroups is the number of entries in
+ * pud_groups[] -- confirm against the pack/unpack helpers.
+ */
+struct ptlrpc_user_desc {
+	__u32	   pud_uid;
+	__u32	   pud_gid;
+	__u32	   pud_fsuid;
+	__u32	   pud_fsgid;
+	__u32	   pud_cap;
+	__u32	   pud_ngroups;
+	__u32	   pud_groups[0];
+};
+
+/*
+ * bulk flavors
+ */
+/*
+ * Values are consecutive starting at 0 (BULK_HASH_ALG_NULL), so
+ * BULK_HASH_ALG_MAX equals the number of entries before it (8) and is not
+ * itself an algorithm.
+ */
+enum sptlrpc_bulk_hash_alg {
+	BULK_HASH_ALG_NULL      = 0,
+	BULK_HASH_ALG_ADLER32,
+	BULK_HASH_ALG_CRC32,
+	BULK_HASH_ALG_MD5,
+	BULK_HASH_ALG_SHA1,
+	BULK_HASH_ALG_SHA256,
+	BULK_HASH_ALG_SHA384,
+	BULK_HASH_ALG_SHA512,
+	BULK_HASH_ALG_MAX
+};
+
+const char * sptlrpc_get_hash_name(__u8 hash_alg);
+__u8 sptlrpc_get_hash_alg(const char *algname);
+
+enum {
+	BSD_FL_ERR      = 1,
+};
+
+struct ptlrpc_bulk_sec_desc {
+	__u8	    bsd_version;    /* 0 */
+	__u8	    bsd_type;       /* SPTLRPC_BULK_XXX */
+	__u8	    bsd_svc;	/* SPTLRPC_BULK_SVC_XXXX */
+	__u8	    bsd_flags;      /* flags */
+	__u32	   bsd_nob;	/* nob of bulk data */
+	__u8	    bsd_data[0];    /* policy-specific token */
+};
+
+
+/*
+ * lprocfs
+ */
+struct proc_dir_entry;
+extern struct proc_dir_entry *sptlrpc_proc_root;
+
+/*
+ * Round @size up to the nearest power of 2 (a power of 2 is returned
+ * unchanged), for slab allocation.
+ * @size must be sane: the rounded value has to fit in an int.
+ */
+static inline int size_roundup_power2(int size)
+{
+	int shift;
+
+	size--;
+	/* smear the highest set bit into every lower bit position */
+	for (shift = 1; shift <= 16; shift <<= 1)
+		size |= size >> shift;
+
+	return size + 1;
+}
+
+/*
+ * internal support libraries
+ */
+void _sptlrpc_enlarge_msg_inplace(struct lustre_msg *msg,
+				  int segment, int newsize);
+
+/*
+ * security policies
+ */
+int sptlrpc_register_policy(struct ptlrpc_sec_policy *policy);
+int sptlrpc_unregister_policy(struct ptlrpc_sec_policy *policy);
+
+__u32 sptlrpc_name2flavor_base(const char *name);
+const char *sptlrpc_flavor2name_base(__u32 flvr);
+char *sptlrpc_flavor2name_bulk(struct sptlrpc_flavor *sf,
+			       char *buf, int bufsize);
+char *sptlrpc_flavor2name(struct sptlrpc_flavor *sf, char *buf, int bufsize);
+char *sptlrpc_secflags2str(__u32 flags, char *buf, int bufsize);
+
+/**
+ * Call __module_get() on the module owning \a policy (sp_owner) and hand
+ * the same policy pointer back to the caller.
+ */
+static inline
+struct ptlrpc_sec_policy *sptlrpc_policy_get(struct ptlrpc_sec_policy *policy)
+{
+	module_t *owner = policy->sp_owner;
+
+	__module_get(owner);
+	return policy;
+}
+
+/**
+ * Call module_put() on the module owning \a policy (sp_owner).
+ */
+static inline
+void sptlrpc_policy_put(struct ptlrpc_sec_policy *policy)
+{
+	module_t *owner = policy->sp_owner;
+
+	module_put(owner);
+}
+
+/*
+ * client credential
+ */
+/** Return the bits of ctx->cc_flags selected by PTLRPC_CTX_STATUS_MASK. */
+static inline
+unsigned long cli_ctx_status(struct ptlrpc_cli_ctx *ctx)
+{
+	unsigned long status = ctx->cc_flags & PTLRPC_CTX_STATUS_MASK;
+
+	return status;
+}
+
+/**
+ * Non-zero iff the status bits of \a ctx are exactly PTLRPC_CTX_UPTODATE,
+ * i.e. no other status bit is set alongside it.
+ */
+static inline
+int cli_ctx_is_ready(struct ptlrpc_cli_ctx *ctx)
+{
+	unsigned long status = cli_ctx_status(ctx);
+
+	return status == PTLRPC_CTX_UPTODATE;
+}
+
+/** Non-zero iff at least one status bit of \a ctx is set. */
+static inline
+int cli_ctx_is_refreshed(struct ptlrpc_cli_ctx *ctx)
+{
+	if (cli_ctx_status(ctx))
+		return 1;
+
+	return 0;
+}
+
+/**
+ * Non-zero iff PTLRPC_CTX_UPTODATE is set in ctx->cc_flags, regardless of
+ * any other flag (compare cli_ctx_is_ready(), which requires an exact match
+ * of the status bits).
+ */
+static inline
+int cli_ctx_is_uptodate(struct ptlrpc_cli_ctx *ctx)
+{
+	return !!(ctx->cc_flags & PTLRPC_CTX_UPTODATE);
+}
+
+/** Non-zero iff PTLRPC_CTX_ERROR is set in ctx->cc_flags. */
+static inline
+int cli_ctx_is_error(struct ptlrpc_cli_ctx *ctx)
+{
+	return !!(ctx->cc_flags & PTLRPC_CTX_ERROR);
+}
+
+/**
+ * Non-zero iff either PTLRPC_CTX_DEAD or PTLRPC_CTX_ERROR is set in
+ * ctx->cc_flags; a context in error state counts as dead here.
+ */
+static inline
+int cli_ctx_is_dead(struct ptlrpc_cli_ctx *ctx)
+{
+	if (ctx->cc_flags & (PTLRPC_CTX_DEAD | PTLRPC_CTX_ERROR))
+		return 1;
+
+	return 0;
+}
+
+/** Non-zero iff PTLRPC_CTX_ETERNAL is set in ctx->cc_flags. */
+static inline
+int cli_ctx_is_eternal(struct ptlrpc_cli_ctx *ctx)
+{
+	return !!(ctx->cc_flags & PTLRPC_CTX_ETERNAL);
+}
+
+/*
+ * sec get/put
+ */
+struct ptlrpc_sec *sptlrpc_sec_get(struct ptlrpc_sec *sec);
+void sptlrpc_sec_put(struct ptlrpc_sec *sec);
+
+/*
+ * internal APIs which are only used by policy implementations
+ */
+int  sptlrpc_get_next_secid(void);
+void sptlrpc_sec_destroy(struct ptlrpc_sec *sec);
+
+/*
+ * exported client context api
+ */
+struct ptlrpc_cli_ctx *sptlrpc_cli_ctx_get(struct ptlrpc_cli_ctx *ctx);
+void sptlrpc_cli_ctx_put(struct ptlrpc_cli_ctx *ctx, int sync);
+void sptlrpc_cli_ctx_expire(struct ptlrpc_cli_ctx *ctx);
+void sptlrpc_cli_ctx_wakeup(struct ptlrpc_cli_ctx *ctx);
+int sptlrpc_cli_ctx_display(struct ptlrpc_cli_ctx *ctx, char *buf, int bufsize);
+
+/*
+ * exported client context wrap/buffers
+ */
+int sptlrpc_cli_wrap_request(struct ptlrpc_request *req);
+int sptlrpc_cli_unwrap_reply(struct ptlrpc_request *req);
+int sptlrpc_cli_alloc_reqbuf(struct ptlrpc_request *req, int msgsize);
+void sptlrpc_cli_free_reqbuf(struct ptlrpc_request *req);
+int sptlrpc_cli_alloc_repbuf(struct ptlrpc_request *req, int msgsize);
+void sptlrpc_cli_free_repbuf(struct ptlrpc_request *req);
+int sptlrpc_cli_enlarge_reqbuf(struct ptlrpc_request *req,
+			       int segment, int newsize);
+int  sptlrpc_cli_unwrap_early_reply(struct ptlrpc_request *req,
+				    struct ptlrpc_request **req_ret);
+void sptlrpc_cli_finish_early_reply(struct ptlrpc_request *early_req);
+
+void sptlrpc_request_out_callback(struct ptlrpc_request *req);
+
+/*
+ * exported higher interface of import & request
+ */
+int sptlrpc_import_sec_adapt(struct obd_import *imp,
+			     struct ptlrpc_svc_ctx *ctx,
+			     struct sptlrpc_flavor *flvr);
+struct ptlrpc_sec *sptlrpc_import_sec_ref(struct obd_import *imp);
+void sptlrpc_import_sec_put(struct obd_import *imp);
+
+int  sptlrpc_import_check_ctx(struct obd_import *imp);
+void sptlrpc_import_flush_root_ctx(struct obd_import *imp);
+void sptlrpc_import_flush_my_ctx(struct obd_import *imp);
+void sptlrpc_import_flush_all_ctx(struct obd_import *imp);
+int  sptlrpc_req_get_ctx(struct ptlrpc_request *req);
+void sptlrpc_req_put_ctx(struct ptlrpc_request *req, int sync);
+int  sptlrpc_req_refresh_ctx(struct ptlrpc_request *req, long timeout);
+int  sptlrpc_req_replace_dead_ctx(struct ptlrpc_request *req);
+void sptlrpc_req_set_flavor(struct ptlrpc_request *req, int opcode);
+
+int sptlrpc_parse_rule(char *param, struct sptlrpc_rule *rule);
+
+/* gc */
+void sptlrpc_gc_add_sec(struct ptlrpc_sec *sec);
+void sptlrpc_gc_del_sec(struct ptlrpc_sec *sec);
+void sptlrpc_gc_add_ctx(struct ptlrpc_cli_ctx *ctx);
+
+/* misc */
+const char * sec2target_str(struct ptlrpc_sec *sec);
+int sptlrpc_lprocfs_cliobd_attach(struct obd_device *dev);
+
+/*
+ * server side
+ */
+/* Result codes documented for ptlrpc_sec_sops::accept(). */
+enum secsvc_accept_res {
+	SECSVC_OK       = 0,	/* success, request available in clear text */
+	SECSVC_COMPLETE,	/* success, fully processed, reply prepared */
+	SECSVC_DROP,		/* failure, the request should be dropped */
+};
+
+int  sptlrpc_svc_unwrap_request(struct ptlrpc_request *req);
+int  sptlrpc_svc_alloc_rs(struct ptlrpc_request *req, int msglen);
+int  sptlrpc_svc_wrap_reply(struct ptlrpc_request *req);
+void sptlrpc_svc_free_rs(struct ptlrpc_reply_state *rs);
+void sptlrpc_svc_ctx_addref(struct ptlrpc_request *req);
+void sptlrpc_svc_ctx_decref(struct ptlrpc_request *req);
+void sptlrpc_svc_ctx_invalidate(struct ptlrpc_request *req);
+
+int  sptlrpc_target_export_check(struct obd_export *exp,
+				 struct ptlrpc_request *req);
+void sptlrpc_target_update_exp_flavor(struct obd_device *obd,
+				      struct sptlrpc_rule_set *rset);
+
+/*
+ * reverse context
+ */
+int sptlrpc_svc_install_rvs_ctx(struct obd_import *imp,
+				struct ptlrpc_svc_ctx *ctx);
+int sptlrpc_cli_install_rvs_ctx(struct obd_import *imp,
+				struct ptlrpc_cli_ctx *ctx);
+
+/* bulk security api */
+int sptlrpc_enc_pool_add_user(void);
+int sptlrpc_enc_pool_del_user(void);
+int  sptlrpc_enc_pool_get_pages(struct ptlrpc_bulk_desc *desc);
+void sptlrpc_enc_pool_put_pages(struct ptlrpc_bulk_desc *desc);
+
+int sptlrpc_cli_wrap_bulk(struct ptlrpc_request *req,
+			  struct ptlrpc_bulk_desc *desc);
+int sptlrpc_cli_unwrap_bulk_read(struct ptlrpc_request *req,
+				 struct ptlrpc_bulk_desc *desc,
+				 int nob);
+int sptlrpc_cli_unwrap_bulk_write(struct ptlrpc_request *req,
+				  struct ptlrpc_bulk_desc *desc);
+
+/* bulk helpers (internal use only by policies) */
+int sptlrpc_get_bulk_checksum(struct ptlrpc_bulk_desc *desc, __u8 alg,
+			      void *buf, int buflen);
+
+int bulk_sec_desc_unpack(struct lustre_msg *msg, int offset, int swabbed);
+
+/* user descriptor helpers */
+/**
+ * Size in bytes of a struct ptlrpc_user_desc header followed by \a ngroups
+ * __u32 group entries.
+ */
+static inline int sptlrpc_user_desc_size(int ngroups)
+{
+	return ngroups * sizeof(__u32) + sizeof(struct ptlrpc_user_desc);
+}
+
+int sptlrpc_current_user_desc_size(void);
+int sptlrpc_pack_user_desc(struct lustre_msg *msg, int offset);
+int sptlrpc_unpack_user_desc(struct lustre_msg *req, int offset, int swabbed);
+
+
+#define CFS_CAP_CHOWN_MASK (1 << CFS_CAP_CHOWN)
+#define CFS_CAP_SYS_RESOURCE_MASK (1 << CFS_CAP_SYS_RESOURCE)
+
+enum {
+	LUSTRE_SEC_NONE	 = 0,
+	LUSTRE_SEC_REMOTE       = 1,
+	LUSTRE_SEC_SPECIFY      = 2,
+	LUSTRE_SEC_ALL	  = 3
+};
+
+/** @} sptlrpc */
+
+#endif /* _LUSTRE_SEC_H_ */

diff --git a/drivers/staging/lustre/lustre/include/lustre_update.h b/drivers/staging/lustre/lustre/include/lustre_update.h
new file mode 100644
index 0000000..84defce
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_update.h

@@ -0,0 +1,189 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.htm
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2013, Intel Corporation.
+ */
+/*
+ * lustre/include/lustre_update.h
+ *
+ * Author: Di Wang <di.wang@intel.com>
+ */
+
+#ifndef _LUSTRE_UPDATE_H
+#define _LUSTRE_UPDATE_H
+
+/* Upper bound, in bytes, that update_buf_size() asserts for an update_buf. */
+#define UPDATE_BUFFER_SIZE	8192
+/*
+ * Bookkeeping for one update request and the buffer (ur_buf) holding it.
+ */
+struct update_request {
+	struct dt_device	*ur_dt;
+	struct list_head		ur_list;    /* attached itself to thandle */
+	int			ur_flags;
+	int			ur_rc;	    /* request result */
+	int			ur_batchid; /* Current batch(trans) id */
+	struct update_buf	*ur_buf;   /* Holding the update req */
+};
+
+/**
+ * Total size of \a update: the header up to u_bufs[0] plus all
+ * UPDATE_BUF_COUNT parameter lengths from u_lens[], each term rounded with
+ * cfs_size_round().
+ */
+static inline unsigned long update_size(struct update *update)
+{
+	unsigned long total;
+	int	   idx = 0;
+
+	total = cfs_size_round(offsetof(struct update, u_bufs[0]));
+	while (idx < UPDATE_BUF_COUNT) {
+		total += cfs_size_round(update->u_lens[idx]);
+		idx++;
+	}
+
+	return total;
+}
+
+/**
+ * Locate parameter buffer \a index inside \a update.
+ *
+ * The parameter buffers follow the rounded header (everything up to
+ * u_bufs[0]) back to back; each one occupies cfs_size_round() of its
+ * u_lens[] entry. Every buffer before \a index is asserted to have a
+ * non-zero length.
+ *
+ * \param size if not NULL, receives the unrounded u_lens[index].
+ *
+ * \retval NULL if \a index is outside [0, UPDATE_BUF_COUNT).
+ * \retval pointer to the start of the requested buffer otherwise.
+ */
+static inline void *update_param_buf(struct update *update, int index,
+				     int *size)
+{
+	int	i;
+	char	*ptr;
+
+	/* a negative index would index u_lens[] out of bounds below */
+	if (index < 0 || index >= UPDATE_BUF_COUNT)
+		return NULL;
+
+	ptr = (char *)update + cfs_size_round(offsetof(struct update,
+						       u_bufs[0]));
+	for (i = 0; i < index; i++) {
+		LASSERT(update->u_lens[i] > 0);
+		ptr += cfs_size_round(update->u_lens[i]);
+	}
+
+	if (size != NULL)
+		*size = update->u_lens[index];
+
+	return ptr;
+}
+
+/**
+ * Total size of \a buf: the rounded header up to ub_bufs[0] plus the
+ * update_size() of each of its ub_count updates, which are stored back to
+ * back. The result is asserted not to exceed UPDATE_BUFFER_SIZE.
+ */
+static inline unsigned long update_buf_size(struct update_buf *buf)
+{
+	char	     *base = (char *)buf;
+	unsigned long offset;
+	int	   n;
+
+	offset = cfs_size_round(offsetof(struct update_buf, ub_bufs[0]));
+	/* each update starts where the previous one ended */
+	for (n = 0; n < buf->ub_count; n++)
+		offset += update_size((struct update *)(base + offset));
+
+	LASSERT(offset <= UPDATE_BUFFER_SIZE);
+	return offset;
+}
+
+/**
+ * Locate update number \a index inside \a buf.
+ *
+ * Updates are stored back to back after the rounded header (everything up
+ * to ub_bufs[0]); each one occupies update_size() bytes.
+ *
+ * \param size if not NULL, receives update_size() of the located update.
+ *
+ * \retval NULL if \a index is outside [0, buf->ub_count).
+ * \retval pointer to the requested update otherwise.
+ */
+static inline void *update_buf_get(struct update_buf *buf, int index, int *size)
+{
+	int	count = buf->ub_count;
+	char	*ptr;
+	int	i;
+
+	/* reject negative indices too, not only those past the end */
+	if (index < 0 || index >= count)
+		return NULL;
+
+	ptr = (char *)buf + cfs_size_round(offsetof(struct update_buf,
+						    ub_bufs[0]));
+	for (i = 0; i < index; i++)
+		ptr += update_size((struct update *)ptr);
+
+	if (size != NULL)
+		*size = update_size((struct update *)ptr);
+
+	return ptr;
+}
+
+/**
+ * Initialise the header of \a reply: record \a count as the number of
+ * reply slots and stamp the version as UPDATE_REPLY_V1.
+ */
+static inline void update_init_reply_buf(struct update_reply *reply, int count)
+{
+	reply->ur_count = count;
+	reply->ur_version = UPDATE_REPLY_V1;
+}
+
+/**
+ * Locate reply slot \a index inside \a reply.
+ *
+ * The slots start right after the ur_lens[] array (ur_count entries,
+ * offset rounded with cfs_size_round()) and are stored back to back, each
+ * occupying cfs_size_round() of its ur_lens[] entry. Every slot before
+ * \a index is asserted to have a non-zero length.
+ *
+ * \param size if not NULL, receives the unrounded ur_lens[index].
+ *
+ * \retval NULL if \a index is outside [0, reply->ur_count); \a size is
+ * left untouched in that case.
+ * \retval pointer to the start of the requested slot otherwise.
+ */
+static inline void *update_get_buf_internal(struct update_reply *reply,
+					    int index, int *size)
+{
+	char *ptr;
+	int count = reply->ur_count;
+	int i;
+
+	/* a negative index would index ur_lens[] out of bounds below */
+	if (index < 0 || index >= count)
+		return NULL;
+
+	ptr = (char *)reply + cfs_size_round(offsetof(struct update_reply,
+					     ur_lens[count]));
+	for (i = 0; i < index; i++) {
+		LASSERT(reply->ur_lens[i] > 0);
+		ptr += cfs_size_round(reply->ur_lens[i]);
+	}
+
+	if (size != NULL)
+		*size = reply->ur_lens[index];
+
+	return ptr;
+}
+
+/**
+ * Fill reply slot \a index of \a reply.
+ *
+ * The slot receives \a rc (converted with cpu_to_le32()) followed by
+ * \a data_len bytes copied from \a data; ur_lens[index] is then set to
+ * data_len + sizeof(int). \a data may be NULL only when \a data_len is
+ * not positive. The slot must exist (asserted).
+ */
+static inline void update_insert_reply(struct update_reply *reply, void *data,
+				       int data_len, int index, int rc)
+{
+	char *slot = update_get_buf_internal(reply, index, NULL);
+
+	LASSERT(slot != NULL);
+
+	/* result code first, payload (if any) right behind it */
+	*(int *)slot = cpu_to_le32(rc);
+	if (data_len > 0) {
+		LASSERT(data != NULL);
+		memcpy(slot + sizeof(int), data, data_len);
+	}
+	reply->ur_lens[index] = data_len + sizeof(int);
+}
+
+/**
+ * Fetch the payload of reply slot \a index.
+ *
+ * A slot holds an int result code followed by the payload. The slot must
+ * exist; this is asserted before the slot is read.
+ *
+ * NOTE(review): update_insert_reply() stores the result with cpu_to_le32()
+ * but it is read back here without conversion -- confirm whether the reply
+ * has already been swabbed by the time this is called.
+ *
+ * \param buf receives a pointer to the payload; left untouched when the
+ * stored result code is negative.
+ *
+ * \retval the stored result code if it is negative.
+ * \retval number of payload bytes (slot length minus sizeof(int)) otherwise.
+ */
+static inline int update_get_reply_buf(struct update_reply *reply, void **buf,
+				       int index)
+{
+	char *ptr;
+	int  size = 0;
+	int  result;
+
+	ptr = update_get_buf_internal(reply, index, &size);
+	/* update_get_buf_internal() returns NULL for a bad index */
+	LASSERT(ptr != NULL);
+	result = *(int *)ptr;
+
+	if (result < 0)
+		return result;
+
+	LASSERT(size >= sizeof(int));
+	*buf = ptr + sizeof(int);
+	return size - sizeof(int);
+}
+
+/**
+ * Fetch the result code stored at the start of reply slot \a index.
+ *
+ * The slot must exist and be at least sizeof(int) long (asserted). A slot
+ * of exactly sizeof(int) is valid: update_insert_reply() produces one when
+ * it is given no payload.
+ *
+ * \param buf unused.
+ *
+ * \retval the int stored at the start of the slot, read without byte-order
+ * conversion.
+ */
+static inline int update_get_reply_result(struct update_reply *reply,
+					  void **buf, int index)
+{
+	void *ptr;
+	int  size;
+
+	ptr = update_get_buf_internal(reply, index, &size);
+	/* size is only valid (and only read) when ptr is not NULL */
+	LASSERT(ptr != NULL && size >= sizeof(int));
+	return *(int *)ptr;
+}
+
+#endif

diff --git a/drivers/staging/lustre/lustre/include/lustre_ver.h b/drivers/staging/lustre/lustre/include/lustre_ver.h
new file mode 100644
index 0000000..dc187b8
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_ver.h

@@ -0,0 +1,24 @@
+#ifndef _LUSTRE_VER_H_
+#define _LUSTRE_VER_H_
+/* This file automatically generated from lustre/include/lustre_ver.h.in,
+ * based on parameters in lustre/autoconf/lustre-version.ac.
+ * Changes made directly to this file will be lost. */
+
+#define LUSTRE_MAJOR 2
+#define LUSTRE_MINOR 3
+#define LUSTRE_PATCH 64
+#define LUSTRE_FIX 0
+#define LUSTRE_VERSION_STRING "2.3.64"
+
+#define LUSTRE_VERSION_CODE OBD_OCD_VERSION(LUSTRE_MAJOR,LUSTRE_MINOR,LUSTRE_PATCH,LUSTRE_FIX)
+
+/* liblustre clients are only allowed to connect if their LUSTRE_FIX mismatches
+ * by this amount (set in lustre/autoconf/lustre-version.ac). */
+#define LUSTRE_VERSION_ALLOWED_OFFSET OBD_OCD_VERSION(0, 0, 1, 32)
+
+/* If lustre version of client and servers it connects to differs by more
+ * than this amount, client would issue a warning.
+ * (set in lustre/autoconf/lustre-version.ac) */
+#define LUSTRE_VERSION_OFFSET_WARN OBD_OCD_VERSION(0, 4, 0, 0)
+
+#endif

diff --git a/drivers/staging/lustre/lustre/include/lvfs.h b/drivers/staging/lustre/lustre/include/lvfs.h
new file mode 100644
index 0000000..28f1a6b
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lvfs.h

@@ -0,0 +1,57 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lvfs.h
+ *
+ * lustre VFS/process permission interface
+ */
+
+#ifndef __LVFS_H__
+#define __LVFS_H__
+
+#define LL_FID_NAMELEN (16 + 1 + 8 + 1)
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lvfs.h>
+
+#include <linux/libcfs/lucache.h>
+
+
+/* lvfs_common.c */
+struct dentry *lvfs_fid2dentry(struct lvfs_run_ctxt *, __u64, __u32, __u64 ,void *data);
+
+void push_ctxt(struct lvfs_run_ctxt *save, struct lvfs_run_ctxt *new_ctx,
+	       struct lvfs_ucred *cred);
+void pop_ctxt(struct lvfs_run_ctxt *saved, struct lvfs_run_ctxt *new_ctx,
+	      struct lvfs_ucred *cred);
+#endif

diff --git a/drivers/staging/lustre/lustre/include/md_object.h b/drivers/staging/lustre/lustre/include/md_object.h
new file mode 100644
index 0000000..92d6420
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/md_object.h

@@ -0,0 +1,908 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/md_object.h
+ *
+ * Extension of lu_object.h for metadata objects
+ */
+
+#ifndef _LUSTRE_MD_OBJECT_H
+#define _LUSTRE_MD_OBJECT_H
+
+/** \defgroup md md
+ * Sub-class of lu_object with methods common for "meta-data" objects in MDT
+ * stack.
+ *
+ * Meta-data objects implement namespace operations: you can link, unlink
+ * them, and treat them as directories.
+ *
+ * Examples: mdt, cmm, and mdt are implementations of md interface.
+ * @{
+ */
+
+
+/*
+ * super-class definitions.
+ */
+#include <dt_object.h>
+
+struct md_device;
+struct md_device_operations;
+struct md_object;
+struct obd_export;
+
+enum {
+	UCRED_INVALID   = -1,
+	UCRED_INIT      = 0,
+	UCRED_OLD       = 1,
+	UCRED_NEW       = 2
+};
+
+enum {
+	MD_CAPAINFO_MAX = 5
+};
+
+/** there are at most 5 fids in one operation, see rename, NOTE the last one
+ * is a temporary one used for is_subdir() */
+struct md_capainfo {
+	__u32		   mc_auth;
+	__u32		   mc_padding;
+	struct lu_fid	   mc_fid[MD_CAPAINFO_MAX];
+	struct lustre_capa     *mc_capa[MD_CAPAINFO_MAX];
+};
+
+struct md_quota {
+	struct obd_export       *mq_exp;
+};
+
+/**
+ * Implemented in mdd/mdd_handler.c.
+ *
+ * XXX should be moved into separate .h/.c together with all md security
+ * related definitions.
+ */
+struct md_capainfo *md_capainfo(const struct lu_env *env);
+struct md_quota *md_quota(const struct lu_env *env);
+
+/**
+ * metadata attributes
+ *
+ * Each value is a distinct single bit, so values can be OR-ed together.
+ * NOTE(review): presumably these are the bits carried in md_attr::ma_valid
+ * and md_attr::ma_need -- confirm against the users of struct md_attr.
+ */
+enum ma_valid {
+	MA_INODE     = (1 << 0),
+	MA_LOV       = (1 << 1),
+	MA_COOKIE    = (1 << 2),
+	MA_FLAGS     = (1 << 3),
+	MA_LMV       = (1 << 4),
+	MA_ACL_DEF   = (1 << 5),
+	MA_LOV_DEF   = (1 << 6),
+	MA_LAY_GEN   = (1 << 7),
+	MA_HSM       = (1 << 8),
+	MA_SOM       = (1 << 9),
+	MA_PFID      = (1 << 10)
+};
+
+/*
+ * Lock modes. MDL_EX through MDL_GROUP are distinct powers of two.
+ * MDL_MINMODE is 0, and MDL_MAXMODE, having no initializer, takes the value
+ * MDL_GROUP + 1 (65): it is an upper bound, not another mode bit.
+ */
+typedef enum {
+	MDL_MINMODE  = 0,
+	MDL_EX       = 1,
+	MDL_PW       = 2,
+	MDL_PR       = 4,
+	MDL_CW       = 8,
+	MDL_CR       = 16,
+	MDL_NL       = 32,
+	MDL_GROUP    = 64,
+	MDL_MAXMODE
+} mdl_mode_t;
+
+typedef enum {
+	MDT_NUL_LOCK = 0,
+	MDT_REG_LOCK = (1 << 0),
+	MDT_PDO_LOCK = (1 << 1)
+} mdl_type_t;
+
+/* memory structure for hsm attributes
+ * for fields description see the on disk structure hsm_attrs
+ * which is defined in lustre_idl.h
+ */
+struct md_hsm {
+	__u32	mh_compat;
+	__u32	mh_flags;
+	__u64	mh_arch_id;
+	__u64	mh_arch_ver;
+};
+
+#define IOEPOCH_INVAL 0
+
+/* memory structure for som attributes
+ * for fields description see the on disk structure som_attrs
+ * which is defined in lustre_idl.h
+ */
+struct md_som_data {
+	__u32	msd_compat;
+	__u32	msd_incompat;
+	__u64	msd_ioepoch;
+	__u64	msd_size;
+	__u64	msd_blocks;
+	__u64	msd_mountid;
+};
+
+struct md_attr {
+	__u64		   ma_valid;
+	__u64		   ma_need;
+	__u64		   ma_attr_flags;
+	struct lu_attr	  ma_attr;
+	struct lu_fid	   ma_pfid;
+	struct md_hsm	   ma_hsm;
+	struct lov_mds_md      *ma_lmm;
+	struct lmv_stripe_md   *ma_lmv;
+	void		   *ma_acl;
+	struct llog_cookie     *ma_cookie;
+	struct lustre_capa     *ma_capa;
+	struct md_som_data     *ma_som;
+	int		     ma_lmm_size;
+	int		     ma_lmv_size;
+	int		     ma_acl_size;
+	int		     ma_cookie_size;
+	__u16		   ma_layout_gen;
+};
+
+/** Additional parameters for create */
+struct md_op_spec {
+	union {
+		/** symlink target */
+		const char	       *sp_symname;
+		/** parent FID for cross-ref mkdir */
+		const struct lu_fid      *sp_pfid;
+		/** eadata for regular files */
+		struct md_spec_reg {
+			/** lov objs exist already */
+			const struct lu_fid   *fid;
+			const void *eadata;
+			int  eadatalen;
+		} sp_ea;
+	} u;
+
+	/** Create flag from client: such as MDS_OPEN_CREAT, and others. */
+	__u64      sp_cr_flags;
+
+	/** don't create lov objects or llog cookie - this replay */
+	unsigned int no_create:1,
+		     sp_cr_lookup:1, /* do lookup sanity check or not. */
+		     sp_rm_entry:1;  /* only remove name entry */
+
+	/** Current lock mode for parent dir where create is performing. */
+	mdl_mode_t sp_cr_mode;
+
+	/** to create directory */
+	const struct dt_index_features *sp_feat;
+};
+
+/**
+ * Operations implemented for each md object (both directory and leaf).
+ */
+struct md_object_operations {
+	int (*moo_permission)(const struct lu_env *env,
+			      struct md_object *pobj, struct md_object *cobj,
+			      struct md_attr *attr, int mask);
+
+	int (*moo_attr_get)(const struct lu_env *env, struct md_object *obj,
+			    struct md_attr *attr);
+
+	int (*moo_attr_set)(const struct lu_env *env, struct md_object *obj,
+			    const struct md_attr *attr);
+
+	int (*moo_xattr_get)(const struct lu_env *env, struct md_object *obj,
+			     struct lu_buf *buf, const char *name);
+
+	int (*moo_xattr_list)(const struct lu_env *env, struct md_object *obj,
+			      struct lu_buf *buf);
+
+	int (*moo_xattr_set)(const struct lu_env *env, struct md_object *obj,
+			     const struct lu_buf *buf, const char *name,
+			     int fl);
+
+	int (*moo_xattr_del)(const struct lu_env *env, struct md_object *obj,
+			     const char *name);
+
+	/** This method is used to swap the layouts between 2 objects */
+	int (*moo_swap_layouts)(const struct lu_env *env,
+			       struct md_object *obj1, struct md_object *obj2,
+			       __u64 flags);
+
+	/** \retval number of bytes actually read upon success */
+	int (*moo_readpage)(const struct lu_env *env, struct md_object *obj,
+			    const struct lu_rdpg *rdpg);
+
+	int (*moo_readlink)(const struct lu_env *env, struct md_object *obj,
+			    struct lu_buf *buf);
+	int (*moo_changelog)(const struct lu_env *env,
+			     enum changelog_rec_type type, int flags,
+			     struct md_object *obj);
+	/** part of cross-ref operation */
+	int (*moo_object_create)(const struct lu_env *env,
+				 struct md_object *obj,
+				 const struct md_op_spec *spec,
+				 struct md_attr *ma);
+
+	int (*moo_ref_add)(const struct lu_env *env,
+			   struct md_object *obj,
+			   const struct md_attr *ma);
+
+	int (*moo_ref_del)(const struct lu_env *env,
+			   struct md_object *obj,
+			   struct md_attr *ma);
+
+	int (*moo_open)(const struct lu_env *env,
+			struct md_object *obj, int flag);
+
+	int (*moo_close)(const struct lu_env *env, struct md_object *obj,
+			 struct md_attr *ma, int mode);
+
+	int (*moo_capa_get)(const struct lu_env *, struct md_object *,
+			    struct lustre_capa *, int renewal);
+
+	int (*moo_object_sync)(const struct lu_env *, struct md_object *);
+
+	int (*moo_file_lock)(const struct lu_env *env, struct md_object *obj,
+			     struct lov_mds_md *lmm, struct ldlm_extent *extent,
+			     struct lustre_handle *lockh);
+	int (*moo_file_unlock)(const struct lu_env *env, struct md_object *obj,
+			       struct lov_mds_md *lmm,
+			       struct lustre_handle *lockh);
+	int (*moo_object_lock)(const struct lu_env *env, struct md_object *obj,
+			       struct lustre_handle *lh,
+			       struct ldlm_enqueue_info *einfo,
+			       void *policy);
+};
+
+/**
+ * Operations implemented for each directory object.
+ */
+struct md_dir_operations {
+	int (*mdo_is_subdir) (const struct lu_env *env, struct md_object *obj,
+			      const struct lu_fid *fid, struct lu_fid *sfid);
+
+	int (*mdo_lookup)(const struct lu_env *env, struct md_object *obj,
+			  const struct lu_name *lname, struct lu_fid *fid,
+			  struct md_op_spec *spec);
+
+	mdl_mode_t (*mdo_lock_mode)(const struct lu_env *env,
+				    struct md_object *obj,
+				    mdl_mode_t mode);
+
+	int (*mdo_create)(const struct lu_env *env, struct md_object *pobj,
+			  const struct lu_name *lname, struct md_object *child,
+			  struct md_op_spec *spec,
+			  struct md_attr *ma);
+
+	/** This method is used to create the data object for this meta object */
+	int (*mdo_create_data)(const struct lu_env *env, struct md_object *p,
+			       struct md_object *o,
+			       const struct md_op_spec *spec,
+			       struct md_attr *ma);
+
+	int (*mdo_rename)(const struct lu_env *env, struct md_object *spobj,
+			  struct md_object *tpobj, const struct lu_fid *lf,
+			  const struct lu_name *lsname, struct md_object *tobj,
+			  const struct lu_name *ltname, struct md_attr *ma);
+
+	int (*mdo_link)(const struct lu_env *env, struct md_object *tgt_obj,
+			struct md_object *src_obj, const struct lu_name *lname,
+			struct md_attr *ma);
+
+	int (*mdo_unlink)(const struct lu_env *env, struct md_object *pobj,
+			  struct md_object *cobj, const struct lu_name *lname,
+			  struct md_attr *ma, int no_name);
+
+	/** This method is used to compare a requested layout to an existing
+	 * layout (struct lov_mds_md_v1/3 vs struct lov_mds_md_v1/3) */
+	int (*mdo_lum_lmm_cmp)(const struct lu_env *env,
+			       struct md_object *cobj,
+			       const struct md_op_spec *spec,
+			       struct md_attr *ma);
+
+	/** partial ops for cross-ref case */
+	int (*mdo_name_insert)(const struct lu_env *env,
+			       struct md_object *obj,
+			       const struct lu_name *lname,
+			       const struct lu_fid *fid,
+			       const struct md_attr *ma);
+
+	int (*mdo_name_remove)(const struct lu_env *env,
+			       struct md_object *obj,
+			       const struct lu_name *lname,
+			       const struct md_attr *ma);
+
+	int (*mdo_rename_tgt)(const struct lu_env *env, struct md_object *pobj,
+			      struct md_object *tobj, const struct lu_fid *fid,
+			      const struct lu_name *lname, struct md_attr *ma);
+};
+
+struct md_device_operations {
+	/** meta-data device related handlers. */
+	int (*mdo_root_get)(const struct lu_env *env, struct md_device *m,
+			    struct lu_fid *f);
+
+	int (*mdo_maxsize_get)(const struct lu_env *env, struct md_device *m,
+			       int *md_size, int *cookie_size);
+
+	int (*mdo_statfs)(const struct lu_env *env, struct md_device *m,
+			  struct obd_statfs *sfs);
+
+	int (*mdo_init_capa_ctxt)(const struct lu_env *env, struct md_device *m,
+				  int mode, unsigned long timeout, __u32 alg,
+				  struct lustre_capa_key *keys);
+
+	int (*mdo_update_capa_key)(const struct lu_env *env,
+				   struct md_device *m,
+				   struct lustre_capa_key *key);
+
+	int (*mdo_llog_ctxt_get)(const struct lu_env *env,
+				 struct md_device *m, int idx, void **h);
+
+	int (*mdo_iocontrol)(const struct lu_env *env, struct md_device *m,
+			     unsigned int cmd, int len, void *data);
+};
+
+enum md_upcall_event {	/* event codes handed to md_do_upcall() as 'ev' */
+	/** Sync the md layer */
+	MD_LOV_SYNC = (1 << 0),
+	/** Just for split; no transaction needed; for replay */
+	MD_NO_TRANS = (1 << 1),
+	MD_LOV_CONFIG = (1 << 2),
+	/** Trigger quota recovery */
+	MD_LOV_QUOTA = (1 << 3)
+};
+
+struct md_upcall {
+	/** protects the upcall against removal while it is in use: read lock
+	 * to use it, write lock in md_upcall_dev_set()/md_upcall_fini() */
+	struct rw_semaphore	mu_upcall_sem;
+	/** device to call, normally the upper layer */
+	struct md_device       *mu_upcall_dev;
+	/** upcall function */
+	int (*mu_upcall)(const struct lu_env *env, struct md_device *md,
+			 enum md_upcall_event ev, void *data);
+};
+
+struct md_device {
+	struct lu_device		   md_lu_dev;	/* embedded; see lu2md_dev()/md2lu_dev() */
+	const struct md_device_operations *md_ops;
+	struct md_upcall		   md_upcall;	/* upcall state; see md_do_upcall() */
+};
+
+/**
+ * Set up the upcall state of \a m: initialise its semaphore, clear the
+ * upcall device and install \a upcl as the upcall function.
+ */
+static inline void md_upcall_init(struct md_device *m, void *upcl)
+{
+	struct md_upcall *uc = &m->md_upcall;
+
+	init_rwsem(&uc->mu_upcall_sem);
+	uc->mu_upcall_dev = NULL;
+	uc->mu_upcall = upcl;
+}
+
+/**
+ * Make \a up the device that upcalls issued on \a m are delivered to.
+ * The update is done under the write side of m's upcall semaphore.
+ */
+static inline void md_upcall_dev_set(struct md_device *m, struct md_device *up)
+{
+	struct md_upcall *uc = &m->md_upcall;
+
+	down_write(&uc->mu_upcall_sem);
+	uc->mu_upcall_dev = up;
+	up_write(&uc->mu_upcall_sem);
+}
+
+/**
+ * Tear down the upcall state of \a m: clear both the upcall device and the
+ * upcall function under the write side of the upcall semaphore.
+ */
+static inline void md_upcall_fini(struct md_device *m)
+{
+	struct md_upcall *uc = &m->md_upcall;
+
+	down_write(&uc->mu_upcall_sem);
+	uc->mu_upcall_dev = NULL;
+	uc->mu_upcall = NULL;
+	up_write(&uc->mu_upcall_sem);
+}
+
+/**
+ * Deliver event \a ev with payload \a data to the upcall device registered
+ * on \a m, holding the read side of m's upcall semaphore for the duration.
+ *
+ * \retval 0 when no upcall device is registered or that device has no
+ *	     upcall function
+ * \retval otherwise the value returned by the upcall function
+ */
+static inline int md_do_upcall(const struct lu_env *env, struct md_device *m,
+				enum md_upcall_event ev, void *data)
+{
+	struct md_upcall *uc = &m->md_upcall;
+	int rc = 0;
+
+	down_read(&uc->mu_upcall_sem);
+	if (uc->mu_upcall_dev != NULL &&
+	    uc->mu_upcall_dev->md_upcall.mu_upcall != NULL)
+		rc = uc->mu_upcall_dev->md_upcall.mu_upcall(env,
+						uc->mu_upcall_dev, ev, data);
+	up_read(&uc->mu_upcall_sem);
+
+	return rc;
+}
+
+struct md_object {
+	struct lu_object		   mo_lu;	/* embedded; see lu2md() */
+	const struct md_object_operations *mo_ops;	/* used by the mo_*() wrappers */
+	const struct md_dir_operations    *mo_dir_ops;	/* used by the mdo_*() wrappers */
+};
+
+/**
+ * seq-server site.
+ */
+struct seq_server_site {
+	struct lu_site	     *ss_lu;
+	/**
+	 * mds number of this site.
+	 */
+	mdsno_t	       ss_node_id;
+	/**
+	 * Fid location database
+	 */
+	struct lu_server_fld *ss_server_fld;
+	struct lu_client_fld *ss_client_fld;
+
+	/**
+	 * Server Seq Manager
+	 */
+	struct lu_server_seq *ss_server_seq;
+
+	/**
+	 * Controller Seq Manager
+	 */
+	struct lu_server_seq *ss_control_seq;
+	struct obd_export    *ss_control_exp;
+
+	/**
+	 * Client Seq Manager
+	 */
+	struct lu_client_seq *ss_client_seq;
+};
+
+static inline struct md_device *lu2md_dev(const struct lu_device *d)
+{	/* map a generic device to the md_device that embeds it */
+	LASSERT(IS_ERR(d) || lu_device_is_md(d));	/* ERR_PTR tolerated */
+	return container_of0(d, struct md_device, md_lu_dev);
+}
+
+/** Return the generic lu_device embedded in \a d. */
+static inline struct lu_device *md2lu_dev(struct md_device *d)
+{
+	struct lu_device *lu = &d->md_lu_dev;
+
+	return lu;
+}
+
+static inline struct md_object *lu2md(const struct lu_object *o)
+{	/* NULL and ERR_PTR values of o pass the assertion below */
+	LASSERT(o == NULL || IS_ERR(o) || lu_device_is_md(o->lo_dev));
+	return container_of0(o, struct md_object, mo_lu);
+}
+
+/**
+ * Return the md_object for lu_object_next() of \a obj's embedded lu_object,
+ * or NULL when \a obj is NULL.
+ */
+static inline struct md_object *md_object_next(const struct md_object *obj)
+{
+	if (!obj)
+		return NULL;
+
+	return lu2md(lu_object_next(&obj->mo_lu));
+}
+
+/**
+ * Return the md_device that object \a o belongs to.
+ *
+ * \a o is dereferenced unconditionally to reach its device, so it must be a
+ * real object: NULL and ERR_PTR values are rejected by the assertion rather
+ * than being allowed through to the dereference.
+ */
+static inline struct md_device *md_obj2dev(const struct md_object *o)
+{
+	LASSERT(o != NULL && !IS_ERR(o) && lu_device_is_md(o->mo_lu.lo_dev));
+	return container_of0(o->mo_lu.lo_dev, struct md_device, md_lu_dev);
+}
+
+/** Return the seq-server site recorded in \a s (its ld_seq_site field). */
+static inline struct seq_server_site *lu_site2seq(const struct lu_site *s)
+{
+	struct seq_server_site *ss = s->ld_seq_site;
+
+	return ss;
+}
+
+/**
+ * Initialise the lu_device embedded in \a md with device type \a t.
+ *
+ * \retval the value returned by lu_device_init()
+ */
+static inline int md_device_init(struct md_device *md, struct lu_device_type *t)
+{
+	int rc = lu_device_init(&md->md_lu_dev, t);
+
+	return rc;
+}
+
+/** Finalise the lu_device embedded in \a md via lu_device_fini(). */
+static inline void md_device_fini(struct md_device *md)
+{
+	struct lu_device *lu = &md->md_lu_dev;
+
+	lu_device_fini(lu);
+}
+
+/**
+ * Look up the object slice for fid \a f on device \a md with
+ * lu_object_find_slice() and return it converted by lu2md().
+ */
+static inline struct md_object *md_object_find_slice(const struct lu_env *env,
+						     struct md_device *md,
+						     const struct lu_fid *f)
+{
+	const struct lu_object *slice;
+
+	slice = lu_object_find_slice(env, md2lu_dev(md), f, NULL);
+	return lu2md(slice);
+}
+
+
+/** md operations */
+/**
+ * Call the moo_permission method of child \a c, which must be set
+ * (asserted), passing parent \a p, \a at and \a mask through.
+ */
+static inline int mo_permission(const struct lu_env *env, struct md_object *p,
+				struct md_object *c, struct md_attr *at,
+				int mask)
+{
+	LASSERT(c->mo_ops->moo_permission);
+	return c->mo_ops->moo_permission(env, p, c, at, mask);
+}
+
+/** Call the moo_attr_get method of \a m, which must be set (asserted). */
+static inline int mo_attr_get(const struct lu_env *env, struct md_object *m,
+			      struct md_attr *at)
+{
+	LASSERT(m->mo_ops->moo_attr_get);
+	return m->mo_ops->moo_attr_get(env, m, at);
+}
+
+/** Call the moo_readlink method of \a m, which must be set (asserted). */
+static inline int mo_readlink(const struct lu_env *env, struct md_object *m,
+			      struct lu_buf *buf)
+{
+	LASSERT(m->mo_ops->moo_readlink);
+	return m->mo_ops->moo_readlink(env, m, buf);
+}
+
+/** Call the moo_changelog method of \a m, which must be set (asserted). */
+static inline int mo_changelog(const struct lu_env *env,
+			       enum changelog_rec_type type, int flags,
+			       struct md_object *m)
+{
+	LASSERT(m->mo_ops->moo_changelog);
+	return m->mo_ops->moo_changelog(env, type, flags, m);
+}
+
+/** Call the moo_attr_set method of \a m, which must be set (asserted). */
+static inline int mo_attr_set(const struct lu_env *env, struct md_object *m,
+			      const struct md_attr *at)
+{
+	LASSERT(m->mo_ops->moo_attr_set);
+	return m->mo_ops->moo_attr_set(env, m, at);
+}
+
+/** Call the moo_xattr_get method of \a m, which must be set (asserted). */
+static inline int mo_xattr_get(const struct lu_env *env, struct md_object *m,
+			       struct lu_buf *buf, const char *name)
+{
+	LASSERT(m->mo_ops->moo_xattr_get);
+	return m->mo_ops->moo_xattr_get(env, m, buf, name);
+}
+
+/** Call the moo_xattr_del method of \a m, which must be set (asserted). */
+static inline int mo_xattr_del(const struct lu_env *env, struct md_object *m,
+			       const char *name)
+{
+	LASSERT(m->mo_ops->moo_xattr_del);
+	return m->mo_ops->moo_xattr_del(env, m, name);
+}
+
+/** Call the moo_xattr_set method of \a m, which must be set (asserted). */
+static inline int mo_xattr_set(const struct lu_env *env, struct md_object *m,
+			       const struct lu_buf *buf, const char *name,
+			       int flags)
+{
+	LASSERT(m->mo_ops->moo_xattr_set);
+	return m->mo_ops->moo_xattr_set(env, m, buf, name, flags);
+}
+
+/** Call the moo_xattr_list method of \a m, which must be set (asserted). */
+static inline int mo_xattr_list(const struct lu_env *env, struct md_object *m,
+				struct lu_buf *buf)
+{
+	LASSERT(m->mo_ops->moo_xattr_list);
+	return m->mo_ops->moo_xattr_list(env, m, buf);
+}
+
+/**
+ * Call the moo_swap_layouts method of \a o1 on the pair (\a o1, \a o2).
+ * Both objects must provide the method (asserted).
+ *
+ * \retval -EPERM when the two objects do not use the same
+ *		  moo_swap_layouts implementation
+ * \retval otherwise the value returned by the method
+ */
+static inline int mo_swap_layouts(const struct lu_env *env,
+				  struct md_object *o1, struct md_object *o2,
+				  __u64 flags)
+{
+	LASSERT(o1->mo_ops->moo_swap_layouts);
+	LASSERT(o2->mo_ops->moo_swap_layouts);
+
+	if (o1->mo_ops->moo_swap_layouts != o2->mo_ops->moo_swap_layouts)
+		return -EPERM;
+
+	return o1->mo_ops->moo_swap_layouts(env, o1, o2, flags);
+}
+
+/** Call the moo_open method of \a m, which must be set (asserted). */
+static inline int mo_open(const struct lu_env *env, struct md_object *m,
+			  int flags)
+{
+	LASSERT(m->mo_ops->moo_open);
+	return m->mo_ops->moo_open(env, m, flags);
+}
+
+/** Call the moo_close method of \a m, which must be set (asserted). */
+static inline int mo_close(const struct lu_env *env, struct md_object *m,
+			   struct md_attr *ma, int mode)
+{
+	LASSERT(m->mo_ops->moo_close);
+	return m->mo_ops->moo_close(env, m, ma, mode);
+}
+
+/** Call the moo_readpage method of \a m, which must be set (asserted). */
+static inline int mo_readpage(const struct lu_env *env, struct md_object *m,
+			      const struct lu_rdpg *rdpg)
+{
+	LASSERT(m->mo_ops->moo_readpage);
+	return m->mo_ops->moo_readpage(env, m, rdpg);
+}
+
+static inline int mo_object_create(const struct lu_env *env,
+				   struct md_object *m,
+				   const struct md_op_spec *spc,
+				   struct md_attr *at)
+{
+	LASSERT(m->mo_ops->moo_object_create);	/* mandatory method */
+	return m->mo_ops->moo_object_create(env, m, spc, at);
+}
+
+/** Call the moo_ref_add method of \a m, which must be set (asserted). */
+static inline int mo_ref_add(const struct lu_env *env, struct md_object *m,
+			     const struct md_attr *ma)
+{
+	LASSERT(m->mo_ops->moo_ref_add);
+	return m->mo_ops->moo_ref_add(env, m, ma);
+}
+
+/** Call the moo_ref_del method of \a m, which must be set (asserted). */
+static inline int mo_ref_del(const struct lu_env *env, struct md_object *m,
+			     struct md_attr *ma)
+{
+	LASSERT(m->mo_ops->moo_ref_del);
+	return m->mo_ops->moo_ref_del(env, m, ma);
+}
+
+/** Call the moo_capa_get method of \a m, which must be set (asserted). */
+static inline int mo_capa_get(const struct lu_env *env, struct md_object *m,
+			      struct lustre_capa *c, int renewal)
+{
+	LASSERT(m->mo_ops->moo_capa_get);
+	return m->mo_ops->moo_capa_get(env, m, c, renewal);
+}
+
+static inline int mo_object_sync(const struct lu_env *env, struct md_object *m)
+{
+	LASSERT(m->mo_ops->moo_object_sync);	/* mandatory method */
+	return m->mo_ops->moo_object_sync(env, m);
+}
+
+static inline int mo_file_lock(const struct lu_env *env, struct md_object *m,
+			       struct lov_mds_md *lmm,
+			       struct ldlm_extent *extent,
+			       struct lustre_handle *lockh)
+{
+	LASSERT(m->mo_ops->moo_file_lock);	/* mandatory method */
+	return m->mo_ops->moo_file_lock(env, m, lmm, extent, lockh);
+}
+
+static inline int mo_file_unlock(const struct lu_env *env, struct md_object *m,
+				 struct lov_mds_md *lmm,
+				 struct lustre_handle *lockh)
+{
+	LASSERT(m->mo_ops->moo_file_unlock);	/* mandatory method */
+	return m->mo_ops->moo_file_unlock(env, m, lmm, lockh);
+}
+
+/** Call the moo_object_lock method of \a m, which must be set (asserted). */
+static inline int mo_object_lock(const struct lu_env *env, struct md_object *m,
+				 struct lustre_handle *lh,
+				 struct ldlm_enqueue_info *einfo, void *policy)
+{
+	LASSERT(m->mo_ops->moo_object_lock);
+	return m->mo_ops->moo_object_lock(env, m, lh, einfo, policy);
+}
+
+/** Call the mdo_lookup method of \a p, which must be set (asserted). */
+static inline int mdo_lookup(const struct lu_env *env, struct md_object *p,
+			     const struct lu_name *lname, struct lu_fid *f,
+			     struct md_op_spec *spec)
+{
+	LASSERT(p->mo_dir_ops->mdo_lookup);
+	return p->mo_dir_ops->mdo_lookup(env, p, lname, f, spec);
+}
+
+/**
+ * Ask \a mo for its lock mode through the optional mdo_lock_mode method.
+ *
+ * \retval MDL_MINMODE when \a mo does not provide mdo_lock_mode
+ * \retval otherwise the value returned by the method
+ */
+static inline mdl_mode_t mdo_lock_mode(const struct lu_env *env,
+				       struct md_object *mo, mdl_mode_t lm)
+{
+	if (mo->mo_dir_ops->mdo_lock_mode != NULL)
+		return mo->mo_dir_ops->mdo_lock_mode(env, mo, lm);
+
+	return MDL_MINMODE;
+}
+
+/** Call the mdo_create method of parent \a p, which must be set (asserted). */
+static inline int mdo_create(const struct lu_env *env, struct md_object *p,
+			     const struct lu_name *lchild_name,
+			     struct md_object *c, struct md_op_spec *spc,
+			     struct md_attr *at)
+{
+	LASSERT(p->mo_dir_ops->mdo_create);
+	return p->mo_dir_ops->mdo_create(env, p, lchild_name, c, spc, at);
+}
+
+/**
+ * Call the mdo_create_data method of child \a c (not of parent \a p); the
+ * method must be set (asserted).
+ */
+static inline int mdo_create_data(const struct lu_env *env,
+				  struct md_object *p, struct md_object *c,
+				  const struct md_op_spec *spec,
+				  struct md_attr *ma)
+{
+	LASSERT(c->mo_dir_ops->mdo_create_data);
+	return c->mo_dir_ops->mdo_create_data(env, p, c, spec, ma);
+}
+
+/**
+ * Call the mdo_rename method of the target parent \a tp, which must be set
+ * (asserted).
+ */
+static inline int mdo_rename(const struct lu_env *env, struct md_object *sp,
+			     struct md_object *tp, const struct lu_fid *lf,
+			     const struct lu_name *lsname, struct md_object *t,
+			     const struct lu_name *ltname, struct md_attr *ma)
+{
+	LASSERT(tp->mo_dir_ops->mdo_rename);
+	return tp->mo_dir_ops->mdo_rename(env, sp, tp, lf, lsname, t, ltname,
+					  ma);
+}
+
+/** Call the mdo_is_subdir method of \a mo, which must be set (asserted). */
+static inline int mdo_is_subdir(const struct lu_env *env, struct md_object *mo,
+				const struct lu_fid *fid, struct lu_fid *sfid)
+{
+	LASSERT(mo->mo_dir_ops->mdo_is_subdir);
+	return mo->mo_dir_ops->mdo_is_subdir(env, mo, fid, sfid);
+}
+
+/**
+ * Call the mdo_link method of \a s (not of \a p); the method must be set
+ * (asserted).
+ */
+static inline int mdo_link(const struct lu_env *env, struct md_object *p,
+			   struct md_object *s, const struct lu_name *lname,
+			   struct md_attr *ma)
+{
+	LASSERT(s->mo_dir_ops->mdo_link);
+	return s->mo_dir_ops->mdo_link(env, p, s, lname, ma);
+}
+
+/** Call the mdo_unlink method of parent \a p, which must be set (asserted). */
+static inline int mdo_unlink(const struct lu_env *env, struct md_object *p,
+			     struct md_object *c, const struct lu_name *lname,
+			     struct md_attr *ma, int no_name)
+{
+	LASSERT(p->mo_dir_ops->mdo_unlink);
+	return p->mo_dir_ops->mdo_unlink(env, p, c, lname, ma, no_name);
+}
+
+static inline int mdo_lum_lmm_cmp(const struct lu_env *env,
+				  struct md_object *c,
+				  const struct md_op_spec *spec,
+				  struct md_attr *ma)
+{
+	LASSERT(c->mo_dir_ops->mdo_lum_lmm_cmp);	/* mandatory method */
+	return c->mo_dir_ops->mdo_lum_lmm_cmp(env, c, spec, ma);
+}
+
+static inline int mdo_name_insert(const struct lu_env *env,
+				  struct md_object *p,
+				  const struct lu_name *lname,
+				  const struct lu_fid *f,
+				  const struct md_attr *ma)
+{
+	LASSERT(p->mo_dir_ops->mdo_name_insert);	/* mandatory method */
+	return p->mo_dir_ops->mdo_name_insert(env, p, lname, f, ma);
+}
+
+static inline int mdo_name_remove(const struct lu_env *env,
+				  struct md_object *p,
+				  const struct lu_name *lname,
+				  const struct md_attr *ma)
+{
+	LASSERT(p->mo_dir_ops->mdo_name_remove);	/* mandatory method */
+	return p->mo_dir_ops->mdo_name_remove(env, p, lname, ma);
+}
+
+/**
+ * Call mdo_rename_tgt on \a t when a target object is given, otherwise on
+ * the parent \a p. The method of whichever object is used must be set
+ * (asserted); the arguments passed are the same in both cases.
+ */
+static inline int mdo_rename_tgt(const struct lu_env *env, struct md_object *p,
+				 struct md_object *t, const struct lu_fid *lf,
+				 const struct lu_name *lname,
+				 struct md_attr *ma)
+{
+	if (!t) {
+		LASSERT(p->mo_dir_ops->mdo_rename_tgt);
+		return p->mo_dir_ops->mdo_rename_tgt(env, p, t, lf, lname, ma);
+	}
+
+	LASSERT(t->mo_dir_ops->mdo_rename_tgt);
+	return t->mo_dir_ops->mdo_rename_tgt(env, p, t, lf, lname, ma);
+}
+
+/**
+ * Used in MDD/OUT layer for object lock rule
+ **/
+enum mdd_object_role {
+	MOR_SRC_PARENT,
+	MOR_SRC_CHILD,
+	MOR_TGT_PARENT,
+	MOR_TGT_CHILD,
+	MOR_TGT_ORPHAN
+};
+
+struct dt_device;
+/**
+ * Structure to hold object information. This is used to create object
+ * \pre llod_dir exist
+ */
+struct lu_local_obj_desc {
+	const char		      *llod_dir;
+	const char		      *llod_name;
+	__u32			    llod_oid;
+	int			      llod_is_index;
+	const struct dt_index_features  *llod_feat;
+	struct list_head		       llod_linkage;
+};
+
+int lustre_buf2som(void *buf, int rc, struct md_som_data *msd);
+int lustre_buf2hsm(void *buf, int rc, struct md_hsm *mh);
+void lustre_hsm2buf(void *buf, struct md_hsm *mh);
+
+struct lu_ucred {
+	__u32	       uc_valid;
+	__u32	       uc_o_uid;
+	__u32	       uc_o_gid;
+	__u32	       uc_o_fsuid;
+	__u32	       uc_o_fsgid;
+	__u32	       uc_uid;
+	__u32	       uc_gid;
+	__u32	       uc_fsuid;
+	__u32	       uc_fsgid;
+	__u32	       uc_suppgids[2];
+	cfs_cap_t	   uc_cap;
+	__u32	       uc_umask;
+	group_info_t   *uc_ginfo;
+	struct md_identity *uc_identity;
+};
+
+struct lu_ucred *lu_ucred(const struct lu_env *env);
+
+struct lu_ucred *lu_ucred_check(const struct lu_env *env);
+
+struct lu_ucred *lu_ucred_assert(const struct lu_env *env);
+
+int lu_ucred_global_init(void);
+
+void lu_ucred_global_fini(void);
+
+#define md_cap_t(x) (x)
+
+#define MD_CAP_TO_MASK(x) (1 << (x))
+
+#define md_cap_raised(c, flag) (md_cap_t(c) & MD_CAP_TO_MASK(flag))
+
+/**
+ * Test whether capability \a cap is raised in the credentials \a uc.
+ * capable() is copied from the Linux kernel.
+ *
+ * \retval 1 the mask bit for \a cap is set in uc->uc_cap
+ * \retval 0 otherwise
+ */
+static inline int md_capable(struct lu_ucred *uc, cfs_cap_t cap)
+{
+	return md_cap_raised(uc->uc_cap, cap) ? 1 : 0;
+}
+
+/** @} md */
+#endif /* _LINUX_MD_OBJECT_H */

diff --git a/drivers/staging/lustre/lustre/include/obd.h b/drivers/staging/lustre/lustre/include/obd.h
new file mode 100644
index 0000000..0a251fd
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/obd.h

@@ -0,0 +1,1677 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __OBD_H
+#define __OBD_H
+
+#include <linux/obd.h>
+
+#define IOC_OSC_TYPE	 'h'
+#define IOC_OSC_MIN_NR       20
+#define IOC_OSC_SET_ACTIVE   _IOWR(IOC_OSC_TYPE, 21, struct obd_device *)
+#define IOC_OSC_MAX_NR       50
+
+#define IOC_MDC_TYPE	 'i'
+#define IOC_MDC_MIN_NR       20
+#define IOC_MDC_MAX_NR       50
+
+#include <lustre/lustre_idl.h>
+#include <lu_ref.h>
+#include <lustre_lib.h>
+#include <lustre_export.h>
+#include <lustre_fld.h>
+#include <lustre_capa.h>
+
+#include <linux/libcfs/bitmap.h>
+
+
+#define MAX_OBD_DEVICES 8192
+
+struct osc_async_rc {
+	int     ar_rc;
+	int     ar_force_sync;
+	__u64   ar_min_xid;
+};
+
+struct lov_oinfo {		 /* per-stripe data structure */
+	struct ost_id   loi_oi;    /* object ID/Sequence on the target OST */
+	int loi_ost_idx;	   /* OST stripe index in lov_tgt_desc->tgts */
+	int loi_ost_gen;	   /* generation of this loi_ost_idx */
+
+	unsigned long loi_kms_valid:1;
+	__u64 loi_kms;	     /* known minimum size */
+	struct ost_lvb loi_lvb;
+	struct osc_async_rc     loi_ar;
+};
+
+static inline void loi_kms_set(struct lov_oinfo *oinfo, __u64 kms)
+{
+	oinfo->loi_kms = kms;		/* store the known minimum size ... */
+	oinfo->loi_kms_valid = 1;	/* ... then flag it as valid */
+}
+
+static inline void loi_init(struct lov_oinfo *loi)
+{	/* no-op: the body is empty and loi is not touched */
+}
+
+struct lov_stripe_md {
+	atomic_t     lsm_refc;
+	spinlock_t	lsm_lock;
+	pid_t	    lsm_lock_owner; /* debugging */
+
+	/* maximum possible file size, might change as OSTs status changes,
+	 * e.g. disconnected, deactivated */
+	__u64	    lsm_maxbytes;
+	struct {
+		/* Public members. */
+		struct ost_id lw_object_oi; /* lov object id/seq */
+
+		/* LOV-private members start here -- only for use in lov/. */
+		__u32 lw_magic;
+		__u32 lw_stripe_size;      /* size of the stripe */
+		__u32 lw_pattern;	  /* striping pattern (RAID0, RAID1) */
+		__u16 lw_stripe_count;  /* number of objects being striped over */
+		__u16 lw_layout_gen;       /* generation of the layout */
+		char  lw_pool_name[LOV_MAXPOOLNAME]; /* pool name */
+	} lsm_wire;
+
+	struct lov_oinfo *lsm_oinfo[0];
+};
+
+#define lsm_oi		 lsm_wire.lw_object_oi
+#define lsm_magic	lsm_wire.lw_magic
+#define lsm_layout_gen   lsm_wire.lw_layout_gen
+#define lsm_stripe_size  lsm_wire.lw_stripe_size
+#define lsm_pattern      lsm_wire.lw_pattern
+#define lsm_stripe_count lsm_wire.lw_stripe_count
+#define lsm_pool_name    lsm_wire.lw_pool_name
+
+struct obd_info;
+
+typedef int (*obd_enqueue_update_f)(void *cookie, int rc);
+
+/* obd info for a particular level (lov, osc). */
+struct obd_info {
+	/* Lock policy. It keeps an extent which is specific to a particular
+	 * OSC (e.g. lov_prep_enqueue_set initialises the extent of the policy,
+	 * and osc_enqueue passes it to ldlm_lock_match & ldlm_cli_enqueue). */
+	ldlm_policy_data_t      oi_policy;
+	/* Request-specific flags:
+	   - during lock handling, the flags obtained on the enqueue
+	   request are stored here.
+	   - during stats, the flags are used to control delay/resend.
+	   - during setattr, the flags are used to distinguish a punch
+	   operation.
+	 */
+	__u64		   oi_flags;
+	/* Lock handle specific for every OSC lock. */
+	struct lustre_handle   *oi_lockh;
+	/* lsm data specific for every OSC. */
+	struct lov_stripe_md   *oi_md;
+	/* obdo data specific for every OSC, if needed at all. */
+	struct obdo	    *oi_oa;
+	/* statfs data specific for every OSC, if needed at all. */
+	struct obd_statfs      *oi_osfs;
+	/* An update callback which is called to update some data on the upper
+	 * level. E.g. it is used to update lsm->lsm_oinfo at every received
+	 * request at the osc level for enqueue requests. It is also possible
+	 * to update some caller data from the LOV layer if needed. */
+	obd_enqueue_update_f    oi_cb_up;
+	/* oss capability: its type is obd_capa on the client to avoid a copy;
+	 * in contrast, its type is lustre_capa on the OSS. */
+	void		   *oi_capa;
+	/* transfer jobid from ost_sync() to filter_sync()... */
+	char		   *oi_jobid;
+};
+
+/*
+ * Compare all relevant fields of two stripe descriptors by comparing their
+ * lsm_wire parts byte for byte.
+ *
+ * Returns 0 when the two lsm_wire structures are identical, otherwise the
+ * non-zero memcmp() result.
+ */
+static inline int lov_stripe_md_cmp(struct lov_stripe_md *m1,
+				    struct lov_stripe_md *m2)
+{
+	/*
+	 * ->lsm_wire contains padding; the comparison relies on it having
+	 * been zeroed out during allocation.
+	 */
+	const size_t wire_size = sizeof(m1->lsm_wire);
+
+	return memcmp(&m1->lsm_wire, &m2->lsm_wire, wire_size);
+}
+
+/*
+ * Compare a user striping request with an existing stripe descriptor.
+ * Stripe count, stripe size and pattern only count as different when both
+ * sides are non-zero and unequal.
+ *
+ * Returns 0 when they match, otherwise the number of the first check that
+ * failed: 1 magic, 2 stripe count, 3 stripe size, 4 pattern, 5 pool name
+ * (the pool name is compared for LOV_MAGIC_V3 only).
+ */
+static inline int lov_lum_lsm_cmp(struct lov_user_md *lum,
+				  struct lov_stripe_md  *lsm)
+{
+	const struct lov_user_md_v3 *lum3;
+
+	if (lsm->lsm_magic != lum->lmm_magic)
+		return 1;
+
+	if (lsm->lsm_stripe_count != 0 && lum->lmm_stripe_count != 0 &&
+	    lsm->lsm_stripe_count != lum->lmm_stripe_count)
+		return 2;
+
+	if (lsm->lsm_stripe_size != 0 && lum->lmm_stripe_size != 0 &&
+	    lsm->lsm_stripe_size != lum->lmm_stripe_size)
+		return 3;
+
+	if (lsm->lsm_pattern != 0 && lum->lmm_pattern != 0 &&
+	    lsm->lsm_pattern != lum->lmm_pattern)
+		return 4;
+
+	if (lsm->lsm_magic != LOV_MAGIC_V3)
+		return 0;
+
+	lum3 = (struct lov_user_md_v3 *)lum;
+	if (strncmp(lsm->lsm_pool_name, lum3->lmm_pool_name,
+		    LOV_MAXPOOLNAME) != 0)
+		return 5;
+
+	return 0;
+}
+
+/*
+ * Fetch a striping descriptor from user space into lumv3 and pass it through
+ * lustre_swab_lov_user_md_v1()/_v3() when its magic is byte-swapped.
+ *
+ * When lum is NULL nothing is copied and lumv3 is examined as it is.
+ * Otherwise a lov_user_md_v1-sized prefix is copied first to read the magic;
+ * for either V3 magic the full lov_user_md_v3 is then copied again.
+ *
+ * lumv3:     kernel buffer that receives the descriptor
+ * lmm_magic: set to the magic found; LOV_USER_MAGIC_V1 or LOV_USER_MAGIC_V3
+ *            on success
+ * lum:       user-space source, may be NULL
+ *
+ * Returns 0 on success, -EFAULT when copying from user space fails, and
+ * -EINVAL when the magic is not recognised or differs between the two
+ * copies.
+ */
+static inline int lov_lum_swab_if_needed(struct lov_user_md_v3 *lumv3,
+					 int *lmm_magic,
+					 struct lov_user_md *lum)
+{
+	if (lum && copy_from_user(lumv3, lum, sizeof(struct lov_user_md_v1)))
+		return -EFAULT;
+
+	*lmm_magic = lumv3->lmm_magic;
+
+	if (*lmm_magic == __swab32(LOV_USER_MAGIC_V1)) {
+		lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lumv3);
+		*lmm_magic = LOV_USER_MAGIC_V1;
+	} else if (*lmm_magic == LOV_USER_MAGIC_V3) {
+		if (lum && copy_from_user(lumv3, lum, sizeof(*lumv3)))
+			return -EFAULT;
+		/*
+		 * The buffer was fetched a second time; user space may have
+		 * rewritten it in between, so make sure the magic that
+		 * selected this branch is still the one in lumv3.
+		 */
+		if (lumv3->lmm_magic != LOV_USER_MAGIC_V3)
+			return -EINVAL;
+	} else if (*lmm_magic == __swab32(LOV_USER_MAGIC_V3)) {
+		if (lum && copy_from_user(lumv3, lum, sizeof(*lumv3)))
+			return -EFAULT;
+		/* same re-check as above, before the data is swabbed */
+		if (lumv3->lmm_magic != __swab32(LOV_USER_MAGIC_V3))
+			return -EINVAL;
+		lustre_swab_lov_user_md_v3(lumv3);
+		*lmm_magic = LOV_USER_MAGIC_V3;
+	} else if (*lmm_magic != LOV_USER_MAGIC_V1) {
+		CDEBUG(D_IOCTL,
+		       "bad userland LOV MAGIC: %#08x != %#08x nor %#08x\n",
+		       *lmm_magic, LOV_USER_MAGIC_V1, LOV_USER_MAGIC_V3);
+		return -EINVAL;
+	}
+	return 0;
+}
+
+void lov_stripe_lock(struct lov_stripe_md *md);
+void lov_stripe_unlock(struct lov_stripe_md *md);
+
+struct obd_type {
+	struct list_head typ_chain;
+	struct obd_ops *typ_dt_ops;
+	struct md_ops *typ_md_ops;
+	proc_dir_entry_t *typ_procroot;
+	char *typ_name;
+	int  typ_refcnt;
+	struct lu_device_type *typ_lu;
+	spinlock_t obd_type_lock;
+};
+
+struct brw_page {
+	obd_off  off;
+	struct page *pg;
+	int count;
+	obd_flag flag;
+};
+
+/* Individual type definitions */
+
+struct ost_server_data;
+
+struct osd_properties {
+	size_t osd_max_ea_size;
+};
+
+#define OBT_MAGIC       0xBDDECEAE
+/* hold common fields for "target" device */
+struct obd_device_target {
+	__u32		     obt_magic;
+	__u32		     obt_instance;
+	struct super_block       *obt_sb;
+	/** last_rcvd file */
+	struct file	      *obt_rcvd_filp;
+	__u64		     obt_mount_count;
+	struct rw_semaphore	  obt_rwsem;
+	struct vfsmount	  *obt_vfsmnt;
+	struct file	      *obt_health_check_filp;
+	struct osd_properties     obt_osd_properties;
+	struct obd_job_stats      obt_jobstats;
+};
+
+/* llog contexts */
+enum llog_ctxt_id {
+	LLOG_CONFIG_ORIG_CTXT  =  0,
+	LLOG_CONFIG_REPL_CTXT,
+	LLOG_MDS_OST_ORIG_CTXT,
+	LLOG_MDS_OST_REPL_CTXT,
+	LLOG_SIZE_ORIG_CTXT,
+	LLOG_SIZE_REPL_CTXT,
+	LLOG_RD1_ORIG_CTXT,
+	LLOG_RD1_REPL_CTXT,
+	LLOG_TEST_ORIG_CTXT,
+	LLOG_TEST_REPL_CTXT,
+	LLOG_LOVEA_ORIG_CTXT,
+	LLOG_LOVEA_REPL_CTXT,
+	LLOG_CHANGELOG_ORIG_CTXT,      /**< changelog generation on mdd */
+	LLOG_CHANGELOG_REPL_CTXT,      /**< changelog access on clients */
+	LLOG_CHANGELOG_USER_ORIG_CTXT, /**< for multiple changelog consumers */
+	LLOG_MAX_CTXTS
+};
+
+#define FILTER_SUBDIR_COUNT      32	    /* set to zero for no subdirs */
+
+struct filter_subdirs {
+	struct dentry *dentry[FILTER_SUBDIR_COUNT];
+};
+
+
+struct filter_ext {
+	__u64		fe_start;
+	__u64		fe_end;
+};
+
+struct filter_obd {
+	/* NB this field MUST be first */
+	struct obd_device_target fo_obt;
+	const char		*fo_fstype;
+
+	int			fo_group_count;
+	struct dentry		*fo_dentry_O;
+	struct dentry		**fo_dentry_O_groups;
+	struct filter_subdirs	*fo_dentry_O_sub;
+	struct mutex		fo_init_lock;	/* group initialization lock*/
+	int			fo_committed_group;
+
+	spinlock_t		fo_objidlock;	/* protects fo_last_objids */
+
+	unsigned long		fo_destroys_in_progress;
+	struct mutex		fo_create_locks[FILTER_SUBDIR_COUNT];
+
+	struct list_head fo_export_list;
+	int		  fo_subdir_count;
+
+	obd_size	     fo_tot_dirty;      /* protected by obd_osfs_lock */
+	obd_size	     fo_tot_granted;    /* all values in bytes */
+	obd_size	     fo_tot_pending;
+	int		  fo_tot_granted_clients;
+
+	obd_size	     fo_readcache_max_filesize;
+	spinlock_t		fo_flags_lock;
+	unsigned int	 fo_read_cache:1,   /**< enable read-only cache */
+			     fo_writethrough_cache:1,/**< read cache writes */
+			     fo_mds_ost_sync:1, /**< MDS-OST orphan recovery*/
+			     fo_raid_degraded:1;/**< RAID device degraded */
+
+	struct obd_import   *fo_mdc_imp;
+	struct obd_uuid      fo_mdc_uuid;
+	struct lustre_handle fo_mdc_conn;
+	struct file	**fo_last_objid_files;
+	__u64	       *fo_last_objids; /* last created objid for groups,
+					      * protected by fo_objidlock */
+
+	struct mutex		fo_alloc_lock;
+
+	atomic_t	 fo_r_in_flight;
+	atomic_t	 fo_w_in_flight;
+
+	/*
+	 * per-filter pool of kiobuf's allocated by filter_common_setup() and
+	 * torn down by filter_cleanup().
+	 *
+	 * This pool contains kiobuf used by
+	 * filter_{prep,commit}rw_{read,write}() and is shared by all OST
+	 * threads.
+	 *
+	 * Locking: protected by internal lock of cfs_hash, pool can be
+	 * found from this hash table by t_id of ptlrpc_thread.
+	 */
+	struct cfs_hash		*fo_iobuf_hash;
+
+	struct brw_stats	 fo_filter_stats;
+
+	int		      fo_fmd_max_num; /* per exp filter_mod_data */
+	int		      fo_fmd_max_age; /* jiffies to fmd expiry */
+	unsigned long	    fo_syncjournal:1, /* sync journal on writes */
+				 fo_sync_lock_cancel:2;/* sync on lock cancel */
+
+
+	/* sptlrpc stuff */
+	rwlock_t		fo_sptlrpc_lock;
+	struct sptlrpc_rule_set  fo_sptlrpc_rset;
+
+	/* capability related */
+	unsigned int	     fo_fl_oss_capa;
+	struct list_head	       fo_capa_keys;
+	struct hlist_head	*fo_capa_hash;
+	int		      fo_sec_level;
+};
+
+struct timeout_item {
+	enum timeout_event ti_event;
+	cfs_time_t	 ti_timeout;
+	timeout_cb_t       ti_cb;
+	void	      *ti_cb_data;
+	struct list_head	 ti_obd_list;
+	struct list_head	 ti_chain;
+};
+
+#define OSC_MAX_RIF_DEFAULT       8
+#define MDS_OSC_MAX_RIF_DEFAULT   50
+#define OSC_MAX_RIF_MAX	 256
+#define OSC_MAX_DIRTY_DEFAULT  (OSC_MAX_RIF_DEFAULT * 4)
+#define OSC_MAX_DIRTY_MB_MAX   2048     /* arbitrary, but < MAX_LONG bytes */
+#define OSC_DEFAULT_RESENDS      10
+
+/* possible values for fo_sync_lock_cancel */
+enum {
+	NEVER_SYNC_ON_CANCEL = 0,
+	BLOCKING_SYNC_ON_CANCEL = 1,
+	ALWAYS_SYNC_ON_CANCEL = 2,
+	NUM_SYNC_ON_CANCEL_STATES
+};
+
+#define MDC_MAX_RIF_DEFAULT       8
+#define MDC_MAX_RIF_MAX	 512
+
+struct mdc_rpc_lock;
+struct obd_import;
+struct client_obd {
+	struct rw_semaphore  cl_sem;
+	struct obd_uuid	  cl_target_uuid;
+	struct obd_import       *cl_import; /* ptlrpc connection state */
+	int		      cl_conn_count;
+	/* max_mds_easize is purely a performance thing so we don't have to
+	 * call obd_size_diskmd() all the time. */
+	int		      cl_default_mds_easize;
+	int		      cl_max_mds_easize;
+	int		      cl_max_mds_cookiesize;
+
+	enum lustre_sec_part     cl_sp_me;
+	enum lustre_sec_part     cl_sp_to;
+	struct sptlrpc_flavor    cl_flvr_mgc;   /* fixed flavor of mgc->mgs */
+
+	/* the grant values are protected by loi_list_lock below */
+	long		     cl_dirty;	 /* all _dirty_ in bytes */
+	long		     cl_dirty_max;     /* allowed w/o rpc */
+	long		     cl_dirty_transit; /* dirty synchronous */
+	long		     cl_avail_grant;   /* bytes of credit for ost */
+	long		     cl_lost_grant;    /* lost credits (trunc) */
+
+	/* since we allocate grant by blocks, we don't know how many grant will
+	 * be used to add a page into cache. As a solution, we reserve maximum
+	 * grant before trying to dirty a page and unreserve the rest.
+	 * See osc_{reserve|unreserve}_grant for details. */
+	long		 cl_reserved_grant;
+	struct list_head	   cl_cache_waiters; /* waiting for cache/grant */
+	cfs_time_t	   cl_next_shrink_grant;   /* jiffies */
+	struct list_head	   cl_grant_shrink_list;  /* Timeout event list */
+	int		  cl_grant_shrink_interval; /* seconds */
+
+	/* A chunk is an optimal size used by osc_extent to determine
+	 * the extent size. A chunk is max(PAGE_CACHE_SIZE, OST block size) */
+	int		  cl_chunkbits;
+	int		  cl_chunk;
+	int		  cl_extent_tax; /* extent overhead, by bytes */
+
+	/* keep track of objects that have lois that contain pages which
+	 * have been queued for async brw.  this lock also protects the
+	 * lists of osc_client_pages that hang off of the loi */
+	/*
+	 * ->cl_loi_list_lock protects consistency of
+	 * ->cl_loi_{ready,read,write}_list. ->ap_make_ready() and
+	 * ->ap_completion() call-backs are executed under this lock. As we
+	 * cannot guarantee that these call-backs never block on all platforms
+	 * (as a matter of fact they do block on Mac OS X), type of
+	 * ->cl_loi_list_lock is platform dependent: it's a spin-lock on Linux
+	 * and blocking mutex on Mac OS X. (Alternative is to make this lock
+	 * blocking everywhere, but we don't want to slow down fast-path of
+	 * our main platform.)
+	 *
+	 * Exact type of ->cl_loi_list_lock is defined in arch/obd.h together
+	 * with client_obd_list_{un,}lock() and
+	 * client_obd_list_lock_{init,done}() functions.
+	 *
+	 * NB by Jinshan: although the field names still say _loi_, the
+	 * lists actually hold osc_object{}s.
+	 */
+	client_obd_lock_t	cl_loi_list_lock;
+	struct list_head	       cl_loi_ready_list;
+	struct list_head	       cl_loi_hp_ready_list;
+	struct list_head	       cl_loi_write_list;
+	struct list_head	       cl_loi_read_list;
+	int		      cl_r_in_flight;
+	int		      cl_w_in_flight;
+	/* just a sum of the loi/lop pending numbers to be exported by /proc */
+	atomic_t	     cl_pending_w_pages;
+	atomic_t	     cl_pending_r_pages;
+	__u32			 cl_max_pages_per_rpc;
+	int		      cl_max_rpcs_in_flight;
+	struct obd_histogram     cl_read_rpc_hist;
+	struct obd_histogram     cl_write_rpc_hist;
+	struct obd_histogram     cl_read_page_hist;
+	struct obd_histogram     cl_write_page_hist;
+	struct obd_histogram     cl_read_offset_hist;
+	struct obd_histogram     cl_write_offset_hist;
+
+	/* lru for osc caching pages */
+	struct cl_client_cache	*cl_cache;
+	struct list_head		 cl_lru_osc; /* member of cl_cache->ccc_lru */
+	atomic_t		*cl_lru_left;
+	atomic_t		 cl_lru_busy;
+	atomic_t		 cl_lru_shrinkers;
+	atomic_t		 cl_lru_in_list;
+	struct list_head		 cl_lru_list; /* lru page list */
+	client_obd_lock_t	 cl_lru_list_lock; /* page list protector */
+
+	/* number of in flight destroy rpcs is limited to max_rpcs_in_flight */
+	atomic_t	     cl_destroy_in_flight;
+	wait_queue_head_t	      cl_destroy_waitq;
+
+	struct mdc_rpc_lock     *cl_rpc_lock;
+	struct mdc_rpc_lock     *cl_close_lock;
+
+	/* mgc datastruct */
+	struct semaphore	 cl_mgc_sem;
+	struct vfsmount	 *cl_mgc_vfsmnt;
+	struct dentry	   *cl_mgc_configs_dir;
+	atomic_t	     cl_mgc_refcount;
+	struct obd_export       *cl_mgc_mgsexp;
+
+	/* checksumming for data sent over the network */
+	unsigned int	     cl_checksum:1; /* 0 = disabled, 1 = enabled */
+	/* supported checksum types that are worked out at connect time */
+	__u32		    cl_supp_cksum_types;
+	/* checksum algorithm to be used */
+	cksum_type_t	     cl_cksum_type;
+
+	/* also protected by the poorly named _loi_list_lock lock above */
+	struct osc_async_rc      cl_ar;
+
+	/* used by quotacheck when the servers are older than 2.4 */
+	int		      cl_qchk_stat; /* quotacheck stat of the peer */
+#define CL_NOT_QUOTACHECKED 1   /* client->cl_qchk_stat init value */
+#if LUSTRE_VERSION_CODE >= OBD_OCD_VERSION(2, 7, 50, 0)
+#warning "please consider removing quotacheck compatibility code"
+#endif
+
+	/* sequence manager */
+	struct lu_client_seq    *cl_seq;
+
+	atomic_t	     cl_resends; /* resend count */
+
+	/* ptlrpc work for writeback in ptlrpcd context */
+	void		    *cl_writeback_work;
+	/* hash tables for osc_quota_info */
+	cfs_hash_t	      *cl_quota_hash[MAXQUOTAS];
+};
+/* Yields the uuid member of (obd)->u.cli.cl_target_uuid, cast to char *. */
+#define obd2cli_tgt(obd) ((char *)(obd)->u.cli.cl_target_uuid.uuid)
+
+/* Pairs a 32-bit index with a pointer to an obd_id. */
+struct obd_id_info {
+	__u32   idx;
+	obd_id  *data;
+};
+
+/* */
+
+/* Embedded in struct obd_device as u.echo. */
+struct echo_obd {
+	struct obd_device_target eo_obt;
+	struct obdo		eo_oa;
+	spinlock_t		 eo_lock;
+	__u64			 eo_lastino;
+	struct lustre_handle	eo_nl_lock;
+	atomic_t		eo_prep;
+};
+
+/*
+ * Embedded in struct obd_device as u.ost: pointers to four ptlrpc services
+ * plus a mutex (ost_health_mutex).
+ */
+struct ost_obd {
+	struct ptlrpc_service	*ost_service;
+	struct ptlrpc_service	*ost_create_service;
+	struct ptlrpc_service	*ost_io_service;
+	struct ptlrpc_service	*ost_seq_service;
+	struct mutex		ost_health_mutex;
+};
+
+/* Embedded in struct obd_device as u.echo_client. */
+struct echo_client_obd {
+	struct obd_export	*ec_exp;   /* the local connection to osc/lov */
+	spinlock_t		ec_lock;
+	struct list_head	   ec_objects;
+	struct list_head	   ec_locks;
+	int		  ec_nstripes;
+	__u64		ec_unique;
+};
+
+/*
+ * Per-OSS QoS record.  Linked through lqo_oss_list (see lov_qos.lq_oss_list)
+ * and referenced from each target's ltd_qos.ltq_oss.
+ */
+struct lov_qos_oss {
+	struct obd_uuid     lqo_uuid;       /* ptlrpc's c_remote_uuid */
+	struct list_head	  lqo_oss_list;   /* link to lov_qos */
+	__u64	       lqo_bavail;     /* total bytes avail on OSS */
+	__u64	       lqo_penalty;    /* current penalty */
+	__u64	       lqo_penalty_per_obj;/* penalty decrease every obj*/
+	time_t	      lqo_used;       /* last used time, seconds */
+	__u32	       lqo_ost_count;  /* number of osts on this oss */
+};
+
+/* Per-target QoS info; embedded in struct lov_tgt_desc as ltd_qos. */
+struct ltd_qos {
+	struct lov_qos_oss *ltq_oss;	 /* oss info */
+	__u64	       ltq_penalty;     /* current penalty */
+	__u64	       ltq_penalty_per_obj; /* penalty decrease every obj*/
+	__u64	       ltq_weight;      /* net weighting */
+	time_t	      ltq_used;	/* last used time, seconds */
+	unsigned int	ltq_usable:1;    /* usable for striping */
+};
+
+/*
+ * Generic subset of OSTs.  Used as pool_desc.pool_obds, lov_qos_rr.lqr_pool
+ * and lov_obd.lov_packed.
+ */
+struct ost_pool {
+	__u32	      *op_array;      /* array of index of
+						   lov_obd->lov_tgts */
+	unsigned int	op_count;      /* number of OSTs in the array */
+	unsigned int	op_size;       /* allocated size of op_array */
+	struct rw_semaphore op_rw_sem;     /* to protect ost_pool use */
+};
+
+/*
+ * Round-robin allocator data.  Used as lov_qos.lq_rr and as
+ * pool_desc.pool_rr.
+ */
+struct lov_qos_rr {
+	__u32	       lqr_start_idx;   /* start index of new inode */
+	__u32	       lqr_offset_idx;  /* aliasing for start_idx  */
+	int		 lqr_start_count; /* reseed counter */
+	struct ost_pool     lqr_pool;	/* round-robin optimized list */
+	unsigned long       lqr_dirty:1;     /* recalc round-robin list */
+};
+
+/* allow statfs data caching for 1 second */
+#define OBD_STATFS_CACHE_SECONDS 1
+
+/* An obd_info bundled with an obd_statfs; pointed to by lov_qos.lq_statfs_data. */
+struct lov_statfs_data {
+	struct obd_info   lsd_oi;
+	struct obd_statfs lsd_statfs;
+};
+/*
+ * Stripe placement optimization: the list of OSSs, the round-robin state
+ * and the bookkeeping for QoS statfs requests.
+ */
+struct lov_qos {
+	struct list_head	  lq_oss_list; /* list of OSSs that targets use */
+	struct rw_semaphore lq_rw_sem;
+	__u32	       lq_active_oss_count;
+	unsigned int	lq_prio_free;   /* priority for free space */
+	unsigned int	lq_threshold_rr;/* priority for rr */
+	struct lov_qos_rr   lq_rr;	  /* round robin qos data */
+	unsigned long       lq_dirty:1,     /* recalc qos data */
+			    lq_same_space:1,/* the ost's all have approx.
+					       the same space avail */
+			    lq_reset:1,     /* zero current penalties */
+			    lq_statfs_in_progress:1; /* statfs op in
+							progress */
+	/* qos statfs data */
+	struct lov_statfs_data *lq_statfs_data;
+	wait_queue_head_t	 lq_statfs_waitq; /* waitqueue to notify statfs
+					      * requests completion */
+};
+
+/* Descriptor for one LOV target; lov_obd.lov_tgts is an array of pointers to these. */
+struct lov_tgt_desc {
+	struct list_head	  ltd_kill;
+	struct obd_uuid     ltd_uuid;
+	struct obd_device  *ltd_obd;
+	struct obd_export  *ltd_exp;
+	struct ltd_qos      ltd_qos;     /* qos info per target */
+	__u32	       ltd_gen;
+	__u32	       ltd_index;   /* index in lov_obd->lov_tgts */
+	unsigned long       ltd_active:1,/* is this target up for requests */
+			    ltd_activate:1,/* should  target be activated */
+			    ltd_reap:1;  /* should this target be deleted */
+};
+
+/*
+ * Pool metadata accessors.  Each expands to a member of (_p)->pool_obds,
+ * the ost_pool embedded in struct pool_desc.  The argument is parenthesized
+ * so that expressions such as "&desc" or "base + i" expand correctly; the
+ * results are still lvalues.
+ */
+#define pool_tgt_size(_p)   (_p)->pool_obds.op_size
+#define pool_tgt_count(_p)  (_p)->pool_obds.op_count
+#define pool_tgt_array(_p)  (_p)->pool_obds.op_array
+#define pool_tgt_rw_sem(_p) (_p)->pool_obds.op_rw_sem
+
+/*
+ * Descriptor for one named OST pool.  The pool_tgt_*() macros access the
+ * members of pool_obds.
+ */
+struct pool_desc {
+	char		  pool_name[LOV_MAXPOOLNAME + 1]; /* name of pool */
+	struct ost_pool       pool_obds;	      /* pool members */
+	atomic_t	  pool_refcount;	  /* pool ref. counter */
+	struct lov_qos_rr     pool_rr;		/* round robin qos */
+	struct hlist_node      pool_hash;	      /* access by poolname */
+	struct list_head	    pool_list;	      /* serial access */
+	proc_dir_entry_t *pool_proc_entry;	/* file in /proc */
+	struct obd_device    *pool_lobd;	      /* obd of the lov/lod to which
+						       * this pool belongs */
+};
+
+/* Embedded in struct obd_device as u.lov. */
+struct lov_obd {
+	struct lov_desc	 desc;
+	struct lov_tgt_desc   **lov_tgts;	      /* sparse array */
+	struct ost_pool	 lov_packed;	    /* all OSTs in a packed
+							  array */
+	struct mutex		lov_lock;
+	struct obd_connect_data lov_ocd;
+	atomic_t	    lov_refcount;
+	__u32		   lov_tgt_count;	 /* how many OBD's */
+	__u32		   lov_active_tgt_count;  /* how many active */
+	__u32		   lov_death_row;/* tgts scheduled to be deleted */
+	__u32		   lov_tgt_size;   /* size of tgts array */
+	int		     lov_connects;
+	int		     lov_pool_count;
+	cfs_hash_t	     *lov_pools_hash_body; /* used for key access */
+	struct list_head	      lov_pool_list; /* used for sequential access */
+	proc_dir_entry_t   *lov_pool_proc_entry;
+	enum lustre_sec_part    lov_sp_me;
+
+	/* Cached LRU pages from upper layer */
+	void		       *lov_cache;
+
+	struct rw_semaphore     lov_notify_lock;
+};
+
+/* Descriptor for one LMV target; lmv_obd.tgts is an array of pointers to these. */
+struct lmv_tgt_desc {
+	struct obd_uuid		ltd_uuid;
+	struct obd_export	*ltd_exp;
+	int			ltd_idx;
+	struct mutex		ltd_fid_mutex;
+	unsigned long		ltd_active:1; /* target up for requests */
+};
+
+/*
+ * Placement policy selector; the type of lmv_obd.lmv_placement.
+ * PLACEMENT_MAX_POLICY is one past the last explicitly numbered value.
+ */
+enum placement_policy {
+	PLACEMENT_CHAR_POLICY   = 0,
+	PLACEMENT_NID_POLICY    = 1,
+	PLACEMENT_INVAL_POLICY  = 2,
+	PLACEMENT_MAX_POLICY
+};
+
+typedef enum placement_policy placement_policy_t;
+
+/* Embedded in struct obd_device as u.lmv. */
+struct lmv_obd {
+	int			refcount;
+	struct lu_client_fld	lmv_fld;
+	spinlock_t		lmv_lock;
+	placement_policy_t	lmv_placement;
+	struct lmv_desc		desc;
+	struct obd_uuid		cluuid;
+	struct obd_export	*exp;
+
+	struct mutex		init_mutex;
+	int			connected;
+	int			max_easize;
+	int			max_def_easize;
+	int			max_cookiesize;
+	int			server_timeout;
+
+	int			tgts_size; /* size of tgts array */
+	struct lmv_tgt_desc	**tgts;
+
+	struct obd_connect_data	conn_data;
+};
+
+/*
+ * Local I/O buffer descriptor; passed as the "local" argument of
+ * obd_ops::o_preprw and obd_ops::o_commitrw.
+ */
+struct niobuf_local {
+	__u64		lnb_file_offset;
+	__u32		lnb_page_offset;
+	__u32		len;
+	__u32		flags;
+	struct page	*page;
+	struct dentry	*dentry;
+	int		lnb_grant_used;
+	int		rc;
+};
+
+/*
+ * String names of Lustre components.
+ * NOTE(review): presumably type names like the "obd device type names"
+ * group further down -- confirm.
+ */
+#define LUSTRE_FLD_NAME	 "fld"
+#define LUSTRE_SEQ_NAME	 "seq"
+
+#define LUSTRE_MDD_NAME	 "mdd"
+#define LUSTRE_OSD_LDISKFS_NAME	"osd-ldiskfs"
+#define LUSTRE_OSD_ZFS_NAME     "osd-zfs"
+#define LUSTRE_VVP_NAME	 "vvp"
+#define LUSTRE_LMV_NAME	 "lmv"
+#define LUSTRE_SLP_NAME	 "slp"
+#define LUSTRE_LOD_NAME		"lod"
+#define LUSTRE_OSP_NAME		"osp"
+#define LUSTRE_LWP_NAME		"lwp"
+
+/* obd device type names */
+/* FIXME: all references to LUSTRE_MDS_NAME should be swapped with LUSTRE_MDT_NAME */
+#define LUSTRE_MDS_NAME	 "mds"
+#define LUSTRE_MDT_NAME	 "mdt"
+#define LUSTRE_MDC_NAME	 "mdc"
+#define LUSTRE_OSS_NAME	 "ost"       /* FIXME change name to oss */
+#define LUSTRE_OST_NAME	 "obdfilter" /* FIXME change name to ost */
+#define LUSTRE_OSC_NAME	 "osc"
+#define LUSTRE_LOV_NAME	 "lov"
+#define LUSTRE_MGS_NAME	 "mgs"
+#define LUSTRE_MGC_NAME	 "mgc"
+
+#define LUSTRE_ECHO_NAME	"obdecho"
+#define LUSTRE_ECHO_CLIENT_NAME "echo_client"
+#define LUSTRE_QMT_NAME	 "qmt"
+
+/* Constant obd names (post-rename) */
+#define LUSTRE_MDS_OBDNAME "MDS"
+#define LUSTRE_OSS_OBDNAME "OSS"
+#define LUSTRE_MGS_OBDNAME "MGS"
+#define LUSTRE_MGC_OBDNAME "MGC"
+
+/*
+ * Decide from an obd device name whether it names an OSP/OSC device on an
+ * MDT.
+ *
+ * Returns 1 when the text after the last '-' in @name starts with "osc"
+ * (the 1.8 form fsname-OSTxxxx-osc), or when it starts with "MDT" and the
+ * preceding '-'-delimited component starts with LUSTRE_OSP_NAME or
+ * LUSTRE_OSC_NAME.  Returns 0 otherwise; a name with no '-' at all is
+ * additionally reported through CERROR().
+ */
+static inline int is_osp_on_mdt(char *name)
+{
+	char   *ptr;
+
+	ptr = strrchr(name, '-');
+	if (ptr == NULL) {
+		CERROR("%s is not a obdname\n", name);
+		return 0;
+	}
+
+	/* 1.8 OSC/OSP name on MDT is fsname-OSTxxxx-osc */
+	if (strncmp(ptr + 1, "osc", 3) == 0)
+		return 1;
+
+	if (strncmp(ptr + 1, "MDT", 3) != 0)
+		return 0;
+
+	/*
+	 * Walk back to the previous '-'.  Test for the start of the string
+	 * before stepping, so that a name whose only '-' is its first
+	 * character (e.g. "-MDT0000") is never read before name[0].
+	 */
+	do {
+		if (ptr == name)
+			return 0;
+		ptr--;
+	} while (*ptr != '-');
+
+	/* a separator found at name[0] is rejected as well */
+	if (ptr == name)
+		return 0;
+
+	if (strncmp(ptr + 1, LUSTRE_OSP_NAME, strlen(LUSTRE_OSP_NAME)) != 0 &&
+	    strncmp(ptr + 1, LUSTRE_OSC_NAME, strlen(LUSTRE_OSC_NAME)) != 0)
+		return 0;
+
+	return 1;
+}
+
+/* Value chosen so as not to conflict with on-wire flags (OBD_BRW_WRITE, etc.) */
+#define N_LOCAL_TEMP_PAGE 0x10000000
+
+/*
+ * Per-operation transaction info.  Initialised by oti_init();
+ * oti_logcookies/oti_numcookies are managed by oti_alloc_cookies() and
+ * oti_free_cookies(), and oti_logcookies may point at oti_onecookie.
+ */
+struct obd_trans_info {
+	__u64		    oti_transno;
+	__u64		    oti_xid;
+	/* Only used on the server side for tracking acks. */
+	struct oti_req_ack_lock {
+		struct lustre_handle lock;
+		__u32		mode;
+	}			oti_ack_locks[4];
+	void		    *oti_handle;
+	struct llog_cookie       oti_onecookie;
+	struct llog_cookie      *oti_logcookies;
+	int		      oti_numcookies;
+	/** synchronous write is needed */
+	unsigned long		 oti_sync_write:1;
+
+	/* initial thread handling transaction */
+	struct ptlrpc_thread *   oti_thread;
+	__u32		    oti_conn_cnt;
+	/** VBR: versions */
+	__u64		    oti_pre_version;
+	/** JobID */
+	char		    *oti_jobid;
+
+	struct obd_uuid	 *oti_ost_uuid;
+};
+
+/*
+ * Zero @oti and, when @req is non-NULL, seed it from the request.
+ *
+ * A NULL @oti is a no-op.  From @req this copies rq_xid and rq_svc_thread
+ * and, if there is a request message, its connection count.  For request
+ * messages flagged MSG_REPLAY it also takes element 0 of the array returned
+ * by lustre_msg_get_versions() (0 if that is NULL) and the transno.  A
+ * transno read from the reply message, when one exists, overrides the one
+ * read from the request message.
+ */
+static inline void oti_init(struct obd_trans_info *oti,
+			    struct ptlrpc_request *req)
+{
+	if (!oti)
+		return;
+
+	memset(oti, 0, sizeof(*oti));
+	if (!req)
+		return;
+
+	oti->oti_xid = req->rq_xid;
+
+	/** VBR: take versions from request */
+	if (req->rq_reqmsg != NULL &&
+	    (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY) != 0) {
+		__u64 *versions = lustre_msg_get_versions(req->rq_reqmsg);
+
+		if (versions != NULL)
+			oti->oti_pre_version = versions[0];
+		else
+			oti->oti_pre_version = 0;
+		oti->oti_transno = lustre_msg_get_transno(req->rq_reqmsg);
+	}
+
+	/** called from mds_create_objects */
+	if (req->rq_repmsg != NULL)
+		oti->oti_transno = lustre_msg_get_transno(req->rq_repmsg);
+
+	oti->oti_thread = req->rq_svc_thread;
+
+	if (req->rq_reqmsg != NULL)
+		oti->oti_conn_cnt = lustre_msg_get_conn_cnt(req->rq_reqmsg);
+}
+
+/*
+ * Point @oti->oti_logcookies at storage for @num_cookies cookies and record
+ * the count in oti_numcookies.  A NULL @oti is a no-op.
+ *
+ * A single cookie uses the embedded oti_onecookie; any other count goes
+ * through OBD_ALLOC_LARGE().  The result of the allocation is not examined
+ * here.
+ */
+static inline void oti_alloc_cookies(struct obd_trans_info *oti,
+				     int num_cookies)
+{
+	if (oti == NULL)
+		return;
+
+	if (num_cookies != 1) {
+		OBD_ALLOC_LARGE(oti->oti_logcookies,
+				num_cookies * sizeof(oti->oti_onecookie));
+	} else {
+		oti->oti_logcookies = &oti->oti_onecookie;
+	}
+
+	oti->oti_numcookies = num_cookies;
+}
+
+/*
+ * Release the cookie storage set up by oti_alloc_cookies() and reset
+ * oti_logcookies/oti_numcookies.  A NULL @oti, or one with no cookies
+ * attached, is a no-op.
+ *
+ * Storage that is the embedded oti_onecookie is only asserted to have a
+ * count of 1; anything else is handed to OBD_FREE_LARGE().
+ */
+static inline void oti_free_cookies(struct obd_trans_info *oti)
+{
+	if (oti == NULL || oti->oti_logcookies == NULL)
+		return;
+
+	if (oti->oti_logcookies != &oti->oti_onecookie) {
+		OBD_FREE_LARGE(oti->oti_logcookies,
+			       oti->oti_numcookies*sizeof(oti->oti_onecookie));
+	} else {
+		LASSERT(oti->oti_numcookies == 1);
+	}
+
+	oti->oti_logcookies = NULL;
+	oti->oti_numcookies = 0;
+}
+
+/*
+ * Events signalled through obd_notify() upcall-chain.  Passed as the "ev"
+ * argument of obd_notify_upcall::onu_upcall and obd_ops::o_notify.
+ */
+enum obd_notify_event {
+	/* target added */
+	OBD_NOTIFY_CREATE,
+	/* Device connect start */
+	OBD_NOTIFY_CONNECT,
+	/* Device activated */
+	OBD_NOTIFY_ACTIVE,
+	/* Device deactivated */
+	OBD_NOTIFY_INACTIVE,
+	/* Device disconnected */
+	OBD_NOTIFY_DISCON,
+	/* Connect data for import were changed */
+	OBD_NOTIFY_OCD,
+	/* Sync request */
+	OBD_NOTIFY_SYNC_NONBLOCK,
+	OBD_NOTIFY_SYNC,
+	/* Configuration event */
+	OBD_NOTIFY_CONFIG,
+	/* Administratively deactivate/activate event */
+	OBD_NOTIFY_DEACTIVATE,
+	OBD_NOTIFY_ACTIVATE
+};
+
+/*
+ * Data structure used to pass obd_notify()-event to non-obd listeners (llite
+ * and liblustre being main examples).  Stored in struct obd_device as
+ * obd_upcall.
+ */
+struct obd_notify_upcall {
+	/* Listener callback; receives the event plus an owner and a data pointer */
+	int (*onu_upcall)(struct obd_device *host, struct obd_device *watched,
+			  enum obd_notify_event ev, void *owner, void *data);
+	/* Opaque datum supplied by upper layer listener */
+	void *onu_owner;
+};
+
+/* Embedded in struct obd_device as obd_recovery_data. */
+struct target_recovery_data {
+	svc_handler_t		trd_recovery_handler;
+	pid_t			trd_processing_task;
+	struct completion	trd_starting;
+	struct completion	trd_finishing;
+};
+
+/*
+ * A group of up to LLOG_MAX_CTXTS llog contexts.  struct obd_device embeds
+ * one as obd_olg (its default llog group), and obd_ops::o_llog_init takes
+ * a pointer to one.
+ */
+struct obd_llog_group {
+	int		olg_seq;
+	struct llog_ctxt  *olg_ctxts[LLOG_MAX_CTXTS];
+	wait_queue_head_t	olg_waitq;
+	spinlock_t	   olg_lock;
+	struct mutex	   olg_cat_processing;
+};
+
+/* corresponds to one of the obd's */
+#define OBD_DEVICE_MAGIC	0XAB5CD6EF
+#define OBD_DEV_BY_DEVNAME      0xffffd0de
+
+/*
+ * One obd device.  Type-specific state lives in the union "u"; the
+ * remaining fields cover state flags, the export lists, recovery
+ * bookkeeping and the LProcFS entries.
+ */
+struct obd_device {
+	struct obd_type	*obd_type;
+	__u32		   obd_magic;
+
+	/* common and UUID name of this device */
+	char		    obd_name[MAX_OBD_NAME];
+	struct obd_uuid	 obd_uuid;
+
+	struct lu_device       *obd_lu_dev;
+
+	int		     obd_minor;
+	/* bitfield modification is protected by obd_dev_lock */
+	unsigned long obd_attached:1,      /* finished attach */
+		      obd_set_up:1,	/* finished setup */
+		      obd_recovering:1,    /* there are recoverable clients */
+		      obd_abort_recovery:1,/* recovery expired */
+		      obd_version_recov:1, /* obd uses version checking */
+		      obd_replayable:1,    /* recovery is enabled; inform clients */
+		      obd_no_transno:1,    /* no committed-transno notification */
+		      obd_no_recov:1,      /* fail instead of retry messages */
+		      obd_stopping:1,      /* started cleanup */
+		      obd_starting:1,      /* started setup */
+		      obd_force:1,	 /* cleanup with > 0 obd refcount */
+		      obd_fail:1,	  /* cleanup with failover */
+		      obd_async_recov:1,   /* allow asynchronous orphan cleanup */
+		      obd_no_conn:1,       /* deny new connections */
+		      obd_inactive:1,      /* device active/inactive
+					   * (for /proc/status only!!) */
+		      obd_no_ir:1,	 /* no imperative recovery. */
+		      obd_process_conf:1;  /* device is processing mgs config */
+	/* use a separate field, as it is set in interrupt, so as not to
+	 * interfere with the protection of the other bits using _bh lock */
+	unsigned long obd_recovery_expired:1;
+	/* uuid-export hash body */
+	cfs_hash_t	     *obd_uuid_hash;
+	/* nid-export hash body */
+	cfs_hash_t	     *obd_nid_hash;
+	/* nid stats body */
+	cfs_hash_t	     *obd_nid_stats_hash;
+	struct list_head	      obd_nid_stats;
+	atomic_t	    obd_refcount;
+	wait_queue_head_t	     obd_refcount_waitq;
+	struct list_head	      obd_exports;
+	struct list_head	      obd_unlinked_exports;
+	struct list_head	      obd_delayed_exports;
+	int		     obd_num_exports;
+	spinlock_t		obd_nid_lock;
+	struct ldlm_namespace  *obd_namespace;
+	struct ptlrpc_client	obd_ldlm_client; /* XXX OST/MDS only */
+	/* a spinlock is OK for what we do now, may need a semaphore later */
+	spinlock_t		obd_dev_lock; /* protect OBD bitfield above */
+	struct mutex		obd_dev_mutex;
+	__u64			obd_last_committed;
+	struct fsfilt_operations *obd_fsops;
+	spinlock_t		obd_osfs_lock;
+	struct obd_statfs	obd_osfs;       /* locked by obd_osfs_lock */
+	__u64			obd_osfs_age;
+	struct lvfs_run_ctxt	obd_lvfs_ctxt;
+	struct obd_llog_group	obd_olg;	/* default llog group */
+	struct obd_device	*obd_observer;
+	struct rw_semaphore	obd_observer_link_sem;
+	struct obd_notify_upcall obd_upcall;
+	struct obd_export       *obd_self_export;
+	/* list of exports in LRU order, for ping evictor, with obd_dev_lock */
+	struct list_head	      obd_exports_timed;
+	time_t		  obd_eviction_timer; /* for ping evictor */
+
+	int			      obd_max_recoverable_clients;
+	atomic_t		     obd_connected_clients;
+	int			      obd_stale_clients;
+	int			      obd_delayed_clients;
+	/* this lock protects all recovery list_heads, timer and
+	 * obd_next_recovery_transno value */
+	spinlock_t			 obd_recovery_task_lock;
+	__u64			    obd_next_recovery_transno;
+	int			      obd_replayed_requests;
+	int			      obd_requests_queued_for_recovery;
+	wait_queue_head_t		      obd_next_transno_waitq;
+	/* protected by obd_recovery_task_lock */
+	timer_list_t		      obd_recovery_timer;
+	time_t			   obd_recovery_start; /* seconds */
+	time_t			   obd_recovery_end; /* seconds, for lprocfs_status */
+	int			      obd_recovery_time_hard;
+	int			      obd_recovery_timeout;
+	int			      obd_recovery_ir_factor;
+
+	/* new recovery stuff from CMD2 */
+	struct target_recovery_data      obd_recovery_data;
+	int			      obd_replayed_locks;
+	atomic_t		     obd_req_replay_clients;
+	atomic_t		     obd_lock_replay_clients;
+	/* all lists are protected by obd_recovery_task_lock */
+	struct list_head		       obd_req_replay_queue;
+	struct list_head		       obd_lock_replay_queue;
+	struct list_head		       obd_final_req_queue;
+	int			      obd_recovery_stage;
+
+	/* type-specific device state; one member per kind of obd */
+	union {
+		struct obd_device_target obt;
+		struct filter_obd filter;
+		struct client_obd cli;
+		struct ost_obd ost;
+		struct echo_client_obd echo_client;
+		struct echo_obd echo;
+		struct lov_obd lov;
+		struct lmv_obd lmv;
+	} u;
+	/* Fields used by LProcFS */
+	unsigned int	   obd_cntr_base;
+	struct lprocfs_stats  *obd_stats;
+
+	unsigned int	   md_cntr_base;
+	struct lprocfs_stats  *md_stats;
+
+	proc_dir_entry_t  *obd_proc_entry;
+	void		  *obd_proc_private; /* type private PDEs */
+	proc_dir_entry_t  *obd_proc_exports_entry;
+	proc_dir_entry_t  *obd_svc_procroot;
+	struct lprocfs_stats  *obd_svc_stats;
+	atomic_t	   obd_evict_inprogress;
+	wait_queue_head_t	    obd_evict_inprogress_waitq;
+	struct list_head	     obd_evict_list; /* protected with pet_lock */
+
+	/**
+	 * Ldlm pool part. Save last calculated SLV and Limit.
+	 */
+	rwlock_t		obd_pool_lock;
+	int		    obd_pool_limit;
+	__u64		  obd_pool_slv;
+
+	/**
+	 * A list of outstanding class_incref()'s against this obd. For
+	 * debugging.
+	 */
+	struct lu_ref	  obd_reference;
+
+	int		       obd_conn_inprogress;
+};
+
+/* llog flag bits (distinct single-bit values) */
+#define OBD_LLOG_FL_SENDNOW     0x0001
+#define OBD_LLOG_FL_EXIT	0x0002
+
+/* Cleanup stage passed to obd_ops::o_precleanup. */
+enum obd_cleanup_stage {
+/* Special case hack for MDS LOVs */
+	OBD_CLEANUP_EARLY,
+/* can be directly mapped to .ldto_device_fini() */
+	OBD_CLEANUP_EXPORTS,
+};
+
+/*
+ * get/set_info keys.
+ * NOTE(review): presumably the strings passed as the "key" argument of
+ * obd_ops::o_get_info and obd_ops::o_set_info_async -- confirm.
+ */
+#define KEY_ASYNC	       "async"
+#define KEY_BLOCKSIZE_BITS      "blocksize_bits"
+#define KEY_BLOCKSIZE	   "blocksize"
+#define KEY_CAPA_KEY	    "capa_key"
+#define KEY_CHANGELOG_CLEAR     "changelog_clear"
+#define KEY_FID2PATH	    "fid2path"
+#define KEY_CHECKSUM	    "checksum"
+#define KEY_CLEAR_FS	    "clear_fs"
+#define KEY_CONN_DATA	   "conn_data"
+#define KEY_EVICT_BY_NID	"evict_by_nid"
+#define KEY_FIEMAP	      "fiemap"
+#define KEY_FLUSH_CTX	   "flush_ctx"
+#define KEY_GRANT_SHRINK	"grant_shrink"
+#define KEY_HSM_COPYTOOL_SEND   "hsm_send"
+#define KEY_INIT_RECOV_BACKUP   "init_recov_bk"
+#define KEY_INIT_RECOV	  "initial_recov"
+#define KEY_INTERMDS	    "inter_mds"
+#define KEY_LAST_ID	     "last_id"
+#define KEY_LAST_FID		"last_fid"
+#define KEY_LOCK_TO_STRIPE      "lock_to_stripe"
+#define KEY_LOVDESC	     "lovdesc"
+#define KEY_LOV_IDX	     "lov_idx"
+#define KEY_MAX_EASIZE	  "max_easize"
+#define KEY_MDS_CONN	    "mds_conn"
+#define KEY_MGSSEC	      "mgssec"
+#define KEY_NEXT_ID	     "next_id"
+#define KEY_READ_ONLY	   "read-only"
+#define KEY_REGISTER_TARGET     "register_target"
+#define KEY_SET_FS	      "set_fs"
+#define KEY_TGT_COUNT	   "tgt_count"
+/*      KEY_SET_INFO in lustre_idl.h */
+#define KEY_SPTLRPC_CONF	"sptlrpc_conf"
+#define KEY_CONNECT_FLAG	"connect_flags"
+#define KEY_SYNC_LOCK_CANCEL    "sync_lock_cancel"
+
+#define KEY_CACHE_SET		"cache_set"
+#define KEY_CACHE_LRU_SHRINK	"cache_lru_shrink"
+#define KEY_CHANGELOG_INDEX	"changelog_index"
+
+struct lu_context;
+
+/*
+ * Intent operation bits, tested against lookup_intent::it_op (see
+ * it_to_lock_mode()).
+ */
+/* /!\ must be coherent with include/linux/namei.h on patched kernel */
+#define IT_OPEN     (1 << 0)
+#define IT_CREAT    (1 << 1)
+#define IT_READDIR  (1 << 2)
+#define IT_GETATTR  (1 << 3)
+#define IT_LOOKUP   (1 << 4)
+#define IT_UNLINK   (1 << 5)
+#define IT_TRUNC    (1 << 6)
+#define IT_GETXATTR (1 << 7)
+#define IT_EXEC     (1 << 8)
+#define IT_PIN      (1 << 9)
+#define IT_LAYOUT   (1 << 10)
+#define IT_QUOTA_DQACQ (1 << 11)
+#define IT_QUOTA_CONN  (1 << 12)
+
+/*
+ * Map the operation bits of an intent to a lock mode.
+ *
+ * Returns LCK_CW when IT_CREAT is set, otherwise LCK_CR when any of
+ * IT_OPEN, IT_READDIR, IT_GETATTR, IT_LOOKUP or IT_LAYOUT is set.  Any
+ * other it_op trips LASSERTF(); if that returns, the result is -EINVAL.
+ */
+static inline int it_to_lock_mode(struct lookup_intent *it)
+{
+	/* CREAT needs to be tested before open (both could be set) */
+	if (it->it_op & IT_CREAT)
+		return LCK_CW;
+
+	if (it->it_op & (IT_OPEN | IT_READDIR | IT_GETATTR | IT_LOOKUP |
+			 IT_LAYOUT))
+		return LCK_CR;
+
+	LASSERTF(0, "Invalid it_op: %d\n", it->it_op);
+	return -EINVAL;
+}
+
+/*
+ * Arguments of one metadata operation.  Embedded in md_enqueue_info as
+ * mi_data and passed to obd_ops::o_fid_alloc.
+ */
+struct md_op_data {
+	struct lu_fid	   op_fid1; /* operation fid1 (usually parent) */
+	struct lu_fid	   op_fid2; /* operation fid2 (usually child) */
+	struct lu_fid	   op_fid3; /* 2 extra fids to find conflicting */
+	struct lu_fid	   op_fid4; /* to the operation locks. */
+	mdsno_t		 op_mds;  /* what mds server open will go to */
+	struct lustre_handle    op_handle;
+	obd_time		op_mod_time;
+	const char	     *op_name;
+	int		     op_namelen;
+	__u32		   op_mode;
+	struct lmv_stripe_md   *op_mea1;
+	struct lmv_stripe_md   *op_mea2;
+	__u32		   op_suppgids[2];
+	__u32		   op_fsuid;
+	__u32		   op_fsgid;
+	cfs_cap_t	       op_cap;
+	void		   *op_data;
+
+	/* iattr fields and blocks. */
+	struct iattr	    op_attr;
+	unsigned int	    op_attr_flags;
+	__u64		   op_valid;
+	loff_t		  op_attr_blocks;
+
+	/* Size-on-MDS epoch and flags. */
+	__u64		   op_ioepoch;
+	__u32		   op_flags;
+
+	/* Capa fields */
+	struct obd_capa	*op_capa1;
+	struct obd_capa	*op_capa2;
+
+	/* Various operation flags. */
+	__u32		   op_bias;
+
+	/* Operation type */
+	__u32		   op_opc;
+
+	/* Used by readdir */
+	__u64		   op_offset;
+
+	/* Used by readdir */
+	__u32		   op_npages;
+
+	/* used to transfer info between the stacks of MD client
+	 * see enum op_cli_flags */
+	__u32			op_cli_flags;
+};
+
+/* Bit flags for md_op_data.op_cli_flags. */
+enum op_cli_flags {
+	CLI_SET_MEA	= 1 << 0,
+	CLI_RM_ENTRY	= 1 << 1,
+};
+
+struct md_enqueue_info;
+/*
+ * metadata stat-ahead: callback type stored in md_enqueue_info.mi_cb; it
+ * receives the request, the enqueue info and a result code.
+ */
+typedef int (* md_enqueue_cb_t)(struct ptlrpc_request *req,
+				struct md_enqueue_info *minfo,
+				int rc);
+
+/* seq client type; passed as the "type" argument of obd_ops::o_fid_init */
+enum lu_cli_type {
+	LUSTRE_SEQ_METADATA = 1,
+	LUSTRE_SEQ_DATA
+};
+
+/*
+ * Bundles the operation data, intent and lock handle of one enqueue with
+ * the md_enqueue_cb_t callback (mi_cb) and its datum (mi_cbdata).
+ */
+struct md_enqueue_info {
+	struct md_op_data       mi_data;
+	struct lookup_intent    mi_it;
+	struct lustre_handle    mi_lockh;
+	struct inode	   *mi_dir;
+	md_enqueue_cb_t	 mi_cb;
+	__u64		   mi_cbdata;
+	unsigned int	    mi_generation;
+};
+
+/*
+ * Table of obd methods.  Every member except o_owner is a function pointer.
+ * See the note at the end of the structure before adding a method.
+ */
+struct obd_ops {
+	module_t *o_owner;
+	int (*o_iocontrol)(unsigned int cmd, struct obd_export *exp, int len,
+			   void *karg, void *uarg);
+	int (*o_get_info)(const struct lu_env *env, struct obd_export *,
+			  __u32 keylen, void *key, __u32 *vallen, void *val,
+			  struct lov_stripe_md *lsm);
+	int (*o_set_info_async)(const struct lu_env *, struct obd_export *,
+				__u32 keylen, void *key,
+				__u32 vallen, void *val,
+				struct ptlrpc_request_set *set);
+	int (*o_attach)(struct obd_device *dev, obd_count len, void *data);
+	int (*o_detach)(struct obd_device *dev);
+	int (*o_setup) (struct obd_device *dev, struct lustre_cfg *cfg);
+	int (*o_precleanup)(struct obd_device *dev,
+			    enum obd_cleanup_stage cleanup_stage);
+	int (*o_cleanup)(struct obd_device *dev);
+	int (*o_process_config)(struct obd_device *dev, obd_count len,
+				void *data);
+	int (*o_postrecov)(struct obd_device *dev);
+	int (*o_add_conn)(struct obd_import *imp, struct obd_uuid *uuid,
+			  int priority);
+	int (*o_del_conn)(struct obd_import *imp, struct obd_uuid *uuid);
+	/* connect to the target device with given connection
+	 * data. @ocd->ocd_connect_flags is modified to reflect flags actually
+	 * granted by the target, which are guaranteed to be a subset of flags
+	 * asked for. If @ocd == NULL, use default parameters. */
+	int (*o_connect)(const struct lu_env *env,
+			 struct obd_export **exp, struct obd_device *src,
+			 struct obd_uuid *cluuid, struct obd_connect_data *ocd,
+			 void *localdata);
+	int (*o_reconnect)(const struct lu_env *env,
+			   struct obd_export *exp, struct obd_device *src,
+			   struct obd_uuid *cluuid,
+			   struct obd_connect_data *ocd,
+			   void *localdata);
+	int (*o_disconnect)(struct obd_export *exp);
+
+	/* Initialize/finalize fids infrastructure. */
+	int (*o_fid_init)(struct obd_device *obd,
+			  struct obd_export *exp, enum lu_cli_type type);
+	int (*o_fid_fini)(struct obd_device *obd);
+
+	/* Allocate new fid according to passed @hint. */
+	int (*o_fid_alloc)(struct obd_export *exp, struct lu_fid *fid,
+			   struct md_op_data *op_data);
+
+	/*
+	 * Object with @fid is getting deleted, we may want to do something
+	 * about this.
+	 */
+	/* NOTE(review): the comment above does not describe o_statfs; it
+	 * looks orphaned from a method no longer in this table -- confirm. */
+	int (*o_statfs)(const struct lu_env *, struct obd_export *exp,
+			struct obd_statfs *osfs, __u64 max_age, __u32 flags);
+	int (*o_statfs_async)(struct obd_export *exp, struct obd_info *oinfo,
+			      __u64 max_age, struct ptlrpc_request_set *set);
+	int (*o_packmd)(struct obd_export *exp, struct lov_mds_md **disk_tgt,
+			struct lov_stripe_md *mem_src);
+	int (*o_unpackmd)(struct obd_export *exp,struct lov_stripe_md **mem_tgt,
+			  struct lov_mds_md *disk_src, int disk_len);
+	int (*o_preallocate)(struct lustre_handle *, obd_count *req,
+			     obd_id *ids);
+	/* FIXME: add fid capability support for create & destroy! */
+	int (*o_precreate)(struct obd_export *exp);
+	int (*o_create)(const struct lu_env *env, struct obd_export *exp,
+			struct obdo *oa, struct lov_stripe_md **ea,
+			struct obd_trans_info *oti);
+	int (*o_create_async)(struct obd_export *exp,  struct obd_info *oinfo,
+			      struct lov_stripe_md **ea,
+			      struct obd_trans_info *oti);
+	int (*o_destroy)(const struct lu_env *env, struct obd_export *exp,
+			 struct obdo *oa, struct lov_stripe_md *ea,
+			 struct obd_trans_info *oti, struct obd_export *md_exp,
+			 void *capa);
+	int (*o_setattr)(const struct lu_env *, struct obd_export *exp,
+			 struct obd_info *oinfo, struct obd_trans_info *oti);
+	int (*o_setattr_async)(struct obd_export *exp, struct obd_info *oinfo,
+			       struct obd_trans_info *oti,
+			       struct ptlrpc_request_set *rqset);
+	int (*o_getattr)(const struct lu_env *env, struct obd_export *exp,
+			 struct obd_info *oinfo);
+	int (*o_getattr_async)(struct obd_export *exp, struct obd_info *oinfo,
+			       struct ptlrpc_request_set *set);
+	int (*o_brw)(int rw, struct obd_export *exp, struct obd_info *oinfo,
+		     obd_count oa_bufs, struct brw_page *pgarr,
+		     struct obd_trans_info *oti);
+	int (*o_merge_lvb)(struct obd_export *exp, struct lov_stripe_md *lsm,
+			   struct ost_lvb *lvb, int kms_only);
+	int (*o_adjust_kms)(struct obd_export *exp, struct lov_stripe_md *lsm,
+			    obd_off size, int shrink);
+	int (*o_punch)(const struct lu_env *, struct obd_export *exp,
+		       struct obd_info *oinfo, struct obd_trans_info *oti,
+		       struct ptlrpc_request_set *rqset);
+	int (*o_sync)(const struct lu_env *env, struct obd_export *exp,
+		      struct obd_info *oinfo, obd_size start, obd_size end,
+		      struct ptlrpc_request_set *set);
+	int (*o_migrate)(struct lustre_handle *conn, struct lov_stripe_md *dst,
+			 struct lov_stripe_md *src, obd_size start,
+			 obd_size end, struct obd_trans_info *oti);
+	int (*o_copy)(struct lustre_handle *dstconn, struct lov_stripe_md *dst,
+		      struct lustre_handle *srconn, struct lov_stripe_md *src,
+		      obd_size start, obd_size end, struct obd_trans_info *);
+	int (*o_iterate)(struct lustre_handle *conn,
+			 int (*)(obd_id, obd_seq, void *),
+			 obd_id *startid, obd_seq seq, void *data);
+	int (*o_preprw)(const struct lu_env *env, int cmd,
+			struct obd_export *exp, struct obdo *oa, int objcount,
+			struct obd_ioobj *obj, struct niobuf_remote *remote,
+			int *nr_pages, struct niobuf_local *local,
+			struct obd_trans_info *oti, struct lustre_capa *capa);
+	int (*o_commitrw)(const struct lu_env *env, int cmd,
+			  struct obd_export *exp, struct obdo *oa,
+			  int objcount, struct obd_ioobj *obj,
+			  struct niobuf_remote *remote, int pages,
+			  struct niobuf_local *local,
+			  struct obd_trans_info *oti, int rc);
+	int (*o_enqueue)(struct obd_export *, struct obd_info *oinfo,
+			 struct ldlm_enqueue_info *einfo,
+			 struct ptlrpc_request_set *rqset);
+	int (*o_change_cbdata)(struct obd_export *, struct lov_stripe_md *,
+			       ldlm_iterator_t it, void *data);
+	int (*o_find_cbdata)(struct obd_export *, struct lov_stripe_md *,
+			     ldlm_iterator_t it, void *data);
+	int (*o_cancel)(struct obd_export *, struct lov_stripe_md *md,
+			__u32 mode, struct lustre_handle *);
+	int (*o_cancel_unused)(struct obd_export *, struct lov_stripe_md *,
+			       ldlm_cancel_flags_t flags, void *opaque);
+	int (*o_init_export)(struct obd_export *exp);
+	int (*o_destroy_export)(struct obd_export *exp);
+	int (*o_extent_calc)(struct obd_export *, struct lov_stripe_md *,
+			     int cmd, obd_off *);
+
+	/* llog related obd_methods */
+	int (*o_llog_init)(struct obd_device *obd, struct obd_llog_group *grp,
+			   struct obd_device *disk_obd, int *idx);
+	int (*o_llog_finish)(struct obd_device *obd, int count);
+	int (*o_llog_connect)(struct obd_export *, struct llogd_conn_body *);
+
+	/* metadata-only methods */
+	int (*o_pin)(struct obd_export *, const struct lu_fid *fid,
+		     struct obd_capa *, struct obd_client_handle *, int flag);
+	int (*o_unpin)(struct obd_export *, struct obd_client_handle *, int);
+
+	int (*o_import_event)(struct obd_device *, struct obd_import *,
+			      enum obd_import_event);
+
+	int (*o_notify)(struct obd_device *obd, struct obd_device *watched,
+			enum obd_notify_event ev, void *data);
+
+	int (*o_health_check)(const struct lu_env *env, struct obd_device *);
+	struct obd_uuid *(*o_get_uuid) (struct obd_export *exp);
+
+	/* quota methods */
+	int (*o_quotacheck)(struct obd_device *, struct obd_export *,
+			    struct obd_quotactl *);
+	int (*o_quotactl)(struct obd_device *, struct obd_export *,
+			  struct obd_quotactl *);
+
+	int (*o_ping)(const struct lu_env *, struct obd_export *exp);
+
+	/* pools methods */
+	int (*o_pool_new)(struct obd_device *obd, char *poolname);
+	int (*o_pool_del)(struct obd_device *obd, char *poolname);
+	int (*o_pool_add)(struct obd_device *obd, char *poolname,
+			  char *ostname);
+	int (*o_pool_rem)(struct obd_device *obd, char *poolname,
+			  char *ostname);
+	void (*o_getref)(struct obd_device *obd);
+	void (*o_putref)(struct obd_device *obd);
+	/*
+	 * NOTE: If adding ops, add another LPROCFS_OBD_OP_INIT() line
+	 * to lprocfs_alloc_obd_stats() in obdclass/lprocfs_status.c.
+	 * Also, add a wrapper function in include/linux/obd_class.h. */
+};
+
+/*
+ * Operation codes, one distinct bit each (bits 0..4).
+ * NOTE(review): the consumers of these values are not visible in this
+ * header -- confirm where LUSTRE_OPC_* is interpreted before renumbering.
+ */
+enum {
+	LUSTRE_OPC_MKDIR    = (1 << 0),
+	LUSTRE_OPC_SYMLINK  = (1 << 1),
+	LUSTRE_OPC_MKNOD    = (1 << 2),
+	LUSTRE_OPC_CREATE   = (1 << 3),
+	LUSTRE_OPC_ANY      = (1 << 4)
+};
+
+/* lmv structures */
+#define MEA_MAGIC_LAST_CHAR      0xb2221ca1
+#define MEA_MAGIC_ALL_CHARS      0xb222a11c
+#define MEA_MAGIC_HASH_SEGMENT   0xb222a11b
+
+#define MAX_HASH_SIZE_32	 0x7fffffffUL
+#define MAX_HASH_SIZE	    0x7fffffffffffffffULL
+#define MAX_HASH_HIGHEST_BIT     0x1000000000000000ULL
+
+/*
+ * Bundle of pointers to the metadata pieces of one object.  It is the type
+ * handed to md_ops::m_get_lustre_md() and md_ops::m_free_lustre_md().
+ * The posix_acl member only exists when CONFIG_FS_POSIX_ACL is defined.
+ * NOTE(review): ownership of the pointed-to objects is not visible here;
+ * presumably m_free_lustre_md() releases them -- confirm.
+ */
+struct lustre_md {
+	struct mdt_body	 *body;
+	struct lov_stripe_md    *lsm;
+	struct lmv_stripe_md    *mea;
+#ifdef CONFIG_FS_POSIX_ACL
+	struct posix_acl	*posix_acl;
+#endif
+	struct mdt_remote_perm  *remote_perm;
+	struct obd_capa	 *mds_capa;
+	struct obd_capa	 *oss_capa;
+};
+
+/*
+ * Reference-counted open state.  obd_mod_alloc() creates it with
+ * mod_refcount == 1; obd_mod_get() takes a reference; obd_mod_put() drops
+ * one and, on the last put, finishes mod_open_req (if set) and frees the
+ * structure.
+ */
+struct md_open_data {
+	struct obd_client_handle *mod_och;
+	struct ptlrpc_request    *mod_open_req;
+	struct ptlrpc_request    *mod_close_req;
+	atomic_t	      mod_refcount;	/* see obd_mod_get()/obd_mod_put() */
+};
+
+struct lookup_intent;
+
+/*
+ * Metadata method table.  Entries are reached through MDP(dev, op), which
+ * expands to (dev)->obd_type->typ_md_ops->m_<op>.
+ *
+ * MD_COUNTER_OFFSET() turns a member's offsetof() distance from m_getstatus
+ * into a statistics index, divided by the size of the m_getstatus pointer.
+ * m_getstatus must therefore stay the first counted member, and every member
+ * must remain a function pointer of that same size.
+ */
+struct md_ops {
+	int (*m_getstatus)(struct obd_export *, struct lu_fid *,
+			   struct obd_capa **);
+	int (*m_null_inode)(struct obd_export *, const struct lu_fid *);
+	int (*m_find_cbdata)(struct obd_export *, const struct lu_fid *,
+			     ldlm_iterator_t, void *);
+	int (*m_close)(struct obd_export *, struct md_op_data *,
+		       struct md_open_data *, struct ptlrpc_request **);
+	int (*m_create)(struct obd_export *, struct md_op_data *,
+			const void *, int, int, __u32, __u32, cfs_cap_t,
+			__u64, struct ptlrpc_request **);
+	int (*m_done_writing)(struct obd_export *, struct md_op_data  *,
+			      struct md_open_data *);
+	int (*m_enqueue)(struct obd_export *, struct ldlm_enqueue_info *,
+			 struct lookup_intent *, struct md_op_data *,
+			 struct lustre_handle *, void *, int,
+			 struct ptlrpc_request **, __u64);
+	int (*m_getattr)(struct obd_export *, struct md_op_data *,
+			 struct ptlrpc_request **);
+	int (*m_getattr_name)(struct obd_export *, struct md_op_data *,
+			      struct ptlrpc_request **);
+	int (*m_intent_lock)(struct obd_export *, struct md_op_data *,
+			     void *, int, struct lookup_intent *, int,
+			     struct ptlrpc_request **,
+			     ldlm_blocking_callback, __u64);
+	int (*m_link)(struct obd_export *, struct md_op_data *,
+		      struct ptlrpc_request **);
+	int (*m_rename)(struct obd_export *, struct md_op_data *,
+			const char *, int, const char *, int,
+			struct ptlrpc_request **);
+	int (*m_is_subdir)(struct obd_export *, const struct lu_fid *,
+			   const struct lu_fid *,
+			   struct ptlrpc_request **);
+	int (*m_setattr)(struct obd_export *, struct md_op_data *, void *,
+			 int , void *, int, struct ptlrpc_request **,
+			 struct md_open_data **mod);
+	int (*m_sync)(struct obd_export *, const struct lu_fid *,
+		      struct obd_capa *, struct ptlrpc_request **);
+	int (*m_readpage)(struct obd_export *, struct md_op_data *,
+			  struct page **, struct ptlrpc_request **);
+
+	int (*m_unlink)(struct obd_export *, struct md_op_data *,
+			struct ptlrpc_request **);
+
+	int (*m_setxattr)(struct obd_export *, const struct lu_fid *,
+			  struct obd_capa *, obd_valid, const char *,
+			  const char *, int, int, int, __u32,
+			  struct ptlrpc_request **);
+
+	int (*m_getxattr)(struct obd_export *, const struct lu_fid *,
+			  struct obd_capa *, obd_valid, const char *,
+			  const char *, int, int, int,
+			  struct ptlrpc_request **);
+
+	int (*m_init_ea_size)(struct obd_export *, int, int, int);
+
+	int (*m_get_lustre_md)(struct obd_export *, struct ptlrpc_request *,
+			       struct obd_export *, struct obd_export *,
+			       struct lustre_md *);
+
+	int (*m_free_lustre_md)(struct obd_export *, struct lustre_md *);
+
+	int (*m_set_open_replay_data)(struct obd_export *,
+				      struct obd_client_handle *,
+				      struct ptlrpc_request *);
+	int (*m_clear_open_replay_data)(struct obd_export *,
+					struct obd_client_handle *);
+	int (*m_set_lock_data)(struct obd_export *, __u64 *, void *, __u64 *);
+
+	ldlm_mode_t (*m_lock_match)(struct obd_export *, __u64,
+				    const struct lu_fid *, ldlm_type_t,
+				    ldlm_policy_data_t *, ldlm_mode_t,
+				    struct lustre_handle *);
+
+	int (*m_cancel_unused)(struct obd_export *, const struct lu_fid *,
+			       ldlm_policy_data_t *, ldlm_mode_t,
+			       ldlm_cancel_flags_t flags, void *opaque);
+	int (*m_renew_capa)(struct obd_export *, struct obd_capa *oc,
+			    renew_capa_cb_t cb);
+	int (*m_unpack_capa)(struct obd_export *, struct ptlrpc_request *,
+			     const struct req_msg_field *, struct obd_capa **);
+
+	int (*m_get_remote_perm)(struct obd_export *, const struct lu_fid *,
+				 struct obd_capa *, __u32,
+				 struct ptlrpc_request **);
+
+	int (*m_intent_getattr_async)(struct obd_export *,
+				      struct md_enqueue_info *,
+				      struct ldlm_enqueue_info *);
+
+	int (*m_revalidate_lock)(struct obd_export *, struct lookup_intent *,
+				 struct lu_fid *, __u64 *bits);
+
+	/*
+	 * NOTE: If adding ops, add another LPROCFS_MD_OP_INIT() line to
+	 * lprocfs_alloc_md_stats() in obdclass/lprocfs_status.c. Also, add a
+	 * wrapper function in include/linux/obd_class.h.
+	 */
+};
+
+/*
+ * Operations on a lov_stripe_md that differ per layout magic.  Two instances
+ * are declared below (lsm_v1_ops, lsm_v3_ops); lsm_op_find() picks one from
+ * the magic number.
+ */
+struct lsm_operations {
+	void (*lsm_free)(struct lov_stripe_md *);
+	int (*lsm_destroy)(struct lov_stripe_md *, struct obdo *oa,
+			   struct obd_export *md_exp);
+	void (*lsm_stripe_by_index)(struct lov_stripe_md *, int *, obd_off *,
+				    obd_off *);
+	void (*lsm_stripe_by_offset)(struct lov_stripe_md *, int *, obd_off *,
+				     obd_off *);
+	int (*lsm_lmm_verify) (struct lov_mds_md *lmm, int lmm_bytes,
+			       __u16 *stripe_count);
+	int (*lsm_unpackmd) (struct lov_obd *lov, struct lov_stripe_md *lsm,
+			     struct lov_mds_md *lmm);
+};
+
+extern const struct lsm_operations lsm_v1_ops;
+extern const struct lsm_operations lsm_v3_ops;
+/**
+ * Look up the lsm_operations table matching a LOV layout magic.
+ *
+ * \param magic  layout magic number (LOV_MAGIC_V1 or LOV_MAGIC_V3)
+ *
+ * \retval &lsm_v1_ops / &lsm_v3_ops for the two recognised magics
+ * \retval NULL for any other value, after logging it with CERROR()
+ */
+static inline const struct lsm_operations *lsm_op_find(int magic)
+{
+	if (magic == LOV_MAGIC_V1)
+		return &lsm_v1_ops;
+
+	if (magic == LOV_MAGIC_V3)
+		return &lsm_v3_ops;
+
+	CERROR("Cannot recognize lsm_magic %08x\n", magic);
+	return NULL;
+}
+
+/* Requests for obd_extent_calc() */
+#define OBD_CALC_STRIPE_START   1
+#define OBD_CALC_STRIPE_END     2
+
+/* Accessor: return the oi_capa pointer stored in @oinfo (no NULL check). */
+static inline struct lustre_capa *oinfo_capa(struct obd_info *oinfo)
+{
+	struct lustre_capa *capa = oinfo->oi_capa;
+
+	return capa;
+}
+
+/**
+ * Allocate a md_open_data with OBD_ALLOC_PTR() and give the caller the
+ * initial reference (mod_refcount == 1).
+ *
+ * \retval NULL if the allocation failed
+ */
+static inline struct md_open_data *obd_mod_alloc(void)
+{
+	struct md_open_data *open_data;
+
+	OBD_ALLOC_PTR(open_data);
+	if (open_data != NULL)
+		atomic_set(&open_data->mod_refcount, 1);
+
+	return open_data;
+}
+
+/* Take one more reference on a md_open_data. */
+#define obd_mod_get(mod) atomic_inc(&(mod)->mod_refcount)
+
+/*
+ * Drop one reference on a md_open_data.  When the count reaches zero the
+ * open request (if any) is released with ptlrpc_req_finished() and the
+ * structure is freed with OBD_FREE_PTR().  @mod is evaluated several times,
+ * so it must be free of side effects.
+ */
+#define obd_mod_put(mod)						\
+({									\
+	if (atomic_dec_and_test(&(mod)->mod_refcount)) {		\
+		if ((mod)->mod_open_req)				\
+			ptlrpc_req_finished((mod)->mod_open_req);	\
+		OBD_FREE_PTR(mod);					\
+	}								\
+})
+
+void obdo_from_inode(struct obdo *dst, struct inode *src, obd_flag valid);
+void obdo_set_parent_fid(struct obdo *dst, const struct lu_fid *parent);
+
+/**
+ * Tell whether the client should resend a request.
+ *
+ * \param resend  value compared against the client's cl_resends setting
+ * \param cli     client whose cl_resends setting is consulted
+ *
+ * \retval 1 if cl_resends is zero, or if cl_resends is greater than \a resend
+ * \retval 0 otherwise
+ */
+static inline int client_should_resend(int resend, struct client_obd *cli)
+{
+	/* Sample the atomic once: reading it twice would let the zero test
+	 * and the comparison observe different values if cl_resends is
+	 * changed concurrently. */
+	int max_resends = atomic_read(&cli->cl_resends);
+
+	return max_resends ? max_resends > resend : 1;
+}
+
+/**
+ * Return the name of the obd_device behind a lu_device.
+ *
+ * XXX: lu_device is declared before obd_device but holds a pointer back to
+ * its obd_device, so this helper is defined here rather than in lu_object.h.
+ *
+ * \param lu_dev  device whose ld_obd back-pointer is followed (not checked
+ *		  for NULL)
+ */
+static inline const char *lu_dev_name(const struct lu_device *lu_dev)
+{
+	const struct obd_device *obd = lu_dev->ld_obd;
+
+	return obd->obd_name;
+}
+
+/**
+ * Test whether a file name denotes a volatile file and, optionally, extract
+ * the MDT index encoded in it.
+ *
+ * Expected layout when an index is requested: LUSTRE_VOLATILE_HDR:[idx]:
+ *
+ * \param name     file name; compared against LUSTRE_VOLATILE_HDR
+ * \param namelen  length of \a name, used only for the minimum-length check
+ * \param idx      if NULL, only the header is checked; otherwise receives the
+ *		   parsed index, or -1 when the index field is empty ("::")
+ *
+ * \retval true   header matches and (when \a idx is given) the index parsed
+ * \retval false  header mismatch, or malformed index (logged with CERROR();
+ *		  \a *idx may already have been overwritten in that case)
+ */
+static inline bool filename_is_volatile(const char *name, int namelen, int *idx)
+{
+	const char	*suffix;
+	const char	*digits;
+	char		*stop;
+
+	if (strncmp(name, LUSTRE_VOLATILE_HDR, LUSTRE_VOLATILE_HDR_LEN) != 0)
+		return false;
+
+	/* header matched and the caller has no use for the index */
+	if (idx == NULL)
+		return true;
+
+	/* everything after the fixed header */
+	suffix = name + LUSTRE_VOLATILE_HDR_LEN;
+
+	/* too short to hold even the two separators */
+	if (namelen < LUSTRE_VOLATILE_HDR_LEN + 2)
+		goto bad_format;
+
+	/* "::" -- no MDT index was specified */
+	if (suffix[0] == ':' && suffix[1] == ':') {
+		*idx = -1;
+		return true;
+	}
+
+	/* an index is present: it starts one character into the suffix */
+	digits = suffix + 1;
+	*idx = strtoul(digits, &stop, 0);
+
+	/* reject: no digit consumed, missing trailing ':', negative value */
+	if ((*idx == 0 && stop == digits) || *stop != ':' || *idx < 0)
+		goto bad_format;
+
+	return true;
+
+bad_format:
+	/* no way to report an error to the caller, so just log and let it
+	 * fall back to the hash algorithm */
+	CERROR("Bad volatile file name format: %s\n",
+	       suffix);
+	return false;
+}
+
+/*
+ * Return the client's cl_max_pages_per_rpc shifted left by PAGE_CACHE_SHIFT,
+ * i.e. the per-RPC page count converted to bytes.  @obd must not be NULL
+ * (enforced with LASSERT).
+ */
+static inline int cli_brw_size(struct obd_device *obd)
+{
+	LASSERT(obd != NULL);
+	return obd->u.cli.cl_max_pages_per_rpc << PAGE_CACHE_SHIFT;
+}
+
+#endif /* __OBD_H */

diff --git a/drivers/staging/lustre/lustre/include/obd_cache.h b/drivers/staging/lustre/lustre/include/obd_cache.h
new file mode 100644
index 0000000..c8249fb
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/obd_cache.h

@@ -0,0 +1,39 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _OBD_CACHE_H__
+#define _OBD_CACHE_H__
+
+
+#endif

diff --git a/drivers/staging/lustre/lustre/include/obd_cksum.h b/drivers/staging/lustre/lustre/include/obd_cksum.h
new file mode 100644
index 0000000..5f740f1
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/obd_cksum.h

@@ -0,0 +1,176 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __OBD_CKSUM
+#define __OBD_CKSUM
+#include <linux/libcfs/libcfs.h>
+#include <lustre/lustre_idl.h>
+
+/**
+ * Translate an OBD_CKSUM_* checksum type into the matching CFS_HASH_ALG_*
+ * identifier.
+ *
+ * Any type other than CRC32, ADLER or CRC32C is logged and hits LBUG();
+ * the trailing "return 0" is only reached if LBUG() returns.
+ */
+static inline unsigned char cksum_obd2cfs(cksum_type_t cksum_type)
+{
+	if (cksum_type == OBD_CKSUM_CRC32)
+		return CFS_HASH_ALG_CRC32;
+
+	if (cksum_type == OBD_CKSUM_ADLER)
+		return CFS_HASH_ALG_ADLER32;
+
+	if (cksum_type == OBD_CKSUM_CRC32C)
+		return CFS_HASH_ALG_CRC32C;
+
+	CERROR("Unknown checksum type (%x)!!!\n", cksum_type);
+	LBUG();
+	return 0;
+}
+
+/* Pick the OBD_FL_CKSUM_* flag for the fastest algorithm in a type mask.
+ *
+ * The OBD_FL_CKSUM_* flags are packed into 5 bits of o_flags, since there
+ * can only be a single checksum type per RPC.
+ *
+ * The OBD_CKSUM_* type bits passed in ocd_cksum_types are a 32-bit bitmask
+ * since they need to represent the full range of checksum algorithms that
+ * both the client and server can understand.
+ *
+ * Unsupported types/flags fall back to ADLER, because that is supported by
+ * all clients since 1.8.
+ *
+ * Candidates are examined in the order CRC32, CRC32C, ADLER and one only
+ * replaces the current choice when it is strictly faster, so on a tie the
+ * earlier candidate wins.  A non-empty mask containing none of the three
+ * known bits is reported with CWARN(). */
+static inline obd_flag cksum_type_pack(cksum_type_t cksum_type)
+{
+	obd_flag	best_flag = OBD_FL_CKSUM_ADLER;
+	unsigned int    best_speed = 0;
+	unsigned int    speed;
+
+	if (cksum_type & OBD_CKSUM_CRC32) {
+		speed = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32));
+		if (speed > best_speed) {
+			best_flag = OBD_FL_CKSUM_CRC32;
+			best_speed = speed;
+		}
+	}
+
+	if (cksum_type & OBD_CKSUM_CRC32C) {
+		speed = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C));
+		if (speed > best_speed) {
+			best_flag = OBD_FL_CKSUM_CRC32C;
+			best_speed = speed;
+		}
+	}
+
+	if (cksum_type & OBD_CKSUM_ADLER) {
+		speed = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER));
+		if (speed > best_speed) {
+			best_flag = OBD_FL_CKSUM_ADLER;
+			best_speed = speed;
+		}
+	}
+
+	if (unlikely(cksum_type && !(cksum_type & (OBD_CKSUM_CRC32C |
+						   OBD_CKSUM_CRC32 |
+						   OBD_CKSUM_ADLER))))
+		CWARN("unknown cksum type %x\n", cksum_type);
+
+	return best_flag;
+}
+
+/* Inverse of cksum_type_pack(): map the checksum bits of o_flags (masked
+ * with OBD_FL_CKSUM_ALL) back to an OBD_CKSUM_* type.  Anything that is not
+ * exactly the CRC32C or CRC32 flag yields OBD_CKSUM_ADLER. */
+static inline cksum_type_t cksum_type_unpack(obd_flag o_flags)
+{
+	obd_flag cksum_bits = o_flags & OBD_FL_CKSUM_ALL;
+
+	if (cksum_bits == OBD_FL_CKSUM_CRC32C)
+		return OBD_CKSUM_CRC32C;
+
+	if (cksum_bits == OBD_FL_CKSUM_CRC32)
+		return OBD_CKSUM_CRC32;
+
+	return OBD_CKSUM_ADLER;
+}
+
+/* Return a bitmask of the checksum types supported on this system.
+ * ADLER is always included: it has been supported since 1.8 and does not
+ * depend on hardware.  The client additionally advertises every local
+ * algorithm whose measured speed is greater than zero.
+ */
+static inline cksum_type_t cksum_types_supported_client(void)
+{
+	cksum_type_t types = OBD_CKSUM_ADLER;
+
+	CDEBUG(D_INFO, "Crypto hash speed: crc %d, crc32c %d, adler %d\n",
+	       cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)),
+	       cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)),
+	       cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER)));
+
+	if (0 < cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)))
+		types |= OBD_CKSUM_CRC32C;
+
+	if (0 < cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)))
+		types |= OBD_CKSUM_CRC32;
+
+	return types;
+}
+
+/* Return the checksum types a server advertises: ADLER always, plus CRC32C
+ * and CRC32 when their measured speed is at least half of the ADLER speed
+ * (integer division). */
+static inline cksum_type_t cksum_types_supported_server(void)
+{
+	cksum_type_t    types = OBD_CKSUM_ADLER;
+	int		threshold;
+
+	CDEBUG(D_INFO, "Crypto hash speed: crc %d, crc32c %d, adler %d\n",
+	       cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)),
+	       cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)),
+	       cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER)));
+
+	threshold = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER)) / 2;
+
+	if (cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)) >= threshold)
+		types |= OBD_CKSUM_CRC32C;
+
+	if (cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)) >= threshold)
+		types |= OBD_CKSUM_CRC32;
+
+	return types;
+}
+
+
+/* Reduce a mask of checksum types to the single best one.
+ *
+ * This is a pack/unpack round trip: cksum_type_pack() chooses the flag of
+ * the fastest algorithm in the mask (speeds come from the benchmarking done
+ * at libcfs module load) and cksum_type_unpack() converts that flag back to
+ * an OBD_CKSUM_* type.
+ *
+ * Caution is advised, however, since what is fastest on a single client may
+ * not be the fastest or most efficient algorithm on the server.  */
+static inline cksum_type_t cksum_type_select(cksum_type_t cksum_types)
+{
+	obd_flag best_flag = cksum_type_pack(cksum_types);
+
+	return cksum_type_unpack(best_flag);
+}
+
+/* Checksum algorithm names. Must be defined in the same order as the
+ * OBD_CKSUM_* flags. */
+#define DECLARE_CKSUM_NAME char *cksum_name[] = {"crc32", "adler", "crc32c"}
+
+#endif /* __OBD_CKSUM */

diff --git a/drivers/staging/lustre/lustre/include/obd_class.h b/drivers/staging/lustre/lustre/include/obd_class.h
new file mode 100644
index 0000000..de5c585
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/obd_class.h

@@ -0,0 +1,2281 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+#ifndef __CLASS_OBD_H
+#define __CLASS_OBD_H
+
+
+#include <obd_support.h>
+#include <lustre_import.h>
+#include <lustre_net.h>
+#include <obd.h>
+#include <lustre_lib.h>
+#include <lustre/lustre_idl.h>
+#include <lprocfs_status.h>
+
+#include <linux/obd_class.h>
+
+#define OBD_STATFS_NODELAY      0x0001  /* requests should be sent without delay
+					 * and resends, to avoid deadlocks */
+#define OBD_STATFS_FROM_CACHE   0x0002  /* the statfs callback should not update
+					 * obd_osfs_age */
+#define OBD_STATFS_PTLRPCD      0x0004  /* requests will be sent via ptlrpcd
+					 * instead of a specific set. This
+					 * means that we cannot rely on the set
+					 * interpret routine to be called.
+					 * lov_statfs_fini() must thus be called
+					 * by the request interpret routine */
+#define OBD_STATFS_FOR_MDT0	0x0008	/* The statfs is only for retrieving
+					 * information from MDT0. */
+#define OBD_FL_PUNCH    0x00000001      /* Indicates a punch operation */
+
+/* OBD Device Declarations */
+extern struct obd_device *obd_devs[MAX_OBD_DEVICES];
+extern rwlock_t obd_dev_lock;
+
+/* OBD Operations Declarations */
+extern struct obd_device *class_conn2obd(struct lustre_handle *);
+extern struct obd_device *class_exp2obd(struct obd_export *);
+extern int class_handle_ioctl(unsigned int cmd, unsigned long arg);
+extern int lustre_get_jobid(char *jobid);
+
+struct lu_device_type;
+
+/* genops.c */
+struct obd_export *class_conn2export(struct lustre_handle *);
+int class_register_type(struct obd_ops *, struct md_ops *,
+			struct lprocfs_vars *, const char *nm,
+			struct lu_device_type *ldt);
+int class_unregister_type(const char *nm);
+
+struct obd_device *class_newdev(const char *type_name, const char *name);
+void class_release_dev(struct obd_device *obd);
+
+int class_name2dev(const char *name);
+struct obd_device *class_name2obd(const char *name);
+int class_uuid2dev(struct obd_uuid *uuid);
+struct obd_device *class_uuid2obd(struct obd_uuid *uuid);
+void class_obd_list(void);
+struct obd_device * class_find_client_obd(struct obd_uuid *tgt_uuid,
+					  const char * typ_name,
+					  struct obd_uuid *grp_uuid);
+struct obd_device * class_devices_in_group(struct obd_uuid *grp_uuid,
+					   int *next);
+struct obd_device * class_num2obd(int num);
+int get_devices_count(void);
+
+int class_notify_sptlrpc_conf(const char *fsname, int namelen);
+
+char *obd_export_nid2str(struct obd_export *exp);
+
+int obd_export_evict_by_nid(struct obd_device *obd, const char *nid);
+int obd_export_evict_by_uuid(struct obd_device *obd, const char *uuid);
+int obd_connect_flags2str(char *page, int count, __u64 flags, char *sep);
+
+int obd_zombie_impexp_init(void);
+void obd_zombie_impexp_stop(void);
+void obd_zombie_impexp_cull(void);
+void obd_zombie_barrier(void);
+void obd_exports_barrier(struct obd_device *obd);
+int kuc_len(int payload_len);
+struct kuc_hdr * kuc_ptr(void *p);
+int kuc_ispayload(void *p);
+void *kuc_alloc(int payload_len, int transport, int type);
+void kuc_free(void *p, int payload_len);
+
+struct llog_handle;
+struct llog_rec_hdr;
+typedef int (*llog_cb_t)(const struct lu_env *, struct llog_handle *,
+			 struct llog_rec_hdr *, void *);
+/* obd_config.c */
+struct lustre_cfg *lustre_cfg_rename(struct lustre_cfg *cfg,
+				     const char *new_name);
+int class_process_config(struct lustre_cfg *lcfg);
+int class_process_proc_param(char *prefix, struct lprocfs_vars *lvars,
+			     struct lustre_cfg *lcfg, void *data);
+int class_attach(struct lustre_cfg *lcfg);
+int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg);
+int class_cleanup(struct obd_device *obd, struct lustre_cfg *lcfg);
+int class_detach(struct obd_device *obd, struct lustre_cfg *lcfg);
+struct obd_device *class_incref(struct obd_device *obd,
+				const char *scope, const void *source);
+void class_decref(struct obd_device *obd,
+		  const char *scope, const void *source);
+void dump_exports(struct obd_device *obd, int locks);
+int class_config_llog_handler(const struct lu_env *env,
+			      struct llog_handle *handle,
+			      struct llog_rec_hdr *rec, void *data);
+int class_add_conn(struct obd_device *obd, struct lustre_cfg *lcfg);
+int class_add_uuid(const char *uuid, __u64 nid);
+
+/*obdecho*/
+#ifdef LPROCFS
+extern void lprocfs_echo_init_vars(struct lprocfs_static_vars *lvars);
+#else
+/* Stub used when LPROCFS is not defined: zero-fill *lvars so the caller
+ * sees no variables. */
+static inline void lprocfs_echo_init_vars(struct lprocfs_static_vars *lvars)
+{
+	memset(lvars, 0, sizeof(*lvars));
+}
+#endif
+
+#define CFG_F_START     0x01   /* Set when we start updating from a log */
+#define CFG_F_MARKER    0x02   /* We are within a marker */
+#define CFG_F_SKIP      0x04   /* We should ignore this cfg command */
+#define CFG_F_COMPAT146 0x08   /* Allow old-style logs */
+#define CFG_F_EXCLUDE   0x10   /* OST exclusion list */
+
+/* Passed as the data parameter to class_config_parse_llog() and
+ * class_config_dump_llog(), and embedded in config_llog_data as cld_cfg. */
+struct config_llog_instance {
+	char	       *cfg_obdname;
+	void	       *cfg_instance;
+	struct super_block *cfg_sb;
+	struct obd_uuid     cfg_uuid;
+	llog_cb_t	    cfg_callback;
+	int		 cfg_last_idx; /* for partial llog processing */
+	int		 cfg_flags;	/* presumably CFG_F_* bits -- confirm */
+};
+int class_config_parse_llog(const struct lu_env *env, struct llog_ctxt *ctxt,
+			    char *name, struct config_llog_instance *cfg);
+int class_config_dump_llog(const struct lu_env *env, struct llog_ctxt *ctxt,
+			   char *name, struct config_llog_instance *cfg);
+
+/*
+ * Configuration log kinds; CONFIG_T_MAX is the number of kinds.
+ * NOTE(review): presumably the values stored in config_llog_data::cld_type
+ * -- confirm against the users of that field.
+ */
+enum {
+	CONFIG_T_CONFIG  = 0,
+	CONFIG_T_SPTLRPC = 1,
+	CONFIG_T_RECOVER = 2,
+	CONFIG_T_MAX     = 3
+};
+
+/* One entry in the list of active configuration logs.  The log name is
+ * stored inline in the trailing cld_logname[] array, so the structure has
+ * to be allocated with room for the name. */
+struct config_llog_data {
+	struct ldlm_res_id	  cld_resid;
+	struct config_llog_instance cld_cfg;
+	struct list_head		  cld_list_chain;
+	atomic_t		cld_refcount;
+	struct config_llog_data    *cld_sptlrpc;/* dependent sptlrpc log */
+	struct config_llog_data    *cld_recover;    /* imperative recover log */
+	struct obd_export	  *cld_mgcexp;
+	struct mutex		    cld_lock;
+	int			 cld_type;
+	unsigned int		cld_stopping:1, /* we were told to stop
+						     * watching */
+				    cld_lostlock:1; /* lock not requeued */
+	char			cld_logname[0];
+};
+
+/*
+ * A named profile, looked up with class_get_profile() and removed with
+ * class_del_profile()/class_del_profiles().
+ * NOTE(review): lp_dt and lp_md presumably name the data and metadata
+ * devices of the profile -- confirm.
+ */
+struct lustre_profile {
+	struct list_head       lp_list;
+	char	    *lp_profile;
+	char	    *lp_dt;
+	char	    *lp_md;
+};
+
+struct lustre_profile *class_get_profile(const char * prof);
+void class_del_profile(const char *prof);
+void class_del_profiles(void);
+
+#if LUSTRE_TRACKS_LOCK_EXP_REFS
+
+void __class_export_add_lock_ref(struct obd_export *, struct ldlm_lock *);
+void __class_export_del_lock_ref(struct obd_export *, struct ldlm_lock *);
+extern void (*class_export_dump_hook)(struct obd_export *);
+
+#else
+
+#define __class_export_add_lock_ref(exp, lock)	     do {} while(0)
+#define __class_export_del_lock_ref(exp, lock)	     do {} while(0)
+
+#endif
+
+/* Increment exp_rpc_count of an export and log the new value at D_INFO.
+ * @exp is evaluated more than once. */
+#define class_export_rpc_inc(exp)					\
+({									\
+	atomic_inc(&(exp)->exp_rpc_count);				\
+	CDEBUG(D_INFO, "RPC GETting export %p : new rpc_count %d\n",	\
+	       (exp), atomic_read(&(exp)->exp_rpc_count));		\
+})
+
+/* Decrement exp_rpc_count of an export and log the new value at D_INFO.
+ * The counter is first checked with LASSERT_ATOMIC_POS().  @exp is
+ * parenthesized at every use so that any pointer-valued expression can be
+ * passed; it is evaluated more than once. */
+#define class_export_rpc_dec(exp)					\
+({									\
+	LASSERT_ATOMIC_POS(&(exp)->exp_rpc_count);			\
+	atomic_dec(&(exp)->exp_rpc_count);				\
+	CDEBUG(D_INFO, "RPC PUTting export %p : new rpc_count %d\n",	\
+	       (exp), atomic_read(&(exp)->exp_rpc_count));		\
+})
+
+/* Account one more lock on an export: bump exp_locks_count, record the lock
+ * via __class_export_add_lock_ref() (a no-op unless
+ * LUSTRE_TRACKS_LOCK_EXP_REFS), log at D_INFO, then take an export
+ * reference.  The macro evaluates to the result of class_export_get(). */
+#define class_export_lock_get(exp, lock)				\
+({									\
+	atomic_inc(&(exp)->exp_locks_count);				\
+	__class_export_add_lock_ref(exp, lock);				\
+	CDEBUG(D_INFO, "lock GETting export %p : new locks_count %d\n",	\
+	       (exp), atomic_read(&(exp)->exp_locks_count));		\
+	class_export_get(exp);						\
+})
+
+/* Undo class_export_lock_get(): check the counter with
+ * LASSERT_ATOMIC_POS(), drop exp_locks_count, forget the lock via
+ * __class_export_del_lock_ref(), log at D_INFO, then release the export
+ * reference.  @exp is parenthesized at every member access so that any
+ * pointer-valued expression can be passed; it is evaluated more than once. */
+#define class_export_lock_put(exp, lock)				\
+({									\
+	LASSERT_ATOMIC_POS(&(exp)->exp_locks_count);			\
+	atomic_dec(&(exp)->exp_locks_count);				\
+	__class_export_del_lock_ref(exp, lock);				\
+	CDEBUG(D_INFO, "lock PUTting export %p : new locks_count %d\n",	\
+	       (exp), atomic_read(&(exp)->exp_locks_count));		\
+	class_export_put(exp);						\
+})
+
+/* Account one more callback on an export: bump exp_cb_count, log at D_INFO,
+ * then take an export reference.  The macro evaluates to the result of
+ * class_export_get(). */
+#define class_export_cb_get(exp)					\
+({									\
+	atomic_inc(&(exp)->exp_cb_count);				\
+	CDEBUG(D_INFO, "callback GETting export %p : new cb_count %d\n",\
+	       (exp), atomic_read(&(exp)->exp_cb_count));		\
+	class_export_get(exp);						\
+})
+
+/* Undo class_export_cb_get(): check the counter with LASSERT_ATOMIC_POS(),
+ * drop exp_cb_count, log at D_INFO, then release the export reference.
+ * @exp is parenthesized at every member access so that any pointer-valued
+ * expression can be passed; it is evaluated more than once. */
+#define class_export_cb_put(exp)					\
+({									\
+	LASSERT_ATOMIC_POS(&(exp)->exp_cb_count);			\
+	atomic_dec(&(exp)->exp_cb_count);				\
+	CDEBUG(D_INFO, "callback PUTting export %p : new cb_count %d\n",\
+	       (exp), atomic_read(&(exp)->exp_cb_count));		\
+	class_export_put(exp);						\
+})
+
+/* genops.c */
+struct obd_export *class_export_get(struct obd_export *exp);
+void class_export_put(struct obd_export *exp);
+struct obd_export *class_new_export(struct obd_device *obddev,
+				    struct obd_uuid *cluuid);
+void class_unlink_export(struct obd_export *exp);
+
+struct obd_import *class_import_get(struct obd_import *);
+void class_import_put(struct obd_import *);
+struct obd_import *class_new_import(struct obd_device *obd);
+void class_destroy_import(struct obd_import *exp);
+
+struct obd_type *class_search_type(const char *name);
+struct obd_type *class_get_type(const char *name);
+void class_put_type(struct obd_type *type);
+int class_connect(struct lustre_handle *conn, struct obd_device *obd,
+		  struct obd_uuid *cluuid);
+int class_disconnect(struct obd_export *exp);
+void class_fail_export(struct obd_export *exp);
+int class_connected_export(struct obd_export *exp);
+void class_disconnect_exports(struct obd_device *obddev);
+int class_manual_cleanup(struct obd_device *obd);
+void class_disconnect_stale_exports(struct obd_device *,
+				    int (*test_export)(struct obd_export *));
+/* Translate the obd_fail, obd_force and obd_abort_recovery bits of @obd into
+ * the matching OBD_OPT_* flags, OR-ed together (0 when none is set). */
+static inline enum obd_option exp_flags_from_obd(struct obd_device *obd)
+{
+	int opts = 0;
+
+	if (obd->obd_fail)
+		opts |= OBD_OPT_FAILOVER;
+	if (obd->obd_force)
+		opts |= OBD_OPT_FORCE;
+	if (obd->obd_abort_recovery)
+		opts |= OBD_OPT_ABORT_RECOV;
+
+	return opts;
+}
+
+
+void obdo_cpy_md(struct obdo *dst, struct obdo *src, obd_flag valid);
+void obdo_to_ioobj(struct obdo *oa, struct obd_ioobj *ioobj);
+void obdo_from_iattr(struct obdo *oa, struct iattr *attr,
+		     unsigned int ia_valid);
+void iattr_from_obdo(struct iattr *attr, struct obdo *oa, obd_flag valid);
+void md_from_obdo(struct md_op_data *op_data, struct obdo *oa, obd_flag valid);
+void obdo_from_md(struct obdo *oa, struct md_op_data *op_data,
+		  unsigned int valid);
+
+void obdo_cpu_to_le(struct obdo *dobdo, struct obdo *sobdo);
+void obdo_le_to_cpu(struct obdo *dobdo, struct obdo *sobdo);
+
+/*
+ * Shorthand accessors:
+ *   OBT(dev)        - the device's obd_type pointer
+ *   OBP(dev, op)    - method o_<op> in the type's typ_dt_ops table
+ *   MDP(dev, op)    - method m_<op> in the type's typ_md_ops table
+ *   CTXTP(ctxt, op) - method lop_<op> in the context's loc_logops table
+ * None of them checks the intermediate pointers for NULL.
+ */
+#define OBT(dev)	(dev)->obd_type
+#define OBP(dev, op)    (dev)->obd_type->typ_dt_ops->o_ ## op
+#define MDP(dev, op)    (dev)->obd_type->typ_md_ops->m_ ## op
+#define CTXTP(ctxt, op) (ctxt)->loc_logops->lop_##op
+
+/* Make the calling function RETURN(-ENODEV) when @obd is NULL.  Only the
+ * pointer is tested; the set-up/stopping state is not examined, so this
+ * variant remains usable on cleanup paths that run while the obd is
+ * stopping. */
+#define OBD_CHECK_DEV(obd)						\
+do {									\
+	if (!(obd)) {							\
+		CERROR("NULL device\n");				\
+		RETURN(-ENODEV);					\
+	}								\
+} while (0)
+
+/* Like OBD_CHECK_DEV(), and additionally make the calling function
+ * RETURN(-ENODEV) unless the device is set up (obd_set_up) and not
+ * stopping (!obd_stopping). */
+#define OBD_CHECK_DEV_ACTIVE(obd)					\
+do {									\
+	OBD_CHECK_DEV(obd);						\
+	if (!(obd)->obd_set_up || (obd)->obd_stopping) {		\
+		CERROR("Device %d not setup\n",				\
+		       (obd)->obd_minor);				\
+		RETURN(-ENODEV);					\
+	}								\
+} while (0)
+
+
+#ifdef LPROCFS
+/* Index of method o_<op> counted from o_iocontrol: the byte distance
+ * between the two members of struct obd_ops divided by the size of one
+ * method pointer. */
+#define OBD_COUNTER_OFFSET(op)						\
+	((offsetof(struct obd_ops, o_ ## op) -				\
+	  offsetof(struct obd_ops, o_iocontrol))			\
+	 / sizeof(((struct obd_ops *)(0))->o_iocontrol))
+
+/* Bump the per-device statistics counter of method <op>, if the device has
+ * obd_stats allocated.  The counter index is obd_cntr_base plus
+ * OBD_COUNTER_OFFSET(op) and is asserted to be below ls_num.
+ * Expands to a bare if-block, not a do/while(0) statement. */
+#define OBD_COUNTER_INCREMENT(obdx, op)					\
+	if ((obdx)->obd_stats != NULL) {				\
+		unsigned int coffset;					\
+		coffset = (unsigned int)((obdx)->obd_cntr_base) +	\
+			OBD_COUNTER_OFFSET(op);				\
+		LASSERT(coffset < (obdx)->obd_stats->ls_num);		\
+		lprocfs_counter_incr((obdx)->obd_stats, coffset);	\
+	}
+
+/* Export flavour of OBD_COUNTER_INCREMENT(): bump the counter of method
+ * <op> in the obd_stats of the export's device and, when the export has
+ * per-NID statistics (exp_nid_stats and its nid_stats both non-NULL), in
+ * those as well.  Nothing is counted if the device has no obd_stats.
+ * Expands to a bare if-block, not a do/while(0) statement. */
+#define EXP_COUNTER_INCREMENT(export, op)				     \
+	if ((export)->exp_obd->obd_stats != NULL) {			     \
+		unsigned int coffset;					     \
+		coffset = (unsigned int)((export)->exp_obd->obd_cntr_base) + \
+			OBD_COUNTER_OFFSET(op);				     \
+		LASSERT(coffset < (export)->exp_obd->obd_stats->ls_num);     \
+		lprocfs_counter_incr((export)->exp_obd->obd_stats, coffset); \
+		if ((export)->exp_nid_stats != NULL &&			     \
+		    (export)->exp_nid_stats->nid_stats != NULL)		     \
+			lprocfs_counter_incr(				     \
+				(export)->exp_nid_stats->nid_stats, coffset);\
+	}
+
+/* Index of method m_<op> counted from m_getstatus: the byte distance
+ * between the two members of struct md_ops divided by the size of one
+ * method pointer. */
+#define MD_COUNTER_OFFSET(op)						\
+	((offsetof(struct md_ops, m_ ## op) -				\
+	  offsetof(struct md_ops, m_getstatus))				\
+	 / sizeof(((struct md_ops *)(0))->m_getstatus))
+
+/* Bump the per-device metadata statistics counter of method <op>, if the
+ * device has md_stats allocated.  The counter index is md_cntr_base plus
+ * MD_COUNTER_OFFSET(op) and is asserted to be below ls_num.
+ * The guard tests the macro's own argument (obdx); it must not rely on a
+ * variable that happens to be called "obd" at the expansion site.
+ * Expands to a bare if-block, not a do/while(0) statement. */
+#define MD_COUNTER_INCREMENT(obdx, op)					\
+	if ((obdx)->md_stats != NULL) {					\
+		unsigned int coffset;					\
+		coffset = (unsigned int)((obdx)->md_cntr_base) +	\
+			MD_COUNTER_OFFSET(op);				\
+		LASSERT(coffset < (obdx)->md_stats->ls_num);		\
+		lprocfs_counter_incr((obdx)->md_stats, coffset);	\
+	}
+
+/* Increment the md_stats counter for @op on the device behind @export, and
+ * the same index in the export's own exp_md_stats when present.
+ * The guard tests md_stats, the table that is actually dereferenced below;
+ * guarding on obd_stats allowed a NULL md_stats to be dereferenced. */
+#define EXP_MD_COUNTER_INCREMENT(export, op)				 \
+	if ((export)->exp_obd->md_stats != NULL) {			   \
+		unsigned int coffset;					\
+		coffset = (unsigned int)((export)->exp_obd->md_cntr_base) +  \
+			MD_COUNTER_OFFSET(op);			       \
+		LASSERT(coffset < (export)->exp_obd->md_stats->ls_num);      \
+		lprocfs_counter_incr((export)->exp_obd->md_stats, coffset);  \
+		if ((export)->exp_md_stats != NULL)			  \
+			lprocfs_counter_incr(				\
+				(export)->exp_md_stats, coffset);	    \
+	}
+
+#else
+/* Without LPROCFS there are no stats tables: every counter macro expands to
+ * nothing, so an invocation followed by ';' is an empty statement. */
+#define OBD_COUNTER_OFFSET(op)
+#define OBD_COUNTER_INCREMENT(obd, op)
+#define EXP_COUNTER_INCREMENT(exp, op)
+#define MD_COUNTER_INCREMENT(obd, op)
+#define EXP_MD_COUNTER_INCREMENT(exp, op)
+#endif
+
+/* Allocate the ldlm counters of @tmp, initialise them and register them as
+ * "ldlm_stats" under tmp->nid_proc.
+ * Returns -ENOMEM when the allocation fails, otherwise the result of
+ * lprocfs_register_stats(). */
+static inline int lprocfs_nid_ldlm_stats_init(struct nid_stat *tmp)
+{
+	int rc;
+
+	/* ldlm_stats are added unconditionally */
+	tmp->nid_ldlm_stats = lprocfs_alloc_stats(LDLM_LAST_OPC - LDLM_FIRST_OPC,
+						  LPROCFS_STATS_FLAG_NOPERCPU);
+	if (!tmp->nid_ldlm_stats)
+		return -ENOMEM;
+
+	lprocfs_init_ldlm_stats(tmp->nid_ldlm_stats);
+
+	rc = lprocfs_register_stats(tmp->nid_proc, "ldlm_stats",
+				    tmp->nid_ldlm_stats);
+	return rc;
+}
+
+/* Make the calling function RETURN(@err) when @obd has no type or no md
+ * method @op.  An error is logged only when @err is non-zero.
+ * @obd is parenthesised everywhere so that any pointer expression may be
+ * passed. */
+#define OBD_CHECK_MD_OP(obd, op, err)			   \
+do {							    \
+	if (!OBT(obd) || !MDP((obd), op)) {		     \
+		if (err)					\
+			CERROR("md_" #op ": dev %s/%d no operation\n", \
+			       (obd)->obd_name, (obd)->obd_minor);  \
+		RETURN(err);				    \
+	}						       \
+} while (0)
+
+/* Validate @exp for md method @op, making the calling function RETURN:
+ *   -ENODEV      if @exp is NULL,
+ *   -EOPNOTSUPP  if the export has no device or the device has no type,
+ *   -EOPNOTSUPP  if the device's md operations lack @op.
+ * Messages use the "md_" prefix, matching OBD_CHECK_MD_OP, so the log names
+ * the md operation that was attempted.  The device type is tested once; the
+ * second test makes a repeat in the third redundant. */
+#define EXP_CHECK_MD_OP(exp, op)				\
+do {							    \
+	if ((exp) == NULL) {				    \
+		CERROR("md_" #op ": NULL export\n");	    \
+		RETURN(-ENODEV);				\
+	}						       \
+	if ((exp)->exp_obd == NULL || !OBT((exp)->exp_obd)) {   \
+		CERROR("md_" #op ": cleaned up obd\n");	 \
+		RETURN(-EOPNOTSUPP);			    \
+	}						       \
+	if (!MDP((exp)->exp_obd, op)) {			 \
+		CERROR("md_" #op ": dev %s/%d no operation\n",  \
+		       (exp)->exp_obd->obd_name,		\
+		       (exp)->exp_obd->obd_minor);	      \
+		RETURN(-EOPNOTSUPP);			    \
+	}						       \
+} while (0)
+
+
+/* Make the calling function RETURN(@err) when @obd has no type or no obd
+ * method @op.  An error is logged only when @err is non-zero, so callers
+ * passing 0 (or NULL) treat a missing method as a silent no-op.
+ * @obd is parenthesised everywhere so that expressions such as
+ * exp->exp_obd expand safely. */
+#define OBD_CHECK_DT_OP(obd, op, err)			   \
+do {							    \
+	if (!OBT(obd) || !OBP((obd), op)) {		     \
+		if (err)					\
+			CERROR("obd_" #op ": dev %d no operation\n",    \
+			       (obd)->obd_minor);		 \
+		RETURN(err);				    \
+	}						       \
+} while (0)
+
+/* Validate @exp for obd method @op, making the calling function RETURN:
+ *   -ENODEV      if @exp is NULL,
+ *   -EOPNOTSUPP  if the export has no device or the device has no type,
+ *   -EOPNOTSUPP  if the device lacks method @op.
+ * Each failure is logged with CERROR.
+ * NOTE(review): the !OBT() test in the third condition repeats the one in
+ * the second and cannot be true there. */
+#define EXP_CHECK_DT_OP(exp, op)				\
+do {							    \
+	if ((exp) == NULL) {				    \
+		CERROR("obd_" #op ": NULL export\n");	   \
+		RETURN(-ENODEV);				\
+	}						       \
+	if ((exp)->exp_obd == NULL || !OBT((exp)->exp_obd)) {   \
+		CERROR("obd_" #op ": cleaned up obd\n");	\
+		RETURN(-EOPNOTSUPP);			    \
+	}						       \
+	if (!OBT((exp)->exp_obd) || !OBP((exp)->exp_obd, op)) { \
+		CERROR("obd_" #op ": dev %d no operation\n",    \
+		       (exp)->exp_obd->obd_minor);	      \
+		RETURN(-EOPNOTSUPP);			    \
+	}						       \
+} while (0)
+
+/* Make the calling function RETURN(@err) when the device of llog context
+ * @ctxt has no type or the context lacks method @op.  An error is logged
+ * only when @err is non-zero.
+ * @ctxt is parenthesised everywhere so that any pointer expression may be
+ * passed. */
+#define CTXT_CHECK_OP(ctxt, op, err)				 \
+do {								 \
+	if (!OBT((ctxt)->loc_obd) || !CTXTP((ctxt), op)) {	   \
+		if (err)					     \
+			CERROR("lop_" #op ": dev %d no operation\n", \
+			       (ctxt)->loc_obd->obd_minor);	  \
+		RETURN(err);					 \
+	}							    \
+} while (0)
+
+/* Return MAX_OBD_DEVICES.
+ * NOTE(review): presumably the exclusive upper bound for obd device
+ * numbers - confirm against the device table code. */
+static inline int class_devno_max(void)
+{
+	return MAX_OBD_DEVICES;
+}
+
+/* Call the get_info method of the device behind @exp and return its result.
+ * Fails beforehand with -ENODEV (NULL @exp) or -EOPNOTSUPP (no device or no
+ * such method), see EXP_CHECK_DT_OP. */
+static inline int obd_get_info(const struct lu_env *env,
+			       struct obd_export *exp, __u32 keylen,
+			       void *key, __u32 *vallen, void *val,
+			       struct lov_stripe_md *lsm)
+{
+	int rc;
+	ENTRY;
+
+	EXP_CHECK_DT_OP(exp, get_info);
+	EXP_COUNTER_INCREMENT(exp, get_info);
+
+	rc = OBP(exp->exp_obd, get_info)(env, exp, keylen, key, vallen, val,
+					 lsm);
+	RETURN(rc);
+}
+
+/* Call the set_info_async method of the device behind @exp, passing @set
+ * through, and return its result.  Fails beforehand with -ENODEV or
+ * -EOPNOTSUPP, see EXP_CHECK_DT_OP. */
+static inline int obd_set_info_async(const struct lu_env *env,
+				     struct obd_export *exp, obd_count keylen,
+				     void *key, obd_count vallen, void *val,
+				     struct ptlrpc_request_set *set)
+{
+	int rc;
+	ENTRY;
+
+	EXP_CHECK_DT_OP(exp, set_info_async);
+	EXP_COUNTER_INCREMENT(exp, set_info_async);
+
+	rc = OBP(exp->exp_obd, set_info_async)(env, exp, keylen, key, vallen,
+					       val, set);
+	RETURN(rc);
+}
+
+/*
+ * obd-lu integration.
+ *
+ * Functionality is being moved into new lu_device-based layering, but some
+ * pieces of configuration process are still based on obd devices.
+ *
+ * Specifically, lu_device_type_operations::ldto_device_alloc() methods fully
+ * subsume ->o_setup() methods of obd devices they replace. The same for
+ * lu_device_operations::ldo_process_config() and ->o_process_config(). As a
+ * result, obd_setup() and obd_process_config() branch and call one XOR
+ * another.
+ *
+ * Yet neither lu_device_type_operations::ldto_device_fini() nor
+ * lu_device_type_operations::ldto_device_free() fully implement the
+ * functionality of ->o_precleanup() and ->o_cleanup() they override. Hence,
+ * obd_precleanup() and obd_cleanup() call both lu_device and obd operations.
+ */
+
+/* Declare two locals: a struct lu_device_type pointer named @ldt and a
+ * struct lu_device pointer named @d.  The expansion has no trailing
+ * semicolon; the user supplies it. */
+#define DECLARE_LU_VARS(ldt, d)		 \
+	struct lu_device_type *ldt;       \
+	struct lu_device *d
+
+/* Set up @obd from @cfg.
+ *
+ * When the obd type has an lu_device_type, a device is allocated through
+ * ldto_device_alloc() inside a temporary session context and env, and linked
+ * to @obd in both directions.  Otherwise the legacy setup method is called
+ * (-EOPNOTSUPP if there is none).
+ *
+ * Returns 0 on success; otherwise the error from lu_env_init(), the PTR_ERR()
+ * of the allocation result, or the setup method's result.
+ * @obd is dereferenced without a NULL check. */
+static inline int obd_setup(struct obd_device *obd, struct lustre_cfg *cfg)
+{
+	int rc;
+	DECLARE_LU_VARS(ldt, d);
+	ENTRY;
+
+	ldt = obd->obd_type->typ_lu;
+	if (ldt != NULL) {
+		struct lu_context  session_ctx;
+		struct lu_env env;
+		/* NOTE(review): any result of lu_context_init() is ignored
+		 * here - confirm it cannot fail for LCT_SESSION. */
+		lu_context_init(&session_ctx, LCT_SESSION);
+		session_ctx.lc_thread = NULL;
+		lu_context_enter(&session_ctx);
+
+		rc = lu_env_init(&env, ldt->ldt_ctx_tags);
+		if (rc == 0) {
+			env.le_ses = &session_ctx;
+			d = ldt->ldt_ops->ldto_device_alloc(&env, ldt, cfg);
+			/* the env is only needed for the allocation call */
+			lu_env_fini(&env);
+			if (!IS_ERR(d)) {
+				obd->obd_lu_dev = d;
+				d->ld_obd = obd;
+				rc = 0;
+			} else
+				rc = PTR_ERR(d);
+		}
+		/* the session is torn down whether or not the env came up */
+		lu_context_exit(&session_ctx);
+		lu_context_fini(&session_ctx);
+
+	} else {
+		OBD_CHECK_DT_OP(obd, setup, -EOPNOTSUPP);
+		OBD_COUNTER_INCREMENT(obd, setup);
+		rc = OBP(obd, setup)(obd, cfg);
+	}
+	RETURN(rc);
+}
+
+/* Run the pre-cleanup step @cleanup_stage on @obd.
+ *
+ * If @obd has both an lu_device_type and an lu_device and the stage is
+ * OBD_CLEANUP_EXPORTS, the lu_device is finalised with ldto_device_fini().
+ * The legacy precleanup method is then called in every case.
+ *
+ * Returns -ENODEV for a NULL @obd, 0 when there is no precleanup method,
+ * otherwise that method's result. */
+static inline int obd_precleanup(struct obd_device *obd,
+				 enum obd_cleanup_stage cleanup_stage)
+{
+	int rc;
+	DECLARE_LU_VARS(ldt, d);
+	ENTRY;
+
+	OBD_CHECK_DEV(obd);
+	ldt = obd->obd_type->typ_lu;
+	d = obd->obd_lu_dev;
+	if (ldt != NULL && d != NULL) {
+		if (cleanup_stage == OBD_CLEANUP_EXPORTS) {
+			struct lu_env env;
+
+			/* a failure here is not reported: rc is overwritten
+			 * (or 0 is returned) below */
+			rc = lu_env_init(&env, ldt->ldt_ctx_tags);
+			if (rc == 0) {
+				ldt->ldt_ops->ldto_device_fini(&env, d);
+				lu_env_fini(&env);
+			}
+		}
+	}
+	OBD_CHECK_DT_OP(obd, precleanup, 0);
+	OBD_COUNTER_INCREMENT(obd, precleanup);
+
+	rc = OBP(obd, precleanup)(obd, cleanup_stage);
+	RETURN(rc);
+}
+
+/* Clean up @obd.
+ *
+ * If @obd has both an lu_device_type and an lu_device, the lu_device is
+ * released with ldto_device_free() and obd_lu_dev is cleared.  The legacy
+ * cleanup method is then called in every case.
+ *
+ * Returns -ENODEV for a NULL @obd, 0 when there is no cleanup method,
+ * otherwise that method's result. */
+static inline int obd_cleanup(struct obd_device *obd)
+{
+	int rc;
+	DECLARE_LU_VARS(ldt, d);
+	ENTRY;
+
+	OBD_CHECK_DEV(obd);
+
+	ldt = obd->obd_type->typ_lu;
+	d = obd->obd_lu_dev;
+	if (ldt != NULL && d != NULL) {
+		struct lu_env env;
+
+		/* if the env cannot be set up the lu_device is left attached
+		 * and the error is not reported: rc is overwritten below */
+		rc = lu_env_init(&env, ldt->ldt_ctx_tags);
+		if (rc == 0) {
+			ldt->ldt_ops->ldto_device_free(&env, d);
+			lu_env_fini(&env);
+			obd->obd_lu_dev = NULL;
+		}
+	}
+	OBD_CHECK_DT_OP(obd, cleanup, 0);
+	OBD_COUNTER_INCREMENT(obd, cleanup);
+
+	rc = OBP(obd, cleanup)(obd);
+	RETURN(rc);
+}
+
+/* Dispose of a client import that is still attached to @obd: the case where
+ * the device was set up but never connected.  Runs with cl_sem held for
+ * writing. */
+static inline void obd_cleanup_client_import(struct obd_device *obd)
+{
+	struct obd_import *imp;
+	ENTRY;
+
+	down_write(&obd->u.cli.cl_sem);
+	imp = obd->u.cli.cl_import;
+	if (imp != NULL) {
+		CDEBUG(D_CONFIG, "%s: client import never connected\n",
+		       obd->obd_name);
+		ptlrpc_invalidate_import(imp);
+		/* release the request pool, if the import owns one */
+		if (imp->imp_rq_pool != NULL) {
+			ptlrpc_free_rq_pool(imp->imp_rq_pool);
+			imp->imp_rq_pool = NULL;
+		}
+		client_destroy_import(imp);
+		obd->u.cli.cl_import = NULL;
+	}
+	up_write(&obd->u.cli.cl_sem);
+
+	EXIT;
+}
+
+/* Hand a configuration record (@data, @datalen) to @obd.
+ *
+ * If @obd has both an lu_device_type and an lu_device, the record goes to
+ * ldo_process_config(); otherwise to the legacy process_config method.
+ * obd_process_conf is set to 1 for the duration of the call and always reset
+ * to 0 before returning.
+ *
+ * Returns -ENODEV for a NULL @obd, -EOPNOTSUPP when the legacy path is
+ * needed but the method is missing, otherwise the result of lu_env_init()
+ * or of the configuration method. */
+static inline int
+obd_process_config(struct obd_device *obd, int datalen, void *data)
+{
+	int rc;
+	DECLARE_LU_VARS(ldt, d);
+	ENTRY;
+
+	OBD_CHECK_DEV(obd);
+
+	ldt = obd->obd_type->typ_lu;
+	d = obd->obd_lu_dev;
+	if (ldt == NULL || d == NULL) {
+		/* Validate the legacy method before raising obd_process_conf:
+		 * OBD_CHECK_DT_OP returns from this function on failure and
+		 * would otherwise leave the flag stuck at 1. */
+		OBD_CHECK_DT_OP(obd, process_config, -EOPNOTSUPP);
+	}
+
+	obd->obd_process_conf = 1;
+	if (ldt != NULL && d != NULL) {
+		struct lu_env env;
+
+		rc = lu_env_init(&env, ldt->ldt_ctx_tags);
+		if (rc == 0) {
+			rc = d->ld_ops->ldo_process_config(&env, d, data);
+			lu_env_fini(&env);
+		}
+	} else {
+		rc = OBP(obd, process_config)(obd, datalen, data);
+	}
+	OBD_COUNTER_INCREMENT(obd, process_config);
+	obd->obd_process_conf = 0;
+
+	RETURN(rc);
+}
+
+/* Pack an in-memory MD struct for storage on disk.
+ * Returns +ve size of packed MD (0 for free), or -ve error.
+ *
+ * If @disk_tgt == NULL, MD size is returned (max size if @mem_src == NULL).
+ * If @*disk_tgt != NULL and @mem_src == NULL, @*disk_tgt will be freed.
+ * If @*disk_tgt == NULL, it will be allocated
+ *
+ * The work is done by the packmd method of the device behind @exp; before
+ * it is reached the call fails with -ENODEV or -EOPNOTSUPP as described at
+ * EXP_CHECK_DT_OP.
+ */
+static inline int obd_packmd(struct obd_export *exp,
+			     struct lov_mds_md **disk_tgt,
+			     struct lov_stripe_md *mem_src)
+{
+	int rc;
+	ENTRY;
+
+	EXP_CHECK_DT_OP(exp, packmd);
+	EXP_COUNTER_INCREMENT(exp, packmd);
+
+	rc = OBP(exp->exp_obd, packmd)(exp, disk_tgt, mem_src);
+	RETURN(rc);
+}
+
+/* Size query: obd_packmd() with a NULL disk target, i.e. its "MD size is
+ * returned" case for @mem_src. */
+static inline int obd_size_diskmd(struct obd_export *exp,
+				  struct lov_stripe_md *mem_src)
+{
+	return obd_packmd(exp, NULL, mem_src);
+}
+
+/* helper functions */
+/* Allocation form of obd_packmd(): @disk_tgt must be non-NULL and point to a
+ * NULL pointer (both asserted); no in-memory source is supplied. */
+static inline int obd_alloc_diskmd(struct obd_export *exp,
+				   struct lov_mds_md **disk_tgt)
+{
+	LASSERT(disk_tgt);
+	LASSERT(*disk_tgt == NULL);
+	return obd_packmd(exp, disk_tgt, NULL);
+}
+
+/* Freeing form of obd_packmd(): @disk_tgt and *@disk_tgt must both be
+ * non-NULL (asserted); no in-memory source is supplied.  The buffer may be
+ * byte-swapped first, see below. */
+static inline int obd_free_diskmd(struct obd_export *exp,
+				  struct lov_mds_md **disk_tgt)
+{
+	LASSERT(disk_tgt);
+	LASSERT(*disk_tgt);
+	/*
+	 * LU-2590, for caller's convenience, *disk_tgt could be host
+	 * endianness, it needs swab to LE if necessary, while just
+	 * lov_mds_md header needs it for figuring out how much memory
+	 * needs to be freed.
+	 */
+	/* the first test is true only where cpu_to_le32() changes the value,
+	 * i.e. the host is not little-endian */
+	if ((cpu_to_le32(LOV_MAGIC) != LOV_MAGIC) &&
+	    (((*disk_tgt)->lmm_magic == LOV_MAGIC_V1) ||
+	     ((*disk_tgt)->lmm_magic == LOV_MAGIC_V3)))
+		lustre_swab_lov_mds_md(*disk_tgt);
+	return obd_packmd(exp, disk_tgt, NULL);
+}
+
+/* Unpack an MD struct from disk to in-memory format.
+ * Returns +ve size of unpacked MD (0 for free), or -ve error.
+ *
+ * If @mem_tgt == NULL, MD size is returned (max size if @disk_src == NULL).
+ * If @*mem_tgt != NULL and @disk_src == NULL, @*mem_tgt will be freed.
+ * If @*mem_tgt == NULL, it will be allocated
+ *
+ * The work is done by the unpackmd method of the device behind @exp; before
+ * it is reached the call fails with -ENODEV or -EOPNOTSUPP as described at
+ * EXP_CHECK_DT_OP.
+ */
+static inline int obd_unpackmd(struct obd_export *exp,
+			       struct lov_stripe_md **mem_tgt,
+			       struct lov_mds_md *disk_src,
+			       int disk_len)
+{
+	int rc;
+	ENTRY;
+
+	EXP_CHECK_DT_OP(exp, unpackmd);
+	EXP_COUNTER_INCREMENT(exp, unpackmd);
+
+	rc = OBP(exp->exp_obd, unpackmd)(exp, mem_tgt, disk_src, disk_len);
+	RETURN(rc);
+}
+
+/* helper functions */
+/* Allocation form of obd_unpackmd(): @mem_tgt must be non-NULL and point to
+ * a NULL pointer (both asserted); no disk source is supplied. */
+static inline int obd_alloc_memmd(struct obd_export *exp,
+				  struct lov_stripe_md **mem_tgt)
+{
+	LASSERT(mem_tgt);
+	LASSERT(*mem_tgt == NULL);
+	return obd_unpackmd(exp, mem_tgt, NULL, 0);
+}
+
+/* Freeing form of obd_unpackmd(): @mem_tgt and *@mem_tgt must both be
+ * non-NULL (asserted).  *@mem_tgt is set to NULL afterwards regardless of
+ * the result, which is returned unchanged. */
+static inline int obd_free_memmd(struct obd_export *exp,
+				 struct lov_stripe_md **mem_tgt)
+{
+	int rc;
+
+	LASSERT(mem_tgt);
+	LASSERT(*mem_tgt);
+	rc = obd_unpackmd(exp, mem_tgt, NULL, 0);
+	*mem_tgt = NULL;
+	return rc;
+}
+
+/* Call the precreate method of the device behind @exp and return its
+ * result.  Fails beforehand with -ENODEV or -EOPNOTSUPP, see
+ * EXP_CHECK_DT_OP.
+ * NOTE(review): unlike the neighbouring export wrappers this uses
+ * OBD_COUNTER_INCREMENT, so the export's per-nid stats are not updated -
+ * confirm that is intended. */
+static inline int obd_precreate(struct obd_export *exp)
+{
+	int rc;
+	ENTRY;
+
+	EXP_CHECK_DT_OP(exp, precreate);
+	OBD_COUNTER_INCREMENT(exp->exp_obd, precreate);
+
+	rc = OBP(exp->exp_obd, precreate)(exp);
+	RETURN(rc);
+}
+
+/* Call the create_async method of the device behind @exp and return its
+ * result.  Fails beforehand with -ENODEV or -EOPNOTSUPP, see
+ * EXP_CHECK_DT_OP. */
+static inline int obd_create_async(struct obd_export *exp,
+				   struct obd_info *oinfo,
+				   struct lov_stripe_md **ea,
+				   struct obd_trans_info *oti)
+{
+	int rc;
+	ENTRY;
+
+	EXP_CHECK_DT_OP(exp, create_async);
+	EXP_COUNTER_INCREMENT(exp, create_async);
+
+	rc = OBP(exp->exp_obd, create_async)(exp, oinfo, ea, oti);
+	RETURN(rc);
+}
+
+/* Call the create method of the device behind @exp and return its result.
+ * Fails beforehand with -ENODEV or -EOPNOTSUPP, see EXP_CHECK_DT_OP. */
+static inline int obd_create(const struct lu_env *env, struct obd_export *exp,
+			     struct obdo *obdo, struct lov_stripe_md **ea,
+			     struct obd_trans_info *oti)
+{
+	int rc;
+	ENTRY;
+
+	EXP_CHECK_DT_OP(exp, create);
+	EXP_COUNTER_INCREMENT(exp, create);
+
+	rc = OBP(exp->exp_obd, create)(env, exp, obdo, ea, oti);
+	RETURN(rc);
+}
+
+/* Call the destroy method of the device behind @exp, forwarding @md_exp and
+ * @capa untouched, and return its result.  Fails beforehand with -ENODEV or
+ * -EOPNOTSUPP, see EXP_CHECK_DT_OP. */
+static inline int obd_destroy(const struct lu_env *env, struct obd_export *exp,
+			      struct obdo *obdo, struct lov_stripe_md *ea,
+			      struct obd_trans_info *oti,
+			      struct obd_export *md_exp, void *capa)
+{
+	int rc;
+	ENTRY;
+
+	EXP_CHECK_DT_OP(exp, destroy);
+	EXP_COUNTER_INCREMENT(exp, destroy);
+
+	rc = OBP(exp->exp_obd, destroy)(env, exp, obdo, ea, oti, md_exp, capa);
+	RETURN(rc);
+}
+
+/* Call the getattr method of the device behind @exp and return its result.
+ * Fails beforehand with -ENODEV or -EOPNOTSUPP, see EXP_CHECK_DT_OP. */
+static inline int obd_getattr(const struct lu_env *env, struct obd_export *exp,
+			      struct obd_info *oinfo)
+{
+	int rc;
+	ENTRY;
+
+	EXP_CHECK_DT_OP(exp, getattr);
+	EXP_COUNTER_INCREMENT(exp, getattr);
+
+	rc = OBP(exp->exp_obd, getattr)(env, exp, oinfo);
+	RETURN(rc);
+}
+
+/* Call the getattr_async method of the device behind @exp with request set
+ * @set and return its result.  Fails beforehand with -ENODEV or
+ * -EOPNOTSUPP, see EXP_CHECK_DT_OP. */
+static inline int obd_getattr_async(struct obd_export *exp,
+				    struct obd_info *oinfo,
+				    struct ptlrpc_request_set *set)
+{
+	int rc;
+	ENTRY;
+
+	EXP_CHECK_DT_OP(exp, getattr_async);
+	EXP_COUNTER_INCREMENT(exp, getattr_async);
+
+	rc = OBP(exp->exp_obd, getattr_async)(exp, oinfo, set);
+	RETURN(rc);
+}
+
+/* Call the setattr method of the device behind @exp and return its result.
+ * Fails beforehand with -ENODEV or -EOPNOTSUPP, see EXP_CHECK_DT_OP. */
+static inline int obd_setattr(const struct lu_env *env, struct obd_export *exp,
+			      struct obd_info *oinfo,
+			      struct obd_trans_info *oti)
+{
+	int rc;
+	ENTRY;
+
+	EXP_CHECK_DT_OP(exp, setattr);
+	EXP_COUNTER_INCREMENT(exp, setattr);
+
+	rc = OBP(exp->exp_obd, setattr)(env, exp, oinfo, oti);
+	RETURN(rc);
+}
+
+/* Synchronous setattr: create a private request set, let the setattr_async
+ * method queue its requests on it, wait for the set when queuing succeeded,
+ * and destroy the set in every case.
+ * Returns -ENOMEM if the set cannot be created, otherwise the first error
+ * from queuing or waiting (0 on success). */
+static inline int obd_setattr_rqset(struct obd_export *exp,
+				    struct obd_info *oinfo,
+				    struct obd_trans_info *oti)
+{
+	struct ptlrpc_request_set *rqset;
+	int rc;
+	ENTRY;
+
+	EXP_CHECK_DT_OP(exp, setattr_async);
+	EXP_COUNTER_INCREMENT(exp, setattr_async);
+
+	rqset = ptlrpc_prep_set();
+	if (!rqset)
+		RETURN(-ENOMEM);
+
+	rc = OBP(exp->exp_obd, setattr_async)(exp, oinfo, oti, rqset);
+	if (!rc)
+		rc = ptlrpc_set_wait(rqset);
+
+	ptlrpc_set_destroy(rqset);
+	RETURN(rc);
+}
+
+/* This adds all the requests into @set if @set != NULL, otherwise
+   all requests are sent asynchronously without waiting for response.
+   The work is done by the setattr_async method of the device behind @exp;
+   the call fails beforehand with -ENODEV or -EOPNOTSUPP, see
+   EXP_CHECK_DT_OP. */
+static inline int obd_setattr_async(struct obd_export *exp,
+				    struct obd_info *oinfo,
+				    struct obd_trans_info *oti,
+				    struct ptlrpc_request_set *set)
+{
+	int rc;
+	ENTRY;
+
+	EXP_CHECK_DT_OP(exp, setattr_async);
+	EXP_COUNTER_INCREMENT(exp, setattr_async);
+
+	rc = OBP(exp->exp_obd, setattr_async)(exp, oinfo, oti, set);
+	RETURN(rc);
+}
+
+/* Call the add_conn method of imp->imp_obd with @uuid and @priority.
+ * Returns -ENODEV if that device is NULL, not set up or stopping,
+ * -EOPNOTSUPP if it has no add_conn method, otherwise the method's result.
+ * @imp itself is dereferenced without a NULL check. */
+static inline int obd_add_conn(struct obd_import *imp, struct obd_uuid *uuid,
+			       int priority)
+{
+	struct obd_device *obd = imp->imp_obd;
+	int rc;
+	ENTRY;
+
+	OBD_CHECK_DEV_ACTIVE(obd);
+	OBD_CHECK_DT_OP(obd, add_conn, -EOPNOTSUPP);
+	OBD_COUNTER_INCREMENT(obd, add_conn);
+
+	rc = OBP(obd, add_conn)(imp, uuid, priority);
+	RETURN(rc);
+}
+
+/* Call the del_conn method of imp->imp_obd with @uuid.
+ * Returns -ENODEV if that device is NULL, not set up or stopping,
+ * -EOPNOTSUPP if it has no del_conn method, otherwise the method's result.
+ * @imp itself is dereferenced without a NULL check. */
+static inline int obd_del_conn(struct obd_import *imp, struct obd_uuid *uuid)
+{
+	struct obd_device *obd = imp->imp_obd;
+	int rc;
+	ENTRY;
+
+	OBD_CHECK_DEV_ACTIVE(obd);
+	OBD_CHECK_DT_OP(obd, del_conn, -EOPNOTSUPP);
+	OBD_COUNTER_INCREMENT(obd, del_conn);
+
+	rc = OBP(obd, del_conn)(imp, uuid);
+	RETURN(rc);
+}
+
+/* Return the result of the get_uuid method of the device behind @exp, or
+ * NULL (without logging) when the device has no type or no such method.
+ * @exp is dereferenced without a NULL check. */
+static inline struct obd_uuid *obd_get_uuid(struct obd_export *exp)
+{
+	struct obd_uuid *uuid;
+	ENTRY;
+
+	OBD_CHECK_DT_OP(exp->exp_obd, get_uuid, NULL);
+	EXP_COUNTER_INCREMENT(exp, get_uuid);
+
+	uuid = OBP(exp->exp_obd, get_uuid)(exp);
+	RETURN(uuid);
+}
+
+/** Create a new \a exp on device \a obd for the uuid \a cluuid
+ * @param exp New export handle
+ * @param data Connect data, supported flags are set, flags also understood
+ *    by obd are returned.
+ *
+ * Returns -ENODEV if \a obd is NULL, not set up or stopping, -EOPNOTSUPP if
+ * it has no connect method, otherwise the method's result.  After the call
+ * it is asserted that, when \a data is non-NULL, the connect flags left in
+ * it are a subset of the flags that were passed in.
+ */
+static inline int obd_connect(const struct lu_env *env,
+			      struct obd_export **exp,struct obd_device *obd,
+			      struct obd_uuid *cluuid,
+			      struct obd_connect_data *data,
+			      void *localdata)
+{
+	int rc;
+	__u64 ocf = data ? data->ocd_connect_flags : 0; /* for post-condition
+						   * check */
+	ENTRY;
+
+	OBD_CHECK_DEV_ACTIVE(obd);
+	OBD_CHECK_DT_OP(obd, connect, -EOPNOTSUPP);
+	OBD_COUNTER_INCREMENT(obd, connect);
+
+	rc = OBP(obd, connect)(env, exp, obd, cluuid, data, localdata);
+	/* check that only subset is granted */
+	LASSERT(ergo(data != NULL, (data->ocd_connect_flags & ocf) ==
+				    data->ocd_connect_flags));
+	RETURN(rc);
+}
+
+/* Call the reconnect method of @obd for the existing export @exp.
+ * Returns -ENODEV if @obd is NULL, not set up or stopping, 0 (silently) if
+ * it has no reconnect method, otherwise the method's result.  After the call
+ * it is asserted that, when @d is non-NULL, the connect flags left in it are
+ * a subset of the flags that were passed in. */
+static inline int obd_reconnect(const struct lu_env *env,
+				struct obd_export *exp,
+				struct obd_device *obd,
+				struct obd_uuid *cluuid,
+				struct obd_connect_data *d,
+				void *localdata)
+{
+	int rc;
+	__u64 ocf = d ? d->ocd_connect_flags : 0; /* for post-condition
+						   * check */
+
+	ENTRY;
+
+	OBD_CHECK_DEV_ACTIVE(obd);
+	OBD_CHECK_DT_OP(obd, reconnect, 0);
+	OBD_COUNTER_INCREMENT(obd, reconnect);
+
+	rc = OBP(obd, reconnect)(env, exp, obd, cluuid, d, localdata);
+	/* check that only subset is granted */
+	LASSERT(ergo(d != NULL,
+		     (d->ocd_connect_flags & ocf) == d->ocd_connect_flags));
+	RETURN(rc);
+}
+
+/* Call the disconnect method of the device behind @exp and return its
+ * result.  Fails beforehand with -ENODEV or -EOPNOTSUPP, see
+ * EXP_CHECK_DT_OP. */
+static inline int obd_disconnect(struct obd_export *exp)
+{
+	int rc;
+	ENTRY;
+
+	EXP_CHECK_DT_OP(exp, disconnect);
+	EXP_COUNTER_INCREMENT(exp, disconnect);
+
+	rc = OBP(exp->exp_obd, disconnect)(exp);
+	RETURN(rc);
+}
+
+/* Call the fid_init method of @obd with @exp and @type.  Returns 0, without
+ * logging, when @obd has no type or no fid_init method; otherwise the
+ * method's result. */
+static inline int obd_fid_init(struct obd_device *obd, struct obd_export *exp,
+			       enum lu_cli_type type)
+{
+	int rc;
+	ENTRY;
+
+	OBD_CHECK_DT_OP(obd, fid_init, 0);
+	OBD_COUNTER_INCREMENT(obd, fid_init);
+
+	rc = OBP(obd, fid_init)(obd, exp, type);
+	RETURN(rc);
+}
+
+/* Call the fid_fini method of @obd.  Returns 0, without logging, when @obd
+ * has no type or no fid_fini method; otherwise the method's result. */
+static inline int obd_fid_fini(struct obd_device *obd)
+{
+	int rc;
+	ENTRY;
+
+	OBD_CHECK_DT_OP(obd, fid_fini, 0);
+	OBD_COUNTER_INCREMENT(obd, fid_fini);
+
+	rc = OBP(obd, fid_fini)(obd);
+	RETURN(rc);
+}
+
+/* Call the fid_alloc method of the device behind @exp with @fid and
+ * @op_data and return its result.  Fails beforehand with -ENODEV or
+ * -EOPNOTSUPP, see EXP_CHECK_DT_OP. */
+static inline int obd_fid_alloc(struct obd_export *exp,
+				struct lu_fid *fid,
+				struct md_op_data *op_data)
+{
+	int rc;
+	ENTRY;
+
+	EXP_CHECK_DT_OP(exp, fid_alloc);
+	EXP_COUNTER_INCREMENT(exp, fid_alloc);
+
+	rc = OBP(exp->exp_obd, fid_alloc)(exp, fid, op_data);
+	RETURN(rc);
+}
+
+/* Call the ping method of the device behind @exp.  Returns 0, without
+ * logging, when the device has no type or no ping method; otherwise the
+ * method's result.  @exp is dereferenced without a NULL check. */
+static inline int obd_ping(const struct lu_env *env, struct obd_export *exp)
+{
+	int rc;
+	ENTRY;
+
+	OBD_CHECK_DT_OP(exp->exp_obd, ping, 0);
+	EXP_COUNTER_INCREMENT(exp, ping);
+
+	rc = OBP(exp->exp_obd, ping)(env, exp);
+	RETURN(rc);
+}
+
+/* Call the pool_new method of @obd for @poolname.  Returns -EOPNOTSUPP when
+ * @obd has no type or no such method; otherwise the method's result. */
+static inline int obd_pool_new(struct obd_device *obd, char *poolname)
+{
+	int rc;
+	ENTRY;
+
+	OBD_CHECK_DT_OP(obd, pool_new, -EOPNOTSUPP);
+	OBD_COUNTER_INCREMENT(obd, pool_new);
+
+	rc = OBP(obd, pool_new)(obd, poolname);
+	RETURN(rc);
+}
+
+/* Call the pool_del method of @obd for @poolname.  Returns -EOPNOTSUPP when
+ * @obd has no type or no such method; otherwise the method's result. */
+static inline int obd_pool_del(struct obd_device *obd, char *poolname)
+{
+	int rc;
+	ENTRY;
+
+	OBD_CHECK_DT_OP(obd, pool_del, -EOPNOTSUPP);
+	OBD_COUNTER_INCREMENT(obd, pool_del);
+
+	rc = OBP(obd, pool_del)(obd, poolname);
+	RETURN(rc);
+}
+
+/* Call the pool_add method of @obd with @poolname and @ostname.  Returns
+ * -EOPNOTSUPP when @obd has no type or no such method; otherwise the
+ * method's result. */
+static inline int obd_pool_add(struct obd_device *obd, char *poolname, char *ostname)
+{
+	int rc;
+	ENTRY;
+
+	OBD_CHECK_DT_OP(obd, pool_add, -EOPNOTSUPP);
+	OBD_COUNTER_INCREMENT(obd, pool_add);
+
+	rc = OBP(obd, pool_add)(obd, poolname, ostname);
+	RETURN(rc);
+}
+
+/* Call the pool_rem method of @obd with @poolname and @ostname.  Returns
+ * -EOPNOTSUPP when @obd has no type or no such method; otherwise the
+ * method's result. */
+static inline int obd_pool_rem(struct obd_device *obd, char *poolname, char *ostname)
+{
+	int rc;
+	ENTRY;
+
+	OBD_CHECK_DT_OP(obd, pool_rem, -EOPNOTSUPP);
+	OBD_COUNTER_INCREMENT(obd, pool_rem);
+
+	rc = OBP(obd, pool_rem)(obd, poolname, ostname);
+	RETURN(rc);
+}
+
+/* Call the getref method of @obd, counting the call first.  Does nothing
+ * when @obd has no type or no getref method. */
+static inline void obd_getref(struct obd_device *obd)
+{
+	ENTRY;
+	if (!OBT(obd) || !OBP(obd, getref)) {
+		EXIT;
+		return;
+	}
+
+	OBD_COUNTER_INCREMENT(obd, getref);
+	OBP(obd, getref)(obd);
+	EXIT;
+}
+
+/* Call the putref method of @obd, counting the call first.  Does nothing
+ * when @obd has no type or no putref method. */
+static inline void obd_putref(struct obd_device *obd)
+{
+	ENTRY;
+	if (!OBT(obd) || !OBP(obd, putref)) {
+		EXIT;
+		return;
+	}
+
+	OBD_COUNTER_INCREMENT(obd, putref);
+	OBP(obd, putref)(obd);
+	EXIT;
+}
+
+/* Run the init_export method of the device behind @exp, if the export has a
+ * device with a type and that method.  Returns the method's result, or 0
+ * when nothing was called. */
+static inline int obd_init_export(struct obd_export *exp)
+{
+	struct obd_device *obd;
+	int rc;
+
+	ENTRY;
+	obd = exp->exp_obd;
+	if (obd == NULL || !OBT(obd) || !OBP(obd, init_export))
+		rc = 0;
+	else
+		rc = OBP(obd, init_export)(exp);
+	RETURN(rc);
+}
+
+/* Run the destroy_export method of the device behind @exp, if the export
+ * has a device with a type and that method.  The method's result is
+ * discarded; this always returns 0. */
+static inline int obd_destroy_export(struct obd_export *exp)
+{
+	struct obd_device *obd;
+
+	ENTRY;
+	obd = exp->exp_obd;
+	if (obd != NULL && OBT(obd) && OBP(obd, destroy_export))
+		OBP(obd, destroy_export)(exp);
+	RETURN(0);
+}
+
+/* Call the extent_calc method of the device behind @exp and return its
+ * result.  Fails beforehand with -ENODEV or -EOPNOTSUPP, see
+ * EXP_CHECK_DT_OP.  No stats counter is incremented for this operation. */
+static inline int obd_extent_calc(struct obd_export *exp,
+				  struct lov_stripe_md *md,
+				  int cmd, obd_off *offset)
+{
+	int rc;
+	ENTRY;
+	EXP_CHECK_DT_OP(exp, extent_calc);
+	rc = OBP(exp->exp_obd, extent_calc)(exp, md, cmd, offset);
+	RETURN(rc);
+}
+
+/* Look up a dentry through the l_fid2dentry callback of the lvfs context
+ * embedded in @exp's device, passing the id and sequence taken from @oi
+ * together with @gen.  @exp must have a device (asserted); @exp itself is
+ * not checked. */
+static inline struct dentry *
+obd_lvfs_fid2dentry(struct obd_export *exp, struct ost_id *oi, __u32 gen)
+{
+	struct lvfs_run_ctxt *ctxt;
+
+	/* assert before exp->exp_obd is used to form a member address, so
+	 * the check cannot be preceded (or optimised away) by that use */
+	LASSERT(exp->exp_obd);
+	ctxt = &exp->exp_obd->obd_lvfs_ctxt;
+
+	return ctxt->cb_ops.l_fid2dentry(ostid_id(oi), gen, ostid_seq(oi),
+					 exp->exp_obd);
+}
+
+/* @max_age is the oldest time in jiffies that we accept using a cached data.
+ * If the cache is older than @max_age we will get a new value from the
+ * target.  Use a value of "cfs_time_current() + HZ" to guarantee freshness.
+ *
+ * When the cache is too old the statfs_async method queues the request on
+ * @rqset.  Otherwise the cached obd_osfs is copied into oinfo->oi_osfs under
+ * obd_osfs_lock, OBD_STATFS_FROM_CACHE is set in oinfo->oi_flags and the
+ * oi_cb_up callback, if any, is invoked with 0.
+ *
+ * Returns -EINVAL for a NULL export or device, -EOPNOTSUPP if the device has
+ * no statfs_async method, otherwise 0 or the method's result. */
+static inline int obd_statfs_async(struct obd_export *exp,
+				   struct obd_info *oinfo,
+				   __u64 max_age,
+				   struct ptlrpc_request_set *rqset)
+{
+	int rc = 0;
+	struct obd_device *obd;
+	ENTRY;
+
+	if (exp == NULL || exp->exp_obd == NULL)
+		RETURN(-EINVAL);
+
+	obd = exp->exp_obd;
+	/* check the method that is actually called below; checking statfs
+	 * let a device without statfs_async through to a NULL call */
+	OBD_CHECK_DT_OP(obd, statfs_async, -EOPNOTSUPP);
+	/* still accounted under the statfs counter, as before */
+	OBD_COUNTER_INCREMENT(obd, statfs);
+
+	CDEBUG(D_SUPER, "%s: osfs %p age "LPU64", max_age "LPU64"\n",
+	       obd->obd_name, &obd->obd_osfs, obd->obd_osfs_age, max_age);
+	if (cfs_time_before_64(obd->obd_osfs_age, max_age)) {
+		rc = OBP(obd, statfs_async)(exp, oinfo, max_age, rqset);
+	} else {
+		CDEBUG(D_SUPER,"%s: use %p cache blocks "LPU64"/"LPU64
+		       " objects "LPU64"/"LPU64"\n",
+		       obd->obd_name, &obd->obd_osfs,
+		       obd->obd_osfs.os_bavail, obd->obd_osfs.os_blocks,
+		       obd->obd_osfs.os_ffree, obd->obd_osfs.os_files);
+		spin_lock(&obd->obd_osfs_lock);
+		memcpy(oinfo->oi_osfs, &obd->obd_osfs, sizeof(*oinfo->oi_osfs));
+		spin_unlock(&obd->obd_osfs_lock);
+		oinfo->oi_flags |= OBD_STATFS_FROM_CACHE;
+		if (oinfo->oi_cb_up)
+			oinfo->oi_cb_up(oinfo, 0);
+	}
+	RETURN(rc);
+}
+
+/* Synchronous wrapper around obd_statfs_async(): build a private request
+ * set and an obd_info carrying @osfs and @flags, issue the async statfs,
+ * wait for the set when that succeeded, and destroy the set in every case.
+ * Returns -ENOMEM if the set cannot be created, otherwise the first error
+ * from issuing or waiting (0 on success). */
+static inline int obd_statfs_rqset(struct obd_export *exp,
+				   struct obd_statfs *osfs, __u64 max_age,
+				   __u32 flags)
+{
+	struct obd_info oinfo = { { { 0 } } };
+	struct ptlrpc_request_set *rqset;
+	int rc;
+	ENTRY;
+
+	rqset = ptlrpc_prep_set();
+	if (!rqset)
+		RETURN(-ENOMEM);
+
+	oinfo.oi_osfs = osfs;
+	oinfo.oi_flags = flags;
+
+	rc = obd_statfs_async(exp, &oinfo, max_age, rqset);
+	if (!rc)
+		rc = ptlrpc_set_wait(rqset);
+
+	ptlrpc_set_destroy(rqset);
+	RETURN(rc);
+}
+
+/* @max_age is the oldest time in jiffies that we accept using a cached data.
+ * If the cache is older than @max_age we will get a new value from the
+ * target.  Use a value of "cfs_time_current() + HZ" to guarantee freshness.
+ *
+ * When the cache is too old the statfs method fills @osfs and, on success,
+ * the result is stored in the device's obd_osfs cache with a fresh
+ * obd_osfs_age.  Otherwise @osfs is filled from the cache.  Both copies are
+ * made under obd_osfs_lock.
+ *
+ * Returns -EINVAL for a NULL export or device (the export is now checked
+ * too, as obd_statfs_async() does), -EOPNOTSUPP if the device has no statfs
+ * method, otherwise 0 or the method's result. */
+static inline int obd_statfs(const struct lu_env *env, struct obd_export *exp,
+			     struct obd_statfs *osfs, __u64 max_age,
+			     __u32 flags)
+{
+	int rc = 0;
+	struct obd_device *obd;
+	ENTRY;
+
+	if (exp == NULL || exp->exp_obd == NULL)
+		RETURN(-EINVAL);
+
+	obd = exp->exp_obd;
+	OBD_CHECK_DT_OP(obd, statfs, -EOPNOTSUPP);
+	OBD_COUNTER_INCREMENT(obd, statfs);
+
+	CDEBUG(D_SUPER, "osfs "LPU64", max_age "LPU64"\n",
+	       obd->obd_osfs_age, max_age);
+	if (cfs_time_before_64(obd->obd_osfs_age, max_age)) {
+		rc = OBP(obd, statfs)(env, exp, osfs, max_age, flags);
+		if (rc == 0) {
+			/* refresh the cache from the new answer */
+			spin_lock(&obd->obd_osfs_lock);
+			memcpy(&obd->obd_osfs, osfs, sizeof(obd->obd_osfs));
+			obd->obd_osfs_age = cfs_time_current_64();
+			spin_unlock(&obd->obd_osfs_lock);
+		}
+	} else {
+		CDEBUG(D_SUPER, "%s: use %p cache blocks "LPU64"/"LPU64
+		       " objects "LPU64"/"LPU64"\n",
+		       obd->obd_name, &obd->obd_osfs,
+		       obd->obd_osfs.os_bavail, obd->obd_osfs.os_blocks,
+		       obd->obd_osfs.os_ffree, obd->obd_osfs.os_files);
+		spin_lock(&obd->obd_osfs_lock);
+		memcpy(osfs, &obd->obd_osfs, sizeof(*osfs));
+		spin_unlock(&obd->obd_osfs_lock);
+	}
+	RETURN(rc);
+}
+
+/* Synchronous sync of [@start, @end]: create a private request set, call
+ * the sync method with a NULL env on it, wait for the set when that
+ * succeeded, and destroy the set in every case.
+ * Returns -EOPNOTSUPP if the device has no sync method, -ENOMEM if the set
+ * cannot be created, otherwise the first error from the method or the wait
+ * (0 on success).  @exp is dereferenced without a NULL check. */
+static inline int obd_sync_rqset(struct obd_export *exp, struct obd_info *oinfo,
+				 obd_size start, obd_size end)
+{
+	struct ptlrpc_request_set *rqset;
+	int rc;
+	ENTRY;
+
+	OBD_CHECK_DT_OP(exp->exp_obd, sync, -EOPNOTSUPP);
+	EXP_COUNTER_INCREMENT(exp, sync);
+
+	rqset = ptlrpc_prep_set();
+	if (!rqset)
+		RETURN(-ENOMEM);
+
+	rc = OBP(exp->exp_obd, sync)(NULL, exp, oinfo, start, end, rqset);
+	if (!rc)
+		rc = ptlrpc_set_wait(rqset);
+
+	ptlrpc_set_destroy(rqset);
+	RETURN(rc);
+}
+
+/* Call the sync method of the device behind @exp for [@start, @end] with
+ * request set @set.  Returns -EOPNOTSUPP when the device has no type or no
+ * sync method; otherwise the method's result.  @exp is dereferenced without
+ * a NULL check. */
+static inline int obd_sync(const struct lu_env *env, struct obd_export *exp,
+			   struct obd_info *oinfo, obd_size start, obd_size end,
+			   struct ptlrpc_request_set *set)
+{
+	int rc;
+	ENTRY;
+
+	OBD_CHECK_DT_OP(exp->exp_obd, sync, -EOPNOTSUPP);
+	EXP_COUNTER_INCREMENT(exp, sync);
+
+	rc = OBP(exp->exp_obd, sync)(env, exp, oinfo, start, end, set);
+	RETURN(rc);
+}
+
+/* Synchronous punch: create a private request set, call the punch method
+ * with a NULL env on it, wait for the set when that succeeded, and destroy
+ * the set in every case.
+ * Returns -ENODEV or -EOPNOTSUPP per EXP_CHECK_DT_OP, -ENOMEM if the set
+ * cannot be created, otherwise the first error from the method or the wait
+ * (0 on success). */
+static inline int obd_punch_rqset(struct obd_export *exp,
+				  struct obd_info *oinfo,
+				  struct obd_trans_info *oti)
+{
+	struct ptlrpc_request_set *rqset;
+	int rc;
+	ENTRY;
+
+	EXP_CHECK_DT_OP(exp, punch);
+	EXP_COUNTER_INCREMENT(exp, punch);
+
+	rqset = ptlrpc_prep_set();
+	if (!rqset)
+		RETURN(-ENOMEM);
+
+	rc = OBP(exp->exp_obd, punch)(NULL, exp, oinfo, oti, rqset);
+	if (!rc)
+		rc = ptlrpc_set_wait(rqset);
+
+	ptlrpc_set_destroy(rqset);
+	RETURN(rc);
+}
+
+/* Call the punch method of the device behind @exp with request set @rqset
+ * and return its result.  Fails beforehand with -ENODEV or -EOPNOTSUPP, see
+ * EXP_CHECK_DT_OP. */
+static inline int obd_punch(const struct lu_env *env, struct obd_export *exp,
+			    struct obd_info *oinfo, struct obd_trans_info *oti,
+			    struct ptlrpc_request_set *rqset)
+{
+	int rc;
+	ENTRY;
+
+	EXP_CHECK_DT_OP(exp, punch);
+	EXP_COUNTER_INCREMENT(exp, punch);
+
+	rc = OBP(exp->exp_obd, punch)(env, exp, oinfo, oti, rqset);
+	RETURN(rc);
+}
+
+/* Call the brw method of the device behind @exp for @oa_bufs pages in @pg.
+ * @cmd must contain at least one bit of OBD_BRW_RWMASK or OBD_BRW_CHECK;
+ * anything else is logged and hits LBUG().  Fails beforehand with -ENODEV
+ * or -EOPNOTSUPP, see EXP_CHECK_DT_OP. */
+static inline int obd_brw(int cmd, struct obd_export *exp,
+			  struct obd_info *oinfo, obd_count oa_bufs,
+			  struct brw_page *pg, struct obd_trans_info *oti)
+{
+	int rc;
+	ENTRY;
+
+	EXP_CHECK_DT_OP(exp, brw);
+	EXP_COUNTER_INCREMENT(exp, brw);
+
+	if (!(cmd & (OBD_BRW_RWMASK | OBD_BRW_CHECK))) {
+		CERROR("obd_brw: cmd must be OBD_BRW_READ, OBD_BRW_WRITE, "
+		       "or OBD_BRW_CHECK\n");
+		LBUG();
+	}
+
+	rc = OBP(exp->exp_obd, brw)(cmd, exp, oinfo, oa_bufs, pg, oti);
+	RETURN(rc);
+}
+
+/* Call the preprw method of the device behind @exp, forwarding every
+ * argument unchanged, and return its result.  Fails beforehand with -ENODEV
+ * or -EOPNOTSUPP, see EXP_CHECK_DT_OP. */
+static inline int obd_preprw(const struct lu_env *env, int cmd,
+			     struct obd_export *exp, struct obdo *oa,
+			     int objcount, struct obd_ioobj *obj,
+			     struct niobuf_remote *remote, int *pages,
+			     struct niobuf_local *local,
+			     struct obd_trans_info *oti,
+			     struct lustre_capa *capa)
+{
+	int rc;
+	ENTRY;
+
+	EXP_CHECK_DT_OP(exp, preprw);
+	EXP_COUNTER_INCREMENT(exp, preprw);
+
+	rc = OBP(exp->exp_obd, preprw)(env, cmd, exp, oa, objcount, obj, remote,
+				       pages, local, oti, capa);
+	RETURN(rc);
+}
+
+/* Call the commitrw method of the device behind @exp.  The incoming @rc is
+ * handed to the method as its last argument and then replaced by the
+ * method's result, which is returned.  Fails beforehand with -ENODEV or
+ * -EOPNOTSUPP, see EXP_CHECK_DT_OP. */
+static inline int obd_commitrw(const struct lu_env *env, int cmd,
+			       struct obd_export *exp, struct obdo *oa,
+			       int objcount, struct obd_ioobj *obj,
+			       struct niobuf_remote *rnb, int pages,
+			       struct niobuf_local *local,
+			       struct obd_trans_info *oti, int rc)
+{
+	ENTRY;
+
+	EXP_CHECK_DT_OP(exp, commitrw);
+	EXP_COUNTER_INCREMENT(exp, commitrw);
+
+	rc = OBP(exp->exp_obd, commitrw)(env, cmd, exp, oa, objcount, obj,
+					 rnb, pages, local, oti, rc);
+	RETURN(rc);
+}
+
+/* Call the merge_lvb method of the device behind @exp with @lsm, @lvb and
+ * @kms_only and return its result.  Fails beforehand with -ENODEV or
+ * -EOPNOTSUPP, see EXP_CHECK_DT_OP. */
+static inline int obd_merge_lvb(struct obd_export *exp,
+				struct lov_stripe_md *lsm,
+				struct ost_lvb *lvb, int kms_only)
+{
+	int rc;
+	ENTRY;
+
+	EXP_CHECK_DT_OP(exp, merge_lvb);
+	EXP_COUNTER_INCREMENT(exp, merge_lvb);
+
+	rc = OBP(exp->exp_obd, merge_lvb)(exp, lsm, lvb, kms_only);
+	RETURN(rc);
+}
+
+/* Call the adjust_kms method of the device behind @exp with @lsm, @size and
+ * @shrink and return its result.  Fails beforehand with -ENODEV or
+ * -EOPNOTSUPP, see EXP_CHECK_DT_OP. */
+static inline int obd_adjust_kms(struct obd_export *exp,
+				 struct lov_stripe_md *lsm, obd_off size,
+				 int shrink)
+{
+	int rc;
+	ENTRY;
+
+	EXP_CHECK_DT_OP(exp, adjust_kms);
+	EXP_COUNTER_INCREMENT(exp, adjust_kms);
+
+	rc = OBP(exp->exp_obd, adjust_kms)(exp, lsm, size, shrink);
+	RETURN(rc);
+}
+
+/* Call the iocontrol method of the device behind @exp with @cmd, @len,
+ * @karg and @uarg and return its result.  Fails beforehand with -ENODEV or
+ * -EOPNOTSUPP, see EXP_CHECK_DT_OP. */
+static inline int obd_iocontrol(unsigned int cmd, struct obd_export *exp,
+				int len, void *karg, void *uarg)
+{
+	int rc;
+	ENTRY;
+
+	EXP_CHECK_DT_OP(exp, iocontrol);
+	EXP_COUNTER_INCREMENT(exp, iocontrol);
+
+	rc = OBP(exp->exp_obd, iocontrol)(cmd, exp, len, karg, uarg);
+	RETURN(rc);
+}
+
+/* Synchronous enqueue: create a private request set, call the enqueue
+ * method on it, wait for the set when that succeeded, and destroy the set
+ * in every case.
+ * Returns -ENODEV or -EOPNOTSUPP per EXP_CHECK_DT_OP, -ENOMEM if the set
+ * cannot be created, otherwise the first error from the method or the wait
+ * (0 on success). */
+static inline int obd_enqueue_rqset(struct obd_export *exp,
+				    struct obd_info *oinfo,
+				    struct ldlm_enqueue_info *einfo)
+{
+	struct ptlrpc_request_set *rqset;
+	int rc;
+	ENTRY;
+
+	EXP_CHECK_DT_OP(exp, enqueue);
+	EXP_COUNTER_INCREMENT(exp, enqueue);
+
+	rqset = ptlrpc_prep_set();
+	if (!rqset)
+		RETURN(-ENOMEM);
+
+	rc = OBP(exp->exp_obd, enqueue)(exp, oinfo, einfo, rqset);
+	if (!rc)
+		rc = ptlrpc_set_wait(rqset);
+
+	ptlrpc_set_destroy(rqset);
+	RETURN(rc);
+}
+
+/* Call the enqueue method of the device behind @exp with request set @set
+ * and return its result.  Fails beforehand with -ENODEV or -EOPNOTSUPP, see
+ * EXP_CHECK_DT_OP. */
+static inline int obd_enqueue(struct obd_export *exp,
+			      struct obd_info *oinfo,
+			      struct ldlm_enqueue_info *einfo,
+			      struct ptlrpc_request_set *set)
+{
+	int rc;
+	ENTRY;
+
+	EXP_CHECK_DT_OP(exp, enqueue);
+	EXP_COUNTER_INCREMENT(exp, enqueue);
+
+	rc = OBP(exp->exp_obd, enqueue)(exp, oinfo, einfo, set);
+	RETURN(rc);
+}
+
+/* Call the change_cbdata method of the device behind @exp with @lsm, the
+ * iterator @it and @data, and return its result.  Fails beforehand with
+ * -ENODEV or -EOPNOTSUPP, see EXP_CHECK_DT_OP. */
+static inline int obd_change_cbdata(struct obd_export *exp,
+				    struct lov_stripe_md *lsm,
+				    ldlm_iterator_t it, void *data)
+{
+	int rc;
+	ENTRY;
+
+	EXP_CHECK_DT_OP(exp, change_cbdata);
+	EXP_COUNTER_INCREMENT(exp, change_cbdata);
+
+	rc = OBP(exp->exp_obd, change_cbdata)(exp, lsm, it, data);
+	RETURN(rc);
+}
+
+/* Call the find_cbdata method of the device behind @exp with @lsm, the
+ * iterator @it and @data, and return its result.  Fails beforehand with
+ * -ENODEV or -EOPNOTSUPP, see EXP_CHECK_DT_OP. */
+static inline int obd_find_cbdata(struct obd_export *exp,
+				  struct lov_stripe_md *lsm,
+				  ldlm_iterator_t it, void *data)
+{
+	int rc;
+	ENTRY;
+
+	EXP_CHECK_DT_OP(exp, find_cbdata);
+	EXP_COUNTER_INCREMENT(exp, find_cbdata);
+
+	rc = OBP(exp->exp_obd, find_cbdata)(exp, lsm, it, data);
+	RETURN(rc);
+}
+
+/* Call the cancel method of the device behind @exp with @ea, @mode and
+ * @lockh and return its result.  Fails beforehand with -ENODEV or
+ * -EOPNOTSUPP, see EXP_CHECK_DT_OP. */
+static inline int obd_cancel(struct obd_export *exp,
+			     struct lov_stripe_md *ea, __u32 mode,
+			     struct lustre_handle *lockh)
+{
+	int rc;
+	ENTRY;
+
+	EXP_CHECK_DT_OP(exp, cancel);
+	EXP_COUNTER_INCREMENT(exp, cancel);
+
+	rc = OBP(exp->exp_obd, cancel)(exp, ea, mode, lockh);
+	RETURN(rc);
+}
+
+/* Call the cancel_unused method of the device behind @exp with @ea, @flags
+ * and @opaque and return its result.  Fails beforehand with -ENODEV or
+ * -EOPNOTSUPP, see EXP_CHECK_DT_OP. */
+static inline int obd_cancel_unused(struct obd_export *exp,
+				    struct lov_stripe_md *ea,
+				    ldlm_cancel_flags_t flags,
+				    void *opaque)
+{
+	int rc;
+	ENTRY;
+
+	EXP_CHECK_DT_OP(exp, cancel_unused);
+	EXP_COUNTER_INCREMENT(exp, cancel_unused);
+
+	rc = OBP(exp->exp_obd, cancel_unused)(exp, ea, flags, opaque);
+	RETURN(rc);
+}
+
+/* Call the pin method of the device behind @exp with @fid, @oc, @handle and
+ * @flag and return its result.  Fails beforehand with -ENODEV or
+ * -EOPNOTSUPP, see EXP_CHECK_DT_OP. */
+static inline int obd_pin(struct obd_export *exp, const struct lu_fid *fid,
+			  struct obd_capa *oc, struct obd_client_handle *handle,
+			  int flag)
+{
+	int rc;
+	ENTRY;
+
+	EXP_CHECK_DT_OP(exp, pin);
+	EXP_COUNTER_INCREMENT(exp, pin);
+
+	rc = OBP(exp->exp_obd, pin)(exp, fid, oc, handle, flag);
+	RETURN(rc);
+}
+
+/* Call the unpin method of the device behind @exp with @handle and @flag
+ * and return its result.  Fails beforehand with -ENODEV or -EOPNOTSUPP, see
+ * EXP_CHECK_DT_OP. */
+static inline int obd_unpin(struct obd_export *exp,
+			    struct obd_client_handle *handle, int flag)
+{
+	int rc;
+	ENTRY;
+
+	EXP_CHECK_DT_OP(exp, unpin);
+	EXP_COUNTER_INCREMENT(exp, unpin);
+
+	rc = OBP(exp->exp_obd, unpin)(exp, handle, flag);
+	RETURN(rc);
+}
+
+
+/* Deliver import event @event for @imp to @obd's import_event method.
+ * A NULL @obd is logged and ignored; a device that is not set up or has no
+ * import_event method is skipped silently. */
+static inline void obd_import_event(struct obd_device *obd,
+				    struct obd_import *imp,
+				    enum obd_import_event event)
+{
+	ENTRY;
+	if (obd == NULL) {
+		CERROR("NULL device\n");
+	} else if (obd->obd_set_up && OBP(obd, import_event)) {
+		OBD_COUNTER_INCREMENT(obd, import_event);
+		OBP(obd, import_event)(obd, imp, event);
+	}
+	EXIT;
+}
+
+/* Call the llog_connect method of the device behind @exp with @body.
+ * Returns 0, without logging, when the device has no type or no such
+ * method; otherwise the method's result.  @exp is dereferenced without a
+ * NULL check. */
+static inline int obd_llog_connect(struct obd_export *exp,
+				   struct llogd_conn_body *body)
+{
+	int rc;
+	ENTRY;
+
+	OBD_CHECK_DT_OP(exp->exp_obd, llog_connect, 0);
+	EXP_COUNTER_INCREMENT(exp, llog_connect);
+
+	rc = OBP(exp->exp_obd, llog_connect)(exp, body);
+	RETURN(rc);
+}
+
+
+/* Pass notification @ev about @watched (with @data) to @obd's notify
+ * method.
+ * Returns -ENODEV for a NULL @obd, -EINVAL when @obd is neither set up nor
+ * flagged obd_async_recov, -ENOSYS when it has no notify method, otherwise
+ * the method's result. */
+static inline int obd_notify(struct obd_device *obd,
+			     struct obd_device *watched,
+			     enum obd_notify_event ev,
+			     void *data)
+{
+	int rc;
+	ENTRY;
+	OBD_CHECK_DEV(obd);
+
+	/* the check for async_recov is a complete hack - I'm hereby
+	   overloading the meaning to also mean "this was called from
+	   mds_postsetup".  I know that my mds is able to handle notifies
+	   by this point, and it needs to get them to execute mds_postrecov. */
+	if (!obd->obd_set_up && !obd->obd_async_recov) {
+		CDEBUG(D_HA, "obd %s not set up\n", obd->obd_name);
+		RETURN(-EINVAL);
+	}
+
+	if (!OBP(obd, notify)) {
+		CDEBUG(D_HA, "obd %s has no notify handler\n", obd->obd_name);
+		RETURN(-ENOSYS);
+	}
+
+	OBD_COUNTER_INCREMENT(obd, notify);
+	rc = OBP(obd, notify)(obd, watched, ev, data);
+	RETURN(rc);
+}
+
+/*
+ * Notify everything listening on @observer about event @ev on @observed.
+ *
+ * Two listeners are tried in order: the obd registered as
+ * @observer->obd_observer (through obd_notify()), then the non-obd upcall
+ * stored in @observer->obd_upcall.  Both are always attempted; the obd
+ * listener's error takes precedence, otherwise the upcall's result is
+ * returned.  An absent listener counts as success (0).
+ */
+static inline int obd_notify_observer(struct obd_device *observer,
+				      struct obd_device *observed,
+				      enum obd_notify_event ev,
+				      void *data)
+{
+	struct obd_notify_upcall *upcall = &observer->obd_upcall;
+	int obd_rc = 0;
+	int upcall_rc = 0;
+
+	if (observer->obd_observer)
+		obd_rc = obd_notify(observer->obd_observer, observed, ev, data);
+
+	/* Also call the non-obd listener, if any. */
+	if (upcall->onu_upcall != NULL)
+		upcall_rc = upcall->onu_upcall(observer, observed, ev,
+					       upcall->onu_owner, NULL);
+
+	if (obd_rc)
+		return obd_rc;
+	return upcall_rc;
+}
+
+/*
+ * Forward a quotacheck request described by @oqctl to the "quotacheck"
+ * method of the obd device behind @exp.  The method receives both the
+ * device and the export; its return code is passed through.
+ */
+static inline int obd_quotacheck(struct obd_export *exp,
+				 struct obd_quotactl *oqctl)
+{
+	ENTRY;
+	EXP_CHECK_DT_OP(exp, quotacheck);
+	EXP_COUNTER_INCREMENT(exp, quotacheck);
+	RETURN(OBP(exp->exp_obd, quotacheck)(exp->exp_obd, exp, oqctl));
+}
+
+/*
+ * Forward a quotactl request described by @oqctl to the "quotactl" method
+ * of the obd device behind @exp.  The method receives both the device and
+ * the export; its return code is passed through.
+ */
+static inline int obd_quotactl(struct obd_export *exp,
+			       struct obd_quotactl *oqctl)
+{
+	ENTRY;
+	EXP_CHECK_DT_OP(exp, quotactl);
+	EXP_COUNTER_INCREMENT(exp, quotactl);
+	RETURN(OBP(exp->exp_obd, quotactl)(exp->exp_obd, exp, oqctl));
+}
+
+/*
+ * Ask @obd for its health status.
+ *
+ * Returns:  0 when healthy
+ *	    >0 when unhealthy, as a reason code/flag (the only supported
+ *	       reason is 1 right now; better reasons or flags may be
+ *	       defined in the future)
+ *	    <0 on error
+ *
+ * A device that is not set up, is stopping, or has no health_check method
+ * is reported as healthy (0).
+ */
+static inline int obd_health_check(const struct lu_env *env,
+				   struct obd_device *obd)
+{
+	ENTRY;
+
+	/* don't use EXP_CHECK_DT_OP, because NULL method is normal here */
+	if (!obd || !OBT(obd)) {
+		CERROR("cleaned up obd\n");
+		RETURN(-EOPNOTSUPP);
+	}
+
+	if (!obd->obd_set_up || obd->obd_stopping || !OBP(obd, health_check))
+		RETURN(0);
+
+	RETURN(OBP(obd, health_check)(env, obd));
+}
+
+/*
+ * Install @observer as the observer of @obd, or clear it by passing NULL.
+ *
+ * The update is made with obd_observer_link_sem held for writing.  If an
+ * observer is already installed and a non-NULL replacement is offered,
+ * nothing changes and -EALREADY is returned; otherwise 0.
+ */
+static inline int obd_register_observer(struct obd_device *obd,
+					struct obd_device *observer)
+{
+	int rc = 0;
+	ENTRY;
+
+	OBD_CHECK_DEV(obd);
+	down_write(&obd->obd_observer_link_sem);
+	if (obd->obd_observer && observer)
+		rc = -EALREADY;
+	else
+		obd->obd_observer = observer;
+	up_write(&obd->obd_observer_link_sem);
+	RETURN(rc);
+}
+
+/*
+ * Fetch @obd's current observer into *@observer.
+ *
+ * On success (0) obd_observer_link_sem is deliberately left read-locked;
+ * the matching up_read() is issued by obd_unpin_observer().  If no
+ * observer is registered, *@observer is set to NULL, the semaphore is
+ * released here and -ENOENT is returned.
+ */
+static inline int obd_pin_observer(struct obd_device *obd,
+				   struct obd_device **observer)
+{
+	ENTRY;
+	down_read(&obd->obd_observer_link_sem);
+	if (!obd->obd_observer) {
+		*observer = NULL;
+		up_read(&obd->obd_observer_link_sem);
+		RETURN(-ENOENT);
+	}
+	/* Success path: return with the read lock still held. */
+	*observer = obd->obd_observer;
+	RETURN(0);
+}
+
+/*
+ * Release the read lock on @obd's obd_observer_link_sem.  Always returns 0.
+ * Intended as the counterpart of a successful obd_pin_observer(), which
+ * returns with that lock held.
+ */
+static inline int obd_unpin_observer(struct obd_device *obd)
+{
+	ENTRY;
+	up_read(&obd->obd_observer_link_sem);
+	RETURN(0);
+}
+
+/*
+ * Compiled out: wrappers for registering and unregistering page-removal
+ * and lock-cancel callbacks.  Each one forwards to the same-named device
+ * method through OBP() after OBD_CHECK_DT_OP() and a counter increment.
+ */
+#if 0
+static inline int obd_register_page_removal_cb(struct obd_export *exp,
+					       obd_page_removal_cb_t cb,
+					       obd_pin_extent_cb pin_cb)
+{
+	int rc;
+	ENTRY;
+
+	OBD_CHECK_DT_OP(exp->exp_obd, register_page_removal_cb, 0);
+	OBD_COUNTER_INCREMENT(exp->exp_obd, register_page_removal_cb);
+
+	rc = OBP(exp->exp_obd, register_page_removal_cb)(exp, cb, pin_cb);
+	RETURN(rc);
+}
+
+static inline int obd_unregister_page_removal_cb(struct obd_export *exp,
+						 obd_page_removal_cb_t cb)
+{
+	int rc;
+	ENTRY;
+
+	OBD_CHECK_DT_OP(exp->exp_obd, unregister_page_removal_cb, 0);
+	OBD_COUNTER_INCREMENT(exp->exp_obd, unregister_page_removal_cb);
+
+	rc = OBP(exp->exp_obd, unregister_page_removal_cb)(exp, cb);
+	RETURN(rc);
+}
+
+static inline int obd_register_lock_cancel_cb(struct obd_export *exp,
+					      obd_lock_cancel_cb cb)
+{
+	int rc;
+	ENTRY;
+
+	OBD_CHECK_DT_OP(exp->exp_obd, register_lock_cancel_cb, 0);
+	OBD_COUNTER_INCREMENT(exp->exp_obd, register_lock_cancel_cb);
+
+	rc = OBP(exp->exp_obd, register_lock_cancel_cb)(exp, cb);
+	RETURN(rc);
+}
+
+static inline int obd_unregister_lock_cancel_cb(struct obd_export *exp,
+						 obd_lock_cancel_cb cb)
+{
+	int rc;
+	ENTRY;
+
+	OBD_CHECK_DT_OP(exp->exp_obd, unregister_lock_cancel_cb, 0);
+	OBD_COUNTER_INCREMENT(exp->exp_obd, unregister_lock_cancel_cb);
+
+	rc = OBP(exp->exp_obd, unregister_lock_cancel_cb)(exp, cb);
+	RETURN(rc);
+}
+#endif
+
+/* metadata helpers */
+/*
+ * Forward a getstatus request to the "getstatus" method that MDP() looks
+ * up on the obd device behind @exp; @fid and @pc are handed to the method
+ * unchanged and its return code is passed through.
+ */
+static inline int md_getstatus(struct obd_export *exp,
+			       struct lu_fid *fid, struct obd_capa **pc)
+{
+	ENTRY;
+	EXP_CHECK_MD_OP(exp, getstatus);
+	EXP_MD_COUNTER_INCREMENT(exp, getstatus);
+	RETURN(MDP(exp->exp_obd, getstatus)(exp, fid, pc));
+}
+
+/*
+ * Forward a getattr request described by @op_data to the "getattr" method
+ * of the obd device behind @exp and return the method's result.
+ */
+static inline int md_getattr(struct obd_export *exp, struct md_op_data *op_data,
+			     struct ptlrpc_request **request)
+{
+	ENTRY;
+	EXP_CHECK_MD_OP(exp, getattr);
+	EXP_MD_COUNTER_INCREMENT(exp, getattr);
+	RETURN(MDP(exp->exp_obd, getattr)(exp, op_data, request));
+}
+
+/*
+ * Forward a null_inode request for @fid to the "null_inode" method of the
+ * obd device behind @exp and return the method's result.
+ */
+static inline int md_null_inode(struct obd_export *exp,
+				   const struct lu_fid *fid)
+{
+	ENTRY;
+	EXP_CHECK_MD_OP(exp, null_inode);
+	EXP_MD_COUNTER_INCREMENT(exp, null_inode);
+	RETURN(MDP(exp->exp_obd, null_inode)(exp, fid));
+}
+
+/*
+ * Forward a find_cbdata request for @fid, with iterator @it and its opaque
+ * @data, to the "find_cbdata" method of the obd device behind @exp.
+ * Returns the method's result.
+ */
+static inline int md_find_cbdata(struct obd_export *exp,
+				 const struct lu_fid *fid,
+				 ldlm_iterator_t it, void *data)
+{
+	ENTRY;
+	EXP_CHECK_MD_OP(exp, find_cbdata);
+	EXP_MD_COUNTER_INCREMENT(exp, find_cbdata);
+	RETURN(MDP(exp->exp_obd, find_cbdata)(exp, fid, it, data));
+}
+
+/*
+ * Forward a close request (@op_data, @mod) to the "close" method of the
+ * obd device behind @exp and return the method's result.
+ */
+static inline int md_close(struct obd_export *exp, struct md_op_data *op_data,
+			   struct md_open_data *mod,
+			   struct ptlrpc_request **request)
+{
+	ENTRY;
+	EXP_CHECK_MD_OP(exp, close);
+	EXP_MD_COUNTER_INCREMENT(exp, close);
+	RETURN(MDP(exp->exp_obd, close)(exp, op_data, mod, request));
+}
+
+/*
+ * Forward a create request to the "create" method of the obd device
+ * behind @exp.  Every argument is handed to the method unchanged, in
+ * declaration order, and the method's return code is passed through.
+ */
+static inline int md_create(struct obd_export *exp, struct md_op_data *op_data,
+			    const void *data, int datalen, int mode, __u32 uid,
+			    __u32 gid, cfs_cap_t cap_effective, __u64 rdev,
+			    struct ptlrpc_request **request)
+{
+	ENTRY;
+	EXP_CHECK_MD_OP(exp, create);
+	EXP_MD_COUNTER_INCREMENT(exp, create);
+	RETURN(MDP(exp->exp_obd, create)(exp, op_data, data, datalen, mode,
+					 uid, gid, cap_effective, rdev,
+					 request));
+}
+
+/*
+ * Forward a done_writing request (@op_data, @mod) to the "done_writing"
+ * method of the obd device behind @exp and return the method's result.
+ */
+static inline int md_done_writing(struct obd_export *exp,
+				  struct md_op_data *op_data,
+				  struct md_open_data *mod)
+{
+	ENTRY;
+	EXP_CHECK_MD_OP(exp, done_writing);
+	EXP_MD_COUNTER_INCREMENT(exp, done_writing);
+	RETURN(MDP(exp->exp_obd, done_writing)(exp, op_data, mod));
+}
+
+/*
+ * Forward a lock enqueue request to the "enqueue" method of the obd
+ * device behind @exp.  All arguments are handed through unchanged and the
+ * method's return code is passed back.
+ */
+static inline int md_enqueue(struct obd_export *exp,
+			     struct ldlm_enqueue_info *einfo,
+			     struct lookup_intent *it,
+			     struct md_op_data *op_data,
+			     struct lustre_handle *lockh,
+			     void *lmm, int lmmsize,
+			     struct ptlrpc_request **req,
+			     int extra_lock_flags)
+{
+	ENTRY;
+	EXP_CHECK_MD_OP(exp, enqueue);
+	EXP_MD_COUNTER_INCREMENT(exp, enqueue);
+	RETURN(MDP(exp->exp_obd, enqueue)(exp, einfo, it, op_data, lockh,
+					  lmm, lmmsize, req,
+					  extra_lock_flags));
+}
+
+/*
+ * Forward a getattr_name request described by @op_data to the
+ * "getattr_name" method of the obd device behind @exp and return the
+ * method's result.
+ */
+static inline int md_getattr_name(struct obd_export *exp,
+				  struct md_op_data *op_data,
+				  struct ptlrpc_request **request)
+{
+	ENTRY;
+	EXP_CHECK_MD_OP(exp, getattr_name);
+	EXP_MD_COUNTER_INCREMENT(exp, getattr_name);
+	RETURN(MDP(exp->exp_obd, getattr_name)(exp, op_data, request));
+}
+
+/*
+ * Forward an intent lock request to the "intent_lock" method of the obd
+ * device behind @exp.  All arguments are handed through unchanged and the
+ * method's return code is passed back.
+ */
+static inline int md_intent_lock(struct obd_export *exp,
+				 struct md_op_data *op_data, void *lmm,
+				 int lmmsize, struct lookup_intent *it,
+				 int lookup_flags, struct ptlrpc_request **reqp,
+				 ldlm_blocking_callback cb_blocking,
+				 __u64 extra_lock_flags)
+{
+	ENTRY;
+	EXP_CHECK_MD_OP(exp, intent_lock);
+	EXP_MD_COUNTER_INCREMENT(exp, intent_lock);
+	RETURN(MDP(exp->exp_obd, intent_lock)(exp, op_data, lmm, lmmsize, it,
+					      lookup_flags, reqp, cb_blocking,
+					      extra_lock_flags));
+}
+
+/*
+ * Forward a link request described by @op_data to the "link" method of
+ * the obd device behind @exp and return the method's result.
+ */
+static inline int md_link(struct obd_export *exp, struct md_op_data *op_data,
+			  struct ptlrpc_request **request)
+{
+	ENTRY;
+	EXP_CHECK_MD_OP(exp, link);
+	EXP_MD_COUNTER_INCREMENT(exp, link);
+	RETURN(MDP(exp->exp_obd, link)(exp, op_data, request));
+}
+
+/*
+ * Forward a rename request to the "rename" method of the obd device
+ * behind @exp.  The old/new names and their lengths are handed through
+ * unchanged; the method's return code is passed back.
+ */
+static inline int md_rename(struct obd_export *exp, struct md_op_data *op_data,
+			    const char *old, int oldlen, const char *new,
+			    int newlen, struct ptlrpc_request **request)
+{
+	ENTRY;
+	EXP_CHECK_MD_OP(exp, rename);
+	EXP_MD_COUNTER_INCREMENT(exp, rename);
+	RETURN(MDP(exp->exp_obd, rename)(exp, op_data, old, oldlen, new,
+					 newlen, request));
+}
+
+/*
+ * Forward an is_subdir query for the pair (@pfid, @cfid) to the
+ * "is_subdir" method of the obd device behind @exp and return the
+ * method's result.
+ */
+static inline int md_is_subdir(struct obd_export *exp,
+			       const struct lu_fid *pfid,
+			       const struct lu_fid *cfid,
+			       struct ptlrpc_request **request)
+{
+	ENTRY;
+	EXP_CHECK_MD_OP(exp, is_subdir);
+	EXP_MD_COUNTER_INCREMENT(exp, is_subdir);
+	RETURN(MDP(exp->exp_obd, is_subdir)(exp, pfid, cfid, request));
+}
+
+/*
+ * Forward a setattr request to the "setattr" method of the obd device
+ * behind @exp.  Both EA buffers and their lengths are handed through
+ * unchanged; the method's return code is passed back.
+ */
+static inline int md_setattr(struct obd_export *exp, struct md_op_data *op_data,
+			     void *ea, int ealen, void *ea2, int ea2len,
+			     struct ptlrpc_request **request,
+			     struct md_open_data **mod)
+{
+	ENTRY;
+	EXP_CHECK_MD_OP(exp, setattr);
+	EXP_MD_COUNTER_INCREMENT(exp, setattr);
+	RETURN(MDP(exp->exp_obd, setattr)(exp, op_data, ea, ealen, ea2,
+					  ea2len, request, mod));
+}
+
+/*
+ * Forward a sync request for @fid to the "sync" method of the obd device
+ * behind @exp and return the method's result.
+ */
+static inline int md_sync(struct obd_export *exp, const struct lu_fid *fid,
+			  struct obd_capa *oc, struct ptlrpc_request **request)
+{
+	ENTRY;
+	EXP_CHECK_MD_OP(exp, sync);
+	EXP_MD_COUNTER_INCREMENT(exp, sync);
+	RETURN(MDP(exp->exp_obd, sync)(exp, fid, oc, request));
+}
+
+/*
+ * Forward a readpage request (@opdata, @pages) to the "readpage" method
+ * of the obd device behind @exp and return the method's result.
+ */
+static inline int md_readpage(struct obd_export *exp, struct md_op_data *opdata,
+			      struct page **pages,
+			      struct ptlrpc_request **request)
+{
+	ENTRY;
+	EXP_CHECK_MD_OP(exp, readpage);
+	EXP_MD_COUNTER_INCREMENT(exp, readpage);
+	RETURN(MDP(exp->exp_obd, readpage)(exp, opdata, pages, request));
+}
+
+/*
+ * Forward an unlink request described by @op_data to the "unlink" method
+ * of the obd device behind @exp and return the method's result.
+ */
+static inline int md_unlink(struct obd_export *exp, struct md_op_data *op_data,
+			    struct ptlrpc_request **request)
+{
+	ENTRY;
+	EXP_CHECK_MD_OP(exp, unlink);
+	EXP_MD_COUNTER_INCREMENT(exp, unlink);
+	RETURN(MDP(exp->exp_obd, unlink)(exp, op_data, request));
+}
+
+/*
+ * Forward a get_lustre_md request to the "get_lustre_md" method of the
+ * obd device behind @exp.  @req, @dt_exp, @md_exp and @md are handed
+ * through unchanged; the method's return code is passed back.
+ */
+static inline int md_get_lustre_md(struct obd_export *exp,
+				   struct ptlrpc_request *req,
+				   struct obd_export *dt_exp,
+				   struct obd_export *md_exp,
+				   struct lustre_md *md)
+{
+	int rc;
+	ENTRY;
+
+	EXP_CHECK_MD_OP(exp, get_lustre_md);
+	EXP_MD_COUNTER_INCREMENT(exp, get_lustre_md);
+
+	rc = MDP(exp->exp_obd, get_lustre_md)(exp, req, dt_exp, md_exp, md);
+	RETURN(rc);
+}
+
+/*
+ * Forward a free_lustre_md request for @md to the "free_lustre_md" method
+ * of the obd device behind @exp and return the method's result.
+ */
+static inline int md_free_lustre_md(struct obd_export *exp,
+				    struct lustre_md *md)
+{
+	int rc;
+	ENTRY;
+
+	EXP_CHECK_MD_OP(exp, free_lustre_md);
+	EXP_MD_COUNTER_INCREMENT(exp, free_lustre_md);
+
+	rc = MDP(exp->exp_obd, free_lustre_md)(exp, md);
+	RETURN(rc);
+}
+
+/*
+ * Forward a setxattr request to the "setxattr" method of the obd device
+ * behind @exp.  All arguments, including @suppgid, are handed through
+ * unchanged; the method's return code is passed back.
+ */
+static inline int md_setxattr(struct obd_export *exp,
+			      const struct lu_fid *fid, struct obd_capa *oc,
+			      obd_valid valid, const char *name,
+			      const char *input, int input_size,
+			      int output_size, int flags, __u32 suppgid,
+			      struct ptlrpc_request **request)
+{
+	int rc;
+	ENTRY;
+
+	EXP_CHECK_MD_OP(exp, setxattr);
+	EXP_MD_COUNTER_INCREMENT(exp, setxattr);
+
+	rc = MDP(exp->exp_obd, setxattr)(exp, fid, oc, valid, name, input,
+					 input_size, output_size, flags,
+					 suppgid, request);
+	RETURN(rc);
+}
+
+/*
+ * Forward a getxattr request to the "getxattr" method of the obd device
+ * behind @exp.  All arguments are handed through unchanged; the method's
+ * return code is passed back.
+ */
+static inline int md_getxattr(struct obd_export *exp,
+			      const struct lu_fid *fid, struct obd_capa *oc,
+			      obd_valid valid, const char *name,
+			      const char *input, int input_size,
+			      int output_size, int flags,
+			      struct ptlrpc_request **request)
+{
+	int rc;
+	ENTRY;
+
+	EXP_CHECK_MD_OP(exp, getxattr);
+	EXP_MD_COUNTER_INCREMENT(exp, getxattr);
+
+	rc = MDP(exp->exp_obd, getxattr)(exp, fid, oc, valid, name, input,
+					 input_size, output_size, flags,
+					 request);
+	RETURN(rc);
+}
+
+/*
+ * Forward a set_open_replay_data request (@och, @open_req) to the
+ * same-named method of the obd device behind @exp and return the
+ * method's result.
+ */
+static inline int md_set_open_replay_data(struct obd_export *exp,
+					  struct obd_client_handle *och,
+					  struct ptlrpc_request *open_req)
+{
+	int rc;
+	ENTRY;
+
+	EXP_CHECK_MD_OP(exp, set_open_replay_data);
+	EXP_MD_COUNTER_INCREMENT(exp, set_open_replay_data);
+
+	rc = MDP(exp->exp_obd, set_open_replay_data)(exp, och, open_req);
+	RETURN(rc);
+}
+
+/*
+ * Forward a clear_open_replay_data request for @och to the same-named
+ * method of the obd device behind @exp and return the method's result.
+ */
+static inline int md_clear_open_replay_data(struct obd_export *exp,
+					    struct obd_client_handle *och)
+{
+	int rc;
+	ENTRY;
+
+	EXP_CHECK_MD_OP(exp, clear_open_replay_data);
+	EXP_MD_COUNTER_INCREMENT(exp, clear_open_replay_data);
+
+	rc = MDP(exp->exp_obd, clear_open_replay_data)(exp, och);
+	RETURN(rc);
+}
+
+/*
+ * Forward a set_lock_data request (@lockh, @data, @bits) to the
+ * "set_lock_data" method of the obd device behind @exp and return the
+ * method's result.
+ */
+static inline int md_set_lock_data(struct obd_export *exp,
+				   __u64 *lockh, void *data, __u64 *bits)
+{
+	int rc;
+	ENTRY;
+
+	EXP_CHECK_MD_OP(exp, set_lock_data);
+	EXP_MD_COUNTER_INCREMENT(exp, set_lock_data);
+
+	rc = MDP(exp->exp_obd, set_lock_data)(exp, lockh, data, bits);
+	RETURN(rc);
+}
+
+/*
+ * Forward a cancel_unused request for @fid to the "cancel_unused" method
+ * of the obd device behind @exp.  @policy, @mode, @flags and @opaque are
+ * handed through unchanged; the method's return code is passed back.
+ */
+static inline int md_cancel_unused(struct obd_export *exp,
+				   const struct lu_fid *fid,
+				   ldlm_policy_data_t *policy,
+				   ldlm_mode_t mode,
+				   ldlm_cancel_flags_t flags,
+				   void *opaque)
+{
+	ENTRY;
+	EXP_CHECK_MD_OP(exp, cancel_unused);
+	EXP_MD_COUNTER_INCREMENT(exp, cancel_unused);
+	RETURN(MDP(exp->exp_obd, cancel_unused)(exp, fid, policy, mode, flags,
+						opaque));
+}
+
+/*
+ * Forward a lock_match request to the "lock_match" method of the obd
+ * device behind @exp and return the method's result as an ldlm_mode_t.
+ *
+ * NOTE(review): EXP_CHECK_MD_OP() is defined elsewhere; if it returns an
+ * errno-style value on failure, that value would be delivered through the
+ * ldlm_mode_t return type here -- confirm how callers tell the two apart.
+ */
+static inline ldlm_mode_t md_lock_match(struct obd_export *exp, __u64 flags,
+					const struct lu_fid *fid,
+					ldlm_type_t type,
+					ldlm_policy_data_t *policy,
+					ldlm_mode_t mode,
+					struct lustre_handle *lockh)
+{
+	ENTRY;
+	EXP_CHECK_MD_OP(exp, lock_match);
+	EXP_MD_COUNTER_INCREMENT(exp, lock_match);
+	RETURN(MDP(exp->exp_obd, lock_match)(exp, flags, fid, type,
+					     policy, mode, lockh));
+}
+
+/*
+ * Forward an init_ea_size request (@easize, @def_asize, @cookiesize) to
+ * the "init_ea_size" method of the obd device behind @exp and return the
+ * method's result.
+ */
+static inline int md_init_ea_size(struct obd_export *exp, int easize,
+				  int def_asize, int cookiesize)
+{
+	int rc;
+	ENTRY;
+
+	EXP_CHECK_MD_OP(exp, init_ea_size);
+	EXP_MD_COUNTER_INCREMENT(exp, init_ea_size);
+
+	rc = MDP(exp->exp_obd, init_ea_size)(exp, easize, def_asize,
+					     cookiesize);
+	RETURN(rc);
+}
+
+/*
+ * Forward a get_remote_perm request for @fid to the "get_remote_perm"
+ * method of the obd device behind @exp and return the method's result.
+ */
+static inline int md_get_remote_perm(struct obd_export *exp,
+				     const struct lu_fid *fid,
+				     struct obd_capa *oc, __u32 suppgid,
+				     struct ptlrpc_request **request)
+{
+	int rc;
+	ENTRY;
+
+	EXP_CHECK_MD_OP(exp, get_remote_perm);
+	EXP_MD_COUNTER_INCREMENT(exp, get_remote_perm);
+
+	rc = MDP(exp->exp_obd, get_remote_perm)(exp, fid, oc, suppgid,
+						request);
+	RETURN(rc);
+}
+
+/*
+ * Forward a renew_capa request for @ocapa with callback @cb to the
+ * "renew_capa" method of the obd device behind @exp and return the
+ * method's result.
+ */
+static inline int md_renew_capa(struct obd_export *exp, struct obd_capa *ocapa,
+				renew_capa_cb_t cb)
+{
+	ENTRY;
+	EXP_CHECK_MD_OP(exp, renew_capa);
+	EXP_MD_COUNTER_INCREMENT(exp, renew_capa);
+	RETURN(MDP(exp->exp_obd, renew_capa)(exp, ocapa, cb));
+}
+
+/*
+ * Forward an unpack_capa request (@req, @field, @oc) to the "unpack_capa"
+ * method of the obd device behind @exp and return the method's result.
+ */
+static inline int md_unpack_capa(struct obd_export *exp,
+				 struct ptlrpc_request *req,
+				 const struct req_msg_field *field,
+				 struct obd_capa **oc)
+{
+	ENTRY;
+	EXP_CHECK_MD_OP(exp, unpack_capa);
+	EXP_MD_COUNTER_INCREMENT(exp, unpack_capa);
+	RETURN(MDP(exp->exp_obd, unpack_capa)(exp, req, field, oc));
+}
+
+/*
+ * Forward an intent_getattr_async request (@minfo, @einfo) to the
+ * same-named method of the obd device behind @exp and return the
+ * method's result.
+ */
+static inline int md_intent_getattr_async(struct obd_export *exp,
+					  struct md_enqueue_info *minfo,
+					  struct ldlm_enqueue_info *einfo)
+{
+	ENTRY;
+	EXP_CHECK_MD_OP(exp, intent_getattr_async);
+	EXP_MD_COUNTER_INCREMENT(exp, intent_getattr_async);
+	RETURN(MDP(exp->exp_obd, intent_getattr_async)(exp, minfo, einfo));
+}
+
+/*
+ * Forward a revalidate_lock request (@it, @fid, @bits) to the
+ * "revalidate_lock" method of the obd device behind @exp and return the
+ * method's result.
+ */
+static inline int md_revalidate_lock(struct obd_export *exp,
+				     struct lookup_intent *it,
+				     struct lu_fid *fid, __u64 *bits)
+{
+	ENTRY;
+	EXP_CHECK_MD_OP(exp, revalidate_lock);
+	EXP_MD_COUNTER_INCREMENT(exp, revalidate_lock);
+	RETURN(MDP(exp->exp_obd, revalidate_lock)(exp, it, fid, bits));
+}
+
+
+/* OBD Metadata Support */
+
+extern int obd_init_caches(void);
+extern void obd_cleanup_caches(void);
+
+/* support routines */
+extern struct kmem_cache *obdo_cachep;
+
+/*
+ * Allocate one object for @ptr from the obdo_cachep slab cache by
+ * expanding to OBD_SLAB_ALLOC_PTR_GFP() with the __GFP_IO flag.
+ * Failure reporting is whatever that macro provides (defined elsewhere).
+ */
+#define OBDO_ALLOC(ptr)						       \
+do {									  \
+	OBD_SLAB_ALLOC_PTR_GFP((ptr), obdo_cachep, __GFP_IO);	     \
+} while(0)
+
+/*
+ * Hand @ptr back to the obdo_cachep slab cache by expanding to
+ * OBD_SLAB_FREE_PTR(); the counterpart of OBDO_ALLOC().
+ */
+#define OBDO_FREE(ptr)							\
+do {									  \
+	OBD_SLAB_FREE_PTR((ptr), obdo_cachep);				\
+} while(0)
+
+
+/*
+ * Placeholder: the body is empty, so neither @oa nor @fid is read or
+ * written.  Judging by the name it is meant to derive a fid from an obdo,
+ * but no conversion is implemented here.
+ */
+static inline void obdo2fid(struct obdo *oa, struct lu_fid *fid)
+{
+	/* something here */
+}
+
+/*
+ * Placeholder: the body is empty, so neither @fid nor @oa is read or
+ * written.  Judging by the name it is meant to fill an obdo from a fid,
+ * but no conversion is implemented here.
+ */
+static inline void fid2obdo(struct lu_fid *fid, struct obdo *oa)
+{
+	/* something here */
+}
+
+/* Callback type: takes one opaque data pointer and returns an int. */
+typedef int (*register_lwp_cb)(void *data);
+
+/*
+ * One list entry tying an export pointer slot to a callback.
+ * NOTE(review): the per-field notes below are inferred from the field
+ * names and types only -- confirm against the code that uses this struct.
+ */
+struct lwp_register_item {
+	struct obd_export **lri_exp;	/* presumably where the export is stored */
+	register_lwp_cb	    lri_cb_func;	/* callback of type register_lwp_cb */
+	void		   *lri_cb_data;	/* presumably passed to lri_cb_func */
+	struct list_head	    lri_list;	/* list linkage */
+	char		    lri_name[MTI_NAME_MAXLEN];	/* name buffer */
+};
+
+/* I'm as embarrassed about this as you are.
+ *
+ * <shaver> // XXX do not look into _superhack with remaining eye
+ * <shaver> // XXX if this were any uglier, I'd get my own show on MTV */
+extern int (*ptlrpc_put_connection_superhack)(struct ptlrpc_connection *c);
+
+/* obd_mount.c */
+
+/* sysctl.c */
+extern void obd_sysctl_init (void);
+extern void obd_sysctl_clean (void);
+
+/* uuid.c  */
+typedef __u8 class_uuid_t[16];
+void class_uuid_unparse(class_uuid_t in, struct obd_uuid *out);
+
+/* lustre_peer.c    */
+int lustre_uuid_to_peer(const char *uuid, lnet_nid_t *peer_nid, int index);
+int class_add_uuid(const char *uuid, __u64 nid);
+int class_del_uuid (const char *uuid);
+int class_check_uuid(struct obd_uuid *uuid, __u64 nid);
+void class_init_uuidlist(void);
+void class_exit_uuidlist(void);
+
+/* mea.c */
+int mea_name2idx(struct lmv_stripe_md *mea, const char *name, int namelen);
+int raw_name2idx(int hashtype, int count, const char *name, int namelen);
+
+/* prng.c */
+/*
+ * Fill @uuid_out with sizeof(class_uuid_t) bytes obtained from
+ * cfs_get_random_bytes(); class_uuid_t is a 16-element __u8 array.
+ */
+#define ll_generate_random_uuid(uuid_out) cfs_get_random_bytes(uuid_out, sizeof(class_uuid_t))
+
+#endif /* __LINUX_OBD_CLASS_H */

diff --git a/drivers/staging/lustre/lustre/include/obd_lov.h b/drivers/staging/lustre/lustre/include/obd_lov.h
new file mode 100644
index 0000000..d82f334
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/obd_lov.h

@@ -0,0 +1,126 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _OBD_LOV_H__
+#define _OBD_LOV_H__
+
+#define LOV_DEFAULT_STRIPE_SIZE (1 << LNET_MTU_BITS)
+
+/*
+ * Size in bytes of a struct lov_stripe_md followed by @stripes pointers
+ * to struct lov_oinfo.
+ */
+static inline int lov_stripe_md_size(__u16 stripes)
+{
+	size_t oinfo_ptrs = stripes * sizeof(struct lov_oinfo *);
+
+	return sizeof(struct lov_stripe_md) + oinfo_ptrs;
+}
+
+/*
+ * Size in bytes of a LOV MDS EA holding @stripes entries of
+ * struct lov_ost_data_v1.  The header is struct lov_mds_md_v3 when
+ * @lmm_magic is LOV_MAGIC_V3 and struct lov_mds_md_v1 for any other magic.
+ */
+static inline __u32 lov_mds_md_size(__u16 stripes, __u32 lmm_magic)
+{
+	size_t hdr_size = (lmm_magic == LOV_MAGIC_V3) ?
+			  sizeof(struct lov_mds_md_v3) :
+			  sizeof(struct lov_mds_md_v1);
+
+	return hdr_size + stripes * sizeof(struct lov_ost_data_v1);
+}
+
+/*
+ * Size description of one LOV EA layout version, used as the table row
+ * type in lov_mds_md_stripecnt().
+ */
+struct lov_version_size {
+	__u32   lvs_magic;	/* LOV magic identifying the layout version */
+	size_t  lvs_lmm_size;	/* sizeof the fixed lov_mds_md header */
+	size_t  lvs_lod_size;	/* sizeof one per-stripe lov_ost_data entry */
+};
+
+/*
+ * Compute how many per-stripe entries fit in a LOV EA of @ea_size bytes
+ * laid out according to @lmm_magic.
+ *
+ * Returns 0 when @lmm_magic is neither LOV_MAGIC_V1 nor LOV_MAGIC_V3, or
+ * when @ea_size is negative or no larger than the fixed header of that
+ * layout; otherwise the number of whole struct lov_ost_data_v1 entries
+ * that fit after the header.
+ */
+static inline __u32 lov_mds_md_stripecnt(int ea_size, __u32 lmm_magic)
+{
+	static const struct lov_version_size lmm_ver_size[] = {
+			{ .lvs_magic = LOV_MAGIC_V3,
+			  .lvs_lmm_size = sizeof(struct lov_mds_md_v3),
+			  .lvs_lod_size = sizeof(struct lov_ost_data_v1) },
+			{ .lvs_magic = LOV_MAGIC_V1,
+			  .lvs_lmm_size = sizeof(struct lov_mds_md_v1),
+			  .lvs_lod_size = sizeof(struct lov_ost_data_v1)} };
+	size_t avail;
+	int i;
+
+	/*
+	 * Reject negative sizes before mixing ea_size with the size_t table
+	 * fields: the implicit signed-to-unsigned conversion would otherwise
+	 * turn a negative value into a huge one, defeat the "<=" guard below
+	 * and produce a bogus, very large stripe count.
+	 */
+	if (ea_size < 0)
+		return 0;
+	avail = ea_size;
+
+	for (i = 0; i < ARRAY_SIZE(lmm_ver_size); i++) {
+		if (lmm_magic != lmm_ver_size[i].lvs_magic)
+			continue;
+
+		/* Header alone fills (or overflows) the buffer: no stripes. */
+		if (avail <= lmm_ver_size[i].lvs_lmm_size)
+			return 0;
+		return (avail - lmm_ver_size[i].lvs_lmm_size) /
+			lmm_ver_size[i].lvs_lod_size;
+	}
+
+	/* Invalid LOV magic, so no stripes could fit */
+	return 0;
+}
+
+/* lov_do_div64(a, b) returns a % b, and a = a / b.
+ * The 32-bit code is LOV-specific due to knowing about stripe limits in
+ * order to reduce the divisor to a 32-bit number.  If the divisor is
+ * already a 32-bit value the compiler handles this directly.
+ *
+ * The first argument must be a modifiable lvalue because the quotient is
+ * stored back into it.  In the 32-bit variant both arguments are expanded
+ * more than once, so they must be free of side effects. */
+#if BITS_PER_LONG > 32
+/* Native 64-bit arithmetic: plain % and / on uint64_t. */
+# define lov_do_div64(n,base) ({					\
+	uint64_t __base = (base);					\
+	uint64_t __rem;							\
+	__rem = ((uint64_t)(n)) % __base;				\
+	(n) = ((uint64_t)(n)) / __base;					\
+	__rem;								\
+  })
+#else
+/*
+ * 32-bit build.  When the divisor has bits set above bit 31, both
+ * operands are shifted right by LOV_MIN_STRIPE_BITS before do_div() and
+ * the remainder is rebuilt from the do_div() remainder shifted back plus
+ * the low bits masked off the dividend.  The LASSERTF requires such a
+ * divisor to be a multiple of LOV_MIN_STRIPE_SIZE.
+ */
+# define lov_do_div64(n,base) ({					\
+	uint64_t __rem;							\
+	if ((sizeof(base) > 4) && (((base) & 0xffffffff00000000ULL) != 0)) {  \
+		int __remainder;					      \
+		LASSERTF(!((base) & (LOV_MIN_STRIPE_SIZE - 1)), "64 bit lov " \
+			 "division %llu / %llu\n", (n), (uint64_t)(base));    \
+		__remainder = (n) & (LOV_MIN_STRIPE_SIZE - 1);		\
+		(n) >>= LOV_MIN_STRIPE_BITS;				\
+		__rem = do_div(n, (base) >> LOV_MIN_STRIPE_BITS);	\
+		__rem <<= LOV_MIN_STRIPE_BITS;				\
+		__rem += __remainder;					\
+	} else {							\
+		__rem = do_div(n, base);				\
+	}								\
+	__rem;								\
+  })
+#endif
+
+#define IOC_LOV_TYPE		   'g'
+#define IOC_LOV_MIN_NR		 50
+#define IOC_LOV_SET_OSC_ACTIVE	 _IOWR('g', 50, long)
+#define IOC_LOV_MAX_NR		 50
+
+#define QOS_DEFAULT_THRESHOLD	   10 /* MB */
+#define QOS_DEFAULT_MAXAGE	      5  /* Seconds */
+
+#endif

diff --git a/drivers/staging/lustre/lustre/include/obd_ost.h b/drivers/staging/lustre/lustre/include/obd_ost.h
new file mode 100644
index 0000000..af89843
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/obd_ost.h

@@ -0,0 +1,96 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/obd_ost.h
+ *
+ * Data structures for object storage targets and client: OST & OSC's
+ *
+ * See also lustre_idl.h for wire formats of requests.
+ */
+
+#ifndef _LUSTRE_OST_H
+#define _LUSTRE_OST_H
+
+#include <obd_class.h>
+
+/*
+ * Argument bundle for OSC brw requests.
+ * NOTE(review): this purpose is inferred from the struct and field names
+ * only (brw presumably means bulk read/write, and the struct presumably
+ * serves asynchronous request handling) -- confirm against the osc code.
+ */
+struct osc_brw_async_args {
+	struct obdo       *aa_oa;
+	int		aa_requested_nob;
+	int		aa_nio_count;
+	obd_count	  aa_page_count;
+	int		aa_resends;
+	struct brw_page  **aa_ppga;
+	struct client_obd *aa_cli;
+	struct list_head	 aa_oaps;
+	struct list_head	 aa_exts;
+	struct obd_capa   *aa_ocapa;
+	struct cl_req     *aa_clerq;
+};
+
+/* osc_grant_args is an alias: it expands to osc_brw_async_args. */
+#define osc_grant_args osc_brw_async_args
+/* Argument bundle carrying only a pointer to an obd_info. */
+struct osc_async_args {
+	struct obd_info   *aa_oi;
+};
+
+/*
+ * Argument bundle pairing an obdo with an obd_enqueue_update_f callback
+ * and an opaque cookie.  NOTE(review): the cookie is presumably handed to
+ * sa_upcall on completion of a setattr -- confirm against the osc code.
+ */
+struct osc_setattr_args {
+	struct obdo	 *sa_oa;
+	obd_enqueue_update_f sa_upcall;
+	void		*sa_cookie;
+};
+
+/*
+ * Argument bundle pairing an obd_info with an obd_enqueue_update_f
+ * callback and an opaque cookie.  NOTE(review): the cookie is presumably
+ * handed to fa_upcall on completion of an fsync -- confirm against the
+ * osc code.
+ */
+struct osc_fsync_args {
+	struct obd_info     *fa_oi;
+	obd_enqueue_update_f fa_upcall;
+	void		*fa_cookie;
+};
+
+/*
+ * Argument bundle for an OSC lock enqueue: export, flags, completion
+ * callback with cookie, LVB, lock handle and enqueue info.
+ * NOTE(review): field roles are inferred from names and types only;
+ * oa_agl is a one-bit flag whose meaning is not visible here -- confirm
+ * against the osc code.
+ */
+struct osc_enqueue_args {
+	struct obd_export	*oa_exp;
+	__u64		    *oa_flags;
+	obd_enqueue_update_f      oa_upcall;
+	void		     *oa_cookie;
+	struct ost_lvb	   *oa_lvb;
+	struct lustre_handle     *oa_lockh;
+	struct ldlm_enqueue_info *oa_ei;
+	unsigned int	      oa_agl:1;
+};
+
+#if 0
+int osc_extent_blocking_cb(struct ldlm_lock *lock,
+			   struct ldlm_lock_desc *new, void *data,
+			   int flag);
+#endif
+
+#endif

diff --git a/drivers/staging/lustre/lustre/include/obd_support.h b/drivers/staging/lustre/lustre/include/obd_support.h
new file mode 100644
index 0000000..b5d40af
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/obd_support.h

@@ -0,0 +1,851 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _OBD_SUPPORT
+#define _OBD_SUPPORT
+
+#include <linux/libcfs/libcfs.h>
+#include <lvfs.h>
+#include <lprocfs_status.h>
+
+#include <linux/obd_support.h>
+
+/* global variables */
+extern struct lprocfs_stats *obd_memory;
+/*
+ * Slot indices into the obd_memory stats. When LPROCFS is defined the
+ * obd_memory_*() macros in this header operate on OBD_MEMORY_STAT and the
+ * obd_pages_*() macros on OBD_MEMORY_PAGES_STAT. OBD_STATS_NUM is listed
+ * last, so its value equals the number of slots.
+ */
+enum {
+	OBD_MEMORY_STAT = 0,
+	OBD_MEMORY_PAGES_STAT = 1,
+	OBD_STATS_NUM,
+};
+
+extern unsigned int obd_debug_peer_on_timeout;
+extern unsigned int obd_dump_on_timeout;
+extern unsigned int obd_dump_on_eviction;
+/* obd_timeout should only be used for recovery, not for
+   networking / disk / timings affected by load (use Adaptive Timeouts) */
+extern unsigned int obd_timeout;	  /* seconds */
+extern unsigned int ldlm_timeout;	 /* seconds */
+extern unsigned int obd_timeout_set;
+extern unsigned int ldlm_timeout_set;
+extern unsigned int at_min;
+extern unsigned int at_max;
+extern unsigned int at_history;
+extern int at_early_margin;
+extern int at_extra;
+extern unsigned int obd_sync_filter;
+extern unsigned int obd_max_dirty_pages;
+extern atomic_t obd_dirty_pages;
+extern atomic_t obd_dirty_transit_pages;
+extern unsigned int obd_alloc_fail_rate;
+extern char obd_jobid_var[];
+
+/* lvfs.c */
+/*
+ * Fault-injection hook consulted by the allocation macros in this header.
+ * They call it only when HAS_FAIL_ALLOC_FLAG is true and
+ * obd_alloc_fail_rate is non-zero, passing the fresh pointer, its
+ * stringified expression, a short allocator tag ("km", "slab-"), the size
+ * and the call site. A non-zero return makes the calling macro release the
+ * object and leave the pointer NULL; zero lets the allocation stand.
+ */
+int obd_alloc_fail(const void *ptr, const char *name, const char *type,
+		   size_t size, const char *file, int line);
+
+/* Some hash init argument constants */
+#define HASH_POOLS_BKT_BITS 3
+#define HASH_POOLS_CUR_BITS 3
+#define HASH_POOLS_MAX_BITS 7
+#define HASH_UUID_BKT_BITS 5
+#define HASH_UUID_CUR_BITS 7
+#define HASH_UUID_MAX_BITS 12
+#define HASH_NID_BKT_BITS 5
+#define HASH_NID_CUR_BITS 7
+#define HASH_NID_MAX_BITS 12
+#define HASH_NID_STATS_BKT_BITS 5
+#define HASH_NID_STATS_CUR_BITS 7
+#define HASH_NID_STATS_MAX_BITS 12
+#define HASH_LQE_BKT_BITS 5
+#define HASH_LQE_CUR_BITS 7
+#define HASH_LQE_MAX_BITS 12
+#define HASH_CONN_BKT_BITS 5
+#define HASH_CONN_CUR_BITS 5
+#define HASH_CONN_MAX_BITS 15
+#define HASH_EXP_LOCK_BKT_BITS  5
+#define HASH_EXP_LOCK_CUR_BITS  7
+#define HASH_EXP_LOCK_MAX_BITS  16
+#define HASH_CL_ENV_BKT_BITS    5
+#define HASH_CL_ENV_BITS	10
+#define HASH_JOB_STATS_BKT_BITS 5
+#define HASH_JOB_STATS_CUR_BITS 7
+#define HASH_JOB_STATS_MAX_BITS 12
+
+/* Timeout definitions */
+#define OBD_TIMEOUT_DEFAULT	     100
+#define LDLM_TIMEOUT_DEFAULT	    20
+#define MDS_LDLM_TIMEOUT_DEFAULT	6
+/* Time to wait for all clients to reconnect during recovery (hard limit) */
+#define OBD_RECOVERY_TIME_HARD	  (obd_timeout * 9)
+/* Time to wait for all clients to reconnect during recovery (soft limit) */
+/* Should be very conservative; must catch the first reconnect after reboot */
+#define OBD_RECOVERY_TIME_SOFT	  (obd_timeout * 3)
+/* Change recovery-small 26b time if you change this */
+#define PING_INTERVAL max(obd_timeout / 4, 1U)
+/* a bit more than maximal journal commit time in seconds */
+#define PING_INTERVAL_SHORT min(PING_INTERVAL, 7U)
+/* Client may skip 1 ping; we must wait at least 2.5. But for multiple
+ * failover targets the client only pings one server at a time, and pings
+ * can be lost on a loaded network. Since eviction has serious consequences,
+ * and there's no urgent need to evict a client just because it's idle, we
+ * should be very conservative here. */
+#define PING_EVICT_TIMEOUT (PING_INTERVAL * 6)
+#define DISK_TIMEOUT 50	  /* Beyond this we warn about disk speed */
+#define CONNECTION_SWITCH_MIN 5U /* Connection switching rate limiter */
+ /* Max connect interval for nonresponsive servers; ~50s to avoid building up
+    connect requests in the LND queues, but within obd_timeout so we don't
+    miss the recovery window */
+#define CONNECTION_SWITCH_MAX min(50U, max(CONNECTION_SWITCH_MIN,obd_timeout))
+#define CONNECTION_SWITCH_INC 5  /* Connection timeout backoff */
+/* In general this should be low to have quick detection of a system
+   running on a backup server. (If it's too low, import_select_connection
+   will increase the timeout anyhow.)  */
+#define INITIAL_CONNECT_TIMEOUT max(CONNECTION_SWITCH_MIN,obd_timeout/20)
+/* The max delay between connects is SWITCH_MAX + SWITCH_INC + INITIAL */
+#define RECONNECT_DELAY_MAX (CONNECTION_SWITCH_MAX + CONNECTION_SWITCH_INC + \
+			     INITIAL_CONNECT_TIMEOUT)
+/* The min time a target should wait for clients to reconnect in recovery */
+#define OBD_RECOVERY_TIME_MIN    (2*RECONNECT_DELAY_MAX)
+#define OBD_IR_FACTOR_MIN	 1
+#define OBD_IR_FACTOR_MAX	 10
+#define OBD_IR_FACTOR_DEFAULT    (OBD_IR_FACTOR_MAX/2)
+/* default timeout for the MGS to become IR_FULL */
+#define OBD_IR_MGS_TIMEOUT       (4*obd_timeout)
+#define LONG_UNLINK 300	  /* Unlink should happen before now */
+
+/**
+ * Time interval of shrink, if the client is "idle" more than this interval,
+ * then the ll_grant thread will return the requested grant space to filter
+ */
+#define GRANT_SHRINK_INTERVAL	    1200/*20 minutes*/
+
+#define OBD_FAIL_MDS		     0x100
+#define OBD_FAIL_MDS_HANDLE_UNPACK       0x101
+#define OBD_FAIL_MDS_GETATTR_NET	 0x102
+#define OBD_FAIL_MDS_GETATTR_PACK	0x103
+#define OBD_FAIL_MDS_READPAGE_NET	0x104
+#define OBD_FAIL_MDS_READPAGE_PACK       0x105
+#define OBD_FAIL_MDS_SENDPAGE	    0x106
+#define OBD_FAIL_MDS_REINT_NET	   0x107
+#define OBD_FAIL_MDS_REINT_UNPACK	0x108
+#define OBD_FAIL_MDS_REINT_SETATTR       0x109
+#define OBD_FAIL_MDS_REINT_SETATTR_WRITE 0x10a
+#define OBD_FAIL_MDS_REINT_CREATE	0x10b
+#define OBD_FAIL_MDS_REINT_CREATE_WRITE  0x10c
+#define OBD_FAIL_MDS_REINT_UNLINK	0x10d
+#define OBD_FAIL_MDS_REINT_UNLINK_WRITE  0x10e
+#define OBD_FAIL_MDS_REINT_LINK	  0x10f
+#define OBD_FAIL_MDS_REINT_LINK_WRITE    0x110
+#define OBD_FAIL_MDS_REINT_RENAME	0x111
+#define OBD_FAIL_MDS_REINT_RENAME_WRITE  0x112
+#define OBD_FAIL_MDS_OPEN_NET	    0x113
+#define OBD_FAIL_MDS_OPEN_PACK	   0x114
+#define OBD_FAIL_MDS_CLOSE_NET	   0x115
+#define OBD_FAIL_MDS_CLOSE_PACK	  0x116
+#define OBD_FAIL_MDS_CONNECT_NET	 0x117
+#define OBD_FAIL_MDS_CONNECT_PACK	0x118
+#define OBD_FAIL_MDS_REINT_NET_REP       0x119
+#define OBD_FAIL_MDS_DISCONNECT_NET      0x11a
+#define OBD_FAIL_MDS_GETSTATUS_NET       0x11b
+#define OBD_FAIL_MDS_GETSTATUS_PACK      0x11c
+#define OBD_FAIL_MDS_STATFS_PACK	 0x11d
+#define OBD_FAIL_MDS_STATFS_NET	  0x11e
+#define OBD_FAIL_MDS_GETATTR_NAME_NET    0x11f
+#define OBD_FAIL_MDS_PIN_NET	     0x120
+#define OBD_FAIL_MDS_UNPIN_NET	   0x121
+#define OBD_FAIL_MDS_ALL_REPLY_NET       0x122
+#define OBD_FAIL_MDS_ALL_REQUEST_NET     0x123
+#define OBD_FAIL_MDS_SYNC_NET	    0x124
+#define OBD_FAIL_MDS_SYNC_PACK	   0x125
+#define OBD_FAIL_MDS_DONE_WRITING_NET    0x126
+#define OBD_FAIL_MDS_DONE_WRITING_PACK   0x127
+#define OBD_FAIL_MDS_ALLOC_OBDO	  0x128
+#define OBD_FAIL_MDS_PAUSE_OPEN	  0x129
+#define OBD_FAIL_MDS_STATFS_LCW_SLEEP    0x12a
+#define OBD_FAIL_MDS_OPEN_CREATE	 0x12b
+#define OBD_FAIL_MDS_OST_SETATTR	 0x12c
+#define OBD_FAIL_MDS_QUOTACHECK_NET      0x12d
+#define OBD_FAIL_MDS_QUOTACTL_NET	0x12e
+#define OBD_FAIL_MDS_CLIENT_ADD	  0x12f
+#define OBD_FAIL_MDS_GETXATTR_NET	0x130
+#define OBD_FAIL_MDS_GETXATTR_PACK       0x131
+#define OBD_FAIL_MDS_SETXATTR_NET	0x132
+#define OBD_FAIL_MDS_SETXATTR	    0x133
+#define OBD_FAIL_MDS_SETXATTR_WRITE      0x134
+#define OBD_FAIL_MDS_FS_SETUP	    0x135
+#define OBD_FAIL_MDS_RESEND	      0x136
+#define OBD_FAIL_MDS_LLOG_CREATE_FAILED  0x137
+#define OBD_FAIL_MDS_LOV_SYNC_RACE       0x138
+#define OBD_FAIL_MDS_OSC_PRECREATE       0x139
+#define OBD_FAIL_MDS_LLOG_SYNC_TIMEOUT   0x13a
+#define OBD_FAIL_MDS_CLOSE_NET_REP       0x13b
+#define OBD_FAIL_MDS_BLOCK_QUOTA_REQ     0x13c
+#define OBD_FAIL_MDS_DROP_QUOTA_REQ      0x13d
+#define OBD_FAIL_MDS_REMOVE_COMMON_EA    0x13e
+#define OBD_FAIL_MDS_ALLOW_COMMON_EA_SETTING   0x13f
+#define OBD_FAIL_MDS_FAIL_LOV_LOG_ADD    0x140
+#define OBD_FAIL_MDS_LOV_PREP_CREATE     0x141
+#define OBD_FAIL_MDS_REINT_DELAY	 0x142
+#define OBD_FAIL_MDS_READLINK_EPROTO     0x143
+#define OBD_FAIL_MDS_OPEN_WAIT_CREATE    0x144
+#define OBD_FAIL_MDS_PDO_LOCK	    0x145
+#define OBD_FAIL_MDS_PDO_LOCK2	   0x146
+#define OBD_FAIL_MDS_OSC_CREATE_FAIL     0x147
+#define OBD_FAIL_MDS_NEGATIVE_POSITIVE	 0x148
+#define OBD_FAIL_MDS_HSM_STATE_GET_NET		0x149
+#define OBD_FAIL_MDS_HSM_STATE_SET_NET		0x14a
+#define OBD_FAIL_MDS_HSM_PROGRESS_NET		0x14b
+#define OBD_FAIL_MDS_HSM_REQUEST_NET		0x14c
+#define OBD_FAIL_MDS_HSM_CT_REGISTER_NET	0x14d
+#define OBD_FAIL_MDS_HSM_CT_UNREGISTER_NET	0x14e
+#define OBD_FAIL_MDS_SWAP_LAYOUTS_NET		0x14f
+#define OBD_FAIL_MDS_HSM_ACTION_NET		0x150
+#define OBD_FAIL_MDS_CHANGELOG_INIT		0x151
+
+/* layout lock */
+#define OBD_FAIL_MDS_NO_LL_GETATTR	 0x170
+#define OBD_FAIL_MDS_NO_LL_OPEN		 0x171
+#define OBD_FAIL_MDS_LL_BLOCK		 0x172
+
+/* CMD */
+#define OBD_FAIL_MDS_IS_SUBDIR_NET       0x180
+#define OBD_FAIL_MDS_IS_SUBDIR_PACK      0x181
+#define OBD_FAIL_MDS_SET_INFO_NET	0x182
+#define OBD_FAIL_MDS_WRITEPAGE_NET       0x183
+#define OBD_FAIL_MDS_WRITEPAGE_PACK      0x184
+#define OBD_FAIL_MDS_RECOVERY_ACCEPTS_GAPS 0x185
+#define OBD_FAIL_MDS_GET_INFO_NET	0x186
+#define OBD_FAIL_MDS_DQACQ_NET	   0x187
+
+/* OI scrub */
+#define OBD_FAIL_OSD_SCRUB_DELAY			0x190
+#define OBD_FAIL_OSD_SCRUB_CRASH			0x191
+#define OBD_FAIL_OSD_SCRUB_FATAL			0x192
+#define OBD_FAIL_OSD_FID_MAPPING			0x193
+#define OBD_FAIL_OSD_LMA_INCOMPAT			0x194
+
+#define OBD_FAIL_OST		     0x200
+#define OBD_FAIL_OST_CONNECT_NET	 0x201
+#define OBD_FAIL_OST_DISCONNECT_NET      0x202
+#define OBD_FAIL_OST_GET_INFO_NET	0x203
+#define OBD_FAIL_OST_CREATE_NET	  0x204
+#define OBD_FAIL_OST_DESTROY_NET	 0x205
+#define OBD_FAIL_OST_GETATTR_NET	 0x206
+#define OBD_FAIL_OST_SETATTR_NET	 0x207
+#define OBD_FAIL_OST_OPEN_NET	    0x208
+#define OBD_FAIL_OST_CLOSE_NET	   0x209
+#define OBD_FAIL_OST_BRW_NET	     0x20a
+#define OBD_FAIL_OST_PUNCH_NET	   0x20b
+#define OBD_FAIL_OST_STATFS_NET	  0x20c
+#define OBD_FAIL_OST_HANDLE_UNPACK       0x20d
+#define OBD_FAIL_OST_BRW_WRITE_BULK      0x20e
+#define OBD_FAIL_OST_BRW_READ_BULK       0x20f
+#define OBD_FAIL_OST_SYNC_NET	    0x210
+#define OBD_FAIL_OST_ALL_REPLY_NET       0x211
+#define OBD_FAIL_OST_ALL_REQUEST_NET     0x212
+#define OBD_FAIL_OST_LDLM_REPLY_NET      0x213
+#define OBD_FAIL_OST_BRW_PAUSE_BULK      0x214
+#define OBD_FAIL_OST_ENOSPC	      0x215
+#define OBD_FAIL_OST_EROFS	       0x216
+#define OBD_FAIL_OST_ENOENT	      0x217
+#define OBD_FAIL_OST_QUOTACHECK_NET      0x218
+#define OBD_FAIL_OST_QUOTACTL_NET	0x219
+#define OBD_FAIL_OST_CHECKSUM_RECEIVE    0x21a
+#define OBD_FAIL_OST_CHECKSUM_SEND       0x21b
+#define OBD_FAIL_OST_BRW_SIZE	    0x21c
+#define OBD_FAIL_OST_DROP_REQ	    0x21d
+#define OBD_FAIL_OST_SETATTR_CREDITS     0x21e
+#define OBD_FAIL_OST_HOLD_WRITE_RPC      0x21f
+#define OBD_FAIL_OST_BRW_WRITE_BULK2     0x220
+#define OBD_FAIL_OST_LLOG_RECOVERY_TIMEOUT 0x221
+#define OBD_FAIL_OST_CANCEL_COOKIE_TIMEOUT 0x222
+#define OBD_FAIL_OST_PAUSE_CREATE	0x223
+#define OBD_FAIL_OST_BRW_PAUSE_PACK      0x224
+#define OBD_FAIL_OST_CONNECT_NET2	0x225
+#define OBD_FAIL_OST_NOMEM	       0x226
+#define OBD_FAIL_OST_BRW_PAUSE_BULK2     0x227
+#define OBD_FAIL_OST_MAPBLK_ENOSPC       0x228
+#define OBD_FAIL_OST_ENOINO	      0x229
+#define OBD_FAIL_OST_DQACQ_NET	   0x230
+#define OBD_FAIL_OST_STATFS_EINPROGRESS  0x231
+
+#define OBD_FAIL_LDLM		    0x300
+#define OBD_FAIL_LDLM_NAMESPACE_NEW      0x301
+#define OBD_FAIL_LDLM_ENQUEUE_NET			0x302
+#define OBD_FAIL_LDLM_CONVERT_NET			0x303
+#define OBD_FAIL_LDLM_CANCEL_NET			0x304
+#define OBD_FAIL_LDLM_BL_CALLBACK_NET			0x305
+#define OBD_FAIL_LDLM_CP_CALLBACK_NET			0x306
+#define OBD_FAIL_LDLM_GL_CALLBACK_NET			0x307
+#define OBD_FAIL_LDLM_ENQUEUE_EXTENT_ERR 0x308
+#define OBD_FAIL_LDLM_ENQUEUE_INTENT_ERR 0x309
+#define OBD_FAIL_LDLM_CREATE_RESOURCE    0x30a
+#define OBD_FAIL_LDLM_ENQUEUE_BLOCKED    0x30b
+#define OBD_FAIL_LDLM_REPLY	      0x30c
+#define OBD_FAIL_LDLM_RECOV_CLIENTS      0x30d
+#define OBD_FAIL_LDLM_ENQUEUE_OLD_EXPORT 0x30e
+#define OBD_FAIL_LDLM_GLIMPSE	    0x30f
+#define OBD_FAIL_LDLM_CANCEL_RACE	0x310
+#define OBD_FAIL_LDLM_CANCEL_EVICT_RACE  0x311
+#define OBD_FAIL_LDLM_PAUSE_CANCEL       0x312
+#define OBD_FAIL_LDLM_CLOSE_THREAD       0x313
+#define OBD_FAIL_LDLM_CANCEL_BL_CB_RACE  0x314
+#define OBD_FAIL_LDLM_CP_CB_WAIT	 0x315
+#define OBD_FAIL_LDLM_OST_FAIL_RACE      0x316
+#define OBD_FAIL_LDLM_INTR_CP_AST	0x317
+#define OBD_FAIL_LDLM_CP_BL_RACE	 0x318
+#define OBD_FAIL_LDLM_NEW_LOCK	   0x319
+#define OBD_FAIL_LDLM_AGL_DELAY	  0x31a
+#define OBD_FAIL_LDLM_AGL_NOLOCK	 0x31b
+#define OBD_FAIL_LDLM_OST_LVB		 0x31c
+
+/* LOCKLESS IO */
+#define OBD_FAIL_LDLM_SET_CONTENTION     0x385
+
+#define OBD_FAIL_OSC		     0x400
+#define OBD_FAIL_OSC_BRW_READ_BULK       0x401
+#define OBD_FAIL_OSC_BRW_WRITE_BULK      0x402
+#define OBD_FAIL_OSC_LOCK_BL_AST	 0x403
+#define OBD_FAIL_OSC_LOCK_CP_AST	 0x404
+#define OBD_FAIL_OSC_MATCH	       0x405
+#define OBD_FAIL_OSC_BRW_PREP_REQ	0x406
+#define OBD_FAIL_OSC_SHUTDOWN	    0x407
+#define OBD_FAIL_OSC_CHECKSUM_RECEIVE    0x408
+#define OBD_FAIL_OSC_CHECKSUM_SEND       0x409
+#define OBD_FAIL_OSC_BRW_PREP_REQ2       0x40a
+#define OBD_FAIL_OSC_CONNECT_CKSUM       0x40b
+#define OBD_FAIL_OSC_CKSUM_ADLER_ONLY    0x40c
+#define OBD_FAIL_OSC_DIO_PAUSE	   0x40d
+#define OBD_FAIL_OSC_OBJECT_CONTENTION   0x40e
+#define OBD_FAIL_OSC_CP_CANCEL_RACE      0x40f
+#define OBD_FAIL_OSC_CP_ENQ_RACE	 0x410
+#define OBD_FAIL_OSC_NO_GRANT	    0x411
+#define OBD_FAIL_OSC_DELAY_SETTIME	 0x412
+
+#define OBD_FAIL_PTLRPC		  0x500
+#define OBD_FAIL_PTLRPC_ACK	      0x501
+#define OBD_FAIL_PTLRPC_RQBD	     0x502
+#define OBD_FAIL_PTLRPC_BULK_GET_NET     0x503
+#define OBD_FAIL_PTLRPC_BULK_PUT_NET     0x504
+#define OBD_FAIL_PTLRPC_DROP_RPC	 0x505
+#define OBD_FAIL_PTLRPC_DELAY_SEND       0x506
+#define OBD_FAIL_PTLRPC_DELAY_RECOV      0x507
+#define OBD_FAIL_PTLRPC_CLIENT_BULK_CB   0x508
+#define OBD_FAIL_PTLRPC_PAUSE_REQ	0x50a
+#define OBD_FAIL_PTLRPC_PAUSE_REP	0x50c
+#define OBD_FAIL_PTLRPC_IMP_DEACTIVE     0x50d
+#define OBD_FAIL_PTLRPC_DUMP_LOG	 0x50e
+#define OBD_FAIL_PTLRPC_LONG_REPL_UNLINK 0x50f
+#define OBD_FAIL_PTLRPC_LONG_BULK_UNLINK 0x510
+#define OBD_FAIL_PTLRPC_HPREQ_TIMEOUT    0x511
+#define OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT  0x512
+#define OBD_FAIL_PTLRPC_DROP_REQ_OPC     0x513
+#define OBD_FAIL_PTLRPC_FINISH_REPLAY    0x514
+#define OBD_FAIL_PTLRPC_CLIENT_BULK_CB2  0x515
+#define OBD_FAIL_PTLRPC_DELAY_IMP_FULL   0x516
+#define OBD_FAIL_PTLRPC_CANCEL_RESEND    0x517
+
+#define OBD_FAIL_OBD_PING_NET	    0x600
+#define OBD_FAIL_OBD_LOG_CANCEL_NET      0x601
+#define OBD_FAIL_OBD_LOGD_NET	    0x602
+#define OBD_FAIL_OBD_QC_CALLBACK_NET     0x603
+#define OBD_FAIL_OBD_DQACQ	       0x604
+#define OBD_FAIL_OBD_LLOG_SETUP	  0x605
+#define OBD_FAIL_OBD_LOG_CANCEL_REP      0x606
+#define OBD_FAIL_OBD_IDX_READ_NET	0x607
+#define OBD_FAIL_OBD_IDX_READ_BREAK	 0x608
+#define OBD_FAIL_OBD_NO_LRU		 0x609
+
+#define OBD_FAIL_TGT_REPLY_NET	   0x700
+#define OBD_FAIL_TGT_CONN_RACE	   0x701
+#define OBD_FAIL_TGT_FORCE_RECONNECT     0x702
+#define OBD_FAIL_TGT_DELAY_CONNECT       0x703
+#define OBD_FAIL_TGT_DELAY_RECONNECT     0x704
+#define OBD_FAIL_TGT_DELAY_PRECREATE     0x705
+#define OBD_FAIL_TGT_TOOMANY_THREADS     0x706
+#define OBD_FAIL_TGT_REPLAY_DROP	 0x707
+#define OBD_FAIL_TGT_FAKE_EXP	    0x708
+#define OBD_FAIL_TGT_REPLAY_DELAY	0x709
+#define OBD_FAIL_TGT_LAST_REPLAY	 0x710
+#define OBD_FAIL_TGT_CLIENT_ADD	  0x711
+#define OBD_FAIL_TGT_RCVG_FLAG	   0x712
+
+#define OBD_FAIL_MDC_REVALIDATE_PAUSE    0x800
+#define OBD_FAIL_MDC_ENQUEUE_PAUSE       0x801
+#define OBD_FAIL_MDC_OLD_EXT_FLAGS       0x802
+#define OBD_FAIL_MDC_GETATTR_ENQUEUE     0x803
+#define OBD_FAIL_MDC_RPCS_SEM		 0x804
+#define OBD_FAIL_MDC_LIGHTWEIGHT	 0x805
+
+#define OBD_FAIL_MGS		     0x900
+#define OBD_FAIL_MGS_ALL_REQUEST_NET     0x901
+#define OBD_FAIL_MGS_ALL_REPLY_NET       0x902
+#define OBD_FAIL_MGC_PAUSE_PROCESS_LOG   0x903
+#define OBD_FAIL_MGS_PAUSE_REQ	   0x904
+#define OBD_FAIL_MGS_PAUSE_TARGET_REG    0x905
+
+#define OBD_FAIL_QUOTA_DQACQ_NET			0xA01
+#define OBD_FAIL_QUOTA_EDQUOT	    0xA02
+#define OBD_FAIL_QUOTA_DELAY_REINT       0xA03
+#define OBD_FAIL_QUOTA_RECOVERABLE_ERR   0xA04
+
+#define OBD_FAIL_LPROC_REMOVE	    0xB00
+
+#define OBD_FAIL_GENERAL_ALLOC	   0xC00
+
+#define OBD_FAIL_SEQ		     0x1000
+#define OBD_FAIL_SEQ_QUERY_NET	   0x1001
+#define OBD_FAIL_SEQ_EXHAUST		 0x1002
+
+#define OBD_FAIL_FLD		     0x1100
+#define OBD_FAIL_FLD_QUERY_NET	   0x1101
+
+#define OBD_FAIL_SEC_CTX		 0x1200
+#define OBD_FAIL_SEC_CTX_INIT_NET	0x1201
+#define OBD_FAIL_SEC_CTX_INIT_CONT_NET   0x1202
+#define OBD_FAIL_SEC_CTX_FINI_NET	0x1203
+#define OBD_FAIL_SEC_CTX_HDL_PAUSE       0x1204
+
+#define OBD_FAIL_LLOG			       0x1300
+#define OBD_FAIL_LLOG_ORIGIN_CONNECT_NET	    0x1301
+#define OBD_FAIL_LLOG_ORIGIN_HANDLE_CREATE_NET      0x1302
+#define OBD_FAIL_LLOG_ORIGIN_HANDLE_DESTROY_NET     0x1303
+#define OBD_FAIL_LLOG_ORIGIN_HANDLE_READ_HEADER_NET 0x1304
+#define OBD_FAIL_LLOG_ORIGIN_HANDLE_NEXT_BLOCK_NET  0x1305
+#define OBD_FAIL_LLOG_ORIGIN_HANDLE_PREV_BLOCK_NET  0x1306
+#define OBD_FAIL_LLOG_ORIGIN_HANDLE_WRITE_REC_NET   0x1307
+#define OBD_FAIL_LLOG_ORIGIN_HANDLE_CLOSE_NET       0x1308
+#define OBD_FAIL_LLOG_CATINFO_NET		   0x1309
+#define OBD_FAIL_MDS_SYNC_CAPA_SL		   0x1310
+#define OBD_FAIL_SEQ_ALLOC			  0x1311
+
+#define OBD_FAIL_LLITE			      0x1400
+#define OBD_FAIL_LLITE_FAULT_TRUNC_RACE	     0x1401
+#define OBD_FAIL_LOCK_STATE_WAIT_INTR	       0x1402
+#define OBD_FAIL_LOV_INIT			    0x1403
+#define OBD_FAIL_GLIMPSE_DELAY			    0x1404
+
+#define OBD_FAIL_FID_INDIR	0x1501
+#define OBD_FAIL_FID_INLMA	0x1502
+#define OBD_FAIL_FID_IGIF	0x1504
+#define OBD_FAIL_FID_LOOKUP	0x1505
+#define OBD_FAIL_FID_NOLMA	0x1506
+
+/* LFSCK */
+#define OBD_FAIL_LFSCK_DELAY1		0x1600
+#define OBD_FAIL_LFSCK_DELAY2		0x1601
+#define OBD_FAIL_LFSCK_DELAY3		0x1602
+#define OBD_FAIL_LFSCK_LINKEA_CRASH	0x1603
+#define OBD_FAIL_LFSCK_LINKEA_MORE	0x1604
+#define OBD_FAIL_LFSCK_FATAL1		0x1608
+#define OBD_FAIL_LFSCK_FATAL2		0x1609
+#define OBD_FAIL_LFSCK_CRASH		0x160a
+#define OBD_FAIL_LFSCK_NO_AUTO		0x160b
+#define OBD_FAIL_LFSCK_NO_DOUBLESCAN	0x160c
+
+/* UPDATE */
+#define OBD_FAIL_UPDATE_OBJ_NET			0x1700
+#define OBD_FAIL_UPDATE_OBJ_NET_REP		0x1701
+
+
+/* Assign references to moved code to reduce code changes */
+#define OBD_FAIL_PRECHECK(id)		   CFS_FAIL_PRECHECK(id)
+#define OBD_FAIL_CHECK(id)		      CFS_FAIL_CHECK(id)
+#define OBD_FAIL_CHECK_VALUE(id, value)	 CFS_FAIL_CHECK_VALUE(id, value)
+#define OBD_FAIL_CHECK_ORSET(id, value)	 CFS_FAIL_CHECK_ORSET(id, value)
+#define OBD_FAIL_CHECK_RESET(id, value)	 CFS_FAIL_CHECK_RESET(id, value)
+#define OBD_FAIL_RETURN(id, ret)		CFS_FAIL_RETURN(id, ret)
+#define OBD_FAIL_TIMEOUT(id, secs)	      CFS_FAIL_TIMEOUT(id, secs)
+#define OBD_FAIL_TIMEOUT_MS(id, ms)	     CFS_FAIL_TIMEOUT_MS(id, ms)
+#define OBD_FAIL_TIMEOUT_ORSET(id, value, secs) CFS_FAIL_TIMEOUT_ORSET(id, value, secs)
+#define OBD_RACE(id)			    CFS_RACE(id)
+#define OBD_FAIL_ONCE			   CFS_FAIL_ONCE
+#define OBD_FAILED			      CFS_FAILED
+
+extern atomic_t libcfs_kmemory;
+
+#ifdef LPROCFS
+/*
+ * LPROCFS flavour of the memory accounting helpers: each one forwards to
+ * the lprocfs counter API on the obd_memory stats. The *_add/*_sub forms
+ * adjust the OBD_MEMORY_STAT (bytes) or OBD_MEMORY_PAGES_STAT (pages) slot;
+ * the *_sum forms read the slot back with LPROCFS_FIELDS_FLAGS_SUM.
+ * The page helpers take an allocation order and account 1 << order pages.
+ */
+#define obd_memory_add(size)						  \
+	lprocfs_counter_add(obd_memory, OBD_MEMORY_STAT, (long)(size))
+#define obd_memory_sub(size)						  \
+	lprocfs_counter_sub(obd_memory, OBD_MEMORY_STAT, (long)(size))
+#define obd_memory_sum()						      \
+	lprocfs_stats_collector(obd_memory, OBD_MEMORY_STAT,		  \
+				LPROCFS_FIELDS_FLAGS_SUM)
+#define obd_pages_add(order)						  \
+	lprocfs_counter_add(obd_memory, OBD_MEMORY_PAGES_STAT,		\
+			    (long)(1 << (order)))
+#define obd_pages_sub(order)						  \
+	lprocfs_counter_sub(obd_memory, OBD_MEMORY_PAGES_STAT,		\
+			    (long)(1 << (order)))
+#define obd_pages_sum()						       \
+	lprocfs_stats_collector(obd_memory, OBD_MEMORY_PAGES_STAT,	    \
+				LPROCFS_FIELDS_FLAGS_SUM)
+
+extern void obd_update_maxusage(void);
+extern __u64 obd_memory_max(void);
+extern __u64 obd_pages_max(void);
+
+#else
+
+extern __u64 obd_alloc;
+extern __u64 obd_pages;
+
+extern __u64 obd_max_alloc;
+extern __u64 obd_max_pages;
+
+/*
+ * Non-LPROCFS accounting: add @size bytes to the running total obd_alloc
+ * and lift the high-water mark obd_max_alloc when the total passes it.
+ */
+static inline void obd_memory_add(long size)
+{
+	obd_alloc = obd_alloc + size;
+
+	if (obd_max_alloc < obd_alloc)
+		obd_max_alloc = obd_alloc;
+}
+
+/* Non-LPROCFS accounting: take @size bytes off the running total obd_alloc. */
+static inline void obd_memory_sub(long size)
+{
+	obd_alloc = obd_alloc - size;
+}
+
+/*
+ * Non-LPROCFS accounting: add 2^@order pages to the running total obd_pages
+ * and lift the high-water mark obd_max_pages when the total passes it.
+ */
+static inline void obd_pages_add(int order)
+{
+	obd_pages = obd_pages + (1 << order);
+
+	if (obd_max_pages < obd_pages)
+		obd_max_pages = obd_pages;
+}
+
+/* Non-LPROCFS accounting: take 2^@order pages off the running total obd_pages. */
+static inline void obd_pages_sub(int order)
+{
+	obd_pages = obd_pages - (1 << order);
+}
+
+#define obd_memory_sum() (obd_alloc)
+#define obd_pages_sum()  (obd_pages)
+
+#define obd_memory_max() (obd_max_alloc)
+#define obd_pages_max() (obd_max_pages)
+
+#endif
+
+#define OBD_DEBUG_MEMUSAGE (1)
+
+#if OBD_DEBUG_MEMUSAGE
+/*
+ * Book-keeping performed after a successful allocation: add @size to the
+ * Lustre memory total and emit a D_MALLOC debug line. @name must be a
+ * string literal; it is pasted in front of the stringified @ptr expression.
+ *
+ * The two statements are wrapped in do { } while (0) so they stay together
+ * when the macro is used as the body of an unbraced if/else.
+ */
+#define OBD_ALLOC_POST(ptr, size, name)					\
+do {									\
+	obd_memory_add(size);						\
+	CDEBUG(D_MALLOC, name " '" #ptr "': %d at %p.\n",		\
+	       (int)(size), ptr);					\
+} while (0)
+
+/*
+ * Book-keeping performed just before an object is released: assert that
+ * @ptr is non-NULL, subtract @size from the Lustre memory total, emit a
+ * D_MALLOC debug line and overwrite the object with 0x5a through POISON()
+ * (a no-op when CONFIG_DEBUG_SLAB is set). @name must be a string literal.
+ *
+ * The statements are wrapped in do { } while (0) so they stay together when
+ * the macro is used as the body of an unbraced if/else.
+ */
+#define OBD_FREE_PRE(ptr, size, name)					\
+do {									\
+	LASSERT(ptr);							\
+	obd_memory_sub(size);						\
+	CDEBUG(D_MALLOC, name " '" #ptr "': %d at %p.\n",		\
+	       (int)(size), ptr);					\
+	POISON(ptr, 0x5a, size);					\
+} while (0)
+
+#else /* !OBD_DEBUG_MEMUSAGE */
+
+#define OBD_ALLOC_POST(ptr, size, name) ((void)0)
+#define OBD_FREE_PRE(ptr, size, name)   ((void)0)
+
+#endif /* !OBD_DEBUG_MEMUSAGE */
+
+#define HAS_FAIL_ALLOC_FLAG OBD_FAIL_CHECK(OBD_FAIL_GENERAL_ALLOC)
+
+#define OBD_ALLOC_FAIL_BITS 24
+#define OBD_ALLOC_FAIL_MASK ((1 << OBD_ALLOC_FAIL_BITS) - 1)
+#define OBD_ALLOC_FAIL_MULT (OBD_ALLOC_FAIL_MASK / 100)
+
+#if defined(LUSTRE_UTILS) /* this version is for utils only */
+#define __OBD_MALLOC_VERBOSE(ptr, cptab, cpt, size, flags)		      \
+do {									      \
+	(ptr) = (cptab) == NULL ?					      \
+		kmalloc(size, flags) :				      \
+		kmalloc_node(size, flags, cfs_cpt_spread_node(cptab, cpt));   \
+	if (unlikely((ptr) == NULL)) {					\
+		CERROR("kmalloc of '" #ptr "' (%d bytes) failed at %s:%d\n",  \
+		       (int)(size), __FILE__, __LINE__);		      \
+	} else {							      \
+		memset(ptr, 0, size);					      \
+		CDEBUG(D_MALLOC, "kmalloced '" #ptr "': %d at %p\n",	      \
+		       (int)(size), ptr);				      \
+	}								      \
+} while (0)
+
+#else /* this version is for the kernel and liblustre */
+/*
+ * kfree() @ptr, reset it to NULL and evaluate to 0. The constant 0 result
+ * lets the macro sit at the end of an || chain: it undoes the allocation
+ * and makes the whole condition false in one step.
+ */
+#define OBD_FREE_RTN0(ptr)						    \
+({									    \
+	kfree(ptr);							\
+	(ptr) = NULL;							 \
+	0;								    \
+})
+
+/*
+ * Zero-filled kmalloc() with optional CPT node spreading and fault
+ * injection.
+ *
+ * @ptr   lvalue that receives the allocation (NULL on failure)
+ * @cptab CPT table; when NULL plain kmalloc() is used, otherwise
+ *        kmalloc_node() on the node chosen by cfs_cpt_spread_node()
+ * @cpt   partition handed to cfs_cpt_spread_node()
+ * @size  bytes to allocate
+ * @flags GFP mask; __GFP_ZERO is always OR-ed in. The argument is
+ *        parenthesized so that a conditional or other low-precedence
+ *        expression keeps its meaning.
+ *
+ * The allocation counts as successful when @ptr is non-NULL and either
+ * fault injection is off (no fail flag, or a zero obd_alloc_fail_rate) or
+ * obd_alloc_fail() returns 0. If obd_alloc_fail() asks for a failure,
+ * OBD_FREE_RTN0() releases the memory, NULLs @ptr and yields 0, so the
+ * accounting in OBD_ALLOC_POST() is skipped.
+ */
+#define __OBD_MALLOC_VERBOSE(ptr, cptab, cpt, size, flags)		      \
+do {									      \
+	(ptr) = (cptab) == NULL ?					      \
+		kmalloc(size, (flags) | __GFP_ZERO) :			      \
+		kmalloc_node(size, (flags) | __GFP_ZERO,		      \
+			     cfs_cpt_spread_node(cptab, cpt));		      \
+	if (likely((ptr) != NULL &&					      \
+		   (!HAS_FAIL_ALLOC_FLAG || obd_alloc_fail_rate == 0 ||       \
+		    !obd_alloc_fail(ptr, #ptr, "km", size,		      \
+				    __FILE__, __LINE__) ||		      \
+		    OBD_FREE_RTN0(ptr)))) {				      \
+		OBD_ALLOC_POST(ptr, size, "kmalloced");			      \
+	}								      \
+} while (0)
+#endif
+
+#define OBD_ALLOC_GFP(ptr, size, gfp_mask)				      \
+	__OBD_MALLOC_VERBOSE(ptr, NULL, 0, size, gfp_mask)
+
+#define OBD_ALLOC(ptr, size) OBD_ALLOC_GFP(ptr, size, __GFP_IO)
+#define OBD_ALLOC_WAIT(ptr, size) OBD_ALLOC_GFP(ptr, size, GFP_IOFS)
+#define OBD_ALLOC_PTR(ptr) OBD_ALLOC(ptr, sizeof *(ptr))
+#define OBD_ALLOC_PTR_WAIT(ptr) OBD_ALLOC_WAIT(ptr, sizeof *(ptr))
+
+#define OBD_CPT_ALLOC_GFP(ptr, cptab, cpt, size, gfp_mask)		      \
+	__OBD_MALLOC_VERBOSE(ptr, cptab, cpt, size, gfp_mask)
+
+#define OBD_CPT_ALLOC(ptr, cptab, cpt, size)				      \
+	OBD_CPT_ALLOC_GFP(ptr, cptab, cpt, size, __GFP_IO)
+
+#define OBD_CPT_ALLOC_PTR(ptr, cptab, cpt)				      \
+	OBD_CPT_ALLOC(ptr, cptab, cpt, sizeof *(ptr))
+
+/*
+ * Zero-filled vmalloc with optional CPT node spreading.
+ *
+ * @ptr receives vzalloc(@size) when @cptab is NULL, otherwise
+ * vzalloc_node() on the node chosen by cfs_cpt_spread_node(@cptab, @cpt).
+ * On failure @ptr is NULL and two error lines are logged (the failed size
+ * and the current Lustre/LNET memory totals); on success the size is
+ * accounted through OBD_ALLOC_POST().
+ *
+ * @cptab is parenthesized in the comparison so that any argument expression
+ * is compared as a whole. The macro name keeps its historical spelling
+ * ("VEROBSE") because the wrappers in this header refer to it.
+ */
+# define __OBD_VMALLOC_VEROBSE(ptr, cptab, cpt, size)			      \
+do {									      \
+	(ptr) = (cptab) == NULL ?					      \
+		vzalloc(size) :						      \
+		vzalloc_node(size, cfs_cpt_spread_node(cptab, cpt));	      \
+	if (unlikely((ptr) == NULL)) {					      \
+		CERROR("vmalloc of '" #ptr "' (%d bytes) failed\n",	      \
+		       (int)(size));					      \
+		CERROR(LPU64" total bytes allocated by Lustre, %d by LNET\n", \
+		       obd_memory_sum(), atomic_read(&libcfs_kmemory));	      \
+	} else {							      \
+		OBD_ALLOC_POST(ptr, size, "vmalloced");			      \
+	}								      \
+} while (0)
+
+# define OBD_VMALLOC(ptr, size)						      \
+	 __OBD_VMALLOC_VEROBSE(ptr, NULL, 0, size)
+# define OBD_CPT_VMALLOC(ptr, cptab, cpt, size)				      \
+	 __OBD_VMALLOC_VEROBSE(ptr, cptab, cpt, size)
+
+
+/* Allocations above this size are considered too big and could not be done
+ * atomically.
+ *
+ * Be very careful when changing this value, especially when decreasing it,
+ * since vmalloc in Linux doesn't perform well on multi-core systems; calling
+ * vmalloc in a critical path would hurt performance badly. See LU-66.
+ */
+#define OBD_ALLOC_BIG (4 * PAGE_CACHE_SIZE)
+
+/*
+ * Allocate @size zeroed bytes into @ptr, using OBD_VMALLOC() when @size is
+ * above OBD_ALLOC_BIG and OBD_ALLOC() otherwise. Release the result with
+ * OBD_FREE_LARGE() and the same @size so the matching free path is taken.
+ *
+ * @size is parenthesized in the comparison so that an argument such as
+ * "a ? b : c" or "x & mask" is compared as a whole. It is still evaluated
+ * more than once, so it must not have side effects.
+ */
+#define OBD_ALLOC_LARGE(ptr, size)					      \
+do {									      \
+	if ((size) > OBD_ALLOC_BIG)					      \
+		OBD_VMALLOC(ptr, size);					      \
+	else								      \
+		OBD_ALLOC(ptr, size);					      \
+} while (0)
+
+/*
+ * CPT-aware variant of the size-switched allocator: uses OBD_CPT_VMALLOC()
+ * when @size is above OBD_ALLOC_BIG and OBD_CPT_ALLOC() otherwise, passing
+ * @cptab and @cpt through unchanged.
+ *
+ * @size is parenthesized in the comparison so that a low-precedence
+ * argument expression is compared as a whole. It is still evaluated more
+ * than once, so it must not have side effects.
+ */
+#define OBD_CPT_ALLOC_LARGE(ptr, cptab, cpt, size)			      \
+do {									      \
+	if ((size) > OBD_ALLOC_BIG)					      \
+		OBD_CPT_VMALLOC(ptr, cptab, cpt, size);			      \
+	else								      \
+		OBD_CPT_ALLOC(ptr, cptab, cpt, size);			      \
+} while (0)
+
+/*
+ * Release memory obtained from the size-switched allocators: OBD_VFREE()
+ * when @size is above OBD_ALLOC_BIG, OBD_FREE() otherwise. @size has to be
+ * the value used at allocation time, since it alone selects the free path.
+ *
+ * @size is parenthesized in the comparison so that a low-precedence
+ * argument expression is compared as a whole. It is still evaluated more
+ * than once, so it must not have side effects.
+ */
+#define OBD_FREE_LARGE(ptr, size)					      \
+do {									      \
+	if ((size) > OBD_ALLOC_BIG)					      \
+		OBD_VFREE(ptr, size);					      \
+	else								      \
+		OBD_FREE(ptr, size);					      \
+} while (0)
+
+
+/*
+ * Poisoning helpers used by the free macros.
+ *
+ * POISON(ptr, c, s) fills @s bytes at @ptr with @c; POISON_PTR(ptr)
+ * overwrites the pointer variable itself with 0xdeadbeef. Both compile to
+ * nothing when CONFIG_DEBUG_SLAB is set.
+ * NOTE(review): presumably because the slab debugger already poisons freed
+ * objects -- confirm.
+ *
+ * The POISON_PTR expansion is fully parenthesized so it behaves as a single
+ * expression wherever it is placed, matching the ((void)0) variant.
+ */
+#ifdef CONFIG_DEBUG_SLAB
+#define POISON(ptr, c, s) do {} while (0)
+#define POISON_PTR(ptr)  ((void)0)
+#else
+#define POISON(ptr, c, s) memset(ptr, c, s)
+#define POISON_PTR(ptr)  ((ptr) = (void *)0xdeadbeef)
+#endif
+
+#ifdef POISON_BULK
+#define POISON_PAGE(page, val) do { memset(kmap(page), val, PAGE_CACHE_SIZE);   \
+				    kunmap(page); } while (0)
+#else
+#define POISON_PAGE(page, val) do { } while (0)
+#endif
+
+/*
+ * Release a kmalloc-backed object: run the OBD_FREE_PRE() book-keeping
+ * (assert, accounting, debug line, poison), kfree() it, then poison the
+ * pointer variable with POISON_PTR().
+ */
+#define OBD_FREE(ptr, size)						   \
+do {									  \
+	OBD_FREE_PRE(ptr, size, "kfreed");				    \
+	kfree(ptr);							\
+	POISON_PTR(ptr);						      \
+} while(0)
+
+
+/*
+ * Defer the release of @ptr until after an RCU grace period.
+ *
+ * The object address and @size are stashed in the portals_handle @handle
+ * (h_cookie / h_size) and class_handle_free_cb is queued with call_rcu() on
+ * the handle's h_rcu head; the pointer variable is then poisoned with
+ * POISON_PTR(). The actual free is done by class_handle_free_cb, which is
+ * not visible in this header.
+ *
+ * @handle is evaluated exactly once: the assertion checks the local copy
+ * __h instead of re-expanding the argument.
+ */
+#define OBD_FREE_RCU(ptr, size, handle)					      \
+do {									      \
+	struct portals_handle *__h = (handle);				      \
+									      \
+	LASSERT(__h != NULL);						      \
+	__h->h_cookie = (unsigned long)(ptr);				      \
+	__h->h_size = (size);						      \
+	call_rcu(&__h->h_rcu, class_handle_free_cb);			      \
+	POISON_PTR(ptr);						      \
+} while (0)
+
+
+/*
+ * Release a vmalloc-backed object: run the OBD_FREE_PRE() book-keeping,
+ * vfree() it, then poison the pointer variable with POISON_PTR().
+ */
+#define OBD_VFREE(ptr, size)				\
+	do {						\
+		OBD_FREE_PRE(ptr, size, "vfreed");	\
+		vfree(ptr);			\
+		POISON_PTR(ptr);			\
+	} while (0)
+
+/* we memset() the slab object to 0 when allocation succeeds, so DO NOT
+ * HAVE A CTOR THAT DOES ANYTHING.  its work will be cleared here.  we'd
+ * love to assert on that, but slab.c keeps kmem_cache_s all to itself. */
+/*
+ * Return @ptr to the cache @slab, reset it to NULL and evaluate to 0. The
+ * constant 0 result lets the macro end an || chain: it undoes the
+ * allocation and makes the whole condition false in one step.
+ */
+#define OBD_SLAB_FREE_RTN0(ptr, slab)					 \
+({									    \
+	kmem_cache_free((slab), (ptr));				    \
+	(ptr) = NULL;							 \
+	0;								    \
+})
+
+/*
+ * Zero-filled slab allocation with optional CPT node spreading and fault
+ * injection.
+ *
+ * @ptr   lvalue that receives the object (NULL on failure)
+ * @slab  kmem cache to allocate from
+ * @cptab CPT table; when NULL kmem_cache_alloc() is used, otherwise
+ *        kmem_cache_alloc_node() on the node chosen by
+ *        cfs_cpt_spread_node()
+ * @cpt   partition handed to cfs_cpt_spread_node()
+ * @size  object size, used only for accounting and fault injection
+ * @type  GFP mask; __GFP_ZERO is always OR-ed in. The argument is
+ *        parenthesized so that a low-precedence expression keeps its
+ *        meaning. Unless it equals GFP_ATOMIC the macro asserts that it is
+ *        not running in interrupt context.
+ *
+ * The allocation counts as successful when @ptr is non-NULL and either
+ * fault injection is off (no fail flag, or a zero obd_alloc_fail_rate) or
+ * obd_alloc_fail() returns 0. If obd_alloc_fail() asks for a failure,
+ * OBD_SLAB_FREE_RTN0() gives the object back, NULLs @ptr and yields 0, so
+ * the accounting in OBD_ALLOC_POST() is skipped.
+ */
+#define __OBD_SLAB_ALLOC_VERBOSE(ptr, slab, cptab, cpt, size, type)	      \
+do {									      \
+	LASSERT(ergo((type) != GFP_ATOMIC, !in_interrupt()));		      \
+	(ptr) = (cptab) == NULL ?					      \
+		kmem_cache_alloc(slab, (type) | __GFP_ZERO) :		      \
+		kmem_cache_alloc_node(slab, (type) | __GFP_ZERO,	      \
+				      cfs_cpt_spread_node(cptab, cpt));	      \
+	if (likely((ptr) != NULL &&					      \
+		   (!HAS_FAIL_ALLOC_FLAG || obd_alloc_fail_rate == 0 ||       \
+		    !obd_alloc_fail(ptr, #ptr, "slab-", size,		      \
+				    __FILE__, __LINE__) ||		      \
+		    OBD_SLAB_FREE_RTN0(ptr, slab)))) {			      \
+		OBD_ALLOC_POST(ptr, size, "slab-alloced");		      \
+	}								      \
+} while (0)
+
+#define OBD_SLAB_ALLOC_GFP(ptr, slab, size, flags)			      \
+	__OBD_SLAB_ALLOC_VERBOSE(ptr, slab, NULL, 0, size, flags)
+#define OBD_SLAB_CPT_ALLOC_GFP(ptr, slab, cptab, cpt, size, flags)	      \
+	__OBD_SLAB_ALLOC_VERBOSE(ptr, slab, cptab, cpt, size, flags)
+
+#define OBD_FREE_PTR(ptr) OBD_FREE(ptr, sizeof *(ptr))
+
+/*
+ * Release a slab object: run the OBD_FREE_PRE() book-keeping, hand the
+ * object back to @slab with kmem_cache_free(), then poison the pointer
+ * variable with POISON_PTR().
+ */
+#define OBD_SLAB_FREE(ptr, slab, size)					\
+do {									  \
+	OBD_FREE_PRE(ptr, size, "slab-freed");				\
+	kmem_cache_free(slab, ptr);					\
+	POISON_PTR(ptr);						      \
+} while(0)
+
+#define OBD_SLAB_ALLOC(ptr, slab, size)					      \
+	OBD_SLAB_ALLOC_GFP(ptr, slab, size, __GFP_IO)
+
+#define OBD_SLAB_CPT_ALLOC(ptr, slab, cptab, cpt, size)			      \
+	OBD_SLAB_CPT_ALLOC_GFP(ptr, slab, cptab, cpt, size, __GFP_IO)
+
+#define OBD_SLAB_ALLOC_PTR(ptr, slab)					      \
+	OBD_SLAB_ALLOC(ptr, slab, sizeof *(ptr))
+
+#define OBD_SLAB_CPT_ALLOC_PTR(ptr, slab, cptab, cpt)			      \
+	OBD_SLAB_CPT_ALLOC(ptr, slab, cptab, cpt, sizeof *(ptr))
+
+#define OBD_SLAB_ALLOC_PTR_GFP(ptr, slab, flags)			      \
+	OBD_SLAB_ALLOC_GFP(ptr, slab, sizeof *(ptr), flags)
+
+#define OBD_SLAB_CPT_ALLOC_PTR_GFP(ptr, slab, cptab, cpt, flags)		      \
+	OBD_SLAB_CPT_ALLOC_GFP(ptr, slab, cptab, cpt, sizeof *(ptr), flags)
+
+#define OBD_SLAB_FREE_PTR(ptr, slab)					      \
+	OBD_SLAB_FREE((ptr), (slab), sizeof *(ptr))
+
+/*
+ * True when the caller's key starts with the string literal @str.
+ *
+ * The macro has no key parameter: it reads the identifiers `key` and
+ * `keylen` from the scope it is expanded in. sizeof(str)-1 is the literal's
+ * length without its NUL; only that many bytes are compared and keylen may
+ * be larger, so this is a prefix match rather than an exact one.
+ */
+#define KEY_IS(str) \
+	(keylen >= (sizeof(str)-1) && memcmp(key, str, (sizeof(str)-1)) == 0)
+
+/*
+ * Wrapper for single page frame allocation.
+ *
+ * @ptr receives one page from alloc_page(@gfp_mask) when @cptab is NULL,
+ * otherwise an order-0 page from alloc_pages_node() on the node chosen by
+ * cfs_cpt_spread_node(@cptab, @cpt). On success the Lustre page counter is
+ * bumped with obd_pages_add(0) and a D_MALLOC line is emitted; on failure
+ * @ptr is NULL and the current Lustre/LNET memory totals are logged.
+ *
+ * The arguments of the second error line follow its format string:
+ * total bytes, total pages, then the page total expressed in bytes.
+ */
+#define __OBD_PAGE_ALLOC_VERBOSE(ptr, cptab, cpt, gfp_mask)		      \
+do {									      \
+	(ptr) = (cptab) == NULL ?					      \
+		alloc_page(gfp_mask) :					      \
+		alloc_pages_node(cfs_cpt_spread_node(cptab, cpt), gfp_mask, 0);\
+	if (unlikely((ptr) == NULL)) {					      \
+		CERROR("alloc_pages of '" #ptr "' %d page(s) / "LPU64" bytes "\
+		       "failed\n", (int)1,				      \
+		       (__u64)(1 << PAGE_CACHE_SHIFT));			      \
+		CERROR(LPU64" total bytes and "LPU64" total pages "	      \
+		       "("LPU64" bytes) allocated by Lustre, "		      \
+		       "%d total bytes by LNET\n",			      \
+		       obd_memory_sum(),				      \
+		       obd_pages_sum(),					      \
+		       obd_pages_sum() << PAGE_CACHE_SHIFT,		      \
+		       atomic_read(&libcfs_kmemory));			      \
+	} else {							      \
+		obd_pages_add(0);					      \
+		CDEBUG(D_MALLOC, "alloc_pages '" #ptr "': %d page(s) / "      \
+		       LPU64" bytes at %p.\n",				      \
+		       (int)1,						      \
+		       (__u64)(1 << PAGE_CACHE_SHIFT), ptr);		      \
+	}								      \
+} while (0)
+
+#define OBD_PAGE_ALLOC(ptr, gfp_mask)					      \
+	__OBD_PAGE_ALLOC_VERBOSE(ptr, NULL, 0, gfp_mask)
+#define OBD_PAGE_CPT_ALLOC(ptr, cptab, cpt, gfp_mask)			      \
+	__OBD_PAGE_ALLOC_VERBOSE(ptr, cptab, cpt, gfp_mask)
+
+/*
+ * Release one page obtained through the OBD page allocation wrappers:
+ * assert @ptr is non-NULL, drop the Lustre page counter with
+ * obd_pages_sub(0), emit a D_MALLOC line, __free_page() it and overwrite
+ * the pointer variable with 0xdeadbeef. Unlike POISON_PTR(), the pointer
+ * is poisoned here regardless of CONFIG_DEBUG_SLAB.
+ */
+#define OBD_PAGE_FREE(ptr)						    \
+do {									  \
+	LASSERT(ptr);							 \
+	obd_pages_sub(0);						     \
+	CDEBUG(D_MALLOC, "free_pages '" #ptr "': %d page(s) / "LPU64" bytes " \
+	       "at %p.\n",						    \
+	       (int)1, (__u64)(1 << PAGE_CACHE_SHIFT),			  \
+	       ptr);							  \
+	__free_page(ptr);						   \
+	(ptr) = (void *)0xdeadbeef;					   \
+} while (0)
+
+#endif

diff --git a/drivers/staging/lustre/lustre/lclient/glimpse.c b/drivers/staging/lustre/lustre/lclient/glimpse.c
new file mode 100644
index 0000000..7f3974b
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lclient/glimpse.c

@@ -0,0 +1,274 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * glimpse code shared between vvp and liblustre (and other Lustre clients in
+ * the future).
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ *   Author: Oleg Drokin <oleg.drokin@sun.com>
+ */
+
+#include <linux/libcfs/libcfs.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <obd.h>
+
+# include <lustre_dlm.h>
+# include <lustre_lite.h>
+# include <lustre_mdc.h>
+# include <linux/pagemap.h>
+# include <linux/file.h>
+
+#include "cl_object.h"
+#include "lclient.h"
+# include "../llite/llite_internal.h"
+
+/*
+ * Lock descriptor template: CLM_READ mode over the extent [0, CL_PAGE_EOF].
+ * Users copy it and then fill in the remaining fields (e.g. cld_obj).
+ */
+static const struct cl_lock_descr whole_file = {
+	.cld_start = 0,
+	.cld_end   = CL_PAGE_EOF,
+	.cld_mode  = CLM_READ
+};
+
+/*
+ * Tell whether a file may still have unwritten pages.
+ *
+ * \retval 1    at least one page is tagged dirty in the page cache, or the
+ *	      file is currently mmap-ed (cob_mmap_cnt > 0)
+ * \retval 0    otherwise
+ */
+blkcnt_t dirty_cnt(struct inode *inode)
+{
+	struct ccc_object *ccc_obj = cl_inode2ccc(inode);
+	void *found[1];
+
+	/* One dirty-tagged page is enough, so look up at most one entry. */
+	if (inode->i_mapping != NULL &&
+	    radix_tree_gang_lookup_tag(&inode->i_mapping->page_tree,
+				       found, 0, 1,
+				       PAGECACHE_TAG_DIRTY) > 0)
+		return 1;
+
+	if (atomic_read(&ccc_obj->cob_mmap_cnt) > 0)
+		return 1;
+
+	return 0;
+}
+
+/*
+ * Issue a glimpse lock request on \a clob and merge the resulting attributes
+ * into \a inode.
+ *
+ * Nothing is done when LLIF_MDS_SIZE_LOCK is set in lli_flags. When
+ * lli_has_smd is not set ("No objects for inode") only cl_merge_lvb() is
+ * called.
+ *
+ * \param agl  non-zero adds CEF_AGL to the enqueue flags; a non-NULL lock is
+ *	     not expected back in that mode (see the LASSERT below)
+ *
+ * \retval 0	 success, or cl_lock_request() returned NULL
+ * \retval other PTR_ERR() of the lock request, or the result of cl_wait()
+ */
+int cl_glimpse_lock(const struct lu_env *env, struct cl_io *io,
+		    struct inode *inode, struct cl_object *clob, int agl)
+{
+	struct cl_lock_descr *descr = &ccc_env_info(env)->cti_descr;
+	struct cl_inode_info *lli   = cl_i2info(inode);
+	const struct lu_fid  *fid   = lu_object_fid(&clob->co_lu);
+	struct ccc_io	*cio   = ccc_env_io(env);
+	struct cl_lock       *lock;
+	int result;
+
+	ENTRY;
+	result = 0;
+	if (!(lli->lli_flags & LLIF_MDS_SIZE_LOCK)) {
+		CDEBUG(D_DLMTRACE, "Glimpsing inode "DFID"\n", PFID(fid));
+		if (lli->lli_has_smd) {
+			/* NOTE: this looks like DLM lock request, but it may
+			 *       not be one. Due to CEF_ASYNC flag (translated
+			 *       to LDLM_FL_HAS_INTENT by osc), this is
+			 *       glimpse request, that won't revoke any
+			 *       conflicting DLM locks held. Instead,
+			 *       ll_glimpse_callback() will be called on each
+			 *       client holding a DLM lock against this file,
+			 *       and resulting size will be returned for each
+			 *       stripe. DLM lock on [0, EOF] is acquired only
+			 *       if there were no conflicting locks. If there
+			 *       were conflicting locks, enqueuing or waiting
+			 *       fails with -ENAVAIL, but valid inode
+			 *       attributes are returned anyway. */
+			*descr = whole_file;
+			descr->cld_obj   = clob;
+			descr->cld_mode  = CLM_PHANTOM;
+			descr->cld_enq_flags = CEF_ASYNC | CEF_MUST;
+			if (agl)
+				descr->cld_enq_flags |= CEF_AGL;
+			/* cui_glimpse is set only around the lock request */
+			cio->cui_glimpse = 1;
+			/*
+			 * CEF_ASYNC is used because glimpse sub-locks cannot
+			 * deadlock (because they never conflict with other
+			 * locks) and, hence, can be enqueued out-of-order.
+			 *
+			 * CEF_MUST protects glimpse lock from conversion into
+			 * a lockless mode.
+			 */
+			lock = cl_lock_request(env, io, descr, "glimpse",
+					       current);
+			cio->cui_glimpse = 0;
+
+			if (lock == NULL)
+				RETURN(0);
+
+			if (IS_ERR(lock))
+				RETURN(PTR_ERR(lock));
+
+			LASSERT(agl == 0);
+			result = cl_wait(env, lock);
+			if (result == 0) {
+				cl_merge_lvb(env, inode);
+				if (cl_isize_read(inode) > 0 &&
+				    inode->i_blocks == 0) {
+					/*
+					 * LU-417: Add dirty pages block count
+					 * lest i_blocks reports 0, some "cp" or
+					 * "tar" may think it's a completely
+					 * sparse file and skip it.
+					 */
+					inode->i_blocks = dirty_cnt(inode);
+				}
+				cl_unuse(env, lock);
+			}
+			/* the lock reference is dropped whether or not
+			 * cl_wait() succeeded */
+			cl_lock_release(env, lock, "glimpse", current);
+		} else {
+			CDEBUG(D_DLMTRACE, "No objects for inode\n");
+			cl_merge_lvb(env, inode);
+		}
+	}
+
+	RETURN(result);
+}
+
+/*
+ * Obtain an environment and the per-thread io for \a inode.
+ *
+ * \retval +1  \a envout and \a ioout are filled in and io->ci_obj is set to
+ *	     the inode's lli_clob
+ * \retval 0   the inode is not a regular file, nothing was acquired
+ * \retval <0  PTR_ERR() of the failed cl_env_get()
+ */
+static int cl_io_get(struct inode *inode, struct lu_env **envout,
+		     struct cl_io **ioout, int *refcheck)
+{
+	struct cl_inode_info *info = cl_i2info(inode);
+	struct cl_object *obj = info->lli_clob;
+	struct lu_env *env;
+	struct cl_io *io;
+
+	if (!S_ISREG(cl_inode_mode(inode)))
+		return 0;
+
+	env = cl_env_get(refcheck);
+	if (IS_ERR(env))
+		return PTR_ERR(env);
+
+	io = ccc_env_thread_io(env);
+	io->ci_obj = obj;
+	*envout = env;
+	*ioout = io;
+	return +1;
+}
+
+/*
+ * Glimpse the size of \a inode through a CIT_MISC io.
+ *
+ * When cl_io_get() returns 0 or an error, that value is returned unchanged.
+ * Otherwise the io is initialised with ci_verify_layout set and
+ * cl_glimpse_lock() is run if cl_io_init() returns 0. The whole sequence is
+ * repeated for as long as io->ci_need_restart is set after cl_io_fini().
+ *
+ * \param agl  passed through to cl_glimpse_lock()
+ */
+int cl_glimpse_size0(struct inode *inode, int agl)
+{
+	/*
+	 * We don't need ast_flags argument to cl_glimpse_size(), because
+	 * osc_lock_enqueue() takes care of the possible deadlock that said
+	 * argument was introduced to avoid.
+	 */
+	/*
+	 * XXX but note that ll_file_seek() passes LDLM_FL_BLOCK_NOWAIT to
+	 * cl_glimpse_size(), which doesn't make sense: glimpse locks are not
+	 * blocking anyway.
+	 */
+	struct lu_env	  *env = NULL;
+	struct cl_io	   *io  = NULL;
+	int		     result;
+	int		     refcheck;
+
+	ENTRY;
+
+	result = cl_io_get(inode, &env, &io, &refcheck);
+	if (result > 0) {
+	again:
+		io->ci_verify_layout = 1;
+		result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
+		if (result > 0)
+			/*
+			 * nothing to do for this io. This currently happens
+			 * when stripe sub-object's are not yet created.
+			 */
+			result = io->ci_result;
+		else if (result == 0)
+			result = cl_glimpse_lock(env, io, inode, io->ci_obj,
+						 agl);
+
+		OBD_FAIL_TIMEOUT(OBD_FAIL_GLIMPSE_DELAY, 2);
+		cl_io_fini(env, io);
+		/* the same env and io are reused for the retry */
+		if (unlikely(io->ci_need_restart))
+			goto again;
+		cl_env_put(env, &refcheck);
+	}
+	RETURN(result);
+}
+
+/*
+ * Refresh the attributes of \a inode from a whole-file lock found by
+ * cl_lock_peek(), without issuing a new lock request.
+ *
+ * \retval 0	    lli_has_smd is not set, the inode is not a regular
+ *		    file, or a lock was found and cl_merge_lvb() was called
+ * \retval -ENODATA cl_lock_peek() found no lock
+ * \retval other    error from cl_io_get(), or io->ci_result when
+ *		    cl_io_init() returns a positive value
+ */
+int cl_local_size(struct inode *inode)
+{
+	struct cl_lock_descr *descr;
+	struct cl_object *clob;
+	struct cl_lock *lock;
+	struct lu_env *env = NULL;
+	struct cl_io *io = NULL;
+	int refcheck;
+	int result;
+
+	ENTRY;
+
+	if (!cl_i2info(inode)->lli_has_smd)
+		RETURN(0);
+
+	result = cl_io_get(inode, &env, &io, &refcheck);
+	if (result <= 0)
+		RETURN(result);
+
+	clob = io->ci_obj;
+	result = cl_io_init(env, io, CIT_MISC, clob);
+	if (result > 0) {
+		result = io->ci_result;
+	} else if (result == 0) {
+		descr = &ccc_env_info(env)->cti_descr;
+		*descr = whole_file;
+		descr->cld_obj = clob;
+
+		lock = cl_lock_peek(env, io, descr, "localsize", current);
+		if (lock == NULL) {
+			result = -ENODATA;
+		} else {
+			cl_merge_lvb(env, inode);
+			cl_unuse(env, lock);
+			cl_lock_release(env, lock, "localsize", current);
+			result = 0;
+		}
+	}
+	cl_io_fini(env, io);
+	cl_env_put(env, &refcheck);
+	RETURN(result);
+}

diff --git a/drivers/staging/lustre/lustre/lclient/lcommon_cl.c b/drivers/staging/lustre/lustre/lclient/lcommon_cl.c
new file mode 100644
index 0000000..4a01666
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lclient/lcommon_cl.c

@@ -0,0 +1,1325 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * cl code shared between vvp and liblustre (and other Lustre clients in the
+ * future).
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+# include <linux/libcfs/libcfs.h>
+# include <linux/fs.h>
+# include <linux/sched.h>
+# include <linux/mm.h>
+# include <linux/quotaops.h>
+# include <linux/highmem.h>
+# include <linux/pagemap.h>
+# include <linux/rbtree.h>
+
+#include <obd.h>
+#include <obd_support.h>
+#include <lustre_fid.h>
+#include <lustre_lite.h>
+#include <lustre_dlm.h>
+#include <lustre_ver.h>
+#include <lustre_mdc.h>
+#include <cl_object.h>
+
+#include <lclient.h>
+
+#include "../llite/llite_internal.h"
+
+const struct cl_req_operations ccc_req_ops;
+
+/*
+ * ccc_ prefix stands for "Common Client Code".
+ */
+
+static struct kmem_cache *ccc_lock_kmem;
+static struct kmem_cache *ccc_object_kmem;
+static struct kmem_cache *ccc_thread_kmem;
+static struct kmem_cache *ccc_session_kmem;
+static struct kmem_cache *ccc_req_kmem;
+
+/*
+ * Descriptors of the slab caches used by this file, one per ccc_*_kmem
+ * pointer. The table is handed to lu_kmem_init() / lu_kmem_fini().
+ * NOTE(review): the final entry with a NULL ckd_cache is presumably the
+ * list terminator expected by those helpers -- confirm.
+ */
+static struct lu_kmem_descr ccc_caches[] = {
+	{
+		.ckd_cache = &ccc_lock_kmem,
+		.ckd_name  = "ccc_lock_kmem",
+		.ckd_size  = sizeof (struct ccc_lock)
+	},
+	{
+		.ckd_cache = &ccc_object_kmem,
+		.ckd_name  = "ccc_object_kmem",
+		.ckd_size  = sizeof (struct ccc_object)
+	},
+	{
+		.ckd_cache = &ccc_thread_kmem,
+		.ckd_name  = "ccc_thread_kmem",
+		.ckd_size  = sizeof (struct ccc_thread_info),
+	},
+	{
+		.ckd_cache = &ccc_session_kmem,
+		.ckd_name  = "ccc_session_kmem",
+		.ckd_size  = sizeof (struct ccc_session)
+	},
+	{
+		.ckd_cache = &ccc_req_kmem,
+		.ckd_name  = "ccc_req_kmem",
+		.ckd_size  = sizeof (struct ccc_req)
+	},
+	{
+		.ckd_cache = NULL
+	}
+};
+
+/*****************************************************************************
+ *
+ * Vvp device and device type functions.
+ *
+ */
+
+/*
+ * lct_init hook of ccc_key: allocate a ccc_thread_info from ccc_thread_kmem.
+ *
+ * \retval the new object, or ERR_PTR(-ENOMEM) if the allocation fails
+ */
+void *ccc_key_init(const struct lu_context *ctx,
+			  struct lu_context_key *key)
+{
+	struct ccc_thread_info *thread_info;
+
+	OBD_SLAB_ALLOC_PTR_GFP(thread_info, ccc_thread_kmem, __GFP_IO);
+	if (thread_info != NULL)
+		return thread_info;
+
+	return ERR_PTR(-ENOMEM);
+}
+
+/*
+ * lct_fini hook of ccc_key: return the ccc_thread_info in \a data to
+ * ccc_thread_kmem.
+ */
+void ccc_key_fini(const struct lu_context *ctx,
+			 struct lu_context_key *key, void *data)
+{
+	struct ccc_thread_info *info = data;
+	OBD_SLAB_FREE_PTR(info, ccc_thread_kmem);
+}
+
+/*
+ * lct_init hook of ccc_session_key: allocate a ccc_session from
+ * ccc_session_kmem.
+ *
+ * \retval the new object, or ERR_PTR(-ENOMEM) if the allocation fails
+ */
+void *ccc_session_key_init(const struct lu_context *ctx,
+				  struct lu_context_key *key)
+{
+	struct ccc_session *ses;
+
+	OBD_SLAB_ALLOC_PTR_GFP(ses, ccc_session_kmem, __GFP_IO);
+	if (ses != NULL)
+		return ses;
+
+	return ERR_PTR(-ENOMEM);
+}
+
+/*
+ * lct_fini hook of ccc_session_key: return the ccc_session in \a data to
+ * ccc_session_kmem.
+ */
+void ccc_session_key_fini(const struct lu_context *ctx,
+				 struct lu_context_key *key, void *data)
+{
+	struct ccc_session *session = data;
+	OBD_SLAB_FREE_PTR(session, ccc_session_kmem);
+}
+
+/* Context key tagged LCT_CL_THREAD whose value is a ccc_thread_info. */
+struct lu_context_key ccc_key = {
+	.lct_tags = LCT_CL_THREAD,
+	.lct_init = ccc_key_init,
+	.lct_fini = ccc_key_fini
+};
+
+/* Context key tagged LCT_SESSION whose value is a ccc_session. */
+struct lu_context_key ccc_session_key = {
+	.lct_tags = LCT_SESSION,
+	.lct_init = ccc_session_key_init,
+	.lct_fini = ccc_session_key_fini
+};
+
+
+/* type constructor/destructor: ccc_type_{init,fini,start,stop}(). */
+// LU_TYPE_INIT_FINI(ccc, &ccc_key, &ccc_session_key);
+
+/*
+ * Stack \a d on top of \a next: record \a next as cdv_next, share d's site
+ * with it and run next's ldto_device_init() method. On success a reference
+ * on \a next is taken and registered under "lu-stack".
+ *
+ * \retval result of next's ldto_device_init()
+ */
+int ccc_device_init(const struct lu_env *env, struct lu_device *d,
+			   const char *name, struct lu_device *next)
+{
+	struct ccc_device *ccc_dev;
+	int rc;
+	ENTRY;
+
+	ccc_dev = lu2ccc_dev(d);
+	ccc_dev->cdv_next = lu2cl_dev(next);
+
+	LASSERT(d->ld_site != NULL && next->ld_type != NULL);
+	next->ld_site = d->ld_site;
+
+	rc = next->ld_type->ldt_ops->ldto_device_init(env, next,
+						      next->ld_type->ldt_name,
+						      NULL);
+	if (rc != 0)
+		RETURN(rc);
+
+	lu_device_get(next);
+	lu_ref_add(&next->ld_reference, "lu-stack", &lu_site_init);
+	RETURN(rc);
+}
+
+/* Return the lu_device stacked below \a d (its cdv_next); nothing is freed. */
+struct lu_device *ccc_device_fini(const struct lu_env *env,
+					 struct lu_device *d)
+{
+	return cl2lu_dev(lu2ccc_dev(d)->cdv_next);
+}
+
+/*
+ * Allocate a ccc_device together with its cl_site.
+ *
+ * The device is initialised with cl_device_init() and gets \a luops and
+ * \a clops installed; the site is set up with cl_site_init() followed by
+ * lu_site_init_finish().
+ *
+ * \retval the embedded lu_device on success
+ * \retval ERR_PTR(-ENOMEM) if the device or the site cannot be allocated
+ * \retval ERR_PTR(rc) if site initialisation fails; the device has already
+ *	   been released through ccc_device_free() in that case
+ */
+struct lu_device *ccc_device_alloc(const struct lu_env *env,
+				   struct lu_device_type *t,
+				   struct lustre_cfg *cfg,
+				   const struct lu_device_operations *luops,
+				   const struct cl_device_operations *clops)
+{
+	struct ccc_device *vdv;
+	struct lu_device  *lud;
+	struct cl_site    *site;
+	int rc;
+	ENTRY;
+
+	OBD_ALLOC_PTR(vdv);
+	if (vdv == NULL)
+		RETURN(ERR_PTR(-ENOMEM));
+
+	lud = &vdv->cdv_cl.cd_lu_dev;
+	cl_device_init(&vdv->cdv_cl, t);
+	ccc2lu_dev(vdv)->ld_ops = luops;
+	vdv->cdv_cl.cd_ops = clops;
+
+	OBD_ALLOC_PTR(site);
+	if (site != NULL) {
+		rc = cl_site_init(site, &vdv->cdv_cl);
+		if (rc == 0)
+			rc = lu_site_init_finish(&site->cs_lu);
+		else {
+			/* site was never attached to the device, so it is
+			 * freed here rather than by ccc_device_free() */
+			LASSERT(lud->ld_site == NULL);
+			CERROR("Cannot init lu_site, rc %d.\n", rc);
+			OBD_FREE_PTR(site);
+		}
+	} else
+		rc = -ENOMEM;
+	if (rc != 0) {
+		ccc_device_free(env, lud);
+		lud = ERR_PTR(rc);
+	}
+	RETURN(lud);
+}
+
+/*
+ * Tear down and free the ccc_device embedding \a d. If a site is attached
+ * (d->ld_site != NULL) it is finalised and freed first.
+ *
+ * \retval the lu_device that was stacked below \a d (taken from cdv_next
+ *	   before the device memory is released)
+ */
+struct lu_device *ccc_device_free(const struct lu_env *env,
+					 struct lu_device *d)
+{
+	struct ccc_device *vdv  = lu2ccc_dev(d);
+	struct cl_site    *site = lu2cl_site(d->ld_site);
+	struct lu_device  *next = cl2lu_dev(vdv->cdv_next);
+
+	if (d->ld_site != NULL) {
+		cl_site_fini(site);
+		OBD_FREE_PTR(site);
+	}
+	cl_device_fini(lu2cl_dev(d));
+	OBD_FREE_PTR(vdv);
+	return next;
+}
+
+/*
+ * Allocate the ccc slice of \a req from ccc_req_kmem and attach it with
+ * ccc_req_ops.
+ *
+ * \retval 0 on success, -ENOMEM if the slice cannot be allocated
+ */
+int ccc_req_init(const struct lu_env *env, struct cl_device *dev,
+			struct cl_req *req)
+{
+	struct ccc_req *slice;
+
+	OBD_SLAB_ALLOC_PTR_GFP(slice, ccc_req_kmem, __GFP_IO);
+	if (slice == NULL)
+		return -ENOMEM;
+
+	cl_req_slice_add(req, &slice->crq_cl, dev, &ccc_req_ops);
+	return 0;
+}
+
+/**
+ * An `emergency' environment used by ccc_inode_fini() when cl_env_get()
+ * fails. Access to this environment is serialized by ccc_inode_fini_guard
+ * mutex.
+ */
+static struct lu_env *ccc_inode_fini_env = NULL;
+
+/**
+ * A mutex serializing calls to ccc_inode_fini() under extreme memory
+ * pressure, when environments cannot be allocated.
+ */
+static DEFINE_MUTEX(ccc_inode_fini_guard);
+static int dummy_refcheck;
+
+/*
+ * One-time setup for the common client code: create the slab caches listed
+ * in ccc_caches, initialise \a device_type and allocate the emergency
+ * environment ccc_inode_fini_env.
+ *
+ * Each step is undone in reverse order if a later one fails.
+ *
+ * \retval 0 on success, otherwise the error of the failing step
+ */
+int ccc_global_init(struct lu_device_type *device_type)
+{
+	int result;
+
+	result = lu_kmem_init(ccc_caches);
+	if (result)
+		return result;
+
+	result = lu_device_type_init(device_type);
+	if (result)
+		goto out_kmem;
+
+	ccc_inode_fini_env = cl_env_alloc(&dummy_refcheck,
+					  LCT_REMEMBER|LCT_NOREF);
+	if (IS_ERR(ccc_inode_fini_env)) {
+		result = PTR_ERR(ccc_inode_fini_env);
+		/*
+		 * Do not leave an error pointer in the global:
+		 * ccc_global_fini() only tests it against NULL.
+		 */
+		ccc_inode_fini_env = NULL;
+		goto out_device;
+	}
+
+	ccc_inode_fini_env->le_ctx.lc_cookie = 0x4;
+	return 0;
+out_device:
+	lu_device_type_fini(device_type);
+out_kmem:
+	lu_kmem_fini(ccc_caches);
+	return result;
+}
+
+/*
+ * Release the emergency environment (if one is set), then finalise
+ * \a device_type and destroy the slab caches listed in ccc_caches.
+ */
+void ccc_global_fini(struct lu_device_type *device_type)
+{
+	if (ccc_inode_fini_env != NULL) {
+		cl_env_put(ccc_inode_fini_env, &dummy_refcheck);
+		ccc_inode_fini_env = NULL;
+	}
+	lu_device_type_fini(device_type);
+	lu_kmem_fini(ccc_caches);
+}
+
+/*****************************************************************************
+ *
+ * Object operations.
+ *
+ */
+
+/*
+ * Allocate a ccc_object from ccc_object_kmem, initialise its embedded object
+ * header, add the object on top of that header and install \a clops and
+ * \a luops.
+ *
+ * \retval the lu_object of the new ccc_object, or NULL if the allocation
+ *	   fails
+ */
+struct lu_object *ccc_object_alloc(const struct lu_env *env,
+				   const struct lu_object_header *unused,
+				   struct lu_device *dev,
+				   const struct cl_object_operations *clops,
+				   const struct lu_object_operations *luops)
+{
+	struct cl_object_header *header;
+	struct ccc_object *ccc_obj;
+	struct lu_object *top;
+
+	OBD_SLAB_ALLOC_PTR_GFP(ccc_obj, ccc_object_kmem, __GFP_IO);
+	if (ccc_obj == NULL)
+		return NULL;
+
+	top = ccc2lu(ccc_obj);
+	header = &ccc_obj->cob_header;
+	cl_object_header_init(header);
+	lu_object_init(top, &header->coh_lu, dev);
+	lu_object_add_top(&header->coh_lu, top);
+
+	ccc_obj->cob_cl.co_ops = clops;
+	top->lo_ops = luops;
+	return top;
+}
+
+/*
+ * Bind \a vob to the inode given in \a conf, reset its transient page count
+ * and register the ccc_page slice size with the cl_object. Always returns 0.
+ */
+int ccc_object_init0(const struct lu_env *env,
+			    struct ccc_object *vob,
+			    const struct cl_object_conf *conf)
+{
+	vob->cob_inode = conf->coc_inode;
+	vob->cob_transient_pages = 0;
+	cl_object_page_init(&vob->cob_cl, sizeof(struct ccc_page));
+	return 0;
+}
+
+/*
+ * Initialise \a obj: ask the device below (cdv_next) to allocate its own
+ * object for the same header, link it under \a obj and finish with
+ * ccc_object_init0().
+ *
+ * \retval -ENOMEM if the lower object cannot be allocated, otherwise the
+ *	   result of ccc_object_init0()
+ */
+int ccc_object_init(const struct lu_env *env, struct lu_object *obj,
+			   const struct lu_object_conf *conf)
+{
+	struct ccc_device *ccc_dev = lu2ccc_dev(obj->lo_dev);
+	struct ccc_object *ccc_obj = lu2ccc(obj);
+	struct lu_device *lower_dev = &ccc_dev->cdv_next->cd_lu_dev;
+	const struct cl_object_conf *cl_conf;
+	struct lu_object *lower;
+
+	lower = lower_dev->ld_ops->ldo_object_alloc(env, obj->lo_header,
+						    lower_dev);
+	if (lower == NULL)
+		return -ENOMEM;
+
+	cl_conf = lu2cl_conf(conf);
+	INIT_LIST_HEAD(&ccc_obj->cob_pending_list);
+	lu_object_add(obj, lower);
+	return ccc_object_init0(env, ccc_obj, cl_conf);
+}
+
+/*
+ * Finalise \a obj and its header, then return the enclosing ccc_object to
+ * ccc_object_kmem.
+ */
+void ccc_object_free(const struct lu_env *env, struct lu_object *obj)
+{
+	struct ccc_object *vob = lu2ccc(obj);
+
+	lu_object_fini(obj);
+	lu_object_header_fini(obj->lo_header);
+	OBD_SLAB_FREE_PTR(vob, ccc_object_kmem);
+}
+
+/*
+ * Allocate the ccc slice of \a lock from ccc_lock_kmem and attach it with
+ * the operations in \a lkops.
+ *
+ * \retval 0 on success, -ENOMEM if the slice cannot be allocated
+ */
+int ccc_lock_init(const struct lu_env *env,
+		  struct cl_object *obj, struct cl_lock *lock,
+		  const struct cl_io *unused,
+		  const struct cl_lock_operations *lkops)
+{
+	struct ccc_lock *slice;
+
+	CLOBINVRNT(env, obj, ccc_object_invariant(obj));
+
+	OBD_SLAB_ALLOC_PTR_GFP(slice, ccc_lock_kmem, __GFP_IO);
+	if (slice == NULL)
+		return -ENOMEM;
+
+	cl_lock_slice_add(lock, &slice->clk_cl, obj, lkops);
+	return 0;
+}
+
+/* Attribute update hook of the ccc layer: does nothing and returns 0. */
+int ccc_attr_set(const struct lu_env *env, struct cl_object *obj,
+		 const struct cl_attr *attr, unsigned valid)
+{
+	return 0;
+}
+
+/*
+ * Fill the time fields of \a lvb from the inode behind \a obj. lvb_size and
+ * lvb_blocks are read as supplied by the caller; lvb_blocks is replaced by
+ * dirty_cnt() only when it is 0 while lvb_size is positive. Always returns 0.
+ */
+int ccc_object_glimpse(const struct lu_env *env,
+		       const struct cl_object *obj, struct ost_lvb *lvb)
+{
+	struct inode *inode = ccc_object_inode(obj);
+
+	ENTRY;
+	lvb->lvb_mtime = cl_inode_mtime(inode);
+	lvb->lvb_atime = cl_inode_atime(inode);
+	lvb->lvb_ctime = cl_inode_ctime(inode);
+	/*
+	 * LU-417: Add dirty pages block count lest i_blocks reports 0, some
+	 * "cp" or "tar" on remote node may think it's a completely sparse file
+	 * and skip it.
+	 */
+	if (lvb->lvb_size > 0 && lvb->lvb_blocks == 0)
+		lvb->lvb_blocks = dirty_cnt(inode);
+	RETURN(0);
+}
+
+
+
+/* Configuration hook of the ccc layer: currently a stub that returns 0. */
+int ccc_conf_set(const struct lu_env *env, struct cl_object *obj,
+			const struct cl_object_conf *conf)
+{
+	/* TODO: destroy all pages attached to this object. */
+	return 0;
+}
+
+/*
+ * Take the inode size lock first, then the object attribute lock.
+ * ccc_object_size_unlock() releases them in the opposite order.
+ */
+static void ccc_object_size_lock(struct cl_object *obj)
+{
+	struct inode *inode = ccc_object_inode(obj);
+
+	cl_isize_lock(inode);
+	cl_object_attr_lock(obj);
+}
+
+/*
+ * Counterpart of ccc_object_size_lock(): drop the object attribute lock,
+ * then the inode size lock.
+ */
+static void ccc_object_size_unlock(struct cl_object *obj)
+{
+	struct inode *inode = ccc_object_inode(obj);
+
+	cl_object_attr_unlock(obj);
+	cl_isize_unlock(inode);
+}
+
+/*****************************************************************************
+ *
+ * Page operations.
+ *
+ */
+
+/* Return the VM page behind \a slice, as given by cl2vm_page(). */
+struct page *ccc_page_vmpage(const struct lu_env *env,
+			    const struct cl_page_slice *slice)
+{
+	return cl2vm_page(slice);
+}
+
+/*
+ * Check whether the page of \a slice is covered by a lock of \a io.
+ *
+ * Only CIT_READ, CIT_WRITE and CIT_FAULT io is examined; any other io type
+ * yields 0.
+ *
+ * \retval -EBUSY the file is group-locked (LL_FILE_GROUP_LOCKED), or
+ *		  cl_queue_match() finds an entry in io->ci_lockset.cls_done
+ *		  for a CLM_READ descriptor covering the page index
+ * \retval 0	  otherwise
+ *
+ * NOTE(review): cio->cui_fd is dereferenced without a NULL check here, while
+ * ccc_io_one_lock_index() tests it first -- confirm it cannot be NULL for
+ * these io types.
+ */
+int ccc_page_is_under_lock(const struct lu_env *env,
+			   const struct cl_page_slice *slice,
+			   struct cl_io *io)
+{
+	struct ccc_io	*cio  = ccc_env_io(env);
+	struct cl_lock_descr *desc = &ccc_env_info(env)->cti_descr;
+	struct cl_page       *page = slice->cpl_page;
+
+	int result;
+
+	ENTRY;
+
+	if (io->ci_type == CIT_READ || io->ci_type == CIT_WRITE ||
+	    io->ci_type == CIT_FAULT) {
+		if (cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)
+			result = -EBUSY;
+		else {
+			/* single-page extent [cp_index, cp_index] */
+			desc->cld_start = page->cp_index;
+			desc->cld_end   = page->cp_index;
+			desc->cld_obj   = page->cp_obj;
+			desc->cld_mode  = CLM_READ;
+			result = cl_queue_match(&io->ci_lockset.cls_done,
+						desc) ? -EBUSY : 0;
+		}
+	} else
+		result = 0;
+	RETURN(result);
+}
+
+/*
+ * Page operation that must never be reached: it hits LBUG() unconditionally.
+ * The trailing return only satisfies the function's int signature.
+ */
+int ccc_fail(const struct lu_env *env, const struct cl_page_slice *slice)
+{
+	/*
+	 * Cached read?
+	 */
+	LBUG();
+	return 0;
+}
+
+/* Verification hook for transient pages; intentionally empty here. */
+void ccc_transient_page_verify(const struct cl_page *page)
+{
+}
+
+/*
+ * "own" operation for transient pages: runs the verification hook and
+ * returns 0. \a unused and \a nonblock are ignored.
+ */
+int ccc_transient_page_own(const struct lu_env *env,
+				   const struct cl_page_slice *slice,
+				   struct cl_io *unused,
+				   int nonblock)
+{
+	ccc_transient_page_verify(slice->cpl_page);
+	return 0;
+}
+
+/* "assume" operation for transient pages: only runs the verification hook. */
+void ccc_transient_page_assume(const struct lu_env *env,
+				      const struct cl_page_slice *slice,
+				      struct cl_io *unused)
+{
+	ccc_transient_page_verify(slice->cpl_page);
+}
+
+/* "unassume" operation for transient pages: only runs the verification hook. */
+void ccc_transient_page_unassume(const struct lu_env *env,
+					const struct cl_page_slice *slice,
+					struct cl_io *unused)
+{
+	ccc_transient_page_verify(slice->cpl_page);
+}
+
+/* "disown" operation for transient pages: only runs the verification hook. */
+void ccc_transient_page_disown(const struct lu_env *env,
+				      const struct cl_page_slice *slice,
+				      struct cl_io *unused)
+{
+	ccc_transient_page_verify(slice->cpl_page);
+}
+
+/*
+ * "discard" operation for transient pages: runs the verification hook and
+ * then deletes the page with cl_page_delete().
+ */
+void ccc_transient_page_discard(const struct lu_env *env,
+				       const struct cl_page_slice *slice,
+				       struct cl_io *unused)
+{
+	struct cl_page *page = slice->cpl_page;
+
+	ccc_transient_page_verify(slice->cpl_page);
+
+	/*
+	 * For transient pages, remove it from the radix tree.
+	 */
+	cl_page_delete(env, page);
+}
+
+/* "prep" operation for transient pages: unconditionally returns 0. */
+int ccc_transient_page_prep(const struct lu_env *env,
+				   const struct cl_page_slice *slice,
+				   struct cl_io *unused)
+{
+	ENTRY;
+	/* transient page should always be sent. */
+	RETURN(0);
+}
+
+/*****************************************************************************
+ *
+ * Lock operations.
+ *
+ */
+
+/* Lock delete hook: only checks the object invariant of the slice's object. */
+void ccc_lock_delete(const struct lu_env *env,
+		     const struct cl_lock_slice *slice)
+{
+	CLOBINVRNT(env, slice->cls_obj, ccc_object_invariant(slice->cls_obj));
+}
+
+/* Return the ccc_lock that embeds \a slice to ccc_lock_kmem. */
+void ccc_lock_fini(const struct lu_env *env, struct cl_lock_slice *slice)
+{
+	struct ccc_lock *clk = cl2ccc_lock(slice);
+	OBD_SLAB_FREE_PTR(clk, ccc_lock_kmem);
+}
+
+/*
+ * Lock enqueue hook: checks the object invariant and returns 0; \a unused
+ * and \a enqflags are not consulted.
+ */
+int ccc_lock_enqueue(const struct lu_env *env,
+		     const struct cl_lock_slice *slice,
+		     struct cl_io *unused, __u32 enqflags)
+{
+	CLOBINVRNT(env, slice->cls_obj, ccc_object_invariant(slice->cls_obj));
+	return 0;
+}
+
+/* Lock unuse hook: checks the object invariant and returns 0. */
+int ccc_lock_unuse(const struct lu_env *env, const struct cl_lock_slice *slice)
+{
+	CLOBINVRNT(env, slice->cls_obj, ccc_object_invariant(slice->cls_obj));
+	return 0;
+}
+
+/* Lock wait hook: checks the object invariant and returns 0. */
+int ccc_lock_wait(const struct lu_env *env, const struct cl_lock_slice *slice)
+{
+	CLOBINVRNT(env, slice->cls_obj, ccc_object_invariant(slice->cls_obj));
+	return 0;
+}
+
+/**
+ * Implementation of cl_lock_operations::clo_fits_into() methods for ccc
+ * layer. This function is executed every time io finds an existing lock in
+ * the lock cache while creating new lock. This function has to decide whether
+ * cached lock "fits" into io.
+ *
+ * \param slice lock to be checked
+ * \param need  description of the lock that is wanted
+ * \param io    IO that wants a lock.
+ *
+ * \retval 1    the cached lock may be used
+ * \retval 0    the cached lock must not be used
+ *
+ * \see lov_lock_fits_into().
+ */
+int ccc_lock_fits_into(const struct lu_env *env,
+		       const struct cl_lock_slice *slice,
+		       const struct cl_lock_descr *need,
+		       const struct cl_io *io)
+{
+	const struct cl_lock       *lock  = slice->cls_lock;
+	const struct cl_lock_descr *descr = &lock->cll_descr;
+	const struct ccc_io	*cio   = ccc_env_io(env);
+	int			 result;
+
+	ENTRY;
+	/*
+	 * Work around DLM peculiarity: it assumes that glimpse
+	 * (LDLM_FL_HAS_INTENT) lock is always LCK_PR, and returns read lock
+	 * when asked for LCK_PW lock with LDLM_FL_HAS_INTENT flag set. Make
+	 * sure that glimpse doesn't get CLM_WRITE top-lock, so that it
+	 * doesn't enqueue CLM_WRITE sub-locks.
+	 */
+	if (cio->cui_glimpse)
+		result = descr->cld_mode != CLM_WRITE;
+
+	/*
+	 * Also, don't match incomplete write locks for read, otherwise read
+	 * would enqueue missing sub-locks in the write mode.
+	 */
+	else if (need->cld_mode != descr->cld_mode)
+		result = lock->cll_state >= CLS_ENQUEUED;
+	else
+		result = 1;
+	RETURN(result);
+}
+
+/**
+ * Implements cl_lock_operations::clo_state() method for ccc layer, invoked
+ * whenever lock state changes. Transfers object attributes, that might be
+ * updated as a result of lock acquiring into inode.
+ *
+ * Attributes are merged only for a transition into CLS_HELD from a lower
+ * state, and only for a lock whose extent is [0, CL_PAGE_EOF].
+ */
+void ccc_lock_state(const struct lu_env *env,
+		    const struct cl_lock_slice *slice,
+		    enum cl_lock_state state)
+{
+	struct cl_lock *lock = slice->cls_lock;
+	ENTRY;
+
+	/*
+	 * Refresh inode attributes when the lock is moving into CLS_HELD
+	 * state, and only when this is a result of real enqueue, rather than
+	 * of finding lock in the cache.
+	 */
+	if (state == CLS_HELD && lock->cll_state < CLS_HELD) {
+		struct cl_object *obj;
+		struct inode     *inode;
+
+		obj   = slice->cls_obj;
+		inode = ccc_object_inode(obj);
+
+		/* vmtruncate() sets the i_size
+		 * under both a DLM lock and the
+		 * ll_inode_size_lock().  If we don't get the
+		 * ll_inode_size_lock() here we can match the DLM lock and
+		 * reset i_size.  generic_file_write can then trust the
+		 * stale i_size when doing appending writes and effectively
+		 * cancel the result of the truncate.  Getting the
+		 * ll_inode_size_lock() after the enqueue maintains the DLM
+		 * -> ll_inode_size_lock() acquiring order. */
+		if (lock->cll_descr.cld_start == 0 &&
+		    lock->cll_descr.cld_end == CL_PAGE_EOF)
+			cl_merge_lvb(env, inode);
+	}
+	EXIT;
+}
+
+/*****************************************************************************
+ *
+ * io operations.
+ *
+ */
+
+/* io fini hook: only checks the object invariant of the io's object. */
+void ccc_io_fini(const struct lu_env *env, const struct cl_io_slice *ios)
+{
+	struct cl_io *io = ios->cis_io;
+
+	CLOBINVRNT(env, io->ci_obj, ccc_object_invariant(io->ci_obj));
+}
+
+/*
+ * Queue a single lock covering pages [\a start, \a end] on \a io, using the
+ * lock link embedded in the per-environment ccc_io (cui_link).
+ *
+ * The requested \a mode is overridden with CLM_GROUP (and the group id taken
+ * from the file descriptor) when the file is group-locked. Always returns 0;
+ * the return value of cl_io_lock_add() is not checked.
+ */
+int ccc_io_one_lock_index(const struct lu_env *env, struct cl_io *io,
+			  __u32 enqflags, enum cl_lock_mode mode,
+			  pgoff_t start, pgoff_t end)
+{
+	struct ccc_io	  *cio   = ccc_env_io(env);
+	struct cl_lock_descr   *descr = &cio->cui_link.cill_descr;
+	struct cl_object       *obj   = io->ci_obj;
+
+	CLOBINVRNT(env, obj, ccc_object_invariant(obj));
+	ENTRY;
+
+	CDEBUG(D_VFSTRACE, "lock: %d [%lu, %lu]\n", mode, start, end);
+
+	/* descr points into cui_link, so this also clears the descriptor */
+	memset(&cio->cui_link, 0, sizeof cio->cui_link);
+
+	if (cio->cui_fd && (cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
+		descr->cld_mode = CLM_GROUP;
+		descr->cld_gid  = cio->cui_fd->fd_grouplock.cg_gid;
+	} else {
+		descr->cld_mode  = mode;
+	}
+	descr->cld_obj   = obj;
+	descr->cld_start = start;
+	descr->cld_end   = end;
+	descr->cld_enq_flags = enqflags;
+
+	cl_io_lock_add(env, io, &cio->cui_link);
+	RETURN(0);
+}
+
+/*
+ * Limit the iovec of \a cio to the io->u.ci_rw.crw_count bytes of the
+ * current chunk.
+ *
+ * Segments are walked until the chunk size is covered. If the chunk ends
+ * inside a segment, that segment's original length is saved in cui_iov_olen
+ * and its iov_len is shortened. cui_nrsegs is set to the number of segments
+ * used. For io that is not "normal" (cl_is_normalio()) or has no segments,
+ * only cui_iov_olen is cleared.
+ */
+void ccc_io_update_iov(const struct lu_env *env,
+		       struct ccc_io *cio, struct cl_io *io)
+{
+	int i;
+	size_t size = io->u.ci_rw.crw_count;
+
+	cio->cui_iov_olen = 0;
+	if (!cl_is_normalio(env, io) || cio->cui_tot_nrsegs == 0)
+		return;
+
+	for (i = 0; i < cio->cui_tot_nrsegs; i++) {
+		struct iovec *iv = &cio->cui_iov[i];
+
+		if (iv->iov_len < size)
+			size -= iv->iov_len;
+		else {
+			if (iv->iov_len > size) {
+				cio->cui_iov_olen = iv->iov_len;
+				iv->iov_len = size;
+			}
+			break;
+		}
+	}
+
+	/* if the loop ran to completion, i + 1 exceeds cui_tot_nrsegs and
+	 * the assertion below fires */
+	cio->cui_nrsegs = i + 1;
+	LASSERTF(cio->cui_tot_nrsegs >= cio->cui_nrsegs,
+		 "tot_nrsegs: %lu, nrsegs: %lu\n",
+		 cio->cui_tot_nrsegs, cio->cui_nrsegs);
+}
+
+/*
+ * Byte-offset front end of ccc_io_one_lock_index(): \a start and \a end are
+ * converted to page indices with cl_index() before the lock is queued.
+ */
+int ccc_io_one_lock(const struct lu_env *env, struct cl_io *io,
+		    __u32 enqflags, enum cl_lock_mode mode,
+		    loff_t start, loff_t end)
+{
+	struct cl_object *obj = io->ci_obj;
+	return ccc_io_one_lock_index(env, io, enqflags, mode,
+				     cl_index(obj, start), cl_index(obj, end));
+}
+
+/* io end hook: only checks the object invariant of the io's object. */
+void ccc_io_end(const struct lu_env *env, const struct cl_io_slice *ios)
+{
+	CLOBINVRNT(env, ios->cis_io->ci_obj,
+		   ccc_object_invariant(ios->cis_io->ci_obj));
+}
+
+/*
+ * Move the iovec of the ccc_io past the \a nob bytes that were just
+ * processed. Does nothing for io that is not "normal" (cl_is_normalio()).
+ *
+ * The segments used by the finished chunk (cui_nrsegs) are skipped and the
+ * totals reduced. If the last segment had been shortened (cui_iov_olen > 0,
+ * as set by ccc_io_update_iov()), the cursor steps back onto it and the
+ * segment is either advanced past the consumed part (io->ci_continue) or
+ * restored to its original length.
+ */
+void ccc_io_advance(const struct lu_env *env,
+		    const struct cl_io_slice *ios,
+		    size_t nob)
+{
+	struct ccc_io    *cio = cl2ccc_io(env, ios);
+	struct cl_io     *io  = ios->cis_io;
+	struct cl_object *obj = ios->cis_io->ci_obj;
+
+	CLOBINVRNT(env, obj, ccc_object_invariant(obj));
+
+	if (!cl_is_normalio(env, io))
+		return;
+
+	LASSERT(cio->cui_tot_nrsegs >= cio->cui_nrsegs);
+	LASSERT(cio->cui_tot_count  >= nob);
+
+	cio->cui_iov	+= cio->cui_nrsegs;
+	cio->cui_tot_nrsegs -= cio->cui_nrsegs;
+	cio->cui_tot_count  -= nob;
+
+	/* update the iov */
+	if (cio->cui_iov_olen > 0) {
+		struct iovec *iv;
+
+		cio->cui_iov--;
+		cio->cui_tot_nrsegs++;
+		iv = &cio->cui_iov[0];
+		if (io->ci_continue) {
+			iv->iov_base += iv->iov_len;
+			LASSERT(cio->cui_iov_olen > iv->iov_len);
+			iv->iov_len = cio->cui_iov_olen - iv->iov_len;
+		} else {
+			/* restore the iov_len, in case of restart io. */
+			iv->iov_len = cio->cui_iov_olen;
+		}
+		cio->cui_iov_olen = 0;
+	}
+}
+
+/**
+ * Helper function that if necessary adjusts file size (inode->i_size), when
+ * position at the offset \a pos is accessed. File size can be arbitrary stale
+ * on a Lustre client, but client at least knows KMS. If accessed area is
+ * inside [0, KMS], set file size to KMS, otherwise glimpse file size.
+ *
+ * Locking: cl_isize_lock is used to serialize changes to inode size and to
+ * protect consistency between inode size and cl_object
+ * attributes. cl_object_size_lock() protects consistency between cl_attr's of
+ * top-object and sub-objects.
+ *
+ * \param start   byte offset of the first byte accessed
+ * \param count   number of bytes accessed
+ * \param exceed  optional (may be NULL); set to 1 when, after a successful
+ *		  glimpse, the page index of \a start lies beyond the last
+ *		  page of the file. It is never cleared here.
+ *
+ * \retval 0      success
+ * \retval other  error from cl_object_attr_get() or cl_glimpse_lock()
+ */
+int ccc_prep_size(const struct lu_env *env, struct cl_object *obj,
+		  struct cl_io *io, loff_t start, size_t count, int *exceed)
+{
+	struct cl_attr *attr  = ccc_env_thread_attr(env);
+	struct inode   *inode = ccc_object_inode(obj);
+	loff_t	  pos   = start + count - 1;
+	loff_t kms;
+	int result;
+
+	/*
+	 * Consistency guarantees: following possibilities exist for the
+	 * relation between region being accessed and real file size at this
+	 * moment:
+	 *
+	 *  (A): the region is completely inside of the file;
+	 *
+	 *  (B-x): x bytes of region are inside of the file, the rest is
+	 *  outside;
+	 *
+	 *  (C): the region is completely outside of the file.
+	 *
+	 * This classification is stable under DLM lock already acquired by
+	 * the caller, because to change the class, other client has to take
+	 * DLM lock conflicting with our lock. Also, any updates to ->i_size
+	 * by other threads on this client are serialized by
+	 * ll_inode_size_lock(). This guarantees that short reads are handled
+	 * correctly in the face of concurrent writes and truncates.
+	 */
+	ccc_object_size_lock(obj);
+	result = cl_object_attr_get(env, obj, attr);
+	if (result == 0) {
+		kms = attr->cat_kms;
+		if (pos > kms) {
+			/*
+			 * A glimpse is necessary to determine whether we
+			 * return a short read (B) or some zeroes at the end
+			 * of the buffer (C)
+			 */
+			/* the size locks are dropped before glimpsing; this
+			 * branch returns without touching them again */
+			ccc_object_size_unlock(obj);
+			result = cl_glimpse_lock(env, io, inode, obj, 0);
+			if (result == 0 && exceed != NULL) {
+				/* If objective page index exceed end-of-file
+				 * page index, return directly. Do not expect
+				 * kernel will check such case correctly.
+				 * linux-2.6.18-128.1.1 miss to do that.
+				 * --bug 17336 */
+				loff_t size = cl_isize_read(inode);
+				unsigned long cur_index = start >> PAGE_CACHE_SHIFT;
+
+				if ((size == 0 && cur_index != 0) ||
+				    (((size - 1) >> PAGE_CACHE_SHIFT) < cur_index)) {
+					*exceed = 1;
+				}
+			}
+			return result;
+		} else {
+			/*
+			 * region is within kms and, hence, within real file
+			 * size (A). We need to increase i_size to cover the
+			 * read region so that generic_file_read() will do its
+			 * job, but that doesn't mean the kms size is
+			 * _correct_, it is only the _minimum_ size. If
+			 * someone does a stat they will get the correct size
+			 * which will always be >= the kms value here.
+			 * b=11081
+			 */
+			if (cl_isize_read(inode) < kms) {
+				cl_isize_write_nolock(inode, kms);
+				CDEBUG(D_VFSTRACE,
+				       DFID" updating i_size "LPU64"\n",
+				       PFID(lu_object_fid(&obj->co_lu)),
+				       (__u64)cl_isize_read(inode));
+
+			}
+		}
+	}
+	ccc_object_size_unlock(obj);
+	return result;
+}
+
+/*****************************************************************************
+ *
+ * Transfer operations.
+ *
+ */
+
+/**
+ * Completion handler for the ccc slice of a transfer request.
+ *
+ * A positive \a ioret is handed to cl_stats_tally() together with the
+ * request type; the ccc_req containing \a slice is then freed back to
+ * ccc_req_kmem.
+ */
+void ccc_req_completion(const struct lu_env *env,
+			const struct cl_req_slice *slice, int ioret)
+{
+	struct ccc_req *vrq = cl2ccc_req(slice);
+
+	if (ioret > 0)
+		cl_stats_tally(slice->crs_dev, slice->crs_req->crq_type, ioret);
+
+	OBD_SLAB_FREE_PTR(vrq, ccc_req_kmem);
+}
+
+/**
+ * Implementation of struct cl_req_operations::cro_attr_set() for ccc
+ * layer. ccc is responsible for
+ *
+ *    - o_[mac]time
+ *
+ *    - o_mode
+ *
+ *    - o_parent_seq
+ *
+ *    - o_[ug]id
+ *
+ *    - o_parent_oid
+ *
+ *    - o_parent_ver
+ *
+ *    - o_ioepoch,
+ *
+ *  and capability.
+ *
+ * Only attributes that are both requested in \a flags and selected here are
+ * copied from the inode into attr->cra_oa. The parent fid and the job id
+ * are always filled in.
+ */
+void ccc_req_attr_set(const struct lu_env *env,
+		      const struct cl_req_slice *slice,
+		      const struct cl_object *obj,
+		      struct cl_req_attr *attr, obd_valid flags)
+{
+	struct inode *inode = ccc_object_inode(obj);
+	struct obdo *oa = attr->cra_oa;
+	obd_flag valid_flags = OBD_MD_FLTYPE;
+
+	/* Look up the capability only when the caller asked for it. */
+	if (flags & OBD_MD_FLOSSCAPA) {
+		LASSERT(attr->cra_capa == NULL);
+		attr->cra_capa = cl_capa_lookup(inode,
+						slice->crs_req->crq_type);
+	}
+
+	/* Writes that carry an epoch also send times and ownership. */
+	if (slice->crs_req->crq_type == CRT_WRITE &&
+	    (flags & OBD_MD_FLEPOCH) != 0) {
+		oa->o_valid |= OBD_MD_FLEPOCH;
+		oa->o_ioepoch = cl_i2info(inode)->lli_ioepoch;
+		valid_flags |= OBD_MD_FLMTIME | OBD_MD_FLCTIME |
+			       OBD_MD_FLUID | OBD_MD_FLGID;
+	}
+
+	obdo_from_inode(oa, inode, valid_flags & flags);
+	obdo_set_parent_fid(oa, &cl_i2info(inode)->lli_fid);
+	memcpy(attr->cra_jobid, cl_i2info(inode)->lli_jobid,
+	       JOBSTATS_JOBID_SIZE);
+}
+
+/*
+ * cl_req operations of the ccc layer: attribute setup is done by
+ * ccc_req_attr_set() and completion by ccc_req_completion().
+ */
+const struct cl_req_operations ccc_req_ops = {
+	.cro_attr_set   = ccc_req_attr_set,
+	.cro_completion = ccc_req_completion
+};
+
+/**
+ * Run a CIT_SETATTR cl_io against the cl_object of \a inode.
+ *
+ * Times, size and the valid mask are copied from \a attr into the per-thread
+ * io, together with \a capa. The io is initialised, looped and finalised;
+ * the whole sequence is repeated for as long as the io reports
+ * ci_need_restart after cl_io_fini().
+ *
+ * \retval result of cl_io_loop(), or io->ci_result when cl_io_init() fails,
+ *	   or PTR_ERR() of the environment when none can be obtained
+ */
+int cl_setattr_ost(struct inode *inode, const struct iattr *attr,
+		   struct obd_capa *capa)
+{
+	struct lu_env *env;
+	struct cl_io *io;
+	int refcheck;
+	int result;
+
+	ENTRY;
+
+	env = cl_env_get(&refcheck);
+	if (IS_ERR(env))
+		RETURN(PTR_ERR(env));
+
+	io = ccc_env_thread_io(env);
+	io->ci_obj = cl_i2info(inode)->lli_clob;
+
+	io->u.ci_setattr.sa_attr.lvb_atime = LTIME_S(attr->ia_atime);
+	io->u.ci_setattr.sa_attr.lvb_mtime = LTIME_S(attr->ia_mtime);
+	io->u.ci_setattr.sa_attr.lvb_ctime = LTIME_S(attr->ia_ctime);
+	io->u.ci_setattr.sa_attr.lvb_size = attr->ia_size;
+	io->u.ci_setattr.sa_valid = attr->ia_valid;
+	io->u.ci_setattr.sa_capa = capa;
+
+	do {
+		if (cl_io_init(env, io, CIT_SETATTR, io->ci_obj) != 0) {
+			result = io->ci_result;
+		} else {
+			struct ccc_io *cio = ccc_env_io(env);
+
+			/* populate the file descriptor for ftruncate to honor
+			 * group lock - see LU-787 */
+			if (attr->ia_valid & ATTR_FILE)
+				cio->cui_fd = cl_iattr2fd(inode, attr);
+
+			result = cl_io_loop(env, io);
+		}
+		cl_io_fini(env, io);
+	} while (unlikely(io->ci_need_restart));
+
+	cl_env_put(env, &refcheck);
+	RETURN(result);
+}
+
+/*****************************************************************************
+ *
+ * Type conversions.
+ *
+ */
+
+/** Returns the lu_device embedded in the cl_device of \a vdv. */
+struct lu_device *ccc2lu_dev(struct ccc_device *vdv)
+{
+	struct cl_device *cl = &vdv->cdv_cl;
+
+	return &cl->cd_lu_dev;
+}
+
+/** Converts a lu_device pointer back to the ccc_device that embeds it. */
+struct ccc_device *lu2ccc_dev(const struct lu_device *d)
+{
+	struct ccc_device *vdv;
+
+	vdv = container_of0(d, struct ccc_device, cdv_cl.cd_lu_dev);
+	return vdv;
+}
+
+/** Converts a cl_device pointer back to the ccc_device that embeds it. */
+struct ccc_device *cl2ccc_dev(const struct cl_device *d)
+{
+	struct ccc_device *vdv;
+
+	vdv = container_of0(d, struct ccc_device, cdv_cl);
+	return vdv;
+}
+
+/** Returns the lu_object embedded in the cl_object of \a vob. */
+struct lu_object *ccc2lu(struct ccc_object *vob)
+{
+	struct cl_object *clob = &vob->cob_cl;
+
+	return &clob->co_lu;
+}
+
+/** Converts a lu_object pointer back to the ccc_object that embeds it. */
+struct ccc_object *lu2ccc(const struct lu_object *obj)
+{
+	struct ccc_object *vob;
+
+	vob = container_of0(obj, struct ccc_object, cob_cl.co_lu);
+	return vob;
+}
+
+/** Converts a cl_object pointer back to the ccc_object that embeds it. */
+struct ccc_object *cl2ccc(const struct cl_object *obj)
+{
+	struct ccc_object *vob;
+
+	vob = container_of0(obj, struct ccc_object, cob_cl);
+	return vob;
+}
+
+/** Converts a lock slice pointer back to the ccc_lock that embeds it. */
+struct ccc_lock *cl2ccc_lock(const struct cl_lock_slice *slice)
+{
+	struct ccc_lock *vlk;
+
+	vlk = container_of(slice, struct ccc_lock, clk_cl);
+	return vlk;
+}
+
+/**
+ * Converts an io slice pointer back to the ccc_io that embeds it and asserts
+ * that it is the ccc_io belonging to \a env.
+ */
+struct ccc_io *cl2ccc_io(const struct lu_env *env,
+			 const struct cl_io_slice *slice)
+{
+	struct ccc_io *cio = container_of(slice, struct ccc_io, cui_cl);
+
+	LASSERT(cio == ccc_env_io(env));
+	return cio;
+}
+
+/** Converts a request slice pointer back to the ccc_req that embeds it. */
+struct ccc_req *cl2ccc_req(const struct cl_req_slice *slice)
+{
+	struct ccc_req *vrq;
+
+	vrq = container_of0(slice, struct ccc_req, crq_cl);
+	return vrq;
+}
+
+/** Returns the VM page recorded in the ccc page that owns \a slice. */
+struct page *cl2vm_page(const struct cl_page_slice *slice)
+{
+	struct ccc_page *cpg = cl2ccc_page(slice);
+
+	return cpg->cpg_page;
+}
+
+/*****************************************************************************
+ *
+ * Accessors.
+ *
+ */
+/**
+ * Consistency check for a ccc object: the inode must point back at \a obj
+ * through lli_clob, and its mode must be either a regular file or zero.
+ *
+ * \retval 1 if the invariant holds, 0 otherwise
+ */
+int ccc_object_invariant(const struct cl_object *obj)
+{
+	struct inode *inode = ccc_object_inode(obj);
+	struct cl_inode_info *lli = cl_i2info(inode);
+
+	if (lli->lli_clob != obj)
+		return 0;
+
+	/* i_mode of unlinked inode is zeroed. */
+	return S_ISREG(cl_inode_mode(inode)) || cl_inode_mode(inode) == 0;
+}
+
+/** Returns the inode recorded in the ccc_object that embeds \a obj. */
+struct inode *ccc_object_inode(const struct cl_object *obj)
+{
+	struct ccc_object *vob = cl2ccc(obj);
+
+	return vob->cob_inode;
+}
+
+/**
+ * Returns the cl_page stored in the ->private field of \a vmpage, without
+ * acquiring an additional reference to it. This is an unsafe version of
+ * cl_vmpage_page() that can only be used under vmpage lock.
+ */
+struct cl_page *ccc_vmpage_page_transient(struct page *vmpage)
+{
+	struct cl_page *page;
+
+	KLASSERT(PageLocked(vmpage));
+	page = (struct cl_page *)vmpage->private;
+	return page;
+}
+
+/**
+ * Initialize or update CLIO structures for regular files when new
+ * meta-data arrives from the server.
+ *
+ * \param inode regular file inode
+ * \param md    new file metadata from MDS
+ * - allocates cl_object if the inode has none yet,
+ * - otherwise passes the new configuration to the existing object via
+ *   cl_conf_set().
+ *
+ * \retval 0 on success, negative errno otherwise (also logged via CERROR,
+ *	   except when no environment could be obtained)
+ */
+int cl_file_inode_init(struct inode *inode, struct lustre_md *md)
+{
+	struct cl_object_conf conf = {
+		.coc_inode = inode,
+		.u = {
+			.coc_md    = md
+		}
+	};
+	struct cl_inode_info *lli;
+	struct lu_site *site;
+	struct lu_fid *fid;
+	struct lu_env *env;
+	int refcheck;
+	int rc = 0;
+
+	LASSERT(md->body->valid & OBD_MD_FLID);
+	LASSERT(S_ISREG(cl_inode_mode(inode)));
+
+	env = cl_env_get(&refcheck);
+	if (IS_ERR(env))
+		return PTR_ERR(env);
+
+	site = cl_i2sbi(inode)->ll_site;
+	lli  = cl_i2info(inode);
+	fid  = &lli->lli_fid;
+	LASSERT(fid_is_sane(fid));
+
+	if (lli->lli_clob != NULL) {
+		/* Object already exists: just push the new configuration. */
+		rc = cl_conf_set(env, lli->lli_clob, &conf);
+	} else {
+		struct cl_object *clob;
+
+		/* clob is slave of inode, empty lli_clob means for new inode,
+		 * there is no clob in cache with the given fid, so it is
+		 * unnecessary to perform lookup-alloc-lookup-insert, just
+		 * alloc and insert directly. */
+		LASSERT(inode->i_state & I_NEW);
+		conf.coc_lu.loc_flags = LOC_F_NEW;
+		clob = cl_object_find(env, lu2cl_dev(site->ls_top_dev),
+				      fid, &conf);
+		if (IS_ERR(clob)) {
+			rc = PTR_ERR(clob);
+		} else {
+			/*
+			 * No locking is necessary, as new inode is
+			 * locked by I_NEW bit.
+			 */
+			lli->lli_clob = clob;
+			lli->lli_has_smd = md->lsm != NULL;
+			lu_object_ref_add(&clob->co_lu, "inode", inode);
+		}
+	}
+
+	cl_env_put(env, &refcheck);
+
+	if (rc != 0)
+		CERROR("Failure to initialize cl object "DFID": %d\n",
+		       PFID(fid), rc);
+	return rc;
+}
+
+/**
+ * Wait for others to drop their references to the object first, then drop
+ * the last one, which will lead to the object being destroyed immediately.
+ * Must be called after cl_object_kill() against this object.
+ *
+ * The reason we want to do this is: destroying top object will wait for sub
+ * objects being destroyed first, so we can't let bottom layer (e.g. from ASTs)
+ * to initiate top object destroying which may deadlock. See bz22520.
+ *
+ * The wait is uninterruptible and happens on the lsb_marche_funebre wait
+ * queue of the site bucket selected by the object's fid.
+ */
+static void cl_object_put_last(struct lu_env *env, struct cl_object *obj)
+{
+	struct lu_object_header *header = obj->co_lu.lo_header;
+	wait_queue_t	   waiter;
+
+	/* Fast path: nothing to wait for when ours is the only reference. */
+	if (unlikely(atomic_read(&header->loh_ref) != 1)) {
+		struct lu_site *site = obj->co_lu.lo_dev->ld_site;
+		struct lu_site_bkt_data *bkt;
+
+		bkt = lu_site_bkt_from_fid(site, &header->loh_fid);
+
+		init_waitqueue_entry_current(&waiter);
+		add_wait_queue(&bkt->lsb_marche_funebre, &waiter);
+
+		while (1) {
+			/* Set the task state before re-checking the count so
+			 * that a wake-up between the check and the sleep is
+			 * not lost. */
+			set_current_state(TASK_UNINTERRUPTIBLE);
+			if (atomic_read(&header->loh_ref) == 1)
+				break;
+			waitq_wait(&waiter, TASK_UNINTERRUPTIBLE);
+		}
+
+		set_current_state(TASK_RUNNING);
+		remove_wait_queue(&bkt->lsb_marche_funebre, &waiter);
+	}
+
+	cl_object_put(env, obj);
+}
+
+/**
+ * Detach and destroy the cl_object attached to \a inode, if there is one.
+ *
+ * When no environment can be obtained from cl_env_get(), the preallocated
+ * ccc_inode_fini_env is used instead, serialised by ccc_inode_fini_guard.
+ */
+void cl_inode_fini(struct inode *inode)
+{
+	struct cl_inode_info *lli = cl_i2info(inode);
+	struct cl_object *clob = lli->lli_clob;
+	struct lu_env *env;
+	void *cookie;
+	int refcheck;
+	int emergency;
+
+	if (clob == NULL)
+		return;
+
+	cookie = cl_env_reenter();
+	env = cl_env_get(&refcheck);
+	emergency = IS_ERR(env);
+	if (emergency) {
+		mutex_lock(&ccc_inode_fini_guard);
+		LASSERT(ccc_inode_fini_env != NULL);
+		cl_env_implant(ccc_inode_fini_env, &refcheck);
+		env = ccc_inode_fini_env;
+	}
+
+	/*
+	 * cl_object cache is a slave to inode cache (which, in turn
+	 * is a slave to dentry cache), don't keep cl_object in memory
+	 * when its master is evicted.
+	 */
+	cl_object_kill(env, clob);
+	lu_object_ref_del(&clob->co_lu, "inode", inode);
+	cl_object_put_last(env, clob);
+	lli->lli_clob = NULL;
+
+	if (!emergency) {
+		cl_env_put(env, &refcheck);
+	} else {
+		cl_env_unplant(ccc_inode_fini_env, &refcheck);
+		mutex_unlock(&ccc_inode_fini_guard);
+	}
+	cl_env_reexit(cookie);
+}
+
+/**
+ * Return the IF_* type for the given lu_dirent entry.
+ * The IF_* flag should be converted to the particular OS file type in
+ * the platform llite module.
+ *
+ * The type is read from the luda_type record that follows the entry name,
+ * at the name length rounded up with a sizeof(struct luda_type) - 1 mask.
+ * Entries without LUDA_TYPE in lde_attrs yield 0.
+ */
+__u16 ll_dirent_type_get(struct lu_dirent *ent)
+{
+	const unsigned align = sizeof(struct luda_type) - 1;
+	struct luda_type *lt;
+	int len;
+
+	if (!(le32_to_cpu(ent->lde_attrs) & LUDA_TYPE))
+		return 0;
+
+	len = le16_to_cpu(ent->lde_namelen);
+	len = (len + align) & ~align;
+	lt = (void *)ent->lde_name + len;
+
+	return IFTODT(le16_to_cpu(lt->lt_type));
+}
+
+/**
+ * Build an inode number from the passed @fid.
+ *
+ * The 32-bit flattening is used when longs are 32 bits wide or when
+ * @api32 is non-zero; otherwise the full flattening is used.
+ */
+__u64 cl_fid_build_ino(const struct lu_fid *fid, int api32)
+{
+	if (BITS_PER_LONG != 32 && !api32)
+		RETURN(fid_flatten(fid));
+
+	RETURN(fid_flatten32(fid));
+}
+
+/**
+ * Build an inode generation from the passed @fid. If our FID overflows the
+ * 32-bit inode number then return a non-zero generation to distinguish them.
+ *
+ * IGIF fids use lu_igif_gen(); all others use the upper 32 bits of
+ * fid_flatten().
+ */
+__u32 cl_fid_build_gen(const struct lu_fid *fid)
+{
+	__u32 gen;
+	ENTRY;
+
+	if (fid_is_igif(fid))
+		gen = lu_igif_gen(fid);
+	else
+		gen = (fid_flatten(fid) >> 32);
+
+	RETURN(gen);
+}
+
+/* Return the lsm of the cl_object attached to \a inode via lov_lsm_get().
+ *
+ * lsm is unreliable after hsm implementation as layout can be changed at
+ * any time. This is only to support old, non-clio-ized interfaces. It will
+ * cause deadlock if clio operations are called with this extra layout refcount
+ * because in case the layout changed during the IO, ll_layout_refresh() will
+ * have to wait for the refcount to become zero to destroy the older layout.
+ *
+ * Notice that the lsm returned by this function may not be valid unless called
+ * inside layout lock - MDS_INODELOCK_LAYOUT. */
+struct lov_stripe_md *ccc_inode_lsm_get(struct inode *inode)
+{
+	struct cl_object *clob = cl_i2info(inode)->lli_clob;
+
+	return lov_lsm_get(clob);
+}
+
+/*
+ * Hand \a lsm back through lov_lsm_put() on the cl_object attached to
+ * \a inode. Counterpart of ccc_inode_lsm_get().
+ *
+ * 'inline' is placed before the return type: writing it after the type
+ * specifier is an old-style declaration that GCC warns about.
+ */
+inline void ccc_inode_lsm_put(struct inode *inode, struct lov_stripe_md *lsm)
+{
+	lov_lsm_put(cl_i2info(inode)->lli_clob, lsm);
+}

diff --git a/drivers/staging/lustre/lustre/lclient/lcommon_misc.c b/drivers/staging/lustre/lustre/lclient/lcommon_misc.c
new file mode 100644
index 0000000..8ecbef9
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lclient/lcommon_misc.c

@@ -0,0 +1,194 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * cl code shared between vvp and liblustre (and other Lustre clients in the
+ * future).
+ *
+ */
+#include <obd_class.h>
+#include <obd_support.h>
+#include <obd.h>
+#include <cl_object.h>
+#include <lclient.h>
+
+#include <lustre_lite.h>
+
+
+/* Initialize the default and maximum LOV EA and cookie sizes.  This allows
+ * us to make MDS RPCs with large enough reply buffers to hold the
+ * maximum-sized (= maximum striped) EA and cookie without having to
+ * calculate this (via a call into the LOV + OSCs) each time we make an RPC.
+ *
+ * The maximum sizes are based on the target count reported by the LOV
+ * descriptor, capped at LOV_MAX_STRIPE_COUNT; the default EA size is based
+ * on the descriptor's default stripe count.
+ *
+ * Returns the error from obd_get_info() if the descriptor cannot be read,
+ * otherwise the result of md_init_ea_size(). */
+int cl_init_ea_size(struct obd_export *md_exp, struct obd_export *dt_exp)
+{
+	struct lov_stripe_md lsm = { .lsm_magic = LOV_MAGIC_V3 };
+	struct lov_desc desc;
+	__u32 valsize = sizeof(struct lov_desc);
+	int max_easize, def_easize, cookiesize;
+	__u16 stripes;
+	int rc;
+	ENTRY;
+
+	rc = obd_get_info(NULL, dt_exp, sizeof(KEY_LOVDESC), KEY_LOVDESC,
+			  &valsize, &desc, NULL);
+	if (rc != 0)
+		RETURN(rc);
+
+	stripes = min(desc.ld_tgt_count, (__u32)LOV_MAX_STRIPE_COUNT);
+	cookiesize = stripes * sizeof(struct llog_cookie);
+
+	/* EA size for the widest possible striping ... */
+	lsm.lsm_stripe_count = stripes;
+	max_easize = obd_size_diskmd(dt_exp, &lsm);
+
+	/* ... and for the default striping. */
+	lsm.lsm_stripe_count = desc.ld_default_stripe_count;
+	def_easize = obd_size_diskmd(dt_exp, &lsm);
+
+	CDEBUG(D_HA, "updating max_mdsize/max_cookiesize: %d/%d\n",
+	       max_easize, cookiesize);
+
+	rc = md_init_ea_size(md_exp, max_easize, def_easize, cookiesize);
+	RETURN(rc);
+}
+
+/**
+ * This function is used as an upcall-callback hooked by liblustre and llite
+ * clients into obd_notify() listeners chain to handle notifications about
+ * change of import connect_flags. See llu_fsswop_mount() and
+ * lustre_common_fill_super().
+ *
+ * Only notifications from devices of type LUSTRE_OSC_NAME are accepted: the
+ * connect flags of the watched import are AND-ed into lco_flags under
+ * lco_lock, and the EA sizes are refreshed when lco_dt_exp is set.
+ *
+ * \retval 0       notification handled
+ * \retval -EINVAL the notifier is not an OSC device
+ */
+int cl_ocd_update(struct obd_device *host,
+		  struct obd_device *watched,
+		  enum obd_notify_event ev, void *owner, void *data)
+{
+	struct lustre_client_ocd *lco = owner;
+	struct client_obd *cli;
+	__u64 flags;
+
+	ENTRY;
+	if (strcmp(watched->obd_type->typ_name, LUSTRE_OSC_NAME) != 0) {
+		CERROR("unexpected notification from %s %s!\n",
+		       watched->obd_type->typ_name,
+		       watched->obd_name);
+		RETURN(-EINVAL);
+	}
+
+	cli = &watched->u.cli;
+	flags = cli->cl_import->imp_connect_data.ocd_connect_flags;
+	CDEBUG(D_SUPER, "Changing connect_flags: "LPX64" -> "LPX64"\n",
+	       lco->lco_flags, flags);
+
+	mutex_lock(&lco->lco_lock);
+	lco->lco_flags &= flags;
+	/* for each osc event update ea size */
+	if (lco->lco_dt_exp)
+		cl_init_ea_size(lco->lco_md_exp, lco->lco_dt_exp);
+	mutex_unlock(&lco->lco_lock);
+
+	RETURN(0);
+}
+
+/* Scope string passed to cl_lock_request() and cl_lock_release() for
+ * group locks. */
+#define GROUPLOCK_SCOPE "grouplock"
+
+/**
+ * Acquire a CLM_GROUP lock with group id \a gid covering the whole of
+ * \a obj (pages 0 to CL_PAGE_EOF).
+ *
+ * On success the environment, io, lock and gid are recorded in \a cg so
+ * that cl_put_grouplock() can release them later.
+ *
+ * \param nonblock  when non-zero, CEF_NONBLOCK is added to the enqueue flags
+ *
+ * \retval 0 on success, negative errno from cl_env_get(), cl_io_init() or
+ *	     cl_lock_request() otherwise
+ */
+int cl_get_grouplock(struct cl_object *obj, unsigned long gid, int nonblock,
+		     struct ccc_grouplock *cg)
+{
+	struct lu_env	  *env;
+	struct cl_io	   *io;
+	struct cl_lock	 *lock;
+	struct cl_lock_descr   *descr;
+	__u32		   enqflags;
+	int		     refcheck;
+	int		     rc;
+
+	env = cl_env_get(&refcheck);
+	if (IS_ERR(env))
+		return PTR_ERR(env);
+
+	io = ccc_env_thread_io(env);
+	io->ci_obj = obj;
+	io->ci_ignore_layout = 1;
+
+	rc = cl_io_init(env, io, CIT_MISC, io->ci_obj);
+	if (rc) {
+		LASSERT(rc < 0);
+		cl_env_put(env, &refcheck);
+		return rc;
+	}
+
+	/* Describe a group lock over the entire object. */
+	descr = &ccc_env_info(env)->cti_descr;
+	descr->cld_obj = obj;
+	descr->cld_start = 0;
+	descr->cld_end = CL_PAGE_EOF;
+	descr->cld_gid = gid;
+	descr->cld_mode = CLM_GROUP;
+
+	enqflags = CEF_MUST | (nonblock ? CEF_NONBLOCK : 0);
+	descr->cld_enq_flags = enqflags;
+
+	lock = cl_lock_request(env, io, descr, GROUPLOCK_SCOPE, current);
+	if (IS_ERR(lock)) {
+		cl_io_fini(env, io);
+		cl_env_put(env, &refcheck);
+		return PTR_ERR(lock);
+	}
+
+	/* NOTE(review): the environment is obtained a second time here and
+	 * asserted to be the same one; it is then unplanted rather than put,
+	 * presumably so that it stays alive for the lifetime of the group
+	 * lock - confirm against cl_env_get()/cl_env_unplant(). */
+	cg->cg_env  = cl_env_get(&refcheck);
+	cg->cg_io   = io;
+	cg->cg_lock = lock;
+	cg->cg_gid  = gid;
+	LASSERT(cg->cg_env == env);
+
+	cl_env_unplant(env, &refcheck);
+	return 0;
+}
+
+/**
+ * Release a group lock recorded in \a cg by cl_get_grouplock(): the lock is
+ * unused and released, the io is finalised and the environment is put.
+ *
+ * \a cg must hold a non-NULL environment and a non-zero gid.
+ */
+void cl_put_grouplock(struct ccc_grouplock *cg)
+{
+	struct lu_env  *env  = cg->cg_env;
+	struct cl_io   *io   = cg->cg_io;
+	struct cl_lock *lock = cg->cg_lock;
+	int	     refcheck;
+
+	LASSERT(cg->cg_env);
+	LASSERT(cg->cg_gid);
+
+	/* NOTE(review): implant followed by an immediate put presumably
+	 * re-attaches the environment to this thread and drops one of the two
+	 * references taken in cl_get_grouplock() - confirm. */
+	cl_env_implant(env, &refcheck);
+	cl_env_put(env, &refcheck);
+
+	cl_unuse(env, lock);
+	cl_lock_release(env, lock, GROUPLOCK_SCOPE, current);
+	cl_io_fini(env, io);
+	/* Final put of the environment, without a refcheck cookie. */
+	cl_env_put(env, NULL);
+}

diff --git a/drivers/staging/lustre/lustre/ldlm/interval_tree.c b/drivers/staging/lustre/lustre/ldlm/interval_tree.c
new file mode 100644
index 0000000..ce90c7e
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ldlm/interval_tree.c

@@ -0,0 +1,764 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ldlm/interval_tree.c
+ *
+ * Interval tree library used by ldlm extent lock code
+ *
+ * Author: Huang Wei <huangwei@clusterfs.com>
+ * Author: Jay Xiong <jinshan.xiong@sun.com>
+ */
+# include <lustre_dlm.h>
+#include <obd_support.h>
+#include <interval_tree.h>
+
+/* Node colours stored in interval_node::in_color for red-black balancing. */
+enum {
+	INTERVAL_RED = 0,
+	INTERVAL_BLACK = 1
+};
+
+/* Non-zero iff @node is the left child of its parent, which must exist. */
+static inline int node_is_left_child(struct interval_node *node)
+{
+	struct interval_node *parent;
+
+	LASSERT(node->in_parent != NULL);
+	parent = node->in_parent;
+	return parent->in_left == node;
+}
+
+/* Non-zero iff @node is the right child of its parent, which must exist. */
+static inline int node_is_right_child(struct interval_node *node)
+{
+	struct interval_node *parent;
+
+	LASSERT(node->in_parent != NULL);
+	parent = node->in_parent;
+	return parent->in_right == node;
+}
+
+/* Returns 1 when @node is coloured red, 0 otherwise. */
+static inline int node_is_red(struct interval_node *node)
+{
+	if (node->in_color == INTERVAL_RED)
+		return 1;
+	return 0;
+}
+
+/* Returns 1 when @node is coloured black, 0 otherwise. */
+static inline int node_is_black(struct interval_node *node)
+{
+	if (node->in_color == INTERVAL_BLACK)
+		return 1;
+	return 0;
+}
+
+/*
+ * Three-way comparison of two extents, ordering by start first and by end
+ * second. Returns -1, 0 or 1.
+ */
+static inline int extent_compare(struct interval_node_extent *e1,
+				 struct interval_node_extent *e2)
+{
+	if (e1->start < e2->start)
+		return -1;
+	if (e1->start > e2->start)
+		return 1;
+
+	/* Starts are equal: the end decides. */
+	if (e1->end < e2->end)
+		return -1;
+	if (e1->end > e2->end)
+		return 1;
+	return 0;
+}
+
+/* Returns 1 when both extents have the same start and end, 0 otherwise. */
+static inline int extent_equal(struct interval_node_extent *e1,
+			       struct interval_node_extent *e2)
+{
+	if (e1->start != e2->start)
+		return 0;
+	return e1->end == e2->end;
+}
+
+/*
+ * Returns 1 when the two extents share at least one point (both ends are
+ * inclusive), 0 otherwise.
+ */
+static inline int extent_overlapped(struct interval_node_extent *e1,
+				    struct interval_node_extent *e2)
+{
+	/* Disjoint iff one extent starts after the other one ends. */
+	return !(e1->start > e2->end || e2->start > e1->end);
+}
+
+/* Three-way comparison of two nodes by their extents; see extent_compare(). */
+static inline int node_compare(struct interval_node *n1,
+			       struct interval_node *n2)
+{
+	struct interval_node_extent *lhs = &n1->in_extent;
+	struct interval_node_extent *rhs = &n2->in_extent;
+
+	return extent_compare(lhs, rhs);
+}
+
+/* Non-zero when both nodes carry identical extents; see extent_equal(). */
+static inline int node_equal(struct interval_node *n1,
+			     struct interval_node *n2)
+{
+	struct interval_node_extent *lhs = &n1->in_extent;
+	struct interval_node_extent *rhs = &n2->in_extent;
+
+	return extent_equal(lhs, rhs);
+}
+
+/* Returns the larger of two unsigned 64-bit values. */
+static inline __u64 max_u64(__u64 x, __u64 y)
+{
+	if (x > y)
+		return x;
+	return y;
+}
+
+/* Returns the smaller of two unsigned 64-bit values. */
+static inline __u64 min_u64(__u64 x, __u64 y)
+{
+	if (x < y)
+		return x;
+	return y;
+}
+
+/*
+ * Loop header that walks the tree rooted at @root from interval_first()
+ * via interval_next(), binding each visited node to @node.
+ */
+#define interval_for_each(node, root)		   \
+for (node = interval_first(root); node != NULL;	 \
+     node = interval_next(node))
+
+/*
+ * Loop header that walks the tree rooted at @root from interval_last()
+ * via interval_prev(), binding each visited node to @node.
+ */
+#define interval_for_each_reverse(node, root)	   \
+for (node = interval_last(root); node != NULL;	  \
+     node = interval_prev(node))
+
+/* Returns the leftmost node of the subtree rooted at @node, or NULL when
+ * @node is NULL. */
+static struct interval_node *interval_first(struct interval_node *node)
+{
+	ENTRY;
+
+	if (node == NULL)
+		RETURN(NULL);
+
+	for (; node->in_left != NULL; node = node->in_left)
+		;
+
+	RETURN(node);
+}
+
+/* Returns the rightmost node of the subtree rooted at @node, or NULL when
+ * @node is NULL. */
+static struct interval_node *interval_last(struct interval_node *node)
+{
+	ENTRY;
+
+	if (node == NULL)
+		RETURN(NULL);
+
+	for (; node->in_right != NULL; node = node->in_right)
+		;
+
+	RETURN(node);
+}
+
+/*
+ * Returns the in-order successor of @node, or NULL when @node is NULL or has
+ * no successor.
+ */
+static struct interval_node *interval_next(struct interval_node *node)
+{
+	ENTRY;
+
+	if (node == NULL)
+		RETURN(NULL);
+
+	/* The successor is the leftmost node of the right subtree, if any. */
+	if (node->in_right != NULL)
+		RETURN(interval_first(node->in_right));
+
+	/* Otherwise climb until we leave a left subtree. */
+	while (node->in_parent != NULL) {
+		if (!node_is_right_child(node))
+			break;
+		node = node->in_parent;
+	}
+	RETURN(node->in_parent);
+}
+
+/*
+ * Returns the in-order predecessor of @node, or NULL when @node is NULL or
+ * has no predecessor.
+ */
+static struct interval_node *interval_prev(struct interval_node *node)
+{
+	ENTRY;
+
+	if (node == NULL)
+		RETURN(NULL);
+
+	/* The predecessor is the rightmost node of the left subtree, if any. */
+	if (node->in_left != NULL)
+		RETURN(interval_last(node->in_left));
+
+	/* Otherwise climb until we leave a right subtree. */
+	while (node->in_parent != NULL) {
+		if (!node_is_left_child(node))
+			break;
+		node = node->in_parent;
+	}
+	RETURN(node->in_parent);
+}
+
+/*
+ * Calls @func(node, @data) on every node of the tree rooted at @root, from
+ * first to last, stopping early when @func returns INTERVAL_ITER_STOP.
+ *
+ * Returns the value of the last callback invocation, or INTERVAL_ITER_CONT
+ * when the tree is empty.
+ */
+enum interval_iter interval_iterate(struct interval_node *root,
+				    interval_callback_t func,
+				    void *data)
+{
+	enum interval_iter rc = INTERVAL_ITER_CONT;
+	struct interval_node *cur;
+	ENTRY;
+
+	for (cur = interval_first(root); cur != NULL;
+	     cur = interval_next(cur)) {
+		rc = func(cur, data);
+		if (rc == INTERVAL_ITER_STOP)
+			break;
+	}
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(interval_iterate);
+
+/*
+ * Calls @func(node, @data) on every node of the tree rooted at @root, from
+ * last to first, stopping early when @func returns INTERVAL_ITER_STOP.
+ *
+ * Returns the value of the last callback invocation, or INTERVAL_ITER_CONT
+ * when the tree is empty.
+ */
+enum interval_iter interval_iterate_reverse(struct interval_node *root,
+					    interval_callback_t func,
+					    void *data)
+{
+	enum interval_iter rc = INTERVAL_ITER_CONT;
+	struct interval_node *cur;
+	ENTRY;
+
+	for (cur = interval_last(root); cur != NULL;
+	     cur = interval_prev(cur)) {
+		rc = func(cur, data);
+		if (rc == INTERVAL_ITER_STOP)
+			break;
+	}
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(interval_iterate_reverse);
+
+/* Try to find a node with exactly the interval @ex in the tree rooted at
+ * @root. Returns a pointer to the node if found, otherwise NULL. */
+struct interval_node *interval_find(struct interval_node *root,
+				    struct interval_node_extent *ex)
+{
+	struct interval_node *cur = root;
+	int cmp;
+	ENTRY;
+
+	while (cur != NULL) {
+		cmp = extent_compare(ex, &cur->in_extent);
+		if (cmp == 0)
+			break;
+		/* Smaller extents live to the left, larger to the right. */
+		cur = cmp < 0 ? cur->in_left : cur->in_right;
+	}
+
+	RETURN(cur);
+}
+EXPORT_SYMBOL(interval_find);
+
+/*
+ * Fix up in_max_high after a rotation: @rotate inherits the old value of
+ * @node, and @node's value is recomputed as the maximum of its own high
+ * endpoint and the in_max_high of its current children.
+ */
+static void __rotate_change_maxhigh(struct interval_node *node,
+				    struct interval_node *rotate)
+{
+	__u64 subtree_max;
+
+	rotate->in_max_high = node->in_max_high;
+
+	subtree_max = interval_high(node);
+	if (node->in_left)
+		subtree_max = max_u64(subtree_max, node->in_left->in_max_high);
+	if (node->in_right)
+		subtree_max = max_u64(subtree_max,
+				      node->in_right->in_max_high);
+	node->in_max_high = subtree_max;
+}
+
+/* The left rotation "pivots" around the link from node to node->right:
+ * - node->right takes node's place under node's former parent (or becomes
+ *   the root), with node as its left child, and
+ * - the former left child of node->right becomes node's right child.
+ * in_max_high of both nodes is refreshed afterwards. */
+static void __rotate_left(struct interval_node *node,
+			  struct interval_node **root)
+{
+	struct interval_node *right = node->in_right;
+	struct interval_node *parent = node->in_parent;
+
+	/* Move right's left subtree under node. */
+	node->in_right = right->in_left;
+	if (node->in_right)
+		right->in_left->in_parent = node;
+
+	right->in_left = node;
+	right->in_parent = parent;
+	/* node->in_parent is still the old parent here, so the child-side
+	 * test below is valid. */
+	if (parent) {
+		if (node_is_left_child(node))
+			parent->in_left = right;
+		else
+			parent->in_right = right;
+	} else {
+		*root = right;
+	}
+	node->in_parent = right;
+
+	/* update max_high for node and right */
+	__rotate_change_maxhigh(node, right);
+}
+
+/* The right rotation "pivots" around the link from node to node->left:
+ * - node->left takes node's place under node's former parent (or becomes
+ *   the root), with node as its right child, and
+ * - the former right child of node->left becomes node's left child.
+ * in_max_high of both nodes is refreshed afterwards. */
+static void __rotate_right(struct interval_node *node,
+			   struct interval_node **root)
+{
+	struct interval_node *left = node->in_left;
+	struct interval_node *parent = node->in_parent;
+
+	/* Move left's right subtree under node. */
+	node->in_left = left->in_right;
+	if (node->in_left)
+		left->in_right->in_parent = node;
+	left->in_right = node;
+
+	left->in_parent = parent;
+	/* node->in_parent is still the old parent here, so the child-side
+	 * test below is valid. */
+	if (parent) {
+		if (node_is_right_child(node))
+			parent->in_right = left;
+		else
+			parent->in_left = left;
+	} else {
+		*root = left;
+	}
+	node->in_parent = left;
+
+	/* update max_high for node and left */
+	__rotate_change_maxhigh(node, left);
+}
+
+/* Exchange two struct interval_node pointer lvalues. Each argument is
+ * evaluated more than once. */
+#define interval_swap(a, b) do {			\
+	struct interval_node *c = a; a = b; b = c;      \
+} while (0)
+
+/*
+ * Operations INSERT and DELETE, when run on a tree with n keys,
+ * take O(logN) time. Because they modify the tree, the result
+ * may violate the red-black properties. To restore these properties,
+ * we must change the colors of some of the nodes in the tree
+ * and also change the pointer structure.
+ *
+ * This function restores the properties after @node has been linked into
+ * the tree as a red node; the root is always left black.
+ */
+static void interval_insert_color(struct interval_node *node,
+				  struct interval_node **root)
+{
+	struct interval_node *parent, *gparent;
+	ENTRY;
+
+	/* Loop while there is a red node with a red parent. */
+	while ((parent = node->in_parent) && node_is_red(parent)) {
+		gparent = parent->in_parent;
+		/* Parent is RED, so gparent must not be NULL */
+		if (node_is_left_child(parent)) {
+			struct interval_node *uncle;
+			uncle = gparent->in_right;
+			/* Red uncle: recolour and continue from gparent. */
+			if (uncle && node_is_red(uncle)) {
+				uncle->in_color = INTERVAL_BLACK;
+				parent->in_color = INTERVAL_BLACK;
+				gparent->in_color = INTERVAL_RED;
+				node = gparent;
+				continue;
+			}
+
+			/* node is an inner child: rotate it to the outside
+			 * first, then swap roles with parent. */
+			if (parent->in_right == node) {
+				__rotate_left(parent, root);
+				interval_swap(node, parent);
+			}
+
+			parent->in_color = INTERVAL_BLACK;
+			gparent->in_color = INTERVAL_RED;
+			__rotate_right(gparent, root);
+		} else {
+			/* Mirror image of the branch above. */
+			struct interval_node *uncle;
+			uncle = gparent->in_left;
+			if (uncle && node_is_red(uncle)) {
+				uncle->in_color = INTERVAL_BLACK;
+				parent->in_color = INTERVAL_BLACK;
+				gparent->in_color = INTERVAL_RED;
+				node = gparent;
+				continue;
+			}
+
+			if (node_is_left_child(node)) {
+				__rotate_right(parent, root);
+				interval_swap(node, parent);
+			}
+
+			parent->in_color = INTERVAL_BLACK;
+			gparent->in_color = INTERVAL_RED;
+			__rotate_left(gparent, root);
+		}
+	}
+
+	(*root)->in_color = INTERVAL_BLACK;
+	EXIT;
+}
+
+/*
+ * Insert @node into the tree at *@root.
+ *
+ * @node must not already be in a tree. If a node with an identical extent
+ * exists, that node is returned and @node is not linked. Otherwise @node is
+ * linked as a red leaf, the tree is rebalanced and NULL is returned.
+ */
+struct interval_node *interval_insert(struct interval_node *node,
+				      struct interval_node **root)
+{
+	struct interval_node **link = root;
+	struct interval_node *parent = NULL;
+	ENTRY;
+
+	LASSERT(!interval_is_intree(node));
+	while (*link != NULL) {
+		parent = *link;
+		if (node_equal(parent, node))
+			RETURN(parent);
+
+		/* max_high field must be updated after each iteration */
+		if (parent->in_max_high < interval_high(node))
+			parent->in_max_high = interval_high(node);
+
+		link = node_compare(node, parent) < 0 ? &parent->in_left :
+							&parent->in_right;
+	}
+
+	/* link node into the tree */
+	node->in_left = NULL;
+	node->in_right = NULL;
+	node->in_color = INTERVAL_RED;
+	node->in_parent = parent;
+	*link = node;
+
+	interval_insert_color(node, root);
+	node->in_intree = 1;
+
+	RETURN(NULL);
+}
+EXPORT_SYMBOL(interval_insert);
+
+/* Returns 1 when @node is NULL or coloured black, 0 otherwise. */
+static inline int node_is_black_or_0(struct interval_node *node)
+{
+	if (node == NULL)
+		return 1;
+	return node_is_black(node);
+}
+
+/*
+ * Restore the red-black properties after a black node has been unlinked.
+ *
+ * @node is the child that took the removed node's place (may be NULL) and
+ * @parent is its parent. The loop runs while @node is black (or NULL) and
+ * is not the root; on exit @node, if not NULL, is coloured black.
+ */
+static void interval_erase_color(struct interval_node *node,
+				 struct interval_node *parent,
+				 struct interval_node **root)
+{
+	struct interval_node *tmp;
+	ENTRY;
+
+	while (node_is_black_or_0(node) && node != *root) {
+		if (parent->in_left == node) {
+			/* tmp is the sibling of node. */
+			tmp = parent->in_right;
+			/* Red sibling: rotate so that the sibling is black. */
+			if (node_is_red(tmp)) {
+				tmp->in_color = INTERVAL_BLACK;
+				parent->in_color = INTERVAL_RED;
+				__rotate_left(parent, root);
+				tmp = parent->in_right;
+			}
+			/* Both of the sibling's children are black: recolour
+			 * the sibling and move the problem up one level. */
+			if (node_is_black_or_0(tmp->in_left) &&
+			    node_is_black_or_0(tmp->in_right)) {
+				tmp->in_color = INTERVAL_RED;
+				node = parent;
+				parent = node->in_parent;
+			} else {
+				/* Make sure the sibling's outer (right) child
+				 * is the red one before the final rotation. */
+				if (node_is_black_or_0(tmp->in_right)) {
+					struct interval_node *o_left;
+					if ((o_left = tmp->in_left))
+					     o_left->in_color = INTERVAL_BLACK;
+					tmp->in_color = INTERVAL_RED;
+					__rotate_right(tmp, root);
+					tmp = parent->in_right;
+				}
+				tmp->in_color = parent->in_color;
+				parent->in_color = INTERVAL_BLACK;
+				if (tmp->in_right)
+				    tmp->in_right->in_color = INTERVAL_BLACK;
+				__rotate_left(parent, root);
+				/* Done: point node at the root to end up
+				 * colouring the root black below. */
+				node = *root;
+				break;
+			}
+		} else {
+			/* Mirror image of the branch above. */
+			tmp = parent->in_left;
+			if (node_is_red(tmp)) {
+				tmp->in_color = INTERVAL_BLACK;
+				parent->in_color = INTERVAL_RED;
+				__rotate_right(parent, root);
+				tmp = parent->in_left;
+			}
+			if (node_is_black_or_0(tmp->in_left) &&
+			    node_is_black_or_0(tmp->in_right)) {
+				tmp->in_color = INTERVAL_RED;
+				node = parent;
+				parent = node->in_parent;
+			} else {
+				if (node_is_black_or_0(tmp->in_left)) {
+					struct interval_node *o_right;
+					if ((o_right = tmp->in_right))
+					    o_right->in_color = INTERVAL_BLACK;
+					tmp->in_color = INTERVAL_RED;
+					__rotate_left(tmp, root);
+					tmp = parent->in_left;
+				}
+				tmp->in_color = parent->in_color;
+				parent->in_color = INTERVAL_BLACK;
+				if (tmp->in_left)
+					tmp->in_left->in_color = INTERVAL_BLACK;
+				__rotate_right(parent, root);
+				node = *root;
+				break;
+			}
+		}
+	}
+	if (node)
+		node->in_color = INTERVAL_BLACK;
+	EXIT;
+}
+
+/*
+ * If the @max_high value of @node is changed, this function traverses a path
+ * from @node up to the root to update max_high for the whole tree.
+ *
+ * Each visited node gets the maximum of its own high endpoint and the
+ * max_high of its children; the walk stops at the first node whose new value
+ * is not below @old_maxhigh.
+ */
+static void update_maxhigh(struct interval_node *node,
+			   __u64  old_maxhigh)
+{
+	__u64 subtree_max;
+	ENTRY;
+
+	for (; node != NULL; node = node->in_parent) {
+		subtree_max = interval_high(node);
+		if (node->in_left)
+			subtree_max = max_u64(subtree_max,
+					      node->in_left->in_max_high);
+		if (node->in_right)
+			subtree_max = max_u64(subtree_max,
+					      node->in_right->in_max_high);
+		node->in_max_high = subtree_max;
+
+		if (node->in_max_high >= old_maxhigh)
+			break;
+	}
+	EXIT;
+}
+
+/*
+ * Remove @node from the tree at *@root.
+ *
+ * @node must currently be in a tree; its in_intree flag is cleared. A node
+ * with two children is replaced in the tree by its in-order successor.
+ * max_high values on the affected paths are refreshed, and the tree is
+ * rebalanced when the node unlinked from its position was black.
+ */
+void interval_erase(struct interval_node *node,
+		    struct interval_node **root)
+{
+	struct interval_node *child, *parent;
+	int color;
+	ENTRY;
+
+	LASSERT(interval_is_intree(node));
+	node->in_intree = 0;
+	if (!node->in_left) {
+		child = node->in_right;
+	} else if (!node->in_right) {
+		child = node->in_left;
+	} else { /* Both left and right child are not NULL */
+		struct interval_node *old = node;
+
+		/* From here on, node is the successor and old is the node
+		 * being erased. */
+		node = interval_next(node);
+		child = node->in_right;
+		parent = node->in_parent;
+		color = node->in_color;
+
+		/* Unlink the successor from its current position. */
+		if (child)
+			child->in_parent = parent;
+		if (parent == old)
+			parent->in_right = child;
+		else
+			parent->in_left = child;
+
+		/* Put the successor where old used to be. */
+		node->in_color = old->in_color;
+		node->in_right = old->in_right;
+		node->in_left = old->in_left;
+		node->in_parent = old->in_parent;
+
+		if (old->in_parent) {
+			if (node_is_left_child(old))
+				old->in_parent->in_left = node;
+			else
+				old->in_parent->in_right = node;
+		} else {
+			*root = node;
+		}
+
+		old->in_left->in_parent = node;
+		if (old->in_right)
+			old->in_right->in_parent = node;
+		update_maxhigh(child ? : parent, node->in_max_high);
+		update_maxhigh(node, old->in_max_high);
+		/* old is gone: the successor is now the parent of child. */
+		if (parent == old)
+			 parent = node;
+		goto color;
+	}
+	/* At most one child: splice node out directly. */
+	parent = node->in_parent;
+	color = node->in_color;
+
+	if (child)
+		child->in_parent = parent;
+	if (parent) {
+		if (node_is_left_child(node))
+			parent->in_left = child;
+		else
+			parent->in_right = child;
+	} else {
+		*root = child;
+	}
+
+	update_maxhigh(child ? : parent, node->in_max_high);
+
+color:
+	if (color == INTERVAL_BLACK)
+		interval_erase_color(child, parent, root);
+	EXIT;
+}
+EXPORT_SYMBOL(interval_erase);
+
+/*
+ * Cheap pre-check: returns 1 when @ext could overlap @node or something
+ * below it, judged by @node's low end and in_max_high; returns 0 otherwise.
+ */
+static inline int interval_may_overlap(struct interval_node *node,
+					  struct interval_node_extent *ext)
+{
+	if (ext->start > node->in_max_high)
+		return 0;
+
+	return ext->end >= interval_low(node);
+}
+
+/*
+ * Find every interval in the tree rooted at @node that overlaps @ext and
+ * call @func(found_node, @data) on each of them, one by one.
+ * In Lustre, this function will find all conflicting locks in
+ * the granted queue and add these locks to the AST work list.
+ *
+ * The traversal is iterative; it is equivalent to this recursive form:
+ *
+ * {
+ *       if (node == NULL)
+ *	       return 0;
+ *       if (ext->end < interval_low(node)) {
+ *	       interval_search(node->in_left, ext, func, data);
+ *       } else if (interval_may_overlap(node, ext)) {
+ *	       if (extent_overlapped(ext, &node->in_extent))
+ *		       func(node, data);
+ *	       interval_search(node->in_left, ext, func, data);
+ *	       interval_search(node->in_right, ext, func, data);
+ *       }
+ *       return 0;
+ * }
+ *
+ * The search stops as soon as @func returns INTERVAL_ITER_STOP.
+ *
+ * Returns the value of the last @func call, or INTERVAL_ITER_CONT if @func
+ * was never called.  @ext and @func must not be NULL (asserted).
+ */
+enum interval_iter interval_search(struct interval_node *node,
+				   struct interval_node_extent *ext,
+				   interval_callback_t func,
+				   void *data)
+{
+	struct interval_node *parent;
+	enum interval_iter rc = INTERVAL_ITER_CONT;
+
+	LASSERT(ext != NULL);
+	LASSERT(func != NULL);
+
+	while (node) {
+		if (ext->end < interval_low(node)) {
+			/* only the left subtree can still overlap */
+			if (node->in_left) {
+				node = node->in_left;
+				continue;
+			}
+		} else if (interval_may_overlap(node, ext)) {
+			if (extent_overlapped(ext, &node->in_extent)) {
+				rc = func(node, data);
+				if (rc == INTERVAL_ITER_STOP)
+					break;
+			}
+
+			/* descend: left first, otherwise right */
+			if (node->in_left) {
+				node = node->in_left;
+				continue;
+			}
+			if (node->in_right) {
+				node = node->in_right;
+				continue;
+			}
+		}
+
+		/* nothing further below this node: climb until we leave a
+		 * left subtree whose parent also has a right subtree */
+		parent = node->in_parent;
+		while (parent) {
+			if (node_is_left_child(node) &&
+			    parent->in_right) {
+				/* If we ever got the left, it means that the
+				 * parent met ext->end<interval_low(parent), or
+				 * may_overlap(parent). If the former is true,
+				 * we needn't go back. So stop early and check
+				 * may_overlap(parent) after this loop.  */
+				node = parent->in_right;
+				break;
+			}
+			node = parent;
+			parent = parent->in_parent;
+		}
+		/* parent == NULL: climbed past the root, traversal is done */
+		if (parent == NULL || !interval_may_overlap(parent, ext))
+			break;
+	}
+
+	return rc;
+}
+EXPORT_SYMBOL(interval_search);
+
+/*
+ * interval_search() callback used by interval_is_overlapped(): stores 1 in
+ * the int that @args points to and stops the search at the first match.
+ */
+static enum interval_iter interval_overlap_cb(struct interval_node *n,
+					      void *args)
+{
+	int *found = args;
+
+	*found = 1;
+	return INTERVAL_ITER_STOP;
+}
+
+/*
+ * Returns 1 if at least one interval in the tree rooted at @root overlaps
+ * @ext, 0 otherwise.
+ */
+int interval_is_overlapped(struct interval_node *root,
+			   struct interval_node_extent *ext)
+{
+	int overlapped = 0;
+
+	/* the callback records the hit; the iterator result is not needed */
+	(void)interval_search(root, ext, interval_overlap_cb, &overlapped);
+
+	return overlapped;
+}
+EXPORT_SYMBOL(interval_is_overlapped);
+
+/* Don't expand the low end. Expanding downwards is expensive, and
+ * meaningless for some extents, because programs seldom do I/O backwards.
+ *
+ * The recursive algorithm for expanding low would be:
+ * expand_low {
+ *	struct interval_node *tmp;
+ *	static __u64 res = 0;
+ *
+ *	if (root == NULL)
+ *		return res;
+ *	if (root->in_max_high < low) {
+ *		res = max_u64(root->in_max_high + 1, res);
+ *		return res;
+ *	} else if (low < interval_low(root)) {
+ *		interval_expand_low(root->in_left, low);
+ *		return res;
+ *	}
+ *
+ *	if (interval_high(root) < low)
+ *		res = max_u64(interval_high(root) + 1, res);
+ *	interval_expand_low(root->in_left, low);
+ *	interval_expand_low(root->in_right, low);
+ *
+ *	return res;
+ * }
+ *
+ * It is easy to eliminate the recursion; see interval_search for
+ * an example. -jay
+ *
+ * Returns 0 for an empty tree, otherwise @low unchanged.
+ */
+static inline __u64 interval_expand_low(struct interval_node *root, __u64 low)
+{
+	/* only the empty tree is handled right now */
+	return root == NULL ? 0 : low;
+}
+
+/*
+ * Compute how far the high end @high may be raised.
+ *
+ * Walks down from @node while the current subtree's in_max_high is not
+ * below @high.  Each node whose low end lies above @high lowers the result
+ * to that low end minus one and sends the walk left; any other node sends
+ * it right.  Returns all ones (the maximum __u64) if no such node is met.
+ */
+static inline __u64 interval_expand_high(struct interval_node *node, __u64 high)
+{
+	__u64 limit = ~0ULL;
+
+	while (node != NULL && node->in_max_high >= high) {
+		if (interval_low(node) <= high) {
+			node = node->in_right;
+			continue;
+		}
+
+		limit = interval_low(node) - 1;
+		node = node->in_left;
+	}
+
+	return limit;
+}
+
+/*
+ * Expand @ext in place against the tree rooted at @root.
+ *
+ * The start is expanded only when there is no @limiter or @limiter starts
+ * below @ext; the end only when there is no @limiter or @limiter ends above
+ * @ext.  @ext must not overlap anything in the tree, before or after.
+ */
+void interval_expand(struct interval_node *root,
+		     struct interval_node_extent *ext,
+		     struct interval_node_extent *limiter)
+{
+	/* Note: asserting on interval_is_overlapped() is expensive, since
+	 * it may have to visit many nodes to find an overlapping one. */
+	LASSERT(interval_is_overlapped(root, ext) == 0);
+
+	if (limiter == NULL || ext->start > limiter->start)
+		ext->start = interval_expand_low(root, ext->start);
+
+	if (limiter == NULL || ext->end < limiter->end)
+		ext->end = interval_expand_high(root, ext->end);
+
+	LASSERT(interval_is_overlapped(root, ext) == 0);
+}
+EXPORT_SYMBOL(interval_expand);

diff --git a/drivers/staging/lustre/lustre/ldlm/l_lock.c b/drivers/staging/lustre/lustre/ldlm/l_lock.c
new file mode 100644
index 0000000..853409a
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ldlm/l_lock.c

@@ -0,0 +1,76 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LDLM
+#include <linux/libcfs/libcfs.h>
+
+#include <lustre_dlm.h>
+#include <lustre_lib.h>
+
+/**
+ * Lock a lock and its resource.
+ *
+ * LDLM locking uses resource to serialize access to locks
+ * but there is a case when we change resource of lock upon
+ * enqueue reply. We rely on lock->l_resource = new_res
+ * being an atomic operation.
+ *
+ * Unless \a lock has l_ns_srv set, lock->l_lock is taken first; then the
+ * resource is locked and l_res_locked is set to 1.
+ *
+ * \retval lock->l_resource, now locked. Release with unlock_res_and_lock().
+ */
+struct ldlm_resource *lock_res_and_lock(struct ldlm_lock *lock)
+{
+	/* on server-side resource of lock doesn't change */
+	if (!lock->l_ns_srv)
+		spin_lock(&lock->l_lock);
+
+	lock_res(lock->l_resource);
+
+	lock->l_res_locked = 1;
+	return lock->l_resource;
+}
+EXPORT_SYMBOL(lock_res_and_lock);
+
+/**
+ * Unlock a lock and its resource previously locked with lock_res_and_lock
+ *
+ * Clears l_res_locked, unlocks the resource and then, unless \a lock has
+ * l_ns_srv set, releases lock->l_lock (the reverse of the order in which
+ * lock_res_and_lock() took them).
+ */
+void unlock_res_and_lock(struct ldlm_lock *lock)
+{
+	lock->l_res_locked = 0;
+
+	unlock_res(lock->l_resource);
+	/* on server-side resource of lock doesn't change */
+	if (!lock->l_ns_srv)
+		spin_unlock(&lock->l_lock);
+}
+EXPORT_SYMBOL(unlock_res_and_lock);

diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_extent.c b/drivers/staging/lustre/lustre/ldlm/ldlm_extent.c
new file mode 100644
index 0000000..f7432f7
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ldlm/ldlm_extent.c

@@ -0,0 +1,242 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2010, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ldlm/ldlm_extent.c
+ *
+ * Author: Peter Braam <braam@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
+ */
+
+/**
+ * This file contains the implementation of the EXTENT lock type.
+ *
+ * EXTENT lock type is for locking a contiguous range of values, represented
+ * by 64-bit starting and ending offsets (inclusive). There are several extent
+ * lock modes, some of which may be mutually incompatible. Extent locks are
+ * considered incompatible if their modes are incompatible and their extents
+ * intersect.  See the lock mode compatibility matrix in lustre_dlm.h.
+ */
+
+#define DEBUG_SUBSYSTEM S_LDLM
+# include <linux/libcfs/libcfs.h>
+
+#include <lustre_dlm.h>
+#include <obd_support.h>
+#include <obd.h>
+#include <obd_class.h>
+#include <lustre_lib.h>
+
+#include "ldlm_internal.h"
+
+
+/* When a lock is cancelled by a client, the KMS may change if this was
+ * the "highest lock".  This function returns the new KMS value.
+ * Caller must hold lr_lock already.
+ *
+ * \a lock is flagged LDLM_FL_KMS_IGNORE; every granted lock carrying that
+ * flag is skipped.  If any remaining granted extent ends at or beyond
+ * \a old_kms, \a old_kms is returned unchanged; otherwise the result is the
+ * largest (end + 1) among them, or 0 if there are none.
+ *
+ * NB: A lock on [x,y] protects a KMS of up to y + 1 bytes! */
+__u64 ldlm_extent_shift_kms(struct ldlm_lock *lock, __u64 old_kms)
+{
+	struct ldlm_resource *res = lock->l_resource;
+	struct ldlm_lock *lck;
+	__u64 kms = 0;
+	ENTRY;
+
+	/* Flag our lock first, so that another thread running this
+	 * function right after us does not count it in its own KMS. */
+	lock->l_flags |= LDLM_FL_KMS_IGNORE;
+
+	list_for_each_entry(lck, &res->lr_granted, l_res_link) {
+		if (lck->l_flags & LDLM_FL_KMS_IGNORE)
+			continue;
+
+		if (lck->l_policy_data.l_extent.end >= old_kms)
+			RETURN(old_kms);
+
+		/* The extent ends below old_kms (checked just above), so
+		 * kms can never grow past old_kms here. */
+		if (kms < lck->l_policy_data.l_extent.end + 1)
+			kms = lck->l_policy_data.l_extent.end + 1;
+	}
+	LASSERTF(kms <= old_kms, "kms "LPU64" old_kms "LPU64"\n", kms, old_kms);
+
+	RETURN(kms);
+}
+EXPORT_SYMBOL(ldlm_extent_shift_kms);
+
+/* slab cache that struct ldlm_interval objects are allocated from */
+struct kmem_cache *ldlm_interval_slab;
+
+/*
+ * Allocate an ldlm_interval from ldlm_interval_slab and attach \a lock to
+ * it.  \a lock must belong to an LDLM_EXTENT resource (asserted).
+ * Returns the new interval, or NULL if the allocation failed.
+ */
+struct ldlm_interval *ldlm_interval_alloc(struct ldlm_lock *lock)
+{
+	struct ldlm_interval *node;
+	ENTRY;
+
+	LASSERT(lock->l_resource->lr_type == LDLM_EXTENT);
+	OBD_SLAB_ALLOC_PTR_GFP(node, ldlm_interval_slab, __GFP_IO);
+	if (node != NULL) {
+		INIT_LIST_HEAD(&node->li_group);
+		ldlm_interval_attach(node, lock);
+	}
+
+	RETURN(node);
+}
+
+/*
+ * Return \a node to ldlm_interval_slab.  NULL is accepted and ignored.
+ * The interval must have no locks attached and must not be in a tree
+ * (both asserted).
+ */
+void ldlm_interval_free(struct ldlm_interval *node)
+{
+	if (node == NULL)
+		return;
+
+	LASSERT(list_empty(&node->li_group));
+	LASSERT(!interval_is_intree(&node->li_node));
+	OBD_SLAB_FREE(node, ldlm_interval_slab, sizeof(*node));
+}
+
+/* interval tree, for LDLM_EXTENT.
+ *
+ * Attach lock \a l to interval \a n: the lock is appended to the interval's
+ * li_group list and its l_tree_node is pointed at \a n.  The lock must not
+ * already be attached and must belong to an LDLM_EXTENT resource. */
+void ldlm_interval_attach(struct ldlm_interval *n,
+			  struct ldlm_lock *l)
+{
+	LASSERT(l->l_tree_node == NULL);
+	LASSERT(l->l_resource->lr_type == LDLM_EXTENT);
+
+	l->l_tree_node = n;
+	list_add_tail(&l->l_sl_policy, &n->li_group);
+}
+
+/*
+ * Detach lock \a l from the interval it is attached to, if any.
+ *
+ * Returns the interval when \a l was the last lock on it (li_group is now
+ * empty), so that the caller can dispose of it.  Returns NULL when the
+ * lock was not attached or other locks remain on the interval.
+ */
+struct ldlm_interval *ldlm_interval_detach(struct ldlm_lock *l)
+{
+	struct ldlm_interval *n = l->l_tree_node;
+
+	if (n == NULL)
+		return NULL;
+
+	LASSERT(!list_empty(&n->li_group));
+	list_del_init(&l->l_sl_policy);
+	l->l_tree_node = NULL;
+
+	if (!list_empty(&n->li_group))
+		return NULL;
+
+	return n;
+}
+
+/*
+ * Map a single-bit lock mode to its bit position (1 -> 0, 2 -> 1, ...).
+ * \a mode must be a non-zero power of two and the resulting index must be
+ * below LCK_MODE_NUM (all asserted).
+ */
+static inline int lock_mode_to_index(ldlm_mode_t mode)
+{
+	int index = -1;
+
+	LASSERT(mode != 0);
+	LASSERT(IS_PO2(mode));
+
+	/* one step per bit shifted out, until the set bit is gone */
+	while (mode) {
+		index++;
+		mode >>= 1;
+	}
+
+	LASSERT(index < LCK_MODE_NUM);
+	return index;
+}
+
+/**
+ * Add newly granted lock into interval tree for the resource.
+ *
+ * The tree is selected by the lock's granted mode.  If an interval node is
+ * already returned by interval_insert() for this extent, the lock's own
+ * node is freed and the lock joins that existing node instead.  The lock is
+ * also added to the resource's granted list.
+ */
+void ldlm_extent_add_lock(struct ldlm_resource *res,
+			  struct ldlm_lock *lock)
+{
+	struct ldlm_interval_tree *itree;
+	struct interval_node *existing;
+	struct ldlm_interval *node;
+	int idx;
+
+	LASSERT(lock->l_granted_mode == lock->l_req_mode);
+
+	node = lock->l_tree_node;
+	LASSERT(node != NULL);
+	LASSERT(!interval_is_intree(&node->li_node));
+
+	idx = lock_mode_to_index(lock->l_granted_mode);
+	LASSERT(lock->l_granted_mode == 1 << idx);
+	LASSERT(lock->l_granted_mode == res->lr_itree[idx].lit_mode);
+	itree = &res->lr_itree[idx];
+
+	/* initialise the node from the lock's extent, then insert it */
+	interval_set(&node->li_node, lock->l_policy_data.l_extent.start,
+		     lock->l_policy_data.l_extent.end);
+	existing = interval_insert(&node->li_node, &itree->lit_root);
+	if (existing != NULL) {
+		/* A policy group exists already: move the lock over to it
+		 * and drop the now unused node. */
+		struct ldlm_interval *tmp = ldlm_interval_detach(lock);
+
+		LASSERT(tmp != NULL);
+		ldlm_interval_free(tmp);
+		ldlm_interval_attach(to_ldlm_interval(existing), lock);
+	}
+	itree->lit_size++;
+
+	/* The interval tree manages the extent locks, but the lock is also
+	 * put on the granted list, for debugging purposes. */
+	ldlm_resource_add_lock(res, &res->lr_granted, lock);
+}
+
+/**
+ * Remove cancelled lock from resource interval tree.
+ *
+ * Does nothing if the lock has no interval node or the node is not in a
+ * tree.  The interval node itself is erased and freed only when this was
+ * the last lock attached to it.
+ */
+void ldlm_extent_unlink_lock(struct ldlm_lock *lock)
+{
+	struct ldlm_resource *res = lock->l_resource;
+	struct ldlm_interval *node = lock->l_tree_node;
+	struct ldlm_interval_tree *tree;
+	int idx;
+
+	/* duplicate unlink */
+	if (node == NULL)
+		return;
+	if (!interval_is_intree(&node->li_node))
+		return;
+
+	idx = lock_mode_to_index(lock->l_granted_mode);
+	LASSERT(lock->l_granted_mode == 1 << idx);
+	tree = &res->lr_itree[idx];
+
+	LASSERT(tree->lit_root != NULL); /* assure the tree is not null */
+
+	tree->lit_size--;
+
+	/* non-NULL only if no other lock is left on the interval */
+	node = ldlm_interval_detach(lock);
+	if (node == NULL)
+		return;
+
+	interval_erase(&node->li_node, &tree->lit_root);
+	ldlm_interval_free(node);
+}
+
+/*
+ * Fill the local policy \a lpolicy from the wire policy \a wpolicy:
+ * \a lpolicy is zeroed, then the extent start, end and gid are copied.
+ */
+void ldlm_extent_policy_wire_to_local(const ldlm_wire_policy_data_t *wpolicy,
+				     ldlm_policy_data_t *lpolicy)
+{
+	/* clear everything first, so members not set below read as zero */
+	memset(lpolicy, 0, sizeof(*lpolicy));
+
+	lpolicy->l_extent.gid = wpolicy->l_extent.gid;
+	lpolicy->l_extent.end = wpolicy->l_extent.end;
+	lpolicy->l_extent.start = wpolicy->l_extent.start;
+}
+
+/*
+ * Fill the wire policy \a wpolicy from the local policy \a lpolicy:
+ * \a wpolicy is zeroed, then the extent start, end and gid are copied.
+ */
+void ldlm_extent_policy_local_to_wire(const ldlm_policy_data_t *lpolicy,
+				     ldlm_wire_policy_data_t *wpolicy)
+{
+	/* clear everything first, so members not set below read as zero */
+	memset(wpolicy, 0, sizeof(*wpolicy));
+
+	wpolicy->l_extent.gid = lpolicy->l_extent.gid;
+	wpolicy->l_extent.end = lpolicy->l_extent.end;
+	wpolicy->l_extent.start = lpolicy->l_extent.start;
+}

diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_flock.c b/drivers/staging/lustre/lustre/ldlm/ldlm_flock.c
new file mode 100644
index 0000000..f100a84
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ldlm/ldlm_flock.c

@@ -0,0 +1,849 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003 Hewlett-Packard Development Company LP.
+ * Developed under the sponsorship of the US Government under
+ * Subcontract No. B514193
+ *
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2010, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+/**
+ * This file implements POSIX lock type for Lustre.
+ * Its policy properties are start and end of extent and PID.
+ *
+ * These locks are only done through MDS due to POSIX semantics requiring
+ * e.g. that locks could be only partially released and as such split into
+ * two parts, and also that two adjacent locks from the same process may be
+ * merged into a single wider lock.
+ *
+ * Lock modes are mapped like this:
+ * PR and PW for READ and WRITE locks
+ * NL to request a releasing of a portion of the lock
+ *
+ * These flock locks never time out.
+ */
+
+#define DEBUG_SUBSYSTEM S_LDLM
+
+#include <lustre_dlm.h>
+#include <obd_support.h>
+#include <obd_class.h>
+#include <lustre_lib.h>
+#include <linux/list.h>
+
+#include "ldlm_internal.h"
+
+/* Forward declaration: installed as l_blocking_ast by
+ * ldlm_process_flock_lock() when the namespace is not a client one. */
+int ldlm_flock_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
+			    void *data, int flag);
+
+/**
+ * list_for_remaining_safe - iterate over the remaining entries in a list
+ *	      and safeguard against removal of a list entry.
+ * \param pos   the &struct list_head to use as a loop counter. pos MUST
+ *	      have been initialized prior to using it in this macro.
+ * \param n     another &struct list_head to use as temporary storage
+ * \param head  the head for your list.
+ *
+ * Iteration starts at the entry \a pos currently points to and ends when
+ * \a pos reaches \a head.  \a n always holds the successor of \a pos, so
+ * the current entry may be unlinked inside the loop body.
+ *
+ * All arguments are parenthesized in the expansion so that non-trivial
+ * lvalue expressions (e.g. *pp) expand correctly.  \a pos and \a n are
+ * evaluated more than once and must be free of side effects.
+ */
+#define list_for_remaining_safe(pos, n, head)			\
+	for ((n) = (pos)->next; (pos) != (head);		\
+	     (pos) = (n), (n) = (pos)->next)
+
+/*
+ * Returns 1 when \a lock and \a new have the same flock owner and the same
+ * export, 0 otherwise.
+ */
+static inline int
+ldlm_same_flock_owner(struct ldlm_lock *lock, struct ldlm_lock *new)
+{
+	if (new->l_policy_data.l_flock.owner !=
+	    lock->l_policy_data.l_flock.owner)
+		return 0;
+
+	return new->l_export == lock->l_export;
+}
+
+/*
+ * Returns 1 when the [start, end] ranges of \a lock and \a new intersect
+ * (both bounds inclusive), 0 otherwise.
+ */
+static inline int
+ldlm_flocks_overlap(struct ldlm_lock *lock, struct ldlm_lock *new)
+{
+	/* new begins after lock ends: no intersection */
+	if (new->l_policy_data.l_flock.start >
+	    lock->l_policy_data.l_flock.end)
+		return 0;
+
+	return new->l_policy_data.l_flock.end >=
+	       lock->l_policy_data.l_flock.start;
+}
+
+/*
+ * Record that \a req is blocked by \a lock: the blocking owner and export
+ * are stored in \a req's flock policy data and \a req is added to its
+ * export's exp_flock_hash, keyed by \a req's own flock owner.  The hash is
+ * created via ldlm_init_flock_export() if the export does not have one yet.
+ *
+ * Returns 0 on success or when \a req has no export, otherwise the error
+ * from ldlm_init_flock_export().
+ */
+static inline int ldlm_flock_blocking_link(struct ldlm_lock *req,
+					   struct ldlm_lock *lock)
+{
+	struct obd_export *exp = req->l_export;
+
+	/* For server only */
+	if (exp == NULL)
+		return 0;
+
+	if (unlikely(exp->exp_flock_hash == NULL)) {
+		int rc = ldlm_init_flock_export(exp);
+
+		if (rc)
+			return rc;
+	}
+
+	LASSERT(hlist_unhashed(&req->l_exp_flock_hash));
+
+	req->l_policy_data.l_flock.blocking_owner =
+		lock->l_policy_data.l_flock.owner;
+	req->l_policy_data.l_flock.blocking_export = lock->l_export;
+	req->l_policy_data.l_flock.blocking_refs = 0;
+
+	cfs_hash_add(exp->exp_flock_hash,
+		     &req->l_policy_data.l_flock.owner,
+		     &req->l_exp_flock_hash);
+	return 0;
+}
+
+/*
+ * Remove \a req from its export's exp_flock_hash, if it is hashed there.
+ * Does nothing when \a req has no export.  The resource lock of \a req is
+ * checked via check_res_locked().
+ */
+static inline void ldlm_flock_blocking_unlink(struct ldlm_lock *req)
+{
+	struct obd_export *exp = req->l_export;
+
+	/* For server only */
+	if (exp == NULL)
+		return;
+
+	check_res_locked(req->l_resource);
+
+	if (exp->exp_flock_hash == NULL)
+		return;
+	if (hlist_unhashed(&req->l_exp_flock_hash))
+		return;
+
+	cfs_hash_del(exp->exp_flock_hash,
+		     &req->l_policy_data.l_flock.owner,
+		     &req->l_exp_flock_hash);
+}
+
+/*
+ * Take \a lock off its resource list and destroy it.
+ *
+ * When \a flags is exactly LDLM_FL_WAIT_NOREPROC and the lock is not
+ * marked LDLM_FL_FAILED, the lock is additionally flagged
+ * LDLM_FL_LOCAL_ONLY | LDLM_FL_CBPENDING and one reference of \a mode is
+ * dropped.  \a lock must not be on the flock hash (asserted).
+ */
+static inline void
+ldlm_flock_destroy(struct ldlm_lock *lock, ldlm_mode_t mode, __u64 flags)
+{
+	int client_side;
+	ENTRY;
+
+	LDLM_DEBUG(lock, "ldlm_flock_destroy(mode: %d, flags: 0x%llx)",
+		   mode, flags);
+
+	/* No locking needed for this check: it is expected to be empty */
+	LASSERT(hlist_unhashed(&lock->l_exp_flock_hash));
+
+	list_del_init(&lock->l_res_link);
+
+	client_side = (flags == LDLM_FL_WAIT_NOREPROC) &&
+		      !(lock->l_flags & LDLM_FL_FAILED);
+	if (client_side) {
+		/* client side - set a flag to prevent sending a CANCEL */
+		lock->l_flags |= LDLM_FL_LOCAL_ONLY | LDLM_FL_CBPENDING;
+
+		/* We are under lock_res_and_lock() here, so the nolock
+		 * flavour of ldlm_lock_decref_internal must be used. */
+		ldlm_lock_decref_internal_nolock(lock, mode);
+	}
+
+	ldlm_lock_destroy_nolock(lock);
+	EXIT;
+}
+
+/**
+ * POSIX locks deadlock detection code.
+ *
+ * Given a new lock \a req and an existing lock \a bl_lock it conflicts
+ * with, we need to iterate through all blocked POSIX locks for this
+ * export and see if there is a deadlock condition arising. (i.e. when
+ * one client holds a lock on something and want a lock on something
+ * else and at the same time another client has the opposite situation).
+ *
+ * Starting from the owner/export of \a bl_lock, the chain of
+ * "blocked by" links recorded by ldlm_flock_blocking_link() is followed
+ * through the exports' exp_flock_hash tables until it either ends or
+ * leads back to the owner/export of \a req.
+ *
+ * \retval 1 the chain leads back to \a req: granting would deadlock
+ * \retval 0 no cycle found, or \a req has no export
+ */
+static int
+ldlm_flock_deadlock(struct ldlm_lock *req, struct ldlm_lock *bl_lock)
+{
+	struct obd_export *req_exp = req->l_export;
+	struct obd_export *bl_exp = bl_lock->l_export;
+	__u64 req_owner = req->l_policy_data.l_flock.owner;
+	__u64 bl_owner = bl_lock->l_policy_data.l_flock.owner;
+
+	/* For server only */
+	if (req_exp == NULL)
+		return 0;
+
+	/* we always hold one reference on the export currently examined */
+	class_export_get(bl_exp);
+	while (1) {
+		struct obd_export *bl_exp_new;
+		struct ldlm_lock *lock = NULL;
+		struct ldlm_flock *flock;
+
+		/* is the current blocker itself waiting for somebody? */
+		if (bl_exp->exp_flock_hash != NULL)
+			lock = cfs_hash_lookup(bl_exp->exp_flock_hash,
+					       &bl_owner);
+		if (lock == NULL)
+			break;
+
+		flock = &lock->l_policy_data.l_flock;
+		LASSERT(flock->owner == bl_owner);
+		bl_owner = flock->blocking_owner;
+		bl_exp_new = class_export_get(flock->blocking_export);
+
+		/* Release the lookup reference while our own reference on
+		 * bl_exp is still held; bl_exp must not be touched after
+		 * class_export_put(). */
+		cfs_hash_put(bl_exp->exp_flock_hash, &lock->l_exp_flock_hash);
+		class_export_put(bl_exp);
+		bl_exp = bl_exp_new;
+
+		if (bl_owner == req_owner && bl_exp == req_exp) {
+			class_export_put(bl_exp);
+			return 1;
+		}
+	}
+	class_export_put(bl_exp);
+
+	return 0;
+}
+
+/**
+ * Process a granting attempt for flock lock.
+ * Must be called under ns lock held.
+ *
+ * This function looks for any conflicts for \a lock in the granted or
+ * waiting queues. The lock is granted if no conflicts are found in
+ * either queue.
+ *
+ * It is also responsible for splitting a lock if a portion of the lock
+ * is released.
+ *
+ * If \a first_enq is 0 (ie, called from ldlm_reprocess_queue):
+ *   - blocking ASTs have already been sent
+ *
+ * If \a first_enq is 1 (ie, called from ldlm_lock_enqueue):
+ *   - blocking ASTs have not been sent yet, so list of conflicting locks
+ *     would be collected and ASTs sent.
+ *
+ * \param req       [in,out]: the lock request being processed
+ * \param flags     [in,out]: LDLM_FL_* flags; LDLM_FL_LOCK_CHANGED or
+ *                            LDLM_FL_BLOCK_GRANTED may be added
+ * \param first_enq     [in]: see above
+ * \param err          [out]: ELDLM_OK, or -EAGAIN / -EDEADLK / -ENOLCK /
+ *                            the error from ldlm_flock_blocking_link()
+ * \param work_list     [in]: not referenced in this function body
+ *
+ * \retval LDLM_ITER_CONTINUE the request was granted, or a conflict was
+ *                            found while \a first_enq is 0
+ * \retval LDLM_ITER_STOP     the request was refused, answered as a test
+ *                            lock, or queued on lr_waiting
+ */
+int
+ldlm_process_flock_lock(struct ldlm_lock *req, __u64 *flags, int first_enq,
+			ldlm_error_t *err, struct list_head *work_list)
+{
+	struct ldlm_resource *res = req->l_resource;
+	struct ldlm_namespace *ns = ldlm_res_to_ns(res);
+	struct list_head *tmp;
+	struct list_head *ownlocks = NULL;
+	struct ldlm_lock *lock = NULL;
+	struct ldlm_lock *new = req;
+	struct ldlm_lock *new2 = NULL;
+	ldlm_mode_t mode = req->l_req_mode;
+	int local = ns_is_client(ns);
+	int added = (mode == LCK_NL);
+	int overlaps = 0;
+	int splitted = 0;
+	const struct ldlm_callback_suite null_cbs = { NULL };
+	int rc;
+	ENTRY;
+
+	CDEBUG(D_DLMTRACE, "flags %#llx owner "LPU64" pid %u mode %u start "
+	       LPU64" end "LPU64"\n", *flags,
+	       new->l_policy_data.l_flock.owner,
+	       new->l_policy_data.l_flock.pid, mode,
+	       req->l_policy_data.l_flock.start,
+	       req->l_policy_data.l_flock.end);
+
+	*err = ELDLM_OK;
+
+	if (local) {
+		/* No blocking ASTs are sent to the clients for
+		 * Posix file & record locks */
+		req->l_blocking_ast = NULL;
+	} else {
+		/* Called on the server for lock cancels. */
+		req->l_blocking_ast = ldlm_flock_blocking_ast;
+	}
+
+	/* NOTE(review): when we come back here via "goto reprocess",
+	 * ownlocks, new and added keep the values from the previous pass --
+	 * confirm that this is intended. */
+reprocess:
+	if ((*flags == LDLM_FL_WAIT_NOREPROC) || (mode == LCK_NL)) {
+		/* This loop determines where this processes locks start
+		 * in the resource lr_granted list. */
+		list_for_each(tmp, &res->lr_granted) {
+			lock = list_entry(tmp, struct ldlm_lock,
+					      l_res_link);
+			if (ldlm_same_flock_owner(lock, req)) {
+				ownlocks = tmp;
+				break;
+			}
+		}
+	} else {
+		lockmode_verify(mode);
+
+		/* This loop determines if there are existing locks
+		 * that conflict with the new lock request. */
+		list_for_each(tmp, &res->lr_granted) {
+			lock = list_entry(tmp, struct ldlm_lock,
+					      l_res_link);
+
+			if (ldlm_same_flock_owner(lock, req)) {
+				/* remember the first lock of this owner */
+				if (!ownlocks)
+					ownlocks = tmp;
+				continue;
+			}
+
+			/* locks are compatible, overlap doesn't matter */
+			if (lockmode_compat(lock->l_granted_mode, mode))
+				continue;
+
+			if (!ldlm_flocks_overlap(lock, req))
+				continue;
+
+			/* "lock" conflicts with the request from here on */
+			if (!first_enq)
+				RETURN(LDLM_ITER_CONTINUE);
+
+			if (*flags & LDLM_FL_BLOCK_NOWAIT) {
+				ldlm_flock_destroy(req, mode, *flags);
+				*err = -EAGAIN;
+				RETURN(LDLM_ITER_STOP);
+			}
+
+			if (*flags & LDLM_FL_TEST_LOCK) {
+				/* report the conflicting lock back through
+				 * the fields of req */
+				ldlm_flock_destroy(req, mode, *flags);
+				req->l_req_mode = lock->l_granted_mode;
+				req->l_policy_data.l_flock.pid =
+					lock->l_policy_data.l_flock.pid;
+				req->l_policy_data.l_flock.start =
+					lock->l_policy_data.l_flock.start;
+				req->l_policy_data.l_flock.end =
+					lock->l_policy_data.l_flock.end;
+				*flags |= LDLM_FL_LOCK_CHANGED;
+				RETURN(LDLM_ITER_STOP);
+			}
+
+			if (ldlm_flock_deadlock(req, lock)) {
+				ldlm_flock_destroy(req, mode, *flags);
+				*err = -EDEADLK;
+				RETURN(LDLM_ITER_STOP);
+			}
+
+			/* record who blocks us, then wait on lr_waiting */
+			rc = ldlm_flock_blocking_link(req, lock);
+			if (rc) {
+				ldlm_flock_destroy(req, mode, *flags);
+				*err = rc;
+				RETURN(LDLM_ITER_STOP);
+			}
+			ldlm_resource_add_lock(res, &res->lr_waiting, req);
+			*flags |= LDLM_FL_BLOCK_GRANTED;
+			RETURN(LDLM_ITER_STOP);
+		}
+	}
+
+	/* test lock without any conflict: answer with mode LCK_NL */
+	if (*flags & LDLM_FL_TEST_LOCK) {
+		ldlm_flock_destroy(req, mode, *flags);
+		req->l_req_mode = LCK_NL;
+		*flags |= LDLM_FL_LOCK_CHANGED;
+		RETURN(LDLM_ITER_STOP);
+	}
+
+	/* In case we had slept on this lock request take it off of the
+	 * deadlock detection hash list. */
+	ldlm_flock_blocking_unlink(req);
+
+	/* Scan the locks owned by this process that overlap this request.
+	 * We may have to merge or split existing locks. */
+
+	if (!ownlocks)
+		ownlocks = &res->lr_granted;
+
+	list_for_remaining_safe(ownlocks, tmp, &res->lr_granted) {
+		lock = list_entry(ownlocks, struct ldlm_lock, l_res_link);
+
+		if (!ldlm_same_flock_owner(lock, new))
+			break;
+
+		if (lock->l_granted_mode == mode) {
+			/* If the modes are the same then we need to process
+			 * locks that overlap OR adjoin the new lock. The extra
+			 * logic condition is necessary to deal with arithmetic
+			 * overflow and underflow. */
+			if ((new->l_policy_data.l_flock.start >
+			     (lock->l_policy_data.l_flock.end + 1))
+			    && (lock->l_policy_data.l_flock.end !=
+				OBD_OBJECT_EOF))
+				continue;
+
+			if ((new->l_policy_data.l_flock.end <
+			     (lock->l_policy_data.l_flock.start - 1))
+			    && (lock->l_policy_data.l_flock.start != 0))
+				break;
+
+			/* merge: both locks end up covering the union of
+			 * the two ranges */
+			if (new->l_policy_data.l_flock.start <
+			    lock->l_policy_data.l_flock.start) {
+				lock->l_policy_data.l_flock.start =
+					new->l_policy_data.l_flock.start;
+			} else {
+				new->l_policy_data.l_flock.start =
+					lock->l_policy_data.l_flock.start;
+			}
+
+			if (new->l_policy_data.l_flock.end >
+			    lock->l_policy_data.l_flock.end) {
+				lock->l_policy_data.l_flock.end =
+					new->l_policy_data.l_flock.end;
+			} else {
+				new->l_policy_data.l_flock.end =
+					lock->l_policy_data.l_flock.end;
+			}
+
+			if (added) {
+				ldlm_flock_destroy(lock, mode, *flags);
+			} else {
+				/* the existing lock now stands for the
+				 * merged range */
+				new = lock;
+				added = 1;
+			}
+			continue;
+		}
+
+		/* different mode: only a real overlap matters */
+		if (new->l_policy_data.l_flock.start >
+		    lock->l_policy_data.l_flock.end)
+			continue;
+
+		if (new->l_policy_data.l_flock.end <
+		    lock->l_policy_data.l_flock.start)
+			break;
+
+		++overlaps;
+
+		if (new->l_policy_data.l_flock.start <=
+		    lock->l_policy_data.l_flock.start) {
+			if (new->l_policy_data.l_flock.end <
+			    lock->l_policy_data.l_flock.end) {
+				/* new covers the head of lock: trim it */
+				lock->l_policy_data.l_flock.start =
+					new->l_policy_data.l_flock.end + 1;
+				break;
+			}
+			/* new covers lock completely */
+			ldlm_flock_destroy(lock, lock->l_req_mode, *flags);
+			continue;
+		}
+		if (new->l_policy_data.l_flock.end >=
+		    lock->l_policy_data.l_flock.end) {
+			/* new covers the tail of lock: trim it */
+			lock->l_policy_data.l_flock.end =
+				new->l_policy_data.l_flock.start - 1;
+			continue;
+		}
+
+		/* split the existing lock into two locks */
+
+		/* if this is an F_UNLCK operation then we could avoid
+		 * allocating a new lock and use the req lock passed in
+		 * with the request but this would complicate the reply
+		 * processing since updates to req get reflected in the
+		 * reply. The client side replays the lock request so
+		 * it must see the original lock data in the reply. */
+
+		/* XXX - if ldlm_lock_new() can sleep we should
+		 * release the lr_lock, allocate the new lock,
+		 * and restart processing this lock. */
+		if (!new2) {
+			/* the lock is dropped around the allocation, so
+			 * the list is rescanned from "reprocess" */
+			unlock_res_and_lock(req);
+			new2 = ldlm_lock_create(ns, &res->lr_name, LDLM_FLOCK,
+						lock->l_granted_mode, &null_cbs,
+						NULL, 0, LVB_T_NONE);
+			lock_res_and_lock(req);
+			if (!new2) {
+				ldlm_flock_destroy(req, lock->l_granted_mode,
+						   *flags);
+				*err = -ENOLCK;
+				RETURN(LDLM_ITER_STOP);
+			}
+			goto reprocess;
+		}
+
+		splitted = 1;
+
+		/* new2 takes the part of lock below new, lock keeps the
+		 * part above new */
+		new2->l_granted_mode = lock->l_granted_mode;
+		new2->l_policy_data.l_flock.pid =
+			new->l_policy_data.l_flock.pid;
+		new2->l_policy_data.l_flock.owner =
+			new->l_policy_data.l_flock.owner;
+		new2->l_policy_data.l_flock.start =
+			lock->l_policy_data.l_flock.start;
+		new2->l_policy_data.l_flock.end =
+			new->l_policy_data.l_flock.start - 1;
+		lock->l_policy_data.l_flock.start =
+			new->l_policy_data.l_flock.end + 1;
+		new2->l_conn_export = lock->l_conn_export;
+		if (lock->l_export != NULL) {
+			new2->l_export = class_export_lock_get(lock->l_export, new2);
+			if (new2->l_export->exp_lock_hash &&
+			    hlist_unhashed(&new2->l_exp_hash))
+				cfs_hash_add(new2->l_export->exp_lock_hash,
+					     &new2->l_remote_handle,
+					     &new2->l_exp_hash);
+		}
+		if (*flags == LDLM_FL_WAIT_NOREPROC)
+			ldlm_lock_addref_internal_nolock(new2,
+							 lock->l_granted_mode);
+
+		/* insert new2 at lock */
+		ldlm_resource_add_lock(res, ownlocks, new2);
+		LDLM_LOCK_RELEASE(new2);
+		break;
+	}
+
+	/* if new2 is created but never used, destroy it*/
+	if (splitted == 0 && new2 != NULL)
+		ldlm_lock_destroy_nolock(new2);
+
+	/* At this point we're granting the lock request. */
+	req->l_granted_mode = req->l_req_mode;
+
+	/* Add req to the granted queue before calling ldlm_reprocess_all(). */
+	if (!added) {
+		list_del_init(&req->l_res_link);
+		/* insert new lock before ownlocks in list. */
+		ldlm_resource_add_lock(res, ownlocks, req);
+	}
+
+	if (*flags != LDLM_FL_WAIT_NOREPROC) {
+		/* The only one possible case for client-side calls flock
+		 * policy function is ldlm_flock_completion_ast inside which
+		 * carries LDLM_FL_WAIT_NOREPROC flag. */
+		CERROR("Illegal parameter for client-side-only module.\n");
+		LBUG();
+	}
+
+	/* In case we're reprocessing the requested lock we can't destroy
+	 * it until after calling ldlm_add_ast_work_item() above so that laawi()
+	 * can bump the reference count on \a req. Otherwise \a req
+	 * could be freed before the completion AST can be sent.  */
+	/* NOTE(review): no ldlm_add_ast_work_item() call is present in this
+	 * function; the comment above may be a leftover -- confirm. */
+	if (added)
+		ldlm_flock_destroy(req, mode, *flags);
+
+	ldlm_resource_dump(D_INFO, res);
+	RETURN(LDLM_ITER_CONTINUE);
+}
+
+/* Context passed to ldlm_flock_interrupted_wait() while a flock enqueue
+ * sleeps in ldlm_flock_completion_ast(). */
+struct ldlm_flock_wait_data {
+	struct ldlm_lock *fwd_lock;	/* the lock being waited on */
+	int	       fwd_generation;	/* imp_generation sampled before sleeping */
+};
+
+/**
+ * Interrupt callback for the flock enqueue wait.
+ *
+ * \param data [in]: the struct ldlm_flock_wait_data of the sleeping enqueue
+ *
+ * Unlinks the waited-on lock from the deadlock detection hash list and marks
+ * it LDLM_FL_CBPENDING, all under the resource lock.
+ */
+static void
+ldlm_flock_interrupted_wait(void *data)
+{
+	struct ldlm_flock_wait_data *fwd = data;
+	struct ldlm_lock *lock;
+	ENTRY;
+
+	lock = fwd->fwd_lock;
+
+	lock_res_and_lock(lock);
+
+	/* take lock off the deadlock detection hash list. */
+	ldlm_flock_blocking_unlink(lock);
+
+	/* client side - set flag to prevent lock from being put on LRU list */
+	lock->l_flags |= LDLM_FL_CBPENDING;
+
+	unlock_res_and_lock(lock);
+
+	EXIT;
+}
+
+/**
+ * Flock completion callback function.
+ *
+ * Handles three situations: an import invalidation (lock flagged both
+ * LDLM_FL_FAILED and LDLM_FL_LOCAL_ONLY), a lock that is already granted
+ * (no BLOCK_* flag set), and a blocked lock, for which the caller sleeps on
+ * lock->l_waitq until is_granted_or_cancelled() becomes true.
+ *
+ * \param lock [in,out]: A lock to be handled
+ * \param flags    [in]: flags
+ * \param *data    [in]: ldlm_work_cp_ast_lock() will use ldlm_cb_set_arg;
+ *			 NULL when invoked for the enqueue reply itself
+ *
+ * \retval 0    : success
+ * \retval -EIO : lock carries LDLM_FL_FAILED after the wait/grant
+ * \retval <0   : failure reported by l_wait_event()
+ */
+int
+ldlm_flock_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data)
+{
+	struct file_lock		*getlk = lock->l_ast_data;
+	struct obd_device	      *obd;
+	struct obd_import	      *imp = NULL;
+	struct ldlm_flock_wait_data     fwd;
+	struct l_wait_info	      lwi;
+	ldlm_error_t		    err;
+	int			     rc = 0;
+	ENTRY;
+
+	CDEBUG(D_DLMTRACE, "flags: 0x%llx data: %p getlk: %p\n",
+	       flags, data, getlk);
+
+	/* Import invalidation. We need to actually release the lock
+	 * references being held, so that it can go away. No point in
+	 * holding the lock even if app still believes it has it, since
+	 * server already dropped it anyway. Only for granted locks too. */
+	if ((lock->l_flags & (LDLM_FL_FAILED|LDLM_FL_LOCAL_ONLY)) ==
+	    (LDLM_FL_FAILED|LDLM_FL_LOCAL_ONLY)) {
+		if (lock->l_req_mode == lock->l_granted_mode &&
+		    lock->l_granted_mode != LCK_NL &&
+		    NULL == data)
+			ldlm_lock_decref_internal(lock, lock->l_req_mode);
+
+		/* Need to wake up the waiter if we were evicted */
+		wake_up(&lock->l_waitq);
+		RETURN(0);
+	}
+
+	/* LDLM_FL_WAIT_NOREPROC is reserved for the internal reprocess call
+	 * made at the bottom of this function; it must never arrive here. */
+	LASSERT(flags != LDLM_FL_WAIT_NOREPROC);
+
+	if (!(flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED |
+		       LDLM_FL_BLOCK_CONV))) {
+		if (NULL == data)
+			/* mds granted the lock in the reply */
+			goto granted;
+		/* CP AST RPC: lock get granted, wake it up */
+		wake_up(&lock->l_waitq);
+		RETURN(0);
+	}
+
+	LDLM_DEBUG(lock, "client-side enqueue returned a blocked lock, "
+		   "sleeping");
+	fwd.fwd_lock = lock;
+	obd = class_exp2obd(lock->l_conn_export);
+
+	/* if this is a local lock, there is no import */
+	if (NULL != obd)
+		imp = obd->u.cli.cl_import;
+
+	/* NOTE(review): fwd.fwd_generation stays uninitialized when there is
+	 * no import; nothing in this function reads it back. */
+	if (NULL != imp) {
+		spin_lock(&imp->imp_lock);
+		fwd.fwd_generation = imp->imp_generation;
+		spin_unlock(&imp->imp_lock);
+	}
+
+	lwi = LWI_TIMEOUT_INTR(0, NULL, ldlm_flock_interrupted_wait, &fwd);
+
+	/* Go to sleep until the lock is granted. */
+	rc = l_wait_event(lock->l_waitq, is_granted_or_cancelled(lock), &lwi);
+
+	if (rc) {
+		LDLM_DEBUG(lock, "client-side enqueue waking up: failed (%d)",
+			   rc);
+		RETURN(rc);
+	}
+
+granted:
+	OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_CP_CB_WAIT, 10);
+
+	if (lock->l_destroyed) {
+		LDLM_DEBUG(lock, "client-side enqueue waking up: destroyed");
+		RETURN(0);
+	}
+
+	if (lock->l_flags & LDLM_FL_FAILED) {
+		LDLM_DEBUG(lock, "client-side enqueue waking up: failed");
+		RETURN(-EIO);
+	}
+
+	/* NOTE(review): rc is always 0 here - the goto path never assigns it
+	 * and the wait path has already returned on a non-zero rc - so this
+	 * branch looks unreachable. */
+	if (rc) {
+		LDLM_DEBUG(lock, "client-side enqueue waking up: failed (%d)",
+			   rc);
+		RETURN(rc);
+	}
+
+	LDLM_DEBUG(lock, "client-side enqueue granted");
+
+	lock_res_and_lock(lock);
+
+	/* take lock off the deadlock detection hash list. */
+	ldlm_flock_blocking_unlink(lock);
+
+	/* ldlm_lock_enqueue() has already placed lock on the granted list. */
+	list_del_init(&lock->l_res_link);
+
+	if (flags & LDLM_FL_TEST_LOCK) {
+		/* fcntl(F_GETLK) request */
+		/* The old mode was saved in getlk->fl_type so that if the mode
+		 * in the lock changes we can decref the appropriate refcount.*/
+		ldlm_flock_destroy(lock, flock_type(getlk),
+				   LDLM_FL_WAIT_NOREPROC);
+		/* Report the granted mode back through getlk as an fcntl
+		 * lock type; any mode other than PR/PW becomes F_UNLCK. */
+		switch (lock->l_granted_mode) {
+		case LCK_PR:
+			flock_set_type(getlk, F_RDLCK);
+			break;
+		case LCK_PW:
+			flock_set_type(getlk, F_WRLCK);
+			break;
+		default:
+			flock_set_type(getlk, F_UNLCK);
+		}
+		flock_set_pid(getlk, (pid_t)lock->l_policy_data.l_flock.pid);
+		flock_set_start(getlk,
+				(loff_t)lock->l_policy_data.l_flock.start);
+		flock_set_end(getlk,
+			      (loff_t)lock->l_policy_data.l_flock.end);
+	} else {
+		__u64 noreproc = LDLM_FL_WAIT_NOREPROC;
+
+		/* We need to reprocess the lock to do merges or splits
+		 * with existing locks owned by this process.
+		 * The return value and err are not examined here. */
+		ldlm_process_flock_lock(lock, &noreproc, 1, &err, NULL);
+	}
+	unlock_res_and_lock(lock);
+	RETURN(0);
+}
+EXPORT_SYMBOL(ldlm_flock_completion_ast);
+
+/**
+ * Flock blocking callback.
+ *
+ * Only the LDLM_CB_CANCELING notification is accepted (asserted); the lock
+ * is unlinked from the deadlock detection hash list under the resource lock.
+ * \a desc and \a data are not used.
+ *
+ * \retval 0 always
+ */
+int ldlm_flock_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
+			    void *data, int flag)
+{
+	ENTRY;
+
+	LASSERT(lock);
+	LASSERT(flag == LDLM_CB_CANCELING);
+
+	/* take lock off the deadlock detection hash list. */
+	lock_res_and_lock(lock);
+	ldlm_flock_blocking_unlink(lock);
+	unlock_res_and_lock(lock);
+	RETURN(0);
+}
+
+/**
+ * Convert an on-wire flock policy into its local form, compat variant.
+ *
+ * Compat code: old clients had no idea about the owner field and relied
+ * solely on pid for ownership (owner was introduced in LU-104, 2.1,
+ * April 2011), so the local owner is filled from the wire pid.
+ * \a lpolicy is zeroed before the fields are copied.
+ */
+void ldlm_flock_policy_wire18_to_local(const ldlm_wire_policy_data_t *wpolicy,
+				       ldlm_policy_data_t *lpolicy)
+{
+	struct ldlm_flock *dst = &lpolicy->l_flock;
+
+	memset(lpolicy, 0, sizeof(*lpolicy));
+
+	dst->owner = wpolicy->l_flock.lfw_pid;
+	dst->pid = wpolicy->l_flock.lfw_pid;
+	dst->start = wpolicy->l_flock.lfw_start;
+	dst->end = wpolicy->l_flock.lfw_end;
+}
+
+
+/**
+ * Convert an on-wire flock policy into its local form.
+ *
+ * \a lpolicy is zeroed, then start, end, pid and owner are copied from the
+ * corresponding lfw_* wire fields.
+ */
+void ldlm_flock_policy_wire21_to_local(const ldlm_wire_policy_data_t *wpolicy,
+				       ldlm_policy_data_t *lpolicy)
+{
+	struct ldlm_flock *dst = &lpolicy->l_flock;
+
+	memset(lpolicy, 0, sizeof(*lpolicy));
+
+	dst->owner = wpolicy->l_flock.lfw_owner;
+	dst->pid = wpolicy->l_flock.lfw_pid;
+	dst->start = wpolicy->l_flock.lfw_start;
+	dst->end = wpolicy->l_flock.lfw_end;
+}
+
+/**
+ * Convert a local flock policy into its on-wire form.
+ *
+ * \a wpolicy is zeroed, then the lfw_start, lfw_end, lfw_pid and lfw_owner
+ * wire fields are filled from the local flock data.
+ */
+void ldlm_flock_policy_local_to_wire(const ldlm_policy_data_t *lpolicy,
+				     ldlm_wire_policy_data_t *wpolicy)
+{
+	const struct ldlm_flock *src = &lpolicy->l_flock;
+
+	memset(wpolicy, 0, sizeof(*wpolicy));
+
+	wpolicy->l_flock.lfw_owner = src->owner;
+	wpolicy->l_flock.lfw_pid = src->pid;
+	wpolicy->l_flock.lfw_start = src->start;
+	wpolicy->l_flock.lfw_end = src->end;
+}
+
+/*
+ * Export handle<->flock hash operations.
+ */
+
+/* Hash callback: \a key points at a __u64, hashed with cfs_hash_u64_hash(). */
+static unsigned
+ldlm_export_flock_hash(cfs_hash_t *hs, const void *key, unsigned mask)
+{
+	const __u64 *owner = key;
+
+	return cfs_hash_u64_hash(*owner, mask);
+}
+
+/* Key callback: the key of a hashed lock is its flock owner field. */
+static void *
+ldlm_export_flock_key(struct hlist_node *hnode)
+{
+	struct ldlm_lock *lock = hlist_entry(hnode, struct ldlm_lock,
+					     l_exp_flock_hash);
+
+	return &lock->l_policy_data.l_flock.owner;
+}
+
+/* Key comparison callback: non-zero when \a key equals the node's owner. */
+static int
+ldlm_export_flock_keycmp(const void *key, struct hlist_node *hnode)
+{
+	const void *node_key = ldlm_export_flock_key(hnode);
+
+	return memcmp(node_key, key, sizeof(__u64)) == 0;
+}
+
+/* Object callback: map a hash node back to the ldlm_lock embedding it. */
+static void *
+ldlm_export_flock_object(struct hlist_node *hnode)
+{
+	struct ldlm_lock *lock;
+
+	lock = hlist_entry(hnode, struct ldlm_lock, l_exp_flock_hash);
+	return lock;
+}
+
+/*
+ * Get callback: take a reference on the lock and on its blocking export,
+ * and count the latter in blocking_refs.
+ */
+static void
+ldlm_export_flock_get(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+	struct ldlm_lock *lock = hlist_entry(hnode, struct ldlm_lock,
+					     l_exp_flock_hash);
+	struct ldlm_flock *flock = &lock->l_policy_data.l_flock;
+
+	LDLM_LOCK_GET(lock);
+
+	LASSERT(flock->blocking_export != NULL);
+	class_export_get(flock->blocking_export);
+	flock->blocking_refs++;
+}
+
+/*
+ * Put callback: undo ldlm_export_flock_get().
+ *
+ * Drops one blocking export reference, clears the blocking owner/export once
+ * blocking_refs reaches zero, and finally releases the lock reference.
+ */
+static void
+ldlm_export_flock_put(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+	struct ldlm_lock *lock;
+	struct ldlm_flock *flock;
+
+	lock = hlist_entry(hnode, struct ldlm_lock, l_exp_flock_hash);
+
+	flock = &lock->l_policy_data.l_flock;
+	LASSERT(flock->blocking_export != NULL);
+	class_export_put(flock->blocking_export);
+	if (--flock->blocking_refs == 0) {
+		flock->blocking_owner = 0;
+		flock->blocking_export = NULL;
+	}
+
+	/* Release the lock last: flock lives inside *lock, so dropping what
+	 * may be the final reference any earlier would leave the accesses
+	 * above touching freed memory. */
+	LDLM_LOCK_RELEASE(lock);
+}
+
+/* Callback table for the per-export flock hash created in
+ * ldlm_init_flock_export(); hs_put and hs_put_locked share one handler. */
+static cfs_hash_ops_t ldlm_export_flock_ops = {
+	.hs_hash	= ldlm_export_flock_hash,
+	.hs_key	 = ldlm_export_flock_key,
+	.hs_keycmp      = ldlm_export_flock_keycmp,
+	.hs_object      = ldlm_export_flock_object,
+	.hs_get	 = ldlm_export_flock_get,
+	.hs_put	 = ldlm_export_flock_put,
+	.hs_put_locked  = ldlm_export_flock_put,
+};
+
+/**
+ * Create the flock hash of export \a exp, named after the client UUID.
+ *
+ * \retval 0       : hash created and stored in exp->exp_flock_hash
+ * \retval -ENOMEM : cfs_hash_create() failed (exp_flock_hash is left NULL)
+ */
+int ldlm_init_flock_export(struct obd_export *exp)
+{
+	cfs_hash_t *hash;
+
+	hash = cfs_hash_create(obd_uuid2str(&exp->exp_client_uuid),
+			       HASH_EXP_LOCK_CUR_BITS,
+			       HASH_EXP_LOCK_MAX_BITS,
+			       HASH_EXP_LOCK_BKT_BITS, 0,
+			       CFS_HASH_MIN_THETA, CFS_HASH_MAX_THETA,
+			       &ldlm_export_flock_ops,
+			       CFS_HASH_DEFAULT | CFS_HASH_NBLK_CHANGE);
+	exp->exp_flock_hash = hash;
+	if (hash == NULL)
+		RETURN(-ENOMEM);
+
+	RETURN(0);
+}
+EXPORT_SYMBOL(ldlm_init_flock_export);
+
+/**
+ * Drop the export's reference on its flock hash, if it has one, and clear
+ * exp->exp_flock_hash. Calling it on an export without a hash is a no-op.
+ */
+void ldlm_destroy_flock_export(struct obd_export *exp)
+{
+	cfs_hash_t *hash = exp->exp_flock_hash;
+	ENTRY;
+
+	if (hash != NULL) {
+		cfs_hash_putref(hash);
+		exp->exp_flock_hash = NULL;
+	}
+
+	EXIT;
+}
+EXPORT_SYMBOL(ldlm_destroy_flock_export);

diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_inodebits.c b/drivers/staging/lustre/lustre/ldlm/ldlm_inodebits.c
new file mode 100644
index 0000000..574b2ff
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ldlm/ldlm_inodebits.c

@@ -0,0 +1,75 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ldlm/ldlm_inodebits.c
+ *
+ * Author: Peter Braam <braam@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
+ */
+
+/**
+ * This file contains implementation of IBITS lock type
+ *
+ * IBITS lock type contains a bit mask determining various properties of an
+ * object. The meanings of specific bits are specific to the caller and are
+ * opaque to LDLM code.
+ *
+ * Locks with intersecting bitmasks and conflicting lock modes (e.g.  LCK_PW)
+ * are considered conflicting.  See the lock mode compatibility matrix
+ * in lustre_dlm.h.
+ */
+
+#define DEBUG_SUBSYSTEM S_LDLM
+
+#include <lustre_dlm.h>
+#include <obd_support.h>
+#include <lustre_lib.h>
+
+#include "ldlm_internal.h"
+
+
+/**
+ * Convert an on-wire inodebits policy into its local form: \a lpolicy is
+ * zeroed and the bit mask is copied over.
+ */
+void ldlm_ibits_policy_wire_to_local(const ldlm_wire_policy_data_t *wpolicy,
+				     ldlm_policy_data_t *lpolicy)
+{
+	memset(lpolicy, 0, sizeof(ldlm_policy_data_t));
+
+	lpolicy->l_inodebits.bits = wpolicy->l_inodebits.bits;
+}
+
+/**
+ * Convert a local inodebits policy into its on-wire form: \a wpolicy is
+ * zeroed and the bit mask is copied over.
+ */
+void ldlm_ibits_policy_local_to_wire(const ldlm_policy_data_t *lpolicy,
+				     ldlm_wire_policy_data_t *wpolicy)
+{
+	memset(wpolicy, 0, sizeof(ldlm_wire_policy_data_t));
+
+	wpolicy->l_inodebits.bits = lpolicy->l_inodebits.bits;
+}

diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_internal.h b/drivers/staging/lustre/lustre/ldlm/ldlm_internal.h
new file mode 100644
index 0000000..141a957
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ldlm/ldlm_internal.h

@@ -0,0 +1,276 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define MAX_STRING_SIZE 128
+
+/* Namespace bookkeeping, kept separately for the server and the client
+ * side: a counter, a mutex and a list each. The ldlm_namespace_nr(),
+ * ldlm_namespace_list() and ldlm_namespace_lock() accessors pick a side. */
+extern atomic_t ldlm_srv_namespace_nr;
+extern atomic_t ldlm_cli_namespace_nr;
+extern struct mutex ldlm_srv_namespace_lock;
+extern struct list_head ldlm_srv_namespace_list;
+extern struct mutex ldlm_cli_namespace_lock;
+extern struct list_head ldlm_cli_namespace_list;
+
+/* Namespace counter for the given side; anything but
+ * LDLM_NAMESPACE_SERVER selects the client counter. */
+static inline atomic_t *ldlm_namespace_nr(ldlm_side_t client)
+{
+	if (client == LDLM_NAMESPACE_SERVER)
+		return &ldlm_srv_namespace_nr;
+
+	return &ldlm_cli_namespace_nr;
+}
+
+/* Namespace list head for the given side; anything but
+ * LDLM_NAMESPACE_SERVER selects the client list. */
+static inline struct list_head *ldlm_namespace_list(ldlm_side_t client)
+{
+	if (client == LDLM_NAMESPACE_SERVER)
+		return &ldlm_srv_namespace_list;
+
+	return &ldlm_cli_namespace_list;
+}
+
+/* Namespace mutex for the given side; anything but
+ * LDLM_NAMESPACE_SERVER selects the client mutex. */
+static inline struct mutex *ldlm_namespace_lock(ldlm_side_t client)
+{
+	if (client == LDLM_NAMESPACE_SERVER)
+		return &ldlm_srv_namespace_lock;
+
+	return &ldlm_cli_namespace_lock;
+}
+
+/* ldlm_request.c */
+/* LRU cancel flags. Each value is a distinct bit; presumably they are
+ * passed as the "flags" argument of ldlm_cancel_lru() and
+ * ldlm_cancel_lru_local() - confirm against ldlm_request.c. */
+enum {
+	LDLM_CANCEL_AGED   = 1 << 0, /* Cancel aged locks (non lru resize). */
+	LDLM_CANCEL_PASSED = 1 << 1, /* Cancel passed number of locks. */
+	LDLM_CANCEL_SHRINK = 1 << 2, /* Cancel locks from shrinker. */
+	LDLM_CANCEL_LRUR   = 1 << 3, /* Cancel locks from lru resize. */
+	LDLM_CANCEL_NO_WAIT = 1 << 4 /* Cancel locks w/o blocking (neither
+				      * sending nor waiting for any rpcs) */
+};
+
+int ldlm_cancel_lru(struct ldlm_namespace *ns, int nr,
+		    ldlm_cancel_flags_t sync, int flags);
+int ldlm_cancel_lru_local(struct ldlm_namespace *ns,
+			  struct list_head *cancels, int count, int max,
+			  ldlm_cancel_flags_t cancel_flags, int flags);
+extern int ldlm_enqueue_min;
+int ldlm_get_enq_timeout(struct ldlm_lock *lock);
+
+/* ldlm_resource.c */
+int ldlm_resource_putref_locked(struct ldlm_resource *res);
+void ldlm_resource_insert_lock_after(struct ldlm_lock *original,
+				     struct ldlm_lock *new);
+void ldlm_namespace_free_prior(struct ldlm_namespace *ns,
+			       struct obd_import *imp, int force);
+void ldlm_namespace_free_post(struct ldlm_namespace *ns);
+/* ldlm_lock.c */
+
+/* Argument block for AST callback processing (the flock completion AST
+ * receives one as its data pointer when run from ldlm_work_cp_ast_lock()).
+ * NOTE(review): the exact role of set/restart/list is not visible from this
+ * header - confirm in ldlm_lock.c. */
+struct ldlm_cb_set_arg {
+	struct ptlrpc_request_set	*set;
+	int				 type; /* LDLM_{CP,BL,GL}_CALLBACK */
+	atomic_t			 restart;
+	struct list_head			*list;
+	union ldlm_gl_desc		*gl_desc; /* glimpse AST descriptor */
+};
+
+/* Kind of AST work being run; this is the ast_type argument of
+ * ldlm_run_ast_work(). */
+typedef enum {
+	LDLM_WORK_BL_AST,
+	LDLM_WORK_CP_AST,
+	LDLM_WORK_REVOKE_AST,
+	LDLM_WORK_GL_AST
+} ldlm_desc_ast_t;
+
+void ldlm_grant_lock(struct ldlm_lock *lock, struct list_head *work_list);
+int ldlm_fill_lvb(struct ldlm_lock *lock, struct req_capsule *pill,
+		  enum req_location loc, void *data, int size);
+struct ldlm_lock *
+ldlm_lock_create(struct ldlm_namespace *ns, const struct ldlm_res_id *,
+		 ldlm_type_t type, ldlm_mode_t,
+		 const struct ldlm_callback_suite *cbs,
+		 void *data, __u32 lvb_len, enum lvb_type lvb_type);
+ldlm_error_t ldlm_lock_enqueue(struct ldlm_namespace *, struct ldlm_lock **,
+			       void *cookie, __u64 *flags);
+void ldlm_lock_addref_internal(struct ldlm_lock *, __u32 mode);
+void ldlm_lock_addref_internal_nolock(struct ldlm_lock *, __u32 mode);
+void ldlm_lock_decref_internal(struct ldlm_lock *, __u32 mode);
+void ldlm_lock_decref_internal_nolock(struct ldlm_lock *, __u32 mode);
+void ldlm_add_ast_work_item(struct ldlm_lock *lock, struct ldlm_lock *new,
+			    struct list_head *work_list);
+int ldlm_run_ast_work(struct ldlm_namespace *ns, struct list_head *rpc_list,
+		      ldlm_desc_ast_t ast_type);
+int ldlm_work_gl_ast_lock(struct ptlrpc_request_set *rqset, void *opaq);
+int ldlm_lock_remove_from_lru(struct ldlm_lock *lock);
+int ldlm_lock_remove_from_lru_nolock(struct ldlm_lock *lock);
+void ldlm_lock_add_to_lru_nolock(struct ldlm_lock *lock);
+void ldlm_lock_add_to_lru(struct ldlm_lock *lock);
+void ldlm_lock_touch_in_lru(struct ldlm_lock *lock);
+void ldlm_lock_destroy_nolock(struct ldlm_lock *lock);
+
+void ldlm_cancel_locks_for_export(struct obd_export *export);
+
+/* ldlm_lockd.c */
+int ldlm_bl_to_thread_lock(struct ldlm_namespace *ns, struct ldlm_lock_desc *ld,
+			   struct ldlm_lock *lock);
+int ldlm_bl_to_thread_list(struct ldlm_namespace *ns,
+			   struct ldlm_lock_desc *ld,
+			   struct list_head *cancels, int count,
+			   ldlm_cancel_flags_t cancel_flags);
+
+void ldlm_handle_bl_callback(struct ldlm_namespace *ns,
+			     struct ldlm_lock_desc *ld, struct ldlm_lock *lock);
+
+
+/* ldlm_extent.c */
+void ldlm_extent_add_lock(struct ldlm_resource *res, struct ldlm_lock *lock);
+void ldlm_extent_unlink_lock(struct ldlm_lock *lock);
+
+/* ldlm_flock.c */
+int ldlm_process_flock_lock(struct ldlm_lock *req, __u64 *flags,
+			    int first_enq, ldlm_error_t *err,
+			    struct list_head *work_list);
+int ldlm_init_flock_export(struct obd_export *exp);
+void ldlm_destroy_flock_export(struct obd_export *exp);
+
+/* l_lock.c */
+void l_check_ns_lock(struct ldlm_namespace *ns);
+void l_check_no_ns_lock(struct ldlm_namespace *ns);
+
+extern proc_dir_entry_t *ldlm_svc_proc_dir;
+extern proc_dir_entry_t *ldlm_type_proc_dir;
+
+/* Bundle of the ptlrpc services, client and server connection plus the
+ * blocking pool used by LDLM.
+ * NOTE(review): presumably a single global instance - confirm in
+ * ldlm_lockd.c. */
+struct ldlm_state {
+	struct ptlrpc_service *ldlm_cb_service;
+	struct ptlrpc_service *ldlm_cancel_service;
+	struct ptlrpc_client *ldlm_client;
+	struct ptlrpc_connection *ldlm_server_conn;
+	struct ldlm_bl_pool *ldlm_bl_pool;
+};
+
+/* interval tree, for LDLM_EXTENT. */
+extern struct kmem_cache *ldlm_interval_slab; /* slab cache for ldlm_interval */
+extern void ldlm_interval_attach(struct ldlm_interval *n, struct ldlm_lock *l);
+extern struct ldlm_interval *ldlm_interval_detach(struct ldlm_lock *l);
+extern struct ldlm_interval *ldlm_interval_alloc(struct ldlm_lock *lock);
+extern void ldlm_interval_free(struct ldlm_interval *node);
+/*
+ * Return the extent of the first lock linked on \a node's li_group list.
+ * The list must not be empty (asserted).
+ * This function must be called with res lock held.
+ */
+static inline struct ldlm_extent *
+ldlm_interval_extent(struct ldlm_interval *node)
+{
+	struct ldlm_lock *first;
+
+	LASSERT(!list_empty(&node->li_group));
+	first = list_entry(node->li_group.next, struct ldlm_lock, l_sl_policy);
+
+	return &first->l_policy_data.l_extent;
+}
+
+int ldlm_init(void);
+void ldlm_exit(void);
+
+/* Per-lock policy verdict: cancel, keep or skip the lock.
+ * NOTE(review): presumably returned by the LRU cancel policy callbacks -
+ * confirm against ldlm_request.c. */
+enum ldlm_policy_res {
+	LDLM_POLICY_CANCEL_LOCK,
+	LDLM_POLICY_KEEP_LOCK,
+	LDLM_POLICY_SKIP_LOCK
+};
+
+typedef enum ldlm_policy_res ldlm_policy_res_t;
+
+/*
+ * Generate static lprocfs_<var>_seq_show(): snapshot pl->pl_<var> of the
+ * ldlm_pool in m->private under pl_lock, then print it with
+ * lprocfs_rd_uint(). The trailing dummy struct swallows the semicolon that
+ * follows the macro invocation.
+ * NOTE(review): tmp has type "type" but is handed to lprocfs_rd_uint(), so
+ * presumably only unsigned-int-compatible types are valid - confirm.
+ */
+#define LDLM_POOL_PROC_READER_SEQ_SHOW(var, type)			    \
+	static int lprocfs_##var##_seq_show(struct seq_file *m, void *v) \
+	{								    \
+		struct ldlm_pool *pl = m->private;			    \
+		type tmp;						    \
+									    \
+		spin_lock(&pl->pl_lock);				    \
+		tmp = pl->pl_##var;					    \
+		spin_unlock(&pl->pl_lock);				    \
+									    \
+		return lprocfs_rd_uint(m, &tmp);			    \
+	}								    \
+	struct __##var##__dummy_read {;} /* semicolon catcher */
+
+/*
+ * Generate lprocfs_wr_<var>() (not static, unlike the reader): parse the
+ * user buffer with lprocfs_wr_uint() and, on success, store the value into
+ * pl->pl_<var> of the ldlm_pool passed as data, under pl_lock. Returns the
+ * parser's return code, which is negative on a parse error. The trailing
+ * dummy struct swallows the semicolon that follows the macro invocation.
+ */
+#define LDLM_POOL_PROC_WRITER(var, type)				    \
+	int lprocfs_wr_##var(struct file *file, const char *buffer,	    \
+			     unsigned long count, void *data)		    \
+	{								    \
+		struct ldlm_pool *pl = data;				    \
+		type tmp;						    \
+		int rc;							    \
+									    \
+		rc = lprocfs_wr_uint(file, buffer, count, &tmp);	    \
+		if (rc < 0) {						    \
+			CERROR("Can't parse user input, rc = %d\n", rc);    \
+			return rc;					    \
+		}							    \
+									    \
+		spin_lock(&pl->pl_lock);				    \
+		pl->pl_##var = tmp;					    \
+		spin_unlock(&pl->pl_lock);				    \
+									    \
+		return rc;						    \
+	}								    \
+	struct __##var##__dummy_write {;} /* semicolon catcher */
+
+/*
+ * Wait condition for enqueue sleepers. Returns 1 when the lock is granted
+ * (requested mode equals granted mode and no LDLM_FL_CP_REQD pending) or
+ * when it is flagged LDLM_FL_FAILED or LDLM_FL_CANCEL, 0 otherwise.
+ * The state is sampled under the resource lock.
+ */
+static inline int is_granted_or_cancelled(struct ldlm_lock *lock)
+{
+	int granted;
+	int dead;
+
+	lock_res_and_lock(lock);
+	granted = lock->l_req_mode == lock->l_granted_mode &&
+		  !(lock->l_flags & LDLM_FL_CP_REQD);
+	dead = (lock->l_flags & (LDLM_FL_FAILED | LDLM_FL_CANCEL)) != 0;
+	unlock_res_and_lock(lock);
+
+	return granted || dead;
+}
+
+/* Signature of a converter from the on-wire policy to the local policy. */
+typedef void (*ldlm_policy_wire_to_local_t)(const ldlm_wire_policy_data_t *,
+					    ldlm_policy_data_t *);
+
+/* Signature of a converter from the local policy to the on-wire policy. */
+typedef void (*ldlm_policy_local_to_wire_t)(const ldlm_policy_data_t *,
+					    ldlm_wire_policy_data_t *);
+
+/* Converters for the plain, inodebits, extent and flock lock types.
+ * Flock has two wire-to-local variants: wire18 fills the owner from the
+ * wire pid, wire21 from the wire owner field. */
+void ldlm_plain_policy_wire_to_local(const ldlm_wire_policy_data_t *wpolicy,
+				     ldlm_policy_data_t *lpolicy);
+void ldlm_plain_policy_local_to_wire(const ldlm_policy_data_t *lpolicy,
+				     ldlm_wire_policy_data_t *wpolicy);
+void ldlm_ibits_policy_wire_to_local(const ldlm_wire_policy_data_t *wpolicy,
+				     ldlm_policy_data_t *lpolicy);
+void ldlm_ibits_policy_local_to_wire(const ldlm_policy_data_t *lpolicy,
+				     ldlm_wire_policy_data_t *wpolicy);
+void ldlm_extent_policy_wire_to_local(const ldlm_wire_policy_data_t *wpolicy,
+				     ldlm_policy_data_t *lpolicy);
+void ldlm_extent_policy_local_to_wire(const ldlm_policy_data_t *lpolicy,
+				     ldlm_wire_policy_data_t *wpolicy);
+void ldlm_flock_policy_wire18_to_local(const ldlm_wire_policy_data_t *wpolicy,
+				     ldlm_policy_data_t *lpolicy);
+void ldlm_flock_policy_wire21_to_local(const ldlm_wire_policy_data_t *wpolicy,
+				     ldlm_policy_data_t *lpolicy);
+
+void ldlm_flock_policy_local_to_wire(const ldlm_policy_data_t *lpolicy,
+				     ldlm_wire_policy_data_t *wpolicy);

diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_lib.c b/drivers/staging/lustre/lustre/ldlm/ldlm_lib.c
new file mode 100644
index 0000000..42df530
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ldlm/ldlm_lib.c

@@ -0,0 +1,868 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2010, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+/**
+ * This file deals with various client/target related logic including recovery.
+ *
+ * TODO: This code more logically belongs in the ptlrpc module than in ldlm and
+ * should be moved.
+ */
+
+#define DEBUG_SUBSYSTEM S_LDLM
+
+# include <linux/libcfs/libcfs.h>
+#include <obd.h>
+#include <obd_class.h>
+#include <lustre_dlm.h>
+#include <lustre_net.h>
+#include <lustre_sec.h>
+#include "ldlm_internal.h"
+
+/*
+ * Find or add the connection \a uuid in \a imp's connection list.
+ *
+ * @priority: If non-zero, move the selected connection to the list head.
+ * @create: If zero, only search in existing connections.
+ *
+ * Returns 0 on success, -EINVAL when neither @create nor @priority is set,
+ * -ENOENT when no ptlrpc connection exists for \a uuid or (with @create
+ * zero) the uuid is not on the list, and -ENOMEM on allocation failure.
+ *
+ * The ptlrpc connection reference obtained here is kept only when a new
+ * list entry is added (stored in oic_conn); every other path puts it.
+ */
+static int import_set_conn(struct obd_import *imp, struct obd_uuid *uuid,
+			   int priority, int create)
+{
+	struct ptlrpc_connection *ptlrpc_conn;
+	struct obd_import_conn *imp_conn = NULL, *item;
+	int rc = 0;
+	ENTRY;
+
+	if (!create && !priority) {
+		CDEBUG(D_HA, "Nothing to do\n");
+		RETURN(-EINVAL);
+	}
+
+	ptlrpc_conn = ptlrpc_uuid_to_connection(uuid);
+	if (!ptlrpc_conn) {
+		CDEBUG(D_HA, "can't find connection %s\n", uuid->uuid);
+		RETURN (-ENOENT);
+	}
+
+	/* Allocate the new entry before taking imp_lock. */
+	if (create) {
+		OBD_ALLOC(imp_conn, sizeof(*imp_conn));
+		if (!imp_conn) {
+			GOTO(out_put, rc = -ENOMEM);
+		}
+	}
+
+	spin_lock(&imp->imp_lock);
+	list_for_each_entry(item, &imp->imp_conn_list, oic_item) {
+		if (obd_uuid_equals(uuid, &item->oic_uuid)) {
+			if (priority) {
+				list_del(&item->oic_item);
+				list_add(&item->oic_item,
+					     &imp->imp_conn_list);
+				item->oic_last_attempt = 0;
+			}
+			CDEBUG(D_HA, "imp %p@%s: found existing conn %s%s\n",
+			       imp, imp->imp_obd->obd_name, uuid->uuid,
+			       (priority ? ", moved to head" : ""));
+			spin_unlock(&imp->imp_lock);
+			/* Already listed: the preallocated entry (if any)
+			 * and the extra connection reference are dropped. */
+			GOTO(out_free, rc = 0);
+		}
+	}
+	/* No existing import connection found for \a uuid. */
+	if (create) {
+		imp_conn->oic_conn = ptlrpc_conn;
+		imp_conn->oic_uuid = *uuid;
+		imp_conn->oic_last_attempt = 0;
+		if (priority)
+			list_add(&imp_conn->oic_item, &imp->imp_conn_list);
+		else
+			list_add_tail(&imp_conn->oic_item,
+					  &imp->imp_conn_list);
+		CDEBUG(D_HA, "imp %p@%s: add connection %s at %s\n",
+		       imp, imp->imp_obd->obd_name, uuid->uuid,
+		       (priority ? "head" : "tail"));
+	} else {
+		spin_unlock(&imp->imp_lock);
+		GOTO(out_free, rc = -ENOENT);
+	}
+
+	spin_unlock(&imp->imp_lock);
+	RETURN(0);
+out_free:
+	if (imp_conn)
+		OBD_FREE(imp_conn, sizeof(*imp_conn));
+out_put:
+	ptlrpc_connection_put(ptlrpc_conn);
+	RETURN(rc);
+}
+
+/* Move the existing connection \a uuid of \a imp to the head of the
+ * connection list; never creates a new entry. */
+int import_set_conn_priority(struct obd_import *imp, struct obd_uuid *uuid)
+{
+	const int priority = 1;
+	const int create = 0;
+
+	return import_set_conn(imp, uuid, priority, create);
+}
+
+/* Add connection \a uuid to \a imp (or reuse the existing entry); a
+ * non-zero \a priority places it at the head of the connection list. */
+int client_import_add_conn(struct obd_import *imp, struct obd_uuid *uuid,
+			   int priority)
+{
+	const int create = 1;
+
+	return import_set_conn(imp, uuid, priority, create);
+}
+EXPORT_SYMBOL(client_import_add_conn);
+
+/**
+ * Remove the connection \a uuid from \a imp's connection list.
+ *
+ * If the entry is the import's current connection it can only be removed
+ * while the import is CLOSED or DISCON; in that case the import's and the
+ * DLM export's connection references are dropped as well.
+ *
+ * \retval 0       : connection removed
+ * \retval -ENOENT : list empty or \a uuid not found
+ * \retval -EBUSY  : \a uuid is the current connection of an active import
+ */
+int client_import_del_conn(struct obd_import *imp, struct obd_uuid *uuid)
+{
+	struct obd_import_conn *imp_conn;
+	struct obd_export *dlmexp = NULL;
+	int rc = -ENOENT;
+	ENTRY;
+
+	spin_lock(&imp->imp_lock);
+	if (list_empty(&imp->imp_conn_list)) {
+		LASSERT(!imp->imp_connection);
+		GOTO(out, rc);
+	}
+
+	list_for_each_entry(imp_conn, &imp->imp_conn_list, oic_item) {
+		if (!obd_uuid_equals(uuid, &imp_conn->oic_uuid))
+			continue;
+		LASSERT(imp_conn->oic_conn);
+
+		if (imp_conn == imp->imp_conn_current) {
+			LASSERT(imp_conn->oic_conn == imp->imp_connection);
+
+			if (imp->imp_state != LUSTRE_IMP_CLOSED &&
+			    imp->imp_state != LUSTRE_IMP_DISCON) {
+				CERROR("can't remove current connection\n");
+				GOTO(out, rc = -EBUSY);
+			}
+
+			ptlrpc_connection_put(imp->imp_connection);
+			imp->imp_connection = NULL;
+
+			dlmexp = class_conn2export(&imp->imp_dlm_handle);
+			if (dlmexp && dlmexp->exp_connection) {
+				LASSERT(dlmexp->exp_connection ==
+					imp_conn->oic_conn);
+				ptlrpc_connection_put(dlmexp->exp_connection);
+				dlmexp->exp_connection = NULL;
+			}
+		}
+
+		list_del(&imp_conn->oic_item);
+		ptlrpc_connection_put(imp_conn->oic_conn);
+		OBD_FREE(imp_conn, sizeof(*imp_conn));
+		CDEBUG(D_HA, "imp %p@%s: remove connection %s\n",
+		       imp, imp->imp_obd->obd_name, uuid->uuid);
+		rc = 0;
+		break;
+	}
+out:
+	spin_unlock(&imp->imp_lock);
+	/* class_conn2export() hands back a referenced export; release it
+	 * once imp_lock is dropped so the reference is not leaked. */
+	if (dlmexp != NULL)
+		class_export_put(dlmexp);
+	if (rc == -ENOENT)
+		CERROR("connection %s not found\n", uuid->uuid);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(client_import_del_conn);
+
+/**
+ * Find conn UUID by peer NID. \a peer is a server NID. This function is used
+ * to find a conn uuid of \a imp which can reach \a peer.
+ *
+ * The first matching connection's UUID is copied into \a uuid.
+ *
+ * \retval 0       : match found, \a uuid filled in
+ * \retval -ENOENT : no connection of \a imp has this peer NID
+ */
+int client_import_find_conn(struct obd_import *imp, lnet_nid_t peer,
+			    struct obd_uuid *uuid)
+{
+	struct obd_import_conn *cur;
+	int rc = -ENOENT;
+	ENTRY;
+
+	spin_lock(&imp->imp_lock);
+	list_for_each_entry(cur, &imp->imp_conn_list, oic_item) {
+		/* Skip connections whose UUID lacks this peer NID. */
+		if (!class_check_uuid(&cur->oic_uuid, peer))
+			continue;
+
+		*uuid = cur->oic_uuid;
+		rc = 0;
+		break;
+	}
+	spin_unlock(&imp->imp_lock);
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(client_import_find_conn);
+
+/*
+ * Destroy a client import and release its security policy instance.
+ * An extra import reference is held across the sequence and dropped last,
+ * so \a imp is still referenced when sptlrpc_import_sec_put() runs.
+ */
+void client_destroy_import(struct obd_import *imp)
+{
+	/* Drop security policy instance after all RPCs have finished/aborted
+	 * to let all busy contexts be released. */
+	class_import_get(imp);
+	class_destroy_import(imp);
+	sptlrpc_import_sec_put(imp);
+	class_import_put(imp);
+}
+EXPORT_SYMBOL(client_destroy_import);
+
+/**
+ * Check whether or not the OSC is on MDT.
+ * In the config log,
+ * osc on MDT
+ *	setup 0:{fsname}-OSTxxxx-osc[-MDTxxxx] 1:lustre-OST0000_UUID 2:NID
+ * osc on client
+ *	setup 0:{fsname}-OSTxxxx-osc 1:lustre-OST0000_UUID 2:NID
+ *
+ * Returns 1 when the text after the last '-' of \a obdname starts with
+ * "MDT", 0 otherwise (including when there is no '-').
+ **/
+static int osc_on_mdt(char *obdname)
+{
+	const char *dash = strrchr(obdname, '-');
+
+	if (dash != NULL && strncmp(dash + 1, "MDT", 3) == 0)
+		return 1;
+
+	return 0;
+}
+
+/* Configure an RPC client OBD device.
+ *
+ * lcfg parameters:
+ * 1 - client UUID
+ * 2 - server UUID
+ * 3 - inactive-on-startup
+ */
+/**
+ * Set up the client-side state of \a obddev (OSC, OSP, MDC, LWP or MGC).
+ *
+ * Config buffer 0 of \a lcfg is handed to is_osp_on_mdt(), buffer 1 is the
+ * target UUID, buffer 2 is the server UUID, and an optional buffer 3 equal
+ * to "inactive" creates the import in the deactivated state.
+ *
+ * \retval 0		on success
+ * \retval -EINVAL	unknown OBD type, or a missing/over-long UUID buffer
+ * \retval -ENOENT	the import could not be allocated
+ * \retval -ENOMEM	the client namespace could not be created
+ * \retval other	error from ldlm_get_ref() or client_import_add_conn()
+ */
+int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg)
+{
+	struct client_obd *cli = &obddev->u.cli;
+	struct obd_import *imp;
+	struct obd_uuid server_uuid;
+	int rq_portal, rp_portal, connect_op;
+	char *name = obddev->obd_type->typ_name;
+	ldlm_ns_type_t ns_type = LDLM_NS_TYPE_UNKNOWN;
+	int rc;
+	char	*cli_name = lustre_cfg_buf(lcfg, 0);
+	ENTRY;
+
+	/* Validate the UUID buffers before anything looks at their content:
+	 * buffer 1 is searched with strstr() during the type detection
+	 * below, so it has to be known to be present by then.
+	 * NOTE(review): assumes lustre_cfg_buf() gives no usable string for
+	 * an empty buffer - confirm against lustre_cfg.h. */
+	if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1) {
+		CERROR("requires a TARGET UUID\n");
+		RETURN(-EINVAL);
+	}
+
+	if (LUSTRE_CFG_BUFLEN(lcfg, 1) > 37) {
+		CERROR("target UUID must be less than 38 characters\n");
+		RETURN(-EINVAL);
+	}
+
+	if (LUSTRE_CFG_BUFLEN(lcfg, 2) < 1) {
+		CERROR("setup requires a SERVER UUID\n");
+		RETURN(-EINVAL);
+	}
+
+	if (LUSTRE_CFG_BUFLEN(lcfg, 2) > 37) {
+		CERROR("server UUID must be less than 38 characters\n");
+		RETURN(-EINVAL);
+	}
+
+	/* In a more perfect world, we would hang a ptlrpc_client off of
+	 * obd_type and just use the values from there. */
+	if (!strcmp(name, LUSTRE_OSC_NAME) ||
+	    (!(strcmp(name, LUSTRE_OSP_NAME)) &&
+	     (is_osp_on_mdt(cli_name) &&
+	       strstr(lustre_cfg_buf(lcfg, 1), "OST") != NULL))) {
+		/* OSC or OSP_on_MDT for OSTs */
+		rq_portal = OST_REQUEST_PORTAL;
+		rp_portal = OSC_REPLY_PORTAL;
+		connect_op = OST_CONNECT;
+		cli->cl_sp_me = LUSTRE_SP_CLI;
+		cli->cl_sp_to = LUSTRE_SP_OST;
+		ns_type = LDLM_NS_TYPE_OSC;
+	} else if (!strcmp(name, LUSTRE_MDC_NAME) ||
+		   !strcmp(name, LUSTRE_LWP_NAME) ||
+		   (!strcmp(name, LUSTRE_OSP_NAME) &&
+		    (is_osp_on_mdt(cli_name) &&
+		     strstr(lustre_cfg_buf(lcfg, 1), "OST") == NULL))) {
+		/* MDC or OSP_on_MDT for other MDTs */
+		rq_portal = MDS_REQUEST_PORTAL;
+		rp_portal = MDC_REPLY_PORTAL;
+		connect_op = MDS_CONNECT;
+		cli->cl_sp_me = LUSTRE_SP_CLI;
+		cli->cl_sp_to = LUSTRE_SP_MDT;
+		ns_type = LDLM_NS_TYPE_MDC;
+	} else if (!strcmp(name, LUSTRE_MGC_NAME)) {
+		rq_portal = MGS_REQUEST_PORTAL;
+		rp_portal = MGC_REPLY_PORTAL;
+		connect_op = MGS_CONNECT;
+		cli->cl_sp_me = LUSTRE_SP_MGC;
+		cli->cl_sp_to = LUSTRE_SP_MGS;
+		cli->cl_flvr_mgc.sf_rpc = SPTLRPC_FLVR_INVALID;
+		ns_type = LDLM_NS_TYPE_MGC;
+	} else {
+		CERROR("unknown client OBD type \"%s\", can't setup\n",
+		       name);
+		RETURN(-EINVAL);
+	}
+
+	init_rwsem(&cli->cl_sem);
+	sema_init(&cli->cl_mgc_sem, 1);
+	cli->cl_conn_count = 0;
+	memcpy(server_uuid.uuid, lustre_cfg_buf(lcfg, 2),
+	       min_t(unsigned int, LUSTRE_CFG_BUFLEN(lcfg, 2),
+		     sizeof(server_uuid)));
+
+	cli->cl_dirty = 0;
+	cli->cl_avail_grant = 0;
+	/* FIXME: Should limit this for the sum of all cl_dirty_max. */
+	cli->cl_dirty_max = OSC_MAX_DIRTY_DEFAULT * 1024 * 1024;
+	/* Cap the dirty limit at one eighth of physical memory. */
+	if (cli->cl_dirty_max >> PAGE_CACHE_SHIFT > num_physpages / 8)
+		cli->cl_dirty_max = num_physpages << (PAGE_CACHE_SHIFT - 3);
+	INIT_LIST_HEAD(&cli->cl_cache_waiters);
+	INIT_LIST_HEAD(&cli->cl_loi_ready_list);
+	INIT_LIST_HEAD(&cli->cl_loi_hp_ready_list);
+	INIT_LIST_HEAD(&cli->cl_loi_write_list);
+	INIT_LIST_HEAD(&cli->cl_loi_read_list);
+	client_obd_list_lock_init(&cli->cl_loi_list_lock);
+	atomic_set(&cli->cl_pending_w_pages, 0);
+	atomic_set(&cli->cl_pending_r_pages, 0);
+	cli->cl_r_in_flight = 0;
+	cli->cl_w_in_flight = 0;
+
+	spin_lock_init(&cli->cl_read_rpc_hist.oh_lock);
+	spin_lock_init(&cli->cl_write_rpc_hist.oh_lock);
+	spin_lock_init(&cli->cl_read_page_hist.oh_lock);
+	spin_lock_init(&cli->cl_write_page_hist.oh_lock);
+	spin_lock_init(&cli->cl_read_offset_hist.oh_lock);
+	spin_lock_init(&cli->cl_write_offset_hist.oh_lock);
+
+	/* lru for osc. */
+	INIT_LIST_HEAD(&cli->cl_lru_osc);
+	atomic_set(&cli->cl_lru_shrinkers, 0);
+	atomic_set(&cli->cl_lru_busy, 0);
+	atomic_set(&cli->cl_lru_in_list, 0);
+	INIT_LIST_HEAD(&cli->cl_lru_list);
+	client_obd_list_lock_init(&cli->cl_lru_list_lock);
+
+	init_waitqueue_head(&cli->cl_destroy_waitq);
+	atomic_set(&cli->cl_destroy_in_flight, 0);
+	/* Turn on checksumming by default. */
+	cli->cl_checksum = 1;
+	/*
+	 * The supported checksum types will be worked out at connect time
+	 * Set cl_chksum* to CRC32 for now to avoid returning screwed info
+	 * through procfs.
+	 */
+	cli->cl_cksum_type = cli->cl_supp_cksum_types = OBD_CKSUM_CRC32;
+	atomic_set(&cli->cl_resends, OSC_DEFAULT_RESENDS);
+
+	/* This value may be reduced at connect time in
+	 * ptlrpc_connect_interpret() . We initialize it to only
+	 * 1MB until we know what the performance looks like.
+	 * In the future this should likely be increased. LU-1431 */
+	cli->cl_max_pages_per_rpc = min_t(int, PTLRPC_MAX_BRW_PAGES,
+					  LNET_MTU >> PAGE_CACHE_SHIFT);
+
+	/* Scale the number of concurrent RPCs with the amount of memory. */
+	if (!strcmp(name, LUSTRE_MDC_NAME)) {
+		cli->cl_max_rpcs_in_flight = MDC_MAX_RIF_DEFAULT;
+	} else if (num_physpages >> (20 - PAGE_CACHE_SHIFT) <= 128 /* MB */) {
+		cli->cl_max_rpcs_in_flight = 2;
+	} else if (num_physpages >> (20 - PAGE_CACHE_SHIFT) <= 256 /* MB */) {
+		cli->cl_max_rpcs_in_flight = 3;
+	} else if (num_physpages >> (20 - PAGE_CACHE_SHIFT) <= 512 /* MB */) {
+		cli->cl_max_rpcs_in_flight = 4;
+	} else {
+		if (osc_on_mdt(obddev->obd_name))
+			cli->cl_max_rpcs_in_flight = MDS_OSC_MAX_RIF_DEFAULT;
+		else
+			cli->cl_max_rpcs_in_flight = OSC_MAX_RIF_DEFAULT;
+	}
+	rc = ldlm_get_ref();
+	if (rc) {
+		CERROR("ldlm_get_ref failed: %d\n", rc);
+		GOTO(err, rc);
+	}
+
+	ptlrpc_init_client(rq_portal, rp_portal, name,
+			   &obddev->obd_ldlm_client);
+
+	imp = class_new_import(obddev);
+	if (imp == NULL)
+		GOTO(err_ldlm, rc = -ENOENT);
+	imp->imp_client = &obddev->obd_ldlm_client;
+	imp->imp_connect_op = connect_op;
+	memcpy(cli->cl_target_uuid.uuid, lustre_cfg_buf(lcfg, 1),
+	       LUSTRE_CFG_BUFLEN(lcfg, 1));
+	class_import_put(imp);
+
+	rc = client_import_add_conn(imp, &server_uuid, 1);
+	if (rc) {
+		CERROR("can't add initial connection\n");
+		GOTO(err_import, rc);
+	}
+
+	cli->cl_import = imp;
+	/* cli->cl_max_mds_{easize,cookiesize} updated by mdc_init_ea_size() */
+	cli->cl_max_mds_easize = sizeof(struct lov_mds_md_v3);
+	cli->cl_max_mds_cookiesize = sizeof(struct llog_cookie);
+
+	if (LUSTRE_CFG_BUFLEN(lcfg, 3) > 0) {
+		if (!strcmp(lustre_cfg_string(lcfg, 3), "inactive")) {
+			CDEBUG(D_HA, "marking %s %s->%s as inactive\n",
+			       name, obddev->obd_name,
+			       cli->cl_target_uuid.uuid);
+			spin_lock(&imp->imp_lock);
+			imp->imp_deactive = 1;
+			spin_unlock(&imp->imp_lock);
+		}
+	}
+
+	obddev->obd_namespace = ldlm_namespace_new(obddev, obddev->obd_name,
+						   LDLM_NAMESPACE_CLIENT,
+						   LDLM_NAMESPACE_GREEDY,
+						   ns_type);
+	if (obddev->obd_namespace == NULL) {
+		CERROR("Unable to create client namespace - %s\n",
+		       obddev->obd_name);
+		GOTO(err_import, rc = -ENOMEM);
+	}
+
+	cli->cl_qchk_stat = CL_NOT_QUOTACHECKED;
+
+	RETURN(rc);
+
+	/* Unwind in reverse order of acquisition. */
+err_import:
+	class_destroy_import(imp);
+err_ldlm:
+	ldlm_put_ref();
+err:
+	RETURN(rc);
+
+}
+EXPORT_SYMBOL(client_obd_setup);
+
+/**
+ * Final cleanup of a client OBD device.
+ *
+ * Completes freeing of the device's LDLM namespace, asserts that the import
+ * has already been released (cl_import is NULL) and drops one LDLM
+ * reference via ldlm_put_ref().
+ *
+ * \retval 0 always
+ */
+int client_obd_cleanup(struct obd_device *obddev)
+{
+	ENTRY;
+
+	ldlm_namespace_free_post(obddev->obd_namespace);
+	/* Clear the pointer so nothing can reach the freed namespace. */
+	obddev->obd_namespace = NULL;
+
+	LASSERT(obddev->u.cli.cl_import == NULL);
+
+	ldlm_put_ref();
+	RETURN(0);
+}
+EXPORT_SYMBOL(client_obd_cleanup);
+
+/**
+ * ->o_connect() method for client side (OSC and MDC and MGC).
+ *
+ * Creates the export for \a obd, initialises the import and starts the
+ * connect. Only one connection per client device is allowed; a second
+ * attempt fails with -EALREADY. On success \a *exp holds the new export and,
+ * when \a data is supplied, data->ocd_connect_flags is updated to the flags
+ * kept in the import. On failure \a *exp is NULL.
+ *
+ * \retval 0		on success
+ * \retval -EALREADY	the device is already connected
+ * \retval other	error from class_connect(), ptlrpc_init_import() or
+ *			ptlrpc_connect_import()
+ */
+int client_connect_import(const struct lu_env *env,
+			  struct obd_export **exp,
+			  struct obd_device *obd, struct obd_uuid *cluuid,
+			  struct obd_connect_data *data, void *localdata)
+{
+	struct client_obd	*cli = &obd->u.cli;
+	struct obd_import	*imp = cli->cl_import;
+	struct obd_connect_data	*ocd;
+	struct lustre_handle	 conn = { 0 };
+	int			 rc;
+	ENTRY;
+
+	*exp = NULL;
+	down_write(&cli->cl_sem);
+	if (cli->cl_conn_count > 0)
+		GOTO(out_sem, rc = -EALREADY);
+
+	rc = class_connect(&conn, obd, cluuid);
+	if (rc)
+		GOTO(out_sem, rc);
+
+	cli->cl_conn_count++;
+	*exp = class_conn2export(&conn);
+
+	LASSERT(obd->obd_namespace);
+
+	imp->imp_dlm_handle = conn;
+	rc = ptlrpc_init_import(imp);
+	if (rc)
+		GOTO(out_ldlm, rc);
+
+	/* Seed the import's connect data with what the caller asked for. */
+	ocd = &imp->imp_connect_data;
+	if (data != NULL) {
+		*ocd = *data;
+		imp->imp_connect_flags_orig = data->ocd_connect_flags;
+	}
+
+	rc = ptlrpc_connect_import(imp);
+	if (rc) {
+		LASSERT (imp->imp_state == LUSTRE_IMP_DISCON);
+		GOTO(out_ldlm, rc);
+	}
+	LASSERT((*exp)->exp_connection);
+
+	/* Report back the flags now stored in the import; they must be a
+	 * subset of what was requested. */
+	if (data != NULL) {
+		LASSERTF((ocd->ocd_connect_flags & data->ocd_connect_flags) ==
+			 ocd->ocd_connect_flags, "old "LPX64", new "LPX64"\n",
+			 data->ocd_connect_flags, ocd->ocd_connect_flags);
+		data->ocd_connect_flags = ocd->ocd_connect_flags;
+	}
+
+	ptlrpc_pinger_add_import(imp);
+
+	EXIT;
+	/* Success: skip the connection unwind below. */
+	goto out_sem;
+
+out_ldlm:
+	cli->cl_conn_count--;
+	class_disconnect(*exp);
+	*exp = NULL;
+out_sem:
+	up_write(&cli->cl_sem);
+
+	return rc;
+}
+EXPORT_SYMBOL(client_connect_import);
+
+/**
+ * Disconnect a client export.
+ *
+ * Decrements the device's connection count; when it reaches zero the import
+ * is marked deactivated, removed from the pinger, unused locks are
+ * cancelled, and the import is disconnected and invalidated.
+ * class_disconnect() is called on \a exp on every path that gets past the
+ * initial obd lookup.
+ *
+ * \retval 0		on success
+ * \retval -EINVAL	\a exp has no obd, or the device was not connected
+ * \retval other	error from ptlrpc_disconnect_import() or, if that
+ *			succeeded, from class_disconnect()
+ */
+int client_disconnect_export(struct obd_export *exp)
+{
+	struct obd_device *obd = class_exp2obd(exp);
+	struct client_obd *cli;
+	struct obd_import *imp;
+	int rc = 0, err;
+	ENTRY;
+
+	if (!obd) {
+		CERROR("invalid export for disconnect: exp %p cookie "LPX64"\n",
+		       exp, exp ? exp->exp_handle.h_cookie : -1);
+		RETURN(-EINVAL);
+	}
+
+	cli = &obd->u.cli;
+	imp = cli->cl_import;
+
+	down_write(&cli->cl_sem);
+	CDEBUG(D_INFO, "disconnect %s - %d\n", obd->obd_name,
+	       cli->cl_conn_count);
+
+	if (!cli->cl_conn_count) {
+		CERROR("disconnecting disconnected device (%s)\n",
+		       obd->obd_name);
+		GOTO(out_disconnect, rc = -EINVAL);
+	}
+
+	/* Only the last disconnect tears the import down. */
+	cli->cl_conn_count--;
+	if (cli->cl_conn_count)
+		GOTO(out_disconnect, rc = 0);
+
+	/* Mark import deactivated now, so we don't try to reconnect if any
+	 * of the cleanup RPCs fails (e.g. LDLM cancel, etc).  We don't
+	 * fully deactivate the import, or that would drop all requests. */
+	spin_lock(&imp->imp_lock);
+	imp->imp_deactive = 1;
+	spin_unlock(&imp->imp_lock);
+
+	/* Some non-replayable imports (MDS's OSCs) are pinged, so just
+	 * delete it regardless.  (It's safe to delete an import that was
+	 * never added.) */
+	(void)ptlrpc_pinger_del_import(imp);
+
+	if (obd->obd_namespace != NULL) {
+		/* obd_force == local only */
+		ldlm_cli_cancel_unused(obd->obd_namespace, NULL,
+				       obd->obd_force ? LCF_LOCAL : 0, NULL);
+		ldlm_namespace_free_prior(obd->obd_namespace, imp, obd->obd_force);
+	}
+
+	/* There's no need to hold sem while disconnecting an import,
+	 * and it may actually cause deadlock in GSS. */
+	up_write(&cli->cl_sem);
+	rc = ptlrpc_disconnect_import(imp, 0);
+	down_write(&cli->cl_sem);
+
+	ptlrpc_invalidate_import(imp);
+
+	EXIT;
+
+out_disconnect:
+	/* Use server style - class_disconnect should be always called for
+	 * o_disconnect. */
+	err = class_disconnect(exp);
+	/* An earlier error takes precedence over the class_disconnect() one. */
+	if (!rc && err)
+		rc = err;
+
+	up_write(&cli->cl_sem);
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(client_disconnect_export);
+
+
+/**
+ * Packs current SLV and Limit into \a req.
+ *
+ * If the export or its obd is gone, or the export did not negotiate LRU
+ * resize, both values are packed as zero.
+ *
+ * \retval 0 always
+ */
+int target_pack_pool_reply(struct ptlrpc_request *req)
+{
+	struct obd_device *obd;
+	ENTRY;
+
+	/* Check that we still have all structures alive as this may
+	 * be some late RPC at shutdown time. */
+	if (unlikely(!req->rq_export || !req->rq_export->exp_obd ||
+		     !exp_connect_lru_resize(req->rq_export))) {
+		lustre_msg_set_slv(req->rq_repmsg, 0);
+		lustre_msg_set_limit(req->rq_repmsg, 0);
+	} else {
+		/* OBD is alive here as export is alive, which we checked
+		 * above. */
+		obd = req->rq_export->exp_obd;
+
+		/* Read both pool values under the lock for a consistent
+		 * pair. */
+		read_lock(&obd->obd_pool_lock);
+		lustre_msg_set_slv(req->rq_repmsg, obd->obd_pool_slv);
+		lustre_msg_set_limit(req->rq_repmsg, obd->obd_pool_limit);
+		read_unlock(&obd->obd_pool_lock);
+	}
+
+	RETURN(0);
+}
+EXPORT_SYMBOL(target_pack_pool_reply);
+
+/**
+ * Send the reply (or an error reply) for \a req.
+ *
+ * \param req		request being answered
+ * \param rc		handler status; non-zero is stored in rq_status and
+ *			sent as an error reply
+ * \param fail_id	fault-injection id; when it fires the reply is dropped
+ *
+ * \retval -ECOMM	reply dropped by fault injection
+ * \retval other	result of ptlrpc_send_error() or ptlrpc_send_reply()
+ */
+int target_send_reply_msg(struct ptlrpc_request *req, int rc, int fail_id)
+{
+	if (OBD_FAIL_CHECK_ORSET(fail_id & ~OBD_FAIL_ONCE, OBD_FAIL_ONCE)) {
+		DEBUG_REQ(D_ERROR, req, "dropping reply");
+		return -ECOMM;
+	}
+
+	if (unlikely(rc)) {
+		DEBUG_REQ(D_NET, req, "processing error (%d)", rc);
+		req->rq_status = rc;
+		return ptlrpc_send_error(req, 1);
+	}
+
+	DEBUG_REQ(D_NET, req, "sending reply");
+	return ptlrpc_send_reply(req, PTLRPC_REPLY_MAYBE_DIFFICULT);
+}
+
+/**
+ * Send the reply for \a req, tracking "difficult" replies.
+ *
+ * Nothing is sent when rq_no_reply is set. A request without a reply state,
+ * or whose reply state is not marked rs_difficult, is simply sent via
+ * target_send_reply_msg(). Otherwise the reply state is linked to the export
+ * (uncommitted and outstanding reply lists) before sending, and afterwards
+ * it is either dispatched immediately or queued on the service partition's
+ * scp_rep_active list.
+ *
+ * \param req		request being answered
+ * \param rc		handler status passed to target_send_reply_msg()
+ * \param fail_id	fault-injection id passed to target_send_reply_msg()
+ */
+void target_send_reply(struct ptlrpc_request *req, int rc, int fail_id)
+{
+	struct ptlrpc_service_part *svcpt;
+	int			netrc;
+	struct ptlrpc_reply_state *rs;
+	struct obd_export	 *exp;
+	ENTRY;
+
+	if (req->rq_no_reply) {
+		EXIT;
+		return;
+	}
+
+	svcpt = req->rq_rqbd->rqbd_svcpt;
+	rs = req->rq_reply_state;
+	if (rs == NULL || !rs->rs_difficult) {
+		/* no notifiers */
+		target_send_reply_msg (req, rc, fail_id);
+		EXIT;
+		return;
+	}
+
+	/* must be an export if locks saved */
+	LASSERT (req->rq_export != NULL);
+	/* req/reply consistent */
+	LASSERT(rs->rs_svcpt == svcpt);
+
+	/* "fresh" reply */
+	LASSERT (!rs->rs_scheduled);
+	LASSERT (!rs->rs_scheduled_ever);
+	LASSERT (!rs->rs_handled);
+	LASSERT (!rs->rs_on_net);
+	LASSERT (rs->rs_export == NULL);
+	LASSERT (list_empty(&rs->rs_obd_list));
+	LASSERT (list_empty(&rs->rs_exp_list));
+
+	/* The reply state keeps its own export reference in rs_export. */
+	exp = class_export_get (req->rq_export);
+
+	/* disable reply scheduling while I'm setting up */
+	rs->rs_scheduled = 1;
+	rs->rs_on_net    = 1;
+	rs->rs_xid       = req->rq_xid;
+	rs->rs_transno   = req->rq_transno;
+	rs->rs_export    = exp;
+	rs->rs_opc       = lustre_msg_get_opc(req->rq_reqmsg);
+
+	spin_lock(&exp->exp_uncommitted_replies_lock);
+	CDEBUG(D_NET, "rs transno = "LPU64", last committed = "LPU64"\n",
+	       rs->rs_transno, exp->exp_last_committed);
+	if (rs->rs_transno > exp->exp_last_committed) {
+		/* not committed already */
+		list_add_tail(&rs->rs_obd_list,
+				  &exp->exp_uncommitted_replies);
+	}
+	spin_unlock(&exp->exp_uncommitted_replies_lock);
+
+	spin_lock(&exp->exp_lock);
+	list_add_tail(&rs->rs_exp_list, &exp->exp_outstanding_replies);
+	spin_unlock(&exp->exp_lock);
+
+	netrc = target_send_reply_msg(req, rc, fail_id);
+
+	/* Lock order here: scp_rep_lock first, then rs_lock. */
+	spin_lock(&svcpt->scp_rep_lock);
+
+	atomic_inc(&svcpt->scp_nreps_difficult);
+
+	if (netrc != 0) {
+		/* error sending: reply is off the net.  Also we need +1
+		 * reply ref until ptlrpc_handle_rs() is done
+		 * with the reply state (if the send was successful, there
+		 * would have been +1 ref for the net, which
+		 * reply_out_callback leaves alone) */
+		rs->rs_on_net = 0;
+		ptlrpc_rs_addref(rs);
+	}
+
+	spin_lock(&rs->rs_lock);
+	/* Dispatch right away if the transaction is already committed, the
+	 * reply is off the net with an ACK expected, or the reply state was
+	 * already taken off one of the export lists. */
+	if (rs->rs_transno <= exp->exp_last_committed ||
+	    (!rs->rs_on_net && !rs->rs_no_ack) ||
+	    list_empty(&rs->rs_exp_list) ||     /* completed already */
+	    list_empty(&rs->rs_obd_list)) {
+		CDEBUG(D_HA, "Schedule reply immediately\n");
+		ptlrpc_dispatch_difficult_reply(rs);
+	} else {
+		list_add(&rs->rs_list, &svcpt->scp_rep_active);
+		rs->rs_scheduled = 0;	/* allow notifier to schedule */
+	}
+	spin_unlock(&rs->rs_lock);
+	spin_unlock(&svcpt->scp_rep_lock);
+	EXIT;
+}
+EXPORT_SYMBOL(target_send_reply);
+
+/* Per-mode compatibility value, indexed by lock mode (LCK_*). */
+ldlm_mode_t lck_compat_array[] = {
+	[LCK_EX]	= LCK_COMPAT_EX,
+	[LCK_PW]	= LCK_COMPAT_PW,
+	[LCK_PR]	= LCK_COMPAT_PR,
+	[LCK_CW]	= LCK_COMPAT_CW,
+	[LCK_CR]	= LCK_COMPAT_CR,
+	[LCK_NL]	= LCK_COMPAT_NL,
+	[LCK_GROUP]	= LCK_COMPAT_GROUP,
+	[LCK_COS]	= LCK_COMPAT_COS,
+};
+
+/**
+ * Rather arbitrary mapping from LDLM error codes to errno values. This should
+ * not escape to the user level.
+ *
+ * Known ELDLM_* codes map to a fixed negative errno. Any other value is
+ * passed through when it is negative as a signed int, and reported and
+ * turned into -EPROTO otherwise.
+ */
+int ldlm_error2errno(ldlm_error_t error)
+{
+	switch (error) {
+	case ELDLM_OK:
+		return 0;
+	case ELDLM_LOCK_CHANGED:
+		return -ESTALE;
+	case ELDLM_LOCK_ABORTED:
+		return -ENAVAIL;
+	case ELDLM_LOCK_REPLACED:
+		return -ESRCH;
+	case ELDLM_NO_LOCK_DATA:
+		return -ENOENT;
+	case ELDLM_NAMESPACE_EXISTS:
+		return -EEXIST;
+	case ELDLM_BAD_NAMESPACE:
+		return -EBADF;
+	default:
+		break;
+	}
+
+	/* cast to signed type, as ldlm_error_t can be unsigned */
+	if (((int)error) < 0)
+		return error;
+
+	CERROR("Invalid DLM result code: %d\n", error);
+	return -EPROTO;
+}
+EXPORT_SYMBOL(ldlm_error2errno);
+
+/**
+ * Dual to ldlm_error2errno(): maps errno values back to ldlm_error_t.
+ *
+ * Values without a dedicated ELDLM_* code are returned unchanged.
+ */
+ldlm_error_t ldlm_errno2error(int err_no)
+{
+	switch (err_no) {
+	case 0:
+		return ELDLM_OK;
+	case -ESTALE:
+		return ELDLM_LOCK_CHANGED;
+	case -ENAVAIL:
+		return ELDLM_LOCK_ABORTED;
+	case -ESRCH:
+		return ELDLM_LOCK_REPLACED;
+	case -ENOENT:
+		return ELDLM_NO_LOCK_DATA;
+	case -EEXIST:
+		return ELDLM_NAMESPACE_EXISTS;
+	case -EBADF:
+		return ELDLM_BAD_NAMESPACE;
+	default:
+		return err_no;
+	}
+}
+EXPORT_SYMBOL(ldlm_errno2error);
+
+#if LUSTRE_TRACKS_LOCK_EXP_REFS
+/**
+ * Log every lock still linked on \a exp's exp_locks_list.
+ *
+ * Does nothing when the list is empty. The list is walked under
+ * exp_locks_list_guard.
+ */
+void ldlm_dump_export_locks(struct obd_export *exp)
+{
+	spin_lock(&exp->exp_locks_list_guard);
+	if (!list_empty(&exp->exp_locks_list)) {
+		struct ldlm_lock *lock;
+
+		/* Note the space after the comma: the two literals are
+		 * concatenated into one message. */
+		CERROR("dumping locks for export %p, "
+		       "ignore if the unmount doesn't hang\n", exp);
+		list_for_each_entry(lock, &exp->exp_locks_list,
+					l_exp_refs_link)
+			LDLM_ERROR(lock, "lock:");
+	}
+	spin_unlock(&exp->exp_locks_list_guard);
+}
+#endif

diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_lock.c b/drivers/staging/lustre/lustre/ldlm/ldlm_lock.c
new file mode 100644
index 0000000..33b76a1
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ldlm/ldlm_lock.c

@@ -0,0 +1,2429 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2010, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ldlm/ldlm_lock.c
+ *
+ * Author: Peter Braam <braam@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LDLM
+
+# include <linux/libcfs/libcfs.h>
+# include <linux/lustre_intent.h>
+
+#include <obd_class.h>
+#include "ldlm_internal.h"
+
+/* Printable names of lock modes, indexed by LCK_* mode. */
+char *ldlm_lockname[] = {
+	[0]		= "--",
+	[LCK_EX]	= "EX",
+	[LCK_PW]	= "PW",
+	[LCK_PR]	= "PR",
+	[LCK_CW]	= "CW",
+	[LCK_CR]	= "CR",
+	[LCK_NL]	= "NL",
+	[LCK_GROUP]	= "GROUP",
+	[LCK_COS]	= "COS"
+};
+EXPORT_SYMBOL(ldlm_lockname);
+
+/* Printable names of lock types, indexed by LDLM_* type. */
+char *ldlm_typename[] = {
+	[LDLM_PLAIN]	= "PLN",
+	[LDLM_EXTENT]	= "EXT",
+	[LDLM_FLOCK]	= "FLK",
+	[LDLM_IBITS]	= "IBT",
+};
+EXPORT_SYMBOL(ldlm_typename);
+
+/* Wire-to-local policy converters selected by ldlm_convert_policy_to_local()
+ * when the peer did not set OBD_CONNECT_FULL20; indexed by
+ * (type - LDLM_MIN_TYPE). */
+static ldlm_policy_wire_to_local_t ldlm_policy_wire18_to_local[] = {
+	[LDLM_PLAIN - LDLM_MIN_TYPE]  = ldlm_plain_policy_wire_to_local,
+	[LDLM_EXTENT - LDLM_MIN_TYPE] = ldlm_extent_policy_wire_to_local,
+	[LDLM_FLOCK - LDLM_MIN_TYPE]  = ldlm_flock_policy_wire18_to_local,
+	[LDLM_IBITS - LDLM_MIN_TYPE]  = ldlm_ibits_policy_wire_to_local,
+};
+
+/* Wire-to-local policy converters selected by ldlm_convert_policy_to_local()
+ * when the peer set OBD_CONNECT_FULL20; indexed by (type - LDLM_MIN_TYPE). */
+static ldlm_policy_wire_to_local_t ldlm_policy_wire21_to_local[] = {
+	[LDLM_PLAIN - LDLM_MIN_TYPE]  = ldlm_plain_policy_wire_to_local,
+	[LDLM_EXTENT - LDLM_MIN_TYPE] = ldlm_extent_policy_wire_to_local,
+	[LDLM_FLOCK - LDLM_MIN_TYPE]  = ldlm_flock_policy_wire21_to_local,
+	[LDLM_IBITS - LDLM_MIN_TYPE]  = ldlm_ibits_policy_wire_to_local,
+};
+
+/* Local-to-wire policy converters; indexed by (type - LDLM_MIN_TYPE). */
+static ldlm_policy_local_to_wire_t ldlm_policy_local_to_wire[] = {
+	[LDLM_PLAIN - LDLM_MIN_TYPE]  = ldlm_plain_policy_local_to_wire,
+	[LDLM_EXTENT - LDLM_MIN_TYPE] = ldlm_extent_policy_local_to_wire,
+	[LDLM_FLOCK - LDLM_MIN_TYPE]  = ldlm_flock_policy_local_to_wire,
+	[LDLM_IBITS - LDLM_MIN_TYPE]  = ldlm_ibits_policy_local_to_wire,
+};
+
+/**
+ * Converts lock policy from local format to on the wire lock_desc format.
+ *
+ * \param type		lock type; selects the converter by
+ *			(type - LDLM_MIN_TYPE), with no range check
+ * \param lpolicy	local policy to read
+ * \param wpolicy	wire policy to fill in
+ */
+void ldlm_convert_policy_to_wire(ldlm_type_t type,
+				 const ldlm_policy_data_t *lpolicy,
+				 ldlm_wire_policy_data_t *wpolicy)
+{
+	ldlm_policy_local_to_wire[type - LDLM_MIN_TYPE](lpolicy, wpolicy);
+}
+
+/**
+ * Converts lock policy from on the wire lock_desc format to local format.
+ *
+ * The converter table depends on whether \a exp negotiated
+ * OBD_CONNECT_FULL20; the entry is then selected by (type - LDLM_MIN_TYPE),
+ * with no range check.
+ */
+void ldlm_convert_policy_to_local(struct obd_export *exp, ldlm_type_t type,
+				  const ldlm_wire_policy_data_t *wpolicy,
+				  ldlm_policy_data_t *lpolicy)
+{
+	const ldlm_policy_wire_to_local_t *table;
+
+	/** some badness for 2.0.0 clients, but 2.0.0 isn't supported */
+	if ((exp_connect_flags(exp) & OBD_CONNECT_FULL20) != 0)
+		table = ldlm_policy_wire21_to_local;
+	else
+		table = ldlm_policy_wire18_to_local;
+
+	table[type - LDLM_MIN_TYPE](wpolicy, lpolicy);
+}
+
+/**
+ * Return a printable name for intent \a it.
+ *
+ * Unrecognised values are logged and reported as "UNKNOWN". The returned
+ * string is a literal and must not be modified or freed.
+ */
+char *ldlm_it2str(int it)
+{
+	if (it == IT_OPEN)
+		return "open";
+	if (it == IT_CREAT)
+		return "creat";
+	if (it == (IT_OPEN | IT_CREAT))
+		return "open|creat";
+	if (it == IT_READDIR)
+		return "readdir";
+	if (it == IT_GETATTR)
+		return "getattr";
+	if (it == IT_LOOKUP)
+		return "lookup";
+	if (it == IT_UNLINK)
+		return "unlink";
+	if (it == IT_GETXATTR)
+		return "getxattr";
+	if (it == IT_LAYOUT)
+		return "layout";
+
+	CERROR("Unknown intent %d\n", it);
+	return "UNKNOWN";
+}
+EXPORT_SYMBOL(ldlm_it2str);
+
+extern struct kmem_cache *ldlm_lock_slab;
+
+
+/**
+ * Store \a arg as the policy callback (ns_policy) of namespace \a ns,
+ * replacing whatever was set before.
+ */
+void ldlm_register_intent(struct ldlm_namespace *ns, ldlm_res_policy arg)
+{
+	ns->ns_policy = arg;
+}
+EXPORT_SYMBOL(ldlm_register_intent);
+
+/*
+ * REFCOUNTED LOCK OBJECTS
+ */
+
+
+/**
+ * Get a reference on a lock.
+ *
+ * Lock refcounts, during creation:
+ *   - one special one for allocation, dec'd only once in destroy
+ *   - one for being a lock that's in-use
+ *   - one for the addref associated with a new lock
+ *
+ * \retval lock	the same pointer that was passed in, so calls can be chained
+ */
+struct ldlm_lock *ldlm_lock_get(struct ldlm_lock *lock)
+{
+	atomic_inc(&lock->l_refc);
+	return lock;
+}
+EXPORT_SYMBOL(ldlm_lock_get);
+
+/**
+ * Release lock reference.
+ *
+ * Also frees the lock if it was last reference. The lock must already be
+ * marked destroyed and be off its resource and pending lists by then.
+ */
+void ldlm_lock_put(struct ldlm_lock *lock)
+{
+	struct ldlm_resource *res;
+	ENTRY;
+
+	LASSERT(lock->l_resource != LP_POISON);
+	LASSERT(atomic_read(&lock->l_refc) > 0);
+
+	/* Not the last reference: nothing more to do. */
+	if (!atomic_dec_and_test(&lock->l_refc)) {
+		EXIT;
+		return;
+	}
+
+	LDLM_DEBUG(lock,
+		   "final lock_put on destroyed lock, freeing it.");
+
+	res = lock->l_resource;
+	LASSERT(lock->l_destroyed);
+	LASSERT(list_empty(&lock->l_res_link));
+	LASSERT(list_empty(&lock->l_pending_chain));
+
+	lprocfs_counter_decr(ldlm_res_to_ns(res)->ns_stats,
+			     LDLM_NSS_LOCKS);
+
+	/* Give back the lock's reference on its resource. */
+	lu_ref_del(&res->lr_reference, "lock", lock);
+	ldlm_resource_putref(res);
+	lock->l_resource = NULL;
+
+	if (lock->l_export) {
+		class_export_lock_put(lock->l_export, lock);
+		lock->l_export = NULL;
+	}
+
+	if (lock->l_lvb_data != NULL)
+		OBD_FREE(lock->l_lvb_data, lock->l_lvb_len);
+
+	ldlm_interval_free(ldlm_interval_detach(lock));
+	lu_ref_fini(&lock->l_reference);
+	OBD_FREE_RCU(lock, sizeof(*lock), &lock->l_handle);
+
+	EXIT;
+}
+EXPORT_SYMBOL(ldlm_lock_put);
+
+/**
+ * Removes LDLM lock \a lock from LRU. Assumes LRU is already locked.
+ *
+ * \retval 1 the lock was on the LRU and has been removed
+ * \retval 0 the lock was not on the LRU
+ */
+int ldlm_lock_remove_from_lru_nolock(struct ldlm_lock *lock)
+{
+	struct ldlm_namespace *ns;
+
+	if (list_empty(&lock->l_lru))
+		return 0;
+
+	ns = ldlm_lock_to_ns(lock);
+
+	LASSERT(lock->l_resource->lr_type != LDLM_FLOCK);
+	list_del_init(&lock->l_lru);
+	/* The "skipped" mark is cleared once the lock leaves the LRU. */
+	if (lock->l_flags & LDLM_FL_SKIPPED)
+		lock->l_flags &= ~LDLM_FL_SKIPPED;
+	LASSERT(ns->ns_nr_unused > 0);
+	ns->ns_nr_unused--;
+
+	return 1;
+}
+
+/**
+ * Removes LDLM lock \a lock from LRU. Obtains the LRU lock first.
+ *
+ * Locks with l_ns_srv set are asserted not to be on any LRU and are left
+ * untouched.
+ *
+ * \retval 1 the lock was on the LRU and has been removed
+ * \retval 0 the lock was not on the LRU, or l_ns_srv is set
+ */
+int ldlm_lock_remove_from_lru(struct ldlm_lock *lock)
+{
+	struct ldlm_namespace *ns = ldlm_lock_to_ns(lock);
+	int rc;
+
+	ENTRY;
+	if (lock->l_ns_srv) {
+		LASSERT(list_empty(&lock->l_lru));
+		RETURN(0);
+	}
+
+	/* ns_lock protects the namespace LRU list and its counter. */
+	spin_lock(&ns->ns_lock);
+	rc = ldlm_lock_remove_from_lru_nolock(lock);
+	spin_unlock(&ns->ns_lock);
+	EXIT;
+	return rc;
+}
+
+/**
+ * Adds LDLM lock \a lock to namespace LRU. Assumes LRU is already locked.
+ *
+ * Stamps l_last_used with the current time, appends the lock to the tail of
+ * ns_unused_list and increments ns_nr_unused. The lock must not already be
+ * on an LRU and must not be on an LDLM_FLOCK resource.
+ */
+void ldlm_lock_add_to_lru_nolock(struct ldlm_lock *lock)
+{
+	struct ldlm_namespace *ns = ldlm_lock_to_ns(lock);
+
+	lock->l_last_used = cfs_time_current();
+	LASSERT(list_empty(&lock->l_lru));
+	LASSERT(lock->l_resource->lr_type != LDLM_FLOCK);
+	list_add_tail(&lock->l_lru, &ns->ns_unused_list);
+	LASSERT(ns->ns_nr_unused >= 0);
+	ns->ns_nr_unused++;
+}
+
+/**
+ * Adds LDLM lock \a lock to namespace LRU. Obtains necessary LRU locks
+ * first.
+ *
+ * Takes ns_lock of the lock's namespace around
+ * ldlm_lock_add_to_lru_nolock().
+ */
+void ldlm_lock_add_to_lru(struct ldlm_lock *lock)
+{
+	struct ldlm_namespace *ns = ldlm_lock_to_ns(lock);
+
+	ENTRY;
+	spin_lock(&ns->ns_lock);
+	ldlm_lock_add_to_lru_nolock(lock);
+	spin_unlock(&ns->ns_lock);
+	EXIT;
+}
+
+/**
+ * Moves LDLM lock \a lock that is already in namespace LRU to the tail of
+ * the LRU. Performs necessary LRU locking.
+ *
+ * A lock that is not currently on the LRU is left alone, as is any lock
+ * with l_ns_srv set (which is asserted not to be on an LRU).
+ */
+void ldlm_lock_touch_in_lru(struct ldlm_lock *lock)
+{
+	struct ldlm_namespace *ns = ldlm_lock_to_ns(lock);
+
+	ENTRY;
+	if (lock->l_ns_srv) {
+		LASSERT(list_empty(&lock->l_lru));
+		EXIT;
+		return;
+	}
+
+	spin_lock(&ns->ns_lock);
+	/* The remove helper reports whether the lock was on the LRU at all;
+	 * only then is it re-queued at the tail. */
+	if (ldlm_lock_remove_from_lru_nolock(lock))
+		ldlm_lock_add_to_lru_nolock(lock);
+	spin_unlock(&ns->ns_lock);
+	EXIT;
+}
+
+/**
+ * Helper to destroy a locked lock.
+ *
+ * Used by ldlm_lock_destroy and ldlm_lock_destroy_nolock
+ * Must be called with l_lock and lr_lock held.
+ *
+ * Does not actually free the lock data, but rather marks the lock as
+ * destroyed by setting l_destroyed field in the lock to 1.  Destroys a
+ * handle->lock association too, so that the lock can no longer be found
+ * and removes the lock from LRU list.  Actual lock freeing occurs when
+ * last lock reference goes away.
+ *
+ * Original comment (of some historical value):
+ * This used to have a 'strict' flag, which recovery would use to mark an
+ * in-use lock as needing-to-die.  Lest I am ever tempted to put it back, I
+ * shall explain why it's gone: with the new hash table scheme, once you call
+ * ldlm_lock_destroy, you can never drop your final references on this lock.
+ * Because it's not in the hash table anymore.  -phil
+ *
+ * \retval 1 this call destroyed the lock (first destroy)
+ * \retval 0 the lock was already marked destroyed; nothing was done
+ */
+int ldlm_lock_destroy_internal(struct ldlm_lock *lock)
+{
+	ENTRY;
+
+	/* Destroying a lock that is still in use or still linked on its
+	 * resource is a fatal bug. */
+	if (lock->l_readers || lock->l_writers) {
+		LDLM_ERROR(lock, "lock still has references");
+		LBUG();
+	}
+
+	if (!list_empty(&lock->l_res_link)) {
+		LDLM_ERROR(lock, "lock still on resource");
+		LBUG();
+	}
+
+	if (lock->l_destroyed) {
+		LASSERT(list_empty(&lock->l_lru));
+		EXIT;
+		return 0;
+	}
+	lock->l_destroyed = 1;
+
+	if (lock->l_export && lock->l_export->exp_lock_hash) {
+		/* NB: it's safe to call cfs_hash_del() even lock isn't
+		 * in exp_lock_hash. */
+		/* In the function below, .hs_keycmp resolves to
+		 * ldlm_export_lock_keycmp() */
+		/* coverity[overrun-buffer-val] */
+		cfs_hash_del(lock->l_export->exp_lock_hash,
+			     &lock->l_remote_handle, &lock->l_exp_hash);
+	}
+
+	ldlm_lock_remove_from_lru(lock);
+	/* After this the lock can no longer be looked up by its handle. */
+	class_handle_unhash(&lock->l_handle);
+
+#if 0
+	/* Wake anyone waiting for this lock */
+	/* FIXME: I should probably add yet another flag, instead of using
+	 * l_export to only call this on clients */
+	if (lock->l_export)
+		class_export_put(lock->l_export);
+	lock->l_export = NULL;
+	if (lock->l_export && lock->l_completion_ast)
+		lock->l_completion_ast(lock, 0);
+#endif
+	EXIT;
+	return 1;
+}
+
+/**
+ * Destroys a LDLM lock \a lock. Performs necessary locking first.
+ *
+ * The "hash" reference is dropped only by the call that actually destroys
+ * the lock; repeated calls are harmless.
+ */
+void ldlm_lock_destroy(struct ldlm_lock *lock)
+{
+	int destroyed_now;
+	ENTRY;
+
+	lock_res_and_lock(lock);
+	destroyed_now = ldlm_lock_destroy_internal(lock);
+	unlock_res_and_lock(lock);
+
+	if (!destroyed_now) {
+		EXIT;
+		return;
+	}
+
+	/* drop reference from hashtable only for first destroy */
+	lu_ref_del(&lock->l_reference, "hash", lock);
+	LDLM_LOCK_RELEASE(lock);
+	EXIT;
+}
+
+/**
+ * Destroys a LDLM lock \a lock that is already locked.
+ *
+ * The "hash" reference is dropped only by the call that actually destroys
+ * the lock; repeated calls are harmless.
+ */
+void ldlm_lock_destroy_nolock(struct ldlm_lock *lock)
+{
+	ENTRY;
+
+	/* drop reference from hashtable only for first destroy */
+	if (ldlm_lock_destroy_internal(lock)) {
+		lu_ref_del(&lock->l_reference, "hash", lock);
+		LDLM_LOCK_RELEASE(lock);
+	}
+	EXIT;
+}
+
+/* this is called by portals_handle2object with the handle lock taken */
+/* Handle-ops "addref" callback: takes one reference on the lock passed as
+ * an untyped pointer. */
+static void lock_handle_addref(void *lock)
+{
+	LDLM_LOCK_GET((struct ldlm_lock *)lock);
+}
+
+/* Handle-ops "free" callback: returns the lock to ldlm_lock_slab. \a size
+ * must be sizeof(struct ldlm_lock). */
+static void lock_handle_free(void *lock, int size)
+{
+	LASSERT(size == sizeof(struct ldlm_lock));
+	OBD_SLAB_FREE(lock, ldlm_lock_slab, size);
+}
+
+/* Handle operations registered for every lock by ldlm_lock_new() through
+ * class_handle_hash(). */
+struct portals_handle_ops lock_handle_ops = {
+	.hop_addref = lock_handle_addref,
+	.hop_free   = lock_handle_free,
+};
+
+/**
+ *
+ * Allocate and initialize new lock structure.
+ *
+ * usage: pass in a resource on which you have done ldlm_resource_get
+ *	new lock will take over the refcount.
+ * returns: lock with refcount 2 - one for current caller and one for remote
+ *	NULL if the slab allocation fails. A NULL \a resource is a fatal
+ *	bug (LBUG).
+ */
+static struct ldlm_lock *ldlm_lock_new(struct ldlm_resource *resource)
+{
+	struct ldlm_lock *lock;
+	ENTRY;
+
+	if (resource == NULL)
+		LBUG();
+
+	OBD_SLAB_ALLOC_PTR_GFP(lock, ldlm_lock_slab, __GFP_IO);
+	if (lock == NULL)
+		RETURN(NULL);
+
+	spin_lock_init(&lock->l_lock);
+	lock->l_resource = resource;
+	lu_ref_add(&resource->lr_reference, "lock", lock);
+
+	atomic_set(&lock->l_refc, 2);
+	INIT_LIST_HEAD(&lock->l_res_link);
+	INIT_LIST_HEAD(&lock->l_lru);
+	INIT_LIST_HEAD(&lock->l_pending_chain);
+	INIT_LIST_HEAD(&lock->l_bl_ast);
+	INIT_LIST_HEAD(&lock->l_cp_ast);
+	INIT_LIST_HEAD(&lock->l_rk_ast);
+	init_waitqueue_head(&lock->l_waitq);
+	lock->l_blocking_lock = NULL;
+	INIT_LIST_HEAD(&lock->l_sl_mode);
+	INIT_LIST_HEAD(&lock->l_sl_policy);
+	INIT_HLIST_NODE(&lock->l_exp_hash);
+	INIT_HLIST_NODE(&lock->l_exp_flock_hash);
+
+	lprocfs_counter_incr(ldlm_res_to_ns(resource)->ns_stats,
+			     LDLM_NSS_LOCKS);
+	/* Register the lock in the handle hash with lock_handle_ops. */
+	INIT_LIST_HEAD(&lock->l_handle.h_link);
+	class_handle_hash(&lock->l_handle, &lock_handle_ops);
+
+	lu_ref_init(&lock->l_reference);
+	/* This "hash" reference is the one ldlm_lock_destroy() drops. */
+	lu_ref_add(&lock->l_reference, "hash", lock);
+	lock->l_callback_timeout = 0;
+
+#if LUSTRE_TRACKS_LOCK_EXP_REFS
+	INIT_LIST_HEAD(&lock->l_exp_refs_link);
+	lock->l_exp_refs_nr = 0;
+	lock->l_exp_refs_target = NULL;
+#endif
+	INIT_LIST_HEAD(&lock->l_exp_list);
+
+	RETURN(lock);
+}
+
+/**
+ * Moves LDLM lock \a lock to another resource.
+ * This is used on client when server returns some other lock than requested
+ * (typically as a result of intent operation)
+ *
+ * \param ns		client namespace the lock belongs to
+ * \param lock		lock to move; must not be linked on any resource list
+ * \param new_resid	name of the resource to move the lock to
+ *
+ * \retval 0		the lock now refers to \a new_resid (or already did)
+ * \retval -ENOMEM	the new resource could not be obtained
+ */
+int ldlm_lock_change_resource(struct ldlm_namespace *ns, struct ldlm_lock *lock,
+			      const struct ldlm_res_id *new_resid)
+{
+	struct ldlm_resource *oldres = lock->l_resource;
+	struct ldlm_resource *newres;
+	int type;
+	ENTRY;
+
+	LASSERT(ns_is_client(ns));
+
+	lock_res_and_lock(lock);
+	if (memcmp(new_resid, &lock->l_resource->lr_name,
+		   sizeof(lock->l_resource->lr_name)) == 0) {
+		/* Nothing to do */
+		unlock_res_and_lock(lock);
+		RETURN(0);
+	}
+
+	LASSERT(new_resid->name[0] != 0);
+
+	/* This function assumes that the lock isn't on any lists */
+	LASSERT(list_empty(&lock->l_res_link));
+
+	type = oldres->lr_type;
+	/* The locks are dropped around ldlm_resource_get() and retaken
+	 * below, which is why oldres is re-read afterwards. */
+	unlock_res_and_lock(lock);
+
+	newres = ldlm_resource_get(ns, NULL, new_resid, type, 1);
+	if (newres == NULL)
+		RETURN(-ENOMEM);
+
+	lu_ref_add(&newres->lr_reference, "lock", lock);
+	/*
+	 * To flip the lock from the old to the new resource, lock, oldres and
+	 * newres have to be locked. Resource spin-locks are nested within
+	 * lock->l_lock, and are taken in the memory address order to avoid
+	 * dead-locks.
+	 */
+	spin_lock(&lock->l_lock);
+	oldres = lock->l_resource;
+	if (oldres < newres) {
+		lock_res(oldres);
+		lock_res_nested(newres, LRT_NEW);
+	} else {
+		lock_res(newres);
+		lock_res_nested(oldres, LRT_NEW);
+	}
+	LASSERT(memcmp(new_resid, &oldres->lr_name,
+		       sizeof oldres->lr_name) != 0);
+	lock->l_resource = newres;
+	unlock_res(oldres);
+	/* l_resource now points at newres, so this releases newres and
+	 * l_lock. */
+	unlock_res_and_lock(lock);
+
+	/* ...and the flowers are still standing! */
+	lu_ref_del(&oldres->lr_reference, "lock", lock);
+	ldlm_resource_putref(oldres);
+
+	RETURN(0);
+}
+EXPORT_SYMBOL(ldlm_lock_change_resource);
+
+/** \defgroup ldlm_handles LDLM HANDLES
+ * Ways to get hold of locks without any addresses.
+ * @{
+ */
+
+/**
+ * Fills in handle for LDLM lock \a lock into supplied \a lockh
+ * Does not take any references.
+ *
+ * Only the cookie is copied, from lock->l_handle.h_cookie.
+ */
+void ldlm_lock2handle(const struct ldlm_lock *lock, struct lustre_handle *lockh)
+{
+	lockh->cookie = lock->l_handle.h_cookie;
+}
+EXPORT_SYMBOL(ldlm_lock2handle);
+
+/**
+ * Obtain a lock reference by handle.
+ *
+ * if \a flags: atomically get the lock and set the flags.
+ *	      Return NULL if flag already set
+ *
+ * \retval lock	the referenced lock; the caller must release it
+ * \retval NULL	no lock for this handle, the lock is destroyed, or one of
+ *		\a flags was already set on it
+ */
+struct ldlm_lock *__ldlm_handle2lock(const struct lustre_handle *handle,
+				     __u64 flags)
+{
+	struct ldlm_lock *lock;
+	ENTRY;
+
+	LASSERT(handle);
+
+	lock = class_handle2object(handle->cookie);
+	if (lock == NULL)
+		RETURN(NULL);
+
+	/* It's unlikely but possible that someone marked the lock as
+	 * destroyed after we did handle2object on it */
+	/* Fast path: no flags to set and not destroyed, so the resource
+	 * and lock spinlocks are not needed. */
+	if (flags == 0 && !lock->l_destroyed) {
+		lu_ref_add(&lock->l_reference, "handle", current);
+		RETURN(lock);
+	}
+
+	lock_res_and_lock(lock);
+
+	LASSERT(lock->l_resource != NULL);
+
+	lu_ref_add_atomic(&lock->l_reference, "handle", current);
+	if (unlikely(lock->l_destroyed)) {
+		unlock_res_and_lock(lock);
+		CDEBUG(D_INFO, "lock already destroyed: lock %p\n", lock);
+		LDLM_LOCK_PUT(lock);
+		RETURN(NULL);
+	}
+
+	/* Fail if any of the requested flags is already set. */
+	if (flags && (lock->l_flags & flags)) {
+		unlock_res_and_lock(lock);
+		LDLM_LOCK_PUT(lock);
+		RETURN(NULL);
+	}
+
+	if (flags)
+		lock->l_flags |= flags;
+
+	unlock_res_and_lock(lock);
+	RETURN(lock);
+}
+EXPORT_SYMBOL(__ldlm_handle2lock);
+/** @} ldlm_handles */
+
+/**
+ * Fill in "on the wire" representation for given LDLM lock into supplied
+ * lock descriptor \a desc structure.
+ *
+ * An inodebits lock whose peer export did not negotiate OBD_CONNECT_IBITS
+ * is described as a plain lock, with CR/CW modes reported as PR and no
+ * policy data copied.
+ */
+void ldlm_lock2desc(struct ldlm_lock *lock, struct ldlm_lock_desc *desc)
+{
+	struct obd_export *exp = lock->l_export ?: lock->l_conn_export;
+	int ibits_interop;
+
+	/* INODEBITS_INTEROP: If the other side does not support
+	 * inodebits, reply with a plain lock descriptor. */
+	ibits_interop = (lock->l_resource->lr_type == LDLM_IBITS) &&
+			(exp && !(exp_connect_flags(exp) & OBD_CONNECT_IBITS));
+
+	if (!ibits_interop) {
+		/* Common case: describe the lock as it is. */
+		ldlm_res2desc(lock->l_resource, &desc->l_resource);
+		desc->l_req_mode = lock->l_req_mode;
+		desc->l_granted_mode = lock->l_granted_mode;
+		ldlm_convert_policy_to_wire(lock->l_resource->lr_type,
+					    &lock->l_policy_data,
+					    &desc->l_policy_data);
+		return;
+	}
+
+	/* Make sure all the right bits are set in this lock we
+	   are going to pass to client */
+	LASSERTF(lock->l_policy_data.l_inodebits.bits ==
+		 (MDS_INODELOCK_LOOKUP | MDS_INODELOCK_UPDATE |
+		  MDS_INODELOCK_LAYOUT),
+		 "Inappropriate inode lock bits during "
+		 "conversion " LPU64 "\n",
+		 lock->l_policy_data.l_inodebits.bits);
+
+	ldlm_res2desc(lock->l_resource, &desc->l_resource);
+	desc->l_resource.lr_type = LDLM_PLAIN;
+
+	/* Convert "new" lock mode to something old client can
+	   understand */
+	if ((lock->l_req_mode == LCK_CR) ||
+	    (lock->l_req_mode == LCK_CW))
+		desc->l_req_mode = LCK_PR;
+	else
+		desc->l_req_mode = lock->l_req_mode;
+
+	if ((lock->l_granted_mode == LCK_CR) ||
+	    (lock->l_granted_mode == LCK_CW)) {
+		desc->l_granted_mode = LCK_PR;
+	} else {
+		/* We never grant PW/EX locks to clients */
+		LASSERT((lock->l_granted_mode != LCK_PW) &&
+			(lock->l_granted_mode != LCK_EX));
+		desc->l_granted_mode = lock->l_granted_mode;
+	}
+
+	/* We do not copy policy here, because there is no
+	   policy for plain locks */
+}
+EXPORT_SYMBOL(ldlm_lock2desc);
+
+/**
+ * Queue \a lock on \a work_list so that a blocking AST gets sent for it.
+ *
+ * Nothing happens if LDLM_FL_AST_SENT is already set on \a lock.
+ * Otherwise the flag is set, \a lock is linked through l_bl_ast and
+ * referenced, and \a new is referenced and remembered in l_blocking_lock.
+ */
+void ldlm_add_bl_work_item(struct ldlm_lock *lock, struct ldlm_lock *new,
+			   struct list_head *work_list)
+{
+	/* A blocking AST was already queued or sent for this lock. */
+	if ((lock->l_flags & LDLM_FL_AST_SENT) != 0)
+		return;
+
+	LDLM_DEBUG(lock, "lock incompatible; sending blocking AST.");
+	lock->l_flags |= LDLM_FL_AST_SENT;
+
+	/* The enqueuing client may ask that the AST recipient throws
+	 * dirty data away instead of writing it back. */
+	if (new->l_flags & LDLM_AST_DISCARD_DATA)
+		lock->l_flags |= LDLM_FL_DISCARD_DATA;
+
+	LASSERT(list_empty(&lock->l_bl_ast));
+	list_add(&lock->l_bl_ast, work_list);
+	LDLM_LOCK_GET(lock);
+
+	LASSERT(lock->l_blocking_lock == NULL);
+	lock->l_blocking_lock = LDLM_LOCK_GET(new);
+}
+
+/**
+ * Queue a just granted \a lock on \a work_list so that a completion AST
+ * gets sent for it.
+ *
+ * Nothing happens if LDLM_FL_CP_REQD is already set; otherwise the flag is
+ * set and the lock is linked through l_cp_ast and referenced.
+ */
+void ldlm_add_cp_work_item(struct ldlm_lock *lock, struct list_head *work_list)
+{
+	/* A completion AST is already pending for this lock. */
+	if ((lock->l_flags & LDLM_FL_CP_REQD) != 0)
+		return;
+
+	lock->l_flags |= LDLM_FL_CP_REQD;
+	LDLM_DEBUG(lock, "lock granted; sending completion AST.");
+
+	LASSERT(list_empty(&lock->l_cp_ast));
+	list_add(&lock->l_cp_ast, work_list);
+	LDLM_LOCK_GET(lock);
+}
+
+/**
+ * Add an AST work item for \a lock to \a work_list, picking the kind of
+ * AST from \a new: a completion AST when \a new is NULL, a blocking AST
+ * (on behalf of \a new) otherwise.
+ *
+ * Must be called with lr_lock held.
+ */
+void ldlm_add_ast_work_item(struct ldlm_lock *lock, struct ldlm_lock *new,
+			    struct list_head *work_list)
+{
+	ENTRY;
+	check_res_locked(lock->l_resource);
+
+	if (new == NULL)
+		ldlm_add_cp_work_item(lock, work_list);
+	else
+		ldlm_add_bl_work_item(lock, new, work_list);
+
+	EXIT;
+}
+
+/**
+ * Take a reader/writer reference, selected by \a mode, on the LDLM lock
+ * identified by handle \a lockh.
+ *
+ * The handle must resolve to a lock (asserted).  The work is done by
+ * ldlm_lock_addref_internal(); the reference obtained from the handle
+ * lookup is put again before returning.
+ */
+void ldlm_lock_addref(struct lustre_handle *lockh, __u32 mode)
+{
+	struct ldlm_lock *lock = ldlm_handle2lock(lockh);
+
+	LASSERT(lock != NULL);
+
+	ldlm_lock_addref_internal(lock, mode);
+	LDLM_LOCK_PUT(lock);
+}
+EXPORT_SYMBOL(ldlm_lock_addref);
+
+/**
+ * Helper: take a reader/writer reference on LDLM lock \a lock.
+ *
+ * \a mode decides which counter is bumped: NL/CR/PR count as readers,
+ * EX/CW/PW/GROUP/COS count as writers.  The lock is first removed from the
+ * LRU, and an additional "user" lock reference is always taken.
+ *
+ * Assumes the LDLM lock is already locked.
+ */
+void ldlm_lock_addref_internal_nolock(struct ldlm_lock *lock, __u32 mode)
+{
+	const int as_reader = (mode & (LCK_NL | LCK_CR | LCK_PR)) != 0;
+	const int as_writer = (mode & (LCK_EX | LCK_CW | LCK_PW |
+				       LCK_GROUP | LCK_COS)) != 0;
+
+	ldlm_lock_remove_from_lru(lock);
+
+	if (as_reader) {
+		lock->l_readers++;
+		lu_ref_add_atomic(&lock->l_reference, "reader", lock);
+	}
+
+	if (as_writer) {
+		lock->l_writers++;
+		lu_ref_add_atomic(&lock->l_reference, "writer", lock);
+	}
+
+	LDLM_LOCK_GET(lock);
+	lu_ref_add_atomic(&lock->l_reference, "user", lock);
+
+	LDLM_DEBUG(lock, "ldlm_lock_addref(%s)", ldlm_lockname[mode]);
+}
+
+/**
+ * Try to take a reader/writer reference on the lock with handle \a lockh.
+ *
+ * The attempt is refused when the handle does not resolve to a lock, or
+ * when the lock has LDLM_FL_CBPENDING set while having neither readers
+ * nor writers.
+ *
+ * \retval 0 success, lock was addref-ed
+ *
+ * \retval -EAGAIN lock is being canceled.
+ */
+int ldlm_lock_addref_try(struct lustre_handle *lockh, __u32 mode)
+{
+	struct ldlm_lock *lock = ldlm_handle2lock(lockh);
+	int rc = -EAGAIN;
+
+	if (lock == NULL)
+		return rc;
+
+	lock_res_and_lock(lock);
+	/* Only an unused lock with a pending cancel is off limits. */
+	if (!(lock->l_readers == 0 && lock->l_writers == 0 &&
+	      (lock->l_flags & LDLM_FL_CBPENDING))) {
+		ldlm_lock_addref_internal_nolock(lock, mode);
+		rc = 0;
+	}
+	unlock_res_and_lock(lock);
+
+	LDLM_LOCK_PUT(lock);
+	return rc;
+}
+EXPORT_SYMBOL(ldlm_lock_addref_try);
+
+/**
+ * Add specified reader/writer reference to LDLM lock \a lock.
+ *
+ * Takes the lock with lock_res_and_lock(), lets
+ * ldlm_lock_addref_internal_nolock() do the work for \a mode, and unlocks
+ * again.
+ * Only called for local locks.
+ */
+void ldlm_lock_addref_internal(struct ldlm_lock *lock, __u32 mode)
+{
+	lock_res_and_lock(lock);
+	ldlm_lock_addref_internal_nolock(lock, mode);
+	unlock_res_and_lock(lock);
+}
+
+/**
+ * Drop a reader/writer reference, selected by \a mode, from LDLM lock
+ * \a lock, together with the "user" reference taken at addref time.
+ *
+ * Assumes LDLM lock is already locked.
+ * Only called in ldlm_flock_destroy and for local locks.
+ * Does NOT add the lock to the LRU when no r/w references are left, to
+ * accommodate flock locks that cannot be placed in the LRU.
+ */
+void ldlm_lock_decref_internal_nolock(struct ldlm_lock *lock, __u32 mode)
+{
+	const int as_reader = (mode & (LCK_NL | LCK_CR | LCK_PR)) != 0;
+	const int as_writer = (mode & (LCK_EX | LCK_CW | LCK_PW |
+				       LCK_GROUP | LCK_COS)) != 0;
+
+	LDLM_DEBUG(lock, "ldlm_lock_decref(%s)", ldlm_lockname[mode]);
+
+	if (as_reader) {
+		LASSERT(lock->l_readers > 0);
+		lu_ref_del(&lock->l_reference, "reader", lock);
+		lock->l_readers--;
+	}
+
+	if (as_writer) {
+		LASSERT(lock->l_writers > 0);
+		lu_ref_del(&lock->l_reference, "writer", lock);
+		lock->l_writers--;
+	}
+
+	lu_ref_del(&lock->l_reference, "user", lock);
+	/* pairs with the LDLM_LOCK_GET() done in addref */
+	LDLM_LOCK_RELEASE(lock);
+}
+
+/**
+ * Removes reader/writer reference for LDLM lock \a lock.
+ *
+ * Locks LDLM lock first and drops the reference through
+ * ldlm_lock_decref_internal_nolock().  What happens next depends on the
+ * state of the lock once the r/w counters have been updated:
+ *
+ *  - A lock flagged LDLM_FL_LOCAL with no readers or writers left gets
+ *    LDLM_FL_CBPENDING set, i.e. it is forced into cancellation.
+ *  - A lock with no readers or writers and LDLM_FL_CBPENDING set has its
+ *    blocking callback run, either through the blocking thread or directly
+ *    in this context.
+ *  - Otherwise, in a client namespace, a lock with no readers or writers
+ *    that has neither LDLM_FL_NO_LRU nor LDLM_FL_BL_AST set is added to
+ *    the LRU.
+ *  - In all remaining cases the lock is just unlocked.
+ */
+void ldlm_lock_decref_internal(struct ldlm_lock *lock, __u32 mode)
+{
+	struct ldlm_namespace *ns;
+	ENTRY;
+
+	lock_res_and_lock(lock);
+
+	ns = ldlm_lock_to_ns(lock);
+
+	ldlm_lock_decref_internal_nolock(lock, mode);
+
+	if (lock->l_flags & LDLM_FL_LOCAL &&
+	    !lock->l_readers && !lock->l_writers) {
+		/* If this is a local lock on a server namespace and this was
+		 * the last reference, cancel the lock. */
+		CDEBUG(D_INFO, "forcing cancel of local lock\n");
+		lock->l_flags |= LDLM_FL_CBPENDING;
+	}
+
+	if (!lock->l_readers && !lock->l_writers &&
+	    (lock->l_flags & LDLM_FL_CBPENDING)) {
+		/* If we received a blocked AST and this was the last reference,
+		 * run the callback. */
+		if (lock->l_ns_srv && lock->l_export)
+			CERROR("FL_CBPENDING set on non-local lock--just a "
+			       "warning\n");
+
+		LDLM_DEBUG(lock, "final decref done on cbpending lock");
+
+		LDLM_LOCK_GET(lock); /* dropped by bl thread */
+		ldlm_lock_remove_from_lru(lock);
+		unlock_res_and_lock(lock);
+
+		if (lock->l_flags & LDLM_FL_FAIL_LOC)
+			OBD_RACE(OBD_FAIL_LDLM_CP_BL_RACE);
+
+		/* Run the callback right here when the lock asks for an
+		 * atomic callback, or when handing the lock over to the
+		 * blocking thread did not succeed (non-zero return). */
+		if ((lock->l_flags & LDLM_FL_ATOMIC_CB) ||
+		    ldlm_bl_to_thread_lock(ns, NULL, lock) != 0)
+			ldlm_handle_bl_callback(ns, NULL, lock);
+	} else if (ns_is_client(ns) &&
+		   !lock->l_readers && !lock->l_writers &&
+		   !(lock->l_flags & LDLM_FL_NO_LRU) &&
+		   !(lock->l_flags & LDLM_FL_BL_AST)) {
+
+		LDLM_DEBUG(lock, "add lock into lru list");
+
+		/* If this is a client-side namespace and this was the last
+		 * reference, put it on the LRU. */
+		ldlm_lock_add_to_lru(lock);
+		unlock_res_and_lock(lock);
+
+		if (lock->l_flags & LDLM_FL_FAIL_LOC)
+			OBD_RACE(OBD_FAIL_LDLM_CP_BL_RACE);
+
+		/* Call ldlm_cancel_lru() only if EARLY_CANCEL and LRU RESIZE
+		 * are not supported by the server, otherwise, it is done on
+		 * enqueue. */
+		if (!exp_connect_cancelset(lock->l_conn_export) &&
+		    !ns_connect_lru_resize(ns))
+			ldlm_cancel_lru(ns, 0, LCF_ASYNC, 0);
+	} else {
+		/* Still in use, or not eligible for the LRU. */
+		LDLM_DEBUG(lock, "do not add lock into lru list");
+		unlock_res_and_lock(lock);
+	}
+
+	EXIT;
+}
+
+/**
+ * Drop a reader/writer reference, selected by \a mode, from the LDLM lock
+ * identified by handle \a lockh.
+ *
+ * The handle must resolve to a lock (asserted).  The reference obtained
+ * from the handle lookup is put again before returning.
+ */
+void ldlm_lock_decref(struct lustre_handle *lockh, __u32 mode)
+{
+	struct ldlm_lock *lock;
+
+	lock = __ldlm_handle2lock(lockh, 0);
+	LASSERTF(lock != NULL, "Non-existing lock: "LPX64"\n", lockh->cookie);
+
+	ldlm_lock_decref_internal(lock, mode);
+	LDLM_LOCK_PUT(lock);
+}
+EXPORT_SYMBOL(ldlm_lock_decref);
+
+/**
+ * Decrease reader/writer refcount for LDLM lock with handle
+ * \a lockh and mark it for subsequent cancellation once r/w refcount
+ * drops to zero instead of putting into LRU.
+ *
+ * LDLM_FL_CBPENDING is set under the lock first; the reference is then
+ * dropped through ldlm_lock_decref_internal(), which acts on that flag.
+ * The handle must resolve to a lock (asserted).
+ *
+ * Typical usage is for GROUP locks which we cannot allow to be cached.
+ *
+ * \param lockh  handle of the lock to release
+ * \param mode   lock mode selecting the reader or writer counter
+ */
+void ldlm_lock_decref_and_cancel(struct lustre_handle *lockh, __u32 mode)
+{
+	struct ldlm_lock *lock = __ldlm_handle2lock(lockh, 0);
+	ENTRY;
+
+	LASSERT(lock != NULL);
+
+	LDLM_DEBUG(lock, "ldlm_lock_decref(%s)", ldlm_lockname[mode]);
+	lock_res_and_lock(lock);
+	lock->l_flags |= LDLM_FL_CBPENDING;
+	unlock_res_and_lock(lock);
+	ldlm_lock_decref_internal(lock, mode);
+	/* put the reference obtained from the handle lookup */
+	LDLM_LOCK_PUT(lock);
+
+	/* balance the ENTRY trace above */
+	EXIT;
+}
+EXPORT_SYMBOL(ldlm_lock_decref_and_cancel);
+
+/*
+ * Insertion point for a lock that is being added to a granted list:
+ * one list position per list the lock gets linked into.  Filled in by
+ * search_granted_lock() and consumed by ldlm_granted_list_add_lock(),
+ * which list_add()s the new lock after each position.
+ */
+struct sl_insert_point {
+	/* position in the list linked through l_res_link */
+	struct list_head *res_link;
+	/* position in the list linked through l_sl_mode */
+	struct list_head *mode_link;
+	/* position in the list linked through l_sl_policy */
+	struct list_head *policy_link;
+};
+
+/**
+ * Finds a position to insert the new lock into granted lock list.
+ *
+ * Used for locks eligible for skiplist optimization.
+ *
+ * The queue is walked one mode group at a time: for the first lock of a
+ * group, l_sl_mode.prev leads to the last lock of that group, so a group
+ * whose mode differs from \a req's is skipped in a single step.  Inside a
+ * matching mode group on an LDLM_IBITS resource the same trick is applied
+ * to policy groups through l_sl_policy.prev.
+ *
+ * Parameters:
+ *      queue [input]:  the granted list where search acts on;
+ *      req [input]:    the lock whose position to be located;
+ *      prev [output]:  positions within 3 lists to insert @req to
+ * Return Value:
+ *      filled @prev
+ *
+ * When @req starts a new group, the corresponding member of @prev is set
+ * to @req's own list head (&req->l_sl_mode and/or &req->l_sl_policy).
+ *
+ * NOTE: called by
+ *  - ldlm_grant_lock_with_skiplist
+ */
+static void search_granted_lock(struct list_head *queue,
+				struct ldlm_lock *req,
+				struct sl_insert_point *prev)
+{
+	struct list_head *tmp;
+	struct ldlm_lock *lock, *mode_end, *policy_end;
+	ENTRY;
+
+	list_for_each(tmp, queue) {
+		lock = list_entry(tmp, struct ldlm_lock, l_res_link);
+
+		/* last lock of the mode group that starts at "lock" */
+		mode_end = list_entry(lock->l_sl_mode.prev,
+					  struct ldlm_lock, l_sl_mode);
+
+		if (lock->l_req_mode != req->l_req_mode) {
+			/* jump to last lock of mode group */
+			tmp = &mode_end->l_res_link;
+			continue;
+		}
+
+		/* suitable mode group is found */
+		if (lock->l_resource->lr_type == LDLM_PLAIN) {
+			/* insert point is last lock of the mode group */
+			prev->res_link = &mode_end->l_res_link;
+			prev->mode_link = &mode_end->l_sl_mode;
+			prev->policy_link = &req->l_sl_policy;
+			EXIT;
+			return;
+		} else if (lock->l_resource->lr_type == LDLM_IBITS) {
+			for (;;) {
+				/* last lock of the policy group that starts
+				 * at "lock" */
+				policy_end =
+					list_entry(lock->l_sl_policy.prev,
+						       struct ldlm_lock,
+						       l_sl_policy);
+
+				if (lock->l_policy_data.l_inodebits.bits ==
+				    req->l_policy_data.l_inodebits.bits) {
+					/* insert point is last lock of
+					 * the policy group */
+					prev->res_link =
+						&policy_end->l_res_link;
+					prev->mode_link =
+						&policy_end->l_sl_mode;
+					prev->policy_link =
+						&policy_end->l_sl_policy;
+					EXIT;
+					return;
+				}
+
+				if (policy_end == mode_end)
+					/* done with mode group */
+					break;
+
+				/* go to next policy group within mode group */
+				tmp = policy_end->l_res_link.next;
+				lock = list_entry(tmp, struct ldlm_lock,
+						      l_res_link);
+			}  /* loop over policy groups within the mode group */
+
+			/* insert point is last lock of the mode group,
+			 * new policy group is started */
+			prev->res_link = &mode_end->l_res_link;
+			prev->mode_link = &mode_end->l_sl_mode;
+			prev->policy_link = &req->l_sl_policy;
+			EXIT;
+			return;
+		} else {
+			/* only PLAIN and IBITS resources are expected here */
+			LDLM_ERROR(lock,"is not LDLM_PLAIN or LDLM_IBITS lock");
+			LBUG();
+		}
+	}
+
+	/* insert point is last lock on the queue,
+	 * new mode group and new policy group are started */
+	prev->res_link = queue->prev;
+	prev->mode_link = &req->l_sl_mode;
+	prev->policy_link = &req->l_sl_policy;
+	EXIT;
+	return;
+}
+
+/**
+ * Add a lock into resource granted list after a position described by
+ * \a prev.
+ *
+ * The resource of \a lock must be locked (checked with
+ * check_res_locked()).  A lock that is already marked destroyed is not
+ * added at all.  Otherwise the lock must not be linked on any of the
+ * three lists yet (asserted) and is inserted after each position in
+ * \a prev.
+ *
+ * \param lock  lock to insert
+ * \param prev  insertion positions, see struct sl_insert_point
+ */
+static void ldlm_granted_list_add_lock(struct ldlm_lock *lock,
+				       struct sl_insert_point *prev)
+{
+	struct ldlm_resource *res = lock->l_resource;
+	ENTRY;
+
+	check_res_locked(res);
+
+	ldlm_resource_dump(D_INFO, res);
+	LDLM_DEBUG(lock, "About to add lock:");
+
+	if (lock->l_destroyed) {
+		CDEBUG(D_OTHER, "Lock destroyed, not adding to resource\n");
+		/* keep the ENTRY/EXIT trace balanced on this path too */
+		EXIT;
+		return;
+	}
+
+	LASSERT(list_empty(&lock->l_res_link));
+	LASSERT(list_empty(&lock->l_sl_mode));
+	LASSERT(list_empty(&lock->l_sl_policy));
+
+	/*
+	 * lock->link == prev->link means lock is first starting the group.
+	 * Don't re-add to itself to suppress kernel warnings.
+	 */
+	if (&lock->l_res_link != prev->res_link)
+		list_add(&lock->l_res_link, prev->res_link);
+	if (&lock->l_sl_mode != prev->mode_link)
+		list_add(&lock->l_sl_mode, prev->mode_link);
+	if (&lock->l_sl_policy != prev->policy_link)
+		list_add(&lock->l_sl_policy, prev->policy_link);
+
+	EXIT;
+}
+
+/**
+ * Insert \a lock into the granted list of its resource at the position
+ * that keeps the skiplist structure intact.
+ *
+ * The requested and granted modes of \a lock must already agree
+ * (asserted).
+ */
+static void ldlm_grant_lock_with_skiplist(struct ldlm_lock *lock)
+{
+	struct sl_insert_point where;
+	struct list_head *granted;
+	ENTRY;
+
+	LASSERT(lock->l_req_mode == lock->l_granted_mode);
+
+	granted = &lock->l_resource->lr_granted;
+	search_granted_lock(granted, lock, &where);
+	ldlm_granted_list_add_lock(lock, &where);
+
+	EXIT;
+}
+
+/**
+ * Do the bookkeeping for granting \a lock.
+ *
+ * The granted mode is set to the requested mode and the lock is linked
+ * into the granted list in the way that fits the resource type.  The
+ * resource's lr_most_restr is lowered if the new granted mode is smaller,
+ * a completion AST work item is queued on \a work_list when a list is
+ * given and the lock has a completion AST, and the lock is added to the
+ * namespace pool.
+ *
+ * NOTE: called by
+ *  - ldlm_lock_enqueue
+ *  - ldlm_reprocess_queue
+ *  - ldlm_lock_convert
+ *
+ * must be called with lr_lock held
+ */
+void ldlm_grant_lock(struct ldlm_lock *lock, struct list_head *work_list)
+{
+	struct ldlm_resource *res = lock->l_resource;
+	ENTRY;
+
+	check_res_locked(res);
+
+	lock->l_granted_mode = lock->l_req_mode;
+
+	switch (res->lr_type) {
+	case LDLM_PLAIN:
+	case LDLM_IBITS:
+		ldlm_grant_lock_with_skiplist(lock);
+		break;
+	case LDLM_EXTENT:
+		ldlm_extent_add_lock(res, lock);
+		break;
+	default:
+		ldlm_resource_add_lock(res, &res->lr_granted, lock);
+		break;
+	}
+
+	if (res->lr_most_restr > lock->l_granted_mode)
+		res->lr_most_restr = lock->l_granted_mode;
+
+	if (work_list != NULL && lock->l_completion_ast != NULL)
+		ldlm_add_ast_work_item(lock, NULL, work_list);
+
+	ldlm_pool_add(&ldlm_res_to_ns(res)->ns_pool, lock);
+	EXIT;
+}
+
+/**
+ * Search for a lock with given properties in a queue.
+ *
+ * Locks are examined in list order; the walk stops as soon as \a old_lock
+ * itself is reached.  The first lock passing every filter below is
+ * returned.
+ *
+ * \param queue     list of locks linked through l_res_link
+ * \param mode      [in] bit mask of acceptable modes; [out] on a match,
+ *		    the l_req_mode of the matched lock
+ * \param policy    extent or inodebits the matched lock has to cover
+ * \param old_lock  lock at which the search stops, may be NULL
+ * \param flags     LDLM_FL_CBPENDING, LDLM_FL_LOCAL_ONLY and
+ *		    LDLM_FL_TEST_LOCK are looked at here
+ * \param unref     when non-zero, also accept locks that are destroyed,
+ *		    failed, or CBPENDING without any readers/writers
+ *
+ * \retval a referenced lock or NULL.  See the flag descriptions below, in the
+ * comment above ldlm_lock_match
+ *
+ * With LDLM_FL_TEST_LOCK the match only gets a plain lock reference and
+ * is touched in the LRU; without it a reader/writer reference in the
+ * matched mode is taken through ldlm_lock_addref_internal_nolock().
+ */
+static struct ldlm_lock *search_queue(struct list_head *queue,
+				      ldlm_mode_t *mode,
+				      ldlm_policy_data_t *policy,
+				      struct ldlm_lock *old_lock,
+				      __u64 flags, int unref)
+{
+	struct ldlm_lock *lock;
+	struct list_head       *tmp;
+
+	list_for_each(tmp, queue) {
+		ldlm_mode_t match;
+
+		lock = list_entry(tmp, struct ldlm_lock, l_res_link);
+
+		/* never look at or past the lock we search a duplicate of */
+		if (lock == old_lock)
+			break;
+
+		/* llite sometimes wants to match locks that will be
+		 * canceled when their users drop, but we allow it to match
+		 * if it passes in CBPENDING and the lock still has users.
+		 * this is generally only going to be used by children
+		 * whose parents already hold a lock so forward progress
+		 * can still happen. */
+		if (lock->l_flags & LDLM_FL_CBPENDING &&
+		    !(flags & LDLM_FL_CBPENDING))
+			continue;
+		if (!unref && lock->l_flags & LDLM_FL_CBPENDING &&
+		    lock->l_readers == 0 && lock->l_writers == 0)
+			continue;
+
+		/* the lock's mode must be one of the acceptable modes */
+		if (!(lock->l_req_mode & *mode))
+			continue;
+		match = lock->l_req_mode;
+
+		/* extent locks must cover the whole requested extent */
+		if (lock->l_resource->lr_type == LDLM_EXTENT &&
+		    (lock->l_policy_data.l_extent.start >
+		     policy->l_extent.start ||
+		     lock->l_policy_data.l_extent.end < policy->l_extent.end))
+			continue;
+
+		/* group extent locks must additionally agree on the gid */
+		if (unlikely(match == LCK_GROUP) &&
+		    lock->l_resource->lr_type == LDLM_EXTENT &&
+		    lock->l_policy_data.l_extent.gid != policy->l_extent.gid)
+			continue;
+
+		/* We match if we have existing lock with same or wider set
+		   of bits. */
+		if (lock->l_resource->lr_type == LDLM_IBITS &&
+		     ((lock->l_policy_data.l_inodebits.bits &
+		      policy->l_inodebits.bits) !=
+		      policy->l_inodebits.bits))
+			continue;
+
+		/* dead or failed locks only match when unref is set */
+		if (!unref &&
+		    (lock->l_destroyed || lock->l_flags & LDLM_FL_FAILED ||
+		     lock->l_failed))
+			continue;
+
+		if ((flags & LDLM_FL_LOCAL_ONLY) &&
+		    !(lock->l_flags & LDLM_FL_LOCAL))
+			continue;
+
+		if (flags & LDLM_FL_TEST_LOCK) {
+			LDLM_LOCK_GET(lock);
+			ldlm_lock_touch_in_lru(lock);
+		} else {
+			ldlm_lock_addref_internal_nolock(lock, match);
+		}
+		*mode = match;
+		return lock;
+	}
+
+	return NULL;
+}
+
+/**
+ * Flag \a lock as failed (l_failed) and wake up everybody sleeping on its
+ * l_waitq.  Does nothing when the lock is already flagged.
+ *
+ * NOTE(review): the "_locked" suffix and ldlm_lock_fail_match() suggest the
+ * caller holds the lock -- confirm.
+ */
+void ldlm_lock_fail_match_locked(struct ldlm_lock *lock)
+{
+	if (lock->l_failed)
+		return;
+
+	lock->l_failed = 1;
+	wake_up_all(&lock->l_waitq);
+}
+EXPORT_SYMBOL(ldlm_lock_fail_match_locked);
+
+/**
+ * Locking wrapper around ldlm_lock_fail_match_locked(): takes the lock
+ * with lock_res_and_lock(), marks \a lock as failed and unlocks again.
+ */
+void ldlm_lock_fail_match(struct ldlm_lock *lock)
+{
+	lock_res_and_lock(lock);
+	ldlm_lock_fail_match_locked(lock);
+	unlock_res_and_lock(lock);
+}
+EXPORT_SYMBOL(ldlm_lock_fail_match);
+
+/**
+ * Mark lock as "matchable" by OST.
+ *
+ * Sets LDLM_FL_LVB_READY on \a lock and wakes up all waiters on l_waitq.
+ *
+ * Used to prevent certain races in LOV/OSC where the lock is granted, but LVB
+ * is not yet valid.
+ * Assumes LDLM lock is already locked.
+ */
+void ldlm_lock_allow_match_locked(struct ldlm_lock *lock)
+{
+	lock->l_flags |= LDLM_FL_LVB_READY;
+	wake_up_all(&lock->l_waitq);
+}
+EXPORT_SYMBOL(ldlm_lock_allow_match_locked);
+
+/**
+ * Mark lock as "matchable" by OST.
+ *
+ * Takes the lock with lock_res_and_lock(), calls
+ * ldlm_lock_allow_match_locked() and unlocks again.
+ * \see ldlm_lock_allow_match_locked
+ */
+void ldlm_lock_allow_match(struct ldlm_lock *lock)
+{
+	lock_res_and_lock(lock);
+	ldlm_lock_allow_match_locked(lock);
+	unlock_res_and_lock(lock);
+}
+EXPORT_SYMBOL(ldlm_lock_allow_match);
+
+/**
+ * Attempt to find a lock with specified properties.
+ *
+ * Typically returns a reference to matched lock unless LDLM_FL_TEST_LOCK is
+ * set in \a flags
+ *
+ * Can be called in two ways:
+ *
+ * If 'ns' is NULL, then lockh describes an existing lock that we want to look
+ * for a duplicate of.  Namespace, resource name, type and mode are then
+ * taken from that lock.
+ *
+ * Otherwise, all of the fields must be filled in, to match against.
+ *
+ * The granted queue is searched first, then the converting queue, then the
+ * waiting queue.
+ *
+ * If 'flags' contains LDLM_FL_LOCAL_ONLY, then only match local locks on the
+ *     server (ie, connh is NULL)
+ * If 'flags' contains LDLM_FL_BLOCK_GRANTED, then only locks on the granted
+ *     list will be considered
+ * If 'flags' contains LDLM_FL_CBPENDING, then locks that have been marked
+ *     to be canceled can still be matched as long as they still have reader
+ *     or writer references
+ * If 'flags' contains LDLM_FL_TEST_LOCK, then don't actually reference a lock,
+ *     just tell us if we would have matched.
+ * If 'flags' contains LDLM_FL_LVB_READY and the matched lock does not have
+ *     LDLM_FL_LVB_READY set yet, the lock's completion AST (if any) is
+ *     called and the caller then waits, bounded by obd_timeout, for the flag
+ *     to be set or for the lock to be destroyed or failed.
+ *
+ * \retval mode of the matched lock (non-zero) if an already-existing lock
+ * that is compatible was found; in this case, lockh is filled in with a
+ * addref()ed lock (not addref()ed with LDLM_FL_TEST_LOCK)
+ * \retval 0 if nothing matched
+ *
+ * We also check security context, and if that fails we simply return 0 (to
+ * keep caller code unchanged), the context failure will be discovered by
+ * caller sometime later.
+ */
+ldlm_mode_t ldlm_lock_match(struct ldlm_namespace *ns, __u64 flags,
+			    const struct ldlm_res_id *res_id, ldlm_type_t type,
+			    ldlm_policy_data_t *policy, ldlm_mode_t mode,
+			    struct lustre_handle *lockh, int unref)
+{
+	struct ldlm_resource *res;
+	struct ldlm_lock *lock, *old_lock = NULL;
+	int rc = 0;
+	ENTRY;
+
+	if (ns == NULL) {
+		/* Duplicate search: derive all parameters from the lock
+		 * behind lockh. */
+		old_lock = ldlm_handle2lock(lockh);
+		LASSERT(old_lock);
+
+		ns = ldlm_lock_to_ns(old_lock);
+		res_id = &old_lock->l_resource->lr_name;
+		type = old_lock->l_resource->lr_type;
+		mode = old_lock->l_req_mode;
+	}
+
+	res = ldlm_resource_get(ns, NULL, res_id, type, 0);
+	if (res == NULL) {
+		/* the resource of an existing lock has to exist */
+		LASSERT(old_lock == NULL);
+		RETURN(0);
+	}
+
+	LDLM_RESOURCE_ADDREF(res);
+	lock_res(res);
+
+	lock = search_queue(&res->lr_granted, &mode, policy, old_lock,
+			    flags, unref);
+	if (lock != NULL)
+		GOTO(out, rc = 1);
+	if (flags & LDLM_FL_BLOCK_GRANTED)
+		GOTO(out, rc = 0);
+	lock = search_queue(&res->lr_converting, &mode, policy, old_lock,
+			    flags, unref);
+	if (lock != NULL)
+		GOTO(out, rc = 1);
+	lock = search_queue(&res->lr_waiting, &mode, policy, old_lock,
+			    flags, unref);
+	if (lock != NULL)
+		GOTO(out, rc = 1);
+
+	EXIT;
+ out:
+	unlock_res(res);
+	LDLM_RESOURCE_DELREF(res);
+	ldlm_resource_putref(res);
+
+	if (lock) {
+		ldlm_lock2handle(lock, lockh);
+		/* The caller wants a valid LVB and the lock has none yet. */
+		if ((flags & LDLM_FL_LVB_READY) &&
+		    (!(lock->l_flags & LDLM_FL_LVB_READY))) {
+			struct l_wait_info lwi;
+			if (lock->l_completion_ast) {
+				int err = lock->l_completion_ast(lock,
+							  LDLM_FL_WAIT_NOREPROC,
+								 NULL);
+				if (err) {
+					/* undo the reference search_queue()
+					 * took and report "no match" */
+					if (flags & LDLM_FL_TEST_LOCK)
+						LDLM_LOCK_RELEASE(lock);
+					else
+						ldlm_lock_decref_internal(lock,
+									  mode);
+					rc = 0;
+					goto out2;
+				}
+			}
+
+			lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(obd_timeout),
+					       NULL, LWI_ON_SIGNAL_NOOP, NULL);
+
+			/* XXX FIXME see comment on CAN_MATCH in lustre_dlm.h */
+			l_wait_event(lock->l_waitq,
+				     lock->l_flags & LDLM_FL_LVB_READY ||
+				     lock->l_destroyed || lock->l_failed,
+				     &lwi);
+			/* Woken up or timed out without the LVB becoming
+			 * ready: give the reference back, no match. */
+			if (!(lock->l_flags & LDLM_FL_LVB_READY)) {
+				if (flags & LDLM_FL_TEST_LOCK)
+					LDLM_LOCK_RELEASE(lock);
+				else
+					ldlm_lock_decref_internal(lock, mode);
+				rc = 0;
+			}
+		}
+	}
+ out2:
+	if (rc) {
+		LDLM_DEBUG(lock, "matched ("LPU64" "LPU64")",
+			   (type == LDLM_PLAIN || type == LDLM_IBITS) ?
+				res_id->name[2] : policy->l_extent.start,
+			   (type == LDLM_PLAIN || type == LDLM_IBITS) ?
+				res_id->name[3] : policy->l_extent.end);
+
+		/* check user's security context */
+		if (lock->l_conn_export &&
+		    sptlrpc_import_check_ctx(
+				class_exp2cliimp(lock->l_conn_export))) {
+			if (!(flags & LDLM_FL_TEST_LOCK))
+				ldlm_lock_decref_internal(lock, mode);
+			rc = 0;
+		}
+
+		/* a test-only match never keeps its lock reference */
+		if (flags & LDLM_FL_TEST_LOCK)
+			LDLM_LOCK_RELEASE(lock);
+
+	} else if (!(flags & LDLM_FL_TEST_LOCK)) {/*less verbose for test-only*/
+		LDLM_DEBUG_NOLOCK("not matched ns %p type %u mode %u res "
+				  LPU64"/"LPU64" ("LPU64" "LPU64")", ns,
+				  type, mode, res_id->name[0], res_id->name[1],
+				  (type == LDLM_PLAIN || type == LDLM_IBITS) ?
+					res_id->name[2] :policy->l_extent.start,
+				  (type == LDLM_PLAIN || type == LDLM_IBITS) ?
+					res_id->name[3] : policy->l_extent.end);
+	}
+	if (old_lock)
+		LDLM_LOCK_PUT(old_lock);
+
+	/* mode was overwritten by search_queue() with the matched mode */
+	return rc ? mode : 0;
+}
+EXPORT_SYMBOL(ldlm_lock_match);
+
+/**
+ * Check that the lock behind handle \a lockh is still usable and, if so,
+ * take a reader/writer reference on it in its granted mode.
+ *
+ * The lock is not usable when the handle does not resolve, when the lock
+ * is destroyed or failed, or when it has LDLM_FL_CBPENDING set and neither
+ * readers nor writers.
+ *
+ * \param lockh  handle of the lock to revalidate
+ * \param bits   if not NULL, receives the inodebits of the lock on success
+ *
+ * \retval granted mode of the lock on success
+ * \retval 0 when the lock is not usable
+ */
+ldlm_mode_t ldlm_revalidate_lock_handle(struct lustre_handle *lockh,
+					__u64 *bits)
+{
+	ldlm_mode_t mode = 0;
+	struct ldlm_lock *lock;
+	ENTRY;
+
+	lock = ldlm_handle2lock(lockh);
+	if (lock == NULL) {
+		EXIT;
+		return mode;
+	}
+
+	lock_res_and_lock(lock);
+
+	/* dead or failed locks cannot be revalidated */
+	if (lock->l_destroyed || lock->l_flags & LDLM_FL_FAILED ||
+	    lock->l_failed)
+		GOTO(out, mode);
+
+	/* neither can unused locks that are about to be canceled */
+	if (lock->l_flags & LDLM_FL_CBPENDING &&
+	    lock->l_readers == 0 && lock->l_writers == 0)
+		GOTO(out, mode);
+
+	if (bits != NULL)
+		*bits = lock->l_policy_data.l_inodebits.bits;
+
+	mode = lock->l_granted_mode;
+	ldlm_lock_addref_internal_nolock(lock, mode);
+
+	EXIT;
+
+out:
+	unlock_res_and_lock(lock);
+	LDLM_LOCK_PUT(lock);
+	return mode;
+}
+EXPORT_SYMBOL(ldlm_revalidate_lock_handle);
+
+/**
+ * Copy the lock value block (RMF_DLM_LVB) found in \a pill into \a data.
+ *
+ * How the LVB is fetched depends on the lock's l_lvb_type, on \a size and
+ * on whether the client (\a loc == RCL_CLIENT) or the server side of the
+ * capsule is read.  For a v1 OST LVB the nanosecond time fields of the
+ * destination are cleared after the copy, so \a data must be able to hold
+ * a full struct ost_lvb.
+ *
+ * The caller must guarantee that the buffer is large enough.
+ *
+ * \retval 0        success (including a zero-sized layout LVB)
+ * \retval -EPROTO  the capsule holds no LVB
+ * \retval -EINVAL  unexpected \a size for the LVB type, or unknown type
+ */
+int ldlm_fill_lvb(struct ldlm_lock *lock, struct req_capsule *pill,
+		  enum req_location loc, void *data, int size)
+{
+	const int from_client = (loc == RCL_CLIENT);
+	int clear_ns = 0;
+	void *lvb;
+	ENTRY;
+
+	LASSERT(data != NULL);
+	LASSERT(size >= 0);
+
+	if (lock->l_lvb_type == LVB_T_OST) {
+		if (size == sizeof(struct ost_lvb)) {
+			lvb = from_client ?
+				req_capsule_client_swab_get(pill,
+						&RMF_DLM_LVB,
+						lustre_swab_ost_lvb) :
+				req_capsule_server_swab_get(pill,
+						&RMF_DLM_LVB,
+						lustre_swab_ost_lvb);
+		} else if (size == sizeof(struct ost_lvb_v1)) {
+			lvb = from_client ?
+				req_capsule_client_swab_get(pill,
+						&RMF_DLM_LVB,
+						lustre_swab_ost_lvb_v1) :
+				req_capsule_server_sized_swab_get(pill,
+						&RMF_DLM_LVB, size,
+						lustre_swab_ost_lvb_v1);
+			/* v1 carries no nanoseconds, zero them below */
+			clear_ns = 1;
+		} else {
+			LDLM_ERROR(lock, "Replied unexpected ost LVB size %d",
+				   size);
+			RETURN(-EINVAL);
+		}
+	} else if (lock->l_lvb_type == LVB_T_LQUOTA) {
+		if (size != sizeof(struct lquota_lvb)) {
+			LDLM_ERROR(lock, "Replied unexpected lquota LVB size %d",
+				   size);
+			RETURN(-EINVAL);
+		}
+
+		lvb = from_client ?
+			req_capsule_client_swab_get(pill,
+					&RMF_DLM_LVB,
+					lustre_swab_lquota_lvb) :
+			req_capsule_server_swab_get(pill,
+					&RMF_DLM_LVB,
+					lustre_swab_lquota_lvb);
+	} else if (lock->l_lvb_type == LVB_T_LAYOUT) {
+		/* an empty layout LVB is fine, nothing to copy */
+		if (size == 0)
+			RETURN(0);
+
+		lvb = from_client ?
+			req_capsule_client_get(pill, &RMF_DLM_LVB) :
+			req_capsule_server_get(pill, &RMF_DLM_LVB);
+	} else {
+		LDLM_ERROR(lock, "Unknown LVB type: %d\n", lock->l_lvb_type);
+		libcfs_debug_dumpstack(NULL);
+		RETURN(-EINVAL);
+	}
+
+	/* common tail for every branch that fetched an LVB */
+	if (unlikely(lvb == NULL)) {
+		LDLM_ERROR(lock, "no LVB");
+		RETURN(-EPROTO);
+	}
+
+	memcpy(data, lvb, size);
+
+	if (clear_ns) {
+		struct ost_lvb *olvb = data;
+
+		olvb->lvb_mtime_ns = 0;
+		olvb->lvb_atime_ns = 0;
+		olvb->lvb_ctime_ns = 0;
+	}
+
+	RETURN(0);
+}
+
+/**
+ * Create and fill in new LDLM lock with specified properties.
+ *
+ * The resource named by \a res_id is looked up (or created) in \a ns and a
+ * new lock is allocated on it.  Requested mode, AST data, pid, the
+ * server-namespace flag and the callbacks from \a cbs (if given) are
+ * filled in.  Extent locks additionally get an interval tree node, and a
+ * buffer of \a lvb_len bytes is allocated for the LVB when \a lvb_len is
+ * non-zero.
+ *
+ * \retval lock  the new lock (returns a referenced lock)
+ * \retval NULL  the resource or the lock could not be obtained, one of the
+ *		 later allocations failed, or the OBD_FAIL_LDLM_NEW_LOCK
+ *		 fail point fired; in the latter cases the half-built lock
+ *		 is destroyed and released
+ */
+struct ldlm_lock *ldlm_lock_create(struct ldlm_namespace *ns,
+				   const struct ldlm_res_id *res_id,
+				   ldlm_type_t type,
+				   ldlm_mode_t mode,
+				   const struct ldlm_callback_suite *cbs,
+				   void *data, __u32 lvb_len,
+				   enum lvb_type lvb_type)
+{
+	struct ldlm_lock *lock;
+	struct ldlm_resource *res;
+	ENTRY;
+
+	res = ldlm_resource_get(ns, NULL, res_id, type, 1);
+	if (res == NULL)
+		RETURN(NULL);
+
+	lock = ldlm_lock_new(res);
+
+	/* NOTE(review): the reference on res taken above is not put on this
+	 * path -- confirm whether ldlm_lock_new() releases it on failure. */
+	if (lock == NULL)
+		RETURN(NULL);
+
+	lock->l_req_mode = mode;
+	lock->l_ast_data = data;
+	lock->l_pid = current_pid();
+	lock->l_ns_srv = !!ns_is_server(ns);
+	if (cbs) {
+		lock->l_blocking_ast = cbs->lcs_blocking;
+		lock->l_completion_ast = cbs->lcs_completion;
+		lock->l_glimpse_ast = cbs->lcs_glimpse;
+		lock->l_weigh_ast = cbs->lcs_weigh;
+	}
+
+	lock->l_tree_node = NULL;
+	/* if this is the extent lock, allocate the interval tree node */
+	if (type == LDLM_EXTENT) {
+		if (ldlm_interval_alloc(lock) == NULL)
+			GOTO(out, 0);
+	}
+
+	if (lvb_len) {
+		lock->l_lvb_len = lvb_len;
+		OBD_ALLOC(lock->l_lvb_data, lvb_len);
+		if (lock->l_lvb_data == NULL)
+			GOTO(out, 0);
+	}
+
+	lock->l_lvb_type = lvb_type;
+	/* fault injection point for testing the error path */
+	if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_NEW_LOCK))
+		GOTO(out, 0);
+
+	RETURN(lock);
+
+out:
+	/* tear the partially initialized lock down again */
+	ldlm_lock_destroy(lock);
+	LDLM_LOCK_RELEASE(lock);
+	return NULL;
+}
+
+/**
+ * Enqueue (request) a lock.
+ *
+ * Does not block. As a result of enqueue the lock would be put
+ * into granted or waiting list.
+ *
+ * If namespace has intent policy set and the lock has LDLM_FL_HAS_INTENT flag
+ * set (and LDLM_FL_REPLAY is not set), skip all the enqueueing and delegate
+ * lock processing to intent policy function.  This is only done for
+ * non-client namespaces.
+ *
+ * For a client namespace the lock is put on the converting, waiting or
+ * granted list exactly as the BLOCK_* bits in \a flags dictate.  Any other
+ * namespace is not supported by this module and ends in LBUG().
+ *
+ * \param ns      namespace of the lock
+ * \param lockp   [in,out] lock to enqueue; may be replaced by the intent
+ *		  policy
+ * \param cookie  opaque argument handed to the intent policy
+ * \param flags   [in,out] enqueue flags
+ *
+ * \retval ELDLM_OK  on success
+ * \retval 0         when the intent policy replaced the lock
+ *		     (LDLM_FL_LOCK_CHANGED is then set in \a flags)
+ * \retval -ENOMEM   an extent lock has no interval node and none could be
+ *		     provided
+ * \retval other     whatever the intent policy returned
+ */
+ldlm_error_t ldlm_lock_enqueue(struct ldlm_namespace *ns,
+			       struct ldlm_lock **lockp,
+			       void *cookie, __u64 *flags)
+{
+	struct ldlm_lock *lock = *lockp;
+	struct ldlm_resource *res = lock->l_resource;
+	int local = ns_is_client(ldlm_res_to_ns(res));
+	ldlm_error_t rc = ELDLM_OK;
+	struct ldlm_interval *node = NULL;
+	ENTRY;
+
+	lock->l_last_activity = cfs_time_current_sec();
+	/* policies are not executed on the client or during replay */
+	if ((*flags & (LDLM_FL_HAS_INTENT|LDLM_FL_REPLAY)) == LDLM_FL_HAS_INTENT
+	    && !local && ns->ns_policy) {
+		rc = ns->ns_policy(ns, lockp, cookie, lock->l_req_mode, *flags,
+				   NULL);
+		if (rc == ELDLM_LOCK_REPLACED) {
+			/* The lock that was returned has already been granted,
+			 * and placed into lockp.  If it's not the same as the
+			 * one we passed in, then destroy the old one and our
+			 * work here is done. */
+			if (lock != *lockp) {
+				ldlm_lock_destroy(lock);
+				LDLM_LOCK_RELEASE(lock);
+			}
+			*flags |= LDLM_FL_LOCK_CHANGED;
+			RETURN(0);
+		} else if (rc != ELDLM_OK ||
+			   (rc == ELDLM_OK && (*flags & LDLM_FL_INTENT_ONLY))) {
+			/* policy failed, or the caller only wanted the
+			 * intent executed: no lock is enqueued */
+			ldlm_lock_destroy(lock);
+			RETURN(rc);
+		}
+	}
+
+	/* For a replaying lock, it might be already in granted list. So
+	 * unlinking the lock will cause the interval node to be freed, we
+	 * have to allocate the interval node early otherwise we can't regrant
+	 * this lock in the future. - jay */
+	if (!local && (*flags & LDLM_FL_REPLAY) && res->lr_type == LDLM_EXTENT)
+		OBD_SLAB_ALLOC_PTR_GFP(node, ldlm_interval_slab, __GFP_IO);
+
+	lock_res_and_lock(lock);
+	if (local && lock->l_req_mode == lock->l_granted_mode) {
+		/* The server returned a blocked lock, but it was granted
+		 * before we got a chance to actually enqueue it.  We don't
+		 * need to do anything else. */
+		*flags &= ~(LDLM_FL_BLOCK_GRANTED |
+			    LDLM_FL_BLOCK_CONV | LDLM_FL_BLOCK_WAIT);
+		GOTO(out, ELDLM_OK);
+	}
+
+	ldlm_resource_unlink_lock(lock);
+	if (res->lr_type == LDLM_EXTENT && lock->l_tree_node == NULL) {
+		/* the lock needs an interval node; without the spare one
+		 * allocated above the enqueue has to fail */
+		if (node == NULL) {
+			ldlm_lock_destroy_nolock(lock);
+			GOTO(out, rc = -ENOMEM);
+		}
+
+		INIT_LIST_HEAD(&node->li_group);
+		ldlm_interval_attach(node, lock);
+		/* now owned by the lock, must not be freed at "out" */
+		node = NULL;
+	}
+
+	/* Some flags from the enqueue want to make it into the AST, via the
+	 * lock's l_flags. */
+	lock->l_flags |= *flags & LDLM_AST_DISCARD_DATA;
+
+	/* This distinction between local lock trees is very important; a client
+	 * namespace only has information about locks taken by that client, and
+	 * thus doesn't have enough information to decide for itself if it can
+	 * be granted (below).  In this case, we do exactly what the server
+	 * tells us to do, as dictated by the 'flags'.
+	 *
+	 * We do exactly the same thing during recovery, when the server is
+	 * more or less trusting the clients not to lie.
+	 *
+	 * FIXME (bug 268): Detect obvious lies by checking compatibility in
+	 * granted/converting queues. */
+	if (local) {
+		if (*flags & LDLM_FL_BLOCK_CONV)
+			ldlm_resource_add_lock(res, &res->lr_converting, lock);
+		else if (*flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED))
+			ldlm_resource_add_lock(res, &res->lr_waiting, lock);
+		else
+			ldlm_grant_lock(lock, NULL);
+		GOTO(out, ELDLM_OK);
+	} else {
+		CERROR("This is client-side-only module, cannot handle "
+		       "LDLM_NAMESPACE_SERVER resource type lock.\n");
+		LBUG();
+	}
+
+out:
+	unlock_res_and_lock(lock);
+	/* free the spare interval node if it was not attached */
+	if (node)
+		OBD_SLAB_FREE(node, ldlm_interval_slab, sizeof(*node));
+	return rc;
+}
+
+
+/**
+ * Producer callback for LDLM_WORK_BL_AST work: take the first lock queued on
+ * the work list through l_bl_ast and run its blocking AST, passing a
+ * descriptor of the lock that blocks it.
+ *
+ * \param rqset not used by this function
+ * \param opaq  the struct ldlm_cb_set_arg carrying the work list
+ *
+ * \retval -ENOENT the work list is empty
+ * \retval other   return value of the lock's l_blocking_ast callback
+ */
+static int
+ldlm_work_bl_ast_lock(struct ptlrpc_request_set *rqset, void *opaq)
+{
+	struct ldlm_cb_set_arg *work = opaq;
+	struct ldlm_lock       *lock;
+	struct ldlm_lock_desc   blocker;
+	int			ast_rc;
+	ENTRY;
+
+	if (list_empty(work->list))
+		RETURN(-ENOENT);
+
+	lock = list_entry(work->list->next, struct ldlm_lock, l_bl_ast);
+
+	/* Detach the lock from the work list and mark the AST as running
+	 * while the resource lock keeps everybody else off l_bl_ast. */
+	lock_res_and_lock(lock);
+	list_del_init(&lock->l_bl_ast);
+
+	LASSERT(lock->l_flags & LDLM_FL_AST_SENT);
+	LASSERT(lock->l_bl_ast_run == 0);
+	LASSERT(lock->l_blocking_lock);
+	lock->l_bl_ast_run += 1;
+	unlock_res_and_lock(lock);
+
+	/* The callback is handed a descriptor of the blocking lock. */
+	ldlm_lock2desc(lock->l_blocking_lock, &blocker);
+	ast_rc = lock->l_blocking_ast(lock, &blocker, (void *)work,
+				      LDLM_CB_BLOCKING);
+
+	/* Release the blocking lock first, then the lock itself. */
+	LDLM_LOCK_RELEASE(lock->l_blocking_lock);
+	lock->l_blocking_lock = NULL;
+	LDLM_LOCK_RELEASE(lock);
+
+	RETURN(ast_rc);
+}
+
+/**
+ * Producer callback for LDLM_WORK_CP_AST work: take the first lock queued on
+ * the work list through l_cp_ast and run its completion AST, if it has one.
+ *
+ * \param rqset not used by this function
+ * \param opaq  the struct ldlm_cb_set_arg carrying the work list
+ *
+ * \retval -ENOENT the work list is empty
+ * \retval 0       the lock has no completion AST
+ * \retval other   return value of the completion AST
+ */
+static int
+ldlm_work_cp_ast_lock(struct ptlrpc_request_set *rqset, void *opaq)
+{
+	struct ldlm_cb_set_arg	*work = opaq;
+	struct ldlm_lock	*lock;
+	ldlm_completion_callback cp_ast;
+	int			 ast_rc = 0;
+	ENTRY;
+
+	if (list_empty(work->list))
+		RETURN(-ENOENT);
+
+	lock = list_entry(work->list->next, struct ldlm_lock, l_cp_ast);
+
+	/* A completion AST may be received before l_completion_ast has been
+	 * set: the AST can arrive ahead of the enqueue reply, and there is
+	 * also a small race window between receiving the reply and finishing
+	 * the local enqueue (bug 842).
+	 *
+	 * The blocking AST has no such problem, because the local
+	 * blocking_ast is never called until the reader/writer reference is
+	 * dropped, and that only happens once the reply has arrived and the
+	 * enqueue is finished. */
+
+	/* Nobody else may touch l_cp_ast while we unlink it. */
+	lock_res_and_lock(lock);
+	list_del_init(&lock->l_cp_ast);
+	LASSERT(lock->l_flags & LDLM_FL_CP_REQD);
+	/* Snapshot l_completion_ast under the lock: mds_intent_policy() can
+	 * change it, see bug 14225. */
+	cp_ast = lock->l_completion_ast;
+	lock->l_flags &= ~LDLM_FL_CP_REQD;
+	unlock_res_and_lock(lock);
+
+	if (cp_ast != NULL)
+		ast_rc = cp_ast(lock, 0, (void *)work);
+	LDLM_LOCK_RELEASE(lock);
+
+	RETURN(ast_rc);
+}
+
+/**
+ * Producer callback for LDLM_WORK_REVOKE_AST work: take the first lock queued
+ * on the work list through l_rk_ast and run its blocking AST with a
+ * descriptor that claims an exclusive (LCK_EX) request.
+ *
+ * \param rqset not used by this function
+ * \param opaq  the struct ldlm_cb_set_arg carrying the work list
+ *
+ * \retval -ENOENT the work list is empty
+ * \retval other   return value of the lock's l_blocking_ast callback
+ */
+static int
+ldlm_work_revoke_ast_lock(struct ptlrpc_request_set *rqset, void *opaq)
+{
+	struct ldlm_cb_set_arg *work = opaq;
+	struct ldlm_lock       *lock;
+	struct ldlm_lock_desc   fake_ex;
+	int			ast_rc;
+	ENTRY;
+
+	if (list_empty(work->list))
+		RETURN(-ENOENT);
+
+	lock = list_entry(work->list->next, struct ldlm_lock, l_rk_ast);
+	list_del_init(&lock->l_rk_ast);
+
+	/* Describe the lock itself, but pretend the request is exclusive
+	 * and nothing has been granted yet. */
+	ldlm_lock2desc(lock, &fake_ex);
+	fake_ex.l_req_mode = LCK_EX;
+	fake_ex.l_granted_mode = 0;
+
+	ast_rc = lock->l_blocking_ast(lock, &fake_ex, (void *)work,
+				      LDLM_CB_BLOCKING);
+	LDLM_LOCK_RELEASE(lock);
+
+	RETURN(ast_rc);
+}
+
+/**
+ * Producer callback for LDLM_WORK_GL_AST work: take the first glimpse work
+ * item from the work list and run the glimpse AST of the lock it refers to.
+ *
+ * \param rqset not used by this function
+ * \param opaq  the struct ldlm_cb_set_arg carrying the work list
+ *
+ * \retval -ENOENT the work list is empty
+ * \retval 1       the glimpse AST returned 0
+ * \retval 0       the glimpse AST returned anything else
+ */
+int ldlm_work_gl_ast_lock(struct ptlrpc_request_set *rqset, void *opaq)
+{
+	struct ldlm_cb_set_arg		*work = opaq;
+	struct ldlm_glimpse_work	*item;
+	struct ldlm_lock		*lock;
+	int				 rc;
+	ENTRY;
+
+	if (list_empty(work->list))
+		RETURN(-ENOENT);
+
+	item = list_entry(work->list->next, struct ldlm_glimpse_work,
+			  gl_list);
+	list_del_init(&item->gl_list);
+
+	lock = item->gl_lock;
+
+	/* hand the glimpse descriptor over to the callback argument */
+	work->gl_desc = item->gl_desc;
+
+	/* run the glimpse callback proper; a zero result is reported as 1 */
+	rc = (lock->l_glimpse_ast(lock, (void *)work) == 0) ? 1 : 0;
+
+	LDLM_LOCK_RELEASE(lock);
+
+	/* the work item is freed here unless it is flagged NOFREE */
+	if (!(item->gl_flags & LDLM_GL_WORK_NOFREE))
+		OBD_FREE_PTR(item);
+
+	RETURN(rc);
+}
+
+/**
+ * Process list of locks in need of ASTs being sent.
+ *
+ * Batches the ASTs for every entry of \a rpc_list into one flow-controlled
+ * request set instead of sending them one by one.
+ *
+ * \param ns       namespace; its ns_max_parallel_ast bounds the number of
+ *                 requests in flight (UINT_MAX when it is zero)
+ * \param rpc_list work list consumed by the per-type producer callback
+ * \param ast_type which kind of AST to run for the list entries
+ *
+ * \retval 0         list was empty, or the set ran without a restart request
+ * \retval -ENOMEM   the callback argument or the request set could not be
+ *                   allocated
+ * \retval -ERESTART the restart counter of the callback argument was
+ *                   non-zero after the set finished
+ */
+int ldlm_run_ast_work(struct ldlm_namespace *ns, struct list_head *rpc_list,
+		      ldlm_desc_ast_t ast_type)
+{
+	struct ldlm_cb_set_arg *work;
+	set_producer_func       producer;
+	int			rc;
+
+	if (list_empty(rpc_list))
+		RETURN(0);
+
+	OBD_ALLOC_PTR(work);
+	if (work == NULL)
+		RETURN(-ENOMEM);
+
+	work->list = rpc_list;
+	atomic_set(&work->restart, 0);
+
+	/* Pick the callback type and the matching producer function. */
+	switch (ast_type) {
+	case LDLM_WORK_BL_AST:
+		work->type = LDLM_BL_CALLBACK;
+		producer = ldlm_work_bl_ast_lock;
+		break;
+	case LDLM_WORK_CP_AST:
+		work->type = LDLM_CP_CALLBACK;
+		producer = ldlm_work_cp_ast_lock;
+		break;
+	case LDLM_WORK_REVOKE_AST:
+		work->type = LDLM_BL_CALLBACK;
+		producer = ldlm_work_revoke_ast_lock;
+		break;
+	case LDLM_WORK_GL_AST:
+		work->type = LDLM_GL_CALLBACK;
+		producer = ldlm_work_gl_ast_lock;
+		break;
+	default:
+		LBUG();
+	}
+
+	/* Build a ptlrpc request set with the flow control extension: the
+	 * set calls the producer to generate requests and issues a new one
+	 * each time an earlier one completes, so that no more than
+	 * ns_max_parallel_ast requests are in flight at once. */
+	work->set = ptlrpc_prep_fcset(ns->ns_max_parallel_ast ? : UINT_MAX,
+				      producer, work);
+	if (work->set == NULL)
+		GOTO(out, rc = -ENOMEM);
+
+	ptlrpc_set_wait(work->set);
+	ptlrpc_set_destroy(work->set);
+
+	if (atomic_read(&work->restart))
+		rc = -ERESTART;
+	else
+		rc = 0;
+	GOTO(out, rc);
+out:
+	OBD_FREE_PTR(work);
+	return rc;
+}
+
+/* Reprocess a single resource; \a closure is ignored and iteration always
+ * continues. */
+static int reprocess_one_queue(struct ldlm_resource *res, void *closure)
+{
+	const int verdict = LDLM_ITER_CONTINUE;
+
+	ldlm_reprocess_all(res);
+	return verdict;
+}
+
+/* Hash iterator adapter: reprocess the resource stored in \a hnode and
+ * return non-zero only when the per-resource step answers LDLM_ITER_STOP. */
+static int ldlm_reprocess_res(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+			      struct hlist_node *hnode, void *arg)
+{
+	int verdict = reprocess_one_queue(cfs_hash_object(hs, hnode), arg);
+
+	return verdict == LDLM_ITER_STOP;
+}
+
+/**
+ * Walk every resource in the resource hash of namespace \a ns and reprocess
+ * it via ldlm_reprocess_res().  A NULL \a ns is accepted and does nothing.
+ */
+void ldlm_reprocess_all_ns(struct ldlm_namespace *ns)
+{
+	ENTRY;
+
+	if (ns == NULL) {
+		EXIT;
+		return;
+	}
+
+	cfs_hash_for_each_nolock(ns->ns_rs_hash, ldlm_reprocess_res, NULL);
+	EXIT;
+}
+EXPORT_SYMBOL(ldlm_reprocess_all_ns);
+
+/**
+ * Try to grant all waiting locks on a resource.
+ *
+ * In this client-side-only module nothing is actually reprocessed: for a
+ * resource that belongs to a client namespace the function does nothing,
+ * and for any other namespace it logs an error and LBUGs.
+ *
+ * Typically called after some resource locks are cancelled to see
+ * if anything could be granted as a result of the cancellation.
+ *
+ * \param res resource whose namespace is checked
+ */
+void ldlm_reprocess_all(struct ldlm_resource *res)
+{
+	ENTRY;
+	if (!ns_is_client(ldlm_res_to_ns(res))) {
+		CERROR("This is client-side-only module, cannot handle "
+		       "LDLM_NAMESPACE_SERVER resource type lock.\n");
+		LBUG();
+	}
+	EXIT;
+}
+
+/**
+ * Run the blocking AST of \a lock in LDLM_CB_CANCELING mode, at most once.
+ *
+ * Must be entered with the resource lock held (checked on entry).  If the
+ * lock is not yet flagged LDLM_FL_CANCEL, the flag is set and the blocking
+ * AST, when present, is invoked with the resource lock temporarily dropped.
+ * LDLM_FL_BL_DONE is set on every path.
+ */
+void ldlm_cancel_callback(struct ldlm_lock *lock)
+{
+	check_res_locked(lock->l_resource);
+
+	if ((lock->l_flags & LDLM_FL_CANCEL) == 0) {
+		lock->l_flags |= LDLM_FL_CANCEL;
+
+		if (lock->l_blocking_ast == NULL) {
+			LDLM_DEBUG(lock, "no blocking ast");
+		} else {
+			/* the callback runs without the resource lock */
+			unlock_res_and_lock(lock);
+			lock->l_blocking_ast(lock, NULL, lock->l_ast_data,
+					     LDLM_CB_CANCELING);
+			lock_res_and_lock(lock);
+		}
+	}
+
+	lock->l_flags |= LDLM_FL_BL_DONE;
+}
+
+/**
+ * Detach \a req from the skiplist links (l_sl_policy and l_sl_mode).
+ * Only locks on LDLM_PLAIN and LDLM_IBITS resources are touched; for any
+ * other resource type the function does nothing.
+ */
+void ldlm_unlink_lock_skiplist(struct ldlm_lock *req)
+{
+	switch (req->l_resource->lr_type) {
+	case LDLM_PLAIN:
+	case LDLM_IBITS:
+		list_del_init(&req->l_sl_policy);
+		list_del_init(&req->l_sl_mode);
+		break;
+	default:
+		break;
+	}
+}
+
+/**
+ * Attempts to cancel LDLM lock \a lock that has no reader/writer references.
+ *
+ * Everything below runs between lock_res_and_lock() and
+ * unlock_res_and_lock(), except that ldlm_cancel_callback() may drop and
+ * retake the resource lock around the blocking AST.  In order: LBUG if the
+ * lock still has readers or writers, remove it from the waiting list when
+ * l_waited is set, run the cancel callback, unlink the lock from its
+ * resource and destroy it, drop it from the namespace pool if it was
+ * granted, and finally reset l_granted_mode to LCK_MINMODE.
+ *
+ * \param lock the lock to cancel
+ */
+void ldlm_lock_cancel(struct ldlm_lock *lock)
+{
+	struct ldlm_resource *res;
+	struct ldlm_namespace *ns;
+	ENTRY;
+
+	lock_res_and_lock(lock);
+
+	res = lock->l_resource;
+	ns  = ldlm_res_to_ns(res);
+
+	/* Please do not, no matter how tempting, remove this LBUG without
+	 * talking to me first. -phik */
+	if (lock->l_readers || lock->l_writers) {
+		LDLM_ERROR(lock, "lock still has references");
+		LBUG();
+	}
+
+	if (lock->l_waited)
+		ldlm_del_waiting_lock(lock);
+
+	/* Releases cancel callback. */
+	ldlm_cancel_callback(lock);
+
+	/* Yes, second time, just in case it was added again while we were
+	   running with no res lock in ldlm_cancel_callback */
+	if (lock->l_waited)
+		ldlm_del_waiting_lock(lock);
+
+	ldlm_resource_unlink_lock(lock);
+	ldlm_lock_destroy_nolock(lock);
+
+	if (lock->l_granted_mode == lock->l_req_mode)	/* i.e. the lock was granted */
+		ldlm_pool_del(&ns->ns_pool, lock);
+
+	/* Make sure we will not be called again for same lock what is possible
+	 * if not to zero out lock->l_granted_mode */
+	lock->l_granted_mode = LCK_MINMODE;
+	unlock_res_and_lock(lock);
+
+	EXIT;
+}
+EXPORT_SYMBOL(ldlm_lock_cancel);
+
+/**
+ * Attach opaque upper-layer \a data to the lock behind handle \a lockh.
+ *
+ * The data is stored only when the lock has no l_ast_data yet.
+ *
+ * \retval 0       l_ast_data equals \a data after the call
+ * \retval -EINVAL the handle does not resolve to a lock, or the lock
+ *                 already carries different data
+ */
+int ldlm_lock_set_data(struct lustre_handle *lockh, void *data)
+{
+	struct ldlm_lock *lock = ldlm_handle2lock(lockh);
+	int rc;
+	ENTRY;
+
+	if (lock == NULL)
+		RETURN(-EINVAL);
+
+	if (lock->l_ast_data == NULL)
+		lock->l_ast_data = data;
+
+	rc = (lock->l_ast_data == data) ? 0 : -EINVAL;
+	LDLM_LOCK_PUT(lock);
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(ldlm_lock_set_data);
+
+struct export_cl_data {	/* iterator state for ldlm_cancel_locks_for_export_cb() */
+	struct obd_export	*ecl_exp;	/* export whose locks are being cancelled */
+	int			ecl_loop;	/* number of locks the iterator has handled so far */
+};
+
+/**
+ * Hash iterator used by ldlm_cancel_locks_for_export(): cancels the lock
+ * stored in \a hnode and reprocesses its resource.
+ *
+ * \param data a struct export_cl_data; its ecl_loop counter is bumped once
+ *             per call and a progress message is logged whenever the
+ *             counter reaches a power of two
+ *
+ * \retval 0 always
+ */
+int ldlm_cancel_locks_for_export_cb(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+				    struct hlist_node *hnode, void *data)
+{
+	struct export_cl_data	*state = (struct export_cl_data *)data;
+	struct ldlm_lock	*lock = cfs_hash_object(hs, hnode);
+	struct obd_export	*exp = state->ecl_exp;
+	struct ldlm_resource	*res;
+	int			 handled;
+
+	/* Pin both the resource and the lock for the duration of the
+	 * cancel. */
+	res = ldlm_resource_getref(lock->l_resource);
+	LDLM_LOCK_GET(lock);
+
+	LDLM_DEBUG(lock, "export %p", exp);
+	ldlm_res_lvbo_update(res, NULL, 1);
+	ldlm_lock_cancel(lock);
+	ldlm_reprocess_all(res);
+	ldlm_resource_putref(res);
+	LDLM_LOCK_RELEASE(lock);
+
+	handled = ++state->ecl_loop;
+	/* x & -x isolates the lowest set bit, so this holds exactly when
+	 * the counter is a power of two */
+	if ((handled & -handled) == handled) {
+		CDEBUG(D_INFO,
+		       "Cancel lock %p for export %p (loop %d), still have "
+		       "%d locks left on hash table.\n",
+		       lock, exp, handled,
+		       atomic_read(&hs->hs_count));
+	}
+
+	return 0;
+}
+
+/**
+ * Cancel all locks for given export.
+ *
+ * Runs ldlm_cancel_locks_for_export_cb() over the export's lock hash via
+ * cfs_hash_for_each_empty(), starting with a zeroed loop counter.
+ *
+ * Typically called on client disconnection/eviction
+ */
+void ldlm_cancel_locks_for_export(struct obd_export *exp)
+{
+	struct export_cl_data state;
+
+	state.ecl_exp  = exp;
+	state.ecl_loop = 0;
+
+	cfs_hash_for_each_empty(exp->exp_lock_hash,
+				ldlm_cancel_locks_for_export_cb, &state);
+}
+
+/**
+ * Downgrade an exclusive lock.
+ *
+ * A fast variant of ldlm_lock_convert for conversion of exclusive
+ * locks. The conversion is always successful.
+ * Used by Commit on Sharing (COS) code.
+ *
+ * The lock must currently be granted in LCK_PW or LCK_EX mode and
+ * \a new_mode must be LCK_COS; both are asserted.
+ *
+ * \param lock A lock to convert
+ * \param new_mode new lock mode
+ */
+void ldlm_lock_downgrade(struct ldlm_lock *lock, int new_mode)
+{
+	struct ldlm_namespace *ns;
+	ENTRY;
+
+	LASSERT(lock->l_granted_mode & (LCK_PW | LCK_EX));
+	LASSERT(new_mode == LCK_COS);
+
+	lock_res_and_lock(lock);
+	ldlm_resource_unlink_lock(lock);
+
+	/* Take the lock out of the pool now; ldlm_grant_lock() below puts
+	 * it back in. */
+	ns = ldlm_lock_to_ns(lock);
+	ldlm_pool_del(&ns->ns_pool, lock);
+
+	lock->l_req_mode = new_mode;
+	ldlm_grant_lock(lock, NULL);
+	unlock_res_and_lock(lock);
+
+	ldlm_reprocess_all(lock->l_resource);
+
+	EXIT;
+}
+EXPORT_SYMBOL(ldlm_lock_downgrade);
+
+/**
+ * Attempt to convert already granted lock to a different mode.
+ *
+ * While lock conversion is not currently used, future client-side
+ * optimizations could take advantage of it to avoid discarding cached
+ * pages on a file.
+ *
+ * \param lock     lock to convert
+ * \param new_mode requested mode; only a LCK_PR to LCK_PW conversion passes
+ *                 the assertion below
+ * \param flags    in/out enqueue flags; LDLM_FL_BLOCK_GRANTED is added when
+ *                 the mode is unchanged
+ *
+ * \retval the lock's resource on success or when the mode is unchanged
+ * \retval NULL if the interval node could not be allocated
+ */
+struct ldlm_resource *ldlm_lock_convert(struct ldlm_lock *lock, int new_mode,
+					__u32 *flags)
+{
+	LIST_HEAD(rpc_list);
+	struct ldlm_interval *node;
+	struct ldlm_namespace *ns;
+	struct ldlm_resource *res;
+	int granted = 0;
+	ENTRY;
+
+	/* Nothing to do when the mode does not change. */
+	if (new_mode == lock->l_granted_mode) {
+		*flags |= LDLM_FL_BLOCK_GRANTED;
+		RETURN(lock->l_resource);
+	}
+
+	/* The lock type cannot be checked here because the bitlock of the
+	 * lock is not held, so the interval node is allocated blindly. -jay */
+	OBD_SLAB_ALLOC_PTR_GFP(node, ldlm_interval_slab, __GFP_IO);
+	if (node == NULL)  /* Actually, this causes EDEADLOCK to be returned */
+		RETURN(NULL);
+
+	LASSERTF((new_mode == LCK_PW && lock->l_granted_mode == LCK_PR),
+		 "new_mode %u, granted %u\n", new_mode, lock->l_granted_mode);
+
+	lock_res_and_lock(lock);
+
+	res = lock->l_resource;
+	ns  = ldlm_res_to_ns(res);
+
+	lock->l_req_mode = new_mode;
+
+	/* The lock leaves its current queue whatever the resource type. */
+	ldlm_resource_unlink_lock(lock);
+	if (res->lr_type == LDLM_EXTENT) {
+		/* FIXME: ugly code, the lock has to be attached to an
+		 * interval node again since perhaps it will be granted
+		 * soon */
+		INIT_LIST_HEAD(&node->li_group);
+		ldlm_interval_attach(node, lock);
+		node = NULL;
+	}
+
+	/*
+	 * Take the old lock out of the pool before the lock with the new
+	 * mode is added below in ->policy()
+	 */
+	ldlm_pool_del(&ns->ns_pool, lock);
+
+	if (!ns_is_client(ldlm_res_to_ns(res))) {
+		CERROR("This is client-side-only module, cannot handle "
+		       "LDLM_NAMESPACE_SERVER resource type lock.\n");
+		LBUG();
+	} else if (*flags & (LDLM_FL_BLOCK_CONV | LDLM_FL_BLOCK_GRANTED)) {
+		/* Local resource: queue the lock on the converting list. */
+		ldlm_resource_add_lock(res, &res->lr_converting, lock);
+	} else {
+		/* This should never happen, because of the way the
+		 * server handles conversions. */
+		LDLM_ERROR(lock, "Erroneous flags %x on local lock\n",
+			   *flags);
+		LBUG();
+
+		ldlm_grant_lock(lock, &rpc_list);
+		granted = 1;
+		/* FIXME: completion handling not with lr_lock held ! */
+		if (lock->l_completion_ast)
+			lock->l_completion_ast(lock, 0, NULL);
+	}
+	unlock_res_and_lock(lock);
+
+	if (granted)
+		ldlm_run_ast_work(ns, &rpc_list, LDLM_WORK_CP_AST);
+	/* Free the interval node if it was not attached to the lock. */
+	if (node)
+		OBD_SLAB_FREE(node, ldlm_interval_slab, sizeof(*node));
+	RETURN(res);
+}
+EXPORT_SYMBOL(ldlm_lock_convert);
+
+/**
+ * Print the lock behind handle \a lockh into the debug log at \a level.
+ *
+ * Does nothing when \a level is enabled neither in libcfs_debug nor as
+ * D_ERROR, or when the handle does not resolve to a lock.
+ *
+ * Used when printing all locks on a resource for debug purposes.
+ */
+void ldlm_lock_dump_handle(int level, struct lustre_handle *lockh)
+{
+	struct ldlm_lock *lock;
+
+	if (((libcfs_debug | D_ERROR) & level) == 0)
+		return;
+
+	lock = ldlm_handle2lock(lockh);
+	if (lock != NULL) {
+		LDLM_DEBUG_LIMIT(level, lock, "###");
+		LDLM_LOCK_PUT(lock);
+	}
+}
+EXPORT_SYMBOL(ldlm_lock_dump_handle);
+
+/**
+ * Print lock information with custom message into debug log.
+ * Helper function.
+ *
+ * The caller's message (\a fmt plus varargs) is followed by a fixed
+ * description of the lock whose layout depends on the resource type:
+ * extent locks add the granted and requested extents, flock locks add the
+ * owner pid and range, inodebits locks add the bit mask, and all other
+ * types use the generic layout.  A lock without a resource is printed with
+ * "??" placeholders.
+ *
+ * \param lock    lock to describe
+ * \param msgdata libcfs debug message descriptor passed to
+ *                libcfs_debug_vmsg2()
+ * \param fmt     printf-style format of the caller's message
+ */
+void _ldlm_lock_debug(struct ldlm_lock *lock,
+		      struct libcfs_debug_msg_data *msgdata,
+		      const char *fmt, ...)
+{
+	va_list args;
+	struct obd_export *exp = lock->l_export;
+	struct ldlm_resource *resource = lock->l_resource;
+	char *nid = "local";
+
+	va_start(args, fmt);
+
+	/* Prefer the NID of the export's connection; otherwise fall back to
+	 * the NID of the import connection of the export's obd device. */
+	if (exp && exp->exp_connection) {
+		nid = libcfs_nid2str(exp->exp_connection->c_peer.nid);
+	} else if (exp && exp->exp_obd != NULL) {
+		struct obd_import *imp = exp->exp_obd->u.cli.cl_import;
+		nid = libcfs_nid2str(imp->imp_connection->c_peer.nid);
+	}
+
+	if (resource == NULL) {
+		libcfs_debug_vmsg2(msgdata, fmt, args,
+		       " ns: \?\? lock: %p/"LPX64" lrc: %d/%d,%d mode: %s/%s "
+		       "res: \?\? rrc=\?\? type: \?\?\? flags: "LPX64" nid: %s "
+		       "remote: "LPX64" expref: %d pid: %u timeout: %lu "
+		       "lvb_type: %d\n",
+		       lock,
+		       lock->l_handle.h_cookie, atomic_read(&lock->l_refc),
+		       lock->l_readers, lock->l_writers,
+		       ldlm_lockname[lock->l_granted_mode],
+		       ldlm_lockname[lock->l_req_mode],
+		       lock->l_flags, nid, lock->l_remote_handle.cookie,
+		       exp ? atomic_read(&exp->exp_refcount) : -99,
+		       lock->l_pid, lock->l_callback_timeout, lock->l_lvb_type);
+		va_end(args);
+		return;
+	}
+
+	switch (resource->lr_type) {
+	case LDLM_EXTENT:
+		libcfs_debug_vmsg2(msgdata, fmt, args,
+		       " ns: %s lock: %p/"LPX64" lrc: %d/%d,%d mode: %s/%s "
+		       "res: "LPU64"/"LPU64" rrc: %d type: %s ["LPU64"->"LPU64
+		       "] (req "LPU64"->"LPU64") flags: "LPX64" nid: %s remote:"
+		       " "LPX64" expref: %d pid: %u timeout: %lu lvb_type: %d\n",
+		       ldlm_lock_to_ns_name(lock), lock,
+		       lock->l_handle.h_cookie, atomic_read(&lock->l_refc),
+		       lock->l_readers, lock->l_writers,
+		       ldlm_lockname[lock->l_granted_mode],
+		       ldlm_lockname[lock->l_req_mode],
+		       resource->lr_name.name[0],
+		       resource->lr_name.name[1],
+		       atomic_read(&resource->lr_refcount),
+		       ldlm_typename[resource->lr_type],
+		       lock->l_policy_data.l_extent.start,
+		       lock->l_policy_data.l_extent.end,
+		       lock->l_req_extent.start, lock->l_req_extent.end,
+		       lock->l_flags, nid, lock->l_remote_handle.cookie,
+		       exp ? atomic_read(&exp->exp_refcount) : -99,
+		       lock->l_pid, lock->l_callback_timeout, lock->l_lvb_type);
+		break;
+
+	case LDLM_FLOCK:
+		libcfs_debug_vmsg2(msgdata, fmt, args,
+		       " ns: %s lock: %p/"LPX64" lrc: %d/%d,%d mode: %s/%s "
+		       "res: "LPU64"/"LPU64" rrc: %d type: %s pid: %d "
+		       "["LPU64"->"LPU64"] flags: "LPX64" nid: %s remote: "LPX64
+		       " expref: %d pid: %u timeout: %lu\n",
+		       ldlm_lock_to_ns_name(lock), lock,
+		       lock->l_handle.h_cookie, atomic_read(&lock->l_refc),
+		       lock->l_readers, lock->l_writers,
+		       ldlm_lockname[lock->l_granted_mode],
+		       ldlm_lockname[lock->l_req_mode],
+		       resource->lr_name.name[0],
+		       resource->lr_name.name[1],
+		       atomic_read(&resource->lr_refcount),
+		       ldlm_typename[resource->lr_type],
+		       lock->l_policy_data.l_flock.pid,
+		       lock->l_policy_data.l_flock.start,
+		       lock->l_policy_data.l_flock.end,
+		       lock->l_flags, nid, lock->l_remote_handle.cookie,
+		       exp ? atomic_read(&exp->exp_refcount) : -99,
+		       lock->l_pid, lock->l_callback_timeout);
+		break;
+
+	case LDLM_IBITS:
+		libcfs_debug_vmsg2(msgdata, fmt, args,
+		       " ns: %s lock: %p/"LPX64" lrc: %d/%d,%d mode: %s/%s "
+		       "res: "LPU64"/"LPU64" bits "LPX64" rrc: %d type: %s "
+		       "flags: "LPX64" nid: %s remote: "LPX64" expref: %d "
+		       "pid: %u timeout: %lu lvb_type: %d\n",
+		       ldlm_lock_to_ns_name(lock),
+		       lock, lock->l_handle.h_cookie,
+		       atomic_read (&lock->l_refc),
+		       lock->l_readers, lock->l_writers,
+		       ldlm_lockname[lock->l_granted_mode],
+		       ldlm_lockname[lock->l_req_mode],
+		       resource->lr_name.name[0],
+		       resource->lr_name.name[1],
+		       lock->l_policy_data.l_inodebits.bits,
+		       atomic_read(&resource->lr_refcount),
+		       ldlm_typename[resource->lr_type],
+		       lock->l_flags, nid, lock->l_remote_handle.cookie,
+		       exp ? atomic_read(&exp->exp_refcount) : -99,
+		       lock->l_pid, lock->l_callback_timeout, lock->l_lvb_type);
+		break;
+
+	default:
+		/* The string pieces are concatenated by the compiler, so the
+		 * piece ending in "timeout: %lu" needs its own trailing space
+		 * to keep "lvb_type:" from running into the timeout value. */
+		libcfs_debug_vmsg2(msgdata, fmt, args,
+		       " ns: %s lock: %p/"LPX64" lrc: %d/%d,%d mode: %s/%s "
+		       "res: "LPU64"/"LPU64" rrc: %d type: %s flags: "LPX64" "
+		       "nid: %s remote: "LPX64" expref: %d pid: %u timeout: %lu "
+		       "lvb_type: %d\n",
+		       ldlm_lock_to_ns_name(lock),
+		       lock, lock->l_handle.h_cookie,
+		       atomic_read (&lock->l_refc),
+		       lock->l_readers, lock->l_writers,
+		       ldlm_lockname[lock->l_granted_mode],
+		       ldlm_lockname[lock->l_req_mode],
+		       resource->lr_name.name[0],
+		       resource->lr_name.name[1],
+		       atomic_read(&resource->lr_refcount),
+		       ldlm_typename[resource->lr_type],
+		       lock->l_flags, nid, lock->l_remote_handle.cookie,
+		       exp ? atomic_read(&exp->exp_refcount) : -99,
+		       lock->l_pid, lock->l_callback_timeout, lock->l_lvb_type);
+		break;
+	}
+	va_end(args);
+}
+EXPORT_SYMBOL(_ldlm_lock_debug);

diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c b/drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c
new file mode 100644
index 0000000..324d5e4
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c

@@ -0,0 +1,1238 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2010, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ldlm/ldlm_lockd.c
+ *
+ * Author: Peter Braam <braam@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LDLM
+
+# include <linux/libcfs/libcfs.h>
+
+#include <lustre_dlm.h>
+#include <obd_class.h>
+#include <linux/list.h>
+#include "ldlm_internal.h"
+
+static int ldlm_num_threads;	/* module parameter, registered with mode 0444 below */
+CFS_MODULE_PARM(ldlm_num_threads, "i", int, 0444,
+		"number of DLM service threads to start");
+
+static char *ldlm_cpts;	/* module parameter (string), registered with mode 0444 below */
+CFS_MODULE_PARM(ldlm_cpts, "s", charp, 0444,
+		"CPU partitions ldlm threads should run on");
+
+extern struct kmem_cache *ldlm_resource_slab;	/* defined in another file */
+extern struct kmem_cache *ldlm_lock_slab;	/* defined in another file */
+static struct mutex	ldlm_ref_mutex;	/* NOTE(review): presumably guards ldlm_refcount; no user visible here - confirm */
+static int ldlm_refcount;
+
+struct ldlm_cb_async_args {	/* pairs a callback set argument with a lock; users are outside this part of the file */
+	struct ldlm_cb_set_arg *ca_set_arg;
+	struct ldlm_lock       *ca_lock;
+};
+
+/* LDLM state; __ldlm_bl_to_thread() reaches the blocking-callback pool
+ * through ldlm_state->ldlm_bl_pool */
+
+static struct ldlm_state *ldlm_state;
+
+/* Convert \a timeout to whole seconds (truncated to int), add one second,
+ * and convert the result back with cfs_time_seconds(). */
+inline cfs_time_t round_timeout(cfs_time_t timeout)
+{
+	int whole_secs = (int)cfs_duration_sec(cfs_time_sub(timeout, 0));
+
+	return cfs_time_seconds(whole_secs + 1);
+}
+
+/* timeout for initial callback (AST) reply (bz10399): the smaller of
+ * ldlm_timeout and a third of obd_timeout, but never less than 1 */
+static inline unsigned int ldlm_get_rq_timeout(void)
+{
+	/* Non-AT value */
+	unsigned int timeout = min(ldlm_timeout, obd_timeout / 3);
+
+	if (timeout < 1)
+		timeout = 1;
+
+	return timeout;
+}
+
+#define ELT_STOPPED   0	/* NOTE(review): the ELT_* values look like state codes; no user is visible in this part of the file - confirm */
+#define ELT_READY     1
+#define ELT_TERMINATE 2
+
+struct ldlm_bl_pool {	/* queues of blocking-callback work items; reached via ldlm_state->ldlm_bl_pool */
+	spinlock_t		blp_lock;	/* held by __ldlm_bl_to_thread() while queueing onto either list */
+
+	/*
+	 * blp_prio_list is used for callbacks that should be handled
+	 * as a priority. It is used for LDLM_FL_DISCARD_DATA requests.
+	 * see bug 13843
+	 */
+	struct list_head	      blp_prio_list;
+
+	/*
+	 * blp_list is used for all other callbacks which are likely
+	 * to take longer to process.
+	 */
+	struct list_head	      blp_list;
+
+	wait_queue_head_t	     blp_waitq;	/* woken by __ldlm_bl_to_thread() after an item is queued */
+	struct completion	blp_comp;
+	atomic_t	    blp_num_threads;	/* NOTE(review): presumably thread accounting for the pool; not used in this part of the file */
+	atomic_t	    blp_busy_threads;
+	int		     blp_min_threads;
+	int		     blp_max_threads;
+};
+
+struct ldlm_bl_work_item {	/* one unit of work queued on a struct ldlm_bl_pool; filled in by init_blwi() */
+	struct list_head	      blwi_entry;	/* link on blp_prio_list or blp_list */
+	struct ldlm_namespace  *blwi_ns;	/* namespace passed to init_blwi() */
+	struct ldlm_lock_desc   blwi_ld;	/* copy of the caller's descriptor, when one was given */
+	struct ldlm_lock       *blwi_lock;	/* the single lock to process; set only when count is 0 */
+	struct list_head	      blwi_head;	/* takes over the caller's cancel list when count is non-zero */
+	int		     blwi_count;	/* number of locks on blwi_head */
+	struct completion	blwi_comp;	/* synchronous submitters wait on this in __ldlm_bl_to_thread() */
+	ldlm_cancel_flags_t     blwi_flags;	/* cancel flags passed to init_blwi() */
+	int		     blwi_mem_pressure;	/* set to 1 when memory_pressure_get() was non-zero at init time */
+};
+
+
+/* Stub: does nothing with \a lock and always returns 0. */
+int ldlm_del_waiting_lock(struct ldlm_lock *lock)
+{
+	int rc = 0;
+
+	RETURN(rc);
+}
+
+/* Stub: ignores both \a lock and \a timeout and always returns 0. */
+int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, int timeout)
+{
+	int rc = 0;
+
+	RETURN(rc);
+}
+
+
+
+/**
+ * Callback handler for receiving incoming blocking ASTs.
+ *
+ * This can only happen on client side.
+ *
+ * Flags the lock LDLM_FL_CBPENDING (plus LDLM_FL_CANCEL when it is marked
+ * LDLM_FL_CANCEL_ON_BLOCK).  If the lock has neither readers nor writers,
+ * its blocking AST, when set, is invoked right away with \a ld; otherwise
+ * only a debug message is logged.  The lock is released before returning.
+ */
+void ldlm_handle_bl_callback(struct ldlm_namespace *ns,
+			     struct ldlm_lock_desc *ld, struct ldlm_lock *lock)
+{
+	int unused;
+	ENTRY;
+
+	LDLM_DEBUG(lock, "client blocking AST callback handler");
+
+	lock_res_and_lock(lock);
+	lock->l_flags |= LDLM_FL_CBPENDING;
+	if (lock->l_flags & LDLM_FL_CANCEL_ON_BLOCK)
+		lock->l_flags |= LDLM_FL_CANCEL;
+
+	/* sample the reference counts while the resource lock is held */
+	unused = (lock->l_readers == 0 && lock->l_writers == 0);
+	unlock_res_and_lock(lock);
+
+	if (!unused) {
+		CDEBUG(D_DLMTRACE, "Lock %p is referenced, will be cancelled later\n",
+		       lock);
+	} else {
+		CDEBUG(D_DLMTRACE, "Lock %p already unused, calling callback (%p)\n",
+		       lock, lock->l_blocking_ast);
+		if (lock->l_blocking_ast != NULL)
+			lock->l_blocking_ast(lock, ld, lock->l_ast_data,
+					     LDLM_CB_BLOCKING);
+	}
+
+	LDLM_DEBUG(lock, "client blocking callback handler END");
+	LDLM_LOCK_RELEASE(lock);
+	EXIT;
+}
+
+/**
+ * Callback handler for receiving incoming completion ASTs.
+ *
+ * This only can happen on client side.
+ *
+ * Outline, as implemented below:
+ *  - read the size of the LVB carried by the request; for a layout lock
+ *    that has no LVB buffer yet, allocate one of that size;
+ *  - leave with rc = 0 if the lock is already destroyed or granted;
+ *  - adopt the granted mode, the policy data (non-PLAIN resources) and,
+ *    when it differs, the resource name from \a dlm_req;
+ *  - fill the LVB, grant the lock and run the resulting completion AST
+ *    work list.
+ * On any negative rc the lock is flagged LDLM_FL_FAILED and waiters on
+ * l_waitq are woken.  LDLM_LOCK_RELEASE() is called on the lock on every
+ * path through the out label.
+ */
+static void ldlm_handle_cp_callback(struct ptlrpc_request *req,
+				    struct ldlm_namespace *ns,
+				    struct ldlm_request *dlm_req,
+				    struct ldlm_lock *lock)
+{
+	int lvb_len;
+	LIST_HEAD(ast_list);
+	int rc = 0;
+	ENTRY;
+
+	LDLM_DEBUG(lock, "client completion callback handler START");
+
+	if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_BL_CB_RACE)) {
+		int to = cfs_time_seconds(1);
+		while (to > 0) {	/* 'to' is never updated in the loop; it ends only through the break below */
+			schedule_timeout_and_set_state(
+				TASK_INTERRUPTIBLE, to);
+			if (lock->l_granted_mode == lock->l_req_mode ||
+			    lock->l_destroyed)
+				break;
+		}
+	}
+
+	lvb_len = req_capsule_get_size(&req->rq_pill, &RMF_DLM_LVB, RCL_CLIENT);
+	if (lvb_len < 0) {
+		LDLM_ERROR(lock, "Fail to get lvb_len, rc = %d", lvb_len);
+		GOTO(out, rc = lvb_len);
+	} else if (lvb_len > 0) {
+		if (lock->l_lvb_len > 0) {
+			/* for extent lock, lvb contains ost_lvb{}. */
+			LASSERT(lock->l_lvb_data != NULL);
+
+			if (unlikely(lock->l_lvb_len < lvb_len)) {
+				LDLM_ERROR(lock, "Replied LVB is larger than "
+					   "expectation, expected = %d, "
+					   "replied = %d",
+					   lock->l_lvb_len, lvb_len);
+				GOTO(out, rc = -EINVAL);
+			}
+		} else if (ldlm_has_layout(lock)) { /* for layout lock, lvb has
+						     * variable length */
+			void *lvb_data;
+
+			OBD_ALLOC(lvb_data, lvb_len);
+			if (lvb_data == NULL) {
+				LDLM_ERROR(lock, "No memory: %d.\n", lvb_len);
+				GOTO(out, rc = -ENOMEM);
+			}
+
+			lock_res_and_lock(lock);
+			LASSERT(lock->l_lvb_data == NULL);
+			lock->l_lvb_data = lvb_data;
+			lock->l_lvb_len = lvb_len;
+			unlock_res_and_lock(lock);
+		}
+	}
+
+	lock_res_and_lock(lock);
+	if (lock->l_destroyed ||
+	    lock->l_granted_mode == lock->l_req_mode) {
+		/* bug 11300: the lock has already been granted */
+		unlock_res_and_lock(lock);
+		LDLM_DEBUG(lock, "Double grant race happened");
+		GOTO(out, rc = 0);
+	}
+
+	/* If we receive the completion AST before the actual enqueue returned,
+	 * then we might need to switch lock modes, resources, or extents. */
+	if (dlm_req->lock_desc.l_granted_mode != lock->l_req_mode) {
+		lock->l_req_mode = dlm_req->lock_desc.l_granted_mode;
+		LDLM_DEBUG(lock, "completion AST, new lock mode");
+	}
+
+	if (lock->l_resource->lr_type != LDLM_PLAIN) {
+		ldlm_convert_policy_to_local(req->rq_export,
+					  dlm_req->lock_desc.l_resource.lr_type,
+					  &dlm_req->lock_desc.l_policy_data,
+					  &lock->l_policy_data);
+		LDLM_DEBUG(lock, "completion AST, new policy data");
+	}
+
+	ldlm_resource_unlink_lock(lock);
+	if (memcmp(&dlm_req->lock_desc.l_resource.lr_name,
+		   &lock->l_resource->lr_name,
+		   sizeof(lock->l_resource->lr_name)) != 0) {
+		unlock_res_and_lock(lock);	/* the resource lock is dropped around the resource change */
+		rc = ldlm_lock_change_resource(ns, lock,
+				&dlm_req->lock_desc.l_resource.lr_name);
+		if (rc < 0) {
+			LDLM_ERROR(lock, "Failed to allocate resource");
+			GOTO(out, rc);
+		}
+		LDLM_DEBUG(lock, "completion AST, new resource");
+		CERROR("change resource!\n");
+		lock_res_and_lock(lock);
+	}
+
+	if (dlm_req->lock_flags & LDLM_FL_AST_SENT) {
+		/* BL_AST locks are not needed in LRU.
+		 * Let ldlm_cancel_lru() be fast. */
+		ldlm_lock_remove_from_lru(lock);
+		lock->l_flags |= LDLM_FL_CBPENDING | LDLM_FL_BL_AST;
+		LDLM_DEBUG(lock, "completion AST includes blocking AST");
+	}
+
+	if (lock->l_lvb_len > 0) {
+		rc = ldlm_fill_lvb(lock, &req->rq_pill, RCL_CLIENT,
+				   lock->l_lvb_data, lvb_len);
+		if (rc < 0) {
+			unlock_res_and_lock(lock);
+			GOTO(out, rc);
+		}
+	}
+
+	ldlm_grant_lock(lock, &ast_list);
+	unlock_res_and_lock(lock);
+
+	LDLM_DEBUG(lock, "callback handler finished, about to run_ast_work");
+
+	/* Let Enqueue to call osc_lock_upcall() and initialize
+	 * l_ast_data */
+	OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_ENQ_RACE, 2);
+
+	ldlm_run_ast_work(ns, &ast_list, LDLM_WORK_CP_AST);
+
+	LDLM_DEBUG_NOLOCK("client completion callback handler END (lock %p)",
+			  lock);
+	GOTO(out, rc);
+
+out:
+	if (rc < 0) {	/* failure: flag the lock and wake anybody sleeping on l_waitq */
+		lock_res_and_lock(lock);
+		lock->l_flags |= LDLM_FL_FAILED;
+		unlock_res_and_lock(lock);
+		wake_up(&lock->l_waitq);
+	}
+	LDLM_LOCK_RELEASE(lock);
+}
+
+/**
+ * Callback handler for receiving incoming glimpse ASTs.
+ *
+ * This only can happen on client side.  The glimpse AST of the lock, when
+ * set, is run and the request is answered: with a normal reply if a reply
+ * message exists, otherwise with an error reply carrying the AST result
+ * (-ENOSYS when the lock has no glimpse AST).
+ *
+ * Afterwards the lock is considered for dropping: a LCK_PW lock with no
+ * readers or writers whose l_last_used is more than 10 seconds old is
+ * handed to the blocking thread, or handled inline through
+ * ldlm_handle_bl_callback() if queueing fails.  In every other case the
+ * lock is simply released.
+ */
+static void ldlm_handle_gl_callback(struct ptlrpc_request *req,
+				    struct ldlm_namespace *ns,
+				    struct ldlm_request *dlm_req,
+				    struct ldlm_lock *lock)
+{
+	int rc = -ENOSYS;
+	int idle_pw;
+	ENTRY;
+
+	LDLM_DEBUG(lock, "client glimpse AST callback handler");
+
+	if (lock->l_glimpse_ast != NULL)
+		rc = lock->l_glimpse_ast(lock, req);
+
+	if (req->rq_repmsg == NULL) {
+		req->rq_status = rc;
+		ptlrpc_error(req);
+	} else {
+		ptlrpc_reply(req);
+	}
+
+	/* Decide under the resource lock whether the lock looks idle. */
+	lock_res_and_lock(lock);
+	idle_pw = lock->l_granted_mode == LCK_PW &&
+		  !lock->l_readers && !lock->l_writers &&
+		  cfs_time_after(cfs_time_current(),
+				 cfs_time_add(lock->l_last_used,
+					      cfs_time_seconds(10)));
+	unlock_res_and_lock(lock);
+
+	if (!idle_pw) {
+		LDLM_LOCK_RELEASE(lock);
+		EXIT;
+		return;
+	}
+
+	/* A non-zero result means the work item was not queued, so the
+	 * blocking callback is run right here instead. */
+	if (ldlm_bl_to_thread_lock(ns, NULL, lock))
+		ldlm_handle_bl_callback(ns, NULL, lock);
+
+	EXIT;
+}
+
+/*
+ * Send the reply for callback request \a req with status \a rc.
+ *
+ * Returns 0 without doing anything when the request wants no reply.
+ * Otherwise the status is recorded, the reply is packed unless it has
+ * already been packed in its final form, and the result of ptlrpc_reply()
+ * is returned; a packing failure is returned as-is.
+ */
+static int ldlm_callback_reply(struct ptlrpc_request *req, int rc)
+{
+	int pack_rc;
+
+	if (req->rq_no_reply)
+		return 0;
+
+	req->rq_status = rc;
+	if (req->rq_packed_final)
+		return ptlrpc_reply(req);
+
+	pack_rc = lustre_pack_reply(req, 1, NULL, NULL);
+	if (pack_rc != 0)
+		return pack_rc;
+
+	return ptlrpc_reply(req);
+}
+
+/*
+ * Queue work item \a blwi on the blocking-callback pool and wake the pool's
+ * wait queue.  Items whose lock carries LDLM_FL_DISCARD_DATA go to the
+ * priority list, everything else to the regular list.  Unless
+ * \a cancel_flags contains LCF_ASYNC, the caller then sleeps until the item
+ * is completed.  Always returns 0.
+ */
+static int __ldlm_bl_to_thread(struct ldlm_bl_work_item *blwi,
+			       ldlm_cancel_flags_t cancel_flags)
+{
+	struct ldlm_bl_pool *blp = ldlm_state->ldlm_bl_pool;
+	struct list_head    *queue;
+	ENTRY;
+
+	spin_lock(&blp->blp_lock);
+	/* LDLM_FL_DISCARD_DATA requests are queued on the priority list,
+	 * other blocking callbacks on the regular one */
+	queue = &blp->blp_list;
+	if (blwi->blwi_lock != NULL &&
+	    (blwi->blwi_lock->l_flags & LDLM_FL_DISCARD_DATA))
+		queue = &blp->blp_prio_list;
+	list_add_tail(&blwi->blwi_entry, queue);
+	spin_unlock(&blp->blp_lock);
+
+	wake_up(&blp->blp_waitq);
+
+	/* blwi->blwi_flags must not be looked at here: in LCF_ASYNC mode
+	   blwi may already have been freed */
+	if (cancel_flags & LCF_ASYNC)
+		RETURN(0);
+
+	wait_for_completion(&blwi->blwi_comp);
+
+	RETURN(0);
+}
+
+/*
+ * Fill in work item \a blwi.  With a non-zero \a count the caller's
+ * \a cancels list is moved onto blwi_head (leaving \a cancels empty);
+ * with a zero \a count the single \a lock is recorded instead.  Fields not
+ * assigned here are left as the caller prepared them.
+ */
+static inline void init_blwi(struct ldlm_bl_work_item *blwi,
+			     struct ldlm_namespace *ns,
+			     struct ldlm_lock_desc *ld,
+			     struct list_head *cancels, int count,
+			     struct ldlm_lock *lock,
+			     ldlm_cancel_flags_t cancel_flags)
+{
+	init_completion(&blwi->blwi_comp);
+	INIT_LIST_HEAD(&blwi->blwi_head);
+
+	blwi->blwi_ns = ns;
+	blwi->blwi_flags = cancel_flags;
+	if (memory_pressure_get())
+		blwi->blwi_mem_pressure = 1;
+
+	if (ld != NULL)
+		blwi->blwi_ld = *ld;
+
+	if (count == 0) {
+		blwi->blwi_lock = lock;
+		return;
+	}
+
+	/* Insert blwi_head next to the old head, then unlink the old head:
+	 * the entries now hang off blwi_head and cancels is empty. */
+	list_add(&blwi->blwi_head, cancels);
+	list_del_init(cancels);
+	blwi->blwi_count = count;
+}
+
+/**
+ * Hand work over to a blocking thread: either the \a count locks linked on
+ * \a cancels or, when \a count is zero, the single lock \a lock.
+ *
+ * A non-NULL \a cancels combined with a zero \a count is a no-op.
+ *
+ * With LCF_ASYNC in \a cancel_flags the work item is heap-allocated and the
+ * call returns once it is queued; otherwise the item lives on this stack
+ * frame and the call returns only after the item has been processed.
+ *
+ * The blocking thread would then call ->l_blocking_ast callback in the lock.
+ * If queueing fails an error is returned and the caller is supposed to call
+ * ->l_blocking_ast itself.
+ *
+ * \retval 0 on success (or nothing to do)
+ * \retval -ENOMEM if the asynchronous work item cannot be allocated
+ */
+static int ldlm_bl_to_thread(struct ldlm_namespace *ns,
+			     struct ldlm_lock_desc *ld,
+			     struct ldlm_lock *lock,
+			     struct list_head *cancels, int count,
+			     ldlm_cancel_flags_t cancel_flags)
+{
+	struct ldlm_bl_work_item sync_item;
+	struct ldlm_bl_work_item *async_item;
+	ENTRY;
+
+	if (cancels && count == 0)
+		RETURN(0);
+
+	if (!(cancel_flags & LCF_ASYNC)) {
+		/* synchronous calls may come from the kernel shrinker, so
+		 * avoid allocating memory and use the stack instead */
+		memset(&sync_item, 0, sizeof(sync_item));
+		init_blwi(&sync_item, ns, ld, cancels, count, lock,
+			  cancel_flags);
+		RETURN(__ldlm_bl_to_thread(&sync_item, cancel_flags));
+	}
+
+	OBD_ALLOC(async_item, sizeof(*async_item));
+	if (async_item == NULL)
+		RETURN(-ENOMEM);
+	init_blwi(async_item, ns, ld, cancels, count, lock, cancel_flags);
+
+	RETURN(__ldlm_bl_to_thread(async_item, cancel_flags));
+}
+
+
+/**
+ * Queue the single lock \a lock for asynchronous (LCF_ASYNC) processing by
+ * a blocking thread.
+ *
+ * \retval result of ldlm_bl_to_thread()
+ */
+int ldlm_bl_to_thread_lock(struct ldlm_namespace *ns, struct ldlm_lock_desc *ld,
+			   struct ldlm_lock *lock)
+{
+	int rc;
+
+	rc = ldlm_bl_to_thread(ns, ld, lock, NULL, 0, LCF_ASYNC);
+	return rc;
+}
+
+/**
+ * Queue the \a count locks linked on \a cancels for processing by a blocking
+ * thread, synchronously or asynchronously as selected by \a cancel_flags.
+ *
+ * \retval result of ldlm_bl_to_thread()
+ */
+int ldlm_bl_to_thread_list(struct ldlm_namespace *ns, struct ldlm_lock_desc *ld,
+			   struct list_head *cancels, int count,
+			   ldlm_cancel_flags_t cancel_flags)
+{
+	int rc;
+
+	rc = ldlm_bl_to_thread(ns, ld, NULL, cancels, count, cancel_flags);
+	return rc;
+}
+
+/**
+ * Setinfo coming from Server (eg MDT) to Client (eg MDC)!
+ *
+ * Extracts the key and value buffers from the request capsule and, for
+ * KEY_HSM_COPYTOOL_SEND, forwards the value to the request's export through
+ * obd_set_info_async().  Any other key is logged and ignored.
+ *
+ * \retval -EFAULT if the key or the value buffer is missing
+ * \retval -ENOSYS if the key is not recognized
+ * \retval result of obd_set_info_async() otherwise
+ */
+static int ldlm_handle_setinfo(struct ptlrpc_request *req)
+{
+	struct obd_device *obd = req->rq_export->exp_obd;
+	char *key;
+	void *val;
+	int keylen, vallen;
+	int rc = -ENOSYS;
+	ENTRY;
+
+	DEBUG_REQ(D_HSM, req, "%s: handle setinfo\n", obd->obd_name);
+
+	req_capsule_set(&req->rq_pill, &RQF_OBD_SET_INFO);
+
+	key = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_KEY);
+	if (key == NULL) {
+		DEBUG_REQ(D_IOCTL, req, "no set_info key");
+		RETURN(-EFAULT);
+	}
+	keylen = req_capsule_get_size(&req->rq_pill, &RMF_SETINFO_KEY,
+				      RCL_CLIENT);
+	val = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_VAL);
+	if (val == NULL) {
+		DEBUG_REQ(D_IOCTL, req, "no set_info val");
+		RETURN(-EFAULT);
+	}
+	vallen = req_capsule_get_size(&req->rq_pill, &RMF_SETINFO_VAL,
+				      RCL_CLIENT);
+
+	/* We are responsible for swabbing contents of val */
+
+	/* NOTE(review): KEY_IS() is not passed key/keylen explicitly;
+	 * presumably the macro picks up the locals of those names - confirm
+	 * before renaming or removing them. */
+	if (KEY_IS(KEY_HSM_COPYTOOL_SEND))
+		/* Pass it on to mdc (the "export" in this case) */
+		rc = obd_set_info_async(req->rq_svc_thread->t_env,
+					req->rq_export,
+					sizeof(KEY_HSM_COPYTOOL_SEND),
+					KEY_HSM_COPYTOOL_SEND,
+					vallen, val, NULL);
+	else
+		DEBUG_REQ(D_WARNING, req, "ignoring unknown key %s", key);
+
+	return rc;
+}
+
+/**
+ * Log the outcome of replying to callback request \a req.
+ *
+ * The debug line is emitted at D_WARNING level when no reply was sent or
+ * \a rc is non-zero, and at D_DLMTRACE level otherwise.  In the two failure
+ * cases an additional console warning is printed.  \a handle may be NULL,
+ * in which case the lock cookie is printed as 0.
+ */
+static inline void ldlm_callback_errmsg(struct ptlrpc_request *req,
+					const char *msg, int rc,
+					struct lustre_handle *handle)
+{
+	DEBUG_REQ((!req->rq_no_reply && !rc) ? D_DLMTRACE : D_WARNING, req,
+		  "%s: [nid %s] [rc %d] [lock "LPX64"]",
+		  msg, libcfs_id2str(req->rq_peer), rc,
+		  handle ? handle->cookie : 0);
+
+	if (req->rq_no_reply) {
+		CWARN("No reply was sent, maybe cause bug 21636.\n");
+		return;
+	}
+	if (rc)
+		CWARN("Send reply failed, maybe cause bug 21636.\n");
+}
+
+/**
+ * Handle an OBD_QC_CALLBACK request: copy the qc_stat field of the
+ * obd_quotactl carried by \a req into cl_qchk_stat of the client obd behind
+ * the request's export.
+ *
+ * \retval 0 on success
+ * \retval -EPROTO if the obd_quotactl buffer cannot be obtained
+ */
+static int ldlm_handle_qc_callback(struct ptlrpc_request *req)
+{
+	struct obd_quotactl *oqctl;
+
+	oqctl = req_capsule_client_get(&req->rq_pill, &RMF_OBD_QUOTACTL);
+	if (!oqctl) {
+		CERROR("Can't unpack obd_quotactl\n");
+		RETURN(-EPROTO);
+	}
+
+	req->rq_export->exp_obd->u.cli.cl_qchk_stat = oqctl->qc_stat;
+	return 0;
+}
+
+/**
+ * Request handler of the "ldlm_cbd" callback service (installed as
+ * so_req_handler by ldlm_setup()).
+ *
+ * Dispatches on the request opcode.  LDLM_SET_INFO, the llog opcodes,
+ * OBD_LOG_CANCEL and OBD_QC_CALLBACK are handled and replied to directly.
+ * For the blocking, completion and glimpse callbacks the lock named by the
+ * first handle of the ldlm_request is looked up, AST flags from the wire are
+ * merged into it, and the request is passed on to the matching
+ * ldlm_handle_*_callback() (blocking callbacks are first offered to a
+ * blocking thread).
+ *
+ * Always returns 0; failures are reported to the peer through the reply
+ * status set by ldlm_callback_reply().
+ */
+/* TODO: handle requests in a similar way as MDT: see mdt_handle_common() */
+static int ldlm_callback_handler(struct ptlrpc_request *req)
+{
+	struct ldlm_namespace *ns;
+	struct ldlm_request *dlm_req;
+	struct ldlm_lock *lock;
+	int rc;
+	ENTRY;
+
+	/* Requests arrive in sender's byte order.  The ptlrpc service
+	 * handler has already checked and, if necessary, byte-swapped the
+	 * incoming request message body, but I am responsible for the
+	 * message buffers. */
+
+	/* do nothing for sec context finalize */
+	if (lustre_msg_get_opc(req->rq_reqmsg) == SEC_CTX_FINI)
+		RETURN(0);
+
+	req_capsule_init(&req->rq_pill, req, RCL_SERVER);
+
+	if (req->rq_export == NULL) {
+		rc = ldlm_callback_reply(req, -ENOTCONN);
+		ldlm_callback_errmsg(req, "Operate on unconnected server",
+				     rc, NULL);
+		RETURN(0);
+	}
+
+	LASSERT(req->rq_export != NULL);
+	LASSERT(req->rq_export->exp_obd != NULL);
+
+	/* Opcodes other than the three lock callbacks are completely handled
+	 * (and replied to) inside this switch.  The lock callbacks only check
+	 * their fail-injection point here and continue with the common code
+	 * below. */
+	switch (lustre_msg_get_opc(req->rq_reqmsg)) {
+	case LDLM_BL_CALLBACK:
+		if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_BL_CALLBACK_NET))
+			RETURN(0);
+		break;
+	case LDLM_CP_CALLBACK:
+		if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CP_CALLBACK_NET))
+			RETURN(0);
+		break;
+	case LDLM_GL_CALLBACK:
+		if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_GL_CALLBACK_NET))
+			RETURN(0);
+		break;
+	case LDLM_SET_INFO:
+		rc = ldlm_handle_setinfo(req);
+		ldlm_callback_reply(req, rc);
+		RETURN(0);
+	case OBD_LOG_CANCEL: /* remove this eventually - for 1.4.0 compat */
+		CERROR("shouldn't be handling OBD_LOG_CANCEL on DLM thread\n");
+		req_capsule_set(&req->rq_pill, &RQF_LOG_CANCEL);
+		if (OBD_FAIL_CHECK(OBD_FAIL_OBD_LOG_CANCEL_NET))
+			RETURN(0);
+		rc = llog_origin_handle_cancel(req);
+		if (OBD_FAIL_CHECK(OBD_FAIL_OBD_LOG_CANCEL_REP))
+			RETURN(0);
+		ldlm_callback_reply(req, rc);
+		RETURN(0);
+	case LLOG_ORIGIN_HANDLE_CREATE:
+		req_capsule_set(&req->rq_pill, &RQF_LLOG_ORIGIN_HANDLE_CREATE);
+		if (OBD_FAIL_CHECK(OBD_FAIL_OBD_LOGD_NET))
+			RETURN(0);
+		rc = llog_origin_handle_open(req);
+		ldlm_callback_reply(req, rc);
+		RETURN(0);
+	case LLOG_ORIGIN_HANDLE_NEXT_BLOCK:
+		req_capsule_set(&req->rq_pill,
+				&RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK);
+		if (OBD_FAIL_CHECK(OBD_FAIL_OBD_LOGD_NET))
+			RETURN(0);
+		rc = llog_origin_handle_next_block(req);
+		ldlm_callback_reply(req, rc);
+		RETURN(0);
+	case LLOG_ORIGIN_HANDLE_READ_HEADER:
+		req_capsule_set(&req->rq_pill,
+				&RQF_LLOG_ORIGIN_HANDLE_READ_HEADER);
+		if (OBD_FAIL_CHECK(OBD_FAIL_OBD_LOGD_NET))
+			RETURN(0);
+		rc = llog_origin_handle_read_header(req);
+		ldlm_callback_reply(req, rc);
+		RETURN(0);
+	case LLOG_ORIGIN_HANDLE_CLOSE:
+		if (OBD_FAIL_CHECK(OBD_FAIL_OBD_LOGD_NET))
+			RETURN(0);
+		rc = llog_origin_handle_close(req);
+		ldlm_callback_reply(req, rc);
+		RETURN(0);
+	case OBD_QC_CALLBACK:
+		req_capsule_set(&req->rq_pill, &RQF_QC_CALLBACK);
+		if (OBD_FAIL_CHECK(OBD_FAIL_OBD_QC_CALLBACK_NET))
+			RETURN(0);
+		rc = ldlm_handle_qc_callback(req);
+		ldlm_callback_reply(req, rc);
+		RETURN(0);
+	default:
+		CERROR("unknown opcode %u\n",
+		       lustre_msg_get_opc(req->rq_reqmsg));
+		ldlm_callback_reply(req, -EPROTO);
+		RETURN(0);
+	}
+
+	/* Only LDLM_BL_CALLBACK, LDLM_CP_CALLBACK and LDLM_GL_CALLBACK reach
+	 * this point. */
+	ns = req->rq_export->exp_obd->obd_namespace;
+	LASSERT(ns != NULL);
+
+	req_capsule_set(&req->rq_pill, &RQF_LDLM_CALLBACK);
+
+	dlm_req = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ);
+	if (dlm_req == NULL) {
+		rc = ldlm_callback_reply(req, -EPROTO);
+		ldlm_callback_errmsg(req, "Operate without parameter", rc,
+				     NULL);
+		RETURN(0);
+	}
+
+	/* Force a known safe race, send a cancel to the server for a lock
+	 * which the server has already started a blocking callback on. */
+	if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_BL_CB_RACE) &&
+	    lustre_msg_get_opc(req->rq_reqmsg) == LDLM_BL_CALLBACK) {
+		rc = ldlm_cli_cancel(&dlm_req->lock_handle[0], 0);
+		if (rc < 0)
+			CERROR("ldlm_cli_cancel: %d\n", rc);
+	}
+
+	lock = ldlm_handle2lock_long(&dlm_req->lock_handle[0], 0);
+	if (!lock) {
+		CDEBUG(D_DLMTRACE, "callback on lock "LPX64" - lock "
+		       "disappeared\n", dlm_req->lock_handle[0].cookie);
+		rc = ldlm_callback_reply(req, -EINVAL);
+		ldlm_callback_errmsg(req, "Operate with invalid parameter", rc,
+				     &dlm_req->lock_handle[0]);
+		RETURN(0);
+	}
+
+	if ((lock->l_flags & LDLM_FL_FAIL_LOC) &&
+	    lustre_msg_get_opc(req->rq_reqmsg) == LDLM_BL_CALLBACK)
+		OBD_RACE(OBD_FAIL_LDLM_CP_BL_RACE);
+
+	/* Copy hints/flags (e.g. LDLM_FL_DISCARD_DATA) from AST. */
+	lock_res_and_lock(lock);
+	lock->l_flags |= ldlm_flags_from_wire(dlm_req->lock_flags &
+					      LDLM_AST_FLAGS);
+	if (lustre_msg_get_opc(req->rq_reqmsg) == LDLM_BL_CALLBACK) {
+		/* If somebody cancels lock and cache is already dropped,
+		 * or lock is failed before cp_ast received on client,
+		 * we can tell the server we have no lock. Otherwise, we
+		 * should send cancel after dropping the cache. */
+		if (((lock->l_flags & LDLM_FL_CANCELING) &&
+		    (lock->l_flags & LDLM_FL_BL_DONE)) ||
+		    (lock->l_flags & LDLM_FL_FAILED)) {
+			LDLM_DEBUG(lock, "callback on lock "
+				   LPX64" - lock disappeared\n",
+				   dlm_req->lock_handle[0].cookie);
+			unlock_res_and_lock(lock);
+			LDLM_LOCK_RELEASE(lock);
+			rc = ldlm_callback_reply(req, -EINVAL);
+			ldlm_callback_errmsg(req, "Operate on stale lock", rc,
+					     &dlm_req->lock_handle[0]);
+			RETURN(0);
+		}
+		/* BL_AST locks are not needed in LRU.
+		 * Let ldlm_cancel_lru() be fast. */
+		ldlm_lock_remove_from_lru(lock);
+		lock->l_flags |= LDLM_FL_BL_AST;
+	}
+	unlock_res_and_lock(lock);
+
+	/* We want the ost thread to get this reply so that it can respond
+	 * to ost requests (write cache writeback) that might be triggered
+	 * in the callback.
+	 *
+	 * But we'd also like to be able to indicate in the reply that we're
+	 * cancelling right now, because it's unused, or have an intent result
+	 * in the reply, so we might have to push the responsibility for sending
+	 * the reply down into the AST handlers, alas. */
+
+	switch (lustre_msg_get_opc(req->rq_reqmsg)) {
+	case LDLM_BL_CALLBACK:
+		CDEBUG(D_INODE, "blocking ast\n");
+		req_capsule_extend(&req->rq_pill, &RQF_LDLM_BL_CALLBACK);
+		/* No reply is sent from here when the lock has
+		 * LDLM_FL_CANCEL_ON_BLOCK set. */
+		if (!(lock->l_flags & LDLM_FL_CANCEL_ON_BLOCK)) {
+			rc = ldlm_callback_reply(req, 0);
+			if (req->rq_no_reply || rc)
+				ldlm_callback_errmsg(req, "Normal process", rc,
+						     &dlm_req->lock_handle[0]);
+		}
+		/* Offer the lock to a blocking thread; if queueing fails run
+		 * the blocking callback in this thread instead. */
+		if (ldlm_bl_to_thread_lock(ns, &dlm_req->lock_desc, lock))
+			ldlm_handle_bl_callback(ns, &dlm_req->lock_desc, lock);
+		break;
+	case LDLM_CP_CALLBACK:
+		CDEBUG(D_INODE, "completion ast\n");
+		req_capsule_extend(&req->rq_pill, &RQF_LDLM_CP_CALLBACK);
+		ldlm_callback_reply(req, 0);
+		ldlm_handle_cp_callback(req, ns, dlm_req, lock);
+		break;
+	case LDLM_GL_CALLBACK:
+		CDEBUG(D_INODE, "glimpse ast\n");
+		req_capsule_extend(&req->rq_pill, &RQF_LDLM_GL_CALLBACK);
+		ldlm_handle_gl_callback(req, ns, dlm_req, lock);
+		break;
+	default:
+		LBUG();			 /* checked above */
+	}
+
+	RETURN(0);
+}
+
+
+/**
+ * Dequeue the next work item of pool \a blp, or return NULL if both queues
+ * are empty.
+ *
+ * The priority list is preferred, but whenever the function-static counter
+ * num_bl is zero the regular list is served first.  The counter is bumped on
+ * every dequeue and wraps to zero once it reaches blp_num_threads, so a
+ * request from blp_list is processed at least every blp_num_threads
+ * dequeues.
+ */
+static struct ldlm_bl_work_item *ldlm_bl_get_work(struct ldlm_bl_pool *blp)
+{
+	struct ldlm_bl_work_item *blwi = NULL;
+	struct list_head *src = NULL;
+	static unsigned int num_bl = 0;
+	int have_regular;
+	int have_prio;
+
+	spin_lock(&blp->blp_lock);
+	have_regular = !list_empty(&blp->blp_list);
+	have_prio = !list_empty(&blp->blp_prio_list);
+
+	if (have_regular && (!have_prio || num_bl == 0))
+		src = &blp->blp_list;
+	else if (have_prio)
+		src = &blp->blp_prio_list;
+
+	if (src != NULL) {
+		blwi = list_entry(src->next, struct ldlm_bl_work_item,
+				  blwi_entry);
+		if (++num_bl >= atomic_read(&blp->blp_num_threads))
+			num_bl = 0;
+		list_del(&blwi->blwi_entry);
+	}
+	spin_unlock(&blp->blp_lock);
+
+	return blwi;
+}
+
+/* This only contains temporary data until the thread starts.  It lives on
+ * the stack of ldlm_bl_thread_start() and is handed to the new thread as its
+ * argument. */
+struct ldlm_bl_thread_data {
+	/* thread name, formatted as "ldlm_bl_%02d" by ldlm_bl_thread_start() */
+	char			bltd_name[CFS_CURPROC_COMM_MAX];
+	/* pool the new thread is going to serve */
+	struct ldlm_bl_pool	*bltd_blp;
+	/* completed by ldlm_bl_thread_main() once it is done with this struct */
+	struct completion	bltd_comp;
+	/* blp_num_threads as sampled by the starter; number used in bltd_name */
+	int			bltd_num;
+};
+
+static int ldlm_bl_thread_main(void *arg);
+
+/**
+ * Start one more blocking thread for pool \a blp and wait until it has
+ * picked up its startup data.
+ *
+ * The startup data lives on this function's stack, so the function must not
+ * return before the new thread has signalled bltd_comp.
+ *
+ * \retval 0 on success
+ * \retval negative errno from kthread_run() if the thread cannot be created
+ */
+static int ldlm_bl_thread_start(struct ldlm_bl_pool *blp)
+{
+	struct ldlm_bl_thread_data bltd = { .bltd_blp = blp };
+	task_t *task;
+
+	init_completion(&bltd.bltd_comp);
+	bltd.bltd_num = atomic_read(&blp->blp_num_threads);
+	snprintf(bltd.bltd_name, sizeof(bltd.bltd_name) - 1,
+		"ldlm_bl_%02d", bltd.bltd_num);
+	/* kthread_run() takes a printf-style name format, so the prepared
+	 * name must be passed as an argument and never as the format */
+	task = kthread_run(ldlm_bl_thread_main, &bltd, "%s", bltd.bltd_name);
+	if (IS_ERR(task)) {
+		/* report the number the name was built from rather than
+		 * re-reading the counter, which may have changed meanwhile */
+		CERROR("cannot start LDLM thread ldlm_bl_%02d: rc %ld\n",
+		       bltd.bltd_num, PTR_ERR(task));
+		return PTR_ERR(task);
+	}
+	wait_for_completion(&bltd.bltd_comp);
+
+	return 0;
+}
+
+/**
+ * Main blocking requests processing thread.
+ *
+ * Callers put locks into its queue by calling ldlm_bl_to_thread.
+ * This thread in the end ends up doing actual call to ->l_blocking_ast
+ * for queued locks.
+ *
+ * \a arg points to a struct ldlm_bl_thread_data on the starter's stack; it
+ * is only valid until bltd_comp has been completed.  The loop ends when a
+ * work item with a NULL namespace is dequeued (queued by ldlm_cleanup()).
+ *
+ * \retval 0 always
+ */
+static int ldlm_bl_thread_main(void *arg)
+{
+	struct ldlm_bl_pool *blp;
+	ENTRY;
+
+	{
+		struct ldlm_bl_thread_data *bltd = arg;
+
+		blp = bltd->bltd_blp;
+
+		atomic_inc(&blp->blp_num_threads);
+		atomic_inc(&blp->blp_busy_threads);
+
+		complete(&bltd->bltd_comp);
+		/* cannot use bltd after this, it is only on caller's stack */
+	}
+
+	while (1) {
+		struct l_wait_info lwi = { 0 };
+		struct ldlm_bl_work_item *blwi = NULL;
+		int busy;
+
+		blwi = ldlm_bl_get_work(blp);
+
+		if (blwi == NULL) {
+			/* Nothing queued: stop counting as busy while waiting
+			 * on the pool waitqueue for a work item. */
+			atomic_dec(&blp->blp_busy_threads);
+			l_wait_event_exclusive(blp->blp_waitq,
+					 (blwi = ldlm_bl_get_work(blp)) != NULL,
+					 &lwi);
+			busy = atomic_inc_return(&blp->blp_busy_threads);
+		} else {
+			busy = atomic_read(&blp->blp_busy_threads);
+		}
+
+		if (blwi->blwi_ns == NULL)
+			/* added by ldlm_cleanup() */
+			break;
+
+		/* Every existing thread is busy and the pool may still grow:
+		 * start another thread, unless the item was queued under
+		 * memory pressure. */
+		/* Not fatal if racy and have a few too many threads */
+		if (unlikely(busy < blp->blp_max_threads &&
+			     busy >= atomic_read(&blp->blp_num_threads) &&
+			     !blwi->blwi_mem_pressure))
+			/* discard the return value, we tried */
+			ldlm_bl_thread_start(blp);
+
+		/* Carry the submitter's memory-pressure state over to this
+		 * thread for the duration of the work. */
+		if (blwi->blwi_mem_pressure)
+			memory_pressure_set();
+
+		if (blwi->blwi_count) {
+			int count;
+			/* The special case when we cancel locks in LRU
+			 * asynchronously, we pass the list of locks here.
+			 * Thus locks are marked LDLM_FL_CANCELING, but NOT
+			 * canceled locally yet. */
+			count = ldlm_cli_cancel_list_local(&blwi->blwi_head,
+							   blwi->blwi_count,
+							   LCF_BL_AST);
+			ldlm_cli_cancel_list(&blwi->blwi_head, count, NULL,
+					     blwi->blwi_flags);
+		} else {
+			ldlm_handle_bl_callback(blwi->blwi_ns, &blwi->blwi_ld,
+						blwi->blwi_lock);
+		}
+		if (blwi->blwi_mem_pressure)
+			memory_pressure_clr();
+
+		/* Asynchronous items were heap-allocated by the submitter and
+		 * are freed here; a synchronous submitter owns its item and is
+		 * waiting on blwi_comp. */
+		if (blwi->blwi_flags & LCF_ASYNC)
+			OBD_FREE(blwi, sizeof(*blwi));
+		else
+			complete(&blwi->blwi_comp);
+	}
+
+	atomic_dec(&blp->blp_busy_threads);
+	atomic_dec(&blp->blp_num_threads);
+	/* ldlm_cleanup() waits on blp_comp for this thread to go away */
+	complete(&blp->blp_comp);
+	RETURN(0);
+}
+
+
+static int ldlm_setup(void);
+static int ldlm_cleanup(void);
+
+/**
+ * Take a reference on the global LDLM state.  The first reference runs
+ * ldlm_setup(); if that fails the reference is dropped again.
+ *
+ * \retval 0 on success
+ * \retval result of ldlm_setup() if the first reference could not set up
+ */
+int ldlm_get_ref(void)
+{
+	int rc = 0;
+	ENTRY;
+
+	mutex_lock(&ldlm_ref_mutex);
+	ldlm_refcount++;
+	if (ldlm_refcount == 1) {
+		rc = ldlm_setup();
+		if (rc != 0)
+			ldlm_refcount--;
+	}
+	mutex_unlock(&ldlm_ref_mutex);
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(ldlm_get_ref);
+
+/**
+ * Drop a reference on the global LDLM state.  Dropping the last reference
+ * runs ldlm_cleanup(); if that fails the error is logged and the reference
+ * count is left unchanged.
+ */
+void ldlm_put_ref(void)
+{
+	int rc = 0;
+	ENTRY;
+
+	mutex_lock(&ldlm_ref_mutex);
+	if (ldlm_refcount == 1)
+		rc = ldlm_cleanup();
+
+	if (rc != 0)
+		CERROR("ldlm_cleanup failed: %d\n", rc);
+	else
+		ldlm_refcount--;
+	mutex_unlock(&ldlm_ref_mutex);
+
+	EXIT;
+}
+EXPORT_SYMBOL(ldlm_put_ref);
+
+/*
+ * Export handle<->lock hash operations.
+ */
+/* Hash callback: hash the cookie of the lustre_handle passed as \a key. */
+static unsigned
+ldlm_export_lock_hash(cfs_hash_t *hs, const void *key, unsigned mask)
+{
+	const struct lustre_handle *handle = key;
+
+	return cfs_hash_u64_hash(handle->cookie, mask);
+}
+
+/* Key callback: the key of a hashed lock is its l_remote_handle. */
+static void *
+ldlm_export_lock_key(struct hlist_node *hnode)
+{
+	struct ldlm_lock *lock = hlist_entry(hnode, struct ldlm_lock,
+					     l_exp_hash);
+
+	return &lock->l_remote_handle;
+}
+
+/* Key-copy callback: store the lustre_handle at \a key as the lock's
+ * l_remote_handle. */
+static void
+ldlm_export_lock_keycpy(struct hlist_node *hnode, void *key)
+{
+	struct ldlm_lock *lock = hlist_entry(hnode, struct ldlm_lock,
+					     l_exp_hash);
+	struct lustre_handle *new_handle = key;
+
+	lock->l_remote_handle = *new_handle;
+}
+
+/* Key-compare callback: returns the result of lustre_handle_equal() on the
+ * hashed lock's key and \a key. */
+static int
+ldlm_export_lock_keycmp(const void *key, struct hlist_node *hnode)
+{
+	void *stored = ldlm_export_lock_key(hnode);
+
+	return lustre_handle_equal(stored, key);
+}
+
+/* Object callback: map the hash node back to the ldlm_lock embedding it. */
+static void *
+ldlm_export_lock_object(struct hlist_node *hnode)
+{
+	struct ldlm_lock *lock = hlist_entry(hnode, struct ldlm_lock,
+					     l_exp_hash);
+
+	return lock;
+}
+
+/* Get callback: LDLM_LOCK_GET() the lock that embeds \a hnode. */
+static void
+ldlm_export_lock_get(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+	struct ldlm_lock *lock = hlist_entry(hnode, struct ldlm_lock,
+					     l_exp_hash);
+
+	LDLM_LOCK_GET(lock);
+}
+
+/* Put callback (also used as hs_put_locked): LDLM_LOCK_RELEASE() the lock
+ * that embeds \a hnode. */
+static void
+ldlm_export_lock_put(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+	struct ldlm_lock *lock = hlist_entry(hnode, struct ldlm_lock,
+					     l_exp_hash);
+
+	LDLM_LOCK_RELEASE(lock);
+}
+
+/* Callback table for the per-export lock hash created in
+ * ldlm_init_export(). */
+static cfs_hash_ops_t ldlm_export_lock_ops = {
+	.hs_hash	= ldlm_export_lock_hash,
+	.hs_key		= ldlm_export_lock_key,
+	.hs_keycmp	= ldlm_export_lock_keycmp,
+	.hs_keycpy	= ldlm_export_lock_keycpy,
+	.hs_object	= ldlm_export_lock_object,
+	.hs_get		= ldlm_export_lock_get,
+	.hs_put		= ldlm_export_lock_put,
+	.hs_put_locked	= ldlm_export_lock_put,
+};
+
+/**
+ * Create the lock hash of export \a exp.  The hash is named after the
+ * export's client UUID and uses ldlm_export_lock_ops.
+ *
+ * \retval 0 on success
+ * \retval -ENOMEM if the hash cannot be created
+ */
+int ldlm_init_export(struct obd_export *exp)
+{
+	ENTRY;
+
+	exp->exp_lock_hash = cfs_hash_create(
+				obd_uuid2str(&exp->exp_client_uuid),
+				HASH_EXP_LOCK_CUR_BITS,
+				HASH_EXP_LOCK_MAX_BITS,
+				HASH_EXP_LOCK_BKT_BITS, 0,
+				CFS_HASH_MIN_THETA, CFS_HASH_MAX_THETA,
+				&ldlm_export_lock_ops,
+				CFS_HASH_DEFAULT | CFS_HASH_REHASH_KEY |
+				CFS_HASH_NBLK_CHANGE);
+	if (exp->exp_lock_hash == NULL)
+		RETURN(-ENOMEM);
+
+	RETURN(0);
+}
+EXPORT_SYMBOL(ldlm_init_export);
+
+/**
+ * Release the per-export lock state of \a exp: drop the reference on the
+ * export's lock hash, clear the pointer, then call
+ * ldlm_destroy_flock_export().
+ */
+void ldlm_destroy_export(struct obd_export *exp)
+{
+	ENTRY;
+	cfs_hash_putref(exp->exp_lock_hash);
+	exp->exp_lock_hash = NULL;
+
+	ldlm_destroy_flock_export(exp);
+	EXIT;
+}
+EXPORT_SYMBOL(ldlm_destroy_export);
+
+/**
+ * Bring up the global LDLM state: allocate ldlm_state, register the
+ * "ldlm_cbd" callback service, create the blocking-thread pool and start
+ * its initial threads, then initialize the LDLM pools.
+ *
+ * Every failure after ldlm_state has been allocated goes through
+ * ldlm_cleanup() before returning.
+ *
+ * \retval 0 on success
+ * \retval -EALREADY if ldlm_state is already set
+ * \retval -ENOMEM on allocation failure
+ * \retval other error code propagated from the step that failed
+ */
+static int ldlm_setup(void)
+{
+	static struct ptlrpc_service_conf	conf;
+	struct ldlm_bl_pool			*blp = NULL;
+	int rc = 0;
+	int i;
+	ENTRY;
+
+	if (ldlm_state != NULL)
+		RETURN(-EALREADY);
+
+	OBD_ALLOC(ldlm_state, sizeof(*ldlm_state));
+	if (ldlm_state == NULL)
+		RETURN(-ENOMEM);
+
+#ifdef LPROCFS
+	rc = ldlm_proc_setup();
+	if (rc != 0)
+		GOTO(out, rc);
+#endif
+
+	/* Configuration of the callback service; ldlm_callback_handler()
+	 * is installed as its request handler. */
+	memset(&conf, 0, sizeof(conf));
+	conf = (typeof(conf)) {
+		.psc_name		= "ldlm_cbd",
+		.psc_watchdog_factor	= 2,
+		.psc_buf		= {
+			.bc_nbufs		= LDLM_CLIENT_NBUFS,
+			.bc_buf_size		= LDLM_BUFSIZE,
+			.bc_req_max_size	= LDLM_MAXREQSIZE,
+			.bc_rep_max_size	= LDLM_MAXREPSIZE,
+			.bc_req_portal		= LDLM_CB_REQUEST_PORTAL,
+			.bc_rep_portal		= LDLM_CB_REPLY_PORTAL,
+		},
+		.psc_thr		= {
+			.tc_thr_name		= "ldlm_cb",
+			.tc_thr_factor		= LDLM_THR_FACTOR,
+			.tc_nthrs_init		= LDLM_NTHRS_INIT,
+			.tc_nthrs_base		= LDLM_NTHRS_BASE,
+			.tc_nthrs_max		= LDLM_NTHRS_MAX,
+			.tc_nthrs_user		= ldlm_num_threads,
+			.tc_cpu_affinity	= 1,
+			.tc_ctx_tags		= LCT_MD_THREAD | LCT_DT_THREAD,
+		},
+		.psc_cpt		= {
+			.cc_pattern		= ldlm_cpts,
+		},
+		.psc_ops		= {
+			.so_req_handler		= ldlm_callback_handler,
+		},
+	};
+	ldlm_state->ldlm_cb_service = \
+			ptlrpc_register_service(&conf, ldlm_svc_proc_dir);
+	if (IS_ERR(ldlm_state->ldlm_cb_service)) {
+		CERROR("failed to start service\n");
+		rc = PTR_ERR(ldlm_state->ldlm_cb_service);
+		/* reset so that ldlm_cleanup() does not try to unregister an
+		 * error pointer */
+		ldlm_state->ldlm_cb_service = NULL;
+		GOTO(out, rc);
+	}
+
+
+	OBD_ALLOC(blp, sizeof(*blp));
+	if (blp == NULL)
+		GOTO(out, rc = -ENOMEM);
+	ldlm_state->ldlm_bl_pool = blp;
+
+	spin_lock_init(&blp->blp_lock);
+	INIT_LIST_HEAD(&blp->blp_list);
+	INIT_LIST_HEAD(&blp->blp_prio_list);
+	init_waitqueue_head(&blp->blp_waitq);
+	atomic_set(&blp->blp_num_threads, 0);
+	atomic_set(&blp->blp_busy_threads, 0);
+
+	/* Without a user-supplied thread count the pool may grow from
+	 * LDLM_NTHRS_INIT up to LDLM_NTHRS_MAX; otherwise it is fixed at the
+	 * requested count clamped to that range. */
+	if (ldlm_num_threads == 0) {
+		blp->blp_min_threads = LDLM_NTHRS_INIT;
+		blp->blp_max_threads = LDLM_NTHRS_MAX;
+	} else {
+		blp->blp_min_threads = blp->blp_max_threads = \
+			min_t(int, LDLM_NTHRS_MAX, max_t(int, LDLM_NTHRS_INIT,
+							 ldlm_num_threads));
+	}
+
+	for (i = 0; i < blp->blp_min_threads; i++) {
+		rc = ldlm_bl_thread_start(blp);
+		if (rc < 0)
+			GOTO(out, rc);
+	}
+
+
+	rc = ldlm_pools_init();
+	if (rc) {
+		CERROR("Failed to initialize LDLM pools: %d\n", rc);
+		GOTO(out, rc);
+	}
+	RETURN(0);
+
+ out:
+	ldlm_cleanup();
+	RETURN(rc);
+}
+
+/**
+ * Tear down what ldlm_setup() created.
+ *
+ * Refuses with -EBUSY while any server or client namespace is still listed.
+ * Otherwise finalizes the pools, stops every blocking thread, frees the
+ * thread pool, unregisters the callback service and frees ldlm_state.
+ *
+ * \retval 0 on success
+ * \retval -EBUSY if namespaces still exist
+ */
+static int ldlm_cleanup(void)
+{
+	ENTRY;
+
+	if (!list_empty(ldlm_namespace_list(LDLM_NAMESPACE_SERVER)) ||
+	    !list_empty(ldlm_namespace_list(LDLM_NAMESPACE_CLIENT))) {
+		CERROR("ldlm still has namespaces; clean these up first.\n");
+		ldlm_dump_all_namespaces(LDLM_NAMESPACE_SERVER, D_DLMTRACE);
+		ldlm_dump_all_namespaces(LDLM_NAMESPACE_CLIENT, D_DLMTRACE);
+		RETURN(-EBUSY);
+	}
+
+	ldlm_pools_fini();
+
+	if (ldlm_state->ldlm_bl_pool != NULL) {
+		struct ldlm_bl_pool *blp = ldlm_state->ldlm_bl_pool;
+
+		/* A work item with a NULL namespace is the stop signal:
+		 * ldlm_bl_thread_main() leaves its loop when it dequeues one
+		 * and then completes blp_comp.  Each iteration queues one
+		 * such item and waits for one completion. */
+		while (atomic_read(&blp->blp_num_threads) > 0) {
+			struct ldlm_bl_work_item blwi = { .blwi_ns = NULL };
+
+			init_completion(&blp->blp_comp);
+
+			spin_lock(&blp->blp_lock);
+			list_add_tail(&blwi.blwi_entry, &blp->blp_list);
+			wake_up(&blp->blp_waitq);
+			spin_unlock(&blp->blp_lock);
+
+			wait_for_completion(&blp->blp_comp);
+		}
+
+		OBD_FREE(blp, sizeof(*blp));
+	}
+
+	if (ldlm_state->ldlm_cb_service != NULL)
+		ptlrpc_unregister_service(ldlm_state->ldlm_cb_service);
+
+	ldlm_proc_cleanup();
+
+
+	OBD_FREE(ldlm_state, sizeof(*ldlm_state));
+	ldlm_state = NULL;
+
+	RETURN(0);
+}
+
+/**
+ * Module-level initialization: set up the reference and namespace mutexes
+ * and create the slab caches for resources, locks and interval nodes.
+ *
+ * \retval 0 on success
+ * \retval -ENOMEM if a slab cache cannot be created; caches created before
+ *	   the failing one are destroyed again
+ */
+int ldlm_init(void)
+{
+	mutex_init(&ldlm_ref_mutex);
+	mutex_init(ldlm_namespace_lock(LDLM_NAMESPACE_SERVER));
+	mutex_init(ldlm_namespace_lock(LDLM_NAMESPACE_CLIENT));
+
+	ldlm_resource_slab = kmem_cache_create("ldlm_resources",
+					       sizeof(struct ldlm_resource), 0,
+					       SLAB_HWCACHE_ALIGN, NULL);
+	if (!ldlm_resource_slab)
+		return -ENOMEM;
+
+	ldlm_lock_slab = kmem_cache_create("ldlm_locks",
+			      sizeof(struct ldlm_lock), 0,
+			      SLAB_HWCACHE_ALIGN | SLAB_DESTROY_BY_RCU, NULL);
+	if (!ldlm_lock_slab)
+		goto fail_lock_slab;
+
+	ldlm_interval_slab = kmem_cache_create("interval_node",
+					sizeof(struct ldlm_interval),
+					0, SLAB_HWCACHE_ALIGN, NULL);
+	if (!ldlm_interval_slab)
+		goto fail_interval_slab;
+
+#if LUSTRE_TRACKS_LOCK_EXP_REFS
+	class_export_dump_hook = ldlm_dump_export_locks;
+#endif
+	return 0;
+
+fail_interval_slab:
+	kmem_cache_destroy(ldlm_resource_slab);
+	kmem_cache_destroy(ldlm_lock_slab);
+	return -ENOMEM;
+
+fail_lock_slab:
+	kmem_cache_destroy(ldlm_resource_slab);
+	return -ENOMEM;
+}
+
+/**
+ * Module-level teardown: destroy the slab caches created by ldlm_init().
+ * A non-zero ldlm_refcount at this point is only reported, not acted upon.
+ */
+void ldlm_exit(void)
+{
+	if (ldlm_refcount)
+		CERROR("ldlm_refcount is %d in ldlm_exit!\n", ldlm_refcount);
+	kmem_cache_destroy(ldlm_resource_slab);
+	/* ldlm_lock_put() uses RCU to call ldlm_lock_free(), so
+	 * synchronize_rcu() is needed to wait for a grace period to elapse,
+	 * giving ldlm_lock_free() a chance to be called before the lock slab
+	 * is destroyed. */
+	synchronize_rcu();
+	kmem_cache_destroy(ldlm_lock_slab);
+	kmem_cache_destroy(ldlm_interval_slab);
+}

diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_plain.c b/drivers/staging/lustre/lustre/ldlm/ldlm_plain.c
new file mode 100644
index 0000000..ec29e28
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ldlm/ldlm_plain.c

@@ -0,0 +1,72 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ldlm/ldlm_plain.c
+ *
+ * Author: Peter Braam <braam@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
+ */
+
+/**
+ * This file contains implementation of PLAIN lock type.
+ *
+ * PLAIN locks are the simplest form of LDLM locking, and are used when
+ * there only needs to be a single lock on a resource. This avoids some
+ * of the complexity of EXTENT and IBITS lock types, but doesn't allow
+ * different "parts" of a resource to be locked concurrently.  Example
+ * use cases for PLAIN locks include locking of MGS configuration logs
+ * and (as of Lustre 2.4) quota records.
+ */
+
+#define DEBUG_SUBSYSTEM S_LDLM
+
+#include <lustre_dlm.h>
+#include <obd_support.h>
+#include <lustre_lib.h>
+
+#include "ldlm_internal.h"
+
+
+void ldlm_plain_policy_wire_to_local(const ldlm_wire_policy_data_t *wpolicy,
+				     ldlm_policy_data_t *lpolicy)
+{
+	/* PLAIN locks have no policy data: nothing to convert from wire. */
+}
+
+void ldlm_plain_policy_local_to_wire(const ldlm_policy_data_t *lpolicy,
+				     ldlm_wire_policy_data_t *wpolicy)
+{
+	/* PLAIN locks have no policy data: nothing to convert to wire. */
+}

diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_pool.c b/drivers/staging/lustre/lustre/ldlm/ldlm_pool.c
new file mode 100644
index 0000000..b3b6028
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ldlm/ldlm_pool.c

@@ -0,0 +1,1384 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2010, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ldlm/ldlm_pool.c
+ *
+ * Author: Yury Umanets <umka@clusterfs.com>
+ */
+
+/*
+ * Idea of this code is rather simple. Each second, for each server namespace
+ * we have SLV - server lock volume which is calculated on current number of
+ * granted locks, grant speed for past period, etc - that is, locking load.
+ * This SLV number may be thought as a flow definition for simplicity. It is
+ * sent to clients with each occasion to let them know what is current load
+ * situation on the server. By default, at the beginning, SLV on server is
+ * set max value which is calculated as the following: allow to one client
+ * have all locks of limit ->pl_limit for 10h.
+ *
+ * Next, on clients, number of cached locks is not limited artificially in any
+ * way as it was before. Instead, client calculates CLV, that is, client lock
+ * volume for each lock and compares it with last SLV from the server. CLV is
+ * calculated as the number of locks in LRU * lock live time in seconds. If
+ * CLV > SLV - lock is canceled.
+ *
+ * Client has LVF, that is, lock volume factor, which regulates how sensitive
+ * the client should be to the last SLV from the server. The higher the LVF,
+ * the more locks will be canceled on the client. Its default value is 1.
+ * Setting LVF to 2 means that the client will cancel locks 2 times faster.
+ *
+ * Locks on a client will be canceled more intensively in these cases:
+ * (1) if SLV is smaller, that is, load is higher on the server;
+ * (2) client has a lot of locks (the more locks are held by client, the bigger
+ *     chances that some of them should be canceled);
+ * (3) client has old locks (taken some time ago);
+ *
+ * Thus, according to flow paradigm that we use for better understanding SLV,
+ * CLV is the volume of particle in flow described by SLV. According to this,
+ * if flow is getting thinner, more and more particles become outside of it and
+ * as particles are locks, they should be canceled.
+ *
+ * General idea of this belongs to Vitaly Fertman (vitaly@clusterfs.com). Andreas
+ * Dilger (adilger@clusterfs.com) proposed few nice ideas like using LVF and many
+ * cleanups. Flow definition to allow more easy understanding of the logic belongs
+ * to Nikita Danilov (nikita@clusterfs.com) as well as many cleanups and fixes.
+ * And design and implementation are done by Yury Umanets (umka@clusterfs.com).
+ *
+ * Glossary for terms used:
+ *
+ * pl_limit - Number of allowed locks in pool. Applies to server and client
+ * side (tunable);
+ *
+ * pl_granted - Number of granted locks (calculated);
+ * pl_grant_rate - Number of granted locks for last T (calculated);
+ * pl_cancel_rate - Number of canceled locks for last T (calculated);
+ * pl_grant_speed - Grant speed (GR - CR) for last T (calculated);
+ * pl_grant_plan - Planned number of granted locks for next T (calculated);
+ * pl_server_lock_volume - Current server lock volume (calculated);
+ *
+ * As it may be seen from list above, we have few possible tunables which may
+ * affect behavior much. They all may be modified via proc. However, they also
+ * give a possibility for constructing few pre-defined behavior policies. If
+ * none of predefines is suitable for a working pattern being used, new one may
+ * be "constructed" via proc tunables.
+ */
+
+#define DEBUG_SUBSYSTEM S_LDLM
+
+# include <lustre_dlm.h>
+
+#include <cl_object.h>
+
+#include <obd_class.h>
+#include <obd_support.h>
+#include "ldlm_internal.h"
+
+
+/*
+ * 50 ldlm locks for 1MB of RAM.
+ */
+#define LDLM_POOL_HOST_L ((NUM_CACHEPAGES >> (20 - PAGE_CACHE_SHIFT)) * 50)
+
+/*
+ * Maximal possible grant step plan in %.
+ */
+#define LDLM_POOL_MAX_GSP (30)
+
+/*
+ * Minimal possible grant step plan in %.
+ */
+#define LDLM_POOL_MIN_GSP (1)
+
+/*
+ * This controls the speed of reaching LDLM_POOL_MAX_GSP
+ * with increasing thread period.
+ */
+#define LDLM_POOL_GSP_STEP_SHIFT (2)
+
+/*
+ * LDLM_POOL_MAX_GSP% of all locks is the default GP (grant plan).
+ */
+#define LDLM_POOL_GP(L)   (((L) * LDLM_POOL_MAX_GSP) / 100)
+
+/*
+ * Max age for locks on clients.
+ */
+#define LDLM_POOL_MAX_AGE (36000)
+
+/*
+ * The granularity of SLV calculation.
+ */
+#define LDLM_POOL_SLV_SHIFT (10)
+
+extern proc_dir_entry_t *ldlm_ns_proc_dir;
+
+/**
+ * Divide \a val by 2^\a shift, optionally rounding the quotient up.
+ *
+ * \param val       value to scale down
+ * \param shift     power-of-two divisor expressed as a bit count; must be
+ *                  smaller than 64
+ * \param round_up  non-zero to round up, zero to truncate
+ *
+ * \retval \a val >> \a shift, rounded up when \a round_up is set
+ */
+static inline __u64 dru(__u64 val, __u32 shift, int round_up)
+{
+	/*
+	 * Build the rounding bias in 64 bits: a plain "1 << shift" is an int
+	 * expression and overflows once shift reaches 31.
+	 */
+	__u64 bias = round_up ? (((__u64)1 << shift) - 1) : 0;
+
+	return (val + bias) >> shift;
+}
+
+/**
+ * Upper bound of SLV for a pool whose lock limit is \a L.
+ *
+ * The bound allows one client to hold all \a L locks for LDLM_POOL_MAX_AGE
+ * seconds (10 hours): limit * 10h / 1 client.
+ */
+static inline __u64 ldlm_pool_slv_max(__u32 L)
+{
+	/* Widen first so that the product is computed in 64 bits. */
+	return (__u64)L * LDLM_POOL_MAX_AGE / 1;
+}
+
+/**
+ * Lower bound of SLV. It is the constant 1; the limit \a L is not used.
+ */
+static inline __u64 ldlm_pool_slv_min(__u32 L)
+{
+	const __u64 lowest = 1;
+
+	return lowest;
+}
+
+/*
+ * Indices of the per-pool counters kept in ->pl_stats. The trailing comments
+ * give the counter name each index is registered under in
+ * ldlm_pool_proc_init(). LDLM_POOL_LAST_STAT is not a counter: together with
+ * LDLM_POOL_FIRST_STAT it gives the number of counters to allocate.
+ */
+enum {
+	LDLM_POOL_FIRST_STAT = 0,
+	LDLM_POOL_GRANTED_STAT = LDLM_POOL_FIRST_STAT,	/* "granted" */
+	LDLM_POOL_GRANT_STAT,				/* "grant" */
+	LDLM_POOL_CANCEL_STAT,				/* "cancel" */
+	LDLM_POOL_GRANT_RATE_STAT,			/* "grant_rate" */
+	LDLM_POOL_CANCEL_RATE_STAT,			/* "cancel_rate" */
+	LDLM_POOL_GRANT_PLAN_STAT,			/* "grant_plan" */
+	LDLM_POOL_SLV_STAT,				/* "slv" */
+	LDLM_POOL_SHRINK_REQTD_STAT,			/* "shrink_request" */
+	LDLM_POOL_SHRINK_FREED_STAT,			/* "shrink_freed" */
+	LDLM_POOL_RECALC_STAT,				/* "recalc_freed" */
+	LDLM_POOL_TIMING_STAT,				/* "recalc_timing" */
+	LDLM_POOL_LAST_STAT
+};
+
+/**
+ * Map pool \a pl back to the namespace that embeds it as ->ns_pool.
+ */
+static inline struct ldlm_namespace *ldlm_pl2ns(struct ldlm_pool *pl)
+{
+	struct ldlm_namespace *ns;
+
+	ns = container_of(pl, struct ldlm_namespace, ns_pool);
+	return ns;
+}
+
+/**
+ * Calculates suggested grant_step in % of available locks for passed
+ * period \a t (in seconds). This is later used in grant_plan calculations.
+ *
+ * \retval a value between LDLM_POOL_MIN_GSP and LDLM_POOL_MAX_GSP
+ */
+static inline int ldlm_pool_t2gsp(unsigned int t)
+{
+	/*
+	 * The step starts at LDLM_POOL_MIN_GSP (1%) for the shortest periods
+	 * and approaches LDLM_POOL_MAX_GSP (30%) as the period grows: the
+	 * distance to the maximum is halved every
+	 * (1 << LDLM_POOL_GSP_STEP_SHIFT) seconds.
+	 *
+	 * How this will affect execution is the following:
+	 *
+	 * - for thread period 1s we will have grant_step 1% which is good
+	 * from the point of view of taking some load off the server and
+	 * pushing it out to clients. 1% for grant_step means that the server
+	 * will not allow clients to get lots of locks in a short period of
+	 * time and keep all old locks in their caches. Clients will always
+	 * have to give some locks back if they want to take some new ones;
+	 *
+	 * - for thread period 10s (which is default) we will have 23% which
+	 * means that clients will have enough room to take some new locks
+	 * without giving some back. All locks from this 23% which were not
+	 * taken by clients in the current period will contribute to SLV
+	 * growing. SLV growing means more locks cached on clients until the
+	 * limit or grant plan is reached.
+	 */
+	unsigned int halvings = t >> LDLM_POOL_GSP_STEP_SHIFT;
+	int range = LDLM_POOL_MAX_GSP - LDLM_POOL_MIN_GSP;
+
+	/*
+	 * Shifting an int by its width or more is undefined. For such long
+	 * periods the distance to the maximum has decayed to zero anyway.
+	 */
+	if (halvings >= 8 * sizeof(range))
+		return LDLM_POOL_MAX_GSP;
+
+	return LDLM_POOL_MAX_GSP - (range >> halvings);
+}
+
+/**
+ * Recalculates next grant limit on passed \a pl.
+ *
+ * The plan is the currently granted count plus a share of the still unused
+ * part of the limit; the share is the grant step (in %) that
+ * ldlm_pool_t2gsp() returns for the pool's recalc period. The result is
+ * capped at 5/4 of the limit.
+ *
+ * \pre ->pl_lock is locked.
+ */
+static void ldlm_pool_recalc_grant_plan(struct ldlm_pool *pl)
+{
+	int granted, grant_step, limit;
+
+	limit = ldlm_pool_get_limit(pl);
+	granted = atomic_read(&pl->pl_granted);
+
+	/* Grant step in percent first, then converted to a number of locks. */
+	grant_step = ldlm_pool_t2gsp(pl->pl_recalc_period);
+	grant_step = ((limit - granted) * grant_step) / 100;
+	pl->pl_grant_plan = granted + grant_step;
+	/* Never plan for more than 125% of the limit. */
+	limit = (limit * 5) >> 2;
+	if (pl->pl_grant_plan > limit)
+		pl->pl_grant_plan = limit;
+}
+
+/**
+ * Recalculates next SLV on passed \a pl.
+ *
+ * The current SLV is scaled by grant_usage / limit, computed in fixed point
+ * with LDLM_POOL_SLV_SHIFT fractional bits, and the result is clamped to
+ * [ldlm_pool_slv_min(), ldlm_pool_slv_max()] before being stored back.
+ *
+ * \pre ->pl_lock is locked.
+ */
+static void ldlm_pool_recalc_slv(struct ldlm_pool *pl)
+{
+	int granted;
+	int grant_plan;
+	int round_up;
+	__u64 slv;
+	__u64 slv_factor;
+	__u64 grant_usage;
+	__u32 limit;
+
+	slv = pl->pl_server_lock_volume;
+	grant_plan = pl->pl_grant_plan;
+	limit = ldlm_pool_get_limit(pl);
+	granted = atomic_read(&pl->pl_granted);
+	/* Round the scaled SLV up only while fewer locks than limit are granted. */
+	round_up = granted < limit;
+
+	/*
+	 * limit minus the amount by which the grant plan was exceeded (or plus
+	 * the unused part of the plan), never less than 1.
+	 */
+	grant_usage = max_t(int, limit - (granted - grant_plan), 1);
+
+	/*
+	 * Find out SLV change factor which is the ratio of grant usage
+	 * from limit. SLV changes as fast as the ratio of grant plan
+	 * consumption. The more locks from grant plan are not consumed
+	 * by clients in last interval (idle time), the faster grows
+	 * SLV. And the opposite, the more grant plan is over-consumed
+	 * (load time) the faster drops SLV.
+	 */
+	slv_factor = (grant_usage << LDLM_POOL_SLV_SHIFT);
+	/* NOTE(review): divides by limit; presumably limit is never 0 here - confirm. */
+	do_div(slv_factor, limit);
+	slv = slv * slv_factor;
+	/* Drop the fixed-point fraction again. */
+	slv = dru(slv, LDLM_POOL_SLV_SHIFT, round_up);
+
+	if (slv > ldlm_pool_slv_max(limit)) {
+		slv = ldlm_pool_slv_max(limit);
+	} else if (slv < ldlm_pool_slv_min(limit)) {
+		slv = ldlm_pool_slv_min(limit);
+	}
+
+	pl->pl_server_lock_volume = slv;
+}
+
+/**
+ * Feeds a snapshot of the pool state (SLV, granted count, grant rate,
+ * grant plan and cancel rate) of \a pl into the matching ->pl_stats
+ * counters.
+ *
+ * \pre ->pl_lock is locked.
+ */
+static void ldlm_pool_recalc_stats(struct ldlm_pool *pl)
+{
+	/* Take the whole snapshot first, then account it. */
+	int plan = pl->pl_grant_plan;
+	__u64 volume = pl->pl_server_lock_volume;
+	int nr_granted = atomic_read(&pl->pl_granted);
+	int nr_grants = atomic_read(&pl->pl_grant_rate);
+	int nr_cancels = atomic_read(&pl->pl_cancel_rate);
+
+	lprocfs_counter_add(pl->pl_stats, LDLM_POOL_SLV_STAT, volume);
+	lprocfs_counter_add(pl->pl_stats, LDLM_POOL_GRANTED_STAT, nr_granted);
+	lprocfs_counter_add(pl->pl_stats, LDLM_POOL_GRANT_RATE_STAT, nr_grants);
+	lprocfs_counter_add(pl->pl_stats, LDLM_POOL_GRANT_PLAN_STAT, plan);
+	lprocfs_counter_add(pl->pl_stats, LDLM_POOL_CANCEL_RATE_STAT,
+			    nr_cancels);
+}
+
+/**
+ * Copies the current SLV of \a pl into the obd reachable through
+ * ldlm_pl2ns(pl)->ns_obd, under the obd's ->obd_pool_lock.
+ */
+static void ldlm_srv_pool_push_slv(struct ldlm_pool *pl)
+{
+	/*
+	 * The SLV is mirrored in the obd so that it can be used later without
+	 * touching the pool. That avoids a race between sending a reply with
+	 * the new SLV to a client and cleanup of the server stack, where the
+	 * namespace is not guaranteed to be alive any more. The obd is known
+	 * to stay alive as long as a valid export is alive.
+	 */
+	struct obd_device *obd = ldlm_pl2ns(pl)->ns_obd;
+
+	LASSERT(obd != NULL);
+
+	write_lock(&obd->obd_pool_lock);
+	obd->obd_pool_slv = pl->pl_server_lock_volume;
+	write_unlock(&obd->obd_pool_lock);
+}
+
+/**
+ * Recalculates all pool fields on passed \a pl.
+ *
+ * Does nothing until at least ->pl_recalc_period seconds have passed since
+ * ->pl_recalc_time. Otherwise recomputes SLV, pushes it to the obd,
+ * recomputes the grant plan and restarts the period.
+ *
+ * \retval 0 always
+ *
+ * \pre ->pl_lock is not locked.
+ */
+static int ldlm_srv_pool_recalc(struct ldlm_pool *pl)
+{
+	time_t recalc_interval_sec;
+	ENTRY;
+
+	/* Cheap check without the lock; repeated below with the lock held. */
+	recalc_interval_sec = cfs_time_current_sec() - pl->pl_recalc_time;
+	if (recalc_interval_sec < pl->pl_recalc_period)
+		RETURN(0);
+
+	spin_lock(&pl->pl_lock);
+	/* Another caller may have done the recalc while we waited for the lock. */
+	recalc_interval_sec = cfs_time_current_sec() - pl->pl_recalc_time;
+	if (recalc_interval_sec < pl->pl_recalc_period) {
+		spin_unlock(&pl->pl_lock);
+		RETURN(0);
+	}
+	/*
+	 * Recalc SLV after last period. This should be done
+	 * _before_ recalculating new grant plan.
+	 */
+	ldlm_pool_recalc_slv(pl);
+
+	/*
+	 * Make sure that pool informed obd of last SLV changes.
+	 */
+	ldlm_srv_pool_push_slv(pl);
+
+	/*
+	 * Update grant_plan for new period.
+	 */
+	ldlm_pool_recalc_grant_plan(pl);
+
+	/* Start a new period and account how long the previous one lasted. */
+	pl->pl_recalc_time = cfs_time_current_sec();
+	lprocfs_counter_add(pl->pl_stats, LDLM_POOL_TIMING_STAT,
+			    recalc_interval_sec);
+	spin_unlock(&pl->pl_lock);
+	RETURN(0);
+}
+
+/**
+ * This function is used on server side as main entry point for memory
+ * pressure handling. It decreases SLV on \a pl according to passed
+ * \a nr. \a gfp_mask is not used by this function.
+ *
+ * Our goal here is to decrease SLV such a way that clients hold \a nr
+ * locks smaller in next 10h.
+ *
+ * \retval number of granted locks when \a nr is 0 (query mode)
+ * \retval 0 otherwise; no memory is freed directly by this call
+ */
+static int ldlm_srv_pool_shrink(struct ldlm_pool *pl,
+				int nr, unsigned int gfp_mask)
+{
+	__u32 limit;
+
+	/*
+	 * VM is asking how many entries may be potentially freed.
+	 */
+	if (nr == 0)
+		return atomic_read(&pl->pl_granted);
+
+	/*
+	 * Client already canceled locks but server is already in shrinker
+	 * and can't cancel anything. Let's catch this race.
+	 */
+	/* NOTE(review): RETURN() is used here although this function has no ENTRY. */
+	if (atomic_read(&pl->pl_granted) == 0)
+		RETURN(0);
+
+	spin_lock(&pl->pl_lock);
+
+	/*
+	 * We want shrinker to possibly cause cancellation of @nr locks from
+	 * clients or grant approximately @nr locks smaller next intervals.
+	 *
+	 * This is why we decreased SLV by @nr. This effect will only be as
+	 * long as one re-calc interval (1s these days) and this should be
+	 * enough to pass this decreased SLV to all clients. On next recalc
+	 * interval pool will either increase SLV if locks load is not high
+	 * or will keep on same level or even decrease again, thus, shrinker
+	 * decreased SLV will affect next recalc intervals and this way will
+	 * make locking load lower.
+	 */
+	/* SLV never drops below ldlm_pool_slv_min(). */
+	if (nr < pl->pl_server_lock_volume) {
+		pl->pl_server_lock_volume = pl->pl_server_lock_volume - nr;
+	} else {
+		limit = ldlm_pool_get_limit(pl);
+		pl->pl_server_lock_volume = ldlm_pool_slv_min(limit);
+	}
+
+	/*
+	 * Make sure that pool informed obd of last SLV changes.
+	 */
+	ldlm_srv_pool_push_slv(pl);
+	spin_unlock(&pl->pl_lock);
+
+	/*
+	 * We did not really free any memory here so far, it only will be
+	 * freed later may be, so that we return 0 to not confuse VM.
+	 */
+	return 0;
+}
+
+/**
+ * Server side pool setup: stores \a limit both in the obd of the namespace
+ * owning \a pl (under ->obd_pool_lock) and in the pool itself.
+ *
+ * \retval 0 always
+ */
+static int ldlm_srv_pool_setup(struct ldlm_pool *pl, int limit)
+{
+	struct obd_device *obd = ldlm_pl2ns(pl)->ns_obd;
+
+	LASSERT(obd != NULL && obd != LP_POISON);
+	LASSERT(obd->obd_type != LP_POISON);
+
+	/* obd copy of the limit */
+	write_lock(&obd->obd_pool_lock);
+	obd->obd_pool_limit = limit;
+	write_unlock(&obd->obd_pool_lock);
+
+	/* pool copy of the limit */
+	ldlm_pool_set_limit(pl, limit);
+
+	return 0;
+}
+
+/**
+ * Loads SLV and Limit from ldlm_pl2ns(pl)->ns_obd into passed \a pl.
+ *
+ * Both values are read under the obd's ->obd_pool_lock taken for reading.
+ */
+static void ldlm_cli_pool_pop_slv(struct ldlm_pool *pl)
+{
+	/* The obd holds the SLV and Limit that are updated by incoming RPCs. */
+	struct obd_device *obd = ldlm_pl2ns(pl)->ns_obd;
+
+	LASSERT(obd != NULL);
+
+	read_lock(&obd->obd_pool_lock);
+	pl->pl_server_lock_volume = obd->obd_pool_slv;
+	ldlm_pool_set_limit(pl, obd->obd_pool_limit);
+	read_unlock(&obd->obd_pool_lock);
+}
+
+/**
+ * Recalculates client size pool \a pl according to current SLV and Limit.
+ *
+ * Does nothing until at least ->pl_recalc_period seconds have passed since
+ * ->pl_recalc_time. Otherwise refreshes SLV and Limit from the obd and, if
+ * lru resize is enabled for the namespace, starts an asynchronous LRU
+ * cancel.
+ *
+ * \retval 0 if the period has not expired or lru resize is disabled
+ * \retval result of ldlm_cancel_lru() otherwise
+ */
+static int ldlm_cli_pool_recalc(struct ldlm_pool *pl)
+{
+	time_t recalc_interval_sec;
+	ENTRY;
+
+	/* Cheap check without the lock; repeated below with the lock held. */
+	recalc_interval_sec = cfs_time_current_sec() - pl->pl_recalc_time;
+	if (recalc_interval_sec < pl->pl_recalc_period)
+		RETURN(0);
+
+	spin_lock(&pl->pl_lock);
+	/*
+	 * Check if we need to recalc lists now.
+	 */
+	recalc_interval_sec = cfs_time_current_sec() - pl->pl_recalc_time;
+	if (recalc_interval_sec < pl->pl_recalc_period) {
+		spin_unlock(&pl->pl_lock);
+		RETURN(0);
+	}
+
+	/*
+	 * Make sure that pool knows last SLV and Limit from obd.
+	 */
+	ldlm_cli_pool_pop_slv(pl);
+
+	/* Start a new period and account how long the previous one lasted. */
+	pl->pl_recalc_time = cfs_time_current_sec();
+	lprocfs_counter_add(pl->pl_stats, LDLM_POOL_TIMING_STAT,
+			    recalc_interval_sec);
+	spin_unlock(&pl->pl_lock);
+
+	/*
+	 * Do not cancel locks in case lru resize is disabled for this ns.
+	 */
+	if (!ns_connect_lru_resize(ldlm_pl2ns(pl)))
+		RETURN(0);
+
+	/*
+	 * In the time of canceling locks on client we do not need to maintain
+	 * sharp timing, we only want to cancel locks asap according to new SLV.
+	 * It may be called when SLV has changed much, this is why we do not
+	 * take into account pl->pl_recalc_time here.
+	 */
+	RETURN(ldlm_cancel_lru(ldlm_pl2ns(pl), 0, LCF_ASYNC,
+			       LDLM_CANCEL_LRUR));
+}
+
+/**
+ * This function is main entry point for memory pressure handling on client
+ * side.  Main goal of this function is to cancel some number of locks on
+ * passed \a pl according to \a nr. \a gfp_mask is not used by this function.
+ *
+ * \retval 0 if lru resize is disabled for the namespace
+ * \retval otherwise the number of unused locks sampled before cancelling,
+ *         minus the result of ldlm_cancel_lru(), scaled by
+ *         sysctl_vfs_cache_pressure percent
+ */
+static int ldlm_cli_pool_shrink(struct ldlm_pool *pl,
+				int nr, unsigned int gfp_mask)
+{
+	struct ldlm_namespace *ns;
+	int canceled = 0, unused;
+
+	ns = ldlm_pl2ns(pl);
+
+	/*
+	 * Do not cancel locks in case lru resize is disabled for this ns.
+	 */
+	/* NOTE(review): RETURN() is used here although this function has no ENTRY. */
+	if (!ns_connect_lru_resize(ns))
+		RETURN(0);
+
+	/*
+	 * Make sure that pool knows last SLV and Limit from obd.
+	 */
+	ldlm_cli_pool_pop_slv(pl);
+
+	/* Sample the unused count before anything is cancelled. */
+	spin_lock(&ns->ns_lock);
+	unused = ns->ns_nr_unused;
+	spin_unlock(&ns->ns_lock);
+
+	/* nr == 0 is a pure query: nothing is cancelled. */
+	if (nr) {
+		canceled = ldlm_cancel_lru(ns, nr, LCF_ASYNC,
+					   LDLM_CANCEL_SHRINK);
+	}
+	/*
+	 * Return the number of potentially reclaimable locks.
+	 */
+	/* Division comes first, so a remainder below 100 locks yields 0. */
+	return ((unused - canceled) / 100) * sysctl_vfs_cache_pressure;
+}
+
+/* Pool callbacks installed by ldlm_pool_init() for server side pools. */
+struct ldlm_pool_ops ldlm_srv_pool_ops = {
+	.po_recalc = ldlm_srv_pool_recalc,
+	.po_shrink = ldlm_srv_pool_shrink,
+	.po_setup  = ldlm_srv_pool_setup
+};
+
+/*
+ * Pool callbacks installed by ldlm_pool_init() for client side pools.
+ * ->po_setup is left unset, so ldlm_pool_setup() is a no-op for them.
+ */
+struct ldlm_pool_ops ldlm_cli_pool_ops = {
+	.po_recalc = ldlm_cli_pool_recalc,
+	.po_shrink = ldlm_cli_pool_shrink
+};
+
+/**
+ * Pool recalc wrapper. Will call either client or server pool recalc callback
+ * depending what pool \a pl is used.
+ *
+ * Before calling the callback, and only if the current time in seconds
+ * differs from ->pl_recalc_time, the pool statistics are updated and the
+ * grant and cancel rates are reset.
+ *
+ * \retval value returned by the ->po_recalc callback, or 0 if there is none
+ */
+int ldlm_pool_recalc(struct ldlm_pool *pl)
+{
+	time_t recalc_interval_sec;
+	int count;
+
+	/* Cheap check without the lock; skip the stats update if too early. */
+	recalc_interval_sec = cfs_time_current_sec() - pl->pl_recalc_time;
+	if (recalc_interval_sec <= 0)
+		goto recalc;
+
+	spin_lock(&pl->pl_lock);
+	/* Repeat the check now that the lock is held. */
+	recalc_interval_sec = cfs_time_current_sec() - pl->pl_recalc_time;
+	if (recalc_interval_sec > 0) {
+		/*
+		 * Update pool statistics every 1s.
+		 */
+		ldlm_pool_recalc_stats(pl);
+
+		/*
+		 * Zero out all rates and speed for the last period.
+		 */
+		atomic_set(&pl->pl_grant_rate, 0);
+		atomic_set(&pl->pl_cancel_rate, 0);
+	}
+	spin_unlock(&pl->pl_lock);
+
+ recalc:
+	if (pl->pl_ops->po_recalc != NULL) {
+		count = pl->pl_ops->po_recalc(pl);
+		/* Account the callback's return value in the recalc counter. */
+		lprocfs_counter_add(pl->pl_stats, LDLM_POOL_RECALC_STAT,
+				    count);
+		return count;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(ldlm_pool_recalc);
+
+/**
+ * Pool shrink wrapper. Will call either client or server pool shrink
+ * callback depending what pool \a pl is used.
+ *
+ * For \a nr > 0 the request and the callback's result are also accounted
+ * in the pool statistics and traced.
+ *
+ * \retval value returned by the ->po_shrink callback, or 0 if there is none
+ */
+int ldlm_pool_shrink(struct ldlm_pool *pl, int nr,
+		     unsigned int gfp_mask)
+{
+	int cancel;
+
+	if (pl->pl_ops->po_shrink == NULL)
+		return 0;
+
+	cancel = pl->pl_ops->po_shrink(pl, nr, gfp_mask);
+
+	/* A pure query (nr <= 0) is neither accounted nor traced. */
+	if (nr <= 0)
+		return cancel;
+
+	lprocfs_counter_add(pl->pl_stats, LDLM_POOL_SHRINK_REQTD_STAT, nr);
+	lprocfs_counter_add(pl->pl_stats, LDLM_POOL_SHRINK_FREED_STAT, cancel);
+	CDEBUG(D_DLMTRACE, "%s: request to shrink %d locks, "
+	       "shrunk %d\n", pl->pl_name, nr, cancel);
+
+	return cancel;
+}
+EXPORT_SYMBOL(ldlm_pool_shrink);
+
+/**
+ * Pool setup wrapper. Will call the ->po_setup callback of pool \a pl with
+ * the passed \a limit if the pool has one.
+ *
+ * \retval value returned by the ->po_setup callback, or 0 if there is none
+ */
+int ldlm_pool_setup(struct ldlm_pool *pl, int limit)
+{
+	if (pl->pl_ops->po_setup == NULL)
+		return 0;
+
+	return pl->pl_ops->po_setup(pl, limit);
+}
+EXPORT_SYMBOL(ldlm_pool_setup);
+
+/*
+ * seq_file show callback printing a summary of the pool stored in
+ * m->private: SLV, CLV, LVF, then (server namespaces only) grant step and
+ * grant plan, then grant rate, cancel rate, grant speed, granted count and
+ * limit. Always returns 0.
+ */
+static int lprocfs_pool_state_seq_show(struct seq_file *m, void *unused)
+{
+	int granted, grant_rate, cancel_rate, grant_step;
+	int grant_speed, grant_plan, lvf;
+	struct ldlm_pool *pl = m->private;
+	__u64 slv, clv;
+	__u32 limit;
+
+	/* Take the whole snapshot under ->pl_lock, print after dropping it. */
+	spin_lock(&pl->pl_lock);
+	slv = pl->pl_server_lock_volume;
+	clv = pl->pl_client_lock_volume;
+	limit = ldlm_pool_get_limit(pl);
+	grant_plan = pl->pl_grant_plan;
+	granted = atomic_read(&pl->pl_granted);
+	grant_rate = atomic_read(&pl->pl_grant_rate);
+	cancel_rate = atomic_read(&pl->pl_cancel_rate);
+	grant_speed = grant_rate - cancel_rate;
+	lvf = atomic_read(&pl->pl_lock_volume_factor);
+	grant_step = ldlm_pool_t2gsp(pl->pl_recalc_period);
+	spin_unlock(&pl->pl_lock);
+
+	seq_printf(m, "LDLM pool state (%s):\n"
+		      "  SLV: "LPU64"\n"
+		      "  CLV: "LPU64"\n"
+		      "  LVF: %d\n",
+		      pl->pl_name, slv, clv, lvf);
+
+	/* Grant step and grant plan are only printed for server namespaces. */
+	if (ns_is_server(ldlm_pl2ns(pl))) {
+		seq_printf(m, "  GSP: %d%%\n"
+			      "  GP:  %d\n",
+			      grant_step, grant_plan);
+	}
+	/* NOTE(review): limit is __u32 but is printed with %d. */
+	seq_printf(m, "  GR:  %d\n" "  CR:  %d\n" "  GS:  %d\n"
+		      "  G:   %d\n" "  L:   %d\n",
+		      grant_rate, cancel_rate, grant_speed,
+		      granted, limit);
+
+	return 0;
+}
+LPROC_SEQ_FOPS_RO(lprocfs_pool_state);
+
+/*
+ * seq_file show callback printing the grant speed (grant rate minus cancel
+ * rate) of the pool stored in m->private. Always returns 0.
+ *
+ * The speed is negative whenever more locks were cancelled than granted in
+ * the current period, so it is printed as a signed value rather than being
+ * handed to the unsigned lprocfs_rd_uint() helper.
+ */
+static int lprocfs_grant_speed_seq_show(struct seq_file *m, void *unused)
+{
+	struct ldlm_pool *pl = m->private;
+	int grant_speed;
+
+	spin_lock(&pl->pl_lock);
+	/* serialize with ldlm_pool_recalc */
+	grant_speed = atomic_read(&pl->pl_grant_rate) -
+			atomic_read(&pl->pl_cancel_rate);
+	spin_unlock(&pl->pl_lock);
+
+	seq_printf(m, "%d\n", grant_speed);
+	return 0;
+}
+
+LDLM_POOL_PROC_READER_SEQ_SHOW(grant_plan, int);
+LPROC_SEQ_FOPS_RO(lprocfs_grant_plan);
+
+LDLM_POOL_PROC_READER_SEQ_SHOW(recalc_period, int);
+LDLM_POOL_PROC_WRITER(recalc_period, int);
+/*
+ * Write handler of the "recalc_period" proc file: forwards the request to
+ * lprocfs_wr_recalc_period() together with the private data of the seq_file
+ * attached to \a file. \a off is not used.
+ */
+static ssize_t lprocfs_recalc_period_seq_write(struct file *file, const char *buf,
+					   size_t len, loff_t *off)
+{
+	struct seq_file *m = file->private_data;
+	void *data = m->private;
+
+	return lprocfs_wr_recalc_period(file, buf, len, data);
+}
+LPROC_SEQ_FOPS(lprocfs_recalc_period);
+
+LPROC_SEQ_FOPS_RO_TYPE(ldlm_pool, u64);
+LPROC_SEQ_FOPS_RO_TYPE(ldlm_pool, atomic);
+LPROC_SEQ_FOPS_RW_TYPE(ldlm_pool_rw, atomic);
+
+LPROC_SEQ_FOPS_RO(lprocfs_grant_speed);
+
+/*
+ * Register one proc entry in the pool directory.
+ *
+ * \a name is the entry name as a C string, \a var the data pointer and
+ * \a ops the file operations stored in the entry.
+ *
+ * The macro relies on the caller's scope: it writes the name into
+ * var_name (at most MAX_STRING_SIZE bytes), fills pool_vars[0] and adds it
+ * under pl->pl_proc_dir.
+ *
+ * The name is copied through "%s" instead of being stringified: applying
+ * '#' to an argument that is already a string literal would embed the
+ * quote characters in the entry name, and would use the name as a format
+ * string.
+ */
+#define LDLM_POOL_ADD_VAR(name, var, ops)			\
+	do {							\
+		snprintf(var_name, MAX_STRING_SIZE, "%s", (name));\
+		pool_vars[0].data = var;			\
+		pool_vars[0].fops = ops;			\
+		lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);\
+	} while (0)
+
+/*
+ * Create the "pool" proc directory of \a pl below the proc directory of
+ * its namespace, populate it with the pool tunables and allocate and
+ * register the pool statistics.
+ *
+ * \retval 0        on success
+ * \retval -ENOMEM  if the name buffer or the stats cannot be allocated
+ * \retval -EINVAL  if the namespace has no proc directory
+ * \retval other    error from lprocfs_register() or
+ *                  lprocfs_register_stats()
+ */
+static int ldlm_pool_proc_init(struct ldlm_pool *pl)
+{
+	struct ldlm_namespace *ns = ldlm_pl2ns(pl);
+	struct proc_dir_entry *parent_ns_proc;
+	struct lprocfs_vars pool_vars[2];
+	char *var_name = NULL;
+	int rc = 0;
+	ENTRY;
+
+	/* Scratch buffer for entry names, freed again before returning. */
+	OBD_ALLOC(var_name, MAX_STRING_SIZE + 1);
+	if (!var_name)
+		RETURN(-ENOMEM);
+
+	parent_ns_proc = ns->ns_proc_dir_entry;
+	if (parent_ns_proc == NULL) {
+		CERROR("%s: proc entry is not initialized\n",
+		       ldlm_ns_name(ns));
+		GOTO(out_free_name, rc = -EINVAL);
+	}
+	pl->pl_proc_dir = lprocfs_register("pool", parent_ns_proc,
+					   NULL, NULL);
+	if (IS_ERR(pl->pl_proc_dir)) {
+		CERROR("LProcFS failed in ldlm-pool-init\n");
+		rc = PTR_ERR(pl->pl_proc_dir);
+		/*
+		 * Do not leave an error pointer behind:
+		 * ldlm_pool_proc_fini() only checks for NULL before
+		 * removing the directory.
+		 */
+		pl->pl_proc_dir = NULL;
+		GOTO(out_free_name, rc);
+	}
+
+	/*
+	 * pool_vars[0] is reused for every entry, pool_vars[1] stays zeroed
+	 * and terminates the list.
+	 */
+	var_name[MAX_STRING_SIZE] = '\0';
+	memset(pool_vars, 0, sizeof(pool_vars));
+	pool_vars[0].name = var_name;
+
+	LDLM_POOL_ADD_VAR("server_lock_volume", &pl->pl_server_lock_volume,
+			  &ldlm_pool_u64_fops);
+	LDLM_POOL_ADD_VAR("limit", &pl->pl_limit, &ldlm_pool_rw_atomic_fops);
+	LDLM_POOL_ADD_VAR("granted", &pl->pl_granted, &ldlm_pool_atomic_fops);
+	LDLM_POOL_ADD_VAR("grant_speed", pl, &lprocfs_grant_speed_fops);
+	LDLM_POOL_ADD_VAR("cancel_rate", &pl->pl_cancel_rate,
+			  &ldlm_pool_atomic_fops);
+	LDLM_POOL_ADD_VAR("grant_rate", &pl->pl_grant_rate,
+			  &ldlm_pool_atomic_fops);
+	LDLM_POOL_ADD_VAR("grant_plan", pl, &lprocfs_grant_plan_fops);
+	LDLM_POOL_ADD_VAR("recalc_period", pl, &lprocfs_recalc_period_fops);
+	LDLM_POOL_ADD_VAR("lock_volume_factor", &pl->pl_lock_volume_factor,
+			  &ldlm_pool_rw_atomic_fops);
+	LDLM_POOL_ADD_VAR("state", pl, &lprocfs_pool_state_fops);
+
+	pl->pl_stats = lprocfs_alloc_stats(LDLM_POOL_LAST_STAT -
+					   LDLM_POOL_FIRST_STAT, 0);
+	if (!pl->pl_stats)
+		GOTO(out_free_name, rc = -ENOMEM);
+
+	lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANTED_STAT,
+			     LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
+			     "granted", "locks");
+	lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANT_STAT,
+			     LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
+			     "grant", "locks");
+	lprocfs_counter_init(pl->pl_stats, LDLM_POOL_CANCEL_STAT,
+			     LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
+			     "cancel", "locks");
+	lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANT_RATE_STAT,
+			     LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
+			     "grant_rate", "locks/s");
+	lprocfs_counter_init(pl->pl_stats, LDLM_POOL_CANCEL_RATE_STAT,
+			     LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
+			     "cancel_rate", "locks/s");
+	lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANT_PLAN_STAT,
+			     LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
+			     "grant_plan", "locks/s");
+	lprocfs_counter_init(pl->pl_stats, LDLM_POOL_SLV_STAT,
+			     LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
+			     "slv", "slv");
+	lprocfs_counter_init(pl->pl_stats, LDLM_POOL_SHRINK_REQTD_STAT,
+			     LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
+			     "shrink_request", "locks");
+	lprocfs_counter_init(pl->pl_stats, LDLM_POOL_SHRINK_FREED_STAT,
+			     LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
+			     "shrink_freed", "locks");
+	lprocfs_counter_init(pl->pl_stats, LDLM_POOL_RECALC_STAT,
+			     LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
+			     "recalc_freed", "locks");
+	lprocfs_counter_init(pl->pl_stats, LDLM_POOL_TIMING_STAT,
+			     LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
+			     "recalc_timing", "sec");
+	rc = lprocfs_register_stats(pl->pl_proc_dir, "stats", pl->pl_stats);
+
+	EXIT;
+out_free_name:
+	OBD_FREE(var_name, MAX_STRING_SIZE + 1);
+	return rc;
+}
+
+/*
+ * Release the proc resources of \a pl: free the statistics and remove the
+ * pool proc directory, each only if the corresponding pointer is not NULL.
+ * Both pointers are NULL afterwards.
+ */
+static void ldlm_pool_proc_fini(struct ldlm_pool *pl)
+{
+	if (pl->pl_stats != NULL) {
+		lprocfs_free_stats(&pl->pl_stats);
+		pl->pl_stats = NULL;
+	}
+	if (pl->pl_proc_dir != NULL) {
+		lprocfs_remove(&pl->pl_proc_dir);
+		pl->pl_proc_dir = NULL;
+	}
+}
+
+/**
+ * Initialize pool \a pl of namespace \a ns.
+ *
+ * \param pl      pool to initialize
+ * \param ns      namespace whose name becomes part of the pool name
+ * \param idx     index appended to the pool name
+ * \param client  namespace side; LDLM_NAMESPACE_SERVER selects the server
+ *                callbacks and defaults, any other value the client ones
+ *
+ * \retval 0 on success, otherwise the error from ldlm_pool_proc_init()
+ */
+int ldlm_pool_init(struct ldlm_pool *pl, struct ldlm_namespace *ns,
+		   int idx, ldlm_side_t client)
+{
+	int rc;
+	ENTRY;
+
+	spin_lock_init(&pl->pl_lock);
+	atomic_set(&pl->pl_granted, 0);
+	pl->pl_recalc_time = cfs_time_current_sec();
+	atomic_set(&pl->pl_lock_volume_factor, 1);
+
+	atomic_set(&pl->pl_grant_rate, 0);
+	atomic_set(&pl->pl_cancel_rate, 0);
+	pl->pl_grant_plan = LDLM_POOL_GP(LDLM_POOL_HOST_L);
+
+	snprintf(pl->pl_name, sizeof(pl->pl_name), "ldlm-pool-%s-%d",
+		 ldlm_ns_name(ns), idx);
+
+	if (client == LDLM_NAMESPACE_SERVER) {
+		/* Server: host-sized limit and the largest possible SLV. */
+		pl->pl_ops = &ldlm_srv_pool_ops;
+		ldlm_pool_set_limit(pl, LDLM_POOL_HOST_L);
+		pl->pl_recalc_period = LDLM_POOL_SRV_DEF_RECALC_PERIOD;
+		pl->pl_server_lock_volume = ldlm_pool_slv_max(LDLM_POOL_HOST_L);
+	} else {
+		/* Client: limit 1 and SLV 0 as starting values. */
+		ldlm_pool_set_limit(pl, 1);
+		pl->pl_server_lock_volume = 0;
+		pl->pl_ops = &ldlm_cli_pool_ops;
+		pl->pl_recalc_period = LDLM_POOL_CLI_DEF_RECALC_PERIOD;
+	}
+	pl->pl_client_lock_volume = 0;
+	rc = ldlm_pool_proc_init(pl);
+	if (rc)
+		RETURN(rc);
+
+	CDEBUG(D_DLMTRACE, "Lock pool %s is initialized\n", pl->pl_name);
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(ldlm_pool_init);
+
+/**
+ * Tear down pool \a pl: release its proc resources and overwrite the whole
+ * structure with the poison byte 0x5a.
+ */
+void ldlm_pool_fini(struct ldlm_pool *pl)
+{
+	ENTRY;
+	ldlm_pool_proc_fini(pl);
+
+	/*
+	 * Pool should not be used after this point. We can't free it here as
+	 * it lives in struct ldlm_namespace, but still interested in catching
+	 * any abnormal using cases.
+	 */
+	POISON(pl, 0x5a, sizeof(*pl));
+	EXIT;
+}
+EXPORT_SYMBOL(ldlm_pool_fini);
+
+/**
+ * Account newly taken ldlm lock \a lock in pool \a pl.
+ *
+ * FLOCK locks are ignored: they are almost never cancelled (a special kind
+ * of lock is used to drop them) and they have no LRU, so there is no point
+ * in tracking them.
+ */
+void ldlm_pool_add(struct ldlm_pool *pl, struct ldlm_lock *lock)
+{
+	if (lock->l_resource->lr_type != LDLM_FLOCK) {
+		atomic_inc(&pl->pl_granted);
+		atomic_inc(&pl->pl_grant_rate);
+		lprocfs_counter_incr(pl->pl_stats, LDLM_POOL_GRANT_STAT);
+
+		/*
+		 * Only server pools are recalculated here. On the client all
+		 * locks which may potentially be canceled have already been
+		 * packed into the enqueue/cancel rpc, and we do not want to
+		 * run out of stack with too long call paths.
+		 */
+		if (ns_is_server(ldlm_pl2ns(pl)))
+			ldlm_pool_recalc(pl);
+	}
+}
+EXPORT_SYMBOL(ldlm_pool_add);
+
+/**
+ * Drop ldlm lock \a lock from the accounting of pool \a pl.
+ *
+ * FLOCK locks are ignored, mirroring ldlm_pool_add().
+ */
+void ldlm_pool_del(struct ldlm_pool *pl, struct ldlm_lock *lock)
+{
+	if (lock->l_resource->lr_type != LDLM_FLOCK) {
+		LASSERT(atomic_read(&pl->pl_granted) > 0);
+		atomic_dec(&pl->pl_granted);
+		atomic_inc(&pl->pl_cancel_rate);
+
+		lprocfs_counter_incr(pl->pl_stats, LDLM_POOL_CANCEL_STAT);
+
+		/* Only server pools are recalculated on lock removal. */
+		if (ns_is_server(ldlm_pl2ns(pl)))
+			ldlm_pool_recalc(pl);
+	}
+}
+EXPORT_SYMBOL(ldlm_pool_del);
+
+/**
+ * Reads the SLV of \a pl under ->pl_lock and returns it.
+ *
+ * \pre ->pl_lock is not locked.
+ */
+__u64 ldlm_pool_get_slv(struct ldlm_pool *pl)
+{
+	__u64 volume;
+
+	spin_lock(&pl->pl_lock);
+	volume = pl->pl_server_lock_volume;
+	spin_unlock(&pl->pl_lock);
+
+	return volume;
+}
+EXPORT_SYMBOL(ldlm_pool_get_slv);
+
+/**
+ * Stores \a slv as the SLV of \a pl under ->pl_lock.
+ *
+ * \pre ->pl_lock is not locked.
+ */
+void ldlm_pool_set_slv(struct ldlm_pool *pl, __u64 slv)
+{
+	spinlock_t *guard = &pl->pl_lock;
+
+	spin_lock(guard);
+	pl->pl_server_lock_volume = slv;
+	spin_unlock(guard);
+}
+EXPORT_SYMBOL(ldlm_pool_set_slv);
+
+/**
+ * Reads the CLV of \a pl under ->pl_lock and returns it.
+ *
+ * \pre ->pl_lock is not locked.
+ */
+__u64 ldlm_pool_get_clv(struct ldlm_pool *pl)
+{
+	__u64 clv;
+
+	spin_lock(&pl->pl_lock);
+	clv = pl->pl_client_lock_volume;
+	spin_unlock(&pl->pl_lock);
+
+	return clv;
+}
+EXPORT_SYMBOL(ldlm_pool_get_clv);
+
+/**
+ * Stores \a clv as the CLV of \a pl under ->pl_lock.
+ *
+ * \pre ->pl_lock is not locked.
+ */
+void ldlm_pool_set_clv(struct ldlm_pool *pl, __u64 clv)
+{
+	spinlock_t *guard = &pl->pl_lock;
+
+	spin_lock(guard);
+	pl->pl_client_lock_volume = clv;
+	spin_unlock(guard);
+}
+EXPORT_SYMBOL(ldlm_pool_set_clv);
+
+/**
+ * Atomically reads and returns the lock limit of \a pl.
+ */
+__u32 ldlm_pool_get_limit(struct ldlm_pool *pl)
+{
+	__u32 limit = atomic_read(&pl->pl_limit);
+
+	return limit;
+}
+EXPORT_SYMBOL(ldlm_pool_get_limit);
+
+/**
+ * Atomically stores \a limit as the lock limit of \a pl.
+ */
+void ldlm_pool_set_limit(struct ldlm_pool *pl, __u32 limit)
+{
+	atomic_t *slot = &pl->pl_limit;
+
+	atomic_set(slot, limit);
+}
+EXPORT_SYMBOL(ldlm_pool_set_limit);
+
+/**
+ * Atomically reads and returns the lock volume factor (LVF) of \a pl.
+ */
+__u32 ldlm_pool_get_lvf(struct ldlm_pool *pl)
+{
+	__u32 lvf = atomic_read(&pl->pl_lock_volume_factor);
+
+	return lvf;
+}
+EXPORT_SYMBOL(ldlm_pool_get_lvf);
+
+/*
+ * Atomically reads and returns the number of granted locks of \a pl.
+ */
+static int ldlm_pool_granted(struct ldlm_pool *pl)
+{
+	int nr_granted = atomic_read(&pl->pl_granted);
+
+	return nr_granted;
+}
+
+static struct ptlrpc_thread *ldlm_pools_thread;
+static struct shrinker *ldlm_pools_srv_shrinker;
+static struct shrinker *ldlm_pools_cli_shrinker;
+static struct completion ldlm_pools_comp;
+
+/*
+ * Cancel \a nr locks from all namespaces (if possible). All namespaces are
+ * asked to cancel approximately equal amount of locks to keep balancing.
+ *
+ * Return value:
+ *  -1     for a client-side request with nr != 0 when __GFP_FS is not set
+ *         in \a gfp_mask, and for every server-side shrink with nr != 0
+ *         that reaches the end of the function;
+ *  0      if the namespace list is empty during the counting pass;
+ *  total  of the per-pool shrink queries if nr == 0 or that total is 0;
+ *  cached number of locks still granted in the visited client pools
+ *         otherwise.
+ */
+static int ldlm_pools_shrink(ldlm_side_t client, int nr,
+			     unsigned int gfp_mask)
+{
+	int total = 0, cached = 0, nr_ns;
+	struct ldlm_namespace *ns;
+	void *cookie;
+
+	if (client == LDLM_NAMESPACE_CLIENT && nr != 0 &&
+	    !(gfp_mask & __GFP_FS))
+		return -1;
+
+	CDEBUG(D_DLMTRACE, "Request to shrink %d %s locks from all pools\n",
+	       nr, client == LDLM_NAMESPACE_CLIENT ? "client" : "server");
+
+	/* Every return path below pairs this with cl_env_reexit(cookie). */
+	cookie = cl_env_reenter();
+
+	/*
+	 * Find out how many resources we may release.
+	 */
+	for (nr_ns = atomic_read(ldlm_namespace_nr(client));
+	     nr_ns > 0; nr_ns--)
+	{
+		mutex_lock(ldlm_namespace_lock(client));
+		if (list_empty(ldlm_namespace_list(client))) {
+			mutex_unlock(ldlm_namespace_lock(client));
+			cl_env_reexit(cookie);
+			return 0;
+		}
+		/*
+		 * Take the first namespace, pin it and move it within the
+		 * list under the mutex; query its pool after dropping the
+		 * mutex.
+		 */
+		ns = ldlm_namespace_first_locked(client);
+		ldlm_namespace_get(ns);
+		ldlm_namespace_move_locked(ns, client);
+		mutex_unlock(ldlm_namespace_lock(client));
+		/* nr == 0 only asks the pool how much could be freed. */
+		total += ldlm_pool_shrink(&ns->ns_pool, 0, gfp_mask);
+		ldlm_namespace_put(ns);
+	}
+
+	if (nr == 0 || total == 0) {
+		cl_env_reexit(cookie);
+		return total;
+	}
+
+	/*
+	 * Shrink at least ldlm_namespace_nr(client) namespaces.
+	 */
+	for (nr_ns = atomic_read(ldlm_namespace_nr(client));
+	     nr_ns > 0; nr_ns--)
+	{
+		int cancel, nr_locks;
+
+		/*
+		 * Do not call shrink under ldlm_namespace_lock(client)
+		 */
+		mutex_lock(ldlm_namespace_lock(client));
+		if (list_empty(ldlm_namespace_list(client))) {
+			mutex_unlock(ldlm_namespace_lock(client));
+			/*
+			 * If list is empty, we can't return any @cached > 0,
+			 * that probably would cause needless shrinker
+			 * call.
+			 */
+			cached = 0;
+			break;
+		}
+		ns = ldlm_namespace_first_locked(client);
+		ldlm_namespace_get(ns);
+		ldlm_namespace_move_locked(ns, client);
+		mutex_unlock(ldlm_namespace_lock(client));
+
+		/*
+		 * Ask each pool for a share of nr proportional to its granted
+		 * locks, and always for at least one lock.
+		 */
+		nr_locks = ldlm_pool_granted(&ns->ns_pool);
+		cancel = 1 + nr_locks * nr / total;
+		ldlm_pool_shrink(&ns->ns_pool, cancel, gfp_mask);
+		cached += ldlm_pool_granted(&ns->ns_pool);
+		ldlm_namespace_put(ns);
+	}
+	cl_env_reexit(cookie);
+	/* we only decrease the SLV in server pools shrinker, return -1 to
+	 * kernel to avoid needless loop. LU-1128 */
+	return (client == LDLM_NAMESPACE_SERVER) ? -1 : cached;
+}
+
+/* Shrinker callback for the server-side namespaces. */
+static int ldlm_pools_srv_shrink(SHRINKER_ARGS(sc, nr_to_scan, gfp_mask))
+{
+	int nr = shrink_param(sc, nr_to_scan);
+	unsigned int mask = shrink_param(sc, gfp_mask);
+
+	return ldlm_pools_shrink(LDLM_NAMESPACE_SERVER, nr, mask);
+}
+
+/* Shrinker callback for the client-side namespaces. */
+static int ldlm_pools_cli_shrink(SHRINKER_ARGS(sc, nr_to_scan, gfp_mask))
+{
+	int nr = shrink_param(sc, nr_to_scan);
+	unsigned int mask = shrink_param(sc, gfp_mask);
+
+	return ldlm_pools_shrink(LDLM_NAMESPACE_CLIENT, nr, mask);
+}
+
+/**
+ * Recalculate all pools on side \a client.
+ *
+ * For the server side the per-namespace lock limits are set up first, with
+ * the namespace list locked: modest namespaces get their granted count plus
+ * a margin, greedy namespaces share what is left. If the modest namespaces
+ * already consume 2/3 of LDLM_POOL_HOST_L, every namespace gets an equal
+ * share instead. Afterwards, for either side, ldlm_pool_recalc() is called
+ * on each namespace that is not being stopped, without holding the list
+ * lock.
+ */
+void ldlm_pools_recalc(ldlm_side_t client)
+{
+	__u32 nr_l = 0, nr_p = 0, l;
+	struct ldlm_namespace *ns;
+	int nr, equal = 0;
+
+	/*
+	 * No need to setup pool limit for client pools.
+	 */
+	if (client == LDLM_NAMESPACE_SERVER) {
+		/*
+		 * Check all modest namespaces first.
+		 */
+		mutex_lock(ldlm_namespace_lock(client));
+		list_for_each_entry(ns, ldlm_namespace_list(client),
+					ns_list_chain)
+		{
+			if (ns->ns_appetite != LDLM_NAMESPACE_MODEST)
+				continue;
+
+			/* Never hand out a zero limit. */
+			l = ldlm_pool_granted(&ns->ns_pool);
+			if (l == 0)
+				l = 1;
+
+			/*
+			 * Set the modest pools limit equal to their avg granted
+			 * locks + ~6%.
+			 */
+			l += dru(l, LDLM_POOLS_MODEST_MARGIN_SHIFT, 0);
+			ldlm_pool_setup(&ns->ns_pool, l);
+			/* nr_l: limit handed out so far, nr_p: pools served */
+			nr_l += l;
+			nr_p++;
+		}
+
+		/*
+		 * Make sure that modest namespaces did not eat more than 2/3
+		 * of limit.
+		 */
+		if (nr_l >= 2 * (LDLM_POOL_HOST_L / 3)) {
+			/* nr_l is a __u32, so it is printed with %u. */
+			CWARN("\"Modest\" pools eat out 2/3 of server locks "
+			      "limit (%u of %lu). This means that you have too "
+			      "many clients for this amount of server RAM. "
+			      "Upgrade server!\n", nr_l, LDLM_POOL_HOST_L);
+			equal = 1;
+		}
+
+		/*
+		 * The rest is given to greedy namespaces.
+		 */
+		list_for_each_entry(ns, ldlm_namespace_list(client),
+					ns_list_chain)
+		{
+			if (!equal && ns->ns_appetite != LDLM_NAMESPACE_GREEDY)
+				continue;
+
+			if (equal) {
+				/*
+				 * In the case 2/3 locks are eaten out by
+				 * modest pools, we re-setup equal limit
+				 * for _all_ pools.
+				 */
+				l = LDLM_POOL_HOST_L /
+					atomic_read(
+						ldlm_namespace_nr(client));
+			} else {
+				/*
+				 * All the rest of greedy pools will have
+				 * all locks in equal parts.
+				 */
+				l = (LDLM_POOL_HOST_L - nr_l) /
+					(atomic_read(
+						ldlm_namespace_nr(client)) -
+					 nr_p);
+			}
+			ldlm_pool_setup(&ns->ns_pool, l);
+		}
+		mutex_unlock(ldlm_namespace_lock(client));
+	}
+
+	/*
+	 * Recalc at least ldlm_namespace_nr(client) namespaces.
+	 */
+	for (nr = atomic_read(ldlm_namespace_nr(client)); nr > 0; nr--) {
+		int     skip;
+		/*
+		 * Lock the list, get first @ns in the list, getref, move it
+		 * to the tail, unlock and call pool recalc. This way we avoid
+		 * calling recalc under @ns lock what is really good as we get
+		 * rid of potential deadlock on client nodes when canceling
+		 * locks synchronously.
+		 */
+		mutex_lock(ldlm_namespace_lock(client));
+		if (list_empty(ldlm_namespace_list(client))) {
+			mutex_unlock(ldlm_namespace_lock(client));
+			break;
+		}
+		ns = ldlm_namespace_first_locked(client);
+
+		spin_lock(&ns->ns_lock);
+		/*
+		 * skip ns which is being freed, and we don't want to increase
+		 * its refcount again, not even temporarily. bz21519 & LU-499.
+		 */
+		if (ns->ns_stopping) {
+			skip = 1;
+		} else {
+			skip = 0;
+			ldlm_namespace_get(ns);
+		}
+		spin_unlock(&ns->ns_lock);
+
+		ldlm_namespace_move_locked(ns, client);
+		mutex_unlock(ldlm_namespace_lock(client));
+
+		/*
+		 * After setup is done - recalc the pool.
+		 */
+		if (!skip) {
+			ldlm_pool_recalc(&ns->ns_pool);
+			ldlm_namespace_put(ns);
+		}
+	}
+}
+EXPORT_SYMBOL(ldlm_pools_recalc);
+
+/*
+ * Body of the "ldlm_poold" kernel thread: recalculates the server and the
+ * client pools, sleeps for LDLM_POOLS_THREAD_PERIOD seconds (or until it is
+ * woken up), and repeats until SVC_STOPPING is seen. \a arg is the
+ * ptlrpc_thread descriptor of this thread.
+ */
+static int ldlm_pools_thread_main(void *arg)
+{
+	struct ptlrpc_thread *self = arg;
+	ENTRY;
+
+	/* Announce that we are up; the starter waits on t_ctl_waitq. */
+	thread_set_flags(self, SVC_RUNNING);
+	wake_up(&self->t_ctl_waitq);
+
+	CDEBUG(D_DLMTRACE, "%s: pool thread starting, process %d\n",
+		"ldlm_poold", current_pid());
+
+	for (;;) {
+		struct l_wait_info lwi;
+
+		/* Recalculate every pool on this tick. */
+		ldlm_pools_recalc(LDLM_NAMESPACE_SERVER);
+		ldlm_pools_recalc(LDLM_NAMESPACE_CLIENT);
+
+		/* Sleep until the next period, a stop request or an event. */
+		lwi = LWI_TIMEOUT(cfs_time_seconds(LDLM_POOLS_THREAD_PERIOD),
+				  NULL, NULL);
+		l_wait_event(self->t_ctl_waitq,
+			     thread_is_stopping(self) ||
+			     thread_is_event(self),
+			     &lwi);
+
+		if (thread_test_and_clear_flags(self, SVC_STOPPING))
+			break;
+
+		thread_test_and_clear_flags(self, SVC_EVENT);
+	}
+
+	thread_set_flags(self, SVC_STOPPED);
+	wake_up(&self->t_ctl_waitq);
+
+	CDEBUG(D_DLMTRACE, "%s: pool thread exiting, process %d\n",
+		"ldlm_poold", current_pid());
+
+	/* Signals ldlm_pools_comp, which ldlm_pools_thread_stop() waits on. */
+	complete_and_exit(&ldlm_pools_comp, 0);
+}
+
+/*
+ * Allocate the pools thread descriptor and start the "ldlm_poold" thread,
+ * then wait until it reports itself as running.
+ *
+ * Returns 0 on success, -EALREADY if a descriptor already exists, -ENOMEM
+ * if the descriptor cannot be allocated, or the kthread_run() error.
+ */
+static int ldlm_pools_thread_start(void)
+{
+	struct l_wait_info lwi = { 0 };
+	task_t *poold;
+	ENTRY;
+
+	if (ldlm_pools_thread != NULL) {
+		RETURN(-EALREADY);
+	}
+
+	OBD_ALLOC_PTR(ldlm_pools_thread);
+	if (ldlm_pools_thread == NULL) {
+		RETURN(-ENOMEM);
+	}
+
+	init_completion(&ldlm_pools_comp);
+	init_waitqueue_head(&ldlm_pools_thread->t_ctl_waitq);
+
+	poold = kthread_run(ldlm_pools_thread_main, ldlm_pools_thread,
+			    "ldlm_poold");
+	if (!IS_ERR(poold)) {
+		/* The new thread sets SVC_RUNNING and wakes this queue. */
+		l_wait_event(ldlm_pools_thread->t_ctl_waitq,
+			     thread_is_running(ldlm_pools_thread), &lwi);
+		RETURN(0);
+	}
+
+	/* Thread creation failed: drop the descriptor again. */
+	CERROR("Can't start pool thread, error %ld\n", PTR_ERR(poold));
+	OBD_FREE(ldlm_pools_thread, sizeof(*ldlm_pools_thread));
+	ldlm_pools_thread = NULL;
+	RETURN(PTR_ERR(poold));
+}
+
+/*
+ * Ask the pools thread to stop, wait for it to finish and free its
+ * descriptor. Does nothing when no thread descriptor exists.
+ */
+static void ldlm_pools_thread_stop(void)
+{
+	ENTRY;
+
+	if (ldlm_pools_thread != NULL) {
+		thread_set_flags(ldlm_pools_thread, SVC_STOPPING);
+		wake_up(&ldlm_pools_thread->t_ctl_waitq);
+
+		/*
+		 * The descriptor may only be freed once the thread has
+		 * completed ldlm_pools_comp; freeing earlier would let the
+		 * thread touch freed memory and oops.
+		 */
+		wait_for_completion(&ldlm_pools_comp);
+		OBD_FREE_PTR(ldlm_pools_thread);
+		ldlm_pools_thread = NULL;
+	}
+	EXIT;
+}
+
+/*
+ * Start the pools thread and, if that worked, register the server and
+ * client shrinkers. Returns the result of ldlm_pools_thread_start().
+ */
+int ldlm_pools_init(void)
+{
+	int rc;
+	ENTRY;
+
+	rc = ldlm_pools_thread_start();
+	if (rc != 0) {
+		RETURN(rc);
+	}
+
+	ldlm_pools_srv_shrinker = set_shrinker(DEFAULT_SEEKS,
+					       ldlm_pools_srv_shrink);
+	ldlm_pools_cli_shrinker = set_shrinker(DEFAULT_SEEKS,
+					       ldlm_pools_cli_shrink);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(ldlm_pools_init);
+
+/*
+ * Undo ldlm_pools_init(): unregister whichever shrinkers are registered,
+ * then stop the pools thread.
+ */
+void ldlm_pools_fini(void)
+{
+	if (ldlm_pools_srv_shrinker) {
+		remove_shrinker(ldlm_pools_srv_shrinker);
+		ldlm_pools_srv_shrinker = NULL;
+	}
+
+	if (ldlm_pools_cli_shrinker) {
+		remove_shrinker(ldlm_pools_cli_shrinker);
+		ldlm_pools_cli_shrinker = NULL;
+	}
+
+	ldlm_pools_thread_stop();
+}
+EXPORT_SYMBOL(ldlm_pools_fini);

diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_request.c b/drivers/staging/lustre/lustre/ldlm/ldlm_request.c
new file mode 100644
index 0000000..1a690ed
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ldlm/ldlm_request.c

@@ -0,0 +1,2333 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2010, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+/**
+ * This file contains Asynchronous System Trap (AST) handlers and related
+ * LDLM request-processing routines.
+ *
+ * An AST is a callback issued on a lock when its state is changed. There are
+ * several different types of ASTs (callbacks) registered for each lock:
+ *
+ * - completion AST: when a lock is enqueued by some process, but cannot be
+ *   granted immediately due to other conflicting locks on the same resource,
+ *   the completion AST is sent to notify the caller when the lock is
+ *   eventually granted
+ *
+ * - blocking AST: when a lock is granted to some process, if another process
+ *   enqueues a conflicting (blocking) lock on a resource, a blocking AST is
+ *   sent to notify the holder(s) of the lock(s) of the conflicting lock
+ *   request. The lock holder(s) must release their lock(s) on that resource in
+ *   a timely manner or be evicted by the server.
+ *
+ * - glimpse AST: this is used when a process wants information about a lock
+ *   (i.e. the lock value block (LVB)) but does not necessarily require holding
+ *   the lock. If the resource is locked, the lock holder(s) are sent glimpse
+ *   ASTs and the LVB is returned to the caller, and lock holder(s) may CANCEL
+ *   their lock(s) if they are idle. If the resource is not locked, the server
+ *   may grant the lock.
+ */
+
+#define DEBUG_SUBSYSTEM S_LDLM
+
+#include <lustre_dlm.h>
+#include <obd_class.h>
+#include <obd.h>
+
+#include "ldlm_internal.h"
+
+int ldlm_enqueue_min = OBD_TIMEOUT_DEFAULT;
+CFS_MODULE_PARM(ldlm_enqueue_min, "i", int, 0644,
+		"lock enqueue timeout minimum");
+
+/* On the client side: whether cached locks are canceled before replay. */
+unsigned int ldlm_cancel_unused_locks_before_replay = 1;
+
+/*
+ * Interrupt callback handed to LWI_INTR()/LWI_TIMEOUT_INTR() by
+ * ldlm_completion_ast(). Intentionally empty: nothing is done with \a data.
+ */
+static void interrupted_completion_wait(void *data)
+{
+}
+
+/*
+ * Context passed as callback data to the completion-wait callbacks
+ * (see ldlm_completion_ast() and ldlm_expired_completion_wait()).
+ */
+struct lock_wait_data {
+	/* lock whose completion is being waited for */
+	struct ldlm_lock *lwd_lock;
+	/* imp_conn_cnt of the import, sampled before going to sleep */
+	__u32	     lwd_conn_cnt;
+};
+
+/*
+ * NOTE(review): not referenced in the visible part of this file; presumably
+ * the per-request argument block of an asynchronous LDLM RPC -- confirm
+ * against its users.
+ */
+struct ldlm_async_args {
+	/* handle of the lock the request refers to (assumed -- TODO confirm) */
+	struct lustre_handle lock_handle;
+};
+
+/*
+ * Timeout callback for a completion wait; \a data is the struct
+ * lock_wait_data of the sleeper.
+ *
+ * A lock with a connection export fails its import with the connection
+ * count recorded in \a data and logs an error. A lock without one only gets
+ * warnings and, at most once per 300 seconds, a namespace dump (plus a
+ * debug log dump the first time). Always returns 0.
+ */
+int ldlm_expired_completion_wait(void *data)
+{
+	static cfs_time_t next_dump = 0, last_dump = 0;
+	struct lock_wait_data *wait = data;
+	struct ldlm_lock *lock = wait->lwd_lock;
+	ENTRY;
+
+	if (lock->l_conn_export != NULL) {
+		struct obd_device *obd = lock->l_conn_export->exp_obd;
+		struct obd_import *imp = obd->u.cli.cl_import;
+
+		ptlrpc_fail_import(imp, wait->lwd_conn_cnt);
+		LDLM_ERROR(lock, "lock timed out (enqueued at "CFS_TIME_T", "
+			  CFS_DURATION_T"s ago), entering recovery for %s@%s",
+			  lock->l_last_activity,
+			  cfs_time_sub(cfs_time_current_sec(),
+				       lock->l_last_activity),
+			  obd2cli_tgt(obd),
+			  imp->imp_connection->c_remote_uuid.uuid);
+
+		RETURN(0);
+	}
+
+	/* No export on the lock: just report and keep waiting. */
+	if (ptlrpc_check_suspend()) {
+		RETURN(0);
+	}
+
+	LCONSOLE_WARN("lock timed out (enqueued at "CFS_TIME_T", "
+		      CFS_DURATION_T"s ago)\n",
+		      lock->l_last_activity,
+		      cfs_time_sub(cfs_time_current_sec(),
+				   lock->l_last_activity));
+	LDLM_DEBUG(lock, "lock timed out (enqueued at "CFS_TIME_T", "
+		   CFS_DURATION_T"s ago); not entering recovery in "
+		   "server code, just going back to sleep",
+		   lock->l_last_activity,
+		   cfs_time_sub(cfs_time_current_sec(),
+				lock->l_last_activity));
+
+	/* Rate-limit the expensive dumps. */
+	if (cfs_time_after(cfs_time_current(), next_dump)) {
+		last_dump = next_dump;
+		next_dump = cfs_time_shift(300);
+		ldlm_namespace_dump(D_DLMTRACE, ldlm_lock_to_ns(lock));
+		if (last_dump == 0)
+			libcfs_debug_dumplog();
+	}
+	RETURN(0);
+}
+EXPORT_SYMBOL(ldlm_expired_completion_wait);
+
+/*
+ * Enqueue timeout for \a lock. The same basis serves server-side and
+ * client-side callers on a node.
+ *
+ * When AT_OFF is set the result is obd_timeout / 2. Otherwise it is 150% of
+ * the namespace's current estimate, capped at at_max, but never less than
+ * ldlm_enqueue_min.
+ */
+int ldlm_get_enq_timeout(struct ldlm_lock *lock)
+{
+	int estimate = at_get(ldlm_lock_to_ns_at(lock));
+	int padded;
+
+	if (AT_OFF)
+		return obd_timeout / 2;
+
+	/*
+	 * These timeouts are not updated while waiting, so be conservative.
+	 * An "early reply" mechanism for lock callbacks would be nicer...
+	 */
+	padded = estimate + (estimate >> 1);	/* 150% */
+	estimate = min_t(int, at_max, padded);
+	return max(estimate, ldlm_enqueue_min);
+}
+EXPORT_SYMBOL(ldlm_get_enq_timeout);
+
+/**
+ * Common tail of the completion ASTs, run once the wait is over.
+ *
+ * Returns -EIO for a destroyed or failed lock. Otherwise feeds the time
+ * elapsed since l_last_activity into the namespace's timeout estimate and
+ * returns 0.
+ */
+static int ldlm_completion_tail(struct ldlm_lock *lock)
+{
+	long waited;
+
+	if (lock->l_destroyed || (lock->l_flags & LDLM_FL_FAILED)) {
+		LDLM_DEBUG(lock, "client-side enqueue: destroyed");
+		return -EIO;
+	}
+
+	waited = cfs_time_sub(cfs_time_current_sec(), lock->l_last_activity);
+	LDLM_DEBUG(lock, "client-side enqueue: granted after "
+		   CFS_DURATION_T"s", waited);
+
+	/* Update our time estimate */
+	at_measured(ldlm_lock_to_ns_at(lock), waited);
+	return 0;
+}
+
+/**
+ * Non-blocking ->l_completion_ast() implementation for a client: it never
+ * waits for the lock to be granted. Suitable for locks enqueued through
+ * ptlrpcd, or other threads that cannot block for long.
+ *
+ * \a data is not used.
+ */
+int ldlm_completion_ast_async(struct ldlm_lock *lock, __u64 flags, void *data)
+{
+	ENTRY;
+
+	if (flags == LDLM_FL_WAIT_NOREPROC) {
+		LDLM_DEBUG(lock, "client-side enqueue waiting on pending lock");
+		RETURN(0);
+	}
+
+	if (flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED |
+		     LDLM_FL_BLOCK_CONV)) {
+		/* Still blocked: reprocess the resource and carry on. */
+		LDLM_DEBUG(lock, "client-side enqueue returned a blocked lock, "
+			   "going forward");
+		ldlm_reprocess_all(lock->l_resource);
+		RETURN(0);
+	}
+
+	/* Not blocked: wake any sleepers and finish the bookkeeping. */
+	wake_up(&lock->l_waitq);
+	RETURN(ldlm_completion_tail(lock));
+}
+EXPORT_SYMBOL(ldlm_completion_ast_async);
+
+/**
+ * Generic LDLM "completion" AST. This is called in several cases:
+ *
+ *     - when a reply to an ENQUEUE RPC is received from the server
+ *       (ldlm_cli_enqueue_fini()). Lock might be granted or not granted at
+ *       this point (determined by flags);
+ *
+ *     - when LDLM_CP_CALLBACK RPC comes to client to notify it that lock has
+ *       been granted;
+ *
+ *     - when ldlm_lock_match(LDLM_FL_LVB_READY) is about to wait until lock
+ *       gets correct lvb;
+ *
+ *     - to force all locks when resource is destroyed (cleanup_resource());
+ *
+ *     - during lock conversion (not used currently).
+ *
+ * If lock is not granted in the first case, this function waits until second
+ * or penultimate cases happen in some other thread.
+ *
+ */
+int ldlm_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data)
+{
+	/* XXX ALLOCATE - 160 bytes */
+	struct lock_wait_data lwd;
+	struct obd_device *obd;
+	struct obd_import *imp = NULL;
+	struct l_wait_info lwi;
+	__u32 timeout;
+	int rc = 0;
+	ENTRY;
+
+	/* \a data is not used by this function. */
+
+	if (flags == LDLM_FL_WAIT_NOREPROC) {
+		LDLM_DEBUG(lock, "client-side enqueue waiting on pending lock");
+		goto noreproc;
+	}
+
+	/* No blocking flag set: nothing to wait for, just wake sleepers. */
+	if (!(flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED |
+		       LDLM_FL_BLOCK_CONV))) {
+		wake_up(&lock->l_waitq);
+		RETURN(0);
+	}
+
+	LDLM_DEBUG(lock, "client-side enqueue returned a blocked lock, "
+		   "sleeping");
+
+noreproc:
+
+	obd = class_exp2obd(lock->l_conn_export);
+
+	/* if this is a local lock, then there is no import */
+	if (obd != NULL) {
+		imp = obd->u.cli.cl_import;
+	}
+
+	/* Wait a long time for enqueue - server may have to callback a
+	   lock from another client.  Server will evict the other client if it
+	   doesn't respond reasonably, and then give us the lock. */
+	timeout = ldlm_get_enq_timeout(lock) * 2;
+
+	/* lwd is the callback data of both wait callbacks set up below. */
+	lwd.lwd_lock = lock;
+
+	if (lock->l_flags & LDLM_FL_NO_TIMEOUT) {
+		LDLM_DEBUG(lock, "waiting indefinitely because of NO_TIMEOUT");
+		lwi = LWI_INTR(interrupted_completion_wait, &lwd);
+	} else {
+		lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(timeout),
+				       ldlm_expired_completion_wait,
+				       interrupted_completion_wait, &lwd);
+	}
+
+	/*
+	 * Sample the connection count under imp_lock; the timeout callback
+	 * passes it to ptlrpc_fail_import(). lwd.lwd_conn_cnt is left unset
+	 * when there is no import.
+	 */
+	if (imp != NULL) {
+		spin_lock(&imp->imp_lock);
+		lwd.lwd_conn_cnt = imp->imp_conn_cnt;
+		spin_unlock(&imp->imp_lock);
+	}
+
+	/*
+	 * NOTE(review): the OBD_FAIL_* branch looks like a fault-injection
+	 * hook that simulates an interrupted wait -- confirm.
+	 */
+	if (ns_is_client(ldlm_lock_to_ns(lock)) &&
+	    OBD_FAIL_CHECK_RESET(OBD_FAIL_LDLM_INTR_CP_AST,
+				 OBD_FAIL_LDLM_CP_BL_RACE | OBD_FAIL_ONCE)) {
+		lock->l_flags |= LDLM_FL_FAIL_LOC;
+		rc = -EINTR;
+	} else {
+		/* Go to sleep until the lock is granted or cancelled. */
+		rc = l_wait_event(lock->l_waitq,
+				  is_granted_or_cancelled(lock), &lwi);
+	}
+
+	if (rc) {
+		LDLM_DEBUG(lock, "client-side enqueue waking up: failed (%d)",
+			   rc);
+		RETURN(rc);
+	}
+
+	/* Wait succeeded: update timing and report -EIO for dead locks. */
+	RETURN(ldlm_completion_tail(lock));
+}
+EXPORT_SYMBOL(ldlm_completion_ast);
+
+/**
+ * Building block for blocking AST functions.
+ *
+ * Performs the operation common to blocking ASTs: deferred lock
+ * cancellation. The lock is marked LDLM_FL_CBPENDING; if it has neither
+ * readers nor writers it is cancelled right away (asynchronously),
+ * otherwise cancellation is left for later. The function calls
+ * unlock_res_and_lock() itself without taking the lock first, so the caller
+ * must enter with resource and lock locked, as ldlm_blocking_ast() does.
+ *
+ * \param lock the lock blocking or canceling AST was called on
+ * \retval 0
+ * \see mdt_blocking_ast
+ * \see ldlm_blocking_ast
+ */
+int ldlm_blocking_ast_nocheck(struct ldlm_lock *lock)
+{
+	struct lustre_handle lockh;
+	int unused;
+	int rc;
+	ENTRY;
+
+	lock->l_flags |= LDLM_FL_CBPENDING;
+	/* Must be sampled before the lock is dropped. */
+	unused = (!lock->l_readers && !lock->l_writers);
+	unlock_res_and_lock(lock);
+
+	if (!unused) {
+		LDLM_DEBUG(lock, "Lock still has references, will be "
+			   "cancelled later");
+		RETURN(0);
+	}
+
+	LDLM_DEBUG(lock, "already unused, calling ldlm_cli_cancel");
+	ldlm_lock2handle(lock, &lockh);
+	rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
+	if (rc < 0)
+		CERROR("ldlm_cli_cancel: %d\n", rc);
+
+	RETURN(0);
+}
+EXPORT_SYMBOL(ldlm_blocking_ast_nocheck);
+
+/**
+ * Server blocking AST
+ *
+ * The ->l_blocking_ast() callback installed on LDLM locks that server-side
+ * OBDs acquire.
+ *
+ * \param lock the lock which blocks a request or cancelling lock
+ * \param desc unused
+ * \param data unused
+ * \param flag indicates whether this cancelling or blocking callback
+ * \retval 0
+ * \see ldlm_blocking_ast_nocheck
+ */
+int ldlm_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
+		      void *data, int flag)
+{
+	ENTRY;
+
+	/* A cancelling callback needs no work here. */
+	if (flag == LDLM_CB_CANCELING) {
+		RETURN(0);
+	}
+
+	lock_res_and_lock(lock);
+	/*
+	 * We may race with intent_policy: if this function was invoked just
+	 * before the intent_policy method took the lr_lock, then by the time
+	 * we hold the lock we might no longer be its blocking function.
+	 * Only proceed when we still are.
+	 */
+	if (lock->l_blocking_ast == ldlm_blocking_ast) {
+		/* Drops the resource/lock lock on our behalf. */
+		RETURN(ldlm_blocking_ast_nocheck(lock));
+	}
+
+	unlock_res_and_lock(lock);
+	RETURN(0);
+}
+EXPORT_SYMBOL(ldlm_blocking_ast);
+
+/**
+ * ->l_glimpse_ast() for DLM extent locks that are acquired on the
+ * server-side; the comment in filter_intent_policy() explains why this may
+ * be needed. Both arguments are ignored.
+ */
+int ldlm_glimpse_ast(struct ldlm_lock *lock, void *reqp)
+{
+	/*
+	 * Answering -ELDLM_NO_LOCK_DATA does work, for a rather subtle
+	 * reason. With OST-side locking it can happen that _all_ extent
+	 * locks are held by the OST. A client that wants the current file
+	 * size calls ll{,u}_glimpse_size(); since the locks live on the
+	 * server, this dummy glimpse callback fires and does nothing. The
+	 * client nevertheless gets the right size because of this fragment
+	 * in filter_intent_policy():
+	 *
+	 * rc = l->l_glimpse_ast(l, NULL); // this will update the LVB
+	 * if (rc != 0 && res->lr_namespace->ns_lvbo &&
+	 *     res->lr_namespace->ns_lvbo->lvbo_update) {
+	 *	 res->lr_namespace->ns_lvbo->lvbo_update(res, NULL, 0, 1);
+	 * }
+	 *
+	 * i.e. once glimpse_ast() has failed, filter_lvbo_update() runs and
+	 * hands the correct file size back to the client.
+	 */
+	int rc = -ELDLM_NO_LOCK_DATA;
+
+	return rc;
+}
+EXPORT_SYMBOL(ldlm_glimpse_ast);
+
+/**
+ * Enqueue a local lock (typically on a server).
+ *
+ * Creates a new lock in \a ns, takes a \a mode reference on it, marks it
+ * LDLM_FL_LOCAL and runs it through ldlm_lock_enqueue(). On success the
+ * completion AST (if any) is invoked; its return value is ignored.
+ *
+ * \param ns            namespace; must not be a client namespace (LBUG)
+ * \param res_id        resource to lock
+ * \param type          lock type
+ * \param policy        in/out policy data; may be NULL, except for
+ *                      LDLM_EXTENT locks, whose requested extent is taken
+ *                      from it (LBUG if missing)
+ * \param mode          requested lock mode
+ * \param flags         in/out enqueue flags; LDLM_FL_REPLAY must be clear
+ * \param blocking      blocking AST for the new lock
+ * \param completion    completion AST for the new lock
+ * \param glimpse       glimpse AST for the new lock
+ * \param data          passed to ldlm_lock_create()
+ * \param lvb_len       passed to ldlm_lock_create()
+ * \param lvb_type      passed to ldlm_lock_create()
+ * \param client_cookie optional; copied into l_client_cookie when set
+ * \param lockh         out: handle of the created lock
+ *
+ * \retval ELDLM_OK     lock enqueued
+ * \retval -ENOMEM      lock could not be created
+ * \retval other        error returned by ldlm_lock_enqueue()
+ */
+int ldlm_cli_enqueue_local(struct ldlm_namespace *ns,
+			   const struct ldlm_res_id *res_id,
+			   ldlm_type_t type, ldlm_policy_data_t *policy,
+			   ldlm_mode_t mode, __u64 *flags,
+			   ldlm_blocking_callback blocking,
+			   ldlm_completion_callback completion,
+			   ldlm_glimpse_callback glimpse,
+			   void *data, __u32 lvb_len, enum lvb_type lvb_type,
+			   const __u64 *client_cookie,
+			   struct lustre_handle *lockh)
+{
+	struct ldlm_lock *lock;
+	int err;
+	const struct ldlm_callback_suite cbs = { .lcs_completion = completion,
+						 .lcs_blocking   = blocking,
+						 .lcs_glimpse    = glimpse,
+	};
+	ENTRY;
+
+	LASSERT(!(*flags & LDLM_FL_REPLAY));
+	if (unlikely(ns_is_client(ns))) {
+		CERROR("Trying to enqueue local lock in a shadow namespace\n");
+		LBUG();
+	}
+
+	/*
+	 * An extent lock copies its requested extent out of *policy below.
+	 * Catch a missing policy explicitly instead of oopsing on a NULL
+	 * dereference after the lock has already been created.
+	 */
+	if (unlikely(type == LDLM_EXTENT && policy == NULL)) {
+		CERROR("Trying to enqueue local extent lock without policy\n");
+		LBUG();
+	}
+
+	lock = ldlm_lock_create(ns, res_id, type, mode, &cbs, data, lvb_len,
+				lvb_type);
+	if (unlikely(!lock))
+		GOTO(out_nolock, err = -ENOMEM);
+
+	ldlm_lock2handle(lock, lockh);
+
+	/* NB: we don't have any lock now (lock_res_and_lock)
+	 * because it's a new lock */
+	ldlm_lock_addref_internal_nolock(lock, mode);
+	lock->l_flags |= LDLM_FL_LOCAL;
+	if (*flags & LDLM_FL_ATOMIC_CB)
+		lock->l_flags |= LDLM_FL_ATOMIC_CB;
+
+	if (policy != NULL)
+		lock->l_policy_data = *policy;
+	if (client_cookie != NULL)
+		lock->l_client_cookie = *client_cookie;
+	/* policy is known to be non-NULL for extent locks, see above */
+	if (type == LDLM_EXTENT)
+		lock->l_req_extent = policy->l_extent;
+
+	err = ldlm_lock_enqueue(ns, &lock, policy, flags);
+	if (unlikely(err != ELDLM_OK))
+		GOTO(out, err);
+
+	/* Hand the policy stored on the lock back to the caller. */
+	if (policy != NULL)
+		*policy = lock->l_policy_data;
+
+	if (lock->l_completion_ast)
+		lock->l_completion_ast(lock, *flags, NULL);
+
+	LDLM_DEBUG(lock, "client-side local enqueue handler, new lock created");
+	EXIT;
+ out:
+	LDLM_LOCK_RELEASE(lock);
+ out_nolock:
+	return err;
+}
+EXPORT_SYMBOL(ldlm_cli_enqueue_local);
+
+/*
+ * Clean up after a failed client enqueue of \a lock: flag the lock so that
+ * no CANCEL is sent for it, drop the \a mode reference, and destroy the
+ * lock outright if it is a flock lock. \a ns is not used.
+ */
+static void failed_lock_cleanup(struct ldlm_namespace *ns,
+				struct ldlm_lock *lock, int mode)
+{
+	int flagged;
+
+	/* Set a flag to prevent us from sending a CANCEL (bug 407) */
+	lock_res_and_lock(lock);
+	/* We might race: only act if the lock is neither granted nor failed. */
+	flagged = (lock->l_req_mode != lock->l_granted_mode) &&
+		  !(lock->l_flags & LDLM_FL_FAILED);
+	if (flagged) {
+		/* Make sure that this lock will not be found by raced
+		 * bl_ast and -EINVAL reply is sent to server anyways.
+		 * bug 17645 */
+		lock->l_flags |= LDLM_FL_LOCAL_ONLY | LDLM_FL_FAILED |
+				 LDLM_FL_ATOMIC_CB | LDLM_FL_CBPENDING;
+	}
+	unlock_res_and_lock(lock);
+
+	if (!flagged) {
+		LDLM_DEBUG(lock, "lock was granted or failed in race");
+	} else {
+		LDLM_DEBUG(lock,
+			   "setting FL_LOCAL_ONLY | LDLM_FL_FAILED | "
+			   "LDLM_FL_ATOMIC_CB | LDLM_FL_CBPENDING");
+	}
+
+	ldlm_lock_decref_internal(lock, mode);
+
+	/* XXX - HACK because we shouldn't call ldlm_lock_destroy()
+	 *       from llite/file.c/ll_file_flock(). */
+	/* Flock locks have no blocking handler on a client, so failed ones
+	 * (interrupted, or still waiting to be granted when the server
+	 * evicted us) must be killed completely right here. */
+	if (lock->l_resource->lr_type != LDLM_FLOCK)
+		return;
+
+	lock_res_and_lock(lock);
+	ldlm_resource_unlink_lock(lock);
+	ldlm_lock_destroy_nolock(lock);
+	unlock_res_and_lock(lock);
+}
+
+/**
+ * Finishing portion of client lock enqueue code.
+ *
+ * Called after receiving reply from server.
+ *
+ * \param exp         export; the namespace is taken from its obd
+ * \param req         request whose reply (DLM reply and LVB) is read
+ * \param type        lock type
+ * \param with_policy if set, policy data from the reply is converted into
+ *                    the lock when the server changed the lock
+ * \param mode        lock mode, passed to failed_lock_cleanup() on failure
+ * \param flags       in: LDLM_FL_REPLAY is checked; out: overwritten with
+ *                    the flags from the reply
+ * \param lvb         caller's LVB buffer; must be set if \a lvb_len != 0
+ * \param lvb_len     size of \a lvb; if non-zero it must equal l_lvb_len
+ * \param lockh       handle of the lock being enqueued
+ * \param rc          result of the enqueue so far
+ *
+ * \retval rc as updated while processing the reply; -ENOLCK if \a lockh no
+ *         longer resolves to a lock
+ */
+int ldlm_cli_enqueue_fini(struct obd_export *exp, struct ptlrpc_request *req,
+			  ldlm_type_t type, __u8 with_policy, ldlm_mode_t mode,
+			  __u64 *flags, void *lvb, __u32 lvb_len,
+			  struct lustre_handle *lockh,int rc)
+{
+	struct ldlm_namespace *ns = exp->exp_obd->obd_namespace;
+	int is_replay = *flags & LDLM_FL_REPLAY;
+	struct ldlm_lock *lock;
+	struct ldlm_reply *reply;
+	/* 1: run failed_lock_cleanup() at the cleanup label if rc != 0 */
+	int cleanup_phase = 1;
+	int size = 0;
+	ENTRY;
+
+	lock = ldlm_handle2lock(lockh);
+	/* ldlm_cli_enqueue is holding a reference on this lock. */
+	if (!lock) {
+		LASSERT(type == LDLM_FLOCK);
+		RETURN(-ENOLCK);
+	}
+
+	LASSERTF(ergo(lvb_len != 0, lvb_len == lock->l_lvb_len),
+		 "lvb_len = %d, l_lvb_len = %d\n", lvb_len, lock->l_lvb_len);
+
+	/* Any error other than ELDLM_LOCK_ABORTED goes straight to cleanup. */
+	if (rc != ELDLM_OK) {
+		LASSERT(!is_replay);
+		LDLM_DEBUG(lock, "client-side enqueue END (%s)",
+			   rc == ELDLM_LOCK_ABORTED ? "ABORTED" : "FAILED");
+
+		if (rc != ELDLM_LOCK_ABORTED)
+			GOTO(cleanup, rc);
+	}
+
+	/* Before we return, swab the reply */
+	reply = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP);
+	if (reply == NULL)
+		GOTO(cleanup, rc = -EPROTO);
+
+	/* Validate the size of the LVB in the reply against the caller's. */
+	if (lvb_len != 0) {
+		LASSERT(lvb != NULL);
+
+		size = req_capsule_get_size(&req->rq_pill, &RMF_DLM_LVB,
+					    RCL_SERVER);
+		if (size < 0) {
+			LDLM_ERROR(lock, "Fail to get lvb_len, rc = %d", size);
+			GOTO(cleanup, rc = size);
+		} else if (unlikely(size > lvb_len)) {
+			LDLM_ERROR(lock, "Replied LVB is larger than "
+				   "expectation, expected = %d, replied = %d",
+				   lvb_len, size);
+			GOTO(cleanup, rc = -EINVAL);
+		}
+	}
+
+	/*
+	 * Aborted enqueue: fill the caller's buffer directly from the reply
+	 * and leave with ELDLM_LOCK_ABORTED unless filling failed.
+	 */
+	if (rc == ELDLM_LOCK_ABORTED) {
+		if (lvb_len != 0)
+			rc = ldlm_fill_lvb(lock, &req->rq_pill, RCL_SERVER,
+					   lvb, size);
+		GOTO(cleanup, rc = (rc != 0 ? rc : ELDLM_LOCK_ABORTED));
+	}
+
+	/* lock enqueued on the server */
+	cleanup_phase = 0;
+
+	lock_res_and_lock(lock);
+	/* Key change rehash lock in per-export hash with new key */
+	if (exp->exp_lock_hash) {
+		/* In the function below, .hs_keycmp resolves to
+		 * ldlm_export_lock_keycmp() */
+		/* coverity[overrun-buffer-val] */
+		cfs_hash_rehash_key(exp->exp_lock_hash,
+				    &lock->l_remote_handle,
+				    &reply->lock_handle,
+				    &lock->l_exp_hash);
+	} else {
+		lock->l_remote_handle = reply->lock_handle;
+	}
+
+	/* From here on *flags carries the flags the server replied with. */
+	*flags = ldlm_flags_from_wire(reply->lock_flags);
+	lock->l_flags |= ldlm_flags_from_wire(reply->lock_flags &
+					      LDLM_INHERIT_FLAGS);
+	/* move NO_TIMEOUT flag to the lock to force ldlm_lock_match()
+	 * to wait with no timeout as well */
+	lock->l_flags |= ldlm_flags_from_wire(reply->lock_flags &
+					      LDLM_FL_NO_TIMEOUT);
+	unlock_res_and_lock(lock);
+
+	CDEBUG(D_INFO, "local: %p, remote cookie: "LPX64", flags: 0x%llx\n",
+	       lock, reply->lock_handle.cookie, *flags);
+
+	/* If enqueue returned a blocked lock but the completion handler has
+	 * already run, then it fixed up the resource and we don't need to do it
+	 * again. */
+	if ((*flags) & LDLM_FL_LOCK_CHANGED) {
+		int newmode = reply->lock_desc.l_req_mode;
+		LASSERT(!is_replay);
+		if (newmode && newmode != lock->l_req_mode) {
+			LDLM_DEBUG(lock, "server returned different mode %s",
+				   ldlm_lockname[newmode]);
+			lock->l_req_mode = newmode;
+		}
+
+		/* The server granted the lock on a different resource. */
+		if (memcmp(reply->lock_desc.l_resource.lr_name.name,
+			  lock->l_resource->lr_name.name,
+			  sizeof(struct ldlm_res_id))) {
+			CDEBUG(D_INFO, "remote intent success, locking "
+					"(%ld,%ld,%ld) instead of "
+					"(%ld,%ld,%ld)\n",
+			      (long)reply->lock_desc.l_resource.lr_name.name[0],
+			      (long)reply->lock_desc.l_resource.lr_name.name[1],
+			      (long)reply->lock_desc.l_resource.lr_name.name[2],
+			      (long)lock->l_resource->lr_name.name[0],
+			      (long)lock->l_resource->lr_name.name[1],
+			      (long)lock->l_resource->lr_name.name[2]);
+
+			rc = ldlm_lock_change_resource(ns, lock,
+					&reply->lock_desc.l_resource.lr_name);
+			/* cleanup_phase is 0 here, so failed_lock_cleanup()
+			 * is not run for this failure. */
+			if (rc || lock->l_resource == NULL)
+				GOTO(cleanup, rc = -ENOMEM);
+			LDLM_DEBUG(lock, "client-side enqueue, new resource");
+		}
+		/* IBITS policy is only converted if the export negotiated
+		 * OBD_CONNECT_IBITS. */
+		if (with_policy)
+			if (!(type == LDLM_IBITS &&
+			      !(exp_connect_flags(exp) & OBD_CONNECT_IBITS)))
+				/* We assume lock type cannot change on server*/
+				ldlm_convert_policy_to_local(exp,
+						lock->l_resource->lr_type,
+						&reply->lock_desc.l_policy_data,
+						&lock->l_policy_data);
+		if (type != LDLM_PLAIN)
+			LDLM_DEBUG(lock,"client-side enqueue, new policy data");
+	}
+
+	if ((*flags) & LDLM_FL_AST_SENT ||
+	    /* Cancel extent locks as soon as possible on a liblustre client,
+	     * because it cannot handle asynchronous ASTs robustly (see
+	     * bug 7311). */
+	    (LIBLUSTRE_CLIENT && type == LDLM_EXTENT)) {
+		lock_res_and_lock(lock);
+		lock->l_flags |= LDLM_FL_CBPENDING |  LDLM_FL_BL_AST;
+		unlock_res_and_lock(lock);
+		LDLM_DEBUG(lock, "enqueue reply includes blocking AST");
+	}
+
+	/* If the lock has already been granted by a completion AST, don't
+	 * clobber the LVB with an older one. */
+	if (lvb_len != 0) {
+		/* We must lock or a racing completion might update lvb without
+		 * letting us know and we'll clobber the correct value.
+		 * Cannot unlock after the check either, a that still leaves
+		 * a tiny window for completion to get in */
+		lock_res_and_lock(lock);
+		if (lock->l_req_mode != lock->l_granted_mode)
+			rc = ldlm_fill_lvb(lock, &req->rq_pill, RCL_SERVER,
+					   lock->l_lvb_data, size);
+		unlock_res_and_lock(lock);
+		if (rc < 0) {
+			cleanup_phase = 1;
+			GOTO(cleanup, rc);
+		}
+	}
+
+	/* A replayed lock is not enqueued locally again. */
+	if (!is_replay) {
+		rc = ldlm_lock_enqueue(ns, &lock, NULL, flags);
+		if (lock->l_completion_ast != NULL) {
+			int err = lock->l_completion_ast(lock, *flags, NULL);
+			/* keep the first error seen */
+			if (!rc)
+				rc = err;
+			if (rc)
+				cleanup_phase = 1;
+		}
+	}
+
+	if (lvb_len && lvb != NULL) {
+		/* Copy the LVB here, and not earlier, because the completion
+		 * AST (if any) can override what we got in the reply */
+		memcpy(lvb, lock->l_lvb_data, lvb_len);
+	}
+
+	LDLM_DEBUG(lock, "client-side enqueue END");
+	EXIT;
+cleanup:
+	if (cleanup_phase == 1 && rc)
+		failed_lock_cleanup(ns, lock, mode);
+	/* Put lock 2 times, the second reference is held by ldlm_cli_enqueue */
+	LDLM_LOCK_PUT(lock);
+	LDLM_LOCK_RELEASE(lock);
+	return rc;
+}
+EXPORT_SYMBOL(ldlm_cli_enqueue_fini);
+
+/**
+ * Estimate how many lock handles fit into a request of \a req_size bytes.
+ *
+ * The byte budget is min(LDLM_MAXREQSIZE, PAGE_CACHE_SIZE - 512); the 512
+ * bytes are kept free so that TCP/IP and LNET headers fit into a single
+ * page on the send/receive side. XXX: 512 should be changed to a more
+ * adequate value. LDLM_LOCKREQ_HANDLES is added to and \a off subtracted
+ * from the handle count derived from that budget.
+ */
+static inline int ldlm_req_handles_avail(int req_size, int off)
+{
+	int room = min_t(int, LDLM_MAXREQSIZE, PAGE_CACHE_SIZE - 512) -
+		   req_size;
+	int handles = 0;
+
+	/* A request that is already over budget leaves no extra room. */
+	if (likely(room >= 0))
+		handles = room / (int)sizeof(struct lustre_handle);
+
+	return handles + (LDLM_LOCKREQ_HANDLES - off);
+}
+
+/*
+ * Same estimate as ldlm_req_handles_avail(), with the request size taken
+ * from the current message size of capsule \a pill at location \a loc.
+ */
+static inline int ldlm_capsule_handles_avail(struct req_capsule *pill,
+					     enum req_location loc,
+					     int off)
+{
+	return ldlm_req_handles_avail(req_capsule_msg_size(pill, loc), off);
+}
+
+/*
+ * Same estimate as ldlm_req_handles_avail(), with the request size derived
+ * from request format \a fmt at location \a loc and the message magic of
+ * import \a imp.
+ */
+static inline int ldlm_format_handles_avail(struct obd_import *imp,
+					    const struct req_format *fmt,
+					    enum req_location loc, int off)
+{
+	return ldlm_req_handles_avail(
+		req_capsule_fmt_size(imp->imp_msg_magic, fmt, loc), off);
+}
+
+/**
+ * Cancel LRU locks and pack them into the enqueue request, together with the
+ * \a count locks already queued on \a cancels.
+ *
+ * This is to be called by functions preparing their own requests that
+ * might contain lists of locks to cancel in addition to actual operation
+ * that needs to be performed.
+ */
+int ldlm_prep_elc_req(struct obd_export *exp, struct ptlrpc_request *req,
+		      int version, int opc, int canceloff,
+		      struct list_head *cancels, int count)
+{
+	struct ldlm_namespace   *ns = exp->exp_obd->obd_namespace;
+	struct req_capsule      *pill = &req->rq_pill;
+	struct ldlm_request     *dlm = NULL;
+	int flags, avail, to_free, pack = 0;
+	LIST_HEAD(head);
+	int rc;
+	ENTRY;
+
+	if (cancels == NULL)
+		cancels = &head;
+	if (ns_connect_cancelset(ns)) {
+		/* Estimate the amount of available space in the request. */
+		req_capsule_filled_sizes(pill, RCL_CLIENT);
+		avail = ldlm_capsule_handles_avail(pill, RCL_CLIENT, canceloff);
+
+		flags = ns_connect_lru_resize(ns) ?
+			LDLM_CANCEL_LRUR : LDLM_CANCEL_AGED;
+		to_free = !ns_connect_lru_resize(ns) &&
+			  opc == LDLM_ENQUEUE ? 1 : 0;
+
+		/* Cancel LRU locks here _only_ if the server supports
+		 * EARLY_CANCEL. Otherwise we have to send extra CANCEL
+		 * RPC, which will make us slower. */
+		if (avail > count)
+			count += ldlm_cancel_lru_local(ns, cancels, to_free,
+						       avail - count, 0, flags);
+		if (avail > count)
+			pack = count;
+		else
+			pack = avail;
+		req_capsule_set_size(pill, &RMF_DLM_REQ, RCL_CLIENT,
+				     ldlm_request_bufsize(pack, opc));
+	}
+
+	rc = ptlrpc_request_pack(req, version, opc);
+	if (rc) {
+		ldlm_lock_list_put(cancels, l_bl_ast, count);
+		RETURN(rc);
+	}
+
+	if (ns_connect_cancelset(ns)) {
+		if (canceloff) {
+			dlm = req_capsule_client_get(pill, &RMF_DLM_REQ);
+			LASSERT(dlm);
+			/* Skip first lock handler in ldlm_request_pack(),
+			 * this method will incrment @lock_count according
+			 * to the lock handle amount actually written to
+			 * the buffer. */
+			dlm->lock_count = canceloff;
+		}
+		/* Pack into the request @pack lock handles. */
+		ldlm_cli_cancel_list(cancels, pack, req, 0);
+		/* Prepare and send separate cancel RPC for others. */
+		ldlm_cli_cancel_list(cancels, count - pack, NULL, 0);
+	} else {
+		ldlm_lock_list_put(cancels, l_bl_ast, count);
+	}
+	RETURN(0);
+}
+EXPORT_SYMBOL(ldlm_prep_elc_req);
+
+/**
+ * Prepare an enqueue request: ldlm_prep_elc_req() called with
+ * LUSTRE_DLM_VERSION, the LDLM_ENQUEUE opcode and LDLM_ENQUEUE_CANCEL_OFF
+ * as the cancel offset. Returns whatever ldlm_prep_elc_req() returns.
+ */
+int ldlm_prep_enqueue_req(struct obd_export *exp, struct ptlrpc_request *req,
+			  struct list_head *cancels, int count)
+{
+	int rc;
+
+	rc = ldlm_prep_elc_req(exp, req, LUSTRE_DLM_VERSION, LDLM_ENQUEUE,
+			       LDLM_ENQUEUE_CANCEL_OFF, cancels, count);
+
+	return rc;
+}
+EXPORT_SYMBOL(ldlm_prep_enqueue_req);
+
+/**
+ * Allocate an RQF_LDLM_ENQUEUE request on the client import of \a exp,
+ * prepare it with no locks to cancel, and size the server-side
+ * RMF_DLM_LVB buffer to \a lvb_len.
+ *
+ * \retval the prepared request on success
+ * \retval ERR_PTR(-ENOMEM) if the request could not be allocated
+ * \retval ERR_PTR(rc) if ldlm_prep_enqueue_req() failed with \a rc; the
+ *	    request is freed in that case
+ */
+struct ptlrpc_request *ldlm_enqueue_pack(struct obd_export *exp, int lvb_len)
+{
+	struct ptlrpc_request *req;
+	int rc;
+	ENTRY;
+
+	req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_LDLM_ENQUEUE);
+	if (!req)
+		RETURN(ERR_PTR(-ENOMEM));
+
+	rc = ldlm_prep_enqueue_req(exp, req, NULL, 0);
+	if (rc == 0) {
+		req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER,
+				     lvb_len);
+		ptlrpc_request_set_replen(req);
+		RETURN(req);
+	}
+
+	ptlrpc_request_free(req);
+	RETURN(ERR_PTR(rc));
+}
+EXPORT_SYMBOL(ldlm_enqueue_pack);
+
+/**
+ * Client-side lock enqueue.
+ *
+ * If a request has some specific initialisation it is passed in \a reqp,
+ * otherwise it is created in ldlm_cli_enqueue.
+ *
+ * Supports sync and async requests, pass \a async flag accordingly. If a
+ * request was created in ldlm_cli_enqueue and it is the async request,
+ * pass it to the caller in \a reqp.
+ *
+ * \param exp       export the lock is enqueued through; must not be NULL
+ * \param reqp      in/out request pointer; NULL or pointing to NULL means
+ *		     "allocate the request here"
+ * \param einfo     lock type, mode, callbacks and callback data
+ * \param res_id    resource to lock (used when a new lock is created)
+ * \param policy    lock policy data; may be NULL, except for LDLM_EXTENT
+ *		     locks, where it is required
+ * \param flags     in/out lock flags; LDLM_FL_REPLAY selects the replay
+ *		     path, which looks the lock up by \a lockh instead of
+ *		     creating it
+ * \param lvb       buffer passed on to ldlm_cli_enqueue_fini()
+ * \param lvb_len   length of \a lvb
+ * \param lvb_type  LVB type used when creating the lock
+ * \param lockh     lock handle: input on replay, output otherwise
+ * \param async     non-zero: only prepare the request and return 0; the
+ *		     caller sends it (\a reqp must then be non-NULL)
+ *
+ * \retval 0 for a prepared async request
+ * \retval -ENOMEM if the lock or the request could not be allocated
+ * \retval otherwise the ldlm_cli_enqueue_fini() result, or the
+ *	    ptlrpc_queue_wait() result when the former is -ENOLCK
+ */
+int ldlm_cli_enqueue(struct obd_export *exp, struct ptlrpc_request **reqp,
+		     struct ldlm_enqueue_info *einfo,
+		     const struct ldlm_res_id *res_id,
+		     ldlm_policy_data_t const *policy, __u64 *flags,
+		     void *lvb, __u32 lvb_len, enum lvb_type lvb_type,
+		     struct lustre_handle *lockh, int async)
+{
+	struct ldlm_namespace *ns;
+	struct ldlm_lock      *lock;
+	struct ldlm_request   *body;
+	int		    is_replay = *flags & LDLM_FL_REPLAY;
+	int		    req_passed_in = 1;
+	int		    rc, err;
+	struct ptlrpc_request *req;
+	ENTRY;
+
+	LASSERT(exp != NULL);
+
+	ns = exp->exp_obd->obd_namespace;
+
+	/* If we're replaying this lock, just check some invariants.
+	 * If we're creating a new lock, get everything all setup nice. */
+	if (is_replay) {
+		lock = ldlm_handle2lock_long(lockh, 0);
+		LASSERT(lock != NULL);
+		LDLM_DEBUG(lock, "client-side enqueue START");
+		LASSERT(exp == lock->l_conn_export);
+	} else {
+		const struct ldlm_callback_suite cbs = {
+			.lcs_completion = einfo->ei_cb_cp,
+			.lcs_blocking   = einfo->ei_cb_bl,
+			.lcs_glimpse    = einfo->ei_cb_gl,
+			.lcs_weigh      = einfo->ei_cb_wg
+		};
+		lock = ldlm_lock_create(ns, res_id, einfo->ei_type,
+					einfo->ei_mode, &cbs, einfo->ei_cbdata,
+					lvb_len, lvb_type);
+		if (lock == NULL)
+			RETURN(-ENOMEM);
+		/* for the local lock, add the reference */
+		ldlm_lock_addref_internal(lock, einfo->ei_mode);
+		ldlm_lock2handle(lock, lockh);
+		if (policy != NULL) {
+			/* INODEBITS_INTEROP: If the server does not support
+			 * inodebits, we will request a plain lock in the
+			 * descriptor (ldlm_lock2desc() below) but use an
+			 * inodebits lock internally with both bits set.
+			 */
+			if (einfo->ei_type == LDLM_IBITS &&
+			    !(exp_connect_flags(exp) &
+			      OBD_CONNECT_IBITS))
+				lock->l_policy_data.l_inodebits.bits =
+					MDS_INODELOCK_LOOKUP |
+					MDS_INODELOCK_UPDATE;
+			else
+				lock->l_policy_data = *policy;
+		}
+
+		if (einfo->ei_type == LDLM_EXTENT) {
+			/* \a policy is optional in general (see the check
+			 * above), but an extent lock cannot be requested
+			 * without one: assert instead of dereferencing a
+			 * NULL pointer. */
+			LASSERT(policy != NULL);
+			lock->l_req_extent = policy->l_extent;
+		}
+		LDLM_DEBUG(lock, "client-side enqueue START, flags %llx\n",
+			   *flags);
+	}
+
+	lock->l_conn_export = exp;
+	lock->l_export = NULL;
+	lock->l_blocking_ast = einfo->ei_cb_bl;
+	lock->l_flags |= (*flags & LDLM_FL_NO_LRU);
+
+	/* lock not sent to server yet */
+
+	if (reqp == NULL || *reqp == NULL) {
+		/* No request supplied by the caller: allocate and pack one. */
+		req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp),
+						&RQF_LDLM_ENQUEUE,
+						LUSTRE_DLM_VERSION,
+						LDLM_ENQUEUE);
+		if (req == NULL) {
+			failed_lock_cleanup(ns, lock, einfo->ei_mode);
+			LDLM_LOCK_RELEASE(lock);
+			RETURN(-ENOMEM);
+		}
+		req_passed_in = 0;
+		if (reqp)
+			*reqp = req;
+	} else {
+		int len;
+
+		/* Caller-supplied request: its DLM request buffer must be
+		 * big enough for the body filled in below. */
+		req = *reqp;
+		len = req_capsule_get_size(&req->rq_pill, &RMF_DLM_REQ,
+					   RCL_CLIENT);
+		LASSERTF(len >= sizeof(*body), "buflen[%d] = %d, not %d\n",
+			 DLM_LOCKREQ_OFF, len, (int)sizeof(*body));
+	}
+
+	/* Dump lock data into the request buffer */
+	body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ);
+	ldlm_lock2desc(lock, &body->lock_desc);
+	body->lock_flags = ldlm_flags_to_wire(*flags);
+	body->lock_handle[0] = *lockh;
+
+	/* Continue as normal. */
+	if (!req_passed_in) {
+		if (lvb_len > 0)
+			req_capsule_extend(&req->rq_pill,
+					   &RQF_LDLM_ENQUEUE_LVB);
+		req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER,
+				     lvb_len);
+		ptlrpc_request_set_replen(req);
+	}
+
+	/*
+	 * Liblustre client doesn't get extent locks, except for O_APPEND case
+	 * where [0, OBD_OBJECT_EOF] lock is taken, or truncate, where
+	 * [i_size, OBD_OBJECT_EOF] lock is taken.
+	 */
+	LASSERT(ergo(LIBLUSTRE_CLIENT, einfo->ei_type != LDLM_EXTENT ||
+		     policy->l_extent.end == OBD_OBJECT_EOF));
+
+	if (async) {
+		/* The caller sends the request, so it must be able to
+		 * receive it through \a reqp. */
+		LASSERT(reqp != NULL);
+		RETURN(0);
+	}
+
+	LDLM_DEBUG(lock, "sending request");
+
+	rc = ptlrpc_queue_wait(req);
+
+	err = ldlm_cli_enqueue_fini(exp, req, einfo->ei_type, policy ? 1 : 0,
+				    einfo->ei_mode, flags, lvb, lvb_len,
+				    lockh, rc);
+
+	/* If ldlm_cli_enqueue_fini did not find the lock, we need to free
+	 * one reference that we took */
+	if (err == -ENOLCK)
+		LDLM_LOCK_RELEASE(lock);
+	else
+		rc = err;
+
+	/* A request allocated here is also finished here; the caller's
+	 * pointer is cleared so it does not keep a stale reference. */
+	if (!req_passed_in && req != NULL) {
+		ptlrpc_req_finished(req);
+		if (reqp)
+			*reqp = NULL;
+	}
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(ldlm_cli_enqueue);
+
+/**
+ * Convert \a lock to \a new_mode locally, without sending an RPC.
+ *
+ * Reaching this for a lock in a client namespace is treated as a bug
+ * (LBUG). One reference on \a lock is put before returning.
+ *
+ * \retval 0 if ldlm_lock_convert() returned a resource; that resource is
+ *	    then reprocessed
+ * \retval EDEADLOCK (positive) if ldlm_lock_convert() returned NULL
+ */
+static int ldlm_cli_convert_local(struct ldlm_lock *lock, int new_mode,
+				  __u32 *flags)
+{
+	struct ldlm_resource *converted;
+	int rc = EDEADLOCK;
+	ENTRY;
+
+	if (ns_is_client(ldlm_lock_to_ns(lock))) {
+		CERROR("Trying to cancel local lock\n");
+		LBUG();
+	}
+	LDLM_DEBUG(lock, "client-side local convert");
+
+	converted = ldlm_lock_convert(lock, new_mode, flags);
+	if (converted != NULL) {
+		ldlm_reprocess_all(converted);
+		rc = 0;
+	}
+
+	LDLM_DEBUG(lock, "client-side local convert handler END");
+	LDLM_LOCK_PUT(lock);
+	RETURN(rc);
+}
+
+/* FIXME: one of ldlm_cli_convert or the server side should reject attempted
+ * conversion of locks which are on the waiting or converting queue */
+/* Convert the lock behind \a lockh to \a new_mode.
+ *
+ * \a *flags is reset to 0 on entry. A lock without l_conn_export is handed
+ * to ldlm_cli_convert_local(). Otherwise an LDLM_CONVERT RPC is sent and
+ * waited for; after a successful reply the lock is converted locally, its
+ * resource is reprocessed and the completion AST (if any) is called.
+ *
+ * Returns the ptlrpc_queue_wait() result if it is not ELDLM_OK, -EPROTO if
+ * the reply has no RMF_DLM_REP buffer, the request status if non-zero, the
+ * completion AST result, -ENOMEM if no request could be allocated, or
+ * positive EDEADLOCK if ldlm_lock_convert() returned NULL.
+ *
+ * Caller of this code is supposed to take care of lock readers/writers
+ * accounting */
+int ldlm_cli_convert(struct lustre_handle *lockh, int new_mode, __u32 *flags)
+{
+	struct ldlm_request   *body;
+	struct ldlm_reply     *reply;
+	struct ldlm_lock      *lock;
+	struct ldlm_resource  *res;
+	struct ptlrpc_request *req;
+	int		    rc;
+	ENTRY;
+
+	lock = ldlm_handle2lock(lockh);
+	if (!lock) {
+		LBUG();
+		RETURN(-EINVAL);
+	}
+	*flags = 0;
+
+	if (lock->l_conn_export == NULL)	/* no export: convert locally */
+		RETURN(ldlm_cli_convert_local(lock, new_mode, flags));
+
+	LDLM_DEBUG(lock, "client-side convert");
+
+	req = ptlrpc_request_alloc_pack(class_exp2cliimp(lock->l_conn_export),
+					&RQF_LDLM_CONVERT, LUSTRE_DLM_VERSION,
+					LDLM_CONVERT);
+	if (req == NULL) {
+		LDLM_LOCK_PUT(lock);
+		RETURN(-ENOMEM);
+	}
+
+	body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ);
+	body->lock_handle[0] = lock->l_remote_handle;
+
+	body->lock_desc.l_req_mode = new_mode;
+	body->lock_flags = ldlm_flags_to_wire(*flags);
+
+
+	ptlrpc_request_set_replen(req);
+	rc = ptlrpc_queue_wait(req);
+	if (rc != ELDLM_OK)
+		GOTO(out, rc);
+
+	reply = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP);
+	if (reply == NULL)
+		GOTO(out, rc = -EPROTO);
+
+	if (req->rq_status)
+		GOTO(out, rc = req->rq_status);
+
+	res = ldlm_lock_convert(lock, new_mode, &reply->lock_flags);
+	if (res != NULL) {
+		ldlm_reprocess_all(res);
+		/* Go to sleep until the lock is granted. */
+		/* FIXME: or cancelled. */
+		if (lock->l_completion_ast) {
+			rc = lock->l_completion_ast(lock, LDLM_FL_WAIT_NOREPROC,
+						    NULL);
+			if (rc)
+				GOTO(out, rc);
+		}
+	} else {
+		rc = EDEADLOCK;	/* NOTE(review): positive errno, unlike the
+				 * negative codes above -- confirm callers
+				 * expect this */
+	}
+	EXIT;
+ out:
+	LDLM_LOCK_PUT(lock);
+	ptlrpc_req_finished(req);
+	return rc;
+}
+EXPORT_SYMBOL(ldlm_cli_convert);
+
+/**
+ * Cancel \a lock locally.
+ *
+ * A lock without l_conn_export is cancelled as a server-side lock and its
+ * resource is reprocessed; finding such a lock in a client namespace is a
+ * bug (LBUG). A lock with l_conn_export is marked LDLM_FL_CBPENDING, its
+ * cancel callback is run, and it is then cancelled.
+ *
+ * \retval LDLM_FL_LOCAL_ONLY if there is no need for a CANCEL RPC to the
+ *	    server (server-side lock, or the lock had LDLM_FL_LOCAL_ONLY or
+ *	    LDLM_FL_CANCEL_ON_BLOCK set)
+ * \retval LDLM_FL_BL_AST if the lock has LDLM_FL_BL_AST set, i.e. a
+ *	    separate CANCEL RPC is needed
+ * \retval LDLM_FL_CANCELING otherwise
+ */
+static __u64 ldlm_cli_cancel_local(struct ldlm_lock *lock)
+{
+	__u64 rc = LDLM_FL_LOCAL_ONLY;
+	bool local_only;
+	ENTRY;
+
+	if (!lock->l_conn_export) {
+		if (ns_is_client(ldlm_lock_to_ns(lock))) {
+			LDLM_ERROR(lock, "Trying to cancel local lock");
+			LBUG();
+		}
+		LDLM_DEBUG(lock, "server-side local cancel");
+		ldlm_lock_cancel(lock);
+		ldlm_reprocess_all(lock->l_resource);
+		RETURN(rc);
+	}
+
+	LDLM_DEBUG(lock, "client-side cancel");
+
+	lock_res_and_lock(lock);
+	/* Set this flag to prevent others from getting new references */
+	lock->l_flags |= LDLM_FL_CBPENDING;
+	local_only = !!(lock->l_flags &
+			(LDLM_FL_LOCAL_ONLY | LDLM_FL_CANCEL_ON_BLOCK));
+	ldlm_cancel_callback(lock);
+	if (lock->l_flags & LDLM_FL_BL_AST)
+		rc = LDLM_FL_BL_AST;
+	else
+		rc = LDLM_FL_CANCELING;
+	unlock_res_and_lock(lock);
+
+	if (local_only) {
+		CDEBUG(D_DLMTRACE, "not sending request (at caller's "
+		       "instruction)\n");
+		rc = LDLM_FL_LOCAL_ONLY;
+	}
+	ldlm_lock_cancel(lock);
+
+	RETURN(rc);
+}
+
+/**
+ * Pack the remote handles of up to \a count locks from list \a head (linked
+ * through l_bl_ast) into the ldlm_request buffer of request \a req,
+ * appending after the handles already counted in its lock_count.
+ *
+ * The buffer must have room for all of them; this is asserted.
+ */
+static void ldlm_cancel_pack(struct ptlrpc_request *req,
+			     struct list_head *head, int count)
+{
+	struct ldlm_request *dlm;
+	struct ldlm_lock *lock;
+	int packed = 0;
+	int max;
+	ENTRY;
+
+	dlm = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ);
+	LASSERT(dlm != NULL);
+
+	/* Check the room in the request buffer. */
+	max = req_capsule_get_size(&req->rq_pill, &RMF_DLM_REQ, RCL_CLIENT) -
+		sizeof(struct ldlm_request);
+	max /= sizeof(struct lustre_handle);
+	max += LDLM_LOCKREQ_HANDLES;
+	LASSERT(max >= dlm->lock_count + count);
+
+	/* XXX: it would be better to pack lock handles grouped by resource.
+	 * so that the server cancel would call filter_lvbo_update() less
+	 * frequently. */
+	list_for_each_entry(lock, head, l_bl_ast) {
+		/* Stop once the requested number of handles is packed. */
+		if (packed == count)
+			break;
+
+		LASSERT(lock->l_conn_export);
+		/* Pack the lock handle to the given request buffer. */
+		LDLM_DEBUG(lock, "packing");
+		dlm->lock_handle[dlm->lock_count] = lock->l_remote_handle;
+		dlm->lock_count++;
+		packed++;
+	}
+	CDEBUG(D_DLMTRACE, "%d locks packed\n", packed);
+	EXIT;
+}
+
+/**
+ * Prepare and send a batched cancel RPC. It will include \a count lock
+ * handles of locks given in \a cancels list.
+ *
+ * \a count is first clamped to the number of handles that fit into one
+ * RQF_LDLM_CANCEL request. With LCF_ASYNC in \a flags the request is handed
+ * to ptlrpcd and not waited for. Otherwise it is sent synchronously, and
+ * re-sent for as long as it fails with -ETIMEDOUT while the import
+ * generation is unchanged.
+ *
+ * \retval the number of handles sent, if any were sent (a positive ESTALE
+ *	    result from the RPC is logged and treated as success)
+ * \retval \a count, without sending anything, when the
+ *	    OBD_FAIL_LDLM_CANCEL_RACE fail point fires or the import is
+ *	    missing or invalid
+ * \retval -ENOMEM, the ptlrpc_request_pack() error or the RPC error
+ *	    otherwise */
+int ldlm_cli_cancel_req(struct obd_export *exp, struct list_head *cancels,
+			int count, ldlm_cancel_flags_t flags)
+{
+	struct ptlrpc_request *req = NULL;
+	struct obd_import *imp;
+	int free, sent = 0;
+	int rc = 0;
+	ENTRY;
+
+	LASSERT(exp != NULL);
+	LASSERT(count > 0);
+
+	CFS_FAIL_TIMEOUT(OBD_FAIL_LDLM_PAUSE_CANCEL, cfs_fail_val);
+
+	if (CFS_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_RACE))
+		RETURN(count);
+
+	free = ldlm_format_handles_avail(class_exp2cliimp(exp),
+					 &RQF_LDLM_CANCEL, RCL_CLIENT, 0);
+	if (count > free)
+		count = free;
+
+	while (1) {
+		imp = class_exp2cliimp(exp);
+		if (imp == NULL || imp->imp_invalid) {
+			CDEBUG(D_DLMTRACE,
+			       "skipping cancel on invalid import %p\n", imp);
+			RETURN(count);
+		}
+
+		req = ptlrpc_request_alloc(imp, &RQF_LDLM_CANCEL);
+		if (req == NULL)
+			GOTO(out, rc = -ENOMEM);
+
+		req_capsule_filled_sizes(&req->rq_pill, RCL_CLIENT);
+		req_capsule_set_size(&req->rq_pill, &RMF_DLM_REQ, RCL_CLIENT,
+				     ldlm_request_bufsize(count, LDLM_CANCEL));
+
+		rc = ptlrpc_request_pack(req, LUSTRE_DLM_VERSION, LDLM_CANCEL);
+		if (rc) {
+			ptlrpc_request_free(req);
+			GOTO(out, rc);
+		}
+
+		req->rq_request_portal = LDLM_CANCEL_REQUEST_PORTAL;
+		req->rq_reply_portal = LDLM_CANCEL_REPLY_PORTAL;
+		ptlrpc_at_set_req_timeout(req);
+
+		ldlm_cancel_pack(req, cancels, count);
+
+		ptlrpc_request_set_replen(req);
+		if (flags & LCF_ASYNC) {
+			ptlrpcd_add_req(req, PDL_POLICY_LOCAL, -1);
+			sent = count;
+			GOTO(out, 0);	/* skips ptlrpc_req_finished() below */
+		} else {
+			rc = ptlrpc_queue_wait(req);
+		}
+		if (rc == ESTALE) {
+			CDEBUG(D_DLMTRACE, "client/server (nid %s) "
+			       "out of sync -- not fatal\n",
+			       libcfs_nid2str(req->rq_import->
+					      imp_connection->c_peer.nid));
+			rc = 0;
+		} else if (rc == -ETIMEDOUT && /* check there was no reconnect*/
+			   req->rq_import_generation == imp->imp_generation) {
+			ptlrpc_req_finished(req);
+			continue;	/* retry with a freshly built request */
+		} else if (rc != ELDLM_OK) {
+			/* -ESHUTDOWN is common on umount */
+			CDEBUG_LIMIT(rc == -ESHUTDOWN ? D_DLMTRACE : D_ERROR,
+				     "Got rc %d from cancel RPC: "
+				     "canceling anyway\n", rc);
+			break;
+		}
+		sent = count;
+		break;
+	}
+
+	ptlrpc_req_finished(req);
+	EXIT;
+out:
+	return sent ? sent : rc;
+}
+EXPORT_SYMBOL(ldlm_cli_cancel_req);
+
+/**
+ * Return the lock pool of the namespace of the obd device behind import
+ * \a imp. \a imp must not be NULL (asserted).
+ */
+static inline struct ldlm_pool *ldlm_imp2pl(struct obd_import *imp)
+{
+	struct ldlm_namespace *ns;
+
+	LASSERT(imp != NULL);
+	ns = imp->imp_obd->obd_namespace;
+
+	return &ns->ns_pool;
+}
+
+/**
+ * Update client's OBD pool related fields with new SLV and Limit taken from
+ * the reply message of \a req.
+ *
+ * Nothing is updated when the request has no import, the import has no obd
+ * device, the import is not connected with LRU resize, or the reply carries
+ * a zero SLV or a zero limit.
+ *
+ * \retval 0 always
+ */
+int ldlm_cli_update_pool(struct ptlrpc_request *req)
+{
+	struct obd_import *imp = req->rq_import;
+	struct obd_device *obd;
+	__u64 new_slv;
+	__u32 new_limit;
+	ENTRY;
+
+	/* Do nothing for corner cases. */
+	if (unlikely(imp == NULL || imp->imp_obd == NULL ||
+		     !imp_connect_lru_resize(imp)))
+		RETURN(0);
+
+	/* In some cases RPC may contain SLV and limit zeroed out. This
+	 * is the case when server does not support LRU resize feature.
+	 * This is also possible in some recovery cases when server-side
+	 * reqs have no reference to the OBD export and thus access to
+	 * server-side namespace is not possible. */
+	if (lustre_msg_get_slv(req->rq_repmsg) == 0 ||
+	    lustre_msg_get_limit(req->rq_repmsg) == 0) {
+		DEBUG_REQ(D_HA, req, "Zero SLV or Limit found "
+			  "(SLV: "LPU64", Limit: %u)",
+			  lustre_msg_get_slv(req->rq_repmsg),
+			  lustre_msg_get_limit(req->rq_repmsg));
+		RETURN(0);
+	}
+
+	new_limit = lustre_msg_get_limit(req->rq_repmsg);
+	new_slv = lustre_msg_get_slv(req->rq_repmsg);
+	obd = imp->imp_obd;
+
+	/* Set new SLV and limit in OBD fields to make them accessible
+	 * to the pool thread. We do not access obd_namespace and pool
+	 * directly here as there is no reliable way to make sure that
+	 * they are still alive at cleanup time. Evil races are possible
+	 * which may cause Oops at that time. */
+	write_lock(&obd->obd_pool_lock);
+	obd->obd_pool_slv = new_slv;
+	obd->obd_pool_limit = new_limit;
+	write_unlock(&obd->obd_pool_lock);
+
+	RETURN(0);
+}
+EXPORT_SYMBOL(ldlm_cli_update_pool);
+
+/**
+ * Client side lock cancel.
+ *
+ * Lock must not have any readers or writers by this time.
+ *
+ * The lock behind \a lockh is cancelled locally. Unless that turns out to
+ * be sufficient, it is put on a cancel list which, when the export
+ * supports cancel sets, is topped up with unused LRU locks, and the whole
+ * list is passed to ldlm_cli_cancel_list() with \a cancel_flags.
+ *
+ * \retval 0 always, including when the handle no longer resolves to a lock
+ */
+int ldlm_cli_cancel(struct lustre_handle *lockh,
+		    ldlm_cancel_flags_t cancel_flags)
+{
+	struct ldlm_lock *lock;
+	struct obd_export *exp;
+	LIST_HEAD(cancels);
+	int count = 1;
+	__u64 how;
+	ENTRY;
+
+	/* concurrent cancels on the same handle can happen */
+	lock = ldlm_handle2lock_long(lockh, LDLM_FL_CANCELING);
+	if (lock == NULL) {
+		LDLM_DEBUG_NOLOCK("lock is already being destroyed\n");
+		RETURN(0);
+	}
+
+	how = ldlm_cli_cancel_local(lock);
+	if (how == LDLM_FL_LOCAL_ONLY) {
+		LDLM_LOCK_RELEASE(lock);
+		RETURN(0);
+	}
+
+	/* Even if the lock is marked as LDLM_FL_BL_AST, this is a LDLM_CANCEL
+	 * RPC which goes to canceld portal, so we can cancel other LRU locks
+	 * here and send them all as one LDLM_CANCEL RPC. */
+	LASSERT(list_empty(&lock->l_bl_ast));
+	list_add(&lock->l_bl_ast, &cancels);
+
+	exp = lock->l_conn_export;
+	if (exp_connect_cancelset(exp)) {
+		struct ldlm_namespace *ns;
+		int avail, lru_flags;
+
+		avail = ldlm_format_handles_avail(class_exp2cliimp(exp),
+						  &RQF_LDLM_CANCEL,
+						  RCL_CLIENT, 0);
+		LASSERT(avail > 0);
+
+		ns = ldlm_lock_to_ns(lock);
+		if (ns_connect_lru_resize(ns))
+			lru_flags = LDLM_CANCEL_LRUR;
+		else
+			lru_flags = LDLM_CANCEL_AGED;
+
+		/* One slot is already taken by the lock added above. */
+		count += ldlm_cancel_lru_local(ns, &cancels, 0, avail - 1,
+					       LCF_BL_AST, lru_flags);
+	}
+	ldlm_cli_cancel_list(&cancels, count, NULL, cancel_flags);
+	RETURN(0);
+}
+EXPORT_SYMBOL(ldlm_cli_cancel);
+
+/**
+ * Locally cancel up to \a count locks in list \a cancels (linked through
+ * l_bl_ast).
+ *
+ * With LCF_LOCAL in \a flags each lock is simply cancelled; otherwise
+ * ldlm_cli_cancel_local() decides what the lock needs. Locks that need no
+ * CANCEL RPC are removed from \a cancels and released. Locks that need a
+ * separate CANCEL RPC (unless LCF_BL_AST is set in \a flags) are moved to
+ * a private list and passed to ldlm_cli_cancel_list() at the end.
+ *
+ * \retval \a count minus the number of locks removed from \a cancels
+ */
+int ldlm_cli_cancel_list_local(struct list_head *cancels, int count,
+			       ldlm_cancel_flags_t flags)
+{
+	struct ldlm_lock *lock, *tmp;
+	LIST_HEAD(separate);
+	int budget = count;
+	int nr_separate = 0;
+	__u64 how;
+
+	list_for_each_entry_safe(lock, tmp, cancels, l_bl_ast) {
+		if (budget == 0)
+			break;
+		budget--;
+
+		if (flags & LCF_LOCAL) {
+			how = LDLM_FL_LOCAL_ONLY;
+			ldlm_lock_cancel(lock);
+		} else {
+			how = ldlm_cli_cancel_local(lock);
+		}
+
+		/* Until we have compound requests and can send LDLM_CANCEL
+		 * requests batched with generic RPCs, we need to send cancels
+		 * with the LDLM_FL_BL_AST flag in a separate RPC from
+		 * the one being generated now. */
+		if (how == LDLM_FL_BL_AST && !(flags & LCF_BL_AST)) {
+			LDLM_DEBUG(lock, "Cancel lock separately");
+			list_del_init(&lock->l_bl_ast);
+			list_add(&lock->l_bl_ast, &separate);
+			nr_separate++;
+			continue;
+		}
+
+		if (how == LDLM_FL_LOCAL_ONLY) {
+			/* CANCEL RPC should not be sent to server. */
+			list_del_init(&lock->l_bl_ast);
+			LDLM_LOCK_RELEASE(lock);
+			count--;
+		}
+	}
+
+	if (nr_separate > 0) {
+		count -= nr_separate;
+		ldlm_cli_cancel_list(&separate, nr_separate, NULL, 0);
+	}
+
+	RETURN(count);
+}
+EXPORT_SYMBOL(ldlm_cli_cancel_list_local);
+
+/**
+ * Cancel as many locks as possible w/o sending any RPCs (e.g. to write back
+ * dirty data, to close a file, ...) or waiting for any RPCs in-flight (e.g.
+ * readahead requests, ...)
+ *
+ * \retval LDLM_POLICY_CANCEL_LOCK for an extent or inodebits lock that the
+ *	    namespace's ns_cancel_for_recovery callback accepts
+ * \retval LDLM_POLICY_SKIP_LOCK otherwise; the lock is then also marked
+ *	    LDLM_FL_SKIPPED
+ */
+static ldlm_policy_res_t ldlm_cancel_no_wait_policy(struct ldlm_namespace *ns,
+						    struct ldlm_lock *lock,
+						    int unused, int added,
+						    int count)
+{
+	ldlm_cancel_for_recovery cb = ns->ns_cancel_for_recovery;
+	ldlm_policy_res_t verdict = LDLM_POLICY_SKIP_LOCK;
+	struct ldlm_resource *res;
+
+	lock_res_and_lock(lock);
+
+	/* don't check added & count since we want to process all locks
+	 * from unused list */
+	res = lock->l_resource;
+	if ((res->lr_type == LDLM_EXTENT || res->lr_type == LDLM_IBITS) &&
+	    cb && cb(lock))
+		verdict = LDLM_POLICY_CANCEL_LOCK;
+	else
+		lock->l_flags |= LDLM_FL_SKIPPED;
+
+	unlock_res_and_lock(lock);
+	RETURN(verdict);
+}
+
+/**
+ * Callback function for LRU-resize policy. Decides whether to keep
+ * \a lock in LRU for current \a LRU size \a unused, added in current
+ * scan \a added and number of locks to be preferably canceled \a count.
+ *
+ * The lock volume is computed as pool LVF * seconds since the lock was
+ * last used * \a unused, and is also published to the pool as the CLV.
+ *
+ * \retval LDLM_POLICY_KEEP_LOCK keep lock in LRU and stop scanning: a
+ *	    non-zero \a count has been reached, no SLV is known yet
+ *	    (SLV == 0), or the lock volume is below the SLV
+ *
+ * \retval LDLM_POLICY_CANCEL_LOCK cancel lock from LRU
+ */
+static ldlm_policy_res_t ldlm_cancel_lrur_policy(struct ldlm_namespace *ns,
+						 struct ldlm_lock *lock,
+						 int unused, int added,
+						 int count)
+{
+	cfs_time_t now = cfs_time_current();
+	struct ldlm_pool *pool = &ns->ns_pool;
+	__u64 slv, lvf, lv;
+	cfs_time_t age;
+
+	/* Stop LRU processing when we reach past @count or have checked all
+	 * locks in LRU. */
+	if (count != 0 && added >= count)
+		return LDLM_POLICY_KEEP_LOCK;
+
+	slv = ldlm_pool_get_slv(pool);
+	lvf = ldlm_pool_get_lvf(pool);
+	age = cfs_duration_sec(cfs_time_sub(now, lock->l_last_used));
+	lv = lvf * age * unused;
+
+	/* Inform pool about current CLV to see it via proc. */
+	ldlm_pool_set_clv(pool, lv);
+
+	/* Stop when SLV is not yet come from server or lv is smaller than
+	 * it is. */
+	if (slv == 0 || lv < slv)
+		return LDLM_POLICY_KEEP_LOCK;
+
+	return LDLM_POLICY_CANCEL_LOCK;
+}
+
+/**
+ * Callback function for proc used policy. Makes decision whether to keep
+ * \a lock in LRU for current \a LRU size \a unused, added in current scan \a
+ * added and number of locks to be preferably canceled \a count.
+ *
+ * Only \a added and \a count are looked at.
+ *
+ * \retval LDLM_POLICY_KEEP_LOCK keep lock in LRU and stop scanning
+ *	    (\a added >= \a count)
+ *
+ * \retval LDLM_POLICY_CANCEL_LOCK cancel lock from LRU
+ */
+static ldlm_policy_res_t ldlm_cancel_passed_policy(struct ldlm_namespace *ns,
+						   struct ldlm_lock *lock,
+						   int unused, int added,
+						   int count)
+{
+	/* Stop LRU processing when we reach past @count or have checked all
+	 * locks in LRU. */
+	if (added >= count)
+		return LDLM_POLICY_KEEP_LOCK;
+
+	return LDLM_POLICY_CANCEL_LOCK;
+}
+
+/**
+ * Callback function for aged policy. Makes decision whether to keep \a lock in
+ * LRU for current LRU size \a unused, added in current scan \a added and
+ * number of locks to be preferably canceled \a count.
+ *
+ * \retval LDLM_POLICY_KEEP_LOCK keep lock in LRU and stop scanning:
+ *	    \a count has been reached and the lock is still young, i.e.
+ *	    its last use plus the namespace's ns_max_age lies in the future
+ *
+ * \retval LDLM_POLICY_CANCEL_LOCK cancel lock from LRU
+ */
+static ldlm_policy_res_t ldlm_cancel_aged_policy(struct ldlm_namespace *ns,
+						 struct ldlm_lock *lock,
+						 int unused, int added,
+						 int count)
+{
+	/* Below the requested count every lock is cancelled, whatever
+	 * its age. */
+	if (added < count)
+		return LDLM_POLICY_CANCEL_LOCK;
+
+	/* Stop LRU processing if young lock is found and we reach past count */
+	if (cfs_time_before(cfs_time_current(),
+			    cfs_time_add(lock->l_last_used, ns->ns_max_age)))
+		return LDLM_POLICY_KEEP_LOCK;
+
+	return LDLM_POLICY_CANCEL_LOCK;
+}
+
+/**
+ * Callback function for default policy. Makes decision whether to keep \a lock
+ * in LRU for current LRU size \a unused, added in current scan \a added and
+ * number of locks to be preferably canceled \a count.
+ *
+ * Only \a added and \a count are looked at.
+ *
+ * \retval LDLM_POLICY_KEEP_LOCK keep lock in LRU and stop scanning
+ *	    (\a added >= \a count)
+ *
+ * \retval LDLM_POLICY_CANCEL_LOCK cancel lock from LRU
+ */
+static ldlm_policy_res_t ldlm_cancel_default_policy(struct ldlm_namespace *ns,
+						    struct ldlm_lock *lock,
+						    int unused, int added,
+						    int count)
+{
+	/* Stop LRU processing when we reach past count or have checked all
+	 * locks in LRU. */
+	if (added >= count)
+		return LDLM_POLICY_KEEP_LOCK;
+
+	return LDLM_POLICY_CANCEL_LOCK;
+}
+
+/* Signature shared by the LRU cancel policy callbacks above:
+ * (namespace, lock, unused, added, count). */
+typedef ldlm_policy_res_t (*ldlm_cancel_lru_policy_t)(struct ldlm_namespace *,
+						      struct ldlm_lock *, int,
+						      int, int);
+
+/**
+ * Pick the LRU cancel policy callback for namespace \a ns and the
+ * LDLM_CANCEL_* bits in \a flags.
+ *
+ * LDLM_CANCEL_NO_WAIT always wins. With LRU resize enabled on \a ns,
+ * SHRINK, LRUR and PASSED are checked in that order; without it only AGED
+ * is recognised. Anything else gets the default policy.
+ */
+static ldlm_cancel_lru_policy_t
+ldlm_cancel_lru_policy(struct ldlm_namespace *ns, int flags)
+{
+	if (flags & LDLM_CANCEL_NO_WAIT)
+		return ldlm_cancel_no_wait_policy;
+
+	if (!ns_connect_lru_resize(ns)) {
+		if (flags & LDLM_CANCEL_AGED)
+			return ldlm_cancel_aged_policy;
+		return ldlm_cancel_default_policy;
+	}
+
+	/* We kill passed number of old locks. */
+	if (flags & LDLM_CANCEL_SHRINK)
+		return ldlm_cancel_passed_policy;
+	if (flags & LDLM_CANCEL_LRUR)
+		return ldlm_cancel_lrur_policy;
+	if (flags & LDLM_CANCEL_PASSED)
+		return ldlm_cancel_passed_policy;
+
+	return ldlm_cancel_default_policy;
+}
+
+/**
+ * Scan the namespace LRU and collect locks to be cancelled:
+ * - Free space in LRU for \a count new locks,
+ *   redundant unused locks are canceled locally;
+ * - also cancel locally unused aged locks;
+ * - do not cancel more than \a max locks (0 means no such limit);
+ * - GET the found locks and add them into the \a cancels list.
+ *
+ * Returns the number of locks added to \a cancels.
+ *
+ * A client lock can be added to the l_bl_ast list only when it is
+ * marked LDLM_FL_CANCELING. Otherwise, somebody is already doing
+ * CANCEL.  There are the following use cases:
+ * ldlm_cancel_resource_local(), ldlm_cancel_lru_local() and
+ * ldlm_cli_cancel(), which check and set this flag properly. As any
+ * attempt to cancel a lock rely on this flag, l_bl_ast list is accessed
+ * later without any special locking.
+ *
+ * Calling policies for enabled LRU resize:
+ * ----------------------------------------
+ * flags & LDLM_CANCEL_LRUR - use LRU resize policy (SLV from server) to
+ *			    cancel not more than \a count locks;
+ *
+ * flags & LDLM_CANCEL_PASSED - cancel \a count number of old locks (located at
+ *			      the beginning of LRU list);
+ *
+ * flags & LDLM_CANCEL_SHRINK - cancel not more than \a count locks according to
+ *			      memory pressure policy function;
+ *
+ * flags & LDLM_CANCEL_AGED - cancel \a count locks according to "aged policy".
+ *
+ * flags & LDLM_CANCEL_NO_WAIT - cancel as many unused locks as possible
+ *			       (typically before replaying locks) w/o
+ *			       sending any RPCs or waiting for any
+ *			       outstanding RPC to complete.
+ *
+ * Without LRU resize, \a count is first increased by the number of unused
+ * locks in excess of ns_max_unused.
+ */
+static int ldlm_prepare_lru_list(struct ldlm_namespace *ns, struct list_head *cancels,
+				 int count, int max, int flags)
+{
+	ldlm_cancel_lru_policy_t pf;
+	struct ldlm_lock *lock, *next;
+	int added = 0, unused, remained;
+	ENTRY;
+
+	spin_lock(&ns->ns_lock);
+	unused = ns->ns_nr_unused;
+	remained = unused;
+
+	if (!ns_connect_lru_resize(ns))
+		count += unused - ns->ns_max_unused;
+
+	pf = ldlm_cancel_lru_policy(ns, flags);
+	LASSERT(pf != NULL);
+
+	while (!list_empty(&ns->ns_unused_list)) {
+		ldlm_policy_res_t result;
+
+		/* all unused locks: never make more passes than there were
+		 * unused locks when the scan started */
+		if (remained-- <= 0)
+			break;
+
+		/* For any flags, stop scanning if @max is reached. */
+		if (max && added >= max)
+			break;
+
+		list_for_each_entry_safe(lock, next, &ns->ns_unused_list,
+					     l_lru) {
+			/* No locks which got blocking requests. */
+			LASSERT(!(lock->l_flags & LDLM_FL_BL_AST));
+
+			if (flags & LDLM_CANCEL_NO_WAIT &&
+			    lock->l_flags & LDLM_FL_SKIPPED)
+				/* already processed */
+				continue;
+
+			/* Stop at the first lock nobody is cancelling yet:
+			 * that is the candidate. For the locks before it,
+			 * somebody is already doing CANCEL. No need for such
+			 * a lock in LRU, do not traverse it again. */
+			if (!(lock->l_flags & LDLM_FL_CANCELING))
+				break;
+
+			ldlm_lock_remove_from_lru_nolock(lock);
+		}
+		if (&lock->l_lru == &ns->ns_unused_list)
+			break;
+
+		LDLM_LOCK_GET(lock);
+		spin_unlock(&ns->ns_lock);
+		lu_ref_add(&lock->l_reference, __FUNCTION__, current);
+
+		/* Pass the lock through the policy filter and see if it
+		 * should stay in LRU.
+		 *
+		 * Even for shrinker policy we stop scanning if
+		 * we find a lock that should stay in the cache.
+		 * We should take into account lock age anyway
+		 * as a new lock is a valuable resource even if
+		 * it has a low weight.
+		 *
+		 * That is, for shrinker policy we drop only
+		 * old locks, but additionally choose them by
+		 * their weight. Big extent locks will stay in
+		 * the cache. */
+		result = pf(ns, lock, unused, added, count);
+		if (result == LDLM_POLICY_KEEP_LOCK) {
+			lu_ref_del(&lock->l_reference,
+				   __FUNCTION__, current);
+			LDLM_LOCK_RELEASE(lock);
+			spin_lock(&ns->ns_lock);
+			break;
+		}
+		if (result == LDLM_POLICY_SKIP_LOCK) {
+			lu_ref_del(&lock->l_reference,
+				   __func__, current);
+			LDLM_LOCK_RELEASE(lock);
+			spin_lock(&ns->ns_lock);
+			continue;
+		}
+
+		lock_res_and_lock(lock);
+		/* Check flags again under the lock. */
+		if ((lock->l_flags & LDLM_FL_CANCELING) ||
+		    (ldlm_lock_remove_from_lru(lock) == 0)) {
+			/* Another thread is removing lock from LRU, or
+			 * somebody is already doing CANCEL, or there
+			 * is a blocking request which will send cancel
+			 * by itself, or the lock is no longer unused. */
+			unlock_res_and_lock(lock);
+			lu_ref_del(&lock->l_reference,
+				   __FUNCTION__, current);
+			LDLM_LOCK_RELEASE(lock);
+			spin_lock(&ns->ns_lock);
+			continue;
+		}
+		LASSERT(!lock->l_readers && !lock->l_writers);
+
+		/* If we have chosen to cancel this lock voluntarily, we
+		 * better send cancel notification to server, so that it
+		 * frees appropriate state. This might lead to a race
+		 * where while we are doing cancel here, server is also
+		 * silently cancelling this lock. */
+		lock->l_flags &= ~LDLM_FL_CANCEL_ON_BLOCK;
+
+		/* Setting the CBPENDING flag is a little misleading,
+		 * but prevents an important race; namely, once
+		 * CBPENDING is set, the lock can accumulate no more
+		 * readers/writers. Since readers and writers are
+		 * already zero here, ldlm_lock_decref() won't see
+		 * this flag and call l_blocking_ast */
+		lock->l_flags |= LDLM_FL_CBPENDING | LDLM_FL_CANCELING;
+
+		/* We can't re-add to l_lru as it confuses the
+		 * refcounting in ldlm_lock_remove_from_lru() if an AST
+		 * arrives after we drop lr_lock below. We use l_bl_ast
+		 * and can't use l_pending_chain as it is used both on
+		 * server and client nevertheless bug 5666 says it is
+		 * used only on server */
+		LASSERT(list_empty(&lock->l_bl_ast));
+		list_add(&lock->l_bl_ast, cancels);
+		unlock_res_and_lock(lock);
+		lu_ref_del(&lock->l_reference, __FUNCTION__, current);
+		spin_lock(&ns->ns_lock);
+		added++;
+		unused--;
+	}
+	spin_unlock(&ns->ns_lock);
+	RETURN(added);
+}
+
+/**
+ * Collect locks from the LRU of \a ns into \a cancels with
+ * ldlm_prepare_lru_list() and cancel the collected locks locally with
+ * ldlm_cli_cancel_list_local().
+ *
+ * \retval the ldlm_prepare_lru_list() result if it is zero or negative
+ * \retval the ldlm_cli_cancel_list_local() result otherwise
+ */
+int ldlm_cancel_lru_local(struct ldlm_namespace *ns, struct list_head *cancels,
+			  int count, int max, ldlm_cancel_flags_t cancel_flags,
+			  int flags)
+{
+	int collected = ldlm_prepare_lru_list(ns, cancels, count, max, flags);
+
+	if (collected > 0)
+		collected = ldlm_cli_cancel_list_local(cancels, collected,
+						       cancel_flags);
+
+	return collected;
+}
+
+/**
+ * Cancel at least \a nr locks from given namespace LRU.
+ *
+ * The locks are only collected here (with no upper limit on their number);
+ * the list is then handed to ldlm_bl_to_thread_list() together with
+ * \a cancel_flags, which does the actual cancelling.
+ * NOTE(review): whether the blocking callback runs in a separate thread or
+ * before this function returns presumably depends on LCF_ASYNC being set
+ * in \a cancel_flags -- confirm against ldlm_bl_to_thread_list().
+ *
+ * \retval number of locks collected if the hand-over succeeded
+ * \retval 0 otherwise
+ */
+int ldlm_cancel_lru(struct ldlm_namespace *ns, int nr,
+		    ldlm_cancel_flags_t cancel_flags,
+		    int flags)
+{
+	LIST_HEAD(cancels);
+	int collected;
+	ENTRY;
+
+	/* Just prepare the list of locks, do not actually cancel them yet.
+	 * Locks are cancelled later in a separate thread. */
+	collected = ldlm_prepare_lru_list(ns, &cancels, nr, 0, flags);
+
+	if (ldlm_bl_to_thread_list(ns, NULL, &cancels, collected,
+				   cancel_flags) != 0)
+		RETURN(0);
+
+	RETURN(collected);
+}
+
+/**
+ * Find and cancel locally unused locks found on resource, matched to the
+ * given policy, mode. GET the found locks and add them into the \a cancels
+ * list.
+ *
+ * Only granted locks of \a res are considered. A lock is skipped when:
+ * - \a opaque is given and differs from the lock's l_ast_data (logged);
+ * - it has readers or writers;
+ * - it already has LDLM_FL_BL_AST or LDLM_FL_CANCELING set;
+ * - its granted mode is compatible with \a mode;
+ * - \a policy is given, the resource is LDLM_IBITS, and the lock shares
+ *   no inodebits with \a policy.
+ *
+ * Selected locks get LDLM_FL_CBPENDING, LDLM_FL_CANCELING and
+ * \a lock_flags set.
+ *
+ * \retval result of ldlm_cli_cancel_list_local() on the selected locks
+ */
+int ldlm_cancel_resource_local(struct ldlm_resource *res,
+			       struct list_head *cancels,
+			       ldlm_policy_data_t *policy,
+			       ldlm_mode_t mode, int lock_flags,
+			       ldlm_cancel_flags_t cancel_flags, void *opaque)
+{
+	struct ldlm_lock *lock;
+	int selected = 0;
+	ENTRY;
+
+	lock_res(res);
+	list_for_each_entry(lock, &res->lr_granted, l_res_link) {
+		if (opaque != NULL && lock->l_ast_data != opaque) {
+			LDLM_ERROR(lock, "data %p doesn't match opaque %p",
+				   lock->l_ast_data, opaque);
+			continue;
+		}
+
+		/* Still in use. */
+		if (lock->l_readers || lock->l_writers)
+			continue;
+
+		/* If somebody is already doing CANCEL, or blocking AST came,
+		 * skip this lock. */
+		if (lock->l_flags & (LDLM_FL_BL_AST | LDLM_FL_CANCELING))
+			continue;
+
+		if (lockmode_compat(lock->l_granted_mode, mode))
+			continue;
+
+		/* If policy is given and this is IBITS lock, add to list only
+		 * those locks that match by policy. */
+		if (policy != NULL &&
+		    lock->l_resource->lr_type == LDLM_IBITS &&
+		    (lock->l_policy_data.l_inodebits.bits &
+		     policy->l_inodebits.bits) == 0)
+			continue;
+
+		/* See CBPENDING comment in ldlm_cancel_lru */
+		lock->l_flags |= LDLM_FL_CBPENDING | LDLM_FL_CANCELING |
+				 lock_flags;
+
+		LASSERT(list_empty(&lock->l_bl_ast));
+		list_add(&lock->l_bl_ast, cancels);
+		LDLM_LOCK_GET(lock);
+		selected++;
+	}
+	unlock_res(res);
+
+	RETURN(ldlm_cli_cancel_list_local(cancels, selected, cancel_flags));
+}
+EXPORT_SYMBOL(ldlm_cancel_resource_local);
+
+/**
+ * Cancel client-side locks from a list and send/prepare cancel RPCs to the
+ * server.
+ * If \a req is NULL, send CANCEL request to server with handles of locks
+ * in the \a cancels. If EARLY_CANCEL is not supported, send CANCEL requests
+ * separately per lock.
+ * If \a req is not NULL, put handles of locks in \a cancels into the request
+ * buffer with ldlm_cancel_pack().
+ * Destroy \a cancels at the end.
+ *
+ * \param cancels	list of locks linked through l_bl_ast
+ * \param count		number of locks on \a cancels to process
+ * \param req		optional request to pack the handles into
+ * \param flags		passed to ldlm_cli_cancel_req()
+ *
+ * \retval 0 always; a failure of ldlm_cli_cancel_req() is only logged
+ */
+int ldlm_cli_cancel_list(struct list_head *cancels, int count,
+			 struct ptlrpc_request *req, ldlm_cancel_flags_t flags)
+{
+	struct ldlm_lock *lock;
+	int res = 0;
+	ENTRY;
+
+	if (list_empty(cancels) || count == 0)
+		RETURN(0);
+
+	/* XXX: requests (both batched and not) could be sent in parallel.
+	 * Usually it is enough to have just 1 RPC, but it is possible that
+	 * there are too many locks to be cancelled in LRU or on a resource.
+	 * It would also speed up the case when the server does not support
+	 * the feature. */
+	while (count > 0) {
+		LASSERT(!list_empty(cancels));
+		/* The export of the first lock on the list is used for the
+		 * whole batch. */
+		lock = list_entry(cancels->next, struct ldlm_lock,
+				      l_bl_ast);
+		LASSERT(lock->l_conn_export);
+
+		if (exp_connect_cancelset(lock->l_conn_export)) {
+			/* Batched cancel: when packing into \a req, all
+			 * \a count locks are treated as handled. */
+			res = count;
+			if (req)
+				ldlm_cancel_pack(req, cancels, count);
+			else
+				res = ldlm_cli_cancel_req(lock->l_conn_export,
+							  cancels, count,
+							  flags);
+		} else {
+			/* No batching: one lock per request. */
+			res = ldlm_cli_cancel_req(lock->l_conn_export,
+						  cancels, 1, flags);
+		}
+
+		if (res < 0) {
+			CDEBUG_LIMIT(res == -ESHUTDOWN ? D_DLMTRACE : D_ERROR,
+				     "ldlm_cli_cancel_list: %d\n", res);
+			/* On error give up on all remaining locks: they are
+			 * dropped from the list below and the loop ends. */
+			res = count;
+		}
+
+		count -= res;
+		ldlm_lock_list_put(cancels, l_bl_ast, res);
+	}
+	LASSERT(count == 0);
+	RETURN(0);
+}
+EXPORT_SYMBOL(ldlm_cli_cancel_list);
+
+/**
+ * Cancel all locks on a resource that have 0 readers/writers.
+ *
+ * If flags & LDLM_FL_LOCAL_ONLY, throw the locks away without trying
+ * to notify the server.
+ * NOTE(review): \a flags is an ldlm_cancel_flags_t and the sibling
+ * ldlm_cli_cancel_unused() documents this behaviour as LCF_LOCAL; confirm
+ * which flag name is meant here.
+ *
+ * \param ns		namespace the resource is looked up in
+ * \param res_id	identifier of the resource
+ * \param policy	passed to ldlm_cancel_resource_local()
+ * \param mode		passed to ldlm_cancel_resource_local()
+ * \param flags		cancel flags; LCF_BL_AST is added for the local step
+ * \param opaque	passed to ldlm_cancel_resource_local()
+ *
+ * \retval 0 always, including when the resource does not exist
+ */
+int ldlm_cli_cancel_unused_resource(struct ldlm_namespace *ns,
+				    const struct ldlm_res_id *res_id,
+				    ldlm_policy_data_t *policy,
+				    ldlm_mode_t mode,
+				    ldlm_cancel_flags_t flags,
+				    void *opaque)
+{
+	struct ldlm_resource *res;
+	LIST_HEAD(cancels);
+	int count;
+	int rc;
+	ENTRY;
+
+	res = ldlm_resource_get(ns, NULL, res_id, 0, 0);
+	if (res == NULL) {
+		/* This is not a problem. */
+		CDEBUG(D_INFO, "No resource "LPU64"\n", res_id->name[0]);
+		RETURN(0);
+	}
+
+	LDLM_RESOURCE_ADDREF(res);
+	/* First pick and locally cancel the matching locks, then deal with
+	 * the resulting list. */
+	count = ldlm_cancel_resource_local(res, &cancels, policy, mode,
+					   0, flags | LCF_BL_AST, opaque);
+	rc = ldlm_cli_cancel_list(&cancels, count, NULL, flags);
+	/* A failure is only reported; it does not change the return value. */
+	if (rc != ELDLM_OK)
+		CERROR("ldlm_cli_cancel_unused_resource: %d\n", rc);
+
+	LDLM_RESOURCE_DELREF(res);
+	ldlm_resource_putref(res);
+	RETURN(0);
+}
+EXPORT_SYMBOL(ldlm_cli_cancel_unused_resource);
+
+/* Argument bundle handed to ldlm_cli_hash_cancel_unused() through the
+ * hash iterator's void pointer. */
+struct ldlm_cli_cancel_arg {
+	int     lc_flags;	/* cancel flags given to ldlm_cli_cancel_unused() */
+	void   *lc_opaque;	/* opaque pointer given to ldlm_cli_cancel_unused() */
+};
+
+/*
+ * Hash iteration callback: cancel the unused locks of the resource stored
+ * in \a hnode, using the flags and opaque pointer carried in \a arg
+ * (a struct ldlm_cli_cancel_arg). Errors are logged, never propagated.
+ */
+static int ldlm_cli_hash_cancel_unused(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+				       struct hlist_node *hnode, void *arg)
+{
+	struct ldlm_cli_cancel_arg *cancel_arg = arg;
+	struct ldlm_resource *resource = cfs_hash_object(hs, hnode);
+	struct ldlm_namespace *res_ns = ldlm_res_to_ns(resource);
+	int err;
+
+	err = ldlm_cli_cancel_unused_resource(res_ns, &resource->lr_name,
+					      NULL, LCK_MINMODE,
+					      cancel_arg->lc_flags,
+					      cancel_arg->lc_opaque);
+	if (err != 0)
+		CERROR("ldlm_cli_cancel_unused ("LPU64"): %d\n",
+		       resource->lr_name.name[0], err);
+
+	/* the hash walk must see 0 so that it carries on */
+	return 0;
+}
+
+/**
+ * Cancel every lock with 0 readers/writers, either on the single resource
+ * named by \a res_id or, when \a res_id is NULL, on all resources of the
+ * namespace.
+ *
+ * If flags & LCF_LOCAL, throw the locks away without trying
+ * to notify the server.
+ *
+ * \retval ELDLM_OK when \a ns is NULL or the whole namespace was walked
+ * \retval the result of ldlm_cli_cancel_unused_resource() when \a res_id
+ *	   is given
+ */
+int ldlm_cli_cancel_unused(struct ldlm_namespace *ns,
+			   const struct ldlm_res_id *res_id,
+			   ldlm_cancel_flags_t flags, void *opaque)
+{
+	struct ldlm_cli_cancel_arg walk_arg = {
+		.lc_flags       = flags,
+		.lc_opaque      = opaque,
+	};
+
+	ENTRY;
+
+	if (ns == NULL)
+		RETURN(ELDLM_OK);
+
+	/* No resource named: visit every resource in the namespace hash. */
+	if (res_id == NULL) {
+		cfs_hash_for_each_nolock(ns->ns_rs_hash,
+					 ldlm_cli_hash_cancel_unused,
+					 &walk_arg);
+		RETURN(ELDLM_OK);
+	}
+
+	RETURN(ldlm_cli_cancel_unused_resource(ns, res_id, NULL,
+					       LCK_MINMODE, flags,
+					       opaque));
+}
+EXPORT_SYMBOL(ldlm_cli_cancel_unused);
+
+/* Lock iterators. */
+
+/*
+ * Call \a iter(lock, closure) for every lock on the granted, converting and
+ * waiting lists of \a res, in that order, with the resource locked.
+ * The walk stops as soon as \a iter returns LDLM_ITER_STOP.
+ *
+ * Returns LDLM_ITER_STOP if the iterator stopped the walk, otherwise
+ * LDLM_ITER_CONTINUE (also when \a res is NULL).
+ */
+int ldlm_resource_foreach(struct ldlm_resource *res, ldlm_iterator_t iter,
+			  void *closure)
+{
+	struct list_head *queues[3];
+	struct list_head *pos, *nxt;
+	unsigned int q;
+	int result = LDLM_ITER_CONTINUE;
+
+	ENTRY;
+
+	if (res == NULL)
+		RETURN(LDLM_ITER_CONTINUE);
+
+	queues[0] = &res->lr_granted;
+	queues[1] = &res->lr_converting;
+	queues[2] = &res->lr_waiting;
+
+	lock_res(res);
+	for (q = 0; q < sizeof(queues) / sizeof(queues[0]); q++) {
+		/* _safe variant: the iterator may unlink the current lock */
+		list_for_each_safe(pos, nxt, queues[q]) {
+			struct ldlm_lock *cur;
+
+			cur = list_entry(pos, struct ldlm_lock, l_res_link);
+			if (iter(cur, closure) == LDLM_ITER_STOP)
+				GOTO(done, result = LDLM_ITER_STOP);
+		}
+	}
+ done:
+	unlock_res(res);
+	RETURN(result);
+}
+EXPORT_SYMBOL(ldlm_resource_foreach);
+
+/* Pairs a lock iterator with its closure so both can travel through the
+ * single void pointer of a hash walk (see ldlm_namespace_foreach()). */
+struct iter_helper_data {
+	ldlm_iterator_t iter;	/* callback to run on each lock */
+	void *closure;		/* argument passed to \a iter */
+};
+
+/* Unpack the iter_helper_data in \a closure and run the wrapped iterator
+ * on \a lock with the wrapped closure. */
+static int ldlm_iter_helper(struct ldlm_lock *lock, void *closure)
+{
+	struct iter_helper_data *wrapped = closure;
+	ldlm_iterator_t user_iter = wrapped->iter;
+	void *user_closure = wrapped->closure;
+
+	return user_iter(lock, user_closure);
+}
+
+/* Hash walk callback: iterate over the locks of the resource in \a hnode.
+ * Returns 1 when the lock iterator asked to stop, 0 otherwise. */
+static int ldlm_res_iter_helper(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+				struct hlist_node *hnode, void *arg)
+
+{
+	int verdict;
+
+	verdict = ldlm_resource_foreach(cfs_hash_object(hs, hnode),
+					ldlm_iter_helper, arg);
+	if (verdict == LDLM_ITER_STOP)
+		return 1;
+
+	return 0;
+}
+
+/*
+ * Run \a iter(lock, closure) on the locks of every resource in the
+ * namespace hash of \a ns. The iterator and its closure are bundled in an
+ * iter_helper_data so they fit through the hash walk's single argument.
+ */
+void ldlm_namespace_foreach(struct ldlm_namespace *ns,
+			    ldlm_iterator_t iter, void *closure)
+
+{
+	/* C99 designated initializers, as used elsewhere in this file,
+	 * instead of the obsolete GNU "field: value" form. */
+	struct iter_helper_data helper = {
+		.iter    = iter,
+		.closure = closure,
+	};
+
+	cfs_hash_for_each_nolock(ns->ns_rs_hash,
+				 ldlm_res_iter_helper, &helper);
+
+}
+EXPORT_SYMBOL(ldlm_namespace_foreach);
+
+/* non-blocking function to manipulate a lock whose cb_data is being put away.
+ *
+ * Looks up the resource \a res_id in \a ns and runs \a iter(lock, data) on
+ * its locks through ldlm_resource_foreach(). \a ns must not be NULL (LBUG).
+ *
+ * return  0:  find no resource
+ *       > 0:  must be LDLM_ITER_STOP/LDLM_ITER_CONTINUE.
+ *       < 0:  errors
+ */
+int ldlm_resource_iterate(struct ldlm_namespace *ns,
+			  const struct ldlm_res_id *res_id,
+			  ldlm_iterator_t iter, void *data)
+{
+	struct ldlm_resource *res;
+	int rc;
+	ENTRY;
+
+	if (ns == NULL) {
+		CERROR("must pass in namespace\n");
+		LBUG();
+	}
+
+	res = ldlm_resource_get(ns, NULL, res_id, 0, 0);
+	if (res == NULL)
+		RETURN(0);
+
+	/* Hold the resource for the duration of the walk. */
+	LDLM_RESOURCE_ADDREF(res);
+	rc = ldlm_resource_foreach(res, iter, data);
+	LDLM_RESOURCE_DELREF(res);
+	ldlm_resource_putref(res);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(ldlm_resource_iterate);
+
+/* Lock replay */
+
+/*
+ * Lock iterator used by ldlm_replay_locks(): put \a lock on the list passed
+ * in \a closure (linked through l_pending_chain) and take a reference on
+ * it, unless the lock is flagged LDLM_FL_FAILED or LDLM_FL_CANCELING.
+ *
+ * Always returns LDLM_ITER_CONTINUE so that every lock is visited.
+ */
+static int ldlm_chain_lock_for_replay(struct ldlm_lock *lock, void *closure)
+{
+	struct list_head *list = closure;
+
+	/* we use l_pending_chain here, because it's unused on clients. */
+	/* Print the values of the next/prev links; the addresses of the
+	 * fields themselves say nothing about what the lock is chained to. */
+	LASSERTF(list_empty(&lock->l_pending_chain),
+		 "lock %p next %p prev %p\n",
+		 lock, lock->l_pending_chain.next,
+		 lock->l_pending_chain.prev);
+	/* bug 9573: don't replay locks left after eviction, or
+	 * bug 17614: locks being actively cancelled. Get a reference
+	 * on a lock so that it does not disappear under us (e.g. due to
+	 * cancel)
+	 */
+	if (!(lock->l_flags & (LDLM_FL_FAILED|LDLM_FL_CANCELING))) {
+		list_add(&lock->l_pending_chain, list);
+		LDLM_LOCK_GET(lock);
+	}
+
+	return LDLM_ITER_CONTINUE;
+}
+
+/*
+ * Reply interpreter for a lock replay request queued by replay_one_lock().
+ *
+ * Drops the import's imp_replay_inflight count, then on success records the
+ * server's lock handle from the reply as the lock's remote handle and kicks
+ * the import recovery state machine. On any failure the import is
+ * reconnected with ptlrpc_connect_import().
+ *
+ * Returns \a rc as passed in, -EPROTO when the reply has no RMF_DLM_REP
+ * buffer, or -ESTALE when the local lock handle no longer resolves.
+ */
+static int replay_lock_interpret(const struct lu_env *env,
+				 struct ptlrpc_request *req,
+				 struct ldlm_async_args *aa, int rc)
+{
+	struct ldlm_lock     *lock;
+	struct ldlm_reply    *reply;
+	struct obd_export    *exp;
+
+	ENTRY;
+	atomic_dec(&req->rq_import->imp_replay_inflight);
+	if (rc != ELDLM_OK)
+		GOTO(out, rc);
+
+
+	reply = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP);
+	if (reply == NULL)
+		GOTO(out, rc = -EPROTO);
+
+	lock = ldlm_handle2lock(&aa->lock_handle);
+	if (!lock) {
+		/* NOTE(review): req->rq_export is dereferenced here without
+		 * the NULL check applied to it further down - confirm it
+		 * cannot be NULL on this path. */
+		CERROR("received replay ack for unknown local cookie "LPX64
+		       " remote cookie "LPX64 " from server %s id %s\n",
+		       aa->lock_handle.cookie, reply->lock_handle.cookie,
+		       req->rq_export->exp_client_uuid.uuid,
+		       libcfs_id2str(req->rq_peer));
+		GOTO(out, rc = -ESTALE);
+	}
+
+	/* Key change rehash lock in per-export hash with new key */
+	exp = req->rq_export;
+	if (exp && exp->exp_lock_hash) {
+		/* In the function below, .hs_keycmp resolves to
+		 * ldlm_export_lock_keycmp() */
+		/* coverity[overrun-buffer-val] */
+		cfs_hash_rehash_key(exp->exp_lock_hash,
+				    &lock->l_remote_handle,
+				    &reply->lock_handle,
+				    &lock->l_exp_hash);
+	} else {
+		/* No per-export hash: just store the new remote handle. */
+		lock->l_remote_handle = reply->lock_handle;
+	}
+
+	LDLM_DEBUG(lock, "replayed lock:");
+	ptlrpc_import_recovery_state_machine(req->rq_import);
+	/* Drop the reference taken by ldlm_handle2lock(). */
+	LDLM_LOCK_PUT(lock);
+out:
+	if (rc != ELDLM_OK)
+		ptlrpc_connect_import(req->rq_import);
+
+	RETURN(rc);
+}
+
+/*
+ * Build and queue an LDLM_ENQUEUE replay request for \a lock on \a imp.
+ *
+ * Locks flagged LDLM_FL_CANCELING are skipped; locks flagged
+ * LDLM_FL_CANCEL_ON_BLOCK are cancelled locally instead of being replayed.
+ * The request is handed to ptlrpcd and answered asynchronously by
+ * replay_lock_interpret().
+ *
+ * Returns 0 when the lock was skipped or the request was queued, -ENOMEM
+ * when the request could not be allocated.
+ */
+static int replay_one_lock(struct obd_import *imp, struct ldlm_lock *lock)
+{
+	struct ptlrpc_request *req;
+	struct ldlm_async_args *aa;
+	struct ldlm_request   *body;
+	int flags;
+	ENTRY;
+
+
+	/* Bug 11974: Do not replay a lock which is actively being canceled */
+	if (lock->l_flags & LDLM_FL_CANCELING) {
+		LDLM_DEBUG(lock, "Not replaying canceled lock:");
+		RETURN(0);
+	}
+
+	/* If this is reply-less callback lock, we cannot replay it, since
+	 * server might have long dropped it, but notification of that event was
+	 * lost by network. (and server granted conflicting lock already) */
+	if (lock->l_flags & LDLM_FL_CANCEL_ON_BLOCK) {
+		LDLM_DEBUG(lock, "Not replaying reply-less lock:");
+		ldlm_lock_cancel(lock);
+		RETURN(0);
+	}
+
+	/*
+	 * If granted mode matches the requested mode, this lock is granted.
+	 *
+	 * If they differ, but we have a granted mode, then we were granted
+	 * one mode and now want another: ergo, converting.
+	 *
+	 * If we haven't been granted anything and are on a resource list,
+	 * then we're blocked/waiting.
+	 *
+	 * If we haven't been granted anything and we're NOT on a resource list,
+	 * then we haven't got a reply yet and don't have a known disposition.
+	 * This happens whenever a lock enqueue is the request that triggers
+	 * recovery.
+	 */
+	if (lock->l_granted_mode == lock->l_req_mode)
+		flags = LDLM_FL_REPLAY | LDLM_FL_BLOCK_GRANTED;
+	else if (lock->l_granted_mode)
+		flags = LDLM_FL_REPLAY | LDLM_FL_BLOCK_CONV;
+	else if (!list_empty(&lock->l_res_link))
+		flags = LDLM_FL_REPLAY | LDLM_FL_BLOCK_WAIT;
+	else
+		flags = LDLM_FL_REPLAY;
+
+	req = ptlrpc_request_alloc_pack(imp, &RQF_LDLM_ENQUEUE,
+					LUSTRE_DLM_VERSION, LDLM_ENQUEUE);
+	if (req == NULL)
+		RETURN(-ENOMEM);
+
+	/* We're part of recovery, so don't wait for it. */
+	req->rq_send_state = LUSTRE_IMP_REPLAY_LOCKS;
+
+	/* NOTE(review): body is used without a NULL check - confirm that
+	 * req_capsule_client_get() cannot fail for a freshly packed request. */
+	body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ);
+	ldlm_lock2desc(lock, &body->lock_desc);
+	body->lock_flags = ldlm_flags_to_wire(flags);
+
+	ldlm_lock2handle(lock, &body->lock_handle[0]);
+	/* Size the reply so that it can carry an LVB of the lock's length. */
+	if (lock->l_lvb_len > 0)
+		req_capsule_extend(&req->rq_pill, &RQF_LDLM_ENQUEUE_LVB);
+	req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER,
+			     lock->l_lvb_len);
+	ptlrpc_request_set_replen(req);
+	/* notify the server we've replayed all requests.
+	 * also, we mark the request to be put on a dedicated
+	 * queue to be processed after all request replays.
+	 * bug 6063 */
+	lustre_msg_set_flags(req->rq_reqmsg, MSG_REQ_REPLAY_DONE);
+
+	LDLM_DEBUG(lock, "replaying lock:");
+
+	/* Balanced by the atomic_dec() in replay_lock_interpret(). */
+	atomic_inc(&req->rq_import->imp_replay_inflight);
+	CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
+	aa = ptlrpc_req_async_args(req);
+	/* Remember the local handle so the interpreter can find the lock. */
+	aa->lock_handle = body->lock_handle[0];
+	req->rq_interpret_reply = (ptlrpc_interpterer_t)replay_lock_interpret;
+	ptlrpcd_add_req(req, PDL_POLICY_LOCAL, -1);
+
+	RETURN(0);
+}
+
+/**
+ * Cancel as many unused locks as possible before replay. Since we are
+ * in recovery, we can't wait for any outstanding RPCs to send any RPC
+ * to the server.
+ *
+ * Called only in recovery before replaying locks. There is no need to
+ * replay locks that are unused. Since the clients may hold thousands of
+ * cached unused locks, dropping the unused locks can greatly reduce the
+ * load on the servers at recovery time.
+ *
+ * \param ns	namespace whose unused locks are cancelled locally
+ *		(LCF_LOCAL, LDLM_CANCEL_NO_WAIT)
+ */
+static void ldlm_cancel_unused_locks_for_replay(struct ldlm_namespace *ns)
+{
+	int canceled;
+	LIST_HEAD(cancels);
+
+	/* The two literals are concatenated: the first must end with a
+	 * space so the message does not read "beforereplay". */
+	CDEBUG(D_DLMTRACE, "Dropping as many unused locks as possible before "
+			   "replay for namespace %s (%d)\n",
+			   ldlm_ns_name(ns), ns->ns_nr_unused);
+
+	/* We don't need to care whether or not LRU resize is enabled
+	 * because the LDLM_CANCEL_NO_WAIT policy doesn't use the
+	 * count parameter */
+	canceled = ldlm_cancel_lru_local(ns, &cancels, ns->ns_nr_unused, 0,
+					 LCF_LOCAL, LDLM_CANCEL_NO_WAIT);
+
+	CDEBUG(D_DLMTRACE, "Canceled %d unused locks from namespace %s\n",
+			   canceled, ldlm_ns_name(ns));
+}
+
+/*
+ * Replay the locks of the namespace attached to \a imp.
+ *
+ * Optionally cancels unused locks first (when
+ * ldlm_cancel_unused_locks_before_replay is set), collects the remaining
+ * locks with ldlm_chain_lock_for_replay() and queues one replay request per
+ * lock. After the first failure the remaining locks are only released, not
+ * replayed.
+ *
+ * Returns 0 when the import failed version-based recovery (imp_vbr_failed)
+ * or everything was queued, otherwise the first error returned by
+ * replay_one_lock().
+ */
+int ldlm_replay_locks(struct obd_import *imp)
+{
+	struct ldlm_namespace *ns = imp->imp_obd->obd_namespace;
+	LIST_HEAD(list);
+	struct ldlm_lock *lock, *next;
+	int rc = 0;
+
+	ENTRY;
+
+	LASSERT(atomic_read(&imp->imp_replay_inflight) == 0);
+
+	/* don't replay locks if import failed recovery */
+	if (imp->imp_vbr_failed)
+		RETURN(0);
+
+	/* ensure this doesn't fall to 0 before all have been queued */
+	atomic_inc(&imp->imp_replay_inflight);
+
+	if (ldlm_cancel_unused_locks_before_replay)
+		ldlm_cancel_unused_locks_for_replay(ns);
+
+	/* Chain every replayable lock on the local list (each with a
+	 * reference taken by the iterator). */
+	ldlm_namespace_foreach(ns, ldlm_chain_lock_for_replay, &list);
+
+	list_for_each_entry_safe(lock, next, &list, l_pending_chain) {
+		list_del_init(&lock->l_pending_chain);
+		if (rc) {
+			/* An earlier replay failed: just drop the reference. */
+			LDLM_LOCK_RELEASE(lock);
+			continue; /* or try to do the rest? */
+		}
+		rc = replay_one_lock(imp, lock);
+		LDLM_LOCK_RELEASE(lock);
+	}
+
+	/* Drop the guard count taken above. */
+	atomic_dec(&imp->imp_replay_inflight);
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(ldlm_replay_locks);

diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_resource.c b/drivers/staging/lustre/lustre/ldlm/ldlm_resource.c
new file mode 100644
index 0000000..9052dc5
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ldlm/ldlm_resource.c

@@ -0,0 +1,1409 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2010, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ldlm/ldlm_resource.c
+ *
+ * Author: Phil Schwan <phil@clusterfs.com>
+ * Author: Peter Braam <braam@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LDLM
+# include <lustre_dlm.h>
+
+#include <lustre_fid.h>
+#include <obd_class.h>
+#include "ldlm_internal.h"
+
+/* Slab caches; presumably for struct ldlm_resource and struct ldlm_lock
+ * objects (their creation and use are outside this part of the file). */
+struct kmem_cache *ldlm_resource_slab, *ldlm_lock_slab;
+
+/* Per-side namespace counters, both starting at 0. */
+atomic_t ldlm_srv_namespace_nr = ATOMIC_INIT(0);
+atomic_t ldlm_cli_namespace_nr = ATOMIC_INIT(0);
+
+/* Server-side namespace list and, presumably, the mutex protecting it. */
+struct mutex ldlm_srv_namespace_lock;
+LIST_HEAD(ldlm_srv_namespace_list);
+
+/* Client-side namespace list and, presumably, the mutex protecting it. */
+struct mutex ldlm_cli_namespace_lock;
+LIST_HEAD(ldlm_cli_namespace_list);
+
+/* procfs directory handles; set up by ldlm_proc_setup() and torn down by
+ * ldlm_proc_cleanup() when LPROCFS is defined. */
+proc_dir_entry_t *ldlm_type_proc_dir = NULL;
+proc_dir_entry_t *ldlm_ns_proc_dir = NULL;
+proc_dir_entry_t *ldlm_svc_proc_dir = NULL;
+
+/* Defined elsewhere; exported through procfs by ldlm_proc_setup() as
+ * "cancel_unused_locks_before_replay". */
+extern unsigned int ldlm_cancel_unused_locks_before_replay;
+
+/* Upper bound on the number of granted locks dumped for a single resource
+ * during a debug dump, so that a dump cannot flood the log. Exported
+ * through procfs as "dump_granted_max". */
+unsigned int ldlm_dump_granted_max = 256;
+
+#ifdef LPROCFS
+/*
+ * Write handler for "dump_namespaces": any write dumps all server and
+ * client namespaces at the D_DLMTRACE debug level. The written data is
+ * ignored and \a count is returned so the write appears fully consumed.
+ */
+static ssize_t lprocfs_wr_dump_ns(struct file *file, const char *buffer,
+				  size_t count, loff_t *off)
+{
+	ldlm_dump_all_namespaces(LDLM_NAMESPACE_SERVER, D_DLMTRACE);
+	ldlm_dump_all_namespaces(LDLM_NAMESPACE_CLIENT, D_DLMTRACE);
+	RETURN(count);
+}
+/* Presumably defines ldlm_dump_ns_fops, which ldlm_proc_setup() uses. */
+LPROC_SEQ_FOPS_WR_ONLY(ldlm, dump_ns);
+
+/* Presumably define ldlm_rw_uint_fops and ldlm_uint_fops, the file
+ * operations used below for plain unsigned int tunables. */
+LPROC_SEQ_FOPS_RW_TYPE(ldlm_rw, uint);
+LPROC_SEQ_FOPS_RO_TYPE(ldlm, uint);
+
+/*
+ * Create the LDLM procfs tree: the type directory (OBD_LDLM_DEVICENAME
+ * under proc_lustre_root), its "namespaces" and "services" subdirectories,
+ * and the global tunable files listed in \a list.
+ *
+ * Returns 0 on success. On failure everything created so far is removed,
+ * the three global directory pointers are reset to NULL and the error from
+ * the failing call is returned.
+ */
+int ldlm_proc_setup(void)
+{
+	int rc;
+	struct lprocfs_vars list[] = {
+		{ "dump_namespaces", &ldlm_dump_ns_fops, 0, 0222 },
+		{ "dump_granted_max", &ldlm_rw_uint_fops,
+		  &ldlm_dump_granted_max },
+		{ "cancel_unused_locks_before_replay", &ldlm_rw_uint_fops,
+		  &ldlm_cancel_unused_locks_before_replay },
+		{ NULL }};
+	ENTRY;
+	LASSERT(ldlm_ns_proc_dir == NULL);
+
+	ldlm_type_proc_dir = lprocfs_register(OBD_LDLM_DEVICENAME,
+					      proc_lustre_root,
+					      NULL, NULL);
+	if (IS_ERR(ldlm_type_proc_dir)) {
+		CERROR("LProcFS failed in ldlm-init\n");
+		rc = PTR_ERR(ldlm_type_proc_dir);
+		GOTO(err, rc);
+	}
+
+	ldlm_ns_proc_dir = lprocfs_register("namespaces",
+					    ldlm_type_proc_dir,
+					    NULL, NULL);
+	if (IS_ERR(ldlm_ns_proc_dir)) {
+		CERROR("LProcFS failed in ldlm-init\n");
+		rc = PTR_ERR(ldlm_ns_proc_dir);
+		GOTO(err_type, rc);
+	}
+
+	ldlm_svc_proc_dir = lprocfs_register("services",
+					    ldlm_type_proc_dir,
+					    NULL, NULL);
+	if (IS_ERR(ldlm_svc_proc_dir)) {
+		CERROR("LProcFS failed in ldlm-init\n");
+		rc = PTR_ERR(ldlm_svc_proc_dir);
+		GOTO(err_ns, rc);
+	}
+
+	/* Do not silently drop a failure to create the tunable files:
+	 * unwind like every other step of this function. */
+	rc = lprocfs_add_vars(ldlm_type_proc_dir, list, NULL);
+	if (rc != 0) {
+		CERROR("LProcFS failed in ldlm-init\n");
+		GOTO(err_svc, rc);
+	}
+
+	RETURN(0);
+
+err_svc:
+	lprocfs_remove(&ldlm_svc_proc_dir);
+err_ns:
+	lprocfs_remove(&ldlm_ns_proc_dir);
+err_type:
+	lprocfs_remove(&ldlm_type_proc_dir);
+err:
+	/* The pointers may hold ERR_PTR values at this point; clear them so
+	 * that ldlm_proc_cleanup() and the LASSERT above stay correct. */
+	ldlm_svc_proc_dir = NULL;
+	ldlm_type_proc_dir = NULL;
+	ldlm_ns_proc_dir = NULL;
+	RETURN(rc);
+}
+
+/*
+ * Remove the LDLM procfs directories that exist ("services", then
+ * "namespaces", then the type directory) and reset all three global
+ * pointers to NULL.
+ */
+void ldlm_proc_cleanup(void)
+{
+	proc_dir_entry_t **dirs[] = {
+		&ldlm_svc_proc_dir,
+		&ldlm_ns_proc_dir,
+		&ldlm_type_proc_dir,
+	};
+	unsigned int i;
+
+	for (i = 0; i < sizeof(dirs) / sizeof(dirs[0]); i++) {
+		if (*dirs[i] != NULL)
+			lprocfs_remove(dirs[i]);
+	}
+
+	for (i = 0; i < sizeof(dirs) / sizeof(dirs[0]); i++)
+		*dirs[i] = NULL;
+}
+
+/* seq_file show handler: print the number of resources in the namespace,
+ * obtained by summing the per-bucket counts of its resource hash. */
+static int lprocfs_ns_resources_seq_show(struct seq_file *m, void *v)
+{
+	struct ldlm_namespace *ns = m->private;
+	cfs_hash_bd_t bd;
+	__u64 total = 0;
+	int bkt;
+
+	/* buckets are read one by one, so the sum is only approximate */
+	cfs_hash_for_each_bucket(ns->ns_rs_hash, &bd, bkt) {
+		total += cfs_hash_bd_count_get(&bd);
+	}
+
+	return lprocfs_rd_u64(m, &total);
+}
+LPROC_SEQ_FOPS_RO(lprocfs_ns_resources);
+
+/* seq_file show handler: print the summed LDLM_NSS_LOCKS counter of the
+ * namespace statistics. */
+static int lprocfs_ns_locks_seq_show(struct seq_file *m, void *v)
+{
+	struct ldlm_namespace *ns = m->private;
+	__u64 nr_locks = lprocfs_stats_collector(ns->ns_stats, LDLM_NSS_LOCKS,
+						 LPROCFS_FIELDS_FLAGS_SUM);
+
+	return lprocfs_rd_u64(m, &nr_locks);
+}
+LPROC_SEQ_FOPS_RO(lprocfs_ns_locks);
+
+/* seq_file show handler for "lru_size": print ns_nr_unused when LRU resize
+ * is enabled for the namespace, ns_max_unused otherwise. */
+static int lprocfs_lru_size_seq_show(struct seq_file *m, void *v)
+{
+	struct ldlm_namespace *ns = m->private;
+	__u32 *shown;
+
+	if (ns_connect_lru_resize(ns))
+		shown = &ns->ns_nr_unused;
+	else
+		shown = &ns->ns_max_unused;
+
+	return lprocfs_rd_uint(m, shown);
+}
+
+/*
+ * Write handler for the per-namespace "lru_size" file.
+ *
+ * Input starting with "clear" cancels unused locks: with LRU resize enabled
+ * it tries to cancel all ns_nr_unused locks and fails with -EINVAL if fewer
+ * were cancelled; otherwise ns_max_unused is temporarily set to 0 around a
+ * cancel pass.
+ *
+ * Any other input is parsed as a number (simple_strtoul, base 0):
+ *  - with LRU resize enabled, a non-zero value becomes ns_max_unused, the
+ *    excess unused locks are cancelled asynchronously and LRU resize is
+ *    switched off for the namespace; zero only triggers the cancel;
+ *  - with LRU resize disabled, the value becomes ns_max_unused, a cancel
+ *    pass is started, and zero re-enables LRU resize if the original
+ *    connect flags had it.
+ *
+ * Returns \a count on success, -EFAULT if the user buffer cannot be read,
+ * -EINVAL for unparsable input or an incomplete "clear".
+ */
+static ssize_t lprocfs_lru_size_seq_write(struct file *file, const char *buffer,
+				      size_t count, loff_t *off)
+{
+	struct ldlm_namespace *ns = ((struct seq_file *)file->private_data)->private;
+	char dummy[MAX_STRING_SIZE + 1], *end;
+	unsigned long tmp;
+	size_t len;
+	int lru_resize;
+
+	/* Copy only what the caller actually wrote (truncated to the local
+	 * buffer). Copying a fixed MAX_STRING_SIZE bytes would read past the
+	 * end of a shorter user buffer, which can fail with a spurious
+	 * -EFAULT or leave unrelated bytes behind the value. */
+	len = count > MAX_STRING_SIZE ? MAX_STRING_SIZE : count;
+	if (copy_from_user(dummy, buffer, len))
+		return -EFAULT;
+	dummy[len] = '\0';
+
+	if (strncmp(dummy, "clear", 5) == 0) {
+		CDEBUG(D_DLMTRACE,
+		       "dropping all unused locks from namespace %s\n",
+		       ldlm_ns_name(ns));
+		if (ns_connect_lru_resize(ns)) {
+			int canceled, unused  = ns->ns_nr_unused;
+
+			/* Try to cancel all @ns_nr_unused locks. */
+			canceled = ldlm_cancel_lru(ns, unused, 0,
+						   LDLM_CANCEL_PASSED);
+			if (canceled < unused) {
+				CDEBUG(D_DLMTRACE,
+				       "not all requested locks are canceled, "
+				       "requested: %d, canceled: %d\n", unused,
+				       canceled);
+				return -EINVAL;
+			}
+		} else {
+			/* Temporarily drop the limit to 0 so that the cancel
+			 * pass runs against an empty allowance, then restore
+			 * it. */
+			tmp = ns->ns_max_unused;
+			ns->ns_max_unused = 0;
+			ldlm_cancel_lru(ns, 0, 0, LDLM_CANCEL_PASSED);
+			ns->ns_max_unused = tmp;
+		}
+		return count;
+	}
+
+	tmp = simple_strtoul(dummy, &end, 0);
+	if (dummy == end) {
+		CERROR("invalid value written\n");
+		return -EINVAL;
+	}
+	/* Writing 0 asks for LRU resize rather than a fixed LRU size. */
+	lru_resize = (tmp == 0);
+
+	if (ns_connect_lru_resize(ns)) {
+		if (!lru_resize)
+			ns->ns_max_unused = (unsigned int)tmp;
+
+		/* From here on tmp is the number of locks to cancel. */
+		if (tmp > ns->ns_nr_unused)
+			tmp = ns->ns_nr_unused;
+		tmp = ns->ns_nr_unused - tmp;
+
+		CDEBUG(D_DLMTRACE,
+		       "changing namespace %s unused locks from %u to %u\n",
+		       ldlm_ns_name(ns), ns->ns_nr_unused,
+		       (unsigned int)tmp);
+		ldlm_cancel_lru(ns, tmp, LCF_ASYNC, LDLM_CANCEL_PASSED);
+
+		if (!lru_resize) {
+			CDEBUG(D_DLMTRACE,
+			       "disable lru_resize for namespace %s\n",
+			       ldlm_ns_name(ns));
+			ns->ns_connect_flags &= ~OBD_CONNECT_LRU_RESIZE;
+		}
+	} else {
+		CDEBUG(D_DLMTRACE,
+		       "changing namespace %s max_unused from %u to %u\n",
+		       ldlm_ns_name(ns), ns->ns_max_unused,
+		       (unsigned int)tmp);
+		ns->ns_max_unused = (unsigned int)tmp;
+		ldlm_cancel_lru(ns, 0, LCF_ASYNC, LDLM_CANCEL_PASSED);
+
+		/* Make sure that LRU resize was originally supported before
+		 * turning it on here. */
+		if (lru_resize &&
+		    (ns->ns_orig_connect_flags & OBD_CONNECT_LRU_RESIZE)) {
+			CDEBUG(D_DLMTRACE,
+			       "enable lru_resize for namespace %s\n",
+			       ldlm_ns_name(ns));
+			ns->ns_connect_flags |= OBD_CONNECT_LRU_RESIZE;
+		}
+	}
+
+	return count;
+}
+LPROC_SEQ_FOPS(lprocfs_lru_size);
+
+/* seq_file show handler for "early_lock_cancel": print the value of
+ * ns_connect_cancelset() for the namespace. */
+static int lprocfs_elc_seq_show(struct seq_file *m, void *v)
+{
+	struct ldlm_namespace *ns = m->private;
+	unsigned int elc_on;
+
+	elc_on = ns_connect_cancelset(ns);
+	return lprocfs_rd_uint(m, &elc_on);
+}
+
+/*
+ * Write handler for "early_lock_cancel": 0 clears OBD_CONNECT_CANCELSET in
+ * the namespace connect flags; any other value sets it again, but only if
+ * the original connect flags had it. Returns \a count, or the negative
+ * error from lprocfs_wr_uint().
+ */
+static ssize_t lprocfs_elc_seq_write(struct file *file, const char *buffer,
+				 size_t count, loff_t *off)
+{
+	struct seq_file *seq = file->private_data;
+	struct ldlm_namespace *ns = seq->private;
+	unsigned int enable = -1;
+	int err = lprocfs_wr_uint(file, buffer, count, &enable);
+
+	if (err < 0)
+		return err;
+
+	if (enable != 0) {
+		if (ns->ns_orig_connect_flags & OBD_CONNECT_CANCELSET)
+			ns->ns_connect_flags |= OBD_CONNECT_CANCELSET;
+	} else {
+		ns->ns_connect_flags &= ~OBD_CONNECT_CANCELSET;
+	}
+
+	return count;
+}
+LPROC_SEQ_FOPS(lprocfs_elc);
+
+/* Remove the procfs directory of \a ns (complaining if there is none) and
+ * free its statistics, if any were allocated. */
+void ldlm_namespace_proc_unregister(struct ldlm_namespace *ns)
+{
+	if (ns->ns_proc_dir_entry != NULL) {
+		lprocfs_remove(&ns->ns_proc_dir_entry);
+	} else {
+		CERROR("dlm namespace %s has no procfs dir?\n",
+		       ldlm_ns_name(ns));
+	}
+
+	if (ns->ns_stats == NULL)
+		return;
+
+	lprocfs_free_stats(&ns->ns_stats);
+}
+
+/*
+ * Add one procfs entry named \a name, with private data \a var and file
+ * operations \a ops. Relies on the locals lock_name, lock_vars and ns_pde
+ * of the calling function. The name goes through "%s" so that it is never
+ * interpreted as a format string.
+ */
+#define LDLM_NS_ADD_VAR(name, var, ops)				\
+	do {							\
+		snprintf(lock_name, MAX_STRING_SIZE, "%s", name);	\
+		lock_vars[0].data = var;			\
+		lock_vars[0].fops = ops;			\
+		lprocfs_add_vars(ns_pde, lock_vars, 0);		\
+	} while (0)
+
+/*
+ * Create the procfs directory of \a ns (unless it already has one),
+ * allocate its statistics and add the per-namespace files: resource_count
+ * and lock_count for every namespace, plus a client-specific or
+ * server-specific set of tunables depending on ns_is_client().
+ *
+ * Returns 0 on success or -ENOMEM when the directory or the statistics
+ * cannot be allocated. A directory created by this call is removed again
+ * if the statistics allocation fails, so a failed call leaves nothing
+ * behind.
+ */
+int ldlm_namespace_proc_register(struct ldlm_namespace *ns)
+{
+	struct lprocfs_vars lock_vars[2];
+	char lock_name[MAX_STRING_SIZE + 1];
+	proc_dir_entry_t *ns_pde;
+	int created = 0;
+
+	LASSERT(ns != NULL);
+	LASSERT(ns->ns_rs_hash != NULL);
+
+	if (ns->ns_proc_dir_entry != NULL) {
+		ns_pde = ns->ns_proc_dir_entry;
+	} else {
+		ns_pde = proc_mkdir(ldlm_ns_name(ns), ldlm_ns_proc_dir);
+		if (ns_pde == NULL)
+			return -ENOMEM;
+		ns->ns_proc_dir_entry = ns_pde;
+		created = 1;
+	}
+
+	ns->ns_stats = lprocfs_alloc_stats(LDLM_NSS_LAST, 0);
+	if (ns->ns_stats == NULL) {
+		/* Do not leak the directory made above: the caller's error
+		 * path does not unregister the namespace from procfs. A
+		 * pre-existing directory is left untouched. */
+		if (created)
+			lprocfs_remove(&ns->ns_proc_dir_entry);
+		return -ENOMEM;
+	}
+
+	lprocfs_counter_init(ns->ns_stats, LDLM_NSS_LOCKS,
+			     LPROCFS_CNTR_AVGMINMAX, "locks", "locks");
+
+	lock_name[MAX_STRING_SIZE] = '\0';
+
+	/* lock_vars[0] is reused for every entry; the zeroed lock_vars[1]
+	 * terminates the array. */
+	memset(lock_vars, 0, sizeof(lock_vars));
+	lock_vars[0].name = lock_name;
+
+	LDLM_NS_ADD_VAR("resource_count", ns, &lprocfs_ns_resources_fops);
+	LDLM_NS_ADD_VAR("lock_count", ns, &lprocfs_ns_locks_fops);
+
+	if (ns_is_client(ns)) {
+		LDLM_NS_ADD_VAR("lock_unused_count", &ns->ns_nr_unused,
+				&ldlm_uint_fops);
+		LDLM_NS_ADD_VAR("lru_size", ns, &lprocfs_lru_size_fops);
+		LDLM_NS_ADD_VAR("lru_max_age", &ns->ns_max_age,
+				&ldlm_rw_uint_fops);
+		LDLM_NS_ADD_VAR("early_lock_cancel", ns, &lprocfs_elc_fops);
+	} else {
+		LDLM_NS_ADD_VAR("ctime_age_limit", &ns->ns_ctime_age_limit,
+				&ldlm_rw_uint_fops);
+		LDLM_NS_ADD_VAR("lock_timeouts", &ns->ns_timeouts,
+				&ldlm_uint_fops);
+		LDLM_NS_ADD_VAR("max_nolock_bytes", &ns->ns_max_nolock_size,
+				&ldlm_rw_uint_fops);
+		LDLM_NS_ADD_VAR("contention_seconds", &ns->ns_contention_time,
+				&ldlm_rw_uint_fops);
+		LDLM_NS_ADD_VAR("contended_locks", &ns->ns_contended_locks,
+				&ldlm_rw_uint_fops);
+		LDLM_NS_ADD_VAR("max_parallel_ast", &ns->ns_max_parallel_ast,
+				&ldlm_rw_uint_fops);
+	}
+	return 0;
+}
+#undef MAX_STRING_SIZE
+#else /* LPROCFS */
+
+/* Without LPROCFS the procfs hooks are no-ops; registration evaluates
+ * to 0 (success). */
+#define ldlm_namespace_proc_unregister(ns)      ({;})
+#define ldlm_namespace_proc_register(ns)	({0;})
+
+#endif /* LPROCFS */
+
+/* Hash a resource id by summing its RES_NAME_SIZE name words (truncated to
+ * unsigned) and masking the sum with \a mask. */
+static unsigned ldlm_res_hop_hash(cfs_hash_t *hs,
+				  const void *key, unsigned mask)
+{
+	const struct ldlm_res_id *res_id = key;
+	unsigned sum = 0;
+	unsigned word;
+
+	for (word = 0; word < RES_NAME_SIZE; word++) {
+		sum += res_id->name[word];
+	}
+
+	return sum & mask;
+}
+
+/*
+ * Hash function for namespaces whose resource ids encode a FID.
+ *
+ * The FID is rebuilt from the resource name words, flattened to 32 bits and
+ * mixed; the optional name hash word (LUSTRE_RES_ID_HSH_OFF) is mixed in
+ * when non-zero. The mixed value selects the upper bits of the result, and
+ * the plain ldlm_res_hop_hash() of the key, masked with
+ * CFS_HASH_NBKT(hs) - 1, is OR-ed into the low bits. The result is masked
+ * with \a mask.
+ */
+static unsigned ldlm_res_hop_fid_hash(cfs_hash_t *hs,
+				      const void *key, unsigned mask)
+{
+	const struct ldlm_res_id *id = key;
+	struct lu_fid       fid;
+	__u32	       hash;
+	__u32	       val;
+
+	/* The version/oid word carries the oid in its low 32 bits and the
+	 * version in its high 32 bits. */
+	fid.f_seq = id->name[LUSTRE_RES_ID_SEQ_OFF];
+	fid.f_oid = (__u32)id->name[LUSTRE_RES_ID_VER_OID_OFF];
+	fid.f_ver = (__u32)(id->name[LUSTRE_RES_ID_VER_OID_OFF] >> 32);
+
+	hash = fid_flatten32(&fid);
+	hash += (hash >> 4) + (hash << 12); /* mixing oid and seq */
+	if (id->name[LUSTRE_RES_ID_HSH_OFF] != 0) {
+		val = id->name[LUSTRE_RES_ID_HSH_OFF];
+		hash += (val >> 5) + (val << 11);
+	} else {
+		/* No name hash: val only feeds the "random factor" below. */
+		val = fid_oid(&fid);
+	}
+	hash = cfs_hash_long(hash, hs->hs_bkt_bits);
+	/* give me another random factor */
+	hash -= cfs_hash_long((unsigned long)hs, val % 11 + 3);
+
+	/* Move the mixed value above the bits left for the OR below. */
+	hash <<= hs->hs_cur_bits - hs->hs_bkt_bits;
+	hash |= ldlm_res_hop_hash(hs, key, CFS_HASH_NBKT(hs) - 1);
+
+	return hash & mask;
+}
+
+/* Hash op: the key of a resource is its name (lr_name). */
+static void *ldlm_res_hop_key(struct hlist_node *hnode)
+{
+	return &hlist_entry(hnode, struct ldlm_resource, lr_hash)->lr_name;
+}
+
+/* Hash op: compare \a key with the name of the resource in \a hnode using
+ * ldlm_res_eq(). */
+static int ldlm_res_hop_keycmp(const void *key, struct hlist_node *hnode)
+{
+	const struct ldlm_res_id *wanted = key;
+	const struct ldlm_res_id *have;
+
+	have = &hlist_entry(hnode, struct ldlm_resource, lr_hash)->lr_name;
+	return ldlm_res_eq(wanted, have);
+}
+
+/* Hash op: map a hash node back to the resource that embeds it. */
+static void *ldlm_res_hop_object(struct hlist_node *hnode)
+{
+	struct ldlm_resource *owner;
+
+	owner = hlist_entry(hnode, struct ldlm_resource, lr_hash);
+	return owner;
+}
+
+/* Hash op: take a reference on the resource in \a hnode. */
+static void ldlm_res_hop_get_locked(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+	ldlm_resource_getref(hlist_entry(hnode, struct ldlm_resource,
+					 lr_hash));
+}
+
+/* Hash op: drop a reference on the resource in \a hnode using the _locked
+ * put variant. */
+static void ldlm_res_hop_put_locked(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+	/* only reached through cfs_hash_for_each_nolock */
+	ldlm_resource_putref_locked(hlist_entry(hnode, struct ldlm_resource,
+						lr_hash));
+}
+
+/* Hash op: drop a reference on the resource in \a hnode. */
+static void ldlm_res_hop_put(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+	ldlm_resource_putref(hlist_entry(hnode, struct ldlm_resource,
+					 lr_hash));
+}
+
+/* Hash operations for namespaces keyed by plain resource ids: resources
+ * are hashed with the additive ldlm_res_hop_hash(). */
+cfs_hash_ops_t ldlm_ns_hash_ops = {
+	.hs_hash	= ldlm_res_hop_hash,
+	.hs_key	 = ldlm_res_hop_key,
+	.hs_keycmp      = ldlm_res_hop_keycmp,
+	.hs_keycpy      = NULL,
+	.hs_object      = ldlm_res_hop_object,
+	.hs_get	 = ldlm_res_hop_get_locked,
+	.hs_put_locked  = ldlm_res_hop_put_locked,
+	.hs_put	 = ldlm_res_hop_put
+};
+
+/* Hash operations for namespaces whose resource ids encode a FID: same as
+ * ldlm_ns_hash_ops except for the FID-aware hash function. */
+cfs_hash_ops_t ldlm_ns_fid_hash_ops = {
+	.hs_hash	= ldlm_res_hop_fid_hash,
+	.hs_key	 = ldlm_res_hop_key,
+	.hs_keycmp      = ldlm_res_hop_keycmp,
+	.hs_keycpy      = NULL,
+	.hs_object      = ldlm_res_hop_object,
+	.hs_get	 = ldlm_res_hop_get_locked,
+	.hs_put_locked  = ldlm_res_hop_put_locked,
+	.hs_put	 = ldlm_res_hop_put
+};
+
+/* Resource hash parameters for one namespace type; looked up by
+ * ldlm_namespace_new(). */
+typedef struct {
+	/** namespace type this entry applies to */
+	ldlm_ns_type_t  nsd_type;
+	/** hash bucket bits */
+	unsigned	nsd_bkt_bits;
+	/** hash bits */
+	unsigned	nsd_all_bits;
+	/** hash operations */
+	cfs_hash_ops_t *nsd_hops;
+} ldlm_ns_hash_def_t;
+
+/* Hash parameters per namespace type. The MDC and MDT entries use the
+ * FID-aware hash ops, the others the plain ones. The table is terminated
+ * by an LDLM_NS_TYPE_UNKNOWN entry, which ldlm_namespace_new() uses as the
+ * end-of-table marker. */
+ldlm_ns_hash_def_t ldlm_ns_hash_defs[] =
+{
+	{
+		.nsd_type       = LDLM_NS_TYPE_MDC,
+		.nsd_bkt_bits   = 11,
+		.nsd_all_bits   = 16,
+		.nsd_hops       = &ldlm_ns_fid_hash_ops,
+	},
+	{
+		.nsd_type       = LDLM_NS_TYPE_MDT,
+		.nsd_bkt_bits   = 14,
+		.nsd_all_bits   = 21,
+		.nsd_hops       = &ldlm_ns_fid_hash_ops,
+	},
+	{
+		.nsd_type       = LDLM_NS_TYPE_OSC,
+		.nsd_bkt_bits   = 8,
+		.nsd_all_bits   = 12,
+		.nsd_hops       = &ldlm_ns_hash_ops,
+	},
+	{
+		.nsd_type       = LDLM_NS_TYPE_OST,
+		.nsd_bkt_bits   = 11,
+		.nsd_all_bits   = 17,
+		.nsd_hops       = &ldlm_ns_hash_ops,
+	},
+	{
+		.nsd_type       = LDLM_NS_TYPE_MGC,
+		.nsd_bkt_bits   = 4,
+		.nsd_all_bits   = 4,
+		.nsd_hops       = &ldlm_ns_hash_ops,
+	},
+	{
+		.nsd_type       = LDLM_NS_TYPE_MGT,
+		.nsd_bkt_bits   = 4,
+		.nsd_all_bits   = 4,
+		.nsd_hops       = &ldlm_ns_hash_ops,
+	},
+	{
+		/* end-of-table sentinel */
+		.nsd_type       = LDLM_NS_TYPE_UNKNOWN,
+	},
+};
+
+/**
+ * Create and initialize new empty namespace.
+ *
+ * \param obd		device the namespace belongs to; must not be NULL
+ * \param name		name given to the resource hash
+ * \param client	namespace side; stored in ns_client and used to pick
+ *			the namespace counter and list
+ * \param apt		stored in ns_appetite
+ * \param ns_type	selects the hash parameters from ldlm_ns_hash_defs
+ *
+ * \retval the new namespace, registered with ldlm_namespace_register()
+ * \retval NULL on any failure (LDLM reference, unknown type, allocation,
+ *	   procfs registration or pool initialization); everything set up
+ *	   by the labelled unwind steps below is released again
+ */
+struct ldlm_namespace *ldlm_namespace_new(struct obd_device *obd, char *name,
+					  ldlm_side_t client,
+					  ldlm_appetite_t apt,
+					  ldlm_ns_type_t ns_type)
+{
+	struct ldlm_namespace *ns = NULL;
+	struct ldlm_ns_bucket *nsb;
+	ldlm_ns_hash_def_t    *nsd;
+	cfs_hash_bd_t	  bd;
+	int		    idx;
+	int		    rc;
+	ENTRY;
+
+	LASSERT(obd != NULL);
+
+	rc = ldlm_get_ref();
+	if (rc) {
+		CERROR("ldlm_get_ref failed: %d\n", rc);
+		RETURN(NULL);
+	}
+
+	/* Find the hash definition for ns_type; the table ends with an
+	 * LDLM_NS_TYPE_UNKNOWN sentinel. */
+	for (idx = 0;;idx++) {
+		nsd = &ldlm_ns_hash_defs[idx];
+		if (nsd->nsd_type == LDLM_NS_TYPE_UNKNOWN) {
+			CERROR("Unknown type %d for ns %s\n", ns_type, name);
+			GOTO(out_ref, NULL);
+		}
+
+		if (nsd->nsd_type == ns_type)
+			break;
+	}
+
+	OBD_ALLOC_PTR(ns);
+	if (!ns)
+		GOTO(out_ref, NULL);
+
+	/* nsd_all_bits is passed twice, as two consecutive size arguments
+	 * of cfs_hash_create(). Each bucket gets sizeof(*nsb) bytes of
+	 * extra space for a struct ldlm_ns_bucket. */
+	ns->ns_rs_hash = cfs_hash_create(name,
+					 nsd->nsd_all_bits, nsd->nsd_all_bits,
+					 nsd->nsd_bkt_bits, sizeof(*nsb),
+					 CFS_HASH_MIN_THETA,
+					 CFS_HASH_MAX_THETA,
+					 nsd->nsd_hops,
+					 CFS_HASH_DEPTH |
+					 CFS_HASH_BIGNAME |
+					 CFS_HASH_SPIN_BKTLOCK |
+					 CFS_HASH_NO_ITEMREF);
+	if (ns->ns_rs_hash == NULL)
+		GOTO(out_ns, NULL);
+
+	/* Initialize the per-bucket data and point it back at ns. */
+	cfs_hash_for_each_bucket(ns->ns_rs_hash, &bd, idx) {
+		nsb = cfs_hash_bd_extra_get(ns->ns_rs_hash, &bd);
+		at_init(&nsb->nsb_at_estimate, ldlm_enqueue_min, 0);
+		nsb->nsb_namespace = ns;
+	}
+
+	ns->ns_obd      = obd;
+	ns->ns_appetite = apt;
+	ns->ns_client   = client;
+
+	INIT_LIST_HEAD(&ns->ns_list_chain);
+	INIT_LIST_HEAD(&ns->ns_unused_list);
+	spin_lock_init(&ns->ns_lock);
+	atomic_set(&ns->ns_bref, 0);
+	init_waitqueue_head(&ns->ns_waitq);
+
+	/* Default tunables. */
+	ns->ns_max_nolock_size    = NS_DEFAULT_MAX_NOLOCK_BYTES;
+	ns->ns_contention_time    = NS_DEFAULT_CONTENTION_SECONDS;
+	ns->ns_contended_locks    = NS_DEFAULT_CONTENDED_LOCKS;
+
+	ns->ns_max_parallel_ast   = LDLM_DEFAULT_PARALLEL_AST_LIMIT;
+	ns->ns_nr_unused	  = 0;
+	ns->ns_max_unused	 = LDLM_DEFAULT_LRU_SIZE;
+	ns->ns_max_age	    = LDLM_DEFAULT_MAX_ALIVE;
+	ns->ns_ctime_age_limit    = LDLM_CTIME_AGE_LIMIT;
+	ns->ns_timeouts	   = 0;
+	ns->ns_orig_connect_flags = 0;
+	ns->ns_connect_flags      = 0;
+	ns->ns_stopping	   = 0;
+	rc = ldlm_namespace_proc_register(ns);
+	if (rc != 0) {
+		CERROR("Can't initialize ns proc, rc %d\n", rc);
+		/* This path releases only the hash and ns;
+		 * ldlm_namespace_proc_unregister() is not called on it. */
+		GOTO(out_hash, rc);
+	}
+
+	/* The current number of namespaces on this side is passed to the
+	 * pool as its index. */
+	idx = atomic_read(ldlm_namespace_nr(client));
+	rc = ldlm_pool_init(&ns->ns_pool, ns, idx, client);
+	if (rc) {
+		CERROR("Can't initialize lock pool, rc %d\n", rc);
+		GOTO(out_proc, rc);
+	}
+
+	ldlm_namespace_register(ns, client);
+	RETURN(ns);
+	/* Error unwinding, in reverse order of construction. */
+out_proc:
+	ldlm_namespace_proc_unregister(ns);
+	ldlm_namespace_cleanup(ns, 0);
+out_hash:
+	cfs_hash_putref(ns->ns_rs_hash);
+out_ns:
+	OBD_FREE_PTR(ns);
+out_ref:
+	ldlm_put_ref();
+	RETURN(NULL);
+}
+EXPORT_SYMBOL(ldlm_namespace_new);
+
+extern struct ldlm_lock *ldlm_lock_get(struct ldlm_lock *lock);
+
+/**
+ * Cancel and destroy all locks on a resource.
+ *
+ * If flags contains FL_LOCAL_ONLY, don't try to tell the server, just
+ * clean up.  This is currently only used for recovery, and we make
+ * certain assumptions as a result--notably, that we shouldn't cancel
+ * locks with refs.
+ *
+ * Each pass of the loop takes the resource lock, picks the first lock on
+ * \a q that is not yet marked LDLM_FL_CLEANED, takes a reference on it and
+ * marks it, so every lock is handled at most once.  The loop ends when no
+ * unmarked lock is left on \a q.
+ *
+ * \param res    resource the locks belong to
+ * \param q      lock list to walk; ldlm_resource_clean() passes lr_granted,
+ *		 lr_converting and lr_waiting of \a res in turn
+ * \param flags  OR-ed into l_flags of every lock that is processed
+ */
+static void cleanup_resource(struct ldlm_resource *res, struct list_head *q,
+			     __u64 flags)
+{
+	struct list_head *tmp;
+	int rc = 0, client = ns_is_client(ldlm_res_to_ns(res));
+	bool local_only = !!(flags & LDLM_FL_LOCAL_ONLY);
+
+	do {
+		struct ldlm_lock *lock = NULL;
+
+		/* First, we look for non-cleaned-yet lock
+		 * all cleaned locks are marked by CLEANED flag. */
+		lock_res(res);
+		list_for_each(tmp, q) {
+			lock = list_entry(tmp, struct ldlm_lock,
+					      l_res_link);
+			if (lock->l_flags & LDLM_FL_CLEANED) {
+				lock = NULL;
+				continue;
+			}
+			LDLM_LOCK_GET(lock);
+			lock->l_flags |= LDLM_FL_CLEANED;
+			break;
+		}
+
+		if (lock == NULL) { /* nothing left to clean on this list */
+			unlock_res(res);
+			break;
+		}
+
+		/* Set CBPENDING so nothing in the cancellation path
+		 * can match this lock. */
+		lock->l_flags |= LDLM_FL_CBPENDING;
+		lock->l_flags |= LDLM_FL_FAILED;
+		lock->l_flags |= flags;
+
+		/* ... without sending a CANCEL message for local_only.
+		 * NOTE: the bit is already set by "|= flags" just above. */
+		if (local_only)
+			lock->l_flags |= LDLM_FL_LOCAL_ONLY;
+
+		if (local_only && (lock->l_readers || lock->l_writers)) {
+			/* This is a little bit gross, but much better than the
+			 * alternative: pretend that we got a blocking AST from
+			 * the server, so that when the lock is decref'd, it
+			 * will go away ... */
+			unlock_res(res);
+			LDLM_DEBUG(lock, "setting FL_LOCAL_ONLY");
+			if (lock->l_completion_ast)
+				lock->l_completion_ast(lock, 0, NULL);
+			LDLM_LOCK_RELEASE(lock);
+			continue; /* resource lock already dropped; look for the next lock */
+		}
+
+		if (client) { /* client namespace: cancel through ldlm_cli_cancel() */
+			struct lustre_handle lockh;
+
+			unlock_res(res);
+			ldlm_lock2handle(lock, &lockh);
+			rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
+			if (rc)
+				CERROR("ldlm_cli_cancel: %d\n", rc);
+		} else { /* otherwise unlink the lock and destroy it here */
+			ldlm_resource_unlink_lock(lock);
+			unlock_res(res);
+			LDLM_DEBUG(lock, "Freeing a lock still held by a "
+				   "client node");
+			ldlm_lock_destroy(lock);
+		}
+		LDLM_LOCK_RELEASE(lock); /* drops the reference taken by LDLM_LOCK_GET() above */
+	} while (1);
+}
+
+/**
+ * Hash iteration callback used by ldlm_namespace_cleanup(): runs
+ * cleanup_resource() on the granted, converting and waiting lists of the
+ * resource behind \a hnode, in that order.
+ *
+ * \a arg points to the __u64 flags handed on to cleanup_resource().
+ * Always returns 0.
+ */
+static int ldlm_resource_clean(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+			       struct hlist_node *hnode, void *arg)
+{
+	struct ldlm_resource *res = cfs_hash_object(hs, hnode);
+	const __u64 cleanup_flags = *(__u64 *)arg;
+	struct list_head *queues[3];
+	unsigned int i;
+
+	queues[0] = &res->lr_granted;
+	queues[1] = &res->lr_converting;
+	queues[2] = &res->lr_waiting;
+
+	for (i = 0; i < sizeof(queues) / sizeof(queues[0]); i++)
+		cleanup_resource(res, queues[i], cleanup_flags);
+
+	return 0;
+}
+
+/**
+ * Hash iteration callback used by ldlm_namespace_cleanup() after the locks
+ * have been cleaned: with the resource locked, report the resource's name
+ * and its refcount minus one through CERROR() and dump the resource at
+ * D_ERROR level.
+ *
+ * Always returns 0.
+ */
+static int ldlm_resource_complain(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+				  struct hlist_node *hnode, void *arg)
+{
+	struct ldlm_resource *leaked = cfs_hash_object(hs, hnode);
+
+	lock_res(leaked);
+
+	CERROR("Namespace %s resource refcount nonzero "
+	       "(%d) after lock cleanup; forcing "
+	       "cleanup.\n",
+	       ldlm_ns_name(ldlm_res_to_ns(leaked)),
+	       atomic_read(&leaked->lr_refcount) - 1);
+
+	CERROR("Resource: %p ("LPU64"/"LPU64"/"LPU64"/"
+	       LPU64") (rc: %d)\n",
+	       leaked,
+	       leaked->lr_name.name[0],
+	       leaked->lr_name.name[1],
+	       leaked->lr_name.name[2],
+	       leaked->lr_name.name[3],
+	       atomic_read(&leaked->lr_refcount) - 1);
+
+	ldlm_resource_dump(D_ERROR, leaked);
+
+	unlock_res(leaked);
+
+	return 0;
+}
+
+/**
+ * Cancel and destroy every lock in namespace \a ns.
+ *
+ * Walks the resource hash twice: first with ldlm_resource_clean(), passing
+ * \a flags on to it, then with ldlm_resource_complain() to report whatever
+ * resources are still present in the hash.
+ *
+ * Used on eviction, when the server told the client that all of its state
+ * has to be destroyed, and during shutdown.
+ *
+ * A NULL \a ns is only logged.  ELDLM_OK is returned in every case.
+ */
+int ldlm_namespace_cleanup(struct ldlm_namespace *ns, __u64 flags)
+{
+	if (ns != NULL) {
+		cfs_hash_for_each_nolock(ns->ns_rs_hash,
+					 ldlm_resource_clean, &flags);
+		cfs_hash_for_each_nolock(ns->ns_rs_hash,
+					 ldlm_resource_complain, NULL);
+	} else {
+		CDEBUG(D_INFO, "NULL ns, skipping cleanup\n");
+	}
+
+	return ELDLM_OK;
+}
+EXPORT_SYMBOL(ldlm_namespace_cleanup);
+
+/**
+ * Attempts to free namespace.
+ *
+ * Only used when namespace goes away, like during an unmount.
+ *
+ * Cleans up all locks with ldlm_namespace_cleanup() and, if ns_bref is still
+ * positive, sleeps on ns_waitq until it reaches zero.  With \a force set the
+ * wait uses a timeout of obd_timeout * HZ / 4 and is restarted for as long
+ * as it times out; without \a force the wait is set up with
+ * LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL).
+ *
+ * \param ns     namespace to clean up
+ * \param force  non-zero: pass LDLM_FL_LOCAL_ONLY to the cleanup and keep
+ *		 waiting across timeouts
+ *
+ * \retval ELDLM_OK		   ns_bref was, or became, zero
+ * \retval ELDLM_NAMESPACE_EXISTS  the wait ended with ns_bref still non-zero
+ */
+static int __ldlm_namespace_free(struct ldlm_namespace *ns, int force)
+{
+	ENTRY;
+
+	/* At shutdown time, don't call the cancellation callback */
+	ldlm_namespace_cleanup(ns, force ? LDLM_FL_LOCAL_ONLY : 0);
+
+	if (atomic_read(&ns->ns_bref) > 0) {
+		struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL);
+		int rc;
+		CDEBUG(D_DLMTRACE,
+		       "dlm namespace %s free waiting on refcount %d\n",
+		       ldlm_ns_name(ns), atomic_read(&ns->ns_bref));
+force_wait: /* re-entered after every timeout of a forced wait */
+		if (force)
+			lwi = LWI_TIMEOUT(obd_timeout * HZ / 4, NULL, NULL);
+
+		rc = l_wait_event(ns->ns_waitq,
+				  atomic_read(&ns->ns_bref) == 0, &lwi);
+
+		/* Forced cleanups should be able to reclaim all references,
+		 * so it's safe to wait forever... we can't leak locks... */
+		if (force && rc == -ETIMEDOUT) {
+			LCONSOLE_ERROR("Forced cleanup waiting for %s "
+				       "namespace with %d resources in use, "
+				       "(rc=%d)\n", ldlm_ns_name(ns),
+				       atomic_read(&ns->ns_bref), rc);
+			GOTO(force_wait, rc);
+		}
+
+		if (atomic_read(&ns->ns_bref)) { /* wait ended with references left */
+			LCONSOLE_ERROR("Cleanup waiting for %s namespace "
+				       "with %d resources in use, (rc=%d)\n",
+				       ldlm_ns_name(ns),
+				       atomic_read(&ns->ns_bref), rc);
+			RETURN(ELDLM_NAMESPACE_EXISTS);
+		}
+		CDEBUG(D_DLMTRACE, "dlm namespace %s free done waiting\n",
+		       ldlm_ns_name(ns));
+	}
+
+	RETURN(ELDLM_OK);
+}
+
+/**
+ * First half of namespace teardown: make \a ns drop its references and
+ * wait for them to reach zero.
+ *
+ * The following is done:
+ * (0) ns_stopping is set under ns_lock;
+ * (1) all locks in \a ns are cleared through __ldlm_namespace_free();
+ * (2) if that fails, the import \a imp (when given) is disconnected and
+ *     invalidated and the free is repeated in forced mode, which is
+ *     asserted to succeed.
+ *
+ * A NULL \a ns is ignored.
+ */
+void ldlm_namespace_free_prior(struct ldlm_namespace *ns,
+			       struct obd_import *imp,
+			       int force)
+{
+	ENTRY;
+
+	if (ns == NULL) {
+		EXIT;
+		return;
+	}
+
+	spin_lock(&ns->ns_lock);
+	ns->ns_stopping = 1;
+	spin_unlock(&ns->ns_lock);
+
+	/* The first attempt can fail with -EINTR when force == 0; in that
+	 * case try harder below. */
+	if (__ldlm_namespace_free(ns, force) != ELDLM_OK) {
+		int rc;
+
+		if (imp != NULL) {
+			ptlrpc_disconnect_import(imp, 0);
+			ptlrpc_invalidate_import(imp);
+		}
+
+		/* With all requests dropped and the import inactive we are
+		 * guaranteed that all references will be dropped. */
+		rc = __ldlm_namespace_free(ns, 1);
+		LASSERT(rc == 0);
+	}
+
+	EXIT;
+}
+
+/**
+ * Second half of namespace teardown: release the memory structures that
+ * belong to \a ns.  Must only be called once ldlm_namespace_free_prior()
+ * has removed all resources referencing \a ns and its refc is 0.
+ *
+ * A NULL \a ns is ignored.
+ */
+void ldlm_namespace_free_post(struct ldlm_namespace *ns)
+{
+	ENTRY;
+
+	if (ns == NULL) {
+		EXIT;
+		return;
+	}
+
+	/* Take the namespace off its list first so nobody can find it. */
+	ldlm_namespace_unregister(ns, ns->ns_client);
+
+	/* The pool has to go _before_ the parent proc dir: ldlm_pool_fini()
+	 * removes its own proc dir, which is a child of @dir, and removing
+	 * it after @dir may cause an oops. */
+	ldlm_pool_fini(&ns->ns_pool);
+	ldlm_namespace_proc_unregister(ns);
+
+	cfs_hash_putref(ns->ns_rs_hash);
+
+	/* \a ns must be off the list by now, otherwise the poold thread
+	 * could end up using freed memory. */
+	LASSERT(list_empty(&ns->ns_list_chain));
+
+	OBD_FREE_PTR(ns);
+	ldlm_put_ref();
+
+	EXIT;
+}
+
+/**
+ * Clean up the resources of \a ns and free it, by running both teardown
+ * halves back to back.
+ *
+ * The work is split in two because of a deadlock (bug 12864):
+ * proc1: destroy import
+ *	class_disconnect_export(grab cl_sem) ->
+ *	      -> ldlm_namespace_free ->
+ *	      -> lprocfs_remove(grab _lprocfs_lock).
+ * proc2: read proc info
+ *	lprocfs_fops_read(grab _lprocfs_lock) ->
+ *	      -> osc_rd_active, etc(grab cl_sem).
+ *
+ * ldlm_namespace_free_prior() cleans up the resources that are still in
+ * use; ldlm_namespace_free_post() unregisters the lprocfs entries and frees
+ * the memory, and is meant to be called w/o cli->cl_sem held.
+ */
+void ldlm_namespace_free(struct ldlm_namespace *ns,
+			 struct obd_import *imp,
+			 int force)
+{
+	/* step 1: drop locks and wait for references to go away */
+	ldlm_namespace_free_prior(ns, imp, force);
+
+	/* step 2: unregister and release memory */
+	ldlm_namespace_free_post(ns);
+}
+EXPORT_SYMBOL(ldlm_namespace_free);
+
+/** Take a reference on \a ns by incrementing ns_bref. */
+void ldlm_namespace_get(struct ldlm_namespace *ns)
+{
+	atomic_t *bref = &ns->ns_bref;
+
+	atomic_inc(bref);
+}
+EXPORT_SYMBOL(ldlm_namespace_get);
+
+/**
+ * Drop a reference on \a ns.  When ns_bref reaches zero, waiters on
+ * ns_waitq are woken up with ns_lock held.
+ */
+void ldlm_namespace_put(struct ldlm_namespace *ns)
+{
+	/* atomic_dec_and_lock() returns with ns_lock taken only when the
+	 * counter dropped to zero. */
+	if (!atomic_dec_and_lock(&ns->ns_bref, &ns->ns_lock))
+		return;
+
+	wake_up(&ns->ns_waitq);
+	spin_unlock(&ns->ns_lock);
+}
+EXPORT_SYMBOL(ldlm_namespace_put);
+
+/**
+ * Add \a ns to the list of namespaces for side \a client and bump that
+ * side's namespace counter, all under the side's namespace mutex.
+ * \a ns must not be on a list yet.
+ */
+void ldlm_namespace_register(struct ldlm_namespace *ns, ldlm_side_t client)
+{
+	struct mutex *list_mutex = ldlm_namespace_lock(client);
+
+	mutex_lock(list_mutex);
+
+	LASSERT(list_empty(&ns->ns_list_chain));
+	list_add(&ns->ns_list_chain, ldlm_namespace_list(client));
+	atomic_inc(ldlm_namespace_nr(client));
+
+	mutex_unlock(list_mutex);
+}
+
+/**
+ * Remove \a ns from the list of namespaces for side \a client and decrement
+ * that side's namespace counter, all under the side's namespace mutex.
+ * \a ns must currently be on a list.
+ */
+void ldlm_namespace_unregister(struct ldlm_namespace *ns, ldlm_side_t client)
+{
+	struct mutex *list_mutex = ldlm_namespace_lock(client);
+
+	mutex_lock(list_mutex);
+
+	LASSERT(!list_empty(&ns->ns_list_chain));
+
+	/* list_del_init() rather than list_del(): some asserts, and possibly
+	 * other code, still test list_empty(&ns->ns_list_chain) afterwards. */
+	list_del_init(&ns->ns_list_chain);
+	atomic_dec(ldlm_namespace_nr(client));
+
+	mutex_unlock(list_mutex);
+}
+
+/**
+ * Move \a ns to the tail of the namespace list for side \a client.
+ * The caller must hold ldlm_namespace_lock(client); \a ns must be on a list.
+ */
+void ldlm_namespace_move_locked(struct ldlm_namespace *ns, ldlm_side_t client)
+{
+	struct list_head *entry = &ns->ns_list_chain;
+
+	LASSERT(!list_empty(&ns->ns_list_chain));
+	LASSERT(mutex_is_locked(ldlm_namespace_lock(client)));
+
+	list_move_tail(entry, ldlm_namespace_list(client));
+}
+
+/**
+ * Return the first namespace on the list for side \a client.
+ * The caller must hold ldlm_namespace_lock(client) and the list must not be
+ * empty; both are asserted.
+ */
+struct ldlm_namespace *ldlm_namespace_first_locked(ldlm_side_t client)
+{
+	struct list_head *first;
+
+	LASSERT(mutex_is_locked(ldlm_namespace_lock(client)));
+	LASSERT(!list_empty(ldlm_namespace_list(client)));
+
+	first = ldlm_namespace_list(client)->next;
+	return list_entry(first, struct ldlm_namespace, ns_list_chain);
+}
+
+/**
+ * Allocate a resource from ldlm_resource_slab and set up its refcount,
+ * spinlock, lu_ref, lock lists and per-mode interval trees.
+ *
+ * The resource comes back with a refcount of 1 and with lr_lvb_mutex held:
+ * the creator must unlock the mutex after LVB initialization.
+ *
+ * Returns NULL when the allocation fails.
+ */
+static struct ldlm_resource *ldlm_resource_new(void)
+{
+	struct ldlm_resource *res;
+	int mode;
+
+	OBD_SLAB_ALLOC_PTR_GFP(res, ldlm_resource_slab, __GFP_IO);
+	if (!res)
+		return NULL;
+
+	atomic_set(&res->lr_refcount, 1);
+	spin_lock_init(&res->lr_lock);
+	lu_ref_init(&res->lr_reference);
+
+	INIT_LIST_HEAD(&res->lr_granted);
+	INIT_LIST_HEAD(&res->lr_converting);
+	INIT_LIST_HEAD(&res->lr_waiting);
+
+	/* One empty interval tree per lock mode; tree N is tagged with
+	 * mode bit N. */
+	for (mode = 0; mode < LCK_MODE_NUM; mode++) {
+		res->lr_itree[mode].lit_root = NULL;
+		res->lr_itree[mode].lit_size = 0;
+		res->lr_itree[mode].lit_mode = 1 << mode;
+	}
+
+	/* Hand the resource out with the LVB mutex already taken. */
+	mutex_init(&res->lr_lvb_mutex);
+	mutex_lock(&res->lr_lvb_mutex);
+
+	return res;
+}
+
+/**
+ * Return a reference to resource with given name, creating it if necessary.
+ * Args: namespace with ns_lock unlocked
+ * Locks: takes and releases the NS hash bucket lock and res->lr_lvb_mutex
+ * Returns: referenced, unlocked ldlm_resource or NULL
+ *
+ * \param ns      namespace to search; must have a resource hash (asserted)
+ * \param parent  must be NULL (asserted)
+ * \param name    resource name to look up; name[0] must be non-zero (asserted)
+ * \param type    stored in lr_type of a newly created resource
+ * \param create  when zero, a missing resource is not created
+ *
+ * NULL is returned when the resource does not exist and \a create is zero,
+ * when allocating a new resource fails, when lvbo_init() fails for a newly
+ * created resource, and when an existing resource has a negative lr_lvb_len
+ * (which is what a failed lvbo_init() leaves behind, see below).
+ */
+struct ldlm_resource *
+ldlm_resource_get(struct ldlm_namespace *ns, struct ldlm_resource *parent,
+		  const struct ldlm_res_id *name, ldlm_type_t type, int create)
+{
+	struct hlist_node     *hnode;
+	struct ldlm_resource *res;
+	cfs_hash_bd_t	 bd;
+	__u64		 version;
+
+	LASSERT(ns != NULL);
+	LASSERT(parent == NULL);
+	LASSERT(ns->ns_rs_hash != NULL);
+	LASSERT(name->name[0] != 0);
+
+	cfs_hash_bd_get_and_lock(ns->ns_rs_hash, (void *)name, &bd, 0);
+	hnode = cfs_hash_bd_lookup_locked(ns->ns_rs_hash, &bd, (void *)name);
+	if (hnode != NULL) {
+		cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 0);
+		res = hlist_entry(hnode, struct ldlm_resource, lr_hash);
+		/* Synchronize with regard to resource creation. */
+		if (ns->ns_lvbo && ns->ns_lvbo->lvbo_init) {
+			mutex_lock(&res->lr_lvb_mutex);
+			mutex_unlock(&res->lr_lvb_mutex);
+		}
+
+		if (unlikely(res->lr_lvb_len < 0)) { /* creator's lvbo_init() failed */
+			ldlm_resource_putref(res);
+			res = NULL;
+		}
+		return res;
+	}
+
+	version = cfs_hash_bd_version_get(&bd); /* compared again once the bucket is re-locked */
+	cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 0);
+
+	if (create == 0)
+		return NULL;
+
+	LASSERTF(type >= LDLM_MIN_TYPE && type < LDLM_MAX_TYPE,
+		 "type: %d\n", type);
+	res = ldlm_resource_new();
+	if (!res)
+		return NULL;
+
+	res->lr_ns_bucket  = cfs_hash_bd_extra_get(ns->ns_rs_hash, &bd);
+	res->lr_name       = *name;
+	res->lr_type       = type;
+	res->lr_most_restr = LCK_NL;
+
+	cfs_hash_bd_lock(ns->ns_rs_hash, &bd, 1);
+	hnode = (version == cfs_hash_bd_version_get(&bd)) ?  NULL : /* same version: skip the second lookup */
+		cfs_hash_bd_lookup_locked(ns->ns_rs_hash, &bd, (void *)name);
+
+	if (hnode != NULL) {
+		/* Someone won the race and already added the resource. */
+		cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 1);
+		/* Clean lu_ref for failed resource. */
+		lu_ref_fini(&res->lr_reference);
+		/* We have taken lr_lvb_mutex. Drop it. */
+		mutex_unlock(&res->lr_lvb_mutex);
+		OBD_SLAB_FREE(res, ldlm_resource_slab, sizeof *res);
+
+		res = hlist_entry(hnode, struct ldlm_resource, lr_hash);
+		/* Synchronize with regard to resource creation. */
+		if (ns->ns_lvbo && ns->ns_lvbo->lvbo_init) {
+			mutex_lock(&res->lr_lvb_mutex);
+			mutex_unlock(&res->lr_lvb_mutex);
+		}
+
+		if (unlikely(res->lr_lvb_len < 0)) { /* creator's lvbo_init() failed */
+			ldlm_resource_putref(res);
+			res = NULL;
+		}
+		return res;
+	}
+	/* We won! Let's add the resource. */
+	cfs_hash_bd_add_locked(ns->ns_rs_hash, &bd, &res->lr_hash);
+	if (cfs_hash_bd_count_get(&bd) == 1) /* bucket count just became 1: take a namespace reference */
+		ldlm_namespace_get(ns);
+
+	cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 1);
+	if (ns->ns_lvbo && ns->ns_lvbo->lvbo_init) {
+		int rc;
+
+		OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_CREATE_RESOURCE, 2);
+		rc = ns->ns_lvbo->lvbo_init(res);
+		if (rc < 0) {
+			CERROR("lvbo_init failed for resource "
+			       LPU64": rc %d\n", name->name[0], rc);
+			if (res->lr_lvb_data) {
+				OBD_FREE(res->lr_lvb_data, res->lr_lvb_len);
+				res->lr_lvb_data = NULL;
+			}
+			res->lr_lvb_len = rc; /* negative length is what the lookup paths above test for */
+			mutex_unlock(&res->lr_lvb_mutex);
+			ldlm_resource_putref(res);
+			return NULL;
+		}
+	}
+
+	/* We create resource with locked lr_lvb_mutex. */
+	mutex_unlock(&res->lr_lvb_mutex);
+
+	return res;
+}
+EXPORT_SYMBOL(ldlm_resource_get);
+
+/**
+ * Take an additional reference on \a res and return \a res.
+ * \a res must be neither NULL nor LP_POISON (both asserted).
+ */
+struct ldlm_resource *ldlm_resource_getref(struct ldlm_resource *res)
+{
+	LASSERT(res != NULL);
+	LASSERT(res != LP_POISON);
+
+	atomic_inc(&res->lr_refcount);
+
+	CDEBUG(D_INFO, "getref res: %p count: %d\n",
+	       res,
+	       atomic_read(&res->lr_refcount));
+
+	return res;
+}
+
+/**
+ * Common tail of the putref variants, run when the last reference on
+ * \a res is gone: remove \a res from hash bucket \a bd, finalize its
+ * lu_ref and, if the bucket is now empty, drop a reference on the
+ * namespace.
+ *
+ * Hits LBUG() after dumping the resource if any of its lock lists is still
+ * populated.
+ */
+static void __ldlm_resource_putref_final(cfs_hash_bd_t *bd,
+					 struct ldlm_resource *res)
+{
+	struct ldlm_ns_bucket *nsb = res->lr_ns_bucket;
+
+	/* A resource that is going away must not have any locks left. */
+	if (!list_empty(&res->lr_granted) ||
+	    !list_empty(&res->lr_converting) ||
+	    !list_empty(&res->lr_waiting)) {
+		ldlm_resource_dump(D_ERROR, res);
+		LBUG();
+	}
+
+	cfs_hash_bd_del_locked(nsb->nsb_namespace->ns_rs_hash,
+			       bd, &res->lr_hash);
+	lu_ref_fini(&res->lr_reference);
+
+	if (cfs_hash_bd_count_get(bd) != 0)
+		return;
+
+	ldlm_namespace_put(nsb->nsb_namespace);
+}
+
+/**
+ * Drop a reference on \a res.  When it was the last one, the resource is
+ * removed from the namespace hash, its LVB is released through the
+ * namespace's lvbo_free hook (if there is one) and its memory is returned
+ * to ldlm_resource_slab.
+ *
+ * Returns 1 if the resource was freed, 0 if it remains.
+ */
+int ldlm_resource_putref(struct ldlm_resource *res)
+{
+	struct ldlm_namespace *ns = ldlm_res_to_ns(res);
+	cfs_hash_bd_t   bd;
+
+	LASSERT_ATOMIC_GT_LT(&res->lr_refcount, 0, LI_POISON);
+	CDEBUG(D_INFO, "putref res: %p count: %d\n",
+	       res, atomic_read(&res->lr_refcount) - 1);
+
+	cfs_hash_bd_get(ns->ns_rs_hash, &res->lr_name, &bd);
+
+	/* Not the last reference: nothing else to do. */
+	if (!cfs_hash_bd_dec_and_lock(ns->ns_rs_hash, &bd, &res->lr_refcount))
+		return 0;
+
+	/* Last reference: the bucket lock is held at this point. */
+	__ldlm_resource_putref_final(&bd, res);
+	cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 1);
+
+	if (ns->ns_lvbo && ns->ns_lvbo->lvbo_free)
+		ns->ns_lvbo->lvbo_free(res);
+
+	OBD_SLAB_FREE(res, ldlm_resource_slab, sizeof *res);
+	return 1;
+}
+EXPORT_SYMBOL(ldlm_resource_putref);
+
+/**
+ * Variant of ldlm_resource_putref() that unlocks and re-locks the hash
+ * bucket of \a res itself when the last reference goes away, instead of
+ * acquiring it as part of the decrement.
+ *
+ * Returns 1 if the resource was freed, 0 if it remains.
+ */
+int ldlm_resource_putref_locked(struct ldlm_resource *res)
+{
+	struct ldlm_namespace *ns = ldlm_res_to_ns(res);
+	cfs_hash_bd_t bd;
+
+	LASSERT_ATOMIC_GT_LT(&res->lr_refcount, 0, LI_POISON);
+	CDEBUG(D_INFO, "putref res: %p count: %d\n",
+	       res, atomic_read(&res->lr_refcount) - 1);
+
+	if (!atomic_dec_and_test(&res->lr_refcount))
+		return 0;
+
+	cfs_hash_bd_get(ns->ns_rs_hash, &res->lr_name, &bd);
+	__ldlm_resource_putref_final(&bd, res);
+
+	/* NB: ns_rs_hash is created with CFS_HASH_NO_ITEMREF, so we should
+	 * never get here from cfs_hash_del; cfs_hash_for_each_nolock is the
+	 * only case that brings us here, and for it releasing
+	 * cfs_hash_bd_lock is safe. */
+	cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 1);
+
+	if (ns->ns_lvbo && ns->ns_lvbo->lvbo_free)
+		ns->ns_lvbo->lvbo_free(res);
+	OBD_SLAB_FREE(res, ldlm_resource_slab, sizeof *res);
+
+	/* Give the bucket lock back to the caller. */
+	cfs_hash_bd_lock(ns->ns_rs_hash, &bd, 1);
+	return 1;
+}
+
+/**
+ * Append \a lock to the tail of lock list \a head of resource \a res.
+ *
+ * check_res_locked() is applied to \a res first.  A lock that is already
+ * marked destroyed is not added; a lock that is added must not be linked
+ * on a resource list yet (asserted).
+ */
+void ldlm_resource_add_lock(struct ldlm_resource *res, struct list_head *head,
+			    struct ldlm_lock *lock)
+{
+	check_res_locked(res);
+
+	LDLM_DEBUG(lock, "About to add this lock:\n");
+
+	if (!lock->l_destroyed) {
+		LASSERT(list_empty(&lock->l_res_link));
+		list_add_tail(&lock->l_res_link, head);
+	} else {
+		CDEBUG(D_OTHER, "Lock destroyed, not adding to resource\n");
+	}
+}
+
+/**
+ * Link lock \a new into a resource list directly after lock \a original.
+ *
+ * The resource is taken from \a original; check_res_locked() is applied to
+ * it and it is dumped at D_INFO level.  A \a new lock that is already
+ * marked destroyed is not inserted; otherwise it must not be linked on a
+ * resource list yet (asserted).
+ */
+void ldlm_resource_insert_lock_after(struct ldlm_lock *original,
+				     struct ldlm_lock *new)
+{
+	struct ldlm_resource *res = original->l_resource;
+
+	check_res_locked(res);
+
+	ldlm_resource_dump(D_INFO, res);
+	LDLM_DEBUG(new, "About to insert this lock after %p:\n", original);
+
+	if (new->l_destroyed) {
+		CDEBUG(D_OTHER, "Lock destroyed, not adding to resource\n");
+		return;
+	}
+
+	LASSERT(list_empty(&new->l_res_link));
+	list_add(&new->l_res_link, &original->l_res_link);
+}
+
+/**
+ * Take \a lock off the list of its resource.
+ *
+ * Depending on the resource type the lock is first unlinked from the
+ * skiplist (LDLM_IBITS, LDLM_PLAIN) or handed to ldlm_extent_unlink_lock()
+ * (LDLM_EXTENT); other types need no extra step.  check_res_locked() is
+ * applied to the lock's resource first.
+ */
+void ldlm_resource_unlink_lock(struct ldlm_lock *lock)
+{
+	int type = lock->l_resource->lr_type;
+
+	check_res_locked(lock->l_resource);
+
+	switch (type) {
+	case LDLM_IBITS:
+	case LDLM_PLAIN:
+		ldlm_unlink_lock_skiplist(lock);
+		break;
+	case LDLM_EXTENT:
+		ldlm_extent_unlink_lock(lock);
+		break;
+	default:
+		break;
+	}
+
+	list_del_init(&lock->l_res_link);
+}
+EXPORT_SYMBOL(ldlm_resource_unlink_lock);
+
+/** Copy the name and the type of resource \a res into descriptor \a desc. */
+void ldlm_res2desc(struct ldlm_resource *res, struct ldlm_resource_desc *desc)
+{
+	desc->lr_name = res->lr_name;
+	desc->lr_type = res->lr_type;
+}
+
+/**
+ * Dump every namespace on the list for side \a client to the debug log at
+ * debug level \a level, holding that side's namespace mutex while walking
+ * the list.
+ *
+ * Nothing is done when \a level is enabled neither in libcfs_debug nor by
+ * D_ERROR.
+ */
+void ldlm_dump_all_namespaces(ldlm_side_t client, int level)
+{
+	struct ldlm_namespace *ns;
+
+	if (((libcfs_debug | D_ERROR) & level) == 0)
+		return;
+
+	mutex_lock(ldlm_namespace_lock(client));
+
+	list_for_each_entry(ns, ldlm_namespace_list(client), ns_list_chain)
+		ldlm_namespace_dump(level, ns);
+
+	mutex_unlock(ldlm_namespace_lock(client));
+}
+EXPORT_SYMBOL(ldlm_dump_all_namespaces);
+
+/**
+ * Hash iteration callback used by ldlm_namespace_dump(): dump the resource
+ * behind \a hnode while holding its lock.
+ *
+ * \a arg carries the debug level, cast to a pointer.  Always returns 0.
+ */
+static int ldlm_res_hash_dump(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+			      struct hlist_node *hnode, void *arg)
+{
+	int dump_level = (int)(unsigned long)arg;
+	struct ldlm_resource *res = cfs_hash_object(hs, hnode);
+
+	lock_res(res);
+	ldlm_resource_dump(dump_level, res);
+	unlock_res(res);
+
+	return 0;
+}
+
+/**
+ * Dump namespace \a ns to the debug log at debug level \a level.
+ *
+ * The one-line namespace summary is always printed (provided \a level is
+ * enabled).  The per-resource dump is skipped while the current time is
+ * before ns_next_dump; after a full dump ns_next_dump is set to
+ * cfs_time_shift(10) under ns_lock.
+ */
+void ldlm_namespace_dump(int level, struct ldlm_namespace *ns)
+{
+	if (((libcfs_debug | D_ERROR) & level) == 0)
+		return;
+
+	CDEBUG(level, "--- Namespace: %s (rc: %d, side: %s)\n",
+	       ldlm_ns_name(ns), atomic_read(&ns->ns_bref),
+	       ns_is_client(ns) ? "client" : "server");
+
+	if (!cfs_time_before(cfs_time_current(), ns->ns_next_dump)) {
+		cfs_hash_for_each_nolock(ns->ns_rs_hash,
+					 ldlm_res_hash_dump,
+					 (void *)(unsigned long)level);
+
+		spin_lock(&ns->ns_lock);
+		ns->ns_next_dump = cfs_time_shift(10);
+		spin_unlock(&ns->ns_lock);
+	}
+}
+EXPORT_SYMBOL(ldlm_namespace_dump);
+
+/**
+ * Dump resource \a res and the locks on its three lists to the debug log
+ * at debug level \a level.
+ *
+ * Granted locks are printed in reverse list order; unless \a level
+ * contains a D_CANTMASK bit, that part stops once more than
+ * ldlm_dump_granted_max locks have been printed.  Converting and waiting
+ * locks are printed in list order without a limit.
+ */
+void ldlm_resource_dump(int level, struct ldlm_resource *res)
+{
+	struct ldlm_lock *lock;
+	unsigned int nr_shown = 0;
+
+	CLASSERT(RES_NAME_SIZE == 4);
+
+	if (((libcfs_debug | D_ERROR) & level) == 0)
+		return;
+
+	CDEBUG(level, "--- Resource: %p ("LPU64"/"LPU64"/"LPU64"/"LPU64
+	       ") (rc: %d)\n",
+	       res,
+	       res->lr_name.name[0], res->lr_name.name[1],
+	       res->lr_name.name[2], res->lr_name.name[3],
+	       atomic_read(&res->lr_refcount));
+
+	if (!list_empty(&res->lr_granted)) {
+		CDEBUG(level, "Granted locks (in reverse order):\n");
+		list_for_each_entry_reverse(lock, &res->lr_granted,
+					    l_res_link) {
+			LDLM_DEBUG_LIMIT(level, lock, "###");
+
+			/* unmaskable levels are never cut short */
+			if (level & D_CANTMASK)
+				continue;
+
+			if (++nr_shown > ldlm_dump_granted_max) {
+				CDEBUG(level, "only dump %d granted locks to "
+				       "avoid DDOS.\n", nr_shown);
+				break;
+			}
+		}
+	}
+
+	if (!list_empty(&res->lr_converting)) {
+		CDEBUG(level, "Converting locks:\n");
+		list_for_each_entry(lock, &res->lr_converting, l_res_link) {
+			LDLM_DEBUG_LIMIT(level, lock, "###");
+		}
+	}
+
+	if (!list_empty(&res->lr_waiting)) {
+		CDEBUG(level, "Waiting locks:\n");
+		list_for_each_entry(lock, &res->lr_waiting, l_res_link) {
+			LDLM_DEBUG_LIMIT(level, lock, "###");
+		}
+	}
+}

diff --git a/drivers/staging/lustre/lustre/libcfs/Makefile b/drivers/staging/lustre/lustre/libcfs/Makefile
new file mode 100644
index 0000000..bf5c563
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/Makefile

@@ -0,0 +1,21 @@
+# libcfs.o is built when CONFIG_LUSTRE_FS is enabled.
+obj-$(CONFIG_LUSTRE_FS) += libcfs.o
+
+# Objects from the linux/ subdirectory; the prefix is added below.
+libcfs-linux-objs := linux-tracefile.o linux-debug.o \
+		     linux-prim.o linux-cpu.o \
+		     linux-tcpip.o \
+		     linux-proc.o linux-curproc.o \
+		     linux-module.o \
+		     linux-crypto.o \
+		     linux-crypto-adler.o
+libcfs-linux-objs := $(addprefix linux/,$(libcfs-linux-objs))
+
+# Objects listed without a directory prefix.
+libcfs-all-objs := debug.o fail.o nidstrings.o module.o tracefile.o \
+		   watchdog.o libcfs_string.o hash.o kernel_user_comm.o \
+		   prng.o workitem.o upcall_cache.o libcfs_cpu.o \
+		   libcfs_mem.o libcfs_lock.o
+
+libcfs-objs := $(libcfs-linux-objs) $(libcfs-all-objs)
+
+ccflags-y := -I$(src)/../include -I$(src)/

diff --git a/drivers/staging/lustre/lustre/libcfs/debug.c b/drivers/staging/lustre/lustre/libcfs/debug.c
new file mode 100644
index 0000000..5a87b08
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/debug.c

@@ -0,0 +1,476 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/libcfs/debug.c
+ *
+ * Author: Phil Schwan <phil@clusterfs.com>
+ *
+ */
+
+# define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/libcfs/libcfs.h>
+#include "tracefile.h"
+
+static char debug_file_name[1024]; /* dump file name built by libcfs_debug_dumplog_internal() */
+
+unsigned int libcfs_subsystem_debug = ~0; /* every subsystem bit set by default */
+CFS_MODULE_PARM(libcfs_subsystem_debug, "i", int, 0644,
+		"Lustre kernel debug subsystem mask");
+EXPORT_SYMBOL(libcfs_subsystem_debug);
+
+unsigned int libcfs_debug = (D_CANTMASK |
+			     D_NETERROR | D_HA | D_CONFIG | D_IOCTL);
+CFS_MODULE_PARM(libcfs_debug, "i", int, 0644,
+		"Lustre kernel debug mask");
+EXPORT_SYMBOL(libcfs_debug);
+
+unsigned int libcfs_debug_mb = 0;
+CFS_MODULE_PARM(libcfs_debug_mb, "i", uint, 0644,
+		"Total debug buffer size.");
+EXPORT_SYMBOL(libcfs_debug_mb);
+
+unsigned int libcfs_printk = D_CANTMASK;
+CFS_MODULE_PARM(libcfs_printk, "i", uint, 0644,
+		"Lustre kernel debug console mask");
+EXPORT_SYMBOL(libcfs_printk);
+
+unsigned int libcfs_console_ratelimit = 1;
+CFS_MODULE_PARM(libcfs_console_ratelimit, "i", uint, 0644,
+		"Lustre kernel debug console ratelimit (0 to disable)");
+EXPORT_SYMBOL(libcfs_console_ratelimit);
+
+unsigned int libcfs_console_max_delay;
+CFS_MODULE_PARM(libcfs_console_max_delay, "l", uint, 0644,
+		"Lustre kernel debug console max delay (jiffies)");
+EXPORT_SYMBOL(libcfs_console_max_delay);
+
+unsigned int libcfs_console_min_delay;
+CFS_MODULE_PARM(libcfs_console_min_delay, "l", uint, 0644,
+		"Lustre kernel debug console min delay (jiffies)");
+EXPORT_SYMBOL(libcfs_console_min_delay);
+
+unsigned int libcfs_console_backoff = CDEBUG_DEFAULT_BACKOFF;
+CFS_MODULE_PARM(libcfs_console_backoff, "i", uint, 0644,
+		"Lustre kernel debug console backoff factor");
+EXPORT_SYMBOL(libcfs_console_backoff);
+
+unsigned int libcfs_debug_binary = 1;
+EXPORT_SYMBOL(libcfs_debug_binary);
+
+unsigned int libcfs_stack = 3 * THREAD_SIZE / 4;
+EXPORT_SYMBOL(libcfs_stack);
+
+unsigned int portal_enter_debugger;
+EXPORT_SYMBOL(portal_enter_debugger);
+
+unsigned int libcfs_catastrophe;
+EXPORT_SYMBOL(libcfs_catastrophe);
+
+unsigned int libcfs_watchdog_ratelimit = 300;
+EXPORT_SYMBOL(libcfs_watchdog_ratelimit);
+
+unsigned int libcfs_panic_on_lbug = 1;
+CFS_MODULE_PARM(libcfs_panic_on_lbug, "i", uint, 0644,
+		"Lustre kernel panic on LBUG");
+EXPORT_SYMBOL(libcfs_panic_on_lbug);
+
+atomic_t libcfs_kmemory = ATOMIC_INIT(0);
+EXPORT_SYMBOL(libcfs_kmemory);
+
+static wait_queue_head_t debug_ctlwq; /* libcfs_debug_dumplog() waits here for the dump thread's wake_up() */
+
+char libcfs_debug_file_path_arr[PATH_MAX] = LIBCFS_DEBUG_FILE_PATH_DEFAULT; /* dump file name prefix; a leading "NONE" disables dumping */
+
+/* We need to pass a pointer here, but elsewhere this must be a const */
+char *libcfs_debug_file_path;
+CFS_MODULE_PARM(libcfs_debug_file_path, "s", charp, 0644,
+		"Path for dumping debug logs, "
+		"set 'NONE' to prevent log dumping");
+
+int libcfs_panic_in_progress;
+
+/**
+ * Map subsystem bit number \a subsys (the mask value is 1 << subsys) to
+ * its token name.
+ *
+ * The names are lower-case because libcfs_debug_token2mask() expects them
+ * that way.  Returns NULL for a bit that has no name.
+ */
+const char *
+libcfs_debug_subsys2str(int subsys)
+{
+	switch (1 << subsys) {
+	case S_UNDEFINED:
+		return "undefined";
+	case S_MDC:
+		return "mdc";
+	case S_MDS:
+		return "mds";
+	case S_OSC:
+		return "osc";
+	case S_OST:
+		return "ost";
+	case S_CLASS:
+		return "class";
+	case S_LOG:
+		return "log";
+	case S_LLITE:
+		return "llite";
+	case S_RPC:
+		return "rpc";
+	case S_LNET:
+		return "lnet";
+	case S_LND:
+		return "lnd";
+	case S_PINGER:
+		return "pinger";
+	case S_FILTER:
+		return "filter";
+	case S_ECHO:
+		return "echo";
+	case S_LDLM:
+		return "ldlm";
+	case S_LOV:
+		return "lov";
+	case S_LQUOTA:
+		return "lquota";
+	case S_OSD:
+		return "osd";
+	case S_LMV:
+		return "lmv";
+	case S_SEC:
+		return "sec";
+	case S_GSS:
+		return "gss";
+	case S_MGC:
+		return "mgc";
+	case S_MGS:
+		return "mgs";
+	case S_FID:
+		return "fid";
+	case S_FLD:
+		return "fld";
+	default:
+		/* bit without a name */
+		return NULL;
+	}
+}
+
+/**
+ * Map debug bit number \a debug (the mask value is 1 << debug) to its
+ * token name.
+ *
+ * The names are lower-case because libcfs_debug_token2mask() expects them
+ * that way.  Returns NULL for a bit that has no name.
+ */
+const char *
+libcfs_debug_dbg2str(int debug)
+{
+	switch (1 << debug) {
+	case D_TRACE:
+		return "trace";
+	case D_INODE:
+		return "inode";
+	case D_SUPER:
+		return "super";
+	case D_EXT2:
+		return "ext2";
+	case D_MALLOC:
+		return "malloc";
+	case D_CACHE:
+		return "cache";
+	case D_INFO:
+		return "info";
+	case D_IOCTL:
+		return "ioctl";
+	case D_NETERROR:
+		return "neterror";
+	case D_NET:
+		return "net";
+	case D_WARNING:
+		return "warning";
+	case D_BUFFS:
+		return "buffs";
+	case D_OTHER:
+		return "other";
+	case D_DENTRY:
+		return "dentry";
+	case D_NETTRACE:
+		return "nettrace";
+	case D_PAGE:
+		return "page";
+	case D_DLMTRACE:
+		return "dlmtrace";
+	case D_ERROR:
+		return "error";
+	case D_EMERG:
+		return "emerg";
+	case D_HA:
+		return "ha";
+	case D_RPCTRACE:
+		return "rpctrace";
+	case D_VFSTRACE:
+		return "vfstrace";
+	case D_READA:
+		return "reada";
+	case D_MMAP:
+		return "mmap";
+	case D_CONFIG:
+		return "config";
+	case D_CONSOLE:
+		return "console";
+	case D_QUOTA:
+		return "quota";
+	case D_SEC:
+		return "sec";
+	case D_LFSCK:
+		return "lfsck";
+	default:
+		/* bit without a name */
+		return NULL;
+	}
+}
+
+/**
+ * Convert a debug or subsystem bit mask into text.
+ *
+ * A zero \a mask is rendered as "0"; otherwise the names of all set bits
+ * that have a name are written separated by single spaces, lowest bit
+ * first.  Output that does not fit is truncated, and the result is always
+ * NUL-terminated as long as \a size is positive.
+ *
+ * \param str        output buffer; it is not touched when \a size <= 0
+ * \param size       size of \a str in bytes
+ * \param mask       bit mask to convert
+ * \param is_subsys  non-zero: use libcfs_debug_subsys2str() for the names;
+ *		     zero: use libcfs_debug_dbg2str()
+ *
+ * \retval length of the complete text, excluding the terminating NUL; a
+ *	   value >= \a size means the output was truncated
+ */
+int
+libcfs_debug_mask2str(char *str, int size, int mask, int is_subsys)
+{
+	const char *(*fn)(int bit) = is_subsys ? libcfs_debug_subsys2str :
+						 libcfs_debug_dbg2str;
+	int	   len = 0;
+	const char   *token;
+	int	   i;
+
+	if (mask == 0) {			/* "0" */
+		if (size > 0)
+			str[0] = '0';
+		len = 1;
+	} else {				/* space-separated tokens */
+		for (i = 0; i < 32; i++) {
+			/* unsigned arithmetic: bit 31 must not be produced
+			 * by shifting a signed 1 into the sign bit */
+			if (((unsigned int)mask & (1U << i)) == 0)
+				continue;
+
+			token = fn(i);
+			if (token == NULL)	      /* unused bit */
+				continue;
+
+			if (len > 0) {		  /* separator? */
+				if (len < size)
+					str[len] = ' ';
+				len++;
+			}
+
+			while (*token != 0) {
+				if (len < size)
+					str[len] = *token;
+				token++;
+				len++;
+			}
+		}
+	}
+
+	/* terminate 'str'; with no room at all (size <= 0) there is nothing
+	 * that may be written, in particular not str[size - 1] */
+	if (size > 0) {
+		if (len < size)
+			str[len] = 0;
+		else
+			str[size - 1] = 0;
+	}
+
+	return len;
+}
+
+/**
+ * Parse \a str into a debug or subsystem bit mask.
+ *
+ * If \a str, ignoring trailing white space, is entirely one number as
+ * accepted by the "%i" conversion, that number is stored in \a mask and 0
+ * is returned; a warning is logged unless the number is 0 or -1.  Anything
+ * else is handed to cfs_str2mask() together with the bit-to-name function
+ * selected by \a is_subsys, and its result is returned.
+ */
+int
+libcfs_debug_str2mask(int *mask, const char *str, int is_subsys)
+{
+	const char *(*bit2str)(int bit);
+	int	 value = 0;
+	int	 consumed;
+	int	 len;
+
+	bit2str = is_subsys ? libcfs_debug_subsys2str : libcfs_debug_dbg2str;
+
+	/* Allow a number for backwards compatibility */
+
+	/* length of 'str' without its trailing white space */
+	len = strlen(str);
+	while (len > 0 && isspace(str[len - 1]))
+		len--;
+	consumed = len;
+
+	if (sscanf(str, "%i%n", &value, &consumed) >= 1 &&
+	    consumed == len) {
+		/* don't print warning for lctl set_param debug=0 or -1 */
+		if (value != 0 && value != -1)
+			CWARN("You are trying to use a numerical value for the "
+			      "mask - this will be deprecated in a future "
+			      "release.\n");
+		*mask = value;
+		return 0;
+	}
+
+	return cfs_str2mask(str, bit2str, mask, is_subsys ? 0 : D_CANTMASK,
+			    0xffffffff);
+}
+
+/**
+ * Dump Lustre log to ::debug_file_path by calling tracefile_dump_all_pages()
+ *
+ * Nothing is dumped when the configured path starts with "NONE".  The
+ * file name is built as "<path>.<current seconds>.<arg>", where \a arg is
+ * printed as an integer.  After dumping, the debug log upcall is run on
+ * the same file name.
+ */
+void libcfs_debug_dumplog_internal(void *arg)
+{
+	DECL_JOURNAL_DATA;
+
+	PUSH_JOURNAL;
+
+	if (strncmp(libcfs_debug_file_path_arr, "NONE", 4) == 0)
+		goto out;
+
+	snprintf(debug_file_name, sizeof(debug_file_name) - 1,
+		 "%s.%ld." LPLD, libcfs_debug_file_path_arr,
+		 cfs_time_current_sec(), (long_ptr_t)arg);
+	printk(KERN_ALERT "LustreError: dumping log to %s\n",
+	       debug_file_name);
+	cfs_tracefile_dump_all_pages(debug_file_name);
+	libcfs_run_debug_log_upcall(debug_file_name);
+out:
+	POP_JOURNAL;
+}
+
+/**
+ * Thread body used by libcfs_debug_dumplog(): dump the log, then wake up
+ * whoever is sleeping on debug_ctlwq.  Always returns 0.
+ */
+int libcfs_debug_dumplog_thread(void *arg)
+{
+	libcfs_debug_dumplog_internal(arg);
+	wake_up(&debug_ctlwq);
+	return 0;
+}
+
+/**
+ * Dump the debug log from a separate kernel thread and wait for it.
+ *
+ * The caller is queued on debug_ctlwq before the dumper thread is
+ * started; the thread receives the caller's pid as its argument.  If the
+ * thread cannot be started an error is logged and no wait takes place.
+ */
+void libcfs_debug_dumplog(void)
+{
+	wait_queue_t waiter;
+	task_t    *thread;
+	ENTRY;
+
+	/* Queue ourselves and go TASK_INTERRUPTIBLE *before* starting the
+	 * thread, so that a thread that finishes early can still set our
+	 * state back to running before we reach schedule() */
+	init_waitqueue_entry_current(&waiter);
+	set_current_state(TASK_INTERRUPTIBLE);
+	add_wait_queue(&debug_ctlwq, &waiter);
+
+	thread = kthread_run(libcfs_debug_dumplog_thread,
+			     (void *)(long)current_pid(),
+			     "libcfs_debug_dumper");
+	if (!IS_ERR(thread))
+		waitq_wait(&waiter, TASK_INTERRUPTIBLE);
+	else
+		printk(KERN_ERR "LustreError: cannot start log dump thread:"
+		       " %ld\n", PTR_ERR(thread));
+
+	/* always tear down, including when kthread_run() failed */
+	remove_wait_queue(&debug_ctlwq, &waiter);
+	set_current_state(TASK_RUNNING);
+}
+EXPORT_SYMBOL(libcfs_debug_dumplog);
+
+/**
+ * Initialise the debug subsystem: wait queue, console rate-limit delays,
+ * dump file path and trace buffers.
+ *
+ * \param bufsize	not used by this function
+ *
+ * \retval result of cfs_tracefile_init(); the panic notifier is only
+ *	   registered when that result is 0
+ */
+int libcfs_debug_init(unsigned long bufsize)
+{
+	unsigned int pages = libcfs_debug_mb;
+	int	     status;
+
+	init_waitqueue_head(&debug_ctlwq);
+
+	/* use the defaults unless 0 < min_delay < max_delay was configured */
+	if (!(libcfs_console_max_delay > 0 &&
+	      libcfs_console_min_delay > 0 &&
+	      libcfs_console_min_delay < libcfs_console_max_delay)) {
+		libcfs_console_max_delay = CDEBUG_DEFAULT_MAX_DELAY;
+		libcfs_console_min_delay = CDEBUG_DEFAULT_MIN_DELAY;
+	}
+
+	if (libcfs_debug_file_path != NULL) {
+		/* zero-fill first so the copy is always NUL-terminated */
+		memset(libcfs_debug_file_path_arr, 0, PATH_MAX);
+		strncpy(libcfs_debug_file_path_arr,
+			libcfs_debug_file_path, PATH_MAX-1);
+	}
+
+	/* A valid libcfs_debug_mb is split evenly between the possible CPUs
+	 * and converted from megabytes to pages; an invalid or unset value
+	 * falls back to TCD_MAX_PAGES */
+	if (pages <= cfs_trace_max_debug_mb() &&
+	    pages >= num_possible_cpus()) {
+		pages = (pages / num_possible_cpus());
+		pages = (pages << (20 - PAGE_CACHE_SHIFT));
+	} else {
+		pages = TCD_MAX_PAGES;
+	}
+
+	status = cfs_tracefile_init(pages);
+	if (status == 0)
+		libcfs_register_panic_notifier();
+
+	return status;
+}
+
+/**
+ * Undo libcfs_debug_init(): unregister the panic notifier and shut the
+ * trace files down.  Always returns 0.
+ */
+int libcfs_debug_cleanup(void)
+{
+	libcfs_unregister_panic_notifier();
+	cfs_tracefile_exit();
+	return 0;
+}
+
+/** Flush the trace pages via cfs_trace_flush_pages().  Always returns 0. */
+int libcfs_debug_clear_buffer(void)
+{
+	cfs_trace_flush_pages();
+	return 0;
+}
+
+/* Debug markers, although printed by S_LNET,
+ * should not be marked as such: DEBUG_SUBSYSTEM is switched to
+ * S_UNDEFINED for this one function and restored to S_LNET afterwards. */
+#undef DEBUG_SUBSYSTEM
+#define DEBUG_SUBSYSTEM S_UNDEFINED
+int libcfs_debug_mark_buffer(const char *text)
+{
+	CDEBUG(D_TRACE,"***************************************************\n");
+	LCONSOLE(D_WARNING, "DEBUG MARKER: %s\n", text);
+	CDEBUG(D_TRACE,"***************************************************\n");
+
+	return 0;
+}
+#undef DEBUG_SUBSYSTEM
+#define DEBUG_SUBSYSTEM S_LNET
+
+/** Log the new debug mask at KERN_WARNING and store it in libcfs_debug. */
+void libcfs_debug_set_level(unsigned int debug_level)
+{
+	printk(KERN_WARNING "Lustre: Setting portals debug level to %08x\n",
+	       debug_level);
+	libcfs_debug = debug_level;
+}
+
+EXPORT_SYMBOL(libcfs_debug_set_level);
+
+/**
+ * Log a "Process leaving" message that shows \a rc as unsigned, signed
+ * and hexadecimal, then hand \a rc back unchanged.
+ *
+ * \param msgdata	message descriptor passed through to libcfs_debug_msg()
+ * \param rc		return code being logged
+ *
+ * \retval \a rc
+ */
+long libcfs_log_return(struct libcfs_debug_msg_data *msgdata, long rc)
+{
+	/* %lu and %lx take unsigned long; cast so the argument types match
+	 * the format, as libcfs_log_goto() does for its unsigned field */
+	libcfs_debug_msg(msgdata, "Process leaving (rc=%lu : %ld : %lx)\n",
+			 (unsigned long)rc, rc, (unsigned long)rc);
+	return rc;
+}
+EXPORT_SYMBOL(libcfs_log_return);
+
+/**
+ * Log a "Process leaving via <label>" message that shows \a rc as
+ * unsigned, signed and hexadecimal.
+ */
+void libcfs_log_goto(struct libcfs_debug_msg_data *msgdata, const char *label,
+		     long_ptr_t rc)
+{
+	libcfs_debug_msg(msgdata, "Process leaving via %s (rc=" LPLU " : " LPLD
+			 " : " LPLX ")\n", label, (ulong_ptr_t)rc, rc, rc);
+}
+EXPORT_SYMBOL(libcfs_log_goto);

diff --git a/drivers/staging/lustre/lustre/libcfs/fail.c b/drivers/staging/lustre/lustre/libcfs/fail.c
new file mode 100644
index 0000000..c54448d
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/fail.c

@@ -0,0 +1,137 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see http://www.gnu.org/licenses
+ *
+ * Please contact Oracle Corporation, Inc., 500 Oracle Parkway, Redwood Shores,
+ * CA 94065 USA or visit www.oracle.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Oracle Corporation, Inc.
+ */
+
+#include <linux/libcfs/libcfs.h>
+
+/* flag bits consulted and updated by __cfs_fail_check_set() */
+unsigned long cfs_fail_loc;
+EXPORT_SYMBOL(cfs_fail_loc);
+
+/* numeric argument for the RAND/SKIP/SOME/VALUE checks */
+unsigned int cfs_fail_val;
+EXPORT_SYMBOL(cfs_fail_val);
+
+wait_queue_head_t cfs_race_waitq;
+EXPORT_SYMBOL(cfs_race_waitq);
+
+int cfs_race_state;
+EXPORT_SYMBOL(cfs_race_state);
+
+/**
+ * Decide whether a fail point should trigger now, and optionally update
+ * cfs_fail_loc afterwards.
+ *
+ * \param id	fail point id; must not carry CFS_FAIL_ONCE
+ * \param value	value compared with cfs_fail_val (CFS_FAIL_LOC_VALUE) or
+ *		merged into / stored in cfs_fail_loc (ORSET / RESET)
+ * \param set	one of CFS_FAIL_LOC_NOSET, _VALUE, _ORSET or _RESET
+ *
+ * \retval 0	the fail point does not trigger
+ * \retval 1	the fail point triggers
+ */
+int __cfs_fail_check_set(__u32 id, __u32 value, int set)
+{
+	static atomic_t cfs_fail_count = ATOMIC_INIT(0);
+
+	LASSERT(!(id & CFS_FAIL_ONCE));
+
+	/* a fail-once location that has already failed stays quiet */
+	if ((cfs_fail_loc & (CFS_FAILED | CFS_FAIL_ONCE)) ==
+	    (CFS_FAILED | CFS_FAIL_ONCE)) {
+		atomic_set(&cfs_fail_count, 0); /* paranoia */
+		return 0;
+	}
+
+	/* Fail 1/cfs_fail_val times */
+	if ((cfs_fail_loc & CFS_FAIL_RAND) &&
+	    (cfs_fail_val < 2 || cfs_rand() % cfs_fail_val > 0))
+		return 0;
+
+	/* Skip the first cfs_fail_val, then fail */
+	if ((cfs_fail_loc & CFS_FAIL_SKIP) &&
+	    atomic_inc_return(&cfs_fail_count) <= cfs_fail_val)
+		return 0;
+
+	/* check cfs_fail_val... */
+	if (set == CFS_FAIL_LOC_VALUE &&
+	    cfs_fail_val != -1 && cfs_fail_val != value)
+		return 0;
+
+	/* Fail cfs_fail_val times, overridden by FAIL_ONCE */
+	if ((cfs_fail_loc & CFS_FAIL_SOME) &&
+	    (!(cfs_fail_loc & CFS_FAIL_ONCE) || cfs_fail_val <= 1)) {
+		int nfail = atomic_inc_return(&cfs_fail_count);
+
+		if (nfail >= cfs_fail_val) {
+			set_bit(CFS_FAIL_ONCE_BIT, &cfs_fail_loc);
+			atomic_set(&cfs_fail_count, 0);
+			/* we lost the race to increase the counter */
+			if (nfail > cfs_fail_val)
+				return 0;
+		}
+	}
+
+	if ((set == CFS_FAIL_LOC_ORSET || set == CFS_FAIL_LOC_RESET) &&
+	    (value & CFS_FAIL_ONCE))
+		set_bit(CFS_FAIL_ONCE_BIT, &cfs_fail_loc);
+
+	/* If we lost the race to set CFS_FAILED_BIT and CFS_FAIL_ONCE is
+	 * valid, only one process can fail; otherwise several processes
+	 * can fail at the same time. */
+	if (test_and_set_bit(CFS_FAILED_BIT, &cfs_fail_loc) &&
+	    (cfs_fail_loc & CFS_FAIL_ONCE))
+		return 0;
+
+	switch (set) {
+	case CFS_FAIL_LOC_NOSET:
+	case CFS_FAIL_LOC_VALUE:
+		break;
+	case CFS_FAIL_LOC_ORSET:
+		cfs_fail_loc |= value & ~(CFS_FAILED | CFS_FAIL_ONCE);
+		break;
+	case CFS_FAIL_LOC_RESET:
+		cfs_fail_loc = value;
+		break;
+	default:
+		LASSERTF(0, "called with bad set %u\n", set);
+		break;
+	}
+
+	return 1;
+}
+EXPORT_SYMBOL(__cfs_fail_check_set);
+
+/**
+ * Run __cfs_fail_check_set() and, when the fail point triggers, sleep
+ * uninterruptibly for \a ms milliseconds, logging before and after.
+ *
+ * \retval result of __cfs_fail_check_set()
+ */
+int __cfs_fail_timeout_set(__u32 id, __u32 value, int ms, int set)
+{
+	int fired = __cfs_fail_check_set(id, value, set);
+
+	if (!fired)
+		return fired;
+
+	CERROR("cfs_fail_timeout id %x sleeping for %dms\n",
+	       id, ms);
+	schedule_timeout_and_set_state(TASK_UNINTERRUPTIBLE,
+				       cfs_time_seconds(ms) / 1000);
+	set_current_state(TASK_RUNNING);
+	CERROR("cfs_fail_timeout id %x awake\n", id);
+
+	return fired;
+}
+EXPORT_SYMBOL(__cfs_fail_timeout_set);

diff --git a/drivers/staging/lustre/lustre/libcfs/hash.c b/drivers/staging/lustre/lustre/libcfs/hash.c
new file mode 100644
index 0000000..98c76df
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/hash.c

@@ -0,0 +1,2123 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/libcfs/hash.c
+ *
+ * Implement a hash class for hash process in lustre system.
+ *
+ * Author: YuZhangyong <yzy@clusterfs.com>
+ *
+ * 2008-08-15: Brian Behlendorf <behlendorf1@llnl.gov>
+ * - Simplified API and improved documentation
+ * - Added per-hash feature flags:
+ *   * CFS_HASH_DEBUG additional validation
+ *   * CFS_HASH_REHASH dynamic rehashing
+ * - Added per-hash statistics
+ * - General performance enhancements
+ *
+ * 2009-07-31: Liang Zhen <zhen.liang@sun.com>
+ * - move all stuff to libcfs
+ * - don't allow cur_bits != max_bits without setting of CFS_HASH_REHASH
+ * - ignore hs_rwlock if without CFS_HASH_REHASH setting
+ * - buckets are allocated one by one (instead of contiguous memory),
+ *   to avoid unnecessary cacheline conflict
+ *
+ * 2010-03-01: Liang Zhen <zhen.liang@sun.com>
+ * - "bucket" is a group of hlist_head now, user can specify bucket size
+ *   by bkt_bits of cfs_hash_create(), all hlist_heads in a bucket share
+ *   one lock for reducing memory overhead.
+ *
+ * - support lockless hash, caller will take care of locks:
+ *   avoid lock overhead for hash tables that are already protected
+ *   by locking in the caller for another reason
+ *
+ * - support both spin_lock/rwlock for bucket:
+ *   overhead of spinlock contention is lower than read/write
+ *   contention of rwlock, so using spinlock to serialize operations on
+ *   bucket is more reasonable for those frequently changed hash tables
+ *
+ * - support one-single lock mode:
+ *   one lock to protect all hash operations to avoid overhead of
+ *   multiple locks if hash table is always small
+ *
+ * - removed a lot of unnecessary addref & decref on hash element:
+ *   addref & decref are atomic operations in many use-cases which
+ *   are expensive.
+ *
+ * - support non-blocking cfs_hash_add() and cfs_hash_findadd():
+ *   some lustre use-cases require these functions to be strictly
+ *   non-blocking, we need to schedule required rehash on a different
+ *   thread on those cases.
+ *
+ * - safer rehash on large hash table
+ *   In old implementation, rehash function will exclusively lock the
+ *   hash table and finish rehash in one batch, it's dangerous on SMP
+ *   system because rehash millions of elements could take long time.
+ *   New implemented rehash can release lock and relax CPU in middle
+ *   of rehash, it's safe for another thread to search/change on the
+ *   hash table even while it is being rehashed.
+ *
+ * - support two different refcount modes
+ *   . hash table has refcount on element
+ *   . hash table doesn't change refcount on adding/removing element
+ *
+ * - support long name hash table (for param-tree)
+ *
+ * - fix a bug for cfs_hash_rehash_key:
+ *   in old implementation, cfs_hash_rehash_key could screw up the
+ *   hash-table because @key is overwritten without any protection.
+ *   Now we need user to define hs_keycpy for those rehash enabled
+ *   hash tables, cfs_hash_rehash_key will overwrite hash-key
+ *   inside lock by calling hs_keycpy.
+ *
+ * - better hash iteration:
+ *   Now we support both locked iteration & lockless iteration of hash
+ *   table. Also, user can break the iteration by return 1 in callback.
+ */
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/seq_file.h>
+
+#if CFS_HASH_DEBUG_LEVEL >= CFS_HASH_DEBUG_1
+/* Depth threshold checked by cfs_hash_bd_dep_record(); 0 disables the
+ * check. */
+static unsigned int warn_on_depth = 8;
+CFS_MODULE_PARM(warn_on_depth, "i", uint, 0644,
+		"warning when hash depth is high.");
+#endif
+
+/* Work-item scheduler that cfs_hash_bd_dep_record() passes to
+ * cfs_wi_schedule(). */
+struct cfs_wi_sched *cfs_sched_rehash;
+
+/** "No lock" variant of the lock operation: does nothing. */
+static inline void
+cfs_hash_nl_lock(cfs_hash_lock_t *lock, int exclusive)
+{
+}
+
+/** "No lock" variant of the unlock operation: does nothing. */
+static inline void
+cfs_hash_nl_unlock(cfs_hash_lock_t *lock, int exclusive)
+{
+}
+
+/** Take the spinlock member of \a lock; \a exclusive is ignored. */
+static inline void
+cfs_hash_spin_lock(cfs_hash_lock_t *lock, int exclusive)
+{
+	spin_lock(&lock->spin);
+}
+
+/** Release the spinlock member of \a lock; \a exclusive is ignored. */
+static inline void
+cfs_hash_spin_unlock(cfs_hash_lock_t *lock, int exclusive)
+{
+	spin_unlock(&lock->spin);
+}
+
+/**
+ * Take the rwlock member of \a lock: for writing when \a exclusive is
+ * non-zero, for reading otherwise.
+ */
+static inline void
+cfs_hash_rw_lock(cfs_hash_lock_t *lock, int exclusive)
+{
+	if (exclusive)
+		write_lock(&lock->rw);
+	else
+		read_lock(&lock->rw);
+}
+
+/**
+ * Release the rwlock member of \a lock; \a exclusive must match the value
+ * given to cfs_hash_rw_lock().
+ */
+static inline void
+cfs_hash_rw_unlock(cfs_hash_lock_t *lock, int exclusive)
+{
+	if (exclusive)
+		write_unlock(&lock->rw);
+	else
+		read_unlock(&lock->rw);
+}
+
+/** No lock hash: every lock operation is a no-op */
+static cfs_hash_lock_ops_t cfs_hash_nl_lops = {
+	.hs_lock	= cfs_hash_nl_lock,
+	.hs_unlock	= cfs_hash_nl_unlock,
+	.hs_bkt_lock	= cfs_hash_nl_lock,
+	.hs_bkt_unlock	= cfs_hash_nl_unlock,
+};
+
+/** no bucket lock, one table-wide spinlock protects everything */
+static cfs_hash_lock_ops_t cfs_hash_nbl_lops = {
+	.hs_lock	= cfs_hash_spin_lock,
+	.hs_unlock	= cfs_hash_spin_unlock,
+	.hs_bkt_lock	= cfs_hash_nl_lock,
+	.hs_bkt_unlock	= cfs_hash_nl_unlock,
+};
+
+/** rehash enabled: table-wide rwlock plus a spinlock per bucket */
+static cfs_hash_lock_ops_t cfs_hash_bkt_spin_lops = {
+	.hs_lock	= cfs_hash_rw_lock,
+	.hs_unlock	= cfs_hash_rw_unlock,
+	.hs_bkt_lock	= cfs_hash_spin_lock,
+	.hs_bkt_unlock	= cfs_hash_spin_unlock,
+};
+
+/** rehash enabled: table-wide rwlock plus an rwlock per bucket */
+static cfs_hash_lock_ops_t cfs_hash_bkt_rw_lops = {
+	.hs_lock	= cfs_hash_rw_lock,
+	.hs_unlock	= cfs_hash_rw_unlock,
+	.hs_bkt_lock	= cfs_hash_rw_lock,
+	.hs_bkt_unlock	= cfs_hash_rw_unlock,
+};
+
+/** rehash disabled: no table-wide lock, a spinlock per bucket */
+static cfs_hash_lock_ops_t cfs_hash_nr_bkt_spin_lops = {
+	.hs_lock	= cfs_hash_nl_lock,
+	.hs_unlock	= cfs_hash_nl_unlock,
+	.hs_bkt_lock	= cfs_hash_spin_lock,
+	.hs_bkt_unlock	= cfs_hash_spin_unlock,
+};
+
+/** rehash disabled: no table-wide lock, an rwlock per bucket */
+static cfs_hash_lock_ops_t cfs_hash_nr_bkt_rw_lops = {
+	.hs_lock	= cfs_hash_nl_lock,
+	.hs_unlock	= cfs_hash_nl_unlock,
+	.hs_bkt_lock	= cfs_hash_rw_lock,
+	.hs_bkt_unlock	= cfs_hash_rw_unlock,
+};
+
+/**
+ * Select the lock operation table for \a hs from its feature flags and
+ * initialise the table-wide lock that the chosen table uses, if any.
+ * A hash that asks for bucket locks but specifies neither the rw nor the
+ * spin flavour hits LBUG().
+ */
+static void
+cfs_hash_lock_setup(cfs_hash_t *hs)
+{
+	int rehash;
+
+	if (cfs_hash_with_no_lock(hs)) {
+		hs->hs_lops = &cfs_hash_nl_lops;
+		return;
+	}
+
+	if (cfs_hash_with_no_bktlock(hs)) {
+		hs->hs_lops = &cfs_hash_nbl_lops;
+		spin_lock_init(&hs->hs_lock.spin);
+		return;
+	}
+
+	/* only rehash-capable tables need the table-wide rwlock */
+	rehash = cfs_hash_with_rehash(hs);
+	if (rehash)
+		rwlock_init(&hs->hs_lock.rw);
+
+	if (cfs_hash_with_rw_bktlock(hs)) {
+		hs->hs_lops = rehash ? &cfs_hash_bkt_rw_lops :
+				       &cfs_hash_nr_bkt_rw_lops;
+	} else if (cfs_hash_with_spin_bktlock(hs)) {
+		hs->hs_lops = rehash ? &cfs_hash_bkt_spin_lops :
+				       &cfs_hash_nr_bkt_spin_lops;
+	} else {
+		LBUG();
+	}
+}
+
+/**
+ * Simple hash head without depth tracking.
+ * A new element is always added to the head of the hlist
+ * (see cfs_hash_hh_hnode_add()).
+ */
+typedef struct {
+	struct hlist_head	hh_head;	/**< entries list */
+} cfs_hash_head_t;
+
+/** Size in bytes of one cfs_hash_head_t slot. */
+static int
+cfs_hash_hh_hhead_size(cfs_hash_t *hs)
+{
+	return (int)sizeof(cfs_hash_head_t);
+}
+
+/**
+ * Return the hlist head addressed by \a bd, treating the bucket's
+ * hsb_head storage as an array of cfs_hash_head_t indexed by bd_offset.
+ */
+static struct hlist_head *
+cfs_hash_hh_hhead(cfs_hash_t *hs, cfs_hash_bd_t *bd)
+{
+	cfs_hash_head_t *slots;
+
+	slots = (cfs_hash_head_t *)&bd->bd_bucket->hsb_head[0];
+	return &(slots + bd->bd_offset)->hh_head;
+}
+
+/** Insert \a hnode at the head of the list; depth is not tracked (-1). */
+static int
+cfs_hash_hh_hnode_add(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+		      struct hlist_node *hnode)
+{
+	struct hlist_head *hhead = cfs_hash_hh_hhead(hs, bd);
+
+	hlist_add_head(hnode, hhead);
+	return -1; /* unknown depth */
+}
+
+/** Unlink and re-initialise \a hnode; depth is not tracked (-1). */
+static int
+cfs_hash_hh_hnode_del(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+		      struct hlist_node *hnode)
+{
+	hlist_del_init(hnode);
+
+	/* unknown depth */
+	return -1;
+}
+
+/**
+ * Simple hash head with depth tracking.
+ * A new element is always added to the head of the hlist, and hd_depth
+ * is incremented/decremented on every add/delete.
+ */
+typedef struct {
+	struct hlist_head	hd_head;	/**< entries list */
+	unsigned int	    hd_depth;       /**< list length */
+} cfs_hash_head_dep_t;
+
+/** Size in bytes of one cfs_hash_head_dep_t slot. */
+static int
+cfs_hash_hd_hhead_size(cfs_hash_t *hs)
+{
+	return (int)sizeof(cfs_hash_head_dep_t);
+}
+
+/**
+ * Return the hlist head addressed by \a bd, treating the bucket's
+ * hsb_head storage as an array of cfs_hash_head_dep_t indexed by
+ * bd_offset.
+ */
+static struct hlist_head *
+cfs_hash_hd_hhead(cfs_hash_t *hs, cfs_hash_bd_t *bd)
+{
+	cfs_hash_head_dep_t *slots =
+		(cfs_hash_head_dep_t *)&bd->bd_bucket->hsb_head[0];
+
+	return &(slots + bd->bd_offset)->hd_head;
+}
+
+/** Insert \a hnode at the head of the list and return the new depth. */
+static int
+cfs_hash_hd_hnode_add(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+		      struct hlist_node *hnode)
+{
+	cfs_hash_head_dep_t *slot;
+
+	slot = container_of(cfs_hash_hd_hhead(hs, bd),
+			    cfs_hash_head_dep_t, hd_head);
+	hlist_add_head(hnode, &slot->hd_head);
+	slot->hd_depth++;
+	return slot->hd_depth;
+}
+
+/** Unlink \a hnode and return the depth that remains. */
+static int
+cfs_hash_hd_hnode_del(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+		      struct hlist_node *hnode)
+{
+	cfs_hash_head_dep_t *slot;
+
+	slot = container_of(cfs_hash_hd_hhead(hs, bd),
+			    cfs_hash_head_dep_t, hd_head);
+	hlist_del_init(hnode);
+	slot->hd_depth--;
+	return slot->hd_depth;
+}
+
+/**
+ * Double-linked hash head without depth tracking.
+ * A new element is always added to the tail of the hlist, which is
+ * remembered in dh_tail (NULL while the list is empty).
+ */
+typedef struct {
+	struct hlist_head	dh_head;	/**< entries list */
+	struct hlist_node       *dh_tail;	/**< the last entry */
+} cfs_hash_dhead_t;
+
+/** Size in bytes of one cfs_hash_dhead_t slot. */
+static int
+cfs_hash_dh_hhead_size(cfs_hash_t *hs)
+{
+	return (int)sizeof(cfs_hash_dhead_t);
+}
+
+/**
+ * Return the hlist head addressed by \a bd, treating the bucket's
+ * hsb_head storage as an array of cfs_hash_dhead_t indexed by bd_offset.
+ */
+static struct hlist_head *
+cfs_hash_dh_hhead(cfs_hash_t *hs, cfs_hash_bd_t *bd)
+{
+	cfs_hash_dhead_t *slots =
+		(cfs_hash_dhead_t *)&bd->bd_bucket->hsb_head[0];
+
+	return &(slots + bd->bd_offset)->dh_head;
+}
+
+/**
+ * Append \a hnode to the tail of the list and make it the new tail;
+ * depth is not tracked (-1).
+ */
+static int
+cfs_hash_dh_hnode_add(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+		      struct hlist_node *hnode)
+{
+	cfs_hash_dhead_t *slot;
+
+	slot = container_of(cfs_hash_dh_hhead(hs, bd),
+			    cfs_hash_dhead_t, dh_head);
+
+	if (slot->dh_tail == NULL) /* empty list */
+		hlist_add_head(hnode, &slot->dh_head);
+	else /* not empty */
+		hlist_add_after(slot->dh_tail, hnode);
+	slot->dh_tail = hnode;
+
+	return -1; /* unknown depth */
+}
+
+/**
+ * Unlink \a hnd, moving dh_tail back to its predecessor (or NULL) when
+ * \a hnd was the last entry; depth is not tracked (-1).
+ */
+static int
+cfs_hash_dh_hnode_del(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+		      struct hlist_node *hnd)
+{
+	cfs_hash_dhead_t *slot;
+
+	slot = container_of(cfs_hash_dh_hhead(hs, bd),
+			    cfs_hash_dhead_t, dh_head);
+
+	if (hnd->next == NULL) { /* it's the tail */
+		if (hnd->pprev == &slot->dh_head.first)
+			slot->dh_tail = NULL;	/* it was the only entry */
+		else
+			slot->dh_tail = container_of(hnd->pprev,
+						     struct hlist_node, next);
+	}
+	hlist_del_init(hnd);
+
+	return -1; /* unknown depth */
+}
+
+/**
+ * Double-linked hash head with depth tracking.
+ * A new element is always added to the tail of the hlist, which is
+ * remembered in dd_tail (NULL while the list is empty); dd_depth is
+ * incremented/decremented on every add/delete.
+ */
+typedef struct {
+	struct hlist_head	dd_head;	/**< entries list */
+	struct hlist_node       *dd_tail;	/**< the last entry */
+	unsigned int	    dd_depth;       /**< list length */
+} cfs_hash_dhead_dep_t;
+
+/** Size in bytes of one cfs_hash_dhead_dep_t slot. */
+static int
+cfs_hash_dd_hhead_size(cfs_hash_t *hs)
+{
+	return (int)sizeof(cfs_hash_dhead_dep_t);
+}
+
+/**
+ * Return the hlist head addressed by \a bd, treating the bucket's
+ * hsb_head storage as an array of cfs_hash_dhead_dep_t indexed by
+ * bd_offset.
+ */
+static struct hlist_head *
+cfs_hash_dd_hhead(cfs_hash_t *hs, cfs_hash_bd_t *bd)
+{
+	cfs_hash_dhead_dep_t *slots =
+		(cfs_hash_dhead_dep_t *)&bd->bd_bucket->hsb_head[0];
+
+	return &(slots + bd->bd_offset)->dd_head;
+}
+
+/**
+ * Append \a hnode to the tail of the list, make it the new tail and
+ * return the new depth.
+ */
+static int
+cfs_hash_dd_hnode_add(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+		      struct hlist_node *hnode)
+{
+	cfs_hash_dhead_dep_t *slot;
+
+	slot = container_of(cfs_hash_dd_hhead(hs, bd),
+			    cfs_hash_dhead_dep_t, dd_head);
+
+	if (slot->dd_tail == NULL) /* empty list */
+		hlist_add_head(hnode, &slot->dd_head);
+	else /* not empty */
+		hlist_add_after(slot->dd_tail, hnode);
+	slot->dd_tail = hnode;
+
+	slot->dd_depth++;
+	return slot->dd_depth;
+}
+
+/**
+ * Unlink \a hnd, moving dd_tail back to its predecessor (or NULL) when
+ * \a hnd was the last entry, and return the depth that remains.
+ */
+static int
+cfs_hash_dd_hnode_del(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+		      struct hlist_node *hnd)
+{
+	cfs_hash_dhead_dep_t *slot;
+
+	slot = container_of(cfs_hash_dd_hhead(hs, bd),
+			    cfs_hash_dhead_dep_t, dd_head);
+
+	if (hnd->next == NULL) { /* it's the tail */
+		if (hnd->pprev == &slot->dd_head.first)
+			slot->dd_tail = NULL;	/* it was the only entry */
+		else
+			slot->dd_tail = container_of(hnd->pprev,
+						     struct hlist_node, next);
+	}
+	hlist_del_init(hnd);
+
+	slot->dd_depth--;
+	return slot->dd_depth;
+}
+
+/** hlist operations: add to head, no depth tracking */
+static cfs_hash_hlist_ops_t cfs_hash_hh_hops = {
+	.hop_hhead	= cfs_hash_hh_hhead,
+	.hop_hhead_size	= cfs_hash_hh_hhead_size,
+	.hop_hnode_add	= cfs_hash_hh_hnode_add,
+	.hop_hnode_del	= cfs_hash_hh_hnode_del,
+};
+
+/** hlist operations: add to head, with depth tracking */
+static cfs_hash_hlist_ops_t cfs_hash_hd_hops = {
+	.hop_hhead	= cfs_hash_hd_hhead,
+	.hop_hhead_size	= cfs_hash_hd_hhead_size,
+	.hop_hnode_add	= cfs_hash_hd_hnode_add,
+	.hop_hnode_del	= cfs_hash_hd_hnode_del,
+};
+
+/** hlist operations: add to tail, no depth tracking */
+static cfs_hash_hlist_ops_t cfs_hash_dh_hops = {
+	.hop_hhead	= cfs_hash_dh_hhead,
+	.hop_hhead_size	= cfs_hash_dh_hhead_size,
+	.hop_hnode_add	= cfs_hash_dh_hnode_add,
+	.hop_hnode_del	= cfs_hash_dh_hnode_del,
+};
+
+/** hlist operations: add to tail, with depth tracking */
+static cfs_hash_hlist_ops_t cfs_hash_dd_hops = {
+	.hop_hhead	= cfs_hash_dd_hhead,
+	.hop_hhead_size	= cfs_hash_dd_hhead_size,
+	.hop_hnode_add	= cfs_hash_dd_hnode_add,
+	.hop_hnode_del	= cfs_hash_dd_hnode_del,
+};
+
+/**
+ * Pick the hlist operation table for \a hs from two of its features:
+ * add-to-tail versus add-to-head, and depth tracking on or off.
+ */
+static void
+cfs_hash_hlist_setup(cfs_hash_t *hs)
+{
+	if (!cfs_hash_with_add_tail(hs)) {
+		if (cfs_hash_with_depth(hs))
+			hs->hs_hops = &cfs_hash_hd_hops;
+		else
+			hs->hs_hops = &cfs_hash_hh_hops;
+		return;
+	}
+
+	if (cfs_hash_with_depth(hs))
+		hs->hs_hops = &cfs_hash_dd_hops;
+	else
+		hs->hs_hops = &cfs_hash_dh_hops;
+}
+
+/**
+ * Fill \a bd with the bucket and in-bucket offset for \a key in the
+ * bucket array \a bkts, which is addressed with \a bits hash bits.
+ *
+ * The low (bits - hs_bkt_bits) bits of the hash select the bucket, the
+ * remaining high bits select the hlist head inside it.  \a bits must be
+ * either hs_cur_bits or hs_rehash_bits.
+ */
+static void
+cfs_hash_bd_from_key(cfs_hash_t *hs, cfs_hash_bucket_t **bkts,
+		     unsigned int bits, const void *key, cfs_hash_bd_t *bd)
+{
+	unsigned int hash;
+	unsigned int bkt_shift;
+
+	hash = cfs_hash_id(hs, key, (1U << bits) - 1);
+
+	LASSERT(bits == hs->hs_cur_bits || bits == hs->hs_rehash_bits);
+
+	bkt_shift = bits - hs->hs_bkt_bits;
+	bd->bd_bucket = bkts[hash & ((1U << bkt_shift) - 1)];
+	bd->bd_offset = hash >> bkt_shift;
+}
+
+/**
+ * Locate the bucket descriptor for \a key.  While rehash buckets exist
+ * the key is mapped into them, otherwise into the current buckets.
+ */
+void
+cfs_hash_bd_get(cfs_hash_t *hs, const void *key, cfs_hash_bd_t *bd)
+{
+	/* NB: caller should hold hs->hs_rwlock if REHASH is set */
+	if (unlikely(hs->hs_rehash_buckets != NULL)) {
+		LASSERT(hs->hs_rehash_bits != 0);
+		cfs_hash_bd_from_key(hs, hs->hs_rehash_buckets,
+				     hs->hs_rehash_bits, key, bd);
+		return;
+	}
+
+	cfs_hash_bd_from_key(hs, hs->hs_buckets, hs->hs_cur_bits, key, bd);
+}
+EXPORT_SYMBOL(cfs_hash_bd_get);
+
+/**
+ * Record \a dep_cur as the bucket's maximum depth if it exceeds the value
+ * stored so far.  With CFS_HASH_DEBUG_LEVEL >= CFS_HASH_DEBUG_1 a depth
+ * above both warn_on_depth and hs_dep_max is also recorded on the hash
+ * itself and the hs_dep_wi work item is scheduled.
+ */
+static inline void
+cfs_hash_bd_dep_record(cfs_hash_t *hs, cfs_hash_bd_t *bd, int dep_cur)
+{
+	/* common case: no new per-bucket maximum */
+	if (likely(dep_cur <= bd->bd_bucket->hsb_depmax))
+		return;
+
+	bd->bd_bucket->hsb_depmax = dep_cur;
+# if CFS_HASH_DEBUG_LEVEL >= CFS_HASH_DEBUG_1
+	/* warn_on_depth == 0 switches the hash-wide record off */
+	if (likely(warn_on_depth == 0 ||
+		   max(warn_on_depth, hs->hs_dep_max) >= dep_cur))
+		return;
+
+	/* remember where the new maximum was seen */
+	spin_lock(&hs->hs_dep_lock);
+	hs->hs_dep_max  = dep_cur;
+	hs->hs_dep_bkt  = bd->bd_bucket->hsb_index;
+	hs->hs_dep_off  = bd->bd_offset;
+	hs->hs_dep_bits = hs->hs_cur_bits;
+	spin_unlock(&hs->hs_dep_lock);
+
+	cfs_wi_schedule(cfs_sched_rehash, &hs->hs_dep_wi);
+# endif
+}
+
+/**
+ * Link \a hnode into the list addressed by \a bd and update the bucket
+ * bookkeeping: depth record, version, item count, the hash-wide counter
+ * (if enabled) and the item reference (unless the hash is no-itemref).
+ */
+void
+cfs_hash_bd_add_locked(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+		       struct hlist_node *hnode)
+{
+	cfs_hash_bucket_t *bkt;
+	int		   depth;
+
+	depth = hs->hs_hops->hop_hnode_add(hs, bd, hnode);
+	cfs_hash_bd_dep_record(hs, bd, depth);
+
+	bkt = bd->bd_bucket;
+	/* bump the version, skipping the value 0 on wrap-around */
+	if (unlikely(++bkt->hsb_version == 0))
+		bkt->hsb_version++;
+	bkt->hsb_count++;
+
+	if (cfs_hash_with_counter(hs))
+		atomic_inc(&hs->hs_count);
+	if (!cfs_hash_with_no_itemref(hs))
+		cfs_hash_get(hs, hnode);
+}
+EXPORT_SYMBOL(cfs_hash_bd_add_locked);
+
+/**
+ * Unlink \a hnode from the list addressed by \a bd and update the bucket
+ * bookkeeping: item count, version, the hash-wide counter (if enabled)
+ * and the item reference (unless the hash is no-itemref).
+ */
+void
+cfs_hash_bd_del_locked(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+		       struct hlist_node *hnode)
+{
+	cfs_hash_bucket_t *bkt;
+
+	hs->hs_hops->hop_hnode_del(hs, bd, hnode);
+
+	bkt = bd->bd_bucket;
+	LASSERT(bkt->hsb_count > 0);
+	bkt->hsb_count--;
+	/* bump the version, skipping the value 0 on wrap-around */
+	if (unlikely(++bkt->hsb_version == 0))
+		bkt->hsb_version++;
+
+	if (cfs_hash_with_counter(hs)) {
+		LASSERT(atomic_read(&hs->hs_count) > 0);
+		atomic_dec(&hs->hs_count);
+	}
+	if (!cfs_hash_with_no_itemref(hs))
+		cfs_hash_put_locked(hs, hnode);
+}
+EXPORT_SYMBOL(cfs_hash_bd_del_locked);
+
+/**
+ * Move \a hnode from \a bd_old to \a bd_new.  Nothing happens when both
+ * descriptors compare equal.  Only the per-bucket counts and versions
+ * change; the hash-wide counter and the item refcount are left alone.
+ */
+void
+cfs_hash_bd_move_locked(cfs_hash_t *hs, cfs_hash_bd_t *bd_old,
+			cfs_hash_bd_t *bd_new, struct hlist_node *hnode)
+{
+	cfs_hash_bucket_t *src = bd_old->bd_bucket;
+	cfs_hash_bucket_t *dst = bd_new->bd_bucket;
+	int		   depth;
+
+	if (cfs_hash_bd_compare(bd_old, bd_new) == 0)
+		return;
+
+	/* call the hlist ops directly rather than cfs_hash_bd_del/add_locked
+	 * to avoid their atomic & refcount operations */
+	hs->hs_hops->hop_hnode_del(hs, bd_old, hnode);
+	depth = hs->hs_hops->hop_hnode_add(hs, bd_new, hnode);
+	cfs_hash_bd_dep_record(hs, bd_new, depth);
+
+	/* source bucket: one item fewer, new version (never 0) */
+	LASSERT(src->hsb_count > 0);
+	src->hsb_count--;
+	if (unlikely(++src->hsb_version == 0))
+		src->hsb_version++;
+
+	/* destination bucket: one item more, new version (never 0) */
+	dst->hsb_count++;
+	if (unlikely(++dst->hsb_version == 0))
+		dst->hsb_version++;
+}
+EXPORT_SYMBOL(cfs_hash_bd_move_locked);
+
+/** bits combined into cfs_hash_lookup_intent_t values */
+enum {
+	/** always set, for sanity (avoid ZERO intent) */
+	CFS_HS_LOOKUP_MASK_FIND     = 0x1,
+	/** return entry with a ref */
+	CFS_HS_LOOKUP_MASK_REF      = 0x2,
+	/** add entry if not existing */
+	CFS_HS_LOOKUP_MASK_ADD      = 0x4,
+	/** delete entry, ignore other masks */
+	CFS_HS_LOOKUP_MASK_DEL      = 0x8,
+};
+
+/** what cfs_hash_bd_lookup_intent() should do with a matching entry */
+typedef enum cfs_hash_lookup_intent {
+	/** return item w/o refcount */
+	CFS_HS_LOOKUP_IT_PEEK       = CFS_HS_LOOKUP_MASK_FIND,
+	/** return item with refcount */
+	CFS_HS_LOOKUP_IT_FIND       = (CFS_HS_LOOKUP_MASK_FIND |
+				       CFS_HS_LOOKUP_MASK_REF),
+	/** return item w/o refcount if existed, otherwise add */
+	CFS_HS_LOOKUP_IT_ADD        = (CFS_HS_LOOKUP_MASK_FIND |
+				       CFS_HS_LOOKUP_MASK_ADD),
+	/** return item with refcount if existed, otherwise add */
+	CFS_HS_LOOKUP_IT_FINDADD    = (CFS_HS_LOOKUP_MASK_FIND |
+				       CFS_HS_LOOKUP_MASK_REF |
+				       CFS_HS_LOOKUP_MASK_ADD),
+	/** delete if existed */
+	CFS_HS_LOOKUP_IT_FINDDEL    = (CFS_HS_LOOKUP_MASK_FIND |
+				       CFS_HS_LOOKUP_MASK_DEL)
+} cfs_hash_lookup_intent_t;
+
+/**
+ * Walk the list addressed by \a bd looking for \a key and act on the
+ * first match as \a intent requests.
+ *
+ * Without the ADD bit a non-NULL \a hnode restricts the match to that
+ * exact node.  On a match the entry is deleted (DEL bit), or returned
+ * with a reference taken (REF bit), or just returned.  Without a match,
+ * \a hnode is added and returned if the ADD bit is set, otherwise NULL
+ * is returned.
+ */
+static struct hlist_node *
+cfs_hash_bd_lookup_intent(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+			  const void *key, struct hlist_node *hnode,
+			  cfs_hash_lookup_intent_t intent)
+
+{
+	struct hlist_head  *hhead = cfs_hash_bd_hhead(hs, bd);
+	struct hlist_node  *cur;
+	struct hlist_node  *wanted;
+	int  adding = (intent & CFS_HS_LOOKUP_MASK_ADD) != 0;
+
+	/* doing everything in one pass avoids a lot of useless refcount
+	 * ops, which are expensive atomic operations most of the time */
+	wanted = adding ? NULL : hnode;
+	hlist_for_each(cur, hhead) {
+		/* wrong key, or right key on a node we were not asked for */
+		if (!cfs_hash_keycmp(hs, key, cur) ||
+		    (wanted != NULL && wanted != cur))
+			continue;
+
+		/* match: delete it, or take a ref if the caller wants one */
+		if ((intent & CFS_HS_LOOKUP_MASK_DEL) != 0)
+			cfs_hash_bd_del_locked(hs, bd, cur);
+		else if ((intent & CFS_HS_LOOKUP_MASK_REF) != 0)
+			cfs_hash_get(hs, cur);
+		return cur;
+	}
+
+	/* no matching item */
+	if (!adding)
+		return NULL;
+
+	LASSERT(hnode != NULL);
+	cfs_hash_bd_add_locked(hs, bd, hnode);
+	return hnode;
+}
+
+/** Find \a key in \a bd and return the entry with a reference taken. */
+struct hlist_node *
+cfs_hash_bd_lookup_locked(cfs_hash_t *hs, cfs_hash_bd_t *bd, const void *key)
+{
+	struct hlist_node *found;
+
+	found = cfs_hash_bd_lookup_intent(hs, bd, key, NULL,
+					  CFS_HS_LOOKUP_IT_FIND);
+	return found;
+}
+EXPORT_SYMBOL(cfs_hash_bd_lookup_locked);
+
+/** Find \a key in \a bd and return the entry without taking a reference. */
+struct hlist_node *
+cfs_hash_bd_peek_locked(cfs_hash_t *hs, cfs_hash_bd_t *bd, const void *key)
+{
+	struct hlist_node *found;
+
+	found = cfs_hash_bd_lookup_intent(hs, bd, key, NULL,
+					  CFS_HS_LOOKUP_IT_PEEK);
+	return found;
+}
+EXPORT_SYMBOL(cfs_hash_bd_peek_locked);
+
+/**
+ * Return the existing entry for \a key in \a bd, or add \a hnode and
+ * return it.  An existing entry gets a reference unless \a noref is
+ * non-zero.
+ */
+struct hlist_node *
+cfs_hash_bd_findadd_locked(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+			   const void *key, struct hlist_node *hnode,
+			   int noref)
+{
+	int intent = CFS_HS_LOOKUP_IT_ADD;
+
+	if (!noref)
+		intent |= CFS_HS_LOOKUP_MASK_REF;
+
+	return cfs_hash_bd_lookup_intent(hs, bd, key, hnode, intent);
+}
+EXPORT_SYMBOL(cfs_hash_bd_findadd_locked);
+
+/**
+ * Delete an entry for \a key from \a bd and return it, or return NULL if
+ * there is none.  When \a hnode is NULL the first entry with \a key is
+ * deleted; otherwise only \a hnode itself can match.
+ */
+struct hlist_node *
+cfs_hash_bd_finddel_locked(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+			   const void *key, struct hlist_node *hnode)
+{
+	struct hlist_node *victim;
+
+	victim = cfs_hash_bd_lookup_intent(hs, bd, key, hnode,
+					   CFS_HS_LOOKUP_IT_FINDDEL);
+	return victim;
+}
+EXPORT_SYMBOL(cfs_hash_bd_finddel_locked);
+
+/**
+ * Lock the buckets referenced by up to \a n descriptors in \a bds.
+ *
+ * \a bds must be sorted in ascending order of bd_bucket->hsb_index.
+ * Several descriptors may point to the same bucket with different
+ * bd_offset values; such a bucket is locked only once, so the caller
+ * cannot deadlock against itself.
+ *
+ * \param excl	passed through to cfs_hash_bd_lock()
+ */
+static void
+cfs_hash_multi_bd_lock(cfs_hash_t *hs, cfs_hash_bd_t *bds,
+		       unsigned n, int excl)
+{
+	cfs_hash_bucket_t *prev = NULL;
+	/* unsigned like 'n' and like the index in the other multi_bd
+	 * helpers, so the loop never compares signed with unsigned */
+	unsigned int	   i;
+
+	cfs_hash_for_each_bd(bds, n, i) {
+		if (prev == bds[i].bd_bucket)
+			continue;
+
+		LASSERT(prev == NULL ||
+			prev->hsb_index < bds[i].bd_bucket->hsb_index);
+		cfs_hash_bd_lock(hs, &bds[i], excl);
+		prev = bds[i].bd_bucket;
+	}
+}
+
+/**
+ * Unlock the buckets referenced by up to \a n descriptors in \a bds,
+ * unlocking a bucket only once when consecutive descriptors share it.
+ *
+ * \param excl	passed through to cfs_hash_bd_unlock()
+ */
+static void
+cfs_hash_multi_bd_unlock(cfs_hash_t *hs, cfs_hash_bd_t *bds,
+			 unsigned n, int excl)
+{
+	cfs_hash_bucket_t *prev = NULL;
+	/* unsigned like 'n' and like the index in the other multi_bd
+	 * helpers, so the loop never compares signed with unsigned */
+	unsigned int	   i;
+
+	cfs_hash_for_each_bd(bds, n, i) {
+		if (prev != bds[i].bd_bucket) {
+			cfs_hash_bd_unlock(hs, &bds[i], excl);
+			prev = bds[i].bd_bucket;
+		}
+	}
+}
+
+/**
+ * Look \a key up in each descriptor of \a bds in turn and return the
+ * first entry found, with a reference taken; NULL if none has it.
+ */
+static struct hlist_node *
+cfs_hash_multi_bd_lookup_locked(cfs_hash_t *hs, cfs_hash_bd_t *bds,
+				unsigned n, const void *key)
+{
+	unsigned	   idx;
+
+	cfs_hash_for_each_bd(bds, n, idx) {
+		struct hlist_node *found;
+
+		found = cfs_hash_bd_lookup_locked(hs, &bds[idx], key);
+		if (found != NULL)
+			return found;
+	}
+	return NULL;
+}
+
+/**
+ * Return the existing entry for \a key from any descriptor in \a bds, or
+ * add \a hnode and return it.  An existing entry gets a reference unless
+ * \a noref is non-zero.
+ *
+ * When exactly one descriptor was walked the new node goes into it;
+ * otherwise the target descriptor is recomputed with cfs_hash_bd_get().
+ */
+static struct hlist_node *
+cfs_hash_multi_bd_findadd_locked(cfs_hash_t *hs,
+				 cfs_hash_bd_t *bds, unsigned n, const void *key,
+				 struct hlist_node *hnode, int noref)
+{
+	struct hlist_node  *found;
+	cfs_hash_bd_t	    target;
+	int		    intent;
+	unsigned	    idx;
+
+	LASSERT(hnode != NULL);
+	intent = noref ? CFS_HS_LOOKUP_IT_PEEK : CFS_HS_LOOKUP_IT_FIND;
+
+	cfs_hash_for_each_bd(bds, n, idx) {
+		found = cfs_hash_bd_lookup_intent(hs, &bds[idx], key,
+						  NULL, intent);
+		if (found != NULL)
+			return found;
+	}
+
+	if (idx == 1) { /* only one bucket */
+		cfs_hash_bd_add_locked(hs, &bds[0], hnode);
+		return hnode;
+	}
+
+	cfs_hash_bd_get(hs, key, &target);
+	cfs_hash_bd_add_locked(hs, &target, hnode);
+	return hnode;
+}
+
+/**
+ * Delete an entry for \a key from the first descriptor in \a bds that
+ * holds one and return it; NULL if none does.  \a hnode has the same
+ * meaning as in cfs_hash_bd_finddel_locked().
+ */
+static struct hlist_node *
+cfs_hash_multi_bd_finddel_locked(cfs_hash_t *hs, cfs_hash_bd_t *bds,
+				 unsigned n, const void *key,
+				 struct hlist_node *hnode)
+{
+	unsigned	   idx;
+
+	cfs_hash_for_each_bd(bds, n, idx) {
+		struct hlist_node *victim;
+
+		victim = cfs_hash_bd_finddel_locked(hs, &bds[idx], key, hnode);
+		if (victim != NULL)
+			return victim;
+	}
+	return NULL;
+}
+
+/*
+ * Normalize a pair of bucket descriptors in place:
+ *  - @bd2 empty: nothing to do;
+ *  - @bd1 empty: @bd2 is moved into @bd1 and @bd2 is cleared;
+ *  - both compare equal: @bd2 is cleared;
+ *  - @bd1 compares greater than @bd2: the two are swapped.
+ */
+static void
+cfs_hash_bd_order(cfs_hash_bd_t *bd1, cfs_hash_bd_t *bd2)
+{
+	int     cmp;
+
+	if (bd2->bd_bucket == NULL)
+		return;
+
+	if (bd1->bd_bucket != NULL) {
+		cmp = cfs_hash_bd_compare(bd1, bd2);
+		if (cmp > 0) { /* swap bd1 and bd2 */
+			cfs_hash_bd_t saved = *bd1;
+
+			*bd1 = *bd2;
+			*bd2 = saved;
+		} else if (cmp == 0) {
+			bd2->bd_bucket = NULL;
+		}
+		return;
+	}
+
+	/* first slot is empty: promote the second one */
+	*bd1 = *bd2;
+	bd2->bd_bucket = NULL;
+}
+
+/*
+ * Fill @bds (two entries) with the bucket descriptor(s) for @key.
+ *
+ * bds[0] is derived from hs->hs_buckets using hs->hs_cur_bits.  When
+ * hs->hs_rehash_buckets is set, bds[1] is derived from that table using
+ * hs->hs_rehash_bits and the pair is passed through cfs_hash_bd_order();
+ * otherwise bds[1].bd_bucket is set to NULL.
+ */
+void
+cfs_hash_dual_bd_get(cfs_hash_t *hs, const void *key, cfs_hash_bd_t *bds)
+{
+	/* NB: caller should hold hs_lock.rw if REHASH is set */
+	cfs_hash_bd_from_key(hs, hs->hs_buckets,
+			     hs->hs_cur_bits, key, &bds[0]);
+	if (likely(hs->hs_rehash_buckets == NULL)) {
+		/* no rehash or not rehashing */
+		bds[1].bd_bucket = NULL;
+		return;
+	}
+
+	LASSERT(hs->hs_rehash_bits != 0);
+	cfs_hash_bd_from_key(hs, hs->hs_rehash_buckets,
+			     hs->hs_rehash_bits, key, &bds[1]);
+
+	cfs_hash_bd_order(&bds[0], &bds[1]);
+}
+EXPORT_SYMBOL(cfs_hash_dual_bd_get);
+
+/* Two-descriptor front end for cfs_hash_multi_bd_lock(). */
+void
+cfs_hash_dual_bd_lock(cfs_hash_t *hs, cfs_hash_bd_t *bds, int excl)
+{
+	cfs_hash_multi_bd_lock(hs, bds, 2, excl);
+}
+EXPORT_SYMBOL(cfs_hash_dual_bd_lock);
+
+/* Two-descriptor front end for cfs_hash_multi_bd_unlock(). */
+void
+cfs_hash_dual_bd_unlock(cfs_hash_t *hs, cfs_hash_bd_t *bds, int excl)
+{
+	cfs_hash_multi_bd_unlock(hs, bds, 2, excl);
+}
+EXPORT_SYMBOL(cfs_hash_dual_bd_unlock);
+
+/* Two-descriptor front end for cfs_hash_multi_bd_lookup_locked(). */
+struct hlist_node *
+cfs_hash_dual_bd_lookup_locked(cfs_hash_t *hs, cfs_hash_bd_t *bds,
+			       const void *key)
+{
+	return cfs_hash_multi_bd_lookup_locked(hs, bds, 2, key);
+}
+EXPORT_SYMBOL(cfs_hash_dual_bd_lookup_locked);
+
+/* Two-descriptor front end for cfs_hash_multi_bd_findadd_locked(). */
+struct hlist_node *
+cfs_hash_dual_bd_findadd_locked(cfs_hash_t *hs, cfs_hash_bd_t *bds,
+				const void *key, struct hlist_node *hnode,
+				int noref)
+{
+	return cfs_hash_multi_bd_findadd_locked(hs, bds, 2, key,
+						hnode, noref);
+}
+EXPORT_SYMBOL(cfs_hash_dual_bd_findadd_locked);
+
+/* Two-descriptor front end for cfs_hash_multi_bd_finddel_locked(). */
+struct hlist_node *
+cfs_hash_dual_bd_finddel_locked(cfs_hash_t *hs, cfs_hash_bd_t *bds,
+				const void *key, struct hlist_node *hnode)
+{
+	return cfs_hash_multi_bd_finddel_locked(hs, bds, 2, key, hnode);
+}
+EXPORT_SYMBOL(cfs_hash_dual_bd_finddel_locked);
+
+/*
+ * Free the non-NULL buckets in slots [@prev_size, @size) of @buckets,
+ * each of @bkt_size bytes, then free the pointer array itself, which is
+ * sized for @size entries.  Slots below @prev_size are left untouched.
+ */
+static void
+cfs_hash_buckets_free(cfs_hash_bucket_t **buckets,
+		      int bkt_size, int prev_size, int size)
+{
+	int     i;
+
+	for (i = prev_size; i < size; i++) {
+		if (buckets[i] != NULL)
+			LIBCFS_FREE(buckets[i], bkt_size);
+	}
+
+	LIBCFS_FREE(buckets, sizeof(buckets[0]) * size);
+}
+
+/*
+ * Create or grow bucket memory. Return old_buckets if no allocation was
+ * needed, the newly allocated buckets if allocation was needed and
+ * successful, and NULL on error.
+ *
+ * The new pointer array holds @new_size entries.  The first
+ * min(@old_size, @new_size) pointers are copied from @old_bkts; slots
+ * from @old_size up to @new_size get freshly allocated, initialized
+ * buckets.  @old_bkts itself is not freed here.
+ */
+static cfs_hash_bucket_t **
+cfs_hash_buckets_realloc(cfs_hash_t *hs, cfs_hash_bucket_t **old_bkts,
+			 unsigned int old_size, unsigned int new_size)
+{
+	cfs_hash_bucket_t **new_bkts;
+	int		 i;
+
+	LASSERT(old_size == 0 || old_bkts != NULL);
+
+	if (old_bkts != NULL && old_size == new_size)
+		return old_bkts;
+
+	LIBCFS_ALLOC(new_bkts, sizeof(new_bkts[0]) * new_size);
+	if (new_bkts == NULL)
+		return NULL;
+
+	if (old_bkts != NULL) {
+		memcpy(new_bkts, old_bkts,
+		       min(old_size, new_size) * sizeof(*old_bkts));
+	}
+
+	/* only runs when growing; a shrink reuses the copied buckets */
+	for (i = old_size; i < new_size; i++) {
+		struct hlist_head *hhead;
+		cfs_hash_bd_t     bd;
+
+		LIBCFS_ALLOC(new_bkts[i], cfs_hash_bkt_size(hs));
+		if (new_bkts[i] == NULL) {
+			/* undo: drop the buckets added so far and the array */
+			cfs_hash_buckets_free(new_bkts, cfs_hash_bkt_size(hs),
+					      old_size, new_size);
+			return NULL;
+		}
+
+		new_bkts[i]->hsb_index   = i;
+		new_bkts[i]->hsb_version = 1;  /* shouldn't be zero */
+		new_bkts[i]->hsb_depmax  = -1; /* unknown */
+		bd.bd_bucket = new_bkts[i];
+		cfs_hash_bd_for_each_hlist(hs, &bd, hhead)
+			INIT_HLIST_HEAD(hhead);
+
+		/* no per-bucket lock to set up in these configurations */
+		if (cfs_hash_with_no_lock(hs) ||
+		    cfs_hash_with_no_bktlock(hs))
+			continue;
+
+		if (cfs_hash_with_rw_bktlock(hs))
+			rwlock_init(&new_bkts[i]->hsb_lock.rw);
+		else if (cfs_hash_with_spin_bktlock(hs))
+			spin_lock_init(&new_bkts[i]->hsb_lock.spin);
+		else
+			LBUG(); /* invalid use-case */
+	}
+	return new_bkts;
+}
+
+/**
+ * Initialize new libcfs hash, where:
+ * @name     - Descriptive hash name
+ * @cur_bits - Initial hash table size, in bits
+ * @max_bits - Maximum allowed hash table resize, in bits
+ * @ops      - Registered hash table operations
+ * @flags    - CFS_HASH_REHASH enable dynamic hash resizing
+ *	   - CFS_HASH_SORT enable chained hash sort
+ */
+static int cfs_hash_rehash_worker(cfs_workitem_t *wi);
+
+#if CFS_HASH_DEBUG_LEVEL >= CFS_HASH_DEBUG_1
+/*
+ * Work-item handler: print the recorded maximum chain depth of the hash
+ * that embeds @wi as hs_dep_wi, then clear hs_dep_bits to mark the work
+ * item as done.  Always returns 0.
+ */
+static int cfs_hash_dep_print(cfs_workitem_t *wi)
+{
+	cfs_hash_t *hs = container_of(wi, cfs_hash_t, hs_dep_wi);
+	int	 dep;
+	int	 bkt;
+	int	 off;
+	int	 bits;
+
+	/* snapshot the fields under the lock, print outside of it */
+	spin_lock(&hs->hs_dep_lock);
+	dep  = hs->hs_dep_max;
+	bkt  = hs->hs_dep_bkt;
+	off  = hs->hs_dep_off;
+	bits = hs->hs_dep_bits;
+	spin_unlock(&hs->hs_dep_lock);
+
+	LCONSOLE_WARN("#### HASH %s (bits: %d): max depth %d at bucket %d/%d\n",
+		      hs->hs_name, bits, dep, bkt, off);
+	spin_lock(&hs->hs_dep_lock);
+	hs->hs_dep_bits = 0; /* mark as workitem done */
+	spin_unlock(&hs->hs_dep_lock);
+	return 0;
+}
+
+/* Set up hs_dep_lock and register cfs_hash_dep_print() as the handler of hs_dep_wi. */
+static void cfs_hash_depth_wi_init(cfs_hash_t *hs)
+{
+	spin_lock_init(&hs->hs_dep_lock);
+	cfs_wi_init(&hs->hs_dep_wi, hs, cfs_hash_dep_print);
+}
+
+/*
+ * Cancel the depth-report work item.  If cfs_wi_deschedule() returns
+ * non-zero there is nothing more to do; otherwise poll, yielding via
+ * cond_resched(), until hs_dep_bits has been reset to 0 (which
+ * cfs_hash_dep_print() does when it finishes).
+ */
+static void cfs_hash_depth_wi_cancel(cfs_hash_t *hs)
+{
+	if (cfs_wi_deschedule(cfs_sched_rehash, &hs->hs_dep_wi))
+		return;
+
+	spin_lock(&hs->hs_dep_lock);
+	while (hs->hs_dep_bits != 0) {
+		/* drop the lock while yielding so the handler can take it */
+		spin_unlock(&hs->hs_dep_lock);
+		cond_resched();
+		spin_lock(&hs->hs_dep_lock);
+	}
+	spin_unlock(&hs->hs_dep_lock);
+}
+
+#else /* CFS_HASH_DEBUG_LEVEL < CFS_HASH_DEBUG_1 */
+
+/* Depth reporting is compiled out at this debug level: both hooks are no-ops. */
+static inline void cfs_hash_depth_wi_init(cfs_hash_t *hs) {}
+static inline void cfs_hash_depth_wi_cancel(cfs_hash_t *hs) {}
+
+#endif /* CFS_HASH_DEBUG_LEVEL >= CFS_HASH_DEBUG_1 */
+
+/**
+ * Allocate and initialize a new libcfs hash.
+ *
+ * @name        - descriptive hash name; copied and truncated to fit
+ * @cur_bits    - initial table size in bits; also recorded as the minimum
+ * @max_bits    - maximum table size in bits (must be < 31)
+ * @bkt_bits    - stored as hs_bkt_bits; must not exceed @cur_bits
+ * @extra_bytes - stored as hs_extra_bytes
+ * @min_theta   - passed to __cfs_hash_set_theta() when rehash is enabled
+ * @max_theta   - passed to __cfs_hash_set_theta() when rehash is enabled
+ * @ops         - hash operations; hs_key, hs_hash, hs_object, hs_keycmp,
+ *                hs_get and hs_put_locked are mandatory
+ * @flags       - CFS_HASH_* flags; CFS_HASH_REHASH implies
+ *                CFS_HASH_COUNTER, CFS_HASH_BIGNAME selects the longer
+ *                name buffer
+ *
+ * Returns the new hash with its reference count set to 1, or NULL if an
+ * allocation failed.
+ */
+cfs_hash_t *
+cfs_hash_create(char *name, unsigned cur_bits, unsigned max_bits,
+		unsigned bkt_bits, unsigned extra_bytes,
+		unsigned min_theta, unsigned max_theta,
+		cfs_hash_ops_t *ops, unsigned flags)
+{
+	cfs_hash_t *hs;
+	int	 len;
+
+	ENTRY;
+
+	CLASSERT(CFS_HASH_THETA_BITS < 15);
+
+	LASSERT(name != NULL);
+	LASSERT(ops != NULL);
+	LASSERT(ops->hs_key);
+	LASSERT(ops->hs_hash);
+	LASSERT(ops->hs_object);
+	LASSERT(ops->hs_keycmp);
+	LASSERT(ops->hs_get != NULL);
+	LASSERT(ops->hs_put_locked != NULL);
+
+	if ((flags & CFS_HASH_REHASH) != 0)
+		flags |= CFS_HASH_COUNTER; /* must have counter */
+
+	LASSERT(cur_bits > 0);
+	LASSERT(cur_bits >= bkt_bits);
+	LASSERT(max_bits >= cur_bits && max_bits < 31);
+	LASSERT(ergo((flags & CFS_HASH_REHASH) == 0, cur_bits == max_bits));
+	LASSERT(ergo((flags & CFS_HASH_REHASH) != 0,
+		     (flags & CFS_HASH_NO_LOCK) == 0));
+	LASSERT(ergo((flags & CFS_HASH_REHASH_KEY) != 0,
+		      ops->hs_keycpy != NULL));
+
+	/* the name buffer is the variable-length tail of cfs_hash_t */
+	len = (flags & CFS_HASH_BIGNAME) == 0 ?
+	      CFS_HASH_NAME_LEN : CFS_HASH_BIGNAME_LEN;
+	LIBCFS_ALLOC(hs, offsetof(cfs_hash_t, hs_name[len]));
+	if (hs == NULL)
+		RETURN(NULL);
+
+	/* strncpy() may leave the buffer unterminated; terminate by hand */
+	strncpy(hs->hs_name, name, len);
+	hs->hs_name[len - 1] = '\0';
+	hs->hs_flags = flags;
+
+	atomic_set(&hs->hs_refcount, 1);
+	atomic_set(&hs->hs_count, 0);
+
+	cfs_hash_lock_setup(hs);
+	cfs_hash_hlist_setup(hs);
+
+	hs->hs_cur_bits = (__u8)cur_bits;
+	hs->hs_min_bits = (__u8)cur_bits;
+	hs->hs_max_bits = (__u8)max_bits;
+	hs->hs_bkt_bits = (__u8)bkt_bits;
+
+	hs->hs_ops	 = ops;
+	hs->hs_extra_bytes = extra_bytes;
+	hs->hs_rehash_bits = 0;
+	cfs_wi_init(&hs->hs_rehash_wi, hs, cfs_hash_rehash_worker);
+	cfs_hash_depth_wi_init(hs);
+
+	if (cfs_hash_with_rehash(hs))
+		__cfs_hash_set_theta(hs, min_theta, max_theta);
+
+	hs->hs_buckets = cfs_hash_buckets_realloc(hs, NULL, 0,
+						  CFS_HASH_NBKT(hs));
+	if (hs->hs_buckets == NULL) {
+		LIBCFS_FREE(hs, offsetof(cfs_hash_t, hs_name[len]));
+		RETURN(NULL);
+	}
+
+	/* pair ENTRY with RETURN on the success path as well */
+	RETURN(hs);
+}
+EXPORT_SYMBOL(cfs_hash_create);
+
+/**
+ * Cleanup libcfs hash @hs.
+ *
+ * Marks the hash as exiting, cancels any rehash and the depth work item,
+ * removes every remaining item (calling cfs_hash_exit() on each), then
+ * frees the bucket table and @hs itself.  Asserts if items remain in a
+ * hash created with the assert-empty option.
+ */
+static void
+cfs_hash_destroy(cfs_hash_t *hs)
+{
+	struct hlist_node     *hnode;
+	struct hlist_node     *pos;
+	cfs_hash_bd_t	 bd;
+	int		   i;
+	ENTRY;
+
+	LASSERT(hs != NULL);
+	LASSERT(!cfs_hash_is_exiting(hs) &&
+		!cfs_hash_is_iterating(hs));
+
+	/**
+	 * prohibit further rehashes, don't need any lock because
+	 * I'm the only (last) one can change it.
+	 */
+	hs->hs_exiting = 1;
+	if (cfs_hash_with_rehash(hs))
+		cfs_hash_rehash_cancel(hs);
+
+	cfs_hash_depth_wi_cancel(hs);
+	/* rehash should be done/canceled */
+	LASSERT(hs->hs_buckets != NULL &&
+		hs->hs_rehash_buckets == NULL);
+
+	cfs_hash_for_each_bucket(hs, &bd, i) {
+		struct hlist_head *hhead;
+
+		LASSERT(bd.bd_bucket != NULL);
+		/* no need to take this lock, just for consistent code */
+		cfs_hash_bd_lock(hs, &bd, 1);
+
+		cfs_hash_bd_for_each_hlist(hs, &bd, hhead) {
+			hlist_for_each_safe(hnode, pos, hhead) {
+				LASSERTF(!cfs_hash_with_assert_empty(hs),
+					 "hash %s bucket %u(%u) is not "
+					 " empty: %u items left\n",
+					 hs->hs_name, bd.bd_bucket->hsb_index,
+					 bd.bd_offset, bd.bd_bucket->hsb_count);
+				/* can't validate the key here, because we
+				 * may have interrupted a rehash */
+				cfs_hash_bd_del_locked(hs, &bd, hnode);
+				cfs_hash_exit(hs, hnode);
+			}
+		}
+		LASSERT(bd.bd_bucket->hsb_count == 0);
+		cfs_hash_bd_unlock(hs, &bd, 1);
+		cond_resched();
+	}
+
+	LASSERT(atomic_read(&hs->hs_count) == 0);
+
+	cfs_hash_buckets_free(hs->hs_buckets, cfs_hash_bkt_size(hs),
+			      0, CFS_HASH_NBKT(hs));
+	/* @i is reused for the name length that sized the allocation */
+	i = cfs_hash_with_bigname(hs) ?
+	    CFS_HASH_BIGNAME_LEN : CFS_HASH_NAME_LEN;
+	LIBCFS_FREE(hs, offsetof(cfs_hash_t, hs_name[i]));
+
+	EXIT;
+}
+
+/*
+ * Take a reference on @hs unless its reference count is already zero.
+ * Returns @hs when the reference was taken, NULL otherwise.
+ */
+cfs_hash_t *cfs_hash_getref(cfs_hash_t *hs)
+{
+	return atomic_inc_not_zero(&hs->hs_refcount) ? hs : NULL;
+}
+EXPORT_SYMBOL(cfs_hash_getref);
+
+/* Drop a reference on @hs; the hash is destroyed when the count reaches zero. */
+void cfs_hash_putref(cfs_hash_t *hs)
+{
+	if (atomic_dec_and_test(&hs->hs_refcount))
+		cfs_hash_destroy(hs);
+}
+EXPORT_SYMBOL(cfs_hash_putref);
+
+/*
+ * Decide whether @hs should be rehashed and to what size.
+ *
+ * Returns:
+ *   -EOPNOTSUPP  the hash has no lock or rehash is not enabled
+ *   -ESRCH       the hash is exiting
+ *   -EALREADY    a rehash is already in progress
+ *   -EAGAIN      the hash is being iterated
+ *   cur_bits + 1 theta exceeds hs_max_theta and the table may still grow
+ *   cur_bits - 1 shrinking is enabled, theta is below hs_min_theta and
+ *                the table is above its minimum size
+ *   0            no rehash is needed
+ */
+static inline int
+cfs_hash_rehash_bits(cfs_hash_t *hs)
+{
+	if (cfs_hash_with_no_lock(hs) ||
+	    !cfs_hash_with_rehash(hs))
+		return -EOPNOTSUPP;
+
+	if (unlikely(cfs_hash_is_exiting(hs)))
+		return -ESRCH;
+
+	if (unlikely(cfs_hash_is_rehashing(hs)))
+		return -EALREADY;
+
+	if (unlikely(cfs_hash_is_iterating(hs)))
+		return -EAGAIN;
+
+	/* XXX: need to handle case with max_theta != 2.0
+	 *      and the case with min_theta != 0.5 */
+	if ((hs->hs_cur_bits < hs->hs_max_bits) &&
+	    (__cfs_hash_theta(hs) > hs->hs_max_theta))
+		return hs->hs_cur_bits + 1;
+
+	if (!cfs_hash_with_shrink(hs))
+		return 0;
+
+	if ((hs->hs_cur_bits > hs->hs_min_bits) &&
+	    (__cfs_hash_theta(hs) < hs->hs_min_theta))
+		return hs->hs_cur_bits - 1;
+
+	return 0;
+}
+
+/**
+ * Tell whether a rehash may run inline in the caller's context.
+ * It may not when:
+ * - the user asked for non-blocking changes (add/del) on the hash table
+ * - the table holds CFS_HASH_LOOP_HOG elements or more
+ */
+static inline int
+cfs_hash_rehash_inline(cfs_hash_t *hs)
+{
+	if (cfs_hash_with_nblk_change(hs))
+		return 0;
+
+	return atomic_read(&hs->hs_count) < CFS_HASH_LOOP_HOG;
+}
+
+/**
+ * Add item @hnode to libcfs hash @hs using @key.  The registered
+ * ops->hs_get function will be called when the item is added.
+ *
+ * @hnode must not already be hashed.  No duplicate check is done here.
+ * After the insertion a rehash is started if cfs_hash_rehash_bits()
+ * asks for one.
+ */
+void
+cfs_hash_add(cfs_hash_t *hs, const void *key, struct hlist_node *hnode)
+{
+	cfs_hash_bd_t   bd;
+	int	     bits;
+
+	LASSERT(hlist_unhashed(hnode));
+
+	cfs_hash_lock(hs, 0);
+	cfs_hash_bd_get_and_lock(hs, key, &bd, 1);
+
+	cfs_hash_key_validate(hs, key, hnode);
+	cfs_hash_bd_add_locked(hs, &bd, hnode);
+
+	cfs_hash_bd_unlock(hs, &bd, 1);
+
+	/* evaluated under the shared hash lock, acted on after dropping it */
+	bits = cfs_hash_rehash_bits(hs);
+	cfs_hash_unlock(hs, 0);
+	if (bits > 0)
+		cfs_hash_rehash(hs, cfs_hash_rehash_inline(hs));
+}
+EXPORT_SYMBOL(cfs_hash_add);
+
+/*
+ * Insert @hnode under @key unless the lookup already finds an entry.
+ *
+ * Returns @hnode when it was inserted, otherwise the node reported by
+ * cfs_hash_dual_bd_findadd_locked().  @noref is forwarded to that
+ * helper.  A rehash is considered only when a new item was added.
+ */
+static struct hlist_node *
+cfs_hash_find_or_add(cfs_hash_t *hs, const void *key,
+		     struct hlist_node *hnode, int noref)
+{
+	struct hlist_node *ehnode;
+	cfs_hash_bd_t     bds[2];
+	int	       bits = 0;
+
+	LASSERT(hlist_unhashed(hnode));
+
+	cfs_hash_lock(hs, 0);
+	cfs_hash_dual_bd_get_and_lock(hs, key, bds, 1);
+
+	cfs_hash_key_validate(hs, key, hnode);
+	ehnode = cfs_hash_dual_bd_findadd_locked(hs, bds, key,
+						 hnode, noref);
+	cfs_hash_dual_bd_unlock(hs, bds, 1);
+
+	if (ehnode == hnode) /* new item added */
+		bits = cfs_hash_rehash_bits(hs);
+	cfs_hash_unlock(hs, 0);
+	if (bits > 0)
+		cfs_hash_rehash(hs, cfs_hash_rehash_inline(hs));
+
+	return ehnode;
+}
+
+/**
+ * Add item @hnode to libcfs hash @hs using @key, refusing duplicates.
+ * The registered ops->hs_get function will be called if the item was
+ * added.
+ * Returns 0 on success or -EALREADY on key collisions.
+ */
+int
+cfs_hash_add_unique(cfs_hash_t *hs, const void *key, struct hlist_node *hnode)
+{
+	struct hlist_node *ehnode;
+
+	/* noref = 1: no reference is requested on an existing entry */
+	ehnode = cfs_hash_find_or_add(hs, key, hnode, 1);
+	if (ehnode == hnode)
+		return 0;
+
+	return -EALREADY;
+}
+EXPORT_SYMBOL(cfs_hash_add_unique);
+
+/**
+ * Add item @hnode to libcfs hash @hs using @key.  If this @key
+ * already exists in the hash then ops->hs_get will be called on the
+ * conflicting entry and that entry will be returned to the caller.
+ * Otherwise ops->hs_get is called on the item which was added.
+ *
+ * The return value is the object obtained through cfs_hash_object()
+ * for whichever node ended up in the table.
+ */
+void *
+cfs_hash_findadd_unique(cfs_hash_t *hs, const void *key,
+			struct hlist_node *hnode)
+{
+	/* noref = 0: request a reference on an existing entry */
+	hnode = cfs_hash_find_or_add(hs, key, hnode, 0);
+
+	return cfs_hash_object(hs, hnode);
+}
+EXPORT_SYMBOL(cfs_hash_findadd_unique);
+
+/**
+ * Delete item @hnode from the libcfs hash @hs using @key.  The @key
+ * is required to ensure the correct hash bucket is locked since there
+ * is no direct linkage from the item to the bucket.  The object
+ * removed from the hash will be returned and ops->hs_put is called
+ * on the removed object.
+ *
+ * When @hnode is NULL the entry to remove is located by @key alone.
+ * Returns NULL when @hnode is NULL and the keyed lookup removes nothing.
+ */
+void *
+cfs_hash_del(cfs_hash_t *hs, const void *key, struct hlist_node *hnode)
+{
+	void	   *obj  = NULL;
+	int	     bits = 0;
+	cfs_hash_bd_t   bds[2];
+
+	cfs_hash_lock(hs, 0);
+	cfs_hash_dual_bd_get_and_lock(hs, key, bds, 1);
+
+	/* NB: do nothing if @hnode is not in hash table */
+	if (hnode == NULL || !hlist_unhashed(hnode)) {
+		if (bds[1].bd_bucket == NULL && hnode != NULL) {
+			/* single candidate bucket and a known node */
+			cfs_hash_bd_del_locked(hs, &bds[0], hnode);
+		} else {
+			hnode = cfs_hash_dual_bd_finddel_locked(hs, bds,
+								key, hnode);
+		}
+	}
+
+	if (hnode != NULL) {
+		obj  = cfs_hash_object(hs, hnode);
+		bits = cfs_hash_rehash_bits(hs);
+	}
+
+	cfs_hash_dual_bd_unlock(hs, bds, 1);
+	cfs_hash_unlock(hs, 0);
+	if (bits > 0)
+		cfs_hash_rehash(hs, cfs_hash_rehash_inline(hs));
+
+	return obj;
+}
+EXPORT_SYMBOL(cfs_hash_del);
+
+/**
+ * Delete item given @key in libcfs hash @hs.  The first @key found in
+ * the hash will be removed; if the key exists multiple times in the hash
+ * @hs this function must be called once per key.  The removed object
+ * will be returned and ops->hs_put is called on the removed object.
+ *
+ * Implemented as cfs_hash_del() with a NULL node.
+ */
+void *
+cfs_hash_del_key(cfs_hash_t *hs, const void *key)
+{
+	return cfs_hash_del(hs, key, NULL);
+}
+EXPORT_SYMBOL(cfs_hash_del_key);
+
+/**
+ * Lookup an item using @key in the libcfs hash @hs and return it.
+ * If the @key is found in the hash hs->hs_get() is called and the
+ * matching object is returned.  It is the caller's responsibility
+ * to call the counterpart ops->hs_put using the cfs_hash_put() macro
+ * when finished with the object.  If the @key was not found
+ * in the hash @hs NULL is returned.
+ */
+void *
+cfs_hash_lookup(cfs_hash_t *hs, const void *key)
+{
+	void		 *obj = NULL;
+	struct hlist_node     *hnode;
+	cfs_hash_bd_t	 bds[2];
+
+	cfs_hash_lock(hs, 0);
+	/* last argument 0: bucket locks are taken in non-exclusive mode */
+	cfs_hash_dual_bd_get_and_lock(hs, key, bds, 0);
+
+	hnode = cfs_hash_dual_bd_lookup_locked(hs, bds, key);
+	if (hnode != NULL)
+		obj = cfs_hash_object(hs, hnode);
+
+	cfs_hash_dual_bd_unlock(hs, bds, 0);
+	cfs_hash_unlock(hs, 0);
+
+	return obj;
+}
+EXPORT_SYMBOL(cfs_hash_lookup);
+
+/*
+ * Prologue for iterating @hs.  Does nothing unless rehash is enabled;
+ * otherwise flags the hash as iterating, bumps hs_iterators under the
+ * exclusive hash lock and cancels a rehash that is in progress.
+ */
+static void
+cfs_hash_for_each_enter(cfs_hash_t *hs)
+{
+	LASSERT(!cfs_hash_is_exiting(hs));
+
+	if (!cfs_hash_with_rehash(hs))
+		return;
+	/*
+	 * NB: there is a race on cfs_hash_t::hs_iterating, but it doesn't
+	 * matter because it's just an unreliable signal to the
+	 * rehash-thread, which will try to finish the rehash ASAP when
+	 * seeing this.
+	 */
+	hs->hs_iterating = 1;
+
+	cfs_hash_lock(hs, 1);
+	hs->hs_iterators++;
+
+	/* NB: iteration is mostly called by a service thread; we prefer
+	 * to cancel a pending rehash request instead of blocking the
+	 * service thread, and relaunch the rehash request after the
+	 * iteration */
+	if (cfs_hash_is_rehashing(hs))
+		cfs_hash_rehash_cancel_locked(hs);
+	cfs_hash_unlock(hs, 1);
+}
+
+/*
+ * Epilogue for iterating @hs.  Does nothing unless rehash is enabled;
+ * otherwise drops hs_iterators, clears hs_iterating when no iterator is
+ * left and starts a rehash if cfs_hash_rehash_bits() asks for one.
+ */
+static void
+cfs_hash_for_each_exit(cfs_hash_t *hs)
+{
+	int remained;
+	int bits;
+
+	if (!cfs_hash_with_rehash(hs))
+		return;
+	cfs_hash_lock(hs, 1);
+	remained = --hs->hs_iterators;
+	bits = cfs_hash_rehash_bits(hs);
+	cfs_hash_unlock(hs, 1);
+	/* NB: there is a race on cfs_hash_t::hs_iterating; it is only an
+	 * unreliable hint to the rehash thread */
+	if (remained == 0)
+		hs->hs_iterating = 0;
+	if (bits > 0) {
+		/* rehash inline only for tables below CFS_HASH_LOOP_HOG */
+		cfs_hash_rehash(hs, atomic_read(&hs->hs_count) <
+				    CFS_HASH_LOOP_HOG);
+	}
+}
+
+/**
+ * For each item in the libcfs hash @hs call the passed callback @func
+ * and pass to it as an argument each hash item and the private @data.
+ *
+ * a) the function itself may sleep: it calls cond_resched() between
+ *    buckets once CFS_HASH_LOOP_HOG items have been visited;
+ * b) during the callback:
+ *    . the bucket lock is held so the callback must never sleep.
+ *    . if @remove_safe is true, the bucket is locked exclusively and the
+ *      user can remove the current item with cfs_hash_bd_del_locked
+ *
+ * A non-zero return from @func stops the iteration.  When @func is NULL
+ * only the per-bucket hsb_count values are summed.  Returns the number
+ * of items visited (or summed).
+ */
+static __u64
+cfs_hash_for_each_tight(cfs_hash_t *hs, cfs_hash_for_each_cb_t func,
+			void *data, int remove_safe)
+{
+	struct hlist_node     *hnode;
+	struct hlist_node     *pos;
+	cfs_hash_bd_t	 bd;
+	__u64		 count = 0;
+	int		   excl  = !!remove_safe;
+	int		   loop  = 0;
+	int		   i;
+	ENTRY;
+
+	cfs_hash_for_each_enter(hs);
+
+	cfs_hash_lock(hs, 0);
+	LASSERT(!cfs_hash_is_rehashing(hs));
+
+	cfs_hash_for_each_bucket(hs, &bd, i) {
+		struct hlist_head *hhead;
+
+		cfs_hash_bd_lock(hs, &bd, excl);
+		if (func == NULL) { /* only glimpse size */
+			count += bd.bd_bucket->hsb_count;
+			cfs_hash_bd_unlock(hs, &bd, excl);
+			continue;
+		}
+
+		cfs_hash_bd_for_each_hlist(hs, &bd, hhead) {
+			hlist_for_each_safe(hnode, pos, hhead) {
+				cfs_hash_bucket_validate(hs, &bd, hnode);
+				count++;
+				loop++;
+				if (func(hs, &bd, hnode, data)) {
+					cfs_hash_bd_unlock(hs, &bd, excl);
+					goto out;
+				}
+			}
+		}
+		cfs_hash_bd_unlock(hs, &bd, excl);
+		if (loop < CFS_HASH_LOOP_HOG)
+			continue;
+		/* visited many items: drop the hash lock and yield */
+		loop = 0;
+		cfs_hash_unlock(hs, 0);
+		cond_resched();
+		cfs_hash_lock(hs, 0);
+	}
+ out:
+	cfs_hash_unlock(hs, 0);
+
+	cfs_hash_for_each_exit(hs);
+	RETURN(count);
+}
+
+/* Argument bundle handed to cfs_hash_cond_del_locked() through @data. */
+typedef struct {
+	cfs_hash_cond_opt_cb_t  func;	/* predicate: non-zero means delete */
+	void		   *arg;	/* opaque argument passed to @func */
+} cfs_hash_cond_arg_t;
+
+/*
+ * Iteration callback for cfs_hash_cond_del(): removes @hnode from @bd
+ * when the predicate stored in @data accepts its object.  Always returns
+ * 0 so that the iteration continues.
+ */
+static int
+cfs_hash_cond_del_locked(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+			 struct hlist_node *hnode, void *data)
+{
+	cfs_hash_cond_arg_t *cond = data;
+	void		    *obj  = cfs_hash_object(hs, hnode);
+
+	if (!cond->func(obj, cond->arg))
+		return 0;
+
+	cfs_hash_bd_del_locked(hs, bd, hnode);
+	return 0;
+}
+
+/**
+ * Delete every item from the libcfs hash @hs for which @func returns
+ * true.  @func is called with the item's object and @data.
+ * The bucket write lock is held while each bucket is scanned, so that
+ * no object can be referenced during the scan.
+ */
+void
+cfs_hash_cond_del(cfs_hash_t *hs, cfs_hash_cond_opt_cb_t func, void *data)
+{
+	cfs_hash_cond_arg_t arg = {
+		.func   = func,
+		.arg    = data,
+	};
+
+	/* remove_safe = 1: buckets are locked exclusively */
+	cfs_hash_for_each_tight(hs, cfs_hash_cond_del_locked, &arg, 1);
+}
+EXPORT_SYMBOL(cfs_hash_cond_del);
+
+/*
+ * Call @func on every item of @hs via cfs_hash_for_each_tight() with
+ * remove_safe = 0, i.e. buckets are locked in non-exclusive mode.
+ */
+void
+cfs_hash_for_each(cfs_hash_t *hs,
+		  cfs_hash_for_each_cb_t func, void *data)
+{
+	cfs_hash_for_each_tight(hs, func, data, 0);
+}
+EXPORT_SYMBOL(cfs_hash_for_each);
+
+/*
+ * Call @func on every item of @hs via cfs_hash_for_each_tight() with
+ * remove_safe = 1, i.e. buckets are locked exclusively so the callback
+ * may remove the current item.
+ */
+void
+cfs_hash_for_each_safe(cfs_hash_t *hs,
+		       cfs_hash_for_each_cb_t func, void *data)
+{
+	cfs_hash_for_each_tight(hs, func, data, 1);
+}
+EXPORT_SYMBOL(cfs_hash_for_each_safe);
+
+/*
+ * Iteration callback for cfs_hash_is_empty(): being called at all means
+ * the hash holds an item, so clear the int flag behind @data and stop.
+ */
+static int
+cfs_hash_peek(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+	      struct hlist_node *hnode, void *data)
+{
+	*(int *)data = 0;
+	return 1; /* return 1 to break the loop */
+}
+
+/* Return 1 if the iteration over @hs visits no item, 0 otherwise. */
+int
+cfs_hash_is_empty(cfs_hash_t *hs)
+{
+	int empty = 1;
+
+	/* cfs_hash_peek() clears @empty on the first item it is given */
+	cfs_hash_for_each_tight(hs, cfs_hash_peek, &empty, 0);
+	return empty;
+}
+EXPORT_SYMBOL(cfs_hash_is_empty);
+
+/*
+ * Return the number of items in @hs: the hs_count counter when the hash
+ * maintains one, otherwise the sum gathered by walking every bucket.
+ */
+__u64
+cfs_hash_size_get(cfs_hash_t *hs)
+{
+	if (cfs_hash_with_counter(hs))
+		return atomic_read(&hs->hs_count);
+
+	return cfs_hash_for_each_tight(hs, NULL, NULL, 0);
+}
+EXPORT_SYMBOL(cfs_hash_size_get);
+
+/*
+ * cfs_hash_for_each_relax:
+ * Iterate the hash table and call @func on each item without
+ * any lock. This function can't guarantee to finish iteration
+ * if these features are enabled:
+ *
+ *  a. if rehash_key is enabled, an item can be moved from
+ *     one bucket to another bucket
+ *  b. user can remove non-zero-ref item from hash-table,
+ *     so the item can be removed from hash-table, even worse,
+ *     it's possible that user changed key and insert to another
+ *     hash bucket.
+ * there's no way for us to finish iteration correctly on previous
+ * two cases, so iteration of a chain has to be stopped on change.
+ *
+ * Each item is pinned with cfs_hash_get() before the locks are dropped
+ * for the callback.  A non-zero return from @func ends the whole
+ * iteration.  Returns the number of items @func was called on.
+ */
+static int
+cfs_hash_for_each_relax(cfs_hash_t *hs, cfs_hash_for_each_cb_t func, void *data)
+{
+	struct hlist_node *hnode;
+	struct hlist_node *tmp;
+	cfs_hash_bd_t     bd;
+	__u32	     version;
+	int	       count = 0;
+	int	       stop_on_change;
+	int	       rc = 0;
+	int	       i;
+	ENTRY;
+
+	stop_on_change = cfs_hash_with_rehash_key(hs) ||
+			 !cfs_hash_with_no_itemref(hs) ||
+			 CFS_HOP(hs, put_locked) == NULL;
+	cfs_hash_lock(hs, 0);
+	LASSERT(!cfs_hash_is_rehashing(hs));
+
+	cfs_hash_for_each_bucket(hs, &bd, i) {
+		struct hlist_head *hhead;
+
+		cfs_hash_bd_lock(hs, &bd, 0);
+		version = cfs_hash_bd_version_get(&bd);
+
+		cfs_hash_bd_for_each_hlist(hs, &bd, hhead) {
+			for (hnode = hhead->first; hnode != NULL;) {
+				cfs_hash_bucket_validate(hs, &bd, hnode);
+				/* pin the item, then call back unlocked */
+				cfs_hash_get(hs, hnode);
+				cfs_hash_bd_unlock(hs, &bd, 0);
+				cfs_hash_unlock(hs, 0);
+
+				rc = func(hs, &bd, hnode, data);
+				if (stop_on_change)
+					cfs_hash_put(hs, hnode);
+				cond_resched();
+				count++;
+
+				cfs_hash_lock(hs, 0);
+				cfs_hash_bd_lock(hs, &bd, 0);
+				if (!stop_on_change) {
+					/* fetch the successor before unpinning */
+					tmp = hnode->next;
+					cfs_hash_put_locked(hs, hnode);
+					hnode = tmp;
+				} else { /* bucket changed? */
+					if (version !=
+					    cfs_hash_bd_version_get(&bd))
+						break;
+					/* safe to continue because no change */
+					hnode = hnode->next;
+				}
+				if (rc) /* callback wants to break iteration */
+					break;
+			}
+			if (rc) /* also leave the per-bucket hlist loop */
+				break;
+		}
+		cfs_hash_bd_unlock(hs, &bd, 0);
+		if (rc) /* and do not visit any further bucket */
+			break;
+	}
+	cfs_hash_unlock(hs, 0);
+
+	RETURN(count);
+}
+
+/*
+ * Call @func on the items of @hs through cfs_hash_for_each_relax(), i.e.
+ * with the hash and bucket locks dropped around each callback.
+ *
+ * Returns -EOPNOTSUPP when the hash has no lock, has rehash_key enabled,
+ * lacks the no-itemref option, or lacks the get / put callbacks needed
+ * to pin items; returns 0 otherwise.
+ */
+int
+cfs_hash_for_each_nolock(cfs_hash_t *hs,
+			 cfs_hash_for_each_cb_t func, void *data)
+{
+	ENTRY;
+
+	if (cfs_hash_with_no_lock(hs) ||
+	    cfs_hash_with_rehash_key(hs) ||
+	    !cfs_hash_with_no_itemref(hs))
+		RETURN(-EOPNOTSUPP);
+
+	if (CFS_HOP(hs, get) == NULL ||
+	    (CFS_HOP(hs, put) == NULL &&
+	     CFS_HOP(hs, put_locked) == NULL))
+		RETURN(-EOPNOTSUPP);
+
+	cfs_hash_for_each_enter(hs);
+	cfs_hash_for_each_relax(hs, func, data);
+	cfs_hash_for_each_exit(hs);
+
+	RETURN(0);
+}
+EXPORT_SYMBOL(cfs_hash_for_each_nolock);
+
+/**
+ * For each hash bucket in the libcfs hash @hs call the passed callback
+ * @func until all the hash buckets are empty.  The passed callback @func
+ * or the previously registered callback hs->hs_put must remove the item
+ * from the hash.  You may either use the cfs_hash_del() or hlist_del()
+ * functions.  No rwlocks will be held during the callback @func, so it
+ * is safe to sleep if needed.  This function will not terminate until
+ * the hash is empty.  Note it is still possible to concurrently add new
+ * items in to the hash.  It is the caller's responsibility to ensure
+ * the required locking is in place to prevent concurrent insertions.
+ *
+ * Returns -EOPNOTSUPP when the hash has no lock or lacks the get / put
+ * callbacks needed to pin items, 0 otherwise.
+ */
+int
+cfs_hash_for_each_empty(cfs_hash_t *hs,
+			cfs_hash_for_each_cb_t func, void *data)
+{
+	unsigned  i = 0;
+	ENTRY;
+
+	/* every exit goes through RETURN() to pair with ENTRY */
+	if (cfs_hash_with_no_lock(hs))
+		RETURN(-EOPNOTSUPP);
+
+	if (CFS_HOP(hs, get) == NULL ||
+	    (CFS_HOP(hs, put) == NULL &&
+	     CFS_HOP(hs, put_locked) == NULL))
+		RETURN(-EOPNOTSUPP);
+
+	cfs_hash_for_each_enter(hs);
+	/* a pass that visits no item means the hash is empty */
+	while (cfs_hash_for_each_relax(hs, func, data)) {
+		CDEBUG(D_INFO, "Try to empty hash: %s, loop: %u\n",
+		       hs->hs_name, i++);
+	}
+	cfs_hash_for_each_exit(hs);
+	RETURN(0);
+}
+EXPORT_SYMBOL(cfs_hash_for_each_empty);
+
+/*
+ * Call @func on each item of the single hash list selected by @hindex.
+ * Nothing is visited when @hindex is not below CFS_HASH_NHLIST(hs).  The
+ * bucket lock is held in non-exclusive mode during the callbacks; a
+ * non-zero return from @func stops the walk.
+ */
+void
+cfs_hash_hlist_for_each(cfs_hash_t *hs, unsigned hindex,
+			cfs_hash_for_each_cb_t func, void *data)
+{
+	struct hlist_head   *hhead;
+	struct hlist_node   *hnode;
+	cfs_hash_bd_t       bd;
+
+	cfs_hash_for_each_enter(hs);
+	cfs_hash_lock(hs, 0);
+	if (hindex >= CFS_HASH_NHLIST(hs))
+		goto out;
+
+	cfs_hash_bd_index_set(hs, hindex, &bd);
+
+	cfs_hash_bd_lock(hs, &bd, 0);
+	hhead = cfs_hash_bd_hhead(hs, &bd);
+	hlist_for_each(hnode, hhead) {
+		if (func(hs, &bd, hnode, data))
+			break;
+	}
+	cfs_hash_bd_unlock(hs, &bd, 0);
+ out:
+	cfs_hash_unlock(hs, 0);
+	cfs_hash_for_each_exit(hs);
+}
+
+EXPORT_SYMBOL(cfs_hash_hlist_for_each);
+
+/*
+ * For each item in the libcfs hash @hs which matches the @key call
+ * the passed callback @func and pass to it as an argument each hash
+ * item and the private @data. During the callback the bucket lock
+ * is held so the callback must never sleep.
+ *
+ * NB: a non-zero return from @func only ends the walk of the current
+ * hash list; the loop over @bds then moves on to the next descriptor.
+ */
+void
+cfs_hash_for_each_key(cfs_hash_t *hs, const void *key,
+		      cfs_hash_for_each_cb_t func, void *data)
+{
+	struct hlist_node   *hnode;
+	cfs_hash_bd_t       bds[2];
+	unsigned	    i;
+
+	cfs_hash_lock(hs, 0);
+
+	cfs_hash_dual_bd_get_and_lock(hs, key, bds, 0);
+
+	cfs_hash_for_each_bd(bds, 2, i) {
+		struct hlist_head *hlist = cfs_hash_bd_hhead(hs, &bds[i]);
+
+		hlist_for_each(hnode, hlist) {
+			cfs_hash_bucket_validate(hs, &bds[i], hnode);
+
+			if (cfs_hash_keycmp(hs, key, hnode)) {
+				if (func(hs, &bds[i], hnode, data))
+					break;
+			}
+		}
+	}
+
+	cfs_hash_dual_bd_unlock(hs, bds, 0);
+	cfs_hash_unlock(hs, 0);
+}
+EXPORT_SYMBOL(cfs_hash_for_each_key);
+
+/**
+ * Cancel a rehash of @hs that is pending or in progress.  The caller
+ * must hold cfs_hash_lock(hs, 1).  If the work item can be descheduled,
+ * hs_rehash_bits is cleared and the function returns; otherwise the lock
+ * is repeatedly dropped and retaken, yielding in between, until the hash
+ * is no longer rehashing.
+ *
+ * Background on rehashing: a rehash moves the libcfs hash @hs to a new
+ * number of bits.  This can be used
+ * to grow the hash size when excessive chaining is detected, or to
+ * shrink the hash when it is larger than needed.  When the CFS_HASH_REHASH
+ * flag is set in @hs the libcfs hash may be dynamically rehashed
+ * during addition or removal if the hash's theta value exceeds
+ * either the hs->hs_min_theta or hs->hs_max_theta values.  By default
+ * these values are tuned to keep the chained hash depth small, and
+ * this approach assumes a reasonably uniform hashing function.  The
+ * theta thresholds for @hs are tunable via cfs_hash_set_theta().
+ */
+void
+cfs_hash_rehash_cancel_locked(cfs_hash_t *hs)
+{
+	int     i;
+
+	/* need hold cfs_hash_lock(hs, 1) */
+	LASSERT(cfs_hash_with_rehash(hs) &&
+		!cfs_hash_with_no_lock(hs));
+
+	if (!cfs_hash_is_rehashing(hs))
+		return;
+
+	if (cfs_wi_deschedule(cfs_sched_rehash, &hs->hs_rehash_wi)) {
+		hs->hs_rehash_bits = 0;
+		return;
+	}
+
+	/* could not deschedule: wait for the rehash to finish on its own */
+	for (i = 2; cfs_hash_is_rehashing(hs); i++) {
+		cfs_hash_unlock(hs, 1);
+		/* raise console warning while waiting too long */
+		CDEBUG(IS_PO2(i >> 3) ? D_WARNING : D_INFO,
+		       "hash %s is still rehashing, rescheded %d\n",
+		       hs->hs_name, i - 1);
+		cond_resched();
+		cfs_hash_lock(hs, 1);
+	}
+}
+EXPORT_SYMBOL(cfs_hash_rehash_cancel_locked);
+
+/* Take the exclusive hash lock and cancel any rehash of @hs. */
+void
+cfs_hash_rehash_cancel(cfs_hash_t *hs)
+{
+	cfs_hash_lock(hs, 1);
+	cfs_hash_rehash_cancel_locked(hs);
+	cfs_hash_unlock(hs, 1);
+}
+EXPORT_SYMBOL(cfs_hash_rehash_cancel);
+
+/*
+ * Start a rehash of @hs if cfs_hash_rehash_bits() asks for one.
+ *
+ * When @do_rehash is zero the rehash work item is scheduled and 0 is
+ * returned; otherwise the rehash worker is run in the caller's context
+ * and its return value is passed back.  When no rehash is required the
+ * zero or negative value from cfs_hash_rehash_bits() is returned.
+ */
+int
+cfs_hash_rehash(cfs_hash_t *hs, int do_rehash)
+{
+	int     rc;
+
+	LASSERT(cfs_hash_with_rehash(hs) && !cfs_hash_with_no_lock(hs));
+
+	cfs_hash_lock(hs, 1);
+
+	rc = cfs_hash_rehash_bits(hs);
+	if (rc <= 0) {
+		cfs_hash_unlock(hs, 1);
+		return rc;
+	}
+
+	/* record the target size; the worker picks it up from here */
+	hs->hs_rehash_bits = rc;
+	if (!do_rehash) {
+		/* launch and return */
+		cfs_wi_schedule(cfs_sched_rehash, &hs->hs_rehash_wi);
+		cfs_hash_unlock(hs, 1);
+		return 0;
+	}
+
+	/* rehash right now */
+	cfs_hash_unlock(hs, 1);
+
+	return cfs_hash_rehash_worker(&hs->hs_rehash_wi);
+}
+EXPORT_SYMBOL(cfs_hash_rehash);
+
+/*
+ * Move every item of bucket @old into the bucket of hs->hs_rehash_buckets
+ * selected by the item's key and hs->hs_rehash_bits.  Returns the number
+ * of items moved.
+ */
+static int
+cfs_hash_rehash_bd(cfs_hash_t *hs, cfs_hash_bd_t *old)
+{
+	cfs_hash_bd_t      new;
+	struct hlist_head  *hhead;
+	struct hlist_node  *hnode;
+	struct hlist_node  *pos;
+	void	      *key;
+	int		c = 0;
+
+	/* hold cfs_hash_lock(hs, 1), so don't need any bucket lock */
+	cfs_hash_bd_for_each_hlist(hs, old, hhead) {
+		hlist_for_each_safe(hnode, pos, hhead) {
+			key = cfs_hash_key(hs, hnode);
+			LASSERT(key != NULL);
+			/* Validate hnode is in the correct bucket. */
+			cfs_hash_bucket_validate(hs, old, hnode);
+			/*
+			 * Delete from old hash bucket; move to new bucket.
+			 * ops->hs_key must be defined.
+			 */
+			cfs_hash_bd_from_key(hs, hs->hs_rehash_buckets,
+					     hs->hs_rehash_bits, key, &new);
+			cfs_hash_bd_move_locked(hs, old, &new, hnode);
+			c++;
+		}
+	}
+
+	return c;
+}
+
+/*
+ * Perform the rehash recorded in hs->hs_rehash_bits for the hash that
+ * embeds @wi as hs_rehash_wi.
+ *
+ * A bucket table of the new size is built with
+ * cfs_hash_buckets_realloc(), every item is moved into it under the
+ * exclusive hash lock (yielding periodically), and the tables are then
+ * swapped.  The rehash is abandoned when the allocation fails, when
+ * theta is already within [hs_min_theta, hs_max_theta], or when the hash
+ * is exiting.
+ *
+ * Returns 1 only if cfs_wi_exit() was called (the hash is exiting),
+ * 0 otherwise.
+ */
+static int
+cfs_hash_rehash_worker(cfs_workitem_t *wi)
+{
+	cfs_hash_t	 *hs = container_of(wi, cfs_hash_t, hs_rehash_wi);
+	cfs_hash_bucket_t **bkts;
+	cfs_hash_bd_t       bd;
+	unsigned int	old_size;
+	unsigned int	new_size;
+	int		 bsize;
+	int		 count = 0;
+	int		 rc = 0;
+	int		 i;
+
+	LASSERT (hs != NULL && cfs_hash_with_rehash(hs));
+
+	cfs_hash_lock(hs, 0);
+	LASSERT(cfs_hash_is_rehashing(hs));
+
+	old_size = CFS_HASH_NBKT(hs);
+	new_size = CFS_HASH_RH_NBKT(hs);
+
+	cfs_hash_unlock(hs, 0);
+
+	/*
+	 * don't need hs::hs_rwlock for hs::hs_buckets,
+	 * because nobody can change bkt-table except me.
+	 */
+	bkts = cfs_hash_buckets_realloc(hs, hs->hs_buckets,
+					old_size, new_size);
+	cfs_hash_lock(hs, 1);
+	if (bkts == NULL) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	if (bkts == hs->hs_buckets) {
+		bkts = NULL; /* do nothing */
+		goto out;
+	}
+
+	/* theta may have changed while the lock was dropped; re-check */
+	rc = __cfs_hash_theta(hs);
+	if ((rc >= hs->hs_min_theta) && (rc <= hs->hs_max_theta)) {
+		/* free the new allocated bkt-table */
+		old_size = new_size;
+		new_size = CFS_HASH_NBKT(hs);
+		rc = -EALREADY;
+		goto out;
+	}
+
+	LASSERT(hs->hs_rehash_buckets == NULL);
+	hs->hs_rehash_buckets = bkts;
+
+	rc = 0;
+	cfs_hash_for_each_bucket(hs, &bd, i) {
+		if (cfs_hash_is_exiting(hs)) {
+			rc = -ESRCH;
+			/* someone wants to destroy the hash, abort now */
+			if (old_size < new_size) /* OK to free old bkt-table */
+				break;
+			/* it's shrinking, need free new bkt-table */
+			hs->hs_rehash_buckets = NULL;
+			old_size = new_size;
+			new_size = CFS_HASH_NBKT(hs);
+			goto out;
+		}
+
+		count += cfs_hash_rehash_bd(hs, &bd);
+		if (count < CFS_HASH_LOOP_HOG ||
+		    cfs_hash_is_iterating(hs)) { /* need to finish ASAP */
+			continue;
+		}
+
+		/* moved many items: drop the lock and yield */
+		count = 0;
+		cfs_hash_unlock(hs, 1);
+		cond_resched();
+		cfs_hash_lock(hs, 1);
+	}
+
+	hs->hs_rehash_count++;
+
+	/* swap tables; @bkts now names the old table, freed below */
+	bkts = hs->hs_buckets;
+	hs->hs_buckets = hs->hs_rehash_buckets;
+	hs->hs_rehash_buckets = NULL;
+
+	hs->hs_cur_bits = hs->hs_rehash_bits;
+ out:
+	hs->hs_rehash_bits = 0;
+	if (rc == -ESRCH) /* never be scheduled again */
+		cfs_wi_exit(cfs_sched_rehash, wi);
+	/* read the bucket size while @hs is still known to be valid */
+	bsize = cfs_hash_bkt_size(hs);
+	cfs_hash_unlock(hs, 1);
+	/* can't refer to @hs anymore because it could be destroyed */
+	if (bkts != NULL)
+		cfs_hash_buckets_free(bkts, bsize, new_size, old_size);
+	if (rc != 0)
+		CDEBUG(D_INFO, "early quit of of rehashing: %d\n", rc);
+	/* return 1 only if cfs_wi_exit is called */
+	return rc == -ESRCH;
+}
+
+/**
+ * Rehash the object referenced by @hnode in the libcfs hash @hs.  The
+ * @old_key must be provided to locate the object's previous location
+ * in the hash, and the @new_key will be used to reinsert the object.
+ * Use this function instead of a cfs_hash_add() + cfs_hash_del()
+ * combo when it is critical that there is no window in time where the
+ * object is missing from the hash.  When an object is being rehashed
+ * the registered cfs_hash_get() and cfs_hash_put() functions will
+ * not be called.
+ *
+ * @hnode must currently be hashed.
+ */
+void cfs_hash_rehash_key(cfs_hash_t *hs, const void *old_key,
+			 void *new_key, struct hlist_node *hnode)
+{
+	cfs_hash_bd_t	bds[3];
+	cfs_hash_bd_t	old_bds[2];
+	cfs_hash_bd_t	new_bd;
+
+	LASSERT(!hlist_unhashed(hnode));
+
+	cfs_hash_lock(hs, 0);
+
+	cfs_hash_dual_bd_get(hs, old_key, old_bds);
+	cfs_hash_bd_get(hs, new_key, &new_bd);
+
+	/* @bds is the ordered set of buckets to lock (old ones plus new) */
+	bds[0] = old_bds[0];
+	bds[1] = old_bds[1];
+	bds[2] = new_bd;
+
+	/* NB: bds[0] and bds[1] are ordered already */
+	cfs_hash_bd_order(&bds[1], &bds[2]);
+	cfs_hash_bd_order(&bds[0], &bds[1]);
+
+	cfs_hash_multi_bd_lock(hs, bds, 3, 1);
+	if (likely(old_bds[1].bd_bucket == NULL)) {
+		cfs_hash_bd_move_locked(hs, &old_bds[0], &new_bd, hnode);
+	} else {
+		cfs_hash_dual_bd_finddel_locked(hs, old_bds, old_key, hnode);
+		cfs_hash_bd_add_locked(hs, &new_bd, hnode);
+	}
+	/* overwrite key inside locks, otherwise may screw up with
+	 * other operations, i.e: rehash */
+	cfs_hash_keycpy(hs, new_key, hnode);
+
+	cfs_hash_multi_bd_unlock(hs, bds, 3, 1);
+	cfs_hash_unlock(hs, 0);
+}
+EXPORT_SYMBOL(cfs_hash_rehash_key);
+
+/*
+ * Print the column header line for the hash statistics to @m and return
+ * the result of seq_printf().
+ */
+int cfs_hash_debug_header(struct seq_file *m)
+{
+	return seq_printf(m, "%-*s%6s%6s%6s%6s%6s%6s%6s%7s%8s%8s%8s%s\n",
+		 CFS_HASH_BIGNAME_LEN,
+		 "name", "cur", "min", "max", "theta", "t-min", "t-max",
+		 "flags", "rehash", "count", "maxdep", "maxdepb",
+		 " distribution");
+}
+EXPORT_SYMBOL(cfs_hash_debug_header);
+
+/*
+ * Return the larger of the two bucket tables of @hs: hs_buckets when no
+ * rehash table exists, otherwise whichever table has more bits.
+ */
+static cfs_hash_bucket_t **
+cfs_hash_full_bkts(cfs_hash_t *hs)
+{
+	/* NB: caller should hold hs->hs_rwlock if REHASH is set */
+	if (hs->hs_rehash_buckets == NULL)
+		return hs->hs_buckets;
+
+	LASSERT(hs->hs_rehash_bits != 0);
+	if (hs->hs_rehash_bits > hs->hs_cur_bits)
+		return hs->hs_rehash_buckets;
+
+	return hs->hs_buckets;
+}
+
+/*
+ * Return the bucket count of the larger of the two bucket tables of
+ * @hs: CFS_HASH_NBKT() when no rehash table exists, otherwise the count
+ * of whichever table has more bits.
+ */
+static unsigned int
+cfs_hash_full_nbkt(cfs_hash_t *hs)
+{
+	/* NB: caller should hold hs->hs_rwlock if REHASH is set */
+	if (hs->hs_rehash_buckets == NULL)
+		return CFS_HASH_NBKT(hs);
+
+	LASSERT(hs->hs_rehash_bits != 0);
+	if (hs->hs_rehash_bits > hs->hs_cur_bits)
+		return CFS_HASH_RH_NBKT(hs);
+
+	return CFS_HASH_NBKT(hs);
+}
+
+/*
+ * Print one line of statistics for @hs to @m: name, current / minimum /
+ * maximum size, theta values, flags, rehash count, item total, maximum
+ * chain depth and an 8-slot histogram of per-bucket load.  Always
+ * returns 0.
+ */
+int cfs_hash_debug_str(cfs_hash_t *hs, struct seq_file *m)
+{
+	int		    dist[8] = { 0, };
+	int		    maxdep  = -1;
+	int		    maxdepb = -1;
+	int		    total   = 0;
+	int		    theta;
+	int		    i;
+
+	cfs_hash_lock(hs, 0);
+	theta = __cfs_hash_theta(hs);
+
+	seq_printf(m, "%-*s %5d %5d %5d %d.%03d %d.%03d %d.%03d  0x%02x %6d ",
+		      CFS_HASH_BIGNAME_LEN, hs->hs_name,
+		      1 << hs->hs_cur_bits, 1 << hs->hs_min_bits,
+		      1 << hs->hs_max_bits,
+		      __cfs_hash_theta_int(theta), __cfs_hash_theta_frac(theta),
+		      __cfs_hash_theta_int(hs->hs_min_theta),
+		      __cfs_hash_theta_frac(hs->hs_min_theta),
+		      __cfs_hash_theta_int(hs->hs_max_theta),
+		      __cfs_hash_theta_frac(hs->hs_max_theta),
+		      hs->hs_flags, hs->hs_rehash_count);
+
+	/*
+	 * The distribution is a summary of the chained hash depth in
+	 * each of the libcfs hash buckets.  Each bucket's hsb_count is
+	 * divided by the hash theta value and used to generate a
+	 * histogram of the hash distribution.  A uniform hash will
+	 * result in all hash buckets being close to the average thus
+	 * only the first few entries in the histogram will be non-zero.
+	 * If your hash function results in a non-uniform hash this will
+	 * be observable by outlier buckets in the distribution histogram.
+	 *
+	 * Uniform hash distribution:      128/128/0/0/0/0/0/0
+	 * Non-Uniform hash distribution:  128/125/0/0/0/0/2/1
+	 */
+	for (i = 0; i < cfs_hash_full_nbkt(hs); i++) {
+		cfs_hash_bd_t  bd;
+
+		bd.bd_bucket = cfs_hash_full_bkts(hs)[i];
+		cfs_hash_bd_lock(hs, &bd, 0);
+		if (maxdep < bd.bd_bucket->hsb_depmax) {
+			maxdep  = bd.bd_bucket->hsb_depmax;
+			maxdepb = ffz(~maxdep);
+		}
+		total += bd.bd_bucket->hsb_count;
+		/* max(theta,1) guards the division; slot index is capped at 7 */
+		dist[min(__cfs_fls(bd.bd_bucket->hsb_count/max(theta,1)),7)]++;
+		cfs_hash_bd_unlock(hs, &bd, 0);
+	}
+
+	seq_printf(m, "%7d %7d %7d ", total, maxdep, maxdepb);
+	for (i = 0; i < 8; i++)
+		seq_printf(m, "%d%c",  dist[i], (i == 7) ? '\n' : '/');
+
+	cfs_hash_unlock(hs, 0);
+
+	return 0;
+}
+EXPORT_SYMBOL(cfs_hash_debug_str);

diff --git a/drivers/staging/lustre/lustre/libcfs/heap.c b/drivers/staging/lustre/lustre/libcfs/heap.c
new file mode 100644
index 0000000..147e4fe
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/heap.c

@@ -0,0 +1,475 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License version 2 for more details.  A copy is
+ * included in the COPYING file that accompanied this code.
+
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2011 Intel Corporation
+ */
+/*
+ * libcfs/libcfs/heap.c
+ *
+ * Author: Eric Barton	<eeb@whamcloud.com>
+ *	   Liang Zhen	<liang@whamcloud.com>
+ */
+/** \addtogroup heap
+ *
+ * @{
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/libcfs/libcfs.h>
+
+/*
+ * Allocate one CBH_NOB-byte fragment for heap \a h and store it in \a ptr.
+ * Heaps flagged CBH_FLAG_ATOMIC_GROW allocate with GFP_ATOMIC; all others
+ * go through plain LIBCFS_CPT_ALLOC.  Callers test \a ptr against NULL
+ * afterwards.  \a h is expanded several times, so it must be free of side
+ * effects; it is parenthesised everywhere so any pointer expression works.
+ */
+#define CBH_ALLOC(ptr, h)						\
+do {									\
+	if ((h)->cbh_flags & CBH_FLAG_ATOMIC_GROW)			\
+		LIBCFS_CPT_ALLOC_GFP((ptr), (h)->cbh_cptab,		\
+				     (h)->cbh_cptid, CBH_NOB,		\
+				     GFP_ATOMIC);			\
+	else								\
+		LIBCFS_CPT_ALLOC((ptr), (h)->cbh_cptab,			\
+				 (h)->cbh_cptid, CBH_NOB);		\
+} while (0)
+
+#define CBH_FREE(ptr)	LIBCFS_FREE(ptr, CBH_NOB)
+
+/**
+ * Grows the capacity of a binary heap so that it can handle a larger number of
+ * \e cfs_binheap_node_t objects.
+ *
+ * Each successful call adds exactly one fragment of element slots and
+ * advances \a h->cbh_hwm by CBH_SIZE.  Slots live in three tiers that are
+ * filled in order: a single-indirect fragment (cbh_elements1), then a
+ * double-indirect table (cbh_elements2), then a triple-indirect table
+ * (cbh_elements3).  Intermediate tables are allocated on first use.  On
+ * failure \a h->cbh_hwm is left unchanged.
+ *
+ * \param[in] h The binary heap
+ *
+ * \retval 0	   Successfully grew the heap
+ * \retval -ENOMEM OOM error, or the triple-indirect tier is already full
+ */
+static int
+cfs_binheap_grow(cfs_binheap_t *h)
+{
+	cfs_binheap_node_t ***frag1 = NULL;
+	cfs_binheap_node_t  **frag2;
+	int hwm = h->cbh_hwm;
+
+	/* need a whole new chunk of pointers */
+	LASSERT((h->cbh_hwm & CBH_MASK) == 0);
+
+	if (hwm == 0) {
+		/* first use of single indirect */
+		CBH_ALLOC(h->cbh_elements1, h);
+		if (h->cbh_elements1 == NULL)
+			return -ENOMEM;
+
+		goto out;
+	}
+
+	/* from here on hwm is an offset into the double-indirect tier */
+	hwm -= CBH_SIZE;
+	if (hwm < CBH_SIZE * CBH_SIZE) {
+		/* not filled double indirect */
+		CBH_ALLOC(frag2, h);
+		if (frag2 == NULL)
+			return -ENOMEM;
+
+		if (hwm == 0) {
+			/* first use of double indirect */
+			CBH_ALLOC(h->cbh_elements2, h);
+			if (h->cbh_elements2 == NULL) {
+				/* undo the leaf allocation made above */
+				CBH_FREE(frag2);
+				return -ENOMEM;
+			}
+		}
+
+		h->cbh_elements2[hwm >> CBH_SHIFT] = frag2;
+		goto out;
+	}
+
+	/* from here on hwm is an offset into the triple-indirect tier */
+	hwm -= CBH_SIZE * CBH_SIZE;
+#if (CBH_SHIFT * 3 < 32)
+	if (hwm >= CBH_SIZE * CBH_SIZE * CBH_SIZE) {
+		/* filled triple indirect */
+		return -ENOMEM;
+	}
+#endif
+	CBH_ALLOC(frag2, h);
+	if (frag2 == NULL)
+		return -ENOMEM;
+
+	if (((hwm >> CBH_SHIFT) & CBH_MASK) == 0) {
+		/* first use of this 2nd level index */
+		CBH_ALLOC(frag1, h);
+		if (frag1 == NULL) {
+			CBH_FREE(frag2);
+			return -ENOMEM;
+		}
+	}
+
+	if (hwm == 0) {
+		/* first use of triple indirect */
+		CBH_ALLOC(h->cbh_elements3, h);
+		if (h->cbh_elements3 == NULL) {
+			/* hwm == 0 implies frag1 was allocated just above */
+			CBH_FREE(frag2);
+			CBH_FREE(frag1);
+			return -ENOMEM;
+		}
+	}
+
+	if (frag1 != NULL) {
+		/* a new 2nd level table was created: hook it into the top */
+		LASSERT(h->cbh_elements3[hwm >> (2 * CBH_SHIFT)] == NULL);
+		h->cbh_elements3[hwm >> (2 * CBH_SHIFT)] = frag1;
+	} else {
+		/* reuse the 2nd level table installed by an earlier call */
+		frag1 = h->cbh_elements3[hwm >> (2 * CBH_SHIFT)];
+		LASSERT(frag1 != NULL);
+	}
+
+	frag1[(hwm >> CBH_SHIFT) & CBH_MASK] = frag2;
+
+ out:
+	/* one more fragment of CBH_SIZE slots is now usable */
+	h->cbh_hwm += CBH_SIZE;
+	return 0;
+}
+
+/**
+ * Creates and initializes a binary heap instance.
+ *
+ * The heap is preallocated, one fragment at a time, until its capacity
+ * (\a cbh_hwm) is at least \a count.  CBH_FLAG_ATOMIC_GROW is masked out of
+ * the heap's flags for the duration of the preallocation and only applied
+ * afterwards, so it affects later growth but not the initial allocations.
+ *
+ * \param[in] ops   The operations to be used; \a hop_compare is mandatory
+ * \param[in] flags The heap flags
+ * \param[in] count The initial heap capacity in # of elements
+ * \param[in] arg   An optional private argument
+ * \param[in] cptab The CPT table this heap instance will operate over
+ * \param[in] cptid The CPT id of \a cptab this heap instance will operate over
+ *
+ * \retval valid-pointer A newly-created and initialized binary heap object
+ * \retval NULL		 error
+ */
+cfs_binheap_t *
+cfs_binheap_create(cfs_binheap_ops_t *ops, unsigned int flags,
+		   unsigned count, void *arg, struct cfs_cpt_table *cptab,
+		   int cptid)
+{
+	cfs_binheap_t *h;
+
+	LASSERT(ops != NULL);
+	LASSERT(ops->hop_compare != NULL);
+	LASSERT(cptab != NULL);
+	LASSERT(cptid == CFS_CPT_ANY ||
+	       (cptid >= 0 && cptid < cptab->ctb_nparts));
+
+	LIBCFS_CPT_ALLOC(h, cptab, cptid, sizeof(*h));
+	if (h == NULL)
+		return NULL;
+
+	h->cbh_ops	  = ops;
+	h->cbh_nelements  = 0;
+	h->cbh_hwm	  = 0;
+	h->cbh_private	  = arg;
+	/* keep CBH_ALLOC on its non-atomic branch while preallocating */
+	h->cbh_flags	  = flags & (~CBH_FLAG_ATOMIC_GROW);
+	h->cbh_cptab	  = cptab;
+	h->cbh_cptid	  = cptid;
+
+	while (h->cbh_hwm < count) { /* preallocate */
+		if (cfs_binheap_grow(h) != 0) {
+			/* destroy releases whatever fragments were added */
+			cfs_binheap_destroy(h);
+			return NULL;
+		}
+	}
+
+	/* restore the caller's atomic-grow request for later insertions */
+	h->cbh_flags |= flags & CBH_FLAG_ATOMIC_GROW;
+
+	return h;
+}
+EXPORT_SYMBOL(cfs_binheap_create);
+
+/**
+ * Tears down a binary heap instance.
+ *
+ * Walks the three indirection tiers in the order they were grown, freeing
+ * one fragment per CBH_SIZE slots of capacity recorded in \a cbh_hwm, then
+ * frees the heap descriptor itself.  The nodes stored in the heap are not
+ * touched.
+ *
+ * \param[in] h The binary heap object
+ */
+void
+cfs_binheap_destroy(cfs_binheap_t *h)
+{
+	int idx0;
+	int idx1;
+	int left;	/* capacity (in slots) not yet accounted for */
+
+	LASSERT(h != NULL);
+
+	left = h->cbh_hwm;
+	if (left <= 0)
+		goto out;
+
+	/* tier 1: the single-indirect fragment */
+	CBH_FREE(h->cbh_elements1);
+	left -= CBH_SIZE;
+	if (left <= 0)
+		goto out;
+
+	/* tier 2: leaves first, then the double-indirect table */
+	for (idx0 = 0; idx0 < CBH_SIZE && left > 0; idx0++) {
+		CBH_FREE(h->cbh_elements2[idx0]);
+		left -= CBH_SIZE;
+	}
+	CBH_FREE(h->cbh_elements2);
+	if (left <= 0)
+		goto out;
+
+	/* tier 3: leaves, then each 2nd level table, then the top table */
+	for (idx0 = 0; idx0 < CBH_SIZE && left > 0; idx0++) {
+		for (idx1 = 0; idx1 < CBH_SIZE && left > 0; idx1++) {
+			CBH_FREE(h->cbh_elements3[idx0][idx1]);
+			left -= CBH_SIZE;
+		}
+		CBH_FREE(h->cbh_elements3[idx0]);
+	}
+	CBH_FREE(h->cbh_elements3);
+
+ out:
+	LIBCFS_FREE(h, sizeof(*h));
+}
+EXPORT_SYMBOL(cfs_binheap_destroy);
+
+/**
+ * Maps a tree index onto the slot that stores the node pointer for it.
+ *
+ * Indices below CBH_SIZE resolve into the single-indirect fragment, the next
+ * CBH_SIZE * CBH_SIZE into the double-indirect table, and everything beyond
+ * that into the triple-indirect table.  No bounds checking is performed.
+ *
+ * \param[in] h	  The binary heap instance
+ * \param[in] idx The requested node's index
+ *
+ * \retval valid-pointer A double pointer to a heap pointer entry
+ */
+static cfs_binheap_node_t **
+cfs_binheap_pointer(cfs_binheap_t *h, unsigned int idx)
+{
+	cfs_binheap_node_t **frag;
+
+	if (idx < CBH_SIZE)
+		return &h->cbh_elements1[idx];
+
+	idx -= CBH_SIZE;
+	if (idx >= CBH_SIZE * CBH_SIZE) {
+		/* past the double-indirect range: resolve via the top table */
+		idx -= CBH_SIZE * CBH_SIZE;
+		frag = h->cbh_elements3[idx >> (2 * CBH_SHIFT)]
+				       [(idx >> CBH_SHIFT) & CBH_MASK];
+	} else {
+		frag = h->cbh_elements2[idx >> CBH_SHIFT];
+	}
+
+	return &frag[idx & CBH_MASK];
+}
+
+/**
+ * Looks up the node stored at a given index of the binary tree.
+ *
+ * \param[in] h	  The binary heap
+ * \param[in] idx The requested node's index
+ *
+ * \retval valid-pointer The requested heap node
+ * \retval NULL		 \a idx is not below the current element count
+ */
+cfs_binheap_node_t *
+cfs_binheap_find(cfs_binheap_t *h, unsigned int idx)
+{
+	return idx < h->cbh_nelements ? *cfs_binheap_pointer(h, idx) : NULL;
+}
+EXPORT_SYMBOL(cfs_binheap_find);
+
+/**
+ * Moves a node upwards, towards the root of the binary tree.
+ *
+ * Starting at \a e's current index, each parent for which
+ * hop_compare(parent, e) returns zero is pulled down one level into the
+ * vacated slot.  \a e itself is only written once, into its final slot,
+ * after the loop finishes.
+ *
+ * \param[in] h The heap
+ * \param[in] e The node
+ *
+ * \retval 1 The position of \a e in the tree was changed at least once
+ * \retval 0 The position of \a e in the tree was not changed
+ */
+static int
+cfs_binheap_bubble(cfs_binheap_t *h, cfs_binheap_node_t *e)
+{
+	unsigned int	     cur_idx = e->chn_index;
+	cfs_binheap_node_t **cur_ptr;
+	unsigned int	     parent_idx;
+	cfs_binheap_node_t **parent_ptr;
+	int		     did_sth = 0;
+
+	cur_ptr = cfs_binheap_pointer(h, cur_idx);
+	LASSERT(*cur_ptr == e);
+
+	while (cur_idx > 0) {
+		/* parent of slot i in the implicit tree is (i - 1) / 2 */
+		parent_idx = (cur_idx - 1) >> 1;
+
+		parent_ptr = cfs_binheap_pointer(h, parent_idx);
+		LASSERT((*parent_ptr)->chn_index == parent_idx);
+
+		/* non-zero: parent may stay above e, so e has found its level */
+		if (h->cbh_ops->hop_compare(*parent_ptr, e))
+			break;
+
+		/* pull the parent down into the current slot and climb */
+		(*parent_ptr)->chn_index = cur_idx;
+		*cur_ptr = *parent_ptr;
+		cur_ptr = parent_ptr;
+		cur_idx = parent_idx;
+		did_sth = 1;
+	}
+
+	/* drop e into the slot where the climb stopped */
+	e->chn_index = cur_idx;
+	*cur_ptr = e;
+
+	return did_sth;
+}
+
+/**
+ * Moves a node downwards, towards the last level of the binary tree.
+ *
+ * At each level the child preferred by hop_compare is selected; if
+ * hop_compare(e, child) returns zero that child is pulled up into the
+ * current slot and the walk continues from the child's old slot.  \a e is
+ * only written once, into its final slot, after the loop finishes.
+ *
+ * \param[in] h The heap
+ * \param[in] e The node
+ *
+ * \retval 1 The position of \a e in the tree was changed at least once
+ * \retval 0 The position of \a e in the tree was not changed
+ */
+static int
+cfs_binheap_sink(cfs_binheap_t *h, cfs_binheap_node_t *e)
+{
+	unsigned int	     n = h->cbh_nelements;
+	unsigned int	     child_idx;
+	cfs_binheap_node_t **child_ptr;
+	cfs_binheap_node_t  *child;
+	unsigned int	     child2_idx;
+	cfs_binheap_node_t **child2_ptr;
+	cfs_binheap_node_t  *child2;
+	unsigned int	     cur_idx;
+	cfs_binheap_node_t **cur_ptr;
+	int		     did_sth = 0;
+
+	cur_idx = e->chn_index;
+	cur_ptr = cfs_binheap_pointer(h, cur_idx);
+	LASSERT(*cur_ptr == e);
+
+	while (cur_idx < n) {
+		/* left child of slot i is 2i + 1; stop when there is none */
+		child_idx = (cur_idx << 1) + 1;
+		if (child_idx >= n)
+			break;
+
+		child_ptr = cfs_binheap_pointer(h, child_idx);
+		child = *child_ptr;
+
+		child2_idx = child_idx + 1;
+		if (child2_idx < n) {
+			child2_ptr = cfs_binheap_pointer(h, child2_idx);
+			child2 = *child2_ptr;
+
+			/* pick the right child when compare ranks it first */
+			if (h->cbh_ops->hop_compare(child2, child)) {
+				child_idx = child2_idx;
+				child_ptr = child2_ptr;
+				child = child2;
+			}
+		}
+
+		LASSERT(child->chn_index == child_idx);
+
+		/* non-zero: e may stay above the chosen child, so stop here */
+		if (h->cbh_ops->hop_compare(e, child))
+			break;
+
+		/* pull the child up into the current slot and descend */
+		child->chn_index = cur_idx;
+		*cur_ptr = child;
+		cur_ptr = child_ptr;
+		cur_idx = child_idx;
+		did_sth = 1;
+	}
+
+	/* drop e into the slot where the descent stopped */
+	e->chn_index = cur_idx;
+	*cur_ptr = e;
+
+	return did_sth;
+}
+
+/**
+ * Adds a node to the binary heap and restores heap order.
+ *
+ * The heap is grown by one fragment when it is full.  If the heap's ops
+ * provide \a hop_enter it is called before the node is linked in, and a
+ * non-zero result from it aborts the insertion.
+ *
+ * \param[in] h The heap
+ * \param[in] e The node
+ *
+ * \retval 0	Element inserted successfully
+ * \retval != 0 error (from growing the heap or from \a hop_enter)
+ */
+int
+cfs_binheap_insert(cfs_binheap_t *h, cfs_binheap_node_t *e)
+{
+	unsigned int tail = h->cbh_nelements;
+	int	     rc = 0;
+
+	/* no free slot left: add another fragment first */
+	if (tail == h->cbh_hwm)
+		rc = cfs_binheap_grow(h);
+
+	if (rc == 0 && h->cbh_ops->hop_enter)
+		rc = h->cbh_ops->hop_enter(h, e);
+
+	if (rc != 0)
+		return rc;
+
+	/* append at the tail, then let the node climb to its place */
+	e->chn_index = tail;
+	*cfs_binheap_pointer(h, tail) = e;
+	h->cbh_nelements++;
+
+	cfs_binheap_bubble(h, e);
+
+	return 0;
+}
+EXPORT_SYMBOL(cfs_binheap_insert);
+
+/**
+ * Removes a node from the binary heap.
+ *
+ * The last node of the heap is moved into the vacated slot and then bubbled
+ * up or sunk down to restore heap order.  In every case, including the one
+ * where \a e itself occupies the last slot, \a e is marked with CBH_POISON
+ * and the optional \a hop_exit callback is invoked, so that each successful
+ * insert is matched by exactly one exit notification and a repeated removal
+ * trips the CBH_POISON assertion below.
+ *
+ * \param[in] h The heap
+ * \param[in] e The node; must currently be stored in \a h
+ */
+void
+cfs_binheap_remove(cfs_binheap_t *h, cfs_binheap_node_t *e)
+{
+	unsigned int	     n = h->cbh_nelements;
+	unsigned int	     cur_idx = e->chn_index;
+	cfs_binheap_node_t **cur_ptr;
+	cfs_binheap_node_t  *last;
+
+	LASSERT(cur_idx != CBH_POISON);
+	LASSERT(cur_idx < n);
+
+	cur_ptr = cfs_binheap_pointer(h, cur_idx);
+	LASSERT(*cur_ptr == e);
+
+	/* shrink the heap by detaching its last node */
+	n--;
+	last = *cfs_binheap_pointer(h, n);
+	h->cbh_nelements = n;
+
+	if (last != e) {
+		/* plug the hole with the last node and re-establish order */
+		last->chn_index = cur_idx;
+		*cur_ptr = last;
+		if (!cfs_binheap_bubble(h, last))
+			cfs_binheap_sink(h, last);
+	}
+
+	/* e is out of the heap no matter which slot it occupied */
+	e->chn_index = CBH_POISON;
+	if (h->cbh_ops->hop_exit)
+		h->cbh_ops->hop_exit(h, e);
+}
+EXPORT_SYMBOL(cfs_binheap_remove);
+
+/** @} heap */

diff --git a/drivers/staging/lustre/lustre/libcfs/kernel_user_comm.c b/drivers/staging/lustre/lustre/libcfs/kernel_user_comm.c
new file mode 100644
index 0000000..d6d3b2e
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/kernel_user_comm.c

@@ -0,0 +1,346 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Author: Nathan Rutman <nathan.rutman@sun.com>
+ *
+ * Kernel <-> userspace communication routines.
+ * Using pipes for all arches.
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+#define D_KUC D_OTHER
+
+#include <linux/libcfs/libcfs.h>
+
+#ifdef LUSTRE_UTILS
+/* This is the userspace side. */
+
+/** Open the userspace end of a KUC link.
+ *
+ * Creates a pipe and records both ends, the broadcast group and the calling
+ * process id in \a link; any previous contents of \a link are wiped.
+ *
+ * @param link Private descriptor for pipe/socket.
+ * @param group KUC broadcast group to listen to
+ *	  (can be null for unicast to this pid)
+ * @return 0 on success, -errno if the pipe cannot be created
+ */
+int libcfs_ukuc_start(lustre_kernelcomm *link, int group)
+{
+	int fds[2];
+	int rc;
+
+	rc = pipe(fds);
+	if (rc < 0)
+		return -errno;
+
+	memset(link, 0, sizeof(*link));
+	link->lk_group = group;
+	link->lk_uid = getpid();
+	link->lk_rfd = fds[0];	/* read end */
+	link->lk_wfd = fds[1];	/* write end */
+
+	return 0;
+}
+
+/** Close the userspace end of a KUC link.
+ *
+ * The write descriptor is closed only when it is greater than zero; the
+ * read descriptor is always closed.
+ *
+ * @param link Private descriptor for pipe/socket.
+ * @return the result of closing the read descriptor
+ */
+int libcfs_ukuc_stop(lustre_kernelcomm *link)
+{
+	int wfd = link->lk_wfd;
+
+	if (wfd > 0)
+		close(wfd);
+
+	return close(link->lk_rfd);
+}
+
+#define lhsz sizeof(*kuch)
+
+/** Read a message from the link.
+ *
+ * Blocks until a message for \a transport (or for the generic transport)
+ * has been read into \a buf; messages for other transports are read and
+ * dropped.
+ *
+ * @param link Private descriptor for pipe/socket.
+ * @param buf Buffer to read into, must include size for kuc_hdr
+ * @param maxsize Maximum message size allowed (size of \a buf)
+ * @param transport Only listen to messages on this transport
+ *      (and the generic transport)
+ * @return 0 when a matching message is in \a buf, otherwise a negative
+ *      errno: -EINVAL if \a buf cannot even hold a header, -EPIPE on
+ *      end-of-file, -EPROTO on a bad magic or short read, -EMSGSIZE if the
+ *      advertised length is smaller than a header or larger than \a maxsize,
+ *      or -errno from read()
+ */
+int libcfs_ukuc_msg_get(lustre_kernelcomm *link, char *buf, int maxsize,
+			int transport)
+{
+	struct kuc_hdr *kuch;
+	int payload_len;
+	int rc = 0;
+
+	/* the header is read unconditionally, so it must fit */
+	if (buf == NULL || maxsize < (int)lhsz)
+		return -EINVAL;
+
+	memset(buf, 0, maxsize);
+
+	CDEBUG(D_KUC, "Waiting for message from kernel on fd %d\n",
+	       link->lk_rfd);
+
+	while (1) {
+		/* Read header first to get message size */
+		rc = read(link->lk_rfd, buf, lhsz);
+		if (rc < 0) {
+			rc = -errno;
+			break;
+		}
+		if (rc == 0) {
+			/* EOF: errno is not set by read(), don't report it */
+			rc = -EPIPE;
+			break;
+		}
+		if (rc < (int)lhsz) {
+			CERROR("short read: got %d of %d bytes\n",
+			       rc, (int)lhsz);
+			rc = -EPROTO;
+			break;
+		}
+		kuch = (struct kuc_hdr *)buf;
+
+		CDEBUG(D_KUC, "Received message mg=%x t=%d m=%d l=%d\n",
+		       kuch->kuc_magic, kuch->kuc_transport, kuch->kuc_msgtype,
+		       kuch->kuc_msglen);
+
+		if (kuch->kuc_magic != KUC_MAGIC) {
+			CERROR("bad message magic %x != %x\n",
+			       kuch->kuc_magic, KUC_MAGIC);
+			rc = -EPROTO;
+			break;
+		}
+
+		/* kuc_msglen counts the header too; a smaller value would
+		 * make the payload length below wrap around */
+		if (kuch->kuc_msglen < lhsz || kuch->kuc_msglen > maxsize) {
+			rc = -EMSGSIZE;
+			break;
+		}
+		payload_len = (int)kuch->kuc_msglen - (int)lhsz;
+
+		/* Read payload */
+		rc = read(link->lk_rfd, buf + lhsz, payload_len);
+		if (rc < 0) {
+			rc = -errno;
+			break;
+		}
+		if (rc < payload_len) {
+			CERROR("short read: got %d of %d bytes\n",
+			       rc, payload_len);
+			rc = -EPROTO;
+			break;
+		}
+
+		if (kuch->kuc_transport == transport ||
+		    kuch->kuc_transport == KUC_TRANSPORT_GENERIC) {
+			return 0;
+		}
+		/* Drop messages for other transports */
+	}
+	return rc;
+}
+
+#else /* LUSTRE_UTILS */
+/* This is the kernel side (liblustre as well). */
+
+/**
+ * libcfs_kkuc_msg_put - send a message from kernel to userspace
+ *
+ * Writes the whole message (kuc_msglen bytes, header included) to \a filp,
+ * looping until vfs_write() has consumed everything or reports an error.
+ *
+ * @param filp file to send the message to
+ * @param payload Payload data.  First field of payload is always
+ *   struct kuc_hdr
+ * @return 0 once the full message has been written, -EBADF for a NULL or
+ *   error-pointer \a filp, -ENOSYS for a bad magic (also what is returned
+ *   when kuc_msglen is 0, since the write loop never runs), or the negative
+ *   value returned by vfs_write()
+ */
+int libcfs_kkuc_msg_put(struct file *filp, void *payload)
+{
+	struct kuc_hdr *kuch = (struct kuc_hdr *)payload;
+	ssize_t count = kuch->kuc_msglen;
+	loff_t offset = 0;
+	mm_segment_t fs;
+	int rc = -ENOSYS;
+
+	if (filp == NULL || IS_ERR(filp))
+		return -EBADF;
+
+	if (kuch->kuc_magic != KUC_MAGIC) {
+		CERROR("KernelComm: bad magic %x\n", kuch->kuc_magic);
+		return -ENOSYS;
+	}
+
+	/* payload is a kernel buffer: widen the address limit for the
+	 * duration of the write and restore it afterwards */
+	fs = get_fs();
+	set_fs(KERNEL_DS);
+	while (count > 0) {
+		rc = vfs_write(filp, (void __force __user *)payload,
+			       count, &offset);
+		if (rc < 0)
+			break;
+		/* partial write: advance past what was consumed */
+		count -= rc;
+		payload += rc;
+		rc = 0;
+	}
+	set_fs(fs);
+
+	if (rc < 0)
+		CWARN("message send failed (%d)\n", rc);
+	else
+		CDEBUG(D_KUC, "Sent message rc=%d, fp=%p\n", rc, filp);
+
+	return rc;
+}
+EXPORT_SYMBOL(libcfs_kkuc_msg_put);
+
+/* Broadcast groups are global across all mounted filesystems;
+ * i.e. registering for a group on 1 fs will get messages for that
+ * group from any fs */
+/** A single group registration has a uid and a file pointer */
+struct kkuc_reg {
+	struct list_head	kr_chain;	/* link in kkuc_groups[group] */
+	int		kr_uid;		/* receiver identifier */
+	struct file	*kr_fp;		/* file written to; NULL once dropped */
+	__u32		kr_data;	/* value handed to foreach callbacks */
+};
+/* One receiver list per group.  The heads start out zeroed; a head whose
+ * .next is still NULL has never had a receiver added. */
+static struct list_head kkuc_groups[KUC_GRP_MAX+1] = {};
+/* Protect message sending against remove and adds */
+static DECLARE_RWSEM(kg_sem);
+
+/** Add a receiver to a broadcast group
+ *
+ * The registration keeps \a filp; the matching fput() and the release of
+ * the registration itself happen in libcfs_kkuc_group_rem().
+ *
+ * @param filp pipe to write into
+ * @param uid identifier for this receiver
+ * @param group group number, 0..KUC_GRP_MAX
+ * @param data value stored with the registration and passed to the
+ *   libcfs_kkuc_group_foreach() callback
+ * @return 0 on success, -EINVAL for an out-of-range group, -EBADF for a
+ *   NULL \a filp, -ENOMEM if the registration cannot be allocated
+ */
+int libcfs_kkuc_group_add(struct file *filp, int uid, int group, __u32 data)
+{
+	struct kkuc_reg *reg;
+
+	/* group indexes kkuc_groups[]: reject negative values as well */
+	if (group < 0 || group > KUC_GRP_MAX) {
+		CDEBUG(D_WARNING, "Kernelcomm: bad group %d\n", group);
+		return -EINVAL;
+	}
+
+	/* fput in group_rem */
+	if (filp == NULL)
+		return -EBADF;
+
+	/* freed in group_rem */
+	/* NOTE(review): gfp mask of 0 is kept as-is; confirm intended flags */
+	reg = kmalloc(sizeof(*reg), 0);
+	if (reg == NULL)
+		return -ENOMEM;
+
+	reg->kr_fp = filp;
+	reg->kr_uid = uid;
+	reg->kr_data = data;
+
+	down_write(&kg_sem);
+	/* list heads start zeroed; initialise on first registration */
+	if (kkuc_groups[group].next == NULL)
+		INIT_LIST_HEAD(&kkuc_groups[group]);
+	list_add(&reg->kr_chain, &kkuc_groups[group]);
+	up_write(&kg_sem);
+
+	CDEBUG(D_KUC, "Added uid=%d fp=%p to group %d\n", uid, filp, group);
+
+	return 0;
+}
+EXPORT_SYMBOL(libcfs_kkuc_group_add);
+
+/** Remove receivers from a broadcast group
+ *
+ * With \a uid == 0 a KUC_MSG_SHUTDOWN message is first broadcast to the
+ * group and then every registration is removed; otherwise only the
+ * registrations whose uid matches are removed.  Each removed registration
+ * has its file reference dropped (if still held) and is freed.
+ *
+ * @param uid receiver to remove, or 0 for all receivers
+ * @param group group number, 0..KUC_GRP_MAX
+ * @return 0 on success (including when the group was never used),
+ *   -EINVAL for an out-of-range group
+ */
+int libcfs_kkuc_group_rem(int uid, int group)
+{
+	struct kkuc_reg *reg, *next;
+	ENTRY;
+
+	/* group indexes kkuc_groups[]: validate it like group_add does */
+	if (group < 0 || group > KUC_GRP_MAX) {
+		CDEBUG(D_WARNING, "Kernelcomm: bad group %d\n", group);
+		RETURN(-EINVAL);
+	}
+
+	/* list head never initialised: nothing was ever registered */
+	if (kkuc_groups[group].next == NULL)
+		RETURN(0);
+
+	if (uid == 0) {
+		/* Broadcast a shutdown message */
+		struct kuc_hdr lh;
+
+		lh.kuc_magic = KUC_MAGIC;
+		lh.kuc_transport = KUC_TRANSPORT_GENERIC;
+		lh.kuc_msgtype = KUC_MSG_SHUTDOWN;
+		lh.kuc_msglen = sizeof(lh);
+		libcfs_kkuc_group_put(group, &lh);
+	}
+
+	down_write(&kg_sem);
+	list_for_each_entry_safe(reg, next, &kkuc_groups[group], kr_chain) {
+		if ((uid == 0) || (uid == reg->kr_uid)) {
+			list_del(&reg->kr_chain);
+			CDEBUG(D_KUC, "Removed uid=%d fp=%p from group %d\n",
+			       reg->kr_uid, reg->kr_fp, group);
+			/* kr_fp is NULL if group_put already dropped it */
+			if (reg->kr_fp != NULL)
+				fput(reg->kr_fp);
+			kfree(reg);
+		}
+	}
+	up_write(&kg_sem);
+
+	RETURN(0);
+}
+EXPORT_SYMBOL(libcfs_kkuc_group_rem);
+
+/** Send a message to every receiver of a broadcast group
+ *
+ * A receiver whose pipe reports -EPIPE has its file reference dropped and
+ * is skipped by later sends.  Because that modifies the registration, the
+ * list is walked with kg_sem held for writing: two concurrent senders could
+ * otherwise both see the same non-NULL kr_fp and fput() it twice.
+ *
+ * @param group group number, 0..KUC_GRP_MAX
+ * @param payload message to send; starts with a struct kuc_hdr
+ * @return 0 if the message reached at least one receiver or the group has
+ *   no receivers, -EINVAL for an out-of-range group, otherwise the result
+ *   of the last send attempt
+ */
+int libcfs_kkuc_group_put(int group, void *payload)
+{
+	struct kkuc_reg	*reg;
+	int		 rc = 0;
+	int one_success = 0;
+	ENTRY;
+
+	/* group indexes kkuc_groups[]: never walk outside the array */
+	if (group < 0 || group > KUC_GRP_MAX) {
+		CDEBUG(D_WARNING, "Kernelcomm: bad group %d\n", group);
+		RETURN(-EINVAL);
+	}
+
+	down_write(&kg_sem);
+	/* a zeroed list head has never been initialised and must not be
+	 * iterated; treat it as an empty group */
+	if (kkuc_groups[group].next == NULL) {
+		up_write(&kg_sem);
+		RETURN(0);
+	}
+
+	list_for_each_entry(reg, &kkuc_groups[group], kr_chain) {
+		if (reg->kr_fp != NULL) {
+			rc = libcfs_kkuc_msg_put(reg->kr_fp, payload);
+			if (rc == 0)
+				one_success = 1;
+			else if (rc == -EPIPE) {
+				/* reader went away: stop using this pipe */
+				fput(reg->kr_fp);
+				reg->kr_fp = NULL;
+			}
+		}
+	}
+	up_write(&kg_sem);
+
+	/* don't return an error if the message has been delivered
+	 * at least to one agent */
+	if (one_success)
+		rc = 0;
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(libcfs_kkuc_group_put);
+
+/**
+ * Calls a callback function for each link of the given kuc group.
+ *
+ * Only registrations that still hold a file pointer are visited.  The
+ * callback receives the registration's kr_data and \a cb_arg.
+ *
+ * @param group the group to call the function on, 0..KUC_GRP_MAX.
+ * @param cb_func the function to be called.
+ * @param cb_arg extra argument to be passed to the callback function.
+ * @return -EINVAL for an out-of-range group, 0 if no callback was invoked,
+ *   otherwise the value returned by the last callback invoked
+ */
+int libcfs_kkuc_group_foreach(int group, libcfs_kkuc_cb_t cb_func,
+			      void *cb_arg)
+{
+	struct kkuc_reg *reg;
+	int rc = 0;
+	ENTRY;
+
+	/* group indexes kkuc_groups[]: reject negative values as well */
+	if (group < 0 || group > KUC_GRP_MAX) {
+		CDEBUG(D_WARNING, "Kernelcomm: bad group %d\n", group);
+		RETURN(-EINVAL);
+	}
+
+	down_read(&kg_sem);
+	/* no link for this group (head never initialised); tested under
+	 * the lock so a concurrent first registration cannot race with it */
+	if (kkuc_groups[group].next == NULL) {
+		up_read(&kg_sem);
+		RETURN(0);
+	}
+
+	list_for_each_entry(reg, &kkuc_groups[group], kr_chain) {
+		if (reg->kr_fp != NULL) {
+			rc = cb_func(reg->kr_data, cb_arg);
+		}
+	}
+	up_read(&kg_sem);
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(libcfs_kkuc_group_foreach);
+
+#endif /* LUSTRE_UTILS */

diff --git a/drivers/staging/lustre/lustre/libcfs/libcfs_cpu.c b/drivers/staging/lustre/lustre/libcfs/libcfs_cpu.c
new file mode 100644
index 0000000..8e88eb5
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/libcfs_cpu.c

@@ -0,0 +1,204 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Please see comments in libcfs/include/libcfs/libcfs_cpu.h for introduction
+ *
+ * Author: liang@whamcloud.com
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/libcfs/libcfs.h>
+
+/** Global CPU partition table */
+struct cfs_cpt_table   *cfs_cpt_table __read_mostly = NULL;
+EXPORT_SYMBOL(cfs_cpt_table);
+
+#ifndef HAVE_LIBCFS_CPT
+
+#define CFS_CPU_VERSION_MAGIC	   0xbabecafe
+
+/**
+ * Allocate a CPU partition table (implementation used when
+ * HAVE_LIBCFS_CPT is not defined).
+ *
+ * Only a single partition is supported here; any other \a ncpt is rejected.
+ *
+ * \param[in] ncpt requested number of partitions, must be 1
+ *
+ * \retval valid-pointer table stamped with CFS_CPU_VERSION_MAGIC
+ * \retval NULL		 unsupported \a ncpt or allocation failure
+ */
+struct cfs_cpt_table *
+cfs_cpt_table_alloc(unsigned int ncpt)
+{
+	struct cfs_cpt_table *cptab;
+
+	if (ncpt != 1) {
+		/* ncpt is unsigned: print it with %u, not %d */
+		CERROR("Can't support cpu partition number %u\n", ncpt);
+		return NULL;
+	}
+
+	LIBCFS_ALLOC(cptab, sizeof(*cptab));
+	if (cptab != NULL) {
+		cptab->ctb_version = CFS_CPU_VERSION_MAGIC;
+		cptab->ctb_nparts  = ncpt;
+	}
+
+	return cptab;
+}
+EXPORT_SYMBOL(cfs_cpt_table_alloc);
+
+/**
+ * Free a table obtained from cfs_cpt_table_alloc().
+ *
+ * Asserts that \a cptab carries CFS_CPU_VERSION_MAGIC before releasing it.
+ */
+void
+cfs_cpt_table_free(struct cfs_cpt_table *cptab)
+{
+	LASSERT(cptab->ctb_version == CFS_CPU_VERSION_MAGIC);
+
+	LIBCFS_FREE(cptab, sizeof(*cptab));
+}
+EXPORT_SYMBOL(cfs_cpt_table_free);
+
+/*
+ * The functions below are the implementations compiled when
+ * HAVE_LIBCFS_CPT is not defined.  They ignore their arguments and return
+ * fixed values (or do nothing).
+ */
+
+/** Always returns 1. */
+int
+cfs_cpt_number(struct cfs_cpt_table *cptab)
+{
+	return 1;
+}
+EXPORT_SYMBOL(cfs_cpt_number);
+
+/** Always returns 1. */
+int
+cfs_cpt_weight(struct cfs_cpt_table *cptab, int cpt)
+{
+	return 1;
+}
+EXPORT_SYMBOL(cfs_cpt_weight);
+
+/** Always returns 1. */
+int
+cfs_cpt_online(struct cfs_cpt_table *cptab, int cpt)
+{
+	return 1;
+}
+EXPORT_SYMBOL(cfs_cpt_online);
+
+/** No-op; always returns 1. */
+int
+cfs_cpt_set_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu)
+{
+	return 1;
+}
+EXPORT_SYMBOL(cfs_cpt_set_cpu);
+
+/** No-op. */
+void
+cfs_cpt_unset_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu)
+{
+}
+EXPORT_SYMBOL(cfs_cpt_unset_cpu);
+
+/** No-op; always returns 1. */
+int
+cfs_cpt_set_cpumask(struct cfs_cpt_table *cptab, int cpt, cpumask_t *mask)
+{
+	return 1;
+}
+EXPORT_SYMBOL(cfs_cpt_set_cpumask);
+
+/** No-op. */
+void
+cfs_cpt_unset_cpumask(struct cfs_cpt_table *cptab, int cpt, cpumask_t *mask)
+{
+}
+EXPORT_SYMBOL(cfs_cpt_unset_cpumask);
+
+/** No-op; always returns 1. */
+int
+cfs_cpt_set_node(struct cfs_cpt_table *cptab, int cpt, int node)
+{
+	return 1;
+}
+EXPORT_SYMBOL(cfs_cpt_set_node);
+
+/** No-op. */
+void
+cfs_cpt_unset_node(struct cfs_cpt_table *cptab, int cpt, int node)
+{
+}
+EXPORT_SYMBOL(cfs_cpt_unset_node);
+
+/** No-op; always returns 1. */
+int
+cfs_cpt_set_nodemask(struct cfs_cpt_table *cptab, int cpt, nodemask_t *mask)
+{
+	return 1;
+}
+EXPORT_SYMBOL(cfs_cpt_set_nodemask);
+
+/** No-op. */
+void
+cfs_cpt_unset_nodemask(struct cfs_cpt_table *cptab, int cpt, nodemask_t *mask)
+{
+}
+EXPORT_SYMBOL(cfs_cpt_unset_nodemask);
+
+/** No-op. */
+void
+cfs_cpt_clear(struct cfs_cpt_table *cptab, int cpt)
+{
+}
+EXPORT_SYMBOL(cfs_cpt_clear);
+
+/** Always returns 0. */
+int
+cfs_cpt_spread_node(struct cfs_cpt_table *cptab, int cpt)
+{
+	return 0;
+}
+EXPORT_SYMBOL(cfs_cpt_spread_node);
+
+/** Always returns 0. */
+int
+cfs_cpt_current(struct cfs_cpt_table *cptab, int remap)
+{
+	return 0;
+}
+EXPORT_SYMBOL(cfs_cpt_current);
+
+/** Always returns 0. */
+int
+cfs_cpt_of_cpu(struct cfs_cpt_table *cptab, int cpu)
+{
+	return 0;
+}
+EXPORT_SYMBOL(cfs_cpt_of_cpu);
+
+/** No-op; always returns 0. */
+int
+cfs_cpt_bind(struct cfs_cpt_table *cptab, int cpt)
+{
+	return 0;
+}
+EXPORT_SYMBOL(cfs_cpt_bind);
+
+/** Release the global CPU partition table, if one was set up. */
+void
+cfs_cpu_fini(void)
+{
+	if (cfs_cpt_table == NULL)
+		return;
+
+	cfs_cpt_table_free(cfs_cpt_table);
+	cfs_cpt_table = NULL;
+}
+
+/**
+ * Set up the global CPU partition table with a single partition.
+ *
+ * \retval 0  table allocated
+ * \retval -1 allocation failed
+ */
+int
+cfs_cpu_init(void)
+{
+	cfs_cpt_table = cfs_cpt_table_alloc(1);
+	if (cfs_cpt_table == NULL)
+		return -1;
+
+	return 0;
+}
+
+#endif /* HAVE_LIBCFS_CPT */

diff --git a/drivers/staging/lustre/lustre/libcfs/libcfs_lock.c b/drivers/staging/lustre/lustre/libcfs/libcfs_lock.c
new file mode 100644
index 0000000..8d6c4ad
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/libcfs_lock.c

@@ -0,0 +1,192 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Author: liang@whamcloud.com
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/libcfs/libcfs.h>
+
+
+/**
+ * destroy cpu-partition lock, see libcfs_private.h for more detail
+ *
+ * Asserts that the per-partition locks exist and that the lock is not
+ * currently held exclusively, then frees the per-partition locks and the
+ * descriptor itself.
+ */
+void
+cfs_percpt_lock_free(struct cfs_percpt_lock *pcl)
+{
+	LASSERT(pcl->pcl_locks != NULL);
+	LASSERT(!pcl->pcl_locked);
+
+	cfs_percpt_free(pcl->pcl_locks);
+	LIBCFS_FREE(pcl, sizeof(*pcl));
+}
+EXPORT_SYMBOL(cfs_percpt_lock_free);
+
+/**
+ * Create a cpu-partition lock; see libcfs_private.h for more detail.
+ *
+ * The lock targets large SMP systems, where cacheline contention has to be
+ * kept as low as possible, which is why the individual spinlocks come from
+ * the per-partition allocator rather than from one shared array.
+ *
+ * \retval valid-pointer lock with every per-partition spinlock initialised
+ * \retval NULL		 allocation failure
+ */
+struct cfs_percpt_lock *
+cfs_percpt_lock_alloc(struct cfs_cpt_table *cptab)
+{
+	struct cfs_percpt_lock	*pcl;
+	spinlock_t		*lock;
+	int			i;
+
+	/* NB: cptab can be NULL, pcl will be for HW CPUs on that case */
+	LIBCFS_ALLOC(pcl, sizeof(*pcl));
+	if (pcl == NULL)
+		return NULL;
+
+	pcl->pcl_cptab = cptab;
+	pcl->pcl_locks = cfs_percpt_alloc(cptab, sizeof(spinlock_t));
+	if (pcl->pcl_locks != NULL) {
+		cfs_percpt_for_each(lock, i, pcl->pcl_locks)
+			spin_lock_init(lock);
+
+		return pcl;
+	}
+
+	/* no per-partition storage: give the descriptor back */
+	LIBCFS_FREE(pcl, sizeof(*pcl));
+	return NULL;
+}
+EXPORT_SYMBOL(cfs_percpt_lock_alloc);
+
+/**
+ * lock a CPU partition
+ *
+ * \a index != CFS_PERCPT_LOCK_EX
+ *     hold private lock indexed by \a index
+ *
+ * \a index == CFS_PERCPT_LOCK_EX
+ *     exclusively lock @pcl and nobody can take private lock
+ *
+ * With a single partition every request, exclusive or not, maps onto
+ * lock 0.  With several partitions a caller first spins until no exclusive
+ * holder is flagged in \a pcl_locked; an exclusive request then takes every
+ * per-partition lock in ascending order.
+ */
+void
+cfs_percpt_lock(struct cfs_percpt_lock *pcl, int index)
+{
+	int	ncpt = cfs_cpt_number(pcl->pcl_cptab);
+	int	i;
+
+	LASSERT(index >= CFS_PERCPT_LOCK_EX && index < ncpt);
+
+	if (ncpt == 1) {
+		/* only one lock exists: exclusive and private coincide */
+		index = 0;
+	} else { /* serialize with exclusive lock */
+		while (pcl->pcl_locked)
+			cpu_relax();
+	}
+
+	if (likely(index != CFS_PERCPT_LOCK_EX)) {
+		spin_lock(pcl->pcl_locks[index]);
+		return;
+	}
+
+	/* exclusive lock request */
+	for (i = 0; i < ncpt; i++) {
+		spin_lock(pcl->pcl_locks[i]);
+		if (i == 0) {
+			/* flag is only written while holding lock 0 */
+			LASSERT(!pcl->pcl_locked);
+			/* nobody should take private lock after this
+			 * so I wouldn't starve for too long time */
+			pcl->pcl_locked = 1;
+		}
+	}
+}
+EXPORT_SYMBOL(cfs_percpt_lock);
+
+/**
+ * Unlock a CPU partition.
+ *
+ * Mirrors cfs_percpt_lock(): with a single partition lock 0 is released
+ * whatever \a index says; otherwise either the private lock \a index is
+ * released, or, for CFS_PERCPT_LOCK_EX, every per-partition lock is
+ * released from the highest index down, clearing the exclusive flag just
+ * before lock 0 is dropped.
+ */
+void
+cfs_percpt_unlock(struct cfs_percpt_lock *pcl, int index)
+{
+	int	ncpt = cfs_cpt_number(pcl->pcl_cptab);
+	int	i;
+
+	if (ncpt == 1)
+		index = 0;
+
+	if (likely(index != CFS_PERCPT_LOCK_EX)) {
+		spin_unlock(pcl->pcl_locks[index]);
+		return;
+	}
+
+	/* exclusive unlock: release in the reverse of acquisition order */
+	i = ncpt;
+	while (--i >= 0) {
+		if (i == 0) {
+			LASSERT(pcl->pcl_locked);
+			pcl->pcl_locked = 0;
+		}
+		spin_unlock(pcl->pcl_locks[i]);
+	}
+}
+EXPORT_SYMBOL(cfs_percpt_unlock);
+
+
+/**
+ * free cpu-partition refcount
+ *
+ * Thin wrapper that hands \a refs to cfs_percpt_free().
+ */
+void
+cfs_percpt_atomic_free(atomic_t **refs)
+{
+	cfs_percpt_free(refs);
+}
+EXPORT_SYMBOL(cfs_percpt_atomic_free);
+
+/**
+ * Allocate a set of cpu-partition refcounts, each starting at @init_val.
+ *
+ * \retval valid-pointer array of per-partition counters
+ * \retval NULL		 allocation failure
+ */
+atomic_t **
+cfs_percpt_atomic_alloc(struct cfs_cpt_table *cptab, int init_val)
+{
+	atomic_t	**refs = cfs_percpt_alloc(cptab, sizeof(atomic_t));
+	atomic_t	*counter;
+	int		cpt;
+
+	if (refs != NULL) {
+		cfs_percpt_for_each(counter, cpt, refs)
+			atomic_set(counter, init_val);
+	}
+
+	return refs;
+}
+EXPORT_SYMBOL(cfs_percpt_atomic_alloc);
+
+/**
+ * Add up all cpu-partition refs.
+ *
+ * Each counter is read on its own, so the total is not an atomic snapshot
+ * of the whole set.
+ */
+int
+cfs_percpt_atomic_summary(atomic_t **refs)
+{
+	atomic_t	*counter;
+	int		cpt;
+	int		total = 0;
+
+	cfs_percpt_for_each(counter, cpt, refs)
+		total += atomic_read(counter);
+
+	return total;
+}
+EXPORT_SYMBOL(cfs_percpt_atomic_summary);

diff --git a/drivers/staging/lustre/lustre/libcfs/libcfs_mem.c b/drivers/staging/lustre/lustre/libcfs/libcfs_mem.c
new file mode 100644
index 0000000..8791373
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/libcfs_mem.c

@@ -0,0 +1,205 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Author: liang@whamcloud.com
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/libcfs/libcfs.h>
+
+/*
+ * Header placed in front of the pointer array returned by cfs_percpt_alloc()
+ * and cfs_array_alloc().  Callers only see &va_ptrs[0]; the helpers in this
+ * file recover the header from that address with container_of().
+ */
+struct cfs_var_array {
+	unsigned int		va_count;	/* # of buffers */
+	unsigned int		va_size;	/* size of each var */
+	struct cfs_cpt_table	*va_cptab;	/* cpu partition table */
+	void			*va_ptrs[0];	/* buffer addresses */
+};
+
+/*
+ * Free per-partition data returned by cfs_percpt_alloc().
+ *
+ * @vars is the pointer array handed out by cfs_percpt_alloc(); every
+ * non-NULL buffer is released first, then the array header itself.
+ */
+void
+cfs_percpt_free(void *vars)
+{
+	struct cfs_var_array	*arr;
+	int			i = 0;
+
+	arr = container_of(vars, struct cfs_var_array, va_ptrs[0]);
+
+	/* release each partition's buffer, skipping empty slots */
+	while (i < arr->va_count) {
+		if (arr->va_ptrs[i] != NULL)
+			LIBCFS_FREE(arr->va_ptrs[i], arr->va_size);
+		i++;
+	}
+
+	/* the header was sized to hold va_count pointers */
+	LIBCFS_FREE(arr, offsetof(struct cfs_var_array,
+				  va_ptrs[arr->va_count]));
+}
+EXPORT_SYMBOL(cfs_percpt_free);
+
+/*
+ * Allocate one memory block per CPU partition of @cptab.  The returned value
+ * is an array of pointers indexed by CPU partition ID, e.g.:
+ *
+ *	arr = cfs_percpt_alloc(cfs_cpu_pt, size);
+ *	arr[0] is the block for partition 0,
+ *	arr[1] is the block for partition 1,
+ *	arr[N] is the block for partition N.
+ *
+ * @size is rounded up with L1_CACHE_ALIGN() before the blocks are allocated.
+ * Returns NULL if any allocation fails.
+ */
+void *
+cfs_percpt_alloc(struct cfs_cpt_table *cptab, unsigned int size)
+{
+	struct cfs_var_array	*arr;
+	int			count = cfs_cpt_number(cptab);
+	int			i;
+
+	LIBCFS_ALLOC(arr, offsetof(struct cfs_var_array, va_ptrs[count]));
+	if (arr == NULL)
+		return NULL;
+
+	size = L1_CACHE_ALIGN(size);
+
+	arr->va_cptab	= cptab;
+	arr->va_count	= count;
+	arr->va_size	= size;
+
+	for (i = 0; i < count; i++) {
+		LIBCFS_CPT_ALLOC(arr->va_ptrs[i], cptab, i, size);
+		if (arr->va_ptrs[i] != NULL)
+			continue;
+
+		/* unwind: cfs_percpt_free() only frees non-NULL slots */
+		cfs_percpt_free((void *)&arr->va_ptrs[0]);
+		return NULL;
+	}
+
+	return (void *)&arr->va_ptrs[0];
+}
+EXPORT_SYMBOL(cfs_percpt_alloc);
+
+/*
+ * Return the number of entries in the per-partition array @vars, as
+ * recorded in the va_count field of its header.
+ */
+int
+cfs_percpt_number(void *vars)
+{
+	return container_of(vars, struct cfs_var_array, va_ptrs[0])->va_count;
+}
+EXPORT_SYMBOL(cfs_percpt_number);
+
+/*
+ * Return the memory block of @vars selected by cfs_cpt_current() for the
+ * table the array was allocated with, or NULL if that call returns a
+ * negative partition ID.
+ */
+void *
+cfs_percpt_current(void *vars)
+{
+	struct cfs_var_array	*hdr;
+	int			cur;
+
+	hdr = container_of(vars, struct cfs_var_array, va_ptrs[0]);
+	cur = cfs_cpt_current(hdr->va_cptab, 0);
+
+	return cur < 0 ? NULL : hdr->va_ptrs[cur];
+}
+EXPORT_SYMBOL(cfs_percpt_current);
+
+/*
+ * Return the memory block stored at slot @idx of the per-partition array
+ * @vars.  @idx must be within [0, va_count); this is enforced by LASSERT.
+ */
+void *
+cfs_percpt_index(void *vars, int idx)
+{
+	struct cfs_var_array *arr;
+
+	arr = container_of(vars, struct cfs_var_array, va_ptrs[0]);
+
+	LASSERT(idx >= 0 && idx < arr->va_count);
+	return arr->va_ptrs[idx];
+}
+EXPORT_SYMBOL(cfs_percpt_index);
+
+/*
+ * Free a variable array returned by cfs_array_alloc(): every non-NULL
+ * element first, then the array header.
+ */
+void
+cfs_array_free(void *vars)
+{
+	struct cfs_var_array	*arr;
+	int			i;
+
+	arr = container_of(vars, struct cfs_var_array, va_ptrs[0]);
+
+	for (i = 0; i < arr->va_count; i++) {
+		/* slots that were never filled are simply skipped */
+		if (arr->va_ptrs[i] != NULL)
+			LIBCFS_FREE(arr->va_ptrs[i], arr->va_size);
+	}
+
+	LIBCFS_FREE(arr, offsetof(struct cfs_var_array,
+				  va_ptrs[arr->va_count]));
+}
+EXPORT_SYMBOL(cfs_array_free);
+
+/*
+ * Allocate a variable array.  The returned value is an array of @count
+ * pointers, each referring to a separately allocated block of @size bytes.
+ * Returns NULL if the header or any block cannot be allocated.
+ */
+void *
+cfs_array_alloc(int count, unsigned int size)
+{
+	struct cfs_var_array	*arr;
+	int			i = 0;
+
+	LIBCFS_ALLOC(arr, offsetof(struct cfs_var_array, va_ptrs[count]));
+	if (arr == NULL)
+		return NULL;
+
+	arr->va_size	= size;
+	arr->va_count	= count;
+
+	while (i < count) {
+		LIBCFS_ALLOC(arr->va_ptrs[i], size);
+		if (arr->va_ptrs[i] == NULL)
+			goto unwind;
+		i++;
+	}
+
+	return (void *)&arr->va_ptrs[0];
+
+ unwind:
+	/* hand the partially filled array to cfs_array_free() */
+	cfs_array_free((void *)&arr->va_ptrs[0]);
+	return NULL;
+}
+EXPORT_SYMBOL(cfs_array_alloc);

diff --git a/drivers/staging/lustre/lustre/libcfs/libcfs_string.c b/drivers/staging/lustre/lustre/libcfs/libcfs_string.c
new file mode 100644
index 0000000..9edccc9
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/libcfs_string.c

@@ -0,0 +1,647 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * String manipulation functions.
+ *
+ * libcfs/libcfs/libcfs_string.c
+ *
+ * Author: Nathan Rutman <nathan.rutman@sun.com>
+ */
+
+#include <linux/libcfs/libcfs.h>
+
+/*
+ * Case-insensitive comparison of at most @n characters of @s1 and @s2.
+ *
+ * Returns 0 if the strings match (or @n is 0), otherwise the difference of
+ * the first pair of lower-cased characters that differ.  Returns 1 if either
+ * pointer is NULL.  non-0 = don't match.
+ *
+ * Characters are read as unsigned char throughout, so the loop comparison
+ * and the returned difference use the same values and tolower() never sees
+ * a negative argument.
+ */
+int cfs_strncasecmp(const char *s1, const char *s2, size_t n)
+{
+	const unsigned char *p1 = (const unsigned char *)s1;
+	const unsigned char *p2 = (const unsigned char *)s2;
+
+	if (p1 == NULL || p2 == NULL)
+		return 1;
+
+	for (; n != 0; n--, p1++, p2++) {
+		int c1 = tolower(*p1);
+		int c2 = tolower(*p2);
+
+		if (c1 != c2)
+			return c1 - c2;
+
+		/* both strings ended at the same place */
+		if (c1 == '\0')
+			break;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(cfs_strncasecmp);
+
+/*
+ * Convert a text string to a bitmask.
+ *
+ * @str      whitespace-separated token list, see the grammar below
+ * @bit2str  returns the name of bit @bit, or NULL if the bit has no name
+ * @oldmask  in: starting mask when @str begins with an operator;
+ *           out: resulting mask (written only on success)
+ * @minmask  starting mask for an absolute list, and the result of "-ALL"
+ * @allmask  result of "ALL" / "+ALL"
+ *
+ * Returns 0 on success, -EINVAL for a trailing operator or an unknown token.
+ */
+int cfs_str2mask(const char *str, const char *(*bit2str)(int bit),
+		 int *oldmask, int minmask, int allmask)
+{
+	const char *debugstr;
+	char op = 0;
+	int newmask = minmask, i, len, found = 0;
+	ENTRY;
+
+	/* <str> must be a list of tokens separated by whitespace
+	 * and optionally an operator ('+' or '-').  If an operator
+	 * appears first in <str>, '*oldmask' is used as the starting point
+	 * (relative), otherwise minmask is used (absolute).  An operator
+	 * applies to all following tokens up to the next operator. */
+	while (*str != 0) {
+		while (isspace(*str))
+			str++;
+		if (*str == 0)
+			break;
+		if (*str == '+' || *str == '-') {
+			op = *str++;
+			if (!found)
+				/* only if first token is relative */
+				newmask = *oldmask;
+			while (isspace(*str))
+				str++;
+			if (*str == 0)	  /* trailing op */
+				return -EINVAL;
+		}
+
+		/* find token length */
+		for (len = 0; str[len] != 0 && !isspace(str[len]) &&
+		      str[len] != '+' && str[len] != '-'; len++);
+
+		/* match token against the names of bits 0..31 */
+		found = 0;
+		for (i = 0; i < 32; i++) {
+			debugstr = bit2str(i);
+			if (debugstr != NULL &&
+			    strlen(debugstr) == len &&
+			    cfs_strncasecmp(str, debugstr, len) == 0) {
+				/* NOTE(review): for i == 31, (1 << i) shifts
+				 * into the sign bit of int */
+				if (op == '-')
+					newmask &= ~(1 << i);
+				else
+					newmask |= (1 << i);
+				found = 1;
+				break;
+			}
+		}
+		/* "ALL" is only tried when no bit name matched */
+		if (!found && len == 3 &&
+		    (cfs_strncasecmp(str, "ALL", len) == 0)) {
+			if (op == '-')
+				newmask = minmask;
+			else
+				newmask = allmask;
+			found = 1;
+		}
+		if (!found) {
+			CWARN("unknown mask '%.*s'.\n"
+			      "mask usage: [+|-]<all|type> ...\n", len, str);
+			return -EINVAL;
+		}
+		str += len;
+	}
+
+	*oldmask = newmask;
+	return 0;
+}
+EXPORT_SYMBOL(cfs_str2mask);
+
+/*
+ * Return a newly kmalloc()ed copy of @str, or NULL if the allocation fails.
+ * @flags is passed through to kmalloc() unchanged.
+ */
+char *cfs_strdup(const char *str, u_int32_t flags)
+{
+	size_t	 nbytes = strlen(str) + 1;	/* include the trailing NUL */
+	char	*copy = kmalloc(nbytes, flags);
+
+	if (copy != NULL)
+		memcpy(copy, str, nbytes);
+
+	return copy;
+}
+EXPORT_SYMBOL(cfs_strdup);
+
+/**
+ * cfs_{v}snprintf() return the number of characters actually stored in
+ * \a buf (not counting the trailing NUL) rather than the size that would
+ * have been printed, which is what the standard functions return.
+ */
+/* safe vsnprintf */
+int cfs_vsnprintf(char *buf, size_t size, const char *fmt, va_list args)
+{
+	int written;
+
+	LASSERT(size > 0);
+
+	written = vsnprintf(buf, size, fmt, args);
+	if (written >= size)
+		return size - 1;	/* output was truncated */
+
+	return written;
+}
+EXPORT_SYMBOL(cfs_vsnprintf);
+
+/* safe snprintf: variadic front end for cfs_vsnprintf() */
+int cfs_snprintf(char *buf, size_t size, const char *fmt, ...)
+{
+	va_list ap;
+	int	nprinted;
+
+	va_start(ap, fmt);
+	nprinted = cfs_vsnprintf(buf, size, fmt, ap);
+	va_end(ap);
+
+	return nprinted;
+}
+EXPORT_SYMBOL(cfs_snprintf);
+
+/*
+ * Get the first whitespace-delimited word out of @str.
+ *
+ * Leading whitespace is skipped and the word is NUL-terminated in place;
+ * at most @size characters of @str are scanned in total.  Returns a pointer
+ * to the start of the word (an empty string if @str is all whitespace).
+ */
+char *cfs_firststr(char *str, size_t size)
+{
+	size_t	 used = 0;
+	char	*tail;
+
+	/* trim leading spaces */
+	for (; used < size && *str && isspace(*str); used++)
+		str++;
+
+	/* anything left after the leading spaces? */
+	if (*str != '\0') {
+		tail = str;
+		for (; used < size && *tail != '\0' && !isspace(*tail); used++)
+			tail++;
+
+		/* cut the string right after the first word */
+		*tail = '\0';
+	}
+
+	return str;
+}
+EXPORT_SYMBOL(cfs_firststr);
+
+/*
+ * Strip leading and trailing white characters (as defined by cfs_iswhite())
+ * from @str.  Trailing ones are cut by writing a NUL into the buffer; the
+ * returned pointer is the first non-white character of @str.
+ */
+char *
+cfs_trimwhite(char *str)
+{
+	char *tail;
+
+	for (; cfs_iswhite(*str); str++)
+		;
+
+	/* back up from the terminating NUL over trailing white characters */
+	for (tail = str + strlen(str); tail > str; tail--) {
+		if (!cfs_iswhite(tail[-1]))
+			break;
+	}
+
+	*tail = 0;
+	return str;
+}
+EXPORT_SYMBOL(cfs_trimwhite);
+
+/**
+ * Extracts tokens from strings.
+ *
+ * Looks for \a delim in string \a next, sets \a res to point to
+ * substring before the delimiter, sets \a next right after the found
+ * delimiter.  If no delimiter is found, \a res covers the rest of the
+ * string and next->ls_str is set to NULL.  Leading and trailing white
+ * spaces are excluded from \a res; the string itself is not modified.
+ *
+ * \retval 1 if \a res points to a string of non-whitespace characters
+ * \retval 0 otherwise (next->ls_str is NULL, only white spaces are left,
+ * or the first non-whitespace character is the delimiter)
+ */
+int
+cfs_gettok(struct cfs_lstr *next, char delim, struct cfs_lstr *res)
+{
+	char *end;
+
+	if (next->ls_str == NULL)
+		return 0;
+
+	/* skip leading white spaces */
+	while (next->ls_len) {
+		if (!cfs_iswhite(*next->ls_str))
+			break;
+		next->ls_str++;
+		next->ls_len--;
+	}
+
+	if (next->ls_len == 0) /* whitespaces only */
+		return 0;
+
+	if (*next->ls_str == delim) {
+		/* first non-whitespace is the delimiter */
+		return 0;
+	}
+
+	res->ls_str = next->ls_str;
+	end = memchr(next->ls_str, delim, next->ls_len);
+	if (end == NULL) {
+		/* there is no delimiter in the string */
+		end = next->ls_str + next->ls_len;
+		next->ls_str = NULL;
+	} else {
+		next->ls_str = end + 1;
+		next->ls_len -= (end - res->ls_str + 1);
+	}
+
+	/* skip ending whitespaces; end starts one past the token */
+	while (--end != res->ls_str) {
+		if (!cfs_iswhite(*end))
+			break;
+	}
+
+	res->ls_len = end - res->ls_str + 1;
+	return 1;
+}
+EXPORT_SYMBOL(cfs_gettok);
+
+/**
+ * Converts string to integer.
+ *
+ * Accepts decimal and hexadecimal number recordings (base 0 is passed to
+ * strtoul()).  \a num is written with the converted value even when the
+ * function returns 0.
+ *
+ * NOTE(review): cfs_trimwhite() may advance \a str and writes a NUL after
+ * the last non-white character of the NUL-terminated string, so the buffer
+ * is modified and "str + nob" below is measured from the trimmed start -
+ * confirm callers expect this.
+ *
+ * \retval 1 if first \a nob chars of \a str convert to decimal or
+ * hexadecimal integer in the range [\a min, \a max]
+ * \retval 0 otherwise
+ */
+int
+cfs_str2num_check(char *str, int nob, unsigned *num,
+		  unsigned min, unsigned max)
+{
+	char	*endp;
+
+	str = cfs_trimwhite(str);
+	*num = strtoul(str, &endp, 0);
+	if (endp == str)
+		return 0;
+
+	/* only white characters may follow the number within nob bytes */
+	for (; endp < str + nob; endp++) {
+		if (!cfs_iswhite(*endp))
+			return 0;
+	}
+
+	return (*num >= min && *num <= max);
+}
+EXPORT_SYMBOL(cfs_str2num_check);
+
+/**
+ * Parses \<range_expr\> token of the syntax. If \a bracketed is false,
+ * \a src should only have a single token which can be \<number\> or  \*
+ *
+ * On success \a expr points to a newly allocated range_expr with
+ * range_expr::re_lo, range_expr::re_hi and range_expr::re_stride
+ * initialized from \a src, which must parse to
+ * \* |
+ * \<number\> |
+ * \<number\> '-' \<number\> |
+ * \<number\> '-' \<number\> '/' \<number\>
+ * (the two range forms are only accepted when \a bracketed is set).
+ * Every number, the stride included, is checked against [\a min, \a max]
+ * and the stride must not be zero.
+ *
+ * \retval 0 if \a src can be parsed
+ * \retval -EINVAL if \a src is malformed; nothing is stored in \a expr
+ * \retval -ENOMEM if the range_expr cannot be allocated
+ */
+int
+cfs_range_expr_parse(struct cfs_lstr *src, unsigned min, unsigned max,
+		     int bracketed, struct cfs_range_expr **expr)
+{
+	struct cfs_range_expr	*re;
+	struct cfs_lstr		tok;
+
+	LIBCFS_ALLOC(re, sizeof(*re));
+	if (re == NULL)
+		return -ENOMEM;
+
+	if (src->ls_len == 1 && src->ls_str[0] == '*') {
+		re->re_lo = min;
+		re->re_hi = max;
+		re->re_stride = 1;
+		goto out;
+	}
+
+	if (cfs_str2num_check(src->ls_str, src->ls_len,
+			      &re->re_lo, min, max)) {
+		/* <number> is parsed */
+		re->re_hi = re->re_lo;
+		re->re_stride = 1;
+		goto out;
+	}
+
+	if (!bracketed || !cfs_gettok(src, '-', &tok))
+		goto failed;
+
+	if (!cfs_str2num_check(tok.ls_str, tok.ls_len,
+			       &re->re_lo, min, max))
+		goto failed;
+
+	/* <number> - */
+	if (cfs_str2num_check(src->ls_str, src->ls_len,
+			      &re->re_hi, min, max)) {
+		/* <number> - <number> is parsed */
+		re->re_stride = 1;
+		goto out;
+	}
+
+	/*
+	 * go to check <number> '-' <number> '/' <number>; every failure
+	 * from here on must end in "failed", never fall through to "out"
+	 * with a partially filled expression
+	 */
+	if (!cfs_gettok(src, '/', &tok))
+		goto failed;
+
+	if (!cfs_str2num_check(tok.ls_str, tok.ls_len,
+			       &re->re_hi, min, max))
+		goto failed;
+
+	/* no '/' was found: cfs_gettok() left nothing behind it to parse */
+	if (src->ls_str == NULL)
+		goto failed;
+
+	/* <number> - <number> / ... */
+	if (!cfs_str2num_check(src->ls_str, src->ls_len,
+			       &re->re_stride, min, max))
+		goto failed;
+
+	/* a zero stride would make "% re_stride" in the users divide by 0 */
+	if (re->re_stride == 0)
+		goto failed;
+
+	/* <number> - <number> / <number> is parsed */
+ out:
+	*expr = re;
+	return 0;
+
+ failed:
+	LIBCFS_FREE(re, sizeof(*re));
+	return -EINVAL;
+}
+EXPORT_SYMBOL(cfs_range_expr_parse);
+
+/**
+ * Matches value (\a value) against ranges expression list \a expr_list.
+ *
+ * A range expression matches when \a value lies in [re_lo, re_hi] and its
+ * distance from re_lo is a multiple of re_stride.
+ *
+ * \retval 1 if \a value matches
+ * \retval 0 otherwise
+ */
+int
+cfs_expr_list_match(__u32 value, struct cfs_expr_list *expr_list)
+{
+	struct cfs_range_expr	*re;
+
+	list_for_each_entry(re, &expr_list->el_exprs, re_link) {
+		/* outside [re_lo, re_hi]: try the next expression */
+		if (value < re->re_lo || value > re->re_hi)
+			continue;
+
+		if ((value - re->re_lo) % re->re_stride == 0)
+			return 1;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(cfs_expr_list_match);
+
+/**
+ * Convert express list (\a expr_list) to an array of all matched values
+ *
+ * The values are counted in a first pass, then stored into an array
+ * allocated with LIBCFS_ALLOC and returned through \a valpp.  \a valpp is
+ * only written when a positive count is returned.
+ *
+ * \retval N N is total number of all matched values
+ * \retval 0 if expression list is empty
+ * \retval -EINVAL if more than \a max values would be produced
+ * \retval -ENOMEM if the value array cannot be allocated
+ */
+int
+cfs_expr_list_values(struct cfs_expr_list *expr_list, int max, __u32 **valpp)
+{
+	struct cfs_range_expr	*expr;
+	__u32			*val;
+	int			count = 0;
+	int			i;
+
+	/* first pass: count the values so the array can be sized */
+	list_for_each_entry(expr, &expr_list->el_exprs, re_link) {
+		for (i = expr->re_lo; i <= expr->re_hi; i++) {
+			if (((i - expr->re_lo) % expr->re_stride) == 0)
+				count++;
+		}
+	}
+
+	if (count == 0) /* empty expression list */
+		return 0;
+
+	if (count > max) {
+		/* arguments follow the order of the message text */
+		CERROR("Number of values %d exceeds max allowed %d\n",
+		       count, max);
+		return -EINVAL;
+	}
+
+	LIBCFS_ALLOC(val, sizeof(val[0]) * count);
+	if (val == NULL)
+		return -ENOMEM;
+
+	/* second pass: store the values in list order */
+	count = 0;
+	list_for_each_entry(expr, &expr_list->el_exprs, re_link) {
+		for (i = expr->re_lo; i <= expr->re_hi; i++) {
+			if (((i - expr->re_lo) % expr->re_stride) == 0)
+				val[count++] = i;
+		}
+	}
+
+	*valpp = val;
+	return count;
+}
+EXPORT_SYMBOL(cfs_expr_list_values);
+
+/**
+ * Frees cfs_range_expr structures of \a expr_list, then \a expr_list
+ * itself.
+ *
+ * \retval none
+ */
+void
+cfs_expr_list_free(struct cfs_expr_list *expr_list)
+{
+	while (!list_empty(&expr_list->el_exprs)) {
+		struct cfs_range_expr *expr;
+
+		/* unlink the first expression before releasing it */
+		expr = list_entry(expr_list->el_exprs.next,
+				  struct cfs_range_expr, re_link);
+		list_del(&expr->re_link);
+		LIBCFS_FREE(expr, sizeof(*expr));
+	}
+
+	LIBCFS_FREE(expr_list, sizeof(*expr_list));
+}
+EXPORT_SYMBOL(cfs_expr_list_free);
+
+/**
+ * Dumps every range expression of \a expr_list to the debug log at
+ * D_WARNING level, one "lo-hi/stride" line per expression.
+ *
+ * The fields are filled through the "unsigned *" parameter of
+ * cfs_str2num_check(), so they are printed with %u.
+ */
+void
+cfs_expr_list_print(struct cfs_expr_list *expr_list)
+{
+	struct cfs_range_expr *expr;
+
+	list_for_each_entry(expr, &expr_list->el_exprs, re_link) {
+		CDEBUG(D_WARNING, "%u-%u/%u\n",
+		       expr->re_lo, expr->re_hi, expr->re_stride);
+	}
+}
+EXPORT_SYMBOL(cfs_expr_list_print);
+
+/**
+ * Parses \<cfs_expr_list\> token of the syntax.
+ *
+ * \a str (of length \a len) is either a single \<number\> or '*', or a
+ * bracketed, comma-separated list of range expressions such as "[1,3-9/2]".
+ * Each number is checked against [\a min, \a max].  On success the newly
+ * allocated list is returned through \a elpp; on failure nothing is stored
+ * there and everything allocated here is freed.
+ *
+ * \retval 0 if \a str parses to \<number\> | \<expr_list\>
+ * \retval -EINVAL if \a str is empty or malformed
+ * \retval -ENOMEM if an allocation fails
+ */
+int
+cfs_expr_list_parse(char *str, int len, unsigned min, unsigned max,
+		    struct cfs_expr_list **elpp)
+{
+	struct cfs_expr_list	*expr_list;
+	struct cfs_range_expr	*expr;
+	struct cfs_lstr		src;
+	int			rc;
+
+	/* nothing to parse; also keeps str[len - 1] below in bounds */
+	if (str == NULL || len <= 0)
+		return -EINVAL;
+
+	LIBCFS_ALLOC(expr_list, sizeof(*expr_list));
+	if (expr_list == NULL)
+		return -ENOMEM;
+
+	src.ls_str = str;
+	src.ls_len = len;
+
+	INIT_LIST_HEAD(&expr_list->el_exprs);
+
+	if (src.ls_str[0] == '[' &&
+	    src.ls_str[src.ls_len - 1] == ']') {
+		/* strip the brackets and parse the comma-separated body */
+		src.ls_str++;
+		src.ls_len -= 2;
+
+		rc = -EINVAL;
+		while (src.ls_str != NULL) {
+			struct cfs_lstr tok;
+
+			if (!cfs_gettok(&src, ',', &tok)) {
+				rc = -EINVAL;
+				break;
+			}
+
+			rc = cfs_range_expr_parse(&tok, min, max, 1, &expr);
+			if (rc != 0)
+				break;
+
+			list_add_tail(&expr->re_link,
+					  &expr_list->el_exprs);
+		}
+	} else {
+		/* unbracketed: a single <number> or '*' */
+		rc = cfs_range_expr_parse(&src, min, max, 0, &expr);
+		if (rc == 0) {
+			list_add_tail(&expr->re_link,
+					  &expr_list->el_exprs);
+		}
+	}
+
+	if (rc != 0)
+		cfs_expr_list_free(expr_list);
+	else
+		*elpp = expr_list;
+
+	return rc;
+}
+EXPORT_SYMBOL(cfs_expr_list_parse);
+
+/**
+ * Frees cfs_expr_list structures of \a list.
+ *
+ * Each struct cfs_expr_list found on \a list is unlinked and passed to
+ * cfs_expr_list_free(), which releases its range_expr entries and the
+ * cfs_expr_list itself.  \a list is empty on return.
+ *
+ * \retval none
+ */
+void
+cfs_expr_list_free_list(struct list_head *list)
+{
+	struct cfs_expr_list *head;
+
+	for (;;) {
+		if (list_empty(list))
+			break;
+
+		/* detach the first expression list, then release it */
+		head = list_entry(list->next, struct cfs_expr_list, el_link);
+		list_del(&head->el_link);
+		cfs_expr_list_free(head);
+	}
+}
+EXPORT_SYMBOL(cfs_expr_list_free_list);
+
+/**
+ * Parses a dotted address pattern into expression lists.
+ *
+ * \a str (of length \a len) is split on '.'; every component is parsed
+ * by cfs_expr_list_parse() with the range [0, 255] and appended to
+ * \a list.  Exactly four components are required.
+ *
+ * \retval 0 if \a str parses to four components
+ * \retval -EINVAL or the error of cfs_expr_list_parse() otherwise; in that
+ * case every entry on \a list is freed
+ */
+int
+cfs_ip_addr_parse(char *str, int len, struct list_head *list)
+{
+	struct cfs_expr_list	*el;
+	struct cfs_lstr		src;
+	int			nparts;
+	int			rc;
+
+	src.ls_str = str;
+	src.ls_len = len;
+
+	for (nparts = 0; src.ls_str != NULL; nparts++) {
+		struct cfs_lstr res;
+
+		if (!cfs_gettok(&src, '.', &res)) {
+			rc = -EINVAL;
+			goto out;
+		}
+
+		rc = cfs_expr_list_parse(res.ls_str, res.ls_len, 0, 255, &el);
+		if (rc != 0)
+			goto out;
+
+		list_add_tail(&el->el_link, list);
+	}
+
+	if (nparts != 4) {
+		rc = -EINVAL;
+		goto out;
+	}
+
+	return 0;
+
+ out:
+	cfs_expr_list_free_list(list);
+	return rc;
+}
+EXPORT_SYMBOL(cfs_ip_addr_parse);
+
+/**
+ * Matches address (\a addr) against address set encoded in \a list.
+ *
+ * \a list is walked from its last entry to its first while \a addr is
+ * consumed from its lowest byte upwards; every byte has to match its
+ * expression list and \a list must hold exactly four entries.
+ *
+ * \retval 1 if \a addr matches
+ * \retval 0 otherwise
+ */
+int
+cfs_ip_addr_match(__u32 addr, struct list_head *list)
+{
+	struct cfs_expr_list	*el;
+	int			matched = 0;
+
+	list_for_each_entry_reverse(el, list, el_link) {
+		if (!cfs_expr_list_match(addr & 0xff, el))
+			return 0;
+
+		matched++;
+		addr >>= 8;
+	}
+
+	return matched == 4;
+}
+EXPORT_SYMBOL(cfs_ip_addr_match);
+
+/**
+ * Frees the expression lists on \a list.
+ *
+ * Thin wrapper around cfs_expr_list_free_list().
+ */
+void
+cfs_ip_addr_free(struct list_head *list)
+{
+	cfs_expr_list_free_list(list);
+}
+EXPORT_SYMBOL(cfs_ip_addr_free);

diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-cpu.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-cpu.c
new file mode 100644
index 0000000..95142d1
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/linux/linux-cpu.c

@@ -0,0 +1,1085 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Author: liang@whamcloud.com
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/cpu.h>
+#include <linux/sched.h>
+#include <linux/libcfs/libcfs.h>
+
+#ifdef CONFIG_SMP
+
+/**
+ * modparam for setting number of partitions
+ *
+ *  0 : estimate best value based on cores or NUMA nodes
+ *  1 : disable multiple partitions
+ * >1 : specify number of partitions
+ */
+static int	cpu_npartitions;
+CFS_MODULE_PARM(cpu_npartitions, "i", int, 0444, "# of CPU partitions");
+
+/**
+ * modparam for setting CPU partitions patterns:
+ *
+ * e.g. "0[0,1,2,3] 1[4,5,6,7]": the number before a bracket is the CPU
+ *      partition ID, the numbers in the bracket are processor IDs
+ *      (core or HT)
+ *
+ * e.g. "N 0[0,1] 1[2,3]": the leading character 'N' means the numbers in
+ *      brackets are NUMA node IDs; the number before a bracket is still
+ *      the CPU partition ID.
+ *
+ * NB: If user specified cpu_pattern, cpu_npartitions will be ignored
+ */
+static char	*cpu_pattern = "";
+CFS_MODULE_PARM(cpu_pattern, "s", charp, 0444, "CPU partitions pattern");
+
+/* file-wide state shared by the CPU partition helpers below */
+struct cfs_cpt_data {
+	/* serialize hotplug etc */
+	spinlock_t		cpt_lock;
+	/* reserved for hotplug */
+	unsigned long		cpt_version;
+	/* mutex to protect cpt_cpumask (a semaphore taken with down/up) */
+	struct semaphore	cpt_mutex;
+	/* scratch buffer for set/unset_node */
+	cpumask_t		*cpt_cpumask;
+};
+
+/* the single instance used by this file */
+static struct cfs_cpt_data	cpt_data;
+
+/* copy topology_core_cpumask() of \a cpu into \a mask */
+void
+cfs_cpu_core_siblings(int cpu, cpumask_t *mask)
+{
+	/* return cpumask of cores in the same socket */
+	cpumask_copy(mask, topology_core_cpumask(cpu));
+}
+EXPORT_SYMBOL(cfs_cpu_core_siblings);
+
+/*
+ * Return the number of cores in the same socket as \a cpu.  The shared
+ * scratch mask in cpt_data is used for the computation, so cpt_mutex is
+ * held around it.
+ */
+int
+cfs_cpu_core_nsiblings(int cpu)
+{
+	int	weight;
+
+	down(&cpt_data.cpt_mutex);
+	cfs_cpu_core_siblings(cpu, cpt_data.cpt_cpumask);
+	weight = cpus_weight(*cpt_data.cpt_cpumask);
+	up(&cpt_data.cpt_mutex);
+
+	return weight;
+}
+EXPORT_SYMBOL(cfs_cpu_core_nsiblings);
+
+/*
+ * return cpumask of HTs in the same core: copies
+ * topology_thread_cpumask() of \a cpu into \a mask
+ */
+void
+cfs_cpu_ht_siblings(int cpu, cpumask_t *mask)
+{
+	cpumask_copy(mask, topology_thread_cpumask(cpu));
+}
+EXPORT_SYMBOL(cfs_cpu_ht_siblings);
+
+/*
+ * Return the number of HTs in the same core as \a cpu.  The shared scratch
+ * mask in cpt_data is used for the computation, so cpt_mutex is held
+ * around it.
+ */
+int
+cfs_cpu_ht_nsiblings(int cpu)
+{
+	int	weight;
+
+	down(&cpt_data.cpt_mutex);
+	cfs_cpu_ht_siblings(cpu, cpt_data.cpt_cpumask);
+	weight = cpus_weight(*cpt_data.cpt_cpumask);
+	up(&cpt_data.cpt_mutex);
+
+	return weight;
+}
+EXPORT_SYMBOL(cfs_cpu_ht_nsiblings);
+
+/* copy cpumask_of_node() of NUMA node \a node into \a mask */
+void
+cfs_node_to_cpumask(int node, cpumask_t *mask)
+{
+	cpumask_copy(mask, cpumask_of_node(node));
+}
+EXPORT_SYMBOL(cfs_node_to_cpumask);
+
+/*
+ * Free a CPU partition table and everything hanging off it.  Members that
+ * are still NULL are skipped, so a partially constructed table (as left by
+ * a failed cfs_cpt_table_alloc()) can be passed in.
+ */
+void
+cfs_cpt_table_free(struct cfs_cpt_table *cptab)
+{
+	int	i;
+
+	if (cptab->ctb_cpu2cpt != NULL) {
+		LIBCFS_FREE(cptab->ctb_cpu2cpt,
+			    num_possible_cpus() *
+			    sizeof(cptab->ctb_cpu2cpt[0]));
+	}
+
+	if (cptab->ctb_parts != NULL) {
+		/* per-partition masks first, then the partition array */
+		for (i = 0; i < cptab->ctb_nparts; i++) {
+			struct cfs_cpu_partition *part = &cptab->ctb_parts[i];
+
+			if (part->cpt_nodemask != NULL) {
+				LIBCFS_FREE(part->cpt_nodemask,
+					    sizeof(*part->cpt_nodemask));
+			}
+
+			if (part->cpt_cpumask != NULL)
+				LIBCFS_FREE(part->cpt_cpumask, cpumask_size());
+		}
+
+		LIBCFS_FREE(cptab->ctb_parts,
+			    cptab->ctb_nparts * sizeof(cptab->ctb_parts[0]));
+	}
+
+	if (cptab->ctb_nodemask != NULL)
+		LIBCFS_FREE(cptab->ctb_nodemask, sizeof(*cptab->ctb_nodemask));
+	if (cptab->ctb_cpumask != NULL)
+		LIBCFS_FREE(cptab->ctb_cpumask, cpumask_size());
+
+	LIBCFS_FREE(cptab, sizeof(*cptab));
+}
+EXPORT_SYMBOL(cfs_cpt_table_free);
+
+/*
+ * Allocate a CPU partition table with \a ncpt partitions.
+ *
+ * Allocates the table, its cpu and node masks, the cpu-to-partition map
+ * (every entry initialised to -1) and one cpumask/nodemask pair per
+ * partition.  No CPU is assigned to any partition here.
+ *
+ * Returns the new table, or NULL if any allocation fails (whatever was
+ * allocated so far is released through cfs_cpt_table_free()).
+ */
+struct cfs_cpt_table *
+cfs_cpt_table_alloc(unsigned int ncpt)
+{
+	struct cfs_cpt_table *cptab;
+	int	i;
+
+	LIBCFS_ALLOC(cptab, sizeof(*cptab));
+	if (cptab == NULL)
+		return NULL;
+
+	/* set before any "goto failed": cfs_cpt_table_free() reads it */
+	cptab->ctb_nparts = ncpt;
+
+	LIBCFS_ALLOC(cptab->ctb_cpumask, cpumask_size());
+	LIBCFS_ALLOC(cptab->ctb_nodemask, sizeof(*cptab->ctb_nodemask));
+
+	if (cptab->ctb_cpumask == NULL || cptab->ctb_nodemask == NULL)
+		goto failed;
+
+	LIBCFS_ALLOC(cptab->ctb_cpu2cpt,
+		     num_possible_cpus() * sizeof(cptab->ctb_cpu2cpt[0]));
+	if (cptab->ctb_cpu2cpt == NULL)
+		goto failed;
+
+	/* all bytes 0xff: every entry reads back as -1, i.e. "unassigned" */
+	memset(cptab->ctb_cpu2cpt, -1,
+	       num_possible_cpus() * sizeof(cptab->ctb_cpu2cpt[0]));
+
+	LIBCFS_ALLOC(cptab->ctb_parts, ncpt * sizeof(cptab->ctb_parts[0]));
+	if (cptab->ctb_parts == NULL)
+		goto failed;
+
+	for (i = 0; i < ncpt; i++) {
+		struct cfs_cpu_partition *part = &cptab->ctb_parts[i];
+
+		LIBCFS_ALLOC(part->cpt_cpumask, cpumask_size());
+		LIBCFS_ALLOC(part->cpt_nodemask, sizeof(*part->cpt_nodemask));
+		if (part->cpt_cpumask == NULL || part->cpt_nodemask == NULL)
+			goto failed;
+	}
+
+	spin_lock(&cpt_data.cpt_lock);
+	/* Reserved for hotplug */
+	cptab->ctb_version = cpt_data.cpt_version;
+	spin_unlock(&cpt_data.cpt_lock);
+
+	return cptab;
+
+ failed:
+	cfs_cpt_table_free(cptab);
+	return NULL;
+}
+EXPORT_SYMBOL(cfs_cpt_table_alloc);
+
+/*
+ * Format \a cptab into \a buf (of \a len bytes): one line per partition,
+ * "<partition id>\t: " followed by the space-separated CPU ids of the
+ * partition and a newline.
+ *
+ * Returns the number of bytes written, or -EFBIG if \a buf is too small.
+ */
+int
+cfs_cpt_table_print(struct cfs_cpt_table *cptab, char *buf, int len)
+{
+	char	*tmp = buf;
+	int	rc = 0;
+	int	i;
+	int	j;
+
+	for (i = 0; i < cptab->ctb_nparts; i++) {
+		if (len > 0) {
+			rc = snprintf(tmp, len, "%d\t: ", i);
+			len -= rc;
+		}
+
+		/* no room left, or the snprintf() above was truncated */
+		if (len <= 0) {
+			rc = -EFBIG;
+			goto out;
+		}
+
+		tmp += rc;
+		for_each_cpu_mask(j, *cptab->ctb_parts[i].cpt_cpumask) {
+			rc = snprintf(tmp, len, "%d ", j);
+			len -= rc;
+			if (len <= 0) {
+				rc = -EFBIG;
+				goto out;
+			}
+			tmp += rc;
+		}
+
+		/* overwrites the NUL left by the last snprintf() */
+		*tmp = '\n';
+		tmp++;
+		len--;
+	}
+
+ out:
+	if (rc < 0)
+		return rc;
+
+	return tmp - buf;
+}
+EXPORT_SYMBOL(cfs_cpt_table_print);
+
+/* return the number of partitions (ctb_nparts) of \a cptab */
+int
+cfs_cpt_number(struct cfs_cpt_table *cptab)
+{
+	return cptab->ctb_nparts;
+}
+EXPORT_SYMBOL(cfs_cpt_number);
+
+/*
+ * Return the number of CPUs set in partition \a cpt of \a cptab, or in the
+ * whole table when \a cpt is CFS_CPT_ANY.
+ */
+int
+cfs_cpt_weight(struct cfs_cpt_table *cptab, int cpt)
+{
+	LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));
+
+	if (cpt == CFS_CPT_ANY)
+		return cpus_weight(*cptab->ctb_cpumask);
+
+	return cpus_weight(*cptab->ctb_parts[cpt].cpt_cpumask);
+}
+EXPORT_SYMBOL(cfs_cpt_weight);
+
+/*
+ * Return non-zero if any_online_cpu() finds a CPU in partition \a cpt of
+ * \a cptab, or in the whole table when \a cpt is CFS_CPT_ANY.
+ */
+int
+cfs_cpt_online(struct cfs_cpt_table *cptab, int cpt)
+{
+	LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));
+
+	if (cpt == CFS_CPT_ANY)
+		return any_online_cpu(*cptab->ctb_cpumask) != NR_CPUS;
+
+	return any_online_cpu(*cptab->ctb_parts[cpt].cpt_cpumask) != NR_CPUS;
+}
+EXPORT_SYMBOL(cfs_cpt_online);
+
+/*
+ * Return the cpumask of partition \a cpt of \a cptab, or the cpumask of
+ * the whole table when \a cpt is CFS_CPT_ANY.
+ */
+cpumask_t *
+cfs_cpt_cpumask(struct cfs_cpt_table *cptab, int cpt)
+{
+	LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));
+
+	if (cpt == CFS_CPT_ANY)
+		return cptab->ctb_cpumask;
+
+	return cptab->ctb_parts[cpt].cpt_cpumask;
+}
+EXPORT_SYMBOL(cfs_cpt_cpumask);
+
+/*
+ * Return the nodemask of partition \a cpt of \a cptab, or the nodemask of
+ * the whole table when \a cpt is CFS_CPT_ANY.
+ */
+nodemask_t *
+cfs_cpt_nodemask(struct cfs_cpt_table *cptab, int cpt)
+{
+	LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));
+
+	if (cpt == CFS_CPT_ANY)
+		return cptab->ctb_nodemask;
+
+	return cptab->ctb_parts[cpt].cpt_nodemask;
+}
+EXPORT_SYMBOL(cfs_cpt_nodemask);
+
+/*
+ * Add CPU \a cpu to partition \a cpt of \a cptab.
+ *
+ * The CPU is recorded in the cpu-to-partition map, in the table-wide and
+ * per-partition cpumasks, and its NUMA node is set in both nodemasks.
+ *
+ * Returns 1 on success, 0 if \a cpu is out of range, offline, or already
+ * assigned to a partition of this table.
+ */
+int
+cfs_cpt_set_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu)
+{
+	int	node;
+
+	LASSERT(cpt >= 0 && cpt < cptab->ctb_nparts);
+
+	if (cpu < 0 || cpu >= NR_CPUS || !cpu_online(cpu)) {
+		CDEBUG(D_INFO, "CPU %d is invalid or it's offline\n", cpu);
+		return 0;
+	}
+
+	/* -1 means "not assigned to any partition yet" */
+	if (cptab->ctb_cpu2cpt[cpu] != -1) {
+		CDEBUG(D_INFO, "CPU %d is already in partition %d\n",
+		       cpu, cptab->ctb_cpu2cpt[cpu]);
+		return 0;
+	}
+
+	cptab->ctb_cpu2cpt[cpu] = cpt;
+
+	/* the masks must agree with the map that was just checked */
+	LASSERT(!cpu_isset(cpu, *cptab->ctb_cpumask));
+	LASSERT(!cpu_isset(cpu, *cptab->ctb_parts[cpt].cpt_cpumask));
+
+	cpu_set(cpu, *cptab->ctb_cpumask);
+	cpu_set(cpu, *cptab->ctb_parts[cpt].cpt_cpumask);
+
+	node = cpu_to_node(cpu);
+
+	/* first CPU of @node in this CPT table */
+	if (!node_isset(node, *cptab->ctb_nodemask))
+		node_set(node, *cptab->ctb_nodemask);
+
+	/* first CPU of @node in this partition */
+	if (!node_isset(node, *cptab->ctb_parts[cpt].cpt_nodemask))
+		node_set(node, *cptab->ctb_parts[cpt].cpt_nodemask);
+
+	return 1;
+}
+EXPORT_SYMBOL(cfs_cpt_set_cpu);
+
+/*
+ * Remove CPU \a cpu from partition \a cpt of \a cptab.
+ *
+ * \a cpt may be CFS_CPT_ANY, in which case the partition is looked up in
+ * the cpu-to-partition map.  The CPU is cleared from the per-partition and
+ * table-wide cpumasks; its NUMA node is cleared from a nodemask when no
+ * other CPU of that node remains in the corresponding cpumask.
+ *
+ * Nothing is changed if \a cpu is out of range or is not in the given
+ * partition (or in no partition at all for CFS_CPT_ANY).
+ */
+void
+cfs_cpt_unset_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu)
+{
+	int	node;
+	int	i;
+
+	LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));
+
+	if (cpu < 0 || cpu >= NR_CPUS) {
+		CDEBUG(D_INFO, "Invalid CPU id %d\n", cpu);
+		return;
+	}
+
+	if (cpt == CFS_CPT_ANY) {
+		/* caller doesn't know the partition ID */
+		cpt = cptab->ctb_cpu2cpt[cpu];
+		if (cpt < 0) { /* not set in this CPT-table */
+			/* report the CPU id, cpt is just -1 here */
+			CDEBUG(D_INFO, "Try to unset cpu %d which is "
+				       "not in CPT-table %p\n", cpu, cptab);
+			return;
+		}
+
+	} else if (cpt != cptab->ctb_cpu2cpt[cpu]) {
+		CDEBUG(D_INFO,
+		       "CPU %d is not in cpu-partition %d\n", cpu, cpt);
+		return;
+	}
+
+	LASSERT(cpu_isset(cpu, *cptab->ctb_parts[cpt].cpt_cpumask));
+	LASSERT(cpu_isset(cpu, *cptab->ctb_cpumask));
+
+	cpu_clear(cpu, *cptab->ctb_parts[cpt].cpt_cpumask);
+	cpu_clear(cpu, *cptab->ctb_cpumask);
+	cptab->ctb_cpu2cpt[cpu] = -1;
+
+	node = cpu_to_node(cpu);
+
+	LASSERT(node_isset(node, *cptab->ctb_parts[cpt].cpt_nodemask));
+	LASSERT(node_isset(node, *cptab->ctb_nodemask));
+
+	for_each_cpu_mask(i, *cptab->ctb_parts[cpt].cpt_cpumask) {
+		/* this CPT has other CPU belonging to this node? */
+		if (cpu_to_node(i) == node)
+			break;
+	}
+
+	/* the loop ran to completion: no CPU of @node is left */
+	if (i == NR_CPUS)
+		node_clear(node, *cptab->ctb_parts[cpt].cpt_nodemask);
+
+	for_each_cpu_mask(i, *cptab->ctb_cpumask) {
+		/* this CPT-table has other CPU belonging to this node? */
+		if (cpu_to_node(i) == node)
+			break;
+	}
+
+	if (i == NR_CPUS)
+		node_clear(node, *cptab->ctb_nodemask);
+
+	return;
+}
+EXPORT_SYMBOL(cfs_cpt_unset_cpu);
+
+/*
+ * Add every CPU of \a mask to partition \a cpt of \a cptab.
+ *
+ * Returns 1 if all CPUs were added, 0 if \a mask is empty, holds no online
+ * CPU, or cfs_cpt_set_cpu() refused one of them (CPUs added before that
+ * point stay in the partition).
+ */
+int
+cfs_cpt_set_cpumask(struct cfs_cpt_table *cptab, int cpt, cpumask_t *mask)
+{
+	int	cpu;
+
+	if (cpus_weight(*mask) == 0 || any_online_cpu(*mask) == NR_CPUS) {
+		CDEBUG(D_INFO, "No online CPU is found in the CPU mask "
+			       "for CPU partition %d\n", cpt);
+		return 0;
+	}
+
+	for_each_cpu_mask(cpu, *mask) {
+		if (cfs_cpt_set_cpu(cptab, cpt, cpu))
+			continue;
+
+		return 0;
+	}
+
+	return 1;
+}
+EXPORT_SYMBOL(cfs_cpt_set_cpumask);
+
+/* remove every CPU of \a mask from partition \a cpt of \a cptab */
+void
+cfs_cpt_unset_cpumask(struct cfs_cpt_table *cptab, int cpt, cpumask_t *mask)
+{
+	int	cpu;
+
+	for_each_cpu_mask(cpu, *mask) {
+		cfs_cpt_unset_cpu(cptab, cpt, cpu);
+	}
+}
+EXPORT_SYMBOL(cfs_cpt_unset_cpumask);
+
+/*
+ * Add all CPUs of NUMA node \a node to partition \a cpt of \a cptab.
+ *
+ * The node's cpumask is built in the shared scratch mask of cpt_data, so
+ * cpt_mutex is held for the duration.  Returns 0 for an invalid node id,
+ * otherwise the result of cfs_cpt_set_cpumask().
+ */
+int
+cfs_cpt_set_node(struct cfs_cpt_table *cptab, int cpt, int node)
+{
+	int	rc;
+
+	if (node < 0 || node >= MAX_NUMNODES) {
+		CDEBUG(D_INFO,
+		       "Invalid NUMA id %d for CPU partition %d\n", node, cpt);
+		return 0;
+	}
+
+	down(&cpt_data.cpt_mutex);
+	cfs_node_to_cpumask(node, cpt_data.cpt_cpumask);
+	rc = cfs_cpt_set_cpumask(cptab, cpt, cpt_data.cpt_cpumask);
+	up(&cpt_data.cpt_mutex);
+
+	return rc;
+}
+EXPORT_SYMBOL(cfs_cpt_set_node);
+
+/*
+ * Remove all CPUs of NUMA node \a node from partition \a cpt of \a cptab.
+ *
+ * The node's cpumask is built in the shared scratch mask of cpt_data, so
+ * cpt_mutex is held for the duration.  An invalid node id is only logged.
+ */
+void
+cfs_cpt_unset_node(struct cfs_cpt_table *cptab, int cpt, int node)
+{
+	if (node < 0 || node >= MAX_NUMNODES) {
+		CDEBUG(D_INFO,
+		       "Invalid NUMA id %d for CPU partition %d\n", node, cpt);
+		return;
+	}
+
+	down(&cpt_data.cpt_mutex);
+	cfs_node_to_cpumask(node, cpt_data.cpt_cpumask);
+	cfs_cpt_unset_cpumask(cptab, cpt, cpt_data.cpt_cpumask);
+	up(&cpt_data.cpt_mutex);
+}
+EXPORT_SYMBOL(cfs_cpt_unset_node);
+
+/**
+ * Add the CPUs of every node set in \a mask to partition \a cpt.
+ *
+ * Stops at, and returns 0 for, the first node that cfs_cpt_set_node()
+ * fails on; returns 1 when every node was added.
+ */
+int
+cfs_cpt_set_nodemask(struct cfs_cpt_table *cptab, int cpt, nodemask_t *mask)
+{
+	int	node;
+
+	for_each_node_mask(node, *mask) {
+		int	added = cfs_cpt_set_node(cptab, cpt, node);
+
+		if (added == 0)
+			return 0;
+	}
+
+	return 1;
+}
+EXPORT_SYMBOL(cfs_cpt_set_nodemask);
+
+/* Remove the CPUs of every node set in \a mask from partition \a cpt. */
+void
+cfs_cpt_unset_nodemask(struct cfs_cpt_table *cptab, int cpt, nodemask_t *mask)
+{
+	int	node;
+
+	for_each_node_mask(node, *mask) {
+		cfs_cpt_unset_node(cptab, cpt, node);
+	}
+}
+EXPORT_SYMBOL(cfs_cpt_unset_nodemask);
+
+/**
+ * Remove all CPUs from partition \a cpt, or from every partition of
+ * \a cptab when \a cpt is CFS_CPT_ANY.
+ */
+void
+cfs_cpt_clear(struct cfs_cpt_table *cptab, int cpt)
+{
+	int	first = cpt;
+	int	last = cpt;
+	int	cpu;
+
+	if (cpt == CFS_CPT_ANY) {
+		first = 0;
+		last = cptab->ctb_nparts - 1;
+	}
+
+	for (cpt = first; cpt <= last; cpt++) {
+		for_each_cpu_mask(cpu, *cptab->ctb_parts[cpt].cpt_cpumask) {
+			cfs_cpt_unset_cpu(cptab, cpt, cpu);
+		}
+	}
+}
+EXPORT_SYMBOL(cfs_cpt_clear);
+
+/**
+ * Pick a HW node id for partition \a cpt of \a cptab in rotating order.
+ *
+ * When \a cpt is outside [0, ctb_nparts) the node is taken from the
+ * table-wide nodemask, otherwise from the partition's own nodemask.
+ * Each call advances the matching spread rotor by one.
+ */
+int
+cfs_cpt_spread_node(struct cfs_cpt_table *cptab, int cpt)
+{
+	nodemask_t	*mask;
+	int		weight;
+	int		rotor;
+	int		node;
+
+	/* convert CPU partition ID to HW node id */
+
+	if (cpt < 0 || cpt >= cptab->ctb_nparts) {
+		mask = cptab->ctb_nodemask;
+		rotor = cptab->ctb_spread_rotor++;
+	} else {
+		mask = cptab->ctb_parts[cpt].cpt_nodemask;
+		rotor = cptab->ctb_parts[cpt].cpt_spread_rotor++;
+	}
+
+	weight = nodes_weight(*mask);
+	LASSERT(weight > 0);
+
+	/* reduce the rotor to an index among the nodes set in the mask */
+	rotor %= weight;
+
+	/* walk the set nodes and return the rotor-th one */
+	for_each_node_mask(node, *mask) {
+		if (rotor-- == 0)
+			return node;
+	}
+
+	/* the loop above is expected to return for any rotor < weight */
+	LBUG();
+	return 0;
+}
+EXPORT_SYMBOL(cfs_cpt_spread_node);
+
+/**
+ * Return the partition of \a cptab that the executing CPU belongs to.
+ *
+ * A CPU that is not mapped yields its negative table entry when
+ * \a remap is 0, or "cpu modulo number of partitions" otherwise.
+ */
+int
+cfs_cpt_current(struct cfs_cpt_table *cptab, int remap)
+{
+	int	cpu = smp_processor_id();
+	int	cpt = cptab->ctb_cpu2cpt[cpu];
+
+	if (cpt >= 0 || !remap)
+		return cpt;
+
+	/* don't return negative value for safety of upper layer,
+	 * instead we shadow the unknown cpu to a valid partition ID */
+	return cpu % cptab->ctb_nparts;
+}
+EXPORT_SYMBOL(cfs_cpt_current);
+
+/* Look up the partition id recorded for \a cpu in \a cptab. */
+int
+cfs_cpt_of_cpu(struct cfs_cpt_table *cptab, int cpu)
+{
+	int	cpt;
+
+	LASSERT(cpu >= 0 && cpu < NR_CPUS);
+
+	cpt = cptab->ctb_cpu2cpt[cpu];
+	return cpt;
+}
+EXPORT_SYMBOL(cfs_cpt_of_cpu);
+
+/**
+ * Bind the current thread to the CPUs and memory nodes of partition
+ * \a cpt, or of the whole table when \a cpt is CFS_CPT_ANY.
+ *
+ * \retval 0		bound, or no binding needed because the mask already
+ *			covers every online CPU
+ * \retval -EINVAL	the selected mask contains no online CPU
+ * \retval other	error returned by set_cpus_allowed_ptr()
+ */
+int
+cfs_cpt_bind(struct cfs_cpt_table *cptab, int cpt)
+{
+	cpumask_t	*cpumask;
+	nodemask_t	*nodemask;
+	int		rc;
+	int		i;
+
+	LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));
+
+	if (cpt == CFS_CPT_ANY) {
+		cpumask = cptab->ctb_cpumask;
+		nodemask = cptab->ctb_nodemask;
+	} else {
+		cpumask = cptab->ctb_parts[cpt].cpt_cpumask;
+		nodemask = cptab->ctb_parts[cpt].cpt_nodemask;
+	}
+
+	/* "no online CPU in mask" is reported as a value past the last valid
+	 * CPU id.  That value is nr_cpu_ids, not NR_CPUS, when cpumasks are
+	 * sized at runtime, so test with >= nr_cpu_ids to cover both. */
+	if (any_online_cpu(*cpumask) >= nr_cpu_ids) {
+		CERROR("No online CPU found in CPU partition %d, did someone "
+		       "do CPU hotplug on system? You might need to reload "
+		       "Lustre modules to keep system working well.\n", cpt);
+		return -EINVAL;
+	}
+
+	/* affinity only has to change if some online CPU is outside the
+	 * mask; the first such CPU found triggers the bind and return */
+	for_each_online_cpu(i) {
+		if (cpu_isset(i, *cpumask))
+			continue;
+
+		rc = set_cpus_allowed_ptr(current, cpumask);
+		set_mems_allowed(*nodemask);
+		if (rc == 0)
+			schedule(); /* switch to allowed CPU */
+
+		return rc;
+	}
+
+	/* don't need to set affinity because all online CPUs are covered */
+	return 0;
+}
+EXPORT_SYMBOL(cfs_cpt_bind);
+
+/**
+ * Choose max to \a number CPUs from \a node and set them in \a cpt.
+ * We always prefer to choose CPU in the same core/socket.
+ *
+ * Every CPU handed to cfs_cpt_set_cpu() is also cleared from \a node, so
+ * on return \a node holds only the CPUs that were not consumed.
+ *
+ * Returns a negative errno on failure (-EINVAL when cfs_cpt_set_cpu()
+ * fails, -ENOMEM when a scratch mask cannot be allocated).  On success the
+ * value is 0 on the "take the whole node" path and the last non-zero
+ * cfs_cpt_set_cpu() result otherwise, so callers should test for rc < 0.
+ */
+static int
+cfs_cpt_choose_ncpus(struct cfs_cpt_table *cptab, int cpt,
+		     cpumask_t *node, int number)
+{
+	cpumask_t	*socket = NULL;
+	cpumask_t	*core = NULL;
+	int		rc = 0;
+	int		cpu;
+
+	LASSERT(number > 0);
+
+	/* fast path: the request covers the whole node, take every CPU */
+	if (number >= cpus_weight(*node)) {
+		while (!cpus_empty(*node)) {
+			cpu = first_cpu(*node);
+
+			rc = cfs_cpt_set_cpu(cptab, cpt, cpu);
+			if (!rc)
+				return -EINVAL;
+			cpu_clear(cpu, *node);
+		}
+		return 0;
+	}
+
+	/* allocate scratch buffer */
+	LIBCFS_ALLOC(socket, cpumask_size());
+	LIBCFS_ALLOC(core, cpumask_size());
+	if (socket == NULL || core == NULL) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	/* outer loop: one socket per iteration, starting from the first
+	 * CPU still left in the node mask */
+	while (!cpus_empty(*node)) {
+		cpu = first_cpu(*node);
+
+		/* get cpumask for cores in the same socket */
+		cfs_cpu_core_siblings(cpu, socket);
+		cpus_and(*socket, *socket, *node);
+
+		LASSERT(!cpus_empty(*socket));
+
+		/* inner loop: one core (its HT siblings) per iteration */
+		while (!cpus_empty(*socket)) {
+			int     i;
+
+			/* get cpumask for hts in the same core */
+			cfs_cpu_ht_siblings(cpu, core);
+			cpus_and(*core, *core, *node);
+
+			LASSERT(!cpus_empty(*core));
+
+			for_each_cpu_mask(i, *core) {
+				/* consume the CPU from both working masks
+				 * before trying to add it */
+				cpu_clear(i, *socket);
+				cpu_clear(i, *node);
+
+				rc = cfs_cpt_set_cpu(cptab, cpt, i);
+				if (!rc) {
+					rc = -EINVAL;
+					goto out;
+				}
+
+				/* requested amount reached */
+				if (--number == 0)
+					goto out;
+			}
+			cpu = first_cpu(*socket);
+		}
+	}
+
+ out:
+	if (socket != NULL)
+		LIBCFS_FREE(socket, cpumask_size());
+	if (core != NULL)
+		LIBCFS_FREE(core, cpumask_size());
+	return rc;
+}
+
+#define CPT_WEIGHT_MIN  4u
+
+/*
+ * Estimate a default number of CPU partitions from the online CPU and
+ * node counts.  The result always divides the online CPU count evenly and
+ * is 1 when there are no more than CPT_WEIGHT_MIN CPUs.
+ */
+static unsigned int
+cfs_cpt_num_estimate(void)
+{
+	unsigned nnode = num_online_nodes();
+	unsigned ncpu  = num_online_cpus();
+	unsigned ncpt;
+
+	if (ncpu <= CPT_WEIGHT_MIN) {
+		ncpt = 1;
+		goto out;
+	}
+
+	/* generate reasonable number of CPU partitions based on total number
+	 * of CPUs, Preferred N should be power2 and match this condition:
+	 * 2 * (N - 1)^2 < NCPUS <= 2 * N^2 */
+	for (ncpt = 2; ncpu > 2 * ncpt * ncpt; ncpt <<= 1) {}
+
+	/* scale the node count by powers of two towards ncpt without
+	 * exceeding it, then use that scaled value as the partition count */
+	if (ncpt <= nnode) { /* fat numa system */
+		while (nnode > ncpt)
+			nnode >>= 1;
+
+	} else { /* ncpt > nnode */
+		while ((nnode << 1) <= ncpt)
+			nnode <<= 1;
+	}
+
+	ncpt = nnode;
+
+ out:
+#if (BITS_PER_LONG == 32)
+	/* config many CPU partitions on 32-bit system could consume
+	 * too much memory */
+	ncpt = min(2U, ncpt);
+#endif
+	/* step down until the count divides the online CPUs evenly */
+	while (ncpu % ncpt != 0)
+		ncpt--; /* worst case is 1 */
+
+	return ncpt;
+}
+
+/**
+ * Build a CPU partition table with \a ncpt partitions of equal size.
+ *
+ * A non-positive \a ncpt selects the value from cfs_cpt_num_estimate().
+ * Online nodes are walked in order and their CPUs are handed out through
+ * cfs_cpt_choose_ncpus() until each partition holds
+ * num_online_cpus() / ncpt CPUs.
+ *
+ * Returns the new table, or NULL when the online CPU count is not a
+ * multiple of \a ncpt, on allocation failure, or when the CPUs found do
+ * not add up to the expected layout.
+ */
+static struct cfs_cpt_table *
+cfs_cpt_table_create(int ncpt)
+{
+	struct cfs_cpt_table *cptab = NULL;
+	cpumask_t	*mask = NULL;
+	int		cpt = 0;
+	int		num;
+	int		rc;
+	int		i;
+
+	rc = cfs_cpt_num_estimate();
+	if (ncpt <= 0)
+		ncpt = rc;
+
+	if (ncpt > num_online_cpus() || ncpt > 4 * rc) {
+		CWARN("CPU partition number %d is larger than suggested "
+		      "value (%d), your system may have performance "
+		      "issue or run out of memory while under pressure\n",
+		      ncpt, rc);
+	}
+
+	if (num_online_cpus() % ncpt != 0) {
+		CERROR("CPU number %d is not multiple of cpu_npartition %d, "
+		       "please try different cpu_npartitions value or "
+		       "set pattern string by cpu_pattern=STRING\n",
+		       (int)num_online_cpus(), ncpt);
+		goto failed;
+	}
+
+	cptab = cfs_cpt_table_alloc(ncpt);
+	if (cptab == NULL) {
+		CERROR("Failed to allocate CPU map(%d)\n", ncpt);
+		goto failed;
+	}
+
+	/* number of CPUs each partition has to end up with */
+	num = num_online_cpus() / ncpt;
+	if (num == 0) {
+		CERROR("CPU changed while setting CPU partition\n");
+		goto failed;
+	}
+
+	LIBCFS_ALLOC(mask, cpumask_size());
+	if (mask == NULL) {
+		CERROR("Failed to allocate scratch cpumask\n");
+		goto failed;
+	}
+
+	for_each_online_node(i) {
+		cfs_node_to_cpumask(i, mask);
+
+		/* cfs_cpt_choose_ncpus() removes the CPUs it uses from mask,
+		 * so loop until this node has been fully handed out */
+		while (!cpus_empty(*mask)) {
+			struct cfs_cpu_partition *part;
+			int    n;
+
+			/* more CPUs than the partitions can take */
+			if (cpt >= ncpt)
+				goto failed;
+
+			part = &cptab->ctb_parts[cpt];
+
+			/* CPUs still missing in the current partition */
+			n = num - cpus_weight(*part->cpt_cpumask);
+			LASSERT(n > 0);
+
+			rc = cfs_cpt_choose_ncpus(cptab, cpt, mask, n);
+			if (rc < 0)
+				goto failed;
+
+			/* move on once the current partition is full */
+			LASSERT(num >= cpus_weight(*part->cpt_cpumask));
+			if (num == cpus_weight(*part->cpt_cpumask))
+				cpt++;
+		}
+	}
+
+	if (cpt != ncpt ||
+	    num != cpus_weight(*cptab->ctb_parts[ncpt - 1].cpt_cpumask)) {
+		CERROR("Expect %d(%d) CPU partitions but got %d(%d), "
+		       "CPU hotplug/unplug while setting?\n",
+		       cptab->ctb_nparts, num, cpt,
+		       cpus_weight(*cptab->ctb_parts[ncpt - 1].cpt_cpumask));
+		goto failed;
+	}
+
+	LIBCFS_FREE(mask, cpumask_size());
+
+	return cptab;
+
+ failed:
+	CERROR("Failed to setup CPU-partition-table with %d "
+	       "CPU-partitions, online HW nodes: %d, HW cpus: %d.\n",
+	       ncpt, num_online_nodes(), num_online_cpus());
+
+	if (mask != NULL)
+		LIBCFS_FREE(mask, cpumask_size());
+
+	if (cptab != NULL)
+		cfs_cpt_table_free(cptab);
+
+	return NULL;
+}
+
+/**
+ * Build a CPU partition table from a user supplied \a pattern string.
+ *
+ * As parsed below, the pattern is a sequence of "ID[range-list]" entries;
+ * a leading 'n' or 'N' makes the numbers in brackets HW node ids instead
+ * of CPU ids.  The number of '[' characters fixes the partition count and
+ * every partition id must appear exactly once.
+ *
+ * Returns the new table, or NULL on any parse or setup error.
+ */
+static struct cfs_cpt_table *
+cfs_cpt_table_create_pattern(char *pattern)
+{
+	struct cfs_cpt_table	*cptab;
+	char			*str	= pattern;
+	int			node	= 0;
+	int			high;
+	int			ncpt;
+	int			c;
+
+	/* the number of '[' gives the number of partitions */
+	for (ncpt = 0;; ncpt++) { /* quick scan bracket */
+		str = strchr(str, '[');
+		if (str == NULL)
+			break;
+		str++;
+	}
+
+	/* optional node-mode prefix */
+	str = cfs_trimwhite(pattern);
+	if (*str == 'n' || *str == 'N') {
+		pattern = str + 1;
+		node = 1;
+	}
+
+	if (ncpt == 0 ||
+	    (node && ncpt > num_online_nodes()) ||
+	    (!node && ncpt > num_online_cpus())) {
+		CERROR("Invalid pattern %s, or too many partitions %d\n",
+		       pattern, ncpt);
+		return NULL;
+	}
+
+	/* largest id accepted inside the brackets */
+	high = node ? MAX_NUMNODES - 1 : NR_CPUS - 1;
+
+	cptab = cfs_cpt_table_alloc(ncpt);
+	if (cptab == NULL) {
+		CERROR("Failed to allocate cpu partition table\n");
+		return NULL;
+	}
+
+	/* one "ID[range-list]" entry per iteration; c counts the entries */
+	for (str = cfs_trimwhite(pattern), c = 0;; c++) {
+		struct cfs_range_expr	*range;
+		struct cfs_expr_list	*el;
+		char			*bracket = strchr(str, '[');
+		int			cpt;
+		int			rc;
+		int			i;
+		int			n;
+
+		/* no more entries: only end of string is acceptable, and
+		 * the entry count must match the bracket count */
+		if (bracket == NULL) {
+			if (*str != 0) {
+				CERROR("Invalid pattern %s\n", str);
+				goto failed;
+			} else if (c != ncpt) {
+				CERROR("expect %d partitions but found %d\n",
+				       ncpt, c);
+				goto failed;
+			}
+			break;
+		}
+
+		/* partition id; n receives the number of characters read */
+		if (sscanf(str, "%u%n", &cpt, &n) < 1) {
+			CERROR("Invalid cpu pattern %s\n", str);
+			goto failed;
+		}
+
+		if (cpt < 0 || cpt >= ncpt) {
+			CERROR("Invalid partition id %d, total partitions %d\n",
+			       cpt, ncpt);
+			goto failed;
+		}
+
+		if (cfs_cpt_weight(cptab, cpt) != 0) {
+			CERROR("Partition %d has already been set.\n", cpt);
+			goto failed;
+		}
+
+		/* only white space may separate the id from its '[' */
+		str = cfs_trimwhite(str + n);
+		if (str != bracket) {
+			CERROR("Invalid pattern %s\n", str);
+			goto failed;
+		}
+
+		bracket = strchr(str, ']');
+		if (bracket == NULL) {
+			CERROR("missing right bracket for cpt %d, %s\n",
+			       cpt, str);
+			goto failed;
+		}
+
+		/* parse "[...]" including both brackets */
+		if (cfs_expr_list_parse(str, (bracket - str) + 1,
+					0, high, &el) != 0) {
+			CERROR("Can't parse number range: %s\n", str);
+			goto failed;
+		}
+
+		/* add every id selected by lo/hi/stride of each range */
+		list_for_each_entry(range, &el->el_exprs, re_link) {
+			for (i = range->re_lo; i <= range->re_hi; i++) {
+				if ((i - range->re_lo) % range->re_stride != 0)
+					continue;
+
+				rc = node ? cfs_cpt_set_node(cptab, cpt, i) :
+					    cfs_cpt_set_cpu(cptab, cpt, i);
+				if (!rc) {
+					cfs_expr_list_free(el);
+					goto failed;
+				}
+			}
+		}
+
+		cfs_expr_list_free(el);
+
+		if (!cfs_cpt_online(cptab, cpt)) {
+			CERROR("No online CPU is found on partition %d\n", cpt);
+			goto failed;
+		}
+
+		/* continue after the closing bracket */
+		str = cfs_trimwhite(bracket + 1);
+	}
+
+	return cptab;
+
+ failed:
+	cfs_cpt_table_free(cptab);
+	return NULL;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+/*
+ * CPU hotplug notifier callback.
+ *
+ * CPU online/dead events bump cpt_data.cpt_version under cpt_lock so that
+ * tables built from an older CPU layout can be detected.  Every event,
+ * including the ones that bump the version, is logged with a warning.
+ */
+static int
+cfs_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
+{
+	unsigned int  cpu = (unsigned long)hcpu;
+
+	switch (action) {
+	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
+	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
+		spin_lock(&cpt_data.cpt_lock);
+		cpt_data.cpt_version++;
+		spin_unlock(&cpt_data.cpt_lock);
+		/* fall through: the warning is printed for these too */
+	default:
+		CWARN("Lustre: can't support CPU hotplug well now, "
+		      "performance and stability could be impacted "
+		      "[CPU %u notify: %lx]\n", cpu, action);
+	}
+
+	return NOTIFY_OK;
+}
+
+/* notifier block that routes CPU hotplug events to cfs_cpu_notify() */
+static struct notifier_block cfs_cpu_notifier = {
+	.notifier_call	= cfs_cpu_notify,
+	.priority	= 0
+};
+
+#endif
+
+/*
+ * Tear down what cfs_cpu_init() set up: free the global partition table,
+ * unregister the hotplug notifier (when built with CONFIG_HOTPLUG_CPU) and
+ * free the shared scratch cpumask.  Also used as the error path of
+ * cfs_cpu_init(), hence the NULL checks.
+ */
+void
+cfs_cpu_fini(void)
+{
+	if (cfs_cpt_table != NULL)
+		cfs_cpt_table_free(cfs_cpt_table);
+
+#ifdef CONFIG_HOTPLUG_CPU
+	unregister_hotcpu_notifier(&cfs_cpu_notifier);
+#endif
+	if (cpt_data.cpt_cpumask != NULL)
+		LIBCFS_FREE(cpt_data.cpt_cpumask, cpumask_size());
+}
+
+/*
+ * Initialise the CPU partition state and create the global table
+ * cfs_cpt_table, from cpu_pattern when that string is non-empty and from
+ * cpu_npartitions otherwise.
+ *
+ * Returns 0 on success and -1 on any failure; on failure after the
+ * scratch mask was allocated everything is released via cfs_cpu_fini().
+ */
+int
+cfs_cpu_init(void)
+{
+	LASSERT(cfs_cpt_table == NULL);
+
+	memset(&cpt_data, 0, sizeof(cpt_data));
+
+	/* scratch cpumask shared by the node helpers, guarded by cpt_mutex */
+	LIBCFS_ALLOC(cpt_data.cpt_cpumask, cpumask_size());
+	if (cpt_data.cpt_cpumask == NULL) {
+		CERROR("Failed to allocate scratch buffer\n");
+		return -1;
+	}
+
+	spin_lock_init(&cpt_data.cpt_lock);
+	sema_init(&cpt_data.cpt_mutex, 1);
+
+#ifdef CONFIG_HOTPLUG_CPU
+	register_hotcpu_notifier(&cfs_cpu_notifier);
+#endif
+
+	if (*cpu_pattern != 0) {
+		cfs_cpt_table = cfs_cpt_table_create_pattern(cpu_pattern);
+		if (cfs_cpt_table == NULL) {
+			CERROR("Failed to create cptab from pattern %s\n",
+			       cpu_pattern);
+			goto failed;
+		}
+
+	} else {
+		cfs_cpt_table = cfs_cpt_table_create(cpu_npartitions);
+		if (cfs_cpt_table == NULL) {
+			CERROR("Failed to create ptable with npartitions %d\n",
+			       cpu_npartitions);
+			goto failed;
+		}
+	}
+
+	/* a version mismatch means the hotplug notifier fired while the
+	 * table was being built */
+	spin_lock(&cpt_data.cpt_lock);
+	if (cfs_cpt_table->ctb_version != cpt_data.cpt_version) {
+		spin_unlock(&cpt_data.cpt_lock);
+		CERROR("CPU hotplug/unplug during setup\n");
+		goto failed;
+	}
+	spin_unlock(&cpt_data.cpt_lock);
+
+	LCONSOLE(0, "HW CPU cores: %d, npartitions: %d\n",
+		 num_online_cpus(), cfs_cpt_number(cfs_cpt_table));
+	return 0;
+
+ failed:
+	cfs_cpu_fini();
+	return -1;
+}
+
+#endif

diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto-adler.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto-adler.c
new file mode 100644
index 0000000..20b2d61
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto-adler.c

@@ -0,0 +1,144 @@
+/* GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see http://www.gnu.org/licenses
+ *
+ * Please  visit http://www.xyratex.com/contact if you need additional
+ * information or have any questions.
+ *
+ * GPL HEADER END
+ */
+
+/*
+ * Copyright 2012 Xyratex Technology Limited
+ */
+
+/*
+ * This is crypto api shash wrappers to zlib_adler32.
+ */
+
+#include <linux/module.h>
+#include <linux/zutil.h>
+#include <crypto/internal/hash.h>
+
+
+#define CHKSUM_BLOCK_SIZE	1
+#define CHKSUM_DIGEST_SIZE	4
+
+
+/* Continue checksum \a cksum over \a len bytes at \a p via zlib_adler32(). */
+static u32 __adler32(u32 cksum, unsigned char const *p, size_t len)
+{
+	return zlib_adler32(cksum, p, len);
+}
+
+/* Transform constructor: start the per-tfm context (the seed) at 1. */
+static int adler32_cra_init(struct crypto_tfm *tfm)
+{
+	u32 *seed;
+
+	seed = crypto_tfm_ctx(tfm);
+	*seed = 1;
+
+	return 0;
+}
+
+/*
+ * Install a new seed for the transform.  The key must be exactly
+ * sizeof(u32) bytes; any other length flags CRYPTO_TFM_RES_BAD_KEY_LEN
+ * and fails with -EINVAL.
+ */
+static int adler32_setkey(struct crypto_shash *hash, const u8 *key,
+			  unsigned int keylen)
+{
+	u32 *seed = crypto_shash_ctx(hash);
+
+	if (keylen == sizeof(u32)) {
+		*seed = *(u32 *)key;
+		return 0;
+	}
+
+	crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN);
+	return -EINVAL;
+}
+
+/* Begin a new hash: copy the transform's seed into the descriptor state. */
+static int adler32_init(struct shash_desc *desc)
+{
+	u32 *seed = crypto_shash_ctx(desc->tfm);
+	u32 *state = shash_desc_ctx(desc);
+
+	*state = *seed;
+
+	return 0;
+}
+
+/* Fold \a len bytes of \a data into the running checksum of \a desc. */
+static int adler32_update(struct shash_desc *desc, const u8 *data,
+			  unsigned int len)
+{
+	u32 *state = shash_desc_ctx(desc);
+	u32 next = __adler32(*state, data, len);
+
+	*state = next;
+	return 0;
+}
+/*
+ * Checksum \a len bytes of \a data starting from *\a cksump and store the
+ * 32-bit result at \a out.  *\a cksump itself is not modified.
+ */
+static int __adler32_finup(u32 *cksump, const u8 *data, unsigned int len,
+			   u8 *out)
+{
+	*(u32 *)out = __adler32(*cksump, data, len);
+	return 0;
+}
+
+/* Final update: hash \a data on top of the descriptor state into \a out. */
+static int adler32_finup(struct shash_desc *desc, const u8 *data,
+			 unsigned int len, u8 *out)
+{
+	u32 *state = shash_desc_ctx(desc);
+
+	return __adler32_finup(state, data, len, out);
+}
+
+/* Write the checksum accumulated in \a desc to \a out. */
+static int adler32_final(struct shash_desc *desc, u8 *out)
+{
+	u32 *state = shash_desc_ctx(desc);
+	u32 *result = (u32 *)out;
+
+	*result = *state;
+	return 0;
+}
+
+/*
+ * One-shot digest: hash \a data starting from the transform's seed (not
+ * from the descriptor state) and store the result at \a out.
+ */
+static int adler32_digest(struct shash_desc *desc, const u8 *data,
+			  unsigned int len, u8 *out)
+{
+	return __adler32_finup(crypto_shash_ctx(desc->tfm), data, len,
+				    out);
+}
+/*
+ * shash algorithm descriptor for "adler32" (driver "adler32-zlib").
+ * Both the per-transform context and the per-request descriptor state
+ * are a single u32.
+ */
+static struct shash_alg alg = {
+	.setkey		= adler32_setkey,
+	.init		= adler32_init,
+	.update		= adler32_update,
+	.final		= adler32_final,
+	.finup		= adler32_finup,
+	.digest		= adler32_digest,
+	.descsize	= sizeof(u32),
+	.digestsize	= CHKSUM_DIGEST_SIZE,
+	.base		= {
+		.cra_name		= "adler32",
+		.cra_driver_name	= "adler32-zlib",
+		.cra_priority		= 100,
+		.cra_blocksize		= CHKSUM_BLOCK_SIZE,
+		.cra_ctxsize		= sizeof(u32),
+		.cra_module		= THIS_MODULE,
+		.cra_init		= adler32_cra_init,
+	}
+};
+
+
+/* Register the adler32 shash; returns crypto_register_shash()'s result. */
+int cfs_crypto_adler32_register(void)
+{
+	return crypto_register_shash(&alg);
+}
+EXPORT_SYMBOL(cfs_crypto_adler32_register);
+
+/* Unregister the adler32 shash registered by cfs_crypto_adler32_register(). */
+void cfs_crypto_adler32_unregister(void)
+{
+	crypto_unregister_shash(&alg);
+}
+EXPORT_SYMBOL(cfs_crypto_adler32_unregister);

diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto.c
new file mode 100644
index 0000000..8e35777
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto.c

@@ -0,0 +1,289 @@
+/* GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see http://www.gnu.org/licenses
+ *
+ * Please  visit http://www.xyratex.com/contact if you need additional
+ * information or have any questions.
+ *
+ * GPL HEADER END
+ */
+
+/*
+ * Copyright 2012 Xyratex Technology Limited
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+
+#include <linux/crypto.h>
+#include <linux/scatterlist.h>
+#include <linux/libcfs/libcfs.h>
+#include <linux/libcfs/linux/linux-crypto.h>
+/**
+ *  Array of  hash algorithm speed in MByte per second
+ */
+static int cfs_crypto_hash_speeds[CFS_HASH_ALG_MAX];
+
+
+
+/**
+ * Allocate and initialise a hash transform for algorithm \a alg_id.
+ *
+ * \param[in]  alg_id	libcfs hash algorithm id
+ * \param[out] type	set to the algorithm's type descriptor
+ * \param[out] desc	hash descriptor; desc->tfm and desc->flags are set
+ * \param[in]  key	optional key; when NULL the algorithm's default key
+ *			(cht_key) is used if it is non-zero
+ * \param[in]  key_len	length of \a key in bytes
+ *
+ * \retval 0		success, desc->tfm is owned by the caller
+ * \retval negative	error; no transform is left allocated
+ */
+static int cfs_crypto_hash_alloc(unsigned char alg_id,
+				 const struct cfs_crypto_hash_type **type,
+				 struct hash_desc *desc, unsigned char *key,
+				 unsigned int key_len)
+{
+	int     err = 0;
+
+	*type = cfs_crypto_hash_type(alg_id);
+
+	if (*type == NULL) {
+		CWARN("Unsupported hash algorithm id = %d, max id is %d\n",
+		      alg_id, CFS_HASH_ALG_MAX);
+		return -EINVAL;
+	}
+	desc->tfm = crypto_alloc_hash((*type)->cht_name, 0, 0);
+
+	if (desc->tfm == NULL)
+		return -EINVAL;
+
+	if (IS_ERR(desc->tfm)) {
+		CDEBUG(D_INFO, "Failed to alloc crypto hash %s\n",
+		       (*type)->cht_name);
+		return PTR_ERR(desc->tfm);
+	}
+
+	desc->flags = 0;
+
+	/** Shash have different logic for initialization than digest
+	 * shash: crypto_hash_setkey, crypto_hash_init
+	 * digest: crypto_digest_init, crypto_digest_setkey
+	 * Skip this function for digest, because we use shash logic at
+	 * cfs_crypto_hash_alloc.
+	 */
+	if (key != NULL) {
+		err = crypto_hash_setkey(desc->tfm, key, key_len);
+	} else if ((*type)->cht_key != 0) {
+		err = crypto_hash_setkey(desc->tfm,
+					 (unsigned char *)&((*type)->cht_key),
+					 (*type)->cht_size);
+	}
+
+	if (err != 0) {
+		crypto_free_hash(desc->tfm);
+		return err;
+	}
+
+	CDEBUG(D_INFO, "Using crypto hash: %s (%s) speed %d MB/s\n",
+	       (crypto_hash_tfm(desc->tfm))->__crt_alg->cra_name,
+	       (crypto_hash_tfm(desc->tfm))->__crt_alg->cra_driver_name,
+	       cfs_crypto_hash_speeds[alg_id]);
+
+	/* callers treat any error as "nothing was allocated", so the
+	 * transform must not outlive a failed init */
+	err = crypto_hash_init(desc);
+	if (err != 0)
+		crypto_free_hash(desc->tfm);
+
+	return err;
+}
+
+/**
+ * Compute the hash of one contiguous buffer in a single call.
+ *
+ * \param[in]     alg_id	libcfs hash algorithm id
+ * \param[in]     buf		data to hash, must not be NULL
+ * \param[in]     buf_len	length of \a buf, must not be 0
+ * \param[in]     key		optional key, see cfs_crypto_hash_alloc()
+ * \param[in]     key_len	length of \a key
+ * \param[out]    hash		receives the digest
+ * \param[in,out] hash_len	size of \a hash; set to the required size
+ *				when -ENOSPC is returned
+ *
+ * \retval 0		success
+ * \retval -EINVAL	bad \a buf, \a buf_len or \a hash_len
+ * \retval -ENOSPC	\a hash is NULL or smaller than the digest size
+ * \retval other	error from allocation or from crypto_hash_digest()
+ */
+int cfs_crypto_hash_digest(unsigned char alg_id,
+			   const void *buf, unsigned int buf_len,
+			   unsigned char *key, unsigned int key_len,
+			   unsigned char *hash, unsigned int *hash_len)
+{
+	struct scatterlist	sl;
+	struct hash_desc	hdesc;
+	int			err;
+	const struct cfs_crypto_hash_type	*type;
+
+	if (buf == NULL || buf_len == 0 || hash_len == NULL)
+		return -EINVAL;
+
+	err = cfs_crypto_hash_alloc(alg_id, &type, &hdesc, key, key_len);
+	if (err != 0)
+		return err;
+
+	/* report the size needed and release the transform again */
+	if (hash == NULL || *hash_len < type->cht_size) {
+		*hash_len = type->cht_size;
+		crypto_free_hash(hdesc.tfm);
+		return -ENOSPC;
+	}
+	sg_init_one(&sl, (void *)buf, buf_len);
+
+	hdesc.flags = 0;
+	err = crypto_hash_digest(&hdesc, &sl, sl.length, hash);
+	crypto_free_hash(hdesc.tfm);
+
+	return err;
+}
+EXPORT_SYMBOL(cfs_crypto_hash_digest);
+
+/**
+ * Allocate a hash descriptor for incremental hashing with \a alg_id.
+ *
+ * Returns the descriptor (really a struct hash_desc) on success, or an
+ * ERR_PTR() carrying -ENOMEM or the cfs_crypto_hash_alloc() error.
+ */
+struct cfs_crypto_hash_desc *
+	cfs_crypto_hash_init(unsigned char alg_id,
+			     unsigned char *key, unsigned int key_len)
+{
+	const struct cfs_crypto_hash_type *hash_type;
+	struct hash_desc *desc;
+	int rc;
+
+	desc = kmalloc(sizeof(*desc), 0);
+	if (desc == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	rc = cfs_crypto_hash_alloc(alg_id, &hash_type, desc, key, key_len);
+	if (rc == 0)
+		return (struct cfs_crypto_hash_desc *)desc;
+
+	kfree(desc);
+	return ERR_PTR(rc);
+}
+EXPORT_SYMBOL(cfs_crypto_hash_init);
+
+/*
+ * Feed \a len bytes of \a page into the running hash \a hdesc, starting
+ * at \a offset masked with ~CFS_PAGE_MASK.
+ * Returns the result of crypto_hash_update().
+ */
+int cfs_crypto_hash_update_page(struct cfs_crypto_hash_desc *hdesc,
+				struct page *page, unsigned int offset,
+				unsigned int len)
+{
+	struct scatterlist sl;
+
+	sg_init_table(&sl, 1);
+	sg_set_page(&sl, page, len, offset & ~CFS_PAGE_MASK);
+
+	return crypto_hash_update((struct hash_desc *)hdesc, &sl, sl.length);
+}
+EXPORT_SYMBOL(cfs_crypto_hash_update_page);
+
+/*
+ * Feed \a buf_len bytes at \a buf into the running hash \a hdesc.
+ * Returns the result of crypto_hash_update().
+ */
+int cfs_crypto_hash_update(struct cfs_crypto_hash_desc *hdesc,
+			   const void *buf, unsigned int buf_len)
+{
+	struct scatterlist sl;
+
+	sg_init_one(&sl, (void *)buf, buf_len);
+
+	return crypto_hash_update((struct hash_desc *)hdesc, &sl, sl.length);
+}
+EXPORT_SYMBOL(cfs_crypto_hash_update);
+
+/*
+ * Finish an incremental hash and release the descriptor.
+ *
+ * If hash_len pointer is NULL - destroy descriptor.
+ *
+ * Returns 0 (or the crypto_hash_final() result) with the descriptor
+ * freed.  On -ENOSPC (hash is NULL or *hash_len too small; *hash_len is
+ * set to the digest size) and on a negative crypto_hash_final() result
+ * the descriptor is NOT freed, so the caller can call again.
+ */
+int cfs_crypto_hash_final(struct cfs_crypto_hash_desc *hdesc,
+			  unsigned char *hash, unsigned int *hash_len)
+{
+	int     err;
+	int     size = crypto_hash_digestsize(((struct hash_desc *)hdesc)->tfm);
+
+	if (hash_len == NULL) {
+		crypto_free_hash(((struct hash_desc *)hdesc)->tfm);
+		kfree(hdesc);
+		return 0;
+	}
+	if (hash == NULL || *hash_len < size) {
+		*hash_len = size;
+		return -ENOSPC;
+	}
+	err = crypto_hash_final((struct hash_desc *) hdesc, hash);
+
+	if (err < 0) {
+		/* May be caller can fix error */
+		return err;
+	}
+	crypto_free_hash(((struct hash_desc *)hdesc)->tfm);
+	kfree(hdesc);
+	return err;
+}
+EXPORT_SYMBOL(cfs_crypto_hash_final);
+
+/*
+ * Measure the throughput of hash algorithm \a alg_id by digesting \a buf
+ * repeatedly for about one second, and record the result in MB/s in
+ * cfs_crypto_hash_speeds[alg_id] (-1 if the algorithm fails).
+ */
+static void cfs_crypto_performance_test(unsigned char alg_id,
+					const unsigned char *buf,
+					unsigned int buf_len)
+{
+	unsigned long		   start, end;
+	int			     bcount, err = 0;
+	int			     sec = 1; /* do test only 1 sec */
+	unsigned char		   hash[64];
+	unsigned int		    hash_len = 64;
+
+	for (start = jiffies, end = start + sec * HZ, bcount = 0;
+	     time_before(jiffies, end); bcount++) {
+		err = cfs_crypto_hash_digest(alg_id, buf, buf_len, NULL, 0,
+					     hash, &hash_len);
+		if (err)
+			break;
+
+	}
+	end = jiffies;
+
+	if (err) {
+		cfs_crypto_hash_speeds[alg_id] =  -1;
+		CDEBUG(D_INFO, "Crypto hash algorithm %s, err = %d\n",
+		       cfs_crypto_hash_name(alg_id), err);
+	} else {
+		unsigned long   tmp;
+
+		/* widen before multiplying: the byte count passes 4GB for
+		 * fast algorithms and would wrap in 32-bit arithmetic */
+		tmp = (((unsigned long)bcount * buf_len /
+			jiffies_to_msecs(end - start)) * 1000) / (1024 * 1024);
+		cfs_crypto_hash_speeds[alg_id] = (int)tmp;
+	}
+	CDEBUG(D_INFO, "Crypto hash algorithm %s speed = %d MB/s\n",
+	       cfs_crypto_hash_name(alg_id), cfs_crypto_hash_speeds[alg_id]);
+}
+
+/*
+ * Return the recorded speed in MB/s for \a hash_alg, or -1 when the id
+ * is not below CFS_HASH_ALG_MAX.
+ */
+int cfs_crypto_hash_speed(unsigned char hash_alg)
+{
+	if (hash_alg >= CFS_HASH_ALG_MAX)
+		return -1;
+
+	return cfs_crypto_hash_speeds[hash_alg];
+}
+EXPORT_SYMBOL(cfs_crypto_hash_speed);
+
+/**
+ * Run the performance test for every hash algorithm id below
+ * CFS_HASH_ALG_MAX, using a 128K buffer filled with a repeating byte
+ * pattern.  Returns 0, or -ENOMEM if the buffer cannot be allocated.
+ */
+static int cfs_crypto_test_hashes(void)
+{
+	/* Data block size for testing hash. Maximum
+	 * kmalloc size for 2.6.18 kernel is 128K */
+	unsigned int	len = 1 * 128 * 1024;
+	unsigned char	*sample;
+	unsigned char	alg;
+	unsigned int	pos;
+
+	sample = kmalloc(len, 0);
+	if (sample == NULL) {
+		CERROR("Failed to allocate mem\n");
+		return -ENOMEM;
+	}
+
+	for (pos = 0; pos < len; pos++)
+		sample[pos] = pos & 0xff;
+
+	for (alg = 0; alg < CFS_HASH_ALG_MAX; alg++)
+		cfs_crypto_performance_test(alg, sample, len);
+
+	kfree(sample);
+	return 0;
+}
+
+static int adler32;
+
+/*
+ * Register the adler32 algorithm and benchmark all hash algorithms.
+ * The registration result is only remembered in 'adler32' for
+ * cfs_crypto_unregister(); this function always returns 0.
+ */
+int cfs_crypto_register(void)
+{
+	adler32 = cfs_crypto_adler32_register();
+
+	/* check all algorithms and do performance test */
+	cfs_crypto_test_hashes();
+	return 0;
+}
+/* Unregister adler32, but only if its registration had succeeded. */
+void cfs_crypto_unregister(void)
+{
+	if (adler32 != 0)
+		return;
+
+	cfs_crypto_adler32_unregister();
+}

diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-curproc.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-curproc.c
new file mode 100644
index 0000000..f236510
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/linux/linux-curproc.c

@@ -0,0 +1,339 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/libcfs/linux/linux-curproc.c
+ *
+ * Lustre curproc API implementation for Linux kernel
+ *
+ * Author: Nikita Danilov <nikita@clusterfs.com>
+ */
+
+#include <linux/sched.h>
+#include <linux/fs_struct.h>
+
+#include <linux/compat.h>
+#include <linux/thread_info.h>
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/libcfs/libcfs.h>
+
+/*
+ * Implementation of cfs_curproc API (see portals/include/libcfs/curproc.h)
+ * for Linux kernel.
+ */
+
+/* Return the number of supplementary groups of the current task. */
+int    cfs_curproc_groups_nr(void)
+{
+	int nr;
+
+	task_lock(current);
+	nr = current_cred()->group_info->ngroups;
+	task_unlock(current);
+	return nr;
+}
+
+/*
+ * Copy up to \a size supplementary group ids of the current task into
+ * \a array.  The count is clamped to the task's ngroups.
+ *
+ * NOTE(review): the copy reads only from group_info->blocks[0]; if the
+ * group list can span more than one block, a large \a size would read
+ * past the first block - confirm against the group_info layout.
+ * NOTE(review): a negative \a size is not rejected before the memcpy().
+ */
+void   cfs_curproc_groups_dump(gid_t *array, int size)
+{
+	task_lock(current);
+	size = min_t(int, size, current_cred()->group_info->ngroups);
+	memcpy(array, current_cred()->group_info->blocks[0], size * sizeof(__u32));
+	task_unlock(current);
+}
+
+
+/* Thin wrapper: returns in_group_p(gid) for the current task. */
+int    current_is_in_group(gid_t gid)
+{
+	return in_group_p(gid);
+}
+
+/* Currently all the CFS_CAP_* defines match CAP_* ones. */
+#define cfs_cap_pack(cap) (cap)
+#define cfs_cap_unpack(cap) (cap)
+
+/*
+ * Raise capability \a cap in the effective set of the current task.
+ * Silently does nothing when prepare_creds() returns NULL.
+ */
+void cfs_cap_raise(cfs_cap_t cap)
+{
+	struct cred *new_cred = prepare_creds();
+
+	if (new_cred == NULL)
+		return;
+
+	cap_raise(new_cred->cap_effective, cfs_cap_unpack(cap));
+	commit_creds(new_cred);
+}
+
+/*
+ * Lower capability \a cap in the effective set of the current task.
+ * Silently does nothing when prepare_creds() returns NULL.
+ */
+void cfs_cap_lower(cfs_cap_t cap)
+{
+	struct cred *new_cred = prepare_creds();
+
+	if (new_cred == NULL)
+		return;
+
+	cap_lower(new_cred->cap_effective, cfs_cap_unpack(cap));
+	commit_creds(new_cred);
+}
+
+/* Return non-zero if \a cap is set in current_cap(). */
+int cfs_cap_raised(cfs_cap_t cap)
+{
+	return cap_raised(current_cap(), cfs_cap_unpack(cap));
+}
+
+/*
+ * Convert kernel capability set \a kcap into the libcfs representation
+ * at \a cap.  The branch is chosen at build time from the capability
+ * version macros; the build fails if none of the known versions match.
+ */
+void cfs_kernel_cap_pack(kernel_cap_t kcap, cfs_cap_t *cap)
+{
+#if defined (_LINUX_CAPABILITY_VERSION) && _LINUX_CAPABILITY_VERSION == 0x19980330
+	*cap = cfs_cap_pack(kcap);
+#elif defined (_LINUX_CAPABILITY_VERSION) && _LINUX_CAPABILITY_VERSION == 0x20071026
+	*cap = cfs_cap_pack(kcap[0]);
+#elif defined(_KERNEL_CAPABILITY_VERSION) && _KERNEL_CAPABILITY_VERSION == 0x20080522
+	/* XXX lost high byte */
+	/* only kcap.cap[0] is packed; any further words are dropped */
+	*cap = cfs_cap_pack(kcap.cap[0]);
+#else
+	#error "need correct _KERNEL_CAPABILITY_VERSION "
+#endif
+}
+
+/*
+ * Store libcfs capability value \a cap into kernel capability set
+ * \a kcap.  The branch is chosen at build time from the capability
+ * version macros; in the array/struct layouts only element 0 is written.
+ */
+void cfs_kernel_cap_unpack(kernel_cap_t *kcap, cfs_cap_t cap)
+{
+#if defined (_LINUX_CAPABILITY_VERSION) && _LINUX_CAPABILITY_VERSION == 0x19980330
+	*kcap = cfs_cap_unpack(cap);
+#elif defined (_LINUX_CAPABILITY_VERSION) && _LINUX_CAPABILITY_VERSION == 0x20071026
+	(*kcap)[0] = cfs_cap_unpack(cap);
+#elif defined(_KERNEL_CAPABILITY_VERSION) && _KERNEL_CAPABILITY_VERSION == 0x20080522
+	kcap->cap[0] = cfs_cap_unpack(cap);
+#else
+	#error "need correct _KERNEL_CAPABILITY_VERSION "
+#endif
+}
+
+/* Return current_cap() of the current task in libcfs representation. */
+cfs_cap_t cfs_curproc_cap_pack(void)
+{
+	cfs_cap_t packed;
+
+	cfs_kernel_cap_pack(current_cap(), &packed);
+
+	return packed;
+}
+
+/*
+ * Install \a cap as the effective capability set of the current task.
+ * Silently does nothing when prepare_creds() returns NULL.
+ */
+void cfs_curproc_cap_unpack(cfs_cap_t cap)
+{
+	struct cred *new_cred = prepare_creds();
+
+	if (new_cred == NULL)
+		return;
+
+	cfs_kernel_cap_unpack(&new_cred->cap_effective, cap);
+	commit_creds(new_cred);
+}
+
+/* Thin wrapper: returns capable() for the unpacked capability \a cap. */
+int cfs_capable(cfs_cap_t cap)
+{
+	return capable(cfs_cap_unpack(cap));
+}
+
+/* Check if task is running in 32-bit API mode, for the purpose of
+ * userspace binary interfaces.  On 32-bit Linux this is (unfortunately)
+ * always true, even if the application is using LARGEFILE64 and 64-bit
+ * APIs, because Linux provides no way for the filesystem to know if it
+ * is called via 32-bit or 64-bit APIs.  Other clients may vary.  On
+ * 64-bit systems, this will only be true if the binary is calling a
+ * 32-bit system call.
+ *
+ * Implemented as a direct call to is_compat_task(). */
+int current_is_32bit(void)
+{
+	return is_compat_task();
+}
+
+/*
+ * Copy \a len bytes between \a buf and address \a addr in the address
+ * space of \a tsk, one page at a time.  A non-zero \a write copies from
+ * \a buf into the task, zero copies from the task into \a buf.
+ *
+ * Returns the number of bytes actually transferred, which is short (or
+ * 0) when the task has no mm or a page cannot be obtained.
+ */
+static int cfs_access_process_vm(struct task_struct *tsk, unsigned long addr,
+				 void *buf, int len, int write)
+{
+	/* Just copied from kernel for the kernels which don't
+	 * have access_process_vm() exported */
+	struct mm_struct *mm;
+	struct vm_area_struct *vma;
+	struct page *page;
+	void *old_buf = buf;
+
+	mm = get_task_mm(tsk);
+	if (!mm)
+		return 0;
+
+	down_read(&mm->mmap_sem);
+	/* ignore errors, just check how much was successfully transferred */
+	while (len) {
+		int bytes, rc, offset;
+		void *maddr;
+
+		rc = get_user_pages(tsk, mm, addr, 1,
+				     write, 1, &page, &vma);
+		if (rc <= 0)
+			break;
+
+		/* limit this round to what is left of the current page */
+		bytes = len;
+		offset = addr & (PAGE_SIZE-1);
+		if (bytes > PAGE_SIZE-offset)
+			bytes = PAGE_SIZE-offset;
+
+		maddr = kmap(page);
+		if (write) {
+			copy_to_user_page(vma, page, addr,
+					  maddr + offset, buf, bytes);
+			set_page_dirty_lock(page);
+		} else {
+			copy_from_user_page(vma, page, addr,
+					    buf, maddr + offset, bytes);
+		}
+		kunmap(page);
+		page_cache_release(page);
+		len -= bytes;
+		buf += bytes;
+		addr += bytes;
+	}
+	up_read(&mm->mmap_sem);
+	mmput(mm);
+
+	/* buf was advanced by every completed chunk */
+	return buf - old_buf;
+}
+
+/**
+ * Read the environment variable of current process specified by @key.
+ *
+ * \param[in]     key		name of the variable, without the '='
+ * \param[out]    value		receives the value bytes (no terminating
+ *				'\0' is written)
+ * \param[in,out] val_len	size of \a value on entry; length of the
+ *				value on success
+ *
+ * \retval 0		found, \a value and \a val_len are filled in
+ * \retval -ENOMEM	scan buffer could not be allocated
+ * \retval -EINVAL	current task has no mm, or an entry is longer than
+ *			the scan buffer
+ * \retval -EDEADLK	mmap_sem could not be taken for reading right now
+ * \retval -EOVERFLOW	the value does not fit in \a value
+ * \retval -ENOENT	variable not found (or the environment could not be
+ *			read completely)
+ */
+int cfs_get_environ(const char *key, char *value, int *val_len)
+{
+	struct mm_struct *mm;
+	char *buffer;
+	int buf_len = PAGE_CACHE_SIZE;
+	int key_len = strlen(key);
+	unsigned long addr;
+	int rc;
+	ENTRY;
+
+	buffer = kmalloc(buf_len, GFP_USER);
+	if (!buffer)
+		RETURN(-ENOMEM);
+
+	mm = get_task_mm(current);
+	if (!mm) {
+		kfree(buffer);
+		RETURN(-EINVAL);
+	}
+
+	/* Avoid deadlocks on mmap_sem if called from sys_mmap_pgoff(),
+	 * which is already holding mmap_sem for writes.  If some other
+	 * thread gets the write lock in the meantime, this thread will
+	 * block, but at least it won't deadlock on itself.  LU-1735 */
+	if (down_read_trylock(&mm->mmap_sem) == 0) {
+		/* leave through 'out' so that buffer and the mm
+		 * reference are released */
+		GOTO(out, rc = -EDEADLK);
+	}
+	up_read(&mm->mmap_sem);
+
+	addr = mm->env_start;
+	while (addr < mm->env_end) {
+		int this_len, retval, scan_len;
+		char *env_start, *env_end;
+
+		memset(buffer, 0, buf_len);
+
+		this_len = min_t(int, mm->env_end - addr, buf_len);
+		retval = cfs_access_process_vm(current, addr, buffer,
+					       this_len, 0);
+		if (retval != this_len)
+			break;
+
+		addr += retval;
+
+		/* Parse the buffer to find out the specified key/value pair.
+		 * The "key=value" entries are separated by '\0'. */
+		env_start = buffer;
+		scan_len = this_len;
+		while (scan_len) {
+			char *entry;
+			int entry_len;
+
+			env_end = memscan(env_start, '\0', scan_len);
+			LASSERT(env_end >= env_start &&
+				env_end <= env_start + scan_len);
+
+			/* The last entry of this buffer cross the buffer
+			 * boundary, reread it in next cycle. */
+			if (unlikely(env_end - env_start == scan_len)) {
+				/* This entry is too large to fit in buffer */
+				if (unlikely(scan_len == this_len)) {
+					CERROR("Too long env variable.\n");
+					GOTO(out, rc = -EINVAL);
+				}
+				addr -= scan_len;
+				break;
+			}
+
+			entry = env_start;
+			entry_len = env_end - env_start;
+
+			/* Key length + length of '='.  The '=' must follow
+			 * the key directly, otherwise a key that is only a
+			 * prefix of a longer name would match as well. */
+			if (entry_len > key_len + 1 &&
+			    entry[key_len] == '=' &&
+			    !memcmp(entry, key, key_len)) {
+				entry += key_len + 1;
+				entry_len -= key_len + 1;
+				/* The 'value' buffer passed in is too small.*/
+				if (entry_len >= *val_len)
+					GOTO(out, rc = -EOVERFLOW);
+
+				memcpy(value, entry, entry_len);
+				*val_len = entry_len;
+				GOTO(out, rc = 0);
+			}
+
+			scan_len -= (env_end - env_start + 1);
+			env_start = env_end + 1;
+		}
+	}
+	GOTO(out, rc = -ENOENT);
+
+out:
+	mmput(mm);
+	kfree(buffer);
+	return rc;
+}
+EXPORT_SYMBOL(cfs_get_environ);
+
+EXPORT_SYMBOL(cfs_curproc_groups_nr);
+EXPORT_SYMBOL(cfs_curproc_groups_dump);
+EXPORT_SYMBOL(current_is_in_group);
+EXPORT_SYMBOL(cfs_cap_raise);
+EXPORT_SYMBOL(cfs_cap_lower);
+EXPORT_SYMBOL(cfs_cap_raised);
+EXPORT_SYMBOL(cfs_curproc_cap_pack);
+EXPORT_SYMBOL(cfs_curproc_cap_unpack);
+EXPORT_SYMBOL(cfs_capable);
+EXPORT_SYMBOL(current_is_32bit);
+
+/*
+ * Local variables:
+ * c-indentation-style: "K&R"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * fill-column: 80
+ * scroll-step: 1
+ * End:
+ */

diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-debug.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-debug.c
new file mode 100644
index 0000000..e2c195b
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/linux/linux-debug.c

@@ -0,0 +1,264 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/libcfs/linux/linux-debug.c
+ *
+ * Author: Phil Schwan <phil@clusterfs.com>
+ */
+
+#include <linux/module.h>
+#include <linux/kmod.h>
+#include <linux/notifier.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <linux/interrupt.h>
+#include <asm/uaccess.h>
+#include <linux/completion.h>
+
+#include <linux/fs.h>
+#include <linux/stat.h>
+#include <asm/uaccess.h>
+#include <linux/miscdevice.h>
+#include <linux/version.h>
+
+# define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/libcfs/linux/portals_compat25.h>
+
+#include "tracefile.h"
+
+#include <linux/kallsyms.h>
+
+char lnet_upcall[1024] = "/usr/lib/lustre/lnet_upcall";	/* argv[0] used by libcfs_run_upcall() */
+char lnet_debug_log_upcall[1024] = "/usr/lib/lustre/lnet_debug_log_upcall";	/* argv[0] used by libcfs_run_debug_log_upcall() */
+
+/**
+ * Run the lnet_debug_log_upcall helper once a Lustre log has been dumped.
+ *
+ * \param file  path of the dumped log, passed as the helper's only argument
+ */
+void libcfs_run_debug_log_upcall(char *file)
+{
+	char *argv[3];
+	int   rc;
+	char *envp[] = {
+		"HOME=/",
+		"PATH=/sbin:/bin:/usr/sbin:/usr/bin",
+		NULL};
+	ENTRY;
+
+	argv[0] = lnet_debug_log_upcall;
+
+	LASSERTF(file != NULL, "called on a null filename\n");
+	argv[1] = file; /* only the path of the file needs to be passed */
+
+	argv[2] = NULL;
+
+	rc = USERMODEHELPER(argv[0], argv, envp);
+	if (rc < 0 && rc != -ENOENT) {	/* -ENOENT is not reported as an error */
+		CERROR("Error %d invoking LNET debug log upcall %s %s; "
+		       "check /proc/sys/lnet/debug_log_upcall\n",
+		       rc, argv[0], argv[1]);
+	} else {
+		CDEBUG(D_HA, "Invoked LNET debug log upcall %s %s\n",
+		       argv[0], argv[1]);
+	}
+
+	EXIT;
+}
+
+void libcfs_run_upcall(char **argv)	/* argv: NULL-terminated; slot 0 is overwritten here */
+{
+	int   rc;
+	int   argc;
+	char *envp[] = {
+		"HOME=/",
+		"PATH=/sbin:/bin:/usr/sbin:/usr/bin",
+		NULL};
+	ENTRY;
+
+	argv[0] = lnet_upcall;	/* the helper path always comes from lnet_upcall */
+	argc = 1;
+	while (argv[argc] != NULL)	/* count entries up to the NULL terminator */
+		argc++;
+
+	LASSERT(argc >= 2);	/* at least one caller-supplied argument is required */
+
+	rc = USERMODEHELPER(argv[0], argv, envp);
+	if (rc < 0 && rc != -ENOENT) {	/* -ENOENT is not reported as an error */
+		CERROR("Error %d invoking LNET upcall %s %s%s%s%s%s%s%s%s; "
+		       "check /proc/sys/lnet/upcall\n",
+		       rc, argv[0], argv[1],
+		       argc < 3 ? "" : ",", argc < 3 ? "" : argv[2],
+		       argc < 4 ? "" : ",", argc < 4 ? "" : argv[3],
+		       argc < 5 ? "" : ",", argc < 5 ? "" : argv[4],
+		       argc < 6 ? "" : ",...");	/* only argv[1..4] are printed */
+	} else {
+		CDEBUG(D_HA, "Invoked LNET upcall %s %s%s%s%s%s%s%s%s\n",
+		       argv[0], argv[1],
+		       argc < 3 ? "" : ",", argc < 3 ? "" : argv[2],
+		       argc < 4 ? "" : ",", argc < 4 ? "" : argv[3],
+		       argc < 5 ? "" : ",", argc < 5 ? "" : argv[4],
+		       argc < 6 ? "" : ",...");	/* only argv[1..4] are printed */
+	}
+}
+
+void libcfs_run_lbug_upcall(struct libcfs_debug_msg_data *msgdata)
+{
+	char *argv[6];	/* slot 0 is left for libcfs_run_upcall() to fill in */
+	char buf[32];	/* decimal text of msgdata->msg_line */
+
+	ENTRY;
+	snprintf (buf, sizeof buf, "%d", msgdata->msg_line);
+
+	argv[1] = "LBUG";
+	argv[2] = (char *)msgdata->msg_file;
+	argv[3] = (char *)msgdata->msg_fn;
+	argv[4] = buf;
+	argv[5] = NULL;	/* terminator expected by libcfs_run_upcall() */
+
+	libcfs_run_upcall (argv);
+}
+
+/* coverity[+kill] */
+void lbug_with_loc(struct libcfs_debug_msg_data *msgdata)	/* never returns */
+{
+	libcfs_catastrophe = 1;	/* sticky flag: an LBUG has happened */
+	libcfs_debug_msg(msgdata, "LBUG\n");
+
+	if (in_interrupt()) {
+		panic("LBUG in interrupt.\n");
+		/* not reached */
+	}
+
+	libcfs_debug_dumpstack(NULL);	/* NULL selects the current task */
+	if (!libcfs_panic_on_lbug)	/* the log is dumped only when we will not panic */
+		libcfs_debug_dumplog();
+	libcfs_run_lbug_upcall(msgdata);
+	if (libcfs_panic_on_lbug)
+		panic("LBUG");
+	set_task_state(current, TASK_UNINTERRUPTIBLE);	/* no panic: park this thread */
+	while (1)	/* loop forever so control never goes back to the caller */
+		schedule();
+}
+
+
+#include <linux/nmi.h>
+#include <asm/stacktrace.h>
+
+
+static int print_trace_stack(void *data, char *name)	/* .stack hook of print_trace_ops */
+{
+	printk(" <%s> ", name);	/* data is unused */
+	return 0;
+}
+
+# define RELIABLE reliable
+# define DUMP_TRACE_CONST const
+static void print_trace_address(void *data, unsigned long addr, int reliable)
+{
+	char fmt[32];	/* holds the per-address format; its only remaining %s is for the symbol */
+	touch_nmi_watchdog();	/* keep the NMI watchdog quiet while printing */
+	snprintf(fmt, sizeof(fmt), " [<%016lx>] %s%%s\n", addr, RELIABLE ? "": "? ");	/* bounded: never writes past fmt */
+	__print_symbol(fmt, addr);	/* "? " prefix marks an unreliable frame */
+}
+
+static DUMP_TRACE_CONST struct stacktrace_ops print_trace_ops = {	/* passed to dump_trace() */
+	.stack = print_trace_stack,	/* prints each stack's name */
+	.address = print_trace_address,	/* prints one address per line */
+	.walk_stack = print_context_stack,
+};
+
+void libcfs_debug_dumpstack(struct task_struct *tsk)	/* tsk may be NULL */
+{
+	/* dump_stack() */
+	/* show_trace() */
+	if (tsk == NULL)	/* NULL means the calling task */
+		tsk = current;
+	printk("Pid: %d, comm: %.20s\n", tsk->pid, tsk->comm);	/* comm capped at 20 chars */
+	/* show_trace_log_lvl() */
+	printk("\nCall Trace:\n");
+	dump_trace(tsk, NULL, NULL,
+		   0,
+		   &print_trace_ops, NULL);	/* callbacks defined above print each entry */
+	printk("\n");
+}
+
+task_t *libcfs_current(void)	/* returns current; also logs it on every call */
+{
+	CWARN("current task struct is %p\n", current);
+	return current;
+}
+
+static int panic_notifier(struct notifier_block *self, unsigned long unused1,
+			 void *unused2)	/* none of the arguments is used */
+{
+	if (libcfs_panic_in_progress)	/* flag already set: nothing more to do */
+		return 0;
+
+	libcfs_panic_in_progress = 1;
+	mb();	/* NOTE(review): presumably makes the flag visible to other CPUs - confirm */
+
+	return 0;
+}
+
+static struct notifier_block libcfs_panic_notifier = {
+	.notifier_call	= panic_notifier,	/* sets libcfs_panic_in_progress */
+	.next		= NULL,
+	.priority	= 10000
+};
+
+void libcfs_register_panic_notifier(void)	/* hook libcfs_panic_notifier into panic_notifier_list */
+{
+	atomic_notifier_chain_register(&panic_notifier_list, &libcfs_panic_notifier);
+}
+
+void libcfs_unregister_panic_notifier(void)	/* undo libcfs_register_panic_notifier() */
+{
+	atomic_notifier_chain_unregister(&panic_notifier_list, &libcfs_panic_notifier);
+}
+
+EXPORT_SYMBOL(libcfs_debug_dumpstack);
+EXPORT_SYMBOL(libcfs_current);
+
+
+EXPORT_SYMBOL(libcfs_run_upcall);
+EXPORT_SYMBOL(libcfs_run_lbug_upcall);
+EXPORT_SYMBOL(lbug_with_loc);

diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-module.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-module.c
new file mode 100644
index 0000000..2c7d4a3
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/linux/linux-module.c

@@ -0,0 +1,183 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/libcfs/libcfs.h>
+
+#define LNET_MINOR 240
+
+int libcfs_ioctl_getdata(char *buf, char *end, void *arg)
+{
+	struct libcfs_ioctl_hdr   *hdr;
+	struct libcfs_ioctl_data  *data;
+	size_t orig_len;	/* ioc_len as validated after the first copy */
+	ENTRY;
+	/* Copy the ioctl request at user address arg into [buf, end); 0 or -errno. */
+	hdr = (struct libcfs_ioctl_hdr *)buf;
+	data = (struct libcfs_ioctl_data *)buf;
+
+	if (copy_from_user(buf, (void *)arg, sizeof(*hdr)))
+		RETURN(-EFAULT);	/* the residue count is not an errno */
+
+	if (hdr->ioc_version != LIBCFS_IOCTL_VERSION) {
+		CERROR("PORTALS: version mismatch kernel vs application\n");
+		RETURN(-EINVAL);
+	}
+
+	if (hdr->ioc_len >= (size_t)(end - buf)) {	/* no pointer overflow */
+		CERROR("PORTALS: user buffer exceeds kernel buffer\n");
+		RETURN(-EINVAL);
+	}
+
+	if (hdr->ioc_len < sizeof(struct libcfs_ioctl_data)) {
+		CERROR("PORTALS: user buffer too small for ioctl\n");
+		RETURN(-EINVAL);
+	}
+
+	orig_len = hdr->ioc_len;	/* the copy below overwrites the header */
+	if (copy_from_user(buf, (void *)arg, orig_len))
+		RETURN(-EFAULT);
+	if (hdr->ioc_len != orig_len)	/* length changed between the two reads */
+		RETURN(-EINVAL);
+
+	if (libcfs_ioctl_is_invalid(data)) {
+		CERROR("PORTALS: ioctl not correctly formatted\n");
+		RETURN(-EINVAL);
+	}
+
+	if (data->ioc_inllen1)	/* first inline buffer starts at ioc_bulk */
+		data->ioc_inlbuf1 = &data->ioc_bulk[0];
+
+	if (data->ioc_inllen2)	/* second one follows the rounded-up first */
+		data->ioc_inlbuf2 = &data->ioc_bulk[0] +
+			cfs_size_round(data->ioc_inllen1);
+
+	RETURN(0);
+}
+
+int libcfs_ioctl_popdata(void *arg, void *data, int size)	/* copy size bytes of data out to user pointer arg */
+{
+	if (copy_to_user((char *)arg, data, size))	/* any uncopied byte is a fault */
+		return -EFAULT;
+	return 0;
+}
+
+extern struct cfs_psdev_ops	  libcfs_psdev_ops;
+
+static int
+libcfs_psdev_open(struct inode * inode, struct file * file)	/* .open of libcfs_fops */
+{
+	struct libcfs_device_userstate **pdu = NULL;
+	int    rc = 0;
+
+	if (!inode)
+		return (-EINVAL);
+	pdu = (struct libcfs_device_userstate **)&file->private_data;	/* address of the per-file slot */
+	if (libcfs_psdev_ops.p_open != NULL)
+		rc = libcfs_psdev_ops.p_open(0, (void *)pdu);
+	else	/* no open handler installed */
+		return (-EPERM);
+	return rc;
+}
+
+/* .release of libcfs_fops: forwards the close to libcfs_psdev_ops.p_close */
+static int
+libcfs_psdev_release(struct inode * inode, struct file * file)
+{
+	struct libcfs_device_userstate *pdu;
+	int    rc = 0;
+
+	if (!inode)
+		return (-EINVAL);
+	pdu = file->private_data;	/* the slot whose address was handed to p_open */
+	if (libcfs_psdev_ops.p_close != NULL)
+		rc = libcfs_psdev_ops.p_close(0, (void *)pdu);
+	else	/* no close handler installed */
+		rc = -EPERM;
+	return rc;
+}
+
+static long libcfs_ioctl(struct file *file,
+			 unsigned int cmd, unsigned long arg)	/* .unlocked_ioctl of libcfs_fops */
+{
+	struct cfs_psdev_file	 pfile;
+	int    rc = 0;
+
+	if (current_fsuid() != 0)	/* only fsuid 0 may issue these ioctls */
+		return -EACCES;
+
+	if ( _IOC_TYPE(cmd) != IOC_LIBCFS_TYPE ||
+	     _IOC_NR(cmd) < IOC_LIBCFS_MIN_NR  ||
+	     _IOC_NR(cmd) > IOC_LIBCFS_MAX_NR ) {	/* not a libcfs command number */
+		CDEBUG(D_IOCTL, "invalid ioctl ( type %d, nr %d, size %d )\n",
+		       _IOC_TYPE(cmd), _IOC_NR(cmd), _IOC_SIZE(cmd));
+		return (-EINVAL);
+	}
+
+	/* Handle platform-dependent IOC requests */
+	switch (cmd) {
+	case IOC_LIBCFS_PANIC:
+		if (!cfs_capable(CFS_CAP_SYS_BOOT))
+			return (-EPERM);
+		panic("debugctl-invoked panic");
+		return (0);
+	case IOC_LIBCFS_MEMHOG:
+		if (!cfs_capable(CFS_CAP_SYS_ADMIN))
+			return -EPERM;
+		/* fall through to the common p_ioctl dispatch below */
+	}
+
+	pfile.off = 0;
+	pfile.private_data = file->private_data;
+	if (libcfs_psdev_ops.p_ioctl != NULL)
+		rc = libcfs_psdev_ops.p_ioctl(&pfile, cmd, (void *)arg);
+	else	/* no ioctl handler installed */
+		rc = -EPERM;
+	return (rc);
+}
+
+static struct file_operations libcfs_fops = {
+	.unlocked_ioctl	= libcfs_ioctl,
+	.open		= libcfs_psdev_open,
+	.release	= libcfs_psdev_release
+};
+
+psdev_t libcfs_dev = {	/* positional initializer; order must match psdev_t */
+	LNET_MINOR,	/* 240, defined at the top of this file */
+	"lnet",
+	&libcfs_fops	/* open/release/ioctl handlers above */
+};

diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-prim.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-prim.c
new file mode 100644
index 0000000..b652a79
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/linux/linux-prim.c

@@ -0,0 +1,259 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/fs_struct.h>
+#include <linux/sched.h>
+
+#include <linux/libcfs/libcfs.h>
+
+#if defined(CONFIG_KGDB)
+#include <asm/kgdb.h>
+#endif
+
+#define LINUX_WAITQ(w) ((wait_queue_t *) w)
+#define LINUX_WAITQ_HEAD(w) ((wait_queue_head_t *) w)
+
+void
+init_waitqueue_entry_current(wait_queue_t *link)
+{
+	init_waitqueue_entry(LINUX_WAITQ(link), current);	/* bind link to the calling task */
+}
+EXPORT_SYMBOL(init_waitqueue_entry_current);
+
+/**
+ * wait_queue_t of Linux (version < 2.6.34) is a FIFO list for exclusively
+ * waiting threads, which is not always desirable because all threads will
+ * be woken up again and again, even if the user only needs a few of them
+ * to be active most of the time. This is bad for performance because the
+ * cache can be polluted by different threads.
+ *
+ * A LIFO list resolves this problem because we always wake up the most
+ * recently active thread by default.
+ *
+ * NB: please don't call non-exclusive & exclusive wait on the same
+ * waitq if add_wait_queue_exclusive_head is used.
+ */
+void
+add_wait_queue_exclusive_head(wait_queue_head_t *waitq, wait_queue_t *link)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&LINUX_WAITQ_HEAD(waitq)->lock, flags);	/* waitq lock, IRQs off */
+	__add_wait_queue_exclusive(LINUX_WAITQ_HEAD(waitq), LINUX_WAITQ(link));
+	spin_unlock_irqrestore(&LINUX_WAITQ_HEAD(waitq)->lock, flags);
+}
+EXPORT_SYMBOL(add_wait_queue_exclusive_head);
+
+void
+waitq_wait(wait_queue_t *link, cfs_task_state_t state)
+{
+	schedule();	/* link and state are not used here */
+}
+EXPORT_SYMBOL(waitq_wait);
+
+int64_t
+waitq_timedwait(wait_queue_t *link, cfs_task_state_t state,
+		    int64_t timeout)
+{
+	return schedule_timeout(timeout);	/* link and state are not used here */
+}
+EXPORT_SYMBOL(waitq_timedwait);
+
+void
+schedule_timeout_and_set_state(cfs_task_state_t state, int64_t timeout)
+{
+	set_current_state(state);	/* state is set first, then we sleep */
+	schedule_timeout(timeout);	/* return value (time left) is discarded */
+}
+EXPORT_SYMBOL(schedule_timeout_and_set_state);
+
+/* deschedule for a bit: sleep uninterruptibly for the given number of ticks */
+void
+cfs_pause(cfs_duration_t ticks)
+{
+	set_current_state(TASK_UNINTERRUPTIBLE);
+	schedule_timeout(ticks);
+}
+EXPORT_SYMBOL(cfs_pause);
+
+void cfs_init_timer(timer_list_t *t)	/* plain wrapper around init_timer() */
+{
+	init_timer(t);
+}
+EXPORT_SYMBOL(cfs_init_timer);
+
+void cfs_timer_init(timer_list_t *t, cfs_timer_func_t *func, void *arg)
+{
+	init_timer(t);
+	t->function = func;	/* callback to run on expiry */
+	t->data = (unsigned long)arg;	/* arg is carried as the timer's data word */
+}
+EXPORT_SYMBOL(cfs_timer_init);
+
+void cfs_timer_done(timer_list_t *t)	/* no-op in this implementation */
+{
+	return;
+}
+EXPORT_SYMBOL(cfs_timer_done);
+
+void cfs_timer_arm(timer_list_t *t, cfs_time_t deadline)
+{
+	mod_timer(t, deadline);	/* deadline is passed straight through as the expiry */
+}
+EXPORT_SYMBOL(cfs_timer_arm);
+
+void cfs_timer_disarm(timer_list_t *t)
+{
+	del_timer(t);	/* NOTE(review): not del_timer_sync(); a running handler is not waited for */
+}
+EXPORT_SYMBOL(cfs_timer_disarm);
+
+int  cfs_timer_is_armed(timer_list_t *t)	/* result of timer_pending(t) */
+{
+	return timer_pending(t);
+}
+EXPORT_SYMBOL(cfs_timer_is_armed);
+
+cfs_time_t cfs_timer_deadline(timer_list_t *t)	/* reads t->expires without checking that t is armed */
+{
+	return t->expires;
+}
+EXPORT_SYMBOL(cfs_timer_deadline);
+
+void cfs_enter_debugger(void)	/* currently does nothing in either configuration */
+{
+#if defined(CONFIG_KGDB)
+//	BREAKPOINT();
+#else
+	/* nothing */
+#endif
+}
+
+
+sigset_t
+cfs_block_allsigs(void)	/* block every signal; returns the previous mask */
+{
+	unsigned long	  flags;
+	sigset_t	old;
+
+	SIGNAL_MASK_LOCK(current, flags);
+	old = current->blocked;	/* saved so the caller can restore it later */
+	sigfillset(&current->blocked);
+	recalc_sigpending();	/* refresh the pending state after the mask change */
+	SIGNAL_MASK_UNLOCK(current, flags);
+
+	return old;
+}
+
+sigset_t cfs_block_sigs(unsigned long sigs)	/* add sigs to the blocked mask; returns the previous mask */
+{
+	unsigned long  flags;
+	sigset_t	old;
+
+	SIGNAL_MASK_LOCK(current, flags);
+	old = current->blocked;
+	sigaddsetmask(&current->blocked, sigs);	/* existing blocked bits are kept */
+	recalc_sigpending();
+	SIGNAL_MASK_UNLOCK(current, flags);
+	return old;
+}
+
+/* Block all signals except for the @sigs; returns the previous mask */
+sigset_t cfs_block_sigsinv(unsigned long sigs)
+{
+	unsigned long flags;
+	sigset_t old;
+
+	SIGNAL_MASK_LOCK(current, flags);
+	old = current->blocked;
+	sigaddsetmask(&current->blocked, ~sigs);	/* NOTE(review): mask is one unsigned long; bits already blocked stay blocked - confirm intent */
+	recalc_sigpending();
+	SIGNAL_MASK_UNLOCK(current, flags);
+
+	return old;
+}
+
+void
+cfs_restore_sigs (sigset_t old)	/* old: a mask returned by one of the cfs_block_* calls */
+{
+	unsigned long  flags;
+
+	SIGNAL_MASK_LOCK(current, flags);
+	current->blocked = old;	/* replaces the whole blocked mask */
+	recalc_sigpending();
+	SIGNAL_MASK_UNLOCK(current, flags);
+}
+
+int
+cfs_signal_pending(void)
+{
+	return signal_pending(current);	/* non-zero when the calling task has a signal pending */
+}
+
+void
+cfs_clear_sigpending(void)
+{
+	unsigned long flags;
+
+	SIGNAL_MASK_LOCK(current, flags);
+	clear_tsk_thread_flag(current, TIF_SIGPENDING);	/* only the flag is cleared; queued signals are untouched */
+	SIGNAL_MASK_UNLOCK(current, flags);
+}
+
+int
+libcfs_arch_init(void)
+{
+	return 0;	/* nothing to set up here; always succeeds */
+}
+
+void
+libcfs_arch_cleanup(void)
+{
+	return;	/* nothing to tear down, matching libcfs_arch_init() */
+}
+
+EXPORT_SYMBOL(libcfs_arch_init);
+EXPORT_SYMBOL(libcfs_arch_cleanup);
+EXPORT_SYMBOL(cfs_enter_debugger);
+EXPORT_SYMBOL(cfs_block_allsigs);
+EXPORT_SYMBOL(cfs_block_sigs);
+EXPORT_SYMBOL(cfs_block_sigsinv);
+EXPORT_SYMBOL(cfs_restore_sigs);
+EXPORT_SYMBOL(cfs_signal_pending);
+EXPORT_SYMBOL(cfs_clear_sigpending);

diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-proc.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-proc.c
new file mode 100644
index 0000000..522b28e
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/linux/linux-proc.c

@@ -0,0 +1,580 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/libcfs/linux/linux-proc.c
+ *
+ * Author: Zach Brown <zab@zabbo.net>
+ * Author: Peter J. Braam <braam@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <net/sock.h>
+#include <linux/uio.h>
+
+#include <asm/uaccess.h>
+
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/list.h>
+#include <asm/uaccess.h>
+
+#include <linux/proc_fs.h>
+#include <linux/sysctl.h>
+
+# define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/libcfs/libcfs.h>
+#include <asm/div64.h>
+#include "tracefile.h"
+
+#ifdef CONFIG_SYSCTL
+static ctl_table_header_t *lnet_table_header = NULL;
+#endif
+extern char lnet_upcall[1024];
+/**
+ * The path of debug log dump upcall script.
+ */
+extern char lnet_debug_log_upcall[1024];
+
+#define CTL_LNET	(0x100)
+enum {
+	PSDEV_DEBUG = 1,	  /* control debugging */
+	PSDEV_SUBSYSTEM_DEBUG,    /* control debugging */
+	PSDEV_PRINTK,	     /* force all messages to console */
+	PSDEV_CONSOLE_RATELIMIT,  /* ratelimit console messages */
+	PSDEV_CONSOLE_MAX_DELAY_CS, /* maximum delay over which we skip messages */
+	PSDEV_CONSOLE_MIN_DELAY_CS, /* initial delay over which we skip messages */
+	PSDEV_CONSOLE_BACKOFF,    /* delay increase factor */
+	PSDEV_DEBUG_PATH,	 /* crashdump log location */
+	PSDEV_DEBUG_DUMP_PATH,    /* crashdump tracelog location; no lnet_table entry uses this id */
+	PSDEV_CPT_TABLE,	  /* information about cpu partitions */
+	PSDEV_LNET_UPCALL,	/* User mode upcall script  */
+	PSDEV_LNET_MEMUSED,       /* bytes currently PORTAL_ALLOCated */
+	PSDEV_LNET_CATASTROPHE,   /* if we have LBUGged or panic'd */
+	PSDEV_LNET_PANIC_ON_LBUG, /* flag to panic on LBUG */
+	PSDEV_LNET_DUMP_KERNEL,   /* snapshot kernel debug buffer to file */
+	PSDEV_LNET_DAEMON_FILE,   /* spool kernel debug buffer to file */
+	PSDEV_LNET_DEBUG_MB,      /* size of debug buffer */
+	PSDEV_LNET_DEBUG_LOG_UPCALL, /* debug log upcall script */
+	PSDEV_LNET_WATCHDOG_RATELIMIT,  /* ratelimit watchdog messages  */
+	PSDEV_LNET_FORCE_LBUG,    /* hook to force an LBUG */
+	PSDEV_LNET_FAIL_LOC,      /* control test failures instrumentation */
+	PSDEV_LNET_FAIL_VAL,      /* userdata for fail loc */
+};
+
+int
+proc_call_handler(void *data, int write,
+		  loff_t *ppos, void *buffer, size_t *lenp,
+		  int (*handler)(void *data, int write,
+				 loff_t pos, void *buffer, int len))
+{
+	int rc = handler(data, write, *ppos, buffer, *lenp);	/* negative rc is an error code */
+
+	if (rc < 0)
+		return rc;
+
+	if (write) {
+		*ppos += *lenp;	/* a write advances by the full requested length */
+	} else {
+		*lenp = rc;	/* a read reports rc as the byte count produced */
+		*ppos += rc;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(proc_call_handler);
+
+static int __proc_dobitmasks(void *data, int write,
+			     loff_t pos, void *buffer, int nob)
+{
+	const int     tmpstrlen = 512;
+	char	 *tmpstr;
+	int	   rc;
+	unsigned int *mask = data;	/* the bitmask variable being read or written */
+	int	   is_subsys = (mask == &libcfs_subsystem_debug) ? 1 : 0;
+	int	   is_printk = (mask == &libcfs_printk) ? 1 : 0;
+
+	rc = cfs_trace_allocate_string_buffer(&tmpstr, tmpstrlen);
+	if (rc < 0)
+		return rc;
+
+	if (!write) {	/* read: render the mask as text */
+		libcfs_debug_mask2str(tmpstr, tmpstrlen, *mask, is_subsys);
+		rc = strlen(tmpstr);
+
+		if (pos >= rc) {	/* reader is already past the end of the text */
+			rc = 0;
+		} else {
+			rc = cfs_trace_copyout_string(buffer, nob,
+						      tmpstr + pos, "\n");
+		}
+	} else {	/* write: parse text from the caller into the mask */
+		rc = cfs_trace_copyin_string(tmpstr, tmpstrlen, buffer, nob);
+		if (rc < 0) {
+			cfs_trace_free_string_buffer(tmpstr, tmpstrlen);
+			return rc;
+		}
+
+		rc = libcfs_debug_str2mask(mask, tmpstr, is_subsys);
+		/* Always print LBUG/LASSERT to console, so keep this mask */
+		if (is_printk)
+			*mask |= D_EMERG;
+	}
+
+	cfs_trace_free_string_buffer(tmpstr, tmpstrlen);
+	return rc;
+}
+
+DECLARE_PROC_HANDLER(proc_dobitmasks)
+
+static int min_watchdog_ratelimit = 0;	  /* disable ratelimiting */
+static int max_watchdog_ratelimit = (24*60*60); /* limit to once per day */
+
+static int __proc_dump_kernel(void *data, int write,
+			      loff_t pos, void *buffer, int nob)
+{
+	if (!write)	/* reads produce nothing */
+		return 0;
+
+	return cfs_trace_dump_debug_buffer_usrstr(buffer, nob);	/* the written text is handed over as-is */
+}
+
+DECLARE_PROC_HANDLER(proc_dump_kernel)
+
+static int __proc_daemon_file(void *data, int write,
+			      loff_t pos, void *buffer, int nob)
+{
+	if (!write) {	/* read: return the cfs_tracefile string from offset pos */
+		int len = strlen(cfs_tracefile);
+
+		if (pos >= len)	/* nothing left to read */
+			return 0;
+
+		return cfs_trace_copyout_string(buffer, nob,
+						cfs_tracefile + pos, "\n");
+	}
+
+	return cfs_trace_daemon_command_usrstr(buffer, nob);	/* write: pass the text on as a command */
+}
+
+DECLARE_PROC_HANDLER(proc_daemon_file)
+
+static int __proc_debug_mb(void *data, int write,
+			   loff_t pos, void *buffer, int nob)
+{
+	if (!write) {	/* read: print cfs_trace_get_debug_mb() as decimal text */
+		char tmpstr[32];
+		int  len = snprintf(tmpstr, sizeof(tmpstr), "%d",
+				    cfs_trace_get_debug_mb());
+
+		if (pos >= len)	/* nothing left to read */
+			return 0;
+
+		return cfs_trace_copyout_string(buffer, nob, tmpstr + pos,
+		       "\n");
+	}
+
+	return cfs_trace_set_debug_mb_usrstr(buffer, nob);	/* write: pass the text on to the setter */
+}
+
+DECLARE_PROC_HANDLER(proc_debug_mb)
+
+int LL_PROC_PROTO(proc_console_max_delay_cs)	/* table, write, filp, buffer, lenp, ppos presumably come from the macro */
+{
+	int rc, max_delay_cs;
+	ctl_table_t dummy = *table;	/* private copy so .data can point at a local */
+	cfs_duration_t d;
+
+	dummy.data = &max_delay_cs;
+	dummy.proc_handler = &proc_dointvec;
+
+	if (!write) { /* read */
+		max_delay_cs = cfs_duration_sec(libcfs_console_max_delay * 100);
+		rc = ll_proc_dointvec(&dummy, write, filp, buffer, lenp, ppos);
+		return rc;
+	}
+
+	/* write */
+	max_delay_cs = 0;
+	rc = ll_proc_dointvec(&dummy, write, filp, buffer, lenp, ppos);
+	if (rc < 0)
+		return rc;
+	if (max_delay_cs <= 0)	/* zero and negative values are rejected */
+		return -EINVAL;
+
+	d = cfs_time_seconds(max_delay_cs) / 100;
+	if (d == 0 || d < libcfs_console_min_delay)	/* must not drop below the current minimum */
+		return -EINVAL;
+	libcfs_console_max_delay = d;
+
+	return rc;
+}
+
+int LL_PROC_PROTO(proc_console_min_delay_cs)	/* table, write, filp, buffer, lenp, ppos presumably come from the macro */
+{
+	int rc, min_delay_cs;
+	ctl_table_t dummy = *table;	/* private copy so .data can point at a local */
+	cfs_duration_t d;
+
+	dummy.data = &min_delay_cs;
+	dummy.proc_handler = &proc_dointvec;
+
+	if (!write) { /* read */
+		min_delay_cs = cfs_duration_sec(libcfs_console_min_delay * 100);
+		rc = ll_proc_dointvec(&dummy, write, filp, buffer, lenp, ppos);
+		return rc;
+	}
+
+	/* write */
+	min_delay_cs = 0;
+	rc = ll_proc_dointvec(&dummy, write, filp, buffer, lenp, ppos);
+	if (rc < 0)
+		return rc;
+	if (min_delay_cs <= 0)	/* zero and negative values are rejected */
+		return -EINVAL;
+
+	d = cfs_time_seconds(min_delay_cs) / 100;
+	if (d == 0 || d > libcfs_console_max_delay)	/* must not exceed the current maximum */
+		return -EINVAL;
+	libcfs_console_min_delay = d;
+
+	return rc;
+}
+
+int LL_PROC_PROTO(proc_console_backoff)	/* table, write, filp, buffer, lenp, ppos presumably come from the macro */
+{
+	int rc, backoff;
+	ctl_table_t dummy = *table;	/* private copy so .data can point at a local */
+
+	dummy.data = &backoff;
+	dummy.proc_handler = &proc_dointvec;
+
+	if (!write) { /* read */
+		backoff= libcfs_console_backoff;
+		rc = ll_proc_dointvec(&dummy, write, filp, buffer, lenp, ppos);
+		return rc;
+	}
+
+	/* write */
+	backoff = 0;
+	rc = ll_proc_dointvec(&dummy, write, filp, buffer, lenp, ppos);
+	if (rc < 0)
+		return rc;
+	if (backoff <= 0)	/* zero and negative values are rejected */
+		return -EINVAL;
+
+	libcfs_console_backoff = backoff;
+
+	return rc;
+}
+
+int LL_PROC_PROTO(libcfs_force_lbug)
+{
+	if (write)	/* any write triggers LBUG(); the written content is ignored */
+		LBUG();
+	return 0;
+}
+
+int LL_PROC_PROTO(proc_fail_loc)
+{
+	int rc;
+	long old_fail_loc = cfs_fail_loc;	/* snapshot to detect a change below */
+
+	rc = ll_proc_dolongvec(table, write, filp, buffer, lenp, ppos);
+	if (old_fail_loc != cfs_fail_loc)	/* value changed: wake waiters on cfs_race_waitq */
+		wake_up(&cfs_race_waitq);
+	return rc;
+}
+
+static int __proc_cpt_table(void *data, int write,
+			    loff_t pos, void *buffer, int nob)
+{
+	char *buf = NULL;	/* owned here; released exactly once on every path */
+	int   len = 4096;	/* starting size, doubled while the table does not fit */
+	int   rc  = 0;
+
+	if (write)	/* the table is read-only */
+		return -EPERM;
+
+	LASSERT(cfs_cpt_table != NULL);
+
+	while (1) {
+		LIBCFS_ALLOC(buf, len);
+		if (buf == NULL)
+			return -ENOMEM;
+
+		rc = cfs_cpt_table_print(cfs_cpt_table, buf, len);
+		if (rc >= 0)	/* printed: rc is the text length */
+			break;
+
+		if (rc == -EFBIG) {	/* too small: drop it and retry with twice the size */
+			LIBCFS_FREE(buf, len);
+			len <<= 1;
+			continue;
+		}
+		goto out;	/* other error: buf is freed at out, not here */
+	}
+
+	if (pos >= rc) {	/* reader is already past the end of the text */
+		rc = 0;
+		goto out;
+	}
+
+	rc = cfs_trace_copyout_string(buffer, nob, buf + pos, NULL);
+ out:
+	if (buf != NULL)
+		LIBCFS_FREE(buf, len);
+	return rc;
+}
+DECLARE_PROC_HANDLER(proc_cpt_table)
+
+static ctl_table_t lnet_table[] = {
+	/*
+	 * NB No .strategy entries have been provided since sysctl(8) prefers
+	 * to go via /proc for portability.
+	 */
+	{
+		INIT_CTL_NAME(PSDEV_DEBUG)
+		.procname = "debug",
+		.data     = &libcfs_debug,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_dobitmasks,
+	},
+	{
+		INIT_CTL_NAME(PSDEV_SUBSYSTEM_DEBUG)
+		.procname = "subsystem_debug",
+		.data     = &libcfs_subsystem_debug,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_dobitmasks,
+	},
+	{
+		INIT_CTL_NAME(PSDEV_PRINTK)
+		.procname = "printk",
+		.data     = &libcfs_printk,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_dobitmasks,
+	},
+	{
+		INIT_CTL_NAME(PSDEV_CONSOLE_RATELIMIT)
+		.procname = "console_ratelimit",
+		.data     = &libcfs_console_ratelimit,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec
+	},
+	{
+		INIT_CTL_NAME(PSDEV_CONSOLE_MAX_DELAY_CS)
+		.procname = "console_max_delay_centisecs",
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_console_max_delay_cs
+	},
+	{
+		INIT_CTL_NAME(PSDEV_CONSOLE_MIN_DELAY_CS)
+		.procname = "console_min_delay_centisecs",
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_console_min_delay_cs
+	},
+	{
+		INIT_CTL_NAME(PSDEV_CONSOLE_BACKOFF)
+		.procname = "console_backoff",
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_console_backoff
+	},
+
+	{
+		INIT_CTL_NAME(PSDEV_DEBUG_PATH)
+		.procname = "debug_path",
+		.data     = libcfs_debug_file_path_arr,
+		.maxlen   = sizeof(libcfs_debug_file_path_arr),
+		.mode     = 0644,
+		.proc_handler = &proc_dostring,
+	},
+
+	{
+		INIT_CTL_NAME(PSDEV_CPT_TABLE)
+		.procname = "cpu_partition_table",
+		.maxlen   = 128,
+		.mode     = 0444,
+		.proc_handler = &proc_cpt_table,
+	},
+
+	{
+		INIT_CTL_NAME(PSDEV_LNET_UPCALL)
+		.procname = "upcall",
+		.data     = lnet_upcall,
+		.maxlen   = sizeof(lnet_upcall),
+		.mode     = 0644,
+		.proc_handler = &proc_dostring,
+	},
+	{
+		INIT_CTL_NAME(PSDEV_LNET_DEBUG_LOG_UPCALL)
+		.procname = "debug_log_upcall",
+		.data     = lnet_debug_log_upcall,
+		.maxlen   = sizeof(lnet_debug_log_upcall),
+		.mode     = 0644,
+		.proc_handler = &proc_dostring,
+	},
+	{
+		INIT_CTL_NAME(PSDEV_LNET_MEMUSED)
+		.procname = "lnet_memused",
+		.data     = (int *)&libcfs_kmemory.counter,
+		.maxlen   = sizeof(int),
+		.mode     = 0444,
+		.proc_handler = &proc_dointvec,
+		INIT_STRATEGY(&sysctl_intvec)
+	},
+	{
+		INIT_CTL_NAME(PSDEV_LNET_CATASTROPHE)
+		.procname = "catastrophe",
+		.data     = &libcfs_catastrophe,
+		.maxlen   = sizeof(int),
+		.mode     = 0444,
+		.proc_handler = &proc_dointvec,
+		INIT_STRATEGY(&sysctl_intvec)
+	},
+	{
+		INIT_CTL_NAME(PSDEV_LNET_PANIC_ON_LBUG)
+		.procname = "panic_on_lbug",
+		.data     = &libcfs_panic_on_lbug,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec,
+		INIT_STRATEGY(&sysctl_intvec)
+	},
+	{
+		INIT_CTL_NAME(PSDEV_LNET_DUMP_KERNEL)
+		.procname = "dump_kernel",
+		.maxlen   = 256,
+		.mode     = 0200,
+		.proc_handler = &proc_dump_kernel,
+	},
+	{
+		INIT_CTL_NAME(PSDEV_LNET_DAEMON_FILE)
+		.procname = "daemon_file",
+		.mode     = 0644,
+		.maxlen   = 256,
+		.proc_handler = &proc_daemon_file,
+	},
+	{
+		INIT_CTL_NAME(PSDEV_LNET_DEBUG_MB)
+		.procname = "debug_mb",
+		.mode     = 0644,
+		.proc_handler = &proc_debug_mb,
+	},
+	{
+		INIT_CTL_NAME(PSDEV_LNET_WATCHDOG_RATELIMIT)
+		.procname = "watchdog_ratelimit",
+		.data     = &libcfs_watchdog_ratelimit,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec_minmax,
+		.extra1   = &min_watchdog_ratelimit,
+		.extra2   = &max_watchdog_ratelimit,
+	},
+	{       INIT_CTL_NAME(PSDEV_LNET_FORCE_LBUG)
+		.procname = "force_lbug",
+		.data     = NULL,
+		.maxlen   = 0,
+		.mode     = 0200,
+		.proc_handler = &libcfs_force_lbug
+	},
+	{
+		INIT_CTL_NAME(PSDEV_LNET_FAIL_LOC)
+		.procname = "fail_loc",
+		.data     = &cfs_fail_loc,
+		.maxlen   = sizeof(cfs_fail_loc),
+		.mode     = 0644,
+		.proc_handler = &proc_fail_loc
+	},
+	{
+		INIT_CTL_NAME(PSDEV_LNET_FAIL_VAL)
+		.procname = "fail_val",
+		.data     = &cfs_fail_val,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec
+	},
+	{
+		INIT_CTL_NAME(0)
+	}
+};
+
+#ifdef CONFIG_SYSCTL
+static ctl_table_t top_table[] = {	/* registered by insert_proc() */
+	{
+		INIT_CTL_NAME(CTL_LNET)
+		.procname = "lnet",
+		.mode     = 0555,
+		.data     = NULL,
+		.maxlen   = 0,
+		.child    = lnet_table,	/* every entry of lnet_table sits under "lnet" */
+	},
+	{
+		INIT_CTL_NAME(0)
+	}
+};
+#endif
+
+int insert_proc(void)	/* always returns 0; a failed registration is not reported */
+{
+#ifdef CONFIG_SYSCTL
+	if (lnet_table_header == NULL)	/* register only once */
+		lnet_table_header = cfs_register_sysctl_table(top_table, 0);
+#endif
+	return 0;
+}
+
+void remove_proc(void)	/* undo insert_proc(); safe to call when nothing is registered */
+{
+#ifdef CONFIG_SYSCTL
+	if (lnet_table_header != NULL)
+		unregister_sysctl_table(lnet_table_header);
+
+	lnet_table_header = NULL;	/* lets insert_proc() register again */
+#endif
+}

diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-tcpip.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-tcpip.c
new file mode 100644
index 0000000..855c7e8
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/linux/linux-tcpip.c

@@ -0,0 +1,659 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/libcfs/libcfs.h>
+
+#include <linux/if.h>
+#include <linux/in.h>
+#include <linux/file.h>
+/* For sys_open & sys_close */
+#include <linux/syscalls.h>
+
+/*
+ * Issue ioctl 'cmd' with argument 'arg' on a temporary PF_INET stream
+ * socket.
+ *
+ * The handler is invoked under KERNEL_DS, so 'arg' is expected to be a
+ * kernel address.
+ *
+ * Returns the ioctl handler's result, a negative errno if the socket or
+ * its file could not be set up, or -ENOTTY if the socket file provides no
+ * unlocked_ioctl handler.
+ */
+int
+libcfs_sock_ioctl(int cmd, unsigned long arg)
+{
+	mm_segment_t    oldmm = get_fs();
+	struct socket  *sock;
+	int	     rc;
+	struct file    *sock_filp;
+
+	rc = sock_create (PF_INET, SOCK_STREAM, 0, &sock);
+	if (rc != 0) {
+		CERROR ("Can't create socket: %d\n", rc);
+		return rc;
+	}
+
+	sock_filp = sock_alloc_file(sock, 0, NULL);
+	if (IS_ERR(sock_filp)) {
+		sock_release(sock);
+		return PTR_ERR(sock_filp);
+	}
+
+	/* if there is no handler nothing runs: do not report success */
+	rc = -ENOTTY;
+	if (sock_filp->f_op->unlocked_ioctl) {
+		set_fs(KERNEL_DS);
+		rc = sock_filp->f_op->unlocked_ioctl(sock_filp, cmd, arg);
+		set_fs(oldmm);
+	}
+
+	fput(sock_filp);
+	return rc;
+}
+
+/*
+ * Query the state of interface 'name' through SIOCGIFFLAGS, SIOCGIFADDR
+ * and SIOCGIFNETMASK.
+ *
+ * On success returns 0.  If the interface is down, *up, *ip and *mask are
+ * all set to 0; otherwise *up is 1 and *ip / *mask hold the AF_INET
+ * address and netmask converted with ntohl().
+ *
+ * Returns -EINVAL if 'name' (plus its terminating NUL) does not fit in
+ * IFNAMSIZ bytes, or the error returned by the failing ioctl.
+ */
+int
+libcfs_ipif_query (char *name, int *up, __u32 *ip, __u32 *mask)
+{
+	struct ifreq   ifr;
+	int	    nob;
+	int	    rc;
+	__u32	  val;
+
+	nob = strnlen(name, IFNAMSIZ);
+	if (nob == IFNAMSIZ) {
+		/* no NUL was found within IFNAMSIZ bytes, so the print
+		 * must be bounded as well */
+		CERROR("Interface name %.*s too long\n", nob, name);
+		return -EINVAL;
+	}
+
+	CLASSERT (sizeof(ifr.ifr_name) >= IFNAMSIZ);
+
+	/* safe: strlen(name) < IFNAMSIZ <= sizeof(ifr.ifr_name) */
+	strcpy(ifr.ifr_name, name);
+	rc = libcfs_sock_ioctl(SIOCGIFFLAGS, (unsigned long)&ifr);
+
+	if (rc != 0) {
+		CERROR("Can't get flags for interface %s\n", name);
+		return rc;
+	}
+
+	if ((ifr.ifr_flags & IFF_UP) == 0) {
+		CDEBUG(D_NET, "Interface %s down\n", name);
+		*up = 0;
+		*ip = *mask = 0;
+		return 0;
+	}
+
+	*up = 1;
+
+	strcpy(ifr.ifr_name, name);
+	ifr.ifr_addr.sa_family = AF_INET;
+	rc = libcfs_sock_ioctl(SIOCGIFADDR, (unsigned long)&ifr);
+
+	if (rc != 0) {
+		CERROR("Can't get IP address for interface %s\n", name);
+		return rc;
+	}
+
+	val = ((struct sockaddr_in *)&ifr.ifr_addr)->sin_addr.s_addr;
+	*ip = ntohl(val);
+
+	strcpy(ifr.ifr_name, name);
+	ifr.ifr_addr.sa_family = AF_INET;
+	rc = libcfs_sock_ioctl(SIOCGIFNETMASK, (unsigned long)&ifr);
+
+	if (rc != 0) {
+		CERROR("Can't get netmask for interface %s\n", name);
+		return rc;
+	}
+
+	val = ((struct sockaddr_in *)&ifr.ifr_netmask)->sin_addr.s_addr;
+	*mask = ntohl(val);
+
+	return 0;
+}
+
+EXPORT_SYMBOL(libcfs_ipif_query);
+
+/*
+ * Build an array of interface names obtained through SIOCGIFCONF.
+ *
+ * On success returns the number of interfaces found; when that number is
+ * positive, *namesp points at an array of that many IFNAMSIZ-byte strings
+ * to be released with libcfs_ipif_free_enumeration().  When no interface
+ * is found 0 is returned and *namesp is left untouched.  On failure a
+ * negative errno is returned and nothing stays allocated.
+ */
+int
+libcfs_ipif_enumerate (char ***namesp)
+{
+	char	   **names;
+	struct ifreq   *ifr;
+	struct ifconf   ifc;
+	int	     nalloc = 16;	/* first guess at max interfaces */
+	int	     capped = 0;	/* request was clamped to one page */
+	int	     nfound;
+	int	     len;
+	int	     rc;
+	int	     i;
+
+	/* grow the request buffer until the reply leaves room to spare,
+	 * or the buffer has reached its one-page limit */
+	while (1) {
+		if (nalloc * sizeof(*ifr) > PAGE_CACHE_SIZE) {
+			capped = 1;
+			nalloc = PAGE_CACHE_SIZE/sizeof(*ifr);
+			CWARN("Too many interfaces: only enumerating first %d\n",
+			      nalloc);
+		}
+
+		LIBCFS_ALLOC(ifr, nalloc * sizeof(*ifr));
+		if (ifr == NULL) {
+			CERROR ("ENOMEM enumerating up to %d interfaces\n", nalloc);
+			return -ENOMEM;
+		}
+
+		ifc.ifc_len = nalloc * sizeof(*ifr);
+		ifc.ifc_buf = (char *)ifr;
+
+		rc = libcfs_sock_ioctl(SIOCGIFCONF, (unsigned long)&ifc);
+		if (rc < 0) {
+			CERROR ("Error %d enumerating interfaces\n", rc);
+			goto free_ifr;
+		}
+
+		LASSERT (rc == 0);
+
+		nfound = ifc.ifc_len/sizeof(*ifr);
+		LASSERT (nfound <= nalloc);
+
+		if (nfound < nalloc || capped)
+			break;
+
+		LIBCFS_FREE(ifr, nalloc * sizeof(*ifr));
+		nalloc *= 2;
+	}
+
+	/* nothing to report */
+	if (nfound == 0)
+		goto free_ifr;
+
+	LIBCFS_ALLOC(names, nfound * sizeof(*names));
+	if (names == NULL) {
+		rc = -ENOMEM;
+		goto free_ifr;
+	}
+	/* every slot starts out NULL so a partial array can be freed */
+	memset (names, 0, nfound * sizeof(*names));
+
+	for (i = 0; i < nfound; i++) {
+		len = strnlen (ifr[i].ifr_name, IFNAMSIZ);
+		if (len == IFNAMSIZ) {
+			/* no space for terminating NULL */
+			CERROR("interface name %.*s too long (%d max)\n",
+			       len, ifr[i].ifr_name, IFNAMSIZ);
+			rc = -ENAMETOOLONG;
+			break;
+		}
+
+		LIBCFS_ALLOC(names[i], IFNAMSIZ);
+		if (names[i] == NULL) {
+			rc = -ENOMEM;
+			break;
+		}
+
+		memcpy(names[i], ifr[i].ifr_name, len);
+		names[i][len] = 0;
+	}
+
+	if (rc < 0) {
+		/* the copy loop bailed out: drop whatever was built */
+		libcfs_ipif_free_enumeration(names, nfound);
+	} else {
+		*namesp = names;
+		rc = nfound;
+	}
+
+ free_ifr:
+	LIBCFS_FREE(ifr, nalloc * sizeof(*ifr));
+	return rc;
+}
+
+EXPORT_SYMBOL(libcfs_ipif_enumerate);
+
+/*
+ * Free an 'n'-slot name array.  Name buffers are freed from slot 0 up to
+ * (not including) the first NULL slot, then the array itself is freed.
+ */
+void
+libcfs_ipif_free_enumeration (char **names, int n)
+{
+	int      i = 0;
+
+	LASSERT (n > 0);
+
+	while (i < n) {
+		if (names[i] == NULL)
+			break;
+
+		LIBCFS_FREE(names[i], IFNAMSIZ);
+		i++;
+	}
+
+	LIBCFS_FREE(names, n * sizeof(*names));
+}
+
+EXPORT_SYMBOL(libcfs_ipif_free_enumeration);
+
+/*
+ * Send exactly 'nob' bytes from 'buffer' on 'sock'.
+ *
+ * 'timeout' is in seconds.  A zero timeout makes the send non-blocking
+ * (MSG_DONTWAIT); otherwise SO_SNDTIMEO is set to the time still left
+ * before every send attempt.
+ *
+ * Returns 0 once everything was sent, the negative error from the socket
+ * layer, -ECONNABORTED if a send reported 0 bytes, or -EAGAIN if time ran
+ * out after a partial send.
+ */
+int
+libcfs_sock_write (struct socket *sock, void *buffer, int nob, int timeout)
+{
+	mm_segment_t   saved_fs = get_fs();
+	long	   remaining = timeout * HZ;	/* in jiffies */
+	unsigned long  start;
+	struct timeval tv;
+	int	    rc;
+
+	LASSERT (nob > 0);
+	/* Caller may pass a zero timeout if she thinks the socket buffer is
+	 * empty enough to take the whole message immediately */
+
+	while (1) {
+		struct iovec  iov = {
+			.iov_base = buffer,
+			.iov_len  = nob
+		};
+		struct msghdr msg = {
+			.msg_name       = NULL,
+			.msg_namelen    = 0,
+			.msg_iov	= &iov,
+			.msg_iovlen     = 1,
+			.msg_control    = NULL,
+			.msg_controllen = 0,
+			.msg_flags      = (timeout == 0) ? MSG_DONTWAIT : 0
+		};
+
+		if (timeout != 0) {
+			/* Set send timeout to remaining time */
+			tv = (struct timeval) {
+				.tv_sec = remaining / HZ,
+				.tv_usec = ((remaining % HZ) * 1000000) / HZ
+			};
+
+			set_fs(KERNEL_DS);
+			rc = sock_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO,
+					     (char *)&tv, sizeof(tv));
+			set_fs(saved_fs);
+
+			if (rc != 0) {
+				CERROR("Can't set socket send timeout "
+				       "%ld.%06d: %d\n",
+				       (long)tv.tv_sec, (int)tv.tv_usec, rc);
+				return rc;
+			}
+		}
+
+		/* charge the time spent in the send against the budget */
+		set_fs(KERNEL_DS);
+		start = jiffies;
+		rc = sock_sendmsg(sock, &msg, iov.iov_len);
+		remaining -= jiffies - start;
+		set_fs(saved_fs);
+
+		if (rc == nob)
+			return 0;
+		else if (rc < 0)
+			return rc;
+		else if (rc == 0) {
+			CERROR ("Unexpected zero rc\n");
+			return -ECONNABORTED;
+		}
+
+		if (remaining <= 0)
+			return -EAGAIN;
+
+		/* partial send: continue after the bytes already written */
+		nob -= rc;
+		buffer = ((char *)buffer) + rc;
+	}
+}
+EXPORT_SYMBOL(libcfs_sock_write);
+
+/*
+ * Receive exactly 'nob' bytes into 'buffer' from 'sock'.
+ *
+ * 'timeout' is in seconds and must be positive.  SO_RCVTIMEO is set to
+ * the time still left before every receive attempt.
+ *
+ * Returns 0 once everything was received, the negative error from the
+ * socket layer, -ECONNRESET if a receive reported 0 bytes, or -ETIMEDOUT
+ * if time ran out with data still outstanding.
+ */
+int
+libcfs_sock_read (struct socket *sock, void *buffer, int nob, int timeout)
+{
+	mm_segment_t   saved_fs = get_fs();
+	long	   ticks = timeout * HZ;
+	unsigned long  start;
+	struct timeval tv;
+	int	    rc;
+
+	LASSERT (nob > 0);
+	LASSERT (ticks > 0);
+
+	while (1) {
+		struct iovec  iov = {
+			.iov_base = buffer,
+			.iov_len  = nob
+		};
+		struct msghdr msg = {
+			.msg_name       = NULL,
+			.msg_namelen    = 0,
+			.msg_iov	= &iov,
+			.msg_iovlen     = 1,
+			.msg_control    = NULL,
+			.msg_controllen = 0,
+			.msg_flags      = 0
+		};
+
+		/* Set receive timeout to remaining time */
+		tv = (struct timeval) {
+			.tv_sec = ticks / HZ,
+			.tv_usec = ((ticks % HZ) * 1000000) / HZ
+		};
+
+		set_fs(KERNEL_DS);
+		rc = sock_setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO,
+				     (char *)&tv, sizeof(tv));
+		set_fs(saved_fs);
+
+		if (rc != 0) {
+			CERROR("Can't set socket recv timeout %ld.%06d: %d\n",
+			       (long)tv.tv_sec, (int)tv.tv_usec, rc);
+			return rc;
+		}
+
+		/* charge the time spent receiving against the budget */
+		set_fs(KERNEL_DS);
+		start = jiffies;
+		rc = sock_recvmsg(sock, &msg, iov.iov_len, 0);
+		ticks -= jiffies - start;
+		set_fs(saved_fs);
+
+		if (rc < 0)
+			return rc;
+		else if (rc == 0)
+			return -ECONNRESET;
+
+		nob -= rc;
+		buffer = ((char *)buffer) + rc;
+
+		if (nob == 0)
+			return 0;
+
+		if (ticks <= 0)
+			return -ETIMEDOUT;
+	}
+}
+
+EXPORT_SYMBOL(libcfs_sock_read);
+
+/*
+ * Create a PF_INET stream socket with SO_REUSEADDR set and, if either
+ * 'local_ip' or 'local_port' is non-zero, bind it to that address.
+ *
+ * On success returns 0 and stores the socket in *sockp.  On failure
+ * returns a negative errno, releases any socket that was created and
+ * sets *sockp to NULL.  *fatal is 1 for every failure except a bind that
+ * failed with -EADDRINUSE, for which it is 0.
+ */
+static int
+libcfs_sock_create (struct socket **sockp, int *fatal,
+		    __u32 local_ip, int local_port)
+{
+	struct sockaddr_in  locaddr;
+	struct socket      *sock;
+	int		 rc;
+	int		 option;
+	mm_segment_t	oldmm = get_fs();
+
+	/* All errors are fatal except bind failure if the port is in use */
+	*fatal = 1;
+	/* never hand back an uninitialised or already released socket */
+	*sockp = NULL;
+
+	rc = sock_create (PF_INET, SOCK_STREAM, 0, &sock);
+	if (rc != 0) {
+		CERROR ("Can't create socket: %d\n", rc);
+		return (rc);
+	}
+
+	set_fs (KERNEL_DS);
+	option = 1;
+	rc = sock_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR,
+			     (char *)&option, sizeof (option));
+	set_fs (oldmm);
+	if (rc != 0) {
+		CERROR("Can't set SO_REUSEADDR for socket: %d\n", rc);
+		goto failed;
+	}
+
+	if (local_ip != 0 || local_port != 0) {
+		memset(&locaddr, 0, sizeof(locaddr));
+		locaddr.sin_family = AF_INET;
+		locaddr.sin_port = htons(local_port);
+		locaddr.sin_addr.s_addr = (local_ip == 0) ?
+					  INADDR_ANY : htonl(local_ip);
+
+		rc = sock->ops->bind(sock, (struct sockaddr *)&locaddr,
+				     sizeof(locaddr));
+		if (rc == -EADDRINUSE) {
+			CDEBUG(D_NET, "Port %d already in use\n", local_port);
+			*fatal = 0;
+			goto failed;
+		}
+		if (rc != 0) {
+			CERROR("Error trying to bind to port %d: %d\n",
+			       local_port, rc);
+			goto failed;
+		}
+	}
+
+	/* publish the socket only once it is fully set up */
+	*sockp = sock;
+	return 0;
+
+ failed:
+	sock_release(sock);
+	return rc;
+}
+
+/*
+ * Set the send and/or receive buffer size of 'sock' through SO_SNDBUF and
+ * SO_RCVBUF.  A size of 0 leaves the corresponding buffer alone.
+ * Returns 0, or the error from the first sock_setsockopt() that fails.
+ */
+int
+libcfs_sock_setbuf (struct socket *sock, int txbufsize, int rxbufsize)
+{
+	mm_segment_t	saved_fs = get_fs();
+	int		 value;
+	int		 rc;
+
+	if (txbufsize != 0) {
+		value = txbufsize;
+
+		set_fs(KERNEL_DS);
+		rc = sock_setsockopt(sock, SOL_SOCKET, SO_SNDBUF,
+				     (char *)&value, sizeof(value));
+		set_fs(saved_fs);
+
+		if (rc != 0) {
+			CERROR ("Can't set send buffer %d: %d\n",
+				value, rc);
+			return rc;
+		}
+	}
+
+	if (rxbufsize != 0) {
+		value = rxbufsize;
+
+		set_fs(KERNEL_DS);
+		rc = sock_setsockopt(sock, SOL_SOCKET, SO_RCVBUF,
+				     (char *)&value, sizeof(value));
+		set_fs(saved_fs);
+
+		if (rc != 0) {
+			CERROR ("Can't set receive buffer %d: %d\n",
+				value, rc);
+			return rc;
+		}
+	}
+
+	return 0;
+}
+
+EXPORT_SYMBOL(libcfs_sock_setbuf);
+
+/*
+ * Fetch the peer ('remote' non-zero) or local address of 'sock' through
+ * its getname() operation.  *ip and *port, when non-NULL, receive the
+ * address and port converted with ntohl() / ntohs().
+ * Returns 0, or the error from getname().
+ */
+int
+libcfs_sock_getaddr (struct socket *sock, int remote, __u32 *ip, int *port)
+{
+	struct sockaddr_in sin;
+	int		len = sizeof (sin);
+	int		peer = remote ? 2 : 0;	/* getname() 'peer' argument */
+	int		rc;
+
+	rc = sock->ops->getname(sock, (struct sockaddr *)&sin, &len, peer);
+	if (rc != 0) {
+		CERROR ("Error %d getting sock %s IP/port\n",
+			rc, remote ? "peer" : "local");
+		return rc;
+	}
+
+	if (ip != NULL)
+		*ip = ntohl(sin.sin_addr.s_addr);
+	if (port != NULL)
+		*port = ntohs(sin.sin_port);
+
+	return 0;
+}
+
+EXPORT_SYMBOL(libcfs_sock_getaddr);
+
+/*
+ * Report the current sk_sndbuf / sk_rcvbuf values of 'sock'.  Either
+ * output pointer may be NULL to skip that value.  Always returns 0.
+ */
+int
+libcfs_sock_getbuf (struct socket *sock, int *txbufsize, int *rxbufsize)
+{
+	if (txbufsize != NULL)
+		*txbufsize = sock->sk->sk_sndbuf;
+
+	if (rxbufsize != NULL)
+		*rxbufsize = sock->sk->sk_rcvbuf;
+
+	return 0;
+}
+
+EXPORT_SYMBOL(libcfs_sock_getbuf);
+
+/*
+ * Create a socket bound to local_ip/local_port and start listening on it
+ * with the given backlog.
+ * Returns 0 with the socket in *sockp, or a negative errno; the socket is
+ * released again if listen() fails.
+ */
+int
+libcfs_sock_listen (struct socket **sockp,
+		    __u32 local_ip, int local_port, int backlog)
+{
+	int      fatal;
+	int      rc;
+
+	rc = libcfs_sock_create(sockp, &fatal, local_ip, local_port);
+	if (rc != 0) {
+		/* a non-fatal failure means the port was already bound */
+		if (!fatal)
+			CERROR("Can't create socket: port %d already in use\n",
+			       local_port);
+		return rc;
+	}
+
+	rc = (*sockp)->ops->listen(*sockp, backlog);
+	if (rc != 0) {
+		CERROR("Can't set listen backlog %d: %d\n", backlog, rc);
+		sock_release(*sockp);
+	}
+
+	return rc;
+}
+
+EXPORT_SYMBOL(libcfs_sock_listen);
+
+/*
+ * Accept one connection on listening socket 'sock'.
+ *
+ * A non-blocking accept is tried first; if it reports -EAGAIN the caller
+ * sleeps once on the socket's wait queue and the accept is tried a second
+ * time.  Returns 0 with the new socket in *newsockp, or a non-zero error
+ * after releasing the new socket.
+ */
+int
+libcfs_sock_accept (struct socket **newsockp, struct socket *sock)
+{
+	struct socket *conn;
+	wait_queue_t   waiter;
+	int	    rc;
+
+	init_waitqueue_entry(&waiter, current);
+
+	/* XXX this should add a ref to sock->ops->owner, if
+	 * TCP could be a module */
+	rc = sock_create_lite(PF_PACKET, sock->type, IPPROTO_TCP, &conn);
+	if (rc != 0) {
+		CERROR("Can't allocate socket\n");
+		return rc;
+	}
+
+	conn->ops = sock->ops;
+
+	/* go on the wait queue before the first attempt */
+	set_current_state(TASK_INTERRUPTIBLE);
+	add_wait_queue(cfs_sk_sleep(sock->sk), &waiter);
+
+	rc = sock->ops->accept(sock, conn, O_NONBLOCK);
+	if (rc == -EAGAIN) {
+		/* Nothing ready, so wait for activity */
+		schedule();
+		rc = sock->ops->accept(sock, conn, O_NONBLOCK);
+	}
+
+	remove_wait_queue(cfs_sk_sleep(sock->sk), &waiter);
+	set_current_state(TASK_RUNNING);
+
+	if (rc != 0) {
+		sock_release(conn);
+		return rc;
+	}
+
+	*newsockp = conn;
+	return 0;
+}
+
+EXPORT_SYMBOL(libcfs_sock_accept);
+
+/*
+ * Wake every task sleeping on the wait queue of 'sock', which is the
+ * queue libcfs_sock_accept() sleeps on.
+ */
+void
+libcfs_sock_abort_accept (struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+
+	wake_up_all(cfs_sk_sleep(sk));
+}
+
+EXPORT_SYMBOL(libcfs_sock_abort_accept);
+
+/*
+ * Create a socket bound to local_ip/local_port and connect it to
+ * peer_ip/peer_port (host byte order).
+ *
+ * Returns 0 with the connected socket in *sockp, or a negative errno
+ * after releasing the socket.  For a connect failure *fatal is 0 only
+ * when the error is -EADDRNOTAVAIL.
+ */
+int
+libcfs_sock_connect (struct socket **sockp, int *fatal,
+		     __u32 local_ip, int local_port,
+		     __u32 peer_ip, int peer_port)
+{
+	struct sockaddr_in  srvaddr;
+	int		 rc;
+
+	rc = libcfs_sock_create(sockp, fatal, local_ip, local_port);
+	if (rc != 0)
+		return rc;
+
+	memset(&srvaddr, 0, sizeof(srvaddr));
+	srvaddr.sin_family = AF_INET;
+	srvaddr.sin_addr.s_addr = htonl(peer_ip);
+	srvaddr.sin_port = htons(peer_port);
+
+	rc = (*sockp)->ops->connect(*sockp,
+				    (struct sockaddr *)&srvaddr, sizeof(srvaddr),
+				    0);
+	if (rc != 0) {
+		/* EADDRNOTAVAIL probably means we're already connected to
+		 * the same peer/port on the same local port on a differently
+		 * typed connection.  Let our caller retry with a different
+		 * local port... */
+		*fatal = !(rc == -EADDRNOTAVAIL);
+
+		CDEBUG_LIMIT(*fatal ? D_NETERROR : D_NET,
+		       "Error %d connecting %u.%u.%u.%u/%d -> %u.%u.%u.%u/%d\n", rc,
+		       HIPQUAD(local_ip), local_port, HIPQUAD(peer_ip), peer_port);
+
+		sock_release(*sockp);
+	}
+
+	return rc;
+}
+
+EXPORT_SYMBOL(libcfs_sock_connect);
+
+/* Exported wrapper that passes 'sock' straight to sock_release(). */
+void
+libcfs_sock_release (struct socket *sock)
+{
+	sock_release(sock);
+}
+
+EXPORT_SYMBOL(libcfs_sock_release);

diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-tracefile.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-tracefile.c
new file mode 100644
index 0000000..6f56343
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/linux/linux-tracefile.c

@@ -0,0 +1,275 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#define LUSTRE_TRACEFILE_PRIVATE
+
+#include <linux/libcfs/libcfs.h>
+#include "tracefile.h"
+
+/* percentage of the total debug memory given to each trace buffer type */
+static unsigned int pages_factor[CFS_TCD_TYPE_MAX] = {
+	[CFS_TCD_TYPE_PROC]    = 80,
+	[CFS_TCD_TYPE_SOFTIRQ] = 10,
+	[CFS_TCD_TYPE_IRQ]     = 10
+};
+
+/* one console buffer per CPU and trace buffer type; allocated by
+ * cfs_tracefile_init_arch() */
+char *cfs_trace_console_buffers[NR_CPUS][CFS_TCD_TYPE_MAX];
+
+/* taken through the cfs_tracefile_{read,write}_{lock,unlock}() helpers */
+struct rw_semaphore cfs_tracefile_sem;
+
+/*
+ * Set up the Linux-specific trace state: the tracefile semaphore, one
+ * cfs_trace_data array per trace buffer type (sized for
+ * num_possible_cpus()), the per-tcd lock/type/cpu fields, and one console
+ * buffer per CPU and buffer type.
+ *
+ * Returns 0, or -ENOMEM after undoing everything through
+ * cfs_tracefile_fini_arch().
+ */
+int cfs_tracefile_init_arch(void)
+{
+	int    i;
+	int    j;
+	struct cfs_trace_cpu_data *tcd;
+
+	init_rwsem(&cfs_tracefile_sem);
+
+	/* initialize trace_data */
+	memset(cfs_trace_data, 0, sizeof(cfs_trace_data));
+	for (i = 0; i < CFS_TCD_TYPE_MAX; i++) {
+		cfs_trace_data[i] =
+			kmalloc(sizeof(union cfs_trace_data_union) *
+				num_possible_cpus(), GFP_KERNEL);
+		if (cfs_trace_data[i] == NULL)
+			goto out;
+
+	}
+
+	/* arch related info initialized */
+	cfs_tcd_for_each(tcd, i, j) {
+		spin_lock_init(&tcd->tcd_lock);
+		tcd->tcd_pages_factor = pages_factor[i];
+		tcd->tcd_type = i;
+		tcd->tcd_cpu = j;
+	}
+
+	/* the inner bound must follow the array's second dimension rather
+	 * than a literal count of buffer types */
+	for (i = 0; i < num_possible_cpus(); i++)
+		for (j = 0; j < CFS_TCD_TYPE_MAX; j++) {
+			cfs_trace_console_buffers[i][j] =
+				kmalloc(CFS_TRACE_CONSOLE_BUFFER_SIZE,
+					GFP_KERNEL);
+
+			if (cfs_trace_console_buffers[i][j] == NULL)
+				goto out;
+		}
+
+	return 0;
+
+out:
+	cfs_tracefile_fini_arch();
+	printk(KERN_ERR "lnet: Not enough memory\n");
+	return -ENOMEM;
+}
+
+/*
+ * Release everything cfs_tracefile_init_arch() set up: the console
+ * buffers, the cfs_trace_data arrays (up to the first NULL slot) and the
+ * tracefile semaphore.  Freed pointers are reset to NULL.
+ */
+void cfs_tracefile_fini_arch(void)
+{
+	int    i;
+	int    j;
+
+	/* the inner bound follows the array's second dimension; kfree()
+	 * accepts NULL, so never-allocated slots need no special case */
+	for (i = 0; i < num_possible_cpus(); i++)
+		for (j = 0; j < CFS_TCD_TYPE_MAX; j++) {
+			kfree(cfs_trace_console_buffers[i][j]);
+			cfs_trace_console_buffers[i][j] = NULL;
+		}
+
+	/* init only ever fills the first CFS_TCD_TYPE_MAX slots, so do not
+	 * rely solely on finding a NULL terminator */
+	for (i = 0; i < CFS_TCD_TYPE_MAX && cfs_trace_data[i] != NULL; i++) {
+		kfree(cfs_trace_data[i]);
+		cfs_trace_data[i] = NULL;
+	}
+
+	fini_rwsem(&cfs_tracefile_sem);
+}
+
+/* Take cfs_tracefile_sem for reading. */
+void cfs_tracefile_read_lock(void)
+{
+	down_read(&cfs_tracefile_sem);
+}
+
+/* Drop a read hold on cfs_tracefile_sem. */
+void cfs_tracefile_read_unlock(void)
+{
+	up_read(&cfs_tracefile_sem);
+}
+
+/* Take cfs_tracefile_sem for writing. */
+void cfs_tracefile_write_lock(void)
+{
+	down_write(&cfs_tracefile_sem);
+}
+
+/* Drop the write hold on cfs_tracefile_sem. */
+void cfs_tracefile_write_unlock(void)
+{
+	up_write(&cfs_tracefile_sem);
+}
+
+/*
+ * Pick the trace buffer type for the current execution context:
+ * hard-irq, softirq, or process context otherwise.
+ */
+cfs_trace_buf_type_t cfs_trace_buf_idx_get(void)
+{
+	if (in_irq())
+		return CFS_TCD_TYPE_IRQ;
+
+	if (in_softirq())
+		return CFS_TCD_TYPE_SOFTIRQ;
+
+	return CFS_TCD_TYPE_PROC;
+}
+
+/*
+ * Lock a tcd with the primitive that matches its type: irqsave for IRQ
+ * buffers, _bh for softirq buffers, a plain spin_lock() otherwise.
+ *
+ * A non-zero 'walking' means the caller is iterating over the tcds of
+ * all types; in that case local irqs are disabled as well, to avoid
+ * deadlocks with other interrupt locks that might be taken meanwhile.
+ * See LU-1311 for details.
+ *
+ * Always returns 1.
+ */
+int cfs_trace_lock_tcd(struct cfs_trace_cpu_data *tcd, int walking)
+{
+	__LASSERT(tcd->tcd_type < CFS_TCD_TYPE_MAX);
+
+	switch (tcd->tcd_type) {
+	case CFS_TCD_TYPE_IRQ:
+		spin_lock_irqsave(&tcd->tcd_lock, tcd->tcd_lock_flags);
+		break;
+	case CFS_TCD_TYPE_SOFTIRQ:
+		spin_lock_bh(&tcd->tcd_lock);
+		break;
+	default:
+		if (unlikely(walking))
+			spin_lock_irq(&tcd->tcd_lock);
+		else
+			spin_lock(&tcd->tcd_lock);
+		break;
+	}
+
+	return 1;
+}
+
+/*
+ * Undo cfs_trace_lock_tcd(): release the tcd lock with the primitive
+ * matching the tcd type and the same 'walking' value used when locking.
+ */
+void cfs_trace_unlock_tcd(struct cfs_trace_cpu_data *tcd, int walking)
+{
+	__LASSERT(tcd->tcd_type < CFS_TCD_TYPE_MAX);
+
+	switch (tcd->tcd_type) {
+	case CFS_TCD_TYPE_IRQ:
+		spin_unlock_irqrestore(&tcd->tcd_lock, tcd->tcd_lock_flags);
+		break;
+	case CFS_TCD_TYPE_SOFTIRQ:
+		spin_unlock_bh(&tcd->tcd_lock);
+		break;
+	default:
+		if (unlikely(walking))
+			spin_unlock_irq(&tcd->tcd_lock);
+		else
+			spin_unlock(&tcd->tcd_lock);
+		break;
+	}
+}
+
+/*
+ * Return non-zero when trace page 'tage' is recorded against the same
+ * CPU as 'tcd'.
+ */
+int cfs_tcd_owns_tage(struct cfs_trace_cpu_data *tcd,
+		      struct cfs_trace_page *tage)
+{
+	/*
+	 * XXX nikita: do NOT call portals_debug_msg() (CDEBUG/ENTRY/EXIT)
+	 * from here: this will lead to infinite recursion.
+	 */
+	if (tcd->tcd_cpu == tage->cpu)
+		return 1;
+
+	return 0;
+}
+
+/*
+ * Fill in a debug message header from the message descriptor 'msgdata',
+ * the caller-supplied 'stack' value, the current time, and the current
+ * CPU, pid and trace buffer type.  ph_extern_pid is always set to 0.
+ */
+void
+cfs_set_ptldebug_header(struct ptldebug_header *header,
+			struct libcfs_debug_msg_data *msgdata,
+			unsigned long stack)
+{
+	struct timeval now;
+
+	do_gettimeofday(&now);
+
+	/* taken from the message descriptor */
+	header->ph_subsys = msgdata->msg_subsys;
+	header->ph_mask = msgdata->msg_mask;
+	header->ph_line_num = msgdata->msg_line;
+
+	/* where and when the message is being logged */
+	header->ph_cpu_id = smp_processor_id();
+	header->ph_type = cfs_trace_buf_idx_get();
+	header->ph_sec = (__u32)now.tv_sec;
+	header->ph_usec = now.tv_usec;
+	header->ph_stack = stack;
+	header->ph_pid = current->pid;
+	header->ph_extern_pid = 0;
+}
+
+/*
+ * Console prefix for error-level messages: "LNetError" for the S_LND and
+ * S_LNET subsystems, "LustreError" for everything else.
+ */
+static char *
+dbghdr_to_err_string(struct ptldebug_header *hdr)
+{
+	if (hdr->ph_subsys == S_LND || hdr->ph_subsys == S_LNET)
+		return "LNetError";
+
+	return "LustreError";
+}
+
+/*
+ * Console prefix for non-error messages: "LNet" for the S_LND and S_LNET
+ * subsystems, "Lustre" for everything else.
+ */
+static char *
+dbghdr_to_info_string(struct ptldebug_header *hdr)
+{
+	if (hdr->ph_subsys == S_LND || hdr->ph_subsys == S_LNET)
+		return "LNet";
+
+	return "Lustre";
+}
+
+/*
+ * Print the first 'len' bytes of 'buf' to the kernel log.
+ *
+ * The log level and prefix are chosen from the highest-priority bit set
+ * in 'mask' (D_EMERG, D_ERROR, D_WARNING, then D_CONSOLE/libcfs_printk).
+ * If none of those bits is set the message goes out at KERN_INFO with the
+ * "Lustre" prefix.  D_CONSOLE messages are printed bare; all others also
+ * carry pid, extern pid, file, line and function.
+ */
+void cfs_print_to_console(struct ptldebug_header *hdr, int mask,
+			  const char *buf, int len, const char *file,
+			  const char *fn)
+{
+	const char *prefix = "Lustre";
+	/* never left NULL: it is passed to a %s conversion below */
+	const char *ptype = KERN_INFO;
+
+	if ((mask & D_EMERG) != 0) {
+		prefix = dbghdr_to_err_string(hdr);
+		ptype = KERN_EMERG;
+	} else if ((mask & D_ERROR) != 0) {
+		prefix = dbghdr_to_err_string(hdr);
+		ptype = KERN_ERR;
+	} else if ((mask & D_WARNING) != 0) {
+		prefix = dbghdr_to_info_string(hdr);
+		ptype = KERN_WARNING;
+	} else if ((mask & (D_CONSOLE | libcfs_printk)) != 0) {
+		prefix = dbghdr_to_info_string(hdr);
+		ptype = KERN_INFO;
+	}
+
+	if ((mask & D_CONSOLE) != 0) {
+		printk("%s%s: %.*s", ptype, prefix, len, buf);
+	} else {
+		printk("%s%s: %d:%d:(%s:%d:%s()) %.*s", ptype, prefix,
+		       hdr->ph_pid, hdr->ph_extern_pid, file, hdr->ph_line_num,
+		       fn, len, buf);
+	}
+}
+
+/*
+ * Upper limit for the debug buffer size in MB: 80% of num_physpages
+ * expressed in MB, but never less than 512.
+ */
+int cfs_trace_max_debug_mb(void)
+{
+	int  total_mb = (num_physpages >> (20 - PAGE_SHIFT));
+	int  share_mb = (total_mb * 80)/100;
+
+	return MAX(512, share_mb);
+}

diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-tracefile.h b/drivers/staging/lustre/lustre/libcfs/linux/linux-tracefile.h
new file mode 100644
index 0000000..ba84e4f
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/linux/linux-tracefile.h

@@ -0,0 +1,48 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LIBCFS_LINUX_TRACEFILE_H__
+#define __LIBCFS_LINUX_TRACEFILE_H__
+
+/**
+ * The three kinds of trace_data buffer used on Linux, one per execution
+ * context.  CFS_TCD_TYPE_MAX is the number of kinds, not a kind itself.
+ */
+typedef enum {
+	CFS_TCD_TYPE_PROC    = 0,
+	CFS_TCD_TYPE_SOFTIRQ = 1,
+	CFS_TCD_TYPE_IRQ     = 2,
+	CFS_TCD_TYPE_MAX     = 3
+} cfs_trace_buf_type_t;
+
+#endif

diff --git a/drivers/staging/lustre/lustre/libcfs/lwt.c b/drivers/staging/lustre/lustre/libcfs/lwt.c
new file mode 100644
index 0000000..b631f7d
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/lwt.c

@@ -0,0 +1,266 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/libcfs/lwt.c
+ *
+ * Author: Eric Barton <eeb@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/libcfs/libcfs.h>
+
+#if LWT_SUPPORT
+
+#if !KLWT_SUPPORT
+int	 lwt_enabled;
+lwt_cpu_t   lwt_cpus[NR_CPUS];
+#endif
+
+int	 lwt_pages_per_cpu;
+
+/* NB only root is allowed to retrieve LWT info; it's an open door into the
+ * kernel... */
+
+/*
+ * Turn a kernel pointer taken from an LWT snapshot into a string for the
+ * caller.
+ *
+ * At most 128 bytes are used, or 'user_size' bytes if that is positive
+ * and smaller.  *size is set to the number of bytes used, including the
+ * terminator.  If 'user_ptr' is non-NULL the bytes are copied there; a
+ * truncated string has its last four bytes replaced by "..." and a NUL.
+ *
+ * Returns 0, -EPERM without CFS_CAP_SYS_ADMIN, -EINVAL if 'user_ptr' is
+ * given with 'user_size' below 4, or -EFAULT if a copy to user space
+ * fails.
+ */
+int
+lwt_lookup_string (int *size, char *knl_ptr,
+		   char *user_ptr, int user_size)
+{
+	int   maxsize = 128;
+
+	/* knl_ptr was retrieved from an LWT snapshot and the caller wants to
+	 * turn it into a string.  NB we can crash with an access violation
+	 * trying to determine the string length, so we're trusting our
+	 * caller... */
+
+	if (!cfs_capable(CFS_CAP_SYS_ADMIN))
+		return (-EPERM);
+
+	if (user_size > 0 &&
+	    maxsize > user_size)
+		maxsize = user_size;
+
+	*size = strnlen (knl_ptr, maxsize - 1) + 1;
+
+	if (user_ptr != NULL) {
+		if (user_size < 4)
+			return (-EINVAL);
+
+		if (copy_to_user (user_ptr, knl_ptr, *size))
+			return (-EFAULT);
+
+		/* Did I truncate the string?  If so the marker is what
+		 * terminates it, so a failed copy must be reported too */
+		if (knl_ptr[*size - 1] != 0 &&
+		    copy_to_user (user_ptr + *size - 4, "...", 4))
+			return (-EFAULT);
+	}
+
+	return (0);
+}
+
+/*
+ * Enable or disable LWT tracing and optionally clear the event pages.
+ *
+ * With 'enable' zero, tracing is switched off first and the caller sleeps
+ * for 10 jiffies so that tracers in flight can finish.  With 'clear'
+ * non-zero every event page of every online CPU is zeroed.  With 'enable'
+ * non-zero tracing is switched on at the end.
+ *
+ * Returns 0, -EPERM without CFS_CAP_SYS_ADMIN, or -ENODATA if a CPU has
+ * no event pages.
+ */
+int
+lwt_control (int enable, int clear)
+{
+	lwt_page_t  *p;
+	int	  i;
+	int	  j;
+
+	if (!cfs_capable(CFS_CAP_SYS_ADMIN))
+		return (-EPERM);
+
+	if (!enable) {
+		LWT_EVENT(0,0,0,0);
+		lwt_enabled = 0;
+		mb();
+		/* give people some time to stop adding traces; a bare
+		 * schedule_timeout() returns at once unless the task state
+		 * was set first, so use the variant that sets it */
+		schedule_timeout_uninterruptible(10);
+	}
+
+	for (i = 0; i < num_online_cpus(); i++) {
+		p = lwt_cpus[i].lwtc_current_page;
+
+		if (p == NULL)
+			return (-ENODATA);
+
+		if (!clear)
+			continue;
+
+		/* walk this CPU's ring of pages once, zeroing each */
+		for (j = 0; j < lwt_pages_per_cpu; j++) {
+			memset (p->lwtp_events, 0, PAGE_CACHE_SIZE);
+
+			p = list_entry (p->lwtp_list.next,
+					    lwt_page_t, lwtp_list);
+		}
+	}
+
+	if (enable) {
+		lwt_enabled = 1;
+		mb();
+		LWT_EVENT(0,0,0,0);
+	}
+
+	return (0);
+}
+
+/*
+ * Report the LWT geometry and optionally copy all event pages to user
+ * space.
+ *
+ * *now receives get_cycles(), *ncpu the number of online CPUs and
+ * *total_size the number of bytes a full snapshot occupies.  If
+ * 'user_ptr' is NULL nothing else is done.  Otherwise 'user_size' must be
+ * at least *total_size and every page of every online CPU is copied out
+ * back to back.
+ *
+ * Returns 0, -EPERM without CFS_CAP_SYS_ADMIN, -EINVAL if the user buffer
+ * is too small, -ENODATA if a CPU has no event pages, or -EFAULT if a
+ * copy to user space fails.
+ */
+int
+lwt_snapshot (cfs_cycles_t *now, int *ncpu, int *total_size,
+	      void *user_ptr, int user_size)
+{
+	const int    events_per_page = PAGE_CACHE_SIZE / sizeof(lwt_event_t);
+	const int    bytes_per_page = events_per_page * sizeof(lwt_event_t);
+	lwt_page_t  *p;
+	int	  i;
+	int	  j;
+
+	if (!cfs_capable(CFS_CAP_SYS_ADMIN))
+		return (-EPERM);
+
+	*ncpu = num_online_cpus();
+	*total_size = num_online_cpus() * lwt_pages_per_cpu *
+		bytes_per_page;
+	*now = get_cycles();
+
+	if (user_ptr == NULL)
+		return (0);
+
+	/* the loop below writes *total_size bytes: refuse a buffer the
+	 * caller declared to be smaller than that */
+	if (user_size < *total_size)
+		return (-EINVAL);
+
+	for (i = 0; i < num_online_cpus(); i++) {
+		p = lwt_cpus[i].lwtc_current_page;
+
+		if (p == NULL)
+			return (-ENODATA);
+
+		for (j = 0; j < lwt_pages_per_cpu; j++) {
+			if (copy_to_user(user_ptr, p->lwtp_events,
+					     bytes_per_page))
+				return (-EFAULT);
+
+			user_ptr = ((char *)user_ptr) + bytes_per_page;
+			p = list_entry(p->lwtp_list.next,
+					   lwt_page_t, lwtp_list);
+		}
+	}
+
+	return (0);
+}
+
+/*
+ * Allocate the LWT event pages for every online CPU and enable tracing.
+ *
+ * LWT_MEMORY is divided evenly between the online CPUs; each CPU gets a
+ * circular list of zeroed pages, with lwtc_current_page pointing at the
+ * first page allocated.
+ *
+ * Returns 0, -EALREADY if any CPU already has pages, or -ENOMEM after
+ * undoing the partial setup through lwt_fini().
+ */
+int
+lwt_init (void)
+{
+	int     i;
+	int     j;
+
+	for (i = 0; i < num_online_cpus(); i++)
+		if (lwt_cpus[i].lwtc_current_page != NULL)
+			return (-EALREADY);
+
+	LASSERT (!lwt_enabled);
+
+	/* NULL pointers, zero scalars */
+	memset (lwt_cpus, 0, sizeof (lwt_cpus));
+	lwt_pages_per_cpu =
+		LWT_MEMORY / (num_online_cpus() * PAGE_CACHE_SIZE);
+
+	for (i = 0; i < num_online_cpus(); i++)
+		for (j = 0; j < lwt_pages_per_cpu; j++) {
+			struct page *page = alloc_page (GFP_KERNEL);
+			lwt_page_t  *lwtp;
+
+			if (page == NULL) {
+				CERROR ("Can't allocate page\n");
+				lwt_fini ();
+				return (-ENOMEM);
+			}
+
+			LIBCFS_ALLOC(lwtp, sizeof (*lwtp));
+			if (lwtp == NULL) {
+				CERROR ("Can't allocate lwtp\n");
+				/* not linked in yet, so lwt_fini() cannot
+				 * see this page */
+				__free_page(page);
+				lwt_fini ();
+				return (-ENOMEM);
+			}
+
+			lwtp->lwtp_page = page;
+			lwtp->lwtp_events = page_address(page);
+			memset (lwtp->lwtp_events, 0, PAGE_CACHE_SIZE);
+
+			if (j == 0) {
+				/* first page: a one-element ring */
+				INIT_LIST_HEAD (&lwtp->lwtp_list);
+				lwt_cpus[i].lwtc_current_page = lwtp;
+			} else {
+				list_add (&lwtp->lwtp_list,
+				    &lwt_cpus[i].lwtc_current_page->lwtp_list);
+			}
+		}
+
+	lwt_enabled = 1;
+	mb();
+
+	LWT_EVENT(0,0,0,0);
+
+	return (0);
+}
+
+/*
+ * Disable tracing through lwt_control(0, 0) and free every event page of
+ * every online CPU, leaving each lwtc_current_page NULL.
+ */
+void
+lwt_fini (void)
+{
+	int    i;
+
+	lwt_control(0, 0);
+
+	for (i = 0; i < num_online_cpus(); i++)
+		while (lwt_cpus[i].lwtc_current_page != NULL) {
+			lwt_page_t *lwtp = lwt_cpus[i].lwtc_current_page;
+
+			if (list_empty (&lwtp->lwtp_list)) {
+				/* last page of this CPU's ring */
+				lwt_cpus[i].lwtc_current_page = NULL;
+			} else {
+				/* step to the next page before unlinking */
+				lwt_cpus[i].lwtc_current_page =
+					list_entry (lwtp->lwtp_list.next,
+							lwt_page_t, lwtp_list);
+
+				list_del (&lwtp->lwtp_list);
+			}
+
+			__free_page (lwtp->lwtp_page);
+			LIBCFS_FREE (lwtp, sizeof (*lwtp));
+		}
+}
+
+EXPORT_SYMBOL(lwt_enabled);
+EXPORT_SYMBOL(lwt_cpus);
+
+EXPORT_SYMBOL(lwt_init);
+EXPORT_SYMBOL(lwt_fini);
+EXPORT_SYMBOL(lwt_lookup_string);
+EXPORT_SYMBOL(lwt_control);
+EXPORT_SYMBOL(lwt_snapshot);
+#endif

diff --git a/drivers/staging/lustre/lustre/libcfs/module.c b/drivers/staging/lustre/lustre/libcfs/module.c
new file mode 100644
index 0000000..3372537
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/module.c

@@ -0,0 +1,498 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/libcfs/libcfs_crypto.h>
+#include <linux/lnet/lib-lnet.h>
+#include <linux/lnet/lnet.h>
+#include "tracefile.h"
+
+/*
+ * Free every page held by the memhog tree rooted at
+ * ldu->ldu_memhog_root_page.
+ *
+ * The root page is read as an array of struct page pointers (level 1) and
+ * each level-1 page as a further array of struct page pointers (level 2).
+ * Each array is scanned until its first NULL slot or the end of the page.
+ * ldu_memhog_pages is decremented once per page freed and is asserted to
+ * be zero on exit.
+ */
+void
+kportal_memhog_free (struct libcfs_device_userstate *ldu)
+{
+	struct page **rootp = &ldu->ldu_memhog_root_page;
+	struct page **l1;
+	struct page **l2;
+	int	   i;
+	int	   j;
+
+	if (*rootp != NULL) {
+		l1 = (struct page **)page_address(*rootp);
+
+		for (i = 0;
+		     i < PAGE_CACHE_SIZE/sizeof(struct page *) &&
+		     l1[i] != NULL;
+		     i++) {
+			l2 = (struct page **)page_address(l1[i]);
+
+			/* leaves first, while the level-1 page is still live */
+			for (j = 0;
+			     j < PAGE_CACHE_SIZE/sizeof(struct page *) &&
+			     l2[j] != NULL;
+			     j++) {
+				__free_page(l2[j]);
+				ldu->ldu_memhog_pages--;
+			}
+
+			__free_page(l1[i]);
+			ldu->ldu_memhog_pages--;
+		}
+
+		__free_page(*rootp);
+		ldu->ldu_memhog_pages--;
+
+		*rootp = NULL;
+	}
+
+	LASSERT (ldu->ldu_memhog_pages == 0);
+}
+
+/*
+ * Allocate up to npages pages (counting the index pages themselves) and
+ * hang them off ldu as a two-level tree: a root page of level-1 page
+ * pointers, each level-1 page holding level-2 page pointers.
+ *
+ * Returns 0 on success (or when npages is 0), -EINVAL for a negative
+ * npages, -ENOMEM when alloc_page() fails and -EINTR when
+ * cfs_signal_pending() reports a signal.  On error the pages allocated so
+ * far are left attached to ldu; nothing is freed here.
+ */
+int
+kportal_memhog_alloc (struct libcfs_device_userstate *ldu, int npages, int flags)
+{
+	struct page **l1;
+	struct page **l2;
+	int	   i;
+	int	   j;
+
+	LASSERT (ldu->ldu_memhog_pages == 0);
+	LASSERT (ldu->ldu_memhog_root_page == NULL);
+
+	if (npages < 0)
+		return -EINVAL;
+	if (npages == 0)
+		return 0;
+
+	ldu->ldu_memhog_root_page = alloc_page(flags);
+	if (ldu->ldu_memhog_root_page == NULL)
+		return -ENOMEM;
+	ldu->ldu_memhog_pages++;
+
+	/* zeroed slots double as the end-of-array marker */
+	l1 = (struct page **)page_address(ldu->ldu_memhog_root_page);
+	memset(l1, 0, PAGE_CACHE_SIZE);
+
+	for (i = 0;
+	     ldu->ldu_memhog_pages < npages &&
+	     i < PAGE_CACHE_SIZE/sizeof(struct page *);
+	     i++) {
+
+		if (cfs_signal_pending())
+			return (-EINTR);
+
+		l1[i] = alloc_page(flags);
+		if (l1[i] == NULL)
+			return -ENOMEM;
+		ldu->ldu_memhog_pages++;
+
+		l2 = (struct page **)page_address(l1[i]);
+		memset(l2, 0, PAGE_CACHE_SIZE);
+
+		for (j = 0;
+		     ldu->ldu_memhog_pages < npages &&
+		     j < PAGE_CACHE_SIZE/sizeof(struct page *);
+		     j++) {
+
+			if (cfs_signal_pending())
+				return (-EINTR);
+
+			l2[j] = alloc_page(flags);
+			if (l2[j] == NULL)
+				return (-ENOMEM);
+			ldu->ldu_memhog_pages++;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * called when opening /dev/device
+ *
+ * Takes a reference on this module and allocates the per-open user state,
+ * storing it through args (treated as struct libcfs_device_userstate **).
+ *
+ * Returns -ENODEV if the module reference cannot be taken; in that case no
+ * reference is held and NULL is stored through args.  Otherwise returns 0,
+ * even when the allocation fails: NULL is then stored, which
+ * libcfs_psdev_release() and the MEMHOG ioctl both tolerate.
+ */
+static int libcfs_psdev_open(unsigned long flags, void *args)
+{
+	struct libcfs_device_userstate *ldu;
+	ENTRY;
+
+	/* a failed get must not be reported as a successful open, or the
+	 * matching release would drop a reference that was never taken */
+	if (!try_module_get(THIS_MODULE)) {
+		*(struct libcfs_device_userstate **)args = NULL;
+		RETURN(-ENODEV);
+	}
+
+	LIBCFS_ALLOC(ldu, sizeof(*ldu));
+	if (ldu != NULL) {
+		ldu->ldu_memhog_pages = 0;
+		ldu->ldu_memhog_root_page = NULL;
+	}
+	*(struct libcfs_device_userstate **)args = ldu;
+
+	RETURN(0);
+}
+
+/*
+ * called when closing /dev/device
+ *
+ * args is the per-open user state (may be NULL).  Any memhog pages it
+ * still holds are freed, the state itself is freed, and the module
+ * reference is dropped.  Always returns 0.
+ */
+static int libcfs_psdev_release(unsigned long flags, void *args)
+{
+	struct libcfs_device_userstate *ldu = args;
+	ENTRY;
+
+	if (ldu != NULL) {
+		kportal_memhog_free(ldu);
+		LIBCFS_FREE(ldu, sizeof(*ldu));
+	}
+
+	module_put(THIS_MODULE);
+
+	RETURN(0);
+}
+
+/*
+ * Registered ioctl handlers (struct libcfs_ioctl_handler, linked through
+ * ->item) and the rw-semaphore guarding the list: writers are
+ * libcfs_register_ioctl()/libcfs_deregister_ioctl(), the reader is the
+ * default case of libcfs_ioctl_int().  Both are initialised in
+ * init_libcfs_module().
+ */
+static struct rw_semaphore ioctl_list_sem;
+static struct list_head ioctl_list;
+
+/*
+ * Append hand to the list of ioctl handlers.
+ *
+ * Returns 0 on success, or -EBUSY if hand->item is already linked into a
+ * list (i.e. is not an empty list head).
+ */
+int libcfs_register_ioctl(struct libcfs_ioctl_handler *hand)
+{
+	int status;
+
+	down_write(&ioctl_list_sem);
+	if (list_empty(&hand->item)) {
+		list_add_tail(&hand->item, &ioctl_list);
+		status = 0;
+	} else {
+		status = -EBUSY;
+	}
+	up_write(&ioctl_list_sem);
+
+	return status;
+}
+EXPORT_SYMBOL(libcfs_register_ioctl);
+
+/*
+ * Unlink hand from the list of ioctl handlers and re-initialise its list
+ * node.
+ *
+ * Returns 0 on success, or -ENOENT if hand->item is not linked anywhere.
+ */
+int libcfs_deregister_ioctl(struct libcfs_ioctl_handler *hand)
+{
+	int status;
+
+	down_write(&ioctl_list_sem);
+	if (!list_empty(&hand->item)) {
+		list_del_init(&hand->item);
+		status = 0;
+	} else {
+		status = -ENOENT;
+	}
+	up_write(&ioctl_list_sem);
+
+	return status;
+}
+EXPORT_SYMBOL(libcfs_deregister_ioctl);
+
+/*
+ * Dispatch one already-unpacked ioctl.
+ *
+ * \param pfile  per-open file state; private_data is the memhog user state
+ * \param cmd    ioctl command number
+ * \param arg    original user pointer, used to copy results back
+ * \param data   kernel copy of the ioctl payload
+ *
+ * Commands not handled inline are offered to every registered handler in
+ * turn; the first one that returns something other than -EINVAL wins, and
+ * on success the payload is copied back to the caller.
+ *
+ * Returns 0 on success or a negative errno; -EINVAL if nothing handled
+ * the command.
+ */
+static int libcfs_ioctl_int(struct cfs_psdev_file *pfile,unsigned long cmd,
+			    void *arg, struct libcfs_ioctl_data *data)
+{
+	int err = -EINVAL;
+	ENTRY;
+
+	switch (cmd) {
+	case IOC_LIBCFS_CLEAR_DEBUG:
+		libcfs_debug_clear_buffer();
+		RETURN(0);
+	/*
+	 * case IOC_LIBCFS_PANIC:
+	 * Handled in arch/cfs_module.c
+	 */
+	case IOC_LIBCFS_MARK_DEBUG:
+		/* the mark text must be present, non-empty and
+		 * NUL-terminated; a zero length would index before the
+		 * start of the buffer */
+		if (data->ioc_inlbuf1 == NULL ||
+		    data->ioc_inllen1 == 0 ||
+		    data->ioc_inlbuf1[data->ioc_inllen1 - 1] != '\0')
+			RETURN(-EINVAL);
+		libcfs_debug_mark_buffer(data->ioc_inlbuf1);
+		RETURN(0);
+#if LWT_SUPPORT
+	case IOC_LIBCFS_LWT_CONTROL:
+		err = lwt_control ((data->ioc_flags & 1) != 0,
+				   (data->ioc_flags & 2) != 0);
+		break;
+
+	case IOC_LIBCFS_LWT_SNAPSHOT: {
+		cfs_cycles_t   now;
+		int	    ncpu;
+		int	    total_size;
+
+		err = lwt_snapshot (&now, &ncpu, &total_size,
+				    data->ioc_pbuf1, data->ioc_plen1);
+		data->ioc_u64[0] = now;
+		data->ioc_u32[0] = ncpu;
+		data->ioc_u32[1] = total_size;
+
+		/* Hedge against broken user/kernel typedefs (e.g. cycles_t) */
+		data->ioc_u32[2] = sizeof(lwt_event_t);
+		data->ioc_u32[3] = offsetof(lwt_event_t, lwte_where);
+
+		if (err == 0 &&
+		    libcfs_ioctl_popdata(arg, data, sizeof (*data)))
+			err = -EFAULT;
+		break;
+	}
+
+	case IOC_LIBCFS_LWT_LOOKUP_STRING:
+		err = lwt_lookup_string (&data->ioc_count, data->ioc_pbuf1,
+					 data->ioc_pbuf2, data->ioc_plen2);
+		if (err == 0 &&
+		    libcfs_ioctl_popdata(arg, data, sizeof (*data)))
+			err = -EFAULT;
+		break;
+#endif
+	case IOC_LIBCFS_MEMHOG:
+		if (pfile->private_data == NULL) {
+			err = -EINVAL;
+		} else {
+			/* drop any previous hog before building a new one */
+			kportal_memhog_free(pfile->private_data);
+			/* XXX The ioc_flags is not GFP flags now, need to be fixed */
+			err = kportal_memhog_alloc(pfile->private_data,
+						   data->ioc_count,
+						   data->ioc_flags);
+			/* a failed allocation leaves partial state behind */
+			if (err != 0)
+				kportal_memhog_free(pfile->private_data);
+		}
+		break;
+
+	case IOC_LIBCFS_PING_TEST: {
+		extern void (kping_client)(struct libcfs_ioctl_data *);
+		void (*ping)(struct libcfs_ioctl_data *);
+
+		CDEBUG(D_IOCTL, "doing %d pings to nid %s (%s)\n",
+		       data->ioc_count, libcfs_nid2str(data->ioc_nid),
+		       libcfs_nid2str(data->ioc_nid));
+		ping = symbol_get(kping_client);
+		if (!ping)
+			CERROR("symbol_get failed\n");
+		else {
+			ping(data);
+			symbol_put(kping_client);
+		}
+		RETURN(0);
+	}
+
+	default: {
+		struct libcfs_ioctl_handler *hand;
+		err = -EINVAL;
+		down_read(&ioctl_list_sem);
+		list_for_each_entry(hand, &ioctl_list, item) {
+			err = hand->handle_ioctl(cmd, data);
+			/* -EINVAL means "not mine": try the next handler */
+			if (err != -EINVAL) {
+				if (err == 0)
+					err = libcfs_ioctl_popdata(arg,
+							data, sizeof (*data));
+				break;
+			}
+		}
+		up_read(&ioctl_list_sem);
+		break;
+	}
+	}
+
+	RETURN(err);
+}
+
+/*
+ * ioctl entry point: allocate a 1024-byte scratch buffer, unpack the user
+ * request into it with libcfs_ioctl_getdata(), run libcfs_ioctl_int() on
+ * the result and free the buffer again.
+ *
+ * Returns -ENOMEM if the buffer cannot be allocated, -EINVAL if the
+ * request cannot be unpacked, otherwise the result of libcfs_ioctl_int().
+ */
+static int libcfs_ioctl(struct cfs_psdev_file *pfile, unsigned long cmd, void *arg)
+{
+	struct libcfs_ioctl_data *data;
+	char    *buf;
+	int rc = 0;
+	ENTRY;
+
+	LIBCFS_ALLOC_GFP(buf, 1024, GFP_IOFS);
+	if (buf == NULL)
+		RETURN(-ENOMEM);
+
+	/* 'cmd' and permissions get checked in our arch-specific caller */
+	if (libcfs_ioctl_getdata(buf, buf + 800, arg)) {
+		CERROR("PORTALS ioctl: data error\n");
+		GOTO(out, rc = -EINVAL);
+	}
+
+	/* the unpacked request lives at the start of the scratch buffer */
+	data = (struct libcfs_ioctl_data *)buf;
+	rc = libcfs_ioctl_int(pfile, cmd, arg, data);
+
+out:
+	LIBCFS_FREE(buf, 1024);
+	RETURN(rc);
+}
+
+
+/*
+ * Device operations table, initialised positionally: open handler,
+ * release handler, two unused (NULL) slots, ioctl handler.
+ * NOTE(review): the two NULL slots are presumably the read and write
+ * hooks - confirm against the struct cfs_psdev_ops declaration.
+ */
+struct cfs_psdev_ops libcfs_psdev_ops = {
+	libcfs_psdev_open,
+	libcfs_psdev_release,
+	NULL,
+	NULL,
+	libcfs_ioctl
+};
+
+/* /proc setup and teardown, defined elsewhere in libcfs */
+extern int insert_proc(void);
+extern void remove_proc(void);
+MODULE_AUTHOR("Peter J. Braam <braam@clusterfs.com>");
+MODULE_DESCRIPTION("Portals v3.1");
+MODULE_LICENSE("GPL");
+
+/* objects defined in other libcfs files that module init/exit touches */
+extern psdev_t libcfs_dev;
+extern struct rw_semaphore cfs_tracefile_sem;
+extern struct mutex cfs_trace_thread_mutex;
+extern struct cfs_wi_sched *cfs_sched_rehash;
+
+/* helpers defined in other libcfs files, called from module init/exit */
+extern void libcfs_init_nidstrings(void);
+extern int libcfs_arch_init(void);
+extern void libcfs_arch_cleanup(void);
+
+/*
+ * Module initialisation.
+ *
+ * Sets up locks and lists, then brings up in order: debug logging, the
+ * CPU partition code, (optionally) light-weight tracing, the misc
+ * device, the workitem subsystem, the "cfs_rh" rehash scheduler, the
+ * crypto helpers and the /proc entries.
+ *
+ * On failure everything that was already brought up is torn down again
+ * in reverse order and the negative error code is returned; on success
+ * returns 0.
+ */
+static int init_libcfs_module(void)
+{
+	int rc;
+
+	libcfs_arch_init();
+	libcfs_init_nidstrings();
+	init_rwsem(&cfs_tracefile_sem);
+	mutex_init(&cfs_trace_thread_mutex);
+	init_rwsem(&ioctl_list_sem);
+	INIT_LIST_HEAD(&ioctl_list);
+	init_waitqueue_head(&cfs_race_waitq);
+
+	rc = libcfs_debug_init(5 * 1024 * 1024);
+	if (rc < 0) {
+		printk(KERN_ERR "LustreError: libcfs_debug_init: %d\n", rc);
+		return (rc);
+	}
+
+	rc = cfs_cpu_init();
+	if (rc != 0)
+		goto cleanup_debug;
+
+#if LWT_SUPPORT
+	rc = lwt_init();
+	if (rc != 0) {
+		CERROR("lwt_init: error %d\n", rc);
+		goto cleanup_cpu;
+	}
+#endif
+	rc = misc_register(&libcfs_dev);
+	if (rc) {
+		CERROR("misc_register: error %d\n", rc);
+		goto cleanup_lwt;
+	}
+
+	rc = cfs_wi_startup();
+	if (rc) {
+		CERROR("initialize workitem: error %d\n", rc);
+		goto cleanup_deregister;
+	}
+
+	/* max to 4 threads, should be enough for rehash */
+	rc = min(cfs_cpt_weight(cfs_cpt_table, CFS_CPT_ANY), 4);
+	rc = cfs_wi_sched_create("cfs_rh", cfs_cpt_table, CFS_CPT_ANY,
+				 rc, &cfs_sched_rehash);
+	if (rc != 0) {
+		CERROR("Startup workitem scheduler: error: %d\n", rc);
+		/* the workitem subsystem is already up: shut it down too */
+		goto cleanup_wi;
+	}
+
+	rc = cfs_crypto_register();
+	if (rc) {
+		CERROR("cfs_crypto_regster: error %d\n", rc);
+		goto cleanup_sched;
+	}
+
+
+	rc = insert_proc();
+	if (rc) {
+		CERROR("insert_proc: error %d\n", rc);
+		goto cleanup_crypto;
+	}
+
+	CDEBUG (D_OTHER, "portals setup OK\n");
+	return 0;
+ cleanup_crypto:
+	cfs_crypto_unregister();
+ cleanup_sched:
+	/* same teardown as exit_libcfs_module() */
+	if (cfs_sched_rehash != NULL) {
+		cfs_wi_sched_destroy(cfs_sched_rehash);
+		cfs_sched_rehash = NULL;
+	}
+ cleanup_wi:
+	cfs_wi_shutdown();
+ cleanup_deregister:
+	misc_deregister(&libcfs_dev);
+ cleanup_lwt:
+#if LWT_SUPPORT
+	lwt_fini();
+ cleanup_cpu:
+#endif
+	/* undo cfs_cpu_init(); every path reaching here got past it */
+	cfs_cpu_fini();
+ cleanup_debug:
+	libcfs_debug_cleanup();
+	return rc;
+}
+
+/*
+ * Module teardown: removes the /proc entries, destroys the rehash
+ * scheduler, unregisters crypto, shuts down the workitem subsystem,
+ * deregisters the misc device, finalises tracing (if built) and the CPU
+ * code, reports any outstanding libcfs_kmemory, then cleans up debug
+ * logging, the two rw-semaphores and the arch layer.
+ */
+static void exit_libcfs_module(void)
+{
+	int rc;
+
+	remove_proc();
+
+	CDEBUG(D_MALLOC, "before Portals cleanup: kmem %d\n",
+	       atomic_read(&libcfs_kmemory));
+
+	if (cfs_sched_rehash != NULL) {
+		cfs_wi_sched_destroy(cfs_sched_rehash);
+		cfs_sched_rehash = NULL;
+	}
+
+	cfs_crypto_unregister();
+	cfs_wi_shutdown();
+
+	rc = misc_deregister(&libcfs_dev);
+	if (rc)
+		CERROR("misc_deregister error %d\n", rc);
+
+#if LWT_SUPPORT
+	lwt_fini();
+#endif
+	cfs_cpu_fini();
+
+	/* a non-zero counter here is reported as a leak */
+	if (atomic_read(&libcfs_kmemory) != 0)
+		CERROR("Portals memory leaked: %d bytes\n",
+		       atomic_read(&libcfs_kmemory));
+
+	/* printk rather than CERROR for the result of tearing down the
+	 * debug subsystem itself */
+	rc = libcfs_debug_cleanup();
+	if (rc)
+		printk(KERN_ERR "LustreError: libcfs_debug_cleanup: %d\n",
+		       rc);
+
+	fini_rwsem(&ioctl_list_sem);
+	fini_rwsem(&cfs_tracefile_sem);
+
+	libcfs_arch_cleanup();
+}
+
+/* register the init/exit handlers for the "libcfs" module, version "1.0.0" */
+cfs_module(libcfs, "1.0.0", init_libcfs_module, exit_libcfs_module);

diff --git a/drivers/staging/lustre/lustre/libcfs/nidstrings.c b/drivers/staging/lustre/lustre/libcfs/nidstrings.c
new file mode 100644
index 0000000..ccfd107
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/nidstrings.c

@@ -0,0 +1,867 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/libcfs/nidstrings.c
+ *
+ * Author: Phil Schwan <phil@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/lnet.h>
+
+/* CAVEAT VENDITOR! Keep the canonical string representation of nets/nids
+ * consistent in all conversion functions.  Some code fragments are copied
+ * around for the sake of clarity...
+ */
+
+/* CAVEAT EMPTOR! Racy temporary buffer allocation!
+ * Choose the number of nidstrings to support the MAXIMUM expected number of
+ * concurrent users.  If there are more, the returned string will be volatile.
+ * NB this number must allow for a process to be descheduled for a timeslice
+ * between getting its string and using it.
+ */
+
+/* pool of LNET_NIDSTR_COUNT string buffers, handed out round-robin */
+static char      libcfs_nidstrings[LNET_NIDSTR_COUNT][LNET_NIDSTR_SIZE];
+/* index of the next buffer libcfs_next_nidstring() will hand out */
+static int       libcfs_nidstring_idx = 0;
+
+/* protects libcfs_nidstring_idx */
+static spinlock_t libcfs_nidstring_lock;
+
+/* Initialise the spinlock that guards the nidstring buffer index. */
+void libcfs_init_nidstrings (void)
+{
+	spin_lock_init(&libcfs_nidstring_lock);
+}
+
+/* take/release libcfs_nidstring_lock, saving/restoring IRQ state in f */
+# define NIDSTR_LOCK(f)   spin_lock_irqsave(&libcfs_nidstring_lock, f)
+# define NIDSTR_UNLOCK(f) spin_unlock_irqrestore(&libcfs_nidstring_lock, f)
+
+/*
+ * Hand out the next buffer from the libcfs_nidstrings pool, advancing the
+ * index and wrapping it to 0 after the last buffer.  The buffer is not
+ * reserved: it is handed out again once the index comes back round.
+ */
+static char *
+libcfs_next_nidstring (void)
+{
+	unsigned long  flags;
+	char	  *slot;
+
+	NIDSTR_LOCK(flags);
+
+	slot = libcfs_nidstrings[libcfs_nidstring_idx];
+	libcfs_nidstring_idx++;
+	if (libcfs_nidstring_idx ==
+	    sizeof(libcfs_nidstrings)/sizeof(libcfs_nidstrings[0]))
+		libcfs_nidstring_idx = 0;
+
+	NIDSTR_UNLOCK(flags);
+
+	return slot;
+}
+
+/* address helpers referenced by the libcfs_netstrfns table below */
+static int  libcfs_lo_str2addr(const char *str, int nob, __u32 *addr);
+static void libcfs_ip_addr2str(__u32 addr, char *str);
+static int  libcfs_ip_str2addr(const char *str, int nob, __u32 *addr);
+static void libcfs_decnum_addr2str(__u32 addr, char *str);
+static void libcfs_hexnum_addr2str(__u32 addr, char *str);
+static int  libcfs_num_str2addr(const char *str, int nob, __u32 *addr);
+static int  libcfs_num_parse(char *str, int len, struct list_head *list);
+static int  libcfs_num_match(__u32 addr, struct list_head *list);
+
+/*
+ * Per-network-type entry of the libcfs_netstrfns table: the type's names
+ * plus the functions that format, parse and match its addresses.
+ */
+struct netstrfns {
+	/* LND type constant; the table's placeholder entry uses -1 */
+	int	  nf_type;
+	/* network name as it appears in NID strings, e.g. "tcp" */
+	char	*nf_name;
+	/* module name returned by libcfs_lnd2modname() */
+	char	*nf_modname;
+	/* write the text form of addr into str */
+	void       (*nf_addr2str)(__u32 addr, char *str);
+	/* parse the first nob bytes of str into *addr; non-zero on success */
+	int	(*nf_str2addr)(const char *str, int nob, __u32 *addr);
+	/* parse an address range expression and add the result to list */
+	int	(*nf_parse_addrlist)(char *str, int len,
+					struct list_head *list);
+	/* non-zero if addr matches the parsed expressions on list */
+	int	(*nf_match_addr)(__u32 addr, struct list_head *list);
+};
+
+/*
+ * Table of known network types.  Entries are searched in order by the
+ * lookup helpers in this file.
+ */
+static struct netstrfns  libcfs_netstrfns[] = {
+	{
+		.nf_type		= LOLND,
+		.nf_name		= "lo",
+		.nf_modname		= "klolnd",
+		.nf_addr2str		= libcfs_decnum_addr2str,
+		.nf_str2addr		= libcfs_lo_str2addr,
+		.nf_parse_addrlist	= libcfs_num_parse,
+		.nf_match_addr		= libcfs_num_match,
+	},
+	{
+		.nf_type		= SOCKLND,
+		.nf_name		= "tcp",
+		.nf_modname		= "ksocklnd",
+		.nf_addr2str		= libcfs_ip_addr2str,
+		.nf_str2addr		= libcfs_ip_str2addr,
+		.nf_parse_addrlist	= cfs_ip_addr_parse,
+		.nf_match_addr		= cfs_ip_addr_match,
+	},
+	{
+		.nf_type		= O2IBLND,
+		.nf_name		= "o2ib",
+		.nf_modname		= "ko2iblnd",
+		.nf_addr2str		= libcfs_ip_addr2str,
+		.nf_str2addr		= libcfs_ip_str2addr,
+		.nf_parse_addrlist	= cfs_ip_addr_parse,
+		.nf_match_addr		= cfs_ip_addr_match,
+	},
+	{
+		.nf_type		= CIBLND,
+		.nf_name		= "cib",
+		.nf_modname		= "kciblnd",
+		.nf_addr2str		= libcfs_ip_addr2str,
+		.nf_str2addr		= libcfs_ip_str2addr,
+		.nf_parse_addrlist	= cfs_ip_addr_parse,
+		.nf_match_addr		= cfs_ip_addr_match,
+	},
+	{
+		.nf_type		= OPENIBLND,
+		.nf_name		= "openib",
+		.nf_modname		= "kopeniblnd",
+		.nf_addr2str		= libcfs_ip_addr2str,
+		.nf_str2addr		= libcfs_ip_str2addr,
+		.nf_parse_addrlist	= cfs_ip_addr_parse,
+		.nf_match_addr		= cfs_ip_addr_match,
+	},
+	{
+		.nf_type		= IIBLND,
+		.nf_name		= "iib",
+		.nf_modname		= "kiiblnd",
+		.nf_addr2str		= libcfs_ip_addr2str,
+		.nf_str2addr		= libcfs_ip_str2addr,
+		.nf_parse_addrlist	= cfs_ip_addr_parse,
+		.nf_match_addr		= cfs_ip_addr_match,
+	},
+	{
+		.nf_type		= VIBLND,
+		.nf_name		= "vib",
+		.nf_modname		= "kviblnd",
+		.nf_addr2str		= libcfs_ip_addr2str,
+		.nf_str2addr		= libcfs_ip_str2addr,
+		.nf_parse_addrlist	= cfs_ip_addr_parse,
+		.nf_match_addr		= cfs_ip_addr_match,
+	},
+	{
+		.nf_type		= RALND,
+		.nf_name		= "ra",
+		.nf_modname		= "kralnd",
+		.nf_addr2str		= libcfs_ip_addr2str,
+		.nf_str2addr		= libcfs_ip_str2addr,
+		.nf_parse_addrlist	= cfs_ip_addr_parse,
+		.nf_match_addr		= cfs_ip_addr_match,
+	},
+	{
+		.nf_type		= QSWLND,
+		.nf_name		= "elan",
+		.nf_modname		= "kqswlnd",
+		.nf_addr2str		= libcfs_decnum_addr2str,
+		.nf_str2addr		= libcfs_num_str2addr,
+		.nf_parse_addrlist	= libcfs_num_parse,
+		.nf_match_addr		= libcfs_num_match,
+	},
+	{
+		.nf_type		= GMLND,
+		.nf_name		= "gm",
+		.nf_modname		= "kgmlnd",
+		.nf_addr2str		= libcfs_hexnum_addr2str,
+		.nf_str2addr		= libcfs_num_str2addr,
+		.nf_parse_addrlist	= libcfs_num_parse,
+		.nf_match_addr		= libcfs_num_match,
+	},
+	{
+		.nf_type		= MXLND,
+		.nf_name		= "mx",
+		.nf_modname		= "kmxlnd",
+		.nf_addr2str		= libcfs_ip_addr2str,
+		.nf_str2addr		= libcfs_ip_str2addr,
+		.nf_parse_addrlist	= cfs_ip_addr_parse,
+		.nf_match_addr		= cfs_ip_addr_match,
+	},
+	{
+		.nf_type		= PTLLND,
+		.nf_name		= "ptl",
+		.nf_modname		= "kptllnd",
+		.nf_addr2str		= libcfs_decnum_addr2str,
+		.nf_str2addr		= libcfs_num_str2addr,
+		.nf_parse_addrlist	= libcfs_num_parse,
+		.nf_match_addr		= libcfs_num_match,
+	},
+	{
+		.nf_type		= GNILND,
+		.nf_name		= "gni",
+		.nf_modname		= "kgnilnd",
+		.nf_addr2str		= libcfs_decnum_addr2str,
+		.nf_str2addr		= libcfs_num_str2addr,
+		.nf_parse_addrlist	= libcfs_num_parse,
+		.nf_match_addr		= libcfs_num_match,
+	},
+	/* placeholder for net0 alias.  It MUST BE THE LAST ENTRY */
+	{
+		.nf_type		= -1,
+	},
+};
+
+/* number of entries in libcfs_netstrfns, placeholder included */
+const int libcfs_nnetstrfns = sizeof(libcfs_netstrfns)/sizeof(libcfs_netstrfns[0]);
+
+/*
+ * nf_str2addr method for the loopback network: str and nob are ignored,
+ * *addr is always set to 0 and 1 (success) is always returned.
+ */
+int
+libcfs_lo_str2addr(const char *str, int nob, __u32 *addr)
+{
+	*addr = 0;
+	return 1;
+}
+
+/*
+ * Write addr into str as a dotted quad, most significant byte first.
+ * At most LNET_NIDSTR_SIZE bytes are written.
+ */
+void
+libcfs_ip_addr2str(__u32 addr, char *str)
+{
+	__u32 octet1 = (addr >> 24) & 0xff;
+	__u32 octet2 = (addr >> 16) & 0xff;
+	__u32 octet3 = (addr >> 8) & 0xff;
+	__u32 octet4 = addr & 0xff;
+
+	snprintf(str, LNET_NIDSTR_SIZE, "%u.%u.%u.%u",
+		 octet1, octet2, octet3, octet4);
+}
+
+/* CAVEAT EMPTOR XscanfX
+ * I use "%n" at the end of a sscanf format to detect trailing junk.  However
+ * sscanf may return immediately if it sees the terminating '\0' in a string,
+ * so I initialise the %n variable to the expected length.  If sscanf sets it;
+ * fine, if it doesn't, then the scan ended at the end of the string, which is
+ * fine too :) */
+
+/*
+ * Parse a dotted-quad IPv4 address occupying the first nob bytes of str.
+ *
+ * On success stores the address in *addr (first octet in the most
+ * significant byte) and returns 1.  Returns 0, leaving *addr untouched, if
+ * the text is not four numbers in the range 0..255 or is followed by
+ * trailing junk.
+ */
+int
+libcfs_ip_str2addr(const char *str, int nob, __u32 *addr)
+{
+	/* unsigned to match the %u conversions and to keep the shifts
+	 * below out of signed-overflow territory */
+	unsigned int   a;
+	unsigned int   b;
+	unsigned int   c;
+	unsigned int   d;
+	int   n = nob;			  /* XscanfX */
+
+	/* numeric IP? */
+	if (sscanf(str, "%u.%u.%u.%u%n", &a, &b, &c, &d, &n) >= 4 &&
+	    n == nob &&
+	    (a & ~0xff) == 0 && (b & ~0xff) == 0 &&
+	    (c & ~0xff) == 0 && (d & ~0xff) == 0) {
+		*addr = ((a<<24)|(b<<16)|(c<<8)|d);
+		return 1;
+	}
+
+	return 0;
+}
+
+/* Write addr into str as an unsigned decimal number (at most
+ * LNET_NIDSTR_SIZE bytes). */
+void
+libcfs_decnum_addr2str(__u32 addr, char *str)
+{
+	snprintf(str, LNET_NIDSTR_SIZE, "%u", addr);
+}
+
+/* Write addr into str as "0x"-prefixed lower-case hexadecimal (at most
+ * LNET_NIDSTR_SIZE bytes). */
+void
+libcfs_hexnum_addr2str(__u32 addr, char *str)
+{
+	snprintf(str, LNET_NIDSTR_SIZE, "0x%x", addr);
+}
+
+/*
+ * Parse a numeric address occupying the first nob bytes of str.  Three
+ * forms are tried in turn: "0x" hex, "0X" hex, then plain decimal.
+ * Returns 1 with the value in *addr on the first form that consumes
+ * exactly nob bytes, otherwise 0.  *addr may have been written to even
+ * when 0 is returned.
+ */
+int
+libcfs_num_str2addr(const char *str, int nob, __u32 *addr)
+{
+	/* %n is preset to nob before every attempt; see XscanfX */
+	int     n = nob;
+
+	if (sscanf(str, "0x%x%n", addr, &n) >= 1 && n == nob)
+		return 1;
+
+	n = nob;
+	if (sscanf(str, "0X%x%n", addr, &n) >= 1 && n == nob)
+		return 1;
+
+	n = nob;
+	return (sscanf(str, "%u%n", addr, &n) >= 1 && n == nob) ? 1 : 0;
+}
+
+/*
+ * Find the table entry whose nf_type equals lnd.  Negative values never
+ * match, so the -1 placeholder entry cannot be returned.  Returns NULL
+ * when there is no such entry.
+ */
+struct netstrfns *
+libcfs_lnd2netstrfns(int lnd)
+{
+	int    idx;
+
+	if (lnd < 0)
+		return NULL;
+
+	for (idx = 0; idx < libcfs_nnetstrfns; idx++) {
+		if (libcfs_netstrfns[idx].nf_type == lnd)
+			return &libcfs_netstrfns[idx];
+	}
+
+	return NULL;
+}
+
+/*
+ * Find the first table entry whose network name is a prefix of name, so
+ * that "<netname><number>" strings match as well.  Entries with a
+ * negative nf_type are skipped.  Returns NULL when nothing matches.
+ */
+struct netstrfns *
+libcfs_namenum2netstrfns(const char *name)
+{
+	int	       idx;
+
+	for (idx = 0; idx < libcfs_nnetstrfns; idx++) {
+		struct netstrfns *cand = &libcfs_netstrfns[idx];
+
+		if (cand->nf_type < 0)
+			continue;
+		if (strncmp(name, cand->nf_name, strlen(cand->nf_name)) == 0)
+			return cand;
+	}
+
+	return NULL;
+}
+
+/*
+ * Find the table entry whose network name equals name exactly.  Entries
+ * with a negative nf_type are skipped.  Returns NULL when nothing
+ * matches.
+ */
+struct netstrfns *
+libcfs_name2netstrfns(const char *name)
+{
+	int    idx;
+
+	for (idx = 0; idx < libcfs_nnetstrfns; idx++) {
+		struct netstrfns *cand = &libcfs_netstrfns[idx];
+
+		if (cand->nf_type >= 0 && strcmp(cand->nf_name, name) == 0)
+			return cand;
+	}
+
+	return NULL;
+}
+
+/* Returns 1 if type has an entry in the network table, 0 otherwise. */
+int
+libcfs_isknown_lnd(int type)
+{
+	struct netstrfns *nf = libcfs_lnd2netstrfns(type);
+
+	return nf != NULL;
+}
+
+/* Returns the module name recorded for lnd, or NULL if lnd is unknown. */
+char *
+libcfs_lnd2modname(int lnd)
+{
+	struct netstrfns *nf = libcfs_lnd2netstrfns(lnd);
+
+	if (nf == NULL)
+		return NULL;
+
+	return nf->nf_modname;
+}
+
+/*
+ * Returns the network name for lnd.  For an unknown lnd the result is
+ * "?<lnd>?" written into one of the shared nidstring buffers.
+ */
+char *
+libcfs_lnd2str(int lnd)
+{
+	struct netstrfns *nf = libcfs_lnd2netstrfns(lnd);
+	char	   *str;
+
+	if (nf == NULL) {
+		str = libcfs_next_nidstring();
+		snprintf(str, LNET_NIDSTR_SIZE, "?%u?", lnd);
+		return str;
+	}
+
+	return nf->nf_name;
+}
+
+/* Returns the LND type whose network name is exactly str, or -1. */
+int
+libcfs_str2lnd(const char *str)
+{
+	struct netstrfns *nf = libcfs_name2netstrfns(str);
+
+	return (nf != NULL) ? nf->nf_type : -1;
+}
+
+/*
+ * Format a network number as text in one of the shared nidstring
+ * buffers: "<name>" when the net number is 0, "<name><num>" otherwise,
+ * or "<type:num>" (in angle brackets) for an unknown network type.
+ */
+char *
+libcfs_net2str(__u32 net)
+{
+	int	       lnd = LNET_NETTYP(net);
+	int	       num = LNET_NETNUM(net);
+	struct netstrfns *nf  = libcfs_lnd2netstrfns(lnd);
+	char	     *str = libcfs_next_nidstring();
+
+	if (nf != NULL) {
+		if (num != 0)
+			snprintf(str, LNET_NIDSTR_SIZE, "%s%u",
+				 nf->nf_name, num);
+		else
+			snprintf(str, LNET_NIDSTR_SIZE, "%s", nf->nf_name);
+	} else {
+		snprintf(str, LNET_NIDSTR_SIZE, "<%u:%u>", lnd, num);
+	}
+
+	return str;
+}
+
+/*
+ * Format a NID as "<addr>@<net>" in one of the shared nidstring buffers.
+ *
+ * LNET_NID_ANY yields the constant string "<?>".  For an unknown network
+ * type the address is printed in hex and the network as "<type:num>" in
+ * angle brackets.  Otherwise the address is formatted by the network's
+ * nf_addr2str method and the network number is appended only when it is
+ * non-zero.
+ */
+char *
+libcfs_nid2str(lnet_nid_t nid)
+{
+	__u32	     addr = LNET_NIDADDR(nid);
+	__u32	     net = LNET_NIDNET(nid);
+	int	       lnd = LNET_NETTYP(net);
+	int	       nnum = LNET_NETNUM(net);
+	struct netstrfns *nf;
+	char	     *str;
+	int	       used;
+
+	if (nid == LNET_NID_ANY)
+		return "<?>";
+
+	nf = libcfs_lnd2netstrfns(lnd);
+	str = libcfs_next_nidstring();
+
+	if (nf == NULL) {
+		snprintf(str, LNET_NIDSTR_SIZE, "%x@<%u:%u>", addr, lnd, nnum);
+		return str;
+	}
+
+	/* address first, then append the network after it */
+	nf->nf_addr2str(addr, str);
+	used = strlen(str);
+	if (nnum != 0)
+		snprintf(str + used, LNET_NIDSTR_SIZE - used, "@%s%u",
+			 nf->nf_name, nnum);
+	else
+		snprintf(str + used, LNET_NIDSTR_SIZE - used, "@%s",
+			 nf->nf_name);
+
+	return str;
+}
+
+/*
+ * Parse a network string of the form "<netname>" or "<netname><number>".
+ *
+ * The first table entry whose name is a prefix of str is used.  A
+ * trailing decimal number is taken as the network number (0 if absent);
+ * the "lo" network does not accept one.  Anything left over after the
+ * number is an error.
+ *
+ * On success stores the network in *net and returns the matching table
+ * entry; on failure returns NULL and leaves *net untouched.
+ */
+static struct netstrfns *
+libcfs_str2net_internal(const char *str, __u32 *net)
+{
+	struct netstrfns *nf = NULL;
+	int	       nob;
+	/* unsigned to match the %u conversion below */
+	unsigned int      netnum;
+	int	       len;
+	int	       i;
+
+	for (i = 0; i < libcfs_nnetstrfns; i++) {
+		nf = &libcfs_netstrfns[i];
+		if (nf->nf_type >= 0 &&
+		    !strncmp(str, nf->nf_name, strlen(nf->nf_name)))
+			break;
+	}
+
+	if (i == libcfs_nnetstrfns)
+		return NULL;
+
+	nob = strlen(nf->nf_name);
+
+	if (strlen(str) == (unsigned int)nob) {
+		netnum = 0;
+	} else {
+		if (nf->nf_type == LOLND) /* net number not allowed */
+			return NULL;
+
+		str += nob;
+		len = strlen(str);
+		/* preset %n to the expected length; see XscanfX */
+		i = len;
+		if (sscanf(str, "%u%n", &netnum, &i) < 1 ||
+		    i != len)
+			return NULL;
+	}
+
+	*net = LNET_MKNET(nf->nf_type, netnum);
+	return nf;
+}
+
+/*
+ * Convert a network string to its network number.  Returns the network
+ * part of LNET_NID_ANY when str cannot be parsed.
+ */
+__u32
+libcfs_str2net(const char *str)
+{
+	__u32  net;
+
+	if (libcfs_str2net_internal(str, &net) == NULL)
+		return LNET_NIDNET(LNET_NID_ANY);
+
+	return net;
+}
+
+/*
+ * Convert "<addr>@<net>" (or a bare "<addr>", which is taken to be on
+ * SOCKLND network 0) to a NID.  Returns LNET_NID_ANY if either part
+ * fails to parse.
+ */
+lnet_nid_t
+libcfs_str2nid(const char *str)
+{
+	const char       *sep = strchr(str, '@');
+	struct netstrfns *nf;
+	__u32	     net;
+	__u32	     addr;
+
+	if (sep == NULL) {
+		/* no network given: the whole string is the address */
+		sep = str + strlen(str);
+		net = LNET_MKNET(SOCKLND, 0);
+		nf = libcfs_lnd2netstrfns(SOCKLND);
+		LASSERT (nf != NULL);
+	} else {
+		nf = libcfs_str2net_internal(sep + 1, &net);
+		if (nf == NULL)
+			return LNET_NID_ANY;
+	}
+
+	/* the address is everything before the separator */
+	if (nf->nf_str2addr(str, (int)(sep - str), &addr) == 0)
+		return LNET_NID_ANY;
+
+	return LNET_MKNID(net, addr);
+}
+
+/*
+ * Format a process id as "<pid>-<nid>" in one of the shared nidstring
+ * buffers.  The pid is printed as "LNET_PID_ANY" for the wildcard;
+ * otherwise as a decimal number with LNET_PID_USERFLAG masked off and
+ * shown as a leading "U" when it was set.
+ */
+char *
+libcfs_id2str(lnet_process_id_t id)
+{
+	char *str = libcfs_next_nidstring();
+
+	if (id.pid != LNET_PID_ANY) {
+		snprintf(str, LNET_NIDSTR_SIZE, "%s%u-%s",
+			 ((id.pid & LNET_PID_USERFLAG) != 0) ? "U" : "",
+			 (id.pid & ~LNET_PID_USERFLAG),
+			 libcfs_nid2str(id.nid));
+	} else {
+		snprintf(str, LNET_NIDSTR_SIZE,
+			 "LNET_PID_ANY-%s", libcfs_nid2str(id.nid));
+	}
+
+	return str;
+}
+
+/*
+ * Like libcfs_str2nid(), but also accepts "*" for LNET_NID_ANY.
+ * Stores the result in *nidp.  Returns 1 on success, or 0 if str is not
+ * "*" and does not parse (in which case *nidp is LNET_NID_ANY).
+ */
+int
+libcfs_str2anynid(lnet_nid_t *nidp, const char *str)
+{
+	if (strcmp(str, "*") == 0) {
+		*nidp = LNET_NID_ANY;
+		return 1;
+	}
+
+	*nidp = libcfs_str2nid(str);
+
+	return *nidp != LNET_NID_ANY;
+}
+
+/**
+ * Nid range list syntax.
+ * \verbatim
+ *
+ * <nidlist>         :== <nidrange> [ ' ' <nidrange> ]
+ * <nidrange>        :== <addrrange> '@' <net>
+ * <addrrange>       :== '*' |
+ *                       <ipaddr_range> |
+ *                       <cfs_expr_list>
+ * <ipaddr_range>    :== <cfs_expr_list>.<cfs_expr_list>.<cfs_expr_list>.
+ *                       <cfs_expr_list>
+ * <cfs_expr_list>   :== <number> |
+ *                       <expr_list>
+ * <expr_list>       :== '[' <range_expr> [ ',' <range_expr>] ']'
+ * <range_expr>      :== <number> |
+ *                       <number> '-' <number> |
+ *                       <number> '-' <number> '/' <number>
+ * <net>             :== <netname> | <netname><number>
+ * <netname>         :== "lo" | "tcp" | "o2ib" | "cib" | "openib" | "iib" |
+ *                       "vib" | "ra" | "elan" | "gm" | "mx" | "ptl" | "gni"
+ * \endverbatim
+ */
+
+/**
+ * Structure to represent \<nidrange\> token of the syntax.
+ *
+ * One of these is created for each distinct \<net\> parsed.
+ */
+struct nidrange {
+	/**
+	 * Link to the list of these structures which is built while
+	 * parsing a nid range list.
+	 */
+	struct list_head nr_link;
+	/**
+	 * List head for addrrange::ar_link.
+	 */
+	struct list_head nr_addrranges;
+	/**
+	 * Flag indicating that *@<net> is found.
+	 */
+	int nr_all;
+	/**
+	 * Pointer to corresponding element of libcfs_netstrfns.
+	 */
+	struct netstrfns *nr_netstrfns;
+	/**
+	 * Number of network. E.g. 5 if \<net\> is "elan5".
+	 */
+	int nr_netnum;
+};
+
+/**
+ * Structure to represent \<addrrange\> token of the syntax.
+ *
+ * One is allocated by parse_addrange() for every address range other
+ * than '*'.
+ */
+struct addrrange {
+	/**
+	 * Link to nidrange::nr_addrranges.
+	 */
+	struct list_head ar_link;
+	/**
+	 * List head for cfs_expr_list::el_list.
+	 */
+	struct list_head ar_numaddr_ranges;
+};
+
+/**
+ * Nf_parse_addrlist method for networks using numeric addresses.
+ *
+ * Examples of such networks are gm and elan.
+ *
+ * The expression is parsed with cfs_expr_list_parse() over the range
+ * 0..MAX_NUMERIC_VALUE and, on success, appended to \a list.
+ *
+ * \retval 0 if \a str parsed to numeric address
+ * \retval errno otherwise
+ */
+static int
+libcfs_num_parse(char *str, int len, struct list_head *list)
+{
+	struct cfs_expr_list *el;
+	int	status;
+
+	status = cfs_expr_list_parse(str, len, 0, MAX_NUMERIC_VALUE, &el);
+	if (status != 0)
+		return status;
+
+	list_add_tail(&el->el_link, list);
+	return 0;
+}
+
+/**
+ * Parses \<addrrange\> token on the syntax.
+ *
+ * Allocates struct addrrange and links to \a nidrange via
+ * (nidrange::nr_addrranges)
+ *
+ * The return convention is the one the caller (parse_nidrange) tests
+ * for and the one nf_parse_addrlist methods such as libcfs_num_parse
+ * use: zero means success.
+ *
+ * \retval 0 if \a src parses to '*' | \<ipaddr_range\> | \<cfs_expr_list\>
+ * \retval -ENOMEM if the addrrange cannot be allocated
+ * \retval non-zero result of nf_parse_addrlist if \a src does not parse
+ */
+static int
+parse_addrange(const struct cfs_lstr *src, struct nidrange *nidrange)
+{
+	struct addrrange *addrrange;
+
+	if (src->ls_len == 1 && src->ls_str[0] == '*') {
+		/* wildcard: nothing to allocate, just flag the nidrange */
+		nidrange->nr_all = 1;
+		return 0;
+	}
+
+	LIBCFS_ALLOC(addrrange, sizeof(struct addrrange));
+	if (addrrange == NULL)
+		return -ENOMEM;
+	/* linked in before parsing, so it stays reachable from the
+	 * nidrange even when the parse below fails */
+	list_add_tail(&addrrange->ar_link, &nidrange->nr_addrranges);
+	INIT_LIST_HEAD(&addrrange->ar_numaddr_ranges);
+
+	return nidrange->nr_netstrfns->nf_parse_addrlist(src->ls_str,
+						src->ls_len,
+						&addrrange->ar_numaddr_ranges);
+}
+
+/**
+ * Finds or creates struct nidrange.
+ *
+ * Checks if \a src is a valid network name, looks for corresponding
+ * nidrange on the list of nidranges (\a nidlist), creates new struct
+ * nidrange if it is not found.
+ *
+ * \retval pointer to struct nidrange matching network specified via \a src
+ * \retval NULL if \a src does not match any network
+ */
+static struct nidrange *
+add_nidrange(const struct cfs_lstr *src,
+	     struct list_head *nidlist)
+{
+	struct netstrfns *nf;
+	struct nidrange *nr;
+	size_t namelen;
+	int endlen;
+	unsigned netnum;
+
+	if (src->ls_len >= LNET_NIDSTR_SIZE)
+		return NULL;
+
+	nf = libcfs_namenum2netstrfns(src->ls_str);
+	if (nf == NULL)
+		return NULL;
+
+	namelen = strlen(nf->nf_name);
+	endlen = src->ls_len - namelen;
+	if (endlen != 0) {
+		/* e.g. "elan25" or "tcp23", refuse to parse if
+		 * network name is not appended with decimal or
+		 * hexadecimal number */
+		if (!cfs_str2num_check(src->ls_str + namelen,
+				       endlen, &netnum, 0, MAX_NUMERIC_VALUE))
+			return NULL;
+	} else {
+		/* network name only, e.g. "elan" or "tcp" */
+		netnum = 0;
+	}
+
+	/* reuse an existing nidrange for the same network, if any */
+	list_for_each_entry(nr, nidlist, nr_link) {
+		if (nr->nr_netstrfns == nf && nr->nr_netnum == netnum)
+			return nr;
+	}
+
+	LIBCFS_ALLOC(nr, sizeof(struct nidrange));
+	if (nr == NULL)
+		return NULL;
+	list_add_tail(&nr->nr_link, nidlist);
+	INIT_LIST_HEAD(&nr->nr_addrranges);
+	nr->nr_netstrfns = nf;
+	nr->nr_all = 0;
+	nr->nr_netnum = netnum;
+
+	return nr;
+}
+
+/**
+ * Parses \<nidrange\> token of the syntax.
+ *
+ * \a src is split at '@' into an address range and a network; a second
+ * '@' is an error.  A non-zero return from parse_addrange() is treated
+ * as failure.  On any failure the whole original token is logged.
+ *
+ * \retval 1 if \a src parses to \<addrrange\> '@' \<net\>
+ * \retval 0 otherwise
+ */
+static int
+parse_nidrange(struct cfs_lstr *src, struct list_head *nidlist)
+{
+	struct cfs_lstr addrrange;
+	struct cfs_lstr net;
+	struct cfs_lstr whole;
+	struct nidrange *nr;
+
+	/* remember the token; cfs_gettok() consumes src as it goes */
+	whole = *src;
+
+	if (cfs_gettok(src, '@', &addrrange) != 0 &&
+	    cfs_gettok(src, '@', &net) != 0 &&
+	    src->ls_str == NULL &&
+	    (nr = add_nidrange(&net, nidlist)) != NULL &&
+	    parse_addrange(&addrrange, nr) == 0)
+		return 1;
+
+	CWARN("can't parse nidrange: \"%.*s\"\n", whole.ls_len, whole.ls_str);
+	return 0;
+}
+
+/**
+ * Frees addrrange structures of \a list.
+ *
+ * For each struct addrrange structure found on \a list it frees
+ * cfs_expr_list list attached to it and frees the addrrange itself.
+ *
+ * \retval none
+ */
+static void
+free_addrranges(struct list_head *list)
+{
+	struct addrrange *ar;
+
+	while (!list_empty(list)) {
+		ar = list_entry(list->next, struct addrrange, ar_link);
+
+		list_del(&ar->ar_link);
+		cfs_expr_list_free_list(&ar->ar_numaddr_ranges);
+		LIBCFS_FREE(ar, sizeof(struct addrrange));
+	}
+}
+
+/**
+ * Frees nidrange structures of \a list.
+ *
+ * For each struct nidrange structure found on \a list it frees
+ * addrrange list attached to it and frees the nidrange itself.
+ *
+ * \retval none
+ */
+void
+cfs_free_nidlist(struct list_head *list)
+{
+	struct nidrange *nr;
+	struct nidrange *following;
+
+	/* the _safe variant lets us unlink and free while iterating */
+	list_for_each_entry_safe(nr, following, list, nr_link) {
+		free_addrranges(&nr->nr_addrranges);
+		list_del(&nr->nr_link);
+		LIBCFS_FREE(nr, sizeof(struct nidrange));
+	}
+}
+
+/**
+ * Parses nid range list.
+ *
+ * Parses with rigorous syntax and overflow checking \a str into
+ * \<nidrange\> [ ' ' \<nidrange\> ], compiles \a str into set of
+ * structures and links that structure to \a nidlist. The resulting
+ * list can be used to match a NID against set of NIDS defined by \a
+ * str.  \a nidlist is (re)initialised on entry and is left empty when
+ * parsing fails.
+ * \see cfs_match_nid
+ *
+ * \retval 1 on success
+ * \retval 0 otherwise
+ */
+int
+cfs_parse_nidlist(char *str, int len, struct list_head *nidlist)
+{
+	struct cfs_lstr src;
+	struct cfs_lstr res;
+	ENTRY;
+
+	src.ls_str = str;
+	src.ls_len = len;
+	INIT_LIST_HEAD(nidlist);
+
+	while (src.ls_str) {
+		/* take the next space-separated token and compile it */
+		if (cfs_gettok(&src, ' ', &res) == 0 ||
+		    parse_nidrange(&res, nidlist) == 0) {
+			cfs_free_nidlist(nidlist);
+			RETURN(0);
+		}
+	}
+
+	RETURN(1);
+}
+
+/*
+ * Nf_match_addr method for networks using numeric addresses
+ *
+ * Only the first expression list on \a numaddr is consulted; the list
+ * must not be empty.
+ *
+ * \retval 1 on match
+ * \retval 0 otherwise
+ */
+static int
+libcfs_num_match(__u32 addr, struct list_head *numaddr)
+{
+	LASSERT(!list_empty(numaddr));
+
+	return cfs_expr_list_match(addr,
+				   list_entry(numaddr->next,
+					      struct cfs_expr_list, el_link));
+}
+
+/**
+ * Matches a nid (\a nid) against the compiled list of nidranges (\a nidlist).
+ *
+ * A nidrange is considered only if both its network type and network
+ * number equal those of \a nid; it then matches if it was given as '*'
+ * or if any of its address ranges matches the address part of \a nid.
+ *
+ * \see cfs_parse_nidlist()
+ *
+ * \retval 1 on match
+ * \retval 0 otherwise
+ */
+int cfs_match_nid(lnet_nid_t nid, struct list_head *nidlist)
+{
+	struct nidrange *nr;
+	struct addrrange *ar;
+	ENTRY;
+
+	list_for_each_entry(nr, nidlist, nr_link) {
+		/* skip ranges that belong to a different network */
+		if (nr->nr_netstrfns->nf_type != LNET_NETTYP(LNET_NIDNET(nid)) ||
+		    nr->nr_netnum != LNET_NETNUM(LNET_NIDNET(nid)))
+			continue;
+
+		if (nr->nr_all)
+			RETURN(1);
+
+		list_for_each_entry(ar, &nr->nr_addrranges, ar_link) {
+			if (nr->nr_netstrfns->nf_match_addr(LNET_NIDADDR(nid),
+						       &ar->ar_numaddr_ranges))
+				RETURN(1);
+		}
+	}
+
+	RETURN(0);
+}
+
+
+/* LND / network / NID string conversion helpers exported to other modules */
+EXPORT_SYMBOL(libcfs_isknown_lnd);
+EXPORT_SYMBOL(libcfs_lnd2modname);
+EXPORT_SYMBOL(libcfs_lnd2str);
+EXPORT_SYMBOL(libcfs_str2lnd);
+EXPORT_SYMBOL(libcfs_net2str);
+EXPORT_SYMBOL(libcfs_nid2str);
+EXPORT_SYMBOL(libcfs_str2net);
+EXPORT_SYMBOL(libcfs_str2nid);
+EXPORT_SYMBOL(libcfs_id2str);
+EXPORT_SYMBOL(libcfs_str2anynid);
+/* nid range list parsing and matching exported to other modules */
+EXPORT_SYMBOL(cfs_free_nidlist);
+EXPORT_SYMBOL(cfs_parse_nidlist);
+EXPORT_SYMBOL(cfs_match_nid);

diff --git a/drivers/staging/lustre/lustre/libcfs/prng.c b/drivers/staging/lustre/lustre/libcfs/prng.c
new file mode 100644
index 0000000..69224d8
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/prng.c

@@ -0,0 +1,139 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/libcfs/prng.c
+ *
+ * concatenation of following two 16-bit multiply with carry generators
+ * x(n)=a*x(n-1)+carry mod 2^16 and y(n)=b*y(n-1)+carry mod 2^16,
+ * number and carry packed within the same 32 bit integer.
+ * algorithm recommended by Marsaglia
+*/
+
+#include <linux/libcfs/libcfs.h>
+
+/*
+From: George Marsaglia <geo@stat.fsu.edu>
+Newsgroups: sci.math
+Subject: Re: A RANDOM NUMBER GENERATOR FOR C
+Date: Tue, 30 Sep 1997 05:29:35 -0700
+
+ * You may replace the two constants 36969 and 18000 by any
+ * pair of distinct constants from this list:
+ * 18000 18030 18273 18513 18879 19074 19098 19164 19215 19584
+ * 19599 19950 20088 20508 20544 20664 20814 20970 21153 21243
+ * 21423 21723 21954 22125 22188 22293 22860 22938 22965 22974
+ * 23109 23124 23163 23208 23508 23520 23553 23658 23865 24114
+ * 24219 24660 24699 24864 24948 25023 25308 25443 26004 26088
+ * 26154 26550 26679 26838 27183 27258 27753 27795 27810 27834
+ * 27960 28320 28380 28689 28710 28794 28854 28959 28980 29013
+ * 29379 29889 30135 30345 30459 30714 30903 30963 31059 31083
+ * (or any other 16-bit constants k for which both k*2^16-1
+ * and k*2^15-1 are prime) */
+
+/* Multipliers of the two 16-bit multiply-with-carry generators (x and y) */
+#define RANDOM_CONST_A 18030
+#define RANDOM_CONST_B 29013
+
+/* Generator state: advanced by cfs_rand(), replaceable through cfs_srand() */
+static unsigned int seed_x = 521288629;
+static unsigned int seed_y = 362436069;
+
+/**
+ * cfs_rand - advance the generator and return the next value
+ *
+ * Each seed holds a 16-bit number in its low half and the carry in its high
+ * half.  Both seeds are stepped once, then the result is assembled from the
+ * low 16 bits of each new seed (x in the upper half, y in the lower half).
+ *
+ * Returns a pseudo-random 32-bit integer
+ */
+unsigned int cfs_rand(void)
+{
+	unsigned int x_num   = seed_x & 65535;
+	unsigned int x_carry = seed_x >> 16;
+	unsigned int y_num   = seed_y & 65535;
+	unsigned int y_carry = seed_y >> 16;
+
+	seed_x = RANDOM_CONST_A * x_num + x_carry;
+	seed_y = RANDOM_CONST_B * y_num + y_carry;
+
+	return (seed_x << 16) + (seed_y & 65535);
+}
+EXPORT_SYMBOL(cfs_rand);
+
+/**
+ * cfs_srand - sets the initial seeds
+ * @seed1 : (seed_x) should have the most entropy in the low bits of the word
+ * @seed2 : (seed_y) should have the most entropy in the high bits of the word
+ *
+ * Replaces the generator seeds with new values so that cfs_rand() produces a
+ * new pseudo-random sequence.  An argument of 0 leaves the corresponding
+ * seed at its current value.
+ */
+void cfs_srand(unsigned int seed1, unsigned int seed2)
+{
+	if (seed1 != 0)
+		seed_x = seed1;
+
+	if (seed2 != 0)
+		seed_y = seed2;
+}
+EXPORT_SYMBOL(cfs_srand);
+
+/**
+ * cfs_get_random_bytes - generate a bunch of random numbers
+ * @buf : buffer to fill with random numbers
+ * @size: size of passed in buffer, in bytes; must not be negative
+ *
+ * Fills a buffer with random bytes.  Every word written is the XOR of
+ * get_random_bytes() output and a cfs_rand() value.  The buffer is handled
+ * in three parts: an unaligned head (byte copy), int-sized words stored
+ * through an int-aligned pointer, and a tail shorter than an int (byte copy).
+ */
+void cfs_get_random_bytes(void *buf, int size)
+{
+	int *p = buf;
+	int rem, tmp;
+
+	LASSERT(size >= 0);
+
+	/*
+	 * Number of bytes needed to reach the next int boundary (0 when buf
+	 * is already aligned), capped by the buffer size.  Using the raw
+	 * misalignment (addr & 3) here would leave p unaligned in general.
+	 */
+	rem = min((int)(-(unsigned long)buf & (sizeof(int) - 1)), size);
+	if (rem) {
+		get_random_bytes(&tmp, sizeof(tmp));
+		tmp ^= cfs_rand();
+		memcpy(buf, &tmp, rem);
+		p = buf + rem;
+		size -= rem;
+	}
+
+	/* size is known to be >= 0, so the signed comparison is safe */
+	while (size >= (int)sizeof(int)) {
+		get_random_bytes(&tmp, sizeof(tmp));
+		*p = cfs_rand() ^ tmp;
+		size -= sizeof(int);
+		p++;
+	}
+	buf = p;
+	if (size) {
+		get_random_bytes(&tmp, sizeof(tmp));
+		tmp ^= cfs_rand();
+		memcpy(buf, &tmp, size);
+	}
+}
+EXPORT_SYMBOL(cfs_get_random_bytes);

diff --git a/drivers/staging/lustre/lustre/libcfs/tracefile.c b/drivers/staging/lustre/lustre/libcfs/tracefile.c
new file mode 100644
index 0000000..439e71d
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/tracefile.c

@@ -0,0 +1,1195 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/libcfs/tracefile.c
+ *
+ * Author: Zach Brown <zab@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
+ */
+
+
+#define DEBUG_SUBSYSTEM S_LNET
+#define LUSTRE_TRACEFILE_PRIVATE
+#include "tracefile.h"
+
+#include <linux/libcfs/libcfs.h>
+
+/* XXX move things up to the top, comment */
+/* per-type arrays of per-CPU trace data (TCD_MAX_TYPES x NR_CPUS) */
+union cfs_trace_data_union (*cfs_trace_data[TCD_MAX_TYPES])[NR_CPUS] __cacheline_aligned;
+
+/* path the tracefiled thread writes to; an empty string means "no file" */
+char cfs_tracefile[TRACEFILE_NAME_SIZE];
+/* tracefiled wraps its write offset back to 0 once it reaches this size */
+long long cfs_tracefile_size = CFS_TRACEFILE_SIZE;
+/* control block shared with the tracefiled thread */
+static struct tracefiled_ctl trace_tctl;
+/* serialises cfs_trace_start_thread() and cfs_trace_stop_thread() */
+struct mutex cfs_trace_thread_mutex;
+/* non-zero while the tracefiled thread is running */
+static int thread_running = 0;
+
+/* number of trace pages currently allocated by cfs_tage_alloc() */
+atomic_t cfs_tage_allocated = ATOMIC_INIT(0);
+
+static void put_pages_on_tcd_daemon_list(struct page_collection *pc,
+					 struct cfs_trace_cpu_data *tcd);
+
+/* Map a list node back to the cfs_trace_page that embeds it as 'linkage' */
+static inline struct cfs_trace_page *
+cfs_tage_from_list(struct list_head *list)
+{
+	struct cfs_trace_page *tage;
+
+	tage = list_entry(list, struct cfs_trace_page, linkage);
+	return tage;
+}
+
+/*
+ * Allocate a trace page descriptor together with its backing page.
+ *
+ * \a gfp is used for both allocations, with __GFP_NOWARN added.  Returns
+ * NULL when either allocation fails, or when called outside interrupt
+ * context while memory_pressure_get() reports pressure.  On success the
+ * global cfs_tage_allocated counter is incremented.
+ */
+static struct cfs_trace_page *cfs_tage_alloc(int gfp)
+{
+	struct cfs_trace_page *new_tage;
+	struct page	      *pg;
+
+	/* My caller is trying to free memory */
+	if (!in_interrupt() && memory_pressure_get())
+		return NULL;
+
+	/*
+	 * Keep the console quiet on allocation failure: the upper layer
+	 * reports it anyway.
+	 */
+	gfp |= __GFP_NOWARN;
+
+	pg = alloc_page(gfp);
+	if (pg == NULL)
+		return NULL;
+
+	new_tage = kmalloc(sizeof(*new_tage), gfp);
+	if (new_tage != NULL) {
+		new_tage->page = pg;
+		atomic_inc(&cfs_tage_allocated);
+		return new_tage;
+	}
+
+	/* descriptor allocation failed: give the page back */
+	__free_page(pg);
+	return NULL;
+}
+
+/*
+ * Release a trace page: free the backing page and the descriptor, then
+ * decrement the global cfs_tage_allocated counter.  The caller must already
+ * have unlinked \a tage from any list.
+ */
+static void cfs_tage_free(struct cfs_trace_page *tage)
+{
+	struct page *pg;
+
+	__LASSERT(tage != NULL);
+	__LASSERT(tage->page != NULL);
+
+	pg = tage->page;
+	__free_page(pg);
+	kfree(tage);
+	atomic_dec(&cfs_tage_allocated);
+}
+
+/* Unlink \a tage from whatever list it is on and append it to \a queue */
+static void cfs_tage_to_tail(struct cfs_trace_page *tage,
+			     struct list_head *queue)
+{
+	__LASSERT(tage != NULL);
+	__LASSERT(queue != NULL);
+
+	list_move_tail(&tage->linkage, queue);
+}
+
+/*
+ * Allocate trace pages onto \a stock until the pages added here plus
+ * tcd->tcd_cur_stock_pages reach TCD_STOCK_PAGES, or an allocation fails.
+ *
+ * tcd->tcd_cur_stock_pages itself is not modified by this function.
+ *
+ * Returns the number of pages appended to \a stock.
+ */
+int cfs_trace_refill_stock(struct cfs_trace_cpu_data *tcd, int gfp,
+			   struct list_head *stock)
+{
+	int added = 0;
+
+	/*
+	 * XXX nikita: do NOT call portals_debug_msg() (CDEBUG/ENTRY/EXIT)
+	 * from here: this will lead to infinite recursion.
+	 */
+
+	while (added + tcd->tcd_cur_stock_pages < TCD_STOCK_PAGES) {
+		struct cfs_trace_page *tage = cfs_tage_alloc(gfp);
+
+		if (tage == NULL)
+			break;
+
+		list_add_tail(&tage->linkage, stock);
+		++added;
+	}
+	return added;
+}
+
+/*
+ * Return a page that has 'len' bytes left at the end, without discarding
+ * any existing data.
+ *
+ * The newest page of \a tcd is reused when it still has room.  Otherwise,
+ * while the tcd is below tcd_max_pages, a page is taken from the stock list
+ * or freshly allocated with GFP_ATOMIC, initialised and appended to
+ * tcd_pages.  Returns NULL when the tcd is full or allocation fails.
+ */
+static struct cfs_trace_page *
+cfs_trace_get_tage_try(struct cfs_trace_cpu_data *tcd, unsigned long len)
+{
+	struct cfs_trace_page *tage;
+
+	if (tcd->tcd_cur_pages > 0) {
+		__LASSERT(!list_empty(&tcd->tcd_pages));
+		tage = cfs_tage_from_list(tcd->tcd_pages.prev);
+		if (tage->used + len <= PAGE_CACHE_SIZE)
+			return tage;
+	}
+
+	/* no room for another page on this tcd */
+	if (tcd->tcd_cur_pages >= tcd->tcd_max_pages)
+		return NULL;
+
+	if (tcd->tcd_cur_stock_pages > 0) {
+		/* consume a pre-allocated page */
+		tage = cfs_tage_from_list(tcd->tcd_stock_pages.prev);
+		--tcd->tcd_cur_stock_pages;
+		list_del_init(&tage->linkage);
+	} else {
+		tage = cfs_tage_alloc(GFP_ATOMIC);
+		if (unlikely(tage == NULL)) {
+			if ((!memory_pressure_get() ||
+			     in_interrupt()) && printk_ratelimit())
+				printk(KERN_WARNING
+				       "cannot allocate a tage (%ld)\n",
+				       tcd->tcd_cur_pages);
+			return NULL;
+		}
+	}
+
+	tage->used = 0;
+	tage->cpu = smp_processor_id();
+	tage->type = tcd->tcd_type;
+	list_add_tail(&tage->linkage, &tcd->tcd_pages);
+	tcd->tcd_cur_pages++;
+
+	/* wake up tracefiled to process some pages. */
+	if (tcd->tcd_cur_pages > 8 && thread_running)
+		wake_up(&trace_tctl.tctl_waitq);
+
+	return tage;
+}
+
+/*
+ * Move the oldest tenth (tcd_cur_pages / 10) of \a tcd's pages off
+ * tcd_pages and hand them to put_pages_on_tcd_daemon_list(), making room
+ * for new trace data.
+ */
+static void cfs_tcd_shrink(struct cfs_trace_cpu_data *tcd)
+{
+	struct page_collection pc;
+	struct cfs_trace_page *tage;
+	struct cfs_trace_page *next;
+	int quota = tcd->tcd_cur_pages / 10;
+
+	/*
+	 * XXX nikita: do NOT call portals_debug_msg() (CDEBUG/ENTRY/EXIT)
+	 * from here: this will lead to infinite recursion.
+	 */
+
+	if (printk_ratelimit())
+		printk(KERN_WARNING "debug daemon buffer overflowed; "
+		       "discarding 10%% of pages (%d of %ld)\n",
+		       quota + 1, tcd->tcd_cur_pages);
+
+	INIT_LIST_HEAD(&pc.pc_pages);
+	spin_lock_init(&pc.pc_lock);
+
+	list_for_each_entry_safe(tage, next, &tcd->tcd_pages, linkage) {
+		if (quota == 0)
+			break;
+		quota--;
+
+		list_move_tail(&tage->linkage, &pc.pc_pages);
+		tcd->tcd_cur_pages--;
+	}
+	put_pages_on_tcd_daemon_list(&pc, tcd);
+}
+
+/*
+ * Return a page that has 'len' bytes left at the end.
+ *
+ * Tries cfs_trace_get_tage_try() first.  If that fails, the tcd is shrunk
+ * (only while the daemon thread runs) and the oldest remaining page is
+ * emptied and recycled as the newest one.  Returns NULL when \a len exceeds
+ * a page or the tcd holds no pages at all.
+ */
+static struct cfs_trace_page *cfs_trace_get_tage(struct cfs_trace_cpu_data *tcd,
+						 unsigned long len)
+{
+	struct cfs_trace_page *oldest;
+	struct cfs_trace_page *tage;
+
+	/*
+	 * XXX nikita: do NOT call portals_debug_msg() (CDEBUG/ENTRY/EXIT)
+	 * from here: this will lead to infinite recursion.
+	 */
+
+	if (len > PAGE_CACHE_SIZE) {
+		printk(KERN_ERR
+		       "cowardly refusing to write %lu bytes in a page\n", len);
+		return NULL;
+	}
+
+	tage = cfs_trace_get_tage_try(tcd, len);
+	if (tage != NULL)
+		return tage;
+
+	if (thread_running)
+		cfs_tcd_shrink(tcd);
+
+	if (tcd->tcd_cur_pages <= 0)
+		return NULL;
+
+	/* overwrite the oldest page: empty it and make it the newest */
+	oldest = cfs_tage_from_list(tcd->tcd_pages.next);
+	oldest->used = 0;
+	cfs_tage_to_tail(oldest, &tcd->tcd_pages);
+	return oldest;
+}
+
+/*
+ * printf-style front end to libcfs_debug_vmsg2(): packs the variadic
+ * arguments into a va_list and passes no second format.
+ *
+ * Returns whatever libcfs_debug_vmsg2() returns.
+ */
+int libcfs_debug_msg(struct libcfs_debug_msg_data *msgdata,
+		     const char *format, ...)
+{
+	va_list ap;
+	int     result;
+
+	va_start(ap, format);
+	result = libcfs_debug_vmsg2(msgdata, format, ap, NULL);
+	va_end(ap);
+
+	return result;
+}
+EXPORT_SYMBOL(libcfs_debug_msg);
+
+/*
+ * Format a debug message into the current CPU's trace buffer and, if the
+ * message mask asks for it, onto the console.
+ *
+ * \param msgdata  static description of the call site (mask, file, function,
+ *		   line, optional console rate-limit state)
+ * \param format1  first printf format, consumed with \a args (may be NULL)
+ * \param args	   va_list for \a format1; it is copied, never consumed
+ * \param format2  second printf format, consumed with the trailing variadic
+ *		   arguments and appended after \a format1 (may be NULL)
+ *
+ * A trace record is laid out as: optional binary header, one '.' per
+ * nesting level, NUL-terminated file name, NUL-terminated function name (if
+ * any), then the formatted text.
+ *
+ * \retval 1 console output was not requested or was rate-limited away
+ * \retval 0 the message was sent to the console
+ */
+int libcfs_debug_vmsg2(struct libcfs_debug_msg_data *msgdata,
+		       const char *format1, va_list args,
+		       const char *format2, ...)
+{
+	struct cfs_trace_cpu_data *tcd = NULL;
+	struct ptldebug_header     header = {0};
+	struct cfs_trace_page     *tage;
+	/* string_buf is used only if tcd != NULL, and is always set then */
+	char			  *string_buf = NULL;
+	char			  *debug_buf;
+	int			   known_size;
+	int			   needed = 85; /* average message length */
+	int			   max_nob;
+	va_list			   ap;
+	int			   depth;
+	int			   i;
+	int			   remain;
+	int			   mask = msgdata->msg_mask;
+	char			  *file = (char *)msgdata->msg_file;
+	cfs_debug_limit_state_t   *cdls = msgdata->msg_cdls;
+
+	/* log only the base name of the source file */
+	if (strchr(file, '/'))
+		file = strrchr(file, '/') + 1;
+
+	tcd = cfs_trace_get_tcd();
+
+	/* cfs_trace_get_tcd() grabs a lock, which disables preemption and
+	 * pins us to a particular CPU.  This avoids an smp_processor_id()
+	 * warning on Linux when debugging is enabled. */
+	cfs_set_ptldebug_header(&header, msgdata, CDEBUG_STACK());
+
+	if (tcd == NULL)		/* arch may not log in IRQ context */
+		goto console;
+
+	if (tcd->tcd_cur_pages == 0)
+		header.ph_flags |= PH_FLAG_FIRST_RECORD;
+
+	if (tcd->tcd_shutting_down) {
+		cfs_trace_put_tcd(tcd);
+		tcd = NULL;
+		goto console;
+	}
+
+	/* bytes of the record that precede the formatted text */
+	depth = __current_nesting_level();
+	known_size = strlen(file) + 1 + depth;
+	if (msgdata->msg_fn)
+		known_size += strlen(msgdata->msg_fn) + 1;
+
+	if (libcfs_debug_binary)
+		known_size += sizeof(header);
+
+	/*
+	 * Two passes: vsnprintf returns the real size required for output
+	 * _without_ the terminating NUL, so if the initial guess in 'needed'
+	 * is too small for this format the second pass asks for a page with
+	 * the exact amount of room.
+	 */
+	for (i = 0; i < 2; i++) {
+		tage = cfs_trace_get_tage(tcd, needed + known_size + 1);
+		if (tage == NULL) {
+			if (needed + known_size > PAGE_CACHE_SIZE)
+				mask |= D_ERROR;
+
+			cfs_trace_put_tcd(tcd);
+			tcd = NULL;
+			goto console;
+		}
+
+		string_buf = (char *)page_address(tage->page) +
+					tage->used + known_size;
+
+		max_nob = PAGE_CACHE_SIZE - tage->used - known_size;
+		if (max_nob <= 0) {
+			printk(KERN_EMERG "negative max_nob: %d\n",
+			       max_nob);
+			mask |= D_ERROR;
+			cfs_trace_put_tcd(tcd);
+			tcd = NULL;
+			goto console;
+		}
+
+		needed = 0;
+		if (format1) {
+			va_copy(ap, args);
+			needed = vsnprintf(string_buf, max_nob, format1, ap);
+			va_end(ap);
+		}
+
+		if (format2) {
+			remain = max_nob - needed;
+			if (remain < 0)
+				remain = 0;
+
+			va_start(ap, format2);
+			needed += vsnprintf(string_buf + needed, remain,
+					    format2, ap);
+			va_end(ap);
+		}
+
+		if (needed < max_nob) /* well. printing ok.. */
+			break;
+	}
+
+	/*
+	 * An empty message has no last character: checking needed first
+	 * avoids reading string_buf[-1], which lies before the text area.
+	 */
+	if (needed == 0 || *(string_buf+needed-1) != '\n')
+		printk(KERN_INFO "format at %s:%d:%s doesn't end in "
+		       "newline\n", file, msgdata->msg_line, msgdata->msg_fn);
+
+	header.ph_len = known_size + needed;
+	debug_buf = (char *)page_address(tage->page) + tage->used;
+
+	if (libcfs_debug_binary) {
+		memcpy(debug_buf, &header, sizeof(header));
+		tage->used += sizeof(header);
+		debug_buf += sizeof(header);
+	}
+
+	/* indent message according to the nesting level */
+	while (depth-- > 0) {
+		*(debug_buf++) = '.';
+		++ tage->used;
+	}
+
+	strcpy(debug_buf, file);
+	tage->used += strlen(file) + 1;
+	debug_buf += strlen(file) + 1;
+
+	if (msgdata->msg_fn) {
+		strcpy(debug_buf, msgdata->msg_fn);
+		tage->used += strlen(msgdata->msg_fn) + 1;
+		debug_buf += strlen(msgdata->msg_fn) + 1;
+	}
+
+	/* the prefix must end exactly where the text was formatted */
+	__LASSERT(debug_buf == string_buf);
+
+	tage->used += needed;
+	__LASSERT (tage->used <= PAGE_CACHE_SIZE);
+
+console:
+	if ((mask & libcfs_printk) == 0) {
+		/* no console output requested */
+		if (tcd != NULL)
+			cfs_trace_put_tcd(tcd);
+		return 1;
+	}
+
+	if (cdls != NULL) {
+		if (libcfs_console_ratelimit &&
+		    cdls->cdls_next != 0 &&     /* not first time ever */
+		    !cfs_time_after(cfs_time_current(), cdls->cdls_next)) {
+			/* skipping a console message */
+			cdls->cdls_count++;
+			if (tcd != NULL)
+				cfs_trace_put_tcd(tcd);
+			return 1;
+		}
+
+		if (cfs_time_after(cfs_time_current(), cdls->cdls_next +
+						       libcfs_console_max_delay
+						       + cfs_time_seconds(10))) {
+			/* last timeout was a long time ago */
+			cdls->cdls_delay /= libcfs_console_backoff * 4;
+		} else {
+			cdls->cdls_delay *= libcfs_console_backoff;
+
+			if (cdls->cdls_delay < libcfs_console_min_delay)
+				cdls->cdls_delay = libcfs_console_min_delay;
+			else if (cdls->cdls_delay > libcfs_console_max_delay)
+				cdls->cdls_delay = libcfs_console_max_delay;
+		}
+
+		/* ensure cdls_next is never zero after it's been seen */
+		cdls->cdls_next = (cfs_time_current() + cdls->cdls_delay) | 1;
+	}
+
+	if (tcd != NULL) {
+		/* the text is already formatted in the trace page */
+		cfs_print_to_console(&header, mask, string_buf, needed, file,
+				     msgdata->msg_fn);
+		cfs_trace_put_tcd(tcd);
+	} else {
+		/* nothing was traced: format into a console buffer instead */
+		string_buf = cfs_trace_get_console_buffer();
+
+		needed = 0;
+		if (format1 != NULL) {
+			va_copy(ap, args);
+			needed = vsnprintf(string_buf,
+					   CFS_TRACE_CONSOLE_BUFFER_SIZE,
+					   format1, ap);
+			va_end(ap);
+		}
+		if (format2 != NULL) {
+			remain = CFS_TRACE_CONSOLE_BUFFER_SIZE - needed;
+			if (remain > 0) {
+				va_start(ap, format2);
+				needed += vsnprintf(string_buf+needed, remain,
+						    format2, ap);
+				va_end(ap);
+			}
+		}
+		cfs_print_to_console(&header, mask,
+				     string_buf, needed, file, msgdata->msg_fn);
+
+		cfs_trace_put_console_buffer(string_buf);
+	}
+
+	/* report how many messages the rate limiter dropped before this one */
+	if (cdls != NULL && cdls->cdls_count != 0) {
+		string_buf = cfs_trace_get_console_buffer();
+
+		needed = snprintf(string_buf, CFS_TRACE_CONSOLE_BUFFER_SIZE,
+				  "Skipped %d previous similar message%s\n",
+				  cdls->cdls_count,
+				  (cdls->cdls_count > 1) ? "s" : "");
+
+		cfs_print_to_console(&header, mask,
+				     string_buf, needed, file, msgdata->msg_fn);
+
+		cfs_trace_put_console_buffer(string_buf);
+		cdls->cdls_count = 0;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(libcfs_debug_vmsg2);
+
+/*
+ * Report a failed assertion: flag the panic/catastrophe state, print \a str
+ * to the console at D_EMERG level on behalf of the call site described by
+ * \a msgdata, then panic.  Does not return.
+ */
+void
+cfs_trace_assertion_failed(const char *str,
+			   struct libcfs_debug_msg_data *msgdata)
+{
+	struct ptldebug_header header;
+
+	libcfs_panic_in_progress = 1;
+	libcfs_catastrophe = 1;
+	mb();
+
+	cfs_set_ptldebug_header(&header, msgdata, CDEBUG_STACK());
+	cfs_print_to_console(&header, D_EMERG, str, strlen(str),
+			     msgdata->msg_file, msgdata->msg_fn);
+
+	panic("Lustre debug assertion failure\n");
+
+	/* not reached */
+}
+
+/*
+ * Lock-free variant of page collection used while a panic is in progress.
+ *
+ * Do the collect_pages job on a single CPU: assumes that all other
+ * CPUs have been stopped during a panic.  If this isn't true for some
+ * arch, this will have to be implemented separately in each arch.
+ */
+static void
+panic_collect_pages(struct page_collection *pc)
+{
+	struct cfs_trace_cpu_data *tcd;
+	int			   type;
+	int			   cpu;
+
+	INIT_LIST_HEAD(&pc->pc_pages);
+
+	cfs_tcd_for_each(tcd, type, cpu) {
+		list_splice_init(&tcd->tcd_pages, &pc->pc_pages);
+		tcd->tcd_cur_pages = 0;
+
+		if (!pc->pc_want_daemon_pages)
+			continue;
+
+		list_splice_init(&tcd->tcd_daemon_pages, &pc->pc_pages);
+		tcd->tcd_cur_daemon_pages = 0;
+	}
+}
+
+/*
+ * Splice the trace pages of every tcd on every possible CPU onto
+ * pc->pc_pages, optionally including the daemon pages, and reset the
+ * matching page counters.  Runs under pc->pc_lock.
+ */
+static void collect_pages_on_all_cpus(struct page_collection *pc)
+{
+	struct cfs_trace_cpu_data *tcd;
+	int type;
+	int cpu;
+
+	spin_lock(&pc->pc_lock);
+	cfs_for_each_possible_cpu(cpu) {
+		cfs_tcd_for_each_type_lock(tcd, type, cpu) {
+			list_splice_init(&tcd->tcd_pages, &pc->pc_pages);
+			tcd->tcd_cur_pages = 0;
+
+			if (pc->pc_want_daemon_pages) {
+				list_splice_init(&tcd->tcd_daemon_pages,
+						 &pc->pc_pages);
+				tcd->tcd_cur_daemon_pages = 0;
+			}
+		}
+	}
+	spin_unlock(&pc->pc_lock);
+}
+
+/*
+ * Gather all trace pages into \a pc, picking the lock-free panic variant
+ * when a panic is in progress.
+ */
+static void collect_pages(struct page_collection *pc)
+{
+	INIT_LIST_HEAD(&pc->pc_pages);
+
+	if (!libcfs_panic_in_progress)
+		collect_pages_on_all_cpus(pc);
+	else
+		panic_collect_pages(pc);
+}
+
+/*
+ * Return collected pages to the tcd they came from (matched on the cpu and
+ * type recorded in each page).  Returned pages are inserted in front of
+ * whatever the tcd has accumulated in the meantime.  Runs under pc->pc_lock.
+ */
+static void put_pages_back_on_all_cpus(struct page_collection *pc)
+{
+	struct cfs_trace_cpu_data *tcd;
+	struct list_head *insert_before;
+	struct cfs_trace_page *tage;
+	struct cfs_trace_page *next;
+	int type;
+	int cpu;
+
+	spin_lock(&pc->pc_lock);
+	cfs_for_each_possible_cpu(cpu) {
+		cfs_tcd_for_each_type_lock(tcd, type, cpu) {
+			/* first page currently on the tcd (or its head) */
+			insert_before = tcd->tcd_pages.next;
+
+			list_for_each_entry_safe(tage, next, &pc->pc_pages,
+						 linkage) {
+
+				__LASSERT_TAGE_INVARIANT(tage);
+
+				if (tage->cpu == cpu && tage->type == type) {
+					cfs_tage_to_tail(tage, insert_before);
+					tcd->tcd_cur_pages++;
+				}
+			}
+		}
+	}
+	spin_unlock(&pc->pc_lock);
+}
+
+/* Undo collect_pages(); a no-op while a panic is in progress */
+static void put_pages_back(struct page_collection *pc)
+{
+	if (libcfs_panic_in_progress)
+		return;
+
+	put_pages_back_on_all_cpus(pc);
+}
+
+/*
+ * Add pages to a per-cpu debug daemon ringbuffer.  This buffer makes sure
+ * that we have a good amount of data at all times for dumping during an
+ * LBUG, even if we have been steadily writing (and otherwise discarding)
+ * pages via the debug daemon.
+ *
+ * Only pages of \a pc whose cpu and type match \a tcd are moved.  Whenever
+ * the ring grows beyond tcd_max_pages its oldest page is freed.
+ */
+static void put_pages_on_tcd_daemon_list(struct page_collection *pc,
+					 struct cfs_trace_cpu_data *tcd)
+{
+	struct cfs_trace_page *tage;
+	struct cfs_trace_page *next;
+
+	spin_lock(&pc->pc_lock);
+	list_for_each_entry_safe(tage, next, &pc->pc_pages, linkage) {
+		struct cfs_trace_page *oldest;
+
+		__LASSERT_TAGE_INVARIANT(tage);
+
+		if (tage->cpu != tcd->tcd_cpu || tage->type != tcd->tcd_type)
+			continue;
+
+		cfs_tage_to_tail(tage, &tcd->tcd_daemon_pages);
+		tcd->tcd_cur_daemon_pages++;
+
+		if (tcd->tcd_cur_daemon_pages <= tcd->tcd_max_pages)
+			continue;
+
+		/* ring is over its limit: drop the page at the front */
+		__LASSERT(!list_empty(&tcd->tcd_daemon_pages));
+		oldest = cfs_tage_from_list(tcd->tcd_daemon_pages.next);
+
+		__LASSERT_TAGE_INVARIANT(oldest);
+
+		list_del(&oldest->linkage);
+		cfs_tage_free(oldest);
+		tcd->tcd_cur_daemon_pages--;
+	}
+	spin_unlock(&pc->pc_lock);
+}
+
+/*
+ * Distribute the pages of \a pc over the daemon ring buffers of all tcds on
+ * all possible CPUs.
+ */
+static void put_pages_on_daemon_list(struct page_collection *pc)
+{
+	struct cfs_trace_cpu_data *tcd;
+	int type;
+	int cpu;
+
+	cfs_for_each_possible_cpu(cpu) {
+		cfs_tcd_for_each_type_lock(tcd, type, cpu) {
+			put_pages_on_tcd_daemon_list(pc, tcd);
+		}
+	}
+}
+
+/*
+ * Print every buffered trace record (including daemon pages) to the console
+ * at D_EMERG level, freeing each page once it has been printed.
+ *
+ * NOTE(review): the parser below expects every record to start with a
+ * struct ptldebug_header followed by two NUL-terminated strings; this looks
+ * like the layout produced when binary headers are enabled and a function
+ * name is present - confirm against the writer.
+ */
+void cfs_trace_debug_print(void)
+{
+	struct page_collection pc;
+	struct cfs_trace_page *tage;
+	struct cfs_trace_page *tmp;
+
+	spin_lock_init(&pc.pc_lock);
+
+	pc.pc_want_daemon_pages = 1;
+	collect_pages(&pc);
+	list_for_each_entry_safe(tage, tmp, &pc.pc_pages, linkage) {
+		char *p, *file, *fn;
+		struct page *page;
+
+		__LASSERT_TAGE_INVARIANT(tage);
+
+		page = tage->page;
+		p = page_address(page);
+		/* walk the used part of the page one record at a time */
+		while (p < ((char *)page_address(page) + tage->used)) {
+			struct ptldebug_header *hdr;
+			int len;
+			hdr = (void *)p;
+			p += sizeof(*hdr);
+			file = p;
+			p += strlen(file) + 1;
+			fn = p;
+			p += strlen(fn) + 1;
+			/* ph_len covers the whole record; what is left is text */
+			len = hdr->ph_len - (int)(p - (char *)hdr);
+
+			cfs_print_to_console(hdr, D_EMERG, p, len, file, fn);
+
+			p += len;
+		}
+
+		list_del(&tage->linkage);
+		cfs_tage_free(tage);
+	}
+}
+
+/*
+ * Write all buffered trace pages (including daemon pages) to a newly
+ * created file and free them.
+ *
+ * \param filename  path of the dump file; it is opened with O_CREAT|O_EXCL,
+ *		    so the call fails if the file already exists
+ *
+ * If a write comes up short, the remaining pages are handed back to their
+ * CPUs and the failure is reported to the caller (the negative write result,
+ * or -EIO for a short write) instead of being masked by the fsync result.
+ *
+ * \retval 0	    success, or nothing to dump
+ * \retval -ve	    open, write or sync failure
+ */
+int cfs_tracefile_dump_all_pages(char *filename)
+{
+	struct page_collection	pc;
+	struct file		*filp;
+	struct cfs_trace_page	*tage;
+	struct cfs_trace_page	*tmp;
+	int write_rc = 0;
+	int rc;
+
+	DECL_MMSPACE;
+
+	cfs_tracefile_write_lock();
+
+	filp = filp_open(filename, O_CREAT|O_EXCL|O_WRONLY|O_LARGEFILE, 0600);
+	if (IS_ERR(filp)) {
+		rc = PTR_ERR(filp);
+		filp = NULL;
+		printk(KERN_ERR "LustreError: can't open %s for dump: rc %d\n",
+		      filename, rc);
+		goto out;
+	}
+
+	spin_lock_init(&pc.pc_lock);
+	pc.pc_want_daemon_pages = 1;
+	collect_pages(&pc);
+	if (list_empty(&pc.pc_pages)) {
+		rc = 0;
+		goto close;
+	}
+
+	/* ok, for now, just write the pages.  in the future we'll be building
+	 * iobufs with the pages and calling generic_direct_IO */
+	MMSPACE_OPEN;
+	list_for_each_entry_safe(tage, tmp, &pc.pc_pages, linkage) {
+
+		__LASSERT_TAGE_INVARIANT(tage);
+
+		rc = filp_write(filp, page_address(tage->page),
+				tage->used, filp_poff(filp));
+		if (rc != (int)tage->used) {
+			printk(KERN_WARNING "wanted to write %u but wrote "
+			       "%d\n", tage->used, rc);
+			put_pages_back(&pc);
+			__LASSERT(list_empty(&pc.pc_pages));
+			/* rc is reused for fsync below: keep the failure */
+			write_rc = (rc < 0) ? rc : -EIO;
+			break;
+		}
+		list_del(&tage->linkage);
+		cfs_tage_free(tage);
+	}
+	MMSPACE_CLOSE;
+	rc = filp_fsync(filp);
+	if (rc)
+		printk(KERN_ERR "sync returns %d\n", rc);
+	/* a write failure takes precedence over the sync result */
+	if (write_rc != 0)
+		rc = write_rc;
+close:
+	filp_close(filp, NULL);
+out:
+	cfs_tracefile_write_unlock();
+	return rc;
+}
+
+/*
+ * Discard all buffered trace data: collect every page (daemon pages
+ * included) and free them without writing anything out.
+ */
+void cfs_trace_flush_pages(void)
+{
+	struct page_collection pc;
+	struct cfs_trace_page *tage;
+
+	spin_lock_init(&pc.pc_lock);
+
+	pc.pc_want_daemon_pages = 1;
+	collect_pages(&pc);
+
+	while (!list_empty(&pc.pc_pages)) {
+		tage = cfs_tage_from_list(pc.pc_pages.next);
+
+		__LASSERT_TAGE_INVARIANT(tage);
+
+		list_del(&tage->linkage);
+		cfs_tage_free(tage);
+	}
+}
+
+/*
+ * Copy a string from user space into \a knl_buffer, strip trailing
+ * whitespace and NUL-terminate it.
+ *
+ * \param knl_buffer	  destination kernel buffer
+ * \param knl_buffer_nob  size of \a knl_buffer in bytes
+ * \param usr_buffer	  source user-space buffer
+ * \param usr_buffer_nob  number of bytes to copy from \a usr_buffer
+ *
+ * \retval 0		success
+ * \retval -EOVERFLOW	the input, or its terminator, does not fit
+ * \retval -EFAULT	the user buffer could not be read
+ * \retval -EINVAL	the string is empty or consists only of whitespace
+ */
+int cfs_trace_copyin_string(char *knl_buffer, int knl_buffer_nob,
+			    const char *usr_buffer, int usr_buffer_nob)
+{
+	int    nob;
+
+	if (usr_buffer_nob > knl_buffer_nob)
+		return -EOVERFLOW;
+
+	if (copy_from_user((void *)knl_buffer,
+			   (void *)usr_buffer, usr_buffer_nob))
+		return -EFAULT;
+
+	/*
+	 * Strip trailing whitespace.  Pre-decrement so the index is checked
+	 * before it is used: an empty or all-blank string must never make
+	 * us look at knl_buffer[-1].  On exit nob is the index of the last
+	 * non-blank character, or -1 if there is none.
+	 */
+	nob = strnlen(knl_buffer, usr_buffer_nob);
+	while (--nob >= 0)
+		if (!isspace(knl_buffer[nob]))
+			break;
+
+	if (nob < 0)			    /* empty string */
+		return -EINVAL;
+
+	/* the terminator goes to index nob + 1, which must be in bounds */
+	if (nob + 1 >= knl_buffer_nob)	      /* no space to terminate */
+		return -EOVERFLOW;
+
+	knl_buffer[nob + 1] = 0;		/* terminate */
+	return 0;
+}
+EXPORT_SYMBOL(cfs_trace_copyin_string);
+
+/*
+ * Copy the kernel string \a knl_buffer out to user space, truncated to
+ * \a usr_buffer_nob bytes.
+ *
+ * NB if 'append' != NULL, it's a single character to append to the
+ * copied out string - usually "\n", for /proc entries and "" (i.e. a
+ * terminating zero byte) for sysctl entries.  It is only appended when
+ * there is room left for it.
+ *
+ * Returns the number of bytes copied out, or -EFAULT.
+ */
+int cfs_trace_copyout_string(char *usr_buffer, int usr_buffer_nob,
+			     const char *knl_buffer, char *append)
+{
+	int nob = strlen(knl_buffer);
+
+	if (nob > usr_buffer_nob)
+		nob = usr_buffer_nob;
+
+	if (copy_to_user(usr_buffer, knl_buffer, nob))
+		return -EFAULT;
+
+	/* nothing to append, or no room left for it */
+	if (append == NULL || nob >= usr_buffer_nob)
+		return nob;
+
+	if (copy_to_user(usr_buffer + nob, append, 1))
+		return -EFAULT;
+
+	return nob + 1;
+}
+EXPORT_SYMBOL(cfs_trace_copyout_string);
+
+/*
+ * Allocate a zero-filled buffer of \a nob bytes and store it in \a *str.
+ *
+ * \retval 0	    success
+ * \retval -EINVAL  \a nob is larger than two pages
+ * \retval -ENOMEM  allocation failed
+ */
+int cfs_trace_allocate_string_buffer(char **str, int nob)
+{
+	char *buf;
+
+	if (nob > 2 * PAGE_CACHE_SIZE)	    /* string must be "sensible" */
+		return -EINVAL;
+
+	buf = kmalloc(nob, GFP_IOFS | __GFP_ZERO);
+	*str = buf;
+
+	return (buf == NULL) ? -ENOMEM : 0;
+}
+
+/*
+ * Free a buffer obtained from cfs_trace_allocate_string_buffer().
+ * \a nob is not used.
+ */
+void cfs_trace_free_string_buffer(char *str, int nob)
+{
+	kfree(str);
+}
+
+/*
+ * Dump all trace pages to the file named by a user-space string.
+ *
+ * The name is copied in and must be an absolute path (start with '/'),
+ * otherwise -EINVAL is returned.  Any other non-zero result comes from the
+ * buffer allocation, the copy-in, or cfs_tracefile_dump_all_pages().
+ */
+int cfs_trace_dump_debug_buffer_usrstr(void *usr_str, int usr_str_nob)
+{
+	char *path;
+	int   rc;
+
+	rc = cfs_trace_allocate_string_buffer(&path, usr_str_nob + 1);
+	if (rc != 0)
+		return rc;
+
+	rc = cfs_trace_copyin_string(path, usr_str_nob + 1,
+				     usr_str, usr_str_nob);
+	if (rc == 0) {
+		if (path[0] == '/')
+			rc = cfs_tracefile_dump_all_pages(path);
+		else
+			rc = -EINVAL;
+	}
+
+	cfs_trace_free_string_buffer(path, usr_str_nob + 1);
+	return rc;
+}
+
+/*
+ * Execute a debug daemon command:
+ *   "stop"	 stop the daemon thread and clear the trace file name
+ *   "size=<MB>" set the maximum trace file size; values outside 10..20480
+ *		 fall back to CFS_TRACEFILE_SIZE
+ *   "/<path>"	 set the trace file name and start the daemon thread
+ *
+ * \retval 0		  success
+ * \retval -ENAMETOOLONG  the path does not fit in cfs_tracefile
+ * \retval -EINVAL	  the path is not absolute
+ * \retval -ve		  the daemon thread could not be started
+ */
+int cfs_trace_daemon_command(char *str)
+{
+	int       rc = 0;
+
+	cfs_tracefile_write_lock();
+
+	if (strcmp(str, "stop") == 0) {
+		/* drop the lock while waiting for the thread to finish */
+		cfs_tracefile_write_unlock();
+		cfs_trace_stop_thread();
+		cfs_tracefile_write_lock();
+		memset(cfs_tracefile, 0, sizeof(cfs_tracefile));
+
+	} else if (strncmp(str, "size=", 5) == 0) {
+		cfs_tracefile_size = simple_strtoul(str + 5, NULL, 0);
+		if (cfs_tracefile_size < 10 || cfs_tracefile_size > 20480)
+			cfs_tracefile_size = CFS_TRACEFILE_SIZE;
+		else
+			cfs_tracefile_size <<= 20;	/* MB -> bytes */
+
+	} else if (strlen(str) >= sizeof(cfs_tracefile)) {
+		rc = -ENAMETOOLONG;
+	} else if (str[0] != '/') {
+		rc = -EINVAL;
+	} else {
+		strcpy(cfs_tracefile, str);
+
+		/* %lu needs an unsigned long argument */
+		printk(KERN_INFO
+		       "Lustre: debug daemon will attempt to start writing "
+		       "to %s (%lukB max)\n", cfs_tracefile,
+		       (unsigned long)(cfs_tracefile_size >> 10));
+
+		/* report a failed thread start instead of dropping it */
+		rc = cfs_trace_start_thread();
+	}
+
+	cfs_tracefile_write_unlock();
+	return rc;
+}
+
+/*
+ * Copy a daemon command in from user space and run it through
+ * cfs_trace_daemon_command().  Returns 0 or the first error encountered.
+ */
+int cfs_trace_daemon_command_usrstr(void *usr_str, int usr_str_nob)
+{
+	int   buf_nob = usr_str_nob + 1;
+	char *cmd;
+	int   rc;
+
+	rc = cfs_trace_allocate_string_buffer(&cmd, buf_nob);
+	if (rc != 0)
+		return rc;
+
+	rc = cfs_trace_copyin_string(cmd, buf_nob, usr_str, usr_str_nob);
+	if (rc == 0)
+		rc = cfs_trace_daemon_command(cmd);
+
+	cfs_trace_free_string_buffer(cmd, buf_nob);
+	return rc;
+}
+
+/*
+ * Set the total size of the debug buffer to \a mb megabytes.
+ *
+ * The value is raised to at least one MB per possible CPU and capped at
+ * cfs_trace_max_debug_mb(), then divided between CPUs; each tcd receives
+ * its tcd_pages_factor percent of the per-CPU page count.
+ *
+ * Always returns 0.
+ */
+int cfs_trace_set_debug_mb(int mb)
+{
+	struct cfs_trace_cpu_data *tcd;
+	int max_mb = cfs_trace_max_debug_mb();
+	int pages_per_cpu;
+	int type;
+	int cpu;
+
+	if (mb < num_possible_cpus()) {
+		printk(KERN_WARNING
+		       "Lustre: %d MB is too small for debug buffer size, "
+		       "setting it to %d MB.\n", mb, num_possible_cpus());
+		mb = num_possible_cpus();
+	}
+
+	if (mb > max_mb) {
+		printk(KERN_WARNING
+		       "Lustre: %d MB is too large for debug buffer size, "
+		       "setting it to %d MB.\n", mb, max_mb);
+		mb = max_mb;
+	}
+
+	/* per-CPU share, converted from megabytes to pages */
+	mb /= num_possible_cpus();
+	pages_per_cpu = mb << (20 - PAGE_CACHE_SHIFT);
+
+	cfs_tracefile_write_lock();
+
+	cfs_tcd_for_each(tcd, type, cpu)
+		tcd->tcd_max_pages =
+			(pages_per_cpu * tcd->tcd_pages_factor) / 100;
+
+	cfs_tracefile_write_unlock();
+
+	return 0;
+}
+
+/*
+ * Parse a debug buffer size (in MB) from a user-space string and apply it
+ * with cfs_trace_set_debug_mb().  Returns a negative errno if the string
+ * cannot be copied in.
+ */
+int cfs_trace_set_debug_mb_usrstr(void *usr_str, int usr_str_nob)
+{
+	char buf[32];
+	int  rc = cfs_trace_copyin_string(buf, sizeof(buf),
+					  usr_str, usr_str_nob);
+
+	if (rc < 0)
+		return rc;
+
+	return cfs_trace_set_debug_mb(simple_strtoul(buf, NULL, 0));
+}
+
+/*
+ * Return the configured debug buffer size in megabytes: the sum of
+ * tcd_max_pages over all tcds, converted to MB (rounded down) plus one.
+ */
+int cfs_trace_get_debug_mb(void)
+{
+	struct cfs_trace_cpu_data *tcd;
+	int sum_pages = 0;
+	int type;
+	int cpu;
+
+	cfs_tracefile_read_lock();
+
+	cfs_tcd_for_each(tcd, type, cpu)
+		sum_pages += tcd->tcd_max_pages;
+
+	cfs_tracefile_read_unlock();
+
+	return (sum_pages >> (20 - PAGE_CACHE_SHIFT)) + 1;
+}
+
+/*
+ * Body of the debug daemon thread.
+ *
+ * \param arg  the struct tracefiled_ctl used to synchronise with
+ *	       cfs_trace_start_thread() and cfs_trace_stop_thread()
+ *
+ * Each pass collects the trace pages of all CPUs, appends them to the file
+ * named by cfs_tracefile (wrapping at cfs_tracefile_size) and then moves
+ * them onto the per-CPU daemon lists.  Between passes the thread sleeps on
+ * tctl_waitq for up to one second.  After tctl_shutdown is set, one final
+ * pass is made before tctl_stop is completed.
+ *
+ * Always returns 0.
+ */
+static int tracefiled(void *arg)
+{
+	struct page_collection pc;
+	struct tracefiled_ctl *tctl = arg;
+	struct cfs_trace_page *tage;
+	struct cfs_trace_page *tmp;
+	struct file *filp;
+	int last_loop = 0;
+	int rc;
+
+	DECL_MMSPACE;
+
+	/* we're started late enough that we pick up init's fs context */
+	/* this is so broken in uml?  what on earth is going on? */
+
+	spin_lock_init(&pc.pc_lock);
+	complete(&tctl->tctl_start);
+
+	while (1) {
+		wait_queue_t __wait;
+
+		pc.pc_want_daemon_pages = 0;
+		collect_pages(&pc);
+		if (list_empty(&pc.pc_pages))
+			goto end_loop;
+
+		filp = NULL;
+		cfs_tracefile_read_lock();
+		if (cfs_tracefile[0] != 0) {
+			filp = filp_open(cfs_tracefile,
+					 O_CREAT | O_RDWR | O_LARGEFILE,
+					 0600);
+			if (IS_ERR(filp)) {
+				rc = PTR_ERR(filp);
+				filp = NULL;
+				printk(KERN_WARNING "couldn't open %s: "
+				       "%d\n", cfs_tracefile, rc);
+			}
+		}
+		cfs_tracefile_read_unlock();
+		if (filp == NULL) {
+			/* nowhere to write: keep the data on the daemon lists */
+			put_pages_on_daemon_list(&pc);
+			__LASSERT(list_empty(&pc.pc_pages));
+			goto end_loop;
+		}
+
+		MMSPACE_OPEN;
+
+		list_for_each_entry_safe(tage, tmp, &pc.pc_pages,
+						   linkage) {
+			/* write offset persists across passes */
+			static loff_t f_pos;
+
+			__LASSERT_TAGE_INVARIANT(tage);
+
+			if (f_pos >= (off_t)cfs_tracefile_size)
+				f_pos = 0;
+			else if (f_pos > (off_t)filp_size(filp))
+				f_pos = filp_size(filp);
+
+			rc = filp_write(filp, page_address(tage->page),
+					tage->used, &f_pos);
+			if (rc != (int)tage->used) {
+				printk(KERN_WARNING "wanted to write %u "
+				       "but wrote %d\n", tage->used, rc);
+				put_pages_back(&pc);
+				__LASSERT(list_empty(&pc.pc_pages));
+				/*
+				 * Every page, including 'tmp', has just been
+				 * moved back to a per-CPU list; continuing
+				 * the walk would follow 'tmp' into that
+				 * foreign list.
+				 */
+				break;
+			}
+		}
+		MMSPACE_CLOSE;
+
+		filp_close(filp, NULL);
+		put_pages_on_daemon_list(&pc);
+		if (!list_empty(&pc.pc_pages)) {
+			int i;
+
+			/* pages no tcd claimed: report them before asserting */
+			printk(KERN_ALERT "Lustre: trace pages aren't "
+			       " empty\n");
+			printk(KERN_ERR "total cpus(%d): ",
+			       num_possible_cpus());
+			for (i = 0; i < num_possible_cpus(); i++)
+				if (cpu_online(i))
+					printk(KERN_ERR "%d(on) ", i);
+				else
+					printk(KERN_ERR "%d(off) ", i);
+			printk(KERN_ERR "\n");
+
+			i = 0;
+			list_for_each_entry_safe(tage, tmp, &pc.pc_pages,
+						     linkage)
+				printk(KERN_ERR "page %d belongs to cpu "
+				       "%d\n", ++i, tage->cpu);
+			printk(KERN_ERR "There are %d pages unwritten\n",
+			       i);
+		}
+		__LASSERT(list_empty(&pc.pc_pages));
+end_loop:
+		if (atomic_read(&tctl->tctl_shutdown)) {
+			/* run one more pass to flush what arrived meanwhile */
+			if (last_loop == 0) {
+				last_loop = 1;
+				continue;
+			} else {
+				break;
+			}
+		}
+		init_waitqueue_entry_current(&__wait);
+		add_wait_queue(&tctl->tctl_waitq, &__wait);
+		set_current_state(TASK_INTERRUPTIBLE);
+		waitq_timedwait(&__wait, TASK_INTERRUPTIBLE,
+				    cfs_time_seconds(1));
+		remove_wait_queue(&tctl->tctl_waitq, &__wait);
+	}
+	complete(&tctl->tctl_stop);
+	return 0;
+}
+
+/*
+ * Start the tracefiled daemon thread unless it is already running, and wait
+ * until it has signalled tctl_start.
+ *
+ * \retval 0	    the thread is running
+ * \retval -ECHILD  the thread could not be created
+ */
+int cfs_trace_start_thread(void)
+{
+	struct tracefiled_ctl *ctl = &trace_tctl;
+	int rc = 0;
+
+	mutex_lock(&cfs_trace_thread_mutex);
+	if (!thread_running) {
+		init_completion(&ctl->tctl_start);
+		init_completion(&ctl->tctl_stop);
+		init_waitqueue_head(&ctl->tctl_waitq);
+		atomic_set(&ctl->tctl_shutdown, 0);
+
+		if (IS_ERR(kthread_run(tracefiled, ctl, "ktracefiled"))) {
+			rc = -ECHILD;
+		} else {
+			wait_for_completion(&ctl->tctl_start);
+			thread_running = 1;
+		}
+	}
+	mutex_unlock(&cfs_trace_thread_mutex);
+	return rc;
+}
+
+/*
+ * Ask the tracefiled daemon thread to shut down and wait until it has
+ * signalled tctl_stop.  Does nothing when the thread is not running.
+ */
+void cfs_trace_stop_thread(void)
+{
+	struct tracefiled_ctl *ctl = &trace_tctl;
+
+	mutex_lock(&cfs_trace_thread_mutex);
+	if (!thread_running)
+		goto unlock;
+
+	printk(KERN_INFO
+	       "Lustre: shutting down debug daemon thread...\n");
+	atomic_set(&ctl->tctl_shutdown, 1);
+	wait_for_completion(&ctl->tctl_stop);
+	thread_running = 0;
+unlock:
+	mutex_unlock(&cfs_trace_thread_mutex);
+}
+
+/*
+ * Initialise the trace buffers.
+ *
+ * After the arch-specific setup every tcd gets empty page lists, zeroed
+ * counters and a page limit of tcd_pages_factor percent of \a max_pages
+ * (which must come out greater than zero).
+ *
+ * Returns 0, or the error from cfs_tracefile_init_arch().
+ */
+int cfs_tracefile_init(int max_pages)
+{
+	struct cfs_trace_cpu_data *tcd;
+	int percent;
+	int type;
+	int cpu;
+	int rc = cfs_tracefile_init_arch();
+
+	if (rc != 0)
+		return rc;
+
+	cfs_tcd_for_each(tcd, type, cpu) {
+		/* tcd_pages_factor is initialized in tracefile_init_arch. */
+		percent = tcd->tcd_pages_factor;
+
+		INIT_LIST_HEAD(&tcd->tcd_pages);
+		tcd->tcd_cur_pages = 0;
+
+		INIT_LIST_HEAD(&tcd->tcd_stock_pages);
+		tcd->tcd_cur_stock_pages = 0;
+
+		INIT_LIST_HEAD(&tcd->tcd_daemon_pages);
+		tcd->tcd_cur_daemon_pages = 0;
+
+		tcd->tcd_max_pages = (max_pages * percent) / 100;
+		LASSERT(tcd->tcd_max_pages > 0);
+		tcd->tcd_shutting_down = 0;
+	}
+
+	return 0;
+}
+
+/*
+ * Mark every tcd on every possible CPU as shutting down and free all pages
+ * still queued on its tcd_pages list.
+ */
+static void trace_cleanup_on_all_cpus(void)
+{
+	struct cfs_trace_cpu_data *tcd;
+	struct cfs_trace_page *tage;
+	struct cfs_trace_page *next;
+	int type;
+	int cpu;
+
+	cfs_for_each_possible_cpu(cpu) {
+		cfs_tcd_for_each_type_lock(tcd, type, cpu) {
+			tcd->tcd_shutting_down = 1;
+
+			list_for_each_entry_safe(tage, next, &tcd->tcd_pages,
+						 linkage) {
+				__LASSERT_TAGE_INVARIANT(tage);
+
+				list_del(&tage->linkage);
+				cfs_tage_free(tage);
+			}
+
+			tcd->tcd_cur_pages = 0;
+		}
+	}
+}
+
+/*
+ * Free the trace pages held by all CPUs, then release the arch-specific
+ * tracefile state.
+ */
+static void cfs_trace_cleanup(void)
+{
+	trace_cleanup_on_all_cpus();
+
+	cfs_tracefile_fini_arch();
+}
+
+/* Tear down tracing: stop the daemon thread first, then free the buffers */
+void cfs_tracefile_exit(void)
+{
+	cfs_trace_stop_thread();
+	cfs_trace_cleanup();
+}

diff --git a/drivers/staging/lustre/lustre/libcfs/tracefile.h b/drivers/staging/lustre/lustre/libcfs/tracefile.h
new file mode 100644
index 0000000..7e8d17c
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/tracefile.h

@@ -0,0 +1,340 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LIBCFS_TRACEFILE_H__
+#define __LIBCFS_TRACEFILE_H__
+
+#include <linux/libcfs/libcfs.h>
+
+#include "linux/linux-tracefile.h"
+
+/* trace file lock routines */
+
+#define TRACEFILE_NAME_SIZE 1024
+extern char      cfs_tracefile[TRACEFILE_NAME_SIZE];
+extern long long cfs_tracefile_size;
+
+extern void libcfs_run_debug_log_upcall(char *file);
+
+int  cfs_tracefile_init_arch(void);
+void cfs_tracefile_fini_arch(void);
+
+void cfs_tracefile_read_lock(void);
+void cfs_tracefile_read_unlock(void);
+void cfs_tracefile_write_lock(void);
+void cfs_tracefile_write_unlock(void);
+
+int cfs_tracefile_dump_all_pages(char *filename);
+void cfs_trace_debug_print(void);
+void cfs_trace_flush_pages(void);
+int cfs_trace_start_thread(void);
+void cfs_trace_stop_thread(void);
+int cfs_tracefile_init(int max_pages);
+void cfs_tracefile_exit(void);
+
+
+
+int cfs_trace_copyin_string(char *knl_buffer, int knl_buffer_nob,
+			    const char *usr_buffer, int usr_buffer_nob);
+int cfs_trace_copyout_string(char *usr_buffer, int usr_buffer_nob,
+			     const char *knl_str, char *append);
+int cfs_trace_allocate_string_buffer(char **str, int nob);
+void cfs_trace_free_string_buffer(char *str, int nob);
+int cfs_trace_dump_debug_buffer_usrstr(void *usr_str, int usr_str_nob);
+int cfs_trace_daemon_command(char *str);
+int cfs_trace_daemon_command_usrstr(void *usr_str, int usr_str_nob);
+int cfs_trace_set_debug_mb(int mb);
+int cfs_trace_set_debug_mb_usrstr(void *usr_str, int usr_str_nob);
+int cfs_trace_get_debug_mb(void);
+
+extern void libcfs_debug_dumplog_internal(void *arg);
+extern void libcfs_register_panic_notifier(void);
+extern void libcfs_unregister_panic_notifier(void);
+extern int  libcfs_panic_in_progress;
+extern int  cfs_trace_max_debug_mb(void);
+
+/*
+ * Default trace buffer sizing.
+ *
+ * TCD_MAX_PAGES is 5 << 20 bytes expressed as a number of pages of
+ * (1 << PAGE_CACHE_SHIFT) bytes; TCD_STOCK_PAGES is the same value.
+ * CFS_TRACEFILE_SIZE is 500 << 20.
+ */
+#define TCD_MAX_PAGES (5 << (20 - PAGE_CACHE_SHIFT))
+#define TCD_STOCK_PAGES (TCD_MAX_PAGES)
+#define CFS_TRACEFILE_SIZE (500 << 20)
+
+#ifdef LUSTRE_TRACEFILE_PRIVATE
+
+/*
+ * Private declarations for tracefile.
+ *
+ * TCD_MAX_PAGES, TCD_STOCK_PAGES and CFS_TRACEFILE_SIZE are defined once,
+ * just above this section, and are deliberately not repeated here.
+ */
+
+/* Size, in bytes, of a buffer for formatting (sprintf-ing) console messages
+ * if we can't get a page from the system */
+#define CFS_TRACE_CONSOLE_BUFFER_SIZE   1024
+
+/*
+ * Per-CPU, per-type trace buffer descriptor.  The descriptor proper is
+ * struct cfs_trace_cpu_data; the union with __pad rounds its size up with
+ * L1_CACHE_ALIGN().
+ */
+union cfs_trace_data_union {
+	struct cfs_trace_cpu_data {
+		/*
+		 * Even though this structure is meant to be per-CPU, locking
+		 * is needed because in some places the data may be accessed
+		 * from other CPUs. This lock is directly used in trace_get_tcd
+		 * and trace_put_tcd, which are called in libcfs_debug_vmsg2 and
+		 * tcd_for_each_type_lock
+		 */
+		spinlock_t		tcd_lock;
+		unsigned long	   tcd_lock_flags;
+
+		/*
+		 * pages with trace records not yet processed by tracefiled.
+		 */
+		struct list_head	      tcd_pages;
+		/* number of pages on ->tcd_pages */
+		unsigned long	   tcd_cur_pages;
+
+		/*
+		 * pages with trace records already processed by
+		 * tracefiled. These pages are kept in memory, so that some
+		 * portion of log can be written in the event of LBUG. This
+		 * list is maintained in LRU order.
+		 *
+		 * Pages are moved to ->tcd_daemon_pages by tracefiled()
+		 * (put_pages_on_daemon_list()). LRU pages from this list are
+		 * discarded when list grows too large.
+		 */
+		struct list_head	      tcd_daemon_pages;
+		/* number of pages on ->tcd_daemon_pages */
+		unsigned long	   tcd_cur_daemon_pages;
+
+		/*
+		 * Maximal number of pages allowed on ->tcd_pages and
+		 * ->tcd_daemon_pages each.
+		 * Always TCD_MAX_PAGES * tcd_pages_factor / 100 in current
+		 * implementation.
+		 */
+		unsigned long	   tcd_max_pages;
+
+		/*
+		 * preallocated pages to write trace records into. Pages from
+		 * ->tcd_stock_pages are moved to ->tcd_pages by
+		 * portals_debug_msg().
+		 *
+		 * This list is necessary, because on some platforms it's
+		 * impossible to perform efficient atomic page allocation in a
+		 * non-blockable context.
+		 *
+		 * Such platforms fill ->tcd_stock_pages "on occasion", when
+		 * tracing code is entered in blockable context.
+		 *
+		 * trace_get_tage_try() tries to get a page from
+		 * ->tcd_stock_pages first and resorts to atomic page
+		 * allocation only if this queue is empty. ->tcd_stock_pages
+		 * is replenished when tracing code is entered in blocking
+		 * context (darwin-tracefile.c:trace_get_tcd()). We try to
+		 * maintain TCD_STOCK_PAGES (defined equal to TCD_MAX_PAGES)
+		 * pages in this queue. Atomic allocation is only required if
+		 * more than TCD_STOCK_PAGES pagesful are consumed by trace
+		 * records all emitted in non-blocking contexts. Which is
+		 * quite unlikely.
+		 */
+		struct list_head	      tcd_stock_pages;
+		/* number of pages on ->tcd_stock_pages */
+		unsigned long	   tcd_cur_stock_pages;
+
+		/* set to 1 by the cleanup path before the pages are freed */
+		unsigned short	  tcd_shutting_down;
+		unsigned short	  tcd_cpu;
+		unsigned short	  tcd_type;
+		/* The factors to share debug memory. */
+		unsigned short	  tcd_pages_factor;
+	} tcd;
+	/* pads the union to L1_CACHE_ALIGN(sizeof(struct cfs_trace_cpu_data)) */
+	char __pad[L1_CACHE_ALIGN(sizeof(struct cfs_trace_cpu_data))];
+};
+
+#define TCD_MAX_TYPES      8
+extern union cfs_trace_data_union (*cfs_trace_data[TCD_MAX_TYPES])[NR_CPUS];
+
+#define cfs_tcd_for_each(tcd, i, j)				       \
+    for (i = 0; cfs_trace_data[i] != NULL; i++)			   \
+	for (j = 0, ((tcd) = &(*cfs_trace_data[i])[j].tcd);	       \
+	     j < num_possible_cpus();				 \
+	     j++, (tcd) = &(*cfs_trace_data[i])[j].tcd)
+
+#define cfs_tcd_for_each_type_lock(tcd, i, cpu)			   \
+    for (i = 0; cfs_trace_data[i] &&				      \
+	 (tcd = &(*cfs_trace_data[i])[cpu].tcd) &&			\
+	 cfs_trace_lock_tcd(tcd, 1); cfs_trace_unlock_tcd(tcd, 1), i++)
+
+/* XXX nikita: this declaration is internal to tracefile.c and should probably
+ * be moved there */
+/*
+ * A list of trace pages gathered from the per-CPU descriptors, together
+ * with the lock protecting it and a flag selecting which per-CPU lists
+ * are gathered.
+ */
+struct page_collection {
+	struct list_head	pc_pages;
+	/*
+	 * spin-lock protecting ->pc_pages. It is taken by smp_call_function()
+	 * call-back functions. XXX nikita: Which is horrible: all processors
+	 * receive NMI at the same time only to be serialized by this
+	 * lock. Probably ->pc_pages should be replaced with an array of
+	 * NR_CPUS elements accessed locklessly.
+	 */
+	spinlock_t	pc_lock;
+	/*
+	 * if this flag is set, collect_pages() will spill both
+	 * ->tcd_daemon_pages and ->tcd_pages to the ->pc_pages. Otherwise,
+	 * only ->tcd_pages are spilled.
+	 */
+	int		pc_want_daemon_pages;
+};
+
+/* XXX nikita: this declaration is internal to tracefile.c and should probably
+ * be moved there */
+/*
+ * Control block of the trace daemon thread.
+ */
+struct tracefiled_ctl {
+	/* NOTE(review): presumably completed by the daemon once it is
+	 * running - confirm in tracefile.c */
+	struct completion	tctl_start;
+	/* the shutdown path in tracefile.c waits on this after setting
+	 * tctl_shutdown */
+	struct completion	tctl_stop;
+	/* NOTE(review): presumably where the daemon sleeps between
+	 * passes - confirm in tracefile.c */
+	wait_queue_head_t		tctl_waitq;
+	pid_t			tctl_pid;
+	/* set to 1 by the shutdown path to ask the daemon to exit */
+	atomic_t		tctl_shutdown;
+};
+
+/*
+ * small data-structure for each page owned by tracefiled.
+ *
+ * __LASSERT_TAGE_INVARIANT() checks the expected state of one of these:
+ * non-NULL ->page with a positive page count and
+ * ->used <= PAGE_CACHE_SIZE.
+ */
+/* XXX nikita: this declaration is internal to tracefile.c and should probably
+ * be moved there */
+struct cfs_trace_page {
+	/*
+	 * page itself
+	 */
+	struct page	  *page;
+	/*
+	 * linkage into one of the lists in trace_data_union
+	 * (e.g. ->tcd_pages) or page_collection
+	 */
+	struct list_head	   linkage;
+	/*
+	 * number of bytes used within this page
+	 */
+	unsigned int	 used;
+	/*
+	 * cpu that owns this page
+	 */
+	unsigned short       cpu;
+	/*
+	 * type(context) of this page
+	 */
+	unsigned short       type;
+};
+
+extern void cfs_set_ptldebug_header(struct ptldebug_header *header,
+				    struct libcfs_debug_msg_data *m,
+				    unsigned long stack);
+extern void cfs_print_to_console(struct ptldebug_header *hdr, int mask,
+				 const char *buf, int len, const char *file,
+				 const char *fn);
+
+extern int cfs_trace_lock_tcd(struct cfs_trace_cpu_data *tcd, int walking);
+extern void cfs_trace_unlock_tcd(struct cfs_trace_cpu_data *tcd, int walking);
+
+/**
+ * cfs_trace_buf_type_t, cfs_trace_buf_idx_get() and
+ * cfs_trace_console_buffers[][] are not public libcfs API; they should be
+ * defined in platform-specific tracefile include files
+ * (see, for example, linux-tracefile.h).
+ */
+
+/* console message buffers, indexed [cpu][buffer type] */
+extern char *cfs_trace_console_buffers[NR_CPUS][CFS_TCD_TYPE_MAX];
+/* returns the buffer-type index used to select a buffer or descriptor;
+ * its definition is not part of this header */
+extern cfs_trace_buf_type_t cfs_trace_buf_idx_get(void);
+
+/*
+ * Return the console message buffer belonging to the current CPU and the
+ * current buffer type.
+ *
+ * get_cpu() is called here and is only balanced by the put_cpu() in
+ * cfs_trace_put_console_buffer(), so the two calls must be paired.
+ */
+static inline char *
+cfs_trace_get_console_buffer(void)
+{
+	unsigned int cpu;
+	unsigned int type;
+
+	/* take the CPU first, then pick the buffer type */
+	cpu = get_cpu();
+	type = cfs_trace_buf_idx_get();
+
+	return cfs_trace_console_buffers[cpu][type];
+}
+
+/*
+ * Counterpart of cfs_trace_get_console_buffer(): issues the put_cpu() that
+ * balances the get_cpu() done there.  The buffer argument is not used.
+ */
+static inline void
+cfs_trace_put_console_buffer(char *buffer)
+{
+	put_cpu();
+}
+
+/*
+ * Return the trace descriptor of the current CPU for the current buffer
+ * type, locked with cfs_trace_lock_tcd(tcd, 0).
+ *
+ * get_cpu() is called here; the matching unlock and put_cpu() are done by
+ * cfs_trace_put_tcd(), so the two calls must be paired.
+ */
+static inline struct cfs_trace_cpu_data *
+cfs_trace_get_tcd(void)
+{
+	struct cfs_trace_cpu_data *tcd;
+	unsigned int cpu;
+	unsigned int type;
+
+	/*
+	 * Call get_cpu() in a statement of its own, the same way
+	 * cfs_trace_get_console_buffer() does: when both calls sit in one
+	 * indexing expression, C leaves their relative order unspecified.
+	 */
+	cpu = get_cpu();
+	type = cfs_trace_buf_idx_get();
+	tcd = &(*cfs_trace_data[type])[cpu].tcd;
+
+	cfs_trace_lock_tcd(tcd, 0);
+
+	return tcd;
+}
+
+/*
+ * Counterpart of cfs_trace_get_tcd(): unlock the descriptor with
+ * cfs_trace_unlock_tcd(tcd, 0), then issue the put_cpu() that balances the
+ * get_cpu() done in cfs_trace_get_tcd().
+ */
+static inline void
+cfs_trace_put_tcd (struct cfs_trace_cpu_data *tcd)
+{
+	cfs_trace_unlock_tcd(tcd, 0);
+
+	put_cpu();
+}
+
+int cfs_trace_refill_stock(struct cfs_trace_cpu_data *tcd, int gfp,
+			   struct list_head *stock);
+
+
+int cfs_tcd_owns_tage(struct cfs_trace_cpu_data *tcd,
+		      struct cfs_trace_page *tage);
+
+extern void cfs_trace_assertion_failed(const char *str,
+				       struct libcfs_debug_msg_data *m);
+
+/* ASSERTION that is safe to use within the debug system */
+/*
+ * When cond is false, builds a D_EMERG message descriptor and hands it,
+ * together with the stringified condition, to
+ * cfs_trace_assertion_failed().  The text of cond becomes part of the
+ * reported message, so renaming a variable used in cond changes the
+ * message as well.
+ */
+#define __LASSERT(cond)						 \
+do {								    \
+	if (unlikely(!(cond))) {					\
+		LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_EMERG, NULL);     \
+		cfs_trace_assertion_failed("ASSERTION("#cond") failed", \
+					   &msgdata);		   \
+	}							       \
+} while (0)
+
+/*
+ * Sanity checks on a struct cfs_trace_page: the pointer and its ->page are
+ * non-NULL, ->used does not exceed PAGE_CACHE_SIZE and the page still has
+ * a positive reference count.
+ *
+ * tage is substituted unparenthesised and is evaluated several times, so
+ * pass a plain variable name without side effects.
+ */
+#define __LASSERT_TAGE_INVARIANT(tage)				  \
+do {								    \
+	__LASSERT(tage != NULL);					\
+	__LASSERT(tage->page != NULL);				  \
+	__LASSERT(tage->used <= PAGE_CACHE_SIZE);			 \
+	__LASSERT(page_count(tage->page) > 0);		      \
+} while (0)
+
+#endif	/* LUSTRE_TRACEFILE_PRIVATE */
+
+#endif /* __LIBCFS_TRACEFILE_H__ */

diff --git a/drivers/staging/lustre/lustre/libcfs/upcall_cache.c b/drivers/staging/lustre/lustre/libcfs/upcall_cache.c
new file mode 100644
index 0000000..18c68c3
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/upcall_cache.c

@@ -0,0 +1,462 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/libcfs/upcall_cache.c
+ *
+ * Supplementary groups cache.
+ */
+#define DEBUG_SUBSYSTEM S_SEC
+
+#include <linux/libcfs/lucache.h>
+
+/*
+ * Allocate a cache entry for key and put it in its initial state: flagged
+ * NEW, not linked into any hash chain, reference count 0, empty wait
+ * queue.  The cache's optional init_entry callback runs last, with args.
+ *
+ * Returns the new entry, or NULL when the allocation fails.
+ */
+static struct upcall_cache_entry *alloc_entry(struct upcall_cache *cache,
+					      __u64 key, void *args)
+{
+	struct upcall_cache_entry *entry;
+
+	LIBCFS_ALLOC(entry, sizeof(*entry));
+	if (entry == NULL)
+		return NULL;
+
+	entry->ue_key = key;
+	UC_CACHE_SET_NEW(entry);
+	atomic_set(&entry->ue_refcount, 0);
+	INIT_LIST_HEAD(&entry->ue_hash);
+	init_waitqueue_head(&entry->ue_waitq);
+
+	/* cache-specific initialisation is optional */
+	if (cache->uc_ops->init_entry != NULL)
+		cache->uc_ops->init_entry(entry, args);
+
+	return entry;
+}
+
+/* protected by cache lock */
+/*
+ * Destroy an entry: run the cache's optional free_entry callback, unlink
+ * the entry from its hash chain, log the destruction and release the
+ * memory.  The entry must not be used after this returns.
+ */
+static void free_entry(struct upcall_cache *cache,
+		       struct upcall_cache_entry *entry)
+{
+	if (cache->uc_ops->free_entry)
+		cache->uc_ops->free_entry(cache, entry);
+
+	list_del(&entry->ue_hash);
+	CDEBUG(D_OTHER, "destroy cache entry %p for key "LPU64"\n",
+	       entry, entry->ue_key);
+	LIBCFS_FREE(entry, sizeof(*entry));
+}
+
+/*
+ * Decide whether entry matches an upcall lookup for key/args.
+ *
+ * Returns -1 when the keys differ.  When they are equal, returns the
+ * result of the cache's optional upcall_compare callback, or 0 (match)
+ * when the cache has no such callback.
+ */
+static inline int upcall_compare(struct upcall_cache *cache,
+				 struct upcall_cache_entry *entry,
+				 __u64 key, void *args)
+{
+	if (entry->ue_key == key) {
+		if (cache->uc_ops->upcall_compare == NULL)
+			return 0;
+
+		return cache->uc_ops->upcall_compare(cache, entry, key, args);
+	}
+
+	return -1;
+}
+
+/*
+ * Decide whether entry matches a downcall for key/args.
+ *
+ * Returns -1 when the keys differ.  When they are equal, returns the
+ * result of the cache's optional downcall_compare callback, or 0 (match)
+ * when the cache has no such callback.
+ */
+static inline int downcall_compare(struct upcall_cache *cache,
+				   struct upcall_cache_entry *entry,
+				   __u64 key, void *args)
+{
+	if (entry->ue_key == key) {
+		if (cache->uc_ops->downcall_compare == NULL)
+			return 0;
+
+		return cache->uc_ops->downcall_compare(cache, entry, key, args);
+	}
+
+	return -1;
+}
+
+/* Take one reference on entry. */
+static inline void get_entry(struct upcall_cache_entry *entry)
+{
+	atomic_inc(&entry->ue_refcount);
+}
+
+/*
+ * Drop one reference on entry.  When that was the last reference and the
+ * entry is flagged INVALID or EXPIRED, the entry is destroyed with
+ * free_entry(); otherwise it is left alone.
+ */
+static inline void put_entry(struct upcall_cache *cache,
+			     struct upcall_cache_entry *entry)
+{
+	/* other holders remain: nothing more to do */
+	if (!atomic_dec_and_test(&entry->ue_refcount))
+		return;
+
+	if (UC_CACHE_IS_INVALID(entry) || UC_CACHE_IS_EXPIRED(entry))
+		free_entry(cache, entry);
+}
+
+/*
+ * Check whether entry is still usable and unlink it from its hash chain
+ * if it is not.
+ *
+ * Returns 0, leaving the entry untouched, when it is VALID and not yet
+ * past ue_expire, or when it is ACQUIRING and either has no acquire
+ * deadline (ue_acquire_expire == 0) or has not reached it.
+ *
+ * Otherwise returns 1 after unlinking the entry; the entry is also freed
+ * when nobody holds a reference, in which case the caller must not touch
+ * it again.  Callers in this file invoke this with uc_lock held.
+ */
+static int check_unlink_entry(struct upcall_cache *cache,
+			      struct upcall_cache_entry *entry)
+{
+	if (UC_CACHE_IS_VALID(entry) &&
+	    cfs_time_before(cfs_time_current(), entry->ue_expire))
+		return 0;
+
+	if (UC_CACHE_IS_ACQUIRING(entry)) {
+		if (entry->ue_acquire_expire == 0 ||
+		    cfs_time_before(cfs_time_current(),
+				    entry->ue_acquire_expire))
+			return 0;
+
+		/* the upcall took too long: expire and release the waiters */
+		UC_CACHE_SET_EXPIRED(entry);
+		wake_up_all(&entry->ue_waitq);
+	} else if (!UC_CACHE_IS_INVALID(entry)) {
+		UC_CACHE_SET_EXPIRED(entry);
+	}
+
+	/* list_del_init() so that a later free_entry() can unlink again */
+	list_del_init(&entry->ue_hash);
+	if (!atomic_read(&entry->ue_refcount))
+		free_entry(cache, entry);
+	return 1;
+}
+
+/*
+ * Start the upcall for entry through the cache's do_upcall callback, which
+ * is mandatory (asserted), and return that callback's result.
+ */
+static inline int refresh_entry(struct upcall_cache *cache,
+			 struct upcall_cache_entry *entry)
+{
+	LASSERT(cache->uc_ops->do_upcall);
+	return cache->uc_ops->do_upcall(cache, entry);
+}
+
+/*
+ * Look up the entry matching key/args, creating it and starting its
+ * upcall when it does not exist yet, and wait for a pending upcall to
+ * finish.  On success the entry is returned with one reference held for
+ * the caller (see upcall_cache_put_entry()).
+ *
+ * On failure an ERR_PTR() is returned and no reference is kept:
+ *   -ENOMEM     a new entry could not be allocated
+ *   -EREMCHG    returned by the do_upcall callback and passed through
+ *   -EINTR      the wait ended with time left while the entry was still
+ *               being acquired
+ *   -ETIMEDOUT  the wait ran out of time while the entry was still being
+ *               acquired
+ *   -EIDRM      the entry is flagged INVALID
+ */
+struct upcall_cache_entry *upcall_cache_get_entry(struct upcall_cache *cache,
+						  __u64 key, void *args)
+{
+	struct upcall_cache_entry *entry = NULL, *new = NULL, *next;
+	struct list_head *head;
+	wait_queue_t wait;
+	int rc, found;
+	ENTRY;
+
+	LASSERT(cache);
+
+	head = &cache->uc_hashtable[UC_CACHE_HASH_INDEX(key)];
+find_again:
+	found = 0;
+	spin_lock(&cache->uc_lock);
+	/* _safe walk: check_unlink_entry() may unlink and free entries */
+	list_for_each_entry_safe(entry, next, head, ue_hash) {
+		/* check invalid & expired items */
+		if (check_unlink_entry(cache, entry))
+			continue;
+		if (upcall_compare(cache, entry, key, args) == 0) {
+			found = 1;
+			break;
+		}
+	}
+
+	if (!found) {
+		if (!new) {
+			/* allocate with the lock dropped, then search the
+			 * chain again in case a match was added meanwhile */
+			spin_unlock(&cache->uc_lock);
+			new = alloc_entry(cache, key, args);
+			if (!new) {
+				CERROR("fail to alloc entry\n");
+				RETURN(ERR_PTR(-ENOMEM));
+			}
+			goto find_again;
+		} else {
+			list_add(&new->ue_hash, head);
+			entry = new;
+		}
+	} else {
+		if (new) {
+			/* a matching entry showed up while we were
+			 * allocating; ours is not needed */
+			free_entry(cache, new);
+			new = NULL;
+		}
+		/* move the hit to the front of its chain */
+		list_move(&entry->ue_hash, head);
+	}
+	get_entry(entry);
+
+	/* acquire for new one */
+	if (UC_CACHE_IS_NEW(entry)) {
+		UC_CACHE_SET_ACQUIRING(entry);
+		UC_CACHE_CLEAR_NEW(entry);
+		/* the upcall itself runs without uc_lock held */
+		spin_unlock(&cache->uc_lock);
+		rc = refresh_entry(cache, entry);
+		spin_lock(&cache->uc_lock);
+		entry->ue_acquire_expire =
+			cfs_time_shift(cache->uc_acquire_expire);
+		if (rc < 0) {
+			UC_CACHE_CLEAR_ACQUIRING(entry);
+			UC_CACHE_SET_INVALID(entry);
+			wake_up_all(&entry->ue_waitq);
+			if (unlikely(rc == -EREMCHG)) {
+				put_entry(cache, entry);
+				GOTO(out, entry = ERR_PTR(rc));
+			}
+		}
+	}
+	/* someone (and only one) is doing upcall upon this item,
+	 * wait it to complete */
+	if (UC_CACHE_IS_ACQUIRING(entry)) {
+		/* the creator of the entry waits at most uc_acquire_expire
+		 * seconds; everybody else waits without a timeout */
+		long expiry = (entry == new) ?
+			      cfs_time_seconds(cache->uc_acquire_expire) :
+			      MAX_SCHEDULE_TIMEOUT;
+		long left;
+
+		init_waitqueue_entry_current(&wait);
+		add_wait_queue(&entry->ue_waitq, &wait);
+		set_current_state(TASK_INTERRUPTIBLE);
+		spin_unlock(&cache->uc_lock);
+
+		left = waitq_timedwait(&wait, TASK_INTERRUPTIBLE,
+					   expiry);
+
+		spin_lock(&cache->uc_lock);
+		remove_wait_queue(&entry->ue_waitq, &wait);
+		if (UC_CACHE_IS_ACQUIRING(entry)) {
+			/* we're interrupted or upcall failed in the middle */
+			rc = left > 0 ? -EINTR : -ETIMEDOUT;
+			CERROR("acquire for key "LPU64": error %d\n",
+			       entry->ue_key, rc);
+			put_entry(cache, entry);
+			GOTO(out, entry = ERR_PTR(rc));
+		}
+	}
+
+	/* invalid means error, don't need to try again */
+	if (UC_CACHE_IS_INVALID(entry)) {
+		put_entry(cache, entry);
+		GOTO(out, entry = ERR_PTR(-EIDRM));
+	}
+
+	/* check expired
+	 * We can't refresh the existing one because some
+	 * memory might be shared by multiple processes.
+	 */
+	if (check_unlink_entry(cache, entry)) {
+		/* if expired, try again. but if this entry is
+		 * created by me but too quickly turn to expired
+		 * without any error, should at least give a
+		 * chance to use it once.
+		 */
+		if (entry != new) {
+			put_entry(cache, entry);
+			spin_unlock(&cache->uc_lock);
+			new = NULL;
+			goto find_again;
+		}
+	}
+
+	/* Now we know it's good */
+out:
+	spin_unlock(&cache->uc_lock);
+	RETURN(entry);
+}
+EXPORT_SYMBOL(upcall_cache_get_entry);
+
+/*
+ * Release a reference on entry.  A NULL entry is accepted and ignored.
+ * The reference is dropped with uc_lock held because put_entry() may
+ * destroy the entry.
+ */
+void upcall_cache_put_entry(struct upcall_cache *cache,
+			    struct upcall_cache_entry *entry)
+{
+	ENTRY;
+
+	if (!entry) {
+		EXIT;
+		return;
+	}
+
+	LASSERT(atomic_read(&entry->ue_refcount) > 0);
+	spin_lock(&cache->uc_lock);
+	put_entry(cache, entry);
+	spin_unlock(&cache->uc_lock);
+	EXIT;
+}
+EXPORT_SYMBOL(upcall_cache_put_entry);
+
+/*
+ * Deliver the result of an upcall for key to the matching cache entry and
+ * wake up everybody waiting on it.
+ *
+ * err is the status reported for the upcall; any non-zero value marks the
+ * entry INVALID.  With err == 0 and the entry still being acquired, the
+ * cache's optional parse_downcall callback is run on args (without
+ * uc_lock held); on success the entry becomes VALID and expires
+ * uc_entry_expire seconds from now.
+ *
+ * Returns 0 on success or when the entry was already up to date, -EINVAL
+ * when no entry matches, when err is non-zero or when the entry is stale,
+ * and otherwise the non-zero value returned by parse_downcall.
+ */
+int upcall_cache_downcall(struct upcall_cache *cache, __u32 err, __u64 key,
+			  void *args)
+{
+	struct upcall_cache_entry *entry = NULL;
+	struct list_head *head;
+	int found = 0, rc = 0;
+	ENTRY;
+
+	LASSERT(cache);
+
+	head = &cache->uc_hashtable[UC_CACHE_HASH_INDEX(key)];
+
+	spin_lock(&cache->uc_lock);
+	list_for_each_entry(entry, head, ue_hash) {
+		if (downcall_compare(cache, entry, key, args) == 0) {
+			found = 1;
+			/* keep the entry alive while the lock is dropped
+			 * around parse_downcall below */
+			get_entry(entry);
+			break;
+		}
+	}
+
+	if (!found) {
+		CDEBUG(D_OTHER, "%s: upcall for key "LPU64" not expected\n",
+		       cache->uc_name, key);
+		/* haven't found, it's possible */
+		spin_unlock(&cache->uc_lock);
+		RETURN(-EINVAL);
+	}
+
+	if (err) {
+		CDEBUG(D_OTHER, "%s: upcall for key "LPU64" returned %d\n",
+		       cache->uc_name, entry->ue_key, err);
+		GOTO(out, rc = -EINVAL);
+	}
+
+	if (!UC_CACHE_IS_ACQUIRING(entry)) {
+		CDEBUG(D_RPCTRACE,"%s: found uptodate entry %p (key "LPU64")\n",
+		       cache->uc_name, entry, entry->ue_key);
+		GOTO(out, rc = 0);
+	}
+
+	if (UC_CACHE_IS_INVALID(entry) || UC_CACHE_IS_EXPIRED(entry)) {
+		CERROR("%s: found a stale entry %p (key "LPU64") in ioctl\n",
+		       cache->uc_name, entry, entry->ue_key);
+		GOTO(out, rc = -EINVAL);
+	}
+
+	spin_unlock(&cache->uc_lock);
+	if (cache->uc_ops->parse_downcall)
+		rc = cache->uc_ops->parse_downcall(cache, entry, args);
+	spin_lock(&cache->uc_lock);
+	if (rc)
+		GOTO(out, rc);
+
+	entry->ue_expire = cfs_time_shift(cache->uc_entry_expire);
+	UC_CACHE_SET_VALID(entry);
+	CDEBUG(D_OTHER, "%s: created upcall cache entry %p for key "LPU64"\n",
+	       cache->uc_name, entry, entry->ue_key);
+out:
+	if (rc) {
+		UC_CACHE_SET_INVALID(entry);
+		list_del_init(&entry->ue_hash);
+	}
+	UC_CACHE_CLEAR_ACQUIRING(entry);
+	/* wake the waiters before our reference goes away */
+	wake_up_all(&entry->ue_waitq);
+	/*
+	 * Drop our reference while uc_lock is still held, exactly like
+	 * upcall_cache_put_entry(): put_entry() may end up in free_entry(),
+	 * which unlinks the entry from the hash chain and must run under
+	 * the cache lock.
+	 */
+	put_entry(cache, entry);
+	spin_unlock(&cache->uc_lock);
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(upcall_cache_downcall);
+
+/*
+ * Flush every hash chain of the cache under uc_lock.
+ *
+ * With force == 0, entries that are still referenced are only flagged
+ * EXPIRED and left in place; unreferenced entries are freed.
+ * With force != 0, every entry is freed, and it is asserted that none of
+ * them is still referenced.
+ */
+static void cache_flush(struct upcall_cache *cache, int force)
+{
+	struct upcall_cache_entry *entry, *next;
+	int i;
+	ENTRY;
+
+	spin_lock(&cache->uc_lock);
+	for (i = 0; i < UC_CACHE_HASH_SIZE; i++) {
+		/* _safe walk: entries are freed while iterating */
+		list_for_each_entry_safe(entry, next,
+					 &cache->uc_hashtable[i], ue_hash) {
+			if (!force && atomic_read(&entry->ue_refcount)) {
+				UC_CACHE_SET_EXPIRED(entry);
+				continue;
+			}
+			LASSERT(!atomic_read(&entry->ue_refcount));
+			free_entry(cache, entry);
+		}
+	}
+	spin_unlock(&cache->uc_lock);
+	EXIT;
+}
+
+/*
+ * Free every unreferenced entry of the cache; entries still in use are
+ * flagged EXPIRED instead (cache_flush() with force == 0).
+ */
+void upcall_cache_flush_idle(struct upcall_cache *cache)
+{
+	cache_flush(cache, 0);
+}
+EXPORT_SYMBOL(upcall_cache_flush_idle);
+
+/*
+ * Free every entry of the cache (cache_flush() with force == 1).  No entry
+ * may still be referenced: cache_flush() asserts this.
+ */
+void upcall_cache_flush_all(struct upcall_cache *cache)
+{
+	cache_flush(cache, 1);
+}
+EXPORT_SYMBOL(upcall_cache_flush_all);
+
+/*
+ * Expire the single entry matching key/args, if there is one: the entry
+ * is logged, flagged EXPIRED and, when nobody holds a reference, freed on
+ * the spot.  Nothing happens when no entry matches.
+ */
+void upcall_cache_flush_one(struct upcall_cache *cache, __u64 key, void *args)
+{
+	struct list_head *head;
+	struct upcall_cache_entry *entry;
+	int found = 0;
+	ENTRY;
+
+	head = &cache->uc_hashtable[UC_CACHE_HASH_INDEX(key)];
+
+	spin_lock(&cache->uc_lock);
+	list_for_each_entry(entry, head, ue_hash) {
+		if (upcall_compare(cache, entry, key, args) == 0) {
+			found = 1;
+			break;
+		}
+	}
+
+	if (found) {
+		CWARN("%s: flush entry %p: key "LPU64", ref %d, fl %x, "
+		      "cur %lu, ex %ld/%ld\n",
+		      cache->uc_name, entry, entry->ue_key,
+		      atomic_read(&entry->ue_refcount), entry->ue_flags,
+		      cfs_time_current_sec(), entry->ue_acquire_expire,
+		      entry->ue_expire);
+		UC_CACHE_SET_EXPIRED(entry);
+		if (!atomic_read(&entry->ue_refcount))
+			free_entry(cache, entry);
+	}
+	spin_unlock(&cache->uc_lock);
+	/* balance the ENTRY above, as the other functions in this file do */
+	EXIT;
+}
+EXPORT_SYMBOL(upcall_cache_flush_one);
+
+/*
+ * Allocate and initialise an upcall cache.
+ *
+ * name and upcall are copied into the cache, truncated to fit uc_name and
+ * uc_upcall if necessary; ops supplies the cache-specific callbacks.
+ * Entries default to a 20 minute lifetime (uc_entry_expire) and a
+ * 30 second acquire timeout (uc_acquire_expire).
+ *
+ * Returns the new cache, or ERR_PTR(-ENOMEM) when the allocation fails.
+ */
+struct upcall_cache *upcall_cache_init(const char *name, const char *upcall,
+				       struct upcall_cache_ops *ops)
+{
+	struct upcall_cache *cache;
+	int i;
+	ENTRY;
+
+	LIBCFS_ALLOC(cache, sizeof(*cache));
+	if (!cache)
+		RETURN(ERR_PTR(-ENOMEM));
+
+	spin_lock_init(&cache->uc_lock);
+	rwlock_init(&cache->uc_upcall_rwlock);
+	for (i = 0; i < UC_CACHE_HASH_SIZE; i++)
+		INIT_LIST_HEAD(&cache->uc_hashtable[i]);
+	/*
+	 * strncpy() does not terminate the destination when the source is
+	 * too long, so terminate both strings explicitly rather than
+	 * depending on the allocator having zeroed the structure.
+	 */
+	strncpy(cache->uc_name, name, sizeof(cache->uc_name) - 1);
+	cache->uc_name[sizeof(cache->uc_name) - 1] = '\0';
+	/* upcall pathname proc tunable */
+	strncpy(cache->uc_upcall, upcall, sizeof(cache->uc_upcall) - 1);
+	cache->uc_upcall[sizeof(cache->uc_upcall) - 1] = '\0';
+	cache->uc_entry_expire = 20 * 60;
+	cache->uc_acquire_expire = 30;
+	cache->uc_ops = ops;
+
+	RETURN(cache);
+}
+EXPORT_SYMBOL(upcall_cache_init);
+
+/*
+ * Destroy a cache created by upcall_cache_init(): free all of its entries
+ * (none may still be referenced, see upcall_cache_flush_all()) and then
+ * the cache itself.  A NULL cache is accepted and ignored.
+ */
+void upcall_cache_cleanup(struct upcall_cache *cache)
+{
+	if (!cache)
+		return;
+	upcall_cache_flush_all(cache);
+	LIBCFS_FREE(cache, sizeof(*cache));
+}
+EXPORT_SYMBOL(upcall_cache_cleanup);

diff --git a/drivers/staging/lustre/lustre/libcfs/watchdog.c b/drivers/staging/lustre/lustre/libcfs/watchdog.c
new file mode 100644
index 0000000..7c385ad
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/watchdog.c

@@ -0,0 +1,516 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/libcfs/watchdog.c
+ *
+ * Author: Jacob Berkman <jacob@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/libcfs/libcfs.h>
+#include "tracefile.h"
+
+/*
+ * One watchdog, created by lc_watchdog_add() for the calling task.  When
+ * its timer fires while the watchdog is enabled, the watchdog is queued
+ * on the pending list and the dispatcher thread later runs lcw_callback.
+ */
+struct lc_watchdog {
+	spinlock_t  lcw_lock;     /* check or change lcw_list */
+	int	     lcw_refcount; /* must hold lcw_pending_timers_lock */
+	timer_list_t     lcw_timer;    /* kernel timer */
+	struct list_head      lcw_list;     /* chain on pending list */
+	cfs_time_t      lcw_last_touched; /* last touched stamp */
+	task_t     *lcw_task;     /* owner task */
+	/* run by the dispatcher as lcw_callback(lcw_pid, lcw_data);
+	 * lc_watchdog_add() installs lc_watchdog_dumplog when the caller
+	 * passes no callback */
+	void	  (*lcw_callback)(pid_t, void *);
+	void	   *lcw_data;
+
+	/* pid of the task that created the watchdog */
+	pid_t	   lcw_pid;
+
+	enum {
+		LC_WATCHDOG_DISABLED,
+		LC_WATCHDOG_ENABLED,
+		LC_WATCHDOG_EXPIRED
+	} lcw_state;
+};
+
+#ifdef WITH_WATCHDOG
+/*
+ * The dispatcher will complete lcw_start_completion when it starts,
+ * and lcw_stop_completion when it exits.
+ * Wake lcw_event_waitq to signal timer callback dispatches.
+ */
+static struct completion lcw_start_completion;
+static struct completion  lcw_stop_completion;
+static wait_queue_head_t lcw_event_waitq;
+
+/*
+ * Set this and wake lcw_event_waitq to stop the dispatcher.
+ * LCW_FLAG_STOP is a bit number within lcw_flags (used with set_bit() and
+ * test_bit()).
+ */
+enum {
+	LCW_FLAG_STOP = 0
+};
+static unsigned long lcw_flags = 0;
+
+/*
+ * Number of outstanding watchdogs.
+ * When it hits 1, we start the dispatcher.
+ * When it hits 0, we stop the dispatcher.
+ * Changed with lcw_refcount_mutex held.
+ */
+static __u32	 lcw_refcount = 0;
+static DEFINE_MUTEX(lcw_refcount_mutex);
+
+/*
+ * List of timers that have fired that need their callbacks run by the
+ * dispatcher.
+ */
+/* BH lock! (always taken with spin_lock_bh(): the timer callback lcw_cb()
+ * takes it as well) */
+static DEFINE_SPINLOCK(lcw_pending_timers_lock);
+static struct list_head lcw_pending_timers = LIST_HEAD_INIT(lcw_pending_timers);
+
+/* Last time a watchdog expired */
+static cfs_time_t lcw_last_watchdog_time;
+/* stack dumps issued since lcw_last_watchdog_time was last updated; used
+ * by lcw_dump_stack() to throttle the dumps */
+static int lcw_recent_watchdog_count;
+
+/*
+ * Dump the stack of the task that owns lcw, or print a console warning
+ * when the watchdog has no task pointer.  Both the check and the dump run
+ * between rcu_read_lock() and rcu_read_unlock().
+ */
+static void
+lcw_dump(struct lc_watchdog *lcw)
+{
+	ENTRY;
+	rcu_read_lock();
+
+	if (lcw->lcw_task != NULL) {
+		libcfs_debug_dumpstack(lcw->lcw_task);
+	} else {
+		LCONSOLE_WARN("Process " LPPID " was not found in the task "
+			      "list; watchdog callback may be incomplete\n",
+			      (int)lcw->lcw_pid);
+	}
+
+	rcu_read_unlock();
+	EXIT;
+}
+
+/*
+ * Timer callback of a watchdog; data is the struct lc_watchdog pointer
+ * registered with cfs_timer_init().
+ *
+ * Does nothing unless the watchdog is ENABLED.  Otherwise marks it
+ * EXPIRED, queues it on lcw_pending_timers with an extra reference for
+ * the list, and wakes the dispatcher thread.
+ */
+static void lcw_cb(ulong_ptr_t data)
+{
+	struct lc_watchdog *lcw = (struct lc_watchdog *)data;
+	ENTRY;
+
+	if (lcw->lcw_state != LC_WATCHDOG_ENABLED) {
+		EXIT;
+		return;
+	}
+
+	/* NOTE(review): lcw_state is tested and changed before lcw_lock is
+	 * taken - confirm that this is intended */
+	lcw->lcw_state = LC_WATCHDOG_EXPIRED;
+
+	/* lock order: lcw_lock first, then lcw_pending_timers_lock */
+	spin_lock_bh(&lcw->lcw_lock);
+	LASSERT(list_empty(&lcw->lcw_list));
+
+	spin_lock_bh(&lcw_pending_timers_lock);
+	lcw->lcw_refcount++; /* +1 for pending list */
+	list_add(&lcw->lcw_list, &lcw_pending_timers);
+	wake_up(&lcw_event_waitq);
+
+	spin_unlock_bh(&lcw_pending_timers_lock);
+	spin_unlock_bh(&lcw->lcw_lock);
+	EXIT;
+}
+
+/*
+ * Wake-up condition of the dispatcher thread: returns non-zero when the
+ * dispatcher has been asked to stop (LCW_FLAG_STOP) or when at least one
+ * watchdog sits on the pending list, and 0 otherwise.
+ */
+static int is_watchdog_fired(void)
+{
+	int pending;
+
+	/* a stop request wakes the dispatcher whatever the list holds */
+	if (test_bit(LCW_FLAG_STOP, &lcw_flags))
+		return 1;
+
+	spin_lock_bh(&lcw_pending_timers_lock);
+	pending = !list_empty(&lcw_pending_timers);
+	spin_unlock_bh(&lcw_pending_timers_lock);
+
+	return pending;
+}
+
+/*
+ * Report an expired watchdog on the console and, unless throttled, dump
+ * the stack of its owner task with lcw_dump().
+ *
+ * The dump is skipped when the previous window start
+ * (lcw_last_watchdog_time) is less than libcfs_watchdog_ratelimit seconds
+ * old and lcw_recent_watchdog_count exceeds 3.
+ *
+ * NOTE(review): the counter is reset to 0 on the dump that opens a window
+ * and the skip test is "> 3", so more than 3 dumps can be issued per
+ * window although the console message says 3 - confirm which is intended.
+ */
+static void lcw_dump_stack(struct lc_watchdog *lcw)
+{
+	cfs_time_t      current_time;
+	cfs_duration_t  delta_time;
+	struct timeval  timediff;
+
+	/* timediff: how long ago the watchdog was last touched */
+	current_time = cfs_time_current();
+	delta_time = cfs_time_sub(current_time, lcw->lcw_last_touched);
+	cfs_duration_usec(delta_time, &timediff);
+
+	/*
+	 * Check to see if we should throttle the watchdog timer to avoid
+	 * too many dumps going to the console thus triggering an NMI.
+	 */
+	delta_time = cfs_duration_sec(cfs_time_sub(current_time,
+						   lcw_last_watchdog_time));
+
+	if (delta_time < libcfs_watchdog_ratelimit &&
+	    lcw_recent_watchdog_count > 3) {
+		LCONSOLE_WARN("Service thread pid %u was inactive for "
+			      "%lu.%.02lus. Watchdog stack traces are limited "
+			      "to 3 per %d seconds, skipping this one.\n",
+			      (int)lcw->lcw_pid,
+			      timediff.tv_sec,
+			      timediff.tv_usec / 10000,
+			      libcfs_watchdog_ratelimit);
+	} else {
+		if (delta_time < libcfs_watchdog_ratelimit) {
+			lcw_recent_watchdog_count++;
+		} else {
+			/* a new rate-limit window starts now */
+			memcpy(&lcw_last_watchdog_time, &current_time,
+			       sizeof(current_time));
+			lcw_recent_watchdog_count = 0;
+		}
+
+		LCONSOLE_WARN("Service thread pid %u was inactive for "
+			      "%lu.%.02lus. The thread might be hung, or it "
+			      "might only be slow and will resume later. "
+			      "Dumping the stack trace for debugging purposes:"
+			      "\n",
+			      (int)lcw->lcw_pid,
+			      timediff.tv_sec,
+			      timediff.tv_usec / 10000);
+		lcw_dump(lcw);
+	}
+}
+
+/*
+ * Body of the watchdog dispatcher thread.
+ *
+ * Signals lcw_start_completion, then loops: sleep until a watchdog is
+ * pending or a stop is requested; for every pending watchdog report it
+ * (lcw_dump_stack()) and run its callback; finally free the watchdogs
+ * whose last reference was dropped here.  On a stop request the loop
+ * ends and lcw_stop_completion is signalled.
+ *
+ * Within one pass the lc_watchdog_dumplog callback is run at most once;
+ * other callbacks run for every watchdog that is not DISABLED.
+ *
+ * Returns non-zero when the pending list was not empty at shutdown.
+ */
+static int lcw_dispatch_main(void *data)
+{
+	int		 rc = 0;
+	struct lc_watchdog *lcw;
+	LIST_HEAD      (zombies);
+
+	ENTRY;
+
+	complete(&lcw_start_completion);
+
+	while (1) {
+		/* cleared once lc_watchdog_dumplog has run in this pass */
+		int dumplog = 1;
+
+		cfs_wait_event_interruptible(lcw_event_waitq,
+					     is_watchdog_fired(), rc);
+		CDEBUG(D_INFO, "Watchdog got woken up...\n");
+		if (test_bit(LCW_FLAG_STOP, &lcw_flags)) {
+			CDEBUG(D_INFO, "LCW_FLAG_STOP set, shutting down...\n");
+
+			spin_lock_bh(&lcw_pending_timers_lock);
+			rc = !list_empty(&lcw_pending_timers);
+			spin_unlock_bh(&lcw_pending_timers_lock);
+			if (rc) {
+				CERROR("pending timers list was not empty at "
+				       "time of watchdog dispatch shutdown\n");
+			}
+			break;
+		}
+
+		spin_lock_bh(&lcw_pending_timers_lock);
+		while (!list_empty(&lcw_pending_timers)) {
+			int is_dumplog;
+
+			lcw = list_entry(lcw_pending_timers.next,
+					     struct lc_watchdog, lcw_list);
+			/* +1 ref for callback to make sure lwc wouldn't be
+			 * deleted after releasing lcw_pending_timers_lock */
+			lcw->lcw_refcount++;
+			spin_unlock_bh(&lcw_pending_timers_lock);
+
+			/* lock ordering */
+			spin_lock_bh(&lcw->lcw_lock);
+			spin_lock_bh(&lcw_pending_timers_lock);
+
+			if (list_empty(&lcw->lcw_list)) {
+				/* already removed from pending list */
+				lcw->lcw_refcount--; /* -1 ref for callback */
+				if (lcw->lcw_refcount == 0)
+					list_add(&lcw->lcw_list, &zombies);
+				spin_unlock_bh(&lcw->lcw_lock);
+				/* still hold lcw_pending_timers_lock */
+				continue;
+			}
+
+			list_del_init(&lcw->lcw_list);
+			lcw->lcw_refcount--; /* -1 ref for pending list */
+
+			spin_unlock_bh(&lcw_pending_timers_lock);
+			spin_unlock_bh(&lcw->lcw_lock);
+
+			CDEBUG(D_INFO, "found lcw for pid " LPPID "\n",
+			       lcw->lcw_pid);
+			lcw_dump_stack(lcw);
+
+			is_dumplog = lcw->lcw_callback == lc_watchdog_dumplog;
+			if (lcw->lcw_state != LC_WATCHDOG_DISABLED &&
+			    (dumplog || !is_dumplog)) {
+				lcw->lcw_callback(lcw->lcw_pid, lcw->lcw_data);
+				if (dumplog && is_dumplog)
+					dumplog = 0;
+			}
+
+			spin_lock_bh(&lcw_pending_timers_lock);
+			lcw->lcw_refcount--; /* -1 ref for callback */
+			if (lcw->lcw_refcount == 0)
+				list_add(&lcw->lcw_list, &zombies);
+		}
+		spin_unlock_bh(&lcw_pending_timers_lock);
+
+		/*
+		 * Free the watchdogs collected above.  The entry must come
+		 * from the zombies list itself: taking it from
+		 * lcw_pending_timers would free a live watchdog, or treat
+		 * the list head as one when that list is empty.
+		 */
+		while (!list_empty(&zombies)) {
+			lcw = list_entry(zombies.next,
+					 struct lc_watchdog, lcw_list);
+			list_del(&lcw->lcw_list);
+			LIBCFS_FREE(lcw, sizeof(*lcw));
+		}
+	}
+
+	complete(&lcw_stop_completion);
+
+	RETURN(rc);
+}
+
+/*
+ * Start the watchdog dispatch thread ("lc_watchdogd") and block on
+ * lcw_start_completion.  Asserts that lcw_refcount is exactly 1 on entry.
+ * When the thread cannot be created the error is logged and the function
+ * returns without waiting.
+ */
+static void lcw_dispatch_start(void)
+{
+	task_t *dispatcher;
+
+	ENTRY;
+	LASSERT(lcw_refcount == 1);
+
+	/* (re)initialise the synchronisation objects before spawning */
+	init_completion(&lcw_stop_completion);
+	init_completion(&lcw_start_completion);
+	init_waitqueue_head(&lcw_event_waitq);
+
+	CDEBUG(D_INFO, "starting dispatch thread\n");
+	dispatcher = kthread_run(lcw_dispatch_main, NULL, "lc_watchdogd");
+	if (!IS_ERR(dispatcher)) {
+		wait_for_completion(&lcw_start_completion);
+		CDEBUG(D_INFO,
+		       "watchdog dispatcher initialization complete.\n");
+	} else {
+		CERROR("error spawning watchdog dispatch thread: %ld\n",
+			PTR_ERR(dispatcher));
+	}
+
+	EXIT;
+}
+
+/*
+ * Request shutdown of the dispatch thread and block until
+ * lcw_stop_completion is completed.  Asserts that lcw_refcount has
+ * already dropped to 0.
+ */
+static void lcw_dispatch_stop(void)
+{
+	ENTRY;
+	LASSERT(lcw_refcount == 0);
+
+	CDEBUG(D_INFO, "trying to stop watchdog dispatcher.\n");
+
+	/* raise the stop flag before waking sleepers on the event queue */
+	set_bit(LCW_FLAG_STOP, &lcw_flags);
+	wake_up(&lcw_event_waitq);
+
+	wait_for_completion(&lcw_stop_completion);
+
+	CDEBUG(D_INFO, "watchdog dispatcher has shut down.\n");
+
+	EXIT;
+}
+
+/*
+ * Allocate a watchdog for the current task.
+ *
+ * \param timeout   seconds, only used if the watchdog starts out enabled
+ * \param callback  invoked with (pid, data); lc_watchdog_dumplog is used
+ *                  when NULL is passed
+ * \param data      opaque argument stored for the callback
+ *
+ * The new watchdog starts in LC_WATCHDOG_DISABLED state with one
+ * reference held for the owner.  The first watchdog in the system
+ * (lcw_refcount going 0 -> 1) also starts the dispatch thread.
+ *
+ * \retval the new watchdog, or ERR_PTR(-ENOMEM) if allocation failed
+ */
+struct lc_watchdog *lc_watchdog_add(int timeout,
+				    void (*callback)(pid_t, void *),
+				    void *data)
+{
+	struct lc_watchdog *wd = NULL;
+	ENTRY;
+
+	LIBCFS_ALLOC(wd, sizeof(*wd));
+	if (!wd) {
+		CDEBUG(D_INFO, "Could not allocate new lc_watchdog\n");
+		RETURN(ERR_PTR(-ENOMEM));
+	}
+
+	spin_lock_init(&wd->lcw_lock);
+	wd->lcw_refcount = 1; /* refcount for owner */
+	wd->lcw_task     = current;
+	wd->lcw_pid      = current_pid();
+	if (callback != NULL)
+		wd->lcw_callback = callback;
+	else
+		wd->lcw_callback = lc_watchdog_dumplog;
+	wd->lcw_data     = data;
+	wd->lcw_state    = LC_WATCHDOG_DISABLED;
+
+	INIT_LIST_HEAD(&wd->lcw_list);
+	cfs_timer_init(&wd->lcw_timer, lcw_cb, wd);
+
+	/* first user brings up the dispatcher */
+	mutex_lock(&lcw_refcount_mutex);
+	lcw_refcount++;
+	if (lcw_refcount == 1)
+		lcw_dispatch_start();
+	mutex_unlock(&lcw_refcount_mutex);
+
+	/* Keep this working in case we enable them by default */
+	if (wd->lcw_state == LC_WATCHDOG_ENABLED) {
+		wd->lcw_last_touched = cfs_time_current();
+		cfs_timer_arm(&wd->lcw_timer, cfs_time_seconds(timeout) +
+			      cfs_time_current());
+	}
+
+	RETURN(wd);
+}
+EXPORT_SYMBOL(lc_watchdog_add);
+
+/*
+ * Record "now" as the last time \a lcw was touched.
+ *
+ * If the watchdog is in LC_WATCHDOG_EXPIRED state, first print a console
+ * warning with the time elapsed since the previous touch; \a message is
+ * the verb inserted into that warning (callers in this file pass
+ * "resumed", "completed" or "stopped").
+ */
+static void lcw_update_time(struct lc_watchdog *lcw, const char *message)
+{
+	cfs_time_t newtime = cfs_time_current();
+
+	if (lcw->lcw_state == LC_WATCHDOG_EXPIRED) {
+		struct timeval timediff;
+		cfs_time_t delta_time = cfs_time_sub(newtime,
+						     lcw->lcw_last_touched);
+		cfs_duration_usec(delta_time, &timediff);
+
+		/* tv_usec / 10000 gives hundredths of a second */
+		LCONSOLE_WARN("Service thread pid %u %s after %lu.%.02lus. "
+			      "This indicates the system was overloaded (too "
+			      "many service threads, or there were not enough "
+			      "hardware resources).\n",
+			      lcw->lcw_pid,
+			      message,
+			      timediff.tv_sec,
+			      timediff.tv_usec / 10000);
+	}
+	lcw->lcw_last_touched = newtime;
+}
+
+/*
+ * If \a lcw is currently linked on a list through lcw_list, unlink it and
+ * drop the reference that the pending list held.
+ *
+ * Lock order: lcw->lcw_lock is taken first, lcw_pending_timers_lock is
+ * nested inside it.
+ */
+static void lc_watchdog_del_pending(struct lc_watchdog *lcw)
+{
+	spin_lock_bh(&lcw->lcw_lock);
+	if (unlikely(!list_empty(&lcw->lcw_list))) {
+		spin_lock_bh(&lcw_pending_timers_lock);
+		list_del_init(&lcw->lcw_list);
+		lcw->lcw_refcount--; /* -1 ref for pending list */
+		spin_unlock_bh(&lcw_pending_timers_lock);
+	}
+
+	spin_unlock_bh(&lcw->lcw_lock);
+}
+
+/*
+ * (Re)arm \a lcw so that it fires \a timeout seconds from now.
+ *
+ * Any pending expiry is cancelled, the last-touched time is refreshed
+ * (logging a "resumed" warning if the watchdog had expired) and the
+ * state becomes LC_WATCHDOG_ENABLED.
+ */
+void lc_watchdog_touch(struct lc_watchdog *lcw, int timeout)
+{
+	cfs_time_t deadline;
+
+	ENTRY;
+	LASSERT(lcw != NULL);
+
+	lc_watchdog_del_pending(lcw);
+
+	lcw_update_time(lcw, "resumed");
+	lcw->lcw_state = LC_WATCHDOG_ENABLED;
+
+	deadline = cfs_time_current() + cfs_time_seconds(timeout);
+	cfs_timer_arm(&lcw->lcw_timer, deadline);
+
+	EXIT;
+}
+EXPORT_SYMBOL(lc_watchdog_touch);
+
+/*
+ * Put \a lcw into LC_WATCHDOG_DISABLED state.
+ *
+ * The watchdog is removed from the pending list if queued and its
+ * last-touched time is refreshed (logging a "completed" warning if it
+ * had expired).  The timer itself is not disarmed here.
+ */
+void lc_watchdog_disable(struct lc_watchdog *lcw)
+{
+	ENTRY;
+	LASSERT(lcw != NULL);
+
+	lc_watchdog_del_pending(lcw);
+
+	lcw_update_time(lcw, "completed");
+	lcw->lcw_state = LC_WATCHDOG_DISABLED;
+
+	EXIT;
+}
+EXPORT_SYMBOL(lc_watchdog_disable);
+
+/*
+ * Release the owner's reference on \a lcw.
+ *
+ * The timer is disarmed, the watchdog is unlinked from the pending list
+ * if queued (dropping that list's reference), and the owner reference is
+ * dropped.  The structure is freed here only if that was the last
+ * reference; otherwise it is left for whoever still holds one.
+ *
+ * Dropping the last watchdog in the system (lcw_refcount reaching 0)
+ * also stops the dispatch thread.
+ */
+void lc_watchdog_delete(struct lc_watchdog *lcw)
+{
+	int dead;
+
+	ENTRY;
+	LASSERT(lcw != NULL);
+
+	cfs_timer_disarm(&lcw->lcw_timer);
+
+	lcw_update_time(lcw, "stopped");
+
+	/* lock order: per-watchdog lock first, then the pending-list lock */
+	spin_lock_bh(&lcw->lcw_lock);
+	spin_lock_bh(&lcw_pending_timers_lock);
+	if (unlikely(!list_empty(&lcw->lcw_list))) {
+		list_del_init(&lcw->lcw_list);
+		lcw->lcw_refcount--; /* -1 ref for pending list */
+	}
+
+	lcw->lcw_refcount--; /* -1 ref for owner */
+	dead = lcw->lcw_refcount == 0;
+	spin_unlock_bh(&lcw_pending_timers_lock);
+	spin_unlock_bh(&lcw->lcw_lock);
+
+	/* free outside the spinlocks, and only if nobody else holds a ref */
+	if (dead)
+		LIBCFS_FREE(lcw, sizeof(*lcw));
+
+	mutex_lock(&lcw_refcount_mutex);
+	if (--lcw_refcount == 0)
+		lcw_dispatch_stop();
+	mutex_unlock(&lcw_refcount_mutex);
+
+	EXIT;
+}
+EXPORT_SYMBOL(lc_watchdog_delete);
+
+/*
+ * Provided watchdog handlers
+ */
+
+/*
+ * Watchdog callback that hands \a pid, cast to a pointer, to
+ * libcfs_debug_dumplog_internal().  \a data is not used.
+ */
+void lc_watchdog_dumplog(pid_t pid, void *data)
+{
+	libcfs_debug_dumplog_internal((void *)((long_ptr_t)pid));
+}
+EXPORT_SYMBOL(lc_watchdog_dumplog);
+
+#else   /* !defined(WITH_WATCHDOG) */
+
+/*
+ * Stub used when WITH_WATCHDOG is not defined: ignores all arguments and
+ * returns the address of a single function-local static watchdog.
+ */
+struct lc_watchdog *lc_watchdog_add(int timeout,
+				    void (*callback)(pid_t pid, void *),
+				    void *data)
+{
+	static struct lc_watchdog      watchdog;
+	return &watchdog;
+}
+EXPORT_SYMBOL(lc_watchdog_add);
+
+/* Stub used when WITH_WATCHDOG is not defined: does nothing. */
+void lc_watchdog_touch(struct lc_watchdog *lcw, int timeout)
+{
+}
+EXPORT_SYMBOL(lc_watchdog_touch);
+
+/* Stub used when WITH_WATCHDOG is not defined: does nothing. */
+void lc_watchdog_disable(struct lc_watchdog *lcw)
+{
+}
+EXPORT_SYMBOL(lc_watchdog_disable);
+
+/*
+ * Stub used when WITH_WATCHDOG is not defined: does nothing (the stub
+ * lc_watchdog_add() returns a static object, so nothing is freed).
+ */
+void lc_watchdog_delete(struct lc_watchdog *lcw)
+{
+}
+EXPORT_SYMBOL(lc_watchdog_delete);
+
+#endif

diff --git a/drivers/staging/lustre/lustre/libcfs/workitem.c b/drivers/staging/lustre/lustre/libcfs/workitem.c
new file mode 100644
index 0000000..b533666
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/workitem.c

@@ -0,0 +1,475 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/libcfs/workitem.c
+ *
+ * Author: Isaac Huang <isaac@clusterfs.com>
+ *	 Liang Zhen  <zhen.liang@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/libcfs/libcfs.h>
+
+#define CFS_WS_NAME_LEN	 16
+
+/**
+ * One workitem scheduler: a run queue, the threads that drain it and the
+ * bookkeeping needed to start and stop those threads.
+ */
+typedef struct cfs_wi_sched {
+	struct list_head		ws_list;	/* chain on global list */
+	/** serialised workitems */
+	spinlock_t		ws_lock;
+	/** where schedulers sleep */
+	wait_queue_head_t		ws_waitq;
+	/** concurrent workitems */
+	struct list_head		ws_runq;
+	/** rescheduled running-workitems, a workitem can be rescheduled
+	 * while running in wi_action(), but we don't want to execute it
+	 * again unless it returns from wi_action(), so we put it on
+	 * ws_rerunq while rescheduling, and move it to runq after it
+	 * returns from wi_action() */
+	struct list_head		ws_rerunq;
+	/** CPT-table for this scheduler */
+	struct cfs_cpt_table	*ws_cptab;
+	/** CPT id for affinity */
+	int			ws_cpt;
+	/** number of scheduled workitems */
+	int			ws_nscheduled;
+	/** started scheduler thread, protected by cfs_wi_data::wi_glock */
+	unsigned int		ws_nthreads:30;
+	/** shutting down, protected by cfs_wi_data::wi_glock */
+	unsigned int		ws_stopping:1;
+	/** serialize starting thread, protected by cfs_wi_data::wi_glock */
+	unsigned int		ws_starting:1;
+	/** scheduler name */
+	char			ws_name[CFS_WS_NAME_LEN];
+} cfs_wi_sched_t;
+
+/**
+ * Module-wide workitem state; the single instance is cfs_wi_data.
+ */
+struct cfs_workitem_data {
+	/** serialize */
+	spinlock_t		wi_glock;
+	/** list of all schedulers */
+	struct list_head		wi_scheds;
+	/** WI module is initialized */
+	int			wi_init;
+	/** shutting down the whole WI module */
+	int			wi_stopping;
+} cfs_wi_data;
+
+/** Take \a sched's ws_lock (plain spin_lock, no irq/bh disabling). */
+static inline void
+cfs_wi_sched_lock(cfs_wi_sched_t *sched)
+{
+	spin_lock(&sched->ws_lock);
+}
+
+/** Release \a sched's ws_lock taken by cfs_wi_sched_lock(). */
+static inline void
+cfs_wi_sched_unlock(cfs_wi_sched_t *sched)
+{
+	spin_unlock(&sched->ws_lock);
+}
+
+/**
+ * Tell a scheduler thread whether it may go to sleep.
+ *
+ * \retval 0 if \a sched is stopping or its run queue is not empty
+ * \retval 1 otherwise
+ *
+ * Both conditions are sampled under ws_lock.
+ */
+static inline int
+cfs_wi_sched_cansleep(cfs_wi_sched_t *sched)
+{
+	int	idle;
+
+	cfs_wi_sched_lock(sched);
+	idle = !sched->ws_stopping && list_empty(&sched->ws_runq);
+	cfs_wi_sched_unlock(sched);
+
+	return idle;
+}
+
+
+/* XXX:
+ * 0. it only works when called from wi->wi_action.
+ * 1. when it returns no one shall try to schedule the workitem.
+ *
+ * Retire the running workitem \a wi: any pending (re)schedule request is
+ * cancelled and wi_scheduled is then left set so that later
+ * cfs_wi_schedule() calls on it do not queue it again.
+ */
+void
+cfs_wi_exit(struct cfs_wi_sched *sched, cfs_workitem_t *wi)
+{
+	LASSERT(!in_interrupt()); /* because we use plain spinlock */
+	LASSERT(!sched->ws_stopping);
+
+	cfs_wi_sched_lock(sched);
+
+	LASSERT(wi->wi_running);
+	if (wi->wi_scheduled) { /* cancel pending schedules */
+		LASSERT(!list_empty(&wi->wi_list));
+		list_del_init(&wi->wi_list);
+
+		LASSERT(sched->ws_nscheduled > 0);
+		sched->ws_nscheduled--;
+	}
+
+	LASSERT(list_empty(&wi->wi_list));
+
+	wi->wi_scheduled = 1; /* LBUG future schedule attempts */
+	cfs_wi_sched_unlock(sched);
+
+	return;
+}
+EXPORT_SYMBOL(cfs_wi_exit);
+
+/**
+ * cancel schedule request of workitem \a wi
+ *
+ * \retval 0 \a wi is currently running its action
+ * \retval 1 \a wi is not running
+ *
+ * In both cases any queued schedule request has been removed on return.
+ */
+int
+cfs_wi_deschedule(struct cfs_wi_sched *sched, cfs_workitem_t *wi)
+{
+	int	rc;
+
+	LASSERT(!in_interrupt()); /* because we use plain spinlock */
+	LASSERT(!sched->ws_stopping);
+
+	/*
+	 * return 0 if it's running already, otherwise return 1, which
+	 * means the workitem will not be scheduled and will not have
+	 * any race with wi_action.
+	 */
+	cfs_wi_sched_lock(sched);
+
+	rc = !(wi->wi_running);
+
+	if (wi->wi_scheduled) { /* cancel pending schedules */
+		LASSERT(!list_empty(&wi->wi_list));
+		list_del_init(&wi->wi_list);
+
+		LASSERT(sched->ws_nscheduled > 0);
+		sched->ws_nscheduled--;
+
+		wi->wi_scheduled = 0;
+	}
+
+	LASSERT (list_empty(&wi->wi_list));
+
+	cfs_wi_sched_unlock(sched);
+	return rc;
+}
+EXPORT_SYMBOL(cfs_wi_deschedule);
+
+/*
+ * Workitem scheduled with (serial == 1) is strictly serialised not only with
+ * itself, but also with others scheduled this way.
+ *
+ * Now there's only one static serialised queue, but in the future more might
+ * be added, and even dynamic creation of serialised queues might be supported.
+ *
+ * NOTE(review): this function takes no "serial" argument; the paragraph
+ * above looks like it describes an earlier interface - confirm.
+ *
+ * Queue \a wi on \a sched unless it is already scheduled.  An idle
+ * workitem goes to the tail of ws_runq and a sleeper is woken; a workitem
+ * that is currently running is parked on ws_rerunq instead.
+ */
+void
+cfs_wi_schedule(struct cfs_wi_sched *sched, cfs_workitem_t *wi)
+{
+	LASSERT(!in_interrupt()); /* because we use plain spinlock */
+	LASSERT(!sched->ws_stopping);
+
+	cfs_wi_sched_lock(sched);
+
+	if (!wi->wi_scheduled) {
+		LASSERT (list_empty(&wi->wi_list));
+
+		wi->wi_scheduled = 1;
+		sched->ws_nscheduled++;
+		if (!wi->wi_running) {
+			list_add_tail(&wi->wi_list, &sched->ws_runq);
+			wake_up(&sched->ws_waitq);
+		} else {
+			list_add(&wi->wi_list, &sched->ws_rerunq);
+		}
+	}
+
+	LASSERT (!list_empty(&wi->wi_list));
+	cfs_wi_sched_unlock(sched);
+	return;
+}
+EXPORT_SYMBOL(cfs_wi_schedule);
+
+
+/**
+ * Thread body for one scheduler thread; \a arg is the cfs_wi_sched it
+ * serves.
+ *
+ * The thread registers itself (ws_starting -> ws_nthreads), then loops
+ * until ws_stopping is set: it runs up to CFS_WI_RESCHED workitems from
+ * ws_runq per pass, calling each wi_action with ws_lock dropped, and
+ * sleeps on ws_waitq when the run queue is empty.  Always returns 0.
+ */
+static int
+cfs_wi_scheduler (void *arg)
+{
+	struct cfs_wi_sched	*sched = (cfs_wi_sched_t *)arg;
+
+	cfs_block_allsigs();
+
+	/* CPT affinity scheduler? */
+	if (sched->ws_cptab != NULL)
+		cfs_cpt_bind(sched->ws_cptab, sched->ws_cpt);
+
+	spin_lock(&cfs_wi_data.wi_glock);
+
+	/* hand-shake with cfs_wi_sched_create(), which set ws_starting */
+	LASSERT(sched->ws_starting == 1);
+	sched->ws_starting--;
+	sched->ws_nthreads++;
+
+	spin_unlock(&cfs_wi_data.wi_glock);
+
+	cfs_wi_sched_lock(sched);
+
+	while (!sched->ws_stopping) {
+		int	     nloops = 0;
+		int	     rc;
+		cfs_workitem_t *wi;
+
+		while (!list_empty(&sched->ws_runq) &&
+		       nloops < CFS_WI_RESCHED) {
+			wi = list_entry(sched->ws_runq.next,
+					    cfs_workitem_t, wi_list);
+			LASSERT(wi->wi_scheduled && !wi->wi_running);
+
+			list_del_init(&wi->wi_list);
+
+			LASSERT(sched->ws_nscheduled > 0);
+			sched->ws_nscheduled--;
+
+			wi->wi_running   = 1;
+			wi->wi_scheduled = 0;
+
+
+			/* the action runs without ws_lock held */
+			cfs_wi_sched_unlock(sched);
+			nloops++;
+
+			rc = (*wi->wi_action) (wi);
+
+			cfs_wi_sched_lock(sched);
+			if (rc != 0) /* WI should be dead, even be freed! */
+				continue;
+
+			wi->wi_running = 0;
+			if (list_empty(&wi->wi_list))
+				continue;
+
+			LASSERT(wi->wi_scheduled);
+			/* wi is rescheduled, should be on rerunq now, we
+			 * move it to runq so it can run action now */
+			list_move_tail(&wi->wi_list, &sched->ws_runq);
+		}
+
+		if (!list_empty(&sched->ws_runq)) {
+			cfs_wi_sched_unlock(sched);
+			/* don't sleep because some workitems still
+			 * expect me to come back soon */
+			cond_resched();
+			cfs_wi_sched_lock(sched);
+			continue;
+		}
+
+		/* nothing runnable: sleep until woken with work or stop */
+		cfs_wi_sched_unlock(sched);
+		cfs_wait_event_interruptible_exclusive(sched->ws_waitq,
+				!cfs_wi_sched_cansleep(sched), rc);
+		cfs_wi_sched_lock(sched);
+	}
+
+	cfs_wi_sched_unlock(sched);
+
+	/* deregister so that destroy/shutdown can stop waiting for us */
+	spin_lock(&cfs_wi_data.wi_glock);
+	sched->ws_nthreads--;
+	spin_unlock(&cfs_wi_data.wi_glock);
+
+	return 0;
+}
+
+
+/**
+ * Stop all threads of \a sched, unlink it from the global list and free
+ * it.
+ *
+ * Returns immediately, without freeing, if \a sched is already marked as
+ * stopping.  Otherwise polls (20 times per second) until ws_nthreads
+ * drops to 0.  Asserts that no workitem is still scheduled at the end.
+ */
+void
+cfs_wi_sched_destroy(struct cfs_wi_sched *sched)
+{
+	int	i;
+
+	LASSERT(cfs_wi_data.wi_init);
+	LASSERT(!cfs_wi_data.wi_stopping);
+
+	spin_lock(&cfs_wi_data.wi_glock);
+	if (sched->ws_stopping) {
+		CDEBUG(D_INFO, "%s is in progress of stopping\n",
+		       sched->ws_name);
+		spin_unlock(&cfs_wi_data.wi_glock);
+		return;
+	}
+
+	LASSERT(!list_empty(&sched->ws_list));
+	sched->ws_stopping = 1;
+
+	spin_unlock(&cfs_wi_data.wi_glock);
+
+	/* i only drives the IS_PO2() choice of debug level below */
+	i = 2;
+	wake_up_all(&sched->ws_waitq);
+
+	spin_lock(&cfs_wi_data.wi_glock);
+	while (sched->ws_nthreads > 0) {
+		CDEBUG(IS_PO2(++i) ? D_WARNING : D_NET,
+		       "waiting for %d threads of WI sched[%s] to terminate\n",
+		       sched->ws_nthreads, sched->ws_name);
+
+		spin_unlock(&cfs_wi_data.wi_glock);
+		cfs_pause(cfs_time_seconds(1) / 20);
+		spin_lock(&cfs_wi_data.wi_glock);
+	}
+
+	list_del(&sched->ws_list);
+
+	spin_unlock(&cfs_wi_data.wi_glock);
+	LASSERT(sched->ws_nscheduled == 0);
+
+	LIBCFS_FREE(sched, sizeof(*sched));
+}
+EXPORT_SYMBOL(cfs_wi_sched_destroy);
+
+/**
+ * Create a workitem scheduler and start its threads.
+ *
+ * \param name      scheduler name; truncated to CFS_WS_NAME_LEN - 1 chars
+ * \param cptab     CPT table for affinity, or NULL for none
+ * \param cpt       CPT id within \a cptab, or CFS_CPT_ANY
+ * \param nthrs     number of scheduler threads to start
+ * \param sched_pp  receives the new scheduler on success
+ *
+ * Threads are started one at a time: ws_starting is raised before each
+ * kthread_run() and cleared by the new thread itself.
+ *
+ * \retval 0        success, *sched_pp is valid
+ * \retval -ENOMEM  scheduler could not be allocated
+ * \retval <0       error from kthread_run(); threads already started
+ *                  are stopped and the scheduler is freed
+ */
+int
+cfs_wi_sched_create(char *name, struct cfs_cpt_table *cptab,
+		    int cpt, int nthrs, struct cfs_wi_sched **sched_pp)
+{
+	struct cfs_wi_sched	*sched;
+	int			rc;
+
+	LASSERT(cfs_wi_data.wi_init);
+	LASSERT(!cfs_wi_data.wi_stopping);
+	LASSERT(cptab == NULL || cpt == CFS_CPT_ANY ||
+		(cpt >= 0 && cpt < cfs_cpt_number(cptab)));
+
+	LIBCFS_ALLOC(sched, sizeof(*sched));
+	if (sched == NULL)
+		return -ENOMEM;
+
+	/* strncpy() leaves the buffer unterminated when \a name is too
+	 * long, so terminate explicitly */
+	strncpy(sched->ws_name, name, CFS_WS_NAME_LEN);
+	sched->ws_name[CFS_WS_NAME_LEN - 1] = '\0';
+	sched->ws_cptab = cptab;
+	sched->ws_cpt = cpt;
+
+	spin_lock_init(&sched->ws_lock);
+	init_waitqueue_head(&sched->ws_waitq);
+	INIT_LIST_HEAD(&sched->ws_runq);
+	INIT_LIST_HEAD(&sched->ws_rerunq);
+	INIT_LIST_HEAD(&sched->ws_list);
+
+	rc = 0;
+	while (nthrs > 0)  {
+		char	thr_name[16];
+		task_t	*task;
+		spin_lock(&cfs_wi_data.wi_glock);
+		/* wait for the previously spawned thread to register */
+		while (sched->ws_starting > 0) {
+			spin_unlock(&cfs_wi_data.wi_glock);
+			schedule();
+			spin_lock(&cfs_wi_data.wi_glock);
+		}
+
+		sched->ws_starting++;
+		spin_unlock(&cfs_wi_data.wi_glock);
+
+		if (sched->ws_cptab != NULL && sched->ws_cpt >= 0) {
+			snprintf(thr_name, sizeof(thr_name), "%s_%02d_%02d",
+				 sched->ws_name, sched->ws_cpt,
+				 sched->ws_nthreads);
+		} else {
+			snprintf(thr_name, sizeof(thr_name), "%s_%02d",
+				 sched->ws_name, sched->ws_nthreads);
+		}
+
+		/* kthread_run() takes a format string: never pass a
+		 * caller-derived name in that position */
+		task = kthread_run(cfs_wi_scheduler, sched, "%s", thr_name);
+		if (!IS_ERR(task)) {
+			nthrs--;
+			continue;
+		}
+		rc = PTR_ERR(task);
+
+		CERROR("Failed to create thread for WI scheduler %s: %d\n",
+		       thr_name, rc);
+
+		spin_lock(&cfs_wi_data.wi_glock);
+
+		/* make up for cfs_wi_sched_destroy */
+		list_add(&sched->ws_list, &cfs_wi_data.wi_scheds);
+		sched->ws_starting--;
+
+		spin_unlock(&cfs_wi_data.wi_glock);
+
+		cfs_wi_sched_destroy(sched);
+		return rc;
+	}
+	spin_lock(&cfs_wi_data.wi_glock);
+	list_add(&sched->ws_list, &cfs_wi_data.wi_scheds);
+	spin_unlock(&cfs_wi_data.wi_glock);
+
+	*sched_pp = sched;
+	return 0;
+}
+EXPORT_SYMBOL(cfs_wi_sched_create);
+
+/**
+ * Initialise the global workitem state: zero cfs_wi_data, set up its
+ * lock and scheduler list and mark the module initialised.
+ *
+ * \retval 0 always
+ */
+int
+cfs_wi_startup(void)
+{
+	memset(&cfs_wi_data, 0, sizeof(cfs_wi_data));
+
+	spin_lock_init(&cfs_wi_data.wi_glock);
+	INIT_LIST_HEAD(&cfs_wi_data.wi_scheds);
+	cfs_wi_data.wi_init = 1;
+
+	return 0;
+}
+
+/**
+ * Tear down the workitem module in three passes over the global list:
+ * mark every scheduler as stopping and wake its threads, wait for each
+ * scheduler's thread count to reach 0, then unlink and free every
+ * scheduler.  Finally clears wi_stopping and wi_init.
+ */
+void
+cfs_wi_shutdown (void)
+{
+	struct cfs_wi_sched	*sched;
+
+	spin_lock(&cfs_wi_data.wi_glock);
+	cfs_wi_data.wi_stopping = 1;
+	spin_unlock(&cfs_wi_data.wi_glock);
+
+	/* nobody should contend on this list */
+	list_for_each_entry(sched, &cfs_wi_data.wi_scheds, ws_list) {
+		sched->ws_stopping = 1;
+		wake_up_all(&sched->ws_waitq);
+	}
+
+	list_for_each_entry(sched, &cfs_wi_data.wi_scheds, ws_list) {
+		spin_lock(&cfs_wi_data.wi_glock);
+
+		/* poll 20 times per second with the lock dropped */
+		while (sched->ws_nthreads != 0) {
+			spin_unlock(&cfs_wi_data.wi_glock);
+			cfs_pause(cfs_time_seconds(1) / 20);
+			spin_lock(&cfs_wi_data.wi_glock);
+		}
+		spin_unlock(&cfs_wi_data.wi_glock);
+	}
+	while (!list_empty(&cfs_wi_data.wi_scheds)) {
+		sched = list_entry(cfs_wi_data.wi_scheds.next,
+				       struct cfs_wi_sched, ws_list);
+		list_del(&sched->ws_list);
+		LIBCFS_FREE(sched, sizeof(*sched));
+	}
+
+	cfs_wi_data.wi_stopping = 0;
+	cfs_wi_data.wi_init = 0;
+}

diff --git a/drivers/staging/lustre/lustre/llite/Makefile b/drivers/staging/lustre/lustre/llite/Makefile
new file mode 100644
index 0000000..dff0c04
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/Makefile

@@ -0,0 +1,13 @@
+# Two objects are built when CONFIG_LUSTRE_FS is set: lustre.o and
+# llite_lloop.o.
+obj-$(CONFIG_LUSTRE_FS) += lustre.o
+obj-$(CONFIG_LUSTRE_FS) += llite_lloop.o
+# lustre.o is linked from the llite sources below plus three objects
+# taken from the sibling ../lclient directory.
+lustre-y := dcache.o dir.o file.o llite_close.o llite_lib.o llite_nfs.o \
+	    rw.o lproc_llite.o namei.o symlink.o llite_mmap.o \
+	    xattr.o remote_perm.o llite_rmtacl.o llite_capa.o \
+	    rw26.o super25.o statahead.o \
+	    ../lclient/glimpse.o ../lclient/lcommon_cl.o ../lclient/lcommon_misc.o \
+	    vvp_dev.o vvp_page.o vvp_lock.o vvp_io.o vvp_object.o
+
+# llite_lloop.o consists of lloop.o only.
+llite_lloop-y := lloop.o
+
+
+# Add the parent directory's include/ to the header search path.
+ccflags-y := -I$(src)/../include

diff --git a/drivers/staging/lustre/lustre/llite/dcache.c b/drivers/staging/lustre/lustre/llite/dcache.c
new file mode 100644
index 0000000..7d6abff
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/dcache.c

@@ -0,0 +1,675 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/quotaops.h>
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <obd_support.h>
+#include <lustre_lite.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_dlm.h>
+
+#include "llite_internal.h"
+
+/* RCU callback: free the ll_dentry_data that embeds \a head as
+ * lld_rcu_head.  Queued by ll_release() via call_rcu(). */
+static void free_dentry_data(struct rcu_head *head)
+{
+	struct ll_dentry_data *lld;
+
+	lld = container_of(head, struct ll_dentry_data, lld_rcu_head);
+	OBD_FREE_PTR(lld);
+}
+
+/* should NOT be called with the dcache lock, see fs/dcache.c
+ *
+ * Detach and dispose of the Lustre private data of dentry \a de: a saved
+ * intent, if any, is released and freed at once; the ll_dentry_data
+ * itself is freed through call_rcu().  Does nothing when the dentry has
+ * no private data. */
+static void ll_release(struct dentry *de)
+{
+	struct ll_dentry_data *lld;
+	ENTRY;
+	LASSERT(de != NULL);
+	lld = ll_d2d(de);
+	if (lld == NULL) /* NFS copies the de->d_op methods (bug 4655) */
+		RETURN_EXIT;
+
+	if (lld->lld_it) {
+		ll_intent_release(lld->lld_it);
+		OBD_FREE(lld->lld_it, sizeof(*lld->lld_it));
+	}
+	LASSERT(lld->lld_cwd_count == 0);
+	LASSERT(lld->lld_mnt_count == 0);
+	/* clear the pointer before the deferred free is queued */
+	de->d_fsdata = NULL;
+	call_rcu(&lld->lld_rcu_head, free_dentry_data);
+
+	EXIT;
+}
+
+/* Compare if two dentries are the same.  Don't match if the existing dentry
+ * is marked invalid.  Returns 1 if different, 0 if the same.
+ *
+ * This avoids a race where ll_lookup_it() instantiates a dentry, but we get
+ * an AST before calling d_revalidate_it().  The dentry still exists (marked
+ * INVALID) so d_lookup() matches it, but we have no lock on it (so
+ * lock_match() fails) and we spin around real_lookup().
+ *
+ * The name held in (\a len, \a str) is compared byte-for-byte with
+ * \a name; \a parent, \a pinode and \a inode are not examined. */
+int ll_dcompare(const struct dentry *parent, const struct inode *pinode,
+		const struct dentry *dentry, const struct inode *inode,
+		unsigned int len, const char *str, const struct qstr *name)
+{
+	ENTRY;
+
+	if (len != name->len)
+		RETURN(1);
+
+	if (memcmp(str, name->name, len))
+		RETURN(1);
+
+	CDEBUG(D_DENTRY, "found name %.*s(%p) flags %#x refc %d\n",
+	       name->len, name->name, dentry, dentry->d_flags,
+	       d_refcount(dentry));
+
+	/* mountpoint is always valid */
+	if (d_mountpoint((struct dentry *)dentry))
+		RETURN(0);
+
+	/* names match, but an invalidated dentry must not be reused */
+	if (d_lustre_invalid(dentry))
+		RETURN(1);
+
+	RETURN(0);
+}
+
+/* Lock iterator callback: keep iterating past locks that have both
+ * LDLM_FL_CANCELING and LDLM_FL_DISCARD_DATA set, stop at the first lock
+ * that does not.  \a data is unused. */
+static inline int return_if_equal(struct ldlm_lock *lock, void *data)
+{
+	if ((lock->l_flags &
+	     (LDLM_FL_CANCELING | LDLM_FL_DISCARD_DATA)) !=
+	    (LDLM_FL_CANCELING | LDLM_FL_DISCARD_DATA))
+		return LDLM_ITER_STOP;
+
+	return LDLM_ITER_CONTINUE;
+}
+
+/* Look for any ldlm lock on \a inode, first through the metadata export
+ * (md_find_cbdata), then, if the inode has striping information, through
+ * the data export (obd_find_cbdata).
+ *
+ * return 0    nothing found
+ *	1    found one
+ *      < 0    error */
+static int find_cbdata(struct inode *inode)
+{
+	struct ll_sb_info *sbi;
+	struct lov_stripe_md *lsm;
+	int rc = 0;
+	ENTRY;
+
+	/* assert before the inode is handed to ll_i2sbi(), not after */
+	LASSERT(inode);
+	sbi = ll_i2sbi(inode);
+
+	rc = md_find_cbdata(sbi->ll_md_exp, ll_inode2fid(inode),
+			    return_if_equal, NULL);
+	if (rc != 0)
+		RETURN(rc);
+
+	/* no striping information: nothing to ask the data export */
+	lsm = ccc_inode_lsm_get(inode);
+	if (lsm == NULL)
+		RETURN(rc);
+
+	rc = obd_find_cbdata(sbi->ll_dt_exp, lsm, return_if_equal, NULL);
+	ccc_inode_lsm_put(inode, lsm);
+
+	RETURN(rc);
+}
+
+/**
+ * Called when last reference to a dentry is dropped and dcache wants to know
+ * whether or not it should cache it:
+ * - return 1 to delete the dentry immediately
+ * - return 0 to cache the dentry
+ * Should NOT be called with the dcache lock, see fs/dcache.c
+ *
+ * The decision is simply whether the dentry is marked Lustre-invalid.
+ *
+ * NOTE(review): the comment inside the body says this runs inside
+ * dcache_lock, which contradicts the line above - confirm which holds.
+ */
+static int ll_ddelete(const struct dentry *de)
+{
+	ENTRY;
+	LASSERT(de);
+
+	CDEBUG(D_DENTRY, "%s dentry %.*s (%p, parent %p, inode %p) %s%s\n",
+	       d_lustre_invalid((struct dentry *)de) ? "deleting" : "keeping",
+	       de->d_name.len, de->d_name.name, de, de->d_parent, de->d_inode,
+	       d_unhashed((struct dentry *)de) ? "" : "hashed,",
+	       list_empty(&de->d_subdirs) ? "" : "subdirs");
+
+	/* kernel >= 2.6.38 last refcount is decreased after this function. */
+	LASSERT(d_refcount(de) == 1);
+
+	/* Disable this piece of code temporarily because this is called
+	 * inside dcache_lock so it's not appropriate to do lots of work
+	 * here. ATTENTION: Before this piece of code enabling, LU-2487 must be
+	 * resolved. */
+#if 0
+	/* if not ldlm lock for this inode, set i_nlink to 0 so that
+	 * this inode can be recycled later b=20433 */
+	if (de->d_inode && !find_cbdata(de->d_inode))
+		clear_nlink(de->d_inode);
+#endif
+
+	if (d_lustre_invalid((struct dentry *)de))
+		RETURN(1);
+	RETURN(0);
+}
+
+/* Make sure dentry \a de has an ll_dentry_data attached in d_fsdata.
+ *
+ * The structure is allocated outside d_lock and installed under it; if
+ * another thread installed one in the meantime the new allocation is
+ * freed again.
+ *
+ * return 0        d_fsdata is set (by us or by someone else)
+ *        -ENOMEM  allocation failed */
+static int ll_set_dd(struct dentry *de)
+{
+	ENTRY;
+	LASSERT(de != NULL);
+
+	CDEBUG(D_DENTRY, "ldd on dentry %.*s (%p) parent %p inode %p refc %d\n",
+		de->d_name.len, de->d_name.name, de, de->d_parent, de->d_inode,
+		d_refcount(de));
+
+	if (de->d_fsdata == NULL) {
+		struct ll_dentry_data *lld;
+
+		OBD_ALLOC_PTR(lld);
+		if (likely(lld != NULL)) {
+			spin_lock(&de->d_lock);
+			/* re-check under the lock: we may have lost a race */
+			if (likely(de->d_fsdata == NULL))
+				de->d_fsdata = lld;
+			else
+				OBD_FREE_PTR(lld);
+			spin_unlock(&de->d_lock);
+		} else {
+			RETURN(-ENOMEM);
+		}
+	}
+
+	RETURN(0);
+}
+
+/* Prepare the Lustre private data of dentry \a de.
+ *
+ * \a block    non-zero: allocate the private data if it is missing
+ * \a init_sa  non-zero: reset lld_sa_generation to 0 when private data
+ *             exists
+ *
+ * Returns 0, or the error from ll_set_dd() if allocation was attempted
+ * and failed.  Asserts that the dentry already uses ll_d_ops. */
+int ll_dops_init(struct dentry *de, int block, int init_sa)
+{
+	struct ll_dentry_data *lld = ll_d2d(de);
+	int rc = 0;
+
+	if (lld == NULL && block != 0) {
+		rc = ll_set_dd(de);
+		if (rc)
+			return rc;
+
+		/* re-read: ll_set_dd() has just populated d_fsdata */
+		lld = ll_d2d(de);
+	}
+
+	if (lld != NULL && init_sa != 0)
+		lld->lld_sa_generation = 0;
+
+	/* kernel >= 2.6.38 d_op is set in d_alloc() */
+	LASSERT(de->d_op == &ll_d_ops);
+	return rc;
+}
+
+/* Drop the lock reference(s) recorded in intent \a it.
+ *
+ * Nothing happens unless both it_op and it_lock_mode are non-zero.  The
+ * main lock is decref'ed and its mode cleared; a remote lock, if one is
+ * recorded, is handled the same way.  Clearing the modes makes repeated
+ * calls harmless. */
+void ll_intent_drop_lock(struct lookup_intent *it)
+{
+	if (it->it_op && it->d.lustre.it_lock_mode) {
+		struct lustre_handle handle;
+
+		handle.cookie = it->d.lustre.it_lock_handle;
+
+		CDEBUG(D_DLMTRACE, "releasing lock with cookie "LPX64
+		       " from it %p\n", handle.cookie, it);
+		ldlm_lock_decref(&handle, it->d.lustre.it_lock_mode);
+
+		/* bug 494: intent_release may be called multiple times, from
+		 * this thread and we don't want to double-decref this lock */
+		it->d.lustre.it_lock_mode = 0;
+		if (it->d.lustre.it_remote_lock_mode != 0) {
+			handle.cookie = it->d.lustre.it_remote_lock_handle;
+
+			CDEBUG(D_DLMTRACE, "releasing remote lock with cookie"
+			       LPX64" from it %p\n", handle.cookie, it);
+			ldlm_lock_decref(&handle,
+					 it->d.lustre.it_remote_lock_mode);
+			it->d.lustre.it_remote_lock_mode = 0;
+		}
+	}
+}
+
+/* Fully release intent \a it: drop its locks, then call
+ * ptlrpc_req_finished() on it_data once for each of the three
+ * dispositions (OPEN_REF, CREATE_REF, COMPLETE) that is set, and finally
+ * clear the disposition bits and the request pointer. */
+void ll_intent_release(struct lookup_intent *it)
+{
+	ENTRY;
+
+	CDEBUG(D_INFO, "intent %p released\n", it);
+	ll_intent_drop_lock(it);
+	/* We are still holding extra reference on a request, need to free it */
+	if (it_disposition(it, DISP_ENQ_OPEN_REF))
+		 ptlrpc_req_finished(it->d.lustre.it_data); /* ll_file_open */
+	if (it_disposition(it, DISP_ENQ_CREATE_REF)) /* create rec */
+		ptlrpc_req_finished(it->d.lustre.it_data);
+	if (it_disposition(it, DISP_ENQ_COMPLETE)) /* saved req from revalidate
+						    * to lookup */
+		ptlrpc_req_finished(it->d.lustre.it_data);
+
+	it->d.lustre.it_disposition = 0;
+	it->d.lustre.it_data = NULL;
+	EXIT;
+}
+
+/* Mark every dentry on \a inode's alias list (i_dentry) as
+ * Lustre-invalid.  The walk is done between ll_lock_dcache() and
+ * ll_unlock_dcache().  A dentry named "/" is unexpected here and is
+ * reported with a dentry dump and a stack dump, but still invalidated. */
+void ll_invalidate_aliases(struct inode *inode)
+{
+	struct dentry *dentry;
+	struct ll_d_hlist_node *p;
+	ENTRY;
+
+	LASSERT(inode != NULL);
+
+	CDEBUG(D_INODE, "marking dentries for ino %lu/%u(%p) invalid\n",
+	       inode->i_ino, inode->i_generation, inode);
+
+	ll_lock_dcache(inode);
+	ll_d_hlist_for_each_entry(dentry, p, &inode->i_dentry, d_alias) {
+		CDEBUG(D_DENTRY, "dentry in drop %.*s (%p) parent %p "
+		       "inode %p flags %d\n", dentry->d_name.len,
+		       dentry->d_name.name, dentry, dentry->d_parent,
+		       dentry->d_inode, dentry->d_flags);
+
+		if (dentry->d_name.len == 1 && dentry->d_name.name[0] == '/') {
+			CERROR("called on root (?) dentry=%p, inode=%p "
+			       "ino=%lu\n", dentry, inode, inode->i_ino);
+			lustre_dump_dentry(dentry, 1);
+			libcfs_debug_dumpstack(NULL);
+		}
+
+		d_lustre_invalidate(dentry, 0);
+	}
+	ll_unlock_dcache(inode);
+
+	EXIT;
+}
+
+/* Finish a revalidation using the reply carried by \a request.
+ *
+ * return 0        no request was supplied, or the inode was prepared
+ *        -ENOENT  the intent carries DISP_LOOKUP_NEG
+ *        other    whatever ll_prep_inode() returned */
+int ll_revalidate_it_finish(struct ptlrpc_request *request,
+			    struct lookup_intent *it,
+			    struct dentry *de)
+{
+	int result = 0;
+	ENTRY;
+
+	if (request != NULL) {
+		if (it_disposition(it, DISP_LOOKUP_NEG))
+			result = -ENOENT;
+		else
+			result = ll_prep_inode(&de->d_inode, request,
+					       NULL, it);
+	}
+
+	RETURN(result);
+}
+
+/* Post-lookup lock handling for intent \a it on \a dentry.
+ *
+ * If the intent holds a lock and the dentry has an inode, the inode is
+ * recorded as the lock's data via ll_set_lock_data().  For IT_LOOKUP and
+ * IT_GETATTR the lock reference is then dropped right away; the intent
+ * itself is not released. */
+void ll_lookup_finish_locks(struct lookup_intent *it, struct dentry *dentry)
+{
+	LASSERT(it != NULL);
+	LASSERT(dentry != NULL);
+
+	if (it->d.lustre.it_lock_mode && dentry->d_inode != NULL) {
+		struct inode *inode = dentry->d_inode;
+		struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
+
+		CDEBUG(D_DLMTRACE, "setting l_data to inode %p (%lu/%u)\n",
+		       inode, inode->i_ino, inode->i_generation);
+		ll_set_lock_data(sbi->ll_md_exp, inode, it, NULL);
+	}
+
+	/* drop lookup or getattr locks immediately */
+	if (it->it_op == IT_LOOKUP || it->it_op == IT_GETATTR) {
+		/* on 2.6 there are situation when several lookups and
+		 * revalidations may be requested during single operation.
+		 * therefore, we don't release intent here -bzzz */
+		ll_intent_drop_lock(it);
+	}
+}
+
+/* Substitute the default intent \a deft into \a *itp when the caller
+ * supplied no intent at all or an IT_GETXATTR one; otherwise leave
+ * \a *itp untouched. */
+void ll_frob_intent(struct lookup_intent **itp, struct lookup_intent *deft)
+{
+	struct lookup_intent *given = *itp;
+
+	if (given != NULL && given->it_op != IT_GETXATTR)
+		return;
+
+	*itp = deft;
+}
+
+int ll_revalidate_it(struct dentry *de, int lookup_flags,
+		     struct lookup_intent *it)
+{
+	struct md_op_data *op_data;
+	struct ptlrpc_request *req = NULL;
+	struct lookup_intent lookup_it = { .it_op = IT_LOOKUP };
+	struct obd_export *exp;
+	struct inode *parent = de->d_parent->d_inode;
+	int rc;
+
+	ENTRY;
+	CDEBUG(D_VFSTRACE, "VFS Op:name=%s,intent=%s\n", de->d_name.name,
+	       LL_IT2STR(it));
+
+	if (de->d_inode == NULL) {
+		__u64 ibits;
+
+		/* We can only use negative dentries if this is stat or lookup,
+		   for opens and stuff we do need to query server. */
+		/* If there is IT_CREAT in intent op set, then we must throw
+		   away this negative dentry and actually do the request to
+		   kernel to create whatever needs to be created (if possible)*/
+		if (it && (it->it_op & IT_CREAT))
+			RETURN(0);
+
+		if (d_lustre_invalid(de))
+			RETURN(0);
+
+		ibits = MDS_INODELOCK_UPDATE;
+		rc = ll_have_md_lock(parent, &ibits, LCK_MINMODE);
+		GOTO(out_sa, rc);
+	}
+
+	/* Never execute intents for mount points.
+	 * Attributes will be fixed up in ll_inode_revalidate_it */
+	if (d_mountpoint(de))
+		GOTO(out_sa, rc = 1);
+
+	/* need to get attributes in case root got changed from other client */
+	if (de == de->d_sb->s_root) {
+		rc = __ll_inode_revalidate_it(de, it, MDS_INODELOCK_LOOKUP);
+		if (rc == 0)
+			rc = 1;
+		GOTO(out_sa, rc);
+	}
+
+	exp = ll_i2mdexp(de->d_inode);
+
+	OBD_FAIL_TIMEOUT(OBD_FAIL_MDC_REVALIDATE_PAUSE, 5);
+	ll_frob_intent(&it, &lookup_it);
+	LASSERT(it);
+
+	if (it->it_op == IT_LOOKUP && !d_lustre_invalid(de))
+		RETURN(1);
+
+	if (it->it_op == IT_OPEN) {
+		struct inode *inode = de->d_inode;
+		struct ll_inode_info *lli = ll_i2info(inode);
+		struct obd_client_handle **och_p;
+		__u64 *och_usecount;
+		__u64 ibits;
+
+		/*
+		 * We used to check for MDS_INODELOCK_OPEN here, but in fact
+		 * just having LOOKUP lock is enough to justify inode is the
+		 * same. And if inode is the same and we have suitable
+		 * openhandle, then there is no point in doing another OPEN RPC
+		 * just to throw away newly received openhandle.  There are no
+		 * security implications too, if file owner or access mode is
+		 * change, LOOKUP lock is revoked.
+		 */
+
+
+		if (it->it_flags & FMODE_WRITE) {
+			och_p = &lli->lli_mds_write_och;
+			och_usecount = &lli->lli_open_fd_write_count;
+		} else if (it->it_flags & FMODE_EXEC) {
+			och_p = &lli->lli_mds_exec_och;
+			och_usecount = &lli->lli_open_fd_exec_count;
+		} else {
+			och_p = &lli->lli_mds_read_och;
+			och_usecount = &lli->lli_open_fd_read_count;
+		}
+		/* Check for the proper lock. */
+		ibits = MDS_INODELOCK_LOOKUP;
+		if (!ll_have_md_lock(inode, &ibits, LCK_MINMODE))
+			goto do_lock;
+		mutex_lock(&lli->lli_och_mutex);
+		if (*och_p) { /* Everything is open already, do nothing */
+			/*(*och_usecount)++;  Do not let them steal our open
+			  handle from under us */
+			SET_BUT_UNUSED(och_usecount);
+			/* XXX The code above was my original idea, but in case
+			   we have the handle, but we cannot use it due to later
+			   checks (e.g. O_CREAT|O_EXCL flags set), nobody
+			   would decrement counter increased here. So we just
+			   hope the lock won't be invalidated in between. But
+			   if it would be, we'll reopen the open request to
+			   MDS later during file open path */
+			mutex_unlock(&lli->lli_och_mutex);
+			RETURN(1);
+		} else {
+			mutex_unlock(&lli->lli_och_mutex);
+		}
+	}
+
+	if (it->it_op == IT_GETATTR) {
+		rc = ll_statahead_enter(parent, &de, 0);
+		if (rc == 1)
+			goto mark;
+		else if (rc != -EAGAIN && rc != 0)
+			GOTO(out, rc = 0);
+	}
+
+do_lock:
+	op_data = ll_prep_md_op_data(NULL, parent, de->d_inode,
+				     de->d_name.name, de->d_name.len,
+				     0, LUSTRE_OPC_ANY, NULL);
+	if (IS_ERR(op_data))
+		RETURN(PTR_ERR(op_data));
+
+	if (!IS_POSIXACL(parent) || !exp_connect_umask(exp))
+		it->it_create_mode &= ~current_umask();
+	it->it_create_mode |= M_CHECK_STALE;
+	rc = md_intent_lock(exp, op_data, NULL, 0, it,
+			    lookup_flags,
+			    &req, ll_md_blocking_ast, 0);
+	it->it_create_mode &= ~M_CHECK_STALE;
+	ll_finish_md_op_data(op_data);
+
+	/* If req is NULL, then md_intent_lock only tried to do a lock match;
+	 * if all was well, it will return 1 if it found locks, 0 otherwise. */
+	if (req == NULL && rc >= 0) {
+		if (!rc)
+			goto do_lookup;
+		GOTO(out, rc);
+	}
+
+	if (rc < 0) {
+		if (rc != -ESTALE) {
+			CDEBUG(D_INFO, "ll_intent_lock: rc %d : it->it_status "
+			       "%d\n", rc, it->d.lustre.it_status);
+		}
+		GOTO(out, rc = 0);
+	}
+
+revalidate_finish:
+	rc = ll_revalidate_it_finish(req, it, de);
+	if (rc != 0) {
+		if (rc != -ESTALE && rc != -ENOENT)
+			ll_intent_release(it);
+		GOTO(out, rc = 0);
+	}
+
+	if ((it->it_op & IT_OPEN) && de->d_inode &&
+	    !S_ISREG(de->d_inode->i_mode) &&
+	    !S_ISDIR(de->d_inode->i_mode)) {
+		ll_release_openhandle(de, it);
+	}
+	rc = 1;
+
+out:
+	/* We do not free request as it may be reused during following lookup
+	 * (see comment in mdc/mdc_locks.c::mdc_intent_lock()), request will
+	 * be freed in ll_lookup_it or in ll_intent_release. But if
+	 * request was not completed, we need to free it. (bug 5154, 9903) */
+	if (req != NULL && !it_disposition(it, DISP_ENQ_COMPLETE))
+		ptlrpc_req_finished(req);
+	if (rc == 0) {
+		/* mdt may grant layout lock for the newly created file, so
+		 * release the lock to avoid leaking */
+		ll_intent_drop_lock(it);
+		ll_invalidate_aliases(de->d_inode);
+	} else {
+		__u64 bits = 0;
+		__u64 matched_bits = 0;
+
+		CDEBUG(D_DENTRY, "revalidated dentry %.*s (%p) parent %p "
+		       "inode %p refc %d\n", de->d_name.len,
+		       de->d_name.name, de, de->d_parent, de->d_inode,
+		       d_refcount(de));
+
+		ll_set_lock_data(exp, de->d_inode, it, &bits);
+
+		/* Note: We have to match both LOOKUP and PERM lock
+		 * here to make sure the dentry is valid and no one
+		 * changing the permission.
+		 * But if the client connects < 2.4 server, which will
+		 * only grant LOOKUP lock, so we can only Match LOOKUP
+		 * lock for old server */
+		if (exp_connect_flags(ll_i2mdexp(de->d_inode)) &&
+							OBD_CONNECT_LVB_TYPE)
+			matched_bits =
+				MDS_INODELOCK_LOOKUP | MDS_INODELOCK_PERM;
+		else
+			matched_bits = MDS_INODELOCK_LOOKUP;
+
+		if (((bits & matched_bits) == matched_bits) &&
+		    d_lustre_invalid(de))
+			d_lustre_revalidate(de);
+		ll_lookup_finish_locks(it, de);
+	}
+
+mark:
+	if (it != NULL && it->it_op == IT_GETATTR && rc > 0)
+		ll_statahead_mark(parent, de);
+	RETURN(rc);
+
+	/*
+	 * This part is here to combat evil-evil race in real_lookup on 2.6
+	 * kernels.  The race details are: We enter do_lookup() looking for some
+	 * name, there is nothing in dcache for this name yet and d_lookup()
+	 * returns NULL.  We proceed to real_lookup(), and while we do this,
+	 * another process does open on the same file we looking up (most simple
+	 * reproducer), open succeeds and the dentry is added. Now back to
+	 * us. In real_lookup() we do d_lookup() again and suddenly find the
+	 * dentry, so we call d_revalidate on it, but there is no lock, so
+	 * without this code we would return 0, but unpatched real_lookup just
+	 * returns -ENOENT in such a case instead of retrying the lookup. Once
+	 * this is dealt with in real_lookup(), all of this ugly mess can go and
+	 * we can just check locks in ->d_revalidate without doing any RPCs
+	 * ever.
+	 */
+do_lookup:
+	if (it != &lookup_it) {
+		/* MDS_INODELOCK_UPDATE needed for IT_GETATTR case. */
+		if (it->it_op == IT_GETATTR)
+			lookup_it.it_op = IT_GETATTR;
+		ll_lookup_finish_locks(it, de);
+		it = &lookup_it;
+	}
+
+	/* Do real lookup here. */
+	op_data = ll_prep_md_op_data(NULL, parent, NULL, de->d_name.name,
+				     de->d_name.len, 0, (it->it_op & IT_CREAT ?
+							 LUSTRE_OPC_CREATE :
+							 LUSTRE_OPC_ANY), NULL);
+	if (IS_ERR(op_data))
+		RETURN(PTR_ERR(op_data));
+
+	rc = md_intent_lock(exp, op_data, NULL, 0,  it, 0, &req,
+			    ll_md_blocking_ast, 0);
+	if (rc >= 0) {
+		struct mdt_body *mdt_body;
+		struct lu_fid fid = {.f_seq = 0, .f_oid = 0, .f_ver = 0};
+		mdt_body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+
+		if (de->d_inode)
+			fid = *ll_inode2fid(de->d_inode);
+
+		/* see if we got same inode, if not - return error */
+		if (lu_fid_eq(&fid, &mdt_body->fid1)) {
+			ll_finish_md_op_data(op_data);
+			op_data = NULL;
+			goto revalidate_finish;
+		}
+		ll_intent_release(it);
+	}
+	ll_finish_md_op_data(op_data);
+	GOTO(out, rc = 0);
+
+out_sa:
+	/*
+	 * For rc == 1 case, should not return directly to prevent losing
+	 * statahead windows; for rc == 0 case, the "lookup" will be done later.
+	 */
+	if (it != NULL && it->it_op == IT_GETATTR && rc == 1)
+		ll_statahead_enter(parent, &de, 1);
+	goto mark;
+}
+
+/*
+ * ->d_revalidate() callback (see ll_d_ops).  A cached dentry is always
+ * reported as valid (return value 1); the only real work done here is
+ * feeding the statahead code.
+ *
+ * Statahead is skipped when any of LOOKUP_PARENT, LOOKUP_OPEN or
+ * LOOKUP_CREATE is set, and when ll_need_statahead() does not return a
+ * positive value.  If statahead is wanted but LOOKUP_RCU is set, -ECHILD is
+ * returned instead.
+ */
+int ll_revalidate_nd(struct dentry *dentry, unsigned int flags)
+{
+	struct inode *parent = dentry->d_parent->d_inode;
+	int only_unplug;
+
+	ENTRY;
+	CDEBUG(D_VFSTRACE, "VFS Op:name=%s,flags=%u\n",
+	       dentry->d_name.name, flags);
+
+	/* These lookup kinds never go through statahead. */
+	if (flags & (LOOKUP_PARENT | LOOKUP_OPEN | LOOKUP_CREATE))
+		RETURN(1);
+
+	if (ll_need_statahead(parent, dentry) <= 0)
+		RETURN(1);
+
+	if (flags & LOOKUP_RCU)
+		RETURN(-ECHILD);
+
+	/* A dentry without an inode is passed on with the unplug flag set. */
+	only_unplug = (dentry->d_inode == NULL);
+	do_statahead_enter(parent, &dentry, only_unplug);
+	ll_statahead_mark(parent, dentry);
+
+	RETURN(1);
+}
+
+
+/*
+ * ->d_iput() callback (see ll_d_ops): give up the dentry's reference on
+ * \a inode.  When find_cbdata() returns a false value for the inode, its
+ * link count is cleared first; the reference is then dropped with iput().
+ * \a de itself is not used.
+ */
+void ll_d_iput(struct dentry *de, struct inode *inode)
+{
+	LASSERT(inode);
+	if (!find_cbdata(inode))
+		clear_nlink(inode);
+	iput(inode);
+}
+
+/*
+ * Dentry operations table for llite; each member is wired to the ll_*
+ * callback named on its right-hand side.
+ */
+struct dentry_operations ll_d_ops = {
+	.d_revalidate = ll_revalidate_nd,
+	.d_release = ll_release,
+	.d_delete  = ll_ddelete,
+	.d_iput    = ll_d_iput,
+	.d_compare = ll_dcompare,
+};

diff --git a/drivers/staging/lustre/lustre/llite/dir.c b/drivers/staging/lustre/lustre/llite/dir.c
new file mode 100644
index 0000000..23c61fe
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/dir.c

@@ -0,0 +1,1978 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/llite/dir.c
+ *
+ * Directory code for lustre client.
+ */
+
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/mm.h>
+#include <linux/version.h>
+#include <asm/uaccess.h>
+#include <linux/buffer_head.h>   // for wait_on_buffer
+#include <linux/pagevec.h>
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <lustre/lustre_idl.h>
+#include <obd_support.h>
+#include <obd_class.h>
+#include <lustre_lib.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_lite.h>
+#include <lustre_dlm.h>
+#include <lustre_fid.h>
+#include "llite_internal.h"
+
+/*
+ * (new) readdir implementation overview.
+ *
+ * Original lustre readdir implementation cached exact copy of raw directory
+ * pages on the client. These pages were indexed in client page cache by
+ * logical offset in the directory file. This design, while very simple and
+ * intuitive had some inherent problems:
+ *
+ *     . it implies that byte offset to the directory entry serves as a
+ *     telldir(3)/seekdir(3) cookie, but that offset is not stable: in
+ *     ext3/htree directory entries may move due to splits, and more
+ *     importantly,
+ *
+ *     . it is incompatible with the design of split directories for cmd3,
+ *     that assumes that names are distributed across nodes based on their
+ *     hash, and so readdir should be done in hash order.
+ *
+ * New readdir implementation does readdir in hash order, and uses hash of a
+ * file name as a telldir/seekdir cookie. This led to a number of
+ * complications:
+ *
+ *     . hash is not unique, so it cannot be used to index cached directory
+ *     pages on the client (note, that it requires a whole pageful of hash
+ *     collided entries to cause two pages to have identical hashes);
+ *
+ *     . hash is not unique, so it cannot, strictly speaking, be used as an
+ *     entry cookie. ext3/htree has the same problem and lustre implementation
+ *     mimics their solution: seekdir(hash) positions directory at the first
+ *     entry with the given hash.
+ *
+ * Client side.
+ *
+ * 0. caching
+ *
+ * Client caches directory pages using hash of the first entry as an index. As
+ * noted above hash is not unique, so this solution doesn't work as is:
+ * special processing is needed for "page hash chains" (i.e., sequences of
+ * pages filled with entries all having the same hash value).
+ *
+ * First, such chains have to be detected. To this end, server returns to the
+ * client the hash of the first entry on the page next to one returned. When
+ * client detects that this hash is the same as hash of the first entry on the
+ * returned page, page hash collision has to be handled. Pages in the
+ * hash chain, except first one, are termed "overflow pages".
+ *
+ * Solution to index uniqueness problem is to not cache overflow
+ * pages. Instead, when page hash collision is detected, all overflow pages
+ * from emerging chain are immediately requested from the server and placed in
+ * a special data structure (struct ll_dir_chain). This data structure is used
+ * by ll_readdir() to process entries from overflow pages. When readdir
+ * invocation finishes, overflow pages are discarded. If page hash collision
+ * chain wasn't completely processed, next call to readdir will again detect
+ * page hash collision, again read overflow pages in, process next portion of
+ * entries and again discard the pages. This is not as wasteful as it looks,
+ * because, given reasonable hash, page hash collisions are extremely rare.
+ *
+ * 1. directory positioning
+ *
+ * When seekdir(hash) is called, original
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ * Server.
+ *
+ * identification of and access to overflow pages
+ *
+ * page format
+ *
+ * Page in MDS_READPAGE RPC is packed in LU_PAGE_SIZE, and each page contains
+ * a header lu_dirpage which describes the start/end hash, and whether this
+ * page is empty (contains no dir entry) or hash collide with next page.
+ * After client receives reply, several pages will be integrated into dir page
+ * in PAGE_CACHE_SIZE (if PAGE_CACHE_SIZE greater than LU_PAGE_SIZE), and the
+ * lu_dirpage for this integrated page will be adjusted. See
+ * lmv_adjust_dirpages().
+ *
+ */
+
+/*
+ * Page-cache filler for directory pages; ll_get_dir_page() hands it to
+ * read_cache_page().
+ *
+ * Up to ll_md_brw_size worth of pages are requested with one md_readpage()
+ * call starting at the hash \a _hash points to.  \a page0 is the page being
+ * filled.  The extra pages are allocated here; those that were actually
+ * transferred are inserted into the page cache at the index derived from
+ * their own starting hash, the others are simply released.
+ *
+ * \param _hash	pointer to the __u64 hash to start reading from
+ * \param page0	page to fill
+ *
+ * \retval 0	success, \a page0 has been marked up to date
+ * \retval <0	error from ll_prep_md_op_data() or md_readpage()
+ *
+ * Returns the page unlocked, but with a reference, on success and on
+ * failure alike.
+ */
+static int ll_dir_filler(void *_hash, struct page *page0)
+{
+	struct inode *inode = page0->mapping->host;
+	int hash64 = ll_i2sbi(inode)->ll_flags & LL_SBI_64BIT_HASH;
+	struct obd_export *exp = ll_i2sbi(inode)->ll_md_exp;
+	/* NULL so the cleanup below is safe when no RPC was ever sent. */
+	struct ptlrpc_request *request = NULL;
+	struct mdt_body *body;
+	struct md_op_data *op_data;
+	__u64 hash = *((__u64 *)_hash);
+	struct page **page_pool;
+	struct page *page;
+	struct lu_dirpage *dp;
+	int max_pages = ll_i2sbi(inode)->ll_md_brw_size >> PAGE_CACHE_SHIFT;
+	int nrdpgs = 0; /* number of pages read actually */
+	int npages;
+	int i;
+	int rc;
+	ENTRY;
+
+	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p) hash "LPU64"\n",
+	       inode->i_ino, inode->i_generation, inode, hash);
+
+	LASSERT(max_pages > 0 && max_pages <= MD_MAX_BRW_PAGES);
+
+	/* Without a pool array fall back to reading just page0. */
+	OBD_ALLOC(page_pool, sizeof(*page_pool) * max_pages);
+	if (page_pool != NULL) {
+		page_pool[0] = page0;
+	} else {
+		page_pool = &page0;
+		max_pages = 1;
+	}
+	for (npages = 1; npages < max_pages; npages++) {
+		page = page_cache_alloc_cold(inode->i_mapping);
+		if (!page)
+			break;
+		page_pool[npages] = page;
+	}
+
+	op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
+				     LUSTRE_OPC_ANY, NULL);
+	if (IS_ERR(op_data)) {
+		/*
+		 * Do not dereference the error pointer.  Record the error
+		 * and fall through: the common path below unlocks page0 and
+		 * releases every extra page because rc is negative.
+		 */
+		rc = PTR_ERR(op_data);
+	} else {
+		op_data->op_npages = npages;
+		op_data->op_offset = hash;
+		rc = md_readpage(exp, op_data, page_pool, &request);
+		ll_finish_md_op_data(op_data);
+	}
+	if (rc == 0) {
+		body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
+		/* Checked by mdc_readpage() */
+		LASSERT(body != NULL);
+
+		if (body->valid & OBD_MD_FLSIZE)
+			cl_isize_write(inode, body->size);
+
+		/* Round the transferred byte count up to whole pages. */
+		nrdpgs = (request->rq_bulk->bd_nob_transferred+PAGE_CACHE_SIZE-1)
+			 >> PAGE_CACHE_SHIFT;
+		SetPageUptodate(page0);
+	}
+	unlock_page(page0);
+	if (request != NULL)
+		ptlrpc_req_finished(request);
+
+	CDEBUG(D_VFSTRACE, "read %d/%d pages\n", nrdpgs, npages);
+
+	ll_pagevec_init(&lru_pvec, 0);
+	for (i = 1; i < npages; i++) {
+		unsigned long offset;
+		int ret;
+
+		page = page_pool[i];
+
+		/* Pages that received no data are only released. */
+		if (rc < 0 || i >= nrdpgs) {
+			page_cache_release(page);
+			continue;
+		}
+
+		SetPageUptodate(page);
+
+		/* Each extra page is indexed by its own starting hash. */
+		dp = kmap(page);
+		hash = le64_to_cpu(dp->ldp_hash_start);
+		kunmap(page);
+
+		offset = hash_x_index(hash, hash64);
+
+		prefetchw(&page->flags);
+		ret = add_to_page_cache_lru(page, inode->i_mapping, offset,
+					    GFP_KERNEL);
+		if (ret == 0) {
+			unlock_page(page);
+			if (ll_pagevec_add(&lru_pvec, page) == 0)
+				ll_pagevec_lru_add_file(&lru_pvec);
+		} else {
+			CDEBUG(D_VFSTRACE, "page %lu add to page cache failed:"
+			       " %d\n", offset, ret);
+		}
+		page_cache_release(page);
+	}
+	ll_pagevec_lru_add_file(&lru_pvec);
+
+	if (page_pool != &page0)
+		OBD_FREE(page_pool, sizeof(*page_pool) * max_pages);
+	EXIT;
+	return rc;
+}
+
+/*
+ * Mark \a page as checked.  No validation of the page contents is done yet
+ * (see the XXX below), and \a dir is not used.
+ */
+static void ll_check_page(struct inode *dir, struct page *page)
+{
+	/* XXX: check page format later */
+	SetPageChecked(page);
+}
+
+/*
+ * Unmap \a page and drop one reference on it.
+ *
+ * \param page	 page that is currently kmapped and referenced
+ * \param remove non-zero to additionally truncate the page out of its
+ *		 mapping (under the page lock) before the reference is dropped
+ */
+void ll_release_page(struct page *page, int remove)
+{
+	kunmap(page);
+
+	if (remove != 0) {
+		lock_page(page);
+		/* The page may already have been detached from its mapping. */
+		if (likely(page->mapping != NULL))
+			truncate_complete_page(page->mapping, page);
+		unlock_page(page);
+	}
+
+	page_cache_release(page);
+}
+
+/*
+ * Find, kmap and return page that contains given hash.
+ *
+ * Only the page cache of \a dir is consulted here; fetching a missing page
+ * is left to the caller.
+ *
+ * \param dir	directory inode
+ * \param hash	in: hash to look for.  When an up-to-date page is found and
+ *		a 64-bit hash is used with BITS_PER_LONG == 32, the value is
+ *		replaced by its upper 32 bits.
+ * \param start	out: starting hash of the page found (up-to-date page only)
+ * \param end	out: ending hash of the page found (up-to-date page only)
+ *
+ * \retval page		 kmapped page with a reference held; hand it back
+ *			 with ll_release_page()
+ * \retval NULL		 no usable cached page for \a hash
+ * \retval ERR_PTR(-EIO) a page was found but it is not up to date
+ */
+static struct page *ll_dir_page_locate(struct inode *dir, __u64 *hash,
+				       __u64 *start, __u64 *end)
+{
+	int hash64 = ll_i2sbi(dir)->ll_flags & LL_SBI_64BIT_HASH;
+	struct address_space *mapping = dir->i_mapping;
+	/*
+	 * Complement of hash is used as an index so that
+	 * radix_tree_gang_lookup() can be used to find a page with starting
+	 * hash _smaller_ than one we are looking for.
+	 */
+	unsigned long offset = hash_x_index(*hash, hash64);
+	struct page *page;
+	int found;
+
+	TREE_READ_LOCK_IRQ(mapping);
+	found = radix_tree_gang_lookup(&mapping->page_tree,
+				       (void **)&page, offset, 1);
+	if (found > 0) {
+		struct lu_dirpage *dp;
+
+		/* Pin the page before the tree lock is dropped. */
+		page_cache_get(page);
+		TREE_READ_UNLOCK_IRQ(mapping);
+		/*
+		 * In contrast to find_lock_page() we are sure that directory
+		 * page cannot be truncated (while DLM lock is held) and,
+		 * hence, can avoid restart.
+		 *
+		 * In fact, page cannot be locked here at all, because
+		 * ll_dir_filler() does synchronous io.
+		 */
+		wait_on_page_locked(page);
+		if (PageUptodate(page)) {
+			dp = kmap(page);
+			if (BITS_PER_LONG == 32 && hash64) {
+				*start = le64_to_cpu(dp->ldp_hash_start) >> 32;
+				*end   = le64_to_cpu(dp->ldp_hash_end) >> 32;
+				*hash  = *hash >> 32;
+			} else {
+				*start = le64_to_cpu(dp->ldp_hash_start);
+				*end   = le64_to_cpu(dp->ldp_hash_end);
+			}
+			LASSERTF(*start <= *hash, "start = "LPX64",end = "
+				 LPX64",hash = "LPX64"\n", *start, *end, *hash);
+			CDEBUG(D_VFSTRACE, "page %lu [%llu %llu], hash "LPU64"\n",
+			       offset, *start, *end, *hash);
+			if (*hash > *end) {
+				/* The page ends before the wanted hash: miss. */
+				ll_release_page(page, 0);
+				page = NULL;
+			} else if (*end != *start && *hash == *end) {
+				/*
+				 * upon hash collision, remove this page,
+				 * otherwise put page reference, and
+				 * ll_get_dir_page() will issue RPC to fetch
+				 * the page we want.
+				 */
+				ll_release_page(page,
+				    le32_to_cpu(dp->ldp_flags) & LDF_COLLIDE);
+				page = NULL;
+			}
+		} else {
+			/* Never kmapped on this path, so only drop the ref. */
+			page_cache_release(page);
+			page = ERR_PTR(-EIO);
+		}
+
+	} else {
+		TREE_READ_UNLOCK_IRQ(mapping);
+		page = NULL;
+	}
+	return page;
+}
+
+/*
+ * Return the kmapped directory page of \a dir that covers \a hash, reading
+ * it through ll_dir_filler() when it is not in the page cache.
+ *
+ * Before the page cache is consulted, a PR-mode inodebits lock with
+ * MDS_INODELOCK_UPDATE is either matched or enqueued with an IT_READDIR
+ * intent.  The lock reference is dropped again on the way out through
+ * out_unlock.  Page lookup and read are serialized by lli_readdir_mutex.
+ *
+ * \param dir	directory inode
+ * \param hash	hash position wanted
+ * \param chain	not used by this implementation
+ *
+ * \retval page	   kmapped page with a reference held; hand it back with
+ *		   ll_release_page()
+ * \retval ERR_PTR on failure (-EIO for a bad page or a page-wide hash
+ *		   collision, otherwise the error of the failing step)
+ */
+struct page *ll_get_dir_page(struct inode *dir, __u64 hash,
+			     struct ll_dir_chain *chain)
+{
+	ldlm_policy_data_t policy = {.l_inodebits = {MDS_INODELOCK_UPDATE} };
+	struct address_space *mapping = dir->i_mapping;
+	struct lustre_handle lockh;
+	struct lu_dirpage *dp;
+	struct page *page;
+	ldlm_mode_t mode;
+	int rc;
+	__u64 start = 0;
+	__u64 end = 0;
+	__u64 lhash = hash;
+	struct ll_inode_info *lli = ll_i2info(dir);
+	int hash64 = ll_i2sbi(dir)->ll_flags & LL_SBI_64BIT_HASH;
+
+	mode = LCK_PR;
+	rc = md_lock_match(ll_i2sbi(dir)->ll_md_exp, LDLM_FL_BLOCK_GRANTED,
+			   ll_inode2fid(dir), LDLM_IBITS, &policy, mode, &lockh);
+	if (!rc) {
+		/* No matching lock was found: enqueue a new one. */
+		struct ldlm_enqueue_info einfo = {.ei_type = LDLM_IBITS,
+						  .ei_mode = mode,
+						  .ei_cb_bl =
+						  ll_md_blocking_ast,
+						  .ei_cb_cp =
+						  ldlm_completion_ast,
+						  .ei_cb_gl = NULL,
+						  .ei_cb_wg = NULL,
+						  .ei_cbdata = NULL};
+		struct lookup_intent it = { .it_op = IT_READDIR };
+		struct ptlrpc_request *request;
+		struct md_op_data *op_data;
+
+		op_data = ll_prep_md_op_data(NULL, dir, NULL, NULL, 0, 0,
+		LUSTRE_OPC_ANY, NULL);
+		if (IS_ERR(op_data))
+			return (void *)op_data;
+
+		rc = md_enqueue(ll_i2sbi(dir)->ll_md_exp, &einfo, &it,
+				op_data, &lockh, NULL, 0, NULL, 0);
+
+		ll_finish_md_op_data(op_data);
+
+		/* The enqueue request is not needed past this point. */
+		request = (struct ptlrpc_request *)it.d.lustre.it_data;
+		if (request)
+			ptlrpc_req_finished(request);
+		if (rc < 0) {
+			CERROR("lock enqueue: "DFID" at "LPU64": rc %d\n",
+				PFID(ll_inode2fid(dir)), hash, rc);
+			return ERR_PTR(rc);
+		}
+
+		CDEBUG(D_INODE, "setting lr_lvb_inode to inode %p (%lu/%u)\n",
+		       dir, dir->i_ino, dir->i_generation);
+		md_set_lock_data(ll_i2sbi(dir)->ll_md_exp,
+				 &it.d.lustre.it_lock_handle, dir, NULL);
+	} else {
+		/* for cross-ref object, l_ast_data of the lock may not be set,
+		 * we reset it here */
+		md_set_lock_data(ll_i2sbi(dir)->ll_md_exp, &lockh.cookie,
+				 dir, NULL);
+	}
+	ldlm_lock_dump_handle(D_OTHER, &lockh);
+
+	mutex_lock(&lli->lli_readdir_mutex);
+	/* First try the page cache; lhash may be rewritten by the lookup. */
+	page = ll_dir_page_locate(dir, &lhash, &start, &end);
+	if (IS_ERR(page)) {
+		CERROR("dir page locate: "DFID" at "LPU64": rc %ld\n",
+		       PFID(ll_inode2fid(dir)), lhash, PTR_ERR(page));
+		GOTO(out_unlock, page);
+	} else if (page != NULL) {
+		/*
+		 * XXX nikita: not entirely correct handling of a corner case:
+		 * suppose hash chain of entries with hash value HASH crosses
+		 * border between pages P0 and P1. First both P0 and P1 are
+		 * cached, seekdir() is called for some entry from the P0 part
+		 * of the chain. Later P0 goes out of cache. telldir(HASH)
+		 * happens and finds P1, as it starts with matching hash
+		 * value. Remaining entries from P0 part of the chain are
+		 * skipped. (Is that really a bug?)
+		 *
+		 * Possible solutions: 0. don't cache P1 is such case, handle
+		 * it as an "overflow" page. 1. invalidate all pages at
+		 * once. 2. use HASH|1 as an index for P1.
+		 */
+		GOTO(hash_collision, page);
+	}
+
+	/* Cache miss: read the page in through ll_dir_filler(). */
+	page = read_cache_page(mapping, hash_x_index(hash, hash64),
+			       ll_dir_filler, &lhash);
+	if (IS_ERR(page)) {
+		CERROR("read cache page: "DFID" at "LPU64": rc %ld\n",
+		       PFID(ll_inode2fid(dir)), hash, PTR_ERR(page));
+		GOTO(out_unlock, page);
+	}
+
+	wait_on_page_locked(page);
+	/* Mapped here so that the fail path can use ll_release_page(). */
+	(void)kmap(page);
+	if (!PageUptodate(page)) {
+		CERROR("page not updated: "DFID" at "LPU64": rc %d\n",
+		       PFID(ll_inode2fid(dir)), hash, -5);
+		goto fail;
+	}
+	if (!PageChecked(page))
+		ll_check_page(dir, page);
+	if (PageError(page)) {
+		CERROR("page error: "DFID" at "LPU64": rc %d\n",
+		       PFID(ll_inode2fid(dir)), hash, -5);
+		goto fail;
+	}
+hash_collision:
+	/* The page is kmapped on both paths that reach this label. */
+	dp = page_address(page);
+	if (BITS_PER_LONG == 32 && hash64) {
+		start = le64_to_cpu(dp->ldp_hash_start) >> 32;
+		end   = le64_to_cpu(dp->ldp_hash_end) >> 32;
+		lhash = hash >> 32;
+	} else {
+		start = le64_to_cpu(dp->ldp_hash_start);
+		end   = le64_to_cpu(dp->ldp_hash_end);
+		lhash = hash;
+	}
+	if (end == start) {
+		/* Every entry on the page has the same hash; unsupported. */
+		LASSERT(start == lhash);
+		CWARN("Page-wide hash collision: "LPU64"\n", end);
+		if (BITS_PER_LONG == 32 && hash64)
+			CWARN("Real page-wide hash collision at ["LPU64" "LPU64
+			      "] with hash "LPU64"\n",
+			      le64_to_cpu(dp->ldp_hash_start),
+			      le64_to_cpu(dp->ldp_hash_end), hash);
+		/*
+		 * Fetch whole overflow chain...
+		 *
+		 * XXX not yet.
+		 */
+		goto fail;
+	}
+out_unlock:
+	mutex_unlock(&lli->lli_readdir_mutex);
+	ldlm_lock_decref(&lockh, mode);
+	return page;
+
+fail:
+	/* Unmap the page, drop it from the cache and report -EIO. */
+	ll_release_page(page, 1);
+	page = ERR_PTR(-EIO);
+	goto out_unlock;
+}
+
+/*
+ * Feed the entries of directory \a inode to \a filldir, starting at the
+ * hash stored in *\a _pos.
+ *
+ * Pages are obtained one at a time with ll_get_dir_page().  Iteration stops
+ * when \a filldir returns non-zero, when the end of the directory
+ * (MDS_DIR_END_OFF) is reached, or when a page cannot be obtained.
+ *
+ * \param inode	  directory inode
+ * \param _pos	  in: hash to start from; out: hash at which to resume
+ * \param cookie  opaque argument passed through to \a filldir
+ * \param filldir callback invoked once per directory entry
+ *
+ * \retval 0	no page error occurred (including a stop requested by
+ *		\a filldir)
+ * \retval <0	PTR_ERR() of the page that could not be obtained
+ */
+int ll_dir_read(struct inode *inode, __u64 *_pos, void *cookie,
+		filldir_t filldir)
+{
+	struct ll_inode_info *info       = ll_i2info(inode);
+	struct ll_sb_info    *sbi	= ll_i2sbi(inode);
+	__u64		 pos	= *_pos;
+	int		   api32      = ll_need_32bit_api(sbi);
+	int		   hash64     = sbi->ll_flags & LL_SBI_64BIT_HASH;
+	struct page	  *page;
+	struct ll_dir_chain   chain;
+	int		   done = 0;
+	int		   rc = 0;
+	ENTRY;
+
+	ll_dir_chain_init(&chain);
+
+	page = ll_get_dir_page(inode, pos, &chain);
+
+	while (rc == 0 && !done) {
+		struct lu_dirpage *dp;
+		struct lu_dirent  *ent;
+
+		if (!IS_ERR(page)) {
+			/*
+			 * If page is empty (end of directory is reached),
+			 * use this value.
+			 */
+			__u64 hash = MDS_DIR_END_OFF;
+			__u64 next;
+
+			dp = page_address(page);
+			for (ent = lu_dirent_start(dp); ent != NULL && !done;
+			     ent = lu_dirent_next(ent)) {
+				__u16	  type;
+				int	    namelen;
+				struct lu_fid  fid;
+				__u64	  lhash;
+				__u64	  ino;
+
+				/*
+				 * XXX: implement correct swabbing here.
+				 */
+
+				hash = le64_to_cpu(ent->lde_hash);
+				if (hash < pos)
+					/*
+					 * Skip until we find target hash
+					 * value.
+					 */
+					continue;
+
+				namelen = le16_to_cpu(ent->lde_namelen);
+				if (namelen == 0)
+					/*
+					 * Skip dummy record.
+					 */
+					continue;
+
+				/* 32-bit API callers get the upper half only. */
+				if (api32 && hash64)
+					lhash = hash >> 32;
+				else
+					lhash = hash;
+				fid_le_to_cpu(&fid, &ent->lde_fid);
+				ino = cl_fid_build_ino(&fid, api32);
+				type = ll_dirent_type_get(ent);
+				/* For 'll_nfs_get_name_filldir()', it will try
+				 * to access the 'ent' through its 'lde_name',
+				 * so the parameter 'name' for 'filldir()' must
+				 * be part of the 'ent'. */
+				done = filldir(cookie, ent->lde_name, namelen,
+					       lhash, ino, type);
+			}
+			next = le64_to_cpu(dp->ldp_hash_end);
+			if (!done) {
+				pos = next;
+				if (pos == MDS_DIR_END_OFF) {
+					/*
+					 * End of directory reached.
+					 */
+					done = 1;
+					ll_release_page(page, 0);
+				} else if (1 /* chain is exhausted*/) {
+					/*
+					 * Normal case: continue to the next
+					 * page.
+					 *
+					 * The constant condition above makes
+					 * the overflow-page branch below
+					 * unreachable for now.
+					 */
+					ll_release_page(page,
+					    le32_to_cpu(dp->ldp_flags) &
+							LDF_COLLIDE);
+					next = pos;
+					page = ll_get_dir_page(inode, pos,
+							       &chain);
+				} else {
+					/*
+					 * go into overflow page.
+					 */
+					LASSERT(le32_to_cpu(dp->ldp_flags) &
+						LDF_COLLIDE);
+					ll_release_page(page, 1);
+				}
+			} else {
+				/* Resume from the hash of the last entry seen. */
+				pos = hash;
+				ll_release_page(page, 0);
+			}
+		} else {
+			rc = PTR_ERR(page);
+			CERROR("error reading dir "DFID" at %lu: rc %d\n",
+			       PFID(&info->lli_fid), (unsigned long)pos, rc);
+		}
+	}
+
+	*_pos = pos;
+	ll_dir_chain_fini(&chain);
+	RETURN(rc);
+}
+
+/*
+ * readdir entry point for a Lustre directory.
+ *
+ * Resumes from the hash position saved in the per-file data, lets
+ * ll_dir_read() feed entries to \a filldir, then stores the new position
+ * both in the per-file data and in filp->f_pos.  For f_pos the position is
+ * reduced to its upper 32 bits when the 32-bit API is in use together with
+ * 64-bit hashes, and end-of-directory is mapped to the matching
+ * LL_DIR_END_OFF* constant.
+ *
+ * \retval 0	success; the LPROC_LL_READDIR counter is bumped
+ * \retval <0	error returned by ll_dir_read()
+ */
+static int ll_readdir(struct file *filp, void *cookie, filldir_t filldir)
+{
+	struct inode *dir = filp->f_dentry->d_inode;
+	struct ll_file_data *fd = LUSTRE_FPRIVATE(filp);
+	struct ll_sb_info *sbi = ll_i2sbi(dir);
+	__u64 dir_pos = fd->lfd_pos;
+	int hash64 = sbi->ll_flags & LL_SBI_64BIT_HASH;
+	int api32 = ll_need_32bit_api(sbi);
+	struct path atime_path;
+	int rc;
+	ENTRY;
+
+	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p) pos %lu/%llu "
+	       " 32bit_api %d\n", dir->i_ino, dir->i_generation,
+	       dir, (unsigned long)dir_pos, i_size_read(dir), api32);
+
+	/* Already at end-of-file: nothing to read. */
+	if (dir_pos == MDS_DIR_END_OFF)
+		GOTO(out, rc = 0);
+
+	rc = ll_dir_read(dir, &dir_pos, cookie, filldir);
+	fd->lfd_pos = dir_pos;
+
+	/* Translate the hash position into the f_pos shown to the caller. */
+	if (dir_pos != MDS_DIR_END_OFF)
+		filp->f_pos = (api32 && hash64) ? dir_pos >> 32 : dir_pos;
+	else if (api32)
+		filp->f_pos = LL_DIR_END_OFF_32BIT;
+	else
+		filp->f_pos = LL_DIR_END_OFF;
+
+	filp->f_version = dir->i_version;
+
+	atime_path.mnt = filp->f_path.mnt;
+	atime_path.dentry = filp->f_dentry;
+	touch_atime(&atime_path);
+
+out:
+	if (rc == 0)
+		ll_stats_ops_tally(sbi, LPROC_LL_READDIR, 1);
+
+	RETURN(rc);
+}
+
+/*
+ * Send one parameter string to the MGS through export \a mgc.
+ *
+ * \a string is copied into a freshly allocated struct mgs_send_param, which
+ * is passed to obd_set_info_async() under KEY_SET_INFO and freed again
+ * before returning.
+ *
+ * \param mgc	 export to send the parameter through
+ * \param string NUL-terminated parameter; anything beyond
+ *		 MGS_PARAM_MAXLEN - 1 characters is cut off
+ *
+ * \retval 0	   success
+ * \retval -ENOMEM the parameter buffer could not be allocated
+ * \retval other   error returned by obd_set_info_async()
+ */
+int ll_send_mgc_param(struct obd_export *mgc, char *string)
+{
+	struct mgs_send_param *msp;
+	int rc = 0;
+
+	OBD_ALLOC_PTR(msp);
+	if (!msp)
+		return -ENOMEM;
+
+	/*
+	 * strncpy() does not terminate the destination when the source is
+	 * MGS_PARAM_MAXLEN bytes or longer, so terminate it explicitly.
+	 * mgs_param is assumed to hold MGS_PARAM_MAXLEN bytes, as the copy
+	 * bound already implies.
+	 */
+	strncpy(msp->mgs_param, string, MGS_PARAM_MAXLEN);
+	msp->mgs_param[MGS_PARAM_MAXLEN - 1] = '\0';
+	rc = obd_set_info_async(NULL, mgc, sizeof(KEY_SET_INFO), KEY_SET_INFO,
+				sizeof(struct mgs_send_param), msp, NULL);
+	if (rc)
+		CERROR("Failed to set parameter: %d\n", rc);
+	OBD_FREE_PTR(msp);
+
+	return rc;
+}
+
+/*
+ * Create directory \a filename under \a dir through md_create(), passing
+ * \a lump as the create data and setting CLI_SET_MEA on the operation.
+ *
+ * The mode is 0755 with the process umask applied, plus S_IFDIR.
+ *
+ * \param dir	   parent directory inode
+ * \param lump	   user-supplied description, sent as the create payload
+ * \param filename name of the directory to create
+ *
+ * \retval 0	success
+ * \retval <0	error from ll_prep_md_op_data() or md_create()
+ */
+int ll_dir_setdirstripe(struct inode *dir, struct lmv_user_md *lump,
+			char *filename)
+{
+	struct ptlrpc_request *request = NULL;
+	struct md_op_data *op_data;
+	struct ll_sb_info *sbi = ll_i2sbi(dir);
+	int mode;
+	int err;
+
+	ENTRY;
+
+	mode = (0755 & (S_IRWXUGO|S_ISVTX) & ~current->fs->umask) | S_IFDIR;
+	op_data = ll_prep_md_op_data(NULL, dir, NULL, filename,
+				     strlen(filename), mode, LUSTRE_OPC_MKDIR,
+				     lump);
+	if (IS_ERR(op_data))
+		GOTO(err_exit, err = PTR_ERR(op_data));
+
+	op_data->op_cli_flags |= CLI_SET_MEA;
+	err = md_create(sbi->ll_md_exp, op_data, lump, sizeof(*lump), mode,
+			current_fsuid(), current_fsgid(),
+			cfs_curproc_cap_pack(), 0, &request);
+	ll_finish_md_op_data(op_data);
+	if (err)
+		GOTO(err_exit, err);
+err_exit:
+	/* request is still NULL here when no RPC was sent. */
+	ptlrpc_req_finished(request);
+	/* Leave through RETURN() so the ENTRY above is balanced. */
+	RETURN(err);
+}
+
+/*
+ * Set the striping described by \a lump on directory \a inode with
+ * md_setattr().  When \a set_default is non-zero and the MGC has an MGS
+ * export, the stripe size, count and offset are also sent to the MGS as
+ * "<fsname>-MDT0000.lov.*" parameters.
+ *
+ * \param inode	      directory inode
+ * \param lump	      striping descriptor (LOV_USER_MAGIC_V1 or _V3); may be
+ *		      NULL, in which case the V1 size is used for the setattr
+ *		      and 0 / 0 / -1 are sent as the MGS values
+ * \param set_default non-zero to also update the parameters on the MGS
+ *
+ * \retval 0	   success
+ * \retval -EINVAL \a lump carries an unknown magic
+ * \retval -ENOMEM the parameter buffer could not be allocated
+ * \retval <0	   other error from the steps below
+ */
+int ll_dir_setstripe(struct inode *inode, struct lov_user_md *lump,
+		     int set_default)
+{
+	struct ll_sb_info *sbi = ll_i2sbi(inode);
+	struct md_op_data *op_data;
+	struct ptlrpc_request *req = NULL;
+	int rc = 0;
+	struct lustre_sb_info *lsi = s2lsi(inode->i_sb);
+	struct obd_device *mgc = lsi->lsi_mgc;
+	int lum_size;
+	ENTRY;
+
+	if (lump != NULL) {
+		/*
+		 * This is coming from userspace, so should be in
+		 * local endian.  But the MDS would like it in little
+		 * endian, so we swab it before we send it.
+		 */
+		switch (lump->lmm_magic) {
+		case LOV_USER_MAGIC_V1: {
+			/* True only when host order is not little endian. */
+			if (lump->lmm_magic != cpu_to_le32(LOV_USER_MAGIC_V1))
+				lustre_swab_lov_user_md_v1(lump);
+			lum_size = sizeof(struct lov_user_md_v1);
+			break;
+		}
+		case LOV_USER_MAGIC_V3: {
+			if (lump->lmm_magic != cpu_to_le32(LOV_USER_MAGIC_V3))
+				lustre_swab_lov_user_md_v3(
+					(struct lov_user_md_v3 *)lump);
+			lum_size = sizeof(struct lov_user_md_v3);
+			break;
+		}
+		default: {
+			CDEBUG(D_IOCTL, "bad userland LOV MAGIC:"
+					" %#08x != %#08x nor %#08x\n",
+					lump->lmm_magic, LOV_USER_MAGIC_V1,
+					LOV_USER_MAGIC_V3);
+			RETURN(-EINVAL);
+		}
+		}
+	} else {
+		lum_size = sizeof(struct lov_user_md_v1);
+	}
+
+	op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
+				     LUSTRE_OPC_ANY, NULL);
+	if (IS_ERR(op_data))
+		RETURN(PTR_ERR(op_data));
+
+	if (lump != NULL && lump->lmm_magic == cpu_to_le32(LMV_USER_MAGIC))
+		op_data->op_cli_flags |= CLI_SET_MEA;
+
+	/* swabbing is done in lov_setstripe() on server side */
+	rc = md_setattr(sbi->ll_md_exp, op_data, lump, lum_size,
+			NULL, 0, &req, NULL);
+	ll_finish_md_op_data(op_data);
+	ptlrpc_req_finished(req);
+	if (rc) {
+		if (rc != -EPERM && rc != -EACCES)
+			CERROR("mdc_setattr fails: rc = %d\n", rc);
+	}
+
+	/*
+	 * NOTE(review): a failed md_setattr() does not stop the function;
+	 * with set_default set, rc is overwritten by the MGS updates below.
+	 * Confirm this is intended.
+	 */
+
+	/* In the following we use the fact that LOV_USER_MAGIC_V1 and
+	 LOV_USER_MAGIC_V3 have the same initial fields so we do not
+	 need the make the distiction between the 2 versions */
+	if (set_default && mgc->u.cli.cl_mgc_mgsexp) {
+		char *param = NULL;
+		char *buf;
+
+		OBD_ALLOC(param, MGS_PARAM_MAXLEN);
+		if (param == NULL)
+			GOTO(end, rc = -ENOMEM);
+
+		buf = param;
+		/* Get fsname and assume devname to be -MDT0000. */
+		ll_get_fsname(inode->i_sb, buf, MTI_NAME_MAXLEN);
+		strcat(buf, "-MDT0000.lov");
+		/* buf now points at the end of the common prefix. */
+		buf += strlen(buf);
+
+		/* Set root stripesize */
+		sprintf(buf, ".stripesize=%u",
+			lump ? le32_to_cpu(lump->lmm_stripe_size) : 0);
+		rc = ll_send_mgc_param(mgc->u.cli.cl_mgc_mgsexp, param);
+		if (rc)
+			GOTO(end, rc);
+
+		/* Set root stripecount */
+		sprintf(buf, ".stripecount=%hd",
+			lump ? le16_to_cpu(lump->lmm_stripe_count) : 0);
+		rc = ll_send_mgc_param(mgc->u.cli.cl_mgc_mgsexp, param);
+		if (rc)
+			GOTO(end, rc);
+
+		/* Set root stripeoffset */
+		sprintf(buf, ".stripeoffset=%hd",
+			lump ? le16_to_cpu(lump->lmm_stripe_offset) :
+			(typeof(lump->lmm_stripe_offset))(-1));
+		rc = ll_send_mgc_param(mgc->u.cli.cl_mgc_mgsexp, param);
+
+end:
+		if (param != NULL)
+			OBD_FREE(param, MGS_PARAM_MAXLEN);
+	}
+	RETURN(rc);
+}
+
+/*
+ * Fetch the striping EA of directory \a inode from the MDS with
+ * md_getattr() (OBD_MD_FLEASIZE | OBD_MD_FLDIREA).
+ *
+ * \param inode	   directory inode
+ * \param lmmp	   out: EA located inside the reply of \a request, or NULL
+ *		   when none was retrieved
+ * \param lmm_size out: size of the EA on success
+ * \param request  out: the getattr request.  It is not released here, so
+ *		   presumably the caller has to finish it once done with
+ *		   \a lmmp.
+ *
+ * The three output parameters are written on every path that reaches the
+ * "out" label; they are left untouched by the two early returns.
+ *
+ * \retval 0	    success
+ * \retval -ENODATA the reply carries no EA
+ * \retval -EPROTO  the EA has an unknown magic
+ * \retval <0	    error from ll_get_max_mdsize(), ll_prep_md_op_data() or
+ *		    md_getattr()
+ */
+int ll_dir_getstripe(struct inode *inode, struct lov_mds_md **lmmp,
+		     int *lmm_size, struct ptlrpc_request **request)
+{
+	struct ll_sb_info *sbi = ll_i2sbi(inode);
+	struct mdt_body   *body;
+	struct lov_mds_md *lmm = NULL;
+	struct ptlrpc_request *req = NULL;
+	int rc, lmmsize;
+	struct md_op_data *op_data;
+	/* Pairs with the RETURN() calls below, as elsewhere in this file. */
+	ENTRY;
+
+	rc = ll_get_max_mdsize(sbi, &lmmsize);
+	if (rc)
+		RETURN(rc);
+
+	op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
+				     0, lmmsize, LUSTRE_OPC_ANY,
+				     NULL);
+	if (IS_ERR(op_data))
+		RETURN(PTR_ERR(op_data));
+
+	op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
+	rc = md_getattr(sbi->ll_md_exp, op_data, &req);
+	ll_finish_md_op_data(op_data);
+	if (rc < 0) {
+		CDEBUG(D_INFO, "md_getattr failed on inode "
+		       "%lu/%u: rc %d\n", inode->i_ino,
+		       inode->i_generation, rc);
+		GOTO(out, rc);
+	}
+
+	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+	LASSERT(body != NULL);
+
+	/* From here on lmmsize is the size reported in the reply. */
+	lmmsize = body->eadatasize;
+
+	if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
+	    lmmsize == 0) {
+		GOTO(out, rc = -ENODATA);
+	}
+
+	lmm = req_capsule_server_sized_get(&req->rq_pill,
+					   &RMF_MDT_MD, lmmsize);
+	LASSERT(lmm != NULL);
+
+	/*
+	 * This is coming from the MDS, so is probably in
+	 * little endian.  We convert it to host endian before
+	 * passing it to userspace.
+	 */
+	/* We don't swab objects for directories */
+	switch (le32_to_cpu(lmm->lmm_magic)) {
+	case LOV_MAGIC_V1:
+		/* True only when host order is not little endian. */
+		if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC))
+			lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
+		break;
+	case LOV_MAGIC_V3:
+		if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC))
+			lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
+		break;
+	default:
+		CERROR("unknown magic: %lX\n", (unsigned long)lmm->lmm_magic);
+		rc = -EPROTO;
+	}
+out:
+	*lmmp = lmm;
+	*lmm_size = lmmsize;
+	*request = req;
+	RETURN(rc);
+}
+
+/*
+ *  Get MDT index for the inode.
+ *
+ *  Sends md_getattr() with MF_GET_MDT_IDX set and reads the index back from
+ *  op_data->op_mds.
+ *
+ *  Returns the MDT index on success, or a negative errno when preparing the
+ *  op data or md_getattr() fails.
+ */
+int ll_get_mdt_idx(struct inode *inode)
+{
+	struct ll_sb_info *sbi = ll_i2sbi(inode);
+	struct md_op_data *op_data;
+	int rc, mdtidx;
+	ENTRY;
+
+	op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0,
+				     0, LUSTRE_OPC_ANY, NULL);
+	if (IS_ERR(op_data))
+		RETURN(PTR_ERR(op_data));
+
+	op_data->op_flags |= MF_GET_MDT_IDX;
+	rc = md_getattr(sbi->ll_md_exp, op_data, NULL);
+	/* Sample the result before op_data is released. */
+	mdtidx = op_data->op_mds;
+	ll_finish_md_op_data(op_data);
+	if (rc < 0) {
+		CDEBUG(D_INFO, "md_getattr: %d\n", rc);
+		RETURN(rc);
+	}
+	RETURN(mdtidx);
+}
+
+/**
+ * Generic handler to do any pre-copy work.
+ *
+ * It send a first hsm_progress (with extent length == 0) to coordinator as a
+ * first information for it that real work has started.
+ *
+ * Moreover, for a ARCHIVE request, it will sample the file data version and
+ * store it in \a copy.
+ *
+ * The progress record is sent even when sampling the data version failed, so
+ * that the failure is reported through hpk_errval with HP_FLAG_RETRY set.
+ *
+ * \param[in]     sb	super block of the Lustre mount
+ * \param[in,out] copy	copy descriptor; hc_data_version is filled in for
+ *			ARCHIVE requests
+ *
+ * \return 0 on success, otherwise the first error met: the local failure
+ *	   (inode lookup or data version read) if any, else the result of
+ *	   sending the progress record.
+ */
+static int ll_ioc_copy_start(struct super_block *sb, struct hsm_copy *copy)
+{
+	struct ll_sb_info		*sbi = ll_s2sbi(sb);
+	struct hsm_progress_kernel	 hpk;
+	int				 rc = 0;
+	int				 rc2;
+	ENTRY;
+
+	/* Forge a hsm_progress based on data from copy. */
+	hpk.hpk_fid = copy->hc_hai.hai_fid;
+	hpk.hpk_cookie = copy->hc_hai.hai_cookie;
+	hpk.hpk_extent.offset = copy->hc_hai.hai_extent.offset;
+	hpk.hpk_extent.length = 0;
+	hpk.hpk_flags = 0;
+	hpk.hpk_errval = 0;
+	hpk.hpk_data_version = 0;
+
+
+	/* For archive request, we need to read the current file version. */
+	if (copy->hc_hai.hai_action == HSMA_ARCHIVE) {
+		struct inode	*inode;
+		__u64		 data_version = 0;
+
+		/* Get inode for this fid */
+		inode = search_inode_for_lustre(sb, &copy->hc_hai.hai_fid);
+		if (IS_ERR(inode)) {
+			hpk.hpk_flags |= HP_FLAG_RETRY;
+			/* hpk_errval is >= 0 */
+			hpk.hpk_errval = -PTR_ERR(inode);
+			GOTO(progress, rc = PTR_ERR(inode));
+		}
+
+		/* Read current file data version */
+		rc = ll_data_version(inode, &data_version, 1);
+		iput(inode);
+		if (rc != 0) {
+			CDEBUG(D_HSM, "Could not read file data version of "
+				      DFID" (rc = %d). Archive request ("
+				      LPX64") could not be done.\n",
+				      PFID(&copy->hc_hai.hai_fid), rc,
+				      copy->hc_hai.hai_cookie);
+			hpk.hpk_flags |= HP_FLAG_RETRY;
+			/* hpk_errval must be >= 0 */
+			hpk.hpk_errval = -rc;
+			GOTO(progress, rc);
+		}
+
+		/* Store it the hsm_copy for later copytool use.
+		 * Always modified even if no lsm. */
+		copy->hc_data_version = data_version;
+	}
+
+progress:
+	/* Keep the local error in rc: overwriting it here would report
+	 * success to the caller although the data version was not sampled. */
+	rc2 = obd_iocontrol(LL_IOC_HSM_PROGRESS, sbi->ll_md_exp, sizeof(hpk),
+			    &hpk, NULL);
+
+	/* Return first error */
+	RETURN(rc != 0 ? rc : rc2);
+}
+
+/**
+ * Generic handler to do any post-copy work.
+ *
+ * It will send the last hsm_progress update to coordinator to inform it
+ * that copy is finished and whether it was successful or not.
+ *
+ * Moreover,
+ * - for ARCHIVE request, it will sample the file data version and compare it
+ *   with the version saved in ll_ioc_copy_start(). If they do not match, copy
+ *   will be considered as failed.
+ * - for RESTORE request, it will sample the file data version and send it to
+ *   coordinator which is useful if the file was imported as 'released'.
+ *
+ * A data version mismatch is reported to the coordinator only (hpk_errval is
+ * set to EBUSY); it does not change the value returned to the caller.
+ *
+ * \param[in] sb	super block of the Lustre mount
+ * \param[in] copy	copy descriptor filled by the copytool
+ *
+ * \return 0 on success, otherwise the first error met: the local failure
+ *	   (inode lookup or data version read) if any, else the result of
+ *	   sending the progress record.
+ */
+static int ll_ioc_copy_end(struct super_block *sb, struct hsm_copy *copy)
+{
+	struct ll_sb_info		*sbi = ll_s2sbi(sb);
+	struct hsm_progress_kernel	 hpk;
+	int				 rc = 0;
+	int				 rc2;
+	ENTRY;
+
+	/* If you modify the logic here, also check llapi_hsm_copy_end(). */
+	/* Take care: copy->hc_hai.hai_action, len, gid and data are not
+	 * initialized if copy_end was called with copy == NULL.
+	 */
+
+	/* Forge a hsm_progress based on data from copy. */
+	hpk.hpk_fid = copy->hc_hai.hai_fid;
+	hpk.hpk_cookie = copy->hc_hai.hai_cookie;
+	hpk.hpk_extent = copy->hc_hai.hai_extent;
+	hpk.hpk_flags = copy->hc_flags | HP_FLAG_COMPLETED;
+	hpk.hpk_errval = copy->hc_errval;
+	hpk.hpk_data_version = 0;
+
+	/* For archive request, we need to check the file data was not changed.
+	 *
+	 * For restore request, we need to send the file data version, this is
+	 * useful when the file was created using hsm_import.
+	 */
+	if (((copy->hc_hai.hai_action == HSMA_ARCHIVE) ||
+	     (copy->hc_hai.hai_action == HSMA_RESTORE)) &&
+	    (copy->hc_errval == 0)) {
+		struct inode	*inode;
+		__u64		 data_version = 0;
+
+		/* Get lsm for this fid */
+		inode = search_inode_for_lustre(sb, &copy->hc_hai.hai_fid);
+		if (IS_ERR(inode)) {
+			hpk.hpk_flags |= HP_FLAG_RETRY;
+			/* hpk_errval must be >= 0 */
+			hpk.hpk_errval = -PTR_ERR(inode);
+			GOTO(progress, rc = PTR_ERR(inode));
+		}
+
+		rc = ll_data_version(inode, &data_version,
+				     copy->hc_hai.hai_action == HSMA_ARCHIVE);
+		iput(inode);
+		if (rc) {
+			CDEBUG(D_HSM, "Could not read file data version. "
+				      "Request could not be confirmed.\n");
+			if (hpk.hpk_errval == 0)
+				hpk.hpk_errval = -rc;
+			GOTO(progress, rc);
+		}
+
+		/* Store it the hsm_copy for later copytool use.
+		 * Always modified even if no lsm. */
+		hpk.hpk_data_version = data_version;
+
+		/* File could have been stripped during archiving, so we need
+		 * to check anyway. */
+		if ((copy->hc_hai.hai_action == HSMA_ARCHIVE) &&
+		    (copy->hc_data_version != data_version)) {
+			CDEBUG(D_HSM, "File data version mismatched. "
+			      "File content was changed during archiving. "
+			       DFID", start:"LPX64" current:"LPX64"\n",
+			       PFID(&copy->hc_hai.hai_fid),
+			       copy->hc_data_version, data_version);
+			/* File was changed, send error to cdt. Do not ask for
+			 * retry because if a file is modified frequently,
+			 * the cdt will loop on retried archive requests.
+			 * The policy engine will ask for a new archive later
+			 * when the file will not be modified for some tunable
+			 * time */
+			/* we do not notify caller */
+			hpk.hpk_flags &= ~HP_FLAG_RETRY;
+			/* hpk_errval must be >= 0 */
+			hpk.hpk_errval = EBUSY;
+		}
+
+	}
+
+progress:
+	/* Keep the local error in rc: overwriting it here would hide the
+	 * lookup/data-version failure from the caller. */
+	rc2 = obd_iocontrol(LL_IOC_HSM_PROGRESS, sbi->ll_md_exp, sizeof(hpk),
+			    &hpk, NULL);
+
+	/* Return first error */
+	RETURN(rc != 0 ? rc : rc2);
+}
+
+
+/**
+ * Copy a fixed-size ioctl argument from user space and hand the kernel copy
+ * to obd_iocontrol().
+ *
+ * \param[in] cmd	ioctl command forwarded to obd_iocontrol()
+ * \param[in] exp	export the command is sent to
+ * \param[in] data	user-space address of the argument
+ * \param[in] len	size of the argument in bytes
+ *
+ * \retval -ENOMEM	the temporary buffer could not be allocated
+ * \retval -EFAULT	the user buffer could not be read
+ * \retval other	result of obd_iocontrol()
+ */
+static int copy_and_ioctl(int cmd, struct obd_export *exp, void *data, int len)
+{
+	void *ptr;
+	int rc;
+
+	OBD_ALLOC(ptr, len);
+	if (ptr == NULL)
+		return -ENOMEM;
+	if (copy_from_user(ptr, data, len)) {
+		OBD_FREE(ptr, len);
+		return -EFAULT;
+	}
+	/* Pass the kernel copy, not the raw user pointer: the handler must
+	 * never dereference user memory directly. */
+	rc = obd_iocontrol(cmd, exp, len, ptr, NULL);
+	OBD_FREE(ptr, len);
+	return rc;
+}
+
+/**
+ * Check permissions for a quota control request and forward it.
+ *
+ * \a qctl is a kernel copy of the user request; it is updated in place with
+ * the results (see the QCTL_COPY() back into it and the restored qc_cmd).
+ *
+ * \param[in]     sbi	llite super block info providing the MD/DT exports
+ * \param[in,out] qctl	quota request, updated with the answer
+ *
+ * \retval 0		success
+ * \retval -EPERM	caller is not allowed to run this command
+ * \retval -ENOTTY	unknown quota command
+ * \retval -EOPNOTSUPP	per-target query from a remote client
+ * \retval -EINVAL	per-target query with an unsupported command or
+ *			qc_valid value
+ * \retval -ENOMEM	allocation failure
+ * \retval other	error returned by obd_iocontrol()/obd_quotactl()
+ */
+static int quotactl_ioctl(struct ll_sb_info *sbi, struct if_quotactl *qctl)
+{
+	int cmd = qctl->qc_cmd;
+	int type = qctl->qc_type;
+	int id = qctl->qc_id;
+	int valid = qctl->qc_valid;
+	int rc = 0;
+	ENTRY;
+
+	switch (cmd) {
+	/* These commands require CFS_CAP_SYS_ADMIN and are refused on
+	 * remote clients. */
+	case LUSTRE_Q_INVALIDATE:
+	case LUSTRE_Q_FINVALIDATE:
+	case Q_QUOTAON:
+	case Q_QUOTAOFF:
+	case Q_SETQUOTA:
+	case Q_SETINFO:
+		if (!cfs_capable(CFS_CAP_SYS_ADMIN) ||
+		    sbi->ll_flags & LL_SBI_RMT_CLIENT)
+			RETURN(-EPERM);
+		break;
+	case Q_GETQUOTA:
+		/* Querying one's own effective uid, or a group the caller
+		 * belongs to, needs no privilege; anything else is subject
+		 * to the same check as the commands above. */
+		if (((type == USRQUOTA && current_euid() != id) ||
+		     (type == GRPQUOTA && !in_egroup_p(id))) &&
+		    (!cfs_capable(CFS_CAP_SYS_ADMIN) ||
+		     sbi->ll_flags & LL_SBI_RMT_CLIENT))
+			RETURN(-EPERM);
+		break;
+	case Q_GETINFO:
+		break;
+	default:
+		CERROR("unsupported quotactl op: %#x\n", cmd);
+		RETURN(-ENOTTY);
+	}
+
+	if (valid != QC_GENERAL) {
+		/* Query addressed to a specific target (by MDT index, OST
+		 * index or UUID): only the two "get" commands are accepted,
+		 * and they are rewritten to their Q_GETO* variants. */
+		if (sbi->ll_flags & LL_SBI_RMT_CLIENT)
+			RETURN(-EOPNOTSUPP);
+
+		if (cmd == Q_GETINFO)
+			qctl->qc_cmd = Q_GETOINFO;
+		else if (cmd == Q_GETQUOTA)
+			qctl->qc_cmd = Q_GETOQUOTA;
+		else
+			RETURN(-EINVAL);
+
+		switch (valid) {
+		case QC_MDTIDX:
+			rc = obd_iocontrol(OBD_IOC_QUOTACTL, sbi->ll_md_exp,
+					   sizeof(*qctl), qctl, NULL);
+			break;
+		case QC_OSTIDX:
+			rc = obd_iocontrol(OBD_IOC_QUOTACTL, sbi->ll_dt_exp,
+					   sizeof(*qctl), qctl, NULL);
+			break;
+		case QC_UUID:
+			/* Try the metadata export first and fall back to
+			 * the data export when it answers -EAGAIN. */
+			rc = obd_iocontrol(OBD_IOC_QUOTACTL, sbi->ll_md_exp,
+					   sizeof(*qctl), qctl, NULL);
+			if (rc == -EAGAIN)
+				rc = obd_iocontrol(OBD_IOC_QUOTACTL,
+						   sbi->ll_dt_exp,
+						   sizeof(*qctl), qctl, NULL);
+			break;
+		default:
+			rc = -EINVAL;
+			break;
+		}
+
+		/* NOTE(review): on error qc_cmd is left at the rewritten
+		 * Q_GETO* value; it is only restored on success below. */
+		if (rc)
+			RETURN(rc);
+
+		qctl->qc_cmd = cmd;
+	} else {
+		struct obd_quotactl *oqctl;
+
+		OBD_ALLOC_PTR(oqctl);
+		if (oqctl == NULL)
+			RETURN(-ENOMEM);
+
+		QCTL_COPY(oqctl, qctl);
+		rc = obd_quotactl(sbi->ll_md_exp, oqctl);
+		if (rc) {
+			/* A failed Q_QUOTAON (other than -EALREADY) is
+			 * followed by a Q_QUOTAOFF whose result is ignored. */
+			if (rc != -EALREADY && cmd == Q_QUOTAON) {
+				oqctl->qc_cmd = Q_QUOTAOFF;
+				obd_quotactl(sbi->ll_md_exp, oqctl);
+			}
+			OBD_FREE_PTR(oqctl);
+			RETURN(rc);
+		}
+		/* If QIF_SPACE is not set, client should collect the
+		 * space usage from OSSs by itself */
+		if (cmd == Q_GETQUOTA &&
+		    !(oqctl->qc_dqblk.dqb_valid & QIF_SPACE) &&
+		    !oqctl->qc_dqblk.dqb_curspace) {
+			struct obd_quotactl *oqctl_tmp;
+
+			OBD_ALLOC_PTR(oqctl_tmp);
+			if (oqctl_tmp == NULL)
+				GOTO(out, rc = -ENOMEM);
+
+			oqctl_tmp->qc_cmd = Q_GETOQUOTA;
+			oqctl_tmp->qc_id = oqctl->qc_id;
+			oqctl_tmp->qc_type = oqctl->qc_type;
+
+			/* collect space usage from OSTs */
+			oqctl_tmp->qc_dqblk.dqb_curspace = 0;
+			rc = obd_quotactl(sbi->ll_dt_exp, oqctl_tmp);
+			if (!rc || rc == -EREMOTEIO) {
+				oqctl->qc_dqblk.dqb_curspace =
+					oqctl_tmp->qc_dqblk.dqb_curspace;
+				oqctl->qc_dqblk.dqb_valid |= QIF_SPACE;
+			}
+
+			/* collect space & inode usage from MDTs */
+			oqctl_tmp->qc_dqblk.dqb_curspace = 0;
+			oqctl_tmp->qc_dqblk.dqb_curinodes = 0;
+			rc = obd_quotactl(sbi->ll_md_exp, oqctl_tmp);
+			if (!rc || rc == -EREMOTEIO) {
+				oqctl->qc_dqblk.dqb_curspace +=
+					oqctl_tmp->qc_dqblk.dqb_curspace;
+				oqctl->qc_dqblk.dqb_curinodes =
+					oqctl_tmp->qc_dqblk.dqb_curinodes;
+				oqctl->qc_dqblk.dqb_valid |= QIF_INODES;
+			} else {
+				/* The space total is incomplete without the
+				 * MDT part, so stop advertising it. */
+				oqctl->qc_dqblk.dqb_valid &= ~QIF_SPACE;
+			}
+
+			OBD_FREE_PTR(oqctl_tmp);
+		}
+out:
+		/* Results are copied back even when rc is non-zero here;
+		 * rc is whatever the last step above produced. */
+		QCTL_COPY(qctl, oqctl);
+		OBD_FREE_PTR(oqctl);
+	}
+
+	RETURN(rc);
+}
+
+/**
+ * Copy a NUL-terminated path from user space into a __getname() buffer.
+ *
+ * \param[in] filename	user-space address of the name
+ *
+ * \return the kernel copy, to be released with ll_putname(), or an ERR_PTR():
+ *	   -ENOMEM when no buffer is available, the negative value returned by
+ *	   strncpy_from_user() when the user memory cannot be read, -ENOENT
+ *	   for an empty name, -ENAMETOOLONG when no NUL terminator was found
+ *	   within PATH_MAX bytes.
+ */
+static char *
+ll_getname(const char __user *filename)
+{
+	int ret = 0, len;
+	char *tmp = __getname();
+
+	if (!tmp)
+		return ERR_PTR(-ENOMEM);
+
+	len = strncpy_from_user(tmp, filename, PATH_MAX);
+	if (len < 0)
+		/* Fault while reading user memory: tmp holds no valid name. */
+		ret = len;
+	else if (len == 0)
+		ret = -ENOENT;
+	else if (len >= PATH_MAX)
+		/* strncpy_from_user() returns the count when it ran out of
+		 * room, in which case tmp is not NUL-terminated. */
+		ret = -ENAMETOOLONG;
+
+	if (ret) {
+		__putname(tmp);
+		tmp =  ERR_PTR(ret);
+	}
+	return tmp;
+}
+
+#define ll_putname(filename) __putname(filename)
+
+/**
+ * ioctl entry point for Lustre directories.
+ *
+ * tty ioctls are rejected with -ENOTTY.  Every command handled here is
+ * dispatched by the switch below; a command that is not listed is forwarded
+ * to the data export through obd_iocontrol().
+ *
+ * \param[in] file	open directory the ioctl was issued on
+ * \param[in] cmd	ioctl command
+ * \param[in] arg	command argument, a user-space address for most
+ *			commands (OBD_IOC_QUOTACHECK uses it as a plain value)
+ *
+ * \return 0 or a command-specific value on success, negative errno on
+ *	   failure.
+ */
+static long ll_dir_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	struct ll_sb_info *sbi = ll_i2sbi(inode);
+	struct obd_ioctl_data *data;
+	int rc = 0;
+	ENTRY;
+
+	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), cmd=%#x\n",
+	       inode->i_ino, inode->i_generation, inode, cmd);
+
+	/* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
+	if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
+		return -ENOTTY;
+
+	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
+	switch(cmd) {
+	case FSFILT_IOC_GETFLAGS:
+	case FSFILT_IOC_SETFLAGS:
+		RETURN(ll_iocontrol(inode, file, cmd, arg));
+	case FSFILT_IOC_GETVERSION_OLD:
+	case FSFILT_IOC_GETVERSION:
+		RETURN(put_user(inode->i_generation, (int *)arg));
+	/* We need to special case any other ioctls we want to handle,
+	 * to send them to the MDS/OST as appropriate and to properly
+	 * network encode the arg field.
+	case FSFILT_IOC_SETVERSION_OLD:
+	case FSFILT_IOC_SETVERSION:
+	*/
+	case LL_IOC_GET_MDTIDX: {
+		int mdtidx;
+
+		mdtidx = ll_get_mdt_idx(inode);
+		if (mdtidx < 0)
+			RETURN(mdtidx);
+
+		if (put_user((int)mdtidx, (int*)arg))
+			RETURN(-EFAULT);
+
+		return 0;
+	}
+	case IOC_MDC_LOOKUP: {
+		struct ptlrpc_request *request = NULL;
+		int namelen, len = 0;
+		char *buf = NULL;
+		char *filename;
+		struct md_op_data *op_data;
+
+		rc = obd_ioctl_getdata(&buf, &len, (void *)arg);
+		if (rc)
+			RETURN(rc);
+		data = (void *)buf;
+
+		filename = data->ioc_inlbuf1;
+		namelen = strlen(filename);
+
+		if (namelen < 1) {
+			CDEBUG(D_INFO, "IOC_MDC_LOOKUP missing filename\n");
+			GOTO(out_free, rc = -EINVAL);
+		}
+
+		op_data = ll_prep_md_op_data(NULL, inode, NULL, filename, namelen,
+					     0, LUSTRE_OPC_ANY, NULL);
+		if (IS_ERR(op_data))
+			GOTO(out_free, rc = PTR_ERR(op_data));
+
+		op_data->op_valid = OBD_MD_FLID;
+		rc = md_getattr_name(sbi->ll_md_exp, op_data, &request);
+		ll_finish_md_op_data(op_data);
+		if (rc < 0) {
+			CDEBUG(D_INFO, "md_getattr_name: %d\n", rc);
+			GOTO(out_free, rc);
+		}
+		ptlrpc_req_finished(request);
+		EXIT;
+out_free:
+		obd_ioctl_freedata(buf, len);
+		return rc;
+	}
+	case LL_IOC_LMV_SETSTRIPE: {
+		struct lmv_user_md  *lum;
+		char		*buf = NULL;
+		char		*filename;
+		int		 namelen = 0;
+		int		 lumlen = 0;
+		int		 len;
+		int		 rc;
+
+		rc = obd_ioctl_getdata(&buf, &len, (void *)arg);
+		if (rc)
+			RETURN(rc);
+
+		data = (void *)buf;
+		if (data->ioc_inlbuf1 == NULL || data->ioc_inlbuf2 == NULL ||
+		    data->ioc_inllen1 == 0 || data->ioc_inllen2 == 0)
+			GOTO(lmv_out_free, rc = -EINVAL);
+
+		filename = data->ioc_inlbuf1;
+		namelen = data->ioc_inllen1;
+
+		if (namelen < 1) {
+			CDEBUG(D_INFO, "LL_IOC_LMV_SETSTRIPE missing filename\n");
+			GOTO(lmv_out_free, rc = -EINVAL);
+		}
+		lum = (struct lmv_user_md *)data->ioc_inlbuf2;
+		lumlen = data->ioc_inllen2;
+
+		if (lum->lum_magic != LMV_USER_MAGIC ||
+		    lumlen != sizeof(*lum)) {
+			/* Log the code that is actually returned. */
+			CERROR("%s: wrong lum magic %x or size %d: rc = %d\n",
+			       filename, lum->lum_magic, lumlen, -EINVAL);
+			GOTO(lmv_out_free, rc = -EINVAL);
+		}
+
+		/**
+		 * ll_dir_setdirstripe will be used to set dir stripe
+		 *  mdc_create--->mdt_reint_create (with dirstripe)
+		 */
+		rc = ll_dir_setdirstripe(inode, lum, filename);
+lmv_out_free:
+		obd_ioctl_freedata(buf, len);
+		RETURN(rc);
+
+	}
+	case LL_IOC_LOV_SETSTRIPE: {
+		struct lov_user_md_v3 lumv3;
+		struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
+		struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
+		struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
+
+		int set_default = 0;
+
+		LASSERT(sizeof(lumv3) == sizeof(*lumv3p));
+		LASSERT(sizeof(lumv3.lmm_objects[0]) ==
+			sizeof(lumv3p->lmm_objects[0]));
+		/* first try with v1 which is smaller than v3 */
+		if (copy_from_user(lumv1, lumv1p, sizeof(*lumv1)))
+			RETURN(-EFAULT);
+
+		if ((lumv1->lmm_magic == LOV_USER_MAGIC_V3) ) {
+			if (copy_from_user(&lumv3, lumv3p, sizeof(lumv3)))
+				RETURN(-EFAULT);
+		}
+
+		if (inode->i_sb->s_root == file->f_dentry)
+			set_default = 1;
+
+		/* in v1 and v3 cases lumv1 points to data */
+		rc = ll_dir_setstripe(inode, lumv1, set_default);
+
+		RETURN(rc);
+	}
+	case LL_IOC_LMV_GETSTRIPE: {
+		struct lmv_user_md *lump = (struct lmv_user_md *)arg;
+		struct lmv_user_md lum;
+		struct lmv_user_md *tmp;
+		int lum_size;
+		int rc = 0;
+		int mdtindex;
+
+		if (copy_from_user(&lum, lump, sizeof(struct lmv_user_md)))
+			RETURN(-EFAULT);
+
+		if (lum.lum_magic != LMV_MAGIC_V1)
+			RETURN(-EINVAL);
+
+		lum_size = lmv_user_md_size(1, LMV_MAGIC_V1);
+		OBD_ALLOC(tmp, lum_size);
+		if (tmp == NULL)
+			GOTO(free_lmv, rc = -ENOMEM);
+
+		memcpy(tmp, &lum, sizeof(lum));
+		tmp->lum_type = LMV_STRIPE_TYPE;
+		tmp->lum_stripe_count = 1;
+		mdtindex = ll_get_mdt_idx(inode);
+		if (mdtindex < 0)
+			/* Propagate the real error instead of masking it
+			 * as an allocation failure. */
+			GOTO(free_lmv, rc = mdtindex);
+
+		tmp->lum_stripe_offset = mdtindex;
+		tmp->lum_objects[0].lum_mds = mdtindex;
+		memcpy(&tmp->lum_objects[0].lum_fid, ll_inode2fid(inode),
+		       sizeof(struct lu_fid));
+		if (copy_to_user((void *)arg, tmp, lum_size))
+			GOTO(free_lmv, rc = -EFAULT);
+free_lmv:
+		if (tmp)
+			OBD_FREE(tmp, lum_size);
+		RETURN(rc);
+	}
+	case LL_IOC_REMOVE_ENTRY: {
+		char		*filename = NULL;
+		int		 namelen = 0;
+		int		 rc;
+
+		/* Here is a little hack to avoid sending REINT_RMENTRY to
+		 * unsupported server, which might crash the server(LU-2730),
+		 * Because both LVB_TYPE and REINT_RMENTRY will be supported
+		 * on 2.4, we use OBD_CONNECT_LVB_TYPE to detect whether the
+		 * server will support REINT_RMENTRY XXX*/
+		if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_LVB_TYPE))
+			return -ENOTSUPP;
+
+		filename = ll_getname((const char *)arg);
+		if (IS_ERR(filename))
+			RETURN(PTR_ERR(filename));
+
+		namelen = strlen(filename);
+		if (namelen < 1)
+			GOTO(out_rmdir, rc = -EINVAL);
+
+		rc = ll_rmdir_entry(inode, filename, namelen);
+out_rmdir:
+		if (filename)
+			ll_putname(filename);
+		RETURN(rc);
+	}
+	case LL_IOC_LOV_SWAP_LAYOUTS:
+		RETURN(-EPERM);
+	case LL_IOC_OBD_STATFS:
+		RETURN(ll_obd_statfs(inode, (void *)arg));
+	case LL_IOC_LOV_GETSTRIPE:
+	case LL_IOC_MDC_GETINFO:
+	case IOC_MDC_GETFILEINFO:
+	case IOC_MDC_GETFILESTRIPE: {
+		struct ptlrpc_request *request = NULL;
+		struct lov_user_md *lump;
+		struct lov_mds_md *lmm = NULL;
+		struct mdt_body *body;
+		char *filename = NULL;
+		int lmmsize;
+
+		/* The two *FILE* commands look a name up inside this
+		 * directory; the other two query the directory itself. */
+		if (cmd == IOC_MDC_GETFILEINFO ||
+		    cmd == IOC_MDC_GETFILESTRIPE) {
+			filename = ll_getname((const char *)arg);
+			if (IS_ERR(filename))
+				RETURN(PTR_ERR(filename));
+
+			rc = ll_lov_getstripe_ea_info(inode, filename, &lmm,
+						      &lmmsize, &request);
+		} else {
+			rc = ll_dir_getstripe(inode, &lmm, &lmmsize, &request);
+		}
+
+		if (request) {
+			body = req_capsule_server_get(&request->rq_pill,
+						      &RMF_MDT_BODY);
+			LASSERT(body != NULL);
+		} else {
+			GOTO(out_req, rc);
+		}
+
+		if (rc < 0) {
+			/* The *INFO commands still return the stat data
+			 * when there is no striping EA. */
+			if (rc == -ENODATA && (cmd == IOC_MDC_GETFILEINFO ||
+					       cmd == LL_IOC_MDC_GETINFO))
+				GOTO(skip_lmm, rc = 0);
+			else
+				GOTO(out_req, rc);
+		}
+
+		if (cmd == IOC_MDC_GETFILESTRIPE ||
+		    cmd == LL_IOC_LOV_GETSTRIPE) {
+			lump = (struct lov_user_md *)arg;
+		} else {
+			struct lov_user_mds_data *lmdp;
+			lmdp = (struct lov_user_mds_data *)arg;
+			lump = &lmdp->lmd_lmm;
+		}
+		/* If the full EA cannot be copied, retry with the fixed
+		 * header only and report -EOVERFLOW. */
+		if (copy_to_user(lump, lmm, lmmsize)) {
+			if (copy_to_user(lump, lmm, sizeof(*lump)))
+				GOTO(out_req, rc = -EFAULT);
+			rc = -EOVERFLOW;
+		}
+	skip_lmm:
+		if (cmd == IOC_MDC_GETFILEINFO || cmd == LL_IOC_MDC_GETINFO) {
+			struct lov_user_mds_data *lmdp;
+			lstat_t st = { 0 };
+
+			st.st_dev     = inode->i_sb->s_dev;
+			st.st_mode    = body->mode;
+			st.st_nlink   = body->nlink;
+			st.st_uid     = body->uid;
+			st.st_gid     = body->gid;
+			st.st_rdev    = body->rdev;
+			st.st_size    = body->size;
+			st.st_blksize = PAGE_CACHE_SIZE;
+			st.st_blocks  = body->blocks;
+			st.st_atime   = body->atime;
+			st.st_mtime   = body->mtime;
+			st.st_ctime   = body->ctime;
+			st.st_ino     = inode->i_ino;
+
+			lmdp = (struct lov_user_mds_data *)arg;
+			if (copy_to_user(&lmdp->lmd_st, &st, sizeof(st)))
+				GOTO(out_req, rc = -EFAULT);
+		}
+
+		EXIT;
+	out_req:
+		ptlrpc_req_finished(request);
+		if (filename)
+			ll_putname(filename);
+		return rc;
+	}
+	case IOC_LOV_GETINFO: {
+		struct lov_user_mds_data *lumd;
+		struct lov_stripe_md *lsm;
+		struct lov_user_md *lum;
+		struct lov_mds_md *lmm;
+		int lmmsize;
+		lstat_t st;
+
+		lumd = (struct lov_user_mds_data *)arg;
+		lum = &lumd->lmd_lmm;
+
+		rc = ll_get_max_mdsize(sbi, &lmmsize);
+		if (rc)
+			RETURN(rc);
+
+		OBD_ALLOC_LARGE(lmm, lmmsize);
+		/* The allocation may fail; do not copy into a NULL buffer. */
+		if (lmm == NULL)
+			RETURN(-ENOMEM);
+		if (copy_from_user(lmm, lum, lmmsize))
+			GOTO(free_lmm, rc = -EFAULT);
+
+		switch (lmm->lmm_magic) {
+		case LOV_USER_MAGIC_V1:
+			if (LOV_USER_MAGIC_V1 == cpu_to_le32(LOV_USER_MAGIC_V1))
+				break;
+			/* swab objects first so that stripes num will be sane */
+			lustre_swab_lov_user_md_objects(
+				((struct lov_user_md_v1 *)lmm)->lmm_objects,
+				((struct lov_user_md_v1 *)lmm)->lmm_stripe_count);
+			lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
+			break;
+		case LOV_USER_MAGIC_V3:
+			if (LOV_USER_MAGIC_V3 == cpu_to_le32(LOV_USER_MAGIC_V3))
+				break;
+			/* swab objects first so that stripes num will be sane */
+			lustre_swab_lov_user_md_objects(
+				((struct lov_user_md_v3 *)lmm)->lmm_objects,
+				((struct lov_user_md_v3 *)lmm)->lmm_stripe_count);
+			lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
+			break;
+		default:
+			GOTO(free_lmm, rc = -EINVAL);
+		}
+
+		rc = obd_unpackmd(sbi->ll_dt_exp, &lsm, lmm, lmmsize);
+		if (rc < 0)
+			GOTO(free_lmm, rc = -ENOMEM);
+
+		/* Perform glimpse_size operation. */
+		memset(&st, 0, sizeof(st));
+
+		rc = ll_glimpse_ioctl(sbi, lsm, &st);
+		if (rc)
+			GOTO(free_lsm, rc);
+
+		if (copy_to_user(&lumd->lmd_st, &st, sizeof(st)))
+			GOTO(free_lsm, rc = -EFAULT);
+
+		EXIT;
+	free_lsm:
+		obd_free_memmd(sbi->ll_dt_exp, &lsm);
+	free_lmm:
+		OBD_FREE_LARGE(lmm, lmmsize);
+		return rc;
+	}
+	case OBD_IOC_LLOG_CATINFO: {
+		RETURN(-EOPNOTSUPP);
+	}
+	case OBD_IOC_QUOTACHECK: {
+		struct obd_quotactl *oqctl;
+		int error = 0;
+
+		if (!cfs_capable(CFS_CAP_SYS_ADMIN) ||
+		    sbi->ll_flags & LL_SBI_RMT_CLIENT)
+			RETURN(-EPERM);
+
+		OBD_ALLOC_PTR(oqctl);
+		if (!oqctl)
+			RETURN(-ENOMEM);
+		oqctl->qc_type = arg;
+		rc = obd_quotacheck(sbi->ll_md_exp, oqctl);
+		if (rc < 0) {
+			CDEBUG(D_INFO, "md_quotacheck failed: rc %d\n", rc);
+			error = rc;
+		}
+
+		/* The data export is checked even if the metadata one
+		 * failed; the metadata error takes precedence below. */
+		rc = obd_quotacheck(sbi->ll_dt_exp, oqctl);
+		if (rc < 0)
+			CDEBUG(D_INFO, "obd_quotacheck failed: rc %d\n", rc);
+
+		OBD_FREE_PTR(oqctl);
+		return error ?: rc;
+	}
+	case OBD_IOC_POLL_QUOTACHECK: {
+		struct if_quotacheck *check;
+
+		if (!cfs_capable(CFS_CAP_SYS_ADMIN) ||
+		    sbi->ll_flags & LL_SBI_RMT_CLIENT)
+			RETURN(-EPERM);
+
+		OBD_ALLOC_PTR(check);
+		if (!check)
+			RETURN(-ENOMEM);
+
+		rc = obd_iocontrol(cmd, sbi->ll_md_exp, 0, (void *)check,
+				   NULL);
+		if (rc) {
+			CDEBUG(D_QUOTA, "mdc ioctl %d failed: %d\n", cmd, rc);
+			if (copy_to_user((void *)arg, check,
+					     sizeof(*check)))
+				CDEBUG(D_QUOTA, "copy_to_user failed\n");
+			GOTO(out_poll, rc);
+		}
+
+		rc = obd_iocontrol(cmd, sbi->ll_dt_exp, 0, (void *)check,
+				   NULL);
+		if (rc) {
+			CDEBUG(D_QUOTA, "osc ioctl %d failed: %d\n", cmd, rc);
+			if (copy_to_user((void *)arg, check,
+					     sizeof(*check)))
+				CDEBUG(D_QUOTA, "copy_to_user failed\n");
+			GOTO(out_poll, rc);
+		}
+	out_poll:
+		OBD_FREE_PTR(check);
+		RETURN(rc);
+	}
+#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 7, 50, 0)
+	case LL_IOC_QUOTACTL_18: {
+		/* copy the old 1.x quota struct for internal use, then copy
+		 * back into old format struct.  For 1.8 compatibility. */
+		struct if_quotactl_18 *qctl_18;
+		struct if_quotactl *qctl_20;
+
+		OBD_ALLOC_PTR(qctl_18);
+		if (!qctl_18)
+			RETURN(-ENOMEM);
+
+		OBD_ALLOC_PTR(qctl_20);
+		if (!qctl_20)
+			GOTO(out_quotactl_18, rc = -ENOMEM);
+
+		/* A failed user copy is a fault, not an allocation failure. */
+		if (copy_from_user(qctl_18, (void *)arg, sizeof(*qctl_18)))
+			GOTO(out_quotactl_20, rc = -EFAULT);
+
+		QCTL_COPY(qctl_20, qctl_18);
+		qctl_20->qc_idx = 0;
+
+		/* XXX: dqb_valid was borrowed as a flag to mark that
+		 *      only mds quota is wanted */
+		if (qctl_18->qc_cmd == Q_GETQUOTA &&
+		    qctl_18->qc_dqblk.dqb_valid) {
+			qctl_20->qc_valid = QC_MDTIDX;
+			qctl_20->qc_dqblk.dqb_valid = 0;
+		} else if (qctl_18->obd_uuid.uuid[0] != '\0') {
+			qctl_20->qc_valid = QC_UUID;
+			qctl_20->obd_uuid = qctl_18->obd_uuid;
+		} else {
+			qctl_20->qc_valid = QC_GENERAL;
+		}
+
+		rc = quotactl_ioctl(sbi, qctl_20);
+
+		if (rc == 0) {
+			QCTL_COPY(qctl_18, qctl_20);
+			qctl_18->obd_uuid = qctl_20->obd_uuid;
+
+			if (copy_to_user((void *)arg, qctl_18,
+					     sizeof(*qctl_18)))
+				rc = -EFAULT;
+		}
+
+	out_quotactl_20:
+		OBD_FREE_PTR(qctl_20);
+	out_quotactl_18:
+		OBD_FREE_PTR(qctl_18);
+		RETURN(rc);
+	}
+#else
+#warning "remove old LL_IOC_QUOTACTL_18 compatibility code"
+#endif /* LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 7, 50, 0) */
+	case LL_IOC_QUOTACTL: {
+		struct if_quotactl *qctl;
+
+		OBD_ALLOC_PTR(qctl);
+		if (!qctl)
+			RETURN(-ENOMEM);
+
+		if (copy_from_user(qctl, (void *)arg, sizeof(*qctl)))
+			GOTO(out_quotactl, rc = -EFAULT);
+
+		rc = quotactl_ioctl(sbi, qctl);
+
+		if (rc == 0 && copy_to_user((void *)arg,qctl,sizeof(*qctl)))
+			rc = -EFAULT;
+
+	out_quotactl:
+		OBD_FREE_PTR(qctl);
+		RETURN(rc);
+	}
+	case OBD_IOC_GETDTNAME:
+	case OBD_IOC_GETMDNAME:
+		RETURN(ll_get_obd_name(inode, cmd, arg));
+	case LL_IOC_FLUSHCTX:
+		RETURN(ll_flush_ctx(inode));
+#ifdef CONFIG_FS_POSIX_ACL
+	case LL_IOC_RMTACL: {
+	    if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
+		inode == inode->i_sb->s_root->d_inode) {
+		struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
+
+		LASSERT(fd != NULL);
+		rc = rct_add(&sbi->ll_rct, current_pid(), arg);
+		if (!rc)
+			fd->fd_flags |= LL_FILE_RMTACL;
+		RETURN(rc);
+	    } else
+		RETURN(0);
+	}
+#endif
+	case LL_IOC_GETOBDCOUNT: {
+		int count, vallen;
+		struct obd_export *exp;
+
+		if (copy_from_user(&count, (int *)arg, sizeof(int)))
+			RETURN(-EFAULT);
+
+		/* get ost count when count is zero, get mdt count otherwise */
+		exp = count ? sbi->ll_md_exp : sbi->ll_dt_exp;
+		vallen = sizeof(count);
+		rc = obd_get_info(NULL, exp, sizeof(KEY_TGT_COUNT),
+				  KEY_TGT_COUNT, &vallen, &count, NULL);
+		if (rc) {
+			CERROR("get target count failed: %d\n", rc);
+			RETURN(rc);
+		}
+
+		if (copy_to_user((int *)arg, &count, sizeof(int)))
+			RETURN(-EFAULT);
+
+		RETURN(0);
+	}
+	case LL_IOC_PATH2FID:
+		if (copy_to_user((void *)arg, ll_inode2fid(inode),
+				     sizeof(struct lu_fid)))
+			RETURN(-EFAULT);
+		RETURN(0);
+	case LL_IOC_GET_CONNECT_FLAGS: {
+		RETURN(obd_iocontrol(cmd, sbi->ll_md_exp, 0, NULL, (void*)arg));
+	}
+	case OBD_IOC_CHANGELOG_SEND:
+	case OBD_IOC_CHANGELOG_CLEAR:
+		rc = copy_and_ioctl(cmd, sbi->ll_md_exp, (void *)arg,
+				    sizeof(struct ioc_changelog));
+		RETURN(rc);
+	case OBD_IOC_FID2PATH:
+		RETURN(ll_fid2path(inode, (void *)arg));
+	case LL_IOC_HSM_REQUEST: {
+		struct hsm_user_request	*hur;
+		int			 totalsize;
+
+		OBD_ALLOC_PTR(hur);
+		if (hur == NULL)
+			RETURN(-ENOMEM);
+
+		/* We don't know the true size yet; copy the fixed-size part */
+		if (copy_from_user(hur, (void *)arg, sizeof(*hur))) {
+			OBD_FREE_PTR(hur);
+			RETURN(-EFAULT);
+		}
+
+		/* Compute the whole struct size */
+		totalsize = hur_len(hur);
+		OBD_FREE_PTR(hur);
+		/* The size is derived from user-supplied counts; refuse a
+		 * value that does not fit in a positive int. */
+		if (totalsize < 0)
+			RETURN(-E2BIG);
+		OBD_ALLOC_LARGE(hur, totalsize);
+		if (hur == NULL)
+			RETURN(-ENOMEM);
+
+		/* Copy the whole struct */
+		if (copy_from_user(hur, (void *)arg, totalsize)) {
+			OBD_FREE_LARGE(hur, totalsize);
+			RETURN(-EFAULT);
+		}
+
+		rc = obd_iocontrol(cmd, ll_i2mdexp(inode), totalsize,
+				   hur, NULL);
+
+		OBD_FREE_LARGE(hur, totalsize);
+
+		RETURN(rc);
+	}
+	case LL_IOC_HSM_PROGRESS: {
+		struct hsm_progress_kernel	hpk;
+		struct hsm_progress		hp;
+
+		if (copy_from_user(&hp, (void *)arg, sizeof(hp)))
+			RETURN(-EFAULT);
+
+		hpk.hpk_fid = hp.hp_fid;
+		hpk.hpk_cookie = hp.hp_cookie;
+		hpk.hpk_extent = hp.hp_extent;
+		hpk.hpk_flags = hp.hp_flags;
+		hpk.hpk_errval = hp.hp_errval;
+		hpk.hpk_data_version = 0;
+
+		/* File may not exist in Lustre; all progress
+		 * reported to Lustre root */
+		rc = obd_iocontrol(cmd, sbi->ll_md_exp, sizeof(hpk), &hpk,
+				   NULL);
+		RETURN(rc);
+	}
+	case LL_IOC_HSM_CT_START:
+		rc = copy_and_ioctl(cmd, sbi->ll_md_exp, (void *)arg,
+				    sizeof(struct lustre_kernelcomm));
+		RETURN(rc);
+
+	case LL_IOC_HSM_COPY_START: {
+		struct hsm_copy	*copy;
+		int		 rc;
+
+		OBD_ALLOC_PTR(copy);
+		if (copy == NULL)
+			RETURN(-ENOMEM);
+		if (copy_from_user(copy, (char *)arg, sizeof(*copy))) {
+			OBD_FREE_PTR(copy);
+			RETURN(-EFAULT);
+		}
+
+		rc = ll_ioc_copy_start(inode->i_sb, copy);
+		if (copy_to_user((char *)arg, copy, sizeof(*copy)))
+			rc = -EFAULT;
+
+		OBD_FREE_PTR(copy);
+		RETURN(rc);
+	}
+	case LL_IOC_HSM_COPY_END: {
+		struct hsm_copy	*copy;
+		int		 rc;
+
+		OBD_ALLOC_PTR(copy);
+		if (copy == NULL)
+			RETURN(-ENOMEM);
+		if (copy_from_user(copy, (char *)arg, sizeof(*copy))) {
+			OBD_FREE_PTR(copy);
+			RETURN(-EFAULT);
+		}
+
+		rc = ll_ioc_copy_end(inode->i_sb, copy);
+		if (copy_to_user((char *)arg, copy, sizeof(*copy)))
+			rc = -EFAULT;
+
+		OBD_FREE_PTR(copy);
+		RETURN(rc);
+	}
+	default:
+		RETURN(obd_iocontrol(cmd, sbi->ll_dt_exp, 0, NULL,
+				     (void *)arg));
+	}
+}
+
+/**
+ * llseek handler for directories.
+ *
+ * Resolves \a offset against \a origin, checks that the result lies between
+ * 0 and the end-of-directory offset of the API width in use, and updates both
+ * the file position and the position kept in the ll_file_data.  All of this
+ * runs under the inode's i_mutex.
+ *
+ * \return the new position, or -EINVAL for an unknown \a origin, a positive
+ *	   offset with SEEK_END, or a result outside the valid range.
+ */
+static loff_t ll_dir_seek(struct file *file, loff_t offset, int origin)
+{
+	struct inode *inode = file->f_mapping->host;
+	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
+	struct ll_sb_info *sbi = ll_i2sbi(inode);
+	int api32 = ll_need_32bit_api(sbi);
+	loff_t ret = -EINVAL;
+	ENTRY;
+
+	mutex_lock(&inode->i_mutex);
+
+	/* Turn the (offset, origin) pair into an absolute offset. */
+	if (origin == SEEK_CUR) {
+		offset += file->f_pos;
+	} else if (origin == SEEK_END) {
+		if (offset > 0)
+			GOTO(out, ret);
+		if (api32)
+			offset += LL_DIR_END_OFF_32BIT;
+		else
+			offset += LL_DIR_END_OFF;
+	} else if (origin != SEEK_SET) {
+		GOTO(out, ret);
+	}
+
+	/* Out-of-range results leave ret at -EINVAL. */
+	if (offset < 0)
+		GOTO(out, ret);
+	if (api32) {
+		if (offset > LL_DIR_END_OFF_32BIT)
+			GOTO(out, ret);
+	} else {
+		if (offset > LL_DIR_END_OFF)
+			GOTO(out, ret);
+	}
+
+	if (offset != file->f_pos) {
+		int at_end;
+
+		if (api32)
+			at_end = (offset == LL_DIR_END_OFF_32BIT);
+		else
+			at_end = (offset == LL_DIR_END_OFF);
+
+		if (at_end)
+			fd->lfd_pos = MDS_DIR_END_OFF;
+		else if (api32 && sbi->ll_flags & LL_SBI_64BIT_HASH)
+			fd->lfd_pos = offset << 32;
+		else
+			fd->lfd_pos = offset;
+		file->f_pos = offset;
+		file->f_version = 0;
+	}
+	ret = offset;
+	GOTO(out, ret);
+
+out:
+	mutex_unlock(&inode->i_mutex);
+	return ret;
+}
+
+/* Open handler for directories: delegates to ll_file_open(). */
+int ll_dir_open(struct inode *inode, struct file *file)
+{
+	int rc;
+	ENTRY;
+
+	rc = ll_file_open(inode, file);
+	RETURN(rc);
+}
+
+/* Release handler for directories: delegates to ll_file_release(). */
+int ll_dir_release(struct inode *inode, struct file *file)
+{
+	int rc;
+	ENTRY;
+
+	rc = ll_file_release(inode, file);
+	RETURN(rc);
+}
+
+/* File operations table built from the ll_dir_* handlers, plus
+ * generic_read_dir, ll_readdir and ll_fsync. */
+struct file_operations ll_dir_operations = {
+	.open		= ll_dir_open,
+	.release	= ll_dir_release,
+	.llseek		= ll_dir_seek,
+	.read		= generic_read_dir,
+	.readdir	= ll_readdir,
+	.unlocked_ioctl	= ll_dir_ioctl,
+	.fsync		= ll_fsync,
+};

diff --git a/drivers/staging/lustre/lustre/llite/file.c b/drivers/staging/lustre/lustre/llite/file.c
new file mode 100644
index 0000000..ed1e3f7b
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/file.c

@@ -0,0 +1,3198 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/llite/file.c
+ *
+ * Author: Peter Braam <braam@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+#include <lustre_dlm.h>
+#include <lustre_lite.h>
+#include <linux/pagemap.h>
+#include <linux/file.h>
+#include "llite_internal.h"
+#include <lustre/ll_fiemap.h>
+
+#include "cl_object.h"
+
+/**
+ * Allocate a per-open-file private data structure from ll_file_data_slab.
+ *
+ * \retval the new structure, with fd_write_failed cleared
+ * \retval NULL if the slab allocation failed
+ */
+struct ll_file_data *ll_file_data_get(void)
+{
+	struct ll_file_data *fd;
+
+	OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, __GFP_IO);
+	/* The allocation may fail; callers such as ll_file_open() test the
+	 * result for NULL, so it must not be dereferenced before this check. */
+	if (fd == NULL)
+		return NULL;
+
+	fd->fd_write_failed = false;
+	return fd;
+}
+
+/* Return @fd to ll_file_data_slab; a NULL @fd is silently ignored. */
+static void ll_file_data_put(struct ll_file_data *fd)
+{
+	if (fd == NULL)
+		return;
+
+	OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
+}
+
+/*
+ * Copy the attributes of @inode (fid, mode, timestamps, size, blocks,
+ * extended flags, ioepoch and MDS capability) into @op_data.  When @fh is
+ * not NULL it is copied into op_data->op_handle.  MDS_DATA_MODIFIED is
+ * added to op_bias if the inode carries LLIF_DATA_MODIFIED.
+ */
+void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
+			  struct lustre_handle *fh)
+{
+	struct ll_inode_info *lli = ll_i2info(inode);
+	struct ll_iattr *ll_attr = (struct ll_iattr *)&op_data->op_attr;
+
+	op_data->op_fid1 = lli->lli_fid;
+
+	op_data->op_attr.ia_mode = inode->i_mode;
+	op_data->op_attr.ia_atime = inode->i_atime;
+	op_data->op_attr.ia_mtime = inode->i_mtime;
+	op_data->op_attr.ia_ctime = inode->i_ctime;
+	op_data->op_attr.ia_size = i_size_read(inode);
+	op_data->op_attr_blocks = inode->i_blocks;
+	ll_attr->ia_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
+
+	op_data->op_ioepoch = lli->lli_ioepoch;
+	if (fh != NULL)
+		op_data->op_handle = *fh;
+	op_data->op_capa1 = ll_mdscapa_get(inode);
+
+	if (lli->lli_flags & LLIF_DATA_MODIFIED)
+		op_data->op_bias |= MDS_DATA_MODIFIED;
+}
+
+/**
+ * Pack everything the CLOSE rpc needs into @op_data.
+ *
+ * Mode and timestamps are always marked valid.  For a handle opened for
+ * write, size and blocks are marked valid too unless the export supports
+ * Size-on-MDS and @inode is a regular file, in which case the IO epoch is
+ * closed through ll_ioepoch_close() instead.
+ */
+static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
+			     struct obd_client_handle *och)
+{
+	ENTRY;
+
+	op_data->op_attr.ia_valid = ATTR_MODE |
+				    ATTR_ATIME | ATTR_ATIME_SET |
+				    ATTR_MTIME | ATTR_MTIME_SET |
+				    ATTR_CTIME | ATTR_CTIME_SET;
+
+	if (och->och_flags & FMODE_WRITE) {
+		if (exp_connect_som(ll_i2mdexp(inode)) &&
+		    S_ISREG(inode->i_mode))
+			ll_ioepoch_close(inode, op_data, &och, 0);
+		else
+			op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
+	}
+
+	ll_pack_inode2opdata(inode, op_data, &och->och_fh);
+	ll_prep_md_op_data(op_data, inode, NULL, NULL,
+			   0, 0, LUSTRE_OPC_ANY, NULL);
+	EXIT;
+}
+
+/**
+ * Close the open handle @och of @inode on the metadata server.
+ *
+ * Packs the close attributes with ll_prepare_close() and issues md_close()
+ * through @md_exp.  If md_close() returns -EAGAIN, ll_som_update() is
+ * called and its failure is only logged.  After a successful close the
+ * reply is handed to ll_objects_destroy().
+ *
+ * Unless the handle is queued with ll_queue_done_writing(), @och is freed
+ * here and must not be used by the caller afterwards.
+ *
+ * \retval 0 on success, or when no obd device backs the MD export
+ * \retval -ENOMEM if the md_op_data could not be allocated
+ * \retval other negative value from md_close() or ll_objects_destroy()
+ */
+static int ll_close_inode_openhandle(struct obd_export *md_exp,
+				     struct inode *inode,
+				     struct obd_client_handle *och)
+{
+	struct obd_export *exp = ll_i2mdexp(inode);
+	struct md_op_data *op_data;
+	struct ptlrpc_request *req = NULL;
+	struct obd_device *obd = class_exp2obd(exp);
+	/* Defaults to "closed" so that the early-exit paths below take the
+	 * branch at "out" that frees @och. */
+	int epoch_close = 1;
+	int rc;
+	ENTRY;
+
+	if (obd == NULL) {
+		/*
+		 * XXX: in case of LMV, is this correct to access
+		 * ->exp_handle?
+		 */
+		CERROR("Invalid MDC connection handle "LPX64"\n",
+		       ll_i2mdexp(inode)->exp_handle.h_cookie);
+		GOTO(out, rc = 0);
+	}
+
+	OBD_ALLOC_PTR(op_data);
+	if (op_data == NULL)
+		GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
+
+	ll_prepare_close(inode, op_data, och);
+	/* Remember whether ll_prepare_close() flagged an epoch close. */
+	epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
+	rc = md_close(md_exp, op_data, och->och_mod, &req);
+	if (rc == -EAGAIN) {
+		/* This close must have the epoch closed. */
+		LASSERT(epoch_close);
+		/* MDS has instructed us to obtain Size-on-MDS attribute from
+		 * OSTs and send setattr to back to MDS. */
+		rc = ll_som_update(inode, op_data);
+		if (rc) {
+			CERROR("inode %lu mdc Size-on-MDS update failed: "
+			       "rc = %d\n", inode->i_ino, rc);
+			/* A failed SOM update is logged but not reported. */
+			rc = 0;
+		}
+	} else if (rc) {
+		CERROR("inode %lu mdc close failed: rc = %d\n",
+		       inode->i_ino, rc);
+	}
+
+	/* DATA_MODIFIED flag was successfully sent on close, cancel data
+	 * modification flag. */
+	if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
+		struct ll_inode_info *lli = ll_i2info(inode);
+
+		spin_lock(&lli->lli_lock);
+		lli->lli_flags &= ~LLIF_DATA_MODIFIED;
+		spin_unlock(&lli->lli_lock);
+	}
+
+	ll_finish_md_op_data(op_data);
+
+	if (rc == 0) {
+		rc = ll_objects_destroy(req, inode);
+		if (rc)
+			CERROR("inode %lu ll_objects destroy: rc = %d\n",
+			       inode->i_ino, rc);
+	}
+
+	EXIT;
+out:
+
+	/* A writable regular file on a SOM-capable export whose epoch was
+	 * not closed keeps @och alive and is queued for DONE_WRITING; in
+	 * every other case @och is released right here. */
+	if (exp_connect_som(exp) && !epoch_close &&
+	    S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
+		ll_queue_done_writing(inode, LLIF_DONE_WRITING);
+	} else {
+		md_clear_open_replay_data(md_exp, och);
+		/* Free @och if it is not waiting for DONE_WRITING. */
+		och->och_fh.cookie = DEAD_HANDLE_MAGIC;
+		OBD_FREE_PTR(och);
+	}
+	if (req) /* This is close request */
+		ptlrpc_req_finished(req);
+	return rc;
+}
+
+/*
+ * Close the MDS open handle of @inode that matches the open mode in @flags
+ * (write, exec or read), provided no file descriptor uses it any more.
+ *
+ * The handle is detached from the inode under lli_och_mutex and closed
+ * outside of it.  Returns 0 when the handle is still in use or was already
+ * gone, otherwise the result of ll_close_inode_openhandle().
+ */
+int ll_md_real_close(struct inode *inode, int flags)
+{
+	struct ll_inode_info *lli = ll_i2info(inode);
+	struct obd_client_handle **slot;
+	struct obd_client_handle *handle = NULL;
+	__u64 *users;
+	int rc = 0;
+	ENTRY;
+
+	if (flags & FMODE_WRITE) {
+		slot = &lli->lli_mds_write_och;
+		users = &lli->lli_open_fd_write_count;
+	} else if (flags & FMODE_EXEC) {
+		slot = &lli->lli_mds_exec_och;
+		users = &lli->lli_open_fd_exec_count;
+	} else {
+		LASSERT(flags & FMODE_READ);
+		slot = &lli->lli_mds_read_och;
+		users = &lli->lli_open_fd_read_count;
+	}
+
+	mutex_lock(&lli->lli_och_mutex);
+	/* Only take the handle away when nobody uses it any longer. */
+	if (*users == 0) {
+		handle = *slot;
+		*slot = NULL;
+	}
+	mutex_unlock(&lli->lli_och_mutex);
+
+	/* The handle may already have been freed by a racing closer. */
+	if (handle != NULL)
+		rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
+					       inode, handle);
+
+	RETURN(rc);
+}
+
+/*
+ * Drop the per-file state of @file and, when needed, close the matching
+ * open handle on the MDS.
+ *
+ * A group lock still held through this file is released first.  The open
+ * count for the file's open mode is then decremented under lli_och_mutex.
+ * The MDS is contacted (via ll_md_real_close()) only if md_lock_match()
+ * finds no matching OPEN lock on the inode.  The ll_file_data attached to
+ * @file is always freed.
+ *
+ * Returns 0, or the result of ll_md_real_close().
+ */
+int ll_md_close(struct obd_export *md_exp, struct inode *inode,
+		struct file *file)
+{
+	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
+	struct ll_inode_info *lli = ll_i2info(inode);
+	int rc = 0;
+	ENTRY;
+
+	/* clear group lock, if present */
+	if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
+		ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
+
+	/* Let's see if we have good enough OPEN lock on the file and if
+	   we can skip talking to MDS */
+	if (file->f_dentry->d_inode) { /* Can this ever be false? */
+		int lockmode;
+		int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
+		struct lustre_handle lockh;
+		/* Named differently from the @inode parameter so that it is
+		 * no longer shadowed inside this block. */
+		struct inode *dentry_inode = file->f_dentry->d_inode;
+		ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
+
+		mutex_lock(&lli->lli_och_mutex);
+		if (fd->fd_omode & FMODE_WRITE) {
+			lockmode = LCK_CW;
+			LASSERT(lli->lli_open_fd_write_count);
+			lli->lli_open_fd_write_count--;
+		} else if (fd->fd_omode & FMODE_EXEC) {
+			lockmode = LCK_PR;
+			LASSERT(lli->lli_open_fd_exec_count);
+			lli->lli_open_fd_exec_count--;
+		} else {
+			lockmode = LCK_CR;
+			LASSERT(lli->lli_open_fd_read_count);
+			lli->lli_open_fd_read_count--;
+		}
+		mutex_unlock(&lli->lli_och_mutex);
+
+		if (!md_lock_match(md_exp, flags, ll_inode2fid(dentry_inode),
+				   LDLM_IBITS, &policy, lockmode,
+				   &lockh)) {
+			rc = ll_md_real_close(dentry_inode, fd->fd_omode);
+		}
+	} else {
+		/* Terminate the message with a newline like every other
+		 * CERROR in this file. */
+		CERROR("Releasing a file %p with negative dentry %p. Name %s\n",
+		       file, file->f_dentry, file->f_dentry->d_name.name);
+	}
+
+	LUSTRE_FPRIVATE(file) = NULL;
+	ll_file_data_put(fd);
+	ll_capa_close(inode);
+
+	RETURN(rc);
+}
+
+/* Release handler for files and directories.
+ *
+ * While this returns an error code, the caller, fput(), does not, so we
+ * need to make every effort to clean up all of our state here.  Also,
+ * applications rarely check close errors and even if an error is returned
+ * they will not re-try the close call.
+ *
+ * Returns 0 for the filesystem root (whose ll_file_data is simply freed),
+ * otherwise the result of ll_md_close().
+ */
+int ll_file_release(struct inode *inode, struct file *file)
+{
+	struct ll_file_data *fd;
+	struct ll_sb_info *sbi = ll_i2sbi(inode);
+	struct ll_inode_info *lli = ll_i2info(inode);
+	int rc;
+	ENTRY;
+
+	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
+	       inode->i_generation, inode);
+
+#ifdef CONFIG_FS_POSIX_ACL
+	/* On a remote client, closing the root with LL_FILE_RMTACL set
+	 * drops this process' entries from ll_rct and ll_et. */
+	if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
+	    inode == inode->i_sb->s_root->d_inode) {
+		struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
+
+		LASSERT(fd != NULL);
+		if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
+			fd->fd_flags &= ~LL_FILE_RMTACL;
+			rct_del(&sbi->ll_rct, current_pid());
+			et_search_free(&sbi->ll_et, current_pid());
+		}
+	}
+#endif
+
+	/* Releases of the root dentry are not counted in the stats. */
+	if (inode->i_sb->s_root != file->f_dentry)
+		ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
+	fd = LUSTRE_FPRIVATE(file);
+	LASSERT(fd != NULL);
+
+	/* This is the last ref on @file, but maybe not from the owner pid of
+	 * statahead.  Different processes can open the same dir;
+	 * "ll_opendir_key" means: it is me that should stop the statahead
+	 * thread. */
+	if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
+	    lli->lli_opendir_pid != 0)
+		ll_stop_statahead(inode, lli->lli_opendir_key);
+
+	/* The root only needs its private data freed; ll_md_close() is not
+	 * called for it. */
+	if (inode->i_sb->s_root == file->f_dentry) {
+		LUSTRE_FPRIVATE(file) = NULL;
+		ll_file_data_put(fd);
+		RETURN(0);
+	}
+
+	/* Discard any pending async write error for non-directories. */
+	if (!S_ISDIR(inode->i_mode)) {
+		lov_read_and_clear_async_rc(lli->lli_clob);
+		lli->lli_async_rc = 0;
+	}
+
+	rc = ll_md_close(sbi->ll_md_exp, inode, file);
+
+	if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
+		libcfs_debug_dumplog();
+
+	RETURN(rc);
+}
+
+/*
+ * Send the open intent @itp for the dentry behind @file to the MDS, opening
+ * by FID.
+ *
+ * @lmm/@lmmsize are passed through to md_intent_lock().  When both are
+ * unset, MDS_OPEN_LOCK is requested as well.  On success the inode is
+ * refreshed from the reply with ll_prep_inode().
+ *
+ * Returns 0 on success, -ENOENT when the dentry has no parent or the lookup
+ * was negative, or another negative error from the enqueue or the open.
+ * On every path that passes the "out" label, the request reference in @itp
+ * is dropped and the intent lock released.
+ */
+static int ll_intent_file_open(struct file *file, void *lmm,
+			       int lmmsize, struct lookup_intent *itp)
+{
+	struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
+	struct dentry *parent = file->f_dentry->d_parent;
+	const char *name = file->f_dentry->d_name.name;
+	const int len = file->f_dentry->d_name.len;
+	struct md_op_data *op_data;
+	struct ptlrpc_request *req;
+	__u32 opc = LUSTRE_OPC_ANY;
+	int rc;
+	ENTRY;
+
+	if (!parent)
+		RETURN(-ENOENT);
+
+	/* Usually we come here only for NFSD, and we want open lock.
+	   But we can also get here with pre 2.6.15 patchless kernels, and in
+	   that case that lock is also ok */
+	/* We can also get here if there was cached open handle in revalidate_it
+	 * but it disappeared while we were getting from there to ll_file_open.
+	 * But this means this file was closed and immediately opened which
+	 * makes a good candidate for using OPEN lock */
+	/* If lmmsize & lmm are not 0, we are just setting stripe info
+	 * parameters. No need for the open lock */
+	if (lmm == NULL && lmmsize == 0) {
+		itp->it_flags |= MDS_OPEN_LOCK;
+		if (itp->it_flags & FMODE_WRITE)
+			opc = LUSTRE_OPC_CREATE;
+	}
+
+	op_data  = ll_prep_md_op_data(NULL, parent->d_inode,
+				      file->f_dentry->d_inode, name, len,
+				      O_RDWR, opc, NULL);
+	if (IS_ERR(op_data))
+		RETURN(PTR_ERR(op_data));
+
+	itp->it_flags |= MDS_OPEN_BY_FID;
+	rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
+			    0 /*unused */, &req, ll_md_blocking_ast, 0);
+	ll_finish_md_op_data(op_data);
+	if (rc == -ESTALE) {
+		/* This keeps its own exit path so that the log is not
+		 * flooded with -ESTALE error messages.
+		 */
+		if (!it_disposition(itp, DISP_OPEN_OPEN) ||
+		     it_open_error(DISP_OPEN_OPEN, itp))
+			GOTO(out, rc);
+		/* The open itself succeeded on the MDS: close that handle
+		 * again before reporting -ESTALE. */
+		ll_release_openhandle(file->f_dentry, itp);
+		GOTO(out, rc);
+	}
+
+	if (it_disposition(itp, DISP_LOOKUP_NEG))
+		GOTO(out, rc = -ENOENT);
+
+	if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
+		/* Prefer the enqueue error over the open error. */
+		rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
+		CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
+		GOTO(out, rc);
+	}
+
+	rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL, itp);
+	if (!rc && itp->d.lustre.it_lock_mode)
+		ll_set_lock_data(sbi->ll_md_exp, file->f_dentry->d_inode,
+				 itp, NULL);
+
+out:
+	ptlrpc_req_finished(itp->d.lustre.it_data);
+	it_clear_disposition(itp, DISP_ENQ_COMPLETE);
+	ll_intent_drop_lock(itp);
+
+	RETURN(rc);
+}
+
+/**
+ * Record the @ioepoch obtained at open time in the client's inode info.
+ * No lock is needed: MDS does not believe attributes if a few ioepoch
+ * holders exist, and attributes for a previous ioepoch are skipped by MDS
+ * once a new one is opened.
+ *
+ * A zero @ioepoch, or one equal to the stored value, is a no-op.
+ */
+void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
+{
+	if (ioepoch == 0 || lli->lli_ioepoch == ioepoch)
+		return;
+
+	lli->lli_ioepoch = ioepoch;
+	CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
+	       ioepoch, PFID(&lli->lli_fid));
+}
+
+/*
+ * Initialise the client open handle @och from the MDT reply body stored in
+ * the intent @it (file handle, magic, fid and open flags), record the
+ * ioepoch from the reply in @lli and register the open for replay.
+ *
+ * Returns the result of md_set_open_replay_data().
+ */
+static int ll_och_fill(struct obd_export *md_exp, struct ll_inode_info *lli,
+		       struct lookup_intent *it, struct obd_client_handle *och)
+{
+	struct ptlrpc_request *open_req = it->d.lustre.it_data;
+	struct mdt_body *body;
+
+	LASSERT(och);
+
+	body = req_capsule_server_get(&open_req->rq_pill, &RMF_MDT_BODY);
+	/* reply already checked out */
+	LASSERT(body != NULL);
+
+	och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
+	och->och_flags = it->it_flags;
+	och->och_fid = lli->lli_fid;
+	memcpy(&och->och_fh, &body->handle, sizeof(body->handle));
+
+	ll_ioepoch_open(lli, body->ioepoch);
+
+	return md_set_open_replay_data(md_exp, och, open_req);
+}
+
+/*
+ * Finish an open on the client side: optionally fill the new open handle
+ * @och from the intent reply, then attach @fd to @file as private data,
+ * initialise its readahead state and remember the open mode.
+ *
+ * @och may be NULL when an existing open handle is being reused.
+ * Returns 0 on success or the error from ll_och_fill().
+ */
+int ll_local_open(struct file *file, struct lookup_intent *it,
+		  struct ll_file_data *fd, struct obd_client_handle *och)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	struct ll_inode_info *lli = ll_i2info(inode);
+	ENTRY;
+
+	LASSERT(!LUSTRE_FPRIVATE(file));
+
+	LASSERT(fd != NULL);
+
+	if (och != NULL) {
+		struct ptlrpc_request *open_req = it->d.lustre.it_data;
+		struct mdt_body *reply;
+		int rc;
+
+		rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, lli, it, och);
+		if (rc != 0)
+			RETURN(rc);
+
+		reply = req_capsule_server_get(&open_req->rq_pill,
+					       &RMF_MDT_BODY);
+		if ((it->it_flags & FMODE_WRITE) &&
+		    (reply->valid & OBD_MD_FLSIZE))
+			CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
+			       lli->lli_ioepoch, PFID(&lli->lli_fid));
+	}
+
+	fd->fd_omode = it->it_flags;
+	ll_readahead_init(inode, &fd->fd_ras);
+	LUSTRE_FPRIVATE(file) = fd;
+	RETURN(0);
+}
+
+/* Open a file, and (for the very first open) create objects on the OSTs at
+ * this time.  If opened with O_LOV_DELAY_CREATE, then we don't do the object
+ * creation or open until ll_lov_setstripe() ioctl is called.
+ *
+ * If we already have the stripe MD locally then we don't request it in
+ * md_open(), by passing a lmm_size = 0.
+ *
+ * It is up to the application to ensure no other processes open this file
+ * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
+ * used.  We might be able to avoid races of that sort by getting lli_open_sem
+ * before returning in the O_LOV_DELAY_CREATE case and dropping it here
+ * or in ll_file_release(), but I'm not sure that is desirable/necessary.
+ *
+ * Returns 0 on success, -ENOMEM when the ll_file_data or the open handle
+ * cannot be allocated, or the error reported by the open intent.
+ *
+ * Locking: lli_och_mutex protects the per-mode open handle and its use
+ * count.  The error label out_och_free expects the mutex to be HELD and
+ * drops it; out_openerr expects it NOT to be held.
+ */
+int ll_file_open(struct inode *inode, struct file *file)
+{
+	struct ll_inode_info *lli = ll_i2info(inode);
+	struct lookup_intent *it, oit = { .it_op = IT_OPEN,
+					  .it_flags = file->f_flags };
+	struct obd_client_handle **och_p = NULL;
+	__u64 *och_usecount = NULL;
+	struct ll_file_data *fd;
+	int rc = 0, opendir_set = 0;
+	ENTRY;
+
+	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
+	       inode->i_generation, inode, file->f_flags);
+
+	it = file->private_data; /* XXX: compat macro */
+	file->private_data = NULL; /* prevent ll_local_open assertion */
+
+	fd = ll_file_data_get();
+	/* lli_och_mutex has not been taken yet, so this failure must not go
+	 * through out_och_free, which would unlock a mutex we do not hold. */
+	if (fd == NULL)
+		GOTO(out_openerr, rc = -ENOMEM);
+
+	fd->fd_file = file;
+	if (S_ISDIR(inode->i_mode)) {
+		/* The first opener of a directory becomes the statahead
+		 * owner, identified by its fd and pid. */
+		spin_lock(&lli->lli_sa_lock);
+		if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
+		    lli->lli_opendir_pid == 0) {
+			lli->lli_opendir_key = fd;
+			lli->lli_opendir_pid = current_pid();
+			opendir_set = 1;
+		}
+		spin_unlock(&lli->lli_sa_lock);
+	}
+
+	/* The root directory needs no open handle on the MDS. */
+	if (inode->i_sb->s_root == file->f_dentry) {
+		LUSTRE_FPRIVATE(file) = fd;
+		RETURN(0);
+	}
+
+	if (!it || !it->d.lustre.it_disposition) {
+		/* Convert f_flags into access mode. We cannot use file->f_mode,
+		 * because everything but O_ACCMODE mask was stripped from
+		 * there */
+		if ((oit.it_flags + 1) & O_ACCMODE)
+			oit.it_flags++;
+		if (file->f_flags & O_TRUNC)
+			oit.it_flags |= FMODE_WRITE;
+
+		/* kernel only call f_op->open in dentry_open.  filp_open calls
+		 * dentry_open after call to open_namei that checks permissions.
+		 * Only nfsd_open call dentry_open directly without checking
+		 * permissions and because of that this code below is safe. */
+		if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
+			oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
+
+		/* We do not want O_EXCL here, presumably we opened the file
+		 * already? XXX - NFS implications? */
+		oit.it_flags &= ~O_EXCL;
+
+		/* bug20584, if "it_flags" contains O_CREAT, the file will be
+		 * created if necessary, then "IT_CREAT" should be set to keep
+		 * consistent with it */
+		if (oit.it_flags & O_CREAT)
+			oit.it_op |= IT_CREAT;
+
+		it = &oit;
+	}
+
+restart:
+	/* Let's see if we have file open on MDS already. */
+	if (it->it_flags & FMODE_WRITE) {
+		och_p = &lli->lli_mds_write_och;
+		och_usecount = &lli->lli_open_fd_write_count;
+	} else if (it->it_flags & FMODE_EXEC) {
+		och_p = &lli->lli_mds_exec_och;
+		och_usecount = &lli->lli_open_fd_exec_count;
+	} else {
+		och_p = &lli->lli_mds_read_och;
+		och_usecount = &lli->lli_open_fd_read_count;
+	}
+
+	mutex_lock(&lli->lli_och_mutex);
+	if (*och_p) { /* Open handle is present */
+		if (it_disposition(it, DISP_OPEN_OPEN)) {
+			/* Well, there's extra open request that we do not need,
+			   let's close it somehow. This will decref request. */
+			rc = it_open_error(DISP_OPEN_OPEN, it);
+			if (rc) {
+				mutex_unlock(&lli->lli_och_mutex);
+				GOTO(out_openerr, rc);
+			}
+
+			ll_release_openhandle(file->f_dentry, it);
+		}
+		(*och_usecount)++;
+
+		rc = ll_local_open(file, it, fd, NULL);
+		if (rc) {
+			(*och_usecount)--;
+			mutex_unlock(&lli->lli_och_mutex);
+			GOTO(out_openerr, rc);
+		}
+	} else {
+		LASSERT(*och_usecount == 0);
+		if (!it->d.lustre.it_disposition) {
+			/* We cannot just request lock handle now, new ELC code
+			   means that one of other OPEN locks for this file
+			   could be cancelled, and since blocking ast handler
+			   would attempt to grab och_mutex as well, that would
+			   result in a deadlock */
+			mutex_unlock(&lli->lli_och_mutex);
+			it->it_create_mode |= M_CHECK_STALE;
+			rc = ll_intent_file_open(file, NULL, 0, it);
+			it->it_create_mode &= ~M_CHECK_STALE;
+			if (rc)
+				GOTO(out_openerr, rc);
+
+			goto restart;
+		}
+		OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
+		if (!*och_p)
+			GOTO(out_och_free, rc = -ENOMEM);
+
+		(*och_usecount)++;
+
+		/* md_intent_lock() didn't get a request ref if there was an
+		 * open error, so don't do cleanup on the request here
+		 * (bug 3430) */
+		/* XXX (green): Should not we bail out on any error here, not
+		 * just open error? */
+		rc = it_open_error(DISP_OPEN_OPEN, it);
+		if (rc)
+			GOTO(out_och_free, rc);
+
+		LASSERT(it_disposition(it, DISP_ENQ_OPEN_REF));
+
+		rc = ll_local_open(file, it, fd, *och_p);
+		if (rc)
+			GOTO(out_och_free, rc);
+	}
+	mutex_unlock(&lli->lli_och_mutex);
+	/* @fd now belongs to @file; do not free it on the way out. */
+	fd = NULL;
+
+	/* Must do this outside lli_och_mutex lock to prevent deadlock where
+	   different kind of OPEN lock for this same inode gets cancelled
+	   by ldlm_cancel_lru */
+	if (!S_ISREG(inode->i_mode))
+		GOTO(out_och_free, rc);
+
+	ll_capa_open(inode);
+
+	if (!lli->lli_has_smd) {
+		if (file->f_flags & O_LOV_DELAY_CREATE ||
+		    !(file->f_mode & FMODE_WRITE)) {
+			CDEBUG(D_INODE, "object creation was delayed\n");
+			GOTO(out_och_free, rc);
+		}
+	}
+	file->f_flags &= ~O_LOV_DELAY_CREATE;
+	GOTO(out_och_free, rc);
+
+out_och_free:
+	/* Reached with rc != 0 only while lli_och_mutex is held; the
+	 * success paths arrive here with rc == 0 and the mutex released. */
+	if (rc) {
+		if (och_p && *och_p) {
+			OBD_FREE(*och_p, sizeof (struct obd_client_handle));
+			*och_p = NULL; /* OBD_FREE writes some magic there */
+			(*och_usecount)--;
+		}
+		mutex_unlock(&lli->lli_och_mutex);
+
+out_openerr:
+		if (opendir_set != 0)
+			ll_stop_statahead(inode, lli->lli_opendir_key);
+		if (fd != NULL)
+			ll_file_data_put(fd);
+	} else {
+		ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
+	}
+
+	if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
+		ptlrpc_req_finished(it->d.lustre.it_data);
+		it_clear_disposition(it, DISP_ENQ_OPEN_REF);
+	}
+
+	return rc;
+}
+
+/*
+ * Fill @obdo with the attributes of the objects described by @lsm.
+ *
+ * Issues an asynchronous getattr on @exp and waits for it to complete.
+ * When @sync is non-zero the request is flagged OBD_FL_SRVLOCK.  On
+ * success o_valid is reduced to the size, blocks, block size, timestamp
+ * and data-version bits.
+ *
+ * Returns 0 on success, -ENOMEM if no request set could be allocated, or
+ * the error from the getattr.
+ */
+static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
+			  struct obd_capa *capa, struct obdo *obdo,
+			  __u64 ioepoch, int sync)
+{
+	struct obd_info oinfo = { { { 0 } } };
+	struct ptlrpc_request_set *set;
+	int rc;
+
+	ENTRY;
+
+	LASSERT(lsm != NULL);
+
+	/* Describe what is being asked for. */
+	obdo->o_oi = lsm->lsm_oi;
+	obdo->o_mode = S_IFREG;
+	obdo->o_ioepoch = ioepoch;
+	obdo->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
+			OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
+			OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
+			OBD_MD_FLMTIME | OBD_MD_FLCTIME |
+			OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
+			OBD_MD_FLDATAVERSION;
+	if (sync) {
+		obdo->o_valid |= OBD_MD_FLFLAGS;
+		obdo->o_flags |= OBD_FL_SRVLOCK;
+	}
+
+	oinfo.oi_md = lsm;
+	oinfo.oi_oa = obdo;
+	oinfo.oi_capa = capa;
+
+	set = ptlrpc_prep_set();
+	if (set == NULL) {
+		CERROR("can't allocate ptlrpc set\n");
+		RETURN(-ENOMEM);
+	}
+
+	rc = obd_getattr_async(exp, &oinfo, set);
+	if (rc == 0)
+		rc = ptlrpc_set_wait(set);
+	ptlrpc_set_destroy(set);
+
+	/* Only advertise the attributes callers are meant to consume. */
+	if (rc == 0)
+		oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
+					 OBD_MD_FLATIME | OBD_MD_FLMTIME |
+					 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
+					 OBD_MD_FLDATAVERSION);
+	RETURN(rc);
+}
+
+/**
+  * Fetch the object attributes of @inode into @obdo and refresh the inode
+  * fields from them.
+  * If @sync != 0, perform the getattr under the server-side lock.
+  *
+  * Returns 0 on success or the error from ll_lsm_getattr().
+  */
+int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
+		     __u64 ioepoch, int sync)
+{
+	struct obd_capa *capa = ll_mdscapa_get(inode);
+	struct lov_stripe_md *lsm;
+	struct ost_id *oi;
+	int rc;
+	ENTRY;
+
+	lsm = ccc_inode_lsm_get(inode);
+	rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
+			    capa, obdo, ioepoch, sync);
+	capa_put(capa);
+	if (rc != 0) {
+		ccc_inode_lsm_put(inode, lsm);
+		RETURN(rc);
+	}
+
+	oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
+
+	obdo_refresh_inode(inode, obdo, obdo->o_valid);
+	CDEBUG(D_INODE, "objid "DOSTID" size %llu, blocks %llu,"
+	       " blksize %lu\n", POSTID(oi), i_size_read(inode),
+	       (unsigned long long)inode->i_blocks,
+	       (unsigned long)ll_inode_blksize(inode));
+
+	ccc_inode_lsm_put(inode, lsm);
+	RETURN(rc);
+}
+
+/*
+ * Merge the timestamps cached in lli_lvb with the attributes of the
+ * inode's cl_object, keeping the most recent of each, and update i_size
+ * and i_blocks from the object attributes.  Runs under the inode size
+ * lock.
+ *
+ * Returns 0 on success or the error from cl_object_attr_get(), in which
+ * case only the lli_lvb timestamps have been applied to the inode.
+ */
+int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
+{
+	struct ll_inode_info *lli = ll_i2info(inode);
+	struct cl_object *obj = lli->lli_clob;
+	struct cl_attr *attr = ccc_env_thread_attr(env);
+	struct ost_lvb lvb;
+	int rc;
+
+	ENTRY;
+
+	ll_inode_size_lock(inode);
+
+	/* Start from the timestamps most recently obtained from the MDS;
+	 * they are merged with the OST ones below. */
+	LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
+	LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
+	LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
+	inode_init_lvb(inode, &lvb);
+
+	cl_object_attr_lock(obj);
+	rc = cl_object_attr_get(env, obj, attr);
+	cl_object_attr_unlock(obj);
+
+	if (rc != 0) {
+		ll_inode_size_unlock(inode);
+		RETURN(rc);
+	}
+
+	/* Keep whichever timestamp is newer. */
+	if (attr->cat_atime > lvb.lvb_atime)
+		lvb.lvb_atime = attr->cat_atime;
+	if (attr->cat_ctime > lvb.lvb_ctime)
+		lvb.lvb_ctime = attr->cat_ctime;
+	if (attr->cat_mtime > lvb.lvb_mtime)
+		lvb.lvb_mtime = attr->cat_mtime;
+
+	CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
+			PFID(&lli->lli_fid), attr->cat_size);
+	cl_isize_write_nolock(inode, attr->cat_size);
+
+	inode->i_blocks = attr->cat_blocks;
+
+	LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
+	LTIME_S(inode->i_atime) = lvb.lvb_atime;
+	LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
+
+	ll_inode_size_unlock(inode);
+
+	RETURN(rc);
+}
+
+/*
+ * Query size, blocks and timestamps of the objects in @lsm and store them
+ * in @st.  @st is left untouched on failure.
+ *
+ * Returns 0 on success or the error from ll_lsm_getattr().
+ */
+int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
+		     lstat_t *st)
+{
+	struct obdo obdo = { 0 };
+	int rc;
+
+	rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
+	if (rc != 0)
+		return rc;
+
+	st->st_size   = obdo.o_size;
+	st->st_blocks = obdo.o_blocks;
+	st->st_mtime  = obdo.o_mtime;
+	st->st_atime  = obdo.o_atime;
+	st->st_ctime  = obdo.o_ctime;
+
+	return rc;
+}
+
+/*
+ * Prepare @io for a read (@write == 0) or write (@write != 0) on @file:
+ * copy the non-blocking/append/sync hints from the open flags, point the
+ * io at the inode's cl_object and choose the lock requirement.
+ */
+void ll_io_init(struct cl_io *io, const struct file *file, int write)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+
+	io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
+	if (write) {
+		io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
+		io->u.ci_wr.wr_sync =
+			(file->f_flags & (O_SYNC | O_DIRECT)) ||
+			IS_SYNC(inode);
+	}
+
+	io->ci_obj = ll_i2info(inode)->lli_clob;
+
+	if (ll_file_nolock(file)) {
+		/* lockless io: never lock locally, nor on the server */
+		io->ci_lockreq = CILR_NEVER;
+		io->ci_no_srvlock = 1;
+	} else if (file->f_flags & O_APPEND) {
+		io->ci_lockreq = CILR_MANDATORY;
+	} else {
+		io->ci_lockreq = CILR_MAYBE;
+	}
+}
+
+/*
+ * Common driver for read, write and splice-read on @file.
+ *
+ * Sets up the per-thread cl_io for @iot at *@ppos/@count, copies the
+ * subtype-specific arguments from @args into the vvp/ccc io state and
+ * runs cl_io_loop().  *@ppos is advanced when any bytes were transferred.
+ * If nothing was transferred and the io asks for a restart, the whole
+ * operation is retried.
+ *
+ * Returns the number of bytes transferred, or a negative error.
+ */
+static ssize_t
+ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
+		   struct file *file, enum cl_io_type iot,
+		   loff_t *ppos, size_t count)
+{
+	struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
+	struct ll_file_data  *fd  = LUSTRE_FPRIVATE(file);
+	struct cl_io	 *io;
+	ssize_t	       result;
+	ENTRY;
+
+restart:
+	io = ccc_env_thread_io(env);
+	ll_io_init(io, file, iot == CIT_WRITE);
+
+	if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
+		struct vvp_io *vio = vvp_env_io(env);
+		struct ccc_io *cio = ccc_env_io(env);
+		int write_mutex_locked = 0;
+
+		cio->cui_fd  = LUSTRE_FPRIVATE(file);
+		vio->cui_io_subtype = args->via_io_subtype;
+
+		switch (vio->cui_io_subtype) {
+		case IO_NORMAL:
+			cio->cui_iov = args->u.normal.via_iov;
+			cio->cui_nrsegs = args->u.normal.via_nrsegs;
+			cio->cui_tot_nrsegs = cio->cui_nrsegs;
+			cio->cui_iocb = args->u.normal.via_iocb;
+			/* Writes take lli_write_mutex unless the file is
+			 * group-locked; reads hold lli_trunc_sem for
+			 * reading. */
+			if ((iot == CIT_WRITE) &&
+			    !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
+				if (mutex_lock_interruptible(&lli->
+							       lli_write_mutex))
+					GOTO(out, result = -ERESTARTSYS);
+				write_mutex_locked = 1;
+			} else if (iot == CIT_READ) {
+				down_read(&lli->lli_trunc_sem);
+			}
+			break;
+		case IO_SENDFILE:
+			vio->u.sendfile.cui_actor = args->u.sendfile.via_actor;
+			vio->u.sendfile.cui_target = args->u.sendfile.via_target;
+			break;
+		case IO_SPLICE:
+			vio->u.splice.cui_pipe = args->u.splice.via_pipe;
+			vio->u.splice.cui_flags = args->u.splice.via_flags;
+			break;
+		default:
+			CERROR("Unknow IO type - %u\n", vio->cui_io_subtype);
+			LBUG();
+		}
+		result = cl_io_loop(env, io);
+		/* Release whichever lock was taken for IO_NORMAL above. */
+		if (write_mutex_locked)
+			mutex_unlock(&lli->lli_write_mutex);
+		else if (args->via_io_subtype == IO_NORMAL && iot == CIT_READ)
+			up_read(&lli->lli_trunc_sem);
+	} else {
+		/* cl_io_rw_init() handled IO */
+		result = io->ci_result;
+	}
+
+	/* Any transferred bytes take precedence over an error code. */
+	if (io->ci_nob > 0) {
+		result = io->ci_nob;
+		*ppos = io->u.ci_wr.wr.crw_pos;
+	}
+	GOTO(out, result);
+out:
+	cl_io_fini(env, io);
+	/* If any bit been read/written (result != 0), we just return
+	 * short read/write instead of restart io. */
+	if (result == 0 && io->ci_need_restart) {
+		CDEBUG(D_VFSTRACE, "Restart %s on %s from %lld, count:%zd\n",
+		       iot == CIT_READ ? "read" : "write",
+		       file->f_dentry->d_name.name, *ppos, count);
+		LASSERTF(io->ci_nob == 0, "%zd", io->ci_nob);
+		goto restart;
+	}
+
+	/* Account the transfer and track whether the last write failed;
+	 * -ERESTARTSYS does not count as a write failure. */
+	if (iot == CIT_READ) {
+		if (result >= 0)
+			ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
+					   LPROC_LL_READ_BYTES, result);
+	} else if (iot == CIT_WRITE) {
+		if (result >= 0) {
+			ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
+					   LPROC_LL_WRITE_BYTES, result);
+			fd->fd_write_failed = false;
+		} else if (result != -ERESTARTSYS) {
+			fd->fd_write_failed = true;
+		}
+	}
+
+	return result;
+}
+
+
+/*
+ * XXX: adapted from kernel code (__generic_file_aio_write_nolock)
+ *
+ * Sum the lengths of the first *@nr_segs segments of @iov into *@count.
+ * If a segment other than the first fails access_ok(), *@nr_segs is
+ * trimmed to the segments before it and only those are counted.
+ *
+ * Returns 0 on success, -EINVAL if a length or the running total is
+ * negative when viewed as ssize_t, or -EFAULT if the very first segment
+ * is not accessible.  *@count is only written on success.
+ */
+static int ll_file_get_iov_count(const struct iovec *iov,
+				 unsigned long *nr_segs, size_t *count)
+{
+	size_t total = 0;
+	unsigned long i;
+
+	for (i = 0; i < *nr_segs; i++) {
+		const struct iovec *cur = &iov[i];
+
+		/*
+		 * If any segment has a negative length, or the cumulative
+		 * length ever wraps negative then return -EINVAL.
+		 */
+		total += cur->iov_len;
+		if (unlikely((ssize_t)(total|cur->iov_len) < 0))
+			return -EINVAL;
+
+		if (!access_ok(VERIFY_READ, cur->iov_base, cur->iov_len)) {
+			if (i == 0)
+				return -EFAULT;
+			/* This segment is no good: stop just before it. */
+			*nr_segs = i;
+			total -= cur->iov_len;
+			break;
+		}
+	}
+
+	*count = total;
+	return 0;
+}
+
+/*
+ * aio_read entry point: validates the iovec, then runs a CIT_READ through
+ * ll_file_io_generic() at iocb->ki_pos.  The @pos argument is not used.
+ */
+static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
+				unsigned long nr_segs, loff_t pos)
+{
+	struct vvp_io_args *io_args;
+	struct lu_env *env;
+	size_t nbytes;
+	ssize_t rc;
+	int refcheck;
+	ENTRY;
+
+	rc = ll_file_get_iov_count(iov, &nr_segs, &nbytes);
+	if (rc != 0)
+		RETURN(rc);
+
+	env = cl_env_get(&refcheck);
+	if (IS_ERR(env))
+		RETURN(PTR_ERR(env));
+
+	io_args = vvp_env_args(env, IO_NORMAL);
+	io_args->u.normal.via_iocb = iocb;
+	io_args->u.normal.via_iov = (struct iovec *)iov;
+	io_args->u.normal.via_nrsegs = nr_segs;
+
+	rc = ll_file_io_generic(env, io_args, iocb->ki_filp, CIT_READ,
+				&iocb->ki_pos, nbytes);
+	cl_env_put(env, &refcheck);
+	RETURN(rc);
+}
+
+/*
+ * Synchronous read: wraps @buf/@count in the per-thread iovec and kiocb
+ * and forwards to ll_file_aio_read(), then copies the resulting position
+ * back to *@ppos.
+ */
+static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
+			    loff_t *ppos)
+{
+	struct kiocb *sync_iocb;
+	struct iovec *vec;
+	struct lu_env *env;
+	ssize_t nread;
+	int refcheck;
+	ENTRY;
+
+	env = cl_env_get(&refcheck);
+	if (IS_ERR(env))
+		RETURN(PTR_ERR(env));
+
+	vec = &vvp_env_info(env)->vti_local_iov;
+	sync_iocb = &vvp_env_info(env)->vti_kiocb;
+
+	vec->iov_base = (void __user *)buf;
+	vec->iov_len = count;
+
+	init_sync_kiocb(sync_iocb, file);
+	sync_iocb->ki_pos = *ppos;
+	sync_iocb->ki_left = count;
+
+	nread = ll_file_aio_read(sync_iocb, vec, 1, sync_iocb->ki_pos);
+	*ppos = sync_iocb->ki_pos;
+
+	cl_env_put(env, &refcheck);
+	RETURN(nread);
+}
+
+/*
+ * Write to a file (through the page cache).
+ *
+ * aio_write entry point: validates the iovec, then runs a CIT_WRITE
+ * through ll_file_io_generic() at iocb->ki_pos.  The @pos argument is not
+ * used.
+ */
+static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
+				 unsigned long nr_segs, loff_t pos)
+{
+	struct vvp_io_args *io_args;
+	struct lu_env *env;
+	size_t nbytes;
+	ssize_t rc;
+	int refcheck;
+	ENTRY;
+
+	rc = ll_file_get_iov_count(iov, &nr_segs, &nbytes);
+	if (rc != 0)
+		RETURN(rc);
+
+	env = cl_env_get(&refcheck);
+	if (IS_ERR(env))
+		RETURN(PTR_ERR(env));
+
+	io_args = vvp_env_args(env, IO_NORMAL);
+	io_args->u.normal.via_iocb = iocb;
+	io_args->u.normal.via_iov = (struct iovec *)iov;
+	io_args->u.normal.via_nrsegs = nr_segs;
+
+	rc = ll_file_io_generic(env, io_args, iocb->ki_filp, CIT_WRITE,
+				&iocb->ki_pos, nbytes);
+	cl_env_put(env, &refcheck);
+	RETURN(rc);
+}
+
+/*
+ * Synchronous write: wraps @buf/@count in the per-thread iovec and kiocb
+ * and forwards to ll_file_aio_write(), then copies the resulting position
+ * back to *@ppos.
+ */
+static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
+			     loff_t *ppos)
+{
+	struct kiocb *sync_iocb;
+	struct iovec *vec;
+	struct lu_env *env;
+	ssize_t nwritten;
+	int refcheck;
+	ENTRY;
+
+	env = cl_env_get(&refcheck);
+	if (IS_ERR(env))
+		RETURN(PTR_ERR(env));
+
+	vec = &vvp_env_info(env)->vti_local_iov;
+	sync_iocb = &vvp_env_info(env)->vti_kiocb;
+
+	vec->iov_base = (void __user *)buf;
+	vec->iov_len = count;
+
+	init_sync_kiocb(sync_iocb, file);
+	sync_iocb->ki_pos = *ppos;
+	sync_iocb->ki_left = count;
+
+	nwritten = ll_file_aio_write(sync_iocb, vec, 1, sync_iocb->ki_pos);
+	*ppos = sync_iocb->ki_pos;
+
+	cl_env_put(env, &refcheck);
+	RETURN(nwritten);
+}
+
+
+
+/*
+ * Send file content (through the page cache) into a pipe.
+ *
+ * The pipe and splice flags are stored in the env's IO_SPLICE argument
+ * block and a CIT_READ of @count bytes at *ppos is run through
+ * ll_file_io_generic().
+ */
+static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
+				   struct pipe_inode_info *pipe, size_t count,
+				   unsigned int flags)
+{
+	struct vvp_io_args *io_args;
+	struct lu_env      *env;
+	ssize_t		    rc;
+	int		    refcheck;
+	ENTRY;
+
+	env = cl_env_get(&refcheck);
+	if (IS_ERR(env))
+		RETURN(PTR_ERR(env));
+
+	io_args = vvp_env_args(env, IO_SPLICE);
+	io_args->u.splice.via_flags = flags;
+	io_args->u.splice.via_pipe = pipe;
+
+	rc = ll_file_io_generic(env, io_args, in_file, CIT_READ, ppos, count);
+	cl_env_put(env, &refcheck);
+	RETURN(rc);
+}
+
+/*
+ * Issue an obd_create() with OBD_FL_RECREATE_OBJS for the object @oi of
+ * @inode; @ost_idx is carried in the obdo's o_nlink field.
+ *
+ * obd_create() is given a private copy of the inode's stripe metadata and
+ * is called with the inode size lock held.
+ *
+ * Returns the obd_create() result, -ENOENT if the inode has no stripe
+ * metadata, or -ENOMEM if an allocation fails.
+ */
+static int ll_lov_recreate(struct inode *inode, struct ost_id *oi,
+			   obd_count ost_idx)
+{
+	struct obd_export *exp = ll_i2dtexp(inode);
+	struct obd_trans_info oti = { 0 };
+	struct lov_stripe_md *lsm = NULL;
+	struct lov_stripe_md *lsm_copy;
+	struct obdo *oa = NULL;
+	int copy_size;
+	int rc = 0;
+	ENTRY;
+
+	OBDO_ALLOC(oa);
+	if (oa == NULL)
+		RETURN(-ENOMEM);
+
+	lsm = ccc_inode_lsm_get(inode);
+	if (lsm == NULL)
+		GOTO(out, rc = -ENOENT);
+
+	/* stripe md header followed by one lov_oinfo per stripe */
+	copy_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
+		    (lsm->lsm_stripe_count));
+
+	OBD_ALLOC_LARGE(lsm_copy, copy_size);
+	if (lsm_copy == NULL)
+		GOTO(out, rc = -ENOMEM);
+	memcpy(lsm_copy, lsm, copy_size);
+
+	/* describe the object to recreate */
+	oa->o_oi = *oi;
+	oa->o_nlink = ost_idx;
+	oa->o_flags |= OBD_FL_RECREATE_OBJS;
+	oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
+	obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
+				   OBD_MD_FLMTIME | OBD_MD_FLCTIME);
+	obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
+
+	ll_inode_size_lock(inode);
+	rc = obd_create(NULL, exp, oa, &lsm_copy, &oti);
+	ll_inode_size_unlock(inode);
+
+	OBD_FREE_LARGE(lsm_copy, copy_size);
+	GOTO(out, rc);
+out:
+	/* lsm may still be NULL here when the lookup above failed */
+	ccc_inode_lsm_put(inode, lsm);
+	OBDO_FREE(oa);
+	return rc;
+}
+
+/*
+ * Recreate an object named by object id.
+ *
+ * @arg is a user pointer to a struct ll_recreate_obj; its lrc_id becomes
+ * the object id (in the MDT0 sequence) and its lrc_ost_idx the OST index
+ * passed to ll_lov_recreate().  Requires CFS_CAP_SYS_ADMIN.
+ */
+static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
+{
+	struct ll_recreate_obj	request;
+	struct ost_id		oi;
+	ENTRY;
+
+	if (!cfs_capable(CFS_CAP_SYS_ADMIN))
+		RETURN(-EPERM);
+
+	if (copy_from_user(&request, (struct ll_recreate_obj *)arg,
+			   sizeof(request)))
+		RETURN(-EFAULT);
+
+	ostid_set_seq_mdt0(&oi);
+	ostid_set_id(&oi, request.lrc_id);
+
+	RETURN(ll_lov_recreate(inode, &oi, request.lrc_ost_idx));
+}
+
+/*
+ * Recreate an object named by FID.
+ *
+ * @arg is a user pointer to a struct lu_fid.  The FID is converted to an
+ * ost_id with fid_to_ostid(), and the OST index is taken from bits 16..31
+ * of the FID sequence.  Requires CFS_CAP_SYS_ADMIN.
+ */
+static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
+{
+	struct lu_fid	user_fid;
+	struct ost_id	oi;
+	obd_count	idx;
+	ENTRY;
+
+	if (!cfs_capable(CFS_CAP_SYS_ADMIN))
+		RETURN(-EPERM);
+
+	if (copy_from_user(&user_fid, (struct lu_fid *)arg, sizeof(user_fid)))
+		RETURN(-EFAULT);
+
+	fid_to_ostid(&user_fid, &oi);
+	idx = (fid_seq(&user_fid) >> 16) & 0xffff;
+
+	RETURN(ll_lov_recreate(inode, &oi, idx));
+}
+
+/*
+ * Set the striping of @inode by issuing an IT_OPEN intent that carries the
+ * user-supplied layout @lum (@lum_size bytes) and the given open @flags.
+ *
+ * Fails with -EEXIST if the inode already has stripe metadata.  The open is
+ * performed under the inode size lock; on success the open handle obtained
+ * by the intent is closed again with ll_release_openhandle().
+ *
+ * Returns 0 on success, otherwise the error from the intent open or the
+ * intent's it_status.
+ */
+int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
+			     int flags, struct lov_user_md *lum, int lum_size)
+{
+	struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
+	struct lov_stripe_md *lsm = NULL;
+	int rc = 0;
+	ENTRY;
+
+	lsm = ccc_inode_lsm_get(inode);
+	if (lsm != NULL) {
+		/* a layout is already present: refuse to replace it */
+		ccc_inode_lsm_put(inode, lsm);
+		CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
+		       inode->i_ino);
+		RETURN(-EEXIST);
+	}
+
+	ll_inode_size_lock(inode);
+
+	rc = ll_intent_file_open(file, lum, lum_size, &oit);
+	if (rc)
+		GOTO(out, rc);
+
+	rc = oit.d.lustre.it_status;
+	if (rc < 0)
+		GOTO(out_req_free, rc);
+
+	ll_release_openhandle(file->f_dentry, &oit);
+
+ out:
+	ll_inode_size_unlock(inode);
+	ll_intent_release(&oit);
+	/* lsm is NULL on every path that reaches this point */
+	ccc_inode_lsm_put(inode, lsm);
+	RETURN(rc);
+
+out_req_free:
+	/* the intent failed: drop the request it still references */
+	ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
+	goto out;
+}
+
+/*
+ * Fetch the LOV EA (striping metadata) of @filename through
+ * md_getattr_name(), with @inode supplied to ll_prep_md_op_data().
+ *
+ * Once the getattr has been issued, *lmmp, *lmm_size and *request are
+ * stored on every path, including errors: *lmmp points into the reply of
+ * *request (or is NULL), *lmm_size is the EA size.  The two early returns
+ * before the RPC leave the outputs untouched.
+ * NOTE(review): the caller presumably has to release *request - confirm
+ * against callers.
+ *
+ * Returns 0 on success, -ENODATA if the reply carries no EA, -EPROTO if
+ * the EA magic is neither LOV_MAGIC_V1 nor LOV_MAGIC_V3, or the error from
+ * the lookup.
+ */
+int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
+			     struct lov_mds_md **lmmp, int *lmm_size,
+			     struct ptlrpc_request **request)
+{
+	struct ll_sb_info *sbi = ll_i2sbi(inode);
+	struct ptlrpc_request *req = NULL;
+	struct lov_mds_md *lmm = NULL;
+	struct md_op_data *op_data;
+	struct mdt_body *body;
+	int lmmsize;
+	int rc;
+
+	rc = ll_get_max_mdsize(sbi, &lmmsize);
+	if (rc)
+		RETURN(rc);
+
+	op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
+				     strlen(filename), lmmsize,
+				     LUSTRE_OPC_ANY, NULL);
+	if (IS_ERR(op_data))
+		RETURN(PTR_ERR(op_data));
+
+	op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
+	rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
+	ll_finish_md_op_data(op_data);
+	if (rc < 0) {
+		CDEBUG(D_INFO, "md_getattr_name failed "
+		       "on %s: rc %d\n", filename, rc);
+		GOTO(out, rc);
+	}
+
+	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+	LASSERT(body != NULL); /* checked by mdc_getattr_name */
+
+	/* from here on lmmsize is the size of the EA actually returned */
+	lmmsize = body->eadatasize;
+
+	if (lmmsize == 0 ||
+	    !(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA))) {
+		GOTO(out, rc = -ENODATA);
+	}
+
+	lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
+	LASSERT(lmm != NULL);
+
+	if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
+	    lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3)) {
+		GOTO(out, rc = -EPROTO);
+	}
+
+	/*
+	 * This is coming from the MDS, so is probably in
+	 * little endian.  We convert it to host endian before
+	 * passing it to userspace.
+	 */
+	if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
+		/* when called for a directory there are no lsm objects,
+		 * so the per-object swab is limited to regular files */
+		if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
+			struct lov_user_md_v1 *v1 =
+				(struct lov_user_md_v1 *)lmm;
+
+			lustre_swab_lov_user_md_v1(v1);
+			if (S_ISREG(body->mode))
+				lustre_swab_lov_user_md_objects(
+					v1->lmm_objects,
+					v1->lmm_stripe_count);
+		} else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
+			struct lov_user_md_v3 *v3 =
+				(struct lov_user_md_v3 *)lmm;
+
+			lustre_swab_lov_user_md_v3(v3);
+			if (S_ISREG(body->mode))
+				lustre_swab_lov_user_md_objects(
+					v3->lmm_objects,
+					v3->lmm_stripe_count);
+		}
+	}
+
+out:
+	*request = req;
+	*lmm_size = lmmsize;
+	*lmmp = lmm;
+	return rc;
+}
+
+/*
+ * LL_IOC_LOV_SETEA handler: copy a lov_user_md plus one lov_user_ost_data
+ * entry from user space (@arg) and apply it through
+ * ll_lov_setstripe_ea_info() with MDS_OPEN_HAS_OBJS | FMODE_WRITE.
+ * Requires CFS_CAP_SYS_ADMIN.
+ */
+static int ll_lov_setea(struct inode *inode, struct file *file,
+			    unsigned long arg)
+{
+	struct lov_user_md	*kbuf;
+	int			 kbuf_size = sizeof(struct lov_user_md) +
+					     sizeof(struct lov_user_ost_data);
+	int			 open_flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
+	int			 rc;
+	ENTRY;
+
+	if (!cfs_capable(CFS_CAP_SYS_ADMIN))
+		RETURN(-EPERM);
+
+	OBD_ALLOC_LARGE(kbuf, kbuf_size);
+	if (kbuf == NULL)
+		RETURN(-ENOMEM);
+
+	if (copy_from_user(kbuf, (struct lov_user_md  *)arg, kbuf_size))
+		rc = -EFAULT;
+	else
+		rc = ll_lov_setstripe_ea_info(inode, file, open_flags, kbuf,
+					      kbuf_size);
+
+	/* single release point for the kernel copy */
+	OBD_FREE_LARGE(kbuf, kbuf_size);
+	RETURN(rc);
+}
+
+/*
+ * LL_IOC_LOV_SETSTRIPE handler.
+ *
+ * Copies a lov_user_md from user space (@arg) - first as v1, then again as
+ * v3 if the magic says so - and applies it with ll_lov_setstripe_ea_info().
+ * On success the user's lmm_stripe_count is zeroed, the layout is
+ * refreshed, and the resulting striping is written back to @arg through
+ * the LL_IOC_LOV_GETSTRIPE iocontrol.
+ */
+static int ll_lov_setstripe(struct inode *inode, struct file *file,
+			    unsigned long arg)
+{
+	struct lov_user_md_v3	 kbuf;
+	struct lov_user_md_v1	*klum = (struct lov_user_md_v1 *)&kbuf;
+	struct lov_user_md_v1	*ulum_v1 = (struct lov_user_md_v1 *)arg;
+	struct lov_user_md_v3	*ulum_v3 = (struct lov_user_md_v3 *)arg;
+	struct lov_stripe_md	*lsm;
+	int			 open_flags = FMODE_WRITE;
+	int			 lum_size;
+	int			 rc;
+	__u32			 gen;
+	ENTRY;
+
+	/* first try with v1 which is smaller than v3 */
+	lum_size = sizeof(struct lov_user_md_v1);
+	if (copy_from_user(klum, ulum_v1, lum_size))
+		RETURN(-EFAULT);
+
+	/* the header says v3: fetch the full v3 structure */
+	if (klum->lmm_magic == LOV_USER_MAGIC_V3) {
+		lum_size = sizeof(struct lov_user_md_v3);
+		if (copy_from_user(&kbuf, ulum_v3, lum_size))
+			RETURN(-EFAULT);
+	}
+
+	rc = ll_lov_setstripe_ea_info(inode, file, open_flags, klum, lum_size);
+	if (rc != 0)
+		RETURN(rc);
+
+	put_user(0, &ulum_v1->lmm_stripe_count);
+
+	ll_layout_refresh(inode, &gen);
+	lsm = ccc_inode_lsm_get(inode);
+	rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
+			   0, lsm, (void *)arg);
+	ccc_inode_lsm_put(inode, lsm);
+
+	RETURN(rc);
+}
+
+/*
+ * LL_IOC_LOV_GETSTRIPE handler: hand the inode's stripe metadata and the
+ * user pointer @arg to the data export's LL_IOC_LOV_GETSTRIPE iocontrol.
+ *
+ * Returns -ENODATA if the inode has no stripe metadata, otherwise the
+ * obd_iocontrol() result.
+ */
+static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
+{
+	struct lov_stripe_md *lsm = ccc_inode_lsm_get(inode);
+	int rc;
+	ENTRY;
+
+	if (lsm == NULL)
+		rc = -ENODATA;
+	else
+		rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
+				   lsm, (void *)arg);
+
+	/* the put is issued even when lsm is NULL */
+	ccc_inode_lsm_put(inode, lsm);
+	RETURN(rc);
+}
+
+/*
+ * Take a group lock with group id @arg on behalf of the open file @file.
+ *
+ * lli_lock is dropped around cl_get_grouplock(), so the
+ * LL_FILE_GROUP_LOCKED flag is checked a second time afterwards; if
+ * another thread set it in the meantime the freshly obtained lock is put
+ * back.
+ *
+ * Returns 0 on success, -EOPNOTSUPP if ll_file_nolock() is true for the
+ * file, -EINVAL if the file already holds a group lock, or the
+ * cl_get_grouplock() error.
+ */
+int ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
+{
+	struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
+	struct ll_inode_info   *lli = ll_i2info(inode);
+	struct ccc_grouplock    new_lock;
+	int			rc;
+	ENTRY;
+
+	if (ll_file_nolock(file))
+		RETURN(-EOPNOTSUPP);
+
+	/* first check: bail out early if this file is already group locked */
+	spin_lock(&lli->lli_lock);
+	if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
+		CWARN("group lock already existed with gid %lu\n",
+		      fd->fd_grouplock.cg_gid);
+		spin_unlock(&lli->lli_lock);
+		RETURN(-EINVAL);
+	}
+	LASSERT(fd->fd_grouplock.cg_lock == NULL);
+	spin_unlock(&lli->lli_lock);
+
+	rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
+			      arg, (file->f_flags & O_NONBLOCK), &new_lock);
+	if (rc)
+		RETURN(rc);
+
+	/* second check: the flag may have been set while lli_lock was free */
+	spin_lock(&lli->lli_lock);
+	if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
+		spin_unlock(&lli->lli_lock);
+		CERROR("another thread just won the race\n");
+		cl_put_grouplock(&new_lock);
+		RETURN(-EINVAL);
+	}
+
+	fd->fd_grouplock = new_lock;
+	fd->fd_flags |= LL_FILE_GROUP_LOCKED;
+	spin_unlock(&lli->lli_lock);
+
+	CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
+	RETURN(0);
+}
+
+/*
+ * Release the group lock with group id @arg held through @file.
+ *
+ * The lock is detached from the file data under lli_lock and only put
+ * after the spinlock has been released.
+ *
+ * Returns 0 on success, -EINVAL if the file holds no group lock or holds
+ * one with a different group id.
+ */
+int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
+{
+	struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
+	struct ll_inode_info   *lli = ll_i2info(inode);
+	struct ccc_grouplock    held;
+	ENTRY;
+
+	spin_lock(&lli->lli_lock);
+
+	if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
+		spin_unlock(&lli->lli_lock);
+		CWARN("no group lock held\n");
+		RETURN(-EINVAL);
+	}
+	LASSERT(fd->fd_grouplock.cg_lock != NULL);
+
+	if (fd->fd_grouplock.cg_gid != arg) {
+		CWARN("group lock %lu doesn't match current id %lu\n",
+		       arg, fd->fd_grouplock.cg_gid);
+		spin_unlock(&lli->lli_lock);
+		RETURN(-EINVAL);
+	}
+
+	/* detach the lock from the file before dropping the spinlock */
+	held = fd->fd_grouplock;
+	fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
+	memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
+
+	spin_unlock(&lli->lli_lock);
+
+	cl_put_grouplock(&held);
+	CDEBUG(D_INFO, "group lock %lu released\n", arg);
+	RETURN(0);
+}
+
+/**
+ * Close the open handle that an intent obtained for an inode.
+ *
+ * Nothing is done for the filesystem root dentry or when the intent does
+ * not carry DISP_OPEN_OPEN.  Otherwise a client handle is filled from the
+ * intent and closed at once.  In both the close and the allocation-failure
+ * case the request reference marked by DISP_ENQ_OPEN_REF is dropped.
+ *
+ * \param dentry [in]     dentry which contains the inode
+ * \param it     [in,out] intent which contains open info and result
+ *
+ * \retval 0     success
+ * \retval <0    failure
+ */
+int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
+{
+	struct inode *inode = dentry->d_inode;
+	struct obd_client_handle *och;
+	int rc;
+	ENTRY;
+
+	LASSERT(inode);
+
+	/* Root ? Do nothing. */
+	if (inode->i_sb->s_root == dentry)
+		RETURN(0);
+
+	/* No open handle to close? Move away */
+	if (!it_disposition(it, DISP_OPEN_OPEN))
+		RETURN(0);
+
+	LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
+
+	OBD_ALLOC(och, sizeof(*och));
+	if (och == NULL)
+		GOTO(out, rc = -ENOMEM);
+
+	ll_och_fill(ll_i2sbi(inode)->ll_md_exp, ll_i2info(inode), it, och);
+	rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och);
+
+ out:
+	/* this one is in place of ll_file_open */
+	if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
+		ptlrpc_req_finished(it->d.lustre.it_data);
+		it_clear_disposition(it, DISP_ENQ_OPEN_REF);
+	}
+	RETURN(rc);
+}
+
+/**
+ * Run a FIEMAP request for an inode.
+ *
+ * Validates the request flags, optionally starts writeback for
+ * FIEMAP_FLAG_SYNC, fills a KEY_FIEMAP info key with the object id, file
+ * size and parent FID, and issues obd_get_info() with \a fiemap as the
+ * result buffer of \a num_bytes bytes.
+ *
+ * \retval 0            success (also for an empty file, with
+ *                      fm_mapped_extents set to 0)
+ * \retval -EBADR       unsupported flags; fm_flags is rewritten to hold
+ *                      only the unsupported bits
+ * \retval -ENOENT      inode has no stripe metadata
+ * \retval -EOPNOTSUPP  multi-stripe file without FIEMAP_FLAG_DEVICE_ORDER
+ * \retval <0           other error from writeback or obd_get_info()
+ */
+int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
+	      int num_bytes)
+{
+	struct ll_fiemap_info_key key = { .name = KEY_FIEMAP, };
+	struct obd_export *exp = ll_i2dtexp(inode);
+	struct lov_stripe_md *lsm = NULL;
+	int reply_len = num_bytes;
+	int rc;
+	ENTRY;
+
+	/* Checks for fiemap flags */
+	if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
+		fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
+		return -EBADR;
+	}
+
+	/* Check for FIEMAP_FLAG_SYNC */
+	if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
+		rc = filemap_fdatawrite(inode->i_mapping);
+		if (rc)
+			return rc;
+	}
+
+	lsm = ccc_inode_lsm_get(inode);
+	if (lsm == NULL)
+		return -ENOENT;
+
+	/* If the stripe_count > 1 and the application does not understand
+	 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
+	 */
+	if (!(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER) &&
+	    lsm->lsm_stripe_count > 1)
+		GOTO(out, rc = -EOPNOTSUPP);
+
+	key.oa.o_oi = lsm->lsm_oi;
+	key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
+	obdo_from_inode(&key.oa, inode, OBD_MD_FLSIZE);
+	obdo_set_parent_fid(&key.oa, &ll_i2info(inode)->lli_fid);
+
+	/* If filesize is 0, then there would be no objects for mapping */
+	if (key.oa.o_size == 0) {
+		fiemap->fm_mapped_extents = 0;
+		GOTO(out, rc = 0);
+	}
+
+	/* the request header travels inside the key */
+	memcpy(&key.fiemap, fiemap, sizeof(*fiemap));
+
+	rc = obd_get_info(NULL, exp, sizeof(key), &key, &reply_len,
+			  fiemap, lsm);
+	if (rc)
+		CERROR("obd_get_info failed: rc = %d\n", rc);
+
+out:
+	ccc_inode_lsm_put(inode, lsm);
+	RETURN(rc);
+}
+
+/*
+ * OBD_IOC_FID2PATH handler: resolve a FID to a path through the metadata
+ * export.
+ *
+ * @arg is a user pointer to a struct getinfo_fid2path followed by a path
+ * buffer of gf_pathlen bytes.  The header is copied in, a kernel buffer of
+ * header + gf_pathlen bytes is allocated, the request is passed to
+ * obd_iocontrol() and the whole buffer is copied back on success.
+ *
+ * gf_pathlen comes straight from user space, so it is bounded by PATH_MAX
+ * before it is used to size the allocation; without the bound the int
+ * size computation could overflow or request an arbitrarily large buffer.
+ *
+ * Returns 0 on success, -EPERM without CFS_CAP_DAC_READ_SEARCH (unless the
+ * LL_SBI_USER_FID2PATH flag is set), -EINVAL if gf_pathlen exceeds
+ * PATH_MAX, -EFAULT/-ENOMEM on copy/allocation failure, or the
+ * obd_iocontrol() error.
+ */
+int ll_fid2path(struct inode *inode, void *arg)
+{
+	struct obd_export	*exp = ll_i2mdexp(inode);
+	struct getinfo_fid2path	*gfout, *gfin;
+	unsigned int		 pathlen;
+	int			 outsize, rc;
+	ENTRY;
+
+	if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
+	    !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
+		RETURN(-EPERM);
+
+	/* Need to get the buflen */
+	OBD_ALLOC_PTR(gfin);
+	if (gfin == NULL)
+		RETURN(-ENOMEM);
+	if (copy_from_user(gfin, arg, sizeof(*gfin))) {
+		OBD_FREE_PTR(gfin);
+		RETURN(-EFAULT);
+	}
+
+	/* user-controlled length: reject anything longer than a path can be */
+	pathlen = gfin->gf_pathlen;
+	if (pathlen > PATH_MAX) {
+		OBD_FREE_PTR(gfin);
+		RETURN(-EINVAL);
+	}
+
+	outsize = sizeof(*gfout) + pathlen;
+	OBD_ALLOC(gfout, outsize);
+	if (gfout == NULL) {
+		OBD_FREE_PTR(gfin);
+		RETURN(-ENOMEM);
+	}
+	memcpy(gfout, gfin, sizeof(*gfout));
+	OBD_FREE_PTR(gfin);
+
+	/* Call mdc_iocontrol */
+	rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
+	if (rc)
+		GOTO(gf_free, rc);
+
+	if (copy_to_user(arg, gfout, outsize))
+		rc = -EFAULT;
+
+gf_free:
+	OBD_FREE(gfout, outsize);
+	RETURN(rc);
+}
+
+/*
+ * FSFILT_IOC_FIEMAP handler.
+ *
+ * Reads fm_extent_count from the user's ll_user_fiemap at @arg, allocates
+ * a kernel buffer for the header plus that many extents, copies in the
+ * header (and the first extent, if any), runs ll_do_fiemap() and copies
+ * the header plus the mapped extents back to user space.
+ *
+ * Two values are not trusted:
+ *  - fm_extent_count is user-controlled; it is rejected with -EINVAL when
+ *    the buffer size would not fit the int that ll_do_fiemap() takes
+ *    (which also rules out size_t overflow on 32-bit).
+ *  - fm_mapped_extents is filled in by the request; the copy-out is
+ *    clamped to the number of extents actually allocated so that
+ *    copy_to_user() can never read past the kernel buffer.
+ *
+ * Returns 0 on success, -EFAULT/-ENOMEM/-EINVAL as above, or the
+ * ll_do_fiemap() error.
+ */
+static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
+{
+	struct ll_user_fiemap *fiemap_s;
+	size_t num_bytes, ret_bytes;
+	unsigned int extent_count;
+	int rc = 0;
+
+	/* Get the extent count so we can calculate the size of
+	 * required fiemap buffer */
+	if (get_user(extent_count,
+	    &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
+		RETURN(-EFAULT);
+
+	/* refuse counts whose buffer size would overflow an int */
+	if (extent_count >
+	    (INT_MAX - sizeof(*fiemap_s)) / sizeof(struct ll_fiemap_extent))
+		RETURN(-EINVAL);
+
+	num_bytes = sizeof(*fiemap_s) + (extent_count *
+					 sizeof(struct ll_fiemap_extent));
+
+	OBD_ALLOC_LARGE(fiemap_s, num_bytes);
+	if (fiemap_s == NULL)
+		RETURN(-ENOMEM);
+
+	/* get the fiemap value */
+	if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
+			   sizeof(*fiemap_s)))
+		GOTO(error, rc = -EFAULT);
+
+	/* If fm_extent_count is non-zero, read the first extent since
+	 * it is used to calculate end_offset and device from previous
+	 * fiemap call. */
+	if (extent_count) {
+		if (copy_from_user(&fiemap_s->fm_extents[0],
+		    (char __user *)arg + sizeof(*fiemap_s),
+		    sizeof(struct ll_fiemap_extent)))
+			GOTO(error, rc = -EFAULT);
+	}
+
+	rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
+	if (rc)
+		GOTO(error, rc);
+
+	ret_bytes = sizeof(struct ll_user_fiemap);
+
+	if (extent_count != 0) {
+		unsigned int mapped = fiemap_s->fm_mapped_extents;
+
+		/* never copy out more extents than were allocated */
+		if (mapped > extent_count)
+			mapped = extent_count;
+		ret_bytes += (mapped * sizeof(struct ll_fiemap_extent));
+	}
+
+	if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
+		rc = -EFAULT;
+
+error:
+	OBD_FREE_LARGE(fiemap_s, num_bytes);
+	RETURN(rc);
+}
+
+/*
+ * Read the data_version for inode.
+ *
+ * The value is obtained with ll_lsm_getattr() on the inode's stripe
+ * metadata and taken from the returned obdo's o_data_version.  An inode
+ * without stripe metadata is reported as version 0.
+ *
+ * @param extent_lock  Take extent lock. Not needed if a process is already
+ *		       holding the OST object group locks.
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure, -EOPNOTSUPP if the
+ * reply does not carry OBD_MD_FLDATAVERSION, or the ll_lsm_getattr() error.
+ */
+int ll_data_version(struct inode *inode, __u64 *data_version,
+		    int extent_lock)
+{
+	struct ll_sb_info	*sbi = ll_i2sbi(inode);
+	struct lov_stripe_md	*lsm = NULL;
+	struct obdo		*oa = NULL;
+	int			 rc;
+	ENTRY;
+
+	/* If no stripe, we consider version is 0. */
+	lsm = ccc_inode_lsm_get(inode);
+	if (lsm == NULL) {
+		*data_version = 0;
+		CDEBUG(D_INODE, "No object for inode\n");
+		RETURN(0);
+	}
+
+	OBD_ALLOC_PTR(oa);
+	if (oa == NULL) {
+		ccc_inode_lsm_put(inode, lsm);
+		RETURN(-ENOMEM);
+	}
+
+	rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, oa, 0, extent_lock);
+	if (rc == 0) {
+		/* only trust the version when the reply flags it as valid */
+		if (oa->o_valid & OBD_MD_FLDATAVERSION)
+			*data_version = oa->o_data_version;
+		else
+			rc = -EOPNOTSUPP;
+	}
+
+	OBD_FREE_PTR(oa);
+	ccc_inode_lsm_put(inode, lsm);
+
+	RETURN(rc);
+}
+
+/*
+ * Working state of ll_swap_layouts(), which allocates it on the heap.
+ * Every field exists once per file so that the pair can be exchanged with
+ * swap() when the two files are put in FID order.
+ */
+struct ll_swap_stack {
+	struct iattr		 ia1;		/* saved times of inode1 */
+	struct iattr		 ia2;		/* saved times of inode2 */
+	__u64			 dv1;		/* expected data version 1 */
+	__u64			 dv2;		/* expected data version 2 */
+	struct inode		*inode1;
+	struct inode		*inode2;
+	bool			 check_dv1;	/* compare against dv1 */
+	bool			 check_dv2;	/* compare against dv2 */
+};
+
+/*
+ * Swap the layouts of @file1 and @file2 as described by @lsl.
+ *
+ * Steps, in order:
+ *  - sanity checks (second file is regular, both writable, same
+ *    superblock); swapping a file with itself succeeds immediately;
+ *  - order the pair by FID so group locks are always taken in the same
+ *    order;
+ *  - if lsl->sl_gid is non-zero, take group locks on both files;
+ *  - save mtime/atime if they are to be kept;
+ *  - if requested, check that the data versions still match;
+ *  - send LL_IOC_LOV_SWAP_LAYOUTS to the metadata export;
+ *  - drop the group locks and restore the saved times (the times saved
+ *    from one file are applied to the other).
+ *
+ * ll_prep_md_op_data() reports failure with an ERR_PTR (see its other
+ * caller, ll_lov_getstripe_ea_info()), so its result is checked with
+ * IS_ERR() and the encoded error is returned.
+ *
+ * Returns 0 on success, -EINVAL/-EPERM/-EXDEV from the sanity checks,
+ * -EAGAIN if a data version changed, -ENOMEM on allocation failure, or
+ * the error of the failing step.
+ */
+static int ll_swap_layouts(struct file *file1, struct file *file2,
+			   struct lustre_swap_layouts *lsl)
+{
+	struct mdc_swap_layouts	 msl;
+	struct md_op_data	*op_data;
+	__u32			 gid;
+	__u64			 dv;
+	struct ll_swap_stack	*llss = NULL;
+	int			 rc;
+
+	OBD_ALLOC_PTR(llss);
+	if (llss == NULL)
+		RETURN(-ENOMEM);
+
+	llss->inode1 = file1->f_dentry->d_inode;
+	llss->inode2 = file2->f_dentry->d_inode;
+
+	if (!S_ISREG(llss->inode2->i_mode))
+		GOTO(free, rc = -EINVAL);
+
+	if (ll_permission(llss->inode1, MAY_WRITE, NULL) ||
+	    ll_permission(llss->inode2, MAY_WRITE, NULL))
+		GOTO(free, rc = -EPERM);
+
+	if (llss->inode2->i_sb != llss->inode1->i_sb)
+		GOTO(free, rc = -EXDEV);
+
+	/* we use 2 bool because it is easier to swap than 2 bits */
+	if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
+		llss->check_dv1 = true;
+
+	if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
+		llss->check_dv2 = true;
+
+	/* we cannot use lsl->sl_dvX directly because we may swap them */
+	llss->dv1 = lsl->sl_dv1;
+	llss->dv2 = lsl->sl_dv2;
+
+	rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
+	if (rc == 0) /* same file, done! */
+		GOTO(free, rc = 0);
+
+	if (rc < 0) { /* sequentialize it */
+		swap(llss->inode1, llss->inode2);
+		swap(file1, file2);
+		swap(llss->dv1, llss->dv2);
+		swap(llss->check_dv1, llss->check_dv2);
+	}
+
+	gid = lsl->sl_gid;
+	if (gid != 0) { /* application asks to flush dirty cache */
+		rc = ll_get_grouplock(llss->inode1, file1, gid);
+		if (rc < 0)
+			GOTO(free, rc);
+
+		rc = ll_get_grouplock(llss->inode2, file2, gid);
+		if (rc < 0) {
+			ll_put_grouplock(llss->inode1, file1, gid);
+			GOTO(free, rc);
+		}
+	}
+
+	/* to be able to restore mtime and atime after swap
+	 * we need to first save them */
+	if (lsl->sl_flags &
+	    (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
+		llss->ia1.ia_mtime = llss->inode1->i_mtime;
+		llss->ia1.ia_atime = llss->inode1->i_atime;
+		llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
+		llss->ia2.ia_mtime = llss->inode2->i_mtime;
+		llss->ia2.ia_atime = llss->inode2->i_atime;
+		llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
+	}
+
+	/* ultimate check, before swapping the layouts we check if
+	 * dataversion has changed (if requested) */
+	if (llss->check_dv1) {
+		rc = ll_data_version(llss->inode1, &dv, 0);
+		if (rc)
+			GOTO(putgl, rc);
+		if (dv != llss->dv1)
+			GOTO(putgl, rc = -EAGAIN);
+	}
+
+	if (llss->check_dv2) {
+		rc = ll_data_version(llss->inode2, &dv, 0);
+		if (rc)
+			GOTO(putgl, rc);
+		if (dv != llss->dv2)
+			GOTO(putgl, rc = -EAGAIN);
+	}
+
+	/* struct md_op_data is used to send the swap args to the mdt
+	 * only flags is missing, so we use struct mdc_swap_layouts
+	 * through the md_op_data->op_data */
+	/* flags from user space have to be converted before they are sent to
+	 * server, no flag is sent today, they are only used on the client */
+	msl.msl_flags = 0;
+	op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
+				     0, LUSTRE_OPC_ANY, &msl);
+	/* failure is an ERR_PTR, not NULL: propagate the encoded error */
+	if (IS_ERR(op_data))
+		GOTO(putgl, rc = PTR_ERR(op_data));
+
+	rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS,
+			   ll_i2mdexp(llss->inode1),
+			   sizeof(*op_data), op_data, NULL);
+	ll_finish_md_op_data(op_data);
+
+putgl:
+	if (gid != 0) {
+		ll_put_grouplock(llss->inode2, file2, gid);
+		ll_put_grouplock(llss->inode1, file1, gid);
+	}
+
+	/* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
+	if (rc != 0)
+		GOTO(free, rc);
+
+	/* clear useless flags */
+	if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
+		llss->ia1.ia_valid &= ~ATTR_MTIME;
+		llss->ia2.ia_valid &= ~ATTR_MTIME;
+	}
+
+	if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
+		llss->ia1.ia_valid &= ~ATTR_ATIME;
+		llss->ia2.ia_valid &= ~ATTR_ATIME;
+	}
+
+	/* update time if requested: each file receives the times that were
+	 * saved from the other one */
+	rc = 0;
+	if (llss->ia2.ia_valid != 0) {
+		mutex_lock(&llss->inode1->i_mutex);
+		rc = ll_setattr(file1->f_dentry, &llss->ia2);
+		mutex_unlock(&llss->inode1->i_mutex);
+	}
+
+	if (llss->ia1.ia_valid != 0) {
+		int rc1;
+
+		mutex_lock(&llss->inode2->i_mutex);
+		rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
+		mutex_unlock(&llss->inode2->i_mutex);
+		/* report the first failure only */
+		if (rc == 0)
+			rc = rc1;
+	}
+
+free:
+	/* llss is always valid here: the only NULL case returned early */
+	OBD_FREE_PTR(llss);
+
+	RETURN(rc);
+}
+
+/*
+ * ioctl entry point for Lustre regular files.
+ *
+ * tty ioctls are rejected with -ENOTTY.  Known commands are dispatched to
+ * their handlers; anything else is first offered to ll_iocontrol_call()
+ * and, unless that returns LLIOC_STOP, forwarded to the data export via
+ * obd_iocontrol().
+ *
+ * ll_prep_md_op_data() reports failure with an ERR_PTR (see
+ * ll_lov_getstripe_ea_info()), so the HSM cases check its result with
+ * IS_ERR() and return the encoded error.
+ *
+ * Returns the handler's result, or a negative errno.
+ */
+long ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	struct inode		*inode = file->f_dentry->d_inode;
+	struct ll_file_data	*fd = LUSTRE_FPRIVATE(file);
+	int			 flags, rc;
+	ENTRY;
+
+	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
+	       inode->i_generation, inode, cmd);
+	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
+
+	/* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
+	if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
+		RETURN(-ENOTTY);
+
+	switch(cmd) {
+	case LL_IOC_GETFLAGS:
+		/* Get the current value of the file flags */
+		RETURN(put_user(fd->fd_flags, (int *)arg));
+	case LL_IOC_SETFLAGS:
+	case LL_IOC_CLRFLAGS:
+		/* Set or clear specific file flags */
+		/* XXX This probably needs checks to ensure the flags are
+		 *     not abused, and to handle any flag side effects.
+		 */
+		if (get_user(flags, (int *) arg))
+			RETURN(-EFAULT);
+
+		if (cmd == LL_IOC_SETFLAGS) {
+			if ((flags & LL_FILE_IGNORE_LOCK) &&
+			    !(file->f_flags & O_DIRECT)) {
+				CERROR("%s: unable to disable locking on "
+				       "non-O_DIRECT file\n", current->comm);
+				RETURN(-EINVAL);
+			}
+
+			fd->fd_flags |= flags;
+		} else {
+			fd->fd_flags &= ~flags;
+		}
+		RETURN(0);
+	case LL_IOC_LOV_SETSTRIPE:
+		RETURN(ll_lov_setstripe(inode, file, arg));
+	case LL_IOC_LOV_SETEA:
+		RETURN(ll_lov_setea(inode, file, arg));
+	case LL_IOC_LOV_SWAP_LAYOUTS: {
+		struct file *file2;
+		struct lustre_swap_layouts lsl;
+
+		if (copy_from_user(&lsl, (char *)arg,
+				       sizeof(struct lustre_swap_layouts)))
+			RETURN(-EFAULT);
+
+		/* both descriptors must be open for writing */
+		if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
+			RETURN(-EPERM);
+
+		file2 = fget(lsl.sl_fd);
+		if (file2 == NULL)
+			RETURN(-EBADF);
+
+		rc = -EPERM;
+		if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
+			rc = ll_swap_layouts(file, file2, &lsl);
+		fput(file2);
+		RETURN(rc);
+	}
+	case LL_IOC_LOV_GETSTRIPE:
+		RETURN(ll_lov_getstripe(inode, arg));
+	case LL_IOC_RECREATE_OBJ:
+		RETURN(ll_lov_recreate_obj(inode, arg));
+	case LL_IOC_RECREATE_FID:
+		RETURN(ll_lov_recreate_fid(inode, arg));
+	case FSFILT_IOC_FIEMAP:
+		RETURN(ll_ioctl_fiemap(inode, arg));
+	case FSFILT_IOC_GETFLAGS:
+	case FSFILT_IOC_SETFLAGS:
+		RETURN(ll_iocontrol(inode, file, cmd, arg));
+	case FSFILT_IOC_GETVERSION_OLD:
+	case FSFILT_IOC_GETVERSION:
+		RETURN(put_user(inode->i_generation, (int *)arg));
+	case LL_IOC_GROUP_LOCK:
+		RETURN(ll_get_grouplock(inode, file, arg));
+	case LL_IOC_GROUP_UNLOCK:
+		RETURN(ll_put_grouplock(inode, file, arg));
+	case IOC_OBD_STATFS:
+		RETURN(ll_obd_statfs(inode, (void *)arg));
+
+	/* We need to special case any other ioctls we want to handle,
+	 * to send them to the MDS/OST as appropriate and to properly
+	 * network encode the arg field.
+	case FSFILT_IOC_SETVERSION_OLD:
+	case FSFILT_IOC_SETVERSION:
+	*/
+	case LL_IOC_FLUSHCTX:
+		RETURN(ll_flush_ctx(inode));
+	case LL_IOC_PATH2FID: {
+		if (copy_to_user((void *)arg, ll_inode2fid(inode),
+				 sizeof(struct lu_fid)))
+			RETURN(-EFAULT);
+
+		RETURN(0);
+	}
+	case OBD_IOC_FID2PATH:
+		RETURN(ll_fid2path(inode, (void *)arg));
+	case LL_IOC_DATA_VERSION: {
+		struct ioc_data_version	idv;
+
+		if (copy_from_user(&idv, (char *)arg, sizeof(idv)))
+			RETURN(-EFAULT);
+
+		/* the extent lock is taken unless LL_DV_NOFLUSH is set */
+		rc = ll_data_version(inode, &idv.idv_version,
+				!(idv.idv_flags & LL_DV_NOFLUSH));
+
+		if (rc == 0 && copy_to_user((char *) arg, &idv, sizeof(idv)))
+			RETURN(-EFAULT);
+
+		RETURN(rc);
+	}
+
+	case LL_IOC_GET_MDTIDX: {
+		int mdtidx;
+
+		mdtidx = ll_get_mdt_idx(inode);
+		if (mdtidx < 0)
+			RETURN(mdtidx);
+
+		if (put_user((int)mdtidx, (int*)arg))
+			RETURN(-EFAULT);
+
+		RETURN(0);
+	}
+	case OBD_IOC_GETDTNAME:
+	case OBD_IOC_GETMDNAME:
+		RETURN(ll_get_obd_name(inode, cmd, arg));
+	case LL_IOC_HSM_STATE_GET: {
+		struct md_op_data	*op_data;
+		struct hsm_user_state	*hus;
+
+		OBD_ALLOC_PTR(hus);
+		if (hus == NULL)
+			RETURN(-ENOMEM);
+
+		op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
+					     LUSTRE_OPC_ANY, hus);
+		/* failure is an ERR_PTR, not NULL */
+		if (IS_ERR(op_data)) {
+			OBD_FREE_PTR(hus);
+			RETURN(PTR_ERR(op_data));
+		}
+
+		rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
+				   op_data, NULL);
+
+		if (copy_to_user((void *)arg, hus, sizeof(*hus)))
+			rc = -EFAULT;
+
+		ll_finish_md_op_data(op_data);
+		OBD_FREE_PTR(hus);
+		RETURN(rc);
+	}
+	case LL_IOC_HSM_STATE_SET: {
+		struct md_op_data	*op_data;
+		struct hsm_state_set	*hss;
+
+		OBD_ALLOC_PTR(hss);
+		if (hss == NULL)
+			RETURN(-ENOMEM);
+		if (copy_from_user(hss, (char *)arg, sizeof(*hss))) {
+			OBD_FREE_PTR(hss);
+			RETURN(-EFAULT);
+		}
+
+		/* Non-root users are forbidden to set or clear flags which are
+		 * NOT defined in HSM_USER_MASK. */
+		if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK)
+		    && !cfs_capable(CFS_CAP_SYS_ADMIN)) {
+			OBD_FREE_PTR(hss);
+			RETURN(-EPERM);
+		}
+
+		op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
+					     LUSTRE_OPC_ANY, hss);
+		/* failure is an ERR_PTR, not NULL */
+		if (IS_ERR(op_data)) {
+			OBD_FREE_PTR(hss);
+			RETURN(PTR_ERR(op_data));
+		}
+
+		rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
+				   op_data, NULL);
+
+		ll_finish_md_op_data(op_data);
+
+		OBD_FREE_PTR(hss);
+		RETURN(rc);
+	}
+	case LL_IOC_HSM_ACTION: {
+		struct md_op_data		*op_data;
+		struct hsm_current_action	*hca;
+
+		OBD_ALLOC_PTR(hca);
+		if (hca == NULL)
+			RETURN(-ENOMEM);
+
+		op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
+					     LUSTRE_OPC_ANY, hca);
+		/* failure is an ERR_PTR, not NULL */
+		if (IS_ERR(op_data)) {
+			OBD_FREE_PTR(hca);
+			RETURN(PTR_ERR(op_data));
+		}
+
+		rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
+				   op_data, NULL);
+
+		if (copy_to_user((char *)arg, hca, sizeof(*hca)))
+			rc = -EFAULT;
+
+		ll_finish_md_op_data(op_data);
+		OBD_FREE_PTR(hca);
+		RETURN(rc);
+	}
+	default: {
+		int err;
+
+		/* give ll_iocontrol_call() the first chance at the command */
+		if (LLIOC_STOP ==
+		     ll_iocontrol_call(inode, file, cmd, arg, &err))
+			RETURN(err);
+
+		RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
+				     (void *)arg));
+	}
+	}
+}
+
+
+/*
+ * llseek entry point.
+ *
+ * For SEEK_END, SEEK_HOLE and SEEK_DATA the file size is refreshed with
+ * ll_glimpse_size() first and passed on as eof; for the other origins eof
+ * stays 0.  The actual seek is done by ll_generic_file_llseek_size(),
+ * bounded by ll_file_maxbytes().
+ *
+ * Returns the ll_generic_file_llseek_size() result, or the non-zero
+ * ll_glimpse_size() result.
+ */
+loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	loff_t base = 0;
+	loff_t eof = 0;
+	loff_t retval;
+
+	ENTRY;
+	/* tentative target, computed for the trace message only */
+	if (origin == SEEK_END)
+		base = i_size_read(inode);
+	else if (origin == SEEK_CUR)
+		base = file->f_pos;
+	retval = offset + base;
+
+	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%llu=%#llx(%d)\n",
+	       inode->i_ino, inode->i_generation, inode, retval, retval,
+	       origin);
+	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
+
+	switch (origin) {
+	case SEEK_END:
+	case SEEK_HOLE:
+	case SEEK_DATA:
+		/* these origins depend on an up-to-date file size */
+		retval = ll_glimpse_size(inode);
+		if (retval != 0)
+			RETURN(retval);
+		eof = i_size_read(inode);
+		break;
+	default:
+		break;
+	}
+
+	retval = ll_generic_file_llseek_size(file, offset, origin,
+					     ll_file_maxbytes(inode), eof);
+	RETURN(retval);
+}
+
+/*
+ * flush entry point: report asynchronous write errors recorded for this
+ * file.
+ *
+ * Both the inode's lli_async_rc and the value returned by
+ * lov_read_and_clear_async_rc() are read and reset.  The result is 0 when
+ * neither recorded an error or when fd_write_failed is already set on the
+ * file descriptor, and -EIO otherwise.
+ */
+int ll_flush(struct file *file, fl_owner_t id)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
+	struct ll_inode_info *lli = ll_i2info(inode);
+	int async_rc, lov_rc;
+
+	LASSERT(!S_ISDIR(inode->i_mode));
+
+	/* catch async errors that were recorded back when async writeback
+	 * failed for pages in this mapping. */
+	async_rc = lli->lli_async_rc;
+	lli->lli_async_rc = 0;
+	lov_rc = lov_read_and_clear_async_rc(lli->lli_clob);
+	if (async_rc == 0)
+		async_rc = lov_rc;
+
+	/* The application has been told write failure already.
+	 * Do not report failure again. */
+	if (fd->fd_write_failed || async_rc == 0)
+		return 0;
+
+	return -EIO;
+}
+
+/**
+ * Called to make sure a portion of file has been written out.
+ *
+ * Runs a CIT_FSYNC cl_io over [\a start, \a end] of \a inode.  \a mode
+ * must be one of CL_FSYNC_NONE, CL_FSYNC_LOCAL, CL_FSYNC_DISCARD or
+ * CL_FSYNC_ALL; any other value is rejected with -EINVAL.
+ * \a ignore_layout is copied into the io's ci_ignore_layout.
+ *
+ * Return how many pages have been written, or a negative errno.
+ */
+int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
+		       enum cl_fsync_mode mode, int ignore_layout)
+{
+	struct cl_env_nest nest;
+	struct cl_fsync_io *fio;
+	struct obd_capa *capa = NULL;
+	struct lu_env *env;
+	struct cl_io *io;
+	int rc;
+	ENTRY;
+
+	switch (mode) {
+	case CL_FSYNC_NONE:
+	case CL_FSYNC_LOCAL:
+	case CL_FSYNC_DISCARD:
+	case CL_FSYNC_ALL:
+		break;
+	default:
+		RETURN(-EINVAL);
+	}
+
+	env = cl_env_nested_get(&nest);
+	if (IS_ERR(env))
+		RETURN(PTR_ERR(env));
+
+	capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
+
+	io = ccc_env_thread_io(env);
+	io->ci_obj = cl_i2info(inode)->lli_clob;
+	io->ci_ignore_layout = ignore_layout;
+
+	/* initialize parameters for sync */
+	fio = &io->u.ci_fsync;
+	fio->fi_fid = ll_inode2fid(inode);
+	fio->fi_capa = capa;
+	fio->fi_mode = mode;
+	fio->fi_start = start;
+	fio->fi_end = end;
+	fio->fi_nr_written = 0;
+
+	if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) != 0)
+		rc = io->ci_result;
+	else
+		rc = cl_io_loop(env, io);
+
+	/* on success the result is the number of pages written */
+	if (rc == 0)
+		rc = fio->fi_nr_written;
+
+	cl_io_fini(env, io);
+	cl_env_nested_put(&nest, env);
+
+	capa_put(capa);
+
+	RETURN(rc);
+}
+
+/*
+ * fsync entry point.
+ *
+ * Writes back and waits on the page cache range [start, end], then, under
+ * i_mutex: collects recorded asynchronous write errors (non-directories
+ * only), issues md_sync() for the inode and, when @datasync is set on a
+ * regular file, syncs the whole object range with cl_sync_file_range().
+ * The first error encountered is the one returned.
+ *
+ * In the @datasync branch fd_write_failed is updated to reflect whether
+ * the overall result is an error.
+ * NOTE(review): the data sync to the OSTs is only issued when @datasync is
+ * non-zero - confirm this is the intended mapping of the VFS argument.
+ */
+int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
+{
+	struct dentry *dentry = file->f_dentry;
+	struct inode *inode = dentry->d_inode;
+	struct ll_inode_info *lli = ll_i2info(inode);
+	struct ptlrpc_request *req;
+	struct obd_capa *capa;
+	int rc, err;
+	ENTRY;
+
+	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
+	       inode->i_generation, inode);
+	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
+
+	rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	mutex_lock(&inode->i_mutex);
+
+	/* catch async errors that were recorded back when async writeback
+	 * failed for pages in this mapping. */
+	if (!S_ISDIR(inode->i_mode)) {
+		err = lli->lli_async_rc;
+		lli->lli_async_rc = 0;
+		if (rc == 0)
+			rc = err;
+
+		err = lov_read_and_clear_async_rc(lli->lli_clob);
+		if (rc == 0)
+			rc = err;
+	}
+
+	/* metadata sync; the reply is only released when the call worked */
+	capa = ll_mdscapa_get(inode);
+	err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), capa,
+		      &req);
+	capa_put(capa);
+	if (rc == 0)
+		rc = err;
+	if (err == 0)
+		ptlrpc_req_finished(req);
+
+	if (datasync && S_ISREG(inode->i_mode)) {
+		struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
+
+		err = cl_sync_file_range(inode, 0, OBD_OBJECT_EOF,
+					 CL_FSYNC_ALL, 0);
+		if (rc == 0 && err < 0)
+			rc = err;
+
+		/* remember whether this sync failed */
+		if (rc < 0)
+			fd->fd_write_failed = true;
+		else
+			fd->fd_write_failed = false;
+	}
+
+	mutex_unlock(&inode->i_mutex);
+	RETURN(rc);
+}
+/*
+ * Handle a flock()/fcntl() lock request: enqueue an LDLM_FLOCK lock through
+ * the metadata export, then call flock_lock_file_wait() or
+ * posix_lock_file_wait() locally.
+ *
+ * \param file       file the lock applies to
+ * \param cmd        F_SETLK, F_SETLKW or F_GETLK (and the 64-bit variants
+ *                   where they are defined)
+ * \param file_lock  VFS lock description; for F_GETLK its fl_type is
+ *                   overwritten with the LDLM mode before the enqueue
+ *
+ * \retval 0          on success
+ * \retval -EINVAL    lock is neither FL_FLOCK nor FL_POSIX, or cmd unknown
+ * \retval -ENOTSUPP  unknown fl_type
+ * \retval other      error from ll_prep_md_op_data(), md_enqueue() or the
+ *                    local lock call
+ */
+int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	struct ll_sb_info *sbi = ll_i2sbi(inode);
+	struct ldlm_enqueue_info einfo = { .ei_type = LDLM_FLOCK,
+					   .ei_cb_cp =ldlm_flock_completion_ast,
+					   .ei_cbdata = file_lock };
+	struct md_op_data *op_data;
+	struct lustre_handle lockh = {0};
+	ldlm_policy_data_t flock = {{0}};
+	int flags = 0;
+	int rc;
+	int rc2 = 0;
+	ENTRY;
+
+	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
+	       inode->i_ino, file_lock);
+
+	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
+
+	if (file_lock->fl_flags & FL_FLOCK) {
+		LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
+		/* flocks are whole-file locks */
+		flock.l_flock.end = OFFSET_MAX;
+		/* For flocks owner is determined by the local file descriptor */
+		flock.l_flock.owner = (unsigned long)file_lock->fl_file;
+	} else if (file_lock->fl_flags & FL_POSIX) {
+		flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
+		flock.l_flock.start = file_lock->fl_start;
+		flock.l_flock.end = file_lock->fl_end;
+	} else {
+		RETURN(-EINVAL);
+	}
+	flock.l_flock.pid = file_lock->fl_pid;
+
+	/* Somewhat ugly workaround for svc lockd.
+	 * lockd installs custom fl_lmops->lm_compare_owner that checks
+	 * for the fl_owner to be the same (which it always is on local node
+	 * I guess between lockd processes) and then compares pid.
+	 * As such we assign pid to the owner field to make it all work,
+	 * conflict with normal locks is unlikely since pid space and
+	 * pointer space for current->files are not intersecting */
+	if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
+		flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
+
+	switch (file_lock->fl_type) {
+	case F_RDLCK:
+		einfo.ei_mode = LCK_PR;
+		break;
+	case F_UNLCK:
+		/* An unlock request may or may not have any relation to
+		 * existing locks so we may not be able to pass a lock handle
+		 * via a normal ldlm_lock_cancel() request. The request may even
+		 * unlock a byte range in the middle of an existing lock. In
+		 * order to process an unlock request we need all of the same
+		 * information that is given with a normal read or write record
+		 * lock request. To avoid creating another ldlm unlock (cancel)
+		 * message we'll treat a LCK_NL flock request as an unlock. */
+		einfo.ei_mode = LCK_NL;
+		break;
+	case F_WRLCK:
+		einfo.ei_mode = LCK_PW;
+		break;
+	default:
+		CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n",
+			file_lock->fl_type);
+		RETURN (-ENOTSUPP);
+	}
+
+	switch (cmd) {
+	case F_SETLKW:
+#ifdef F_SETLKW64
+	case F_SETLKW64:
+#endif
+		flags = 0;
+		break;
+	case F_SETLK:
+#ifdef F_SETLK64
+	case F_SETLK64:
+#endif
+		flags = LDLM_FL_BLOCK_NOWAIT;
+		break;
+	case F_GETLK:
+#ifdef F_GETLK64
+	case F_GETLK64:
+#endif
+		flags = LDLM_FL_TEST_LOCK;
+		/* Save the old mode so that if the mode in the lock changes we
+		 * can decrement the appropriate reader or writer refcount. */
+		file_lock->fl_type = einfo.ei_mode;
+		break;
+	default:
+		CERROR("unknown fcntl lock command: %d\n", cmd);
+		RETURN (-EINVAL);
+	}
+
+	op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
+				     LUSTRE_OPC_ANY, NULL);
+	if (IS_ERR(op_data))
+		RETURN(PTR_ERR(op_data));
+
+	CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
+	       "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
+	       flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);
+
+	rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
+			op_data, &lockh, &flock, 0, NULL /* req */, flags);
+
+	if ((file_lock->fl_flags & FL_FLOCK) &&
+	    (rc == 0 || file_lock->fl_type == F_UNLCK))
+		rc2  = flock_lock_file_wait(file, file_lock);
+	if ((file_lock->fl_flags & FL_POSIX) &&
+	    (rc == 0 || file_lock->fl_type == F_UNLCK) &&
+	    !(flags & LDLM_FL_TEST_LOCK)) /* F_GETLK never touches local locks */
+		rc2  = posix_lock_file_wait(file, file_lock);
+
+	if (rc2 && file_lock->fl_type != F_UNLCK) { /* local call failed */
+		einfo.ei_mode = LCK_NL; /* LCK_NL is treated as an unlock */
+		md_enqueue(sbi->ll_md_exp, &einfo, NULL,
+			op_data, &lockh, &flock, 0, NULL /* req */, flags);
+		rc = rc2;
+	}
+
+	ll_finish_md_op_data(op_data);
+
+	RETURN(rc);
+}
+
+/*
+ * Lock handler that rejects every request; all arguments are ignored.
+ *
+ * \retval -ENOSYS always
+ */
+int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
+{
+	const int rc = -ENOSYS;
+	ENTRY;
+
+	RETURN(rc);
+}
+
+/**
+ * Test whether cached MD locks cover the requested inodebits.
+ *
+ * Each bit set in *bits is probed separately, so the bits may be provided
+ * by different locks. When a lock is matched, every bit that lock covers is
+ * cleared from *bits; bits that no lock covers stay set.
+ *
+ * NOTE(review): the loop stops at i < MDS_INODELOCK_MAXSHIFT, so a bit at
+ * position MDS_INODELOCK_MAXSHIFT itself is never probed -- confirm against
+ * the definition of MDS_INODELOCK_MAXSHIFT.
+ *
+ * \param inode [IN] inode to check; NULL yields 0
+ * \param bits [IN/OUT] searched lock bits; on return, the bits not found
+ * \param l_req_mode [IN] searched lock mode; LCK_MINMODE means any of
+ *			  CR, CW, PR or PW
+ * \retval boolean, true iff all bits are found
+ */
+int ll_have_md_lock(struct inode *inode, __u64 *bits,  ldlm_mode_t l_req_mode)
+{
+	struct lustre_handle lockh;
+	ldlm_policy_data_t policy;
+	ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
+				(LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
+	struct lu_fid *fid;
+	__u64 flags;
+	int i;
+	ENTRY;
+
+	if (!inode)
+		RETURN(0);
+
+	fid = &ll_i2info(inode)->lli_fid;
+	CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
+	       ldlm_lockname[mode]);
+
+	flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
+	for (i = 0; i < MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
+		/* build the probe mask in 64 bits, the width of *bits, rather
+		 * than shifting a plain int */
+		policy.l_inodebits.bits = *bits & (1ULL << i);
+		if (policy.l_inodebits.bits == 0)
+			continue;
+
+		if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
+				  &policy, mode, &lockh)) {
+			struct ldlm_lock *lock;
+
+			lock = ldlm_handle2lock(&lockh);
+			if (lock) {
+				/* drop every bit the matched lock covers,
+				 * not only the one that was probed */
+				*bits &=
+				      ~(lock->l_policy_data.l_inodebits.bits);
+				LDLM_LOCK_PUT(lock);
+			} else {
+				/* handle no longer resolves to a lock: only
+				 * the probed bit is known to be covered */
+				*bits &= ~policy.l_inodebits.bits;
+			}
+		}
+	}
+	RETURN(*bits == 0);
+}
+
+/**
+ * Try to match a cached MD inodebits lock on \a inode in any of the CR, CW,
+ * PR or PW modes.
+ *
+ * \param inode [IN] inode whose FID names the resource
+ * \param bits  [IN] inodebits placed in the match policy
+ * \param lockh [OUT] lock handle handed to md_lock_match()
+ * \param flags [IN] extra match flags, OR-ed with LDLM_FL_BLOCK_GRANTED
+ * \retval the value returned by md_lock_match()
+ */
+ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
+			    struct lustre_handle *lockh, __u64 flags)
+{
+	struct lu_fid *res_fid;
+	ldlm_policy_data_t ibits_policy = { .l_inodebits = {bits}};
+	ldlm_mode_t matched;
+	ENTRY;
+
+	res_fid = &ll_i2info(inode)->lli_fid;
+	CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(res_fid));
+
+	flags |= LDLM_FL_BLOCK_GRANTED;
+	matched = md_lock_match(ll_i2mdexp(inode), flags, res_fid, LDLM_IBITS,
+				&ibits_policy,
+				LCK_CR | LCK_CW | LCK_PR | LCK_PW, lockh);
+	RETURN(matched);
+}
+
+/*
+ * Post-process the result of a failed or successful revalidation RPC.
+ *
+ * \param inode inode that was being revalidated
+ * \param rc    result of the revalidation attempt
+ *
+ * \retval 0   rc was 0, or rc was -ENOENT for an inode that is neither a
+ *             regular file nor a directory (nlink is cleared first)
+ * \retval rc  otherwise; errors other than -ENOENT are logged
+ */
+static int ll_inode_revalidate_fini(struct inode *inode, int rc)
+{
+	if (rc == 0)
+		return 0;
+
+	if (rc != -ENOENT) {
+		CERROR("%s: revalidate FID "DFID" error: rc = %d\n",
+		       ll_get_fsname(inode->i_sb, NULL, 0),
+		       PFID(ll_inode2fid(inode)), rc);
+		return rc;
+	}
+
+	/* Already unlinked: update nlink. */
+	clear_nlink(inode);
+
+	/* Regular files and directories still report -ENOENT to the caller.
+	 * This path cannot be hit for regular files except in obscure
+	 * races, so there is no need to validate size. */
+	if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))
+		return rc;
+
+	return 0;
+}
+/*
+ * Refresh the attributes of dentry->d_inode from the MDS if needed.
+ *
+ * Two paths, chosen by the export's connect flags:
+ *  - OBD_CONNECT_ATTRFID set: take an intent lock by FID (IT_GETATTR, or
+ *    IT_LOOKUP when ibits is exactly MDS_INODELOCK_LOOKUP) and finish with
+ *    ll_revalidate_it_finish();
+ *  - otherwise: if ll_have_md_lock() does not find all of ibits, issue a
+ *    plain md_getattr() and update the inode with ll_prep_inode().
+ *
+ * \param dentry dentry whose inode is revalidated; d_inode must be non-NULL
+ * \param it     not referenced in this function body
+ * \param ibits  inodebits the caller wants covered
+ *
+ * \retval 0 on success, otherwise an error code (see
+ *         ll_inode_revalidate_fini() for how -ENOENT is handled)
+ */
+int __ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
+			     __u64 ibits)
+{
+	struct inode *inode = dentry->d_inode;
+	struct ptlrpc_request *req = NULL;
+	struct obd_export *exp;
+	int rc = 0;
+	ENTRY;
+
+	LASSERT(inode != NULL);
+
+	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
+	       inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
+
+	exp = ll_i2mdexp(inode);
+
+	/* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
+	 *      But under CMD case, it caused some lock issues, should be fixed
+	 *      with new CMD ibits lock. See bug 12718 */
+	if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
+		struct lookup_intent oit = { .it_op = IT_GETATTR };
+		struct md_op_data *op_data;
+
+		if (ibits == MDS_INODELOCK_LOOKUP)
+			oit.it_op = IT_LOOKUP;
+
+		/* Call getattr by fid, so do not provide name at all. */
+		op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
+					     dentry->d_inode, NULL, 0, 0,
+					     LUSTRE_OPC_ANY, NULL);
+		if (IS_ERR(op_data))
+			RETURN(PTR_ERR(op_data));
+
+		oit.it_create_mode |= M_CHECK_STALE;
+		rc = md_intent_lock(exp, op_data, NULL, 0,
+				    /* we are not interested in name
+				       based lookup */
+				    &oit, 0, &req,
+				    ll_md_blocking_ast, 0);
+		ll_finish_md_op_data(op_data);
+		oit.it_create_mode &= ~M_CHECK_STALE;
+		if (rc < 0) {
+			rc = ll_inode_revalidate_fini(inode, rc);
+			GOTO (out, rc);
+		}
+
+		rc = ll_revalidate_it_finish(req, &oit, dentry);
+		if (rc != 0) {
+			ll_intent_release(&oit);
+			GOTO(out, rc);
+		}
+
+		/* Unlinked? Unhash dentry, so it is not picked up later by
+		   do_lookup() -> ll_revalidate_it(). We cannot use d_drop
+		   here to preserve get_cwd functionality on 2.6.
+		   Bug 10503 */
+		if (!dentry->d_inode->i_nlink)
+			d_lustre_invalidate(dentry, 0);
+
+		ll_lookup_finish_locks(&oit, dentry);
+	} else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
+		struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
+		obd_valid valid = OBD_MD_FLGETATTR;
+		struct md_op_data *op_data;
+		int ealen = 0;
+
+		if (S_ISREG(inode->i_mode)) { /* regular files also ask for EA */
+			rc = ll_get_max_mdsize(sbi, &ealen);
+			if (rc)
+				RETURN(rc);
+			valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
+		}
+
+		op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
+					     0, ealen, LUSTRE_OPC_ANY,
+					     NULL);
+		if (IS_ERR(op_data))
+			RETURN(PTR_ERR(op_data));
+
+		op_data->op_valid = valid;
+		/* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
+		 * capa for this inode. Because we only keep capas of dirs
+		 * fresh. */
+		rc = md_getattr(sbi->ll_md_exp, op_data, &req);
+		ll_finish_md_op_data(op_data);
+		if (rc) {
+			rc = ll_inode_revalidate_fini(inode, rc);
+			RETURN(rc);
+		}
+
+		rc = ll_prep_inode(&inode, req, NULL, NULL);
+	}
+out:
+	ptlrpc_req_finished(req); /* req may still be NULL here */
+	return rc;
+}
+
+/*
+ * Revalidate the inode behind \a dentry and then refresh its size or times.
+ *
+ * After a successful __ll_inode_revalidate_it(), regular files get their
+ * size refreshed through ll_glimpse_size(); every other file type has its
+ * atime/mtime/ctime copied from the cached lli_lvb.
+ *
+ * \retval 0 on success, otherwise the error from the failing step
+ */
+int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
+			   __u64 ibits)
+{
+	struct inode *inode = dentry->d_inode;
+	struct ll_inode_info *lli;
+	int rc;
+	ENTRY;
+
+	rc = __ll_inode_revalidate_it(dentry, it, ibits);
+	if (rc != 0)
+		RETURN(rc);
+
+	if (S_ISREG(inode->i_mode)) {
+		rc = ll_glimpse_size(inode);
+		RETURN(rc);
+	}
+
+	/* if object isn't regular file, don't validate size */
+	lli = ll_i2info(inode);
+	LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
+	LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
+	LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
+
+	RETURN(rc);
+}
+
+/*
+ * Fill \a stat from the inode behind \a de after revalidating it for the
+ * UPDATE and LOOKUP inodebits.
+ *
+ * \param mnt  not referenced in this function body
+ * \param de   dentry to report on
+ * \param it   intent forwarded to ll_inode_revalidate_it()
+ * \param stat [OUT] filled in on success only
+ *
+ * \retval 0 on success, otherwise the revalidation error
+ */
+int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
+		  struct lookup_intent *it, struct kstat *stat)
+{
+	struct inode *inode = de->d_inode;
+	struct ll_sb_info *sbi = ll_i2sbi(inode);
+	struct ll_inode_info *lli = ll_i2info(inode);
+	int rc;
+
+	rc = ll_inode_revalidate_it(de, it,
+				    MDS_INODELOCK_UPDATE | MDS_INODELOCK_LOOKUP);
+	/* the call is counted whether or not revalidation succeeded */
+	ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
+	if (rc != 0)
+		return rc;
+
+	/* identity */
+	stat->dev = inode->i_sb->s_dev;
+	if (ll_need_32bit_api(sbi))
+		stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
+	else
+		stat->ino = inode->i_ino;
+	stat->rdev = inode->i_rdev;
+
+	/* ownership and mode */
+	stat->mode = inode->i_mode;
+	stat->nlink = inode->i_nlink;
+	stat->uid = inode->i_uid;
+	stat->gid = inode->i_gid;
+
+	/* timestamps */
+	stat->atime = inode->i_atime;
+	stat->mtime = inode->i_mtime;
+	stat->ctime = inode->i_ctime;
+
+	/* size information */
+	stat->blksize = 1 << inode->i_blkbits;
+	stat->size = i_size_read(inode);
+	stat->blocks = inode->i_blocks;
+
+	return 0;
+}
+/* getattr entry point: runs ll_getattr_it() with a fresh IT_GETATTR intent. */
+int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
+{
+	struct lookup_intent getattr_it = { .it_op = IT_GETATTR };
+	int rc;
+
+	rc = ll_getattr_it(mnt, de, &getattr_it, stat);
+	return rc;
+}
+
+
+/*
+ * Return a new reference to the POSIX ACL cached in the inode's
+ * ll_inode_info, taken under lli_lock.
+ *
+ * \param inode inode whose cached ACL is wanted
+ * \param type  not referenced in this function body
+ *
+ * \retval the result of posix_acl_dup() on lli_posix_acl
+ */
+struct posix_acl * ll_get_acl(struct inode *inode, int type)
+{
+	struct ll_inode_info *info = ll_i2info(inode);
+	struct posix_acl *dup_acl;
+	ENTRY;
+
+	/* VFS' acl_permission_check->check_acl will release the refcount */
+	spin_lock(&info->lli_lock);
+	dup_acl = posix_acl_dup(info->lli_posix_acl);
+	spin_unlock(&info->lli_lock);
+
+	RETURN(dup_acl);
+}
+
+
+/*
+ * Permission check for Lustre inodes.
+ *
+ * Requests carrying MAY_NOT_BLOCK are refused with -ECHILD. The filesystem
+ * root inode is revalidated first because it is not validated by lookup.
+ * Remote clients (LL_SBI_RMT_CLIENT) are checked with
+ * lustre_check_remote_perm(); everything else goes through
+ * ll_generic_permission().
+ *
+ * Every exit uses RETURN() so that the ENTRY trace record is always paired
+ * with a matching exit record.
+ *
+ * \param inode inode being accessed
+ * \param mask  requested access mask (MAY_* bits)
+ *
+ * \retval 0 if access is granted, otherwise an error code
+ */
+int ll_inode_permission(struct inode *inode, int mask)
+{
+	int rc = 0;
+	ENTRY;
+
+#ifdef MAY_NOT_BLOCK
+	if (mask & MAY_NOT_BLOCK)
+		RETURN(-ECHILD);
+#endif
+
+	/* The root inode is NOT validated by the lookup operation, so it
+	 * has to be revalidated here before the permission check. */
+	if (inode == inode->i_sb->s_root->d_inode) {
+		struct lookup_intent it = { .it_op = IT_LOOKUP };
+
+		rc = __ll_inode_revalidate_it(inode->i_sb->s_root, &it,
+					      MDS_INODELOCK_LOOKUP);
+		if (rc)
+			RETURN(rc);
+	}
+
+	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
+	       inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);
+
+	if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT) {
+		rc = lustre_check_remote_perm(inode, mask);
+		RETURN(rc);
+	}
+
+	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
+	/* NOTE(review): 'flags' is not declared in this function; presumably
+	 * ll_generic_permission() is a macro that discards that argument --
+	 * confirm against its definition. */
+	rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
+
+	RETURN(rc);
+}
+
+#define READ_METHOD aio_read		/* file_operations member for reads */
+#define READ_FUNCTION ll_file_aio_read	/* handler installed in that member */
+#define WRITE_METHOD aio_write		/* file_operations member for writes */
+#define WRITE_FUNCTION ll_file_aio_write /* handler installed in that member */
+
+/*
+ * -o localflock - only provides locally consistent flock locks.
+ * This table installs no .flock/.lock handlers of its own.
+ */
+struct file_operations ll_file_operations = {
+	/* lifecycle */
+	.open		= ll_file_open,
+	.release	= ll_file_release,
+	.flush		= ll_flush,
+	.fsync		= ll_fsync,
+	/* data path */
+	.read		= ll_file_read,
+	.READ_METHOD	= READ_FUNCTION,
+	.write		= ll_file_write,
+	.WRITE_METHOD	= WRITE_FUNCTION,
+	.splice_read	= ll_file_splice_read,
+	.mmap		= ll_file_mmap,
+	.llseek		= ll_file_seek,
+	/* control */
+	.unlocked_ioctl	= ll_file_ioctl
+};
+
+/* Same handlers as ll_file_operations, plus ll_file_flock for .flock/.lock */
+struct file_operations ll_file_operations_flock = {
+	/* lifecycle */
+	.open		= ll_file_open,
+	.release	= ll_file_release,
+	.flush		= ll_flush,
+	.fsync		= ll_fsync,
+	/* data path */
+	.read		= ll_file_read,
+	.READ_METHOD	= READ_FUNCTION,
+	.write		= ll_file_write,
+	.WRITE_METHOD	= WRITE_FUNCTION,
+	.splice_read	= ll_file_splice_read,
+	.mmap		= ll_file_mmap,
+	.llseek		= ll_file_seek,
+	/* control */
+	.unlocked_ioctl	= ll_file_ioctl,
+	/* locking */
+	.flock		= ll_file_flock,
+	.lock		= ll_file_flock
+};
+
+/* These are for -o noflock - to return ENOSYS on flock calls */
+struct file_operations ll_file_operations_noflock = {
+	/* lifecycle */
+	.open		= ll_file_open,
+	.release	= ll_file_release,
+	.flush		= ll_flush,
+	.fsync		= ll_fsync,
+	/* data path */
+	.read		= ll_file_read,
+	.READ_METHOD	= READ_FUNCTION,
+	.write		= ll_file_write,
+	.WRITE_METHOD	= WRITE_FUNCTION,
+	.splice_read	= ll_file_splice_read,
+	.mmap		= ll_file_mmap,
+	.llseek		= ll_file_seek,
+	/* control */
+	.unlocked_ioctl	= ll_file_ioctl,
+	/* locking: both entry points reject the request */
+	.flock		= ll_file_noflock,
+	.lock		= ll_file_noflock
+};
+
+/* inode operations table: attributes, permission, xattrs and ACL */
+struct inode_operations ll_file_inode_operations = {
+	/* attributes and access control */
+	.getattr	= ll_getattr,
+	.setattr	= ll_setattr,
+	.permission	= ll_inode_permission,
+	.get_acl	= ll_get_acl,
+	/* extended attributes */
+	.getxattr	= ll_getxattr,
+	.setxattr	= ll_setxattr,
+	.listxattr	= ll_listxattr,
+	.removexattr	= ll_removexattr,
+};
+
+/* dynamic ioctl number support routins */
+static struct llioc_ctl_data {
+	struct rw_semaphore	ioc_sem;
+	struct list_head	      ioc_head;
+} llioc = {
+	__RWSEM_INITIALIZER(llioc.ioc_sem),
+	LIST_HEAD_INIT(llioc.ioc_head)
+};
+
+
+/*
+ * One registered set of dynamic ioctl commands, allocated by
+ * ll_iocontrol_register() with room for iocd_count trailing command numbers.
+ */
+struct llioc_data {
+	struct list_head	iocd_list;	/* linkage on llioc.ioc_head */
+	unsigned int		iocd_size;	/* allocation size, for OBD_FREE() */
+	llioc_callback_t	iocd_cb;	/* handler called on a cmd match */
+	unsigned int		iocd_count;	/* entries in iocd_cmd[] */
+	/* C99 flexible array member instead of the GNU zero-length array;
+	 * sizeof(struct llioc_data) is unchanged */
+	unsigned int		iocd_cmd[];
+};
+
+/*
+ * Register a callback for a set of dynamic ioctl command numbers.
+ *
+ * \param cb    callback to invoke; must not be NULL
+ * \param count number of entries in \a cmd, 0 .. LLIOC_MAX_CMD
+ * \param cmd   command numbers; copied, must not be NULL
+ *
+ * \retval opaque cookie to pass to ll_iocontrol_unregister()
+ * \retval NULL on invalid arguments or allocation failure
+ */
+void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
+{
+	struct llioc_data *entry = NULL;
+	unsigned int bytes;
+	ENTRY;
+
+	if (cb == NULL || cmd == NULL)
+		RETURN(NULL);
+	if (count < 0 || count > LLIOC_MAX_CMD)
+		RETURN(NULL);
+
+	/* header plus one slot per command number */
+	bytes = sizeof(*entry) + count * sizeof(unsigned int);
+	OBD_ALLOC(entry, bytes);
+	if (entry == NULL)
+		RETURN(NULL);
+
+	memset(entry, 0, sizeof(*entry));
+	entry->iocd_cb = cb;
+	entry->iocd_size = bytes;
+	entry->iocd_count = count;
+	memcpy(entry->iocd_cmd, cmd, count * sizeof(unsigned int));
+
+	down_write(&llioc.ioc_sem);
+	list_add_tail(&entry->iocd_list, &llioc.ioc_head);
+	up_write(&llioc.ioc_sem);
+
+	RETURN(entry);
+}
+
+/*
+ * Remove and free a registration made by ll_iocontrol_register().
+ *
+ * \param magic cookie returned by ll_iocontrol_register(); NULL is ignored.
+ *              A cookie that is not on the list only produces a warning.
+ */
+void ll_iocontrol_unregister(void *magic)
+{
+	struct llioc_data *cur;
+	struct llioc_data *found = NULL;
+	unsigned int size;
+
+	if (magic == NULL)
+		return;
+
+	down_write(&llioc.ioc_sem);
+	list_for_each_entry(cur, &llioc.ioc_head, iocd_list) {
+		if (cur != magic)
+			continue;
+
+		found = cur;
+		list_del(&cur->iocd_list);
+		break;
+	}
+	up_write(&llioc.ioc_sem);
+
+	if (found == NULL) {
+		CWARN("didn't find iocontrol register block with magic: %p\n",
+		      magic);
+		return;
+	}
+
+	/* read the size before the block is released */
+	size = found->iocd_size;
+	OBD_FREE(found, size);
+}
+
+EXPORT_SYMBOL(ll_iocontrol_register);
+EXPORT_SYMBOL(ll_iocontrol_unregister);
+
+/*
+ * Offer an ioctl to the registered dynamic handlers.
+ *
+ * Registrations are walked in list order under the read side of
+ * llioc.ioc_sem. For each registration, the callback is invoked at most
+ * once, on the first matching command number; the walk ends as soon as a
+ * callback returns LLIOC_STOP.
+ *
+ * \param rcp [OUT] optional; receives the rc set by the last callback run,
+ *            or -EINVAL if none ran
+ *
+ * \retval value returned by the last callback run, LLIOC_CONT if none ran
+ */
+enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
+			unsigned int cmd, unsigned long arg, int *rcp)
+{
+	struct llioc_data *reg;
+	enum llioc_iter verdict = LLIOC_CONT;
+	unsigned int idx;
+	int rc = -EINVAL;
+
+	down_read(&llioc.ioc_sem);
+	list_for_each_entry(reg, &llioc.ioc_head, iocd_list) {
+		for (idx = 0; idx < reg->iocd_count; idx++) {
+			if (reg->iocd_cmd[idx] == cmd) {
+				verdict = reg->iocd_cb(inode, file, cmd, arg,
+						       reg, &rc);
+				break;
+			}
+		}
+
+		if (verdict == LLIOC_STOP)
+			break;
+	}
+	up_read(&llioc.ioc_sem);
+
+	if (rcp != NULL)
+		*rcp = rc;
+	return verdict;
+}
+
+/*
+ * Pass a layout configuration request to the inode's cl_object.
+ *
+ * Nothing is done (and 0 is returned) when the inode has no cl_object.
+ * For OBJECT_CONF_SET the layout lock in conf->coc_lock is made matchable,
+ * but only if cl_conf_set() succeeded.
+ *
+ * \retval 0 on success, error from cl_env_nested_get() or cl_conf_set()
+ */
+int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
+{
+	struct ll_inode_info *lli = ll_i2info(inode);
+	struct ldlm_lock *lock;
+	struct cl_env_nest env_nest;
+	struct lu_env *env;
+	int rc;
+	ENTRY;
+
+	if (lli->lli_clob == NULL)
+		RETURN(0);
+
+	env = cl_env_nested_get(&env_nest);
+	if (IS_ERR(env))
+		RETURN(PTR_ERR(env));
+
+	rc = cl_conf_set(env, lli->lli_clob, conf);
+	cl_env_nested_put(&env_nest, env);
+
+	if (conf->coc_opc != OBJECT_CONF_SET)
+		RETURN(rc);
+
+	lock = conf->coc_lock;
+	LASSERT(lock != NULL);
+	LASSERT(ldlm_has_layout(lock));
+
+	/* The lock may only be allowed to match after the layout has been
+	 * applied to the inode, otherwise a false layout would be seen.
+	 * Applying the layout should happen before dropping the intent
+	 * lock. */
+	if (rc == 0)
+		ldlm_lock_allow_match(lock);
+
+	RETURN(rc);
+}
+
+/*
+ * Fetch layout from MDT with getxattr request, if it's not ready yet.
+ *
+ * Returns 0 immediately when lock->l_lvb_data is already set. Otherwise
+ * XATTR_NAME_LOV is requested with md_getxattr(), the returned EA is copied
+ * into a newly allocated buffer, and that buffer is installed as the lock's
+ * LVB under the resource lock. If l_lvb_data was set in the meantime, the
+ * new buffer is freed instead.
+ *
+ * \retval 0        layout already present, fetched, or empty (eadatasize 0)
+ * \retval -EPROTO  reply has no body, or an EA larger than requested
+ * \retval -EFAULT  EA buffer missing from the reply
+ * \retval -ENOMEM  allocation failure
+ * \retval other    negative error from ll_get_max_mdsize()/md_getxattr()
+ */
+static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
+
+{
+	struct ll_sb_info *sbi = ll_i2sbi(inode);
+	struct obd_capa *oc;
+	struct ptlrpc_request *req;
+	struct mdt_body *body;
+	void *lvbdata;
+	void *lmm;
+	int lmmsize;
+	int rc;
+	ENTRY;
+
+	if (lock->l_lvb_data != NULL)
+		RETURN(0);
+
+	/* if layout lock was granted right away, the layout is returned
+	 * within DLM_LVB of dlm reply; otherwise if the lock was ever
+	 * blocked and then granted via completion ast, we have to fetch
+	 * layout here. Please note that we can't use the LVB buffer in
+	 * completion AST because it doesn't have a large enough buffer */
+	oc = ll_mdscapa_get(inode);
+	rc = ll_get_max_mdsize(sbi, &lmmsize);
+	if (rc == 0)
+		rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
+				OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
+				lmmsize, 0, &req);
+	capa_put(oc);
+	if (rc < 0)
+		RETURN(rc);
+
+	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+	if (body == NULL || body->eadatasize > lmmsize)
+		GOTO(out, rc = -EPROTO);
+
+	lmmsize = body->eadatasize; /* from here on: actual EA size */
+	if (lmmsize == 0) /* empty layout */
+		GOTO(out, rc = 0);
+
+	lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
+	if (lmm == NULL)
+		GOTO(out, rc = -EFAULT);
+
+	OBD_ALLOC_LARGE(lvbdata, lmmsize);
+	if (lvbdata == NULL)
+		GOTO(out, rc = -ENOMEM);
+
+	memcpy(lvbdata, lmm, lmmsize);
+	lock_res_and_lock(lock);
+	if (lock->l_lvb_data == NULL) {
+		lock->l_lvb_data = lvbdata;
+		lock->l_lvb_len = lmmsize;
+		lvbdata = NULL; /* ownership moved to the lock */
+	}
+	unlock_res_and_lock(lock);
+
+	if (lvbdata != NULL) /* not installed above: discard our copy */
+		OBD_FREE_LARGE(lvbdata, lmmsize);
+	EXIT;
+
+out:
+	ptlrpc_req_finished(req);
+	return rc;
+}
+
+/**
+ * Apply the layout to the inode. Layout lock is held and will be released
+ * in this function.
+ *
+ * \param lockh   handle of the layout lock; must be in use
+ * \param mode    mode passed to ldlm_lock_decref() when dropping the lock
+ * \param inode   inode to configure
+ * \param gen     [OUT] layout generation; written on the LVB_READY path and
+ *                after a successful obd_unpackmd()
+ * \param reconf  when false the layout is never (re)applied: the call only
+ *                succeeds if the lock already has LDLM_FL_LVB_READY
+ *
+ * \retval 0        layout is in place
+ * \retval -ENODATA reconf is false and the lock is not LVB_READY
+ * \retval -EAGAIN  applying the layout returned -EBUSY and the following
+ *                  OBJECT_CONF_WAIT succeeded
+ * \retval other    error from ll_layout_fetch(), obd_unpackmd() or
+ *                  ll_layout_conf()
+ */
+static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
+				struct inode *inode, __u32 *gen, bool reconf)
+{
+	struct ll_inode_info *lli = ll_i2info(inode);
+	struct ll_sb_info    *sbi = ll_i2sbi(inode);
+	struct ldlm_lock *lock;
+	struct lustre_md md = { NULL };
+	struct cl_object_conf conf;
+	int rc = 0;
+	bool lvb_ready;
+	bool wait_layout = false;
+	ENTRY;
+
+	LASSERT(lustre_handle_is_used(lockh));
+
+	lock = ldlm_handle2lock(lockh);
+	LASSERT(lock != NULL);
+	LASSERT(ldlm_has_layout(lock));
+
+	LDLM_DEBUG(lock, "File %p/"DFID" being reconfigured: %d.\n",
+		inode, PFID(&lli->lli_fid), reconf);
+
+	/* in case this is a caching lock and reinstate with new inode */
+	md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
+
+	lock_res_and_lock(lock);
+	lvb_ready = !!(lock->l_flags & LDLM_FL_LVB_READY);
+	unlock_res_and_lock(lock);
+	/* checking lvb_ready is racy but this is okay. The worst case is
+	 * that multi processes may configure the file on the same time. */
+	if (lvb_ready || !reconf) {
+		rc = -ENODATA;
+		if (lvb_ready) {
+			/* layout_gen must be valid if layout lock is not
+			 * cancelled and stripe has already set */
+			*gen = lli->lli_layout_gen;
+			rc = 0;
+		}
+		GOTO(out, rc);
+	}
+
+	rc = ll_layout_fetch(inode, lock);
+	if (rc < 0)
+		GOTO(out, rc);
+
+	/* for layout lock, lmm is returned in lock's lvb.
+	 * lvb_data is immutable if the lock is held so it's safe to access it
+	 * without res lock. See the description in ldlm_lock_decref_internal()
+	 * for the condition to free lvb_data of layout lock */
+	if (lock->l_lvb_data != NULL) {
+		rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
+				  lock->l_lvb_data, lock->l_lvb_len);
+		if (rc >= 0) {
+			*gen = LL_LAYOUT_GEN_EMPTY;
+			if (md.lsm != NULL)
+				*gen = md.lsm->lsm_layout_gen;
+			rc = 0;
+		} else {
+			CERROR("%s: file "DFID" unpackmd error: %d\n",
+				ll_get_fsname(inode->i_sb, NULL, 0),
+				PFID(&lli->lli_fid), rc);
+		}
+	}
+	if (rc < 0)
+		GOTO(out, rc);
+
+	/* set layout to file. Unlikely this will fail as old layout was
+	 * surely eliminated */
+	memset(&conf, 0, sizeof conf);
+	conf.coc_opc = OBJECT_CONF_SET;
+	conf.coc_inode = inode;
+	conf.coc_lock = lock;
+	conf.u.coc_md = &md;
+	rc = ll_layout_conf(inode, &conf);
+
+	if (md.lsm != NULL)
+		obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
+
+	/* refresh layout failed, need to wait */
+	wait_layout = rc == -EBUSY;
+	EXIT;
+
+out:
+	LDLM_LOCK_PUT(lock); /* pairs with ldlm_handle2lock() above */
+	ldlm_lock_decref(lockh, mode);
+
+	/* wait for IO to complete if it's still being used. */
+	if (wait_layout) {
+		CDEBUG(D_INODE, "%s: %p/"DFID" wait for layout reconf.\n",
+			ll_get_fsname(inode->i_sb, NULL, 0),
+			inode, PFID(&lli->lli_fid));
+
+		memset(&conf, 0, sizeof conf);
+		conf.coc_opc = OBJECT_CONF_WAIT;
+		conf.coc_inode = inode;
+		rc = ll_layout_conf(inode, &conf);
+		if (rc == 0) /* wait succeeded: report -EAGAIN instead of 0 */
+			rc = -EAGAIN;
+
+		CDEBUG(D_INODE, "file: "DFID" waiting layout return: %d.\n",
+			PFID(&lli->lli_fid), rc);
+	}
+	RETURN(rc);
+}
+
+/**
+ * This function checks if there exists a LAYOUT lock on the client side,
+ * or enqueues it if it doesn't have one in cache.
+ *
+ * This function will not hold the layout lock, so it may be revoked any
+ * time after this function returns. Any operation that depends on the
+ * layout should be redone in that case.
+ *
+ * This function should be called before lov_io_init() to get an up-to-date
+ * layout version. The caller should save the version number and, after the
+ * IO is finished, call this function again to verify that the layout did
+ * not change during the IO.
+ *
+ * \param inode regular file inode with a sane FID (both are asserted)
+ * \param gen   [OUT] initialised to lli_layout_gen, then updated through
+ *              ll_layout_lock_set()
+ *
+ * \retval 0 on success, including when LL_SBI_LAYOUT_LOCK is not set
+ * \retval error from ll_prep_md_op_data(), md_enqueue() or
+ *         ll_layout_lock_set(); -EAGAIN from the latter is retried here
+ */
+int ll_layout_refresh(struct inode *inode, __u32 *gen)
+{
+	struct ll_inode_info  *lli = ll_i2info(inode);
+	struct ll_sb_info     *sbi = ll_i2sbi(inode);
+	struct md_op_data     *op_data;
+	struct lookup_intent   it;
+	struct lustre_handle   lockh;
+	ldlm_mode_t	       mode;
+	struct ldlm_enqueue_info einfo = { .ei_type = LDLM_IBITS,
+					   .ei_mode = LCK_CR,
+					   .ei_cb_bl = ll_md_blocking_ast,
+					   .ei_cb_cp = ldlm_completion_ast,
+					   .ei_cbdata = NULL };
+	int rc;
+	ENTRY;
+
+	*gen = lli->lli_layout_gen;
+	if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK))
+		RETURN(0);
+
+	/* sanity checks */
+	LASSERT(fid_is_sane(ll_inode2fid(inode)));
+	LASSERT(S_ISREG(inode->i_mode));
+
+	/* mostly layout lock is caching on the local side, so try to match
+	 * it before grabbing layout lock mutex. */
+	mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0);
+	if (mode != 0) { /* hit cached lock */
+		rc = ll_layout_lock_set(&lockh, mode, inode, gen, false);
+		if (rc == 0)
+			RETURN(0);
+
+		/* better hold lli_layout_mutex to try again otherwise
+		 * it will have starvation problem. */
+	}
+
+	/* take layout lock mutex to enqueue layout lock exclusively. */
+	mutex_lock(&lli->lli_layout_mutex);
+
+again:
+	/* try again. Maybe somebody else has done this. */
+	mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0);
+	if (mode != 0) { /* hit cached lock */
+		rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
+		if (rc == -EAGAIN)
+			goto again;
+
+		mutex_unlock(&lli->lli_layout_mutex);
+		RETURN(rc);
+	}
+
+	op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
+			0, 0, LUSTRE_OPC_ANY, NULL);
+	if (IS_ERR(op_data)) {
+		mutex_unlock(&lli->lli_layout_mutex);
+		RETURN(PTR_ERR(op_data));
+	}
+
+	/* have to enqueue one */
+	memset(&it, 0, sizeof(it));
+	it.it_op = IT_LAYOUT;
+	lockh.cookie = 0ULL;
+
+	LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file %p/"DFID".\n",
+			ll_get_fsname(inode->i_sb, NULL, 0), inode,
+			PFID(&lli->lli_fid));
+
+	rc = md_enqueue(sbi->ll_md_exp, &einfo, &it, op_data, &lockh,
+			NULL, 0, NULL, 0);
+	if (it.d.lustre.it_data != NULL)
+		ptlrpc_req_finished(it.d.lustre.it_data);
+	it.d.lustre.it_data = NULL;
+
+	ll_finish_md_op_data(op_data);
+
+	mode = it.d.lustre.it_lock_mode; /* keep the mode for the decref */
+	it.d.lustre.it_lock_mode = 0; /* cleared before ll_intent_drop_lock() */
+	ll_intent_drop_lock(&it);
+
+	if (rc == 0) {
+		/* set lock data in case this is a new lock */
+		ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
+		rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
+		if (rc == -EAGAIN)
+			goto again;
+	}
+	mutex_unlock(&lli->lli_layout_mutex);
+
+	RETURN(rc);
+}

diff --git a/drivers/staging/lustre/lustre/llite/llite_capa.c b/drivers/staging/lustre/lustre/llite/llite_capa.c
new file mode 100644
index 0000000..b6fd959
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/llite_capa.c

@@ -0,0 +1,661 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/llite/llite_capa.c
+ *
+ * Author: Lai Siyao <lsy@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <linux/fs.h>
+#include <linux/version.h>
+#include <asm/uaccess.h>
+#include <linux/file.h>
+#include <linux/kmod.h>
+
+#include <lustre_lite.h>
+#include "llite_internal.h"
+
+/* for obd_capa.c_list, client capa might stay in three places:
+ * 1. ll_capa_list.
+ * 2. ll_idle_capas.
+ * 3. stand alone: just allocated.
+ */
+
+/* capas for oss writeback and those failed to renew */
+static LIST_HEAD(ll_idle_capas);
+static struct ptlrpc_thread ll_capa_thread;
+static struct list_head *ll_capa_list = &capa_list[CAPA_SITE_CLIENT];
+
+/* llite capa renewal timer */
+struct timer_list ll_capa_timer;
+/* for debug: indicate whether capa on llite is enabled or not */
+static atomic_t ll_capa_debug = ATOMIC_INIT(0);
+static unsigned long long ll_capa_renewed = 0;
+static unsigned long long ll_capa_renewal_noent = 0;
+static unsigned long long ll_capa_renewal_failed = 0;
+static unsigned long long ll_capa_renewal_retries = 0;
+
+/*
+ * Re-arm ll_capa_timer to fire at @expiry when @expiry is earlier than the
+ * timer's current expiry, or when the timer is not pending at all; otherwise
+ * the timer is left untouched.  @ocapa is only used for the debug message.
+ */
+static inline void update_capa_timer(struct obd_capa *ocapa, cfs_time_t expiry)
+{
+	/* a pending timer that already fires no later than @expiry is kept */
+	if (!cfs_time_before(expiry, ll_capa_timer.expires) &&
+	    timer_pending(&ll_capa_timer))
+		return;
+
+	mod_timer(&ll_capa_timer, expiry);
+	DEBUG_CAPA(D_SEC, &ocapa->c_capa,
+		   "ll_capa_timer update: %lu/%lu by", expiry, jiffies);
+}
+
+/*
+ * Time at which @ocapa is due for renewal: its expiry minus half of the
+ * capability timeout (lc_timeout converted with cfs_time_seconds()).
+ */
+static inline cfs_time_t capa_renewal_time(struct obd_capa *ocapa)
+{
+	struct lustre_capa *capa = &ocapa->c_capa;
+
+	return cfs_time_sub(ocapa->c_expiry,
+			    cfs_time_seconds(capa->lc_timeout) / 2);
+}
+
+/* Return non-zero once the renewal time of @ocapa has been reached. */
+static inline int capa_is_to_expire(struct obd_capa *ocapa)
+{
+	cfs_time_t renew_at = capa_renewal_time(ocapa);
+
+	return cfs_time_beforeq(renew_at, cfs_time_current());
+}
+
+/*
+ * Wake-up condition of the capa thread.
+ *
+ * Looks at the first entry of ll_capa_list (capas to renew) or, when that
+ * list is empty, at the first entry of ll_idle_capas (capas to release).
+ * Returns 1 when that entry is due; otherwise re-arms the capa timer for the
+ * entry and returns 0.  Returns 0 when both lists are empty.
+ */
+static inline int have_expired_capa(void)
+{
+	struct obd_capa *ocapa = NULL;
+	int expired = 0;
+
+	/* if ll_capa_list has client capa to expire or ll_idle_capas has
+	 * expired capa, return 1.
+	 */
+	spin_lock(&capa_lock);
+	if (!list_empty(ll_capa_list)) {
+		ocapa = list_entry(ll_capa_list->next, struct obd_capa,
+				       c_list);
+		expired = capa_is_to_expire(ocapa);
+		if (!expired)
+			update_capa_timer(ocapa, capa_renewal_time(ocapa));
+	} else if (!list_empty(&ll_idle_capas)) {
+		ocapa = list_entry(ll_idle_capas.next, struct obd_capa,
+				       c_list);
+		expired = capa_is_expired(ocapa);
+		if (!expired)
+			update_capa_timer(ocapa, ocapa->c_expiry);
+	}
+
+	/* Log while capa_lock is still held: no reference is taken on ocapa
+	 * here, so it must not be dereferenced after the lock is dropped. */
+	if (expired)
+		DEBUG_CAPA(D_SEC, &ocapa->c_capa, "expired");
+	spin_unlock(&capa_lock);
+
+	return expired;
+}
+
+/*
+ * Insert @ocapa into @head, which is kept ordered by ascending c_expiry.
+ * The list is scanned from the tail and @ocapa is linked right after the
+ * last entry whose expiry is not later than its own, or at the front of the
+ * list when there is no such entry.
+ */
+static void sort_add_capa(struct obd_capa *ocapa, struct list_head *head)
+{
+	struct list_head *anchor = NULL;
+	struct obd_capa *cur;
+
+	/* TODO: client capa is sorted by expiry, this could be optimized */
+	list_for_each_entry_reverse(cur, head, c_list) {
+		if (!cfs_time_aftereq(ocapa->c_expiry, cur->c_expiry))
+			continue;
+
+		anchor = &cur->c_list;
+		break;
+	}
+
+	LASSERT(anchor != &ocapa->c_list);
+	if (anchor == NULL)
+		anchor = head;
+	list_add(&ocapa->c_list, anchor);
+}
+
+/* Current lli_open_count of the inode that client capa @oc is attached to. */
+static inline int obd_capa_open_count(struct obd_capa *oc)
+{
+	return atomic_read(&ll_i2info(oc->u.cli.inode)->lli_open_count);
+}
+
+/*
+ * Detach client capa @ocapa from its inode and from whichever global list it
+ * is on, decrement the client capa count and drop the allocation reference.
+ * Callers in this file invoke it with capa_lock held.
+ */
+static void ll_delete_capa(struct obd_capa *ocapa)
+{
+	struct lustre_capa *capa = &ocapa->c_capa;
+
+	if (capa_for_mds(capa)) {
+		struct ll_inode_info *lli = ll_i2info(ocapa->u.cli.inode);
+
+		LASSERT(lli->lli_mds_capa == ocapa);
+		lli->lli_mds_capa = NULL;
+	} else if (capa_for_oss(capa)) {
+		list_del_init(&ocapa->u.cli.lli_list);
+	}
+
+	DEBUG_CAPA(D_SEC, capa, "free client");
+	list_del_init(&ocapa->c_list);
+	capa_count[CAPA_SITE_CLIENT]--;
+
+	/* release the ref when alloc */
+	capa_put(ocapa);
+}
+
+/* three places where client capa is deleted:
+ * 1. capa_thread_main(), main place to delete expired capa.
+ * 2. ll_clear_inode_capas() in ll_clear_inode().
+ * 3. ll_truncate_free_capa() delete truncate capa explicitly in ll_setattr_ost().
+ */
+/*
+ * Body of the "ll_capa" kernel thread.
+ *
+ * Sleeps until have_expired_capa() reports work or the thread is no longer
+ * flagged running.  Each pass first walks ll_capa_list and, for every capa
+ * whose renewal time has come, either parks it on ll_idle_capas or asks the
+ * MDS to renew it; it then walks ll_idle_capas and releases expired capas
+ * nobody else references.  Sets SVC_STOPPED and wakes waiters on exit.
+ */
+static int capa_thread_main(void *unused)
+{
+	struct obd_capa *ocapa, *tmp, *next;
+	struct inode *inode = NULL;
+	struct l_wait_info lwi = { 0 };
+	int rc;
+	ENTRY;
+
+	/* tell ll_capa_thread_start() that the thread is up */
+	thread_set_flags(&ll_capa_thread, SVC_RUNNING);
+	wake_up(&ll_capa_thread.t_ctl_waitq);
+
+	while (1) {
+		l_wait_event(ll_capa_thread.t_ctl_waitq,
+			     !thread_is_running(&ll_capa_thread) ||
+			     have_expired_capa(),
+			     &lwi);
+
+		if (!thread_is_running(&ll_capa_thread))
+			break;
+
+		/* first capa on ll_capa_list that is not yet due, if any */
+		next = NULL;
+
+		spin_lock(&capa_lock);
+		list_for_each_entry_safe(ocapa, tmp, ll_capa_list, c_list) {
+			__u64 ibits;
+
+			LASSERT(ocapa->c_capa.lc_opc != CAPA_OPC_OSS_TRUNC);
+
+			/* list is sorted by expiry: nothing further is due */
+			if (!capa_is_to_expire(ocapa)) {
+				next = ocapa;
+				break;
+			}
+
+			list_del_init(&ocapa->c_list);
+
+			/* for MDS capability, only renew those which belong to
+			 * dir, or its inode is opened, or client holds LOOKUP
+			 * lock.
+			 */
+			/* ibits may be changed by ll_have_md_lock() so we have
+			 * to set it each time */
+			ibits = MDS_INODELOCK_LOOKUP;
+			if (capa_for_mds(&ocapa->c_capa) &&
+			    !S_ISDIR(ocapa->u.cli.inode->i_mode) &&
+			    obd_capa_open_count(ocapa) == 0 &&
+			    !ll_have_md_lock(ocapa->u.cli.inode,
+					     &ibits, LCK_MINMODE)) {
+				DEBUG_CAPA(D_SEC, &ocapa->c_capa,
+					   "skip renewal for");
+				sort_add_capa(ocapa, &ll_idle_capas);
+				continue;
+			}
+
+			/* for OSS capability, only renew those whose inode is
+			 * opened.
+			 */
+			if (capa_for_oss(&ocapa->c_capa) &&
+			    obd_capa_open_count(ocapa) == 0) {
+				/* oss capa with open count == 0 won't renew,
+				 * move to idle list */
+				sort_add_capa(ocapa, &ll_idle_capas);
+				continue;
+			}
+
+			/* NB iput() is in ll_update_capa() */
+			inode = igrab(ocapa->u.cli.inode);
+			if (inode == NULL) {
+				/* NOTE(review): the capa was unlinked from
+				 * ll_capa_list above and is not put back on
+				 * any list on this path */
+				DEBUG_CAPA(D_ERROR, &ocapa->c_capa,
+					   "igrab failed for");
+				continue;
+			}
+
+			/* hold the capa across the renewal; capa_lock is
+			 * dropped around md_renew_capa().
+			 * NOTE(review): tmp was sampled before the unlock --
+			 * confirm it cannot be unlinked in the meantime */
+			capa_get(ocapa);
+			ll_capa_renewed++;
+			spin_unlock(&capa_lock);
+			rc = md_renew_capa(ll_i2mdexp(inode), ocapa,
+					   ll_update_capa);
+			spin_lock(&capa_lock);
+			if (rc) {
+				DEBUG_CAPA(D_ERROR, &ocapa->c_capa,
+					   "renew failed: %d", rc);
+				ll_capa_renewal_failed++;
+			}
+		}
+
+		/* wake up again when the next capa needs renewing */
+		if (next)
+			update_capa_timer(next, capa_renewal_time(next));
+
+		list_for_each_entry_safe(ocapa, tmp, &ll_idle_capas,
+					     c_list) {
+			if (!capa_is_expired(ocapa)) {
+				/* only arm the timer for the idle list when
+				 * no renewal is scheduled */
+				if (!next)
+					update_capa_timer(ocapa,
+							  ocapa->c_expiry);
+				break;
+			}
+
+			/* someone besides the allocation ref still uses it */
+			if (atomic_read(&ocapa->c_refc) > 1) {
+				DEBUG_CAPA(D_SEC, &ocapa->c_capa,
+					   "expired(c_refc %d), don't release",
+					   atomic_read(&ocapa->c_refc));
+				/* don't try to renew any more */
+				list_del_init(&ocapa->c_list);
+				continue;
+			}
+
+			/* expired capa is released. */
+			DEBUG_CAPA(D_SEC, &ocapa->c_capa, "release expired");
+			ll_delete_capa(ocapa);
+		}
+
+		spin_unlock(&capa_lock);
+	}
+
+	/* tell ll_capa_thread_stop() that the thread is done */
+	thread_set_flags(&ll_capa_thread, SVC_STOPPED);
+	wake_up(&ll_capa_thread.t_ctl_waitq);
+	RETURN(0);
+}
+
+/*
+ * Timer handler for ll_capa_timer: wakes the capa thread so that it
+ * re-evaluates have_expired_capa().  The argument is ignored.
+ */
+void ll_capa_timer_callback(unsigned long unused)
+{
+	wake_up(&ll_capa_thread.t_ctl_waitq);
+}
+
+/*
+ * Start the "ll_capa" kernel thread and wait until it has flagged itself as
+ * running.  Returns 0 on success or the kthread_run() error code.
+ */
+int ll_capa_thread_start(void)
+{
+	task_t *task;
+	ENTRY;
+
+	init_waitqueue_head(&ll_capa_thread.t_ctl_waitq);
+
+	task = kthread_run(capa_thread_main, NULL, "ll_capa");
+	if (!IS_ERR(task)) {
+		/* do not return before the thread reports SVC_RUNNING */
+		wait_event(ll_capa_thread.t_ctl_waitq,
+			   thread_is_running(&ll_capa_thread));
+		RETURN(0);
+	}
+
+	CERROR("cannot start expired capa thread: rc %ld\n", PTR_ERR(task));
+	RETURN(PTR_ERR(task));
+}
+
+/*
+ * Ask the capa thread to exit: flag it SVC_STOPPING, wake it up, then block
+ * until thread_is_stopped() holds (capa_thread_main() sets SVC_STOPPED and
+ * wakes this wait queue on its way out).
+ */
+void ll_capa_thread_stop(void)
+{
+	thread_set_flags(&ll_capa_thread, SVC_STOPPING);
+	wake_up(&ll_capa_thread.t_ctl_waitq);
+	wait_event(ll_capa_thread.t_ctl_waitq,
+		       thread_is_stopped(&ll_capa_thread));
+}
+
+/*
+ * Look up an unexpired OSS capability of @inode that covers @opc.
+ *
+ * @opc must be CAPA_OPC_OSS_WRITE, CAPA_OPC_OSS_RW or CAPA_OPC_OSS_TRUNC.
+ * Returns the capability after calling capa_get() on it, or NULL when
+ * LL_SBI_OSS_CAPA is not set for the mount or nothing on lli_oss_capas
+ * matches.  A miss is reported once via CERROR while ll_capa_debug is set.
+ */
+struct obd_capa *ll_osscapa_get(struct inode *inode, __u64 opc)
+{
+	struct ll_inode_info *lli = ll_i2info(inode);
+	struct obd_capa *cur;
+	struct obd_capa *hit = NULL;
+
+	ENTRY;
+
+	if ((ll_i2sbi(inode)->ll_flags & LL_SBI_OSS_CAPA) == 0)
+		RETURN(NULL);
+
+	LASSERT(opc == CAPA_OPC_OSS_WRITE || opc == CAPA_OPC_OSS_RW ||
+		opc == CAPA_OPC_OSS_TRUNC);
+
+	spin_lock(&capa_lock);
+	list_for_each_entry(cur, &lli->lli_oss_capas, u.cli.lli_list) {
+		struct lustre_capa *capa = &cur->c_capa;
+
+		if (capa_is_expired(cur))
+			continue;
+
+		/* write is tried first, then read, then the exact opc */
+		if (((opc & CAPA_OPC_OSS_WRITE) &&
+		     capa_opc_supported(capa, CAPA_OPC_OSS_WRITE)) ||
+		    ((opc & CAPA_OPC_OSS_READ) &&
+		     capa_opc_supported(capa, CAPA_OPC_OSS_READ)) ||
+		    ((opc & CAPA_OPC_OSS_TRUNC) &&
+		     capa_opc_supported(capa, opc))) {
+			hit = cur;
+			break;
+		}
+	}
+
+	if (hit != NULL) {
+		LASSERT(lu_fid_eq(capa_fid(&hit->c_capa),
+				  ll_inode2fid(inode)));
+		LASSERT(hit->c_site == CAPA_SITE_CLIENT);
+
+		capa_get(hit);
+
+		DEBUG_CAPA(D_SEC, &hit->c_capa, "found client");
+	} else if (atomic_read(&ll_capa_debug)) {
+		CERROR("no capability for "DFID" opc "LPX64"\n",
+		       PFID(&lli->lli_fid), opc);
+		atomic_set(&ll_capa_debug, 0);
+	}
+	spin_unlock(&capa_lock);
+
+	RETURN(hit);
+}
+EXPORT_SYMBOL(ll_osscapa_get);
+
+/*
+ * Return the MDS capability attached to @inode, obtained through capa_get()
+ * under capa_lock, or NULL when LL_SBI_MDS_CAPA is not set for the mount or
+ * the inode has none.  A miss is reported once via CERROR while
+ * ll_capa_debug is set.
+ */
+struct obd_capa *ll_mdscapa_get(struct inode *inode)
+{
+	struct ll_inode_info *lli;
+	struct obd_capa *mds_capa;
+	ENTRY;
+
+	LASSERT(inode != NULL);
+
+	if ((ll_i2sbi(inode)->ll_flags & LL_SBI_MDS_CAPA) == 0)
+		RETURN(NULL);
+
+	lli = ll_i2info(inode);
+
+	spin_lock(&capa_lock);
+	mds_capa = capa_get(lli->lli_mds_capa);
+	spin_unlock(&capa_lock);
+
+	if (mds_capa == NULL && atomic_read(&ll_capa_debug)) {
+		CERROR("no mds capability for "DFID"\n", PFID(&lli->lli_fid));
+		atomic_set(&ll_capa_debug, 0);
+	}
+
+	RETURN(mds_capa);
+}
+
+/*
+ * Attach MDS capability @ocapa to @inode.
+ *
+ * When the inode has no MDS capa yet, @ocapa becomes it and is returned.
+ * Otherwise the contents of @ocapa are copied into the existing capa under
+ * its c_lock, @ocapa is put, and the existing capa is returned.
+ */
+static struct obd_capa *do_add_mds_capa(struct inode *inode,
+					struct obd_capa *ocapa)
+{
+	struct ll_inode_info *lli = ll_i2info(inode);
+	struct obd_capa *cur = lli->lli_mds_capa;
+
+	if (cur == NULL) {
+		ocapa->u.cli.inode = inode;
+		lli->lli_mds_capa = ocapa;
+		capa_count[CAPA_SITE_CLIENT]++;
+
+		DEBUG_CAPA(D_SEC, &ocapa->c_capa, "add MDS");
+		return ocapa;
+	}
+
+	spin_lock(&cur->c_lock);
+	cur->c_capa = ocapa->c_capa;
+	spin_unlock(&cur->c_lock);
+
+	DEBUG_CAPA(D_SEC, &ocapa->c_capa, "update MDS");
+
+	/* the new object only carried the payload; drop it */
+	capa_put(ocapa);
+	return cur;
+}
+
+/*
+ * Return the first capa on @inode's lli_oss_capas list whose opcode mask
+ * contains every bit of @opc, or NULL when there is none.  No reference is
+ * taken.  Must be called inside capa_lock.
+ */
+static struct obd_capa *do_lookup_oss_capa(struct inode *inode, int opc)
+{
+	struct ll_inode_info *lli = ll_i2info(inode);
+	struct obd_capa *cur;
+
+	list_for_each_entry(cur, &lli->lli_oss_capas, u.cli.lli_list) {
+		if ((capa_opc(&cur->c_capa) & opc) == opc) {
+			LASSERT(lu_fid_eq(capa_fid(&cur->c_capa),
+					  ll_inode2fid(inode)));
+			LASSERT(cur->c_site == CAPA_SITE_CLIENT);
+
+			DEBUG_CAPA(D_SEC, &cur->c_capa, "found client");
+			return cur;
+		}
+	}
+
+	return NULL;
+}
+
+/*
+ * (Re)position @ocapa on @inode's lli_oss_capas list.  It is moved in front
+ * of the first entry that expires earlier than it does, or to the tail when
+ * no entry does, so that a lookup from the head finds the latest one first.
+ */
+static inline void inode_add_oss_capa(struct inode *inode,
+				      struct obd_capa *ocapa)
+{
+	struct ll_inode_info *lli = ll_i2info(inode);
+	struct list_head *anchor = NULL;
+	struct obd_capa *cur;
+
+	list_for_each_entry(cur, &lli->lli_oss_capas, u.cli.lli_list) {
+		if (!cfs_time_after(ocapa->c_expiry, cur->c_expiry))
+			continue;
+
+		anchor = &cur->u.cli.lli_list;
+		break;
+	}
+
+	LASSERT(anchor != &ocapa->u.cli.lli_list);
+	if (anchor == NULL)
+		anchor = &lli->lli_oss_capas;
+	list_move_tail(&ocapa->u.cli.lli_list, anchor);
+}
+
+/*
+ * Attach OSS capability @ocapa to regular file @inode.
+ *
+ * When a capa covering the same OSS opcodes already exists, @ocapa's contents
+ * are copied into it under its c_lock, @ocapa is put and the existing capa is
+ * used; otherwise @ocapa itself is bound to the inode.  Either way the
+ * resulting capa is (re)sorted into lli_oss_capas and returned.
+ */
+static struct obd_capa *do_add_oss_capa(struct inode *inode,
+					struct obd_capa *ocapa)
+{
+	struct lustre_capa *capa = &ocapa->c_capa;
+	struct obd_capa *cur;
+
+	LASSERTF(S_ISREG(inode->i_mode),
+		 "inode has oss capa, but not regular file, mode: %d\n",
+		 inode->i_mode);
+
+	/* FIXME: can't replace it so easily with fine-grained opc */
+	cur = do_lookup_oss_capa(inode, capa_opc(capa) & CAPA_OPC_OSS_ONLY);
+	if (cur != NULL) {
+		spin_lock(&cur->c_lock);
+		cur->c_capa = *capa;
+		spin_unlock(&cur->c_lock);
+
+		DEBUG_CAPA(D_SEC, capa, "update OSS");
+
+		capa_put(ocapa);
+		ocapa = cur;
+	} else {
+		ocapa->u.cli.inode = inode;
+		INIT_LIST_HEAD(&ocapa->u.cli.lli_list);
+		capa_count[CAPA_SITE_CLIENT]++;
+
+		DEBUG_CAPA(D_SEC, capa, "add OSS");
+	}
+
+	inode_add_oss_capa(inode, ocapa);
+	return ocapa;
+}
+
+/*
+ * Register capability @ocapa for @inode, as an MDS or an OSS capa depending
+ * on capa_for_mds().  Unless it is a pure truncate capa, its expiry is set
+ * and it is sorted into ll_capa_list with the renewal timer updated.
+ * Returns the capa now attached to the inode, which may be a pre-existing
+ * one that absorbed @ocapa.  Sets ll_capa_debug to 1.
+ */
+struct obd_capa *ll_add_capa(struct inode *inode, struct obd_capa *ocapa)
+{
+	spin_lock(&capa_lock);
+
+	if (capa_for_mds(&ocapa->c_capa))
+		ocapa = do_add_mds_capa(inode, ocapa);
+	else
+		ocapa = do_add_oss_capa(inode, ocapa);
+
+	/* truncate capa won't renew */
+	if (ocapa->c_capa.lc_opc != CAPA_OPC_OSS_TRUNC) {
+		set_capa_expiry(ocapa);
+
+		/* unlink first: an updated capa may already be on a list */
+		list_del_init(&ocapa->c_list);
+		sort_add_capa(ocapa, ll_capa_list);
+
+		update_capa_timer(ocapa, capa_renewal_time(ocapa));
+	}
+
+	spin_unlock(&capa_lock);
+
+	atomic_set(&ll_capa_debug, 1);
+	return ocapa;
+}
+
+/*
+ * Push the expiry of @oc back by @delay seconds.
+ * NB: this sets a fake expiry to prevent the capa from renewing too soon.
+ */
+static inline void delay_capa_renew(struct obd_capa *oc, cfs_time_t delay)
+{
+	cfs_time_t postponed;
+
+	postponed = cfs_time_add(oc->c_expiry, cfs_time_seconds(delay));
+	oc->c_expiry = postponed;
+}
+
+/*
+ * Renewal completion for client capability @ocapa; passed to md_renew_capa()
+ * by capa_thread_main().
+ *
+ * @capa is either the renewed capability or an ERR_PTR() value:
+ *  - on success the new contents are copied into @ocapa, its expiry is
+ *    recomputed and it is re-sorted into ll_capa_list;
+ *  - on -EIO for a capa that has not expired yet, the expiry is pushed back
+ *    by 120 seconds and the capa goes back on ll_capa_list for a retry;
+ *  - on any other error the capa is moved to ll_idle_capas.
+ *
+ * In every case one reference on @ocapa and one on its inode are dropped
+ * (capa_put()/iput()), matching the capa_get()/igrab() done by the renewal
+ * thread.  Returns 0 on success or the error carried by @capa.
+ */
+int ll_update_capa(struct obd_capa *ocapa, struct lustre_capa *capa)
+{
+	struct inode *inode;
+	int rc = 0;
+	ENTRY;
+
+	/* check before the first dereference, otherwise the assertion can
+	 * never trigger */
+	LASSERT(ocapa);
+	inode = ocapa->u.cli.inode;
+
+	if (IS_ERR(capa)) {
+		/* set error code */
+		rc = PTR_ERR(capa);
+		spin_lock(&capa_lock);
+		if (rc == -ENOENT) {
+			DEBUG_CAPA(D_SEC, &ocapa->c_capa,
+				   "renewal canceled because object removed");
+			ll_capa_renewal_noent++;
+		} else {
+			ll_capa_renewal_failed++;
+
+			/* failed capa won't be renewed any longer, but if -EIO,
+			 * client might be doing recovery, retry in 2 min. */
+			if (rc == -EIO && !capa_is_expired(ocapa)) {
+				delay_capa_renew(ocapa, 120);
+				DEBUG_CAPA(D_ERROR, &ocapa->c_capa,
+					   "renewal failed: -EIO, "
+					   "retry in 2 mins");
+				ll_capa_renewal_retries++;
+				/* jumps with capa_lock still held */
+				GOTO(retry, rc);
+			} else {
+				DEBUG_CAPA(D_ERROR, &ocapa->c_capa,
+					   "renewal failed(rc: %d) for", rc);
+			}
+		}
+
+		/* park it; the capa thread releases it once it has expired */
+		list_del_init(&ocapa->c_list);
+		sort_add_capa(ocapa, &ll_idle_capas);
+		spin_unlock(&capa_lock);
+
+		capa_put(ocapa);
+		iput(inode);
+		RETURN(rc);
+	}
+
+	spin_lock(&ocapa->c_lock);
+	/* everything laid out before lc_opc must be unchanged by a renewal */
+	LASSERT(!memcmp(&ocapa->c_capa, capa,
+			offsetof(struct lustre_capa, lc_opc)));
+	ocapa->c_capa = *capa;
+	set_capa_expiry(ocapa);
+	spin_unlock(&ocapa->c_lock);
+
+	spin_lock(&capa_lock);
+	if (capa_for_oss(capa))
+		inode_add_oss_capa(inode, ocapa);
+	DEBUG_CAPA(D_SEC, capa, "renew");
+	EXIT;
+retry:
+	/* capa_lock is held on both ways in: put the capa back on the
+	 * renewal list according to its (possibly delayed) expiry */
+	list_del_init(&ocapa->c_list);
+	sort_add_capa(ocapa, ll_capa_list);
+	update_capa_timer(ocapa, capa_renewal_time(ocapa));
+	spin_unlock(&capa_lock);
+
+	capa_put(ocapa);
+	iput(inode);
+	return rc;
+}
+
+/*
+ * Account an open of @inode in lli_open_count.  Only done for regular files
+ * on mounts with MDS or OSS capabilities enabled; the count is what
+ * obd_capa_open_count() reads.
+ */
+void ll_capa_open(struct inode *inode)
+{
+	if ((ll_i2sbi(inode)->ll_flags &
+	     (LL_SBI_MDS_CAPA | LL_SBI_OSS_CAPA)) != 0 &&
+	    S_ISREG(inode->i_mode))
+		atomic_inc(&ll_i2info(inode)->lli_open_count);
+}
+
+/*
+ * Counterpart of ll_capa_open(): drop one open from lli_open_count for a
+ * regular file on a mount with MDS or OSS capabilities enabled.
+ */
+void ll_capa_close(struct inode *inode)
+{
+	if ((ll_i2sbi(inode)->ll_flags &
+	     (LL_SBI_MDS_CAPA | LL_SBI_OSS_CAPA)) != 0 &&
+	    S_ISREG(inode->i_mode))
+		atomic_dec(&ll_i2info(inode)->lli_open_count);
+}
+
+/*
+ * Release a truncate capability obtained by a lookup.  The lookup reference
+ * is always dropped; the capa itself is deleted only when its opcode is
+ * exactly CAPA_OPC_OSS_TRUNC.  A NULL @ocapa is ignored.
+ */
+void ll_truncate_free_capa(struct obd_capa *ocapa)
+{
+	if (ocapa == NULL)
+		return;
+
+	LASSERT(ocapa->c_capa.lc_opc & CAPA_OPC_OSS_TRUNC);
+	DEBUG_CAPA(D_SEC, &ocapa->c_capa, "free truncate");
+
+	/* release ref when find */
+	capa_put(ocapa);
+
+	/* a capa carrying more than the truncate opcode stays around */
+	if (unlikely(ocapa->c_capa.lc_opc != CAPA_OPC_OSS_TRUNC))
+		return;
+
+	spin_lock(&capa_lock);
+	ll_delete_capa(ocapa);
+	spin_unlock(&capa_lock);
+}
+
+/*
+ * Delete every capability attached to @inode: the MDS capa, if any, and all
+ * entries of lli_oss_capas.  Everything happens under capa_lock.
+ */
+void ll_clear_inode_capas(struct inode *inode)
+{
+	struct ll_inode_info *lli = ll_i2info(inode);
+	struct obd_capa *cur, *nxt;
+
+	spin_lock(&capa_lock);
+
+	if (lli->lli_mds_capa != NULL)
+		ll_delete_capa(lli->lli_mds_capa);
+
+	/* _safe variant: ll_delete_capa() unlinks the current entry */
+	list_for_each_entry_safe(cur, nxt, &lli->lli_oss_capas,
+				 u.cli.lli_list) {
+		ll_delete_capa(cur);
+	}
+
+	spin_unlock(&capa_lock);
+}
+
+/*
+ * Print the capability renewal counters of this file to the console, but
+ * only for mounts that have MDS or OSS capabilities enabled.
+ */
+void ll_print_capa_stat(struct ll_sb_info *sbi)
+{
+	if (!(sbi->ll_flags & (LL_SBI_MDS_CAPA | LL_SBI_OSS_CAPA)))
+		return;
+
+	LCONSOLE_INFO("Fid capabilities renewed: %llu\n"
+		      "Fid capabilities renewal ENOENT: %llu\n"
+		      "Fid capabilities failed to renew: %llu\n"
+		      "Fid capabilities renewal retries: %llu\n",
+		      ll_capa_renewed, ll_capa_renewal_noent,
+		      ll_capa_renewal_failed, ll_capa_renewal_retries);
+}

diff --git a/drivers/staging/lustre/lustre/llite/llite_close.c b/drivers/staging/lustre/lustre/llite/llite_close.c
new file mode 100644
index 0000000..00b2b38
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/llite_close.c

@@ -0,0 +1,412 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/llite/llite_close.c
+ *
+ * Lustre Lite routines to issue a secondary close after writeback
+ */
+
+#include <linux/module.h>
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <lustre_lite.h>
+#include "llite_internal.h"
+
+/**
+ * Records that a write is in flight: marks the inode LLIF_SOM_DIRTY and, when
+ * \a page is given and not yet linked, puts it on the object's pending list.
+ */
+void vvp_write_pending(struct ccc_object *club, struct ccc_page *page)
+{
+	struct ll_inode_info *lli = ll_i2info(club->cob_inode);
+
+	ENTRY;
+	spin_lock(&lli->lli_lock);
+	lli->lli_flags |= LLIF_SOM_DIRTY;
+	if (page != NULL) {
+		struct list_head *link = &page->cpg_pending_linkage;
+
+		if (list_empty(link))
+			list_add(link, &club->cob_pending_list);
+	}
+	spin_unlock(&lli->lli_lock);
+	EXIT;
+}
+
+/**
+ * Records that a write has completed: unlinks \a page from the pending list
+ * if it was on it and, in that case, gives DONE_WRITING a chance to be
+ * queued.
+ */
+void vvp_write_complete(struct ccc_object *club, struct ccc_page *page)
+{
+	struct ll_inode_info *lli = ll_i2info(club->cob_inode);
+	int unlinked = 0;
+
+	ENTRY;
+	spin_lock(&lli->lli_lock);
+	if (page != NULL) {
+		struct list_head *link = &page->cpg_pending_linkage;
+
+		if (!list_empty(link)) {
+			list_del_init(link);
+			unlinked = 1;
+		}
+	}
+	spin_unlock(&lli->lli_lock);
+
+	/* called without lli_lock: ll_queue_done_writing() takes it itself */
+	if (unlinked)
+		ll_queue_done_writing(club->cob_inode, 0);
+	EXIT;
+}
+
+/**
+ * ORs \a flags into the inode flags, then queues the inode for DONE_WRITING
+ * on the close thread if
+ * - done writing is allowed (LLIF_DONE_WRITING is set);
+ * - inode has no dirty pages (the pending list is empty).
+ */
+void ll_queue_done_writing(struct inode *inode, unsigned long flags)
+{
+	struct ll_inode_info *lli = ll_i2info(inode);
+	struct ccc_object *club = cl2ccc(lli->lli_clob);
+	struct ll_close_queue *lcq;
+	ENTRY;
+
+	spin_lock(&lli->lli_lock);
+	lli->lli_flags |= flags;
+
+	if (!(lli->lli_flags & LLIF_DONE_WRITING) ||
+	    !list_empty(&club->cob_pending_list))
+		goto unlock;
+
+	lcq = ll_i2sbi(inode)->ll_lcq;
+
+	if (lli->lli_flags & LLIF_MDS_SIZE_LOCK)
+		CWARN("ino %lu/%u(flags %u) som valid it just after "
+		      "recovery\n",
+		      inode->i_ino, inode->i_generation,
+		      lli->lli_flags);
+
+	/* DONE_WRITING is allowed and inode has no dirty page. */
+	spin_lock(&lcq->lcq_lock);
+
+	LASSERT(list_empty(&lli->lli_close_list));
+	CDEBUG(D_INODE, "adding inode %lu/%u to close list\n",
+	       inode->i_ino, inode->i_generation);
+	list_add_tail(&lli->lli_close_list, &lcq->lcq_head);
+
+	/* Avoid a concurrent insertion into the close thread queue:
+	 * an inode is already in the close thread, open(), write(),
+	 * close() happen, epoch is closed as the inode is marked as
+	 * LLIF_EPOCH_PENDING. When pages are written inode should not
+	 * be inserted into the queue again, clear this flag to avoid
+	 * it. */
+	lli->lli_flags &= ~LLIF_DONE_WRITING;
+
+	wake_up(&lcq->lcq_waitq);
+	spin_unlock(&lcq->lcq_lock);
+unlock:
+	spin_unlock(&lli->lli_lock);
+	EXIT;
+}
+
+/**
+ * Pack SOM attributes info @op_data for CLOSE, DONE_WRITING rpc: always flags
+ * MF_SOM_CHANGE and, when cl_local_size() returns 0, marks the time, size and
+ * blocks attributes as valid.
+ */
+void ll_done_writing_attr(struct inode *inode, struct md_op_data *op_data)
+{
+	struct ll_inode_info *lli = ll_i2info(inode);
+	ENTRY;
+
+	op_data->op_flags |= MF_SOM_CHANGE;
+
+	/* Check if Size-on-MDS attributes are valid. */
+	if (lli->lli_flags & LLIF_MDS_SIZE_LOCK)
+		CERROR("ino %lu/%u(flags %u) som valid it just after "
+		       "recovery\n", inode->i_ino, inode->i_generation,
+		       lli->lli_flags);
+
+	/* Send Size-on-MDS Attributes if valid. */
+	if (cl_local_size(inode) == 0)
+		op_data->op_attr.ia_valid |= ATTR_MTIME_SET | ATTR_CTIME_SET |
+					     ATTR_ATIME_SET | ATTR_SIZE |
+					     ATTR_BLOCKS;
+	EXIT;
+}
+
+/**
+ * Closes ioepoch and packs Size-on-MDS attribute if needed into @op_data.
+ *
+ * \param inode   inode whose epoch is being closed
+ * \param op_data request data; MF_EPOCH_CLOSE and the SOM attributes are
+ *                added to it when the epoch can be closed now
+ * \param och     in: open handle of the close being processed; when called
+ *                with LLIF_DONE_WRITING it is overwritten with the handle
+ *                stashed in lli_pending_och
+ * \param flags   LLIF_DONE_WRITING when called for a DONE_WRITING rpc,
+ *                0 otherwise
+ *
+ * lli_lock is taken on entry and released on every exit path.
+ */
+void ll_ioepoch_close(struct inode *inode, struct md_op_data *op_data,
+		      struct obd_client_handle **och, unsigned long flags)
+{
+	struct ll_inode_info *lli = ll_i2info(inode);
+	struct ccc_object *club = cl2ccc(ll_i2info(inode)->lli_clob);
+	ENTRY;
+
+	spin_lock(&lli->lli_lock);
+	/* pages are still pending: the epoch cannot be closed right now */
+	if (!(list_empty(&club->cob_pending_list))) {
+		if (!(lli->lli_flags & LLIF_EPOCH_PENDING)) {
+			LASSERT(*och != NULL);
+			LASSERT(lli->lli_pending_och == NULL);
+			/* Inode is dirty and there is no pending write done
+			 * request yet, DONE_WRITE is to be sent later. */
+			lli->lli_flags |= LLIF_EPOCH_PENDING;
+			lli->lli_pending_och = *och;
+			spin_unlock(&lli->lli_lock);
+
+			/* extra inode reference for the deferred DONE_WRITE;
+			 * NOTE(review): presumably dropped by the iput() in
+			 * ll_close_thread() -- confirm */
+			inode = igrab(inode);
+			LASSERT(inode);
+			GOTO(out, 0);
+		}
+		if (flags & LLIF_DONE_WRITING) {
+			/* Some pages are still dirty, it is early to send
+			 * DONE_WRITE. Wait until all pages will be flushed
+			 * and try DONE_WRITE again later. */
+			LASSERT(!(lli->lli_flags & LLIF_DONE_WRITING));
+			lli->lli_flags |= LLIF_DONE_WRITING;
+			spin_unlock(&lli->lli_lock);
+
+			inode = igrab(inode);
+			LASSERT(inode);
+			GOTO(out, 0);
+		}
+	}
+	CDEBUG(D_INODE, "Epoch "LPU64" closed on "DFID"\n",
+	       ll_i2info(inode)->lli_ioepoch, PFID(&lli->lli_fid));
+	op_data->op_flags |= MF_EPOCH_CLOSE;
+
+	if (flags & LLIF_DONE_WRITING) {
+		LASSERT(lli->lli_flags & LLIF_SOM_DIRTY);
+		LASSERT(!(lli->lli_flags & LLIF_DONE_WRITING));
+		/* hand the stashed open handle back to the caller */
+		*och = lli->lli_pending_och;
+		lli->lli_pending_och = NULL;
+		lli->lli_flags &= ~LLIF_EPOCH_PENDING;
+	} else {
+		/* Pack Size-on-MDS inode attributes only if they have
+		 * changed */
+		if (!(lli->lli_flags & LLIF_SOM_DIRTY)) {
+			spin_unlock(&lli->lli_lock);
+			GOTO(out, 0);
+		}
+
+		/* There is a pending DONE_WRITE -- close epoch with no
+		 * attribute change. */
+		if (lli->lli_flags & LLIF_EPOCH_PENDING) {
+			spin_unlock(&lli->lli_lock);
+			GOTO(out, 0);
+		}
+	}
+
+	/* nothing is pending any more: attributes can be packed */
+	LASSERT(list_empty(&club->cob_pending_list));
+	lli->lli_flags &= ~LLIF_SOM_DIRTY;
+	spin_unlock(&lli->lli_lock);
+	ll_done_writing_attr(inode, op_data);
+
+	EXIT;
+out:
+	return;
+}
+
+/**
+ * Client updates SOM attributes on MDS (including llog cookies):
+ * obd_getattr with no lock and md_setattr.
+ *
+ * The getattr step is skipped when the inode has already moved on to another
+ * ioepoch than the one recorded in \a op_data.  Returns the md_setattr()
+ * result, or -ENOMEM when the obdo cannot be allocated.
+ */
+int ll_som_update(struct inode *inode, struct md_op_data *op_data)
+{
+	struct ll_inode_info *lli = ll_i2info(inode);
+	struct ptlrpc_request *request = NULL;
+	__u32 saved_flags;
+	struct obdo *oa;
+	int rc;
+	ENTRY;
+
+	LASSERT(op_data != NULL);
+	if (lli->lli_flags & LLIF_MDS_SIZE_LOCK)
+		CERROR("ino %lu/%u(flags %u) som valid it just after "
+		       "recovery\n", inode->i_ino, inode->i_generation,
+		       lli->lli_flags);
+
+	OBDO_ALLOC(oa);
+	if (oa == NULL) {
+		CERROR("can't allocate memory for Size-on-MDS update.\n");
+		RETURN(-ENOMEM);
+	}
+
+	/* the caller's flags are only consulted for MF_GETATTR_LOCK below */
+	saved_flags = op_data->op_flags;
+	op_data->op_flags = MF_SOM_CHANGE;
+
+	/* If inode is already in another epoch, skip getattr from OSTs. */
+	if (lli->lli_ioepoch == op_data->op_ioepoch) {
+		rc = ll_inode_getattr(inode, oa, op_data->op_ioepoch,
+				      saved_flags & MF_GETATTR_LOCK);
+		if (rc == 0) {
+			CDEBUG(D_INODE, "Size-on-MDS update on "DFID"\n",
+			       PFID(&lli->lli_fid));
+		} else {
+			/* nothing valid to install into op_data */
+			oa->o_valid = 0;
+			if (rc != -ENOENT)
+				CERROR("inode_getattr failed (%d): unable to "
+				       "send a Size-on-MDS attribute update "
+				       "for inode %lu/%u\n", rc, inode->i_ino,
+				       inode->i_generation);
+		}
+
+		/* Install attributes into op_data. */
+		md_from_obdo(op_data, oa, oa->o_valid);
+	}
+
+	rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data,
+			NULL, 0, NULL, 0, &request, NULL);
+	ptlrpc_req_finished(request);
+
+	OBDO_FREE(oa);
+	RETURN(rc);
+}
+
+/**
+ * Closes the ioepoch and packs all the attributes into @op_data for
+ * DONE_WRITING rpc.  When the epoch close leaves *och NULL, nothing further
+ * is packed.
+ */
+static void ll_prepare_done_writing(struct inode *inode,
+				    struct md_op_data *op_data,
+				    struct obd_client_handle **och)
+{
+	ll_ioepoch_close(inode, op_data, och, LLIF_DONE_WRITING);
+
+	/* If there is no @och, we do not do D_W yet. */
+	if (*och != NULL) {
+		ll_pack_inode2opdata(inode, op_data, &(*och)->och_fh);
+		ll_prep_md_op_data(op_data, inode, NULL, NULL,
+				   0, 0, LUSTRE_OPC_ANY, NULL);
+	}
+}
+
+/**
+ * Send a DONE_WRITING rpc for \a inode.  If the MDS answers -EAGAIN, the
+ * Size-on-MDS attributes are fetched and sent with ll_som_update() instead.
+ * The open handle taken over from the inode, if any, is freed here.
+ */
+static void ll_done_writing(struct inode *inode)
+{
+	struct obd_client_handle *och = NULL;
+	struct md_op_data *op_data;
+	int rc;
+	ENTRY;
+
+	LASSERT(exp_connect_som(ll_i2mdexp(inode)));
+
+	OBD_ALLOC_PTR(op_data);
+	if (op_data == NULL) {
+		CERROR("can't allocate op_data\n");
+		EXIT;
+		return;
+	}
+
+	ll_prepare_done_writing(inode, op_data, &och);
+	/* If there is no @och, we do not do D_W yet. */
+	if (och == NULL)
+		GOTO(out, 0);
+
+	rc = md_done_writing(ll_i2sbi(inode)->ll_md_exp, op_data, NULL);
+	switch (rc) {
+	case 0:
+		break;
+	case -EAGAIN:
+		/* MDS has instructed us to obtain Size-on-MDS attribute from
+		 * OSTs and send setattr to back to MDS. */
+		rc = ll_som_update(inode, op_data);
+		break;
+	default:
+		CERROR("inode %lu mdc done_writing failed: rc = %d\n",
+		       inode->i_ino, rc);
+		break;
+	}
+out:
+	ll_finish_md_op_data(op_data);
+	if (och != NULL) {
+		md_clear_open_replay_data(ll_i2sbi(inode)->ll_md_exp, och);
+		OBD_FREE_PTR(och);
+	}
+	EXIT;
+}
+
+/*
+ * Dequeue the next inode info from the close queue.  Returns it, or
+ * ERR_PTR(-EALREADY) when the queue is empty and lcq_stop is set, or NULL
+ * when the queue is empty and the thread should keep waiting.
+ */
+static struct ll_inode_info *ll_close_next_lli(struct ll_close_queue *lcq)
+{
+	struct ll_inode_info *next = NULL;
+
+	spin_lock(&lcq->lcq_lock);
+
+	if (list_empty(&lcq->lcq_head)) {
+		/* only report the stop once the queue has been drained */
+		if (atomic_read(&lcq->lcq_stop))
+			next = ERR_PTR(-EALREADY);
+	} else {
+		next = list_entry(lcq->lcq_head.next, struct ll_inode_info,
+				  lli_close_list);
+		list_del_init(&next->lli_close_list);
+	}
+
+	spin_unlock(&lcq->lcq_lock);
+	return next;
+}
+
+/*
+ * Body of the "ll_close" kernel thread.  Signals lcq_comp once on start-up,
+ * then sends DONE_WRITING for every inode taken off the close queue until
+ * ll_close_next_lli() reports an error pointer, and signals lcq_comp again
+ * before exiting.
+ */
+static int ll_close_thread(void *arg)
+{
+	struct ll_close_queue *lcq = arg;
+	ENTRY;
+
+	complete(&lcq->lcq_comp);
+
+	for (;;) {
+		struct l_wait_info lwi = { 0 };
+		struct ll_inode_info *lli;
+		struct inode *victim;
+
+		l_wait_event_exclusive(lcq->lcq_waitq,
+				       (lli = ll_close_next_lli(lcq)) != NULL,
+				       &lwi);
+		if (IS_ERR(lli))
+			break;
+
+		victim = ll_info2i(lli);
+		CDEBUG(D_INFO, "done_writting for inode %lu/%u\n",
+		       victim->i_ino, victim->i_generation);
+		ll_done_writing(victim);
+		iput(victim);
+	}
+
+	CDEBUG(D_INFO, "ll_close exiting\n");
+	complete(&lcq->lcq_comp);
+	RETURN(0);
+}
+
+/*
+ * Allocate and initialise a close queue, start its "ll_close" thread and
+ * wait for the thread's start-up completion.  On success the queue is stored
+ * in *lcq_ret and 0 is returned; otherwise a negative errno is returned and
+ * *lcq_ret is left untouched.
+ */
+int ll_close_thread_start(struct ll_close_queue **lcq_ret)
+{
+	struct ll_close_queue *lcq;
+	task_t *task;
+	int rc;
+
+	if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CLOSE_THREAD))
+		return -EINTR;
+
+	OBD_ALLOC(lcq, sizeof(*lcq));
+	if (lcq == NULL)
+		return -ENOMEM;
+
+	spin_lock_init(&lcq->lcq_lock);
+	INIT_LIST_HEAD(&lcq->lcq_head);
+	init_waitqueue_head(&lcq->lcq_waitq);
+	init_completion(&lcq->lcq_comp);
+
+	task = kthread_run(ll_close_thread, lcq, "ll_close");
+	if (IS_ERR(task)) {
+		rc = PTR_ERR(task);
+		OBD_FREE(lcq, sizeof(*lcq));
+		return rc;
+	}
+
+	/* the thread completes lcq_comp as its first action */
+	wait_for_completion(&lcq->lcq_comp);
+	*lcq_ret = lcq;
+	return 0;
+}
+
+/*
+ * Stop the close thread of @lcq and free the queue: re-arm the completion,
+ * raise lcq_stop, wake the thread and wait until it signals lcq_comp on its
+ * way out (see ll_close_thread()).
+ */
+void ll_close_thread_shutdown(struct ll_close_queue *lcq)
+{
+	/* re-initialised so the wait below pairs with the exit-time complete() */
+	init_completion(&lcq->lcq_comp);
+	atomic_inc(&lcq->lcq_stop);
+	wake_up(&lcq->lcq_waitq);
+	wait_for_completion(&lcq->lcq_comp);
+	OBD_FREE(lcq, sizeof(*lcq));
+}

diff --git a/drivers/staging/lustre/lustre/llite/llite_internal.h b/drivers/staging/lustre/lustre/llite/llite_internal.h
new file mode 100644
index 0000000..992cd20
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/llite_internal.h

@@ -0,0 +1,1576 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef LLITE_INTERNAL_H
+#define LLITE_INTERNAL_H
+#include <lustre_debug.h>
+#include <lustre_ver.h>
+#include <lustre_disk.h>  /* for s2sbi */
+#include <lustre_eacl.h>
+
+/* for struct cl_lock_descr and struct cl_io */
+#include <cl_object.h>
+#include <lclient.h>
+#include <lustre_mdc.h>
+#include <linux/lustre_intent.h>
+
+#ifndef FMODE_EXEC
+#define FMODE_EXEC 0
+#endif
+
+#ifndef VM_FAULT_RETRY
+#define VM_FAULT_RETRY 0
+#endif
+
+/* Kernel 3.1 kills LOOKUP_CONTINUE, LOOKUP_PARENT is equivalent to it.
+ * see kernel commit 49084c3bb2055c401f3493c13edae14d49128ca0 */
+#ifndef LOOKUP_CONTINUE
+#define LOOKUP_CONTINUE LOOKUP_PARENT
+#endif
+
+/** Only used on client-side for indicating the tail of dir hash/offset. */
+#define LL_DIR_END_OFF	  0x7fffffffffffffffULL
+#define LL_DIR_END_OFF_32BIT    0x7fffffffUL
+
+#define LL_IT2STR(it) ((it) ? ldlm_it2str((it)->it_op) : "0")
+#define LUSTRE_FPRIVATE(file) ((file)->private_data)
+
+struct ll_dentry_data {
+	int				lld_cwd_count;
+	int				lld_mnt_count;
+	struct obd_client_handle	lld_cwd_och;
+	struct obd_client_handle	lld_mnt_och;
+	struct lookup_intent		*lld_it;
+	unsigned int			lld_sa_generation;
+	unsigned int			lld_invalid:1;
+	struct rcu_head			lld_rcu_head;
+};
+
+#define ll_d2d(de) ((struct ll_dentry_data*)((de)->d_fsdata))
+
+extern struct file_operations ll_pgcache_seq_fops;
+
+#define LLI_INODE_MAGIC		 0x111d0de5
+#define LLI_INODE_DEAD		  0xdeadd00d
+
+/* remote client permission cache */
+#define REMOTE_PERM_HASHSIZE 16
+
+struct ll_getname_data {
+	char	    *lgd_name;      /* points to a buffer with NAME_MAX+1 size */
+	struct lu_fid    lgd_fid;       /* target fid we are looking for */
+	int	      lgd_found;     /* inode matched? */
+};
+
+/* llite setxid/access permission for user on remote client */
+struct ll_remote_perm {
+	struct hlist_node	lrp_list;
+	uid_t		   lrp_uid;
+	gid_t		   lrp_gid;
+	uid_t		   lrp_fsuid;
+	gid_t		   lrp_fsgid;
+	int		     lrp_access_perm; /* MAY_READ/WRITE/EXEC, this
+						    is access permission with
+						    lrp_fsuid/lrp_fsgid. */
+};
+
+/*
+ * Per-inode state bits, one bit per enumerator.
+ * NOTE(review): presumably stored in ll_inode_info::lli_flags -- confirm
+ * against the users of these constants.
+ */
+enum lli_flags {
+	/* MDS has an authority for the Size-on-MDS attributes. */
+	LLIF_MDS_SIZE_LOCK      = (1 << 0),
+	/* Epoch close is postponed. */
+	LLIF_EPOCH_PENDING      = (1 << 1),
+	/* DONE WRITING is allowed. */
+	LLIF_DONE_WRITING       = (1 << 2),
+	/* Size-on-MDS attributes are changed. An attribute update needs to
+	 * be sent to MDS. */
+	LLIF_SOM_DIRTY	  = (1 << 3),
+	/* File is contended */
+	LLIF_CONTENDED	  = (1 << 4),
+	/* Truncate uses server lock for this file */
+	LLIF_SRVLOCK	    = (1 << 5),
+	/* File data is modified. */
+	LLIF_DATA_MODIFIED      = (1 << 6),
+};
+
+struct ll_inode_info {
+	__u32				lli_inode_magic;
+	__u32				lli_flags;
+	__u64				lli_ioepoch;
+
+	spinlock_t			lli_lock;
+	struct posix_acl		*lli_posix_acl;
+
+	struct hlist_head		*lli_remote_perms;
+	struct mutex				lli_rmtperm_mutex;
+
+	/* identifying fields for both metadata and data stacks. */
+	struct lu_fid		   lli_fid;
+	/* Parent fid for accessing default stripe data on parent directory
+	 * for allocating OST objects after a mknod() and later open-by-FID. */
+	struct lu_fid		   lli_pfid;
+
+	struct list_head		      lli_close_list;
+	struct list_head		      lli_oss_capas;
+	/* open count currently used by capability only, indicate whether
+	 * capability needs renewal */
+	atomic_t		    lli_open_count;
+	struct obd_capa		*lli_mds_capa;
+	cfs_time_t		      lli_rmtperm_time;
+
+	/* handle is to be sent to MDS later on done_writing and setattr.
+	 * Open handle data are needed for the recovery to reconstruct
+	 * the inode state on the MDS. XXX: recovery is not ready yet. */
+	struct obd_client_handle       *lli_pending_och;
+
+	/* We need all three because every inode may be opened in different
+	 * modes */
+	struct obd_client_handle       *lli_mds_read_och;
+	struct obd_client_handle       *lli_mds_write_och;
+	struct obd_client_handle       *lli_mds_exec_och;
+	__u64			   lli_open_fd_read_count;
+	__u64			   lli_open_fd_write_count;
+	__u64			   lli_open_fd_exec_count;
+	/* Protects access to och pointers and their usage counters */
+	struct mutex			lli_och_mutex;
+
+	struct inode			lli_vfs_inode;
+
+	/* the most recent timestamps obtained from mds */
+	struct ost_lvb			lli_lvb;
+	spinlock_t			lli_agl_lock;
+
+	/* Try to make the d::member and f::member are aligned. Before using
+	 * these members, make clear whether it is directory or not. */
+	union {
+		/* for directory */
+		struct {
+			/* serialize normal readdir and statahead-readdir. */
+			struct mutex			d_readdir_mutex;
+
+			/* metadata statahead */
+			/* since parent-child threads can share the same @file
+			 * struct, "opendir_key" is the token when dir close for
+			 * case of parent exit before child -- it is me should
+			 * cleanup the dir readahead. */
+			void			   *d_opendir_key;
+			struct ll_statahead_info       *d_sai;
+			struct posix_acl	       *d_def_acl;
+			/* protect statahead stuff. */
+			spinlock_t			d_sa_lock;
+			/* "opendir_pid" is the token when lookup/revalid
+			 * -- I am the owner of dir statahead. */
+			pid_t			   d_opendir_pid;
+		} d;
+
+#define lli_readdir_mutex       u.d.d_readdir_mutex
+#define lli_opendir_key	 u.d.d_opendir_key
+#define lli_sai		 u.d.d_sai
+#define lli_def_acl	     u.d.d_def_acl
+#define lli_sa_lock	     u.d.d_sa_lock
+#define lli_opendir_pid	 u.d.d_opendir_pid
+
+		/* for non-directory */
+		struct {
+			struct semaphore		f_size_sem;
+			void				*f_size_sem_owner;
+			char				*f_symlink_name;
+			__u64				f_maxbytes;
+			/*
+			 * struct rw_semaphore {
+			 *    signed long	count;     // align d.d_def_acl
+			 *    spinlock_t	wait_lock; // align d.d_sa_lock
+			 *    struct list_head wait_list;
+			 * }
+			 */
+			struct rw_semaphore		f_trunc_sem;
+			struct mutex			f_write_mutex;
+
+			struct rw_semaphore		f_glimpse_sem;
+			cfs_time_t			f_glimpse_time;
+			struct list_head			f_agl_list;
+			__u64				f_agl_index;
+
+			/* for writepage() only to communicate to fsync */
+			int				f_async_rc;
+
+			/* volatile file criteria is based on file name, this
+			 * flag is used to keep the test result, so the strcmp
+			 * is done only once
+			 */
+			bool				f_volatile;
+			/*
+			 * whenever a process tries to read/write the file, the
+			 * jobid of the process will be saved here, and it'll
+			 * be packed into the write RPC when flushed later.
+			 *
+			 * so the read/write statistics for jobid will not be
+			 * accurate if the file is shared by different jobs.
+			 */
+			char		     f_jobid[JOBSTATS_JOBID_SIZE];
+		} f;
+
+#define lli_size_sem	    u.f.f_size_sem
+#define lli_size_sem_owner      u.f.f_size_sem_owner
+#define lli_symlink_name	u.f.f_symlink_name
+#define lli_maxbytes	    u.f.f_maxbytes
+#define lli_trunc_sem	   u.f.f_trunc_sem
+#define lli_write_mutex	 u.f.f_write_mutex
+#define lli_glimpse_sem		u.f.f_glimpse_sem
+#define lli_glimpse_time	u.f.f_glimpse_time
+#define lli_agl_list		u.f.f_agl_list
+#define lli_agl_index		u.f.f_agl_index
+#define lli_async_rc		u.f.f_async_rc
+#define lli_jobid		u.f.f_jobid
+#define lli_volatile		u.f.f_volatile
+
+	} u;
+
+	/* XXX: For following frequent used members, although they maybe special
+	 *      used for non-directory object, it is some time-wasting to check
+	 *      whether the object is directory or not before using them. On the
+	 *      other hand, currently, sizeof(f) > sizeof(d), it cannot reduce
+	 *      the "ll_inode_info" size even if moving those members into u.f.
+	 *      So keep them out side.
+	 *
+	 *      In the future, if more members are added only for directory,
+	 *      some of the following members can be moved into u.f.
+	 */
+	bool			    lli_has_smd;
+	struct cl_object	       *lli_clob;
+
+	/* mutex to request for layout lock exclusively. */
+	struct mutex			lli_layout_mutex;
+	/* valid only inside LAYOUT ibits lock, protected by lli_layout_mutex */
+	__u32				lli_layout_gen;
+};
+
+/*
+ * Locking to guarantee consistency of non-atomic updates to long long i_size,
+ * consistency between file size and KMS.
+ *
+ * Implemented by ->lli_size_sem and ->lsm_lock, nested in that order.
+ */
+
+void ll_inode_size_lock(struct inode *inode);
+void ll_inode_size_unlock(struct inode *inode);
+
+// FIXME: replace the name of this with LL_I to conform to kernel stuff
+// static inline struct ll_inode_info *LL_I(struct inode *inode)
+/*
+ * Return the ll_inode_info that embeds @inode as its lli_vfs_inode member.
+ * @inode must really be embedded in an ll_inode_info; nothing is checked.
+ */
+static inline struct ll_inode_info *ll_i2info(struct inode *inode)
+{
+	struct ll_inode_info *lli;
+
+	lli = container_of(inode, struct ll_inode_info, lli_vfs_inode);
+	return lli;
+}
+
+/* default to about 40meg of readahead on a given system.  That much tied
+ * up in 512k readahead requests serviced at 40ms each is about 1GB/s. */
+#define SBI_DEFAULT_READAHEAD_MAX (40UL << (20 - PAGE_CACHE_SHIFT))
+
+/* default to read-ahead full files smaller than 2MB on the second read */
+#define SBI_DEFAULT_READAHEAD_WHOLE_MAX (2UL << (20 - PAGE_CACHE_SHIFT))
+
+enum ra_stat {
+	RA_STAT_HIT = 0,
+	RA_STAT_MISS,
+	RA_STAT_DISTANT_READPAGE,
+	RA_STAT_MISS_IN_WINDOW,
+	RA_STAT_FAILED_GRAB_PAGE,
+	RA_STAT_FAILED_MATCH,
+	RA_STAT_DISCARDED,
+	RA_STAT_ZERO_LEN,
+	RA_STAT_ZERO_WINDOW,
+	RA_STAT_EOF,
+	RA_STAT_MAX_IN_FLIGHT,
+	RA_STAT_WRONG_GRAB_PAGE,
+	_NR_RA_STAT,
+};
+
+struct ll_ra_info {
+	atomic_t	      ra_cur_pages;
+	unsigned long	     ra_max_pages;
+	unsigned long	     ra_max_pages_per_file;
+	unsigned long	     ra_max_read_ahead_whole_pages;
+};
+
+/* ra_io_arg will be filled in the beginning of ll_readahead with
+ * ras_lock, then the following ll_read_ahead_pages will read RA
+ * pages according to this arg, all the items in this structure are
+ * counted by page index.
+ */
+struct ra_io_arg {
+	unsigned long ria_start;  /* start offset of read-ahead*/
+	unsigned long ria_end;    /* end offset of read-ahead*/
+	/* If stride read pattern is detected, ria_stoff means where
+	 * stride read is started. Note: for normal read-ahead, the
+	 * value here is meaningless, and also it will not be accessed*/
+	pgoff_t ria_stoff;
+	/* ria_length and ria_pages are the length and pages length in the
+	 * stride I/O mode. And they will also be used to check whether
+	 * it is stride I/O read-ahead in the read-ahead pages*/
+	unsigned long ria_length;
+	unsigned long ria_pages;
+};
+
+/* LL_HIST_MAX=32 causes an overflow */
+#define LL_HIST_MAX 28
+#define LL_HIST_START 12 /* buckets start at 2^12 = 4k */
+#define LL_PROCESS_HIST_MAX 10
+struct per_process_info {
+	pid_t pid;
+	struct obd_histogram pp_r_hist;
+	struct obd_histogram pp_w_hist;
+};
+
+/* pp_extents[LL_PROCESS_HIST_MAX] will hold the combined process info */
+struct ll_rw_extents_info {
+	struct per_process_info pp_extents[LL_PROCESS_HIST_MAX + 1];
+};
+
+#define LL_OFFSET_HIST_MAX 100
+struct ll_rw_process_info {
+	pid_t		     rw_pid;
+	int		       rw_op;
+	loff_t		    rw_range_start;
+	loff_t		    rw_range_end;
+	loff_t		    rw_last_file_pos;
+	loff_t		    rw_offset;
+	size_t		    rw_smallest_extent;
+	size_t		    rw_largest_extent;
+	struct ll_file_data      *rw_last_file;
+};
+
+enum stats_track_type {
+	STATS_TRACK_ALL = 0,  /* track all processes */
+	STATS_TRACK_PID,      /* track process with this pid */
+	STATS_TRACK_PPID,     /* track processes with this ppid */
+	STATS_TRACK_GID,      /* track processes with this gid */
+	STATS_TRACK_LAST,
+};
+
+/* flags for sbi->ll_flags */
+#define LL_SBI_NOLCK	     0x01 /* DLM locking disabled (directio-only) */
+#define LL_SBI_CHECKSUM	  0x02 /* checksum each page as it's written */
+#define LL_SBI_FLOCK	     0x04
+#define LL_SBI_USER_XATTR	0x08 /* support user xattr */
+#define LL_SBI_ACL	       0x10 /* support ACL */
+#define LL_SBI_RMT_CLIENT	0x40 /* remote client */
+#define LL_SBI_MDS_CAPA	  0x80 /* support mds capa */
+#define LL_SBI_OSS_CAPA	 0x100 /* support oss capa */
+#define LL_SBI_LOCALFLOCK       0x200 /* Local flocks support by kernel */
+#define LL_SBI_LRU_RESIZE       0x400 /* lru resize support */
+#define LL_SBI_LAZYSTATFS       0x800 /* lazystatfs mount option */
+#define LL_SBI_SOM_PREVIEW     0x1000 /* SOM preview mount option */
+#define LL_SBI_32BIT_API       0x2000 /* generate 32 bit inodes. */
+#define LL_SBI_64BIT_HASH      0x4000 /* support 64-bits dir hash/offset */
+#define LL_SBI_AGL_ENABLED     0x8000 /* enable agl */
+#define LL_SBI_VERBOSE	0x10000 /* verbose mount/umount */
+#define LL_SBI_LAYOUT_LOCK    0x20000 /* layout lock support */
+#define LL_SBI_USER_FID2PATH  0x40000 /* allow fid2path by unprivileged users */
+
+/*
+ * Printable names for the LL_SBI_* flags, meant to be indexed by bit
+ * position: entry i names flag (1 << i).
+ *
+ * No LL_SBI_* flag uses bit 5 (0x20), so that slot holds a "???"
+ * placeholder.  Without it every name from "rmt_client" onwards is
+ * reported for the wrong bit and the highest flag has no name at all.
+ * When adding a flag, add its name at the matching index.
+ *
+ * NOTE(review): this assumes consumers walk ll_flags bit by bit and use
+ * the bit number as the index -- confirm against the code that expands
+ * this macro.
+ */
+#define LL_SBI_FLAGS {	\
+	"nolck",	\
+	"checksum",	\
+	"flock",	\
+	"xattr",	\
+	"acl",		\
+	"???",		\
+	"rmt_client",	\
+	"mds_capa",	\
+	"oss_capa",	\
+	"flock",	\
+	"lru_resize",	\
+	"lazy_statfs",	\
+	"som",		\
+	"32bit_api",	\
+	"64bit_hash",	\
+	"agl",		\
+	"verbose",	\
+	"layout",	\
+	"user_fid2path" }
+
+/* default value for ll_sb_info->contention_time */
+#define SBI_DEFAULT_CONTENTION_SECONDS     60
+/* default value for lockless_truncate_enable */
+#define SBI_DEFAULT_LOCKLESS_TRUNCATE_ENABLE 1
+#define RCE_HASHES      32
+
+struct rmtacl_ctl_entry {
+	struct list_head       rce_list;
+	pid_t	    rce_key; /* hash key */
+	int	      rce_ops; /* acl operation type */
+};
+
+struct rmtacl_ctl_table {
+	spinlock_t	rct_lock;
+	struct list_head	rct_entries[RCE_HASHES];
+};
+
+#define EE_HASHES       32
+
+struct eacl_entry {
+	struct list_head	    ee_list;
+	pid_t		 ee_key; /* hash key */
+	struct lu_fid	 ee_fid;
+	int		   ee_type; /* ACL type for ACCESS or DEFAULT */
+	ext_acl_xattr_header *ee_acl;
+};
+
+struct eacl_table {
+	spinlock_t	et_lock;
+	struct list_head	et_entries[EE_HASHES];
+};
+
+/*
+ * Per-superblock llite state: export handles, flags, read-ahead settings,
+ * statistics and the close queue.
+ *
+ * ll_umounting is an unsigned one-bit field on purpose: a signed one-bit
+ * bit-field can only represent 0 and -1, so storing 1 in it and later
+ * comparing against 1 would not behave as expected.
+ */
+struct ll_sb_info {
+	struct list_head		  ll_list;
+	/* this protects pglist and ra_info.  It isn't safe to
+	 * grab from interrupt contexts */
+	spinlock_t		  ll_lock;
+	spinlock_t		  ll_pp_extent_lock; /* pp_extent entry*/
+	spinlock_t		  ll_process_lock; /* ll_rw_process_info */
+	struct obd_uuid	   ll_sb_uuid;
+	struct obd_export	*ll_md_exp;
+	struct obd_export	*ll_dt_exp;
+	struct proc_dir_entry*    ll_proc_root;
+	struct lu_fid	     ll_root_fid; /* root object fid */
+
+	int		       ll_flags;
+	unsigned int		  ll_umounting:1;
+	struct list_head		ll_conn_chain; /* per-conn chain of SBs */
+	struct lustre_client_ocd  ll_lco;
+
+	struct list_head		ll_orphan_dentry_list; /*please don't ask -p*/
+	struct ll_close_queue    *ll_lcq;
+
+	struct lprocfs_stats     *ll_stats; /* lprocfs stats counter */
+
+	struct cl_client_cache    ll_cache;
+
+	struct lprocfs_stats     *ll_ra_stats;
+
+	struct ll_ra_info	 ll_ra_info;
+	unsigned int	      ll_namelen;
+	struct file_operations   *ll_fop;
+
+	/* =0 - hold lock over whole read/write
+	 * >0 - max. chunk to be read/written w/o lock re-acquiring */
+	unsigned long	     ll_max_rw_chunk;
+	unsigned int	      ll_md_brw_size; /* used by readdir */
+
+	struct lu_site	   *ll_site;
+	struct cl_device	 *ll_cl;
+	/* Statistics */
+	struct ll_rw_extents_info ll_rw_extents_info;
+	int		       ll_extent_process_count;
+	struct ll_rw_process_info ll_rw_process_info[LL_PROCESS_HIST_MAX];
+	unsigned int	      ll_offset_process_count;
+	struct ll_rw_process_info ll_rw_offset_info[LL_OFFSET_HIST_MAX];
+	unsigned int	      ll_rw_offset_entry_count;
+	int		       ll_stats_track_id;
+	enum stats_track_type     ll_stats_track_type;
+	int		       ll_rw_stats_on;
+
+	/* metadata stat-ahead */
+	unsigned int	      ll_sa_max;     /* max statahead RPCs */
+	atomic_t		  ll_sa_total;   /* statahead thread started
+						  * count */
+	atomic_t		  ll_sa_wrong;   /* statahead thread stopped for
+						  * low hit ratio */
+	atomic_t		  ll_agl_total;  /* AGL thread started count */
+
+	dev_t		     ll_sdev_orig; /* save s_dev before assign for
+						 * clustered nfs */
+	struct rmtacl_ctl_table   ll_rct;
+	struct eacl_table	 ll_et;
+};
+
+#define LL_DEFAULT_MAX_RW_CHUNK      (32 * 1024 * 1024)
+
+struct ll_ra_read {
+	pgoff_t	     lrr_start;
+	pgoff_t	     lrr_count;
+	struct task_struct *lrr_reader;
+	struct list_head	  lrr_linkage;
+};
+
+/*
+ * per file-descriptor read-ahead data.
+ */
+struct ll_readahead_state {
+	spinlock_t  ras_lock;
+	/*
+	 * index of the last page that read(2) needed and that wasn't in the
+	 * cache. Used by ras_update() to detect seeks.
+	 *
+	 * XXX nikita: if access seeks into cached region, Lustre doesn't see
+	 * this.
+	 */
+	unsigned long   ras_last_readpage;
+	/*
+	 * number of pages read after last read-ahead window reset. As window
+	 * is reset on each seek, this is effectively a number of consecutive
+	 * accesses. Maybe ->ras_accessed_in_window is better name.
+	 *
+	 * XXX nikita: window is also reset (by ras_update()) when Lustre
+	 * believes that memory pressure evicts read-ahead pages. In that
+	 * case, it probably doesn't make sense to expand window to
+	 * PTLRPC_MAX_BRW_PAGES on the third access.
+	 */
+	unsigned long   ras_consecutive_pages;
+	/*
+	 * number of read requests after the last read-ahead window reset
+	 * As window is reset on each seek, this is effectively the number
+	 * on consecutive read request and is used to trigger read-ahead.
+	 */
+	unsigned long   ras_consecutive_requests;
+	/*
+	 * Parameters of current read-ahead window. Handled by
+	 * ras_update(). On the initial access to the file or after a seek,
+	 * window is reset to 0. After 3 consecutive accesses, window is
+	 * expanded to PTLRPC_MAX_BRW_PAGES. Afterwards, window is enlarged by
+	 * PTLRPC_MAX_BRW_PAGES chunks up to ->ra_max_pages.
+	 */
+	unsigned long   ras_window_start, ras_window_len;
+	/*
+	 * Where next read-ahead should start at. This lies within read-ahead
+	 * window. Read-ahead window is read in pieces rather than at once
+	 * because: 1. lustre limits total number of pages under read-ahead by
+	 * ->ra_max_pages (see ll_ra_count_get()), 2. client cannot read pages
+	 * not covered by DLM lock.
+	 */
+	unsigned long   ras_next_readahead;
+	/*
+	 * Total number of ll_file_read requests issued, reads originating
+	 * due to mmap are not counted in this total.  This value is used to
+	 * trigger full file read-ahead after multiple reads to a small file.
+	 */
+	unsigned long   ras_requests;
+	/*
+	 * Page index with respect to the current request, these value
+	 * will not be accurate when dealing with reads issued via mmap.
+	 */
+	unsigned long   ras_request_index;
+	/*
+	 * list of struct ll_ra_read's one per read(2) call current in
+	 * progress against this file descriptor. Used by read-ahead code,
+	 * protected by ->ras_lock.
+	 */
+	struct list_head      ras_read_beads;
+	/*
+	 * The following 3 items are used for detecting the stride I/O
+	 * mode.
+	 * In stride I/O mode,
+	 * ...............|-----data-----|****gap*****|--------|******|....
+	 *    offset      |-stride_pages-|-stride_gap-|
+	 * ras_stride_offset = offset;
+	 * ras_stride_length = stride_pages + stride_gap;
+	 * ras_stride_pages = stride_pages;
+	 * Note: all these three items are counted by pages.
+	 */
+	unsigned long   ras_stride_length;
+	unsigned long   ras_stride_pages;
+	pgoff_t	 ras_stride_offset;
+	/*
+	 * number of consecutive stride request count, and it is similar as
+	 * ras_consecutive_requests, but used for stride I/O mode.
+	 * Note: only more than 2 consecutive stride request are detected,
+	 * stride read-ahead will be enable
+	 */
+	unsigned long   ras_consecutive_stride_requests;
+};
+
+extern struct kmem_cache *ll_file_data_slab;
+struct lustre_handle;
+struct ll_file_data {
+	struct ll_readahead_state fd_ras;
+	int fd_omode;
+	struct ccc_grouplock fd_grouplock;
+	__u64 lfd_pos;
+	__u32 fd_flags;
+	struct file *fd_file;
+	/* Indicate whether need to report failure when close.
+	 * true: failure is known, not report again.
+	 * false: unknown failure, should report. */
+	bool fd_write_failed;
+};
+
+struct lov_stripe_md;
+
+extern spinlock_t inode_lock;
+
+extern struct proc_dir_entry *proc_lustre_fs_root;
+
+/* Return the VFS inode embedded in @lli (its lli_vfs_inode member). */
+static inline struct inode *ll_info2i(struct ll_inode_info *lli)
+{
+	struct inode *vfs_inode = &lli->lli_vfs_inode;
+
+	return vfs_inode;
+}
+
+struct it_cb_data {
+	struct inode  *icbd_parent;
+	struct dentry **icbd_childp;
+	obd_id	hash;
+};
+
+__u32 ll_i2suppgid(struct inode *i);
+void ll_i2gids(__u32 *suppgids, struct inode *i1,struct inode *i2);
+
+/*
+ * Tell whether the 32-bit API must be used for @sbi.
+ *
+ * Always 1 when BITS_PER_LONG is 32.  Otherwise non-zero when
+ * current_is_32bit() reports true or LL_SBI_32BIT_API is set in
+ * sbi->ll_flags; that case is hinted to the compiler as unlikely.
+ */
+static inline int ll_need_32bit_api(struct ll_sb_info *sbi)
+{
+#if BITS_PER_LONG != 32
+	int want_32bit = current_is_32bit() ||
+			 (sbi->ll_flags & LL_SBI_32BIT_API);
+
+	return unlikely(want_32bit);
+#else
+	return 1;
+#endif
+}
+
+#define LLAP_MAGIC 98764321
+
+extern struct kmem_cache *ll_async_page_slab;
+extern size_t ll_async_page_slab_size;
+
+void ll_ra_read_in(struct file *f, struct ll_ra_read *rar);
+void ll_ra_read_ex(struct file *f, struct ll_ra_read *rar);
+struct ll_ra_read *ll_ra_read_get(struct file *f);
+
+/* llite/lproc_llite.c */
+#ifdef LPROCFS
+int lprocfs_register_mountpoint(struct proc_dir_entry *parent,
+				struct super_block *sb, char *osc, char *mdc);
+void lprocfs_unregister_mountpoint(struct ll_sb_info *sbi);
+void ll_stats_ops_tally(struct ll_sb_info *sbi, int op, int count);
+void lprocfs_llite_init_vars(struct lprocfs_static_vars *lvars);
+#else
+/*
+ * Stubs used when LPROCFS is not defined.  All of them are
+ * "static inline": a plain "static" function defined in a header is
+ * emitted into every file that includes it and triggers an
+ * unused-function warning wherever it is not called.
+ */
+static inline int lprocfs_register_mountpoint(struct proc_dir_entry *parent,
+			struct super_block *sb, char *osc, char *mdc){return 0;}
+static inline void lprocfs_unregister_mountpoint(struct ll_sb_info *sbi) {}
+static inline void ll_stats_ops_tally(struct ll_sb_info *sbi, int op,
+				      int count) {}
+/* Without LPROCFS there are no variables to publish: hand back zeroes. */
+static inline void lprocfs_llite_init_vars(struct lprocfs_static_vars *lvars)
+{
+	memset(lvars, 0, sizeof(*lvars));
+}
+#endif
+
+
+/* llite/dir.c */
+void ll_release_page(struct page *page, int remove);
+extern struct file_operations ll_dir_operations;
+extern struct inode_operations ll_dir_inode_operations;
+struct page *ll_get_dir_page(struct inode *dir, __u64 hash,
+			     struct ll_dir_chain *chain);
+int ll_dir_read(struct inode *inode, __u64 *_pos, void *cookie,
+		filldir_t filldir);
+
+int ll_get_mdt_idx(struct inode *inode);
+/* llite/namei.c */
+int ll_objects_destroy(struct ptlrpc_request *request,
+		       struct inode *dir);
+struct inode *ll_iget(struct super_block *sb, ino_t hash,
+		      struct lustre_md *lic);
+int ll_md_blocking_ast(struct ldlm_lock *, struct ldlm_lock_desc *,
+		       void *data, int flag);
+struct dentry *ll_splice_alias(struct inode *inode, struct dentry *de);
+int ll_rmdir_entry(struct inode *dir, char *name, int namelen);
+
+/* llite/rw.c */
+int ll_prepare_write(struct file *, struct page *, unsigned from, unsigned to);
+int ll_commit_write(struct file *, struct page *, unsigned from, unsigned to);
+int ll_writepage(struct page *page, struct writeback_control *wbc);
+int ll_writepages(struct address_space *, struct writeback_control *wbc);
+void ll_removepage(struct page *page);
+int ll_readpage(struct file *file, struct page *page);
+void ll_readahead_init(struct inode *inode, struct ll_readahead_state *ras);
+int ll_file_punch(struct inode *, loff_t, int);
+ssize_t ll_file_lockless_io(struct file *, char *, size_t, loff_t *, int);
+void ll_clear_file_contended(struct inode*);
+int ll_sync_page_range(struct inode *, struct address_space *, loff_t, size_t);
+int ll_readahead(const struct lu_env *env, struct cl_io *io,
+		 struct ll_readahead_state *ras, struct address_space *mapping,
+		 struct cl_page_list *queue, int flags);
+
+/* llite/file.c */
+extern struct file_operations ll_file_operations;
+extern struct file_operations ll_file_operations_flock;
+extern struct file_operations ll_file_operations_noflock;
+extern struct inode_operations ll_file_inode_operations;
+extern int ll_inode_revalidate_it(struct dentry *, struct lookup_intent *,
+				  __u64);
+extern int ll_have_md_lock(struct inode *inode, __u64 *bits,
+			   ldlm_mode_t l_req_mode);
+extern ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
+				   struct lustre_handle *lockh, __u64 flags);
+int __ll_inode_revalidate_it(struct dentry *, struct lookup_intent *,
+			     __u64 bits);
+int ll_revalidate_nd(struct dentry *dentry, unsigned int flags);
+int ll_file_open(struct inode *inode, struct file *file);
+int ll_file_release(struct inode *inode, struct file *file);
+int ll_glimpse_ioctl(struct ll_sb_info *sbi,
+		     struct lov_stripe_md *lsm, lstat_t *st);
+void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch);
+int ll_local_open(struct file *file,
+		  struct lookup_intent *it, struct ll_file_data *fd,
+		  struct obd_client_handle *och);
+int ll_release_openhandle(struct dentry *, struct lookup_intent *);
+int ll_md_close(struct obd_export *md_exp, struct inode *inode,
+		struct file *file);
+int ll_md_real_close(struct inode *inode, int flags);
+void ll_ioepoch_close(struct inode *inode, struct md_op_data *op_data,
+		      struct obd_client_handle **och, unsigned long flags);
+void ll_done_writing_attr(struct inode *inode, struct md_op_data *op_data);
+int ll_som_update(struct inode *inode, struct md_op_data *op_data);
+int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
+		     __u64 ioepoch, int sync);
+int ll_md_setattr(struct dentry *dentry, struct md_op_data *op_data,
+		  struct md_open_data **mod);
+void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
+			  struct lustre_handle *fh);
+extern void ll_rw_stats_tally(struct ll_sb_info *sbi, pid_t pid,
+			      struct ll_file_data *file, loff_t pos,
+			      size_t count, int rw);
+int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
+	       struct lookup_intent *it, struct kstat *stat);
+int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat);
+struct ll_file_data *ll_file_data_get(void);
+struct posix_acl * ll_get_acl(struct inode *inode, int type);
+
+int ll_inode_permission(struct inode *inode, int mask);
+
+int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
+			     int flags, struct lov_user_md *lum,
+			     int lum_size);
+int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
+			     struct lov_mds_md **lmm, int *lmm_size,
+			     struct ptlrpc_request **request);
+int ll_dir_setstripe(struct inode *inode, struct lov_user_md *lump,
+		     int set_default);
+int ll_dir_getstripe(struct inode *inode, struct lov_mds_md **lmmp,
+		     int *lmm_size, struct ptlrpc_request **request);
+int ll_fsync(struct file *file, loff_t start, loff_t end, int data);
+int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
+	      int num_bytes);
+int ll_merge_lvb(const struct lu_env *env, struct inode *inode);
+int ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg);
+int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
+int ll_fid2path(struct inode *inode, void *arg);
+int ll_data_version(struct inode *inode, __u64 *data_version, int extent_lock);
+
+/* llite/dcache.c */
+
+int ll_dops_init(struct dentry *de, int block, int init_sa);
+extern struct dentry_operations ll_d_ops;
+void ll_intent_drop_lock(struct lookup_intent *);
+void ll_intent_release(struct lookup_intent *);
+void ll_invalidate_aliases(struct inode *);
+void ll_frob_intent(struct lookup_intent **itp, struct lookup_intent *deft);
+void ll_lookup_finish_locks(struct lookup_intent *it, struct dentry *dentry);
+int ll_dcompare(const struct dentry *parent, const struct inode *pinode,
+		const struct dentry *dentry, const struct inode *inode,
+		unsigned int len, const char *str, const struct qstr *d_name);
+int ll_revalidate_it_finish(struct ptlrpc_request *request,
+			    struct lookup_intent *it, struct dentry *de);
+
+/* llite/llite_lib.c */
+extern struct super_operations lustre_super_operations;
+
+char *ll_read_opt(const char *opt, char *data);
+void ll_lli_init(struct ll_inode_info *lli);
+int ll_fill_super(struct super_block *sb, struct vfsmount *mnt);
+void ll_put_super(struct super_block *sb);
+void ll_kill_super(struct super_block *sb);
+struct inode *ll_inode_from_resource_lock(struct ldlm_lock *lock);
+struct inode *ll_inode_from_lock(struct ldlm_lock *lock);
+void ll_clear_inode(struct inode *inode);
+int ll_setattr_raw(struct dentry *dentry, struct iattr *attr);
+int ll_setattr(struct dentry *de, struct iattr *attr);
+int ll_statfs(struct dentry *de, struct kstatfs *sfs);
+int ll_statfs_internal(struct super_block *sb, struct obd_statfs *osfs,
+		       __u64 max_age, __u32 flags);
+void ll_update_inode(struct inode *inode, struct lustre_md *md);
+void ll_read_inode2(struct inode *inode, void *opaque);
+void ll_delete_inode(struct inode *inode);
+int ll_iocontrol(struct inode *inode, struct file *file,
+		 unsigned int cmd, unsigned long arg);
+int ll_flush_ctx(struct inode *inode);
+void ll_umount_begin(struct super_block *sb);
+int ll_remount_fs(struct super_block *sb, int *flags, char *data);
+int ll_show_options(struct seq_file *seq, struct dentry *dentry);
+void ll_dirty_page_discard_warn(struct page *page, int ioret);
+int ll_prep_inode(struct inode **inode, struct ptlrpc_request *req,
+		  struct super_block *, struct lookup_intent *);
+void lustre_dump_dentry(struct dentry *, int recur);
+void lustre_dump_inode(struct inode *);
+int ll_obd_statfs(struct inode *inode, void *arg);
+int ll_get_max_mdsize(struct ll_sb_info *sbi, int *max_mdsize);
+int ll_process_config(struct lustre_cfg *lcfg);
+struct md_op_data *ll_prep_md_op_data(struct md_op_data *op_data,
+				      struct inode *i1, struct inode *i2,
+				      const char *name, int namelen,
+				      int mode, __u32 opc, void *data);
+void ll_finish_md_op_data(struct md_op_data *op_data);
+int ll_get_obd_name(struct inode *inode, unsigned int cmd, unsigned long arg);
+char *ll_get_fsname(struct super_block *sb, char *buf, int buflen);
+
+/* llite/llite_nfs.c */
+extern struct export_operations lustre_export_operations;
+__u32 get_uuid2int(const char *name, int len);
+struct inode *search_inode_for_lustre(struct super_block *sb,
+				      const struct lu_fid *fid);
+
+/* llite/special.c */
+extern struct inode_operations ll_special_inode_operations;
+extern struct file_operations ll_special_chr_inode_fops;
+extern struct file_operations ll_special_chr_file_fops;
+extern struct file_operations ll_special_blk_inode_fops;
+extern struct file_operations ll_special_fifo_inode_fops;
+extern struct file_operations ll_special_fifo_file_fops;
+extern struct file_operations ll_special_sock_inode_fops;
+
+/* llite/symlink.c */
+extern struct inode_operations ll_fast_symlink_inode_operations;
+
+/* llite/llite_close.c */
+/*
+ * Work queue served by the "ll_close" thread (see llite_close.c):
+ * inode infos are linked on lcq_head through their lli_close_list member
+ * and taken off one at a time by ll_close_next_lli().
+ */
+struct ll_close_queue {
+	/* released by ll_close_next_lli() after it has dequeued an entry */
+	spinlock_t		lcq_lock;
+	/* list of pending ll_inode_info, linked via lli_close_list */
+	struct list_head		lcq_head;
+	/* the close thread sleeps here; shutdown wakes it up */
+	wait_queue_head_t		lcq_waitq;
+	/* completed by the thread when it starts and again when it exits */
+	struct completion	lcq_comp;
+	/* incremented by ll_close_thread_shutdown(); non-zero means stop
+	 * once the list is empty */
+	atomic_t		lcq_stop;
+};
+
+struct ccc_object *cl_inode2ccc(struct inode *inode);
+
+
+void vvp_write_pending (struct ccc_object *club, struct ccc_page *page);
+void vvp_write_complete(struct ccc_object *club, struct ccc_page *page);
+
+/*
+ * IO subtypes; a specific architecture can implement only part of this list.
+ */
+enum vvp_io_subtype {
+	/** normal IO */
+	IO_NORMAL,
+	/** io called from .sendfile */
+	IO_SENDFILE,
+	/** io started from splice_{read|write} */
+	IO_SPLICE
+};
+
+/*
+ * VVP per-IO state.  cui_io_subtype records the IO subtype; the union u
+ * carries the arguments specific to sendfile, splice or page-fault IO.
+ * An instance is embedded in struct vvp_session as vs_ios.
+ */
+struct vvp_io {
+	/** io subtype */
+	enum vvp_io_subtype    cui_io_subtype;
+
+	union {
+		/** sendfile arguments (presumably valid for IO_SENDFILE) */
+		struct {
+			read_actor_t      cui_actor;
+			void	     *cui_target;
+		} sendfile;
+		/** splice arguments (presumably valid for IO_SPLICE) */
+		struct {
+			struct pipe_inode_info *cui_pipe;
+			unsigned int	    cui_flags;
+		} splice;
+		/** page-fault state */
+		struct vvp_fault_io {
+			/**
+			 * Inode modification time that is checked across DLM
+			 * lock request.
+			 */
+			time_t		 ft_mtime;
+			struct vm_area_struct *ft_vma;
+			/**
+			 *  locked page returned from vvp_io
+			 */
+			struct page	    *ft_vmpage;
+			struct vm_fault_api {
+				/**
+				 * kernel fault info
+				 */
+				struct vm_fault *ft_vmf;
+				/**
+				 * fault API used bitflags for return code.
+				 */
+				unsigned int    ft_flags;
+			} fault;
+		} fault;
+	} u;
+	/**
+	 * Read-ahead state used by read and page-fault IO contexts.
+	 */
+	struct ll_ra_read    cui_bead;
+	/**
+	 * Set when cui_bead has been initialized.
+	 */
+	int		  cui_ra_window_set;
+	/**
+	 * Partially truncated page, that vvp_io_trunc_start() keeps locked
+	 * across truncate.
+	 */
+	struct cl_page      *cui_partpage;
+};
+
+/**
+ * IO arguments for various VFS I/O interfaces.
+ *
+ * via_io_subtype is set by vvp_env_args(); the union u holds one argument
+ * set per subtype (normal, sendfile, splice).
+ */
+struct vvp_io_args {
+	/** normal/sendfile/splice */
+	enum vvp_io_subtype via_io_subtype;
+
+	union {
+		/** kiocb plus iovec array and its segment count */
+		struct {
+			struct kiocb      *via_iocb;
+			struct iovec      *via_iov;
+			unsigned long      via_nrsegs;
+		} normal;
+		/** read actor and its target */
+		struct {
+			read_actor_t       via_actor;
+			void	      *via_target;
+		} sendfile;
+		/** pipe and splice flags */
+		struct {
+			struct pipe_inode_info  *via_pipe;
+			unsigned int       via_flags;
+		} splice;
+	} u;
+};
+
+/*
+ * Bundle of a cl_io, a cl_page and a lu_env together with a cookie and two
+ * bookkeeping integers.  Embedded in struct vvp_thread_info as vti_io_ctx.
+ * NOTE(review): field roles are inferred from names -- confirm at the users.
+ */
+struct ll_cl_context {
+	void	   *lcc_cookie;
+	struct cl_io   *lcc_io;
+	struct cl_page *lcc_page;
+	struct lu_env  *lcc_env;
+	int	     lcc_refcheck;	/* presumably the refcheck paired with lcc_env */
+	int	     lcc_created;
+};
+
+/*
+ * Scratch data looked up through vvp_key in a lu_env's le_ctx context;
+ * see vvp_env_info().  vti_args is what vvp_env_args() hands out.
+ */
+struct vvp_thread_info {
+	struct ost_lvb       vti_lvb;
+	struct cl_2queue     vti_queue;
+	struct iovec	 vti_local_iov;
+	struct vvp_io_args   vti_args;
+	struct ra_io_arg     vti_ria;
+	struct kiocb	 vti_kiocb;
+	struct ll_cl_context vti_io_ctx;
+};
+
+/*
+ * Look up the vvp_thread_info registered under vvp_key in the le_ctx
+ * context of @env.  The lookup result is asserted to be non-NULL.
+ */
+static inline struct vvp_thread_info *vvp_env_info(const struct lu_env *env)
+{
+	extern struct lu_context_key vvp_key;
+	struct vvp_thread_info *vti;
+
+	vti = lu_context_key_get(&env->le_ctx, &vvp_key);
+	LASSERT(vti != NULL);
+
+	return vti;
+}
+
+/*
+ * Return the vvp_io_args kept in the environment's vvp_thread_info after
+ * stamping it with the requested IO subtype @type.
+ */
+static inline struct vvp_io_args *vvp_env_args(const struct lu_env *env,
+					       enum vvp_io_subtype type)
+{
+	struct vvp_thread_info *vti = vvp_env_info(env);
+
+	vti->vti_args.via_io_subtype = type;
+	return &vti->vti_args;
+}
+
+/*
+ * Data looked up through vvp_session_key in a lu_env's le_ses context
+ * (see vvp_env_session()); it wraps the vvp_io returned by vvp_env_io().
+ */
+struct vvp_session {
+	struct vvp_io	 vs_ios;
+};
+
+/*
+ * Look up the vvp_session registered under vvp_session_key in the le_ses
+ * context of @env.  The lookup result is asserted to be non-NULL.
+ */
+static inline struct vvp_session *vvp_env_session(const struct lu_env *env)
+{
+	extern struct lu_context_key vvp_session_key;
+	struct vvp_session *session;
+
+	session = lu_context_key_get(env->le_ses, &vvp_session_key);
+	LASSERT(session != NULL);
+
+	return session;
+}
+
+/* Return the vvp_io embedded in the vvp_session of @env. */
+static inline struct vvp_io *vvp_env_io(const struct lu_env *env)
+{
+	struct vvp_session *session = vvp_env_session(env);
+
+	return &session->vs_ios;
+}
+
+void ll_queue_done_writing(struct inode *inode, unsigned long flags);
+void ll_close_thread_shutdown(struct ll_close_queue *lcq);
+int ll_close_thread_start(struct ll_close_queue **lcq_ret);
+
+/* llite/llite_mmap.c */
+typedef struct rb_root  rb_root_t;
+typedef struct rb_node  rb_node_t;
+
+struct ll_lock_tree_node;
+/*
+ * Lock tree: a red-black tree root, a list head and the ll_file_data it
+ * belongs to.
+ * NOTE(review): presumably lt_root holds ll_lock_tree_node entries and
+ * lt_locked_list the nodes already locked -- confirm in llite_mmap.c.
+ */
+struct ll_lock_tree {
+	rb_root_t		       lt_root;
+	struct list_head		      lt_locked_list;
+	struct ll_file_data	    *lt_fd;
+};
+
+int ll_teardown_mmaps(struct address_space *mapping, __u64 first, __u64 last);
+int ll_file_mmap(struct file * file, struct vm_area_struct * vma);
+struct ll_lock_tree_node * ll_node_from_inode(struct inode *inode, __u64 start,
+					      __u64 end, ldlm_mode_t mode);
+void policy_from_vma(ldlm_policy_data_t *policy,
+		struct vm_area_struct *vma, unsigned long addr, size_t count);
+struct vm_area_struct *our_vma(struct mm_struct *mm, unsigned long addr,
+			       size_t count);
+
+/*
+ * Invalidate a locked page.
+ *
+ * The page must be locked by the caller (asserted).  If the page still has
+ * a mapping, ll_teardown_mmaps() is called on the byte range covered by the
+ * page and the page is then passed to truncate_complete_page().  A page
+ * without a mapping is left untouched.
+ */
+static inline void ll_invalidate_page(struct page *vmpage)
+{
+	struct address_space *mapping = vmpage->mapping;
+	/*
+	 * Widen the page index before shifting: shifting at unsigned long
+	 * width would truncate byte offsets at or beyond 4GB where unsigned
+	 * long is 32 bits wide.
+	 */
+	loff_t offset = (loff_t)vmpage->index << PAGE_CACHE_SHIFT;
+
+	LASSERT(PageLocked(vmpage));
+	if (mapping == NULL)
+		return;
+
+	ll_teardown_mmaps(mapping, offset, offset + PAGE_CACHE_SIZE);
+	truncate_complete_page(mapping, vmpage);
+}
+
+#define    ll_s2sbi(sb)	(s2lsi(sb)->lsi_llsbi)
+
+/*
+ * Data export of a super block.  No reference is taken here: the sb_info
+ * should already be holding one.
+ */
+static inline struct obd_export *ll_s2dtexp(struct super_block *sb)
+{
+	struct ll_sb_info *sbi = ll_s2sbi(sb);
+
+	return sbi->ll_dt_exp;
+}
+
+/*
+ * Metadata export of a super block.  No reference is taken here: the
+ * sb_info should already be holding one.
+ */
+static inline struct obd_export *ll_s2mdexp(struct super_block *sb)
+{
+	struct ll_sb_info *sbi = ll_s2sbi(sb);
+
+	return sbi->ll_md_exp;
+}
+
+/*
+ * Return the client_obd (u.cli) of the obd device behind the metadata
+ * export of @sbi.  LBUG()s when the export has no obd device.
+ */
+static inline struct client_obd *sbi2mdc(struct ll_sb_info *sbi)
+{
+	struct obd_device *md_obd = sbi->ll_md_exp->exp_obd;
+
+	if (!md_obd)
+		LBUG();
+
+	return &md_obd->u.cli;
+}
+
+/*
+ * Return the ll_sb_info of the super block that @inode belongs to.
+ * FIXME: replace the name of this with LL_SB to conform to kernel stuff
+ */
+static inline struct ll_sb_info *ll_i2sbi(struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+
+	return ll_s2sbi(sb);
+}
+
+/* Data export of the super block that @inode belongs to. */
+static inline struct obd_export *ll_i2dtexp(struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+
+	return ll_s2dtexp(sb);
+}
+
+/* Metadata export of the super block that @inode belongs to. */
+static inline struct obd_export *ll_i2mdexp(struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+
+	return ll_s2mdexp(sb);
+}
+
+/*
+ * Return a pointer to the FID stored in the ll_inode_info of @inode.
+ * @inode is asserted to be non-NULL.
+ */
+static inline struct lu_fid *ll_inode2fid(struct inode *inode)
+{
+	LASSERT(inode != NULL);
+
+	return &ll_i2info(inode)->lli_fid;
+}
+
+/* Return cl_max_mds_easize of the MDC client behind super block @sb. */
+static inline int ll_mds_max_easize(struct super_block *sb)
+{
+	struct client_obd *mdc = sbi2mdc(ll_s2sbi(sb));
+
+	return mdc->cl_max_mds_easize;
+}
+
+/* Return the lli_maxbytes value recorded in the ll_inode_info of @inode. */
+static inline __u64 ll_file_maxbytes(struct inode *inode)
+{
+	struct ll_inode_info *lli = ll_i2info(inode);
+
+	return lli->lli_maxbytes;
+}
+
+/* llite/xattr.c */
+int ll_setxattr(struct dentry *dentry, const char *name,
+		const void *value, size_t size, int flags);
+ssize_t ll_getxattr(struct dentry *dentry, const char *name,
+		    void *buffer, size_t size);
+ssize_t ll_listxattr(struct dentry *dentry, char *buffer, size_t size);
+int ll_removexattr(struct dentry *dentry, const char *name);
+
+/* llite/remote_perm.c */
+extern struct kmem_cache *ll_remote_perm_cachep;
+extern struct kmem_cache *ll_rmtperm_hash_cachep;
+
+struct hlist_head *alloc_rmtperm_hash(void);
+void free_rmtperm_hash(struct hlist_head *hash);
+int ll_update_remote_perm(struct inode *inode, struct mdt_remote_perm *perm);
+int lustre_check_remote_perm(struct inode *inode, int mask);
+
+/* llite/llite_capa.c */
+extern timer_list_t ll_capa_timer;
+
+int ll_capa_thread_start(void);
+void ll_capa_thread_stop(void);
+void ll_capa_timer_callback(unsigned long unused);
+
+struct obd_capa *ll_add_capa(struct inode *inode, struct obd_capa *ocapa);
+int ll_update_capa(struct obd_capa *ocapa, struct lustre_capa *capa);
+
+void ll_capa_open(struct inode *inode);
+void ll_capa_close(struct inode *inode);
+
+struct obd_capa *ll_mdscapa_get(struct inode *inode);
+struct obd_capa *ll_osscapa_get(struct inode *inode, __u64 opc);
+
+void ll_truncate_free_capa(struct obd_capa *ocapa);
+void ll_clear_inode_capas(struct inode *inode);
+void ll_print_capa_stat(struct ll_sb_info *sbi);
+
+/* llite/llite_cl.c */
+extern struct lu_device_type vvp_device_type;
+
+/**
+ * Common IO arguments for various VFS I/O interfaces.
+ */
+int cl_sb_init(struct super_block *sb);
+int cl_sb_fini(struct super_block *sb);
+enum cl_lock_mode  vvp_mode_from_vma(struct vm_area_struct *vma);
+void ll_io_init(struct cl_io *io, const struct file *file, int write);
+
+void ras_update(struct ll_sb_info *sbi, struct inode *inode,
+		struct ll_readahead_state *ras, unsigned long index,
+		unsigned hit);
+void ll_ra_count_put(struct ll_sb_info *sbi, unsigned long len);
+int ll_is_file_contended(struct file *file);
+void ll_ra_stats_inc(struct address_space *mapping, enum ra_stat which);
+
+/* llite/llite_rmtacl.c */
+#ifdef CONFIG_FS_POSIX_ACL
+obd_valid rce_ops2valid(int ops);
+struct rmtacl_ctl_entry *rct_search(struct rmtacl_ctl_table *rct, pid_t key);
+int rct_add(struct rmtacl_ctl_table *rct, pid_t key, int ops);
+int rct_del(struct rmtacl_ctl_table *rct, pid_t key);
+void rct_init(struct rmtacl_ctl_table *rct);
+void rct_fini(struct rmtacl_ctl_table *rct);
+
+void ee_free(struct eacl_entry *ee);
+int ee_add(struct eacl_table *et, pid_t key, struct lu_fid *fid, int type,
+	   ext_acl_xattr_header *header);
+struct eacl_entry *et_search_del(struct eacl_table *et, pid_t key,
+				 struct lu_fid *fid, int type);
+void et_search_free(struct eacl_table *et, pid_t key);
+void et_init(struct eacl_table *et);
+void et_fini(struct eacl_table *et);
+#endif
+
+/* statahead.c */
+
+#define LL_SA_RPC_MIN	   2
+#define LL_SA_RPC_DEF	   32
+#define LL_SA_RPC_MAX	   8192
+
+#define LL_SA_CACHE_BIT	 5
+#define LL_SA_CACHE_SIZE	(1 << LL_SA_CACHE_BIT)
+#define LL_SA_CACHE_MASK	(LL_SA_CACHE_SIZE - 1)
+
+/* per inode struct, for dir only */
+struct ll_statahead_info {
+	struct inode	   *sai_inode;
+	atomic_t	    sai_refcount;   /* hold a reference while
+						 * accessing this struct */
+	unsigned int	    sai_generation; /* generation for statahead */
+	unsigned int	    sai_max;	/* max ahead of lookup */
+	__u64		   sai_sent;       /* stat requests sent count */
+	__u64		   sai_replied;    /* stat requests which received
+						 * reply */
+	__u64		   sai_index;      /* index of statahead entry */
+	__u64		   sai_index_wait; /* index of the entry which the
+						 * caller is waiting for */
+	__u64		   sai_hit;	/* hit count */
+	__u64		   sai_miss;       /* miss count:
+						 * for "ls -al" case, it includes
+						 * hidden dentry miss;
+						 * for "ls -l" case, it does not
+						 * include hidden dentry miss.
+						 * "sai_miss_hidden" is used for
+						 * the latter case.
+						 */
+	unsigned int	    sai_consecutive_miss; /* consecutive miss */
+	unsigned int	    sai_miss_hidden;/* "ls -al", but first dentry
+						 * is not a hidden one */
+	unsigned int	    sai_skip_hidden;/* skipped hidden dentry count */
+	unsigned int	    sai_ls_all:1,   /* "ls -al", do stat-ahead for
+						 * hidden entries */
+				sai_in_readpage:1,/* statahead is in readdir()*/
+				sai_agl_valid:1;/* AGL is valid for the dir */
+	wait_queue_head_t	     sai_waitq;      /* stat-ahead wait queue */
+	struct ptlrpc_thread    sai_thread;     /* stat-ahead thread */
+	struct ptlrpc_thread    sai_agl_thread; /* AGL thread */
+	struct list_head	      sai_entries;    /* entry list */
+	struct list_head	      sai_entries_received; /* entries returned */
+	struct list_head	      sai_entries_stated;   /* entries stated */
+	struct list_head	      sai_entries_agl; /* AGL entries to be sent */
+	/* entry cache: LL_SA_CACHE_SIZE list heads, one spinlock per slot */
+	struct list_head	      sai_cache[LL_SA_CACHE_SIZE];
+	spinlock_t		sai_cache_lock[LL_SA_CACHE_SIZE];
+	atomic_t		sai_cache_count; /* entry count in cache */
+};
+
+int do_statahead_enter(struct inode *dir, struct dentry **dentry,
+		       int only_unplug);
+void ll_stop_statahead(struct inode *dir, void *key);
+
+/*
+ * Run cl_glimpse_size() on @inode with lli_glimpse_sem held for read and
+ * record the current time in lli_glimpse_time before the semaphore is
+ * released.  Returns whatever cl_glimpse_size() returned.
+ */
+static inline int ll_glimpse_size(struct inode *inode)
+{
+	struct ll_inode_info *info = ll_i2info(inode);
+	int result;
+
+	down_read(&info->lli_glimpse_sem);
+	result = cl_glimpse_size(inode);
+	info->lli_glimpse_time = cfs_time_current();
+	up_read(&info->lli_glimpse_sem);
+
+	return result;
+}
+
+/*
+ * Copy the directory's current statahead generation into the dentry's
+ * lld_sa_generation.  Nothing is done when the caller is not the process
+ * recorded in lli_opendir_pid, or when the directory has no statahead
+ * info or the dentry has no ll_dentry_data.
+ */
+static inline void
+ll_statahead_mark(struct inode *dir, struct dentry *dentry)
+{
+	struct ll_inode_info     *dir_info = ll_i2info(dir);
+	struct ll_statahead_info *sa_info = dir_info->lli_sai;
+	struct ll_dentry_data    *dentry_data = ll_d2d(dentry);
+
+	/* only the process recorded at opendir time marks dentries */
+	if (dir_info->lli_opendir_pid == current_pid() &&
+	    sa_info != NULL && dentry_data != NULL)
+		dentry_data->lld_sa_generation = sa_info->sai_generation;
+}
+
+/*
+ * Decide whether a lookup of @dentryp under @dir should go through the
+ * statahead machinery.  Returns 1 when it should and -EAGAIN when it
+ * should not.
+ */
+static inline int
+ll_need_statahead(struct inode *dir, struct dentry *dentryp)
+{
+	struct ll_inode_info  *dir_info = ll_i2info(dir);
+	struct ll_dentry_data *dentry_data;
+
+	/*
+	 * Bail out when statahead is switched off (ll_sa_max == 0), when
+	 * the caller is not the process recorded in lli_opendir_pid, or
+	 * when statahead has been stopped (no opendir key).
+	 */
+	if (ll_i2sbi(dir)->ll_sa_max == 0 ||
+	    dir_info->lli_opendir_pid != current_pid() ||
+	    dir_info->lli_opendir_key == NULL)
+		return -EAGAIN;
+
+	dentry_data = ll_d2d(dentryp);
+	/*
+	 * Stat-ing one dentry triggers "revalidate" or "lookup" more than
+	 * once: for "getattr", for "getxattr", and maybe for others.  In
+	 * patchless client mode the operation intent is not accurate and
+	 * may mislead the statahead thread; for example the "revalidate"
+	 * calls for "getattr" and "getxattr" of a dentry may carry the same
+	 * intent, "IT_GETATTR".
+	 * A dentry should get only one chance to interact with the
+	 * statahead thread, otherwise the statahead window gets confused.
+	 * So "lld_sa_generation" is set to "sai_generation" on the first
+	 * "IT_GETATTR" of a dentry, and later "IT_GETATTR" calls are
+	 * recognised here by
+	 * "lld_sa_generation == lli_sai->sai_generation" and bypass the
+	 * statahead thread.
+	 */
+	if (dentry_data == NULL || dir_info->lli_sai == NULL)
+		return 1;
+
+	return dentry_data->lld_sa_generation ==
+	       dir_info->lli_sai->sai_generation ? -EAGAIN : 1;
+}
+
+/*
+ * Enter statahead for *@dentryp if ll_need_statahead() allows it.
+ * A zero or negative ll_need_statahead() result is returned unchanged;
+ * otherwise the result of do_statahead_enter() is returned.
+ */
+static inline int
+ll_statahead_enter(struct inode *dir, struct dentry **dentryp, int only_unplug)
+{
+	int need = ll_need_statahead(dir, *dentryp);
+
+	if (need > 0)
+		return do_statahead_enter(dir, dentryp, only_unplug);
+
+	return need;
+}
+
+/*
+ * llite ioctl register support routine.
+ * Values returned by an llioc_callback_t and by ll_iocontrol_call().
+ */
+enum llioc_iter {
+	LLIOC_CONT = 0,
+	LLIOC_STOP
+};
+
+#define LLIOC_MAX_CMD	   256
+
+/*
+ * Rules to write a callback function:
+ *
+ * Parameters:
+ *  @magic: The dynamic ioctl call routine feeds this value with the pointer
+ *      returned by ll_iocontrol_register.  Callback functions should use this
+ *      data to check for a potential collision of ioctl cmd. If a collision
+ *      is found, the callback function should return LLIOC_CONT.
+ *  @rcp: The result of ioctl command.
+ *
+ *  Return values:
+ *      If @magic matches the pointer returned by ll_iocontrol_register, the
+ *      callback should return LLIOC_STOP; return LLIOC_CONT otherwise.
+ */
+typedef enum llioc_iter (*llioc_callback_t)(struct inode *inode,
+		struct file *file, unsigned int cmd, unsigned long arg,
+		void *magic, int *rcp);
+
+enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
+		unsigned int cmd, unsigned long arg, int *rcp);
+
+/* export functions */
+/* Register an ioctl block dynamically for a regular file.
+ *
+ * @cmd: the array of ioctl command set
+ * @count: number of commands in the @cmd
+ * @cb: callback function, it will be called if an ioctl command is found to
+ *      belong to the command list @cmd.
+ *
+ * Return value:
+ *      A magic pointer will be returned on success;
+ *      otherwise, NULL will be returned.
+ *
+ * The magic pointer is what ll_iocontrol_unregister() takes.
+ * */
+void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd);
+void ll_iocontrol_unregister(void *magic);
+
+
+/* lclient compat stuff */
+#define cl_inode_info ll_inode_info
+#define cl_i2info(info) ll_i2info(info)
+#define cl_inode_mode(inode) ((inode)->i_mode)
+#define cl_i2sbi ll_i2sbi
+
+/*
+ * Return the ll_file_data attached to the file referenced by @attr.
+ * ATTR_FILE is asserted to be set in attr->ia_valid; @inode is not used.
+ */
+static inline struct ll_file_data *cl_iattr2fd(struct inode *inode,
+					       const struct iattr *attr)
+{
+	struct ll_file_data *fd;
+
+	LASSERT(attr->ia_valid & ATTR_FILE);
+	fd = LUSTRE_FPRIVATE(attr->ia_file);
+
+	return fd;
+}
+
+/* lclient compat wrapper: take the inode size lock via ll_inode_size_lock(). */
+static inline void cl_isize_lock(struct inode *inode)
+{
+	ll_inode_size_lock(inode);
+}
+
+/* lclient compat wrapper: drop the inode size lock via ll_inode_size_unlock(). */
+static inline void cl_isize_unlock(struct inode *inode)
+{
+	ll_inode_size_unlock(inode);
+}
+
+/*
+ * Set the inode size to @kms without taking the size lock.
+ *
+ * The assertion expects down_trylock() on lli_size_sem to fail (non-zero
+ * return), i.e. the semaphore is expected to be held already.
+ * NOTE(review): the trylock sits inside LASSERT(); if the expression is
+ * evaluated while the semaphore is free, it is acquired here and never
+ * released -- confirm how LASSERT is compiled in this tree.
+ */
+static inline void cl_isize_write_nolock(struct inode *inode, loff_t kms)
+{
+	LASSERT(down_trylock(&ll_i2info(inode)->lli_size_sem) != 0);
+	i_size_write(inode, kms);
+}
+
+/* Set the inode size to @kms while holding the inode size lock. */
+static inline void cl_isize_write(struct inode *inode, loff_t kms)
+{
+	cl_isize_lock(inode);
+	i_size_write(inode, kms);
+	cl_isize_unlock(inode);
+}
+
+#define cl_isize_read(inode)	     i_size_read(inode)
+
+/* lclient compat wrapper: forwards to ll_merge_lvb() and returns its result. */
+static inline int cl_merge_lvb(const struct lu_env *env, struct inode *inode)
+{
+	return ll_merge_lvb(env, inode);
+}
+
+#define cl_inode_atime(inode) LTIME_S((inode)->i_atime)
+#define cl_inode_ctime(inode) LTIME_S((inode)->i_ctime)
+#define cl_inode_mtime(inode) LTIME_S((inode)->i_mtime)
+
+struct obd_capa *cl_capa_lookup(struct inode *inode, enum cl_req_type crt);
+
+int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
+		       enum cl_fsync_mode mode, int ignore_layout);
+
+/** direct write pages; argument block of ll_direct_rw_pages() */
+struct ll_dio_pages {
+	/** page array to be written. we don't support
+	 * partial pages except the last one. */
+	struct page **ldp_pages;
+	/** offset of each page */
+	loff_t       *ldp_offsets;
+	/** if ldp_offsets is NULL, the pages to be written are
+	 * sequential, and this is the file offset of the
+	 * first page. */
+	loff_t	ldp_start_offset;
+	/** how many bytes are to be written. */
+	size_t	ldp_size;
+	/** # of pages in the array. */
+	int	   ldp_nr;
+};
+
+/*
+ * Tally @rc under the OSC read counter when @crt is CRT_READ and under the
+ * OSC write counter for any other request type, on the sb_info of the
+ * super block behind @dev.
+ */
+static inline void cl_stats_tally(struct cl_device *dev, enum cl_req_type crt,
+				  int rc)
+{
+	int opc;
+
+	if (crt == CRT_READ)
+		opc = LPROC_LL_OSC_READ;
+	else
+		opc = LPROC_LL_OSC_WRITE;
+
+	ll_stats_ops_tally(ll_s2sbi(cl2ccc_dev(dev)->cdv_sb), opc, rc);
+}
+
+extern ssize_t ll_direct_rw_pages(const struct lu_env *env, struct cl_io *io,
+				  int rw, struct inode *inode,
+				  struct ll_dio_pages *pv);
+
+/*
+ * Return 1 when @file has LL_FILE_IGNORE_LOCK set in its fd_flags or when
+ * its super block has LL_SBI_NOLCK set in ll_flags; 0 otherwise.
+ * The file's ll_file_data is asserted to be non-NULL.
+ */
+static inline int ll_file_nolock(const struct file *file)
+{
+	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
+	struct inode *inode = file->f_dentry->d_inode;
+
+	LASSERT(fd != NULL);
+	if (fd->fd_flags & LL_FILE_IGNORE_LOCK)
+		return 1;
+
+	return (ll_i2sbi(inode)->ll_flags & LL_SBI_NOLCK) != 0;
+}
+
+/*
+ * Attach @inode as lock data to the lock(s) referenced by intent @it.
+ *
+ * The work is done once per intent: it is skipped when it_lock_set is
+ * already non-zero, and it_lock_set is set to 1 afterwards.  When @bits is
+ * not NULL it receives it_lock_bits in either case.
+ */
+static inline void ll_set_lock_data(struct obd_export *exp, struct inode *inode,
+				    struct lookup_intent *it, __u64 *bits)
+{
+	if (!it->d.lustre.it_lock_set) {
+		struct lustre_handle handle;
+
+		/* If this inode is a remote object, it will get two
+		 * separate locks in different namespaces: the Master MDT,
+		 * where the name entry is, will grant the LOOKUP lock, and
+		 * the remote MDT, where the object is, will grant the
+		 * UPDATE|PERM lock. The inode will be attached to both
+		 * LOOKUP and PERM locks, so revoking either lock will
+		 * cause the dcache to be cleared */
+		if (it->d.lustre.it_remote_lock_mode) {
+			handle.cookie = it->d.lustre.it_remote_lock_handle;
+			CDEBUG(D_DLMTRACE, "setting l_data to inode %p"
+			       "(%lu/%u) for remote lock "LPX64"\n", inode,
+			       inode->i_ino, inode->i_generation,
+			       handle.cookie);
+			md_set_lock_data(exp, &handle.cookie, inode, NULL);
+		}
+
+		handle.cookie = it->d.lustre.it_lock_handle;
+
+		CDEBUG(D_DLMTRACE, "setting l_data to inode %p (%lu/%u)"
+		       " for lock "LPX64"\n", inode, inode->i_ino,
+		       inode->i_generation, handle.cookie);
+
+		/* this call also fills in it_lock_bits */
+		md_set_lock_data(exp, &handle.cookie, inode,
+				 &it->d.lustre.it_lock_bits);
+		it->d.lustre.it_lock_set = 1;
+	}
+
+	if (bits != NULL)
+		*bits = it->d.lustre.it_lock_bits;
+}
+
+/* Take the inode's i_lock spinlock; paired with ll_unlock_dcache(). */
+static inline void ll_lock_dcache(struct inode *inode)
+{
+	spin_lock(&inode->i_lock);
+}
+
+/* Release the inode's i_lock spinlock taken by ll_lock_dcache(). */
+static inline void ll_unlock_dcache(struct inode *inode)
+{
+	spin_unlock(&inode->i_lock);
+}
+
+/*
+ * Return 1 when @dentry has no ll_dentry_data or its lld_invalid flag is
+ * set; 0 otherwise.
+ */
+static inline int d_lustre_invalid(const struct dentry *dentry)
+{
+	struct ll_dentry_data *dentry_data = ll_d2d(dentry);
+
+	if (dentry_data == NULL)
+		return 1;
+
+	return dentry_data->lld_invalid != 0;
+}
+
+/*
+ * Set lld_invalid on the dentry's ll_dentry_data, if it has any.  No lock
+ * is taken here; d_lustre_invalidate() calls this with d_lock held.
+ */
+static inline void __d_lustre_invalidate(struct dentry *dentry)
+{
+	struct ll_dentry_data *dentry_data = ll_d2d(dentry);
+
+	if (dentry_data == NULL)
+		return;
+
+	dentry_data->lld_invalid = 1;
+}
+
+/*
+ * Mark the dentry INVALID under d_lock.  If the dentry refcount is zero
+ * (normally the case for ll_md_blocking_ast), also unhash the dentry and
+ * let the dcache reclaim it later; otherwise the dput() of the last
+ * reference will unhash this dentry and kill it.
+ *
+ * @nested: non-zero takes d_lock with the DENTRY_D_LOCK_NESTED subclass,
+ *          zero with DENTRY_D_LOCK_NORMAL.
+ */
+static inline void d_lustre_invalidate(struct dentry *dentry, int nested)
+{
+	/* debug trace only; the refcount is read here before d_lock is taken */
+	CDEBUG(D_DENTRY, "invalidate dentry %.*s (%p) parent %p inode %p "
+	       "refc %d\n", dentry->d_name.len, dentry->d_name.name, dentry,
+	       dentry->d_parent, dentry->d_inode, d_refcount(dentry));
+
+	spin_lock_nested(&dentry->d_lock,
+			 nested ? DENTRY_D_LOCK_NESTED : DENTRY_D_LOCK_NORMAL);
+	__d_lustre_invalidate(dentry);
+	if (d_refcount(dentry) == 0)
+		__d_drop(dentry);
+	spin_unlock(&dentry->d_lock);
+}
+
+/*
+ * Clear lld_invalid on the dentry's ll_dentry_data under d_lock.  The
+ * dentry is asserted to have ll_dentry_data.
+ */
+static inline void d_lustre_revalidate(struct dentry *dentry)
+{
+	struct ll_dentry_data *dentry_data;
+
+	spin_lock(&dentry->d_lock);
+	dentry_data = ll_d2d(dentry);
+	LASSERT(dentry_data != NULL);
+	dentry_data->lld_invalid = 0;
+	spin_unlock(&dentry->d_lock);
+}
+
+/*
+ * The compatibility block below is built only while LUSTRE_VERSION_CODE is
+ * older than 2.7.50.0; from that version on the #warning asks for its
+ * removal.
+ */
+#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 7, 50, 0)
+/* Compatibility for old (1.8) compiled userspace quota code */
+struct if_quotactl_18 {
+	__u32		   qc_cmd;
+	__u32		   qc_type;
+	__u32		   qc_id;
+	__u32		   qc_stat;
+	struct obd_dqinfo       qc_dqinfo;
+	struct obd_dqblk	qc_dqblk;
+	char		    obd_type[16];
+	struct obd_uuid	 obd_uuid;
+};
+/* NOTE(review): the type argument is a pointer type, so the size encoded in
+ * the ioctl number is that of a pointer, not of the struct.  This is part
+ * of the userspace ABI -- do not "fix" it. */
+#define LL_IOC_QUOTACTL_18	      _IOWR('f', 162, struct if_quotactl_18 *)
+/* End compatibility for old (1.8) compiled userspace quota code */
+#else
+#warning "remove old LL_IOC_QUOTACTL_18 compatibility code"
+#endif /* LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 7, 50, 0) */
+
+/*
+ * Special layout generation values, taken from the top of the __u32 range.
+ * NOTE(review): presumably compared against the gen reported by
+ * ll_layout_refresh() -- confirm at the users.
+ */
+enum {
+	LL_LAYOUT_GEN_NONE  = ((__u32)-2),	/* layout lock was cancelled */
+	LL_LAYOUT_GEN_EMPTY = ((__u32)-1)	/* for empty layout */
+};
+
+int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf);
+int ll_layout_refresh(struct inode *inode, __u32 *gen);
+
+#endif /* LLITE_INTERNAL_H */

diff --git a/drivers/staging/lustre/lustre/llite/llite_lib.c b/drivers/staging/lustre/lustre/llite/llite_lib.c
new file mode 100644
index 0000000..2311b20
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/llite_lib.c

@@ -0,0 +1,2408 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/llite/llite_lib.c
+ *
+ * Lustre Light Super operations
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/version.h>
+#include <linux/mm.h>
+
+#include <lustre_lite.h>
+#include <lustre_ha.h>
+#include <lustre_dlm.h>
+#include <lprocfs_status.h>
+#include <lustre_disk.h>
+#include <lustre_param.h>
+#include <lustre_log.h>
+#include <cl_object.h>
+#include <obd_cksum.h>
+#include "llite_internal.h"
+
+struct kmem_cache *ll_file_data_slab;
+
+LIST_HEAD(ll_super_blocks);
+DEFINE_SPINLOCK(ll_sb_lock);
+
+#ifndef MS_HAS_NEW_AOPS
+extern struct address_space_operations ll_aops;
+#else
+extern struct address_space_operations_ext ll_aops;
+#endif
+
+#ifndef log2
+#define log2(n) ffz(~(n))
+#endif
+
+/*
+ * Allocate and initialise a client ll_sb_info.
+ *
+ * Sets up the locks, sizes the page LRU and read-ahead limits from the
+ * amount of non-highmem RAM, generates the super block UUID, links the new
+ * sbi onto the global ll_super_blocks list and switches on the default
+ * feature flags.
+ *
+ * Returns the new sbi, or NULL when the allocation fails.
+ */
+static struct ll_sb_info *ll_init_sbi(void)
+{
+	struct ll_sb_info *sbi = NULL;
+	unsigned long lowmem_pages;
+	unsigned long lru_max;
+	struct sysinfo si;
+	class_uuid_t uuid;
+	int idx;
+	ENTRY;
+
+	OBD_ALLOC(sbi, sizeof(*sbi));
+	if (sbi == NULL)
+		RETURN(NULL);
+
+	spin_lock_init(&sbi->ll_lock);
+	mutex_init(&sbi->ll_lco.lco_lock);
+	spin_lock_init(&sbi->ll_pp_extent_lock);
+	spin_lock_init(&sbi->ll_process_lock);
+	sbi->ll_rw_stats_on = 0;
+
+	/*
+	 * LRU limit: half of the non-highmem pages when the shifted page
+	 * count is below 512 (i.e. under 512MB, assuming totalram counts
+	 * PAGE_CACHE_SIZE pages), three quarters of them otherwise.
+	 */
+	si_meminfo(&si);
+	lowmem_pages = si.totalram - si.totalhigh;
+	lru_max = (lowmem_pages >> (20 - PAGE_CACHE_SHIFT) < 512) ?
+		  lowmem_pages / 2 : (lowmem_pages / 4) * 3;
+
+	/* initialize lru data */
+	atomic_set(&sbi->ll_cache.ccc_users, 0);
+	sbi->ll_cache.ccc_lru_max = lru_max;
+	atomic_set(&sbi->ll_cache.ccc_lru_left, lru_max);
+	spin_lock_init(&sbi->ll_cache.ccc_lru_lock);
+	INIT_LIST_HEAD(&sbi->ll_cache.ccc_lru);
+
+	/* read-ahead limits */
+	sbi->ll_ra_info.ra_max_pages_per_file = min(lowmem_pages / 32,
+					   SBI_DEFAULT_READAHEAD_MAX);
+	sbi->ll_ra_info.ra_max_pages = sbi->ll_ra_info.ra_max_pages_per_file;
+	sbi->ll_ra_info.ra_max_read_ahead_whole_pages =
+					   SBI_DEFAULT_READAHEAD_WHOLE_MAX;
+	INIT_LIST_HEAD(&sbi->ll_conn_chain);
+	INIT_LIST_HEAD(&sbi->ll_orphan_dentry_list);
+
+	ll_generate_random_uuid(uuid);
+	class_uuid_unparse(uuid, &sbi->ll_sb_uuid);
+	CDEBUG(D_CONFIG, "generated uuid: %s\n", sbi->ll_sb_uuid.uuid);
+
+	/* publish on the global super block list */
+	spin_lock(&ll_sb_lock);
+	list_add_tail(&sbi->ll_list, &ll_super_blocks);
+	spin_unlock(&ll_sb_lock);
+
+	sbi->ll_flags |= LL_SBI_VERBOSE | LL_SBI_CHECKSUM | LL_SBI_LRU_RESIZE;
+
+	/* one lock per read and per write extent histogram */
+	for (idx = 0; idx <= LL_PROCESS_HIST_MAX; idx++) {
+		spin_lock_init(&sbi->ll_rw_extents_info.pp_extents[idx].
+			       pp_r_hist.oh_lock);
+		spin_lock_init(&sbi->ll_rw_extents_info.pp_extents[idx].
+			       pp_w_hist.oh_lock);
+	}
+
+	/* metadata statahead is enabled by default */
+	sbi->ll_sa_max = LL_SA_RPC_DEF;
+	atomic_set(&sbi->ll_sa_total, 0);
+	atomic_set(&sbi->ll_sa_wrong, 0);
+	atomic_set(&sbi->ll_agl_total, 0);
+	sbi->ll_flags |= LL_SBI_AGL_ENABLED;
+
+	RETURN(sbi);
+}
+
+/*
+ * Unlink the ll_sb_info of @sb from the global ll_super_blocks list (under
+ * ll_sb_lock) and free it.  Does nothing when @sb has no sbi.
+ */
+void ll_free_sbi(struct super_block *sb)
+{
+	struct ll_sb_info *sbi = ll_s2sbi(sb);
+	ENTRY;
+
+	if (sbi == NULL)
+		goto out;
+
+	spin_lock(&ll_sb_lock);
+	list_del(&sbi->ll_list);
+	spin_unlock(&ll_sb_lock);
+	OBD_FREE(sbi, sizeof(*sbi));
+out:
+	EXIT;
+}
+
+/*
+ * Dentry operations table: name comparison through ll_dcompare() and
+ * revalidation through ll_revalidate_nd().
+ * NOTE(review): the name suggests it is installed on the root dentry; the
+ * user is not visible in this part of the file -- confirm.
+ */
+static struct dentry_operations ll_d_root_ops = {
+	.d_compare = ll_dcompare,
+	.d_revalidate = ll_revalidate_nd,
+};
+
+static int client_common_fill_super(struct super_block *sb, char *md, char *dt,
+				    struct vfsmount *mnt)
+{
+	struct inode *root = 0;
+	struct ll_sb_info *sbi = ll_s2sbi(sb);
+	struct obd_device *obd;
+	struct obd_capa *oc = NULL;
+	struct obd_statfs *osfs = NULL;
+	struct ptlrpc_request *request = NULL;
+	struct obd_connect_data *data = NULL;
+	struct obd_uuid *uuid;
+	struct md_op_data *op_data;
+	struct lustre_md lmd;
+	obd_valid valid;
+	int size, err, checksum;
+	ENTRY;
+
+	obd = class_name2obd(md);
+	if (!obd) {
+		CERROR("MD %s: not setup or attached\n", md);
+		RETURN(-EINVAL);
+	}
+
+	OBD_ALLOC_PTR(data);
+	if (data == NULL)
+		RETURN(-ENOMEM);
+
+	OBD_ALLOC_PTR(osfs);
+	if (osfs == NULL) {
+		OBD_FREE_PTR(data);
+		RETURN(-ENOMEM);
+	}
+
+	if (proc_lustre_fs_root) {
+		err = lprocfs_register_mountpoint(proc_lustre_fs_root, sb,
+						  dt, md);
+		if (err < 0)
+			CERROR("could not register mount in /proc/fs/lustre\n");
+	}
+
+	/* indicate the features supported by this client */
+	data->ocd_connect_flags = OBD_CONNECT_IBITS    | OBD_CONNECT_NODEVOH  |
+				  OBD_CONNECT_ATTRFID  |
+				  OBD_CONNECT_VERSION  | OBD_CONNECT_BRW_SIZE |
+				  OBD_CONNECT_MDS_CAPA | OBD_CONNECT_OSS_CAPA |
+				  OBD_CONNECT_CANCELSET | OBD_CONNECT_FID     |
+				  OBD_CONNECT_AT       | OBD_CONNECT_LOV_V3   |
+				  OBD_CONNECT_RMT_CLIENT | OBD_CONNECT_VBR    |
+				  OBD_CONNECT_FULL20   | OBD_CONNECT_64BITHASH|
+				  OBD_CONNECT_EINPROGRESS |
+				  OBD_CONNECT_JOBSTATS | OBD_CONNECT_LVB_TYPE |
+				  OBD_CONNECT_LAYOUTLOCK | OBD_CONNECT_PINGLESS;
+
+	if (sbi->ll_flags & LL_SBI_SOM_PREVIEW)
+		data->ocd_connect_flags |= OBD_CONNECT_SOM;
+
+	if (sbi->ll_flags & LL_SBI_LRU_RESIZE)
+		data->ocd_connect_flags |= OBD_CONNECT_LRU_RESIZE;
+#ifdef CONFIG_FS_POSIX_ACL
+	data->ocd_connect_flags |= OBD_CONNECT_ACL | OBD_CONNECT_UMASK;
+#endif
+
+	if (OBD_FAIL_CHECK(OBD_FAIL_MDC_LIGHTWEIGHT))
+		/* flag mdc connection as lightweight, only used for test
+		 * purpose, use with care */
+		data->ocd_connect_flags |= OBD_CONNECT_LIGHTWEIGHT;
+
+	data->ocd_ibits_known = MDS_INODELOCK_FULL;
+	data->ocd_version = LUSTRE_VERSION_CODE;
+
+	if (sb->s_flags & MS_RDONLY)
+		data->ocd_connect_flags |= OBD_CONNECT_RDONLY;
+	if (sbi->ll_flags & LL_SBI_USER_XATTR)
+		data->ocd_connect_flags |= OBD_CONNECT_XATTR;
+
+#ifdef HAVE_MS_FLOCK_LOCK
+	/* force vfs to use lustre handler for flock() calls - bug 10743 */
+	sb->s_flags |= MS_FLOCK_LOCK;
+#endif
+#ifdef MS_HAS_NEW_AOPS
+	sb->s_flags |= MS_HAS_NEW_AOPS;
+#endif
+
+	if (sbi->ll_flags & LL_SBI_FLOCK)
+		sbi->ll_fop = &ll_file_operations_flock;
+	else if (sbi->ll_flags & LL_SBI_LOCALFLOCK)
+		sbi->ll_fop = &ll_file_operations;
+	else
+		sbi->ll_fop = &ll_file_operations_noflock;
+
+	/* real client */
+	data->ocd_connect_flags |= OBD_CONNECT_REAL;
+	if (sbi->ll_flags & LL_SBI_RMT_CLIENT)
+		data->ocd_connect_flags |= OBD_CONNECT_RMT_CLIENT_FORCE;
+
+	data->ocd_brw_size = MD_MAX_BRW_SIZE;
+
+	err = obd_connect(NULL, &sbi->ll_md_exp, obd, &sbi->ll_sb_uuid, data, NULL);
+	if (err == -EBUSY) {
+		LCONSOLE_ERROR_MSG(0x14f, "An MDT (md %s) is performing "
+				   "recovery, of which this client is not a "
+				   "part. Please wait for recovery to complete,"
+				   " abort, or time out.\n", md);
+		GOTO(out, err);
+	} else if (err) {
+		CERROR("cannot connect to %s: rc = %d\n", md, err);
+		GOTO(out, err);
+	}
+
+	sbi->ll_md_exp->exp_connect_data = *data;
+
+	err = obd_fid_init(sbi->ll_md_exp->exp_obd, sbi->ll_md_exp,
+			   LUSTRE_SEQ_METADATA);
+	if (err) {
+		CERROR("%s: Can't init metadata layer FID infrastructure, "
+		       "rc = %d\n", sbi->ll_md_exp->exp_obd->obd_name, err);
+		GOTO(out_md, err);
+	}
+
+	/* For mount, we only need fs info from MDT0, and also in DNE, it
+	 * can make sure the client can be mounted as long as MDT0 is
+	 * avaible */
+	err = obd_statfs(NULL, sbi->ll_md_exp, osfs,
+			cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+			OBD_STATFS_FOR_MDT0);
+	if (err)
+		GOTO(out_md_fid, err);
+
+	/* This needs to be after statfs to ensure connect has finished.
+	 * Note that "data" does NOT contain the valid connect reply.
+	 * If connecting to a 1.8 server there will be no LMV device, so
+	 * we can access the MDC export directly and exp_connect_flags will
+	 * be non-zero, but if accessing an upgraded 2.1 server it will
+	 * have the correct flags filled in.
+	 * XXX: fill in the LMV exp_connect_flags from MDC(s). */
+	valid = exp_connect_flags(sbi->ll_md_exp) & CLIENT_CONNECT_MDT_REQD;
+	if (exp_connect_flags(sbi->ll_md_exp) != 0 &&
+	    valid != CLIENT_CONNECT_MDT_REQD) {
+		char *buf;
+
+		OBD_ALLOC_WAIT(buf, PAGE_CACHE_SIZE);
+		obd_connect_flags2str(buf, PAGE_CACHE_SIZE,
+				      valid ^ CLIENT_CONNECT_MDT_REQD, ",");
+		LCONSOLE_ERROR_MSG(0x170, "Server %s does not support "
+				   "feature(s) needed for correct operation "
+				   "of this client (%s). Please upgrade "
+				   "server or downgrade client.\n",
+				   sbi->ll_md_exp->exp_obd->obd_name, buf);
+		OBD_FREE(buf, PAGE_CACHE_SIZE);
+		GOTO(out_md_fid, err = -EPROTO);
+	}
+
+	size = sizeof(*data);
+	err = obd_get_info(NULL, sbi->ll_md_exp, sizeof(KEY_CONN_DATA),
+			   KEY_CONN_DATA,  &size, data, NULL);
+	if (err) {
+		CERROR("%s: Get connect data failed: rc = %d\n",
+		       sbi->ll_md_exp->exp_obd->obd_name, err);
+		GOTO(out_md_fid, err);
+	}
+
+	LASSERT(osfs->os_bsize);
+	sb->s_blocksize = osfs->os_bsize;
+	sb->s_blocksize_bits = log2(osfs->os_bsize);
+	sb->s_magic = LL_SUPER_MAGIC;
+	sb->s_maxbytes = MAX_LFS_FILESIZE;
+	sbi->ll_namelen = osfs->os_namelen;
+	sbi->ll_max_rw_chunk = LL_DEFAULT_MAX_RW_CHUNK;
+
+	if ((sbi->ll_flags & LL_SBI_USER_XATTR) &&
+	    !(data->ocd_connect_flags & OBD_CONNECT_XATTR)) {
+		LCONSOLE_INFO("Disabling user_xattr feature because "
+			      "it is not supported on the server\n");
+		sbi->ll_flags &= ~LL_SBI_USER_XATTR;
+	}
+
+	if (data->ocd_connect_flags & OBD_CONNECT_ACL) {
+#ifdef MS_POSIXACL
+		sb->s_flags |= MS_POSIXACL;
+#endif
+		sbi->ll_flags |= LL_SBI_ACL;
+	} else {
+		LCONSOLE_INFO("client wants to enable acl, but mdt not!\n");
+#ifdef MS_POSIXACL
+		sb->s_flags &= ~MS_POSIXACL;
+#endif
+		sbi->ll_flags &= ~LL_SBI_ACL;
+	}
+
+	if (data->ocd_connect_flags & OBD_CONNECT_RMT_CLIENT) {
+		if (!(sbi->ll_flags & LL_SBI_RMT_CLIENT)) {
+			sbi->ll_flags |= LL_SBI_RMT_CLIENT;
+			LCONSOLE_INFO("client is set as remote by default.\n");
+		}
+	} else {
+		if (sbi->ll_flags & LL_SBI_RMT_CLIENT) {
+			sbi->ll_flags &= ~LL_SBI_RMT_CLIENT;
+			LCONSOLE_INFO("client claims to be remote, but server "
+				      "rejected, forced to be local.\n");
+		}
+	}
+
+	if (data->ocd_connect_flags & OBD_CONNECT_MDS_CAPA) {
+		LCONSOLE_INFO("client enabled MDS capability!\n");
+		sbi->ll_flags |= LL_SBI_MDS_CAPA;
+	}
+
+	if (data->ocd_connect_flags & OBD_CONNECT_OSS_CAPA) {
+		LCONSOLE_INFO("client enabled OSS capability!\n");
+		sbi->ll_flags |= LL_SBI_OSS_CAPA;
+	}
+
+	if (data->ocd_connect_flags & OBD_CONNECT_64BITHASH)
+		sbi->ll_flags |= LL_SBI_64BIT_HASH;
+
+	if (data->ocd_connect_flags & OBD_CONNECT_BRW_SIZE)
+		sbi->ll_md_brw_size = data->ocd_brw_size;
+	else
+		sbi->ll_md_brw_size = PAGE_CACHE_SIZE;
+
+	if (data->ocd_connect_flags & OBD_CONNECT_LAYOUTLOCK) {
+		LCONSOLE_INFO("Layout lock feature supported.\n");
+		sbi->ll_flags |= LL_SBI_LAYOUT_LOCK;
+	}
+
+	obd = class_name2obd(dt);
+	if (!obd) {
+		CERROR("DT %s: not setup or attached\n", dt);
+		GOTO(out_md_fid, err = -ENODEV);
+	}
+
+	data->ocd_connect_flags = OBD_CONNECT_GRANT     | OBD_CONNECT_VERSION  |
+				  OBD_CONNECT_REQPORTAL | OBD_CONNECT_BRW_SIZE |
+				  OBD_CONNECT_CANCELSET | OBD_CONNECT_FID      |
+				  OBD_CONNECT_SRVLOCK   | OBD_CONNECT_TRUNCLOCK|
+				  OBD_CONNECT_AT | OBD_CONNECT_RMT_CLIENT |
+				  OBD_CONNECT_OSS_CAPA | OBD_CONNECT_VBR|
+				  OBD_CONNECT_FULL20 | OBD_CONNECT_64BITHASH |
+				  OBD_CONNECT_MAXBYTES |
+				  OBD_CONNECT_EINPROGRESS |
+				  OBD_CONNECT_JOBSTATS | OBD_CONNECT_LVB_TYPE |
+				  OBD_CONNECT_LAYOUTLOCK | OBD_CONNECT_PINGLESS;
+
+	if (sbi->ll_flags & LL_SBI_SOM_PREVIEW)
+		data->ocd_connect_flags |= OBD_CONNECT_SOM;
+
+	if (!OBD_FAIL_CHECK(OBD_FAIL_OSC_CONNECT_CKSUM)) {
+		/* OBD_CONNECT_CKSUM should always be set, even if checksums are
+		 * disabled by default, because it can still be enabled on the
+		 * fly via /proc. As a consequence, we still need to come to an
+		 * agreement on the supported algorithms at connect time */
+		data->ocd_connect_flags |= OBD_CONNECT_CKSUM;
+
+		if (OBD_FAIL_CHECK(OBD_FAIL_OSC_CKSUM_ADLER_ONLY))
+			data->ocd_cksum_types = OBD_CKSUM_ADLER;
+		else
+			data->ocd_cksum_types = cksum_types_supported_client();
+	}
+
+	data->ocd_connect_flags |= OBD_CONNECT_LRU_RESIZE;
+	if (sbi->ll_flags & LL_SBI_RMT_CLIENT)
+		data->ocd_connect_flags |= OBD_CONNECT_RMT_CLIENT_FORCE;
+
+	CDEBUG(D_RPCTRACE, "ocd_connect_flags: "LPX64" ocd_version: %d "
+	       "ocd_grant: %d\n", data->ocd_connect_flags,
+	       data->ocd_version, data->ocd_grant);
+
+	obd->obd_upcall.onu_owner = &sbi->ll_lco;
+	obd->obd_upcall.onu_upcall = cl_ocd_update;
+
+	data->ocd_brw_size = DT_MAX_BRW_SIZE;
+
+	err = obd_connect(NULL, &sbi->ll_dt_exp, obd, &sbi->ll_sb_uuid, data,
+			  NULL);
+	if (err == -EBUSY) {
+		LCONSOLE_ERROR_MSG(0x150, "An OST (dt %s) is performing "
+				   "recovery, of which this client is not a "
+				   "part.  Please wait for recovery to "
+				   "complete, abort, or time out.\n", dt);
+		GOTO(out_md, err);
+	} else if (err) {
+		CERROR("%s: Cannot connect to %s: rc = %d\n",
+		       sbi->ll_dt_exp->exp_obd->obd_name, dt, err);
+		GOTO(out_md, err);
+	}
+
+	sbi->ll_dt_exp->exp_connect_data = *data;
+
+	err = obd_fid_init(sbi->ll_dt_exp->exp_obd, sbi->ll_dt_exp,
+			   LUSTRE_SEQ_METADATA);
+	if (err) {
+		CERROR("%s: Can't init data layer FID infrastructure, "
+		       "rc = %d\n", sbi->ll_dt_exp->exp_obd->obd_name, err);
+		GOTO(out_dt, err);
+	}
+
+	mutex_lock(&sbi->ll_lco.lco_lock);
+	sbi->ll_lco.lco_flags = data->ocd_connect_flags;
+	sbi->ll_lco.lco_md_exp = sbi->ll_md_exp;
+	sbi->ll_lco.lco_dt_exp = sbi->ll_dt_exp;
+	mutex_unlock(&sbi->ll_lco.lco_lock);
+
+	fid_zero(&sbi->ll_root_fid);
+	err = md_getstatus(sbi->ll_md_exp, &sbi->ll_root_fid, &oc);
+	if (err) {
+		CERROR("cannot mds_connect: rc = %d\n", err);
+		GOTO(out_lock_cn_cb, err);
+	}
+	if (!fid_is_sane(&sbi->ll_root_fid)) {
+		CERROR("%s: Invalid root fid "DFID" during mount\n",
+		       sbi->ll_md_exp->exp_obd->obd_name,
+		       PFID(&sbi->ll_root_fid));
+		GOTO(out_lock_cn_cb, err = -EINVAL);
+	}
+	CDEBUG(D_SUPER, "rootfid "DFID"\n", PFID(&sbi->ll_root_fid));
+
+	sb->s_op = &lustre_super_operations;
+#if THREAD_SIZE >= 8192 /*b=17630*/
+	sb->s_export_op = &lustre_export_operations;
+#endif
+
+	/* make root inode
+	 * XXX: move this to after cbd setup? */
+	valid = OBD_MD_FLGETATTR | OBD_MD_FLBLOCKS | OBD_MD_FLMDSCAPA;
+	if (sbi->ll_flags & LL_SBI_RMT_CLIENT)
+		valid |= OBD_MD_FLRMTPERM;
+	else if (sbi->ll_flags & LL_SBI_ACL)
+		valid |= OBD_MD_FLACL;
+
+	OBD_ALLOC_PTR(op_data);
+	if (op_data == NULL)
+		GOTO(out_lock_cn_cb, err = -ENOMEM);
+
+	op_data->op_fid1 = sbi->ll_root_fid;
+	op_data->op_mode = 0;
+	op_data->op_capa1 = oc;
+	op_data->op_valid = valid;
+
+	err = md_getattr(sbi->ll_md_exp, op_data, &request);
+	if (oc)
+		capa_put(oc);
+	OBD_FREE_PTR(op_data);
+	if (err) {
+		CERROR("%s: md_getattr failed for root: rc = %d\n",
+		       sbi->ll_md_exp->exp_obd->obd_name, err);
+		GOTO(out_lock_cn_cb, err);
+	}
+
+	err = md_get_lustre_md(sbi->ll_md_exp, request, sbi->ll_dt_exp,
+			       sbi->ll_md_exp, &lmd);
+	if (err) {
+		CERROR("failed to understand root inode md: rc = %d\n", err);
+		ptlrpc_req_finished(request);
+		GOTO(out_lock_cn_cb, err);
+	}
+
+	LASSERT(fid_is_sane(&sbi->ll_root_fid));
+	root = ll_iget(sb, cl_fid_build_ino(&sbi->ll_root_fid,
+					    sbi->ll_flags & LL_SBI_32BIT_API),
+		       &lmd);
+	md_free_lustre_md(sbi->ll_md_exp, &lmd);
+	ptlrpc_req_finished(request);
+
+	if (root == NULL || IS_ERR(root)) {
+		if (lmd.lsm)
+			obd_free_memmd(sbi->ll_dt_exp, &lmd.lsm);
+#ifdef CONFIG_FS_POSIX_ACL
+		if (lmd.posix_acl) {
+			posix_acl_release(lmd.posix_acl);
+			lmd.posix_acl = NULL;
+		}
+#endif
+		err = IS_ERR(root) ? PTR_ERR(root) : -EBADF;
+		root = NULL;
+		CERROR("lustre_lite: bad iget4 for root\n");
+		GOTO(out_root, err);
+	}
+
+	err = ll_close_thread_start(&sbi->ll_lcq);
+	if (err) {
+		CERROR("cannot start close thread: rc %d\n", err);
+		GOTO(out_root, err);
+	}
+
+#ifdef CONFIG_FS_POSIX_ACL
+	if (sbi->ll_flags & LL_SBI_RMT_CLIENT) {
+		rct_init(&sbi->ll_rct);
+		et_init(&sbi->ll_et);
+	}
+#endif
+
+	checksum = sbi->ll_flags & LL_SBI_CHECKSUM;
+	err = obd_set_info_async(NULL, sbi->ll_dt_exp, sizeof(KEY_CHECKSUM),
+				 KEY_CHECKSUM, sizeof(checksum), &checksum,
+				 NULL);
+	cl_sb_init(sb);
+
+	err = obd_set_info_async(NULL, sbi->ll_dt_exp, sizeof(KEY_CACHE_SET),
+				 KEY_CACHE_SET, sizeof(sbi->ll_cache),
+				 &sbi->ll_cache, NULL);
+
+	sb->s_root = d_make_root(root);
+	if (sb->s_root == NULL) {
+		CERROR("%s: can't make root dentry\n",
+			ll_get_fsname(sb, NULL, 0));
+		GOTO(out_root, err = -ENOMEM);
+	}
+
+	/* kernel >= 2.6.38 store dentry operations in sb->s_d_op. */
+	d_set_d_op(sb->s_root, &ll_d_root_ops);
+	sb->s_d_op = &ll_d_ops;
+
+	sbi->ll_sdev_orig = sb->s_dev;
+
+	/* We set sb->s_dev equal on all lustre clients in order to support
+	 * NFS export clustering.  NFSD requires that the FSID be the same
+	 * on all clients. */
+	/* s_dev is also used in lt_compare() to compare two fs, but that is
+	 * only a node-local comparison. */
+	uuid = obd_get_uuid(sbi->ll_md_exp);
+	if (uuid != NULL)
+		sb->s_dev = get_uuid2int(uuid->uuid, strlen(uuid->uuid));
+
+	if (data != NULL)
+		OBD_FREE_PTR(data);
+	if (osfs != NULL)
+		OBD_FREE_PTR(osfs);
+
+	RETURN(err);
+out_root:
+	if (root)
+		iput(root);
+out_lock_cn_cb:
+	obd_fid_fini(sbi->ll_dt_exp->exp_obd);
+out_dt:
+	obd_disconnect(sbi->ll_dt_exp);
+	sbi->ll_dt_exp = NULL;
+	/* Make sure all OScs are gone, since cl_cache is accessing sbi. */
+	obd_zombie_barrier();
+out_md_fid:
+	obd_fid_fini(sbi->ll_md_exp->exp_obd);
+out_md:
+	obd_disconnect(sbi->ll_md_exp);
+	sbi->ll_md_exp = NULL;
+out:
+	if (data != NULL)
+		OBD_FREE_PTR(data);
+	if (osfs != NULL)
+		OBD_FREE_PTR(osfs);
+	lprocfs_unregister_mountpoint(sbi);
+	return err;
+}
+
+/* Query the maximum metadata (EA) size for this mount.
+ *
+ * *lmmsize is first seeded with obd_size_diskmd() of the data export and is
+ * then passed to obd_get_info(KEY_MAX_EASIZE) on the metadata export as the
+ * value buffer, so it is also what the caller sees if that call fails.
+ *
+ * Returns the result of obd_get_info(); a non-zero result is logged. */
+int ll_get_max_mdsize(struct ll_sb_info *sbi, int *lmmsize)
+{
+	int size, rc;
+	/* pairs with the RETURN() at the end of the function */
+	ENTRY;
+
+	*lmmsize = obd_size_diskmd(sbi->ll_dt_exp, NULL);
+	size = sizeof(int);
+	rc = obd_get_info(NULL, sbi->ll_md_exp, sizeof(KEY_MAX_EASIZE),
+			  KEY_MAX_EASIZE, &size, lmmsize, NULL);
+	if (rc)
+		CERROR("Get max mdsize error rc %d\n", rc);
+
+	RETURN(rc);
+}
+
+/* Debug helper: log a one-line description of an inode (metadata export
+ * device name, inode number, mode, reference count) together with the number
+ * of entries found on its i_dentry list.  inode must not be NULL. */
+void ll_dump_inode(struct inode *inode)
+{
+	struct ll_d_hlist_node *pos;
+	int nr_aliases = 0;
+
+	LASSERT(inode != NULL);
+
+	/* only the length of the list is of interest here */
+	ll_d_hlist_for_each(pos, &inode->i_dentry) {
+		nr_aliases++;
+	}
+
+	CERROR("inode %p dump: dev=%s ino=%lu mode=%o count=%u, %d dentries\n",
+	       inode,
+	       ll_i2mdexp(inode)->exp_obd->obd_name,
+	       inode->i_ino,
+	       inode->i_mode,
+	       atomic_read(&inode->i_count),
+	       nr_aliases);
+}
+
+/* Debug helper: log a description of a dentry (name, parent, inode, flags,
+ * number of children), dump its inode if it has one, and then do the same
+ * for its children.  recur is the number of additional levels to descend;
+ * 0 dumps only this dentry.  dentry must not be NULL. */
+void lustre_dump_dentry(struct dentry *dentry, int recur)
+{
+	struct list_head *pos;
+	int nr_children = 0;
+
+	LASSERT(dentry != NULL);
+
+	list_for_each(pos, &dentry->d_subdirs) {
+		nr_children++;
+	}
+
+	CERROR("dentry %p dump: name=%.*s parent=%.*s (%p), inode=%p, count=%u,"
+	       " flags=0x%x, fsdata=%p, %d subdirs\n",
+	       dentry,
+	       dentry->d_name.len, dentry->d_name.name,
+	       dentry->d_parent->d_name.len, dentry->d_parent->d_name.name,
+	       dentry->d_parent,
+	       dentry->d_inode,
+	       d_refcount(dentry),
+	       dentry->d_flags,
+	       dentry->d_fsdata,
+	       nr_children);
+
+	if (dentry->d_inode != NULL)
+		ll_dump_inode(dentry->d_inode);
+
+	/* descend one level per remaining unit of recursion depth */
+	if (recur != 0) {
+		list_for_each(pos, &dentry->d_subdirs) {
+			struct dentry *child;
+
+			child = list_entry(pos, struct dentry, d_u.d_child);
+			lustre_dump_dentry(child, recur - 1);
+		}
+	}
+}
+
+/* Tear down the connection-level state of a client superblock: shut down the
+ * close thread, call cl_sb_fini(), then finalise and disconnect the data
+ * export (ll_dt_exp) followed by the metadata export (ll_md_exp).  Both
+ * export pointers in the sbi are NULL afterwards.
+ *
+ * ll_put_super() calls this only when sbi->ll_lcq is set. */
+void client_common_put_super(struct super_block *sb)
+{
+	struct ll_sb_info *sbi = ll_s2sbi(sb);
+	ENTRY;
+
+#ifdef CONFIG_FS_POSIX_ACL
+	/* these are only initialised for remote clients at mount time */
+	if (sbi->ll_flags & LL_SBI_RMT_CLIENT) {
+		et_fini(&sbi->ll_et);
+		rct_fini(&sbi->ll_rct);
+	}
+#endif
+
+	ll_close_thread_shutdown(sbi->ll_lcq);
+
+	cl_sb_fini(sb);
+
+	list_del(&sbi->ll_conn_chain);
+
+	/* data side first ... */
+	obd_fid_fini(sbi->ll_dt_exp->exp_obd);
+	obd_disconnect(sbi->ll_dt_exp);
+	sbi->ll_dt_exp = NULL;
+	/* wait till all OSCs are gone, since cl_cache is accessing sbi.
+	 * see LU-2543. */
+	obd_zombie_barrier();
+
+	lprocfs_unregister_mountpoint(sbi);
+
+	/* ... then the metadata side */
+	obd_fid_fini(sbi->ll_md_exp->exp_obd);
+	obd_disconnect(sbi->ll_md_exp);
+	sbi->ll_md_exp = NULL;
+
+	EXIT;
+}
+
+/* Early unmount hook.  For a superblock that is marked MS_ACTIVE, put back
+ * the s_dev value that was saved in ll_sdev_orig at mount time and flag the
+ * mount as unmounting.  A superblock that is not active is left untouched. */
+void ll_kill_super(struct super_block *sb)
+{
+	struct ll_sb_info *sbi;
+
+	ENTRY;
+
+	/* superblock was never fully initialised? */
+	if (!(sb->s_flags & MS_ACTIVE)) {
+		/* keep ENTRY/EXIT tracing balanced on the early exit too */
+		EXIT;
+		return;
+	}
+
+	sbi = ll_s2sbi(sb);
+	/* s_dev was changed for clustered NFS and has to be restored before
+	 * put_super, because new kernels have cached s_dev and changing
+	 * sb->s_dev in put_super does not affect the real device removal */
+	if (sbi) {
+		sb->s_dev = sbi->ll_sdev_orig;
+		sbi->ll_umounting = 1;
+	}
+	EXIT;
+}
+
+/* Extract the value of a "name=value" style option.
+ *
+ * Returns NULL when data does not start with opt, when data contains no '='
+ * at all, or when the allocation fails.  Otherwise returns a newly
+ * OBD_ALLOC'ed, NUL-terminated copy of everything following the first '=' in
+ * data; the allocation size is the length of that copy plus one. */
+char *ll_read_opt(const char *opt, char *data)
+{
+	char *eq;
+	char *copy;
+	size_t nbytes;
+	ENTRY;
+
+	CDEBUG(D_SUPER, "option: %s, data %s\n", opt, data);
+
+	if (strncmp(opt, data, strlen(opt)) != 0)
+		RETURN(NULL);
+
+	eq = strchr(data, '=');
+	if (eq == NULL)
+		RETURN(NULL);
+
+	/* size of the text after the '=' including its terminating NUL */
+	nbytes = strlen(eq + 1) + 1;
+	OBD_ALLOC(copy, nbytes);
+	if (copy == NULL) {
+		CERROR("out of memory!\n");
+		RETURN(NULL);
+	}
+
+	memcpy(copy, eq + 1, nbytes);
+	CDEBUG(D_SUPER, "Assigned option: %s, value %s\n", opt, copy);
+	RETURN(copy);
+}
+
+/* Return fl if data begins with the option name opt, otherwise 0.  Only the
+ * first strlen(opt) characters of data are compared, so anything following
+ * the name in data is not looked at. */
+static inline int ll_set_opt(const char *opt, char *data, int fl)
+{
+	const size_t optlen = strlen(opt);
+
+	return strncmp(opt, data, optlen) == 0 ? fl : 0;
+}
+
+/* Parse the client-specific mount options; non-client-specific mount options
+ * are parsed in lmd_parse.
+ *
+ * options is a comma-separated list and may be NULL, which is accepted as
+ * "nothing to do".  Each element is looked up with ll_set_opt(), i.e. the
+ * option name is compared against the start of the element, and the
+ * LL_SBI_* bits of the matching table entry are then set in or cleared from
+ * *flags.  The deprecated "acl"/"noacl" options are accepted with a console
+ * message and change nothing.
+ *
+ * Returns 0 on success.  At the first element that matches no known option
+ * a console error is printed and -EINVAL is returned; changes already made
+ * to *flags by earlier elements are not rolled back. */
+static int ll_options(char *options, int *flags)
+{
+	enum { OPT_SET, OPT_CLEAR, OPT_DEPRECATED };
+	static const struct {
+		const char	*name;	/* option name, NULL ends the table */
+		int		 bits;	/* LL_SBI_* bits the option controls */
+		int		 action;/* one of the OPT_* values above */
+	} known[] = {
+		{ "nolock",		LL_SBI_NOLCK,		OPT_SET },
+		{ "flock",		LL_SBI_FLOCK,		OPT_SET },
+		{ "localflock",		LL_SBI_LOCALFLOCK,	OPT_SET },
+		{ "noflock",		LL_SBI_FLOCK|LL_SBI_LOCALFLOCK,
+								OPT_CLEAR },
+		{ "user_xattr",		LL_SBI_USER_XATTR,	OPT_SET },
+		{ "nouser_xattr",	LL_SBI_USER_XATTR,	OPT_CLEAR },
+#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 5, 50, 0)
+		/* Ignore deprecated mount options.  The client will
+		 * always try to mount with ACL support, whether this
+		 * is used depends on whether server supports it. */
+		{ "acl",		LL_SBI_ACL,		OPT_DEPRECATED },
+		{ "noacl",		LL_SBI_ACL,		OPT_DEPRECATED },
+#else
+#warning "{no}acl options have been deprecated since 1.8, please remove them"
+#endif
+		{ "remote_client",	LL_SBI_RMT_CLIENT,	OPT_SET },
+		{ "user_fid2path",	LL_SBI_USER_FID2PATH,	OPT_SET },
+		{ "nouser_fid2path",	LL_SBI_USER_FID2PATH,	OPT_CLEAR },
+		{ "checksum",		LL_SBI_CHECKSUM,	OPT_SET },
+		{ "nochecksum",		LL_SBI_CHECKSUM,	OPT_CLEAR },
+		{ "lruresize",		LL_SBI_LRU_RESIZE,	OPT_SET },
+		{ "nolruresize",	LL_SBI_LRU_RESIZE,	OPT_CLEAR },
+		{ "lazystatfs",		LL_SBI_LAZYSTATFS,	OPT_SET },
+		{ "nolazystatfs",	LL_SBI_LAZYSTATFS,	OPT_CLEAR },
+		{ "som_preview",	LL_SBI_SOM_PREVIEW,	OPT_SET },
+		{ "32bitapi",		LL_SBI_32BIT_API,	OPT_SET },
+		{ "verbose",		LL_SBI_VERBOSE,		OPT_SET },
+		{ "noverbose",		LL_SBI_VERBOSE,		OPT_CLEAR },
+		{ NULL,			0,			0 }
+	};
+	char *s1 = options, *s2;
+	int i;
+	ENTRY;
+
+	if (!options)
+		RETURN(0);
+
+	CDEBUG(D_CONFIG, "Parsing opts %s\n", options);
+
+	while (*s1) {
+		CDEBUG(D_SUPER, "next opt=%s\n", s1);
+
+		/* first table entry whose name starts this element wins */
+		for (i = 0; known[i].name != NULL; i++) {
+			if (ll_set_opt(known[i].name, s1, known[i].bits))
+				break;
+		}
+		if (known[i].name == NULL) {
+			LCONSOLE_ERROR_MSG(0x152,
+					   "Unknown option '%s', won't mount.\n",
+					   s1);
+			RETURN(-EINVAL);
+		}
+
+		switch (known[i].action) {
+		case OPT_SET:
+			*flags |= known[i].bits;
+			break;
+		case OPT_CLEAR:
+			*flags &= ~known[i].bits;
+			break;
+		default: /* OPT_DEPRECATED: accepted, flags stay as they are */
+			LCONSOLE_ERROR_MSG(0x152, "Ignoring deprecated "
+					   "mount option '%s'.\n",
+					   known[i].name);
+			break;
+		}
+
+		/* Find next opt */
+		s2 = strchr(s1, ',');
+		if (s2 == NULL)
+			break;
+		s1 = s2 + 1;
+	}
+	RETURN(0);
+}
+
+/* Put the Lustre-private part of an inode into its initial state.
+ *
+ * lli_fid is deliberately not touched (it has been initialised already), and
+ * lli_vfs_inode.i_mode must already be non-zero because it decides whether
+ * the directory-only or the non-directory fields get initialised. */
+void ll_lli_init(struct ll_inode_info *lli)
+{
+	/* identity and generic state */
+	lli->lli_inode_magic = LLI_INODE_MAGIC;
+	lli->lli_flags = 0;
+	lli->lli_ioepoch = 0;
+	lli->lli_maxbytes = MAX_LFS_FILESIZE;
+	lli->lli_has_smd = false;
+	lli->lli_layout_gen = LL_LAYOUT_GEN_NONE;
+	lli->lli_clob = NULL;
+	/* Do not set lli_fid, it has been initialized already. */
+	fid_zero(&lli->lli_pfid);
+
+	/* locks that are set up for every inode */
+	spin_lock_init(&lli->lli_lock);
+	spin_lock_init(&lli->lli_agl_lock);
+	mutex_init(&lli->lli_rmtperm_mutex);
+	mutex_init(&lli->lli_och_mutex);
+
+	/* permissions and capabilities */
+	lli->lli_posix_acl = NULL;
+	lli->lli_remote_perms = NULL;
+	lli->lli_rmtperm_time = 0;
+	lli->lli_mds_capa = NULL;
+	INIT_LIST_HEAD(&lli->lli_oss_capas);
+
+	/* open/close bookkeeping */
+	INIT_LIST_HEAD(&lli->lli_close_list);
+	atomic_set(&lli->lli_open_count, 0);
+	lli->lli_pending_och = NULL;
+	lli->lli_mds_read_och = NULL;
+	lli->lli_mds_write_och = NULL;
+	lli->lli_mds_exec_och = NULL;
+	lli->lli_open_fd_read_count = 0;
+	lli->lli_open_fd_write_count = 0;
+	lli->lli_open_fd_exec_count = 0;
+
+	LASSERT(lli->lli_vfs_inode.i_mode != 0);
+	if (!S_ISDIR(lli->lli_vfs_inode.i_mode)) {
+		/* everything that is not a directory */
+		sema_init(&lli->lli_size_sem, 1);
+		lli->lli_size_sem_owner = NULL;
+		lli->lli_symlink_name = NULL;
+		init_rwsem(&lli->lli_trunc_sem);
+		mutex_init(&lli->lli_write_mutex);
+		init_rwsem(&lli->lli_glimpse_sem);
+		lli->lli_glimpse_time = 0;
+		INIT_LIST_HEAD(&lli->lli_agl_list);
+		lli->lli_agl_index = 0;
+		lli->lli_async_rc = 0;
+		lli->lli_volatile = false;
+	} else {
+		/* directories only */
+		mutex_init(&lli->lli_readdir_mutex);
+		lli->lli_opendir_key = NULL;
+		lli->lli_sai = NULL;
+		lli->lli_def_acl = NULL;
+		spin_lock_init(&lli->lli_sa_lock);
+		lli->lli_opendir_pid = 0;
+	}
+	mutex_init(&lli->lli_layout_mutex);
+}
+
+/* Name the backing device "lustre" and register it as "lustre-<n>", where
+ * <n> comes from a counter private to this function that is atomically
+ * incremented on every call.  Returns the result of bdi_register(). */
+static inline int ll_bdi_register(struct backing_dev_info *bdi)
+{
+	static atomic_t ll_bdi_num = ATOMIC_INIT(0);
+	int seq;
+
+	seq = atomic_inc_return(&ll_bdi_num);
+	bdi->name = "lustre";
+
+	return bdi_register(bdi, NULL, "lustre-%d", seq);
+}
+
+/* Set up a client superblock at mount time.
+ *
+ * Allocates the client sb info, parses the client mount options, initialises
+ * and registers the backing device info, processes the configuration log
+ * named by the profile, builds the per-mount names of the metadata and data
+ * devices ("<profile device>-<sb address>") and finally hands over to
+ * client_common_fill_super().
+ *
+ * Returns 0 on success or a negative errno.  If anything fails after the sb
+ * info has been allocated, ll_put_super() is called to undo the partial
+ * setup (this includes dropping the module reference taken here). */
+int ll_fill_super(struct super_block *sb, struct vfsmount *mnt)
+{
+	struct lustre_profile *lprof = NULL;
+	struct lustre_sb_info *lsi = s2lsi(sb);
+	struct ll_sb_info *sbi;
+	char  *dt = NULL, *md = NULL;
+	/* allocation sizes of dt/md, kept so that the free matches exactly */
+	size_t dt_size = 0, md_size = 0;
+	char  *profilenm = get_profile_name(sb);
+	struct config_llog_instance *cfg;
+	/* %p for void* in printf needs 16+2 characters: 0xffffffffffffffff */
+	const int instlen = sizeof(cfg->cfg_instance) * 2 + 2;
+	int    err;
+	ENTRY;
+
+	CDEBUG(D_VFSTRACE, "VFS Op: sb %p\n", sb);
+
+	OBD_ALLOC_PTR(cfg);
+	if (cfg == NULL)
+		RETURN(-ENOMEM);
+
+	/* NOTE(review): the result of try_module_get() is not checked */
+	try_module_get(THIS_MODULE);
+
+	/* client additional sb info */
+	lsi->lsi_llsbi = sbi = ll_init_sbi();
+	if (!sbi) {
+		module_put(THIS_MODULE);
+		OBD_FREE_PTR(cfg);
+		RETURN(-ENOMEM);
+	}
+
+	err = ll_options(lsi->lsi_lmd->lmd_opts, &sbi->ll_flags);
+	if (err)
+		GOTO(out_free, err);
+
+	err = bdi_init(&lsi->lsi_bdi);
+	if (err)
+		GOTO(out_free, err);
+	/* lets ll_put_super() know that the bdi has to be destroyed */
+	lsi->lsi_flags |= LSI_BDI_INITIALIZED;
+	lsi->lsi_bdi.capabilities = BDI_CAP_MAP_COPY;
+	err = ll_bdi_register(&lsi->lsi_bdi);
+	if (err)
+		GOTO(out_free, err);
+
+	sb->s_bdi = &lsi->lsi_bdi;
+
+	/* Generate a string unique to this super, in case some joker tries
+	   to mount the same fs at two mount points.
+	   Use the address of the super itself.*/
+	cfg->cfg_instance = sb;
+	cfg->cfg_uuid = lsi->lsi_llsbi->ll_sb_uuid;
+	cfg->cfg_callback = class_config_llog_handler;
+	/* set up client obds */
+	err = lustre_process_log(sb, profilenm, cfg);
+	if (err < 0) {
+		CERROR("Unable to process log: %d\n", err);
+		GOTO(out_free, err);
+	}
+
+	/* Profile set with LCFG_MOUNTOPT so we can find our mdc and osc obds */
+	lprof = class_get_profile(profilenm);
+	if (lprof == NULL) {
+		LCONSOLE_ERROR_MSG(0x156, "The client profile '%s' could not be"
+				   " read from the MGS.  Does that filesystem "
+				   "exist?\n", profilenm);
+		GOTO(out_free, err = -EINVAL);
+	}
+	CDEBUG(D_CONFIG, "Found profile %s: mdc=%s osc=%s\n", profilenm,
+	       lprof->lp_md, lprof->lp_dt);
+
+	/* name + '-' + printed instance pointer + NUL; the formatting is
+	 * bounded by the very size that was allocated */
+	dt_size = strlen(lprof->lp_dt) + instlen + 2;
+	OBD_ALLOC(dt, dt_size);
+	if (!dt)
+		GOTO(out_free, err = -ENOMEM);
+	snprintf(dt, dt_size, "%s-%p", lprof->lp_dt, cfg->cfg_instance);
+
+	md_size = strlen(lprof->lp_md) + instlen + 2;
+	OBD_ALLOC(md, md_size);
+	if (!md)
+		GOTO(out_free, err = -ENOMEM);
+	snprintf(md, md_size, "%s-%p", lprof->lp_md, cfg->cfg_instance);
+
+	/* connections, registrations, sb setup */
+	err = client_common_fill_super(sb, md, dt, mnt);
+
+out_free:
+	if (md)
+		OBD_FREE(md, md_size);
+	if (dt)
+		OBD_FREE(dt, dt_size);
+	if (err)
+		ll_put_super(sb);
+	else if (sbi->ll_flags & LL_SBI_VERBOSE)
+		LCONSOLE_WARN("Mounted %s\n", profilenm);
+
+	OBD_FREE_PTR(cfg);
+	RETURN(err);
+} /* ll_fill_super */
+
+/* Release everything that ll_fill_super() set up for this superblock.
+ *
+ * Ends the configuration log, propagates the "force" setting to all devices
+ * of this mount, undoes client_common_fill_super() if it had succeeded,
+ * cleans up the devices, deletes the profile, destroys the backing device
+ * info if it was initialised, frees the client sb info and drops the module
+ * reference.  Also used by ll_fill_super() to unwind a failed mount. */
+void ll_put_super(struct super_block *sb)
+{
+	struct config_llog_instance cfg;
+	struct obd_device *obd;
+	struct lustre_sb_info *lsi = s2lsi(sb);
+	struct ll_sb_info *sbi = ll_s2sbi(sb);
+	char *profilenm = get_profile_name(sb);
+	int next, force = 1;
+	ENTRY;
+
+	CDEBUG(D_VFSTRACE, "VFS Op: sb %p - %s\n", sb, profilenm);
+
+	ll_print_capa_stat(sbi);
+
+	/* Only cfg_instance is filled in here; zero the rest so that no
+	 * uninitialised stack contents are handed to lustre_end_log(). */
+	memset(&cfg, 0, sizeof(cfg));
+	cfg.cfg_instance = sb;
+	lustre_end_log(sb, profilenm, &cfg);
+
+	/* force defaults to 1 and is only taken from the MD device when the
+	 * metadata export exists */
+	if (sbi->ll_md_exp) {
+		obd = class_exp2obd(sbi->ll_md_exp);
+		if (obd)
+			force = obd->obd_force;
+	}
+
+	/* We need to set force before the lov_disconnect in
+	   lustre_common_put_super, since l_d cleans up osc's as well. */
+	if (force) {
+		next = 0;
+		while ((obd = class_devices_in_group(&sbi->ll_sb_uuid,
+						     &next)) != NULL) {
+			obd->obd_force = force;
+		}
+	}
+
+	if (sbi->ll_lcq) {
+		/* Only if client_common_fill_super succeeded */
+		client_common_put_super(sb);
+	}
+
+	next = 0;
+	while ((obd = class_devices_in_group(&sbi->ll_sb_uuid, &next)) !=NULL) {
+		class_manual_cleanup(obd);
+	}
+
+	if (sbi->ll_flags & LL_SBI_VERBOSE)
+		LCONSOLE_WARN("Unmounted %s\n", profilenm ? profilenm : "");
+
+	if (profilenm)
+		class_del_profile(profilenm);
+
+	/* the flag is set by ll_fill_super() once bdi_init() has succeeded */
+	if (lsi->lsi_flags & LSI_BDI_INITIALIZED) {
+		bdi_destroy(&lsi->lsi_bdi);
+		lsi->lsi_flags &= ~LSI_BDI_INITIALIZED;
+	}
+
+	ll_free_sbi(sb);
+	lsi->lsi_llsbi = NULL;
+
+	lustre_common_put_super(sb);
+
+	/* pairs with try_module_get() in ll_fill_super() */
+	module_put(THIS_MODULE);
+
+	EXIT;
+} /* ll_put_super */
+
+/* Return the inode recorded in the lock's resource (lr_lvb_inode) with a
+ * reference obtained through igrab(), or NULL if no inode is recorded, if
+ * its Lustre magic does not match (which is logged), or if igrab() itself
+ * returns NULL.  The resource and lock are held locked during the lookup. */
+struct inode *ll_inode_from_resource_lock(struct ldlm_lock *lock)
+{
+	struct inode *found = NULL;
+	struct inode *lvb_inode;
+
+	/* NOTE: we depend on atomic igrab() -bzzz */
+	lock_res_and_lock(lock);
+	lvb_inode = lock->l_resource->lr_lvb_inode;
+	if (lvb_inode) {
+		struct ll_inode_info *lli = ll_i2info(lvb_inode);
+
+		if (lli->lli_inode_magic != LLI_INODE_MAGIC) {
+			/* an inode that is being freed only rates D_INFO */
+			LDLM_DEBUG_LIMIT(lvb_inode->i_state & I_FREEING ?
+					 D_INFO : D_WARNING, lock,
+					 "lr_lvb_inode %p is bogus: magic %08x",
+					 lvb_inode, lli->lli_inode_magic);
+		} else {
+			found = igrab(lvb_inode);
+		}
+	}
+	unlock_res_and_lock(lock);
+
+	return found;
+}
+
+/* Return the inode stored in the lock's l_ast_data with a reference obtained
+ * through igrab(), or NULL if l_ast_data is unset, if the Lustre magic of
+ * that inode does not match (which is logged), or if igrab() itself returns
+ * NULL.  The resource and lock are held locked during the lookup. */
+struct inode *ll_inode_from_lock(struct ldlm_lock *lock)
+{
+	struct inode *found = NULL;
+
+	/* NOTE: we depend on atomic igrab() -bzzz */
+	lock_res_and_lock(lock);
+	if (lock->l_ast_data) {
+		struct inode *candidate = lock->l_ast_data;
+		struct ll_inode_info *lli = ll_i2info(candidate);
+
+		if (lli->lli_inode_magic != LLI_INODE_MAGIC) {
+			/* an inode that is being freed only rates D_INFO */
+			LDLM_DEBUG_LIMIT(candidate->i_state & I_FREEING ?
+					 D_INFO : D_WARNING, lock,
+					 "l_ast_data %p is bogus: magic %08x",
+					 lock->l_ast_data,
+					 lli->lli_inode_magic);
+		} else {
+			found = igrab(candidate);
+		}
+	}
+	unlock_res_and_lock(lock);
+
+	return found;
+}
+
+/* Drop the Lustre-private state hanging off an inode.
+ *
+ * Asserts that no opens are still counted against the inode, closes any MDS
+ * open handles that are still cached, frees a cached symlink name, releases
+ * the remote permission hash or the POSIX ACL, marks the lli as dead, clears
+ * the capabilities and finally calls cl_inode_fini(). */
+void ll_clear_inode(struct inode *inode)
+{
+	struct ll_inode_info *lli = ll_i2info(inode);
+	struct ll_sb_info *sbi = ll_i2sbi(inode);
+	ENTRY;
+
+	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
+	       inode->i_generation, inode);
+
+	if (S_ISDIR(inode->i_mode)) {
+		/* these should have been cleared in ll_file_release */
+		LASSERT(lli->lli_opendir_key == NULL);
+		LASSERT(lli->lli_sai == NULL);
+		LASSERT(lli->lli_opendir_pid == 0);
+	}
+
+	ll_i2info(inode)->lli_flags &= ~LLIF_MDS_SIZE_LOCK;
+	md_null_inode(sbi->ll_md_exp, ll_inode2fid(inode));
+
+	/* nobody may still have the file open at this point */
+	LASSERT(!lli->lli_open_fd_write_count);
+	LASSERT(!lli->lli_open_fd_read_count);
+	LASSERT(!lli->lli_open_fd_exec_count);
+
+	/* close whatever MDS open handles are still around, one per mode */
+	if (lli->lli_mds_write_och)
+		ll_md_real_close(inode, FMODE_WRITE);
+	if (lli->lli_mds_exec_och)
+		ll_md_real_close(inode, FMODE_EXEC);
+	if (lli->lli_mds_read_och)
+		ll_md_real_close(inode, FMODE_READ);
+
+	/* the free size has to match the strlen() + 1 based allocation */
+	if (S_ISLNK(inode->i_mode) && lli->lli_symlink_name) {
+		OBD_FREE(lli->lli_symlink_name,
+			 strlen(lli->lli_symlink_name) + 1);
+		lli->lli_symlink_name = NULL;
+	}
+
+	/* a remote client is expected to carry remote permissions and no
+	 * POSIX ACL; for any other client it is the other way around */
+	if (sbi->ll_flags & LL_SBI_RMT_CLIENT) {
+		LASSERT(lli->lli_posix_acl == NULL);
+		if (lli->lli_remote_perms) {
+			free_rmtperm_hash(lli->lli_remote_perms);
+			lli->lli_remote_perms = NULL;
+		}
+	}
+#ifdef CONFIG_FS_POSIX_ACL
+	else if (lli->lli_posix_acl) {
+		LASSERT(atomic_read(&lli->lli_posix_acl->a_refcount) == 1);
+		LASSERT(lli->lli_remote_perms == NULL);
+		posix_acl_release(lli->lli_posix_acl);
+		lli->lli_posix_acl = NULL;
+	}
+#endif
+	/* from here on ll_inode_from_lock() and
+	 * ll_inode_from_resource_lock() refuse to igrab() this inode */
+	lli->lli_inode_magic = LLI_INODE_DEAD;
+
+	ll_clear_inode_capas(inode);
+	if (!S_ISDIR(inode->i_mode))
+		LASSERT(list_empty(&lli->lli_agl_list));
+
+	/*
+	 * XXX This has to be done before lsm is freed below, because
+	 * cl_object still uses inode lsm.
+	 */
+	cl_inode_fini(inode);
+	lli->lli_has_smd = false;
+
+	EXIT;
+}
+
+/* Send a setattr for dentry's inode to the MDS and apply the result locally.
+ *
+ * op_data is run through ll_prep_md_op_data() and op_data->op_attr holds the
+ * attributes to set; mod is passed through to md_setattr().
+ *
+ * If the MDS answers -ENOENT the link count of the inode is cleared and, for
+ * anything that is neither a regular file nor a directory, the attributes
+ * are applied locally with the TIMES_SET_FLAGS bits masked out.  On success
+ * the attributes are applied locally without the TIMES_SET_FLAGS and
+ * ATTR_SIZE bits, the handle and ioepoch of the reply are stored in op_data,
+ * and the inode is updated from the reply.
+ *
+ * Returns 0 on success or a negative errno. */
+int ll_md_setattr(struct dentry *dentry, struct md_op_data *op_data,
+		  struct md_open_data **mod)
+{
+	struct inode *inode = dentry->d_inode;
+	struct ll_sb_info *sbi = ll_i2sbi(inode);
+	struct ptlrpc_request *req = NULL;
+	struct lustre_md reply_md;
+	int saved_valid;
+	int rc;
+	ENTRY;
+
+	op_data = ll_prep_md_op_data(op_data, inode, NULL, NULL, 0, 0,
+				     LUSTRE_OPC_ANY, NULL);
+	if (IS_ERR(op_data))
+		RETURN(PTR_ERR(op_data));
+
+	rc = md_setattr(sbi->ll_md_exp, op_data, NULL, 0, NULL, 0,
+			&req, mod);
+	if (rc != 0) {
+		ptlrpc_req_finished(req);
+
+		if (rc != -ENOENT) {
+			/* these three errors are not worth a log message */
+			if (rc != -EPERM && rc != -EACCES && rc != -ETXTBSY)
+				CERROR("md_setattr fails: rc = %d\n", rc);
+			RETURN(rc);
+		}
+
+		clear_nlink(inode);
+		/* Unlinked special device node? Or just a race?
+		 * Pretend we done everything. */
+		if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) {
+			saved_valid = op_data->op_attr.ia_valid;
+			op_data->op_attr.ia_valid &= ~TIMES_SET_FLAGS;
+			rc = simple_setattr(dentry, &op_data->op_attr);
+			op_data->op_attr.ia_valid = saved_valid;
+		}
+		RETURN(rc);
+	}
+
+	rc = md_get_lustre_md(sbi->ll_md_exp, req, sbi->ll_dt_exp,
+			      sbi->ll_md_exp, &reply_md);
+	if (rc != 0) {
+		ptlrpc_req_finished(req);
+		RETURN(rc);
+	}
+
+	/* inode size will be in ll_setattr_ost, can't do it now since dirty
+	 * cache is not cleared yet. */
+	saved_valid = op_data->op_attr.ia_valid;
+	op_data->op_attr.ia_valid &= ~(TIMES_SET_FLAGS | ATTR_SIZE);
+	rc = simple_setattr(dentry, &op_data->op_attr);
+	op_data->op_attr.ia_valid = saved_valid;
+
+	/* Extract epoch data if obtained. */
+	op_data->op_handle = reply_md.body->handle;
+	op_data->op_ioepoch = reply_md.body->ioepoch;
+
+	ll_update_inode(inode, &reply_md);
+	ptlrpc_req_finished(req);
+
+	RETURN(rc);
+}
+
+/* Close IO epoch and send Size-on-MDS attribute update.
+ *
+ * Does nothing and returns 0 for anything but a regular file.  Otherwise
+ * op_data is marked MF_EPOCH_CLOSE, filled in from the inode and sent with
+ * md_done_writing(); if that returns -EAGAIN, ll_som_update() is called and
+ * its result is returned instead.  op_data must not be NULL. */
+static int ll_setattr_done_writing(struct inode *inode,
+				   struct md_op_data *op_data,
+				   struct md_open_data *mod)
+{
+	struct ll_inode_info *lli = ll_i2info(inode);
+	int rc;
+	ENTRY;
+
+	LASSERT(op_data != NULL);
+	if (!S_ISREG(inode->i_mode))
+		RETURN(0);
+
+	CDEBUG(D_INODE, "Epoch "LPU64" closed on "DFID" for truncate\n",
+	       op_data->op_ioepoch, PFID(&lli->lli_fid));
+
+	op_data->op_flags = MF_EPOCH_CLOSE;
+	ll_done_writing_attr(inode, op_data);
+	ll_pack_inode2opdata(inode, op_data, NULL);
+
+	rc = md_done_writing(ll_i2sbi(inode)->ll_md_exp, op_data, mod);
+	switch (rc) {
+	case 0:
+		break;
+	case -EAGAIN:
+		/* MDS has instructed us to obtain Size-on-MDS attribute
+		 * from OSTs and send setattr to back to MDS. */
+		rc = ll_som_update(inode, op_data);
+		break;
+	default:
+		CERROR("inode %lu mdc truncate failed: rc = %d\n",
+		       inode->i_ino, rc);
+		break;
+	}
+	RETURN(rc);
+}
+
+/* Pass a setattr on to cl_setattr_ost() with a suitable capability.
+ *
+ * When the size is being changed (ATTR_SIZE) an OSS truncate capability is
+ * obtained and afterwards released with ll_truncate_free_capa(); otherwise
+ * the MDS capability is obtained and released with capa_put().  Returns the
+ * result of cl_setattr_ost(). */
+static int ll_setattr_ost(struct inode *inode, struct iattr *attr)
+{
+	struct obd_capa *capa;
+	int rc;
+
+	capa = (attr->ia_valid & ATTR_SIZE) ?
+		ll_osscapa_get(inode, CAPA_OPC_OSS_TRUNC) :
+		ll_mdscapa_get(inode);
+
+	rc = cl_setattr_ost(inode, attr, capa);
+
+	/* release the capability the same way it was obtained */
+	if (!(attr->ia_valid & ATTR_SIZE))
+		capa_put(capa);
+	else
+		ll_truncate_free_capa(capa);
+
+	return rc;
+}
+
+
+/* If this inode has objects allocated to it (lsm != NULL), then the OST
+ * object(s) determine the file size and mtime.  Otherwise, the MDS will
+ * keep these values until such a time that objects are allocated for it.
+ * We do the MDS operations first, as it is checking permissions for us.
+ * We don't do the MDS RPC if there is nothing that we want to store there,
+ * otherwise there is no harm in updating mtime/atime on the MDS if we are
+ * going to do an RPC anyways.
+ * NOTE(review): the body below always calls ll_md_setattr(), see the
+ * comment in front of the op_data allocation; the sentence above looks
+ * outdated.
+ *
+ * If we are doing a truncate, we will send the mtime and ctime updates
+ * to the OST with the punch RPC, otherwise we do an explicit setattr RPC.
+ * I don't believe it is possible to get e.g. ATTR_MTIME_SET and ATTR_SIZE
+ * at the same time.
+ *
+ * Locking: for anything but a directory, inode->i_mutex is dropped and
+ * lli_trunc_sem is held for writing while the RPCs are in flight; both are
+ * put back to their previous state before returning.
+ *
+ * Note that attr->ia_valid (and the time fields) of the caller's iattr are
+ * modified.
+ *
+ * Returns 0 on success or a negative errno (-EFBIG, -EPERM, -ENOMEM, or
+ * whatever inode_newsize_ok(), ll_md_setattr(), ll_setattr_ost() or
+ * ll_setattr_done_writing() reported).
+ */
+int ll_setattr_raw(struct dentry *dentry, struct iattr *attr)
+{
+	struct inode *inode = dentry->d_inode;
+	struct ll_inode_info *lli = ll_i2info(inode);
+	struct md_op_data *op_data = NULL;
+	struct md_open_data *mod = NULL;
+	int rc = 0, rc1 = 0;
+	ENTRY;
+
+	CDEBUG(D_VFSTRACE, "%s: setattr inode %p/fid:"DFID" from %llu to %llu, "
+		"valid %x\n", ll_get_fsname(inode->i_sb, NULL, 0), inode,
+		PFID(&lli->lli_fid), i_size_read(inode), attr->ia_size,
+		attr->ia_valid);
+
+	if (attr->ia_valid & ATTR_SIZE) {
+		/* Check new size against VFS/VM file size limit and rlimit */
+		rc = inode_newsize_ok(inode, attr->ia_size);
+		if (rc)
+			RETURN(rc);
+
+		/* The maximum Lustre file size is variable, based on the
+		 * OST maximum object size and number of stripes.  This
+		 * needs another check in addition to the VFS check above. */
+		if (attr->ia_size > ll_file_maxbytes(inode)) {
+			CDEBUG(D_INODE,"file "DFID" too large %llu > "LPU64"\n",
+			       PFID(&lli->lli_fid), attr->ia_size,
+			       ll_file_maxbytes(inode));
+			RETURN(-EFBIG);
+		}
+
+		/* a size change always updates mtime and ctime as well */
+		attr->ia_valid |= ATTR_MTIME | ATTR_CTIME;
+	}
+
+	/* POSIX: check before ATTR_*TIME_SET set (from inode_change_ok) */
+	if (attr->ia_valid & TIMES_SET_FLAGS) {
+		if (current_fsuid() != inode->i_uid &&
+		    !cfs_capable(CFS_CAP_FOWNER))
+			RETURN(-EPERM);
+	}
+
+	/* We mark all of the fields "set" so MDS/OST does not re-set them */
+	if (attr->ia_valid & ATTR_CTIME) {
+		attr->ia_ctime = CFS_CURRENT_TIME;
+		attr->ia_valid |= ATTR_CTIME_SET;
+	}
+	if (!(attr->ia_valid & ATTR_ATIME_SET) &&
+	    (attr->ia_valid & ATTR_ATIME)) {
+		attr->ia_atime = CFS_CURRENT_TIME;
+		attr->ia_valid |= ATTR_ATIME_SET;
+	}
+	if (!(attr->ia_valid & ATTR_MTIME_SET) &&
+	    (attr->ia_valid & ATTR_MTIME)) {
+		attr->ia_mtime = CFS_CURRENT_TIME;
+		attr->ia_valid |= ATTR_MTIME_SET;
+	}
+
+	if (attr->ia_valid & (ATTR_MTIME | ATTR_CTIME))
+		CDEBUG(D_INODE, "setting mtime %lu, ctime %lu, now = %lu\n",
+		       LTIME_S(attr->ia_mtime), LTIME_S(attr->ia_ctime),
+		       cfs_time_current_sec());
+
+	/* If we are changing file size, file content is modified, flag it. */
+	if (attr->ia_valid & ATTR_SIZE) {
+		attr->ia_valid |= MDS_OPEN_OWNEROVERRIDE;
+		spin_lock(&lli->lli_lock);
+		lli->lli_flags |= LLIF_DATA_MODIFIED;
+		spin_unlock(&lli->lli_lock);
+	}
+
+	/* We always do an MDS RPC, even if we're only changing the size;
+	 * only the MDS knows whether truncate() should fail with -ETXTBUSY */
+
+	OBD_ALLOC_PTR(op_data);
+	if (op_data == NULL)
+		RETURN(-ENOMEM);
+
+	/* From here on every exit has to go through "out" so that the lock
+	 * state changed below is restored.
+	 * NOTE(review): inode_dio_write_done() here is paired with
+	 * inode_dio_wait() at "out"; confirm this pairing is intended. */
+	if (!S_ISDIR(inode->i_mode)) {
+		if (attr->ia_valid & ATTR_SIZE)
+			inode_dio_write_done(inode);
+		mutex_unlock(&inode->i_mutex);
+		down_write(&lli->lli_trunc_sem);
+	}
+
+	memcpy(&op_data->op_attr, attr, sizeof(*attr));
+
+	/* Open epoch for truncate. */
+	if (exp_connect_som(ll_i2mdexp(inode)) &&
+	    (attr->ia_valid & (ATTR_SIZE | ATTR_MTIME | ATTR_MTIME_SET)))
+		op_data->op_flags = MF_EPOCH_OPEN;
+
+	rc = ll_md_setattr(dentry, op_data, &mod);
+	if (rc)
+		GOTO(out, rc);
+
+	/* RPC to MDT is sent, cancel data modification flag */
+	if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
+		spin_lock(&lli->lli_lock);
+		lli->lli_flags &= ~LLIF_DATA_MODIFIED;
+		spin_unlock(&lli->lli_lock);
+	}
+
+	ll_ioepoch_open(lli, op_data->op_ioepoch);
+	/* only regular files are passed on to ll_setattr_ost() below */
+	if (!S_ISREG(inode->i_mode))
+		GOTO(out, rc = 0);
+
+	if (attr->ia_valid & (ATTR_SIZE |
+			      ATTR_ATIME | ATTR_ATIME_SET |
+			      ATTR_MTIME | ATTR_MTIME_SET))
+		/* For truncate and utimes sending attributes to OSTs, setting
+		 * mtime/atime to the past will be performed under PW [0:EOF]
+		 * extent lock (new_size:EOF for truncate).  It may seem
+		 * excessive to send mtime/atime updates to OSTs when not
+		 * setting times to past, but it is necessary due to possible
+		 * time de-synchronization between MDT inode and OST objects */
+		rc = ll_setattr_ost(inode, attr);
+	EXIT;
+out:
+	if (op_data) {
+		/* an epoch is recorded in op_data: close it again; the first
+		 * error seen (rc) wins over the result of the close (rc1) */
+		if (op_data->op_ioepoch) {
+			rc1 = ll_setattr_done_writing(inode, op_data, mod);
+			if (!rc)
+				rc = rc1;
+		}
+		ll_finish_md_op_data(op_data);
+	}
+	/* undo the lock changes made before the RPCs, in reverse order */
+	if (!S_ISDIR(inode->i_mode)) {
+		up_write(&lli->lli_trunc_sem);
+		mutex_lock(&inode->i_mutex);
+		if (attr->ia_valid & ATTR_SIZE)
+			inode_dio_wait(inode);
+	}
+
+	/* a size change is accounted as a truncate, all else as a setattr */
+	ll_stats_ops_tally(ll_i2sbi(inode), (attr->ia_valid & ATTR_SIZE) ?
+			LPROC_LL_TRUNC : LPROC_LL_SETATTR, 1);
+
+	return rc;
+}
+
+/**
+ * Fix up the ia_valid flags of \a attr and pass the request on to
+ * ll_setattr_raw().
+ *
+ * \param de	dentry whose inode mode is compared against the request
+ * \param attr	attribute change request; ia_valid may gain
+ *		MDS_OPEN_OWNEROVERRIDE, ATTR_FORCE, ATTR_KILL_SUID and
+ *		ATTR_KILL_SGID
+ *
+ * \retval	whatever ll_setattr_raw() returns
+ */
+int ll_setattr(struct dentry *de, struct iattr *attr)
+{
+	int mode = de->d_inode->i_mode;
+
+	if ((attr->ia_valid & (ATTR_CTIME|ATTR_SIZE|ATTR_MODE)) ==
+			      (ATTR_CTIME|ATTR_SIZE|ATTR_MODE))
+		attr->ia_valid |= MDS_OPEN_OWNEROVERRIDE;
+
+	/* A size+mode change that drops setuid/setgid is forced through. */
+	if (((attr->ia_valid & (ATTR_MODE|ATTR_FORCE|ATTR_SIZE)) ==
+			       (ATTR_SIZE|ATTR_MODE)) &&
+	    (((mode & S_ISUID) && !(attr->ia_mode & S_ISUID)) ||
+	     (((mode & (S_ISGID|S_IXGRP)) == (S_ISGID|S_IXGRP)) &&
+	      !(attr->ia_mode & S_ISGID))))
+		attr->ia_valid |= ATTR_FORCE;
+
+	/* ia_mode carries a meaningful value only when ATTR_MODE is set.
+	 * Without that check a request that does not touch the mode (utimes,
+	 * chown, ...) would be compared against stale data and could ask for
+	 * the setuid/setgid bits to be killed by accident. */
+	if ((attr->ia_valid & ATTR_MODE) &&
+	    (mode & S_ISUID) &&
+	    !(attr->ia_mode & S_ISUID) &&
+	    !(attr->ia_valid & ATTR_KILL_SUID))
+		attr->ia_valid |= ATTR_KILL_SUID;
+
+	if ((attr->ia_valid & ATTR_MODE) &&
+	    ((mode & (S_ISGID|S_IXGRP)) == (S_ISGID|S_IXGRP)) &&
+	    !(attr->ia_mode & S_ISGID) &&
+	    !(attr->ia_valid & ATTR_KILL_SGID))
+		attr->ia_valid |= ATTR_KILL_SGID;
+
+	return ll_setattr_raw(de, attr);
+}
+
+/**
+ * Collect filesystem statistics into \a osfs.
+ *
+ * The metadata export is queried first and fills \a osfs; the data export is
+ * queried second and its block size and block counts replace the ones
+ * obtained from the metadata export.
+ *
+ * \param sb		superblock of the mounted filesystem
+ * \param osfs		result buffer
+ * \param max_age	passed through to both statfs calls
+ * \param flags		statfs flags; OBD_STATFS_NODELAY is added for the data
+ *			export when the mount has LL_SBI_LAZYSTATFS set
+ *
+ * \retval 0		success
+ * \retval other	error returned by the failing statfs call
+ */
+int ll_statfs_internal(struct super_block *sb, struct obd_statfs *osfs,
+		       __u64 max_age, __u32 flags)
+{
+	struct obd_statfs dt_osfs;
+	struct ll_sb_info *sbi = ll_s2sbi(sb);
+	int err;
+	ENTRY;
+
+	err = obd_statfs(NULL, sbi->ll_md_exp, osfs, max_age, flags);
+	if (err != 0) {
+		CERROR("md_statfs fails: rc = %d\n", err);
+		RETURN(err);
+	}
+
+	osfs->os_type = sb->s_magic;
+
+	CDEBUG(D_SUPER, "MDC blocks "LPU64"/"LPU64" objects "LPU64"/"LPU64"\n",
+	       osfs->os_bavail, osfs->os_blocks, osfs->os_ffree,
+	       osfs->os_files);
+
+	if (sbi->ll_flags & LL_SBI_LAZYSTATFS)
+		flags |= OBD_STATFS_NODELAY;
+
+	err = obd_statfs_rqset(sbi->ll_dt_exp, &dt_osfs, max_age, flags);
+	if (err != 0) {
+		CERROR("obd_statfs fails: rc = %d\n", err);
+		RETURN(err);
+	}
+
+	CDEBUG(D_SUPER, "OSC blocks "LPU64"/"LPU64" objects "LPU64"/"LPU64"\n",
+	       dt_osfs.os_bavail, dt_osfs.os_blocks, dt_osfs.os_ffree,
+	       dt_osfs.os_files);
+
+	/* Block accounting always comes from the data export. */
+	osfs->os_bavail = dt_osfs.os_bavail;
+	osfs->os_bfree = dt_osfs.os_bfree;
+	osfs->os_blocks = dt_osfs.os_blocks;
+	osfs->os_bsize = dt_osfs.os_bsize;
+
+	/* With fewer free objects on the data side than free inodes on the
+	 * metadata side, shrink the inode total so that the "inodes in use"
+	 * figure (files - ffree) stays what the metadata side reported. */
+	if (dt_osfs.os_ffree < osfs->os_ffree) {
+		__u64 in_use = osfs->os_files - osfs->os_ffree;
+
+		osfs->os_ffree = dt_osfs.os_ffree;
+		osfs->os_files = in_use + dt_osfs.os_ffree;
+	}
+
+	RETURN(err);
+}
+/**
+ * Fill \a sfs with the statistics of the filesystem \a de belongs to.
+ *
+ * Figures up to OBD_STATFS_CACHE_SECONDS old are accepted.  On kernels where
+ * long is narrower than 64 bits the block counts are scaled down (and the
+ * block size scaled up) until the block total fits in an unsigned long or
+ * the block size reaches 0x40000000.
+ *
+ * \retval 0		success
+ * \retval other	error from ll_statfs_internal()
+ */
+int ll_statfs(struct dentry *de, struct kstatfs *sfs)
+{
+	struct obd_statfs stats;
+	struct super_block *sb = de->d_sb;
+	int err;
+
+	CDEBUG(D_VFSTRACE, "VFS Op: at "LPU64" jiffies\n", get_jiffies_64());
+	ll_stats_ops_tally(ll_s2sbi(sb), LPROC_LL_STAFS, 1);
+
+	/* Some amount of caching on the client is allowed */
+	err = ll_statfs_internal(sb, &stats,
+				 cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+				 0);
+	if (err != 0)
+		return err;
+
+	statfs_unpack(sfs, &stats);
+
+	/* We cannot tell whether the caller came in through sys_statfs64(),
+	 * so every 32-bit kernel downshifts.  Stop before f_bsize would
+	 * overflow; past that point risking EOVERFLOW for users of the old
+	 * sys_statfs() is the lesser evil. */
+	while (sizeof(long) < 8 &&
+	       stats.os_blocks > ~0UL && sfs->f_bsize < 0x40000000) {
+		sfs->f_bsize <<= 1;
+
+		stats.os_blocks >>= 1;
+		stats.os_bfree >>= 1;
+		stats.os_bavail >>= 1;
+	}
+
+	sfs->f_bavail = stats.os_bavail;
+	sfs->f_bfree = stats.os_bfree;
+	sfs->f_blocks = stats.os_blocks;
+
+	return 0;
+}
+
+/**
+ * Take the size semaphore of \a inode and record the current task as its
+ * owner.  Asserts that \a inode is not a directory and that the current task
+ * does not already own the semaphore.
+ */
+void ll_inode_size_lock(struct inode *inode)
+{
+	struct ll_inode_info *lli = ll_i2info(inode);
+
+	LASSERT(!S_ISDIR(inode->i_mode));
+	LASSERT(lli->lli_size_sem_owner != current);
+
+	down(&lli->lli_size_sem);
+
+	/* Nobody may be registered as owner once we hold the semaphore. */
+	LASSERT(lli->lli_size_sem_owner == NULL);
+	lli->lli_size_sem_owner = current;
+}
+
+/**
+ * Release the size semaphore of \a inode.  Asserts that the current task is
+ * the recorded owner and clears the owner before releasing.
+ */
+void ll_inode_size_unlock(struct inode *inode)
+{
+	struct ll_inode_info *lli = ll_i2info(inode);
+
+	LASSERT(lli->lli_size_sem_owner == current);
+
+	lli->lli_size_sem_owner = NULL;
+	up(&lli->lli_size_sem);
+}
+
+/**
+ * Refresh the in-core \a inode from the metadata described by \a md.
+ *
+ * Only the attributes flagged in md->body->valid are applied.  Besides the
+ * VFS inode fields this also updates the Lustre-private inode info (cached
+ * lvb timestamps, maximum file size, POSIX ACL or remote permissions, FID)
+ * and registers any capabilities carried by \a md.
+ *
+ * \param inode	inode to update
+ * \param md	unpacked metadata; md->body must be valid
+ */
+void ll_update_inode(struct inode *inode, struct lustre_md *md)
+{
+	struct ll_inode_info *lli = ll_i2info(inode);
+	struct mdt_body *body = md->body;
+	struct lov_stripe_md *lsm = md->lsm;
+	struct ll_sb_info *sbi = ll_i2sbi(inode);
+
+	/* A stripe descriptor must be present exactly when the body says it
+	 * carries OBD_MD_FLEASIZE. */
+	LASSERT ((lsm != NULL) == ((body->valid & OBD_MD_FLEASIZE) != 0));
+	if (lsm != NULL) {
+		/* Without layout lock support the layout is attached here,
+		 * but only if the inode has none yet. */
+		if (!lli->lli_has_smd &&
+		    !(sbi->ll_flags & LL_SBI_LAYOUT_LOCK))
+			cl_file_inode_init(inode, md);
+
+		/* Clamp the per-file size limit to MAX_LFS_FILESIZE. */
+		lli->lli_maxbytes = lsm->lsm_maxbytes;
+		if (lli->lli_maxbytes > MAX_LFS_FILESIZE)
+			lli->lli_maxbytes = MAX_LFS_FILESIZE;
+	}
+
+	/* Remote-client mounts take remote permissions; otherwise (when ACL
+	 * support is compiled in) a POSIX ACL from the reply replaces the
+	 * cached one. */
+	if (sbi->ll_flags & LL_SBI_RMT_CLIENT) {
+		if (body->valid & OBD_MD_FLRMTPERM)
+			ll_update_remote_perm(inode, md->remote_perm);
+	}
+#ifdef CONFIG_FS_POSIX_ACL
+	else if (body->valid & OBD_MD_FLACL) {
+		spin_lock(&lli->lli_lock);
+		if (lli->lli_posix_acl)
+			posix_acl_release(lli->lli_posix_acl);
+		lli->lli_posix_acl = md->posix_acl;
+		spin_unlock(&lli->lli_lock);
+	}
+#endif
+	/* Inode number and generation are always rebuilt from the FID. */
+	inode->i_ino = cl_fid_build_ino(&body->fid1,
+					sbi->ll_flags & LL_SBI_32BIT_API);
+	inode->i_generation = cl_fid_build_gen(&body->fid1);
+
+	/* Timestamps in the VFS inode only ever move forward; the cached lvb
+	 * copy always takes the value from the reply. */
+	if (body->valid & OBD_MD_FLATIME) {
+		if (body->atime > LTIME_S(inode->i_atime))
+			LTIME_S(inode->i_atime) = body->atime;
+		lli->lli_lvb.lvb_atime = body->atime;
+	}
+	if (body->valid & OBD_MD_FLMTIME) {
+		if (body->mtime > LTIME_S(inode->i_mtime)) {
+			CDEBUG(D_INODE, "setting ino %lu mtime from %lu "
+			       "to "LPU64"\n", inode->i_ino,
+			       LTIME_S(inode->i_mtime), body->mtime);
+			LTIME_S(inode->i_mtime) = body->mtime;
+		}
+		lli->lli_lvb.lvb_mtime = body->mtime;
+	}
+	if (body->valid & OBD_MD_FLCTIME) {
+		if (body->ctime > LTIME_S(inode->i_ctime))
+			LTIME_S(inode->i_ctime) = body->ctime;
+		lli->lli_lvb.lvb_ctime = body->ctime;
+	}
+	/* FLMODE replaces the non-type bits of i_mode, FLTYPE the file-type
+	 * (S_IFMT) bits. */
+	if (body->valid & OBD_MD_FLMODE)
+		inode->i_mode = (inode->i_mode & S_IFMT)|(body->mode & ~S_IFMT);
+	if (body->valid & OBD_MD_FLTYPE)
+		inode->i_mode = (inode->i_mode & ~S_IFMT)|(body->mode & S_IFMT);
+	LASSERT(inode->i_mode != 0);
+	/* Regular files get a Lustre-specific block size, everything else
+	 * inherits the superblock's. */
+	if (S_ISREG(inode->i_mode)) {
+		inode->i_blkbits = min(PTLRPC_MAX_BRW_BITS + 1, LL_MAX_BLKSIZE_BITS);
+	} else {
+		inode->i_blkbits = inode->i_sb->s_blocksize_bits;
+	}
+	if (body->valid & OBD_MD_FLUID)
+		inode->i_uid = body->uid;
+	if (body->valid & OBD_MD_FLGID)
+		inode->i_gid = body->gid;
+	if (body->valid & OBD_MD_FLFLAGS)
+		inode->i_flags = ll_ext_to_inode_flags(body->flags);
+	if (body->valid & OBD_MD_FLNLINK)
+		set_nlink(inode, body->nlink);
+	if (body->valid & OBD_MD_FLRDEV)
+		inode->i_rdev = old_decode_dev(body->rdev);
+
+	if (body->valid & OBD_MD_FLID) {
+		/* FID shouldn't be changed! */
+		if (fid_is_sane(&lli->lli_fid)) {
+			LASSERTF(lu_fid_eq(&lli->lli_fid, &body->fid1),
+				 "Trying to change FID "DFID
+				 " to the "DFID", inode %lu/%u(%p)\n",
+				 PFID(&lli->lli_fid), PFID(&body->fid1),
+				 inode->i_ino, inode->i_generation, inode);
+		} else
+			lli->lli_fid = body->fid1;
+	}
+
+	/* By now the inode must have a FID with a non-zero sequence. */
+	LASSERT(fid_seq(&lli->lli_fid) != 0);
+
+	if (body->valid & OBD_MD_FLSIZE) {
+		if (exp_connect_som(ll_i2mdexp(inode)) &&
+		    S_ISREG(inode->i_mode)) {
+			struct lustre_handle lockh;
+			ldlm_mode_t mode;
+
+			/* As it is possible a blocking ast has been processed
+			 * by this time, we need to check there is an UPDATE
+			 * lock on the client and set LLIF_MDS_SIZE_LOCK holding
+			 * it. */
+			mode = ll_take_md_lock(inode, MDS_INODELOCK_UPDATE,
+					       &lockh, LDLM_FL_CBPENDING);
+			if (mode) {
+				/* The size from the reply is ignored while
+				 * any of these flags is set on the inode. */
+				if (lli->lli_flags & (LLIF_DONE_WRITING |
+						      LLIF_EPOCH_PENDING |
+						      LLIF_SOM_DIRTY)) {
+					CERROR("ino %lu flags %u still has "
+					       "size authority! do not trust "
+					       "the size got from MDS\n",
+					       inode->i_ino, lli->lli_flags);
+				} else {
+					/* Use old size assignment to avoid
+					 * deadlock bz14138 & bz14326 */
+					i_size_write(inode, body->size);
+					lli->lli_flags |= LLIF_MDS_SIZE_LOCK;
+				}
+				/* Drop the reference taken by
+				 * ll_take_md_lock(). */
+				ldlm_lock_decref(&lockh, mode);
+			}
+		} else {
+			/* Use old size assignment to avoid
+			 * deadlock bz14138 & bz14326 */
+			i_size_write(inode, body->size);
+
+			CDEBUG(D_VFSTRACE, "inode=%lu, updating i_size %llu\n",
+			       inode->i_ino, (unsigned long long)body->size);
+		}
+
+		/* The block count is only applied together with a size. */
+		if (body->valid & OBD_MD_FLBLOCKS)
+			inode->i_blocks = body->blocks;
+	}
+
+	/* Capabilities announced in the body must actually be present. */
+	if (body->valid & OBD_MD_FLMDSCAPA) {
+		LASSERT(md->mds_capa);
+		ll_add_capa(inode, md->mds_capa);
+	}
+	if (body->valid & OBD_MD_FLOSSCAPA) {
+		LASSERT(md->oss_capa);
+		ll_add_capa(inode, md->oss_capa);
+	}
+}
+
+/**
+ * Initialise a new \a inode from the metadata passed in \a opaque (a
+ * struct lustre_md): clear the timestamps and device number, apply the
+ * metadata with ll_update_inode(), attach the backing device info and
+ * install the operation tables matching the file type.
+ */
+void ll_read_inode2(struct inode *inode, void *opaque)
+{
+	struct lustre_md *md = opaque;
+	struct ll_inode_info *lli = ll_i2info(inode);
+	ENTRY;
+
+	CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
+	       PFID(&lli->lli_fid), inode);
+
+	LASSERT(!lli->lli_has_smd);
+
+	/* The VFS does not zero the times of a new inode, so it is done here
+	 * before the core attributes from the MDS are applied.  MDS or OST
+	 * attributes overwrite them later; all that matters is that the
+	 * initial values are not newer. */
+	LTIME_S(inode->i_atime) = 0;
+	LTIME_S(inode->i_ctime) = 0;
+	LTIME_S(inode->i_mtime) = 0;
+	inode->i_rdev = 0;
+	ll_update_inode(inode, md);
+
+	/* OIDEBUG(inode); */
+
+	/* initializing backing dev info. */
+	inode->i_mapping->backing_dev_info = &s2lsi(inode->i_sb)->lsi_bdi;
+
+	switch (inode->i_mode & S_IFMT) {
+	case S_IFREG:
+		inode->i_op = &ll_file_inode_operations;
+		inode->i_fop = ll_i2sbi(inode)->ll_fop;
+		inode->i_mapping->a_ops =
+			(struct address_space_operations *)&ll_aops;
+		break;
+	case S_IFDIR:
+		inode->i_op = &ll_dir_inode_operations;
+		inode->i_fop = &ll_dir_operations;
+		break;
+	case S_IFLNK:
+		inode->i_op = &ll_fast_symlink_inode_operations;
+		break;
+	default:
+		inode->i_op = &ll_special_inode_operations;
+		init_special_inode(inode, inode->i_mode, inode->i_rdev);
+		break;
+	}
+
+	EXIT;
+}
+
+/**
+ * Tear down \a inode: discard dirty pages of regular files that have a
+ * cl_object, truncate the page cache, assert that no pages are left, then
+ * run ll_clear_inode() and clear_inode().
+ */
+void ll_delete_inode(struct inode *inode)
+{
+	struct cl_inode_info *lli = cl_i2info(inode);
+	ENTRY;
+
+	/* Dirty pages have to be discarded before they are truncated; the
+	 * osc_extent implementation requires it (LU-1030). */
+	if (S_ISREG(inode->i_mode) && lli->lli_clob != NULL) {
+		cl_sync_file_range(inode, 0, OBD_OBJECT_EOF,
+				   CL_FSYNC_DISCARD, 1);
+	}
+
+	truncate_inode_pages(&inode->i_data, 0);
+
+	/* Workaround for LU-118: if pages still appear to be present, take
+	 * and drop the tree lock once and then insist that none remain. */
+	if (inode->i_data.nrpages != 0) {
+		TREE_READ_LOCK_IRQ(&inode->i_data);
+		TREE_READ_UNLOCK_IRQ(&inode->i_data);
+		LASSERTF(inode->i_data.nrpages == 0,
+			 "inode=%lu/%u(%p) nrpages=%lu, see "
+			 "http://jira.whamcloud.com/browse/LU-118\n",
+			 inode->i_ino, inode->i_generation, inode,
+			 inode->i_data.nrpages);
+	}
+	/* Workaround end */
+
+	ll_clear_inode(inode);
+	clear_inode(inode);
+
+	EXIT;
+}
+
+/**
+ * Handle the inode-flag ioctls for \a inode.
+ *
+ * FSFILT_IOC_GETFLAGS fetches the flags with a getattr request on the
+ * metadata export and copies them to the user buffer at \a arg.
+ * FSFILT_IOC_SETFLAGS reads the flags from \a arg, sends a setattr to the
+ * metadata export, updates inode->i_flags and, when the inode has a stripe
+ * descriptor, also sends the flags to the data export.
+ *
+ * \param inode	inode the ioctl is issued on
+ * \param file	not used by this function
+ * \param cmd	ioctl command
+ * \param arg	user-space address of an int holding the flags
+ *
+ * \retval 0		success
+ * \retval -ENOSYS	\a cmd is not handled here
+ * \retval negative	other failure (allocation, user copy, RPC)
+ */
+int ll_iocontrol(struct inode *inode, struct file *file,
+		 unsigned int cmd, unsigned long arg)
+{
+	struct ll_sb_info *sbi = ll_i2sbi(inode);
+	struct ptlrpc_request *req = NULL;
+	int rc, flags = 0;
+	ENTRY;
+
+	switch(cmd) {
+	case FSFILT_IOC_GETFLAGS: {
+		struct mdt_body *body;
+		struct md_op_data *op_data;
+
+		op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
+					     0, 0, LUSTRE_OPC_ANY,
+					     NULL);
+		if (IS_ERR(op_data))
+			RETURN(PTR_ERR(op_data));
+
+		/* Only the flags are requested from the metadata server. */
+		op_data->op_valid = OBD_MD_FLFLAGS;
+		rc = md_getattr(sbi->ll_md_exp, op_data, &req);
+		ll_finish_md_op_data(op_data);
+		if (rc) {
+			CERROR("failure %d inode %lu\n", rc, inode->i_ino);
+			/* -abs() makes sure a negative value is returned
+			 * whatever the sign of rc is. */
+			RETURN(-abs(rc));
+		}
+
+		/* NOTE(review): body is used without a NULL check; presumably
+		 * a successful md_getattr() guarantees it - confirm. */
+		body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+
+		flags = body->flags;
+
+		ptlrpc_req_finished(req);
+
+		RETURN(put_user(flags, (int *)arg));
+	}
+	case FSFILT_IOC_SETFLAGS: {
+		struct lov_stripe_md *lsm;
+		struct obd_info oinfo = { { { 0 } } };
+		struct md_op_data *op_data;
+
+		if (get_user(flags, (int *)arg))
+			RETURN(-EFAULT);
+
+		op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
+					     LUSTRE_OPC_ANY, NULL);
+		if (IS_ERR(op_data))
+			RETURN(PTR_ERR(op_data));
+
+		/* The flags travel in the ll_iattr view of op_attr. */
+		((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags = flags;
+		op_data->op_attr.ia_valid |= ATTR_ATTR_FLAG;
+		rc = md_setattr(sbi->ll_md_exp, op_data,
+				NULL, 0, NULL, 0, &req, NULL);
+		/* op_data and the request are released before rc is checked,
+		 * so both are freed on the error path as well. */
+		ll_finish_md_op_data(op_data);
+		ptlrpc_req_finished(req);
+		if (rc)
+			RETURN(rc);
+
+		inode->i_flags = ll_ext_to_inode_flags(flags);
+
+		/* No stripe descriptor: nothing to update on the data side. */
+		lsm = ccc_inode_lsm_get(inode);
+		if (lsm == NULL)
+			RETURN(0);
+
+		OBDO_ALLOC(oinfo.oi_oa);
+		if (!oinfo.oi_oa) {
+			ccc_inode_lsm_put(inode, lsm);
+			RETURN(-ENOMEM);
+		}
+		oinfo.oi_md = lsm;
+		oinfo.oi_oa->o_oi = lsm->lsm_oi;
+		oinfo.oi_oa->o_flags = flags;
+		oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS |
+				       OBD_MD_FLGROUP;
+		oinfo.oi_capa = ll_mdscapa_get(inode);
+		obdo_set_parent_fid(oinfo.oi_oa, &ll_i2info(inode)->lli_fid);
+		rc = obd_setattr_rqset(sbi->ll_dt_exp, &oinfo, NULL);
+		/* Release everything acquired for the data-side request. */
+		capa_put(oinfo.oi_capa);
+		OBDO_FREE(oinfo.oi_oa);
+		ccc_inode_lsm_put(inode, lsm);
+
+		/* Permission errors are returned without being logged. */
+		if (rc && rc != -EPERM && rc != -EACCES)
+			CERROR("osc_setattr_async fails: rc = %d\n", rc);
+
+		RETURN(rc);
+	}
+	default:
+		RETURN(-ENOSYS);
+	}
+
+	/* Not reached: every switch branch returns. */
+	RETURN(0);
+}
+
+/**
+ * Send the KEY_FLUSH_CTX set_info request to the metadata export and then to
+ * the data export of the filesystem \a inode lives on.  The results of the
+ * two requests are ignored.
+ *
+ * \retval 0	always
+ */
+int ll_flush_ctx(struct inode *inode)
+{
+	struct ll_sb_info *sbi = ll_i2sbi(inode);
+	struct obd_export *exp[2];
+	int i;
+
+	CDEBUG(D_SEC, "flush context for user %d\n", current_uid());
+
+	exp[0] = sbi->ll_md_exp;
+	exp[1] = sbi->ll_dt_exp;
+	for (i = 0; i < 2; i++)
+		obd_set_info_async(NULL, exp[i],
+				   sizeof(KEY_FLUSH_CTX), KEY_FLUSH_CTX,
+				   0, NULL, NULL);
+
+	return 0;
+}
+
+/*
+ * umount -f client means force down, don't save state
+ *
+ * Marks the obd devices behind the metadata and data exports of \a sb as
+ * forced, issues IOC_OSC_SET_ACTIVE with a freshly allocated obd_ioctl_data
+ * to both exports and finally yields the CPU once.  If either export has no
+ * obd device an error is logged and the function returns early.
+ */
+void ll_umount_begin(struct super_block *sb)
+{
+	struct ll_sb_info *sbi = ll_s2sbi(sb);
+	struct obd_device *obd;
+	struct obd_ioctl_data *ioc_data;
+	ENTRY;
+
+
+	CDEBUG(D_VFSTRACE, "VFS Op: superblock %p count %d active %d\n", sb,
+	       sb->s_count, atomic_read(&sb->s_active));
+
+	obd = class_exp2obd(sbi->ll_md_exp);
+	if (obd == NULL) {
+		CERROR("Invalid MDC connection handle "LPX64"\n",
+		       sbi->ll_md_exp->exp_handle.h_cookie);
+		EXIT;
+		return;
+	}
+	obd->obd_force = 1;
+
+	obd = class_exp2obd(sbi->ll_dt_exp);
+	if (obd == NULL) {
+		CERROR("Invalid LOV connection handle "LPX64"\n",
+		       sbi->ll_dt_exp->exp_handle.h_cookie);
+		EXIT;
+		return;
+	}
+	obd->obd_force = 1;
+
+	/* Allocation failure is tolerated: the ioctls are simply skipped.
+	 * NOTE(review): no field of ioc_data is set here; presumably the
+	 * allocator zeroes it - confirm. */
+	OBD_ALLOC_PTR(ioc_data);
+	if (ioc_data) {
+		obd_iocontrol(IOC_OSC_SET_ACTIVE, sbi->ll_md_exp,
+			      sizeof *ioc_data, ioc_data, NULL);
+
+		obd_iocontrol(IOC_OSC_SET_ACTIVE, sbi->ll_dt_exp,
+			      sizeof *ioc_data, ioc_data, NULL);
+
+		OBD_FREE_PTR(ioc_data);
+	}
+
+	/* Really, we'd like to wait until there are no requests outstanding,
+	 * and then continue.  For now, we just invalidate the requests,
+	 * call schedule() once (there is no further sleep), and hope.
+	 */
+	schedule();
+
+	EXIT;
+}
+
+/**
+ * Remount handler: only a change of the read-only state is acted upon.
+ *
+ * The new state is sent to the metadata export with KEY_READ_ONLY; on
+ * success sb->s_flags is updated to match.
+ *
+ * \param sb	superblock being remounted
+ * \param flags	requested mount flags; only MS_RDONLY is examined
+ * \param data	not used by this function
+ *
+ * \retval 0		nothing to do, or state changed successfully
+ * \retval other	error from obd_set_info_async()
+ */
+int ll_remount_fs(struct super_block *sb, int *flags, char *data)
+{
+	struct ll_sb_info *sbi = ll_s2sbi(sb);
+	char *profilenm = get_profile_name(sb);
+	const char *state;
+	__u32 read_only;
+	int err;
+
+	/* Same read-only state as before: nothing to do. */
+	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
+		return 0;
+
+	read_only = *flags & MS_RDONLY;
+	state = read_only ? "read-only" : "read-write";
+
+	err = obd_set_info_async(NULL, sbi->ll_md_exp,
+				 sizeof(KEY_READ_ONLY), KEY_READ_ONLY,
+				 sizeof(read_only), &read_only, NULL);
+	if (err) {
+		LCONSOLE_WARN("Failed to remount %s %s (%d)\n",
+			      profilenm, state, err);
+		return err;
+	}
+
+	if (read_only)
+		sb->s_flags |= MS_RDONLY;
+	else
+		sb->s_flags &= ~MS_RDONLY;
+
+	if (sbi->ll_flags & LL_SBI_VERBOSE)
+		LCONSOLE_WARN("Remounted %s %s\n", profilenm, state);
+
+	return 0;
+}
+
+/**
+ * Create or refresh an inode from the reply carried by \a req.
+ *
+ * The reply is unpacked into a lustre_md.  If \a *inode is set it is updated
+ * in place; otherwise a new inode is obtained with ll_iget() on \a sb and
+ * stored in \a *inode.  When \a it holds a lock that carries a layout, the
+ * layout from the reply is applied to the inode as well.
+ *
+ * \param inode	in/out: existing inode to update, or NULL to get a new one
+ * \param req	request whose reply holds the metadata
+ * \param sb	superblock; required when \a *inode is NULL
+ * \param it	lookup intent, may be NULL
+ *
+ * \retval 0		success
+ * \retval negative	unpacking failed or no inode could be obtained (in
+ *			the latter case \a *inode is set to NULL)
+ */
+int ll_prep_inode(struct inode **inode, struct ptlrpc_request *req,
+		  struct super_block *sb, struct lookup_intent *it)
+{
+	struct ll_sb_info *sbi = NULL;
+	struct lustre_md md;
+	int rc;
+	ENTRY;
+
+	LASSERT(*inode || sb);
+	sbi = sb ? ll_s2sbi(sb) : ll_i2sbi(*inode);
+	rc = md_get_lustre_md(sbi->ll_md_exp, req, sbi->ll_dt_exp,
+			      sbi->ll_md_exp, &md);
+	/* Nothing was unpacked, so there is nothing to free: return
+	 * directly instead of going through "out". */
+	if (rc)
+		RETURN(rc);
+
+	if (*inode) {
+		ll_update_inode(*inode, &md);
+	} else {
+		LASSERT(sb != NULL);
+
+		/*
+		 * At this point server returns to client's same fid as client
+		 * generated for creating. So using ->fid1 is okay here.
+		 */
+		LASSERT(fid_is_sane(&md.body->fid1));
+
+		*inode = ll_iget(sb, cl_fid_build_ino(&md.body->fid1,
+					     sbi->ll_flags & LL_SBI_32BIT_API),
+				 &md);
+		if (*inode == NULL || IS_ERR(*inode)) {
+#ifdef CONFIG_FS_POSIX_ACL
+			/* No inode took over the ACL reference, so drop it
+			 * here. */
+			if (md.posix_acl) {
+				posix_acl_release(md.posix_acl);
+				md.posix_acl = NULL;
+			}
+#endif
+			/* A NULL result is reported as -ENOMEM. */
+			rc = IS_ERR(*inode) ? PTR_ERR(*inode) : -ENOMEM;
+			*inode = NULL;
+			CERROR("new_inode -fatal: rc %d\n", rc);
+			GOTO(out, rc);
+		}
+	}
+
+	/* Handling piggyback layout lock.
+	 * Layout lock can be piggybacked by getattr and open request.
+	 * The lsm can be applied to inode only if it comes with a layout lock
+	 * otherwise correct layout may be overwritten, for example:
+	 * 1. proc1: mdt returns a lsm but not granting layout
+	 * 2. layout was changed by another client
+	 * 3. proc2: refresh layout and layout lock granted
+	 * 4. proc1: to apply a stale layout */
+	if (it != NULL && it->d.lustre.it_lock_mode != 0) {
+		struct lustre_handle lockh;
+		struct ldlm_lock *lock;
+
+		lockh.cookie = it->d.lustre.it_lock_handle;
+		lock = ldlm_handle2lock(&lockh);
+		LASSERT(lock != NULL);
+		if (ldlm_has_layout(lock)) {
+			struct cl_object_conf conf;
+
+			memset(&conf, 0, sizeof(conf));
+			conf.coc_opc = OBJECT_CONF_SET;
+			conf.coc_inode = *inode;
+			conf.coc_lock = lock;
+			conf.u.coc_md = &md;
+			/* The result is deliberately ignored. */
+			(void)ll_layout_conf(*inode, &conf);
+		}
+		/* Drop the reference obtained from ldlm_handle2lock(). */
+		LDLM_LOCK_PUT(lock);
+	}
+
+out:
+	/* Release the unpacked metadata on success and failure alike. */
+	if (md.lsm != NULL)
+		obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
+	md_free_lustre_md(sbi->ll_md_exp, &md);
+	RETURN(rc);
+}
+
+/**
+ * Statfs ioctl helper: copy in the obd_ioctl_data found at \a arg, validate
+ * its buffers and forward an IOC_OBD_STATFS request to the metadata or data
+ * export, selected by the type word in the first inline buffer.
+ *
+ * \retval 0		success
+ * \retval -EINVAL	missing inode/sbi, missing buffers or wrong lengths
+ * \retval -ENODEV	type selects neither LL_STATFS_LMV nor LL_STATFS_LOV
+ * \retval other	error from obd_ioctl_getdata() or obd_iocontrol()
+ */
+int ll_obd_statfs(struct inode *inode, void *arg)
+{
+	struct obd_ioctl_data *data = NULL;
+	struct ll_sb_info *sbi = NULL;
+	struct obd_export *exp;
+	char *buf = NULL;
+	__u32 flags;
+	__u32 type;
+	int len = 0;
+	int rc;
+
+	if (inode == NULL)
+		GOTO(out_statfs, rc = -EINVAL);
+	sbi = ll_i2sbi(inode);
+	if (sbi == NULL)
+		GOTO(out_statfs, rc = -EINVAL);
+
+	rc = obd_ioctl_getdata(&buf, &len, arg);
+	if (rc != 0)
+		GOTO(out_statfs, rc);
+
+	/* All four buffers are required ... */
+	data = (struct obd_ioctl_data *)buf;
+	if (data->ioc_inlbuf1 == NULL || data->ioc_inlbuf2 == NULL ||
+	    data->ioc_pbuf1 == NULL || data->ioc_pbuf2 == NULL)
+		GOTO(out_statfs, rc = -EINVAL);
+
+	/* ... and each must have exactly the expected size. */
+	if (data->ioc_inllen1 != sizeof(__u32) ||
+	    data->ioc_inllen2 != sizeof(__u32) ||
+	    data->ioc_plen1 != sizeof(struct obd_statfs) ||
+	    data->ioc_plen2 != sizeof(struct obd_uuid))
+		GOTO(out_statfs, rc = -EINVAL);
+
+	memcpy(&type, data->ioc_inlbuf1, sizeof(__u32));
+	if (type & LL_STATFS_LMV)
+		exp = sbi->ll_md_exp;
+	else if (type & LL_STATFS_LOV)
+		exp = sbi->ll_dt_exp;
+	else
+		GOTO(out_statfs, rc = -ENODEV);
+
+	if (type & LL_STATFS_NODELAY)
+		flags = OBD_STATFS_NODELAY;
+	else
+		flags = 0;
+
+	rc = obd_iocontrol(IOC_OBD_STATFS, exp, len, buf, &flags);
+	if (rc != 0)
+		GOTO(out_statfs, rc);
+out_statfs:
+	if (buf != NULL)
+		obd_ioctl_freedata(buf, len);
+	return rc;
+}
+
+/**
+ * Apply a llite proc parameter from the configuration record \a lcfg.
+ *
+ * The superblock address is parsed, as hexadecimal, from the text following
+ * the last '-' of the record's first string; that superblock is then passed
+ * to class_process_proc_param().
+ *
+ * \retval 0		success (positive results are folded to 0)
+ * \retval -EINVAL	the instance name has no parsable address suffix
+ * \retval negative	error from class_process_proc_param()
+ */
+int ll_process_config(struct lustre_cfg *lcfg)
+{
+	struct lprocfs_static_vars lvars;
+	unsigned long addr;
+	char *ptr;
+	void *sb;
+	int rc;
+
+	lprocfs_llite_init_vars(&lvars);
+
+	/* The instance name contains the sb: lustre-client-aacfe000 */
+	ptr = strrchr(lustre_cfg_string(lcfg, 0), '-');
+	if (ptr == NULL)
+		return -EINVAL;
+	ptr++;
+	if (*ptr == '\0')
+		return -EINVAL;
+	if (sscanf(ptr, "%lx", &addr) != 1)
+		return -EINVAL;
+
+	sb = (void *)addr;
+	/* This better be a real Lustre superblock! */
+	LASSERT(s2lsi((struct super_block *)sb)->lsi_lmd->lmd_magic == LMD_MAGIC);
+
+	/* client_common_fill_super has not been called yet at this point,
+	 * so the proc functions must be able to cope with that. */
+	rc = class_process_proc_param(PARAM_LLITE, lvars.obd_vars,
+				      lcfg, sb);
+
+	return rc > 0 ? 0 : rc;
+}
+
+/**
+ * Prepare the md_op_data hint that is passed down to the MD stack.
+ *
+ * \param op_data	structure to fill, or NULL to have one allocated
+ * \param i1		first inode, must not be NULL
+ * \param i2		second inode, may be NULL
+ * \param name		name the operation refers to, may be NULL
+ * \param namelen	length of \a name
+ * \param mode		stored in op_mode
+ * \param opc		operation code, stored in op_opc
+ * \param data		opaque pointer, stored in op_data
+ *
+ * \retval the filled structure on success
+ * \retval ERR_PTR(-ENAMETOOLONG) if \a namelen exceeds the limit of the
+ *	   filesystem \a i1 belongs to
+ * \retval ERR_PTR(-ENOMEM) if no structure could be allocated
+ */
+struct md_op_data * ll_prep_md_op_data(struct md_op_data *op_data,
+				       struct inode *i1, struct inode *i2,
+				       const char *name, int namelen,
+				       int mode, __u32 opc, void *data)
+{
+	LASSERT(i1 != NULL);
+
+	if (namelen > ll_i2sbi(i1)->ll_namelen)
+		return ERR_PTR(-ENAMETOOLONG);
+
+	/* Allocate only when the caller did not supply a structure. */
+	if (op_data == NULL)
+		OBD_ALLOC_PTR(op_data);
+
+	if (op_data == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	ll_i2gids(op_data->op_suppgids, i1, i2);
+	op_data->op_fid1 = *ll_inode2fid(i1);
+	op_data->op_capa1 = ll_mdscapa_get(i1);
+
+	if (i2) {
+		op_data->op_fid2 = *ll_inode2fid(i2);
+		op_data->op_capa2 = ll_mdscapa_get(i2);
+	} else {
+		fid_zero(&op_data->op_fid2);
+		op_data->op_capa2 = NULL;
+	}
+
+	op_data->op_name = name;
+	op_data->op_namelen = namelen;
+	op_data->op_mode = mode;
+	op_data->op_mod_time = cfs_time_current_sec();
+	op_data->op_fsuid = current_fsuid();
+	op_data->op_fsgid = current_fsgid();
+	op_data->op_cap = cfs_curproc_cap_pack();
+	op_data->op_bias = 0;
+	op_data->op_cli_flags = 0;
+	/* Creating a file with a "volatile" name is flagged in the bias. */
+	if ((opc == LUSTRE_OPC_CREATE) && (name != NULL) &&
+	     filename_is_volatile(name, namelen, NULL))
+		op_data->op_bias |= MDS_CREATE_VOLATILE;
+	op_data->op_opc = opc;
+	op_data->op_mds = 0;
+	op_data->op_data = data;
+
+	/* If the file is being opened after mknod() (normally due to NFS)
+	 * try to use the default stripe data from parent directory for
+	 * allocating OST objects.  Try to pass the parent FID to MDS. */
+	if (opc == LUSTRE_OPC_CREATE && i1 == i2 && S_ISREG(i2->i_mode) &&
+	    !ll_i2info(i2)->lli_has_smd) {
+		struct ll_inode_info *lli = ll_i2info(i2);
+
+		/* Re-check under the lock before using the parent FID. */
+		spin_lock(&lli->lli_lock);
+		if (likely(!lli->lli_has_smd && !fid_is_zero(&lli->lli_pfid)))
+			op_data->op_fid1 = lli->lli_pfid;
+		spin_unlock(&lli->lli_lock);
+		/** We ignore parent's capability temporary. */
+	}
+
+	/* When called by ll_setattr_raw, file is i1. */
+	if (LLIF_DATA_MODIFIED & ll_i2info(i1)->lli_flags)
+		op_data->op_bias |= MDS_DATA_MODIFIED;
+
+	return op_data;
+}
+
+/**
+ * Release \a op_data: drop both capability references it holds and free the
+ * structure itself.  \a op_data must not be used afterwards.
+ */
+void ll_finish_md_op_data(struct md_op_data *op_data)
+{
+	capa_put(op_data->op_capa1);
+	capa_put(op_data->op_capa2);
+	OBD_FREE_PTR(op_data);
+}
+
+/**
+ * Append to \a seq the option string of every mount flag that is set on the
+ * filesystem \a dentry belongs to.  Options are emitted in a fixed order,
+ * each one preceded by a comma.
+ *
+ * \retval 0	always
+ */
+int ll_show_options(struct seq_file *seq, struct dentry *dentry)
+{
+	/* Emitted in table order. */
+	static const struct {
+		unsigned int	 flag;
+		const char	*text;
+	} opts[] = {
+		{ LL_SBI_NOLCK,		",nolock" },
+		{ LL_SBI_FLOCK,		",flock" },
+		{ LL_SBI_LOCALFLOCK,	",localflock" },
+		{ LL_SBI_USER_XATTR,	",user_xattr" },
+		{ LL_SBI_LAZYSTATFS,	",lazystatfs" },
+		{ LL_SBI_USER_FID2PATH,	",user_fid2path" },
+	};
+	struct ll_sb_info *sbi;
+	unsigned int i;
+
+	LASSERT((seq != NULL) && (dentry != NULL));
+	sbi = ll_s2sbi(dentry->d_sb);
+
+	for (i = 0; i < sizeof(opts) / sizeof(opts[0]); i++) {
+		if (sbi->ll_flags & opts[i].flag)
+			seq_puts(seq, opts[i].text);
+	}
+
+	RETURN(0);
+}
+
+/**
+ * Copy the name of the obd device selected by \a cmd to user space.
+ *
+ * OBD_IOC_GETDTNAME selects the device behind the data export,
+ * OBD_IOC_GETMDNAME the one behind the metadata export.  The name is copied
+ * to the user address \a arg including its terminating NUL.
+ *
+ * \retval 0		success
+ * \retval -EINVAL	\a cmd is neither of the two commands
+ * \retval -ENOENT	the export has no obd device
+ * \retval -EFAULT	the copy to user space failed
+ */
+int ll_get_obd_name(struct inode *inode, unsigned int cmd, unsigned long arg)
+{
+	struct ll_sb_info *sbi = ll_i2sbi(inode);
+	struct obd_export *exp;
+	struct obd_device *obd;
+	ENTRY;
+
+	switch (cmd) {
+	case OBD_IOC_GETDTNAME:
+		exp = sbi->ll_dt_exp;
+		break;
+	case OBD_IOC_GETMDNAME:
+		exp = sbi->ll_md_exp;
+		break;
+	default:
+		RETURN(-EINVAL);
+	}
+
+	obd = class_exp2obd(exp);
+	if (obd == NULL)
+		RETURN(-ENOENT);
+
+	if (copy_to_user((void *)arg, obd->obd_name,
+			 strlen(obd->obd_name) + 1) != 0)
+		RETURN(-EFAULT);
+
+	RETURN(0);
+}
+
+/**
+ * Get the lustre file system name of \a sb.
+ *
+ * The name is the mount profile with a trailing "-client" removed.  If
+ * \a buf is provided (non-NULL) the fsname is stored there, truncated to fit
+ * \a buflen bytes including the terminating NUL; otherwise a static buffer
+ * shared by all callers is used.
+ *
+ * \param sb		superblock of the mounted filesystem
+ * \param buf		destination buffer, or NULL to use the static buffer
+ * \param buflen	size of \a buf in bytes; ignored when \a buf is NULL
+ *
+ * \retval the buffer holding the name.  When \a buf is non-NULL and
+ *	   \a buflen is not positive, \a buf is returned untouched.
+ */
+char *ll_get_fsname(struct super_block *sb, char *buf, int buflen)
+{
+	static char fsname_static[MTI_NAME_MAXLEN];
+	struct lustre_sb_info *lsi = s2lsi(sb);
+	char *ptr;
+	int len;
+
+	if (buf == NULL) {
+		/* this means the caller wants to use static buffer
+		 * and it doesn't care about race. Usually this is
+		 * in error reporting path */
+		buf = fsname_static;
+		buflen = sizeof(fsname_static);
+	}
+
+	/* Without room for even the terminating NUL nothing can be stored;
+	 * going on would produce a negative length and write outside buf. */
+	if (unlikely(buflen <= 0))
+		return buf;
+
+	len = strlen(lsi->lsi_lmd->lmd_profile);
+	ptr = strrchr(lsi->lsi_lmd->lmd_profile, '-');
+	/* 7 == strlen("-client") */
+	if (ptr && (strcmp(ptr, "-client") == 0))
+		len -= 7;
+
+	if (unlikely(len >= buflen))
+		len = buflen - 1;
+	/* len never exceeds the profile length, so a plain copy suffices. */
+	memcpy(buf, lsi->lsi_lmd->lmd_profile, len);
+	buf[len] = '\0';
+
+	return buf;
+}
+
+/**
+ * Build the path of \a dentry into \a buf, pairing the dentry with the mount
+ * of the current task's root directory.
+ *
+ * \retval whatever d_path() returns (the caller in this file checks the
+ *	   result with IS_ERR())
+ */
+static char* ll_d_path(struct dentry *dentry, char *buf, int bufsize)
+{
+	struct path where = {
+		.mnt	= current->fs->root.mnt,
+		.dentry	= dentry,
+	};
+	char *result;
+
+	/* Hold references on the pair only for the duration of d_path(). */
+	path_get(&where);
+	result = d_path(&where, buf, bufsize);
+	path_put(&where);
+
+	return result;
+}
+
+/**
+ * Warn that the dirty \a page is being discarded and that its file may get
+ * corrupted.  The message names the filesystem, the device, the FID of the
+ * file and, when it can be determined, the path of the file.
+ *
+ * \param page	page being discarded
+ * \param ioret	I/O result code included in the message
+ */
+void ll_dirty_page_discard_warn(struct page *page, int ioret)
+{
+	struct inode *host = page->mapping->host;
+	struct ccc_object *obj = cl_inode2ccc(host);
+	struct dentry *dentry = NULL;
+	const char *shown = "";
+	char *path = NULL;
+	char *buf;
+
+	/* this can be called inside spin lock so use GFP_ATOMIC. */
+	buf = (char *)__get_free_page(GFP_ATOMIC);
+	if (buf != NULL) {
+		dentry = d_find_alias(host);
+		if (dentry != NULL)
+			path = ll_d_path(dentry, buf, PAGE_SIZE);
+	}
+
+	/* No page, no alias or a failed lookup: print an empty path. */
+	if (path && !IS_ERR(path))
+		shown = path;
+
+	CWARN("%s: dirty page discard: %s/fid: "DFID"/%s may get corrupted "
+	      "(rc %d)\n", ll_get_fsname(host->i_sb, NULL, 0),
+	      s2lsi(host->i_sb)->lsi_lmd->lmd_dev,
+	      PFID(&obj->cob_header.coh_lu.loh_fid),
+	      shown, ioret);
+
+	if (dentry != NULL)
+		dput(dentry);
+
+	if (buf != NULL)
+		free_page((unsigned long)buf);
+}

diff --git a/drivers/staging/lustre/lustre/llite/llite_mmap.c b/drivers/staging/lustre/lustre/llite/llite_mmap.c
new file mode 100644
index 0000000..d9590d8
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/llite_mmap.c

@@ -0,0 +1,507 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <linux/version.h>
+#include <asm/uaccess.h>
+
+#include <linux/fs.h>
+#include <linux/stat.h>
+#include <asm/uaccess.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <lustre_lite.h>
+#include "llite_internal.h"
+#include <linux/lustre_compat25.h>
+
+/* NOTE(review): no definition or caller of ll_nopage() is visible in this
+ * file -- confirm whether this prototype is still needed. */
+struct page *ll_nopage(struct vm_area_struct *vma, unsigned long address,
+		       int *type);
+
+/* Forward declaration: our_vma() compares vma->vm_ops against this table,
+ * which is only defined further down in the file. */
+static struct vm_operations_struct ll_file_vm_ops;
+
+/**
+ * Fill the extent of \a policy with the file range backing a user range.
+ *
+ * The start is the file offset of the page containing \a addr (offset of
+ * \a addr inside \a vma rounded down with CFS_PAGE_MASK, plus the file
+ * offset at which \a vma starts); the end is start + count - 1 with all
+ * bits outside CFS_PAGE_MASK set.
+ *
+ * \param policy policy whose l_extent.start/end are written
+ * \param vma    mapping containing \a addr
+ * \param addr   user address inside \a vma
+ * \param count  length of the user range in bytes
+ */
+void policy_from_vma(ldlm_policy_data_t *policy,
+			    struct vm_area_struct *vma, unsigned long addr,
+			    size_t count)
+{
+	/* widen vm_pgoff before shifting, as file_to_user() does, so the
+	 * byte offset does not wrap where unsigned long is 32 bits wide */
+	policy->l_extent.start = ((addr - vma->vm_start) & CFS_PAGE_MASK) +
+				 ((__u64)vma->vm_pgoff << PAGE_CACHE_SHIFT);
+	policy->l_extent.end = (policy->l_extent.start + count - 1) |
+			       ~CFS_PAGE_MASK;
+}
+
+/**
+ * Find the first shared Lustre file mapping overlapping a user range.
+ *
+ * Walks the vmas of \a mm, starting with the one find_vma() returns for
+ * \a addr and stopping at the first vma that starts at or beyond
+ * \a addr + \a count.
+ *
+ * \param mm    address space to search; its mmap_sem must already be held
+ * \param addr  start of the user address range
+ * \param count length of the range in bytes
+ *
+ * \return the first vma in the range that uses ll_file_vm_ops and has
+ *         VM_SHARED set, or NULL when there is none
+ */
+struct vm_area_struct *our_vma(struct mm_struct *mm, unsigned long addr,
+			       size_t count)
+{
+	struct vm_area_struct *cur;
+	struct vm_area_struct *found = NULL;
+	ENTRY;
+
+	/* mmap_sem must have been held by caller. */
+	LASSERT(!down_write_trylock(&mm->mmap_sem));
+
+	cur = find_vma(mm, addr);
+	while (cur != NULL && cur->vm_start < (addr + count)) {
+		/* vm_ops equal to our table is necessarily non-NULL */
+		if (cur->vm_ops == &ll_file_vm_ops &&
+		    (cur->vm_flags & VM_SHARED)) {
+			found = cur;
+			break;
+		}
+		cur = cur->vm_next;
+	}
+	RETURN(found);
+}
+
+/**
+ * API independent part for page fault initialization.
+ *
+ * \param vma      - virtual memory area the page fault is addressed to
+ * \param env_ret  - set to NULL on entry and to the obtained lu_env once
+ *                   cl_env_nested_get() has succeeded
+ * \param nest     - nesting state handed to cl_env_nested_get()
+ * \param index    - page index corresponding to the fault
+ * \param ra_flags - if not NULL, receives the VM_RAND_READ/VM_SEQ_READ bits
+ *                   \a vma had before they are overridden here
+ *
+ * \return the cl_io of the obtained env, prepared for a CIT_FAULT
+ *         operation; the callers in this file check io->ci_result
+ *         before using it
+ * \retval ERR_PTR(-EOPNOTSUPP) if ll_file_nolock() is true for the file
+ * \retval ERR_PTR(-EINVAL) if no env could be obtained
+ *
+ * In both ERR_PTR cases \a *env_ret is left NULL.
+ */
+struct cl_io *ll_fault_io_init(struct vm_area_struct *vma,
+			       struct lu_env **env_ret,
+			       struct cl_env_nest *nest,
+			       pgoff_t index, unsigned long *ra_flags)
+{
+	struct file       *file  = vma->vm_file;
+	struct inode      *inode = file->f_dentry->d_inode;
+	struct cl_io      *io;
+	struct cl_fault_io *fio;
+	struct lu_env     *env;
+	ENTRY;
+
+	*env_ret = NULL;
+	if (ll_file_nolock(file))
+		RETURN(ERR_PTR(-EOPNOTSUPP));
+
+	/*
+	 * page fault can be called when lustre IO is
+	 * already active for the current thread, e.g., when doing read/write
+	 * against user level buffer mapped from Lustre buffer. To avoid
+	 * stomping on existing context, optionally force an allocation of a new
+	 * one.
+	 */
+	env = cl_env_nested_get(nest);
+	if (IS_ERR(env))
+		RETURN(ERR_PTR(-EINVAL));
+
+	*env_ret = env;
+
+	io = ccc_env_thread_io(env);
+	io->ci_obj = ll_i2info(inode)->lli_clob;
+	LASSERT(io->ci_obj != NULL);
+
+	fio = &io->u.ci_fault;
+	fio->ft_index      = index;
+	fio->ft_executable = vma->vm_flags&VM_EXEC;
+
+	/*
+	 * disable VM_SEQ_READ and use VM_RAND_READ to make sure that
+	 * the kernel will not read other pages not covered by ldlm in
+	 * filemap_nopage. we do our readahead in ll_readpage.
+	 */
+	if (ra_flags != NULL)
+		*ra_flags = vma->vm_flags & (VM_RAND_READ|VM_SEQ_READ);
+	vma->vm_flags &= ~VM_SEQ_READ;
+	vma->vm_flags |= VM_RAND_READ;
+
+	CDEBUG(D_MMAP, "vm_flags: %lx (%lu %d)\n", vma->vm_flags,
+	       fio->ft_index, fio->ft_executable);
+
+	/* on failure of cl_io_init() the io is still returned to the caller */
+	if (cl_io_init(env, io, CIT_FAULT, io->ci_obj) == 0) {
+		struct ccc_io *cio = ccc_env_io(env);
+		struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
+
+		LASSERT(cio->cui_cl.cis_io == io);
+
+		/* mmap lock must be MANDATORY
+		 * it has to cache pages. */
+		io->ci_lockreq = CILR_MANDATORY;
+
+		cio->cui_fd  = fd;
+	}
+
+	/* pair the ENTRY above, like every other exit of this function */
+	RETURN(io);
+}
+
+/**
+ * Sharing code of page_mkwrite method for rhel5 and rhel6.
+ *
+ * Runs a CIT_FAULT io with ft_mkwrite/ft_writable set for \a vmpage while
+ * holding lli_trunc_sem for read, then re-checks the page under its lock.
+ *
+ * \param vma    mapping the write fault happened in
+ * \param vmpage page that is about to become writable
+ * \param retry  set to true when the page was found clean after the io and
+ *               the caller should call again; never cleared here
+ *
+ * \retval 0        success, \a vmpage is returned locked
+ * \retval -ENODATA the page lost its mapping (was truncated) meanwhile
+ * \retval -EAGAIN  the page was cleaned meanwhile, \a *retry is set
+ * \retval other    error from ll_fault_io_init(), io->ci_result or
+ *                  cl_io_loop()
+ */
+static int ll_page_mkwrite0(struct vm_area_struct *vma, struct page *vmpage,
+			    bool *retry)
+{
+	struct lu_env	   *env;
+	struct cl_io	    *io;
+	struct vvp_io	   *vio;
+	struct cl_env_nest       nest;
+	int		      result;
+	sigset_t	     set;
+	struct inode	     *inode;
+	struct ll_inode_info     *lli;
+	ENTRY;
+
+	LASSERT(vmpage != NULL);
+
+	io = ll_fault_io_init(vma, &env,  &nest, vmpage->index, NULL);
+	/* no io and no env exist in this case: skip their cleanup */
+	if (IS_ERR(io))
+		GOTO(out, result = PTR_ERR(io));
+
+	result = io->ci_result;
+	if (result < 0)
+		GOTO(out_io, result);
+
+	io->u.ci_fault.ft_mkwrite = 1;
+	io->u.ci_fault.ft_writable = 1;
+
+	vio = vvp_env_io(env);
+	vio->u.fault.ft_vma    = vma;
+	vio->u.fault.ft_vmpage = vmpage;
+
+	set = cfs_block_sigsinv(sigmask(SIGKILL) | sigmask(SIGTERM));
+
+	/* we grab lli_trunc_sem to exclude truncate case.
+	 * Otherwise, we could add dirty pages into osc cache
+	 * while truncate is on-going. */
+	inode = ccc_object_inode(io->ci_obj);
+	lli = ll_i2info(inode);
+	down_read(&lli->lli_trunc_sem);
+
+	result = cl_io_loop(env, io);
+
+	up_read(&lli->lli_trunc_sem);
+
+	cfs_restore_sigs(set);
+
+	if (result == 0) {
+		struct inode *inode = vma->vm_file->f_dentry->d_inode;
+		struct ll_inode_info *lli = ll_i2info(inode);
+
+		lock_page(vmpage);
+		if (vmpage->mapping == NULL) {
+			unlock_page(vmpage);
+
+			/* page was truncated and lock was cancelled, return
+			 * ENODATA so that VM_FAULT_NOPAGE will be returned
+			 * to handle_mm_fault(). */
+			result = -ENODATA;
+		} else if (!PageDirty(vmpage)) {
+			/* race, the page has been cleaned by ptlrpcd after
+			 * it was unlocked, it has to be added into dirty
+			 * cache again otherwise this soon-to-dirty page won't
+			 * consume any grants, even worse if this page is being
+			 * transferred because it will break RPC checksum.
+			 */
+			unlock_page(vmpage);
+
+			CDEBUG(D_MMAP, "Race on page_mkwrite %p/%lu, page has "
+			       "been written out, retry.\n",
+			       vmpage, vmpage->index);
+
+			*retry = true;
+			result = -EAGAIN;
+		}
+
+		/* still 0 only if the page stayed mapped and dirty; it is
+		 * left locked for the caller in that case */
+		if (result == 0) {
+			spin_lock(&lli->lli_lock);
+			lli->lli_flags |= LLIF_DATA_MODIFIED;
+			spin_unlock(&lli->lli_lock);
+		}
+	}
+	EXIT;
+
+out_io:
+	cl_io_fini(env, io);
+	cl_env_nested_put(&nest, env);
+out:
+	CDEBUG(D_MMAP, "%s mkwrite with %d\n", current->comm, result);
+
+	LASSERT(ergo(result == 0, PageLocked(vmpage)));
+	return(result);
+}
+
+
+
+/*
+ * Translate an errno-style \a result into a VM_FAULT_* code:
+ * 0 -> VM_FAULT_LOCKED, -EFAULT -> VM_FAULT_NOPAGE, -ENOMEM -> VM_FAULT_OOM,
+ * anything else -> VM_FAULT_SIGBUS.
+ */
+static inline int to_fault_error(int result)
+{
+	if (result == 0)
+		return VM_FAULT_LOCKED;
+	if (result == -EFAULT)
+		return VM_FAULT_NOPAGE;
+	if (result == -ENOMEM)
+		return VM_FAULT_OOM;
+
+	return VM_FAULT_SIGBUS;
+}
+
+/**
+ * Lustre implementation of a vm_operations_struct::fault() method, called by
+ * VM to serve a page fault (both in kernel and user space).
+ *
+ * This is one attempt; ll_fault() wraps it and retries when the returned
+ * page turns out to have been truncated.
+ *
+ * \param vma - is virtual area struct related to page fault
+ * \param vmf - structure which describes type and address where fault hit
+ *
+ * \return VM_FAULT_* bits: the ft_flags recorded by the io, or-ed with
+ *         to_fault_error() of the io result when that result is non-zero
+ *         and VM_FAULT_RETRY is not set
+ * \retval to_fault_error() of the error when ll_fault_io_init() fails
+ */
+static int ll_fault0(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct lu_env	   *env;
+	struct cl_io	    *io;
+	struct vvp_io	   *vio = NULL;
+	struct page	     *vmpage;
+	unsigned long	    ra_flags;
+	struct cl_env_nest       nest;
+	int		      result;
+	int		      fault_ret = 0;
+	ENTRY;
+
+	io = ll_fault_io_init(vma, &env,  &nest, vmf->pgoff, &ra_flags);
+	if (IS_ERR(io))
+		RETURN(to_fault_error(PTR_ERR(io)));
+
+	result = io->ci_result;
+	if (result == 0) {
+		vio = vvp_env_io(env);
+		vio->u.fault.ft_vma       = vma;
+		vio->u.fault.ft_vmpage    = NULL;
+		vio->u.fault.fault.ft_vmf = vmf;
+
+		result = cl_io_loop(env, io);
+
+		fault_ret = vio->u.fault.fault.ft_flags;
+		vmpage = vio->u.fault.ft_vmpage;
+		/* the io failed but left a page behind: drop it so the
+		 * caller never sees a page together with an error */
+		if (result != 0 && vmpage != NULL) {
+			page_cache_release(vmpage);
+			vmf->page = NULL;
+		}
+	}
+	cl_io_fini(env, io);
+	cl_env_nested_put(&nest, env);
+
+	/* put back the readahead bits saved by ll_fault_io_init() */
+	vma->vm_flags |= ra_flags;
+	if (result != 0 && !(fault_ret & VM_FAULT_RETRY))
+		fault_ret |= to_fault_error(result);
+
+	CDEBUG(D_MMAP, "%s fault %d/%d\n",
+	       current->comm, fault_ret, result);
+	RETURN(fault_ret);
+}
+
+/**
+ * vm_operations_struct::fault() entry point.
+ *
+ * Calls ll_fault0() with all signals except SIGKILL and SIGTERM blocked.
+ * When ll_fault0() returns 0, the page in vmf->page is locked and checked:
+ * if it has lost its mapping (was truncated) it is released and the fault
+ * is restarted, otherwise VM_FAULT_LOCKED is returned with the page locked.
+ * A warning is printed once after more than 16 restarts.
+ *
+ * \return VM_FAULT_LOCKED on success, otherwise the VM_FAULT_* value of
+ *         ll_fault0()
+ */
+static int ll_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	int count = 0;
+	bool printed = false;
+	int result;
+	sigset_t set;
+
+	/* Only SIGKILL and SIGTERM is allowed for fault/nopage/mkwrite
+	 * so that it can be killed by admin but not cause segfault by
+	 * other signals. */
+	set = cfs_block_sigsinv(sigmask(SIGKILL) | sigmask(SIGTERM));
+
+restart:
+	result = ll_fault0(vma, vmf);
+	LASSERT(!(result & VM_FAULT_LOCKED));
+	if (result == 0) {
+		struct page *vmpage = vmf->page;
+
+		/* check if this page has been truncated */
+		lock_page(vmpage);
+		if (unlikely(vmpage->mapping == NULL)) { /* unlucky */
+			unlock_page(vmpage);
+			page_cache_release(vmpage);
+			vmf->page = NULL;
+
+			/* the two literals are concatenated, so the space
+			 * after the comma has to be part of the first one */
+			if (!printed && ++count > 16) {
+				CWARN("the page is under heavy contention, "
+				      "maybe your app(%s) needs revising :-)\n",
+				      current->comm);
+				printed = true;
+			}
+
+			goto restart;
+		}
+
+		result |= VM_FAULT_LOCKED;
+	}
+	cfs_restore_sigs(set);
+	return result;
+}
+
+/**
+ * vm_operations_struct::page_mkwrite() entry point.
+ *
+ * Calls ll_page_mkwrite0() until it no longer asks for a retry, warning
+ * once when more than 16 calls were needed, and converts its result:
+ * 0 -> VM_FAULT_LOCKED (page locked), -ENODATA/-EFAULT -> VM_FAULT_NOPAGE,
+ * -ENOMEM -> VM_FAULT_OOM, -EAGAIN -> VM_FAULT_RETRY, other ->
+ * VM_FAULT_SIGBUS.
+ */
+static int ll_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	int attempts = 0;
+	bool warned = false;
+	bool retry;
+	int rc;
+
+	do {
+		retry = false;
+		rc = ll_page_mkwrite0(vma, vmf->page, &retry);
+
+		if (!warned && ++attempts > 16) {
+			CWARN("app(%s): the page %lu of file %lu is under heavy"
+			      " contention.\n",
+			      current->comm, vmf->pgoff,
+			      vma->vm_file->f_dentry->d_inode->i_ino);
+			warned = true;
+		}
+	} while (retry);
+
+	if (rc == 0) {
+		LASSERT(PageLocked(vmf->page));
+		return VM_FAULT_LOCKED;
+	}
+
+	switch (rc) {
+	case -ENODATA:
+	case -EFAULT:
+		return VM_FAULT_NOPAGE;
+	case -ENOMEM:
+		return VM_FAULT_OOM;
+	case -EAGAIN:
+		return VM_FAULT_RETRY;
+	default:
+		return VM_FAULT_SIGBUS;
+	}
+}
+
+/**
+ *  To avoid cancel the locks covering mmapped region for lock cache pressure,
+ *  we track the mapped vma count in ccc_object::cob_mmap_cnt.
+ *
+ *  vm_operations_struct::open() method: increments that count for the
+ *  object of the file mapped by \a vma.
+ */
+static void ll_vm_open(struct vm_area_struct * vma)
+{
+	struct ccc_object *vob;
+
+	ENTRY;
+	/* check vm_file before it is dereferenced, not after */
+	LASSERT(vma->vm_file);
+	vob = cl_inode2ccc(vma->vm_file->f_dentry->d_inode);
+	LASSERT(atomic_read(&vob->cob_mmap_cnt) >= 0);
+	atomic_inc(&vob->cob_mmap_cnt);
+	EXIT;
+}
+
+/**
+ * Dual to ll_vm_open(): vm_operations_struct::close() method, decrements
+ * ccc_object::cob_mmap_cnt of the object of the file mapped by \a vma and
+ * asserts that the count did not become negative.
+ */
+static void ll_vm_close(struct vm_area_struct *vma)
+{
+	struct ccc_object *vob;
+
+	ENTRY;
+	/* check vm_file before it is dereferenced, not after */
+	LASSERT(vma->vm_file);
+	vob = cl_inode2ccc(vma->vm_file->f_dentry->d_inode);
+	atomic_dec(&vob->cob_mmap_cnt);
+	LASSERT(atomic_read(&vob->cob_mmap_cnt) >= 0);
+	EXIT;
+}
+
+
+/*
+ * Return the user space address inside \a vma that maps file offset
+ * \a byte: the start of the vma plus the distance of \a byte from the file
+ * offset at which the vma begins.
+ */
+static inline unsigned long file_to_user(struct vm_area_struct *vma, __u64 byte)
+{
+	__u64 map_start = (__u64)vma->vm_pgoff << PAGE_CACHE_SHIFT;
+
+	return vma->vm_start + (byte - map_start);
+}
+
+/* XXX put nice comment here.  talk about __free_pte -> dirty pages and
+ * nopage's reference passing to the pte
+ *
+ * Unmap a byte range of \a mapping from user address spaces, if the
+ * mapping is mapped at all.
+ *
+ * \param mapping address space to unmap from
+ * \param first   first byte of the range
+ * \param last    last byte of the range, asserted to be greater than
+ *                \a first
+ *
+ * \retval 0       the mapping was mapped and unmap_mapping_range() was
+ *                 called
+ * \retval -ENOENT the mapping is not mapped, nothing was done
+ */
+int ll_teardown_mmaps(struct address_space *mapping, __u64 first, __u64 last)
+{
+	int rc = -ENOENT;
+	ENTRY;
+
+	LASSERTF(last > first, "last "LPU64" first "LPU64"\n", last, first);
+	if (mapping_mapped(mapping)) {
+		rc = 0;
+		/* NOTE(review): the start offset is first rounded up by
+		 * PAGE_CACHE_SIZE - 1 while the length is not reduced --
+		 * confirm this is the intended range */
+		unmap_mapping_range(mapping, first + PAGE_CACHE_SIZE - 1,
+				    last - first + 1, 0);
+	}
+
+	RETURN(rc);
+}
+
+/* VM operations installed on a vma by ll_file_mmap(); our_vma() recognizes
+ * Lustre mappings by comparing vma->vm_ops against this table. */
+static struct vm_operations_struct ll_file_vm_ops = {
+	.fault			= ll_fault,
+	.page_mkwrite		= ll_page_mkwrite,
+	.open			= ll_vm_open,
+	.close			= ll_vm_close,
+};
+
+/**
+ * mmap method for Lustre files.
+ *
+ * Refuses files for which ll_file_nolock() is true, accounts the call in
+ * the LPROC_LL_MAP statistic and lets generic_file_mmap() set up \a vma.
+ * On success the vma is switched to ll_file_vm_ops, its open() method is
+ * invoked once, and the inode size is refreshed with ll_glimpse_size().
+ *
+ * \retval -EOPNOTSUPP the file is in no-lock mode
+ * \return otherwise the error of generic_file_mmap(), or the result of
+ *         ll_glimpse_size()
+ */
+int ll_file_mmap(struct file *file, struct vm_area_struct * vma)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	int rc;
+	ENTRY;
+
+	if (ll_file_nolock(file))
+		RETURN(-EOPNOTSUPP);
+
+	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_MAP, 1);
+
+	rc = generic_file_mmap(file, vma);
+	if (rc != 0)
+		RETURN(rc);
+
+	vma->vm_ops = &ll_file_vm_ops;
+	vma->vm_ops->open(vma);
+
+	/* update the inode's size and mtime */
+	rc = ll_glimpse_size(inode);
+
+	RETURN(rc);
+}

diff --git a/drivers/staging/lustre/lustre/llite/llite_nfs.c b/drivers/staging/lustre/lustre/llite/llite_nfs.c
new file mode 100644
index 0000000..28cc41e
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/llite_nfs.c

@@ -0,0 +1,319 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lustre/llite/llite_nfs.c
+ *
+ * NFS export of Lustre Light File System
+ *
+ * Author: Yury Umanets <umka@clusterfs.com>
+ * Author: Huang Hua <huanghua@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+#include <lustre_lite.h>
+#include "llite_internal.h"
+#include <linux/exportfs.h>
+
+/**
+ * Hash the first \a len bytes of \a name into a 32-bit value.
+ *
+ * Two running keys are mixed with every byte; the result is the newest key
+ * shifted left by one, so its lowest bit is always zero.
+ *
+ * \param name bytes to hash
+ * \param len  number of bytes to consume
+ */
+__u32 get_uuid2int(const char *name, int len)
+{
+	__u32 prev = 0x37abe8f9;
+	__u32 cur = 0x12a3fe2d;
+
+	for (; len != 0; len--) {
+		__u32 mixed = prev + (cur ^ (*name * 7152373));
+
+		name++;
+		if (mixed & 0x80000000)
+			mixed -= 0x7fffffff;
+		prev = cur;
+		cur = mixed;
+	}
+
+	return cur << 1;
+}
+
+/*
+ * ilookup5() match callback used by search_inode_for_lustre(): non-zero
+ * when the FID of \a inode equals the struct lu_fid passed as \a opaque.
+ */
+static int ll_nfs_test_inode(struct inode *inode, void *opaque)
+{
+	struct lu_fid *wanted = (struct lu_fid *)opaque;
+
+	return lu_fid_eq(&ll_i2info(inode)->lli_fid, wanted);
+}
+
+/**
+ * Find or instantiate the inode for \a fid on \a sb.
+ *
+ * The inode cache is searched first (ilookup5() keyed by the inode number
+ * built from \a fid).  On a miss the attributes are fetched with
+ * md_getattr() and the inode is set up from the reply by ll_prep_inode().
+ *
+ * \return the inode, or an ERR_PTR() carrying the error of
+ *         ll_get_max_mdsize(), md_getattr() or ll_prep_inode(), or -ENOMEM
+ *         when the request descriptor cannot be allocated
+ */
+struct inode *search_inode_for_lustre(struct super_block *sb,
+				      const struct lu_fid *fid)
+{
+	struct ll_sb_info     *sbi = ll_s2sbi(sb);
+	struct ptlrpc_request *req = NULL;
+	struct inode	  *inode = NULL;
+	int		   eadatalen = 0;
+	unsigned long	      hash = cl_fid_build_ino(fid,
+						      ll_need_32bit_api(sbi));
+	struct  md_op_data    *op_data;
+	int		   rc;
+	ENTRY;
+
+	CDEBUG(D_INFO, "searching inode for:(%lu,"DFID")\n", hash, PFID(fid));
+
+	inode = ilookup5(sb, hash, ll_nfs_test_inode, (void *)fid);
+	if (inode)
+		RETURN(inode);
+
+	rc = ll_get_max_mdsize(sbi, &eadatalen);
+	if (rc)
+		RETURN(ERR_PTR(rc));
+
+	/* Because inode is NULL, ll_prep_md_op_data can not
+	 * be used here. So we allocate op_data ourselves */
+	OBD_ALLOC_PTR(op_data);
+	if (op_data == NULL)
+		/* RETURN, not return: pair the ENTRY like all other exits */
+		RETURN(ERR_PTR(-ENOMEM));
+
+	op_data->op_fid1 = *fid;
+	op_data->op_mode = eadatalen;
+	op_data->op_valid = OBD_MD_FLEASIZE;
+
+	/* mds_fid2dentry ignores f_type */
+	rc = md_getattr(sbi->ll_md_exp, op_data, &req);
+	OBD_FREE_PTR(op_data);
+	if (rc) {
+		CERROR("can't get object attrs, fid "DFID", rc %d\n",
+		       PFID(fid), rc);
+		RETURN(ERR_PTR(rc));
+	}
+	rc = ll_prep_inode(&inode, req, sb, NULL);
+	ptlrpc_req_finished(req);
+	if (rc)
+		RETURN(ERR_PTR(rc));
+
+	RETURN(inode);
+}
+
+/* Payload of a Lustre NFS file handle as written by ll_encode_fh() and
+ * read back by ll_fh_to_dentry()/ll_fh_to_parent(). */
+struct lustre_nfs_fid {
+	struct lu_fid   lnf_child;	/* FID of the object itself */
+	struct lu_fid   lnf_parent;	/* FID of its parent */
+};
+
+/**
+ * Get a dentry for the object identified by \a fid.
+ *
+ * \param sb     super block to look the object up on
+ * \param fid    FID of the object
+ * \param parent FID of the parent, or NULL; when given it is recorded in
+ *               lli_pfid of a regular file that has no lli_has_smd yet
+ *
+ * \return the dentry obtained from d_obtain_alias(), initialized with
+ *         ll_dops_init()
+ * \retval ERR_PTR(-ESTALE) \a fid is not sane or the inode is bad
+ * \return other ERR_PTR() values from search_inode_for_lustre() or
+ *         d_obtain_alias()
+ */
+static struct dentry *
+ll_iget_for_nfs(struct super_block *sb, struct lu_fid *fid, struct lu_fid *parent)
+{
+	struct ll_inode_info *lli;
+	struct inode	     *inode;
+	struct dentry	     *de;
+	ENTRY;
+
+	CDEBUG(D_INFO, "Get dentry for fid: "DFID"\n", PFID(fid));
+	if (!fid_is_sane(fid))
+		RETURN(ERR_PTR(-ESTALE));
+
+	inode = search_inode_for_lustre(sb, fid);
+	if (IS_ERR(inode))
+		RETURN(ERR_PTR(PTR_ERR(inode)));
+
+	if (is_bad_inode(inode)) {
+		/* we didn't find the right inode.. */
+		iput(inode);
+		RETURN(ERR_PTR(-ESTALE));
+	}
+
+	/**
+	 * It is an anonymous dentry without OST objects created yet.
+	 * We have to find the parent to tell MDS how to init lov objects.
+	 */
+	lli = ll_i2info(inode);
+	if (parent != NULL && S_ISREG(inode->i_mode) && !lli->lli_has_smd) {
+		spin_lock(&lli->lli_lock);
+		lli->lli_pfid = *parent;
+		spin_unlock(&lli->lli_lock);
+	}
+
+	de = d_obtain_alias(inode);
+	if (!IS_ERR(de))
+		ll_dops_init(de, 1, 0);
+
+	RETURN(de);
+}
+
+/* file handle type reported by ll_encode_fh() for a struct lustre_nfs_fid */
+#define LUSTRE_NFS_FID	  0x97
+
+/**
+ * Encode \a inode and, if known, its \a parent into an NFS file handle.
+ *
+ * \param inode  object to encode
+ * \param fh     handle buffer, filled with a struct lustre_nfs_fid
+ * \param plen   in: room in \a fh, out: length of the handle (on success)
+ *               or the length that would be needed (buffer too small);
+ *               both counted in 4-byte words
+ * \param parent parent of \a inode, or NULL when the caller has none; the
+ *               parent FID in the handle is zeroed in that case
+ *
+ * \retval LUSTRE_NFS_FID the handle was written
+ * \retval 255            \a fh is too small
+ */
+static int ll_encode_fh(struct inode *inode, __u32 *fh, int *plen,
+			struct inode *parent)
+{
+	int fileid_len = sizeof(struct lustre_nfs_fid) / 4;
+	struct lustre_nfs_fid *nfs_fid = (void *)fh;
+	ENTRY;
+
+	CDEBUG(D_INFO, "encoding for (%lu,"DFID") maxlen=%d minlen=%d\n",
+	      inode->i_ino, PFID(ll_inode2fid(inode)), *plen,
+	      (int)sizeof(struct lustre_nfs_fid));
+
+	if (*plen < fileid_len) {
+		/* tell the caller how much room a handle needs */
+		*plen = fileid_len;
+		RETURN(255);
+	}
+
+	nfs_fid->lnf_child = *ll_inode2fid(inode);
+	/* the parent is optional: never dereference a NULL inode */
+	if (parent != NULL)
+		nfs_fid->lnf_parent = *ll_inode2fid(parent);
+	else
+		memset(&nfs_fid->lnf_parent, 0, sizeof(nfs_fid->lnf_parent));
+	*plen = fileid_len;
+
+	RETURN(LUSTRE_NFS_FID);
+}
+
+/**
+ * Directory entry callback of ll_get_name().
+ *
+ * Compares the FID stored in the lu_dirent that contains \a name with the
+ * FID searched for (lgd_fid of the struct ll_getname_data passed as
+ * \a cookie).  On a match the name is copied, NUL-terminated, into
+ * lgd_name and lgd_found is set.
+ *
+ * \return lgd_found, i.e. non-zero once the entry has been found
+ */
+static int ll_nfs_get_name_filldir(void *cookie, const char *name, int namelen,
+				   loff_t hash, u64 ino, unsigned type)
+{
+	struct ll_getname_data *lgd = cookie;
+	/* It is hack to access lde_fid for comparison with lgd_fid.
+	 * So the input 'name' must be part of the 'lu_dirent'. */
+	struct lu_dirent *lde = container_of0(name, struct lu_dirent, lde_name);
+	struct lu_fid fid;
+
+	fid_le_to_cpu(&fid, &lde->lde_fid);
+	if (!lu_fid_eq(&fid, &lgd->lgd_fid))
+		return lgd->lgd_found;
+
+	memcpy(lgd->lgd_name, name, namelen);
+	lgd->lgd_name[namelen] = 0;
+	lgd->lgd_found = 1;
+
+	return lgd->lgd_found;
+}
+
+/**
+ * export_operations::get_name() method: find the name under which
+ * \a child is linked in the directory \a dentry.
+ *
+ * The directory is read with ll_dir_read() under its i_mutex;
+ * ll_nfs_get_name_filldir() copies the matching name into \a name.
+ *
+ * \param dentry directory to search
+ * \param name   buffer receiving the NUL-terminated name
+ *               (NOTE(review): no length is passed, so the buffer must be
+ *               large enough for any entry name -- confirm with callers)
+ * \param child  dentry whose inode FID is searched for
+ *
+ * \retval 0        the name was found
+ * \retval -ENOTDIR \a dentry has no inode or is not a directory
+ * \retval -EINVAL  the directory inode has no file operations
+ * \retval -ENOENT  the directory was read without finding the FID
+ * \return otherwise the value returned by ll_dir_read()
+ */
+static int ll_get_name(struct dentry *dentry, char *name,
+		       struct dentry *child)
+{
+	struct inode *dir = dentry->d_inode;
+	struct ll_getname_data lgd;
+	__u64 offset = 0;
+	int rc;
+	ENTRY;
+
+	if (!dir || !S_ISDIR(dir->i_mode))
+		GOTO(out, rc = -ENOTDIR);
+
+	if (!dir->i_fop)
+		GOTO(out, rc = -EINVAL);
+
+	lgd.lgd_name = name;
+	lgd.lgd_fid = ll_i2info(child->d_inode)->lli_fid;
+	lgd.lgd_found = 0;
+
+	mutex_lock(&dir->i_mutex);
+	rc = ll_dir_read(dir, &offset, &lgd, ll_nfs_get_name_filldir);
+	mutex_unlock(&dir->i_mutex);
+	/* reading succeeded but no entry carried the FID */
+	if (!rc && !lgd.lgd_found)
+		rc = -ENOENT;
+	EXIT;
+
+out:
+	return rc;
+}
+
+/**
+ * export_operations::fh_to_dentry() method: decode the child FID (and the
+ * parent FID as a hint) of a handle written by ll_encode_fh().
+ *
+ * \param fh_len  length of the handle in 4-byte words
+ * \param fh_type handle type, must be LUSTRE_NFS_FID
+ *
+ * \retval ERR_PTR(-EPROTO) the handle has another type or is too short to
+ *                          contain a struct lustre_nfs_fid
+ * \return otherwise the result of ll_iget_for_nfs()
+ */
+static struct dentry *ll_fh_to_dentry(struct super_block *sb, struct fid *fid,
+				      int fh_len, int fh_type)
+{
+	struct lustre_nfs_fid *nfs_fid = (struct lustre_nfs_fid *)fid;
+
+	/* do not read FIDs beyond the end of a truncated handle */
+	if (fh_type != LUSTRE_NFS_FID || fh_len < (int)(sizeof(*nfs_fid) / 4))
+		RETURN(ERR_PTR(-EPROTO));
+
+	RETURN(ll_iget_for_nfs(sb, &nfs_fid->lnf_child, &nfs_fid->lnf_parent));
+}
+
+/**
+ * export_operations::fh_to_parent() method: decode the parent FID of a
+ * handle written by ll_encode_fh().
+ *
+ * \param fh_len  length of the handle in 4-byte words
+ * \param fh_type handle type, must be LUSTRE_NFS_FID
+ *
+ * \retval ERR_PTR(-EPROTO) the handle has another type or is too short to
+ *                          contain a struct lustre_nfs_fid
+ * \return otherwise the result of ll_iget_for_nfs()
+ */
+static struct dentry *ll_fh_to_parent(struct super_block *sb, struct fid *fid,
+				      int fh_len, int fh_type)
+{
+	struct lustre_nfs_fid *nfs_fid = (struct lustre_nfs_fid *)fid;
+
+	/* do not read FIDs beyond the end of a truncated handle */
+	if (fh_type != LUSTRE_NFS_FID || fh_len < (int)(sizeof(*nfs_fid) / 4))
+		RETURN(ERR_PTR(-EPROTO));
+
+	RETURN(ll_iget_for_nfs(sb, &nfs_fid->lnf_parent, NULL));
+}
+
+/**
+ * export_operations::get_parent() method: look up ".." of the directory
+ * \a dchild with md_getattr_name() and return a dentry for the FID found
+ * in the reply.
+ *
+ * \retval ERR_PTR(-EPROTO) the reply carries no MDT body
+ * \retval ERR_PTR(-ESTALE) the reply carries no FID (OBD_MD_FLID unset)
+ * \return otherwise the result of ll_iget_for_nfs(), or an ERR_PTR() with
+ *         the error of ll_get_max_mdsize(), ll_prep_md_op_data() or
+ *         md_getattr_name()
+ */
+static struct dentry *ll_get_parent(struct dentry *dchild)
+{
+	struct ptlrpc_request *req = NULL;
+	struct inode	  *dir = dchild->d_inode;
+	struct ll_sb_info     *sbi;
+	struct dentry	 *result = NULL;
+	struct mdt_body       *body;
+	static char	   dotdot[] = "..";
+	struct md_op_data     *op_data;
+	int		   rc;
+	int		      lmmsize;
+	ENTRY;
+
+	LASSERT(dir && S_ISDIR(dir->i_mode));
+
+	sbi = ll_s2sbi(dir->i_sb);
+
+	CDEBUG(D_INFO, "getting parent for (%lu,"DFID")\n",
+			dir->i_ino, PFID(ll_inode2fid(dir)));
+
+	rc = ll_get_max_mdsize(sbi, &lmmsize);
+	if (rc != 0)
+		RETURN(ERR_PTR(rc));
+
+	op_data = ll_prep_md_op_data(NULL, dir, NULL, dotdot,
+				     strlen(dotdot), lmmsize,
+				     LUSTRE_OPC_ANY, NULL);
+	if (IS_ERR(op_data))
+		RETURN((void *)op_data);
+
+	rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
+	ll_finish_md_op_data(op_data);
+	if (rc) {
+		CERROR("failure %d inode %lu get parent\n", rc, dir->i_ino);
+		RETURN(ERR_PTR(rc));
+	}
+
+	/* The reply content comes from the server: report a malformed or
+	 * incomplete reply as an error instead of asserting on it. */
+	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+	if (body == NULL) {
+		result = ERR_PTR(-EPROTO);
+	} else if (!(body->valid & OBD_MD_FLID)) {
+		CERROR("inode %lu: no parent FID in reply\n", dir->i_ino);
+		result = ERR_PTR(-ESTALE);
+	} else {
+		CDEBUG(D_INFO, "parent for "DFID" is "DFID"\n",
+			PFID(ll_inode2fid(dir)), PFID(&body->fid1));
+
+		result = ll_iget_for_nfs(dir->i_sb, &body->fid1, NULL);
+	}
+
+	ptlrpc_req_finished(req);
+	RETURN(result);
+}
+
+/* NFS export methods of Lustre; handles are of type LUSTRE_NFS_FID and
+ * carry a struct lustre_nfs_fid. */
+struct export_operations lustre_export_operations = {
+       .get_parent = ll_get_parent,
+       .encode_fh  = ll_encode_fh,
+       .get_name   = ll_get_name,
+	.fh_to_dentry = ll_fh_to_dentry,
+	.fh_to_parent = ll_fh_to_parent,
+};

diff --git a/drivers/staging/lustre/lustre/llite/llite_rmtacl.c b/drivers/staging/lustre/lustre/llite/llite_rmtacl.c
new file mode 100644
index 0000000..4c61036
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/llite_rmtacl.c

@@ -0,0 +1,301 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/llite/llite_rmtacl.c
+ *
+ * Lustre Remote User Access Control List.
+ *
+ * Author: Fan Yong <fanyong@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#ifdef CONFIG_FS_POSIX_ACL
+
+#include <lustre_lite.h>
+#include <lustre_eacl.h>
+#include "llite_internal.h"
+
+/* Bucket index of \a id in rmtacl_ctl_table::rct_entries: \a id masked with
+ * RCE_HASHES - 1 (presumably RCE_HASHES is a power of two -- confirm). */
+static inline __u32 rce_hashfunc(uid_t id)
+{
+	const __u32 mask = RCE_HASHES - 1;
+
+	return id & mask;
+}
+
+/* Bucket index of \a id in eacl_table::et_entries: \a id masked with
+ * EE_HASHES - 1 (presumably EE_HASHES is a power of two -- confirm). */
+static inline __u32 ee_hashfunc(uid_t id)
+{
+	const __u32 mask = EE_HASHES - 1;
+
+	return id & mask;
+}
+
+/**
+ * Translate a remote ACL operation code (RMT_*) into the matching
+ * OBD_MD_FLRMT* flag.
+ *
+ * \return the flag, or 0 for an operation code that is not one of
+ *         RMT_LSETFACL, RMT_LGETFACL, RMT_RSETFACL, RMT_RGETFACL
+ */
+obd_valid rce_ops2valid(int ops)
+{
+	if (ops == RMT_LSETFACL)
+		return OBD_MD_FLRMTLSETFACL;
+	if (ops == RMT_LGETFACL)
+		return OBD_MD_FLRMTLGETFACL;
+	if (ops == RMT_RSETFACL)
+		return OBD_MD_FLRMTRSETFACL;
+	if (ops == RMT_RGETFACL)
+		return OBD_MD_FLRMTRGETFACL;
+
+	return 0;
+}
+
+/*
+ * Allocate an rmtacl_ctl_entry for \a key / \a ops that is not linked into
+ * any list yet.  Returns NULL when the allocation fails.
+ */
+static struct rmtacl_ctl_entry *rce_alloc(pid_t key, int ops)
+{
+	struct rmtacl_ctl_entry *entry;
+
+	OBD_ALLOC_PTR(entry);
+	if (entry != NULL) {
+		INIT_LIST_HEAD(&entry->rce_list);
+		entry->rce_key = key;
+		entry->rce_ops = ops;
+	}
+
+	return entry;
+}
+
+/* Unlink \a rce from the list it is on, if any, and free it. */
+static void rce_free(struct rmtacl_ctl_entry *rce)
+{
+	struct list_head *link = &rce->rce_list;
+
+	if (!list_empty(link))
+		list_del(link);
+
+	OBD_FREE_PTR(rce);
+}
+
+/*
+ * Return the entry of \a rct whose rce_key equals \a key, or NULL.  Takes
+ * no lock itself; the callers in this file hold rct_lock around it.
+ */
+static struct rmtacl_ctl_entry *__rct_search(struct rmtacl_ctl_table *rct,
+					   pid_t key)
+{
+	struct list_head *bucket = &rct->rct_entries[rce_hashfunc(key)];
+	struct rmtacl_ctl_entry *cur;
+
+	list_for_each_entry(cur, bucket, rce_list) {
+		if (cur->rce_key == key)
+			return cur;
+	}
+
+	return NULL;
+}
+
+/**
+ * Look up the entry for \a key in \a rct under rct_lock.
+ *
+ * \return the entry or NULL.  The lock is dropped before returning, so
+ *         nothing here keeps the entry from being freed by a later
+ *         rct_del()/rct_add()/rct_fini() -- NOTE(review): confirm how
+ *         callers guarantee its lifetime.
+ */
+struct rmtacl_ctl_entry *rct_search(struct rmtacl_ctl_table *rct, pid_t key)
+{
+	struct rmtacl_ctl_entry *found;
+
+	spin_lock(&rct->rct_lock);
+	found = __rct_search(rct, key);
+	spin_unlock(&rct->rct_lock);
+
+	return found;
+}
+
+/**
+ * Add an entry for \a key / \a ops to \a rct.
+ *
+ * An entry already present for \a key is treated as stale: a warning is
+ * printed and it is freed before the new entry is appended to its bucket.
+ *
+ * \retval 0       the entry was added
+ * \retval -ENOMEM the entry could not be allocated
+ */
+int rct_add(struct rmtacl_ctl_table *rct, pid_t key, int ops)
+{
+	struct rmtacl_ctl_entry *fresh;
+	struct rmtacl_ctl_entry *stale;
+
+	fresh = rce_alloc(key, ops);
+	if (fresh == NULL)
+		return -ENOMEM;
+
+	spin_lock(&rct->rct_lock);
+
+	stale = __rct_search(rct, key);
+	if (unlikely(stale != NULL)) {
+		CWARN("Unexpected stale rmtacl_entry found: "
+		      "[key: %d] [ops: %d]\n", (int)key, ops);
+		rce_free(stale);
+	}
+	list_add_tail(&fresh->rce_list, &rct->rct_entries[rce_hashfunc(key)]);
+
+	spin_unlock(&rct->rct_lock);
+
+	return 0;
+}
+
+/**
+ * Remove and free the entry for \a key from \a rct.
+ *
+ * \retval 0       an entry was found and freed
+ * \retval -ENOENT no entry exists for \a key
+ */
+int rct_del(struct rmtacl_ctl_table *rct, pid_t key)
+{
+	struct rmtacl_ctl_entry *victim;
+	int rc = -ENOENT;
+
+	spin_lock(&rct->rct_lock);
+	victim = __rct_search(rct, key);
+	if (victim != NULL) {
+		rce_free(victim);
+		rc = 0;
+	}
+	spin_unlock(&rct->rct_lock);
+
+	return rc;
+}
+
+/* Initialize the lock and all RCE_HASHES (empty) buckets of \a rct. */
+void rct_init(struct rmtacl_ctl_table *rct)
+{
+	int bucket = 0;
+
+	spin_lock_init(&rct->rct_lock);
+	while (bucket < RCE_HASHES) {
+		INIT_LIST_HEAD(&rct->rct_entries[bucket]);
+		bucket++;
+	}
+}
+
+/* Free every entry of every bucket of \a rct, under rct_lock. */
+void rct_fini(struct rmtacl_ctl_table *rct)
+{
+	int i;
+
+	spin_lock(&rct->rct_lock);
+	for (i = 0; i < RCE_HASHES; i++) {
+		struct list_head *bucket = &rct->rct_entries[i];
+
+		/* rce_free() unlinks the entry, so the bucket drains */
+		while (!list_empty(bucket)) {
+			struct rmtacl_ctl_entry *first;
+
+			first = list_entry(bucket->next,
+					   struct rmtacl_ctl_entry, rce_list);
+			rce_free(first);
+		}
+	}
+	spin_unlock(&rct->rct_lock);
+}
+
+
+/*
+ * Allocate an unlinked eacl_entry for (\a key, \a fid, \a type) that refers
+ * to \a header as its ACL; \a header is stored by pointer, not copied.
+ * Returns NULL when the allocation fails.
+ */
+static struct eacl_entry *ee_alloc(pid_t key, struct lu_fid *fid, int type,
+				   ext_acl_xattr_header *header)
+{
+	struct eacl_entry *entry;
+
+	OBD_ALLOC_PTR(entry);
+	if (entry != NULL) {
+		INIT_LIST_HEAD(&entry->ee_list);
+		entry->ee_key = key;
+		entry->ee_fid = *fid;
+		entry->ee_type = type;
+		entry->ee_acl = header;
+	}
+
+	return entry;
+}
+
+/* Unlink \a ee from the list it is on, if any, release its ACL with
+ * lustre_ext_acl_xattr_free() when it has one, and free the entry. */
+void ee_free(struct eacl_entry *ee)
+{
+	struct list_head *link = &ee->ee_list;
+
+	if (!list_empty(link))
+		list_del(link);
+
+	if (ee->ee_acl != NULL)
+		lustre_ext_acl_xattr_free(ee->ee_acl);
+
+	OBD_FREE_PTR(ee);
+}
+
+/*
+ * Find the entry of \a et matching \a key, \a fid and \a type, unlink it
+ * from its bucket and return it; NULL when there is no such entry.  The
+ * entry is not freed.  Takes no lock itself; the callers in this file hold
+ * et_lock around it.
+ */
+static struct eacl_entry *__et_search_del(struct eacl_table *et, pid_t key,
+					struct lu_fid *fid, int type)
+{
+	struct list_head *bucket = &et->et_entries[ee_hashfunc(key)];
+	struct eacl_entry *cur;
+
+	LASSERT(fid != NULL);
+	list_for_each_entry(cur, bucket, ee_list) {
+		if (cur->ee_key == key && lu_fid_eq(&cur->ee_fid, fid) &&
+		    cur->ee_type == type) {
+			list_del_init(&cur->ee_list);
+			return cur;
+		}
+	}
+
+	return NULL;
+}
+
+/**
+ * Unlink and return the entry of \a et matching \a key, \a fid and
+ * \a type, under et_lock.
+ *
+ * \return the unlinked entry, which is not freed here, or NULL
+ */
+struct eacl_entry *et_search_del(struct eacl_table *et, pid_t key,
+				 struct lu_fid *fid, int type)
+{
+	struct eacl_entry *found;
+
+	spin_lock(&et->et_lock);
+	found = __et_search_del(et, key, fid, type);
+	spin_unlock(&et->et_lock);
+
+	return found;
+}
+
+/* Free all entries of \a et whose ee_key equals \a key, under et_lock. */
+void et_search_free(struct eacl_table *et, pid_t key)
+{
+	struct list_head *bucket = &et->et_entries[ee_hashfunc(key)];
+	struct eacl_entry *cur;
+	struct eacl_entry *tmp;
+
+	spin_lock(&et->et_lock);
+	/* _safe iteration: ee_free() unlinks and frees the current entry */
+	list_for_each_entry_safe(cur, tmp, bucket, ee_list) {
+		if (cur->ee_key == key)
+			ee_free(cur);
+	}
+	spin_unlock(&et->et_lock);
+}
+
+/**
+ * Add an entry for (\a key, \a fid, \a type) referring to \a header to
+ * \a et.
+ *
+ * An entry already present for the same triple is treated as stale: it is
+ * unlinked, a warning is printed and it is freed before the new entry is
+ * appended to its bucket.
+ *
+ * \retval 0       the entry was added
+ * \retval -ENOMEM the entry could not be allocated; \a header is not
+ *                 touched in that case
+ */
+int ee_add(struct eacl_table *et, pid_t key, struct lu_fid *fid, int type,
+	   ext_acl_xattr_header *header)
+{
+	struct eacl_entry *fresh;
+	struct eacl_entry *stale;
+
+	fresh = ee_alloc(key, fid, type, header);
+	if (fresh == NULL)
+		return -ENOMEM;
+
+	spin_lock(&et->et_lock);
+
+	stale = __et_search_del(et, key, fid, type);
+	if (unlikely(stale != NULL)) {
+		CWARN("Unexpected stale eacl_entry found: "
+		      "[key: %d] [fid: "DFID"] [type: %d]\n",
+		      (int)key, PFID(fid), type);
+		ee_free(stale);
+	}
+	list_add_tail(&fresh->ee_list, &et->et_entries[ee_hashfunc(key)]);
+
+	spin_unlock(&et->et_lock);
+
+	return 0;
+}
+
+/* Initialize the lock and all EE_HASHES (empty) buckets of \a et. */
+void et_init(struct eacl_table *et)
+{
+	int bucket = 0;
+
+	spin_lock_init(&et->et_lock);
+	while (bucket < EE_HASHES) {
+		INIT_LIST_HEAD(&et->et_entries[bucket]);
+		bucket++;
+	}
+}
+
+/* Free every entry of every bucket of \a et, under et_lock. */
+void et_fini(struct eacl_table *et)
+{
+	int i;
+
+	spin_lock(&et->et_lock);
+	for (i = 0; i < EE_HASHES; i++) {
+		struct list_head *bucket = &et->et_entries[i];
+
+		/* ee_free() unlinks the entry, so the bucket drains */
+		while (!list_empty(bucket)) {
+			struct eacl_entry *first;
+
+			first = list_entry(bucket->next,
+					   struct eacl_entry, ee_list);
+			ee_free(first);
+		}
+	}
+	spin_unlock(&et->et_lock);
+}
+
+#endif

diff --git a/drivers/staging/lustre/lustre/llite/lloop.c b/drivers/staging/lustre/lustre/llite/lloop.c
new file mode 100644
index 0000000..9d4c17e
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/lloop.c

@@ -0,0 +1,867 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+/*
+ *  linux/drivers/block/loop.c
+ *
+ *  Written by Theodore Ts'o, 3/29/93
+ *
+ * Copyright 1993 by Theodore Ts'o.  Redistribution of this file is
+ * permitted under the GNU General Public License.
+ *
+ * Modularized and updated for 1.1.16 kernel - Mitch Dsouza 28th May 1994
+ * Adapted for 1.3.59 kernel - Andries Brouwer, 1 Feb 1996
+ *
+ * Fixed do_loop_request() re-entrancy - Vincent.Renardias@waw.com Mar 20, 1997
+ *
+ * Added devfs support - Richard Gooch <rgooch@atnf.csiro.au> 16-Jan-1998
+ *
+ * Handle sparse backing files correctly - Kenn Humborg, Jun 28, 1998
+ *
+ * Loadable modules and other fixes by AK, 1998
+ *
+ * Maximum number of loop devices now dynamic via max_loop module parameter.
+ * Russell Kroll <rkroll@exploits.org> 19990701
+ *
+ * Maximum number of loop devices when compiled-in now selectable by passing
+ * max_loop=<1-255> to the kernel on boot.
+ * Erik I. Bols?, <eriki@himolde.no>, Oct 31, 1999
+ *
+ * Completely rewrite request handling to be make_request_fn style and
+ * non blocking, pushing work to a helper thread. Lots of fixes from
+ * Al Viro too.
+ * Jens Axboe <axboe@suse.de>, Nov 2000
+ *
+ * Support up to 256 loop devices
+ * Heinz Mauelshagen <mge@sistina.com>, Feb 2002
+ *
+ * Support for falling back on the write file operation when the address space
+ * operations prepare_write and/or commit_write are not available on the
+ * backing filesystem.
+ * Anton Altaparmakov, 16 Feb 2005
+ *
+ * Still To Fix:
+ * - Advisory locking is ignored here.
+ * - Should use an own CAP_* category instead of CAP_SYS_ADMIN
+ *
+ */
+
+#include <linux/module.h>
+
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/major.h>
+#include <linux/wait.h>
+#include <linux/blkdev.h>
+#include <linux/blkpg.h>
+#include <linux/init.h>
+#include <linux/swap.h>
+#include <linux/slab.h>
+#include <linux/suspend.h>
+#include <linux/writeback.h>
+#include <linux/buffer_head.h>		/* for invalidate_bdev() */
+#include <linux/completion.h>
+#include <linux/highmem.h>
+#include <linux/gfp.h>
+#include <linux/swap.h>
+#include <linux/pagevec.h>
+
+#include <asm/uaccess.h>
+
+#include <lustre_lib.h>
+#include <lustre_lite.h>
+#include "llite_internal.h"
+
+#define LLOOP_MAX_SEGMENTS	LNET_MAX_IOV
+
+/*
+ * Possible states of device (stored in lloop_device::lo_state).
+ *
+ * LLOOP_UNBOUND: no backing file attached; required by loop_set_fd().
+ * LLOOP_BOUND:   set by loop_thread() when it starts; required for I/O.
+ * LLOOP_RUNDOWN: set by loop_clr_fd() to ask loop_thread() to exit.
+ */
+enum {
+	LLOOP_UNBOUND,
+	LLOOP_BOUND,
+	LLOOP_RUNDOWN,
+};
+
+/*
+ * Per-device state of one lloop block device.
+ *
+ * lo_number        index into loop_dev[]/disks[], also the minor number
+ * lo_refcnt        open count; changed in lo_open()/lo_release() under
+ *                  lo_ctl_mutex and checked by loop_clr_fd()
+ * lo_offset        byte offset into the backing file
+ * lo_sizelimit     if > 0, upper bound (bytes) applied by get_loop_size()
+ * lo_flags         LO_FLAGS_* bits
+ * ioctl            only ever set to NULL in this file
+ * lo_backing_file  file attached by loop_set_fd(), NULL when detached
+ * lo_device        block device attached by loop_set_fd()
+ * lo_blocksize     set to PAGE_CACHE_SIZE by loop_set_fd()
+ * old_gfp_mask     gfp mask of the backing mapping saved at attach time and
+ *                  restored at detach time
+ * lo_lock          guards the lo_bio/lo_biotail list; also taken around
+ *                  some lo_state accesses
+ * lo_bio,
+ * lo_biotail       head and tail of the singly linked pending bio list
+ * lo_state         one of LLOOP_UNBOUND/LLOOP_BOUND/LLOOP_RUNDOWN
+ * lo_sem           upped by loop_thread() once when it is running and once
+ *                  when it exits; loop_set_fd()/loop_clr_fd() wait on it
+ * lo_pending       number of bios queued and not yet completed
+ * lo_bh_wait       wait queue loop_thread() sleeps on
+ * lo_env, lo_io,
+ * lo_pvec          per-device I/O context used by do_bio_lustrebacked()
+ * lo_requests      page and offset arrays that lo_pvec points into
+ */
+struct lloop_device {
+	int		  lo_number;
+	int		  lo_refcnt;
+	loff_t	       lo_offset;
+	loff_t	       lo_sizelimit;
+	int		  lo_flags;
+	int		(*ioctl)(struct lloop_device *, int cmd,
+				    unsigned long arg);
+
+	struct file	 *lo_backing_file;
+	struct block_device *lo_device;
+	unsigned	     lo_blocksize;
+
+	int		  old_gfp_mask;
+
+	spinlock_t		lo_lock;
+	struct bio		*lo_bio;
+	struct bio		*lo_biotail;
+	int			lo_state;
+	struct semaphore	lo_sem;
+	struct mutex		lo_ctl_mutex;
+	atomic_t	 lo_pending;
+	wait_queue_head_t	  lo_bh_wait;
+
+	struct request_queue *lo_queue;
+
+	const struct lu_env *lo_env;
+	struct cl_io	 lo_io;
+	struct ll_dio_pages  lo_pvec;
+
+	/* data to handle bio for lustre. */
+	struct lo_request_data {
+		struct page *lrd_pages[LLOOP_MAX_SEGMENTS];
+		loff_t       lrd_offsets[LLOOP_MAX_SEGMENTS];
+	} lo_requests[1];
+};
+
+/*
+ * Loop flags (bits of lloop_device::lo_flags).
+ *
+ * LO_FLAGS_READ_ONLY is set by loop_set_fd() when the backing file was not
+ * opened with FMODE_WRITE; loop_make_request() then fails WRITE bios.
+ */
+enum {
+	LO_FLAGS_READ_ONLY       = 1,
+};
+
+/* major number returned by register_blkdev() in lloop_init() */
+static int lloop_major;
+#define MAX_LOOP_DEFAULT  16
+/* number of lloop devices; module parameter, validated in lloop_init() */
+static int max_loop = MAX_LOOP_DEFAULT;
+/* array of max_loop devices, allocated in lloop_init() */
+static struct lloop_device *loop_dev;
+/* array of max_loop gendisks, one per device */
+static struct gendisk **disks;
+/* serialises lo_ioctl() and lloop_ioctl() */
+static struct mutex lloop_mutex;
+/* cookie returned by ll_iocontrol_register() */
+static void *ll_iocontrol_magic = NULL;
+
+/*
+ * Size of the loop device in 512-byte sectors: the backing file size minus
+ * lo->lo_offset, clamped to lo->lo_sizelimit when that limit is positive
+ * and smaller.
+ */
+static loff_t get_loop_size(struct lloop_device *lo, struct file *file)
+{
+	loff_t bytes = i_size_read(file->f_mapping->host) - lo->lo_offset;
+
+	if (lo->lo_sizelimit > 0 && lo->lo_sizelimit < bytes)
+		bytes = lo->lo_sizelimit;
+
+	/*
+	 * Unfortunately, if we want to do I/O on the device,
+	 * the number of 512-byte sectors has to fit into a sector_t.
+	 * The caller is responsible for that check.
+	 */
+	return bytes >> 9;
+}
+
+/*
+ * Perform the I/O described by a chain of bios against the Lustre backing
+ * file of @lo.
+ *
+ * @head is the first bio of a ->bi_next linked chain; all bios in the chain
+ * must carry the same bi_rw and together hold at most LLOOP_MAX_SEGMENTS
+ * page-sized, page-aligned segments.  The pages and their byte offsets in
+ * the backing file are collected into lo->lo_pvec and handed to
+ * ll_direct_rw_pages() under the inode's i_mutex.
+ *
+ * Returns 0 when exactly ldp_size bytes were transferred, io->ci_result if
+ * cl_io_init() fails, otherwise the value returned by ll_direct_rw_pages()
+ * truncated to int.
+ */
+static int do_bio_lustrebacked(struct lloop_device *lo, struct bio *head)
+{
+	const struct lu_env  *env   = lo->lo_env;
+	struct cl_io	 *io    = &lo->lo_io;
+	struct inode	 *inode = lo->lo_backing_file->f_dentry->d_inode;
+	struct cl_object     *obj = ll_i2info(inode)->lli_clob;
+	/*
+	 * Byte offset into the backing file.  This must be a 64-bit loff_t
+	 * (the element type of offsets[]); an unsigned long would truncate
+	 * positions beyond 4GB on 32-bit kernels.
+	 */
+	loff_t		   offset;
+	int		   ret;
+	int		   i;
+	int		   rw;
+	obd_count	     page_count = 0;
+	struct bio_vec       *bvec;
+	struct bio	   *bio;
+	ssize_t	       bytes;
+
+	struct ll_dio_pages  *pvec = &lo->lo_pvec;
+	struct page	 **pages = pvec->ldp_pages;
+	loff_t	       *offsets = pvec->ldp_offsets;
+
+	truncate_inode_pages(inode->i_mapping, 0);
+
+	/* initialize the IO */
+	memset(io, 0, sizeof(*io));
+	io->ci_obj = obj;
+	ret = cl_io_init(env, io, CIT_MISC, obj);
+	if (ret)
+		return io->ci_result;
+	io->ci_lockreq = CILR_NEVER;
+
+	LASSERT(head != NULL);
+	rw = head->bi_rw;
+	for (bio = head; bio != NULL; bio = bio->bi_next) {
+		LASSERT(rw == bio->bi_rw);
+
+		/* widen before shifting so the sector->byte conversion
+		 * cannot overflow a 32-bit type */
+		offset = ((loff_t)bio->bi_sector << 9) + lo->lo_offset;
+		bio_for_each_segment(bvec, bio, i) {
+			BUG_ON(bvec->bv_offset != 0);
+			BUG_ON(bvec->bv_len != PAGE_CACHE_SIZE);
+
+			pages[page_count] = bvec->bv_page;
+			offsets[page_count] = offset;
+			page_count++;
+			offset += bvec->bv_len;
+		}
+		LASSERT(page_count <= LLOOP_MAX_SEGMENTS);
+	}
+
+	ll_stats_ops_tally(ll_i2sbi(inode),
+			(rw == WRITE) ? LPROC_LL_BRW_WRITE : LPROC_LL_BRW_READ,
+			page_count);
+
+	pvec->ldp_size = page_count << PAGE_CACHE_SHIFT;
+	pvec->ldp_nr = page_count;
+
+	/* FIXME: in ll_direct_rw_pages, it has to allocate many cl_page{}s to
+	 * write those pages into OST. Even worse case is that more pages
+	 * would be asked to write out to swap space, and then finally get here
+	 * again.
+	 * Unfortunately this is NOT easy to fix.
+	 * Thoughts on solution:
+	 * 0. Define a reserved pool for cl_pages, which could be a list of
+	 *    pre-allocated cl_pages;
+	 * 1. Define a new operation in cl_object_operations{}, says clo_depth,
+	 *    which measures how many layers for this lustre object. Generally
+	 *    speaking, the depth would be 2, one for llite, and one for lovsub.
+	 *    However, for SNS, there will be more since we need additional page
+	 *    to store parity;
+	 * 2. Reserve the # of (page_count * depth) cl_pages from the reserved
+	 *    pool. Afterwards, the clio would allocate the pages from reserved
+	 *    pool, this guarantees we needn't allocate the cl_pages from
+	 *    generic cl_page slab cache.
+	 *    Of course, if there is NOT enough pages in the pool, we might
+	 *    be asked to write less pages once, this purely depends on
+	 *    implementation. Anyway, we should be careful to avoid deadlocking.
+	 */
+	mutex_lock(&inode->i_mutex);
+	bytes = ll_direct_rw_pages(env, io, rw, inode, pvec);
+	mutex_unlock(&inode->i_mutex);
+	cl_io_fini(env, io);
+	return (bytes == pvec->ldp_size) ? 0 : (int)bytes;
+}
+
+/*
+ * Append @bio to the tail of lo's pending list (under lo_lock), account for
+ * it in lo_pending and wake the worker if it is sleeping on lo_bh_wait.
+ */
+static void loop_add_bio(struct lloop_device *lo, struct bio *bio)
+{
+	unsigned long irqflags;
+
+	spin_lock_irqsave(&lo->lo_lock, irqflags);
+	if (lo->lo_biotail == NULL)
+		lo->lo_bio = bio;		/* list was empty */
+	else
+		lo->lo_biotail->bi_next = bio;
+	lo->lo_biotail = bio;
+	spin_unlock_irqrestore(&lo->lo_lock, irqflags);
+
+	atomic_inc(&lo->lo_pending);
+	if (waitqueue_active(&lo->lo_bh_wait))
+		wake_up(&lo->lo_bh_wait);
+}
+
+/*
+ * Grab first pending buffer
+ *
+ * Detaches from the head of lo's pending list the longest run of bios that
+ * share the first bio's bi_rw and whose combined bi_vcnt does not exceed
+ * LLOOP_MAX_SEGMENTS.  The detached chain (NULL-terminated via bi_next) is
+ * returned through @req.
+ *
+ * Returns the number of bios detached, or 0 (leaving *req untouched) when
+ * the list is empty.  Runs under lo_lock with interrupts disabled.
+ */
+static unsigned int loop_get_bio(struct lloop_device *lo, struct bio **req)
+{
+	struct bio *first;
+	struct bio **bio;
+	unsigned int count = 0;
+	unsigned int page_count = 0;
+	int rw;
+
+	spin_lock_irq(&lo->lo_lock);
+	first = lo->lo_bio;
+	if (unlikely(first == NULL)) {
+		spin_unlock_irq(&lo->lo_lock);
+		return 0;
+	}
+
+	/* TODO: need to split the bio, too bad. */
+	LASSERT(first->bi_vcnt <= LLOOP_MAX_SEGMENTS);
+
+	rw = first->bi_rw;
+	/* bio walks the link fields so the chain can be cut in place below */
+	bio = &lo->lo_bio;
+	while (*bio && (*bio)->bi_rw == rw) {
+		CDEBUG(D_INFO, "bio sector %llu size %u count %u vcnt%u \n",
+		       (unsigned long long)(*bio)->bi_sector, (*bio)->bi_size,
+		       page_count, (*bio)->bi_vcnt);
+		if (page_count + (*bio)->bi_vcnt > LLOOP_MAX_SEGMENTS)
+			break;
+
+
+		page_count += (*bio)->bi_vcnt;
+		count++;
+		bio = &(*bio)->bi_next;
+	}
+	if (*bio) {
+		/* Some of the bios cannot be merged: they stay queued. */
+		lo->lo_bio = *bio;
+		/* terminate the detached chain */
+		*bio = NULL;
+	} else {
+		/* Hit the end of queue */
+		lo->lo_biotail = NULL;
+		lo->lo_bio = NULL;
+	}
+	*req = first;
+	spin_unlock_irq(&lo->lo_lock);
+	return count;
+}
+
+/*
+ * make_request function of the lloop queue: validates the bio and queues it
+ * for the worker thread.  The bio is failed with cfs_bio_io_error() when the
+ * queue has no device, the device is not LLOOP_BOUND, a WRITE targets a
+ * read-only device, or the command is neither READ, READA nor WRITE.
+ */
+static ll_mrf_ret
+loop_make_request(struct request_queue *q, struct bio *old_bio)
+{
+	struct lloop_device *lo = q->queuedata;
+	int rw = bio_rw(old_bio);
+	int bound;
+
+	if (lo == NULL)
+		goto err;
+
+	CDEBUG(D_INFO, "submit bio sector %llu size %u\n",
+	       (unsigned long long)old_bio->bi_sector, old_bio->bi_size);
+
+	spin_lock_irq(&lo->lo_lock);
+	bound = (lo->lo_state == LLOOP_BOUND);
+	spin_unlock_irq(&lo->lo_lock);
+	if (!bound)
+		goto err;
+
+	/* writes are refused on a read-only device */
+	if (rw == WRITE && (lo->lo_flags & LO_FLAGS_READ_ONLY))
+		goto err;
+
+	/* READA is accepted and handled like READ */
+	if (rw != WRITE && rw != READA && rw != READ) {
+		CERROR("lloop: unknown command (%x)\n", rw);
+		goto err;
+	}
+
+	loop_add_bio(lo, old_bio);
+	LL_MRF_RETURN(0);
+err:
+	cfs_bio_io_error(old_bio, old_bio->bi_size);
+	LL_MRF_RETURN(0);
+}
+
+
+/*
+ * Run the I/O for a whole bio chain, then complete every bio in the chain
+ * with the single result code returned by do_bio_lustrebacked().
+ */
+static inline void loop_handle_bio(struct lloop_device *lo, struct bio *bio)
+{
+	struct bio *next;
+	int rc = do_bio_lustrebacked(lo, bio);
+
+	for (; bio != NULL; bio = next) {
+		/* unlink before completion */
+		next = bio->bi_next;
+		bio->bi_next = NULL;
+		cfs_bio_endio(bio, bio->bi_size, rc);
+	}
+}
+
+/*
+ * Wake-up condition of the worker thread: non-zero when bios are pending or
+ * the device has been put into LLOOP_RUNDOWN.
+ */
+static inline int loop_active(struct lloop_device *lo)
+{
+	if (atomic_read(&lo->lo_pending))
+		return 1;
+
+	return lo->lo_state == LLOOP_RUNDOWN;
+}
+
+/*
+ * worker thread that handles reads/writes to file backed loop devices,
+ * to avoid blocking in our make_request_fn.
+ *
+ * @data is the struct lloop_device to serve.  The thread marks the device
+ * LLOOP_BOUND, obtains a cl environment, ups lo_sem once to tell
+ * loop_set_fd() it is running, and then loops: sleep until loop_active(),
+ * exit if nothing is pending and the state is LLOOP_RUNDOWN, otherwise take
+ * a batch with loop_get_bio() and process it.  lo_sem is upped again on the
+ * way out so loop_clr_fd() can proceed.
+ *
+ * Returns 0, or the PTR_ERR() of a failed cl_env_get().
+ */
+static int loop_thread(void *data)
+{
+	struct lloop_device *lo = data;
+	struct bio *bio;
+	unsigned int count;
+	unsigned long times = 0;
+	unsigned long total_count = 0;
+
+	struct lu_env *env;
+	int refcheck;
+	int ret = 0;
+
+	set_user_nice(current, -20);
+
+	lo->lo_state = LLOOP_BOUND;
+
+	env = cl_env_get(&refcheck);
+	/* NOTE(review): on this failure path lo_state is left LLOOP_BOUND and
+	 * lo_sem is upped only once (at "out") */
+	if (IS_ERR(env))
+		GOTO(out, ret = PTR_ERR(env));
+
+	lo->lo_env = env;
+	memset(&lo->lo_pvec, 0, sizeof(lo->lo_pvec));
+	lo->lo_pvec.ldp_pages   = lo->lo_requests[0].lrd_pages;
+	lo->lo_pvec.ldp_offsets = lo->lo_requests[0].lrd_offsets;
+
+	/*
+	 * up sem, we are running
+	 */
+	up(&lo->lo_sem);
+
+	for (;;) {
+		wait_event(lo->lo_bh_wait, loop_active(lo));
+		/* only exit once the pending list has been drained */
+		if (!atomic_read(&lo->lo_pending)) {
+			int exiting = 0;
+			spin_lock_irq(&lo->lo_lock);
+			exiting = (lo->lo_state == LLOOP_RUNDOWN);
+			spin_unlock_irq(&lo->lo_lock);
+			if (exiting)
+				break;
+		}
+
+		bio = NULL;
+		count = loop_get_bio(lo, &bio);
+		if (!count) {
+			CWARN("lloop(minor: %d): missing bio\n", lo->lo_number);
+			continue;
+		}
+
+		/* running batch-size statistics, reset when the sum wraps */
+		total_count += count;
+		if (total_count < count) {     /* overflow */
+			total_count = count;
+			times = 1;
+		} else {
+			times++;
+		}
+		/* log the average every 128 batches */
+		if ((times & 127) == 0) {
+			CDEBUG(D_INFO, "total: %lu, count: %lu, avg: %lu\n",
+			       total_count, times, total_count / times);
+		}
+
+		LASSERT(bio != NULL);
+		LASSERT(count <= atomic_read(&lo->lo_pending));
+		loop_handle_bio(lo, bio);
+		/* bios stay counted as pending until they are completed */
+		atomic_sub(count, &lo->lo_pending);
+	}
+	cl_env_put(env, &refcheck);
+
+out:
+	up(&lo->lo_sem);
+	return ret;
+}
+
+/*
+ * Bind the Lustre regular file @file to loop device @lo / block device
+ * @bdev and start the worker thread.
+ *
+ * @unused is ignored.  A module reference is taken for as long as the
+ * device stays bound.
+ *
+ * Returns 0 on success, or:
+ *   -ENODEV  the module reference could not be taken
+ *   -EBUSY   @lo is not in state LLOOP_UNBOUND
+ *   -EINVAL  @file is not a regular file on a Lustre super block
+ *   -EFBIG   the size in sectors does not fit into a sector_t
+ *   the kthread_run() error if the worker thread could not be created
+ * On failure the caller still owns its references on @file and @bdev.
+ */
+static int loop_set_fd(struct lloop_device *lo, struct file *unused,
+		       struct block_device *bdev, struct file *file)
+{
+	struct inode	 *inode;
+	struct address_space *mapping;
+	struct task_struct   *task;
+	int		   lo_flags = 0;
+	int		   error;
+	loff_t		size;
+
+	if (!try_module_get(THIS_MODULE))
+		return -ENODEV;
+
+	error = -EBUSY;
+	if (lo->lo_state != LLOOP_UNBOUND)
+		goto out;
+
+	mapping = file->f_mapping;
+	inode = mapping->host;
+
+	error = -EINVAL;
+	if (!S_ISREG(inode->i_mode) || inode->i_sb->s_magic != LL_SUPER_MAGIC)
+		goto out;
+
+	if (!(file->f_mode & FMODE_WRITE))
+		lo_flags |= LO_FLAGS_READ_ONLY;
+
+	size = get_loop_size(lo, file);
+
+	if ((loff_t)(sector_t)size != size) {
+		error = -EFBIG;
+		goto out;
+	}
+
+	/* remove all pages in cache so as dirty pages not to be existent. */
+	truncate_inode_pages(mapping, 0);
+
+	set_device_ro(bdev, (lo_flags & LO_FLAGS_READ_ONLY) != 0);
+
+	lo->lo_blocksize = PAGE_CACHE_SIZE;
+	lo->lo_device = bdev;
+	lo->lo_flags = lo_flags;
+	lo->lo_backing_file = file;
+	lo->ioctl = NULL;
+	lo->lo_sizelimit = 0;
+	/* forbid allocations on this mapping from recursing into I/O or
+	 * filesystem code; the old mask is restored at detach time */
+	lo->old_gfp_mask = mapping_gfp_mask(mapping);
+	mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));
+
+	lo->lo_bio = lo->lo_biotail = NULL;
+
+	/*
+	 * set queue make_request_fn, and add limits based on lower level
+	 * device
+	 */
+	blk_queue_make_request(lo->lo_queue, loop_make_request);
+	lo->lo_queue->queuedata = lo;
+
+	/* queue parameters */
+	CLASSERT(PAGE_CACHE_SIZE < (1 << (sizeof(unsigned short) * 8)));
+	blk_queue_logical_block_size(lo->lo_queue,
+				     (unsigned short)PAGE_CACHE_SIZE);
+	blk_queue_max_hw_sectors(lo->lo_queue,
+				 LLOOP_MAX_SEGMENTS << (PAGE_CACHE_SHIFT - 9));
+	blk_queue_max_segments(lo->lo_queue, LLOOP_MAX_SEGMENTS);
+
+	set_capacity(disks[lo->lo_number], size);
+	bd_set_size(bdev, size << 9);
+
+	set_blocksize(bdev, lo->lo_blocksize);
+
+	task = kthread_run(loop_thread, lo, "lloop%d", lo->lo_number);
+	if (IS_ERR(task)) {
+		/*
+		 * Nobody will ever up lo_sem, so waiting on it would hang
+		 * forever.  Undo the binding instead; lo_state is still
+		 * LLOOP_UNBOUND because only the thread changes it.
+		 */
+		error = PTR_ERR(task);
+		goto out_unbind;
+	}
+	/* wait until the worker thread reports that it is running */
+	down(&lo->lo_sem);
+	return 0;
+
+out_unbind:
+	/* mirror of the teardown done in loop_clr_fd() */
+	lo->lo_backing_file = NULL;
+	lo->lo_device = NULL;
+	lo->lo_flags = 0;
+	set_capacity(disks[lo->lo_number], 0);
+	bd_set_size(bdev, 0);
+	mapping_set_gfp_mask(mapping, lo->old_gfp_mask);
+out:
+	/* This is safe: open() is still holding a reference. */
+	module_put(THIS_MODULE);
+	return error;
+}
+
+/*
+ * Detach the backing file from @lo: stop the worker thread, reset the
+ * device fields, shrink @bdev to size 0, restore the mapping's gfp mask and
+ * drop the file and module references taken at attach time.
+ *
+ * @count is the number of opens the caller tolerates on the device.
+ *
+ * Returns 0 on success, -ENXIO if the device is not LLOOP_BOUND, -EBUSY if
+ * lo_refcnt exceeds @count, -EINVAL if there is no backing file.
+ */
+static int loop_clr_fd(struct lloop_device *lo, struct block_device *bdev,
+		       int count)
+{
+	struct file *filp = lo->lo_backing_file;
+	int gfp = lo->old_gfp_mask;
+
+	if (lo->lo_state != LLOOP_BOUND)
+		return -ENXIO;
+
+	if (lo->lo_refcnt > count)	/* we needed one fd for the ioctl */
+		return -EBUSY;
+
+	if (filp == NULL)
+		return -EINVAL;
+
+	/* ask loop_thread() to exit ... */
+	spin_lock_irq(&lo->lo_lock);
+	lo->lo_state = LLOOP_RUNDOWN;
+	spin_unlock_irq(&lo->lo_lock);
+	wake_up(&lo->lo_bh_wait);
+
+	/* ... and wait for its final up() */
+	down(&lo->lo_sem);
+	lo->lo_backing_file = NULL;
+	lo->ioctl = NULL;
+	lo->lo_device = NULL;
+	lo->lo_offset = 0;
+	lo->lo_sizelimit = 0;
+	lo->lo_flags = 0;
+	ll_invalidate_bdev(bdev, 0);
+	set_capacity(disks[lo->lo_number], 0);
+	bd_set_size(bdev, 0);
+	mapping_set_gfp_mask(filp->f_mapping, gfp);
+	lo->lo_state = LLOOP_UNBOUND;
+	fput(filp);
+	/* This is safe: open() is still holding a reference. */
+	module_put(THIS_MODULE);
+	return 0;
+}
+
+/* Block device open: count one more opener of the lloop device. */
+static int lo_open(struct block_device *bdev, fmode_t mode)
+{
+	struct gendisk *disk = bdev->bd_disk;
+	struct lloop_device *lo = disk->private_data;
+
+	mutex_lock(&lo->lo_ctl_mutex);
+	lo->lo_refcnt += 1;
+	mutex_unlock(&lo->lo_ctl_mutex);
+
+	return 0;
+}
+
+/* Block device release: drop the opener counted by lo_open(). */
+static void lo_release(struct gendisk *disk, fmode_t mode)
+{
+	struct lloop_device *lo = disk->private_data;
+
+	mutex_lock(&lo->lo_ctl_mutex);
+	lo->lo_refcnt -= 1;
+	mutex_unlock(&lo->lo_ctl_mutex);
+}
+
+/*
+ * lloop device node's ioctl function.
+ *
+ * LL_IOC_LLOOP_DETACH  detach the backing file and drop the block device
+ *                      reference grabbed at attach time.
+ * LL_IOC_LLOOP_INFO    copy the fid of the backing file to the user buffer
+ *                      at @arg; a zeroed fid is reported when the device is
+ *                      not bound.
+ *
+ * Returns 0 on success, -EFAULT if the copy to user space fails, -EINVAL
+ * for unknown commands, or the error from loop_clr_fd().  Serialised by
+ * lloop_mutex.
+ */
+static int lo_ioctl(struct block_device *bdev, fmode_t mode,
+		    unsigned int cmd, unsigned long arg)
+{
+	struct lloop_device *lo = bdev->bd_disk->private_data;
+	int err = 0;
+
+	mutex_lock(&lloop_mutex);
+	switch (cmd) {
+	case LL_IOC_LLOOP_DETACH: {
+		err = loop_clr_fd(lo, bdev, 2);
+		if (err == 0)
+			ll_blkdev_put(bdev, 0); /* grabbed in LLOOP_ATTACH */
+		break;
+	}
+
+	case LL_IOC_LLOOP_INFO: {
+		struct lu_fid fid;
+
+		/*
+		 * Only touch the backing file when the device really is
+		 * bound: user space may issue this ioctl on an unbound
+		 * device, where lo_backing_file is NULL.
+		 */
+		if (lo->lo_state == LLOOP_BOUND &&
+		    lo->lo_backing_file != NULL) {
+			struct inode *inode;
+
+			inode = lo->lo_backing_file->f_dentry->d_inode;
+			fid = ll_i2info(inode)->lli_fid;
+		} else {
+			fid_zero(&fid);
+		}
+
+		if (copy_to_user((struct lu_fid *)arg, &fid, sizeof(fid)))
+			err = -EFAULT;
+		break;
+	}
+
+	default:
+		err = -EINVAL;
+		break;
+	}
+	mutex_unlock(&lloop_mutex);
+
+	return err;
+}
+
+/* Block device operations installed on every lloop gendisk by lloop_init(). */
+static struct block_device_operations lo_fops = {
+	.owner =	THIS_MODULE,
+	.open =	 lo_open,
+	.release =      lo_release,
+	.ioctl =	lo_ioctl,
+};
+
+/* dynamic iocontrol callback.
+ * This callback is registered in lloop_init and will be called by
+ * ll_iocontrol_call.
+ *
+ * This is a llite regular file ioctl function. It takes the responsibility
+ * of attaching or detaching a file by a lloop's device number.
+ *
+ * LL_IOC_LLOOP_ATTACH        bind @file to the first unbound lloop device
+ *                            and store the old-encoded device number at the
+ *                            user address @arg.
+ * LL_IOC_LLOOP_DETACH_BYDEV  @arg is an old-encoded device number; detach
+ *                            the corresponding lloop device.
+ *
+ * Returns LLIOC_CONT when @magic is not ours, otherwise LLIOC_STOP with the
+ * result code stored in *@rcp (when @rcp is non-NULL).
+ */
+static enum llioc_iter lloop_ioctl(struct inode *unused, struct file *file,
+				   unsigned int cmd, unsigned long arg,
+				   void *magic, int *rcp)
+{
+	struct lloop_device *lo = NULL;
+	struct block_device *bdev = NULL;
+	int err = 0;
+	dev_t dev;
+
+	if (magic != ll_iocontrol_magic)
+		return LLIOC_CONT;
+
+	if (disks == NULL)
+		GOTO(out1, err = -ENODEV);
+
+	CWARN("Enter llop_ioctl\n");
+
+	mutex_lock(&lloop_mutex);
+	switch (cmd) {
+	case LL_IOC_LLOOP_ATTACH: {
+		struct lloop_device *lo_free = NULL;
+		int i;
+
+		/*
+		 * Remember the first unbound device and look for a device
+		 * already bound to this inode.  lo is reset to NULL by the
+		 * loop increment, so it is non-NULL afterwards only when the
+		 * loop was left through the break.
+		 */
+		for (i = 0; i < max_loop; i++, lo = NULL) {
+			lo = &loop_dev[i];
+			if (lo->lo_state == LLOOP_UNBOUND) {
+				if (!lo_free)
+					lo_free = lo;
+				continue;
+			}
+			if (lo->lo_backing_file->f_dentry->d_inode ==
+			    file->f_dentry->d_inode)
+				break;
+		}
+		/* file already attached, or no free device left */
+		if (lo || !lo_free)
+			GOTO(out, err = -EBUSY);
+
+		lo = lo_free;
+		dev = MKDEV(lloop_major, lo->lo_number);
+
+		/* quit if the user pointer is not writable */
+		if (put_user((long)old_encode_dev(dev), (long*)arg))
+			GOTO(out, err = -EFAULT);
+
+		bdev = blkdev_get_by_dev(dev, file->f_mode, NULL);
+		if (IS_ERR(bdev))
+			GOTO(out, err = PTR_ERR(bdev));
+
+		/* file reference held while attached; dropped here on
+		 * failure, otherwise by loop_clr_fd() */
+		get_file(file);
+		err = loop_set_fd(lo, NULL, bdev, file);
+		if (err) {
+			fput(file);
+			ll_blkdev_put(bdev, 0);
+		}
+
+		break;
+	}
+
+	case LL_IOC_LLOOP_DETACH_BYDEV: {
+		int minor;
+
+		dev = old_decode_dev(arg);
+		if (MAJOR(dev) != lloop_major)
+			GOTO(out, err = -EINVAL);
+
+		minor = MINOR(dev);
+		if (minor > max_loop - 1)
+			GOTO(out, err = -EINVAL);
+
+		lo = &loop_dev[minor];
+		if (lo->lo_state != LLOOP_BOUND)
+			GOTO(out, err = -EINVAL);
+
+		bdev = lo->lo_device;
+		err = loop_clr_fd(lo, bdev, 1);
+		if (err == 0)
+			ll_blkdev_put(bdev, 0); /* grabbed in LLOOP_ATTACH */
+
+		break;
+	}
+
+	default:
+		err = -EINVAL;
+		break;
+	}
+
+out:
+	mutex_unlock(&lloop_mutex);
+out1:
+	if (rcp)
+		*rcp = err;
+	return LLIOC_STOP;
+}
+
+/*
+ * Module init: register the "lloop" block major and the llite ioctl
+ * callback, allocate max_loop devices with their gendisks and request
+ * queues, and add the disks.
+ *
+ * An out-of-range max_loop (outside 1..256) is replaced by
+ * MAX_LOOP_DEFAULT.  Returns 0 on success, -EIO if the major cannot be
+ * registered, -ENOMEM for any later failure (everything set up so far is
+ * released).
+ */
+static int __init lloop_init(void)
+{
+	int	i;
+	unsigned int cmdlist[] = {
+		LL_IOC_LLOOP_ATTACH,
+		LL_IOC_LLOOP_DETACH_BYDEV,
+	};
+
+	if (max_loop < 1 || max_loop > 256) {
+		max_loop = MAX_LOOP_DEFAULT;
+		CWARN("lloop: invalid max_loop (must be between"
+		      " 1 and 256), using default (%u)\n", max_loop);
+	}
+
+	lloop_major = register_blkdev(0, "lloop");
+	if (lloop_major < 0)
+		return -EIO;
+
+	CDEBUG(D_CONFIG, "registered lloop major %d with %u minors\n",
+	       lloop_major, max_loop);
+
+	/*
+	 * lloop_ioctl() takes lloop_mutex and becomes reachable as soon as
+	 * it is registered below, so the mutex must be initialised first.
+	 */
+	mutex_init(&lloop_mutex);
+
+	ll_iocontrol_magic = ll_iocontrol_register(lloop_ioctl, 2, cmdlist);
+	if (ll_iocontrol_magic == NULL)
+		goto out_mem1;
+
+	OBD_ALLOC_WAIT(loop_dev, max_loop * sizeof(*loop_dev));
+	if (!loop_dev)
+		goto out_mem1;
+
+	OBD_ALLOC_WAIT(disks, max_loop * sizeof(*disks));
+	if (!disks)
+		goto out_mem2;
+
+	for (i = 0; i < max_loop; i++) {
+		disks[i] = alloc_disk(1);
+		if (!disks[i])
+			goto out_mem3;
+	}
+
+	for (i = 0; i < max_loop; i++) {
+		struct lloop_device *lo = &loop_dev[i];
+		struct gendisk *disk = disks[i];
+
+		lo->lo_queue = blk_alloc_queue(GFP_KERNEL);
+		if (!lo->lo_queue)
+			goto out_mem4;
+
+		mutex_init(&lo->lo_ctl_mutex);
+		sema_init(&lo->lo_sem, 0);
+		init_waitqueue_head(&lo->lo_bh_wait);
+		lo->lo_number = i;
+		spin_lock_init(&lo->lo_lock);
+		disk->major = lloop_major;
+		disk->first_minor = i;
+		disk->fops = &lo_fops;
+		sprintf(disk->disk_name, "lloop%d", i);
+		disk->private_data = lo;
+		disk->queue = lo->lo_queue;
+	}
+
+	/* We cannot fail after we call this, so another loop!*/
+	for (i = 0; i < max_loop; i++)
+		add_disk(disks[i]);
+	return 0;
+
+out_mem4:
+	/* i is the index whose queue allocation failed: undo 0..i-1 */
+	while (i--)
+		blk_cleanup_queue(loop_dev[i].lo_queue);
+	/* all disks were allocated, release every one of them */
+	i = max_loop;
+out_mem3:
+	while (i--)
+		put_disk(disks[i]);
+	OBD_FREE(disks, max_loop * sizeof(*disks));
+out_mem2:
+	OBD_FREE(loop_dev, max_loop * sizeof(*loop_dev));
+out_mem1:
+	unregister_blkdev(lloop_major, "lloop");
+	ll_iocontrol_unregister(ll_iocontrol_magic);
+	CERROR("lloop: ran out of memory\n");
+	return -ENOMEM;
+}
+
+/*
+ * Module exit: unregister the ioctl callback, tear down every gendisk and
+ * request queue, unregister the block major and free the device arrays.
+ */
+static void lloop_exit(void)
+{
+	int i;
+
+	ll_iocontrol_unregister(ll_iocontrol_magic);
+	for (i = 0; i < max_loop; i++) {
+		del_gendisk(disks[i]);
+		blk_cleanup_queue(loop_dev[i].lo_queue);
+		put_disk(disks[i]);
+	}
+	if (ll_unregister_blkdev(lloop_major, "lloop"))
+		CWARN("lloop: cannot unregister blkdev\n");
+	else
+		CDEBUG(D_CONFIG, "unregistered lloop major %d\n", lloop_major);
+
+	OBD_FREE(disks, max_loop * sizeof(*disks));
+	OBD_FREE(loop_dev, max_loop * sizeof(*loop_dev));
+}
+
+/* module entry and exit points */
+module_init(lloop_init);
+module_exit(lloop_exit);
+
+/* max_loop is read-only (0444) once the module is loaded */
+CFS_MODULE_PARM(max_loop, "i", int, 0444, "maximum of lloop_device");
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Lustre virtual block device");
+MODULE_LICENSE("GPL");

diff --git a/drivers/staging/lustre/lustre/llite/lproc_llite.c b/drivers/staging/lustre/lustre/llite/lproc_llite.c
new file mode 100644
index 0000000..6a82505
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/lproc_llite.c

@@ -0,0 +1,1370 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <linux/version.h>
+#include <lustre_lite.h>
+#include <lprocfs_status.h>
+#include <linux/seq_file.h>
+#include <obd_support.h>
+
+#include "llite_internal.h"
+
+/* llite /proc directory entry; defined whether or not LPROCFS is set */
+struct proc_dir_entry *proc_lustre_fs_root;
+
+#ifdef LPROCFS
+/* /proc/lustre/llite mount point registration */
+extern struct file_operations vvp_dump_pgcache_file_ops;
+/* file_operations objects declared here without initialisers */
+struct file_operations ll_rw_extents_stats_fops;
+struct file_operations ll_rw_extents_stats_pp_fops;
+struct file_operations ll_rw_offset_stats_fops;
+
+/*
+ * seq_file show handler: print os_bsize from ll_statfs_internal().
+ * Returns the statfs error, or the seq_printf() result.
+ */
+static int ll_blksize_seq_show(struct seq_file *m, void *v)
+{
+	struct super_block *sb = (struct super_block *)m->private;
+	struct obd_statfs statfs;
+	int rc;
+
+	LASSERT(sb != NULL);
+	rc = ll_statfs_internal(sb, &statfs,
+				cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+				OBD_STATFS_NODELAY);
+	if (rc)
+		return rc;
+
+	return seq_printf(m, "%u\n", statfs.os_bsize);
+}
+LPROC_SEQ_FOPS_RO(ll_blksize);
+
+/*
+ * seq_file show handler: print os_blocks scaled from blocks to kilobytes.
+ * The scaling doubles the count once for every halving of os_bsize / 1024
+ * that stays non-zero.
+ */
+static int ll_kbytestotal_seq_show(struct seq_file *m, void *v)
+{
+	struct super_block *sb = (struct super_block *)m->private;
+	struct obd_statfs statfs;
+	__u64 kbytes;
+	__u32 scale;
+	int rc;
+
+	LASSERT(sb != NULL);
+	rc = ll_statfs_internal(sb, &statfs,
+				cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+				OBD_STATFS_NODELAY);
+	if (rc)
+		return rc;
+
+	kbytes = statfs.os_blocks;
+	scale = statfs.os_bsize >> 10;
+	for (scale >>= 1; scale != 0; scale >>= 1)
+		kbytes <<= 1;
+
+	return seq_printf(m, LPU64"\n", kbytes);
+}
+LPROC_SEQ_FOPS_RO(ll_kbytestotal);
+
+/*
+ * seq_file show handler: print os_bfree scaled from blocks to kilobytes.
+ * The scaling doubles the count once for every halving of os_bsize / 1024
+ * that stays non-zero.
+ */
+static int ll_kbytesfree_seq_show(struct seq_file *m, void *v)
+{
+	struct super_block *sb = (struct super_block *)m->private;
+	struct obd_statfs statfs;
+	__u64 kbytes;
+	__u32 scale;
+	int rc;
+
+	LASSERT(sb != NULL);
+	rc = ll_statfs_internal(sb, &statfs,
+				cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+				OBD_STATFS_NODELAY);
+	if (rc)
+		return rc;
+
+	kbytes = statfs.os_bfree;
+	scale = statfs.os_bsize >> 10;
+	for (scale >>= 1; scale != 0; scale >>= 1)
+		kbytes <<= 1;
+
+	return seq_printf(m, LPU64"\n", kbytes);
+}
+LPROC_SEQ_FOPS_RO(ll_kbytesfree);
+
+/*
+ * seq_file show handler: print os_bavail scaled from blocks to kilobytes.
+ * The scaling doubles the count once for every halving of os_bsize / 1024
+ * that stays non-zero.
+ */
+static int ll_kbytesavail_seq_show(struct seq_file *m, void *v)
+{
+	struct super_block *sb = (struct super_block *)m->private;
+	struct obd_statfs statfs;
+	__u64 kbytes;
+	__u32 scale;
+	int rc;
+
+	LASSERT(sb != NULL);
+	rc = ll_statfs_internal(sb, &statfs,
+				cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+				OBD_STATFS_NODELAY);
+	if (rc)
+		return rc;
+
+	kbytes = statfs.os_bavail;
+	scale = statfs.os_bsize >> 10;
+	for (scale >>= 1; scale != 0; scale >>= 1)
+		kbytes <<= 1;
+
+	return seq_printf(m, LPU64"\n", kbytes);
+}
+LPROC_SEQ_FOPS_RO(ll_kbytesavail);
+
+/* seq_file show handler: print os_files from ll_statfs_internal(). */
+static int ll_filestotal_seq_show(struct seq_file *m, void *v)
+{
+	struct super_block *sb = (struct super_block *)m->private;
+	struct obd_statfs statfs;
+	int rc;
+
+	LASSERT(sb != NULL);
+	rc = ll_statfs_internal(sb, &statfs,
+				cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+				OBD_STATFS_NODELAY);
+	if (rc)
+		return rc;
+
+	return seq_printf(m, LPU64"\n", statfs.os_files);
+}
+LPROC_SEQ_FOPS_RO(ll_filestotal);
+
+/* seq_file show handler: print os_ffree from ll_statfs_internal(). */
+static int ll_filesfree_seq_show(struct seq_file *m, void *v)
+{
+	struct super_block *sb = (struct super_block *)m->private;
+	struct obd_statfs statfs;
+	int rc;
+
+	LASSERT(sb != NULL);
+	rc = ll_statfs_internal(sb, &statfs,
+				cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+				OBD_STATFS_NODELAY);
+	if (rc)
+		return rc;
+
+	return seq_printf(m, LPU64"\n", statfs.os_ffree);
+}
+LPROC_SEQ_FOPS_RO(ll_filesfree);
+
+/*
+ * seq_file show handler: print "remote client" when LL_SBI_RMT_CLIENT is
+ * set in the super block info flags, "local client" otherwise.
+ */
+static int ll_client_type_seq_show(struct seq_file *m, void *v)
+{
+	struct ll_sb_info *sbi = ll_s2sbi((struct super_block *)m->private);
+
+	LASSERT(sbi != NULL);
+
+	if (!(sbi->ll_flags & LL_SBI_RMT_CLIENT))
+		return seq_printf(m, "local client\n");
+
+	return seq_printf(m, "remote client\n");
+}
+LPROC_SEQ_FOPS_RO(ll_client_type);
+
+/* seq_file show handler: print the name of the super block's fs type. */
+static int ll_fstype_seq_show(struct seq_file *m, void *v)
+{
+	struct super_block *sb = (struct super_block *)m->private;
+	const char *fsname;
+
+	LASSERT(sb != NULL);
+	fsname = sb->s_type->name;
+
+	return seq_printf(m, "%s\n", fsname);
+}
+LPROC_SEQ_FOPS_RO(ll_fstype);
+
+/* seq_file show handler: print the uuid string stored in ll_sb_uuid. */
+static int ll_sb_uuid_seq_show(struct seq_file *m, void *v)
+{
+	struct super_block *sb = (struct super_block *)m->private;
+	struct ll_sb_info *sbi;
+
+	LASSERT(sb != NULL);
+	sbi = ll_s2sbi(sb);
+
+	return seq_printf(m, "%s\n", sbi->ll_sb_uuid.uuid);
+}
+LPROC_SEQ_FOPS_RO(ll_sb_uuid);
+
+/*
+ * seq_file show handler: print the counters of the cl_site behind this
+ * super block via cl_site_stats_print().
+ *
+ * See description of statistical counters in struct cl_site, and
+ * struct lu_site.
+ */
+static int ll_site_stats_seq_show(struct seq_file *m, void *v)
+{
+	struct super_block *sb = m->private;
+	struct ll_sb_info *sbi = ll_s2sbi(sb);
+
+	return cl_site_stats_print(lu2cl_site(sbi->ll_site), m);
+}
+LPROC_SEQ_FOPS_RO(ll_site_stats);
+
+/*
+ * seq_file show handler: print ra_max_pages through
+ * lprocfs_seq_read_frac_helper() with the pages-per-megabyte multiplier.
+ */
+static int ll_max_readahead_mb_seq_show(struct seq_file *m, void *v)
+{
+	const int pages_per_mb = 1 << (20 - PAGE_CACHE_SHIFT);
+	struct super_block *sb = m->private;
+	struct ll_sb_info *sbi = ll_s2sbi(sb);
+	long nr_pages;
+
+	spin_lock(&sbi->ll_lock);
+	nr_pages = sbi->ll_ra_info.ra_max_pages;
+	spin_unlock(&sbi->ll_lock);
+
+	return lprocfs_seq_read_frac_helper(m, nr_pages, pages_per_mb);
+}
+
+/*
+ * Write handler: parse a value with lprocfs_write_frac_helper() into a page
+ * count and store it as ra_max_pages.  Values that are negative or exceed
+ * half of num_physpages are rejected with -ERANGE.  Returns @count on
+ * success.
+ */
+static ssize_t ll_max_readahead_mb_seq_write(struct file *file, const char *buffer,
+					 size_t count, loff_t *off)
+{
+	struct seq_file *seq = file->private_data;
+	struct super_block *sb = seq->private;
+	struct ll_sb_info *sbi = ll_s2sbi(sb);
+	int pages_per_mb = 1 << (20 - PAGE_CACHE_SHIFT);
+	int nr_pages;
+	int rc;
+
+	rc = lprocfs_write_frac_helper(buffer, count, &nr_pages, pages_per_mb);
+	if (rc)
+		return rc;
+
+	if (nr_pages < 0 || nr_pages > num_physpages / 2) {
+		CERROR("can't set file readahead more than %lu MB\n",
+		       num_physpages >> (20 - PAGE_CACHE_SHIFT + 1)); /*1/2 of RAM*/
+		return -ERANGE;
+	}
+
+	spin_lock(&sbi->ll_lock);
+	sbi->ll_ra_info.ra_max_pages = nr_pages;
+	spin_unlock(&sbi->ll_lock);
+
+	return count;
+}
+LPROC_SEQ_FOPS(ll_max_readahead_mb);
+
+/*
+ * seq_file show handler: print ra_max_pages_per_file through
+ * lprocfs_seq_read_frac_helper() with the pages-per-megabyte multiplier.
+ */
+static int ll_max_readahead_per_file_mb_seq_show(struct seq_file *m, void *v)
+{
+	const int pages_per_mb = 1 << (20 - PAGE_CACHE_SHIFT);
+	struct super_block *sb = m->private;
+	struct ll_sb_info *sbi = ll_s2sbi(sb);
+	long nr_pages;
+
+	spin_lock(&sbi->ll_lock);
+	nr_pages = sbi->ll_ra_info.ra_max_pages_per_file;
+	spin_unlock(&sbi->ll_lock);
+
+	return lprocfs_seq_read_frac_helper(m, nr_pages, pages_per_mb);
+}
+
+/*
+ * Write handler: parse a value with lprocfs_write_frac_helper() into a page
+ * count and store it as ra_max_pages_per_file.  Values that are negative or
+ * exceed the current ra_max_pages are rejected with -ERANGE.  Returns
+ * @count on success.
+ */
+static ssize_t ll_max_readahead_per_file_mb_seq_write(struct file *file,
+						  const char *buffer,
+						  size_t count, loff_t *off)
+{
+	struct super_block *sb = ((struct seq_file *)file->private_data)->private;
+	struct ll_sb_info *sbi = ll_s2sbi(sb);
+	int mult, rc, pages_number;
+
+	mult = 1 << (20 - PAGE_CACHE_SHIFT);
+	rc = lprocfs_write_frac_helper(buffer, count, &pages_number, mult);
+	if (rc)
+		return rc;
+
+	if (pages_number < 0 ||
+		pages_number > sbi->ll_ra_info.ra_max_pages) {
+		/* the limit is kept in pages: convert it to MB to match the
+		 * unit named in the message */
+		CERROR("can't set file readahead more than "
+		       "max_read_ahead_mb %lu MB\n",
+		       sbi->ll_ra_info.ra_max_pages >> (20 - PAGE_CACHE_SHIFT));
+		return -ERANGE;
+	}
+
+	spin_lock(&sbi->ll_lock);
+	sbi->ll_ra_info.ra_max_pages_per_file = pages_number;
+	spin_unlock(&sbi->ll_lock);
+
+	return count;
+}
+LPROC_SEQ_FOPS(ll_max_readahead_per_file_mb);
+
+/*
+ * seq_file show handler: print ra_max_read_ahead_whole_pages through
+ * lprocfs_seq_read_frac_helper() with the pages-per-megabyte multiplier.
+ */
+static int ll_max_read_ahead_whole_mb_seq_show(struct seq_file *m, void *unused)
+{
+	const int pages_per_mb = 1 << (20 - PAGE_CACHE_SHIFT);
+	struct super_block *sb = m->private;
+	struct ll_sb_info *sbi = ll_s2sbi(sb);
+	long nr_pages;
+
+	spin_lock(&sbi->ll_lock);
+	nr_pages = sbi->ll_ra_info.ra_max_read_ahead_whole_pages;
+	spin_unlock(&sbi->ll_lock);
+
+	return lprocfs_seq_read_frac_helper(m, nr_pages, pages_per_mb);
+}
+
+/*
+ * Write handler: parse a value with lprocfs_write_frac_helper() into a page
+ * count and store it as ra_max_read_ahead_whole_pages.  Values that are
+ * negative or exceed ra_max_pages_per_file are rejected with -ERANGE.
+ * Returns @count on success.
+ */
+static ssize_t ll_max_read_ahead_whole_mb_seq_write(struct file *file,
+						const char *buffer,
+						size_t count, loff_t *off)
+{
+	struct seq_file *seq = file->private_data;
+	struct super_block *sb = seq->private;
+	struct ll_sb_info *sbi = ll_s2sbi(sb);
+	int pages_per_mb = 1 << (20 - PAGE_CACHE_SHIFT);
+	int nr_pages;
+	int rc;
+
+	rc = lprocfs_write_frac_helper(buffer, count, &nr_pages, pages_per_mb);
+	if (rc)
+		return rc;
+
+	/* Cap this at the current max readahead window size, the readahead
+	 * algorithm does this anyway so it's pointless to set it larger. */
+	if (nr_pages < 0 ||
+	    nr_pages > sbi->ll_ra_info.ra_max_pages_per_file) {
+		CERROR("can't set max_read_ahead_whole_mb more than "
+		       "max_read_ahead_per_file_mb: %lu\n",
+			sbi->ll_ra_info.ra_max_pages_per_file >> (20 - PAGE_CACHE_SHIFT));
+		return -ERANGE;
+	}
+
+	spin_lock(&sbi->ll_lock);
+	sbi->ll_ra_info.ra_max_read_ahead_whole_pages = nr_pages;
+	spin_unlock(&sbi->ll_lock);
+
+	return count;
+}
+LPROC_SEQ_FOPS(ll_max_read_ahead_whole_mb);
+
+/*
+ * Show handler for max_cached_mb: prints the user count, the LRU limit,
+ * the used and unused amounts (all in MB) and the shrinker count of the
+ * client cache embedded in the superblock info.
+ */
+static int ll_max_cached_mb_seq_show(struct seq_file *m, void *v)
+{
+	struct super_block *sb = m->private;
+	struct cl_client_cache *cache = &ll_s2sbi(sb)->ll_cache;
+	const int mb_shift = 20 - PAGE_CACHE_SHIFT;
+	int limit_mb = cache->ccc_lru_max >> mb_shift;
+	int free_mb = atomic_read(&cache->ccc_lru_left) >> mb_shift;
+	int busy_mb = limit_mb - free_mb;
+
+	return seq_printf(m,
+			  "users: %d\n"
+			  "max_cached_mb: %d\n"
+			  "used_mb: %d\n"
+			  "unused_mb: %d\n"
+			  "reclaim_count: %u\n",
+			  atomic_read(&cache->ccc_users),
+			  limit_mb,
+			  busy_mb,
+			  free_mb,
+			  cache->ccc_lru_shrinkers);
+}
+
+/*
+ * Write handler for max_cached_mb: change the LRU page budget of the
+ * client cache.
+ *
+ * The new limit is parsed into pages_number.  If the limit grows, the
+ * difference is simply added to ccc_lru_left.  If it shrinks, the
+ * difference is first taken from the free slots in ccc_lru_left and, when
+ * that is not enough, the data export is asked (KEY_CACHE_LRU_SHRINK) to
+ * release LRU slots; the loop repeats until the whole difference has been
+ * reclaimed or a request fails.
+ *
+ * Returns the (possibly adjusted, see below) count on success, a negative
+ * errno on failure: the parse helper's result, -ERANGE for a value outside
+ * [0, num_physpages], -ENODEV when ll_dt_exp is not set, or the error from
+ * obd_set_info_async().
+ */
+static ssize_t ll_max_cached_mb_seq_write(struct file *file, const char *buffer,
+				      size_t count, loff_t *off)
+{
+	struct super_block *sb = ((struct seq_file *)file->private_data)->private;
+	struct ll_sb_info *sbi = ll_s2sbi(sb);
+	struct cl_client_cache *cache = &sbi->ll_cache;
+	int mult, rc, pages_number;
+	int diff = 0;
+	int nrpages = 0;
+	ENTRY;
+
+	mult = 1 << (20 - PAGE_CACHE_SHIFT);
+	/* NOTE(review): count is passed by address, so the helper may shrink
+	 * it to the length of the value it finds; that adjusted count is what
+	 * this function returns on success - confirm this is intended. */
+	buffer = lprocfs_find_named_value(buffer, "max_cached_mb:", &count);
+	rc = lprocfs_write_frac_helper(buffer, count, &pages_number, mult);
+	if (rc)
+		RETURN(rc);
+
+	if (pages_number < 0 || pages_number > num_physpages) {
+		CERROR("%s: can't set max cache more than %lu MB\n",
+		       ll_get_fsname(sb, NULL, 0),
+		       num_physpages >> (20 - PAGE_CACHE_SHIFT));
+		RETURN(-ERANGE);
+	}
+
+	if (sbi->ll_dt_exp == NULL)
+		RETURN(-ENODEV);
+
+	/* diff > 0: budget grows; diff < 0: budget shrinks by -diff pages */
+	spin_lock(&sbi->ll_lock);
+	diff = pages_number - cache->ccc_lru_max;
+	spin_unlock(&sbi->ll_lock);
+
+	/* easy - add more LRU slots. */
+	if (diff >= 0) {
+		atomic_add(diff, &cache->ccc_lru_left);
+		GOTO(out, rc = 0);
+	}
+
+	/* from here on diff is the number of pages still to be reclaimed */
+	diff = -diff;
+	while (diff > 0) {
+		int tmp;
+
+		/* reduce LRU budget from free slots. */
+		do {
+			int ov, nv;
+
+			ov = atomic_read(&cache->ccc_lru_left);
+			if (ov == 0)
+				break;
+
+			/* take as much of diff as the free slots allow */
+			nv = ov > diff ? ov - diff : 0;
+			rc = cfs_atomic_cmpxchg(&cache->ccc_lru_left, ov, nv);
+			/* the swap took effect only if the old value was
+			 * still ov; otherwise re-read and retry */
+			if (likely(ov == rc)) {
+				diff -= ov - nv;
+				/* remember what was taken so it can be given
+				 * back on the error path */
+				nrpages += ov - nv;
+				break;
+			}
+		} while (1);
+
+		if (diff <= 0)
+			break;
+
+		/* difficult - have to ask OSCs to drop LRU slots. */
+		/* the request is for twice the remaining shortfall */
+		tmp = diff << 1;
+		rc = obd_set_info_async(NULL, sbi->ll_dt_exp,
+				sizeof(KEY_CACHE_LRU_SHRINK),
+				KEY_CACHE_LRU_SHRINK,
+				sizeof(tmp), &tmp, NULL);
+		if (rc < 0)
+			break;
+	}
+
+out:
+	if (rc >= 0) {
+		/* commit the new limit */
+		spin_lock(&sbi->ll_lock);
+		cache->ccc_lru_max = pages_number;
+		spin_unlock(&sbi->ll_lock);
+		rc = count;
+	} else {
+		/* failed: return the slots already removed from the budget */
+		atomic_add(nrpages, &cache->ccc_lru_left);
+	}
+	return rc;
+}
+LPROC_SEQ_FOPS(ll_max_cached_mb);
+
+/* Show handler for checksum_pages: prints 1 if LL_SBI_CHECKSUM is set in
+ * ll_flags, 0 otherwise. */
+static int ll_checksum_seq_show(struct seq_file *m, void *v)
+{
+	struct super_block *sb = m->private;
+	unsigned int enabled;
+
+	enabled = (ll_s2sbi(sb)->ll_flags & LL_SBI_CHECKSUM) ? 1 : 0;
+	return seq_printf(m, "%u\n", enabled);
+}
+
+/*
+ * Write handler for checksum_pages.
+ *
+ * A non-zero value sets LL_SBI_CHECKSUM in ll_flags, zero clears it; the
+ * value is then forwarded to the data export with KEY_CHECKSUM.  A failure
+ * of that forwarding is only logged.  Returns count on success, -EAGAIN
+ * when the data export is not set up yet, or the parse helper's result.
+ */
+static ssize_t ll_checksum_seq_write(struct file *file, const char *buffer,
+				 size_t count, loff_t *off)
+{
+	struct seq_file *seq = file->private_data;
+	struct super_block *sb = seq->private;
+	struct ll_sb_info *sbi = ll_s2sbi(sb);
+	int enable;
+	int rc;
+
+	/* Not set up yet */
+	if (sbi->ll_dt_exp == NULL)
+		return -EAGAIN;
+
+	rc = lprocfs_write_helper(buffer, count, &enable);
+	if (rc != 0)
+		return rc;
+
+	if (enable != 0)
+		sbi->ll_flags |= LL_SBI_CHECKSUM;
+	else
+		sbi->ll_flags &= ~LL_SBI_CHECKSUM;
+
+	rc = obd_set_info_async(NULL, sbi->ll_dt_exp, sizeof(KEY_CHECKSUM),
+				KEY_CHECKSUM, sizeof(enable), &enable, NULL);
+	if (rc != 0)
+		CWARN("Failed to set OSC checksum flags: %d\n", rc);
+
+	return count;
+}
+LPROC_SEQ_FOPS(ll_checksum);
+
+/* Show handler for max_rw_chunk: prints ll_max_rw_chunk. */
+static int ll_max_rw_chunk_seq_show(struct seq_file *m, void *v)
+{
+	struct super_block *sb = m->private;
+	struct ll_sb_info *sbi = ll_s2sbi(sb);
+
+	return seq_printf(m, "%lu\n", sbi->ll_max_rw_chunk);
+}
+
+/* Write handler for max_rw_chunk: stores the parsed integer in
+ * ll_max_rw_chunk.  Returns count, or the parse helper's result on error. */
+static ssize_t ll_max_rw_chunk_seq_write(struct file *file, const char *buffer,
+				     size_t count, loff_t *off)
+{
+	struct seq_file *seq = file->private_data;
+	struct super_block *sb = seq->private;
+	int chunk;
+	int rc = lprocfs_write_helper(buffer, count, &chunk);
+
+	if (rc != 0)
+		return rc;
+
+	ll_s2sbi(sb)->ll_max_rw_chunk = chunk;
+	return count;
+}
+LPROC_SEQ_FOPS(ll_max_rw_chunk);
+
+/*
+ * Common show helper for the stats_track_* files.  Prints the tracked id
+ * when the current tracking type equals @type, "0 (all)" when everything
+ * is tracked, and "untracked" otherwise.
+ */
+static int ll_rd_track_id(struct seq_file *m, enum stats_track_type type)
+{
+	struct super_block *sb = m->private;
+
+	if (ll_s2sbi(sb)->ll_stats_track_type == type)
+		return seq_printf(m, "%d\n", ll_s2sbi(sb)->ll_stats_track_id);
+
+	if (ll_s2sbi(sb)->ll_stats_track_type == STATS_TRACK_ALL)
+		return seq_printf(m, "0 (all)\n");
+
+	return seq_printf(m, "untracked\n");
+}
+
+/*
+ * Common write helper for the stats_track_* files.  Stores the parsed id,
+ * selects STATS_TRACK_ALL for id 0 or @type otherwise, and clears the
+ * accumulated stats.  Returns count, or the parse helper's result on error.
+ */
+static int ll_wr_track_id(const char *buffer, unsigned long count, void *data,
+			  enum stats_track_type type)
+{
+	struct super_block *sb = data;
+	int id;
+	int rc = lprocfs_write_helper(buffer, count, &id);
+
+	if (rc != 0)
+		return rc;
+
+	ll_s2sbi(sb)->ll_stats_track_id = id;
+	ll_s2sbi(sb)->ll_stats_track_type = (id == 0) ? STATS_TRACK_ALL : type;
+	lprocfs_clear_stats(ll_s2sbi(sb)->ll_stats);
+
+	return count;
+}
+
+/* Show handler for stats_track_pid; delegates to ll_rd_track_id(). */
+static int ll_track_pid_seq_show(struct seq_file *m, void *v)
+{
+	const enum stats_track_type kind = STATS_TRACK_PID;
+
+	return ll_rd_track_id(m, kind);
+}
+
+/* Write handler for stats_track_pid; delegates to ll_wr_track_id() with
+ * the seq_file's private pointer as data. */
+static ssize_t ll_track_pid_seq_write(struct file *file, const char *buffer,
+				  size_t count, loff_t *off)
+{
+	void *data = ((struct seq_file *)file->private_data)->private;
+
+	return ll_wr_track_id(buffer, count, data, STATS_TRACK_PID);
+}
+LPROC_SEQ_FOPS(ll_track_pid);
+
+/* Show handler for stats_track_ppid; delegates to ll_rd_track_id(). */
+static int ll_track_ppid_seq_show(struct seq_file *m, void *v)
+{
+	const enum stats_track_type kind = STATS_TRACK_PPID;
+
+	return ll_rd_track_id(m, kind);
+}
+
+/* Write handler for stats_track_ppid; delegates to ll_wr_track_id() with
+ * the seq_file's private pointer as data. */
+static ssize_t ll_track_ppid_seq_write(struct file *file, const char *buffer,
+				   size_t count, loff_t *off)
+{
+	void *data = ((struct seq_file *)file->private_data)->private;
+
+	return ll_wr_track_id(buffer, count, data, STATS_TRACK_PPID);
+}
+LPROC_SEQ_FOPS(ll_track_ppid);
+
+/* Show handler for stats_track_gid; delegates to ll_rd_track_id(). */
+static int ll_track_gid_seq_show(struct seq_file *m, void *v)
+{
+	const enum stats_track_type kind = STATS_TRACK_GID;
+
+	return ll_rd_track_id(m, kind);
+}
+
+/* Write handler for stats_track_gid; delegates to ll_wr_track_id() with
+ * the seq_file's private pointer as data. */
+static ssize_t ll_track_gid_seq_write(struct file *file, const char *buffer,
+				  size_t count, loff_t *off)
+{
+	void *data = ((struct seq_file *)file->private_data)->private;
+
+	return ll_wr_track_id(buffer, count, data, STATS_TRACK_GID);
+}
+LPROC_SEQ_FOPS(ll_track_gid);
+
+/* Show handler for statahead_max: prints ll_sa_max. */
+static int ll_statahead_max_seq_show(struct seq_file *m, void *v)
+{
+	struct super_block *sb = m->private;
+
+	return seq_printf(m, "%u\n", ll_s2sbi(sb)->ll_sa_max);
+}
+
+/*
+ * Write handler for statahead_max.
+ *
+ * Values in [0, LL_SA_RPC_MAX] are stored in ll_sa_max.  An out-of-range
+ * value is logged and ignored; the write still reports count.  Only a parse
+ * failure is returned as an error.
+ */
+static ssize_t ll_statahead_max_seq_write(struct file *file, const char *buffer,
+				      size_t count, loff_t *off)
+{
+	struct seq_file *seq = file->private_data;
+	struct super_block *sb = seq->private;
+	struct ll_sb_info *sbi = ll_s2sbi(sb);
+	int sa_max;
+	int rc;
+
+	rc = lprocfs_write_helper(buffer, count, &sa_max);
+	if (rc != 0)
+		return rc;
+
+	if (sa_max < 0 || sa_max > LL_SA_RPC_MAX) {
+		CERROR("Bad statahead_max value %d. Valid values are in the "
+		       "range [0, %d]\n", sa_max, LL_SA_RPC_MAX);
+		return count;
+	}
+
+	sbi->ll_sa_max = sa_max;
+	return count;
+}
+LPROC_SEQ_FOPS(ll_statahead_max);
+
+/* Show handler for statahead_agl: prints 1 if LL_SBI_AGL_ENABLED is set in
+ * ll_flags, 0 otherwise. */
+static int ll_statahead_agl_seq_show(struct seq_file *m, void *v)
+{
+	struct super_block *sb = m->private;
+	unsigned int enabled;
+
+	enabled = ll_s2sbi(sb)->ll_flags & LL_SBI_AGL_ENABLED ? 1 : 0;
+	return seq_printf(m, "%u\n", enabled);
+}
+
+/* Write handler for statahead_agl: a non-zero value sets LL_SBI_AGL_ENABLED
+ * in ll_flags, zero clears it.  Returns count, or the parse helper's result
+ * on error. */
+static ssize_t ll_statahead_agl_seq_write(struct file *file, const char *buffer,
+				      size_t count, loff_t *off)
+{
+	struct seq_file *seq = file->private_data;
+	struct super_block *sb = seq->private;
+	struct ll_sb_info *sbi = ll_s2sbi(sb);
+	int enable;
+	int rc = lprocfs_write_helper(buffer, count, &enable);
+
+	if (rc != 0)
+		return rc;
+
+	if (enable != 0)
+		sbi->ll_flags |= LL_SBI_AGL_ENABLED;
+	else
+		sbi->ll_flags &= ~LL_SBI_AGL_ENABLED;
+
+	return count;
+}
+LPROC_SEQ_FOPS(ll_statahead_agl);
+
+/* Show handler for statahead_stats: prints the ll_sa_total, ll_sa_wrong
+ * and ll_agl_total counters. */
+static int ll_statahead_stats_seq_show(struct seq_file *m, void *v)
+{
+	struct super_block *sb = m->private;
+	struct ll_sb_info *sbi = ll_s2sbi(sb);
+	unsigned int sa_total = atomic_read(&sbi->ll_sa_total);
+	unsigned int sa_wrong = atomic_read(&sbi->ll_sa_wrong);
+	unsigned int agl_total = atomic_read(&sbi->ll_agl_total);
+
+	return seq_printf(m,
+			  "statahead total: %u\n"
+			  "statahead wrong: %u\n"
+			  "agl total: %u\n",
+			  sa_total, sa_wrong, agl_total);
+}
+LPROC_SEQ_FOPS_RO(ll_statahead_stats);
+
+/* Show handler for lazystatfs: prints 1 if LL_SBI_LAZYSTATFS is set in
+ * ll_flags, 0 otherwise. */
+static int ll_lazystatfs_seq_show(struct seq_file *m, void *v)
+{
+	struct super_block *sb = m->private;
+	unsigned int enabled;
+
+	enabled = (ll_s2sbi(sb)->ll_flags & LL_SBI_LAZYSTATFS) ? 1 : 0;
+	return seq_printf(m, "%u\n", enabled);
+}
+
+/* Write handler for lazystatfs: a non-zero value sets LL_SBI_LAZYSTATFS in
+ * ll_flags, zero clears it.  Returns count, or the parse helper's result on
+ * error. */
+static ssize_t ll_lazystatfs_seq_write(struct file *file, const char *buffer,
+				   size_t count, loff_t *off)
+{
+	struct seq_file *seq = file->private_data;
+	struct super_block *sb = seq->private;
+	struct ll_sb_info *sbi = ll_s2sbi(sb);
+	int enable;
+	int rc = lprocfs_write_helper(buffer, count, &enable);
+
+	if (rc != 0)
+		return rc;
+
+	if (enable != 0)
+		sbi->ll_flags |= LL_SBI_LAZYSTATFS;
+	else
+		sbi->ll_flags &= ~LL_SBI_LAZYSTATFS;
+
+	return count;
+}
+LPROC_SEQ_FOPS(ll_lazystatfs);
+
+/* Show handler for max_easize: prints the size reported by
+ * ll_get_max_mdsize(), or returns that call's error. */
+static int ll_maxea_size_seq_show(struct seq_file *m, void *v)
+{
+	struct super_block *sb = m->private;
+	unsigned int max_ea;
+	int rc = ll_get_max_mdsize(ll_s2sbi(sb), &max_ea);
+
+	if (rc != 0)
+		return rc;
+
+	return seq_printf(m, "%u\n", max_ea);
+}
+LPROC_SEQ_FOPS_RO(ll_maxea_size);
+
+/*
+ * Show handler for sbi_flags: walks ll_flags from the lowest bit upwards
+ * and prints the LL_SBI_FLAGS name of every set bit, separated by spaces.
+ * Returns -EINVAL if a bit position beyond the name table is reached while
+ * flag bits remain.
+ */
+static int ll_sbi_flags_seq_show(struct seq_file *m, void *v)
+{
+	const char *str[] = LL_SBI_FLAGS;
+	struct super_block *sb = m->private;
+	int flags = ll_s2sbi(sb)->ll_flags;
+	int bit;
+
+	for (bit = 0; flags != 0; bit++, flags >>= 1) {
+		if (ARRAY_SIZE(str) <= bit) {
+			CERROR("%s: Revise array LL_SBI_FLAGS to match sbi "
+				"flags please.\n", ll_get_fsname(sb, NULL, 0));
+			return -EINVAL;
+		}
+
+		if ((flags & 0x1) != 0)
+			seq_printf(m, "%s ", str[bit]);
+	}
+
+	/* backspace over the trailing separator, then end the line */
+	seq_printf(m, "\b\n");
+	return 0;
+}
+LPROC_SEQ_FOPS_RO(ll_sbi_flags);
+
+/*
+ * Per-mountpoint proc entries: each row pairs a file name with the file
+ * operations generated for it by the LPROC_SEQ_FOPS* macros in this file.
+ * The table is handed to lprocfs_add_vars() and exported through
+ * lprocfs_llite_init_vars().  The final { 0 } row is presumably the
+ * end-of-table sentinel - confirm against lprocfs_add_vars().
+ */
+static struct lprocfs_vars lprocfs_llite_obd_vars[] = {
+	{ "uuid",	  &ll_sb_uuid_fops,	  0, 0 },
+	//{ "mntpt_path",   ll_rd_path,	     0, 0 },
+	{ "fstype",       &ll_fstype_fops,	  0, 0 },
+	{ "site",	  &ll_site_stats_fops,    0, 0 },
+	{ "blocksize",    &ll_blksize_fops,	  0, 0 },
+	{ "kbytestotal",  &ll_kbytestotal_fops,   0, 0 },
+	{ "kbytesfree",   &ll_kbytesfree_fops,    0, 0 },
+	{ "kbytesavail",  &ll_kbytesavail_fops,   0, 0 },
+	{ "filestotal",   &ll_filestotal_fops,    0, 0 },
+	{ "filesfree",    &ll_filesfree_fops,	  0, 0 },
+	{ "client_type",  &ll_client_type_fops,   0, 0 },
+	//{ "filegroups",   lprocfs_rd_filegroups,  0, 0 },
+	{ "max_read_ahead_mb", &ll_max_readahead_mb_fops, 0 },
+	{ "max_read_ahead_per_file_mb", &ll_max_readahead_per_file_mb_fops, 0 },
+	{ "max_read_ahead_whole_mb", &ll_max_read_ahead_whole_mb_fops, 0 },
+	{ "max_cached_mb",    &ll_max_cached_mb_fops, 0 },
+	{ "checksum_pages",   &ll_checksum_fops, 0 },
+	{ "max_rw_chunk",     &ll_max_rw_chunk_fops, 0 },
+	{ "stats_track_pid",  &ll_track_pid_fops, 0 },
+	{ "stats_track_ppid", &ll_track_ppid_fops, 0 },
+	{ "stats_track_gid",  &ll_track_gid_fops, 0 },
+	{ "statahead_max",    &ll_statahead_max_fops, 0 },
+	{ "statahead_agl",    &ll_statahead_agl_fops, 0 },
+	{ "statahead_stats",  &ll_statahead_stats_fops, 0, 0 },
+	{ "lazystatfs",       &ll_lazystatfs_fops, 0 },
+	{ "max_easize",       &ll_maxea_size_fops, 0, 0 },
+	{ "sbi_flags",	      &ll_sbi_flags_fops, 0, 0 },
+	{ 0 }
+};
+
+#define MAX_STRING_SIZE 128
+
+/*
+ * Description of one llite stats counter: the counter id (opcode), its
+ * LPROCFS_* type/config flags, and the name it is registered under.
+ * lprocfs_register_mountpoint() walks llite_opcode_table to initialise one
+ * counter per entry; the unit string ("regs", "bytes" or "pages") is derived
+ * there from the LPROCFS_TYPE_* bit in 'type'.
+ */
+struct llite_file_opcode {
+	__u32       opcode;
+	__u32       type;
+	const char *opname;
+} llite_opcode_table[LPROC_LL_FILE_OPCODES] = {
+	/* file operation */
+	{ LPROC_LL_DIRTY_HITS,     LPROCFS_TYPE_REGS, "dirty_pages_hits" },
+	{ LPROC_LL_DIRTY_MISSES,   LPROCFS_TYPE_REGS, "dirty_pages_misses" },
+	{ LPROC_LL_READ_BYTES,     LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_BYTES,
+				   "read_bytes" },
+	{ LPROC_LL_WRITE_BYTES,    LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_BYTES,
+				   "write_bytes" },
+	{ LPROC_LL_BRW_READ,       LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_PAGES,
+				   "brw_read" },
+	{ LPROC_LL_BRW_WRITE,      LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_PAGES,
+				   "brw_write" },
+	{ LPROC_LL_OSC_READ,       LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_BYTES,
+				   "osc_read" },
+	{ LPROC_LL_OSC_WRITE,      LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_BYTES,
+				   "osc_write" },
+	{ LPROC_LL_IOCTL,	  LPROCFS_TYPE_REGS, "ioctl" },
+	{ LPROC_LL_OPEN,	   LPROCFS_TYPE_REGS, "open" },
+	{ LPROC_LL_RELEASE,	LPROCFS_TYPE_REGS, "close" },
+	{ LPROC_LL_MAP,	    LPROCFS_TYPE_REGS, "mmap" },
+	{ LPROC_LL_LLSEEK,	 LPROCFS_TYPE_REGS, "seek" },
+	{ LPROC_LL_FSYNC,	  LPROCFS_TYPE_REGS, "fsync" },
+	{ LPROC_LL_READDIR,	LPROCFS_TYPE_REGS, "readdir" },
+	/* inode operation */
+	{ LPROC_LL_SETATTR,	LPROCFS_TYPE_REGS, "setattr" },
+	{ LPROC_LL_TRUNC,	  LPROCFS_TYPE_REGS, "truncate" },
+	{ LPROC_LL_FLOCK,	  LPROCFS_TYPE_REGS, "flock" },
+	{ LPROC_LL_GETATTR,	LPROCFS_TYPE_REGS, "getattr" },
+	/* dir inode operation */
+	{ LPROC_LL_CREATE,	 LPROCFS_TYPE_REGS, "create" },
+	{ LPROC_LL_LINK,	   LPROCFS_TYPE_REGS, "link" },
+	{ LPROC_LL_UNLINK,	 LPROCFS_TYPE_REGS, "unlink" },
+	{ LPROC_LL_SYMLINK,	LPROCFS_TYPE_REGS, "symlink" },
+	{ LPROC_LL_MKDIR,	  LPROCFS_TYPE_REGS, "mkdir" },
+	{ LPROC_LL_RMDIR,	  LPROCFS_TYPE_REGS, "rmdir" },
+	{ LPROC_LL_MKNOD,	  LPROCFS_TYPE_REGS, "mknod" },
+	{ LPROC_LL_RENAME,	 LPROCFS_TYPE_REGS, "rename" },
+	/* special inode operation */
+	{ LPROC_LL_STAFS,	  LPROCFS_TYPE_REGS, "statfs" },
+	{ LPROC_LL_ALLOC_INODE,    LPROCFS_TYPE_REGS, "alloc_inode" },
+	{ LPROC_LL_SETXATTR,       LPROCFS_TYPE_REGS, "setxattr" },
+	{ LPROC_LL_GETXATTR,       LPROCFS_TYPE_REGS, "getxattr" },
+	{ LPROC_LL_LISTXATTR,      LPROCFS_TYPE_REGS, "listxattr" },
+	{ LPROC_LL_REMOVEXATTR,    LPROCFS_TYPE_REGS, "removexattr" },
+	{ LPROC_LL_INODE_PERM,     LPROCFS_TYPE_REGS, "inode_permission" },
+};
+
+/*
+ * Add @count to stats counter @op if the current task matches the
+ * configured tracking filter: everything, a specific pid, a specific
+ * parent pid, or a specific gid.  Does nothing when no stats are allocated
+ * or when the filter does not match.
+ */
+void ll_stats_ops_tally(struct ll_sb_info *sbi, int op, int count)
+{
+	int tracked;
+
+	if (sbi->ll_stats == NULL)
+		return;
+
+	switch (sbi->ll_stats_track_type) {
+	case STATS_TRACK_ALL:
+		tracked = 1;
+		break;
+	case STATS_TRACK_PID:
+		tracked = sbi->ll_stats_track_id == current->pid;
+		break;
+	case STATS_TRACK_PPID:
+		tracked = sbi->ll_stats_track_id == current->parent->pid;
+		break;
+	case STATS_TRACK_GID:
+		tracked = sbi->ll_stats_track_id == current_gid();
+		break;
+	default:
+		tracked = 0;
+		break;
+	}
+
+	if (tracked)
+		lprocfs_counter_add(sbi->ll_stats, op, count);
+}
+EXPORT_SYMBOL(ll_stats_ops_tally);
+
+/*
+ * Display names of the read-ahead statistics counters, indexed by the
+ * RA_STAT_* constants.  lprocfs_register_mountpoint() registers one "pages"
+ * counter per entry under read_ahead_stats.
+ */
+static const char *ra_stat_string[] = {
+	[RA_STAT_HIT] = "hits",
+	[RA_STAT_MISS] = "misses",
+	[RA_STAT_DISTANT_READPAGE] = "readpage not consecutive",
+	[RA_STAT_MISS_IN_WINDOW] = "miss inside window",
+	[RA_STAT_FAILED_GRAB_PAGE] = "failed grab_cache_page",
+	[RA_STAT_FAILED_MATCH] = "failed lock match",
+	[RA_STAT_DISCARDED] = "read but discarded",
+	[RA_STAT_ZERO_LEN] = "zero length file",
+	[RA_STAT_ZERO_WINDOW] = "zero size window",
+	[RA_STAT_EOF] = "read-ahead to EOF",
+	[RA_STAT_MAX_IN_FLIGHT] = "hit max r-a issue",
+	[RA_STAT_WRONG_GRAB_PAGE] = "wrong page from grab_cache_page",
+};
+
+LPROC_SEQ_FOPS_RO_TYPE(llite, name);
+LPROC_SEQ_FOPS_RO_TYPE(llite, uuid);
+
+/*
+ * Create the proc directory tree for one mounted llite filesystem.
+ *
+ * @parent: proc directory under which the mount's directory is created
+ * @sb:     superblock of the mount
+ * @osc:    name of the data obd device, looked up with class_name2obd()
+ * @mdc:    name of the metadata obd device, looked up with class_name2obd()
+ *
+ * The directory is named "<profile>-<sb pointer>", where a trailing
+ * "-client" is dropped from the mount profile.  Inside it the function
+ * creates the dump_page_cache/extents/offset stats files (failures there
+ * are only warned about), allocates and registers the "stats" and
+ * "read_ahead_stats" counters, adds the lprocfs_llite_obd_vars entries, and
+ * adds one sub-directory per obd type with "common_name" and "uuid" files.
+ *
+ * Returns 0 on success or a negative errno.  On any failure after the root
+ * directory exists, the directory and both stats blocks are released.
+ */
+int lprocfs_register_mountpoint(struct proc_dir_entry *parent,
+				struct super_block *sb, char *osc, char *mdc)
+{
+	struct lprocfs_vars lvars[2];
+	struct lustre_sb_info *lsi = s2lsi(sb);
+	struct ll_sb_info *sbi = ll_s2sbi(sb);
+	struct obd_device *obd;
+	proc_dir_entry_t *dir;
+	char name[MAX_STRING_SIZE + 1], *ptr;
+	int err, id, len, rc;
+	ENTRY;
+
+	/* lvars[1] stays zeroed; lvars[0] is reused for each single-entry
+	 * lprocfs_add_vars() call below, with 'name' rewritten in place */
+	memset(lvars, 0, sizeof(lvars));
+
+	name[MAX_STRING_SIZE] = '\0';
+	lvars[0].name = name;
+
+	LASSERT(sbi != NULL);
+	LASSERT(mdc != NULL);
+	LASSERT(osc != NULL);
+
+	/* Get fsname */
+	len = strlen(lsi->lsi_lmd->lmd_profile);
+	ptr = strrchr(lsi->lsi_lmd->lmd_profile, '-');
+	/* strlen("-client") == 7: strip that suffix from the name */
+	if (ptr && (strcmp(ptr, "-client") == 0))
+		len -= 7;
+
+	/* Mount info */
+	snprintf(name, MAX_STRING_SIZE, "%.*s-%p", len,
+		 lsi->lsi_lmd->lmd_profile, sb);
+
+	sbi->ll_proc_root = lprocfs_register(name, parent, NULL, NULL);
+	if (IS_ERR(sbi->ll_proc_root)) {
+		err = PTR_ERR(sbi->ll_proc_root);
+		sbi->ll_proc_root = NULL;
+		RETURN(err);
+	}
+
+	/* The four seq files below are best-effort: a failure is logged and
+	 * registration carries on. */
+	rc = lprocfs_seq_create(sbi->ll_proc_root, "dump_page_cache", 0444,
+				&vvp_dump_pgcache_file_ops, sbi);
+	if (rc)
+		CWARN("Error adding the dump_page_cache file\n");
+
+	rc = lprocfs_seq_create(sbi->ll_proc_root, "extents_stats", 0644,
+				&ll_rw_extents_stats_fops, sbi);
+	if (rc)
+		CWARN("Error adding the extent_stats file\n");
+
+	rc = lprocfs_seq_create(sbi->ll_proc_root, "extents_stats_per_process",
+				0644, &ll_rw_extents_stats_pp_fops, sbi);
+	if (rc)
+		CWARN("Error adding the extents_stats_per_process file\n");
+
+	rc = lprocfs_seq_create(sbi->ll_proc_root, "offset_stats", 0644,
+				&ll_rw_offset_stats_fops, sbi);
+	if (rc)
+		CWARN("Error adding the offset_stats file\n");
+
+	/* File operations stats */
+	sbi->ll_stats = lprocfs_alloc_stats(LPROC_LL_FILE_OPCODES,
+					    LPROCFS_STATS_FLAG_NONE);
+	if (sbi->ll_stats == NULL)
+		GOTO(out, err = -ENOMEM);
+	/* do counter init */
+	for (id = 0; id < LPROC_LL_FILE_OPCODES; id++) {
+		__u32 type = llite_opcode_table[id].type;
+		/* unit string for the counter; shadows the outer 'ptr' */
+		void *ptr = NULL;
+		if (type & LPROCFS_TYPE_REGS)
+			ptr = "regs";
+		else if (type & LPROCFS_TYPE_BYTES)
+			ptr = "bytes";
+		else if (type & LPROCFS_TYPE_PAGES)
+			ptr = "pages";
+		lprocfs_counter_init(sbi->ll_stats,
+				     llite_opcode_table[id].opcode,
+				     (type & LPROCFS_CNTR_AVGMINMAX),
+				     llite_opcode_table[id].opname, ptr);
+	}
+	err = lprocfs_register_stats(sbi->ll_proc_root, "stats", sbi->ll_stats);
+	if (err)
+		GOTO(out, err);
+
+	/* Read-ahead stats: one "pages" counter per ra_stat_string entry */
+	sbi->ll_ra_stats = lprocfs_alloc_stats(ARRAY_SIZE(ra_stat_string),
+					       LPROCFS_STATS_FLAG_NONE);
+	if (sbi->ll_ra_stats == NULL)
+		GOTO(out, err = -ENOMEM);
+
+	for (id = 0; id < ARRAY_SIZE(ra_stat_string); id++)
+		lprocfs_counter_init(sbi->ll_ra_stats, id, 0,
+				     ra_stat_string[id], "pages");
+	err = lprocfs_register_stats(sbi->ll_proc_root, "read_ahead_stats",
+				     sbi->ll_ra_stats);
+	if (err)
+		GOTO(out, err);
+
+
+	err = lprocfs_add_vars(sbi->ll_proc_root, lprocfs_llite_obd_vars, sb);
+	if (err)
+		GOTO(out, err);
+
+	/* MDC info */
+	obd = class_name2obd(mdc);
+
+	LASSERT(obd != NULL);
+	LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC);
+	LASSERT(obd->obd_type->typ_name != NULL);
+
+	dir = proc_mkdir(obd->obd_type->typ_name, sbi->ll_proc_root);
+	if (dir == NULL)
+		GOTO(out, err = -ENOMEM);
+
+	snprintf(name, MAX_STRING_SIZE, "common_name");
+	lvars[0].fops = &llite_name_fops;
+	err = lprocfs_add_vars(dir, lvars, obd);
+	if (err)
+		GOTO(out, err);
+
+	snprintf(name, MAX_STRING_SIZE, "uuid");
+	lvars[0].fops = &llite_uuid_fops;
+	err = lprocfs_add_vars(dir, lvars, obd);
+	if (err)
+		GOTO(out, err);
+
+	/* OSC */
+	obd = class_name2obd(osc);
+
+	LASSERT(obd != NULL);
+	LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC);
+	LASSERT(obd->obd_type->typ_name != NULL);
+
+	dir = proc_mkdir(obd->obd_type->typ_name, sbi->ll_proc_root);
+	if (dir == NULL)
+		GOTO(out, err = -ENOMEM);
+
+	snprintf(name, MAX_STRING_SIZE, "common_name");
+	lvars[0].fops = &llite_name_fops;
+	err = lprocfs_add_vars(dir, lvars, obd);
+	if (err)
+		GOTO(out, err);
+
+	snprintf(name, MAX_STRING_SIZE, "uuid");
+	lvars[0].fops = &llite_uuid_fops;
+	err = lprocfs_add_vars(dir, lvars, obd);
+out:
+	/* also reached on success, with err == 0 from the last add_vars */
+	if (err) {
+		lprocfs_remove(&sbi->ll_proc_root);
+		lprocfs_free_stats(&sbi->ll_ra_stats);
+		lprocfs_free_stats(&sbi->ll_stats);
+	}
+	RETURN(err);
+}
+
+/* Tear down what lprocfs_register_mountpoint() set up: remove the mount's
+ * proc directory and free both stats blocks.  No-op when ll_proc_root is
+ * not set. */
+void lprocfs_unregister_mountpoint(struct ll_sb_info *sbi)
+{
+	if (sbi->ll_proc_root == NULL)
+		return;
+
+	lprocfs_remove(&sbi->ll_proc_root);
+	lprocfs_free_stats(&sbi->ll_ra_stats);
+	lprocfs_free_stats(&sbi->ll_stats);
+}
+#undef MAX_STRING_SIZE
+
+/* Integer percentage of a in b; yields 0 when b is 0.  Arguments are
+ * parenthesized so that expressions such as pct(x + y, z) expand safely. */
+#define pct(a, b) ((b) ? (a) * 100 / (b) : 0)
+
+/*
+ * Print the read/write extent-size histograms of slot @which in
+ * @io_extents to @seq, one row per bucket.  Each row shows the bucket's
+ * size range with a unit letter, the call count, its percentage of all
+ * calls and the cumulative percentage, for reads and then for writes.
+ * Printing stops after the row at which both cumulative counts reach
+ * their totals.
+ */
+static void ll_display_extents_info(struct ll_rw_extents_info *io_extents,
+				   struct seq_file *seq, int which)
+{
+	struct per_process_info *pp_info = &io_extents->pp_extents[which];
+	const char *unit_names = "KMGTPEZY";
+	unsigned long rd_total = 0, wr_total = 0;
+	unsigned long rd_running = 0, wr_running = 0;
+	unsigned long lo = 0, hi;
+	unsigned long rd, wr;
+	int unit_idx = 0;
+	int unit_shift = 10;
+	int bucket;
+
+	for (bucket = 0; bucket < LL_HIST_MAX; bucket++) {
+		rd_total += pp_info->pp_r_hist.oh_buckets[bucket];
+		wr_total += pp_info->pp_w_hist.oh_buckets[bucket];
+	}
+
+	for (bucket = 0; bucket < LL_HIST_MAX; bucket++) {
+		rd = pp_info->pp_r_hist.oh_buckets[bucket];
+		wr = pp_info->pp_w_hist.oh_buckets[bucket];
+		rd_running += rd;
+		wr_running += wr;
+
+		hi = 1 << (bucket + LL_HIST_START - unit_shift);
+		seq_printf(seq, "%4lu%c - %4lu%c%c: %14lu %4lu %4lu  | "
+			   "%14lu %4lu %4lu\n",
+			   lo, unit_names[unit_idx], hi, unit_names[unit_idx],
+			   (bucket == LL_HIST_MAX - 1) ? '+' : ' ',
+			   rd, pct(rd, rd_total), pct(rd_running, rd_total),
+			   wr, pct(wr, wr_total), pct(wr_running, wr_total));
+
+		/* once the upper bound reaches 1024, restart at 1 of the
+		 * next larger unit */
+		lo = hi;
+		if (lo == 1<<10) {
+			lo = 1;
+			unit_shift += 10;
+			unit_idx++;
+		}
+
+		if (rd_running == rd_total && wr_running == wr_total)
+			break;
+	}
+}
+
+/*
+ * Show handler for extents_stats_per_process.  Prints a usage hint when
+ * the rw stats are off; otherwise prints a timestamp, the column headers
+ * and, under ll_pp_extent_lock, the extent histogram of every slot with a
+ * non-zero pid.
+ */
+static int ll_rw_extents_stats_pp_seq_show(struct seq_file *seq, void *v)
+{
+	struct ll_sb_info *sbi = seq->private;
+	struct ll_rw_extents_info *io_extents = &sbi->ll_rw_extents_info;
+	struct timeval now;
+	int slot;
+
+	do_gettimeofday(&now);
+
+	if (!sbi->ll_rw_stats_on) {
+		seq_printf(seq, "disabled\n"
+				"write anything in this file to activate, "
+				"then 0 or \"[D/d]isabled\" to deactivate\n");
+		return 0;
+	}
+
+	seq_printf(seq, "snapshot_time:	 %lu.%lu (secs.usecs)\n",
+		   now.tv_sec, now.tv_usec);
+	seq_printf(seq, "%15s %19s       | %20s\n", " ", "read", "write");
+	seq_printf(seq, "%13s   %14s %4s %4s  | %14s %4s %4s\n",
+		   "extents", "calls", "%", "cum%",
+		   "calls", "%", "cum%");
+
+	spin_lock(&sbi->ll_pp_extent_lock);
+	for (slot = 0; slot < LL_PROCESS_HIST_MAX; slot++) {
+		if (io_extents->pp_extents[slot].pid == 0)
+			continue;
+
+		seq_printf(seq, "\nPID: %d\n",
+			   io_extents->pp_extents[slot].pid);
+		ll_display_extents_info(io_extents, seq, slot);
+	}
+	spin_unlock(&sbi->ll_pp_extent_lock);
+
+	return 0;
+}
+
+/*
+ * Write handler for extents_stats_per_process.  Writing 0, "disabled" or
+ * "Disabled" turns the rw stats off; anything else turns them on.  In both
+ * cases the per-process slots are reset.  Always returns len.
+ *
+ * NOTE(review): buf is compared with strcmp() directly; verify that it is
+ * a NUL-terminated kernel buffer at this point.
+ */
+static ssize_t ll_rw_extents_stats_pp_seq_write(struct file *file,
+						const char *buf, size_t len,
+						loff_t *off)
+{
+	struct seq_file *seq = file->private_data;
+	struct ll_sb_info *sbi = seq->private;
+	struct ll_rw_extents_info *io_extents = &sbi->ll_rw_extents_info;
+	int enable = 1;
+	int slot;
+	int rc;
+
+	rc = lprocfs_write_helper(buf, len, &enable);
+	if (rc < 0) {
+		if (strcmp(buf, "disabled") == 0 ||
+		    strcmp(buf, "Disabled") == 0)
+			enable = 0;
+	}
+
+	sbi->ll_rw_stats_on = (enable == 0) ? 0 : 1;
+
+	spin_lock(&sbi->ll_pp_extent_lock);
+	for (slot = 0; slot < LL_PROCESS_HIST_MAX; slot++) {
+		struct per_process_info *pp = &io_extents->pp_extents[slot];
+
+		pp->pid = 0;
+		lprocfs_oh_clear(&pp->pp_r_hist);
+		lprocfs_oh_clear(&pp->pp_w_hist);
+	}
+	spin_unlock(&sbi->ll_pp_extent_lock);
+
+	return len;
+}
+
+LPROC_SEQ_FOPS(ll_rw_extents_stats_pp);
+
+/*
+ * Show handler for extents_stats.  Prints a usage hint when the rw stats
+ * are off; otherwise prints a timestamp, the column headers and the
+ * aggregate extent histogram kept in slot LL_PROCESS_HIST_MAX.
+ *
+ * The histogram is read under ll_pp_extent_lock, the lock under which
+ * ll_rw_stats_tally() updates it and ll_rw_extents_stats_seq_write()
+ * clears it.
+ */
+static int ll_rw_extents_stats_seq_show(struct seq_file *seq, void *v)
+{
+	struct timeval now;
+	struct ll_sb_info *sbi = seq->private;
+	struct ll_rw_extents_info *io_extents = &sbi->ll_rw_extents_info;
+
+	do_gettimeofday(&now);
+
+	if (!sbi->ll_rw_stats_on) {
+		seq_printf(seq, "disabled\n"
+				"write anything in this file to activate, "
+				"then 0 or \"[D/d]isabled\" to deactivate\n");
+		return 0;
+	}
+	seq_printf(seq, "snapshot_time:	 %lu.%lu (secs.usecs)\n",
+		   now.tv_sec, now.tv_usec);
+
+	seq_printf(seq, "%15s %19s       | %20s\n", " ", "read", "write");
+	seq_printf(seq, "%13s   %14s %4s %4s  | %14s %4s %4s\n",
+		   "extents", "calls", "%", "cum%",
+		   "calls", "%", "cum%");
+	/* same lock as the writers of pp_extents[] */
+	spin_lock(&sbi->ll_pp_extent_lock);
+	ll_display_extents_info(io_extents, seq, LL_PROCESS_HIST_MAX);
+	spin_unlock(&sbi->ll_pp_extent_lock);
+
+	return 0;
+}
+
+/*
+ * Write handler for extents_stats.  Writing 0, "disabled" or "Disabled"
+ * turns the rw stats off; anything else turns them on.  In both cases
+ * every slot, including the aggregate one at index LL_PROCESS_HIST_MAX,
+ * is reset.  Always returns len.
+ *
+ * NOTE(review): buf is compared with strcmp() directly; verify that it is
+ * a NUL-terminated kernel buffer at this point.
+ */
+static ssize_t ll_rw_extents_stats_seq_write(struct file *file, const char *buf,
+					size_t len, loff_t *off)
+{
+	struct seq_file *seq = file->private_data;
+	struct ll_sb_info *sbi = seq->private;
+	struct ll_rw_extents_info *io_extents = &sbi->ll_rw_extents_info;
+	int enable = 1;
+	int slot;
+	int rc;
+
+	rc = lprocfs_write_helper(buf, len, &enable);
+	if (rc < 0) {
+		if (strcmp(buf, "disabled") == 0 ||
+		    strcmp(buf, "Disabled") == 0)
+			enable = 0;
+	}
+
+	sbi->ll_rw_stats_on = (enable == 0) ? 0 : 1;
+
+	spin_lock(&sbi->ll_pp_extent_lock);
+	for (slot = 0; slot <= LL_PROCESS_HIST_MAX; slot++) {
+		struct per_process_info *pp = &io_extents->pp_extents[slot];
+
+		pp->pid = 0;
+		lprocfs_oh_clear(&pp->pp_r_hist);
+		lprocfs_oh_clear(&pp->pp_w_hist);
+	}
+	spin_unlock(&sbi->ll_pp_extent_lock);
+
+	return len;
+}
+
+LPROC_SEQ_FOPS(ll_rw_extents_stats);
+
+/*
+ * Record one read or write in the extent and offset statistics.
+ *
+ * @sbi:   per-mount info holding the statistics
+ * @pid:   process the I/O is accounted to
+ * @file:  file data of the I/O, used to detect a switch to another file
+ * @pos:   starting offset of the I/O
+ * @count: size of the I/O in bytes
+ * @rw:    0 is counted as a read, anything else as a write
+ *
+ * Does nothing when ll_rw_stats_on is clear.  The extent histograms are
+ * updated under ll_pp_extent_lock, the offset tracking under
+ * ll_process_lock.
+ */
+void ll_rw_stats_tally(struct ll_sb_info *sbi, pid_t pid,
+		       struct ll_file_data *file, loff_t pos,
+		       size_t count, int rw)
+{
+	int i, cur = -1;
+	struct ll_rw_process_info *process;
+	struct ll_rw_process_info *offset;
+	int *off_count = &sbi->ll_rw_offset_entry_count;
+	int *process_count = &sbi->ll_offset_process_count;
+	struct ll_rw_extents_info *io_extents = &sbi->ll_rw_extents_info;
+
+	if(!sbi->ll_rw_stats_on)
+		return;
+	process = sbi->ll_rw_process_info;
+	offset = sbi->ll_rw_offset_info;
+
+	spin_lock(&sbi->ll_pp_extent_lock);
+	/* Extent statistics */
+	/* look for a slot already owned by this pid */
+	for(i = 0; i < LL_PROCESS_HIST_MAX; i++) {
+		if(io_extents->pp_extents[i].pid == pid) {
+			cur = i;
+			break;
+		}
+	}
+
+	if (cur == -1) {
+		/* new process */
+		/* take the next slot round-robin, dropping whatever it held */
+		sbi->ll_extent_process_count =
+			(sbi->ll_extent_process_count + 1) % LL_PROCESS_HIST_MAX;
+		cur = sbi->ll_extent_process_count;
+		io_extents->pp_extents[cur].pid = pid;
+		lprocfs_oh_clear(&io_extents->pp_extents[cur].pp_r_hist);
+		lprocfs_oh_clear(&io_extents->pp_extents[cur].pp_w_hist);
+	}
+
+	/* empty-bodied loop: find the power-of-two bucket i for count,
+	 * capped at the last bucket */
+	for(i = 0; (count >= (1 << LL_HIST_START << i)) &&
+	     (i < (LL_HIST_MAX - 1)); i++);
+	/* bump both the per-process slot and the aggregate slot at index
+	 * LL_PROCESS_HIST_MAX */
+	if (rw == 0) {
+		io_extents->pp_extents[cur].pp_r_hist.oh_buckets[i]++;
+		io_extents->pp_extents[LL_PROCESS_HIST_MAX].pp_r_hist.oh_buckets[i]++;
+	} else {
+		io_extents->pp_extents[cur].pp_w_hist.oh_buckets[i]++;
+		io_extents->pp_extents[LL_PROCESS_HIST_MAX].pp_w_hist.oh_buckets[i]++;
+	}
+	spin_unlock(&sbi->ll_pp_extent_lock);
+
+	spin_lock(&sbi->ll_process_lock);
+	/* Offset statistics */
+	for (i = 0; i < LL_PROCESS_HIST_MAX; i++) {
+		if (process[i].rw_pid == pid) {
+			/* the process moved to another file: restart its
+			 * tracking from this I/O */
+			if (process[i].rw_last_file != file) {
+				process[i].rw_range_start = pos;
+				process[i].rw_last_file_pos = pos + count;
+				process[i].rw_smallest_extent = count;
+				process[i].rw_largest_extent = count;
+				process[i].rw_offset = 0;
+				process[i].rw_last_file = file;
+				spin_unlock(&sbi->ll_process_lock);
+				return;
+			}
+			/* same file but not contiguous with the previous
+			 * I/O: archive the finished range in the offset ring
+			 * and start a new range here */
+			if (process[i].rw_last_file_pos != pos) {
+				*off_count =
+				    (*off_count + 1) % LL_OFFSET_HIST_MAX;
+				offset[*off_count].rw_op = process[i].rw_op;
+				offset[*off_count].rw_pid = pid;
+				offset[*off_count].rw_range_start =
+					process[i].rw_range_start;
+				offset[*off_count].rw_range_end =
+					process[i].rw_last_file_pos;
+				offset[*off_count].rw_smallest_extent =
+					process[i].rw_smallest_extent;
+				offset[*off_count].rw_largest_extent =
+					process[i].rw_largest_extent;
+				offset[*off_count].rw_offset =
+					process[i].rw_offset;
+				process[i].rw_op = rw;
+				process[i].rw_range_start = pos;
+				process[i].rw_smallest_extent = count;
+				process[i].rw_largest_extent = count;
+				/* distance jumped from the end of the
+				 * previous I/O */
+				process[i].rw_offset = pos -
+					process[i].rw_last_file_pos;
+			}
+			/* extend the current range with this I/O */
+			if(process[i].rw_smallest_extent > count)
+				process[i].rw_smallest_extent = count;
+			if(process[i].rw_largest_extent < count)
+				process[i].rw_largest_extent = count;
+			process[i].rw_last_file_pos = pos + count;
+			spin_unlock(&sbi->ll_process_lock);
+			return;
+		}
+	}
+	/* pid not tracked yet: take the next process slot round-robin */
+	*process_count = (*process_count + 1) % LL_PROCESS_HIST_MAX;
+	process[*process_count].rw_pid = pid;
+	process[*process_count].rw_op = rw;
+	process[*process_count].rw_range_start = pos;
+	process[*process_count].rw_last_file_pos = pos + count;
+	process[*process_count].rw_smallest_extent = count;
+	process[*process_count].rw_largest_extent = count;
+	process[*process_count].rw_offset = 0;
+	process[*process_count].rw_last_file = file;
+	spin_unlock(&sbi->ll_process_lock);
+}
+
+/*
+ * Show handler for offset_stats.  Prints a usage hint when the rw stats
+ * are off; otherwise, under ll_process_lock, prints a timestamp, a header
+ * line, every archived discontiguous range with a non-zero pid, and then
+ * the current range of every tracked process.
+ *
+ * Each record is terminated with a newline so that it appears on its own
+ * line under the header.
+ */
+static int ll_rw_offset_stats_seq_show(struct seq_file *seq, void *v)
+{
+	struct timeval now;
+	struct ll_sb_info *sbi = seq->private;
+	struct ll_rw_process_info *offset = sbi->ll_rw_offset_info;
+	struct ll_rw_process_info *process = sbi->ll_rw_process_info;
+	int i;
+
+	do_gettimeofday(&now);
+
+	if (!sbi->ll_rw_stats_on) {
+		seq_printf(seq, "disabled\n"
+				"write anything in this file to activate, "
+				"then 0 or \"[D/d]isabled\" to deactivate\n");
+		return 0;
+	}
+	spin_lock(&sbi->ll_process_lock);
+
+	seq_printf(seq, "snapshot_time:	 %lu.%lu (secs.usecs)\n",
+		   now.tv_sec, now.tv_usec);
+	seq_printf(seq, "%3s %10s %14s %14s %17s %17s %14s\n",
+		   "R/W", "PID", "RANGE START", "RANGE END",
+		   "SMALLEST EXTENT", "LARGEST EXTENT", "OFFSET");
+	/* We stored the discontiguous offsets here; print them first */
+	for (i = 0; i < LL_OFFSET_HIST_MAX; i++) {
+		if (offset[i].rw_pid != 0)
+			seq_printf(seq,
+				   "%3c %10d %14Lu %14Lu %17lu %17lu %14Lu\n",
+				   offset[i].rw_op ? 'W' : 'R',
+				   offset[i].rw_pid,
+				   offset[i].rw_range_start,
+				   offset[i].rw_range_end,
+				   (unsigned long)offset[i].rw_smallest_extent,
+				   (unsigned long)offset[i].rw_largest_extent,
+				   offset[i].rw_offset);
+	}
+	/* Then print the current offsets for each process */
+	for (i = 0; i < LL_PROCESS_HIST_MAX; i++) {
+		if (process[i].rw_pid != 0)
+			seq_printf(seq,
+				   "%3c %10d %14Lu %14Lu %17lu %17lu %14Lu\n",
+				   process[i].rw_op ? 'W' : 'R',
+				   process[i].rw_pid,
+				   process[i].rw_range_start,
+				   process[i].rw_last_file_pos,
+				   (unsigned long)process[i].rw_smallest_extent,
+				   (unsigned long)process[i].rw_largest_extent,
+				   process[i].rw_offset);
+	}
+	spin_unlock(&sbi->ll_process_lock);
+
+	return 0;
+}
+
+/*
+ * Write handler for offset_stats.  Writing 0, "disabled" or "Disabled"
+ * turns the rw stats off; anything else turns them on.  In both cases the
+ * process and offset tables and their ring indices are zeroed under
+ * ll_process_lock.  Always returns len.
+ *
+ * NOTE(review): buf is compared with strcmp() directly; verify that it is
+ * a NUL-terminated kernel buffer at this point.
+ */
+static ssize_t ll_rw_offset_stats_seq_write(struct file *file, const char *buf,
+				       size_t len, loff_t *off)
+{
+	struct seq_file *seq = file->private_data;
+	struct ll_sb_info *sbi = seq->private;
+	struct ll_rw_process_info *procs = sbi->ll_rw_process_info;
+	struct ll_rw_process_info *ranges = sbi->ll_rw_offset_info;
+	int enable = 1;
+	int rc;
+
+	rc = lprocfs_write_helper(buf, len, &enable);
+	if (rc < 0) {
+		if (strcmp(buf, "disabled") == 0 ||
+		    strcmp(buf, "Disabled") == 0)
+			enable = 0;
+	}
+
+	sbi->ll_rw_stats_on = (enable == 0) ? 0 : 1;
+
+	spin_lock(&sbi->ll_process_lock);
+	sbi->ll_offset_process_count = 0;
+	sbi->ll_rw_offset_entry_count = 0;
+	memset(procs, 0,
+	       sizeof(struct ll_rw_process_info) * LL_PROCESS_HIST_MAX);
+	memset(ranges, 0,
+	       sizeof(struct ll_rw_process_info) * LL_OFFSET_HIST_MAX);
+	spin_unlock(&sbi->ll_process_lock);
+
+	return len;
+}
+
+LPROC_SEQ_FOPS(ll_rw_offset_stats);
+
+/* Fill @lvars for llite: no module-level entries, and the per-mount table
+ * lprocfs_llite_obd_vars as the obd entries. */
+void lprocfs_llite_init_vars(struct lprocfs_static_vars *lvars)
+{
+	lvars->obd_vars = lprocfs_llite_obd_vars;
+	lvars->module_vars = NULL;
+}
+#endif /* LPROCFS */

diff --git a/drivers/staging/lustre/lustre/llite/namei.c b/drivers/staging/lustre/lustre/llite/namei.c
new file mode 100644
index 0000000..58d59aa
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/namei.c

@@ -0,0 +1,1279 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/quotaops.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/security.h>
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <obd_support.h>
+#include <lustre_fid.h>
+#include <lustre_lite.h>
+#include <lustre_dlm.h>
+#include <lustre_ver.h>
+#include "llite_internal.h"
+
+static int ll_create_it(struct inode *, struct dentry *,
+			int, struct lookup_intent *);
+
+/*
+ * Report whether something is mounted on the child called @name.
+ *
+ * When the caller already holds @dchild it is tested directly; otherwise the
+ * child is looked up in the dcache under @dparent.  A mounted child always
+ * has a dentry, so a failed lookup (or a NULL @dparent) means "not mounted".
+ */
+static int ll_d_mountpoint(struct dentry *dparent, struct dentry *dchild,
+			   struct qstr *name)
+{
+	struct dentry *found;
+	int mounted;
+
+	if (unlikely(dchild))
+		return d_mountpoint(dchild);
+
+	if (!dparent)
+		return 0;
+
+	found = d_lookup(dparent, name);
+	if (!found)
+		return 0;
+
+	mounted = d_mountpoint(found);
+	/* d_lookup() returned a referenced dentry */
+	dput(found);
+
+	return mounted;
+}
+
+/*
+ * Call ldlm_lock_decref() on the lock identified by @lockh for @mode.
+ * Always returns 0.
+ */
+int ll_unlock(__u32 mode, struct lustre_handle *lockh)
+{
+	ENTRY;
+
+	ldlm_lock_decref(lockh, mode);
+
+	RETURN(0);
+}
+
+
+/*
+ * Match callback handed to iget5_locked(); called from find_inode() under
+ * the inode_lock spinlock.  @opaque is the struct lustre_md of the wanted
+ * object.
+ *
+ * Returns 1 when @inode has the same FID as the MDS body, 0 when it does not
+ * or when the body carries no FID.
+ */
+static int ll_test_inode(struct inode *inode, void *opaque)
+{
+	struct lustre_md *md = opaque;
+	struct mdt_body *body = md->body;
+
+	if (unlikely((body->valid & OBD_MD_FLID) == 0)) {
+		CERROR("MDS body missing FID\n");
+		return 0;
+	}
+
+	return lu_fid_eq(&ll_i2info(inode)->lli_fid, &body->fid1) ? 1 : 0;
+}
+
+/*
+ * Initialisation callback handed to iget5_locked() (see ll_iget()).
+ * @opaque is a struct lustre_md whose MDS body supplies the FID and the
+ * file type of the new inode.
+ *
+ * Returns 0 on success, -EINVAL when the body has no FID, no object type,
+ * or when the resulting i_mode is zero.
+ */
+static int ll_set_inode(struct inode *inode, void *opaque)
+{
+	struct ll_inode_info *lli = ll_i2info(inode);
+	struct mdt_body *body = ((struct lustre_md *)opaque)->body;
+
+	if (unlikely(!(body->valid & OBD_MD_FLID))) {
+		CERROR("MDS body missing FID\n");
+		return -EINVAL;
+	}
+
+	/* The FID is stored first so the error messages below can print it. */
+	lli->lli_fid = body->fid1;
+	if (unlikely(!(body->valid & OBD_MD_FLTYPE))) {
+		CERROR("Can not initialize inode "DFID" without object type: "
+		       "valid = "LPX64"\n", PFID(&lli->lli_fid), body->valid);
+		return -EINVAL;
+	}
+
+	/* Take only the file type (S_IFMT) bits from the body; the other
+	 * i_mode bits are left as they are. */
+	inode->i_mode = (inode->i_mode & ~S_IFMT) | (body->mode & S_IFMT);
+	if (unlikely(inode->i_mode == 0)) {
+		CERROR("Invalid inode "DFID" type\n", PFID(&lli->lli_fid));
+		return -EINVAL;
+	}
+
+	ll_lli_init(lli);
+
+	return 0;
+}
+
+
+/*
+ * Get an inode by inode number (already instantiated by the intent lookup).
+ *
+ * A new (I_NEW) inode is filled from @md with ll_read_inode2() and, for a
+ * regular file that has no lli_clob yet, cl_file_inode_init().  An inode
+ * that was already cached and is not being freed is refreshed with
+ * ll_update_inode().
+ *
+ * Returns the inode, NULL when iget5_locked() returns NULL, or an ERR_PTR()
+ * when cl_file_inode_init() fails.  Callers must therefore check for both
+ * NULL and IS_ERR().
+ */
+struct inode *ll_iget(struct super_block *sb, ino_t hash,
+		      struct lustre_md *md)
+{
+	struct inode	 *inode;
+	ENTRY;
+
+	LASSERT(hash != 0);
+	inode = iget5_locked(sb, hash, ll_test_inode, ll_set_inode, md);
+
+	if (inode) {
+		if (inode->i_state & I_NEW) {
+			int rc = 0;
+
+			ll_read_inode2(inode, md);
+			if (S_ISREG(inode->i_mode) &&
+			    ll_i2info(inode)->lli_clob == NULL) {
+				CDEBUG(D_INODE,
+					"%s: apply lsm %p to inode "DFID".\n",
+					ll_get_fsname(sb, NULL, 0), md->lsm,
+					PFID(ll_inode2fid(inode)));
+				rc = cl_file_inode_init(inode, md);
+			}
+			if (rc != 0) {
+				/* Mark the half-initialised inode bad, then
+				 * unlock and drop it; the error is reported
+				 * through the returned pointer. */
+				make_bad_inode(inode);
+				unlock_new_inode(inode);
+				iput(inode);
+				inode = ERR_PTR(rc);
+			} else
+				unlock_new_inode(inode);
+		} else if (!(inode->i_state & (I_FREEING | I_CLEAR)))
+			ll_update_inode(inode, md);
+		/* NOTE: on the error path above @inode is an ERR_PTR here;
+		 * it is only printed, never dereferenced. */
+		CDEBUG(D_VFSTRACE, "got inode: %p for "DFID"\n",
+		       inode, PFID(&md->body->fid1));
+	}
+	RETURN(inode);
+}
+
+/*
+ * Walk every dentry alias of directory inode @dir and call
+ * d_lustre_invalidate() on each child dentry that has no inode (a negative
+ * dentry).
+ *
+ * The alias list is walked between ll_lock_dcache()/ll_unlock_dcache() and
+ * each alias' d_subdirs list is walked with that alias' d_lock held.
+ */
+static void ll_invalidate_negative_children(struct inode *dir)
+{
+	struct dentry *dentry, *tmp_subdir;
+	struct ll_d_hlist_node *p;
+
+	ll_lock_dcache(dir);
+	ll_d_hlist_for_each_entry(dentry, p, &dir->i_dentry, d_alias) {
+		spin_lock(&dentry->d_lock);
+		if (!list_empty(&dentry->d_subdirs)) {
+			struct dentry *child;
+
+			list_for_each_entry_safe(child, tmp_subdir,
+						 &dentry->d_subdirs,
+						 d_u.d_child) {
+				/* only negative children are invalidated */
+				if (child->d_inode == NULL)
+					d_lustre_invalidate(child, 1);
+			}
+		}
+		spin_unlock(&dentry->d_lock);
+	}
+	ll_unlock_dcache(dir);
+}
+
+/*
+ * Blocking/cancel callback for metadata (inodebits) locks; it is the
+ * callback passed to md_intent_lock() by ll_lookup_it().
+ *
+ * LDLM_CB_BLOCKING:  request an asynchronous cancel of @lock.  A negative
+ *		      result from ldlm_cli_cancel() is returned.
+ * LDLM_CB_CANCELING: for the inode attached to the lock, if any, tear down
+ *		      what the cancelled bits covered: close the open handle
+ *		      (OPEN), invalidate the layout (LAYOUT), clear
+ *		      LLIF_MDS_SIZE_LOCK and purge directory pages and
+ *		      negative children (UPDATE), and invalidate the dentry
+ *		      aliases (LOOKUP/PERM, except for the root inode).
+ * Any other @flag:   LBUG().
+ *
+ * Returns 0, or the negative ldlm_cli_cancel() error.
+ */
+int ll_md_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
+		       void *data, int flag)
+{
+	int rc;
+	struct lustre_handle lockh;
+	ENTRY;
+
+	switch (flag) {
+	case LDLM_CB_BLOCKING:
+		ldlm_lock2handle(lock, &lockh);
+		rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
+		if (rc < 0) {
+			CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
+			RETURN(rc);
+		}
+		break;
+	case LDLM_CB_CANCELING: {
+		struct inode *inode = ll_inode_from_resource_lock(lock);
+		struct ll_inode_info *lli;
+		__u64 bits = lock->l_policy_data.l_inodebits.bits;
+		struct lu_fid *fid;
+		ldlm_mode_t mode = lock->l_req_mode;
+
+		/* Inode is set to lock->l_resource->lr_lvb_inode
+		 * for mdc - bug 24555 */
+		LASSERT(lock->l_ast_data == NULL);
+
+		/* Invalidate all dentries associated with this inode */
+		if (inode == NULL)
+			break;
+
+		LASSERT(lock->l_flags & LDLM_FL_CANCELING);
+		/* For OPEN locks we differentiate between lock modes
+		 * LCK_CR, LCK_CW, LCK_PR - bug 22891 */
+		/* NOTE(review): ll_have_md_lock() is passed &bits and so can
+		 * modify it; presumably it clears the bits still covered by
+		 * other granted locks - confirm against its definition. */
+		if (bits & (MDS_INODELOCK_LOOKUP | MDS_INODELOCK_UPDATE |
+			    MDS_INODELOCK_LAYOUT | MDS_INODELOCK_PERM))
+			ll_have_md_lock(inode, &bits, LCK_MINMODE);
+
+		if (bits & MDS_INODELOCK_OPEN)
+			ll_have_md_lock(inode, &bits, mode);
+
+		/* Sanity check: the resource name must encode this inode's
+		 * FID (sequence, object id, version). */
+		fid = ll_inode2fid(inode);
+		if (lock->l_resource->lr_name.name[0] != fid_seq(fid) ||
+		    lock->l_resource->lr_name.name[1] != fid_oid(fid) ||
+		    lock->l_resource->lr_name.name[2] != fid_ver(fid)) {
+			LDLM_ERROR(lock, "data mismatch with object "
+				   DFID" (%p)", PFID(fid), inode);
+		}
+
+		if (bits & MDS_INODELOCK_OPEN) {
+			/* Map the lock mode to the open mode whose handle
+			 * has to be closed.  An unexpected mode is logged
+			 * and flags stays 0. */
+			int flags = 0;
+			switch (lock->l_req_mode) {
+			case LCK_CW:
+				flags = FMODE_WRITE;
+				break;
+			case LCK_PR:
+				flags = FMODE_EXEC;
+				break;
+			case LCK_CR:
+				flags = FMODE_READ;
+				break;
+			default:
+				CERROR("Unexpected lock mode for OPEN lock "
+				       "%d, inode %ld\n", lock->l_req_mode,
+				       inode->i_ino);
+			}
+			ll_md_real_close(inode, flags);
+		}
+
+		lli = ll_i2info(inode);
+		if (bits & MDS_INODELOCK_LAYOUT) {
+			struct cl_object_conf conf = { { 0 } };
+
+			conf.coc_opc = OBJECT_CONF_INVALIDATE;
+			conf.coc_inode = inode;
+			rc = ll_layout_conf(inode, &conf);
+			/* a failure here is only logged */
+			if (rc)
+				CDEBUG(D_INODE, "invaliding layout %d.\n", rc);
+		}
+
+		if (bits & MDS_INODELOCK_UPDATE)
+			lli->lli_flags &= ~LLIF_MDS_SIZE_LOCK;
+
+		/* Losing UPDATE on a directory: drop its cached pages and
+		 * its negative child dentries. */
+		if (S_ISDIR(inode->i_mode) &&
+		     (bits & MDS_INODELOCK_UPDATE)) {
+			CDEBUG(D_INODE, "invalidating inode %lu\n",
+			       inode->i_ino);
+			truncate_inode_pages(inode->i_mapping, 0);
+			ll_invalidate_negative_children(inode);
+		}
+
+		/* The aliases of the filesystem root are never invalidated. */
+		if (inode->i_sb->s_root &&
+		    inode != inode->i_sb->s_root->d_inode &&
+		    (bits & (MDS_INODELOCK_LOOKUP | MDS_INODELOCK_PERM)))
+			ll_invalidate_aliases(inode);
+		/* NOTE(review): presumably drops a reference taken by
+		 * ll_inode_from_resource_lock() - confirm. */
+		iput(inode);
+		break;
+	}
+	default:
+		LBUG();
+	}
+
+	RETURN(0);
+}
+
+/*
+ * Return the gid of inode @i when the current task is a member of that
+ * group, otherwise (__u32)-1.
+ */
+__u32 ll_i2suppgid(struct inode *i)
+{
+	return current_is_in_group(i->i_gid) ? (__u32)i->i_gid : (__u32)(-1);
+}
+
+/*
+ * Pack the supplementary groups required for an operation on @i1 (and
+ * optionally @i2) into the two-entry array @suppgids.
+ *
+ * suppgids[0] describes @i1 and suppgids[1] describes @i2: each entry is the
+ * inode's gid when the current task is a member of that group and -1
+ * otherwise.  suppgids[1] is -1 when @i2 is NULL.
+ */
+void ll_i2gids(__u32 *suppgids, struct inode *i1, struct inode *i2)
+{
+	LASSERT(i1 != NULL);
+	LASSERT(suppgids != NULL);
+
+	suppgids[0] = ll_i2suppgid(i1);
+
+	if (i2)
+		suppgids[1] = ll_i2suppgid(i2);
+	else
+		suppgids[1] = -1;
+}
+
+/*
+ * try to reuse three types of dentry:
+ * 1. unhashed alias, this one is unhashed by d_invalidate (but it may be valid
+ *    by concurrent .revalidate).
+ * 2. INVALID alias (common case for no valid ldlm lock held, but this flag may
+ *    be cleared by others calling d_lustre_revalidate).
+ * 3. DISCONNECTED alias.
+ *
+ * An alias with the same parent and name as @dentry is preferred and ends
+ * the search; otherwise the last DISCONNECTED alias seen is used.
+ *
+ * Returns the chosen alias with an extra reference taken, or NULL when
+ * @inode has no usable alias.
+ */
+static struct dentry *ll_find_alias(struct inode *inode, struct dentry *dentry)
+{
+	struct dentry *alias, *discon_alias, *invalid_alias;
+	struct ll_d_hlist_node *p;
+
+	if (ll_d_hlist_empty(&inode->i_dentry))
+		return NULL;
+
+	discon_alias = invalid_alias = NULL;
+
+	ll_lock_dcache(inode);
+	ll_d_hlist_for_each_entry(alias, p, &inode->i_dentry, d_alias) {
+		LASSERT(alias != dentry);
+
+		spin_lock(&alias->d_lock);
+		if (alias->d_flags & DCACHE_DISCONNECTED)
+			/* LASSERT(last_discon == NULL); LU-405, bz 20055 */
+			discon_alias = alias;
+		else if (alias->d_parent == dentry->d_parent	     &&
+			 alias->d_name.hash == dentry->d_name.hash       &&
+			 alias->d_name.len == dentry->d_name.len	 &&
+			 memcmp(alias->d_name.name, dentry->d_name.name,
+				dentry->d_name.len) == 0)
+			invalid_alias = alias;
+		spin_unlock(&alias->d_lock);
+
+		/* a same-parent, same-name alias is the best match */
+		if (invalid_alias)
+			break;
+	}
+	alias = invalid_alias ?: discon_alias ?: NULL;
+	if (alias) {
+		/* take the reference while still inside the dcache lock */
+		spin_lock(&alias->d_lock);
+		dget_dlock(alias);
+		spin_unlock(&alias->d_lock);
+	}
+	ll_unlock_dcache(inode);
+
+	return alias;
+}
+
+/*
+ * Similar to d_splice_alias(), but lustre treats invalid alias
+ * similar to DCACHE_DISCONNECTED, and tries to use it anyway.
+ *
+ * If ll_find_alias() finds a reusable alias of @inode, it is moved onto
+ * @de's name with d_move(), the reference on @inode is dropped with iput()
+ * and the alias is returned (ll_find_alias() took a reference on it).
+ * Otherwise @de is marked invalid, added to the dcache with @inode (which
+ * may be NULL, giving a negative dentry) and returned.
+ */
+struct dentry *ll_splice_alias(struct inode *inode, struct dentry *de)
+{
+	struct dentry *new;
+
+	if (inode) {
+		new = ll_find_alias(inode, de);
+		if (new) {
+			ll_dops_init(new, 1, 1);
+			d_move(new, de);
+			/* the alias already references the inode */
+			iput(inode);
+			CDEBUG(D_DENTRY,
+			       "Reuse dentry %p inode %p refc %d flags %#x\n",
+			      new, new->d_inode, d_refcount(new), new->d_flags);
+			return new;
+		}
+	}
+	ll_dops_init(de, 1, 1);
+	__d_lustre_invalidate(de);
+	d_add(de, inode);
+	CDEBUG(D_DENTRY, "Add dentry %p inode %p refc %d flags %#x\n",
+	       de, de->d_inode, d_refcount(de), de->d_flags);
+	return de;
+}
+
+/*
+ * Finish an intent lookup once the MDS reply in @request is available.
+ * @data is the struct it_cb_data naming the parent inode and the location
+ * of the child dentry pointer.
+ *
+ * For a positive reply the inode is prepared from the reply and attached to
+ * the lock.  An unhashed child dentry is then spliced into the dcache, which
+ * may replace *icbd_childp with an existing alias.  Finally the dentry is
+ * revalidated when the appropriate lock is held.
+ *
+ * Returns 0 on success or the error from ll_prep_inode().
+ */
+int ll_lookup_it_finish(struct ptlrpc_request *request,
+			struct lookup_intent *it, void *data)
+{
+	struct it_cb_data *icbd = data;
+	struct dentry **de = icbd->icbd_childp;
+	struct inode *parent = icbd->icbd_parent;
+	struct inode *inode = NULL;
+	__u64 bits = 0;
+	int rc;
+	ENTRY;
+
+	/* NB 1 request reference will be taken away by ll_intent_lock()
+	 * when I return */
+	CDEBUG(D_DENTRY, "it %p it_disposition %x\n", it,
+	       it->d.lustre.it_disposition);
+	if (!it_disposition(it, DISP_LOOKUP_NEG)) {
+		rc = ll_prep_inode(&inode, request, (*de)->d_sb, it);
+		if (rc)
+			RETURN(rc);
+
+		ll_set_lock_data(ll_i2sbi(parent)->ll_md_exp, inode, it, &bits);
+
+		/* We used to query real size from OSTs here, but actually
+		   this is not needed. For stat() calls size would be updated
+		   from subsequent do_revalidate()->ll_inode_revalidate_it() in
+		   2.4 and
+		   vfs_getattr_it->ll_getattr()->ll_inode_revalidate_it() in 2.6
+		   Everybody else who needs correct file size would call
+		   ll_glimpse_size or some equivalent themselves anyway.
+		   Also see bug 7198. */
+	}
+
+	/* Only hash *de if it is unhashed (new dentry).
+	 * atomic_open may pass in hashed dentries for open.
+	 */
+	if (d_unhashed(*de))
+		*de = ll_splice_alias(inode, *de);
+
+	if (!it_disposition(it, DISP_LOOKUP_NEG)) {
+		/* we have the lookup lock - unhide dentry */
+		if (bits & MDS_INODELOCK_LOOKUP)
+			d_lustre_revalidate(*de);
+	} else if (!it_disposition(it, DISP_OPEN_CREATE)) {
+		/* If file created on server, don't depend on parent UPDATE
+		 * lock to unhide it. It is left hidden and next lookup can
+		 * find it in ll_splice_alias.
+		 */
+		/* Check that parent has UPDATE lock. */
+		struct lookup_intent parent_it = {
+					.it_op = IT_GETATTR,
+					.d.lustre.it_lock_handle = 0 };
+
+		if (md_revalidate_lock(ll_i2mdexp(parent), &parent_it,
+				       &ll_i2info(parent)->lli_fid, NULL)) {
+			d_lustre_revalidate(*de);
+			ll_intent_release(&parent_it);
+		}
+	}
+
+	RETURN(0);
+}
+
+/*
+ * Look up @dentry in directory @parent using intent @it.
+ *
+ * The name is sent to the MDS with md_intent_lock() and the reply is
+ * processed by ll_lookup_it_finish(), which may substitute an existing
+ * alias for @dentry.  For IT_GETATTR intents the statahead code is consulted
+ * first and may satisfy the lookup without an RPC.
+ *
+ * NOTE(review): @it may be NULL (see ll_lookup_nd()); ll_frob_intent()
+ * presumably substitutes the local IT_LOOKUP intent in that case - confirm.
+ *
+ * Returns NULL when the caller's @dentry was used, another dentry when an
+ * alias was substituted, or an ERR_PTR() on failure.
+ */
+static struct dentry *ll_lookup_it(struct inode *parent, struct dentry *dentry,
+				   struct lookup_intent *it, int lookup_flags)
+{
+	struct lookup_intent lookup_it = { .it_op = IT_LOOKUP };
+	struct dentry *save = dentry, *retval;
+	struct ptlrpc_request *req = NULL;
+	struct md_op_data *op_data;
+	struct it_cb_data icbd;
+	__u32 opc;
+	int rc;
+	ENTRY;
+
+	if (dentry->d_name.len > ll_i2sbi(parent)->ll_namelen)
+		RETURN(ERR_PTR(-ENAMETOOLONG));
+
+	CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p),intent=%s\n",
+	       dentry->d_name.len, dentry->d_name.name, parent->i_ino,
+	       parent->i_generation, parent, LL_IT2STR(it));
+
+	if (d_mountpoint(dentry))
+		CERROR("Tell Peter, lookup on mtpt, it %s\n", LL_IT2STR(it));
+
+	ll_frob_intent(&it, &lookup_it);
+
+	/* As do_lookup is called before follow_mount, root dentry may be left
+	 * not valid, revalidate it here. */
+	if (parent->i_sb->s_root && (parent->i_sb->s_root->d_inode == parent) &&
+	    (it->it_op & (IT_OPEN | IT_CREAT))) {
+		rc = ll_inode_revalidate_it(parent->i_sb->s_root, it,
+					    MDS_INODELOCK_LOOKUP);
+		if (rc)
+			RETURN(ERR_PTR(rc));
+	}
+
+	if (it->it_op == IT_GETATTR) {
+		rc = ll_statahead_enter(parent, &dentry, 0);
+		/* rc == 1: statahead already resolved the name */
+		if (rc == 1) {
+			if (dentry == save)
+				GOTO(out, retval = NULL);
+			GOTO(out, retval = dentry);
+		}
+	}
+
+	icbd.icbd_childp = &dentry;
+	icbd.icbd_parent = parent;
+
+	if (it->it_op & IT_CREAT ||
+	    (it->it_op & IT_OPEN && it->it_create_mode & O_CREAT))
+		opc = LUSTRE_OPC_CREATE;
+	else
+		opc = LUSTRE_OPC_ANY;
+
+	op_data = ll_prep_md_op_data(NULL, parent, NULL, dentry->d_name.name,
+				     dentry->d_name.len, lookup_flags, opc,
+				     NULL);
+	/* Propagate the error pointer with the dedicated helper rather than
+	 * a raw cast.  As before, this path does not go through "out". */
+	if (IS_ERR(op_data))
+		RETURN(ERR_CAST(op_data));
+
+	/* enforce umask if acl disabled or MDS doesn't support umask */
+	if (!IS_POSIXACL(parent) || !exp_connect_umask(ll_i2mdexp(parent)))
+		it->it_create_mode &= ~current_umask();
+
+	rc = md_intent_lock(ll_i2mdexp(parent), op_data, NULL, 0, it,
+			    lookup_flags, &req, ll_md_blocking_ast, 0);
+	ll_finish_md_op_data(op_data);
+	if (rc < 0)
+		GOTO(out, retval = ERR_PTR(rc));
+
+	rc = ll_lookup_it_finish(req, it, &icbd);
+	if (rc != 0) {
+		ll_intent_release(it);
+		GOTO(out, retval = ERR_PTR(rc));
+	}
+
+	/* An open intent that resolved to something other than a regular
+	 * file or directory: release the open handle right away. */
+	if ((it->it_op & IT_OPEN) && dentry->d_inode &&
+	    !S_ISREG(dentry->d_inode->i_mode) &&
+	    !S_ISDIR(dentry->d_inode->i_mode)) {
+		ll_release_openhandle(dentry, it);
+	}
+	ll_lookup_finish_locks(it, dentry);
+
+	if (dentry == save)
+		GOTO(out, retval = NULL);
+	else
+		GOTO(out, retval = dentry);
+ out:
+	if (req)
+		ptlrpc_req_finished(req);
+	if (it->it_op == IT_GETATTR && (retval == NULL || retval == dentry))
+		ll_statahead_mark(parent, dentry);
+	return retval;
+}
+
+/*
+ * Lookup of @dentry in @parent driven by the VFS lookup @flags.
+ *
+ * A pure create (LOOKUP_CREATE without LOOKUP_OPEN) is not sent to the
+ * server: the dentry is added as an invalid negative dentry and NULL is
+ * returned.  Otherwise ll_lookup_it() is called, with an IT_GETATTR intent
+ * for plain lookups and with no intent when any of LOOKUP_PARENT,
+ * LOOKUP_OPEN or LOOKUP_CREATE is set.
+ *
+ * Returns whatever ll_lookup_it() returns.
+ */
+static struct dentry *ll_lookup_nd(struct inode *parent, struct dentry *dentry,
+				   unsigned int flags)
+{
+	struct lookup_intent *itp, it = { .it_op = IT_GETATTR };
+	struct dentry *de;
+
+	CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p),flags=%u\n",
+	       dentry->d_name.len, dentry->d_name.name, parent->i_ino,
+	       parent->i_generation, parent, flags);
+
+	/* Optimize away (CREATE && !OPEN). Let .create handle the race. */
+	if ((flags & LOOKUP_CREATE ) && !(flags & LOOKUP_OPEN)) {
+		ll_dops_init(dentry, 1, 1);
+		__d_lustre_invalidate(dentry);
+		d_add(dentry, NULL);
+		return NULL;
+	}
+
+	if (flags & (LOOKUP_PARENT|LOOKUP_OPEN|LOOKUP_CREATE))
+		itp = NULL;
+	else
+		itp = &it;
+	de = ll_lookup_it(parent, dentry, itp, 0);
+
+	/* only the local IT_GETATTR intent needs releasing here */
+	if (itp != NULL)
+		ll_intent_release(itp);
+
+	return de;
+}
+
+/*
+ * For cached negative dentry and new dentry, handle lookup/create/open
+ * together.
+ *
+ * An IT_OPEN intent (plus IT_CREAT when @mode is non-zero) is built and run
+ * through ll_lookup_it().  If the server created the file, the dentry is
+ * instantiated by ll_create_it() and FILE_CREATED is set in *@opened.  If
+ * the server also opened it, the open is completed with finish_open(),
+ * except for FIFOs, which are handed back with finish_no_open(), as are
+ * lookups that did not result in an open.
+ *
+ * Returns 0 or a negative errno, or the result of finish_open() /
+ * finish_no_open().
+ */
+static int ll_atomic_open(struct inode *dir, struct dentry *dentry,
+			  struct file *file, unsigned open_flags,
+			  umode_t mode, int *opened)
+{
+	struct lookup_intent *it;
+	struct dentry *de;
+	/* LOOKUP_* flags are ints and ll_lookup_it() takes an int */
+	int lookup_flags = LOOKUP_OPEN;
+	int rc = 0;
+	ENTRY;
+
+	CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p),file %p,"
+			   "open_flags %x,mode %x opened %d\n",
+	       dentry->d_name.len, dentry->d_name.name, dir->i_ino,
+	       dir->i_generation, dir, file, open_flags, mode, *opened);
+
+	OBD_ALLOC(it, sizeof(*it));
+	if (!it)
+		RETURN(-ENOMEM);
+
+	it->it_op = IT_OPEN;
+	if (mode) {
+		it->it_op |= IT_CREAT;
+		lookup_flags |= LOOKUP_CREATE;
+	}
+	it->it_create_mode = (mode & S_IALLUGO) | S_IFREG;
+	it->it_flags = (open_flags & ~O_ACCMODE) | OPEN_FMODE(open_flags);
+
+	/* Dentry added to dcache tree in ll_lookup_it */
+	de = ll_lookup_it(dir, dentry, it, lookup_flags);
+	if (IS_ERR(de))
+		rc = PTR_ERR(de);
+	else if (de != NULL)
+		dentry = de;
+
+	if (!rc) {
+		if (it_disposition(it, DISP_OPEN_CREATE)) {
+			/* Dentry instantiated in ll_create_it. */
+			rc = ll_create_it(dir, dentry, mode, it);
+			if (rc) {
+				/* We dget in ll_splice_alias. */
+				if (de != NULL)
+					dput(de);
+				goto out_release;
+			}
+
+			*opened |= FILE_CREATED;
+		}
+		if (dentry->d_inode && it_disposition(it, DISP_OPEN_OPEN)) {
+			/* Open dentry. */
+			if (S_ISFIFO(dentry->d_inode->i_mode)) {
+				/* We cannot call open here as it would
+				 * deadlock.
+				 */
+				if (it_disposition(it, DISP_ENQ_OPEN_REF))
+					ptlrpc_req_finished(
+						       (struct ptlrpc_request *)
+							  it->d.lustre.it_data);
+				rc = finish_no_open(file, de);
+			} else {
+				file->private_data = it;
+				rc = finish_open(file, dentry, NULL, opened);
+				/* We dget in ll_splice_alias. finish_open takes
+				 * care of dget for fd open.
+				 */
+				if (de != NULL)
+					dput(de);
+			}
+		} else {
+			rc = finish_no_open(file, de);
+		}
+	}
+
+out_release:
+	ll_intent_release(it);
+	OBD_FREE(it, sizeof(*it));
+
+	RETURN(rc);
+}
+
+
+/* We depend on "mode" being set with the proper file type/umask by now */
+/*
+ * Build the inode for a file that the intent in @it has already created on
+ * the server.  The reply is taken from it->d.lustre.it_data, the
+ * DISP_ENQ_CREATE_REF disposition is cleared and the request is finished
+ * before returning.
+ *
+ * NOTE: @name, @namelen, @data, @datalen, @mode and @extra are not used by
+ * this function body.
+ *
+ * Returns the new inode or an ERR_PTR() from ll_prep_inode().
+ */
+static struct inode *ll_create_node(struct inode *dir, const char *name,
+				    int namelen, const void *data, int datalen,
+				    int mode, __u64 extra,
+				    struct lookup_intent *it)
+{
+	struct inode *inode = NULL;
+	struct ptlrpc_request *request = NULL;
+	struct ll_sb_info *sbi = ll_i2sbi(dir);
+	int rc;
+	ENTRY;
+
+	LASSERT(it && it->d.lustre.it_disposition);
+
+	LASSERT(it_disposition(it, DISP_ENQ_CREATE_REF));
+	request = it->d.lustre.it_data;
+	it_clear_disposition(it, DISP_ENQ_CREATE_REF);
+	rc = ll_prep_inode(&inode, request, dir->i_sb, it);
+	if (rc)
+		GOTO(out, inode = ERR_PTR(rc));
+
+	/* a freshly created inode must not have any dentry yet */
+	LASSERT(ll_d_hlist_empty(&inode->i_dentry));
+
+	/* We asked for a lock on the directory, but were granted a
+	 * lock on the inode.  Since we finally have an inode pointer,
+	 * stuff it in the lock. */
+	CDEBUG(D_DLMTRACE, "setting l_ast_data to inode %p (%lu/%u)\n",
+	       inode, inode->i_ino, inode->i_generation);
+	ll_set_lock_data(sbi->ll_md_exp, inode, it, NULL);
+	EXIT;
+ out:
+	ptlrpc_req_finished(request);
+	return inode;
+}
+
+/*
+ * By the time this is called, we already have created the directory cache
+ * entry for the new file, but it is so far negative - it has no inode.
+ *
+ * We defer creating the OBD object(s) until open, to keep the intent and
+ * non-intent code paths similar, and also because we do not have the MDS
+ * inode number before calling ll_create_node() (which is needed for LOV),
+ * so we would need to do yet another RPC to the MDS to store the LOV EA
+ * data on the MDS.  If needed, we would pass the PACKED lmm as data and
+ * lmm_size in datalen (the MDS still has code which will handle that).
+ *
+ * If the create succeeds, we fill in the inode information
+ * with d_instantiate().
+ *
+ * Returns 0 on success, the error reported by it_open_error() for the
+ * create disposition, or the error from ll_create_node().
+ */
+static int ll_create_it(struct inode *dir, struct dentry *dentry, int mode,
+			struct lookup_intent *it)
+{
+	struct inode *inode;
+	int rc = 0;
+	ENTRY;
+
+	CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p),intent=%s\n",
+	       dentry->d_name.len, dentry->d_name.name, dir->i_ino,
+	       dir->i_generation, dir, LL_IT2STR(it));
+
+	rc = it_open_error(DISP_OPEN_CREATE, it);
+	if (rc)
+		RETURN(rc);
+
+	inode = ll_create_node(dir, dentry->d_name.name, dentry->d_name.len,
+			       NULL, 0, mode, 0, it);
+	if (IS_ERR(inode))
+		RETURN(PTR_ERR(inode));
+
+	/* remember on the inode that the name marks a volatile file */
+	if (filename_is_volatile(dentry->d_name.name, dentry->d_name.len, NULL))
+		ll_i2info(inode)->lli_volatile = true;
+
+	d_instantiate(dentry, inode);
+	RETURN(0);
+}
+
+/*
+ * Update the mtime and ctime of @inode from the MDT body in the reply of
+ * @request.  A timestamp is only taken when the body marks it valid and it
+ * is later than the one the inode already has, so times never move
+ * backwards here.
+ */
+static void ll_update_times(struct ptlrpc_request *request,
+			    struct inode *inode)
+{
+	struct mdt_body *body = req_capsule_server_get(&request->rq_pill,
+						       &RMF_MDT_BODY);
+
+	LASSERT(body);
+	if (body->valid & OBD_MD_FLMTIME &&
+	    body->mtime > LTIME_S(inode->i_mtime)) {
+		CDEBUG(D_INODE, "setting ino %lu mtime from %lu to "LPU64"\n",
+		       inode->i_ino, LTIME_S(inode->i_mtime), body->mtime);
+		LTIME_S(inode->i_mtime) = body->mtime;
+	}
+	if (body->valid & OBD_MD_FLCTIME &&
+	    body->ctime > LTIME_S(inode->i_ctime))
+		LTIME_S(inode->i_ctime) = body->ctime;
+}
+
+/*
+ * Create a new object called @name in directory @dir with md_create().
+ *
+ * @tgt    optional string sent with the request (ll_symlink_generic() passes
+ *	   the link target); NULL otherwise.  Its length including the
+ *	   terminating NUL is sent along.
+ * @mode   mode of the new object, file type bits included.
+ * @rdev   device number passed through to md_create().
+ * @dchild when not NULL, instantiated with the inode built from the reply.
+ * @opc    LUSTRE_OPC_* value passed to ll_prep_md_op_data().
+ *
+ * On success the times of @dir are refreshed from the reply.
+ * Returns 0 or a negative errno.
+ */
+static int ll_new_node(struct inode *dir, struct qstr *name,
+		       const char *tgt, int mode, int rdev,
+		       struct dentry *dchild, __u32 opc)
+{
+	struct ptlrpc_request *request = NULL;
+	struct md_op_data *op_data;
+	struct inode *inode = NULL;
+	struct ll_sb_info *sbi = ll_i2sbi(dir);
+	int tgt_len = 0;
+	int err;
+
+	ENTRY;
+	if (unlikely(tgt != NULL))
+		tgt_len = strlen(tgt) + 1;
+
+	op_data = ll_prep_md_op_data(NULL, dir, NULL, name->name,
+				     name->len, 0, opc, NULL);
+	if (IS_ERR(op_data))
+		GOTO(err_exit, err = PTR_ERR(op_data));
+
+	err = md_create(sbi->ll_md_exp, op_data, tgt, tgt_len, mode,
+			current_fsuid(), current_fsgid(),
+			cfs_curproc_cap_pack(), rdev, &request);
+	ll_finish_md_op_data(op_data);
+	if (err)
+		GOTO(err_exit, err);
+
+	ll_update_times(request, dir);
+
+	if (dchild) {
+		err = ll_prep_inode(&inode, request, dchild->d_sb, NULL);
+		if (err)
+		     GOTO(err_exit, err);
+
+		d_instantiate(dchild, inode);
+	}
+	EXIT;
+err_exit:
+	/* request is still NULL here when no RPC was sent */
+	ptlrpc_req_finished(request);
+
+	return err;
+}
+
+/*
+ * Create a non-directory node called @name in @dir.
+ *
+ * The umask is applied locally when the directory has no POSIX ACLs or the
+ * MDS connection does not support umask.  A @mode without file type bits is
+ * treated as a regular file.  Regular files, character and block devices,
+ * FIFOs and sockets go to ll_new_node(); S_IFDIR is refused with -EPERM and
+ * any other type with -EINVAL.
+ *
+ * Returns 0 (and counts an LPROC_LL_MKNOD operation) or a negative errno.
+ */
+static int ll_mknod_generic(struct inode *dir, struct qstr *name, int mode,
+			    unsigned rdev, struct dentry *dchild)
+{
+	int err;
+	ENTRY;
+
+	CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p) mode %o dev %x\n",
+	       name->len, name->name, dir->i_ino, dir->i_generation, dir,
+	       mode, rdev);
+
+	if (!IS_POSIXACL(dir) || !exp_connect_umask(ll_i2mdexp(dir)))
+		mode &= ~current_umask();
+
+	switch (mode & S_IFMT) {
+	case 0:
+		mode |= S_IFREG; /* for mode = 0 case, fallthrough */
+	case S_IFREG:
+	case S_IFCHR:
+	case S_IFBLK:
+	case S_IFIFO:
+	case S_IFSOCK:
+		err = ll_new_node(dir, name, NULL, mode, rdev, dchild,
+				  LUSTRE_OPC_MKNOD);
+		break;
+	case S_IFDIR:
+		err = -EPERM;
+		break;
+	default:
+		err = -EINVAL;
+	}
+
+	if (!err)
+		ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_MKNOD, 1);
+
+	RETURN(err);
+}
+
+/*
+ * Plain create. Intent create is handled in atomic_open.
+ *
+ * Creates @dentry in @dir through ll_mknod_generic() with no device number.
+ * An LPROC_LL_CREATE operation is counted whether or not the create
+ * succeeds.
+ *
+ * Returns the result of ll_mknod_generic().
+ */
+static int ll_create_nd(struct inode *dir, struct dentry *dentry,
+			umode_t mode, bool want_excl)
+{
+	int rc;
+
+	/* the value printed is the creation mode, so label it as such */
+	CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p),"
+			   "mode=%o, excl=%d\n",
+	       dentry->d_name.len, dentry->d_name.name, dir->i_ino,
+	       dir->i_generation, dir, mode, want_excl);
+
+	rc = ll_mknod_generic(dir, &dentry->d_name, mode, 0, dentry);
+
+	ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_CREATE, 1);
+
+	CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s, unhashed %d\n",
+	       dentry->d_name.len, dentry->d_name.name, d_unhashed(dentry));
+
+	return rc;
+}
+
+/*
+ * Create a symbolic link called @name in @dir pointing at @tgt, using
+ * ll_new_node() with mode S_IFLNK | S_IRWXUGO.
+ *
+ * Returns 0 (and counts an LPROC_LL_SYMLINK operation) or a negative errno.
+ */
+static int ll_symlink_generic(struct inode *dir, struct qstr *name,
+			      const char *tgt, struct dentry *dchild)
+{
+	int err;
+	ENTRY;
+
+	/* at most 3000 bytes of the target are printed */
+	CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p),target=%.*s\n",
+	       name->len, name->name, dir->i_ino, dir->i_generation,
+	       dir, 3000, tgt);
+
+	err = ll_new_node(dir, name, (char *)tgt, S_IFLNK | S_IRWXUGO,
+			  0, dchild, LUSTRE_OPC_SYMLINK);
+
+	if (!err)
+		ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_SYMLINK, 1);
+
+	RETURN(err);
+}
+
+/*
+ * Create a hard link called @name in directory @dir to inode @src with
+ * md_link().  On success the times of @dir are refreshed from the reply and
+ * an LPROC_LL_LINK operation is counted.
+ *
+ * NOTE: @dchild is not used by this function body.
+ *
+ * Returns 0 or a negative errno.
+ */
+static int ll_link_generic(struct inode *src,  struct inode *dir,
+			   struct qstr *name, struct dentry *dchild)
+{
+	struct ll_sb_info *sbi = ll_i2sbi(dir);
+	struct ptlrpc_request *request = NULL;
+	struct md_op_data *op_data;
+	int err;
+
+	ENTRY;
+	CDEBUG(D_VFSTRACE,
+	       "VFS Op: inode=%lu/%u(%p), dir=%lu/%u(%p), target=%.*s\n",
+	       src->i_ino, src->i_generation, src, dir->i_ino,
+	       dir->i_generation, dir, name->len, name->name);
+
+	op_data = ll_prep_md_op_data(NULL, src, dir, name->name, name->len,
+				     0, LUSTRE_OPC_ANY, NULL);
+	if (IS_ERR(op_data))
+		RETURN(PTR_ERR(op_data));
+
+	err = md_link(sbi->ll_md_exp, op_data, &request);
+	ll_finish_md_op_data(op_data);
+	if (err)
+		GOTO(out, err);
+
+	ll_update_times(request, dir);
+	ll_stats_ops_tally(sbi, LPROC_LL_LINK, 1);
+	EXIT;
+out:
+	ptlrpc_req_finished(request);
+	RETURN(err);
+}
+
+/*
+ * Create a directory called @name in @dir.
+ *
+ * The umask is applied locally when the directory has no POSIX ACLs or the
+ * MDS connection does not support umask.  Only the permission bits and the
+ * sticky bit of @mode are kept, and S_IFDIR is set.
+ *
+ * Returns 0 (and counts an LPROC_LL_MKDIR operation) or a negative errno.
+ */
+static int ll_mkdir_generic(struct inode *dir, struct qstr *name,
+			    int mode, struct dentry *dchild)
+
+{
+	int err;
+	ENTRY;
+
+	CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p)\n",
+	       name->len, name->name, dir->i_ino, dir->i_generation, dir);
+
+	if (!IS_POSIXACL(dir) || !exp_connect_umask(ll_i2mdexp(dir)))
+		mode &= ~current_umask();
+	mode = (mode & (S_IRWXUGO|S_ISVTX)) | S_IFDIR;
+	err = ll_new_node(dir, name, NULL, mode, 0, dchild, LUSTRE_OPC_MKDIR);
+
+	if (!err)
+		ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_MKDIR, 1);
+
+	RETURN(err);
+}
+
+/* Try to find the child dentry by its name.
+   If found, put the result fid into @fid.
+   @fid is left untouched when no child dentry is cached under that name or
+   when the cached dentry is negative (has no inode).
+   NOTE(review): the parent dentry is taken from dir->i_dentry without an
+   emptiness check; this assumes @dir always has at least one alias -
+   confirm against callers. */
+static void ll_get_child_fid(struct inode * dir, struct qstr *name,
+			     struct lu_fid *fid)
+{
+	struct dentry *parent, *child;
+
+	parent = ll_d_hlist_entry(dir->i_dentry, struct dentry, d_alias);
+	child = d_lookup(parent, name);
+	if (child) {
+		if (child->d_inode)
+			*fid = *ll_inode2fid(child->d_inode);
+		/* drop the reference returned by d_lookup() */
+		dput(child);
+	}
+}
+
+/*
+ * Remove the directory called @name from @dir with md_unlink().
+ *
+ * Fails with -EBUSY when something is mounted on the child.  The FID of the
+ * child, when it can be found in the dcache, is placed in both op_fid3 and
+ * op_fid2 of the request.  On success the times of @dir are refreshed and an
+ * LPROC_LL_RMDIR operation is counted.
+ *
+ * Returns 0 or a negative errno.
+ */
+static int ll_rmdir_generic(struct inode *dir, struct dentry *dparent,
+			    struct dentry *dchild, struct qstr *name)
+{
+	struct ptlrpc_request *request = NULL;
+	struct md_op_data *op_data;
+	int rc;
+	ENTRY;
+
+	CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p)\n",
+	       name->len, name->name, dir->i_ino, dir->i_generation, dir);
+
+	if (unlikely(ll_d_mountpoint(dparent, dchild, name)))
+		RETURN(-EBUSY);
+
+	op_data = ll_prep_md_op_data(NULL, dir, NULL, name->name, name->len,
+				     S_IFDIR, LUSTRE_OPC_ANY, NULL);
+	if (IS_ERR(op_data))
+		RETURN(PTR_ERR(op_data));
+
+	ll_get_child_fid(dir, name, &op_data->op_fid3);
+	op_data->op_fid2 = op_data->op_fid3;
+	rc = md_unlink(ll_i2sbi(dir)->ll_md_exp, op_data, &request);
+	ll_finish_md_op_data(op_data);
+	if (rc == 0) {
+		ll_update_times(request, dir);
+		ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_RMDIR, 1);
+	}
+
+	ptlrpc_req_finished(request);
+	RETURN(rc);
+}
+
+/**
+ * Remove dir entry
+ *
+ * Sends an unlink of the @namelen bytes at @name in directory @dir with the
+ * CLI_RM_ENTRY client flag set.  On success the times of @dir are refreshed
+ * from the reply and an LPROC_LL_RMDIR operation is counted.
+ *
+ * \retval 0 on success
+ * \retval negative errno on failure
+ **/
+int ll_rmdir_entry(struct inode *dir, char *name, int namelen)
+{
+	struct ptlrpc_request *request = NULL;
+	struct md_op_data *op_data;
+	int rc;
+	ENTRY;
+
+	CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p)\n",
+	       namelen, name, dir->i_ino, dir->i_generation, dir);
+
+	/* Use the length supplied by the caller, as the trace above does,
+	 * instead of rescanning @name with strlen(). */
+	op_data = ll_prep_md_op_data(NULL, dir, NULL, name, namelen,
+				     S_IFDIR, LUSTRE_OPC_ANY, NULL);
+	if (IS_ERR(op_data))
+		RETURN(PTR_ERR(op_data));
+	op_data->op_cli_flags |= CLI_RM_ENTRY;
+	rc = md_unlink(ll_i2sbi(dir)->ll_md_exp, op_data, &request);
+	ll_finish_md_op_data(op_data);
+	if (rc == 0) {
+		ll_update_times(request, dir);
+		ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_RMDIR, 1);
+	}
+
+	ptlrpc_req_finished(request);
+	RETURN(rc);
+}
+
+/*
+ * Destroy the OST objects described by the EA that the MDS returned in the
+ * reply of @request (see the comment in the body: the EA is sent back when
+ * the last reference to the file was unlinked).
+ *
+ * Nothing is done when the reply body does not have OBD_MD_FLEASIZE set.
+ * Otherwise the EA is unpacked into a stripe descriptor, an obdo is filled
+ * from it and obd_destroy() is called, passing along the llog cookies and
+ * the OSS capability when the reply carries them.
+ *
+ * Returns 0 on success or when there is nothing to destroy, -EPROTO when
+ * the EA size is zero, -ENOMEM when the obdo cannot be allocated, or the
+ * error from obd_unpackmd(), md_unpack_capa() or obd_destroy().
+ */
+int ll_objects_destroy(struct ptlrpc_request *request, struct inode *dir)
+{
+	struct mdt_body *body;
+	struct lov_mds_md *eadata;
+	struct lov_stripe_md *lsm = NULL;
+	struct obd_trans_info oti = { 0 };
+	struct obdo *oa;
+	struct obd_capa *oc = NULL;
+	int rc;
+	ENTRY;
+
+	/* req is swabbed so this is safe */
+	body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
+	if (!(body->valid & OBD_MD_FLEASIZE))
+		RETURN(0);
+
+	if (body->eadatasize == 0) {
+		CERROR("OBD_MD_FLEASIZE set but eadatasize zero\n");
+		GOTO(out, rc = -EPROTO);
+	}
+
+	/* The MDS sent back the EA because we unlinked the last reference
+	 * to this file. Use this EA to unlink the objects on the OST.
+	 * It's opaque so we don't swab here; we leave it to obd_unpackmd() to
+	 * check it is complete and sensible. */
+	eadata = req_capsule_server_sized_get(&request->rq_pill, &RMF_MDT_MD,
+					      body->eadatasize);
+	LASSERT(eadata != NULL);
+
+	rc = obd_unpackmd(ll_i2dtexp(dir), &lsm, eadata, body->eadatasize);
+	if (rc < 0) {
+		CERROR("obd_unpackmd: %d\n", rc);
+		GOTO(out, rc);
+	}
+	LASSERT(rc >= sizeof(*lsm));
+
+	OBDO_ALLOC(oa);
+	if (oa == NULL)
+		GOTO(out_free_memmd, rc = -ENOMEM);
+
+	oa->o_oi = lsm->lsm_oi;
+	oa->o_mode = body->mode & S_IFMT;
+	oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLGROUP;
+
+	if (body->valid & OBD_MD_FLCOOKIE) {
+		oa->o_valid |= OBD_MD_FLCOOKIE;
+		oti.oti_logcookies =
+			req_capsule_server_sized_get(&request->rq_pill,
+						     &RMF_LOGCOOKIES,
+						   sizeof(struct llog_cookie) *
+						     lsm->lsm_stripe_count);
+		/* no cookies in the reply after all: withdraw the flag */
+		if (oti.oti_logcookies == NULL) {
+			oa->o_valid &= ~OBD_MD_FLCOOKIE;
+			body->valid &= ~OBD_MD_FLCOOKIE;
+		}
+	}
+
+	if (body->valid & OBD_MD_FLOSSCAPA) {
+		rc = md_unpack_capa(ll_i2mdexp(dir), request, &RMF_CAPA2, &oc);
+		if (rc)
+			GOTO(out_free_memmd, rc);
+	}
+
+	rc = obd_destroy(NULL, ll_i2dtexp(dir), oa, lsm, &oti,
+			 ll_i2mdexp(dir), oc);
+	capa_put(oc);
+	if (rc)
+		CERROR("obd destroy objid "DOSTID" error %d\n",
+		       POSTID(&lsm->lsm_oi), rc);
+out_free_memmd:
+	obd_free_memmd(ll_i2dtexp(dir), &lsm);
+	/* This label is also reached when the obdo allocation failed; in
+	 * that case there is nothing to free and NULL must not be passed to
+	 * OBDO_FREE(). */
+	if (oa != NULL)
+		OBDO_FREE(oa);
+out:
+	return rc;
+}
+
+/* ll_unlink_generic() doesn't update the inode with the new link count.
+ * Instead, ll_ddelete() and ll_d_iput() will update it based upon if there
+ * is any lock existing. They will recycle dentries and inodes based upon locks
+ * too. b=20433
+ *
+ * Unlinks @name from @dir with md_unlink(), then calls ll_objects_destroy()
+ * on the reply.  Fails with -EBUSY when something is mounted on the child.
+ * Returns 0 or a negative errno. */
+static int ll_unlink_generic(struct inode *dir, struct dentry *dparent,
+			     struct dentry *dchild, struct qstr *name)
+{
+	struct ptlrpc_request *request = NULL;
+	struct md_op_data *op_data;
+	int rc;
+	ENTRY;
+	CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p)\n",
+	       name->len, name->name, dir->i_ino, dir->i_generation, dir);
+
+	/*
+	 * XXX: unlink bind mountpoint maybe call to here,
+	 * just check it as vfs_unlink does.
+	 */
+	if (unlikely(ll_d_mountpoint(dparent, dchild, name)))
+		RETURN(-EBUSY);
+
+	op_data = ll_prep_md_op_data(NULL, dir, NULL, name->name,
+				     name->len, 0, LUSTRE_OPC_ANY, NULL);
+	if (IS_ERR(op_data))
+		RETURN(PTR_ERR(op_data));
+
+	/* the child FID, when cached, goes into both op_fid3 and op_fid2 */
+	ll_get_child_fid(dir, name, &op_data->op_fid3);
+	op_data->op_fid2 = op_data->op_fid3;
+	rc = md_unlink(ll_i2sbi(dir)->ll_md_exp, op_data, &request);
+	ll_finish_md_op_data(op_data);
+	if (rc)
+		GOTO(out, rc);
+
+	ll_update_times(request, dir);
+	ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_UNLINK, 1);
+
+	rc = ll_objects_destroy(request, dir);
+ out:
+	ptlrpc_req_finished(request);
+	RETURN(rc);
+}
+
+/*
+ * Ask the MDS to rename @src_name in directory @src to @tgt_name in
+ * directory @tgt. On success the times of both directories are updated from
+ * the reply and the reply is handed to ll_objects_destroy().
+ *
+ * Returns -EBUSY when either name is a mountpoint, otherwise the result of
+ * the metadata operation / object destroy.
+ */
+static int ll_rename_generic(struct inode *src, struct dentry *src_dparent,
+			     struct dentry *src_dchild, struct qstr *src_name,
+			     struct inode *tgt, struct dentry *tgt_dparent,
+			     struct dentry *tgt_dchild, struct qstr *tgt_name)
+{
+	struct ptlrpc_request *req = NULL;
+	struct ll_sb_info *sbi = ll_i2sbi(src);
+	struct md_op_data *op;
+	int rc;
+	ENTRY;
+	CDEBUG(D_VFSTRACE,"VFS Op:oldname=%.*s,src_dir=%lu/%u(%p),newname=%.*s,"
+	       "tgt_dir=%lu/%u(%p)\n", src_name->len, src_name->name,
+	       src->i_ino, src->i_generation, src, tgt_name->len,
+	       tgt_name->name, tgt->i_ino, tgt->i_generation, tgt);
+
+	if (unlikely(ll_d_mountpoint(src_dparent, src_dchild, src_name) ||
+	    ll_d_mountpoint(tgt_dparent, tgt_dchild, tgt_name)))
+		RETURN(-EBUSY);
+
+	op = ll_prep_md_op_data(NULL, src, tgt, NULL, 0, 0,
+				LUSTRE_OPC_ANY, NULL);
+	if (IS_ERR(op))
+		RETURN(PTR_ERR(op));
+
+	/* fid3 / fid4 carry the source and target children respectively */
+	ll_get_child_fid(src, src_name, &op->op_fid3);
+	ll_get_child_fid(tgt, tgt_name, &op->op_fid4);
+	rc = md_rename(sbi->ll_md_exp, op,
+		       src_name->name, src_name->len,
+		       tgt_name->name, tgt_name->len, &req);
+	ll_finish_md_op_data(op);
+	if (rc)
+		goto out;
+
+	ll_update_times(req, src);
+	ll_update_times(req, tgt);
+	ll_stats_ops_tally(sbi, LPROC_LL_RENAME, 1);
+	rc = ll_objects_destroy(req, src);
+out:
+	ptlrpc_req_finished(req);
+
+	RETURN(rc);
+}
+
+/* ->mknod(): forward to ll_mknod_generic() with the dentry's own name. */
+static int ll_mknod(struct inode *dir, struct dentry *dchild, ll_umode_t mode,
+		    dev_t rdev)
+{
+	struct qstr *name = &dchild->d_name;
+
+	return ll_mknod_generic(dir, name, mode, old_encode_dev(rdev), dchild);
+}
+
+/* ->unlink(): forward to ll_unlink_generic() without a parent dentry. */
+static int ll_unlink(struct inode * dir, struct dentry *dentry)
+{
+	struct qstr *name = &dentry->d_name;
+
+	return ll_unlink_generic(dir, NULL, dentry, name);
+}
+
+/* ->mkdir(): forward to ll_mkdir_generic() with the dentry's own name. */
+static int ll_mkdir(struct inode *dir, struct dentry *dentry, ll_umode_t mode)
+{
+	struct qstr *name = &dentry->d_name;
+
+	return ll_mkdir_generic(dir, name, mode, dentry);
+}
+
+/* ->rmdir(): forward to ll_rmdir_generic() without a parent dentry. */
+static int ll_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	struct qstr *name = &dentry->d_name;
+
+	return ll_rmdir_generic(dir, NULL, dentry, name);
+}
+
+/* ->symlink(): forward to ll_symlink_generic() with the dentry's own name. */
+static int ll_symlink(struct inode *dir, struct dentry *dentry,
+		      const char *oldname)
+{
+	struct qstr *name = &dentry->d_name;
+
+	return ll_symlink_generic(dir, name, oldname, dentry);
+}
+
+/*
+ * ->link(): forward to ll_link_generic(), passing the inode behind
+ * @old_dentry as the source and the name of @new_dentry as the new link name.
+ */
+static int ll_link(struct dentry *old_dentry, struct inode *dir,
+		   struct dentry *new_dentry)
+{
+	struct inode *src = old_dentry->d_inode;
+	struct qstr *name = &new_dentry->d_name;
+
+	return ll_link_generic(src, dir, name, new_dentry);
+}
+
+/*
+ * ->rename(): perform the rename through ll_rename_generic() and, only if
+ * that succeeded, move the dentry with d_move().
+ */
+static int ll_rename(struct inode *old_dir, struct dentry *old_dentry,
+		     struct inode *new_dir, struct dentry *new_dentry)
+{
+	int rc = ll_rename_generic(old_dir, NULL,
+				   old_dentry, &old_dentry->d_name,
+				   new_dir, NULL, new_dentry,
+				   &new_dentry->d_name);
+
+	if (rc)
+		return rc;
+
+	d_move(old_dentry, new_dentry);
+	return 0;
+}
+
+/* Inode operations installed on Lustre directories. */
+struct inode_operations ll_dir_inode_operations = {
+	.mknod		= ll_mknod,
+	.atomic_open	= ll_atomic_open,
+	.lookup		= ll_lookup_nd,
+	.create		= ll_create_nd,
+	/* The non-raw entry points below are kept so NFSD needs no patching. */
+	.unlink		= ll_unlink,
+	.mkdir		= ll_mkdir,
+	.rmdir		= ll_rmdir,
+	.symlink	= ll_symlink,
+	.link		= ll_link,
+	.rename		= ll_rename,
+	.setattr	= ll_setattr,
+	.getattr	= ll_getattr,
+	.permission	= ll_inode_permission,
+	.setxattr	= ll_setxattr,
+	.getxattr	= ll_getxattr,
+	.listxattr	= ll_listxattr,
+	.removexattr	= ll_removexattr,
+	.get_acl	= ll_get_acl,
+};
+
+/*
+ * Attribute, permission, xattr and ACL operations only; no namespace
+ * operations are provided in this table.
+ */
+struct inode_operations ll_special_inode_operations = {
+	.setattr	= ll_setattr,
+	.getattr	= ll_getattr,
+	.permission	= ll_inode_permission,
+	.setxattr	= ll_setxattr,
+	.getxattr	= ll_getxattr,
+	.listxattr	= ll_listxattr,
+	.removexattr	= ll_removexattr,
+	.get_acl	= ll_get_acl,
+};

diff --git a/drivers/staging/lustre/lustre/llite/remote_perm.c b/drivers/staging/lustre/lustre/llite/remote_perm.c
new file mode 100644
index 0000000..68b2dc4
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/remote_perm.c

@@ -0,0 +1,333 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/llite/remote_perm.c
+ *
+ * Lustre Permission Cache for Remote Client
+ *
+ * Author: Lai Siyao <lsy@clusterfs.com>
+ * Author: Fan Yong <fanyong@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/version.h>
+
+#include <lustre_lite.h>
+#include <lustre_ha.h>
+#include <lustre_dlm.h>
+#include <lprocfs_status.h>
+#include <lustre_disk.h>
+#include <lustre_param.h>
+#include "llite_internal.h"
+
+/* Slab caches for permission entries and for the per-inode hash tables. */
+struct kmem_cache *ll_remote_perm_cachep;
+struct kmem_cache *ll_rmtperm_hash_cachep;
+
+/*
+ * Allocate one permission entry from ll_remote_perm_cachep and initialise
+ * its hash linkage. Returns NULL when the allocation fails.
+ */
+static inline struct ll_remote_perm *alloc_ll_remote_perm(void)
+{
+	struct ll_remote_perm *entry;
+
+	OBD_SLAB_ALLOC_PTR_GFP(entry, ll_remote_perm_cachep, GFP_KERNEL);
+	if (!entry)
+		return entry;
+
+	INIT_HLIST_NODE(&entry->lrp_list);
+	return entry;
+}
+
+/*
+ * Release a permission entry, unhashing it first if it is still linked into
+ * a hash chain. A NULL @lrp is accepted and ignored.
+ */
+static inline void free_ll_remote_perm(struct ll_remote_perm *lrp)
+{
+	if (lrp) {
+		if (!hlist_unhashed(&lrp->lrp_list))
+			hlist_del(&lrp->lrp_list);
+		OBD_SLAB_FREE(lrp, ll_remote_perm_cachep, sizeof(*lrp));
+	}
+}
+
+/*
+ * Allocate a table of REMOTE_PERM_HASHSIZE empty hash buckets from
+ * ll_rmtperm_hash_cachep. Returns NULL when the allocation fails.
+ */
+struct hlist_head *alloc_rmtperm_hash(void)
+{
+	struct hlist_head *table;
+	int slot;
+
+	OBD_SLAB_ALLOC_GFP(table, ll_rmtperm_hash_cachep,
+			   REMOTE_PERM_HASHSIZE * sizeof(*table),
+			   GFP_IOFS);
+	if (table) {
+		for (slot = 0; slot < REMOTE_PERM_HASHSIZE; slot++)
+			INIT_HLIST_HEAD(&table[slot]);
+	}
+
+	return table;
+}
+
+/*
+ * Free every entry hanging off each bucket of @hash, then the table itself.
+ * A NULL @hash is accepted and ignored.
+ */
+void free_rmtperm_hash(struct hlist_head *hash)
+{
+	struct ll_remote_perm *entry;
+	struct hlist_node *tmp;
+	int slot;
+
+	if (hash == NULL)
+		return;
+
+	for (slot = 0; slot < REMOTE_PERM_HASHSIZE; slot++) {
+		/* _safe variant: each entry is unhashed while iterating */
+		hlist_for_each_entry_safe(entry, tmp, &hash[slot], lrp_list)
+			free_ll_remote_perm(entry);
+	}
+
+	OBD_SLAB_FREE(hash, ll_rmtperm_hash_cachep,
+		      REMOTE_PERM_HASHSIZE * sizeof(*hash));
+}
+
+/*
+ * Map @uid to a bucket index by masking with REMOTE_PERM_HASHSIZE - 1
+ * (assumes the table size is a power of two -- TODO confirm).
+ */
+static inline int remote_perm_hashfunc(uid_t uid)
+{
+	const uid_t bucket_mask = REMOTE_PERM_HASHSIZE - 1;
+
+	return uid & bucket_mask;
+}
+
+/*
+ * Look up the cached remote permission of the current task for this inode
+ * and test it against @mask.
+ *
+ * The entry is matched on uid, gid, fsuid and fsgid of the current task.
+ *
+ * Returns 0 when every bit of @mask is granted by the cached entry, -EACCES
+ * when an entry exists but does not grant @mask, and -ENOENT when the inode
+ * has no permission table or no matching entry.
+ *
+ * NB: setxid permission is not checked here, instead it's done on
+ * MDT when client get remote permission.
+ */
+static int do_check_remote_perm(struct ll_inode_info *lli, int mask)
+{
+	struct hlist_head *head;
+	struct ll_remote_perm *lrp;
+	int found = 0, rc;
+	ENTRY;
+
+	if (!lli->lli_remote_perms)
+		RETURN(-ENOENT);
+
+	head = lli->lli_remote_perms + remote_perm_hashfunc(current_uid());
+
+	spin_lock(&lli->lli_lock);
+	hlist_for_each_entry(lrp, head, lrp_list) {
+		if (lrp->lrp_uid != current_uid())
+			continue;
+		if (lrp->lrp_gid != current_gid())
+			continue;
+		if (lrp->lrp_fsuid != current_fsuid())
+			continue;
+		if (lrp->lrp_fsgid != current_fsgid())
+			continue;
+		found = 1;
+		break;
+	}
+
+	if (!found)
+		GOTO(out, rc = -ENOENT);
+
+	CDEBUG(D_SEC, "found remote perm: %u/%u/%u/%u - %#x\n",
+	       lrp->lrp_uid, lrp->lrp_gid, lrp->lrp_fsuid, lrp->lrp_fsgid,
+	       lrp->lrp_access_perm);
+	rc = ((lrp->lrp_access_perm & mask) == mask) ? 0 : -EACCES;
+
+out:
+	spin_unlock(&lli->lli_lock);
+	/* pair the ENTRY above so the trace log also records this exit */
+	RETURN(rc);
+}
+
+/*
+ * Store the permission @perm received for @inode in the inode's remote
+ * permission cache, creating the hash table and/or the entry on demand.
+ *
+ * An existing entry with the same uid/gid/fsuid/fsgid only has its access
+ * bits refreshed; otherwise a new entry is inserted. lli_rmtperm_time is
+ * stamped on every successful update.
+ *
+ * Allocations are done with lli_lock dropped, so the table pointer and the
+ * hash chain are re-examined after the lock is taken again.
+ *
+ * Returns 0 on success or -ENOMEM when the table or entry cannot be
+ * allocated.
+ */
+int ll_update_remote_perm(struct inode *inode, struct mdt_remote_perm *perm)
+{
+	struct ll_inode_info *lli = ll_i2info(inode);
+	struct ll_remote_perm *lrp = NULL, *tmp = NULL;
+	struct hlist_head *head, *perm_hash = NULL;
+	ENTRY;
+
+	LASSERT(ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT);
+
+#if 0
+	if (perm->rp_uid != current->uid ||
+	    perm->rp_gid != current->gid ||
+	    perm->rp_fsuid != current->fsuid ||
+	    perm->rp_fsgid != current->fsgid) {
+		/* user might setxid in this small period */
+		CDEBUG(D_SEC,
+		       "remote perm user %u/%u/%u/%u != current %u/%u/%u/%u\n",
+		       perm->rp_uid, perm->rp_gid, perm->rp_fsuid,
+		       perm->rp_fsgid, current->uid, current->gid,
+		       current->fsuid, current->fsgid);
+		RETURN(-EAGAIN);
+	}
+#endif
+
+	/* Unlocked peek: pre-allocate a table outside the spinlock. */
+	if (!lli->lli_remote_perms) {
+		perm_hash = alloc_rmtperm_hash();
+		if (perm_hash == NULL) {
+			CERROR("alloc lli_remote_perms failed!\n");
+			RETURN(-ENOMEM);
+		}
+	}
+
+	spin_lock(&lli->lli_lock);
+
+	/* Install our table, or discard it if one was installed meanwhile. */
+	if (!lli->lli_remote_perms)
+		lli->lli_remote_perms = perm_hash;
+	else if (perm_hash)
+		free_rmtperm_hash(perm_hash);
+
+	head = lli->lli_remote_perms + remote_perm_hashfunc(perm->rp_uid);
+
+again:
+	hlist_for_each_entry(tmp, head, lrp_list) {
+		if (tmp->lrp_uid != perm->rp_uid)
+			continue;
+		if (tmp->lrp_gid != perm->rp_gid)
+			continue;
+		if (tmp->lrp_fsuid != perm->rp_fsuid)
+			continue;
+		if (tmp->lrp_fsgid != perm->rp_fsgid)
+			continue;
+		/* A match exists: drop any entry we allocated on a previous
+		 * pass and update the existing one instead. */
+		if (lrp)
+			free_ll_remote_perm(lrp);
+		lrp = tmp;
+		break;
+	}
+
+	if (!lrp) {
+		/* No match and nothing allocated yet: allocate with the lock
+		 * released, then rescan the chain from the top. */
+		spin_unlock(&lli->lli_lock);
+		lrp = alloc_ll_remote_perm();
+		if (!lrp) {
+			CERROR("alloc memory for ll_remote_perm failed!\n");
+			RETURN(-ENOMEM);
+		}
+		spin_lock(&lli->lli_lock);
+		goto again;
+	}
+
+	lrp->lrp_access_perm = perm->rp_access_perm;
+	/* lrp != tmp means lrp is our freshly allocated, not yet hashed
+	 * entry: fill in its identity and link it into the chain. */
+	if (lrp != tmp) {
+		lrp->lrp_uid	 = perm->rp_uid;
+		lrp->lrp_gid	 = perm->rp_gid;
+		lrp->lrp_fsuid       = perm->rp_fsuid;
+		lrp->lrp_fsgid       = perm->rp_fsgid;
+		hlist_add_head(&lrp->lrp_list, head);
+	}
+	lli->lli_rmtperm_time = cfs_time_current();
+	spin_unlock(&lli->lli_lock);
+
+	CDEBUG(D_SEC, "new remote perm@%p: %u/%u/%u/%u - %#x\n",
+	       lrp, lrp->lrp_uid, lrp->lrp_gid, lrp->lrp_fsuid, lrp->lrp_fsgid,
+	       lrp->lrp_access_perm);
+
+	RETURN(0);
+}
+
+/*
+ * Check whether the current task is granted @mask on @inode according to the
+ * remote permission cache, fetching the permission from the MDS when the
+ * cache has no answer.
+ *
+ * Each pass first consults the cache. If that does not settle the question,
+ * lli_rmtperm_mutex is taken, the cache is re-checked if lli_rmtperm_time
+ * changed in the meantime, and otherwise md_get_remote_perm() is issued and
+ * its reply stored through ll_update_remote_perm() before looping.
+ *
+ * Returns 0 when access is granted, otherwise a negative errno from the
+ * cache check, the RPC, reply unpacking (-EPROTO) or the cache update.
+ */
+int lustre_check_remote_perm(struct inode *inode, int mask)
+{
+	struct ll_inode_info *lli = ll_i2info(inode);
+	struct ll_sb_info *sbi = ll_i2sbi(inode);
+	struct ptlrpc_request *req = NULL;
+	struct mdt_remote_perm *perm;
+	struct obd_capa *oc;
+	cfs_time_t save;
+	int i = 0, rc;
+	ENTRY;
+
+	do {
+		/* remember the cache timestamp to detect concurrent updates */
+		save = lli->lli_rmtperm_time;
+		rc = do_check_remote_perm(lli, mask);
+		/* done on success, or on a definite answer once we have
+		 * already fetched from the MDS at least once (i != 0) */
+		if (!rc || (rc != -ENOENT && i))
+			break;
+
+		might_sleep();
+
+		mutex_lock(&lli->lli_rmtperm_mutex);
+		/* check again */
+		if (save != lli->lli_rmtperm_time) {
+			rc = do_check_remote_perm(lli, mask);
+			if (!rc || (rc != -ENOENT && i)) {
+				mutex_unlock(&lli->lli_rmtperm_mutex);
+				break;
+			}
+		}
+
+		/* bound the number of fetch attempts; exceeding it is
+		 * treated as a fatal internal error */
+		if (i++ > 5) {
+			CERROR("check remote perm falls in dead loop!\n");
+			LBUG();
+		}
+
+		oc = ll_mdscapa_get(inode);
+		rc = md_get_remote_perm(sbi->ll_md_exp, ll_inode2fid(inode), oc,
+					ll_i2suppgid(inode), &req);
+		capa_put(oc);
+		if (rc) {
+			mutex_unlock(&lli->lli_rmtperm_mutex);
+			break;
+		}
+
+		perm = req_capsule_server_swab_get(&req->rq_pill, &RMF_ACL,
+						   lustre_swab_mdt_remote_perm);
+		if (unlikely(perm == NULL)) {
+			mutex_unlock(&lli->lli_rmtperm_mutex);
+			rc = -EPROTO;
+			break;
+		}
+
+		rc = ll_update_remote_perm(inode, perm);
+		mutex_unlock(&lli->lli_rmtperm_mutex);
+		if (rc == -ENOMEM)
+			break;
+
+		/* reply consumed; release it before the next pass */
+		ptlrpc_req_finished(req);
+		req = NULL;
+	} while (1);
+	/* releases the request left over by any of the break paths above */
+	ptlrpc_req_finished(req);
+	RETURN(rc);
+}
+
+#if 0  /* NB: remote perms can't be freed in ll_mdc_blocking_ast of UPDATE lock,
+	* because it will fail sanity test 48.
+	*/
+/*
+ * Free every cached remote permission entry of @inode under lli_lock; the
+ * hash table itself is kept. Currently compiled out (see the note above).
+ *
+ * Uses the same hlist_for_each_entry_safe() form (entry, tmp, head, member)
+ * as free_rmtperm_hash() so the code builds again if it is re-enabled.
+ */
+void ll_free_remote_perms(struct inode *inode)
+{
+	struct ll_inode_info *lli = ll_i2info(inode);
+	struct hlist_head *hash = lli->lli_remote_perms;
+	struct ll_remote_perm *lrp;
+	struct hlist_node *next;
+	int i;
+
+	LASSERT(hash);
+
+	spin_lock(&lli->lli_lock);
+
+	for (i = 0; i < REMOTE_PERM_HASHSIZE; i++) {
+		hlist_for_each_entry_safe(lrp, next, hash + i,
+					      lrp_list)
+			free_ll_remote_perm(lrp);
+	}
+
+	spin_unlock(&lli->lli_lock);
+}
+#endif

diff --git a/drivers/staging/lustre/lustre/llite/rw.c b/drivers/staging/lustre/lustre/llite/rw.c
new file mode 100644
index 0000000..fac1178
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/rw.c

@@ -0,0 +1,1314 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/llite/rw.c
+ *
+ * Lustre Lite I/O page cache routines shared by different kernel revs
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <linux/writeback.h>
+#include <asm/uaccess.h>
+
+#include <linux/fs.h>
+#include <linux/stat.h>
+#include <asm/uaccess.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+/* current_is_kswapd() */
+#include <linux/swap.h>
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <lustre_lite.h>
+#include <obd_cksum.h>
+#include "llite_internal.h"
+#include <linux/lustre_compat25.h>
+
+/**
+ * Releases what ll_cl_init() set up for an address_space operation: the
+ * cl_page reference, the cl_io (only if ll_cl_init() created it) and finally
+ * the environment. Dual to ll_cl_init().
+ */
+static void ll_cl_fini(struct ll_cl_context *lcc)
+{
+	struct lu_env *env = lcc->lcc_env;
+	struct cl_io *cur_io = lcc->lcc_io;
+	struct cl_page *clpage = lcc->lcc_page;
+
+	LASSERT(lcc->lcc_cookie == current);
+	LASSERT(env != NULL);
+
+	/* drop the "cl_io" reference on the page, if one was taken */
+	if (clpage != NULL) {
+		lu_ref_del(&clpage->cp_reference, "cl_io", cur_io);
+		cl_page_put(env, clpage);
+	}
+
+	/* an io that was merely borrowed from the caller is left alone */
+	if (cur_io && lcc->lcc_created) {
+		cl_io_end(env, cur_io);
+		cl_io_unlock(env, cur_io);
+		cl_io_iter_fini(env, cur_io);
+		cl_io_fini(env, cur_io);
+	}
+
+	cl_env_put(env, &lcc->lcc_refcheck);
+}
+
+/**
+ * Initializes common cl-data at the typical address_space operation entry
+ * point.
+ *
+ * Grabs an environment, resets the per-environment ll_cl_context and looks
+ * up the cl_io currently attached to the environment. If there is none and
+ * \a create is set, a temporary single-page write io without locking is
+ * created for \a vmpage. Finally the cl_page for \a vmpage is looked up and
+ * a "cl_io" reference is taken on it.
+ *
+ * \param file   file the operation is performed on
+ * \param vmpage VM page being operated on
+ * \param create non-zero to create a temporary write io when none exists
+ *
+ * \retval the initialized context; release it with ll_cl_fini()
+ * \retval ERR_PTR() on failure, with everything already released
+ */
+static struct ll_cl_context *ll_cl_init(struct file *file,
+					struct page *vmpage, int create)
+{
+	struct ll_cl_context *lcc;
+	struct lu_env    *env;
+	struct cl_io     *io;
+	struct cl_object *clob;
+	struct ccc_io    *cio;
+
+	int refcheck;
+	int result = 0;
+
+	clob = ll_i2info(vmpage->mapping->host)->lli_clob;
+	LASSERT(clob != NULL);
+
+	env = cl_env_get(&refcheck);
+	if (IS_ERR(env))
+		return ERR_CAST(env);
+
+	lcc = &vvp_env_info(env)->vti_io_ctx;
+	memset(lcc, 0, sizeof(*lcc));
+	lcc->lcc_env = env;
+	lcc->lcc_refcheck = refcheck;
+	lcc->lcc_cookie = current;
+
+	cio = ccc_env_io(env);
+	io = cio->cui_cl.cis_io;
+	if (io == NULL && create) {
+		struct inode *inode = vmpage->mapping->host;
+		loff_t pos;
+
+		/* Being able to take i_mutex means the caller does not hold
+		 * it, which is not allowed on this path. */
+		if (mutex_trylock(&inode->i_mutex)) {
+			mutex_unlock(&(inode)->i_mutex);
+
+			/* this is too bad. Someone is trying to write the
+			 * page w/o holding inode mutex. This means we can
+			 * add dirty pages into cache during truncate */
+			CERROR("Proc %s is dirting page w/o inode lock, this "
+			       "will break truncate.\n", current->comm);
+			libcfs_debug_dumpstack(NULL);
+			LBUG();
+			return ERR_PTR(-EIO);
+		}
+
+		/*
+		 * Loop-back driver calls ->prepare_write() and ->sendfile()
+		 * methods directly, bypassing file system ->write() operation,
+		 * so cl_io has to be created here.
+		 */
+		io = ccc_env_thread_io(env);
+		ll_io_init(io, file, 1);
+
+		/* No lock at all for this kind of IO - we can't do it because
+		 * we have held page lock, it would cause deadlock.
+		 * XXX: This causes poor performance to loop device - One page
+		 *      per RPC.
+		 *      In order to get better performance, users should use
+		 *      lloop driver instead.
+		 */
+		io->ci_lockreq = CILR_NEVER;
+
+		/* Widen before shifting: the page index is an unsigned long,
+		 * so shifting it first would truncate the byte offset on
+		 * 32-bit kernels for pages beyond 4GB. */
+		pos = ((loff_t)vmpage->index << PAGE_CACHE_SHIFT);
+
+		/* Create a temp IO to serve write. */
+		result = cl_io_rw_init(env, io, CIT_WRITE, pos, PAGE_CACHE_SIZE);
+		if (result == 0) {
+			cio->cui_fd = LUSTRE_FPRIVATE(file);
+			cio->cui_iov = NULL;
+			cio->cui_nrsegs = 0;
+			result = cl_io_iter_init(env, io);
+			if (result == 0) {
+				result = cl_io_lock(env, io);
+				if (result == 0)
+					result = cl_io_start(env, io);
+			}
+		} else
+			result = io->ci_result;
+		/* tells ll_cl_fini() that it has to finalize this io */
+		lcc->lcc_created = 1;
+	}
+
+	lcc->lcc_io = io;
+	if (io == NULL)
+		result = -EIO;
+	if (result == 0) {
+		struct cl_page   *page;
+
+		LASSERT(io != NULL);
+		LASSERT(io->ci_state == CIS_IO_GOING);
+		LASSERT(cio->cui_fd == LUSTRE_FPRIVATE(file));
+		page = cl_page_find(env, clob, vmpage->index, vmpage,
+				    CPT_CACHEABLE);
+		if (!IS_ERR(page)) {
+			lcc->lcc_page = page;
+			lu_ref_add(&page->cp_reference, "cl_io", io);
+			result = 0;
+		} else
+			result = PTR_ERR(page);
+	}
+	if (result) {
+		/* undo whatever was set up so far and report the error */
+		ll_cl_fini(lcc);
+		lcc = ERR_PTR(result);
+	}
+
+	CDEBUG(D_VFSTRACE, "%lu@"DFID" -> %d %p %p\n",
+	       vmpage->index, PFID(lu_object_fid(&clob->co_lu)), result,
+	       env, io);
+	return lcc;
+}
+
+/*
+ * Return the ll_cl_context stored in the current environment.
+ *
+ * The assertions check that the context belongs to this environment and to
+ * the current task, i.e. that ll_cl_init() was run by this task before.
+ */
+static struct ll_cl_context *ll_cl_get(void)
+{
+	struct ll_cl_context *lcc;
+	struct lu_env *env;
+	int refcheck;
+
+	env = cl_env_get(&refcheck);
+	LASSERT(!IS_ERR(env));
+	lcc = &vvp_env_info(env)->vti_io_ctx;
+	LASSERT(env == lcc->lcc_env);
+	LASSERT(current == lcc->lcc_cookie);
+	/* drop only the reference taken just above */
+	cl_env_put(env, &refcheck);
+
+	/* env has got in ll_cl_init, so it is still usable. */
+	return lcc;
+}
+
+/**
+ * ->prepare_write() address space operation called by generic_file_write()
+ * for every page during write.
+ *
+ * On success (0) the cl context stays initialized and an extra page
+ * reference is held; both are released by ll_commit_write(), which therefore
+ * must be called afterwards. On failure everything is released here.
+ */
+int ll_prepare_write(struct file *file, struct page *vmpage, unsigned from,
+		     unsigned to)
+{
+	struct ll_cl_context *ctx;
+	struct lu_env *env;
+	struct cl_io *io;
+	struct cl_page *page;
+	int rc;
+	ENTRY;
+
+	ctx = ll_cl_init(file, vmpage, 1);
+	if (IS_ERR(ctx)) {
+		rc = PTR_ERR(ctx);
+		RETURN(rc);
+	}
+
+	env  = ctx->lcc_env;
+	io   = ctx->lcc_io;
+	page = ctx->lcc_page;
+
+	cl_page_assume(env, io, page);
+
+	rc = cl_io_prepare_write(env, io, page, from, to);
+	if (rc != 0) {
+		cl_page_unassume(env, io, page);
+		ll_cl_fini(ctx);
+		RETURN(rc);
+	}
+
+	/*
+	 * Pin the page so that it cannot be evicted from the cache before
+	 * ->commit_write() runs.
+	 */
+	cl_page_get(page);
+	lu_ref_add(&page->cp_reference, "prepare_write", current);
+
+	RETURN(rc);
+}
+
+/*
+ * ->commit_write() counterpart of ll_prepare_write(): commits the bytes
+ * [from, to) of the page through cl_io_commit_write(), then releases the
+ * page reference and cl context that ll_prepare_write() left behind.
+ *
+ * Returns 0 for an empty range, otherwise the result of the commit.
+ */
+int ll_commit_write(struct file *file, struct page *vmpage, unsigned from,
+		    unsigned to)
+{
+	struct ll_cl_context *ctx = ll_cl_get();
+	struct lu_env *env = ctx->lcc_env;
+	struct cl_page *page = ctx->lcc_page;
+	struct cl_io *io = ctx->lcc_io;
+	int rc = 0;
+	ENTRY;
+
+	LASSERT(cl_page_is_owned(page, io));
+	LASSERT(from <= to);
+
+	/* from == to is the short write case: there is nothing to commit */
+	if (from != to)
+		rc = cl_io_commit_write(env, io, page, from, to);
+
+	if (cl_page_is_owned(page, io))
+		cl_page_unassume(env, io, page);
+
+	/* Give back the reference taken by ll_prepare_write(). */
+	lu_ref_del(&page->cp_reference, "prepare_write", current);
+	cl_page_put(env, page);
+	ll_cl_fini(ctx);
+	RETURN(rc);
+}
+
+/*
+ * Look up the OSS capability of @inode for the request type @crt: the write
+ * opcode for CRT_WRITE, the read/write opcode for anything else.
+ */
+struct obd_capa *cl_capa_lookup(struct inode *inode, enum cl_req_type crt)
+{
+	__u64 capa_opc;
+
+	if (crt == CRT_WRITE)
+		capa_opc = CAPA_OPC_OSS_WRITE;
+	else
+		capa_opc = CAPA_OPC_OSS_RW;
+
+	return ll_osscapa_get(inode, capa_opc);
+}
+
+static void ll_ra_stats_inc_sbi(struct ll_sb_info *sbi, enum ra_stat which);
+
+/**
+ * Get readahead pages from the filesystem readahead pool of the client for a
+ * thread.
+ *
+ * \param sbi superblock for filesystem readahead state ll_ra_info
+ * \param ria per-thread readahead state
+ * \param pages number of pages requested for readahead for the thread.
+ *
+ * \retval number of pages reserved in ra_cur_pages for the caller, or 0 when
+ *         no reservation was made
+ *
+ * WARNING: This algorithm is used to reduce contention on sbi->ll_lock.
+ * It should work well if the ra_max_pages is much greater than the single
+ * file's read-ahead window, and not too many threads contending for
+ * these readahead pages.
+ *
+ * TODO: There may be a 'global sync problem' if many threads are trying
+ * to get an ra budget that is larger than the remaining readahead pages
+ * and reach here at exactly the same time. They will compute \a ret to
+ * consume the remaining pages, but will fail at atomic_add_return() and
+ * get a zero ra window, although there is still ra space remaining. - Jay */
+
+static unsigned long ll_ra_count_get(struct ll_sb_info *sbi,
+				     struct ra_io_arg *ria,
+				     unsigned long pages)
+{
+	struct ll_ra_info *ra = &sbi->ll_ra_info;
+	long ret;
+	ENTRY;
+
+	/* If read-ahead pages left are less than 1M, do not do read-ahead,
+	 * otherwise it will form small read RPC(< 1M), which hurt server
+	 * performance a lot. */
+	ret = min(ra->ra_max_pages - atomic_read(&ra->ra_cur_pages), pages);
+	if (ret < 0 || ret < min_t(long, PTLRPC_MAX_BRW_PAGES, pages))
+		GOTO(out, ret = 0);
+
+	/* If the non-strided (ria_pages == 0) readahead window
+	 * (ria_start + ret) has grown across an RPC boundary, then trim
+	 * readahead size by the amount beyond the RPC so it ends on an
+	 * RPC boundary. If the readahead window is already ending on
+	 * an RPC boundary (beyond_rpc == 0), or smaller than a full
+	 * RPC (beyond_rpc < ret) the readahead size is unchanged.
+	 * The (beyond_rpc != 0) check is skipped since the conditional
+	 * branch is more expensive than subtracting zero from the result.
+	 *
+	 * Strided read is left unaligned to avoid small fragments beyond
+	 * the RPC boundary from needing an extra read RPC. */
+	if (ria->ria_pages == 0) {
+		long beyond_rpc = (ria->ria_start + ret) % PTLRPC_MAX_BRW_PAGES;
+		if (/* beyond_rpc != 0 && */ beyond_rpc < ret)
+			ret -= beyond_rpc;
+	}
+
+	/* Reserve optimistically; back out if that pushed the pool over its
+	 * limit (this is the race described in the TODO above). */
+	if (atomic_add_return(ret, &ra->ra_cur_pages) > ra->ra_max_pages) {
+		atomic_sub(ret, &ra->ra_cur_pages);
+		ret = 0;
+	}
+
+out:
+	RETURN(ret);
+}
+
+/* Give @len pages back to the readahead pool of @sbi. */
+void ll_ra_count_put(struct ll_sb_info *sbi, unsigned long len)
+{
+	atomic_sub(len, &sbi->ll_ra_info.ra_cur_pages);
+}
+
+/*
+ * Increment the readahead statistics counter @which of @sbi. @which is
+ * asserted to be a valid index below _NR_RA_STAT.
+ */
+static void ll_ra_stats_inc_sbi(struct ll_sb_info *sbi, enum ra_stat which)
+{
+	LASSERTF(which >= 0 && which < _NR_RA_STAT, "which: %u\n", which);
+	lprocfs_counter_incr(sbi->ll_ra_stats, which);
+}
+
+/*
+ * Increment the readahead statistics counter @which of the superblock that
+ * owns @mapping.
+ */
+void ll_ra_stats_inc(struct address_space *mapping, enum ra_stat which)
+{
+	ll_ra_stats_inc_sbi(ll_i2sbi(mapping->host), which);
+}
+
+/* Dump every field of the readahead state @ras to the D_READA debug log. */
+#define RAS_CDEBUG(ras) \
+	CDEBUG(D_READA,						      \
+	       "lrp %lu cr %lu cp %lu ws %lu wl %lu nra %lu r %lu ri %lu "   \
+	       "csr %lu sf %lu sp %lu sl %lu \n",			    \
+	       (ras)->ras_last_readpage, (ras)->ras_consecutive_requests,    \
+	       (ras)->ras_consecutive_pages, (ras)->ras_window_start,	    \
+	       (ras)->ras_window_len, (ras)->ras_next_readahead,	    \
+	       (ras)->ras_requests, (ras)->ras_request_index,		    \
+	       (ras)->ras_consecutive_stride_requests,			    \
+	       (ras)->ras_stride_offset,				    \
+	       (ras)->ras_stride_pages, (ras)->ras_stride_length)
+
+/*
+ * Return non-zero when @index lies in [@point - @before, @point + @after].
+ * The window bounds are clamped to 0 and to the largest unsigned long when
+ * the subtraction or addition wraps around.
+ */
+static int index_in_window(unsigned long index, unsigned long point,
+			   unsigned long before, unsigned long after)
+{
+	unsigned long lo = point - before;
+	unsigned long hi = point + after;
+
+	/* point - before wrapped below zero */
+	if (lo > point)
+		lo = 0;
+	/* point + after wrapped past the maximum */
+	if (hi < point)
+		hi = ~0;
+
+	if (index < lo)
+		return 0;
+	return index <= hi;
+}
+
+/* Return the readahead state embedded in the private data of file @f. */
+static struct ll_readahead_state *ll_ras_get(struct file *f)
+{
+	struct ll_file_data *fdata = LUSTRE_FPRIVATE(f);
+
+	return &fdata->fd_ras;
+}
+
+/*
+ * Register the read described by @rar with the readahead state of @f: bump
+ * the request counters, reset the request index, record the current task as
+ * the reader and link @rar into ras_read_beads, all under ras_lock.
+ */
+void ll_ra_read_in(struct file *f, struct ll_ra_read *rar)
+{
+	struct ll_readahead_state *state = ll_ras_get(f);
+
+	spin_lock(&state->ras_lock);
+	rar->lrr_reader = current;
+	state->ras_request_index = 0;
+	state->ras_requests++;
+	state->ras_consecutive_requests++;
+	list_add(&rar->lrr_linkage, &state->ras_read_beads);
+	spin_unlock(&state->ras_lock);
+}
+
+/*
+ * Unregister @rar from the readahead state of @f by unlinking it under
+ * ras_lock. Counterpart of ll_ra_read_in().
+ */
+void ll_ra_read_ex(struct file *f, struct ll_ra_read *rar)
+{
+	struct ll_readahead_state *state = ll_ras_get(f);
+
+	spin_lock(&state->ras_lock);
+	list_del_init(&rar->lrr_linkage);
+	spin_unlock(&state->ras_lock);
+}
+
+/*
+ * Find the read registered by the current task in @ras, or NULL if there is
+ * none. The name suggests the caller holds ras_lock; ll_ra_read_get() does.
+ */
+static struct ll_ra_read *ll_ra_read_get_locked(struct ll_readahead_state *ras)
+{
+	struct ll_ra_read *bead;
+	struct ll_ra_read *mine = NULL;
+
+	list_for_each_entry(bead, &ras->ras_read_beads, lrr_linkage) {
+		if (bead->lrr_reader == current) {
+			mine = bead;
+			break;
+		}
+	}
+
+	return mine;
+}
+
+/*
+ * Locked wrapper around ll_ra_read_get_locked(): returns the read registered
+ * by the current task on file @f, or NULL.
+ */
+struct ll_ra_read *ll_ra_read_get(struct file *f)
+{
+	struct ll_readahead_state *state = ll_ras_get(f);
+	struct ll_ra_read *mine;
+
+	spin_lock(&state->ras_lock);
+	mine = ll_ra_read_get_locked(state);
+	spin_unlock(&state->ras_lock);
+
+	return mine;
+}
+
+/*
+ * Try to queue @page (backed by @vmpage) for read-ahead on @queue.
+ *
+ * Returns 1 when the page was added to @queue, -ENOLCK when
+ * cl_page_is_under_lock() did not report -EBUSY (the page is then deleted),
+ * and 0 when the page is skipped because it is already up to date or already
+ * marked for deferred up-to-date.
+ *
+ * The cl_page reference passed in is always dropped before returning.
+ */
+static int cl_read_ahead_page(const struct lu_env *env, struct cl_io *io,
+			      struct cl_page_list *queue, struct cl_page *page,
+			      struct page *vmpage)
+{
+	struct ccc_page *cp;
+	int	      rc;
+
+	ENTRY;
+
+	rc = 0;
+	cl_page_assume(env, io, page);
+	lu_ref_add(&page->cp_reference, "ra", current);
+	cp = cl2ccc_page(cl_page_at(page, &vvp_device_type));
+	if (!cp->cpg_defer_uptodate && !PageUptodate(vmpage)) {
+		rc = cl_page_is_under_lock(env, io, page);
+		/* NOTE(review): -EBUSY is the "go ahead" result here,
+		 * presumably meaning the page is covered by a lock --
+		 * confirm against cl_page_is_under_lock(). */
+		if (rc == -EBUSY) {
+			cp->cpg_defer_uptodate = 1;
+			cp->cpg_ra_used = 0;
+			cl_page_list_add(queue, page);
+			rc = 1;
+		} else {
+			cl_page_delete(env, page);
+			rc = -ENOLCK;
+		}
+	} else {
+		/* skip completed pages */
+		cl_page_unassume(env, io, page);
+	}
+	lu_ref_del(&page->cp_reference, "ra", current);
+	cl_page_put(env, page);
+	RETURN(rc);
+}
+
+/**
+ * Initiates read-ahead of a page with given index.
+ *
+ * The VM page is grabbed without blocking. If it still belongs to
+ * \a mapping, its cl_page is looked up and handed to cl_read_ahead_page().
+ * Every failure is counted in the read-ahead statistics and logged.
+ *
+ * \retval     +ve: page was added to \a queue.
+ *
+ * \retval -ENOLCK: there is no extent lock for this part of a file, stop
+ *		  read-ahead.
+ *
+ * \retval  -ve, 0: page wasn't added to \a queue for other reason.
+ */
+static int ll_read_ahead_page(const struct lu_env *env, struct cl_io *io,
+			      struct cl_page_list *queue,
+			      pgoff_t index, struct address_space *mapping)
+{
+	struct page      *vmpage;
+	struct cl_object *clob  = ll_i2info(mapping->host)->lli_clob;
+	struct cl_page   *page;
+	enum ra_stat      which = _NR_RA_STAT; /* keep gcc happy */
+	int	       rc    = 0;
+	const char       *msg   = NULL;
+
+	ENTRY;
+
+	/* grab_cache_page_nowait() takes no gfp mask, so none is built */
+	vmpage = grab_cache_page_nowait(mapping, index);
+	if (vmpage != NULL) {
+		/* Check if vmpage was truncated or reclaimed */
+		if (vmpage->mapping == mapping) {
+			page = cl_page_find(env, clob, vmpage->index,
+					    vmpage, CPT_CACHEABLE);
+			if (!IS_ERR(page)) {
+				rc = cl_read_ahead_page(env, io, queue,
+							page, vmpage);
+				if (rc == -ENOLCK) {
+					which = RA_STAT_FAILED_MATCH;
+					msg   = "lock match failed";
+				}
+			} else {
+				which = RA_STAT_FAILED_GRAB_PAGE;
+				msg   = "cl_page_find failed";
+			}
+		} else {
+			which = RA_STAT_WRONG_GRAB_PAGE;
+			msg   = "g_c_p_n returned invalid page";
+		}
+		/* a queued page (rc == 1) stays locked; unlock all others */
+		if (rc != 1)
+			unlock_page(vmpage);
+		page_cache_release(vmpage);
+	} else {
+		which = RA_STAT_FAILED_GRAB_PAGE;
+		msg   = "g_c_p_n failed";
+	}
+	if (msg != NULL) {
+		ll_ra_stats_inc(mapping, which);
+		CDEBUG(D_READA, "%s\n", msg);
+	}
+	RETURN(rc);
+}
+
+/* Dump the fields of the read-ahead io argument @ria to the D_READA log. */
+#define RIA_DEBUG(ria)							\
+	CDEBUG(D_READA, "rs %lu re %lu ro %lu rl %lu rp %lu\n",	\
+	       ria->ria_start, ria->ria_end, ria->ria_stoff,		\
+	       ria->ria_length, ria->ria_pages)
+
+/* Limit this to the blocksize instead of PTLRPC_BRW_MAX_SIZE, since we don't
+ * know what the actual RPC size is.  If this needs to change, it makes more
+ * sense to tune the i_blkbits value for the file based on the OSTs it is
+ * striped over, rather than having a constant value for all files here. */
+
+/* RAS_INCREASE_STEP should be (1UL << (inode->i_blkbits - PAGE_CACHE_SHIFT)).
+ * Temporarily set RAS_INCREASE_STEP to 1MB. After 4MB RPC is enabled
+ * by default, this should be adjusted corresponding with max_read_ahead_mb
+ * and max_read_ahead_per_file_mb otherwise the readahead budget can be used
+ * up quickly which will affect read performance significantly. See LU-2816.
+ * Note that the inode argument is currently unused by the expansion. */
+#define RAS_INCREASE_STEP(inode) (ONE_MB_BRW_SIZE >> PAGE_CACHE_SHIFT)
+
+/*
+ * Non-zero once more than one consecutive stride request has been counted
+ * in @ras.
+ */
+static inline int stride_io_mode(struct ll_readahead_state *ras)
+{
+	if (ras->ras_consecutive_stride_requests > 1)
+		return 1;
+	return 0;
+}
+/* The function calculates how many pages will be read in
+ * [off, off + length], in such stride IO area,
+ * stride_offset = st_off, stride_length = st_len,
+ * stride_pages = st_pgs
+ *
+ *   |------------------|*****|------------------|*****|------------|*****|....
+ * st_off
+ *   |--- st_pgs     ---|
+ *   |-----     st_len   -----|
+ *
+ *	      How many pages it should read in such pattern
+ *	      |-------------------------------------------------------------|
+ *	      off
+ *	      |<------		  length		      ------->|
+ *
+ *	  =   |<----->|  +  |-------------------------------------| +   |---|
+ *	     start_left		 st_pgs * i		    end_left
+ *
+ * Returns \a length unchanged when there is no stride (st_len == 0), when
+ * length is 0, or when the range does not reach past st_off.
+ */
+static unsigned long
+stride_pg_count(pgoff_t st_off, unsigned long st_len, unsigned long st_pgs,
+		unsigned long off, unsigned long length)
+{
+	/* range bounds relative to the stride origin, clamped at 0 */
+	__u64 start = off > st_off ? off - st_off : 0;
+	__u64 end = off + length > st_off ? off + length - st_off : 0;
+	unsigned long start_left = 0;
+	unsigned long end_left = 0;
+	unsigned long pg_count;
+
+	if (st_len == 0 || length == 0 || end == 0)
+		return length;
+
+	/* do_div() leaves the stride number in start and returns the offset
+	 * inside that stride; turn it into the pages still to read there */
+	start_left = do_div(start, st_len);
+	if (start_left < st_pgs)
+		start_left = st_pgs - start_left;
+	else
+		start_left = 0;
+
+	/* likewise for the end: pages to read in the last stride */
+	end_left = do_div(end, st_len);
+	if (end_left > st_pgs)
+		end_left = st_pgs;
+
+	CDEBUG(D_READA, "start "LPU64", end "LPU64" start_left %lu end_left %lu \n",
+	       start, end, start_left, end_left);
+
+	/* start and end now hold stride numbers */
+	if (start == end)
+		pg_count = end_left - (st_pgs - start_left);
+	else
+		pg_count = start_left + st_pgs * (end - start - 1) + end_left;
+
+	CDEBUG(D_READA, "st_off %lu, st_len %lu st_pgs %lu off %lu length %lu "
+	       "pgcount %lu\n", st_off, st_len, st_pgs, off, length, pg_count);
+
+	return pg_count;
+}
+
+/*
+ * Number of pages stride_pg_count() reports for the inclusive range
+ * [ria_start, ria_end] of @ria; the range is empty when ria_end is below
+ * ria_start.
+ */
+static int ria_page_count(struct ra_io_arg *ria)
+{
+	__u64 length = 0;
+
+	if (ria->ria_end >= ria->ria_start)
+		length = ria->ria_end - ria->ria_start + 1;
+
+	return stride_pg_count(ria->ria_stoff, ria->ria_length,
+			       ria->ria_pages, ria->ria_start,
+			       length);
+}
+
+/* Check whether the index is in the defined ra-window. */
+static int ras_inside_ra_window(unsigned long idx, struct ra_io_arg *ria)
+{
+	/* ria_length == 0 or ria_length == ria_pages means non-stride I/O
+	 * mode, where idx is always inside the read-ahead window. */
+	if (ria->ria_length == 0 || ria->ria_length == ria->ria_pages)
+		return 1;
+
+	/* Stride I/O mode: idx must not precede the stride offset ... */
+	if (idx < ria->ria_stoff)
+		return 0;
+
+	/* ... and must fall within the first ria_pages of its stride. */
+	return (idx - ria->ria_stoff) % ria->ria_length < ria->ria_pages;
+}
+
+/*
+ * Walk the page indices [ria->ria_start, ria->ria_end] and call
+ * ll_read_ahead_page() for every index inside the read-ahead window, for
+ * as long as *reserved_pages is non-zero.
+ *
+ * Each call that returns 1 consumes one reserved page and is counted; a
+ * return of -ENOLCK stops the walk.  In stride mode the gap between two
+ * stride chunks is jumped over instead of being visited page by page.
+ *
+ * On return *ra_end holds the index at which the walk stopped and
+ * *reserved_pages the reservations that were not used.  Returns the
+ * number of counted pages.
+ */
+static int ll_read_ahead_pages(const struct lu_env *env,
+			       struct cl_io *io, struct cl_page_list *queue,
+			       struct ra_io_arg *ria,
+			       unsigned long *reserved_pages,
+			       struct address_space *mapping,
+			       unsigned long *ra_end)
+{
+	int rc, count = 0, stride_ria;
+	unsigned long page_idx;
+
+	LASSERT(ria != NULL);
+	RIA_DEBUG(ria);
+
+	stride_ria = ria->ria_length > ria->ria_pages && ria->ria_pages > 0;
+	for (page_idx = ria->ria_start; page_idx <= ria->ria_end &&
+			*reserved_pages > 0; page_idx++) {
+		if (ras_inside_ra_window(page_idx, ria)) {
+			/* If the page is inside the read-ahead window*/
+			rc = ll_read_ahead_page(env, io, queue,
+						page_idx, mapping);
+			if (rc == 1) {
+				(*reserved_pages)--;
+				count ++;
+			} else if (rc == -ENOLCK)
+				break;
+		} else if (stride_ria) {
+			/* If it is not in the read-ahead window, and it is
+			 * read-ahead mode, then check whether it should skip
+			 * the stride gap */
+			pgoff_t offset;
+			pgoff_t skip;
+			/* FIXME: This assertion only is valid when it is for
+			 * forward read-ahead, it will be fixed when backward
+			 * read-ahead is implemented */
+			LASSERTF(page_idx > ria->ria_stoff, "Invalid page_idx %lu "
+				"rs %lu re %lu ro %lu rl %lu rp %lu\n", page_idx,
+				ria->ria_start, ria->ria_end, ria->ria_stoff,
+				ria->ria_length, ria->ria_pages);
+			offset = page_idx - ria->ria_stoff;
+			offset = offset % (ria->ria_length);
+			if (offset > ria->ria_pages) {
+				/* "continue" still executes the loop's
+				 * page_idx++, so stop one page short of the
+				 * next stride; otherwise the first page of
+				 * every following chunk would be skipped. */
+				skip = ria->ria_length - offset - 1;
+				page_idx += skip;
+				CDEBUG(D_READA, "i %lu skip %lu \n", page_idx,
+				       skip);
+				continue;
+			}
+		}
+	}
+	*ra_end = page_idx;
+	return count;
+}
+
+/*
+ * Queue read-ahead pages for the file behind @mapping according to the
+ * read-ahead state @ras.
+ *
+ * The page range [start, end] is chosen under ras_lock: it begins at
+ * ras_next_readahead, ends at the end of the read-ahead window, and is
+ * then trimmed to an RPC boundary and to the last page covered by the
+ * known file size (cat_kms).  Pages are reserved with ll_ra_count_get()
+ * and issued by ll_read_ahead_pages(); unused reservations are returned,
+ * and if the walk stopped early ras_next_readahead is moved back.
+ *
+ * Returns the error from cl_object_attr_get() if it fails, 0 when there
+ * is nothing to read (kms == 0, empty window, or zero page count), and
+ * otherwise the value returned by ll_read_ahead_pages().
+ *
+ * @flags is not referenced in this function body.
+ */
+int ll_readahead(const struct lu_env *env, struct cl_io *io,
+		 struct ll_readahead_state *ras, struct address_space *mapping,
+		 struct cl_page_list *queue, int flags)
+{
+	struct vvp_io *vio = vvp_env_io(env);
+	struct vvp_thread_info *vti = vvp_env_info(env);
+	struct cl_attr *attr = ccc_env_thread_attr(env);
+	unsigned long start = 0, end = 0, reserved;
+	unsigned long ra_end, len;
+	struct inode *inode;
+	struct ll_ra_read *bead;
+	struct ra_io_arg *ria = &vti->vti_ria;
+	struct ll_inode_info *lli;
+	struct cl_object *clob;
+	int ret = 0;
+	__u64 kms;
+	ENTRY;
+
+	inode = mapping->host;
+	lli = ll_i2info(inode);
+	clob = lli->lli_clob;
+
+	/* ria_stoff/ria_length/ria_pages stay 0 unless stride mode fills
+	 * them in below */
+	memset(ria, 0, sizeof *ria);
+
+	cl_object_attr_lock(clob);
+	ret = cl_object_attr_get(env, clob, attr);
+	cl_object_attr_unlock(clob);
+
+	if (ret != 0)
+		RETURN(ret);
+	kms = attr->cat_kms;
+	if (kms == 0) {
+		ll_ra_stats_inc(mapping, RA_STAT_ZERO_LEN);
+		RETURN(0);
+	}
+
+	spin_lock(&ras->ras_lock);
+	if (vio->cui_ra_window_set)
+		bead = &vio->cui_bead;
+	else
+		bead = NULL;
+
+	/* Enlarge the RA window to encompass the full read */
+	if (bead != NULL && ras->ras_window_start + ras->ras_window_len <
+	    bead->lrr_start + bead->lrr_count) {
+		ras->ras_window_len = bead->lrr_start + bead->lrr_count -
+				      ras->ras_window_start;
+	}
+	/* Reserve a part of the read-ahead window that we'll be issuing */
+	if (ras->ras_window_len) {
+		start = ras->ras_next_readahead;
+		end = ras->ras_window_start + ras->ras_window_len - 1;
+	}
+	if (end != 0) {
+		unsigned long rpc_boundary;
+		/*
+		 * Align RA window to an optimal boundary.
+		 *
+		 * XXX This would be better to align to cl_max_pages_per_rpc
+		 * instead of PTLRPC_MAX_BRW_PAGES, because the RPC size may
+		 * be aligned to the RAID stripe size in the future and that
+		 * is more important than the RPC size.
+		 */
+		/* Note: we only trim the RPC, instead of extending the RPC
+		 * to the boundary, so to avoid reading too much pages during
+		 * random reading. */
+		rpc_boundary = ((end + 1) & (~(PTLRPC_MAX_BRW_PAGES - 1)));
+		if (rpc_boundary > 0)
+			rpc_boundary--;
+
+		if (rpc_boundary  > start)
+			end = rpc_boundary;
+
+		/* Truncate RA window to end of file */
+		end = min(end, (unsigned long)((kms - 1) >> PAGE_CACHE_SHIFT));
+
+		/* max() keeps the value at end if end + 1 wraps around */
+		ras->ras_next_readahead = max(end, end + 1);
+		RAS_CDEBUG(ras);
+	}
+	ria->ria_start = start;
+	ria->ria_end = end;
+	/* If stride I/O mode is detected, get stride window*/
+	if (stride_io_mode(ras)) {
+		ria->ria_stoff = ras->ras_stride_offset;
+		ria->ria_length = ras->ras_stride_length;
+		ria->ria_pages = ras->ras_stride_pages;
+	}
+	spin_unlock(&ras->ras_lock);
+
+	if (end == 0) {
+		ll_ra_stats_inc(mapping, RA_STAT_ZERO_WINDOW);
+		RETURN(0);
+	}
+	len = ria_page_count(ria);
+	if (len == 0)
+		RETURN(0);
+
+	/* may grant fewer pages than requested; that case is only counted
+	 * in the statistics here */
+	reserved = ll_ra_count_get(ll_i2sbi(inode), ria, len);
+	if (reserved < len)
+		ll_ra_stats_inc(mapping, RA_STAT_MAX_IN_FLIGHT);
+
+	CDEBUG(D_READA, "reserved page %lu ra_cur %d ra_max %lu\n", reserved,
+	       atomic_read(&ll_i2sbi(inode)->ll_ra_info.ra_cur_pages),
+	       ll_i2sbi(inode)->ll_ra_info.ra_max_pages);
+
+	ret = ll_read_ahead_pages(env, io, queue,
+				  ria, &reserved, mapping, &ra_end);
+
+	/* NOTE(review): reserved is unsigned long, so this condition can
+	 * never be false */
+	LASSERTF(reserved >= 0, "reserved %lu\n", reserved);
+	/* give back the reservations that were not consumed */
+	if (reserved != 0)
+		ll_ra_count_put(ll_i2sbi(inode), reserved);
+
+	if (ra_end == end + 1 && ra_end == (kms >> PAGE_CACHE_SHIFT))
+		ll_ra_stats_inc(mapping, RA_STAT_EOF);
+
+	/* if we didn't get to the end of the region we reserved from
+	 * the ras we need to go back and update the ras so that the
+	 * next read-ahead tries from where we left off.  we only do so
+	 * if the region we failed to issue read-ahead on is still ahead
+	 * of the app and behind the next index to start read-ahead from */
+	CDEBUG(D_READA, "ra_end %lu end %lu stride end %lu \n",
+	       ra_end, end, ria->ria_end);
+
+	if (ra_end != end + 1) {
+		spin_lock(&ras->ras_lock);
+		if (ra_end < ras->ras_next_readahead &&
+		    index_in_window(ra_end, ras->ras_window_start, 0,
+				    ras->ras_window_len)) {
+			ras->ras_next_readahead = ra_end;
+			RAS_CDEBUG(ras);
+		}
+		spin_unlock(&ras->ras_lock);
+	}
+
+	RETURN(ret);
+}
+
+/*
+ * Set the read-ahead window start to @index with the low bits selected by
+ * (RAS_INCREASE_STEP(inode) - 1) cleared.  Presumably the step is a power
+ * of two, which makes this a round-down to a step boundary -- TODO confirm.
+ */
+static void ras_set_start(struct inode *inode, struct ll_readahead_state *ras,
+			  unsigned long index)
+{
+	ras->ras_window_start = index & (~(RAS_INCREASE_STEP(inode) - 1));
+}
+
+/*
+ * Restart sequential read-ahead tracking at page @index: the consecutive
+ * counters and the window length are cleared and the window is re-anchored
+ * at @index.
+ *
+ * called with the ras_lock held or from places where it doesn't matter
+ */
+static void ras_reset(struct inode *inode, struct ll_readahead_state *ras,
+		      unsigned long index)
+{
+	/* forget the sequential-read history and drop the window */
+	ras->ras_consecutive_pages = 0;
+	ras->ras_consecutive_requests = 0;
+	ras->ras_window_len = 0;
+	ras->ras_last_readpage = index;
+
+	/* the window start has to be in place before it is compared below */
+	ras_set_start(inode, ras, index);
+	if (ras->ras_window_start > index)
+		ras->ras_next_readahead = ras->ras_window_start;
+	else
+		ras->ras_next_readahead = index;
+
+	RAS_CDEBUG(ras);
+}
+
+/*
+ * Drop the recorded stride pattern: zero the stride geometry and the
+ * count of consecutive stride requests.
+ *
+ * called with the ras_lock held or from places where it doesn't matter
+ */
+static void ras_stride_reset(struct ll_readahead_state *ras)
+{
+	ras->ras_stride_pages = 0;
+	ras->ras_stride_length = 0;
+	ras->ras_consecutive_stride_requests = 0;
+
+	RAS_CDEBUG(ras);
+}
+
+/*
+ * Initialize the read-ahead state @ras: set up its spinlock, reset the
+ * window to page 0, clear the request counter and initialize the list of
+ * read beads.
+ */
+void ll_readahead_init(struct inode *inode, struct ll_readahead_state *ras)
+{
+	spin_lock_init(&ras->ras_lock);
+	ras_reset(inode, ras, 0);
+	ras->ras_requests = 0;
+	INIT_LIST_HEAD(&ras->ras_read_beads);
+}
+
+/*
+ * Tell whether a read of page @index continues the recorded stride
+ * pattern.  Returns 1 if it does, 0 otherwise (including when no usable
+ * stride pattern is recorded).
+ */
+static int index_in_stride_window(struct ll_readahead_state *ras,
+				  unsigned long index)
+{
+	unsigned long gap;
+
+	/* no stride recorded, or a "stride" without any hole in it */
+	if (ras->ras_stride_length == 0 || ras->ras_stride_pages == 0 ||
+	    ras->ras_stride_length == ras->ras_stride_pages)
+		return 0;
+
+	/* pages skipped between the last read page and this one */
+	gap = index - ras->ras_last_readpage - 1;
+
+	if (gap != 0) {
+		/* a jump: it must be exactly the hole of the stride, taken
+		 * right after a complete chunk of data pages was read */
+		if (ras->ras_stride_length - ras->ras_stride_pages != gap)
+			return 0;
+		return ras->ras_consecutive_pages == ras->ras_stride_pages;
+	}
+
+	/* contiguous read: still within the data pages of the chunk? */
+	return ras->ras_consecutive_pages + 1 <= ras->ras_stride_pages;
+}
+
+/*
+ * Record a candidate stride pattern from a jump to page @index: the
+ * stride's data part is the number of consecutive pages read so far and
+ * its length is that plus the gap to @index.  A non-forward jump
+ * (@index <= ras_last_readpage) resets the stride state instead.
+ *
+ * Asserts that ras_request_index and ras_consecutive_stride_requests are
+ * both 0 on entry.
+ */
+static void ras_update_stride_detector(struct ll_readahead_state *ras,
+				       unsigned long index)
+{
+	/* pages skipped between the last read page and @index */
+	unsigned long stride_gap = index - ras->ras_last_readpage - 1;
+
+	/* NOTE(review): the values stored here appear to be either reset or
+	 * overwritten with the same expressions below before anything
+	 * reads them -- confirm whether this block is still needed */
+	if (!stride_io_mode(ras) && (stride_gap != 0 ||
+	     ras->ras_consecutive_stride_requests == 0)) {
+		ras->ras_stride_pages = ras->ras_consecutive_pages;
+		ras->ras_stride_length = stride_gap +ras->ras_consecutive_pages;
+	}
+	LASSERT(ras->ras_request_index == 0);
+	LASSERT(ras->ras_consecutive_stride_requests == 0);
+
+	if (index <= ras->ras_last_readpage) {
+		/*Reset stride window for forward read*/
+		ras_stride_reset(ras);
+		return;
+	}
+
+	ras->ras_stride_pages = ras->ras_consecutive_pages;
+	ras->ras_stride_length = stride_gap +ras->ras_consecutive_pages;
+
+	RAS_CDEBUG(ras);
+	return;
+}
+
+/*
+ * Number of stride data pages within the first @len pages counted from
+ * the stride offset recorded in @ras.
+ */
+static unsigned long
+stride_page_count(struct ll_readahead_state *ras, unsigned long len)
+{
+	unsigned long stride_start = ras->ras_stride_offset;
+
+	/* the counted range begins exactly at the stride offset */
+	return stride_pg_count(stride_start, ras->ras_stride_length,
+			       ras->ras_stride_pages, stride_start, len);
+}
+
+/*
+ * Grow the stride read-ahead window by @inc_len data pages, laid out
+ * according to the stride pattern.  The new length is stored only when the
+ * number of data pages it covers does not exceed ra_max_pages_per_file.
+ */
+static void ras_stride_increase_window(struct ll_readahead_state *ras,
+				       struct ll_ra_info *ra,
+				       unsigned long inc_len)
+{
+	unsigned long window_end;
+	unsigned long tail;
+	unsigned long new_len;
+
+	LASSERT(ras->ras_stride_length > 0);
+	LASSERTF(ras->ras_window_start + ras->ras_window_len
+		 >= ras->ras_stride_offset, "window_start %lu, window_len %lu"
+		 " stride_offset %lu\n", ras->ras_window_start,
+		 ras->ras_window_len, ras->ras_stride_offset);
+
+	/* how far the window reaches into its last, partial stride */
+	window_end = ras->ras_window_start + ras->ras_window_len;
+	tail = (window_end - ras->ras_stride_offset) % ras->ras_stride_length;
+
+	/* window length with that partial stride cut off */
+	new_len = ras->ras_window_len - tail;
+
+	/* data pages to lay out again: those of the partial stride (at most
+	 * one full chunk) plus the requested increment */
+	if (tail >= ras->ras_stride_pages)
+		tail = ras->ras_stride_pages;
+	tail += inc_len;
+
+	LASSERT(ras->ras_stride_pages != 0);
+
+	/* whole strides for every full chunk, plus the remaining pages */
+	new_len += (tail / ras->ras_stride_pages) * ras->ras_stride_length +
+		   tail % ras->ras_stride_pages;
+
+	if (stride_page_count(ras, new_len) <= ra->ra_max_pages_per_file)
+		ras->ras_window_len = new_len;
+
+	RAS_CDEBUG(ras);
+}
+
+/*
+ * Enlarge the read-ahead window by one RAS_INCREASE_STEP(inode): along the
+ * stride pattern in stride I/O mode, otherwise linearly and capped at
+ * ra_max_pages_per_file.
+ */
+static void ras_increase_window(struct inode *inode,
+				struct ll_readahead_state *ras,
+				struct ll_ra_info *ra)
+{
+	/* The stretch of ra-window should be aligned with max rpc_size
+	 * but current clio architecture does not support retrieve such
+	 * information from lower layer. FIXME later
+	 */
+	if (stride_io_mode(ras)) {
+		ras_stride_increase_window(ras, ra, RAS_INCREASE_STEP(inode));
+		return;
+	}
+
+	ras->ras_window_len = min(ras->ras_window_len +
+				  RAS_INCREASE_STEP(inode),
+				  ra->ra_max_pages_per_file);
+}
+
+/*
+ * Update the read-ahead state @ras for a read of page @index; @hit tells
+ * whether the access is accounted as a hit or a miss in the statistics.
+ *
+ * Runs entirely under ras_lock.  Depending on where @index lies relative
+ * to the last read page, the current window and the recorded stride
+ * pattern, the function resets the window, updates or resets the stride
+ * detector, or extends the window.  ras_request_index is incremented on
+ * every exit path.
+ */
+void ras_update(struct ll_sb_info *sbi, struct inode *inode,
+		struct ll_readahead_state *ras, unsigned long index,
+		unsigned hit)
+{
+	struct ll_ra_info *ra = &sbi->ll_ra_info;
+	/* zero: read far from the last page; ra_miss: miss inside the
+	 * already issued part of the window; stride_detect: more than one
+	 * consecutive stride request seen */
+	int zero = 0, stride_detect = 0, ra_miss = 0;
+	ENTRY;
+
+	spin_lock(&ras->ras_lock);
+
+	ll_ra_stats_inc_sbi(sbi, hit ? RA_STAT_HIT : RA_STAT_MISS);
+
+	/* reset the read-ahead window in two cases.  First when the app seeks
+	 * or reads to some other part of the file.  Secondly if we get a
+	 * read-ahead miss that we think we've previously issued.  This can
+	 * be a symptom of there being so many read-ahead pages that the VM is
+	 * reclaiming it before we get to it. */
+	if (!index_in_window(index, ras->ras_last_readpage, 8, 8)) {
+		zero = 1;
+		ll_ra_stats_inc_sbi(sbi, RA_STAT_DISTANT_READPAGE);
+	} else if (!hit && ras->ras_window_len &&
+		   index < ras->ras_next_readahead &&
+		   index_in_window(index, ras->ras_window_start, 0,
+				   ras->ras_window_len)) {
+		ra_miss = 1;
+		ll_ra_stats_inc_sbi(sbi, RA_STAT_MISS_IN_WINDOW);
+	}
+
+	/* On the second access to a file smaller than the tunable
+	 * ra_max_read_ahead_whole_pages trigger RA on all pages in the
+	 * file up to ra_max_pages_per_file.  This is simply a best effort
+	 * and only occurs once per open file.  Normal RA behavior is reverted
+	 * to for subsequent IO.  The mmap case does not increment
+	 * ras_requests and thus can never trigger this behavior. */
+	if (ras->ras_requests == 2 && !ras->ras_request_index) {
+		__u64 kms_pages;
+
+		/* file size in pages, rounded up */
+		kms_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
+			    PAGE_CACHE_SHIFT;
+
+		CDEBUG(D_READA, "kmsp "LPU64" mwp %lu mp %lu\n", kms_pages,
+		       ra->ra_max_read_ahead_whole_pages, ra->ra_max_pages_per_file);
+
+		if (kms_pages &&
+		    kms_pages <= ra->ra_max_read_ahead_whole_pages) {
+			ras->ras_window_start = 0;
+			ras->ras_last_readpage = 0;
+			ras->ras_next_readahead = 0;
+			ras->ras_window_len = min(ra->ra_max_pages_per_file,
+				ra->ra_max_read_ahead_whole_pages);
+			GOTO(out_unlock, 0);
+		}
+	}
+	if (zero) {
+		/* check whether it is in stride I/O mode*/
+		if (!index_in_stride_window(ras, index)) {
+			/* the jump does not match the recorded stride: start
+			 * detecting a new one only on the first page of a
+			 * request with no stride request counted yet,
+			 * otherwise drop the stride state */
+			if (ras->ras_consecutive_stride_requests == 0 &&
+			    ras->ras_request_index == 0) {
+				ras_update_stride_detector(ras, index);
+				ras->ras_consecutive_stride_requests++;
+			} else {
+				ras_stride_reset(ras);
+			}
+			ras_reset(inode, ras, index);
+			ras->ras_consecutive_pages++;
+			GOTO(out_unlock, 0);
+		} else {
+			/* the jump follows the stride pattern */
+			ras->ras_consecutive_pages = 0;
+			ras->ras_consecutive_requests = 0;
+			if (++ras->ras_consecutive_stride_requests > 1)
+				stride_detect = 1;
+			RAS_CDEBUG(ras);
+		}
+	} else {
+		if (ra_miss) {
+			if (index_in_stride_window(ras, index) &&
+			    stride_io_mode(ras)) {
+				/*If stride-RA hit cache miss, the stride dector
+				 *will not be reset to avoid the overhead of
+				 *redetecting read-ahead mode */
+				if (index != ras->ras_last_readpage + 1)
+					ras->ras_consecutive_pages = 0;
+				ras_reset(inode, ras, index);
+				RAS_CDEBUG(ras);
+			} else {
+				/* Reset both stride window and normal RA
+				 * window */
+				ras_reset(inode, ras, index);
+				ras->ras_consecutive_pages++;
+				ras_stride_reset(ras);
+				GOTO(out_unlock, 0);
+			}
+		} else if (stride_io_mode(ras)) {
+			/* If this is contiguous read but in stride I/O mode
+			 * currently, check whether stride step still is valid,
+			 * if invalid, it will reset the stride ra window*/
+			if (!index_in_stride_window(ras, index)) {
+				/* Shrink stride read-ahead window to be zero */
+				ras_stride_reset(ras);
+				ras->ras_window_len = 0;
+				ras->ras_next_readahead = index;
+			}
+		}
+	}
+	/* common path: account for this page and re-anchor the window */
+	ras->ras_consecutive_pages++;
+	ras->ras_last_readpage = index;
+	ras_set_start(inode, ras, index);
+
+	if (stride_io_mode(ras))
+		/* Since stride readahead is sentivite to the offset
+		 * of read-ahead, so we use original offset here,
+		 * instead of ras_window_start, which is RPC aligned */
+		ras->ras_next_readahead = max(index, ras->ras_next_readahead);
+	else
+		ras->ras_next_readahead = max(ras->ras_window_start,
+					      ras->ras_next_readahead);
+	RAS_CDEBUG(ras);
+
+	/* Trigger RA in the mmap case where ras_consecutive_requests
+	 * is not incremented and thus can't be used to trigger RA */
+	if (!ras->ras_window_len && ras->ras_consecutive_pages == 4) {
+		ras->ras_window_len = RAS_INCREASE_STEP(inode);
+		GOTO(out_unlock, 0);
+	}
+
+	/* Initially reset the stride window offset to next_readahead*/
+	if (ras->ras_consecutive_stride_requests == 2 && stride_detect) {
+		/**
+		 * Once stride IO mode is detected, next_readahead should be
+		 * reset to make sure next_readahead > stride offset
+		 */
+		ras->ras_next_readahead = max(index, ras->ras_next_readahead);
+		ras->ras_stride_offset = index;
+		ras->ras_window_len = RAS_INCREASE_STEP(inode);
+	}
+
+	/* The initial ras_window_len is set to the request size.  To avoid
+	 * uselessly reading and discarding pages for random IO the window is
+	 * only increased once per consecutive request received. */
+	if ((ras->ras_consecutive_requests > 1 || stride_detect) &&
+	    !ras->ras_request_index)
+		ras_increase_window(inode, ras, ra);
+	EXIT;
+out_unlock:
+	RAS_CDEBUG(ras);
+	ras->ras_request_index++;
+	spin_unlock(&ras->ras_lock);
+	return;
+}
+
+/*
+ * Write back the single locked page @vmpage through the cl_io/cl_page
+ * layer.
+ *
+ * If cl_page_flush() fails and the page is not marked PageError, the page
+ * is re-dirtied and the failure is turned into success; with
+ * WB_SYNC_ALL the function then waits for the covering byte range via
+ * cl_sync_file_range().
+ *
+ * Returns 0 on success or a negative errno.  On a negative result the
+ * error is stored in lli_async_rc (if none is recorded yet), the page is
+ * marked PageError, and it is unlocked here unless cl_page_disown() was
+ * already called for it.
+ */
+int ll_writepage(struct page *vmpage, struct writeback_control *wbc)
+{
+	struct inode	       *inode = vmpage->mapping->host;
+	struct ll_inode_info   *lli   = ll_i2info(inode);
+	struct lu_env	  *env;
+	struct cl_io	   *io;
+	struct cl_page	 *page;
+	struct cl_object       *clob;
+	struct cl_env_nest      nest;
+	bool redirtied = false;
+	/* set once cl_page_disown() has run; presumably that call unlocks
+	 * the VM page -- TODO confirm */
+	bool unlocked = false;
+	int result;
+	ENTRY;
+
+	LASSERT(PageLocked(vmpage));
+	LASSERT(!PageWriteback(vmpage));
+
+	LASSERT(ll_i2dtexp(inode) != NULL);
+
+	env = cl_env_nested_get(&nest);
+	if (IS_ERR(env))
+		GOTO(out, result = PTR_ERR(env));
+
+	clob  = ll_i2info(inode)->lli_clob;
+	LASSERT(clob != NULL);
+
+	io = ccc_env_thread_io(env);
+	io->ci_obj = clob;
+	io->ci_ignore_layout = 1;
+	result = cl_io_init(env, io, CIT_MISC, clob);
+	if (result == 0) {
+		page = cl_page_find(env, clob, vmpage->index,
+				    vmpage, CPT_CACHEABLE);
+		if (!IS_ERR(page)) {
+			lu_ref_add(&page->cp_reference, "writepage",
+				   current);
+			cl_page_assume(env, io, page);
+			result = cl_page_flush(env, io, page);
+			if (result != 0) {
+				/*
+				 * Re-dirty page on error so it retries write,
+				 * but not in case when IO has actually
+				 * occurred and completed with an error.
+				 */
+				if (!PageError(vmpage)) {
+					redirty_page_for_writepage(wbc, vmpage);
+					result = 0;
+					redirtied = true;
+				}
+			}
+			cl_page_disown(env, io, page);
+			unlocked = true;
+			lu_ref_del(&page->cp_reference,
+				   "writepage", current);
+			cl_page_put(env, page);
+		} else {
+			result = PTR_ERR(page);
+		}
+	}
+	cl_io_fini(env, io);
+
+	if (redirtied && wbc->sync_mode == WB_SYNC_ALL) {
+		loff_t offset = cl_offset(clob, vmpage->index);
+
+		/* Flush page failed because the extent is being written out.
+		 * Wait for the write of extent to be finished to avoid
+		 * breaking kernel which assumes ->writepage should mark
+		 * PageWriteback or clean the page. */
+		result = cl_sync_file_range(inode, offset,
+					    offset + PAGE_CACHE_SIZE - 1,
+					    CL_FSYNC_LOCAL, 1);
+		if (result > 0) {
+			/* actually we may have written more than one page.
+			 * decreasing this page because the caller will count
+			 * it. */
+			wbc->nr_to_write -= result - 1;
+			result = 0;
+		}
+	}
+
+	cl_env_nested_put(&nest, env);
+	GOTO(out, result);
+
+out:
+	if (result < 0) {
+		/* keep only the first asynchronous write error */
+		if (!lli->lli_async_rc)
+			lli->lli_async_rc = result;
+		SetPageError(vmpage);
+		if (!unlocked)
+			unlock_page(vmpage);
+	}
+	return result;
+}
+
+/*
+ * Write back dirty pages of @mapping for the byte range selected by @wbc.
+ *
+ * For cyclic writeback the range starts at the mapping's writeback_index
+ * and runs to end of object; otherwise it is [range_start, range_end],
+ * with a range_end of LLONG_MAX mapped to OBD_OBJECT_EOF.  The range is
+ * handed to cl_sync_file_range(); a positive return value is subtracted
+ * from wbc->nr_to_write and reported as success.
+ *
+ * Returns 0 on success or the negative value returned by
+ * cl_sync_file_range().
+ */
+int ll_writepages(struct address_space *mapping, struct writeback_control *wbc)
+{
+	struct inode *inode = mapping->host;
+	struct ll_sb_info *sbi = ll_i2sbi(inode);
+	loff_t start;
+	loff_t end;
+	enum cl_fsync_mode mode;
+	int range_whole = 0;
+	int result;
+	int ignore_layout = 0;
+	ENTRY;
+
+	if (wbc->range_cyclic) {
+		/* widen before shifting: writeback_index is a page index and
+		 * the shift would otherwise be done in its (possibly 32-bit)
+		 * type and wrap for offsets beyond that width */
+		start = (loff_t)mapping->writeback_index << PAGE_CACHE_SHIFT;
+		end = OBD_OBJECT_EOF;
+	} else {
+		start = wbc->range_start;
+		end = wbc->range_end;
+		if (end == LLONG_MAX) {
+			end = OBD_OBJECT_EOF;
+			range_whole = start == 0;
+		}
+	}
+
+	mode = CL_FSYNC_NONE;
+	if (wbc->sync_mode == WB_SYNC_ALL)
+		mode = CL_FSYNC_LOCAL;
+
+	if (sbi->ll_umounting)
+		/* if the mountpoint is being umounted, all pages have to be
+		 * evicted to avoid hitting LBUG when truncate_inode_pages()
+		 * is called later on. */
+		ignore_layout = 1;
+	result = cl_sync_file_range(inode, start, end, mode, ignore_layout);
+	if (result > 0) {
+		wbc->nr_to_write -= result;
+		result = 0;
+	}
+
+	/* remember where the next cyclic pass should resume */
+	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) {
+		if (end == OBD_OBJECT_EOF)
+			end = i_size_read(inode);
+		mapping->writeback_index = (end >> PAGE_CACHE_SHIFT) + 1;
+	}
+	RETURN(result);
+}
+
+/*
+ * Read the locked page @vmpage of @file.
+ *
+ * Sets up a cl context with ll_cl_init(); if that fails the page is
+ * unlocked and the error is returned.  A page that is already up to date
+ * is simply unlocked, otherwise it is read with cl_io_read_page().
+ */
+int ll_readpage(struct file *file, struct page *vmpage)
+{
+	struct ll_cl_context *lcc;
+	struct cl_page *page;
+	struct lu_env *env;
+	struct cl_io *io;
+	int result;
+	ENTRY;
+
+	lcc = ll_cl_init(file, vmpage, 0);
+	if (IS_ERR(lcc)) {
+		unlock_page(vmpage);
+		result = PTR_ERR(lcc);
+		RETURN(result);
+	}
+
+	env = lcc->lcc_env;
+	io = lcc->lcc_io;
+	page = lcc->lcc_page;
+
+	LASSERT(page->cp_type == CPT_CACHEABLE);
+	if (unlikely(PageUptodate(vmpage))) {
+		/* Page from a non-object file. */
+		unlock_page(vmpage);
+		result = 0;
+	} else {
+		cl_page_assume(env, io, page);
+		result = cl_io_read_page(env, io, page);
+	}
+	ll_cl_fini(lcc);
+
+	RETURN(result);
+}

diff --git a/drivers/staging/lustre/lustre/llite/rw26.c b/drivers/staging/lustre/lustre/llite/rw26.c
new file mode 100644
index 0000000..27e4e64
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/rw26.c

@@ -0,0 +1,586 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lustre/llite/rw26.c
+ *
+ * Lustre Lite I/O page cache routines for the 2.5/2.6 kernel version
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <asm/uaccess.h>
+
+#include <linux/migrate.h>
+#include <linux/fs.h>
+#include <linux/buffer_head.h>
+#include <linux/mpage.h>
+#include <linux/writeback.h>
+#include <linux/stat.h>
+#include <asm/uaccess.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <lustre_lite.h>
+#include "llite_internal.h"
+#include <linux/lustre_compat25.h>
+
+/**
+ * Implements the Linux VM address_space::invalidatepage() method, which is
+ * called when a page is truncated from a file: on an explicit truncate, or
+ * when the inode is removed from memory (final iput(), umount, or icache
+ * shrinking under memory pressure).
+ *
+ * A non-zero \a offset means the first [0, offset] bytes of the page remain
+ * valid (truncate not aligned to a page boundary).  Lustre keeps such a
+ * partially truncated page in the cache and relies on struct inode::i_size
+ * to limit further accesses, so only \a offset == 0 deletes the cl_page.
+ */
+static void ll_invalidatepage(struct page *vmpage, unsigned long offset)
+{
+	struct cl_object *clob;
+	struct cl_page   *clpage;
+	struct lu_env    *env;
+	int refcheck;
+
+	LASSERT(PageLocked(vmpage));
+	LASSERT(!PageWriteback(vmpage));
+
+	/*
+	 * It is safe to not check anything in invalidatepage/releasepage
+	 * below because they are run with page locked and all our io is
+	 * happening with locked page too
+	 */
+	if (offset != 0)
+		return;
+
+	env = cl_env_get(&refcheck);
+	if (IS_ERR(env))
+		return;
+
+	clob = ll_i2info(vmpage->mapping->host)->lli_clob;
+	if (clob == NULL) {
+		/* no cl_object: the page must not carry private data */
+		LASSERT(vmpage->private == 0);
+	} else {
+		clpage = cl_vmpage_page(vmpage, clob);
+		if (clpage != NULL) {
+			lu_ref_add(&clpage->cp_reference, "delete", vmpage);
+			cl_page_delete(env, clpage);
+			lu_ref_del(&clpage->cp_reference, "delete", vmpage);
+			cl_page_put(env, clpage);
+		}
+	}
+	cl_env_put(env, &refcheck);
+}
+
+#ifdef HAVE_RELEASEPAGE_WITH_INT
+#define RELEASEPAGE_ARG_TYPE int
+#else
+#define RELEASEPAGE_ARG_TYPE gfp_t
+#endif
+/*
+ * Decide whether the locked page @vmpage may be released.
+ *
+ * Returns 1 when the page can be released (no mapping, no cl_object, no
+ * cl_page attached, or a cl_page that is not in use and has been deleted
+ * here) and 0 when it must be kept.  @gfp_mask is currently not used.
+ */
+static int ll_releasepage(struct page *vmpage, RELEASEPAGE_ARG_TYPE gfp_mask)
+{
+	struct address_space *mapping;
+	struct cl_env_nest nest;
+	struct cl_object  *clob;
+	struct cl_page    *clpage;
+	struct lu_env     *env;
+	int releasable;
+
+	LASSERT(PageLocked(vmpage));
+	if (PageWriteback(vmpage) || PageDirty(vmpage))
+		return 0;
+
+	mapping = vmpage->mapping;
+	if (mapping == NULL)
+		return 1;
+
+	clob = ll_i2info(mapping->host)->lli_clob;
+	if (clob == NULL)
+		return 1;
+
+	/* 1 for page allocator, 1 for cl_page and 1 for page cache */
+	if (page_count(vmpage) > 3)
+		return 0;
+
+	/* TODO: determine what gfp should be used by @gfp_mask. */
+	env = cl_env_nested_get(&nest);
+	if (IS_ERR(env))
+		/* If we can't allocate an env we won't call cl_page_put()
+		 * later on which further means it's impossible to drop
+		 * page refcount by cl_page, so ask kernel to not free
+		 * this page. */
+		return 0;
+
+	clpage = cl_vmpage_page(vmpage, clob);
+	if (clpage == NULL) {
+		/* nothing attached on the cl side */
+		releasable = 1;
+	} else {
+		releasable = 0;
+		if (!cl_page_in_use(clpage)) {
+			releasable = 1;
+			cl_page_delete(env, clpage);
+		}
+		cl_page_put(env, clpage);
+	}
+	cl_env_nested_put(&nest, env);
+
+	return releasable;
+}
+
+/*
+ * Mark @vmpage dirty by delegating to __set_page_dirty_nobuffers() and
+ * return its result.
+ *
+ * A plain return is used instead of RETURN(): this function has no ENTRY,
+ * unlike every other function in this file that uses RETURN().
+ */
+static int ll_set_page_dirty(struct page *vmpage)
+{
+#if 0
+	struct cl_page    *page = vvp_vmpage_page_transient(vmpage);
+	struct vvp_object *obj  = cl_inode2vvp(vmpage->mapping->host);
+	struct vvp_page   *cpg;
+
+	/*
+	 * XXX should page method be called here?
+	 */
+	LASSERT(&obj->co_cl == page->cp_obj);
+	cpg = cl2vvp_page(cl_page_at(page, &vvp_device_type));
+	/*
+	 * XXX cannot do much here, because page is possibly not locked:
+	 * sys_munmap()->...
+	 *     ->unmap_page_range()->zap_pte_range()->set_page_dirty().
+	 */
+	vvp_write_pending(obj, cpg);
+#endif
+	return __set_page_dirty_nobuffers(vmpage);
+}
+
+/* Largest buffer accepted by ll_get_user_pages(), in bytes.  Parenthesized
+ * so the macro expands safely inside any expression. */
+#define MAX_DIRECTIO_SIZE (2*1024*1024*1024UL)
+
+/*
+ * Pin the user pages backing the buffer [user_addr, user_addr + size).
+ *
+ * On success *pages points to a newly allocated array of *max_pages page
+ * pointers, of which the returned number (> 0) have been filled in by
+ * get_user_pages().
+ *
+ * Returns -EFBIG if @size exceeds MAX_DIRECTIO_SIZE, -ENOMEM if the array
+ * cannot be allocated, and otherwise the result of get_user_pages().
+ * Whenever the result is <= 0, *pages is NULL on return and nothing is
+ * left allocated.
+ */
+static inline int ll_get_user_pages(int rw, unsigned long user_addr,
+				    size_t size, struct page ***pages,
+				    int *max_pages)
+{
+	int result = -ENOMEM;
+
+	/* set an arbitrary limit to prevent arithmetic overflow */
+	if (size > MAX_DIRECTIO_SIZE) {
+		*pages = NULL;
+		return -EFBIG;
+	}
+
+	/* pages touched by the buffer: index one past its last byte
+	 * (rounded up) minus index of its first page */
+	*max_pages = (user_addr + size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	*max_pages -= user_addr >> PAGE_CACHE_SHIFT;
+
+	OBD_ALLOC_LARGE(*pages, *max_pages * sizeof(**pages));
+	if (*pages) {
+		down_read(&current->mm->mmap_sem);
+		/* a file READ stores into the user buffer, so the pages
+		 * are requested writable in that case */
+		result = get_user_pages(current, current->mm, user_addr,
+					*max_pages, (rw == READ), 0, *pages,
+					NULL);
+		up_read(&current->mm->mmap_sem);
+		if (unlikely(result <= 0)) {
+			OBD_FREE_LARGE(*pages, *max_pages * sizeof(**pages));
+			/* do not hand a freed array back to the caller */
+			*pages = NULL;
+		}
+	}
+
+	return result;
+}
+
+/*  ll_free_user_pages - tear down page struct array
+ *  @pages: array of page struct pointers underlying target buffer
+ *  @npages: number of slots in @pages
+ *  @do_dirty: when non-zero, mark each page dirty before releasing it */
+static void ll_free_user_pages(struct page **pages, int npages, int do_dirty)
+{
+	int idx;
+
+	/* the array may be only partly filled: stop at the first empty slot */
+	for (idx = 0; idx < npages && pages[idx] != NULL; idx++) {
+		struct page *pg = pages[idx];
+
+		if (do_dirty)
+			set_page_dirty_lock(pg);
+		page_cache_release(pg);
+	}
+
+	OBD_FREE_LARGE(pages, npages * sizeof(*pages));
+}
+
+/*
+ * Perform direct I/O (@rw is READ or WRITE) on the pages described by @pv.
+ *
+ * For each of the pv->ldp_nr pages a transient cl_page is looked up at the
+ * page's file offset (pv->ldp_offsets[i] if that array is set, otherwise
+ * consecutive offsets from pv->ldp_start_offset) and owned.  If the page
+ * found is a cacheable one, data is copied between it and the user page
+ * instead; for READ such a page is then not submitted.  All other pages
+ * are queued and submitted synchronously in one go.
+ *
+ * Returns pv->ldp_size on success, or the first error encountered.
+ *
+ * @inode is not referenced in this function body.
+ */
+ssize_t ll_direct_rw_pages(const struct lu_env *env, struct cl_io *io,
+			   int rw, struct inode *inode,
+			   struct ll_dio_pages *pv)
+{
+	struct cl_page    *clp;
+	struct cl_2queue  *queue;
+	struct cl_object  *obj = io->ci_obj;
+	int i;
+	ssize_t rc = 0;
+	loff_t file_offset  = pv->ldp_start_offset;
+	/* bytes still to be transferred; decreases by one page per loop */
+	long size	   = pv->ldp_size;
+	int page_count      = pv->ldp_nr;
+	struct page **pages = pv->ldp_pages;
+	long page_size      = cl_page_size(obj);
+	bool do_io;
+	/* number of pages actually added to the queue */
+	int  io_pages       = 0;
+	ENTRY;
+
+	queue = &io->ci_queue;
+	cl_2queue_init(queue);
+	for (i = 0; i < page_count; i++) {
+		if (pv->ldp_offsets)
+		    file_offset = pv->ldp_offsets[i];
+
+		/* file offsets must be page aligned */
+		LASSERT(!(file_offset & (page_size - 1)));
+		clp = cl_page_find(env, obj, cl_index(obj, file_offset),
+				   pv->ldp_pages[i], CPT_TRANSIENT);
+		if (IS_ERR(clp)) {
+			rc = PTR_ERR(clp);
+			break;
+		}
+
+		rc = cl_page_own(env, io, clp);
+		if (rc) {
+			LASSERT(clp->cp_state == CPS_FREEING);
+			cl_page_put(env, clp);
+			break;
+		}
+
+		do_io = true;
+
+		/* check the page type: if the page is a host page, then do
+		 * write directly */
+		if (clp->cp_type == CPT_CACHEABLE) {
+			struct page *vmpage = cl_page_vmpage(env, clp);
+			struct page *src_page;
+			struct page *dst_page;
+			void       *src;
+			void       *dst;
+
+			/* WRITE copies user page -> cached page, READ copies
+			 * cached page -> user page */
+			src_page = (rw == WRITE) ? pages[i] : vmpage;
+			dst_page = (rw == WRITE) ? vmpage : pages[i];
+
+			src = ll_kmap_atomic(src_page, KM_USER0);
+			dst = ll_kmap_atomic(dst_page, KM_USER1);
+			memcpy(dst, src, min(page_size, size));
+			ll_kunmap_atomic(dst, KM_USER1);
+			ll_kunmap_atomic(src, KM_USER0);
+
+			/* make sure page will be added to the transfer by
+			 * cl_io_submit()->...->vvp_page_prep_write(). */
+			if (rw == WRITE)
+				set_page_dirty(vmpage);
+
+			if (rw == READ) {
+				/* do not issue the page for read, since it
+				 * may reread a ra page which has NOT uptodate
+				 * bit set. */
+				cl_page_disown(env, io, clp);
+				do_io = false;
+			}
+		}
+
+		if (likely(do_io)) {
+			cl_2queue_add(queue, clp);
+
+			/*
+			 * Set page clip to tell transfer formation engine
+			 * that page has to be sent even if it is beyond KMS.
+			 */
+			cl_page_clip(env, clp, 0, min(size, page_size));
+
+			++io_pages;
+		}
+
+		/* drop the reference count for cl_page_find */
+		cl_page_put(env, clp);
+		size -= page_size;
+		file_offset += page_size;
+	}
+
+	/* submit only if every page was set up and something was queued */
+	if (rc == 0 && io_pages) {
+		rc = cl_io_submit_sync(env, io,
+				       rw == READ ? CRT_READ : CRT_WRITE,
+				       queue, 0);
+	}
+	if (rc == 0)
+		rc = pv->ldp_size;
+
+	cl_2queue_discard(env, io, queue);
+	cl_2queue_disown(env, io, queue);
+	cl_2queue_fini(env, queue);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(ll_direct_rw_pages);
+
+/*
+ * Run direct I/O on one contiguous segment: wrap @pages (@page_count
+ * entries covering @size bytes from @file_offset) in a struct ll_dio_pages
+ * and pass it to ll_direct_rw_pages().  No per-page offset array is used.
+ *
+ * @mapping is not referenced in this function body.
+ */
+static ssize_t ll_direct_IO_26_seg(const struct lu_env *env, struct cl_io *io,
+				   int rw, struct inode *inode,
+				   struct address_space *mapping,
+				   size_t size, loff_t file_offset,
+				   struct page **pages, int page_count)
+{
+	struct ll_dio_pages dio = {
+		.ldp_start_offset = file_offset,
+		.ldp_size         = size,
+		.ldp_nr           = page_count,
+		.ldp_pages        = pages,
+		.ldp_offsets      = NULL,
+	};
+
+	return ll_direct_rw_pages(env, io, rw, inode, &dio);
+}
+
+/* Allocation limit used to size a single O_DIRECT request: the kernel's
+ * KMALLOC_MAX_SIZE when it is defined, otherwise a fallback of 128kB. */
+#ifdef KMALLOC_MAX_SIZE
+#define MAX_MALLOC KMALLOC_MAX_SIZE
+#else
+#define MAX_MALLOC (128 * 1024)
+#endif
+
+/* This is the maximum size of a single O_DIRECT request, based on the
+ * kmalloc limit.  We need to fit all of the brw_page structs, each one
+ * representing PAGE_SIZE worth of user data, into a single buffer, and
+ * then truncate this to be a full-sized RPC.  For 4kB PAGE_SIZE this is
+ * up to 22MB for 128kB kmalloc and up to 682MB for 4MB kmalloc. */
+/* The mask with ~(DT_MAX_BRW_SIZE - 1) performs that truncation. */
+#define MAX_DIO_SIZE ((MAX_MALLOC / sizeof(struct brw_page) * PAGE_CACHE_SIZE) & \
+		      ~(DT_MAX_BRW_SIZE - 1))
+/*
+ * ->direct_IO() handler: move the caller's iovec straight to/from the file
+ * in chunks of at most "size" bytes ("size" starts at MAX_DIO_SIZE and is
+ * halved on -ENOMEM).
+ *
+ * Fails with -EBADF when lli_has_smd is not set, and with -EINVAL when the
+ * file offset, the total length, or any segment's base address or length
+ * has bits set outside CFS_PAGE_MASK.  For reads, the inode mutex is held
+ * across the whole transfer and each segment is clamped to the current
+ * i_size.  After a write that moved data, obd_adjust_kms() is called with
+ * the advanced file offset.
+ *
+ * Returns the number of bytes transferred if that is non-zero, otherwise
+ * the last result (zero or a negative errno).
+ */
+static ssize_t ll_direct_IO_26(int rw, struct kiocb *iocb,
+			       const struct iovec *iov, loff_t file_offset,
+			       unsigned long nr_segs)
+{
+	struct lu_env *env;
+	struct cl_io *io;
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file->f_mapping->host;
+	struct ccc_object *obj = cl_inode2ccc(inode);
+	long count = iov_length(iov, nr_segs);
+	long tot_bytes = 0, result = 0;
+	struct ll_inode_info *lli = ll_i2info(inode);
+	unsigned long seg = 0;
+	long size = MAX_DIO_SIZE;
+	int refcheck;
+	ENTRY;
+
+	if (!lli->lli_has_smd)
+		RETURN(-EBADF);
+
+	/* FIXME: io smaller than PAGE_SIZE is broken on ia64 ??? */
+	if ((file_offset & ~CFS_PAGE_MASK) || (count & ~CFS_PAGE_MASK))
+		RETURN(-EINVAL);
+
+	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), size=%lu (max %lu), "
+	       "offset=%lld=%llx, pages %lu (max %lu)\n",
+	       inode->i_ino, inode->i_generation, inode, count, MAX_DIO_SIZE,
+	       file_offset, file_offset, count >> PAGE_CACHE_SHIFT,
+	       MAX_DIO_SIZE >> PAGE_CACHE_SHIFT);
+
+	/* Check that all user buffers are aligned as well */
+	for (seg = 0; seg < nr_segs; seg++) {
+		if (((unsigned long)iov[seg].iov_base & ~CFS_PAGE_MASK) ||
+		    (iov[seg].iov_len & ~CFS_PAGE_MASK))
+			RETURN(-EINVAL);
+	}
+
+	/* NOTE(review): both results are asserted rather than handled; this
+	 * presumably relies on the caller already running inside a cl_io
+	 * with an env set up -- confirm. */
+	env = cl_env_get(&refcheck);
+	LASSERT(!IS_ERR(env));
+	io = ccc_env_io(env)->cui_cl.cis_io;
+	LASSERT(io != NULL);
+
+	/* 0. Need locking between buffered and direct access. and race with
+	 *    size changing by concurrent truncates and writes.
+	 * 1. Need inode mutex to operate transient pages.
+	 */
+	if (rw == READ)
+		mutex_lock(&inode->i_mutex);
+
+	LASSERT(obj->cob_transient_pages == 0);
+	for (seg = 0; seg < nr_segs; seg++) {
+		long iov_left = iov[seg].iov_len;
+		unsigned long user_addr = (unsigned long)iov[seg].iov_base;
+
+		/* Reads never go past the current file size: stop at EOF and
+		 * trim a segment that straddles it. */
+		if (rw == READ) {
+			if (file_offset >= i_size_read(inode))
+				break;
+			if (file_offset + iov_left > i_size_read(inode))
+				iov_left = i_size_read(inode) - file_offset;
+		}
+
+		while (iov_left > 0) {
+			struct page **pages;
+			int page_count, max_pages = 0;
+			long bytes;
+
+			/* Handle at most "size" bytes of this segment now. */
+			bytes = min(size, iov_left);
+			page_count = ll_get_user_pages(rw, user_addr, bytes,
+						       &pages, &max_pages);
+			if (likely(page_count > 0)) {
+				/* Fewer pages than max_pages came back: only
+				 * transfer the whole pages we actually have. */
+				if (unlikely(page_count <  max_pages))
+					bytes = page_count << PAGE_CACHE_SHIFT;
+				result = ll_direct_IO_26_seg(env, io, rw, inode,
+							     file->f_mapping,
+							     bytes, file_offset,
+							     pages, page_count);
+				ll_free_user_pages(pages, max_pages, rw==READ);
+			} else if (page_count == 0) {
+				GOTO(out, result = -EFAULT);
+			} else {
+				/* Negative page_count is an errno. */
+				result = page_count;
+			}
+			if (unlikely(result <= 0)) {
+				/* If we can't allocate a large enough buffer
+				 * for the request, shrink it to a smaller
+				 * PAGE_SIZE multiple and try again.
+				 * We should always be able to kmalloc for a
+				 * page worth of page pointers = 4MB on i386. */
+				if (result == -ENOMEM &&
+				    size > (PAGE_CACHE_SIZE / sizeof(*pages)) *
+					   PAGE_CACHE_SIZE) {
+					/* Halve "size"; the bit arithmetic
+					 * rounds the half up to a page
+					 * multiple (assuming CFS_PAGE_MASK
+					 * is ~(page size - 1) -- TODO
+					 * confirm). */
+					size = ((((size / 2) - 1) |
+						 ~CFS_PAGE_MASK) + 1) &
+						CFS_PAGE_MASK;
+					CDEBUG(D_VFSTRACE,"DIO size now %lu\n",
+					       size);
+					continue;
+				}
+
+				GOTO(out, result);
+			}
+
+			/* Advance all cursors by what was transferred. */
+			tot_bytes += result;
+			file_offset += result;
+			iov_left -= result;
+			user_addr += result;
+		}
+	}
+out:
+	LASSERT(obj->cob_transient_pages == 0);
+	if (rw == READ)
+		mutex_unlock(&inode->i_mutex);
+
+	if (tot_bytes > 0) {
+		if (rw == WRITE) {
+			struct lov_stripe_md *lsm;
+
+			/* file_offset has been advanced past the data that
+			 * was written; pass it to obd_adjust_kms() under
+			 * the stripe lock. */
+			lsm = ccc_inode_lsm_get(inode);
+			LASSERT(lsm != NULL);
+			lov_stripe_lock(lsm);
+			obd_adjust_kms(ll_i2dtexp(inode), lsm, file_offset, 0);
+			lov_stripe_unlock(lsm);
+			ccc_inode_lsm_put(inode, lsm);
+		}
+	}
+
+	cl_env_put(env, &refcheck);
+	/* Partial progress wins over a later error. */
+	RETURN(tot_bytes ? : result);
+}
+
+/*
+ * ->write_begin() handler.
+ *
+ * Grabs the page-cache page covering "pos" with
+ * grab_cache_page_write_begin() and prepares the byte range
+ * [from, from + len) inside it with ll_prepare_write().
+ *
+ * On success returns 0 and stores the page in *pagep.  On failure returns
+ * -ENOMEM (no page) or the ll_prepare_write() error; in the latter case the
+ * page is unlocked and released here, and *pagep is left untouched so the
+ * caller is never handed a pointer to a page it no longer holds.
+ * "fsdata" is not used.
+ */
+static int ll_write_begin(struct file *file, struct address_space *mapping,
+			 loff_t pos, unsigned len, unsigned flags,
+			 struct page **pagep, void **fsdata)
+{
+	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+	struct page *page;
+	int rc;
+	unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+	ENTRY;
+
+	page = grab_cache_page_write_begin(mapping, index, flags);
+	if (!page)
+		RETURN(-ENOMEM);
+
+	rc = ll_prepare_write(file, page, from, from + len);
+	if (rc) {
+		unlock_page(page);
+		page_cache_release(page);
+		RETURN(rc);
+	}
+
+	/* Publish the page only once it is known to be usable. */
+	*pagep = page;
+	RETURN(0);
+}
+
+/*
+ * ->write_end() handler.
+ *
+ * Commits the byte range [from, from + copied) of "page" through
+ * ll_commit_write(), then unlocks the page and drops the page reference.
+ * Returns the ll_commit_write() result when it is non-zero, otherwise the
+ * number of bytes copied.  "mapping", "len" and "fsdata" are not used.
+ */
+static int ll_write_end(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned copied,
+			struct page *page, void *fsdata)
+{
+	unsigned start = pos & (PAGE_CACHE_SIZE - 1);
+	unsigned end = start + copied;
+	int rc = ll_commit_write(file, page, start, end);
+
+	unlock_page(page);
+	page_cache_release(page);
+
+	if (rc != 0)
+		return rc;
+	return copied;
+}
+
+#ifdef CONFIG_MIGRATION
+/*
+ * ->migratepage() handler.  There is no proper implementation yet, so every
+ * migration request is refused with -EIO; none of the arguments is looked at.
+ */
+int ll_migratepage(struct address_space *mapping,
+		   struct page *newpage, struct page *page,
+		   enum migrate_mode mode)
+{
+	return -EIO;
+}
+#endif
+
+/*
+ * Address-space operation table for llite, wiring the handlers into the
+ * page-cache callbacks.  Two layouts are provided:
+ *  - without MS_HAS_NEW_AOPS: a plain struct address_space_operations using
+ *    write_begin/write_end;
+ *  - with MS_HAS_NEW_AOPS: a struct address_space_operations_ext whose
+ *    embedded orig_aops carries prepare_write/commit_write, with
+ *    write_begin/write_end in the outer structure.
+ * readpages is disabled (commented out) in both, bmap is explicitly NULL,
+ * and migratepage is only set when CONFIG_MIGRATION is enabled.
+ */
+#ifndef MS_HAS_NEW_AOPS
+struct address_space_operations ll_aops = {
+	.readpage       = ll_readpage,
+//	.readpages      = ll_readpages,
+	.direct_IO      = ll_direct_IO_26,
+	.writepage      = ll_writepage,
+	.writepages     = ll_writepages,
+	.set_page_dirty = ll_set_page_dirty,
+	.write_begin    = ll_write_begin,
+	.write_end      = ll_write_end,
+	.invalidatepage = ll_invalidatepage,
+	/* NOTE(review): the (void *) cast presumably hides a prototype
+	 * mismatch with ->releasepage -- confirm. */
+	.releasepage    = (void *)ll_releasepage,
+#ifdef CONFIG_MIGRATION
+	.migratepage    = ll_migratepage,
+#endif
+	.bmap	   = NULL
+};
+#else
+struct address_space_operations_ext ll_aops = {
+	.orig_aops.readpage       = ll_readpage,
+//	.orig_aops.readpages      = ll_readpages,
+	.orig_aops.direct_IO      = ll_direct_IO_26,
+	.orig_aops.writepage      = ll_writepage,
+	.orig_aops.writepages     = ll_writepages,
+	.orig_aops.set_page_dirty = ll_set_page_dirty,
+	.orig_aops.prepare_write  = ll_prepare_write,
+	.orig_aops.commit_write   = ll_commit_write,
+	.orig_aops.invalidatepage = ll_invalidatepage,
+	.orig_aops.releasepage    = ll_releasepage,
+#ifdef CONFIG_MIGRATION
+	.orig_aops.migratepage    = ll_migratepage,
+#endif
+	.orig_aops.bmap	   = NULL,
+	.write_begin    = ll_write_begin,
+	.write_end      = ll_write_end
+};
+#endif

diff --git a/drivers/staging/lustre/lustre/llite/statahead.c b/drivers/staging/lustre/lustre/llite/statahead.c
new file mode 100644
index 0000000..7747f8f
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/statahead.c

@@ -0,0 +1,1722 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <obd_support.h>
+#include <lustre_lite.h>
+#include <lustre_dlm.h>
+#include "llite_internal.h"
+
+#define SA_OMITTED_ENTRY_MAX 8ULL
+
+/*
+ * State of a statahead entry (ll_sa_entry::se_stat).  An entry starts as
+ * SA_ENTRY_INIT; any other value means it has been "stated" (see
+ * ll_sa_entry_stated()).
+ */
+typedef enum {
+	/** negative values are for error cases */
+	SA_ENTRY_INIT = 0,      /** init entry */
+	SA_ENTRY_SUCC = 1,      /** stat succeed */
+	SA_ENTRY_INVA = 2,      /** invalid entry */
+	SA_ENTRY_DEST = 3,      /** entry to be destroyed */
+} se_stat_t;
+
+/*
+ * One statahead cache entry, describing a single name in the directory
+ * being scanned.  The entry is allocated with extra room after the
+ * structure; ll_sa_entry_alloc() copies the NUL-terminated name there and
+ * points se_qstr.name at it.
+ */
+struct ll_sa_entry {
+	/* link into sai->sai_entries */
+	struct list_head	      se_link;
+	/* link into sai->sai_entries_{received,stated} */
+	struct list_head	      se_list;
+	/* link into sai hash table locally */
+	struct list_head	      se_hash;
+	/* entry reference count */
+	atomic_t	    se_refcount;
+	/* entry index in the sai */
+	__u64		   se_index;
+	/* low layer ldlm lock handle */
+	__u64		   se_handle;
+	/* entry status */
+	se_stat_t	       se_stat;
+	/* entry size, contains name; this is the allocation size that is
+	 * passed back when the entry is freed */
+	int		     se_size;
+	/* pointer to async getattr enqueue info */
+	struct md_enqueue_info *se_minfo;
+	/* pointer to the async getattr request */
+	struct ptlrpc_request  *se_req;
+	/* pointer to the target inode */
+	struct inode	   *se_inode;
+	/* entry name */
+	struct qstr	     se_qstr;
+};
+
+/* Generation counter handed out to each new ll_statahead_info by
+ * ll_sai_alloc(); it is only modified under sai_generation_lock. */
+static unsigned int sai_generation = 0;
+static DEFINE_SPINLOCK(sai_generation_lock);
+
+/* Return non-zero when the entry's hash link is empty, i.e. the entry is not
+ * on any statahead hash chain. */
+static inline int ll_sa_entry_unhashed(struct ll_sa_entry *entry)
+{
+	struct list_head *hash_link = &entry->se_hash;
+
+	return list_empty(hash_link);
+}
+
+/*
+ * Tell whether the entry has left the SA_ENTRY_INIT state.  A read barrier
+ * is issued before se_stat is sampled.
+ *
+ * The entry only can be released by the caller, it is necessary to hold lock.
+ */
+static inline int ll_sa_entry_stated(struct ll_sa_entry *entry)
+{
+	smp_rmb();
+	if (entry->se_stat == SA_ENTRY_INIT)
+		return 0;
+	return 1;
+}
+
+/* Reduce a name hash to a statahead cache bucket by masking it with
+ * LL_SA_CACHE_MASK. */
+static inline int ll_sa_entry_hash(int val)
+{
+	int bucket = val & LL_SA_CACHE_MASK;
+
+	return bucket;
+}
+
+/*
+ * Append the entry to the statahead hash chain selected by its name hash,
+ * under that chain's spinlock.
+ */
+static inline void
+ll_sa_entry_enhash(struct ll_statahead_info *sai, struct ll_sa_entry *entry)
+{
+	int bucket = ll_sa_entry_hash(entry->se_qstr.hash);
+	spinlock_t *chain_lock = &sai->sai_cache_lock[bucket];
+
+	spin_lock(chain_lock);
+	list_add_tail(&entry->se_hash, &sai->sai_cache[bucket]);
+	spin_unlock(chain_lock);
+}
+
+/*
+ * Unlink the entry from its statahead hash chain, under that chain's
+ * spinlock, leaving the hash link re-initialised (empty).
+ */
+static inline void
+ll_sa_entry_unhash(struct ll_statahead_info *sai, struct ll_sa_entry *entry)
+{
+	int bucket = ll_sa_entry_hash(entry->se_qstr.hash);
+	spinlock_t *chain_lock = &sai->sai_cache_lock[bucket];
+
+	spin_lock(chain_lock);
+	list_del_init(&entry->se_hash);
+	spin_unlock(chain_lock);
+}
+
+/* AGL is worth queueing only for an existing regular-file inode, and only
+ * while the statahead instance has sai_agl_valid set.  Returns 1 or 0. */
+static inline int agl_should_run(struct ll_statahead_info *sai,
+				 struct inode *inode)
+{
+	if (inode == NULL)
+		return 0;
+	if (!S_ISREG(inode->i_mode))
+		return 0;
+	return sai->sai_agl_valid ? 1 : 0;
+}
+
+/* Return the entry at the head of sai_entries_received.  The list is not
+ * checked for emptiness here; callers test sa_received_empty() first. */
+static inline struct ll_sa_entry *
+sa_first_received_entry(struct ll_statahead_info *sai)
+{
+	struct list_head *head_link = sai->sai_entries_received.next;
+
+	return list_entry(head_link, struct ll_sa_entry, se_list);
+}
+
+/* Return the inode info at the head of sai_entries_agl.  The list is not
+ * checked for emptiness here. */
+static inline struct ll_inode_info *
+agl_first_entry(struct ll_statahead_info *sai)
+{
+	struct list_head *head_link = sai->sai_entries_agl.next;
+
+	return list_entry(head_link, struct ll_inode_info, lli_agl_list);
+}
+
+/* Non-zero once the number of cached entries has reached sai_max. */
+static inline int sa_sent_full(struct ll_statahead_info *sai)
+{
+	int cached = atomic_read(&sai->sai_cache_count);
+
+	return cached >= sai->sai_max;
+}
+
+/* Non-zero when no entry is waiting on sai_entries_received. */
+static inline int sa_received_empty(struct ll_statahead_info *sai)
+{
+	struct list_head *received = &sai->sai_entries_received;
+
+	return list_empty(received);
+}
+
+/* Non-zero when no inode is queued on sai_entries_agl. */
+static inline int agl_list_empty(struct ll_statahead_info *sai)
+{
+	struct list_head *agl_queue = &sai->sai_entries_agl;
+
+	return list_empty(agl_queue);
+}
+
+/**
+ * Decide whether statahead is performing poorly.  It is "low hit" when
+ * (1) there have been more than 7 hits and hits are fewer than four times
+ *     the misses (hit ratio less than 80%),
+ * or
+ * (2) there have been more than 8 consecutive misses.
+ */
+static inline int sa_low_hit(struct ll_statahead_info *sai)
+{
+	if (sai->sai_consecutive_miss > 8)
+		return 1;
+
+	return sai->sai_hit > 7 && sai->sai_hit < 4 * sai->sai_miss;
+}
+
+/*
+ * An index is "omitted" (old) when it trails the current statahead index by
+ * more than the window size sai_max plus SA_OMITTED_ENTRY_MAX.
+ */
+static inline int is_omitted_entry(struct ll_statahead_info *sai, __u64 index)
+{
+	__u64 horizon = (__u64)sai->sai_max + index + SA_OMITTED_ENTRY_MAX;
+
+	return horizon < sai->sai_index;
+}
+
+/*
+ * Allocate a statahead entry for "name" (length "len") with the given
+ * index, store a NUL-terminated copy of the name right behind the
+ * structure, and publish the entry: it is appended to sai->sai_entries and
+ * hashed while lli_sa_lock is held, after which sai_cache_count is
+ * incremented.
+ *
+ * Returns the new entry holding two references (see the rules below), or
+ * ERR_PTR(-ENOMEM) if the allocation fails.
+ */
+static struct ll_sa_entry *
+ll_sa_entry_alloc(struct ll_statahead_info *sai, __u64 index,
+		  const char *name, int len)
+{
+	struct ll_inode_info *lli;
+	struct ll_sa_entry   *entry;
+	int		   entry_size;
+	char		 *dname;
+	ENTRY;
+
+	/* (len & ~3) + 4 is the smallest multiple of 4 strictly greater
+	 * than len, so there is always room for the trailing NUL. */
+	entry_size = sizeof(struct ll_sa_entry) + (len & ~3) + 4;
+	OBD_ALLOC(entry, entry_size);
+	if (unlikely(entry == NULL))
+		RETURN(ERR_PTR(-ENOMEM));
+
+	CDEBUG(D_READA, "alloc sa entry %.*s(%p) index "LPU64"\n",
+	       len, name, entry, index);
+
+	entry->se_index = index;
+
+	/*
+	 * Statahead entry reference rules:
+	 *
+	 * 1) When statahead entry is initialized, its reference is set as 2.
+	 *    One reference is used by the directory scanner. When the scanner
+	 *    searches the statahead cache for the given name, it can perform
+	 *    lockless hash lookup (only the scanner can remove entry from hash
+	 *    list), and once found, it needn't to call "atomic_inc()" for the
+	 *    entry reference. So the performance is improved. After using the
+	 *    statahead entry, the scanner will call "atomic_dec()" to drop the
+	 *    reference held when initialization. If it is the last reference,
+	 *    the statahead entry will be freed.
+	 *
+	 * 2) All other threads, including statahead thread and ptlrpcd thread,
+	 *    when they process the statahead entry, the reference for target
+	 *    should be held to guarantee the entry will not be released by the
+	 *    directory scanner. After processing the entry, these threads will
+	 *    drop the entry reference. If it is the last reference, the entry
+	 *    will be freed.
+	 *
+	 *    The second reference when initializes the statahead entry is used
+	 *    by the statahead thread, following the rule 2).
+	 */
+	atomic_set(&entry->se_refcount, 2);
+	entry->se_stat = SA_ENTRY_INIT;
+	entry->se_size = entry_size;
+	/* The name lives in the tail of the same allocation. */
+	dname = (char *)entry + sizeof(struct ll_sa_entry);
+	memcpy(dname, name, len);
+	dname[len] = 0;
+	entry->se_qstr.hash = full_name_hash(name, len);
+	entry->se_qstr.len = len;
+	entry->se_qstr.name = dname;
+
+	lli = ll_i2info(sai->sai_inode);
+	spin_lock(&lli->lli_sa_lock);
+	list_add_tail(&entry->se_link, &sai->sai_entries);
+	INIT_LIST_HEAD(&entry->se_list);
+	ll_sa_entry_enhash(sai, entry);
+	spin_unlock(&lli->lli_sa_lock);
+
+	atomic_inc(&sai->sai_cache_count);
+
+	RETURN(entry);
+}
+
+/*
+ * Look an entry up by name on behalf of the directory scanner.
+ *
+ * The hash chain is walked without taking its lock because only the caller
+ * can remove the entry from the hash.  No reference is taken either: the
+ * caller owns (and must eventually release) the init refcount on the entry.
+ * Returns the matching entry, or NULL when hash, length and bytes do not all
+ * match any entry on the chain.
+ */
+static struct ll_sa_entry *
+ll_sa_entry_get_byname(struct ll_statahead_info *sai, const struct qstr *qstr)
+{
+	int bucket = ll_sa_entry_hash(qstr->hash);
+	struct ll_sa_entry *pos;
+
+	list_for_each_entry(pos, &sai->sai_cache[bucket], se_hash) {
+		if (pos->se_qstr.hash != qstr->hash)
+			continue;
+		if (pos->se_qstr.len != qstr->len)
+			continue;
+		if (memcmp(pos->se_qstr.name, qstr->name, qstr->len) != 0)
+			continue;
+		return pos;
+	}
+	return NULL;
+}
+
+/*
+ * Look an entry up by index on behalf of the async getattr callback.
+ *
+ * Must be called inside lli_sa_lock so that nobody changes the list during
+ * the search.  The walk stops at the first entry whose index is larger than
+ * the one wanted.  A found entry is returned with its refcount raised so
+ * that it cannot be freed by others; NULL is returned when there is no
+ * match.
+ */
+static struct ll_sa_entry *
+ll_sa_entry_get_byindex(struct ll_statahead_info *sai, __u64 index)
+{
+	struct ll_sa_entry *entry;
+
+	list_for_each_entry(entry, &sai->sai_entries, se_link) {
+		if (entry->se_index > index)
+			break;
+		if (entry->se_index != index)
+			continue;
+
+		LASSERT(atomic_read(&entry->se_refcount) > 0);
+		atomic_inc(&entry->se_refcount);
+		return entry;
+	}
+	return NULL;
+}
+
+/*
+ * Detach and release what an async getattr left on the entry: the enqueue
+ * info (its intent is released, its directory reference dropped and the
+ * structure freed) and the request reference.  Each pointer is cleared in
+ * the entry before it is released; "sai" is not used.
+ */
+static void ll_sa_entry_cleanup(struct ll_statahead_info *sai,
+				 struct ll_sa_entry *entry)
+{
+	struct md_enqueue_info *minfo = entry->se_minfo;
+	struct ptlrpc_request  *req   = entry->se_req;
+
+	if (minfo != NULL) {
+		entry->se_minfo = NULL;
+		ll_intent_release(&minfo->mi_it);
+		iput(minfo->mi_dir);
+		OBD_FREE_PTR(minfo);
+	}
+
+	if (req != NULL) {
+		entry->se_req = NULL;
+		ptlrpc_req_finished(req);
+	}
+}
+
+/*
+ * Drop one reference on the entry.  When it was the last one, the entry must
+ * already be off every list; its pending getattr state is cleaned up, its
+ * inode reference (if any) is dropped, the entry is freed and
+ * sai_cache_count is decremented.
+ */
+static void ll_sa_entry_put(struct ll_statahead_info *sai,
+			     struct ll_sa_entry *entry)
+{
+	if (!atomic_dec_and_test(&entry->se_refcount))
+		return;
+
+	CDEBUG(D_READA, "free sa entry %.*s(%p) index "LPU64"\n",
+	       entry->se_qstr.len, entry->se_qstr.name, entry,
+	       entry->se_index);
+
+	LASSERT(list_empty(&entry->se_link));
+	LASSERT(list_empty(&entry->se_list));
+	LASSERT(ll_sa_entry_unhashed(entry));
+
+	ll_sa_entry_cleanup(sai, entry);
+	if (entry->se_inode)
+		iput(entry->se_inode);
+
+	OBD_FREE(entry, entry->se_size);
+	atomic_dec(&sai->sai_cache_count);
+}
+
+/*
+ * Retire one entry: remove it from the hash, then, under lli_sa_lock, mark
+ * it SA_ENTRY_DEST and unlink it from sai_entries and from whichever
+ * received/stated list it is on.  Finally drop one reference.  The entry
+ * must still be hashed and linked on entry to this function.
+ */
+static inline void
+do_sa_entry_fini(struct ll_statahead_info *sai, struct ll_sa_entry *entry)
+{
+	struct ll_inode_info *lli = ll_i2info(sai->sai_inode);
+
+	LASSERT(!ll_sa_entry_unhashed(entry));
+	LASSERT(!list_empty(&entry->se_link));
+
+	/* The hash chain has its own lock, taken inside the helper. */
+	ll_sa_entry_unhash(sai, entry);
+
+	spin_lock(&lli->lli_sa_lock);
+	entry->se_stat = SA_ENTRY_DEST;
+	list_del_init(&entry->se_link);
+	if (likely(!list_empty(&entry->se_list)))
+		list_del_init(&entry->se_list);
+	spin_unlock(&lli->lli_sa_lock);
+
+	ll_sa_entry_put(sai, entry);
+}
+
+/*
+ * Retire "entry" (when one is given), then reap entries from the front of
+ * sai_entries for as long as is_omitted_entry() reports them as old.
+ */
+static void
+ll_sa_entry_fini(struct ll_statahead_info *sai, struct ll_sa_entry *entry)
+{
+	struct ll_sa_entry *cur, *tmp;
+
+	if (entry != NULL)
+		do_sa_entry_fini(sai, entry);
+
+	/* drop old entry, only 'scanner' process does this, no need to lock */
+	list_for_each_entry_safe(cur, tmp, &sai->sai_entries, se_link) {
+		if (is_omitted_entry(sai, cur->se_index))
+			do_sa_entry_fini(sai, cur);
+		else
+			break;
+	}
+}
+
+/*
+ * Move the entry onto sai_entries_stated, keeping that list sorted by
+ * se_index, and record its new state.
+ *
+ * Inside lli_sa_lock.
+ */
+static void
+do_sa_entry_to_stated(struct ll_statahead_info *sai,
+		      struct ll_sa_entry *entry, se_stat_t stat)
+{
+	struct ll_sa_entry *se;
+	struct list_head	 *pos = &sai->sai_entries_stated;
+
+	/* Take the entry off whatever list it is currently on. */
+	if (!list_empty(&entry->se_list))
+		list_del_init(&entry->se_list);
+
+	/* Scan from the tail for the last entry with a smaller index; the
+	 * new entry goes right after it, or at the head if there is none. */
+	list_for_each_entry_reverse(se, &sai->sai_entries_stated, se_list) {
+		if (se->se_index < entry->se_index) {
+			pos = &se->se_list;
+			break;
+		}
+	}
+
+	list_add(&entry->se_list, pos);
+	entry->se_stat = stat;
+}
+
+/*
+ * Release the entry's pending getattr state, then, under lli_sa_lock, move
+ * the entry to sai_entries_stated (sorted by index) with the given state,
+ * unless it has already been marked SA_ENTRY_DEST.
+ * \retval 1    -- entry to be destroyed.
+ * \retval 0    -- entry is inserted into stated list.
+ */
+static int
+ll_sa_entry_to_stated(struct ll_statahead_info *sai,
+		      struct ll_sa_entry *entry, se_stat_t stat)
+{
+	struct ll_inode_info *lli = ll_i2info(sai->sai_inode);
+
+	ll_sa_entry_cleanup(sai, entry);
+
+	spin_lock(&lli->lli_sa_lock);
+	if (unlikely(entry->se_stat == SA_ENTRY_DEST)) {
+		spin_unlock(&lli->lli_sa_lock);
+		return 1;
+	}
+	do_sa_entry_to_stated(sai, entry, stat);
+	spin_unlock(&lli->lli_sa_lock);
+
+	return 0;
+}
+
+/*
+ * Insert inode into the list of sai_entries_agl.
+ *
+ * The inode is queued only if its lli_agl_index is still 0, which is
+ * checked and then set to "index" under the child's lli_agl_lock.  An extra
+ * inode reference is taken for the queue and the inode is appended under
+ * the parent's lli_agl_lock.  The AGL thread is woken only when the list
+ * went from empty to non-empty.
+ *
+ * "index" is 64-bit to match the se_index values the callers pass in; a
+ * narrower parameter would silently truncate them.
+ */
+static void ll_agl_add(struct ll_statahead_info *sai,
+		       struct inode *inode, __u64 index)
+{
+	struct ll_inode_info *child  = ll_i2info(inode);
+	struct ll_inode_info *parent = ll_i2info(sai->sai_inode);
+	int		   added  = 0;
+
+	spin_lock(&child->lli_agl_lock);
+	if (child->lli_agl_index == 0) {
+		child->lli_agl_index = index;
+		spin_unlock(&child->lli_agl_lock);
+
+		LASSERT(list_empty(&child->lli_agl_list));
+
+		/* Reference owned by the AGL list. */
+		igrab(inode);
+		spin_lock(&parent->lli_agl_lock);
+		if (agl_list_empty(sai))
+			added = 1;
+		list_add_tail(&child->lli_agl_list, &sai->sai_entries_agl);
+		spin_unlock(&parent->lli_agl_lock);
+	} else {
+		/* Already queued or being processed: nothing to do. */
+		spin_unlock(&child->lli_agl_lock);
+	}
+
+	if (added > 0)
+		wake_up(&sai->sai_agl_thread.t_ctl_waitq);
+}
+
+/*
+ * Allocate and initialise a statahead instance: one reference, a fresh
+ * non-zero generation number, window size LL_SA_RPC_MIN, index 1, and all
+ * wait queues, lists and hash buckets empty.  Returns NULL when the
+ * allocation fails.
+ */
+static struct ll_statahead_info *ll_sai_alloc(void)
+{
+	struct ll_statahead_info *sai;
+	int		       bucket;
+	ENTRY;
+
+	OBD_ALLOC_PTR(sai);
+	if (sai == NULL)
+		RETURN(NULL);
+
+	atomic_set(&sai->sai_refcount, 1);
+
+	/* Take the next generation number; 0 is skipped on wrap-around. */
+	spin_lock(&sai_generation_lock);
+	if (unlikely(++sai_generation == 0))
+		++sai_generation;
+	sai->sai_generation = sai_generation;
+	spin_unlock(&sai_generation_lock);
+
+	sai->sai_max = LL_SA_RPC_MIN;
+	sai->sai_index = 1;
+
+	init_waitqueue_head(&sai->sai_waitq);
+	init_waitqueue_head(&sai->sai_thread.t_ctl_waitq);
+	init_waitqueue_head(&sai->sai_agl_thread.t_ctl_waitq);
+
+	INIT_LIST_HEAD(&sai->sai_entries);
+	INIT_LIST_HEAD(&sai->sai_entries_received);
+	INIT_LIST_HEAD(&sai->sai_entries_stated);
+	INIT_LIST_HEAD(&sai->sai_entries_agl);
+
+	for (bucket = 0; bucket < LL_SA_CACHE_SIZE; bucket++) {
+		INIT_LIST_HEAD(&sai->sai_cache[bucket]);
+		spin_lock_init(&sai->sai_cache_lock[bucket]);
+	}
+	atomic_set(&sai->sai_cache_count, 0);
+
+	RETURN(sai);
+}
+
+/* Take one more reference on the statahead instance and hand the same
+ * pointer back, so the call can be used in an initialiser. */
+static inline struct ll_statahead_info *
+ll_sai_get(struct ll_statahead_info *sai)
+{
+	struct ll_statahead_info *held = sai;
+
+	atomic_inc(&held->sai_refcount);
+	return held;
+}
+
+/*
+ * Drop one reference on the statahead instance.  When the count reaches
+ * zero (detected together with taking lli_sa_lock), the instance is
+ * detached from the directory inode, every remaining entry is retired, the
+ * directory inode reference is dropped and the instance is freed.
+ */
+static void ll_sai_put(struct ll_statahead_info *sai)
+{
+	struct inode	 *inode = sai->sai_inode;
+	struct ll_inode_info *lli   = ll_i2info(inode);
+	ENTRY;
+
+	/* On a true return the count is zero and lli_sa_lock is held. */
+	if (atomic_dec_and_lock(&sai->sai_refcount, &lli->lli_sa_lock)) {
+		struct ll_sa_entry *entry, *next;
+
+		if (unlikely(atomic_read(&sai->sai_refcount) > 0)) {
+			/* It is race case, the interpret callback just hold
+			 * a reference count */
+			spin_unlock(&lli->lli_sa_lock);
+			RETURN_EXIT;
+		}
+
+		LASSERT(lli->lli_opendir_key == NULL);
+		LASSERT(thread_is_stopped(&sai->sai_thread));
+		LASSERT(thread_is_stopped(&sai->sai_agl_thread));
+
+		/* Detach from the inode while still under the lock so that
+		 * nobody can look this instance up any more. */
+		lli->lli_sai = NULL;
+		lli->lli_opendir_pid = 0;
+		spin_unlock(&lli->lli_sa_lock);
+
+		if (sai->sai_sent > sai->sai_replied)
+			CDEBUG(D_READA,"statahead for dir "DFID" does not "
+			      "finish: [sent:"LPU64"] [replied:"LPU64"]\n",
+			      PFID(&lli->lli_fid),
+			      sai->sai_sent, sai->sai_replied);
+
+		list_for_each_entry_safe(entry, next,
+					     &sai->sai_entries, se_link)
+			do_sa_entry_fini(sai, entry);
+
+		LASSERT(list_empty(&sai->sai_entries));
+		LASSERT(sa_received_empty(sai));
+		LASSERT(list_empty(&sai->sai_entries_stated));
+
+		LASSERT(atomic_read(&sai->sai_cache_count) == 0);
+		LASSERT(agl_list_empty(sai));
+
+		iput(inode);
+		OBD_FREE_PTR(sai);
+	}
+
+	EXIT;
+}
+
+/*
+ * Run one asynchronous glimpse (AGL) for an inode that has been taken off
+ * sai_entries_agl.
+ *
+ * Do NOT forget to drop inode refcount when into sai_entries_agl: every
+ * path through this function clears lli_agl_index and drops the inode
+ * reference that was taken when the inode was queued.
+ *
+ * The glimpse is skipped when the inode's index is already behind the
+ * statahead window, when lli_glimpse_sem cannot be taken for write without
+ * blocking, or when another glimpse was recorded within the last second.
+ */
+static void ll_agl_trigger(struct inode *inode, struct ll_statahead_info *sai)
+{
+	struct ll_inode_info *lli   = ll_i2info(inode);
+	__u64		 index = lli->lli_agl_index;
+	int		   rc;
+	ENTRY;
+
+	LASSERT(list_empty(&lli->lli_agl_list));
+
+	/* AGL maybe fall behind statahead with one entry */
+	if (is_omitted_entry(sai, index + 1)) {
+		lli->lli_agl_index = 0;
+		iput(inode);
+		RETURN_EXIT;
+	}
+
+	/* Someone is in glimpse (sync or async), do nothing. */
+	rc = down_write_trylock(&lli->lli_glimpse_sem);
+	if (rc == 0) {
+		lli->lli_agl_index = 0;
+		iput(inode);
+		RETURN_EXIT;
+	}
+
+	/*
+	 * Someone triggered glimpse within 1 sec before.
+	 * 1) The former glimpse succeeded with glimpse lock granted by OST, and
+	 *    if the lock is still cached on client, AGL needs to do nothing. If
+	 *    it is cancelled by other client, AGL maybe cannot obtain new lock
+	 *    for no glimpse callback triggered by AGL.
+	 * 2) The former glimpse succeeded, but OST did not grant glimpse lock.
+	 *    Under such case, it is quite possible that the OST will not grant
+	 *    glimpse lock for AGL also.
+	 * 3) The former glimpse failed, compared with other two cases, it is
+	 *    relative rare. AGL can ignore such case, and it will not muchly
+	 *    affect the performance.
+	 */
+	if (lli->lli_glimpse_time != 0 &&
+	    cfs_time_before(cfs_time_shift(-1), lli->lli_glimpse_time)) {
+		up_write(&lli->lli_glimpse_sem);
+		lli->lli_agl_index = 0;
+		iput(inode);
+		RETURN_EXIT;
+	}
+
+	CDEBUG(D_READA, "Handling (init) async glimpse: inode = "
+	       DFID", idx = "LPU64"\n", PFID(&lli->lli_fid), index);
+
+	/* Keep the glimpse result so that the trace message below reports
+	 * it, rather than the stale trylock result left in rc. */
+	rc = cl_agl(inode);
+	lli->lli_agl_index = 0;
+	lli->lli_glimpse_time = cfs_time_current();
+	up_write(&lli->lli_glimpse_sem);
+
+	CDEBUG(D_READA, "Handled (init) async glimpse: inode= "
+	       DFID", idx = "LPU64", rc = %d\n",
+	       PFID(&lli->lli_fid), index, rc);
+
+	iput(inode);
+
+	EXIT;
+}
+
+/*
+ * Process the first entry on sai_entries_received: revalidate the lock
+ * recorded for it, instantiate or refresh the child inode from the reply,
+ * optionally queue the inode for AGL, and finally move the entry to the
+ * stated list (as SA_ENTRY_SUCC, or SA_ENTRY_INVA on any failure).  A thread
+ * waiting for exactly this index is woken up.  Returns silently when the
+ * received list is empty.
+ */
+static void ll_post_statahead(struct ll_statahead_info *sai)
+{
+	struct inode	   *dir   = sai->sai_inode;
+	struct inode	   *child;
+	struct ll_inode_info   *lli   = ll_i2info(dir);
+	struct ll_sa_entry     *entry;
+	struct md_enqueue_info *minfo;
+	struct lookup_intent   *it;
+	struct ptlrpc_request  *req;
+	struct mdt_body	*body;
+	int		     rc    = 0;
+	ENTRY;
+
+	spin_lock(&lli->lli_sa_lock);
+	if (unlikely(sa_received_empty(sai))) {
+		spin_unlock(&lli->lli_sa_lock);
+		RETURN_EXIT;
+	}
+	/* Pin the entry before the lock is dropped; the reference is
+	 * released at the end of this function. */
+	entry = sa_first_received_entry(sai);
+	atomic_inc(&entry->se_refcount);
+	list_del_init(&entry->se_list);
+	spin_unlock(&lli->lli_sa_lock);
+
+	LASSERT(entry->se_handle != 0);
+
+	minfo = entry->se_minfo;
+	it = &minfo->mi_it;
+	req = entry->se_req;
+	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+	if (body == NULL)
+		GOTO(out, rc = -EFAULT);
+
+	child = entry->se_inode;
+	if (child == NULL) {
+		/*
+		 * lookup.
+		 */
+		LASSERT(fid_is_zero(&minfo->mi_data.op_fid2));
+
+		/* XXX: No fid in reply, this is probably cross-ref case.
+		 * SA can't handle it yet. */
+		if (body->valid & OBD_MD_MDS)
+			GOTO(out, rc = -EAGAIN);
+	} else {
+		/*
+		 * revalidate.
+		 */
+		/* unlinked and re-created with the same name */
+		if (unlikely(!lu_fid_eq(&minfo->mi_data.op_fid2, &body->fid1))){
+			entry->se_inode = NULL;
+			iput(child);
+			child = NULL;
+		}
+	}
+
+	/* Put the saved lock handle back into the intent and check that the
+	 * lock can still be revalidated. */
+	it->d.lustre.it_lock_handle = entry->se_handle;
+	rc = md_revalidate_lock(ll_i2mdexp(dir), it, ll_inode2fid(dir), NULL);
+	if (rc != 1)
+		GOTO(out, rc = -EAGAIN);
+
+	rc = ll_prep_inode(&child, req, dir->i_sb, it);
+	if (rc)
+		GOTO(out, rc);
+
+	CDEBUG(D_DLMTRACE, "setting l_data to inode %p (%lu/%u)\n",
+	       child, child->i_ino, child->i_generation);
+	ll_set_lock_data(ll_i2sbi(dir)->ll_md_exp, child, it, NULL);
+
+	entry->se_inode = child;
+
+	if (agl_should_run(sai, child))
+		ll_agl_add(sai, child, entry->se_index);
+
+	EXIT;
+
+out:
+	/* The "ll_sa_entry_to_stated()" will drop related ldlm ibits lock
+	 * reference count by calling "ll_intent_drop_lock()" in spite of the
+	 * above operations failed or not. Do not worry about calling
+	 * "ll_intent_drop_lock()" more than once. */
+	rc = ll_sa_entry_to_stated(sai, entry,
+				   rc < 0 ? SA_ENTRY_INVA : SA_ENTRY_SUCC);
+	if (rc == 0 && entry->se_index == sai->sai_index_wait)
+		wake_up(&sai->sai_waitq);
+	ll_sa_entry_put(sai, entry);
+}
+
+/*
+ * Completion callback for an async statahead getattr (installed as
+ * minfo->mi_cb by sa_args_init()).
+ *
+ * On success (rc == 0) the entry takes over "minfo" and a new reference on
+ * "req", remembers the lock handle, and is queued on sai_entries_received;
+ * the statahead thread is woken if that list was empty.  On a failed reply
+ * the entry is moved straight to the stated list as SA_ENTRY_INVA.
+ *
+ * Returns rc, possibly replaced by -ENOENT (negative lookup), -ESTALE (no
+ * statahead instance or generation mismatch), -EBADFD (statahead thread
+ * not running) or -EIDRM (no entry with that index).  Whenever the result
+ * is non-zero, the intent, the directory reference and "minfo" are
+ * released here.
+ */
+static int ll_statahead_interpret(struct ptlrpc_request *req,
+				  struct md_enqueue_info *minfo, int rc)
+{
+	struct lookup_intent     *it  = &minfo->mi_it;
+	struct inode	     *dir = minfo->mi_dir;
+	struct ll_inode_info     *lli = ll_i2info(dir);
+	struct ll_statahead_info *sai = NULL;
+	struct ll_sa_entry       *entry;
+	int		       wakeup;
+	ENTRY;
+
+	if (it_disposition(it, DISP_LOOKUP_NEG))
+		rc = -ENOENT;
+
+	spin_lock(&lli->lli_sa_lock);
+	/* stale entry */
+	if (unlikely(lli->lli_sai == NULL ||
+		     lli->lli_sai->sai_generation != minfo->mi_generation)) {
+		spin_unlock(&lli->lli_sa_lock);
+		GOTO(out, rc = -ESTALE);
+	} else {
+		/* Hold the instance until the "out" label. */
+		sai = ll_sai_get(lli->lli_sai);
+		if (unlikely(!thread_is_running(&sai->sai_thread))) {
+			sai->sai_replied++;
+			spin_unlock(&lli->lli_sa_lock);
+			GOTO(out, rc = -EBADFD);
+		}
+
+		/* Returns the entry with an extra reference, dropped below. */
+		entry = ll_sa_entry_get_byindex(sai, minfo->mi_cbdata);
+		if (entry == NULL) {
+			sai->sai_replied++;
+			spin_unlock(&lli->lli_sa_lock);
+			GOTO(out, rc = -EIDRM);
+		}
+
+		if (rc != 0) {
+			do_sa_entry_to_stated(sai, entry, SA_ENTRY_INVA);
+			wakeup = (entry->se_index == sai->sai_index_wait);
+		} else {
+			entry->se_minfo = minfo;
+			entry->se_req = ptlrpc_request_addref(req);
+			/* Release the async ibits lock ASAP to avoid deadlock
+			 * when statahead thread tries to enqueue lock on parent
+			 * for readpage and other tries to enqueue lock on child
+			 * with parent's lock held, for example: unlink. */
+			entry->se_handle = it->d.lustre.it_lock_handle;
+			ll_intent_drop_lock(it);
+			wakeup = sa_received_empty(sai);
+			list_add_tail(&entry->se_list,
+					  &sai->sai_entries_received);
+		}
+		sai->sai_replied++;
+		spin_unlock(&lli->lli_sa_lock);
+
+		ll_sa_entry_put(sai, entry);
+		if (wakeup)
+			wake_up(&sai->sai_thread.t_ctl_waitq);
+	}
+
+	EXIT;
+
+out:
+	/* On success "minfo" now belongs to the entry and must stay. */
+	if (rc != 0) {
+		ll_intent_release(it);
+		iput(dir);
+		OBD_FREE_PTR(minfo);
+	}
+	if (sai != NULL)
+		ll_sai_put(sai);
+	return rc;
+}
+
+/*
+ * Undo sa_args_init() for a request that was never sent successfully: drop
+ * the directory reference and both capabilities held in "minfo", then free
+ * "minfo" and "einfo".  Both pointers must be non-NULL.
+ */
+static void sa_args_fini(struct md_enqueue_info *minfo,
+			 struct ldlm_enqueue_info *einfo)
+{
+	LASSERT(minfo && einfo);
+	iput(minfo->mi_dir);
+	capa_put(minfo->mi_data.op_capa1);
+	capa_put(minfo->mi_data.op_capa2);
+	OBD_FREE_PTR(minfo);
+	OBD_FREE_PTR(einfo);
+}
+
+/**
+ * Build the arguments for one async statahead getattr: an md_enqueue_info
+ * (operation data, IT_GETATTR intent, a reference on \a dir, the interpret
+ * callback, the current statahead generation and the entry index) and an
+ * ldlm_enqueue_info describing an IBITS lock.
+ *
+ * There is race condition between "capa_put" and "ll_statahead_interpret"
+ * for accessing "op_data.op_capa[1,2]": "capa_put" releases their reference
+ * count after calling "md_intent_getattr_async", but
+ * "ll_statahead_interpret" maybe run first and fill "op_data.op_capa[1,2]"
+ * as POISON, which would make "capa_put" access an invalid "ocapa".  So the
+ * two capabilities are also reserved in \a pcapa before
+ * "md_intent_getattr_async" is called.
+ *
+ * \retval 0       -- \a pmi, \a pei and \a pcapa[0..1] have been filled in
+ * \retval -ENOMEM -- one of the allocations failed
+ * \retval others  -- error reported by ll_prep_md_op_data()
+ */
+static int sa_args_init(struct inode *dir, struct inode *child,
+			struct ll_sa_entry *entry, struct md_enqueue_info **pmi,
+			struct ldlm_enqueue_info **pei,
+			struct obd_capa **pcapa)
+{
+	struct ll_inode_info     *lli  = ll_i2info(dir);
+	struct qstr	      *name = &entry->se_qstr;
+	struct md_enqueue_info   *minfo;
+	struct ldlm_enqueue_info *einfo;
+	struct md_op_data	*op_data;
+
+	OBD_ALLOC_PTR(einfo);
+	if (einfo == NULL)
+		return -ENOMEM;
+
+	OBD_ALLOC_PTR(minfo);
+	if (minfo == NULL) {
+		OBD_FREE_PTR(einfo);
+		return -ENOMEM;
+	}
+
+	op_data = ll_prep_md_op_data(&minfo->mi_data, dir, child, name->name,
+				     name->len, 0, LUSTRE_OPC_ANY, NULL);
+	if (IS_ERR(op_data)) {
+		int err = PTR_ERR(op_data);
+
+		OBD_FREE_PTR(einfo);
+		OBD_FREE_PTR(minfo);
+		return err;
+	}
+
+	/* Request description and completion callback. */
+	minfo->mi_it.it_op = IT_GETATTR;
+	minfo->mi_dir = igrab(dir);
+	minfo->mi_cb = ll_statahead_interpret;
+	minfo->mi_generation = lli->lli_sai->sai_generation;
+	minfo->mi_cbdata = entry->se_index;
+
+	/* Lock description. */
+	einfo->ei_type   = LDLM_IBITS;
+	einfo->ei_mode   = it_to_lock_mode(&minfo->mi_it);
+	einfo->ei_cb_bl  = ll_md_blocking_ast;
+	einfo->ei_cb_cp  = ldlm_completion_ast;
+	einfo->ei_cb_gl  = NULL;
+	einfo->ei_cbdata = NULL;
+
+	pcapa[0] = op_data->op_capa1;
+	pcapa[1] = op_data->op_capa2;
+	*pmi = minfo;
+	*pei = einfo;
+
+	return 0;
+}
+
+/*
+ * Send an async getattr for a name that has no dentry yet (no child inode is
+ * passed to sa_args_init()).  Returns 0 once the request has been issued,
+ * otherwise the error from argument setup or from
+ * md_intent_getattr_async(); in the latter case the arguments are torn down
+ * again here.
+ */
+static int do_sa_lookup(struct inode *dir, struct ll_sa_entry *entry)
+{
+	struct md_enqueue_info   *minfo;
+	struct ldlm_enqueue_info *einfo;
+	struct obd_capa	  *capas[2];
+	int		       rc;
+	ENTRY;
+
+	rc = sa_args_init(dir, NULL, entry, &minfo, &einfo, capas);
+	if (rc != 0)
+		RETURN(rc);
+
+	rc = md_intent_getattr_async(ll_i2mdexp(dir), minfo, einfo);
+	if (rc != 0) {
+		sa_args_fini(minfo, einfo);
+		RETURN(rc);
+	}
+
+	/* Sent: drop the capabilities through the copies saved beforehand. */
+	capa_put(capas[0]);
+	capa_put(capas[1]);
+
+	RETURN(rc);
+}
+
+/**
+ * similar to ll_revalidate_it().
+ *
+ * Handles a name that already has a dentry.  Negative dentries, mount
+ * points and the filesystem root are reported as valid without any RPC.
+ * Otherwise the entry takes a reference on the inode and an existing lock
+ * is tried first; only if that fails is an async getattr sent.
+ *
+ * \retval      1 -- dentry valid
+ * \retval      0 -- will send stat-ahead request
+ * \retval others -- prepare stat-ahead request failed
+ */
+static int do_sa_revalidate(struct inode *dir, struct ll_sa_entry *entry,
+			    struct dentry *dentry)
+{
+	struct inode	     *inode = dentry->d_inode;
+	struct lookup_intent      it = { .it_op = IT_GETATTR,
+					 .d.lustre.it_lock_handle = 0 };
+	struct md_enqueue_info   *minfo;
+	struct ldlm_enqueue_info *einfo;
+	struct obd_capa	  *capas[2];
+	int rc;
+	ENTRY;
+
+	if (unlikely(inode == NULL))
+		RETURN(1);
+
+	if (d_mountpoint(dentry))
+		RETURN(1);
+
+	if (unlikely(dentry == dentry->d_sb->s_root))
+		RETURN(1);
+
+	/* This reference stays with the entry unless sending fails below. */
+	entry->se_inode = igrab(inode);
+	rc = md_revalidate_lock(ll_i2mdexp(dir), &it, ll_inode2fid(inode),NULL);
+	if (rc == 1) {
+		/* Remember the lock handle, then release the intent. */
+		entry->se_handle = it.d.lustre.it_lock_handle;
+		ll_intent_release(&it);
+		RETURN(1);
+	}
+
+	rc = sa_args_init(dir, inode, entry, &minfo, &einfo, capas);
+	if (rc) {
+		entry->se_inode = NULL;
+		iput(inode);
+		RETURN(rc);
+	}
+
+	rc = md_intent_getattr_async(ll_i2mdexp(dir), minfo, einfo);
+	if (!rc) {
+		/* Sent: drop the capabilities through the saved copies. */
+		capa_put(capas[0]);
+		capa_put(capas[1]);
+	} else {
+		entry->se_inode = NULL;
+		iput(inode);
+		sa_args_fini(minfo, einfo);
+	}
+
+	RETURN(rc);
+}
+
+/*
+ * Statahead one directory name: allocate an entry at the current
+ * sai_index, then either send a lookup (no dentry cached) or revalidate the
+ * cached dentry.  If no RPC is in flight afterwards (the dentry was already
+ * valid, or preparing the request failed), the entry is moved to the stated
+ * list at once and a thread waiting for this index is woken; otherwise
+ * sai_sent is incremented.  sai_index always advances, except when the
+ * entry allocation itself fails.
+ */
+static void ll_statahead_one(struct dentry *parent, const char* entry_name,
+			     int entry_name_len)
+{
+	struct inode	     *dir    = parent->d_inode;
+	struct ll_inode_info     *lli    = ll_i2info(dir);
+	struct ll_statahead_info *sai    = lli->lli_sai;
+	struct dentry	    *dentry = NULL;
+	struct ll_sa_entry       *entry;
+	int		       rc;
+	int		       rc1;
+	ENTRY;
+
+	entry = ll_sa_entry_alloc(sai, sai->sai_index, entry_name,
+				  entry_name_len);
+	if (IS_ERR(entry))
+		RETURN_EXIT;
+
+	dentry = d_lookup(parent, &entry->se_qstr);
+	if (!dentry) {
+		rc = do_sa_lookup(dir, entry);
+	} else {
+		rc = do_sa_revalidate(dir, entry, dentry);
+		/* Already valid: the size glimpse can be queued right away. */
+		if (rc == 1 && agl_should_run(sai, dentry->d_inode))
+			ll_agl_add(sai, dentry->d_inode, entry->se_index);
+	}
+
+	if (dentry != NULL)
+		dput(dentry);
+
+	/* rc > 0: valid without RPC; rc < 0: failure; rc == 0: RPC sent. */
+	if (rc) {
+		rc1 = ll_sa_entry_to_stated(sai, entry,
+					rc < 0 ? SA_ENTRY_INVA : SA_ENTRY_SUCC);
+		if (rc1 == 0 && entry->se_index == sai->sai_index_wait)
+			wake_up(&sai->sai_waitq);
+	} else {
+		sai->sai_sent++;
+	}
+
+	sai->sai_index++;
+	/* drop one refcount on entry by ll_sa_entry_alloc */
+	ll_sa_entry_put(sai, entry);
+
+	EXIT;
+}
+
+/*
+ * AGL helper thread body, started by ll_start_agl() via kthread_run().
+ *
+ * \param arg  the parent directory dentry (struct dentry *)
+ *
+ * Marks itself SVC_RUNNING, then repeatedly pops inodes from the sai AGL
+ * list and hands them to ll_agl_trigger() until it is no longer running.
+ * On exit it drains the list (dropping the queued inode references), marks
+ * itself SVC_STOPPED and wakes whoever waits on t_ctl_waitq.
+ *
+ * \retval 0 always
+ */
+static int ll_agl_thread(void *arg)
+{
+	struct dentry	    *parent = (struct dentry *)arg;
+	struct inode	     *dir    = parent->d_inode;
+	struct ll_inode_info     *plli   = ll_i2info(dir);
+	struct ll_inode_info     *clli;
+	struct ll_sb_info	*sbi    = ll_i2sbi(dir);
+	struct ll_statahead_info *sai    = ll_sai_get(plli->lli_sai);
+	struct ptlrpc_thread     *thread = &sai->sai_agl_thread;
+	struct l_wait_info	lwi    = { 0 };
+	ENTRY;
+
+	CDEBUG(D_READA, "agl thread started: [pid %d] [parent %.*s]\n",
+	       current_pid(), parent->d_name.len, parent->d_name.name);
+
+	atomic_inc(&sbi->ll_agl_total);
+	spin_lock(&plli->lli_agl_lock);
+	sai->sai_agl_valid = 1;
+	thread_set_flags(thread, SVC_RUNNING);
+	spin_unlock(&plli->lli_agl_lock);
+	wake_up(&thread->t_ctl_waitq);
+
+	while (1) {
+		l_wait_event(thread->t_ctl_waitq,
+			     !agl_list_empty(sai) ||
+			     !thread_is_running(thread),
+			     &lwi);
+
+		if (!thread_is_running(thread))
+			break;
+
+		spin_lock(&plli->lli_agl_lock);
+		/* The statahead thread maybe help to process AGL entries,
+		 * so check whether list empty again. */
+		if (!agl_list_empty(sai)) {
+			clli = agl_first_entry(sai);
+			list_del_init(&clli->lli_agl_list);
+			spin_unlock(&plli->lli_agl_lock);
+			ll_agl_trigger(&clli->lli_vfs_inode, sai);
+		} else {
+			spin_unlock(&plli->lli_agl_lock);
+		}
+	}
+
+	/*
+	 * Log before announcing SVC_STOPPED: this thread holds no reference
+	 * on parent, and the statahead thread waits for that flag before it
+	 * goes on to dput(parent), so parent->d_name must not be read once
+	 * the flag is visible.
+	 */
+	CDEBUG(D_READA, "agl thread stopped: [pid %d] [parent %.*s]\n",
+	       current_pid(), parent->d_name.len, parent->d_name.name);
+
+	spin_lock(&plli->lli_agl_lock);
+	sai->sai_agl_valid = 0;
+	/* Drain entries that were queued but never triggered. */
+	while (!agl_list_empty(sai)) {
+		clli = agl_first_entry(sai);
+		list_del_init(&clli->lli_agl_list);
+		spin_unlock(&plli->lli_agl_lock);
+		clli->lli_agl_index = 0;
+		iput(&clli->lli_vfs_inode);
+		spin_lock(&plli->lli_agl_lock);
+	}
+	thread_set_flags(thread, SVC_STOPPED);
+	spin_unlock(&plli->lli_agl_lock);
+	wake_up(&thread->t_ctl_waitq);
+	ll_sai_put(sai);
+	RETURN(0);
+}
+
+/*
+ * Spawn the AGL thread for \a parent and wait until it reports either
+ * SVC_RUNNING or SVC_STOPPED.  If the thread cannot be created the AGL
+ * thread state is set to SVC_STOPPED and the function returns.
+ */
+static void ll_start_agl(struct dentry *parent, struct ll_statahead_info *sai)
+{
+	struct ll_inode_info  *plli = ll_i2info(parent->d_inode);
+	struct ptlrpc_thread  *agl  = &sai->sai_agl_thread;
+	struct l_wait_info     lwi  = { 0 };
+	task_t		*task;
+	ENTRY;
+
+	CDEBUG(D_READA, "start agl thread: [pid %d] [parent %.*s]\n",
+	       current_pid(), parent->d_name.len, parent->d_name.name);
+
+	task = kthread_run(ll_agl_thread, parent,
+			   "ll_agl_%u", plli->lli_opendir_pid);
+	if (IS_ERR(task)) {
+		long err = PTR_ERR(task);
+
+		CERROR("can't start ll_agl thread, rc: %ld\n", err);
+		thread_set_flags(agl, SVC_STOPPED);
+		RETURN_EXIT;
+	}
+
+	/* Block until the new thread has published its state. */
+	l_wait_event(agl->t_ctl_waitq,
+		     thread_is_running(agl) || thread_is_stopped(agl),
+		     &lwi);
+	EXIT;
+}
+
+/*
+ * Statahead thread body, started by do_statahead_enter() via kthread_run().
+ *
+ * \param arg  the parent directory dentry (struct dentry *); the reference
+ *	     taken by the starter is dropped here with dput().
+ *
+ * Walks the directory pages of the parent, skipping ".", ".." and (unless
+ * sai_ls_all is set) hidden names as well as the very first real entry, and
+ * calls ll_statahead_one() for every remaining name.  While the send window
+ * is full it interprets received replies and helps to trigger AGL entries.
+ * On exit it stops the AGL thread, releases received entries and marks
+ * itself SVC_STOPPED.
+ *
+ * \retval 0       the thread stopped normally or reached end of directory
+ * \retval others  error returned by ll_get_dir_page()
+ */
+static int ll_statahead_thread(void *arg)
+{
+	struct dentry	    *parent = (struct dentry *)arg;
+	struct inode	     *dir    = parent->d_inode;
+	struct ll_inode_info     *plli   = ll_i2info(dir);
+	struct ll_inode_info     *clli;
+	struct ll_sb_info	*sbi    = ll_i2sbi(dir);
+	struct ll_statahead_info *sai    = ll_sai_get(plli->lli_sai);
+	struct ptlrpc_thread     *thread = &sai->sai_thread;
+	struct ptlrpc_thread *agl_thread = &sai->sai_agl_thread;
+	struct page	      *page;
+	__u64		     pos    = 0;
+	int		       first  = 0;
+	int		       rc     = 0;
+	struct ll_dir_chain       chain;
+	struct l_wait_info	lwi    = { 0 };
+	ENTRY;
+
+	CDEBUG(D_READA, "statahead thread started: [pid %d] [parent %.*s]\n",
+	       current_pid(), parent->d_name.len, parent->d_name.name);
+
+	if (sbi->ll_flags & LL_SBI_AGL_ENABLED)
+		ll_start_agl(parent, sai);
+
+	atomic_inc(&sbi->ll_sa_total);
+	spin_lock(&plli->lli_sa_lock);
+	thread_set_flags(thread, SVC_RUNNING);
+	spin_unlock(&plli->lli_sa_lock);
+	wake_up(&thread->t_ctl_waitq);
+
+	ll_dir_chain_init(&chain);
+	page = ll_get_dir_page(dir, pos, &chain);
+
+	while (1) {
+		struct lu_dirpage *dp;
+		struct lu_dirent  *ent;
+
+		if (IS_ERR(page)) {
+			rc = PTR_ERR(page);
+			CDEBUG(D_READA, "error reading dir "DFID" at "LPU64
+			       "/"LPU64": [rc %d] [parent %u]\n",
+			       PFID(ll_inode2fid(dir)), pos, sai->sai_index,
+			       rc, plli->lli_opendir_pid);
+			GOTO(out, rc);
+		}
+
+		dp = page_address(page);
+		for (ent = lu_dirent_start(dp); ent != NULL;
+		     ent = lu_dirent_next(ent)) {
+			__u64 hash;
+			int namelen;
+			char *name;
+
+			hash = le64_to_cpu(ent->lde_hash);
+			if (unlikely(hash < pos))
+				/*
+				 * Skip until we find target hash value.
+				 */
+				continue;
+
+			namelen = le16_to_cpu(ent->lde_namelen);
+			if (unlikely(namelen == 0))
+				/*
+				 * Skip dummy record.
+				 */
+				continue;
+
+			name = ent->lde_name;
+			if (name[0] == '.') {
+				if (namelen == 1) {
+					/*
+					 * skip "."
+					 */
+					continue;
+				} else if (name[1] == '.' && namelen == 2) {
+					/*
+					 * skip ".."
+					 */
+					continue;
+				} else if (!sai->sai_ls_all) {
+					/*
+					 * skip hidden files.
+					 */
+					sai->sai_skip_hidden++;
+					continue;
+				}
+			}
+
+			/*
+			 * don't stat-ahead first entry.
+			 */
+			if (unlikely(++first == 1))
+				continue;
+
+keep_it:
+			/* Sleep until there is window, a reply, AGL work, or
+			 * a stop request. */
+			l_wait_event(thread->t_ctl_waitq,
+				     !sa_sent_full(sai) ||
+				     !sa_received_empty(sai) ||
+				     !agl_list_empty(sai) ||
+				     !thread_is_running(thread),
+				     &lwi);
+
+interpret_it:
+			while (!sa_received_empty(sai))
+				ll_post_statahead(sai);
+
+			if (unlikely(!thread_is_running(thread))) {
+				ll_release_page(page, 0);
+				GOTO(out, rc = 0);
+			}
+
+			/* If no window for metadata statahead, but there are
+			 * some AGL entries to be triggered, then try to help
+			 * to process the AGL entries. */
+			if (sa_sent_full(sai)) {
+				spin_lock(&plli->lli_agl_lock);
+				while (!agl_list_empty(sai)) {
+					clli = agl_first_entry(sai);
+					list_del_init(&clli->lli_agl_list);
+					spin_unlock(&plli->lli_agl_lock);
+					ll_agl_trigger(&clli->lli_vfs_inode,
+						       sai);
+
+					if (!sa_received_empty(sai))
+						goto interpret_it;
+
+					if (unlikely(
+						!thread_is_running(thread))) {
+						ll_release_page(page, 0);
+						GOTO(out, rc = 0);
+					}
+
+					if (!sa_sent_full(sai))
+						goto do_it;
+
+					spin_lock(&plli->lli_agl_lock);
+				}
+				spin_unlock(&plli->lli_agl_lock);
+
+				goto keep_it;
+			}
+
+do_it:
+			ll_statahead_one(parent, name, namelen);
+		}
+		pos = le64_to_cpu(dp->ldp_hash_end);
+		if (pos == MDS_DIR_END_OFF) {
+			/*
+			 * End of directory reached.
+			 */
+			ll_release_page(page, 0);
+			/* Wait for every sent request to be replied and
+			 * interpreted before leaving. */
+			while (1) {
+				l_wait_event(thread->t_ctl_waitq,
+					     !sa_received_empty(sai) ||
+					     sai->sai_sent == sai->sai_replied||
+					     !thread_is_running(thread),
+					     &lwi);
+
+				while (!sa_received_empty(sai))
+					ll_post_statahead(sai);
+
+				if (unlikely(!thread_is_running(thread)))
+					GOTO(out, rc = 0);
+
+				if (sai->sai_sent == sai->sai_replied &&
+				    sa_received_empty(sai))
+					break;
+			}
+
+			spin_lock(&plli->lli_agl_lock);
+			while (!agl_list_empty(sai) &&
+			       thread_is_running(thread)) {
+				clli = agl_first_entry(sai);
+				list_del_init(&clli->lli_agl_list);
+				spin_unlock(&plli->lli_agl_lock);
+				ll_agl_trigger(&clli->lli_vfs_inode, sai);
+				spin_lock(&plli->lli_agl_lock);
+			}
+			spin_unlock(&plli->lli_agl_lock);
+
+			GOTO(out, rc = 0);
+		} else if (1) {
+			/*
+			 * chain is exhausted.
+			 * Normal case: continue to the next page.
+			 */
+			ll_release_page(page, le32_to_cpu(dp->ldp_flags) &
+					      LDF_COLLIDE);
+			sai->sai_in_readpage = 1;
+			page = ll_get_dir_page(dir, pos, &chain);
+			sai->sai_in_readpage = 0;
+		} else {
+			LASSERT(le32_to_cpu(dp->ldp_flags) & LDF_COLLIDE);
+			ll_release_page(page, 1);
+			/*
+			 * go into overflow page.
+			 */
+		}
+	}
+	EXIT;
+
+out:
+	if (sai->sai_agl_valid) {
+		spin_lock(&plli->lli_agl_lock);
+		thread_set_flags(agl_thread, SVC_STOPPING);
+		spin_unlock(&plli->lli_agl_lock);
+		wake_up(&agl_thread->t_ctl_waitq);
+
+		CDEBUG(D_READA, "stop agl thread: [pid %d]\n",
+		       current_pid());
+		l_wait_event(agl_thread->t_ctl_waitq,
+			     thread_is_stopped(agl_thread),
+			     &lwi);
+	} else {
+		/* Set agl_thread flags anyway. */
+		thread_set_flags(&sai->sai_agl_thread, SVC_STOPPED);
+	}
+	ll_dir_chain_fini(&chain);
+	spin_lock(&plli->lli_sa_lock);
+	if (!sa_received_empty(sai)) {
+		thread_set_flags(thread, SVC_STOPPING);
+		spin_unlock(&plli->lli_sa_lock);
+
+		/* To release the resources held by received entries. */
+		while (!sa_received_empty(sai))
+			ll_post_statahead(sai);
+
+		spin_lock(&plli->lli_sa_lock);
+	}
+	thread_set_flags(thread, SVC_STOPPED);
+	spin_unlock(&plli->lli_sa_lock);
+	wake_up(&sai->sai_waitq);
+	wake_up(&thread->t_ctl_waitq);
+	ll_sai_put(sai);
+	/*
+	 * Log while our reference on parent is still held: dput() below may
+	 * drop the last reference, after which parent->d_name must not be
+	 * dereferenced.
+	 */
+	CDEBUG(D_READA, "statahead thread stopped: [pid %d] [parent %.*s]\n",
+	       current_pid(), parent->d_name.len, parent->d_name.name);
+	dput(parent);
+	return rc;
+}
+
+/**
+ * Stop statahead for \a dir on behalf of the opener identified by \a key.
+ * Called in ll_file_release().
+ *
+ * Does nothing unless \a key is the current lli_opendir_key and an opendir
+ * pid is recorded.  If a statahead instance exists, its thread is asked to
+ * stop and waited for, then one sai reference is dropped; otherwise only
+ * the recorded opendir pid is cleared.
+ */
+void ll_stop_statahead(struct inode *dir, void *key)
+{
+	struct ll_inode_info *lli = ll_i2info(dir);
+	struct l_wait_info    lwi = { 0 };
+	struct ptlrpc_thread *sa_thread;
+
+	if (unlikely(key == NULL))
+		return;
+
+	spin_lock(&lli->lli_sa_lock);
+	if (lli->lli_opendir_key != key || lli->lli_opendir_pid == 0) {
+		spin_unlock(&lli->lli_sa_lock);
+		return;
+	}
+
+	lli->lli_opendir_key = NULL;
+
+	if (!lli->lli_sai) {
+		/* No statahead instance: just forget the opener. */
+		lli->lli_opendir_pid = 0;
+		spin_unlock(&lli->lli_sa_lock);
+		return;
+	}
+
+	sa_thread = &lli->lli_sai->sai_thread;
+	if (thread_is_stopped(sa_thread)) {
+		spin_unlock(&lli->lli_sa_lock);
+	} else {
+		thread_set_flags(sa_thread, SVC_STOPPING);
+		spin_unlock(&lli->lli_sa_lock);
+		wake_up(&sa_thread->t_ctl_waitq);
+
+		CDEBUG(D_READA, "stop statahead thread: [pid %d]\n",
+		       current_pid());
+		l_wait_event(sa_thread->t_ctl_waitq,
+			     thread_is_stopped(sa_thread),
+			     &lwi);
+	}
+
+	/*
+	 * Put the ref which was held when first statahead_enter.
+	 * It maybe not the last ref for some statahead requests
+	 * maybe inflight.
+	 */
+	ll_sai_put(lli->lli_sai);
+}
+
+/* Result codes of is_first_dirent(). */
+enum {
+	/**
+	 * the looked-up name is not the first dirent of the directory
+	 * ("." and ".." are never counted as the first dirent)
+	 */
+	LS_NONE_FIRST_DE = 0,
+	/**
+	 * the looked-up name is the first dirent and is not hidden
+	 */
+	LS_FIRST_DE,
+	/**
+	 * the looked-up name is the first dirent and is hidden, i.e. its
+	 * name starts with '.' (but is neither "." nor "..")
+	 */
+	LS_FIRST_DOT_DE
+};
+
+/*
+ * Decide whether \a dentry names the first "real" entry of directory \a dir.
+ *
+ * The directory is read page by page starting at hash 0.  ".", ".." and
+ * dummy (zero-length) records are skipped; hidden entries are skipped too
+ * when the target name itself does not start with '.'.  The first entry that
+ * survives these filters is compared with the target name and the scan ends.
+ *
+ * \retval LS_FIRST_DE       target is the first entry and is not hidden
+ * \retval LS_FIRST_DOT_DE   target is the first entry and is hidden
+ * \retval LS_NONE_FIRST_DE  target is not the first entry, or the end of the
+ *			   directory was reached without a candidate
+ * \retval others	    PTR_ERR() of a failed ll_get_dir_page()
+ */
+static int is_first_dirent(struct inode *dir, struct dentry *dentry)
+{
+	struct ll_dir_chain   chain;
+	struct qstr	  *target = &dentry->d_name;
+	struct page	  *page;
+	__u64		 pos    = 0;
+	int		   dot_de;
+	int		   rc     = LS_NONE_FIRST_DE;
+	ENTRY;
+
+	ll_dir_chain_init(&chain);
+	page = ll_get_dir_page(dir, pos, &chain);
+
+	while (1) {
+		struct lu_dirpage *dp;
+		struct lu_dirent  *ent;
+
+		if (IS_ERR(page)) {
+			struct ll_inode_info *lli = ll_i2info(dir);
+
+			/* The read error is handed back to the caller in rc. */
+			rc = PTR_ERR(page);
+			CERROR("error reading dir "DFID" at "LPU64": "
+			       "[rc %d] [parent %u]\n",
+			       PFID(ll_inode2fid(dir)), pos,
+			       rc, lli->lli_opendir_pid);
+			break;
+		}
+
+		dp = page_address(page);
+		for (ent = lu_dirent_start(dp); ent != NULL;
+		     ent = lu_dirent_next(ent)) {
+			__u64 hash;
+			int namelen;
+			char *name;
+
+			hash = le64_to_cpu(ent->lde_hash);
+			/* The ll_get_dir_page() can return any page containing
+			 * the given hash which may be not the start hash. */
+			if (unlikely(hash < pos))
+				continue;
+
+			namelen = le16_to_cpu(ent->lde_namelen);
+			if (unlikely(namelen == 0))
+				/*
+				 * skip dummy record.
+				 */
+				continue;
+
+			name = ent->lde_name;
+			if (name[0] == '.') {
+				if (namelen == 1)
+					/*
+					 * skip "."
+					 */
+					continue;
+				else if (name[1] == '.' && namelen == 2)
+					/*
+					 * skip ".."
+					 */
+					continue;
+				else
+					dot_de = 1;
+			} else {
+				dot_de = 0;
+			}
+
+			/* A non-hidden target ignores hidden entries when
+			 * looking for the first one. */
+			if (dot_de && target->name[0] != '.') {
+				CDEBUG(D_READA, "%.*s skip hidden file %.*s\n",
+				       target->len, target->name,
+				       namelen, name);
+				continue;
+			}
+
+			/* First candidate found: classify and stop scanning. */
+			if (target->len != namelen ||
+			    memcmp(target->name, name, namelen) != 0)
+				rc = LS_NONE_FIRST_DE;
+			else if (!dot_de)
+				rc = LS_FIRST_DE;
+			else
+				rc = LS_FIRST_DOT_DE;
+
+			ll_release_page(page, 0);
+			GOTO(out, rc);
+		}
+		pos = le64_to_cpu(dp->ldp_hash_end);
+		if (pos == MDS_DIR_END_OFF) {
+			/*
+			 * End of directory reached.
+			 */
+			ll_release_page(page, 0);
+			break;
+		} else if (1) {
+			/*
+			 * chain is exhausted
+			 * Normal case: continue to the next page.
+			 */
+			ll_release_page(page, le32_to_cpu(dp->ldp_flags) &
+					      LDF_COLLIDE);
+			page = ll_get_dir_page(dir, pos, &chain);
+		} else {
+			/*
+			 * go into overflow page.
+			 * NOTE(review): unreachable because of "else if (1)"
+			 * above.
+			 */
+			LASSERT(le32_to_cpu(dp->ldp_flags) & LDF_COLLIDE);
+			ll_release_page(page, 1);
+		}
+	}
+	EXIT;
+
+out:
+	ll_dir_chain_fini(&chain);
+	return rc;
+}
+
+/*
+ * Finish with \a entry (which may be NULL) and update the hit/miss
+ * accounting of \a sai.  A hit doubles the statahead window up to
+ * sbi->ll_sa_max; a miss may ask the statahead thread to stop when the hit
+ * ratio is too low.  The statahead thread is woken unless already stopped.
+ */
+static void
+ll_sai_unplug(struct ll_statahead_info *sai, struct ll_sa_entry *entry)
+{
+	struct ll_inode_info *lli    = ll_i2info(sai->sai_inode);
+	struct ll_sb_info    *sbi    = ll_i2sbi(sai->sai_inode);
+	struct ptlrpc_thread *thread = &sai->sai_thread;
+	int		   hit;
+	ENTRY;
+
+	/* Sample the state before the entry is finished below. */
+	hit = (entry != NULL && entry->se_stat == SA_ENTRY_SUCC);
+
+	ll_sa_entry_fini(sai, entry);
+	if (!hit) {
+		sai->sai_miss++;
+		sai->sai_consecutive_miss++;
+		if (sa_low_hit(sai) && thread_is_running(thread)) {
+			atomic_inc(&sbi->ll_sa_wrong);
+			CDEBUG(D_READA, "Statahead for dir "DFID" hit "
+			       "ratio too low: hit/miss "LPU64"/"LPU64
+			       ", sent/replied "LPU64"/"LPU64", stopping "
+			       "statahead thread: pid %d\n",
+			       PFID(&lli->lli_fid), sai->sai_hit,
+			       sai->sai_miss, sai->sai_sent,
+			       sai->sai_replied, current_pid());
+			spin_lock(&lli->lli_sa_lock);
+			if (!thread_is_stopped(thread))
+				thread_set_flags(thread, SVC_STOPPING);
+			spin_unlock(&lli->lli_sa_lock);
+		}
+	} else {
+		sai->sai_hit++;
+		sai->sai_consecutive_miss = 0;
+		sai->sai_max = min(2 * sai->sai_max, sbi->ll_sa_max);
+	}
+
+	if (!thread_is_stopped(thread))
+		wake_up(&thread->t_ctl_waitq);
+
+	EXIT;
+}
+
+/**
+ * Start statahead thread if this is the first dir entry.
+ * Otherwise if a thread is started already, wait it until it is ahead of me.
+ *
+ * \param dir	  parent directory inode
+ * \param dentryp      in/out: dentry being looked up; may be replaced by the
+ *		     alias returned from ll_splice_alias()
+ * \param only_unplug  if set, only account the entry and do not wait for it
+ *
+ * \retval 1       -- find entry with lock in cache, the caller needs to do
+ *		    nothing.
+ * \retval 0       -- find entry in cache, but without lock, the caller needs
+ *		    refresh from MDS.
+ * \retval others  -- the caller need to process as non-statahead.
+ */
+int do_statahead_enter(struct inode *dir, struct dentry **dentryp,
+		       int only_unplug)
+{
+	struct ll_inode_info     *lli   = ll_i2info(dir);
+	struct ll_statahead_info *sai   = lli->lli_sai;
+	struct dentry	    *parent;
+	struct ll_sa_entry       *entry;
+	struct ptlrpc_thread     *thread;
+	struct l_wait_info	lwi   = { 0 };
+	int		       rc    = 0;
+	struct ll_inode_info     *plli;
+	struct task_struct       *task;
+	ENTRY;
+
+	LASSERT(lli->lli_opendir_pid == current_pid());
+
+	if (sai) {
+		thread = &sai->sai_thread;
+		if (unlikely(thread_is_stopped(thread) &&
+			     list_empty(&sai->sai_entries_stated))) {
+			/* to release resource */
+			ll_stop_statahead(dir, lli->lli_opendir_key);
+			RETURN(-EAGAIN);
+		}
+
+		if ((*dentryp)->d_name.name[0] == '.') {
+			if (sai->sai_ls_all ||
+			    sai->sai_miss_hidden >= sai->sai_skip_hidden) {
+				/*
+				 * Hidden dentry is the first one, or statahead
+				 * thread does not skip so many hidden dentries
+				 * before "sai_ls_all" enabled as below.
+				 */
+			} else {
+				if (!sai->sai_ls_all)
+					/*
+					 * It maybe because hidden dentry is not
+					 * the first one, "sai_ls_all" was not
+					 * set, then "ls -al" missed. Enable
+					 * "sai_ls_all" for such case.
+					 */
+					sai->sai_ls_all = 1;
+
+				/*
+				 * Such "getattr" has been skipped before
+				 * "sai_ls_all" enabled as above.
+				 */
+				sai->sai_miss_hidden++;
+				RETURN(-EAGAIN);
+			}
+		}
+
+		entry = ll_sa_entry_get_byname(sai, &(*dentryp)->d_name);
+		if (entry == NULL || only_unplug) {
+			ll_sai_unplug(sai, entry);
+			RETURN(entry ? 1 : -EAGAIN);
+		}
+
+		/* if statahead is busy in readdir, help it do post-work */
+		while (!ll_sa_entry_stated(entry) &&
+		       sai->sai_in_readpage &&
+		       !sa_received_empty(sai))
+			ll_post_statahead(sai);
+
+		if (!ll_sa_entry_stated(entry)) {
+			/* Tell the statahead side which entry we wait for. */
+			sai->sai_index_wait = entry->se_index;
+			lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(30), NULL,
+					       LWI_ON_SIGNAL_NOOP, NULL);
+			rc = l_wait_event(sai->sai_waitq,
+					  ll_sa_entry_stated(entry) ||
+					  thread_is_stopped(thread),
+					  &lwi);
+			if (rc < 0) {
+				ll_sai_unplug(sai, entry);
+				RETURN(-EAGAIN);
+			}
+		}
+
+		if (entry->se_stat == SA_ENTRY_SUCC &&
+		    entry->se_inode != NULL) {
+			struct inode *inode = entry->se_inode;
+			struct lookup_intent it = { .it_op = IT_GETATTR,
+						    .d.lustre.it_lock_handle =
+						     entry->se_handle };
+			__u64 bits;
+
+			rc = md_revalidate_lock(ll_i2mdexp(dir), &it,
+						ll_inode2fid(inode), &bits);
+			if (rc == 1) {
+				if ((*dentryp)->d_inode == NULL) {
+					*dentryp = ll_splice_alias(inode,
+								   *dentryp);
+				} else if ((*dentryp)->d_inode != inode) {
+					/* revalidate, but inode is recreated */
+					CDEBUG(D_READA,
+					      "stale dentry %.*s inode %lu/%u, "
+					      "statahead inode %lu/%u\n",
+					      (*dentryp)->d_name.len,
+					      (*dentryp)->d_name.name,
+					      (*dentryp)->d_inode->i_ino,
+					      (*dentryp)->d_inode->i_generation,
+					      inode->i_ino,
+					      inode->i_generation);
+					/* Drop what md_revalidate_lock() put
+					 * into the intent, as the success
+					 * path below does, before bailing
+					 * out. */
+					ll_intent_release(&it);
+					ll_sai_unplug(sai, entry);
+					RETURN(-ESTALE);
+				} else {
+					iput(inode);
+				}
+				entry->se_inode = NULL;
+
+				if ((bits & MDS_INODELOCK_LOOKUP) &&
+				    d_lustre_invalid(*dentryp))
+					d_lustre_revalidate(*dentryp);
+				ll_intent_release(&it);
+			}
+		}
+
+		ll_sai_unplug(sai, entry);
+		RETURN(rc);
+	}
+
+	/* I am the "lli_opendir_pid" owner, only me can set "lli_sai". */
+	rc = is_first_dirent(dir, *dentryp);
+	if (rc == LS_NONE_FIRST_DE)
+		/* It is not "ls -{a}l" operation, no need statahead for it. */
+		GOTO(out, rc = -EAGAIN);
+
+	sai = ll_sai_alloc();
+	if (sai == NULL)
+		GOTO(out, rc = -ENOMEM);
+
+	sai->sai_ls_all = (rc == LS_FIRST_DOT_DE);
+	sai->sai_inode = igrab(dir);
+	if (unlikely(sai->sai_inode == NULL)) {
+		CWARN("Do not start stat ahead on dying inode "DFID"\n",
+		      PFID(&lli->lli_fid));
+		GOTO(out, rc = -ESTALE);
+	}
+
+	/* get parent reference count here, and put it in ll_statahead_thread */
+	parent = dget((*dentryp)->d_parent);
+	if (unlikely(sai->sai_inode != parent->d_inode)) {
+		struct ll_inode_info *nlli = ll_i2info(parent->d_inode);
+
+		CWARN("Race condition, someone changed %.*s just now: "
+		      "old parent "DFID", new parent "DFID"\n",
+		      (*dentryp)->d_name.len, (*dentryp)->d_name.name,
+		      PFID(&lli->lli_fid), PFID(&nlli->lli_fid));
+		dput(parent);
+		iput(sai->sai_inode);
+		GOTO(out, rc = -EAGAIN);
+	}
+
+	CDEBUG(D_READA, "start statahead thread: [pid %d] [parent %.*s]\n",
+	       current_pid(), parent->d_name.len, parent->d_name.name);
+
+	lli->lli_sai = sai;
+
+	plli = ll_i2info(parent->d_inode);
+	/*
+	 * Test the returned pointer itself with IS_ERR(): squeezing a valid
+	 * task pointer through PTR_ERR() into an int and then applying
+	 * IS_ERR_VALUE() can misreport success as failure on 64-bit.
+	 */
+	task = kthread_run(ll_statahead_thread, parent,
+			   "ll_sa_%u", plli->lli_opendir_pid);
+	thread = &sai->sai_thread;
+	if (IS_ERR(task)) {
+		rc = PTR_ERR(task);
+		CERROR("can't start ll_sa thread, rc: %d\n", rc);
+		dput(parent);
+		lli->lli_opendir_key = NULL;
+		thread_set_flags(thread, SVC_STOPPED);
+		thread_set_flags(&sai->sai_agl_thread, SVC_STOPPED);
+		ll_sai_put(sai);
+		LASSERT(lli->lli_sai == NULL);
+		RETURN(-EAGAIN);
+	}
+
+	l_wait_event(thread->t_ctl_waitq,
+		     thread_is_running(thread) || thread_is_stopped(thread),
+		     &lwi);
+
+	/*
+	 * We don't stat-ahead for the first dirent since we are already in
+	 * lookup.
+	 */
+	RETURN(-EAGAIN);
+
+out:
+	if (sai != NULL)
+		OBD_FREE_PTR(sai);
+	spin_lock(&lli->lli_sa_lock);
+	lli->lli_opendir_key = NULL;
+	lli->lli_opendir_pid = 0;
+	spin_unlock(&lli->lli_sa_lock);
+	return rc;
+}

diff --git a/drivers/staging/lustre/lustre/llite/super25.c b/drivers/staging/lustre/lustre/llite/super25.c
new file mode 100644
index 0000000..82c14a9
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/super25.c

@@ -0,0 +1,226 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/version.h>
+#include <lustre_lite.h>
+#include <lustre_ha.h>
+#include <lustre_dlm.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <lprocfs_status.h>
+#include "llite_internal.h"
+
+static struct kmem_cache *ll_inode_cachep;
+
+/*
+ * super_operations::alloc_inode: allocate a ll_inode_info from the inode
+ * slab cache and return the VFS inode embedded in it.
+ *
+ * \retval pointer to the embedded inode, or NULL if the allocation failed
+ */
+static struct inode *ll_alloc_inode(struct super_block *sb)
+{
+	struct ll_inode_info *lli;
+	ll_stats_ops_tally(ll_s2sbi(sb), LPROC_LL_ALLOC_INODE, 1);
+	/*
+	 * GFP_NOFS rather than a bare __GFP_IO: without __GFP_WAIT the
+	 * allocator may neither sleep nor reclaim, so the allocation would
+	 * fail needlessly under memory pressure.  GFP_NOFS still prevents
+	 * recursion into the filesystem.
+	 */
+	OBD_SLAB_ALLOC_PTR_GFP(lli, ll_inode_cachep, GFP_NOFS);
+	if (lli == NULL)
+		return NULL;
+
+	inode_init_once(&lli->lli_vfs_inode);
+	return &lli->lli_vfs_inode;
+}
+
+/*
+ * RCU callback queued by ll_destroy_inode(): map the rcu_head back to its
+ * inode and return the enclosing ll_inode_info to the slab cache.
+ */
+static void ll_inode_destroy_callback(struct rcu_head *head)
+{
+	struct ll_inode_info *ptr;
+
+	ptr = ll_i2info(container_of(head, struct inode, i_rcu));
+	OBD_SLAB_FREE_PTR(ptr, ll_inode_cachep);
+}
+
+/*
+ * super_operations::destroy_inode: defer the actual free to
+ * ll_inode_destroy_callback() through call_rcu().
+ */
+static void ll_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, ll_inode_destroy_callback);
+}
+
+/*
+ * Create the slab cache that backs struct ll_inode_info.
+ *
+ * \retval 0        cache created
+ * \retval -ENOMEM  kmem_cache_create() failed
+ */
+int ll_init_inodecache(void)
+{
+	ll_inode_cachep = kmem_cache_create("lustre_inode_cache",
+					    sizeof(struct ll_inode_info),
+					    0, SLAB_HWCACHE_ALIGN, NULL);
+
+	return ll_inode_cachep != NULL ? 0 : -ENOMEM;
+}
+
+/* Destroy the slab cache created by ll_init_inodecache(). */
+void ll_destroy_inodecache(void)
+{
+	kmem_cache_destroy(ll_inode_cachep);
+}
+
+/*
+ * exported operations
+ *
+ * Superblock operation table of the Lustre client.  Inode allocation and
+ * destruction are implemented in this file; the remaining handlers are
+ * defined elsewhere.
+ */
+struct super_operations lustre_super_operations =
+{
+	.alloc_inode   = ll_alloc_inode,
+	.destroy_inode = ll_destroy_inode,
+	.evict_inode   = ll_delete_inode,
+	.put_super     = ll_put_super,
+	.statfs	= ll_statfs,
+	.umount_begin  = ll_umount_begin,
+	.remount_fs    = ll_remount_fs,
+	.show_options  = ll_show_options,
+};
+MODULE_ALIAS_FS("lustre");
+
+void lustre_register_client_process_config(int (*cpc)(struct lustre_cfg *lcfg));
+
+int vvp_global_init(void);
+void vvp_global_fini(void);
+
+/*
+ * Module init: create the slab caches, register the llite proc directory
+ * and the client callbacks, seed the libcfs random generator, then start
+ * the capa thread and the VVP layer.
+ *
+ * Every failure unwinds what was set up before it, in reverse order, so a
+ * failed load leaves no slab caches, proc entries or registered callbacks
+ * behind.
+ *
+ * \retval 0        success
+ * \retval -ENOMEM  a slab cache could not be created
+ * \retval others   error from ll_capa_thread_start() or vvp_global_init()
+ */
+static int __init init_lustre_lite(void)
+{
+	int i, rc, seed[2];
+	struct timeval tv;
+	lnet_process_id_t lnet_id;
+
+	CLASSERT(sizeof(LUSTRE_VOLATILE_HDR) == LUSTRE_VOLATILE_HDR_LEN + 1);
+
+	/* print an address of _any_ initialized kernel symbol from this
+	 * module, to allow debugging with gdb that doesn't support data
+	 * symbols from modules.*/
+	CDEBUG(D_INFO, "Lustre client module (%p).\n",
+	       &lustre_super_operations);
+
+	rc = ll_init_inodecache();
+	if (rc)
+		return -ENOMEM;
+
+	ll_file_data_slab = kmem_cache_create("ll_file_data",
+						 sizeof(struct ll_file_data), 0,
+						 SLAB_HWCACHE_ALIGN, NULL);
+	if (ll_file_data_slab == NULL) {
+		rc = -ENOMEM;
+		goto out_inode_cache;
+	}
+
+	ll_remote_perm_cachep = kmem_cache_create("ll_remote_perm_cache",
+						  sizeof(struct ll_remote_perm),
+						      0, 0, NULL);
+	if (ll_remote_perm_cachep == NULL) {
+		rc = -ENOMEM;
+		goto out_file_data;
+	}
+
+	ll_rmtperm_hash_cachep = kmem_cache_create("ll_rmtperm_hash_cache",
+						   REMOTE_PERM_HASHSIZE *
+						   sizeof(struct list_head),
+						   0, 0, NULL);
+	if (ll_rmtperm_hash_cachep == NULL) {
+		rc = -ENOMEM;
+		goto out_remote_perm;
+	}
+
+	proc_lustre_fs_root = proc_lustre_root ?
+			      lprocfs_register("llite", proc_lustre_root, NULL, NULL) : NULL;
+
+	lustre_register_client_fill_super(ll_fill_super);
+	lustre_register_kill_super_cb(ll_kill_super);
+
+	lustre_register_client_process_config(ll_process_config);
+
+	cfs_get_random_bytes(seed, sizeof(seed));
+
+	/* Nodes with small feet have little entropy
+	 * the NID for this node gives the most entropy in the low bits */
+	for (i=0; ; i++) {
+		if (LNetGetId(i, &lnet_id) == -ENOENT) {
+			break;
+		}
+		if (LNET_NETTYP(LNET_NIDNET(lnet_id.nid)) != LOLND) {
+			seed[0] ^= LNET_NIDADDR(lnet_id.nid);
+		}
+	}
+
+	do_gettimeofday(&tv);
+	cfs_srand(tv.tv_sec ^ seed[0], tv.tv_usec ^ seed[1]);
+
+	init_timer(&ll_capa_timer);
+	ll_capa_timer.function = ll_capa_timer_callback;
+	rc = ll_capa_thread_start();
+	if (rc != 0)
+		goto out_unregister;
+
+	rc = vvp_global_init();
+	if (rc != 0)
+		goto out_capa;
+
+	return 0;
+
+out_capa:
+	/* Same teardown order as exit_lustre_lite(). */
+	del_timer(&ll_capa_timer);
+	ll_capa_thread_stop();
+out_unregister:
+	/* The callbacks point into this module and must not outlive it. */
+	lustre_register_client_process_config(NULL);
+	lustre_register_kill_super_cb(NULL);
+	lustre_register_client_fill_super(NULL);
+
+	if (proc_lustre_fs_root)
+		lprocfs_remove(&proc_lustre_fs_root);
+
+	kmem_cache_destroy(ll_rmtperm_hash_cachep);
+	ll_rmtperm_hash_cachep = NULL;
+out_remote_perm:
+	kmem_cache_destroy(ll_remote_perm_cachep);
+	ll_remote_perm_cachep = NULL;
+out_file_data:
+	kmem_cache_destroy(ll_file_data_slab);
+	ll_file_data_slab = NULL;
+out_inode_cache:
+	ll_destroy_inodecache();
+	return rc;
+}
+
+/*
+ * Module exit: undo init_lustre_lite().  Stops the VVP layer and the capa
+ * timer/thread, asserts that no client capabilities remain, unregisters the
+ * client callbacks, destroys the slab caches and removes the llite proc
+ * directory.
+ */
+static void __exit exit_lustre_lite(void)
+{
+	vvp_global_fini();
+	del_timer(&ll_capa_timer);
+	ll_capa_thread_stop();
+	/* Every client capability must have been released by now. */
+	LASSERTF(capa_count[CAPA_SITE_CLIENT] == 0,
+		 "client remaining capa count %d\n",
+		 capa_count[CAPA_SITE_CLIENT]);
+
+	/* Clear the callbacks registered at init time. */
+	lustre_register_client_fill_super(NULL);
+	lustre_register_kill_super_cb(NULL);
+
+	lustre_register_client_process_config(NULL);
+
+	ll_destroy_inodecache();
+
+	kmem_cache_destroy(ll_rmtperm_hash_cachep);
+	ll_rmtperm_hash_cachep = NULL;
+
+	kmem_cache_destroy(ll_remote_perm_cachep);
+	ll_remote_perm_cachep = NULL;
+
+	kmem_cache_destroy(ll_file_data_slab);
+	/* The proc root is only set when lprocfs_register() was called. */
+	if (proc_lustre_fs_root)
+		lprocfs_remove(&proc_lustre_fs_root);
+}
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Lustre Lite Client File System");
+MODULE_LICENSE("GPL");
+
+module_init(init_lustre_lite);
+module_exit(exit_lustre_lite);

diff --git a/drivers/staging/lustre/lustre/llite/symlink.c b/drivers/staging/lustre/lustre/llite/symlink.c
new file mode 100644
index 0000000..5260e98
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/symlink.c

@@ -0,0 +1,192 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/stat.h>
+#include <linux/version.h>
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <lustre_lite.h>
+#include "llite_internal.h"
+
+/*
+ * Fetch the target of symlink \a inode.
+ *
+ * If the target is already cached in lli_symlink_name it is returned without
+ * an RPC.  Otherwise md_getattr() is issued with OBD_MD_LINKNAME, the reply
+ * is validated (flag set, length equal to i_size + 1, NUL terminated) and the
+ * name is cached when memory allows.
+ *
+ * \param inode    symlink inode
+ * \param request  out: the getattr request, or NULL if none was sent; the
+ *		 callers in this file release it with ptlrpc_req_finished()
+ * \param symname  out: the link target; points into the cache, or into the
+ *		 reply buffer of *request when caching failed
+ *
+ * \retval 0       success
+ * \retval -EPROTO malformed reply
+ * \retval others  error from ll_prep_md_op_data() or md_getattr()
+ */
+static int ll_readlink_internal(struct inode *inode,
+				struct ptlrpc_request **request, char **symname)
+{
+	struct ll_inode_info *lli = ll_i2info(inode);
+	struct ll_sb_info *sbi = ll_i2sbi(inode);
+	int rc, symlen = i_size_read(inode) + 1;
+	struct mdt_body *body;
+	struct md_op_data *op_data;
+	ENTRY;
+
+	*request = NULL;
+
+	if (lli->lli_symlink_name) {
+		int print_limit = min_t(int, PAGE_SIZE - 128, symlen);
+
+		*symname = lli->lli_symlink_name;
+		/* If the total CDEBUG() size is larger than a page, it
+		 * will print a warning to the console, avoid this by
+		 * printing just the last part of the symlink. */
+		CDEBUG(D_INODE, "using cached symlink %s%.*s, len = %d\n",
+		       print_limit < symlen ? "..." : "", print_limit,
+		       (*symname) + symlen - print_limit, symlen);
+		RETURN(0);
+	}
+
+	op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, symlen,
+				     LUSTRE_OPC_ANY, NULL);
+	if (IS_ERR(op_data))
+		RETURN(PTR_ERR(op_data));
+
+	op_data->op_valid = OBD_MD_LINKNAME;
+	rc = md_getattr(sbi->ll_md_exp, op_data, request);
+	ll_finish_md_op_data(op_data);
+	if (rc) {
+		if (rc != -ENOENT)
+			CERROR("inode %lu: rc = %d\n", inode->i_ino, rc);
+		GOTO (failed, rc);
+	}
+
+	body = req_capsule_server_get(&(*request)->rq_pill, &RMF_MDT_BODY);
+	LASSERT(body != NULL);
+	if ((body->valid & OBD_MD_LINKNAME) == 0) {
+		CERROR("OBD_MD_LINKNAME not set on reply\n");
+		GOTO(failed, rc = -EPROTO);
+	}
+
+	LASSERT(symlen != 0);
+	if (body->eadatasize != symlen) {
+		CERROR("inode %lu: symlink length %d not expected %d\n",
+			inode->i_ino, body->eadatasize - 1, symlen - 1);
+		GOTO(failed, rc = -EPROTO);
+	}
+
+	*symname = req_capsule_server_get(&(*request)->rq_pill, &RMF_MDT_MD);
+	if (*symname == NULL ||
+	    strnlen(*symname, symlen) != symlen - 1) {
+		/* not full/NULL terminated */
+		CERROR("inode %lu: symlink not NULL terminated string "
+			"of length %d\n", inode->i_ino, symlen - 1);
+		GOTO(failed, rc = -EPROTO);
+	}
+
+	OBD_ALLOC(lli->lli_symlink_name, symlen);
+	/* do not return an error if we cannot cache the symlink locally */
+	if (lli->lli_symlink_name) {
+		memcpy(lli->lli_symlink_name, *symname, symlen);
+		*symname = lli->lli_symlink_name;
+	}
+	RETURN(0);
+
+failed:
+	/* *request, if any, is left for the caller to release. */
+	RETURN (rc);
+}
+
+/*
+ * inode_operations::readlink: obtain the link target under the inode size
+ * lock and copy it to \a buffer with vfs_readlink().
+ *
+ * \retval result of vfs_readlink() on success, or the error returned by
+ *	 ll_readlink_internal()
+ */
+static int ll_readlink(struct dentry *dentry, char *buffer, int buflen)
+{
+	struct inode *inode = dentry->d_inode;
+	struct ptlrpc_request *request;
+	char *symname;
+	int rc;
+	ENTRY;
+
+	CDEBUG(D_VFSTRACE, "VFS Op\n");
+
+	ll_inode_size_lock(inode);
+	/* ll_readlink_internal() always initialises request (NULL if unused). */
+	rc = ll_readlink_internal(inode, &request, &symname);
+	if (rc)
+		GOTO(out, rc);
+
+	rc = vfs_readlink(dentry, buffer, buflen, symname);
+ out:
+	/* symname may point into the reply, so release it only after the copy. */
+	ptlrpc_req_finished(request);
+	ll_inode_size_unlock(inode);
+	RETURN(rc);
+}
+
+/*
+ * inode_operations::follow_link: resolve the link target and hand it to the
+ * VFS through nd_set_link().
+ *
+ * \retval the getattr request backing the target string (may be NULL); it is
+ *	 passed back as the cookie of ll_put_link(), which releases it
+ */
+static void *ll_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	struct ptlrpc_request *request = NULL;
+	struct inode *inode = dentry->d_inode;
+	char *symname;
+	int rc;
+	ENTRY;
+
+	CDEBUG(D_VFSTRACE, "VFS Op\n");
+	/* Limit the recursive symlink depth to 5 instead of default
+	 * 8 links when kernel has 4k stack to prevent stack overflow.
+	 * For 8k stacks we need to limit it to 7 for local servers. */
+	if ((THREAD_SIZE < 8192 && current->link_count >= 6) ||
+	    (THREAD_SIZE == 8192 && current->link_count >= 8)) {
+		rc = -ELOOP;
+	} else {
+		ll_inode_size_lock(inode);
+		rc = ll_readlink_internal(inode, &request, &symname);
+		ll_inode_size_unlock(inode);
+	}
+
+	if (rc != 0) {
+		/* On failure the VFS receives an error pointer as the link. */
+		ptlrpc_req_finished(request);
+		request = NULL;
+		symname = ERR_PTR(rc);
+	}
+
+	nd_set_link(nd, symname);
+	/* symname may contain a pointer to the request message buffer,
+	 * we delay request releasing until ll_put_link then.
+	 */
+	RETURN(request);
+}
+
+/*
+ * inode_operations::put_link: release the request returned as cookie by
+ * ll_follow_link().
+ */
+static void ll_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
+{
+	ptlrpc_req_finished(cookie);
+}
+
+/*
+ * Inode operation table for Lustre symlinks.  The readlink/follow_link/
+ * put_link handlers are implemented in this file; the attribute, permission
+ * and xattr handlers are defined elsewhere.
+ */
+struct inode_operations ll_fast_symlink_inode_operations = {
+	.readlink	= ll_readlink,
+	.setattr	= ll_setattr,
+	.follow_link	= ll_follow_link,
+	.put_link	= ll_put_link,
+	.getattr	= ll_getattr,
+	.permission	= ll_inode_permission,
+	.setxattr	= ll_setxattr,
+	.getxattr	= ll_getxattr,
+	.listxattr	= ll_listxattr,
+	.removexattr	= ll_removexattr,
+};

diff --git a/drivers/staging/lustre/lustre/llite/vvp_dev.c b/drivers/staging/lustre/lustre/llite/vvp_dev.c
new file mode 100644
index 0000000..9254b99
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/vvp_dev.c

@@ -0,0 +1,546 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * cl_device and cl_device_type implementation for VVP layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+
+#include <obd.h>
+#include <lustre_lite.h>
+
+#include "vvp_internal.h"
+
+/*****************************************************************************
+ *
+ * Vvp device and device type functions.
+ *
+ */
+
+/*
+ * vvp_ prefix stands for "Vfs Vm Posix". It corresponds to historical
+ * "llite_" (var. "ll_") prefix.
+ */
+
+/* Slab caches for struct vvp_thread_info and struct vvp_session. */
+struct kmem_cache *vvp_thread_kmem;
+static struct kmem_cache *vvp_session_kmem;
+
+/*
+ * Cache descriptors passed to lu_kmem_init() and lu_kmem_fini(); the
+ * table ends with an entry whose ckd_cache is NULL.
+ */
+static struct lu_kmem_descr vvp_caches[] = {
+	{
+		.ckd_name  = "vvp_thread_kmem",
+		.ckd_size  = sizeof(struct vvp_thread_info),
+		.ckd_cache = &vvp_thread_kmem,
+	},
+	{
+		.ckd_name  = "vvp_session_kmem",
+		.ckd_size  = sizeof(struct vvp_session),
+		.ckd_cache = &vvp_session_kmem,
+	},
+	{
+		.ckd_cache = NULL,
+	},
+};
+
+/*
+ * lct_init method of vvp_key: allocate a struct vvp_thread_info from
+ * vvp_thread_kmem.  Returns the new object, or ERR_PTR(-ENOMEM) when
+ * the allocation fails.
+ */
+static void *vvp_key_init(const struct lu_context *ctx,
+			  struct lu_context_key *key)
+{
+	struct vvp_thread_info *thread_info;
+
+	OBD_SLAB_ALLOC_PTR_GFP(thread_info, vvp_thread_kmem, __GFP_IO);
+	if (thread_info != NULL)
+		return thread_info;
+
+	return ERR_PTR(-ENOMEM);
+}
+
+/*
+ * lct_fini method of vvp_key: give the struct vvp_thread_info in \a data
+ * back to vvp_thread_kmem.
+ */
+static void vvp_key_fini(const struct lu_context *ctx,
+			 struct lu_context_key *key, void *data)
+{
+	struct vvp_thread_info *thread_info = data;
+
+	OBD_SLAB_FREE_PTR(thread_info, vvp_thread_kmem);
+}
+
+/*
+ * lct_init method of vvp_session_key: allocate a struct vvp_session from
+ * vvp_session_kmem.  Returns the new object, or ERR_PTR(-ENOMEM) when
+ * the allocation fails.
+ */
+static void *vvp_session_key_init(const struct lu_context *ctx,
+				  struct lu_context_key *key)
+{
+	struct vvp_session *vvp_ses;
+
+	OBD_SLAB_ALLOC_PTR_GFP(vvp_ses, vvp_session_kmem, __GFP_IO);
+	if (vvp_ses != NULL)
+		return vvp_ses;
+
+	return ERR_PTR(-ENOMEM);
+}
+
+/*
+ * lct_fini method of vvp_session_key: give the struct vvp_session in
+ * \a data back to vvp_session_kmem.
+ */
+static void vvp_session_key_fini(const struct lu_context *ctx,
+				 struct lu_context_key *key, void *data)
+{
+	struct vvp_session *vvp_ses = data;
+
+	OBD_SLAB_FREE_PTR(vvp_ses, vvp_session_kmem);
+}
+
+
+/* Context key for per-thread VVP data (struct vvp_thread_info). */
+struct lu_context_key vvp_key = {
+	.lct_init = vvp_key_init,
+	.lct_fini = vvp_key_fini,
+	.lct_tags = LCT_CL_THREAD,
+};
+
+/* Context key for per-session VVP data (struct vvp_session). */
+struct lu_context_key vvp_session_key = {
+	.lct_init = vvp_session_key_init,
+	.lct_fini = vvp_session_key_fini,
+	.lct_tags = LCT_SESSION,
+};
+
+/*
+ * Generates the type constructor/destructor functions
+ * vvp_type_{init,fini,start,stop}() for the four context keys listed.
+ */
+LU_TYPE_INIT_FINI(vvp, &ccc_key, &ccc_session_key, &vvp_key, &vvp_session_key);
+
+/* lu_device methods of a VVP device. */
+static const struct lu_device_operations vvp_lu_ops = {
+	.ldo_object_alloc = vvp_object_alloc,
+};
+
+/* cl_device methods of a VVP device. */
+static const struct cl_device_operations vvp_cl_ops = {
+	.cdo_req_init = ccc_req_init,
+};
+
+/*
+ * ldto_device_alloc method: delegate to ccc_device_alloc(), supplying the
+ * VVP lu_device and cl_device operation tables.
+ */
+static struct lu_device *vvp_device_alloc(const struct lu_env *env,
+					  struct lu_device_type *t,
+					  struct lustre_cfg *cfg)
+{
+	struct lu_device *dev;
+
+	dev = ccc_device_alloc(env, t, cfg, &vvp_lu_ops, &vvp_cl_ops);
+	return dev;
+}
+
+/* Device type methods; everything except allocation comes from ccc. */
+static const struct lu_device_type_operations vvp_device_type_ops = {
+	/* type life cycle, generated by LU_TYPE_INIT_FINI(vvp, ...) */
+	.ldto_init  = vvp_type_init,
+	.ldto_fini  = vvp_type_fini,
+	.ldto_start = vvp_type_start,
+	.ldto_stop  = vvp_type_stop,
+
+	/* device life cycle */
+	.ldto_device_alloc = vvp_device_alloc,
+	.ldto_device_init  = ccc_device_init,
+	.ldto_device_fini  = ccc_device_fini,
+	.ldto_device_free  = ccc_device_free,
+};
+
+/* The VVP device type, registered through ccc_global_init(). */
+struct lu_device_type vvp_device_type = {
+	.ldt_name     = LUSTRE_VVP_NAME,
+	.ldt_tags     = LU_DEVICE_CL,
+	.ldt_ctx_tags = LCT_CL_THREAD,
+	.ldt_ops      = &vvp_device_type_ops,
+};
+
+/**
+ * Set up the VVP layer: create the slab caches described by vvp_caches,
+ * then hand vvp_device_type to ccc_global_init().  The caches are
+ * destroyed again when the second step fails.
+ *
+ * \retval 0 on success, otherwise the error from the failing step.
+ */
+int vvp_global_init(void)
+{
+	int rc;
+
+	rc = lu_kmem_init(vvp_caches);
+	if (rc != 0)
+		return rc;
+
+	rc = ccc_global_init(&vvp_device_type);
+	if (rc != 0)
+		lu_kmem_fini(vvp_caches);
+
+	return rc;
+}
+
+/**
+ * Undo vvp_global_init(): call ccc_global_fini() for vvp_device_type,
+ * then destroy the slab caches.
+ */
+void vvp_global_fini(void)
+{
+	struct lu_device_type *type = &vvp_device_type;
+
+	ccc_global_fini(type);
+	lu_kmem_fini(vvp_caches);
+}
+
+
+/*****************************************************************************
+ *
+ * mirror obd-devices into cl devices.
+ *
+ */
+
+/**
+ * Build the cl device stack for super block \a sb on top of the lu
+ * device of the data export, and record it in the ll_sb_info
+ * (ll_cl, ll_site).
+ *
+ * \retval 0   the stack was set up
+ * \retval -ve PTR_ERR() of a failed cl_env_get() or cl_type_setup()
+ */
+int cl_sb_init(struct super_block *sb)
+{
+	struct ll_sb_info *sbi;
+	struct cl_device  *cl;
+	struct lu_env     *env;
+	int rc = 0;
+	int refcheck;
+
+	/* RETURN() below needs a matching ENTRY. */
+	ENTRY;
+	sbi = ll_s2sbi(sb);
+	env = cl_env_get(&refcheck);
+	if (!IS_ERR(env)) {
+		cl = cl_type_setup(env, NULL, &vvp_device_type,
+				   sbi->ll_dt_exp->exp_obd->obd_lu_dev);
+		if (!IS_ERR(cl)) {
+			cl2ccc_dev(cl)->cdv_sb = sb;
+			sbi->ll_cl = cl;
+			sbi->ll_site = cl2lu_dev(cl)->ld_site;
+		} else {
+			/*
+			 * Report the failure instead of returning 0 with
+			 * sbi->ll_cl and sbi->ll_site left unset.
+			 */
+			rc = PTR_ERR(cl);
+		}
+		cl_env_put(env, &refcheck);
+	} else {
+		rc = PTR_ERR(env);
+	}
+	RETURN(rc);
+}
+
+/**
+ * Tear down the cl device stack recorded in the ll_sb_info of \a sb, if
+ * there is one, and clear ll_cl and ll_site.
+ *
+ * \retval 0   the environment was obtained (stack finalized if present)
+ * \retval -ve PTR_ERR() of the failed cl_env_get(); the stack is left
+ *             in place in that case
+ */
+int cl_sb_fini(struct super_block *sb)
+{
+	struct ll_sb_info *sbi;
+	struct cl_device  *stack;
+	struct lu_env     *env;
+	int		   refcheck;
+	int		   rc;
+
+	ENTRY;
+	sbi = ll_s2sbi(sb);
+	env = cl_env_get(&refcheck);
+	if (IS_ERR(env)) {
+		CERROR("Cannot cleanup cl-stack due to memory shortage.\n");
+		rc = PTR_ERR(env);
+	} else {
+		stack = sbi->ll_cl;
+		if (stack != NULL) {
+			cl_stack_fini(env, stack);
+			sbi->ll_cl = NULL;
+			sbi->ll_site = NULL;
+		}
+		cl_env_put(env, &refcheck);
+		rc = 0;
+	}
+
+	/*
+	 * If the mount failed (sbi->ll_cl == NULL) and there are no other
+	 * mounts, stop device types manually (this usually happens
+	 * automatically when the last device is destroyed).
+	 */
+	lu_types_stop();
+	RETURN(rc);
+}
+
+/****************************************************************************
+ *
+ * /proc/fs/lustre/llite/$MNT/dump_page_cache
+ *
+ ****************************************************************************/
+
+/*
+ * To represent the contents of a page cache as a byte stream, the
+ * following information is encoded in a 64-bit offset:
+ *
+ *       - file hash bucket in lu_site::ls_obj_hash     28 bits
+ *
+ *       - how far the file is from the bucket head      4 bits
+ *
+ *       - page index                                    32 bits
+ *
+ * The first two items identify a file in the cache uniquely.
+ */
+
+/* Bit positions of the depth and bucket fields inside a packed offset. */
+#define PGC_DEPTH_SHIFT (32)
+#define PGC_OBJ_SHIFT (32 + 4)
+
+/* Unpacked form of a dump_page_cache offset, plus lookup scratch state. */
+struct vvp_pgcache_id {
+	unsigned int		 vpi_bucket;	/* hash bucket number */
+	unsigned int		 vpi_depth;	/* position inside the bucket */
+	uint32_t		 vpi_index;	/* page index */
+
+	/* countdown used by vvp_pgcache_obj_get() while walking a bucket */
+	unsigned int		 vpi_curdep;
+	/* object header found by the walk, NULL if none */
+	struct lu_object_header *vpi_obj;
+};
+
+/*
+ * Split offset \a pos into page index (bits 0-31), depth (bits 32-35)
+ * and bucket (bits 36 and up), storing them in \a id.
+ */
+static void vvp_pgcache_id_unpack(loff_t pos, struct vvp_pgcache_id *id)
+{
+	unsigned long long raw = pos;
+
+	CLASSERT(sizeof(pos) == sizeof(__u64));
+
+	id->vpi_bucket = raw >> PGC_OBJ_SHIFT;
+	id->vpi_depth  = (raw >> PGC_DEPTH_SHIFT) & 0xf;
+	id->vpi_index  = raw & 0xffffffff;
+}
+
+/*
+ * Inverse of vvp_pgcache_id_unpack(): combine bucket, depth and page
+ * index from \a id into a single offset.
+ */
+static loff_t vvp_pgcache_id_pack(struct vvp_pgcache_id *id)
+{
+	__u64 packed;
+
+	packed  = (__u64)id->vpi_bucket << PGC_OBJ_SHIFT;
+	packed |= (__u64)id->vpi_depth  << PGC_DEPTH_SHIFT;
+	packed |= (__u64)id->vpi_index;
+
+	return packed;
+}
+
+/*
+ * Callback for cfs_hash_hlist_for_each().  Skips entries while
+ * id->vpi_curdep counts down to zero, then stops the walk at the next
+ * entry: if that object is not dying, a reference is taken on it and it
+ * is stored in id->vpi_obj.
+ *
+ * Returns 0 to continue the walk, 1 to stop it.
+ */
+static int vvp_pgcache_obj_get(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+			       struct hlist_node *hnode, void *data)
+{
+	struct vvp_pgcache_id   *id  = data;
+	struct lu_object_header *hdr = cfs_hash_object(hs, hnode);
+
+	if (id->vpi_curdep-- > 0)
+		return 0; /* not deep enough yet */
+
+	if (!lu_object_is_dying(hdr)) {
+		cfs_hash_get(hs, hnode);
+		id->vpi_obj = hdr;
+	}
+	return 1;
+}
+
+/*
+ * Look up the object at position id->vpi_depth of hash bucket
+ * id->vpi_bucket and return its cl_object slice for \a dev.
+ *
+ * On success the object carries a "dump" lu_object reference that the
+ * caller has to drop.  NULL is returned when no usable object was found;
+ * if the walk left id->vpi_curdep non-zero, id->vpi_depth is set to 0xf.
+ *
+ * NOTE(review): vpi_curdep is unsigned and vvp_pgcache_obj_get()
+ * post-decrements it even when it is already 0, so it is also non-zero
+ * after the walk stopped on a dying object - confirm that forcing the
+ * depth to 0xf is intended in that case.
+ */
+static struct cl_object *vvp_pgcache_obj(const struct lu_env *env,
+					 struct lu_device *dev,
+					 struct vvp_pgcache_id *id)
+{
+	struct lu_object *slice;
+
+	LASSERT(lu_device_is_cl(dev));
+
+	id->vpi_depth &= 0xf;
+	id->vpi_obj    = NULL;
+	id->vpi_curdep = id->vpi_depth;
+
+	cfs_hash_hlist_for_each(dev->ld_site->ls_obj_hash, id->vpi_bucket,
+				vvp_pgcache_obj_get, id);
+
+	if (id->vpi_obj == NULL) {
+		if (id->vpi_curdep > 0)
+			id->vpi_depth = 0xf;
+		return NULL;
+	}
+
+	slice = lu_object_locate(id->vpi_obj, dev->ld_type);
+	if (slice == NULL) {
+		/* No slice for this device: drop the walk's reference. */
+		lu_object_put(env, lu_object_top(id->vpi_obj));
+		return NULL;
+	}
+
+	lu_object_ref_add(slice, "dump", current);
+	return lu2cl(slice);
+}
+
+/*
+ * Starting at offset \a pos, find the next cached page and return the
+ * packed offset that identifies it.  Objects are visited bucket by
+ * bucket and, inside a bucket, by increasing depth.
+ *
+ * Returns ~0ULL once the bucket number runs past the hash table.
+ */
+static loff_t vvp_pgcache_find(const struct lu_env *env,
+			       struct lu_device *dev, loff_t pos)
+{
+	struct lu_site       *site = dev->ld_site;
+	struct vvp_pgcache_id id;
+
+	vvp_pgcache_id_unpack(pos, &id);
+
+	for (;;) {
+		struct cl_object *obj;
+
+		if (id.vpi_bucket >= CFS_HASH_NHLIST(site->ls_obj_hash))
+			return ~0ULL;
+
+		obj = vvp_pgcache_obj(env, dev, &id);
+		if (obj != NULL) {
+			struct cl_object_header *head = cl_object_header(obj);
+			struct cl_page		*found;
+			int			 hits;
+
+			/* Got an object: look for a page at or after index. */
+			spin_lock(&head->coh_page_guard);
+			hits = radix_tree_gang_lookup(&head->coh_tree,
+						      (void **)&found,
+						      id.vpi_index, 1);
+			if (hits > 0) {
+				id.vpi_index = found->cp_index;
+				/* Indices beyond 32 bits cannot be packed. */
+				hits = !(found->cp_index > 0xffffffff);
+			}
+			spin_unlock(&head->coh_page_guard);
+
+			lu_object_ref_del(&obj->co_lu, "dump", current);
+			cl_object_put(env, obj);
+			if (hits > 0)
+				return vvp_pgcache_id_pack(&id);
+		}
+
+		/* Move on to the next object, wrapping into the next bucket. */
+		id.vpi_depth = (id.vpi_depth + 1) & 0xf;
+		if (id.vpi_depth == 0 && ++id.vpi_bucket == 0)
+			return ~0ULL;
+		id.vpi_index = 0;
+	}
+}
+
+/*
+ * Print the name of page flag PG_<flag> to \a seq if it is set on
+ * \a page, preceded by "|" when an earlier flag was already printed;
+ * \a has_flags is set to 1 after printing.
+ */
+#define seq_page_flag(seq, page, flag, has_flags) do {			\
+	if (test_bit(PG_##flag, &(page)->flags)) {			\
+		seq_printf(seq, "%s"#flag, (has_flags) ? "|" : "");	\
+		(has_flags) = 1;					\
+	}								\
+} while (0)
+
+/*
+ * Print one line describing \a page to \a seq: the ccc and cl page
+ * pointers, queueing state, the VM page with its inode number,
+ * generation, index and reference count, and the set page flags.
+ */
+static void vvp_pgcache_page_show(const struct lu_env *env,
+				  struct seq_file *seq, struct cl_page *page)
+{
+	struct ccc_page *cpg    = cl2ccc_page(cl_page_at(page,
+							  &vvp_device_type));
+	struct page     *vmpage = cpg->cpg_page;
+	struct inode    *host   = vmpage->mapping->host;
+	int		 has_flags = 0;
+
+	seq_printf(seq," %5i | %p %p %s %s %s %s | %p %lu/%u(%p) %lu %u [",
+		   0 /* gen */,
+		   cpg, page,
+		   "none",
+		   cpg->cpg_write_queued ? "wq" : "- ",
+		   cpg->cpg_defer_uptodate ? "du" : "- ",
+		   PageWriteback(vmpage) ? "wb" : "-",
+		   vmpage, host->i_ino,
+		   host->i_generation,
+		   host, vmpage->index,
+		   page_count(vmpage));
+
+	seq_page_flag(seq, vmpage, locked, has_flags);
+	seq_page_flag(seq, vmpage, error, has_flags);
+	seq_page_flag(seq, vmpage, referenced, has_flags);
+	seq_page_flag(seq, vmpage, uptodate, has_flags);
+	seq_page_flag(seq, vmpage, dirty, has_flags);
+	seq_page_flag(seq, vmpage, writeback, has_flags);
+	seq_printf(seq, "%s]\n", has_flags ? "" : "-");
+}
+
+/*
+ * seq_file ->show(): \a v points to the packed offset of the page to
+ * print.  Prints the page, or "missing" when the object or the page can
+ * no longer be found.
+ *
+ * Returns 0, or the error from cl_env_get().
+ */
+static int vvp_pgcache_show(struct seq_file *f, void *v)
+{
+	struct ll_sb_info       *sbi = f->private;
+	struct vvp_pgcache_id    id;
+	struct cl_object_header *hdr;
+	struct cl_object	*clob;
+	struct cl_page		*page;
+	struct lu_env		*env;
+	loff_t			 pos;
+	int			 refcheck;
+
+	env = cl_env_get(&refcheck);
+	if (IS_ERR(env))
+		return PTR_ERR(env);
+
+	pos = *(loff_t *) v;
+	vvp_pgcache_id_unpack(pos, &id);
+	clob = vvp_pgcache_obj(env, &sbi->ll_cl->cd_lu_dev, &id);
+	if (clob == NULL) {
+		seq_printf(f, "%llx missing\n", pos);
+	} else {
+		hdr = cl_object_header(clob);
+
+		spin_lock(&hdr->coh_page_guard);
+		page = cl_page_lookup(hdr, id.vpi_index);
+		spin_unlock(&hdr->coh_page_guard);
+
+		seq_printf(f, "%8x@"DFID": ",
+			   id.vpi_index, PFID(&hdr->coh_lu.loh_fid));
+		if (page == NULL) {
+			seq_puts(f, "missing\n");
+		} else {
+			vvp_pgcache_page_show(env, f, page);
+			cl_page_put(env, page);
+		}
+		/* Drop the reference vvp_pgcache_obj() took. */
+		lu_object_ref_del(&clob->co_lu, "dump", current);
+		cl_object_put(env, clob);
+	}
+	cl_env_put(env, &refcheck);
+	return 0;
+}
+
+/*
+ * seq_file ->start(): advance *pos to the first cached page at or after
+ * it.
+ *
+ * Returns \a pos, NULL when there are no more pages, or
+ * ERR_PTR(-EFBIG) when the object hash uses more bits than fit above
+ * PGC_OBJ_SHIFT in a 64-bit offset.  If no environment can be obtained,
+ * \a pos is returned unchanged.
+ */
+static void *vvp_pgcache_start(struct seq_file *f, loff_t *pos)
+{
+	struct ll_sb_info *sbi = f->private;
+	struct lu_env     *env;
+	int		   refcheck;
+
+	env = cl_env_get(&refcheck);
+	if (IS_ERR(env))
+		return pos;
+
+	if (sbi->ll_site->ls_obj_hash->hs_cur_bits > 64 - PGC_OBJ_SHIFT) {
+		pos = ERR_PTR(-EFBIG);
+	} else {
+		*pos = vvp_pgcache_find(env, &sbi->ll_cl->cd_lu_dev, *pos);
+		if (*pos == ~0ULL)
+			pos = NULL;
+	}
+	cl_env_put(env, &refcheck);
+	return pos;
+}
+
+/*
+ * seq_file ->next(): advance *pos to the first cached page after the
+ * current one.
+ *
+ * Returns \a pos, or NULL when there are no more pages.  If no
+ * environment can be obtained, \a pos is returned unchanged.
+ */
+static void *vvp_pgcache_next(struct seq_file *f, void *v, loff_t *pos)
+{
+	struct ll_sb_info *sbi;
+	struct lu_env     *env;
+	int		   refcheck;
+
+	env = cl_env_get(&refcheck);
+	if (IS_ERR(env))
+		return pos;
+
+	sbi = f->private;
+	*pos = vvp_pgcache_find(env, &sbi->ll_cl->cd_lu_dev, *pos + 1);
+	if (*pos == ~0ULL)
+		pos = NULL;
+	cl_env_put(env, &refcheck);
+	return pos;
+}
+
+/* seq_file ->stop(): start() and next() keep no state to release. */
+static void vvp_pgcache_stop(struct seq_file *f, void *v)
+{
+}
+
+/* seq_file iterator over the cached pages of one mount. */
+static struct seq_operations vvp_pgcache_ops = {
+	.start = vvp_pgcache_start,
+	.stop  = vvp_pgcache_stop,
+	.next  = vvp_pgcache_next,
+	.show  = vvp_pgcache_show,
+};
+
+/*
+ * ->open() of the dump_page_cache file: start a seq_file session over
+ * vvp_pgcache_ops and attach the ll_sb_info taken from the proc entry as
+ * its private data.
+ */
+static int vvp_dump_pgcache_seq_open(struct inode *inode, struct file *filp)
+{
+	struct ll_sb_info *sbi = PDE_DATA(inode);
+	struct seq_file   *seq;
+	int		   rc;
+
+	rc = seq_open(filp, &vvp_pgcache_ops);
+	if (rc != 0)
+		return rc;
+
+	seq = filp->private_data;
+	seq->private = sbi;
+	return rc;
+}
+
+/* File operations of the dump_page_cache proc file. */
+struct file_operations vvp_dump_pgcache_file_ops = {
+	.owner   = THIS_MODULE,
+	.open    = vvp_dump_pgcache_seq_open,
+	.release = seq_release,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+};

diff --git a/drivers/staging/lustre/lustre/llite/vvp_internal.h b/drivers/staging/lustre/lustre/llite/vvp_internal.h
new file mode 100644
index 0000000..c82bf17
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/vvp_internal.h

@@ -0,0 +1,62 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Internal definitions for VVP layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#ifndef VVP_INTERNAL_H
+#define VVP_INTERNAL_H
+
+
+#include <cl_object.h>
+#include "llite_internal.h"
+
+/* Initializers for the VVP slices of io, lock and page. */
+int vvp_io_init(const struct lu_env *env, struct cl_object *obj,
+		struct cl_io *io);
+int vvp_lock_init(const struct lu_env *env, struct cl_object *obj,
+		  struct cl_lock *lock, const struct cl_io *io);
+int vvp_page_init(const struct lu_env *env, struct cl_object *obj,
+		  struct cl_page *page, struct page *vmpage);
+
+/* Object allocation method used in the VVP lu_device operations. */
+struct lu_object *vvp_object_alloc(const struct lu_env *env,
+				   const struct lu_object_header *hdr,
+				   struct lu_device *dev);
+
+struct ccc_object *cl_inode2ccc(struct inode *inode);
+
+/* Slab cache for struct vvp_thread_info. */
+extern struct kmem_cache *vvp_thread_kmem;
+
+#endif /* VVP_INTERNAL_H */

diff --git a/drivers/staging/lustre/lustre/llite/vvp_io.c b/drivers/staging/lustre/lustre/llite/vvp_io.c
new file mode 100644
index 0000000..eb964ac
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/vvp_io.c

@@ -0,0 +1,1186 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_io for VVP layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ *   Author: Jinshan Xiong <jinshan.xiong@whamcloud.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+
+#include <obd.h>
+#include <lustre_lite.h>
+
+#include "vvp_internal.h"
+
+static struct vvp_io *cl2vvp_io(const struct lu_env *env,
+				const struct cl_io_slice *slice);
+
+/**
+ * Tell whether the io of the current environment is a normal io, i.e.
+ * its subtype is IO_NORMAL (as opposed to sendfile() /
+ * splice_{read|write}).  \a io must be a read or a write.
+ */
+int cl_is_normalio(const struct lu_env *env, const struct cl_io *io)
+{
+	struct vvp_io *vio = vvp_env_io(env);
+	int normal;
+
+	LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE);
+
+	normal = (vio->cui_io_subtype == IO_NORMAL);
+	return normal;
+}
+
+/**
+ * Layout-swap guard.  The file's layout may have changed since the io
+ * was set up; to avoid populating pages to a wrong stripe, reads and
+ * writes compare the inode's layout generation with the one recorded in
+ * the io.  This works because swapping layout has to acquire a group
+ * lock.
+ *
+ * On a mismatch the io is flagged for restart and stopped, which gives
+ * the application a short read/write, and false is returned.  All other
+ * io types (e.g. fault, which already has its page) return true.
+ */
+static bool can_populate_pages(const struct lu_env *env, struct cl_io *io,
+				struct inode *inode)
+{
+	struct ll_inode_info	*lli = ll_i2info(inode);
+	struct ccc_io		*cio = ccc_env_io(env);
+
+	if (io->ci_type != CIT_READ && io->ci_type != CIT_WRITE)
+		return true;
+
+	/* No lock is needed to read lli_layout_gen: we hold an extent lock
+	 * and a layout swap has to hold the GROUP lock. */
+	if (lli->lli_layout_gen == cio->cui_layout_gen)
+		return true;
+
+	io->ci_need_restart = 1;
+	io->ci_continue = 0;
+	return false;
+}
+
+/*****************************************************************************
+ *
+ * io operations.
+ *
+ */
+
+/*
+ * Iteration init for a fault io: record the inode's current mtime
+ * (seconds) in the fault state of the vvp io.  Always returns 0.
+ */
+static int vvp_io_fault_iter_init(const struct lu_env *env,
+				  const struct cl_io_slice *ios)
+{
+	struct vvp_io *vio   = cl2vvp_io(env, ios);
+	struct inode  *inode = ccc_object_inode(ios->cis_obj);
+
+	/* The io must belong to the file it was started on. */
+	LASSERT(inode ==
+		cl2ccc_io(env, ios)->cui_fd->fd_file->f_dentry->d_inode);
+
+	vio->u.fault.ft_mtime = LTIME_S(inode->i_mtime);
+	return 0;
+}
+
+/*
+ * Common io finalizer.  Unless the io ignores the layout, or does not
+ * ask for verification, refresh the layout and request a restart of the
+ * io when the generation differs from the one the io recorded.
+ */
+static void vvp_io_fini(const struct lu_env *env, const struct cl_io_slice *ios)
+{
+	struct cl_io     *io  = ios->cis_io;
+	struct cl_object *obj = io->ci_obj;
+	struct ccc_io    *cio = cl2ccc_io(env, ios);
+	__u32		  gen = 0;
+
+	CLOBINVRNT(env, obj, ccc_object_invariant(obj));
+
+	CDEBUG(D_VFSTRACE, "ignore/verify layout %d/%d, layout version %d.\n",
+		io->ci_ignore_layout, io->ci_verify_layout, cio->cui_layout_gen);
+
+	if (io->ci_ignore_layout || !io->ci_verify_layout)
+		return;
+
+	/* check layout version */
+	ll_layout_refresh(ccc_object_inode(obj), &gen);
+	io->ci_need_restart = cio->cui_layout_gen != gen;
+	if (io->ci_need_restart)
+		CDEBUG(D_VFSTRACE, "layout changed from %d to %d.\n",
+			cio->cui_layout_gen, gen);
+}
+
+/*
+ * Finalizer for a fault io: release the page held in ft_page, if any,
+ * then run the common vvp_io_fini().
+ */
+static void vvp_io_fault_fini(const struct lu_env *env,
+			      const struct cl_io_slice *ios)
+{
+	struct cl_io   *io      = ios->cis_io;
+	struct cl_page *ft_page = io->u.ci_fault.ft_page;
+
+	CLOBINVRNT(env, io->ci_obj, ccc_object_invariant(io->ci_obj));
+
+	if (ft_page != NULL) {
+		lu_ref_del(&ft_page->cp_reference, "fault", io);
+		cl_page_put(env, ft_page);
+		io->u.ci_fault.ft_page = NULL;
+	}
+
+	vvp_io_fini(env, ios);
+}
+
+/*
+ * Lock mode needed for \a vma.  A write (PW) lock is only wanted when the
+ * mmap() can generate writes back to the file, which only happens in
+ * shared writable vmas; everything else gets a read lock.
+ */
+enum cl_lock_mode vvp_mode_from_vma(struct vm_area_struct *vma)
+{
+	const unsigned long shared_write = VM_SHARED | VM_WRITE;
+
+	if ((vma->vm_flags & shared_write) == shared_write)
+		return CLM_WRITE;
+
+	return CLM_READ;
+}
+
+/*
+ * For a normal read or write whose user buffer lies in memory mapped from
+ * Lustre files, add locks on the mapped extents to the io's lock set.
+ *
+ * Each iovec segment is walked vma by vma (as selected by our_vma())
+ * under mm->mmap_sem held for read; one lock descriptor is queued with
+ * cl_io_lock_alloc_add() per vma.
+ *
+ * Returns 0 on success (also when there is nothing to do), or the
+ * negative error returned by cl_io_lock_alloc_add().
+ */
+static int vvp_mmap_locks(const struct lu_env *env,
+			  struct ccc_io *vio, struct cl_io *io)
+{
+	struct ccc_thread_info *cti = ccc_env_info(env);
+	struct mm_struct       *mm = current->mm;
+	struct vm_area_struct  *vma;
+	struct cl_lock_descr   *descr = &cti->cti_descr;
+	ldlm_policy_data_t      policy;
+	unsigned long	   addr;
+	unsigned long	   seg;
+	ssize_t		 count;
+	int		     result = 0;
+	ENTRY;
+
+	LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE);
+
+	if (!cl_is_normalio(env, io))
+		RETURN(0);
+
+	if (vio->cui_iov == NULL) /* nfs or loop back device write */
+		RETURN(0);
+
+	/* No MM (e.g. NFS)? No vmas too. */
+	if (mm == NULL)
+		RETURN(0);
+
+	for (seg = 0; seg < vio->cui_nrsegs; seg++) {
+		const struct iovec *iv = &vio->cui_iov[seg];
+
+		addr = (unsigned long)iv->iov_base;
+		count = iv->iov_len;
+		if (count == 0)
+			continue;
+
+		/* Extend the range down to the start of its first page. */
+		count += addr & (~CFS_PAGE_MASK);
+		addr &= CFS_PAGE_MASK;
+
+		down_read(&mm->mmap_sem);
+		while ((vma = our_vma(mm, addr, count)) != NULL) {
+			struct inode *inode = vma->vm_file->f_dentry->d_inode;
+			int flags = CEF_MUST;
+
+			if (ll_file_nolock(vma->vm_file)) {
+				/*
+				 * For no lock case, a lockless lock will be
+				 * generated.
+				 */
+				flags = CEF_NEVER;
+			}
+
+			/*
+			 * XXX: Required lock mode can be weakened: CIT_WRITE
+			 * io only ever reads user level buffer, and CIT_READ
+			 * only writes on it.
+			 */
+			policy_from_vma(&policy, vma, addr, count);
+			descr->cld_mode = vvp_mode_from_vma(vma);
+			descr->cld_obj = ll_i2info(inode)->lli_clob;
+			descr->cld_start = cl_index(descr->cld_obj,
+						    policy.l_extent.start);
+			descr->cld_end = cl_index(descr->cld_obj,
+						  policy.l_extent.end);
+			descr->cld_enq_flags = flags;
+			result = cl_io_lock_alloc_add(env, io, descr);
+
+			CDEBUG(D_VFSTRACE, "lock: %d: [%lu, %lu]\n",
+			       descr->cld_mode, descr->cld_start,
+			       descr->cld_end);
+
+			/*
+			 * Leave the loop instead of returning here, so that
+			 * mmap_sem is released before the error is reported.
+			 */
+			if (result < 0)
+				break;
+
+			if (vma->vm_end - addr >= count)
+				break;
+
+			count -= vma->vm_end - addr;
+			addr = vma->vm_end;
+		}
+		up_read(&mm->mmap_sem);
+
+		if (result < 0)
+			RETURN(result);
+	}
+	RETURN(0);
+}
+
+/*
+ * Queue the locks for a read or write of [start, end] in mode \a mode:
+ * first the locks for any mmapped user buffer (vvp_mmap_locks()), then
+ * the extent lock itself.  CEF_NONBLOCK is requested for non-blocking
+ * io.
+ */
+static int vvp_io_rw_lock(const struct lu_env *env, struct cl_io *io,
+			  enum cl_lock_mode mode, loff_t start, loff_t end)
+{
+	struct ccc_io *cio = ccc_env_io(env);
+	int enqflags = 0;
+	int rc;
+
+	LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE);
+	ENTRY;
+
+	ccc_io_update_iov(env, cio, io);
+
+	if (io->u.ci_rw.crw_nonblock)
+		enqflags |= CEF_NONBLOCK;
+
+	rc = vvp_mmap_locks(env, cio, io);
+	if (rc != 0)
+		RETURN(rc);
+
+	rc = ccc_io_one_lock(env, io, enqflags, mode, start, end);
+	RETURN(rc);
+}
+
+/*
+ * Lock method for read io: take a CLM_READ lock over the bytes being
+ * read.  Files without striping data (lli_has_smd unset) need no lock.
+ */
+static int vvp_io_read_lock(const struct lu_env *env,
+			    const struct cl_io_slice *ios)
+{
+	struct cl_io	     *io  = ios->cis_io;
+	struct ll_inode_info *lli = ll_i2info(ccc_object_inode(io->ci_obj));
+	int rc = 0;
+
+	ENTRY;
+	/* XXX: Layer violation, we shouldn't see lsm at llite level. */
+	if (lli->lli_has_smd)
+		rc = vvp_io_rw_lock(env, io, CLM_READ,
+				    io->u.ci_rd.rd.crw_pos,
+				    io->u.ci_rd.rd.crw_pos +
+				    io->u.ci_rd.rd.crw_count - 1);
+	RETURN(rc);
+}
+
+/*
+ * Lock method for fault io: lock the single faulting page index, in the
+ * mode derived from the faulting vma.
+ */
+static int vvp_io_fault_lock(const struct lu_env *env,
+			     const struct cl_io_slice *ios)
+{
+	struct cl_io     *io   = ios->cis_io;
+	struct vvp_io    *vio  = cl2vvp_io(env, ios);
+	enum cl_lock_mode mode = vvp_mode_from_vma(vio->u.fault.ft_vma);
+
+	/*
+	 * XXX LDLM_FL_CBPENDING
+	 */
+	return ccc_io_one_lock_index(env, io, 0, mode,
+				     io->u.ci_fault.ft_index,
+				     io->u.ci_fault.ft_index);
+}
+
+/*
+ * Lock method for write io: take a CLM_WRITE lock over the bytes being
+ * written, or over [0, OBD_OBJECT_EOF] for an append.
+ */
+static int vvp_io_write_lock(const struct lu_env *env,
+			     const struct cl_io_slice *ios)
+{
+	struct cl_io *io = ios->cis_io;
+	loff_t first = 0;
+	loff_t last  = OBD_OBJECT_EOF;
+
+	if (!io->u.ci_wr.wr_append) {
+		first = io->u.ci_wr.wr.crw_pos;
+		last  = first + io->u.ci_wr.wr.crw_count - 1;
+	}
+	return vvp_io_rw_lock(env, io, CLM_WRITE, first, last);
+}
+
+/* Iteration init for setattr io: nothing to prepare, always succeeds. */
+static int vvp_io_setattr_iter_init(const struct lu_env *env,
+				    const struct cl_io_slice *ios)
+{
+	const int rc = 0;
+
+	return rc;
+}
+
+/**
+ * Implementation of cl_io_operations::cio_lock() method for CIT_SETATTR io.
+ *
+ * Handles "lockless io" mode when extent locking is done by server.
+ *
+ * A truncate locks [new size, EOF] for write and asks for cached data to
+ * be discarded when truncating to zero.  A time update locks [0, EOF],
+ * unless mtime or atime is not older than ctime, in which case no lock
+ * is taken.
+ */
+static int vvp_io_setattr_lock(const struct lu_env *env,
+			       const struct cl_io_slice *ios)
+{
+	struct ccc_io *cio = ccc_env_io(env);
+	struct cl_io  *io  = ios->cis_io;
+	__u64 lock_start = 0;
+	__u32 flags = 0;
+
+	if (cl_io_is_trunc(io)) {
+		lock_start = io->u.ci_setattr.sa_attr.lvb_size;
+		if (lock_start == 0)
+			flags = CEF_DISCARD_DATA;
+	} else if ((io->u.ci_setattr.sa_attr.lvb_mtime >=
+		    io->u.ci_setattr.sa_attr.lvb_ctime) ||
+		   (io->u.ci_setattr.sa_attr.lvb_atime >=
+		    io->u.ci_setattr.sa_attr.lvb_ctime)) {
+		return 0;
+	}
+
+	cio->u.setattr.cui_local_lock = SETATTR_EXTENT_LOCK;
+	return ccc_io_one_lock(env, io, flags, CLM_WRITE,
+			       lock_start, OBD_OBJECT_EOF);
+}
+
+/*
+ * Set the in-memory size of \a inode to \a size and truncate its page
+ * cache accordingly, under ll_inode_size_lock.
+ *
+ * \a size is a loff_t, the type taken by inode_newsize_ok() and
+ * truncate_setsize(); a size_t would cut sizes of 4GiB and above short
+ * on 32-bit kernels.
+ *
+ * Returns the result of inode_newsize_ok(); the size is left untouched
+ * when that is negative.
+ */
+static int vvp_do_vmtruncate(struct inode *inode, loff_t size)
+{
+	int     result;
+
+	/*
+	 * Only ll_inode_size_lock is taken at this level.
+	 */
+	ll_inode_size_lock(inode);
+	result = inode_newsize_ok(inode, size);
+	if (result >= 0)
+		truncate_setsize(inode, size);
+	ll_inode_size_unlock(inode);
+	return result;
+}
+
+/*
+ * Start of a truncate: wait for direct io in flight on \a inode.  The
+ * page cache itself is truncated later, in vvp_io_setattr_end().
+ * Always returns 0.
+ */
+static int vvp_io_setattr_trunc(const struct lu_env *env,
+				const struct cl_io_slice *ios,
+				struct inode *inode, loff_t size)
+{
+	const int rc = 0;
+
+	inode_dio_wait(inode);
+	return rc;
+}
+
+/*
+ * Start of a time update: copy ctime, and atime/mtime when the matching
+ * ATTR_*_SET bit is present in sa_valid, into the object's attributes
+ * under the object attribute lock.
+ */
+static int vvp_io_setattr_time(const struct lu_env *env,
+			       const struct cl_io_slice *ios)
+{
+	struct cl_io       *io    = ios->cis_io;
+	struct cl_object   *obj   = io->ci_obj;
+	struct cl_attr     *attr  = ccc_env_thread_attr(env);
+	unsigned mask = CAT_CTIME;
+	int rc;
+
+	cl_object_attr_lock(obj);
+
+	attr->cat_ctime = io->u.ci_setattr.sa_attr.lvb_ctime;
+	if (io->u.ci_setattr.sa_valid & ATTR_ATIME_SET) {
+		mask |= CAT_ATIME;
+		attr->cat_atime = io->u.ci_setattr.sa_attr.lvb_atime;
+	}
+	if (io->u.ci_setattr.sa_valid & ATTR_MTIME_SET) {
+		mask |= CAT_MTIME;
+		attr->cat_mtime = io->u.ci_setattr.sa_attr.lvb_mtime;
+	}
+	rc = cl_object_attr_set(env, obj, attr, mask);
+
+	cl_object_attr_unlock(obj);
+
+	return rc;
+}
+
+/*
+ * Start method for setattr io.  Takes the inode's i_mutex, which stays
+ * held on return and is released by vvp_io_setattr_end(), then performs
+ * the truncate or time-update specific part.
+ */
+static int vvp_io_setattr_start(const struct lu_env *env,
+				const struct cl_io_slice *ios)
+{
+	struct cl_io	*io    = ios->cis_io;
+	struct inode	*inode = ccc_object_inode(io->ci_obj);
+
+	mutex_lock(&inode->i_mutex);
+
+	if (!cl_io_is_trunc(io))
+		return vvp_io_setattr_time(env, ios);
+
+	return vvp_io_setattr_trunc(env, ios, inode,
+				    io->u.ci_setattr.sa_attr.lvb_size);
+}
+
+/*
+ * End method for setattr io: for a truncate, cut the page cache down to
+ * the new size; then release the i_mutex taken in
+ * vvp_io_setattr_start().
+ */
+static void vvp_io_setattr_end(const struct lu_env *env,
+			       const struct cl_io_slice *ios)
+{
+	struct cl_io *io    = ios->cis_io;
+	struct inode *inode = ccc_object_inode(io->ci_obj);
+	int truncating = cl_io_is_trunc(io);
+
+	if (truncating) {
+		/* Truncate in memory pages - they must be clean pages
+		 * because osc has already notified to destroy osc_extents. */
+		vvp_do_vmtruncate(inode, io->u.ci_setattr.sa_attr.lvb_size);
+		inode_dio_write_done(inode);
+	}
+
+	mutex_unlock(&inode->i_mutex);
+}
+
+/* Finalizer for setattr io: only the common vvp_io_fini() work. */
+static void vvp_io_setattr_fini(const struct lu_env *env,
+				const struct cl_io_slice *ios)
+{
+	vvp_io_fini(env, ios);
+}
+
+/*
+ * Read through generic_file_aio_read(), using the iocb and iovec stored
+ * in \a vio, at offset *ppos.
+ */
+static ssize_t lustre_generic_file_read(struct file *file,
+					struct ccc_io *vio, loff_t *ppos)
+{
+	ssize_t nread;
+
+	nread = generic_file_aio_read(vio->cui_iocb, vio->cui_iov,
+				      vio->cui_nrsegs, *ppos);
+	return nread;
+}
+
+/*
+ * Write through generic_file_aio_write(), using the iocb and iovec
+ * stored in \a vio, at offset *ppos.
+ */
+static ssize_t lustre_generic_file_write(struct file *file,
+					struct ccc_io *vio, loff_t *ppos)
+{
+	ssize_t nwritten;
+
+	nwritten = generic_file_aio_write(vio->cui_iocb, vio->cui_iov,
+					  vio->cui_nrsegs, *ppos);
+	return nwritten;
+}
+
+/*
+ * Start method for read io.
+ *
+ * Returns 0 without reading when the layout changed under the io.
+ * Otherwise checks the file size (ccc_prep_size()), sets up the Lustre
+ * read-ahead window once per syscall and performs the read through the
+ * generic or splice path, depending on the io subtype.
+ *
+ * A non-negative byte count is added to io->ci_nob and turned into 0; a
+ * short read stops the io.  Negative errors are returned as they are.
+ */
+static int vvp_io_read_start(const struct lu_env *env,
+			     const struct cl_io_slice *ios)
+{
+	struct vvp_io     *vio   = cl2vvp_io(env, ios);
+	struct ccc_io     *cio   = cl2ccc_io(env, ios);
+	struct cl_io      *io    = ios->cis_io;
+	struct cl_object  *obj   = io->ci_obj;
+	struct inode      *inode = ccc_object_inode(obj);
+	struct ll_ra_read *bead  = &vio->cui_bead;
+	struct file       *file  = cio->cui_fd->fd_file;
+
+	loff_t  pos    = io->u.ci_rd.rd.crw_pos;
+	long    cnt    = io->u.ci_rd.rd.crw_count;
+	long    tot    = cio->cui_tot_count;
+	int     exceed = 0;
+	int     result;
+
+	CLOBINVRNT(env, obj, ccc_object_invariant(obj));
+
+	CDEBUG(D_VFSTRACE, "read: -> [%lli, %lli)\n", pos, pos + cnt);
+
+	if (!can_populate_pages(env, io, inode))
+		return 0;
+
+	result = ccc_prep_size(env, obj, io, pos, tot, &exceed);
+	if (result != 0)
+		return result;
+	if (exceed != 0)
+		goto out;
+
+	LU_OBJECT_HEADER(D_INODE, env, &obj->co_lu,
+			"Read ino %lu, %lu bytes, offset %lld, size %llu\n",
+			inode->i_ino, cnt, pos, i_size_read(inode));
+
+	/* turn off the kernel's read-ahead */
+	file->f_ra.ra_pages = 0;
+
+	/* initialize read-ahead window once per syscall */
+	if (!vio->cui_ra_window_set) {
+		vio->cui_ra_window_set = 1;
+		bead->lrr_start = cl_index(obj, pos);
+		/*
+		 * XXX: explicit PAGE_CACHE_SIZE
+		 */
+		bead->lrr_count = cl_index(obj, tot + PAGE_CACHE_SIZE - 1);
+		ll_ra_read_in(file, bead);
+	}
+
+	/* BUG: 5972 */
+	file_accessed(file);
+	switch (vio->cui_io_subtype) {
+	case IO_NORMAL:
+		result = lustre_generic_file_read(file, cio, &pos);
+		break;
+	case IO_SPLICE:
+		result = generic_file_splice_read(file, &pos,
+				vio->u.splice.cui_pipe, cnt,
+				vio->u.splice.cui_flags);
+		/* LU-1109: do splice read stripe by stripe otherwise if it
+		 * may make nfsd stuck if this read occupied all internal pipe
+		 * buffers. */
+		io->ci_continue = 0;
+		break;
+	default:
+		CERROR("Wrong IO type %u\n", vio->cui_io_subtype);
+		LBUG();
+	}
+
+out:
+	if (result < 0)
+		return result;
+
+	if (result < cnt)
+		io->ci_continue = 0;
+	io->ci_nob += result;
+	ll_rw_stats_tally(ll_i2sbi(inode), current->pid,
+			  cio->cui_fd, pos, result, 0);
+	return 0;
+}
+
+/*
+ * Finalizer for read io: leave the read-ahead window if
+ * vvp_io_read_start() entered one, then run the common vvp_io_fini().
+ */
+static void vvp_io_read_fini(const struct lu_env *env, const struct cl_io_slice *ios)
+{
+	struct vvp_io *vio = cl2vvp_io(env, ios);
+	struct ccc_io *cio = cl2ccc_io(env, ios);
+
+	if (vio->cui_ra_window_set) {
+		struct file *file = cio->cui_fd->fd_file;
+
+		ll_ra_read_ex(file, &vio->cui_bead);
+	}
+
+	vvp_io_fini(env, ios);
+}
+
+/*
+ * Start method for write io.
+ *
+ * Returns 0 without writing when the layout changed under the io.  For
+ * an append the write position is moved to the current file size first.
+ * The data is written through lustre_generic_file_write(), unless the io
+ * carries no iovec.
+ *
+ * A positive byte count is added to io->ci_nob and turned into 0; a
+ * short write stops the io.  Zero and negative results are returned as
+ * they are.
+ */
+static int vvp_io_write_start(const struct lu_env *env,
+			      const struct cl_io_slice *ios)
+{
+	struct ccc_io      *cio   = cl2ccc_io(env, ios);
+	struct cl_io       *io    = ios->cis_io;
+	struct cl_object   *obj   = io->ci_obj;
+	struct inode       *inode = ccc_object_inode(obj);
+	struct file	   *file  = cio->cui_fd->fd_file;
+	loff_t  pos     = io->u.ci_wr.wr.crw_pos;
+	size_t  cnt     = io->u.ci_wr.wr.crw_count;
+	ssize_t written = 0;
+
+	ENTRY;
+
+	if (!can_populate_pages(env, io, inode))
+		return 0;
+
+	if (cl_io_is_append(io)) {
+		/*
+		 * PARALLEL IO This has to be changed for parallel IO doing
+		 * out-of-order writes.
+		 */
+		pos = io->u.ci_wr.wr.crw_pos = i_size_read(inode);
+		cio->cui_iocb->ki_pos = pos;
+	}
+
+	CDEBUG(D_VFSTRACE, "write: [%lli, %lli)\n", pos, pos + (long long)cnt);
+
+	/* A NULL iovec comes from a temp io in ll_cl_init(): nothing to do. */
+	if (cio->cui_iov != NULL)
+		written = lustre_generic_file_write(file, cio, &pos);
+
+	if (written > 0) {
+		if (written < cnt)
+			io->ci_continue = 0;
+		io->ci_nob += written;
+		ll_rw_stats_tally(ll_i2sbi(inode), current->pid,
+				  cio->cui_fd, pos, written, 0);
+		written = 0;
+	}
+	RETURN(written);
+}
+
+/*
+ * Let the generic page cache service the fault described by @cfio.
+ *
+ * filemap_fault() is called and its VM_FAULT_* result is stored in
+ * cfio->fault.ft_flags.  When it produced a page, that page is returned
+ * locked in cfio->ft_vmpage; if filemap_fault() did not lock it already,
+ * it is locked here and VM_FAULT_LOCKED is added to the stored flags.
+ *
+ * Returns 0 when a page was obtained, -EFAULT for VM_FAULT_SIGBUS,
+ * -ENOMEM for VM_FAULT_OOM, -EAGAIN for VM_FAULT_RETRY, and -EINVAL for
+ * any other result without a page.
+ */
+static int vvp_io_kernel_fault(struct vvp_fault_io *cfio)
+{
+	struct vm_fault *vmf = cfio->fault.ft_vmf;
+
+	cfio->fault.ft_flags = filemap_fault(cfio->ft_vma, vmf);
+
+	if (vmf->page) {
+		LL_CDEBUG_PAGE(D_PAGE, vmf->page, "got addr %p type NOPAGE\n",
+			       vmf->virtual_address);
+		if (unlikely(!(cfio->fault.ft_flags & VM_FAULT_LOCKED))) {
+			lock_page(vmf->page);
+			/*
+			 * Record that the page is now locked.  This must be
+			 * an OR: masking with VM_FAULT_LOCKED (whose bit is
+			 * known to be clear here) would zero all the flags.
+			 */
+			cfio->fault.ft_flags |= VM_FAULT_LOCKED;
+		}
+
+		cfio->ft_vmpage = vmf->page;
+		return 0;
+	}
+
+	if (cfio->fault.ft_flags & VM_FAULT_SIGBUS) {
+		CDEBUG(D_PAGE, "got addr %p - SIGBUS\n", vmf->virtual_address);
+		return -EFAULT;
+	}
+
+	if (cfio->fault.ft_flags & VM_FAULT_OOM) {
+		CDEBUG(D_PAGE, "got addr %p - OOM\n", vmf->virtual_address);
+		return -ENOMEM;
+	}
+
+	if (cfio->fault.ft_flags & VM_FAULT_RETRY)
+		return -EAGAIN;
+
+	CERROR("unknow error in page fault %d!\n", cfio->fault.ft_flags);
+	return -EINVAL;
+}
+
+
+/*
+ * Start method for CIT_FAULT.
+ *
+ * Obtains the locked VM page for fio->ft_index, checks it against a
+ * concurrent local truncate, looks up the matching cl_page and hands it
+ * back in fio->ft_page together with the number of valid bytes on the
+ * page in fio->ft_nob.
+ *
+ * For a plain fault the VM page comes from vvp_io_kernel_fault().  For a
+ * mkwrite fault the page already stored in cfio->ft_vmpage is locked
+ * here and, when set_page_dirty() returns non-zero, it is added to the
+ * write cache with cl_page_cache_add().
+ *
+ * Returns 0 on success, +1 when a fault/truncate race was detected,
+ * -ENODATA when a mkwrite targets a page past the end of the file, or
+ * another negative errno.  Once the "out" label is reached the VM page
+ * (if still referenced through the local vmpage) is unlocked and
+ * VM_FAULT_LOCKED is cleared from cfio->fault.ft_flags.
+ */
+static int vvp_io_fault_start(const struct lu_env *env,
+			      const struct cl_io_slice *ios)
+{
+	struct vvp_io       *vio     = cl2vvp_io(env, ios);
+	struct cl_io	*io      = ios->cis_io;
+	struct cl_object    *obj     = io->ci_obj;
+	struct inode	*inode   = ccc_object_inode(obj);
+	struct cl_fault_io  *fio     = &io->u.ci_fault;
+	struct vvp_fault_io *cfio    = &vio->u.fault;
+	loff_t	       offset;
+	int		  result  = 0;
+	struct page	  *vmpage  = NULL;
+	struct cl_page      *page;
+	loff_t	       size;
+	pgoff_t	      last; /* last page in a file data region */
+
+	/* Only warn: the fault proceeds even if the binary's mtime changed. */
+	if (fio->ft_executable &&
+	    LTIME_S(inode->i_mtime) != vio->u.fault.ft_mtime)
+		CWARN("binary "DFID
+		      " changed while waiting for the page fault lock\n",
+		      PFID(lu_object_fid(&obj->co_lu)));
+
+	/* offset of the last byte on the page */
+	offset = cl_offset(obj, fio->ft_index + 1) - 1;
+	LASSERT(cl_index(obj, offset) == fio->ft_index);
+	result = ccc_prep_size(env, obj, io, 0, offset + 1, NULL);
+	if (result != 0)
+		return result;
+
+	/* must return locked page */
+	if (fio->ft_mkwrite) {
+		LASSERT(cfio->ft_vmpage != NULL);
+		lock_page(cfio->ft_vmpage);
+	} else {
+		result = vvp_io_kernel_fault(cfio);
+		if (result != 0)
+			return result;
+	}
+
+	vmpage = cfio->ft_vmpage;
+	LASSERT(PageLocked(vmpage));
+
+	/* Fault-injection hook used to provoke the truncate race below. */
+	if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_FAULT_TRUNC_RACE))
+		ll_invalidate_page(vmpage);
+
+	size = i_size_read(inode);
+	/* Though we have already held a cl_lock upon this page, but
+	 * it still can be truncated locally. */
+	if (unlikely((vmpage->mapping != inode->i_mapping) ||
+		     (page_offset(vmpage) > size))) {
+		CDEBUG(D_PAGE, "llite: fault and truncate race happened!\n");
+
+		/* return +1 to stop cl_io_loop() and ll_fault() will catch
+		 * and retry. */
+		GOTO(out, result = +1);
+	}
+
+
+	if (fio->ft_mkwrite ) {
+		pgoff_t last_index;
+		/*
+		 * Capture the size while holding the lli_trunc_sem from above
+		 * we want to make sure that we complete the mkwrite action
+		 * while holding this lock. We need to make sure that we are
+		 * not past the end of the file.
+		 */
+		last_index = cl_index(obj, size - 1);
+		if (last_index < fio->ft_index) {
+			CDEBUG(D_PAGE,
+				"llite: mkwrite and truncate race happened: "
+				"%p: 0x%lx 0x%lx\n",
+				vmpage->mapping,fio->ft_index,last_index);
+			/*
+			 * We need to return if we are
+			 * passed the end of the file. This will propagate
+			 * up the call stack to ll_page_mkwrite where
+			 * we will return VM_FAULT_NOPAGE. Any non-negative
+			 * value returned here will be silently
+			 * converted to 0. If the vmpage->mapping is null
+			 * the error code would be converted back to ENODATA
+			 * in ll_page_mkwrite0. Thus we return -ENODATA
+			 * to handle both cases
+			 */
+			GOTO(out, result = -ENODATA);
+		}
+	}
+
+	page = cl_page_find(env, obj, fio->ft_index, vmpage, CPT_CACHEABLE);
+	if (IS_ERR(page))
+		GOTO(out, result = PTR_ERR(page));
+
+	/* if page is going to be written, we should add this page into cache
+	 * earlier. */
+	if (fio->ft_mkwrite) {
+		wait_on_page_writeback(vmpage);
+		if (set_page_dirty(vmpage)) {
+			struct ccc_page *cp;
+
+			/* vvp_page_assume() calls wait_on_page_writeback(). */
+			cl_page_assume(env, io, page);
+
+			cp = cl2ccc_page(cl_page_at(page, &vvp_device_type));
+			vvp_write_pending(cl2ccc(obj), cp);
+
+			/* Do not set Dirty bit here so that in case IO is
+			 * started before the page is really made dirty, we
+			 * still have chance to detect it. */
+			result = cl_page_cache_add(env, io, page, CRT_WRITE);
+			LASSERT(cl_page_is_owned(page, io));
+
+			/*
+			 * From here on "out" must not unlock_page(): both
+			 * branches below end in cl_page_disown(), which
+			 * presumably releases the VM page lock -- confirm.
+			 */
+			vmpage = NULL;
+			if (result < 0) {
+				cl_page_unmap(env, io, page);
+				cl_page_discard(env, io, page);
+				cl_page_disown(env, io, page);
+
+				cl_page_put(env, page);
+
+				/* we're in big trouble, what can we do now? */
+				if (result == -EDQUOT)
+					result = -ENOSPC;
+				GOTO(out, result);
+			} else
+				cl_page_disown(env, io, page);
+		}
+	}
+
+	last = cl_index(obj, size - 1);
+	/*
+	 * The ft_index is only used in the case of
+	 * a mkwrite action. We need to check
+	 * our assertions are correct, since
+	 * we should have caught this above
+	 */
+	LASSERT(!fio->ft_mkwrite || fio->ft_index <= last);
+	if (fio->ft_index == last)
+		/*
+		 * Last page is mapped partially.
+		 */
+		fio->ft_nob = size - cl_offset(obj, fio->ft_index);
+	else
+		fio->ft_nob = cl_page_size(obj);
+
+	/* Publish the cl_page to the caller, tagged with a "fault" ref. */
+	lu_ref_add(&page->cp_reference, "fault", io);
+	fio->ft_page = page;
+	EXIT;
+
+out:
+	/* return unlocked vmpage to avoid deadlocking */
+	if (vmpage != NULL)
+		unlock_page(vmpage);
+	cfio->fault.ft_flags &= ~VM_FAULT_LOCKED;
+	return result;
+}
+
+/*
+ * Start method for CIT_FSYNC.  Nothing is done at this layer; the
+ * function always returns 0.
+ */
+static int vvp_io_fsync_start(const struct lu_env *env,
+			      const struct cl_io_slice *ios)
+{
+	/* we should mark TOWRITE bit to each dirty page in radix tree to
+	 * verify pages have been written, but this is difficult because of
+	 * race. */
+	return 0;
+}
+
+/*
+ * cio_read_page method: queue one page for reading and kick read-ahead.
+ *
+ * The read-ahead state is updated first (when read-ahead is enabled by
+ * both ra_max_pages_per_file and ra_max_pages).  The page is then checked
+ * with cl_page_is_under_lock(): any answer other than -EBUSY is logged,
+ * and any answer other than -EBUSY or -ENODATA is returned to the caller.
+ * Otherwise the page is added to the IO's 2-queue and ll_readahead() is
+ * invoked on the queue's input list.
+ *
+ * Returns 0 on success or the value from cl_page_is_under_lock().
+ */
+static int vvp_io_read_page(const struct lu_env *env,
+			    const struct cl_io_slice *ios,
+			    const struct cl_page_slice *slice)
+{
+	struct cl_io	      *io     = ios->cis_io;
+	struct cl_object	  *obj    = slice->cpl_obj;
+	struct ccc_page	   *ccpg   = cl2ccc_page(slice);
+	struct cl_page	    *clpg   = slice->cpl_page;
+	struct inode	      *inode  = ccc_object_inode(obj);
+	struct ll_sb_info	 *sbi    = ll_i2sbi(inode);
+	struct ll_file_data       *fd     = cl2ccc_io(env, ios)->cui_fd;
+	struct ll_readahead_state *ras    = &fd->fd_ras;
+	struct page		*vmpage = ccpg->cpg_page;
+	struct cl_2queue	  *ioq    = &io->ci_queue;
+	int ret;
+
+	CLOBINVRNT(env, obj, ccc_object_invariant(obj));
+	LASSERT(slice->cpl_obj == obj);
+
+	ENTRY;
+
+	if (sbi->ll_ra_info.ra_max_pages_per_file &&
+	    sbi->ll_ra_info.ra_max_pages)
+		ras_update(sbi, inode, ras, clpg->cp_index,
+			   ccpg->cpg_defer_uptodate);
+
+	/* Sanity check whether the page is protected by a lock. */
+	ret = cl_page_is_under_lock(env, io, clpg);
+	if (ret != -EBUSY) {
+		const char *why;
+
+		if (ret == -ENODATA)
+			why = "without a lock";
+		else
+			why = "match failed";
+		CL_PAGE_HEADER(D_WARNING, env, clpg, "%s: %d\n", why, ret);
+		/* A page merely lacking a lock is still read. */
+		if (ret != -ENODATA)
+			RETURN(ret);
+	}
+
+	if (ccpg->cpg_defer_uptodate) {
+		ccpg->cpg_ra_used = 1;
+		cl_page_export(env, clpg, 1);
+	}
+
+	/*
+	 * The page is queued even if it was just exported as uptodate:
+	 * cl_page_list_disown() will then unlock it automatically.
+	 */
+	cl_2queue_add(ioq, clpg);
+
+	if (sbi->ll_ra_info.ra_max_pages_per_file &&
+	    sbi->ll_ra_info.ra_max_pages)
+		ll_readahead(env, io, ras,
+			     vmpage->mapping, &ioq->c2_qin, fd->fd_flags);
+
+	RETURN(0);
+}
+
+/*
+ * Synchronously transfer a single page as part of a read or write IO.
+ *
+ * The IO's 2-queue is initialised with @page alone and submitted with
+ * cl_io_submit_sync().  After a read the queued page is disowned; after
+ * a write it is left as is.  Returns the cl_io_submit_sync() result.
+ */
+static int vvp_page_sync_io(const struct lu_env *env, struct cl_io *io,
+			    struct cl_page *page, struct ccc_page *cp,
+			    enum cl_req_type crt)
+{
+	struct cl_2queue *ioq = &io->ci_queue;
+	int rc;
+
+	LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE);
+
+	cl_2queue_init_page(ioq, page);
+	rc = cl_io_submit_sync(env, io, crt, ioq, 0);
+	LASSERT(cl_page_is_owned(page, io));
+
+	/*
+	 * Only reads disown the page here: in the CRT_WRITE case the page
+	 * is left locked even in case of error.
+	 */
+	if (crt == CRT_READ) {
+		cl_page_list_disown(env, io, &ioq->c2_qin);
+	}
+	cl_2queue_fini(env, ioq);
+
+	return rc;
+}
+
+/**
+ * Prepare a page that will only be partially overwritten.
+ *
+ * The object attributes are fetched under the attribute lock.  Then:
+ *  - if the page starts at or beyond the known minimum size (KMS) it
+ *    holds no old data and is simply zero-filled;
+ *  - else if read-ahead already brought the data in (cpg_defer_uptodate)
+ *    the page is marked as used read-ahead;
+ *  - otherwise the old contents are read synchronously.
+ * On success the page is exported as uptodate.
+ *
+ * \retval 0 on success, otherwise the error from cl_object_attr_get()
+ * or vvp_page_sync_io().
+ */
+static int vvp_io_prepare_partial(const struct lu_env *env, struct cl_io *io,
+				  struct cl_object *obj, struct cl_page *pg,
+				  struct ccc_page *cp,
+				  unsigned from, unsigned to)
+{
+	struct cl_attr *attr      = ccc_env_thread_attr(env);
+	loff_t	  pg_start  = cl_offset(obj, pg->cp_index);
+	int	     rc;
+
+	cl_object_attr_lock(obj);
+	rc = cl_object_attr_get(env, obj, attr);
+	cl_object_attr_unlock(obj);
+	if (rc != 0)
+		return rc;
+
+	if (attr->cat_kms <= pg_start) {
+		/*
+		 * Writing to a new page: there is no old data to read.
+		 * The extent locking will have updated the KMS, and for
+		 * our purposes here we can treat it like i_size.
+		 */
+		char *kaddr = ll_kmap_atomic(cp->cpg_page, KM_USER0);
+
+		memset(kaddr, 0, cl_page_size(obj));
+		ll_kunmap_atomic(kaddr, KM_USER0);
+	} else if (cp->cpg_defer_uptodate) {
+		cp->cpg_ra_used = 1;
+	} else {
+		rc = vvp_page_sync_io(env, io, pg, cp, CRT_READ);
+	}
+
+	/*
+	 * In older implementations, obdo_refresh_inode was called here to
+	 * update the inode because the write might modify the object info
+	 * at OST.  This has been proven useless, since LVB functions are
+	 * called when a user space program retrieves inode attributes.
+	 * See bug 15909 for details. -jay
+	 */
+	if (rc == 0)
+		cl_page_export(env, pg, 1);
+
+	return rc;
+}
+
+/*
+ * cio_prepare_write method: get a page ready to receive the bytes
+ * @from..@to.
+ *
+ * A page that is already uptodate needs no preparation.  A page that is
+ * about to be overwritten completely (from == 0, to == PAGE_CACHE_SIZE)
+ * is only poisoned; it is deliberately not marked uptodate until
+ * commit_write.  Any other page is prepared by vvp_io_prepare_partial().
+ *
+ * Returns 0, or the error from vvp_io_prepare_partial().
+ */
+static int vvp_io_prepare_write(const struct lu_env *env,
+				const struct cl_io_slice *ios,
+				const struct cl_page_slice *slice,
+				unsigned from, unsigned to)
+{
+	struct cl_object *obj    = slice->cpl_obj;
+	struct ccc_page  *cp     = cl2ccc_page(slice);
+	struct cl_page   *pg     = slice->cpl_page;
+	struct page       *vmpage = cp->cpg_page;
+
+	int result;
+
+	ENTRY;
+
+	LINVRNT(cl_page_is_vmlocked(env, pg));
+	LASSERT(vmpage->mapping->host == ccc_object_inode(obj));
+
+	result = 0;
+
+	CL_PAGE_HEADER(D_PAGE, env, pg, "preparing: [%d, %d]\n", from, to);
+	if (!PageUptodate(vmpage)) {
+		/*
+		 * We're completely overwriting an existing page, so _don't_
+		 * set it up to date until commit_write
+		 */
+		if (from == 0 && to == PAGE_CACHE_SIZE) {
+			CL_PAGE_HEADER(D_PAGE, env, pg, "full page write\n");
+			/*
+			 * Poison the VM page itself.  No identifier named
+			 * "page" exists in this function, so the macro must
+			 * be given vmpage to compile when it uses its
+			 * argument.
+			 */
+			POISON_PAGE(vmpage, 0x11);
+		} else
+			result = vvp_io_prepare_partial(env, ios->cis_io, obj,
+							pg, cp, from, to);
+	} else
+		CL_PAGE_HEADER(D_PAGE, env, pg, "uptodate\n");
+	RETURN(result);
+}
+
+/*
+ * cio_commit_write method: commit the bytes @from..@to written to a page.
+ *
+ * If the VM page is not yet dirty it is added to the write cache with
+ * cl_page_cache_add().  When that fails with -EDQUOT the page is written
+ * out synchronously instead, after being clipped to the part of it that
+ * lies within the file.  A dirty-hit or dirty-miss statistic is tallied
+ * either way.
+ *
+ * On success the inode is flagged LLIF_DATA_MODIFIED, i_size is extended
+ * if the write went past it, and the page is exported as uptodate.  On
+ * failure a page that lies beyond i_size is discarded.
+ *
+ * Returns 0 on success or the error from cl_page_cache_add() /
+ * vvp_page_sync_io().
+ */
+static int vvp_io_commit_write(const struct lu_env *env,
+			       const struct cl_io_slice *ios,
+			       const struct cl_page_slice *slice,
+			       unsigned from, unsigned to)
+{
+	struct cl_object  *obj    = slice->cpl_obj;
+	struct cl_io      *io     = ios->cis_io;
+	struct ccc_page   *cp     = cl2ccc_page(slice);
+	struct cl_page    *pg     = slice->cpl_page;
+	struct inode      *inode  = ccc_object_inode(obj);
+	struct ll_sb_info *sbi    = ll_i2sbi(inode);
+	struct ll_inode_info *lli = ll_i2info(inode);
+	struct page	*vmpage = cp->cpg_page;
+
+	int    result;
+	int    tallyop;
+	loff_t size;
+
+	ENTRY;
+
+	LINVRNT(cl_page_is_vmlocked(env, pg));
+	LASSERT(vmpage->mapping->host == inode);
+
+	LU_OBJECT_HEADER(D_INODE, env, &obj->co_lu, "commiting page write\n");
+	CL_PAGE_HEADER(D_PAGE, env, pg, "committing: [%d, %d]\n", from, to);
+
+	/*
+	 * queue a write for some time in the future the first time we
+	 * dirty the page.
+	 *
+	 * This is different from what other file systems do: they usually
+	 * just mark page (and some of its buffers) dirty and rely on
+	 * balance_dirty_pages() to start a write-back. Lustre wants write-back
+	 * to be started earlier for the following reasons:
+	 *
+	 *     (1) with a large number of clients we need to limit the amount
+	 *     of cached data on the clients a lot;
+	 *
+	 *     (2) large compute jobs generally want compute-only then io-only
+	 *     and the IO should complete as quickly as possible;
+	 *
+	 *     (3) IO is batched up to the RPC size and is async until the
+	 *     client max cache is hit
+	 *     (/proc/fs/lustre/osc/OSC.../max_dirty_mb)
+	 *
+	 */
+	if (!PageDirty(vmpage)) {
+		tallyop = LPROC_LL_DIRTY_MISSES;
+		result = cl_page_cache_add(env, io, pg, CRT_WRITE);
+		if (result == 0) {
+			/* page was added into cache successfully. */
+			set_page_dirty(vmpage);
+			vvp_write_pending(cl2ccc(obj), cp);
+		} else if (result == -EDQUOT) {
+			pgoff_t last_index = i_size_read(inode) >> PAGE_CACHE_SHIFT;
+			bool need_clip = true;
+
+			/*
+			 * Client ran out of disk space grant. Possible
+			 * strategies are:
+			 *
+			 *     (a) do a sync write, renewing grant;
+			 *
+			 *     (b) stop writing on this stripe, switch to the
+			 *     next one.
+			 *
+			 * (b) is a part of "parallel io" design that is the
+			 * ultimate goal. (a) is what "old" client did, and
+			 * what the new code continues to do for the time
+			 * being.
+			 */
+			/*
+			 * Work out how much of the page to write: all of it
+			 * when the page lies wholly inside the file (no clip
+			 * needed), otherwise up to the larger of @to and the
+			 * in-page offset of i_size.
+			 */
+			if (last_index > pg->cp_index) {
+				to = PAGE_CACHE_SIZE;
+				need_clip = false;
+			} else if (last_index == pg->cp_index) {
+				int size_to = i_size_read(inode) & ~CFS_PAGE_MASK;
+				if (to < size_to)
+					to = size_to;
+			}
+			if (need_clip)
+				cl_page_clip(env, pg, 0, to);
+			result = vvp_page_sync_io(env, io, pg, cp, CRT_WRITE);
+			if (result)
+				CERROR("Write page %lu of inode %p failed %d\n",
+				       pg->cp_index, inode, result);
+		}
+	} else {
+		tallyop = LPROC_LL_DIRTY_HITS;
+		result = 0;
+	}
+	ll_stats_ops_tally(sbi, tallyop, 1);
+
+	/* Inode should be marked DIRTY even if no new page was marked DIRTY
+	 * because page could have been not flushed between 2 modifications.
+	 * It is important the file is marked DIRTY as soon as the I/O is done
+	 * Indeed, when cache is flushed, file could be already closed and it
+	 * is too late to warn the MDT.
+	 * It is acceptable that file is marked DIRTY even if I/O is dropped
+	 * for some reasons before being flushed to OST.
+	 */
+	if (result == 0) {
+		spin_lock(&lli->lli_lock);
+		lli->lli_flags |= LLIF_DATA_MODIFIED;
+		spin_unlock(&lli->lli_lock);
+	}
+
+	/* File offset just past the last byte covered by this commit. */
+	size = cl_offset(obj, pg->cp_index) + to;
+
+	ll_inode_size_lock(inode);
+	if (result == 0) {
+		if (size > i_size_read(inode)) {
+			cl_isize_write_nolock(inode, size);
+			CDEBUG(D_VFSTRACE, DFID" updating i_size %lu\n",
+			       PFID(lu_object_fid(&obj->co_lu)),
+			       (unsigned long)size);
+		}
+		cl_page_export(env, pg, 1);
+	} else {
+		/* Failed write beyond EOF: drop the page. */
+		if (size > i_size_read(inode))
+			cl_page_discard(env, io, pg);
+	}
+	ll_inode_size_unlock(inode);
+	RETURN(result);
+}
+
+/*
+ * IO method table of the VVP layer.
+ *
+ * The per-IO-type entries are written in the order iter_init, lock,
+ * start, end, advance, fini; entries an IO type does not list are left
+ * unset.  The three page-level methods at the end are shared by all IO
+ * types.
+ */
+static const struct cl_io_operations vvp_io_ops = {
+	.op = {
+		[CIT_READ] = {
+			.cio_lock      = vvp_io_read_lock,
+			.cio_start     = vvp_io_read_start,
+			.cio_advance   = ccc_io_advance,
+			.cio_fini      = vvp_io_read_fini
+		},
+		[CIT_WRITE] = {
+			.cio_lock      = vvp_io_write_lock,
+			.cio_start     = vvp_io_write_start,
+			.cio_advance   = ccc_io_advance,
+			.cio_fini      = vvp_io_fini
+		},
+		[CIT_SETATTR] = {
+			.cio_iter_init = vvp_io_setattr_iter_init,
+			.cio_lock      = vvp_io_setattr_lock,
+			.cio_start     = vvp_io_setattr_start,
+			.cio_end       = vvp_io_setattr_end,
+			.cio_fini      = vvp_io_setattr_fini
+		},
+		[CIT_FAULT] = {
+			.cio_iter_init = vvp_io_fault_iter_init,
+			.cio_lock      = vvp_io_fault_lock,
+			.cio_start     = vvp_io_fault_start,
+			.cio_end       = ccc_io_end,
+			.cio_fini      = vvp_io_fault_fini
+		},
+		[CIT_FSYNC] = {
+			.cio_start     = vvp_io_fsync_start,
+			.cio_fini      = vvp_io_fini
+		},
+		[CIT_MISC] = {
+			.cio_fini      = vvp_io_fini
+		}
+	},
+	.cio_read_page     = vvp_io_read_page,
+	.cio_prepare_write = vvp_io_prepare_write,
+	.cio_commit_write  = vvp_io_commit_write
+};
+
+/*
+ * Initialise the VVP slice of a cl_io.
+ *
+ * The per-environment ccc_io slice is cleaned and attached to @io with
+ * the vvp_io_ops method table, then per-type setup is done:
+ *  - read/write: a zero-length transfer yields result 1; otherwise the
+ *    total byte count is recorded.  The job id is stored in the inode in
+ *    either case;
+ *  - setattr: anything but a truncate makes the lock request mandatory;
+ *  - misc: layout changes are ignored unless ci_verify_layout is set.
+ * Finally, unless the result is already non-zero or layout is ignored,
+ * the file layout is refreshed.
+ *
+ * Returns 0 on success, 1 for a zero-length read/write, or a negative
+ * errno from ll_layout_refresh() (-ENOENT is treated as success).
+ */
+int vvp_io_init(const struct lu_env *env, struct cl_object *obj,
+		struct cl_io *io)
+{
+	struct vvp_io      *vio   = vvp_env_io(env);
+	struct ccc_io      *cio   = ccc_env_io(env);
+	struct inode       *inode = ccc_object_inode(obj);
+	int		 result = 0;
+
+	CLOBINVRNT(env, obj, ccc_object_invariant(obj));
+	ENTRY;
+
+	CL_IO_SLICE_CLEAN(cio, cui_cl);
+	cl_io_slice_add(io, &cio->cui_cl, obj, &vvp_io_ops);
+	vio->cui_ra_window_set = 0;
+
+	switch (io->ci_type) {
+	case CIT_READ:
+	case CIT_WRITE: {
+		struct ll_inode_info *lli = ll_i2info(inode);
+		size_t nbytes = io->u.ci_rw.crw_count;
+
+		/* "If nbyte is 0, read() will return 0 and have no other
+		 *  results."  -- Single Unix Spec */
+		if (nbytes == 0) {
+			result = 1;
+		} else {
+			cio->cui_tot_count = nbytes;
+			cio->cui_tot_nrsegs = 0;
+		}
+		/*
+		 * For read/write the jobid is stored in the inode, to be
+		 * fetched by osc when building the RPC.  This is not
+		 * accurate if the file is shared by different jobs.
+		 */
+		lustre_get_jobid(lli->lli_jobid);
+		break;
+	}
+	case CIT_SETATTR:
+		if (!cl_io_is_trunc(io))
+			io->ci_lockreq = CILR_MANDATORY;
+		break;
+	case CIT_MISC:
+		/*
+		 * Ignore layout change for generic CIT_MISC but not for
+		 * glimpse: the io context for glimpse must set
+		 * ci_verify_layout to true, see cl_glimpse_size0().
+		 */
+		if (!io->ci_verify_layout)
+			io->ci_ignore_layout = 1;
+		break;
+	default:
+		break;
+	}
+
+	if (result != 0 || io->ci_ignore_layout)
+		RETURN(result);
+
+	/*
+	 * Enqueue the layout lock and get the layout version.  This is
+	 * needed even for operations that require an open file, such as
+	 * read and write, because IT_OPEN might not grant the layout lock.
+	 */
+	result = ll_layout_refresh(inode, &cio->cui_layout_gen);
+	if (result == -ENOENT) {
+		/*
+		 * If the inode on MDS has been removed, but the objects on
+		 * OSTs haven't been destroyed (async unlink), layout fetch
+		 * will return -ENOENT; ignore this error and continue with
+		 * dirty flush. LU-3230.
+		 */
+		result = 0;
+	}
+	if (result < 0)
+		CERROR("%s: refresh file layout " DFID " error %d.\n",
+			ll_get_fsname(inode->i_sb, NULL, 0),
+			PFID(lu_object_fid(&obj->co_lu)), result);
+
+	RETURN(result);
+}
+
+/*
+ * Return the vvp_io of the current environment.  @slice is only passed
+ * to cl2ccc_io(), whose return value is discarded.
+ */
+static struct vvp_io *cl2vvp_io(const struct lu_env *env,
+				const struct cl_io_slice *slice)
+{
+	/* Calling just for assertion */
+	cl2ccc_io(env, slice);
+	return vvp_env_io(env);
+}

diff --git a/drivers/staging/lustre/lustre/llite/vvp_lock.c b/drivers/staging/lustre/lustre/llite/vvp_lock.c
new file mode 100644
index 0000000..9b8712b
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/vvp_lock.c

@@ -0,0 +1,85 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_lock for VVP layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+
+#include <obd.h>
+#include <lustre_lite.h>
+
+#include "vvp_internal.h"
+
+/*****************************************************************************
+ *
+ * Vvp lock functions.
+ *
+ */
+
+/**
+ * Estimate how valuable a lock is, for managing the lock cache during
+ * memory shortages.
+ *
+ * A lock on an object with at least one memory mapping (cob_mmap_cnt > 0)
+ * is almost infinitely precious and weighs ~0UL >> 2; any other lock is
+ * junk and weighs 0.  "Mapped locks" are heavy but not infinitely heavy,
+ * so that weights assigned by other layers can still order them among
+ * themselves.
+ */
+static unsigned long vvp_lock_weigh(const struct lu_env *env,
+				    const struct cl_lock_slice *slice)
+{
+	struct ccc_object *cob = cl2ccc(slice->cls_obj);
+	unsigned long weight = 0;
+
+	ENTRY;
+	if (atomic_read(&cob->cob_mmap_cnt) > 0)
+		weight = ~0UL >> 2;
+	RETURN(weight);
+}
+
+/*
+ * Lock method table of the VVP layer: every method is the shared
+ * ccc_lock_*() implementation except clo_weigh, which is VVP specific.
+ */
+static const struct cl_lock_operations vvp_lock_ops = {
+	.clo_enqueue   = ccc_lock_enqueue,
+	.clo_wait      = ccc_lock_wait,
+	.clo_unuse     = ccc_lock_unuse,
+	.clo_state     = ccc_lock_state,
+	.clo_fits_into = ccc_lock_fits_into,
+	.clo_weigh     = vvp_lock_weigh,
+	.clo_delete    = ccc_lock_delete,
+	.clo_fini      = ccc_lock_fini
+};
+
+/*
+ * Initialise the VVP slice of @lock by delegating to ccc_lock_init()
+ * with the vvp_lock_ops method table; returns its result.
+ */
+int vvp_lock_init(const struct lu_env *env, struct cl_object *obj,
+		  struct cl_lock *lock, const struct cl_io *io)
+{
+	return ccc_lock_init(env, obj, lock, io, &vvp_lock_ops);
+}

diff --git a/drivers/staging/lustre/lustre/llite/vvp_object.c b/drivers/staging/lustre/lustre/llite/vvp_object.c
new file mode 100644
index 0000000..01edc5b
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/vvp_object.c

@@ -0,0 +1,186 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * cl_object implementation for VVP layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+
+#include <linux/libcfs/libcfs.h>
+
+#include <obd.h>
+#include <lustre_lite.h>
+
+#include "vvp_internal.h"
+
+/*****************************************************************************
+ *
+ * Object operations.
+ *
+ */
+
+/*
+ * loo_object_print method: describe a VVP object through printer @p.
+ *
+ * Prints whether the pending list is empty ("-") or not ("+"), the
+ * transient page count, the mmap count and the inode pointer.  When an
+ * inode is attached its number, generation, mode, link count, reference
+ * count, cl_object pointer and FID follow.  Always returns 0.
+ */
+static int vvp_object_print(const struct lu_env *env, void *cookie,
+			    lu_printer_t p, const struct lu_object *o)
+{
+	struct ccc_object    *obj   = lu2ccc(o);
+	struct inode	 *inode = obj->cob_inode;
+	struct ll_inode_info *lli;
+	const char	   *pending;
+
+	if (list_empty(&obj->cob_pending_list))
+		pending = "-";
+	else
+		pending = "+";
+
+	(*p)(env, cookie, "(%s %d %d) inode: %p ",
+	     pending,
+	     obj->cob_transient_pages, atomic_read(&obj->cob_mmap_cnt),
+	     inode);
+
+	/* Nothing more to report without an inode. */
+	if (!inode)
+		return 0;
+
+	lli = ll_i2info(inode);
+	(*p)(env, cookie, "%lu/%u %o %u %d %p "DFID,
+	     inode->i_ino, inode->i_generation, inode->i_mode,
+	     inode->i_nlink, atomic_read(&inode->i_count),
+	     lli->lli_clob, PFID(&lli->lli_fid));
+	return 0;
+}
+
+/*
+ * coo_attr_get method: fill @attr from the VFS inode.
+ *
+ * Size, block count, owner and the three timestamps are copied from the
+ * inode.  KMS is not known at this layer and is left for the layers
+ * below.  Always returns 0.
+ */
+static int vvp_attr_get(const struct lu_env *env, struct cl_object *obj,
+			struct cl_attr *attr)
+{
+	struct inode *inode = ccc_object_inode(obj);
+
+	/*
+	 * lov overwrites most of these fields in
+	 * lov_attr_get()->...lov_merge_lvb_kms(), except when inode
+	 * attributes are newer.
+	 */
+
+	/* size and space usage */
+	attr->cat_size   = i_size_read(inode);
+	attr->cat_blocks = inode->i_blocks;
+
+	/* timestamps */
+	attr->cat_atime = LTIME_S(inode->i_atime);
+	attr->cat_mtime = LTIME_S(inode->i_mtime);
+	attr->cat_ctime = LTIME_S(inode->i_ctime);
+
+	/* ownership */
+	attr->cat_uid = inode->i_uid;
+	attr->cat_gid = inode->i_gid;
+
+	/* KMS is not known by this layer */
+	return 0; /* layers below have to fill in the rest */
+}
+
+/*
+ * coo_attr_set method: copy the attributes selected by @valid from
+ * @attr into the VFS inode.
+ *
+ * Owner (CAT_UID, CAT_GID) and timestamps (CAT_ATIME, CAT_MTIME,
+ * CAT_CTIME) are applied.  The size update and the mark_inode_dirty()
+ * call are compiled in but disabled by a constant-false condition.
+ * Always returns 0.
+ */
+static int vvp_attr_set(const struct lu_env *env, struct cl_object *obj,
+			const struct cl_attr *attr, unsigned valid)
+{
+	struct inode *inode = ccc_object_inode(obj);
+
+	/* ownership */
+	if (valid & CAT_UID) {
+		inode->i_uid = attr->cat_uid;
+	}
+	if (valid & CAT_GID) {
+		inode->i_gid = attr->cat_gid;
+	}
+
+	/* timestamps */
+	if (valid & CAT_ATIME) {
+		LTIME_S(inode->i_atime) = attr->cat_atime;
+	}
+	if (valid & CAT_MTIME) {
+		LTIME_S(inode->i_mtime) = attr->cat_mtime;
+	}
+	if (valid & CAT_CTIME) {
+		LTIME_S(inode->i_ctime) = attr->cat_ctime;
+	}
+
+	/* deliberately disabled: the size is not written from here */
+	if (0 && (valid & CAT_SIZE)) {
+		cl_isize_write_nolock(inode, attr->cat_size);
+	}
+	/* not currently necessary */
+	if (0 && (valid & (CAT_UID|CAT_GID|CAT_SIZE))) {
+		mark_inode_dirty(inode);
+	}
+	return 0;
+}
+
+/*
+ * coo_conf_set method: track layout changes in the llite inode info.
+ *
+ *  - OBJECT_CONF_INVALIDATE resets the layout generation to
+ *    LL_LAYOUT_GEN_NONE;
+ *  - OBJECT_CONF_SET with striping metadata records that the inode has
+ *    a layout and stores its generation; without metadata it records
+ *    that the inode has none and stores LL_LAYOUT_GEN_EMPTY;
+ *  - any other operation is ignored.
+ * Always returns 0.
+ */
+int vvp_conf_set(const struct lu_env *env, struct cl_object *obj,
+		const struct cl_object_conf *conf)
+{
+	struct ll_inode_info *lli = ll_i2info(conf->coc_inode);
+	bool has_layout;
+
+	switch (conf->coc_opc) {
+	case OBJECT_CONF_INVALIDATE:
+		lli->lli_layout_gen = LL_LAYOUT_GEN_NONE;
+		return 0;
+	case OBJECT_CONF_SET:
+		break;
+	default:
+		return 0;
+	}
+
+	has_layout = conf->u.coc_md != NULL && conf->u.coc_md->lsm != NULL;
+	if (!has_layout) {
+		CDEBUG(D_VFSTRACE, "layout lock destroyed: %u.\n",
+			lli->lli_layout_gen);
+
+		lli->lli_has_smd = false;
+		lli->lli_layout_gen = LL_LAYOUT_GEN_EMPTY;
+		return 0;
+	}
+
+	CDEBUG(D_VFSTRACE, "layout lock change: %u -> %u\n",
+		lli->lli_layout_gen,
+		conf->u.coc_md->lsm->lsm_layout_gen);
+
+	lli->lli_has_smd = true;
+	lli->lli_layout_gen = conf->u.coc_md->lsm->lsm_layout_gen;
+	return 0;
+}
+
+/*
+ * cl_object method table of the VVP layer: initialisers for the
+ * per-object page, lock and IO slices, then attribute/configuration
+ * access, then glimpse (the shared ccc implementation).
+ */
+static const struct cl_object_operations vvp_ops = {
+	/* slice initialisers */
+	.coo_page_init = vvp_page_init,
+	.coo_lock_init = vvp_lock_init,
+	.coo_io_init   = vvp_io_init,
+	/* attributes and configuration */
+	.coo_attr_get  = vvp_attr_get,
+	.coo_attr_set  = vvp_attr_set,
+	.coo_conf_set  = vvp_conf_set,
+	/* glimpse */
+	.coo_glimpse   = ccc_object_glimpse
+};
+
+/*
+ * lu_object method table of the VVP layer: init and free use the shared
+ * ccc implementations, printing is VVP specific.
+ */
+static const struct lu_object_operations vvp_lu_obj_ops = {
+	.loo_object_init  = ccc_object_init,
+	.loo_object_free  = ccc_object_free,
+	.loo_object_print = vvp_object_print
+};
+
+/*
+ * Map a VFS inode to its ccc_object.
+ *
+ * The inode's cl_object (lli_clob) must exist; the lu_object belonging
+ * to vvp_device_type is located in that object's header and converted
+ * with lu2ccc().  Both lookups are asserted to succeed.
+ */
+struct ccc_object *cl_inode2ccc(struct inode *inode)
+{
+	struct cl_object *obj = cl_i2info(inode)->lli_clob;
+	struct lu_object *lu;
+
+	LASSERT(obj != NULL);
+
+	lu = lu_object_locate(obj->co_lu.lo_header, &vvp_device_type);
+	LASSERT(lu != NULL);
+
+	return lu2ccc(lu);
+}
+
+/*
+ * Allocate a VVP object by delegating to ccc_object_alloc() with the
+ * VVP cl_object and lu_object method tables; returns its result.
+ */
+struct lu_object *vvp_object_alloc(const struct lu_env *env,
+				   const struct lu_object_header *hdr,
+				   struct lu_device *dev)
+{
+	return ccc_object_alloc(env, hdr, dev, &vvp_ops, &vvp_lu_obj_ops);
+}

diff --git a/drivers/staging/lustre/lustre/llite/vvp_page.c b/drivers/staging/lustre/lustre/llite/vvp_page.c
new file mode 100644
index 0000000..4568e69
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/vvp_page.c

@@ -0,0 +1,558 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_page for VVP layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ *   Author: Jinshan Xiong <jinshan.xiong@whamcloud.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+
+#include <obd.h>
+#include <lustre_lite.h>
+
+#include "vvp_internal.h"
+
+/*****************************************************************************
+ *
+ * Page operations.
+ *
+ */
+
+/*
+ * Releases the VM page reference held by \a cp (the one taken with
+ * page_cache_get() in vvp_page_init()).
+ */
+static void vvp_page_fini_common(struct ccc_page *cp)
+{
+	LASSERT(cp->cpg_page != NULL);
+	page_cache_release(cp->cpg_page);
+}
+
+/*
+ * cpo_fini method for cacheable pages: checks that the VM page no longer
+ * points back at the cl_page and drops the VM page reference.
+ */
+static void vvp_page_fini(const struct lu_env *env,
+			  struct cl_page_slice *slice)
+{
+	struct ccc_page *vpg = cl2ccc_page(slice);
+
+	/*
+	 * vmpage->private was already cleared when page was moved into
+	 * VPG_FREEING state.
+	 */
+	LASSERT((struct cl_page *)vpg->cpg_page->private != slice->cpl_page);
+
+	vvp_page_fini_common(vpg);
+}
+
+/*
+ * cpo_own method: takes the VM page lock and makes sure the page is not
+ * under writeback.
+ *
+ * When \a nonblock is set nothing sleeps: -EAGAIN is returned if the page
+ * lock cannot be taken immediately or the page is under writeback (the lock
+ * is dropped again in that case).  Otherwise the call blocks for both.
+ *
+ * \retval 0       page is locked and not under writeback
+ * \retval -EAGAIN non-blocking attempt failed
+ */
+static int vvp_page_own(const struct lu_env *env,
+			const struct cl_page_slice *slice, struct cl_io *io,
+			int nonblock)
+{
+	struct page *vmpage = cl2ccc_page(slice)->cpg_page;
+
+	LASSERT(vmpage != NULL);
+
+	if (!nonblock) {
+		lock_page(vmpage);
+		wait_on_page_writeback(vmpage);
+		return 0;
+	}
+
+	if (!trylock_page(vmpage))
+		return -EAGAIN;
+
+	if (likely(!PageWriteback(vmpage)))
+		return 0;
+
+	unlock_page(vmpage);
+	return -EAGAIN;
+}
+
+/*
+ * cpo_assume method: the caller already holds the VM page lock (asserted);
+ * only wait for any writeback in progress to finish.
+ */
+static void vvp_page_assume(const struct lu_env *env,
+			    const struct cl_page_slice *slice,
+			    struct cl_io *unused)
+{
+	struct page *pg = cl2vm_page(slice);
+
+	LASSERT(pg != NULL);
+	LASSERT(PageLocked(pg));
+
+	wait_on_page_writeback(pg);
+}
+
+/*
+ * cpo_unassume method: only checks that the VM page exists and is still
+ * locked; the lock itself is left untouched.
+ */
+static void vvp_page_unassume(const struct lu_env *env,
+			      const struct cl_page_slice *slice,
+			      struct cl_io *unused)
+{
+	struct page *pg = cl2vm_page(slice);
+
+	LASSERT(pg != NULL);
+	LASSERT(PageLocked(pg));
+}
+
+/*
+ * cpo_disown method: releases the VM page lock, which must be held on
+ * entry.
+ */
+static void vvp_page_disown(const struct lu_env *env,
+			    const struct cl_page_slice *slice, struct cl_io *io)
+{
+	struct page *pg = cl2vm_page(slice);
+
+	LASSERT(pg != NULL);
+	LASSERT(PageLocked(pg));
+
+	unlock_page(pg);
+}
+
+/*
+ * cpo_discard method: removes the locked VM page from its mapping.
+ *
+ * A page flagged cpg_defer_uptodate whose cpg_ra_used is still clear is
+ * accounted as RA_STAT_DISCARDED before it is truncated.
+ */
+static void vvp_page_discard(const struct lu_env *env,
+			     const struct cl_page_slice *slice,
+			     struct cl_io *unused)
+{
+	struct ccc_page *vpg = cl2ccc_page(slice);
+	struct page *pg = cl2vm_page(slice);
+	struct address_space *aspace;
+
+	LASSERT(pg != NULL);
+	LASSERT(PageLocked(pg));
+
+	aspace = pg->mapping;
+
+	if (!vpg->cpg_ra_used && vpg->cpg_defer_uptodate)
+		ll_ra_stats_inc(aspace, RA_STAT_DISCARDED);
+
+	/*
+	 * truncate_complete_page() calls
+	 * a_ops->invalidatepage()->cl_page_delete()->vvp_page_delete().
+	 */
+	truncate_complete_page(aspace, pg);
+}
+
+/*
+ * cpo_unmap method: tears down the user mappings that cover the locked VM
+ * page behind \a slice.
+ *
+ * \retval 0 always
+ */
+static int vvp_page_unmap(const struct lu_env *env,
+			  const struct cl_page_slice *slice,
+			  struct cl_io *unused)
+{
+	struct page *vmpage = cl2vm_page(slice);
+	__u64       offset;
+
+	LASSERT(vmpage != NULL);
+	LASSERT(PageLocked(vmpage));
+
+	/*
+	 * Widen the index before shifting: page->index is an unsigned long,
+	 * so on 32-bit builds the shift would otherwise be done in 32 bits
+	 * and wrap for file offsets at or beyond 4GB.
+	 */
+	offset = (__u64)vmpage->index << PAGE_CACHE_SHIFT;
+
+	/*
+	 * XXX is it safe to call this with the page lock held?
+	 */
+	ll_teardown_mmaps(vmpage->mapping, offset, offset + PAGE_CACHE_SIZE);
+	return 0;
+}
+
+/*
+ * cpo_delete method: severs the VM page -> cl_page link.
+ *
+ * The page must be locked, must still point at this cl_page and must belong
+ * to the inode of the slice's object (all asserted).  The page is taken off
+ * the write-pending accounting before its private pointer is cleared.
+ */
+static void vvp_page_delete(const struct lu_env *env,
+			    const struct cl_page_slice *slice)
+{
+	struct page *pg = cl2vm_page(slice);
+	struct inode *host = pg->mapping->host;
+	struct cl_object *clob = slice->cpl_obj;
+
+	LASSERT(PageLocked(pg));
+	LASSERT((struct cl_page *)pg->private == slice->cpl_page);
+	LASSERT(host == ccc_object_inode(clob));
+
+	vvp_write_complete(cl2ccc(clob), cl2ccc_page(slice));
+
+	/*
+	 * Reference from vmpage to cl_page is removed, but the reference back
+	 * is still here. It is removed later in vvp_page_fini().
+	 */
+	ClearPagePrivate(pg);
+	pg->private = 0;
+}
+
+/*
+ * cpo_export method: sets the uptodate flag of the locked VM page when
+ * \a uptodate is non-zero and clears it otherwise.
+ */
+static void vvp_page_export(const struct lu_env *env,
+			    const struct cl_page_slice *slice,
+			    int uptodate)
+{
+	struct page *pg = cl2vm_page(slice);
+
+	LASSERT(pg != NULL);
+	LASSERT(PageLocked(pg));
+
+	if (!uptodate) {
+		ClearPageUptodate(pg);
+		return;
+	}
+	SetPageUptodate(pg);
+}
+
+/*
+ * cpo_is_vmlocked method.
+ *
+ * \retval -EBUSY   the VM page is locked
+ * \retval -ENODATA the VM page is not locked
+ */
+static int vvp_page_is_vmlocked(const struct lu_env *env,
+				const struct cl_page_slice *slice)
+{
+	if (PageLocked(cl2vm_page(slice)))
+		return -EBUSY;
+
+	return -ENODATA;
+}
+
+/*
+ * cpo_prep method for reads.
+ *
+ * \retval 0         page needs to be read
+ * \retval -EALREADY page is already marked PG_uptodate; skip it
+ */
+static int vvp_page_prep_read(const struct lu_env *env,
+			      const struct cl_page_slice *slice,
+			      struct cl_io *unused)
+{
+	int rc = 0;
+	ENTRY;
+
+	if (PageUptodate(cl2vm_page(slice)))
+		rc = -EALREADY;
+
+	RETURN(rc);
+}
+
+/*
+ * cpo_prep method for writes: flags the locked, clean VM page as under
+ * writeback and adds it to the object's write-pending accounting.
+ *
+ * \retval 0 always
+ */
+static int vvp_page_prep_write(const struct lu_env *env,
+			       const struct cl_page_slice *slice,
+			       struct cl_io *unused)
+{
+	struct page *pg = cl2vm_page(slice);
+
+	LASSERT(PageLocked(pg));
+	LASSERT(!PageDirty(pg));
+
+	set_page_writeback(pg);
+	vvp_write_pending(cl2ccc(slice->cpl_obj), cl2ccc_page(slice));
+
+	return 0;
+}
+
+/**
+ * Records the outcome of a page transfer at the VM level.
+ *
+ * On success the page error flag and the object's "discard warned" latch
+ * are cleared.  On failure the page is flagged in error, the mapping gets
+ * AS_ENOSPC (for -ENOSPC) or AS_EIO (anything else), and for -ESHUTDOWN or
+ * -EINTR a discard warning is issued once until the next success.
+ *
+ * The inode is passed separately because the inode on which the error is to
+ * be set can differ from the inode of \a vmpage in case of direct-io.
+ */
+static void vvp_vmpage_error(struct inode *inode, struct page *vmpage, int ioret)
+{
+	struct ccc_object *cob = cl_inode2ccc(inode);
+
+	if (ioret == 0) {
+		ClearPageError(vmpage);
+		cob->cob_discard_page_warned = 0;
+		return;
+	}
+
+	SetPageError(vmpage);
+	if (ioret != -ENOSPC)
+		set_bit(AS_EIO, &inode->i_mapping->flags);
+	else
+		set_bit(AS_ENOSPC, &inode->i_mapping->flags);
+
+	if (ioret != -ESHUTDOWN && ioret != -EINTR)
+		return;
+
+	if (cob->cob_discard_page_warned == 0) {
+		cob->cob_discard_page_warned = 1;
+		ll_dirty_page_discard_warn(vmpage, ioret);
+	}
+}
+
+/*
+ * cpo_completion method for reads.
+ *
+ * Returns one unit of read-ahead budget for a cpg_defer_uptodate page,
+ * exports the page as uptodate after a successful read that was not
+ * deferred, drops the deferred flag after a failed read, and unlocks the VM
+ * page unless the page has a cp_sync_io attached.
+ */
+static void vvp_page_completion_read(const struct lu_env *env,
+				     const struct cl_page_slice *slice,
+				     int ioret)
+{
+	struct ccc_page *vpg = cl2ccc_page(slice);
+	struct page *vmpage = vpg->cpg_page;
+	struct cl_page *top = cl_page_top(slice->cpl_page);
+	struct inode *inode = ccc_object_inode(top->cp_obj);
+	ENTRY;
+
+	LASSERT(PageLocked(vmpage));
+	CL_PAGE_HEADER(D_PAGE, env, top, "completing READ with %d\n", ioret);
+
+	if (vpg->cpg_defer_uptodate)
+		ll_ra_count_put(ll_i2sbi(inode), 1);
+
+	if (ioret != 0)
+		vpg->cpg_defer_uptodate = 0;
+	else if (!vpg->cpg_defer_uptodate)
+		cl_page_export(env, top, 1);
+
+	if (top->cp_sync_io == NULL)
+		unlock_page(vmpage);
+
+	EXIT;
+}
+
+/*
+ * cpo_completion method for writes: clears the queued flag, removes the
+ * page from the write-pending accounting, records the result on the VM page
+ * for asynchronous writes and ends writeback.
+ */
+static void vvp_page_completion_write(const struct lu_env *env,
+				      const struct cl_page_slice *slice,
+				      int ioret)
+{
+	struct ccc_page *vpg = cl2ccc_page(slice);
+	struct cl_page *clpage = slice->cpl_page;
+	struct page *vmpage = vpg->cpg_page;
+	ENTRY;
+
+	LASSERT(ergo(clpage->cp_sync_io != NULL, PageLocked(vmpage)));
+	LASSERT(PageWriteback(vmpage));
+
+	CL_PAGE_HEADER(D_PAGE, env, clpage, "completing WRITE with %d\n", ioret);
+
+	/*
+	 * TODO: on a recoverable error (-ENOMEM, etc.) it would make sense to
+	 * put the page back on the oap pending list instead of taking it off
+	 * the SoM write pending list.  To implement this, return a non-zero
+	 * value from the ->cpo_completion method; the underlying transfer
+	 * should then be notified and re-add the page to the pending
+	 * transfer queue.  -jay
+	 */
+
+	vpg->cpg_write_queued = 0;
+	vvp_write_complete(cl2ccc(slice->cpl_obj), vpg);
+
+	/*
+	 * Mark the page in error only for an async write, because
+	 * applications won't wait for that IO to finish.
+	 */
+	if (clpage->cp_sync_io == NULL)
+		vvp_vmpage_error(ccc_object_inode(clpage->cp_obj), vmpage,
+				 ioret);
+
+	end_page_writeback(vmpage);
+	EXIT;
+}
+
+/**
+ * Implements cl_page_operations::cpo_make_ready() method.
+ *
+ * This is called to yank a page from the transfer cache and to send it out
+ * as a part of transfer.  The VM page is locked for the duration of the
+ * call.  A dirty page has its dirty bit cleared, is flagged as under
+ * writeback and is added to the write-pending accounting.  A page that is
+ * not dirty must already be in CPS_PAGEOUT state; any other state is a bug.
+ *
+ * \retval 0         success, page can be placed into transfer
+ *
+ * \retval -EALREADY page was not dirty and is already in CPS_PAGEOUT state,
+ *                   i.e. it has been made ready before.  Skip it.
+ */
+static int vvp_page_make_ready(const struct lu_env *env,
+			       const struct cl_page_slice *slice)
+{
+	struct page *vmpage = cl2vm_page(slice);
+	struct cl_page *pg = slice->cpl_page;
+	int result = 0;
+	/* Pair the RETURN() below, as the other traced functions here do. */
+	ENTRY;
+
+	lock_page(vmpage);
+	if (clear_page_dirty_for_io(vmpage)) {
+		LASSERT(pg->cp_state == CPS_CACHED);
+		/* This actually clears the dirty bit in the radix
+		 * tree. */
+		set_page_writeback(vmpage);
+		vvp_write_pending(cl2ccc(slice->cpl_obj),
+				cl2ccc_page(slice));
+		CL_PAGE_HEADER(D_PAGE, env, pg, "readied\n");
+	} else if (pg->cp_state == CPS_PAGEOUT) {
+		/* is it possible for osc_flush_async_page() to already
+		 * make it ready? */
+		result = -EALREADY;
+	} else {
+		CL_PAGE_DEBUG(D_ERROR, env, pg, "Unexpecting page state %d.\n",
+			      pg->cp_state);
+		LBUG();
+	}
+	unlock_page(vmpage);
+	RETURN(result);
+}
+
+/*
+ * cpo_print method: emits the ccc_page flags and, when a VM page is
+ * attached, its flags, reference and map counts, private word, index and
+ * whether it is on an LRU list.
+ *
+ * \retval 0 always
+ */
+static int vvp_page_print(const struct lu_env *env,
+			  const struct cl_page_slice *slice,
+			  void *cookie, lu_printer_t printer)
+{
+	struct ccc_page *vpg = cl2ccc_page(slice);
+	struct page *pg = vpg->cpg_page;
+
+	(*printer)(env, cookie,
+		   LUSTRE_VVP_NAME"-page@%p(%d:%d:%d) vm@%p ",
+		   vpg, vpg->cpg_defer_uptodate, vpg->cpg_ra_used,
+		   vpg->cpg_write_queued, pg);
+
+	if (pg != NULL)
+		(*printer)(env, cookie, "%lx %d:%d %lx %lu %slru",
+			   (long)pg->flags, page_count(pg),
+			   page_mapcount(pg), pg->private,
+			   page_index(pg),
+			   list_empty(&pg->lru) ? "not-" : "");
+
+	(*printer)(env, cookie, "\n");
+	return 0;
+}
+
+/* Page methods used for CPT_CACHEABLE pages (see vvp_page_init()). */
+static const struct cl_page_operations vvp_page_ops = {
+	.cpo_own           = vvp_page_own,
+	.cpo_assume        = vvp_page_assume,
+	.cpo_unassume      = vvp_page_unassume,
+	.cpo_disown        = vvp_page_disown,
+	.cpo_vmpage        = ccc_page_vmpage,
+	.cpo_discard       = vvp_page_discard,
+	.cpo_delete        = vvp_page_delete,
+	.cpo_unmap         = vvp_page_unmap,
+	.cpo_export        = vvp_page_export,
+	.cpo_is_vmlocked   = vvp_page_is_vmlocked,
+	.cpo_fini          = vvp_page_fini,
+	.cpo_print         = vvp_page_print,
+	.cpo_is_under_lock = ccc_page_is_under_lock,
+	.io = {
+		[CRT_READ] = {
+			.cpo_prep       = vvp_page_prep_read,
+			.cpo_completion = vvp_page_completion_read,
+			.cpo_make_ready = ccc_fail,
+		},
+		[CRT_WRITE] = {
+			.cpo_prep       = vvp_page_prep_write,
+			.cpo_completion = vvp_page_completion_write,
+			.cpo_make_ready = vvp_page_make_ready,
+		},
+	},
+};
+
+/*
+ * Asserts that the i_mutex of the inode behind \a page is currently held:
+ * a trylock on it must fail.
+ */
+static void vvp_transient_page_verify(const struct cl_page *page)
+{
+	LASSERT(!mutex_trylock(&ccc_object_inode(page->cp_obj)->i_mutex));
+}
+
+/*
+ * cpo_own method for transient pages: no page lock is taken, only the
+ * i_mutex invariant is checked.
+ *
+ * \retval 0 always
+ */
+static int vvp_transient_page_own(const struct lu_env *env,
+				  const struct cl_page_slice *slice,
+				  struct cl_io *unused, int nonblock)
+{
+	const struct cl_page *clpage = slice->cpl_page;
+
+	vvp_transient_page_verify(clpage);
+	return 0;
+}
+
+/* cpo_assume method for transient pages: checks the i_mutex invariant only. */
+static void vvp_transient_page_assume(const struct lu_env *env,
+				      const struct cl_page_slice *slice,
+				      struct cl_io *unused)
+{
+	const struct cl_page *clpage = slice->cpl_page;
+
+	vvp_transient_page_verify(clpage);
+}
+
+/* cpo_unassume method for transient pages: checks the i_mutex invariant only. */
+static void vvp_transient_page_unassume(const struct lu_env *env,
+					const struct cl_page_slice *slice,
+					struct cl_io *unused)
+{
+	const struct cl_page *clpage = slice->cpl_page;
+
+	vvp_transient_page_verify(clpage);
+}
+
+/* cpo_disown method for transient pages: checks the i_mutex invariant only. */
+static void vvp_transient_page_disown(const struct lu_env *env,
+				      const struct cl_page_slice *slice,
+				      struct cl_io *unused)
+{
+	const struct cl_page *clpage = slice->cpl_page;
+
+	vvp_transient_page_verify(clpage);
+}
+
+/*
+ * cpo_discard method for transient pages: after checking the i_mutex
+ * invariant, deletes the cl_page.
+ */
+static void vvp_transient_page_discard(const struct lu_env *env,
+				       const struct cl_page_slice *slice,
+				       struct cl_io *unused)
+{
+	struct cl_page *clpage = slice->cpl_page;
+
+	vvp_transient_page_verify(clpage);
+
+	/*
+	 * For transient pages, remove it from the radix tree.
+	 */
+	cl_page_delete(env, clpage);
+}
+
+/*
+ * cpo_is_vmlocked method for transient pages: probes the inode's i_mutex
+ * with a trylock, releasing it again if the probe acquired it.
+ *
+ * \retval -EBUSY   i_mutex is held
+ * \retval -ENODATA i_mutex was free
+ */
+static int vvp_transient_page_is_vmlocked(const struct lu_env *env,
+					  const struct cl_page_slice *slice)
+{
+	struct inode *inode = ccc_object_inode(slice->cpl_obj);
+
+	if (mutex_trylock(&inode->i_mutex)) {
+		mutex_unlock(&inode->i_mutex);
+		return -ENODATA;
+	}
+
+	return -EBUSY;
+}
+
+/*
+ * cpo_completion method for transient pages (reads and writes): checks the
+ * i_mutex invariant only; \a ioret is not used.
+ */
+static void
+vvp_transient_page_completion(const struct lu_env *env,
+			      const struct cl_page_slice *slice,
+			      int ioret)
+{
+	const struct cl_page *clpage = slice->cpl_page;
+
+	vvp_transient_page_verify(clpage);
+}
+
+/*
+ * cpo_fini method for transient pages: drops the VM page reference and,
+ * with i_mutex asserted held, decrements the object's transient page count.
+ */
+static void vvp_transient_page_fini(const struct lu_env *env,
+				    struct cl_page_slice *slice)
+{
+	struct ccc_page *vpg = cl2ccc_page(slice);
+	struct ccc_object *cob = cl2ccc(slice->cpl_page->cp_obj);
+
+	vvp_page_fini_common(vpg);
+
+	LASSERT(!mutex_trylock(&cob->cob_inode->i_mutex));
+	cob->cob_transient_pages--;
+}
+
+/* Page methods used for non-cacheable pages (see vvp_page_init()). */
+static const struct cl_page_operations vvp_transient_page_ops = {
+	.cpo_own           = vvp_transient_page_own,
+	.cpo_assume        = vvp_transient_page_assume,
+	.cpo_unassume      = vvp_transient_page_unassume,
+	.cpo_disown        = vvp_transient_page_disown,
+	.cpo_discard       = vvp_transient_page_discard,
+	.cpo_vmpage        = ccc_page_vmpage,
+	.cpo_fini          = vvp_transient_page_fini,
+	.cpo_is_vmlocked   = vvp_transient_page_is_vmlocked,
+	.cpo_print         = vvp_page_print,
+	.cpo_is_under_lock = ccc_page_is_under_lock,
+	.io = {
+		[CRT_READ] = {
+			.cpo_prep       = ccc_transient_page_prep,
+			.cpo_completion = vvp_transient_page_completion,
+		},
+		[CRT_WRITE] = {
+			.cpo_prep       = ccc_transient_page_prep,
+			.cpo_completion = vvp_transient_page_completion,
+		},
+	},
+};
+
+/**
+ * Initialises the VVP slice of \a page and binds it to \a vmpage.
+ *
+ * A reference is taken on \a vmpage.  For a CPT_CACHEABLE page the VM page's
+ * private word is pointed at \a page and the cacheable method table is
+ * installed.  For any other page type the inode's i_mutex must be held
+ * (asserted), the transient method table is installed and the object's
+ * transient page count is incremented.
+ *
+ * \retval 0 always
+ */
+int vvp_page_init(const struct lu_env *env, struct cl_object *obj,
+		struct cl_page *page, struct page *vmpage)
+{
+	struct ccc_page *vpg = cl_object_page_slice(obj, page);
+
+	CLOBINVRNT(env, obj, ccc_object_invariant(obj));
+
+	vpg->cpg_page = vmpage;
+	page_cache_get(vmpage);
+
+	INIT_LIST_HEAD(&vpg->cpg_pending_linkage);
+
+	if (page->cp_type != CPT_CACHEABLE) {
+		struct ccc_object *cob = cl2ccc(obj);
+
+		LASSERT(!mutex_trylock(&cob->cob_inode->i_mutex));
+		cl_page_slice_add(page, &vpg->cpg_cl, obj,
+				&vvp_transient_page_ops);
+		cob->cob_transient_pages++;
+	} else {
+		SetPagePrivate(vmpage);
+		vmpage->private = (unsigned long)page;
+		cl_page_slice_add(page, &vpg->cpg_cl, obj,
+				&vvp_page_ops);
+	}
+
+	return 0;
+}

diff --git a/drivers/staging/lustre/lustre/llite/xattr.c b/drivers/staging/lustre/lustre/llite/xattr.c
new file mode 100644
index 0000000..4176264
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/xattr.c

@@ -0,0 +1,578 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/selinux.h>
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <obd_support.h>
+#include <lustre_lite.h>
+#include <lustre_dlm.h>
+#include <lustre_ver.h>
+#include <lustre_eacl.h>
+
+#include "llite_internal.h"
+
+/* Extended attribute classes returned by get_xattr_type(). */
+#define XATTR_USER_T		(1)
+#define XATTR_TRUSTED_T		(2)
+#define XATTR_SECURITY_T	(3)
+#define XATTR_ACL_ACCESS_T	(4)
+#define XATTR_ACL_DEFAULT_T	(5)
+#define XATTR_LUSTRE_T		(6)
+#define XATTR_OTHER_T		(7)
+
+/*
+ * Classifies an extended attribute name into one of the XATTR_*_T values.
+ *
+ * The two POSIX ACL names are matched exactly first; after that the name is
+ * matched by prefix in the order user, trusted, security, lustre.  Anything
+ * else is XATTR_OTHER_T.
+ */
+static
+int get_xattr_type(const char *name)
+{
+	static const struct {
+		const char	*prefix;
+		size_t		 len;
+		int		 type;
+	} classes[] = {
+		{ XATTR_USER_PREFIX,
+		  sizeof(XATTR_USER_PREFIX) - 1,	XATTR_USER_T },
+		{ XATTR_TRUSTED_PREFIX,
+		  sizeof(XATTR_TRUSTED_PREFIX) - 1,	XATTR_TRUSTED_T },
+		{ XATTR_SECURITY_PREFIX,
+		  sizeof(XATTR_SECURITY_PREFIX) - 1,	XATTR_SECURITY_T },
+		{ XATTR_LUSTRE_PREFIX,
+		  sizeof(XATTR_LUSTRE_PREFIX) - 1,	XATTR_LUSTRE_T },
+	};
+	size_t i;
+
+	if (strcmp(name, POSIX_ACL_XATTR_ACCESS) == 0)
+		return XATTR_ACL_ACCESS_T;
+
+	if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0)
+		return XATTR_ACL_DEFAULT_T;
+
+	for (i = 0; i < sizeof(classes) / sizeof(classes[0]); i++) {
+		if (strncmp(name, classes[i].prefix, classes[i].len) == 0)
+			return classes[i].type;
+	}
+
+	return XATTR_OTHER_T;
+}
+
+/*
+ * Decides whether an xattr class may be used on this mount.
+ *
+ * \retval 0           allowed
+ * \retval -EOPNOTSUPP ACL class without LL_SBI_ACL, user class without
+ *                     LL_SBI_USER_XATTR, or XATTR_OTHER_T
+ * \retval -EPERM      trusted class without CFS_CAP_SYS_ADMIN
+ */
+static
+int xattr_type_filter(struct ll_sb_info *sbi, int xattr_type)
+{
+	switch (xattr_type) {
+	case XATTR_ACL_ACCESS_T:
+	case XATTR_ACL_DEFAULT_T:
+		if (!(sbi->ll_flags & LL_SBI_ACL))
+			return -EOPNOTSUPP;
+		break;
+
+	case XATTR_USER_T:
+		if (!(sbi->ll_flags & LL_SBI_USER_XATTR))
+			return -EOPNOTSUPP;
+		break;
+
+	case XATTR_TRUSTED_T:
+		if (!cfs_capable(CFS_CAP_SYS_ADMIN))
+			return -EPERM;
+		break;
+
+	case XATTR_OTHER_T:
+		return -EOPNOTSUPP;
+
+	default:
+		break;
+	}
+
+	return 0;
+}
+
+/*
+ * Common back end for setting and removing an extended attribute.
+ *
+ * The name is classified and filtered first.  "trusted.lov", "lustre.lov"
+ * and "security.capability" are silently accepted without an RPC, and
+ * "security.selinux" is refused when SELinux is disabled.  For remote
+ * clients ACL attributes are converted or filtered before being sent.  The
+ * remaining work is a single md_setxattr() call.
+ *
+ * \param valid  OBD_MD_FLXATTR to set, OBD_MD_FLXATTRRM to remove
+ *
+ * \retval 0   success (or attribute deliberately ignored)
+ * \retval -ve error from the filter, the ACL conversion or the RPC
+ */
+static
+int ll_setxattr_common(struct inode *inode, const char *name,
+		       const void *value, size_t size,
+		       int flags, __u64 valid)
+{
+	struct ll_sb_info *sbi = ll_i2sbi(inode);
+	struct ptlrpc_request *req;
+	int xattr_type, rc;
+	struct obd_capa *oc;
+	posix_acl_xattr_header *new_value = NULL;
+	struct rmtacl_ctl_entry *rce = NULL;
+	ext_acl_xattr_header *acl = NULL;
+	const char *pv = value;
+	ENTRY;
+
+	xattr_type = get_xattr_type(name);
+	rc = xattr_type_filter(sbi, xattr_type);
+	if (rc)
+		RETURN(rc);
+
+	/* b10667: ignore lustre special xattr for now */
+	if ((xattr_type == XATTR_TRUSTED_T && strcmp(name, "trusted.lov") == 0) ||
+	    (xattr_type == XATTR_LUSTRE_T && strcmp(name, "lustre.lov") == 0))
+		RETURN(0);
+
+	/* b15587: ignore security.capability xattr for now */
+	if ((xattr_type == XATTR_SECURITY_T &&
+	    strcmp(name, "security.capability") == 0))
+		RETURN(0);
+
+	/* LU-549:  Disable security.selinux when selinux is disabled */
+	if (xattr_type == XATTR_SECURITY_T && !selinux_is_enabled() &&
+	    strcmp(name, "security.selinux") == 0)
+		RETURN(-EOPNOTSUPP);
+
+#ifdef CONFIG_FS_POSIX_ACL
+	if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
+	    (xattr_type == XATTR_ACL_ACCESS_T ||
+	    xattr_type == XATTR_ACL_DEFAULT_T)) {
+		rce = rct_search(&sbi->ll_rct, current_pid());
+		if (rce == NULL ||
+		    (rce->rce_ops != RMT_LSETFACL &&
+		    rce->rce_ops != RMT_RSETFACL))
+			RETURN(-EOPNOTSUPP);
+
+		if (rce->rce_ops == RMT_LSETFACL) {
+			struct eacl_entry *ee;
+
+			ee = et_search_del(&sbi->ll_et, current_pid(),
+					   ll_inode2fid(inode), xattr_type);
+			LASSERT(ee != NULL);
+			if (valid & OBD_MD_FLXATTR) {
+				acl = lustre_acl_xattr_merge2ext(
+						(posix_acl_xattr_header *)value,
+						size, ee->ee_acl);
+				if (IS_ERR(acl)) {
+					ee_free(ee);
+					RETURN(PTR_ERR(acl));
+				}
+				size =  CFS_ACL_XATTR_SIZE(
+						le32_to_cpu(acl->a_count),
+						ext_acl_xattr);
+				pv = (const char *)acl;
+			}
+			ee_free(ee);
+		} else if (rce->rce_ops == RMT_RSETFACL) {
+			/*
+			 * Keep the result in a signed variable: "size" is a
+			 * size_t, so testing it for "< 0" can never be true
+			 * and a filter error would be sent on as a length.
+			 */
+			rc = lustre_posix_acl_xattr_filter(
+						(posix_acl_xattr_header *)value,
+						size, &new_value);
+			if (unlikely(rc < 0))
+				RETURN(rc);
+			size = rc;
+
+			pv = (const char *)new_value;
+		} else
+			RETURN(-EOPNOTSUPP);
+
+		valid |= rce_ops2valid(rce->rce_ops);
+	}
+#endif
+	oc = ll_mdscapa_get(inode);
+	rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
+			 valid, name, pv, size, 0, flags, ll_i2suppgid(inode),
+			 &req);
+	capa_put(oc);
+#ifdef CONFIG_FS_POSIX_ACL
+	if (new_value != NULL)
+		lustre_posix_acl_xattr_free(new_value, size);
+	if (acl != NULL)
+		lustre_ext_acl_xattr_free(acl);
+#endif
+	if (rc) {
+		if (rc == -EOPNOTSUPP && xattr_type == XATTR_USER_T) {
+			LCONSOLE_INFO("Disabling user_xattr feature because "
+				      "it is not supported on the server\n");
+			sbi->ll_flags &= ~LL_SBI_USER_XATTR;
+		}
+		RETURN(rc);
+	}
+
+	ptlrpc_req_finished(req);
+	RETURN(0);
+}
+
+/**
+ * VFS setxattr entry point.
+ *
+ * "trusted.lov" and "lustre.lov" are handled here as a striping request:
+ * regular files go through ll_lov_setstripe_ea_info() (whose result is
+ * deliberately ignored) and directories through ll_dir_setstripe().  The LMA
+ * and LINK attributes are silently accepted.  Everything else is passed to
+ * ll_setxattr_common().
+ *
+ * \retval 0       success
+ * \retval -EINVAL lov value supplied but shorter than struct lov_user_md
+ * \retval -ve     other error from the striping or xattr back end
+ */
+int ll_setxattr(struct dentry *dentry, const char *name,
+		const void *value, size_t size, int flags)
+{
+	struct inode *inode = dentry->d_inode;
+
+	LASSERT(inode);
+	LASSERT(name);
+
+	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), xattr %s\n",
+	       inode->i_ino, inode->i_generation, inode, name);
+
+	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_SETXATTR, 1);
+
+	if ((strncmp(name, XATTR_TRUSTED_PREFIX,
+		     sizeof(XATTR_TRUSTED_PREFIX) - 1) == 0 &&
+	     strcmp(name + sizeof(XATTR_TRUSTED_PREFIX) - 1, "lov") == 0) ||
+	    (strncmp(name, XATTR_LUSTRE_PREFIX,
+		     sizeof(XATTR_LUSTRE_PREFIX) - 1) == 0 &&
+	     strcmp(name + sizeof(XATTR_LUSTRE_PREFIX) - 1, "lov") == 0)) {
+		struct lov_user_md *lump = (struct lov_user_md *)value;
+		int rc = 0;
+
+		/*
+		 * The fields read below live inside struct lov_user_md, so
+		 * refuse a value too short to contain one instead of reading
+		 * past the end of the caller's buffer.
+		 */
+		if (lump != NULL && size < sizeof(*lump))
+			return -EINVAL;
+
+		/* Attributes that are saved via getxattr will always have
+		 * the stripe_offset as 0.  Instead, the MDS should be
+		 * allowed to pick the starting OST index.   b=17846 */
+		if (lump != NULL && lump->lmm_stripe_offset == 0)
+			lump->lmm_stripe_offset = -1;
+
+		if (lump != NULL && S_ISREG(inode->i_mode)) {
+			struct file f;
+			/* named fmode so it does not shadow "flags" */
+			int fmode = FMODE_WRITE;
+			int lum_size = (lump->lmm_magic == LOV_USER_MAGIC_V1) ?
+				sizeof(*lump) : sizeof(struct lov_user_md_v3);
+
+			f.f_dentry = dentry;
+			rc = ll_lov_setstripe_ea_info(inode, &f, fmode, lump,
+						      lum_size);
+			/* b10667: rc always be 0 here for now */
+			rc = 0;
+		} else if (S_ISDIR(inode->i_mode)) {
+			rc = ll_dir_setstripe(inode, lump, 0);
+		}
+
+		return rc;
+
+	} else if (strcmp(name, XATTR_NAME_LMA) == 0 ||
+		   strcmp(name, XATTR_NAME_LINK) == 0)
+		return 0;
+
+	return ll_setxattr_common(inode, name, value, size, flags,
+				  OBD_MD_FLXATTR);
+}
+
+/**
+ * VFS removexattr entry point: accounts the operation and issues a
+ * ll_setxattr_common() call with no value and OBD_MD_FLXATTRRM.
+ */
+int ll_removexattr(struct dentry *dentry, const char *name)
+{
+	struct inode *inode = dentry->d_inode;
+	int rc;
+
+	LASSERT(inode);
+	LASSERT(name);
+
+	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), xattr %s\n",
+	       inode->i_ino, inode->i_generation, inode, name);
+
+	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_REMOVEXATTR, 1);
+
+	rc = ll_setxattr_common(inode, name, NULL, 0, 0, OBD_MD_FLXATTRRM);
+	return rc;
+}
+
+/*
+ * Common back end for getxattr and listxattr.
+ *
+ * A NULL \a name skips the per-name filtering and goes straight to the RPC
+ * (this is how ll_listxattr() calls it).  For a named attribute the class
+ * filter is applied, "security.capability" is reported as absent,
+ * "security.selinux" is refused when SELinux is disabled, and on non-remote
+ * clients the access ACL is answered from the copy cached in the inode.
+ *
+ * \param size  size of \a buffer; 0 asks only for the attribute size
+ *
+ * \retval >= 0     number of bytes stored in \a buffer, or the size
+ *                  reported by the server when \a size is 0
+ * \retval -ENODATA attribute absent or empty
+ * \retval -ERANGE  server reply larger than \a size
+ * \retval -ve      other error from the filter or the RPC
+ */
+static
+int ll_getxattr_common(struct inode *inode, const char *name,
+		       void *buffer, size_t size, __u64 valid)
+{
+	struct ll_sb_info *sbi = ll_i2sbi(inode);
+	struct ptlrpc_request *req = NULL;
+	struct mdt_body *body;
+	int xattr_type, rc;
+	void *xdata;
+	struct obd_capa *oc;
+	struct rmtacl_ctl_entry *rce = NULL;
+	ENTRY;
+
+	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n",
+	       inode->i_ino, inode->i_generation, inode);
+
+	/* listxattr behaves slightly differently from ext3:
+	 * without 'user_xattr' ext3 will list all xattr names but
+	 * filter out "^user..*"; we list them all for simplicity.
+	 */
+	if (!name) {
+		xattr_type = XATTR_OTHER_T;
+		goto do_getxattr;
+	}
+
+	xattr_type = get_xattr_type(name);
+	rc = xattr_type_filter(sbi, xattr_type);
+	if (rc)
+		RETURN(rc);
+
+	/* b15587: ignore security.capability xattr for now */
+	if ((xattr_type == XATTR_SECURITY_T &&
+	    strcmp(name, "security.capability") == 0))
+		RETURN(-ENODATA);
+
+	/* LU-549:  Disable security.selinux when selinux is disabled */
+	if (xattr_type == XATTR_SECURITY_T && !selinux_is_enabled() &&
+	    strcmp(name, "security.selinux") == 0)
+		RETURN(-EOPNOTSUPP);
+
+#ifdef CONFIG_FS_POSIX_ACL
+	if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
+	    (xattr_type == XATTR_ACL_ACCESS_T ||
+	    xattr_type == XATTR_ACL_DEFAULT_T)) {
+		rce = rct_search(&sbi->ll_rct, current_pid());
+		if (rce == NULL ||
+		    (rce->rce_ops != RMT_LSETFACL &&
+		    rce->rce_ops != RMT_LGETFACL &&
+		    rce->rce_ops != RMT_RSETFACL &&
+		    rce->rce_ops != RMT_RGETFACL))
+			RETURN(-EOPNOTSUPP);
+	}
+
+	/* posix acl is under protection of LOOKUP lock. when calling to this,
+	 * we just have path resolution to the target inode, so we have great
+	 * chance that cached ACL is uptodate.
+	 */
+	if (xattr_type == XATTR_ACL_ACCESS_T &&
+	    !(sbi->ll_flags & LL_SBI_RMT_CLIENT)) {
+		struct ll_inode_info *lli = ll_i2info(inode);
+		struct posix_acl *acl;
+
+		spin_lock(&lli->lli_lock);
+		acl = posix_acl_dup(lli->lli_posix_acl);
+		spin_unlock(&lli->lli_lock);
+
+		if (!acl)
+			RETURN(-ENODATA);
+
+		rc = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
+		posix_acl_release(acl);
+		RETURN(rc);
+	}
+	if (xattr_type == XATTR_ACL_DEFAULT_T && !S_ISDIR(inode->i_mode))
+		RETURN(-ENODATA);
+#endif
+
+do_getxattr:
+	oc = ll_mdscapa_get(inode);
+	rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
+			 valid | (rce ? rce_ops2valid(rce->rce_ops) : 0),
+			 name, NULL, 0, size, 0, &req);
+	capa_put(oc);
+	if (rc) {
+		if (rc == -EOPNOTSUPP && xattr_type == XATTR_USER_T) {
+			LCONSOLE_INFO("Disabling user_xattr feature because "
+				      "it is not supported on the server\n");
+			sbi->ll_flags &= ~LL_SBI_USER_XATTR;
+		}
+		RETURN(rc);
+	}
+
+	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+	LASSERT(body);
+
+	/* only detect the xattr size */
+	if (size == 0)
+		GOTO(out, rc = body->eadatasize);
+
+	if (size < body->eadatasize) {
+		CERROR("server bug: replied size %u > %u\n",
+		       body->eadatasize, (unsigned int)size);
+		GOTO(out, rc = -ERANGE);
+	}
+
+	if (body->eadatasize == 0)
+		GOTO(out, rc = -ENODATA);
+
+	/* do not need swab xattr data */
+	xdata = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA,
+					     body->eadatasize);
+	if (!xdata)
+		GOTO(out, rc = -EFAULT);
+
+#ifdef CONFIG_FS_POSIX_ACL
+	/* eadatasize is known to be non-zero here, so only the op matters */
+	if (rce && rce->rce_ops == RMT_LSETFACL) {
+		ext_acl_xattr_header *acl;
+
+		acl = lustre_posix_acl_xattr_2ext((posix_acl_xattr_header *)xdata,
+						  body->eadatasize);
+		if (IS_ERR(acl))
+			GOTO(out, rc = PTR_ERR(acl));
+
+		rc = ee_add(&sbi->ll_et, current_pid(), ll_inode2fid(inode),
+			    xattr_type, acl);
+		if (unlikely(rc < 0)) {
+			lustre_ext_acl_xattr_free(acl);
+			GOTO(out, rc);
+		}
+	}
+#endif
+
+	/*
+	 * An empty reply was already turned into -ENODATA above, so there
+	 * is always data to copy at this point.
+	 */
+	LASSERT(buffer);
+	memcpy(buffer, xdata, body->eadatasize);
+	rc = body->eadatasize;
+	EXIT;
+out:
+	ptlrpc_req_finished(req);
+	return rc;
+}
+
+/**
+ * VFS getxattr entry point.
+ *
+ * Every name except "trusted.lov" / "lustre.lov" is forwarded to
+ * ll_getxattr_common().  The lov attribute is built locally: from the
+ * inode's stripe descriptor when one is attached, otherwise (directories
+ * only) from ll_dir_getstripe().  The layout generation is zeroed in the
+ * copy handed back to the caller.
+ *
+ * \retval >= 0     number of bytes stored, or the required size when
+ *                  \a size is 0
+ * \retval -ENODATA no striping information available
+ * \retval -ERANGE  \a buffer is too small
+ */
+ssize_t ll_getxattr(struct dentry *dentry, const char *name,
+		    void *buffer, size_t size)
+{
+	struct inode *inode = dentry->d_inode;
+	struct ptlrpc_request *req = NULL;
+	struct lov_mds_md *lmm = NULL;
+	struct lov_stripe_md *lsm;
+	struct lov_user_md *lump;
+	int lmmsize = 0;
+	int rc = 0;
+	int is_lov;
+
+	LASSERT(inode);
+	LASSERT(name);
+
+	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), xattr %s\n",
+	       inode->i_ino, inode->i_generation, inode, name);
+
+	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_GETXATTR, 1);
+
+	is_lov = (strncmp(name, XATTR_TRUSTED_PREFIX,
+			  sizeof(XATTR_TRUSTED_PREFIX) - 1) == 0 &&
+		  strcmp(name + sizeof(XATTR_TRUSTED_PREFIX) - 1,
+			 "lov") == 0) ||
+		 (strncmp(name, XATTR_LUSTRE_PREFIX,
+			  sizeof(XATTR_LUSTRE_PREFIX) - 1) == 0 &&
+		  strcmp(name + sizeof(XATTR_LUSTRE_PREFIX) - 1,
+			 "lov") == 0);
+	if (!is_lov)
+		return ll_getxattr_common(inode, name, buffer, size,
+					  OBD_MD_FLXATTR);
+
+	if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
+		return -ENODATA;
+
+	if (size == 0 && S_ISDIR(inode->i_mode)) {
+		/* XXX directory EA is fixed for now, optimize to save
+		 * RPC transfer */
+		GOTO(out, rc = sizeof(struct lov_user_md));
+	}
+
+	lsm = ccc_inode_lsm_get(inode);
+	if (lsm != NULL) {
+		/* LSM is present already after lookup/getattr call.
+		 * we need to grab layout lock once it is implemented */
+		rc = obd_packmd(ll_i2dtexp(inode), &lmm, lsm);
+		lmmsize = rc;
+	} else if (S_ISDIR(inode->i_mode)) {
+		rc = ll_dir_getstripe(inode, &lmm, &lmmsize, &req);
+	} else {
+		rc = -ENODATA;
+	}
+	ccc_inode_lsm_put(inode, lsm);
+
+	if (rc < 0)
+		GOTO(out, rc);
+
+	if (size == 0) {
+		/* used to call ll_get_max_mdsize() forward to get
+		 * the maximum buffer size, while some apps (such as
+		 * rsync 3.0.x) care much about the exact xattr value
+		 * size */
+		rc = lmmsize;
+		GOTO(out, rc);
+	}
+
+	if (size < lmmsize) {
+		CERROR("server bug: replied size %d > %d for %s (%s)\n",
+		       lmmsize, (int)size, dentry->d_name.name, name);
+		GOTO(out, rc = -ERANGE);
+	}
+
+	lump = buffer;
+	memcpy(lump, lmm, lmmsize);
+	/* do not return layout gen for getxattr otherwise it would
+	 * confuse tar --xattr by recognizing layout gen as stripe
+	 * offset when the file is restored. See LU-2809. */
+	lump->lmm_layout_gen = 0;
+
+	rc = lmmsize;
+out:
+	/* lmm points into the reply when a request was used */
+	if (req)
+		ptlrpc_req_finished(req);
+	else if (lmm)
+		obd_free_diskmd(ll_i2dtexp(inode), &lmm);
+	return rc;
+}
+
+/**
+ * VFS listxattr entry point.
+ *
+ * The name list is fetched with ll_getxattr_common(); names whose class
+ * fails xattr_type_filter() are then squeezed out of \a buffer.  For a
+ * regular file with a stripe descriptor, or a directory whose stripe
+ * lookup succeeds, the name "lustre.lov" is appended.
+ *
+ * \retval >= 0    total length of the name list (bytes stored in \a buffer,
+ *                 or the length needed when \a buffer is NULL)
+ * \retval -ERANGE \a buffer is too small to also hold "lustre.lov"
+ * \retval -ve     error from ll_getxattr_common()
+ */
+ssize_t ll_listxattr(struct dentry *dentry, char *buffer, size_t size)
+{
+	struct inode *inode = dentry->d_inode;
+	int rc = 0, rc2 = 0;
+	struct lov_mds_md *lmm = NULL;
+	struct ptlrpc_request *request = NULL;
+	int lmmsize;
+
+	LASSERT(inode);
+
+	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n",
+	       inode->i_ino, inode->i_generation, inode);
+
+	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LISTXATTR, 1);
+
+	rc = ll_getxattr_common(inode, NULL, buffer, size, OBD_MD_FLXATTRLS);
+	if (rc < 0)
+		GOTO(out, rc);
+
+	if (buffer != NULL) {
+		struct ll_sb_info *sbi = ll_i2sbi(inode);
+		char *xattr_name = buffer;
+		int xlen, rem = rc;
+
+		while (rem > 0) {
+			xlen = strnlen(xattr_name, rem - 1) + 1;
+			rem -= xlen;
+			if (xattr_type_filter(sbi,
+					get_xattr_type(xattr_name)) == 0) {
+				/* skip OK xattr type
+				 * leave it in buffer
+				 */
+				xattr_name += xlen;
+				continue;
+			}
+			/* move up remaining xattrs in buffer
+			 * removing the xattr that is not OK
+			 */
+			memmove(xattr_name, xattr_name + xlen, rem);
+			rc -= xlen;
+		}
+	}
+	if (S_ISREG(inode->i_mode)) {
+		if (!ll_i2info(inode)->lli_has_smd)
+			rc2 = -1;
+	} else if (S_ISDIR(inode->i_mode)) {
+		rc2 = ll_dir_getstripe(inode, &lmm, &lmmsize, &request);
+	}
+
+	if (rc2 < 0) {
+		GOTO(out, rc2 = 0);
+	} else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)) {
+		const int prefix_len = sizeof(XATTR_LUSTRE_PREFIX) - 1;
+		const size_t name_len   = sizeof("lov") - 1;
+		const size_t total_len  = prefix_len + name_len + 1;
+
+		/*
+		 * With a caller buffer that cannot hold the extra name,
+		 * fail with -ERANGE rather than returning a length that is
+		 * larger than the buffer and was never written.
+		 */
+		if (buffer && (rc + total_len) > size) {
+			rc2 = 0;
+			GOTO(out, rc = -ERANGE);
+		}
+
+		if (buffer) {
+			buffer += rc;
+			memcpy(buffer, XATTR_LUSTRE_PREFIX, prefix_len);
+			memcpy(buffer + prefix_len, "lov", name_len);
+			buffer[prefix_len + name_len] = '\0';
+		}
+		rc2 = total_len;
+	}
+out:
+	ptlrpc_req_finished(request);
+	rc = rc + rc2;
+
+	return rc;
+}

diff --git a/drivers/staging/lustre/lustre/lmv/Makefile b/drivers/staging/lustre/lustre/lmv/Makefile
new file mode 100644
index 0000000..8cc81ad
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lmv/Makefile

@@ -0,0 +1,5 @@
+# Build the lmv object when CONFIG_LUSTRE_FS is enabled.
+obj-$(CONFIG_LUSTRE_FS) += lmv.o
+# Objects that are linked together into lmv.o.
+lmv-y := lmv_obd.o lmv_intent.o lmv_fld.o lproc_lmv.o
+
+
+# Make the headers under ../include visible to the sources listed above.
+ccflags-y := -I$(src)/../include

diff --git a/drivers/staging/lustre/lustre/lmv/lmv_fld.c b/drivers/staging/lustre/lustre/lmv/lmv_fld.c
new file mode 100644
index 0000000..a4805ae
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lmv/lmv_fld.c

@@ -0,0 +1,88 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2013, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LMV
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <asm/div64.h>
+#include <linux/seq_file.h>
+
+#include <obd_support.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_fid.h>
+#include <lustre_lib.h>
+#include <lustre_net.h>
+#include <lustre_dlm.h>
+#include <obd_class.h>
+#include <lprocfs_status.h>
+#include "lmv_internal.h"
+
+/**
+ * Resolve the MDS number that serves @fid.
+ *
+ * Asks the FLD client for the owner of the FID's sequence and stores the
+ * answer in @mds.  Returns 0 on success, the fld_client_lookup() error, or
+ * -EINVAL when the returned number is not below lmv->desc.ld_tgt_count.
+ */
+int lmv_fld_lookup(struct lmv_obd *lmv,
+		   const struct lu_fid *fid,
+		   mdsno_t *mds)
+{
+	int result;
+	ENTRY;
+
+	/* FIXME: Currently ZFS still use local seq for ROOT unfortunately, and
+	 * this fid_is_local check should be removed once LU-2240 is fixed */
+	LASSERTF((fid_seq_in_fldb(fid_seq(fid)) ||
+		  fid_seq_is_local_file(fid_seq(fid))) &&
+		 fid_is_sane(fid), DFID" is insane!\n", PFID(fid));
+
+	result = fld_client_lookup(&lmv->lmv_fld, fid_seq(fid), mds,
+				   LU_SEQ_RANGE_MDT, NULL);
+	if (result != 0) {
+		CERROR("Error while looking for mds number. Seq "LPX64
+		       ", err = %d\n", fid_seq(fid), result);
+		RETURN(result);
+	}
+
+	CDEBUG(D_INODE, "FLD lookup got mds #%x for fid="DFID"\n",
+	       *mds, PFID(fid));
+
+	/* the lookup succeeded; accept the answer only if it is in range */
+	if (*mds < lmv->desc.ld_tgt_count)
+		RETURN(0);
+
+	CERROR("FLD lookup got invalid mds #%x (max: %x) "
+	       "for fid="DFID"\n", *mds, lmv->desc.ld_tgt_count,
+	       PFID(fid));
+	RETURN(-EINVAL);
+}

diff --git a/drivers/staging/lustre/lustre/lmv/lmv_intent.c b/drivers/staging/lustre/lustre/lmv/lmv_intent.c
new file mode 100644
index 0000000..7eefab5
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lmv/lmv_intent.c

@@ -0,0 +1,328 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LMV
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <asm/div64.h>
+#include <linux/seq_file.h>
+#include <linux/namei.h>
+#include <linux/lustre_intent.h>
+
+#include <obd_support.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_lib.h>
+#include <lustre_net.h>
+#include <lustre_dlm.h>
+#include <obd_class.h>
+#include <lprocfs_status.h>
+#include "lmv_internal.h"
+
+/**
+ * Repeat an intent on the target that holds the object named in the reply.
+ *
+ * Called when the reply in *@reqp carries OBD_MD_MDS.  The target is found
+ * from body->fid1 and the intent is re-sent there with MDS_CROSS_REF set.
+ * The lock already recorded in @it is stashed in a local handle while the
+ * second request runs.  On success the newly granted lock is stored as the
+ * intent's "remote" lock and the stashed one is restored.
+ *
+ * Once the reply body has been read, the request in *@reqp is always
+ * finished and replaced by the new request (NULL if none was obtained).
+ * The early -EPROTO return leaves *@reqp untouched.
+ *
+ * @parent_fid: when not NULL (open only) it is sent as op_fid2.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int lmv_intent_remote(struct obd_export *exp, void *lmm,
+			     int lmmsize, struct lookup_intent *it,
+			     const struct lu_fid *parent_fid, int flags,
+			     struct ptlrpc_request **reqp,
+			     ldlm_blocking_callback cb_blocking,
+			     __u64 extra_lock_flags)
+{
+	struct obd_device	*obd = exp->exp_obd;
+	struct lmv_obd		*lmv = &obd->u.lmv;
+	struct ptlrpc_request	*req = NULL;
+	/*
+	 * Zero-initialised: the cookie is written back into the intent on
+	 * success even when no lock was held on entry (pmode == 0).
+	 */
+	struct lustre_handle	plock = { 0 };
+	struct md_op_data	*op_data;
+	struct lmv_tgt_desc	*tgt;
+	struct mdt_body		*body;
+	int			pmode;
+	int			rc = 0;
+	ENTRY;
+
+	body = req_capsule_server_get(&(*reqp)->rq_pill, &RMF_MDT_BODY);
+	if (body == NULL)
+		RETURN(-EPROTO);
+
+	LASSERT((body->valid & OBD_MD_MDS));
+
+	/*
+	 * Unfortunately, we have to lie to MDC/MDS to retrieve
+	 * attributes llite needs and provide proper locking.
+	 */
+	if (it->it_op & IT_LOOKUP)
+		it->it_op = IT_GETATTR;
+
+	/*
+	 * We got LOOKUP lock, but we really need attrs.
+	 */
+	pmode = it->d.lustre.it_lock_mode;
+	if (pmode) {
+		plock.cookie = it->d.lustre.it_lock_handle;
+		it->d.lustre.it_lock_mode = 0;
+		it->d.lustre.it_data = NULL;
+	}
+
+	LASSERT(fid_is_sane(&body->fid1));
+
+	tgt = lmv_find_target(lmv, &body->fid1);
+	if (IS_ERR(tgt))
+		GOTO(out, rc = PTR_ERR(tgt));
+
+	OBD_ALLOC_PTR(op_data);
+	if (op_data == NULL)
+		GOTO(out, rc = -ENOMEM);
+
+	op_data->op_fid1 = body->fid1;
+	/* Sent the parent FID to the remote MDT */
+	if (parent_fid != NULL) {
+		/* The parent fid is only for remote open to
+		 * check whether the open is from OBF,
+		 * see mdt_cross_open */
+		LASSERT(it->it_op & IT_OPEN);
+		op_data->op_fid2 = *parent_fid;
+		/* Add object FID to op_fid3, in case it needs to check stale
+		 * (M_CHECK_STALE), see mdc_finish_intent_lock */
+		op_data->op_fid3 = body->fid1;
+	}
+
+	op_data->op_bias = MDS_CROSS_REF;
+	CDEBUG(D_INODE, "REMOTE_INTENT with fid="DFID" -> mds #%d\n",
+	       PFID(&body->fid1), tgt->ltd_idx);
+
+	it->d.lustre.it_disposition &= ~DISP_ENQ_COMPLETE;
+	rc = md_intent_lock(tgt->ltd_exp, op_data, lmm, lmmsize, it,
+			    flags, &req, cb_blocking, extra_lock_flags);
+	if (rc)
+		GOTO(out_free_op_data, rc);
+
+	/*
+	 * LLite needs LOOKUP lock to track dentry revocation in order to
+	 * maintain dcache consistency. Thus drop UPDATE|PERM lock here
+	 * and put LOOKUP in request.
+	 */
+	if (it->d.lustre.it_lock_mode != 0) {
+		it->d.lustre.it_remote_lock_handle =
+					it->d.lustre.it_lock_handle;
+		it->d.lustre.it_remote_lock_mode = it->d.lustre.it_lock_mode;
+	}
+
+	/* restore the lock that was stashed before the second request */
+	it->d.lustre.it_lock_handle = plock.cookie;
+	it->d.lustre.it_lock_mode = pmode;
+
+	EXIT;
+out_free_op_data:
+	OBD_FREE_PTR(op_data);
+out:
+	/* on failure drop the reference on the stashed lock, if any */
+	if (rc && pmode)
+		ldlm_lock_decref(&plock, pmode);
+
+	ptlrpc_req_finished(*reqp);
+	*reqp = req;
+	return rc;
+}
+
+/*
+ * IT_OPEN is intended to open (and create, possible) an object. Parent (pid)
+ * may be split dir.
+ *
+ * The target is chosen with lmv_locate_mds() from op_fid1.  For a create
+ * that is not an open-by-FID, op_fid2 is saved in op_fid3 and a new FID is
+ * allocated into op_fid2.  If the reply carries OBD_MD_MDS the intent is
+ * repeated through lmv_intent_remote().
+ *
+ * Returns 0 on success or a negative errno.
+ */
+int lmv_intent_open(struct obd_export *exp, struct md_op_data *op_data,
+		    void *lmm, int lmmsize, struct lookup_intent *it,
+		    int flags, struct ptlrpc_request **reqp,
+		    ldlm_blocking_callback cb_blocking,
+		    __u64 extra_lock_flags)
+{
+	struct obd_device	*obd = exp->exp_obd;
+	struct lmv_obd		*lmv = &obd->u.lmv;
+	struct lmv_tgt_desc	*tgt;
+	struct mdt_body		*body;
+	int			rc;
+	ENTRY;
+
+	tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1);
+	if (IS_ERR(tgt))
+		RETURN(PTR_ERR(tgt));
+
+	/* If it is ready to open the file by FID, do not need
+	 * allocate FID at all, otherwise it will confuse MDT */
+	if ((it->it_op & IT_CREAT) &&
+	    !(it->it_flags & MDS_OPEN_BY_FID)) {
+		/*
+		 * For open with IT_CREATE and for IT_CREATE cases allocate new
+		 * fid and setup FLD for it.
+		 */
+		op_data->op_fid3 = op_data->op_fid2;
+		rc = lmv_fid_alloc(exp, &op_data->op_fid2, op_data);
+		if (rc != 0)
+			RETURN(rc);
+	}
+
+	CDEBUG(D_INODE, "OPEN_INTENT with fid1="DFID", fid2="DFID","
+	       " name='%s' -> mds #%d\n", PFID(&op_data->op_fid1),
+	       PFID(&op_data->op_fid2), op_data->op_name, tgt->ltd_idx);
+
+	rc = md_intent_lock(tgt->ltd_exp, op_data, lmm, lmmsize, it, flags,
+			    reqp, cb_blocking, extra_lock_flags);
+	if (rc != 0)
+		RETURN(rc);
+	/*
+	 * Nothing is found, do not access body->fid1 as it is zero and thus
+	 * pointless.
+	 */
+	if ((it->d.lustre.it_disposition & DISP_LOOKUP_NEG) &&
+	    !(it->d.lustre.it_disposition & DISP_OPEN_CREATE) &&
+	    !(it->d.lustre.it_disposition & DISP_OPEN_OPEN))
+		RETURN(rc);
+
+	body = req_capsule_server_get(&(*reqp)->rq_pill, &RMF_MDT_BODY);
+	if (body == NULL)
+		RETURN(-EPROTO);
+	/*
+	 * Not cross-ref case, just get out of here.
+	 */
+	if (likely(!(body->valid & OBD_MD_MDS)))
+		RETURN(0);
+
+	/*
+	 * Okay, MDS has returned success. Probably name has been resolved in
+	 * remote inode.
+	 */
+	rc = lmv_intent_remote(exp, lmm, lmmsize, it, &op_data->op_fid1, flags,
+			       reqp, cb_blocking, extra_lock_flags);
+	if (rc != 0) {
+		LASSERT(rc < 0);
+		/*
+		 * This is possible, that some userspace application will try to
+		 * open file as directory and we will have -ENOTDIR here. As
+		 * this is normal situation, we should not print error here,
+		 * only debug info.
+		 *
+		 * "%.*s" limits the output to op_namelen bytes; a plain
+		 * "%*s" would only set a minimum field width.
+		 */
+		CDEBUG(D_INODE, "Can't handle remote %s: dir "DFID"("DFID"):"
+		       "%.*s: %d\n", LL_IT2STR(it), PFID(&op_data->op_fid2),
+		       PFID(&op_data->op_fid1), op_data->op_namelen,
+		       op_data->op_name, rc);
+	}
+
+	RETURN(rc);
+}
+
+/*
+ * Common path for getattr, lookup and revalidate intents.
+ *
+ * Sends the intent to the target located from op_fid1 with MDS_CROSS_REF
+ * cleared.  When the reply carries OBD_MD_MDS the intent is repeated via
+ * lmv_intent_remote() without a parent FID.
+ */
+int lmv_intent_lookup(struct obd_export *exp, struct md_op_data *op_data,
+		      void *lmm, int lmmsize, struct lookup_intent *it,
+		      int flags, struct ptlrpc_request **reqp,
+		      ldlm_blocking_callback cb_blocking,
+		      __u64 extra_lock_flags)
+{
+	struct lmv_obd		*lmv = &exp->exp_obd->u.lmv;
+	struct lmv_tgt_desc	*target;
+	struct mdt_body		*reply;
+	int			 result;
+	ENTRY;
+
+	target = lmv_locate_mds(lmv, op_data, &op_data->op_fid1);
+	if (IS_ERR(target))
+		RETURN(PTR_ERR(target));
+
+	/* an insane second FID is replaced by an all-zero one */
+	if (!fid_is_sane(&op_data->op_fid2))
+		fid_zero(&op_data->op_fid2);
+
+	CDEBUG(D_INODE, "LOOKUP_INTENT with fid1="DFID", fid2="DFID
+	       ", name='%s' -> mds #%d\n", PFID(&op_data->op_fid1),
+	       PFID(&op_data->op_fid2),
+	       op_data->op_name ? op_data->op_name : "<NULL>",
+	       target->ltd_idx);
+
+	op_data->op_bias &= ~MDS_CROSS_REF;
+
+	result = md_intent_lock(target->ltd_exp, op_data, lmm, lmmsize, it,
+				flags, reqp, cb_blocking, extra_lock_flags);
+	if (result < 0 || *reqp == NULL)
+		RETURN(result);
+
+	/*
+	 * The MDS answered.  The name may have been resolved to an inode on
+	 * another target, which the reply body tells us.
+	 */
+	reply = req_capsule_server_get(&(*reqp)->rq_pill, &RMF_MDT_BODY);
+	if (reply == NULL)
+		RETURN(-EPROTO);
+
+	/* the common, non cross-ref case ends here */
+	if (likely(!(reply->valid & OBD_MD_MDS)))
+		RETURN(0);
+
+	result = lmv_intent_remote(exp, lmm, lmmsize, it, NULL, flags, reqp,
+				   cb_blocking, extra_lock_flags);
+
+	RETURN(result);
+}
+
+/**
+ * Entry point for intent locking through the LMV.
+ *
+ * Makes sure the LMV is connected, then dispatches on it->it_op:
+ * IT_LOOKUP, IT_GETATTR and IT_LAYOUT go to lmv_intent_lookup(), IT_OPEN
+ * goes to lmv_intent_open().  Any other operation hits LBUG().
+ *
+ * Returns the handler's result, or the lmv_check_connect() error.
+ */
+int lmv_intent_lock(struct obd_export *exp, struct md_op_data *op_data,
+		    void *lmm, int lmmsize, struct lookup_intent *it,
+		    int flags, struct ptlrpc_request **reqp,
+		    ldlm_blocking_callback cb_blocking,
+		    __u64 extra_lock_flags)
+{
+	struct obd_device *obd = exp->exp_obd;
+	int		rc;
+	ENTRY;
+
+	LASSERT(it != NULL);
+	LASSERT(fid_is_sane(&op_data->op_fid1));
+
+	/* "%.*s" prints at most op_namelen bytes of the name */
+	CDEBUG(D_INODE, "INTENT LOCK '%s' for '%.*s' on "DFID"\n",
+	       LL_IT2STR(it), op_data->op_namelen, op_data->op_name,
+	       PFID(&op_data->op_fid1));
+
+	rc = lmv_check_connect(obd);
+	if (rc)
+		RETURN(rc);
+
+	if (it->it_op & (IT_LOOKUP | IT_GETATTR | IT_LAYOUT))
+		rc = lmv_intent_lookup(exp, op_data, lmm, lmmsize, it,
+				       flags, reqp, cb_blocking,
+				       extra_lock_flags);
+	else if (it->it_op & IT_OPEN)
+		rc = lmv_intent_open(exp, op_data, lmm, lmmsize, it,
+				     flags, reqp, cb_blocking,
+				     extra_lock_flags);
+	else
+		LBUG();
+	RETURN(rc);
+}

diff --git a/drivers/staging/lustre/lustre/lmv/lmv_internal.h b/drivers/staging/lustre/lustre/lmv/lmv_internal.h
new file mode 100644
index 0000000..f75b0a9
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lmv/lmv_internal.h

@@ -0,0 +1,159 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LMV_INTERNAL_H_
+#define _LMV_INTERNAL_H_
+
+#include <lustre/lustre_idl.h>
+#include <obd.h>
+
+#define LMV_MAX_TGT_COUNT 128
+
+/*
+ * Take / release lmv->init_mutex.  The expansions deliberately carry no
+ * trailing semicolon so the macros behave like ordinary function calls,
+ * also in unbraced if/else bodies.
+ */
+#define lmv_init_lock(lmv)   mutex_lock(&(lmv)->init_mutex)
+#define lmv_init_unlock(lmv) mutex_unlock(&(lmv)->init_mutex)
+
+/* Name of the intent operation as given by ldlm_it2str(); "0" for NULL. */
+#define LL_IT2STR(it)					\
+	((it) ? ldlm_it2str((it)->it_op) : "0")
+
+/* Connect the LMV to its targets if that has not been done yet. */
+int lmv_check_connect(struct obd_device *obd);
+
+/*
+ * Intent entry point: dispatches to lmv_intent_lookup() or
+ * lmv_intent_open() depending on it->it_op.
+ */
+int lmv_intent_lock(struct obd_export *exp, struct md_op_data *op_data,
+		    void *lmm, int lmmsize, struct lookup_intent *it,
+		    int flags, struct ptlrpc_request **reqp,
+		    ldlm_blocking_callback cb_blocking,
+		    __u64 extra_lock_flags);
+
+/* Handler for the getattr, lookup and revalidate cases. */
+int lmv_intent_lookup(struct obd_export *exp, struct md_op_data *op_data,
+		      void *lmm, int lmmsize, struct lookup_intent *it,
+		      int flags, struct ptlrpc_request **reqp,
+		      ldlm_blocking_callback cb_blocking,
+		      __u64 extra_lock_flags);
+
+/* Handler for IT_OPEN, including open with create. */
+int lmv_intent_open(struct obd_export *exp, struct md_op_data *op_data,
+		    void *lmm, int lmmsize, struct lookup_intent *it,
+		    int flags, struct ptlrpc_request **reqp,
+		    ldlm_blocking_callback cb_blocking,
+		    __u64 extra_lock_flags);
+
+int lmv_blocking_ast(struct ldlm_lock *, struct ldlm_lock_desc *,
+		     void *, int);
+/* Look up, through the FLD client, the MDS number that serves @fid. */
+int lmv_fld_lookup(struct lmv_obd *lmv, const struct lu_fid *fid,
+		   mdsno_t *mds);
+int __lmv_fid_alloc(struct lmv_obd *lmv, struct lu_fid *fid,
+		    mdsno_t mds);
+int lmv_fid_alloc(struct obd_export *exp, struct lu_fid *fid,
+		  struct md_op_data *op_data);
+
+/*
+ * Return the lmv_stripe_md carried in the reply of @req, or NULL when the
+ * reply has no body, is not for a directory, has no EA data, reports a
+ * zero mea_count, or carries none of the three known MEA magics.
+ */
+static inline struct lmv_stripe_md *lmv_get_mea(struct ptlrpc_request *req)
+{
+	struct lmv_stripe_md	*stripe_md;
+	struct mdt_body		*reply;
+
+	LASSERT(req != NULL);
+
+	reply = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+	if (reply == NULL)
+		return NULL;
+	if (!S_ISDIR(reply->mode) || reply->eadatasize == 0)
+		return NULL;
+
+	stripe_md = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD,
+						 reply->eadatasize);
+	LASSERT(stripe_md != NULL);
+
+	/* accept only a non-empty descriptor with a recognised magic */
+	if (stripe_md->mea_count != 0 &&
+	    (stripe_md->mea_magic == MEA_MAGIC_LAST_CHAR ||
+	     stripe_md->mea_magic == MEA_MAGIC_ALL_CHARS ||
+	     stripe_md->mea_magic == MEA_MAGIC_HASH_SEGMENT))
+		return stripe_md;
+
+	return NULL;
+}
+
+/*
+ * Size of a struct lmv_stripe_md followed by one struct lu_fid for each of
+ * the lmv->desc.ld_tgt_count targets.
+ */
+static inline int lmv_get_easize(struct lmv_obd *lmv)
+{
+	size_t fid_bytes = lmv->desc.ld_tgt_count * sizeof(struct lu_fid);
+
+	return sizeof(struct lmv_stripe_md) + fid_bytes;
+}
+
+/*
+ * Find the target descriptor whose ltd_idx equals @mds.  Empty slots in
+ * lmv->tgts are skipped.  Returns ERR_PTR(-ENODEV) when nothing matches.
+ */
+static inline struct lmv_tgt_desc *
+lmv_get_target(struct lmv_obd *lmv, mdsno_t mds)
+{
+	struct lmv_tgt_desc *candidate;
+	int nr_slots = lmv->desc.ld_tgt_count;
+	int slot;
+
+	for (slot = 0; slot < nr_slots; slot++) {
+		candidate = lmv->tgts[slot];
+		if (candidate != NULL && candidate->ltd_idx == mds)
+			return candidate;
+	}
+
+	return ERR_PTR(-ENODEV);
+}
+
+/*
+ * Find the target descriptor responsible for @fid.  With at most one
+ * target configured no FLD lookup is done and index 0 is used.  Returns an
+ * ERR_PTR() when the FLD lookup fails or no descriptor has that index.
+ */
+static inline struct lmv_tgt_desc *
+lmv_find_target(struct lmv_obd *lmv, const struct lu_fid *fid)
+{
+	mdsno_t idx = 0;
+	int err;
+
+	if (lmv->desc.ld_tgt_count <= 1)
+		return lmv_get_target(lmv, idx);
+
+	err = lmv_fld_lookup(lmv, fid, &idx);
+	if (err != 0)
+		return ERR_PTR(err);
+
+	return lmv_get_target(lmv, idx);
+}
+
+/*
+ * Pick the target for an operation; callers in lmv_intent.c test the
+ * result with IS_ERR().
+ */
+struct lmv_tgt_desc
+*lmv_locate_mds(struct lmv_obd *lmv, struct md_op_data *op_data,
+		struct lu_fid *fid);
+/* lproc_lmv.c */
+#ifdef LPROCFS
+void lprocfs_lmv_init_vars(struct lprocfs_static_vars *lvars);
+#else
+/* Without LPROCFS the static vars are simply zeroed. */
+static inline void lprocfs_lmv_init_vars(struct lprocfs_static_vars *lvars)
+{
+	memset(lvars, 0, sizeof(*lvars));
+}
+#endif
+extern struct file_operations lmv_proc_target_fops;
+
+#endif

diff --git a/drivers/staging/lustre/lustre/lmv/lmv_obd.c b/drivers/staging/lustre/lustre/lmv/lmv_obd.c
new file mode 100644
index 0000000..1eebfbf
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lmv/lmv_obd.c

@@ -0,0 +1,2727 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LMV
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <linux/mm.h>
+#include <asm/div64.h>
+#include <linux/seq_file.h>
+#include <linux/namei.h>
+
+#include <lustre/lustre_idl.h>
+#include <obd_support.h>
+#include <lustre_lib.h>
+#include <lustre_net.h>
+#include <obd_class.h>
+#include <lprocfs_status.h>
+#include <lustre_lite.h>
+#include <lustre_fid.h>
+#include "lmv_internal.h"
+
+/*
+ * Record the new active state of @tgt and keep the LMV's count of active
+ * targets in step.  Nothing happens when the state is already @activate.
+ */
+static void lmv_activate_target(struct lmv_obd *lmv,
+				struct lmv_tgt_desc *tgt,
+				int activate)
+{
+	if (tgt->ltd_active != activate) {
+		tgt->ltd_active = activate;
+		if (activate)
+			lmv->desc.ld_active_tgt_count++;
+		else
+			lmv->desc.ld_active_tgt_count--;
+	}
+}
+
+/**
+ * Mark the target with the given UUID active or inactive.
+ *
+ * The target list is searched and updated under lmv->lmv_lock.  Slots
+ * without a descriptor or without an export are skipped.
+ *
+ * Error codes:
+ *
+ *  -EINVAL  : UUID can't be found in the LMV's target list
+ *  -ENOTCONN: The UUID is found, but the target connection is bad (!)
+ *  -EBADF   : The UUID is found, but the OBD of the wrong type (!)
+ *
+ * NOTE(review): -EBADF is never returned by the code below; a wrong OBD
+ * type trips the LASSERT instead.
+ *
+ * Returns 0 on success, also when the target already had the requested
+ * state.
+ */
+static int lmv_set_mdc_active(struct lmv_obd *lmv, struct obd_uuid *uuid,
+			      int activate)
+{
+	struct lmv_tgt_desc    *uninitialized_var(tgt);
+	struct obd_device      *obd;
+	int		     i;
+	int		     rc = 0;
+	ENTRY;
+
+	CDEBUG(D_INFO, "Searching in lmv %p for uuid %s (activate=%d)\n",
+	       lmv, uuid->uuid, activate);
+
+	spin_lock(&lmv->lmv_lock);
+	for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+		tgt = lmv->tgts[i];
+		if (tgt == NULL || tgt->ltd_exp == NULL)
+			continue;
+
+		CDEBUG(D_INFO, "Target idx %d is %s conn "LPX64"\n", i,
+		       tgt->ltd_uuid.uuid, tgt->ltd_exp->exp_handle.h_cookie);
+
+		if (obd_uuid_equals(uuid, &tgt->ltd_uuid))
+			break;
+	}
+
+	/* the loop ran to completion: no usable target has this UUID */
+	if (i == lmv->desc.ld_tgt_count)
+		GOTO(out_lmv_lock, rc = -EINVAL);
+
+	/* here tgt is the matching descriptor the loop stopped on */
+	obd = class_exp2obd(tgt->ltd_exp);
+	if (obd == NULL)
+		GOTO(out_lmv_lock, rc = -ENOTCONN);
+
+	CDEBUG(D_INFO, "Found OBD %s=%s device %d (%p) type %s at LMV idx %d\n",
+	       obd->obd_name, obd->obd_uuid.uuid, obd->obd_minor, obd,
+	       obd->obd_type->typ_name, i);
+	LASSERT(strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) == 0);
+
+	if (tgt->ltd_active == activate) {
+		CDEBUG(D_INFO, "OBD %p already %sactive!\n", obd,
+		       activate ? "" : "in");
+		GOTO(out_lmv_lock, rc);
+	}
+
+	CDEBUG(D_INFO, "Marking OBD %p %sactive\n", obd,
+	       activate ? "" : "in");
+	lmv_activate_target(lmv, tgt, activate);
+	EXIT;
+
+ out_lmv_lock:
+	spin_unlock(&lmv->lmv_lock);
+	return rc;
+}
+
+/*
+ * Return the UUID reported by obd_get_uuid() for the export of the first
+ * target (slot 0) of the LMV behind @exp.
+ */
+struct obd_uuid *lmv_get_uuid(struct obd_export *exp)
+{
+	struct lmv_tgt_desc *first = exp->exp_obd->u.lmv.tgts[0];
+
+	return obd_get_uuid(first->ltd_exp);
+}
+
+/**
+ * Handle a notification about one of the watched MDC devices.
+ *
+ * Active/inactive events update the matching target through
+ * lmv_set_mdc_active().  OBD_NOTIFY_OCD copies the MDC import's connect
+ * data into this device's self export.  The event is then passed on to
+ * obd->obd_observer, if there is one.
+ *
+ * Returns -EINVAL when @watched is not an MDC, the lmv_set_mdc_active()
+ * error, or otherwise the observer's obd_notify() result (0 without an
+ * observer).
+ */
+static int lmv_notify(struct obd_device *obd, struct obd_device *watched,
+		      enum obd_notify_event ev, void *data)
+{
+	struct obd_connect_data *conn_data;
+	struct lmv_obd	  *lmv = &obd->u.lmv;
+	struct obd_uuid	 *uuid;
+	int		      rc = 0;
+	ENTRY;
+
+	/* strcmp() != 0: the notifying device is not of the MDC type */
+	if (strcmp(watched->obd_type->typ_name, LUSTRE_MDC_NAME)) {
+		CERROR("unexpected notification of %s %s!\n",
+		       watched->obd_type->typ_name,
+		       watched->obd_name);
+		RETURN(-EINVAL);
+	}
+
+	uuid = &watched->u.cli.cl_target_uuid;
+	if (ev == OBD_NOTIFY_ACTIVE || ev == OBD_NOTIFY_INACTIVE) {
+		/*
+		 * Set MDC as active before notifying the observer, so the
+		 * observer can use the MDC normally.
+		 */
+		rc = lmv_set_mdc_active(lmv, uuid,
+					ev == OBD_NOTIFY_ACTIVE);
+		if (rc) {
+			CERROR("%sactivation of %s failed: %d\n",
+			       ev == OBD_NOTIFY_ACTIVE ? "" : "de",
+			       uuid->uuid, rc);
+			RETURN(rc);
+		}
+	} else if (ev == OBD_NOTIFY_OCD) {
+		conn_data = &watched->u.cli.cl_import->imp_connect_data;
+		/*
+		 * XXX: Make sure that ocd_connect_flags from all targets are
+		 * the same. Otherwise one of MDTs runs wrong version or
+		 * something like this.  --umka
+		 */
+		obd->obd_self_export->exp_connect_data = *conn_data;
+	}
+	/* the disconnect handling below is compiled out */
+#if 0
+	else if (ev == OBD_NOTIFY_DISCON) {
+		/*
+		 * For disconnect event, flush fld cache for failout MDS case.
+		 */
+		fld_client_flush(&lmv->lmv_fld);
+	}
+#endif
+	/*
+	 * Pass the notification up the chain.
+	 */
+	if (obd->obd_observer)
+		rc = obd_notify(obd->obd_observer, watched, ev, data);
+
+	RETURN(rc);
+}
+
+/**
+ * This is fake connect function. Its purpose is to initialize lmv and say
+ * caller that everything is okay. Real connection will be performed later.
+ *
+ * Only the first caller creates an export; later callers just bump
+ * lmv->refcount and get *@exp set to NULL.  The "target_obds" proc
+ * directory is registered here unless obd->obd_proc_private already holds
+ * one.  When @data requests OBD_CONNECT_REAL the targets are connected
+ * right away through lmv_check_connect().
+ *
+ * @data may be NULL, in which case no connect data is recorded and no
+ * immediate connection is attempted.
+ *
+ * Returns 0 on success, or the error from class_connect() or
+ * lmv_check_connect().
+ */
+static int lmv_connect(const struct lu_env *env,
+		       struct obd_export **exp, struct obd_device *obd,
+		       struct obd_uuid *cluuid, struct obd_connect_data *data,
+		       void *localdata)
+{
+	struct proc_dir_entry *lmv_proc_dir;
+	struct lmv_obd	*lmv = &obd->u.lmv;
+	struct lustre_handle  conn = { 0 };
+	int		    rc = 0;
+	ENTRY;
+
+	/*
+	 * We don't want to actually do the underlying connections more than
+	 * once, so keep track.
+	 */
+	lmv->refcount++;
+	if (lmv->refcount > 1) {
+		*exp = NULL;
+		RETURN(0);
+	}
+
+	rc = class_connect(&conn, obd, cluuid);
+	if (rc) {
+		CERROR("class_connection() returned %d\n", rc);
+		RETURN(rc);
+	}
+
+	*exp = class_conn2export(&conn);
+	class_export_get(*exp);
+
+	lmv->exp = *exp;
+	lmv->connected = 0;
+	lmv->cluuid = *cluuid;
+
+	if (data)
+		lmv->conn_data = *data;
+
+	if (obd->obd_proc_private != NULL) {
+		lmv_proc_dir = obd->obd_proc_private;
+	} else {
+		lmv_proc_dir = lprocfs_register("target_obds", obd->obd_proc_entry,
+						NULL, NULL);
+		if (IS_ERR(lmv_proc_dir)) {
+			CERROR("could not register /proc/fs/lustre/%s/%s/target_obds.",
+			       obd->obd_type->typ_name, obd->obd_name);
+			lmv_proc_dir = NULL;
+		}
+		obd->obd_proc_private = lmv_proc_dir;
+	}
+
+	/*
+	 * All real clients should perform actual connection right away, because
+	 * it is possible, that LMV will not have opportunity to connect targets
+	 * and MDC stuff will be called directly, for instance while reading
+	 * ../mdc/../kbytesfree procfs file, etc.
+	 *
+	 * data is optional (see the check above), so test it before use.
+	 */
+	if (data != NULL && (data->ocd_connect_flags & OBD_CONNECT_REAL))
+		rc = lmv_check_connect(obd);
+
+	if (rc && lmv_proc_dir) {
+		lprocfs_remove(&lmv_proc_dir);
+		obd->obd_proc_private = NULL;
+	}
+
+	RETURN(rc);
+}
+
+/*
+ * Send KEY_INTERMDS through obd_set_info_async() to every target that has
+ * an export and is active.  Does nothing unless lmv->server_timeout is set
+ * and the LMV is connected.
+ */
+static void lmv_set_timeouts(struct obd_device *obd)
+{
+	struct lmv_obd		*lmv = &obd->u.lmv;
+	struct lmv_tgt_desc	*cur;
+	int			 idx;
+
+	if (lmv->server_timeout == 0 || lmv->connected == 0)
+		return;
+
+	for (idx = 0; idx < lmv->desc.ld_tgt_count; idx++) {
+		cur = lmv->tgts[idx];
+		if (cur != NULL && cur->ltd_exp != NULL &&
+		    cur->ltd_active != 0)
+			obd_set_info_async(NULL, cur->ltd_exp,
+					   sizeof(KEY_INTERMDS),
+					   KEY_INTERMDS, 0, NULL, NULL);
+	}
+}
+
+/*
+ * Raise the LMV's recorded maximum EA, default EA and cookie sizes and, if
+ * any of them grew while the LMV is connected, pass the requested sizes to
+ * every active target via md_init_ea_size().
+ *
+ * Returns 0, or the first md_init_ea_size() error (which stops the walk).
+ */
+static int lmv_init_ea_size(struct obd_export *exp, int easize,
+			    int def_easize, int cookiesize)
+{
+	struct obd_device	*obd = exp->exp_obd;
+	struct lmv_obd		*lmv = &obd->u.lmv;
+	struct lmv_tgt_desc	*cur;
+	int			 grown = 0;
+	int			 idx;
+	int			 rc = 0;
+	ENTRY;
+
+	if (lmv->max_easize < easize) {
+		lmv->max_easize = easize;
+		grown = 1;
+	}
+	if (lmv->max_def_easize < def_easize) {
+		lmv->max_def_easize = def_easize;
+		grown = 1;
+	}
+	if (lmv->max_cookiesize < cookiesize) {
+		lmv->max_cookiesize = cookiesize;
+		grown = 1;
+	}
+
+	/* nothing to propagate, or nobody to propagate it to yet */
+	if (!grown || lmv->connected == 0)
+		RETURN(0);
+
+	for (idx = 0; idx < lmv->desc.ld_tgt_count; idx++) {
+		cur = lmv->tgts[idx];
+		if (cur == NULL || cur->ltd_exp == NULL ||
+		    cur->ltd_active == 0) {
+			CWARN("%s: NULL export for %d\n", obd->obd_name, idx);
+			continue;
+		}
+
+		rc = md_init_ea_size(cur->ltd_exp, easize, def_easize,
+				     cookiesize);
+		if (rc != 0) {
+			CERROR("%s: obd_init_ea_size() failed on MDT target %d:"
+			       " rc = %d.\n", obd->obd_name, idx, rc);
+			break;
+		}
+	}
+	RETURN(rc);
+}
+
+#define MAX_STRING_SIZE 128
+
+/**
+ * Connect the LMV to the MDC device that backs target @tgt.
+ *
+ * Finds the MDC by the target's UUID, connects to it, initialises its FID
+ * client, adds it as an FLD target, registers @obd as the MDC's observer
+ * and tells @obd's own observer (if any) about the new target.  On success
+ * the target is marked active, its export is stored in tgt->ltd_exp and a
+ * proc symlink is added under the "target_obds" directory when present.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+int lmv_connect_mdc(struct obd_device *obd, struct lmv_tgt_desc *tgt)
+{
+	struct proc_dir_entry   *lmv_proc_dir;
+	struct lmv_obd	  *lmv = &obd->u.lmv;
+	struct obd_uuid	 *cluuid = &lmv->cluuid;
+	struct obd_uuid	  lmv_mdc_uuid = { "LMV_MDC_UUID" };
+	struct obd_device       *mdc_obd;
+	struct obd_export       *mdc_exp;
+	struct lu_fld_target     target;
+	int		      rc;
+	ENTRY;
+
+	mdc_obd = class_find_client_obd(&tgt->ltd_uuid, LUSTRE_MDC_NAME,
+					&obd->obd_uuid);
+	if (!mdc_obd) {
+		CERROR("target %s not attached\n", tgt->ltd_uuid.uuid);
+		RETURN(-EINVAL);
+	}
+
+	CDEBUG(D_CONFIG, "connect to %s(%s) - %s, %s FOR %s\n",
+		mdc_obd->obd_name, mdc_obd->obd_uuid.uuid,
+		tgt->ltd_uuid.uuid, obd->obd_uuid.uuid,
+		cluuid->uuid);
+
+	if (!mdc_obd->obd_set_up) {
+		CERROR("target %s is not set up\n", tgt->ltd_uuid.uuid);
+		RETURN(-EINVAL);
+	}
+
+	rc = obd_connect(NULL, &mdc_exp, mdc_obd, &lmv_mdc_uuid,
+			 &lmv->conn_data, NULL);
+	if (rc) {
+		CERROR("target %s connect error %d\n", tgt->ltd_uuid.uuid, rc);
+		RETURN(rc);
+	}
+
+	/*
+	 * Init fid sequence client for this mdc and add new fld target.
+	 *
+	 * NOTE(review): unlike the later error paths, this one returns
+	 * without obd_disconnect(mdc_exp) - confirm whether that is intended.
+	 */
+	rc = obd_fid_init(mdc_obd, mdc_exp, LUSTRE_SEQ_METADATA);
+	if (rc)
+		RETURN(rc);
+
+	target.ft_srv = NULL;
+	target.ft_exp = mdc_exp;
+	target.ft_idx = tgt->ltd_idx;
+
+	fld_client_add_target(&lmv->lmv_fld, &target);
+
+	rc = obd_register_observer(mdc_obd, obd);
+	if (rc) {
+		obd_disconnect(mdc_exp);
+		CERROR("target %s register_observer error %d\n",
+		       tgt->ltd_uuid.uuid, rc);
+		RETURN(rc);
+	}
+
+	if (obd->obd_observer) {
+		/*
+		 * Tell the observer about the new target.
+		 *
+		 * The target index is passed in the data pointer.  Target
+		 * descriptors are allocated one by one, so the index must
+		 * come from ltd_idx and not from pointer arithmetic between
+		 * two descriptors.
+		 */
+		rc = obd_notify(obd->obd_observer, mdc_exp->exp_obd,
+				OBD_NOTIFY_ACTIVE,
+				(void *)(unsigned long)tgt->ltd_idx);
+		if (rc) {
+			obd_disconnect(mdc_exp);
+			RETURN(rc);
+		}
+	}
+
+	tgt->ltd_active = 1;
+	tgt->ltd_exp = mdc_exp;
+	lmv->desc.ld_active_tgt_count++;
+
+	md_init_ea_size(tgt->ltd_exp, lmv->max_easize,
+			lmv->max_def_easize, lmv->max_cookiesize);
+
+	CDEBUG(D_CONFIG, "Connected to %s(%s) successfully (%d)\n",
+		mdc_obd->obd_name, mdc_obd->obd_uuid.uuid,
+		atomic_read(&obd->obd_refcount));
+
+	lmv_proc_dir = obd->obd_proc_private;
+	if (lmv_proc_dir) {
+		struct proc_dir_entry *mdc_symlink;
+
+		LASSERT(mdc_obd->obd_type != NULL);
+		LASSERT(mdc_obd->obd_type->typ_name != NULL);
+		mdc_symlink = lprocfs_add_symlink(mdc_obd->obd_name,
+						  lmv_proc_dir,
+						  "../../../%s/%s",
+						  mdc_obd->obd_type->typ_name,
+						  mdc_obd->obd_name);
+		if (mdc_symlink == NULL) {
+			/* a failed symlink is not fatal; drop the proc dir */
+			CERROR("Could not register LMV target "
+			       "/proc/fs/lustre/%s/%s/target_obds/%s.",
+			       obd->obd_type->typ_name, obd->obd_name,
+			       mdc_obd->obd_name);
+			lprocfs_remove(&lmv_proc_dir);
+			obd->obd_proc_private = NULL;
+		}
+	}
+	RETURN(0);
+}
+
+/* Free the target descriptor in slot @index, if any, and clear the slot. */
+static void lmv_del_target(struct lmv_obd *lmv, int index)
+{
+	if (lmv->tgts[index] != NULL) {
+		OBD_FREE_PTR(lmv->tgts[index]);
+		lmv->tgts[index] = NULL;
+	}
+}
+
+/**
+ * Add a target with the given UUID at slot @index of the LMV.
+ *
+ * Runs under lmv->init_mutex.  The lmv->tgts pointer array is grown to the
+ * next power of two that can hold @index when needed.  If the LMV is
+ * already connected the new target is connected right away.
+ *
+ * @gen is not used by this function.
+ *
+ * Returns 0 on success, -EINVAL when this is the first target and its MDC
+ * is not attached, -EEXIST when the slot is occupied, -ENOMEM on
+ * allocation failure, or the lmv_connect_mdc() error.
+ */
+static int lmv_add_target(struct obd_device *obd, struct obd_uuid *uuidp,
+			   __u32 index, int gen)
+{
+	struct lmv_obd      *lmv = &obd->u.lmv;
+	struct lmv_tgt_desc *tgt;
+	int		  rc = 0;
+	ENTRY;
+
+	CDEBUG(D_CONFIG, "Target uuid: %s. index %d\n", uuidp->uuid, index);
+
+	lmv_init_lock(lmv);
+
+	/* for the very first target, insist that its MDC device exists */
+	if (lmv->desc.ld_tgt_count == 0) {
+		struct obd_device *mdc_obd;
+
+		mdc_obd = class_find_client_obd(uuidp, LUSTRE_MDC_NAME,
+						&obd->obd_uuid);
+		if (!mdc_obd) {
+			lmv_init_unlock(lmv);
+			CERROR("%s: Target %s not attached: rc = %d\n",
+			       obd->obd_name, uuidp->uuid, -EINVAL);
+			RETURN(-EINVAL);
+		}
+	}
+
+	if ((index < lmv->tgts_size) && (lmv->tgts[index] != NULL)) {
+		tgt = lmv->tgts[index];
+		/* NOTE(review): the message says "LOV" although this is LMV */
+		CERROR("%s: UUID %s already assigned at LOV target index %d:"
+		       " rc = %d\n", obd->obd_name,
+		       obd_uuid2str(&tgt->ltd_uuid), index, -EEXIST);
+		lmv_init_unlock(lmv);
+		RETURN(-EEXIST);
+	}
+
+	if (index >= lmv->tgts_size) {
+		/* We need to reallocate the lmv target array. */
+		struct lmv_tgt_desc **newtgts, **old = NULL;
+		__u32 newsize = 1;
+		__u32 oldsize = 0;
+
+		/* smallest power of two strictly greater than index */
+		while (newsize < index + 1)
+			newsize = newsize << 1;
+		OBD_ALLOC(newtgts, sizeof(*newtgts) * newsize);
+		if (newtgts == NULL) {
+			lmv_init_unlock(lmv);
+			RETURN(-ENOMEM);
+		}
+
+		if (lmv->tgts_size) {
+			memcpy(newtgts, lmv->tgts,
+			       sizeof(*newtgts) * lmv->tgts_size);
+			old = lmv->tgts;
+			oldsize = lmv->tgts_size;
+		}
+
+		lmv->tgts = newtgts;
+		lmv->tgts_size = newsize;
+		/*
+		 * NOTE(review): a read barrier follows the two stores that
+		 * publish the new array - confirm this is the intended kind.
+		 */
+		smp_rmb();
+		if (old)
+			OBD_FREE(old, sizeof(*old) * oldsize);
+
+		CDEBUG(D_CONFIG, "tgts: %p size: %d\n", lmv->tgts,
+		       lmv->tgts_size);
+	}
+
+	OBD_ALLOC_PTR(tgt);
+	if (!tgt) {
+		lmv_init_unlock(lmv);
+		RETURN(-ENOMEM);
+	}
+
+	mutex_init(&tgt->ltd_fid_mutex);
+	tgt->ltd_idx = index;
+	tgt->ltd_uuid = *uuidp;
+	tgt->ltd_active = 0;
+	lmv->tgts[index] = tgt;
+	/* the target count tracks the highest index in use plus one */
+	if (index >= lmv->desc.ld_tgt_count)
+		lmv->desc.ld_tgt_count = index + 1;
+
+	if (lmv->connected) {
+		rc = lmv_connect_mdc(obd, tgt);
+		if (rc) {
+			/*
+			 * NOTE(review): the count is decremented even if it
+			 * was not raised above, and the zeroed descriptor
+			 * stays in lmv->tgts[index] - confirm intent.
+			 */
+			spin_lock(&lmv->lmv_lock);
+			lmv->desc.ld_tgt_count--;
+			memset(tgt, 0, sizeof(*tgt));
+			spin_unlock(&lmv->lmv_lock);
+		} else {
+			int easize = sizeof(struct lmv_stripe_md) +
+				     lmv->desc.ld_tgt_count *
+				     sizeof(struct lu_fid);
+			lmv_init_ea_size(obd->obd_self_export, easize, 0, 0);
+		}
+	}
+
+	lmv_init_unlock(lmv);
+	RETURN(rc);
+}
+
+/*
+ * Make sure the LMV device @obd is connected to its targets.
+ *
+ * Returns 0 at once when lmv->connected is already set. Otherwise takes
+ * the init lock, re-checks the flag, and calls lmv_connect_mdc() on every
+ * non-NULL target below ld_tgt_count. On the first failure, the targets
+ * visited before the failing one are disconnected again and the error is
+ * returned. Returns -EINVAL when no target is configured.
+ */
+int lmv_check_connect(struct obd_device *obd)
+{
+	struct lmv_obd       *lmv = &obd->u.lmv;
+	struct lmv_tgt_desc  *tgt;
+	int		   i;
+	int		   rc;
+	int		   easize;
+	ENTRY;
+
+	/* Unlocked fast path; the flag is checked again under the lock. */
+	if (lmv->connected)
+		RETURN(0);
+
+	lmv_init_lock(lmv);
+	if (lmv->connected) {
+		lmv_init_unlock(lmv);
+		RETURN(0);
+	}
+
+	if (lmv->desc.ld_tgt_count == 0) {
+		lmv_init_unlock(lmv);
+		CERROR("%s: no targets configured.\n", obd->obd_name);
+		RETURN(-EINVAL);
+	}
+
+	CDEBUG(D_CONFIG, "Time to connect %s to %s\n",
+	       lmv->cluuid.uuid, obd->obd_name);
+
+	LASSERT(lmv->tgts != NULL);
+
+	for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+		tgt = lmv->tgts[i];
+		if (tgt == NULL)
+			continue;
+		rc = lmv_connect_mdc(obd, tgt);
+		if (rc)
+			GOTO(out_disc, rc);
+	}
+
+	lmv_set_timeouts(obd);
+	class_export_put(lmv->exp);
+	lmv->connected = 1;
+	easize = lmv_get_easize(lmv);
+	lmv_init_ea_size(obd->obd_self_export, easize, 0, 0);
+	lmv_init_unlock(lmv);
+	RETURN(0);
+
+ out_disc:
+	/* Walk back over the slots below the one that failed. */
+	while (i-- > 0) {
+		int rc2;
+		tgt = lmv->tgts[i];
+		if (tgt == NULL)
+			continue;
+		tgt->ltd_active = 0;
+		if (tgt->ltd_exp) {
+			--lmv->desc.ld_active_tgt_count;
+			rc2 = obd_disconnect(tgt->ltd_exp);
+			if (rc2) {
+				CERROR("LMV target %s disconnect on "
+				       "MDC idx %d: error %d\n",
+				       tgt->ltd_uuid.uuid, i, rc2);
+			}
+		}
+	}
+	class_disconnect(lmv->exp);
+	lmv_init_unlock(lmv);
+	/* The connect error is returned; disconnect errors are only logged. */
+	RETURN(rc);
+}
+
+/*
+ * Disconnect the LMV device @obd from the single target @tgt.
+ *
+ * Copies the force/fail/no_recov flags from @obd to the MDC device behind
+ * the target's export, removes the target's entry from the LMV proc
+ * directory, finalises the fid client, drops the observer registration,
+ * disconnects the export and marks the target inactive. tgt->ltd_exp is
+ * NULL on return.
+ *
+ * Always returns 0; failures of the individual steps are only logged.
+ */
+static int lmv_disconnect_mdc(struct obd_device *obd, struct lmv_tgt_desc *tgt)
+{
+	struct proc_dir_entry  *lmv_proc_dir;
+	struct lmv_obd	 *lmv = &obd->u.lmv;
+	struct obd_device      *mdc_obd;
+	int		     rc;
+	ENTRY;
+
+	LASSERT(tgt != NULL);
+	LASSERT(obd != NULL);
+
+	mdc_obd = class_exp2obd(tgt->ltd_exp);
+
+	if (mdc_obd) {
+		mdc_obd->obd_force = obd->obd_force;
+		mdc_obd->obd_fail = obd->obd_fail;
+		mdc_obd->obd_no_recov = obd->obd_no_recov;
+	}
+
+	/*
+	 * mdc_obd may be NULL (see the guard above), so it must not be
+	 * dereferenced unconditionally when removing the proc entry.
+	 */
+	lmv_proc_dir = obd->obd_proc_private;
+	if (lmv_proc_dir && mdc_obd)
+		lprocfs_remove_proc_entry(mdc_obd->obd_name, lmv_proc_dir);
+
+	rc = obd_fid_fini(tgt->ltd_exp->exp_obd);
+	if (rc)
+		CERROR("Can't finalize fids factory\n");
+
+	CDEBUG(D_INFO, "Disconnected from %s(%s) successfully\n",
+	       tgt->ltd_exp->exp_obd->obd_name,
+	       tgt->ltd_exp->exp_obd->obd_uuid.uuid);
+
+	obd_register_observer(tgt->ltd_exp->exp_obd, NULL);
+	rc = obd_disconnect(tgt->ltd_exp);
+	if (rc) {
+		/* Only complain when the target was still marked active. */
+		if (tgt->ltd_active) {
+			CERROR("Target %s disconnect error %d\n",
+			       tgt->ltd_uuid.uuid, rc);
+		}
+	}
+
+	lmv_activate_target(lmv, tgt, 0);
+	tgt->ltd_exp = NULL;
+	RETURN(0);
+}
+
+/*
+ * Drop one connection to the LMV export @exp.
+ *
+ * lmv->refcount is decremented; only when it reaches zero are the
+ * connected targets disconnected and the proc directory stored in
+ * obd_proc_private removed. The export itself is always disconnected
+ * with class_disconnect(), whose result is returned.
+ */
+static int lmv_disconnect(struct obd_export *exp)
+{
+	struct obd_device     *obd = class_exp2obd(exp);
+	struct lmv_obd	*lmv = &obd->u.lmv;
+	int		    rc;
+	int		    i;
+	ENTRY;
+
+	/* No target table: skip straight to the export disconnect. */
+	if (!lmv->tgts)
+		goto out_local;
+
+	/*
+	 * Only disconnect the underlying layers on the final disconnect.
+	 */
+	lmv->refcount--;
+	if (lmv->refcount != 0)
+		goto out_local;
+
+	for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+		/* Skip empty slots and targets that were never connected. */
+		if (lmv->tgts[i] == NULL || lmv->tgts[i]->ltd_exp == NULL)
+			continue;
+
+		lmv_disconnect_mdc(obd, lmv->tgts[i]);
+	}
+
+	if (obd->obd_proc_private)
+		lprocfs_remove((proc_dir_entry_t **)&obd->obd_proc_private);
+	else
+		CERROR("/proc/fs/lustre/%s/%s/target_obds missing\n",
+		       obd->obd_type->typ_name, obd->obd_name);
+
+out_local:
+	/*
+	 * This is the case when no real connection is established by
+	 * lmv_check_connect().
+	 */
+	if (!lmv->connected)
+		class_export_put(exp);
+	rc = class_disconnect(exp);
+	/* Clear the connected flag once the last reference is gone. */
+	if (lmv->refcount == 0)
+		lmv->connected = 0;
+	RETURN(rc);
+}
+
+/*
+ * Resolve the FID in @karg (a struct getinfo_fid2path) to a path.
+ *
+ * The request is sent with OBD_IOC_FID2PATH to the target found for the
+ * FID. While a target answers -EREMOTE, the lookup is repeated on the
+ * target found for the FID now stored in the reply, using a temporary
+ * PATH_MAX-sized buffer; each segment obtained that way is prepended,
+ * followed by '/', to the path already stored in @karg.
+ *
+ * Returns 0 on success, -EOVERFLOW when the combined path does not fit
+ * in the caller's buffer, -ENOMEM, -EINVAL, or the error from the target.
+ */
+static int lmv_fid2path(struct obd_export *exp, int len, void *karg, void *uarg)
+{
+	struct obd_device	*obddev = class_exp2obd(exp);
+	struct lmv_obd		*lmv = &obddev->u.lmv;
+	struct getinfo_fid2path *gf;
+	struct lmv_tgt_desc     *tgt;
+	struct getinfo_fid2path *remote_gf = NULL;
+	int			remote_gf_size = 0;
+	int			rc;
+
+	gf = (struct getinfo_fid2path *)karg;
+	tgt = lmv_find_target(lmv, &gf->gf_fid);
+	if (IS_ERR(tgt))
+		RETURN(PTR_ERR(tgt));
+
+repeat_fid2path:
+	rc = obd_iocontrol(OBD_IOC_FID2PATH, tgt->ltd_exp, len, gf, uarg);
+	if (rc != 0 && rc != -EREMOTE)
+		GOTO(out_fid2path, rc);
+
+	/* If remote_gf != NULL, the path was just built on a remote MDT;
+	 * copy this path segment in front of the one already in karg. */
+	if (remote_gf != NULL) {
+		struct getinfo_fid2path *ori_gf;
+		size_t ori_len;
+		size_t seg_len;
+		char *ptr;
+
+		ori_gf = (struct getinfo_fid2path *)karg;
+		ori_len = strlen(ori_gf->gf_path);
+		seg_len = strlen(gf->gf_path);
+
+		/*
+		 * The result is "<segment>/<original>" plus its NUL, i.e.
+		 * seg_len + 1 + ori_len + 1 bytes; all of it must fit.
+		 */
+		if (seg_len + 1 + ori_len + 1 > ori_gf->gf_pathlen)
+			GOTO(out_fid2path, rc = -EOVERFLOW);
+
+		ptr = ori_gf->gf_path;
+
+		/* Shift the existing path, terminator included, to the right
+		 * to make room for the segment and the separator. */
+		memmove(ptr + seg_len + 1, ptr, ori_len + 1);
+
+		memcpy(ptr, gf->gf_path, seg_len);
+		ptr[seg_len] = '/';
+	}
+
+	CDEBUG(D_INFO, "%s: get path %s "DFID" rec: "LPU64" ln: %u\n",
+	       tgt->ltd_exp->exp_obd->obd_name,
+	       gf->gf_path, PFID(&gf->gf_fid), gf->gf_recno,
+	       gf->gf_linkno);
+
+	if (rc == 0)
+		GOTO(out_fid2path, rc);
+
+	/* rc == -EREMOTE: path building has to continue on another MDT */
+	if (remote_gf == NULL) {
+		remote_gf_size = sizeof(*remote_gf) + PATH_MAX;
+		OBD_ALLOC(remote_gf, remote_gf_size);
+		if (remote_gf == NULL)
+			GOTO(out_fid2path, rc = -ENOMEM);
+		remote_gf->gf_pathlen = PATH_MAX;
+	}
+
+	if (!fid_is_sane(&gf->gf_fid)) {
+		CERROR("%s: invalid FID "DFID": rc = %d\n",
+		       tgt->ltd_exp->exp_obd->obd_name,
+		       PFID(&gf->gf_fid), -EINVAL);
+		GOTO(out_fid2path, rc = -EINVAL);
+	}
+
+	tgt = lmv_find_target(lmv, &gf->gf_fid);
+	if (IS_ERR(tgt))
+		GOTO(out_fid2path, rc = -EINVAL);
+
+	/* Restart the lookup on the next target with a clean buffer. */
+	remote_gf->gf_fid = gf->gf_fid;
+	remote_gf->gf_recno = -1;
+	remote_gf->gf_linkno = -1;
+	memset(remote_gf->gf_path, 0, remote_gf->gf_pathlen);
+	gf = remote_gf;
+	goto repeat_fid2path;
+
+out_fid2path:
+	if (remote_gf != NULL)
+		OBD_FREE(remote_gf, remote_gf_size);
+	RETURN(rc);
+}
+
+/*
+ * ioctl dispatcher of the LMV device behind @exp.
+ *
+ * Returns -ENOTTY when no target is configured. Commands handled here:
+ *  - IOC_OBD_STATFS: statfs of the target whose index is read from
+ *    ioc_inlbuf2; UUID and statfs data are copied to user space.
+ *  - OBD_IOC_QUOTACTL: forwarded to the target selected by index
+ *    (QC_MDTIDX) or by UUID (QC_UUID).
+ *  - OBD_IOC_CHANGELOG_SEND / OBD_IOC_CHANGELOG_CLEAR: forwarded to the
+ *    target icc_mdtindex.
+ *  - LL_IOC_GET_CONNECT_FLAGS: forwarded to target 0.
+ *  - OBD_IOC_FID2PATH: handled by lmv_fid2path().
+ *  - LL_IOC_HSM_STATE_GET/SET, LL_IOC_HSM_ACTION, LL_IOC_LOV_SWAP_LAYOUTS:
+ *    forwarded to the target of op_fid1, which must also be the target
+ *    of op_fid2.
+ *  - anything else: sent to every connected target; -EIO when no target
+ *    accepted the command and no other error was recorded.
+ */
+static int lmv_iocontrol(unsigned int cmd, struct obd_export *exp,
+			 int len, void *karg, void *uarg)
+{
+	struct obd_device    *obddev = class_exp2obd(exp);
+	struct lmv_obd       *lmv = &obddev->u.lmv;
+	int		   i = 0;
+	int		   rc = 0;
+	int		   set = 0;
+	int		   count = lmv->desc.ld_tgt_count;
+	ENTRY;
+
+	if (count == 0)
+		RETURN(-ENOTTY);
+
+	switch (cmd) {
+	case IOC_OBD_STATFS: {
+		struct obd_ioctl_data *data = karg;
+		struct obd_device *mdc_obd;
+		struct obd_statfs stat_buf = {0};
+		__u32 index;
+
+		memcpy(&index, data->ioc_inlbuf2, sizeof(__u32));
+		if (index >= count)
+			RETURN(-ENODEV);
+
+		if (lmv->tgts[index] == NULL ||
+		    lmv->tgts[index]->ltd_active == 0)
+			RETURN(-ENODATA);
+
+		mdc_obd = class_exp2obd(lmv->tgts[index]->ltd_exp);
+		if (!mdc_obd)
+			RETURN(-EINVAL);
+
+		/* copy UUID */
+		if (copy_to_user(data->ioc_pbuf2, obd2cli_tgt(mdc_obd),
+				     min((int) data->ioc_plen2,
+					 (int) sizeof(struct obd_uuid))))
+			RETURN(-EFAULT);
+
+		rc = obd_statfs(NULL, lmv->tgts[index]->ltd_exp, &stat_buf,
+				cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+				0);
+		if (rc)
+			RETURN(rc);
+		if (copy_to_user(data->ioc_pbuf1, &stat_buf,
+				     min((int) data->ioc_plen1,
+					 (int) sizeof(stat_buf))))
+			RETURN(-EFAULT);
+		break;
+	}
+	case OBD_IOC_QUOTACTL: {
+		struct if_quotactl *qctl = karg;
+		struct lmv_tgt_desc *tgt = NULL;
+		struct obd_quotactl *oqctl;
+
+		if (qctl->qc_valid == QC_MDTIDX) {
+			if (qctl->qc_idx < 0 || count <= qctl->qc_idx)
+				RETURN(-EINVAL);
+
+			tgt = lmv->tgts[qctl->qc_idx];
+			if (tgt == NULL || tgt->ltd_exp == NULL)
+				RETURN(-EINVAL);
+		} else if (qctl->qc_valid == QC_UUID) {
+			for (i = 0; i < count; i++) {
+				tgt = lmv->tgts[i];
+				if (tgt == NULL)
+					continue;
+				if (!obd_uuid_equals(&tgt->ltd_uuid,
+						     &qctl->obd_uuid))
+					continue;
+
+				if (tgt->ltd_exp == NULL)
+					RETURN(-EINVAL);
+
+				break;
+			}
+		} else {
+			RETURN(-EINVAL);
+		}
+
+		/* The UUID scan ran off the end: no target matched. */
+		if (i >= count)
+			RETURN(-EAGAIN);
+
+		LASSERT(tgt && tgt->ltd_exp);
+		OBD_ALLOC_PTR(oqctl);
+		if (!oqctl)
+			RETURN(-ENOMEM);
+
+		QCTL_COPY(oqctl, qctl);
+		rc = obd_quotactl(tgt->ltd_exp, oqctl);
+		if (rc == 0) {
+			QCTL_COPY(qctl, oqctl);
+			qctl->qc_valid = QC_MDTIDX;
+			qctl->obd_uuid = tgt->ltd_uuid;
+		}
+		OBD_FREE_PTR(oqctl);
+		break;
+	}
+	case OBD_IOC_CHANGELOG_SEND:
+	case OBD_IOC_CHANGELOG_CLEAR: {
+		struct ioc_changelog *icc = karg;
+
+		if (icc->icc_mdtindex >= count)
+			RETURN(-ENODEV);
+
+		if (lmv->tgts[icc->icc_mdtindex] == NULL ||
+		    lmv->tgts[icc->icc_mdtindex]->ltd_exp == NULL ||
+		    lmv->tgts[icc->icc_mdtindex]->ltd_active == 0)
+			RETURN(-ENODEV);
+		rc = obd_iocontrol(cmd, lmv->tgts[icc->icc_mdtindex]->ltd_exp,
+				   sizeof(*icc), icc, NULL);
+		break;
+	}
+	case LL_IOC_GET_CONNECT_FLAGS: {
+		/*
+		 * As in the other cases, a target that exists but has no
+		 * export must not be handed to obd_iocontrol().
+		 */
+		if (lmv->tgts[0] == NULL || lmv->tgts[0]->ltd_exp == NULL)
+			RETURN(-ENODATA);
+		rc = obd_iocontrol(cmd, lmv->tgts[0]->ltd_exp, len, karg, uarg);
+		break;
+	}
+	case OBD_IOC_FID2PATH: {
+		rc = lmv_fid2path(exp, len, karg, uarg);
+		break;
+	}
+	case LL_IOC_HSM_STATE_GET:
+	case LL_IOC_HSM_STATE_SET:
+	case LL_IOC_HSM_ACTION:
+	case LL_IOC_LOV_SWAP_LAYOUTS: {
+		struct md_op_data	*op_data = karg;
+		struct lmv_tgt_desc	*tgt1, *tgt2;
+
+		tgt1 = lmv_find_target(lmv, &op_data->op_fid1);
+		if (IS_ERR(tgt1))
+			RETURN(PTR_ERR(tgt1));
+
+		tgt2 = lmv_find_target(lmv, &op_data->op_fid2);
+		if (IS_ERR(tgt2))
+			RETURN(PTR_ERR(tgt2));
+
+		if ((tgt1->ltd_exp == NULL) || (tgt2->ltd_exp == NULL))
+			RETURN(-EINVAL);
+
+		/* only files on same MDT can have their layouts swapped */
+		if (tgt1->ltd_idx != tgt2->ltd_idx)
+			RETURN(-EPERM);
+
+		rc = obd_iocontrol(cmd, tgt1->ltd_exp, len, karg, uarg);
+		break;
+	}
+	default:
+		for (i = 0; i < count; i++) {
+			struct obd_device *mdc_obd;
+			int err;
+
+			if (lmv->tgts[i] == NULL ||
+			    lmv->tgts[i]->ltd_exp == NULL)
+				continue;
+			/* ll_umount_begin() sets force flag but for lmv, not
+			 * mdc. Let's pass it through */
+			mdc_obd = class_exp2obd(lmv->tgts[i]->ltd_exp);
+			mdc_obd->obd_force = obddev->obd_force;
+			err = obd_iocontrol(cmd, lmv->tgts[i]->ltd_exp, len,
+					    karg, uarg);
+			if (err == -ENODATA && cmd == OBD_IOC_POLL_QUOTACHECK) {
+				RETURN(err);
+			} else if (err) {
+				/* Errors from inactive targets are ignored;
+				 * the first error of an active one wins. */
+				if (lmv->tgts[i]->ltd_active) {
+					CERROR("error: iocontrol MDC %s on MDT "
+					       "idx %d cmd %x: err = %d\n",
+						lmv->tgts[i]->ltd_uuid.uuid,
+						i, cmd, err);
+					if (!rc)
+						rc = err;
+				}
+			} else
+				set = 1;
+		}
+		if (!set && !rc)
+			rc = -EIO;
+	}
+	RETURN(rc);
+}
+
+/*
+ * Everything between #if 0 and #endif is excluded from compilation:
+ * three name/NID based placement helpers.
+ * NOTE(review): lmv_nid_policy() accesses lmv->tgts[0] with '.', while
+ * the compiled code in this file treats tgts[] entries as pointers; the
+ * block would presumably need updating before it could be re-enabled.
+ */
+#if 0
+/* Sum of the @len bytes of @name, reduced modulo @count. */
+static int lmv_all_chars_policy(int count, const char *name,
+				int len)
+{
+	unsigned int c = 0;
+
+	while (len > 0)
+		c += name[--len];
+	c = c % count;
+	return c;
+}
+
+/* Fold c_self of target 0's import connection, modulo the target count. */
+static int lmv_nid_policy(struct lmv_obd *lmv)
+{
+	struct obd_import *imp;
+	__u32	      id;
+
+	/*
+	 * XXX: To get nid we assume that underlying obd device is mdc.
+	 */
+	imp = class_exp2cliimp(lmv->tgts[0].ltd_exp);
+	id = imp->imp_connection->c_self ^ (imp->imp_connection->c_self >> 32);
+	return id % lmv->desc.ld_tgt_count;
+}
+
+/* Dispatch on @placement; unknown policies are logged and give -EINVAL. */
+static int lmv_choose_mds(struct lmv_obd *lmv, struct md_op_data *op_data,
+			  placement_policy_t placement)
+{
+	switch (placement) {
+	case PLACEMENT_CHAR_POLICY:
+		return lmv_all_chars_policy(lmv->desc.ld_tgt_count,
+					    op_data->op_name,
+					    op_data->op_namelen);
+	case PLACEMENT_NID_POLICY:
+		return lmv_nid_policy(lmv);
+
+	default:
+		break;
+	}
+
+	CERROR("Unsupported placement policy %x\n", placement);
+	return -EINVAL;
+}
+#endif
+
+/**
+ * Inode (not name) placement policy: pick the MDS index, stored in \a mds,
+ * on which a new object is to be created.
+ *
+ * With a single target the answer is always 0. A stripe offset supplied
+ * through setdirstripe is honoured when it is within the target count and
+ * rejected with -ERANGE otherwise. In all other cases the index already
+ * recorded in op_data->op_mds is used.
+ */
+static int lmv_placement_policy(struct obd_device *obd,
+				struct md_op_data *op_data,
+				mdsno_t *mds)
+{
+	struct lmv_desc *desc = &obd->u.lmv.desc;
+	struct lmv_user_md *lum;
+	ENTRY;
+
+	LASSERT(mds != NULL);
+
+	/* Nothing to choose between when only one target exists. */
+	if (desc->ld_tgt_count == 1) {
+		*mds = 0;
+		RETURN(0);
+	}
+
+	/**
+	 * If stripe_offset is provided during setdirstripe
+	 * (setdirstripe -i xx), MDS xx will be chosen.
+	 */
+	if (op_data->op_cli_flags & CLI_SET_MEA) {
+		lum = (struct lmv_user_md *)op_data->op_data;
+		if (lum->lum_type == LMV_STRIPE_TYPE &&
+		    lum->lum_stripe_offset != -1) {
+			if (lum->lum_stripe_offset < desc->ld_tgt_count) {
+				*mds = lum->lum_stripe_offset;
+				RETURN(0);
+			}
+
+			CERROR("%s: Stripe_offset %d > MDT count %d:"
+			       " rc = %d\n", obd->obd_name,
+			       lum->lum_stripe_offset,
+			       desc->ld_tgt_count, -ERANGE);
+			RETURN(-ERANGE);
+		}
+	}
+
+	/* Fall back to the MDS recorded in op_data (parent home mds). */
+	*mds = op_data->op_mds;
+	RETURN(0);
+}
+
+/*
+ * Allocate a new fid on target @mds and store it in @fid.
+ *
+ * The allocation is done with the target's ltd_fid_mutex held. A positive
+ * return from obd_fid_alloc() is turned into 0. Returns -ENODEV when the
+ * target is inactive or has no export, or the error from
+ * lmv_get_target() / obd_fid_alloc().
+ */
+int __lmv_fid_alloc(struct lmv_obd *lmv, struct lu_fid *fid,
+		    mdsno_t mds)
+{
+	struct lmv_tgt_desc *target;
+	int err;
+	ENTRY;
+
+	target = lmv_get_target(lmv, mds);
+	if (IS_ERR(target))
+		RETURN(PTR_ERR(target));
+
+	/*
+	 * Sequence allocation and FLD setup have to be atomic, otherwise
+	 * the server may see a fid whose sequence it does not know yet.
+	 */
+	mutex_lock(&target->ltd_fid_mutex);
+
+	if (!(target->ltd_active != 0 && target->ltd_exp != NULL))
+		GOTO(out, err = -ENODEV);
+
+	/* Let the underlying target layer hand out the fid. */
+	err = obd_fid_alloc(target->ltd_exp, fid, NULL);
+	if (err > 0) {
+		LASSERT(fid_is_sane(fid));
+		err = 0;
+	}
+
+	EXIT;
+out:
+	mutex_unlock(&target->ltd_fid_mutex);
+	return err;
+}
+
+/*
+ * Allocate a new fid for the operation described by @op_data: the MDS is
+ * chosen by lmv_placement_policy() and the fid is then obtained from it
+ * through __lmv_fid_alloc(). Failures of either step are logged and
+ * returned.
+ */
+int lmv_fid_alloc(struct obd_export *exp, struct lu_fid *fid,
+		  struct md_op_data *op_data)
+{
+	struct obd_device *lmv_dev = class_exp2obd(exp);
+	mdsno_t mds_idx = 0;
+	int err;
+	ENTRY;
+
+	LASSERT(op_data != NULL);
+	LASSERT(fid != NULL);
+
+	err = lmv_placement_policy(lmv_dev, op_data, &mds_idx);
+	if (err != 0) {
+		CERROR("Can't get target for allocating fid, "
+		       "rc %d\n", err);
+		RETURN(err);
+	}
+
+	err = __lmv_fid_alloc(&lmv_dev->u.lmv, fid, mds_idx);
+	if (err != 0)
+		CERROR("Can't alloc new fid, rc %d\n", err);
+
+	RETURN(err);
+}
+
+/*
+ * Set up the LMV device @obd from the configuration record @lcfg.
+ *
+ * Buffer 1 of @lcfg must hold a struct lmv_desc. A 32-entry target table
+ * is allocated, the descriptor counters and size fields are reset, the
+ * locks are initialised, the proc entries are created and the FLD client
+ * is initialised.
+ *
+ * Returns 0 on success, -EINVAL for a missing or short descriptor,
+ * -ENOMEM, or the error from fld_client_init().
+ */
+static int lmv_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
+{
+	struct lmv_obd	     *lmv = &obd->u.lmv;
+	struct lprocfs_static_vars  lvars;
+	struct lmv_desc	    *desc;
+	int			 rc;
+	ENTRY;
+
+	if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1) {
+		CERROR("LMV setup requires a descriptor\n");
+		RETURN(-EINVAL);
+	}
+
+	desc = (struct lmv_desc *)lustre_cfg_buf(lcfg, 1);
+	if (sizeof(*desc) > LUSTRE_CFG_BUFLEN(lcfg, 1)) {
+		CERROR("Lmv descriptor size wrong: %d > %d\n",
+		       (int)sizeof(*desc), LUSTRE_CFG_BUFLEN(lcfg, 1));
+		RETURN(-EINVAL);
+	}
+
+	/* Initial table size; lmv_add_target() grows it on demand. */
+	OBD_ALLOC(lmv->tgts, sizeof(*lmv->tgts) * 32);
+	if (lmv->tgts == NULL)
+		RETURN(-ENOMEM);
+	lmv->tgts_size = 32;
+
+	obd_str2uuid(&lmv->desc.ld_uuid, desc->ld_uuid.uuid);
+	lmv->desc.ld_tgt_count = 0;
+	lmv->desc.ld_active_tgt_count = 0;
+	lmv->max_cookiesize = 0;
+	lmv->max_def_easize = 0;
+	lmv->max_easize = 0;
+	lmv->lmv_placement = PLACEMENT_CHAR_POLICY;
+
+	spin_lock_init(&lmv->lmv_lock);
+	mutex_init(&lmv->init_mutex);
+
+	lprocfs_lmv_init_vars(&lvars);
+
+	lprocfs_obd_setup(obd, lvars.obd_vars);
+#ifdef LPROCFS
+	{
+		/* Failure to create the proc file is only warned about. */
+		rc = lprocfs_seq_create(obd->obd_proc_entry, "target_obd",
+					0444, &lmv_proc_target_fops, obd);
+		if (rc)
+			CWARN("%s: error adding LMV target_obd file: rc = %d\n",
+			       obd->obd_name, rc);
+       }
+#endif
+	rc = fld_client_init(&lmv->lmv_fld, obd->obd_name,
+			     LUSTRE_CLI_FLD_HASH_DHT);
+	if (rc) {
+		CERROR("Can't init FLD, err %d\n", rc);
+		/* NOTE(review): lmv->tgts is not freed on this path here;
+		 * confirm that the caller runs lmv_cleanup() on failure. */
+		GOTO(out, rc);
+	}
+
+	RETURN(0);
+
+out:
+	return rc;
+}
+
+/*
+ * Tear down the LMV device @obd: finalise the FLD client, free every
+ * target descriptor below ld_tgt_count and then the target table itself.
+ * Always returns 0.
+ */
+static int lmv_cleanup(struct obd_device *obd)
+{
+	struct lmv_obd *lmv = &obd->u.lmv;
+	int idx;
+	ENTRY;
+
+	fld_client_fini(&lmv->lmv_fld);
+
+	if (lmv->tgts == NULL)
+		RETURN(0);
+
+	/* lmv_del_target() ignores slots that are already empty. */
+	for (idx = 0; idx < lmv->desc.ld_tgt_count; idx++)
+		lmv_del_target(lmv, idx);
+
+	OBD_FREE(lmv->tgts, sizeof(*lmv->tgts) * lmv->tgts_size);
+	lmv->tgts_size = 0;
+
+	RETURN(0);
+}
+
+/*
+ * Handle a configuration record (@buf, a struct lustre_cfg) for the LMV
+ * device @obd.
+ *
+ * Only LCFG_ADD_MDC is understood: buffer 1 carries the target UUID,
+ * buffer 2 the target index and buffer 3 the generation, both as decimal
+ * text. The target is then added with lmv_add_target().
+ *
+ * Returns the result of lmv_add_target(), or -EINVAL for an over-long
+ * UUID, an unparsable number or an unknown command.
+ */
+static int lmv_process_config(struct obd_device *obd, obd_count len, void *buf)
+{
+	struct lustre_cfg	*lcfg = buf;
+	struct obd_uuid		obd_uuid;
+	int			gen;
+	__u32			index;
+	int			rc;
+	ENTRY;
+
+	switch (lcfg->lcfg_command) {
+	case LCFG_ADD_MDC:
+		/* modify_mdc_tgts add 0:lustre-clilmv  1:lustre-MDT0000_UUID
+		 * 2:0  3:1  4:lustre-MDT0000-mdc_UUID */
+		if (LUSTRE_CFG_BUFLEN(lcfg, 1) > sizeof(obd_uuid.uuid))
+			GOTO(out, rc = -EINVAL);
+
+		obd_str2uuid(&obd_uuid,  lustre_cfg_buf(lcfg, 1));
+
+		/* index is unsigned, so it is scanned with %u, not %d. */
+		if (sscanf(lustre_cfg_buf(lcfg, 2), "%u", &index) != 1)
+			GOTO(out, rc = -EINVAL);
+		if (sscanf(lustre_cfg_buf(lcfg, 3), "%d", &gen) != 1)
+			GOTO(out, rc = -EINVAL);
+		rc = lmv_add_target(obd, &obd_uuid, index, gen);
+		GOTO(out, rc);
+	default:
+		CERROR("Unknown command: %d\n", lcfg->lcfg_command);
+		GOTO(out, rc = -EINVAL);
+	}
+out:
+	RETURN(rc);
+}
+
+/*
+ * Collect statfs data from the connected targets into @osfs.
+ *
+ * The result of target 0 is copied as a whole; for every further target
+ * os_bavail, os_blocks, os_ffree and os_files are added to it. With
+ * OBD_STATFS_FOR_MDT0 in @flags the function returns right after
+ * target 0. The first obd_statfs() failure aborts the loop and is
+ * returned.
+ */
+static int lmv_statfs(const struct lu_env *env, struct obd_export *exp,
+		      struct obd_statfs *osfs, __u64 max_age, __u32 flags)
+{
+	struct obd_device     *obd = class_exp2obd(exp);
+	struct lmv_obd	*lmv = &obd->u.lmv;
+	struct obd_statfs     *temp;
+	int		    rc = 0;
+	int		    i;
+	ENTRY;
+
+	rc = lmv_check_connect(obd);
+	if (rc)
+		RETURN(rc);
+
+	/* Scratch buffer receiving each target's reply in turn. */
+	OBD_ALLOC(temp, sizeof(*temp));
+	if (temp == NULL)
+		RETURN(-ENOMEM);
+
+	for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+		if (lmv->tgts[i] == NULL || lmv->tgts[i]->ltd_exp == NULL)
+			continue;
+
+		rc = obd_statfs(env, lmv->tgts[i]->ltd_exp, temp,
+				max_age, flags);
+		if (rc) {
+			CERROR("can't stat MDS #%d (%s), error %d\n", i,
+			       lmv->tgts[i]->ltd_exp->exp_obd->obd_name,
+			       rc);
+			GOTO(out_free_temp, rc);
+		}
+
+		/* NOTE(review): *osfs is only assigned for i == 0; if slot 0
+		 * is skipped above, later targets are added to whatever the
+		 * caller passed in. Confirm that slot 0 is always usable. */
+		if (i == 0) {
+			*osfs = *temp;
+			/* If the statfs is issued by mount, only the
+			 * information from MDT0 is needed, i.e. mount does
+			 * not need the osfs merged from all MDTs.
+			 * Clients can also be mounted as long as MDT0 is in
+			 * service. */
+			if (flags & OBD_STATFS_FOR_MDT0)
+				GOTO(out_free_temp, rc);
+		} else {
+			osfs->os_bavail += temp->os_bavail;
+			osfs->os_blocks += temp->os_blocks;
+			osfs->os_ffree += temp->os_ffree;
+			osfs->os_files += temp->os_files;
+		}
+	}
+
+	EXIT;
+out_free_temp:
+	OBD_FREE(temp, sizeof(*temp));
+	return rc;
+}
+
+/*
+ * Forward a getstatus request to target 0 once the LMV is connected.
+ * Returns the connect error or the result of md_getstatus().
+ */
+static int lmv_getstatus(struct obd_export *exp,
+			 struct lu_fid *fid,
+			 struct obd_capa **pc)
+{
+	struct obd_device *lmv_dev = exp->exp_obd;
+	int err;
+	ENTRY;
+
+	err = lmv_check_connect(lmv_dev);
+	if (err == 0)
+		err = md_getstatus(lmv_dev->u.lmv.tgts[0]->ltd_exp, fid, pc);
+
+	RETURN(err);
+}
+
+/*
+ * Forward a getxattr request: make sure the LMV is connected, resolve the
+ * target for @fid with lmv_find_target() and hand the call on to
+ * md_getxattr() on that target's export.
+ */
+static int lmv_getxattr(struct obd_export *exp, const struct lu_fid *fid,
+			struct obd_capa *oc, obd_valid valid, const char *name,
+			const char *input, int input_size, int output_size,
+			int flags, struct ptlrpc_request **request)
+{
+	struct obd_device *lmv_dev = exp->exp_obd;
+	struct lmv_tgt_desc *target;
+	int err;
+	ENTRY;
+
+	err = lmv_check_connect(lmv_dev);
+	if (err != 0)
+		RETURN(err);
+
+	target = lmv_find_target(&lmv_dev->u.lmv, fid);
+	if (IS_ERR(target))
+		RETURN(PTR_ERR(target));
+
+	err = md_getxattr(target->ltd_exp, fid, oc, valid, name, input,
+			  input_size, output_size, flags, request);
+	RETURN(err);
+}
+
+/*
+ * Forward a setxattr request: make sure the LMV is connected, resolve the
+ * target for @fid with lmv_find_target() and hand the call on to
+ * md_setxattr() on that target's export.
+ */
+static int lmv_setxattr(struct obd_export *exp, const struct lu_fid *fid,
+			struct obd_capa *oc, obd_valid valid, const char *name,
+			const char *input, int input_size, int output_size,
+			int flags, __u32 suppgid,
+			struct ptlrpc_request **request)
+{
+	struct obd_device *lmv_dev = exp->exp_obd;
+	struct lmv_tgt_desc *target;
+	int err;
+	ENTRY;
+
+	err = lmv_check_connect(lmv_dev);
+	if (err != 0)
+		RETURN(err);
+
+	target = lmv_find_target(&lmv_dev->u.lmv, fid);
+	if (IS_ERR(target))
+		RETURN(PTR_ERR(target));
+
+	err = md_setxattr(target->ltd_exp, fid, oc, valid, name, input,
+			  input_size, output_size, flags, suppgid,
+			  request);
+	RETURN(err);
+}
+
+/*
+ * Forward a getattr request to the target found for op_data->op_fid1.
+ * When MF_GET_MDT_IDX is set in op_flags, no request is sent: the
+ * target's index is stored in op_data->op_mds and 0 is returned.
+ */
+static int lmv_getattr(struct obd_export *exp, struct md_op_data *op_data,
+		       struct ptlrpc_request **request)
+{
+	struct obd_device *lmv_dev = exp->exp_obd;
+	struct lmv_tgt_desc *target;
+	int err;
+	ENTRY;
+
+	err = lmv_check_connect(lmv_dev);
+	if (err != 0)
+		RETURN(err);
+
+	target = lmv_find_target(&lmv_dev->u.lmv, &op_data->op_fid1);
+	if (IS_ERR(target))
+		RETURN(PTR_ERR(target));
+
+	/* The caller only wants to know which MDT holds the object. */
+	if (op_data->op_flags & MF_GET_MDT_IDX) {
+		op_data->op_mds = target->ltd_idx;
+		RETURN(0);
+	}
+
+	err = md_getattr(target->ltd_exp, op_data, request);
+	RETURN(err);
+}
+
+/*
+ * Call md_null_inode() for @fid on every connected target. The results
+ * of the individual calls are ignored; only a connect failure is
+ * reported.
+ */
+static int lmv_null_inode(struct obd_export *exp, const struct lu_fid *fid)
+{
+	struct obd_device *lmv_dev = exp->exp_obd;
+	struct lmv_obd *lmv = &lmv_dev->u.lmv;
+	struct lmv_tgt_desc *target;
+	int idx;
+	int err;
+	ENTRY;
+
+	err = lmv_check_connect(lmv_dev);
+	if (err != 0)
+		RETURN(err);
+
+	CDEBUG(D_INODE, "CBDATA for "DFID"\n", PFID(fid));
+
+	/*
+	 * With DNE an object may hold two locks in different namespaces:
+	 * the lookup lock on the MDT storing the direntry and the
+	 * update/open lock on the MDT storing the inode. Visit them all.
+	 */
+	for (idx = 0; idx < lmv->desc.ld_tgt_count; idx++) {
+		target = lmv->tgts[idx];
+		if (target != NULL && target->ltd_exp != NULL)
+			md_null_inode(target->ltd_exp, fid);
+	}
+
+	RETURN(0);
+}
+
+/*
+ * Run md_find_cbdata() for @fid on each connected target in turn and
+ * stop at the first non-zero result, which is returned.
+ */
+static int lmv_find_cbdata(struct obd_export *exp, const struct lu_fid *fid,
+			   ldlm_iterator_t it, void *data)
+{
+	struct obd_device *lmv_dev = exp->exp_obd;
+	struct lmv_obd *lmv = &lmv_dev->u.lmv;
+	struct lmv_tgt_desc *target;
+	int idx;
+	int err;
+	ENTRY;
+
+	err = lmv_check_connect(lmv_dev);
+	if (err != 0)
+		RETURN(err);
+
+	CDEBUG(D_INODE, "CBDATA for "DFID"\n", PFID(fid));
+
+	/*
+	 * With DNE an object may hold two locks in different namespaces:
+	 * the lookup lock on the MDT storing the direntry and the
+	 * update/open lock on the MDT storing the inode. Visit them all.
+	 */
+	for (idx = 0; idx < lmv->desc.ld_tgt_count; idx++) {
+		target = lmv->tgts[idx];
+		if (target == NULL || target->ltd_exp == NULL)
+			continue;
+
+		err = md_find_cbdata(target->ltd_exp, fid, it, data);
+		if (err != 0)
+			RETURN(err);
+	}
+
+	RETURN(err);
+}
+
+
+/*
+ * Forward a close request to the target found for op_data->op_fid1.
+ */
+static int lmv_close(struct obd_export *exp, struct md_op_data *op_data,
+		     struct md_open_data *mod, struct ptlrpc_request **request)
+{
+	struct obd_device *lmv_dev = exp->exp_obd;
+	struct lmv_tgt_desc *target;
+	int err;
+	ENTRY;
+
+	err = lmv_check_connect(lmv_dev);
+	if (err != 0)
+		RETURN(err);
+
+	target = lmv_find_target(&lmv_dev->u.lmv, &op_data->op_fid1);
+	if (IS_ERR(target))
+		RETURN(PTR_ERR(target));
+
+	CDEBUG(D_INODE, "CLOSE "DFID"\n", PFID(&op_data->op_fid1));
+	err = md_close(target->ltd_exp, op_data, mod, request);
+	RETURN(err);
+}
+
+/*
+ * Look up the target for @fid and, on success, record its index in
+ * op_data->op_mds. Returns the target, or the ERR_PTR value from
+ * lmv_find_target() unchanged.
+ */
+struct lmv_tgt_desc
+*lmv_locate_mds(struct lmv_obd *lmv, struct md_op_data *op_data,
+		struct lu_fid *fid)
+{
+	struct lmv_tgt_desc *target = lmv_find_target(lmv, fid);
+
+	if (!IS_ERR(target))
+		op_data->op_mds = target->ltd_idx;
+
+	return target;
+}
+
+/*
+ * Create a new object under the parent op_data->op_fid1.
+ *
+ * The target of the parent is located, a fid for the new object is
+ * allocated into op_data->op_fid2 and the create request is sent to the
+ * parent's target. Returns -EIO when no target is active.
+ */
+int lmv_create(struct obd_export *exp, struct md_op_data *op_data,
+	       const void *data, int datalen, int mode, __u32 uid,
+	       __u32 gid, cfs_cap_t cap_effective, __u64 rdev,
+	       struct ptlrpc_request **request)
+{
+	struct obd_device *lmv_dev = exp->exp_obd;
+	struct lmv_obd *lmv = &lmv_dev->u.lmv;
+	struct lmv_tgt_desc *parent_tgt;
+	int err;
+	ENTRY;
+
+	err = lmv_check_connect(lmv_dev);
+	if (err != 0)
+		RETURN(err);
+
+	if (lmv->desc.ld_active_tgt_count == 0)
+		RETURN(-EIO);
+
+	parent_tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1);
+	if (IS_ERR(parent_tgt))
+		RETURN(PTR_ERR(parent_tgt));
+
+	err = lmv_fid_alloc(exp, &op_data->op_fid2, op_data);
+	if (err != 0)
+		RETURN(err);
+
+	CDEBUG(D_INODE, "CREATE '%*s' on "DFID" -> mds #%x\n",
+	       op_data->op_namelen, op_data->op_name, PFID(&op_data->op_fid1),
+	       op_data->op_mds);
+
+	op_data->op_flags |= MF_MDC_CANCEL_FID1;
+	err = md_create(parent_tgt->ltd_exp, op_data, data, datalen, mode,
+			uid, gid, cap_effective, rdev, request);
+
+	/* Only trace the new fid when a reply was actually handed back. */
+	if (err == 0 && *request != NULL) {
+		CDEBUG(D_INODE, "Created - "DFID"\n", PFID(&op_data->op_fid2));
+	}
+
+	RETURN(err);
+}
+
+/*
+ * Forward a done_writing request to the target found for
+ * op_data->op_fid1.
+ */
+static int lmv_done_writing(struct obd_export *exp,
+			    struct md_op_data *op_data,
+			    struct md_open_data *mod)
+{
+	struct obd_device *lmv_dev = exp->exp_obd;
+	struct lmv_tgt_desc *target;
+	int err;
+	ENTRY;
+
+	err = lmv_check_connect(lmv_dev);
+	if (err != 0)
+		RETURN(err);
+
+	target = lmv_find_target(&lmv_dev->u.lmv, &op_data->op_fid1);
+	if (IS_ERR(target))
+		RETURN(PTR_ERR(target));
+
+	err = md_done_writing(target->ltd_exp, op_data, mod);
+	RETURN(err);
+}
+
+/*
+ * Second stage of an enqueue whose reply refers to an object on another
+ * MDT (OBD_MD_MDS set in the reply body).
+ *
+ * The reply held in @it is released, the lock described by @lockh is
+ * remembered, and a new enqueue carrying MDS_CROSS_REF is sent to the
+ * target found for the fid taken from the reply. The remembered lock is
+ * dropped with ldlm_lock_decref() before returning, whatever the outcome.
+ *
+ * @extra_lock_flags is __u64 so that the caller's 64-bit flags reach
+ * md_enqueue() without truncation.
+ *
+ * Returns 0 when the reply is not a remote reference, otherwise the
+ * result of the second md_enqueue(), -ENOMEM, or the error from
+ * lmv_find_target().
+ */
+static int
+lmv_enqueue_remote(struct obd_export *exp, struct ldlm_enqueue_info *einfo,
+		   struct lookup_intent *it, struct md_op_data *op_data,
+		   struct lustre_handle *lockh, void *lmm, int lmmsize,
+		   __u64 extra_lock_flags)
+{
+	struct ptlrpc_request      *req = it->d.lustre.it_data;
+	struct obd_device	  *obd = exp->exp_obd;
+	struct lmv_obd	     *lmv = &obd->u.lmv;
+	struct lustre_handle	plock;
+	struct lmv_tgt_desc	*tgt;
+	struct md_op_data	  *rdata;
+	struct lu_fid	       fid1;
+	struct mdt_body	    *body;
+	int			 rc = 0;
+	int			 pmode;
+	ENTRY;
+
+	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+	LASSERT(body != NULL);
+
+	if (!(body->valid & OBD_MD_MDS))
+		RETURN(0);
+
+	CDEBUG(D_INODE, "REMOTE_ENQUEUE '%s' on "DFID" -> "DFID"\n",
+	       LL_IT2STR(it), PFID(&op_data->op_fid1), PFID(&body->fid1));
+
+	/*
+	 * We got LOOKUP lock, but we really need attrs.
+	 */
+	pmode = it->d.lustre.it_lock_mode;
+	LASSERT(pmode != 0);
+	memcpy(&plock, lockh, sizeof(plock));
+	it->d.lustre.it_lock_mode = 0;
+	it->d.lustre.it_data = NULL;
+	/* Copy the fid out before the reply it lives in is released. */
+	fid1 = body->fid1;
+
+	it->d.lustre.it_disposition &= ~DISP_ENQ_COMPLETE;
+	ptlrpc_req_finished(req);
+
+	tgt = lmv_find_target(lmv, &fid1);
+	if (IS_ERR(tgt))
+		GOTO(out, rc = PTR_ERR(tgt));
+
+	OBD_ALLOC_PTR(rdata);
+	if (rdata == NULL)
+		GOTO(out, rc = -ENOMEM);
+
+	rdata->op_fid1 = fid1;
+	rdata->op_bias = MDS_CROSS_REF;
+
+	rc = md_enqueue(tgt->ltd_exp, einfo, it, rdata, lockh,
+			lmm, lmmsize, NULL, extra_lock_flags);
+	OBD_FREE_PTR(rdata);
+	EXIT;
+out:
+	ldlm_lock_decref(&plock, pmode);
+	return rc;
+}
+
+/*
+ * Enqueue a lock/intent on the target of op_data->op_fid1. For a
+ * successful IT_OPEN intent, lmv_enqueue_remote() is run afterwards to
+ * follow a possible reference to another MDT.
+ */
+static int
+lmv_enqueue(struct obd_export *exp, struct ldlm_enqueue_info *einfo,
+	    struct lookup_intent *it, struct md_op_data *op_data,
+	    struct lustre_handle *lockh, void *lmm, int lmmsize,
+	    struct ptlrpc_request **req, __u64 extra_lock_flags)
+{
+	struct obd_device *lmv_dev = exp->exp_obd;
+	struct lmv_tgt_desc *target;
+	int err;
+	ENTRY;
+
+	err = lmv_check_connect(lmv_dev);
+	if (err != 0)
+		RETURN(err);
+
+	CDEBUG(D_INODE, "ENQUEUE '%s' on "DFID"\n",
+	       LL_IT2STR(it), PFID(&op_data->op_fid1));
+
+	target = lmv_locate_mds(&lmv_dev->u.lmv, op_data, &op_data->op_fid1);
+	if (IS_ERR(target))
+		RETURN(PTR_ERR(target));
+
+	CDEBUG(D_INODE, "ENQUEUE '%s' on "DFID" -> mds #%d\n",
+	       LL_IT2STR(it), PFID(&op_data->op_fid1), target->ltd_idx);
+
+	err = md_enqueue(target->ltd_exp, einfo, it, op_data, lockh,
+			 lmm, lmmsize, req, extra_lock_flags);
+
+	/* Anything but a successful open intent is finished here. */
+	if (err != 0 || it == NULL || it->it_op != IT_OPEN)
+		RETURN(err);
+
+	err = lmv_enqueue_remote(exp, einfo, it, op_data, lockh,
+				 lmm, lmmsize, extra_lock_flags);
+	RETURN(err);
+}
+
+/*
+ * getattr by name under the parent op_data->op_fid1.
+ *
+ * The request goes to the parent's target first. If the reply has
+ * OBD_MD_MDS set, a second md_getattr_name() with OBD_MD_FLCROSSREF and
+ * no name is sent to the target found for the fid in the reply; the
+ * first reply is then released and *request is replaced by the second
+ * one.
+ */
+static int
+lmv_getattr_name(struct obd_export *exp,struct md_op_data *op_data,
+		 struct ptlrpc_request **request)
+{
+	struct ptlrpc_request   *req = NULL;
+	struct obd_device       *obd = exp->exp_obd;
+	struct lmv_obd	  *lmv = &obd->u.lmv;
+	struct lmv_tgt_desc     *tgt;
+	struct mdt_body	 *body;
+	int		      rc;
+	ENTRY;
+
+	rc = lmv_check_connect(obd);
+	if (rc)
+		RETURN(rc);
+
+	tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1);
+	if (IS_ERR(tgt))
+		RETURN(PTR_ERR(tgt));
+
+	CDEBUG(D_INODE, "GETATTR_NAME for %*s on "DFID" -> mds #%d\n",
+	       op_data->op_namelen, op_data->op_name, PFID(&op_data->op_fid1),
+	       tgt->ltd_idx);
+
+	rc = md_getattr_name(tgt->ltd_exp, op_data, request);
+	if (rc != 0)
+		RETURN(rc);
+
+	body = req_capsule_server_get(&(*request)->rq_pill,
+				      &RMF_MDT_BODY);
+	LASSERT(body != NULL);
+
+	if (body->valid & OBD_MD_MDS) {
+		/* Copy the fid; the reply holding it is released below. */
+		struct lu_fid rid = body->fid1;
+		CDEBUG(D_INODE, "Request attrs for "DFID"\n",
+		       PFID(&rid));
+
+		tgt = lmv_find_target(lmv, &rid);
+		if (IS_ERR(tgt)) {
+			/* NOTE(review): *request is released here but not
+			 * reset to NULL; confirm callers do not touch it
+			 * after an error return. */
+			ptlrpc_req_finished(*request);
+			RETURN(PTR_ERR(tgt));
+		}
+
+		op_data->op_fid1 = rid;
+		op_data->op_valid |= OBD_MD_FLCROSSREF;
+		op_data->op_namelen = 0;
+		op_data->op_name = NULL;
+		rc = md_getattr_name(tgt->ltd_exp, op_data, &req);
+		/* Swap the first reply for the second (req stays NULL if
+		 * the call above did not set it). */
+		ptlrpc_req_finished(*request);
+		*request = req;
+	}
+
+	RETURN(rc);
+}
+
+/*
+ * Map one MF_MDC_CANCEL_FIDn flag to the address of the matching op_fidN
+ * member of @op_data; any other value of @fl yields NULL. Both arguments
+ * are parenthesised so that arbitrary expressions can be passed safely.
+ */
+#define md_op_data_fid(op_data, fl)				\
+	((fl) == MF_MDC_CANCEL_FID1 ? &(op_data)->op_fid1 :	\
+	 (fl) == MF_MDC_CANCEL_FID2 ? &(op_data)->op_fid2 :	\
+	 (fl) == MF_MDC_CANCEL_FID3 ? &(op_data)->op_fid3 :	\
+	 (fl) == MF_MDC_CANCEL_FID4 ? &(op_data)->op_fid4 :	\
+	 NULL)
+
+/*
+ * Early lock cancel for the fid selected by @flag in @op_data.
+ *
+ * If that fid lives on the operation target @op_tgt, nothing is sent:
+ * @flag is merely set in op_data->op_flags. Otherwise the unused locks
+ * matching @mode and the inode bits @bits are cancelled asynchronously
+ * on the fid's own target. A fid that is not sane is skipped with 0.
+ */
+static int lmv_early_cancel(struct obd_export *exp, struct md_op_data *op_data,
+			    int op_tgt, ldlm_mode_t mode, int bits, int flag)
+{
+	struct lu_fid *fid = md_op_data_fid(op_data, flag);
+	struct obd_device *lmv_dev = exp->exp_obd;
+	struct lmv_tgt_desc *target;
+	ldlm_policy_data_t policy = {{0}};
+	int err;
+	ENTRY;
+
+	if (!fid_is_sane(fid))
+		RETURN(0);
+
+	target = lmv_find_target(&lmv_dev->u.lmv, fid);
+	if (IS_ERR(target))
+		RETURN(PTR_ERR(target));
+
+	if (target->ltd_idx == op_tgt) {
+		/* Same target as the operation: only mark the flag. */
+		CDEBUG(D_INODE,
+		       "EARLY_CANCEL skip operation target %d on "DFID"\n",
+		       op_tgt, PFID(fid));
+		op_data->op_flags |= flag;
+		RETURN(0);
+	}
+
+	CDEBUG(D_INODE, "EARLY_CANCEL on "DFID"\n", PFID(fid));
+	policy.l_inodebits.bits = bits;
+	err = md_cancel_unused(target->ltd_exp, fid, &policy,
+			       mode, LCF_ASYNC, NULL);
+	RETURN(err);
+}
+
+/*
+ * Create a hard link. llite passes the fid of the target inode in
+ * op_data->op_fid1 and the fid of the directory in op_data->op_fid2.
+ * The request is sent to the target of the directory (fid2).
+ */
+static int lmv_link(struct obd_export *exp, struct md_op_data *op_data,
+		    struct ptlrpc_request **request)
+{
+	struct obd_device *lmv_dev = exp->exp_obd;
+	struct lmv_tgt_desc *dir_tgt;
+	int err;
+	ENTRY;
+
+	err = lmv_check_connect(lmv_dev);
+	if (err != 0)
+		RETURN(err);
+
+	LASSERT(op_data->op_namelen != 0);
+
+	CDEBUG(D_INODE, "LINK "DFID":%*s to "DFID"\n",
+	       PFID(&op_data->op_fid2), op_data->op_namelen,
+	       op_data->op_name, PFID(&op_data->op_fid1));
+
+	/* Stamp the request with the caller's credentials. */
+	op_data->op_fsuid = current_fsuid();
+	op_data->op_fsgid = current_fsgid();
+	op_data->op_cap = cfs_curproc_cap_pack();
+
+	dir_tgt = lmv_locate_mds(&lmv_dev->u.lmv, op_data, &op_data->op_fid2);
+	if (IS_ERR(dir_tgt))
+		RETURN(PTR_ERR(dir_tgt));
+
+	/*
+	 * Cancel the UPDATE lock on the child (fid1); fid2 is flagged for
+	 * cancellation by the target itself.
+	 */
+	op_data->op_flags |= MF_MDC_CANCEL_FID2;
+	err = lmv_early_cancel(exp, op_data, dir_tgt->ltd_idx, LCK_EX,
+			       MDS_INODELOCK_UPDATE, MF_MDC_CANCEL_FID1);
+	if (err != 0)
+		RETURN(err);
+
+	err = md_link(dir_tgt->ltd_exp, op_data, request);
+	RETURN(err);
+}
+
+/*
+ * Rename @old in directory op_data->op_fid1 to @new in directory
+ * op_data->op_fid2.
+ *
+ * Locks on fid2 and fid4 are cancelled early (each step only if the
+ * previous one succeeded), then the rename is sent to the target of the
+ * source directory (fid1).
+ */
+static int lmv_rename(struct obd_export *exp, struct md_op_data *op_data,
+		      const char *old, int oldlen, const char *new, int newlen,
+		      struct ptlrpc_request **request)
+{
+	struct obd_device       *obd = exp->exp_obd;
+	struct lmv_obd	  *lmv = &obd->u.lmv;
+	struct lmv_tgt_desc     *src_tgt;
+	struct lmv_tgt_desc     *tgt_tgt;
+	int			rc;
+	ENTRY;
+
+	LASSERT(oldlen != 0);
+
+	CDEBUG(D_INODE, "RENAME %*s in "DFID" to %*s in "DFID"\n",
+	       oldlen, old, PFID(&op_data->op_fid1),
+	       newlen, new, PFID(&op_data->op_fid2));
+
+	rc = lmv_check_connect(obd);
+	if (rc)
+		RETURN(rc);
+
+	op_data->op_fsuid = current_fsuid();
+	op_data->op_fsgid = current_fsgid();
+	op_data->op_cap = cfs_curproc_cap_pack();
+	src_tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1);
+	if (IS_ERR(src_tgt))
+		RETURN(PTR_ERR(src_tgt));
+
+	/*
+	 * tgt_tgt is only checked for errors below; note that this second
+	 * lmv_locate_mds() call leaves op_data->op_mds set to the index of
+	 * the fid2 target.
+	 */
+	tgt_tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid2);
+	if (IS_ERR(tgt_tgt))
+		RETURN(PTR_ERR(tgt_tgt));
+	/*
+	 * LOOKUP lock on src child (fid3) should also be cancelled for
+	 * src_tgt in mdc_rename.
+	 */
+	op_data->op_flags |= MF_MDC_CANCEL_FID1 | MF_MDC_CANCEL_FID3;
+
+	/*
+	 * Cancel UPDATE locks on tgt parent (fid2), tgt_tgt is its
+	 * own target.
+	 */
+	rc = lmv_early_cancel(exp, op_data, src_tgt->ltd_idx,
+			      LCK_EX, MDS_INODELOCK_UPDATE,
+			      MF_MDC_CANCEL_FID2);
+
+	/*
+	 * Cancel LOOKUP locks on tgt child (fid4) for parent tgt_tgt.
+	 * NOTE(review): the comment names tgt_tgt, but the call passes
+	 * src_tgt->ltd_idx as the operation target; confirm the intent.
+	 */
+	if (rc == 0) {
+		rc = lmv_early_cancel(exp, op_data, src_tgt->ltd_idx,
+				      LCK_EX, MDS_INODELOCK_LOOKUP,
+				      MF_MDC_CANCEL_FID4);
+	}
+
+	/*
+	 * Cancel all the locks on tgt child (fid4).
+	 */
+	if (rc == 0)
+		rc = lmv_early_cancel(exp, op_data, src_tgt->ltd_idx,
+				      LCK_EX, MDS_INODELOCK_FULL,
+				      MF_MDC_CANCEL_FID4);
+
+	if (rc == 0)
+		rc = md_rename(src_tgt->ltd_exp, op_data, old, oldlen,
+			       new, newlen, request);
+	RETURN(rc);
+}
+
+/*
+ * Forward a SETATTR request to the target that lmv_find_target() returns
+ * for op_data->op_fid1.
+ *
+ * MF_MDC_CANCEL_FID1 is set in op_data->op_flags before the lookup.
+ * Returns the lmv_check_connect() error, PTR_ERR() of a failed lookup, or
+ * the result of md_setattr().
+ */
+static int lmv_setattr(struct obd_export *exp, struct md_op_data *op_data,
+		       void *ea, int ealen, void *ea2, int ea2len,
+		       struct ptlrpc_request **request,
+		       struct md_open_data **mod)
+{
+	struct obd_device *obd = exp->exp_obd;
+	struct lmv_obd *lmv = &obd->u.lmv;
+	struct lmv_tgt_desc *tgt;
+	int rc;
+	ENTRY;
+
+	rc = lmv_check_connect(obd);
+	if (rc != 0)
+		RETURN(rc);
+
+	CDEBUG(D_INODE, "SETATTR for "DFID", valid 0x%x\n",
+	       PFID(&op_data->op_fid1), op_data->op_attr.ia_valid);
+
+	op_data->op_flags |= MF_MDC_CANCEL_FID1;
+
+	tgt = lmv_find_target(lmv, &op_data->op_fid1);
+	if (IS_ERR(tgt))
+		RETURN(PTR_ERR(tgt));
+
+	RETURN(md_setattr(tgt->ltd_exp, op_data, ea, ealen, ea2,
+			  ea2len, request, mod));
+}
+
+/*
+ * Forward a SYNC request for \a fid to the target that lmv_find_target()
+ * returns for it.
+ *
+ * Returns the lmv_check_connect() error, PTR_ERR() of a failed lookup, or
+ * the result of md_sync().
+ */
+static int lmv_sync(struct obd_export *exp, const struct lu_fid *fid,
+		    struct obd_capa *oc, struct ptlrpc_request **request)
+{
+	struct obd_device *obd = exp->exp_obd;
+	struct lmv_obd *lmv = &obd->u.lmv;
+	struct lmv_tgt_desc *tgt;
+	int rc;
+	ENTRY;
+
+	rc = lmv_check_connect(obd);
+	if (rc != 0)
+		RETURN(rc);
+
+	tgt = lmv_find_target(lmv, fid);
+	if (IS_ERR(tgt))
+		RETURN(PTR_ERR(tgt));
+
+	RETURN(md_sync(tgt->ltd_exp, fid, oc, request));
+}
+
+/*
+ * Adjust a set of pages, each page containing an array of lu_dirpages,
+ * so that each page can be used as a single logical lu_dirpage.
+ *
+ * A lu_dirpage is laid out as follows, where s = ldp_hash_start,
+ * e = ldp_hash_end, f = ldp_flags, p = padding, and each "ent" is a
+ * struct lu_dirent.  It has size up to LU_PAGE_SIZE. The ldp_hash_end
+ * value is used as a cookie to request the next lu_dirpage in a
+ * directory listing that spans multiple pages (two in this example):
+ *   ________
+ *  |	|
+ * .|--------v-------   -----.
+ * |s|e|f|p|ent|ent| ... |ent|
+ * '--|--------------   -----'   Each CFS_PAGE contains a single
+ *    '------.		   lu_dirpage.
+ * .---------v-------   -----.
+ * |s|e|f|p|ent| 0 | ... | 0 |
+ * '-----------------   -----'
+ *
+ * However, on hosts where the native VM page size (PAGE_CACHE_SIZE) is
+ * larger than LU_PAGE_SIZE, a single host page may contain multiple
+ * lu_dirpages. After reading the lu_dirpages from the MDS, the
+ * ldp_hash_end of the first lu_dirpage refers to the one immediately
+ * after it in the same CFS_PAGE (arrows simplified for brevity, but
+ * in general e0==s1, e1==s2, etc.):
+ *
+ * .--------------------   -----.
+ * |s0|e0|f0|p|ent|ent| ... |ent|
+ * |---v----------------   -----|
+ * |s1|e1|f1|p|ent|ent| ... |ent|
+ * |---v----------------   -----|  Here, each CFS_PAGE contains
+ *	     ...		 multiple lu_dirpages.
+ * |---v----------------   -----|
+ * |s'|e'|f'|p|ent|ent| ... |ent|
+ * '---|----------------   -----'
+ *     v
+ * .----------------------------.
+ * |	next CFS_PAGE       |
+ *
+ * This structure is transformed into a single logical lu_dirpage as follows:
+ *
+ * - Replace e0 with e' so the request for the next lu_dirpage gets the page
+ *   labeled 'next CFS_PAGE'.
+ *
+ * - Copy the LDF_COLLIDE flag from f' to f0 to correctly reflect whether
+ *   a hash collision with the next page exists.
+ *
+ * - Adjust the lde_reclen of the ending entry of each lu_dirpage to span
+ *   to the first entry of the next lu_dirpage.
+ */
+#if PAGE_CACHE_SIZE > LU_PAGE_SIZE
+/*
+ * For each of the first \a ncfspgs pages in \a pages, chain the
+ * lu_dirpages stored in that page together and record the hash end and
+ * LDF_COLLIDE flag of the last one visited in the first one.
+ *
+ * \a nlupgs is a budget shared by all pages: it is decremented once for
+ * every lu_dirpage that has been linked to its successor, and no further
+ * linking is done once it drops to 1.
+ */
+static void lmv_adjust_dirpages(struct page **pages, int ncfspgs, int nlupgs)
+{
+	int pg;
+
+	for (pg = 0; pg < ncfspgs; pg++) {
+		struct lu_dirpage *cur = kmap(pages[pg]);
+		struct lu_dirpage *head = cur;
+		struct lu_dirent *last = NULL;
+		struct lu_dirent *ent;
+		__u64 hash_end = cur->ldp_hash_end;
+		__u32 flags = cur->ldp_flags;
+
+		while (nlupgs > 1) {
+			/* Walk to the last entry of the current lu_dirpage
+			 * (NULL when it has no entries). */
+			last = lu_dirent_start(cur);
+			for (ent = last; ent != NULL;
+			     ent = lu_dirent_next(ent))
+				last = ent;
+
+			/* Step to the next lu_dirpage. */
+			cur = (struct lu_dirpage *)((char *)cur + LU_PAGE_SIZE);
+
+			/* Stop when the step landed on a CFS_PAGE boundary. */
+			if (!((unsigned long)cur & ~CFS_PAGE_MASK))
+				break;
+
+			/* Remember the hash end and flags of this lu_dirpage. */
+			hash_end = cur->ldp_hash_end;
+			flags = cur->ldp_flags;
+
+			/* Nothing to link if the previous one was empty. */
+			if (last == NULL)
+				break;
+
+			/* Stretch the ending entry's lde_reclen from 0 so that
+			 * it reaches the first entry of this lu_dirpage. */
+			LASSERT(le16_to_cpu(last->lde_reclen) == 0);
+			last->lde_reclen =
+				cpu_to_le16((char *)(cur->ldp_entries) -
+					    (char *)last);
+
+			nlupgs--;
+		}
+
+		/* Publish the saved hash end and collision flag in the
+		 * first lu_dirpage of this page. */
+		head->ldp_hash_end = hash_end;
+		head->ldp_flags &= ~cpu_to_le32(LDF_COLLIDE);
+		head->ldp_flags |= flags & cpu_to_le32(LDF_COLLIDE);
+
+		kunmap(pages[pg]);
+	}
+}
+#else
+/* Nothing to adjust when a page does not exceed LU_PAGE_SIZE. */
+#define lmv_adjust_dirpages(pages, ncfspgs, nlupgs) do {} while (0)
+#endif	/* PAGE_CACHE_SIZE > LU_PAGE_SIZE */
+
+/*
+ * Read directory pages from the target that lmv_find_target() returns for
+ * op_data->op_fid1, then post-process them with lmv_adjust_dirpages().
+ *
+ * The page counts are derived from the bulk descriptor's
+ * bd_nob_transferred: rounded up to PAGE_CACHE_SIZE units and divided
+ * into LU_PAGE_SIZE units.  Returns the lmv_check_connect() error,
+ * PTR_ERR() of a failed lookup, or the result of md_readpage().
+ */
+static int lmv_readpage(struct obd_export *exp, struct md_op_data *op_data,
+			struct page **pages, struct ptlrpc_request **request)
+{
+	struct obd_device *obd = exp->exp_obd;
+	struct lmv_obd *lmv = &obd->u.lmv;
+	struct lmv_tgt_desc *tgt;
+	__u64 offset = op_data->op_offset;
+	int ncfspgs;	/* pages read, in PAGE_CACHE_SIZE units */
+	int nlupgs;	/* pages read, in LU_PAGE_SIZE units */
+	int rc;
+	ENTRY;
+
+	rc = lmv_check_connect(obd);
+	if (rc != 0)
+		RETURN(rc);
+
+	CDEBUG(D_INODE, "READPAGE at "LPX64" from "DFID"\n",
+	       offset, PFID(&op_data->op_fid1));
+
+	tgt = lmv_find_target(lmv, &op_data->op_fid1);
+	if (IS_ERR(tgt))
+		RETURN(PTR_ERR(tgt));
+
+	rc = md_readpage(tgt->ltd_exp, op_data, pages, request);
+	if (rc != 0)
+		RETURN(rc);
+
+	ncfspgs = ((*request)->rq_bulk->bd_nob_transferred +
+		   PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	nlupgs = (*request)->rq_bulk->bd_nob_transferred >> LU_PAGE_SHIFT;
+
+	/* The transfer must be a whole number of lu_dirpages ... */
+	LASSERT(!((*request)->rq_bulk->bd_nob_transferred & ~LU_PAGE_MASK));
+	/* ... and fit in the pages the caller supplied. */
+	LASSERT(ncfspgs > 0 && ncfspgs <= op_data->op_npages);
+
+	CDEBUG(D_INODE, "read %d(%d)/%d pages\n", ncfspgs, nlupgs,
+	       op_data->op_npages);
+
+	lmv_adjust_dirpages(pages, ncfspgs, nlupgs);
+
+	RETURN(rc);
+}
+
+/*
+ * Send an UNLINK request and, when the reply marks the object as living
+ * on another MDT, retry against the target located for the fid carried in
+ * that reply.
+ *
+ * Returns 0 when the reply body does not have OBD_MD_MDS set, -EPROTO when
+ * the reply carries no body, or the first error from lmv_check_connect(),
+ * the target lookup, lmv_early_cancel() or md_unlink() (other than
+ * -EREMOTE).
+ */
+static int lmv_unlink(struct obd_export *exp, struct md_op_data *op_data,
+		      struct ptlrpc_request **request)
+{
+	struct obd_device *obd = exp->exp_obd;
+	struct lmv_obd *lmv = &obd->u.lmv;
+	struct lmv_tgt_desc *tgt = NULL;
+	struct mdt_body *body;
+	struct lu_fid *locate_fid;
+	int rc;
+	ENTRY;
+
+	rc = lmv_check_connect(obd);
+	if (rc != 0)
+		RETURN(rc);
+retry:
+	/* Send unlink requests to the MDT where the child is located:
+	 * use the child fid (fid2) unless it is zero. */
+	locate_fid = likely(!fid_is_zero(&op_data->op_fid2)) ?
+		     &op_data->op_fid2 : &op_data->op_fid1;
+	tgt = lmv_locate_mds(lmv, op_data, locate_fid);
+	if (IS_ERR(tgt))
+		RETURN(PTR_ERR(tgt));
+
+	/* Stamp the request with the credentials of the calling task. */
+	op_data->op_fsuid = current_fsuid();
+	op_data->op_fsgid = current_fsgid();
+	op_data->op_cap = cfs_curproc_cap_pack();
+
+	/*
+	 * If child's fid is given, cancel unused locks for it if it is from
+	 * another export than parent.
+	 *
+	 * LOOKUP lock for child (fid3) should also be cancelled on parent
+	 * tgt_tgt in mdc_unlink().
+	 */
+	op_data->op_flags |= MF_MDC_CANCEL_FID1 | MF_MDC_CANCEL_FID3;
+
+	/* Cancel FULL locks on child (fid3). */
+	rc = lmv_early_cancel(exp, op_data, tgt->ltd_idx, LCK_EX,
+			      MDS_INODELOCK_FULL, MF_MDC_CANCEL_FID3);
+	if (rc != 0)
+		RETURN(rc);
+
+	CDEBUG(D_INODE, "unlink with fid="DFID"/"DFID" -> mds #%d\n",
+	       PFID(&op_data->op_fid1), PFID(&op_data->op_fid2), tgt->ltd_idx);
+
+	rc = md_unlink(tgt->ltd_exp, op_data, request);
+	if (rc != 0 && rc != -EREMOTE)
+		RETURN(rc);
+
+	body = req_capsule_server_get(&(*request)->rq_pill, &RMF_MDT_BODY);
+	if (body == NULL)
+		RETURN(-EPROTO);
+
+	/* Not cross-ref case, just get out of here. */
+	if (likely(!(body->valid & OBD_MD_MDS)))
+		RETURN(0);
+
+	CDEBUG(D_INODE, "%s: try unlink to another MDT for "DFID"\n",
+	       exp->exp_obd->obd_name, PFID(&body->fid1));
+
+	/*
+	 * This is a remote object, so try the remote MDT.  Note that this
+	 * may be retried more than once.  Consider the following case, where
+	 * /mnt/lustre is the root on MDT0 and remote1 is on MDT1:
+	 *
+	 * 1. Initially A does not know where remote1 is, so it sends the
+	 *    unlink RPC to MDT0; MDT0 returns -EREMOTE and A resends the
+	 *    unlink RPC to MDT1 (1st retry).
+	 *
+	 * 2. While the unlink RPC is in flight, client B does
+	 *    mv /mnt/lustre/remote1 /mnt/lustre/remote2
+	 *    and creates a new remote1, but on MDT0.
+	 *
+	 * 3. MDT1 gets the unlink RPC (from A), takes a remote lock on
+	 *    /mnt/lustre, looks up the fid of remote1, finds that it is a
+	 *    remote dir again, and replies -EREMOTE again.
+	 *
+	 * 4. A then resends the unlink RPC to MDT0 (2nd retry).
+	 *
+	 * In theory this could repeat without bound, but it should be a
+	 * very rare case.
+	 */
+	op_data->op_fid2 = body->fid1;
+	ptlrpc_req_finished(*request);
+	*request = NULL;
+
+	goto retry;
+}
+
+/*
+ * Pre-cleanup hook.  Only the OBD_CLEANUP_EXPORTS stage does any work:
+ * it tears down the FLD client proc entries and the obd's lprocfs state.
+ * Always returns 0.
+ */
+static int lmv_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage)
+{
+	struct lmv_obd *lmv = &obd->u.lmv;
+	int rc = 0;
+
+	/* XXX: for OBD_CLEANUP_EARLY, obd_precleanup() should be called
+	 * down the stack; nothing is done for that stage yet. */
+	if (stage == OBD_CLEANUP_EXPORTS) {
+		fld_client_proc_fini(&lmv->lmv_fld);
+		lprocfs_obd_cleanup(obd);
+	}
+
+	RETURN(rc);
+}
+
+/*
+ * Answer a get_info request for the keys this layer understands.
+ *
+ * - "remote_flag": asked of each target that has an export, in index
+ *   order; returns 0 as soon as one obd_get_info() call succeeds and
+ *   -EINVAL if none does.
+ * - KEY_MAX_EASIZE / KEY_CONN_DATA: forwarded to the first target; on
+ *   success KEY_CONN_DATA is also copied into exp->exp_connect_data.
+ * - KEY_TGT_COUNT: stores lmv->desc.ld_tgt_count in *val.
+ *
+ * Any other key, or an export without an obd, yields -EINVAL.
+ */
+static int lmv_get_info(const struct lu_env *env, struct obd_export *exp,
+			__u32 keylen, void *key, __u32 *vallen, void *val,
+			struct lov_stripe_md *lsm)
+{
+	struct obd_device *obd;
+	struct lmv_obd *lmv;
+	int rc;
+	ENTRY;
+
+	obd = class_exp2obd(exp);
+	if (obd == NULL) {
+		CDEBUG(D_IOCTL, "Invalid client cookie "LPX64"\n",
+		       exp->exp_handle.h_cookie);
+		RETURN(-EINVAL);
+	}
+	lmv = &obd->u.lmv;
+
+	if (keylen >= strlen("remote_flag") && !strcmp(key, "remote_flag")) {
+		int idx;
+
+		rc = lmv_check_connect(obd);
+		if (rc != 0)
+			RETURN(rc);
+
+		LASSERT(*vallen == sizeof(__u32));
+		for (idx = 0; idx < lmv->desc.ld_tgt_count; idx++) {
+			struct lmv_tgt_desc *tgt = lmv->tgts[idx];
+
+			/* All tgts should be connected when this gets
+			 * called; skip any that are not. */
+			if (tgt == NULL || tgt->ltd_exp == NULL)
+				continue;
+
+			if (obd_get_info(env, tgt->ltd_exp, keylen, key,
+					 vallen, val, NULL) == 0)
+				RETURN(0);
+		}
+		RETURN(-EINVAL);
+	}
+
+	if (KEY_IS(KEY_MAX_EASIZE) || KEY_IS(KEY_CONN_DATA)) {
+		rc = lmv_check_connect(obd);
+		if (rc != 0)
+			RETURN(rc);
+
+		/* Forward this request to the first MDS, it should know
+		 * the LOV desc. */
+		rc = obd_get_info(env, lmv->tgts[0]->ltd_exp, keylen, key,
+				  vallen, val, NULL);
+		if (rc == 0 && KEY_IS(KEY_CONN_DATA))
+			exp->exp_connect_data = *(struct obd_connect_data *)val;
+		RETURN(rc);
+	}
+
+	if (KEY_IS(KEY_TGT_COUNT)) {
+		*((int *)val) = lmv->desc.ld_tgt_count;
+		RETURN(0);
+	}
+
+	CDEBUG(D_IOCTL, "Invalid key\n");
+	RETURN(-EINVAL);
+}
+
+/*
+ * Broadcast a set_info request to every target that has an export.
+ *
+ * Only KEY_READ_ONLY and KEY_FLUSH_CTX are accepted; any other key, or an
+ * export without an obd, yields -EINVAL.  All targets are tried and the
+ * first non-zero obd_set_info_async() result is returned.
+ */
+int lmv_set_info_async(const struct lu_env *env, struct obd_export *exp,
+		       obd_count keylen, void *key, obd_count vallen,
+		       void *val, struct ptlrpc_request_set *set)
+{
+	struct obd_device *obd;
+	struct lmv_obd *lmv;
+	int rc = 0;
+	int idx;
+	ENTRY;
+
+	obd = class_exp2obd(exp);
+	if (obd == NULL) {
+		CDEBUG(D_IOCTL, "Invalid client cookie "LPX64"\n",
+		       exp->exp_handle.h_cookie);
+		RETURN(-EINVAL);
+	}
+	lmv = &obd->u.lmv;
+
+	if (!(KEY_IS(KEY_READ_ONLY) || KEY_IS(KEY_FLUSH_CTX)))
+		RETURN(-EINVAL);
+
+	for (idx = 0; idx < lmv->desc.ld_tgt_count; idx++) {
+		struct lmv_tgt_desc *tgt = lmv->tgts[idx];
+		int err;
+
+		if (tgt == NULL || tgt->ltd_exp == NULL)
+			continue;
+
+		err = obd_set_info_async(env, tgt->ltd_exp,
+					 keylen, key, vallen, val, set);
+		/* Keep going, but remember the first failure. */
+		if (err != 0 && rc == 0)
+			rc = err;
+	}
+
+	RETURN(rc);
+}
+
+/*
+ * Pack an in-memory stripe descriptor (\a lsm, treated as a
+ * struct lmv_stripe_md) into its little-endian form in *\a lmmp.
+ *
+ * - lmmp == NULL: only report the EA size from lmv_get_easize().
+ * - *lmmp set and lsm == NULL: free *lmmp, clear it and return 0.
+ * - *lmmp == NULL: a buffer of the EA size is allocated (-ENOMEM on
+ *   failure); with lsm == NULL that is all that happens.
+ *
+ * Returns the EA size on success, or -EINVAL when the magic is neither
+ * MEA_MAGIC_LAST_CHAR nor MEA_MAGIC_ALL_CHARS.
+ *
+ * NOTE(review): the id loop runs over lmv->desc.ld_tgt_count entries, not
+ * over lsm's mea_count - confirm that is intended.
+ */
+int lmv_packmd(struct obd_export *exp, struct lov_mds_md **lmmp,
+	       struct lov_stripe_md *lsm)
+{
+	struct obd_device *obd = class_exp2obd(exp);
+	struct lmv_obd *lmv = &obd->u.lmv;
+	struct lmv_stripe_md *src;
+	struct lmv_stripe_md *dst;
+	int mea_size;
+	int idx;
+	ENTRY;
+
+	mea_size = lmv_get_easize(lmv);
+	if (lmmp == NULL)
+		RETURN(mea_size);
+
+	if (*lmmp != NULL && lsm == NULL) {
+		OBD_FREE_LARGE(*lmmp, mea_size);
+		*lmmp = NULL;
+		RETURN(0);
+	}
+
+	if (*lmmp == NULL) {
+		OBD_ALLOC_LARGE(*lmmp, mea_size);
+		if (*lmmp == NULL)
+			RETURN(-ENOMEM);
+	}
+
+	if (lsm == NULL)
+		RETURN(mea_size);
+
+	src = (struct lmv_stripe_md *)lsm;
+	dst = (struct lmv_stripe_md *)*lmmp;
+
+	if (src->mea_magic != MEA_MAGIC_LAST_CHAR &&
+	    src->mea_magic != MEA_MAGIC_ALL_CHARS)
+		RETURN(-EINVAL);
+
+	dst->mea_magic = cpu_to_le32(src->mea_magic);
+	dst->mea_count = cpu_to_le32(src->mea_count);
+	dst->mea_master = cpu_to_le32(src->mea_master);
+
+	for (idx = 0; idx < lmv->desc.ld_tgt_count; idx++) {
+		dst->mea_ids[idx] = src->mea_ids[idx];
+		fid_cpu_to_le(&dst->mea_ids[idx], &src->mea_ids[idx]);
+	}
+
+	RETURN(mea_size);
+}
+
+/*
+ * Unpack a little-endian stripe EA (\a lmm) into a freshly allocated
+ * in-memory struct lmv_stripe_md stored in *\a lsmp.
+ *
+ * - lsmp == NULL: only report the EA size from lmv_get_easize().
+ * - *lsmp set and lmm == NULL: free *lsmp, clear it and return 0.
+ * - otherwise \a lmm_size must equal the EA size (asserted); a buffer is
+ *   allocated (-ENOMEM on failure) and, when lmm is non-NULL, filled in.
+ *
+ * Returns the EA size on success.  An unrecognised magic hits LBUG().
+ *
+ * NOTE(review): mea_count is taken from \a lmm without being checked
+ * against the size of the allocated buffer - confirm callers guarantee it.
+ */
+int lmv_unpackmd(struct obd_export *exp, struct lov_stripe_md **lsmp,
+		 struct lov_mds_md *lmm, int lmm_size)
+{
+	struct obd_device *obd = class_exp2obd(exp);
+	struct lmv_stripe_md **tmea = (struct lmv_stripe_md **)lsmp;
+	struct lmv_stripe_md *mea = (struct lmv_stripe_md *)lmm;
+	struct lmv_obd *lmv = &obd->u.lmv;
+	int mea_size;
+	int i;
+	__u32 magic;
+	ENTRY;
+
+	mea_size = lmv_get_easize(lmv);
+	if (lsmp == NULL)
+		RETURN(mea_size);
+
+	if (*lsmp != NULL && lmm == NULL) {
+		OBD_FREE_LARGE(*tmea, mea_size);
+		*lsmp = NULL;
+		RETURN(0);
+	}
+
+	LASSERT(mea_size == lmm_size);
+
+	OBD_ALLOC_LARGE(*tmea, mea_size);
+	if (*tmea == NULL)
+		RETURN(-ENOMEM);
+
+	if (!lmm)
+		RETURN(mea_size);
+
+	/*
+	 * The EA is stored little-endian, so convert the magic to CPU order
+	 * before comparing it with the CPU-order MEA_MAGIC_* constants;
+	 * comparing the raw value only works on little-endian hosts.
+	 */
+	magic = le32_to_cpu(mea->mea_magic);
+	if (magic != MEA_MAGIC_LAST_CHAR &&
+	    magic != MEA_MAGIC_ALL_CHARS &&
+	    magic != MEA_MAGIC_HASH_SEGMENT) {
+		/*
+		 * Old mea is not handled here.
+		 */
+		CERROR("Old not supportable EA is found\n");
+		LBUG();
+	}
+
+	(*tmea)->mea_magic = magic;
+	(*tmea)->mea_count = le32_to_cpu(mea->mea_count);
+	(*tmea)->mea_master = le32_to_cpu(mea->mea_master);
+
+	/* Copy each id, then convert the copy in place to CPU order. */
+	for (i = 0; i < (*tmea)->mea_count; i++) {
+		(*tmea)->mea_ids[i] = mea->mea_ids[i];
+		fid_le_to_cpu(&(*tmea)->mea_ids[i], &(*tmea)->mea_ids[i]);
+	}
+	RETURN(mea_size);
+}
+
+/*
+ * Ask every connected, active target to cancel unused locks on \a fid.
+ *
+ * All eligible targets are visited.  The value returned is the result of
+ * the calls made up to and including the first failing one, i.e. 0 when
+ * every call succeeded.
+ */
+static int lmv_cancel_unused(struct obd_export *exp, const struct lu_fid *fid,
+			     ldlm_policy_data_t *policy, ldlm_mode_t mode,
+			     ldlm_cancel_flags_t flags, void *opaque)
+{
+	struct obd_device *obd = exp->exp_obd;
+	struct lmv_obd *lmv = &obd->u.lmv;
+	int rc = 0;
+	int idx;
+	ENTRY;
+
+	LASSERT(fid != NULL);
+
+	for (idx = 0; idx < lmv->desc.ld_tgt_count; idx++) {
+		struct lmv_tgt_desc *tgt = lmv->tgts[idx];
+		int err;
+
+		if (tgt == NULL || tgt->ltd_exp == NULL ||
+		    tgt->ltd_active == 0)
+			continue;
+
+		err = md_cancel_unused(tgt->ltd_exp, fid,
+				       policy, mode, flags, opaque);
+		if (rc == 0)
+			rc = err;
+	}
+
+	RETURN(rc);
+}
+
+/*
+ * Pass the request straight to md_set_lock_data() on the export of the
+ * first target and return its result.
+ */
+int lmv_set_lock_data(struct obd_export *exp, __u64 *lockh, void *data,
+		      __u64 *bits)
+{
+	struct lmv_obd *lmv = &exp->exp_obd->u.lmv;
+	int rc;
+	ENTRY;
+
+	rc = md_set_lock_data(lmv->tgts[0]->ltd_exp, lockh, data, bits);
+
+	RETURN(rc);
+}
+
+/*
+ * Look for a matching lock on \a fid across all connected, active
+ * targets, in index order.
+ *
+ * Returns the first non-zero mode reported by md_lock_match(), or 0 when
+ * no target reports a match.
+ */
+ldlm_mode_t lmv_lock_match(struct obd_export *exp, __u64 flags,
+			   const struct lu_fid *fid, ldlm_type_t type,
+			   ldlm_policy_data_t *policy, ldlm_mode_t mode,
+			   struct lustre_handle *lockh)
+{
+	struct obd_device *obd = exp->exp_obd;
+	struct lmv_obd *lmv = &obd->u.lmv;
+	ldlm_mode_t matched;
+	int idx;
+	ENTRY;
+
+	CDEBUG(D_INODE, "Lock match for "DFID"\n", PFID(fid));
+
+	/*
+	 * With CMD every object can have two locks in different namespaces:
+	 * a lookup lock in the space of the mds storing the direntry and an
+	 * update/open lock in the space of the mds storing the inode.  Thus
+	 * all targets are checked, not only the one the fid was created in.
+	 */
+	for (idx = 0; idx < lmv->desc.ld_tgt_count; idx++) {
+		struct lmv_tgt_desc *tgt = lmv->tgts[idx];
+
+		if (tgt == NULL || tgt->ltd_exp == NULL ||
+		    tgt->ltd_active == 0)
+			continue;
+
+		matched = md_lock_match(tgt->ltd_exp, flags, fid,
+					type, policy, mode, lockh);
+		if (matched)
+			RETURN(matched);
+	}
+
+	RETURN(0);
+}
+
+/*
+ * Pass the request straight to md_get_lustre_md() on the export of the
+ * first target and return its result.
+ */
+int lmv_get_lustre_md(struct obd_export *exp, struct ptlrpc_request *req,
+		      struct obd_export *dt_exp, struct obd_export *md_exp,
+		      struct lustre_md *md)
+{
+	struct lmv_obd *lmv = &exp->exp_obd->u.lmv;
+	struct obd_export *first_exp = lmv->tgts[0]->ltd_exp;
+
+	return md_get_lustre_md(first_exp, req, dt_exp, md_exp, md);
+}
+
+/*
+ * Release \a md: free md->mea through obd_free_memmd() when it is set,
+ * then hand \a md to md_free_lustre_md() on the export of the first
+ * target and return that call's result.
+ */
+int lmv_free_lustre_md(struct obd_export *exp, struct lustre_md *md)
+{
+	struct lmv_obd *lmv = &exp->exp_obd->u.lmv;
+	ENTRY;
+
+	if (md->mea != NULL)
+		obd_free_memmd(exp, (void *)&md->mea);
+
+	RETURN(md_free_lustre_md(lmv->tgts[0]->ltd_exp, md));
+}
+
+/*
+ * Forward to md_set_open_replay_data() on the target that
+ * lmv_find_target() returns for och->och_fid.
+ *
+ * Returns PTR_ERR() of a failed lookup, otherwise the forwarded result.
+ */
+int lmv_set_open_replay_data(struct obd_export *exp,
+			     struct obd_client_handle *och,
+			     struct ptlrpc_request *open_req)
+{
+	struct lmv_obd *lmv = &exp->exp_obd->u.lmv;
+	struct lmv_tgt_desc *tgt;
+	ENTRY;
+
+	tgt = lmv_find_target(lmv, &och->och_fid);
+	if (IS_ERR(tgt))
+		RETURN(PTR_ERR(tgt));
+
+	RETURN(md_set_open_replay_data(tgt->ltd_exp, och, open_req));
+}
+
+/*
+ * Forward to md_clear_open_replay_data() on the target that
+ * lmv_find_target() returns for och->och_fid.
+ *
+ * Returns PTR_ERR() of a failed lookup, otherwise the forwarded result.
+ */
+int lmv_clear_open_replay_data(struct obd_export *exp,
+			       struct obd_client_handle *och)
+{
+	struct lmv_obd *lmv = &exp->exp_obd->u.lmv;
+	struct lmv_tgt_desc *tgt;
+	ENTRY;
+
+	tgt = lmv_find_target(lmv, &och->och_fid);
+	if (IS_ERR(tgt))
+		RETURN(PTR_ERR(tgt));
+
+	RETURN(md_clear_open_replay_data(tgt->ltd_exp, och));
+}
+
+/*
+ * Forward to md_get_remote_perm() on the target that lmv_find_target()
+ * returns for \a fid.
+ *
+ * Returns the lmv_check_connect() error, PTR_ERR() of a failed lookup, or
+ * the forwarded result.
+ */
+static int lmv_get_remote_perm(struct obd_export *exp,
+			       const struct lu_fid *fid,
+			       struct obd_capa *oc, __u32 suppgid,
+			       struct ptlrpc_request **request)
+{
+	struct obd_device *obd = exp->exp_obd;
+	struct lmv_obd *lmv = &obd->u.lmv;
+	struct lmv_tgt_desc *tgt;
+	int rc;
+	ENTRY;
+
+	rc = lmv_check_connect(obd);
+	if (rc != 0)
+		RETURN(rc);
+
+	tgt = lmv_find_target(lmv, fid);
+	if (IS_ERR(tgt))
+		RETURN(PTR_ERR(tgt));
+
+	RETURN(md_get_remote_perm(tgt->ltd_exp, fid, oc, suppgid, request));
+}
+
+/*
+ * Forward to md_renew_capa() on the target that lmv_find_target() returns
+ * for the fid stored in the capability (oc->c_capa.lc_fid).
+ *
+ * Returns the lmv_check_connect() error, PTR_ERR() of a failed lookup, or
+ * the forwarded result.
+ */
+static int lmv_renew_capa(struct obd_export *exp, struct obd_capa *oc,
+			  renew_capa_cb_t cb)
+{
+	struct obd_device *obd = exp->exp_obd;
+	struct lmv_obd *lmv = &obd->u.lmv;
+	struct lmv_tgt_desc *tgt;
+	int rc;
+	ENTRY;
+
+	rc = lmv_check_connect(obd);
+	if (rc != 0)
+		RETURN(rc);
+
+	tgt = lmv_find_target(lmv, &oc->c_capa.lc_fid);
+	if (IS_ERR(tgt))
+		RETURN(PTR_ERR(tgt));
+
+	RETURN(md_renew_capa(tgt->ltd_exp, oc, cb));
+}
+
+/*
+ * Pass the request straight to md_unpack_capa() on the export of the
+ * first target and return its result.
+ */
+int lmv_unpack_capa(struct obd_export *exp, struct ptlrpc_request *req,
+		    const struct req_msg_field *field, struct obd_capa **oc)
+{
+	struct lmv_obd *lmv = &exp->exp_obd->u.lmv;
+	struct obd_export *first_exp = lmv->tgts[0]->ltd_exp;
+
+	return md_unpack_capa(first_exp, req, field, oc);
+}
+
+/*
+ * Forward to md_intent_getattr_async() on the target that
+ * lmv_find_target() returns for minfo->mi_data.op_fid1.
+ *
+ * Returns the lmv_check_connect() error, PTR_ERR() of a failed lookup, or
+ * the forwarded result.
+ */
+int lmv_intent_getattr_async(struct obd_export *exp,
+			     struct md_enqueue_info *minfo,
+			     struct ldlm_enqueue_info *einfo)
+{
+	struct md_op_data *op_data = &minfo->mi_data;
+	struct obd_device *obd = exp->exp_obd;
+	struct lmv_obd *lmv = &obd->u.lmv;
+	struct lmv_tgt_desc *tgt;
+	int rc;
+	ENTRY;
+
+	rc = lmv_check_connect(obd);
+	if (rc != 0)
+		RETURN(rc);
+
+	tgt = lmv_find_target(lmv, &op_data->op_fid1);
+	if (IS_ERR(tgt))
+		RETURN(PTR_ERR(tgt));
+
+	RETURN(md_intent_getattr_async(tgt->ltd_exp, minfo, einfo));
+}
+
+/*
+ * Forward to md_revalidate_lock() on the target that lmv_find_target()
+ * returns for \a fid.
+ *
+ * Returns the lmv_check_connect() error, PTR_ERR() of a failed lookup, or
+ * the forwarded result.
+ */
+int lmv_revalidate_lock(struct obd_export *exp, struct lookup_intent *it,
+			struct lu_fid *fid, __u64 *bits)
+{
+	struct obd_device *obd = exp->exp_obd;
+	struct lmv_obd *lmv = &obd->u.lmv;
+	struct lmv_tgt_desc *tgt;
+	int rc;
+	ENTRY;
+
+	rc = lmv_check_connect(obd);
+	if (rc != 0)
+		RETURN(rc);
+
+	tgt = lmv_find_target(lmv, fid);
+	if (IS_ERR(tgt))
+		RETURN(PTR_ERR(tgt));
+
+	RETURN(md_revalidate_lock(tgt->ltd_exp, it, fid, bits));
+}
+
+/**
+ * For lmv, only need to send request to master MDT, and the master MDT will
+ * process with other slave MDTs. The only exception is Q_GETOQUOTA for which
+ * we directly fetch data from the slave MDTs.
+ *
+ * \param unused  not used by this function
+ * \param exp     export whose obd holds the lmv target table
+ * \param oqctl   quota request; for Q_GETOQUOTA its dqb_curspace and
+ *                dqb_curinodes are overwritten with the sums over all
+ *                targets that answered successfully
+ *
+ * \retval -EIO   there are no targets, or the first target is missing or
+ *                inactive
+ * \retval other  for Q_GETOQUOTA the first per-target failure (0 if none),
+ *                otherwise the result of obd_quotactl() on the first target
+ */
+int lmv_quotactl(struct obd_device *unused, struct obd_export *exp,
+		 struct obd_quotactl *oqctl)
+{
+	struct obd_device   *obd = class_exp2obd(exp);
+	struct lmv_obd      *lmv = &obd->u.lmv;
+	struct lmv_tgt_desc *tgt = lmv->tgts[0];
+	int		  rc = 0, i;
+	__u64		curspace, curinodes;
+	ENTRY;
+
+	/* The slot of the first target may be empty, as every loop over
+	 * lmv->tgts[] in this file allows for; treat that as inactive
+	 * rather than dereferencing NULL. */
+	if (!lmv->desc.ld_tgt_count || tgt == NULL || !tgt->ltd_active) {
+		CERROR("master lmv inactive\n");
+		RETURN(-EIO);
+	}
+
+	if (oqctl->qc_cmd != Q_GETOQUOTA) {
+		rc = obd_quotactl(tgt->ltd_exp, oqctl);
+		RETURN(rc);
+	}
+
+	curspace = curinodes = 0;
+	for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+		int err;
+		tgt = lmv->tgts[i];
+
+		if (tgt == NULL || tgt->ltd_exp == NULL)
+			continue;
+		/* Checked separately so that the skip is actually logged. */
+		if (!tgt->ltd_active) {
+			CDEBUG(D_HA, "mdt %d is inactive.\n", i);
+			continue;
+		}
+
+		err = obd_quotactl(tgt->ltd_exp, oqctl);
+		if (err) {
+			CERROR("getquota on mdt %d failed. %d\n", i, err);
+			if (!rc)
+				rc = err;
+		} else {
+			curspace += oqctl->qc_dqblk.dqb_curspace;
+			curinodes += oqctl->qc_dqblk.dqb_curinodes;
+		}
+	}
+	oqctl->qc_dqblk.dqb_curspace = curspace;
+	oqctl->qc_dqblk.dqb_curinodes = curinodes;
+
+	RETURN(rc);
+}
+
+/*
+ * Run obd_quotacheck() on every target, in index order.
+ *
+ * Stops with -EIO at the first target that is missing, has no export or
+ * is inactive.  Otherwise all targets are visited and the first non-zero
+ * obd_quotacheck() result (0 if none) is returned.
+ */
+int lmv_quotacheck(struct obd_device *unused, struct obd_export *exp,
+		   struct obd_quotactl *oqctl)
+{
+	struct obd_device *obd = class_exp2obd(exp);
+	struct lmv_obd *lmv = &obd->u.lmv;
+	int rc = 0;
+	int idx;
+	ENTRY;
+
+	for (idx = 0; idx < lmv->desc.ld_tgt_count; idx++) {
+		struct lmv_tgt_desc *tgt = lmv->tgts[idx];
+		int err;
+
+		if (tgt == NULL || tgt->ltd_exp == NULL || !tgt->ltd_active) {
+			CERROR("lmv idx %d inactive\n", idx);
+			RETURN(-EIO);
+		}
+
+		err = obd_quotacheck(tgt->ltd_exp, oqctl);
+		if (err != 0 && rc == 0)
+			rc = err;
+	}
+
+	RETURN(rc);
+}
+
+/* obd_ops method table for this driver, registered by lmv_init(). */
+struct obd_ops lmv_obd_ops = {
+	.o_owner		= THIS_MODULE,
+	.o_setup		= lmv_setup,
+	.o_cleanup		= lmv_cleanup,
+	.o_precleanup		= lmv_precleanup,
+	.o_process_config	= lmv_process_config,
+	.o_connect		= lmv_connect,
+	.o_disconnect		= lmv_disconnect,
+	.o_statfs		= lmv_statfs,
+	.o_get_info		= lmv_get_info,
+	.o_set_info_async	= lmv_set_info_async,
+	.o_packmd		= lmv_packmd,
+	.o_unpackmd		= lmv_unpackmd,
+	.o_notify		= lmv_notify,
+	.o_get_uuid		= lmv_get_uuid,
+	.o_iocontrol		= lmv_iocontrol,
+	.o_quotacheck		= lmv_quotacheck,
+	.o_quotactl		= lmv_quotactl,
+};
+
+/* md_ops method table for this driver, registered by lmv_init(). */
+struct md_ops lmv_md_ops = {
+	.m_getstatus			= lmv_getstatus,
+	.m_null_inode			= lmv_null_inode,
+	.m_find_cbdata			= lmv_find_cbdata,
+	.m_close			= lmv_close,
+	.m_create			= lmv_create,
+	.m_done_writing			= lmv_done_writing,
+	.m_enqueue			= lmv_enqueue,
+	.m_getattr			= lmv_getattr,
+	.m_getxattr			= lmv_getxattr,
+	.m_getattr_name			= lmv_getattr_name,
+	.m_intent_lock			= lmv_intent_lock,
+	.m_link				= lmv_link,
+	.m_rename			= lmv_rename,
+	.m_setattr			= lmv_setattr,
+	.m_setxattr			= lmv_setxattr,
+	.m_sync				= lmv_sync,
+	.m_readpage			= lmv_readpage,
+	.m_unlink			= lmv_unlink,
+	.m_init_ea_size			= lmv_init_ea_size,
+	.m_cancel_unused		= lmv_cancel_unused,
+	.m_set_lock_data		= lmv_set_lock_data,
+	.m_lock_match			= lmv_lock_match,
+	.m_get_lustre_md		= lmv_get_lustre_md,
+	.m_free_lustre_md		= lmv_free_lustre_md,
+	.m_set_open_replay_data		= lmv_set_open_replay_data,
+	.m_clear_open_replay_data	= lmv_clear_open_replay_data,
+	.m_renew_capa			= lmv_renew_capa,
+	.m_unpack_capa			= lmv_unpack_capa,
+	.m_get_remote_perm		= lmv_get_remote_perm,
+	.m_intent_getattr_async		= lmv_intent_getattr_async,
+	.m_revalidate_lock		= lmv_revalidate_lock,
+};
+
+/*
+ * Module entry point: fetch the lprocfs variable tables and register the
+ * LMV obd type with its obd and md method tables.
+ *
+ * Returns the result of class_register_type().
+ */
+int __init lmv_init(void)
+{
+	struct lprocfs_static_vars lvars;
+
+	lprocfs_lmv_init_vars(&lvars);
+
+	return class_register_type(&lmv_obd_ops, &lmv_md_ops,
+				   lvars.module_vars, LUSTRE_LMV_NAME, NULL);
+}
+
+/*
+ * Module exit point: unregister the LMV obd type.
+ *
+ * Marked __exit to mirror the __init annotation on lmv_init(), so the
+ * function can be discarded when the code cannot be unloaded.
+ */
+static void __exit lmv_exit(void)
+{
+	class_unregister_type(LUSTRE_LMV_NAME);
+}
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Lustre Logical Metadata Volume OBD driver");
+MODULE_LICENSE("GPL");
+
+module_init(lmv_init);
+module_exit(lmv_exit);

diff --git a/drivers/staging/lustre/lustre/lmv/lproc_lmv.c b/drivers/staging/lustre/lustre/lmv/lproc_lmv.c
new file mode 100644
index 0000000..d1c45b5
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lmv/lproc_lmv.c

@@ -0,0 +1,235 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/version.h>
+#include <linux/seq_file.h>
+#include <asm/statfs.h>
+#include <lprocfs_status.h>
+#include <obd_class.h>
+
+#ifndef LPROCFS
+/*
+ * Empty tables for builds without LPROCFS.  They use the same names (and
+ * the same linkage) as the real tables in the #else branch, because
+ * lprocfs_lmv_init_vars() at the end of this file refers to
+ * lprocfs_lmv_module_vars and lprocfs_lmv_obd_vars in both configurations.
+ */
+static struct lprocfs_vars lprocfs_lmv_module_vars[] = { {0} };
+struct lprocfs_vars lprocfs_lmv_obd_vars[] = { {0} };
+#else
+/* Print the target count (desc.ld_tgt_count) of the obd in m->private. */
+static int lmv_numobd_seq_show(struct seq_file *m, void *v)
+{
+	struct obd_device *dev = (struct obd_device *)m->private;
+
+	LASSERT(dev != NULL);
+
+	return seq_printf(m, "%u\n", dev->u.lmv.desc.ld_tgt_count);
+}
+LPROC_SEQ_FOPS_RO(lmv_numobd);
+
+/* Text form of each placement policy, indexed by policy value. */
+static const char *placement_name[] = {
+	[PLACEMENT_CHAR_POLICY]		= "CHAR",
+	[PLACEMENT_NID_POLICY]		= "NID",
+	[PLACEMENT_INVAL_POLICY]	= "INVAL",
+};
+
+/*
+ * Map a policy name to its placement_policy_t value.
+ *
+ * \a name is compared against the first \a len bytes only, so it does not
+ * have to be NUL-terminated at \a len.  The match must be exact: the
+ * table entry has to be \a len bytes long as well.  Comparing just \a len
+ * bytes would accept any prefix, including the empty string, which would
+ * silently select the first policy.
+ *
+ * Returns PLACEMENT_INVAL_POLICY when no entry matches.
+ */
+static placement_policy_t placement_name2policy(char *name, int len)
+{
+	int		     i;
+
+	if (len <= 0)
+		return PLACEMENT_INVAL_POLICY;
+
+	for (i = 0; i < PLACEMENT_MAX_POLICY; i++) {
+		if (strlen(placement_name[i]) == (size_t)len &&
+		    !strncmp(placement_name[i], name, len))
+			return i;
+	}
+	return PLACEMENT_INVAL_POLICY;
+}
+
+/*
+ * Return the text form of \a placement, which is asserted to be below
+ * PLACEMENT_MAX_POLICY.
+ */
+static const char *placement_policy2name(placement_policy_t placement)
+{
+	const char *name;
+
+	LASSERT(placement < PLACEMENT_MAX_POLICY);
+	name = placement_name[placement];
+
+	return name;
+}
+
+/* Print the name of the current placement policy of the obd in m->private. */
+static int lmv_placement_seq_show(struct seq_file *m, void *v)
+{
+	struct obd_device *dev = (struct obd_device *)m->private;
+	const char *name;
+
+	LASSERT(dev != NULL);
+	name = placement_policy2name(dev->u.lmv.lmv_placement);
+
+	return seq_printf(m, "%s\n", name);
+}
+
+#define MAX_POLICY_STRING_SIZE 64
+
+/*
+ * Set the placement policy from a name written by the user.
+ *
+ * At most MAX_POLICY_STRING_SIZE bytes of the write are examined, and one
+ * trailing newline is ignored.  The policy is updated under lmv_lock.
+ *
+ * Returns \a count on success, -EFAULT if the user buffer cannot be read,
+ * or -EINVAL if the name does not map to a valid policy.
+ */
+static ssize_t lmv_placement_seq_write(struct file *file, const char *buffer,
+				   size_t count, loff_t *off)
+{
+	struct obd_device *dev = ((struct seq_file *)file->private_data)->private;
+	char		     dummy[MAX_POLICY_STRING_SIZE + 1];
+	size_t		     len = count;
+	placement_policy_t       policy;
+	struct lmv_obd	  *lmv;
+
+	LASSERT(dev != NULL);
+	lmv = &dev->u.lmv;
+
+	/* Clamp before copying: never read more from userspace than the
+	 * caller actually supplied, nor more than dummy[] can hold. */
+	if (len > MAX_POLICY_STRING_SIZE)
+		len = MAX_POLICY_STRING_SIZE;
+
+	if (copy_from_user(dummy, buffer, len))
+		return -EFAULT;
+
+	/* Guard len so an empty write does not index dummy[-1]. */
+	if (len > 0 && dummy[len - 1] == '\n')
+		len--;
+	dummy[len] = '\0';
+
+	policy = placement_name2policy(dummy, (int)len);
+	if (policy != PLACEMENT_INVAL_POLICY) {
+		spin_lock(&lmv->lmv_lock);
+		lmv->lmv_placement = policy;
+		spin_unlock(&lmv->lmv_lock);
+	} else {
+		CERROR("Invalid placement policy \"%s\"!\n", dummy);
+		return -EINVAL;
+	}
+	return count;
+}
+LPROC_SEQ_FOPS(lmv_placement);
+
+/*
+ * Print the active target count (desc.ld_active_tgt_count) of the obd in
+ * m->private.
+ */
+static int lmv_activeobd_seq_show(struct seq_file *m, void *v)
+{
+	struct obd_device *dev = (struct obd_device *)m->private;
+
+	LASSERT(dev != NULL);
+
+	return seq_printf(m, "%u\n", dev->u.lmv.desc.ld_active_tgt_count);
+}
+LPROC_SEQ_FOPS_RO(lmv_activeobd);
+
+/* Print the descriptor uuid (desc.ld_uuid) of the obd in m->private. */
+static int lmv_desc_uuid_seq_show(struct seq_file *m, void *v)
+{
+	struct obd_device *dev = (struct obd_device *)m->private;
+
+	LASSERT(dev != NULL);
+
+	return seq_printf(m, "%s\n", dev->u.lmv.desc.ld_uuid.uuid);
+}
+LPROC_SEQ_FOPS_RO(lmv_desc_uuid);
+
+/*
+ * seq_file start: return the target at index *pos, or NULL once *pos
+ * reaches the target count.
+ *
+ * NOTE(review): lmv->tgts[*pos] may itself be NULL, and seq_file treats a
+ * NULL return as end of sequence - confirm holes cannot occur here.
+ */
+static void *lmv_tgt_seq_start(struct seq_file *p, loff_t *pos)
+{
+	struct obd_device *dev = p->private;
+	struct lmv_obd *lmv = &dev->u.lmv;
+
+	if (*pos >= lmv->desc.ld_tgt_count)
+		return NULL;
+
+	return lmv->tgts[*pos];
+}
+
+/* seq_file stop: nothing to release. */
+static void lmv_tgt_seq_stop(struct seq_file *p, void *v)
+{
+}
+
+/*
+ * seq_file next: advance *pos and return the target at the new index, or
+ * NULL once *pos reaches the target count.
+ */
+static void *lmv_tgt_seq_next(struct seq_file *p, void *v, loff_t *pos)
+{
+	struct obd_device *dev = p->private;
+	struct lmv_obd *lmv = &dev->u.lmv;
+
+	++*pos;
+	if (*pos >= lmv->desc.ld_tgt_count)
+		return NULL;
+
+	return lmv->tgts[*pos];
+}
+
+/*
+ * seq_file show: print one target as "<index>: <uuid> ACTIVE" or
+ * "<index>: <uuid> INACTIVE".  A NULL target prints nothing.
+ */
+static int lmv_tgt_seq_show(struct seq_file *p, void *v)
+{
+	struct lmv_tgt_desc *tgt = v;
+	const char *prefix;
+
+	if (tgt == NULL)
+		return 0;
+
+	prefix = tgt->ltd_active ? "" : "IN";
+
+	return seq_printf(p, "%d: %s %sACTIVE\n", tgt->ltd_idx,
+			  tgt->ltd_uuid.uuid, prefix);
+}
+
+/* Iterator callbacks for the per-target listing. */
+struct seq_operations lmv_tgt_sops = {
+	.start	= lmv_tgt_seq_start,
+	.stop	= lmv_tgt_seq_stop,
+	.next	= lmv_tgt_seq_next,
+	.show	= lmv_tgt_seq_show,
+};
+
+/*
+ * Open handler for the target listing: set up the seq_file with
+ * lmv_tgt_sops and store PDE_DATA(inode) as its private data.
+ *
+ * Returns the seq_open() error, or 0 on success.
+ */
+static int lmv_target_seq_open(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq;
+	int rc = seq_open(file, &lmv_tgt_sops);
+
+	if (rc != 0)
+		return rc;
+
+	seq = file->private_data;
+	seq->private = PDE_DATA(inode);
+
+	return 0;
+}
+
+LPROC_SEQ_FOPS_RO_TYPE(lmv, uuid);
+
+/* Per-obd proc entries: entry name and its file operations. */
+struct lprocfs_vars lprocfs_lmv_obd_vars[] = {
+	{ "numobd",	&lmv_numobd_fops,	0, 0 },
+	{ "placement",	&lmv_placement_fops,	0, 0 },
+	{ "activeobd",	&lmv_activeobd_fops,	0, 0 },
+	{ "uuid",	&lmv_uuid_fops,		0, 0 },
+	{ "desc_uuid",	&lmv_desc_uuid_fops,	0, 0 },
+	{ 0 }	/* terminator */
+};
+
+LPROC_SEQ_FOPS_RO_TYPE(lmv, numrefs);
+
+/* Module-wide proc entries: entry name and its file operations. */
+static struct lprocfs_vars lprocfs_lmv_module_vars[] = {
+	{ "num_refs",	&lmv_numrefs_fops,	0, 0 },
+	{ 0 }	/* terminator */
+};
+
+/* File operations of the target listing, backed by the seq_file helpers. */
+struct file_operations lmv_proc_target_fops = {
+	.owner		= THIS_MODULE,
+	.open		= lmv_target_seq_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+#endif /* LPROCFS */
+/* Point \a lvars at this file's per-obd and module-wide proc tables. */
+void lprocfs_lmv_init_vars(struct lprocfs_static_vars *lvars)
+{
+	lvars->obd_vars = lprocfs_lmv_obd_vars;
+	lvars->module_vars = lprocfs_lmv_module_vars;
+}

diff --git a/drivers/staging/lustre/lustre/lov/Makefile b/drivers/staging/lustre/lustre/lov/Makefile
new file mode 100644
index 0000000..67eaec2
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lov/Makefile

@@ -0,0 +1,9 @@
+# Build lov.o according to the value of CONFIG_LUSTRE_FS.
+obj-$(CONFIG_LUSTRE_FS) += lov.o
+# Objects linked into lov.o.
+lov-y := lov_log.o lov_obd.o lov_pack.o lproc_lov.o lov_offset.o lov_merge.o \
+	 lov_request.o lov_ea.o lov_dev.o lov_object.o lov_page.o  \
+	 lov_lock.o lov_io.o lovsub_dev.o lovsub_object.o lovsub_page.o      \
+	 lovsub_lock.o lovsub_io.o lov_pool.o
+
+
+
+# Add ../include (relative to this directory) to the header search path.
+ccflags-y := -I$(src)/../include

diff --git a/drivers/staging/lustre/lustre/lov/lov_cl_internal.h b/drivers/staging/lustre/lustre/lov/lov_cl_internal.h
new file mode 100644
index 0000000..28801b8
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lov/lov_cl_internal.h

@@ -0,0 +1,820 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Internal interfaces of LOV layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ *   Author: Jinshan Xiong <jinshan.xiong@intel.com>
+ */
+
+#ifndef LOV_CL_INTERNAL_H
+#define LOV_CL_INTERNAL_H
+
+# include <linux/libcfs/libcfs.h>
+
+#include <obd.h>
+#include <cl_object.h>
+#include "lov_internal.h"
+
+/** \defgroup lov lov
+ * Logical object volume layer. This layer implements data striping (raid0).
+ *
+ * At the lov layer top-entity (object, page, lock, io) is connected to one or
+ * more sub-entities: top-object, representing a file is connected to a set of
+ * sub-objects, each representing a stripe, file-level top-lock is connected
+ * to a set of per-stripe sub-locks, top-page is connected to a (single)
+ * sub-page, and a top-level IO is connected to a set of (potentially
+ * concurrent) sub-IO's.
+ *
+ * Sub-object, sub-page, and sub-io have well-defined top-object and top-page
+ * respectively, while a single sub-lock can be part of multiple top-locks.
+ *
+ * Reference counting models are different for different types of entities:
+ *
+ *     - top-object keeps a reference to its sub-objects, and destroys them
+ *       when it is destroyed.
+ *
+ *     - top-page keeps a reference to its sub-page, and destroys it when it
+ *       is destroyed.
+ *
+ *     - sub-lock keeps a reference to its top-locks. Top-lock keeps a
+ *       reference (and a hold, see cl_lock_hold()) on its sub-locks when it
+ *       is actively using them (that is, in cl_lock_state::CLS_QUEUING,
+ *       cl_lock_state::CLS_ENQUEUED, cl_lock_state::CLS_HELD states). When
+ *       moving into cl_lock_state::CLS_CACHED state, top-lock releases a
+ *       hold. From this moment top-lock has only a 'weak' reference to its
+ *       sub-locks. This reference is protected by top-lock
+ *       cl_lock::cll_guard, and will be automatically cleared by the sub-lock
+ *       when the latter is destroyed. When a sub-lock is canceled, a
+ *       reference to it is removed from the top-lock array, and top-lock is
+ *       moved into CLS_NEW state. It is guaranteed that all sub-locks exist
+ *       while their top-lock is in CLS_HELD or CLS_CACHED states.
+ *
+ *     - IO's are not reference counted.
+ *
+ * To implement a connection between top and sub entities, lov layer is split
+ * into two pieces: lov ("upper half"), and lovsub ("bottom half"), both
+ * implementing full set of cl-interfaces. For example, top-object has vvp and
+ * lov layers, and its sub-object has lovsub and osc layers. lovsub layer is
+ * used to track child-parent relationship.
+ *
+ * @{
+ */
+
+struct lovsub_device;
+struct lovsub_object;
+struct lovsub_lock;
+
+/** Bits kept in lov_device::ld_flags. */
+enum lov_device_flags {
+	/* Set by lov_device_init() once all targets were set up. */
+	LOV_DEV_INITIALIZED = 1 << 0
+};
+
+/*
+ * Upper half.
+ */
+
+/**
+ * Resources that are used in memory-cleaning path, and whose allocation
+ * cannot fail even when memory is tight. They are preallocated in sufficient
+ * quantities in lov_device::ld_emrg[], and access to them is serialized by
+ * lov_device::ld_mutex.
+ */
+struct lov_device_emerg {
+	/**
+	 * Page list used to submit IO when memory is under pressure.
+	 */
+	struct cl_page_list emrg_page_list;
+	/**
+	 * sub-io shared by all threads accessing this device when memory is
+	 * too low to allocate sub-io's dynamically.
+	 */
+	struct cl_io	emrg_subio;
+	/**
+	 * Environment used by the sub-io in
+	 * lov_device_emerg::emrg_subio.
+	 */
+	struct lu_env      *emrg_env;
+	/**
+	 * Refcheck for lov_device_emerg::emrg_env.
+	 *
+	 * \see cl_env_get()
+	 */
+	int		 emrg_refcheck;
+};
+
+/** lov-specific device state; embeds the generic cl_device as ld_cl. */
+struct lov_device {
+	/*
+	 * XXX Locking of lov-private data is missing.
+	 */
+	struct cl_device	  ld_cl;
+	/** lov_obd whose target table and descriptor this device uses. */
+	struct lov_obd	   *ld_lov;
+	/** size of lov_device::ld_target[] array */
+	__u32		     ld_target_nr;
+	/** Per-target lovsub devices, indexed by target index. */
+	struct lovsub_device    **ld_target;
+	/** Bit mask of enum lov_device_flags. */
+	__u32		     ld_flags;
+
+	/** Emergency resources used in memory-cleansing paths. */
+	struct lov_device_emerg **ld_emrg;
+	/**
+	 * Serializes access to lov_device::ld_emrg in low-memory
+	 * conditions.
+	 */
+	struct mutex		  ld_mutex;
+};
+
+/**
+ * Layout type.
+ */
+enum lov_layout_type {
+	/** empty file without body */
+	LLT_EMPTY,
+	/** striped file */
+	LLT_RAID0,
+	/** last enumerator: its value equals the number of types above */
+	LLT_NR
+};
+
+/**
+ * lov-specific file state.
+ *
+ * lov object has particular layout type, determining how top-object is built
+ * on top of sub-objects. Layout type can change dynamically. When this
+ * happens, lov_object::lo_type_guard semaphore is taken in exclusive mode,
+ * all state pertaining to the old layout type is destroyed, and new state is
+ * constructed. All object methods take said semaphore in the shared mode,
+ * providing serialization against transition between layout types.
+ *
+ * To avoid multiple `if' or `switch' statements, selecting behavior for the
+ * current layout type, object methods perform double-dispatch, invoking
+ * function corresponding to the current layout type.
+ */
+struct lov_object {
+	/** Embedded generic object; see lov2cl() and cl2lov(). */
+	struct cl_object       lo_cl;
+	/**
+	 * Serializes object operations with transitions between layout types.
+	 *
+	 * This semaphore is taken in shared mode by all object methods, and
+	 * is taken in exclusive mode when object type is changed.
+	 *
+	 * \see lov_object::lo_type
+	 */
+	struct rw_semaphore	lo_type_guard;
+	/**
+	 * Type of an object. Protected by lov_object::lo_type_guard.
+	 */
+	enum lov_layout_type	lo_type;
+	/**
+	 * True if layout is invalid. This bit is cleared when layout lock
+	 * is lost.
+	 */
+	bool			lo_layout_invalid;
+	/**
+	 * How many IOs are ongoing on this object. Layout can be changed
+	 * only if there is no active IO.
+	 */
+	atomic_t	       lo_active_ios;
+	/**
+	 * Wait queue used to wait until no one else is using lo_lsm.
+	 */
+	wait_queue_head_t	       lo_waitq;
+	/**
+	 * Layout metadata. NULL if empty layout.
+	 */
+	struct lov_stripe_md  *lo_lsm;
+
+	union lov_layout_state {
+		struct lov_layout_raid0 {
+			/*
+			 * NOTE(review): presumably the number of entries in
+			 * lo_sub[] -- confirm.
+			 */
+			unsigned	       lo_nr;
+			/**
+			 * When this is true, lov_object::lo_attr contains
+			 * valid up to date attributes for a top-level
+			 * object. This field is reset to 0 when attributes of
+			 * any sub-object change.
+			 */
+			int		       lo_attr_valid;
+			/**
+			 * Array of sub-objects. Allocated when top-object is
+			 * created (lov_init_raid0()).
+			 *
+			 * Top-object is a strict master of its sub-objects:
+			 * it is created before them, and outlives its
+			 * children (this latter is necessary so that basic
+			 * functions like cl_object_top() always
+			 * work). Top-object keeps a reference on every
+			 * sub-object.
+			 *
+			 * When top-object is destroyed (lov_delete_raid0())
+			 * it releases its reference to a sub-object and waits
+			 * until the latter is finally destroyed.
+			 */
+			struct lovsub_object **lo_sub;
+			/**
+			 * protect lo_sub
+			 */
+			spinlock_t		lo_sub_lock;
+			/**
+			 * Cached object attribute, built from sub-object
+			 * attributes.
+			 */
+			struct cl_attr	 lo_attr;
+		} raid0;
+		/* The empty layout keeps no state of its own. */
+		struct lov_layout_state_empty {
+		} empty;
+	} u;
+	/**
+	 * Thread that acquired lov_object::lo_type_guard in an exclusive
+	 * mode.
+	 */
+	task_t	    *lo_owner;
+};
+
+/**
+ * Flags that top-lock can set on each of its sub-locks. They are bit values
+ * stored in lov_lock_sub::sub_flags.
+ */
+enum lov_sub_flags {
+	/** Top-lock acquired a hold (cl_lock_hold()) on a sub-lock. */
+	LSF_HELD = 1 << 0
+};
+
+/**
+ * State lov_lock keeps for each sub-lock.
+ */
+struct lov_lock_sub {
+	/** sub-lock itself */
+	struct lovsub_lock  *sub_lock;
+	/** Bit mask of per-sub-lock flags, taken from enum lov_sub_flags */
+	unsigned	     sub_flags;
+	/* NOTE(review): presumably the stripe index of this sub-lock -- confirm. */
+	int		  sub_stripe;
+	/* NOTE(review): presumably the requested lock description -- confirm. */
+	struct cl_lock_descr sub_descr;
+	/* NOTE(review): presumably the description actually granted -- confirm. */
+	struct cl_lock_descr sub_got;
+};
+
+/**
+ * lov-specific lock state.
+ */
+struct lov_lock {
+	/** Embedded generic lock slice; see cl2lov_lock(). */
+	struct cl_lock_slice   lls_cl;
+	/** Number of sub-locks in this lock */
+	int		    lls_nr;
+	/**
+	 * Number of existing sub-locks.
+	 */
+	unsigned	       lls_nr_filled;
+	/**
+	 * Set when sub-lock was canceled, while top-lock was being
+	 * used, or unused.
+	 */
+	unsigned int	       lls_cancel_race:1;
+	/**
+	 * An array of sub-locks
+	 *
+	 * There are two issues with managing sub-locks:
+	 *
+	 *     - sub-locks are concurrently canceled, and
+	 *
+	 *     - sub-locks are shared with other top-locks.
+	 *
+	 * To manage cancellation, top-lock acquires a hold on a sublock
+	 * (lov_sublock_adopt()) when the latter is inserted into
+	 * lov_lock::lls_sub[]. This hold is released (lov_sublock_release())
+	 * when top-lock is going into CLS_CACHED state or destroyed. Hold
+	 * prevents sub-lock from cancellation.
+	 *
+	 * Sub-lock sharing means, among other things, that top-lock that is
+	 * in the process of creation (i.e., not yet inserted into lock list)
+	 * is already accessible to other threads once at least one of its
+	 * sub-locks is created, see lov_lock_sub_init().
+	 *
+	 * Sub-lock can be in one of the following states:
+	 *
+	 *     - doesn't exist, lov_lock::lls_sub[]::sub_lock == NULL. Such
+	 *       sub-lock was either never created (top-lock is in CLS_NEW
+	 *       state), or it was created, then canceled, then destroyed
+	 *       (lov_lock_unlink() cleared sub-lock pointer in the top-lock).
+	 *
+	 *     - sub-lock exists and is on
+	 *       hold. (lov_lock::lls_sub[]::sub_flags & LSF_HELD). This is a
+	 *       normal state of a sub-lock in CLS_HELD and CLS_CACHED states
+	 *       of a top-lock.
+	 *
+	 *     - sub-lock exists, but is not held by the top-lock. This
+	 *       happens after top-lock released a hold on sub-locks before
+	 *       going into cache (lov_lock_unuse()).
+	 *
+	 * \todo To support wide-striping, array has to be replaced with a set
+	 * of queues to avoid scanning.
+	 */
+	struct lov_lock_sub   *lls_sub;
+	/**
+	 * Original description with which lock was enqueued.
+	 */
+	struct cl_lock_descr   lls_orig;
+};
+
+/** lov-specific page state; see cl2lov_page(). */
+struct lov_page {
+	struct cl_page_slice lps_cl;
+	/* NOTE(review): meaning of this flag is not visible here -- confirm. */
+	int		  lps_invalid;
+};
+
+/*
+ * Bottom half.
+ */
+
+/** Per-target device of the lovsub layer. */
+struct lovsub_device {
+	/** Embedded generic device; see lovsub2cl_dev() and cl2lovsub_dev(). */
+	struct cl_device   acid_cl;
+	/** Owning lov device, set by lov_device_init(). */
+	struct lov_device *acid_super;
+	/** Index of this device in lov_device::ld_target[]. */
+	int		acid_idx;
+	/* NOTE(review): presumably the device below lovsub in the stack -- confirm. */
+	struct cl_device  *acid_next;
+};
+
+/** Object state at the lovsub layer. */
+struct lovsub_object {
+	struct cl_object_header lso_header;
+	/** Embedded generic object; see lovsub2cl() and cl2lovsub(). */
+	struct cl_object	lso_cl;
+	/* NOTE(review): presumably the top-object this sub-object belongs to -- confirm. */
+	struct lov_object      *lso_super;
+	/* NOTE(review): presumably the index within the parent's lo_sub[] -- confirm. */
+	int		     lso_index;
+};
+
+/**
+ * A link between a top-lock and a sub-lock. Separate data-structure is
+ * necessary, because top-locks and sub-locks are in M:N relationship.
+ *
+ * \todo This can be optimized for a (by far) most frequent case of a single
+ * top-lock per sub-lock.
+ */
+struct lov_lock_link {
+	/** The top-lock side of this link. */
+	struct lov_lock *lll_super;
+	/** An index within parent lock. */
+	int	      lll_idx;
+	/**
+	 * A linkage into per sub-lock list of all corresponding top-locks,
+	 * hanging off lovsub_lock::lss_parents.
+	 */
+	struct list_head       lll_list;
+};
+
+/**
+ * Lock state at lovsub layer.
+ */
+struct lovsub_lock {
+	/** Embedded generic lock slice; see cl2lovsub_lock(). */
+	struct cl_lock_slice  lss_cl;
+	/**
+	 * List of top-locks that have given sub-lock as their part. Protected
+	 * by cl_lock::cll_guard mutex. Entries are linked through
+	 * lov_lock_link::lll_list.
+	 */
+	struct list_head	    lss_parents;
+	/**
+	 * Top-lock that initiated current operation on this sub-lock. This is
+	 * only set during top-to-bottom lock operations like enqueue, and is
+	 * used to optimize state change notification. Protected by
+	 * cl_lock::cll_guard mutex.
+	 *
+	 * \see lovsub_lock_state_one().
+	 */
+	struct cl_lock       *lss_active;
+};
+
+/**
+ * Describe the environment settings for sublocks: an environment, an io and
+ * a sub-io record. One instance lives in lov_session::ls_subenv.
+ */
+struct lov_sublock_env {
+	const struct lu_env *lse_env;
+	struct cl_io	*lse_io;
+	struct lov_io_sub   *lse_sub;
+};
+
+/** Page state at lovsub layer: only the generic slice; see cl2lovsub_page(). */
+struct lovsub_page {
+	struct cl_page_slice lsb_cl;
+};
+
+
+/**
+ * Context-private scratch data registered under lov_key and retrieved with
+ * lov_env_info().
+ */
+struct lov_thread_info {
+	struct cl_object_conf   lti_stripe_conf;
+	struct lu_fid	   lti_fid;
+	struct cl_lock_descr    lti_ldescr;
+	struct ost_lvb	  lti_lvb;
+	struct cl_2queue	lti_cl2q;
+	/** Its clc_list is initialized when the key value is created. */
+	struct cl_lock_closure  lti_closure;
+	wait_queue_t	  lti_waiter;
+};
+
+/**
+ * State that lov_io maintains for every sub-io.
+ */
+struct lov_io_sub {
+	/* NOTE(review): presumably the stripe this sub-io serves -- confirm. */
+	int		  sub_stripe;
+	/**
+	 * sub-io for a stripe. Ideally sub-io's can be stopped and resumed
+	 * independently, with lov acting as a scheduler to maximize overall
+	 * throughput.
+	 */
+	struct cl_io	*sub_io;
+	/**
+	 * Linkage into a list (hanging off lov_io::lis_active) of all
+	 * sub-io's active for the current IO iteration.
+	 */
+	struct list_head	   sub_linkage;
+	/**
+	 * true, iff cl_io_init() was successfully executed against
+	 * lov_io_sub::sub_io.
+	 */
+	int		  sub_io_initialized;
+	/**
+	 * True, iff lov_io_sub::sub_io and lov_io_sub::sub_env weren't
+	 * allocated, but borrowed from a per-device emergency pool.
+	 */
+	int		  sub_borrowed;
+	/**
+	 * environment, in which sub-io executes.
+	 */
+	struct lu_env *sub_env;
+	/**
+	 * environment's refcheck.
+	 *
+	 * \see cl_env_get()
+	 */
+	int		  sub_refcheck;
+	/* NOTE(review): the roles of the three fields below are not visible here. */
+	int		  sub_refcheck2;
+	int		  sub_reenter;
+	void		*sub_cookie;
+};
+
+/**
+ * IO state private for LOV.
+ */
+struct lov_io {
+	/** super-class */
+	struct cl_io_slice lis_cl;
+	/**
+	 * Pointer to the object slice. This is a duplicate of
+	 * lov_io::lis_cl::cis_object.
+	 */
+	struct lov_object *lis_object;
+	/**
+	 * Original end-of-io position for this IO, set by the upper layer as
+	 * cl_io::u::ci_rw::pos + cl_io::u::ci_rw::count. lov remembers this,
+	 * changes pos and count to fit IO into a single stripe and uses saved
+	 * value to determine when IO iterations have to stop.
+	 *
+	 * This is used only for CIT_READ and CIT_WRITE io's.
+	 */
+	loff_t	     lis_io_endpos;
+
+	/**
+	 * starting position within a file, for the current io loop iteration
+	 * (stripe), used by ci_io_loop().
+	 */
+	obd_off	    lis_pos;
+	/**
+	 * end position within a file, for the current stripe io. This is
+	 * exclusive (i.e., next offset after last byte affected by io).
+	 */
+	obd_off	    lis_endpos;
+
+	int		lis_mem_frozen;
+	int		lis_stripe_count;
+	int		lis_active_subios;
+
+	/**
+	 * the index of lis_single_subio in the lis_subs array
+	 */
+	int		lis_single_subio_index;
+	struct cl_io       lis_single_subio;
+
+	/**
+	 * size of the lis_subs array, actually the highest stripe #
+	 */
+	int		lis_nr_subios;
+	struct lov_io_sub *lis_subs;
+	/**
+	 * List of active sub-io's.
+	 */
+	struct list_head	 lis_active;
+};
+
+/**
+ * Session-private data registered under lov_session_key and retrieved with
+ * lov_env_session(); lov_env_io() hands out its ls_io member.
+ */
+struct lov_session {
+	struct lov_io	  ls_io;
+	struct lov_sublock_env ls_subenv;
+};
+
+/**
+ * State of transfer for lov: only the generic request slice, see
+ * cl2lov_req().
+ */
+struct lov_req {
+	struct cl_req_slice lr_cl;
+};
+
+/**
+ * State of transfer for lovsub: only the generic request slice, see
+ * cl2lovsub_req().
+ */
+struct lovsub_req {
+	struct cl_req_slice lsrq_cl;
+};
+
+extern struct lu_device_type lov_device_type;
+extern struct lu_device_type lovsub_device_type;
+
+extern struct lu_context_key lov_key;
+extern struct lu_context_key lov_session_key;
+
+extern struct kmem_cache *lov_lock_kmem;
+extern struct kmem_cache *lov_object_kmem;
+extern struct kmem_cache *lov_thread_kmem;
+extern struct kmem_cache *lov_session_kmem;
+extern struct kmem_cache *lov_req_kmem;
+
+extern struct kmem_cache *lovsub_lock_kmem;
+extern struct kmem_cache *lovsub_object_kmem;
+extern struct kmem_cache *lovsub_req_kmem;
+
+extern struct kmem_cache *lov_lock_link_kmem;
+
+int   lov_object_init     (const struct lu_env *env, struct lu_object *obj,
+			   const struct lu_object_conf *conf);
+int   lovsub_object_init  (const struct lu_env *env, struct lu_object *obj,
+			   const struct lu_object_conf *conf);
+int   lov_lock_init       (const struct lu_env *env, struct cl_object *obj,
+			   struct cl_lock *lock, const struct cl_io *io);
+int   lov_io_init	 (const struct lu_env *env, struct cl_object *obj,
+			   struct cl_io *io);
+int   lovsub_lock_init    (const struct lu_env *env, struct cl_object *obj,
+			   struct cl_lock *lock, const struct cl_io *io);
+
+int   lov_lock_init_raid0 (const struct lu_env *env, struct cl_object *obj,
+			   struct cl_lock *lock, const struct cl_io *io);
+int   lov_lock_init_empty (const struct lu_env *env, struct cl_object *obj,
+			   struct cl_lock *lock, const struct cl_io *io);
+int   lov_io_init_raid0   (const struct lu_env *env, struct cl_object *obj,
+			   struct cl_io *io);
+int   lov_io_init_empty   (const struct lu_env *env, struct cl_object *obj,
+			   struct cl_io *io);
+void  lov_lock_unlink     (const struct lu_env *env, struct lov_lock_link *link,
+			   struct lovsub_lock *sub);
+
+struct lov_io_sub *lov_sub_get(const struct lu_env *env, struct lov_io *lio,
+			       int stripe);
+void  lov_sub_put	     (struct lov_io_sub *sub);
+int   lov_sublock_modify  (const struct lu_env *env, struct lov_lock *lov,
+			   struct lovsub_lock *sublock,
+			   const struct cl_lock_descr *d, int idx);
+
+
+int   lov_page_init       (const struct lu_env *env, struct cl_object *ob,
+			   struct cl_page *page, struct page *vmpage);
+int   lovsub_page_init    (const struct lu_env *env, struct cl_object *ob,
+			   struct cl_page *page, struct page *vmpage);
+
+int   lov_page_init_empty (const struct lu_env *env,
+			   struct cl_object *obj,
+			   struct cl_page *page, struct page *vmpage);
+int   lov_page_init_raid0 (const struct lu_env *env,
+			   struct cl_object *obj,
+			   struct cl_page *page, struct page *vmpage);
+struct lu_object *lov_object_alloc   (const struct lu_env *env,
+				      const struct lu_object_header *hdr,
+				      struct lu_device *dev);
+struct lu_object *lovsub_object_alloc(const struct lu_env *env,
+				      const struct lu_object_header *hdr,
+				      struct lu_device *dev);
+
+struct lov_lock_link *lov_lock_link_find(const struct lu_env *env,
+					 struct lov_lock *lck,
+					 struct lovsub_lock *sub);
+struct lov_io_sub    *lov_page_subio    (const struct lu_env *env,
+					 struct lov_io *lio,
+					 const struct cl_page_slice *slice);
+
+void lov_lsm_decref(struct lov_object *lov, struct lov_stripe_md *lsm);
+struct lov_stripe_md *lov_lsm_addref(struct lov_object *lov);
+
+/*
+ * Loop header iterating \a var from 0 up to, but excluding,
+ * lov_targets_nr(lov). The bound is re-evaluated on every iteration.
+ */
+#define lov_foreach_target(lov, var)		    \
+	for (var = 0; var < lov_targets_nr(lov); ++var)
+
+/*****************************************************************************
+ *
+ * Type conversions.
+ *
+ * Accessors.
+ *
+ */
+
+/**
+ * Looks up the lov_session stored under lov_session_key in the session
+ * context (env->le_ses) of \a env. The result is asserted to be non-NULL.
+ */
+static inline struct lov_session *lov_env_session(const struct lu_env *env)
+{
+	struct lov_session *ses = lu_context_key_get(env->le_ses,
+						     &lov_session_key);
+
+	LASSERT(ses != NULL);
+	return ses;
+}
+
+/** Returns the lov_io embedded in the session data of \a env (ls_io). */
+static inline struct lov_io *lov_env_io(const struct lu_env *env)
+{
+	return &lov_env_session(env)->ls_io;
+}
+
+/** Non-zero iff the device of \a obj has type lov_device_type. */
+static inline int lov_is_object(const struct lu_object *obj)
+{
+	return obj->lo_dev->ld_type == &lov_device_type;
+}
+
+/** Non-zero iff the device of \a obj has type lovsub_device_type. */
+static inline int lovsub_is_object(const struct lu_object *obj)
+{
+	return obj->lo_dev->ld_type == &lovsub_device_type;
+}
+
+/** Returns the lu_device embedded in \a lov (ld_cl.cd_lu_dev). */
+static inline struct lu_device *lov2lu_dev(struct lov_device *lov)
+{
+	return &lov->ld_cl.cd_lu_dev;
+}
+
+/**
+ * Converts a lu_device to the lov_device containing it as ld_cl.cd_lu_dev.
+ * Invariant (LINVRNT): \a d is of type lov_device_type.
+ */
+static inline struct lov_device *lu2lov_dev(const struct lu_device *d)
+{
+	LINVRNT(d->ld_type == &lov_device_type);
+	return container_of0(d, struct lov_device, ld_cl.cd_lu_dev);
+}
+
+/** Returns the cl_device embedded in \a lovsub (acid_cl). */
+static inline struct cl_device *lovsub2cl_dev(struct lovsub_device *lovsub)
+{
+	return &lovsub->acid_cl;
+}
+
+/** Returns the lu_device embedded in the cl_device of \a lovsub. */
+static inline struct lu_device *lovsub2lu_dev(struct lovsub_device *lovsub)
+{
+	return &lovsub2cl_dev(lovsub)->cd_lu_dev;
+}
+
+/**
+ * Converts a lu_device to the lovsub_device containing it as
+ * acid_cl.cd_lu_dev. Invariant (LINVRNT): \a d is of type
+ * lovsub_device_type.
+ */
+static inline struct lovsub_device *lu2lovsub_dev(const struct lu_device *d)
+{
+	LINVRNT(d->ld_type == &lovsub_device_type);
+	return container_of0(d, struct lovsub_device, acid_cl.cd_lu_dev);
+}
+
+/**
+ * Converts a cl_device to the lovsub_device containing it as acid_cl.
+ * Invariant (LINVRNT): the device is of type lovsub_device_type.
+ */
+static inline struct lovsub_device *cl2lovsub_dev(const struct cl_device *d)
+{
+	LINVRNT(d->cd_lu_dev.ld_type == &lovsub_device_type);
+	return container_of0(d, struct lovsub_device, acid_cl);
+}
+
+/** Returns the lu_object embedded in \a lov (lo_cl.co_lu). */
+static inline struct lu_object *lov2lu(struct lov_object *lov)
+{
+	return &lov->lo_cl.co_lu;
+}
+
+/** Returns the cl_object embedded in \a lov (lo_cl). */
+static inline struct cl_object *lov2cl(struct lov_object *lov)
+{
+	return &lov->lo_cl;
+}
+
+/**
+ * Converts a lu_object to the lov_object containing it as lo_cl.co_lu.
+ * Invariant (LINVRNT): lov_is_object(obj).
+ */
+static inline struct lov_object *lu2lov(const struct lu_object *obj)
+{
+	LINVRNT(lov_is_object(obj));
+	return container_of0(obj, struct lov_object, lo_cl.co_lu);
+}
+
+/**
+ * Converts a cl_object to the lov_object containing it as lo_cl.
+ * Invariant (LINVRNT): lov_is_object() holds for its lu_object.
+ */
+static inline struct lov_object *cl2lov(const struct cl_object *obj)
+{
+	LINVRNT(lov_is_object(&obj->co_lu));
+	return container_of0(obj, struct lov_object, lo_cl);
+}
+
+/** Returns the lu_object embedded in \a los (lso_cl.co_lu). */
+static inline struct lu_object *lovsub2lu(struct lovsub_object *los)
+{
+	return &los->lso_cl.co_lu;
+}
+
+/** Returns the cl_object embedded in \a los (lso_cl). */
+static inline struct cl_object *lovsub2cl(struct lovsub_object *los)
+{
+	return &los->lso_cl;
+}
+
+/**
+ * Converts a cl_object to the lovsub_object containing it as lso_cl.
+ * Invariant (LINVRNT): lovsub_is_object() holds for its lu_object.
+ */
+static inline struct lovsub_object *cl2lovsub(const struct cl_object *obj)
+{
+	LINVRNT(lovsub_is_object(&obj->co_lu));
+	return container_of0(obj, struct lovsub_object, lso_cl);
+}
+
+/**
+ * Converts a lu_object to the lovsub_object containing it as lso_cl.co_lu.
+ * Invariant (LINVRNT): lovsub_is_object(obj).
+ */
+static inline struct lovsub_object *lu2lovsub(const struct lu_object *obj)
+{
+	LINVRNT(lovsub_is_object(obj));
+	return container_of0(obj, struct lovsub_object, lso_cl.co_lu);
+}
+
+/**
+ * Converts a lock slice to the lovsub_lock containing it as lss_cl.
+ * Invariant (LINVRNT): the slice's object is a lovsub object.
+ * Note that this one uses container_of(), not container_of0().
+ */
+static inline struct lovsub_lock *
+cl2lovsub_lock(const struct cl_lock_slice *slice)
+{
+	LINVRNT(lovsub_is_object(&slice->cls_obj->co_lu));
+	return container_of(slice, struct lovsub_lock, lss_cl);
+}
+
+/**
+ * Finds the slice of \a lock that belongs to lovsub_device_type with
+ * cl_lock_at(), asserts that it exists, and returns the lovsub_lock
+ * containing that slice.
+ */
+static inline struct lovsub_lock *cl2sub_lock(const struct cl_lock *lock)
+{
+	const struct cl_lock_slice *slice = cl_lock_at(lock,
+						      &lovsub_device_type);
+
+	LASSERT(slice != NULL);
+	return cl2lovsub_lock(slice);
+}
+
+/**
+ * Converts a lock slice to the lov_lock containing it as lls_cl.
+ * Invariant (LINVRNT): the slice's object is a lov object.
+ */
+static inline struct lov_lock *cl2lov_lock(const struct cl_lock_slice *slice)
+{
+	LINVRNT(lov_is_object(&slice->cls_obj->co_lu));
+	return container_of(slice, struct lov_lock, lls_cl);
+}
+
+/**
+ * Converts a page slice to the lov_page containing it as lps_cl.
+ * Invariant (LINVRNT): the slice's object is a lov object.
+ */
+static inline struct lov_page *cl2lov_page(const struct cl_page_slice *slice)
+{
+	LINVRNT(lov_is_object(&slice->cpl_obj->co_lu));
+	return container_of0(slice, struct lov_page, lps_cl);
+}
+
+/** Converts a request slice to the lov_req containing it as lr_cl. */
+static inline struct lov_req *cl2lov_req(const struct cl_req_slice *slice)
+{
+	return container_of0(slice, struct lov_req, lr_cl);
+}
+
+/**
+ * Converts a page slice to the lovsub_page containing it as lsb_cl.
+ * Invariant (LINVRNT): the slice's object is a lovsub object.
+ */
+static inline struct lovsub_page *
+cl2lovsub_page(const struct cl_page_slice *slice)
+{
+	LINVRNT(lovsub_is_object(&slice->cpl_obj->co_lu));
+	return container_of0(slice, struct lovsub_page, lsb_cl);
+}
+
+/** Converts a request slice to the lovsub_req containing it as lsrq_cl. */
+static inline struct lovsub_req *cl2lovsub_req(const struct cl_req_slice *slice)
+{
+	return container_of0(slice, struct lovsub_req, lsrq_cl);
+}
+
+/** Returns the child page (cp_child) of the cl_page that \a slice is part of. */
+static inline struct cl_page *lov_sub_page(const struct cl_page_slice *slice)
+{
+	return slice->cpl_page->cp_child;
+}
+
+/**
+ * Converts an io slice to the lov_io containing it as lis_cl and asserts
+ * that this is the very lov_io kept in the session of \a env.
+ */
+static inline struct lov_io *cl2lov_io(const struct lu_env *env,
+				const struct cl_io_slice *ios)
+{
+	struct lov_io *lio = container_of(ios, struct lov_io, lis_cl);
+
+	LASSERT(lio == lov_env_io(env));
+	return lio;
+}
+
+/** Target count, read from the lov_obd descriptor (ld_lov->desc.ld_tgt_count). */
+static inline int lov_targets_nr(const struct lov_device *lov)
+{
+	return lov->ld_lov->desc.ld_tgt_count;
+}
+
+/**
+ * Looks up the lov_thread_info stored under lov_key in the context
+ * (env->le_ctx) of \a env. The result is asserted to be non-NULL.
+ */
+static inline struct lov_thread_info *lov_env_info(const struct lu_env *env)
+{
+	struct lov_thread_info *info = lu_context_key_get(&env->le_ctx,
+							  &lov_key);
+
+	LASSERT(info != NULL);
+	return info;
+}
+
+/**
+ * Returns the RAID0 layout state (u.raid0) of \a lov. Asserts that the
+ * object type is LLT_RAID0 and that the layout magic is LOV_MAGIC or
+ * LOV_MAGIC_V3; lo_lsm is dereferenced, so it must not be NULL here.
+ */
+static inline struct lov_layout_raid0 *lov_r0(struct lov_object *lov)
+{
+	LASSERT(lov->lo_type == LLT_RAID0);
+	LASSERT(lov->lo_lsm->lsm_wire.lw_magic == LOV_MAGIC ||
+		lov->lo_lsm->lsm_wire.lw_magic == LOV_MAGIC_V3);
+	return &lov->u.raid0;
+}
+
+/** @} lov */
+
+#endif

diff --git a/drivers/staging/lustre/lustre/lov/lov_dev.c b/drivers/staging/lustre/lustre/lov/lov_dev.c
new file mode 100644
index 0000000..f94f8d9
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lov/lov_dev.c

@@ -0,0 +1,533 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_device and cl_device_type for LOV layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+/* class_name2obd() */
+#include <obd_class.h>
+
+#include "lov_cl_internal.h"
+
+/* Slab caches of the lov layer; each one is described in lov_caches[]. */
+struct kmem_cache *lov_lock_kmem;
+struct kmem_cache *lov_object_kmem;
+struct kmem_cache *lov_thread_kmem;
+struct kmem_cache *lov_session_kmem;
+struct kmem_cache *lov_req_kmem;
+
+/* Slab caches of the lovsub layer. */
+struct kmem_cache *lovsub_lock_kmem;
+struct kmem_cache *lovsub_object_kmem;
+struct kmem_cache *lovsub_req_kmem;
+
+/* Slab cache for struct lov_lock_link. */
+struct kmem_cache *lov_lock_link_kmem;
+
+/** Lock class of lov_device::ld_mutex. */
+struct lock_class_key cl_lov_device_mutex_class;
+
+/*
+ * Descriptors of the slab caches above: each entry gives the variable that
+ * holds the cache, the cache name and the object size. The array ends with
+ * an entry whose ckd_cache is NULL.
+ * NOTE(review): the code that creates the caches from this table is not
+ * visible here.
+ */
+struct lu_kmem_descr lov_caches[] = {
+	{
+		.ckd_cache = &lov_lock_kmem,
+		.ckd_name  = "lov_lock_kmem",
+		.ckd_size  = sizeof (struct lov_lock)
+	},
+	{
+		.ckd_cache = &lov_object_kmem,
+		.ckd_name  = "lov_object_kmem",
+		.ckd_size  = sizeof (struct lov_object)
+	},
+	{
+		.ckd_cache = &lov_thread_kmem,
+		.ckd_name  = "lov_thread_kmem",
+		.ckd_size  = sizeof (struct lov_thread_info)
+	},
+	{
+		.ckd_cache = &lov_session_kmem,
+		.ckd_name  = "lov_session_kmem",
+		.ckd_size  = sizeof (struct lov_session)
+	},
+	{
+		.ckd_cache = &lov_req_kmem,
+		.ckd_name  = "lov_req_kmem",
+		.ckd_size  = sizeof (struct lov_req)
+	},
+	{
+		.ckd_cache = &lovsub_lock_kmem,
+		.ckd_name  = "lovsub_lock_kmem",
+		.ckd_size  = sizeof (struct lovsub_lock)
+	},
+	{
+		.ckd_cache = &lovsub_object_kmem,
+		.ckd_name  = "lovsub_object_kmem",
+		.ckd_size  = sizeof (struct lovsub_object)
+	},
+	{
+		.ckd_cache = &lovsub_req_kmem,
+		.ckd_name  = "lovsub_req_kmem",
+		.ckd_size  = sizeof (struct lovsub_req)
+	},
+	{
+		.ckd_cache = &lov_lock_link_kmem,
+		.ckd_name  = "lov_lock_link_kmem",
+		.ckd_size  = sizeof (struct lov_lock_link)
+	},
+	{
+		.ckd_cache = NULL
+	}
+};
+
+/*****************************************************************************
+ *
+ * Lov transfer operations.
+ *
+ */
+
+/*
+ * ->cro_completion method: return the lov_req that owns \a slice to
+ * lov_req_kmem. \a env and \a ioret are not used.
+ */
+static void lov_req_completion(const struct lu_env *env,
+			       const struct cl_req_slice *slice, int ioret)
+{
+	struct lov_req *lr = cl2lov_req(slice);
+
+	ENTRY;
+	OBD_SLAB_FREE_PTR(lr, lov_req_kmem);
+	EXIT;
+}
+
+/* Request operations attached to every lov_req by lov_req_init(). */
+static const struct cl_req_operations lov_req_ops = {
+	.cro_completion = lov_req_completion
+};
+
+/*****************************************************************************
+ *
+ * Lov device and device type functions.
+ *
+ */
+
+/*
+ * ->lct_init method of lov_key: allocate a lov_thread_info from
+ * lov_thread_kmem and initialize the list head of its lock closure.
+ * Returns the new object, or ERR_PTR(-ENOMEM) when the allocation fails.
+ */
+static void *lov_key_init(const struct lu_context *ctx,
+			  struct lu_context_key *key)
+{
+	struct lov_thread_info *info;
+
+	OBD_SLAB_ALLOC_PTR_GFP(info, lov_thread_kmem, __GFP_IO);
+	if (info == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	INIT_LIST_HEAD(&info->lti_closure.clc_list);
+	return info;
+}
+
+/*
+ * ->lct_fini method of lov_key: return \a data, a lov_thread_info, to
+ * lov_thread_kmem. Invariant (LINVRNT): its closure list is empty.
+ */
+static void lov_key_fini(const struct lu_context *ctx,
+			 struct lu_context_key *key, void *data)
+{
+	struct lov_thread_info *info = data;
+	LINVRNT(list_empty(&info->lti_closure.clc_list));
+	OBD_SLAB_FREE_PTR(info, lov_thread_kmem);
+}
+
+/* Context key (tag LCT_CL_THREAD) whose value is a struct lov_thread_info. */
+struct lu_context_key lov_key = {
+	.lct_tags = LCT_CL_THREAD,
+	.lct_init = lov_key_init,
+	.lct_fini = lov_key_fini
+};
+
+/*
+ * ->lct_init method of lov_session_key: allocate a lov_session from
+ * lov_session_kmem. Returns the new object, or ERR_PTR(-ENOMEM) when the
+ * allocation fails.
+ */
+static void *lov_session_key_init(const struct lu_context *ctx,
+				  struct lu_context_key *key)
+{
+	struct lov_session *info;
+
+	OBD_SLAB_ALLOC_PTR_GFP(info, lov_session_kmem, __GFP_IO);
+	if (info != NULL)
+		return info;
+
+	return ERR_PTR(-ENOMEM);
+}
+
+/*
+ * ->lct_fini method of lov_session_key: return \a data, a lov_session, to
+ * lov_session_kmem.
+ */
+static void lov_session_key_fini(const struct lu_context *ctx,
+				 struct lu_context_key *key, void *data)
+{
+	struct lov_session *info = data;
+	OBD_SLAB_FREE_PTR(info, lov_session_kmem);
+}
+
+/* Context key (tag LCT_SESSION) whose value is a struct lov_session. */
+struct lu_context_key lov_session_key = {
+	.lct_tags = LCT_SESSION,
+	.lct_init = lov_session_key_init,
+	.lct_fini = lov_session_key_fini
+};
+
+/* type constructor/destructor: lov_type_{init,fini,start,stop}() */
+LU_TYPE_INIT_FINI(lov, &lov_key, &lov_session_key);
+
+/*
+ * Tear down the per-target lovsub stacks of \a d.
+ *
+ * Every non-NULL entry of ld_target[] is finalized with cl_stack_fini() and
+ * the slot is cleared. Nothing is done when ld_target[] was never
+ * allocated. ld_lov is asserted to be set.
+ *
+ * Always returns NULL.
+ */
+static struct lu_device *lov_device_fini(const struct lu_env *env,
+					 struct lu_device *d)
+{
+	int i;
+	struct lov_device *ld = lu2lov_dev(d);
+
+	/* Pair the RETURN() calls below with an ENTRY, as lov_req_init() does. */
+	ENTRY;
+	LASSERT(ld->ld_lov != NULL);
+	if (ld->ld_target == NULL)
+		RETURN(NULL);
+
+	lov_foreach_target(ld, i) {
+		struct lovsub_device *lsd;
+
+		lsd = ld->ld_target[i];
+		if (lsd != NULL) {
+			cl_stack_fini(env, lovsub2cl_dev(lsd));
+			ld->ld_target[i] = NULL;
+		}
+	}
+	RETURN(NULL);
+}
+
+/*
+ * Set up a lovsub device for every configured target of \a d.
+ *
+ * For each non-NULL entry of ld_lov->lov_tgts[] a lovsub device is created
+ * with cl_type_setup() on top of the target's obd_lu_dev. It is recorded in
+ * ld_target[] together with its index and a back pointer to this device.
+ * On the first failure everything set up so far is undone through
+ * lov_device_fini(); on success LOV_DEV_INITIALIZED is set in ld_flags.
+ * \a name and \a next are not used.
+ *
+ * Returns 0 on success (also when ld_target[] is not allocated), otherwise
+ * the PTR_ERR() of the failed cl_type_setup() result.
+ */
+static int lov_device_init(const struct lu_env *env, struct lu_device *d,
+			   const char *name, struct lu_device *next)
+{
+	struct lov_device *ld = lu2lov_dev(d);
+	int i;
+	int rc = 0;
+
+	/* Pair the RETURN() calls below with an ENTRY, as lov_req_init() does. */
+	ENTRY;
+	LASSERT(d->ld_site != NULL);
+	if (ld->ld_target == NULL)
+		RETURN(rc);
+
+	lov_foreach_target(ld, i) {
+		struct lovsub_device *lsd;
+		struct cl_device     *cl;
+		struct lov_tgt_desc  *desc;
+
+		desc = ld->ld_lov->lov_tgts[i];
+		if (desc == NULL)
+			continue;
+
+		cl = cl_type_setup(env, d->ld_site, &lovsub_device_type,
+				   desc->ltd_obd->obd_lu_dev);
+		if (IS_ERR(cl)) {
+			rc = PTR_ERR(cl);
+			break;
+		}
+		lsd = cl2lovsub_dev(cl);
+		lsd->acid_idx = i;
+		lsd->acid_super = ld;
+		ld->ld_target[i] = lsd;
+	}
+
+	if (rc)
+		lov_device_fini(env, d);
+	else
+		ld->ld_flags |= LOV_DEV_INITIALIZED;
+
+	RETURN(rc);
+}
+
+/*
+ * ->cdo_req_init method: allocate a lov_req from lov_req_kmem and add its
+ * slice to \a req with lov_req_ops. \a env is not used.
+ *
+ * Returns 0 on success, -ENOMEM when the allocation fails.
+ */
+static int lov_req_init(const struct lu_env *env, struct cl_device *dev,
+			struct cl_req *req)
+{
+	struct lov_req *lr;
+
+	ENTRY;
+	OBD_SLAB_ALLOC_PTR_GFP(lr, lov_req_kmem, __GFP_IO);
+	if (lr == NULL)
+		RETURN(-ENOMEM);
+
+	cl_req_slice_add(req, &lr->lr_cl, dev, &lov_req_ops);
+	RETURN(0);
+}
+
+/* cl_device operations of the lov layer; only request setup is provided. */
+static const struct cl_device_operations lov_cl_ops = {
+	.cdo_req_init = lov_req_init
+};
+
+/*
+ * Free an array of @nr emergency slots together with the array itself.
+ * NULL slots are skipped.  A populated slot must have an empty page list;
+ * its environment, if any, is put before the slot is freed.
+ */
+static void lov_emerg_free(struct lov_device_emerg **emrg, int nr)
+{
+	int idx;
+
+	for (idx = 0; idx < nr; idx++) {
+		struct lov_device_emerg *slot = emrg[idx];
+
+		if (slot == NULL)
+			continue;
+
+		LASSERT(slot->emrg_page_list.pl_nr == 0);
+		if (slot->emrg_env != NULL)
+			cl_env_put(slot->emrg_env, &slot->emrg_refcheck);
+		OBD_FREE_PTR(slot);
+	}
+	OBD_FREE(emrg, nr * sizeof(emrg[0]));
+}
+
+/*
+ * Release a lov_device: finalize the embedded cl_device, free the
+ * ld_target[] array and the emergency array (both sized by ld_target_nr),
+ * then free the device itself.  Always returns NULL.
+ */
+static struct lu_device *lov_device_free(const struct lu_env *env,
+					 struct lu_device *d)
+{
+	struct lov_device *ld    = lu2lov_dev(d);
+	const int	   count = ld->ld_target_nr;
+
+	cl_device_fini(lu2cl_dev(d));
+
+	if (ld->ld_target != NULL)
+		OBD_FREE(ld->ld_target, count * sizeof(ld->ld_target[0]));
+
+	if (ld->ld_emrg != NULL)
+		lov_emerg_free(ld->ld_emrg, count);
+
+	OBD_FREE_PTR(ld);
+	return NULL;
+}
+
+/*
+ * Finalize the sub-device stack stored at ld_target[@index], if any, and
+ * clear the slot.  An empty slot is a no-op.
+ */
+static void lov_cl_del_target(const struct lu_env *env, struct lu_device *dev,
+			      __u32 index)
+{
+	struct lov_device *ld = lu2lov_dev(dev);
+	struct lovsub_device *sub;
+	ENTRY;
+
+	sub = ld->ld_target[index];
+	if (sub != NULL) {
+		cl_stack_fini(env, lovsub2cl_dev(sub));
+		ld->ld_target[index] = NULL;
+	}
+	EXIT;
+}
+
+/*
+ * Allocate an array of @nr emergency slots.  Each slot gets an initialized
+ * page list and its own environment allocated with LCT_REMEMBER|LCT_NOREF,
+ * whose context cookie is set to 0x2.
+ *
+ * Returns the array, or an ERR_PTR() on failure.  On failure everything
+ * allocated so far is released through lov_emerg_free().
+ */
+static struct lov_device_emerg **lov_emerg_alloc(int nr)
+{
+	struct lov_device_emerg **emerg;
+	int rc = 0;
+	int idx;
+
+	OBD_ALLOC(emerg, nr * sizeof(emerg[0]));
+	if (emerg == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	for (idx = 0; idx < nr; idx++) {
+		struct lov_device_emerg *slot;
+
+		OBD_ALLOC_PTR(slot);
+		if (slot == NULL) {
+			rc = -ENOMEM;
+			break;
+		}
+
+		emerg[idx] = slot;
+		cl_page_list_init(&slot->emrg_page_list);
+		slot->emrg_env = cl_env_alloc(&slot->emrg_refcheck,
+					      LCT_REMEMBER|LCT_NOREF);
+		if (IS_ERR(slot->emrg_env)) {
+			rc = PTR_ERR(slot->emrg_env);
+			/* lov_emerg_free() must not put an error pointer */
+			slot->emrg_env = NULL;
+			break;
+		}
+		slot->emrg_env->le_ctx.lc_cookie = 0x2;
+	}
+
+	if (rc != 0) {
+		lov_emerg_free(emerg, nr);
+		emerg = ERR_PTR(rc);
+	}
+	return emerg;
+}
+
+/*
+ * Grow ld_target[] and ld_emrg[] to lov_tgt_size entries when the device
+ * currently tracks fewer targets than the LOV has slots for.
+ *
+ * Both replacement arrays are allocated before ld_mutex is taken.  Under
+ * the mutex the old target pointers are copied over, the old arrays are
+ * freed and the new ones installed.  Returns 0 on success or when no
+ * growth is needed, otherwise a negative errno.
+ */
+static int lov_expand_targets(const struct lu_env *env, struct lov_device *dev)
+{
+	struct lovsub_device    **newd;
+	struct lov_device_emerg **emerg;
+	const size_t		  sz = sizeof(newd[0]);
+	__u32 tgt_size;
+	__u32 sub_size;
+
+	ENTRY;
+	tgt_size = dev->ld_lov->lov_tgt_size;
+	sub_size = dev->ld_target_nr;
+	if (sub_size >= tgt_size)
+		RETURN(0);
+
+	emerg = lov_emerg_alloc(tgt_size);
+	if (IS_ERR(emerg))
+		RETURN(PTR_ERR(emerg));
+
+	OBD_ALLOC(newd, tgt_size * sz);
+	if (newd == NULL) {
+		lov_emerg_free(emerg, tgt_size);
+		RETURN(-ENOMEM);
+	}
+
+	mutex_lock(&dev->ld_mutex);
+	if (sub_size > 0) {
+		memcpy(newd, dev->ld_target, sub_size * sz);
+		OBD_FREE(dev->ld_target, sub_size * sz);
+	}
+	dev->ld_target    = newd;
+	dev->ld_target_nr = tgt_size;
+
+	if (dev->ld_emrg != NULL)
+		lov_emerg_free(dev->ld_emrg, sub_size);
+	dev->ld_emrg = emerg;
+	mutex_unlock(&dev->ld_mutex);
+
+	RETURN(0);
+}
+
+/*
+ * Attach the target at @index to the cl device stack.
+ *
+ * The target array is grown through lov_expand_targets() if needed.  If
+ * the device has already been initialized (LOV_DEV_INITIALIZED), a lovsub
+ * device is set up for the target and stored in ld_target[@index];
+ * otherwise that is left to lov_device_init().
+ *
+ * Returns 0 on success, -EINVAL if the target obd is not set up, or the
+ * error from lov_expand_targets()/cl_type_setup().
+ *
+ * The reference taken with obd_getref() is dropped on every exit path,
+ * including the "not set up" early exit.
+ */
+static int lov_cl_add_target(const struct lu_env *env, struct lu_device *dev,
+			     __u32 index)
+{
+	struct obd_device    *obd = dev->ld_obd;
+	struct lov_device    *ld  = lu2lov_dev(dev);
+	struct lov_tgt_desc  *tgt;
+	struct lovsub_device *lsd;
+	struct cl_device     *cl;
+	int rc;
+	ENTRY;
+
+	obd_getref(obd);
+
+	tgt = obd->u.lov.lov_tgts[index];
+	LASSERT(tgt != NULL);
+	LASSERT(tgt->ltd_obd != NULL);
+
+	if (!tgt->ltd_obd->obd_set_up) {
+		CERROR("Target %s not set up\n", obd_uuid2str(&tgt->ltd_uuid));
+		rc = -EINVAL;
+		goto out;
+	}
+
+	rc = lov_expand_targets(env, ld);
+	if (rc == 0 && ld->ld_flags & LOV_DEV_INITIALIZED) {
+		LASSERT(dev->ld_site != NULL);
+
+		cl = cl_type_setup(env, dev->ld_site, &lovsub_device_type,
+				   tgt->ltd_obd->obd_lu_dev);
+		if (!IS_ERR(cl)) {
+			lsd = cl2lovsub_dev(cl);
+			lsd->acid_idx = index;
+			lsd->acid_super = ld;
+			ld->ld_target[index] = lsd;
+		} else {
+			/* fetch the error first so the message reports it */
+			rc = PTR_ERR(cl);
+			CERROR("add failed (%d), deleting %s\n", rc,
+			       obd_uuid2str(&tgt->ltd_uuid));
+			lov_cl_del_target(env, dev, index);
+		}
+	}
+out:
+	obd_putref(obd);
+	RETURN(rc);
+}
+
+/*
+ * ldo_process_config method.  The command is first handed to
+ * lov_process_config_base(); if that succeeds, add/delete commands are
+ * mirrored into the cl device stack.  A failed cl-level add is rolled
+ * back with lov_del_target().  Other commands need no cl-level work.
+ */
+static int lov_process_config(const struct lu_env *env,
+			      struct lu_device *d, struct lustre_cfg *cfg)
+{
+	struct obd_device *obd = d->ld_obd;
+	__u32 index;
+	int gen;
+	int cmd;
+	int rc;
+
+	obd_getref(obd);
+
+	cmd = cfg->lcfg_command;
+	rc = lov_process_config_base(d->ld_obd, cfg, &index, &gen);
+	if (rc != 0)
+		goto out;
+
+	if (cmd == LCFG_LOV_ADD_OBD || cmd == LCFG_LOV_ADD_INA) {
+		rc = lov_cl_add_target(env, d, index);
+		if (rc != 0)
+			lov_del_target(d->ld_obd, index, 0, 0);
+	} else if (cmd == LCFG_LOV_DEL_OBD) {
+		lov_cl_del_target(env, d, index);
+	}
+out:
+	obd_putref(obd);
+	RETURN(rc);
+}
+
+/* lu_device methods of the LOV device: object allocation and config. */
+static const struct lu_device_operations lov_lu_ops = {
+	.ldo_object_alloc      = lov_object_alloc,
+	.ldo_process_config    = lov_process_config,
+};
+
+/*
+ * ldto_device_alloc method: allocate a lov_device, install its lu/cl
+ * operation tables and mutex, then run lov_setup() on the OBD named by
+ * config string 0.
+ *
+ * Returns the embedded lu_device, ERR_PTR(-ENOMEM) if the allocation
+ * fails, or ERR_PTR(rc) if lov_setup() fails.  In the last case the
+ * half-built device is released through lov_device_free().
+ */
+static struct lu_device *lov_device_alloc(const struct lu_env *env,
+					  struct lu_device_type *t,
+					  struct lustre_cfg *cfg)
+{
+	struct lov_device *ld;
+	struct lu_device  *lu;
+	struct obd_device *obd;
+	int		   rc;
+
+	OBD_ALLOC_PTR(ld);
+	if (ld == NULL)
+		RETURN(ERR_PTR(-ENOMEM));
+
+	cl_device_init(&ld->ld_cl, t);
+	lu = lov2lu_dev(ld);
+	lu->ld_ops = &lov_lu_ops;
+	ld->ld_cl.cd_ops = &lov_cl_ops;
+
+	mutex_init(&ld->ld_mutex);
+	lockdep_set_class(&ld->ld_mutex, &cl_lov_device_mutex_class);
+
+	/* set up the LOV OBD, looked up by the name in config string 0 */
+	obd = class_name2obd(lustre_cfg_string(cfg, 0));
+	LASSERT(obd != NULL);
+	rc = lov_setup(obd, cfg);
+	if (rc != 0) {
+		lov_device_free(env, lu);
+		RETURN(ERR_PTR(rc));
+	}
+
+	ld->ld_lov = &obd->u.lov;
+	RETURN(lu);
+}
+
+/*
+ * Device-type methods for LOV: type-level init/fini/start/stop plus the
+ * per-device alloc/free and init/fini callbacks defined in this file.
+ */
+static const struct lu_device_type_operations lov_device_type_ops = {
+	.ldto_init = lov_type_init,
+	.ldto_fini = lov_type_fini,
+
+	.ldto_start = lov_type_start,
+	.ldto_stop  = lov_type_stop,
+
+	.ldto_device_alloc = lov_device_alloc,
+	.ldto_device_free  = lov_device_free,
+
+	.ldto_device_init    = lov_device_init,
+	.ldto_device_fini    = lov_device_fini
+};
+
+/*
+ * The LOV device type: a cl-layer device (LU_DEVICE_CL) named
+ * LUSTRE_LOV_NAME, using lov_device_type_ops and LCT_CL_THREAD context
+ * tags.  Exported for use outside this file.
+ */
+struct lu_device_type lov_device_type = {
+	.ldt_tags     = LU_DEVICE_CL,
+	.ldt_name     = LUSTRE_LOV_NAME,
+	.ldt_ops      = &lov_device_type_ops,
+	.ldt_ctx_tags = LCT_CL_THREAD
+};
+EXPORT_SYMBOL(lov_device_type);
+
+/** @} lov */

diff --git a/drivers/staging/lustre/lustre/lov/lov_ea.c b/drivers/staging/lustre/lustre/lov/lov_ea.c
new file mode 100644
index 0000000..340dbcf
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lov/lov_ea.c

@@ -0,0 +1,333 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lov/lov_ea.c
+ *
+ * Author: Wang Di <wangdi@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include <asm/div64.h>
+#include <linux/libcfs/libcfs.h>
+
+#include <obd_class.h>
+#include <obd_lov.h>
+#include <lustre/lustre_idl.h>
+
+#include "lov_internal.h"
+
+/*
+ * Pairs a stripe MD with an integer cursor.
+ * NOTE(review): nothing in this file references this struct -- confirm
+ * whether it is still needed.
+ */
+struct lovea_unpack_args {
+	struct lov_stripe_md *lsm;
+	int		   cursor;
+};
+
+/*
+ * Sanity checks shared by the v1 and v3 on-disk layouts:
+ *  - stripe count is non-zero and at most LOV_V1_INSANE_STRIPE_COUNT;
+ *  - the object id is non-zero;
+ *  - the pattern is LOV_PATTERN_RAID0;
+ *  - the stripe size is non-zero and a multiple of LOV_MIN_STRIPE_SIZE.
+ *
+ * Returns 0 if all checks pass.  Otherwise logs the reason, dumps the
+ * EA at D_WARNING and returns -EINVAL.
+ */
+static int lsm_lmm_verify_common(struct lov_mds_md *lmm, int lmm_bytes,
+				 __u16 stripe_count)
+{
+	if (stripe_count == 0 || stripe_count > LOV_V1_INSANE_STRIPE_COUNT) {
+		CERROR("bad stripe count %d\n", stripe_count);
+		goto invalid;
+	}
+
+	if (lmm_oi_id(&lmm->lmm_oi) == 0) {
+		CERROR("zero object id\n");
+		goto invalid;
+	}
+
+	if (lmm->lmm_pattern != cpu_to_le32(LOV_PATTERN_RAID0)) {
+		CERROR("bad striping pattern\n");
+		goto invalid;
+	}
+
+	if (lmm->lmm_stripe_size == 0 ||
+	    (le32_to_cpu(lmm->lmm_stripe_size) &
+	     (LOV_MIN_STRIPE_SIZE - 1)) != 0) {
+		CERROR("bad stripe size %u\n",
+		       le32_to_cpu(lmm->lmm_stripe_size));
+		goto invalid;
+	}
+
+	return 0;
+
+invalid:
+	lov_dump_lmm_common(D_WARNING, lmm);
+	return -EINVAL;
+}
+
+/*
+ * Allocate a stripe MD with room for @stripe_count lov_oinfo pointers and
+ * one slab-allocated lov_oinfo per stripe.
+ *
+ * *size is set to the byte size of the lov_stripe_md allocation (header
+ * plus pointer array), including when the function fails.  Returns the
+ * new lsm with lsm_stripe_count filled in, or NULL on allocation failure
+ * after releasing whatever was already allocated.
+ */
+struct lov_stripe_md *lsm_alloc_plain(__u16 stripe_count, int *size)
+{
+	struct lov_stripe_md *lsm;
+	struct lov_oinfo     *loi;
+	int		      ptrs_size;
+	int		      i;
+
+	LASSERT(stripe_count <= LOV_MAX_STRIPE_COUNT);
+
+	ptrs_size = sizeof(struct lov_oinfo *) * stripe_count;
+	*size = sizeof(struct lov_stripe_md) + ptrs_size;
+
+	OBD_ALLOC_LARGE(lsm, *size);
+	if (lsm == NULL)
+		return NULL;
+
+	for (i = 0; i < stripe_count; i++) {
+		OBD_SLAB_ALLOC_PTR_GFP(loi, lov_oinfo_slab, __GFP_IO);
+		if (loi == NULL)
+			goto err;
+		lsm->lsm_oinfo[i] = loi;
+	}
+	lsm->lsm_stripe_count = stripe_count;
+	return lsm;
+
+err:
+	/* unwind the stripes allocated before the failing one */
+	for (i--; i >= 0; i--)
+		OBD_SLAB_FREE(lsm->lsm_oinfo[i], lov_oinfo_slab,
+			      sizeof(struct lov_oinfo));
+	OBD_FREE_LARGE(lsm, *size);
+	return NULL;
+}
+
+/*
+ * Free a stripe MD allocated by lsm_alloc_plain(): every per-stripe
+ * lov_oinfo goes back to lov_oinfo_slab, then the lsm itself (header plus
+ * pointer array) is freed.
+ */
+void lsm_free_plain(struct lov_stripe_md *lsm)
+{
+	const __u16 count = lsm->lsm_stripe_count;
+	int idx = 0;
+
+	while (idx < count) {
+		OBD_SLAB_FREE(lsm->lsm_oinfo[idx], lov_oinfo_slab,
+			      sizeof(struct lov_oinfo));
+		idx++;
+	}
+	OBD_FREE_LARGE(lsm, sizeof(struct lov_stripe_md) +
+		       count * sizeof(struct lov_oinfo *));
+}
+
+/*
+ * Copy the fields common to both on-disk layouts into the in-memory
+ * stripe MD, converting from little-endian, and reset the pool name to
+ * the empty string.
+ */
+static void lsm_unpackmd_common(struct lov_stripe_md *lsm,
+				struct lov_mds_md *lmm)
+{
+	/*
+	 * This relies on the leading fields of lov_mds_md_v1 and
+	 * lov_mds_md_v3 being the same.
+	 */
+	lsm->lsm_pool_name[0] = '\0';
+	lsm->lsm_pattern = le32_to_cpu(lmm->lmm_pattern);
+	lsm->lsm_stripe_size = le32_to_cpu(lmm->lmm_stripe_size);
+	lsm->lsm_layout_gen = le16_to_cpu(lmm->lmm_layout_gen);
+	lmm_oi_le_to_cpu(&lsm->lsm_oi, &lmm->lmm_oi);
+}
+
+/*
+ * lsm_stripe_by_index method for plain layouts: if @swidth is given,
+ * store the stripe width (stripe size times stripe count) in it.
+ * @stripeno and @lov_off are left untouched.
+ */
+static void
+lsm_stripe_by_index_plain(struct lov_stripe_md *lsm, int *stripeno,
+			   obd_off *lov_off, obd_off *swidth)
+{
+	if (swidth == NULL)
+		return;
+
+	*swidth = (obd_off)lsm->lsm_stripe_size * lsm->lsm_stripe_count;
+}
+
+/*
+ * lsm_stripe_by_offset method for plain layouts: if @swidth is given,
+ * store the stripe width (stripe size times stripe count) in it.
+ * @stripeno and @lov_off are left untouched.
+ */
+static void
+lsm_stripe_by_offset_plain(struct lov_stripe_md *lsm, int *stripeno,
+			   obd_off *lov_off, obd_off *swidth)
+{
+	if (swidth == NULL)
+		return;
+
+	*swidth = (obd_off)lsm->lsm_stripe_size * lsm->lsm_stripe_count;
+}
+
+/* lsm_destroy method for plain layouts: does nothing and returns 0. */
+static int lsm_destroy_plain(struct lov_stripe_md *lsm, struct obdo *oa,
+			     struct obd_export *md_exp)
+{
+	return 0;
+}
+
+/*
+ * Fold one target into the running minimum of per-stripe maxbytes.
+ *
+ * If the target's import is in LUSTRE_IMP_FULL state and reports a
+ * non-zero ocd_maxbytes under OBD_CONNECT_MAXBYTES, *stripe_maxbytes is
+ * lowered to that value when it is smaller.  For a target with no import,
+ * an inactive target, or any other import state, *stripe_maxbytes is set
+ * to LUSTRE_STRIPE_MAXBYTES, overwriting the value passed in.
+ */
+static void lov_tgt_maxbytes(struct lov_tgt_desc *tgt, __u64 *stripe_maxbytes)
+{
+	struct obd_import *imp = tgt->ltd_obd->u.cli.cl_import;
+
+	if (imp == NULL || !tgt->ltd_active) {
+		*stripe_maxbytes = LUSTRE_STRIPE_MAXBYTES;
+		return;
+	}
+
+	spin_lock(&imp->imp_lock);
+	if (imp->imp_state != LUSTRE_IMP_FULL ||
+	    !(imp->imp_connect_data.ocd_connect_flags & OBD_CONNECT_MAXBYTES) ||
+	    !(imp->imp_connect_data.ocd_maxbytes > 0))
+		*stripe_maxbytes = LUSTRE_STRIPE_MAXBYTES;
+	else if (*stripe_maxbytes > imp->imp_connect_data.ocd_maxbytes)
+		*stripe_maxbytes = imp->imp_connect_data.ocd_maxbytes;
+	spin_unlock(&imp->imp_lock);
+}
+
+/*
+ * Verify a v1 LOV EA of @lmm_bytes bytes.
+ *
+ * The buffer must hold at least the fixed lov_mds_md_v1 header and then
+ * at least lov_mds_md_size() bytes for the stripe count it declares.
+ * The common layout checks are applied last.  *stripe_count is written
+ * once the header-size check has passed.
+ *
+ * Returns 0 if the EA is acceptable, -EINVAL otherwise.
+ */
+static int lsm_lmm_verify_v1(struct lov_mds_md_v1 *lmm, int lmm_bytes,
+			     __u16 *stripe_count)
+{
+	/*
+	 * Compare as signed: a bare "int < size_t" comparison converts a
+	 * negative lmm_bytes into a huge unsigned value that would pass the
+	 * check and let the header be read from a bogus buffer.
+	 */
+	if (lmm_bytes < (int)sizeof(*lmm)) {
+		CERROR("lov_mds_md_v1 too small: %d, need at least %d\n",
+		       lmm_bytes, (int)sizeof(*lmm));
+		return -EINVAL;
+	}
+
+	*stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
+
+	if (lmm_bytes < lov_mds_md_size(*stripe_count, LOV_MAGIC_V1)) {
+		CERROR("LOV EA V1 too small: %d, need %d\n",
+		       lmm_bytes, lov_mds_md_size(*stripe_count, LOV_MAGIC_V1));
+		lov_dump_lmm_common(D_WARNING, lmm);
+		return -EINVAL;
+	}
+
+	return lsm_lmm_verify_common(lmm, lmm_bytes, *stripe_count);
+}
+
+/*
+ * Unpack a v1 on-disk EA into the in-memory stripe MD.
+ *
+ * After the common header fields, every stripe's object id, OST index and
+ * OST generation are converted to CPU byte order.  Each OST index must be
+ * below the LOV's target count and refer to a present target; otherwise
+ * the EA is dumped at D_WARNING and -EINVAL is returned.
+ *
+ * lsm_maxbytes is set to the value accumulated by lov_tgt_maxbytes() over
+ * all stripes, multiplied by the stripe count.  Returns 0 on success.
+ */
+int lsm_unpackmd_v1(struct lov_obd *lov, struct lov_stripe_md *lsm,
+		    struct lov_mds_md_v1 *lmm)
+{
+	__u64 stripe_maxbytes = OBD_OBJECT_EOF;
+	struct lov_oinfo *loi;
+	int i;
+
+	lsm_unpackmd_common(lsm, lmm);
+
+	for (i = 0; i < lsm->lsm_stripe_count; i++) {
+		/* XXX LOV STACKING call down to osc_unpackmd() */
+		loi = lsm->lsm_oinfo[i];
+		ostid_le_to_cpu(&lmm->lmm_objects[i].l_ost_oi, &loi->loi_oi);
+		loi->loi_ost_idx = le32_to_cpu(lmm->lmm_objects[i].l_ost_idx);
+		loi->loi_ost_gen = le32_to_cpu(lmm->lmm_objects[i].l_ost_gen);
+
+		if (loi->loi_ost_idx >= lov->desc.ld_tgt_count) {
+			CERROR("OST index %d more than OST count %d\n",
+			       loi->loi_ost_idx, lov->desc.ld_tgt_count);
+			goto bad_ea;
+		}
+		if (!lov->lov_tgts[loi->loi_ost_idx]) {
+			CERROR("OST index %d missing\n", loi->loi_ost_idx);
+			goto bad_ea;
+		}
+
+		/* calculate the minimum stripe max bytes */
+		lov_tgt_maxbytes(lov->lov_tgts[loi->loi_ost_idx],
+				 &stripe_maxbytes);
+	}
+
+	lsm->lsm_maxbytes = stripe_maxbytes * lsm->lsm_stripe_count;
+	return 0;
+
+bad_ea:
+	lov_dump_lmm_v1(D_WARNING, lmm);
+	return -EINVAL;
+}
+
+/*
+ * Layout operations for v1 EAs.  Shares the "plain" free/destroy/stripe
+ * helpers with v3 and differs only in the verify and unpack methods.
+ */
+const struct lsm_operations lsm_v1_ops = {
+	.lsm_free	    = lsm_free_plain,
+	.lsm_destroy	 = lsm_destroy_plain,
+	.lsm_stripe_by_index    = lsm_stripe_by_index_plain,
+	.lsm_stripe_by_offset   = lsm_stripe_by_offset_plain,
+	.lsm_lmm_verify	 = lsm_lmm_verify_v1,
+	.lsm_unpackmd	   = lsm_unpackmd_v1,
+};
+
+/*
+ * Verify a v3 LOV EA of @lmm_bytes bytes.  The buffer arrives typed as
+ * the common lov_mds_md and is reinterpreted as lov_mds_md_v3.
+ *
+ * The buffer must hold at least the fixed lov_mds_md_v3 header and then
+ * at least lov_mds_md_size() bytes for the stripe count it declares.
+ * The common layout checks are applied last.  *stripe_count is written
+ * once the header-size check has passed.
+ *
+ * Returns 0 if the EA is acceptable, -EINVAL otherwise.
+ */
+static int lsm_lmm_verify_v3(struct lov_mds_md *lmmv1, int lmm_bytes,
+			     __u16 *stripe_count)
+{
+	struct lov_mds_md_v3 *lmm;
+
+	lmm = (struct lov_mds_md_v3 *)lmmv1;
+
+	/*
+	 * Compare as signed: a bare "int < size_t" comparison converts a
+	 * negative lmm_bytes into a huge unsigned value that would pass the
+	 * check and let the header be read from a bogus buffer.
+	 */
+	if (lmm_bytes < (int)sizeof(*lmm)) {
+		CERROR("lov_mds_md_v3 too small: %d, need at least %d\n",
+		       lmm_bytes, (int)sizeof(*lmm));
+		return -EINVAL;
+	}
+
+	*stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
+
+	if (lmm_bytes < lov_mds_md_size(*stripe_count, LOV_MAGIC_V3)) {
+		CERROR("LOV EA V3 too small: %d, need %d\n",
+		       lmm_bytes, lov_mds_md_size(*stripe_count, LOV_MAGIC_V3));
+		lov_dump_lmm_common(D_WARNING, lmm);
+		return -EINVAL;
+	}
+
+	return lsm_lmm_verify_common((struct lov_mds_md_v1 *)lmm, lmm_bytes,
+				     *stripe_count);
+}
+
+/*
+ * Unpack a v3 on-disk EA into the in-memory stripe MD.
+ *
+ * Same as the v1 unpack, with one addition: the pool name is copied into
+ * lsm_pool_name, and -E2BIG is returned if it does not fit.
+ *
+ * Each stripe's OST index must be below the LOV's target count and refer
+ * to a present target; otherwise the EA is dumped at D_WARNING and
+ * -EINVAL is returned.  Returns 0 on success.
+ */
+int lsm_unpackmd_v3(struct lov_obd *lov, struct lov_stripe_md *lsm,
+		    struct lov_mds_md *lmmv1)
+{
+	struct lov_mds_md_v3 *lmm = (struct lov_mds_md_v3 *)lmmv1;
+	__u64 stripe_maxbytes = OBD_OBJECT_EOF;
+	struct lov_oinfo *loi;
+	int cplen = 0;
+	int i;
+
+	lsm_unpackmd_common(lsm, (struct lov_mds_md_v1 *)lmm);
+	cplen = strlcpy(lsm->lsm_pool_name, lmm->lmm_pool_name,
+			sizeof(lsm->lsm_pool_name));
+	if (cplen >= sizeof(lsm->lsm_pool_name))
+		return -E2BIG;
+
+	for (i = 0; i < lsm->lsm_stripe_count; i++) {
+		/* XXX LOV STACKING call down to osc_unpackmd() */
+		loi = lsm->lsm_oinfo[i];
+		ostid_le_to_cpu(&lmm->lmm_objects[i].l_ost_oi, &loi->loi_oi);
+		loi->loi_ost_idx = le32_to_cpu(lmm->lmm_objects[i].l_ost_idx);
+		loi->loi_ost_gen = le32_to_cpu(lmm->lmm_objects[i].l_ost_gen);
+
+		if (loi->loi_ost_idx >= lov->desc.ld_tgt_count) {
+			CERROR("OST index %d more than OST count %d\n",
+			       loi->loi_ost_idx, lov->desc.ld_tgt_count);
+			goto bad_ea;
+		}
+		if (!lov->lov_tgts[loi->loi_ost_idx]) {
+			CERROR("OST index %d missing\n", loi->loi_ost_idx);
+			goto bad_ea;
+		}
+
+		/* calculate the minimum stripe max bytes */
+		lov_tgt_maxbytes(lov->lov_tgts[loi->loi_ost_idx],
+				 &stripe_maxbytes);
+	}
+
+	lsm->lsm_maxbytes = stripe_maxbytes * lsm->lsm_stripe_count;
+	return 0;
+
+bad_ea:
+	lov_dump_lmm_v3(D_WARNING, lmm);
+	return -EINVAL;
+}
+
+/*
+ * Layout operations for v3 EAs.  Shares the "plain" free/destroy/stripe
+ * helpers with v1 and differs only in the verify and unpack methods.
+ */
+const struct lsm_operations lsm_v3_ops = {
+	.lsm_free	    = lsm_free_plain,
+	.lsm_destroy	 = lsm_destroy_plain,
+	.lsm_stripe_by_index    = lsm_stripe_by_index_plain,
+	.lsm_stripe_by_offset   = lsm_stripe_by_offset_plain,
+	.lsm_lmm_verify	 = lsm_lmm_verify_v3,
+	.lsm_unpackmd	   = lsm_unpackmd_v3,
+};

diff --git a/drivers/staging/lustre/lustre/lov/lov_internal.h b/drivers/staging/lustre/lustre/lov/lov_internal.h
new file mode 100644
index 0000000..16770d14
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lov/lov_internal.h

@@ -0,0 +1,323 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef LOV_INTERNAL_H
+#define LOV_INTERNAL_H
+
+#include <obd_class.h>
+#include <obd_lov.h>
+#include <lustre/lustre_user.h>
+
+/*
+ * Refcounted, handle-hashed container for per-stripe lock handles.
+ * llh_handles[] is a trailing array with llh_stripe_count entries (see
+ * the size computed in lov_llh_put()).
+ */
+struct lov_lock_handles {
+	struct portals_handle   llh_handle;
+	atomic_t	    llh_refcount;
+	int		     llh_stripe_count;
+	struct lustre_handle    llh_handles[0];
+};
+
+/*
+ * One request within a lov_request_set: rq_rqset points back to the
+ * owning set and rq_link is the list linkage.
+ */
+struct lov_request {
+	struct obd_info	  rq_oi;
+	struct lov_request_set  *rq_rqset;
+
+	struct list_head	       rq_link;
+
+	int		      rq_idx;	/* index in lov->tgts array */
+	int		      rq_stripe;     /* stripe number */
+	int		      rq_complete;
+	int		      rq_rc;
+	int		      rq_buflen;     /* length of sub_md */
+
+	obd_count		rq_oabufs;
+	obd_count		rq_pgaidx;
+};
+
+/*
+ * A refcounted set of lov_request.  set_refcount is managed with
+ * lov_get_reqset()/lov_put_reqset(); dropping the last reference calls
+ * lov_finish_set().
+ */
+struct lov_request_set {
+	struct ldlm_enqueue_info	*set_ei;
+	struct obd_info			*set_oi;
+	atomic_t			set_refcount;
+	struct obd_export		*set_exp;
+	/* XXX: There is @set_exp already, however obd_statfs gets obd_device
+	   only. */
+	struct obd_device		*set_obd;
+	int				set_count;
+	atomic_t			set_completes;
+	atomic_t			set_success;
+	atomic_t			set_finish_checked;
+	struct llog_cookie		*set_cookies;
+	int				set_cookie_sent;
+	struct obd_trans_info		*set_oti;
+	obd_count			set_oabufs;
+	struct brw_page			*set_pga;
+	struct lov_lock_handles		*set_lockh;
+	struct list_head			set_list;
+	wait_queue_head_t			set_waitq;
+	spinlock_t			set_lock;
+};
+
+/* Slab cache that lsm_alloc_plain()/lsm_free_plain() use for lov_oinfo. */
+extern struct kmem_cache *lov_oinfo_slab;
+
+/* Called by lov_put_reqset() when the last reference to @set is dropped. */
+void lov_finish_set(struct lov_request_set *set);
+
+/*
+ * Take an extra reference on @set.  The set must be non-NULL and already
+ * hold at least one reference.
+ */
+static inline void lov_get_reqset(struct lov_request_set *set)
+{
+	LASSERT(set != NULL);
+	LASSERT(atomic_read(&set->set_refcount) > 0);
+	atomic_inc(&set->set_refcount);
+}
+
+/*
+ * Drop a reference on @set; the last put hands the set to
+ * lov_finish_set().
+ */
+static inline void lov_put_reqset(struct lov_request_set *set)
+{
+	if (atomic_dec_and_test(&set->set_refcount))
+		lov_finish_set(set);
+}
+
+/*
+ * Look up the lov_lock_handles registered under @handle's cookie via
+ * class_handle2object().  @handle must not be NULL.
+ */
+static inline struct lov_lock_handles *
+lov_handle2llh(struct lustre_handle *handle)
+{
+	LASSERT(handle != NULL);
+
+	return class_handle2object(handle->cookie);
+}
+
+/*
+ * Drop a reference on @llh.  When the count reaches zero the handle is
+ * unhashed and, unless another thread has taken a reference in the
+ * meantime, the structure (header plus llh_stripe_count handles) is
+ * freed through OBD_FREE_RCU().
+ */
+static inline void lov_llh_put(struct lov_lock_handles *llh)
+{
+	CDEBUG(D_INFO, "PUTting llh %p : new refcount %d\n", llh,
+	       atomic_read(&llh->llh_refcount) - 1);
+	LASSERT(atomic_read(&llh->llh_refcount) > 0 &&
+		atomic_read(&llh->llh_refcount) < 0x5a5a);
+
+	if (!atomic_dec_and_test(&llh->llh_refcount))
+		return;
+
+	class_handle_unhash(&llh->llh_handle);
+	/* The structure may be held by other threads because RCU.
+	 *   -jxiong */
+	if (atomic_read(&llh->llh_refcount))
+		return;
+
+	OBD_FREE_RCU(llh, sizeof *llh +
+		     sizeof(*llh->llh_handles) * llh->llh_stripe_count,
+		     &llh->llh_handle);
+}
+
+/*
+ * UUID string of target @index of LOV @lv.  The lov_tgts[index] slot is
+ * dereferenced without a NULL check, so it must be populated.
+ */
+#define lov_uuid2str(lv, index) \
+	(char *)((lv)->lov_tgts[index]->ltd_uuid.uuid)
+
+/*
+ * lov_merge.c
+ * NOTE(review): presumably per-stripe attribute/LVB merging and KMS
+ * adjustment, judging by the names -- confirm against lov_merge.c.
+ */
+void lov_merge_attrs(struct obdo *tgt, struct obdo *src, obd_valid valid,
+		     struct lov_stripe_md *lsm, int stripeno, int *set);
+int lov_merge_lvb(struct obd_export *exp, struct lov_stripe_md *lsm,
+		  struct ost_lvb *lvb, int kms_only);
+int lov_adjust_kms(struct obd_export *exp, struct lov_stripe_md *lsm,
+		   obd_off size, int shrink);
+int lov_merge_lvb_kms(struct lov_stripe_md *lsm,
+		      struct ost_lvb *lvb, __u64 *kms_place);
+
+/*
+ * lov_offset.c
+ * NOTE(review): presumably conversions between file (LOV) offsets/sizes
+ * and per-stripe object offsets/sizes -- confirm against lov_offset.c.
+ */
+obd_size lov_stripe_size(struct lov_stripe_md *lsm, obd_size ost_size,
+			 int stripeno);
+int lov_stripe_offset(struct lov_stripe_md *lsm, obd_off lov_off,
+		      int stripeno, obd_off *obd_off);
+obd_off lov_size_to_stripe(struct lov_stripe_md *lsm, obd_off file_size,
+			   int stripeno);
+int lov_stripe_intersects(struct lov_stripe_md *lsm, int stripeno,
+			  obd_off start, obd_off end,
+			  obd_off *obd_start, obd_off *obd_end);
+int lov_stripe_number(struct lov_stripe_md *lsm, obd_off lov_off);
+
+/*
+ * lov_qos.c
+ * NOTE(review): presumably target bookkeeping and stripe allocation
+ * policy -- confirm against lov_qos.c.
+ */
+#define LOV_USES_ASSIGNED_STRIPE	0
+#define LOV_USES_DEFAULT_STRIPE	 1
+int qos_add_tgt(struct obd_device *obd, __u32 index);
+int qos_del_tgt(struct obd_device *obd, struct lov_tgt_desc *tgt);
+void qos_shrink_lsm(struct lov_request_set *set);
+int qos_prep_create(struct obd_export *exp, struct lov_request_set *set);
+void qos_update(struct lov_obd *lov);
+void qos_statfs_done(struct lov_obd *lov);
+void qos_statfs_update(struct obd_device *obd, __u64 max_age, int wait);
+int qos_remedy_create(struct lov_request_set *set, struct lov_request *req);
+
+/*
+ * lov_request.c
+ *
+ * Request-set helpers.  Most operations come as a lov_prep_<op>_set()
+ * taking a struct lov_request_set ** (presumably to return the new set)
+ * and a matching lov_fini_<op>_set() taking the set back.
+ * NOTE(review): the ownership/refcount contract lives in lov_request.c,
+ * not here -- confirm before relying on it.
+ */
+void lov_set_add_req(struct lov_request *req, struct lov_request_set *set);
+int lov_set_finished(struct lov_request_set *set, int idempotent);
+void lov_update_set(struct lov_request_set *set,
+		    struct lov_request *req, int rc);
+int lov_update_common_set(struct lov_request_set *set,
+			  struct lov_request *req, int rc);
+int lov_check_and_wait_active(struct lov_obd *lov, int ost_idx);
+int lov_prep_create_set(struct obd_export *exp, struct obd_info *oifo,
+			struct lov_stripe_md **ea, struct obdo *src_oa,
+			struct obd_trans_info *oti,
+			struct lov_request_set **reqset);
+int cb_create_update(void *cookie, int rc);
+int lov_fini_create_set(struct lov_request_set *set, struct lov_stripe_md **ea);
+int lov_prep_brw_set(struct obd_export *exp, struct obd_info *oinfo,
+		     obd_count oa_bufs, struct brw_page *pga,
+		     struct obd_trans_info *oti,
+		     struct lov_request_set **reqset);
+int lov_fini_brw_set(struct lov_request_set *set);
+int lov_prep_getattr_set(struct obd_export *exp, struct obd_info *oinfo,
+			 struct lov_request_set **reqset);
+int lov_fini_getattr_set(struct lov_request_set *set);
+int lov_prep_destroy_set(struct obd_export *exp, struct obd_info *oinfo,
+			 struct obdo *src_oa, struct lov_stripe_md *lsm,
+			 struct obd_trans_info *oti,
+			 struct lov_request_set **reqset);
+int lov_update_destroy_set(struct lov_request_set *set,
+			   struct lov_request *req, int rc);
+int lov_fini_destroy_set(struct lov_request_set *set);
+int lov_prep_setattr_set(struct obd_export *exp, struct obd_info *oinfo,
+			 struct obd_trans_info *oti,
+			 struct lov_request_set **reqset);
+int lov_update_setattr_set(struct lov_request_set *set,
+			   struct lov_request *req, int rc);
+int lov_fini_setattr_set(struct lov_request_set *set);
+int lov_prep_punch_set(struct obd_export *exp, struct obd_info *oinfo,
+		       struct obd_trans_info *oti,
+		       struct lov_request_set **reqset);
+int lov_fini_punch_set(struct lov_request_set *set);
+int lov_prep_sync_set(struct obd_export *exp, struct obd_info *obd_info,
+		      obd_off start, obd_off end,
+		      struct lov_request_set **reqset);
+int lov_fini_sync_set(struct lov_request_set *set);
+int lov_prep_enqueue_set(struct obd_export *exp, struct obd_info *oinfo,
+			 struct ldlm_enqueue_info *einfo,
+			 struct lov_request_set **reqset);
+int lov_fini_enqueue_set(struct lov_request_set *set, __u32 mode, int rc,
+			 struct ptlrpc_request_set *rqset);
+int lov_prep_match_set(struct obd_export *exp, struct obd_info *oinfo,
+		       struct lov_stripe_md *lsm,
+		       ldlm_policy_data_t *policy, __u32 mode,
+		       struct lustre_handle *lockh,
+		       struct lov_request_set **reqset);
+int lov_fini_match_set(struct lov_request_set *set, __u32 mode, int flags);
+int lov_prep_cancel_set(struct obd_export *exp, struct obd_info *oinfo,
+			struct lov_stripe_md *lsm,
+			__u32 mode, struct lustre_handle *lockh,
+			struct lov_request_set **reqset);
+int lov_fini_cancel_set(struct lov_request_set *set);
+int lov_prep_statfs_set(struct obd_device *obd, struct obd_info *oinfo,
+			struct lov_request_set **reqset);
+void lov_update_statfs(struct obd_statfs *osfs, struct obd_statfs *lov_sfs,
+		       int success);
+int lov_fini_statfs(struct obd_device *obd, struct obd_statfs *osfs,
+		    int success);
+int lov_fini_statfs_set(struct lov_request_set *set);
+int lov_statfs_interpret(struct ptlrpc_request_set *rqset, void *data, int rc);
+
+/*
+ * lov_obd.c
+ *
+ * lov_setup(), lov_process_config_base() and lov_del_target() are the
+ * entry points the cl device code calls: lov_device_alloc() runs
+ * lov_setup(), and lov_process_config() runs lov_process_config_base()
+ * and rolls back a failed add with lov_del_target().
+ */
+void lov_fix_desc(struct lov_desc *desc);
+void lov_fix_desc_stripe_size(__u64 *val);
+void lov_fix_desc_stripe_count(__u32 *val);
+void lov_fix_desc_pattern(__u32 *val);
+void lov_fix_desc_qos_maxage(__u32 *val);
+__u16 lov_get_stripecnt(struct lov_obd *lov, __u32 magic, __u16 stripe_count);
+int lov_connect_obd(struct obd_device *obd, __u32 index, int activate,
+		    struct obd_connect_data *data);
+int lov_setup(struct obd_device *obd, struct lustre_cfg *lcfg);
+int lov_process_config_base(struct obd_device *obd, struct lustre_cfg *lcfg,
+			    __u32 *indexp, int *genp);
+int lov_del_target(struct obd_device *obd, __u32 index,
+		   struct obd_uuid *uuidp, int gen);
+/*
+ * lov_log.c
+ * NOTE(review): presumably llog setup/teardown for the LOV -- confirm
+ * against lov_log.c.
+ */
+int lov_llog_init(struct obd_device *obd, struct obd_llog_group *olg,
+		  struct obd_device *tgt, int *idx);
+int lov_llog_finish(struct obd_device *obd, int count);
+
+/*
+ * lov_pack.c
+ *
+ * NOTE(review): presumably conversion between the on-disk/wire EA
+ * (struct lov_mds_md), the user-visible form (struct lov_user_md) and
+ * the in-memory struct lov_stripe_md -- confirm against lov_pack.c.
+ * The lov_dump_lmm*() helpers take a debug level; lov_ea.c calls them
+ * with D_WARNING when it rejects an EA.
+ */
+int lov_packmd(struct obd_export *exp, struct lov_mds_md **lmm,
+	       struct lov_stripe_md *lsm);
+int lov_unpackmd(struct obd_export *exp, struct lov_stripe_md **lsmp,
+		 struct lov_mds_md *lmm, int lmm_bytes);
+int lov_setstripe(struct obd_export *exp, int max_lmm_size,
+		  struct lov_stripe_md **lsmp, struct lov_user_md *lump);
+int lov_setea(struct obd_export *exp, struct lov_stripe_md **lsmp,
+	      struct lov_user_md *lump);
+int lov_getstripe(struct obd_export *exp,
+		  struct lov_stripe_md *lsm, struct lov_user_md *lump);
+int lov_alloc_memmd(struct lov_stripe_md **lsmp, __u16 stripe_count,
+		    int pattern, int magic);
+int lov_free_memmd(struct lov_stripe_md **lsmp);
+
+void lov_dump_lmm_v1(int level, struct lov_mds_md_v1 *lmm);
+void lov_dump_lmm_v3(int level, struct lov_mds_md_v3 *lmm);
+void lov_dump_lmm_common(int level, void *lmmp);
+void lov_dump_lmm(int level, void *lmm);
+
+/*
+ * lov_ea.c
+ *
+ * lsm_alloc_plain() allocates a stripe MD with @stripe_count lov_oinfo
+ * entries, stores the byte size of the lsm allocation in *size and
+ * returns NULL on allocation failure.  lsm_free_plain() releases such
+ * an lsm.
+ *
+ * NOTE(review): lov_ea.c as added alongside this header contains no
+ * definition of lovea_destroy_object() -- confirm it is defined elsewhere
+ * or drop the declaration.
+ */
+struct lov_stripe_md *lsm_alloc_plain(__u16 stripe_count, int *size);
+void lsm_free_plain(struct lov_stripe_md *lsm);
+
+int lovea_destroy_object(struct lov_obd *lov, struct lov_stripe_md *lsm,
+			 struct obdo *oa, void *data);
+/*
+ * lproc_lov.c
+ *
+ * Without LPROCFS, lprocfs_lov_init_vars() is an inline stub that only
+ * zeroes *lvars.
+ */
+extern struct file_operations lov_proc_target_fops;
+#ifdef LPROCFS
+void lprocfs_lov_init_vars(struct lprocfs_static_vars *lvars);
+#else
+static inline void lprocfs_lov_init_vars(struct lprocfs_static_vars *lvars)
+{
+	memset(lvars, 0, sizeof(*lvars));
+}
+#endif
+
+/* lov_cl.c: the exported LOV lu_device_type (defined with its ops table). */
+extern struct lu_device_type lov_device_type;
+
+/*
+ * pools
+ * NOTE(review): semantics below are inferred from the names only --
+ * confirm against the pool implementation.
+ */
+extern cfs_hash_ops_t pool_hash_operations;
+/* ost_pool methods */
+int lov_ost_pool_init(struct ost_pool *op, unsigned int count);
+int lov_ost_pool_extend(struct ost_pool *op, unsigned int min_count);
+int lov_ost_pool_add(struct ost_pool *op, __u32 idx, unsigned int min_count);
+int lov_ost_pool_remove(struct ost_pool *op, __u32 idx);
+int lov_ost_pool_free(struct ost_pool *op);
+
+/* high level pool methods */
+int lov_pool_new(struct obd_device *obd, char *poolname);
+int lov_pool_del(struct obd_device *obd, char *poolname);
+int lov_pool_add(struct obd_device *obd, char *poolname, char *ostname);
+int lov_pool_remove(struct obd_device *obd, char *poolname, char *ostname);
+void lov_dump_pool(int level, struct pool_desc *pool);
+struct pool_desc *lov_find_pool(struct lov_obd *lov, char *poolname);
+int lov_check_index_in_pool(__u32 idx, struct pool_desc *pool);
+void lov_pool_putref(struct pool_desc *pool);
+
+/*
+ * Take an extra reference on @lsm and return it.  The lsm must already
+ * hold at least one reference.
+ */
+static inline struct lov_stripe_md *lsm_addref(struct lov_stripe_md *lsm)
+{
+	LASSERT(atomic_read(&lsm->lsm_refc) > 0);
+	atomic_inc(&lsm->lsm_refc);
+	return lsm;
+}
+
+#endif

diff --git a/drivers/staging/lustre/lustre/lov/lov_io.c b/drivers/staging/lustre/lustre/lov/lov_io.c
new file mode 100644
index 0000000..1a87abd
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lov/lov_io.c

@@ -0,0 +1,967 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_io for LOV layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ *   Author: Jinshan Xiong <jinshan.xiong@whamcloud.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include "lov_cl_internal.h"
+
+/** \addtogroup lov
+ *  @{
+ */
+
+/** Bumps the re-entrance counter of \a sub. */
+static inline void lov_sub_enter(struct lov_io_sub *sub)
+{
+	sub->sub_reenter += 1;
+}
+/** Drops the re-entrance counter of \a sub by one. */
+static inline void lov_sub_exit(struct lov_io_sub *sub)
+{
+	sub->sub_reenter -= 1;
+}
+
+/**
+ * Releases whatever \a sub currently holds: finalizes the sub-io if it was
+ * initialized, detaches (and, when owned, frees) the cl_io storage, and
+ * detaches (and, when owned, puts) the sub-environment.
+ *
+ * Resources marked sub_borrowed are never freed or put here.
+ */
+static void lov_io_sub_fini(const struct lu_env *env, struct lov_io *lio,
+			    struct lov_io_sub *sub)
+{
+	ENTRY;
+
+	if (sub->sub_io != NULL) {
+		if (sub->sub_io_initialized) {
+			lov_sub_enter(sub);
+			cl_io_fini(sub->sub_env, sub->sub_io);
+			lov_sub_exit(sub);
+
+			lio->lis_active_subios--;
+			sub->sub_io_initialized = 0;
+		}
+
+		if (lio->lis_single_subio_index == sub->sub_stripe) {
+			/* this stripe holds the single-subio slot: release it */
+			lio->lis_single_subio_index = -1;
+		} else if (!sub->sub_borrowed) {
+			OBD_FREE_PTR(sub->sub_io);
+		}
+		sub->sub_io = NULL;
+	}
+
+	/* an ERR_PTR() left behind by a failed cl_env_get() is not touched */
+	if (!IS_ERR(sub->sub_env) && sub->sub_env != NULL) {
+		if (!sub->sub_borrowed)
+			cl_env_put(sub->sub_env, &sub->sub_refcheck);
+		sub->sub_env = NULL;
+	}
+
+	EXIT;
+}
+
+/**
+ * Copies into the sub-io \a io those parameters of the top-level io of
+ * \a lio that are relevant for io->ci_type, converting file offsets into
+ * offsets within stripe \a stripe where the io type needs it.
+ *
+ * \a start and \a end are used for CIT_FSYNC and for non-append
+ * CIT_READ/CIT_WRITE only; the other io types ignore them.
+ */
+static void lov_io_sub_inherit(struct cl_io *io, struct lov_io *lio,
+			       int stripe, loff_t start, loff_t end)
+{
+	struct cl_io	     *top = lio->lis_cl.cis_io;
+	struct lov_stripe_md *lsm = lio->lis_object->lo_lsm;
+
+	switch (io->ci_type) {
+	case CIT_SETATTR:
+		io->u.ci_setattr.sa_attr  = top->u.ci_setattr.sa_attr;
+		io->u.ci_setattr.sa_valid = top->u.ci_setattr.sa_valid;
+		io->u.ci_setattr.sa_capa  = top->u.ci_setattr.sa_capa;
+		if (cl_io_is_trunc(io)) {
+			/* truncate: new size is expressed within the stripe */
+			loff_t size = top->u.ci_setattr.sa_attr.lvb_size;
+
+			size = lov_size_to_stripe(lsm, size, stripe);
+			io->u.ci_setattr.sa_attr.lvb_size = size;
+		}
+		break;
+	case CIT_FAULT: {
+		struct cl_object *top_obj = top->ci_obj;
+		loff_t pos = cl_offset(top_obj, top->u.ci_fault.ft_index);
+
+		io->u.ci_fault = top->u.ci_fault;
+		/* replace the copied page index by the in-stripe one */
+		pos = lov_size_to_stripe(lsm, pos, stripe);
+		io->u.ci_fault.ft_index = cl_index(top_obj, pos);
+		break;
+	}
+	case CIT_FSYNC:
+		io->u.ci_fsync.fi_start = start;
+		io->u.ci_fsync.fi_end   = end;
+		io->u.ci_fsync.fi_capa  = top->u.ci_fsync.fi_capa;
+		io->u.ci_fsync.fi_fid   = top->u.ci_fsync.fi_fid;
+		io->u.ci_fsync.fi_mode  = top->u.ci_fsync.fi_mode;
+		break;
+	case CIT_READ:
+	case CIT_WRITE:
+		io->u.ci_wr.wr_sync = cl_io_is_sync_write(top);
+		if (!cl_io_is_append(top)) {
+			io->u.ci_rw.crw_pos   = start;
+			io->u.ci_rw.crw_count = end - start;
+		} else {
+			io->u.ci_wr.wr_append = 1;
+		}
+		break;
+	default:
+		break;
+	}
+}
+
+/**
+ * Sets up the sub-io \a sub of \a lio for stripe sub->sub_stripe.
+ *
+ * When lio->lis_mem_frozen is set the environment and cl_io are borrowed
+ * from the per-device ld_emrg[] slot of the stripe (ld_mutex must be held).
+ * Otherwise a fresh environment is obtained and the cl_io is either the
+ * embedded lis_single_subio (when no sub-io is active yet) or allocated.
+ *
+ * \retval 0 on success; sub is marked initialized and counted as active
+ * \retval negative error code on failure; \a sub is cleaned up through
+ *         lov_io_sub_fini() before returning
+ */
+static int lov_io_sub_init(const struct lu_env *env, struct lov_io *lio,
+			   struct lov_io_sub *sub)
+{
+	struct lov_object *lov = lio->lis_object;
+	struct lov_device *ld  = lu2lov_dev(lov2cl(lov)->co_lu.lo_dev);
+	struct cl_io      *io  = lio->lis_cl.cis_io;
+	int stripe = sub->sub_stripe;
+	int result = 0;
+
+	LASSERT(sub->sub_io == NULL);
+	LASSERT(sub->sub_env == NULL);
+	LASSERT(sub->sub_stripe < lio->lis_stripe_count);
+	ENTRY;
+
+	sub->sub_borrowed = 0;
+	sub->sub_io_initialized = 0;
+
+	if (lio->lis_mem_frozen) {
+		LASSERT(mutex_is_locked(&ld->ld_mutex));
+		sub->sub_borrowed = 1;
+		sub->sub_env = ld->ld_emrg[stripe]->emrg_env;
+		sub->sub_io  = &ld->ld_emrg[stripe]->emrg_subio;
+	} else {
+		/* obtain new environment */
+		void *cookie = cl_env_reenter();
+
+		sub->sub_env = cl_env_get(&sub->sub_refcheck);
+		cl_env_reexit(cookie);
+
+		if (IS_ERR(sub->sub_env)) {
+			result = PTR_ERR(sub->sub_env);
+		} else if (lio->lis_active_subios == 0) {
+			/*
+			 * First sub-io. Use ->lis_single_subio to
+			 * avoid dynamic allocation.
+			 */
+			sub->sub_io = &lio->lis_single_subio;
+			lio->lis_single_subio_index = stripe;
+		} else {
+			OBD_ALLOC_PTR(sub->sub_io);
+			if (sub->sub_io == NULL)
+				result = -ENOMEM;
+		}
+	}
+
+	if (result == 0) {
+		struct cl_object *sub_obj;
+		struct cl_io     *sub_io = sub->sub_io;
+
+		sub_obj = lovsub2cl(lov_r0(lov)->lo_sub[stripe]);
+
+		sub_io->ci_obj        = sub_obj;
+		sub_io->ci_result     = 0;
+		sub_io->ci_parent     = io;
+		sub_io->ci_type       = io->ci_type;
+		sub_io->ci_lockreq    = io->ci_lockreq;
+		sub_io->ci_no_srvlock = io->ci_no_srvlock;
+
+		lov_sub_enter(sub);
+		result = cl_io_sub_init(sub->sub_env, sub_io,
+					io->ci_type, sub_obj);
+		lov_sub_exit(sub);
+		if (result >= 0) {
+			/* positive results are folded into success */
+			sub->sub_io_initialized = 1;
+			lio->lis_active_subios++;
+			result = 0;
+		}
+	}
+
+	if (result != 0)
+		lov_io_sub_fini(env, lio, sub);
+	RETURN(result);
+}
+
+/**
+ * Returns the sub-io of \a lio for stripe \a stripe, initializing it on
+ * first use, and enters it (see lov_sub_enter()).
+ *
+ * \retval the sub-io on success; pair with lov_sub_put()
+ * \retval ERR_PTR(rc) when lov_io_sub_init() fails with \a rc
+ */
+struct lov_io_sub *lov_sub_get(const struct lu_env *env,
+			       struct lov_io *lio, int stripe)
+{
+	struct lov_io_sub *sub = &lio->lis_subs[stripe];
+	int rc = 0;
+
+	LASSERT(stripe < lio->lis_stripe_count);
+	ENTRY;
+
+	if (!sub->sub_io_initialized) {
+		sub->sub_stripe = stripe;
+		rc = lov_io_sub_init(env, lio, sub);
+	}
+
+	if (rc != 0)
+		sub = ERR_PTR(rc);
+	else
+		lov_sub_enter(sub);
+	RETURN(sub);
+}
+
+/** Counterpart of lov_sub_get(): undoes the enter done there on \a sub. */
+void lov_sub_put(struct lov_io_sub *sub)
+{
+	sub->sub_reenter--;
+}
+
+/*****************************************************************************
+ *
+ * Lov io operations.
+ *
+ */
+
+/**
+ * Returns the lso_index of the lovsub object located, by
+ * lovsub_device_type, in the object header of the child page of \a page.
+ */
+static int lov_page_stripe(const struct cl_page *page)
+{
+	struct lovsub_object *subobj;
+	struct cl_object     *child_obj;
+
+	ENTRY;
+	child_obj = page->cp_child->cp_obj;
+	subobj = lu2lovsub(lu_object_locate(child_obj->co_lu.lo_header,
+					    &lovsub_device_type));
+	LASSERT(subobj != NULL);
+	RETURN(subobj->lso_index);
+}
+
+/**
+ * Gets (see lov_sub_get()) the sub-io of \a lio for the stripe that the
+ * page of \a slice belongs to.
+ *
+ * \retval the sub-io, or an ERR_PTR() propagated from lov_sub_get()
+ */
+struct lov_io_sub *lov_page_subio(const struct lu_env *env, struct lov_io *lio,
+				  const struct cl_page_slice *slice)
+{
+	struct cl_page       *page = slice->cpl_page;
+	struct lov_stripe_md *lsm  = lio->lis_object->lo_lsm;
+
+	LASSERT(lio->lis_cl.cis_io != NULL);
+	LASSERT(cl2lov(slice->cpl_obj) == lio->lis_object);
+	LASSERT(lsm != NULL);
+	LASSERT(lio->lis_nr_subios > 0);
+	ENTRY;
+
+	RETURN(lov_sub_get(env, lio, lov_page_stripe(page)));
+}
+
+
+
+/**
+ * Allocates the array of sub-io descriptors of \a lio, one per stripe, and
+ * resets the sub-io bookkeeping (no active sub-io, single-subio slot free).
+ *
+ * The array is sized by lio->lis_stripe_count, the same value that is
+ * stored in lis_nr_subios and later used to size the matching free in
+ * lov_io_fini().
+ *
+ * \retval 0 on success
+ * \retval -ENOMEM when the array cannot be allocated
+ */
+static int lov_io_subio_init(const struct lu_env *env, struct lov_io *lio,
+			     struct cl_io *io)
+{
+	int result;
+
+	/* checked before lio->lis_object is relied upon in any way */
+	LASSERT(lio->lis_object != NULL);
+	ENTRY;
+
+	/*
+	 * Need to be optimized, we can't afford to allocate a piece of memory
+	 * when writing a page. -jay
+	 */
+	OBD_ALLOC_LARGE(lio->lis_subs,
+			lio->lis_stripe_count * sizeof lio->lis_subs[0]);
+	if (lio->lis_subs != NULL) {
+		lio->lis_nr_subios = lio->lis_stripe_count;
+		lio->lis_single_subio_index = -1;
+		lio->lis_active_subios = 0;
+		result = 0;
+	} else
+		result = -ENOMEM;
+	RETURN(result);
+}
+
+/**
+ * Binds \a lio to \a obj and derives from \a io the file extent
+ * [lis_pos, lis_endpos) the io covers, depending on io->ci_type.
+ *
+ * Clears io->ci_result. Any io type not listed below hits LBUG().
+ */
+static void lov_io_slice_init(struct lov_io *lio,
+			      struct lov_object *obj, struct cl_io *io)
+{
+	ENTRY;
+
+	io->ci_result = 0;
+	lio->lis_object = obj;
+
+	LASSERT(obj->lo_lsm != NULL);
+	lio->lis_stripe_count = obj->lo_lsm->lsm_stripe_count;
+
+	switch (io->ci_type) {
+	case CIT_READ:
+	case CIT_WRITE:
+		lio->lis_endpos = io->u.ci_rw.crw_pos + io->u.ci_rw.crw_count;
+		lio->lis_io_endpos = lio->lis_endpos;
+		if (!cl_io_is_append(io)) {
+			lio->lis_pos = io->u.ci_rw.crw_pos;
+		} else {
+			/* append: whole file, lis_io_endpos keeps the sum */
+			LASSERT(io->ci_type == CIT_WRITE);
+			lio->lis_pos = 0;
+			lio->lis_endpos = OBD_OBJECT_EOF;
+		}
+		break;
+
+	case CIT_SETATTR:
+		lio->lis_pos = cl_io_is_trunc(io) ?
+			       io->u.ci_setattr.sa_attr.lvb_size : 0;
+		lio->lis_endpos = OBD_OBJECT_EOF;
+		break;
+
+	case CIT_FAULT: {
+		pgoff_t pgidx = io->u.ci_fault.ft_index;
+
+		/* exactly the faulting page */
+		lio->lis_pos = cl_offset(io->ci_obj, pgidx);
+		lio->lis_endpos = cl_offset(io->ci_obj, pgidx + 1);
+		break;
+	}
+
+	case CIT_FSYNC:
+		lio->lis_pos = io->u.ci_fsync.fi_start;
+		lio->lis_endpos = io->u.ci_fsync.fi_end;
+		break;
+
+	case CIT_MISC:
+		lio->lis_pos = 0;
+		lio->lis_endpos = OBD_OBJECT_EOF;
+		break;
+
+	default:
+		LBUG();
+	}
+
+	EXIT;
+}
+
+/**
+ * cio_fini method of the lov io: finalizes every sub-io, frees the sub-io
+ * array and drops this io from the object's active-io count, waking up
+ * waiters on lo_waitq when the count reaches zero.
+ */
+static void lov_io_fini(const struct lu_env *env, const struct cl_io_slice *ios)
+{
+	struct lov_io *lio = cl2lov_io(env, ios);
+	struct lov_object *lov = cl2lov(ios->cis_obj);
+	int idx;
+
+	ENTRY;
+	if (lio->lis_subs != NULL) {
+		for (idx = 0; idx < lio->lis_nr_subios; idx++)
+			lov_io_sub_fini(env, lio, &lio->lis_subs[idx]);
+
+		OBD_FREE_LARGE(lio->lis_subs,
+			       lio->lis_nr_subios * sizeof lio->lis_subs[0]);
+		lio->lis_nr_subios = 0;
+	}
+
+	LASSERT(atomic_read(&lov->lo_active_ios) > 0);
+	if (atomic_dec_and_test(&lov->lo_active_ios))
+		wake_up_all(&lov->lo_waitq);
+	EXIT;
+}
+
+/**
+ * Returns \a val shifted by \a delta, except that OBD_OBJECT_EOF is
+ * returned unchanged.
+ */
+static obd_off lov_offset_mod(obd_off val, int delta)
+{
+	if (val == OBD_OBJECT_EOF)
+		return val;
+
+	val += delta;
+	return val;
+}
+
+/**
+ * cio_iter_init method: for every stripe intersecting
+ * [lis_pos, lis_endpos), gets the sub-io, loads it with the parameters of
+ * the top-level io, starts its iteration and queues it on lis_active.
+ *
+ * Stops at the first stripe that fails.
+ *
+ * \retval 0 on success, otherwise the first error met
+ */
+static int lov_io_iter_init(const struct lu_env *env,
+			    const struct cl_io_slice *ios)
+{
+	struct lov_io        *lio = cl2lov_io(env, ios);
+	struct lov_stripe_md *lsm = lio->lis_object->lo_lsm;
+	struct lov_io_sub    *sub;
+	obd_off last;
+	obd_off start;
+	obd_off end;
+	int stripe;
+	int rc = 0;
+
+	ENTRY;
+	/* the intersection helper is handed an inclusive end offset */
+	last = lov_offset_mod(lio->lis_endpos, -1);
+	for (stripe = 0; stripe < lio->lis_stripe_count; stripe++) {
+		if (!lov_stripe_intersects(lsm, stripe, lio->lis_pos,
+					   last, &start, &end))
+			continue;
+
+		/* back to an exclusive end for the sub-io */
+		end = lov_offset_mod(end, +1);
+		sub = lov_sub_get(env, lio, stripe);
+		if (IS_ERR(sub)) {
+			rc = PTR_ERR(sub);
+			break;
+		}
+
+		lov_io_sub_inherit(sub->sub_io, lio, stripe, start, end);
+		rc = cl_io_iter_init(sub->sub_env, sub->sub_io);
+		lov_sub_put(sub);
+		CDEBUG(D_VFSTRACE, "shrink: %d ["LPU64", "LPU64")\n",
+		       stripe, start, end);
+		if (rc != 0)
+			break;
+
+		list_add_tail(&sub->sub_linkage, &lio->lis_active);
+	}
+	RETURN(rc);
+}
+
+/**
+ * cio_iter_init method for CIT_READ and CIT_WRITE.
+ *
+ * Unless the io has exactly one sub-io or is an append, the current
+ * iteration is clipped at the end of the stripe-size chunk containing
+ * crw_pos, and io->ci_continue records whether the io extends past that
+ * chunk. The generic lov_io_iter_init() then does the per-stripe setup.
+ */
+static int lov_io_rw_iter_init(const struct lu_env *env,
+			       const struct cl_io_slice *ios)
+{
+	struct lov_io        *lio = cl2lov_io(env, ios);
+	struct cl_io         *io  = ios->cis_io;
+	struct lov_stripe_md *lsm = lio->lis_object->lo_lsm;
+	unsigned long stripe_size = lsm->lsm_stripe_size;
+	loff_t chunk = io->u.ci_rw.crw_pos;
+	loff_t chunk_end;
+
+	LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE);
+	ENTRY;
+
+	/* fast path for common case. */
+	if (lio->lis_nr_subios != 1 && !cl_io_is_append(io)) {
+
+		/* chunk becomes the number of the chunk holding crw_pos */
+		lov_do_div64(chunk, stripe_size);
+		chunk_end = (chunk + 1) * stripe_size;
+		if (chunk_end <= chunk * stripe_size)
+			chunk_end = ~0ull;
+
+		io->ci_continue = chunk_end < lio->lis_io_endpos;
+		io->u.ci_rw.crw_count = min_t(loff_t, lio->lis_io_endpos,
+					      chunk_end) - io->u.ci_rw.crw_pos;
+		lio->lis_pos    = io->u.ci_rw.crw_pos;
+		lio->lis_endpos = io->u.ci_rw.crw_pos + io->u.ci_rw.crw_count;
+		CDEBUG(D_VFSTRACE, "stripe: "LPU64" chunk: ["LPU64", "LPU64") "
+		       LPU64"\n", (__u64)chunk, lio->lis_pos, lio->lis_endpos,
+		       (__u64)lio->lis_io_endpos);
+	}
+	/*
+	 * XXX The following call should be optimized: we know, that
+	 * [lio->lis_pos, lio->lis_endpos) intersects with exactly one stripe.
+	 */
+	RETURN(lov_io_iter_init(env, ios));
+}
+
+/**
+ * Invokes \a iofunc on every sub-io queued on lio->lis_active, in list
+ * order, stopping at the first non-zero return.
+ *
+ * After each successful call the sub-io's ci_result is copied into the
+ * top-level io as long as the latter is still 0.
+ *
+ * \retval 0 when all calls returned 0, otherwise the first non-zero result
+ */
+static int lov_io_call(const struct lu_env *env, struct lov_io *lio,
+		       int (*iofunc)(const struct lu_env *, struct cl_io *))
+{
+	struct cl_io      *top = lio->lis_cl.cis_io;
+	struct lov_io_sub *sub;
+	int rc = 0;
+
+	ENTRY;
+	list_for_each_entry(sub, &lio->lis_active, sub_linkage) {
+		lov_sub_enter(sub);
+		rc = iofunc(sub->sub_env, sub->sub_io);
+		lov_sub_exit(sub);
+		if (rc != 0)
+			break;
+
+		if (top->ci_result == 0)
+			top->ci_result = sub->sub_io->ci_result;
+	}
+	RETURN(rc);
+}
+
+/** cio_lock method: runs cl_io_lock() on every active sub-io. */
+static int lov_io_lock(const struct lu_env *env, const struct cl_io_slice *ios)
+{
+	struct lov_io *lio;
+
+	ENTRY;
+	lio = cl2lov_io(env, ios);
+	RETURN(lov_io_call(env, lio, cl_io_lock));
+}
+
+/** cio_start method: runs cl_io_start() on every active sub-io. */
+static int lov_io_start(const struct lu_env *env, const struct cl_io_slice *ios)
+{
+	struct lov_io *lio;
+
+	ENTRY;
+	lio = cl2lov_io(env, ios);
+	RETURN(lov_io_call(env, lio, cl_io_start));
+}
+
+/**
+ * lov_io_call() adapter ending the sub-io \a io: calls cl_io_end() when
+ * the sub-io is in CIS_IO_GOING state, otherwise just marks it
+ * CIS_IO_FINISHED. Always returns 0.
+ */
+static int lov_io_end_wrapper(const struct lu_env *env, struct cl_io *io)
+{
+	ENTRY;
+	/*
+	 * It's possible that lov_io_start() wasn't called against this
+	 * sub-io, either because previous sub-io failed, or upper layer
+	 * completed IO.
+	 */
+	if (io->ci_state != CIS_IO_GOING)
+		io->ci_state = CIS_IO_FINISHED;
+	else
+		cl_io_end(env, io);
+	RETURN(0);
+}
+
+/**
+ * lov_io_call() adapter around cl_io_iter_fini(), which returns nothing.
+ * Always returns 0.
+ */
+static int lov_io_iter_fini_wrapper(const struct lu_env *env, struct cl_io *io)
+{
+	/* pairs with RETURN() below, as in lov_io_end_wrapper() */
+	ENTRY;
+	cl_io_iter_fini(env, io);
+	RETURN(0);
+}
+
+/**
+ * lov_io_call() adapter around cl_io_unlock(), which returns nothing.
+ * Always returns 0.
+ */
+static int lov_io_unlock_wrapper(const struct lu_env *env, struct cl_io *io)
+{
+	/* pairs with RETURN() below, as in lov_io_end_wrapper() */
+	ENTRY;
+	cl_io_unlock(env, io);
+	RETURN(0);
+}
+
+/** cio_end method: ends every active sub-io via lov_io_end_wrapper(). */
+static void lov_io_end(const struct lu_env *env, const struct cl_io_slice *ios)
+{
+	struct lov_io *lio = cl2lov_io(env, ios);
+	int rc;
+
+	rc = lov_io_call(env, lio, lov_io_end_wrapper);
+	LASSERT(rc == 0);
+}
+
+/**
+ * cio_iter_fini method: finishes the iteration of every active sub-io and
+ * then empties the lis_active list.
+ */
+static void lov_io_iter_fini(const struct lu_env *env,
+			     const struct cl_io_slice *ios)
+{
+	struct lov_io    *lio = cl2lov_io(env, ios);
+	struct list_head *active = &lio->lis_active;
+	int rc;
+
+	ENTRY;
+	rc = lov_io_call(env, lio, lov_io_iter_fini_wrapper);
+	LASSERT(rc == 0);
+
+	/* unlink the entries one by one, always taking the first */
+	while (!list_empty(active))
+		list_del_init(active->next);
+	EXIT;
+}
+
+/** cio_unlock method: runs cl_io_unlock() on every active sub-io. */
+static void lov_io_unlock(const struct lu_env *env,
+			  const struct cl_io_slice *ios)
+{
+	struct lov_io *lio;
+	int rc;
+
+	ENTRY;
+	lio = cl2lov_io(env, ios);
+	rc = lov_io_call(env, lio, lov_io_unlock_wrapper);
+	LASSERT(rc == 0);
+	EXIT;
+}
+
+
+
+/**
+ * Picks the page list used for stripe \a idx during lov_io_submit():
+ * element \a idx of \a qin when \a alloc is non-zero, otherwise the
+ * emrg_page_list of the device's ld_emrg[idx] slot.
+ */
+static struct cl_page_list *lov_io_submit_qin(struct lov_device *ld,
+					      struct cl_page_list *qin,
+					      int idx, int alloc)
+{
+	if (alloc)
+		return &qin[idx];
+
+	return &ld->ld_emrg[idx]->emrg_page_list;
+}
+
+/**
+ * lov implementation of cl_operations::cio_submit() method. It takes a list
+ * of pages in \a queue, splits it into per-stripe sub-lists, invokes
+ * cl_io_submit() on underlying devices to submit sub-lists, and then splices
+ * everything back.
+ *
+ * Major complication of this function is a need to handle memory cleansing:
+ * cl_io_submit() is called to write out pages as a part of VM memory
+ * reclamation, and hence it may not fail due to memory shortages (system
+ * dead-locks otherwise). To deal with this, some resources (sub-lists,
+ * sub-environment, etc.) are allocated per-device on "startup" (i.e., in a
+ * not-memory cleansing context), and in case of memory shortage, these
+ * pre-allocated resources are used by lov_io_submit() under
+ * lov_device::ld_mutex mutex.
+ */
+static int lov_io_submit(const struct lu_env *env,
+			 const struct cl_io_slice *ios,
+			 enum cl_req_type crt, struct cl_2queue *queue)
+{
+	struct lov_io	  *lio = cl2lov_io(env, ios);
+	struct lov_object      *obj = lio->lis_object;
+	struct lov_device       *ld = lu2lov_dev(lov2cl(obj)->co_lu.lo_dev);
+	struct cl_page_list    *qin = &queue->c2_qin;
+	struct cl_2queue      *cl2q = &lov_env_info(env)->lti_cl2q;
+	struct cl_page_list *stripes_qin = NULL;
+	struct cl_page *page;
+	struct cl_page *tmp;
+	int stripe;
+
+	/* per-stripe page list: allocated array or emergency list */
+#define QIN(stripe) lov_io_submit_qin(ld, stripes_qin, stripe, alloc)
+
+	int rc = 0;
+	/* allocation is only attempted when PF_MEMALLOC is not set */
+	int alloc =
+		!(current->flags & PF_MEMALLOC);
+	ENTRY;
+	/*
+	 * Exactly one active sub-io: hand the whole queue to it directly,
+	 * no per-stripe splitting is done.
+	 */
+	if (lio->lis_active_subios == 1) {
+		int idx = lio->lis_single_subio_index;
+		struct lov_io_sub *sub;
+
+		LASSERT(idx < lio->lis_nr_subios);
+		sub = lov_sub_get(env, lio, idx);
+		LASSERT(!IS_ERR(sub));
+		LASSERT(sub->sub_io == &lio->lis_single_subio);
+		rc = cl_io_submit_rw(sub->sub_env, sub->sub_io,
+				     crt, queue);
+		lov_sub_put(sub);
+		RETURN(rc);
+	}
+
+	LASSERT(lio->lis_subs != NULL);
+	if (alloc) {
+		/* one page list per sub-io, freed before returning */
+		OBD_ALLOC_LARGE(stripes_qin,
+				sizeof(*stripes_qin) * lio->lis_nr_subios);
+		if (stripes_qin == NULL)
+			RETURN(-ENOMEM);
+
+		for (stripe = 0; stripe < lio->lis_nr_subios; stripe++)
+			cl_page_list_init(&stripes_qin[stripe]);
+	} else {
+		/*
+		 * If we get here, it means pageout & swap doesn't help.
+		 * In order to not make things worse, even don't try to
+		 * allocate the memory with __GFP_NOWARN. -jay
+		 */
+		/*
+		 * While lis_mem_frozen is set, lov_io_sub_init() borrows
+		 * the ld_emrg[] resources instead of allocating; ld_mutex
+		 * is held until they are released at the end.
+		 */
+		mutex_lock(&ld->ld_mutex);
+		lio->lis_mem_frozen = 1;
+	}
+
+	cl_2queue_init(cl2q);
+	/* distribute the incoming pages over the per-stripe lists */
+	cl_page_list_for_each_safe(page, tmp, qin) {
+		stripe = lov_page_stripe(page);
+		cl_page_list_move(QIN(stripe), qin, page);
+	}
+
+	/* submit every non-empty per-stripe list through its sub-io */
+	for (stripe = 0; stripe < lio->lis_nr_subios; stripe++) {
+		struct lov_io_sub   *sub;
+		struct cl_page_list *sub_qin = QIN(stripe);
+
+		if (list_empty(&sub_qin->pl_pages))
+			continue;
+
+		cl_page_list_splice(sub_qin, &cl2q->c2_qin);
+		sub = lov_sub_get(env, lio, stripe);
+		if (!IS_ERR(sub)) {
+			rc = cl_io_submit_rw(sub->sub_env, sub->sub_io,
+					     crt, cl2q);
+			lov_sub_put(sub);
+		} else
+			rc = PTR_ERR(sub);
+		/* whatever the outcome, move both lists back to the caller */
+		cl_page_list_splice(&cl2q->c2_qin,  &queue->c2_qin);
+		cl_page_list_splice(&cl2q->c2_qout, &queue->c2_qout);
+		if (rc != 0)
+			break;
+	}
+
+	/* return pages of lists left non-empty (e.g. after an early break) */
+	for (stripe = 0; stripe < lio->lis_nr_subios; stripe++) {
+		struct cl_page_list *sub_qin = QIN(stripe);
+
+		if (list_empty(&sub_qin->pl_pages))
+			continue;
+
+		cl_page_list_splice(sub_qin, qin);
+	}
+
+	if (alloc) {
+		OBD_FREE_LARGE(stripes_qin,
+			 sizeof(*stripes_qin) * lio->lis_nr_subios);
+	} else {
+		int i;
+
+		/* finalize the sub-ios that were set up on emergency cl_io */
+		for (i = 0; i < lio->lis_nr_subios; i++) {
+			struct cl_io *cio = lio->lis_subs[i].sub_io;
+
+			if (cio && cio == &ld->ld_emrg[i]->emrg_subio)
+				lov_io_sub_fini(env, lio, &lio->lis_subs[i]);
+		}
+		lio->lis_mem_frozen = 0;
+		mutex_unlock(&ld->ld_mutex);
+	}
+
+	RETURN(rc);
+#undef QIN
+}
+
+/**
+ * cio_prepare_write method: forwards the call for the sub-page of \a slice
+ * to the sub-io of the stripe the page belongs to.
+ *
+ * \retval result of cl_io_prepare_write() on the sub-io, or the error
+ *         carried by the ERR_PTR() from lov_page_subio()
+ */
+static int lov_io_prepare_write(const struct lu_env *env,
+				const struct cl_io_slice *ios,
+				const struct cl_page_slice *slice,
+				unsigned from, unsigned to)
+{
+	struct lov_io     *lio      = cl2lov_io(env, ios);
+	struct cl_page    *sub_page = lov_sub_page(slice);
+	struct lov_io_sub *sub;
+	int result;
+
+	ENTRY;
+	sub = lov_page_subio(env, lio, slice);
+	if (IS_ERR(sub)) {
+		result = PTR_ERR(sub);
+	} else {
+		result = cl_io_prepare_write(sub->sub_env, sub->sub_io,
+					     sub_page, from, to);
+		lov_sub_put(sub);
+	}
+	RETURN(result);
+}
+
+/**
+ * cio_commit_write method: forwards the call for the sub-page of \a slice
+ * to the sub-io of the stripe the page belongs to.
+ *
+ * \retval result of cl_io_commit_write() on the sub-io, or the error
+ *         carried by the ERR_PTR() from lov_page_subio()
+ */
+static int lov_io_commit_write(const struct lu_env *env,
+			       const struct cl_io_slice *ios,
+			       const struct cl_page_slice *slice,
+			       unsigned from, unsigned to)
+{
+	struct lov_io     *lio      = cl2lov_io(env, ios);
+	struct cl_page    *sub_page = lov_sub_page(slice);
+	struct lov_io_sub *sub;
+	int result;
+
+	ENTRY;
+	sub = lov_page_subio(env, lio, slice);
+	if (IS_ERR(sub)) {
+		result = PTR_ERR(sub);
+	} else {
+		result = cl_io_commit_write(sub->sub_env, sub->sub_io,
+					    sub_page, from, to);
+		lov_sub_put(sub);
+	}
+	RETURN(result);
+}
+
+/**
+ * cio_start method for CIT_FAULT: copies ft_nob of the top-level fault io
+ * into the sub-io of the stripe holding the faulting page, then starts the
+ * active sub-ios through lov_io_start().
+ *
+ * \retval result of lov_io_start()
+ * \retval negative error code when the sub-io cannot be obtained
+ */
+static int lov_io_fault_start(const struct lu_env *env,
+			      const struct cl_io_slice *ios)
+{
+	struct cl_fault_io *fio;
+	struct lov_io      *lio;
+	struct lov_io_sub  *sub;
+
+	ENTRY;
+	fio = &ios->cis_io->u.ci_fault;
+	lio = cl2lov_io(env, ios);
+	sub = lov_sub_get(env, lio, lov_page_stripe(fio->ft_page));
+	/* lov_sub_get() hands back ERR_PTR() when sub-io setup fails */
+	if (IS_ERR(sub))
+		RETURN(PTR_ERR(sub));
+	sub->sub_io->u.ci_fault.ft_nob = fio->ft_nob;
+	lov_sub_put(sub);
+	RETURN(lov_io_start(env, ios));
+}
+
+/**
+ * cio_end method for CIT_FSYNC: ends every active sub-io and sums into the
+ * top-level fi_nr_written the fi_nr_written of each sub-io whose ci_result
+ * is 0.
+ */
+static void lov_io_fsync_end(const struct lu_env *env,
+			     const struct cl_io_slice *ios)
+{
+	struct lov_io     *lio = cl2lov_io(env, ios);
+	struct cl_io      *top = ios->cis_io;
+	struct lov_io_sub *sub;
+	ENTRY;
+
+	top->u.ci_fsync.fi_nr_written = 0;
+	list_for_each_entry(sub, &lio->lis_active, sub_linkage) {
+		struct cl_io *subio = sub->sub_io;
+
+		lov_sub_enter(sub);
+		lov_io_end_wrapper(sub->sub_env, subio);
+		lov_sub_exit(sub);
+
+		if (subio->ci_result != 0)
+			continue;
+		top->u.ci_fsync.fi_nr_written += subio->u.ci_fsync.fi_nr_written;
+	}
+	RETURN_EXIT;
+}
+
+/**
+ * An io operation vector for striped files; installed by
+ * lov_io_init_raid0().
+ *
+ * All io types share the same fini/lock/unlock methods. The exceptions
+ * visible below: CIT_READ and CIT_WRITE use lov_io_rw_iter_init(),
+ * CIT_FAULT starts through lov_io_fault_start(), CIT_FSYNC ends through
+ * lov_io_fsync_end(), and CIT_MISC only provides cio_fini.
+ */
+static const struct cl_io_operations lov_io_ops = {
+	.op = {
+		[CIT_READ] = {
+			.cio_fini      = lov_io_fini,
+			.cio_iter_init = lov_io_rw_iter_init,
+			.cio_iter_fini = lov_io_iter_fini,
+			.cio_lock      = lov_io_lock,
+			.cio_unlock    = lov_io_unlock,
+			.cio_start     = lov_io_start,
+			.cio_end       = lov_io_end
+		},
+		[CIT_WRITE] = {
+			.cio_fini      = lov_io_fini,
+			.cio_iter_init = lov_io_rw_iter_init,
+			.cio_iter_fini = lov_io_iter_fini,
+			.cio_lock      = lov_io_lock,
+			.cio_unlock    = lov_io_unlock,
+			.cio_start     = lov_io_start,
+			.cio_end       = lov_io_end
+		},
+		[CIT_SETATTR] = {
+			.cio_fini      = lov_io_fini,
+			.cio_iter_init = lov_io_iter_init,
+			.cio_iter_fini = lov_io_iter_fini,
+			.cio_lock      = lov_io_lock,
+			.cio_unlock    = lov_io_unlock,
+			.cio_start     = lov_io_start,
+			.cio_end       = lov_io_end
+		},
+		[CIT_FAULT] = {
+			.cio_fini      = lov_io_fini,
+			.cio_iter_init = lov_io_iter_init,
+			.cio_iter_fini = lov_io_iter_fini,
+			.cio_lock      = lov_io_lock,
+			.cio_unlock    = lov_io_unlock,
+			.cio_start     = lov_io_fault_start,
+			.cio_end       = lov_io_end
+		},
+		[CIT_FSYNC] = {
+			.cio_fini      = lov_io_fini,
+			.cio_iter_init = lov_io_iter_init,
+			.cio_iter_fini = lov_io_iter_fini,
+			.cio_lock      = lov_io_lock,
+			.cio_unlock    = lov_io_unlock,
+			.cio_start     = lov_io_start,
+			.cio_end       = lov_io_fsync_end
+		},
+		[CIT_MISC] = {
+			.cio_fini   = lov_io_fini
+		}
+	},
+	/* page submission is the same method for reads and writes */
+	.req_op = {
+		 [CRT_READ] = {
+			 .cio_submit    = lov_io_submit
+		 },
+		 [CRT_WRITE] = {
+			 .cio_submit    = lov_io_submit
+		 }
+	 },
+	.cio_prepare_write = lov_io_prepare_write,
+	.cio_commit_write  = lov_io_commit_write
+};
+
+/*****************************************************************************
+ *
+ * Empty lov io operations.
+ *
+ */
+
+/**
+ * cio_fini method for files without stripes: drops this io from the
+ * object's active-io count and wakes up waiters on lo_waitq when the count
+ * reaches zero.
+ */
+static void lov_empty_io_fini(const struct lu_env *env,
+			      const struct cl_io_slice *ios)
+{
+	struct lov_object *lov = cl2lov(ios->cis_obj);
+	atomic_t *active = &lov->lo_active_ios;
+	ENTRY;
+
+	if (atomic_dec_and_test(active))
+		wake_up_all(&lov->lo_waitq);
+	EXIT;
+}
+
+/**
+ * Placeholder method that hits LBUG() as soon as it is called. It is
+ * installed, through LOV_EMPTY_IMPOSSIBLE, in the slots of
+ * lov_empty_io_ops.
+ */
+static void lov_empty_impossible(const struct lu_env *env,
+				 struct cl_io_slice *ios)
+{
+	LBUG();
+}
+
+#define LOV_EMPTY_IMPOSSIBLE ((void *)lov_empty_impossible)
+
+/**
+ * An io operation vector for files without stripes.
+ */
+static const struct cl_io_operations lov_empty_io_ops = {
+	.op = {
+		[CIT_READ] = {
+			.cio_fini       = lov_empty_io_fini,
+			/* compiled out: only cio_fini is set for CIT_READ */
+#if 0
+			.cio_iter_init  = LOV_EMPTY_IMPOSSIBLE,
+			.cio_lock       = LOV_EMPTY_IMPOSSIBLE,
+			.cio_start      = LOV_EMPTY_IMPOSSIBLE,
+			.cio_end	= LOV_EMPTY_IMPOSSIBLE
+#endif
+		},
+		[CIT_WRITE] = {
+			.cio_fini      = lov_empty_io_fini,
+			.cio_iter_init = LOV_EMPTY_IMPOSSIBLE,
+			.cio_lock      = LOV_EMPTY_IMPOSSIBLE,
+			.cio_start     = LOV_EMPTY_IMPOSSIBLE,
+			.cio_end       = LOV_EMPTY_IMPOSSIBLE
+		},
+		[CIT_SETATTR] = {
+			.cio_fini      = lov_empty_io_fini,
+			.cio_iter_init = LOV_EMPTY_IMPOSSIBLE,
+			.cio_lock      = LOV_EMPTY_IMPOSSIBLE,
+			.cio_start     = LOV_EMPTY_IMPOSSIBLE,
+			.cio_end       = LOV_EMPTY_IMPOSSIBLE
+		},
+		[CIT_FAULT] = {
+			.cio_fini      = lov_empty_io_fini,
+			.cio_iter_init = LOV_EMPTY_IMPOSSIBLE,
+			.cio_lock      = LOV_EMPTY_IMPOSSIBLE,
+			.cio_start     = LOV_EMPTY_IMPOSSIBLE,
+			.cio_end       = LOV_EMPTY_IMPOSSIBLE
+		},
+		[CIT_FSYNC] = {
+			.cio_fini   = lov_empty_io_fini
+		},
+		[CIT_MISC] = {
+			.cio_fini   = lov_empty_io_fini
+		}
+	},
+	/* submitting pages through this vector hits LBUG() */
+	.req_op = {
+		 [CRT_READ] = {
+			 .cio_submit    = LOV_EMPTY_IMPOSSIBLE
+		 },
+		 [CRT_WRITE] = {
+			 .cio_submit    = LOV_EMPTY_IMPOSSIBLE
+		 }
+	 },
+	.cio_commit_write = LOV_EMPTY_IMPOSSIBLE
+};
+
+/**
+ * Initializes the lov slice of \a io for the striped object \a obj: sets
+ * up the io extent and the sub-io array and, when both succeed, adds the
+ * slice with lov_io_ops and counts the io as active on the object.
+ *
+ * \retval io->ci_result, 0 on success
+ */
+int lov_io_init_raid0(const struct lu_env *env, struct cl_object *obj,
+		      struct cl_io *io)
+{
+	struct lov_io       *lio = lov_env_io(env);
+	struct lov_object   *lov = cl2lov(obj);
+
+	ENTRY;
+	INIT_LIST_HEAD(&lio->lis_active);
+	lov_io_slice_init(lio, lov, io);
+
+	if (io->ci_result == 0)
+		io->ci_result = lov_io_subio_init(env, lio, io);
+
+	if (io->ci_result == 0) {
+		cl_io_slice_add(io, &lio->lis_cl, obj, &lov_io_ops);
+		atomic_inc(&lov->lo_active_ios);
+	}
+	RETURN(io->ci_result);
+}
+
+/**
+ * Initializes the lov slice of \a io for an object without stripes.
+ *
+ * CIT_MISC and CIT_READ add the slice with lov_empty_io_ops and count the
+ * io as active; CIT_FSYNC and CIT_SETATTR add nothing and report no error;
+ * CIT_WRITE fails with -EBADF and CIT_FAULT with -EFAULT. Any other io
+ * type hits LBUG().
+ *
+ * io->ci_result is set to the error, or to 0 when there is none.
+ *
+ * \retval 0 when the slice was added
+ * \retval 1 otherwise
+ */
+int lov_io_init_empty(const struct lu_env *env, struct cl_object *obj,
+		      struct cl_io *io)
+{
+	struct lov_object *lov = cl2lov(obj);
+	struct lov_io *lio = lov_env_io(env);
+	int result;
+	ENTRY;
+
+	lio->lis_object = lov;
+	switch (io->ci_type) {
+	case CIT_WRITE:
+		result = -EBADF;
+		break;
+	case CIT_FAULT:
+		result = -EFAULT;
+		CERROR("Page fault on a file without stripes: "DFID"\n",
+		       PFID(lu_object_fid(&obj->co_lu)));
+		break;
+	case CIT_FSYNC:
+	case CIT_SETATTR:
+		result = 1;
+		break;
+	default:
+		LBUG();
+		/* deliberately shares the labels below, as LBUG() precedes them */
+	case CIT_MISC:
+	case CIT_READ:
+		result = 0;
+		break;
+	}
+
+	if (result == 0) {
+		cl_io_slice_add(io, &lio->lis_cl, obj, &lov_empty_io_ops);
+		atomic_inc(&lov->lo_active_ios);
+	}
+
+	io->ci_result = result < 0 ? result : 0;
+	RETURN(result != 0);
+}
+
+/** @} lov */

diff --git a/drivers/staging/lustre/lustre/lov/lov_lock.c b/drivers/staging/lustre/lustre/lov/lov_lock.c
new file mode 100644
index 0000000..bdf3334
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lov/lov_lock.c

@@ -0,0 +1,1253 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_lock for LOV layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include "lov_cl_internal.h"
+
+/** \addtogroup lov
+ *  @{
+ */
+
+/*
+ * Forward declarations, so that code appearing before the definitions of
+ * these functions can refer to them.
+ */
+static struct cl_lock_closure *lov_closure_get(const struct lu_env *env,
+					       struct cl_lock *parent);
+
+static int lov_lock_unuse(const struct lu_env *env,
+			  const struct cl_lock_slice *slice);
+/*****************************************************************************
+ *
+ * Lov lock operations.
+ *
+ */
+
+/**
+ * Picks the env/io pair to be used for operations on the sub-lock described
+ * by \a lls, and stores it in the session's lov_sublock_env.
+ *
+ * \retval the session's lov_sublock_env on success; release it with
+ *	   lov_sublock_env_put()
+ * \retval the error pointer returned by lov_sub_get() on failure
+ */
+static struct lov_sublock_env *lov_sublock_env_get(const struct lu_env *env,
+						   struct cl_lock *parent,
+						   struct lov_lock_sub *lls)
+{
+	struct lov_io          *lio    = lov_env_io(env);
+	struct cl_io           *io     = lio->lis_cl.cis_io;
+	struct lov_sublock_env *subenv = &lov_env_session(env)->ls_subenv;
+	struct lov_io_sub      *sub;
+
+	/*
+	 * FIXME: sub-lock operations are normally called with the sub-io's
+	 * env and io, because the osc lock sometimes keeps control variables
+	 * in the thread's IO information (currently only lockless
+	 * information). When the lock's host object differs from the object
+	 * of the current IO, the sub-env and sub-io were never initialized,
+	 * so as a temporary fix the parent's env is borrowed instead.
+	 */
+	if (io == NULL ||
+	    !cl_object_same(io->ci_obj, parent->cll_descr.cld_obj)) {
+		subenv->lse_env = env;
+		subenv->lse_io  = io;
+		subenv->lse_sub = NULL;
+		return subenv;
+	}
+
+	sub = lov_sub_get(env, lio, lls->sub_stripe);
+	if (IS_ERR(sub))
+		return (void *)sub;
+
+	subenv->lse_env = sub->sub_env;
+	subenv->lse_io  = sub->sub_io;
+	subenv->lse_sub = sub;
+	return subenv;
+}
+
+/**
+ * Counterpart of lov_sublock_env_get(): drops the sub-io recorded in
+ * \a subenv, if there is one. A NULL \a subenv is accepted and ignored.
+ */
+static void lov_sublock_env_put(struct lov_sublock_env *subenv)
+{
+	if (subenv == NULL)
+		return;
+
+	if (subenv->lse_sub != NULL)
+		lov_sub_put(subenv->lse_sub);
+}
+
+/**
+ * Attaches \a sublock to slot \a idx of the top-lock \a lck.
+ *
+ * The caller must hold the mutexes of both the top-lock and \a sublock
+ * (asserted). The sub-lock is recorded in lck->lls_sub[idx], \a link is
+ * queued on the sub-lock's lss_parents list, a reference is taken on the
+ * top-lock, the slot is marked LSF_HELD and a user is added to \a sublock.
+ */
+static void lov_sublock_adopt(const struct lu_env *env, struct lov_lock *lck,
+			      struct cl_lock *sublock, int idx,
+			      struct lov_lock_link *link)
+{
+	struct lovsub_lock *lsl;
+	struct cl_lock     *parent = lck->lls_cl.cls_lock;
+	int		 rc;
+
+	LASSERT(cl_lock_is_mutexed(parent));
+	LASSERT(cl_lock_is_mutexed(sublock));
+	ENTRY;
+
+	lsl = cl2sub_lock(sublock);
+	/*
+	 * check that sub-lock doesn't have lock link to this top-lock.
+	 */
+	LASSERT(lov_lock_link_find(env, lck, lsl) == NULL);
+	LASSERT(idx < lck->lls_nr);
+
+	lck->lls_sub[idx].sub_lock = lsl;
+	lck->lls_nr_filled++;
+	LASSERT(lck->lls_nr_filled <= lck->lls_nr);
+	/* link the sub-lock back to this top-lock */
+	list_add_tail(&link->lll_list, &lsl->lss_parents);
+	link->lll_idx = idx;
+	link->lll_super = lck;
+	/* this reference is dropped again in lov_lock_unlink() */
+	cl_lock_get(parent);
+	lu_ref_add(&parent->cll_reference, "lov-child", sublock);
+	lck->lls_sub[idx].sub_flags |= LSF_HELD;
+	cl_lock_user_add(env, sublock);
+
+	rc = lov_sublock_modify(env, lck, lsl, &sublock->cll_descr, idx);
+	LASSERT(rc == 0); /* there is no way this can fail, currently */
+	EXIT;
+}
+
+/**
+ * Allocates a lov_lock_link and takes a hold on the sub-lock for slot \a idx
+ * of \a lck, using the slot's sub_got descriptor.
+ *
+ * \param out  receives the allocated link when a sub-lock is returned; it is
+ *	       left untouched on failure
+ *
+ * \retval the held sub-lock on success
+ * \retval ERR_PTR(-ENOMEM) if the link cannot be allocated
+ * \retval another error pointer from lov_sublock_env_get() or cl_lock_hold()
+ */
+static struct cl_lock *lov_sublock_alloc(const struct lu_env *env,
+					 const struct cl_io *io,
+					 struct lov_lock *lck,
+					 int idx, struct lov_lock_link **out)
+{
+	struct lov_sublock_env *subenv;
+	struct lov_lock_link   *link;
+	struct lov_lock_sub    *lls;
+	struct cl_lock         *parent;
+	struct cl_lock         *sublock;
+
+	LASSERT(idx < lck->lls_nr);
+	ENTRY;
+
+	OBD_SLAB_ALLOC_PTR_GFP(link, lov_lock_link_kmem, __GFP_IO);
+	if (link == NULL) {
+		sublock = ERR_PTR(-ENOMEM);
+		RETURN(sublock);
+	}
+
+	parent = lck->lls_cl.cls_lock;
+	lls    = &lck->lls_sub[idx];
+
+	subenv = lov_sublock_env_get(env, parent, lls);
+	if (IS_ERR(subenv)) {
+		/* pass the error on as the function result */
+		sublock = (void *)subenv;
+	} else {
+		/*
+		 * CAVEAT: do not add a field to lov_lock_sub to remember the
+		 * sub-io. Locks can be cached while IOs cannot, so a sub-lock
+		 * may be referenced from different io contexts. -jay
+		 */
+		sublock = cl_lock_hold(subenv->lse_env, subenv->lse_io,
+				       &lls->sub_got, "lov-parent", parent);
+		lov_sublock_env_put(subenv);
+	}
+
+	if (IS_ERR(sublock)) {
+		OBD_SLAB_FREE_PTR(link, lov_lock_link_kmem);
+	} else {
+		*out = link;
+	}
+	RETURN(sublock);
+}
+
+/**
+ * Undoes lov_sublock_lock(): releases \a subenv (which may be NULL), clears
+ * the sub-lock's lss_active pointer and dissolves \a closure.
+ */
+static void lov_sublock_unlock(const struct lu_env *env,
+			       struct lovsub_lock *lsl,
+			       struct cl_lock_closure *closure,
+			       struct lov_sublock_env *subenv)
+{
+	ENTRY;
+	lov_sublock_env_put(subenv);
+	lsl->lss_active = NULL;
+	cl_lock_disclosure(env, closure);
+	EXIT;
+}
+
+/**
+ * Builds \a closure around the sub-lock of \a lls and marks the closure's
+ * origin lock as the sub-lock's active parent.
+ *
+ * \param lsep  if not NULL, receives the sub-env to use for operations on the
+ *		sub-lock; it is only set when 0 is returned
+ *
+ * \retval 0 on success; the caller undoes this with lov_sublock_unlock()
+ * \retval CLO_REPEAT if the sub-lock is being freed or was cancelled; it has
+ *	   then been unlinked from \a lck, lck->lls_cancel_race is set and
+ *	   the closure is already released
+ * \retval other non-zero value from cl_lock_closure_build(), or the error
+ *	   from lov_sublock_env_get() (closure already released)
+ */
+static int lov_sublock_lock(const struct lu_env *env,
+			    struct lov_lock *lck,
+			    struct lov_lock_sub *lls,
+			    struct cl_lock_closure *closure,
+			    struct lov_sublock_env **lsep)
+{
+	struct lovsub_lock *sublock;
+	struct cl_lock     *child;
+	int		 result = 0;
+	ENTRY;
+
+	LASSERT(list_empty(&closure->clc_list));
+
+	sublock = lls->sub_lock;
+	child = sublock->lss_cl.cls_lock;
+	result = cl_lock_closure_build(env, child, closure);
+	if (result == 0) {
+		struct cl_lock *parent = closure->clc_origin;
+
+		LASSERT(cl_lock_is_mutexed(child));
+		sublock->lss_active = parent;
+
+		if (unlikely((child->cll_state == CLS_FREEING) ||
+			     (child->cll_flags & CLF_CANCELLED))) {
+			struct lov_lock_link *link;
+			/*
+			 * we could race with lock deletion which temporarily
+			 * put the lock in freeing state, bug 19080.
+			 */
+			LASSERT(!(lls->sub_flags & LSF_HELD));
+
+			link = lov_lock_link_find(env, lck, sublock);
+			LASSERT(link != NULL);
+			lov_lock_unlink(env, link, sublock);
+			lov_sublock_unlock(env, sublock, closure, NULL);
+			/* remembered so that use/unuse can report -ESTALE */
+			lck->lls_cancel_race = 1;
+			result = CLO_REPEAT;
+		} else if (lsep) {
+			struct lov_sublock_env *subenv;
+			subenv = lov_sublock_env_get(env, parent, lls);
+			if (IS_ERR(subenv)) {
+				lov_sublock_unlock(env, sublock,
+						   closure, NULL);
+				result = PTR_ERR(subenv);
+			} else {
+				*lsep = subenv;
+			}
+		}
+	}
+	RETURN(result);
+}
+
+/**
+ * Updates the result of a top-lock operation from a result of sub-lock
+ * sub-operations. Top-operations like lov_lock_{enqueue,use,unuse}() iterate
+ * over sub-locks and lov_subresult() is used to calculate return value of a
+ * top-operation. To this end, possible return values of sub-operations are
+ * ordered as
+ *
+ *     - 0           success
+ *     - CLO_WAIT    wait for event
+ *     - CLO_REPEAT  repeat top-operation
+ *     - -ve         fundamental error
+ *
+ * Top-level return code can only go down through this list. CLO_REPEAT
+ * overwrites CLO_WAIT, because lock mutex was released and sleeping condition
+ * has to be rechecked by the upper layer.
+ *
+ * \param result  result accumulated so far
+ * \param rc	  result of the latest sub-operation
+ *
+ * \retval \a rc if it ranks strictly lower in the list above than \a result
+ * \retval \a result otherwise (in particular, the first error is kept)
+ */
+static int lov_subresult(int result, int rc)
+{
+	int result_rank;
+	int rc_rank;
+
+	ENTRY;
+
+	/* both messages end with a newline, like other log output here */
+	LASSERTF(result <= 0 || result == CLO_REPEAT || result == CLO_WAIT,
+		 "result = %d\n", result);
+	LASSERTF(rc <= 0 || rc == CLO_REPEAT || rc == CLO_WAIT,
+		 "rc = %d\n", rc);
+	CLASSERT(CLO_WAIT < CLO_REPEAT);
+
+	/* calculate ranks in the ordering above; any error ranks last */
+	result_rank = result < 0 ? 1 + CLO_REPEAT : result;
+	rc_rank = rc < 0 ? 1 + CLO_REPEAT : rc;
+
+	if (result_rank < rc_rank)
+		result = rc;
+	RETURN(result);
+}
+
+/**
+ * Creates sub-locks for a given lov_lock for the first time.
+ *
+ * Goes through all sub-objects of top-object, and creates sub-locks on every
+ * sub-object intersecting with top-lock extent. This is complicated by the
+ * fact that top-lock (that is being created) can be accessed concurrently
+ * through already created sub-locks (possibly shared with other top-locks).
+ *
+ * \retval 0 on success
+ * \retval -ENOMEM if the lls_sub array cannot be allocated
+ * \retval error from lov_sublock_alloc() for the first sub-lock that failed
+ */
+static int lov_lock_sub_init(const struct lu_env *env,
+			     struct lov_lock *lck, const struct cl_io *io)
+{
+	int result = 0;
+	int i;
+	int nr;
+	obd_off start;
+	obd_off end;
+	obd_off file_start;
+	obd_off file_end;
+
+	struct lov_object       *loo    = cl2lov(lck->lls_cl.cls_obj);
+	struct lov_layout_raid0 *r0     = lov_r0(loo);
+	struct cl_lock	  *parent = lck->lls_cl.cls_lock;
+
+	ENTRY;
+
+	/* remember the original extent; lov_lock_fits_into() matches on it */
+	lck->lls_orig = parent->cll_descr;
+	file_start = cl_offset(lov2cl(loo), parent->cll_descr.cld_start);
+	file_end   = cl_offset(lov2cl(loo), parent->cll_descr.cld_end + 1) - 1;
+
+	/* first pass: count the stripes intersecting the lock extent */
+	for (i = 0, nr = 0; i < r0->lo_nr; i++) {
+		/*
+		 * XXX for wide striping smarter algorithm is desirable,
+		 * breaking out of the loop, early.
+		 */
+		if (lov_stripe_intersects(loo->lo_lsm, i,
+					  file_start, file_end, &start, &end))
+			nr++;
+	}
+	LASSERT(nr > 0);
+	OBD_ALLOC_LARGE(lck->lls_sub, nr * sizeof lck->lls_sub[0]);
+	if (lck->lls_sub == NULL)
+		RETURN(-ENOMEM);
+
+	lck->lls_nr = nr;
+	/*
+	 * First, fill in sub-lock descriptions in
+	 * lck->lls_sub[].sub_descr. They are used by lov_sublock_alloc()
+	 * (called below in this function, and by lov_lock_enqueue()) to
+	 * create sub-locks. At this moment, no other thread can access
+	 * top-lock.
+	 */
+	for (i = 0, nr = 0; i < r0->lo_nr; ++i) {
+		if (lov_stripe_intersects(loo->lo_lsm, i,
+					  file_start, file_end, &start, &end)) {
+			struct cl_lock_descr *descr;
+
+			descr = &lck->lls_sub[nr].sub_descr;
+
+			LASSERT(descr->cld_obj == NULL);
+			descr->cld_obj   = lovsub2cl(r0->lo_sub[i]);
+			descr->cld_start = cl_index(descr->cld_obj, start);
+			descr->cld_end   = cl_index(descr->cld_obj, end);
+			descr->cld_mode  = parent->cll_descr.cld_mode;
+			descr->cld_gid   = parent->cll_descr.cld_gid;
+			descr->cld_enq_flags   = parent->cll_descr.cld_enq_flags;
+			/* XXX has no effect */
+			lck->lls_sub[nr].sub_got = *descr;
+			lck->lls_sub[nr].sub_stripe = i;
+			nr++;
+		}
+	}
+	LASSERT(nr == lck->lls_nr);
+	/*
+	 * Then, create sub-locks. Once at least one sub-lock was created,
+	 * top-lock can be reached by other threads.
+	 */
+	for (i = 0; i < lck->lls_nr; ++i) {
+		struct cl_lock       *sublock;
+		struct lov_lock_link *link;
+
+		if (lck->lls_sub[i].sub_lock == NULL) {
+			sublock = lov_sublock_alloc(env, io, lck, i, &link);
+			if (IS_ERR(sublock)) {
+				result = PTR_ERR(sublock);
+				break;
+			}
+			/* mutexes are taken sub-lock first, then parent */
+			cl_lock_get_trust(sublock);
+			cl_lock_mutex_get(env, sublock);
+			cl_lock_mutex_get(env, parent);
+			/*
+			 * recheck under mutex that sub-lock wasn't created
+			 * concurrently, and that top-lock is still alive.
+			 */
+			if (lck->lls_sub[i].sub_lock == NULL &&
+			    parent->cll_state < CLS_FREEING) {
+				lov_sublock_adopt(env, lck, sublock, i, link);
+				cl_lock_mutex_put(env, parent);
+			} else {
+				/* lost the race: drop the link and the hold */
+				OBD_SLAB_FREE_PTR(link, lov_lock_link_kmem);
+				cl_lock_mutex_put(env, parent);
+				cl_lock_unhold(env, sublock,
+					       "lov-parent", parent);
+			}
+			cl_lock_mutex_put(env, sublock);
+			cl_lock_put(env, sublock);
+		}
+	}
+	/*
+	 * Some sub-locks can be missing at this point. This is not a problem,
+	 * because enqueue will create them anyway. Main duty of this function
+	 * is to fill in sub-lock descriptions in a race free manner.
+	 */
+	RETURN(result);
+}
+
+/**
+ * Drops the hold that top-lock \a lck has on its sub-lock in slot \a i, if
+ * the slot is marked LSF_HELD. The top-lock mutex must be held (asserted).
+ *
+ * \param deluser  if non-zero, also remove the user added to the sub-lock
+ * \param rc	   result so far; returned unchanged unless the parent mutex
+ *		   had to be dropped
+ *
+ * \retval \a rc, merged with CLO_REPEAT through lov_subresult() when the
+ *	   parent mutex was released and re-taken around cl_lock_unhold()
+ */
+static int lov_sublock_release(const struct lu_env *env, struct lov_lock *lck,
+			       int i, int deluser, int rc)
+{
+	struct cl_lock *parent = lck->lls_cl.cls_lock;
+
+	LASSERT(cl_lock_is_mutexed(parent));
+	ENTRY;
+
+	if (lck->lls_sub[i].sub_flags & LSF_HELD) {
+		struct cl_lock    *sublock;
+		int dying;
+
+		LASSERT(lck->lls_sub[i].sub_lock != NULL);
+		sublock = lck->lls_sub[i].sub_lock->lss_cl.cls_lock;
+		LASSERT(cl_lock_is_mutexed(sublock));
+
+		lck->lls_sub[i].sub_flags &= ~LSF_HELD;
+		if (deluser)
+			cl_lock_user_del(env, sublock);
+		/*
+		 * If the last hold is released, and cancellation is pending
+		 * for a sub-lock, release parent mutex, to avoid keeping it
+		 * while sub-lock is being paged out.
+		 */
+		dying = (sublock->cll_descr.cld_mode == CLM_PHANTOM ||
+			 sublock->cll_descr.cld_mode == CLM_GROUP ||
+			 (sublock->cll_flags & (CLF_CANCELPEND|CLF_DOOMED))) &&
+			sublock->cll_holds == 1;
+		if (dying)
+			cl_lock_mutex_put(env, parent);
+		cl_lock_unhold(env, sublock, "lov-parent", parent);
+		if (dying) {
+			cl_lock_mutex_get(env, parent);
+			/* parent mutex was dropped: caller has to recheck */
+			rc = lov_subresult(rc, CLO_REPEAT);
+		}
+		/*
+		 * From now on lck->lls_sub[i].sub_lock is a "weak" pointer,
+		 * not backed by a reference on a
+		 * sub-lock. lovsub_lock_delete() will clear
+		 * lck->lls_sub[i].sub_lock under semaphores, just before
+		 * sub-lock is destroyed.
+		 */
+	}
+	RETURN(rc);
+}
+
+/**
+ * Makes top-lock \a lck hold its sub-lock in slot \a i, unless the slot is
+ * already marked LSF_HELD. Both the top-lock and the sub-lock must be
+ * mutexed (asserted), and the sub-lock must not be in CLS_FREEING state.
+ */
+static void lov_sublock_hold(const struct lu_env *env, struct lov_lock *lck,
+			     int i)
+{
+	struct cl_lock *parent = lck->lls_cl.cls_lock;
+
+	LASSERT(cl_lock_is_mutexed(parent));
+	ENTRY;
+
+	if (!(lck->lls_sub[i].sub_flags & LSF_HELD)) {
+		struct cl_lock *sublock;
+
+		LASSERT(lck->lls_sub[i].sub_lock != NULL);
+		sublock = lck->lls_sub[i].sub_lock->lss_cl.cls_lock;
+		LASSERT(cl_lock_is_mutexed(sublock));
+		LASSERT(sublock->cll_state != CLS_FREEING);
+
+		lck->lls_sub[i].sub_flags |= LSF_HELD;
+
+		/* temporary reference around adding the hold and the user */
+		cl_lock_get_trust(sublock);
+		cl_lock_hold_add(env, sublock, "lov-parent", parent);
+		cl_lock_user_add(env, sublock);
+		cl_lock_put(env, sublock);
+	}
+	EXIT;
+}
+
+/**
+ * Frees the lov layer of a lock: the lls_sub array (if it was allocated) and
+ * the lov_lock itself. By now all slots must be empty (asserted).
+ */
+static void lov_lock_fini(const struct lu_env *env,
+			  struct cl_lock_slice *slice)
+{
+	struct lov_lock *lck;
+	int idx;
+
+	ENTRY;
+	lck = cl2lov_lock(slice);
+	LASSERT(lck->lls_nr_filled == 0);
+	if (lck->lls_sub != NULL) {
+		/*
+		 * A sub-lock holds a reference on its parent, so none can
+		 * still be attached at this point.
+		 */
+		for (idx = 0; idx < lck->lls_nr; idx++) {
+			LASSERT(lck->lls_sub[idx].sub_lock == NULL);
+		}
+		OBD_FREE_LARGE(lck->lls_sub,
+			       lck->lls_nr * sizeof(lck->lls_sub[0]));
+	}
+	OBD_SLAB_FREE_PTR(lck, lov_lock_kmem);
+	EXIT;
+}
+
+/**
+ * Waits on \a sublock with the top-lock mutex released, then re-takes the
+ * top-lock mutex.
+ *
+ * \retval CLO_REPEAT if the wait returned 0
+ * \retval the non-zero result of cl_lock_enqueue_wait() otherwise
+ */
+static int lov_lock_enqueue_wait(const struct lu_env *env,
+				 struct lov_lock *lck,
+				 struct cl_lock *sublock)
+{
+	struct cl_lock *parent = lck->lls_cl.cls_lock;
+	int rc;
+	ENTRY;
+
+	LASSERT(cl_lock_is_mutexed(parent));
+
+	cl_lock_mutex_put(env, parent);
+	rc = cl_lock_enqueue_wait(env, sublock, 0);
+	cl_lock_mutex_get(env, parent);
+
+	/* the mutex was dropped, so the caller must start over */
+	if (rc == 0)
+		rc = CLO_REPEAT;
+	RETURN(rc);
+}
+
+/**
+ * Tries to advance a state machine of a given sub-lock toward enqueuing of
+ * the top-lock.
+ *
+ * \param last  non-zero if \a sublock is the last sub-lock of the top-lock
+ *
+ * \retval 0 if state-transition can proceed
+ * \retval non-zero otherwise: the result of cl_enqueue_try() or
+ *	   cl_wait_try(), with CLO_REENQUEUED reported as CLO_WAIT
+ */
+static int lov_lock_enqueue_one(const struct lu_env *env, struct lov_lock *lck,
+				struct cl_lock *sublock,
+				struct cl_io *io, __u32 enqflags, int last)
+{
+	int result;
+	ENTRY;
+
+	/* first, try to enqueue a sub-lock ... */
+	result = cl_enqueue_try(env, sublock, io, enqflags);
+	if ((sublock->cll_state == CLS_ENQUEUED) && !(enqflags & CEF_AGL)) {
+		/* if it is enqueued, try to `wait' on it---maybe it's already
+		 * granted */
+		result = cl_wait_try(env, sublock);
+		if (result == CLO_REENQUEUED)
+			result = CLO_WAIT;
+	}
+	/*
+	 * If CEF_ASYNC flag is set, then all sub-locks can be enqueued in
+	 * parallel, otherwise---enqueue has to wait until sub-lock is granted
+	 * before proceeding to the next one.
+	 */
+	if ((result == CLO_WAIT) && (sublock->cll_state <= CLS_HELD) &&
+	    (enqflags & CEF_ASYNC) && (!last || (enqflags & CEF_AGL)))
+		result = 0;
+	RETURN(result);
+}
+
+/**
+ * Helper function for lov_lock_enqueue() that creates missing sub-lock.
+ *
+ * The mutex of \a parent is released while the sub-lock is allocated and is
+ * held again on return. The new sub-lock is adopted only if \a parent is
+ * still in CLS_QUEUING state and slot \a idx is still empty; otherwise the
+ * new hold and link are dropped again.
+ *
+ * \retval CLO_REPEAT if a sub-lock was obtained (whether adopted or not)
+ * \retval error from lov_sublock_alloc() otherwise
+ */
+static int lov_sublock_fill(const struct lu_env *env, struct cl_lock *parent,
+			    struct cl_io *io, struct lov_lock *lck, int idx)
+{
+	struct lov_lock_link *link;
+	struct cl_lock       *sublock;
+	int		   result;
+
+	LASSERT(parent->cll_depth == 1);
+	cl_lock_mutex_put(env, parent);
+	sublock = lov_sublock_alloc(env, io, lck, idx, &link);
+	/* take the sub-lock mutex before re-taking the parent's */
+	if (!IS_ERR(sublock))
+		cl_lock_mutex_get(env, sublock);
+	cl_lock_mutex_get(env, parent);
+
+	if (!IS_ERR(sublock)) {
+		cl_lock_get_trust(sublock);
+		if (parent->cll_state == CLS_QUEUING &&
+		    lck->lls_sub[idx].sub_lock == NULL) {
+			lov_sublock_adopt(env, lck, sublock, idx, link);
+		} else {
+			OBD_SLAB_FREE_PTR(link, lov_lock_link_kmem);
+			/* other thread allocated sub-lock, or enqueue is no
+			 * longer going on */
+			cl_lock_mutex_put(env, parent);
+			cl_lock_unhold(env, sublock, "lov-parent", parent);
+			cl_lock_mutex_get(env, parent);
+		}
+		cl_lock_mutex_put(env, sublock);
+		cl_lock_put(env, sublock);
+		result = CLO_REPEAT;
+	} else
+		result = PTR_ERR(sublock);
+	return result;
+}
+
+/**
+ * Implementation of cl_lock_operations::clo_enqueue() for lov layer. This
+ * function is rather subtle, as it enqueues top-lock (i.e., advances top-lock
+ * state machine from CLS_QUEUING to CLS_ENQUEUED states) by juggling sub-lock
+ * state machines in the face of sub-locks sharing (by multiple top-locks),
+ * and concurrent sub-lock cancellations.
+ *
+ * \retval 0 if no sub-lock reported a non-zero result and every sub-lock is
+ *	   at least in CLS_ENQUEUED state
+ * \retval CLO_WAIT if no sub-lock reported a non-zero result but some
+ *	   sub-lock is below CLS_ENQUEUED
+ * \retval the first non-zero result (CLO_WAIT, CLO_REPEAT or an error), at
+ *	   which the iteration stops
+ */
+static int lov_lock_enqueue(const struct lu_env *env,
+			    const struct cl_lock_slice *slice,
+			    struct cl_io *io, __u32 enqflags)
+{
+	struct cl_lock	 *lock    = slice->cls_lock;
+	struct lov_lock	*lck     = cl2lov_lock(slice);
+	struct cl_lock_closure *closure = lov_closure_get(env, lock);
+	int i;
+	int result;
+	enum cl_lock_state minstate;
+
+	ENTRY;
+
+	for (result = 0, minstate = CLS_FREEING, i = 0; i < lck->lls_nr; ++i) {
+		int rc;
+		struct lovsub_lock     *sub;
+		struct lov_lock_sub    *lls;
+		struct cl_lock	 *sublock;
+		struct lov_sublock_env *subenv;
+
+		if (lock->cll_state != CLS_QUEUING) {
+			/*
+			 * Lock might have left QUEUING state if previous
+			 * iteration released its mutex. Stop enqueuing in this
+			 * case and let the upper layer decide what to do.
+			 */
+			LASSERT(i > 0 && result != 0);
+			break;
+		}
+
+		lls = &lck->lls_sub[i];
+		sub = lls->sub_lock;
+		/*
+		 * Sub-lock might have been canceled, while top-lock was
+		 * cached.
+		 */
+		if (sub == NULL) {
+			result = lov_sublock_fill(env, lock, io, lck, i);
+			/* lov_sublock_fill() released @lock mutex,
+			 * restart. */
+			break;
+		}
+		sublock = sub->lss_cl.cls_lock;
+		rc = lov_sublock_lock(env, lck, lls, closure, &subenv);
+		if (rc == 0) {
+			lov_sublock_hold(env, lck, i);
+			rc = lov_lock_enqueue_one(subenv->lse_env, lck, sublock,
+						  subenv->lse_io, enqflags,
+						  i == lck->lls_nr - 1);
+			/* track the least advanced sub-lock state */
+			minstate = min(minstate, sublock->cll_state);
+			if (rc == CLO_WAIT) {
+				switch (sublock->cll_state) {
+				case CLS_QUEUING:
+					/* take recursive mutex, the lock is
+					 * released in lov_lock_enqueue_wait.
+					 */
+					cl_lock_mutex_get(env, sublock);
+					lov_sublock_unlock(env, sub, closure,
+							   subenv);
+					rc = lov_lock_enqueue_wait(env, lck,
+								   sublock);
+					break;
+				case CLS_CACHED:
+					cl_lock_get(sublock);
+					/* take recursive mutex of sublock */
+					cl_lock_mutex_get(env, sublock);
+					/* need to release all locks in closure
+					 * otherwise it may deadlock. LU-2683.*/
+					lov_sublock_unlock(env, sub, closure,
+							   subenv);
+					/* sublock and parent are held. */
+					rc = lov_sublock_release(env, lck, i,
+								 1, rc);
+					cl_lock_mutex_put(env, sublock);
+					cl_lock_put(env, sublock);
+					break;
+				default:
+					lov_sublock_unlock(env, sub, closure,
+							   subenv);
+					break;
+				}
+			} else {
+				LASSERT(sublock->cll_conflict == NULL);
+				lov_sublock_unlock(env, sub, closure, subenv);
+			}
+		}
+		result = lov_subresult(result, rc);
+		if (result != 0)
+			break;
+	}
+	cl_lock_closure_fini(closure);
+	RETURN(result ?: minstate >= CLS_ENQUEUED ? 0 : CLO_WAIT);
+}
+
+/**
+ * Calls cl_unuse_try() on every held sub-lock of the top-lock and releases
+ * the top-lock's hold on it. Empty slots are skipped, and the loop visits
+ * all slots even after a failure, merging results with lov_subresult().
+ *
+ * \retval -ESTALE if all sub-operations returned 0 but a cancellation race
+ *	   was recorded in lls_cancel_race (the flag is cleared)
+ * \retval merged result of the sub-operations otherwise
+ */
+static int lov_lock_unuse(const struct lu_env *env,
+			  const struct cl_lock_slice *slice)
+{
+	struct lov_lock	*lck     = cl2lov_lock(slice);
+	struct cl_lock_closure *closure = lov_closure_get(env, slice->cls_lock);
+	int i;
+	int result;
+
+	ENTRY;
+
+	for (result = 0, i = 0; i < lck->lls_nr; ++i) {
+		int rc;
+		struct lovsub_lock     *sub;
+		struct cl_lock	 *sublock;
+		struct lov_lock_sub    *lls;
+		struct lov_sublock_env *subenv;
+
+		/* top-lock state cannot change concurrently, because single
+		 * thread (one that released the last hold) carries unlocking
+		 * to the completion. */
+		LASSERT(slice->cls_lock->cll_state == CLS_INTRANSIT);
+		lls = &lck->lls_sub[i];
+		sub = lls->sub_lock;
+		if (sub == NULL)
+			continue;
+
+		sublock = sub->lss_cl.cls_lock;
+		rc = lov_sublock_lock(env, lck, lls, closure, &subenv);
+		if (rc == 0) {
+			if (lls->sub_flags & LSF_HELD) {
+				LASSERT(sublock->cll_state == CLS_HELD ||
+					sublock->cll_state == CLS_ENQUEUED);
+				rc = cl_unuse_try(subenv->lse_env, sublock);
+				/* deluser == 0: no user is removed here */
+				rc = lov_sublock_release(env, lck, i, 0, rc);
+			}
+			lov_sublock_unlock(env, sub, closure, subenv);
+		}
+		result = lov_subresult(result, rc);
+	}
+
+	if (result == 0 && lck->lls_cancel_race) {
+		lck->lls_cancel_race = 0;
+		result = -ESTALE;
+	}
+	cl_lock_closure_fini(closure);
+	RETURN(result);
+}
+
+
+/**
+ * Releases the top-lock's hold on each of its held sub-locks. Sub-locks in
+ * CLS_HELD state are first passed to cl_unuse_try(); for sub-locks in any
+ * other state the user is removed as part of the release. A slot is retried
+ * when CLO_REPEAT is reported for it. Failures are only logged, as the
+ * function returns nothing.
+ */
+static void lov_lock_cancel(const struct lu_env *env,
+			   const struct cl_lock_slice *slice)
+{
+	struct lov_lock	*lck     = cl2lov_lock(slice);
+	struct cl_lock_closure *closure = lov_closure_get(env, slice->cls_lock);
+	int i;
+	int result;
+
+	ENTRY;
+
+	for (result = 0, i = 0; i < lck->lls_nr; ++i) {
+		int rc;
+		struct lovsub_lock     *sub;
+		struct cl_lock	 *sublock;
+		struct lov_lock_sub    *lls;
+		struct lov_sublock_env *subenv;
+
+		/* top-lock state cannot change concurrently, because single
+		 * thread (one that released the last hold) carries unlocking
+		 * to the completion. */
+		lls = &lck->lls_sub[i];
+		sub = lls->sub_lock;
+		if (sub == NULL)
+			continue;
+
+		sublock = sub->lss_cl.cls_lock;
+		rc = lov_sublock_lock(env, lck, lls, closure, &subenv);
+		if (rc == 0) {
+			/* nothing to release for a sub-lock that is not held */
+			if (!(lls->sub_flags & LSF_HELD)) {
+				lov_sublock_unlock(env, sub, closure, subenv);
+				continue;
+			}
+
+			switch(sublock->cll_state) {
+			case CLS_HELD:
+				rc = cl_unuse_try(subenv->lse_env, sublock);
+				lov_sublock_release(env, lck, i, 0, 0);
+				break;
+			default:
+				lov_sublock_release(env, lck, i, 1, 0);
+				break;
+			}
+			lov_sublock_unlock(env, sub, closure, subenv);
+		}
+
+		/* step back so that the loop increment retries this slot */
+		if (rc == CLO_REPEAT) {
+			--i;
+			continue;
+		}
+
+		result = lov_subresult(result, rc);
+	}
+
+	if (result)
+		CL_LOCK_DEBUG(D_ERROR, env, slice->cls_lock,
+			      "lov_lock_cancel fails with %d.\n", result);
+
+	cl_lock_closure_fini(closure);
+}
+
+/**
+ * Calls cl_wait_try() on every sub-lock that has not yet reached CLS_HELD.
+ * The scan stops at the first non-zero merged result, and is restarted from
+ * the first slot when it finished with 0 but some sub-lock reported
+ * CLO_REENQUEUED.
+ *
+ * \retval 0 if all sub-locks are at least in CLS_HELD state
+ * \retval CLO_WAIT if no sub-operation failed but some sub-lock is below
+ *	   CLS_HELD
+ * \retval the first non-zero merged result otherwise
+ */
+static int lov_lock_wait(const struct lu_env *env,
+			 const struct cl_lock_slice *slice)
+{
+	struct lov_lock	*lck     = cl2lov_lock(slice);
+	struct cl_lock_closure *closure = lov_closure_get(env, slice->cls_lock);
+	enum cl_lock_state      minstate;
+	int		     reenqueued;
+	int		     result;
+	int		     i;
+
+	ENTRY;
+
+again:
+	for (result = 0, minstate = CLS_FREEING, i = 0, reenqueued = 0;
+	     i < lck->lls_nr; ++i) {
+		int rc;
+		struct lovsub_lock     *sub;
+		struct cl_lock	 *sublock;
+		struct lov_lock_sub    *lls;
+		struct lov_sublock_env *subenv;
+
+		lls = &lck->lls_sub[i];
+		sub = lls->sub_lock;
+		LASSERT(sub != NULL);
+		sublock = sub->lss_cl.cls_lock;
+		rc = lov_sublock_lock(env, lck, lls, closure, &subenv);
+		if (rc == 0) {
+			LASSERT(sublock->cll_state >= CLS_ENQUEUED);
+			if (sublock->cll_state < CLS_HELD)
+				rc = cl_wait_try(env, sublock);
+
+			/* track the least advanced sub-lock state */
+			minstate = min(minstate, sublock->cll_state);
+			lov_sublock_unlock(env, sub, closure, subenv);
+		}
+		/* a re-enqueue is not a failure; it forces another pass */
+		if (rc == CLO_REENQUEUED) {
+			reenqueued++;
+			rc = 0;
+		}
+		result = lov_subresult(result, rc);
+		if (result != 0)
+			break;
+	}
+	/* Each sublock only can be reenqueued once, so will not loop for
+	 * ever. */
+	if (result == 0 && reenqueued != 0)
+		goto again;
+	cl_lock_closure_fini(closure);
+	RETURN(result ?: minstate >= CLS_HELD ? 0 : CLO_WAIT);
+}
+
+/**
+ * Re-takes a hold on every sub-lock of the top-lock and calls cl_use_try()
+ * on those in CLS_CACHED state. The top-lock must be in CLS_INTRANSIT state
+ * (asserted). The loop stops at the first non-zero result.
+ *
+ * \retval 0 on success
+ * \retval -ESTALE if a slot is empty, a sub-lock is in CLS_NEW state, or a
+ *	   cancellation race was recorded in lls_cancel_race
+ * \retval other non-zero merged result of the sub-operations
+ */
+static int lov_lock_use(const struct lu_env *env,
+			const struct cl_lock_slice *slice)
+{
+	struct lov_lock	*lck     = cl2lov_lock(slice);
+	struct cl_lock_closure *closure = lov_closure_get(env, slice->cls_lock);
+	int		     result;
+	int		     i;
+
+	LASSERT(slice->cls_lock->cll_state == CLS_INTRANSIT);
+	ENTRY;
+
+	for (result = 0, i = 0; i < lck->lls_nr; ++i) {
+		int rc;
+		struct lovsub_lock     *sub;
+		struct cl_lock	 *sublock;
+		struct lov_lock_sub    *lls;
+		struct lov_sublock_env *subenv;
+
+		LASSERT(slice->cls_lock->cll_state == CLS_INTRANSIT);
+
+		lls = &lck->lls_sub[i];
+		sub = lls->sub_lock;
+		if (sub == NULL) {
+			/*
+			 * Sub-lock might have been canceled, while top-lock was
+			 * cached.
+			 */
+			result = -ESTALE;
+			break;
+		}
+
+		sublock = sub->lss_cl.cls_lock;
+		rc = lov_sublock_lock(env, lck, lls, closure, &subenv);
+		if (rc == 0) {
+			LASSERT(sublock->cll_state != CLS_FREEING);
+			lov_sublock_hold(env, lck, i);
+			if (sublock->cll_state == CLS_CACHED) {
+				rc = cl_use_try(subenv->lse_env, sublock, 0);
+				/* undo the hold taken above on failure */
+				if (rc != 0)
+					rc = lov_sublock_release(env, lck,
+								 i, 1, rc);
+			} else if (sublock->cll_state == CLS_NEW) {
+				/* Sub-lock might have been canceled, while
+				 * top-lock was cached. */
+				/* note: sets result directly; rc stays 0 */
+				result = -ESTALE;
+				lov_sublock_release(env, lck, i, 1, result);
+			}
+			lov_sublock_unlock(env, sub, closure, subenv);
+		}
+		result = lov_subresult(result, rc);
+		if (result != 0)
+			break;
+	}
+
+	if (lck->lls_cancel_race) {
+		/*
+		 * If there is unlocking happened at the same time, then
+		 * sublock_lock state should be FREEING, and lov_sublock_lock
+		 * should return CLO_REPEAT. In this case, it should return
+		 * ESTALE, and up layer should reset the lock state to be NEW.
+		 */
+		lck->lls_cancel_race = 0;
+		LASSERT(result != 0);
+		result = -ESTALE;
+	}
+	cl_lock_closure_fini(closure);
+	RETURN(result);
+}
+
+/*
+ * NOTE(review): everything up to the matching #endif is compiled out. As
+ * written it is only a sketch: the function has no parameter list or return
+ * statement and uses names (slice, env, lov, need, result) that it never
+ * declares.
+ */
+#if 0
+static int lock_lock_multi_match()
+{
+	struct cl_lock	  *lock    = slice->cls_lock;
+	struct cl_lock_descr    *subneed = &lov_env_info(env)->lti_ldescr;
+	struct lov_object       *loo     = cl2lov(lov->lls_cl.cls_obj);
+	struct lov_layout_raid0 *r0      = lov_r0(loo);
+	struct lov_lock_sub     *sub;
+	struct cl_object	*subobj;
+	obd_off  fstart;
+	obd_off  fend;
+	obd_off  start;
+	obd_off  end;
+	int i;
+
+	fstart = cl_offset(need->cld_obj, need->cld_start);
+	fend   = cl_offset(need->cld_obj, need->cld_end + 1) - 1;
+	subneed->cld_mode = need->cld_mode;
+	cl_lock_mutex_get(env, lock);
+	for (i = 0; i < lov->lls_nr; ++i) {
+		sub = &lov->lls_sub[i];
+		if (sub->sub_lock == NULL)
+			continue;
+		subobj = sub->sub_descr.cld_obj;
+		if (!lov_stripe_intersects(loo->lo_lsm, sub->sub_stripe,
+					   fstart, fend, &start, &end))
+			continue;
+		subneed->cld_start = cl_index(subobj, start);
+		subneed->cld_end   = cl_index(subobj, end);
+		subneed->cld_obj   = subobj;
+		if (!cl_lock_ext_match(&sub->sub_got, subneed)) {
+			result = 0;
+			break;
+		}
+	}
+	cl_lock_mutex_put(env, lock);
+}
+#endif
+
+/**
+ * Tells whether the extent described by \a descr is covered by the sub-lock
+ * descriptor \a child, which belongs to stripe number \a stripe.
+ *
+ * \retval non-zero if \a child covers \a descr
+ * \retval 0 otherwise
+ */
+static int lov_lock_stripe_is_matching(const struct lu_env *env,
+				       struct lov_object *lov, int stripe,
+				       const struct cl_lock_descr *child,
+				       const struct cl_lock_descr *descr)
+{
+	struct lov_stripe_md *lsm = lov->lo_lsm;
+	struct cl_lock_descr *subd;
+	obd_off start;
+	obd_off end;
+	obd_off sub_start;
+	obd_off sub_end;
+	int result;
+
+	/* with a single stripe the extents can be compared directly */
+	if (lov_r0(lov)->lo_nr == 1)
+		return cl_lock_ext_match(child, descr);
+
+	/*
+	 * Multi-stripe object: \a descr must lie entirely within the child's
+	 * stripe before the extents are compared.
+	 */
+	start = cl_offset(&lov->lo_cl, descr->cld_start);
+	end   = cl_offset(&lov->lo_cl, descr->cld_end + 1) - 1;
+	if (end - start > lsm->lsm_stripe_size)
+		return 0;
+	if (stripe != lov_stripe_number(lsm, start))
+		return 0;
+	if (stripe != lov_stripe_number(lsm, end))
+		return 0;
+
+	subd = &lov_env_info(env)->lti_ldescr;
+	subd->cld_obj  = NULL;   /* the sub-object is not needed here */
+	subd->cld_mode = descr->cld_mode;
+	subd->cld_gid  = descr->cld_gid;
+	result = lov_stripe_intersects(lsm, stripe, start, end,
+				       &sub_start, &sub_end);
+	LASSERT(result);
+	/* translate the file extent into page indices of the sub-object */
+	subd->cld_start = cl_index(child->cld_obj, sub_start);
+	subd->cld_end   = cl_index(child->cld_obj, sub_end);
+	return cl_lock_ext_match(child, subd);
+}
+
+/**
+ * An implementation of cl_lock_operations::clo_fits_into() method.
+ *
+ * Checks whether a lock (given by \a slice) is suitable for \a
+ * io. Multi-stripe locks can be used only for "quick" io, like truncate, or
+ * O_APPEND write.
+ *
+ * \param need  description of the lock that \a io requires
+ *
+ * \retval non-zero if the lock can be used for \a need
+ * \retval 0 otherwise, in particular whenever the enqueue flags of \a need
+ *	   differ from those the top-lock was created with
+ *
+ * \see ccc_lock_fits_into().
+ */
+static int lov_lock_fits_into(const struct lu_env *env,
+			      const struct cl_lock_slice *slice,
+			      const struct cl_lock_descr *need,
+			      const struct cl_io *io)
+{
+	struct lov_lock   *lov = cl2lov_lock(slice);
+	struct lov_object *obj = cl2lov(slice->cls_obj);
+	int result;
+
+	LASSERT(cl_object_same(need->cld_obj, slice->cls_obj));
+	LASSERT(lov->lls_nr > 0);
+
+	ENTRY;
+
+	/* for top lock, it's necessary to match enq flags otherwise it will
+	 * run into problem if a sublock is missing and reenqueue. */
+	if (need->cld_enq_flags != lov->lls_orig.cld_enq_flags)
+		/* leave through RETURN() so that ENTRY above stays paired */
+		RETURN(0);
+
+	if (need->cld_mode == CLM_GROUP)
+		/*
+		 * always allow to match group lock.
+		 */
+		result = cl_lock_ext_match(&lov->lls_orig, need);
+	else if (lov->lls_nr == 1) {
+		struct cl_lock_descr *got = &lov->lls_sub[0].sub_got;
+		result = lov_lock_stripe_is_matching(env, obj,
+						     lov->lls_sub[0].sub_stripe,
+						     got, need);
+	} else if (io->ci_type != CIT_SETATTR && io->ci_type != CIT_MISC &&
+		   !cl_io_is_append(io) && need->cld_mode != CLM_PHANTOM)
+		/*
+		 * Multi-stripe locks are only suitable for `quick' IO and for
+		 * glimpse.
+		 */
+		result = 0;
+	else
+		/*
+		 * Most general case: multi-stripe existing lock, and
+		 * (potentially) multi-stripe @need lock. Check that @need is
+		 * covered by @lov's sub-locks.
+		 *
+		 * For now, ignore lock expansions made by the server, and
+		 * match against original lock extent.
+		 */
+		result = cl_lock_ext_match(&lov->lls_orig, need);
+	CDEBUG(D_DLMTRACE, DDESCR"/"DDESCR" %d %d/%d: %d\n",
+	       PDESCR(&lov->lls_orig), PDESCR(&lov->lls_sub[0].sub_got),
+	       lov->lls_sub[0].sub_stripe, lov->lls_nr, lov_r0(obj)->lo_nr,
+	       result);
+	RETURN(result);
+}
+
+/**
+ * Detaches sub-lock \a sub from the top-lock that \a link points to.
+ *
+ * The caller must hold the mutexes of both the top-lock and the sub-lock
+ * (asserted). The slot in the top-lock's lls_sub array is cleared, the
+ * top-lock reference taken in lov_sublock_adopt() is dropped, and \a link is
+ * freed, so it must not be used afterwards.
+ */
+void lov_lock_unlink(const struct lu_env *env,
+		     struct lov_lock_link *link, struct lovsub_lock *sub)
+{
+	struct lov_lock *lck    = link->lll_super;
+	struct cl_lock  *parent = lck->lls_cl.cls_lock;
+
+	LASSERT(cl_lock_is_mutexed(parent));
+	LASSERT(cl_lock_is_mutexed(sub->lss_cl.cls_lock));
+	ENTRY;
+
+	list_del_init(&link->lll_list);
+	LASSERT(lck->lls_sub[link->lll_idx].sub_lock == sub);
+	/* yank this sub-lock from parent's array */
+	lck->lls_sub[link->lll_idx].sub_lock = NULL;
+	LASSERT(lck->lls_nr_filled > 0);
+	lck->lls_nr_filled--;
+	lu_ref_del(&parent->cll_reference, "lov-child", sub->lss_cl.cls_lock);
+	cl_lock_put(env, parent);
+	OBD_SLAB_FREE_PTR(link, lov_lock_link_kmem);
+	EXIT;
+}
+
+/**
+ * Walks the lss_parents list of \a sub looking for the link whose lll_super
+ * is \a lck.
+ *
+ * The sub-lock must be mutexed by the caller (asserted).
+ *
+ * \retval the matching link, or NULL when no entry on the list refers to
+ *         \a lck
+ */
+struct lov_lock_link *lov_lock_link_find(const struct lu_env *env,
+					 struct lov_lock *lck,
+					 struct lovsub_lock *sub)
+{
+	struct lov_lock_link *pos;
+	struct lov_lock_link *found = NULL;
+
+	LASSERT(cl_lock_is_mutexed(sub->lss_cl.cls_lock));
+	ENTRY;
+
+	list_for_each_entry(pos, &sub->lss_parents, lll_list) {
+		if (pos->lll_super != lck)
+			continue;
+		found = pos;
+		break;
+	}
+	RETURN(found);
+}
+
+/**
+ * An implementation of cl_lock_operations::clo_delete() method. This is
+ * invoked for "top-to-bottom" delete, when lock destruction starts from the
+ * top-lock, e.g., as a result of inode destruction.
+ *
+ * Unlinks top-lock from all its sub-locks. Sub-locks are not deleted there:
+ * this is done separately elsewhere:
+ *
+ *     - for inode destruction, lov_object_delete() calls cl_object_kill() for
+ *       each sub-object, purging its locks;
+ *
+ *     - in other cases (e.g., a fatal error with a top-lock) sub-locks are
+ *       left in the cache.
+ *
+ * For every slot of the top-lock that still has a sub-lock, the loop takes
+ * the sub-lock with lov_sublock_lock() (retrying the same slot when that
+ * returns CLO_REPEAT), releases it through lov_sublock_release() when
+ * LSF_HELD is set, removes the link between the two locks with
+ * lov_lock_unlink() and finally calls lov_sublock_unlock().
+ *
+ * \param env   execution environment, also passed to lov_closure_get()
+ * \param slice lov slice of the top-lock; the lock is asserted to be in
+ *              CLS_FREEING state
+ */
+static void lov_lock_delete(const struct lu_env *env,
+			    const struct cl_lock_slice *slice)
+{
+	struct lov_lock	*lck     = cl2lov_lock(slice);
+	struct cl_lock_closure *closure = lov_closure_get(env, slice->cls_lock);
+	struct lov_lock_link   *link;
+	int		     rc;
+	int		     i;
+
+	LASSERT(slice->cls_lock->cll_state == CLS_FREEING);
+	ENTRY;
+
+	for (i = 0; i < lck->lls_nr; ++i) {
+		struct lov_lock_sub *lls = &lck->lls_sub[i];
+		struct lovsub_lock  *lsl = lls->sub_lock;
+
+		if (lsl == NULL) /* already removed */
+			continue;
+
+		rc = lov_sublock_lock(env, lck, lls, closure, NULL);
+		if (rc == CLO_REPEAT) {
+			--i; /* cancels the loop's ++i: this slot is redone */
+			continue;
+		}
+
+		LASSERT(rc == 0);
+		LASSERT(lsl->lss_cl.cls_lock->cll_state < CLS_FREEING);
+
+		if (lls->sub_flags & LSF_HELD)
+			lov_sublock_release(env, lck, i, 1, 0);
+
+		link = lov_lock_link_find(env, lck, lsl);
+		LASSERT(link != NULL);
+		lov_lock_unlink(env, link, lsl); /* clears lls_sub[i].sub_lock */
+		LASSERT(lck->lls_sub[i].sub_lock == NULL);
+
+		lov_sublock_unlock(env, lsl, closure, NULL);
+	}
+
+	cl_lock_closure_fini(closure);
+	EXIT;
+}
+
+/**
+ * Prints the lov layer of a lock through printer \a p: first the number of
+ * sub-lock slots, then for each slot its index and flags followed by the
+ * sub-lock itself, or "---" when the slot is empty.
+ *
+ * \retval 0 always
+ */
+static int lov_lock_print(const struct lu_env *env, void *cookie,
+			  lu_printer_t p, const struct cl_lock_slice *slice)
+{
+	struct lov_lock *top = cl2lov_lock(slice);
+	int idx;
+
+	(*p)(env, cookie, "%d\n", top->lls_nr);
+	for (idx = 0; idx < top->lls_nr; idx++) {
+		struct lov_lock_sub *slot = &top->lls_sub[idx];
+
+		(*p)(env, cookie, "    %d %x: ", idx, slot->sub_flags);
+		if (slot->sub_lock == NULL) {
+			(*p)(env, cookie, "---\n");
+			continue;
+		}
+		cl_lock_print(env, cookie, p, slot->sub_lock->lss_cl.cls_lock);
+	}
+	return 0;
+}
+
+static const struct cl_lock_operations lov_lock_ops = {
+	.clo_fini      = lov_lock_fini,
+	.clo_enqueue   = lov_lock_enqueue,
+	.clo_wait      = lov_lock_wait,
+	.clo_use       = lov_lock_use,
+	.clo_unuse     = lov_lock_unuse,
+	.clo_cancel    = lov_lock_cancel,
+	.clo_fits_into = lov_lock_fits_into,
+	.clo_delete    = lov_lock_delete,	/* top-to-bottom delete */
+	.clo_print     = lov_lock_print
+}; /* method table handed to cl_lock_slice_add() by lov_lock_init_raid0() */
+
+/**
+ * Allocates the lov slice for \a lock from lov_lock_kmem, adds it to the lock
+ * with lov_lock_ops as its method table and then initializes the sub-lock
+ * state through lov_lock_sub_init().
+ *
+ * \retval -ENOMEM if the slice cannot be allocated
+ * \retval otherwise the result of lov_lock_sub_init()
+ */
+int lov_lock_init_raid0(const struct lu_env *env, struct cl_object *obj,
+			struct cl_lock *lock, const struct cl_io *io)
+{
+	struct lov_lock *lovlk;
+	int rc = -ENOMEM;
+
+	ENTRY;
+	OBD_SLAB_ALLOC_PTR_GFP(lovlk, lov_lock_kmem, __GFP_IO);
+	if (lovlk == NULL)
+		RETURN(rc);
+
+	cl_lock_slice_add(lock, &lovlk->lls_cl, obj, &lov_lock_ops);
+	rc = lov_lock_sub_init(env, lovlk, io);
+	RETURN(rc);
+}
+
+/**
+ * clo_fini() method of lov_empty_lock_ops: returns the lov slice to
+ * lov_lock_kmem.
+ */
+static void lov_empty_lock_fini(const struct lu_env *env,
+				struct cl_lock_slice *slice)
+{
+	struct lov_lock *doomed;
+
+	doomed = cl2lov_lock(slice);
+	OBD_SLAB_FREE_PTR(doomed, lov_lock_kmem);
+}
+
+/**
+ * clo_print() method of lov_empty_lock_ops: emits the word "empty" through
+ * printer \a p and returns 0.
+ */
+static int lov_empty_lock_print(const struct lu_env *env, void *cookie,
+			lu_printer_t p, const struct cl_lock_slice *slice)
+{
+	p(env, cookie, "empty\n");
+
+	return 0;
+}
+
+/*
+ * Lock methods installed by lov_lock_init_empty(); only fini and print are
+ * provided.
+ *
+ * XXX: more methods will be added later.
+ */
+static const struct cl_lock_operations lov_empty_lock_ops = {
+	.clo_fini  = lov_empty_lock_fini,
+	.clo_print = lov_empty_lock_print
+};
+
+/**
+ * Allocates a lov slice for \a lock, attaches it with lov_empty_lock_ops and
+ * records the lock's descriptor in lls_orig.
+ *
+ * \retval 0       on success
+ * \retval -ENOMEM if the slice cannot be allocated
+ */
+int lov_lock_init_empty(const struct lu_env *env, struct cl_object *obj,
+		struct cl_lock *lock, const struct cl_io *io)
+{
+	struct lov_lock *lovlk;
+
+	ENTRY;
+	OBD_SLAB_ALLOC_PTR_GFP(lovlk, lov_lock_kmem, __GFP_IO);
+	if (lovlk == NULL)
+		RETURN(-ENOMEM);
+
+	cl_lock_slice_add(lock, &lovlk->lls_cl, obj, &lov_empty_lock_ops);
+	lovlk->lls_orig = lock->cll_descr;
+	RETURN(0);
+}
+
+/**
+ * Returns the lock closure kept in the lov per-environment info
+ * (lti_closure), initialized for \a parent with cl_lock_closure_init().
+ *
+ * The closure's list must be empty on entry (asserted).
+ */
+static struct cl_lock_closure *lov_closure_get(const struct lu_env *env,
+					       struct cl_lock *parent)
+{
+	struct cl_lock_closure *clc = &lov_env_info(env)->lti_closure;
+
+	LASSERT(list_empty(&clc->clc_list));
+	cl_lock_closure_init(env, clc, parent, 1);
+
+	return clc;
+}
+
+
+/** @} lov */

diff --git a/drivers/staging/lustre/lustre/lov/lov_log.c b/drivers/staging/lustre/lustre/lov/lov_log.c
new file mode 100644
index 0000000..63b7f8d
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lov/lov_log.c

@@ -0,0 +1,278 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lov/lov_log.c
+ *
+ * Author: Phil Schwan <phil@clusterfs.com>
+ * Author: Peter Braam <braam@clusterfs.com>
+ * Author: Mike Shaver <shaver@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+#include <linux/libcfs/libcfs.h>
+
+#include <obd_support.h>
+#include <lustre_lib.h>
+#include <lustre_net.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_dlm.h>
+#include <lustre_mds.h>
+#include <obd_class.h>
+#include <obd_lov.h>
+#include <obd_ost.h>
+#include <lprocfs_status.h>
+#include <lustre_log.h>
+
+#include "lov_internal.h"
+
+/* Add log records for each OSC that this object is striped over, and return
+ * cookies for each one.  We _would_ have nice abstraction here, except that
+ * we need to keep cookies in stripe order, even if some are NULL, so that
+ * the right cookies are passed back to the right OSTs at the client side.
+ * Unset cookies should be all-zero (which will never occur naturally).
+ *
+ * For MDS_UNLINK_REC and MDS_SETATTR64_REC records the object id of the
+ * current stripe is written into \a rec before the record is passed to
+ * llog_obd_add() on the llog context of the OSC serving that stripe.  A
+ * stripe whose add fails gets an all-zero cookie and still uses up one
+ * cookie slot.
+ *
+ * \a logcookies must be non-NULL and \a numcookies at least the stripe count
+ * (asserted).  Returns the number of cookie slots consumed. */
+static int lov_llog_origin_add(const struct lu_env *env,
+			       struct llog_ctxt *ctxt,
+			       struct llog_rec_hdr *rec,
+			       struct lov_stripe_md *lsm,
+			       struct llog_cookie *logcookies, int numcookies)
+{
+	struct obd_device *obd = ctxt->loc_obd;
+	struct lov_obd *lov = &obd->u.lov;
+	int i, rc = 0, cookies = 0;
+	ENTRY;
+
+	LASSERTF(logcookies && numcookies >= lsm->lsm_stripe_count,
+		 "logcookies %p, numcookies %d lsm->lsm_stripe_count %d \n",
+		 logcookies, numcookies, lsm->lsm_stripe_count);
+
+	for (i = 0; i < lsm->lsm_stripe_count; i++) {
+		struct lov_oinfo *loi = lsm->lsm_oinfo[i];
+		struct obd_device *child =
+			lov->lov_tgts[loi->loi_ost_idx]->ltd_exp->exp_obd;
+		struct llog_ctxt *cctxt = llog_get_context(child, ctxt->loc_idx);
+
+		/* fill mds unlink/setattr log record */
+		switch (rec->lrh_type) {
+		case MDS_UNLINK_REC: {
+			struct llog_unlink_rec *lur = (struct llog_unlink_rec *)rec;
+			lur->lur_oid = ostid_id(&loi->loi_oi);
+			lur->lur_oseq = (__u32)ostid_seq(&loi->loi_oi);
+			break;
+		}
+		case MDS_SETATTR64_REC: {
+			struct llog_setattr64_rec *lsr = (struct llog_setattr64_rec *)rec;
+			lsr->lsr_oi = loi->loi_oi;
+			break;
+		}
+		default:
+			break;
+		}
+
+		/* inject error in llog_obd_add() below */
+		if (OBD_FAIL_CHECK(OBD_FAIL_MDS_FAIL_LOV_LOG_ADD)) {
+			llog_ctxt_put(cctxt);
+			cctxt = NULL; /* llog_obd_add() then gets a NULL ctxt */
+		}
+		rc = llog_obd_add(env, cctxt, rec, NULL, logcookies + cookies,
+				  numcookies - cookies);
+		llog_ctxt_put(cctxt);
+		if (rc < 0) {
+			CERROR("Can't add llog (rc = %d) for stripe %d\n",
+			       rc, cookies);
+			memset(logcookies + cookies, 0,
+			       sizeof(struct llog_cookie));
+			rc = 1; /* skip this cookie */
+		}
+		/* Note that rc is always 1 if llog_obd_add was successful */
+		cookies += rc;
+	}
+	RETURN(cookies);
+}
+
+/*
+ * lop_connect method of lov_mds_ost_orig_logops.
+ *
+ * Calls llog_connect() on the llog context (same index as \a ctxt) of every
+ * active target, or only of the targets whose uuid equals \a uuid when one
+ * is given.  Failures are logged; the first non-zero llog_connect() result
+ * is returned, 0 if there was none.
+ */
+static int lov_llog_origin_connect(struct llog_ctxt *ctxt,
+				   struct llog_logid *logid,
+				   struct llog_gen *gen,
+				   struct obd_uuid *uuid)
+{
+	struct obd_device *obd = ctxt->loc_obd;
+	struct lov_obd *lov = &obd->u.lov;
+	int first_err = 0;
+	int idx;
+	ENTRY;
+
+	obd_getref(obd);
+	for (idx = 0; idx < lov->desc.ld_tgt_count; idx++) {
+		struct lov_tgt_desc *tgt = lov->lov_tgts[idx];
+		struct llog_ctxt *cctxt;
+		int rc;
+
+		if (tgt == NULL || !tgt->ltd_active)
+			continue;
+		if (uuid != NULL && !obd_uuid_equals(uuid, &tgt->ltd_uuid))
+			continue;
+
+		CDEBUG(D_CONFIG, "connect %d/%d\n",
+		       idx, lov->desc.ld_tgt_count);
+		cctxt = llog_get_context(tgt->ltd_exp->exp_obd, ctxt->loc_idx);
+		rc = llog_connect(cctxt, logid, gen, uuid);
+		llog_ctxt_put(cctxt);
+		if (rc == 0)
+			continue;
+
+		CERROR("error osc_llog_connect tgt %d (%d)\n", idx, rc);
+		if (first_err == 0)
+			first_err = rc;
+	}
+	obd_putref(obd);
+
+	RETURN(first_err);
+}
+
+/*
+ * The replicator's commit callback (lop_cancel of lov_size_repl_logops).
+ *
+ * Cancels one cookie per stripe of \a lsm on the llog context of the target
+ * holding that stripe; \a count must equal the stripe count (asserted).
+ * An error is reported and recorded only when the target is still marked
+ * active; the first such error is returned, 0 otherwise.
+ */
+static int lov_llog_repl_cancel(const struct lu_env *env,
+				struct llog_ctxt *ctxt,
+				struct lov_stripe_md *lsm,
+				int count, struct llog_cookie *cookies,
+				int flags)
+{
+	struct obd_device *obd = ctxt->loc_obd;
+	struct lov_obd *lov = &obd->u.lov;
+	int first_err = 0;
+	int stripe;
+	ENTRY;
+
+	LASSERT(lsm != NULL);
+	LASSERT(count == lsm->lsm_stripe_count);
+
+	obd_getref(obd);
+	for (stripe = 0; stripe < count; stripe++) {
+		struct lov_oinfo *loi = lsm->lsm_oinfo[stripe];
+		struct lov_tgt_desc *tgt = lov->lov_tgts[loi->loi_ost_idx];
+		struct llog_ctxt *cctxt;
+		int err;
+
+		cctxt = llog_get_context(tgt->ltd_exp->exp_obd, ctxt->loc_idx);
+		err = llog_cancel(env, cctxt, NULL, 1, &cookies[stripe],
+				  flags);
+		llog_ctxt_put(cctxt);
+		if (err == 0 || !tgt->ltd_active)
+			continue;
+
+		CERROR("%s: objid "DOSTID" subobj "DOSTID
+		       " on OST idx %d: rc = %d\n",
+		       obd->obd_name, POSTID(&lsm->lsm_oi),
+		       POSTID(&loi->loi_oi), loi->loi_ost_idx, err);
+		if (first_err == 0)
+			first_err = err;
+	}
+	obd_putref(obd);
+	RETURN(first_err);
+}
+
+static struct llog_operations lov_mds_ost_orig_logops = {
+	.lop_obd_add	= lov_llog_origin_add,
+	.lop_connect	= lov_llog_origin_connect,
+}; /* passed to llog_setup() for LLOG_MDS_OST_ORIG_CTXT in lov_llog_init() */
+
+static struct llog_operations lov_size_repl_logops = {
+	.lop_cancel	= lov_llog_repl_cancel,
+}; /* passed to llog_setup() for LLOG_SIZE_REPL_CTXT in lov_llog_init() */
+
+/*
+ * Sets up the two llog contexts of the LOV device (LLOG_MDS_OST_ORIG_CTXT and
+ * LLOG_SIZE_REPL_CTXT) and then calls obd_llog_init() on its targets: on all
+ * of them when \a index is NULL, otherwise only on target *index.
+ *
+ * A target whose obd_llog_init() fails is only logged.  When the second
+ * llog_setup() fails, whichever of the two contexts can still be looked up
+ * is cleaned up and the error is returned; a failure of the first
+ * llog_setup() is returned directly.
+ */
+int lov_llog_init(struct obd_device *obd, struct obd_llog_group *olg,
+		  struct obd_device *disk_obd, int *index)
+{
+	struct lov_obd *lov = &obd->u.lov;
+	struct llog_ctxt *ctxt;
+	int i, rc;
+	ENTRY;
+
+	LASSERT(olg == &obd->obd_olg);
+	rc = llog_setup(NULL, obd, olg, LLOG_MDS_OST_ORIG_CTXT, disk_obd,
+			&lov_mds_ost_orig_logops);
+	if (rc != 0)
+		RETURN(rc);
+
+	rc = llog_setup(NULL, obd, olg, LLOG_SIZE_REPL_CTXT, disk_obd,
+			&lov_size_repl_logops);
+	if (rc != 0)
+		GOTO(err_cleanup, rc);
+
+	obd_getref(obd);
+	/* count may not match lov->desc.ld_tgt_count during dynamic ost add */
+	for (i = 0; i < lov->desc.ld_tgt_count; i++) {
+		struct obd_device *osc;
+
+		if (lov->lov_tgts[i] == NULL || (index != NULL && i != *index))
+			continue;
+
+		osc = lov->lov_tgts[i]->ltd_obd;
+		rc = obd_llog_init(osc, &osc->obd_olg, disk_obd, &i);
+		if (rc != 0)
+			CERROR("error osc_llog_init idx %d osc '%s' tgt '%s' "
+			       "(rc=%d)\n", i, osc->obd_name,
+			       disk_obd->obd_name, rc);
+		/* per-target failures do not fail the whole init */
+		rc = 0;
+	}
+	obd_putref(obd);
+	GOTO(err_cleanup, rc);
+err_cleanup:
+	if (rc != 0) {
+		ctxt = llog_get_context(obd, LLOG_SIZE_REPL_CTXT);
+		if (ctxt != NULL)
+			llog_cleanup(NULL, ctxt);
+
+		ctxt = llog_get_context(obd, LLOG_MDS_OST_ORIG_CTXT);
+		if (ctxt != NULL)
+			llog_cleanup(NULL, ctxt);
+	}
+	return rc;
+}
+
+/*
+ * Cleans up the LLOG_MDS_OST_ORIG_CTXT and LLOG_SIZE_REPL_CTXT contexts of
+ * \a obd, each only if it can be looked up.  \a count is not used.
+ *
+ * Always returns 0.
+ */
+int lov_llog_finish(struct obd_device *obd, int count)
+{
+	struct llog_ctxt *orig_ctxt;
+	struct llog_ctxt *repl_ctxt;
+
+	ENTRY;
+
+	/* cleanup our llogs only if the ctxts have been setup
+	 * (client lov doesn't setup, mds lov does). */
+	orig_ctxt = llog_get_context(obd, LLOG_MDS_OST_ORIG_CTXT);
+	if (orig_ctxt != NULL)
+		llog_cleanup(NULL, orig_ctxt);
+
+	repl_ctxt = llog_get_context(obd, LLOG_SIZE_REPL_CTXT);
+	if (repl_ctxt != NULL)
+		llog_cleanup(NULL, repl_ctxt);
+
+	/* lov->tgt llogs are cleaned during osc_cleanup. */
+	RETURN(0);
+}

diff --git a/drivers/staging/lustre/lustre/lov/lov_merge.c b/drivers/staging/lustre/lustre/lov/lov_merge.c
new file mode 100644
index 0000000..ddbac12
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lov/lov_merge.c

@@ -0,0 +1,218 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include <linux/libcfs/libcfs.h>
+
+#include <obd_class.h>
+#include <obd_lov.h>
+
+#include "lov_internal.h"
+
+/** Merge the lock value block(&lvb) attributes and KMS from each of the
+ * stripes in a file into a single lvb. It is expected that the caller
+ * initializes the current atime, mtime, ctime to avoid regressing a more
+ * uptodate time on the local client.
+ *
+ * Must be called with lsm->lsm_lock held by the current process (asserted).
+ *
+ * On return \a lvb holds the largest size over all stripes as translated by
+ * lov_stripe_size() (each stripe contributing the larger of its KMS and its
+ * lvb_size), the sum of the stripes' block counts and the newest
+ * mtime/atime/ctime; \a kms_place holds the largest translated KMS.  Stripes
+ * whose lvb_blocks encodes an error are skipped.
+ *
+ * \retval 0 or the error extracted from the last skipped stripe
+ */
+int lov_merge_lvb_kms(struct lov_stripe_md *lsm,
+		      struct ost_lvb *lvb, __u64 *kms_place)
+{
+	__u64 size = 0;
+	__u64 kms = 0;
+	__u64 blocks = 0;
+	obd_time current_mtime = lvb->lvb_mtime;
+	obd_time current_atime = lvb->lvb_atime;
+	obd_time current_ctime = lvb->lvb_ctime;
+	int i;
+	int rc = 0;
+
+	LASSERT(spin_is_locked(&lsm->lsm_lock));
+	LASSERT(lsm->lsm_lock_owner == current_pid());
+
+	CDEBUG(D_INODE, "MDT ID "DOSTID" initial value: s="LPU64" m="LPU64
+	       " a="LPU64" c="LPU64" b="LPU64"\n", POSTID(&lsm->lsm_oi),
+	       lvb->lvb_size, lvb->lvb_mtime, lvb->lvb_atime, lvb->lvb_ctime,
+	       lvb->lvb_blocks);
+	for (i = 0; i < lsm->lsm_stripe_count; i++) {
+		struct lov_oinfo *loi = lsm->lsm_oinfo[i];
+		obd_size lov_size, tmpsize;
+
+		if (OST_LVB_IS_ERR(loi->loi_lvb.lvb_blocks)) {
+			rc = OST_LVB_GET_ERR(loi->loi_lvb.lvb_blocks);
+			continue; /* this stripe contributes nothing */
+		}
+
+		tmpsize = loi->loi_kms;
+		lov_size = lov_stripe_size(lsm, tmpsize, i);
+		if (lov_size > kms)
+			kms = lov_size;
+
+		if (loi->loi_lvb.lvb_size > tmpsize) /* max(kms, lvb_size) */
+			tmpsize = loi->loi_lvb.lvb_size;
+
+		lov_size = lov_stripe_size(lsm, tmpsize, i);
+		if (lov_size > size)
+			size = lov_size;
+		/* merge blocks, mtime, atime */
+		blocks += loi->loi_lvb.lvb_blocks;
+		if (loi->loi_lvb.lvb_mtime > current_mtime)
+			current_mtime = loi->loi_lvb.lvb_mtime;
+		if (loi->loi_lvb.lvb_atime > current_atime)
+			current_atime = loi->loi_lvb.lvb_atime;
+		if (loi->loi_lvb.lvb_ctime > current_ctime)
+			current_ctime = loi->loi_lvb.lvb_ctime;
+
+		CDEBUG(D_INODE, "MDT ID "DOSTID" on OST[%u]: s="LPU64" m="LPU64
+		       " a="LPU64" c="LPU64" b="LPU64"\n", POSTID(&lsm->lsm_oi),
+		       loi->loi_ost_idx, loi->loi_lvb.lvb_size,
+		       loi->loi_lvb.lvb_mtime, loi->loi_lvb.lvb_atime,
+		       loi->loi_lvb.lvb_ctime, loi->loi_lvb.lvb_blocks);
+	}
+
+	*kms_place = kms;
+	lvb->lvb_size = size;
+	lvb->lvb_blocks = blocks;
+	lvb->lvb_mtime = current_mtime;
+	lvb->lvb_atime = current_atime;
+	lvb->lvb_ctime = current_ctime;
+	RETURN(rc); /* NOTE(review): no matching ENTRY in this function */
+}
+
+/** Locked wrapper around lov_merge_lvb_kms(): takes the stripe lock of
+ * \a lsm, merges the per-stripe lock value block attributes into \a lvb and
+ * drops the lock again.
+ *
+ * The caller is expected to have initialized atime, mtime and ctime in
+ * \a lvb so that a more up-to-date time on the local client is not
+ * regressed.
+ *
+ * If \a kms_only is set, the merged size is replaced by the merged known
+ * minimum size (kms), i.e. the recently seen size (rss) is not considered.
+ * Even when merging RSS, the KMS value is taken if it is larger.  This
+ * prevents getattr from stomping on dirty cached pages which extend the
+ * file size.
+ *
+ * \retval result of lov_merge_lvb_kms()
+ */
+int lov_merge_lvb(struct obd_export *exp,
+		  struct lov_stripe_md *lsm, struct ost_lvb *lvb, int kms_only)
+{
+	__u64 merged_kms;
+	int   result;
+
+	ENTRY;
+	lov_stripe_lock(lsm);
+	result = lov_merge_lvb_kms(lsm, lvb, &merged_kms);
+	lov_stripe_unlock(lsm);
+
+	if (kms_only != 0)
+		lvb->lvb_size = merged_kms;
+
+	CDEBUG(D_INODE, "merged for ID "DOSTID" s="LPU64" m="LPU64" a="LPU64
+	       " c="LPU64" b="LPU64"\n", POSTID(&lsm->lsm_oi), lvb->lvb_size,
+	       lvb->lvb_mtime, lvb->lvb_atime, lvb->lvb_ctime, lvb->lvb_blocks);
+	RETURN(result);
+}
+
+/*
+ * Adjust the known minimum size (KMS) of the stripes of \a lsm for file size
+ * \a size.  Must be called under the lov_stripe_lock() (asserted).
+ *
+ * With \a shrink set, the KMS and the lvb_size of every stripe are set to
+ * the per-stripe value computed by lov_size_to_stripe().  Otherwise only one
+ * stripe is looked at -- the one lov_stripe_number() reports for offset
+ * size - 1, or stripe 0 when \a size is 0 -- and its KMS is only ever raised.
+ *
+ * Always returns 0.
+ */
+int lov_adjust_kms(struct obd_export *exp, struct lov_stripe_md *lsm,
+		   obd_off size, int shrink)
+{
+	struct lov_oinfo *loi;
+	int stripe = 0;
+	__u64 kms;
+	ENTRY;
+
+	LASSERT(spin_is_locked(&lsm->lsm_lock));
+	LASSERT(lsm->lsm_lock_owner == current_pid());
+
+	if (shrink) {
+		for (; stripe < lsm->lsm_stripe_count; stripe++) {
+			/* use the function-scope loi; do not shadow it */
+			loi = lsm->lsm_oinfo[stripe];
+			kms = lov_size_to_stripe(lsm, size, stripe);
+			CDEBUG(D_INODE,
+			       "stripe %d KMS %sing "LPU64"->"LPU64"\n",
+			       stripe, kms > loi->loi_kms ? "increas":"shrink",
+			       loi->loi_kms, kms);
+			/* lvb_size and KMS are both forced to the new value */
+			loi->loi_lvb.lvb_size = kms;
+			loi_kms_set(loi, kms);
+		}
+		RETURN(0);
+	}
+
+	if (size > 0)
+		stripe = lov_stripe_number(lsm, size - 1);
+	kms = lov_size_to_stripe(lsm, size, stripe);
+	loi = lsm->lsm_oinfo[stripe];
+
+	CDEBUG(D_INODE, "stripe %d KMS %sincreasing "LPU64"->"LPU64"\n",
+	       stripe, kms > loi->loi_kms ? "" : "not ", loi->loi_kms, kms);
+	if (kms > loi->loi_kms)
+		loi_kms_set(loi, kms);
+
+	RETURN(0);
+}
+
+/*
+ * Folds the attributes of one stripe object (\a src) into the merged
+ * attributes in \a tgt.  \a set counts the stripes merged so far and is
+ * incremented on every call.
+ *
+ * On the first call (*set == 0) \a tgt becomes a copy of \a src with the
+ * object id taken from \a lsm and, if present, the size translated by
+ * lov_stripe_size().  On later calls size, ctime and mtime take the maximum
+ * while blocks, blksize and data_version are summed.  A field is merged only
+ * when its OBD_MD_FL* bit is set in both \a valid and src->o_valid.
+ */
+void lov_merge_attrs(struct obdo *tgt, struct obdo *src, obd_valid valid,
+		     struct lov_stripe_md *lsm, int stripeno, int *set)
+{
+	obd_valid mask = valid & src->o_valid;
+
+	if (*set == 0) {
+		memcpy(tgt, src, sizeof(*tgt));
+		tgt->o_oi = lsm->lsm_oi;
+		if (mask & OBD_MD_FLSIZE)
+			tgt->o_size = lov_stripe_size(lsm, src->o_size,
+						      stripeno);
+	} else {
+		if (mask & OBD_MD_FLSIZE) {
+			/* this handles sparse files properly */
+			obd_size stripe_end;
+
+			stripe_end = lov_stripe_size(lsm, src->o_size,
+						     stripeno);
+			if (stripe_end > tgt->o_size)
+				tgt->o_size = stripe_end;
+		}
+		if (mask & OBD_MD_FLBLOCKS)
+			tgt->o_blocks += src->o_blocks;
+		if (mask & OBD_MD_FLBLKSZ)
+			tgt->o_blksize += src->o_blksize;
+		if ((mask & OBD_MD_FLCTIME) && tgt->o_ctime < src->o_ctime)
+			tgt->o_ctime = src->o_ctime;
+		if ((mask & OBD_MD_FLMTIME) && tgt->o_mtime < src->o_mtime)
+			tgt->o_mtime = src->o_mtime;
+		if (mask & OBD_MD_FLDATAVERSION)
+			tgt->o_data_version += src->o_data_version;
+	}
+
+	/* data_version needs to be valid on all stripes to be correct! */
+	if ((mask & OBD_MD_FLDATAVERSION) == 0)
+		tgt->o_valid &= ~OBD_MD_FLDATAVERSION;
+
+	(*set)++;
+}

diff --git a/drivers/staging/lustre/lustre/lov/lov_obd.c b/drivers/staging/lustre/lustre/lov/lov_obd.c
new file mode 100644
index 0000000..ef7ff09
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lov/lov_obd.c

@@ -0,0 +1,2916 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lov/lov_obd.c
+ *
+ * Author: Phil Schwan <phil@clusterfs.com>
+ * Author: Peter Braam <braam@clusterfs.com>
+ * Author: Mike Shaver <shaver@clusterfs.com>
+ * Author: Nathan Rutman <nathan@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+#include <linux/libcfs/libcfs.h>
+
+#include <obd_support.h>
+#include <lustre_lib.h>
+#include <lustre_net.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_dlm.h>
+#include <lustre_mds.h>
+#include <lustre_debug.h>
+#include <obd_class.h>
+#include <obd_lov.h>
+#include <obd_ost.h>
+#include <lprocfs_status.h>
+#include <lustre_param.h>
+#include <cl_object.h>
+#include <lclient.h> /* for cl_client_lru */
+#include <lustre/ll_fiemap.h>
+#include <lustre_log.h>
+#include <lustre_fid.h>
+
+#include "lov_internal.h"
+
+/*
+ * Take a reference on lov->tgt usage so that the target array cannot race
+ * with addition/deletion.  Any function that expects lov_tgts to remain
+ * stationary must hold such a reference; it is dropped with lov_putref().
+ */
+static void lov_getref(struct obd_device *obd)
+{
+	struct lov_obd *lov = &obd->u.lov;
+
+	/* nobody gets through here until lov_putref is done */
+	mutex_lock(&lov->lov_lock);
+	atomic_inc(&lov->lov_refcount);
+	mutex_unlock(&lov->lov_lock);
+}
+
+static void __lov_del_obd(struct obd_device *obd, struct lov_tgt_desc *tgt);
+
+/*
+ * Drops a reference taken by lov_getref().
+ *
+ * When the count reaches zero while lov_death_row is non-zero, every target
+ * flagged ltd_reap is taken out of lov_tgts and of the packed pool under
+ * lov_lock; the collected targets are handed to __lov_del_obd() only after
+ * the lock has been released.
+ */
+static void lov_putref(struct obd_device *obd)
+{
+	struct lov_obd *lov = &obd->u.lov;
+	struct lov_tgt_desc *tgt, *next;
+	LIST_HEAD(doomed);
+	int idx;
+
+	mutex_lock(&lov->lov_lock);
+	/* ok to dec to 0 more than once -- ltd_exp's will be null */
+	if (!atomic_dec_and_test(&lov->lov_refcount) || !lov->lov_death_row) {
+		mutex_unlock(&lov->lov_lock);
+		return;
+	}
+
+	CDEBUG(D_CONFIG, "destroying %d lov targets\n",
+	       lov->lov_death_row);
+	for (idx = 0; idx < lov->desc.ld_tgt_count; idx++) {
+		tgt = lov->lov_tgts[idx];
+		if (tgt == NULL || !tgt->ltd_reap)
+			continue;
+
+		list_add(&tgt->ltd_kill, &doomed);
+		/* XXX - right now there is a dependency on ld_tgt_count
+		 * being the maximum tgt index for computing the
+		 * mds_max_easize. So we can't shrink it. */
+		lov_ost_pool_remove(&lov->lov_packed, idx);
+		lov->lov_tgts[idx] = NULL;
+		lov->lov_death_row--;
+	}
+	mutex_unlock(&lov->lov_lock);
+
+	list_for_each_entry_safe(tgt, next, &doomed, ltd_kill) {
+		list_del(&tgt->ltd_kill);
+		/* Disconnect */
+		__lov_del_obd(obd, tgt);
+	}
+}
+
+static int lov_set_osc_active(struct obd_device *obd, struct obd_uuid *uuid,
+			      enum obd_notify_event ev);
+static int lov_notify(struct obd_device *obd, struct obd_device *watched,
+		      enum obd_notify_event ev, void *data);
+
+
+#define MAX_STRING_SIZE 128
+/*
+ * Connects the LOV device \a obd to the target in slot \a index.
+ *
+ * The target's import is activated first when \a activate is set, \a obd is
+ * registered as the target's observer and, unless the import is marked
+ * invalid, obd_connect() is called to obtain the target export.  When the
+ * LOV has a proc directory (obd_proc_private), a symlink named after the
+ * target device is added to it; if that fails the directory is removed.
+ *
+ * \param data  connect data passed on to obd_connect(); may be NULL.  When
+ *              OBD_CONNECT_INDEX is set, ocd_index is overwritten with
+ *              \a index.
+ *
+ * \retval 0        connected, or import invalid so no connect was attempted
+ * \retval -EINVAL  no target in the slot, or target device not set up
+ * \retval -ENODEV  obd_connect() failed or produced no export
+ * \retval other    error returned by obd_register_observer()
+ */
+int lov_connect_obd(struct obd_device *obd, __u32 index, int activate,
+		    struct obd_connect_data *data)
+{
+	struct lov_obd *lov = &obd->u.lov;
+	struct obd_uuid *tgt_uuid;
+	struct obd_device *tgt_obd;
+	static struct obd_uuid lov_osc_uuid = { "LOV_OSC_UUID" };
+	struct obd_import *imp;
+	proc_dir_entry_t *lov_proc_dir;
+	int rc;
+	ENTRY;
+
+	if (!lov->lov_tgts[index])
+		RETURN(-EINVAL);
+
+	tgt_uuid = &lov->lov_tgts[index]->ltd_uuid;
+	tgt_obd = lov->lov_tgts[index]->ltd_obd;
+
+	if (!tgt_obd->obd_set_up) {
+		CERROR("Target %s not set up\n", obd_uuid2str(tgt_uuid));
+		RETURN(-EINVAL);
+	}
+
+	/* override the sp_me from lov */
+	tgt_obd->u.cli.cl_sp_me = lov->lov_sp_me;
+
+	if (data && (data->ocd_connect_flags & OBD_CONNECT_INDEX))
+		data->ocd_index = index;
+
+	/*
+	 * Divine LOV knows that OBDs under it are OSCs.
+	 */
+	imp = tgt_obd->u.cli.cl_import;
+
+	if (activate) {
+		tgt_obd->obd_no_recov = 0;
+		/* FIXME this is probably supposed to be
+		   ptlrpc_set_import_active.  Horrible naming. */
+		ptlrpc_activate_import(imp);
+	}
+
+	rc = obd_register_observer(tgt_obd, obd);
+	if (rc) {
+		CERROR("Target %s register_observer error %d\n",
+		       obd_uuid2str(tgt_uuid), rc);
+		RETURN(rc);
+	}
+
+	/* an invalid import is left unconnected; this is not an error */
+	if (imp->imp_invalid) {
+		CDEBUG(D_CONFIG, "not connecting OSC %s; administratively "
+		       "disabled\n", obd_uuid2str(tgt_uuid));
+		RETURN(0);
+	}
+
+	rc = obd_connect(NULL, &lov->lov_tgts[index]->ltd_exp, tgt_obd,
+			 &lov_osc_uuid, data, NULL);
+	if (rc || !lov->lov_tgts[index]->ltd_exp) {
+		CERROR("Target %s connect error %d\n",
+		       obd_uuid2str(tgt_uuid), rc);
+		RETURN(-ENODEV);
+	}
+
+	lov->lov_tgts[index]->ltd_reap = 0;
+
+	CDEBUG(D_CONFIG, "Connected tgt idx %d %s (%s) %sactive\n", index,
+	       obd_uuid2str(tgt_uuid), tgt_obd->obd_name, activate ? "":"in");
+
+	lov_proc_dir = obd->obd_proc_private;
+	if (lov_proc_dir) {
+		struct obd_device *osc_obd = lov->lov_tgts[index]->ltd_exp->exp_obd;
+		proc_dir_entry_t *osc_symlink;
+
+		LASSERT(osc_obd != NULL);
+		LASSERT(osc_obd->obd_magic == OBD_DEVICE_MAGIC);
+		LASSERT(osc_obd->obd_type->typ_name != NULL);
+
+		osc_symlink = lprocfs_add_symlink(osc_obd->obd_name,
+						  lov_proc_dir,
+						  "../../../%s/%s",
+						  osc_obd->obd_type->typ_name,
+						  osc_obd->obd_name);
+		if (osc_symlink == NULL) {
+			/* message must end in a newline like every other
+			 * CERROR in this file */
+			CERROR("could not register LOV target "
+				"/proc/fs/lustre/%s/%s/target_obds/%s.\n",
+				obd->obd_type->typ_name, obd->obd_name,
+				osc_obd->obd_name);
+			lprocfs_remove(&lov_proc_dir);
+			obd->obd_proc_private = NULL;
+		}
+	}
+
+	RETURN(0);
+}
+
+/*
+ * Connects a client to the LOV device.
+ *
+ * Creates the export with class_connect(), stores the connect data in
+ * lov_ocd and connects every configured target (non-empty uuid) through
+ * lov_connect_obd().  For each target that ended up with an export,
+ * OBD_NOTIFY_CONNECT is sent through lov_notify().  Per-target failures are
+ * logged and do not fail the connect.
+ *
+ * \retval 0 on success, or the error returned by class_connect()
+ */
+static int lov_connect(const struct lu_env *env,
+		       struct obd_export **exp, struct obd_device *obd,
+		       struct obd_uuid *cluuid, struct obd_connect_data *data,
+		       void *localdata)
+{
+	struct lov_obd *lov = &obd->u.lov;
+	struct lustre_handle conn;
+	int idx;
+	int rc;
+	ENTRY;
+
+	CDEBUG(D_CONFIG, "connect #%d\n", lov->lov_connects);
+
+	rc = class_connect(&conn, obd, cluuid);
+	if (rc != 0)
+		RETURN(rc);
+
+	*exp = class_conn2export(&conn);
+
+	/* Why should there ever be more than 1 connect? */
+	lov->lov_connects++;
+	LASSERT(lov->lov_connects == 1);
+
+	memset(&lov->lov_ocd, 0, sizeof(lov->lov_ocd));
+	if (data != NULL)
+		lov->lov_ocd = *data;
+
+	obd_getref(obd);
+	for (idx = 0; idx < lov->desc.ld_tgt_count; idx++) {
+		struct lov_tgt_desc *tgt = lov->lov_tgts[idx];
+
+		if (tgt == NULL || obd_uuid_empty(&tgt->ltd_uuid))
+			continue;
+
+		/* Flags will be lowest common denominator */
+		rc = lov_connect_obd(obd, idx, tgt->ltd_activate,
+				     &lov->lov_ocd);
+		if (rc != 0) {
+			CERROR("%s: lov connect tgt %d failed: %d\n",
+			       obd->obd_name, idx, rc);
+			continue;
+		}
+
+		/* connect to administrative disabled ost */
+		if (lov->lov_tgts[idx]->ltd_exp == NULL)
+			continue;
+
+		rc = lov_notify(obd, lov->lov_tgts[idx]->ltd_exp->exp_obd,
+				OBD_NOTIFY_CONNECT, (void *)&idx);
+		if (rc != 0)
+			CERROR("%s error sending notify %d\n",
+			       obd->obd_name, rc);
+	}
+	obd_putref(obd);
+
+	RETURN(0);
+}
+
+/*
+ * Disconnects the LOV device \a obd from target \a tgt.
+ *
+ * Marks an active target inactive (adjusting the active target count),
+ * removes the target's entry from the LOV proc directory, copies the
+ * obd_force/obd_fail/obd_no_recov flags of \a obd to the target device,
+ * unregisters the observer and calls obd_disconnect() on the target export.
+ * tgt->ltd_exp is NULL afterwards.
+ *
+ * The device obtained from class_exp2obd() is only dereferenced after it
+ * has been checked for NULL.
+ *
+ * \retval 0 always; a failing obd_disconnect() is only logged
+ */
+static int lov_disconnect_obd(struct obd_device *obd, struct lov_tgt_desc *tgt)
+{
+	proc_dir_entry_t *lov_proc_dir;
+	struct lov_obd *lov = &obd->u.lov;
+	struct obd_device *osc_obd;
+	int rc;
+	ENTRY;
+
+	osc_obd = class_exp2obd(tgt->ltd_exp);
+	CDEBUG(D_CONFIG, "%s: disconnecting target %s\n",
+	       obd->obd_name, osc_obd ? osc_obd->obd_name : "NULL");
+
+	if (tgt->ltd_active) {
+		tgt->ltd_active = 0;
+		lov->desc.ld_active_tgt_count--;
+		tgt->ltd_exp->exp_obd->obd_inactive = 1;
+	}
+
+	if (osc_obd) {
+		/* the proc entry is named after the target device */
+		lov_proc_dir = obd->obd_proc_private;
+		if (lov_proc_dir)
+			lprocfs_remove_proc_entry(osc_obd->obd_name,
+						  lov_proc_dir);
+
+		/* Pass it on to our clients.
+		 * XXX This should be an argument to disconnect,
+		 * XXX not a back-door flag on the OBD.  Ah well.
+		 */
+		osc_obd->obd_force = obd->obd_force;
+		osc_obd->obd_fail = obd->obd_fail;
+		osc_obd->obd_no_recov = obd->obd_no_recov;
+	}
+
+	obd_register_observer(osc_obd, NULL);
+
+	rc = obd_disconnect(tgt->ltd_exp);
+	if (rc)
+		CERROR("Target %s disconnect error %d\n",
+		       tgt->ltd_uuid.uuid, rc);
+
+	tgt->ltd_exp = NULL;
+	RETURN(0);
+}
+
+/*
+ * Disconnects export \a exp from the LOV device.
+ *
+ * On the final disconnect (lov_connects drops to 0) every target that still
+ * has an export is passed to lov_del_target(); a non-final disconnect is
+ * only logged.  class_disconnect() is called in every case and its result
+ * returned.
+ */
+static int lov_disconnect(struct obd_export *exp)
+{
+	struct obd_device *obd = class_exp2obd(exp);
+	struct lov_obd *lov = &obd->u.lov;
+	int idx;
+	int rc;
+	ENTRY;
+
+	if (lov->lov_tgts == NULL)
+		goto out;
+
+	/* Only disconnect the underlying layers on the final disconnect. */
+	lov->lov_connects--;
+	if (lov->lov_connects != 0) {
+		/* why should there be more than 1 connect? */
+		CERROR("disconnect #%d\n", lov->lov_connects);
+		goto out;
+	}
+
+	/* Let's hold another reference so lov_del_obd doesn't spin through
+	   putref every time */
+	obd_getref(obd);
+	for (idx = 0; idx < lov->desc.ld_tgt_count; idx++) {
+		struct lov_tgt_desc *tgt = lov->lov_tgts[idx];
+
+		if (tgt == NULL || tgt->ltd_exp == NULL)
+			continue;
+
+		/* Disconnection is the last we know about an obd */
+		lov_del_target(obd, idx, 0, tgt->ltd_gen);
+	}
+	obd_putref(obd);
+
+out:
+	rc = class_disconnect(exp); /* bz 9811 */
+	RETURN(rc);
+}
+
+/*
+ * Applies notification event \a ev to the target whose uuid is \a uuid.
+ *
+ * OBD_NOTIFY_ACTIVATE / OBD_NOTIFY_DEACTIVATE update ltd_activate;
+ * OBD_NOTIFY_ACTIVE / OBD_NOTIFY_INACTIVE update ltd_active, the count of
+ * active targets and the obd_inactive flag of the target device.  Any other
+ * event is reported as an error, but the target index is still returned.
+ *
+ * Error codes:
+ *
+ *  -EINVAL  : UUID can't be found in the LOV's target list
+ *  -ENOTCONN: The UUID is found, but the target connection is bad (!)
+ *  -EBADF   : The UUID is found, but the OBD is the wrong type (!)
+ *  any >= 0 : index of the matching target in lov_tgts
+ *
+ * NOTE(review): the code below only ever returns -EINVAL or an index;
+ * -ENOTCONN and -EBADF are listed above but not produced here.
+ */
+static int lov_set_osc_active(struct obd_device *obd, struct obd_uuid *uuid,
+			      enum obd_notify_event ev)
+{
+	struct lov_obd *lov = &obd->u.lov;
+	struct lov_tgt_desc *tgt;
+	int index, activate, active;
+	ENTRY;
+
+	CDEBUG(D_INFO, "Searching in lov %p for uuid %s event(%d)\n",
+	       lov, uuid->uuid, ev);
+
+	obd_getref(obd);
+	for (index = 0; index < lov->desc.ld_tgt_count; index++) {
+		tgt = lov->lov_tgts[index];
+		if (!tgt)
+			continue;
+		/*
+		 * LU-642, initially inactive OSC could miss the obd_connect,
+		 * we make up for it here.
+		 */
+		if (ev == OBD_NOTIFY_ACTIVATE && tgt->ltd_exp == NULL &&
+		    obd_uuid_equals(uuid, &tgt->ltd_uuid)) {
+			struct obd_uuid lov_osc_uuid = {"LOV_OSC_UUID"};
+
+			/* result not checked: the ltd_exp test just below
+			 * skips the target if no export was set */
+			obd_connect(NULL, &tgt->ltd_exp, tgt->ltd_obd,
+				    &lov_osc_uuid, &lov->lov_ocd, NULL);
+		}
+		if (!tgt->ltd_exp)
+			continue;
+
+		CDEBUG(D_INFO, "lov idx %d is %s conn "LPX64"\n",
+		       index, obd_uuid2str(&tgt->ltd_uuid),
+		       tgt->ltd_exp->exp_handle.h_cookie);
+		if (obd_uuid_equals(uuid, &tgt->ltd_uuid))
+			break;
+	}
+
+	if (index == lov->desc.ld_tgt_count)
+		GOTO(out, index = -EINVAL);
+
+	if (ev == OBD_NOTIFY_DEACTIVATE || ev == OBD_NOTIFY_ACTIVATE) {
+		activate = (ev == OBD_NOTIFY_ACTIVATE) ? 1 : 0;
+
+		if (lov->lov_tgts[index]->ltd_activate == activate) {
+			CDEBUG(D_INFO, "OSC %s already %sactivate!\n",
+			       uuid->uuid, activate ? "" : "de");
+		} else {
+			lov->lov_tgts[index]->ltd_activate = activate;
+			CDEBUG(D_CONFIG, "%sactivate OSC %s\n",
+			       activate ? "" : "de", obd_uuid2str(uuid));
+		}
+
+	} else if (ev == OBD_NOTIFY_INACTIVE || ev == OBD_NOTIFY_ACTIVE) {
+		active = (ev == OBD_NOTIFY_ACTIVE) ? 1 : 0;
+
+		if (lov->lov_tgts[index]->ltd_active == active) {
+			CDEBUG(D_INFO, "OSC %s already %sactive!\n",
+			       uuid->uuid, active ? "" : "in");
+			GOTO(out, index);
+		} else {
+			CDEBUG(D_CONFIG, "Marking OSC %s %sactive\n",
+			       obd_uuid2str(uuid), active ? "" : "in");
+		}
+
+		lov->lov_tgts[index]->ltd_active = active;
+		if (active) {
+			lov->desc.ld_active_tgt_count++;
+			lov->lov_tgts[index]->ltd_exp->exp_obd->obd_inactive = 0;
+		} else {
+			lov->desc.ld_active_tgt_count--;
+			lov->lov_tgts[index]->ltd_exp->exp_obd->obd_inactive = 1;
+		}
+	} else {
+		/* message must end in a newline like every other CERROR */
+		CERROR("Unknown event(%d) for uuid %s\n", ev, uuid->uuid);
+	}
+
+ out:
+	obd_putref(obd);
+	RETURN(index);
+}
+
+/*
+ * Notification entry point of the LOV.
+ *
+ * For the four activation events (ACTIVE, INACTIVE, ACTIVATE, DEACTIVATE)
+ * @watched must be an OSC; the matching target state is updated through
+ * lov_set_osc_active() and the resulting target index replaces @data.
+ * The event is then forwarded with obd_notify_observer(): once for @watched
+ * when it is set, otherwise once per configured target (skipping inactive
+ * targets for the two sync events) with the target index as @data.
+ *
+ * Nothing is done, and 0 is returned, while the LOV has no connections.
+ * lov_notify_lock is held for reading during the whole call.
+ *
+ * Returns 0 or a negative errno.  In the per-target case the result of the
+ * last obd_notify_observer() call is returned; earlier failures are only
+ * logged.
+ */
+static int lov_notify(struct obd_device *obd, struct obd_device *watched,
+		      enum obd_notify_event ev, void *data)
+{
+	int rc = 0;
+	struct lov_obd *lov = &obd->u.lov;
+	ENTRY;
+
+	down_read(&lov->lov_notify_lock);
+	if (!lov->lov_connects) {
+		up_read(&lov->lov_notify_lock);
+		RETURN(rc);
+	}
+
+	if (ev == OBD_NOTIFY_ACTIVE || ev == OBD_NOTIFY_INACTIVE ||
+	    ev == OBD_NOTIFY_ACTIVATE || ev == OBD_NOTIFY_DEACTIVATE) {
+		struct obd_uuid *uuid;
+
+		LASSERT(watched);
+
+		if (strcmp(watched->obd_type->typ_name, LUSTRE_OSC_NAME)) {
+			up_read(&lov->lov_notify_lock);
+			CERROR("unexpected notification of %s %s!\n",
+			       watched->obd_type->typ_name,
+			       watched->obd_name);
+			RETURN(-EINVAL);
+		}
+		uuid = &watched->u.cli.cl_target_uuid;
+
+		/* Set OSC as active before notifying the observer, so the
+		 * observer can use the OSC normally.
+		 */
+		rc = lov_set_osc_active(obd, uuid, ev);
+		if (rc < 0) {
+			up_read(&lov->lov_notify_lock);
+			CERROR("event(%d) of %s failed: %d\n", ev,
+			       obd_uuid2str(uuid), rc);
+			RETURN(rc);
+		}
+		/* active events pass the lov target index (held in rc) as
+		 * data */
+		data = &rc;
+	}
+
+	/* Pass the notification up the chain. */
+	if (watched) {
+		rc = obd_notify_observer(obd, watched, ev, data);
+	} else {
+		/* NULL watched means all osc's in the lov (only for syncs) */
+		/* sync events send the lov target index as data */
+		int i, is_sync;
+
+		data = &i;
+		is_sync = (ev == OBD_NOTIFY_SYNC) ||
+			  (ev == OBD_NOTIFY_SYNC_NONBLOCK);
+
+		obd_getref(obd);
+		for (i = 0; i < lov->desc.ld_tgt_count; i++) {
+			if (!lov->lov_tgts[i])
+				continue;
+
+			/* don't send sync event if target not
+			 * connected/activated */
+			if (is_sync &&  !lov->lov_tgts[i]->ltd_active)
+				continue;
+
+			rc = obd_notify_observer(obd, lov->lov_tgts[i]->ltd_obd,
+						 ev, data);
+			if (rc) {
+				CERROR("%s: notify %s of %s failed %d\n",
+				       obd->obd_name,
+				       obd->obd_observer->obd_name,
+				       lov->lov_tgts[i]->ltd_obd->obd_name,
+				       rc);
+			}
+		}
+		obd_putref(obd);
+	}
+
+	up_read(&lov->lov_notify_lock);
+	RETURN(rc);
+}
+
+/*
+ * Add the OSC identified by @uuidp to the LOV as target number @index.
+ *
+ * Under lov_lock the target array is grown (by doubling) when @index does
+ * not fit, a new lov_tgt_desc is allocated, registered in the packed OST
+ * pool and stored in lov_tgts[@index]; ld_tgt_count is raised if needed.
+ * If the LOV is already connected, the new target is connected right away
+ * and the observer is notified; a failure at that stage schedules the
+ * target for deletion again.
+ *
+ * @gen    must be positive.
+ * @active is stored as the target's ltd_activate flag and passed on to
+ *	   lov_connect_obd().
+ *
+ * Returns 0 on success, -EINVAL for a bad generation, an unknown OSC or an
+ * index too large to size the array for, -EEXIST if the slot is taken,
+ * -ENOMEM on allocation failure, or the error of a later setup step.
+ */
+static int lov_add_target(struct obd_device *obd, struct obd_uuid *uuidp,
+			  __u32 index, int gen, int active)
+{
+	struct lov_obd *lov = &obd->u.lov;
+	struct lov_tgt_desc *tgt;
+	struct obd_device *tgt_obd;
+	int rc;
+	ENTRY;
+
+	CDEBUG(D_CONFIG, "uuid:%s idx:%d gen:%d active:%d\n",
+	       uuidp->uuid, index, gen, active);
+
+	if (gen <= 0) {
+		CERROR("request to add OBD %s with invalid generation: %d\n",
+		       uuidp->uuid, gen);
+		RETURN(-EINVAL);
+	}
+
+	tgt_obd = class_find_client_obd(uuidp, LUSTRE_OSC_NAME,
+					&obd->obd_uuid);
+	if (tgt_obd == NULL)
+		RETURN(-EINVAL);
+
+	mutex_lock(&lov->lov_lock);
+
+	if ((index < lov->lov_tgt_size) && (lov->lov_tgts[index] != NULL)) {
+		tgt = lov->lov_tgts[index];
+		CERROR("UUID %s already assigned at LOV target index %d\n",
+		       obd_uuid2str(&tgt->ltd_uuid), index);
+		mutex_unlock(&lov->lov_lock);
+		RETURN(-EEXIST);
+	}
+
+	if (index >= lov->lov_tgt_size) {
+		/* We need to reallocate the lov target array. */
+		struct lov_tgt_desc **newtgts, **old = NULL;
+		__u32 newsize, oldsize = 0;
+
+		newsize = max(lov->lov_tgt_size, (__u32)2);
+		while (newsize <= index) {
+			/* Doubling once more would wrap the 32-bit size;
+			 * the loop would then never end or leave the array
+			 * too small for @index. */
+			if (newsize > ((__u32)~0 >> 1)) {
+				CERROR("LOV target index %u is too large\n",
+				       index);
+				mutex_unlock(&lov->lov_lock);
+				RETURN(-EINVAL);
+			}
+			newsize = newsize << 1;
+		}
+		OBD_ALLOC(newtgts, sizeof(*newtgts) * newsize);
+		if (newtgts == NULL) {
+			mutex_unlock(&lov->lov_lock);
+			RETURN(-ENOMEM);
+		}
+
+		if (lov->lov_tgt_size) {
+			memcpy(newtgts, lov->lov_tgts, sizeof(*newtgts) *
+			       lov->lov_tgt_size);
+			old = lov->lov_tgts;
+			oldsize = lov->lov_tgt_size;
+		}
+
+		lov->lov_tgts = newtgts;
+		lov->lov_tgt_size = newsize;
+		/* NOTE(review): a read barrier after publishing the new
+		 * array looks unusual; confirm which barrier is intended. */
+		smp_rmb();
+		if (old)
+			OBD_FREE(old, sizeof(*old) * oldsize);
+
+		CDEBUG(D_CONFIG, "tgts: %p size: %d\n",
+		       lov->lov_tgts, lov->lov_tgt_size);
+	}
+
+	OBD_ALLOC_PTR(tgt);
+	if (!tgt) {
+		mutex_unlock(&lov->lov_lock);
+		RETURN(-ENOMEM);
+	}
+
+	rc = lov_ost_pool_add(&lov->lov_packed, index, lov->lov_tgt_size);
+	if (rc) {
+		mutex_unlock(&lov->lov_lock);
+		OBD_FREE_PTR(tgt);
+		RETURN(rc);
+	}
+
+	tgt->ltd_uuid = *uuidp;
+	tgt->ltd_obd = tgt_obd;
+	/* XXX - add a sanity check on the generation number. */
+	tgt->ltd_gen = gen;
+	tgt->ltd_index = index;
+	tgt->ltd_activate = active;
+	lov->lov_tgts[index] = tgt;
+	if (index >= lov->desc.ld_tgt_count)
+		lov->desc.ld_tgt_count = index + 1;
+
+	mutex_unlock(&lov->lov_lock);
+
+	CDEBUG(D_CONFIG, "idx=%d ltd_gen=%d ld_tgt_count=%d\n",
+		index, tgt->ltd_gen, lov->desc.ld_tgt_count);
+
+	/* The result of this notification is not acted upon: rc is either
+	 * discarded by the early return or overwritten below. */
+	rc = obd_notify(obd, tgt_obd, OBD_NOTIFY_CREATE, &index);
+
+	if (lov->lov_connects == 0) {
+		/* lov_connect hasn't been called yet. We'll do the
+		   lov_connect_obd on this target when that fn first runs,
+		   because we don't know the connect flags yet. */
+		RETURN(0);
+	}
+
+	obd_getref(obd);
+
+	rc = lov_connect_obd(obd, index, active, &lov->lov_ocd);
+	if (rc)
+		GOTO(out, rc);
+
+	/* connect to administrative disabled ost */
+	if (!tgt->ltd_exp)
+		GOTO(out, rc = 0);
+
+	if (lov->lov_cache != NULL) {
+		rc = obd_set_info_async(NULL, tgt->ltd_exp,
+				sizeof(KEY_CACHE_SET), KEY_CACHE_SET,
+				sizeof(struct cl_client_cache), lov->lov_cache,
+				NULL);
+		if (rc < 0)
+			GOTO(out, rc);
+	}
+
+	rc = lov_notify(obd, tgt->ltd_exp->exp_obd,
+			active ? OBD_NOTIFY_CONNECT : OBD_NOTIFY_INACTIVE,
+			(void *)&index);
+
+out:
+	if (rc) {
+		CERROR("add failed (%d), deleting %s\n", rc,
+		       obd_uuid2str(&tgt->ltd_uuid));
+		lov_del_target(obd, index, 0, 0);
+	}
+	obd_putref(obd);
+	RETURN(rc);
+}
+
+/*
+ * Schedule the target at @index for deletion.
+ *
+ * The target is only flagged (ltd_reap) and counted in lov_death_row here;
+ * the write side of lov_notify_lock is taken so that no lov_notify() runs
+ * concurrently.  When @uuidp is given it must match the UUID stored for the
+ * target.  @gen is not examined by this function.
+ *
+ * Returns 0 on success, -EINVAL if @index is out of range, the slot is
+ * empty, or the UUID does not match.
+ */
+int lov_del_target(struct obd_device *obd, __u32 index,
+		   struct obd_uuid *uuidp, int gen)
+{
+	struct lov_obd *lov = &obd->u.lov;
+	struct lov_tgt_desc *victim;
+	int tgt_count = lov->desc.ld_tgt_count;
+	int rc = 0;
+	ENTRY;
+
+	if (index >= tgt_count) {
+		CERROR("LOV target index %d >= number of LOV OBDs %d.\n",
+		       index, tgt_count);
+		RETURN(-EINVAL);
+	}
+
+	/* to make sure there's no ongoing lov_notify() now */
+	down_write(&lov->lov_notify_lock);
+	obd_getref(obd);
+
+	victim = lov->lov_tgts[index];
+	if (victim == NULL) {
+		CERROR("LOV target at index %d is not setup.\n", index);
+		GOTO(out, rc = -EINVAL);
+	}
+
+	if (uuidp != NULL && !obd_uuid_equals(uuidp, &victim->ltd_uuid)) {
+		CERROR("LOV target UUID %s at index %d doesn't match %s.\n",
+		       lov_uuid2str(lov, index), index,
+		       obd_uuid2str(uuidp));
+		GOTO(out, rc = -EINVAL);
+	}
+
+	CDEBUG(D_CONFIG, "uuid: %s idx: %d gen: %d exp: %p active: %d\n",
+	       lov_uuid2str(lov, index), index,
+	       victim->ltd_gen, victim->ltd_exp, victim->ltd_active);
+
+	/* Only mark it here; the actual removal happens from obd_putref */
+	victim->ltd_reap = 1;
+	lov->lov_death_row++;
+out:
+	obd_putref(obd);
+	up_write(&lov->lov_notify_lock);
+
+	RETURN(rc);
+}
+
+/*
+ * Tear down one target that has been marked for reaping: disconnect its
+ * export if it has one, free the descriptor, and manually clean up the
+ * underlying OSC device.
+ *
+ * @tgt must be non-NULL and have ltd_reap set (both asserted).
+ */
+static void __lov_del_obd(struct obd_device *obd, struct lov_tgt_desc *tgt)
+{
+	struct obd_device *osc_obd;
+
+	LASSERT(tgt);
+	LASSERT(tgt->ltd_reap);
+
+	/* Looked up before the disconnect and free below, as tgt is gone
+	 * by the time the OSC is cleaned up */
+	osc_obd = class_exp2obd(tgt->ltd_exp);
+
+	CDEBUG(D_CONFIG, "Removing tgt %s : %s\n",
+	       tgt->ltd_uuid.uuid,
+	       osc_obd ? osc_obd->obd_name : "<no obd>");
+
+	if (tgt->ltd_exp)
+		lov_disconnect_obd(obd, tgt);
+
+	OBD_FREE_PTR(tgt);
+
+	/* Manual cleanup - no cleanup logs to clean up the osc's.  We must
+	   do it ourselves. And we can't do it from lov_cleanup,
+	   because we just lost our only reference to it. */
+	if (osc_obd)
+		class_manual_cleanup(osc_obd);
+}
+
+/*
+ * Sanitize a default stripe size in place.
+ *
+ * Values below LOV_MIN_STRIPE_SIZE are replaced by LOV_DEFAULT_STRIPE_SIZE
+ * (with a console message unless the value was 0).  Larger values that are
+ * not a multiple of LOV_MIN_STRIPE_SIZE are rounded down to one, with a
+ * console warning.
+ */
+void lov_fix_desc_stripe_size(__u64 *val)
+{
+	if (*val < LOV_MIN_STRIPE_SIZE) {
+		/* 0 is silently replaced by the default */
+		if (*val != 0)
+			LCONSOLE_INFO("Increasing default stripe size to "
+				      "minimum %u\n",
+				      LOV_DEFAULT_STRIPE_SIZE);
+		*val = LOV_DEFAULT_STRIPE_SIZE;
+		return;
+	}
+
+	/* Already a multiple of the minimum: nothing to fix */
+	if ((*val & (LOV_MIN_STRIPE_SIZE - 1)) == 0)
+		return;
+
+	*val &= ~(LOV_MIN_STRIPE_SIZE - 1);
+	LCONSOLE_WARN("Changing default stripe size to "LPU64" (a "
+		      "multiple of %u)\n",
+		      *val, LOV_MIN_STRIPE_SIZE);
+}
+
+/* Sanitize a default stripe count in place: 0 becomes 1. */
+void lov_fix_desc_stripe_count(__u32 *val)
+{
+	if (*val != 0)
+		return;
+
+	*val = 1;
+}
+
+/*
+ * Sanitize a stripe pattern in place: anything other than 0 or
+ * LOV_PATTERN_RAID0 is reported on the console and reset to 0.
+ */
+void lov_fix_desc_pattern(__u32 *val)
+{
+	/* from lov_setstripe */
+	if (*val == 0 || *val == LOV_PATTERN_RAID0)
+		return;
+
+	LCONSOLE_WARN("Unknown stripe pattern: %#x\n", *val);
+	*val = 0;
+}
+
+/* Sanitize qos_maxage in place: 0 is replaced by QOS_DEFAULT_MAXAGE. */
+void lov_fix_desc_qos_maxage(__u32 *val)
+{
+	if (*val != 0)
+		return;
+
+	*val = QOS_DEFAULT_MAXAGE;
+}
+
+/*
+ * Sanitize the tunable fields of a LOV descriptor in place: default stripe
+ * size, default stripe count, pattern and qos_maxage, each through its own
+ * lov_fix_desc_*() helper.
+ */
+void lov_fix_desc(struct lov_desc *desc)
+{
+	lov_fix_desc_stripe_size(&desc->ld_default_stripe_size);
+	lov_fix_desc_stripe_count(&desc->ld_default_stripe_count);
+	lov_fix_desc_pattern(&desc->ld_pattern);
+	lov_fix_desc_qos_maxage(&desc->ld_qos_maxage);
+}
+
+/*
+ * Set up a LOV device from its configuration record.
+ *
+ * Buffer 1 of @lcfg must hold a struct lov_desc; it is byte-swapped if its
+ * magic is the swabbed LOV_DESC_MAGIC, sanitized with lov_fix_desc() and
+ * copied into the LOV with an active target count of 0.  The locks, the
+ * pools hash, the pool list, the packed OST pool and the procfs entries are
+ * then initialized.
+ *
+ * Returns 0 on success, -EINVAL for a missing, short or bad-magic
+ * descriptor, or the error from lov_ost_pool_init().
+ */
+int lov_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
+{
+	struct lprocfs_static_vars lvars = { 0 };
+	struct lov_desc *desc;
+	struct lov_obd *lov = &obd->u.lov;
+	int rc;
+	ENTRY;
+
+	if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1) {
+		CERROR("LOV setup requires a descriptor\n");
+		RETURN(-EINVAL);
+	}
+
+	desc = (struct lov_desc *)lustre_cfg_buf(lcfg, 1);
+
+	if (sizeof(*desc) > LUSTRE_CFG_BUFLEN(lcfg, 1)) {
+		CERROR("descriptor size wrong: %d > %d\n",
+		       (int)sizeof(*desc), LUSTRE_CFG_BUFLEN(lcfg, 1));
+		RETURN(-EINVAL);
+	}
+
+	if (desc->ld_magic != LOV_DESC_MAGIC) {
+		if (desc->ld_magic == __swab32(LOV_DESC_MAGIC)) {
+			CDEBUG(D_OTHER, "%s: Swabbing lov desc %p\n",
+			       obd->obd_name, desc);
+			lustre_swab_lov_desc(desc);
+		} else {
+			CERROR("%s: Bad lov desc magic: %#x\n",
+			       obd->obd_name, desc->ld_magic);
+			RETURN(-EINVAL);
+		}
+	}
+
+	lov_fix_desc(desc);
+
+	desc->ld_active_tgt_count = 0;
+	lov->desc = *desc;
+	lov->lov_tgt_size = 0;
+
+	mutex_init(&lov->lov_lock);
+	atomic_set(&lov->lov_refcount, 0);
+	lov->lov_sp_me = LUSTRE_SP_CLI;
+
+	init_rwsem(&lov->lov_notify_lock);
+
+	lov->lov_pools_hash_body = cfs_hash_create("POOLS", HASH_POOLS_CUR_BITS,
+						   HASH_POOLS_MAX_BITS,
+						   HASH_POOLS_BKT_BITS, 0,
+						   CFS_HASH_MIN_THETA,
+						   CFS_HASH_MAX_THETA,
+						   &pool_hash_operations,
+						   CFS_HASH_DEFAULT);
+	INIT_LIST_HEAD(&lov->lov_pool_list);
+	lov->lov_pool_count = 0;
+	rc = lov_ost_pool_init(&lov->lov_packed, 0);
+	if (rc)
+		GOTO(out, rc);
+
+	lprocfs_lov_init_vars(&lvars);
+	lprocfs_obd_setup(obd, lvars.obd_vars);
+#ifdef LPROCFS
+	/* A missing target_obd file is not fatal; setup still returns 0 */
+	rc = lprocfs_seq_create(obd->obd_proc_entry, "target_obd",
+				0444, &lov_proc_target_fops, obd);
+	if (rc)
+		CWARN("Error adding the target_obd file\n");
+#endif
+	lov->lov_pool_proc_entry = lprocfs_register("pools",
+						    obd->obd_proc_entry,
+						    NULL, NULL);
+
+	RETURN(0);
+
+out:
+	/* Drop the pools hash created above so a failed setup does not
+	 * leak it.
+	 * NOTE(review): assumes lov_cleanup() is not run after a failed
+	 * setup - confirm against the obdclass setup path. */
+	if (lov->lov_pools_hash_body != NULL) {
+		cfs_hash_putref(lov->lov_pools_hash_body);
+		lov->lov_pools_hash_body = NULL;
+	}
+	RETURN(rc);
+}
+
+/*
+ * Pre-cleanup hook of the LOV.
+ *
+ * OBD_CLEANUP_EARLY:   forward the early cleanup to the OBD behind every
+ *			active target.
+ * OBD_CLEANUP_EXPORTS: finish the llog subsystem of this device.
+ * Any other stage is a no-op.
+ *
+ * Returns 0, or the error from obd_llog_finish().
+ */
+static int lov_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage)
+{
+	struct lov_obd *lov = &obd->u.lov;
+	int rc = 0;
+
+	ENTRY;
+
+	if (stage == OBD_CLEANUP_EARLY) {
+		int idx;
+
+		for (idx = 0; idx < lov->desc.ld_tgt_count; idx++) {
+			struct lov_tgt_desc *tgt = lov->lov_tgts[idx];
+
+			/* Only configured, active targets are forwarded to */
+			if (tgt && tgt->ltd_active)
+				obd_precleanup(class_exp2obd(tgt->ltd_exp),
+					       OBD_CLEANUP_EARLY);
+		}
+	} else if (stage == OBD_CLEANUP_EXPORTS) {
+		rc = obd_llog_finish(obd, 0);
+		if (rc != 0)
+			CERROR("failed to cleanup llogging subsystems\n");
+	}
+
+	RETURN(rc);
+}
+
+/*
+ * Final cleanup of a LOV device: delete every pool, release the pools hash
+ * and the packed OST pool, remove the procfs entries, schedule any target
+ * that is still configured for deletion and free the target array.
+ *
+ * Always returns 0.
+ */
+static int lov_cleanup(struct obd_device *obd)
+{
+	struct lov_obd *lov = &obd->u.lov;
+	struct list_head *cur, *next;
+	ENTRY;
+
+	list_for_each_safe(cur, next, &lov->lov_pool_list) {
+		struct pool_desc *pool;
+
+		pool = list_entry(cur, struct pool_desc, pool_list);
+		/* free pool structs */
+		CDEBUG(D_INFO, "delete pool %p\n", pool);
+		/* In the function below, .hs_keycmp resolves to
+		 * pool_hashkey_keycmp() */
+		/* coverity[overrun-buffer-val] */
+		lov_pool_del(obd, pool->pool_name);
+	}
+	cfs_hash_putref(lov->lov_pools_hash_body);
+	lov_ost_pool_free(&lov->lov_packed);
+
+	lprocfs_obd_cleanup(obd);
+
+	if (lov->lov_tgts == NULL)
+		RETURN(0);
+
+	obd_getref(obd);
+	{
+		int idx;
+
+		for (idx = 0; idx < lov->desc.ld_tgt_count; idx++) {
+			if (lov->lov_tgts[idx] == NULL)
+				continue;
+
+			/* Inactive targets may never have connected.
+			 * We should never get here for anything else -
+			 * these should have been removed in the
+			 * disconnect. */
+			if (lov->lov_tgts[idx]->ltd_active ||
+			    atomic_read(&lov->lov_refcount)) {
+				CERROR("lov tgt %d not cleaned!"
+				       " deathrow=%d, lovrc=%d\n",
+				       idx, lov->lov_death_row,
+				       atomic_read(&lov->lov_refcount));
+			}
+
+			/* Scheduled for deletion whether or not it was
+			 * reported above */
+			lov_del_target(obd, idx, 0, 0);
+		}
+	}
+	obd_putref(obd);
+
+	OBD_FREE(lov->lov_tgts, sizeof(*lov->lov_tgts) *
+		 lov->lov_tgt_size);
+	lov->lov_tgt_size = 0;
+
+	RETURN(0);
+}
+
+/*
+ * Process one configuration record addressed to the LOV.
+ *
+ * LCFG_LOV_ADD_OBD / LCFG_LOV_ADD_INA / LCFG_LOV_DEL_OBD: buffer 1 is the
+ *	target UUID, buffers 2 and 3 are the target index and generation as
+ *	decimal strings.  The parsed values are stored through @indexp and
+ *	@genp and the target is added (active or inactive) or scheduled for
+ *	deletion.
+ * LCFG_PARAM: handed to class_process_proc_param(); a positive result is
+ *	turned into 0.
+ * LCFG_POOL_*: accepted, nothing is done here.
+ *
+ * Returns 0 on success, -EINVAL for malformed or unknown records, or the
+ * error of the add/delete/param handler.
+ */
+int lov_process_config_base(struct obd_device *obd, struct lustre_cfg *lcfg,
+			    __u32 *indexp, int *genp)
+{
+	struct obd_uuid obd_uuid;
+	int cmd;
+	int rc = 0;
+	ENTRY;
+
+	switch(cmd = lcfg->lcfg_command) {
+	case LCFG_LOV_ADD_OBD:
+	case LCFG_LOV_ADD_INA:
+	case LCFG_LOV_DEL_OBD: {
+		__u32 index;
+		int gen;
+		/* lov_modify_tgts add  0:lov_mdsA  1:ost1_UUID  2:0  3:1 */
+		if (LUSTRE_CFG_BUFLEN(lcfg, 1) > sizeof(obd_uuid.uuid))
+			GOTO(out, rc = -EINVAL);
+
+		obd_str2uuid(&obd_uuid,  lustre_cfg_buf(lcfg, 1));
+
+		if (sscanf(lustre_cfg_buf(lcfg, 2), "%d", indexp) != 1)
+			GOTO(out, rc = -EINVAL);
+		if (sscanf(lustre_cfg_buf(lcfg, 3), "%d", genp) != 1)
+			GOTO(out, rc = -EINVAL);
+		index = *indexp;
+		gen = *genp;
+		if (cmd == LCFG_LOV_ADD_OBD)
+			rc = lov_add_target(obd, &obd_uuid, index, gen, 1);
+		else if (cmd == LCFG_LOV_ADD_INA)
+			rc = lov_add_target(obd, &obd_uuid, index, gen, 0);
+		else
+			rc = lov_del_target(obd, index, &obd_uuid, gen);
+		GOTO(out, rc);
+	}
+	case LCFG_PARAM: {
+		struct lprocfs_static_vars lvars = { 0 };
+
+		lprocfs_lov_init_vars(&lvars);
+
+		rc = class_process_proc_param(PARAM_LOV, lvars.obd_vars,
+					      lcfg, obd);
+		/* Positive results are not errors for the caller */
+		if (rc > 0)
+			rc = 0;
+		GOTO(out, rc);
+	}
+	case LCFG_POOL_NEW:
+	case LCFG_POOL_ADD:
+	case LCFG_POOL_DEL:
+	case LCFG_POOL_REM:
+		GOTO(out, rc);
+
+	default: {
+		CERROR("Unknown command: %d\n", lcfg->lcfg_command);
+		GOTO(out, rc = -EINVAL);
+
+	}
+	}
+out:
+	RETURN(rc);
+}
+
+/*
+ * Recreate one specific object on the OST whose LOV index is carried in
+ * src_oa->o_nlink.
+ *
+ * *@ea must be a striping descriptor that has a stripe on that OST whose
+ * object id equals the id in @src_oa; otherwise -EINVAL is returned.  The
+ * create itself is forwarded to the target's export with obd_create().
+ *
+ * Returns the obd_create() result, -ENOMEM, or -EINVAL.
+ */
+static int lov_recreate(struct obd_export *exp, struct obdo *src_oa,
+			struct lov_stripe_md **ea, struct obd_trans_info *oti)
+{
+	struct lov_obd *lov = &exp->exp_obd->u.lov;
+	struct lov_stripe_md *scratch_md, *lsm;
+	unsigned ost_idx;
+	int stripe, rc;
+	ENTRY;
+
+	LASSERT(src_oa->o_valid & OBD_MD_FLFLAGS &&
+		src_oa->o_flags & OBD_FL_RECREATE_OBJS);
+
+	/* Scratch descriptor handed to obd_create(); released at "out" */
+	OBD_ALLOC(scratch_md, sizeof(*scratch_md));
+	if (scratch_md == NULL)
+		RETURN(-ENOMEM);
+
+	ost_idx = src_oa->o_nlink;
+	lsm = *ea;
+	if (lsm == NULL)
+		GOTO(out, rc = -EINVAL);
+	if (ost_idx >= lov->desc.ld_tgt_count ||
+	    !lov->lov_tgts[ost_idx])
+		GOTO(out, rc = -EINVAL);
+
+	/* Find the first stripe that lives on the requested OST */
+	for (stripe = 0; stripe < lsm->lsm_stripe_count; stripe++)
+		if (lsm->lsm_oinfo[stripe]->loi_ost_idx == ost_idx)
+			break;
+
+	if (stripe == lsm->lsm_stripe_count)
+		GOTO(out, rc = -EINVAL);
+
+	/* ... and make sure it is the object we were asked to recreate */
+	if (ostid_id(&lsm->lsm_oinfo[stripe]->loi_oi) !=
+	    ostid_id(&src_oa->o_oi))
+		GOTO(out, rc = -EINVAL);
+
+	rc = obd_create(NULL, lov->lov_tgts[ost_idx]->ltd_exp,
+			src_oa, &scratch_md, oti);
+out:
+	OBD_FREE(scratch_md, sizeof(*scratch_md));
+	RETURN(rc);
+}
+
+/*
+ * Create entry point of the LOV; the LOV expects oa->o_id to be set to the
+ * LOV object id.
+ *
+ * Only the "recreate a specific object" request (OBD_FL_RECREATE_OBJS) is
+ * acted upon, through lov_recreate(); any other request returns 0 without
+ * doing anything.  An orphan-deletion request (OBD_FL_DELORPHAN) must not
+ * reach the LOV any more and triggers LBUG().
+ *
+ * Returns -EINVAL without an export, -EIO when no target is active,
+ * otherwise 0 or the lov_recreate() result.
+ */
+static int lov_create(const struct lu_env *env, struct obd_export *exp,
+		      struct obdo *src_oa, struct lov_stripe_md **ea,
+		      struct obd_trans_info *oti)
+{
+	struct obd_device *obd;
+	int flags_valid;
+	int rc = 0;
+	ENTRY;
+
+	LASSERT(ea != NULL);
+	if (exp == NULL)
+		RETURN(-EINVAL);
+
+	flags_valid = (src_oa->o_valid & OBD_MD_FLFLAGS) != 0;
+
+	/* should not be used with LOV anymore */
+	if (flags_valid && src_oa->o_flags == OBD_FL_DELORPHAN)
+		LBUG();
+
+	obd = exp->exp_obd;
+	if (obd->u.lov.desc.ld_active_tgt_count == 0)
+		RETURN(-EIO);
+
+	obd_getref(obd);
+	/* Recreate a specific object id at the given OST index */
+	if (flags_valid && (src_oa->o_flags & OBD_FL_RECREATE_OBJS))
+		rc = lov_recreate(exp, src_oa, ea, oti);
+	obd_putref(obd);
+
+	RETURN(rc);
+}
+
+/*
+ * Assert that @lsmp is a non-NULL striping descriptor whose lsm_magic is
+ * LOV_MAGIC_V1 or LOV_MAGIC_V3.  @lsmp is evaluated more than once.
+ */
+#define ASSERT_LSM_MAGIC(lsmp)						  \
+do {									    \
+	LASSERT((lsmp) != NULL);						\
+	LASSERTF(((lsmp)->lsm_magic == LOV_MAGIC_V1 ||			  \
+		 (lsmp)->lsm_magic == LOV_MAGIC_V3),			    \
+		 "%p->lsm_magic=%x\n", (lsmp), (lsmp)->lsm_magic);	      \
+} while (0)
+
+/*
+ * Destroy the objects of a striped file.
+ *
+ * A request set is prepared with lov_prep_destroy_set(); obd_destroy() is
+ * then called for every request in the set on the export of the target at
+ * req->rq_idx.  Failures are logged and the first one is remembered, but
+ * the remaining requests are still issued.  When everything succeeded the
+ * lsm_destroy method for the descriptor's magic is called as well.
+ *
+ * When OBD_MD_FLCOOKIE is set in @oa, @oti and its oti_logcookies must be
+ * set (asserted); oti_logcookies is repointed per stripe.
+ *
+ * Returns -ENODEV without a usable export, otherwise the first error seen,
+ * or the result of lov_fini_destroy_set().
+ */
+static int lov_destroy(const struct lu_env *env, struct obd_export *exp,
+		       struct obdo *oa, struct lov_stripe_md *lsm,
+		       struct obd_trans_info *oti, struct obd_export *md_exp,
+		       void *capa)
+{
+	struct lov_request_set *set;
+	struct obd_info oinfo;
+	struct lov_request *req;
+	struct list_head *pos;
+	struct lov_obd *lov;
+	int rc = 0, err = 0;
+	ENTRY;
+
+	ASSERT_LSM_MAGIC(lsm);
+
+	if (!exp || !exp->exp_obd)
+		RETURN(-ENODEV);
+
+	if (oa->o_valid & OBD_MD_FLCOOKIE) {
+		LASSERT(oti);
+		LASSERT(oti->oti_logcookies);
+	}
+
+	lov = &exp->exp_obd->u.lov;
+	obd_getref(exp->exp_obd);
+	rc = lov_prep_destroy_set(exp, &oinfo, oa, lsm, oti, &set);
+	if (rc)
+		GOTO(out, rc);
+
+	list_for_each (pos, &set->set_list) {
+		req = list_entry(pos, struct lov_request, rq_link);
+
+		/* Point the cookie at the slot of the stripe being destroyed */
+		if (oa->o_valid & OBD_MD_FLCOOKIE)
+			oti->oti_logcookies = set->set_cookies + req->rq_stripe;
+
+		err = obd_destroy(env, lov->lov_tgts[req->rq_idx]->ltd_exp,
+				  req->rq_oi.oi_oa, NULL, oti, NULL, capa);
+		err = lov_update_common_set(set, req, err);
+		if (err) {
+			CERROR("%s: destroying objid "DOSTID" subobj "
+			       DOSTID" on OST idx %d: rc = %d\n",
+			       exp->exp_obd->obd_name, POSTID(&oa->o_oi),
+			       POSTID(&req->rq_oi.oi_oa->o_oi),
+			       req->rq_idx, err);
+			/* Keep going, but remember the first failure */
+			if (!rc)
+				rc = err;
+		}
+	}
+
+	if (rc == 0) {
+		LASSERT(lsm_op_find(lsm->lsm_magic) != NULL);
+		rc = lsm_op_find(lsm->lsm_magic)->lsm_destroy(lsm, oa, md_exp);
+	}
+	err = lov_fini_destroy_set(set);
+out:
+	obd_putref(exp->exp_obd);
+	RETURN(rc ? rc : err);
+}
+
+/*
+ * Synchronous getattr on a striped object.
+ *
+ * A request set is prepared with lov_prep_getattr_set(); obd_getattr() is
+ * called for each request on the export of the target at req->rq_idx and
+ * its result is recorded with lov_update_common_set().  The loop stops at
+ * the first request for which lov_update_common_set() reports an error.
+ *
+ * Returns -ENODEV without a usable export, the prepare error, the error
+ * that stopped the loop, or the result of lov_fini_getattr_set().
+ */
+static int lov_getattr(const struct lu_env *env, struct obd_export *exp,
+		       struct obd_info *oinfo)
+{
+	struct lov_request_set *set;
+	struct lov_request *req;
+	struct list_head *pos;
+	struct lov_obd *lov;
+	int err = 0, rc = 0;
+	ENTRY;
+
+	LASSERT(oinfo);
+	ASSERT_LSM_MAGIC(oinfo->oi_md);
+
+	if (!exp || !exp->exp_obd)
+		RETURN(-ENODEV);
+
+	lov = &exp->exp_obd->u.lov;
+
+	rc = lov_prep_getattr_set(exp, oinfo, &set);
+	if (rc)
+		RETURN(rc);
+
+	list_for_each (pos, &set->set_list) {
+		req = list_entry(pos, struct lov_request, rq_link);
+
+		CDEBUG(D_INFO, "objid "DOSTID"[%d] has subobj "DOSTID" at idx"
+		       " %u\n", POSTID(&oinfo->oi_oa->o_oi), req->rq_stripe,
+		       POSTID(&req->rq_oi.oi_oa->o_oi), req->rq_idx);
+
+		rc = obd_getattr(env, lov->lov_tgts[req->rq_idx]->ltd_exp,
+				 &req->rq_oi);
+		err = lov_update_common_set(set, req, rc);
+		if (err) {
+			CERROR("%s: getattr objid "DOSTID" subobj "
+			       DOSTID" on OST idx %d: rc = %d\n",
+			       exp->exp_obd->obd_name,
+			       POSTID(&oinfo->oi_oa->o_oi),
+			       POSTID(&req->rq_oi.oi_oa->o_oi),
+			       req->rq_idx, err);
+			break;
+		}
+	}
+
+	/* The set is always finished; a loop error takes precedence */
+	rc = lov_fini_getattr_set(set);
+	if (err)
+		rc = err;
+	RETURN(rc);
+}
+
+/*
+ * Completion callback of an asynchronous getattr request set; @data is the
+ * lov_request_set stored as set_arg by lov_getattr_async().
+ *
+ * Returns @rc when it is non-zero, otherwise the result of
+ * lov_fini_getattr_set().
+ */
+static int lov_getattr_interpret(struct ptlrpc_request_set *rqset,
+				 void *data, int rc)
+{
+	struct lov_request_set *getattr_set = data;
+	int fini_rc;
+	ENTRY;
+
+	/* don't do attribute merge if this async op failed */
+	if (rc != 0)
+		atomic_set(&getattr_set->set_completes, 0);
+
+	fini_rc = lov_fini_getattr_set(getattr_set);
+	RETURN(rc != 0 ? rc : fini_rc);
+}
+
+/*
+ * Asynchronous getattr on a striped object.
+ *
+ * A request set is prepared with lov_prep_getattr_set() and
+ * obd_getattr_async() is called for each request, adding to @rqset.  If
+ * @rqset then holds requests, lov_getattr_interpret() is installed as its
+ * interpret callback (with the lov set as argument) and finishes the set
+ * later.  Otherwise, or on the first failure, the set is finished here.
+ *
+ * Returns -ENODEV without a usable export, the prepare error, the first
+ * obd_getattr_async() error, or the result of lov_fini_getattr_set().
+ */
+static int lov_getattr_async(struct obd_export *exp, struct obd_info *oinfo,
+			      struct ptlrpc_request_set *rqset)
+{
+	struct lov_request_set *lovset;
+	struct lov_obd *lov;
+	struct list_head *pos;
+	struct lov_request *req;
+	int rc = 0, err;
+	ENTRY;
+
+	LASSERT(oinfo);
+	ASSERT_LSM_MAGIC(oinfo->oi_md);
+
+	if (!exp || !exp->exp_obd)
+		RETURN(-ENODEV);
+
+	lov = &exp->exp_obd->u.lov;
+
+	rc = lov_prep_getattr_set(exp, oinfo, &lovset);
+	if (rc)
+		RETURN(rc);
+
+	CDEBUG(D_INFO, "objid "DOSTID": %ux%u byte stripes\n",
+	       POSTID(&oinfo->oi_md->lsm_oi), oinfo->oi_md->lsm_stripe_count,
+	       oinfo->oi_md->lsm_stripe_size);
+
+	list_for_each(pos, &lovset->set_list) {
+		req = list_entry(pos, struct lov_request, rq_link);
+
+		CDEBUG(D_INFO, "objid "DOSTID"[%d] has subobj "DOSTID" at idx"
+		       " %u\n", POSTID(&oinfo->oi_oa->o_oi), req->rq_stripe,
+		       POSTID(&req->rq_oi.oi_oa->o_oi), req->rq_idx);
+		rc = obd_getattr_async(lov->lov_tgts[req->rq_idx]->ltd_exp,
+				       &req->rq_oi, rqset);
+		if (rc) {
+			CERROR("%s: getattr objid "DOSTID" subobj "
+			       DOSTID" on OST idx %d: rc = %d\n",
+			       exp->exp_obd->obd_name,
+			       POSTID(&oinfo->oi_oa->o_oi),
+			       POSTID(&req->rq_oi.oi_oa->o_oi),
+			       req->rq_idx, rc);
+			GOTO(out, rc);
+		}
+	}
+
+	/* Requests are in flight: the interpret callback finishes the set */
+	if (!list_empty(&rqset->set_requests)) {
+		LASSERT(rc == 0);
+		LASSERT (rqset->set_interpret == NULL);
+		rqset->set_interpret = lov_getattr_interpret;
+		rqset->set_arg = (void *)lovset;
+		RETURN(rc);
+	}
+out:
+	/* don't do attribute merge if a request failed */
+	if (rc)
+		atomic_set(&lovset->set_completes, 0);
+	err = lov_fini_getattr_set(lovset);
+	RETURN(rc ? rc : err);
+}
+
+/*
+ * Synchronous setattr on a striped object.
+ *
+ * Only the attribute bits listed in the assertion below may be set in
+ * oinfo->oi_oa->o_valid.  A request set is prepared with
+ * lov_prep_setattr_set(); obd_setattr() is called for every request on the
+ * export of the target at req->rq_idx and the result is recorded with
+ * lov_update_setattr_set().  All requests are issued even if one fails.
+ *
+ * Returns -ENODEV without a usable export, the prepare error, or the value
+ * left in rc after the loop, falling back to the result of
+ * lov_fini_setattr_set() when that is 0.
+ */
+static int lov_setattr(const struct lu_env *env, struct obd_export *exp,
+		       struct obd_info *oinfo, struct obd_trans_info *oti)
+{
+	struct lov_request_set *set;
+	struct lov_obd *lov;
+	struct list_head *pos;
+	struct lov_request *req;
+	int err = 0, rc = 0;
+	ENTRY;
+
+	LASSERT(oinfo);
+	ASSERT_LSM_MAGIC(oinfo->oi_md);
+
+	if (!exp || !exp->exp_obd)
+		RETURN(-ENODEV);
+
+	/* for now, we only expect the following updates here */
+	LASSERT(!(oinfo->oi_oa->o_valid & ~(OBD_MD_FLID | OBD_MD_FLTYPE |
+					    OBD_MD_FLMODE | OBD_MD_FLATIME |
+					    OBD_MD_FLMTIME | OBD_MD_FLCTIME |
+					    OBD_MD_FLFLAGS | OBD_MD_FLSIZE |
+					    OBD_MD_FLGROUP | OBD_MD_FLUID |
+					    OBD_MD_FLGID | OBD_MD_FLFID |
+					    OBD_MD_FLGENER)));
+	lov = &exp->exp_obd->u.lov;
+	rc = lov_prep_setattr_set(exp, oinfo, oti, &set);
+	if (rc)
+		RETURN(rc);
+
+	list_for_each (pos, &set->set_list) {
+		req = list_entry(pos, struct lov_request, rq_link);
+
+		/* NOTE(review): rc is overwritten on every iteration, so an
+		 * error from an earlier request is lost if a later one
+		 * succeeds - confirm this is intended. */
+		rc = obd_setattr(env, lov->lov_tgts[req->rq_idx]->ltd_exp,
+				 &req->rq_oi, NULL);
+		err = lov_update_setattr_set(set, req, rc);
+		if (err) {
+			CERROR("%s: setattr objid "DOSTID" subobj "
+			       DOSTID" on OST idx %d: rc = %d\n",
+			       exp->exp_obd->obd_name,
+			       POSTID(&set->set_oi->oi_oa->o_oi),
+			       POSTID(&req->rq_oi.oi_oa->o_oi), req->rq_idx,
+			       err);
+			if (!rc)
+				rc = err;
+		}
+	}
+	err = lov_fini_setattr_set(set);
+	if (!rc)
+		rc = err;
+	RETURN(rc);
+}
+
+/*
+ * Completion callback of an asynchronous setattr request set; @data is the
+ * lov_request_set stored as set_arg by lov_setattr_async().
+ *
+ * Returns @rc when it is non-zero, otherwise the result of
+ * lov_fini_setattr_set().
+ */
+static int lov_setattr_interpret(struct ptlrpc_request_set *rqset,
+				 void *data, int rc)
+{
+	struct lov_request_set *setattr_set = data;
+	int fini_rc;
+	ENTRY;
+
+	/* A failed set must not count any of its requests as completed */
+	if (rc != 0)
+		atomic_set(&setattr_set->set_completes, 0);
+
+	fini_rc = lov_fini_setattr_set(setattr_set);
+	RETURN(rc != 0 ? rc : fini_rc);
+}
+
+/*
+ * Asynchronous setattr on a striped object.
+ *
+ * If @oti is given, the request goes from MDS and responses from OSTs are not
+ * needed. Otherwise, a client is waiting for responses.
+ *
+ * A request set is prepared with lov_prep_setattr_set() and
+ * obd_setattr_async() is called for each request; the loop stops at the
+ * first failure.  When OBD_MD_FLCOOKIE is set, @oti and its oti_logcookies
+ * must be set (asserted) and oti_logcookies is repointed per stripe.
+ *
+ * If nothing has to be waited for (an error occurred, @rqset is NULL or it
+ * holds no requests) the set is finished here; otherwise
+ * lov_setattr_interpret() is installed on @rqset and finishes it later.
+ *
+ * Returns -ENODEV without a usable export, the prepare error, the first
+ * obd_setattr_async() error, the result of lov_fini_setattr_set(), or 0
+ * when the requests are left in flight.
+ */
+static int lov_setattr_async(struct obd_export *exp, struct obd_info *oinfo,
+			     struct obd_trans_info *oti,
+			     struct ptlrpc_request_set *rqset)
+{
+	struct lov_request_set *set;
+	struct lov_request *req;
+	struct list_head *pos;
+	struct lov_obd *lov;
+	int rc = 0;
+	ENTRY;
+
+	LASSERT(oinfo);
+	ASSERT_LSM_MAGIC(oinfo->oi_md);
+	if (oinfo->oi_oa->o_valid & OBD_MD_FLCOOKIE) {
+		LASSERT(oti);
+		LASSERT(oti->oti_logcookies);
+	}
+
+	if (!exp || !exp->exp_obd)
+		RETURN(-ENODEV);
+
+	lov = &exp->exp_obd->u.lov;
+	rc = lov_prep_setattr_set(exp, oinfo, oti, &set);
+	if (rc)
+		RETURN(rc);
+
+	CDEBUG(D_INFO, "objid "DOSTID": %ux%u byte stripes\n",
+	       POSTID(&oinfo->oi_md->lsm_oi),
+	       oinfo->oi_md->lsm_stripe_count,
+	       oinfo->oi_md->lsm_stripe_size);
+
+	list_for_each(pos, &set->set_list) {
+		req = list_entry(pos, struct lov_request, rq_link);
+
+		/* Point the cookie at the slot of the stripe being updated */
+		if (oinfo->oi_oa->o_valid & OBD_MD_FLCOOKIE)
+			oti->oti_logcookies = set->set_cookies + req->rq_stripe;
+
+		CDEBUG(D_INFO, "objid "DOSTID"[%d] has subobj "DOSTID" at idx"
+		       " %u\n", POSTID(&oinfo->oi_oa->o_oi), req->rq_stripe,
+		       POSTID(&req->rq_oi.oi_oa->o_oi), req->rq_idx);
+
+		rc = obd_setattr_async(lov->lov_tgts[req->rq_idx]->ltd_exp,
+				       &req->rq_oi, oti, rqset);
+		if (rc) {
+			CERROR("error: setattr objid "DOSTID" subobj "
+			       DOSTID" on OST idx %d: rc = %d\n",
+			       POSTID(&set->set_oi->oi_oa->o_oi),
+			       POSTID(&req->rq_oi.oi_oa->o_oi),
+			       req->rq_idx, rc);
+			break;
+		}
+	}
+
+	/* If we are not waiting for responses on async requests, return. */
+	if (rc || !rqset || list_empty(&rqset->set_requests)) {
+		int err;
+		if (rc)
+			atomic_set(&set->set_completes, 0);
+		err = lov_fini_setattr_set(set);
+		RETURN(rc ? rc : err);
+	}
+
+	LASSERT(rqset->set_interpret == NULL);
+	rqset->set_interpret = lov_setattr_interpret;
+	rqset->set_arg = (void *)set;
+
+	RETURN(0);
+}
+
+/*
+ * Completion callback of a punch request set; @data is the lov_request_set
+ * stored as set_arg by lov_punch().
+ *
+ * Returns @rc when it is non-zero, otherwise the result of
+ * lov_fini_punch_set().
+ */
+static int lov_punch_interpret(struct ptlrpc_request_set *rqset,
+			       void *data, int rc)
+{
+	struct lov_request_set *punch_set = data;
+	int fini_rc;
+	ENTRY;
+
+	/* A failed set must not count any of its requests as completed */
+	if (rc != 0)
+		atomic_set(&punch_set->set_completes, 0);
+
+	fini_rc = lov_fini_punch_set(punch_set);
+	RETURN(rc != 0 ? rc : fini_rc);
+}
+
+/*
+ * Punch (truncate) a striped object.
+ *
+ * A request set is prepared with lov_prep_punch_set() and obd_punch() is
+ * called for each request on the export of the target at req->rq_idx,
+ * adding to @rqset; the loop stops at the first failure.  If an error
+ * occurred or @rqset holds no requests the set is finished here, otherwise
+ * lov_punch_interpret() is installed on @rqset and finishes it later.
+ *
+ * Returns -ENODEV without a usable export, the prepare error, the first
+ * obd_punch() error, the result of lov_fini_punch_set(), or 0 when the
+ * requests are left in flight.
+ *
+ * FIXME: maybe we'll just make one node the authoritative attribute node, then
+ * we can send this 'punch' to just the authoritative node and the nodes
+ * that the punch will affect. */
+static int lov_punch(const struct lu_env *env, struct obd_export *exp,
+		     struct obd_info *oinfo, struct obd_trans_info *oti,
+		     struct ptlrpc_request_set *rqset)
+{
+	struct lov_request_set *set;
+	struct lov_obd *lov;
+	struct list_head *pos;
+	struct lov_request *req;
+	int rc = 0;
+	ENTRY;
+
+	LASSERT(oinfo);
+	ASSERT_LSM_MAGIC(oinfo->oi_md);
+
+	if (!exp || !exp->exp_obd)
+		RETURN(-ENODEV);
+
+	lov = &exp->exp_obd->u.lov;
+	rc = lov_prep_punch_set(exp, oinfo, oti, &set);
+	if (rc)
+		RETURN(rc);
+
+	list_for_each (pos, &set->set_list) {
+		req = list_entry(pos, struct lov_request, rq_link);
+
+		rc = obd_punch(env, lov->lov_tgts[req->rq_idx]->ltd_exp,
+			       &req->rq_oi, NULL, rqset);
+		if (rc) {
+			CERROR("%s: punch objid "DOSTID" subobj "DOSTID
+			       " on OST idx %d: rc = %d\n",
+			       exp->exp_obd->obd_name,
+			       POSTID(&set->set_oi->oi_oa->o_oi),
+			       POSTID(&req->rq_oi.oi_oa->o_oi), req->rq_idx, rc);
+			break;
+		}
+	}
+
+	/* NOTE(review): rqset is dereferenced without a NULL check here -
+	 * confirm callers always pass a request set. */
+	if (rc || list_empty(&rqset->set_requests)) {
+		int err;
+		err = lov_fini_punch_set(set);
+		RETURN(rc ? rc : err);
+	}
+
+	LASSERT(rqset->set_interpret == NULL);
+	rqset->set_interpret = lov_punch_interpret;
+	rqset->set_arg = (void *)set;
+
+	RETURN(0);
+}
+
+/*
+ * Completion callback of a sync request set; @data is the lov_request_set
+ * stored as set_arg by lov_sync().
+ *
+ * Returns @rc when it is non-zero, otherwise the result of
+ * lov_fini_sync_set().
+ */
+static int lov_sync_interpret(struct ptlrpc_request_set *rqset,
+			      void *data, int rc)
+{
+	struct lov_request_set *sync_set = data;
+	int fini_rc;
+	ENTRY;
+
+	/* A failed set must not count any of its requests as completed */
+	if (rc != 0)
+		atomic_set(&sync_set->set_completes, 0);
+
+	fini_rc = lov_fini_sync_set(sync_set);
+	RETURN(rc != 0 ? rc : fini_rc);
+}
+
+/*
+ * Sync the byte range [@start, @end] of a striped object.
+ *
+ * A request set is prepared with lov_prep_sync_set() and obd_sync() is
+ * called for each request with the extent stored in that request's
+ * oi_policy, adding to @rqset (which must not be NULL); the loop stops at
+ * the first failure.  If an error occurred or @rqset holds no requests the
+ * set is finished here, otherwise lov_sync_interpret() is installed on
+ * @rqset and finishes it later.
+ *
+ * Returns -ENODEV when the export has no device, the prepare error, the
+ * first obd_sync() error, the result of lov_fini_sync_set(), or 0 when the
+ * requests are left in flight.
+ */
+static int lov_sync(const struct lu_env *env, struct obd_export *exp,
+		    struct obd_info *oinfo, obd_off start, obd_off end,
+		    struct ptlrpc_request_set *rqset)
+{
+	struct lov_request_set *set = NULL;
+	struct lov_obd *lov;
+	struct list_head *pos;
+	struct lov_request *req;
+	int rc = 0;
+	ENTRY;
+
+	ASSERT_LSM_MAGIC(oinfo->oi_md);
+	LASSERT(rqset != NULL);
+
+	/* NOTE(review): exp itself is dereferenced without a NULL check */
+	if (!exp->exp_obd)
+		RETURN(-ENODEV);
+
+	lov = &exp->exp_obd->u.lov;
+	rc = lov_prep_sync_set(exp, oinfo, start, end, &set);
+	if (rc)
+		RETURN(rc);
+
+	CDEBUG(D_INFO, "fsync objid "DOSTID" ["LPX64", "LPX64"]\n",
+	       POSTID(&set->set_oi->oi_oa->o_oi), start, end);
+
+	list_for_each (pos, &set->set_list) {
+		req = list_entry(pos, struct lov_request, rq_link);
+
+		rc = obd_sync(env, lov->lov_tgts[req->rq_idx]->ltd_exp,
+			      &req->rq_oi, req->rq_oi.oi_policy.l_extent.start,
+			      req->rq_oi.oi_policy.l_extent.end, rqset);
+		if (rc) {
+			CERROR("%s: fsync objid "DOSTID" subobj "DOSTID
+			       " on OST idx %d: rc = %d\n",
+			       exp->exp_obd->obd_name,
+			       POSTID(&set->set_oi->oi_oa->o_oi),
+			       POSTID(&req->rq_oi.oi_oa->o_oi), req->rq_idx,
+			       rc);
+			break;
+		}
+	}
+
+	/* If we are not waiting for responses on async requests, return. */
+	if (rc || list_empty(&rqset->set_requests)) {
+		int err = lov_fini_sync_set(set);
+
+		RETURN(rc ?: err);
+	}
+
+	LASSERT(rqset->set_interpret == NULL);
+	rqset->set_interpret = lov_sync_interpret;
+	rqset->set_arg = (void *)set;
+
+	RETURN(0);
+}
+
+/*
+ * Check whether a bulk I/O described by @pga has a chance to succeed.
+ *
+ * For every page the stripe holding its offset and the OST of that stripe
+ * are looked up; -EIO is returned as soon as such an OST is missing or
+ * inactive, otherwise the check is forwarded to that OST with
+ * obd_brw(OBD_BRW_CHECK) for the single page.
+ *
+ * Returns 0 if all checks passed, -EIO for an unusable target, or the
+ * first non-zero obd_brw() result.
+ */
+static int lov_brw_check(struct lov_obd *lov, struct obd_info *lov_oinfo,
+			 obd_count oa_bufs, struct brw_page *pga)
+{
+	struct obd_info oinfo = { { { 0 } } };
+	int i, rc = 0;
+
+	oinfo.oi_oa = lov_oinfo->oi_oa;
+
+	/* The caller just wants to know if there's a chance that this
+	 * I/O can succeed */
+	for (i = 0; i < oa_bufs; i++) {
+		int stripe = lov_stripe_number(lov_oinfo->oi_md, pga[i].off);
+		int ost = lov_oinfo->oi_md->lsm_oinfo[stripe]->loi_ost_idx;
+		obd_off start, end;
+
+		/* Intersect the page with the stripe that holds it; the
+		 * page index is not a stripe number.
+		 * NOTE(review): assumes the second argument of
+		 * lov_stripe_intersects() is a stripe number - confirm
+		 * against its definition. */
+		if (!lov_stripe_intersects(lov_oinfo->oi_md, stripe,
+					   pga[i].off,
+					   pga[i].off + pga[i].count - 1,
+					   &start, &end))
+			continue;
+
+		if (!lov->lov_tgts[ost] || !lov->lov_tgts[ost]->ltd_active) {
+			CDEBUG(D_HA, "lov idx %d inactive\n", ost);
+			return -EIO;
+		}
+
+		rc = obd_brw(OBD_BRW_CHECK, lov->lov_tgts[ost]->ltd_exp, &oinfo,
+			     1, &pga[i], NULL);
+		if (rc)
+			break;
+	}
+	return rc;
+}
+
+/*
+ * Synchronous bulk read/write on a striped object.
+ *
+ * OBD_BRW_CHECK is answered by lov_brw_check().  For any other command a
+ * request set is prepared with lov_prep_brw_set() and obd_brw() is called
+ * for each request with that request's slice of the set's page array; the
+ * loop stops at the first failure.
+ *
+ * Returns the check result, the prepare error, the first obd_brw() error,
+ * or the result of lov_fini_brw_set().
+ */
+static int lov_brw(int cmd, struct obd_export *exp, struct obd_info *oinfo,
+		   obd_count oa_bufs, struct brw_page *pga,
+		   struct obd_trans_info *oti)
+{
+	struct lov_obd *lov = &exp->exp_obd->u.lov;
+	struct lov_request_set *brw_set;
+	struct list_head *cursor;
+	int fini_rc, rc = 0;
+	ENTRY;
+
+	ASSERT_LSM_MAGIC(oinfo->oi_md);
+
+	if (cmd == OBD_BRW_CHECK) {
+		rc = lov_brw_check(lov, oinfo, oa_bufs, pga);
+		RETURN(rc);
+	}
+
+	rc = lov_prep_brw_set(exp, oinfo, oa_bufs, pga, oti, &brw_set);
+	if (rc != 0)
+		RETURN(rc);
+
+	list_for_each(cursor, &brw_set->set_list) {
+		struct lov_request *req;
+
+		req = list_entry(cursor, struct lov_request, rq_link);
+
+		/* Each request covers rq_oabufs pages starting at rq_pgaidx */
+		rc = obd_brw(cmd, lov->lov_tgts[req->rq_idx]->ltd_exp,
+			     &req->rq_oi, req->rq_oabufs,
+			     brw_set->set_pga + req->rq_pgaidx, oti);
+		if (rc != 0)
+			break;
+		lov_update_common_set(brw_set, req, rc);
+	}
+
+	fini_rc = lov_fini_brw_set(brw_set);
+	if (rc == 0)
+		rc = fini_rc;
+	RETURN(rc);
+}
+
+/*
+ * Completion callback of an enqueue request set; @data is the
+ * lov_request_set stored as set_arg by lov_enqueue().
+ *
+ * Returns the result of lov_fini_enqueue_set(), which is given the lock
+ * mode recorded in the set, the incoming @rc and @rqset.
+ */
+static int lov_enqueue_interpret(struct ptlrpc_request_set *rqset,
+				 void *data, int rc)
+{
+	struct lov_request_set *enq_set = data;
+	ENTRY;
+
+	rc = lov_fini_enqueue_set(enq_set, enq_set->set_ei->ei_mode, rc,
+				  rqset);
+	RETURN(rc);
+}
+
+/*
+ * Enqueue a lock on every sub-request of the prepared enqueue set.
+ *
+ * If requests were queued on \a rqset, lov_enqueue_interpret() is installed
+ * as its interpret callback and the set is finalised there; otherwise (or on
+ * the first obd_enqueue() failure) the set is finalised before returning.
+ *
+ * Returns -ENODEV when \a exp has no device, otherwise the result of
+ * preparation, of the failing sub-enqueue path, or of the finalisation.
+ */
+static int lov_enqueue(struct obd_export *exp, struct obd_info *oinfo,
+		       struct ldlm_enqueue_info *einfo,
+		       struct ptlrpc_request_set *rqset)
+{
+	ldlm_mode_t lock_mode = einfo->ei_mode;
+	struct lov_request_set *enq_set;
+	struct lov_request *sub;
+	struct lov_obd *lov;
+	ldlm_error_t rc;
+	ENTRY;
+
+	LASSERT(oinfo);
+	ASSERT_LSM_MAGIC(oinfo->oi_md);
+	/* x == (x & -x) holds only when at most one bit of x is set */
+	LASSERT(lock_mode == (lock_mode & -lock_mode));
+
+	/* lock replay must never come through this path */
+	LASSERT((oinfo->oi_flags & LDLM_FL_REPLAY) == 0);
+
+	if (exp == NULL || exp->exp_obd == NULL)
+		RETURN(-ENODEV);
+
+	lov = &exp->exp_obd->u.lov;
+	rc = lov_prep_enqueue_set(exp, oinfo, einfo, &enq_set);
+	if (rc != 0)
+		RETURN(rc);
+
+	list_for_each_entry(sub, &enq_set->set_list, rq_link) {
+		rc = obd_enqueue(lov->lov_tgts[sub->rq_idx]->ltd_exp,
+				 &sub->rq_oi, einfo, rqset);
+		if (rc != ELDLM_OK)
+			GOTO(out, rc);
+	}
+
+	if (rqset != NULL && !list_empty(&rqset->set_requests)) {
+		/* Requests are in flight: completion is handled by the
+		 * interpret callback. */
+		LASSERT(rc == 0);
+		LASSERT(rqset->set_interpret == NULL);
+		rqset->set_interpret = lov_enqueue_interpret;
+		rqset->set_arg = (void *)enq_set;
+		RETURN(rc);
+	}
+out:
+	rc = lov_fini_enqueue_set(enq_set, lock_mode, rc, rqset);
+	RETURN(rc);
+}
+
+/*
+ * Call obd_change_cbdata() on the target of every stripe of \a lsm, passing
+ * a one-object stripe descriptor built from the stripe's object id.
+ * Stripes whose target slot is empty are skipped.
+ *
+ * Returns -ENODEV when \a exp has no device, otherwise the result of the
+ * last obd_change_cbdata() call (0 if none was made).
+ */
+static int lov_change_cbdata(struct obd_export *exp,
+			     struct lov_stripe_md *lsm, ldlm_iterator_t it,
+			     void *data)
+{
+	struct lov_obd *lov;
+	int stripe;
+	int rc = 0;
+	ENTRY;
+
+	ASSERT_LSM_MAGIC(lsm);
+
+	if (exp == NULL || exp->exp_obd == NULL)
+		RETURN(-ENODEV);
+
+	lov = &exp->exp_obd->u.lov;
+	for (stripe = 0; stripe < lsm->lsm_stripe_count; stripe++) {
+		struct lov_oinfo *loi = lsm->lsm_oinfo[stripe];
+		struct lov_tgt_desc *tgt = lov->lov_tgts[loi->loi_ost_idx];
+		struct lov_stripe_md submd;
+
+		if (tgt == NULL) {
+			CDEBUG(D_HA, "lov idx %d NULL \n", loi->loi_ost_idx);
+			continue;
+		}
+
+		submd.lsm_oi = loi->loi_oi;
+		submd.lsm_stripe_count = 0;
+		rc = obd_change_cbdata(tgt->ltd_exp, &submd, it, data);
+	}
+	RETURN(rc);
+}
+
+/*
+ * Look for any ldlm lock of the inode on the targets of \a lsm.
+ *
+ * The search stops at the first stripe for which obd_find_cbdata() returns
+ * non-zero.  Stripes whose target slot is empty are skipped.
+ *
+ * Return values:
+ *      0    nothing found
+ *      1    a lock was found
+ *    < 0    error (-ENODEV when \a exp has no device)
+ */
+static int lov_find_cbdata(struct obd_export *exp,
+			   struct lov_stripe_md *lsm, ldlm_iterator_t it,
+			   void *data)
+{
+	struct lov_obd *lov;
+	int stripe;
+	int rc = 0;
+	ENTRY;
+
+	ASSERT_LSM_MAGIC(lsm);
+
+	if (exp == NULL || exp->exp_obd == NULL)
+		RETURN(-ENODEV);
+
+	lov = &exp->exp_obd->u.lov;
+	for (stripe = 0; stripe < lsm->lsm_stripe_count; stripe++) {
+		struct lov_oinfo *loi = lsm->lsm_oinfo[stripe];
+		struct lov_tgt_desc *tgt = lov->lov_tgts[loi->loi_ost_idx];
+		struct lov_stripe_md submd;
+
+		if (tgt == NULL) {
+			CDEBUG(D_HA, "lov idx %d NULL \n", loi->loi_ost_idx);
+			continue;
+		}
+
+		submd.lsm_oi = loi->loi_oi;
+		submd.lsm_stripe_count = 0;
+		rc = obd_find_cbdata(tgt->ltd_exp, &submd, it, data);
+		if (rc != 0)
+			break;
+	}
+	RETURN(rc);
+}
+
+/*
+ * Cancel the per-stripe locks that make up the LOV lock \a lockh.
+ *
+ * Every sub-request of the cancel set is processed even if an earlier one
+ * failed; each failure is logged.
+ *
+ * Returns -ENODEV when \a exp has no device, the preparation error if the
+ * cancel set could not be built, otherwise the status of the last failing
+ * sub-request (0 when all succeeded).
+ */
+static int lov_cancel(struct obd_export *exp, struct lov_stripe_md *lsm,
+		      __u32 mode, struct lustre_handle *lockh)
+{
+	struct lov_request_set *cancel_set;
+	struct lov_request *sub;
+	struct obd_info oinfo;
+	struct lov_obd *lov;
+	int last_err = 0;
+	int rc;
+	ENTRY;
+
+	ASSERT_LSM_MAGIC(lsm);
+
+	if (exp == NULL || exp->exp_obd == NULL)
+		RETURN(-ENODEV);
+
+	LASSERT(lockh);
+	lov = &exp->exp_obd->u.lov;
+	rc = lov_prep_cancel_set(exp, &oinfo, lsm, mode, lockh, &cancel_set);
+	if (rc != 0)
+		RETURN(rc);
+
+	list_for_each_entry(sub, &cancel_set->set_list, rq_link) {
+		struct lustre_handle *sub_lockh;
+
+		sub_lockh = cancel_set->set_lockh->llh_handles + sub->rq_stripe;
+
+		rc = obd_cancel(lov->lov_tgts[sub->rq_idx]->ltd_exp,
+				sub->rq_oi.oi_md, mode, sub_lockh);
+		rc = lov_update_common_set(cancel_set, sub, rc);
+		if (rc == 0)
+			continue;
+
+		CERROR("%s: cancel objid "DOSTID" subobj "
+		       DOSTID" on OST idx %d: rc = %d\n",
+		       exp->exp_obd->obd_name, POSTID(&lsm->lsm_oi),
+		       POSTID(&sub->rq_oi.oi_md->lsm_oi),
+		       sub->rq_idx, rc);
+		last_err = rc;
+	}
+	lov_fini_cancel_set(cancel_set);
+	RETURN(last_err);
+}
+
+/*
+ * Cancel unused locks.
+ *
+ * With \a lsm == NULL the request is forwarded to every target that has an
+ * export.  Otherwise it is forwarded once per stripe of \a lsm, using a
+ * one-object stripe descriptor; failures on inactive targets are ignored.
+ *
+ * Returns -ENODEV when \a exp has no device, otherwise the first error that
+ * was taken into account (0 if none).
+ */
+static int lov_cancel_unused(struct obd_export *exp,
+			     struct lov_stripe_md *lsm,
+			     ldlm_cancel_flags_t flags, void *opaque)
+{
+	struct lov_obd *lov;
+	int first_err = 0;
+	int i;
+	ENTRY;
+
+	if (exp == NULL || exp->exp_obd == NULL)
+		RETURN(-ENODEV);
+
+	lov = &exp->exp_obd->u.lov;
+
+	if (lsm == NULL) {
+		for (i = 0; i < lov->desc.ld_tgt_count; i++) {
+			struct lov_tgt_desc *tgt = lov->lov_tgts[i];
+			int err;
+
+			if (tgt == NULL || tgt->ltd_exp == NULL)
+				continue;
+
+			err = obd_cancel_unused(tgt->ltd_exp, NULL,
+						flags, opaque);
+			if (first_err == 0)
+				first_err = err;
+		}
+		RETURN(first_err);
+	}
+
+	ASSERT_LSM_MAGIC(lsm);
+
+	for (i = 0; i < lsm->lsm_stripe_count; i++) {
+		struct lov_oinfo *loi = lsm->lsm_oinfo[i];
+		int idx = loi->loi_ost_idx;
+		struct lov_tgt_desc *tgt = lov->lov_tgts[idx];
+		struct lov_stripe_md submd;
+		int err;
+
+		if (tgt == NULL) {
+			CDEBUG(D_HA, "lov idx %d NULL\n", idx);
+			continue;
+		}
+
+		/* An inactive target is only noted; the cancel is still
+		 * attempted on it. */
+		if (!tgt->ltd_active)
+			CDEBUG(D_HA, "lov idx %d inactive\n", idx);
+
+		submd.lsm_oi = loi->loi_oi;
+		submd.lsm_stripe_count = 0;
+		err = obd_cancel_unused(tgt->ltd_exp, &submd, flags, opaque);
+		if (err == 0 || !tgt->ltd_active)
+			continue;
+
+		CERROR("%s: cancel unused objid "DOSTID
+		       " subobj "DOSTID" on OST idx %d: rc = %d\n",
+		       exp->exp_obd->obd_name, POSTID(&lsm->lsm_oi),
+		       POSTID(&loi->loi_oi), idx, err);
+		if (first_err == 0)
+			first_err = err;
+	}
+	RETURN(first_err);
+}
+
+/*
+ * Request-set interpret callback for asynchronous statfs: finalises the LOV
+ * statfs set passed in \a data.  When \a rc reports a failure the set's
+ * completion counter is reset to zero first.
+ *
+ * Returns \a rc if it is non-zero, otherwise the finalisation result.
+ */
+int lov_statfs_interpret(struct ptlrpc_request_set *rqset, void *data, int rc)
+{
+	struct lov_request_set *sfs_set = data;
+	int fini_rc;
+	ENTRY;
+
+	if (rc != 0)
+		atomic_set(&sfs_set->set_completes, 0);
+
+	fini_rc = lov_fini_statfs_set(sfs_set);
+	RETURN(rc ? rc : fini_rc);
+}
+
+/*
+ * Queue a statfs request on \a rqset for every sub-request of the prepared
+ * statfs set.
+ *
+ * If all sub-requests were issued and \a rqset holds requests,
+ * lov_statfs_interpret() is installed as the interpret callback and 0 is
+ * returned.  Otherwise the set is finalised here and the first error (or
+ * the finalisation result) is returned.
+ */
+static int lov_statfs_async(struct obd_export *exp, struct obd_info *oinfo,
+			    __u64 max_age, struct ptlrpc_request_set *rqset)
+{
+	struct obd_device *obd = class_exp2obd(exp);
+	struct lov_request_set *sfs_set;
+	struct lov_request *sub;
+	struct lov_obd *lov;
+	int fini_rc;
+	int rc;
+	ENTRY;
+
+	LASSERT(oinfo != NULL);
+	LASSERT(oinfo->oi_osfs != NULL);
+
+	lov = &obd->u.lov;
+	rc = lov_prep_statfs_set(obd, oinfo, &sfs_set);
+	if (rc != 0)
+		RETURN(rc);
+
+	list_for_each_entry(sub, &sfs_set->set_list, rq_link) {
+		rc = obd_statfs_async(lov->lov_tgts[sub->rq_idx]->ltd_exp,
+				      &sub->rq_oi, max_age, rqset);
+		if (rc != 0)
+			break;
+	}
+
+	if (rc == 0 && !list_empty(&rqset->set_requests)) {
+		/* Requests are in flight: finish in the callback. */
+		LASSERT(rqset->set_interpret == NULL);
+		rqset->set_interpret = lov_statfs_interpret;
+		rqset->set_arg = (void *)sfs_set;
+		RETURN(0);
+	}
+
+	if (rc != 0)
+		atomic_set(&sfs_set->set_completes, 0);
+	fini_rc = lov_fini_statfs_set(sfs_set);
+	RETURN(rc ? rc : fini_rc);
+}
+
+/*
+ * Synchronous statfs built on the asynchronous path: a private request set
+ * is created, filled by lov_statfs_async(), waited for and destroyed.
+ *
+ * Returns -ENOMEM if the request set cannot be allocated, otherwise the
+ * result of lov_statfs_async() or, when that succeeded, of the wait.
+ */
+static int lov_statfs(const struct lu_env *env, struct obd_export *exp,
+		      struct obd_statfs *osfs, __u64 max_age, __u32 flags)
+{
+	struct obd_info oinfo = { { { 0 } } };
+	struct ptlrpc_request_set *rqset;
+	int rc;
+	ENTRY;
+
+	/* for obdclass we forbid using obd_statfs_rqset, but prefer using async
+	 * statfs requests */
+	rqset = ptlrpc_prep_set();
+	if (rqset == NULL)
+		RETURN(-ENOMEM);
+
+	oinfo.oi_osfs = osfs;
+	oinfo.oi_flags = flags;
+
+	rc = lov_statfs_async(exp, &oinfo, max_age, rqset);
+	if (rc == 0)
+		rc = ptlrpc_set_wait(rqset);
+
+	ptlrpc_set_destroy(rqset);
+	RETURN(rc);
+}
+
+/*
+ * ioctl dispatcher of the LOV device.
+ *
+ * A handful of commands are handled here (per-target statfs, LOV
+ * configuration dump, stripe get/set, per-target quotactl); every other
+ * command is forwarded to each target that has an export.
+ *
+ * \param cmd  ioctl command
+ * \param exp  LOV export
+ * \param len  length passed on to the handlers (overwritten for
+ *             OBD_IOC_LOV_GET_CONFIG)
+ * \param karg command argument, interpreted per command
+ * \param uarg second argument, interpreted per command
+ *
+ * \retval 0 on success, negative errno otherwise
+ */
+static int lov_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
+			 void *karg, void *uarg)
+{
+	struct obd_device *obddev = class_exp2obd(exp);
+	struct lov_obd *lov = &obddev->u.lov;
+	int i = 0, rc = 0, count = lov->desc.ld_tgt_count;
+	struct obd_uuid *uuidp;
+	ENTRY;
+
+	switch (cmd) {
+	case IOC_OBD_STATFS: {
+		/* statfs of the single target whose index is stored in
+		 * ioc_inlbuf2; UUID goes to ioc_pbuf2, statfs to ioc_pbuf1 */
+		struct obd_ioctl_data *data = karg;
+		struct obd_device *osc_obd;
+		struct obd_statfs stat_buf = {0};
+		__u32 index;
+		__u32 flags;
+
+		memcpy(&index, data->ioc_inlbuf2, sizeof(__u32));
+		if ((index >= count))
+			RETURN(-ENODEV);
+
+		if (!lov->lov_tgts[index])
+			/* Try again with the next index */
+			RETURN(-EAGAIN);
+		if (!lov->lov_tgts[index]->ltd_active)
+			RETURN(-ENODATA);
+
+		osc_obd = class_exp2obd(lov->lov_tgts[index]->ltd_exp);
+		if (!osc_obd)
+			RETURN(-EINVAL);
+
+		/* copy UUID */
+		if (copy_to_user(data->ioc_pbuf2, obd2cli_tgt(osc_obd),
+				     min((int) data->ioc_plen2,
+					 (int) sizeof(struct obd_uuid))))
+			RETURN(-EFAULT);
+
+		/* NOTE(review): uarg is dereferenced directly, so it is
+		 * presumably a kernel pointer for this command - confirm
+		 * against the callers. */
+		flags = uarg ? *(__u32*)uarg : 0;
+		/* got statfs data */
+		rc = obd_statfs(NULL, lov->lov_tgts[index]->ltd_exp, &stat_buf,
+				cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+				flags);
+		if (rc)
+			RETURN(rc);
+		if (copy_to_user(data->ioc_pbuf1, &stat_buf,
+				     min((int) data->ioc_plen1,
+					 (int) sizeof(stat_buf))))
+			RETURN(-EFAULT);
+		break;
+	}
+	case OBD_IOC_LOV_GET_CONFIG: {
+		/* Copy the LOV descriptor plus one UUID and one generation
+		 * number per target slot into the caller's inline buffers. */
+		struct obd_ioctl_data *data;
+		struct lov_desc *desc;
+		char *buf = NULL;
+		__u32 *genp;
+
+		len = 0;
+		if (obd_ioctl_getdata(&buf, &len, (void *)uarg))
+			RETURN(-EINVAL);
+
+		data = (struct obd_ioctl_data *)buf;
+
+		/* each inline buffer must be large enough for what is
+		 * written into it below; buf is released on every exit */
+		if (sizeof(*desc) > data->ioc_inllen1) {
+			obd_ioctl_freedata(buf, len);
+			RETURN(-EINVAL);
+		}
+
+		if (sizeof(uuidp->uuid) * count > data->ioc_inllen2) {
+			obd_ioctl_freedata(buf, len);
+			RETURN(-EINVAL);
+		}
+
+		if (sizeof(__u32) * count > data->ioc_inllen3) {
+			obd_ioctl_freedata(buf, len);
+			RETURN(-EINVAL);
+		}
+
+		desc = (struct lov_desc *)data->ioc_inlbuf1;
+		memcpy(desc, &(lov->desc), sizeof(*desc));
+
+		uuidp = (struct obd_uuid *)data->ioc_inlbuf2;
+		genp = (__u32 *)data->ioc_inlbuf3;
+		/* the uuid will be empty for deleted OSTs */
+		for (i = 0; i < count; i++, uuidp++, genp++) {
+			if (!lov->lov_tgts[i])
+				continue;
+			*uuidp = lov->lov_tgts[i]->ltd_uuid;
+			*genp = lov->lov_tgts[i]->ltd_gen;
+		}
+
+		if (copy_to_user((void *)uarg, buf, len))
+			rc = -EFAULT;
+		obd_ioctl_freedata(buf, len);
+		break;
+	}
+	case LL_IOC_LOV_SETSTRIPE:
+		rc = lov_setstripe(exp, len, karg, uarg);
+		break;
+	case LL_IOC_LOV_GETSTRIPE:
+		rc = lov_getstripe(exp, karg, uarg);
+		break;
+	case LL_IOC_LOV_SETEA:
+		rc = lov_setea(exp, karg, uarg);
+		break;
+	case OBD_IOC_QUOTACTL: {
+		/* quotactl on one target, selected by index or by UUID */
+		struct if_quotactl *qctl = karg;
+		struct lov_tgt_desc *tgt = NULL;
+		struct obd_quotactl *oqctl;
+
+		if (qctl->qc_valid == QC_OSTIDX) {
+			if (qctl->qc_idx < 0 || count <= qctl->qc_idx)
+				RETURN(-EINVAL);
+
+			tgt = lov->lov_tgts[qctl->qc_idx];
+			if (!tgt || !tgt->ltd_exp)
+				RETURN(-EINVAL);
+		} else if (qctl->qc_valid == QC_UUID) {
+			for (i = 0; i < count; i++) {
+				tgt = lov->lov_tgts[i];
+				if (!tgt ||
+				    !obd_uuid_equals(&tgt->ltd_uuid,
+						     &qctl->obd_uuid))
+					continue;
+
+				if (tgt->ltd_exp == NULL)
+					RETURN(-EINVAL);
+
+				break;
+			}
+		} else {
+			RETURN(-EINVAL);
+		}
+
+		/* i is still 0 on the QC_OSTIDX path, so this only fires
+		 * when the UUID search above ran off the end */
+		if (i >= count)
+			RETURN(-EAGAIN);
+
+		LASSERT(tgt && tgt->ltd_exp);
+		OBD_ALLOC_PTR(oqctl);
+		if (!oqctl)
+			RETURN(-ENOMEM);
+
+		QCTL_COPY(oqctl, qctl);
+		rc = obd_quotactl(tgt->ltd_exp, oqctl);
+		if (rc == 0) {
+			QCTL_COPY(qctl, oqctl);
+			qctl->qc_valid = QC_OSTIDX;
+			qctl->obd_uuid = tgt->ltd_uuid;
+		}
+		OBD_FREE_PTR(oqctl);
+		break;
+	}
+	default: {
+		/* Forward to every connected target.  rc keeps the first
+		 * error seen on an active target; "set" records whether at
+		 * least one target accepted the command. */
+		int set = 0;
+
+		if (count == 0)
+			RETURN(-ENOTTY);
+
+		for (i = 0; i < count; i++) {
+			int err;
+			struct obd_device *osc_obd;
+
+			/* OST was disconnected */
+			if (!lov->lov_tgts[i] || !lov->lov_tgts[i]->ltd_exp)
+				continue;
+
+			/* ll_umount_begin() sets force flag but for lov, not
+			 * osc. Let's pass it through */
+			/* NOTE(review): unlike IOC_OBD_STATFS above, osc_obd
+			 * is not checked for NULL before use. */
+			osc_obd = class_exp2obd(lov->lov_tgts[i]->ltd_exp);
+			osc_obd->obd_force = obddev->obd_force;
+			err = obd_iocontrol(cmd, lov->lov_tgts[i]->ltd_exp,
+					    len, karg, uarg);
+			if (err == -ENODATA && cmd == OBD_IOC_POLL_QUOTACHECK) {
+				RETURN(err);
+			} else if (err) {
+				if (lov->lov_tgts[i]->ltd_active) {
+					CDEBUG(err == -ENOTTY ?
+					       D_IOCTL : D_WARNING,
+					       "iocontrol OSC %s on OST "
+					       "idx %d cmd %x: err = %d\n",
+					       lov_uuid2str(lov, i),
+					       i, cmd, err);
+					if (!rc)
+						rc = err;
+				}
+			} else {
+				set = 1;
+			}
+		}
+		/* no target succeeded and no error was recorded */
+		if (!set && !rc)
+			rc = -EIO;
+	}
+	}
+
+	RETURN(rc);
+}
+
+#define FIEMAP_BUFFER_SIZE 4096
+
+/**
+ * Non-zero fe_logical indicates that this is a continuation FIEMAP
+ * call. The local end offset and the device are sent in the first
+ * fm_extent. This function calculates the stripe number from the index.
+ * The stripe on which mapping is to be restarted is stored in
+ * \a start_stripe.
+ *
+ * This function returns fm_end_offset which is the in-OST offset at which
+ * mapping should be restarted. If fm_end_offset=0 is returned then caller
+ * will re-calculate proper offset in next stripe.
+ * Note that the first extent is passed to lov_get_info via the value field.
+ *
+ * \param fiemap fiemap request header
+ * \param lsm striping information for the file
+ * \param fm_start logical start of mapping
+ * \param fm_end logical end of mapping
+ * \param start_stripe starting stripe will be returned in this; left
+ *		       untouched when this is not a continuation call or when
+ *		       -EINVAL is returned
+ *
+ * \retval 0 not a continuation call, or mapping restarts in the next stripe
+ * \retval -EINVAL (converted to obd_size) no stripe of \a lsm lives on the
+ *		   OST index recorded in the first extent
+ * \retval otherwise the in-OST offset at which to resume
+ */
+obd_size fiemap_calc_fm_end_offset(struct ll_user_fiemap *fiemap,
+				   struct lov_stripe_md *lsm, obd_size fm_start,
+				   obd_size fm_end, int *start_stripe)
+{
+	obd_size local_end;
+	obd_off lun_start, lun_end;
+	obd_size fm_end_offset;
+	int stripe_no = -1, i;
+
+	if (fiemap->fm_extent_count == 0 ||
+	    fiemap->fm_extents[0].fe_logical == 0)
+		return 0;
+
+	/* Only read the first extent once fm_extent_count says it exists:
+	 * a request with no extents may not carry room for fm_extents[0]. */
+	local_end = fiemap->fm_extents[0].fe_logical;
+
+	/* Find out stripe_no from ost_index saved in the fe_device */
+	for (i = 0; i < lsm->lsm_stripe_count; i++) {
+		if (lsm->lsm_oinfo[i]->loi_ost_idx ==
+					fiemap->fm_extents[0].fe_device) {
+			stripe_no = i;
+			break;
+		}
+	}
+	if (stripe_no == -1)
+		return -EINVAL;
+
+	/* If we have finished mapping on previous device, shift logical
+	 * offset to start of next device */
+	if ((lov_stripe_intersects(lsm, stripe_no, fm_start, fm_end,
+				   &lun_start, &lun_end)) != 0 &&
+				   local_end < lun_end) {
+		fm_end_offset = local_end;
+		*start_stripe = stripe_no;
+	} else {
+		/* This is a special value to indicate that caller should
+		 * calculate offset in next stripe. */
+		fm_end_offset = 0;
+		*start_stripe = (stripe_no + 1) % lsm->lsm_stripe_count;
+	}
+
+	return fm_end_offset;
+}
+
+/**
+ * We calculate on which OST the mapping will end. If the length of mapping
+ * is greater than (stripe_size * stripe_count) then the last_stripe
+ * will be one just before start_stripe. Else we check if the mapping
+ * intersects each OST and find last_stripe.
+ * This function returns the last_stripe and also sets the stripe_count
+ * over which the mapping is spread
+ *
+ * \param lsm striping information for the file
+ * \param fm_start logical start of mapping
+ * \param fm_end logical end of mapping
+ * \param start_stripe starting stripe of the mapping
+ * \param stripe_count the number of stripes across which to map is returned
+ *
+ * \retval last_stripe return the last stripe of the mapping
+ */
+int fiemap_calc_last_stripe(struct lov_stripe_md *lsm, obd_size fm_start,
+			    obd_size fm_end, int start_stripe,
+			    int *stripe_count)
+{
+	int last_stripe;
+	obd_off obd_start, obd_end;
+	int i, j;
+
+	/* Widen before multiplying: the product is compared against a
+	 * 64-bit length and must not be truncated to the width of the
+	 * stripe size/count fields. */
+	if (fm_end - fm_start >
+	    (obd_size)lsm->lsm_stripe_size * lsm->lsm_stripe_count) {
+		last_stripe = (start_stripe < 1 ? lsm->lsm_stripe_count - 1 :
+							      start_stripe - 1);
+		*stripe_count = lsm->lsm_stripe_count;
+	} else {
+		/* Walk the stripes round-robin from start_stripe until one
+		 * does not intersect the requested range. */
+		for (j = 0, i = start_stripe; j < lsm->lsm_stripe_count;
+		     i = (i + 1) % lsm->lsm_stripe_count, j++) {
+			if ((lov_stripe_intersects(lsm, i, fm_start, fm_end,
+						   &obd_start, &obd_end)) == 0)
+				break;
+		}
+		*stripe_count = j;
+		last_stripe = (start_stripe + j - 1) % lsm->lsm_stripe_count;
+	}
+
+	return last_stripe;
+}
+
+/**
+ * Tag a batch of locally fetched extents and append it to the caller's
+ * result buffer.
+ *
+ * Every extent in \a lcl_fm_ext gets \a ost_index stored in fe_device and
+ * FIEMAP_EXTENT_NET added to fe_flags; the batch is then copied to the slot
+ * of extent number \a current_extent inside \a fiemap.
+ *
+ * \param fiemap fiemap request header (destination buffer)
+ * \param lcl_fm_ext array of local fiemap extents to be copied
+ * \param ost_index OST index to be written into the fe_device field of each
+ *		    extent
+ * \param ext_count number of extents to be copied
+ * \param current_extent where to start copying in main extent array
+ */
+void fiemap_prepare_and_copy_exts(struct ll_user_fiemap *fiemap,
+				  struct ll_fiemap_extent *lcl_fm_ext,
+				  int ost_index, unsigned int ext_count,
+				  int current_extent)
+{
+	struct ll_fiemap_extent *cur;
+	char *dst;
+
+	for (cur = lcl_fm_ext; cur < lcl_fm_ext + ext_count; cur++) {
+		cur->fe_device = ost_index;
+		cur->fe_flags |= FIEMAP_EXTENT_NET;
+	}
+
+	/* fiemap_count_to_size(n) is used as the byte offset of extent n */
+	dst = (char *)fiemap + fiemap_count_to_size(current_extent);
+	memcpy(dst, lcl_fm_ext, ext_count * sizeof(*lcl_fm_ext));
+}
+
+/**
+ * Break down the FIEMAP request and send appropriate calls to individual OSTs.
+ * This also handles the restarting of FIEMAP calls in case mapping overflows
+ * the available number of extents in single call.
+ *
+ * \param lov	 LOV device state (target table and descriptor)
+ * \param keylen length of \a key, passed through to the targets
+ * \param key	 struct ll_fiemap_info_key; its fiemap header and object id
+ *		 are rewritten for every per-target call
+ * \param vallen in/out length passed to the per-target obd_get_info() calls
+ * \param val	 result buffer (struct ll_user_fiemap followed by extents)
+ * \param lsm	 striping information for the file; NULL yields 0 with
+ *		 \a val untouched
+ *
+ * \retval 0 on success, -ENOMEM, -EINVAL or the error of a per-target call
+ */
+static int lov_fiemap(struct lov_obd *lov, __u32 keylen, void *key,
+		      __u32 *vallen, void *val, struct lov_stripe_md *lsm)
+{
+	struct ll_fiemap_info_key *fm_key = key;
+	struct ll_user_fiemap *fiemap = val;
+	struct ll_user_fiemap *fm_local = NULL;
+	struct ll_fiemap_extent *lcl_fm_ext;
+	int count_local;
+	unsigned int get_num_extents = 0;
+	int ost_index = 0, actual_start_stripe, start_stripe;
+	obd_size fm_start, fm_end, fm_length, fm_end_offset;
+	obd_size curr_loc;
+	int current_extent = 0, rc = 0, i;
+	int ost_eof = 0; /* EOF for object */
+	int ost_done = 0; /* done with required mapping for this OST? */
+	int last_stripe;
+	int cur_stripe = 0, cur_stripe_wrap = 0, stripe_count;
+	unsigned int buffer_size = FIEMAP_BUFFER_SIZE;
+
+	if (lsm == NULL)
+		GOTO(out, rc = 0);
+
+	/* The local bounce buffer never needs to be larger than the
+	 * caller's own extent array. */
+	if (fiemap_count_to_size(fm_key->fiemap.fm_extent_count) < buffer_size)
+		buffer_size = fiemap_count_to_size(fm_key->fiemap.fm_extent_count);
+
+	OBD_ALLOC_LARGE(fm_local, buffer_size);
+	if (fm_local == NULL)
+		GOTO(out, rc = -ENOMEM);
+	lcl_fm_ext = &fm_local->fm_extents[0];
+
+	count_local = fiemap_size_to_count(buffer_size);
+
+	memcpy(fiemap, &fm_key->fiemap, sizeof(*fiemap));
+	fm_start = fiemap->fm_start;
+	fm_length = fiemap->fm_length;
+	/* Calculate start stripe, last stripe and length of mapping */
+	actual_start_stripe = start_stripe = lov_stripe_number(lsm, fm_start);
+	fm_end = (fm_length == ~0ULL ? fm_key->oa.o_size :
+						fm_start + fm_length - 1);
+	/* If fm_length != ~0ULL but fm_start+fm_length-1 exceeds file size */
+	if (fm_end > fm_key->oa.o_size)
+		fm_end = fm_key->oa.o_size;
+
+	last_stripe = fiemap_calc_last_stripe(lsm, fm_start, fm_end,
+					    actual_start_stripe, &stripe_count);
+
+	/* The helper reports failure as -EINVAL converted to obd_size. */
+	fm_end_offset = fiemap_calc_fm_end_offset(fiemap, lsm, fm_start,
+						  fm_end, &start_stripe);
+	if (fm_end_offset == -EINVAL)
+		GOTO(out, rc = -EINVAL);
+
+	/* fm_extent_count == 0: the caller only wants the extent count. */
+	if (fiemap->fm_extent_count == 0) {
+		get_num_extents = 1;
+		count_local = 0;
+	}
+
+	/* Check each stripe */
+	for (cur_stripe = start_stripe, i = 0; i < stripe_count;
+	     i++, cur_stripe = (cur_stripe + 1) % lsm->lsm_stripe_count) {
+		obd_size req_fm_len; /* Stores length of required mapping */
+		obd_size len_mapped_single_call;
+		obd_off lun_start, lun_end, obd_object_end;
+		unsigned int ext_count;
+
+		cur_stripe_wrap = cur_stripe;
+
+		/* Find out range of mapping on this stripe */
+		if ((lov_stripe_intersects(lsm, cur_stripe, fm_start, fm_end,
+					   &lun_start, &obd_object_end)) == 0)
+			continue;
+
+		/* If this is a continuation FIEMAP call and we are on
+		 * starting stripe then lun_start needs to be set to
+		 * fm_end_offset */
+		if (fm_end_offset != 0 && cur_stripe == start_stripe)
+			lun_start = fm_end_offset;
+
+		if (fm_length != ~0ULL) {
+			/* Handle fm_start + fm_length overflow */
+			if (fm_start + fm_length < fm_start)
+				fm_length = ~0ULL - fm_start;
+			lun_end = lov_size_to_stripe(lsm, fm_start + fm_length,
+						     cur_stripe);
+		} else {
+			lun_end = ~0ULL;
+		}
+
+		if (lun_start == lun_end)
+			continue;
+
+		req_fm_len = obd_object_end - lun_start;
+		fm_local->fm_length = 0;
+		len_mapped_single_call = 0;
+
+		/* If the output buffer is very large and the objects have many
+		 * extents we may need to loop on a single OST repeatedly */
+		ost_eof = 0;
+		ost_done = 0;
+		do {
+			if (get_num_extents == 0) {
+				/* Don't get too many extents. */
+				if (current_extent + count_local >
+				    fiemap->fm_extent_count)
+					count_local = fiemap->fm_extent_count -
+								 current_extent;
+			}
+
+			lun_start += len_mapped_single_call;
+			fm_local->fm_length = req_fm_len - len_mapped_single_call;
+			req_fm_len = fm_local->fm_length;
+			fm_local->fm_extent_count = count_local;
+			fm_local->fm_mapped_extents = 0;
+			fm_local->fm_flags = fiemap->fm_flags;
+
+			fm_key->oa.o_oi = lsm->lsm_oinfo[cur_stripe]->loi_oi;
+			ost_index = lsm->lsm_oinfo[cur_stripe]->loi_ost_idx;
+
+			if (ost_index < 0 || ost_index >=lov->desc.ld_tgt_count)
+				GOTO(out, rc = -EINVAL);
+
+			/* If OST is inactive, return extent with UNKNOWN flag.
+			 * An empty target slot is handled the same way instead
+			 * of being dereferenced. */
+			if (!lov->lov_tgts[ost_index] ||
+			    !lov->lov_tgts[ost_index]->ltd_active) {
+				fm_local->fm_flags |= FIEMAP_EXTENT_LAST;
+				fm_local->fm_mapped_extents = 1;
+
+				/* In count-only mode fm_local was sized for
+				 * zero extents and the extent is never read
+				 * back, so do not write past the buffer. */
+				if (get_num_extents == 0) {
+					lcl_fm_ext[0].fe_logical = lun_start;
+					lcl_fm_ext[0].fe_length =
+						obd_object_end - lun_start;
+					lcl_fm_ext[0].fe_flags |=
+						FIEMAP_EXTENT_UNKNOWN;
+				}
+
+				goto inactive_tgt;
+			}
+
+			fm_local->fm_start = lun_start;
+			fm_local->fm_flags &= ~FIEMAP_FLAG_DEVICE_ORDER;
+			memcpy(&fm_key->fiemap, fm_local, sizeof(*fm_local));
+			*vallen=fiemap_count_to_size(fm_local->fm_extent_count);
+			rc = obd_get_info(NULL,
+					  lov->lov_tgts[ost_index]->ltd_exp,
+					  keylen, key, vallen, fm_local, lsm);
+			if (rc != 0)
+				GOTO(out, rc);
+
+inactive_tgt:
+			ext_count = fm_local->fm_mapped_extents;
+			if (ext_count == 0) {
+				ost_done = 1;
+				/* If last stripe has hole at the end,
+				 * then we need to return */
+				if (cur_stripe_wrap == last_stripe) {
+					fiemap->fm_mapped_extents = 0;
+					goto finish;
+				}
+				break;
+			}
+
+			/* If we just need num of extents then go to next device */
+			if (get_num_extents) {
+				current_extent += ext_count;
+				break;
+			}
+
+			len_mapped_single_call = lcl_fm_ext[ext_count-1].fe_logical -
+				  lun_start + lcl_fm_ext[ext_count - 1].fe_length;
+
+			/* Have we finished mapping on this device? */
+			if (req_fm_len <= len_mapped_single_call)
+				ost_done = 1;
+
+			/* Clear the EXTENT_LAST flag which can be present on
+			 * last extent */
+			if (lcl_fm_ext[ext_count-1].fe_flags & FIEMAP_EXTENT_LAST)
+				lcl_fm_ext[ext_count - 1].fe_flags &=
+							    ~FIEMAP_EXTENT_LAST;
+
+			curr_loc = lov_stripe_size(lsm,
+					   lcl_fm_ext[ext_count - 1].fe_logical+
+					   lcl_fm_ext[ext_count - 1].fe_length,
+					   cur_stripe);
+			if (curr_loc >= fm_key->oa.o_size)
+				ost_eof = 1;
+
+			fiemap_prepare_and_copy_exts(fiemap, lcl_fm_ext,
+						     ost_index, ext_count,
+						     current_extent);
+
+			current_extent += ext_count;
+
+			/* Ran out of available extents? */
+			if (current_extent >= fiemap->fm_extent_count)
+				goto finish;
+		} while (ost_done == 0 && ost_eof == 0);
+
+		if (cur_stripe_wrap == last_stripe)
+			goto finish;
+	}
+
+finish:
+	/* Indicate that we are returning device offsets unless file just has
+	 * single stripe */
+	if (lsm->lsm_stripe_count > 1)
+		fiemap->fm_flags |= FIEMAP_FLAG_DEVICE_ORDER;
+
+	if (get_num_extents)
+		goto skip_last_device_calc;
+
+	/* Check if we have reached the last stripe and whether mapping for that
+	 * stripe is done.  Nothing can be flagged when no extent was
+	 * returned at all (e.g. the last stripe is a hole): index -1 would
+	 * land in front of the extent array. */
+	if (cur_stripe_wrap == last_stripe) {
+		if ((ost_done || ost_eof) && current_extent > 0)
+			fiemap->fm_extents[current_extent - 1].fe_flags |=
+							     FIEMAP_EXTENT_LAST;
+	}
+
+skip_last_device_calc:
+	fiemap->fm_mapped_extents = current_extent;
+
+out:
+	/* fm_local is still NULL when lsm was NULL or the allocation
+	 * failed; there is nothing to release on those paths. */
+	if (fm_local != NULL)
+		OBD_FREE_LARGE(fm_local, buffer_size);
+	return rc;
+}
+
+/*
+ * Answer a get_info request addressed to the LOV device.
+ *
+ * Supported keys: KEY_LOCK_TO_STRIPE, KEY_LAST_ID, KEY_LOVDESC, KEY_FIEMAP,
+ * KEY_CONNECT_FLAG and KEY_TGT_COUNT; any other key yields -EINVAL.
+ * A reference on the device is held for the duration of the call.
+ *
+ * \param env	 execution environment, passed through to the targets
+ * \param exp	 LOV export
+ * \param keylen length of \a key
+ * \param key	 request key (and, for some keys, extra input data)
+ * \param vallen in/out length of \a val
+ * \param val	 result buffer, interpreted per key
+ * \param lsm	 striping information, used by KEY_LOCK_TO_STRIPE and
+ *		 KEY_FIEMAP
+ *
+ * \retval 0 on success, -EFAULT if \a vallen or \a val is NULL, other
+ *	   negative errno on failure
+ */
+static int lov_get_info(const struct lu_env *env, struct obd_export *exp,
+			__u32 keylen, void *key, __u32 *vallen, void *val,
+			struct lov_stripe_md *lsm)
+{
+	struct obd_device *obddev = class_exp2obd(exp);
+	struct lov_obd *lov = &obddev->u.lov;
+	int i, rc;
+	ENTRY;
+
+	if (!vallen || !val)
+		RETURN(-EFAULT);
+
+	obd_getref(obddev);
+
+	if (KEY_IS(KEY_LOCK_TO_STRIPE)) {
+		/* Map a lock to the index of the stripe it protects. */
+		struct {
+			char name[16];
+			struct ldlm_lock *lock;
+		} *data = key;
+		struct ldlm_res_id *res_id = &data->lock->l_resource->lr_name;
+		struct lov_oinfo *loi;
+		__u32 *stripe = val;
+
+		if (*vallen < sizeof(*stripe))
+			GOTO(out, rc = -EFAULT);
+		*vallen = sizeof(*stripe);
+
+		/* XXX This is another one of those bits that will need to
+		 * change if we ever actually support nested LOVs.  It uses
+		 * the lock's export to find out which stripe it is. */
+		/* XXX - it's assumed all the locks for deleted OSTs have
+		 * been cancelled. Also, the export for deleted OSTs will
+		 * be NULL and won't match the lock's export. */
+		for (i = 0; i < lsm->lsm_stripe_count; i++) {
+			loi = lsm->lsm_oinfo[i];
+			if (!lov->lov_tgts[loi->loi_ost_idx])
+				continue;
+			if (lov->lov_tgts[loi->loi_ost_idx]->ltd_exp ==
+			    data->lock->l_conn_export &&
+			    ostid_res_name_eq(&loi->loi_oi, res_id)) {
+				*stripe = i;
+				GOTO(out, rc = 0);
+			}
+		}
+		LDLM_ERROR(data->lock, "lock on inode without such object");
+		dump_lsm(D_ERROR, lsm);
+		GOTO(out, rc = -ENXIO);
+	} else if (KEY_IS(KEY_LAST_ID)) {
+		/* Forward the query to the single target named by
+		 * info->idx. */
+		struct obd_id_info *info = val;
+		__u32 size = sizeof(obd_id);
+		struct lov_tgt_desc *tgt;
+
+		LASSERT(*vallen == sizeof(struct obd_id_info));
+		/* Do not index the target table past the last configured
+		 * target. */
+		if (info->idx >= lov->desc.ld_tgt_count)
+			GOTO(out, rc = -EINVAL);
+		tgt = lov->lov_tgts[info->idx];
+
+		if (!tgt || !tgt->ltd_active)
+			GOTO(out, rc = -ESRCH);
+
+		/* Report the target's answer instead of masking a failure
+		 * with an unconditional 0. */
+		rc = obd_get_info(env, tgt->ltd_exp, keylen, key,
+				  &size, info->data, NULL);
+		GOTO(out, rc);
+	} else if (KEY_IS(KEY_LOVDESC)) {
+		struct lov_desc *desc_ret = val;
+		*desc_ret = lov->desc;
+
+		GOTO(out, rc = 0);
+	} else if (KEY_IS(KEY_FIEMAP)) {
+		rc = lov_fiemap(lov, keylen, key, vallen, val, lsm);
+		GOTO(out, rc);
+	} else if (KEY_IS(KEY_CONNECT_FLAG)) {
+		/* val carries the target index on input and the target's
+		 * connect flags on output. */
+		struct lov_tgt_desc *tgt;
+		__u64 ost_idx = *((__u64*)val);
+
+		LASSERT(*vallen == sizeof(__u64));
+		LASSERT(ost_idx < lov->desc.ld_tgt_count);
+		tgt = lov->lov_tgts[ost_idx];
+
+		if (!tgt || !tgt->ltd_exp)
+			GOTO(out, rc = -ESRCH);
+
+		*((__u64 *)val) = exp_connect_flags(tgt->ltd_exp);
+		GOTO(out, rc = 0);
+	} else if (KEY_IS(KEY_TGT_COUNT)) {
+		*((int *)val) = lov->desc.ld_tgt_count;
+		GOTO(out, rc = 0);
+	}
+
+	rc = -EINVAL;
+
+out:
+	obd_putref(obddev);
+	RETURN(rc);
+}
+
+/*
+ * Forward a set_info request to the LOV targets.
+ *
+ * The key selects how \a val is interpreted and which targets are visited:
+ * for KEY_NEXT_ID \a val is an array of struct obd_id_info, one per visited
+ * target; for KEY_MDS_CONN and KEY_CAPA_KEY it is a single structure that
+ * may name one specific target by UUID; otherwise \a val is passed on
+ * unchanged to every visited target.
+ *
+ * If \a set is NULL a private request set is created, waited for and
+ * destroyed before returning.
+ *
+ * \retval first non-zero result of a per-target call or of the wait,
+ *	   -ENOMEM if the private set cannot be allocated, 0 otherwise
+ */
+static int lov_set_info_async(const struct lu_env *env, struct obd_export *exp,
+			      obd_count keylen, void *key, obd_count vallen,
+			      void *val, struct ptlrpc_request_set *set)
+{
+	struct obd_device *obddev = class_exp2obd(exp);
+	struct lov_obd *lov = &obddev->u.lov;
+	obd_count count;
+	int i, rc = 0, err;
+	struct lov_tgt_desc *tgt;
+	unsigned incr, check_uuid,
+		 do_inactive, no_set;
+	unsigned next_id = 0,  mds_con = 0, capa = 0;
+	ENTRY;
+
+	incr = check_uuid = do_inactive = no_set = 0;
+	if (set == NULL) {
+		no_set = 1;
+		set = ptlrpc_prep_set();
+		if (!set)
+			RETURN(-ENOMEM);
+	}
+
+	obd_getref(obddev);
+	count = lov->desc.ld_tgt_count;
+
+	/* Per-key setup: incr is the stride by which val advances each
+	 * iteration, do_inactive lets inactive targets be visited too. */
+	if (KEY_IS(KEY_NEXT_ID)) {
+		count = vallen / sizeof(struct obd_id_info);
+		vallen = sizeof(obd_id);
+		incr = sizeof(struct obd_id_info);
+		do_inactive = 1;
+		next_id = 1;
+	} else if (KEY_IS(KEY_CHECKSUM)) {
+		do_inactive = 1;
+	} else if (KEY_IS(KEY_EVICT_BY_NID)) {
+		/* use defaults:  do_inactive = incr = 0; */
+	} else if (KEY_IS(KEY_MDS_CONN)) {
+		mds_con = 1;
+	} else if (KEY_IS(KEY_CAPA_KEY)) {
+		capa = 1;
+	} else if (KEY_IS(KEY_CACHE_SET)) {
+		LASSERT(lov->lov_cache == NULL);
+		lov->lov_cache = val;
+		do_inactive = 1;
+	}
+
+	for (i = 0; i < count; i++, val = (char *)val + incr) {
+		if (next_id) {
+			/* NOTE(review): idx comes from the caller's array and
+			 * is not range-checked here - confirm callers
+			 * guarantee it is valid. */
+			tgt = lov->lov_tgts[((struct obd_id_info*)val)->idx];
+		} else {
+			tgt = lov->lov_tgts[i];
+		}
+		/* OST was disconnected */
+		if (!tgt || !tgt->ltd_exp)
+			continue;
+
+		/* OST is inactive and we don't want inactive OSCs */
+		if (!tgt->ltd_active && !do_inactive)
+			continue;
+
+		if (mds_con) {
+			struct mds_group_info *mgi;
+
+			LASSERT(vallen == sizeof(*mgi));
+			mgi = (struct mds_group_info *)val;
+
+			/* Only want a specific OSC */
+			if (mgi->uuid && !obd_uuid_equals(mgi->uuid,
+						&tgt->ltd_uuid))
+				continue;
+
+			err = obd_set_info_async(env, tgt->ltd_exp,
+					 keylen, key, sizeof(int),
+					 &mgi->group, set);
+		} else if (next_id) {
+			err = obd_set_info_async(env, tgt->ltd_exp,
+					 keylen, key, vallen,
+					 ((struct obd_id_info*)val)->data, set);
+		} else if (capa) {
+			struct mds_capa_info *info = (struct mds_capa_info*)val;
+
+			LASSERT(vallen == sizeof(*info));
+
+			 /* Only want a specific OSC */
+			if (info->uuid &&
+			    !obd_uuid_equals(info->uuid, &tgt->ltd_uuid))
+				continue;
+
+			err = obd_set_info_async(env, tgt->ltd_exp, keylen,
+						 key, sizeof(*info->capa),
+						 info->capa, set);
+		} else {
+			/* Only want a specific OSC */
+			/* check_uuid is never set to non-zero in this
+			 * function, so this filter does not currently skip
+			 * anything */
+			if (check_uuid &&
+			    !obd_uuid_equals(val, &tgt->ltd_uuid))
+				continue;
+
+			err = obd_set_info_async(env, tgt->ltd_exp,
+					 keylen, key, vallen, val, set);
+		}
+
+		/* keep the first failure, but still visit the remaining
+		 * targets */
+		if (!rc)
+			rc = err;
+	}
+
+	obd_putref(obddev);
+	if (no_set) {
+		err = ptlrpc_set_wait(set);
+		if (!rc)
+			rc = err;
+		ptlrpc_set_destroy(set);
+	}
+	RETURN(rc);
+}
+
+/*
+ * Replace *offset by the first (OBD_CALC_STRIPE_START) or last
+ * (OBD_CALC_STRIPE_END) byte of the stripe-size-aligned chunk that contains
+ * it.  Any other \a cmd triggers LBUG().
+ *
+ * Always returns 0.
+ */
+static int lov_extent_calc(struct obd_export *exp, struct lov_stripe_md *lsm,
+			   int cmd, __u64 *offset)
+{
+	__u32 stripe_sz = lsm->lsm_stripe_size;
+	__u64 chunk_start = *offset;
+	__u64 chunk_end;
+
+	/* Round down to a multiple of stripe_sz; lov_do_div64() presumably
+	 * leaves the quotient in its first argument. */
+	lov_do_div64(chunk_start, stripe_sz);
+	chunk_start *= stripe_sz;
+	chunk_end = chunk_start + stripe_sz - 1;
+
+	CDEBUG(D_DLMTRACE, "offset "LPU64", stripe %u, start "LPU64
+			   ", end "LPU64"\n", *offset, stripe_sz, chunk_start,
+			   chunk_end);
+
+	switch (cmd) {
+	case OBD_CALC_STRIPE_END:
+		*offset = chunk_end;
+		break;
+	case OBD_CALC_STRIPE_START:
+		*offset = chunk_start;
+		break;
+	default:
+		LBUG();
+		break;
+	}
+
+	RETURN(0);
+}
+
+/*
+ * Take md->lsm_lock and record the calling pid in md->lsm_lock_owner.
+ *
+ * The first assertion fires if the caller's pid is already recorded as
+ * owner; the second checks that no owner is recorded once the lock is held.
+ */
+void lov_stripe_lock(struct lov_stripe_md *md)
+{
+	LASSERT(md->lsm_lock_owner != current_pid());
+	spin_lock(&md->lsm_lock);
+	LASSERT(md->lsm_lock_owner == 0);
+	md->lsm_lock_owner = current_pid();
+}
+EXPORT_SYMBOL(lov_stripe_lock);
+
+/*
+ * Release md->lsm_lock.  The caller's pid must be the recorded owner; the
+ * owner field is cleared before the spinlock is dropped.
+ */
+void lov_stripe_unlock(struct lov_stripe_md *md)
+{
+	LASSERT(md->lsm_lock_owner == current_pid());
+	md->lsm_lock_owner = 0;
+	spin_unlock(&md->lsm_lock);
+}
+EXPORT_SYMBOL(lov_stripe_unlock);
+
+/*
+ * Run a quota control command on every usable LOV target.
+ *
+ * Only the opcodes listed in the check below are accepted; anything else is
+ * rejected with -EFAULT.  Targets that are inactive or being reaped are
+ * skipped; for Q_GETOQUOTA a skipped target that is still administratively
+ * enabled turns the result into -EREMOTEIO.  For Q_GETOQUOTA the current
+ * space and block hard limit reported by the targets are summed and stored
+ * back into \a oqctl.
+ *
+ * \retval 0 on success, otherwise -EFAULT, -EREMOTEIO or the first error
+ *	   returned by an active target
+ */
+static int lov_quotactl(struct obd_device *obd, struct obd_export *exp,
+			struct obd_quotactl *oqctl)
+{
+	struct lov_obd      *lov = &obd->u.lov;
+	struct lov_tgt_desc *tgt;
+	__u64		curspace = 0;
+	__u64		bhardlimit = 0;
+	int		  i, rc = 0;
+	ENTRY;
+
+	if (oqctl->qc_cmd != LUSTRE_Q_QUOTAON &&
+	    oqctl->qc_cmd != LUSTRE_Q_QUOTAOFF &&
+	    oqctl->qc_cmd != Q_GETOQUOTA &&
+	    oqctl->qc_cmd != Q_INITQUOTA &&
+	    oqctl->qc_cmd != LUSTRE_Q_SETQUOTA &&
+	    oqctl->qc_cmd != Q_FINVALIDATE) {
+		/* newline-terminated like every other message in this file */
+		CERROR("bad quota opc %x for lov obd\n", oqctl->qc_cmd);
+		RETURN(-EFAULT);
+	}
+
+	/* for lov tgt */
+	obd_getref(obd);
+	for (i = 0; i < lov->desc.ld_tgt_count; i++) {
+		int err;
+
+		tgt = lov->lov_tgts[i];
+
+		if (!tgt)
+			continue;
+
+		if (!tgt->ltd_active || tgt->ltd_reap) {
+			if (oqctl->qc_cmd == Q_GETOQUOTA &&
+			    tgt->ltd_activate) {
+				rc = -EREMOTEIO;
+				CERROR("ost %d is inactive\n", i);
+			} else {
+				CDEBUG(D_HA, "ost %d is inactive\n", i);
+			}
+			continue;
+		}
+
+		err = obd_quotactl(tgt->ltd_exp, oqctl);
+		if (err) {
+			/* ltd_active is re-read here: only a target that is
+			 * still active after the call contributes its error */
+			if (tgt->ltd_active && !rc)
+				rc = err;
+			continue;
+		}
+
+		if (oqctl->qc_cmd == Q_GETOQUOTA) {
+			curspace += oqctl->qc_dqblk.dqb_curspace;
+			bhardlimit += oqctl->qc_dqblk.dqb_bhardlimit;
+		}
+	}
+	obd_putref(obd);
+
+	if (oqctl->qc_cmd == Q_GETOQUOTA) {
+		oqctl->qc_dqblk.dqb_curspace = curspace;
+		oqctl->qc_dqblk.dqb_bhardlimit = bhardlimit;
+	}
+	RETURN(rc);
+}
+
+/*
+ * Start a quota check on every administratively enabled target.
+ *
+ * A first pass refuses the whole operation with -EIO if any enabled target
+ * is inactive; only then is obd_quotacheck() issued to each enabled target.
+ *
+ * \retval 0 on success, -EIO if an enabled target is inactive, otherwise
+ *	   the first error returned by obd_quotacheck()
+ */
+static int lov_quotacheck(struct obd_device *obd, struct obd_export *exp,
+			  struct obd_quotactl *oqctl)
+{
+	struct lov_obd *lov = &obd->u.lov;
+	struct lov_tgt_desc *tgt;
+	int idx;
+	int rc = 0;
+	ENTRY;
+
+	obd_getref(obd);
+
+	/* Pass 1: make sure every enabled target is reachable. */
+	for (idx = 0; idx < lov->desc.ld_tgt_count; idx++) {
+		tgt = lov->lov_tgts[idx];
+		if (tgt == NULL)
+			continue;
+
+		/* Skip quota check on the administratively disabled OSTs. */
+		if (!tgt->ltd_activate) {
+			CWARN("lov idx %d was administratively disabled, "
+			      "skip quotacheck on it.\n", idx);
+			continue;
+		}
+
+		if (!tgt->ltd_active) {
+			CERROR("lov idx %d inactive\n", idx);
+			rc = -EIO;
+			goto out;
+		}
+	}
+
+	/* Pass 2: issue the check, remembering the first failure. */
+	for (idx = 0; idx < lov->desc.ld_tgt_count; idx++) {
+		int err;
+
+		tgt = lov->lov_tgts[idx];
+		if (tgt == NULL || !tgt->ltd_activate)
+			continue;
+
+		err = obd_quotacheck(tgt->ltd_exp, oqctl);
+		if (rc == 0 && err != 0)
+			rc = err;
+	}
+
+out:
+	obd_putref(obd);
+
+	RETURN(rc);
+}
+
+/* OBD method table of the LOV device type, registered by lov_init(). */
+struct obd_ops lov_obd_ops = {
+	.o_owner		= THIS_MODULE,
+	.o_setup		= lov_setup,
+	.o_precleanup		= lov_precleanup,
+	.o_cleanup		= lov_cleanup,
+	/* .o_process_config is intentionally left unset
+	 * (was: lov_process_config) */
+	.o_connect		= lov_connect,
+	.o_disconnect		= lov_disconnect,
+	.o_statfs		= lov_statfs,
+	.o_statfs_async		= lov_statfs_async,
+	.o_packmd		= lov_packmd,
+	.o_unpackmd		= lov_unpackmd,
+	.o_create		= lov_create,
+	.o_destroy		= lov_destroy,
+	.o_getattr		= lov_getattr,
+	.o_getattr_async	= lov_getattr_async,
+	.o_setattr		= lov_setattr,
+	.o_setattr_async	= lov_setattr_async,
+	.o_brw			= lov_brw,
+	.o_merge_lvb		= lov_merge_lvb,
+	.o_adjust_kms		= lov_adjust_kms,
+	.o_punch		= lov_punch,
+	.o_sync			= lov_sync,
+	.o_enqueue		= lov_enqueue,
+	.o_change_cbdata	= lov_change_cbdata,
+	.o_find_cbdata		= lov_find_cbdata,
+	.o_cancel		= lov_cancel,
+	.o_cancel_unused	= lov_cancel_unused,
+	.o_iocontrol		= lov_iocontrol,
+	.o_get_info		= lov_get_info,
+	.o_set_info_async	= lov_set_info_async,
+	.o_extent_calc		= lov_extent_calc,
+	.o_llog_init		= lov_llog_init,
+	.o_llog_finish		= lov_llog_finish,
+	.o_notify		= lov_notify,
+	.o_pool_new		= lov_pool_new,
+	.o_pool_rem		= lov_pool_remove,
+	.o_pool_add		= lov_pool_add,
+	.o_pool_del		= lov_pool_del,
+	.o_getref		= lov_getref,
+	.o_putref		= lov_putref,
+	.o_quotactl		= lov_quotactl,
+	.o_quotacheck		= lov_quotacheck,
+};
+
+/* Slab cache for struct lov_oinfo; created in lov_init(), destroyed in
+ * lov_exit(). */
+struct kmem_cache *lov_oinfo_slab;
+
+/* Cache descriptor table defined elsewhere; set up with lu_kmem_init() and
+ * torn down with lu_kmem_fini(). */
+extern struct lu_kmem_descr lov_caches[];
+
+/*
+ * Module initialisation: set up the lu caches, create the lov_oinfo slab
+ * and register the LOV OBD type.  Everything already set up is torn down
+ * again if a later step fails.
+ *
+ * \retval 0 on success, -ENOMEM if the slab cannot be created, otherwise
+ *	   the error of lu_kmem_init() or class_register_type()
+ */
+int __init lov_init(void)
+{
+	struct lprocfs_static_vars lvars = { 0 };
+	int rc;
+	ENTRY;
+
+	/* print an address of _any_ initialized kernel symbol from this
+	 * module, to allow debugging with gdb that doesn't support data
+	 * symbols from modules.*/
+	CDEBUG(D_INFO, "Lustre LOV module (%p).\n", &lov_caches);
+
+	rc = lu_kmem_init(lov_caches);
+	if (rc != 0)
+		return rc;
+
+	lov_oinfo_slab = kmem_cache_create("lov_oinfo",
+					   sizeof(struct lov_oinfo),
+					   0, SLAB_HWCACHE_ALIGN, NULL);
+	if (!lov_oinfo_slab) {
+		lu_kmem_fini(lov_caches);
+		return -ENOMEM;
+	}
+
+	lprocfs_lov_init_vars(&lvars);
+	rc = class_register_type(&lov_obd_ops, NULL, lvars.module_vars,
+				 LUSTRE_LOV_NAME, &lov_device_type);
+	if (rc == 0)
+		RETURN(rc);
+
+	/* Registration failed: undo in reverse order of setup. */
+	kmem_cache_destroy(lov_oinfo_slab);
+	lu_kmem_fini(lov_caches);
+	RETURN(rc);
+}
+
+/*
+ * Module teardown: unregister the LOV OBD type, then release the
+ * lov_oinfo slab and the lu caches that lov_init() set up.
+ */
+static void /*__exit*/ lov_exit(void)
+{
+	class_unregister_type(LUSTRE_LOV_NAME);
+	kmem_cache_destroy(lov_oinfo_slab);
+
+	lu_kmem_fini(lov_caches);
+}
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Lustre Logical Object Volume OBD driver");
+MODULE_LICENSE("GPL");
+
+cfs_module(lov, LUSTRE_VERSION_STRING, lov_init, lov_exit);

diff --git a/drivers/staging/lustre/lustre/lov/lov_object.c b/drivers/staging/lustre/lustre/lov/lov_object.c
new file mode 100644
index 0000000..aa8ae80
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lov/lov_object.c

@@ -0,0 +1,942 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_object for LOV layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ *   Author: Jinshan Xiong <jinshan.xiong@whamcloud.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include "lov_cl_internal.h"
+#include <lustre_debug.h>
+
+/** \addtogroup lov
+ *  @{
+ */
+
+/*****************************************************************************
+ *
+ * Layout operations.
+ *
+ */
+
+/*
+ * Per-layout-type method table. One instance per enum lov_layout_type
+ * value is stored in lov_dispatch[] and selected through
+ * lov_object::lo_type.
+ */
+struct lov_layout_operations {
+	/* Set up the layout state; used by lov_object_init() and
+	 * lov_layout_change(). */
+	int (*llo_init)(const struct lu_env *env, struct lov_device *dev,
+			struct lov_object *lov,
+			const struct cl_object_conf *conf,
+			union lov_layout_state *state);
+	/* Tear down the layout; used by lov_object_delete() and
+	 * lov_layout_change(). */
+	int (*llo_delete)(const struct lu_env *env, struct lov_object *lov,
+			   union lov_layout_state *state);
+	/* Release layout state; used by lov_object_free() and
+	 * lov_layout_change(). */
+	void (*llo_fini)(const struct lu_env *env, struct lov_object *lov,
+			 union lov_layout_state *state);
+	/* Called after a successful llo_init(). */
+	void (*llo_install)(const struct lu_env *env, struct lov_object *lov,
+			    union lov_layout_state *state);
+	/* Backend of lov_object_print(). */
+	int  (*llo_print)(const struct lu_env *env, void *cookie,
+			  lu_printer_t p, const struct lu_object *o);
+	/* Backend of lov_page_init(). */
+	int  (*llo_page_init)(const struct lu_env *env, struct cl_object *obj,
+				struct cl_page *page, struct page *vmpage);
+	/* Backend of lov_lock_init(). */
+	int  (*llo_lock_init)(const struct lu_env *env,
+			      struct cl_object *obj, struct cl_lock *lock,
+			      const struct cl_io *io);
+	/* Backend of lov_io_init(). */
+	int  (*llo_io_init)(const struct lu_env *env,
+			    struct cl_object *obj, struct cl_io *io);
+	/* Backend of lov_attr_get(). */
+	int  (*llo_getattr)(const struct lu_env *env, struct cl_object *obj,
+			    struct cl_attr *attr);
+};
+
+static int lov_layout_wait(const struct lu_env *env, struct lov_object *lov);
+
+/*****************************************************************************
+ *
+ * Lov object layout operations.
+ *
+ */
+
+/* llo_install method for the LLT_EMPTY layout: nothing to install. */
+static void lov_install_empty(const struct lu_env *env,
+			      struct lov_object *lov,
+			      union  lov_layout_state *state)
+{
+	/*
+	 * File without objects.
+	 */
+}
+
+/* llo_init method for the LLT_EMPTY layout: there is no state to set up,
+ * so this always succeeds. */
+static int lov_init_empty(const struct lu_env *env,
+			  struct lov_device *dev, struct lov_object *lov,
+			  const struct cl_object_conf *conf,
+			  union  lov_layout_state *state)
+{
+	return 0;
+}
+
+/* llo_install method for the LLT_RAID0 layout: intentionally a no-op. */
+static void lov_install_raid0(const struct lu_env *env,
+			      struct lov_object *lov,
+			      union  lov_layout_state *state)
+{
+}
+
+/*
+ * Looks up the object identified by \a fid on device \a dev through
+ * lu_object_find_at() and returns it converted to a cl_object.
+ *
+ * The result of lu_object_find_at() is passed through lu2cl() without
+ * being checked here; lov_init_raid0() tests the returned pointer with
+ * IS_ERR(). When the lookup did not fail, the object found is asserted to
+ * live on a lovsub device.
+ */
+static struct cl_object *lov_sub_find(const struct lu_env *env,
+				      struct cl_device *dev,
+				      const struct lu_fid *fid,
+				      const struct cl_object_conf *conf)
+{
+	struct lu_object *o;
+
+	ENTRY;
+	o = lu_object_find_at(env, cl2lu_dev(dev), fid, &conf->coc_lu);
+	LASSERT(ergo(!IS_ERR(o), o->lo_dev->ld_type == &lovsub_device_type));
+	RETURN(lu2cl(o));
+}
+
+/*
+ * Attaches sub-object \a stripe to \a lov as stripe number \a idx.
+ *
+ * If the sub-object has no parent yet it is linked below \a lov and
+ * recorded in r0->lo_sub[idx]. If it already has a parent, the reference
+ * on \a stripe is dropped and an error is returned: -EAGAIN when the
+ * current owner's layout is marked invalid (the stripe is unhashed so a
+ * later lookup does not find it again), -EIO otherwise.
+ *
+ * \retval 0		stripe attached
+ * \retval -EAGAIN	stripe owned by an object with a stale layout
+ * \retval -EIO		stripe owned by another object, or fault injection
+ */
+static int lov_init_sub(const struct lu_env *env, struct lov_object *lov,
+			struct cl_object *stripe,
+			struct lov_layout_raid0 *r0, int idx)
+{
+	struct cl_object_header *hdr;
+	struct cl_object_header *subhdr;
+	struct cl_object_header *parent;
+	struct lov_oinfo	*oinfo;
+	struct lu_object	*old_obj;
+	unsigned int		 mask;
+	int			 result;
+
+	if (OBD_FAIL_CHECK(OBD_FAIL_LOV_INIT)) {
+		/* For sanity:test_206.
+		 * Do not leave the object in cache to avoid accessing
+		 * freed memory. This is because osc_object is referring to
+		 * lov_oinfo of lsm_stripe_data which will be freed due to
+		 * this failure. */
+		cl_object_kill(env, stripe);
+		cl_object_put(env, stripe);
+		return -EIO;
+	}
+
+	hdr    = cl_object_header(lov2cl(lov));
+	subhdr = cl_object_header(stripe);
+	parent = subhdr->coh_parent;
+	oinfo  = lov->lo_lsm->lsm_oinfo[idx];
+
+	CDEBUG(D_INODE, DFID"@%p[%d] -> "DFID"@%p: ostid: "DOSTID
+	       " idx: %d gen: %d\n",
+	       PFID(&subhdr->coh_lu.loh_fid), subhdr, idx,
+	       PFID(&hdr->coh_lu.loh_fid), hdr, POSTID(&oinfo->loi_oi),
+	       oinfo->loi_ost_idx, oinfo->loi_ost_gen);
+
+	/* Common case: the stripe is not owned yet, link it below us. */
+	if (parent == NULL) {
+		subhdr->coh_parent  = hdr;
+		subhdr->coh_nesting = hdr->coh_nesting + 1;
+		lu_object_ref_add(&stripe->co_lu, "lov-parent", lov);
+		r0->lo_sub[idx] = cl2lovsub(stripe);
+		r0->lo_sub[idx]->lso_super = lov;
+		r0->lo_sub[idx]->lso_index = idx;
+		return 0;
+	}
+
+	/* The stripe already belongs to some other top-level object. */
+	old_obj = lu_object_locate(&parent->coh_lu, &lov_device_type);
+	LASSERT(old_obj != NULL);
+	if (cl2lov(lu2cl(old_obj))->lo_layout_invalid) {
+		/* the object's layout has already changed but isn't
+		 * refreshed */
+		lu_object_unhash(env, &stripe->co_lu);
+		mask   = D_INODE;
+		result = -EAGAIN;
+	} else {
+		mask   = D_ERROR;
+		result = -EIO;
+	}
+
+	LU_OBJECT_DEBUG(mask, env, &stripe->co_lu,
+			"stripe %d is already owned.\n", idx);
+	LU_OBJECT_DEBUG(mask, env, old_obj, "owned.\n");
+	LU_OBJECT_HEADER(mask, env, lov2lu(lov), "try to own.\n");
+	cl_object_put(env, stripe);
+
+	return result;
+}
+
+/*
+ * llo_init method for the LLT_RAID0 layout.
+ *
+ * Takes a reference on the stripe md carried by \a conf, allocates the
+ * r0->lo_sub[] array and, for each stripe, looks up the sub-object and
+ * attaches it with lov_init_sub(). A stripe for which lov_init_sub()
+ * returns -EAGAIN is looked up again.
+ *
+ * \retval 0		all stripes attached
+ * \retval -ENOMEM	the sub-object array could not be allocated
+ * \retval other	error from ostid_to_fid(), lov_sub_find() or
+ *			lov_init_sub()
+ */
+static int lov_init_raid0(const struct lu_env *env,
+			  struct lov_device *dev, struct lov_object *lov,
+			  const struct cl_object_conf *conf,
+			  union  lov_layout_state *state)
+{
+	struct lov_thread_info  *lti     = lov_env_info(env);
+	struct cl_object_conf   *subconf = &lti->lti_stripe_conf;
+	struct lu_fid		*ofid    = &lti->lti_fid;
+	struct lov_stripe_md    *lsm     = conf->u.coc_md->lsm;
+	struct lov_layout_raid0 *r0      = &state->raid0;
+	int result;
+	int i;
+
+	ENTRY;
+
+	if (lsm->lsm_magic != LOV_MAGIC_V1 && lsm->lsm_magic != LOV_MAGIC_V3) {
+		dump_lsm(D_ERROR, lsm);
+		LASSERTF(0, "magic mismatch, expected %d/%d, actual %d.\n",
+			 LOV_MAGIC_V1, LOV_MAGIC_V3, lsm->lsm_magic);
+	}
+
+	LASSERT(lov->lo_lsm == NULL);
+	lov->lo_lsm = lsm_addref(lsm);
+	r0->lo_nr  = lsm->lsm_stripe_count;
+	LASSERT(r0->lo_nr <= lov_targets_nr(dev));
+
+	OBD_ALLOC_LARGE(r0->lo_sub, r0->lo_nr * sizeof r0->lo_sub[0]);
+	if (r0->lo_sub == NULL) {
+		result = -ENOMEM;
+		goto out;
+	}
+
+	result = 0;
+	subconf->coc_inode = conf->coc_inode;
+	spin_lock_init(&r0->lo_sub_lock);
+
+	/*
+	 * Create stripe cl_objects.
+	 */
+	for (i = 0; i < r0->lo_nr && result == 0; ++i) {
+		struct lov_oinfo *oinfo   = lsm->lsm_oinfo[i];
+		int		  ost_idx = oinfo->loi_ost_idx;
+		struct cl_device *subdev;
+		struct cl_object *stripe;
+
+		result = ostid_to_fid(ofid, &oinfo->loi_oi, ost_idx);
+		if (result != 0)
+			GOTO(out, result);
+
+		subdev = lovsub2cl_dev(dev->ld_target[ost_idx]);
+		subconf->u.coc_oinfo = oinfo;
+		LASSERTF(subdev != NULL, "not init ost %d\n", ost_idx);
+		/* In the function below, .hs_keycmp resolves to
+		 * lu_obj_hop_keycmp() */
+		/* coverity[overrun-buffer-val] */
+		stripe = lov_sub_find(env, subdev, ofid, subconf);
+		if (IS_ERR(stripe)) {
+			/* non-zero result terminates the loop */
+			result = PTR_ERR(stripe);
+			continue;
+		}
+
+		result = lov_init_sub(env, lov, stripe, r0, i);
+		if (result == -EAGAIN) { /* try again */
+			--i;
+			result = 0;
+		}
+	}
+out:
+	RETURN(result);
+}
+
+/*
+ * llo_delete method for the LLT_EMPTY layout: waits until no IO is active
+ * on the object (lov_layout_wait()) and then prunes it. Always returns 0.
+ */
+static int lov_delete_empty(const struct lu_env *env, struct lov_object *lov,
+			    union lov_layout_state *state)
+{
+	LASSERT(lov->lo_type == LLT_EMPTY);
+
+	lov_layout_wait(env, lov);
+
+	cl_object_prune(env, &lov->lo_cl);
+	return 0;
+}
+
+/*
+ * Kills sub-object \a los, which must occupy slot \a idx of \a lov's
+ * raid0 state, drops the "lov-parent" reference on it and then blocks,
+ * uninterruptibly, until the slot r0->lo_sub[idx] no longer points at
+ * \a los. On return the slot is asserted to be NULL.
+ */
+static void lov_subobject_kill(const struct lu_env *env, struct lov_object *lov,
+			       struct lovsub_object *los, int idx)
+{
+	struct cl_object	*sub;
+	struct lov_layout_raid0 *r0;
+	struct lu_site	  *site;
+	struct lu_site_bkt_data *bkt;
+	wait_queue_t	  *waiter;
+
+	r0  = &lov->u.raid0;
+	LASSERT(r0->lo_sub[idx] == los);
+
+	/* Look up the site bucket before the reference is dropped; its wait
+	 * queue is what we sleep on below. */
+	sub  = lovsub2cl(los);
+	site = sub->co_lu.lo_dev->ld_site;
+	bkt  = lu_site_bkt_from_fid(site, &sub->co_lu.lo_header->loh_fid);
+
+	cl_object_kill(env, sub);
+	/* release a reference to the sub-object and ... */
+	lu_object_ref_del(&sub->co_lu, "lov-parent", lov);
+	cl_object_put(env, sub);
+
+	/* ... wait until it is actually destroyed---sub-object clears its
+	 * ->lo_sub[] slot in lovsub_object_fini() */
+	if (r0->lo_sub[idx] == los) {
+		waiter = &lov_env_info(env)->lti_waiter;
+		init_waitqueue_entry_current(waiter);
+		add_wait_queue(&bkt->lsb_marche_funebre, waiter);
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		while (1) {
+			/* this wait-queue is signaled at the end of
+			 * lu_object_free(). */
+			set_current_state(TASK_UNINTERRUPTIBLE);
+			/* re-check the slot under lo_sub_lock */
+			spin_lock(&r0->lo_sub_lock);
+			if (r0->lo_sub[idx] == los) {
+				spin_unlock(&r0->lo_sub_lock);
+				waitq_wait(waiter, TASK_UNINTERRUPTIBLE);
+			} else {
+				spin_unlock(&r0->lo_sub_lock);
+				set_current_state(TASK_RUNNING);
+				break;
+			}
+		}
+		remove_wait_queue(&bkt->lsb_marche_funebre, waiter);
+	}
+	LASSERT(r0->lo_sub[idx] == NULL);
+}
+
+/*
+ * llo_delete method for the LLT_RAID0 layout.
+ *
+ * Waits for active IO to finish, then, for every sub-object that is still
+ * attached, prunes its locks and kills it; finally prunes the top-level
+ * object. Always returns 0.
+ */
+static int lov_delete_raid0(const struct lu_env *env, struct lov_object *lov,
+			    union lov_layout_state *state)
+{
+	struct lov_layout_raid0 *r0 = &state->raid0;
+	int idx;
+
+	ENTRY;
+
+	dump_lsm(D_INODE, lov->lo_lsm);
+
+	lov_layout_wait(env, lov);
+	if (r0->lo_sub != NULL) {
+		for (idx = 0; idx < r0->lo_nr; ++idx) {
+			struct lovsub_object *los = r0->lo_sub[idx];
+
+			if (los == NULL)
+				continue;
+
+			cl_locks_prune(env, &los->lso_cl, 1);
+			/*
+			 * If top-level object is to be evicted from
+			 * the cache, so are its sub-objects.
+			 */
+			lov_subobject_kill(env, lov, los, idx);
+		}
+	}
+	cl_object_prune(env, &lov->lo_cl);
+	RETURN(0);
+}
+
+/* llo_fini method for the LLT_EMPTY layout: nothing to release, it only
+ * asserts the layout type. */
+static void lov_fini_empty(const struct lu_env *env, struct lov_object *lov,
+			   union lov_layout_state *state)
+{
+	LASSERT(lov->lo_type == LLT_EMPTY);
+}
+
+/*
+ * llo_fini method for the LLT_RAID0 layout: frees the sub-object array
+ * allocated by lov_init_raid0() (if any) and hands the stripe md pointer
+ * to lov_free_memmd().
+ */
+static void lov_fini_raid0(const struct lu_env *env, struct lov_object *lov,
+			   union lov_layout_state *state)
+{
+	struct lov_layout_raid0 *r0 = &state->raid0;
+	ENTRY;
+
+	if (r0->lo_sub != NULL) {
+		/* size must match the OBD_ALLOC_LARGE() in lov_init_raid0() */
+		OBD_FREE_LARGE(r0->lo_sub, r0->lo_nr * sizeof r0->lo_sub[0]);
+		r0->lo_sub = NULL;
+	}
+
+	dump_lsm(D_INODE, lov->lo_lsm);
+	lov_free_memmd(&lov->lo_lsm);
+
+	EXIT;
+}
+
+/* llo_print method for the LLT_EMPTY layout: prints "empty" followed by
+ * the lo_layout_invalid flag. Always returns 0. */
+static int lov_print_empty(const struct lu_env *env, void *cookie,
+			   lu_printer_t p, const struct lu_object *o)
+{
+	(*p)(env, cookie, "empty %d\n", lu2lov(o)->lo_layout_invalid);
+	return 0;
+}
+
+/*
+ * llo_print method for the LLT_RAID0 layout: prints a summary line with
+ * the stripe count, layout validity and stripe md fields, followed by one
+ * entry per stripe slot (the sub-object itself, or a note that it is
+ * absent). Always returns 0.
+ */
+static int lov_print_raid0(const struct lu_env *env, void *cookie,
+			   lu_printer_t p, const struct lu_object *o)
+{
+	struct lov_object       *lov = lu2lov(o);
+	struct lov_layout_raid0 *r0  = lov_r0(lov);
+	struct lov_stripe_md    *lsm = lov->lo_lsm;
+	int idx;
+
+	(*p)(env, cookie, "stripes: %d, %svalid, lsm{%p 0x%08X %d %u %u}: \n",
+		r0->lo_nr, lov->lo_layout_invalid ? "in" : "", lsm,
+		lsm->lsm_magic, atomic_read(&lsm->lsm_refc),
+		lsm->lsm_stripe_count, lsm->lsm_layout_gen);
+
+	for (idx = 0; idx < r0->lo_nr; ++idx) {
+		if (r0->lo_sub[idx] == NULL) {
+			(*p)(env, cookie, "sub %d absent\n", idx);
+			continue;
+		}
+		lu_object_print(env, cookie, p, lovsub2lu(r0->lo_sub[idx]));
+	}
+	return 0;
+}
+
+/**
+ * Implements cl_object_operations::coo_attr_get() method for an object
+ * without stripes (LLT_EMPTY layout type).
+ *
+ * The only attribute this layer is authoritative for in this case is
+ * cl_attr::cat_blocks---it's 0. All other fields of \a attr are left
+ * untouched.
+ *
+ * \retval 0 always
+ */
+static int lov_attr_get_empty(const struct lu_env *env, struct cl_object *obj,
+			      struct cl_attr *attr)
+{
+	attr->cat_blocks = 0;
+	return 0;
+}
+
+/*
+ * llo_getattr method for the LLT_RAID0 layout.
+ *
+ * If the attributes cached in r0->lo_attr are not valid they are first
+ * recomputed with lov_merge_lvb_kms() and cached. The cached values are
+ * then merged into \a attr: blocks, size and kms are overwritten, while
+ * each timestamp only moves forward.
+ *
+ * \retval 0		success
+ * \retval other	error from lov_merge_lvb_kms(); \a attr is untouched
+ */
+static int lov_attr_get_raid0(const struct lu_env *env, struct cl_object *obj,
+			      struct cl_attr *attr)
+{
+	struct lov_object	*lov = cl2lov(obj);
+	struct lov_layout_raid0 *r0 = lov_r0(lov);
+	struct cl_attr		*cached = &r0->lo_attr;
+
+	ENTRY;
+
+	/* this is called w/o holding type guard mutex, so it must be inside
+	 * an on going IO otherwise lsm may be replaced.
+	 * LU-2117: it turns out there exists one exception. For mmaped files,
+	 * the lock of those files may be requested in the other file's IO
+	 * context, and this function is called in ccc_lock_state(), it will
+	 * hit this assertion.
+	 * Anyway, it's still okay to call attr_get w/o type guard as layout
+	 * can't go if locks exist. */
+	/* LASSERT(atomic_read(&lsm->lsm_refc) > 1); */
+
+	if (!r0->lo_attr_valid) {
+		struct lov_stripe_md *lsm = lov->lo_lsm;
+		struct ost_lvb	     *lvb = &lov_env_info(env)->lti_lvb;
+		__u64		      kms = 0;
+		int		      rc;
+
+		memset(lvb, 0, sizeof(*lvb));
+		/* XXX: timestamps can be negative by sanity:test_39m,
+		 * how can it be? */
+		lvb->lvb_atime = LLONG_MIN;
+		lvb->lvb_ctime = LLONG_MIN;
+		lvb->lvb_mtime = LLONG_MIN;
+
+		/*
+		 * XXX that should be replaced with a loop over sub-objects,
+		 * doing cl_object_attr_get() on them. But for now, let's
+		 * reuse old lov code.
+		 */
+
+		/*
+		 * XXX take lsm spin-lock to keep lov_merge_lvb_kms()
+		 * happy. It's not needed, because new code uses
+		 * ->coh_attr_guard spin-lock to protect consistency of
+		 * sub-object attributes.
+		 */
+		lov_stripe_lock(lsm);
+		rc = lov_merge_lvb_kms(lsm, lvb, &kms);
+		lov_stripe_unlock(lsm);
+		if (rc != 0)
+			RETURN(rc);
+
+		cl_lvb2attr(cached, lvb);
+		cached->cat_kms = kms;
+		r0->lo_attr_valid = 1;
+	}
+
+	/* merge results */
+	attr->cat_blocks = cached->cat_blocks;
+	attr->cat_size   = cached->cat_size;
+	attr->cat_kms    = cached->cat_kms;
+	if (attr->cat_atime < cached->cat_atime)
+		attr->cat_atime = cached->cat_atime;
+	if (attr->cat_ctime < cached->cat_ctime)
+		attr->cat_ctime = cached->cat_ctime;
+	if (attr->cat_mtime < cached->cat_mtime)
+		attr->cat_mtime = cached->cat_mtime;
+
+	RETURN(0);
+}
+
+/*
+ * Method tables for each supported layout type, indexed by
+ * enum lov_layout_type (lov_object::lo_type).
+ */
+static const struct lov_layout_operations lov_dispatch[] = {
+	[LLT_EMPTY] = {
+		.llo_init      = lov_init_empty,
+		.llo_delete    = lov_delete_empty,
+		.llo_fini      = lov_fini_empty,
+		.llo_install   = lov_install_empty,
+		.llo_print     = lov_print_empty,
+		.llo_page_init = lov_page_init_empty,
+		.llo_lock_init = lov_lock_init_empty,
+		.llo_io_init   = lov_io_init_empty,
+		.llo_getattr   = lov_attr_get_empty
+	},
+	[LLT_RAID0] = {
+		.llo_init      = lov_init_raid0,
+		.llo_delete    = lov_delete_raid0,
+		.llo_fini      = lov_fini_raid0,
+		.llo_install   = lov_install_raid0,
+		.llo_print     = lov_print_raid0,
+		.llo_page_init = lov_page_init_raid0,
+		.llo_lock_init = lov_lock_init_raid0,
+		.llo_io_init   = lov_io_init_raid0,
+		.llo_getattr   = lov_attr_get_raid0
+	}
+};
+
+
+/**
+ * Performs a double-dispatch based on the layout type of an object.
+ *
+ * Calls method \a op of the lov_dispatch[] entry selected by the lo_type
+ * of \a obj, passing the remaining arguments, and evaluates to the
+ * method's return value. No lock is taken here.
+ */
+#define LOV_2DISPATCH_NOLOCK(obj, op, ...)			      \
+({								      \
+	struct lov_object		      *__obj = (obj);	  \
+	enum lov_layout_type		    __llt;		  \
+									\
+	__llt = __obj->lo_type;					 \
+	LASSERT(0 <= __llt && __llt < ARRAY_SIZE(lov_dispatch));	\
+	lov_dispatch[__llt].op(__VA_ARGS__);			    \
+})
+
+/*
+ * Takes lo_type_guard for reading, unless the current task is recorded as
+ * lo_owner (i.e. it already holds the guard for writing through
+ * lov_conf_lock()).
+ */
+static inline void lov_conf_freeze(struct lov_object *lov)
+{
+	if (lov->lo_owner != current)
+		down_read(&lov->lo_type_guard);
+}
+
+/*
+ * Counterpart of lov_conf_freeze(): releases the read side of
+ * lo_type_guard, unless the current task is recorded as lo_owner.
+ */
+static inline void lov_conf_thaw(struct lov_object *lov)
+{
+	if (lov->lo_owner != current)
+		up_read(&lov->lo_type_guard);
+}
+
+/**
+ * Performs a double-dispatch based on the layout type of an object,
+ * holding the configuration read lock around the call when \a lock is
+ * non-zero. Evaluates to the return value of the dispatched method.
+ *
+ * \a obj is evaluated exactly once. The local copy is deliberately not
+ * called __obj: it is handed on to LOV_2DISPATCH_NOLOCK(), which declares
+ * its own __obj and would otherwise initialise that variable from itself.
+ */
+#define LOV_2DISPATCH_MAYLOCK(obj, op, lock, ...)			\
+({									\
+	struct lov_object *__lov = (obj);				\
+	int __lock = !!(lock);						\
+	typeof(lov_dispatch[0].op(__VA_ARGS__)) __result;		\
+									\
+	if (__lock)							\
+		lov_conf_freeze(__lov);					\
+	__result = LOV_2DISPATCH_NOLOCK(__lov, op, __VA_ARGS__);	\
+	if (__lock)							\
+		lov_conf_thaw(__lov);					\
+	__result;							\
+})
+
+/**
+ * Performs a locked double-dispatch based on the layout type of an object.
+ *
+ * Shorthand for LOV_2DISPATCH_MAYLOCK() with locking enabled.
+ */
+#define LOV_2DISPATCH(obj, op, ...)		     \
+	LOV_2DISPATCH_MAYLOCK(obj, op, 1, __VA_ARGS__)
+
+/*
+ * Locked double-dispatch for methods whose result is not used: calls
+ * method \a op of the layout selected by the lo_type of \a obj between
+ * lov_conf_freeze() and lov_conf_thaw().
+ */
+#define LOV_2DISPATCH_VOID(obj, op, ...)				\
+do {								    \
+	struct lov_object		      *__obj = (obj);	  \
+	enum lov_layout_type		    __llt;		  \
+									\
+	lov_conf_freeze(__obj);						\
+	__llt = __obj->lo_type;					 \
+	LASSERT(0 <= __llt && __llt < ARRAY_SIZE(lov_dispatch));	\
+	lov_dispatch[__llt].op(__VA_ARGS__);			    \
+	lov_conf_thaw(__obj);						\
+} while (0)
+
+/*
+ * Takes lo_type_guard for writing and records the current task as
+ * lo_owner, so that lov_conf_freeze()/lov_conf_thaw() called by the same
+ * task skip the read lock. Must not be called by a task that already owns
+ * the lock (asserted).
+ */
+static void lov_conf_lock(struct lov_object *lov)
+{
+	LASSERT(lov->lo_owner != current);
+	down_write(&lov->lo_type_guard);
+	LASSERT(lov->lo_owner == NULL);
+	lov->lo_owner = current;
+}
+
+/* Counterpart of lov_conf_lock(): clears lo_owner and releases the write
+ * side of lo_type_guard. */
+static void lov_conf_unlock(struct lov_object *lov)
+{
+	lov->lo_owner = NULL;
+	up_write(&lov->lo_type_guard);
+}
+
+/*
+ * Blocks until lo_active_ios of \a lov has dropped to zero, sleeping on
+ * lo_waitq in between checks. Always returns 0.
+ */
+static int lov_layout_wait(const struct lu_env *env, struct lov_object *lov)
+{
+	struct l_wait_info lwi = { 0 };
+	ENTRY;
+
+	while (atomic_read(&lov->lo_active_ios) > 0) {
+		CDEBUG(D_INODE, "file:"DFID" wait for active IO, now: %d.\n",
+			PFID(lu_object_fid(lov2lu(lov))),
+			atomic_read(&lov->lo_active_ios));
+
+		l_wait_event(lov->lo_waitq,
+			     atomic_read(&lov->lo_active_ios) == 0, &lwi);
+	}
+	RETURN(0);
+}
+
+/*
+ * Switches \a lov from its current layout to the one described by
+ * \a conf (LLT_RAID0 when \a conf carries a stripe md, LLT_EMPTY
+ * otherwise).
+ *
+ * The old layout is deleted and finalised, then the new one is
+ * initialised and installed. If initialising the new layout fails it is
+ * torn down again and the object is left with the LLT_EMPTY type. The
+ * work is done in a freshly acquired environment; the one passed in is
+ * not used.
+ *
+ * \retval 0		layout switched
+ * \retval other	error from cl_env_get(), the old layout's
+ *			llo_delete() or the new layout's llo_init()
+ */
+static int lov_layout_change(const struct lu_env *unused,
+			     struct lov_object *lov,
+			     const struct cl_object_conf *conf)
+{
+	const struct lov_layout_operations *old_ops;
+	const struct lov_layout_operations *new_ops;
+	union lov_layout_state	*state = &lov->u;
+	struct cl_object_header *hdr = cl_object_header(&lov->lo_cl);
+	enum lov_layout_type	 llt;
+	struct lu_env		*env;
+	void			*cookie;
+	int			 refcheck;
+	int			 result;
+	ENTRY;
+
+	LASSERT(0 <= lov->lo_type && lov->lo_type < ARRAY_SIZE(lov_dispatch));
+
+	/* only raid0 is supported. */
+	if (conf->u.coc_md != NULL && conf->u.coc_md->lsm != NULL)
+		llt = LLT_RAID0;
+	else
+		llt = LLT_EMPTY;
+	LASSERT(0 <= llt && llt < ARRAY_SIZE(lov_dispatch));
+
+	cookie = cl_env_reenter();
+	env = cl_env_get(&refcheck);
+	if (IS_ERR(env)) {
+		cl_env_reexit(cookie);
+		RETURN(PTR_ERR(env));
+	}
+
+	old_ops = &lov_dispatch[lov->lo_type];
+	new_ops = &lov_dispatch[llt];
+
+	result = old_ops->llo_delete(env, lov, state);
+	if (result != 0)
+		goto out;
+
+	old_ops->llo_fini(env, lov, state);
+
+	LASSERT(atomic_read(&lov->lo_active_ios) == 0);
+	LASSERT(hdr->coh_tree.rnode == NULL);
+	LASSERT(hdr->coh_pages == 0);
+
+	lov->lo_type = LLT_EMPTY;
+	result = new_ops->llo_init(env, lu2lov_dev(lov->lo_cl.co_lu.lo_dev),
+				   lov, conf, state);
+	if (result != 0) {
+		new_ops->llo_delete(env, lov, state);
+		new_ops->llo_fini(env, lov, state);
+		/* this file becomes an EMPTY file. */
+		goto out;
+	}
+
+	new_ops->llo_install(env, lov, state);
+	lov->lo_type = llt;
+
+out:
+	cl_env_put(env, &refcheck);
+	cl_env_reexit(cookie);
+	RETURN(result);
+}
+
+/*****************************************************************************
+ *
+ * Lov object operations.
+ *
+ */
+
+/*
+ * lu_object_operations::loo_object_init() method for lov objects.
+ *
+ * Initialises the type guard, the active IO counter and the wait queue,
+ * selects the layout type from the stripe md in \a conf (LLT_RAID0 if one
+ * is present, LLT_EMPTY otherwise) and runs that layout's llo_init() and,
+ * on success, llo_install().
+ *
+ * \retval 0		success
+ * \retval other	error returned by the layout's llo_init()
+ */
+int lov_object_init(const struct lu_env *env, struct lu_object *obj,
+		    const struct lu_object_conf *conf)
+{
+	struct lov_device		   *dev   = lu2lov_dev(obj->lo_dev);
+	struct lov_object		   *lov   = lu2lov(obj);
+	const struct cl_object_conf	   *cconf = lu2cl_conf(conf);
+	union lov_layout_state		   *state = &lov->u;
+	const struct lov_layout_operations *ops;
+	int rc;
+
+	ENTRY;
+	init_rwsem(&lov->lo_type_guard);
+	atomic_set(&lov->lo_active_ios, 0);
+	init_waitqueue_head(&lov->lo_waitq);
+
+	cl_object_page_init(lu2cl(obj), sizeof(struct lov_page));
+
+	/* no locking is necessary, as object is being created */
+	if (cconf->u.coc_md->lsm != NULL)
+		lov->lo_type = LLT_RAID0;
+	else
+		lov->lo_type = LLT_EMPTY;
+
+	ops = &lov_dispatch[lov->lo_type];
+	rc = ops->llo_init(env, dev, lov, cconf, state);
+	if (rc != 0)
+		RETURN(rc);
+
+	ops->llo_install(env, lov, state);
+	RETURN(0);
+}
+
+/*
+ * cl_object_operations::coo_conf_set() method for lov objects. Runs under
+ * the configuration write lock and handles three operations:
+ *
+ * - OBJECT_CONF_INVALIDATE: marks the layout invalid.
+ * - OBJECT_CONF_WAIT: if the layout is invalid and IO is still active,
+ *   drops the lock, waits for the IO to finish and re-takes the lock.
+ * - OBJECT_CONF_SET: installs the layout carried by \a conf. If it has
+ *   the same generation as the current one (or both are absent) the
+ *   layout is just marked valid. If IO is still active the layout is
+ *   marked invalid and -EBUSY is returned. Otherwise the layout is
+ *   changed via lov_layout_change(); the layout is marked invalid when
+ *   that fails and the error is reported to the caller.
+ *
+ * \retval 0		success
+ * \retval -EBUSY	layout must change but IO is still active
+ * \retval other	error returned by lov_layout_change()
+ */
+static int lov_conf_set(const struct lu_env *env, struct cl_object *obj,
+			const struct cl_object_conf *conf)
+{
+	struct lov_stripe_md *lsm = NULL;
+	struct lov_object *lov = cl2lov(obj);
+	int result = 0;
+	ENTRY;
+
+	lov_conf_lock(lov);
+	if (conf->coc_opc == OBJECT_CONF_INVALIDATE) {
+		lov->lo_layout_invalid = true;
+		GOTO(out, result = 0);
+	}
+
+	if (conf->coc_opc == OBJECT_CONF_WAIT) {
+		if (lov->lo_layout_invalid &&
+		    atomic_read(&lov->lo_active_ios) > 0) {
+			/* do not sleep with the write lock held */
+			lov_conf_unlock(lov);
+			result = lov_layout_wait(env, lov);
+			lov_conf_lock(lov);
+		}
+		GOTO(out, result);
+	}
+
+	LASSERT(conf->coc_opc == OBJECT_CONF_SET);
+
+	if (conf->u.coc_md != NULL)
+		lsm = conf->u.coc_md->lsm;
+	if ((lsm == NULL && lov->lo_lsm == NULL) ||
+	    (lsm != NULL && lov->lo_lsm != NULL &&
+	     lov->lo_lsm->lsm_layout_gen == lsm->lsm_layout_gen)) {
+		/* same version of layout */
+		lov->lo_layout_invalid = false;
+		GOTO(out, result = 0);
+	}
+
+	/* will change layout - check if there still exists active IO. */
+	if (atomic_read(&lov->lo_active_ios) > 0) {
+		lov->lo_layout_invalid = true;
+		GOTO(out, result = -EBUSY);
+	}
+
+	/* Keep the error code: storing it only in the boolean flag would
+	 * make a failed layout change look like success to the caller. */
+	result = lov_layout_change(env, lov, conf);
+	lov->lo_layout_invalid = result != 0;
+
+out:
+	lov_conf_unlock(lov);
+	RETURN(result);
+}
+
+/* lu_object_operations::loo_object_delete() method: dispatches to the
+ * llo_delete() method of the object's current layout under the
+ * configuration read lock. The method's return value is discarded. */
+static void lov_object_delete(const struct lu_env *env, struct lu_object *obj)
+{
+	struct lov_object *lov = lu2lov(obj);
+
+	ENTRY;
+	LOV_2DISPATCH_VOID(lov, llo_delete, env, lov, &lov->u);
+	EXIT;
+}
+
+/* lu_object_operations::loo_object_free() method: runs the llo_fini()
+ * method of the current layout, finalises the lu_object and returns the
+ * lov_object to the lov_object_kmem slab. */
+static void lov_object_free(const struct lu_env *env, struct lu_object *obj)
+{
+	struct lov_object *lov = lu2lov(obj);
+
+	ENTRY;
+	LOV_2DISPATCH_VOID(lov, llo_fini, env, lov, &lov->u);
+	lu_object_fini(obj);
+	OBD_SLAB_FREE_PTR(lov, lov_object_kmem);
+	EXIT;
+}
+
+/* lu_object_operations::loo_object_print() method: dispatches to the
+ * llo_print() method of the current layout without taking any lock. */
+static int lov_object_print(const struct lu_env *env, void *cookie,
+			    lu_printer_t p, const struct lu_object *o)
+{
+	return LOV_2DISPATCH_NOLOCK(lu2lov(o), llo_print, env, cookie, p, o);
+}
+
+/* cl_object_operations::coo_page_init() method: dispatches to the
+ * llo_page_init() method of the current layout without taking any lock. */
+int lov_page_init(const struct lu_env *env, struct cl_object *obj,
+		struct cl_page *page, struct page *vmpage)
+{
+	return LOV_2DISPATCH_NOLOCK(cl2lov(obj),
+				    llo_page_init, env, obj, page, vmpage);
+}
+
+/**
+ * Implements cl_object_operations::clo_io_init() method for lov
+ * layer. Dispatches to the appropriate layout io initialization method.
+ *
+ * The configuration read lock is held around the dispatch unless the io
+ * has ci_ignore_layout set.
+ */
+int lov_io_init(const struct lu_env *env, struct cl_object *obj,
+		struct cl_io *io)
+{
+	CL_IO_SLICE_CLEAN(lov_env_io(env), lis_cl);
+	return LOV_2DISPATCH_MAYLOCK(cl2lov(obj), llo_io_init,
+				     !io->ci_ignore_layout, env, obj, io);
+}
+
+/**
+ * An implementation of cl_object_operations::clo_attr_get() method for lov
+ * layer. For raid0 layout this collects and merges attributes of all
+ * sub-objects.
+ *
+ * Dispatches to the llo_getattr() method of the current layout and
+ * returns its result.
+ */
+static int lov_attr_get(const struct lu_env *env, struct cl_object *obj,
+			struct cl_attr *attr)
+{
+	/* do not take lock, as this function is called under a
+	 * spin-lock. Layout is protected from changing by ongoing IO. */
+	return LOV_2DISPATCH_NOLOCK(cl2lov(obj), llo_getattr, env, obj, attr);
+}
+
+/* cl_object_operations::coo_attr_set() method for the lov layer: a no-op
+ * that always returns 0. */
+static int lov_attr_set(const struct lu_env *env, struct cl_object *obj,
+			const struct cl_attr *attr, unsigned valid)
+{
+	/*
+	 * No dispatch is required here, as no layout implements this.
+	 */
+	return 0;
+}
+
+/* cl_object_operations::coo_lock_init() method: dispatches to the
+ * llo_lock_init() method of the current layout and returns its result. */
+int lov_lock_init(const struct lu_env *env, struct cl_object *obj,
+		  struct cl_lock *lock, const struct cl_io *io)
+{
+	/* No need to lock because we've taken one refcount of layout.  */
+	return LOV_2DISPATCH_NOLOCK(cl2lov(obj), llo_lock_init, env, obj, lock,
+				    io);
+}
+
+/* cl_object method table installed on every lov object by
+ * lov_object_alloc(). */
+static const struct cl_object_operations lov_ops = {
+	.coo_page_init = lov_page_init,
+	.coo_lock_init = lov_lock_init,
+	.coo_io_init   = lov_io_init,
+	.coo_attr_get  = lov_attr_get,
+	.coo_attr_set  = lov_attr_set,
+	.coo_conf_set  = lov_conf_set
+};
+
+/* lu_object method table installed on every lov object by
+ * lov_object_alloc(); the release and invariant methods are not
+ * provided. */
+static const struct lu_object_operations lov_lu_obj_ops = {
+	.loo_object_init      = lov_object_init,
+	.loo_object_delete    = lov_object_delete,
+	.loo_object_release   = NULL,
+	.loo_object_free      = lov_object_free,
+	.loo_object_print     = lov_object_print,
+	.loo_object_invariant = NULL
+};
+
+/*
+ * Allocates a lov object from the lov_object_kmem slab, initialises its
+ * lu_object on device \a dev and installs the lov method tables. The
+ * layout type is left invalid (-1) until lov_object_init() sets it.
+ *
+ * \retval the embedded lu_object on success
+ * \retval NULL if the allocation failed
+ */
+struct lu_object *lov_object_alloc(const struct lu_env *env,
+				   const struct lu_object_header *unused,
+				   struct lu_device *dev)
+{
+	struct lov_object *lov;
+	struct lu_object  *luobj;
+
+	ENTRY;
+	OBD_SLAB_ALLOC_PTR_GFP(lov, lov_object_kmem, __GFP_IO);
+	if (lov == NULL)
+		RETURN(NULL);
+
+	luobj = lov2lu(lov);
+	lu_object_init(luobj, NULL, dev);
+	lov->lo_cl.co_ops = &lov_ops;
+	lov->lo_type = -1; /* invalid, to catch uninitialized type */
+	/*
+	 * object io operation vector (cl_object::co_iop) is installed
+	 * later in lov_object_init(), as different vectors are used
+	 * for object with different layouts.
+	 */
+	luobj->lo_ops = &lov_lu_obj_ops;
+
+	RETURN(luobj);
+}
+
+/*
+ * Returns the stripe md of \a lov with an extra reference taken via
+ * lsm_addref(), or NULL if the object has none. The lookup is done under
+ * the configuration read lock.
+ */
+struct lov_stripe_md *lov_lsm_addref(struct lov_object *lov)
+{
+	struct lov_stripe_md *lsm = NULL;
+
+	lov_conf_freeze(lov);
+	if (lov->lo_lsm != NULL) {
+		lsm = lsm_addref(lov->lo_lsm);
+		CDEBUG(D_INODE, "lsm %p addref %d/%d by %p.\n",
+			lsm, atomic_read(&lsm->lsm_refc),
+			lov->lo_layout_invalid, current);
+	}
+	lov_conf_thaw(lov);
+	return lsm;
+}
+
+/*
+ * Counterpart of lov_lsm_addref(): logs the current reference count and
+ * hands \a lsm to lov_free_memmd(). A NULL \a lsm is ignored; \a lov is
+ * not used.
+ */
+void lov_lsm_decref(struct lov_object *lov, struct lov_stripe_md *lsm)
+{
+	if (lsm == NULL)
+		return;
+
+	CDEBUG(D_INODE, "lsm %p decref %d by %p.\n",
+		lsm, atomic_read(&lsm->lsm_refc), current);
+
+	lov_free_memmd(&lsm);
+}
+
+/*
+ * Finds the lov layer of \a clobj and returns its stripe md with an extra
+ * reference (see lov_lsm_addref()). Returns NULL if \a clobj is NULL, has
+ * no lov layer, or the lov object has no stripe md.
+ */
+struct lov_stripe_md *lov_lsm_get(struct cl_object *clobj)
+{
+	struct lu_object *lov_layer;
+
+	if (clobj == NULL)
+		return NULL;
+
+	lov_layer = lu_object_locate(&cl_object_header(clobj)->coh_lu,
+				     &lov_device_type);
+	if (lov_layer == NULL)
+		return NULL;
+
+	return lov_lsm_addref(lu2lov(lov_layer));
+}
+EXPORT_SYMBOL(lov_lsm_get);
+
+/* Counterpart of lov_lsm_get(): hands a non-NULL \a lsm to
+ * lov_free_memmd(). The object argument is not used. */
+void lov_lsm_put(struct cl_object *unused, struct lov_stripe_md *lsm)
+{
+	if (lsm != NULL)
+		lov_free_memmd(&lsm);
+}
+EXPORT_SYMBOL(lov_lsm_put);
+
+/*
+ * Collects and resets the asynchronous error codes (loi_ar.ar_rc) stored
+ * per stripe in the lov layer of \a clob.
+ *
+ * For a raid0 layout every stripe's ar_rc is cleared and the first
+ * non-zero value encountered is returned. For an empty layout, or when
+ * \a clob has no lov layer, there is nothing to collect.
+ *
+ * \retval 0		no error was recorded
+ * \retval other	first non-zero ar_rc found
+ */
+int lov_read_and_clear_async_rc(struct cl_object *clob)
+{
+	struct lu_object *luobj;
+	int rc = 0;
+	ENTRY;
+
+	luobj = lu_object_locate(&cl_object_header(clob)->coh_lu,
+				 &lov_device_type);
+	if (luobj != NULL) {
+		struct lov_object *lov = lu2lov(luobj);
+
+		/* keep the layout stable while walking the stripes */
+		lov_conf_freeze(lov);
+		switch (lov->lo_type) {
+		case LLT_RAID0: {
+			struct lov_stripe_md *lsm;
+			int i;
+
+			lsm = lov->lo_lsm;
+			LASSERT(lsm != NULL);
+			for (i = 0; i < lsm->lsm_stripe_count; i++) {
+				struct lov_oinfo *loi = lsm->lsm_oinfo[i];
+
+				if (loi->loi_ar.ar_rc && !rc)
+					rc = loi->loi_ar.ar_rc;
+				loi->loi_ar.ar_rc = 0;
+			}
+			break;
+		}
+		case LLT_EMPTY:
+			break;
+		default:
+			LBUG();
+		}
+		lov_conf_thaw(lov);
+	}
+	RETURN(rc);
+}
+EXPORT_SYMBOL(lov_read_and_clear_async_rc);
+
+/** @} lov */

diff --git a/drivers/staging/lustre/lustre/lov/lov_offset.c b/drivers/staging/lustre/lustre/lov/lov_offset.c
new file mode 100644
index 0000000..f62b7e5
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lov/lov_offset.c

@@ -0,0 +1,267 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include <linux/libcfs/libcfs.h>
+
+#include <obd_class.h>
+#include <obd_lov.h>
+
+#include "lov_internal.h"
+
+/**
+ * Compute the file (LOV) size implied by an object of \a ost_size bytes
+ * stored in stripe \a stripeno.
+ *
+ * \param lsm      stripe metadata; supplies the stripe size and magic
+ * \param ost_size size of the OST object
+ * \param stripeno index of the stripe the object belongs to
+ *
+ * \retval 0 if \a ost_size is 0, otherwise the corresponding file size
+ */
+obd_size lov_stripe_size(struct lov_stripe_md *lsm, obd_size ost_size,
+			 int stripeno)
+{
+	unsigned long ssize = lsm->lsm_stripe_size;
+	unsigned long stripe_size;
+	obd_off swidth;
+	obd_size lov_size;
+	int magic = lsm->lsm_magic;
+	ENTRY;
+
+	if (ost_size == 0)
+		RETURN(0);
+
+	LASSERT(lsm_op_find(magic) != NULL);
+	lsm_op_find(magic)->lsm_stripe_by_index(lsm, &stripeno, NULL, &swidth);
+
+	/* lov_do_div64(a, b) returns a % b, and a = a / b */
+	stripe_size = lov_do_div64(ost_size, ssize);
+
+	/* Widen the stripe index before multiplying, as lov_stripe_offset()
+	 * does: "int * unsigned long" is evaluated in unsigned long and
+	 * would wrap wherever unsigned long is narrower than obd_size. */
+	if (stripe_size)
+		lov_size = ost_size * swidth + (obd_size)stripeno * ssize +
+			   stripe_size;
+	else
+		lov_size = (ost_size - 1) * swidth +
+			   (obd_size)(stripeno + 1) * ssize;
+
+	RETURN(lov_size);
+}
+
+/* we have an offset in file backed by an lov and want to find out where
+ * that offset lands in our given stripe of the file.  for the easy
+ * case where the offset is within the stripe, we just have to scale the
+ * offset down to make it relative to the stripe instead of the lov.
+ *
+ * the harder case is what to do when the offset doesn't intersect the
+ * stripe.  callers will want start offsets clamped ahead to the start
+ * of the nearest stripe in the file.  end offsets similarly clamped to the
+ * nearest ending byte of a stripe in the file:
+ *
+ * all this function does is move offsets to the nearest region of the
+ * stripe, and it does its work "mod" the full length of all the stripes.
+ * consider a file with 3 stripes:
+ *
+ *	     S					      E
+ * ---------------------------------------------------------------------
+ * |    0    |     1     |     2     |    0    |     1     |     2     |
+ * ---------------------------------------------------------------------
+ *
+ * to find stripe 1's offsets for S and E, it divides by the full stripe
+ * width and does its math in the context of a single set of stripes:
+ *
+ *	     S	 E
+ * -----------------------------------
+ * |    0    |     1     |     2     |
+ * -----------------------------------
+ *
+ * it'll notice that E is outside stripe 1 and clamp it to the end of the
+ * stripe, then multiply it back out by lov_off to give the real offsets in
+ * the stripe:
+ *
+ *   S		   E
+ * ---------------------------------------------------------------------
+ * |    1    |     1     |     1     |    1    |     1     |     1     |
+ * ---------------------------------------------------------------------
+ *
+ * it would have done similarly and pulled S forward to the start of a 1
+ * stripe if, say, S had landed in a 0 stripe.
+ *
+ * this rounding isn't always correct.  consider an E lov offset that lands
+ * on a 0 stripe, the "mod stripe width" math will pull it forward to the
+ * start of a 1 stripe, when in fact it wanted to be rounded back to the end
+ * of a previous 1 stripe.  this logic is handled by callers and this is why:
+ *
+ * this function returns < 0 when the offset was "before" the stripe and
+ * was moved forward to the start of the stripe in question;  0 when it
+ * falls in the stripe and no shifting was done; > 0 when the offset
+ * was outside the stripe and was pulled back to its final byte. */
+int lov_stripe_offset(struct lov_stripe_md *lsm, obd_off lov_off,
+		      int stripeno, obd_off *obdoff)
+{
+	unsigned long ssize = lsm->lsm_stripe_size;
+	int magic = lsm->lsm_magic;
+	obd_off in_width, stripe_start, swidth;
+	int side;
+
+	/* EOF is passed through unchanged */
+	if (lov_off == OBD_OBJECT_EOF) {
+		*obdoff = OBD_OBJECT_EOF;
+		return 0;
+	}
+
+	LASSERT(lsm_op_find(magic) != NULL);
+
+	lsm_op_find(magic)->lsm_stripe_by_index(lsm, &stripeno, &lov_off,
+						&swidth);
+
+	/* lov_do_div64(a, b) returns a % b, and a = a / b: lov_off now
+	 * counts whole stripe widths, in_width is the remainder */
+	in_width = lov_do_div64(lov_off, swidth);
+
+	stripe_start = (obd_off)stripeno * ssize;
+	if (in_width < stripe_start) {
+		/* before our stripe: clamp forward to its first byte */
+		in_width = 0;
+		side = -1;
+	} else if (in_width - stripe_start >= ssize) {
+		/* past our stripe: clamp back to its end */
+		in_width = ssize;
+		side = 1;
+	} else {
+		/* inside our stripe: make the offset stripe-relative */
+		in_width -= stripe_start;
+		side = 0;
+	}
+
+	*obdoff = lov_off * ssize + in_width;
+	return side;
+}
+
+/* Given a whole-file size and a stripe number, give the file size which
+ * corresponds to the individual object of that stripe.
+ *
+ * This behaves basically in the same way as lov_stripe_offset, except that
+ * file sizes falling before the beginning of a stripe are clamped to the end
+ * of the previous stripe, not the beginning of the next:
+ *
+ *					       S
+ * ---------------------------------------------------------------------
+ * |    0    |     1     |     2     |    0    |     1     |     2     |
+ * ---------------------------------------------------------------------
+ *
+ * if clamped to stripe 2 becomes:
+ *
+ *				   S
+ * ---------------------------------------------------------------------
+ * |    0    |     1     |     2     |    0    |     1     |     2     |
+ * ---------------------------------------------------------------------
+ */
+obd_off lov_size_to_stripe(struct lov_stripe_md *lsm, obd_off file_size,
+			   int stripeno)
+{
+	unsigned long ssize = lsm->lsm_stripe_size;
+	int magic = lsm->lsm_magic;
+	obd_off in_width, stripe_start, swidth;
+
+	/* EOF is passed through unchanged */
+	if (file_size == OBD_OBJECT_EOF)
+		return OBD_OBJECT_EOF;
+
+	LASSERT(lsm_op_find(magic) != NULL);
+	lsm_op_find(magic)->lsm_stripe_by_index(lsm, &stripeno, &file_size,
+						&swidth);
+
+	/* lov_do_div64(a, b) returns a % b, and a = a / b */
+	in_width = lov_do_div64(file_size, swidth);
+
+	stripe_start = (obd_off)stripeno * ssize;
+	if (in_width >= stripe_start) {
+		/* At or past the start of this stripe: make the offset
+		 * stripe-relative and clamp it to the end of the stripe */
+		in_width -= stripe_start;
+		if (in_width >= ssize)
+			in_width = ssize;
+	} else if (file_size > 0) {
+		/* Before this stripe: move to the end of the previous one */
+		file_size--;
+		in_width = ssize;
+	} else {
+		/* Before this stripe and no previous stripe width: zero */
+		in_width = 0;
+	}
+
+	return file_size * ssize + in_width;
+}
+
+/* given an extent in an lov and a stripe, calculate the extent of the stripe
+ * that is contained within the lov extent.  this returns true if the given
+ * stripe does intersect with the lov extent. */
+int lov_stripe_intersects(struct lov_stripe_md *lsm, int stripeno,
+			  obd_off start, obd_off end,
+			  obd_off *obd_start, obd_off *obd_end)
+{
+	int start_side = lov_stripe_offset(lsm, start, stripeno, obd_start);
+	int end_side = lov_stripe_offset(lsm, end, stripeno, obd_end);
+
+	CDEBUG(D_INODE, "["LPU64"->"LPU64"] -> [(%d) "LPU64"->"LPU64" (%d)]\n",
+	       start, end, start_side, *obd_start, *obd_end, end_side);
+
+	/* Neither endpoint fell inside the stripe and both were clamped to
+	 * the same value: this stripe does not intersect the file extent. */
+	if (start_side != 0 && end_side != 0 && *obd_start == *obd_end)
+		return 0;
+
+	/* As described in the lov_stripe_offset commentary, a clamped end
+	 * may have been shifted in the wrong direction (it was before the
+	 * stripe when viewed through the "mod stripe size" math), so it is
+	 * touched up here.  The original note says this cannot underflow
+	 * because end must be > start once the check above has passed;
+	 * nothing in this function asserts that. */
+	if (end_side != 0)
+		--*obd_end;
+
+	return 1;
+}
+
+/* Return the index of the stripe that file offset "lov_off" falls into */
+int lov_stripe_number(struct lov_stripe_md *lsm, obd_off lov_off)
+{
+	unsigned long ssize = lsm->lsm_stripe_size;
+	int magic = lsm->lsm_magic;
+	obd_off in_width, swidth;
+
+	LASSERT(lsm_op_find(magic) != NULL);
+	lsm_op_find(magic)->lsm_stripe_by_offset(lsm, NULL, &lov_off, &swidth);
+
+	/* remainder of lov_off / swidth: offset inside one stripe width */
+	in_width = lov_do_div64(lov_off, swidth);
+
+	/* the division leaves the quotient, in_width / ssize, in in_width */
+	lov_do_div64(in_width, ssize);
+
+	return in_width;
+}

diff --git a/drivers/staging/lustre/lustre/lov/lov_pack.c b/drivers/staging/lustre/lustre/lov/lov_pack.c
new file mode 100644
index 0000000..492948a
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lov/lov_pack.c

@@ -0,0 +1,678 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lov/lov_pack.c
+ *
+ * (Un)packing of OST/MDS requests
+ *
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include <lustre_net.h>
+#include <obd.h>
+#include <obd_lov.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre/lustre_user.h>
+
+#include "lov_internal.h"
+
+/* Log, at debug mask "level", the header fields of an on-disk LOV EA
+ * after converting each of them from little-endian. */
+void lov_dump_lmm_common(int level, void *lmmp)
+{
+	struct lov_mds_md *hdr = lmmp;
+	struct ost_id	oi;
+
+	lmm_oi_le_to_cpu(&oi, &hdr->lmm_oi);
+
+	CDEBUG(level, "objid "DOSTID", magic 0x%08x, pattern %#x\n",
+	       POSTID(&oi),
+	       le32_to_cpu(hdr->lmm_magic),
+	       le32_to_cpu(hdr->lmm_pattern));
+
+	CDEBUG(level, "stripe_size %u, stripe_count %u, layout_gen %u\n",
+	       le32_to_cpu(hdr->lmm_stripe_size),
+	       le16_to_cpu(hdr->lmm_stripe_count),
+	       le16_to_cpu(hdr->lmm_layout_gen));
+}
+
+/* Log one line per stripe object of an on-disk LOV EA.  A count above
+ * LOV_V1_INSANE_STRIPE_COUNT is reported and the array is not walked. */
+static void lov_dump_lmm_objects(int level, struct lov_ost_data *lod,
+				 int stripe_count)
+{
+	int idx;
+
+	if (stripe_count > LOV_V1_INSANE_STRIPE_COUNT) {
+		CDEBUG(level, "bad stripe_count %u > max_stripe_count %u\n",
+		       stripe_count, LOV_V1_INSANE_STRIPE_COUNT);
+		return;
+	}
+
+	for (idx = 0; idx < stripe_count; idx++) {
+		struct lov_ost_data *cur = &lod[idx];
+		struct ost_id	oi;
+
+		ostid_le_to_cpu(&cur->l_ost_oi, &oi);
+		CDEBUG(level, "stripe %u idx %u subobj "DOSTID"\n", idx,
+		       le32_to_cpu(cur->l_ost_idx), POSTID(&oi));
+	}
+}
+
+/* Dump a v1 LOV EA: the common header, then the per-stripe objects. */
+void lov_dump_lmm_v1(int level, struct lov_mds_md_v1 *lmm)
+{
+	int stripe_count;
+
+	lov_dump_lmm_common(level, lmm);
+
+	stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
+	lov_dump_lmm_objects(level, lmm->lmm_objects, stripe_count);
+}
+
+/* Dump a v3 LOV EA: the common header, the pool name, then the
+ * per-stripe objects. */
+void lov_dump_lmm_v3(int level, struct lov_mds_md_v3 *lmm)
+{
+	int stripe_count;
+
+	lov_dump_lmm_common(level, lmm);
+	CDEBUG(level,"pool_name "LOV_POOLNAMEF"\n", lmm->lmm_pool_name);
+
+	stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
+	lov_dump_lmm_objects(level, lmm->lmm_objects, stripe_count);
+}
+
+/**
+ * Dump an on-disk LOV EA at debug mask \a level, dispatching on its magic
+ * to the v1 or v3 dump routine.  An unrecognized magic is reported with
+ * CERROR and nothing else is dumped.
+ */
+void lov_dump_lmm(int level, void *lmm)
+{
+	int magic;
+
+	/* lmm_magic is stored little-endian (lov_dump_lmm_common() converts
+	 * it with le32_to_cpu() before printing), so convert it before
+	 * comparing against the CPU-order LOV_MAGIC_* constants. */
+	magic = le32_to_cpu(((struct lov_mds_md_v1 *)(lmm))->lmm_magic);
+	switch (magic) {
+	case LOV_MAGIC_V1:
+		lov_dump_lmm_v1(level, (struct lov_mds_md_v1 *)(lmm));
+		break;
+	case LOV_MAGIC_V3:
+		lov_dump_lmm_v3(level, (struct lov_mds_md_v3 *)(lmm));
+		break;
+	default:
+		CERROR("Cannot recognize lmm_magic %x\n", magic);
+		break;
+	}
+}
+
+/* Assert "test"; when it fails, first dump the LOV EA at D_ERROR.
+ * The macro refers to a variable named "lmm" that must be in scope at the
+ * point of use, and "test" is evaluated twice. */
+#define LMM_ASSERT(test)						\
+do {								    \
+	if (!(test)) lov_dump_lmm(D_ERROR, lmm);			\
+	LASSERT(test); /* so we know what assertion failed */	   \
+} while(0)
+
+/* Pack LOV object metadata for disk storage.  It is packed in LE byte
+ * order and is opaque to the networking layer.
+ *
+ * The arguments select one of several modes:
+ *  - lmmp == NULL: nothing is packed, the EA size in bytes is returned.
+ *    If lsm is given, lsm->lsm_stripe_count is clamped through
+ *    lov_get_stripecnt() as a side effect.
+ *  - *lmmp != NULL and lsm == NULL: the buffer is freed, *lmmp is set to
+ *    NULL and 0 is returned.
+ *  - *lmmp == NULL: a buffer of the computed size is allocated into *lmmp.
+ *    With lsm == NULL only the magic is written into it.
+ *  - otherwise lsm is packed into *lmmp.
+ *
+ * Returns the EA size on success (0 for the free case), -EINVAL for an
+ * unknown magic, -ENOMEM if the allocation fails, and -E2BIG if the pool
+ * name does not fit.
+ *
+ * XXX In the future, this will be enhanced to get the EA size from the
+ *     underlying OSC device(s) to get their EA sizes so we can stack
+ *     LOVs properly.  For now lov_mds_md_size() just assumes one obd_id
+ *     per stripe.
+ */
+int lov_packmd(struct obd_export *exp, struct lov_mds_md **lmmp,
+	       struct lov_stripe_md *lsm)
+{
+	struct obd_device *obd = class_exp2obd(exp);
+	struct lov_obd *lov = &obd->u.lov;
+	struct lov_mds_md_v1 *lmmv1;
+	struct lov_mds_md_v3 *lmmv3;
+	__u16 stripe_count;
+	struct lov_ost_data_v1 *lmm_objects;
+	int lmm_size, lmm_magic;
+	int i;
+	int cplen = 0;
+	ENTRY;
+
+	/* Pick the magic: from lsm if given, else from an existing on-disk
+	 * buffer, else the default LOV_MAGIC. */
+	if (lsm) {
+		lmm_magic = lsm->lsm_magic;
+	} else {
+		if (lmmp && *lmmp)
+			lmm_magic = le32_to_cpu((*lmmp)->lmm_magic);
+		else
+			/* lsm == NULL and lmmp == NULL */
+			lmm_magic = LOV_MAGIC;
+	}
+
+	if ((lmm_magic != LOV_MAGIC_V1) &&
+	    (lmm_magic != LOV_MAGIC_V3)) {
+		CERROR("bad mem LOV MAGIC: 0x%08X != 0x%08X nor 0x%08X\n",
+			lmm_magic, LOV_MAGIC_V1, LOV_MAGIC_V3);
+		RETURN(-EINVAL);
+
+	}
+
+	if (lsm) {
+		/* If we are just sizing the EA, limit the stripe count
+		 * to the actual number of OSTs in this filesystem. */
+		if (!lmmp) {
+			stripe_count = lov_get_stripecnt(lov, lmm_magic,
+							 lsm->lsm_stripe_count);
+			/* note: the caller's lsm is modified here */
+			lsm->lsm_stripe_count = stripe_count;
+		} else {
+			stripe_count = lsm->lsm_stripe_count;
+		}
+	} else {
+		/* No need to allocate more than maximum supported stripes.
+		 * Anyway, this is pretty inaccurate since ld_tgt_count now
+		 * represents max index and we should rely on the actual number
+		 * of OSTs instead */
+		stripe_count = lov_mds_md_stripecnt(lov->lov_ocd.ocd_max_easize,
+						    lmm_magic);
+		if (stripe_count > lov->desc.ld_tgt_count)
+			stripe_count = lov->desc.ld_tgt_count;
+	}
+
+	/* XXX LOV STACKING call into osc for sizes */
+	lmm_size = lov_mds_md_size(stripe_count, lmm_magic);
+
+	/* Sizing-only request */
+	if (!lmmp)
+		RETURN(lmm_size);
+
+	/* Free request: the size to free is recomputed from the stripe
+	 * count stored in the buffer itself */
+	if (*lmmp && !lsm) {
+		stripe_count = le16_to_cpu((*lmmp)->lmm_stripe_count);
+		lmm_size = lov_mds_md_size(stripe_count, lmm_magic);
+		OBD_FREE_LARGE(*lmmp, lmm_size);
+		*lmmp = NULL;
+		RETURN(0);
+	}
+
+	if (!*lmmp) {
+		OBD_ALLOC_LARGE(*lmmp, lmm_size);
+		if (!*lmmp)
+			RETURN(-ENOMEM);
+	}
+
+	CDEBUG(D_INFO, "lov_packmd: LOV_MAGIC 0x%08X, lmm_size = %d \n",
+	       lmm_magic, lmm_size);
+
+	lmmv1 = *lmmp;
+	lmmv3 = (struct lov_mds_md_v3 *)*lmmp;
+	if (lmm_magic == LOV_MAGIC_V3)
+		lmmv3->lmm_magic = cpu_to_le32(LOV_MAGIC_V3);
+	else
+		lmmv1->lmm_magic = cpu_to_le32(LOV_MAGIC_V1);
+
+	/* Allocate-only request: only the magic has been filled in */
+	if (!lsm)
+		RETURN(lmm_size);
+
+	/* lmmv1 and lmmv3 point to the same struct and have the
+	 * same first fields
+	 */
+	lmm_oi_cpu_to_le(&lmmv1->lmm_oi, &lsm->lsm_oi);
+	lmmv1->lmm_stripe_size = cpu_to_le32(lsm->lsm_stripe_size);
+	lmmv1->lmm_stripe_count = cpu_to_le16(stripe_count);
+	lmmv1->lmm_pattern = cpu_to_le32(lsm->lsm_pattern);
+	lmmv1->lmm_layout_gen = cpu_to_le16(lsm->lsm_layout_gen);
+	if (lsm->lsm_magic == LOV_MAGIC_V3) {
+		cplen = strlcpy(lmmv3->lmm_pool_name, lsm->lsm_pool_name,
+				sizeof(lmmv3->lmm_pool_name));
+		/* NOTE(review): if *lmmp was allocated above, it is left
+		 * allocated on this error return - confirm callers free it */
+		if (cplen >= sizeof(lmmv3->lmm_pool_name))
+			RETURN(-E2BIG);
+		lmm_objects = lmmv3->lmm_objects;
+	} else {
+		lmm_objects = lmmv1->lmm_objects;
+	}
+
+	/* Convert each stripe's object id, generation and OST index */
+	for (i = 0; i < stripe_count; i++) {
+		struct lov_oinfo *loi = lsm->lsm_oinfo[i];
+		/* XXX LOV STACKING call down to osc_packmd() to do packing */
+		LASSERTF(ostid_id(&loi->loi_oi) != 0, "lmm_oi "DOSTID
+			 " stripe %u/%u idx %u\n", POSTID(&lmmv1->lmm_oi),
+			 i, stripe_count, loi->loi_ost_idx);
+		ostid_cpu_to_le(&loi->loi_oi, &lmm_objects[i].l_ost_oi);
+		lmm_objects[i].l_ost_gen = cpu_to_le32(loi->loi_ost_gen);
+		lmm_objects[i].l_ost_idx = cpu_to_le32(loi->loi_ost_idx);
+	}
+
+	RETURN(lmm_size);
+}
+
+/* Clamp a requested stripe count to what this LOV can use.  A request of
+ * 0 selects the default from the LOV descriptor; the result is limited by
+ * the active target count, is at least 1 before the final clamp, and is
+ * capped by the number of stripes that fit in the EA. */
+__u16 lov_get_stripecnt(struct lov_obd *lov, __u32 magic, __u16 stripe_count)
+{
+	__u32 max_stripes;
+
+	if (stripe_count == 0)
+		stripe_count = lov->desc.ld_default_stripe_count;
+	if (stripe_count > lov->desc.ld_active_tgt_count)
+		stripe_count = lov->desc.ld_active_tgt_count;
+	if (stripe_count == 0)
+		stripe_count = 1;
+
+	/* stripe count is based on whether ldiskfs can handle
+	 * larger EA sizes */
+	if ((lov->lov_ocd.ocd_connect_flags & OBD_CONNECT_MAX_EASIZE) &&
+	    lov->lov_ocd.ocd_max_easize)
+		max_stripes = lov_mds_md_stripecnt(lov->lov_ocd.ocd_max_easize,
+						   magic);
+	else
+		max_stripes = LOV_MAX_STRIPE_COUNT_OLD;
+
+	if (stripe_count > max_stripes)
+		stripe_count = max_stripes;
+
+	return stripe_count;
+}
+
+
+/**
+ * Validate an on-disk LOV EA.
+ *
+ * If the magic at the start of \a lmm has no registered operations, the
+ * whole buffer is hex-dumped through CERROR and -EINVAL is returned.
+ * Otherwise the magic-specific lsm_lmm_verify() method decides.
+ *
+ * \param lmm          on-disk EA; its first 32 bits are the LE magic
+ * \param lmm_bytes    size of \a lmm in bytes
+ * \param stripe_count passed through to lsm_lmm_verify()
+ *
+ * \retval -EINVAL for an unknown magic, otherwise the result of
+ *	   lsm_lmm_verify()
+ */
+static int lov_verify_lmm(void *lmm, int lmm_bytes, __u16 *stripe_count)
+{
+	__u32 magic = le32_to_cpu(*(__u32 *)lmm);
+	int rc;
+
+	if (lsm_op_find(magic) == NULL) {
+		const unsigned char *raw = lmm;
+		char *buffer;
+		int sz;
+
+		CERROR("bad disk LOV MAGIC: 0x%08X; dumping LMM (size=%d):\n",
+		       magic, lmm_bytes);
+		sz = lmm_bytes * 2 + 1;
+		OBD_ALLOC_LARGE(buffer, sz);
+		if (buffer != NULL) {
+			int i;
+
+			/* Read the bytes as unsigned char: a negative plain
+			 * char is promoted to a negative int, which "%.2X"
+			 * prints as eight hex digits and so overruns the two
+			 * characters reserved per byte. */
+			for (i = 0; i < lmm_bytes; i++)
+				sprintf(buffer + 2 * i, "%.2X", raw[i]);
+			buffer[sz - 1] = '\0';
+			CERROR("%s\n", buffer);
+			OBD_FREE_LARGE(buffer, sz);
+		}
+		return -EINVAL;
+	}
+	rc = lsm_op_find(magic)->lsm_lmm_verify(lmm, lmm_bytes, stripe_count);
+	return rc;
+}
+
+/**
+ * Allocate and initialize an in-memory stripe descriptor.
+ *
+ * \a *lsmp receives the result of lsm_alloc_plain() (NULL on failure).
+ * On success the descriptor starts with one reference, an empty pool
+ * name, layout generation 0, the first stripe's OST index set to ~0, and
+ * loi_init() applied to each of the \a stripe_count stripes.
+ *
+ * \retval size reported by lsm_alloc_plain() on success
+ * \retval -ENOMEM if the allocation fails
+ */
+int lov_alloc_memmd(struct lov_stripe_md **lsmp, __u16 stripe_count,
+		    int pattern, int magic)
+{
+	struct lov_stripe_md *lsm;
+	int idx, lsm_size;
+	ENTRY;
+
+	CDEBUG(D_INFO, "alloc lsm, stripe_count %d\n", stripe_count);
+
+	lsm = lsm_alloc_plain(stripe_count, &lsm_size);
+	*lsmp = lsm;
+	if (lsm == NULL) {
+		CERROR("can't allocate lsmp stripe_count %d\n", stripe_count);
+		RETURN(-ENOMEM);
+	}
+
+	atomic_set(&lsm->lsm_refc, 1);
+	spin_lock_init(&lsm->lsm_lock);
+	lsm->lsm_magic = magic;
+	lsm->lsm_stripe_count = stripe_count;
+	lsm->lsm_maxbytes = LUSTRE_STRIPE_MAXBYTES * stripe_count;
+	lsm->lsm_pattern = pattern;
+	lsm->lsm_pool_name[0] = '\0';
+	lsm->lsm_layout_gen = 0;
+	lsm->lsm_oinfo[0]->loi_ost_idx = ~0;
+
+	for (idx = 0; idx < stripe_count; idx++)
+		loi_init(lsm->lsm_oinfo[idx]);
+
+	RETURN(lsm_size);
+}
+
+/**
+ * Drop one reference on \a *lsmp and clear the caller's pointer.
+ *
+ * When the count reaches zero the descriptor is released through the
+ * lsm_free() method of its magic.
+ *
+ * \retval reference count remaining after the decrement
+ */
+int lov_free_memmd(struct lov_stripe_md **lsmp)
+{
+	struct lov_stripe_md *lsm = *lsmp;
+	int remaining;
+
+	*lsmp = NULL;
+	LASSERT(atomic_read(&lsm->lsm_refc) > 0);
+
+	remaining = atomic_dec_return(&lsm->lsm_refc);
+	if (remaining != 0)
+		return remaining;
+
+	/* last reference: free through the magic-specific operations */
+	LASSERT(lsm_op_find(lsm->lsm_magic) != NULL);
+	lsm_op_find(lsm->lsm_magic)->lsm_free(lsm);
+	return remaining;
+}
+
+
+/* Unpack LOV object metadata from disk storage.  It is packed in LE byte
+ * order and is opaque to the networking layer.
+ */
+int lov_unpackmd(struct obd_export *exp,  struct lov_stripe_md **lsmp,
+		 struct lov_mds_md *lmm, int lmm_bytes)
+{
+	struct obd_device *obd = class_exp2obd(exp);
+	struct lov_obd *lov = &obd->u.lov;
+	int rc = 0, lsm_size;
+	__u16 stripe_count;
+	__u32 magic;
+	ENTRY;
+
+	if (lmm == NULL) {
+		/* nothing to unpack: fall back to the defaults */
+		magic = LOV_MAGIC;
+		stripe_count = lov_get_stripecnt(lov, magic, 0);
+	} else {
+		/* take magic and stripe count from the verified EA */
+		rc = lov_verify_lmm(lmm, lmm_bytes, &stripe_count);
+		if (rc != 0)
+			RETURN(rc);
+		magic = le32_to_cpu(lmm->lmm_magic);
+	}
+
+	/* If we aren't passed an lsmp struct, we just want the size */
+	if (lsmp == NULL) {
+		/* XXX LOV STACKING call into osc for sizes */
+		LBUG();
+		RETURN(lov_stripe_md_size(stripe_count));
+	}
+
+	/* An allocated struct but nothing to unpack means "free it" */
+	if (*lsmp != NULL && lmm == NULL) {
+		lov_free_memmd(lsmp);
+		RETURN(0);
+	}
+
+	lsm_size = lov_alloc_memmd(lsmp, stripe_count, LOV_PATTERN_RAID0,
+				   magic);
+	if (lsm_size < 0)
+		RETURN(lsm_size);
+
+	/* A pointer but nothing to unpack means "allocate only" */
+	if (lmm == NULL)
+		RETURN(lsm_size);
+
+	LASSERT(lsm_op_find(magic) != NULL);
+	rc = lsm_op_find(magic)->lsm_unpackmd(lov, *lsmp, lmm);
+	if (rc != 0) {
+		/* undo the allocation made above */
+		lov_free_memmd(lsmp);
+		RETURN(rc);
+	}
+
+	RETURN(lsm_size);
+}
+
+/**
+ * Build an in-memory stripe descriptor from a user striping request.
+ *
+ * The request is copied (and byte-swapped if needed) by
+ * lov_lum_swab_if_needed(), then validated: the pattern must be RAID0
+ * (0 selects the LOV default), a stripe size that is not a multiple of
+ * LOV_MIN_STRIPE_SIZE is replaced by LOV_MIN_STRIPE_SIZE, and the stripe
+ * offset must be below ld_tgt_count or be -1.  The stripe count is
+ * clamped by lov_get_stripecnt(), by \a max_lmm_size when that is
+ * non-zero, and by the pool's target count for a v3 request naming an
+ * existing pool.
+ *
+ * \param exp          LOV export
+ * \param max_lmm_size largest EA size allowed, or 0 for no limit
+ * \param lsmp         receives the newly allocated descriptor
+ * \param lump         user request
+ *
+ * \retval 0 on success
+ * \retval -EINVAL for a bad pattern, stripe offset or pool index
+ * \retval -E2BIG if the pool name does not fit; the descriptor is
+ *	   released and \a *lsmp is NULL in that case
+ * \retval other negative errno from lov_lum_swab_if_needed() or
+ *	   lov_alloc_memmd()
+ */
+static int __lov_setstripe(struct obd_export *exp, int max_lmm_size,
+			   struct lov_stripe_md **lsmp,
+			   struct lov_user_md *lump)
+{
+	struct obd_device *obd = class_exp2obd(exp);
+	struct lov_obd *lov = &obd->u.lov;
+	char buffer[sizeof(struct lov_user_md_v3)];
+	struct lov_user_md_v3 *lumv3 = (struct lov_user_md_v3 *)&buffer[0];
+	struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&buffer[0];
+	int lmm_magic;
+	__u16 stripe_count;
+	int rc;
+	int cplen = 0;
+	ENTRY;
+
+	rc = lov_lum_swab_if_needed(lumv3, &lmm_magic, lump);
+	if (rc)
+		RETURN(rc);
+
+	/* in the rest of the tests, as *lumv1 and lumv3 have the same
+	 * fields, we use lumv1 to avoid code duplication */
+
+	if (lumv1->lmm_pattern == 0) {
+		lumv1->lmm_pattern = lov->desc.ld_pattern ?
+			lov->desc.ld_pattern : LOV_PATTERN_RAID0;
+	}
+
+	if (lumv1->lmm_pattern != LOV_PATTERN_RAID0) {
+		CDEBUG(D_IOCTL, "bad userland stripe pattern: %#x\n",
+		       lumv1->lmm_pattern);
+		RETURN(-EINVAL);
+	}
+
+	/* 64kB is the largest common page size we see (ia64), and matches the
+	 * check in lfs */
+	if (lumv1->lmm_stripe_size & (LOV_MIN_STRIPE_SIZE - 1)) {
+		CDEBUG(D_IOCTL, "stripe size %u not multiple of %u, fixing\n",
+		       lumv1->lmm_stripe_size, LOV_MIN_STRIPE_SIZE);
+		lumv1->lmm_stripe_size = LOV_MIN_STRIPE_SIZE;
+	}
+
+	if ((lumv1->lmm_stripe_offset >= lov->desc.ld_tgt_count) &&
+	    (lumv1->lmm_stripe_offset !=
+	     (typeof(lumv1->lmm_stripe_offset))(-1))) {
+		CDEBUG(D_IOCTL, "stripe offset %u > number of OSTs %u\n",
+		       lumv1->lmm_stripe_offset, lov->desc.ld_tgt_count);
+		RETURN(-EINVAL);
+	}
+	stripe_count = lov_get_stripecnt(lov, lmm_magic,
+					 lumv1->lmm_stripe_count);
+
+	if (max_lmm_size) {
+		int max_stripes = (max_lmm_size -
+				   lov_mds_md_size(0, lmm_magic)) /
+				   sizeof(struct lov_ost_data_v1);
+		if (unlikely(max_stripes < stripe_count)) {
+			CDEBUG(D_IOCTL, "stripe count reset from %d to %d\n",
+			       stripe_count, max_stripes);
+			stripe_count = max_stripes;
+		}
+	}
+
+	if (lmm_magic == LOV_USER_MAGIC_V3) {
+		struct pool_desc *pool;
+
+		/* In the function below, .hs_keycmp resolves to
+		 * pool_hashkey_keycmp() */
+		/* coverity[overrun-buffer-val] */
+		pool = lov_find_pool(lov, lumv3->lmm_pool_name);
+		if (pool != NULL) {
+			if (lumv3->lmm_stripe_offset !=
+			    (typeof(lumv3->lmm_stripe_offset))(-1)) {
+				rc = lov_check_index_in_pool(
+					lumv3->lmm_stripe_offset, pool);
+				if (rc < 0) {
+					lov_pool_putref(pool);
+					RETURN(-EINVAL);
+				}
+			}
+
+			if (stripe_count > pool_tgt_count(pool))
+				stripe_count = pool_tgt_count(pool);
+
+			lov_pool_putref(pool);
+		}
+	}
+
+	rc = lov_alloc_memmd(lsmp, stripe_count, lumv1->lmm_pattern, lmm_magic);
+	if (rc < 0)
+		RETURN(rc);
+
+	(*lsmp)->lsm_oinfo[0]->loi_ost_idx = lumv1->lmm_stripe_offset;
+	(*lsmp)->lsm_stripe_size = lumv1->lmm_stripe_size;
+	if (lmm_magic == LOV_USER_MAGIC_V3) {
+		cplen = strlcpy((*lsmp)->lsm_pool_name,
+				lumv3->lmm_pool_name,
+				sizeof((*lsmp)->lsm_pool_name));
+		if (cplen >= sizeof((*lsmp)->lsm_pool_name)) {
+			/* The name was truncated.  Report it instead of
+			 * letting a later "rc = 0" hide the error, and
+			 * release the descriptor allocated above so a
+			 * failed call does not hand one back. */
+			lov_free_memmd(lsmp);
+			RETURN(-E2BIG);
+		}
+	}
+
+	RETURN(0);
+}
+
+/* Configure object striping information on a new file.
+ *
+ * @lump is a pointer to a user struct with one or more of the fields set to
+ * indicate the application preference: lmm_stripe_count, lmm_stripe_size,
+ * lmm_stripe_offset, and lmm_pattern.  lmm_magic must be LOV_MAGIC.
+ * @lsmp is a pointer to an in-core stripe MD that needs to be filled in.
+ * @max_lmm_size limits the EA size, and through it the stripe count;
+ * 0 means no limit.
+ *
+ * The work is done by __lov_setstripe() with the address limit switched to
+ * KERNEL_DS for the duration of the call; its return value is passed back.
+ */
+int lov_setstripe(struct obd_export *exp, int max_lmm_size,
+		  struct lov_stripe_md **lsmp, struct lov_user_md *lump)
+{
+	int rc;
+	mm_segment_t seg;
+	/* pairs with the RETURN() below, as in the other functions here */
+	ENTRY;
+
+	seg = get_fs();
+	set_fs(KERNEL_DS);
+
+	rc = __lov_setstripe(exp, max_lmm_size, lsmp, lump);
+	set_fs(seg);
+	RETURN(rc);
+}
+
+/**
+ * Create a stripe descriptor whose per-stripe objects are given explicitly
+ * by the caller.
+ *
+ * Every entry of the request is checked first: its OST index must name a
+ * configured target, and its object id must not exceed that target's
+ * KEY_LAST_ID value.  The descriptor is then built by lov_setstripe() and
+ * the per-stripe OST index and object id are copied into it.
+ *
+ * \param exp  LOV export
+ * \param lsmp receives the new descriptor
+ * \param lump request; v1 or v3 layout selected by lmm_magic
+ *
+ * \retval 0 on success
+ * \retval -EINVAL for an invalid OST index or an object id above last_id
+ * \retval other negative errno from obd_get_info() or lov_setstripe()
+ */
+int lov_setea(struct obd_export *exp, struct lov_stripe_md **lsmp,
+	      struct lov_user_md *lump)
+{
+	int i;
+	int rc;
+	struct obd_export *oexp;
+	struct lov_obd *lov = &exp->exp_obd->u.lov;
+	obd_id last_id = 0;
+	struct lov_user_ost_data_v1 *lmm_objects;
+
+	ENTRY;
+
+	if (lump->lmm_magic == LOV_USER_MAGIC_V3)
+		lmm_objects = ((struct lov_user_md_v3 *)lump)->lmm_objects;
+	else
+		lmm_objects = lump->lmm_objects;
+
+	for (i = 0; i < lump->lmm_stripe_count; i++) {
+		__u32 len = sizeof(last_id);
+		__u32 ost_idx = lmm_objects[i].l_ost_idx;
+
+		/* The index comes from the request: validate it before it
+		 * is used to index lov_tgts[].  ld_tgt_count is the same
+		 * bound __lov_setstripe() applies to the stripe offset. */
+		if (ost_idx >= lov->desc.ld_tgt_count ||
+		    lov->lov_tgts[ost_idx] == NULL) {
+			CERROR("Setting EA with invalid ost idx %u\n",
+			       ost_idx);
+			RETURN(-EINVAL);
+		}
+
+		oexp = lov->lov_tgts[ost_idx]->ltd_exp;
+		rc = obd_get_info(NULL, oexp, sizeof(KEY_LAST_ID), KEY_LAST_ID,
+				  &len, &last_id, NULL);
+		if (rc)
+			RETURN(rc);
+		if (ostid_id(&lmm_objects[i].l_ost_oi) > last_id) {
+			CERROR("Setting EA for object > than last id on"
+			       " ost idx %d "DOSTID" > "LPD64" \n",
+			       lmm_objects[i].l_ost_idx,
+			       POSTID(&lmm_objects[i].l_ost_oi), last_id);
+			RETURN(-EINVAL);
+		}
+	}
+
+	rc = lov_setstripe(exp, 0, lsmp, lump);
+	if (rc)
+		RETURN(rc);
+
+	/* NOTE(review): lov_setstripe() may allocate fewer stripes than
+	 * lump->lmm_stripe_count (the count is clamped); confirm this loop
+	 * cannot run past lsm_oinfo[]. */
+	for (i = 0; i < lump->lmm_stripe_count; i++) {
+		(*lsmp)->lsm_oinfo[i]->loi_ost_idx =
+			lmm_objects[i].l_ost_idx;
+		(*lsmp)->lsm_oinfo[i]->loi_oi = lmm_objects[i].l_ost_oi;
+	}
+	RETURN(0);
+}
+
+
+/* Retrieve object striping information.
+ *
+ * @lsm is the in-core stripe MD to report; -ENODATA is returned if NULL.
+ * @lump is the destination buffer.  On entry its header must carry
+ * lmm_magic (LOV_USER_MAGIC or LOV_USER_MAGIC_V3, else -EINVAL) and
+ * lmm_stripe_count, the number of per-stripe entries the buffer can hold:
+ *  - 0: only the header is copied back;
+ *  - smaller than the file's stripe count: the required count is written
+ *    back into the header and -EOVERFLOW is returned;
+ *  - otherwise the header and all per-stripe entries are copied back.
+ *
+ * Returns 0 on success, -EFAULT if a copy fails, or a negative errno from
+ * lov_packmd().
+ */
+int lov_getstripe(struct obd_export *exp, struct lov_stripe_md *lsm,
+		  struct lov_user_md *lump)
+{
+	/*
+	 * XXX huge struct allocated on stack.
+	 */
+	/* we use lov_user_md_v3 because it is larger than lov_user_md_v1 */
+	struct lov_user_md_v3 lum;
+	struct lov_mds_md *lmmk = NULL;
+	int rc, lmm_size;
+	int lum_size;
+	mm_segment_t seg;
+	ENTRY;
+
+	if (!lsm)
+		RETURN(-ENODATA);
+
+	/*
+	 * "Switch to kernel segment" to allow copying from kernel space by
+	 * copy_{to,from}_user().
+	 */
+	seg = get_fs();
+	set_fs(KERNEL_DS);
+
+	/* we only need the header part from user space to get lmm_magic and
+	 * lmm_stripe_count, (the header part is common to v1 and v3) */
+	lum_size = sizeof(struct lov_user_md_v1);
+	if (copy_from_user(&lum, lump, lum_size))
+		GOTO(out_set, rc = -EFAULT);
+	else if ((lum.lmm_magic != LOV_USER_MAGIC) &&
+		 (lum.lmm_magic != LOV_USER_MAGIC_V3))
+		GOTO(out_set, rc = -EINVAL);
+
+	if (lum.lmm_stripe_count &&
+	    (lum.lmm_stripe_count < lsm->lsm_stripe_count)) {
+		/* Return right size of stripe to user; the result of this
+		 * copy is not checked, -EOVERFLOW is reported either way */
+		lum.lmm_stripe_count = lsm->lsm_stripe_count;
+		rc = copy_to_user(lump, &lum, lum_size);
+		GOTO(out_set, rc = -EOVERFLOW);
+	}
+	rc = lov_packmd(exp, &lmmk, lsm);
+	if (rc < 0)
+		GOTO(out_set, rc);
+	lmm_size = rc;
+	rc = 0;
+
+	/* FIXME: Bug 1185 - copy fields properly when structs change */
+	/* struct lov_user_md_v3 and struct lov_mds_md_v3 must be the same */
+	CLASSERT(sizeof(lum) == sizeof(struct lov_mds_md_v3));
+	CLASSERT(sizeof lum.lmm_objects[0] == sizeof lmmk->lmm_objects[0]);
+
+	/* lov_packmd() produced little-endian data; swab it where the CPU
+	 * byte order differs */
+	if ((cpu_to_le32(LOV_MAGIC) != LOV_MAGIC) &&
+	    ((lmmk->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) ||
+	    (lmmk->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)))) {
+		lustre_swab_lov_mds_md(lmmk);
+		lustre_swab_lov_user_md_objects(
+				(struct lov_user_ost_data*)lmmk->lmm_objects,
+				lmmk->lmm_stripe_count);
+	}
+	if (lum.lmm_magic == LOV_USER_MAGIC) {
+		/* User request for v1, we need skip lmm_pool_name */
+		if (lmmk->lmm_magic == LOV_MAGIC_V3) {
+			memmove((char*)(&lmmk->lmm_stripe_count) +
+				sizeof(lmmk->lmm_stripe_count),
+				((struct lov_mds_md_v3*)lmmk)->lmm_objects,
+				lmmk->lmm_stripe_count *
+				sizeof(struct lov_ost_data_v1));
+			lmm_size -= LOV_MAXPOOLNAME;
+		}
+	} else {
+		/* if v3 we just have to update the lum_size */
+		lum_size = sizeof(struct lov_user_md_v3);
+	}
+
+	/* User wasn't expecting this many OST entries.  lmmk is allocated
+	 * by now, so the error exit must go through out_free. */
+	if (lum.lmm_stripe_count == 0)
+		lmm_size = lum_size;
+	else if (lum.lmm_stripe_count < lmmk->lmm_stripe_count)
+		GOTO(out_free, rc = -EOVERFLOW);
+	/*
+	 * Have a difference between lov_mds_md & lov_user_md.
+	 * So we have to re-order the data before copy to user.
+	 */
+	lum.lmm_stripe_count = lmmk->lmm_stripe_count;
+	lum.lmm_layout_gen = lmmk->lmm_layout_gen;
+	((struct lov_user_md *)lmmk)->lmm_layout_gen = lum.lmm_layout_gen;
+	((struct lov_user_md *)lmmk)->lmm_stripe_count = lum.lmm_stripe_count;
+	if (copy_to_user(lump, lmmk, lmm_size))
+		rc = -EFAULT;
+
+out_free:
+	obd_free_diskmd(exp, &lmmk);
+out_set:
+	set_fs(seg);
+	RETURN(rc);
+}

diff --git a/drivers/staging/lustre/lustre/lov/lov_page.c b/drivers/staging/lustre/lustre/lov/lov_page.c
new file mode 100644
index 0000000..65790d68
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lov/lov_page.c

@@ -0,0 +1,235 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_page for LOV layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include "lov_cl_internal.h"
+
+/** \addtogroup lov
+ *  @{
+ */
+
+/*****************************************************************************
+ *
+ * Lov page operations.
+ *
+ */
+
+/**
+ * Consistency check for a LOV page slice.
+ *
+ * The check is expressed through ergo(): the condition on the sub-page
+ * (top page and sub-page point at each other through cp_child/cp_parent and
+ * share the same cp_state) is evaluated against "a sub-page exists".
+ *
+ * \return the value of the ergo() expression (non-zero when it holds)
+ */
+static int lov_page_invariant(const struct cl_page_slice *slice)
+{
+	const struct cl_page *top   = slice->cpl_page;
+	const struct cl_page *child = lov_sub_page(slice);
+
+	return ergo(child != NULL,
+		    top->cp_child == child && child->cp_parent == top &&
+		    top->cp_state == child->cp_state);
+}
+
+/**
+ * cl_page_operations::cpo_fini() method of a LOV page slice.
+ *
+ * If a sub-page is attached, the "lov" lu_ref on it is removed, the
+ * parent/child pointers are cleared on both sides and the sub-page is
+ * released with cl_page_put().  Without a sub-page nothing is done.
+ */
+static void lov_page_fini(const struct lu_env *env,
+			  struct cl_page_slice *slice)
+{
+	struct cl_page *sub = lov_sub_page(slice);
+
+	LINVRNT(lov_page_invariant(slice));
+	ENTRY;
+
+	if (sub == NULL) {
+		/* no sub-page was ever attached to this slice */
+		EXIT;
+		return;
+	}
+
+	/* the sub-page must already be in the freeing state */
+	LASSERT(sub->cp_state == CPS_FREEING);
+	lu_ref_del(&sub->cp_reference, "lov", sub->cp_parent);
+	sub->cp_parent = NULL;
+	slice->cpl_page->cp_child = NULL;
+	cl_page_put(env, sub);
+	EXIT;
+}
+
+/**
+ * cl_page_operations::cpo_own() method of a LOV page slice.
+ *
+ * Finds the sub-io that serves this page and stores it as cp_owner of the
+ * sub-page.  Failing to get the sub-io is fatal (LBUG()).  The \a io and
+ * \a nonblock arguments are not used by this implementation.
+ *
+ * \retval 0 always
+ */
+static int lov_page_own(const struct lu_env *env,
+			const struct cl_page_slice *slice, struct cl_io *io,
+			int nonblock)
+{
+	struct lov_io     *lio = lov_env_io(env);
+	struct lov_io_sub *subio;
+
+	LINVRNT(lov_page_invariant(slice));
+	LINVRNT(!cl2lov_page(slice)->lps_invalid);
+	ENTRY;
+
+	subio = lov_page_subio(env, lio, slice);
+	if (IS_ERR(subio)) {
+		LBUG(); /* Arrgh */
+	} else {
+		lov_sub_page(slice)->cp_owner = subio->sub_io;
+		lov_sub_put(subio);
+	}
+	RETURN(0);
+}
+
+/**
+ * cl_page_operations::cpo_assume() method of a LOV page slice.
+ *
+ * Delegates to lov_page_own() with nonblock == 0; the result (always 0) is
+ * deliberately dropped.
+ */
+static void lov_page_assume(const struct lu_env *env,
+			    const struct cl_page_slice *slice, struct cl_io *io)
+{
+	(void)lov_page_own(env, slice, io, 0);
+}
+
+/**
+ * cl_page_operations::io[CRT_WRITE].cpo_cache_add() method of a LOV page
+ * slice.
+ *
+ * Forwards the request to cl_page_cache_add() on the sub-page, using the
+ * environment and io of the sub-io that serves this page.
+ *
+ * \retval the result of cl_page_cache_add() on the sub-page, or
+ * \retval the PTR_ERR() of lov_page_subio() when no sub-io is available
+ *	   (also logged through CL_PAGE_DEBUG())
+ */
+static int lov_page_cache_add(const struct lu_env *env,
+			      const struct cl_page_slice *slice,
+			      struct cl_io *io)
+{
+	struct lov_io     *lio = lov_env_io(env);
+	struct lov_io_sub *subio;
+	int rc = 0;
+
+	LINVRNT(lov_page_invariant(slice));
+	LINVRNT(!cl2lov_page(slice)->lps_invalid);
+	ENTRY;
+
+	subio = lov_page_subio(env, lio, slice);
+	if (IS_ERR(subio)) {
+		rc = PTR_ERR(subio);
+		CL_PAGE_DEBUG(D_ERROR, env, slice->cpl_page, "rc = %d\n", rc);
+	} else {
+		rc = cl_page_cache_add(subio->sub_env, subio->sub_io,
+				       slice->cpl_page->cp_child, CRT_WRITE);
+		lov_sub_put(subio);
+	}
+	RETURN(rc);
+}
+
+/**
+ * cl_page_operations::cpo_print() method shared by both LOV page flavours.
+ *
+ * Emits one line with the address of the lov_page behind \a slice through
+ * \a printer and returns whatever \a printer returns.
+ */
+static int lov_page_print(const struct lu_env *env,
+			  const struct cl_page_slice *slice,
+			  void *cookie, lu_printer_t printer)
+{
+	return printer(env, cookie, LUSTRE_LOV_NAME"-page@%p\n",
+		       cl2lov_page(slice));
+}
+
+/* Page methods installed by lov_page_init_raid0(). */
+static const struct cl_page_operations lov_page_ops = {
+	.cpo_own    = lov_page_own,
+	.cpo_assume = lov_page_assume,
+	.cpo_fini   = lov_page_fini,
+	.cpo_print  = lov_page_print,
+	.io = {
+		/* only the write path has a LOV-level cache_add method */
+		[CRT_WRITE] = {
+			.cpo_cache_add = lov_page_cache_add,
+		},
+	},
+};
+
+/**
+ * cl_page_operations::cpo_fini() method for pages set up by
+ * lov_page_init_empty(): only asserts that no child page is attached.
+ */
+static void lov_empty_page_fini(const struct lu_env *env,
+				struct cl_page_slice *slice)
+{
+	LASSERT(slice->cpl_page->cp_child == NULL);
+}
+
+/**
+ * Set up the LOV slice of \a page for an object with a RAID0 layout.
+ *
+ * The page index is translated into a stripe number and an offset within
+ * that stripe, the LOV slice is added to \a page, and the matching sub-page
+ * is looked up in the stripe's sub-object with cl_page_find_sub().  The
+ * slice is flagged lps_invalid until the sub-page is confirmed to have
+ * \a page as its parent.
+ *
+ * \retval 0	success
+ * \retval -ve	PTR_ERR() of lov_sub_get() or cl_page_find_sub()
+ */
+int lov_page_init_raid0(const struct lu_env *env, struct cl_object *obj,
+			struct cl_page *page, struct page *vmpage)
+{
+	struct lov_object	*loo = cl2lov(obj);
+	struct lov_layout_raid0	*r0  = lov_r0(loo);
+	struct lov_io		*lio = lov_env_io(env);
+	struct lov_page		*lpg = cl_object_page_slice(obj, page);
+	struct lov_io_sub	*sub;
+	struct cl_object	*subobj;
+	struct cl_page		*subpage;
+	obd_off			 suboff;
+	loff_t			 offset;
+	int			 stripe;
+	int			 rc;
+	ENTRY;
+
+	/* which stripe holds this page, and where inside that stripe? */
+	offset = cl_offset(obj, page->cp_index);
+	stripe = lov_stripe_number(loo->lo_lsm, offset);
+	LASSERT(stripe < r0->lo_nr);
+	rc = lov_stripe_offset(loo->lo_lsm, offset, stripe, &suboff);
+	LASSERT(rc == 0);
+
+	/* the slice stays invalid until the sub-page is attached below */
+	lpg->lps_invalid = 1;
+	cl_page_slice_add(page, &lpg->lps_cl, obj, &lov_page_ops);
+
+	sub = lov_sub_get(env, lio, stripe);
+	if (IS_ERR(sub))
+		GOTO(out, rc = PTR_ERR(sub));
+
+	subobj = lovsub2cl(r0->lo_sub[stripe]);
+	subpage = cl_page_find_sub(sub->sub_env, subobj,
+				   cl_index(subobj, suboff), vmpage, page);
+	lov_sub_put(sub);
+	if (IS_ERR(subpage))
+		GOTO(out, rc = PTR_ERR(subpage));
+
+	if (unlikely(subpage->cp_parent != page)) {
+		/* the sub-page belongs to some other top page: dump both */
+		CL_PAGE_DEBUG(D_ERROR, env, page, "parent page\n");
+		CL_PAGE_DEBUG(D_ERROR, env, subpage, "child page\n");
+		LASSERT(0);
+	} else {
+		lu_ref_add(&subpage->cp_reference, "lov", page);
+		lpg->lps_invalid = 0;
+		rc = 0;
+	}
+
+	EXIT;
+out:
+	return rc;
+}
+
+
+/* Page methods installed by lov_page_init_empty(). */
+static const struct cl_page_operations lov_empty_page_ops = {
+	.cpo_print  = lov_page_print,
+	.cpo_fini   = lov_empty_page_fini,
+};
+
+/**
+ * Set up the LOV slice of \a page using lov_empty_page_ops.
+ *
+ * No sub-page is created.  The backing \a vmpage is mapped, zero-filled for
+ * cl_page_size(obj) bytes, unmapped again, and the page is then passed to
+ * cl_page_export() with a third argument of 1.
+ *
+ * \retval 0 always
+ */
+int lov_page_init_empty(const struct lu_env *env, struct cl_object *obj,
+			struct cl_page *page, struct page *vmpage)
+{
+	struct lov_page *lpg = cl_object_page_slice(obj, page);
+	void *kaddr;
+	ENTRY;
+
+	cl_page_slice_add(page, &lpg->lps_cl, obj, &lov_empty_page_ops);
+
+	/* zero the page contents through a temporary kernel mapping */
+	kaddr = kmap(vmpage);
+	memset(kaddr, 0, cl_page_size(obj));
+	kunmap(vmpage);
+
+	cl_page_export(env, page, 1);
+	RETURN(0);
+}
+
+
+/** @} lov */

diff --git a/drivers/staging/lustre/lustre/lov/lov_pool.c b/drivers/staging/lustre/lustre/lov/lov_pool.c
new file mode 100644
index 0000000..a96f908
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lov/lov_pool.c

@@ -0,0 +1,681 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see [sun.com URL with a
+ * copy of GPLv2].
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lov/lov_pool.c
+ *
+ * OST pool methods
+ *
+ * Author: Jacques-Charles LAFOUCRIERE <jc.lafoucriere@cea.fr>
+ * Author: Alex Lyashkov <Alexey.Lyashkov@Sun.COM>
+ * Author: Nathaniel Rutman <Nathan.Rutman@Sun.COM>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include <linux/libcfs/libcfs.h>
+
+#include <obd.h>
+#include "lov_internal.h"
+
+/*
+ * Target descriptor pointer for member number _i of pool _p: the pool's
+ * op_array[] maps the member number to an index into the lov_tgts[] array
+ * of the LOV device the pool belongs to.
+ *
+ * Arguments and the whole expansion are parenthesised so that expressions
+ * such as pool_tgt(p + 1, i) or !pool_tgt(p, i) expand as intended.
+ */
+#define pool_tgt(_p, _i) \
+		((_p)->pool_lobd->u.lov.lov_tgts[(_p)->pool_obds.op_array[(_i)]])
+
+/* Take one more reference on \a pool (pool_refcount is incremented). */
+static void lov_pool_getref(struct pool_desc *pool)
+{
+	CDEBUG(D_INFO, "pool %p\n", pool);
+	atomic_inc(&pool->pool_refcount);
+}
+
+/*
+ * Drop one reference on \a pool.
+ *
+ * When the last reference goes away the pool must already be unhashed, off
+ * the pool list and without a proc entry; both OST arrays and the
+ * descriptor itself are then freed.
+ */
+void lov_pool_putref(struct pool_desc *pool)
+{
+	CDEBUG(D_INFO, "pool %p\n", pool);
+
+	/* somebody else still holds a reference */
+	if (!atomic_dec_and_test(&pool->pool_refcount))
+		return;
+
+	LASSERT(hlist_unhashed(&pool->pool_hash));
+	LASSERT(list_empty(&pool->pool_list));
+	LASSERT(pool->pool_proc_entry == NULL);
+	lov_ost_pool_free(&(pool->pool_rr.lqr_pool));
+	lov_ost_pool_free(&(pool->pool_obds));
+	OBD_FREE_PTR(pool);
+	EXIT;
+}
+
+/*
+ * Drop one reference on \a pool without ever freeing it: the caller must not
+ * hold the last reference (asserted below).  Used as the hs_put_locked
+ * method of the pool hash.
+ */
+void lov_pool_putref_locked(struct pool_desc *pool)
+{
+	CDEBUG(D_INFO, "pool %p\n", pool);
+	LASSERT(atomic_read(&pool->pool_refcount) > 1);
+
+	atomic_dec(&pool->pool_refcount);
+}
+
+/*
+ * hash function using a Rotating Hash algorithm
+ * Knuth, D. The Art of Computer Programming,
+ * Volume 3: Sorting and Searching,
+ * Chapter 6.4.
+ * Addison Wesley, 1973
+ */
+/*
+ * hs_hash method of the pool hash: rotating hash over the pool name.
+ *
+ * At most LOV_MAXPOOLNAME characters of \a key are folded in, stopping at
+ * the first NUL.  The result is reduced with "% mask".
+ * NOTE(review): a \a mask of 0 would divide by zero - presumably the hash
+ * code never passes 0; confirm against the cfs_hash implementation.
+ */
+static __u32 pool_hashfn(cfs_hash_t *hash_body, const void *key, unsigned mask)
+{
+	const char *name = key;
+	__u32 hash = 0;
+	int pos;
+
+	for (pos = 0; pos < LOV_MAXPOOLNAME && name[pos] != '\0'; pos++)
+		hash = (hash << 4) ^ (hash >> 28) ^ name[pos];
+
+	return hash % mask;
+}
+
+/* hs_key method of the pool hash: the key of a pool is its name. */
+static void *pool_key(struct hlist_node *hnode)
+{
+	return hlist_entry(hnode, struct pool_desc, pool_hash)->pool_name;
+}
+
+/*
+ * hs_keycmp method of the pool hash.
+ *
+ * Returns 1 when \a key equals the name of the pool behind
+ * \a compared_hnode over at most LOV_MAXPOOLNAME characters, 0 otherwise.
+ */
+static int pool_hashkey_keycmp(const void *key, struct hlist_node *compared_hnode)
+{
+	const struct pool_desc *pool =
+		hlist_entry(compared_hnode, struct pool_desc, pool_hash);
+
+	return strncmp(key, pool->pool_name, LOV_MAXPOOLNAME) == 0;
+}
+
+/* hs_object method of the pool hash: map a hash node back to its pool_desc. */
+static void *pool_hashobject(struct hlist_node *hnode)
+{
+	return hlist_entry(hnode, struct pool_desc, pool_hash);
+}
+
+/* hs_get method of the pool hash: take a reference on the hashed pool. */
+static void pool_hashrefcount_get(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+	lov_pool_getref(hlist_entry(hnode, struct pool_desc, pool_hash));
+}
+
+/*
+ * hs_put_locked method of the pool hash: drop a reference on the hashed
+ * pool through lov_pool_putref_locked(), which never frees the pool.
+ */
+static void pool_hashrefcount_put_locked(cfs_hash_t *hs,
+					 struct hlist_node *hnode)
+{
+	lov_pool_putref_locked(hlist_entry(hnode, struct pool_desc,
+					   pool_hash));
+}
+
+/* Method table of the hash that maps pool names to pool descriptors. */
+cfs_hash_ops_t pool_hash_operations = {
+	.hs_hash	= pool_hashfn,
+	.hs_keycmp	= pool_hashkey_keycmp,
+	.hs_key		= pool_key,
+	.hs_object	= pool_hashobject,
+	.hs_get		= pool_hashrefcount_get,
+	.hs_put_locked	= pool_hashrefcount_put_locked,
+};
+
+#ifdef LPROCFS
+/* ifdef needed for liblustre support */
+/*
+ * pool /proc seq_file methods
+ */
+/*
+ * iterator is used to go through the target pool entries
+ * index is the current entry index in the lp_array[] array
+ * index >= pos returned to the seq_file interface
+ * pos is from 0 to (pool->pool_obds.op_count - 1)
+ */
+/* marker stored in pool_iterator::magic; lets pool_proc_stop() tell an
+ * iterator apart from whatever else seq_file::private may point at */
+#define POOL_IT_MAGIC 0xB001CEA0
+/* per-read state of the pool /proc file, kept in seq_file::private between
+ * pool_proc_start() and pool_proc_stop() */
+struct pool_iterator {
+	int magic;		/* always POOL_IT_MAGIC */
+	struct pool_desc *pool;	/* pool being listed; a reference is held */
+	int idx;	/* from 0 to pool_tgt_size - 1 */
+};
+
+/*
+ * seq_file ->next() method: advance the iterator to the next pool member.
+ *
+ * Returns the iterator (and bumps *pos) when there is another entry, NULL
+ * when *pos is already at or past the member count or when the iterator is
+ * on the last entry; in the latter case idx is left unchanged.  \a v is not
+ * used.
+ */
+static void *pool_proc_next(struct seq_file *s, void *v, loff_t *pos)
+{
+	struct pool_iterator *iter = s->private;
+	void *next = iter;
+
+	LASSERTF(iter->magic == POOL_IT_MAGIC, "%08X", iter->magic);
+
+	/* test if end of file */
+	if (*pos >= pool_tgt_count(iter->pool))
+		return NULL;
+
+	down_read(&pool_tgt_rw_sem(iter->pool));
+	if (iter->idx + 1 == pool_tgt_count(iter->pool))
+		next = NULL;	/* we stay on the last entry */
+	else
+		iter->idx++;
+	up_read(&pool_tgt_rw_sem(iter->pool));
+
+	/* only a successful step moves the seq_file position */
+	if (next != NULL)
+		(*pos)++;
+	return next;
+}
+
+/*
+ * seq_file ->start() method of the pool /proc file.
+ *
+ * On entry s->private points at the pool.  A reference is taken on the pool
+ * and an iterator is allocated and stored in s->private; pool_proc_stop()
+ * later frees the iterator, restores s->private and drops the reference.
+ * If *pos is non-zero the iterator is stepped forward with pool_proc_next()
+ * until it reaches *pos or runs out of entries.
+ *
+ * Returns the iterator, NULL when the pool is empty or *pos is past its
+ * end, or ERR_PTR(-ENOMEM) when the iterator cannot be allocated.  On every
+ * path that does not install an iterator the reference taken here is
+ * dropped again before returning, because stop() cannot do it then.
+ */
+static void *pool_proc_start(struct seq_file *s, loff_t *pos)
+{
+	struct pool_desc *pool = (struct pool_desc *)s->private;
+	struct pool_iterator *iter;
+
+	lov_pool_getref(pool);
+	if ((pool_tgt_count(pool) == 0) ||
+	    (*pos >= pool_tgt_count(pool))) {
+		/* iter is not created, so stop() has no way to
+		 * find pool to dec ref */
+		lov_pool_putref(pool);
+		return NULL;
+	}
+
+	OBD_ALLOC_PTR(iter);
+	if (!iter) {
+		/* same situation as above: no iterator was installed, so
+		 * the reference has to be released here or it is leaked */
+		lov_pool_putref(pool);
+		return ERR_PTR(-ENOMEM);
+	}
+	iter->magic = POOL_IT_MAGIC;
+	iter->pool = pool;
+	iter->idx = 0;
+
+	/* we use seq_file private field to memorized iterator so
+	 * we can free it at stop() */
+	/* /!\ do not forget to restore it to pool before freeing it */
+	s->private = iter;
+	if (*pos > 0) {
+		loff_t i;
+		void *ptr;
+
+		/* replay next() from entry 0 up to the requested position */
+		i = 0;
+		do {
+		     ptr = pool_proc_next(s, &iter, &i);
+		} while ((i < *pos) && (ptr != NULL));
+		return ptr;
+	}
+	return iter;
+}
+
+/*
+ * seq_file ->stop() method of the pool /proc file.
+ *
+ * stop() may be called without a matching start() that installed an
+ * iterator (see seq_read() in fs/seq_file.c), so s->private is only treated
+ * as an iterator when it carries POOL_IT_MAGIC.  In that case s->private is
+ * pointed back at the pool so the next start() works, the pool reference is
+ * dropped and the iterator is freed.  \a v is not used.
+ */
+static void pool_proc_stop(struct seq_file *s, void *v)
+{
+	struct pool_iterator *iter = s->private;
+
+	if (iter == NULL || iter->magic != POOL_IT_MAGIC)
+		return;
+
+	s->private = iter->pool;
+	lov_pool_putref(iter->pool);
+	OBD_FREE_PTR(iter);
+}
+
+/*
+ * seq_file ->show() method: print the UUID of the pool member the iterator
+ * currently points at, one per line.  Nothing is printed when the target
+ * slot is NULL.  Always returns 0.
+ *
+ * NOTE(review): the assertion below accepts idx == pool_tgt_count(), for
+ * which op_array[idx] is one past the last valid member, and the target
+ * pointer is used after the pool semaphore has been released - confirm
+ * both are intended.
+ */
+static int pool_proc_show(struct seq_file *s, void *v)
+{
+	struct pool_iterator *iter = v;
+	struct lov_tgt_desc *target;
+
+	LASSERTF(iter->magic == POOL_IT_MAGIC, "%08X", iter->magic);
+	LASSERT(iter->pool != NULL);
+	LASSERT(iter->idx <= pool_tgt_count(iter->pool));
+
+	/* the member array is only read under the pool semaphore */
+	down_read(&pool_tgt_rw_sem(iter->pool));
+	target = pool_tgt(iter->pool, iter->idx);
+	up_read(&pool_tgt_rw_sem(iter->pool));
+
+	if (target != NULL)
+		seq_printf(s, "%s\n", obd_uuid2str(&(target->ltd_uuid)));
+
+	return 0;
+}
+
+/* seq_file iterator methods used by pool_proc_open(). */
+static struct seq_operations pool_proc_ops = {
+	.start	= pool_proc_start,
+	.stop	= pool_proc_stop,
+	.next	= pool_proc_next,
+	.show	= pool_proc_show,
+};
+
+/*
+ * ->open() method of the pool /proc file: set up the seq_file with
+ * pool_proc_ops and store the proc entry's data (PDE_DATA) in its private
+ * field.  Returns 0 or the error from seq_open().
+ */
+static int pool_proc_open(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq;
+	int rc = seq_open(file, &pool_proc_ops);
+
+	if (rc)
+		return rc;
+
+	seq = file->private_data;
+	seq->private = PDE_DATA(inode);
+	return 0;
+}
+
+/* File methods of a pool /proc entry, registered by lov_pool_new(). */
+static struct file_operations pool_proc_operations = {
+	.open		= pool_proc_open,
+	.release	= seq_release,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+};
+#endif /* LPROCFS */
+
+/*
+ * Log the name, member count and member UUIDs of \a pool at debug level
+ * \a level.  Members whose target slot is NULL or has no export are
+ * skipped.  A pool reference is held for the duration of the dump.
+ */
+void lov_dump_pool(int level, struct pool_desc *pool)
+{
+	int i;
+
+	lov_pool_getref(pool);
+
+	CDEBUG(level, "pool "LOV_POOLNAMEF" has %d members\n",
+	       pool->pool_name, pool->pool_obds.op_count);
+
+	down_read(&pool_tgt_rw_sem(pool));
+	for (i = 0; i < pool_tgt_count(pool); i++) {
+		struct lov_tgt_desc *tgt = pool_tgt(pool, i);
+
+		if (tgt == NULL || tgt->ltd_exp == NULL)
+			continue;
+		CDEBUG(level, "pool "LOV_POOLNAMEF"[%d] = %s\n",
+		       pool->pool_name, i, obd_uuid2str(&tgt->ltd_uuid));
+	}
+	up_read(&pool_tgt_rw_sem(pool));
+
+	lov_pool_putref(pool);
+}
+
+/* array size used by lov_ost_pool_init() when the caller passes count == 0 */
+#define LOV_POOL_INIT_COUNT 2
+/*
+ * Initialise \a op as an empty OST array with room for \a count entries
+ * (LOV_POOL_INIT_COUNT when \a count is 0) and set up its rw semaphore.
+ *
+ * Returns 0 on success or -ENOMEM, in which case op_size is reset to 0.
+ */
+int lov_ost_pool_init(struct ost_pool *op, unsigned int count)
+{
+	ENTRY;
+
+	init_rwsem(&op->op_rw_sem);
+	op->op_count = 0;
+	op->op_size = count != 0 ? count : LOV_POOL_INIT_COUNT;
+	op->op_array = NULL;
+
+	OBD_ALLOC(op->op_array, op->op_size * sizeof(op->op_array[0]));
+	if (op->op_array == NULL) {
+		op->op_size = 0;
+		RETURN(-ENOMEM);
+	}
+	EXIT;
+	return 0;
+}
+
+/*
+ * Make sure \a op has room for at least one more entry.
+ *
+ * Nothing is done while op_count < op_size.  Otherwise the array is
+ * replaced by a zero-allocated one of max(min_count, 2 * op_size) entries
+ * holding a copy of the old contents.  Returns 0 or -ENOMEM (the old array
+ * is left untouched on failure).
+ *
+ * Caller must hold op_rw_sem for writing.
+ */
+int lov_ost_pool_extend(struct ost_pool *op, unsigned int min_count)
+{
+	__u32 *new;
+	int new_size;
+
+	LASSERT(min_count != 0);
+
+	/* still a free slot: no need to grow */
+	if (op->op_count < op->op_size)
+		return 0;
+
+	new_size = max(min_count, 2 * op->op_size);
+	OBD_ALLOC(new, new_size * sizeof(op->op_array[0]));
+	if (new == NULL)
+		return -ENOMEM;
+
+	/* move the existing entries over, then swap the arrays */
+	memcpy(new, op->op_array, op->op_size * sizeof(op->op_array[0]));
+	OBD_FREE(op->op_array, op->op_size * sizeof(op->op_array[0]));
+	op->op_size = new_size;
+	op->op_array = new;
+	return 0;
+}
+
+/*
+ * Append OST index \a idx to \a op, growing the array first if needed
+ * (\a min_count is handed to lov_ost_pool_extend()).
+ *
+ * Returns 0 on success, -EEXIST when \a idx is already a member, or the
+ * error from lov_ost_pool_extend().  Takes op_rw_sem for writing.
+ */
+int lov_ost_pool_add(struct ost_pool *op, __u32 idx, unsigned int min_count)
+{
+	int rc = 0, pos;
+	ENTRY;
+
+	down_write(&op->op_rw_sem);
+
+	rc = lov_ost_pool_extend(op, min_count);
+	if (rc != 0)
+		GOTO(out, rc);
+
+	/* refuse duplicates */
+	for (pos = 0; pos < op->op_count; pos++)
+		if (op->op_array[pos] == idx)
+			GOTO(out, rc = -EEXIST);
+
+	/* not a member yet: store it in the first free slot */
+	op->op_array[op->op_count++] = idx;
+	EXIT;
+out:
+	up_write(&op->op_rw_sem);
+	return rc;
+}
+
+/*
+ * Remove OST index \a idx from \a op, closing the gap by shifting the
+ * following entries down by one.
+ *
+ * Returns 0 on success or -EINVAL when \a idx is not a member.  Takes
+ * op_rw_sem for writing.
+ */
+int lov_ost_pool_remove(struct ost_pool *op, __u32 idx)
+{
+	int pos;
+	ENTRY;
+
+	down_write(&op->op_rw_sem);
+
+	/* locate the first slot holding idx */
+	for (pos = 0; pos < op->op_count; pos++)
+		if (op->op_array[pos] == idx)
+			break;
+
+	if (pos == op->op_count) {
+		up_write(&op->op_rw_sem);
+		RETURN(-EINVAL);
+	}
+
+	memmove(&op->op_array[pos], &op->op_array[pos + 1],
+		(op->op_count - pos - 1) * sizeof(op->op_array[0]));
+	op->op_count--;
+	up_write(&op->op_rw_sem);
+	EXIT;
+	return 0;
+}
+
+/*
+ * Release the array of \a op and reset it to the empty state (NULL array,
+ * count and size 0).  An \a op whose op_size is already 0 is left alone.
+ * Always returns 0.
+ */
+int lov_ost_pool_free(struct ost_pool *op)
+{
+	ENTRY;
+
+	if (op->op_size != 0) {
+		down_write(&op->op_rw_sem);
+
+		OBD_FREE(op->op_array, op->op_size * sizeof(op->op_array[0]));
+		op->op_array = NULL;
+		op->op_size = 0;
+		op->op_count = 0;
+
+		up_write(&op->op_rw_sem);
+	}
+	RETURN(0);
+}
+
+
+/*
+ * Create an empty OST pool named \a poolname on the LOV device \a obd.
+ *
+ * The new pool starts with a reference count of 1, gets both of its OST
+ * arrays initialised, is given a /proc entry when LPROCFS is enabled (which
+ * holds its own reference), is linked on lov_pool_list and finally inserted
+ * into the pool hash.
+ *
+ * Returns 0 on success, -ENAMETOOLONG when the name exceeds
+ * LOV_MAXPOOLNAME, -ENOMEM on allocation failure, the error of
+ * lov_ost_pool_init(), or -EEXIST when the hash insertion fails.
+ *
+ * Error unwinding is staged: each label only undoes what has been set up by
+ * the time of the jump.  In particular a failure of the first
+ * lov_ost_pool_init() must not reach out_err, because at that point the
+ * pool is not on lov_pool_list (its list head is still zeroed),
+ * lov_pool_count has not been incremented and no proc entry exists.
+ */
+int lov_pool_new(struct obd_device *obd, char *poolname)
+{
+	struct lov_obd *lov;
+	struct pool_desc *new_pool;
+	int rc;
+	ENTRY;
+
+	lov = &(obd->u.lov);
+
+	if (strlen(poolname) > LOV_MAXPOOLNAME)
+		RETURN(-ENAMETOOLONG);
+
+	OBD_ALLOC_PTR(new_pool);
+	if (new_pool == NULL)
+		RETURN(-ENOMEM);
+
+	strncpy(new_pool->pool_name, poolname, LOV_MAXPOOLNAME);
+	new_pool->pool_name[LOV_MAXPOOLNAME] = '\0';
+	new_pool->pool_lobd = obd;
+	/* ref count init to 1 because when created a pool is always used
+	 * up to deletion
+	 */
+	atomic_set(&new_pool->pool_refcount, 1);
+	rc = lov_ost_pool_init(&new_pool->pool_obds, 0);
+	if (rc)
+		GOTO(out_free_pool, rc);
+
+	memset(&(new_pool->pool_rr), 0, sizeof(struct lov_qos_rr));
+	rc = lov_ost_pool_init(&new_pool->pool_rr.lqr_pool, 0);
+	if (rc)
+		GOTO(out_free_pool_obds, rc);
+
+	INIT_HLIST_NODE(&new_pool->pool_hash);
+
+#ifdef LPROCFS
+	/* seq_file is not implemented for liblustre, hence the ifdef */
+	/* get ref for /proc file */
+	lov_pool_getref(new_pool);
+	new_pool->pool_proc_entry = lprocfs_add_simple(lov->lov_pool_proc_entry,
+						       poolname, new_pool,
+						       &pool_proc_operations);
+	if (IS_ERR(new_pool->pool_proc_entry)) {
+		/* not fatal: carry on without a proc entry and give back
+		 * the reference taken for it */
+		CWARN("Cannot add proc pool entry "LOV_POOLNAMEF"\n", poolname);
+		new_pool->pool_proc_entry = NULL;
+		lov_pool_putref(new_pool);
+	}
+	CDEBUG(D_INFO, "pool %p - proc %p\n", new_pool, new_pool->pool_proc_entry);
+#endif
+
+	spin_lock(&obd->obd_dev_lock);
+	list_add_tail(&new_pool->pool_list, &lov->lov_pool_list);
+	lov->lov_pool_count++;
+	spin_unlock(&obd->obd_dev_lock);
+
+	/* add to find only when it fully ready  */
+	rc = cfs_hash_add_unique(lov->lov_pools_hash_body, poolname,
+				 &new_pool->pool_hash);
+	if (rc)
+		GOTO(out_err, rc = -EEXIST);
+
+	CDEBUG(D_CONFIG, LOV_POOLNAMEF" is pool #%d\n",
+	       poolname, lov->lov_pool_count);
+
+	RETURN(0);
+
+out_err:
+	/* fully set up pool: unlink it and remove the proc entry */
+	spin_lock(&obd->obd_dev_lock);
+	list_del_init(&new_pool->pool_list);
+	lov->lov_pool_count--;
+	spin_unlock(&obd->obd_dev_lock);
+
+	lprocfs_remove(&new_pool->pool_proc_entry);
+
+	lov_ost_pool_free(&new_pool->pool_rr.lqr_pool);
+out_free_pool_obds:
+	lov_ost_pool_free(&new_pool->pool_obds);
+out_free_pool:
+	OBD_FREE_PTR(new_pool);
+	return rc;
+}
+
+/*
+ * Delete the pool named \a poolname from the LOV device \a obd.
+ *
+ * The pool is removed from the hash, its /proc entry (if any) is removed
+ * together with the reference held for it, it is unlinked from
+ * lov_pool_list, and one more reference is dropped.
+ *
+ * Returns 0 on success or -ENOENT when no such pool is hashed.
+ */
+int lov_pool_del(struct obd_device *obd, char *poolname)
+{
+	struct lov_obd *lov = &obd->u.lov;
+	struct pool_desc *pool;
+	ENTRY;
+
+	/* lookup and kill hash reference */
+	pool = cfs_hash_del_key(lov->lov_pools_hash_body, poolname);
+	if (!pool)
+		RETURN(-ENOENT);
+
+	if (pool->pool_proc_entry) {
+		CDEBUG(D_INFO, "proc entry %p\n", pool->pool_proc_entry);
+		lprocfs_remove(&pool->pool_proc_entry);
+		/* reference that was held on behalf of the proc file */
+		lov_pool_putref(pool);
+	}
+
+	spin_lock(&obd->obd_dev_lock);
+	lov->lov_pool_count--;
+	list_del_init(&pool->pool_list);
+	spin_unlock(&obd->obd_dev_lock);
+
+	/* release last reference */
+	lov_pool_putref(pool);
+
+	RETURN(0);
+}
+
+
+/*
+ * Add the OST whose UUID is \a ostname to the pool \a poolname of \a obd.
+ *
+ * The OST is searched by UUID in the LOV target array; its index is then
+ * appended to the pool's OST array and the pool's round-robin state is
+ * marked dirty.
+ *
+ * Returns 0 on success, -ENOENT when the pool does not exist, -EINVAL when
+ * the OST is not a target of this LOV, or the error from
+ * lov_ost_pool_add() (e.g. -EEXIST).
+ */
+int lov_pool_add(struct obd_device *obd, char *poolname, char *ostname)
+{
+	struct lov_obd *lov = &obd->u.lov;
+	struct obd_uuid ost_uuid;
+	struct pool_desc *pool;
+	unsigned int lov_idx;
+	int rc;
+	ENTRY;
+
+	/* the lookup takes a pool reference that is dropped at "out" */
+	pool = cfs_hash_lookup(lov->lov_pools_hash_body, poolname);
+	if (pool == NULL)
+		RETURN(-ENOENT);
+
+	obd_str2uuid(&ost_uuid, ostname);
+
+	/* search ost in lov array */
+	obd_getref(obd);
+	for (lov_idx = 0; lov_idx < lov->desc.ld_tgt_count; lov_idx++) {
+		struct lov_tgt_desc *tgt = lov->lov_tgts[lov_idx];
+
+		if (tgt != NULL && obd_uuid_equals(&ost_uuid, &tgt->ltd_uuid))
+			break;
+	}
+	/* test if ost found in lov */
+	if (lov_idx == lov->desc.ld_tgt_count)
+		GOTO(out, rc = -EINVAL);
+
+	rc = lov_ost_pool_add(&pool->pool_obds, lov_idx, lov->lov_tgt_size);
+	if (rc != 0)
+		GOTO(out, rc);
+
+	pool->pool_rr.lqr_dirty = 1;
+
+	CDEBUG(D_CONFIG, "Added %s to "LOV_POOLNAMEF" as member %d\n",
+	       ostname, poolname,  pool_tgt_count(pool));
+
+	EXIT;
+out:
+	obd_putref(obd);
+	lov_pool_putref(pool);
+	return rc;
+}
+
+/*
+ * Remove the OST whose UUID is \a ostname from the pool \a poolname of
+ * \a obd and mark the pool's round-robin state dirty.
+ *
+ * Returns 0 on success, -ENOENT when the pool does not exist, or -EINVAL
+ * when the OST is not a target of this LOV.
+ *
+ * NOTE(review): the result of lov_ost_pool_remove() is ignored, so removing
+ * an OST that is a LOV target but not a pool member still returns 0 -
+ * confirm callers rely on that.
+ */
+int lov_pool_remove(struct obd_device *obd, char *poolname, char *ostname)
+{
+	struct lov_obd *lov = &obd->u.lov;
+	struct obd_uuid ost_uuid;
+	struct pool_desc *pool;
+	unsigned int lov_idx;
+	int rc = 0;
+	ENTRY;
+
+	/* the lookup takes a pool reference that is dropped at "out" */
+	pool = cfs_hash_lookup(lov->lov_pools_hash_body, poolname);
+	if (pool == NULL)
+		RETURN(-ENOENT);
+
+	obd_str2uuid(&ost_uuid, ostname);
+
+	obd_getref(obd);
+	/* search ost in lov array, to get index */
+	for (lov_idx = 0; lov_idx < lov->desc.ld_tgt_count; lov_idx++) {
+		struct lov_tgt_desc *tgt = lov->lov_tgts[lov_idx];
+
+		if (tgt != NULL && obd_uuid_equals(&ost_uuid, &tgt->ltd_uuid))
+			break;
+	}
+
+	/* test if ost found in lov */
+	if (lov_idx == lov->desc.ld_tgt_count)
+		GOTO(out, rc = -EINVAL);
+
+	lov_ost_pool_remove(&pool->pool_obds, lov_idx);
+
+	pool->pool_rr.lqr_dirty = 1;
+
+	CDEBUG(D_CONFIG, "%s removed from "LOV_POOLNAMEF"\n", ostname,
+	       poolname);
+
+	EXIT;
+out:
+	obd_putref(obd);
+	lov_pool_putref(pool);
+	return rc;
+}
+
+/*
+ * Tell whether OST index \a idx is a member of \a pool.
+ *
+ * Returns 0 when it is, -ENOENT when it is not.  The member array is
+ * scanned under the pool's read semaphore.
+ */
+int lov_check_index_in_pool(__u32 idx, struct pool_desc *pool)
+{
+	int pos, rc;
+	ENTRY;
+
+	/* the caller may not hold a reference on the pool if it got the
+	 * pool without calling lov_find_pool() (e.g. by walking the lov
+	 * pool list), so take one for the duration of the scan
+	 */
+	lov_pool_getref(pool);
+
+	down_read(&pool_tgt_rw_sem(pool));
+
+	for (pos = 0; pos < pool_tgt_count(pool); pos++)
+		if (pool_tgt_array(pool)[pos] == idx)
+			GOTO(out, rc = 0);
+
+	rc = -ENOENT;
+	EXIT;
+out:
+	up_read(&pool_tgt_rw_sem(pool));
+
+	lov_pool_putref(pool);
+	return rc;
+}
+
+/*
+ * Look up the pool named \a poolname in \a lov.
+ *
+ * Returns the pool (with the reference taken by the hash lookup) or NULL
+ * when \a poolname is empty, the pool is unknown, or the pool has no
+ * members; the last two cases are logged with CWARN() and, for an empty
+ * pool, the lookup reference is dropped again.
+ */
+struct pool_desc *lov_find_pool(struct lov_obd *lov, char *poolname)
+{
+	struct pool_desc *pool;
+
+	/* an empty name means "no pool requested" */
+	if (poolname[0] == '\0')
+		return NULL;
+
+	pool = cfs_hash_lookup(lov->lov_pools_hash_body, poolname);
+	if (pool == NULL) {
+		CWARN("Request for an unknown pool ("LOV_POOLNAMEF")\n",
+		      poolname);
+		return NULL;
+	}
+
+	if (pool_tgt_count(pool) == 0) {
+		CWARN("Request for an empty pool ("LOV_POOLNAMEF")\n",
+		       poolname);
+		/* pool is ignored, so we remove ref on it */
+		lov_pool_putref(pool);
+		return NULL;
+	}
+
+	return pool;
+}

diff --git a/drivers/staging/lustre/lustre/lov/lov_request.c b/drivers/staging/lustre/lustre/lov/lov_request.c
new file mode 100644
index 0000000..13f1637
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lov/lov_request.c

@@ -0,0 +1,1551 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include <linux/libcfs/libcfs.h>
+
+#include <obd_class.h>
+#include <obd_lov.h>
+#include <lustre/lustre_idl.h>
+
+#include "lov_internal.h"
+
+/*
+ * Put a freshly allocated request set into its initial state: no requests,
+ * all counters at zero, a single reference, and the list head, wait queue
+ * and spinlock initialised.
+ */
+static void lov_init_set(struct lov_request_set *set)
+{
+	/* plain members */
+	set->set_count = 0;
+	set->set_cookies = 0;
+
+	/* counters; the creator owns the only reference */
+	atomic_set(&set->set_refcount, 1);
+	atomic_set(&set->set_completes, 0);
+	atomic_set(&set->set_success, 0);
+	atomic_set(&set->set_finish_checked, 0);
+
+	/* containers and synchronisation primitives */
+	INIT_LIST_HEAD(&set->set_list);
+	init_waitqueue_head(&set->set_waitq);
+	spin_lock_init(&set->set_lock);
+}
+
+/*
+ * Tear down a request set: every request on set_list is unlinked and freed
+ * together with its obdo, stripe md buffer and statfs buffer (whichever are
+ * present), then the page array is freed, the lock-handle reference is
+ * dropped and the set itself is freed.
+ */
+void lov_finish_set(struct lov_request_set *set)
+{
+	struct lov_request *req, *next;
+	ENTRY;
+
+	LASSERT(set);
+	list_for_each_entry_safe(req, next, &set->set_list, rq_link) {
+		list_del_init(&req->rq_link);
+
+		if (req->rq_oi.oi_oa)
+			OBDO_FREE(req->rq_oi.oi_oa);
+		if (req->rq_oi.oi_md)
+			OBD_FREE_LARGE(req->rq_oi.oi_md, req->rq_buflen);
+		if (req->rq_oi.oi_osfs)
+			OBD_FREE(req->rq_oi.oi_osfs,
+				 sizeof(*req->rq_oi.oi_osfs));
+		OBD_FREE(req, sizeof(*req));
+	}
+
+	if (set->set_pga) {
+		int len = set->set_oabufs * sizeof(*set->set_pga);
+		OBD_FREE_LARGE(set->set_pga, len);
+	}
+	if (set->set_lockh)
+		lov_llh_put(set->set_lockh);
+
+	OBD_FREE(set, sizeof(*set));
+	EXIT;
+}
+
+/*
+ * Check whether every request of \a set has completed.
+ *
+ * Returns 0 while set_completes differs from set_count.  Once they match,
+ * an \a idempotent caller always gets 1; any other caller gets 1 only if it
+ * is the first to bump set_finish_checked, and 0 afterwards.
+ */
+int lov_set_finished(struct lov_request_set *set, int idempotent)
+{
+	int completes = atomic_read(&set->set_completes);
+
+	CDEBUG(D_INFO, "check set %d/%d\n", completes, set->set_count);
+
+	if (completes != set->set_count)
+		return 0;
+	if (idempotent)
+		return 1;
+
+	return atomic_inc_return(&set->set_finish_checked) == 1;
+}
+
+/*
+ * Record the completion of \a req with result \a rc: the request is marked
+ * complete, the set's completion counter (and, for rc == 0, its success
+ * counter) is incremented and waiters on set_waitq are woken.
+ */
+void lov_update_set(struct lov_request_set *set,
+		    struct lov_request *req, int rc)
+{
+	req->rq_rc = rc;
+	req->rq_complete = 1;
+
+	atomic_inc(&set->set_completes);
+	if (!rc)
+		atomic_inc(&set->set_success);
+
+	wake_up(&set->set_waitq);
+}
+
+/*
+ * Record the completion of \a req in \a set (see lov_update_set()) and
+ * return \a rc, except that an error from a target that is missing or
+ * inactive is forgiven and 0 is returned instead.
+ */
+int lov_update_common_set(struct lov_request_set *set,
+			  struct lov_request *req, int rc)
+{
+	struct lov_obd *lov = &set->set_exp->exp_obd->u.lov;
+	ENTRY;
+
+	lov_update_set(set, req, rc);
+
+	/* grace error on inactive ost */
+	if (rc != 0) {
+		struct lov_tgt_desc *tgt = lov->lov_tgts[req->rq_idx];
+
+		if (tgt == NULL || !tgt->ltd_active)
+			rc = 0;
+	}
+
+	/* FIXME in raid1 regime, should return 0 */
+	RETURN(rc);
+}
+
+/*
+ * Append \a req to the request list of \a set, count it and make it point
+ * back at the set.
+ */
+void lov_set_add_req(struct lov_request *req, struct lov_request_set *set)
+{
+	req->rq_rqset = set;
+	list_add_tail(&req->rq_link, &set->set_list);
+	set->set_count++;
+}
+
+/*
+ * Wait condition used by lov_check_and_wait_active().
+ *
+ * Returns 1 when target \a idx no longer needs to be waited for: its slot
+ * is empty, it is active, or its export's import has imp_connect_tried
+ * set.  Returns 0 otherwise.  Evaluated under lov_lock.
+ */
+static int lov_check_set(struct lov_obd *lov, int idx)
+{
+	struct lov_tgt_desc *tgt;
+	int rc;
+
+	mutex_lock(&lov->lov_lock);
+
+	tgt = lov->lov_tgts[idx];
+	rc = tgt == NULL || tgt->ltd_active ||
+	     (tgt->ltd_exp != NULL &&
+	      class_exp2cliimp(tgt->ltd_exp)->imp_connect_tried);
+
+	mutex_unlock(&lov->lov_lock);
+	return rc;
+}
+
+/* Check if the OSC connection exists and is active.
+ * If the OSC has not yet had a chance to connect to the OST the first time,
+ * wait once for it to connect instead of returning an error.
+ *
+ * Returns 1 when target \a ost_idx is active, 0 otherwise (empty slot,
+ * connection already tried without becoming active, or still inactive
+ * after the wait).
+ */
+int lov_check_and_wait_active(struct lov_obd *lov, int ost_idx)
+{
+	wait_queue_head_t waitq;
+	struct l_wait_info lwi;
+	struct lov_tgt_desc *tgt;
+	int rc = 0;
+
+	mutex_lock(&lov->lov_lock);
+
+	tgt = lov->lov_tgts[ost_idx];
+
+	/* no target in this slot */
+	if (unlikely(tgt == NULL))
+		GOTO(out, rc = 0);
+
+	/* common case: already active, no need to wait */
+	if (likely(tgt->ltd_active))
+		GOTO(out, rc = 1);
+
+	/* a connection attempt was already made: do not wait again */
+	if (tgt->ltd_exp && class_exp2cliimp(tgt->ltd_exp)->imp_connect_tried)
+		GOTO(out, rc = 0);
+
+	/* lov_check_set() takes lov_lock itself, so drop it before waiting */
+	mutex_unlock(&lov->lov_lock);
+
+	/* waitq is local to this call and is not handed to anything else
+	 * in this function; the wait is set up with an obd_timeout-second
+	 * timeout and a 1-second interval and evaluates lov_check_set() */
+	init_waitqueue_head(&waitq);
+	lwi = LWI_TIMEOUT_INTERVAL(cfs_time_seconds(obd_timeout),
+				   cfs_time_seconds(1), NULL, NULL);
+
+	/* the result of the wait is not used; the target state decides */
+	rc = l_wait_event(waitq, lov_check_set(lov, ost_idx), &lwi);
+	/* NOTE(review): tgt was read under lov_lock before the wait and is
+	 * dereferenced here without the lock - confirm the target cannot
+	 * be freed in between */
+	if (tgt != NULL && tgt->ltd_active)
+		return 1;
+
+	return 0;
+
+out:
+	mutex_unlock(&lov->lov_lock);
+	return rc;
+}
+
+extern void osc_update_enqueue(struct lustre_handle *lov_lockhp,
+			       struct lov_oinfo *loi, int flags,
+			       struct ost_lvb *lvb, __u32 mode, int rc);
+
+/*
+ * LOV-level post-processing of one stripe's enqueue result \a rc.
+ *
+ * ELDLM_OK, and ELDLM_LOCK_ABORTED for an intent request, are passed
+ * through untouched.  For any other result the stripe's lock handle is
+ * cleared; the error is then returned as is when target \a idx is active
+ * (and logged unless it is -EINTR or -EUSERS), or turned into ELDLM_OK when
+ * the target is missing or inactive.
+ */
+static int lov_update_enqueue_lov(struct obd_export *exp,
+				  struct lustre_handle *lov_lockhp,
+				  struct lov_oinfo *loi, int flags, int idx,
+				  struct ost_id *oi, int rc)
+{
+	struct lov_obd *lov = &exp->exp_obd->u.lov;
+
+	/* nothing to fix up for success or an aborted intent lock */
+	if (rc == ELDLM_OK ||
+	    (rc == ELDLM_LOCK_ABORTED && (flags & LDLM_FL_HAS_INTENT)))
+		return rc;
+
+	memset(lov_lockhp, 0, sizeof(*lov_lockhp));
+
+	/* errors from a missing or inactive target are not reported */
+	if (!(lov->lov_tgts[idx] && lov->lov_tgts[idx]->ltd_active))
+		return ELDLM_OK;
+
+	/* -EUSERS used by OST to report file contention */
+	if (rc != -EINTR && rc != -EUSERS)
+		CERROR("%s: enqueue objid "DOSTID" subobj"
+		       DOSTID" on OST idx %d: rc %d\n",
+		       exp->exp_obd->obd_name,
+		       POSTID(oi), POSTID(&loi->loi_oi),
+		       loi->loi_ost_idx, rc);
+	return rc;
+}
+
+/*
+ * Process the enqueue result \a rc of one stripe request \a req.
+ *
+ * Under the stripe lock of the file's md, the OSC-level update is applied
+ * (osc_update_enqueue()), the stripe's lock handle is cleared for an
+ * aborted intent lock, and the result is filtered through
+ * lov_update_enqueue_lov().  The request is then marked complete in its
+ * set with the filtered result, which is also returned.
+ */
+int lov_update_enqueue_set(struct lov_request *req, __u32 mode, int rc)
+{
+	struct lov_request_set *set = req->rq_rqset;
+	struct lustre_handle *lov_lockhp;
+	struct obd_info *oi = set->set_oi;
+	struct lov_oinfo *loi;
+	ENTRY;
+
+	LASSERT(oi != NULL);
+
+	/* lock handle and object info of the stripe this request covers */
+	lov_lockhp = set->set_lockh->llh_handles + req->rq_stripe;
+	loi = oi->oi_md->lsm_oinfo[req->rq_stripe];
+
+	/* XXX LOV STACKING: OSC gets a copy, created in lov_prep_enqueue_set
+	 * and that copy can be arbitrarily out of date.
+	 *
+	 * The LOV API is due for a serious rewriting anyways, and this
+	 * can be addressed then. */
+
+	lov_stripe_lock(oi->oi_md);
+	osc_update_enqueue(lov_lockhp, loi, oi->oi_flags,
+			   &req->rq_oi.oi_md->lsm_oinfo[0]->loi_lvb, mode, rc);
+	/* an aborted intent enqueue leaves no lock behind: forget the handle */
+	if (rc == ELDLM_LOCK_ABORTED && (oi->oi_flags & LDLM_FL_HAS_INTENT))
+		memset(lov_lockhp, 0, sizeof *lov_lockhp);
+	rc = lov_update_enqueue_lov(set->set_exp, lov_lockhp, loi, oi->oi_flags,
+				    req->rq_idx, &oi->oi_md->lsm_oi, rc);
+	lov_stripe_unlock(oi->oi_md);
+	lov_update_set(set, req, rc);
+	RETURN(rc);
+}
+
+/*
+ * The callback for osc_enqueue that updates lov info for every OSC request.
+ *
+ * \a cookie is the obd_info embedded in a lov_request (rq_oi); the lock
+ * mode is taken from the enqueue info of the request's set.
+ */
+static int cb_update_enqueue(void *cookie, int rc)
+{
+	struct obd_info *oinfo = cookie;
+	struct lov_request *lovreq =
+		container_of(oinfo, struct lov_request, rq_oi);
+
+	return lov_update_enqueue_set(lovreq,
+				      lovreq->rq_rqset->set_ei->ei_mode, rc);
+}
+
+/*
+ * Finish a synchronous enqueue on \a set.
+ *
+ * If at least one request completed and all completed requests succeeded,
+ * there is nothing to do and 0 is returned.  Otherwise every lock that was
+ * obtained by a successfully completed request is cancelled again with
+ * obd_cancel(), and the set's reference on the lock handles is dropped.
+ *
+ * Returns the result of the last obd_cancel() that was issued (0 if none).
+ */
+static int enqueue_done(struct lov_request_set *set, __u32 mode)
+{
+	struct lov_request *req;
+	struct lov_obd *lov = &set->set_exp->exp_obd->u.lov;
+	int completes = atomic_read(&set->set_completes);
+	int rc = 0;
+	ENTRY;
+
+	/* enqueue/match success, just return */
+	if (completes && completes == atomic_read(&set->set_success))
+		RETURN(0);
+
+	/* cancel enqueued/matched locks */
+	list_for_each_entry(req, &set->set_list, rq_link) {
+		struct lustre_handle *lov_lockhp;
+
+		/* only requests that completed successfully hold a lock */
+		if (!req->rq_complete || req->rq_rc)
+			continue;
+
+		lov_lockhp = set->set_lockh->llh_handles + req->rq_stripe;
+		LASSERT(lov_lockhp);
+		if (!lustre_handle_is_used(lov_lockhp))
+			continue;
+
+		/* the target slot may be empty (it is checked for NULL after
+		 * the cancel as well); without a target there is no export
+		 * to send the cancel to */
+		if (lov->lov_tgts[req->rq_idx] == NULL)
+			continue;
+
+		rc = obd_cancel(lov->lov_tgts[req->rq_idx]->ltd_exp,
+				req->rq_oi.oi_md, mode, lov_lockhp);
+		if (rc && lov->lov_tgts[req->rq_idx] &&
+		    lov->lov_tgts[req->rq_idx]->ltd_active)
+			CERROR("%s: cancelling objid "DOSTID" on OST "
+			       "idx %d error: rc = %d\n",
+			       set->set_exp->exp_obd->obd_name,
+			       POSTID(&req->rq_oi.oi_md->lsm_oi),
+			       req->rq_idx, rc);
+	}
+	if (set->set_lockh)
+		lov_llh_put(set->set_lockh);
+	RETURN(rc);
+}
+
+int lov_fini_enqueue_set(struct lov_request_set *set, __u32 mode, int rc,
+			 struct ptlrpc_request_set *rqset)
+{
+	int ret = 0;
+	ENTRY;
+	/* Tear down an enqueue set; a NULL set is a no-op that returns 0. */
+	if (set == NULL)
+		RETURN(0);
+	LASSERT(set->set_exp);
+	/* Sync case (no rqset): run enqueue_done(); a non-zero rc first zeroes
+	 * set_completes so that its all-succeeded shortcut cannot trigger. */
+	if (!rqset) {
+		if (rc)
+			atomic_set(&set->set_completes, 0);
+		ret = enqueue_done(set, mode);
+	} else if (set->set_lockh)
+		lov_llh_put(set->set_lockh);
+
+	lov_put_reqset(set);
+	/* the caller's rc takes precedence over enqueue_done()'s result */
+	RETURN(rc ? rc : ret);
+}
+
+static void lov_llh_addref(void *llhp)
+{
+	struct lov_lock_handles *handles = llhp;
+	atomic_t *refs = &handles->llh_refcount;
+	/* take one more reference, then trace the resulting count */
+	atomic_inc(refs);
+	CDEBUG(D_INFO, "GETting llh %p : new refcount %d\n", handles,
+	       atomic_read(refs));
+}
+
+static struct portals_handle_ops lov_handle_ops = {
+	.hop_addref = lov_llh_addref,	/* increments llh_refcount */
+	.hop_free   = NULL,		/* no free hook is installed */
+};
+
+static struct lov_lock_handles *lov_llh_new(struct lov_stripe_md *lsm)
+{
+	struct lov_lock_handles *llh;
+	/* Allocate a handle set with one llh_handles[] slot per stripe of lsm. */
+	OBD_ALLOC(llh, sizeof *llh +
+		  sizeof(*llh->llh_handles) * lsm->lsm_stripe_count);
+	if (llh == NULL)
+		return NULL;	/* allocation failed */
+	/* NOTE(review): starts with 2 refs - presumably caller + handle hash */
+	atomic_set(&llh->llh_refcount, 2);
+	llh->llh_stripe_count = lsm->lsm_stripe_count;
+	INIT_LIST_HEAD(&llh->llh_handle.h_link);
+	class_handle_hash(&llh->llh_handle, &lov_handle_ops);
+
+	return llh;
+}
+
+int lov_prep_enqueue_set(struct obd_export *exp, struct obd_info *oinfo,
+			 struct ldlm_enqueue_info *einfo,
+			 struct lov_request_set **reqset)
+{
+	struct lov_obd *lov = &exp->exp_obd->u.lov;
+	struct lov_request_set *set;
+	int i, rc = 0;
+	ENTRY;
+	/* Build a sub-request for each active stripe the lock extent touches. */
+	OBD_ALLOC(set, sizeof(*set));
+	if (set == NULL)
+		RETURN(-ENOMEM);
+	lov_init_set(set);
+
+	set->set_exp = exp;
+	set->set_oi = oinfo;
+	set->set_ei = einfo;
+	set->set_lockh = lov_llh_new(oinfo->oi_md);
+	if (set->set_lockh == NULL)
+		GOTO(out_set, rc = -ENOMEM);
+	oinfo->oi_lockh->cookie = set->set_lockh->llh_handle.h_cookie;
+
+	for (i = 0; i < oinfo->oi_md->lsm_stripe_count; i++) {
+		struct lov_oinfo *loi;
+		struct lov_request *req;
+		obd_off start, end;
+
+		loi = oinfo->oi_md->lsm_oinfo[i];
+		if (!lov_stripe_intersects(oinfo->oi_md, i,
+					   oinfo->oi_policy.l_extent.start,
+					   oinfo->oi_policy.l_extent.end,
+					   &start, &end))
+			continue;
+		/* an inactive target is skipped here, not treated as an error */
+		if (!lov_check_and_wait_active(lov, loi->loi_ost_idx)) {
+			CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx);
+			continue;
+		}
+
+		OBD_ALLOC(req, sizeof(*req));
+		if (req == NULL)
+			GOTO(out_set, rc = -ENOMEM);
+		/* one buffer: stripe md + one lsm_oinfo[] slot + one lov_oinfo */
+		req->rq_buflen = sizeof(*req->rq_oi.oi_md) +
+			sizeof(struct lov_oinfo *) +
+			sizeof(struct lov_oinfo);
+		OBD_ALLOC_LARGE(req->rq_oi.oi_md, req->rq_buflen);
+		if (req->rq_oi.oi_md == NULL) {
+			OBD_FREE(req, sizeof(*req));
+			GOTO(out_set, rc = -ENOMEM);
+		}
+		req->rq_oi.oi_md->lsm_oinfo[0] =
+			((void *)req->rq_oi.oi_md) + sizeof(*req->rq_oi.oi_md) +
+			sizeof(struct lov_oinfo *);
+
+		/* Set lov request specific parameters. */
+		req->rq_oi.oi_lockh = set->set_lockh->llh_handles + i;
+		req->rq_oi.oi_cb_up = cb_update_enqueue;
+		req->rq_oi.oi_flags = oinfo->oi_flags;
+
+		LASSERT(req->rq_oi.oi_lockh);
+		/* same gid as the caller, extent clipped to this stripe */
+		req->rq_oi.oi_policy.l_extent.gid =
+			oinfo->oi_policy.l_extent.gid;
+		req->rq_oi.oi_policy.l_extent.start = start;
+		req->rq_oi.oi_policy.l_extent.end = end;
+
+		req->rq_idx = loi->loi_ost_idx;
+		req->rq_stripe = i;
+
+		/* XXX LOV STACKING: submd should be from the subobj */
+		req->rq_oi.oi_md->lsm_oi = loi->loi_oi;
+		req->rq_oi.oi_md->lsm_stripe_count = 0;
+		req->rq_oi.oi_md->lsm_oinfo[0]->loi_kms_valid =
+			loi->loi_kms_valid;
+		req->rq_oi.oi_md->lsm_oinfo[0]->loi_kms = loi->loi_kms;
+		req->rq_oi.oi_md->lsm_oinfo[0]->loi_lvb = loi->loi_lvb;
+
+		lov_set_add_req(req, set);
+	}
+	if (!set->set_count)
+		GOTO(out_set, rc = -EIO);	/* no sub-request was built */
+	*reqset = set;
+	RETURN(0);
+out_set:
+	lov_fini_enqueue_set(set, einfo->ei_mode, rc, NULL);
+	RETURN(rc);
+}
+
+int lov_fini_match_set(struct lov_request_set *set, __u32 mode, int flags)
+{
+	int rc = 0;
+	ENTRY;
+	/* Finish a match set; a NULL set is a no-op that returns 0. */
+	if (set == NULL)
+		RETURN(0);
+	LASSERT(set->set_exp);
+	rc = enqueue_done(set, mode);
+	if ((set->set_count == atomic_read(&set->set_success)) &&
+	    (flags & LDLM_FL_TEST_LOCK))
+		lov_llh_put(set->set_lockh);	/* all succeeded + TEST_LOCK */
+
+	lov_put_reqset(set);
+
+	RETURN(rc);	/* whatever enqueue_done() returned */
+}
+
+int lov_prep_match_set(struct obd_export *exp, struct obd_info *oinfo,
+		       struct lov_stripe_md *lsm, ldlm_policy_data_t *policy,
+		       __u32 mode, struct lustre_handle *lockh,
+		       struct lov_request_set **reqset)
+{
+	struct lov_obd *lov = &exp->exp_obd->u.lov;
+	struct lov_request_set *set;
+	int i, rc = 0;
+	ENTRY;
+	/* Build one match sub-request per stripe the policy extent touches. */
+	OBD_ALLOC(set, sizeof(*set));
+	if (set == NULL)
+		RETURN(-ENOMEM);
+	lov_init_set(set);
+
+	set->set_exp = exp;
+	set->set_oi = oinfo;
+	set->set_oi->oi_md = lsm;	/* overwrites the caller's oinfo->oi_md */
+	set->set_lockh = lov_llh_new(lsm);
+	if (set->set_lockh == NULL)
+		GOTO(out_set, rc = -ENOMEM);
+	lockh->cookie = set->set_lockh->llh_handle.h_cookie;
+
+	for (i = 0; i < lsm->lsm_stripe_count; i++){
+		struct lov_oinfo *loi;
+		struct lov_request *req;
+		obd_off start, end;
+
+		loi = lsm->lsm_oinfo[i];
+		if (!lov_stripe_intersects(lsm, i, policy->l_extent.start,
+					   policy->l_extent.end, &start, &end))
+			continue;
+		/* an inactive target fails the whole match with -EIO */
+		/* FIXME raid1 should grace this error */
+		if (!lov_check_and_wait_active(lov, loi->loi_ost_idx)) {
+			CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx);
+			GOTO(out_set, rc = -EIO);
+		}
+
+		OBD_ALLOC(req, sizeof(*req));
+		if (req == NULL)
+			GOTO(out_set, rc = -ENOMEM);
+
+		req->rq_buflen = sizeof(*req->rq_oi.oi_md);
+		OBD_ALLOC_LARGE(req->rq_oi.oi_md, req->rq_buflen);
+		if (req->rq_oi.oi_md == NULL) {
+			OBD_FREE(req, sizeof(*req));
+			GOTO(out_set, rc = -ENOMEM);
+		}
+		/* extent clipped to this stripe, gid taken from the caller */
+		req->rq_oi.oi_policy.l_extent.start = start;
+		req->rq_oi.oi_policy.l_extent.end = end;
+		req->rq_oi.oi_policy.l_extent.gid = policy->l_extent.gid;
+
+		req->rq_idx = loi->loi_ost_idx;
+		req->rq_stripe = i;
+
+		/* XXX LOV STACKING: submd should be from the subobj */
+		req->rq_oi.oi_md->lsm_oi = loi->loi_oi;
+		req->rq_oi.oi_md->lsm_stripe_count = 0;
+
+		lov_set_add_req(req, set);
+	}
+	if (!set->set_count)
+		GOTO(out_set, rc = -EIO);	/* no sub-request was built */
+	*reqset = set;
+	RETURN(rc);
+out_set:
+	lov_fini_match_set(set, mode, 0);	/* flags == 0 */
+	RETURN(rc);
+}
+
+int lov_fini_cancel_set(struct lov_request_set *set)
+{
+	ENTRY;
+
+	/* Release a cancel set: put its lock-handle set (if any), then the set. */
+	if (set != NULL) {
+		LASSERT(set->set_exp);
+		if (set->set_lockh != NULL)
+			lov_llh_put(set->set_lockh);
+
+		lov_put_reqset(set);
+	}
+
+	/* there is no failure path: the result is always 0 */
+	RETURN(0);
+}
+
+int lov_prep_cancel_set(struct obd_export *exp, struct obd_info *oinfo,
+			struct lov_stripe_md *lsm, __u32 mode,	/* mode: unused */
+			struct lustre_handle *lockh,
+			struct lov_request_set **reqset)
+{
+	struct lov_request_set *set;
+	int i, rc = 0;
+	ENTRY;
+	/* Build a cancel request for every stripe holding a used lock handle. */
+	OBD_ALLOC(set, sizeof(*set));
+	if (set == NULL)
+		RETURN(-ENOMEM);
+	lov_init_set(set);
+
+	set->set_exp = exp;
+	set->set_oi = oinfo;
+	set->set_oi->oi_md = lsm;	/* overwrites the caller's oinfo->oi_md */
+	set->set_lockh = lov_handle2llh(lockh);
+	if (set->set_lockh == NULL) {
+		CERROR("LOV: invalid lov lock handle %p\n", lockh);
+		GOTO(out_set, rc = -EINVAL);
+	}
+	lockh->cookie = set->set_lockh->llh_handle.h_cookie;
+
+	for (i = 0; i < lsm->lsm_stripe_count; i++){
+		struct lov_request *req;
+		struct lustre_handle *lov_lockhp;
+		struct lov_oinfo *loi = lsm->lsm_oinfo[i];
+		/* stripes without a used sub-lock handle need no request */
+		lov_lockhp = set->set_lockh->llh_handles + i;
+		if (!lustre_handle_is_used(lov_lockhp)) {
+			CDEBUG(D_INFO, "lov idx %d subobj "DOSTID" no lock\n",
+			       loi->loi_ost_idx, POSTID(&loi->loi_oi));
+			continue;
+		}
+
+		OBD_ALLOC(req, sizeof(*req));
+		if (req == NULL)
+			GOTO(out_set, rc = -ENOMEM);
+
+		req->rq_buflen = sizeof(*req->rq_oi.oi_md);
+		OBD_ALLOC_LARGE(req->rq_oi.oi_md, req->rq_buflen);
+		if (req->rq_oi.oi_md == NULL) {
+			OBD_FREE(req, sizeof(*req));
+			GOTO(out_set, rc = -ENOMEM);
+		}
+
+		req->rq_idx = loi->loi_ost_idx;
+		req->rq_stripe = i;
+
+		/* XXX LOV STACKING: submd should be from the subobj */
+		req->rq_oi.oi_md->lsm_oi = loi->loi_oi;
+		req->rq_oi.oi_md->lsm_stripe_count = 0;
+
+		lov_set_add_req(req, set);
+	}
+	if (!set->set_count)
+		GOTO(out_set, rc = -EIO);	/* no sub-request was built */
+	*reqset = set;
+	RETURN(rc);
+out_set:
+	lov_fini_cancel_set(set);
+	RETURN(rc);
+}
+static int common_attr_done(struct lov_request_set *set)
+{
+	struct list_head *pos;
+	struct lov_request *req;
+	struct obdo *tmp_oa;
+	int rc = 0, attrset = 0;
+	ENTRY;
+	/* Merge attrs of the successful requests into set->set_oi->oi_oa. */
+	LASSERT(set->set_oi != NULL);
+
+	if (set->set_oi->oi_oa == NULL)
+		RETURN(0);	/* the caller supplied no obdo to fill in */
+
+	if (!atomic_read(&set->set_success))
+		RETURN(-EIO);	/* set_success is 0: nothing to merge */
+
+	OBDO_ALLOC(tmp_oa);
+	if (tmp_oa == NULL)
+		GOTO(out, rc = -ENOMEM);
+	/* accumulate into tmp_oa; attrset is updated by lov_merge_attrs() */
+	list_for_each (pos, &set->set_list) {
+		req = list_entry(pos, struct lov_request, rq_link);
+
+		if (!req->rq_complete || req->rq_rc)
+			continue;
+		if (req->rq_oi.oi_oa->o_valid == 0)   /* inactive stripe */
+			continue;
+		lov_merge_attrs(tmp_oa, req->rq_oi.oi_oa,
+				req->rq_oi.oi_oa->o_valid,
+				set->set_oi->oi_md, req->rq_stripe, &attrset);
+	}
+	if (!attrset) {
+		CERROR("No stripes had valid attrs\n");
+		rc = -EIO;	/* NOTE(review): falls through, copy still runs */
+	}
+	if ((set->set_oi->oi_oa->o_valid & OBD_MD_FLEPOCH) &&
+	    (set->set_oi->oi_md->lsm_stripe_count != attrset)) {
+		/* When we take attributes of some epoch, we require all the
+		 * ost to be active. */
+		CERROR("Not all the stripes had valid attrs\n");
+		GOTO(out, rc = -EIO);
+	}
+	/* keep the caller's o_oi, overwrite the rest with the merged obdo */
+	tmp_oa->o_oi = set->set_oi->oi_oa->o_oi;
+	memcpy(set->set_oi->oi_oa, tmp_oa, sizeof(*set->set_oi->oi_oa));
+out:
+	if (tmp_oa)
+		OBDO_FREE(tmp_oa);
+	RETURN(rc);
+
+}
+
+static int brw_done(struct lov_request_set *set)
+{
+	struct lov_stripe_md *lsm = set->set_oi->oi_md;
+	struct lov_request *req;
+	ENTRY;
+
+	/*
+	 * For each request that completed without error, store the block
+	 * count its obdo reports (when flagged valid) in the stripe's loi_lvb.
+	 */
+	list_for_each_entry(req, &set->set_list, rq_link) {
+		struct obdo *oa = req->rq_oi.oi_oa;
+
+		if (req->rq_complete && !req->rq_rc &&
+		    (oa->o_valid & OBD_MD_FLBLOCKS))
+			lsm->lsm_oinfo[req->rq_stripe]->loi_lvb.lvb_blocks =
+				oa->o_blocks;
+	}
+
+	/* always reports success */
+	RETURN(0);
+}
+
+int lov_fini_brw_set(struct lov_request_set *set)
+{
+	int rc = 0;
+	ENTRY;
+	/* Finish a brw set; a NULL set is a no-op that returns 0. */
+	if (set == NULL)
+		RETURN(0);
+	LASSERT(set->set_exp);
+	if (atomic_read(&set->set_completes)) {	/* non-zero: apply results */
+		rc = brw_done(set);
+		/* FIXME update qos data here */
+	}
+	lov_put_reqset(set);
+
+	RETURN(rc);
+}
+
+int lov_prep_brw_set(struct obd_export *exp, struct obd_info *oinfo,
+		     obd_count oa_bufs, struct brw_page *pga,
+		     struct obd_trans_info *oti,
+		     struct lov_request_set **reqset)
+{
+	struct {
+		obd_count       index;	/* first set_pga[] slot of the stripe */
+		obd_count       count;	/* pages that fall in the stripe */
+		obd_count       off;	/* slots of the stripe filled so far */
+	} *info = NULL;
+	struct lov_request_set *set;
+	struct lov_obd *lov = &exp->exp_obd->u.lov;
+	int rc = 0, i, shift;
+	ENTRY;
+	/* Split the pga[] pages by stripe into one sub-request per stripe. */
+	OBD_ALLOC(set, sizeof(*set));
+	if (set == NULL)
+		RETURN(-ENOMEM);
+	lov_init_set(set);
+
+	set->set_exp = exp;
+	set->set_oti = oti;
+	set->set_oi = oinfo;
+	set->set_oabufs = oa_bufs;
+	OBD_ALLOC_LARGE(set->set_pga, oa_bufs * sizeof(*set->set_pga));
+	if (!set->set_pga)
+		GOTO(out, rc = -ENOMEM);
+	/* one bookkeeping record per stripe, freed again at "out" */
+	OBD_ALLOC_LARGE(info, sizeof(*info) * oinfo->oi_md->lsm_stripe_count);
+	if (!info)
+		GOTO(out, rc = -ENOMEM);
+
+	/* calculate the page count for each stripe */
+	for (i = 0; i < oa_bufs; i++) {
+		int stripe = lov_stripe_number(oinfo->oi_md, pga[i].off);
+		info[stripe].count++;
+	}
+
+	/* alloc and initialize lov request */
+	shift = 0;
+	for (i = 0; i < oinfo->oi_md->lsm_stripe_count; i++){
+		struct lov_oinfo *loi = NULL;
+		struct lov_request *req;
+
+		if (info[i].count == 0)
+			continue;	/* no page lands on this stripe */
+
+		loi = oinfo->oi_md->lsm_oinfo[i];
+		if (!lov_check_and_wait_active(lov, loi->loi_ost_idx)) {
+			CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx);
+			GOTO(out, rc = -EIO);
+		}
+
+		OBD_ALLOC(req, sizeof(*req));
+		if (req == NULL)
+			GOTO(out, rc = -ENOMEM);
+
+		OBDO_ALLOC(req->rq_oi.oi_oa);
+		if (req->rq_oi.oi_oa == NULL) {
+			OBD_FREE(req, sizeof(*req));
+			GOTO(out, rc = -ENOMEM);
+		}
+		/* start from the caller's obdo when one was supplied */
+		if (oinfo->oi_oa) {
+			memcpy(req->rq_oi.oi_oa, oinfo->oi_oa,
+			       sizeof(*req->rq_oi.oi_oa));
+		}
+		req->rq_oi.oi_oa->o_oi = loi->loi_oi;
+		req->rq_oi.oi_oa->o_stripe_idx = i;
+
+		req->rq_buflen = sizeof(*req->rq_oi.oi_md);
+		OBD_ALLOC_LARGE(req->rq_oi.oi_md, req->rq_buflen);
+		if (req->rq_oi.oi_md == NULL) {
+			OBDO_FREE(req->rq_oi.oi_oa);
+			OBD_FREE(req, sizeof(*req));
+			GOTO(out, rc = -ENOMEM);
+		}
+
+		req->rq_idx = loi->loi_ost_idx;
+		req->rq_stripe = i;
+
+		/* XXX LOV STACKING */
+		req->rq_oi.oi_md->lsm_oi = loi->loi_oi;
+		req->rq_oabufs = info[i].count;
+		req->rq_pgaidx = shift;
+		shift += req->rq_oabufs;
+
+		/* remember the index for sort brw_page array */
+		info[i].index = req->rq_pgaidx;
+
+		req->rq_oi.oi_capa = oinfo->oi_capa;
+
+		lov_set_add_req(req, set);
+	}
+	if (!set->set_count)
+		GOTO(out, rc = -EIO);	/* no sub-request was built */
+
+	/* rotate & sort the brw_page array */
+	for (i = 0; i < oa_bufs; i++) {
+		int stripe = lov_stripe_number(oinfo->oi_md, pga[i].off);
+		/* next free slot inside this stripe's range of set_pga[] */
+		shift = info[stripe].index + info[stripe].off;
+		LASSERT(shift < oa_bufs);
+		set->set_pga[shift] = pga[i];
+		lov_stripe_offset(oinfo->oi_md, pga[i].off, stripe,
+				  &set->set_pga[shift].off);
+		info[stripe].off++;
+	}
+out:
+	if (info)
+		OBD_FREE_LARGE(info,
+			       sizeof(*info) * oinfo->oi_md->lsm_stripe_count);
+	/* hand the set to the caller only on success, else tear it down */
+	if (rc == 0)
+		*reqset = set;
+	else
+		lov_fini_brw_set(set);
+
+	RETURN(rc);
+}
+
+int lov_fini_getattr_set(struct lov_request_set *set)
+{
+	int rc = 0;
+	ENTRY;
+	/* Finish a getattr set; a NULL set is a no-op that returns 0. */
+	if (set == NULL)
+		RETURN(0);
+	LASSERT(set->set_exp);
+	if (atomic_read(&set->set_completes))	/* non-zero: merge the attrs */
+		rc = common_attr_done(set);
+
+	lov_put_reqset(set);
+
+	RETURN(rc);	/* 0, or the result of common_attr_done() */
+}
+
+/* osc_getattr_async completion callback: maps the obd_info cookie back to
+ * its enclosing lov_request and passes rc to lov_update_common_set(). */
+static int cb_getattr_update(void *cookie, int rc)
+{
+	struct obd_info *sub_oinfo = cookie;
+	struct lov_request *sub_req =
+		container_of(sub_oinfo, struct lov_request, rq_oi);
+	return lov_update_common_set(sub_req->rq_rqset, sub_req, rc);
+}
+
+int lov_prep_getattr_set(struct obd_export *exp, struct obd_info *oinfo,
+			 struct lov_request_set **reqset)
+{
+	struct lov_request_set *set;
+	struct lov_obd *lov = &exp->exp_obd->u.lov;
+	int rc = 0, i;
+	ENTRY;
+	/* Build one getattr sub-request per active stripe of oinfo->oi_md. */
+	OBD_ALLOC(set, sizeof(*set));
+	if (set == NULL)
+		RETURN(-ENOMEM);
+	lov_init_set(set);
+
+	set->set_exp = exp;
+	set->set_oi = oinfo;
+
+	for (i = 0; i < oinfo->oi_md->lsm_stripe_count; i++) {
+		struct lov_oinfo *loi;
+		struct lov_request *req;
+
+		loi = oinfo->oi_md->lsm_oinfo[i];
+		if (!lov_check_and_wait_active(lov, loi->loi_ost_idx)) {
+			CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx);
+			if (oinfo->oi_oa->o_valid & OBD_MD_FLEPOCH)
+				/* SOM requires all the OSTs to be active. */
+				GOTO(out_set, rc = -EIO);
+			continue;	/* otherwise just skip the stripe */
+		}
+
+		OBD_ALLOC(req, sizeof(*req));
+		if (req == NULL)
+			GOTO(out_set, rc = -ENOMEM);
+
+		req->rq_stripe = i;
+		req->rq_idx = loi->loi_ost_idx;
+		/* each sub-request gets a private copy of the caller's obdo */
+		OBDO_ALLOC(req->rq_oi.oi_oa);
+		if (req->rq_oi.oi_oa == NULL) {
+			OBD_FREE(req, sizeof(*req));
+			GOTO(out_set, rc = -ENOMEM);
+		}
+		memcpy(req->rq_oi.oi_oa, oinfo->oi_oa,
+		       sizeof(*req->rq_oi.oi_oa));
+		req->rq_oi.oi_oa->o_oi = loi->loi_oi;
+		req->rq_oi.oi_cb_up = cb_getattr_update;
+		req->rq_oi.oi_capa = oinfo->oi_capa;
+
+		lov_set_add_req(req, set);
+	}
+	if (!set->set_count)
+		GOTO(out_set, rc = -EIO);	/* no sub-request was built */
+	*reqset = set;
+	RETURN(rc);
+out_set:
+	lov_fini_getattr_set(set);
+	RETURN(rc);
+}
+
+int lov_fini_destroy_set(struct lov_request_set *set)
+{
+	ENTRY;
+	/* Finish a destroy set; always returns 0, a NULL set is a no-op. */
+	if (set == NULL)
+		RETURN(0);
+	LASSERT(set->set_exp);
+	if (atomic_read(&set->set_completes)) {	/* body is a placeholder */
+		/* FIXME update qos data here */
+	}
+
+	lov_put_reqset(set);
+
+	RETURN(0);
+}
+
+int lov_prep_destroy_set(struct obd_export *exp, struct obd_info *oinfo,
+			 struct obdo *src_oa, struct lov_stripe_md *lsm,
+			 struct obd_trans_info *oti,
+			 struct lov_request_set **reqset)
+{
+	struct lov_request_set *set;
+	struct lov_obd *lov = &exp->exp_obd->u.lov;
+	int rc = 0, i;
+	ENTRY;
+	/* Build one destroy sub-request per active stripe of lsm. */
+	OBD_ALLOC(set, sizeof(*set));
+	if (set == NULL)
+		RETURN(-ENOMEM);
+	lov_init_set(set);
+
+	set->set_exp = exp;
+	set->set_oi = oinfo;
+	set->set_oi->oi_md = lsm;	/* overwrites the caller's oinfo->oi_md */
+	set->set_oi->oi_oa = src_oa;	/* likewise oinfo->oi_oa */
+	set->set_oti = oti;
+	if (oti != NULL && src_oa->o_valid & OBD_MD_FLCOOKIE)
+		set->set_cookies = oti->oti_logcookies;
+	/* inactive targets are skipped rather than failing the destroy */
+	for (i = 0; i < lsm->lsm_stripe_count; i++) {
+		struct lov_oinfo *loi;
+		struct lov_request *req;
+
+		loi = lsm->lsm_oinfo[i];
+		if (!lov_check_and_wait_active(lov, loi->loi_ost_idx)) {
+			CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx);
+			continue;
+		}
+
+		OBD_ALLOC(req, sizeof(*req));
+		if (req == NULL)
+			GOTO(out_set, rc = -ENOMEM);
+
+		req->rq_stripe = i;
+		req->rq_idx = loi->loi_ost_idx;
+		/* each sub-request gets a private copy of src_oa */
+		OBDO_ALLOC(req->rq_oi.oi_oa);
+		if (req->rq_oi.oi_oa == NULL) {
+			OBD_FREE(req, sizeof(*req));
+			GOTO(out_set, rc = -ENOMEM);
+		}
+		memcpy(req->rq_oi.oi_oa, src_oa, sizeof(*req->rq_oi.oi_oa));
+		req->rq_oi.oi_oa->o_oi = loi->loi_oi;
+		lov_set_add_req(req, set);
+	}
+	if (!set->set_count)
+		GOTO(out_set, rc = -EIO);	/* no sub-request was built */
+	*reqset = set;
+	RETURN(rc);
+out_set:
+	lov_fini_destroy_set(set);
+	RETURN(rc);
+}
+
+int lov_fini_setattr_set(struct lov_request_set *set)
+{
+	int rc = 0;
+	ENTRY;
+	/* Finish a setattr set; a NULL set is a no-op that returns 0. */
+	if (set == NULL)
+		RETURN(0);
+	LASSERT(set->set_exp);
+	if (atomic_read(&set->set_completes)) {	/* non-zero: merge the attrs */
+		rc = common_attr_done(set);
+		/* FIXME update qos data here */
+	}
+
+	lov_put_reqset(set);
+	RETURN(rc);
+}
+
+int lov_update_setattr_set(struct lov_request_set *set,
+			   struct lov_request *req, int rc)
+{
+	struct lov_obd *lov = &req->rq_rqset->set_exp->exp_obd->u.lov;
+	struct lov_stripe_md *lsm = req->rq_rqset->set_oi->oi_md;
+	ENTRY;
+	/* Record one setattr reply in the set and refresh the stripe's times. */
+	lov_update_set(set, req, rc);	/* sees rc before it is graced */
+
+	/* an error from a missing or inactive target is forgiven */
+	if (rc && !(lov->lov_tgts[req->rq_idx] &&
+		    lov->lov_tgts[req->rq_idx]->ltd_active))
+		rc = 0;
+	/* copy the times the obdo flags as valid into the stripe's loi_lvb */
+	if (rc == 0) {
+		if (req->rq_oi.oi_oa->o_valid & OBD_MD_FLCTIME)
+			lsm->lsm_oinfo[req->rq_stripe]->loi_lvb.lvb_ctime =
+				req->rq_oi.oi_oa->o_ctime;
+		if (req->rq_oi.oi_oa->o_valid & OBD_MD_FLMTIME)
+			lsm->lsm_oinfo[req->rq_stripe]->loi_lvb.lvb_mtime =
+				req->rq_oi.oi_oa->o_mtime;
+		if (req->rq_oi.oi_oa->o_valid & OBD_MD_FLATIME)
+			lsm->lsm_oinfo[req->rq_stripe]->loi_lvb.lvb_atime =
+				req->rq_oi.oi_oa->o_atime;
+	}
+
+	RETURN(rc);	/* 0 on success or graced error, else the reply's rc */
+}
+
+/* osc_setattr_async completion callback: maps the obd_info cookie back to
+ * its enclosing lov_request and passes rc to lov_update_setattr_set(). */
+static int cb_setattr_update(void *cookie, int rc)
+{
+	struct obd_info *sub_oinfo = cookie;
+	struct lov_request *sub_req =
+		container_of(sub_oinfo, struct lov_request, rq_oi);
+	return lov_update_setattr_set(sub_req->rq_rqset, sub_req, rc);
+}
+
+int lov_prep_setattr_set(struct obd_export *exp, struct obd_info *oinfo,
+			 struct obd_trans_info *oti,
+			 struct lov_request_set **reqset)
+{
+	struct lov_request_set *set;
+	struct lov_obd *lov = &exp->exp_obd->u.lov;
+	int rc = 0, i;
+	ENTRY;
+	/* Build one setattr sub-request per active stripe of oinfo->oi_md. */
+	OBD_ALLOC(set, sizeof(*set));
+	if (set == NULL)
+		RETURN(-ENOMEM);
+	lov_init_set(set);
+
+	set->set_exp = exp;
+	set->set_oti = oti;
+	set->set_oi = oinfo;
+	if (oti != NULL && oinfo->oi_oa->o_valid & OBD_MD_FLCOOKIE)
+		set->set_cookies = oti->oti_logcookies;
+
+	for (i = 0; i < oinfo->oi_md->lsm_stripe_count; i++) {
+		struct lov_oinfo *loi = oinfo->oi_md->lsm_oinfo[i];
+		struct lov_request *req;
+		/* inactive targets are skipped rather than failing the call */
+		if (!lov_check_and_wait_active(lov, loi->loi_ost_idx)) {
+			CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx);
+			continue;
+		}
+
+		OBD_ALLOC(req, sizeof(*req));
+		if (req == NULL)
+			GOTO(out_set, rc = -ENOMEM);
+		req->rq_stripe = i;
+		req->rq_idx = loi->loi_ost_idx;
+		/* each sub-request gets a private copy of the caller's obdo */
+		OBDO_ALLOC(req->rq_oi.oi_oa);
+		if (req->rq_oi.oi_oa == NULL) {
+			OBD_FREE(req, sizeof(*req));
+			GOTO(out_set, rc = -ENOMEM);
+		}
+		memcpy(req->rq_oi.oi_oa, oinfo->oi_oa,
+		       sizeof(*req->rq_oi.oi_oa));
+		req->rq_oi.oi_oa->o_oi = loi->loi_oi;
+		req->rq_oi.oi_oa->o_stripe_idx = i;
+		req->rq_oi.oi_cb_up = cb_setattr_update;
+		req->rq_oi.oi_capa = oinfo->oi_capa;
+		/* size change: lov_stripe_offset() rewrites the copy's o_size */
+		if (oinfo->oi_oa->o_valid & OBD_MD_FLSIZE) {
+			int off = lov_stripe_offset(oinfo->oi_md,
+						    oinfo->oi_oa->o_size, i,
+						    &req->rq_oi.oi_oa->o_size);
+
+			if (off < 0 && req->rq_oi.oi_oa->o_size)
+				req->rq_oi.oi_oa->o_size--;
+
+			CDEBUG(D_INODE, "stripe %d has size "LPU64"/"LPU64"\n",
+			       i, req->rq_oi.oi_oa->o_size,
+			       oinfo->oi_oa->o_size);
+		}
+		lov_set_add_req(req, set);
+	}
+	if (!set->set_count)
+		GOTO(out_set, rc = -EIO);	/* no sub-request was built */
+	*reqset = set;
+	RETURN(rc);
+out_set:
+	lov_fini_setattr_set(set);
+	RETURN(rc);
+}
+
+int lov_fini_punch_set(struct lov_request_set *set)
+{
+	int rc = 0;
+	ENTRY;
+	/* Finish a punch set; a NULL set is a no-op that returns 0. */
+	if (set == NULL)
+		RETURN(0);
+	LASSERT(set->set_exp);
+	if (atomic_read(&set->set_completes)) {
+		rc = -EIO;	/* kept unless set_success is non-zero */
+		/* FIXME update qos data here */
+		if (atomic_read(&set->set_success))
+			rc = common_attr_done(set);
+	}
+
+	lov_put_reqset(set);
+
+	RETURN(rc);
+}
+
+int lov_update_punch_set(struct lov_request_set *set,
+			 struct lov_request *req, int rc)
+{
+	struct lov_obd *lov = &req->rq_rqset->set_exp->exp_obd->u.lov;
+	struct lov_stripe_md *lsm = req->rq_rqset->set_oi->oi_md;
+	ENTRY;
+	/* Record one punch reply in the set and refresh the stripe's blocks. */
+	lov_update_set(set, req, rc);	/* sees rc before it is graced */
+
+	/* grace error on a missing or inactive ost (the slot may be NULL) */
+	if (rc && !(lov->lov_tgts[req->rq_idx] &&
+		    lov->lov_tgts[req->rq_idx]->ltd_active))
+		rc = 0;
+
+	if (rc == 0) {
+		lov_stripe_lock(lsm);
+		if (req->rq_oi.oi_oa->o_valid & OBD_MD_FLBLOCKS) {
+			lsm->lsm_oinfo[req->rq_stripe]->loi_lvb.lvb_blocks =
+				req->rq_oi.oi_oa->o_blocks;
+		}
+		lov_stripe_unlock(lsm);
+	}
+
+	RETURN(rc);	/* 0 on success or graced error, else the reply's rc */
+}
+
+/* osc_punch completion callback: maps the obd_info cookie back to its
+ * enclosing lov_request and passes rc to lov_update_punch_set(). */
+static int cb_update_punch(void *cookie, int rc)
+{
+	struct obd_info *sub_oinfo = cookie;
+	struct lov_request *sub_req =
+		container_of(sub_oinfo, struct lov_request, rq_oi);
+	return lov_update_punch_set(sub_req->rq_rqset, sub_req, rc);
+}
+
+int lov_prep_punch_set(struct obd_export *exp, struct obd_info *oinfo,
+		       struct obd_trans_info *oti,
+		       struct lov_request_set **reqset)
+{
+	struct lov_request_set *set;
+	struct lov_obd *lov = &exp->exp_obd->u.lov;
+	int rc = 0, i;
+	ENTRY;
+	/* Build one punch sub-request per stripe the policy extent touches. */
+	OBD_ALLOC(set, sizeof(*set));
+	if (set == NULL)
+		RETURN(-ENOMEM);
+	lov_init_set(set);
+
+	set->set_oi = oinfo;
+	set->set_exp = exp;
+
+	for (i = 0; i < oinfo->oi_md->lsm_stripe_count; i++) {
+		struct lov_oinfo *loi = oinfo->oi_md->lsm_oinfo[i];
+		struct lov_request *req;
+		obd_off rs, re;
+
+		if (!lov_stripe_intersects(oinfo->oi_md, i,
+					   oinfo->oi_policy.l_extent.start,
+					   oinfo->oi_policy.l_extent.end,
+					   &rs, &re))
+			continue;
+		/* an inactive target fails the whole punch with -EIO */
+		if (!lov_check_and_wait_active(lov, loi->loi_ost_idx)) {
+			CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx);
+			GOTO(out_set, rc = -EIO);
+		}
+
+		OBD_ALLOC(req, sizeof(*req));
+		if (req == NULL)
+			GOTO(out_set, rc = -ENOMEM);
+		req->rq_stripe = i;
+		req->rq_idx = loi->loi_ost_idx;
+		/* each sub-request gets a private copy of the caller's obdo */
+		OBDO_ALLOC(req->rq_oi.oi_oa);
+		if (req->rq_oi.oi_oa == NULL) {
+			OBD_FREE(req, sizeof(*req));
+			GOTO(out_set, rc = -ENOMEM);
+		}
+		memcpy(req->rq_oi.oi_oa, oinfo->oi_oa,
+		       sizeof(*req->rq_oi.oi_oa));
+		req->rq_oi.oi_oa->o_oi = loi->loi_oi;
+		req->rq_oi.oi_oa->o_valid |= OBD_MD_FLGROUP;
+
+		req->rq_oi.oi_oa->o_stripe_idx = i;
+		req->rq_oi.oi_cb_up = cb_update_punch;
+		/* extent clipped to this stripe; gid is set to -1 */
+		req->rq_oi.oi_policy.l_extent.start = rs;
+		req->rq_oi.oi_policy.l_extent.end = re;
+		req->rq_oi.oi_policy.l_extent.gid = -1;
+
+		req->rq_oi.oi_capa = oinfo->oi_capa;
+
+		lov_set_add_req(req, set);
+	}
+	if (!set->set_count)
+		GOTO(out_set, rc = -EIO);	/* no sub-request was built */
+	*reqset = set;
+	RETURN(rc);
+out_set:
+	lov_fini_punch_set(set);
+	RETURN(rc);
+}
+
+int lov_fini_sync_set(struct lov_request_set *set)
+{
+	int rc = 0;
+	ENTRY;
+	/* Finish a sync set; a NULL set is a no-op that returns 0. */
+	if (set == NULL)
+		RETURN(0);
+	LASSERT(set->set_exp);
+	if (atomic_read(&set->set_completes)) {
+		if (!atomic_read(&set->set_success))
+			rc = -EIO;	/* set_success is 0 */
+		/* FIXME update qos data here */
+	}
+
+	lov_put_reqset(set);
+
+	RETURN(rc);
+}
+
+/* osc_sync completion callback: maps the obd_info cookie back to its
+ * enclosing lov_request and passes rc to lov_update_common_set(). */
+static int cb_sync_update(void *cookie, int rc)
+{
+	struct obd_info *sub_oinfo = cookie;
+	struct lov_request *sub_req =
+		container_of(sub_oinfo, struct lov_request, rq_oi);
+
+	return lov_update_common_set(sub_req->rq_rqset, sub_req, rc);
+}
+
+int lov_prep_sync_set(struct obd_export *exp, struct obd_info *oinfo,
+		      obd_off start, obd_off end,
+		      struct lov_request_set **reqset)
+{
+	struct lov_request_set *set;
+	struct lov_obd *lov = &exp->exp_obd->u.lov;
+	int rc = 0, i;
+	ENTRY;
+	/* Build a sync sub-request per active stripe touching [start, end]. */
+	OBD_ALLOC_PTR(set);
+	if (set == NULL)
+		RETURN(-ENOMEM);
+	lov_init_set(set);
+
+	set->set_exp = exp;
+	set->set_oi = oinfo;
+
+	for (i = 0; i < oinfo->oi_md->lsm_stripe_count; i++) {
+		struct lov_oinfo *loi = oinfo->oi_md->lsm_oinfo[i];
+		struct lov_request *req;
+		obd_off rs, re;
+		/* inactive targets are skipped rather than failing the sync */
+		if (!lov_check_and_wait_active(lov, loi->loi_ost_idx)) {
+			CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx);
+			continue;
+		}
+
+		if (!lov_stripe_intersects(oinfo->oi_md, i, start, end, &rs,
+					   &re))
+			continue;
+
+		OBD_ALLOC_PTR(req);
+		if (req == NULL)
+			GOTO(out_set, rc = -ENOMEM);
+		req->rq_stripe = i;
+		req->rq_idx = loi->loi_ost_idx;
+		/* each sub-request gets a private copy of the caller's obdo */
+		OBDO_ALLOC(req->rq_oi.oi_oa);
+		if (req->rq_oi.oi_oa == NULL) {
+			OBD_FREE(req, sizeof(*req));
+			GOTO(out_set, rc = -ENOMEM);
+		}
+		*req->rq_oi.oi_oa = *oinfo->oi_oa;
+		req->rq_oi.oi_oa->o_oi = loi->loi_oi;
+		req->rq_oi.oi_oa->o_stripe_idx = i;
+		/* extent clipped to this stripe; gid is set to -1 */
+		req->rq_oi.oi_policy.l_extent.start = rs;
+		req->rq_oi.oi_policy.l_extent.end = re;
+		req->rq_oi.oi_policy.l_extent.gid = -1;
+		req->rq_oi.oi_cb_up = cb_sync_update;
+
+		lov_set_add_req(req, set);
+	}
+	if (!set->set_count)
+		GOTO(out_set, rc = -EIO);	/* no sub-request was built */
+	*reqset = set;
+	RETURN(rc);
+out_set:
+	lov_fini_sync_set(set);
+	RETURN(rc);
+}
+
+#define LOV_U64_MAX ((__u64)~0ULL)	/* all-ones __u64; saturation value */
+#define LOV_SUM_MAX(tot, add)					   \
+	do {	/* tot += add, clamped to LOV_U64_MAX on wrap-around */ \
+		if ((tot) + (add) < (tot))			      \
+			(tot) = LOV_U64_MAX;			    \
+		else						    \
+			(tot) += (add);				 \
+	} while(0)
+
+int lov_fini_statfs(struct obd_device *obd, struct obd_statfs *osfs,int success)
+{
+	ENTRY;
+	/* If success != 0, adjust osfs and store it in obd; otherwise -EIO. */
+	if (success) {
+		__u32 expected_stripes = lov_get_stripecnt(&obd->u.lov,
+							   LOV_MAGIC, 0);
+		if (osfs->os_files != LOV_U64_MAX)	/* saturated: leave as is */
+			lov_do_div64(osfs->os_files, expected_stripes);
+		if (osfs->os_ffree != LOV_U64_MAX)
+			lov_do_div64(osfs->os_ffree, expected_stripes);
+		/* store the result and its timestamp under obd_osfs_lock */
+		spin_lock(&obd->obd_osfs_lock);
+		memcpy(&obd->obd_osfs, osfs, sizeof(*osfs));
+		obd->obd_osfs_age = cfs_time_current_64();
+		spin_unlock(&obd->obd_osfs_lock);
+		RETURN(0);
+	}
+
+	RETURN(-EIO);
+}
+
+int lov_fini_statfs_set(struct lov_request_set *set)
+{
+	int rc = 0;
+	ENTRY;
+	/* Finish a statfs set; a NULL set is a no-op that returns 0. */
+	if (set == NULL)
+		RETURN(0);
+	/* if set_completes is non-zero, hand the totals to lov_fini_statfs() */
+	if (atomic_read(&set->set_completes)) {
+		rc = lov_fini_statfs(set->set_obd, set->set_oi->oi_osfs,
+				     atomic_read(&set->set_success));
+	}
+	lov_put_reqset(set);
+	RETURN(rc);
+}
+
+void lov_update_statfs(struct obd_statfs *osfs, struct obd_statfs *lov_sfs,
+		       int success)
+{
+	int shift = 0, quit = 0;
+	__u64 tmp;
+	/* Fold one target's statfs (lov_sfs) into the running total (osfs). */
+	if (success == 0) {
+		memcpy(osfs, lov_sfs, sizeof(*lov_sfs));	/* plain copy */
+	} else {
+		if (osfs->os_bsize != lov_sfs->os_bsize) {
+			/* assumes both sizes are distinct powers of 2, so tmp */
+			/* has two set bits; shift ends as their distance */
+			tmp = osfs->os_bsize | lov_sfs->os_bsize;
+			for (shift = 0; shift <= 64; ++shift) {
+				if (tmp & 1) {
+					if (quit)
+						break;
+					else
+						quit = 1;
+					shift = 0;
+				}
+				tmp >>= 1;
+			}
+		}
+		/* convert the smaller-block counts to the larger block size */
+		if (osfs->os_bsize < lov_sfs->os_bsize) {
+			osfs->os_bsize = lov_sfs->os_bsize;
+
+			osfs->os_bfree  >>= shift;
+			osfs->os_bavail >>= shift;
+			osfs->os_blocks >>= shift;
+		} else if (shift != 0) {
+			lov_sfs->os_bfree  >>= shift;
+			lov_sfs->os_bavail >>= shift;
+			lov_sfs->os_blocks >>= shift;
+		}
+		osfs->os_bfree += lov_sfs->os_bfree;
+		osfs->os_bavail += lov_sfs->os_bavail;
+		osfs->os_blocks += lov_sfs->os_blocks;
+		/* XXX not sure about this one - depends on policy.
+		 *   - could be minimum if we always stripe on all OBDs
+		 *     (but that would be wrong for any other policy,
+		 *     if one of the OBDs has no more objects left)
+		 *   - could be sum if we stripe whole objects
+		 *   - could be average, just to give a nice number
+		 *
+		 * To give a "reasonable" (if not wholly accurate)
+		 * number, we divide the total number of free objects
+		 * by expected stripe count (watch out for overflow).
+		 */
+		LOV_SUM_MAX(osfs->os_files, lov_sfs->os_files);
+		LOV_SUM_MAX(osfs->os_ffree, lov_sfs->os_ffree);
+	}
+}
+
+/* The callback for osc_statfs_async that finalizes a request info when a
+ * response is received. */
+static int cb_statfs_update(void *cookie, int rc)
+{
+	struct obd_info *oinfo = cookie;
+	struct lov_request *lovreq;
+	struct lov_request_set *set;
+	struct obd_statfs *osfs, *lov_sfs;
+	struct lov_obd *lov;
+	struct lov_tgt_desc *tgt;
+	struct obd_device *lovobd, *tgtobd;
+	int success;
+	ENTRY;
+	/* osfs is the set-wide total, lov_sfs this request's own oi_osfs */
+	lovreq = container_of(oinfo, struct lov_request, rq_oi);
+	set = lovreq->rq_rqset;
+	lovobd = set->set_obd;
+	lov = &lovobd->u.lov;
+	osfs = set->set_oi->oi_osfs;
+	lov_sfs = oinfo->oi_osfs;
+	success = atomic_read(&set->set_success);	/* read before the update */
+	/* XXX: the same is done in lov_update_common_set, however
+	   lovset->set_exp is not initialized. */
+	lov_update_set(set, lovreq, rc);
+	if (rc)
+		GOTO(out, rc);
+	/* obd_getref()/obd_putref() bracket the lov_tgts[] access below */
+	obd_getref(lovobd);
+	tgt = lov->lov_tgts[lovreq->rq_idx];
+	if (!tgt || !tgt->ltd_active)
+		GOTO(out_update, rc);
+	/* copy the reply into the target obd's obd_osfs under its lock */
+	tgtobd = class_exp2obd(tgt->ltd_exp);
+	spin_lock(&tgtobd->obd_osfs_lock);
+	memcpy(&tgtobd->obd_osfs, lov_sfs, sizeof(*lov_sfs));
+	if ((oinfo->oi_flags & OBD_STATFS_FROM_CACHE) == 0)
+		tgtobd->obd_osfs_age = cfs_time_current_64();
+	spin_unlock(&tgtobd->obd_osfs_lock);
+
+out_update:
+	lov_update_statfs(osfs, lov_sfs, success);	/* success == 0: copy */
+	obd_putref(lovobd);
+
+out:
+	if (set->set_oi->oi_flags & OBD_STATFS_PTLRPCD &&
+	    lov_set_finished(set, 0)) {
+		lov_statfs_interpret(NULL, set, set->set_count !=
+				     atomic_read(&set->set_success));
+	}
+
+	RETURN(0);	/* rc is not propagated to the caller */
+}
+
+int lov_prep_statfs_set(struct obd_device *obd, struct obd_info *oinfo,
+			struct lov_request_set **reqset)
+{
+	struct lov_request_set *set;
+	struct lov_obd *lov = &obd->u.lov;
+	int rc = 0, i;
+	ENTRY;
+	/* Build one statfs sub-request per usable entry of lov->lov_tgts[]. */
+	OBD_ALLOC(set, sizeof(*set));
+	if (set == NULL)
+		RETURN(-ENOMEM);
+	lov_init_set(set);
+
+	set->set_obd = obd;	/* statfs sets record the obd, not an export */
+	set->set_oi = oinfo;
+
+	/* We only get block data from the OBD */
+	for (i = 0; i < lov->desc.ld_tgt_count; i++) {
+		struct lov_request *req;
+		/* skip empty slots, and inactive targets under NODELAY */
+		if (lov->lov_tgts[i] == NULL ||
+		    (!lov_check_and_wait_active(lov, i) &&
+		     (oinfo->oi_flags & OBD_STATFS_NODELAY))) {
+			CDEBUG(D_HA, "lov idx %d inactive\n", i);
+			continue;
+		}
+
+		/* skip targets that have been explicitly disabled by the
+		 * administrator */
+		if (!lov->lov_tgts[i]->ltd_exp) {
+			CDEBUG(D_HA, "lov idx %d administratively disabled\n", i);
+			continue;
+		}
+
+		OBD_ALLOC(req, sizeof(*req));
+		if (req == NULL)
+			GOTO(out_set, rc = -ENOMEM);
+		/* each sub-request owns its own obd_statfs buffer */
+		OBD_ALLOC(req->rq_oi.oi_osfs, sizeof(*req->rq_oi.oi_osfs));
+		if (req->rq_oi.oi_osfs == NULL) {
+			OBD_FREE(req, sizeof(*req));
+			GOTO(out_set, rc = -ENOMEM);
+		}
+
+		req->rq_idx = i;
+		req->rq_oi.oi_cb_up = cb_statfs_update;
+		req->rq_oi.oi_flags = oinfo->oi_flags;
+
+		lov_set_add_req(req, set);
+	}
+	if (!set->set_count)
+		GOTO(out_set, rc = -EIO);	/* no sub-request was built */
+	*reqset = set;
+	RETURN(rc);
+out_set:
+	lov_fini_statfs_set(set);
+	RETURN(rc);
+}

diff --git a/drivers/staging/lustre/lustre/lov/lovsub_dev.c b/drivers/staging/lustre/lustre/lov/lovsub_dev.c
new file mode 100644
index 0000000..204ecd0
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lov/lovsub_dev.c

@@ -0,0 +1,211 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_device and cl_device_type for LOVSUB layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include "lov_cl_internal.h"
+
+/** \addtogroup lov
+ *  @{
+ */
+
+/*****************************************************************************
+ *
+ * Lovsub transfer operations.
+ *
+ */
+
+/* Completion hook: frees the lovsub request slice; @ioret is ignored. */
+static void lovsub_req_completion(const struct lu_env *env,
+				  const struct cl_req_slice *slice, int ioret)
+{
+	struct lovsub_req *req_slice = cl2lovsub_req(slice);
+
+	ENTRY;
+	OBD_SLAB_FREE_PTR(req_slice, lovsub_req_kmem);
+	EXIT;
+}
+
+/**
+ * Implementation of struct cl_req_operations::cro_attr_set() for the lovsub
+ * layer. The only field lov and lovsub are responsible for is struct
+ * obdo::o_stripe_idx, which is filled here from the index of sub-object
+ * @obj. Neither @slice nor @flags is consulted.
+ */
+static void lovsub_req_attr_set(const struct lu_env *env,
+				const struct cl_req_slice *slice,
+				const struct cl_object *obj,
+				struct cl_req_attr *attr, obd_valid flags)
+{
+	struct lovsub_object *sub = cl2lovsub(obj);
+
+	ENTRY;
+	/*
+	 * obdo::o_stripe_idx has no OBD_MD_* flag of its own, hence the
+	 * unconditional store. The value never changes anyway.
+	 */
+	attr->cra_oa->o_stripe_idx = sub->lso_index;
+	EXIT;
+}
+
+static const struct cl_req_operations lovsub_req_ops = {
+	.cro_completion = lovsub_req_completion, /* frees the slice */
+	.cro_attr_set   = lovsub_req_attr_set    /* sets o_stripe_idx */
+};
+
+/*****************************************************************************
+ *
+ * Lov-sub device and device type functions.
+ *
+ */
+
+/* Bring up @next beneath @d; on success keep a counted ref in acid_next. */
+static int lovsub_device_init(const struct lu_env *env, struct lu_device *d,
+			      const char *name, struct lu_device *next)
+{
+	struct lovsub_device  *self = lu2lovsub_dev(d);
+	struct lu_device_type *type;
+	int err;
+
+	ENTRY;
+	next->ld_site = d->ld_site;
+	type = next->ld_type;
+	LASSERT(type != NULL);
+	err = type->ldt_ops->ldto_device_init(env, next, type->ldt_name, NULL);
+	if (err != 0) {
+		next->ld_site = NULL; /* undo the site assignment above */
+		RETURN(err);
+	}
+	lu_device_get(next);
+	lu_ref_add(&next->ld_reference, "lu-stack", &lu_site_init);
+	self->acid_next = lu2cl_dev(next);
+	RETURN(0);
+}
+
+/* Clears acid_super and acid_next of @d and returns what acid_next held,
+ * converted to an lu_device. */
+static struct lu_device *lovsub_device_fini(const struct lu_env *env,
+					    struct lu_device *d)
+{
+	struct lovsub_device *self  = lu2lovsub_dev(d);
+	struct lu_device     *below = cl2lu_dev(self->acid_next);
+
+	ENTRY;
+	self->acid_next  = NULL;
+	self->acid_super = NULL;
+	RETURN(below);
+}
+
+static struct lu_device *lovsub_device_free(const struct lu_env *env,
+					    struct lu_device *d)
+{
+	struct lovsub_device *self  = lu2lovsub_dev(d);
+	struct lu_device     *below = cl2lu_dev(self->acid_next);
+
+	cl_device_fini(lu2cl_dev(d)); /* finalize first, then release */
+	OBD_FREE_PTR(self); /* acid_next was read above, before the free */
+	return below;
+}
+
+/* cdo_req_init(): allocate a lovsub slice and attach it to @req.
+ * Returns 0, or -ENOMEM when the slab allocation fails. */
+static int lovsub_req_init(const struct lu_env *env, struct cl_device *dev,
+			   struct cl_req *req)
+{
+	struct lovsub_req *req_slice;
+
+	OBD_SLAB_ALLOC_PTR_GFP(req_slice, lovsub_req_kmem, __GFP_IO);
+	if (req_slice == NULL)
+		return -ENOMEM;
+
+	cl_req_slice_add(req, &req_slice->lsrq_cl, dev, &lovsub_req_ops);
+	return 0;
+}
+
+static const struct lu_device_operations lovsub_lu_ops = {
+	/* Object allocation is the only method this layer provides. */
+	.ldo_object_alloc = lovsub_object_alloc
+	/* ldo_process_config and ldo_recovery_complete stay NULL. */
+};
+
+static const struct cl_device_operations lovsub_cl_ops = {
+	.cdo_req_init = lovsub_req_init, /* adds a lovsub slice to a cl_req */
+};
+
+/* Allocate a lovsub device of type @t (@cfg is unused). Returns its lu_device
+ * or an ERR_PTR(); the device is freed here when cl_device_init() fails. */
+static struct lu_device *lovsub_device_alloc(const struct lu_env *env,
+					     struct lu_device_type *t,
+					     struct lustre_cfg *cfg)
+{
+	struct lovsub_device *lsd;
+	struct lu_device     *d;
+	int		      result;
+
+	OBD_ALLOC_PTR(lsd);
+	if (lsd == NULL)
+		return ERR_PTR(-ENOMEM);
+	result = cl_device_init(&lsd->acid_cl, t);
+	if (result != 0) {
+		OBD_FREE_PTR(lsd); /* the caller only gets the error code */
+		return ERR_PTR(result);
+	}
+	d = lovsub2lu_dev(lsd);
+	d->ld_ops = &lovsub_lu_ops;
+	lsd->acid_cl.cd_ops = &lovsub_cl_ops;
+	return d;
+}
+
+static const struct lu_device_type_operations lovsub_device_type_ops = {
+	.ldto_device_init  = lovsub_device_init,
+	.ldto_device_fini  = lovsub_device_fini,
+	/* allocation and release of struct lovsub_device itself */
+	.ldto_device_alloc = lovsub_device_alloc,
+	.ldto_device_free  = lovsub_device_free
+};
+
+#define LUSTRE_LOVSUB_NAME	 "lovsub"
+
+struct lu_device_type lovsub_device_type = { /* lovsub layer device type */
+	.ldt_name     = LUSTRE_LOVSUB_NAME,
+	.ldt_tags     = LU_DEVICE_CL,
+	.ldt_ctx_tags = LCT_CL_THREAD,
+	.ldt_ops      = &lovsub_device_type_ops
+};
+
+
+/** @} lov */

diff --git a/drivers/staging/lustre/lustre/lov/lovsub_io.c b/drivers/staging/lustre/lustre/lov/lovsub_io.c
new file mode 100644
index 0000000..783ec68
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lov/lovsub_io.c

@@ -0,0 +1,55 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_io for LOVSUB layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include "lov_cl_internal.h"
+
+/** \addtogroup lov
+ *  @{
+ */
+
+/*****************************************************************************
+ *
+ * Lovsub io operations.
+ *
+ */
+
+/* All trivial */
+
+/** @} lov */

diff --git a/drivers/staging/lustre/lustre/lov/lovsub_lock.c b/drivers/staging/lustre/lustre/lov/lovsub_lock.c
new file mode 100644
index 0000000..03bab17
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lov/lovsub_lock.c

@@ -0,0 +1,485 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_lock for LOVSUB layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include "lov_cl_internal.h"
+
+/** \addtogroup lov
+ *  @{
+ */
+
+/*****************************************************************************
+ *
+ * Lovsub lock operations.
+ *
+ */
+
+/* clo_fini(): free the sub-lock slice; it must have no parent links left. */
+static void lovsub_lock_fini(const struct lu_env *env,
+			     struct cl_lock_slice *slice)
+{
+	struct lovsub_lock *sublock = cl2lovsub_lock(slice);
+
+	ENTRY;
+	LASSERT(list_empty(&sublock->lss_parents));
+	OBD_SLAB_FREE_PTR(sublock, lovsub_lock_kmem);
+	EXIT;
+}
+
+/* Pin the top-lock of @lov with a reference, then take its mutex. */
+static void lovsub_parent_lock(const struct lu_env *env, struct lov_lock *lov)
+{
+	struct cl_lock *top = lov->lls_cl.cls_lock;
+
+	ENTRY;
+	cl_lock_get(top);
+	lu_ref_add(&top->cll_reference, "lovsub-parent", current);
+	cl_lock_mutex_get(env, top);
+	EXIT;
+}
+
+/* Undo lovsub_parent_lock(): drop the mutex first, then the reference. */
+static void lovsub_parent_unlock(const struct lu_env *env, struct lov_lock *lov)
+{
+	struct cl_lock *top = lov->lls_cl.cls_lock;
+
+	ENTRY;
+	cl_lock_mutex_put(env, top);
+	lu_ref_del(&top->cll_reference, "lovsub-parent", current);
+	cl_lock_put(env, top);
+	EXIT;
+}
+
+/**
+ * cl_lock_operations::clo_state() for the lovsub layer. Invoked on every
+ * sub-lock state change; signals each top-lock linked to the sub-lock, except
+ * the one recorded in lss_active. The new @state itself is not examined.
+ */
+static void lovsub_lock_state(const struct lu_env *env,
+			      const struct cl_lock_slice *slice,
+			      enum cl_lock_state state)
+{
+	struct lovsub_lock   *sublock = cl2lovsub_lock(slice);
+	struct lov_lock_link *link;
+
+	LASSERT(cl_lock_is_mutexed(slice->cls_lock));
+	ENTRY;
+
+	list_for_each_entry(link, &sublock->lss_parents, lll_list) {
+		struct lov_lock *top = link->lll_super;
+
+		if (top->lls_cl.cls_lock == sublock->lss_active)
+			continue;
+
+		lovsub_parent_lock(env, top);
+		cl_lock_signal(env, top->lls_cl.cls_lock);
+		lovsub_parent_unlock(env, top);
+	}
+	EXIT;
+}
+
+/**
+ * Implementation of cl_lock_operations::clo_weigh(): the weight of a sub-lock
+ * is whatever its first parent top-lock reports, or 0 without any parent.
+ */
+static unsigned long lovsub_lock_weigh(const struct lu_env *env,
+				       const struct cl_lock_slice *slice)
+{
+	struct lovsub_lock *sublock = cl2lovsub_lock(slice);
+	struct lov_lock    *top;
+	unsigned long       weight;
+
+	ENTRY;
+
+	LASSERT(cl_lock_is_mutexed(slice->cls_lock));
+
+	if (list_empty(&sublock->lss_parents))
+		RETURN(0);
+
+	/*
+	 * Whether every parent should be asked and the estimates summed, or
+	 * whether asking one suffices, is not clear. For the current usages
+	 * one is always enough, so only the first link is consulted.
+	 */
+	top = container_of(sublock->lss_parents.next,
+			   struct lov_lock_link, lll_list)->lll_super;
+
+	lovsub_parent_lock(env, top);
+	weight = cl_lock_weigh(env, top->lls_cl.cls_lock);
+	lovsub_parent_unlock(env, top);
+
+	RETURN(weight);
+}
+
+/**
+ * Maps start/end offsets within stripe @stripe of @lov (taken from @in) to
+ * offsets within the file; only cld_start and cld_end of @out are written.
+ */
+static void lovsub_lock_descr_map(const struct cl_lock_descr *in,
+				  struct lov_object *lov,
+				  int stripe, struct cl_lock_descr *out)
+{
+	pgoff_t first = in->cld_start;
+	pgoff_t last  = in->cld_end;
+	pgoff_t pages; /* stripe size in pages */
+	pgoff_t gap;   /* how many pages in every stripe are occupied by
+			* "other" stripes */
+
+	ENTRY;
+
+	if (lov->lo_lsm->lsm_stripe_count > 1) {
+		pages = cl_index(lov2cl(lov), lov->lo_lsm->lsm_stripe_size);
+		gap   = (lov->lo_lsm->lsm_stripe_count - 1) * pages;
+
+		/* XXX overflow check here? */
+		first += first / pages * gap + stripe * pages;
+
+		if (last != CL_PAGE_EOF) {
+			last += last / pages * gap + stripe * pages;
+			/*
+			 * On wrap-around the end is clamped to CL_PAGE_EOF.
+			 */
+			if (last < in->cld_end)
+				last = CL_PAGE_EOF;
+		}
+	}
+
+	out->cld_start = first;
+	out->cld_end   = last;
+	EXIT;
+}
+
+/**
+ * Adjusts parent lock extent when a sub-lock is attached to a parent, from:
+ *
+ *     - the receive call-back, when the server returns the granted extent to
+ *       the client, and
+ *
+ *     - a top-lock that finds an existing sub-lock in the cache.
+ *
+ * Lock mode is not propagated to the parent: if a CLM_READ top-lock matches
+ * a CLM_WRITE sub-lock, the top-lock is still CLM_READ.
+ *
+ * @d is also stored as lls_sub[@idx].sub_got. Returns 0 when the parent
+ * extent already matches, otherwise the result of cl_lock_modify().
+ */
+int lov_sublock_modify(const struct lu_env *env, struct lov_lock *lov,
+		       struct lovsub_lock *sublock,
+		       const struct cl_lock_descr *d, int idx)
+{
+	struct cl_lock       *top       = lov->lls_cl.cls_lock;
+	struct cl_lock_descr *top_descr = &top->cll_descr;
+	struct cl_lock_descr *mapped;
+	struct lovsub_object *subobj;
+
+	LASSERT(cl_lock_mode_match(d->cld_mode, top_descr->cld_mode));
+
+	subobj = cl2lovsub(sublock->lss_cl.cls_obj);
+	mapped = &lov_env_info(env)->lti_ldescr;
+
+	/* Object, mode and gid come from the parent; the extent is mapped. */
+	mapped->cld_obj  = top_descr->cld_obj;
+	mapped->cld_mode = top_descr->cld_mode;
+	mapped->cld_gid  = top_descr->cld_gid;
+	lovsub_lock_descr_map(d, subobj->lso_super, subobj->lso_index, mapped);
+	lov->lls_sub[idx].sub_got = *d;
+
+	/*
+	 * The top-lock only has to be told about the modification when its
+	 * description changes materially.
+	 */
+	if (cl_lock_ext_match(top_descr, mapped))
+		return 0;
+
+	return cl_lock_modify(env, top, mapped);
+}
+
+/* clo_modify(): apply @d to every parent; the first non-zero result wins. */
+static int lovsub_lock_modify(const struct lu_env *env,
+			      const struct cl_lock_slice *s,
+			      const struct cl_lock_descr *d)
+{
+	struct lovsub_lock   *sublock = cl2lovsub_lock(s);
+	struct lov_lock_link *link;
+	int first_err = 0;
+
+	ENTRY;
+	LASSERT(cl_lock_mode_match(d->cld_mode,
+				   s->cls_lock->cll_descr.cld_mode));
+	list_for_each_entry(link, &sublock->lss_parents, lll_list) {
+		struct lov_lock *top = link->lll_super;
+		int rc;
+
+		lovsub_parent_lock(env, top);
+		rc = lov_sublock_modify(env, top, sublock, d, link->lll_idx);
+		lovsub_parent_unlock(env, top);
+		if (first_err == 0)
+			first_err = rc;
+	}
+	RETURN(first_err);
+}
+
+/* clo_closure(): calls cl_lock_closure_build() for each parent top-lock and
+ * stops at the first non-zero result, which is then returned. */
+static int lovsub_lock_closure(const struct lu_env *env,
+			       const struct cl_lock_slice *slice,
+			       struct cl_lock_closure *closure)
+{
+	struct lovsub_lock   *sublock;
+	struct lov_lock_link *link;
+	int rc = 0;
+
+	LASSERT(cl_lock_is_mutexed(slice->cls_lock));
+	ENTRY;
+
+	sublock = cl2lovsub_lock(slice);
+	list_for_each_entry(link, &sublock->lss_parents, lll_list) {
+		struct cl_lock *top = link->lll_super->lls_cl.cls_lock;
+
+		rc = cl_lock_closure_build(env, top, closure);
+		if (rc != 0)
+			break;
+	}
+	RETURN(rc);
+}
+
+/**
+ * A helper function for lovsub_lock_delete() that deals with a given parent
+ * top-lock. Returns 1 if it released @child mutex to destroy that lock, or 0.
+ */
+static int lovsub_lock_delete_one(const struct lu_env *env,
+				  struct cl_lock *child, struct lov_lock *lov)
+{
+	struct cl_lock *parent;
+	int	     result;
+	ENTRY;
+
+	parent = lov->lls_cl.cls_lock;
+	if (parent->cll_error)
+		RETURN(0);
+
+	result = 0;
+	switch (parent->cll_state) {
+	case CLS_ENQUEUED:
+		/* See LU-1355 for the case that a glimpse lock is
+		 * interrupted by signal */
+		LASSERT(parent->cll_flags & CLF_CANCELLED);
+		break;
+	case CLS_QUEUING:
+	case CLS_FREEING:
+		cl_lock_signal(env, parent);
+		break;
+	case CLS_INTRANSIT:
+		/*
+		 * Here lies a problem: a sub-lock is canceled while top-lock
+		 * is being unlocked. Top-lock cannot be moved into CLS_NEW
+		 * state, because unlocking has to succeed eventually by
+		 * placing lock into CLS_CACHED (or failing it), see
+		 * cl_unuse_try(). Nor can top-lock be left in CLS_CACHED
+		 * state, because lov maintains an invariant that all
+		 * sub-locks exist in CLS_CACHED (this allows cached top-lock
+		 * to be reused immediately). Nor can we wait for top-lock
+		 * state to change, because this can be synchronous to the
+		 * current thread.
+		 *
+		 * We know for sure that lov_lock_unuse() will be called at
+		 * least one more time to finish un-using, so leave a mark on
+		 * the top-lock, that will be seen by the next call to
+		 * lov_lock_unuse().
+		 */
+		if (cl_lock_is_intransit(parent))
+			lov->lls_cancel_race = 1;
+		break;
+	case CLS_CACHED:
+		/*
+		 * if a sub-lock is canceled move its top-lock into CLS_NEW
+		 * state to preserve an invariant that a top-lock in
+		 * CLS_CACHED is immediately ready for re-use (i.e., has all
+		 * sub-locks), and so that next attempt to re-use the top-lock
+		 * enqueues missing sub-lock.
+		 */
+		cl_lock_state_set(env, parent, CLS_NEW);
+		/* fall through */
+	case CLS_NEW:
+		/*
+		 * if last sub-lock is canceled, destroy the top-lock (which
+		 * is now `empty') proactively.
+		 */
+		if (lov->lls_nr_filled == 0) {
+			/* ... but unfortunately, this cannot be done easily,
+			 * as cancellation of a top-lock might acquire mutexes
+			 * of its other sub-locks, violating lock ordering,
+			 * see cl_lock_{cancel,delete}() preconditions.
+			 *
+			 * To work around this, the mutex of this sub-lock is
+			 * released, top-lock is destroyed, and sub-lock mutex
+			 * acquired again. The list of parents has to be
+			 * re-scanned from the beginning after this.
+			 *
+			 * Only do this if no mutexes other than on @child and
+			 * @parent are held by the current thread.
+			 *
+			 * TODO: The lock model here is too complex, because
+			 * the lock may be canceled and deleted voluntarily:
+			 *    cl_lock_request
+			 *      -> osc_lock_enqueue_wait
+			 *        -> osc_lock_cancel_wait
+			 *          -> cl_lock_delete
+			 *            -> lovsub_lock_delete
+			 *              -> cl_lock_cancel/delete
+			 *                -> ...
+			 *
+			 * The better choice is to spawn a kernel thread for
+			 * this purpose. -jay
+			 */
+			if (cl_lock_nr_mutexed(env) == 2) {
+				cl_lock_mutex_put(env, child);
+				cl_lock_cancel(env, parent);
+				cl_lock_delete(env, parent);
+				result = 1;
+			}
+		}
+		break;
+	case CLS_HELD: /* report the lock, then fall through to LBUG() */
+		CL_LOCK_DEBUG(D_ERROR, env, parent, "Delete CLS_HELD lock\n");
+	default:
+		CERROR("Impossible state: %d\n", parent->cll_state);
+		LBUG();
+		break;
+	}
+
+	RETURN(result);
+}
+
+/**
+ * An implementation of cl_lock_operations::clo_delete() method. This is
+ * invoked in "bottom-to-top" delete, when lock destruction starts from the
+ * sub-lock (e.g., as a result of ldlm lock LRU policy).
+ */
+static void lovsub_lock_delete(const struct lu_env *env,
+			       const struct cl_lock_slice *slice)
+{
+	struct cl_lock     *child = slice->cls_lock;
+	struct lovsub_lock *sub   = cl2lovsub_lock(slice);
+	int restart;
+
+	LASSERT(cl_lock_is_mutexed(child));
+
+	ENTRY;
+	/*
+	 * Destruction of a sub-lock might take multiple iterations, because
+	 * when the last sub-lock of a given top-lock is deleted, the top-lock
+	 * is canceled proactively, and this requires releasing the sub-lock
+	 * mutex. Once the sub-lock mutex has been released, the list of its
+	 * parents has to be re-scanned from the beginning.
+	 */
+	do {
+		struct lov_lock      *lov;
+		struct lov_lock_link *scan;
+		struct lov_lock_link *temp;
+		struct lov_lock_sub  *subdata;
+
+		restart = 0;
+		list_for_each_entry_safe(scan, temp,
+					     &sub->lss_parents, lll_list) {
+			lov     = scan->lll_super;
+			subdata = &lov->lls_sub[scan->lll_idx];
+			lovsub_parent_lock(env, lov);
+			subdata->sub_got = subdata->sub_descr;
+			lov_lock_unlink(env, scan, sub);
+			restart = lovsub_lock_delete_one(env, child, lov);
+			lovsub_parent_unlock(env, lov);
+
+			if (restart) { /* the helper released child's mutex */
+				cl_lock_mutex_get(env, child);
+				break; /* re-scan the parents from the start */
+			}
+	       }
+	} while (restart);
+	EXIT;
+}
+
+static int lovsub_lock_print(const struct lu_env *env, void *cookie,
+			     lu_printer_t p, const struct cl_lock_slice *slice)
+{
+	struct lovsub_lock   *sublock = cl2lovsub_lock(slice);
+	struct lov_lock_link *link;
+
+	list_for_each_entry(link, &sublock->lss_parents, lll_list) {
+		struct lov_lock *top = link->lll_super;
+
+		p(env, cookie, "[%d %p ", link->lll_idx, top);
+		if (top != NULL) /* descriptor only when a parent is set */
+			cl_lock_descr_print(env, cookie, p,
+					    &top->lls_cl.cls_lock->cll_descr);
+		p(env, cookie, "] ");
+	}
+	return 0; /* one bracketed group was printed per parent link */
+}
+
+static const struct cl_lock_operations lovsub_lock_ops = {
+	.clo_state   = lovsub_lock_state,
+	.clo_modify  = lovsub_lock_modify,
+	.clo_closure = lovsub_lock_closure,
+	.clo_weigh   = lovsub_lock_weigh,
+	.clo_delete  = lovsub_lock_delete,
+	.clo_fini    = lovsub_lock_fini,
+	.clo_print   = lovsub_lock_print /* debugging output only */
+};
+
+/* Allocate the lovsub slice of @lock, with an empty parent list, and add it
+ * to the lock. @io is unused. Returns 0, or -ENOMEM if allocation fails. */
+int lovsub_lock_init(const struct lu_env *env, struct cl_object *obj,
+		     struct cl_lock *lock, const struct cl_io *io)
+{
+	struct lovsub_lock *sublock;
+
+	ENTRY;
+	OBD_SLAB_ALLOC_PTR_GFP(sublock, lovsub_lock_kmem, __GFP_IO);
+	if (sublock == NULL)
+		RETURN(-ENOMEM);
+
+	INIT_LIST_HEAD(&sublock->lss_parents);
+	cl_lock_slice_add(lock, &sublock->lss_cl, obj, &lovsub_lock_ops);
+	RETURN(0);
+}
+
+/** @} lov */

diff --git a/drivers/staging/lustre/lustre/lov/lovsub_object.c b/drivers/staging/lustre/lustre/lov/lovsub_object.c
new file mode 100644
index 0000000..1b83d90
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lov/lovsub_object.c

@@ -0,0 +1,170 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_object for LOVSUB layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include "lov_cl_internal.h"
+
+/** \addtogroup lov
+ *  @{
+ */
+
+/*****************************************************************************
+ *
+ * Lovsub object operations.
+ *
+ */
+
+/* loo_object_init(): have the next device allocate its object, add it below
+ * @obj and pass sizeof(struct lovsub_page) to cl_object_page_init(). @conf
+ * is unused. Returns 0, or -ENOMEM when the lower allocation fails.
+ */
+int lovsub_object_init(const struct lu_env *env, struct lu_object *obj,
+		       const struct lu_object_conf *conf)
+{
+	struct lovsub_device *dev = lu2lovsub_dev(obj->lo_dev);
+	struct lu_device     *next;
+	struct lu_object     *lower;
+
+	ENTRY;
+	next  = &dev->acid_next->cd_lu_dev;
+	lower = next->ld_ops->ldo_object_alloc(env, obj->lo_header, next);
+	if (lower == NULL)
+		RETURN(-ENOMEM);
+
+	lu_object_add(obj, lower);
+	cl_object_page_init(lu2cl(obj), sizeof(struct lovsub_page));
+	RETURN(0);
+}
+
+/* loo_object_free(): unhook the sub-object from its parent, then free it. */
+static void lovsub_object_free(const struct lu_env *env, struct lu_object *obj)
+{
+	struct lovsub_object *sub    = lu2lovsub(obj);
+	struct lov_object    *parent = sub->lso_super;
+	ENTRY;
+
+	/* The shadow object handling in lu_object_find means that the parent
+	 * may never have been assigned, so it has to be checked first. */
+	if (parent != NULL) {
+		LASSERT(parent->lo_type == LLT_RAID0);
+		LASSERT(parent->u.raid0.lo_sub[sub->lso_index] == sub);
+		spin_lock(&parent->u.raid0.lo_sub_lock);
+		parent->u.raid0.lo_sub[sub->lso_index] = NULL;
+		spin_unlock(&parent->u.raid0.lo_sub_lock);
+	}
+
+	lu_object_fini(obj);
+	lu_object_header_fini(&sub->lso_header.coh_lu);
+	OBD_SLAB_FREE_PTR(sub, lovsub_object_kmem);
+	EXIT;
+}
+
+/* loo_object_print(): emits lso_index of the sub-object as "[%d]" through
+ * printer @p and returns whatever the printer returns. */
+static int lovsub_object_print(const struct lu_env *env, void *cookie,
+			       lu_printer_t p, const struct lu_object *obj)
+{
+	return p(env, cookie, "[%d]", lu2lovsub(obj)->lso_index);
+}
+
+/* coo_attr_set(): @attr and @valid are not applied here; the call only
+ * clears lo_attr_valid behind the parent lov object. Always returns 0. */
+static int lovsub_attr_set(const struct lu_env *env, struct cl_object *obj,
+			   const struct cl_attr *attr, unsigned valid)
+{
+	ENTRY;
+	lov_r0(cl2lovsub(obj)->lso_super)->lo_attr_valid = 0;
+	RETURN(0);
+}
+
+/* coo_glimpse(): forwards the call, with the same @lvb, to the cl_object of
+ * the parent lov object and returns that result. */
+static int lovsub_object_glimpse(const struct lu_env *env,
+				 const struct cl_object *obj,
+				 struct ost_lvb *lvb)
+{
+	ENTRY;
+	RETURN(cl_object_glimpse(env, &cl2lovsub(obj)->lso_super->lo_cl, lvb));
+}
+
+
+
+static const struct cl_object_operations lovsub_ops = {
+	.coo_lock_init = lovsub_lock_init,
+	.coo_page_init = lovsub_page_init,
+	.coo_glimpse   = lovsub_object_glimpse, /* forwarded to the parent */
+	.coo_attr_set  = lovsub_attr_set
+};
+
+static const struct lu_object_operations lovsub_lu_obj_ops = {
+	.loo_object_init  = lovsub_object_init,
+	.loo_object_free  = lovsub_object_free,
+	.loo_object_print = lovsub_object_print
+	/* loo_object_delete, loo_object_release and loo_object_invariant
+	 * are not provided by this layer and stay NULL.
+	 */
+};
+
+/* ldo_object_alloc(): new lovsub object with embedded header; NULL if OOM. */
+struct lu_object *lovsub_object_alloc(const struct lu_env *env,
+				      const struct lu_object_header *unused,
+				      struct lu_device *dev)
+{
+	struct cl_object_header *header;
+	struct lovsub_object    *sub;
+	struct lu_object        *top;
+
+	ENTRY;
+	OBD_SLAB_ALLOC_PTR_GFP(sub, lovsub_object_kmem, __GFP_IO);
+	if (sub == NULL)
+		RETURN(NULL);
+
+	top    = lovsub2lu(sub);
+	header = &sub->lso_header;
+	cl_object_header_init(header);
+	lu_object_init(top, &header->coh_lu, dev);
+	lu_object_add_top(&header->coh_lu, top);
+	sub->lso_cl.co_ops = &lovsub_ops;
+	top->lo_ops = &lovsub_lu_obj_ops;
+	RETURN(top);
+}
+
+/** @} lov */

diff --git a/drivers/staging/lustre/lustre/lov/lovsub_page.c b/drivers/staging/lustre/lustre/lov/lovsub_page.c
new file mode 100644
index 0000000..bc9e683
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lov/lovsub_page.c

@@ -0,0 +1,72 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_page for LOVSUB layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include "lov_cl_internal.h"
+
+/** \addtogroup lov
+ *  @{
+ */
+
+/*****************************************************************************
+ *
+ * Lovsub page operations.
+ *
+ */
+
+static void lovsub_page_fini(const struct lu_env *env,
+			     struct cl_page_slice *slice)
+{ /* intentionally empty: this function performs no work for the slice */
+}
+
+static const struct cl_page_operations lovsub_page_ops = {
+	.cpo_fini = lovsub_page_fini, /* no-op: see the empty body above */
+};
+
+/* coo_page_init(): add the lovsub slice of @page to it; always returns 0. */
+int lovsub_page_init(const struct lu_env *env, struct cl_object *obj,
+			struct cl_page *page, struct page *unused)
+{
+	struct lovsub_page *page_slice = cl_object_page_slice(obj, page);
+	ENTRY;
+	cl_page_slice_add(page, &page_slice->lsb_cl, obj, &lovsub_page_ops);
+	RETURN(0);
+}
+
+/** @} lov */

diff --git a/drivers/staging/lustre/lustre/lov/lproc_lov.c b/drivers/staging/lustre/lustre/lov/lproc_lov.c
new file mode 100644
index 0000000..5b2c0d8
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lov/lproc_lov.c

@@ -0,0 +1,302 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/version.h>
+#include <asm/statfs.h>
+#include <lprocfs_status.h>
+#include <obd_class.h>
+#include <linux/seq_file.h>
+#include "lov_internal.h"
+
+#ifdef LPROCFS
+/* seq_file show handler: print the LOV default stripe size. */
+static int lov_stripesize_seq_show(struct seq_file *m, void *v)
+{
+	struct obd_device *dev = m->private;
+
+	LASSERT(dev != NULL);
+
+	return seq_printf(m, LPU64"\n",
+			  dev->u.lov.desc.ld_default_stripe_size);
+}
+
+/*
+ * Write handler: parse a 64-bit value from @buffer, pass it through
+ * lov_fix_desc_stripe_size() and store it as the default stripe size.
+ * Returns @count on success or the parser's error code.
+ */
+static ssize_t lov_stripesize_seq_write(struct file *file, const char *buffer,
+				    size_t count, loff_t *off)
+{
+	struct seq_file *m = file->private_data;
+	struct obd_device *dev = m->private;
+	__u64 val;
+	int rc;
+
+	LASSERT(dev != NULL);
+
+	rc = lprocfs_write_u64_helper(buffer, count, &val);
+	if (rc != 0)
+		return rc;
+
+	lov_fix_desc_stripe_size(&val);
+	dev->u.lov.desc.ld_default_stripe_size = val;
+
+	return count;
+}
+LPROC_SEQ_FOPS(lov_stripesize);
+
+/* seq_file show handler: print the LOV default stripe offset. */
+static int lov_stripeoffset_seq_show(struct seq_file *m, void *v)
+{
+	struct obd_device *dev = m->private;
+
+	LASSERT(dev != NULL);
+
+	return seq_printf(m, LPU64"\n",
+			  dev->u.lov.desc.ld_default_stripe_offset);
+}
+
+/*
+ * Write handler: parse a 64-bit value from @buffer and store it unchanged
+ * as the default stripe offset.  Returns @count on success or the parser's
+ * error code.
+ */
+static ssize_t lov_stripeoffset_seq_write(struct file *file, const char *buffer,
+				      size_t count, loff_t *off)
+{
+	struct seq_file *m = file->private_data;
+	struct obd_device *dev = m->private;
+	__u64 val;
+	int rc;
+
+	LASSERT(dev != NULL);
+
+	rc = lprocfs_write_u64_helper(buffer, count, &val);
+	if (rc != 0)
+		return rc;
+
+	dev->u.lov.desc.ld_default_stripe_offset = val;
+
+	return count;
+}
+LPROC_SEQ_FOPS(lov_stripeoffset);
+
+/* seq_file show handler: print the LOV default striping pattern. */
+static int lov_stripetype_seq_show(struct seq_file *m, void *v)
+{
+	struct obd_device *dev = m->private;
+
+	LASSERT(dev != NULL);
+
+	return seq_printf(m, "%u\n", dev->u.lov.desc.ld_pattern);
+}
+
+/*
+ * Write handler: parse an integer from @buffer, pass it through
+ * lov_fix_desc_pattern() and store it as the default pattern.
+ * Returns @count on success or the parser's error code.
+ */
+static ssize_t lov_stripetype_seq_write(struct file *file, const char *buffer,
+				    size_t count, loff_t *off)
+{
+	struct seq_file *m = file->private_data;
+	struct obd_device *dev = m->private;
+	int val;
+	int rc;
+
+	LASSERT(dev != NULL);
+
+	rc = lprocfs_write_helper(buffer, count, &val);
+	if (rc != 0)
+		return rc;
+
+	lov_fix_desc_pattern(&val);
+	dev->u.lov.desc.ld_pattern = val;
+
+	return count;
+}
+LPROC_SEQ_FOPS(lov_stripetype);
+
+/*
+ * seq_file show handler: print the LOV default stripe count.  The value is
+ * passed through a (__s16) +1/-1 round trip before being printed as a
+ * signed number.
+ */
+static int lov_stripecount_seq_show(struct seq_file *m, void *v)
+{
+	struct obd_device *dev = m->private;
+
+	LASSERT(dev != NULL);
+
+	return seq_printf(m, "%d\n",
+			(__s16)(dev->u.lov.desc.ld_default_stripe_count + 1) - 1);
+}
+
+/*
+ * Write handler: parse an integer from @buffer, pass it through
+ * lov_fix_desc_stripe_count() and store it as the default stripe count.
+ * Returns @count on success or the parser's error code.
+ */
+static ssize_t lov_stripecount_seq_write(struct file *file, const char *buffer,
+				     size_t count, loff_t *off)
+{
+	struct seq_file *m = file->private_data;
+	struct obd_device *dev = m->private;
+	int val;
+	int rc;
+
+	LASSERT(dev != NULL);
+
+	rc = lprocfs_write_helper(buffer, count, &val);
+	if (rc != 0)
+		return rc;
+
+	lov_fix_desc_stripe_count(&val);
+	dev->u.lov.desc.ld_default_stripe_count = val;
+
+	return count;
+}
+LPROC_SEQ_FOPS(lov_stripecount);
+
+/* seq_file show handler: print the LOV target count (ld_tgt_count). */
+static int lov_numobd_seq_show(struct seq_file *m, void *v)
+{
+	struct obd_device *dev = m->private;
+
+	LASSERT(dev != NULL);
+
+	return seq_printf(m, "%u\n", dev->u.lov.desc.ld_tgt_count);
+}
+LPROC_SEQ_FOPS_RO(lov_numobd);
+
+/* seq_file show handler: print the active target count. */
+static int lov_activeobd_seq_show(struct seq_file *m, void *v)
+{
+	struct obd_device *dev = m->private;
+
+	LASSERT(dev != NULL);
+
+	return seq_printf(m, "%u\n", dev->u.lov.desc.ld_active_tgt_count);
+}
+LPROC_SEQ_FOPS_RO(lov_activeobd);
+
+/* seq_file show handler: print the UUID stored in the LOV descriptor. */
+static int lov_desc_uuid_seq_show(struct seq_file *m, void *v)
+{
+	struct obd_device *dev = m->private;
+
+	LASSERT(dev != NULL);
+
+	return seq_printf(m, "%s\n", dev->u.lov.desc.ld_uuid.uuid);
+}
+LPROC_SEQ_FOPS_RO(lov_desc_uuid);
+
+/*
+ * seq_file ->start: advance *pos to the first non-NULL target slot at or
+ * after *pos and return it, or NULL once ld_tgt_count is reached.
+ */
+static void *lov_tgt_seq_start(struct seq_file *p, loff_t *pos)
+{
+	struct obd_device *dev = p->private;
+	struct lov_obd *lov = &dev->u.lov;
+
+	for (; *pos < lov->desc.ld_tgt_count; ++*pos)
+		if (lov->lov_tgts[*pos])
+			return lov->lov_tgts[*pos];
+
+	return NULL;
+}
+
+/* seq_file ->stop: nothing to undo, ->start acquires nothing. */
+static void lov_tgt_seq_stop(struct seq_file *p, void *v)
+{
+	/* intentionally empty */
+}
+
+/*
+ * seq_file ->next: step *pos past the current entry and return the next
+ * non-NULL target slot, or NULL once ld_tgt_count is reached.
+ */
+static void *lov_tgt_seq_next(struct seq_file *p, void *v, loff_t *pos)
+{
+	struct obd_device *dev = p->private;
+	struct lov_obd *lov = &dev->u.lov;
+
+	for (++*pos; *pos < lov->desc.ld_tgt_count; ++*pos)
+		if (lov->lov_tgts[*pos])
+			return lov->lov_tgts[*pos];
+
+	return NULL;
+}
+
+/*
+ * seq_file ->show: emit one line per target in the form
+ * "<index>: <uuid> ACTIVE" or "<index>: <uuid> INACTIVE".
+ */
+static int lov_tgt_seq_show(struct seq_file *p, void *v)
+{
+	struct lov_tgt_desc *tgt = v;
+	const char *prefix = tgt->ltd_active ? "" : "IN";
+
+	return seq_printf(p, "%d: %s %sACTIVE\n", tgt->ltd_index,
+			  obd_uuid2str(&tgt->ltd_uuid), prefix);
+}
+
+/* Iterator callbacks for the LOV target listing. */
+struct seq_operations lov_tgt_sops = {
+	.start	= lov_tgt_seq_start,
+	.next	= lov_tgt_seq_next,
+	.stop	= lov_tgt_seq_stop,
+	.show	= lov_tgt_seq_show,
+};
+
+/*
+ * ->open for the target listing: set up the seq_file with lov_tgt_sops and
+ * point its private data at the value returned by PDE_DATA(inode).
+ * Returns 0 on success or the seq_open() error.
+ */
+static int lov_target_seq_open(struct inode *inode, struct file *file)
+{
+	int rc = seq_open(file, &lov_tgt_sops);
+
+	if (rc == 0) {
+		struct seq_file *seq = file->private_data;
+
+		seq->private = PDE_DATA(inode);
+	}
+	return rc;
+}
+
+/* Read-only fops instantiated by macro for use in lprocfs_lov_obd_vars. */
+LPROC_SEQ_FOPS_RO_TYPE(lov, uuid);
+LPROC_SEQ_FOPS_RO_TYPE(lov, filestotal);
+LPROC_SEQ_FOPS_RO_TYPE(lov, filesfree);
+LPROC_SEQ_FOPS_RO_TYPE(lov, blksize);
+LPROC_SEQ_FOPS_RO_TYPE(lov, kbytestotal);
+LPROC_SEQ_FOPS_RO_TYPE(lov, kbytesfree);
+LPROC_SEQ_FOPS_RO_TYPE(lov, kbytesavail);
+
+/*
+ * Per-device entries: name and file operations.  The trailing initialisers
+ * are spelled out as zero on every row; the table ends with an all-zero
+ * entry.
+ */
+struct lprocfs_vars lprocfs_lov_obd_vars[] = {
+	{ "uuid",         &lov_uuid_fops,         0, 0 },
+	{ "stripesize",   &lov_stripesize_fops,   0, 0 },
+	{ "stripeoffset", &lov_stripeoffset_fops, 0, 0 },
+	{ "stripecount",  &lov_stripecount_fops,  0, 0 },
+	{ "stripetype",   &lov_stripetype_fops,   0, 0 },
+	{ "numobd",       &lov_numobd_fops,       0, 0 },
+	{ "activeobd",    &lov_activeobd_fops,    0, 0 },
+	{ "filestotal",   &lov_filestotal_fops,   0, 0 },
+	{ "filesfree",    &lov_filesfree_fops,    0, 0 },
+	/*{ "filegroups", lprocfs_rd_filegroups,  0, 0 },*/
+	{ "blocksize",    &lov_blksize_fops,      0, 0 },
+	{ "kbytestotal",  &lov_kbytestotal_fops,  0, 0 },
+	{ "kbytesfree",   &lov_kbytesfree_fops,   0, 0 },
+	{ "kbytesavail",  &lov_kbytesavail_fops,  0, 0 },
+	{ "desc_uuid",    &lov_desc_uuid_fops,    0, 0 },
+	{ 0 }
+};
+
+/* Read-only fops instantiated by macro for the module-level table. */
+LPROC_SEQ_FOPS_RO_TYPE(lov, numrefs);
+
+/* Module-level entries; the table ends with an all-zero entry. */
+static struct lprocfs_vars lprocfs_lov_module_vars[] = {
+	{ "num_refs", &lov_numrefs_fops, 0, 0 },
+	{ 0 }
+};
+
+/* Hand the LOV per-device and module-level tables to the caller via @lvars. */
+void lprocfs_lov_init_vars(struct lprocfs_static_vars *lvars)
+{
+	lvars->obd_vars = lprocfs_lov_obd_vars;
+	lvars->module_vars = lprocfs_lov_module_vars;
+}
+
+/* File operations for the seq_file based LOV target listing. */
+struct file_operations lov_proc_target_fops = {
+	.owner		= THIS_MODULE,
+	.open		= lov_target_seq_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= lprocfs_seq_release,
+};
+#endif /* LPROCFS */

diff --git a/drivers/staging/lustre/lustre/lvfs/Makefile b/drivers/staging/lustre/lustre/lvfs/Makefile
new file mode 100644
index 0000000..f50b1c5
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lvfs/Makefile

@@ -0,0 +1,6 @@
+# Build lvfs.o according to CONFIG_LUSTRE_FS.
+obj-$(CONFIG_LUSTRE_FS) += lvfs.o
+
+# Objects linked into lvfs.o.
+lvfs-y := lvfs_linux.o fsfilt.o lvfs_lib.o
+
+
+# Add ../include (relative to this directory) to the header search path.
+ccflags-y := -I$(src)/../include

diff --git a/drivers/staging/lustre/lustre/lvfs/fsfilt.c b/drivers/staging/lustre/lustre/lvfs/fsfilt.c
new file mode 100644
index 0000000..064445c
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lvfs/fsfilt.c

@@ -0,0 +1,138 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_FILTER
+
+#include <linux/fs.h>
+#include <linux/jbd.h>
+#include <linux/module.h>
+#include <linux/kmod.h>
+#include <linux/slab.h>
+#include <linux/libcfs/libcfs.h>
+#include <lustre_fsfilt.h>
+
+/* Registered fsfilt backends, linked through their fs_list member. */
+LIST_HEAD(fsfilt_types);
+
+/*
+ * Return the registered operations whose fs_type equals @type, or NULL if
+ * none is registered.  No locking is performed here.
+ */
+static struct fsfilt_operations *fsfilt_search_type(const char *type)
+{
+	struct fsfilt_operations *ops;
+
+	list_for_each_entry(ops, &fsfilt_types, fs_list)
+		if (strcmp(ops->fs_type, type) == 0)
+			return ops;
+
+	return NULL;
+}
+
+/*
+ * Add @fs_ops to the list of known backends.
+ *
+ * Registering the same structure twice is a no-op.  A different structure
+ * with an already registered fs_type is rejected with -EEXIST.  A new
+ * registration also takes a reference on this module.
+ */
+int fsfilt_register_ops(struct fsfilt_operations *fs_ops)
+{
+	struct fsfilt_operations *found;
+
+	/* lock fsfilt_types list */
+	found = fsfilt_search_type(fs_ops->fs_type);
+	if (found == NULL) {
+		try_module_get(THIS_MODULE);
+		list_add(&fs_ops->fs_list, &fsfilt_types);
+	} else if (found != fs_ops) {
+		CERROR("different operations for type %s\n",
+		       fs_ops->fs_type);
+		/* unlock fsfilt_types list */
+		RETURN(-EEXIST);
+	}
+
+	/* unlock fsfilt_types list */
+	return 0;
+}
+EXPORT_SYMBOL(fsfilt_register_ops);
+
+/*
+ * Remove @fs_ops from the list of known backends and drop the module
+ * reference taken at registration.  Does nothing if @fs_ops is not on the
+ * list.
+ */
+void fsfilt_unregister_ops(struct fsfilt_operations *fs_ops)
+{
+	struct fsfilt_operations *found;
+
+	/* lock fsfilt_types list */
+	list_for_each_entry(found, &fsfilt_types, fs_list) {
+		if (found != fs_ops)
+			continue;
+
+		list_del(&found->fs_list);
+		module_put(THIS_MODULE);
+		break;
+	}
+	/* unlock fsfilt_types list */
+}
+EXPORT_SYMBOL(fsfilt_unregister_ops);
+
+/*
+ * Look up the backend registered for @type, loading module
+ * "fsfilt_<type>" on demand, and pin its owning module.
+ *
+ * Returns the operations structure with a module reference held (release
+ * it with fsfilt_put_ops()), or an ERR_PTR():
+ *   - the negated request_module() result if loading failed,
+ *   - -ENOENT if the module loaded but did not register @type,
+ *   - -ENODEV if the owning module could not be pinned.
+ */
+struct fsfilt_operations *fsfilt_get_ops(const char *type)
+{
+	struct fsfilt_operations *fs_ops;
+
+	/* lock fsfilt_types list */
+	fs_ops = fsfilt_search_type(type);
+	if (fs_ops == NULL) {
+		char name[32];
+		int rc;
+
+		/* snprintf() NUL-terminates within the size it is given */
+		snprintf(name, sizeof(name), "fsfilt_%s", type);
+
+		rc = request_module("%s", name);
+		if (rc == 0) {
+			fs_ops = fsfilt_search_type(type);
+			CDEBUG(D_INFO, "Loaded module '%s'\n", name);
+			if (fs_ops == NULL)
+				rc = -ENOENT;
+		}
+
+		if (rc != 0) {
+			CERROR("Can't find %s interface\n", name);
+			/* unlock fsfilt_types list */
+			RETURN(ERR_PTR(rc < 0 ? rc : -rc));
+		}
+	}
+
+	/*
+	 * Do not hand out operations whose module could not be pinned:
+	 * the matching fsfilt_put_ops() would otherwise drop a reference
+	 * that was never taken.
+	 */
+	if (!try_module_get(fs_ops->fs_owner)) {
+		CERROR("Can't get reference on fsfilt_%s module\n", type);
+		/* unlock fsfilt_types list */
+		RETURN(ERR_PTR(-ENODEV));
+	}
+	/* unlock fsfilt_types list */
+
+	return fs_ops;
+}
+EXPORT_SYMBOL(fsfilt_get_ops);
+
+/* Drop the module reference taken by fsfilt_get_ops() for @fs_ops. */
+void fsfilt_put_ops(struct fsfilt_operations *fs_ops)
+{
+	struct module *owner = fs_ops->fs_owner;
+
+	module_put(owner);
+}
+EXPORT_SYMBOL(fsfilt_put_ops);

diff --git a/drivers/staging/lustre/lustre/lvfs/fsfilt_ext3.c b/drivers/staging/lustre/lustre/lvfs/fsfilt_ext3.c
new file mode 100644
index 0000000..c1e99b3
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lvfs/fsfilt_ext3.c

@@ -0,0 +1,761 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lvfs/fsfilt_ext3.c
+ *
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_FILTER
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <ldiskfs/ldiskfs_config.h>
+#include <ext4/ext4.h>
+#include <ext4/ext4_jbd2.h>
+#include <linux/version.h>
+#include <linux/bitops.h>
+#include <linux/quota.h>
+
+#include <linux/libcfs/libcfs.h>
+#include <lustre_fsfilt.h>
+#include <obd.h>
+#include <linux/lustre_compat25.h>
+#include <linux/lprocfs_status.h>
+
+#include <ext4/ext4_extents.h>
+
+#ifdef HAVE_EXT_PBLOCK /* Name changed to ext4_ext_pblock for kernel 2.6.35 */
+#define ext3_ext_pblock(ex) ext_pblock((ex))
+#endif
+
+/* for kernels 2.6.18 and later */
+#define FSFILT_SINGLEDATA_TRANS_BLOCKS(sb) EXT3_SINGLEDATA_TRANS_BLOCKS(sb)
+
+/*
+ * Local aliases used by the code below.  NOTE(review): the ext3_* names
+ * they expand to are not defined in this file; presumably a compat header
+ * supplies them -- confirm.
+ */
+#define fsfilt_ext3_ext_insert_extent(handle, inode, path, newext, flag) \
+	       ext3_ext_insert_extent(handle, inode, path, newext, flag)
+
+#define ext3_mb_discard_inode_preallocations(inode) \
+		 ext3_discard_preallocations(inode)
+
+/* Journal commit helpers map straight onto the jbd2 functions. */
+#define fsfilt_log_start_commit(journal, tid) jbd2_log_start_commit(journal, tid)
+#define fsfilt_log_wait_commit(journal, tid) jbd2_log_wait_commit(journal, tid)
+
+/* Slab cache sized for struct fsfilt_cb_data; see fsfilt_ext3_init(). */
+static struct kmem_cache *fcb_cache;
+
+/* Per-callback record; the journal callback entry must be the first member. */
+struct fsfilt_cb_data {
+	struct ext4_journal_cb_entry cb_jcb; /* private data - MUST BE FIRST */
+	fsfilt_cb_t cb_func;	    /* MDS/OBD completion function */
+	struct obd_device *cb_obd;      /* MDS/OBD completion device */
+	__u64 cb_last_rcvd;	     /* MDS/OST last committed operation */
+	void *cb_data;		  /* MDS/OST completion function data */
+};
+
+/* Return the volume name stored in the superblock of @sb. */
+static char *fsfilt_ext3_get_label(struct super_block *sb)
+{
+	char *label = EXT3_SB(sb)->s_es->s_volume_name;
+
+	return label;
+}
+
+/* kernel has ext4_blocks_for_truncate since linux-3.1.1 */
+# include <ext4/truncate.h>
+
+/*
+ * Start (or join) a journal transaction on @inode for operation @op.
+ *
+ * We don't currently need any additional blocks for rmdir and
+ * unlink transactions because we are storing the OST oa_id inside
+ * the inode (which we will be changing anyways as part of this
+ * transaction).
+ *
+ * @logs scales the credit estimate for FSFILT_OP_UNLINK and must be 1 for
+ * FSFILT_OP_CANCEL_UNLINK.  Any other @op is a fatal error (LBUG).
+ *
+ * Returns the value of ext3_journal_start(), which may be an ERR_PTR();
+ * the caller is expected to finish it with fsfilt_ext3_commit().
+ */
+static void *fsfilt_ext3_start(struct inode *inode, int op, void *desc_private,
+			       int logs)
+{
+	/* For updates to the last received file */
+	int nblocks = FSFILT_SINGLEDATA_TRANS_BLOCKS(inode->i_sb);
+	journal_t *journal;
+	void *handle;
+
+	/*
+	 * A handle is already active for this task: skip the per-op credit
+	 * computation and start with the base estimate only.
+	 */
+	if (current->journal_info) {
+		CDEBUG(D_INODE, "increasing refcount on %p\n",
+		       current->journal_info);
+		goto journal_start;
+	}
+
+	switch(op) {
+	case FSFILT_OP_UNLINK:
+		/* delete one file + create/update logs for each stripe */
+		nblocks += EXT3_DELETE_TRANS_BLOCKS(inode->i_sb);
+		nblocks += (EXT3_INDEX_EXTRA_TRANS_BLOCKS +
+			    FSFILT_SINGLEDATA_TRANS_BLOCKS(inode->i_sb)) * logs;
+		break;
+	case FSFILT_OP_CANCEL_UNLINK:
+		LASSERT(logs == 1);
+
+		/* blocks for log header bitmap update OR
+		 * blocks for catalog header bitmap update + unlink of logs +
+		 * blocks for delete the inode (include blocks truncating). */
+		/* note: this replaces, rather than adds to, the base estimate */
+		nblocks = (LLOG_CHUNK_SIZE >> inode->i_blkbits) +
+			  EXT3_DELETE_TRANS_BLOCKS(inode->i_sb) +
+			  ext4_blocks_for_truncate(inode) + 3;
+		break;
+	default: CERROR("unknown transaction start op %d\n", op);
+		LBUG();
+	}
+
+	LASSERT(current->journal_info == desc_private);
+	journal = EXT3_SB(inode->i_sb)->s_journal;
+	/* clamp the request to what a single transaction can hold */
+	if (nblocks > journal->j_max_transaction_buffers) {
+		CWARN("too many credits %d for op %ux%u using %d instead\n",
+		       nblocks, op, logs, journal->j_max_transaction_buffers);
+		nblocks = journal->j_max_transaction_buffers;
+	}
+
+ journal_start:
+	LASSERTF(nblocks > 0, "can't start %d credit transaction\n", nblocks);
+	handle = ext3_journal_start(inode, nblocks);
+
+	if (!IS_ERR(handle))
+		LASSERT(current->journal_info == handle);
+	else
+		CERROR("error starting handle for op %u (%u credits): rc %ld\n",
+		       op, nblocks, PTR_ERR(handle));
+	return handle;
+}
+
+/*
+ * Stop the journal handle @h, which must be the current task's handle.
+ * With @force_sync set the handle is marked synchronous first.
+ * Returns the result of ext3_journal_stop().
+ */
+static int fsfilt_ext3_commit(struct inode *inode, void *h, int force_sync)
+{
+	handle_t *handle = h;
+
+	LASSERT(current->journal_info == handle);
+
+	if (force_sync)
+		handle->h_sync = 1; /* recovery likes this */
+
+	return ext3_journal_stop(handle);
+}
+
+#ifndef EXT3_EXTENTS_FL
+#define EXT3_EXTENTS_FL		 0x00080000 /* Inode uses extents */
+#endif
+
+#ifndef EXT_ASSERT
+#define EXT_ASSERT(cond)  BUG_ON(!(cond))
+#endif
+
+/* The extent "base" is the inode itself in this code. */
+#define EXT_GENERATION(inode)	   (EXT4_I(inode)->i_ext_generation)
+#define ext3_ext_base		   inode
+#define ext3_ext_base2inode(inode)      (inode)
+#define EXT_DEPTH(inode)		ext_depth(inode)
+/*
+ * No trailing semicolon: the macro is used as an expression, e.g.
+ * "err = fsfilt_ext3_ext_walk_space(...);".
+ */
+#define fsfilt_ext3_ext_walk_space(inode, block, num, cb, cbdata) \
+			ext3_ext_walk_space(inode, block, num, cb, cbdata)
+
+/*
+ * Cursor shared between fsfilt_map_nblocks() and its extent-walk callback
+ * while a range of logical blocks is being mapped.
+ */
+struct bpointers {
+	unsigned long *blocks;	/* next slot of the output array to fill */
+	unsigned long start;	/* next logical block to map */
+	int num;		/* blocks still to be mapped */
+	int init_num;		/* number of blocks originally requested */
+	int create;		/* non-zero: allocate blocks for holes */
+};
+
+/*
+ * Pick a goal (preferred physical block) for allocating logical @block.
+ *
+ * Preference order: a block derived from the extent at the leaf of @path,
+ * then the block number of the leaf's buffer, and finally a position
+ * inside the inode's block group, offset by a pid-derived colour.
+ * @aflags is not used.
+ */
+static long ext3_ext_find_goal(struct inode *inode, struct ext3_ext_path *path,
+			       unsigned long block, int *aflags)
+{
+	struct ext3_inode_info *ei = EXT3_I(inode);
+	unsigned long bg_start;
+	unsigned long colour;
+
+	if (path != NULL) {
+		struct ext3_ext_path *leaf = &path[path->p_depth];
+		struct ext3_extent *ex = leaf->p_ext;
+
+		/* try to predict block placement */
+		if (ex != NULL)
+			return ext4_ext_pblock(ex) + (block - le32_to_cpu(ex->ee_block));
+
+		/* the index looks empty:
+		 * try to find a goal starting from the index itself */
+		if (leaf->p_bh != NULL)
+			return leaf->p_bh->b_blocknr;
+	}
+
+	/* fall back to the inode's block group */
+	bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) +
+		le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block);
+	colour = (current->pid % 16) *
+		(EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16);
+
+	return bg_start + colour + block;
+}
+
+/* Drop stale block-device metadata cached for @blocknr on @sb's device. */
+#define ll_unmap_underlying_metadata(sb, blocknr) \
+	unmap_underlying_metadata((sb)->s_bdev, blocknr)
+
+/*
+ * new_blocks(): allocate up to *count physical blocks for logical @block.
+ *
+ * Two variants exist, selected by whether EXT3_MB_HINT_GROUP_ALLOC is
+ * defined; they differ in the ext3_mb_new_blocks() calling convention.
+ * Both return the value of ext3_mb_new_blocks() and report errors through
+ * *err.
+ */
+#ifndef EXT3_MB_HINT_GROUP_ALLOC
+static unsigned long new_blocks(handle_t *handle, struct ext3_ext_base *base,
+				struct ext3_ext_path *path, unsigned long block,
+				unsigned long *count, int *err)
+{
+	unsigned long pblock, goal;
+	int aflags = 0;
+	struct inode *inode = ext3_ext_base2inode(base);
+
+	goal = ext3_ext_find_goal(inode, path, block, &aflags);
+	aflags |= 2; /* blocks have already been reserved */
+	pblock = ext3_mb_new_blocks(handle, inode, goal, count, aflags, err);
+	return pblock;
+
+}
+#else
+static unsigned long new_blocks(handle_t *handle, struct ext3_ext_base *base,
+				struct ext3_ext_path *path, unsigned long block,
+				unsigned long *count, int *err)
+{
+	struct inode *inode = ext3_ext_base2inode(base);
+	struct ext3_allocation_request ar;
+	unsigned long pblock;
+	int aflags;	/* passed to ext3_ext_find_goal(), which ignores it */
+
+	/* find neighbour allocated blocks */
+	ar.lleft = block;
+	*err = ext3_ext_search_left(base, path, &ar.lleft, &ar.pleft);
+	if (*err)
+		return 0;
+	ar.lright = block;
+	*err = ext3_ext_search_right(base, path, &ar.lright, &ar.pright);
+	if (*err)
+		return 0;
+
+	/* allocate new block */
+	ar.goal = ext3_ext_find_goal(inode, path, block, &aflags);
+	ar.inode = inode;
+	ar.logical = block;
+	ar.len = *count;
+	ar.flags = EXT3_MB_HINT_DATA;
+	pblock = ext3_mb_new_blocks(handle, &ar, err);
+	/* report back how many blocks were actually allocated */
+	*count = ar.len;
+	return pblock;
+}
+#endif
+
+/*
+ * Extent-walk callback used by fsfilt_map_nblocks().
+ *
+ * @cex describes the range currently visited by the walk and @cbdata is
+ * the struct bpointers cursor.  Three cases are handled:
+ *   - @cex is an existing extent: its physical blocks are copied into
+ *     bp->blocks;
+ *   - @cex is a hole and bp->create is 0: zeroes are stored for the blocks
+ *     of the hole and EXT_CONTINUE is returned;
+ *   - @cex is a hole and bp->create is set: blocks are allocated, inserted
+ *     as a new extent under i_data_sem inside a fresh journal handle, and
+ *     then mapped like an existing extent.
+ *
+ * Returns EXT_CONTINUE, EXT_REPEAT (the tree changed, retry), a negative
+ * error from starting the journal handle, or the status of the
+ * allocation/insertion.
+ */
+static int ext3_ext_new_extent_cb(struct ext3_ext_base *base,
+				  struct ext3_ext_path *path,
+				  struct ext3_ext_cache *cex,
+#ifdef HAVE_EXT_PREPARE_CB_EXTENT
+				   struct ext3_extent *ex,
+#endif
+				  void *cbdata)
+{
+	struct bpointers *bp = cbdata;
+	struct inode *inode = ext3_ext_base2inode(base);
+	struct ext3_extent nex;
+	unsigned long pblock;
+	unsigned long tgen;
+	int err, i;
+	unsigned long count;
+	handle_t *handle;
+
+	/* an already allocated extent only needs to be mapped */
+#ifdef EXT3_EXT_CACHE_EXTENT
+	if (cex->ec_type == EXT3_EXT_CACHE_EXTENT)
+#else
+	if ((cex->ec_len != 0) && (cex->ec_start != 0))
+#endif
+						   {
+		err = EXT_CONTINUE;
+		goto map;
+	}
+
+	/* hole, and the caller did not ask for allocation: report zeroes */
+	if (bp->create == 0) {
+		i = 0;
+		if (cex->ec_block < bp->start)
+			i = bp->start - cex->ec_block;
+		if (i >= cex->ec_len)
+			CERROR("nothing to do?! i = %d, e_num = %u\n",
+					i, cex->ec_len);
+		for (; i < cex->ec_len && bp->num; i++) {
+			*(bp->blocks) = 0;
+			bp->blocks++;
+			bp->num--;
+			bp->start++;
+		}
+
+		return EXT_CONTINUE;
+	}
+
+	/* remember the tree generation before starting the handle */
+	tgen = EXT_GENERATION(base);
+	count = ext3_ext_calc_credits_for_insert(base, path);
+
+	handle = ext3_journal_start(inode, count+EXT3_ALLOC_NEEDED+1);
+	if (IS_ERR(handle)) {
+		return PTR_ERR(handle);
+	}
+
+	if (tgen != EXT_GENERATION(base)) {
+		/* the tree has changed, so path may be invalid at this point */
+		ext3_journal_stop(handle);
+		return EXT_REPEAT;
+	}
+
+	/* In 2.6.32 kernel, ext4_ext_walk_space()'s callback func is not
+	 * protected by i_data_sem as a whole, so we patch it to store the
+	 * generation in path and now verify the tree hasn't changed */
+	down_write((&EXT4_I(inode)->i_data_sem));
+
+	/* validate extent, make sure the extent tree has not changed */
+	if (EXT_GENERATION(base) != path[0].p_generation) {
+		/* cex is invalid, try again */
+		up_write(&EXT4_I(inode)->i_data_sem);
+		ext3_journal_stop(handle);
+		return EXT_REPEAT;
+	}
+
+	count = cex->ec_len;
+	pblock = new_blocks(handle, base, path, cex->ec_block, &count, &err);
+	if (!pblock)
+		goto out;
+	EXT_ASSERT(count <= cex->ec_len);
+
+	/* insert new extent */
+	nex.ee_block = cpu_to_le32(cex->ec_block);
+	ext3_ext_store_pblock(&nex, pblock);
+	nex.ee_len = cpu_to_le16(count);
+	err = fsfilt_ext3_ext_insert_extent(handle, base, path, &nex, 0);
+	if (err) {
+		/* free data blocks we just allocated */
+		/* not a good idea to call discard here directly,
+		 * but otherwise we'd need to call it every free() */
+#ifdef EXT3_MB_HINT_GROUP_ALLOC
+		ext3_mb_discard_inode_preallocations(inode);
+#endif
+		/* ee_len is stored little-endian; convert it back to a count */
+#ifdef HAVE_EXT_FREE_BLOCK_WITH_BUFFER_HEAD /* Introduced in 2.6.32-rc7 */
+		ext3_free_blocks(handle, inode, NULL, ext4_ext_pblock(&nex),
+				 le16_to_cpu(nex.ee_len), 0);
+#else
+		ext3_free_blocks(handle, inode, ext4_ext_pblock(&nex),
+				 le16_to_cpu(nex.ee_len), 0);
+#endif
+		goto out;
+	}
+
+	/*
+	 * By storing the length of the extent we just inserted,
+	 * we ask ext3_ext_walk_space() to continue
+	 * scanning after that block
+	 */
+	cex->ec_len = le16_to_cpu(nex.ee_len);
+	cex->ec_start = ext4_ext_pblock(&nex);
+	BUG_ON(le16_to_cpu(nex.ee_len) == 0);
+	BUG_ON(le32_to_cpu(nex.ee_block) != cex->ec_block);
+
+out:
+	up_write((&EXT4_I(inode)->i_data_sem));
+	ext3_journal_stop(handle);
+map:
+	if (err >= 0) {
+		/* map blocks */
+		if (bp->num == 0) {
+			CERROR("hmm. why do we find this extent?\n");
+			CERROR("initial space: %lu:%u\n",
+				bp->start, bp->init_num);
+#ifdef EXT3_EXT_CACHE_EXTENT
+			CERROR("current extent: %u/%u/%llu %d\n",
+				cex->ec_block, cex->ec_len,
+				(unsigned long long)cex->ec_start,
+				cex->ec_type);
+#else
+			CERROR("current extent: %u/%u/%llu\n",
+				cex->ec_block, cex->ec_len,
+				(unsigned long long)cex->ec_start);
+#endif
+		}
+		/* skip the part of the extent that precedes the cursor */
+		i = 0;
+		if (cex->ec_block < bp->start)
+			i = bp->start - cex->ec_block;
+		if (i >= cex->ec_len)
+			CERROR("nothing to do?! i = %d, e_num = %u\n",
+					i, cex->ec_len);
+		for (; i < cex->ec_len && bp->num; i++) {
+			*(bp->blocks) = cex->ec_start + i;
+#ifdef EXT3_EXT_CACHE_EXTENT
+			if (cex->ec_type != EXT3_EXT_CACHE_EXTENT)
+#else
+			if ((cex->ec_len == 0) || (cex->ec_start == 0))
+#endif
+									{
+				/* unmap any possible underlying metadata from
+				 * the block device mapping.  bug 6998. */
+				ll_unmap_underlying_metadata(inode->i_sb,
+							     *(bp->blocks));
+			}
+			bp->blocks++;
+			bp->num--;
+			bp->start++;
+		}
+	}
+	return err;
+}
+
+/*
+ * Map @num logical blocks of @inode starting at @block into @blocks by
+ * walking the extent tree with ext3_ext_new_extent_cb().  @create is passed
+ * to the callback through the cursor.  The extent cache is invalidated
+ * afterwards.  Returns the result of the walk.
+ */
+int fsfilt_map_nblocks(struct inode *inode, unsigned long block,
+		       unsigned long num, unsigned long *blocks,
+		       int create)
+{
+	struct ext3_ext_base *base = inode;
+	struct bpointers bp = {
+		.blocks	  = blocks,
+		.start	  = block,
+		.num	  = num,
+		.create	  = create,
+	};
+	int err;
+
+	/* init_num mirrors num after its conversion to int */
+	bp.init_num = bp.num;
+
+	CDEBUG(D_OTHER, "blocks %lu-%lu requested for inode %u\n",
+	       block, block + num - 1, (unsigned) inode->i_ino);
+
+	err = fsfilt_ext3_ext_walk_space(base, block, num,
+					 ext3_ext_new_extent_cb, &bp);
+	ext3_ext_invalidate_cache(base);
+
+	return err;
+}
+
+/*
+ * Map the blocks behind @pages pages of an extent-based @inode.
+ *
+ * Consecutive entries of @page whose indices are contiguous are grouped
+ * into runs, and each run is mapped with one fsfilt_map_nblocks() call.
+ * @blocks is filled in page order, blocks_per_page entries per page.
+ * Returns 0 or the first error from fsfilt_map_nblocks().
+ *
+ * NOTE(review): the debug message dereferences *page before the loop, so
+ * callers presumably always pass pages >= 1 -- confirm.
+ */
+int fsfilt_ext3_map_ext_inode_pages(struct inode *inode, struct page **page,
+				    int pages, unsigned long *blocks,
+				    int create)
+{
+	int blocks_per_page = PAGE_CACHE_SIZE >> inode->i_blkbits;
+	int rc = 0, i = 0;
+	struct page *fp = NULL;	/* first page of the current run */
+	int clen = 0;		/* length of the current run, in pages */
+
+	CDEBUG(D_OTHER, "inode %lu: map %d pages from %lu\n",
+		inode->i_ino, pages, (*page)->index);
+
+	/* pages are sorted already. so, we just have to find
+	 * contig. space and process them properly */
+	while (i < pages) {
+		if (fp == NULL) {
+			/* start new extent */
+			fp = *page++;
+			clen = 1;
+			i++;
+			continue;
+		} else if (fp->index + clen == (*page)->index) {
+			/* continue the extent */
+			page++;
+			clen++;
+			i++;
+			continue;
+		}
+
+		/* process found extent */
+		rc = fsfilt_map_nblocks(inode, fp->index * blocks_per_page,
+					clen * blocks_per_page, blocks,
+					create);
+		if (rc)
+			GOTO(cleanup, rc);
+
+		/* look for next extent */
+		/* i and page are left alone: the current page opens the next run */
+		fp = NULL;
+		blocks += blocks_per_page * clen;
+	}
+
+	/* map the run that was still open when the loop ended */
+	if (fp)
+		rc = fsfilt_map_nblocks(inode, fp->index * blocks_per_page,
+					clen * blocks_per_page, blocks,
+					create);
+cleanup:
+	return rc;
+}
+
+/*
+ * Map the blocks behind @pages pages one page at a time with
+ * ext3_map_inode_page().  @blocks receives blocks_per_page entries per
+ * page.  Stops at, and returns, the first error; returns 0 otherwise.
+ */
+int fsfilt_ext3_map_bm_inode_pages(struct inode *inode, struct page **page,
+				   int pages, unsigned long *blocks,
+				   int create)
+{
+	int blocks_per_page = PAGE_CACHE_SIZE >> inode->i_blkbits;
+	unsigned long *b = blocks;
+	int rc = 0;
+	int i;
+
+	for (i = 0; i < pages; i++) {
+		rc = ext3_map_inode_page(inode, page[i], b, create);
+		if (rc != 0) {
+			CERROR("ino %lu, blk %lu create %d: rc %d\n",
+			       inode->i_ino, *b, create, rc);
+			break;
+		}
+		b += blocks_per_page;
+	}
+
+	return rc;
+}
+
+/*
+ * Map the blocks behind @pages pages of @inode.
+ *
+ * Inodes flagged EXT3_EXTENTS_FL use the extent path.  All other inodes
+ * use the per-page path, which runs under @optional_mutex when one is
+ * supplied.
+ */
+int fsfilt_ext3_map_inode_pages(struct inode *inode, struct page **page,
+				int pages, unsigned long *blocks,
+				int create, struct mutex *optional_mutex)
+{
+	int rc;
+
+	if (!(EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL)) {
+		if (optional_mutex != NULL)
+			mutex_lock(optional_mutex);
+		rc = fsfilt_ext3_map_bm_inode_pages(inode, page, pages,
+						    blocks, create);
+		if (optional_mutex != NULL)
+			mutex_unlock(optional_mutex);
+
+		return rc;
+	}
+
+	return fsfilt_ext3_map_ext_inode_pages(inode, page, pages,
+					       blocks, create);
+}
+
+/*
+ * Read up to @size bytes of @inode starting at *offs into @buf, one block
+ * at a time through ext3_bread().
+ *
+ * The request is clamped to the current file size.  On success *offs is
+ * advanced and the number of bytes actually copied is returned; this can
+ * be less than @size when the request crosses EOF.  Returns 0 when *offs
+ * is exactly at EOF, -EBADR when *offs is beyond EOF, or the error reported
+ * by ext3_bread().
+ */
+int fsfilt_ext3_read(struct inode *inode, void *buf, int size, loff_t *offs)
+{
+	unsigned long block;
+	struct buffer_head *bh;
+	int err, blocksize, csize, boffs, osize;
+
+	/* prevent reading after eof */
+	spin_lock(&inode->i_lock);
+	if (i_size_read(inode) < *offs + size) {
+		/* keep the difference 64-bit until its sign is known */
+		loff_t diff = i_size_read(inode) - *offs;
+
+		spin_unlock(&inode->i_lock);
+		if (diff < 0) {
+			CDEBUG(D_EXT2, "size %llu is too short for read @%llu\n",
+			       i_size_read(inode), *offs);
+			return -EBADR;
+		} else if (diff == 0) {
+			return 0;
+		}
+		/* 0 < diff < size here, so it fits in an int */
+		size = diff;
+	} else {
+		spin_unlock(&inode->i_lock);
+	}
+
+	/* report the clamped length, not the length originally requested */
+	osize = size;
+	blocksize = 1 << inode->i_blkbits;
+
+	while (size > 0) {
+		block = *offs >> inode->i_blkbits;
+		boffs = *offs & (blocksize - 1);
+		csize = min(blocksize - boffs, size);
+		bh = ext3_bread(NULL, inode, block, 0, &err);
+		if (!bh) {
+			CERROR("can't read block: %d\n", err);
+			return err;
+		}
+
+		memcpy(buf, bh->b_data + boffs, csize);
+		brelse(bh);
+
+		*offs += csize;
+		buf += csize;
+		size -= csize;
+	}
+	return osize;
+}
+EXPORT_SYMBOL(fsfilt_ext3_read);
+
+/*
+ * Read a record from @file via fsfilt_ext3_read().  A positive byte count
+ * is collapsed to 0; zero and negative results are returned unchanged.
+ */
+static int fsfilt_ext3_read_record(struct file * file, void *buf,
+				   int size, loff_t *offs)
+{
+	int rc = fsfilt_ext3_read(file->f_dentry->d_inode, buf, size, offs);
+
+	return rc > 0 ? 0 : rc;
+}
+
+/*
+ * Write @bufsize bytes from @buf into @inode at *offs inside the running
+ * journal @handle, one block at a time.
+ *
+ * Each block is read (or created) with ext3_bread(), journalled for write
+ * access, updated, and marked dirty as metadata.  If the write extends the
+ * file, the in-core size and i_disksize are raised under i_lock and the
+ * inode is marked dirty.  *offs is advanced only when 0 is returned; any
+ * other return value is the error that stopped the loop.
+ */
+int fsfilt_ext3_write_handle(struct inode *inode, void *buf, int bufsize,
+				loff_t *offs, handle_t *handle)
+{
+	struct buffer_head *bh = NULL;
+	loff_t old_size = i_size_read(inode);
+	loff_t new_size = i_size_read(inode);
+	loff_t offset = *offs;
+	int blocksize = 1 << inode->i_blkbits;
+	int err = 0;
+
+	while (bufsize > 0) {
+		unsigned long block = offset >> inode->i_blkbits;
+		int boffs = offset & (blocksize - 1);
+		int size = min(blocksize - boffs, bufsize);
+
+		bh = ext3_bread(handle, inode, block, 1, &err);
+		if (!bh) {
+			CERROR("can't read/create block: %d\n", err);
+			break;
+		}
+
+		err = ext3_journal_get_write_access(handle, bh);
+		if (err) {
+			CERROR("journal_get_write_access() returned error %d\n",
+			       err);
+			break;
+		}
+		LASSERT(bh->b_data + boffs + size <= bh->b_data + bh->b_size);
+		memcpy(bh->b_data + boffs, buf, size);
+		err = ext3_journal_dirty_metadata(handle, bh);
+		if (err) {
+			CERROR("journal_dirty_metadata() returned error %d\n",
+			       err);
+			break;
+		}
+
+		if (offset + size > new_size)
+			new_size = offset + size;
+		offset += size;
+		bufsize -= size;
+		buf += size;
+
+		/* this block is finished; release it before the next one */
+		brelse(bh);
+		bh = NULL;
+	}
+	/* an error exit may have left a buffer held */
+	if (bh != NULL)
+		brelse(bh);
+
+	/* correct in-core and on-disk sizes */
+	if (new_size > i_size_read(inode)) {
+		int grew;
+
+		spin_lock(&inode->i_lock);
+		if (new_size > i_size_read(inode))
+			i_size_write(inode, new_size);
+		if (i_size_read(inode) > EXT3_I(inode)->i_disksize)
+			EXT3_I(inode)->i_disksize = i_size_read(inode);
+		grew = i_size_read(inode) > old_size;
+		spin_unlock(&inode->i_lock);
+
+		/* dirty the inode only after dropping the spinlock */
+		if (grew)
+			mark_inode_dirty(inode);
+	}
+
+	if (err == 0)
+		*offs = offset;
+	return err;
+}
+EXPORT_SYMBOL(fsfilt_ext3_write_handle);
+
+/*
+ * Write a record to @file in its own journal transaction.
+ *
+ * Credits are sized from the number of blocks the write touches.  With
+ * @force_sync set and a successful write, the handle is marked
+ * synchronous before it is stopped.  Returns 0, the write error, or the
+ * error from starting the transaction.
+ */
+static int fsfilt_ext3_write_record(struct file *file, void *buf, int bufsize,
+				    loff_t *offs, int force_sync)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	int blocksize = 1 << inode->i_blkbits;
+	int block_count;
+	handle_t *handle;
+	int err;
+
+	/* Determine how many transaction credits are needed */
+	block_count = (*offs & (blocksize - 1)) + bufsize;
+	block_count = (block_count + blocksize - 1) >> inode->i_blkbits;
+
+	handle = ext3_journal_start(inode,
+			block_count * EXT3_DATA_TRANS_BLOCKS(inode->i_sb) + 2);
+	if (IS_ERR(handle)) {
+		CERROR("can't start transaction for %d blocks (%d bytes)\n",
+		       block_count * EXT3_DATA_TRANS_BLOCKS(inode->i_sb) + 2,
+		       bufsize);
+		return PTR_ERR(handle);
+	}
+
+	err = fsfilt_ext3_write_handle(inode, buf, bufsize, offs, handle);
+	if (err == 0 && force_sync)
+		handle->h_sync = 1; /* recovery likes this */
+
+	ext3_journal_stop(handle);
+
+	return err;
+}
+
+/*
+ * Validate @sb for use as a backend: a journal is mandatory (-EINVAL
+ * otherwise), a missing dir_index feature only produces a warning.
+ * When S_PDIROPS is defined, PDIROPS is also switched on.
+ */
+static int fsfilt_ext3_setup(struct super_block *sb)
+{
+	if (!EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) {
+		CERROR("ext3 mounted without journal\n");
+		return -EINVAL;
+	}
+
+#ifdef S_PDIROPS
+	CWARN("Enabling PDIROPS\n");
+	set_opt(EXT3_SB(sb)->s_mount_opt, PDIROPS);
+	sb->s_flags |= S_PDIROPS;
+#endif
+
+	if (!EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX))
+		CWARN("filesystem doesn't have dir_index feature enabled\n");
+
+	return 0;
+}
+/* Operations table registered for the "ext3" backend type. */
+static struct fsfilt_operations fsfilt_ext3_ops = {
+	.fs_type		= "ext3",
+	.fs_owner		= THIS_MODULE,
+	.fs_getlabel		= fsfilt_ext3_get_label,
+	.fs_start		= fsfilt_ext3_start,
+	.fs_commit		= fsfilt_ext3_commit,
+	.fs_map_inode_pages	= fsfilt_ext3_map_inode_pages,
+	.fs_write_record	= fsfilt_ext3_write_record,
+	.fs_read_record		= fsfilt_ext3_read_record,
+	.fs_setup		= fsfilt_ext3_setup,
+};
+
+/*
+ * Module init: create the callback-data slab cache and register the ext3
+ * operations table.  If registration fails the cache is destroyed again.
+ * Returns 0, -ENOMEM, or the registration error.
+ */
+static int __init fsfilt_ext3_init(void)
+{
+	int rc;
+
+	/* kmem_cache_create() takes a constructor as its fifth argument */
+	fcb_cache = kmem_cache_create("fsfilt_ext3_fcb",
+				      sizeof(struct fsfilt_cb_data), 0, 0,
+				      NULL);
+	if (!fcb_cache) {
+		CERROR("error allocating fsfilt journal callback cache\n");
+		GOTO(out, rc = -ENOMEM);
+	}
+
+	rc = fsfilt_register_ops(&fsfilt_ext3_ops);
+
+	if (rc) {
+		/* kmem_cache_destroy() returns void; nothing to check */
+		kmem_cache_destroy(fcb_cache);
+		fcb_cache = NULL;
+	}
+out:
+	return rc;
+}
+
+/*
+ * Module exit: unregister the ext3 operations table and destroy the
+ * callback-data slab cache.
+ */
+static void __exit fsfilt_ext3_exit(void)
+{
+	fsfilt_unregister_ops(&fsfilt_ext3_ops);
+
+	/* kmem_cache_destroy() returns void; nothing to check */
+	kmem_cache_destroy(fcb_cache);
+	fcb_cache = NULL;
+}
+
+module_init(fsfilt_ext3_init);
+module_exit(fsfilt_ext3_exit);
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Lustre ext3 Filesystem Helper v0.1");
+MODULE_LICENSE("GPL");

diff --git a/drivers/staging/lustre/lustre/lvfs/lvfs_lib.c b/drivers/staging/lustre/lustre/lvfs/lvfs_lib.c
new file mode 100644
index 0000000..97a8be2
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lvfs/lvfs_lib.c

@@ -0,0 +1,173 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lvfs/lvfs_lib.c
+ *
+ * Lustre filesystem abstraction routines
+ *
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ */
+#include <linux/module.h>
+#include <lustre_lib.h>
+#include <lprocfs_status.h>
+
+#ifdef LPROCFS
+/*
+ * Account one event of size @amount on counter @idx of @stats.
+ *
+ * The event count is always incremented.  The running sum, the optional sum
+ * of squares and the min/max values are only updated for counters configured
+ * with LPROCFS_CNTR_AVGMINMAX.  Nothing is recorded when @stats is NULL or
+ * lprocfs_stats_lock() returns a negative id.
+ */
+void lprocfs_counter_add(struct lprocfs_stats *stats, int idx, long amount)
+{
+	struct lprocfs_counter_header	*hdr;
+	struct lprocfs_counter		*cntr;
+	unsigned long			irqflags = 0;
+	int				cpu;
+
+	if (stats == NULL)
+		return;
+
+	/* With per-client stats, statistics are allocated only for
+	 * single CPU area, so the smp_id should be 0 always. */
+	cpu = lprocfs_stats_lock(stats, LPROCFS_GET_SMP_ID, &irqflags);
+	if (cpu < 0)
+		return;
+
+	hdr = &stats->ls_cnt_header[idx];
+	cntr = lprocfs_stats_counter_get(stats, cpu, idx);
+	cntr->lc_count++;
+
+	if ((hdr->lc_config & LPROCFS_CNTR_AVGMINMAX) == 0)
+		goto unlock;
+
+	/*
+	 * lprocfs_counter_add() can be called in interrupt context,
+	 * as memory allocation could trigger memory shrinker call
+	 * ldlm_pool_shrink(), which calls lprocfs_counter_add().
+	 * LU-1727.
+	 *
+	 * Only obd_memory uses LPROCFS_STATS_FLAG_IRQ_SAFE
+	 * flag, because it needs accurate counting lest memory leak
+	 * check reports error.
+	 */
+	if ((stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0 &&
+	    in_interrupt())
+		cntr->lc_sum_irq += amount;
+	else
+		cntr->lc_sum += amount;
+
+	if (hdr->lc_config & LPROCFS_CNTR_STDDEV)
+		cntr->lc_sumsquare += (__s64)amount * amount;
+	if (amount < cntr->lc_min)
+		cntr->lc_min = amount;
+	if (amount > cntr->lc_max)
+		cntr->lc_max = amount;
+
+unlock:
+	lprocfs_stats_unlock(stats, LPROCFS_GET_SMP_ID, &irqflags);
+}
+EXPORT_SYMBOL(lprocfs_counter_add);
+
+/*
+ * Subtract @amount from the running sum of counter @idx of @stats.
+ *
+ * Only counters configured with LPROCFS_CNTR_AVGMINMAX are changed; the
+ * event count and the min/max values are left alone.  Nothing happens when
+ * @stats is NULL or lprocfs_stats_lock() returns a negative id.
+ */
+void lprocfs_counter_sub(struct lprocfs_stats *stats, int idx, long amount)
+{
+	struct lprocfs_counter_header	*hdr;
+	struct lprocfs_counter		*cntr;
+	unsigned long			irqflags = 0;
+	int				cpu;
+
+	if (stats == NULL)
+		return;
+
+	/* With per-client stats, statistics are allocated only for
+	 * single CPU area, so the smp_id should be 0 always. */
+	cpu = lprocfs_stats_lock(stats, LPROCFS_GET_SMP_ID, &irqflags);
+	if (cpu < 0)
+		return;
+
+	hdr = &stats->ls_cnt_header[idx];
+	cntr = lprocfs_stats_counter_get(stats, cpu, idx);
+
+	if ((hdr->lc_config & LPROCFS_CNTR_AVGMINMAX) == 0)
+		goto unlock;
+
+	/*
+	 * Sometimes we use RCU callbacks to free memory which calls
+	 * lprocfs_counter_sub(), and RCU callbacks may execute in
+	 * softirq context - right now that's the only case we're in
+	 * softirq context here, use separate counter for that.
+	 * bz20650.
+	 *
+	 * Only obd_memory uses LPROCFS_STATS_FLAG_IRQ_SAFE
+	 * flag, because it needs accurate counting lest memory leak
+	 * check reports error.
+	 */
+	if ((stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0 &&
+	    in_interrupt())
+		cntr->lc_sum_irq -= amount;
+	else
+		cntr->lc_sum -= amount;
+
+unlock:
+	lprocfs_stats_unlock(stats, LPROCFS_GET_SMP_ID, &irqflags);
+}
+EXPORT_SYMBOL(lprocfs_counter_sub);
+
+/*
+ * Allocate the counter area of @stats for CPU slot @cpuid.
+ *
+ * The slot must still be empty and @stats must not be flagged NOPERCPU.
+ * On success ls_biggest_alloc_num is raised to cover @cpuid (under ls_lock)
+ * and every counter's lc_min is set to LC_MIN_INIT.
+ *
+ * Returns 0 on success, -ENOMEM if the allocation fails.
+ */
+int lprocfs_stats_alloc_one(struct lprocfs_stats *stats, unsigned int cpuid)
+{
+	struct lprocfs_counter	*cntr;
+	unsigned long		irqflags = 0;
+	unsigned int		size;
+	int			i;
+
+	LASSERT(stats->ls_percpu[cpuid] == NULL);
+	LASSERT((stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) == 0);
+
+	size = lprocfs_stats_counter_size(stats);
+	LIBCFS_ALLOC_ATOMIC(stats->ls_percpu[cpuid], size);
+	if (stats->ls_percpu[cpuid] == NULL)
+		return -ENOMEM;
+
+	if (unlikely(stats->ls_biggest_alloc_num <= cpuid)) {
+		/* re-check under the lock before raising the high-water mark */
+		if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) {
+			spin_lock_irqsave(&stats->ls_lock, irqflags);
+			if (stats->ls_biggest_alloc_num <= cpuid)
+				stats->ls_biggest_alloc_num = cpuid + 1;
+			spin_unlock_irqrestore(&stats->ls_lock, irqflags);
+		} else {
+			spin_lock(&stats->ls_lock);
+			if (stats->ls_biggest_alloc_num <= cpuid)
+				stats->ls_biggest_alloc_num = cpuid + 1;
+			spin_unlock(&stats->ls_lock);
+		}
+	}
+
+	/* initialize the ls_percpu[cpuid] non-zero counter */
+	for (i = 0; i < stats->ls_num; ++i) {
+		cntr = lprocfs_stats_counter_get(stats, cpuid, i);
+		cntr->lc_min = LC_MIN_INIT;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(lprocfs_stats_alloc_one);
+#endif  /* LPROCFS */

diff --git a/drivers/staging/lustre/lustre/lvfs/lvfs_linux.c b/drivers/staging/lustre/lustre/lvfs/lvfs_linux.c
new file mode 100644
index 0000000..1e6f32c
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lvfs/lvfs_linux.c

@@ -0,0 +1,295 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lvfs/lvfs_linux.c
+ *
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_FILTER
+
+#include <linux/version.h>
+#include <linux/fs.h>
+#include <asm/unistd.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <linux/quotaops.h>
+#include <linux/version.h>
+#include <linux/libcfs/libcfs.h>
+#include <lustre_fsfilt.h>
+#include <obd.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/lustre_compat25.h>
+#include <lvfs.h>
+
+#include <obd.h>
+#include <lustre_lib.h>
+
+struct lprocfs_stats *obd_memory = NULL;
+EXPORT_SYMBOL(obd_memory);
+/* refine later and change to seqlock or similar from libcfs */
+
+/* Debugging check only needed during development */
+#ifdef OBD_CTXT_DEBUG
+# define ASSERT_CTXT_MAGIC(magic) LASSERT((magic) == OBD_RUN_CTXT_MAGIC)
+# define ASSERT_NOT_KERNEL_CTXT(msg) LASSERTF(!segment_eq(get_fs(), get_ds()),\
+					      msg)
+# define ASSERT_KERNEL_CTXT(msg) LASSERTF(segment_eq(get_fs(), get_ds()), msg)
+#else
+# define ASSERT_CTXT_MAGIC(magic) do {} while(0)
+# define ASSERT_NOT_KERNEL_CTXT(msg) do {} while(0)
+# define ASSERT_KERNEL_CTXT(msg) do {} while(0)
+#endif
+
+/*
+ * Install @ginfo as the current task's supplementary groups, recording the
+ * previous state in @save so that pop_group_info() can restore it.
+ *
+ * With a NULL @ginfo only the group count is saved and then set to 0.
+ * If a new credential set cannot be prepared, the previous group_info
+ * pointer is still saved but the task's groups are left unchanged.
+ */
+static void push_group_info(struct lvfs_run_ctxt *save,
+			    struct group_info *ginfo)
+{
+	struct cred *cred;
+
+	if (!ginfo) {
+		save->ngroups = current_ngroups;
+		current_ngroups = 0;
+		return;
+	}
+
+	/*
+	 * prepare_creds() allocates with GFP_KERNEL and may sleep, so it
+	 * must run before task_lock() takes the task's spinlock.
+	 */
+	cred = prepare_creds();
+
+	task_lock(current);
+	save->group_info = current_cred()->group_info;
+	if (cred) {
+		cred->group_info = ginfo;
+		commit_creds(cred);
+	}
+	task_unlock(current);
+}
+
+/*
+ * Undo push_group_info(): restore the supplementary groups recorded in
+ * @save.  @ginfo must be the same value that was given to the push.
+ *
+ * With a NULL @ginfo only the saved group count is written back.
+ * If a new credential set cannot be prepared, the groups are left unchanged.
+ */
+static void pop_group_info(struct lvfs_run_ctxt *save,
+			   struct group_info *ginfo)
+{
+	struct cred *cred;
+
+	if (!ginfo) {
+		current_ngroups = save->ngroups;
+		return;
+	}
+
+	/*
+	 * prepare_creds() allocates with GFP_KERNEL and may sleep, so it
+	 * must run before task_lock() takes the task's spinlock.
+	 */
+	cred = prepare_creds();
+
+	task_lock(current);
+	if (cred) {
+		cred->group_info = save->group_info;
+		commit_creds(cred);
+	}
+	task_unlock(current);
+}
+
+/*
+ * push / pop to root of obd store
+ *
+ * push_ctxt() switches the current task into @new_ctx and records in @save
+ * what pop_ctxt() needs to switch back: the address limit, the working
+ * directory and its mount, and the umask (which is then forced to 0).
+ * When @uc is given, uid/gid/fsuid/fsgid, the effective capabilities and
+ * the supplementary groups are saved and replaced as well.
+ */
+void push_ctxt(struct lvfs_run_ctxt *save, struct lvfs_run_ctxt *new_ctx,
+	       struct lvfs_ucred *uc)
+{
+	/* if there is underlying dt_device then push_ctxt is not needed */
+	if (new_ctx->dt != NULL)
+		return;
+
+	//ASSERT_NOT_KERNEL_CTXT("already in kernel context!\n");
+	ASSERT_CTXT_MAGIC(new_ctx->magic);
+	OBD_SET_CTXT_MAGIC(save);
+
+	save->fs = get_fs();
+	LASSERT(d_refcount(cfs_fs_pwd(current->fs)));
+	LASSERT(d_refcount(new_ctx->pwd));
+	save->pwd = dget(cfs_fs_pwd(current->fs));
+	save->pwdmnt = mntget(cfs_fs_mnt(current->fs));
+	save->luc.luc_umask = current_umask();
+	save->ngroups = current_cred()->group_info->ngroups;
+
+	LASSERT(save->pwd);
+	LASSERT(save->pwdmnt);
+	LASSERT(new_ctx->pwd);
+	LASSERT(new_ctx->pwdmnt);
+
+	if (uc != NULL) {
+		struct group_info *ginfo;
+		struct cred *cred;
+
+		/* remember the identity we are about to replace */
+		save->luc.luc_uid = current_uid();
+		save->luc.luc_gid = current_gid();
+		save->luc.luc_fsuid = current_fsuid();
+		save->luc.luc_fsgid = current_fsgid();
+		save->luc.luc_cap = current_cap();
+
+		cred = prepare_creds();
+		if (cred != NULL) {
+			cred->uid = uc->luc_uid;
+			cred->gid = uc->luc_gid;
+			cred->fsuid = uc->luc_fsuid;
+			cred->fsgid = uc->luc_fsgid;
+			cred->cap_effective = uc->luc_cap;
+			commit_creds(cred);
+		}
+
+		/* explicit group list first, then the identity's, else none */
+		ginfo = uc->luc_ginfo;
+		if (ginfo == NULL && uc->luc_identity != NULL)
+			ginfo = uc->luc_identity->mi_ginfo;
+		push_group_info(save, ginfo);
+	}
+
+	current->fs->umask = 0; /* umask already applied on client */
+	set_fs(new_ctx->fs);
+	ll_set_fs_pwd(current->fs, new_ctx->pwdmnt, new_ctx->pwd);
+}
+EXPORT_SYMBOL(push_ctxt);
+
+/*
+ * Leave the context entered with push_ctxt(): restore the address limit,
+ * working directory, mount and umask recorded in @saved, dropping the
+ * dentry and mount references held there.  When @uc is given, the saved
+ * uid/gid/fsuid/fsgid, effective capabilities and supplementary groups are
+ * restored as well.
+ */
+void pop_ctxt(struct lvfs_run_ctxt *saved, struct lvfs_run_ctxt *new_ctx,
+	      struct lvfs_ucred *uc)
+{
+	struct group_info *ginfo;
+	struct cred *cred;
+
+	/* if there is underlying dt_device then pop_ctxt is not needed */
+	if (new_ctx->dt != NULL)
+		return;
+
+	ASSERT_CTXT_MAGIC(saved->magic);
+	ASSERT_KERNEL_CTXT("popping non-kernel context!\n");
+
+	LASSERTF(cfs_fs_pwd(current->fs) == new_ctx->pwd, "%p != %p\n",
+		 cfs_fs_pwd(current->fs), new_ctx->pwd);
+	LASSERTF(cfs_fs_mnt(current->fs) == new_ctx->pwdmnt, "%p != %p\n",
+		 cfs_fs_mnt(current->fs), new_ctx->pwdmnt);
+
+	set_fs(saved->fs);
+	ll_set_fs_pwd(current->fs, saved->pwdmnt, saved->pwd);
+
+	dput(saved->pwd);
+	mntput(saved->pwdmnt);
+	current->fs->umask = saved->luc.luc_umask;
+
+	if (uc == NULL)
+		return;
+
+	cred = prepare_creds();
+	if (cred != NULL) {
+		cred->uid = saved->luc.luc_uid;
+		cred->gid = saved->luc.luc_gid;
+		cred->fsuid = saved->luc.luc_fsuid;
+		cred->fsgid = saved->luc.luc_fsgid;
+		cred->cap_effective = saved->luc.luc_cap;
+		commit_creds(cred);
+	}
+
+	/* same selection as push_ctxt(): explicit list, identity's, or none */
+	ginfo = uc->luc_ginfo;
+	if (ginfo == NULL && uc->luc_identity != NULL)
+		ginfo = uc->luc_identity->mi_ginfo;
+	pop_group_info(saved, ginfo);
+}
+EXPORT_SYMBOL(pop_ctxt);
+
+/*
+ * utility to rename a file
+ *
+ * Both @oldname and @newname are looked up as children of @dir.  Returns
+ * the lookup error for either name, -ENOENT when @oldname has no inode,
+ * otherwise whatever ll_vfs_rename() returns.
+ */
+int lustre_rename(struct dentry *dir, struct vfsmount *mnt,
+		  char *oldname, char *newname)
+{
+	struct dentry *src, *dst;
+	int oldlen = strlen(oldname);
+	int newlen = strlen(newname);
+	int rc = 0;
+	ENTRY;
+
+	ASSERT_KERNEL_CTXT("kernel doing rename outside kernel context\n");
+	CDEBUG(D_INODE, "renaming file %.*s to %.*s\n",
+	       oldlen, oldname, newlen, newname);
+
+	src = ll_lookup_one_len(oldname, dir, oldlen);
+	if (IS_ERR(src))
+		RETURN(PTR_ERR(src));
+
+	/* a negative dentry means the source does not exist */
+	if (src->d_inode == NULL)
+		GOTO(put_old, rc = -ENOENT);
+
+	dst = ll_lookup_one_len(newname, dir, newlen);
+	if (IS_ERR(dst))
+		GOTO(put_old, rc = PTR_ERR(dst));
+
+	rc = ll_vfs_rename(dir->d_inode, src, mnt,
+			   dir->d_inode, dst, mnt);
+
+	dput(dst);
+put_old:
+	dput(src);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(lustre_rename);
+
+/*
+ * Open @de on the mount recorded in @ctxt, using the current credentials.
+ *
+ * Note: dput(dchild) will *not* be called if there is an error
+ */
+struct l_file *l_dentry_open(struct lvfs_run_ctxt *ctxt, struct l_dentry *de,
+			     int flags)
+{
+	struct path where = {
+		.mnt	= ctxt->pwdmnt,
+		.dentry	= de,
+	};
+
+	return ll_dentry_open(&where, flags, current_cred());
+}
+EXPORT_SYMBOL(l_dentry_open);
+
+#ifdef LPROCFS
+/*
+ * Return the value of one @field of counter @lc.
+ *
+ * CONFIG is read from @header; every other field comes from @lc.  For SUM
+ * the interrupt-context sum is added when @flags has
+ * LPROCFS_STATS_FLAG_IRQ_SAFE set.  Returns 0 when @lc or @header is NULL
+ * and for an unrecognised @field.
+ */
+__s64 lprocfs_read_helper(struct lprocfs_counter *lc,
+			  struct lprocfs_counter_header *header,
+			  enum lprocfs_stats_flags flags,
+			  enum lprocfs_fields_flags field)
+{
+	__s64 ret = 0;
+
+	if (lc == NULL || header == NULL)
+		RETURN(0);
+
+	switch (field) {
+	case LPROCFS_FIELDS_FLAGS_CONFIG:
+		ret = header->lc_config;
+		break;
+	case LPROCFS_FIELDS_FLAGS_SUM:
+		ret = lc->lc_sum;
+		if ((flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0)
+			ret += lc->lc_sum_irq;
+		break;
+	case LPROCFS_FIELDS_FLAGS_MIN:
+		ret = lc->lc_min;
+		break;
+	case LPROCFS_FIELDS_FLAGS_MAX:
+		ret = lc->lc_max;
+		break;
+	case LPROCFS_FIELDS_FLAGS_AVG:
+		/*
+		 * NOTE(review): this is half the min/max spread, not
+		 * sum / count - confirm that this is what readers expect.
+		 */
+		ret = (lc->lc_max - lc->lc_min) / 2;
+		break;
+	case LPROCFS_FIELDS_FLAGS_SUMSQUARE:
+		ret = lc->lc_sumsquare;
+		break;
+	case LPROCFS_FIELDS_FLAGS_COUNT:
+		ret = lc->lc_count;
+		break;
+	default:
+		break;
+	}
+
+	RETURN(ret);
+}
+EXPORT_SYMBOL(lprocfs_read_helper);
+#endif /* LPROCFS */
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Lustre VFS Filesystem Helper v0.1");
+MODULE_LICENSE("GPL");

diff --git a/drivers/staging/lustre/lustre/mdc/Makefile b/drivers/staging/lustre/lustre/mdc/Makefile
new file mode 100644
index 0000000..93bae24
--- /dev/null
+++ b/drivers/staging/lustre/lustre/mdc/Makefile

@@ -0,0 +1,5 @@
+obj-$(CONFIG_LUSTRE_FS) += mdc.o
+mdc-y := mdc_request.o mdc_reint.o lproc_mdc.o mdc_lib.o mdc_locks.o
+
+
+ccflags-y := -I$(src)/../include

diff --git a/drivers/staging/lustre/lustre/mdc/lproc_mdc.c b/drivers/staging/lustre/lustre/mdc/lproc_mdc.c
new file mode 100644
index 0000000..6592478
--- /dev/null
+++ b/drivers/staging/lustre/lustre/mdc/lproc_mdc.c

@@ -0,0 +1,219 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/version.h>
+#include <linux/vfs.h>
+#include <obd_class.h>
+#include <lprocfs_status.h>
+
+#ifdef LPROCFS
+
+/* Print cl_max_rpcs_in_flight, read while holding cl_loi_list_lock. */
+static int mdc_max_rpcs_in_flight_seq_show(struct seq_file *m, void *v)
+{
+	struct obd_device *obd = m->private;
+	struct client_obd *cli = &obd->u.cli;
+	int printed;
+
+	client_obd_list_lock(&cli->cl_loi_list_lock);
+	printed = seq_printf(m, "%u\n", cli->cl_max_rpcs_in_flight);
+	client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+	return printed;
+}
+
+/*
+ * Set cl_max_rpcs_in_flight from the value written to the proc file.
+ *
+ * Returns @count on success, the lprocfs_write_helper() error if parsing
+ * fails, or -ERANGE when the value is outside 1..MDC_MAX_RIF_MAX.
+ */
+static ssize_t mdc_max_rpcs_in_flight_seq_write(struct file *file,
+						const char *buffer,
+						size_t count,
+						loff_t *off)
+{
+	struct seq_file *m = file->private_data;
+	struct obd_device *obd = m->private;
+	struct client_obd *cli = &obd->u.cli;
+	int rc;
+	int val;
+
+	rc = lprocfs_write_helper(buffer, count, &val);
+	if (rc != 0)
+		return rc;
+
+	if (val < 1 || val > MDC_MAX_RIF_MAX)
+		return -ERANGE;
+
+	client_obd_list_lock(&cli->cl_loi_list_lock);
+	cli->cl_max_rpcs_in_flight = val;
+	client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+	return count;
+}
+LPROC_SEQ_FOPS(mdc_max_rpcs_in_flight);
+
+/*
+ * Open handler for the write-only hsm_nl entry: single_open() is used with
+ * no show callback, passing PDE_DATA(inode) as the private data.
+ */
+static int mdc_kuc_open(struct inode *inode, struct file *file)
+{
+	void *data = PDE_DATA(inode);
+
+	return single_open(file, NULL, data);
+}
+
+/*
+ * temporary for testing
+ *
+ * Builds a mock HSM action list with two items (an archive and a restore)
+ * and sends it with libcfs_kkuc_group_put() when the written value is 0, or
+ * with libcfs_kkuc_msg_put() to the file descriptor given by any other
+ * value.
+ *
+ * Returns @count on success or a negative errno: the parse error from
+ * lprocfs_write_helper(), -ERANGE for a negative fd, -ENOMEM if the message
+ * cannot be allocated, -EBADF if the fd does not name an open file, or the
+ * error from the send.
+ */
+static ssize_t mdc_kuc_write(struct file *file, const char *buffer,
+			     size_t count, loff_t *off)
+{
+	struct obd_device *obd = ((struct seq_file *)file->private_data)->private;
+	struct kuc_hdr		*lh;
+	struct hsm_action_list	*hal;
+	struct hsm_action_item	*hai;
+	int			 len;
+	int			 fd, rc;
+	ENTRY;
+
+	rc = lprocfs_write_helper(buffer, count, &fd);
+	if (rc)
+		RETURN(rc);
+
+	if (fd < 0)
+		RETURN(-ERANGE);
+	CWARN("message to fd %d\n", fd);
+
+	len = sizeof(*lh) + sizeof(*hal) + MTI_NAME_MAXLEN +
+		/* for mockup below */ 2 * cfs_size_round(sizeof(*hai));
+
+	OBD_ALLOC(lh, len);
+	/* the header is filled in right below, so bail out on failure */
+	if (lh == NULL)
+		RETURN(-ENOMEM);
+
+	lh->kuc_magic = KUC_MAGIC;
+	lh->kuc_transport = KUC_TRANSPORT_HSM;
+	lh->kuc_msgtype = HMT_ACTION_LIST;
+	lh->kuc_msglen = len;
+
+	hal = (struct hsm_action_list *)(lh + 1);
+	hal->hal_version = HAL_VERSION;
+	hal->hal_archive_id = 1;
+	hal->hal_flags = 0;
+	obd_uuid2fsname(hal->hal_fsname, obd->obd_name, MTI_NAME_MAXLEN);
+
+	/* mock up an action list */
+	hal->hal_count = 2;
+	hai = hai_zero(hal);
+	hai->hai_action = HSMA_ARCHIVE;
+	hai->hai_fid.f_oid = 5;
+	hai->hai_len = sizeof(*hai);
+	hai = hai_next(hai);
+	hai->hai_action = HSMA_RESTORE;
+	hai->hai_fid.f_oid = 10;
+	hai->hai_len = sizeof(*hai);
+
+	/* This works for either broadcast or unicast to a single fd */
+	if (fd == 0) {
+		rc = libcfs_kkuc_group_put(KUC_GRP_HSM, lh);
+	} else {
+		struct file *fp = fget(fd);
+
+		/* fget() returns NULL for a bad fd; fput(NULL) would oops */
+		if (fp == NULL) {
+			rc = -EBADF;
+		} else {
+			rc = libcfs_kkuc_msg_put(fp, lh);
+			fput(fp);
+		}
+	}
+	OBD_FREE(lh, len);
+	if (rc < 0)
+		RETURN(rc);
+	RETURN(count);
+}
+
+/* File operations behind the hsm_nl proc entry: open, write, release only. */
+struct file_operations mdc_kuc_fops = {
+	.open    = mdc_kuc_open,
+	.write   = mdc_kuc_write,
+	.release = single_release,
+};
+
+LPROC_SEQ_FOPS_WR_ONLY(mdc, ping);
+
+LPROC_SEQ_FOPS_RO_TYPE(mdc, uuid);
+LPROC_SEQ_FOPS_RO_TYPE(mdc, connect_flags);
+LPROC_SEQ_FOPS_RO_TYPE(mdc, blksize);
+LPROC_SEQ_FOPS_RO_TYPE(mdc, kbytestotal);
+LPROC_SEQ_FOPS_RO_TYPE(mdc, kbytesfree);
+LPROC_SEQ_FOPS_RO_TYPE(mdc, kbytesavail);
+LPROC_SEQ_FOPS_RO_TYPE(mdc, filestotal);
+LPROC_SEQ_FOPS_RO_TYPE(mdc, filesfree);
+LPROC_SEQ_FOPS_RO_TYPE(mdc, server_uuid);
+LPROC_SEQ_FOPS_RO_TYPE(mdc, conn_uuid);
+LPROC_SEQ_FOPS_RO_TYPE(mdc, timeouts);
+LPROC_SEQ_FOPS_RO_TYPE(mdc, state);
+
+/* seq_file show callback forwarding to lprocfs_obd_rd_max_pages_per_rpc(). */
+static int mdc_obd_max_pages_per_rpc_seq_show(struct seq_file *m, void *v)
+{
+	void *data = m->private;
+
+	return lprocfs_obd_rd_max_pages_per_rpc(m, data);
+}
+LPROC_SEQ_FOPS_RO(mdc_obd_max_pages_per_rpc);
+
+LPROC_SEQ_FOPS_RW_TYPE(mdc, import);
+LPROC_SEQ_FOPS_RW_TYPE(mdc, pinger_recov);
+
+/* Per-device proc entries for an MDC obd; the list ends with a zero entry. */
+static struct lprocfs_vars lprocfs_mdc_obd_vars[] = {
+	{ "uuid",		&mdc_uuid_fops,			0, 0 },
+	{ "ping",		&mdc_ping_fops,			0, 0222 },
+	{ "connect_flags",	&mdc_connect_flags_fops,	0, 0 },
+	{ "blocksize",		&mdc_blksize_fops,		0, 0 },
+	{ "kbytestotal",	&mdc_kbytestotal_fops,		0, 0 },
+	{ "kbytesfree",		&mdc_kbytesfree_fops,		0, 0 },
+	{ "kbytesavail",	&mdc_kbytesavail_fops,		0, 0 },
+	{ "filestotal",		&mdc_filestotal_fops,		0, 0 },
+	{ "filesfree",		&mdc_filesfree_fops,		0, 0 },
+	/*{ "filegroups",      lprocfs_rd_filegroups,  0, 0 },*/
+	{ "mds_server_uuid",	&mdc_server_uuid_fops,		0, 0 },
+	{ "mds_conn_uuid",	&mdc_conn_uuid_fops,		0, 0 },
+	/*
+	 * FIXME: the proc entry below is provided but not used; instead
+	 * sbi->sb_md_brw_size is used. The per-obd variable should be used
+	 * when CMD is enabled and dir pages are managed in the MDC layer.
+	 * Remember to enable the proc write function.
+	 */
+	{ "max_pages_per_rpc",	&mdc_obd_max_pages_per_rpc_fops, 0, 0 },
+	{ "max_rpcs_in_flight",	&mdc_max_rpcs_in_flight_fops,	0, 0 },
+	{ "timeouts",		&mdc_timeouts_fops,		0, 0 },
+	{ "import",		&mdc_import_fops,		0, 0 },
+	{ "state",		&mdc_state_fops,		0, 0 },
+	{ "hsm_nl",		&mdc_kuc_fops,			0, 0200 },
+	{ "pinger_recov",	&mdc_pinger_recov_fops,		0, 0 },
+	{ 0 }
+};
+
+LPROC_SEQ_FOPS_RO_TYPE(mdc, numrefs);
+
+/* Module-wide proc entries for MDC; the list ends with a zero entry. */
+static struct lprocfs_vars lprocfs_mdc_module_vars[] = {
+	{ "num_refs",		&mdc_numrefs_fops,		0, 0 },
+	{ 0 }
+};
+
+/* Point @lvars at the MDC per-device and module-wide proc entry tables. */
+void lprocfs_mdc_init_vars(struct lprocfs_static_vars *lvars)
+{
+	lvars->obd_vars = lprocfs_mdc_obd_vars;
+	lvars->module_vars = lprocfs_mdc_module_vars;
+}
+#endif /* LPROCFS */

diff --git a/drivers/staging/lustre/lustre/mdc/mdc_internal.h b/drivers/staging/lustre/lustre/mdc/mdc_internal.h
new file mode 100644
index 0000000..2aeff0e
--- /dev/null
+++ b/drivers/staging/lustre/lustre/mdc/mdc_internal.h

@@ -0,0 +1,180 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _MDC_INTERNAL_H
+#define _MDC_INTERNAL_H
+
+#include <lustre_mdc.h>
+#include <lustre_mds.h>
+
+#ifdef LPROCFS
+/* Fills @lvars with the MDC proc entry tables. */
+void lprocfs_mdc_init_vars(struct lprocfs_static_vars *lvars);
+#else
+/* Without LPROCFS there are no proc entries: hand back a zeroed @lvars. */
+static inline void lprocfs_mdc_init_vars(struct lprocfs_static_vars *lvars)
+{
+	memset(lvars, 0, sizeof(*lvars));
+}
+#endif
+
+void mdc_pack_body(struct ptlrpc_request *req, const struct lu_fid *fid,
+		   struct obd_capa *oc, __u64 valid, int ea_size,
+		   __u32 suppgid, int flags);
+void mdc_pack_capa(struct ptlrpc_request *req,
+		   const struct req_msg_field *field, struct obd_capa *oc);
+int mdc_pack_req(struct ptlrpc_request *req, int version, int opc);
+void mdc_is_subdir_pack(struct ptlrpc_request *req, const struct lu_fid *pfid,
+			const struct lu_fid *cfid, int flags);
+void mdc_swap_layouts_pack(struct ptlrpc_request *req,
+			   struct md_op_data *op_data);
+void mdc_readdir_pack(struct ptlrpc_request *req, __u64 pgoff, __u32 size,
+		      const struct lu_fid *fid, struct obd_capa *oc);
+void mdc_getattr_pack(struct ptlrpc_request *req, __u64 valid, int flags,
+		      struct md_op_data *data, int ea_size);
+void mdc_setattr_pack(struct ptlrpc_request *req, struct md_op_data *op_data,
+		     void *ea, int ealen, void *ea2, int ea2len);
+void mdc_create_pack(struct ptlrpc_request *req, struct md_op_data *op_data,
+		     const void *data, int datalen, __u32 mode, __u32 uid,
+		     __u32 gid, cfs_cap_t capability, __u64 rdev);
+void mdc_open_pack(struct ptlrpc_request *req, struct md_op_data *op_data,
+		   __u32 mode, __u64 rdev, __u32 flags, const void *data,
+		   int datalen);
+void mdc_unlink_pack(struct ptlrpc_request *req, struct md_op_data *op_data);
+void mdc_link_pack(struct ptlrpc_request *req, struct md_op_data *op_data);
+void mdc_rename_pack(struct ptlrpc_request *req, struct md_op_data *op_data,
+		     const char *old, int oldlen, const char *new, int newlen);
+void mdc_close_pack(struct ptlrpc_request *req, struct md_op_data *op_data);
+int mdc_enter_request(struct client_obd *cli);
+void mdc_exit_request(struct client_obd *cli);
+
+/* mdc/mdc_locks.c */
+int mdc_set_lock_data(struct obd_export *exp,
+		      __u64 *lockh, void *data, __u64 *bits);
+
+int mdc_null_inode(struct obd_export *exp, const struct lu_fid *fid);
+
+int mdc_find_cbdata(struct obd_export *exp, const struct lu_fid *fid,
+		    ldlm_iterator_t it, void *data);
+
+int mdc_intent_lock(struct obd_export *exp,
+		    struct md_op_data *,
+		    void *lmm, int lmmsize,
+		    struct lookup_intent *, int,
+		    struct ptlrpc_request **reqp,
+		    ldlm_blocking_callback cb_blocking,
+		    __u64 extra_lock_flags);
+int mdc_enqueue(struct obd_export *exp, struct ldlm_enqueue_info *einfo,
+		struct lookup_intent *it, struct md_op_data *op_data,
+		struct lustre_handle *lockh, void *lmm, int lmmsize,
+		struct ptlrpc_request **req, __u64 extra_lock_flags);
+
+int mdc_resource_get_unused(struct obd_export *exp, struct lu_fid *fid,
+			    struct list_head *cancels, ldlm_mode_t mode,
+			    __u64 bits);
+/* mdc/mdc_request.c */
+int mdc_fid_alloc(struct obd_export *exp, struct lu_fid *fid,
+		  struct md_op_data *op_data);
+
+int mdc_open(struct obd_export *exp, obd_id ino, int type, int flags,
+	     struct lov_mds_md *lmm, int lmm_size, struct lustre_handle *fh,
+	     struct ptlrpc_request **);
+
+struct obd_client_handle;
+
+int mdc_get_lustre_md(struct obd_export *md_exp, struct ptlrpc_request *req,
+		      struct obd_export *dt_exp, struct obd_export *lmv_exp,
+		      struct lustre_md *md);
+
+int mdc_free_lustre_md(struct obd_export *exp, struct lustre_md *md);
+
+int mdc_set_open_replay_data(struct obd_export *exp,
+			     struct obd_client_handle *och,
+			     struct ptlrpc_request *open_req);
+
+int mdc_clear_open_replay_data(struct obd_export *exp,
+			       struct obd_client_handle *och);
+void mdc_commit_open(struct ptlrpc_request *req);
+void mdc_replay_open(struct ptlrpc_request *req);
+
+int mdc_create(struct obd_export *exp, struct md_op_data *op_data,
+	       const void *data, int datalen, int mode, __u32 uid, __u32 gid,
+	       cfs_cap_t capability, __u64 rdev,
+	       struct ptlrpc_request **request);
+int mdc_link(struct obd_export *exp, struct md_op_data *op_data,
+	     struct ptlrpc_request **request);
+int mdc_rename(struct obd_export *exp, struct md_op_data *op_data,
+	       const char *old, int oldlen, const char *new, int newlen,
+	       struct ptlrpc_request **request);
+int mdc_setattr(struct obd_export *exp, struct md_op_data *op_data,
+		void *ea, int ealen, void *ea2, int ea2len,
+		struct ptlrpc_request **request, struct md_open_data **mod);
+int mdc_unlink(struct obd_export *exp, struct md_op_data *op_data,
+	       struct ptlrpc_request **request);
+int mdc_cancel_unused(struct obd_export *exp, const struct lu_fid *fid,
+		      ldlm_policy_data_t *policy, ldlm_mode_t mode,
+		      ldlm_cancel_flags_t flags, void *opaque);
+
+/*
+ * Set the client-side size of capsule @field to 0 when no capability @oc
+ * is supplied.  With a capability the size is left untouched.
+ */
+static inline void mdc_set_capa_size(struct ptlrpc_request *req,
+				     const struct req_msg_field *field,
+				     struct obd_capa *oc)
+{
+	/* it is already calculated as sizeof struct obd_capa */
+	if (oc != NULL)
+		return;
+
+	req_capsule_set_size(&req->rq_pill, field, RCL_CLIENT, 0);
+}
+
+int mdc_revalidate_lock(struct obd_export *exp, struct lookup_intent *it,
+			struct lu_fid *fid, __u64 *bits);
+
+int mdc_intent_getattr_async(struct obd_export *exp,
+			     struct md_enqueue_info *minfo,
+			     struct ldlm_enqueue_info *einfo);
+
+ldlm_mode_t mdc_lock_match(struct obd_export *exp, __u64 flags,
+			   const struct lu_fid *fid, ldlm_type_t type,
+			   ldlm_policy_data_t *policy, ldlm_mode_t mode,
+			   struct lustre_handle *lockh);
+
+/*
+ * Wrapper around ldlm_prep_elc_req() that fixes the version argument to
+ * LUSTRE_MDS_VERSION and passes 0 as its fifth argument.
+ */
+static inline int mdc_prep_elc_req(struct obd_export *exp,
+				   struct ptlrpc_request *req, int opc,
+				   struct list_head *cancels, int count)
+{
+	int rc;
+
+	rc = ldlm_prep_elc_req(exp, req, LUSTRE_MDS_VERSION, opc, 0, cancels,
+			       count);
+	return rc;
+}
+
+#endif

diff --git a/drivers/staging/lustre/lustre/mdc/mdc_lib.c b/drivers/staging/lustre/lustre/mdc/mdc_lib.c
new file mode 100644
index 0000000..e789aed
--- /dev/null
+++ b/drivers/staging/lustre/lustre/mdc/mdc_lib.c

@@ -0,0 +1,564 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_MDC
+#include <lustre_net.h>
+#include <lustre/lustre_idl.h>
+#include "mdc_internal.h"
+
+
+/*
+ * Fill in the caller-identity part of an mdt_body: the supplementary
+ * group passed in @suppgid plus uid/gid/fsuid/fsgid of the current task
+ * and the value of cfs_curproc_cap_pack().  @b must not be NULL.
+ */
+static void __mdc_pack_body(struct mdt_body *b, __u32 suppgid)
+{
+	LASSERT (b != NULL);
+
+	b->suppgid = suppgid;
+	b->uid = current_uid();
+	b->gid = current_gid();
+	b->fsuid = current_fsuid();
+	b->fsgid = current_fsgid();
+	b->capability = cfs_curproc_cap_pack();
+}
+
+/*
+ * Copy capability @oc into the client buffer @field of @req.
+ *
+ * When @oc is NULL nothing is copied; the field is then asserted to
+ * have been sized to zero bytes beforehand.
+ */
+void mdc_pack_capa(struct ptlrpc_request *req, const struct req_msg_field *field,
+		   struct obd_capa *oc)
+{
+	struct req_capsule *pill = &req->rq_pill;
+	struct lustre_capa *c;
+
+	if (oc != NULL) {
+		c = req_capsule_client_get(pill, field);
+		LASSERT(c != NULL);
+		capa_cpy(c, oc);
+		DEBUG_CAPA(D_SEC, c, "pack");
+		return;
+	}
+
+	/* no capability to send: the buffer must be empty */
+	LASSERT(req_capsule_get_size(pill, field, RCL_CLIENT) == 0);
+}
+
+/*
+ * Pack an is-subdir query into the MDT body of @req: @pfid goes to fid1
+ * (and sets valid to OBD_MD_FLID), @cfid goes to fid2, @flags to flags.
+ * Either fid may be NULL, in which case its field is left untouched.
+ */
+void mdc_is_subdir_pack(struct ptlrpc_request *req, const struct lu_fid *pfid,
+			const struct lu_fid *cfid, int flags)
+{
+	struct mdt_body *body;
+
+	body = req_capsule_client_get(&req->rq_pill, &RMF_MDT_BODY);
+	body->flags = flags;
+
+	if (cfid != NULL)
+		body->fid2 = *cfid;
+
+	if (pfid != NULL) {
+		body->fid1 = *pfid;
+		body->valid = OBD_MD_FLID;
+	}
+}
+
+/*
+ * Pack a swap-layouts request: caller identity and the first
+ * supplementary group go into the MDT body together with both fids from
+ * @op_data, followed by both capabilities.
+ */
+void mdc_swap_layouts_pack(struct ptlrpc_request *req,
+			   struct md_op_data *op_data)
+{
+	struct mdt_body *b = req_capsule_client_get(&req->rq_pill,
+						    &RMF_MDT_BODY);
+
+	__mdc_pack_body(b, op_data->op_suppgids[0]);
+	b->fid1 = op_data->op_fid1;
+	b->fid2 = op_data->op_fid2;
+	/* NOTE(review): valid is OR-ed, not assigned; this presumably relies
+	 * on the request buffer starting out zeroed - confirm */
+	b->valid |= OBD_MD_FLID;
+
+	mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1);
+	mdc_pack_capa(req, &RMF_CAPA2, op_data->op_capa2);
+}
+
+/*
+ * Pack a generic MDT body into @req: @valid, @ea_size and @flags are
+ * stored as given and the caller identity is added with @suppgid.
+ * When @fid is supplied it is stored in fid1, OBD_MD_FLID is added to
+ * the valid mask and capability @oc is packed into RMF_CAPA1; without a
+ * fid the capability is not packed at all.
+ */
+void mdc_pack_body(struct ptlrpc_request *req,
+		   const struct lu_fid *fid, struct obd_capa *oc,
+		   __u64 valid, int ea_size, __u32 suppgid, int flags)
+{
+	struct mdt_body *b;
+
+	b = req_capsule_client_get(&req->rq_pill, &RMF_MDT_BODY);
+	LASSERT(b != NULL);
+
+	b->valid = valid;
+	b->eadatasize = ea_size;
+	b->flags = flags;
+	__mdc_pack_body(b, suppgid);
+
+	if (fid == NULL)
+		return;
+
+	b->fid1 = *fid;
+	b->valid |= OBD_MD_FLID;
+	mdc_pack_capa(req, &RMF_CAPA1, oc);
+}
+
+/*
+ * Pack a readdir request for directory @fid.
+ *
+ * Two body fields are reused for other purposes (marked "!!" below):
+ * the size field carries the page offset @pgoff and the nlink field
+ * carries the requested @size.  The mode field is set to
+ * LUDA_FID | LUDA_TYPE and the supplementary group is packed as -1.
+ */
+void mdc_readdir_pack(struct ptlrpc_request *req, __u64 pgoff,
+		      __u32 size, const struct lu_fid *fid, struct obd_capa *oc)
+{
+	struct mdt_body *b = req_capsule_client_get(&req->rq_pill,
+						    &RMF_MDT_BODY);
+	b->fid1 = *fid;
+	/* NOTE(review): OR-ed without a prior assignment; presumably relies
+	 * on a zeroed request buffer - confirm */
+	b->valid |= OBD_MD_FLID;
+	b->size = pgoff;		       /* !! */
+	b->nlink = size;			/* !! */
+	__mdc_pack_body(b, -1);
+	b->mode = LUDA_FID | LUDA_TYPE;
+
+	mdc_pack_capa(req, &RMF_CAPA1, oc);
+}
+
+/* packing of MDS records */
+/*
+ * Pack a REINT_CREATE record into @req.
+ *
+ * Identity (@uid, @gid, @cap_effective), @mode and @rdev come from the
+ * arguments; fids, modification time, supplementary groups and bias come
+ * from @op_data.  The name from @op_data is packed into RMF_NAME and,
+ * when @data is non-NULL, @datalen bytes of it are copied into
+ * RMF_EADATA.
+ */
+void mdc_create_pack(struct ptlrpc_request *req, struct md_op_data *op_data,
+		     const void *data, int datalen, __u32 mode,
+		     __u32 uid, __u32 gid, cfs_cap_t cap_effective, __u64 rdev)
+{
+	struct mdt_rec_create	*rec;
+	char			*tmp;
+	__u64			 flags;
+
+	/* the create record is overlaid on the generic reint record */
+	CLASSERT(sizeof(struct mdt_rec_reint) == sizeof(struct mdt_rec_create));
+	rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT);
+
+
+	rec->cr_opcode   = REINT_CREATE;
+	rec->cr_fsuid    = uid;
+	rec->cr_fsgid    = gid;
+	rec->cr_cap      = cap_effective;
+	rec->cr_fid1     = op_data->op_fid1;
+	rec->cr_fid2     = op_data->op_fid2;
+	rec->cr_mode     = mode;
+	rec->cr_rdev     = rdev;
+	rec->cr_time     = op_data->op_mod_time;
+	rec->cr_suppgid1 = op_data->op_suppgids[0];
+	rec->cr_suppgid2 = op_data->op_suppgids[1];
+	/* only the MF_SOM_LOCAL_FLAGS subset of op_flags is sent, plus the
+	 * volatile-open bit when the bias asks for a volatile create */
+	flags = op_data->op_flags & MF_SOM_LOCAL_FLAGS;
+	if (op_data->op_bias & MDS_CREATE_VOLATILE)
+		flags |= MDS_OPEN_VOLATILE;
+	set_mrc_cr_flags(rec, flags);
+	rec->cr_bias     = op_data->op_bias;
+	rec->cr_umask    = current_umask();
+
+	mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1);
+
+	tmp = req_capsule_client_get(&req->rq_pill, &RMF_NAME);
+	LOGL0(op_data->op_name, op_data->op_namelen, tmp);
+
+	if (data) {
+		tmp = req_capsule_client_get(&req->rq_pill, &RMF_EADATA);
+		memcpy(tmp, data, datalen);
+	}
+}
+
+/*
+ * Translate open @flags into the flag word placed in the create record.
+ * A fixed set of bits is copied from @flags unchanged; every other
+ * recognised bit is mapped to its MDS_* counterpart through the table
+ * below.  @mode is not used.
+ */
+static __u64 mds_pack_open_flags(__u32 flags, __u32 mode)
+{
+	static const struct {
+		__u32 of_flag;	/* bit(s) tested in @flags */
+		__u64 of_mds;	/* bit(s) set in the result */
+	} open_map[] = {
+		{ O_CREAT,		MDS_OPEN_CREAT },
+		{ O_EXCL,		MDS_OPEN_EXCL },
+		{ O_TRUNC,		MDS_OPEN_TRUNC },
+		{ O_APPEND,		MDS_OPEN_APPEND },
+		{ O_SYNC,		MDS_OPEN_SYNC },
+		{ O_DIRECTORY,		MDS_OPEN_DIRECTORY },
+#ifdef FMODE_EXEC
+		{ FMODE_EXEC,		MDS_FMODE_EXEC },
+#endif
+		{ O_LOV_DELAY_CREATE,	MDS_OPEN_DELAY_CREATE },
+		{ O_NONBLOCK,		MDS_OPEN_NORESTORE },
+	};
+	__u64 cr_flags;
+	unsigned int i;
+
+	cr_flags = (flags & (FMODE_READ | FMODE_WRITE |
+			     MDS_OPEN_HAS_EA | MDS_OPEN_HAS_OBJS |
+			     MDS_OPEN_OWNEROVERRIDE | MDS_OPEN_LOCK |
+			     MDS_OPEN_BY_FID));
+
+	for (i = 0; i < sizeof(open_map) / sizeof(open_map[0]); i++) {
+		if (flags & open_map[i].of_flag)
+			cr_flags |= open_map[i].of_mds;
+	}
+
+	return cr_flags;
+}
+
+/* packing of MDS records */
+/*
+ * Pack a REINT_OPEN record into @req.
+ *
+ * Identity comes from the current task; fids, modification time,
+ * supplementary groups, bias, capability and (optionally) the name come
+ * from @op_data, which must not be NULL.  @flags is translated with
+ * mds_pack_open_flags().  When @lmm is non-NULL, @lmmlen bytes of it are
+ * copied into RMF_EADATA and MDS_OPEN_HAS_EA is set.
+ */
+void mdc_open_pack(struct ptlrpc_request *req, struct md_op_data *op_data,
+		   __u32 mode, __u64 rdev, __u32 flags, const void *lmm,
+		   int lmmlen)
+{
+	struct mdt_rec_create *rec;
+	char *tmp;
+	__u64 cr_flags;
+
+	CLASSERT(sizeof(struct mdt_rec_reint) == sizeof(struct mdt_rec_create));
+	/* op_data is dereferenced unconditionally below, so state the
+	 * requirement once instead of NULL-checking only the fid copies */
+	LASSERT(op_data != NULL);
+	rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT);
+
+	/* XXX do something about time, uid, gid */
+	rec->cr_opcode   = REINT_OPEN;
+	rec->cr_fsuid   = current_fsuid();
+	rec->cr_fsgid   = current_fsgid();
+	rec->cr_cap      = cfs_curproc_cap_pack();
+	rec->cr_fid1     = op_data->op_fid1;
+	rec->cr_fid2     = op_data->op_fid2;
+	rec->cr_mode     = mode;
+	cr_flags = mds_pack_open_flags(flags, mode);
+	rec->cr_rdev     = rdev;
+	rec->cr_time     = op_data->op_mod_time;
+	rec->cr_suppgid1 = op_data->op_suppgids[0];
+	rec->cr_suppgid2 = op_data->op_suppgids[1];
+	rec->cr_bias     = op_data->op_bias;
+	rec->cr_umask    = current_umask();
+
+	mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1);
+	/* the next buffer is child capa, which is used for replay,
+	 * will be packed from the data in reply message. */
+
+	if (op_data->op_name) {
+		tmp = req_capsule_client_get(&req->rq_pill, &RMF_NAME);
+		LOGL0(op_data->op_name, op_data->op_namelen, tmp);
+		/* the volatile bit is only honoured when a name is packed */
+		if (op_data->op_bias & MDS_CREATE_VOLATILE)
+			cr_flags |= MDS_OPEN_VOLATILE;
+	}
+
+	if (lmm) {
+		cr_flags |= MDS_OPEN_HAS_EA;
+		tmp = req_capsule_client_get(&req->rq_pill, &RMF_EADATA);
+		memcpy(tmp, lmm, lmmlen);
+	}
+	set_mrc_cr_flags(rec, cr_flags);
+}
+
+/*
+ * Convert an iattr-style valid mask (@ia_valid, ATTR_* bits) into the
+ * MDS_ATTR_* mask stored in a setattr record.  Bits without an entry in
+ * the table are dropped.
+ */
+static inline __u64 attr_pack(unsigned int ia_valid)
+{
+	static const struct {
+		unsigned int	am_attr;	/* bit tested in @ia_valid */
+		__u64		am_mds;		/* bit set in the result */
+	} attr_map[] = {
+		{ ATTR_MODE,		MDS_ATTR_MODE },
+		{ ATTR_UID,		MDS_ATTR_UID },
+		{ ATTR_GID,		MDS_ATTR_GID },
+		{ ATTR_SIZE,		MDS_ATTR_SIZE },
+		{ ATTR_ATIME,		MDS_ATTR_ATIME },
+		{ ATTR_MTIME,		MDS_ATTR_MTIME },
+		{ ATTR_CTIME,		MDS_ATTR_CTIME },
+		{ ATTR_ATIME_SET,	MDS_ATTR_ATIME_SET },
+		{ ATTR_MTIME_SET,	MDS_ATTR_MTIME_SET },
+		{ ATTR_FORCE,		MDS_ATTR_FORCE },
+		{ ATTR_ATTR_FLAG,	MDS_ATTR_ATTR_FLAG },
+		{ ATTR_KILL_SUID,	MDS_ATTR_KILL_SUID },
+		{ ATTR_KILL_SGID,	MDS_ATTR_KILL_SGID },
+		{ ATTR_CTIME_SET,	MDS_ATTR_CTIME_SET },
+		{ ATTR_FROM_OPEN,	MDS_ATTR_FROM_OPEN },
+		{ ATTR_BLOCKS,		MDS_ATTR_BLOCKS },
+		/* NFSD hack (see bug 5781) */
+		{ MDS_OPEN_OWNEROVERRIDE, MDS_OPEN_OWNEROVERRIDE },
+	};
+	__u64 sa_valid = 0;
+	unsigned int i;
+
+	for (i = 0; i < sizeof(attr_map) / sizeof(attr_map[0]); i++) {
+		if (ia_valid & attr_map[i].am_attr)
+			sa_valid |= attr_map[i].am_mds;
+	}
+
+	return sa_valid;
+}
+
+/*
+ * Fill a REINT_SETATTR record from @op_data.
+ *
+ * Identity comes from the current task, the attribute values and valid
+ * mask from op_data->op_attr.  The supplementary group is the new gid
+ * when ATTR_GID is being set and the caller is a member of that group,
+ * otherwise the first supplementary group recorded in @op_data.
+ */
+static void mdc_setattr_pack_rec(struct mdt_rec_setattr *rec,
+				 struct md_op_data *op_data)
+{
+	rec->sa_opcode  = REINT_SETATTR;
+	rec->sa_fsuid   = current_fsuid();
+	rec->sa_fsgid   = current_fsgid();
+	rec->sa_cap     = cfs_curproc_cap_pack();
+
+	rec->sa_fid    = op_data->op_fid1;
+	rec->sa_valid  = attr_pack(op_data->op_attr.ia_valid);
+	rec->sa_mode   = op_data->op_attr.ia_mode;
+	rec->sa_uid    = op_data->op_attr.ia_uid;
+	rec->sa_gid    = op_data->op_attr.ia_gid;
+	rec->sa_size   = op_data->op_attr.ia_size;
+	rec->sa_blocks = op_data->op_attr_blocks;
+	rec->sa_atime  = LTIME_S(op_data->op_attr.ia_atime);
+	rec->sa_mtime  = LTIME_S(op_data->op_attr.ia_mtime);
+	rec->sa_ctime  = LTIME_S(op_data->op_attr.ia_ctime);
+	rec->sa_attr_flags = ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags;
+	/* sa_suppgid is assigned on both branches, so no default is needed */
+	if ((op_data->op_attr.ia_valid & ATTR_GID) &&
+	    current_is_in_group(op_data->op_attr.ia_gid))
+		rec->sa_suppgid = op_data->op_attr.ia_gid;
+	else
+		rec->sa_suppgid = op_data->op_suppgids[0];
+
+	rec->sa_bias = op_data->op_bias;
+}
+
+/*
+ * Fill @epoch from @op_data: the open handle is copied bytewise, the
+ * ioepoch value is copied, and only the MF_SOM_LOCAL_FLAGS subset of
+ * op_flags is kept.
+ */
+static void mdc_ioepoch_pack(struct mdt_ioepoch *epoch,
+			     struct md_op_data *op_data)
+{
+	memcpy(&epoch->handle, &op_data->op_handle, sizeof(epoch->handle));
+	epoch->ioepoch = op_data->op_ioepoch;
+	epoch->flags = op_data->op_flags & MF_SOM_LOCAL_FLAGS;
+}
+
+/*
+ * Pack a setattr request.
+ *
+ * Always packs the setattr record and the capability.  The epoch buffer
+ * is packed only when op_flags has MF_SOM_CHANGE or MF_EPOCH_OPEN.
+ * @ea/@ealen describe an optional LOV EA: with @ealen != 0 and
+ * @ea == NULL a "remove" EA is synthesised instead of copied.
+ * @ea2/@ea2len are copied into RMF_LOGCOOKIES, but note that they are
+ * only reached when @ealen is non-zero.
+ */
+void mdc_setattr_pack(struct ptlrpc_request *req, struct md_op_data *op_data,
+		      void *ea, int ealen, void *ea2, int ea2len)
+{
+	struct mdt_rec_setattr *rec;
+	struct mdt_ioepoch *epoch;
+	struct lov_user_md *lum = NULL;
+
+	CLASSERT(sizeof(struct mdt_rec_reint) ==sizeof(struct mdt_rec_setattr));
+	rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT);
+	mdc_setattr_pack_rec(rec, op_data);
+
+	mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1);
+
+	if (op_data->op_flags & (MF_SOM_CHANGE | MF_EPOCH_OPEN)) {
+		epoch = req_capsule_client_get(&req->rq_pill, &RMF_MDT_EPOCH);
+		mdc_ioepoch_pack(epoch, op_data);
+	}
+
+	/* no EA at all: this also skips the ea2 copy below */
+	if (ealen == 0)
+		return;
+
+	lum = req_capsule_client_get(&req->rq_pill, &RMF_EADATA);
+	if (ea == NULL) { /* Remove LOV EA */
+		lum->lmm_magic = LOV_USER_MAGIC_V1;
+		lum->lmm_stripe_size = 0;
+		lum->lmm_stripe_count = 0;
+		lum->lmm_stripe_offset = (typeof(lum->lmm_stripe_offset))(-1);
+	} else {
+		memcpy(lum, ea, ealen);
+	}
+
+	if (ea2len == 0)
+		return;
+
+	memcpy(req_capsule_client_get(&req->rq_pill, &RMF_LOGCOOKIES), ea2,
+	       ea2len);
+}
+
+/*
+ * Pack an unlink record into @req from @op_data.  The opcode is
+ * REINT_RMENTRY when op_cli_flags has CLI_RM_ENTRY and REINT_UNLINK
+ * otherwise.  The second supplementary group is always sent as -1.
+ * The capability and the name from @op_data follow the record.
+ */
+void mdc_unlink_pack(struct ptlrpc_request *req, struct md_op_data *op_data)
+{
+	struct mdt_rec_unlink *rec;
+	char *tmp;
+
+	CLASSERT(sizeof(struct mdt_rec_reint) == sizeof(struct mdt_rec_unlink));
+	rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT);
+	LASSERT(rec != NULL);
+
+	rec->ul_opcode  = op_data->op_cli_flags & CLI_RM_ENTRY ?
+					REINT_RMENTRY : REINT_UNLINK;
+	rec->ul_fsuid   = op_data->op_fsuid;
+	rec->ul_fsgid   = op_data->op_fsgid;
+	rec->ul_cap     = op_data->op_cap;
+	rec->ul_mode    = op_data->op_mode;
+	rec->ul_suppgid1= op_data->op_suppgids[0];
+	rec->ul_suppgid2= -1;
+	rec->ul_fid1    = op_data->op_fid1;
+	rec->ul_fid2    = op_data->op_fid2;
+	rec->ul_time    = op_data->op_mod_time;
+	rec->ul_bias    = op_data->op_bias;
+
+	mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1);
+
+	tmp = req_capsule_client_get(&req->rq_pill, &RMF_NAME);
+	LASSERT(tmp != NULL);
+	LOGL0(op_data->op_name, op_data->op_namelen, tmp);
+}
+
+/*
+ * Pack a REINT_LINK record into @req.  Identity, supplementary groups,
+ * both fids, modification time and bias are taken from @op_data; both
+ * capabilities and the name from @op_data follow the record.
+ */
+void mdc_link_pack(struct ptlrpc_request *req, struct md_op_data *op_data)
+{
+	struct mdt_rec_link *rec;
+	char *tmp;
+
+	CLASSERT(sizeof(struct mdt_rec_reint) == sizeof(struct mdt_rec_link));
+	rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT);
+	LASSERT (rec != NULL);
+
+	rec->lk_opcode   = REINT_LINK;
+	rec->lk_fsuid    = op_data->op_fsuid;//current->fsuid;
+	rec->lk_fsgid    = op_data->op_fsgid;//current->fsgid;
+	rec->lk_cap      = op_data->op_cap;//current->cap_effective;
+	rec->lk_suppgid1 = op_data->op_suppgids[0];
+	rec->lk_suppgid2 = op_data->op_suppgids[1];
+	rec->lk_fid1     = op_data->op_fid1;
+	rec->lk_fid2     = op_data->op_fid2;
+	rec->lk_time     = op_data->op_mod_time;
+	rec->lk_bias     = op_data->op_bias;
+
+	mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1);
+	mdc_pack_capa(req, &RMF_CAPA2, op_data->op_capa2);
+
+	tmp = req_capsule_client_get(&req->rq_pill, &RMF_NAME);
+	LOGL0(op_data->op_name, op_data->op_namelen, tmp);
+}
+
+/*
+ * Pack a REINT_RENAME record into @req.  The record fields come from
+ * @op_data; the old name (@old, @oldlen) is packed into RMF_NAME and the
+ * new name (@new, @newlen) into RMF_SYMTGT.  The new name is optional:
+ * it is skipped when @new is NULL.
+ */
+void mdc_rename_pack(struct ptlrpc_request *req, struct md_op_data *op_data,
+		     const char *old, int oldlen, const char *new, int newlen)
+{
+	struct mdt_rec_rename *rec;
+	char *tmp;
+
+	CLASSERT(sizeof(struct mdt_rec_reint) == sizeof(struct mdt_rec_rename));
+	rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT);
+
+	/* XXX do something about time, uid, gid */
+	rec->rn_opcode   = REINT_RENAME;
+	rec->rn_fsuid    = op_data->op_fsuid;
+	rec->rn_fsgid    = op_data->op_fsgid;
+	rec->rn_cap      = op_data->op_cap;
+	rec->rn_suppgid1 = op_data->op_suppgids[0];
+	rec->rn_suppgid2 = op_data->op_suppgids[1];
+	rec->rn_fid1     = op_data->op_fid1;
+	rec->rn_fid2     = op_data->op_fid2;
+	rec->rn_time     = op_data->op_mod_time;
+	rec->rn_mode     = op_data->op_mode;
+	rec->rn_bias     = op_data->op_bias;
+
+	mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1);
+	mdc_pack_capa(req, &RMF_CAPA2, op_data->op_capa2);
+
+	tmp = req_capsule_client_get(&req->rq_pill, &RMF_NAME);
+	LOGL0(old, oldlen, tmp);
+
+	if (new) {
+		tmp = req_capsule_client_get(&req->rq_pill, &RMF_SYMTGT);
+		LOGL0(new, newlen, tmp);
+	}
+}
+
+/*
+ * Pack a getattr body into @req.
+ *
+ * The valid mask starts from @valid; OBD_MD_FLCKSPLIT and
+ * OBD_MD_FLCROSSREF are added when the matching bias bits are set in
+ * @op_data, and OBD_MD_FLID is always added.  Both fids and the first
+ * supplementary group come from @op_data.  The name is packed only when
+ * op_data->op_name is set.
+ */
+void mdc_getattr_pack(struct ptlrpc_request *req, __u64 valid, int flags,
+		      struct md_op_data *op_data, int ea_size)
+{
+	struct mdt_body *b = req_capsule_client_get(&req->rq_pill,
+						    &RMF_MDT_BODY);
+
+	b->valid = valid;
+	if (op_data->op_bias & MDS_CHECK_SPLIT)
+		b->valid |= OBD_MD_FLCKSPLIT;
+	if (op_data->op_bias & MDS_CROSS_REF)
+		b->valid |= OBD_MD_FLCROSSREF;
+	b->eadatasize = ea_size;
+	b->flags = flags;
+	__mdc_pack_body(b, op_data->op_suppgids[0]);
+
+	b->fid1 = op_data->op_fid1;
+	b->fid2 = op_data->op_fid2;
+	b->valid |= OBD_MD_FLID;
+
+	mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1);
+
+	if (op_data->op_name) {
+		char *tmp = req_capsule_client_get(&req->rq_pill, &RMF_NAME);
+		LOGL0(op_data->op_name, op_data->op_namelen, tmp);
+
+	}
+}
+
+/*
+ * Pack a close request: a setattr record, the capability and the I/O
+ * epoch, all taken from @op_data.  Unlike mdc_setattr_pack() the epoch
+ * is packed unconditionally.
+ */
+void mdc_close_pack(struct ptlrpc_request *req, struct md_op_data *op_data)
+{
+	struct mdt_ioepoch *epoch;
+	struct mdt_rec_setattr *rec;
+
+	epoch = req_capsule_client_get(&req->rq_pill, &RMF_MDT_EPOCH);
+	rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT);
+
+	mdc_setattr_pack_rec(rec, op_data);
+	mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1);
+	mdc_ioepoch_pack(epoch, op_data);
+}
+
+/*
+ * Wait condition used by mdc_enter_request(): returns non-zero once
+ * @mcw is no longer linked on a list.  The check is made under
+ * cl_loi_list_lock.
+ */
+static int mdc_req_avail(struct client_obd *cli, struct mdc_cache_waiter *mcw)
+{
+	int rc;
+	ENTRY;
+	client_obd_list_lock(&cli->cl_loi_list_lock);
+	rc = list_empty(&mcw->mcw_entry);
+	client_obd_list_unlock(&cli->cl_loi_list_lock);
+	RETURN(rc);
+}
+
+/* We record requests in flight in cli->cl_r_in_flight here.
+ * There is only one write rpc possible in mdc anyway. If this changes
+ * in the future, the code may need to be revisited.
+ *
+ * Takes a request slot, sleeping when cl_max_rpcs_in_flight slots are
+ * already in use.  Returns 0 once a slot is held, or the non-zero result
+ * of l_wait_event() when the wait did not complete normally, in which
+ * case no slot is held. */
+int mdc_enter_request(struct client_obd *cli)
+{
+	int rc = 0;
+	struct mdc_cache_waiter mcw;
+	struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL);
+
+	client_obd_list_lock(&cli->cl_loi_list_lock);
+	if (cli->cl_r_in_flight >= cli->cl_max_rpcs_in_flight) {
+		/* no free slot: queue the on-stack waiter and sleep until
+		 * mdc_req_avail() sees it taken off the list */
+		list_add_tail(&mcw.mcw_entry, &cli->cl_cache_waiters);
+		init_waitqueue_head(&mcw.mcw_waitq);
+		client_obd_list_unlock(&cli->cl_loi_list_lock);
+		rc = l_wait_event(mcw.mcw_waitq, mdc_req_avail(cli, &mcw), &lwi);
+		if (rc) {
+			client_obd_list_lock(&cli->cl_loi_list_lock);
+			/* if we were already unlinked, mdc_exit_request()
+			 * has counted a slot for us; give it back */
+			if (list_empty(&mcw.mcw_entry))
+				cli->cl_r_in_flight--;
+			list_del_init(&mcw.mcw_entry);
+			client_obd_list_unlock(&cli->cl_loi_list_lock);
+		}
+	} else {
+		cli->cl_r_in_flight++;
+		client_obd_list_unlock(&cli->cl_loi_list_lock);
+	}
+	return rc;
+}
+
+/*
+ * Release the request slot taken by mdc_enter_request() and hand free
+ * slots to queued waiters: each waiter that gets a slot is unlinked,
+ * counted in cl_r_in_flight on its behalf and woken up.
+ */
+void mdc_exit_request(struct client_obd *cli)
+{
+	struct list_head *l, *tmp;
+	struct mdc_cache_waiter *mcw;
+
+	client_obd_list_lock(&cli->cl_loi_list_lock);
+	cli->cl_r_in_flight--;
+	list_for_each_safe(l, tmp, &cli->cl_cache_waiters) {
+		if (cli->cl_r_in_flight >= cli->cl_max_rpcs_in_flight) {
+			/* No free request slots anymore */
+			break;
+		}
+
+		mcw = list_entry(l, struct mdc_cache_waiter, mcw_entry);
+		list_del_init(&mcw->mcw_entry);
+		cli->cl_r_in_flight++;
+		wake_up(&mcw->mcw_waitq);
+	}
+	/* With an empty waiter list the decrement above simply stands */
+
+	client_obd_list_unlock(&cli->cl_loi_list_lock);
+}

diff --git a/drivers/staging/lustre/lustre/mdc/mdc_locks.c b/drivers/staging/lustre/lustre/mdc/mdc_locks.c
new file mode 100644
index 0000000..1cc90b6
--- /dev/null
+++ b/drivers/staging/lustre/lustre/mdc/mdc_locks.c

@@ -0,0 +1,1229 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_MDC
+
+# include <linux/module.h>
+# include <linux/pagemap.h>
+# include <linux/miscdevice.h>
+# include <linux/init.h>
+
+#include <lustre_acl.h>
+#include <obd_class.h>
+#include <lustre_dlm.h>
+/* fid_res_name_eq() */
+#include <lustre_fid.h>
+#include <lprocfs_status.h>
+#include "mdc_internal.h"
+
+/*
+ * Bundle of an export, a metadata enqueue descriptor and an ldlm enqueue
+ * descriptor.  Presumably the per-request state carried by
+ * mdc_intent_getattr_async() to its completion handler - confirm against
+ * the user of this struct.
+ */
+struct mdc_getattr_args {
+	struct obd_export	   *ga_exp;
+	struct md_enqueue_info      *ga_minfo;
+	struct ldlm_enqueue_info    *ga_einfo;
+};
+
+/* Return the bits of @flag that are set in the intent's disposition
+ * mask (non-zero when any of them is set). */
+int it_disposition(struct lookup_intent *it, int flag)
+{
+	return it->d.lustre.it_disposition & flag;
+}
+EXPORT_SYMBOL(it_disposition);
+
+/* Set the bits of @flag in the intent's disposition mask. */
+void it_set_disposition(struct lookup_intent *it, int flag)
+{
+	it->d.lustre.it_disposition |= flag;
+}
+EXPORT_SYMBOL(it_set_disposition);
+
+/* Clear the bits of @flag in the intent's disposition mask. */
+void it_clear_disposition(struct lookup_intent *it, int flag)
+{
+	it->d.lustre.it_disposition &= ~flag;
+}
+EXPORT_SYMBOL(it_clear_disposition);
+
+/*
+ * Report the intent status relevant to @phase.
+ *
+ * The dispositions are examined in the fixed order below; the first one
+ * that is set in @it decides the result: the intent status when @phase
+ * is at or beyond that disposition, 0 otherwise.  An intent with none of
+ * them set is a bug.
+ */
+int it_open_error(int phase, struct lookup_intent *it)
+{
+	static const int disp_order[] = {
+		DISP_OPEN_OPEN,
+		DISP_OPEN_CREATE,
+		DISP_LOOKUP_EXECD,
+		DISP_IT_EXECD,
+	};
+	unsigned int i;
+
+	for (i = 0; i < sizeof(disp_order) / sizeof(disp_order[0]); i++) {
+		if (!it_disposition(it, disp_order[i]))
+			continue;
+
+		if (phase < disp_order[i])
+			return 0;
+		return it->d.lustre.it_status;
+	}
+
+	CERROR("it disp: %X, status: %d\n", it->d.lustre.it_disposition,
+	       it->d.lustre.it_status);
+	LBUG();
+	return 0;
+}
+EXPORT_SYMBOL(it_open_error);
+
+/* this must be called on a lockh that is known to have a referenced lock
+ *
+ * Records @data (an inode) as lr_lvb_inode of the resource behind the
+ * lock handle @lockh.  When @bits is non-NULL it receives the lock's
+ * inodebits, or 0 when the handle is zero.  Always returns 0. */
+int mdc_set_lock_data(struct obd_export *exp, __u64 *lockh, void *data,
+		      __u64 *bits)
+{
+	struct ldlm_lock *lock;
+	struct inode *new_inode = data;
+	ENTRY;
+
+	if(bits)
+		*bits = 0;
+
+	/* a zero handle means there is no lock to attach to */
+	if (!*lockh)
+		RETURN(0);
+
+	lock = ldlm_handle2lock((struct lustre_handle *)lockh);
+
+	LASSERT(lock != NULL);
+	lock_res_and_lock(lock);
+	/* replacing a different inode is only legal while the old one is
+	 * being freed */
+	if (lock->l_resource->lr_lvb_inode &&
+	    lock->l_resource->lr_lvb_inode != data) {
+		struct inode *old_inode = lock->l_resource->lr_lvb_inode;
+		LASSERTF(old_inode->i_state & I_FREEING,
+			 "Found existing inode %p/%lu/%u state %lu in lock: "
+			 "setting data to %p/%lu/%u\n", old_inode,
+			 old_inode->i_ino, old_inode->i_generation,
+			 old_inode->i_state,
+			 new_inode, new_inode->i_ino, new_inode->i_generation);
+	}
+	lock->l_resource->lr_lvb_inode = new_inode;
+	if (bits)
+		*bits = lock->l_policy_data.l_inodebits.bits;
+
+	unlock_res_and_lock(lock);
+	/* drop the reference taken by ldlm_handle2lock() */
+	LDLM_LOCK_PUT(lock);
+
+	RETURN(0);
+}
+
+/*
+ * Look for a lock on the resource named by @fid in this export's
+ * namespace.  Builds the resource id from @fid and forwards everything
+ * else to ldlm_lock_match() (with 0 as its last argument); its result is
+ * returned unchanged.
+ */
+ldlm_mode_t mdc_lock_match(struct obd_export *exp, __u64 flags,
+			   const struct lu_fid *fid, ldlm_type_t type,
+			   ldlm_policy_data_t *policy, ldlm_mode_t mode,
+			   struct lustre_handle *lockh)
+{
+	struct ldlm_res_id res_id;
+	ldlm_mode_t rc;
+	ENTRY;
+
+	fid_build_reg_res_name(fid, &res_id);
+	rc = ldlm_lock_match(class_exp2obd(exp)->obd_namespace, flags,
+			     &res_id, type, policy, mode, lockh, 0);
+	RETURN(rc);
+}
+
+/*
+ * Cancel unused locks on the resource named by @fid.  Builds the
+ * resource id from @fid and forwards @policy, @mode, @flags and @opaque
+ * to ldlm_cli_cancel_unused_resource(); its result is returned
+ * unchanged.
+ */
+int mdc_cancel_unused(struct obd_export *exp,
+		      const struct lu_fid *fid,
+		      ldlm_policy_data_t *policy,
+		      ldlm_mode_t mode,
+		      ldlm_cancel_flags_t flags,
+		      void *opaque)
+{
+	struct ldlm_res_id res_id;
+	struct obd_device *obd = class_exp2obd(exp);
+	int rc;
+
+	ENTRY;
+
+	fid_build_reg_res_name(fid, &res_id);
+	rc = ldlm_cli_cancel_unused_resource(obd->obd_namespace, &res_id,
+					     policy, mode, flags, opaque);
+	RETURN(rc);
+}
+
+/*
+ * Clear the inode pointer (lr_lvb_inode) stored on the resource named by
+ * @fid.  A resource that cannot be obtained is silently ignored.
+ * Always returns 0.
+ */
+int mdc_null_inode(struct obd_export *exp,
+		   const struct lu_fid *fid)
+{
+	struct ldlm_res_id res_id;
+	struct ldlm_resource *res;
+	struct ldlm_namespace *ns = class_exp2obd(exp)->obd_namespace;
+	ENTRY;
+
+	LASSERTF(ns != NULL, "no namespace passed\n");
+
+	fid_build_reg_res_name(fid, &res_id);
+
+	res = ldlm_resource_get(ns, NULL, &res_id, 0, 0);
+	if(res == NULL)
+		RETURN(0);
+
+	lock_res(res);
+	res->lr_lvb_inode = NULL;
+	unlock_res(res);
+
+	/* drop the reference obtained by ldlm_resource_get() */
+	ldlm_resource_putref(res);
+	RETURN(0);
+}
+
+/*
+ * Find any ldlm lock of the inode in mdc.
+ *
+ * Iterates over the resource named by @fid with ldlm_resource_iterate(),
+ * passing @it and @data through.
+ *
+ * Returns:
+ *	0	no lock found (LDLM_ITER_CONTINUE)
+ *	1	a lock was found (LDLM_ITER_STOP)
+ *	other	the iteration result unchanged (< 0 on error)
+ */
+int mdc_find_cbdata(struct obd_export *exp,
+		    const struct lu_fid *fid,
+		    ldlm_iterator_t it, void *data)
+{
+	struct ldlm_res_id res_id;
+	int rc;
+	ENTRY;
+
+	/* the other callers in this file pass a const fid directly, so
+	 * there is no need to cast the qualifier away here */
+	fid_build_reg_res_name(fid, &res_id);
+	rc = ldlm_resource_iterate(class_exp2obd(exp)->obd_namespace, &res_id,
+				   it, data);
+	if (rc == LDLM_ITER_STOP)
+		RETURN(1);
+	else if (rc == LDLM_ITER_CONTINUE)
+		RETURN(0);
+	RETURN(rc);
+}
+
+/*
+ * Clear the replay flag of @req (under rq_lock) if it is set.  @rc is
+ * the status the request ended with: a non-zero @rc combined with a
+ * non-zero transaction number is treated as a bug.
+ */
+static inline void mdc_clear_replay_flag(struct ptlrpc_request *req, int rc)
+{
+	/* Don't hold error requests for replay. */
+	if (req->rq_replay) {
+		spin_lock(&req->rq_lock);
+		req->rq_replay = 0;
+		spin_unlock(&req->rq_lock);
+	}
+	if (rc && req->rq_transno != 0) {
+		DEBUG_REQ(D_ERROR, req, "transno returned on error rc %d", rc);
+		LBUG();
+	}
+}
+
+/* Save a large LOV EA into the request buffer so that it is available
+ * for replay.  We don't do this in the initial request because the
+ * original request doesn't need this buffer (at most it sends just the
+ * lov_mds_md) and it is a waste of RAM/bandwidth to send the empty
+ * buffer and may also be difficult to allocate and save a very large
+ * request buffer for each open. (bug 5707)
+ *
+ * OOM here may cause recovery failure if lmm is needed (only for the
+ * original open if the MDS crashed just when this client also OOM'd)
+ * but this is incredibly unlikely, and questionable whether the client
+ * could do MDS recovery under OOM anyways...
+ *
+ * The segment at DLM_INTENT_REC_OFF + 4 is enlarged to body->eadatasize.
+ * If that fails, the error is logged and the EA is dropped from @body
+ * (OBD_MD_FLEASIZE cleared, eadatasize zeroed); nothing is returned. */
+static void mdc_realloc_openmsg(struct ptlrpc_request *req,
+				struct mdt_body *body)
+{
+	int     rc;
+
+	/* FIXME: remove this explicit offset. */
+	rc = sptlrpc_cli_enlarge_reqbuf(req, DLM_INTENT_REC_OFF + 4,
+					body->eadatasize);
+	if (rc) {
+		CERROR("Can't enlarge segment %d size to %d\n",
+		       DLM_INTENT_REC_OFF + 4, body->eadatasize);
+		body->valid &= ~OBD_MD_FLEASIZE;
+		body->eadatasize = 0;
+	}
+}
+
+/*
+ * Build an LDLM intent request for an open.
+ *
+ * Forces the create mode of @it to a regular file, collects unused
+ * conflicting locks on the child (when its fid is sane) and on the
+ * parent for early cancellation, then allocates and packs the request:
+ * the intent opcode, the open record from @op_data and, when @lmm is
+ * given, the EA of @lmmsize bytes.
+ *
+ * Returns the prepared request, or an ERR_PTR() on failure.  It never
+ * returns NULL, matching the other intent packers in this file.
+ */
+static struct ptlrpc_request *mdc_intent_open_pack(struct obd_export *exp,
+						   struct lookup_intent *it,
+						   struct md_op_data *op_data,
+						   void *lmm, int lmmsize,
+						   void *cb_data)
+{
+	struct ptlrpc_request *req;
+	struct obd_device     *obddev = class_exp2obd(exp);
+	struct ldlm_intent    *lit;
+	LIST_HEAD(cancels);
+	int		    count = 0;
+	int		    mode;
+	int		    rc;
+	ENTRY;
+
+	it->it_create_mode = (it->it_create_mode & ~S_IFMT) | S_IFREG;
+
+	/* XXX: openlock is not cancelled for cross-refs. */
+	/* If inode is known, cancel conflicting OPEN locks. */
+	if (fid_is_sane(&op_data->op_fid2)) {
+		if (it->it_flags & (FMODE_WRITE|MDS_OPEN_TRUNC))
+			mode = LCK_CW;
+#ifdef FMODE_EXEC
+		else if (it->it_flags & FMODE_EXEC)
+			mode = LCK_PR;
+#endif
+		else
+			mode = LCK_CR;
+		count = mdc_resource_get_unused(exp, &op_data->op_fid2,
+						&cancels, mode,
+						MDS_INODELOCK_OPEN);
+	}
+
+	/* If CREATE, cancel parent's UPDATE lock. */
+	if (it->it_op & IT_CREAT)
+		mode = LCK_EX;
+	else
+		mode = LCK_CR;
+	count += mdc_resource_get_unused(exp, &op_data->op_fid1,
+					 &cancels, mode,
+					 MDS_INODELOCK_UPDATE);
+
+	req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+				   &RQF_LDLM_INTENT_OPEN);
+	if (req == NULL) {
+		ldlm_lock_list_put(&cancels, l_bl_ast, count);
+		RETURN(ERR_PTR(-ENOMEM));
+	}
+
+	/* parent capability */
+	mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+	/* child capability, reserve the size according to parent capa, it will
+	 * be filled after we get the reply */
+	mdc_set_capa_size(req, &RMF_CAPA2, op_data->op_capa1);
+
+	req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT,
+			     op_data->op_namelen + 1);
+	req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT,
+			     max(lmmsize, obddev->u.cli.cl_default_mds_easize));
+
+	rc = ldlm_prep_enqueue_req(exp, req, &cancels, count);
+	if (rc) {
+		ptlrpc_request_free(req);
+		/* report the failure as an error pointer: a NULL return
+		 * would pass an IS_ERR() check and be dereferenced */
+		RETURN(ERR_PTR(rc));
+	}
+
+	spin_lock(&req->rq_lock);
+	req->rq_replay = req->rq_import->imp_replayable;
+	spin_unlock(&req->rq_lock);
+
+	/* pack the intent */
+	lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT);
+	lit->opc = (__u64)it->it_op;
+
+	/* pack the intended request */
+	mdc_open_pack(req, op_data, it->it_create_mode, 0, it->it_flags, lmm,
+		      lmmsize);
+
+	/* for remote client, fetch remote perm for current user */
+	if (client_is_remote(exp))
+		req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER,
+				     sizeof(struct mdt_remote_perm));
+	ptlrpc_request_set_replen(req);
+	RETURN(req);
+}
+
+/*
+ * Build an LDLM intent request for an unlink: allocates the request,
+ * sizes the capability and name buffers, packs the intent opcode and the
+ * unlink record from @op_data, then sizes the reply buffers.
+ *
+ * Returns the prepared request or an ERR_PTR() on failure.
+ */
+static struct ptlrpc_request *mdc_intent_unlink_pack(struct obd_export *exp,
+						     struct lookup_intent *it,
+						     struct md_op_data *op_data)
+{
+	struct ptlrpc_request *req;
+	struct obd_device     *obddev = class_exp2obd(exp);
+	struct ldlm_intent    *lit;
+	int		    rc;
+	ENTRY;
+
+	req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+				   &RQF_LDLM_INTENT_UNLINK);
+	if (req == NULL)
+		RETURN(ERR_PTR(-ENOMEM));
+
+	mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+	req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT,
+			     op_data->op_namelen + 1);
+
+	rc = ldlm_prep_enqueue_req(exp, req, NULL, 0);
+	if (rc) {
+		ptlrpc_request_free(req);
+		RETURN(ERR_PTR(rc));
+	}
+
+	/* pack the intent */
+	lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT);
+	lit->opc = (__u64)it->it_op;
+
+	/* pack the intended request */
+	mdc_unlink_pack(req, op_data);
+
+	req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER,
+			     obddev->u.cli.cl_max_mds_easize);
+	/* NOTE(review): the ACL reply field is sized with the maximum
+	 * cookie size here - looks deliberate, confirm against the
+	 * RQF_LDLM_INTENT_UNLINK reply layout */
+	req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER,
+			     obddev->u.cli.cl_max_mds_cookiesize);
+	ptlrpc_request_set_replen(req);
+	RETURN(req);
+}
+
+/*
+ * Build an LDLM intent request for a getattr: allocates the request,
+ * sizes the capability and name buffers, packs the intent opcode and a
+ * getattr body from @op_data, then sizes the reply buffers.
+ *
+ * The requested valid mask asks for remote permissions on a remote
+ * client and for the ACL otherwise.
+ *
+ * Returns the prepared request or an ERR_PTR() on failure.
+ */
+static struct ptlrpc_request *mdc_intent_getattr_pack(struct obd_export *exp,
+						      struct lookup_intent *it,
+						      struct md_op_data *op_data)
+{
+	struct ptlrpc_request *req;
+	struct obd_device     *obddev = class_exp2obd(exp);
+	obd_valid	      valid = OBD_MD_FLGETATTR | OBD_MD_FLEASIZE |
+				       OBD_MD_FLMODEASIZE | OBD_MD_FLDIREA |
+				       OBD_MD_FLMDSCAPA | OBD_MD_MEA |
+				       (client_is_remote(exp) ?
+					       OBD_MD_FLRMTPERM : OBD_MD_FLACL);
+	struct ldlm_intent    *lit;
+	int		    rc;
+	ENTRY;
+
+	req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+				   &RQF_LDLM_INTENT_GETATTR);
+	if (req == NULL)
+		RETURN(ERR_PTR(-ENOMEM));
+
+	mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+	req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT,
+			     op_data->op_namelen + 1);
+
+	rc = ldlm_prep_enqueue_req(exp, req, NULL, 0);
+	if (rc) {
+		ptlrpc_request_free(req);
+		RETURN(ERR_PTR(rc));
+	}
+
+	/* pack the intent */
+	lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT);
+	lit->opc = (__u64)it->it_op;
+
+	/* pack the intended request */
+	mdc_getattr_pack(req, valid, it->it_flags, op_data,
+			 obddev->u.cli.cl_max_mds_easize);
+
+	req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER,
+			     obddev->u.cli.cl_max_mds_easize);
+	if (client_is_remote(exp))
+		req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER,
+				     sizeof(struct mdt_remote_perm));
+	ptlrpc_request_set_replen(req);
+	RETURN(req);
+}
+
+/*
+ * Build an LDLM intent request for a layout lock: allocates the request
+ * with an empty client EA buffer, packs the intent opcode and a generic
+ * LAYOUT_INTENT_ACCESS layout intent, and sizes the reply LVB to the
+ * maximum MDS EA size.  The third argument is not used.
+ *
+ * Returns the prepared request or an ERR_PTR() on failure.
+ */
+static struct ptlrpc_request *mdc_intent_layout_pack(struct obd_export *exp,
+						     struct lookup_intent *it,
+						     struct md_op_data *unused)
+{
+	struct obd_device     *obd = class_exp2obd(exp);
+	struct ptlrpc_request *req;
+	struct ldlm_intent    *lit;
+	struct layout_intent  *layout;
+	int rc;
+	ENTRY;
+
+	req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+				&RQF_LDLM_INTENT_LAYOUT);
+	if (req == NULL)
+		RETURN(ERR_PTR(-ENOMEM));
+
+	req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT, 0);
+	rc = ldlm_prep_enqueue_req(exp, req, NULL, 0);
+	if (rc) {
+		ptlrpc_request_free(req);
+		RETURN(ERR_PTR(rc));
+	}
+
+	/* pack the intent */
+	lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT);
+	lit->opc = (__u64)it->it_op;
+
+	/* pack the layout intent request */
+	layout = req_capsule_client_get(&req->rq_pill, &RMF_LAYOUT_INTENT);
+	/* LAYOUT_INTENT_ACCESS is generic, specific operation will be
+	 * set for replication */
+	layout->li_opc = LAYOUT_INTENT_ACCESS;
+
+	req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER,
+			obd->u.cli.cl_max_mds_easize);
+	ptlrpc_request_set_replen(req);
+	RETURN(req);
+}
+
+/*
+ * Build a plain (intent-less) LDLM enqueue request whose reply LVB
+ * buffer is @lvb_len bytes.
+ *
+ * Returns the prepared request or an ERR_PTR() on failure.
+ */
+static struct ptlrpc_request *
+mdc_enqueue_pack(struct obd_export *exp, int lvb_len)
+{
+	struct ptlrpc_request *req;
+	int rc;
+	ENTRY;
+
+	req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_LDLM_ENQUEUE);
+	if (req == NULL)
+		RETURN(ERR_PTR(-ENOMEM));
+
+	rc = ldlm_prep_enqueue_req(exp, req, NULL, 0);
+	if (rc) {
+		ptlrpc_request_free(req);
+		RETURN(ERR_PTR(rc));
+	}
+
+	req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, lvb_len);
+	ptlrpc_request_set_replen(req);
+	RETURN(req);
+}
+
+/*
+ * Post-process a completed intent enqueue.
+ *
+ * Reconciles the lock handle/mode with what the server granted, copies the
+ * reply's disposition and status into the intent, checks that the reply
+ * buffers expected for the intent type are present and, when the lock has a
+ * layout, installs a private copy of the returned layout data on the lock.
+ *
+ * \param exp    export the request was sent on
+ * \param req    the enqueue request; saved in the intent as it_data
+ * \param einfo  enqueue info; ei_mode is updated to the mode actually held
+ * \param it     intent to fill in
+ * \param lockh  handle of the enqueued lock; zeroed if the lock was aborted
+ * \param rc     result of the enqueue, must be >= 0
+ *
+ * \retval rc, with ELDLM_LOCK_ABORTED folded to 0, on success
+ * \retval -EPROTO if an expected reply buffer is missing
+ * \retval -ENOMEM if the layout copy cannot be allocated
+ */
+static int mdc_finish_enqueue(struct obd_export *exp,
+			      struct ptlrpc_request *req,
+			      struct ldlm_enqueue_info *einfo,
+			      struct lookup_intent *it,
+			      struct lustre_handle *lockh,
+			      int rc)
+{
+	struct req_capsule  *pill = &req->rq_pill;
+	struct ldlm_request *lockreq;
+	struct ldlm_reply   *lockrep;
+	struct lustre_intent_data *intent = &it->d.lustre;
+	struct ldlm_lock    *lock;
+	void		*lvb_data = NULL;
+	int		  lvb_len = 0;
+	ENTRY;
+
+	LASSERT(rc >= 0);
+	/* Similarly, if we're going to replay this request, we don't want to
+	 * actually get a lock, just perform the intent. */
+	if (req->rq_transno || req->rq_replay) {
+		lockreq = req_capsule_client_get(pill, &RMF_DLM_REQ);
+		lockreq->lock_flags |= ldlm_flags_to_wire(LDLM_FL_INTENT_ONLY);
+	}
+
+	if (rc == ELDLM_LOCK_ABORTED) {
+		/* no lock is held: report mode 0 and an empty handle */
+		einfo->ei_mode = 0;
+		memset(lockh, 0, sizeof(*lockh));
+		rc = 0;
+	} else { /* rc = 0 */
+		lock = ldlm_handle2lock(lockh);
+		LASSERT(lock != NULL);
+
+		/* If the server gave us back a different lock mode, we should
+		 * fix up our variables. */
+		if (lock->l_req_mode != einfo->ei_mode) {
+			ldlm_lock_addref(lockh, lock->l_req_mode);
+			ldlm_lock_decref(lockh, einfo->ei_mode);
+			einfo->ei_mode = lock->l_req_mode;
+		}
+		LDLM_LOCK_PUT(lock);
+	}
+
+	lockrep = req_capsule_server_get(pill, &RMF_DLM_REP);
+	LASSERT(lockrep != NULL); /* checked by ldlm_cli_enqueue() */
+
+	/* publish the server's answer and the lock state through the intent */
+	intent->it_disposition = (int)lockrep->lock_policy_res1;
+	intent->it_status = (int)lockrep->lock_policy_res2;
+	intent->it_lock_mode = einfo->ei_mode;
+	intent->it_lock_handle = lockh->cookie;
+	intent->it_data = req;
+
+	/* Technically speaking rq_transno must already be zero if
+	 * it_status is in error, so the check is a bit redundant */
+	if ((!req->rq_transno || intent->it_status < 0) && req->rq_replay)
+		mdc_clear_replay_flag(req, intent->it_status);
+
+	/* If we're doing an IT_OPEN which did not result in an actual
+	 * successful open, then we need to remove the bit which saves
+	 * this request for unconditional replay.
+	 *
+	 * It's important that we do this first!  Otherwise we might exit the
+	 * function without doing so, and try to replay a failed create
+	 * (bug 3440) */
+	if (it->it_op & IT_OPEN && req->rq_replay &&
+	    (!it_disposition(it, DISP_OPEN_OPEN) ||intent->it_status != 0))
+		mdc_clear_replay_flag(req, intent->it_status);
+
+	DEBUG_REQ(D_RPCTRACE, req, "op: %d disposition: %x, status: %d",
+		  it->it_op, intent->it_disposition, intent->it_status);
+
+	/* We know what to expect, so we do any byte flipping required here */
+	if (it->it_op & (IT_OPEN | IT_UNLINK | IT_LOOKUP | IT_GETATTR)) {
+		struct mdt_body *body;
+
+		body = req_capsule_server_get(pill, &RMF_MDT_BODY);
+		if (body == NULL) {
+			CERROR ("Can't swab mdt_body\n");
+			RETURN (-EPROTO);
+		}
+
+		if (it_disposition(it, DISP_OPEN_OPEN) &&
+		    !it_open_error(DISP_OPEN_OPEN, it)) {
+			/*
+			 * If this is a successful OPEN request, we need to set
+			 * replay handler and data early, so that if replay
+			 * happens immediately after swabbing below, new reply
+			 * is swabbed by that handler correctly.
+			 */
+			mdc_set_open_replay_data(NULL, NULL, req);
+		}
+
+		if ((body->valid & (OBD_MD_FLDIREA | OBD_MD_FLEASIZE)) != 0) {
+			void *eadata;
+
+			mdc_update_max_ea_from_body(exp, body);
+
+			/*
+			 * The eadata is opaque; just check that it is there.
+			 * Eventually, obd_unpackmd() will check the contents.
+			 */
+			eadata = req_capsule_server_sized_get(pill, &RMF_MDT_MD,
+							      body->eadatasize);
+			if (eadata == NULL)
+				RETURN(-EPROTO);
+
+			/* save lvb data and length in case this is for layout
+			 * lock */
+			lvb_data = eadata;
+			lvb_len = body->eadatasize;
+
+			/*
+			 * We save the reply LOV EA in case we have to replay a
+			 * create for recovery.  If we didn't allocate a large
+			 * enough request buffer above we need to reallocate it
+			 * here to hold the actual LOV EA.
+			 *
+			 * To not save LOV EA if request is not going to replay
+			 * (for example error one).
+			 */
+			if ((it->it_op & IT_OPEN) && req->rq_replay) {
+				void *lmm;
+				if (req_capsule_get_size(pill, &RMF_EADATA,
+							 RCL_CLIENT) <
+				    body->eadatasize)
+					mdc_realloc_openmsg(req, body);
+				else
+					req_capsule_shrink(pill, &RMF_EADATA,
+							   body->eadatasize,
+							   RCL_CLIENT);
+
+				req_capsule_set_size(pill, &RMF_EADATA,
+						     RCL_CLIENT,
+						     body->eadatasize);
+
+				/* copy the reply EA into the request's own
+				 * RMF_EADATA buffer */
+				lmm = req_capsule_client_get(pill, &RMF_EADATA);
+				if (lmm)
+					memcpy(lmm, eadata, body->eadatasize);
+			}
+		}
+
+		if (body->valid & OBD_MD_FLRMTPERM) {
+			struct mdt_remote_perm *perm;
+
+			LASSERT(client_is_remote(exp));
+			perm = req_capsule_server_swab_get(pill, &RMF_ACL,
+						lustre_swab_mdt_remote_perm);
+			if (perm == NULL)
+				RETURN(-EPROTO);
+		}
+		if (body->valid & OBD_MD_FLMDSCAPA) {
+			struct lustre_capa *capa, *p;
+
+			capa = req_capsule_server_get(pill, &RMF_CAPA1);
+			if (capa == NULL)
+				RETURN(-EPROTO);
+
+			if (it->it_op & IT_OPEN) {
+				/* client fid capa will be checked in replay */
+				p = req_capsule_client_get(pill, &RMF_CAPA2);
+				LASSERT(p);
+				*p = *capa;
+			}
+		}
+		if (body->valid & OBD_MD_FLOSSCAPA) {
+			struct lustre_capa *capa;
+
+			/* only presence is checked; the value is not used
+			 * here */
+			capa = req_capsule_server_get(pill, &RMF_CAPA2);
+			if (capa == NULL)
+				RETURN(-EPROTO);
+		}
+	} else if (it->it_op & IT_LAYOUT) {
+		/* maybe the lock was granted right away and layout
+		 * is packed into RMF_DLM_LVB of req */
+		lvb_len = req_capsule_get_size(pill, &RMF_DLM_LVB, RCL_SERVER);
+		if (lvb_len > 0) {
+			lvb_data = req_capsule_server_sized_get(pill,
+							&RMF_DLM_LVB, lvb_len);
+			if (lvb_data == NULL)
+				RETURN(-EPROTO);
+		}
+	}
+
+	/* fill in stripe data for layout lock */
+	lock = ldlm_handle2lock(lockh);
+	if (lock != NULL && ldlm_has_layout(lock) && lvb_data != NULL) {
+		void *lmm;
+
+		LDLM_DEBUG(lock, "layout lock returned by: %s, lvb_len: %d\n",
+			ldlm_it2str(it->it_op), lvb_len);
+
+		/* copy first, outside the lock; the copy is dropped below if
+		 * the lock already carries lvb data */
+		OBD_ALLOC_LARGE(lmm, lvb_len);
+		if (lmm == NULL) {
+			LDLM_LOCK_PUT(lock);
+			RETURN(-ENOMEM);
+		}
+		memcpy(lmm, lvb_data, lvb_len);
+
+		/* install lvb_data */
+		lock_res_and_lock(lock);
+		if (lock->l_lvb_data == NULL) {
+			lock->l_lvb_data = lmm;
+			lock->l_lvb_len = lvb_len;
+			lmm = NULL;
+		}
+		unlock_res_and_lock(lock);
+		if (lmm != NULL)
+			OBD_FREE_LARGE(lmm, lvb_len);
+	}
+	if (lock != NULL)
+		LDLM_LOCK_PUT(lock);
+
+	RETURN(rc);
+}
+
+/*
+ * Enqueue a lock on the resource named by op_data->op_fid1 and, for intent
+ * requests, post-process the reply with mdc_finish_enqueue().
+ *
+ * We always reserve enough space in the reply packet for a stripe MD, because
+ * we don't know in advance the file type.
+ *
+ * \param exp               export to send the request on
+ * \param einfo             enqueue info; for intents ei_type must be
+ *                          LDLM_IBITS, otherwise LDLM_FLOCK
+ * \param it                intent, or NULL for a flock request
+ * \param op_data           operation data (fids, name)
+ * \param lockh             receives the lock handle
+ * \param lmm, lmmsize      for IT_OPEN handed to mdc_intent_open_pack();
+ *                          for flock \a lmm carries the flock policy and
+ *                          \a lmmsize must be 0
+ * \param reqp              must be NULL (asserted)
+ * \param extra_lock_flags  extra flags for the enqueue
+ *
+ * \retval >= 0 on success
+ * \retval -EOPNOTSUPP for IT_LAYOUT when the import has no lvb type support
+ * \retval -EIO if a create resend would cross an import generation change
+ * \retval other negative errno from packing, enqueue or reply processing
+ */
+int mdc_enqueue(struct obd_export *exp, struct ldlm_enqueue_info *einfo,
+		struct lookup_intent *it, struct md_op_data *op_data,
+		struct lustre_handle *lockh, void *lmm, int lmmsize,
+		struct ptlrpc_request **reqp, __u64 extra_lock_flags)
+{
+	struct obd_device     *obddev = class_exp2obd(exp);
+	struct ptlrpc_request *req = NULL;
+	__u64		  flags, saved_flags = extra_lock_flags;
+	int		    rc;
+	struct ldlm_res_id res_id;
+	static const ldlm_policy_data_t lookup_policy =
+			    { .l_inodebits = { MDS_INODELOCK_LOOKUP } };
+	static const ldlm_policy_data_t update_policy =
+			    { .l_inodebits = { MDS_INODELOCK_UPDATE } };
+	static const ldlm_policy_data_t layout_policy =
+			    { .l_inodebits = { MDS_INODELOCK_LAYOUT } };
+	ldlm_policy_data_t const *policy = &lookup_policy;
+	int		    generation, resends = 0;
+	struct ldlm_reply     *lockrep;
+	enum lvb_type	       lvb_type = 0;
+	ENTRY;
+
+	LASSERTF(!it || einfo->ei_type == LDLM_IBITS, "lock type %d\n",
+		 einfo->ei_type);
+
+	fid_build_reg_res_name(&op_data->op_fid1, &res_id);
+
+	if (it) {
+		saved_flags |= LDLM_FL_HAS_INTENT;
+		if (it->it_op & (IT_UNLINK | IT_GETATTR | IT_READDIR))
+			policy = &update_policy;
+		else if (it->it_op & IT_LAYOUT)
+			policy = &layout_policy;
+	}
+
+	LASSERT(reqp == NULL);
+
+	/* remembered so that a resend can detect an import generation
+	 * change */
+	generation = obddev->u.cli.cl_import->imp_generation;
+resend:
+	/* start each attempt from the caller's flags */
+	flags = saved_flags;
+	if (!it) {
+		/* The only way right now is FLOCK, in this case we hide flock
+		   policy as lmm, but lmmsize is 0 */
+		LASSERT(lmm && lmmsize == 0);
+		LASSERTF(einfo->ei_type == LDLM_FLOCK, "lock type %d\n",
+			 einfo->ei_type);
+		policy = (ldlm_policy_data_t *)lmm;
+		res_id.name[3] = LDLM_FLOCK;
+	} else if (it->it_op & IT_OPEN) {
+		req = mdc_intent_open_pack(exp, it, op_data, lmm, lmmsize,
+					   einfo->ei_cbdata);
+		policy = &update_policy;
+		/* NOTE(review): lmm and ei_cbdata are cleared here, so a
+		 * later "goto resend" repacks the open with NULL for both -
+		 * confirm this is intended */
+		einfo->ei_cbdata = NULL;
+		lmm = NULL;
+	} else if (it->it_op & IT_UNLINK) {
+		req = mdc_intent_unlink_pack(exp, it, op_data);
+	} else if (it->it_op & (IT_GETATTR | IT_LOOKUP)) {
+		req = mdc_intent_getattr_pack(exp, it, op_data);
+	} else if (it->it_op & IT_READDIR) {
+		req = mdc_enqueue_pack(exp, 0);
+	} else if (it->it_op & IT_LAYOUT) {
+		if (!imp_connect_lvb_type(class_exp2cliimp(exp)))
+			RETURN(-EOPNOTSUPP);
+
+		req = mdc_intent_layout_pack(exp, it, op_data);
+		lvb_type = LVB_T_LAYOUT;
+	} else {
+		LBUG();
+		RETURN(-EINVAL);
+	}
+
+	/* the pack helpers report failure with ERR_PTR(); req stays NULL in
+	 * the flock case */
+	if (IS_ERR(req))
+		RETURN(PTR_ERR(req));
+
+	if (req != NULL && it && it->it_op & IT_CREAT)
+		/* ask ptlrpc not to resend on EINPROGRESS since we have our own
+		 * retry logic */
+		req->rq_no_retry_einprogress = 1;
+
+	if (resends) {
+		/* pin the request to the original import generation and
+		 * set rq_sent "resends" seconds past the current time */
+		req->rq_generation_set = 1;
+		req->rq_import_generation = generation;
+		req->rq_sent = cfs_time_current_sec() + resends;
+	}
+
+	/* It is important to obtain rpc_lock first (if applicable), so that
+	 * threads that are serialised with rpc_lock are not polluting our
+	 * rpcs in flight counter. We do not do flock request limiting, though*/
+	if (it) {
+		mdc_get_rpc_lock(obddev->u.cli.cl_rpc_lock, it);
+		rc = mdc_enter_request(&obddev->u.cli);
+		if (rc != 0) {
+			mdc_put_rpc_lock(obddev->u.cli.cl_rpc_lock, it);
+			mdc_clear_replay_flag(req, 0);
+			ptlrpc_req_finished(req);
+			RETURN(rc);
+		}
+	}
+
+	rc = ldlm_cli_enqueue(exp, &req, einfo, &res_id, policy, &flags, NULL,
+			      0, lvb_type, lockh, 0);
+	if (!it) {
+		/* For flock requests we immediately return without further
+		   delay and let caller deal with the rest, since rest of
+		   this function metadata processing makes no sense for flock
+		   requests anyway */
+		RETURN(rc);
+	}
+
+	mdc_exit_request(&obddev->u.cli);
+	mdc_put_rpc_lock(obddev->u.cli.cl_rpc_lock, it);
+
+	if (rc < 0) {
+		CERROR("ldlm_cli_enqueue: %d\n", rc);
+		mdc_clear_replay_flag(req, rc);
+		ptlrpc_req_finished(req);
+		RETURN(rc);
+	}
+
+	lockrep = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP);
+	LASSERT(lockrep != NULL);
+
+	/* Retry the create infinitely when we get -EINPROGRESS from
+	 * server. This is required by the new quota design. */
+	if (it && it->it_op & IT_CREAT &&
+	    (int)lockrep->lock_policy_res2 == -EINPROGRESS) {
+		mdc_clear_replay_flag(req, rc);
+		ptlrpc_req_finished(req);
+		resends++;
+
+		CDEBUG(D_HA, "%s: resend:%d op:%d "DFID"/"DFID"\n",
+		       obddev->obd_name, resends, it->it_op,
+		       PFID(&op_data->op_fid1), PFID(&op_data->op_fid2));
+
+		/* only retry while the import generation is unchanged */
+		if (generation == obddev->u.cli.cl_import->imp_generation) {
+			goto resend;
+		} else {
+			CDEBUG(D_HA, "resend cross eviction\n");
+			RETURN(-EIO);
+		}
+	}
+
+	rc = mdc_finish_enqueue(exp, req, einfo, it, lockh, rc);
+	if (rc < 0) {
+		/* drop the lock reference and the request on failure */
+		if (lustre_handle_is_used(lockh)) {
+			ldlm_lock_decref(lockh, einfo->ei_mode);
+			memset(lockh, 0, sizeof(*lockh));
+		}
+		ptlrpc_req_finished(req);
+	}
+	RETURN(rc);
+}
+
+/*
+ * Finish an intent lock once the enqueue reply has been unpacked.
+ *
+ * Checks the dispositions reported by the server, detects a stale fid when
+ * revalidating, takes the extra request references that the later create /
+ * open phases rely on, and, if a matching lock is already held, cancels the
+ * new lock and switches \a lockh and the intent over to the old one.
+ *
+ * \param exp      export the request was sent on
+ * \param request  the enqueue request with its reply
+ * \param op_data  operation data; op_fid2 / op_fid3 are compared with the
+ *                 fid in the reply
+ * \param it       intent carrying the server's disposition and status
+ * \param lockh    handle of the lock obtained by the enqueue
+ *
+ * \retval 0 on success
+ * \retval it_status if the server never executed the intent
+ * \retval -ESTALE if the reply refers to a different object
+ * \retval error reported by it_open_error() for the executed phases
+ */
+static int mdc_finish_intent_lock(struct obd_export *exp,
+				  struct ptlrpc_request *request,
+				  struct md_op_data *op_data,
+				  struct lookup_intent *it,
+				  struct lustre_handle *lockh)
+{
+	struct lustre_handle old_lock;
+	struct mdt_body *mdt_body;
+	struct ldlm_lock *lock;
+	int rc;
+	ENTRY;
+
+	LASSERT(request != NULL);
+	LASSERT(request != LP_POISON);
+	LASSERT(request->rq_repmsg != LP_POISON);
+
+	if (!it_disposition(it, DISP_IT_EXECD)) {
+		/* The server failed before it even started executing the
+		 * intent, i.e. because it couldn't unpack the request. */
+		LASSERT(it->d.lustre.it_status != 0);
+		RETURN(it->d.lustre.it_status);
+	}
+	rc = it_open_error(DISP_IT_EXECD, it);
+	if (rc)
+		RETURN(rc);
+
+	mdt_body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
+	LASSERT(mdt_body != NULL);      /* mdc_enqueue checked */
+
+	/* If we were revalidating a fid/name pair, mark the intent in
+	 * case we fail and get called again from lookup */
+	if (fid_is_sane(&op_data->op_fid2) &&
+	    it->it_create_mode & M_CHECK_STALE &&
+	    it->it_op != IT_GETATTR) {
+		it_set_disposition(it, DISP_ENQ_COMPLETE);
+
+		/* Also: did we find the same inode? */
+		/* server can return one of two fids:
+		 * op_fid2 - new allocated fid - if file is created.
+		 * op_fid3 - existent fid - if file only open.
+		 * op_fid3 is saved in lmv_intent_open */
+		if ((!lu_fid_eq(&op_data->op_fid2, &mdt_body->fid1)) &&
+		    (!lu_fid_eq(&op_data->op_fid3, &mdt_body->fid1))) {
+			/* report both candidate fids that were compared */
+			CDEBUG(D_DENTRY, "Found stale data "DFID"("DFID")/"DFID
+			       "\n", PFID(&op_data->op_fid2),
+			       PFID(&op_data->op_fid3), PFID(&mdt_body->fid1));
+			RETURN(-ESTALE);
+		}
+	}
+
+	rc = it_open_error(DISP_LOOKUP_EXECD, it);
+	if (rc)
+		RETURN(rc);
+
+	/* keep requests around for the multiple phases of the call
+	 * this shows the DISP_XX must guarantee we make it into the call
+	 */
+	if (!it_disposition(it, DISP_ENQ_CREATE_REF) &&
+	    it_disposition(it, DISP_OPEN_CREATE) &&
+	    !it_open_error(DISP_OPEN_CREATE, it)) {
+		it_set_disposition(it, DISP_ENQ_CREATE_REF);
+		ptlrpc_request_addref(request); /* balanced in ll_create_node */
+	}
+	if (!it_disposition(it, DISP_ENQ_OPEN_REF) &&
+	    it_disposition(it, DISP_OPEN_OPEN) &&
+	    !it_open_error(DISP_OPEN_OPEN, it)) {
+		it_set_disposition(it, DISP_ENQ_OPEN_REF);
+		ptlrpc_request_addref(request); /* balanced in ll_file_open */
+		/* BUG 11546 - eviction in the middle of open rpc processing */
+		OBD_FAIL_TIMEOUT(OBD_FAIL_MDC_ENQUEUE_PAUSE, obd_timeout);
+	}
+
+	/* sanity-check the intent type against the dispositions */
+	if (it->it_op & IT_CREAT) {
+		/* XXX this belongs in ll_create_it */
+	} else if (it->it_op == IT_OPEN) {
+		LASSERT(!it_disposition(it, DISP_OPEN_CREATE));
+	} else {
+		LASSERT(it->it_op & (IT_GETATTR | IT_LOOKUP | IT_LAYOUT));
+	}
+
+	/* If we already have a matching lock, then cancel the new
+	 * one.  We have to set the data here instead of in
+	 * mdc_enqueue, because we need to use the child's inode as
+	 * the l_ast_data to match, and that's not available until
+	 * intent_finish has performed the iget().) */
+	lock = ldlm_handle2lock(lockh);
+	if (lock) {
+		/* copy the policy so the lock reference can be dropped
+		 * before matching */
+		ldlm_policy_data_t policy = lock->l_policy_data;
+		LDLM_DEBUG(lock, "matching against this");
+
+		LASSERTF(fid_res_name_eq(&mdt_body->fid1,
+					 &lock->l_resource->lr_name),
+			 "Lock res_id: %lu/%lu/%lu, fid: %lu/%lu/%lu.\n",
+			 (unsigned long)lock->l_resource->lr_name.name[0],
+			 (unsigned long)lock->l_resource->lr_name.name[1],
+			 (unsigned long)lock->l_resource->lr_name.name[2],
+			 (unsigned long)fid_seq(&mdt_body->fid1),
+			 (unsigned long)fid_oid(&mdt_body->fid1),
+			 (unsigned long)fid_ver(&mdt_body->fid1));
+		LDLM_LOCK_PUT(lock);
+
+		memcpy(&old_lock, lockh, sizeof(*lockh));
+		if (ldlm_lock_match(NULL, LDLM_FL_BLOCK_GRANTED, NULL,
+				    LDLM_IBITS, &policy, LCK_NL, &old_lock, 0)) {
+			/* a match was found: drop the new lock and hand the
+			 * matched handle back through lockh and the intent */
+			ldlm_lock_decref_and_cancel(lockh,
+						    it->d.lustre.it_lock_mode);
+			memcpy(lockh, &old_lock, sizeof(old_lock));
+			it->d.lustre.it_lock_handle = lockh->cookie;
+		}
+	}
+	CDEBUG(D_DENTRY,"D_IT dentry %.*s intent: %s status %d disp %x rc %d\n",
+	       op_data->op_namelen, op_data->op_name, ldlm_it2str(it->it_op),
+	       it->d.lustre.it_status, it->d.lustre.it_disposition, rc);
+	RETURN(rc);
+}
+
+/*
+ * Check whether a usable lock is held for \a fid and record it in \a it.
+ *
+ * If the intent already names a lock handle, that handle is revalidated
+ * (\a bits is passed through to ldlm_revalidate_lock_handle()).  Otherwise
+ * the namespace is searched for an inodebits lock whose bit depends on the
+ * intent operation: UPDATE for IT_GETATTR, LAYOUT for IT_LAYOUT, LOOKUP for
+ * anything else.
+ *
+ * On return the intent's it_lock_handle / it_lock_mode describe the lock
+ * found, or are both zero.
+ *
+ * Returns 1 if a lock was found, 0 otherwise.
+ */
+int mdc_revalidate_lock(struct obd_export *exp, struct lookup_intent *it,
+			struct lu_fid *fid, __u64 *bits)
+{
+	/* We could just return 1 immediately, but since we should only
+	 * be called in revalidate_it if we already have a lock, let's
+	 * verify that. */
+	struct lustre_intent_data *intent = &it->d.lustre;
+	struct lustre_handle handle;
+	struct ldlm_res_id resname;
+	ldlm_policy_data_t ibits;
+	ldlm_mode_t found;
+	ENTRY;
+
+	if (intent->it_lock_handle) {
+		handle.cookie = intent->it_lock_handle;
+		found = ldlm_revalidate_lock_handle(&handle, bits);
+	} else {
+		fid_build_reg_res_name(fid, &resname);
+
+		if (it->it_op == IT_GETATTR)
+			ibits.l_inodebits.bits = MDS_INODELOCK_UPDATE;
+		else if (it->it_op == IT_LAYOUT)
+			ibits.l_inodebits.bits = MDS_INODELOCK_LAYOUT;
+		else
+			ibits.l_inodebits.bits = MDS_INODELOCK_LOOKUP;
+
+		found = ldlm_lock_match(exp->exp_obd->obd_namespace,
+					LDLM_FL_BLOCK_GRANTED, &resname,
+					LDLM_IBITS, &ibits,
+					LCK_CR|LCK_CW|LCK_PR|LCK_PW, &handle,
+					0);
+	}
+
+	if (!found) {
+		intent->it_lock_handle = 0;
+		intent->it_lock_mode = 0;
+	} else {
+		intent->it_lock_handle = handle.cookie;
+		intent->it_lock_mode = found;
+	}
+
+	RETURN(!!found);
+}
+
+/*
+ * This long block is all about fixing up the lock and request state
+ * so that it is correct as of the moment _before_ the operation was
+ * applied; that way, the VFS will think that everything is normal and
+ * call Lustre's regular VFS methods.
+ *
+ * If we're performing a creation, that means that unless the creation
+ * failed with EEXIST, we should fake up a negative dentry.
+ *
+ * For everything else, we want to lookup to succeed.
+ *
+ * One additional note: if CREATE or OPEN succeeded, we add an extra
+ * reference to the request because we need to keep it around until
+ * ll_create/ll_open gets called.
+ *
+ * The server will return to us, in it_disposition, an indication of
+ * exactly what d.lustre.it_status refers to.
+ *
+ * If DISP_OPEN_OPEN is set, then d.lustre.it_status refers to the open() call,
+ * otherwise if DISP_OPEN_CREATE is set, then it status is the
+ * creation failure mode.  In either case, one of DISP_LOOKUP_NEG or
+ * DISP_LOOKUP_POS will be set, indicating whether the child lookup
+ * was successful.
+ *
+ * Else, if DISP_LOOKUP_EXECD then d.lustre.it_status is the rc of the
+ * child lookup.
+ *
+ * \param exp               export to send the request on
+ * \param op_data           operation data (fids, name)
+ * \param lmm, lmmsize      passed through to mdc_enqueue()
+ * \param it                intent to execute; must not be NULL
+ * \param lookup_flags      not used in this function
+ * \param reqp              receives the request saved in the intent
+ * \param cb_blocking       blocking callback for the enqueued lock
+ * \param extra_lock_flags  passed through to mdc_enqueue()
+ *
+ * Returns the mdc_revalidate_lock() result when the revalidate shortcut is
+ * taken, a negative errno on fid allocation or enqueue failure, otherwise
+ * the result of mdc_finish_intent_lock().
+ */
+int mdc_intent_lock(struct obd_export *exp, struct md_op_data *op_data,
+		    void *lmm, int lmmsize, struct lookup_intent *it,
+		    int lookup_flags, struct ptlrpc_request **reqp,
+		    ldlm_blocking_callback cb_blocking,
+		    __u64 extra_lock_flags)
+{
+	struct lustre_handle lockh;
+	int rc = 0;
+	ENTRY;
+	LASSERT(it);
+
+	CDEBUG(D_DLMTRACE, "(name: %.*s,"DFID") in obj "DFID
+	       ", intent: %s flags %#o\n", op_data->op_namelen,
+	       op_data->op_name, PFID(&op_data->op_fid2),
+	       PFID(&op_data->op_fid1), ldlm_it2str(it->it_op),
+	       it->it_flags);
+
+	lockh.cookie = 0;
+	if (fid_is_sane(&op_data->op_fid2) &&
+	    (it->it_op & (IT_LOOKUP | IT_GETATTR))) {
+		/* We could just return 1 immediately, but since we should only
+		 * be called in revalidate_it if we already have a lock, let's
+		 * verify that. */
+		it->d.lustre.it_lock_handle = 0;
+		rc = mdc_revalidate_lock(exp, it, &op_data->op_fid2, NULL);
+		/* Only return failure if it was not GETATTR by cfid
+		   (from inode_revalidate) */
+		if (rc || op_data->op_namelen != 0)
+			RETURN(rc);
+	}
+
+	/* lookup_it may be called only after revalidate_it has run, because
+	 * revalidate_it cannot return errors, only zero.  Returning zero causes
+	 * this call to lookup, which *can* return an error.
+	 *
+	 * We only want to execute the request associated with the intent one
+	 * time, however, so don't send the request again.  Instead, skip past
+	 * this and use the request from revalidate.  In this case, revalidate
+	 * never dropped its reference, so the refcounts are all OK */
+	if (!it_disposition(it, DISP_ENQ_COMPLETE)) {
+		struct ldlm_enqueue_info einfo =
+			{ LDLM_IBITS, it_to_lock_mode(it), cb_blocking,
+			  ldlm_completion_ast, NULL, NULL, NULL };
+
+		/* For case if upper layer did not alloc fid, do it now. */
+		if (!fid_is_sane(&op_data->op_fid2) && it->it_op & IT_CREAT) {
+			rc = mdc_fid_alloc(exp, &op_data->op_fid2, op_data);
+			if (rc < 0) {
+				CERROR("Can't alloc new fid, rc %d\n", rc);
+				RETURN(rc);
+			}
+		}
+		rc = mdc_enqueue(exp, &einfo, it, op_data, &lockh,
+				 lmm, lmmsize, NULL, extra_lock_flags);
+		if (rc < 0)
+			RETURN(rc);
+	} else if (!fid_is_sane(&op_data->op_fid2) ||
+		   !(it->it_create_mode & M_CHECK_STALE)) {
+		/* DISP_ENQ_COMPLETE set means there is extra reference on
+		 * request referenced from this intent, saved for subsequent
+		 * lookup.  This path is executed when we proceed to this
+		 * lookup, so we clear DISP_ENQ_COMPLETE */
+		it_clear_disposition(it, DISP_ENQ_COMPLETE);
+	}
+	/* hand the request stored in the intent back to the caller */
+	*reqp = it->d.lustre.it_data;
+	rc = mdc_finish_intent_lock(exp, *reqp, op_data, it, &lockh);
+	RETURN(rc);
+}
+
+/*
+ * Reply interpreter installed by mdc_intent_getattr_async().
+ *
+ * Releases the request slot, completes the enqueue, runs the same two
+ * finishing steps as the synchronous path, then frees \a einfo (taken from
+ * the async args) and reports the outcome through minfo->mi_cb().
+ *
+ * Always returns 0; the operation's result is delivered to the callback.
+ */
+static int mdc_intent_getattr_async_interpret(const struct lu_env *env,
+					      struct ptlrpc_request *req,
+					      void *args, int rc)
+{
+	struct mdc_getattr_args  *ga = args;
+	struct obd_export	*exp = ga->ga_exp;
+	struct md_enqueue_info   *minfo = ga->ga_minfo;
+	struct ldlm_enqueue_info *einfo = ga->ga_einfo;
+	struct lookup_intent     *intent = &minfo->mi_it;
+	struct lustre_handle     *handle = &minfo->mi_lockh;
+	struct obd_device	*obd = class_exp2obd(exp);
+	__u64		     lock_flags = LDLM_FL_HAS_INTENT;
+	ENTRY;
+
+	mdc_exit_request(&obd->u.cli);
+
+	/* fault injection: pretend the enqueue timed out */
+	if (OBD_FAIL_CHECK(OBD_FAIL_MDC_GETATTR_ENQUEUE))
+		rc = -ETIMEDOUT;
+
+	rc = ldlm_cli_enqueue_fini(exp, req, einfo->ei_type, 1, einfo->ei_mode,
+				   &lock_flags, NULL, 0, handle, rc);
+	if (rc < 0) {
+		CERROR("ldlm_cli_enqueue_fini: %d\n", rc);
+		mdc_clear_replay_flag(req, rc);
+		GOTO(out, rc);
+	}
+
+	rc = mdc_finish_enqueue(exp, req, einfo, intent, handle, rc);
+	if (rc != 0)
+		GOTO(out, rc);
+
+	rc = mdc_finish_intent_lock(exp, req, &minfo->mi_data, intent, handle);
+	EXIT;
+
+out:
+	OBD_FREE_PTR(einfo);
+	minfo->mi_cb(req, minfo, rc);
+	return 0;
+}
+
+/*
+ * Send a getattr intent enqueue asynchronously.
+ *
+ * Packs the intent from \a minfo, takes a request slot, starts the enqueue
+ * in async mode and queues the request to ptlrpcd with
+ * mdc_intent_getattr_async_interpret() as reply handler.  The interpreter
+ * receives \a exp, \a minfo and \a einfo through the request's async args.
+ *
+ * \param exp    export to send the request on
+ * \param minfo  carries the op data, the intent and the lock handle
+ * \param einfo  enqueue info
+ *
+ * \retval 0 if the request was queued
+ * \retval negative errno if packing, slot acquisition or the enqueue failed
+ */
+int mdc_intent_getattr_async(struct obd_export *exp,
+			     struct md_enqueue_info *minfo,
+			     struct ldlm_enqueue_info *einfo)
+{
+	struct md_op_data       *op_data = &minfo->mi_data;
+	struct lookup_intent    *it = &minfo->mi_it;
+	struct ptlrpc_request   *req;
+	struct mdc_getattr_args *ga;
+	struct obd_device       *obddev = class_exp2obd(exp);
+	struct ldlm_res_id       res_id;
+	/*XXX: Both MDS_INODELOCK_LOOKUP and MDS_INODELOCK_UPDATE are needed
+	 *     for statahead currently. Consider CMD in future, such two bits
+	 *     maybe managed by different MDS, should be adjusted then. */
+	ldlm_policy_data_t       policy = {
+					.l_inodebits = { MDS_INODELOCK_LOOKUP |
+							 MDS_INODELOCK_UPDATE }
+				 };
+	int		      rc = 0;
+	__u64		    flags = LDLM_FL_HAS_INTENT;
+	ENTRY;
+
+	CDEBUG(D_DLMTRACE,"name: %.*s in inode "DFID", intent: %s flags %#o\n",
+	       op_data->op_namelen, op_data->op_name, PFID(&op_data->op_fid1),
+	       ldlm_it2str(it->it_op), it->it_flags);
+
+	fid_build_reg_res_name(&op_data->op_fid1, &res_id);
+	req = mdc_intent_getattr_pack(exp, it, op_data);
+	/* mdc_enqueue() checks this helper's result with IS_ERR(); a plain
+	 * NULL test would let an error pointer through to be dereferenced
+	 * below.  The NULL test is kept as a defensive fallback. */
+	if (req == NULL)
+		RETURN(-ENOMEM);
+	if (IS_ERR(req))
+		RETURN(PTR_ERR(req));
+
+	rc = mdc_enter_request(&obddev->u.cli);
+	if (rc != 0) {
+		ptlrpc_req_finished(req);
+		RETURN(rc);
+	}
+
+	/* last argument 1: asynchronous enqueue, completed by the
+	 * interpreter via ldlm_cli_enqueue_fini() */
+	rc = ldlm_cli_enqueue(exp, &req, einfo, &res_id, &policy, &flags, NULL,
+			      0, LVB_T_NONE, &minfo->mi_lockh, 1);
+	if (rc < 0) {
+		mdc_exit_request(&obddev->u.cli);
+		ptlrpc_req_finished(req);
+		RETURN(rc);
+	}
+
+	/* the interpreter's arguments live inside the request itself */
+	CLASSERT(sizeof(*ga) <= sizeof(req->rq_async_args));
+	ga = ptlrpc_req_async_args(req);
+	ga->ga_exp = exp;
+	ga->ga_minfo = minfo;
+	ga->ga_einfo = einfo;
+
+	req->rq_interpret_reply = mdc_intent_getattr_async_interpret;
+	ptlrpcd_add_req(req, PDL_POLICY_LOCAL, -1);
+
+	RETURN(0);
+}

diff --git a/drivers/staging/lustre/lustre/mdc/mdc_reint.c b/drivers/staging/lustre/lustre/mdc/mdc_reint.c
new file mode 100644
index 0000000..5e25a07
--- /dev/null
+++ b/drivers/staging/lustre/lustre/mdc/mdc_reint.c

@@ -0,0 +1,489 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_MDC
+
+# include <linux/module.h>
+# include <linux/kernel.h>
+
+#include <obd_class.h>
+#include "mdc_internal.h"
+#include <lustre_fid.h>
+
+/*
+ * Send a reint request and wait for its reply while holding \a rpc_lock.
+ * (mdc_setattr does its own semaphore handling.)
+ *
+ * \a level is stored in the request's rq_send_state before sending.
+ *
+ * Returns 0 when a reply containing RMF_MDT_BODY was received, -EPROTO when
+ * the reply lacks that body, or the error from ptlrpc_queue_wait().
+ */
+static int mdc_reint(struct ptlrpc_request *request,
+		     struct mdc_rpc_lock *rpc_lock,
+		     int level)
+{
+	int err;
+
+	request->rq_send_state = level;
+
+	mdc_get_rpc_lock(rpc_lock, NULL);
+	err = ptlrpc_queue_wait(request);
+	mdc_put_rpc_lock(rpc_lock, NULL);
+
+	if (err != 0) {
+		CDEBUG(D_INFO, "error in handling %d\n", err);
+		return err;
+	}
+
+	if (req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY) == NULL)
+		return -EPROTO;
+
+	return 0;
+}
+
+/*
+ * Locally cancel the locks on the resource named by @fid that match the
+ * inode @bits and @mode, collecting them on the @cancels list.
+ *
+ * Returns the number of locks added to @cancels; 0 when nothing was
+ * cancelled or the resource could not be obtained.
+ */
+int mdc_resource_get_unused(struct obd_export *exp, struct lu_fid *fid,
+			    struct list_head *cancels, ldlm_mode_t mode,
+			    __u64 bits)
+{
+	struct ldlm_namespace *ns = exp->exp_obd->obd_namespace;
+	ldlm_policy_data_t ibits = {{0}};
+	struct ldlm_res_id resname;
+	struct ldlm_resource *resource;
+	int nr_added;
+	ENTRY;
+
+	/* Return, i.e. cancel nothing, only if ELC is supported (flag in
+	 * export) but disabled through procfs (flag in NS).
+	 *
+	 * This distinguishes from a case when ELC is not supported originally,
+	 * when we still want to cancel locks in advance and just cancel them
+	 * locally, without sending any RPC. */
+	if (exp_connect_cancelset(exp) && !ns_connect_cancelset(ns))
+		RETURN(0);
+
+	/* ibits policy describing which locks to pick */
+	ibits.l_inodebits.bits = bits;
+
+	fid_build_reg_res_name(fid, &resname);
+	resource = ldlm_resource_get(ns, NULL, &resname, 0, 0);
+	if (!resource)
+		RETURN(0);
+
+	LDLM_RESOURCE_ADDREF(resource);
+	nr_added = ldlm_cancel_resource_local(resource, cancels, &ibits,
+					      mode, 0, 0, NULL);
+	LDLM_RESOURCE_DELREF(resource);
+	ldlm_resource_putref(resource);
+
+	RETURN(nr_added);
+}
+
+/*
+ * Send an MDS_REINT setattr request and wait for the reply.
+ *
+ * Optionally collects local locks on op_fid1 to cancel with the request
+ * (when MF_MDC_CANCEL_FID1 is set), and, for an MF_EPOCH_OPEN setattr on a
+ * replayable import, attaches a freshly allocated md_open_data to the
+ * request so it can be replayed.
+ *
+ * \param exp          export to send the request on
+ * \param op_data      attributes and flags; must not be NULL
+ * \param ea, ealen    data for the RMF_EADATA request buffer
+ * \param ea2, ea2len  data for the RMF_LOGCOOKIES request buffer
+ * \param request      receives the request once it has been sent; not set
+ *                     on the early allocation / preparation failures
+ * \param mod          if non-NULL and replay data is set up, receives the
+ *                     new md_open_data; *mod must be NULL on entry
+ *
+ * \retval 0 on success (also for -ESTALE with MF_SOM_CHANGE, and for
+ *           -ERESTARTSYS)
+ * \retval negative errno otherwise
+ */
+int mdc_setattr(struct obd_export *exp, struct md_op_data *op_data,
+		void *ea, int ealen, void *ea2, int ea2len,
+		struct ptlrpc_request **request, struct md_open_data **mod)
+{
+	LIST_HEAD(cancels);
+	struct ptlrpc_request *req;
+	struct mdc_rpc_lock *rpc_lock;
+	struct obd_device *obd = exp->exp_obd;
+	int count = 0, rc;
+	__u64 bits;
+	ENTRY;
+
+	LASSERT(op_data != NULL);
+
+	/* mode/uid/gid changes additionally select LOOKUP-bit locks for
+	 * cancellation */
+	bits = MDS_INODELOCK_UPDATE;
+	if (op_data->op_attr.ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID))
+		bits |= MDS_INODELOCK_LOOKUP;
+	if ((op_data->op_flags & MF_MDC_CANCEL_FID1) &&
+	    (fid_is_sane(&op_data->op_fid1)) &&
+	    !OBD_FAIL_CHECK(OBD_FAIL_LDLM_BL_CALLBACK_NET))
+		count = mdc_resource_get_unused(exp, &op_data->op_fid1,
+						&cancels, LCK_EX, bits);
+	req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+				   &RQF_MDS_REINT_SETATTR);
+	if (req == NULL) {
+		ldlm_lock_list_put(&cancels, l_bl_ast, count);
+		RETURN(-ENOMEM);
+	}
+	mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+	/* no epoch buffer unless this is a SOM change or an epoch open */
+	if ((op_data->op_flags & (MF_SOM_CHANGE | MF_EPOCH_OPEN)) == 0)
+		req_capsule_set_size(&req->rq_pill, &RMF_MDT_EPOCH, RCL_CLIENT,
+				     0);
+	req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT, ealen);
+	req_capsule_set_size(&req->rq_pill, &RMF_LOGCOOKIES, RCL_CLIENT,
+			     ea2len);
+
+	rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count);
+	if (rc) {
+		ptlrpc_request_free(req);
+		RETURN(rc);
+	}
+
+	rpc_lock = obd->u.cli.cl_rpc_lock;
+
+	if (op_data->op_attr.ia_valid & (ATTR_MTIME | ATTR_CTIME))
+		CDEBUG(D_INODE, "setting mtime "CFS_TIME_T
+		       ", ctime "CFS_TIME_T"\n",
+		       LTIME_S(op_data->op_attr.ia_mtime),
+		       LTIME_S(op_data->op_attr.ia_ctime));
+	mdc_setattr_pack(req, op_data, ea, ealen, ea2, ea2len);
+
+	ptlrpc_request_set_replen(req);
+	if (mod && (op_data->op_flags & MF_EPOCH_OPEN) &&
+	    req->rq_import->imp_replayable)
+	{
+		LASSERT(*mod == NULL);
+
+		*mod = obd_mod_alloc();
+		if (*mod == NULL) {
+			/* allocation failure is only logged; the request is
+			 * still sent, without replay data */
+			DEBUG_REQ(D_ERROR, req, "Can't allocate "
+				  "md_open_data");
+		} else {
+			req->rq_replay = 1;
+			req->rq_cb_data = *mod;
+			(*mod)->mod_open_req = req;
+			req->rq_commit_cb = mdc_commit_open;
+			/**
+			 * Take an extra reference on \var mod, it protects \var
+			 * mod from being freed on eviction (commit callback is
+			 * called despite rq_replay flag).
+			 * Will be put on mdc_done_writing().
+			 */
+			obd_mod_get(*mod);
+		}
+	}
+
+	rc = mdc_reint(req, rpc_lock, LUSTRE_IMP_FULL);
+
+	/* Save the obtained info in the original RPC for the replay case. */
+	if (rc == 0 && (op_data->op_flags & MF_EPOCH_OPEN)) {
+		struct mdt_ioepoch *epoch;
+		struct mdt_body  *body;
+
+		epoch = req_capsule_client_get(&req->rq_pill, &RMF_MDT_EPOCH);
+		body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+		LASSERT(epoch != NULL);
+		LASSERT(body != NULL);
+		epoch->handle = body->handle;
+		epoch->ioepoch = body->ioepoch;
+		req->rq_replay_cb = mdc_replay_open;
+	/** bug 3633, open may be committed and estale answer is not error */
+	} else if (rc == -ESTALE && (op_data->op_flags & MF_SOM_CHANGE)) {
+		rc = 0;
+	} else if (rc == -ERESTARTSYS) {
+		rc = 0;
+	}
+	*request = req;
+	/* rq_commit_cb is only set above when *mod was allocated, so *mod is
+	 * valid whenever this branch is taken */
+	if (rc && req->rq_commit_cb) {
+		/* Put an extra reference on \var mod on error case. */
+		obd_mod_put(*mod);
+		req->rq_commit_cb(req);
+	}
+	RETURN(rc);
+}
+
+/*
+ * Send an MDS_REINT create request and wait for the reply.
+ *
+ * Allocates op_fid2 if the caller did not, optionally collects local locks
+ * on op_fid1 to cancel with the request, and retries: the same request is
+ * re-sent at LUSTRE_IMP_RECOVER level on -ERESTARTSYS, and a new request is
+ * built on -EINPROGRESS for as long as the import generation is unchanged.
+ *
+ * \param exp            export to send the request on
+ * \param op_data        operation data (parent fid, new fid, name, flags)
+ * \param data, datalen  payload for the RMF_EADATA buffer (symlink target
+ *                       or lov MD data per the packing comment below)
+ * \param mode, uid, gid, cap_effective, rdev
+ *                       passed through to mdc_create_pack()
+ * \param request        receives the request once it has been sent; not
+ *                       set on the early failure and -EIO returns
+ *
+ * \retval 0 on success
+ * \retval -EIO if an -EINPROGRESS resend would cross an import generation
+ *              change
+ * \retval -EPROTO if the reply announces an MDS capa but does not carry it
+ * \retval other negative errno
+ */
+int mdc_create(struct obd_export *exp, struct md_op_data *op_data,
+	       const void *data, int datalen, int mode, __u32 uid, __u32 gid,
+	       cfs_cap_t cap_effective, __u64 rdev,
+	       struct ptlrpc_request **request)
+{
+	struct ptlrpc_request *req;
+	int level, rc;
+	int count, resends = 0;
+	struct obd_import *import = exp->exp_obd->u.cli.cl_import;
+	int generation = import->imp_generation;
+	LIST_HEAD(cancels);
+	ENTRY;
+
+	/* For case if upper layer did not alloc fid, do it now. */
+	if (!fid_is_sane(&op_data->op_fid2)) {
+		/*
+		 * mdc_fid_alloc() may return errno 1 in case of switch to new
+		 * sequence, handle this.
+		 */
+		rc = mdc_fid_alloc(exp, &op_data->op_fid2, op_data);
+		if (rc < 0) {
+			CERROR("Can't alloc new fid, rc %d\n", rc);
+			RETURN(rc);
+		}
+	}
+
+	/* re-entered on -EINPROGRESS to build a fresh request */
+rebuild:
+	count = 0;
+	if ((op_data->op_flags & MF_MDC_CANCEL_FID1) &&
+	    (fid_is_sane(&op_data->op_fid1)))
+		count = mdc_resource_get_unused(exp, &op_data->op_fid1,
+						&cancels, LCK_EX,
+						MDS_INODELOCK_UPDATE);
+
+	req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+				   &RQF_MDS_REINT_CREATE_RMT_ACL);
+	if (req == NULL) {
+		ldlm_lock_list_put(&cancels, l_bl_ast, count);
+		RETURN(-ENOMEM);
+	}
+	mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+	/* + 1: one byte beyond op_namelen, presumably the terminating NUL */
+	req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT,
+			     op_data->op_namelen + 1);
+	req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT,
+			     data && datalen ? datalen : 0);
+
+	rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count);
+	if (rc) {
+		ptlrpc_request_free(req);
+		RETURN(rc);
+	}
+
+	/*
+	 * mdc_create_pack() fills msg->bufs[1] with name and msg->bufs[2] with
+	 * tgt, for symlinks or lov MD data.
+	 */
+	mdc_create_pack(req, op_data, data, datalen, mode, uid,
+			gid, cap_effective, rdev);
+
+	ptlrpc_request_set_replen(req);
+
+	/* ask ptlrpc not to resend on EINPROGRESS since we have our own retry
+	 * logic here */
+	req->rq_no_retry_einprogress = 1;
+
+	if (resends) {
+		/* pin the request to the original import generation and
+		 * set rq_sent "resends" seconds past the current time */
+		req->rq_generation_set = 1;
+		req->rq_import_generation = generation;
+		req->rq_sent = cfs_time_current_sec() + resends;
+	}
+	level = LUSTRE_IMP_FULL;
+	/* re-entered on -ERESTARTSYS to send the same request again */
+ resend:
+	rc = mdc_reint(req, exp->exp_obd->u.cli.cl_rpc_lock, level);
+
+	/* Resend if we were told to. */
+	if (rc == -ERESTARTSYS) {
+		level = LUSTRE_IMP_RECOVER;
+		goto resend;
+	} else if (rc == -EINPROGRESS) {
+		/* Retry create infinitely until succeed or get other
+		 * error code. */
+		ptlrpc_req_finished(req);
+		resends++;
+
+		CDEBUG(D_HA, "%s: resend:%d create on "DFID"/"DFID"\n",
+		       exp->exp_obd->obd_name, resends,
+		       PFID(&op_data->op_fid1), PFID(&op_data->op_fid2));
+
+		if (generation == import->imp_generation) {
+			goto rebuild;
+		} else {
+			CDEBUG(D_HA, "resend cross eviction\n");
+			RETURN(-EIO);
+		}
+	} else if (rc == 0) {
+		struct mdt_body *body;
+		struct lustre_capa *capa;
+
+		body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+		LASSERT(body);
+		if (body->valid & OBD_MD_FLMDSCAPA) {
+			/* only presence is checked; the capa is not used
+			 * here */
+			capa = req_capsule_server_get(&req->rq_pill,
+						      &RMF_CAPA1);
+			if (capa == NULL)
+				rc = -EPROTO;
+		}
+	}
+
+	*request = req;
+	RETURN(rc);
+}
+
+/**
+ * Build an MDS_REINT unlink request from \a op_data and pass it to
+ * mdc_reint().
+ *
+ * \a *request must be NULL on entry.  It is pointed at the new request just
+ * before mdc_reint() is called, so it is set on return whether or not the
+ * RPC succeeded; it stays NULL when the request could not be allocated or
+ * packed.  A result of -ERESTARTSYS from mdc_reint() is reported as 0.
+ */
+int mdc_unlink(struct obd_export *exp, struct md_op_data *op_data,
+	       struct ptlrpc_request **request)
+{
+	LIST_HEAD(cancels);
+	struct obd_device *obd = class_exp2obd(exp);
+	struct ptlrpc_request *req = *request;
+	int count = 0;
+	int rc;
+	ENTRY;
+
+	LASSERT(req == NULL);
+
+	/* Collect locks on fid1 (UPDATE bits) and fid3 (all bits) so their
+	 * cancels can be sent with the request. */
+	if (op_data->op_flags & MF_MDC_CANCEL_FID1) {
+		if (fid_is_sane(&op_data->op_fid1) &&
+		    !OBD_FAIL_CHECK(OBD_FAIL_LDLM_BL_CALLBACK_NET))
+			count += mdc_resource_get_unused(exp,
+							 &op_data->op_fid1,
+							 &cancels, LCK_EX,
+							 MDS_INODELOCK_UPDATE);
+	}
+	if (op_data->op_flags & MF_MDC_CANCEL_FID3) {
+		if (fid_is_sane(&op_data->op_fid3) &&
+		    !OBD_FAIL_CHECK(OBD_FAIL_LDLM_BL_CALLBACK_NET))
+			count += mdc_resource_get_unused(exp,
+							 &op_data->op_fid3,
+							 &cancels, LCK_EX,
+							 MDS_INODELOCK_FULL);
+	}
+
+	req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+				   &RQF_MDS_REINT_UNLINK);
+	if (req == NULL) {
+		/* nothing will carry the cancels; drop the collected locks */
+		ldlm_lock_list_put(&cancels, l_bl_ast, count);
+		RETURN(-ENOMEM);
+	}
+
+	/* Client-side buffer sizes have to be fixed before packing. */
+	mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+	req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT,
+			     op_data->op_namelen + 1);
+
+	rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count);
+	if (rc != 0) {
+		ptlrpc_request_free(req);
+		RETURN(rc);
+	}
+
+	mdc_unlink_pack(req, op_data);
+
+	/* Reserve reply room for the largest EA and cookie blobs. */
+	req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER,
+			     obd->u.cli.cl_max_mds_easize);
+	req_capsule_set_size(&req->rq_pill, &RMF_LOGCOOKIES, RCL_SERVER,
+			     obd->u.cli.cl_max_mds_cookiesize);
+	ptlrpc_request_set_replen(req);
+
+	*request = req;
+
+	rc = mdc_reint(req, obd->u.cli.cl_rpc_lock, LUSTRE_IMP_FULL);
+	if (rc == -ERESTARTSYS)
+		rc = 0;
+
+	RETURN(rc);
+}
+
+/**
+ * Build an MDS_REINT link request from \a op_data and pass it to
+ * mdc_reint().
+ *
+ * \a *request is set to the request once mdc_reint() has returned, whatever
+ * its result; it is not written when allocation or packing fails.  A result
+ * of -ERESTARTSYS from mdc_reint() is reported as 0.
+ */
+int mdc_link(struct obd_export *exp, struct md_op_data *op_data,
+	     struct ptlrpc_request **request)
+{
+	LIST_HEAD(cancels);
+	struct obd_device *obd = exp->exp_obd;
+	struct ptlrpc_request *req;
+	int count = 0;
+	int rc;
+	ENTRY;
+
+	/* Gather UPDATE locks on fid2, then fid1, for early cancel. */
+	if (op_data->op_flags & MF_MDC_CANCEL_FID2) {
+		if (fid_is_sane(&op_data->op_fid2))
+			count += mdc_resource_get_unused(exp,
+							 &op_data->op_fid2,
+							 &cancels, LCK_EX,
+							 MDS_INODELOCK_UPDATE);
+	}
+	if (op_data->op_flags & MF_MDC_CANCEL_FID1) {
+		if (fid_is_sane(&op_data->op_fid1))
+			count += mdc_resource_get_unused(exp,
+							 &op_data->op_fid1,
+							 &cancels, LCK_EX,
+							 MDS_INODELOCK_UPDATE);
+	}
+
+	req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_REINT_LINK);
+	if (req == NULL) {
+		/* nothing will carry the cancels; drop the collected locks */
+		ldlm_lock_list_put(&cancels, l_bl_ast, count);
+		RETURN(-ENOMEM);
+	}
+
+	/* Client-side buffer sizes have to be fixed before packing. */
+	mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+	mdc_set_capa_size(req, &RMF_CAPA2, op_data->op_capa2);
+	req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT,
+			     op_data->op_namelen + 1);
+
+	rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count);
+	if (rc != 0) {
+		ptlrpc_request_free(req);
+		RETURN(rc);
+	}
+
+	mdc_link_pack(req, op_data);
+	ptlrpc_request_set_replen(req);
+
+	rc = mdc_reint(req, obd->u.cli.cl_rpc_lock, LUSTRE_IMP_FULL);
+	*request = req;
+	if (rc == -ERESTARTSYS)
+		rc = 0;
+
+	RETURN(rc);
+}
+
+/**
+ * Build an MDS_REINT rename request and pass it to mdc_reint().
+ *
+ * \param old,oldlen	source name and its length (packed as RMF_NAME)
+ * \param new,newlen	target name and its length (packed as RMF_SYMTGT)
+ * \param[out] request	set to the request once mdc_reint() has returned;
+ *			not written when allocation or packing fails
+ *
+ * \retval 0 on success or when mdc_reint() returned -ERESTARTSYS,
+ *	   negative errno otherwise.
+ */
+int mdc_rename(struct obd_export *exp, struct md_op_data *op_data,
+	       const char *old, int oldlen, const char *new, int newlen,
+	       struct ptlrpc_request **request)
+{
+	LIST_HEAD(cancels);
+	struct obd_device *obd = exp->exp_obd;
+	struct ptlrpc_request *req;
+	int count = 0, rc;
+	ENTRY;
+
+	/* Collect locks to cancel along with the request: UPDATE on fid1 and
+	 * fid2, LOOKUP on fid3, all bits on fid4. */
+	if ((op_data->op_flags & MF_MDC_CANCEL_FID1) &&
+	    (fid_is_sane(&op_data->op_fid1)))
+		count = mdc_resource_get_unused(exp, &op_data->op_fid1,
+						&cancels, LCK_EX,
+						MDS_INODELOCK_UPDATE);
+	if ((op_data->op_flags & MF_MDC_CANCEL_FID2) &&
+	    (fid_is_sane(&op_data->op_fid2)))
+		count += mdc_resource_get_unused(exp, &op_data->op_fid2,
+						 &cancels, LCK_EX,
+						 MDS_INODELOCK_UPDATE);
+	if ((op_data->op_flags & MF_MDC_CANCEL_FID3) &&
+	    (fid_is_sane(&op_data->op_fid3)))
+		count += mdc_resource_get_unused(exp, &op_data->op_fid3,
+						 &cancels, LCK_EX,
+						 MDS_INODELOCK_LOOKUP);
+	if ((op_data->op_flags & MF_MDC_CANCEL_FID4) &&
+	     (fid_is_sane(&op_data->op_fid4)))
+		count += mdc_resource_get_unused(exp, &op_data->op_fid4,
+						 &cancels, LCK_EX,
+						 MDS_INODELOCK_FULL);
+
+	req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+				   &RQF_MDS_REINT_RENAME);
+	if (req == NULL) {
+		ldlm_lock_list_put(&cancels, l_bl_ast, count);
+		RETURN(-ENOMEM);
+	}
+
+	mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+	mdc_set_capa_size(req, &RMF_CAPA2, op_data->op_capa2);
+	req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, oldlen + 1);
+	req_capsule_set_size(&req->rq_pill, &RMF_SYMTGT, RCL_CLIENT, newlen+1);
+
+	rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count);
+	if (rc) {
+		ptlrpc_request_free(req);
+		RETURN(rc);
+	}
+
+	/* NOTE(review): req cannot be NULL here (checked right after the
+	 * allocation), and the cancel list was already handed to
+	 * mdc_prep_elc_req() above — confirm whether this second call is
+	 * still needed; the sibling reint functions do not make it. */
+	if (exp_connect_cancelset(exp) && req)
+		ldlm_cli_cancel_list(&cancels, count, req, 0);
+
+	mdc_rename_pack(req, op_data, old, oldlen, new, newlen);
+
+	/* Reserve reply room for the largest EA and cookie blobs. */
+	req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER,
+			     obd->u.cli.cl_max_mds_easize);
+	req_capsule_set_size(&req->rq_pill, &RMF_LOGCOOKIES, RCL_SERVER,
+			     obd->u.cli.cl_max_mds_cookiesize);
+	ptlrpc_request_set_replen(req);
+
+	rc = mdc_reint(req, obd->u.cli.cl_rpc_lock, LUSTRE_IMP_FULL);
+	*request = req;
+	if (rc == -ERESTARTSYS)
+		rc = 0;
+
+	RETURN(rc);
+}

diff --git a/drivers/staging/lustre/lustre/mdc/mdc_request.c b/drivers/staging/lustre/lustre/mdc/mdc_request.c
new file mode 100644
index 0000000..3cf9d8d
--- /dev/null
+++ b/drivers/staging/lustre/lustre/mdc/mdc_request.c

@@ -0,0 +1,2753 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_MDC
+
+# include <linux/module.h>
+# include <linux/pagemap.h>
+# include <linux/miscdevice.h>
+# include <linux/init.h>
+# include <linux/utsname.h>
+
+#include <lustre_acl.h>
+#include <obd_class.h>
+#include <lustre_fid.h>
+#include <lprocfs_status.h>
+#include <lustre_param.h>
+#include <lustre_log.h>
+
+#include "mdc_internal.h"
+
+#define REQUEST_MINOR 244
+
+/*
+ * Pairs a capability with a renew callback.  Presumably the per-request
+ * argument block of an asynchronous capability-renewal RPC — the users are
+ * not visible in this part of the file, confirm there.
+ */
+struct mdc_renew_capa_args {
+	struct obd_capa	*ra_oc;		/* capability the callback refers to */
+	renew_capa_cb_t	 ra_cb;		/* callback of type renew_capa_cb_t */
+};
+
+static int mdc_cleanup(struct obd_device *obd);
+
+/**
+ * Copy the capability held in reply field \a field of \a req into a freshly
+ * allocated client-side obd_capa and return it through \a oc.
+ *
+ * \a exp is not used.  \a *oc is written on success only.
+ *
+ * \retval 0		on success
+ * \retval -EPROTO	the reply has no such field
+ * \retval negative	error encoded in the pointer from alloc_capa()
+ */
+int mdc_unpack_capa(struct obd_export *exp, struct ptlrpc_request *req,
+		    const struct req_msg_field *field, struct obd_capa **oc)
+{
+	struct lustre_capa *wire_capa;
+	struct obd_capa *ocapa;
+	ENTRY;
+
+	/* swabbed already in mdc_enqueue */
+	wire_capa = req_capsule_server_get(&req->rq_pill, field);
+	if (wire_capa == NULL)
+		RETURN(-EPROTO);
+
+	ocapa = alloc_capa(CAPA_SITE_CLIENT);
+	if (IS_ERR(ocapa)) {
+		CDEBUG(D_INFO, "alloc capa failed!\n");
+		RETURN(PTR_ERR(ocapa));
+	}
+
+	ocapa->c_capa = *wire_capa;
+	*oc = ocapa;
+	RETURN(0);
+}
+
+/*
+ * Send \a req with ptlrpc_queue_wait(), bracketed by mdc_enter_request() and
+ * mdc_exit_request() on the client_obd of the request's import.
+ *
+ * mdc_enter_request() ensures that this client has no more than
+ * cl_max_rpcs_in_flight RPCs simultaneously in flight against an MDT.  If it
+ * fails, its error is returned and the request is not sent.
+ */
+static inline int mdc_queue_wait(struct ptlrpc_request *req)
+{
+	struct client_obd *cli = &req->rq_import->imp_obd->u.cli;
+	int rc;
+
+	rc = mdc_enter_request(cli);
+	if (rc == 0) {
+		rc = ptlrpc_queue_wait(req);
+		mdc_exit_request(cli);
+	}
+
+	return rc;
+}
+
+/*
+ * Helper that implements most of mdc_getstatus and signal_completed_replay.
+ * XXX this should become mdc_get_info("key"), sending MDS_GET_INFO RPC
+ *
+ * Sends an MDS_GETSTATUS RPC on \a imp and waits for the reply.  On success
+ * the fid1 of the reply body is copied to \a rootfid and, if the reply flags
+ * an MDS capability, that capability is unpacked into \a pc.
+ *
+ * \param level		stored in the request's rq_send_state
+ * \param msg_flags	extra flags added to the request message
+ *
+ * \retval 0 on success, -ENOMEM / -EPROTO / error from the RPC otherwise.
+ * The request is always released before returning.
+ */
+static int send_getstatus(struct obd_import *imp, struct lu_fid *rootfid,
+			  struct obd_capa **pc, int level, int msg_flags)
+{
+	struct ptlrpc_request *req;
+	struct mdt_body       *body;
+	int		    rc;
+	ENTRY;
+
+	req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_GETSTATUS,
+					LUSTRE_MDS_VERSION, MDS_GETSTATUS);
+	if (req == NULL)
+		RETURN(-ENOMEM);
+
+	mdc_pack_body(req, NULL, NULL, 0, 0, -1, 0);
+	lustre_msg_add_flags(req->rq_reqmsg, msg_flags);
+	req->rq_send_state = level;
+
+	ptlrpc_request_set_replen(req);
+
+	rc = ptlrpc_queue_wait(req);
+	if (rc)
+		GOTO(out, rc);
+
+	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+	if (body == NULL)
+		GOTO(out, rc = -EPROTO);
+
+	if (body->valid & OBD_MD_FLMDSCAPA) {
+		rc = mdc_unpack_capa(NULL, req, &RMF_CAPA1, pc);
+		if (rc)
+			GOTO(out, rc);
+	}
+
+	/* body points into the reply, so copy before the request is freed */
+	*rootfid = body->fid1;
+	CDEBUG(D_NET,
+	       "root fid="DFID", last_committed="LPU64"\n",
+	       PFID(rootfid),
+	       lustre_msg_get_last_committed(req->rq_repmsg));
+	EXIT;
+out:
+	ptlrpc_req_finished(req);
+	return rc;
+}
+
+/*
+ * Fetch the root FID (and, when the server sends one, its capability) via
+ * send_getstatus() at LUSTRE_IMP_FULL with no extra message flags.
+ * This should be mdc_get_info("rootfid").
+ */
+int mdc_getstatus(struct obd_export *exp, struct lu_fid *rootfid,
+		  struct obd_capa **pc)
+{
+	struct obd_import *imp = class_exp2cliimp(exp);
+
+	return send_getstatus(imp, rootfid, pc, LUSTRE_IMP_FULL, 0);
+}
+
+/*
+ * Send an already built getattr request and sanity-check the reply.
+ *
+ * This function is known to always announce that it will receive 4 buffers
+ * from the server.  Even when acl_size and md_size are zero, the RPC header
+ * contains 4 fields and the RPC itself carries zero-sized fields.  This is
+ * because mdt_getattr*() _always_ returns 4 fields; when the ACL is not
+ * needed it is shrunk to zero size, and the same goes for md_size.  That is
+ * the cause of problems when the client waits for a smaller number of
+ * fields.  This issue will be fixed later when the client becomes aware of
+ * RPC layouts.  --umka
+ *
+ * Returns 0 when the reply holds every field its body advertises, -EPROTO
+ * when one is missing, or the error from ptlrpc_queue_wait().
+ */
+static int mdc_getattr_common(struct obd_export *exp,
+			      struct ptlrpc_request *req)
+{
+	struct req_capsule *capsule = &req->rq_pill;
+	struct mdt_body *reply;
+	int rc;
+	ENTRY;
+
+	/* Request message already built. */
+	rc = ptlrpc_queue_wait(req);
+	if (rc != 0)
+		RETURN(rc);
+
+	/* sanity check for the reply */
+	reply = req_capsule_server_get(capsule, &RMF_MDT_BODY);
+	if (reply == NULL)
+		RETURN(-EPROTO);
+
+	CDEBUG(D_NET, "mode: %o\n", reply->mode);
+
+	/* EA data: advertised size must be backed by a real buffer. */
+	if (reply->eadatasize != 0) {
+		mdc_update_max_ea_from_body(exp, reply);
+
+		if (req_capsule_server_sized_get(capsule, &RMF_MDT_MD,
+						 reply->eadatasize) == NULL)
+			RETURN(-EPROTO);
+	}
+
+	/* Remote permission travels in the ACL field and needs swabbing. */
+	if (reply->valid & OBD_MD_FLRMTPERM) {
+		LASSERT(client_is_remote(exp));
+		if (req_capsule_server_swab_get(capsule, &RMF_ACL,
+					lustre_swab_mdt_remote_perm) == NULL)
+			RETURN(-EPROTO);
+	}
+
+	/* MDS capability, when flagged, must be present as well. */
+	if (reply->valid & OBD_MD_FLMDSCAPA) {
+		if (req_capsule_server_get(capsule, &RMF_CAPA1) == NULL)
+			RETURN(-EPROTO);
+	}
+
+	RETURN(0);
+}
+
+/**
+ * Fetch attributes of op_fid1 with an MDS_GETATTR RPC.
+ *
+ * When MF_GET_MDT_IDX is set in op_flags no RPC is sent: op_mds is set to 0
+ * and 0 is returned without touching \a *request.  Otherwise \a *request is
+ * NULL on failure and holds the request (with its reply) on success.
+ *
+ * op_data->op_mode is used as the reply buffer size for the MDT_MD field.
+ */
+int mdc_getattr(struct obd_export *exp, struct md_op_data *op_data,
+		struct ptlrpc_request **request)
+{
+	struct ptlrpc_request *getattr_req;
+	int rc;
+	ENTRY;
+
+	/* Single MDS without an LMV case */
+	if (op_data->op_flags & MF_GET_MDT_IDX) {
+		op_data->op_mds = 0;
+		RETURN(0);
+	}
+
+	*request = NULL;
+
+	getattr_req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+					   &RQF_MDS_GETATTR);
+	if (getattr_req == NULL)
+		RETURN(-ENOMEM);
+
+	mdc_set_capa_size(getattr_req, &RMF_CAPA1, op_data->op_capa1);
+
+	rc = ptlrpc_request_pack(getattr_req, LUSTRE_MDS_VERSION, MDS_GETATTR);
+	if (rc != 0) {
+		ptlrpc_request_free(getattr_req);
+		RETURN(rc);
+	}
+
+	mdc_pack_body(getattr_req, &op_data->op_fid1, op_data->op_capa1,
+		      op_data->op_valid, op_data->op_mode, -1, 0);
+
+	/* Size the reply buffers before the reply length is computed. */
+	req_capsule_set_size(&getattr_req->rq_pill, &RMF_MDT_MD, RCL_SERVER,
+			     op_data->op_mode);
+	if (op_data->op_valid & OBD_MD_FLRMTPERM) {
+		LASSERT(client_is_remote(exp));
+		req_capsule_set_size(&getattr_req->rq_pill, &RMF_ACL,
+				     RCL_SERVER,
+				     sizeof(struct mdt_remote_perm));
+	}
+	ptlrpc_request_set_replen(getattr_req);
+
+	rc = mdc_getattr_common(exp, getattr_req);
+	if (rc == 0)
+		*request = getattr_req;
+	else
+		ptlrpc_req_finished(getattr_req);
+
+	RETURN(rc);
+}
+
+/**
+ * Fetch attributes by name with an MDS_GETATTR_NAME RPC: op_fid1 goes into
+ * the request body and op_name (op_namelen bytes) into the name buffer.
+ *
+ * \a *request is NULL on failure and holds the request (with its reply) on
+ * success.  op_data->op_mode is used as the reply buffer size for the
+ * MDT_MD field.
+ */
+int mdc_getattr_name(struct obd_export *exp, struct md_op_data *op_data,
+		     struct ptlrpc_request **request)
+{
+	struct ptlrpc_request *getattr_req;
+	int rc;
+	ENTRY;
+
+	*request = NULL;
+
+	getattr_req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+					   &RQF_MDS_GETATTR_NAME);
+	if (getattr_req == NULL)
+		RETURN(-ENOMEM);
+
+	/* Client-side buffer sizes have to be fixed before packing. */
+	mdc_set_capa_size(getattr_req, &RMF_CAPA1, op_data->op_capa1);
+	req_capsule_set_size(&getattr_req->rq_pill, &RMF_NAME, RCL_CLIENT,
+			     op_data->op_namelen + 1);
+
+	rc = ptlrpc_request_pack(getattr_req, LUSTRE_MDS_VERSION,
+				 MDS_GETATTR_NAME);
+	if (rc != 0) {
+		ptlrpc_request_free(getattr_req);
+		RETURN(rc);
+	}
+
+	mdc_pack_body(getattr_req, &op_data->op_fid1, op_data->op_capa1,
+		      op_data->op_valid, op_data->op_mode,
+		      op_data->op_suppgids[0], 0);
+
+	if (op_data->op_name) {
+		char *dst;
+
+		dst = req_capsule_client_get(&getattr_req->rq_pill, &RMF_NAME);
+		/* the name must not contain a NUL within op_namelen bytes */
+		LASSERT(strnlen(op_data->op_name, op_data->op_namelen) ==
+				op_data->op_namelen);
+		memcpy(dst, op_data->op_name, op_data->op_namelen);
+	}
+
+	req_capsule_set_size(&getattr_req->rq_pill, &RMF_MDT_MD, RCL_SERVER,
+			     op_data->op_mode);
+	ptlrpc_request_set_replen(getattr_req);
+
+	rc = mdc_getattr_common(exp, getattr_req);
+	if (rc == 0)
+		*request = getattr_req;
+	else
+		ptlrpc_req_finished(getattr_req);
+
+	RETURN(rc);
+}
+
+/*
+ * Send an MDS_IS_SUBDIR RPC for the pair \a pfid / \a cfid.
+ *
+ * The request is returned through \a request when the RPC result is 0 or
+ * -EREMOTE; for any other result it is released and \a *request is NULL.
+ * The RPC result itself is returned unchanged.
+ */
+static int mdc_is_subdir(struct obd_export *exp,
+			 const struct lu_fid *pfid,
+			 const struct lu_fid *cfid,
+			 struct ptlrpc_request **request)
+{
+	struct ptlrpc_request *subdir_req;
+	int rc;
+	ENTRY;
+
+	*request = NULL;
+
+	subdir_req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp),
+					       &RQF_MDS_IS_SUBDIR,
+					       LUSTRE_MDS_VERSION,
+					       MDS_IS_SUBDIR);
+	if (subdir_req == NULL)
+		RETURN(-ENOMEM);
+
+	mdc_is_subdir_pack(subdir_req, pfid, cfid, 0);
+	ptlrpc_request_set_replen(subdir_req);
+
+	rc = ptlrpc_queue_wait(subdir_req);
+	if (rc == 0 || rc == -EREMOTE)
+		*request = subdir_req;
+	else
+		ptlrpc_req_finished(subdir_req);
+
+	RETURN(rc);
+}
+
+/*
+ * Common body of mdc_setxattr() and mdc_getxattr().
+ *
+ * Builds a request of format \a fmt with opcode \a opcode, optionally
+ * carrying \a xattr_name (with its terminating NUL) and \a input_size bytes
+ * of \a input, sends it and waits for the reply.
+ *
+ * For opcode MDS_REINT a setxattr reint record is filled in and the RPC is
+ * sent under the client's cl_rpc_lock; for any other opcode a plain mdt body
+ * is packed and no lock is taken.
+ *
+ * \param output_size	stored in the record/body and, when the format has a
+ *			server-side EADATA field, used as its reply size
+ * \param[out] request	the request on success, NULL on failure
+ *
+ * \retval 0 on success, negative errno otherwise.
+ */
+static int mdc_xattr_common(struct obd_export *exp,const struct req_format *fmt,
+			    const struct lu_fid *fid,
+			    struct obd_capa *oc, int opcode, obd_valid valid,
+			    const char *xattr_name, const char *input,
+			    int input_size, int output_size, int flags,
+			    __u32 suppgid, struct ptlrpc_request **request)
+{
+	struct ptlrpc_request *req;
+	int   xattr_namelen = 0;
+	char *tmp;
+	int   rc;
+	ENTRY;
+
+	*request = NULL;
+	req = ptlrpc_request_alloc(class_exp2cliimp(exp), fmt);
+	if (req == NULL)
+		RETURN(-ENOMEM);
+
+	/* Client-side buffer sizes have to be fixed before packing. */
+	mdc_set_capa_size(req, &RMF_CAPA1, oc);
+	if (xattr_name) {
+		/* length includes the terminating NUL */
+		xattr_namelen = strlen(xattr_name) + 1;
+		req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT,
+				     xattr_namelen);
+	}
+	if (input_size) {
+		LASSERT(input);
+		req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT,
+				     input_size);
+	}
+
+	rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, opcode);
+	if (rc) {
+		ptlrpc_request_free(req);
+		RETURN(rc);
+	}
+
+	if (opcode == MDS_REINT) {
+		struct mdt_rec_setxattr *rec;
+
+		/* the setxattr record overlays the generic reint record */
+		CLASSERT(sizeof(struct mdt_rec_setxattr) ==
+			 sizeof(struct mdt_rec_reint));
+		rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT);
+		rec->sx_opcode = REINT_SETXATTR;
+		/* TODO:
+		 *  cfs_curproc_fs{u,g}id() should replace
+		 *  current->fs{u,g}id for portability.
+		 */
+		rec->sx_fsuid  = current_fsuid();
+		rec->sx_fsgid  = current_fsgid();
+		rec->sx_cap    = cfs_curproc_cap_pack();
+		rec->sx_suppgid1 = suppgid;
+		rec->sx_suppgid2 = -1;
+		rec->sx_fid    = *fid;
+		rec->sx_valid  = valid | OBD_MD_FLCTIME;
+		rec->sx_time   = cfs_time_current_sec();
+		rec->sx_size   = output_size;
+		rec->sx_flags  = flags;
+
+		mdc_pack_capa(req, &RMF_CAPA1, oc);
+	} else {
+		mdc_pack_body(req, fid, oc, valid, output_size, suppgid, flags);
+	}
+
+	if (xattr_name) {
+		tmp = req_capsule_client_get(&req->rq_pill, &RMF_NAME);
+		memcpy(tmp, xattr_name, xattr_namelen);
+	}
+	if (input_size) {
+		tmp = req_capsule_client_get(&req->rq_pill, &RMF_EADATA);
+		memcpy(tmp, input, input_size);
+	}
+
+	/* not every format passed in has a reply-side EADATA field */
+	if (req_capsule_has_field(&req->rq_pill, &RMF_EADATA, RCL_SERVER))
+		req_capsule_set_size(&req->rq_pill, &RMF_EADATA,
+				     RCL_SERVER, output_size);
+	ptlrpc_request_set_replen(req);
+
+	/* make rpc */
+	if (opcode == MDS_REINT)
+		mdc_get_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL);
+
+	rc = ptlrpc_queue_wait(req);
+
+	if (opcode == MDS_REINT)
+		mdc_put_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL);
+
+	if (rc)
+		ptlrpc_req_finished(req);
+	else
+		*request = req;
+	RETURN(rc);
+}
+
+/**
+ * Set an extended attribute: mdc_xattr_common() with the
+ * RQF_MDS_REINT_SETXATTR format and the MDS_REINT opcode.  All other
+ * arguments are passed through unchanged.
+ */
+int mdc_setxattr(struct obd_export *exp, const struct lu_fid *fid,
+		 struct obd_capa *oc, obd_valid valid, const char *xattr_name,
+		 const char *input, int input_size, int output_size,
+		 int flags, __u32 suppgid, struct ptlrpc_request **request)
+{
+	int rc;
+
+	rc = mdc_xattr_common(exp, &RQF_MDS_REINT_SETXATTR, fid, oc,
+			      MDS_REINT, valid, xattr_name, input, input_size,
+			      output_size, flags, suppgid, request);
+	return rc;
+}
+
+/**
+ * Read an extended attribute: mdc_xattr_common() with the RQF_MDS_GETXATTR
+ * format, the MDS_GETXATTR opcode and -1 as supplementary group.  All other
+ * arguments are passed through unchanged.
+ */
+int mdc_getxattr(struct obd_export *exp, const struct lu_fid *fid,
+		 struct obd_capa *oc, obd_valid valid, const char *xattr_name,
+		 const char *input, int input_size, int output_size,
+		 int flags, struct ptlrpc_request **request)
+{
+	int rc;
+
+	rc = mdc_xattr_common(exp, &RQF_MDS_GETXATTR, fid, oc,
+			      MDS_GETXATTR, valid, xattr_name, input,
+			      input_size, output_size, flags, -1, request);
+	return rc;
+}
+
+#ifdef CONFIG_FS_POSIX_ACL
+/*
+ * Turn the ACL segment of the reply in \a req into a posix_acl and store it
+ * in md->posix_acl.  md->body must already point at the reply body; when its
+ * aclsize is zero nothing is done and 0 is returned.
+ *
+ * Returns 0 on success, -EPROTO if the segment is missing, or the error from
+ * posix_acl_from_xattr() / posix_acl_valid().  md->posix_acl is written on
+ * success only.
+ */
+static int mdc_unpack_acl(struct ptlrpc_request *req, struct lustre_md *md)
+{
+	struct mdt_body *reply = md->body;
+	struct posix_acl *pacl;
+	void *xattr;
+	int rc;
+	ENTRY;
+
+	if (reply->aclsize == 0)
+		RETURN(0);
+
+	xattr = req_capsule_server_sized_get(&req->rq_pill, &RMF_ACL,
+					     reply->aclsize);
+	if (xattr == NULL)
+		RETURN(-EPROTO);
+
+	pacl = posix_acl_from_xattr(&init_user_ns, xattr, reply->aclsize);
+	if (IS_ERR(pacl)) {
+		rc = PTR_ERR(pacl);
+		CERROR("convert xattr to acl: %d\n", rc);
+		RETURN(rc);
+	}
+
+	rc = posix_acl_valid(pacl);
+	if (rc != 0) {
+		CERROR("validate acl: %d\n", rc);
+		posix_acl_release(pacl);
+		RETURN(rc);
+	}
+
+	md->posix_acl = pacl;
+	RETURN(0);
+}
+#else
+/* Without POSIX ACL support unpacking is a no-op that reports success. */
+#define mdc_unpack_acl(req, md) 0
+#endif
+
+/**
+ * Unpack the metadata carried by the reply of \a req into \a md.
+ *
+ * \a md is zeroed first.  Depending on the flags in the reply body the
+ * following are filled in: md->lsm (regular file striping, unpacked through
+ * \a dt_exp), md->mea (directory EA, unpacked through \a md_exp), either
+ * md->remote_perm or md->posix_acl, and md->mds_capa / md->oss_capa.
+ *
+ * On failure the capabilities, the ACL and the lsm unpacked so far are
+ * released and their pointers in \a md are reset, so the caller never sees a
+ * pointer to a released object.
+ * NOTE(review): md->mea is not released on the error path — confirm whether
+ * that is intended.
+ *
+ * \retval 0		on success
+ * \retval -EPROTO	reply is inconsistent with the flags it sets
+ * \retval negative	other errors from the unpack helpers
+ */
+int mdc_get_lustre_md(struct obd_export *exp, struct ptlrpc_request *req,
+		      struct obd_export *dt_exp, struct obd_export *md_exp,
+		      struct lustre_md *md)
+{
+	struct req_capsule *pill = &req->rq_pill;
+	int rc;
+	ENTRY;
+
+	LASSERT(md);
+	memset(md, 0, sizeof(*md));
+
+	md->body = req_capsule_server_get(pill, &RMF_MDT_BODY);
+	LASSERT(md->body != NULL);
+
+	if (md->body->valid & OBD_MD_FLEASIZE) {
+		int lmmsize;
+		struct lov_mds_md *lmm;
+
+		if (!S_ISREG(md->body->mode)) {
+			CDEBUG(D_INFO, "OBD_MD_FLEASIZE set, should be a "
+			       "regular file, but is not\n");
+			GOTO(out, rc = -EPROTO);
+		}
+
+		if (md->body->eadatasize == 0) {
+			CDEBUG(D_INFO, "OBD_MD_FLEASIZE set, "
+			       "but eadatasize 0\n");
+			GOTO(out, rc = -EPROTO);
+		}
+		lmmsize = md->body->eadatasize;
+		lmm = req_capsule_server_sized_get(pill, &RMF_MDT_MD, lmmsize);
+		if (!lmm)
+			GOTO(out, rc = -EPROTO);
+
+		/* a non-negative rc is the size of the unpacked lsm */
+		rc = obd_unpackmd(dt_exp, &md->lsm, lmm, lmmsize);
+		if (rc < 0)
+			GOTO(out, rc);
+
+		if (rc < (int)sizeof(*md->lsm)) {
+			CDEBUG(D_INFO, "lsm size too small: "
+			       "rc < sizeof (*md->lsm) (%d < %d)\n",
+			       rc, (int)sizeof(*md->lsm));
+			GOTO(out, rc = -EPROTO);
+		}
+
+	} else if (md->body->valid & OBD_MD_FLDIREA) {
+		int lmvsize;
+		struct lov_mds_md *lmv;
+
+		if (!S_ISDIR(md->body->mode)) {
+			CDEBUG(D_INFO, "OBD_MD_FLDIREA set, should be a "
+			       "directory, but is not\n");
+			GOTO(out, rc = -EPROTO);
+		}
+
+		if (md->body->eadatasize == 0) {
+			CDEBUG(D_INFO, "OBD_MD_FLDIREA is set, "
+			       "but eadatasize 0\n");
+			/* leave through the common exit like every other
+			 * error in this function */
+			GOTO(out, rc = -EPROTO);
+		}
+		if (md->body->valid & OBD_MD_MEA) {
+			lmvsize = md->body->eadatasize;
+			lmv = req_capsule_server_sized_get(pill, &RMF_MDT_MD,
+							   lmvsize);
+			if (!lmv)
+				GOTO(out, rc = -EPROTO);
+
+			rc = obd_unpackmd(md_exp, (void *)&md->mea, lmv,
+					  lmvsize);
+			if (rc < 0)
+				GOTO(out, rc);
+
+			if (rc < (int)sizeof(*md->mea)) {
+				CDEBUG(D_INFO, "size too small:  "
+				       "rc < sizeof(*md->mea) (%d < %d)\n",
+					rc, (int)sizeof(*md->mea));
+				GOTO(out, rc = -EPROTO);
+			}
+		}
+	}
+	/* rc may hold a positive size from obd_unpackmd(); start afresh */
+	rc = 0;
+
+	if (md->body->valid & OBD_MD_FLRMTPERM) {
+		/* remote permission */
+		LASSERT(client_is_remote(exp));
+		md->remote_perm = req_capsule_server_swab_get(pill, &RMF_ACL,
+						lustre_swab_mdt_remote_perm);
+		if (!md->remote_perm)
+			GOTO(out, rc = -EPROTO);
+	}
+	else if (md->body->valid & OBD_MD_FLACL) {
+		/* for ACL, it's possible that FLACL is set but aclsize is zero.
+		 * only when aclsize != 0 there's an actual segment for ACL
+		 * in reply buffer.
+		 */
+		if (md->body->aclsize) {
+			rc = mdc_unpack_acl(req, md);
+			if (rc)
+				GOTO(out, rc);
+#ifdef CONFIG_FS_POSIX_ACL
+		} else {
+			md->posix_acl = NULL;
+#endif
+		}
+	}
+	if (md->body->valid & OBD_MD_FLMDSCAPA) {
+		struct obd_capa *oc = NULL;
+
+		rc = mdc_unpack_capa(NULL, req, &RMF_CAPA1, &oc);
+		if (rc)
+			GOTO(out, rc);
+		md->mds_capa = oc;
+	}
+
+	if (md->body->valid & OBD_MD_FLOSSCAPA) {
+		struct obd_capa *oc = NULL;
+
+		rc = mdc_unpack_capa(NULL, req, &RMF_CAPA2, &oc);
+		if (rc)
+			GOTO(out, rc);
+		md->oss_capa = oc;
+	}
+
+	EXIT;
+out:
+	if (rc) {
+		if (md->oss_capa) {
+			capa_put(md->oss_capa);
+			md->oss_capa = NULL;
+		}
+		if (md->mds_capa) {
+			capa_put(md->mds_capa);
+			md->mds_capa = NULL;
+		}
+#ifdef CONFIG_FS_POSIX_ACL
+		/* reset the pointer too: the ACL may be freed by the release
+		 * and md stays with the caller */
+		posix_acl_release(md->posix_acl);
+		md->posix_acl = NULL;
+#endif
+		if (md->lsm)
+			obd_free_memmd(dt_exp, &md->lsm);
+	}
+	return rc;
+}
+
+/**
+ * Counterpart of mdc_get_lustre_md().  Nothing is released here: both
+ * arguments are ignored and 0 is always returned.
+ */
+int mdc_free_lustre_md(struct obd_export *exp, struct lustre_md *md)
+{
+	ENTRY;
+	RETURN(0);
+}
+
+/**
+ * Handles both OPEN and SETATTR RPCs for OPEN-CLOSE and SETATTR-DONE_WRITING
+ * RPC chains.
+ *
+ * Installed as rq_replay_cb.  Copies the file handle from the reply body of
+ * \a req into the client handle attached to the request's md_open_data and
+ * into the epoch buffer of the queued close/done-writing request, if any.
+ * Does nothing (beyond logging) when the request has no md_open_data.
+ */
+void mdc_replay_open(struct ptlrpc_request *req)
+{
+	struct md_open_data *mod = req->rq_cb_data;
+	struct ptlrpc_request *close_req;
+	struct obd_client_handle *och;
+	/* only assigned, and only read, when och != NULL */
+	struct lustre_handle old;
+	struct mdt_body *body;
+	ENTRY;
+
+	if (mod == NULL) {
+		DEBUG_REQ(D_ERROR, req,
+			  "Can't properly replay without open data.");
+		EXIT;
+		return;
+	}
+
+	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+	LASSERT(body != NULL);
+
+	och = mod->mod_och;
+	if (och != NULL) {
+		struct lustre_handle *file_fh;
+
+		LASSERT(och->och_magic == OBD_CLIENT_HANDLE_MAGIC);
+
+		file_fh = &och->och_fh;
+		CDEBUG(D_HA, "updating handle from "LPX64" to "LPX64"\n",
+		       file_fh->cookie, body->handle.cookie);
+		/* keep the previous handle for the consistency check below */
+		old = *file_fh;
+		*file_fh = body->handle;
+	}
+	close_req = mod->mod_close_req;
+	if (close_req != NULL) {
+		__u32 opc = lustre_msg_get_opc(close_req->rq_reqmsg);
+		struct mdt_ioepoch *epoch;
+
+		LASSERT(opc == MDS_CLOSE || opc == MDS_DONE_WRITING);
+		epoch = req_capsule_client_get(&close_req->rq_pill,
+					       &RMF_MDT_EPOCH);
+		LASSERT(epoch);
+
+		/* the close must still carry the handle we just replaced */
+		if (och != NULL)
+			LASSERT(!memcmp(&old, &epoch->handle, sizeof(old)));
+		DEBUG_REQ(D_HA, close_req, "updating close body with new fh");
+		epoch->handle = body->handle;
+	}
+	EXIT;
+}
+
+/**
+ * Commit callback for open requests (installed as rq_commit_cb by
+ * mdc_set_open_replay_data()).
+ *
+ * Takes an extra reference on \a req, marks it committed, detaches the
+ * md_open_data from it and drops one reference on that md_open_data.  Does
+ * nothing when the request has no md_open_data attached.
+ */
+void mdc_commit_open(struct ptlrpc_request *req)
+{
+	struct md_open_data *mod = req->rq_cb_data;
+	if (mod == NULL)
+		return;
+
+	/**
+	 * No need to touch md_open_data::mod_och, it holds a reference on
+	 * \var mod and will zero references to each other, \var mod will be
+	 * freed after that when md_open_data::mod_och will put the reference.
+	 */
+
+	/**
+	 * Do not let open request to disappear as it still may be needed
+	 * for close rpc to happen (it may happen on evict only, otherwise
+	 * ptlrpc_request::rq_replay does not let mdc_commit_open() to be
+	 * called), just mark this rpc as committed to distinguish these 2
+	 * cases, see mdc_close() for details. The open request reference will
+	 * be put along with freeing \var mod.
+	 */
+	ptlrpc_request_addref(req);
+	spin_lock(&req->rq_lock);
+	req->rq_committed = 1;
+	spin_unlock(&req->rq_lock);
+	/* detach mod from the request before dropping a reference on it */
+	req->rq_cb_data = NULL;
+	obd_mod_put(mod);
+}
+
+/**
+ * Prepare a completed open request for replay.
+ *
+ * Does nothing unless \a open_req has rq_replay set.  When \a och is given
+ * and the import is replayable, a md_open_data is allocated and linked to
+ * both \a och and \a open_req, and mdc_commit_open() is installed as the
+ * commit callback.  In every replay case the reint record of the request is
+ * updated with the fid, ioepoch and handle from the reply, and
+ * mdc_replay_open() is installed as the replay callback.
+ *
+ * \a exp is not used.
+ *
+ * \retval 0 always, including when the md_open_data allocation fails (in
+ *	   which case the request is left unchanged).
+ */
+int mdc_set_open_replay_data(struct obd_export *exp,
+			     struct obd_client_handle *och,
+			     struct ptlrpc_request *open_req)
+{
+	struct md_open_data   *mod;
+	struct mdt_rec_create *rec;
+	struct mdt_body       *body;
+	struct obd_import     *imp = open_req->rq_import;
+	ENTRY;
+
+	if (!open_req->rq_replay)
+		RETURN(0);
+
+	rec = req_capsule_client_get(&open_req->rq_pill, &RMF_REC_REINT);
+	body = req_capsule_server_get(&open_req->rq_pill, &RMF_MDT_BODY);
+	LASSERT(rec != NULL);
+	/* Incoming message in my byte order (it's been swabbed). */
+	/* Outgoing messages always in my byte order. */
+	LASSERT(body != NULL);
+
+	/* Only if the import is replayable, we set replay_open data */
+	if (och && imp->imp_replayable) {
+		mod = obd_mod_alloc();
+		if (mod == NULL) {
+			DEBUG_REQ(D_ERROR, open_req,
+				  "Can't allocate md_open_data");
+			/* the failure is logged but not reported */
+			RETURN(0);
+		}
+
+		/**
+		 * Take a reference on \var mod, to be freed on mdc_close().
+		 * It protects \var mod from being freed on eviction (commit
+		 * callback is called despite rq_replay flag).
+		 * Another reference for \var och.
+		 */
+		obd_mod_get(mod);
+		obd_mod_get(mod);
+
+		/* cross-link och, mod and the request under the request lock */
+		spin_lock(&open_req->rq_lock);
+		och->och_mod = mod;
+		mod->mod_och = och;
+		mod->mod_open_req = open_req;
+		open_req->rq_cb_data = mod;
+		open_req->rq_commit_cb = mdc_commit_open;
+		spin_unlock(&open_req->rq_lock);
+	}
+
+	/* rewrite the request with what the server actually assigned */
+	rec->cr_fid2 = body->fid1;
+	rec->cr_ioepoch = body->ioepoch;
+	rec->cr_old_handle.cookie = body->handle.cookie;
+	open_req->rq_replay_cb = mdc_replay_open;
+	if (!fid_is_sane(&body->fid1)) {
+		DEBUG_REQ(D_ERROR, open_req, "Saving replay request with "
+			  "insane fid");
+		LBUG();
+	}
+
+	DEBUG_REQ(D_RPCTRACE, open_req, "Set up open replay data");
+	RETURN(0);
+}
+
+/**
+ * Break the link between \a och and its md_open_data and drop the reference
+ * on the latter.  \a exp is not used.  Always returns 0.
+ */
+int mdc_clear_open_replay_data(struct obd_export *exp,
+			       struct obd_client_handle *och)
+{
+	struct md_open_data *open_data = och->och_mod;
+	ENTRY;
+
+	/**
+	 * It is possible to not have \var mod in a case of eviction between
+	 * lookup and ll_file_open().
+	 **/
+	if (open_data != NULL) {
+		LASSERT(open_data != LP_POISON);
+
+		open_data->mod_och = NULL;
+		och->och_mod = NULL;
+		obd_mod_put(open_data);
+	}
+
+	RETURN(0);
+}
+
+/*
+ * Prepares the request for the replay by the given reply.
+ *
+ * Acts only when \a req is set and \a rc is -EAGAIN: MF_SOM_AU is set in the
+ * request's epoch flags and, if the reply body carries
+ * OBD_MD_FLGETATTRLOCK, MF_GETATTR_LOCK is set in op_data->op_flags.
+ *
+ * A missing reply body is tolerated: callers can get here with -EAGAIN
+ * without having checked that the reply holds one.
+ */
+static void mdc_close_handle_reply(struct ptlrpc_request *req,
+				   struct md_op_data *op_data, int rc)
+{
+	struct mdt_body  *repbody;
+	struct mdt_ioepoch *epoch;
+
+	if (req == NULL || rc != -EAGAIN)
+		return;
+
+	repbody = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+	epoch = req_capsule_client_get(&req->rq_pill, &RMF_MDT_EPOCH);
+
+	epoch->flags |= MF_SOM_AU;
+	if (repbody != NULL && (repbody->valid & OBD_MD_FLGETATTRLOCK))
+		op_data->op_flags |= MF_GETATTR_LOCK;
+}
+
+/**
+ * Send an MDS_CLOSE RPC described by \a op_data and wait for the reply.
+ *
+ * \param mod		open data of the matching open, or NULL.  When set,
+ *			the close request is recorded in it, the open request
+ *			is taken out of replay, and one reference on \a mod
+ *			is dropped before returning.
+ * \param[out] request	NULL if the request could not be allocated or
+ *			packed, otherwise the close request (also when the
+ *			RPC failed).
+ *
+ * \retval 0 on success, or when the server answered -ESTALE for an open
+ *	   that was already committed; negative errno otherwise.
+ */
+int mdc_close(struct obd_export *exp, struct md_op_data *op_data,
+	      struct md_open_data *mod, struct ptlrpc_request **request)
+{
+	struct obd_device     *obd = class_exp2obd(exp);
+	struct ptlrpc_request *req;
+	int		    rc;
+	ENTRY;
+
+	*request = NULL;
+	req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_CLOSE);
+	if (req == NULL)
+		RETURN(-ENOMEM);
+
+	mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+
+	rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_CLOSE);
+	if (rc) {
+		ptlrpc_request_free(req);
+		RETURN(rc);
+	}
+
+	/* To avoid a livelock (bug 7034), we need to send CLOSE RPCs to a
+	 * portal whose threads are not taking any DLM locks and are therefore
+	 * always progressing */
+	req->rq_request_portal = MDS_READPAGE_PORTAL;
+	ptlrpc_at_set_req_timeout(req);
+
+	/* Ensure that this close's handle is fixed up during replay. */
+	if (likely(mod != NULL)) {
+		LASSERTF(mod->mod_open_req != NULL &&
+			 mod->mod_open_req->rq_type != LI_POISON,
+			 "POISONED open %p!\n", mod->mod_open_req);
+
+		mod->mod_close_req = req;
+
+		DEBUG_REQ(D_HA, mod->mod_open_req, "matched open");
+		/* We no longer want to preserve this open for replay even
+		 * though the open was committed. b=3632, b=3633 */
+		spin_lock(&mod->mod_open_req->rq_lock);
+		mod->mod_open_req->rq_replay = 0;
+		spin_unlock(&mod->mod_open_req->rq_lock);
+	} else {
+		 CDEBUG(D_HA, "couldn't find open req; expecting close error\n");
+	}
+
+	mdc_close_pack(req, op_data);
+
+	/* Reserve reply room for the largest EA and cookie blobs. */
+	req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER,
+			     obd->u.cli.cl_max_mds_easize);
+	req_capsule_set_size(&req->rq_pill, &RMF_LOGCOOKIES, RCL_SERVER,
+			     obd->u.cli.cl_max_mds_cookiesize);
+
+	ptlrpc_request_set_replen(req);
+
+	/* closes use their own lock (cl_close_lock), not cl_rpc_lock */
+	mdc_get_rpc_lock(obd->u.cli.cl_close_lock, NULL);
+	rc = ptlrpc_queue_wait(req);
+	mdc_put_rpc_lock(obd->u.cli.cl_close_lock, NULL);
+
+	if (req->rq_repmsg == NULL) {
+		CDEBUG(D_RPCTRACE, "request failed to send: %p, %d\n", req,
+		       req->rq_status);
+		/* never report success without a reply */
+		if (rc == 0)
+			rc = req->rq_status ?: -EIO;
+	} else if (rc == 0 || rc == -EAGAIN) {
+		struct mdt_body *body;
+
+		/* the status in the reply message replaces rc */
+		rc = lustre_msg_get_status(req->rq_repmsg);
+		if (lustre_msg_get_type(req->rq_repmsg) == PTL_RPC_MSG_ERR) {
+			DEBUG_REQ(D_ERROR, req, "type == PTL_RPC_MSG_ERR, err "
+				  "= %d", rc);
+			/* normalise a positive status to a negative errno */
+			if (rc > 0)
+				rc = -rc;
+		}
+		body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+		if (body == NULL)
+			rc = -EPROTO;
+	} else if (rc == -ESTALE) {
+		/**
+		 * it can be allowed error after 3633 if open was committed and
+		 * server failed before close was sent. Let's check if mod
+		 * exists and return no error in that case
+		 */
+		if (mod) {
+			DEBUG_REQ(D_HA, req, "Reset ESTALE = %d", rc);
+			LASSERT(mod->mod_open_req != NULL);
+			if (mod->mod_open_req->rq_committed)
+				rc = 0;
+		}
+	}
+
+	if (mod) {
+		/* a failed close is not kept for handle fix-up on replay */
+		if (rc != 0)
+			mod->mod_close_req = NULL;
+		/* Since now, mod is accessed through open_req only,
+		 * thus close req does not keep a reference on mod anymore. */
+		obd_mod_put(mod);
+	}
+	*request = req;
+	mdc_close_handle_reply(req, op_data, rc);
+	RETURN(rc);
+}
+
+/**
+ * Send an MDS_DONE_WRITING request built from \a op_data and wait for the
+ * reply.
+ *
+ * @param exp     export used to reach the MDS
+ * @param op_data operation data packed into the request by mdc_close_pack()
+ * @param mod     optional open data; when set, its open request is taken out
+ *                of replay and one reference on \a mod is dropped on return
+ * @return 0 on success or a negative errno; -ESTALE is turned into 0 when
+ *         \a mod is set and its open request is already committed
+ */
+int mdc_done_writing(struct obd_export *exp, struct md_op_data *op_data,
+		     struct md_open_data *mod)
+{
+	struct obd_device     *obd = class_exp2obd(exp);
+	struct ptlrpc_request *req;
+	int		    rc;
+	ENTRY;
+
+	req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+				   &RQF_MDS_DONE_WRITING);
+	if (req == NULL)
+		RETURN(-ENOMEM);
+
+	/* The capability size must be set before the request is packed */
+	mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+	rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_DONE_WRITING);
+	if (rc) {
+		ptlrpc_request_free(req);
+		RETURN(rc);
+	}
+
+	if (mod != NULL) {
+		LASSERTF(mod->mod_open_req != NULL &&
+			 mod->mod_open_req->rq_type != LI_POISON,
+			 "POISONED setattr %p!\n", mod->mod_open_req);
+
+		mod->mod_close_req = req;
+		DEBUG_REQ(D_HA, mod->mod_open_req, "matched setattr");
+		/* We no longer want to preserve this setattr for replay even
+		 * though the open was committed. b=3632, b=3633 */
+		spin_lock(&mod->mod_open_req->rq_lock);
+		mod->mod_open_req->rq_replay = 0;
+		spin_unlock(&mod->mod_open_req->rq_lock);
+	}
+
+	mdc_close_pack(req, op_data);
+	ptlrpc_request_set_replen(req);
+
+	/* The RPC is sent while holding the client's cl_close_lock */
+	mdc_get_rpc_lock(obd->u.cli.cl_close_lock, NULL);
+	rc = ptlrpc_queue_wait(req);
+	mdc_put_rpc_lock(obd->u.cli.cl_close_lock, NULL);
+
+	if (rc == -ESTALE) {
+		/**
+		 * it can be allowed error after 3633 if open or setattr were
+		 * committed and server failed before close was sent.
+		 * Let's check if mod exists and return no error in that case
+		 */
+		if (mod) {
+			LASSERT(mod->mod_open_req != NULL);
+			if (mod->mod_open_req->rq_committed)
+				rc = 0;
+		}
+	}
+
+	if (mod) {
+		/* On failure the request is not kept as the close request */
+		if (rc != 0)
+			mod->mod_close_req = NULL;
+		/* Since now, mod is accessed through setattr req only,
+		 * thus DW req does not keep a reference on mod anymore. */
+		obd_mod_put(mod);
+	}
+
+	mdc_close_handle_reply(req, op_data, rc);
+	/* Unlike mdc_readpage(), the request is not handed to the caller */
+	ptlrpc_req_finished(req);
+	RETURN(rc);
+}
+
+
+/**
+ * Fetch op_data->op_npages pages of directory data with an MDS_READPAGE
+ * bulk transfer into \a pages.
+ *
+ * When the RPC fails with -ETIMEDOUT the request is dropped and a new one is
+ * built after sleeping for "resends" seconds, for as long as
+ * client_should_resend() allows it.
+ *
+ * @param exp     export used to reach the MDS
+ * @param op_data supplies fid, capability, start offset and page count
+ * @param pages   array of at least op_data->op_npages pages to be filled
+ * @param request set to the completed request on success (the caller must
+ *                release it), NULL otherwise
+ * @return 0 on success, negative errno on failure
+ */
+int mdc_readpage(struct obd_export *exp, struct md_op_data *op_data,
+		 struct page **pages, struct ptlrpc_request **request)
+{
+	struct ptlrpc_request   *req;
+	struct ptlrpc_bulk_desc *desc;
+	int		      i;
+	wait_queue_head_t	      waitq;
+	int		      resends = 0;
+	struct l_wait_info       lwi;
+	int		      rc;
+	ENTRY;
+
+	*request = NULL;
+	/* waitq is never woken; it only serves the timed sleep between resends */
+	init_waitqueue_head(&waitq);
+
+restart_bulk:
+	req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_READPAGE);
+	if (req == NULL)
+		RETURN(-ENOMEM);
+
+	mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+
+	rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_READPAGE);
+	if (rc) {
+		ptlrpc_request_free(req);
+		RETURN(rc);
+	}
+
+	req->rq_request_portal = MDS_READPAGE_PORTAL;
+	ptlrpc_at_set_req_timeout(req);
+
+	desc = ptlrpc_prep_bulk_imp(req, op_data->op_npages, 1, BULK_PUT_SINK,
+				    MDS_BULK_PORTAL);
+	if (desc == NULL) {
+		ptlrpc_request_free(req);
+		RETURN(-ENOMEM);
+	}
+
+	/* NB req now owns desc and will free it when it gets freed */
+	for (i = 0; i < op_data->op_npages; i++)
+		ptlrpc_prep_bulk_page_pin(desc, pages[i], 0, PAGE_CACHE_SIZE);
+
+	mdc_readdir_pack(req, op_data->op_offset,
+			 PAGE_CACHE_SIZE * op_data->op_npages,
+			 &op_data->op_fid1, op_data->op_capa1);
+
+	ptlrpc_request_set_replen(req);
+	rc = ptlrpc_queue_wait(req);
+	if (rc) {
+		ptlrpc_req_finished(req);
+		/* Only timeouts are retried; any other error is final */
+		if (rc != -ETIMEDOUT)
+			RETURN(rc);
+
+		resends++;
+		if (!client_should_resend(resends, &exp->exp_obd->u.cli)) {
+			CERROR("too many resend retries, returning error\n");
+			RETURN(-EIO);
+		}
+		/* The wait condition is constant 0, so this only returns on
+		 * timeout or interruption: a back-off that grows with the
+		 * number of resends. */
+		lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(resends), NULL, NULL, NULL);
+		l_wait_event(waitq, 0, &lwi);
+
+		goto restart_bulk;
+	}
+
+	rc = sptlrpc_cli_unwrap_bulk_read(req, req->rq_bulk,
+					  req->rq_bulk->bd_nob_transferred);
+	if (rc < 0) {
+		ptlrpc_req_finished(req);
+		RETURN(rc);
+	}
+
+	/* Reject a transfer whose byte count has bits set outside
+	 * LU_PAGE_MASK */
+	if (req->rq_bulk->bd_nob_transferred & ~LU_PAGE_MASK) {
+		CERROR("Unexpected # bytes transferred: %d (%ld expected)\n",
+			req->rq_bulk->bd_nob_transferred,
+			PAGE_CACHE_SIZE * op_data->op_npages);
+		ptlrpc_req_finished(req);
+		RETURN(-EPROTO);
+	}
+
+	*request = req;
+	RETURN(0);
+}
+
+/**
+ * Query filesystem statistics from the MDS with an MDS_STATFS RPC.
+ *
+ * @param env     unused here
+ * @param exp     export identifying the MDC device
+ * @param osfs    filled with the server's obd_statfs on success
+ * @param max_age unused here
+ * @param flags   OBD_STATFS_NODELAY makes the request fail instead of
+ *                waiting or being resent
+ * @return 0 on success; -ENODEV if the device has no import; -ENOMEM,
+ *         -EPROTO, the import's connect error or the RPC error otherwise
+ */
+static int mdc_statfs(const struct lu_env *env,
+		      struct obd_export *exp, struct obd_statfs *osfs,
+		      __u64 max_age, __u32 flags)
+{
+	struct obd_device     *obd = class_exp2obd(exp);
+	struct ptlrpc_request *req;
+	struct obd_statfs     *msfs;
+	struct obd_import     *imp = NULL;
+	int		    rc;
+	ENTRY;
+
+	/*
+	 * Since the request might also come from lprocfs, so we need
+	 * sync this with client_disconnect_export Bug15684
+	 */
+	down_read(&obd->u.cli.cl_sem);
+	if (obd->u.cli.cl_import)
+		imp = class_import_get(obd->u.cli.cl_import);
+	up_read(&obd->u.cli.cl_sem);
+	if (!imp)
+		RETURN(-ENODEV);
+
+	req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_STATFS,
+					LUSTRE_MDS_VERSION, MDS_STATFS);
+	if (req == NULL)
+		GOTO(output, rc = -ENOMEM);
+
+	ptlrpc_request_set_replen(req);
+
+	if (flags & OBD_STATFS_NODELAY) {
+		/* procfs requests not want stay in wait for avoid deadlock */
+		req->rq_no_resend = 1;
+		req->rq_no_delay = 1;
+	}
+
+	rc = ptlrpc_queue_wait(req);
+	if (rc) {
+		/* check connection error first */
+		if (imp->imp_connect_error)
+			rc = imp->imp_connect_error;
+		GOTO(out, rc);
+	}
+
+	msfs = req_capsule_server_get(&req->rq_pill, &RMF_OBD_STATFS);
+	if (msfs == NULL)
+		GOTO(out, rc = -EPROTO);
+
+	*osfs = *msfs;
+	EXIT;
+out:
+	ptlrpc_req_finished(req);
+output:
+	/* Drop the import reference taken under cl_sem above */
+	class_import_put(imp);
+	return rc;
+}
+
+/**
+ * Resolve a FID to a path through obd_get_info() with a KEY_FID2PATH key.
+ *
+ * The key is KEY_FID2PATH (padded with cfs_size_round()) followed by a copy
+ * of \a gf; the reply is written back into \a gf, which must have room for
+ * gf->gf_pathlen bytes of path after the structure.
+ *
+ * @param exp export the get_info request is issued on
+ * @param gf  in: fid, recno, linkno and path buffer length; out: result
+ * @return 0 or -EREMOTE as returned by obd_get_info(); -ENAMETOOLONG,
+ *         -EOVERFLOW, -ENOMEM, -EINVAL or -EPROTO on failure
+ */
+static int mdc_ioc_fid2path(struct obd_export *exp, struct getinfo_fid2path *gf)
+{
+	/* offset of the getinfo_fid2path copy inside the key buffer */
+	const __u32 gf_off = cfs_size_round(sizeof(KEY_FID2PATH));
+	__u32 keylen;
+	__u32 vallen;
+	void *key;
+	int rc;
+
+	/* The path buffer must be sane: at most PATH_MAX, at least 2 bytes */
+	if (gf->gf_pathlen > PATH_MAX)
+		RETURN(-ENAMETOOLONG);
+	if (gf->gf_pathlen < 2)
+		RETURN(-EOVERFLOW);
+
+	/* Build the key: key name first, request description after it */
+	keylen = gf_off + sizeof(*gf);
+	OBD_ALLOC(key, keylen);
+	if (key == NULL)
+		RETURN(-ENOMEM);
+	memcpy(key, KEY_FID2PATH, sizeof(KEY_FID2PATH));
+	memcpy(key + gf_off, gf, sizeof(*gf));
+
+	CDEBUG(D_IOCTL, "path get "DFID" from "LPU64" #%d\n",
+	       PFID(&gf->gf_fid), gf->gf_recno, gf->gf_linkno);
+
+	if (!fid_is_sane(&gf->gf_fid))
+		GOTO(out, rc = -EINVAL);
+
+	/* The value buffer is the structure itself plus the path behind it */
+	vallen = sizeof(*gf) + gf->gf_pathlen;
+
+	rc = obd_get_info(NULL, exp, keylen, key, &vallen, gf, NULL);
+	if (!(rc == 0 || rc == -EREMOTE))
+		GOTO(out, rc);
+
+	/* A reply without any path bytes is a protocol error ... */
+	if (vallen <= sizeof(*gf))
+		GOTO(out, rc = -EPROTO);
+	/* ... and one larger than the caller's buffer cannot be used */
+	if (vallen > sizeof(*gf) + gf->gf_pathlen)
+		GOTO(out, rc = -EOVERFLOW);
+
+	CDEBUG(D_IOCTL, "path get "DFID" from "LPU64" #%d\n%s\n",
+	       PFID(&gf->gf_fid), gf->gf_recno, gf->gf_linkno, gf->gf_path);
+
+out:
+	OBD_FREE(key, keylen);
+	return rc;
+}
+
+/**
+ * Send an MDS_HSM_PROGRESS RPC carrying a copy of \a hpk.
+ *
+ * @param exp export used to reach the server
+ * @param hpk progress record copied verbatim into the request
+ * @return result of mdc_queue_wait(); -ENOMEM if no request could be
+ *         allocated; -EPROTO if the request has no progress buffer
+ */
+static int mdc_ioc_hsm_progress(struct obd_export *exp,
+				struct hsm_progress_kernel *hpk)
+{
+	struct obd_import		*imp = class_exp2cliimp(exp);
+	struct ptlrpc_request		*req;
+	struct hsm_progress_kernel	*dst;
+	int				 rc;
+	ENTRY;
+
+	req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_HSM_PROGRESS,
+					LUSTRE_MDS_VERSION, MDS_HSM_PROGRESS);
+	if (!req)
+		GOTO(out, rc = -ENOMEM);
+
+	mdc_pack_body(req, NULL, NULL, OBD_MD_FLRMTPERM, 0, 0, 0);
+
+	/* Fill the client-side progress buffer from the caller's record */
+	dst = req_capsule_client_get(&req->rq_pill, &RMF_MDS_HSM_PROGRESS);
+	if (!dst)
+		GOTO(out, rc = -EPROTO);
+
+	*dst = *hpk;
+
+	ptlrpc_request_set_replen(req);
+
+	rc = mdc_queue_wait(req);
+	GOTO(out, rc);
+out:
+	/* Note: req is NULL here when the allocation above failed */
+	ptlrpc_req_finished(req);
+	return rc;
+}
+
+/**
+ * Send an MDS_HSM_CT_REGISTER RPC carrying the archive mask \a archives.
+ *
+ * @param imp      import the request is sent on
+ * @param archives value stored in the request's archive mask buffer
+ * @return result of mdc_queue_wait(); -ENOMEM if no request could be
+ *         allocated; -EPROTO if the request has no archive mask buffer
+ */
+static int mdc_ioc_hsm_ct_register(struct obd_import *imp, __u32 archives)
+{
+	struct ptlrpc_request	*req;
+	__u32			*mask;
+	int			 rc;
+	ENTRY;
+
+	req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_HSM_CT_REGISTER,
+					LUSTRE_MDS_VERSION,
+					MDS_HSM_CT_REGISTER);
+	if (!req)
+		GOTO(out, rc = -ENOMEM);
+
+	mdc_pack_body(req, NULL, NULL, OBD_MD_FLRMTPERM, 0, 0, 0);
+
+	/* Store the archive mask in the request */
+	mask = req_capsule_client_get(&req->rq_pill, &RMF_MDS_HSM_ARCHIVE);
+	if (!mask)
+		GOTO(out, rc = -EPROTO);
+
+	*mask = archives;
+
+	ptlrpc_request_set_replen(req);
+
+	rc = mdc_queue_wait(req);
+	GOTO(out, rc);
+out:
+	/* Note: req is NULL here when the allocation above failed */
+	ptlrpc_req_finished(req);
+	return rc;
+}
+
+/**
+ * Fetch the hsm_current_action for op_data->op_fid1 with an MDS_HSM_ACTION
+ * RPC.
+ *
+ * @param exp     export used to reach the server
+ * @param op_data supplies fid, capability and suppgids[0]; op_data->op_data
+ *                points to the struct hsm_current_action to fill
+ * @return 0 on success; -ENOMEM, -EPROTO or the RPC error otherwise
+ */
+static int mdc_ioc_hsm_current_action(struct obd_export *exp,
+				      struct md_op_data *op_data)
+{
+	struct hsm_current_action	*hca = op_data->op_data;
+	struct hsm_current_action	*req_hca;
+	struct ptlrpc_request		*req;
+	int				 rc;
+	ENTRY;
+
+	req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+				   &RQF_MDS_HSM_ACTION);
+	if (req == NULL)
+		RETURN(-ENOMEM);
+
+	/* The capability size must be set before the request is packed */
+	mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+
+	rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_HSM_ACTION);
+	if (rc) {
+		ptlrpc_request_free(req);
+		RETURN(rc);
+	}
+
+	mdc_pack_body(req, &op_data->op_fid1, op_data->op_capa1,
+		      OBD_MD_FLRMTPERM, 0, op_data->op_suppgids[0], 0);
+
+	ptlrpc_request_set_replen(req);
+
+	rc = mdc_queue_wait(req);
+	if (rc)
+		GOTO(out, rc);
+
+	req_hca = req_capsule_server_get(&req->rq_pill,
+					 &RMF_MDS_HSM_CURRENT_ACTION);
+	if (req_hca == NULL)
+		GOTO(out, rc = -EPROTO);
+
+	/* Copy the reply out before the request (and its buffers) go away */
+	*hca = *req_hca;
+
+	EXIT;
+out:
+	ptlrpc_req_finished(req);
+	return rc;
+}
+
+/**
+ * Send an MDS_HSM_CT_UNREGISTER RPC on \a imp.
+ *
+ * @param imp import the request is sent on
+ * @return result of mdc_queue_wait(), or -ENOMEM if no request could be
+ *         allocated
+ */
+static int mdc_ioc_hsm_ct_unregister(struct obd_import *imp)
+{
+	int			 rc;
+	struct ptlrpc_request	*req;
+	ENTRY;
+
+	req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_HSM_CT_UNREGISTER,
+					LUSTRE_MDS_VERSION,
+					MDS_HSM_CT_UNREGISTER);
+	if (!req)
+		GOTO(out, rc = -ENOMEM);
+
+	/* The request carries nothing but the generic body */
+	mdc_pack_body(req, NULL, NULL, OBD_MD_FLRMTPERM, 0, 0, 0);
+	ptlrpc_request_set_replen(req);
+
+	rc = mdc_queue_wait(req);
+	GOTO(out, rc);
+out:
+	/* Note: req is NULL here when the allocation above failed */
+	ptlrpc_req_finished(req);
+	return rc;
+}
+
+/**
+ * Fetch the hsm_user_state for op_data->op_fid1 with an MDS_HSM_STATE_GET
+ * RPC.
+ *
+ * @param exp     export used to reach the server
+ * @param op_data supplies fid, capability and suppgids[0]; op_data->op_data
+ *                points to the struct hsm_user_state to fill
+ * @return 0 on success; -ENOMEM, -EPROTO or the RPC error otherwise
+ */
+static int mdc_ioc_hsm_state_get(struct obd_export *exp,
+				 struct md_op_data *op_data)
+{
+	struct hsm_user_state	*hus = op_data->op_data;
+	struct hsm_user_state	*req_hus;
+	struct ptlrpc_request	*req;
+	int			 rc;
+	ENTRY;
+
+	req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+				   &RQF_MDS_HSM_STATE_GET);
+	if (req == NULL)
+		RETURN(-ENOMEM);
+
+	/* The capability size must be set before the request is packed */
+	mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+
+	rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_HSM_STATE_GET);
+	if (rc != 0) {
+		ptlrpc_request_free(req);
+		RETURN(rc);
+	}
+
+	mdc_pack_body(req, &op_data->op_fid1, op_data->op_capa1,
+		      OBD_MD_FLRMTPERM, 0, op_data->op_suppgids[0], 0);
+
+	ptlrpc_request_set_replen(req);
+
+	rc = mdc_queue_wait(req);
+	if (rc)
+		GOTO(out, rc);
+
+	req_hus = req_capsule_server_get(&req->rq_pill, &RMF_HSM_USER_STATE);
+	if (req_hus == NULL)
+		GOTO(out, rc = -EPROTO);
+
+	/* Copy the reply out before the request (and its buffers) go away */
+	*hus = *req_hus;
+
+	EXIT;
+out:
+	ptlrpc_req_finished(req);
+	return rc;
+}
+
+/**
+ * Send the hsm_state_set in op_data->op_data for op_data->op_fid1 with an
+ * MDS_HSM_STATE_SET RPC.
+ *
+ * @param exp     export used to reach the server
+ * @param op_data supplies fid, capability and suppgids[0]; op_data->op_data
+ *                points to the struct hsm_state_set to send
+ * @return result of mdc_queue_wait(); -ENOMEM, -EPROTO or the pack error
+ *         on earlier failure
+ */
+static int mdc_ioc_hsm_state_set(struct obd_export *exp,
+				 struct md_op_data *op_data)
+{
+	struct hsm_state_set	*hss = op_data->op_data;
+	struct hsm_state_set	*req_hss;
+	struct ptlrpc_request	*req;
+	int			 rc;
+	ENTRY;
+
+	req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+				   &RQF_MDS_HSM_STATE_SET);
+	if (req == NULL)
+		RETURN(-ENOMEM);
+
+	/* The capability size must be set before the request is packed */
+	mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+
+	rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_HSM_STATE_SET);
+	if (rc) {
+		ptlrpc_request_free(req);
+		RETURN(rc);
+	}
+
+	mdc_pack_body(req, &op_data->op_fid1, op_data->op_capa1,
+		      OBD_MD_FLRMTPERM, 0, op_data->op_suppgids[0], 0);
+
+	/* Copy states */
+	req_hss = req_capsule_client_get(&req->rq_pill, &RMF_HSM_STATE_SET);
+	if (req_hss == NULL)
+		GOTO(out, rc = -EPROTO);
+	*req_hss = *hss;
+
+	ptlrpc_request_set_replen(req);
+
+	rc = mdc_queue_wait(req);
+	/* GOTO already emits the exit trace; nothing may follow it */
+	GOTO(out, rc);
+out:
+	ptlrpc_req_finished(req);
+	return rc;
+}
+
+/**
+ * Send an MDS_HSM_REQUEST RPC built from \a hur: the hsm_request header,
+ * its hr_itemcount user items and hr_data_len bytes of opaque data.
+ *
+ * NOTE(review): hr_itemcount and hr_data_len are used unchecked to size the
+ * request buffers; presumably they are validated by the caller -- confirm.
+ *
+ * @param exp export used to reach the server
+ * @param hur user request to forward
+ * @return result of mdc_queue_wait(); -ENOMEM, -EPROTO or the pack error
+ *         on earlier failure
+ */
+static int mdc_ioc_hsm_request(struct obd_export *exp,
+			       struct hsm_user_request *hur)
+{
+	struct obd_import	*imp = class_exp2cliimp(exp);
+	struct ptlrpc_request	*req;
+	struct hsm_request	*req_hr;
+	struct hsm_user_item	*req_hui;
+	char			*req_opaque;
+	int			 rc;
+	ENTRY;
+
+	req = ptlrpc_request_alloc(imp, &RQF_MDS_HSM_REQUEST);
+	if (req == NULL)
+		GOTO(out, rc = -ENOMEM);
+
+	/* Variable-sized buffers must be sized before the request is packed */
+	req_capsule_set_size(&req->rq_pill, &RMF_MDS_HSM_USER_ITEM, RCL_CLIENT,
+			     hur->hur_request.hr_itemcount
+			     * sizeof(struct hsm_user_item));
+	req_capsule_set_size(&req->rq_pill, &RMF_GENERIC_DATA, RCL_CLIENT,
+			     hur->hur_request.hr_data_len);
+
+	rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_HSM_REQUEST);
+	if (rc) {
+		ptlrpc_request_free(req);
+		RETURN(rc);
+	}
+
+	mdc_pack_body(req, NULL, NULL, OBD_MD_FLRMTPERM, 0, 0, 0);
+
+	/* Copy hsm_request struct */
+	req_hr = req_capsule_client_get(&req->rq_pill, &RMF_MDS_HSM_REQUEST);
+	if (req_hr == NULL)
+		GOTO(out, rc = -EPROTO);
+	*req_hr = hur->hur_request;
+
+	/* Copy hsm_user_item structs */
+	req_hui = req_capsule_client_get(&req->rq_pill, &RMF_MDS_HSM_USER_ITEM);
+	if (req_hui == NULL)
+		GOTO(out, rc = -EPROTO);
+	memcpy(req_hui, hur->hur_user_item,
+	       hur->hur_request.hr_itemcount * sizeof(struct hsm_user_item));
+
+	/* Copy opaque field */
+	req_opaque = req_capsule_client_get(&req->rq_pill, &RMF_GENERIC_DATA);
+	if (req_opaque == NULL)
+		GOTO(out, rc = -EPROTO);
+	memcpy(req_opaque, hur_data(hur), hur->hur_request.hr_data_len);
+
+	ptlrpc_request_set_replen(req);
+
+	rc = mdc_queue_wait(req);
+	GOTO(out, rc);
+
+out:
+	/* Note: req is NULL here when the allocation above failed */
+	ptlrpc_req_finished(req);
+	return rc;
+}
+
+/**
+ * Initialise a changelog KUC message header at the start of \a buf.
+ *
+ * @param buf   buffer that will hold the message; the header is written at
+ *              its beginning
+ * @param len   total message length stored in the header; asserted to be at
+ *              most CR_MAXSIZE
+ * @param flags value stored in kuc_flags
+ * @return \a buf viewed as a struct kuc_hdr, with message type CL_RECORD
+ */
+static struct kuc_hdr *changelog_kuc_hdr(char *buf, int len, int flags)
+{
+	struct kuc_hdr *hdr;
+
+	LASSERT(len <= CR_MAXSIZE);
+
+	hdr = (struct kuc_hdr *)buf;
+	hdr->kuc_msglen = len;
+	hdr->kuc_msgtype = CL_RECORD;
+	hdr->kuc_flags = flags;
+	hdr->kuc_transport = KUC_TRANSPORT_CHANGELOG;
+	hdr->kuc_magic = KUC_MAGIC;
+
+	return hdr;
+}
+
+#define D_CHANGELOG 0
+
+/* State handed to the changelog sender thread and its llog callback */
+struct changelog_show {
+	__u64		cs_startrec;	/* records with a lower index are skipped */
+	__u32		cs_flags;	/* copied into every KUC header */
+	struct file	*cs_fp;		/* file the KUC messages are written to */
+	char		*cs_buf;	/* CR_MAXSIZE message staging buffer */
+	struct obd_device *cs_obd;	/* device whose changelog is read */
+};
+
+/**
+ * llog callback: wrap one changelog record in a KUC message and write it to
+ * cs->cs_fp.
+ *
+ * @param env  unused here
+ * @param llh  unused here
+ * @param hdr  llog record header, expected to be a CHANGELOG_REC
+ * @param data the struct changelog_show describing the transfer
+ * @return 0 if the record was skipped (index below cs_startrec); -EINVAL if
+ *         it is not a changelog record; otherwise the result of
+ *         libcfs_kkuc_msg_put()
+ */
+static int changelog_kkuc_cb(const struct lu_env *env, struct llog_handle *llh,
+			     struct llog_rec_hdr *hdr, void *data)
+{
+	struct changelog_show *cs = data;
+	struct llog_changelog_rec *rec = (struct llog_changelog_rec *)hdr;
+	struct kuc_hdr *lh;
+	int len, rc;
+	ENTRY;
+
+	if (rec->cr_hdr.lrh_type != CHANGELOG_REC) {
+		rc = -EINVAL;
+		CERROR("%s: not a changelog rec %x/%d: rc = %d\n",
+		       cs->cs_obd->obd_name, rec->cr_hdr.lrh_type,
+		       rec->cr.cr_type, rc);
+		RETURN(rc);
+	}
+
+	if (rec->cr.cr_index < cs->cs_startrec) {
+		/* Skip entries earlier than what we are interested in */
+		CDEBUG(D_CHANGELOG, "rec="LPU64" start="LPU64"\n",
+		       rec->cr.cr_index, cs->cs_startrec);
+		RETURN(0);
+	}
+
+	CDEBUG(D_CHANGELOG, LPU64" %02d%-5s "LPU64" 0x%x t="DFID" p="DFID
+		" %.*s\n", rec->cr.cr_index, rec->cr.cr_type,
+		changelog_type2str(rec->cr.cr_type), rec->cr.cr_time,
+		rec->cr.cr_flags & CLF_FLAGMASK,
+		PFID(&rec->cr.cr_tfid), PFID(&rec->cr.cr_pfid),
+		rec->cr.cr_namelen, changelog_rec_name(&rec->cr));
+
+	/* Message = KUC header + changelog record + name */
+	len = sizeof(*lh) + changelog_rec_size(&rec->cr) + rec->cr.cr_namelen;
+
+	/* Set up the message */
+	lh = changelog_kuc_hdr(cs->cs_buf, len, cs->cs_flags);
+	/* The payload goes right behind the header in cs_buf */
+	memcpy(lh + 1, &rec->cr, len - sizeof(*lh));
+
+	rc = libcfs_kkuc_msg_put(cs->cs_fp, lh);
+	CDEBUG(D_CHANGELOG, "kucmsg fp %p len %d rc %d\n", cs->cs_fp, len,rc);
+
+	RETURN(rc);
+}
+
+/**
+ * Thread body: stream the changelog catalog of cs->cs_obd to cs->cs_fp as
+ * KUC messages, finishing with a CL_EOF message.
+ *
+ * The thread owns \a csdata: on exit it drops the file reference held in
+ * cs->cs_fp, frees cs->cs_buf and frees the changelog_show itself.  It
+ * assumes cs->cs_fp is a valid file reference taken by whoever started the
+ * thread.
+ *
+ * @param csdata the struct changelog_show describing the transfer
+ * @return 0 or the first error met while opening or processing the llog
+ */
+static int mdc_changelog_send_thread(void *csdata)
+{
+	struct changelog_show *cs = csdata;
+	struct llog_ctxt *ctxt = NULL;
+	struct llog_handle *llh = NULL;
+	struct kuc_hdr *kuch;
+	int rc;
+
+	CDEBUG(D_CHANGELOG, "changelog to fp=%p start "LPU64"\n",
+	       cs->cs_fp, cs->cs_startrec);
+
+	OBD_ALLOC(cs->cs_buf, CR_MAXSIZE);
+	if (cs->cs_buf == NULL)
+		GOTO(out, rc = -ENOMEM);
+
+	/* Set up the remote catalog handle */
+	ctxt = llog_get_context(cs->cs_obd, LLOG_CHANGELOG_REPL_CTXT);
+	if (ctxt == NULL)
+		GOTO(out, rc = -ENOENT);
+	rc = llog_open(NULL, ctxt, &llh, NULL, CHANGELOG_CATALOG,
+		       LLOG_OPEN_EXISTS);
+	if (rc) {
+		CERROR("%s: fail to open changelog catalog: rc = %d\n",
+		       cs->cs_obd->obd_name, rc);
+		GOTO(out, rc);
+	}
+	rc = llog_init_handle(NULL, llh, LLOG_F_IS_CAT, NULL);
+	if (rc) {
+		CERROR("llog_init_handle failed %d\n", rc);
+		GOTO(out, rc);
+	}
+
+	rc = llog_cat_process(NULL, llh, changelog_kkuc_cb, cs, 0, 0);
+
+	/* Send EOF no matter what our result */
+	if ((kuch = changelog_kuc_hdr(cs->cs_buf, sizeof(*kuch),
+				      cs->cs_flags))) {
+		kuch->kuc_msgtype = CL_EOF;
+		libcfs_kkuc_msg_put(cs->cs_fp, kuch);
+	}
+
+out:
+	/* Release everything in cs, then cs itself */
+	fput(cs->cs_fp);
+	if (llh)
+		llog_cat_close(NULL, llh);
+	if (ctxt)
+		llog_ctxt_put(ctxt);
+	if (cs->cs_buf)
+		OBD_FREE(cs->cs_buf, CR_MAXSIZE);
+	OBD_FREE_PTR(cs);
+	return rc;
+}
+
+/**
+ * Start a kernel thread that streams changelog records to the file
+ * descriptor named in \a icc.
+ *
+ * On success ownership of the changelog_show (and of the file reference it
+ * holds) passes to mdc_changelog_send_thread(), which releases both.  On
+ * failure everything acquired here is released before returning.
+ *
+ * @param obd device whose changelog is to be sent
+ * @param icc ioctl argument: start record, destination fd and flags
+ * @return 0 if the thread was started; -ENOMEM, -EBADF (icc_id is not an
+ *         open file) or the kthread_run() error otherwise
+ */
+static int mdc_ioc_changelog_send(struct obd_device *obd,
+				  struct ioc_changelog *icc)
+{
+	struct changelog_show *cs;
+	struct task_struct *task;
+	int rc;
+
+	/* Freed in mdc_changelog_send_thread */
+	OBD_ALLOC_PTR(cs);
+	if (!cs)
+		return -ENOMEM;
+
+	cs->cs_obd = obd;
+	cs->cs_startrec = icc->icc_recno;
+	/* matching fput in mdc_changelog_send_thread */
+	cs->cs_fp = fget(icc->icc_id);
+	if (cs->cs_fp == NULL) {
+		/* The thread would fput() a NULL file; refuse up front */
+		rc = -EBADF;
+		CERROR("Failed to start changelog thread: %d\n", rc);
+		OBD_FREE_PTR(cs);
+		return rc;
+	}
+	cs->cs_flags = icc->icc_flags;
+
+	/*
+	 * New thread because we should return to user app before
+	 * writing into our pipe
+	 */
+	task = kthread_run(mdc_changelog_send_thread, cs,
+			   "mdc_clg_send_thread");
+	/* Test the pointer itself: squeezing it through an int first could
+	 * make a valid task pointer look like an error value. */
+	if (!IS_ERR(task)) {
+		CDEBUG(D_CHANGELOG, "start changelog thread\n");
+		return 0;
+	}
+
+	rc = PTR_ERR(task);
+	CERROR("Failed to start changelog thread: %d\n", rc);
+	/* The thread never ran, so its cleanup has to be done here */
+	fput(cs->cs_fp);
+	OBD_FREE_PTR(cs);
+	return rc;
+}
+
+static int mdc_ioc_hsm_ct_start(struct obd_export *exp,
+				struct lustre_kernelcomm *lk);
+
+/**
+ * Send an MDS_QUOTACHECK RPC carrying a copy of \a oqctl and record the
+ * outcome in the client's cl_qchk_stat.
+ *
+ * NOTE(review): the result of req_capsule_client_get() is dereferenced
+ * without a NULL check, unlike most other request builders in this file --
+ * confirm it cannot fail for this request format.
+ *
+ * @param unused ignored
+ * @param exp    export used to reach the server
+ * @param oqctl  quota control block copied into the request
+ * @return 0 on success, -ENOMEM or the RPC error otherwise
+ */
+static int mdc_quotacheck(struct obd_device *unused, struct obd_export *exp,
+			  struct obd_quotactl *oqctl)
+{
+	struct client_obd       *cli = &exp->exp_obd->u.cli;
+	struct ptlrpc_request   *req;
+	struct obd_quotactl     *body;
+	int		      rc;
+	ENTRY;
+
+	req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp),
+					&RQF_MDS_QUOTACHECK, LUSTRE_MDS_VERSION,
+					MDS_QUOTACHECK);
+	if (req == NULL)
+		RETURN(-ENOMEM);
+
+	body = req_capsule_client_get(&req->rq_pill, &RMF_OBD_QUOTACTL);
+	*body = *oqctl;
+
+	ptlrpc_request_set_replen(req);
+
+	/* the next poll will find -ENODATA, that means quotacheck is
+	 * going on */
+	cli->cl_qchk_stat = -ENODATA;
+	rc = ptlrpc_queue_wait(req);
+	/* Only a failure overwrites the "in progress" marker set above */
+	if (rc)
+		cli->cl_qchk_stat = rc;
+	ptlrpc_req_finished(req);
+	RETURN(rc);
+}
+
+/**
+ * Report the quotacheck status recorded in the client's cl_qchk_stat.
+ *
+ * \a qchk is filled with the target uuid and with LUSTRE_MDS_NAME as the
+ * obd type (copied without its terminating NUL).
+ *
+ * @param exp  export identifying the client device
+ * @param qchk reply structure to fill
+ * @return cl_qchk_stat, with CL_NOT_QUOTACHECKED reported as -EINTR
+ */
+static int mdc_quota_poll_check(struct obd_export *exp,
+				struct if_quotacheck *qchk)
+{
+	struct client_obd *cli = &exp->exp_obd->u.cli;
+	int stat;
+	ENTRY;
+
+	qchk->obd_uuid = cli->cl_target_uuid;
+	memcpy(qchk->obd_type, LUSTRE_MDS_NAME, strlen(LUSTRE_MDS_NAME));
+
+	stat = cli->cl_qchk_stat;
+	/* the client is not the previous one */
+	if (stat != CL_NOT_QUOTACHECKED)
+		RETURN(stat);
+
+	stat = -EINTR;
+	RETURN(stat);
+}
+
+/**
+ * Send an MDS_QUOTACTL RPC carrying \a oqctl and copy the server's answer
+ * back into \a oqctl.
+ *
+ * The reply body is copied back whenever a reply message exists and can be
+ * unpacked, even if the RPC itself returned an error.
+ *
+ * @param unused ignored
+ * @param exp    export used to reach the server
+ * @param oqctl  in: request; out: server reply (when one could be unpacked)
+ * @return 0 on success; -ENOMEM; the RPC error; or -EPROTO when the RPC
+ *         succeeded but no reply body could be unpacked
+ */
+static int mdc_quotactl(struct obd_device *unused, struct obd_export *exp,
+			struct obd_quotactl *oqctl)
+{
+	struct ptlrpc_request   *req;
+	struct obd_quotactl     *oqc;
+	int		      rc;
+	ENTRY;
+
+	req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp),
+					&RQF_MDS_QUOTACTL, LUSTRE_MDS_VERSION,
+					MDS_QUOTACTL);
+	if (req == NULL)
+		RETURN(-ENOMEM);
+
+	oqc = req_capsule_client_get(&req->rq_pill, &RMF_OBD_QUOTACTL);
+	*oqc = *oqctl;
+
+	ptlrpc_request_set_replen(req);
+	ptlrpc_at_set_req_timeout(req);
+	/* This request is never resent */
+	req->rq_no_resend = 1;
+
+	rc = ptlrpc_queue_wait(req);
+	if (rc)
+		CERROR("ptlrpc_queue_wait failed, rc: %d\n", rc);
+
+	/* oqc is reused: from here on it points into the reply, not the
+	 * request */
+	if (req->rq_repmsg &&
+	    (oqc = req_capsule_server_get(&req->rq_pill, &RMF_OBD_QUOTACTL))) {
+		*oqctl = *oqc;
+	} else if (!rc) {
+		CERROR ("Can't unpack obd_quotactl\n");
+		rc = -EPROTO;
+	}
+	ptlrpc_req_finished(req);
+
+	RETURN(rc);
+}
+
+/**
+ * Send an MDS_SWAP_LAYOUTS RPC for op_data->op_fid1 and op_data->op_fid2,
+ * piggy-backing the cancellation of this client's unused layout locks on
+ * both fids.
+ *
+ * @param exp     export used to reach the server
+ * @param op_data supplies both fids and capabilities; op_data->op_data
+ *                points to the struct mdc_swap_layouts payload
+ * @return 0 on success; -ENOMEM, the request preparation error or the RPC
+ *         error otherwise
+ */
+static int mdc_ioc_swap_layouts(struct obd_export *exp,
+				struct md_op_data *op_data)
+{
+	LIST_HEAD(cancels);
+	struct ptlrpc_request	*req;
+	int			 rc, count;
+	struct mdc_swap_layouts *msl, *payload;
+	ENTRY;
+
+	msl = op_data->op_data;
+
+	/* When the MDT will get the MDS_SWAP_LAYOUTS RPC the
+	 * first thing it will do is to cancel the 2 layout
+	 * locks hold by this client.
+	 * So the client must cancel its layout locks on the 2 fids
+	 * with the request RPC to avoid extra RPC round trips
+	 */
+	count = mdc_resource_get_unused(exp, &op_data->op_fid1, &cancels,
+					LCK_CR, MDS_INODELOCK_LAYOUT);
+	count += mdc_resource_get_unused(exp, &op_data->op_fid2, &cancels,
+					 LCK_CR, MDS_INODELOCK_LAYOUT);
+
+	req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+				   &RQF_MDS_SWAP_LAYOUTS);
+	if (req == NULL) {
+		/* Give back the locks collected for cancellation */
+		ldlm_lock_list_put(&cancels, l_bl_ast, count);
+		RETURN(-ENOMEM);
+	}
+
+	/* Capability sizes must be set before the request is packed */
+	mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+	mdc_set_capa_size(req, &RMF_CAPA2, op_data->op_capa2);
+
+	rc = mdc_prep_elc_req(exp, req, MDS_SWAP_LAYOUTS, &cancels, count);
+	if (rc) {
+		ptlrpc_request_free(req);
+		RETURN(rc);
+	}
+
+	mdc_swap_layouts_pack(req, op_data);
+
+	payload = req_capsule_client_get(&req->rq_pill, &RMF_SWAP_LAYOUTS);
+	LASSERT(payload);
+
+	*payload = *msl;
+
+	ptlrpc_request_set_replen(req);
+
+	rc = ptlrpc_queue_wait(req);
+	if (rc)
+		GOTO(out, rc);
+	EXIT;
+
+out:
+	ptlrpc_req_finished(req);
+	return rc;
+}
+
+/**
+ * ioctl dispatcher for the MDC device.
+ *
+ * A module reference is held for the duration of the call: every path that
+ * runs after try_module_get() succeeded must leave through the "out" label
+ * so that the reference is dropped again.
+ *
+ * @param cmd  ioctl command
+ * @param exp  export the ioctl was issued on
+ * @param len  not used by this function
+ * @param karg kernel copy of the ioctl argument; its type depends on \a cmd
+ * @param uarg user-space pointer, only used by LL_IOC_GET_CONNECT_FLAGS
+ * @return 0 on success, negative errno on failure, -ENOTTY for an unknown
+ *         \a cmd, -EINVAL if the module reference could not be taken
+ */
+static int mdc_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
+			 void *karg, void *uarg)
+{
+	struct obd_device *obd = exp->exp_obd;
+	struct obd_ioctl_data *data = karg;
+	struct obd_import *imp = obd->u.cli.cl_import;
+	struct llog_ctxt *ctxt;
+	int rc;
+	ENTRY;
+
+	if (!try_module_get(THIS_MODULE)) {
+		CERROR("Can't get module. Is it alive?\n");
+		return -EINVAL;
+	}
+	switch (cmd) {
+	case OBD_IOC_CHANGELOG_SEND:
+		rc = mdc_ioc_changelog_send(obd, karg);
+		GOTO(out, rc);
+	case OBD_IOC_CHANGELOG_CLEAR: {
+		struct ioc_changelog *icc = karg;
+		struct changelog_setinfo cs =
+			{.cs_recno = icc->icc_recno, .cs_id = icc->icc_id};
+		rc = obd_set_info_async(NULL, exp, strlen(KEY_CHANGELOG_CLEAR),
+					KEY_CHANGELOG_CLEAR, sizeof(cs), &cs,
+					NULL);
+		GOTO(out, rc);
+	}
+	case OBD_IOC_FID2PATH:
+		rc = mdc_ioc_fid2path(exp, karg);
+		GOTO(out, rc);
+	case LL_IOC_HSM_CT_START:
+		rc = mdc_ioc_hsm_ct_start(exp, karg);
+		GOTO(out, rc);
+	case LL_IOC_HSM_PROGRESS:
+		rc = mdc_ioc_hsm_progress(exp, karg);
+		GOTO(out, rc);
+	case LL_IOC_HSM_STATE_GET:
+		rc = mdc_ioc_hsm_state_get(exp, karg);
+		GOTO(out, rc);
+	case LL_IOC_HSM_STATE_SET:
+		rc = mdc_ioc_hsm_state_set(exp, karg);
+		/* Must not fall through: the next case would reinterpret
+		 * karg and overwrite rc. */
+		GOTO(out, rc);
+	case LL_IOC_HSM_ACTION:
+		rc = mdc_ioc_hsm_current_action(exp, karg);
+		GOTO(out, rc);
+	case LL_IOC_HSM_REQUEST:
+		rc = mdc_ioc_hsm_request(exp, karg);
+		GOTO(out, rc);
+	case OBD_IOC_CLIENT_RECOVER:
+		/* Any non-negative result is reported as plain success */
+		rc = ptlrpc_recover_import(imp, data->ioc_inlbuf1, 0);
+		if (rc < 0)
+			GOTO(out, rc);
+		GOTO(out, rc = 0);
+	case IOC_OSC_SET_ACTIVE:
+		rc = ptlrpc_set_import_active(imp, data->ioc_offset);
+		GOTO(out, rc);
+	case OBD_IOC_PARSE: {
+		ctxt = llog_get_context(exp->exp_obd, LLOG_CONFIG_REPL_CTXT);
+		rc = class_config_parse_llog(NULL, ctxt, data->ioc_inlbuf1,
+					     NULL);
+		llog_ctxt_put(ctxt);
+		GOTO(out, rc);
+	}
+	case OBD_IOC_LLOG_INFO:
+	case OBD_IOC_LLOG_PRINT: {
+		ctxt = llog_get_context(obd, LLOG_CONFIG_REPL_CTXT);
+		rc = llog_ioctl(NULL, ctxt, cmd, data);
+		llog_ctxt_put(ctxt);
+		GOTO(out, rc);
+	}
+	case OBD_IOC_POLL_QUOTACHECK:
+		rc = mdc_quota_poll_check(exp, (struct if_quotacheck *)karg);
+		GOTO(out, rc);
+	case OBD_IOC_PING_TARGET:
+		rc = ptlrpc_obd_ping(obd);
+		GOTO(out, rc);
+	/*
+	 * Normally IOC_OBD_STATFS, OBD_IOC_QUOTACTL iocontrol are handled by
+	 * LMV instead of MDC. But when the cluster is upgraded from 1.8,
+	 * there'd be no LMV layer thus we might be called here. Eventually
+	 * this code should be removed.
+	 * bz20731, LU-592.
+	 */
+	case IOC_OBD_STATFS: {
+		struct obd_statfs stat_buf = {0};
+
+		/* Only index 0 is accepted here */
+		if (*((__u32 *) data->ioc_inlbuf2) != 0)
+			GOTO(out, rc = -ENODEV);
+
+		/* copy UUID */
+		if (copy_to_user(data->ioc_pbuf2, obd2cli_tgt(obd),
+				     min((int) data->ioc_plen2,
+					 (int) sizeof(struct obd_uuid))))
+			GOTO(out, rc = -EFAULT);
+
+		rc = mdc_statfs(NULL, obd->obd_self_export, &stat_buf,
+				cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+				0);
+		if (rc != 0)
+			GOTO(out, rc);
+
+		if (copy_to_user(data->ioc_pbuf1, &stat_buf,
+				     min((int) data->ioc_plen1,
+					 (int) sizeof(stat_buf))))
+			GOTO(out, rc = -EFAULT);
+
+		GOTO(out, rc = 0);
+	}
+	case OBD_IOC_QUOTACTL: {
+		struct if_quotactl *qctl = karg;
+		struct obd_quotactl *oqctl;
+
+		OBD_ALLOC_PTR(oqctl);
+		/* Leave through "out" so the module reference is dropped */
+		if (!oqctl)
+			GOTO(out, rc = -ENOMEM);
+
+		QCTL_COPY(oqctl, qctl);
+		rc = obd_quotactl(exp, oqctl);
+		if (rc == 0) {
+			QCTL_COPY(qctl, oqctl);
+			qctl->qc_valid = QC_MDTIDX;
+			qctl->obd_uuid = obd->u.cli.cl_target_uuid;
+		}
+		OBD_FREE_PTR(oqctl);
+		break;
+	}
+	case LL_IOC_GET_CONNECT_FLAGS: {
+		if (copy_to_user(uarg,
+				     exp_connect_flags_ptr(exp),
+				     sizeof(__u64)))
+			GOTO(out, rc = -EFAULT);
+		else
+			GOTO(out, rc = 0);
+	}
+	case LL_IOC_LOV_SWAP_LAYOUTS: {
+		rc = mdc_ioc_swap_layouts(exp, karg);
+		break;
+	}
+	default:
+		CERROR("mdc_ioctl(): unrecognised ioctl %#x\n", cmd);
+		GOTO(out, rc = -ENOTTY);
+	}
+out:
+	module_put(THIS_MODULE);
+
+	return rc;
+}
+
+/**
+ * Fetch the value for \a key from the server with an MDS_GET_INFO RPC.
+ *
+ * @param exp    export used to reach the server
+ * @param keylen length of \a key in bytes
+ * @param key    key sent to the server
+ * @param vallen size of \a val; also sent to the server as the expected
+ *               value length
+ * @param val    buffer receiving \a vallen bytes of the reply value
+ * @return 0 on success; -EREMOTE when the server returned a partial result
+ *         (\a val is still filled in); -ENOMEM; -EPROTO when the reply has
+ *         no value buffer; or the RPC error
+ */
+int mdc_get_info_rpc(struct obd_export *exp,
+		     obd_count keylen, void *key,
+		     int vallen, void *val)
+{
+	struct obd_import      *imp = class_exp2cliimp(exp);
+	struct ptlrpc_request  *req;
+	char		   *tmp;
+	int		     rc = -EINVAL;
+	ENTRY;
+
+	req = ptlrpc_request_alloc(imp, &RQF_MDS_GET_INFO);
+	if (req == NULL)
+		RETURN(-ENOMEM);
+
+	/* Variable-sized buffers must be sized before the request is packed */
+	req_capsule_set_size(&req->rq_pill, &RMF_GETINFO_KEY,
+			     RCL_CLIENT, keylen);
+	req_capsule_set_size(&req->rq_pill, &RMF_GETINFO_VALLEN,
+			     RCL_CLIENT, sizeof(__u32));
+
+	rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_GET_INFO);
+	if (rc) {
+		ptlrpc_request_free(req);
+		RETURN(rc);
+	}
+
+	tmp = req_capsule_client_get(&req->rq_pill, &RMF_GETINFO_KEY);
+	memcpy(tmp, key, keylen);
+	tmp = req_capsule_client_get(&req->rq_pill, &RMF_GETINFO_VALLEN);
+	memcpy(tmp, &vallen, sizeof(__u32));
+
+	req_capsule_set_size(&req->rq_pill, &RMF_GETINFO_VAL,
+			     RCL_SERVER, vallen);
+	ptlrpc_request_set_replen(req);
+
+	rc = ptlrpc_queue_wait(req);
+	/* -EREMOTE means the get_info result is partial, and it needs to
+	 * continue on another MDT, see fid2path part in lmv_iocontrol */
+	if (rc == 0 || rc == -EREMOTE) {
+		tmp = req_capsule_server_get(&req->rq_pill, &RMF_GETINFO_VAL);
+		if (tmp == NULL) {
+			/* Reply carries no value buffer: do not copy from
+			 * NULL, report a protocol error instead. */
+			rc = -EPROTO;
+		} else {
+			memcpy(val, tmp, vallen);
+			if (ptlrpc_rep_need_swab(req)) {
+				if (KEY_IS(KEY_FID2PATH))
+					lustre_swab_fid2path(val);
+			}
+		}
+	}
+	ptlrpc_req_finished(req);
+
+	RETURN(rc);
+}
+
+/* Byte-swap every field of one hsm_action_item in place. */
+static void lustre_swab_hai(struct hsm_action_item *h)
+{
+	/* 32-bit fields */
+	__swab32s(&h->hai_len);
+	__swab32s(&h->hai_action);
+
+	/* 64-bit fields */
+	__swab64s(&h->hai_cookie);
+	__swab64s(&h->hai_gid);
+	__swab64s(&h->hai_extent.offset);
+	__swab64s(&h->hai_extent.length);
+
+	/* embedded fids */
+	lustre_swab_lu_fid(&h->hai_fid);
+	lustre_swab_lu_fid(&h->hai_dfid);
+}
+
+/*
+ * Byte-swap an hsm_action_list in place: the list header first, then each
+ * of its hal_count action items.  Every item is swabbed before hai_next()
+ * is applied to it.
+ */
+static void lustre_swab_hal(struct hsm_action_list *h)
+{
+	struct hsm_action_item	*item;
+	int			 idx;
+
+	__swab32s(&h->hal_version);
+	__swab32s(&h->hal_count);
+	__swab32s(&h->hal_archive_id);
+	__swab64s(&h->hal_flags);
+
+	/* hal_count is already in host order at this point */
+	for (idx = 0, item = hai_zero(h); idx < h->hal_count;
+	     idx++, item = hai_next(item))
+		lustre_swab_hai(item);
+}
+
+/* Byte-swap the 16-bit fields of a KUC message header in place. */
+static void lustre_swab_kuch(struct kuc_hdr *l)
+{
+	/* kuc_transport is a single byte (__u8) and needs no swabbing */
+	__swab16s(&l->kuc_msglen);
+	__swab16s(&l->kuc_msgtype);
+	__swab16s(&l->kuc_magic);
+}
+
+/**
+ * Add a copytool to, or remove it from, the KUC_GRP_HSM group and register
+ * or unregister it with the server accordingly.
+ *
+ * With LK_FLG_STOP set in lk->lk_flags the uid is removed from the group
+ * and, if that worked, an unregister RPC is sent.  Otherwise the file behind
+ * lk->lk_wfd is added to the group and, if that worked, a register RPC
+ * carrying lk->lk_data as archive mask is sent.
+ *
+ * @param exp export used to reach the server
+ * @param lk  kernel communication descriptor from user space
+ * @return 0 on success; -EINVAL for a group other than KUC_GRP_HSM;
+ *         otherwise the error of the group operation or of the RPC
+ */
+static int mdc_ioc_hsm_ct_start(struct obd_export *exp,
+				struct lustre_kernelcomm *lk)
+{
+	struct obd_import  *imp = class_exp2cliimp(exp);
+	__u32		    archive = lk->lk_data;
+	struct file	   *fp;
+	int		    rc;
+
+	if (lk->lk_group != KUC_GRP_HSM) {
+		CERROR("Bad copytool group %d\n", lk->lk_group);
+		return -EINVAL;
+	}
+
+	CDEBUG(D_HSM, "CT start r%d w%d u%d g%d f%#x\n", lk->lk_rfd, lk->lk_wfd,
+	       lk->lk_uid, lk->lk_group, lk->lk_flags);
+
+	if (lk->lk_flags & LK_FLG_STOP) {
+		rc = libcfs_kkuc_group_rem(lk->lk_uid, lk->lk_group);
+		if (rc != 0)
+			return rc;
+		/* Unregister with the coordinator */
+		return mdc_ioc_hsm_ct_unregister(imp);
+	}
+
+	fp = fget(lk->lk_wfd);
+	rc = libcfs_kkuc_group_add(fp, lk->lk_uid, lk->lk_group,
+				   lk->lk_data);
+	if (rc != 0) {
+		/* The group did not take the file: drop our reference */
+		if (fp)
+			fput(fp);
+		return rc;
+	}
+
+	return mdc_ioc_hsm_ct_register(imp, archive);
+}
+
+/**
+ * Send a message to any listening copytools
+ *
+ * The message is validated (minimum length, KUC magic) and byte-swapped in
+ * place when its magic shows the opposite byte order, then broadcast to the
+ * KUC_GRP_HSM group.
+ *
+ * NOTE(review): \a len is a signed int compared against a sizeof()
+ * expression, so a negative value would pass the length check -- confirm
+ * callers never pass one.
+ *
+ * @param val KUC message (kuc_hdr + hsm_action_list)
+ * @param len total length of message
+ * @return result of libcfs_kkuc_group_put(), or -EPROTO for a short message
+ *         or a bad magic
+ */
+static int mdc_hsm_copytool_send(int len, void *val)
+{
+	struct kuc_hdr		*lh = (struct kuc_hdr *)val;
+	struct hsm_action_list	*hal = (struct hsm_action_list *)(lh + 1);
+	int			 rc;
+	ENTRY;
+
+	if (len < sizeof(*lh) + sizeof(*hal)) {
+		CERROR("Short HSM message %d < %d\n", len,
+		       (int) (sizeof(*lh) + sizeof(*hal)));
+		RETURN(-EPROTO);
+	}
+	/* A byte-swapped magic means the message has the opposite byte order */
+	if (lh->kuc_magic == __swab16(KUC_MAGIC)) {
+		lustre_swab_kuch(lh);
+		lustre_swab_hal(hal);
+	} else if (lh->kuc_magic != KUC_MAGIC) {
+		CERROR("Bad magic %x!=%x\n", lh->kuc_magic, KUC_MAGIC);
+		RETURN(-EPROTO);
+	}
+
+	CDEBUG(D_HSM, " Received message mg=%x t=%d m=%d l=%d actions=%d "
+	       "on %s\n",
+	       lh->kuc_magic, lh->kuc_transport, lh->kuc_msgtype,
+	       lh->kuc_msglen, hal->hal_count, hal->hal_fsname);
+
+	/* Broadcast to HSM listeners */
+	rc = libcfs_kkuc_group_put(KUC_GRP_HSM, lh);
+
+	RETURN(rc);
+}
+
+/**
+ * Callback handed to libcfs_kkuc_group_foreach(): send the registration RPC
+ * for one copytool again.
+ * @param data archive id served by the copytool
+ * @param cb_arg callback argument (obd_import)
+ * @return 0 on success or when the copytool is already registered
+ *         (-EEXIST), any other registration error unchanged
+ */
+static int mdc_hsm_ct_reregister(__u32 data, void *cb_arg)
+{
+	struct obd_import	*imp = cb_arg;
+	__u32			 archive = data;
+	int			 rc;
+
+	CDEBUG(D_HA, "recover copytool registration to MDT (archive=%#x)\n",
+	       archive);
+
+	rc = mdc_ioc_hsm_ct_register(imp, archive);
+	/* ignore error if the copytool is already registered */
+	if (rc == -EEXIST)
+		rc = 0;
+
+	return rc;
+}
+
+/**
+ * Re-register every member of the KUC_GRP_HSM group on \a imp by running
+ * mdc_hsm_ct_reregister() for each of them.
+ *
+ * @param imp import passed to the callback as its argument
+ * @return result of libcfs_kkuc_group_foreach()
+ */
+static int mdc_kuc_reregister(struct obd_import *imp)
+{
+	int rc;
+
+	/* re-register HSM agents */
+	rc = libcfs_kkuc_group_foreach(KUC_GRP_HSM, mdc_hsm_ct_reregister,
+				       imp);
+	return rc;
+}
+
+/**
+ * Handle a set_info request on the MDC, dispatching on \a key.
+ *
+ * KEY_READ_ONLY and KEY_CHANGELOG_CLEAR are forwarded to the server with
+ * do_set_info_async(); KEY_SPTLRPC_CONF, KEY_FLUSH_CTX, KEY_MDS_CONN and
+ * KEY_HSM_COPYTOOL_SEND are handled locally.
+ *
+ * @param env    unused here
+ * @param exp    export the request was issued on
+ * @param keylen length of \a key
+ * @param key    key naming the setting
+ * @param vallen length of \a val
+ * @param val    value for the setting
+ * @param set    request set handed to do_set_info_async()
+ * @return 0 on success, -EINVAL for an unknown key or a bad value length,
+ *         otherwise the error of the selected handler
+ */
+int mdc_set_info_async(const struct lu_env *env,
+		       struct obd_export *exp,
+		       obd_count keylen, void *key,
+		       obd_count vallen, void *val,
+		       struct ptlrpc_request_set *set)
+{
+	struct obd_import	*imp = class_exp2cliimp(exp);
+	int			 rc;
+	ENTRY;
+
+	if (KEY_IS(KEY_READ_ONLY)) {
+		if (vallen != sizeof(int))
+			RETURN(-EINVAL);
+
+		/* Mirror the new setting in the import's connect flags before
+		 * telling the server */
+		spin_lock(&imp->imp_lock);
+		if (*((int *)val)) {
+			imp->imp_connect_flags_orig |= OBD_CONNECT_RDONLY;
+			imp->imp_connect_data.ocd_connect_flags |=
+							OBD_CONNECT_RDONLY;
+		} else {
+			imp->imp_connect_flags_orig &= ~OBD_CONNECT_RDONLY;
+			imp->imp_connect_data.ocd_connect_flags &=
+							~OBD_CONNECT_RDONLY;
+		}
+		spin_unlock(&imp->imp_lock);
+
+		rc = do_set_info_async(imp, MDS_SET_INFO, LUSTRE_MDS_VERSION,
+				       keylen, key, vallen, val, set);
+		RETURN(rc);
+	}
+	if (KEY_IS(KEY_SPTLRPC_CONF)) {
+		sptlrpc_conf_client_adapt(exp->exp_obd);
+		RETURN(0);
+	}
+	if (KEY_IS(KEY_FLUSH_CTX)) {
+		sptlrpc_import_flush_my_ctx(imp);
+		RETURN(0);
+	}
+	if (KEY_IS(KEY_MDS_CONN)) {
+		/* mds-mds import */
+		spin_lock(&imp->imp_lock);
+		imp->imp_server_timeout = 1;
+		spin_unlock(&imp->imp_lock);
+		imp->imp_client->cli_request_portal = MDS_MDS_PORTAL;
+		CDEBUG(D_OTHER, "%s: timeout / 2\n", exp->exp_obd->obd_name);
+		RETURN(0);
+	}
+	if (KEY_IS(KEY_CHANGELOG_CLEAR)) {
+		rc = do_set_info_async(imp, MDS_SET_INFO, LUSTRE_MDS_VERSION,
+				       keylen, key, vallen, val, set);
+		RETURN(rc);
+	}
+	if (KEY_IS(KEY_HSM_COPYTOOL_SEND)) {
+		rc = mdc_hsm_copytool_send(vallen, val);
+		RETURN(rc);
+	}
+
+	CERROR("Unknown key %s\n", (char *)key);
+	RETURN(-EINVAL);
+}
+
+/**
+ * Answer a get_info request, locally where possible.
+ *
+ * KEY_MAX_EASIZE, KEY_CONN_DATA and KEY_TGT_COUNT are served from client
+ * state; every other key is forwarded to the server through
+ * mdc_get_info_rpc().
+ *
+ * @param env    unused here
+ * @param exp    export the request was issued on
+ * @param keylen length of \a key
+ * @param key    key naming the value wanted
+ * @param vallen in: size of \a val
+ * @param val    value buffer; for KEY_MAX_EASIZE it is also an input that
+ *               can raise the cached maximum
+ * @param lsm    unused here
+ * @return 0 on success, -EINVAL on a size mismatch, otherwise the result of
+ *         mdc_get_info_rpc()
+ */
+int mdc_get_info(const struct lu_env *env, struct obd_export *exp,
+		 __u32 keylen, void *key, __u32 *vallen, void *val,
+		 struct lov_stripe_md *lsm)
+{
+	int rc = -EINVAL;
+
+	if (KEY_IS(KEY_MAX_EASIZE)) {
+		struct client_obd *cli = &exp->exp_obd->u.cli;
+		int *easize = val;
+
+		if (*vallen != sizeof(int))
+			RETURN(-EINVAL);
+
+		/* Grow the cached maximum if the caller knows a larger one,
+		 * then report the (possibly updated) maximum back. */
+		if (*easize > cli->cl_max_mds_easize)
+			cli->cl_max_mds_easize = *easize;
+		*easize = cli->cl_max_mds_easize;
+		RETURN(0);
+	}
+
+	if (KEY_IS(KEY_CONN_DATA)) {
+		struct obd_connect_data *ocd = val;
+		struct obd_import *imp = class_exp2cliimp(exp);
+
+		if (*vallen != sizeof(*ocd))
+			RETURN(-EINVAL);
+
+		*ocd = imp->imp_connect_data;
+		RETURN(0);
+	}
+
+	if (KEY_IS(KEY_TGT_COUNT)) {
+		/* The target count reported here is always 1 */
+		*((int *)val) = 1;
+		RETURN(0);
+	}
+
+	/* Anything else has to be asked from the server */
+	rc = mdc_get_info_rpc(exp, keylen, key, *vallen, val);
+
+	RETURN(rc);
+}
+
+/**
+ * Send an MDS_PIN request for \a fid and fill in \a handle from the reply.
+ *
+ * On success the file handle from the reply body is stored in
+ * handle->och_fh, a new md_open_data is allocated into handle->och_mod and
+ * the request is kept referenced through och_mod->mod_open_req; mdc_unpin()
+ * releases it.
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure, -EPROTO when the
+ * reply carries no mdt_body, or the error from packing/sending the request.
+ */
+static int mdc_pin(struct obd_export *exp, const struct lu_fid *fid,
+		   struct obd_capa *oc, struct obd_client_handle *handle,
+		   int flags)
+{
+	struct ptlrpc_request *req;
+	struct mdt_body       *body;
+	int		    rc;
+	ENTRY;
+
+	req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_PIN);
+	if (req == NULL)
+		RETURN(-ENOMEM);
+
+	mdc_set_capa_size(req, &RMF_CAPA1, oc);
+
+	rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_PIN);
+	if (rc) {
+		/* not packed yet: release with ptlrpc_request_free() */
+		ptlrpc_request_free(req);
+		RETURN(rc);
+	}
+
+	mdc_pack_body(req, fid, oc, 0, 0, -1, flags);
+
+	ptlrpc_request_set_replen(req);
+
+	/* the send is bracketed by the client's cl_rpc_lock */
+	mdc_get_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL);
+	rc = ptlrpc_queue_wait(req);
+	mdc_put_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL);
+	if (rc) {
+		CERROR("Pin failed: %d\n", rc);
+		GOTO(err_out, rc);
+	}
+
+	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+	if (body == NULL)
+		GOTO(err_out, rc = -EPROTO);
+
+	handle->och_fh = body->handle;
+	handle->och_magic = OBD_CLIENT_HANDLE_MAGIC;
+
+	handle->och_mod = obd_mod_alloc();
+	if (handle->och_mod == NULL) {
+		DEBUG_REQ(D_ERROR, req, "can't allocate md_open_data");
+		GOTO(err_out, rc = -ENOMEM);
+	}
+	handle->och_mod->mod_open_req = req; /* will be dropped by unpin */
+
+	RETURN(0);
+
+err_out:
+	/* every failure after the send drops the request reference here */
+	ptlrpc_req_finished(req);
+	RETURN(rc);
+}
+
+/**
+ * Send an MDS_UNPIN request for the handle obtained from mdc_pin() and
+ * release what mdc_pin() left attached to it.
+ *
+ * Whatever the RPC result, the pin request held in och_mod->mod_open_req is
+ * finished and och_mod is put.
+ *
+ * Returns -ENOMEM if the request cannot be allocated, otherwise the result
+ * of ptlrpc_queue_wait().
+ */
+static int mdc_unpin(struct obd_export *exp, struct obd_client_handle *handle,
+		     int flag)
+{
+	struct ptlrpc_request *req;
+	struct mdt_body       *body;
+	int		    rc;
+	ENTRY;
+
+	req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), &RQF_MDS_UNPIN,
+					LUSTRE_MDS_VERSION, MDS_UNPIN);
+	if (req == NULL)
+		RETURN(-ENOMEM);
+
+	/* NOTE(review): body is used without a NULL check - confirm that
+	 * req_capsule_client_get() cannot fail for a freshly packed request */
+	body = req_capsule_client_get(&req->rq_pill, &RMF_MDT_BODY);
+	body->handle = handle->och_fh;
+	body->flags = flag;
+
+	ptlrpc_request_set_replen(req);
+
+	/* the send is bracketed by the client's cl_rpc_lock */
+	mdc_get_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL);
+	rc = ptlrpc_queue_wait(req);
+	mdc_put_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL);
+
+	if (rc != 0)
+		CERROR("Unpin failed: %d\n", rc);
+
+	/* drop the unpin request, then the pin request saved by mdc_pin() */
+	ptlrpc_req_finished(req);
+	ptlrpc_req_finished(handle->och_mod->mod_open_req);
+
+	obd_mod_put(handle->och_mod);
+	RETURN(rc);
+}
+
+/**
+ * Send a synchronous MDS_SYNC request for \a fid.
+ *
+ * On success the completed request is handed back through \a request and the
+ * caller owns its reference; on any failure *request stays NULL.
+ */
+int mdc_sync(struct obd_export *exp, const struct lu_fid *fid,
+	     struct obd_capa *oc, struct ptlrpc_request **request)
+{
+	struct ptlrpc_request *sync_req;
+	int rc;
+	ENTRY;
+
+	*request = NULL;
+
+	sync_req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_SYNC);
+	if (sync_req == NULL)
+		RETURN(-ENOMEM);
+
+	mdc_set_capa_size(sync_req, &RMF_CAPA1, oc);
+
+	rc = ptlrpc_request_pack(sync_req, LUSTRE_MDS_VERSION, MDS_SYNC);
+	if (rc != 0) {
+		/* never packed, so it is freed rather than "finished" */
+		ptlrpc_request_free(sync_req);
+		RETURN(rc);
+	}
+
+	mdc_pack_body(sync_req, fid, oc, 0, 0, -1, 0);
+	ptlrpc_request_set_replen(sync_req);
+
+	rc = ptlrpc_queue_wait(sync_req);
+	if (rc != 0) {
+		ptlrpc_req_finished(sync_req);
+		RETURN(rc);
+	}
+
+	*request = sync_req;
+	RETURN(rc);
+}
+
+/**
+ * React to a state-change event on the MDC import.
+ *
+ * INACTIVE flushes the client sequence (if any) and notifies the observer;
+ * INVALIDATE cleans up the local lock namespace; ACTIVE notifies the
+ * observer and then re-registers the KUC contexts; OCD notifies the
+ * observer; DISCON, DEACTIVATE and ACTIVATE need no action here.  Any other
+ * event is a bug.
+ *
+ * Returns 0 or the error from the notification / re-registration.
+ */
+static int mdc_import_event(struct obd_device *obd, struct obd_import *imp,
+			    enum obd_import_event event)
+{
+	int rc = 0;
+	ENTRY;	/* pairs with the RETURN() at the end */
+
+	LASSERT(imp->imp_obd == obd);
+
+	switch (event) {
+	case IMP_EVENT_DISCON: {
+#if 0
+		/* XXX Pass event up to OBDs stack. used only for FLD now */
+		rc = obd_notify_observer(obd, obd, OBD_NOTIFY_DISCON, NULL);
+#endif
+		break;
+	}
+	case IMP_EVENT_INACTIVE: {
+		struct client_obd *cli = &obd->u.cli;
+		/*
+		 * Flush current sequence to make client obtain new one
+		 * from server in case of disconnect/reconnect.
+		 */
+		if (cli->cl_seq != NULL)
+			seq_client_flush(cli->cl_seq);
+
+		rc = obd_notify_observer(obd, obd, OBD_NOTIFY_INACTIVE, NULL);
+		break;
+	}
+	case IMP_EVENT_INVALIDATE: {
+		struct ldlm_namespace *ns = obd->obd_namespace;
+
+		ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY);
+
+		break;
+	}
+	case IMP_EVENT_ACTIVE:
+		rc = obd_notify_observer(obd, obd, OBD_NOTIFY_ACTIVE, NULL);
+		/* re-establish kuc registration after reconnecting */
+		if (rc == 0)
+			rc = mdc_kuc_reregister(imp);
+		break;
+	case IMP_EVENT_OCD:
+		rc = obd_notify_observer(obd, obd, OBD_NOTIFY_OCD, NULL);
+		break;
+	case IMP_EVENT_DEACTIVATE:
+	case IMP_EVENT_ACTIVATE:
+		break;
+	default:
+		CERROR("Unknown import event %x\n", event);
+		LBUG();
+	}
+	RETURN(rc);
+}
+
+/**
+ * Allocate a new FID into \a fid from this client's sequence (cl_seq).
+ * \a op_data is not used.  Returns the result of seq_client_alloc_fid().
+ */
+int mdc_fid_alloc(struct obd_export *exp, struct lu_fid *fid,
+		  struct md_op_data *op_data)
+{
+	struct lu_client_seq *client_seq = exp->exp_obd->u.cli.cl_seq;
+	int rc;
+	ENTRY;
+
+	rc = seq_client_alloc_fid(NULL, client_seq, fid);
+	RETURN(rc);
+}
+
+/* Return a pointer to the target UUID stored in the export's client obd. */
+struct obd_uuid *mdc_get_uuid(struct obd_export *exp)
+{
+	return &exp->exp_obd->u.cli.cl_target_uuid;
+}
+
+/**
+ * Decide whether a lock may be canceled instead of being replayed during
+ * recovery.
+ *
+ * Returns non-zero if the lock can be canceled, zero if it must be kept:
+ * only LDLM_IBITS locks that do not carry MDS_INODELOCK_OPEN are cancelable.
+ */
+static int mdc_cancel_for_recovery(struct ldlm_lock *lock)
+{
+	ENTRY;	/* pairs with the RETURN() calls below */
+
+	if (lock->l_resource->lr_type != LDLM_IBITS)
+		RETURN(0);
+
+	/* FIXME: if we ever get into a situation where there are too many
+	 * opened files with open locks on a single node, then we really
+	 * should replay these open locks to reget it */
+	if (lock->l_policy_data.l_inodebits.bits & MDS_INODELOCK_OPEN)
+		RETURN(0);
+
+	RETURN(1);
+}
+
+/* lvbo_free hook: forget the inode pointer attached to the resource, if any.
+ * Nothing is released here, only the pointer is cleared.  Always returns 0. */
+static int mdc_resource_inode_free(struct ldlm_resource *res)
+{
+	if (res->lr_lvb_inode != NULL)
+		res->lr_lvb_inode = NULL;
+	return 0;
+}
+
+/* Value-block operations installed on the MDC namespace by mdc_setup();
+ * only the free hook is provided.  Uses the standard C99 designated
+ * initializer instead of the obsolete GNU "field:" form. */
+struct ldlm_valblock_ops inode_lvbo = {
+	.lvbo_free = mdc_resource_inode_free,
+};
+
+/**
+ * Set up an MDC obd device.
+ *
+ * Allocates and initialises the two RPC locks (cl_rpc_lock, cl_close_lock),
+ * takes a ptlrpcd reference, runs the generic client setup, registers the
+ * procfs entries, installs the recovery-cancel callback and the inode value
+ * block ops on the namespace, and finally initialises llog.
+ *
+ * Returns 0 on success or a negative errno; on failure everything acquired
+ * so far is released again.
+ */
+static int mdc_setup(struct obd_device *obd, struct lustre_cfg *cfg)
+{
+	struct client_obd *cli = &obd->u.cli;
+	struct lprocfs_static_vars lvars = { 0 };
+	int rc;
+	ENTRY;
+
+	OBD_ALLOC(cli->cl_rpc_lock, sizeof (*cli->cl_rpc_lock));
+	if (!cli->cl_rpc_lock)
+		RETURN(-ENOMEM);
+	mdc_init_rpc_lock(cli->cl_rpc_lock);
+
+	/* NOTE(review): the result of ptlrpcd_addref() is not examined here -
+	 * confirm whether it can fail */
+	ptlrpcd_addref();
+
+	OBD_ALLOC(cli->cl_close_lock, sizeof (*cli->cl_close_lock));
+	if (!cli->cl_close_lock)
+		GOTO(err_rpc_lock, rc = -ENOMEM);
+	mdc_init_rpc_lock(cli->cl_close_lock);
+
+	rc = client_obd_setup(obd, cfg);
+	if (rc)
+		GOTO(err_close_lock, rc);
+	lprocfs_mdc_init_vars(&lvars);
+	lprocfs_obd_setup(obd, lvars.obd_vars);
+	sptlrpc_lprocfs_cliobd_attach(obd);
+	ptlrpc_lprocfs_register_obd(obd);
+
+	ns_register_cancel(obd->obd_namespace, mdc_cancel_for_recovery);
+
+	obd->obd_namespace->ns_lvbo = &inode_lvbo;
+
+	rc = obd_llog_init(obd, &obd->obd_olg, obd, NULL);
+	if (rc) {
+		/* past client_obd_setup(): mdc_cleanup() frees both locks,
+		 * drops the ptlrpcd reference and cleans up the client obd,
+		 * so the labels below must not be used from here */
+		mdc_cleanup(obd);
+		CERROR("failed to setup llogging subsystems\n");
+	}
+
+	RETURN(rc);
+
+	/* unwind in reverse order of acquisition */
+err_close_lock:
+	OBD_FREE(cli->cl_close_lock, sizeof (*cli->cl_close_lock));
+err_rpc_lock:
+	OBD_FREE(cli->cl_rpc_lock, sizeof (*cli->cl_rpc_lock));
+	ptlrpcd_decref();
+	RETURN(rc);
+}
+
+/* Raise the cached default and maximum LOV EA sizes and the maximum cookie
+ * size to at least the values supplied.  Keeping them cached lets MDS RPCs
+ * be built with reply buffers large enough for the maximum-sized (= maximum
+ * striped) EA and cookie without asking the LOV + OSCs on every RPC. */
+static int mdc_init_ea_size(struct obd_export *exp, int easize,
+		     int def_easize, int cookiesize)
+{
+	struct client_obd *cli = &exp->exp_obd->u.cli;
+	ENTRY;
+
+	/* each cached limit only ever grows */
+	if (easize > cli->cl_max_mds_easize)
+		cli->cl_max_mds_easize = easize;
+
+	if (def_easize > cli->cl_default_mds_easize)
+		cli->cl_default_mds_easize = def_easize;
+
+	if (cookiesize > cli->cl_max_mds_cookiesize)
+		cli->cl_max_mds_cookiesize = cookiesize;
+
+	RETURN(0);
+}
+
+/**
+ * Pre-cleanup hook of the MDC device.
+ *
+ * Nothing is done for OBD_CLEANUP_EARLY.  For OBD_CLEANUP_EXPORTS the client
+ * import is cleaned up, the procfs entries are removed and llog is shut
+ * down.  Stages not listed in the switch are ignored.
+ *
+ * Returns 0, or the error from obd_llog_finish().
+ */
+static int mdc_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage)
+{
+	int rc = 0;
+	ENTRY;
+
+	switch (stage) {
+	case OBD_CLEANUP_EARLY:
+		break;
+	case OBD_CLEANUP_EXPORTS:
+		/* Failsafe, ok if racy */
+		/* when the type refcount is at most 1, remove the HSM KUC
+		 * group entry identified by 0 */
+		if (obd->obd_type->typ_refcnt <= 1)
+			libcfs_kkuc_group_rem(0, KUC_GRP_HSM);
+
+		obd_cleanup_client_import(obd);
+		ptlrpc_lprocfs_unregister_obd(obd);
+		lprocfs_obd_cleanup(obd);
+
+		rc = obd_llog_finish(obd, 0);
+		if (rc != 0)
+			CERROR("failed to cleanup llogging subsystems\n");
+		break;
+	}
+	RETURN(rc);
+}
+
+/**
+ * Final cleanup of the MDC device: free the two RPC locks allocated by
+ * mdc_setup(), drop the ptlrpcd reference taken there, and finish with the
+ * generic client cleanup, whose result is returned.
+ */
+static int mdc_cleanup(struct obd_device *obd)
+{
+	struct client_obd *cli = &obd->u.cli;
+
+	OBD_FREE(cli->cl_rpc_lock, sizeof (*cli->cl_rpc_lock));
+	OBD_FREE(cli->cl_close_lock, sizeof (*cli->cl_close_lock));
+
+	ptlrpcd_decref();
+
+	return client_obd_cleanup(obd);
+}
+
+
+/**
+ * Set up the LLOG_CHANGELOG_REPL_CTXT llog context of the MDC with
+ * llog_client_ops and connect its initiator.
+ *
+ * \a olg must be the device's own obd_olg; \a index is not used.
+ * Returns 0 on success or the error from llog_setup().
+ */
+static int mdc_llog_init(struct obd_device *obd, struct obd_llog_group *olg,
+			 struct obd_device *tgt, int *index)
+{
+	struct llog_ctxt	*ctxt;
+	int			 rc;
+
+	ENTRY;
+
+	LASSERT(olg == &obd->obd_olg);
+
+	rc = llog_setup(NULL, obd, olg, LLOG_CHANGELOG_REPL_CTXT, tgt,
+			&llog_client_ops);
+	if (rc)
+		RETURN(rc);
+
+	/* take the context just set up, connect it, and drop it again */
+	ctxt = llog_group_get_ctxt(olg, LLOG_CHANGELOG_REPL_CTXT);
+	llog_initiator_connect(ctxt);
+	llog_ctxt_put(ctxt);
+
+	RETURN(0);
+}
+
+/* Tear down the LLOG_CHANGELOG_REPL_CTXT context if the device has one.
+ * \a count is not used.  Always returns 0. */
+static int mdc_llog_finish(struct obd_device *obd, int count)
+{
+	struct llog_ctxt *changelog_ctxt;
+	ENTRY;
+
+	changelog_ctxt = llog_get_context(obd, LLOG_CHANGELOG_REPL_CTXT);
+	if (changelog_ctxt != NULL)
+		llog_cleanup(NULL, changelog_ctxt);
+
+	RETURN(0);
+}
+
+/**
+ * Apply a configuration record to the MDC device.
+ *
+ * Every command is treated as a proc parameter and passed to
+ * class_process_proc_param() with the MDC obd variables.  A positive result
+ * from it is reported as 0.  \a len is not used.
+ */
+static int mdc_process_config(struct obd_device *obd, obd_count len, void *buf)
+{
+	struct lprocfs_static_vars lvars = { 0 };
+	struct lustre_cfg *lcfg = buf;
+	int rc = 0;
+
+	lprocfs_mdc_init_vars(&lvars);
+
+	switch (lcfg->lcfg_command) {
+	default:
+		rc = class_process_proc_param(PARAM_MDC, lvars.obd_vars,
+					      lcfg, obd);
+		break;
+	}
+
+	/* only negative values are passed on as errors */
+	return rc > 0 ? 0 : rc;
+}
+
+
+/* get remote permission for current user on fid
+ *
+ * Sends a synchronous MDS_GETATTR request with OBD_MD_FLRMTPERM set and a
+ * reply ACL buffer sized for one struct mdt_remote_perm.  The export must
+ * belong to a remote client (asserted).
+ *
+ * On success the completed request is returned through \a request; on any
+ * failure *request stays NULL and a negative errno is returned.
+ */
+int mdc_get_remote_perm(struct obd_export *exp, const struct lu_fid *fid,
+			struct obd_capa *oc, __u32 suppgid,
+			struct ptlrpc_request **request)
+{
+	struct ptlrpc_request  *req;
+	int		    rc;
+	ENTRY;
+
+	LASSERT(client_is_remote(exp));
+
+	*request = NULL;
+	req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_GETATTR);
+	if (req == NULL)
+		RETURN(-ENOMEM);
+
+	mdc_set_capa_size(req, &RMF_CAPA1, oc);
+
+	rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_GETATTR);
+	if (rc) {
+		/* not packed yet: release with ptlrpc_request_free() */
+		ptlrpc_request_free(req);
+		RETURN(rc);
+	}
+
+	mdc_pack_body(req, fid, oc, OBD_MD_FLRMTPERM, 0, suppgid, 0);
+
+	/* reserve room in the reply for the remote permission record */
+	req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER,
+			     sizeof(struct mdt_remote_perm));
+
+	ptlrpc_request_set_replen(req);
+
+	rc = ptlrpc_queue_wait(req);
+	if (rc)
+		ptlrpc_req_finished(req);
+	else
+		*request = req;
+	RETURN(rc);
+}
+
+/**
+ * Reply interpreter for the request queued by mdc_renew_capa().
+ *
+ * Extracts the renewed capability from the reply and passes it to the
+ * callback stored in the request's async args.  The callback is always
+ * invoked: on failure it receives an ERR_PTR() carrying \a status, -EFAULT
+ * (missing body or capa buffer) or -ENOENT (OBD_MD_FLOSSCAPA not set in the
+ * reply).
+ *
+ * Always returns 0.
+ */
+static int mdc_interpret_renew_capa(const struct lu_env *env,
+				    struct ptlrpc_request *req, void *args,
+				    int status)
+{
+	struct mdc_renew_capa_args *ra = args;
+	struct mdt_body *body = NULL;
+	struct lustre_capa *capa;
+	ENTRY;
+
+	if (status)
+		GOTO(out, capa = ERR_PTR(status));
+
+	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+	if (body == NULL)
+		GOTO(out, capa = ERR_PTR(-EFAULT));
+
+	if ((body->valid & OBD_MD_FLOSSCAPA) == 0)
+		GOTO(out, capa = ERR_PTR(-ENOENT));
+
+	capa = req_capsule_server_get(&req->rq_pill, &RMF_CAPA2);
+	if (!capa)
+		GOTO(out, capa = ERR_PTR(-EFAULT));
+	EXIT;
+out:
+	/* success and failure paths both end in the caller's callback */
+	ra->ra_cb(ra->ra_oc, capa);
+	return 0;
+}
+
+/**
+ * Queue an asynchronous MDS_GETATTR request that renews capability \a oc.
+ *
+ * The request is handed to ptlrpcd; when the reply arrives
+ * mdc_interpret_renew_capa() calls \a cb with \a oc and the result.
+ *
+ * Returns 0 once the request is queued, or -ENOMEM if it cannot be
+ * allocated.
+ */
+static int mdc_renew_capa(struct obd_export *exp, struct obd_capa *oc,
+			  renew_capa_cb_t cb)
+{
+	struct ptlrpc_request *req;
+	struct mdc_renew_capa_args *ra;
+	ENTRY;
+
+	req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), &RQF_MDS_GETATTR,
+					LUSTRE_MDS_VERSION, MDS_GETATTR);
+	if (req == NULL)
+		RETURN(-ENOMEM);
+
+	/* NB, OBD_MD_FLOSSCAPA is set here, but it doesn't necessarily mean the
+	 * capa to renew is oss capa.
+	 */
+	mdc_pack_body(req, &oc->c_capa.lc_fid, oc, OBD_MD_FLOSSCAPA, 0, -1, 0);
+	ptlrpc_request_set_replen(req);
+
+	/* the callback arguments live in the request's async-args area, so
+	 * they must fit into it (checked at compile time) */
+	CLASSERT(sizeof(*ra) <= sizeof(req->rq_async_args));
+	ra = ptlrpc_req_async_args(req);
+	ra->ra_oc = oc;
+	ra->ra_cb = cb;
+	req->rq_interpret_reply = mdc_interpret_renew_capa;
+	ptlrpcd_add_req(req, PDL_POLICY_LOCAL, -1);
+	RETURN(0);
+}
+
+/**
+ * Connect hook of the MDC device.
+ *
+ * When the connect data requests OBD_CONNECT_MDS_MDS, the import is first
+ * marked with imp_server_timeout and its request portal is switched to
+ * MDS_MDS_PORTAL.  The connection itself is made by client_connect_import(),
+ * whose result is returned; \a localdata is not passed on.
+ */
+static int mdc_connect(const struct lu_env *env,
+		       struct obd_export **exp,
+		       struct obd_device *obd, struct obd_uuid *cluuid,
+		       struct obd_connect_data *data,
+		       void *localdata)
+{
+	struct obd_import *imp = obd->u.cli.cl_import;
+
+	/* mds-mds import features */
+	if (data && (data->ocd_connect_flags & OBD_CONNECT_MDS_MDS)) {
+		spin_lock(&imp->imp_lock);
+		imp->imp_server_timeout = 1;
+		spin_unlock(&imp->imp_lock);
+		/* the request portal is switched outside the import lock */
+		imp->imp_client->cli_request_portal = MDS_MDS_PORTAL;
+		CDEBUG(D_OTHER, "%s: Set 'mds' portal and timeout\n",
+		       obd->obd_name);
+	}
+
+	return client_connect_import(env, exp, obd, cluuid, data, NULL);
+}
+
+/* OBD method table of the MDC device type; registered by mdc_init(). */
+struct obd_ops mdc_obd_ops = {
+	.o_owner = THIS_MODULE,
+	.o_setup = mdc_setup,
+	.o_precleanup = mdc_precleanup,
+	.o_cleanup = mdc_cleanup,
+	.o_add_conn = client_import_add_conn,
+	.o_del_conn = client_import_del_conn,
+	.o_connect = mdc_connect,
+	.o_disconnect = client_disconnect_export,
+	.o_iocontrol = mdc_iocontrol,
+	.o_set_info_async = mdc_set_info_async,
+	.o_statfs = mdc_statfs,
+	.o_pin = mdc_pin,
+	.o_unpin = mdc_unpin,
+	.o_fid_init = client_fid_init,
+	.o_fid_fini = client_fid_fini,
+	.o_fid_alloc = mdc_fid_alloc,
+	.o_import_event = mdc_import_event,
+	.o_llog_init = mdc_llog_init,
+	.o_llog_finish = mdc_llog_finish,
+	.o_get_info = mdc_get_info,
+	.o_process_config = mdc_process_config,
+	.o_get_uuid = mdc_get_uuid,
+	.o_quotactl = mdc_quotactl,
+	.o_quotacheck = mdc_quotacheck,
+};
+
+/* Metadata method table of the MDC device type; registered by mdc_init(). */
+struct md_ops mdc_md_ops = {
+	.m_getstatus = mdc_getstatus,
+	.m_null_inode = mdc_null_inode,
+	.m_find_cbdata = mdc_find_cbdata,
+	.m_close = mdc_close,
+	.m_create = mdc_create,
+	.m_done_writing = mdc_done_writing,
+	.m_enqueue = mdc_enqueue,
+	.m_getattr = mdc_getattr,
+	.m_getattr_name = mdc_getattr_name,
+	.m_intent_lock = mdc_intent_lock,
+	.m_link = mdc_link,
+	.m_is_subdir = mdc_is_subdir,
+	.m_rename = mdc_rename,
+	.m_setattr = mdc_setattr,
+	.m_setxattr = mdc_setxattr,
+	.m_getxattr = mdc_getxattr,
+	.m_sync = mdc_sync,
+	.m_readpage = mdc_readpage,
+	.m_unlink = mdc_unlink,
+	.m_cancel_unused = mdc_cancel_unused,
+	.m_init_ea_size = mdc_init_ea_size,
+	.m_set_lock_data = mdc_set_lock_data,
+	.m_lock_match = mdc_lock_match,
+	.m_get_lustre_md = mdc_get_lustre_md,
+	.m_free_lustre_md = mdc_free_lustre_md,
+	.m_set_open_replay_data = mdc_set_open_replay_data,
+	.m_clear_open_replay_data = mdc_clear_open_replay_data,
+	.m_renew_capa = mdc_renew_capa,
+	.m_unpack_capa = mdc_unpack_capa,
+	.m_get_remote_perm = mdc_get_remote_perm,
+	.m_intent_getattr_async = mdc_intent_getattr_async,
+	.m_revalidate_lock = mdc_revalidate_lock,
+};
+
+/**
+ * Module init: register the MDC obd type (LUSTRE_MDC_NAME) together with
+ * its obd and md method tables and its module-level procfs variables.
+ *
+ * Returns the result of class_register_type().
+ */
+int __init mdc_init(void)
+{
+	int rc;
+	struct lprocfs_static_vars lvars = { 0 };
+	ENTRY;	/* pairs with the RETURN() below */
+
+	lprocfs_mdc_init_vars(&lvars);
+
+	rc = class_register_type(&mdc_obd_ops, &mdc_md_ops, lvars.module_vars,
+				 LUSTRE_MDC_NAME, NULL);
+	RETURN(rc);
+}
+
+/* Module exit: unregister the obd type registered by mdc_init().
+ * The __exit annotation is deliberately left commented out in the
+ * signature. */
+static void /*__exit*/ mdc_exit(void)
+{
+	class_unregister_type(LUSTRE_MDC_NAME);
+}
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Lustre Metadata Client");
+MODULE_LICENSE("GPL");
+
+module_init(mdc_init);
+module_exit(mdc_exit);

diff --git a/drivers/staging/lustre/lustre/mgc/Makefile b/drivers/staging/lustre/lustre/mgc/Makefile
new file mode 100644
index 0000000..2672463
--- /dev/null
+++ b/drivers/staging/lustre/lustre/mgc/Makefile

@@ -0,0 +1,5 @@
+obj-$(CONFIG_LUSTRE_FS) += mgc.o
+mgc-y := mgc_request.o lproc_mgc.o
+
+
+ccflags-y := -I$(src)/../include

diff --git a/drivers/staging/lustre/lustre/mgc/libmgc.c b/drivers/staging/lustre/lustre/mgc/libmgc.c
new file mode 100644
index 0000000..442146c
--- /dev/null
+++ b/drivers/staging/lustre/lustre/mgc/libmgc.c

@@ -0,0 +1,166 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/mgc/libmgc.c
+ *
+ * Lustre Management Client
+ * Author: Nathan Rutman <nathan@clusterfs.com>
+ */
+
+/* Minimal MGC for liblustre: only used to read the config log from the MGS
+   at setup time, no updates. */
+
+#define DEBUG_SUBSYSTEM S_MGC
+
+#include <liblustre.h>
+
+#include <obd_class.h>
+#include <lustre_dlm.h>
+#include <lustre_log.h>
+#include <lustre_fsfilt.h>
+#include <lustre_disk.h>
+
+
+/**
+ * Set up the minimal (liblustre) MGC device: take a ptlrpcd reference, run
+ * the generic client setup, force the null RPC security flavor and
+ * initialise llog.
+ *
+ * Returns 0 on success or a negative errno, in which case the client setup
+ * and the ptlrpcd reference are undone.
+ */
+static int mgc_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
+{
+	int rc;
+	ENTRY;
+
+	/* NOTE(review): the result of ptlrpcd_addref() is not examined here -
+	 * confirm whether it can fail */
+	ptlrpcd_addref();
+
+	rc = client_obd_setup(obd, lcfg);
+	if (rc)
+		GOTO(err_decref, rc);
+
+	/* liblustre only supports the null flavor towards the MGS */
+	obd->u.cli.cl_flvr_mgc.sf_rpc = SPTLRPC_FLVR_NULL;
+
+	rc = obd_llog_init(obd, &obd->obd_olg, obd, NULL);
+	if (rc) {
+		CERROR("failed to setup llogging subsystems\n");
+		GOTO(err_cleanup, rc);
+	}
+
+	RETURN(rc);
+
+	/* unwind in reverse order of acquisition */
+err_cleanup:
+	client_obd_cleanup(obd);
+err_decref:
+	ptlrpcd_decref();
+	RETURN(rc);
+}
+
+/**
+ * Pre-cleanup hook of the minimal MGC device: clean up the client import
+ * and shut down llog.  Stages not listed in the switch are ignored.
+ *
+ * Returns 0, or the error from obd_llog_finish().
+ */
+static int mgc_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage)
+{
+	int rc = 0;
+	ENTRY;
+
+	switch (stage) {
+	/* NOTE(review): OBD_CLEANUP_EARLY shares the body of
+	 * OBD_CLEANUP_EXPORTS, so the same teardown runs for both stages -
+	 * confirm this is intended */
+	case OBD_CLEANUP_EARLY:
+	case OBD_CLEANUP_EXPORTS:
+		obd_cleanup_client_import(obd);
+		rc = obd_llog_finish(obd, 0);
+		if (rc != 0)
+			CERROR("failed to cleanup llogging subsystems\n");
+		break;
+	}
+	RETURN(rc);
+}
+
+/**
+ * Final cleanup of the minimal MGC device: drop the ptlrpcd reference taken
+ * in mgc_setup() and run the generic client cleanup, whose result is
+ * returned.  Asserts that cl_mgc_vfsmnt is NULL.
+ */
+static int mgc_cleanup(struct obd_device *obd)
+{
+	struct client_obd *cli = &obd->u.cli;
+	int rc;
+	ENTRY;
+
+	LASSERT(cli->cl_mgc_vfsmnt == NULL);
+
+	ptlrpcd_decref();
+
+	rc = client_obd_cleanup(obd);
+	RETURN(rc);
+}
+
+/**
+ * Set up the LLOG_CONFIG_REPL_CTXT llog context of the MGC with
+ * llog_client_ops and connect its initiator.
+ *
+ * \a olg must be the device's own obd_olg; \a index is not used.
+ * Returns the (non-negative) result of llog_setup() on success, or its
+ * negative error.
+ */
+static int mgc_llog_init(struct obd_device *obd, struct obd_llog_group *olg,
+			 struct obd_device *tgt, int *index)
+{
+	struct llog_ctxt *ctxt;
+	int rc;
+	ENTRY;
+
+	LASSERT(olg == &obd->obd_olg);
+	rc = llog_setup(NULL, obd, olg, LLOG_CONFIG_REPL_CTXT, tgt,
+			&llog_client_ops);
+	if (rc < 0)
+		RETURN(rc);
+
+	/* take the context just set up, connect it, and drop it again */
+	ctxt = llog_group_get_ctxt(olg, LLOG_CONFIG_REPL_CTXT);
+	llog_initiator_connect(ctxt);
+	llog_ctxt_put(ctxt);
+
+	RETURN(rc);
+}
+
+/* Tear down the LLOG_CONFIG_REPL_CTXT context if the device has one.
+ * \a count is not used.  Always returns 0. */
+static int mgc_llog_finish(struct obd_device *obd, int count)
+{
+	struct llog_ctxt *config_ctxt;
+	ENTRY;
+
+	config_ctxt = llog_get_context(obd, LLOG_CONFIG_REPL_CTXT);
+	if (config_ctxt != NULL)
+		llog_cleanup(NULL, config_ctxt);
+
+	RETURN(0);
+}
+
+/* OBD method table of the minimal MGC device type; registered by
+ * mgc_init(). */
+struct obd_ops mgc_obd_ops = {
+	.o_owner = THIS_MODULE,
+	.o_setup = mgc_setup,
+	.o_precleanup = mgc_precleanup,
+	.o_cleanup = mgc_cleanup,
+	.o_add_conn = client_import_add_conn,
+	.o_del_conn = client_import_del_conn,
+	.o_connect = client_connect_import,
+	.o_disconnect = client_disconnect_export,
+	.o_llog_init = mgc_llog_init,
+	.o_llog_finish = mgc_llog_finish,
+};
+
+/* Register the MGC obd type (LUSTRE_MGC_NAME) with its method table only:
+ * no md ops and no procfs variables are supplied. */
+int __init mgc_init(void)
+{
+	int rc;
+
+	rc = class_register_type(&mgc_obd_ops, NULL, NULL,
+				 LUSTRE_MGC_NAME, NULL);
+	return rc;
+}

diff --git a/drivers/staging/lustre/lustre/mgc/lproc_mgc.c b/drivers/staging/lustre/lustre/mgc/lproc_mgc.c
new file mode 100644
index 0000000..1105eaa
--- /dev/null
+++ b/drivers/staging/lustre/lustre/mgc/lproc_mgc.c

@@ -0,0 +1,84 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/version.h>
+#include <linux/vfs.h>
+#include <obd_class.h>
+#include <lprocfs_status.h>
+#include "mgc_internal.h"
+
+#ifdef LPROCFS
+
+LPROC_SEQ_FOPS_RO_TYPE(mgc, uuid);
+LPROC_SEQ_FOPS_RO_TYPE(mgc, connect_flags);
+LPROC_SEQ_FOPS_RO_TYPE(mgc, server_uuid);
+LPROC_SEQ_FOPS_RO_TYPE(mgc, conn_uuid);
+LPROC_SEQ_FOPS_RO_TYPE(mgc, import);
+LPROC_SEQ_FOPS_RO_TYPE(mgc, state);
+
+LPROC_SEQ_FOPS_WR_ONLY(mgc, ping);
+
+/* seq_file show callback for "ir_state": forwards the seq_file and its
+ * private data to lprocfs_mgc_rd_ir_state() and returns its result. */
+static int mgc_ir_state_seq_show(struct seq_file *m, void *v)
+{
+	return lprocfs_mgc_rd_ir_state(m, m->private);
+}
+/* presumably generates mgc_ir_state_fops, referenced by the table below,
+ * from the show callback above - confirm against the macro definition */
+LPROC_SEQ_FOPS_RO(mgc_ir_state);
+
+/* Per-device procfs entries of the MGC; the list ends with a zeroed entry. */
+static struct lprocfs_vars lprocfs_mgc_obd_vars[] = {
+	{ "uuid", &mgc_uuid_fops, 0, 0 },
+	{ "ping", &mgc_ping_fops, 0, 0222 },
+	{ "connect_flags", &mgc_connect_flags_fops, 0, 0 },
+	{ "mgs_server_uuid", &mgc_server_uuid_fops, 0, 0 },
+	{ "mgs_conn_uuid", &mgc_conn_uuid_fops, 0, 0 },
+	{ "import", &mgc_import_fops, 0, 0 },
+	{ "state", &mgc_state_fops, 0, 0 },
+	{ "ir_state", &mgc_ir_state_fops, 0, 0 },
+	{ 0 }
+};
+
+/* presumably generates mgc_numrefs_fops used below - confirm against the
+ * macro definition */
+LPROC_SEQ_FOPS_RO_TYPE(mgc, numrefs);
+/* Module-level procfs entries of the MGC; the list ends with a zeroed
+ * entry. */
+static struct lprocfs_vars lprocfs_mgc_module_vars[] = {
+	{ "num_refs",	&mgc_numrefs_fops,       0, 0 },
+	{ 0 }
+};
+
+/* Point \a lvars at the MGC's per-device and module-level procfs tables. */
+void lprocfs_mgc_init_vars(struct lprocfs_static_vars *lvars)
+{
+	lvars->obd_vars = lprocfs_mgc_obd_vars;
+	lvars->module_vars = lprocfs_mgc_module_vars;
+}
+#endif /* LPROCFS */

diff --git a/drivers/staging/lustre/lustre/mgc/mgc_internal.h b/drivers/staging/lustre/lustre/mgc/mgc_internal.h
new file mode 100644
index 0000000..dbd6982
--- /dev/null
+++ b/drivers/staging/lustre/lustre/mgc/mgc_internal.h

@@ -0,0 +1,73 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _MGC_INTERNAL_H
+#define _MGC_INTERNAL_H
+
+#include <linux/libcfs/libcfs.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_lib.h>
+#include <lustre_dlm.h>
+#include <lustre_log.h>
+#include <lustre_export.h>
+
+#ifdef LPROCFS
+void lprocfs_mgc_init_vars(struct lprocfs_static_vars *lvars);
+int lprocfs_mgc_rd_ir_state(struct seq_file *m, void *data);
+#else
+/* !LPROCFS stub: no procfs tables exist, so hand back a zeroed descriptor.
+ * Marked inline like the other stub in this header, so that files which
+ * include the header without calling it get no unused-function warning. */
+static inline void lprocfs_mgc_init_vars(struct lprocfs_static_vars *lvars)
+{
+	memset(lvars, 0, sizeof(*lvars));
+}
+/* !LPROCFS stub: nothing is written to the seq_file; always returns 0. */
+static inline int lprocfs_mgc_rd_ir_state(struct seq_file *m, void *data)
+{
+	return 0;
+}
+#endif  /* LPROCFS */
+
+int mgc_process_log(struct obd_device *mgc, struct config_llog_data *cld);
+
+/* Return non-zero when the config log data is of type CONFIG_T_SPTLRPC. */
+static inline int cld_is_sptlrpc(struct config_llog_data *cld)
+{
+	return cld->cld_type == CONFIG_T_SPTLRPC;
+}
+
+/* Return non-zero when the config log data is of type CONFIG_T_RECOVER. */
+static inline int cld_is_recover(struct config_llog_data *cld)
+{
+	return cld->cld_type == CONFIG_T_RECOVER;
+}
+
+#endif  /* _MGC_INTERNAL_H */

diff --git a/drivers/staging/lustre/lustre/mgc/mgc_request.c b/drivers/staging/lustre/lustre/mgc/mgc_request.c
new file mode 100644
index 0000000..c6c84d9
--- /dev/null
+++ b/drivers/staging/lustre/lustre/mgc/mgc_request.c

@@ -0,0 +1,1860 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/mgc/mgc_request.c
+ *
+ * Author: Nathan Rutman <nathan@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_MGC
+#define D_MGC D_CONFIG /*|D_WARNING*/
+
+# include <linux/module.h>
+# include <linux/pagemap.h>
+# include <linux/miscdevice.h>
+# include <linux/init.h>
+
+#include <obd_class.h>
+#include <lustre_dlm.h>
+#include <lprocfs_status.h>
+#include <lustre_log.h>
+#include <lustre_fsfilt.h>
+#include <lustre_disk.h>
+#include "mgc_internal.h"
+
+/*
+ * Build the LDLM resource id used to lock a config log.
+ *
+ * name[0] of @res_id receives the first @len bytes of @name packed into a
+ * 64-bit word and converted with cpu_to_le64(); name[1] receives a value
+ * that depends on @type (0 for CONFIG_T_CONFIG and CONFIG_T_SPTLRPC, @type
+ * itself for CONFIG_T_RECOVER).  Any other @type is a bug.
+ *
+ * Returns 0 on success, -EINVAL when @len is outside 1..8.
+ */
+static int mgc_name2resid(char *name, int len, struct ldlm_res_id *res_id,
+			  int type)
+{
+	__u64 word = 0;
+
+	if (len <= 0) {
+		CERROR("missing name: %s\n", name);
+		return -EINVAL;
+	}
+	if (len > 8) {
+		CERROR("name too long: %s\n", name);
+		return -EINVAL;
+	}
+
+	/* Pack the name bytes into a single 64-bit word. */
+	memcpy(&word, name, len);
+
+	memset(res_id, 0, sizeof(*res_id));
+	/* Always use the same endianness for the resid */
+	res_id->name[0] = cpu_to_le64(word);
+
+	/* XXX: unfortunately, sptlrpc and config llog share one lock */
+	if (type == CONFIG_T_CONFIG || type == CONFIG_T_SPTLRPC)
+		word = 0;
+	else if (type == CONFIG_T_RECOVER)
+		word = type;
+	else
+		LBUG();
+	res_id->name[1] = cpu_to_le64(word);
+
+	CDEBUG(D_MGC, "log %s to resid "LPX64"/"LPX64" (%.8s)\n", name,
+	       res_id->name[0], res_id->name[1], (char *)&res_id->name[0]);
+	return 0;
+}
+
+/*
+ * Resource id for a filesystem name.  The whole of @fsname is packed, so it
+ * has to be 1..8 characters long; it may contain "-".
+ * e.g. "lustre", "SUN-000"
+ */
+int mgc_fsname2resid(char *fsname, struct ldlm_res_id *res_id, int type)
+{
+	int len = strlen(fsname);
+
+	return mgc_name2resid(fsname, len, res_id, type);
+}
+EXPORT_SYMBOL(mgc_fsname2resid);
+
+/*
+ * Resource id for a log name of the form "fsname-nodetype"
+ * (e.g. "lustre-MDT0001", "SUN-000-client").  Only the part before the
+ * last '-' is packed into the id; a log name without any '-' trips the
+ * assertion.
+ */
+int mgc_logname2resid(char *logname, struct ldlm_res_id *res_id, int type)
+{
+	char *name_end = strrchr(logname, '-');
+
+	LASSERT(name_end);
+	return mgc_name2resid(logname, name_end - logname, res_id, type);
+}
+
+/********************** config llog list **********************/
+static LIST_HEAD(config_llog_list);
+static DEFINE_SPINLOCK(config_list_lock);
+
+/* Take a reference to a config log.
+ * Increments cld_refcount and logs the new count; always returns 0. */
+static int config_log_get(struct config_llog_data *cld)
+{
+	ENTRY;
+	atomic_inc(&cld->cld_refcount);
+	CDEBUG(D_INFO, "log %s refs %d\n", cld->cld_logname,
+	       atomic_read(&cld->cld_refcount));
+	RETURN(0);
+}
+
+/* Drop a reference to a config log.  When the last reference goes away the
+ * log is unlinked from config_llog_list and freed, after dropping the
+ * references it holds on its recover and sptlrpc logs (recursively through
+ * this same function) and on the MGC export. */
+static void config_log_put(struct config_llog_data *cld)
+{
+	ENTRY;
+
+	CDEBUG(D_INFO, "log %s refs %d\n", cld->cld_logname,
+	       atomic_read(&cld->cld_refcount));
+	LASSERT(atomic_read(&cld->cld_refcount) > 0);
+
+	/* spinlock to make sure no item with 0 refcount in the list */
+	if (atomic_dec_and_lock(&cld->cld_refcount, &config_list_lock)) {
+		/* The count hit zero and config_list_lock is now held. */
+		list_del(&cld->cld_list_chain);
+		spin_unlock(&config_list_lock);
+
+		CDEBUG(D_MGC, "dropping config log %s\n", cld->cld_logname);
+
+		if (cld->cld_recover)
+			config_log_put(cld->cld_recover);
+		if (cld->cld_sptlrpc)
+			config_log_put(cld->cld_sptlrpc);
+		/* Pairs with sptlrpc_conf_log_start() in do_config_log_add() */
+		if (cld_is_sptlrpc(cld))
+			sptlrpc_conf_log_stop(cld->cld_logname);
+
+		/* Pairs with class_export_get() in do_config_log_add() */
+		class_export_put(cld->cld_mgcexp);
+		/* Same size expression as the OBD_ALLOC() that created it */
+		OBD_FREE(cld, sizeof(*cld) + strlen(cld->cld_logname) + 1);
+	}
+
+	EXIT;
+}
+
+/*
+ * Look up a config log by instance and name.
+ *
+ * An entry matches when its cfg_instance equals that of @cfg (NULL when
+ * @cfg is NULL) and its name equals @logname.  On a match a reference is
+ * taken on the entry before config_list_lock is released; the caller must
+ * drop it with config_log_put().  Returns NULL when nothing matches.
+ */
+static
+struct config_llog_data *config_log_find(char *logname,
+					 struct config_llog_instance *cfg)
+{
+	struct config_llog_data *iter;
+	struct config_llog_data *found = NULL;
+	void *wanted = NULL;
+	ENTRY;
+
+	LASSERT(logname != NULL);
+
+	if (cfg)
+		wanted = cfg->cfg_instance;
+
+	spin_lock(&config_list_lock);
+	list_for_each_entry(iter, &config_llog_list, cld_list_chain) {
+		/* instance may be NULL on both sides, so the name is
+		 * always compared as well */
+		if (iter->cld_cfg.cfg_instance == wanted &&
+		    strcmp(logname, iter->cld_logname) == 0) {
+			found = iter;
+			atomic_inc(&found->cld_refcount);
+			LASSERT(found->cld_stopping == 0 || cld_is_sptlrpc(found) == 0);
+			break;
+		}
+	}
+	spin_unlock(&config_list_lock);
+
+	RETURN(found);
+}
+
+/*
+ * Allocate and register a config log descriptor.
+ *
+ * The descriptor is created with one reference, copies @cfg when given
+ * (otherwise only the callback is set, to class_config_llog_handler), pins
+ * the MGC's self export and is linked into config_llog_list.  For
+ * CONFIG_T_SPTLRPC logs the log is also processed once immediately.
+ *
+ * Returns the new descriptor, ERR_PTR(-ENOMEM) on allocation failure, or
+ * ERR_PTR(rc) when mgc_logname2resid() rejects @logname.
+ */
+static
+struct config_llog_data *do_config_log_add(struct obd_device *obd,
+					   char *logname,
+					   int type,
+					   struct config_llog_instance *cfg,
+					   struct super_block *sb)
+{
+	struct config_llog_data *cld;
+	int		      rc;
+	ENTRY;
+
+	CDEBUG(D_MGC, "do adding config log %s:%p\n", logname,
+	       cfg ? cfg->cfg_instance : 0);
+
+	/* The log name is stored right behind the structure. */
+	OBD_ALLOC(cld, sizeof(*cld) + strlen(logname) + 1);
+	if (!cld)
+		RETURN(ERR_PTR(-ENOMEM));
+
+	strcpy(cld->cld_logname, logname);
+	if (cfg)
+		cld->cld_cfg = *cfg;
+	else
+		cld->cld_cfg.cfg_callback = class_config_llog_handler;
+	mutex_init(&cld->cld_lock);
+	/* These three fields are always reset, even when copied from @cfg. */
+	cld->cld_cfg.cfg_last_idx = 0;
+	cld->cld_cfg.cfg_flags = 0;
+	cld->cld_cfg.cfg_sb = sb;
+	cld->cld_type = type;
+	atomic_set(&cld->cld_refcount, 1);
+
+	/* Keep the mgc around until we are done */
+	cld->cld_mgcexp = class_export_get(obd->obd_self_export);
+
+	if (cld_is_sptlrpc(cld)) {
+		sptlrpc_conf_log_start(logname);
+		cld->cld_cfg.cfg_obdname = obd->obd_name;
+	}
+
+	rc = mgc_logname2resid(logname, &cld->cld_resid, type);
+
+	/* Link the entry before checking rc: config_log_put() below does a
+	 * list_del() on it when the last reference is dropped. */
+	spin_lock(&config_list_lock);
+	list_add(&cld->cld_list_chain, &config_llog_list);
+	spin_unlock(&config_list_lock);
+
+	if (rc) {
+		config_log_put(cld);
+		RETURN(ERR_PTR(rc));
+	}
+
+	if (cld_is_sptlrpc(cld)) {
+		/* A processing failure is only logged; the descriptor is
+		 * still returned to the caller. */
+		rc = mgc_process_log(obd, cld);
+		if (rc && rc != -ENOENT)
+			CERROR("failed processing sptlrpc log: %d\n", rc);
+	}
+
+	RETURN(cld);
+}
+
+/*
+ * Create the recovery log that accompanies a regular config log:
+ * "<fsname>-mdtir" when the superblock belongs to a server (MDT),
+ * "<fsname>-cliir" otherwise.
+ *
+ * Returns NULL when no recovery log is created (OSTs, and MDTs called with
+ * a non-NULL cfg_instance); otherwise the result of do_config_log_add(),
+ * which may be an ERR_PTR.
+ */
+static struct config_llog_data *config_recover_log_add(struct obd_device *obd,
+	char *fsname,
+	struct config_llog_instance *cfg,
+	struct super_block *sb)
+{
+	struct config_llog_instance lcfg = *cfg;
+	struct lustre_sb_info *lsi = s2lsi(sb);
+	const char *suffix;
+	char logname[32];
+
+	if (IS_OST(lsi))
+		return NULL;
+
+	/* for osp-on-ost, see lustre_start_osp() */
+	if (IS_MDT(lsi) && lcfg.cfg_instance)
+		return NULL;
+
+	/* we have to use different llog for clients and mdts for cmd
+	 * where only clients are notified if one of cmd server restarts */
+	LASSERT(strlen(fsname) < sizeof(logname) / 2);
+
+	if (IS_SERVER(lsi)) { /* mdt */
+		LASSERT(lcfg.cfg_instance == NULL);
+		lcfg.cfg_instance = sb;
+		suffix = "-mdtir";
+	} else {
+		LASSERT(lcfg.cfg_instance != NULL);
+		suffix = "-cliir";
+	}
+
+	strcpy(logname, fsname);
+	strcat(logname, suffix);
+
+	return do_config_log_add(obd, logname, CONFIG_T_RECOVER, &lcfg, sb);
+}
+
+
+/** Add this log to the list of active logs watched by an MGC.
+ * Active means we're watching for updates.
+ * We have one active log per "mount" - client instance or servername.
+ * Each instance may be at a different point in the log.
+ *
+ * Besides the regular log this finds or creates the shared
+ * "<fsname>-sptlrpc" log and, unless LMD_FLG_NOIR is set in the mount
+ * data, the recovery log; the regular log keeps a reference on both.
+ *
+ * Returns 0 on success, -EINVAL when \a logname has no '-' or its prefix
+ * is longer than 8 characters, or the error from creating one of the logs.
+ */
+static int config_log_add(struct obd_device *obd, char *logname,
+			  struct config_llog_instance *cfg,
+			  struct super_block *sb)
+{
+	struct lustre_sb_info *lsi = s2lsi(sb);
+	struct config_llog_data *cld;
+	struct config_llog_data *sptlrpc_cld;
+	char		     seclogname[32];
+	char		    *ptr;
+	ENTRY;
+
+	CDEBUG(D_MGC, "adding config log %s:%p\n", logname, cfg->cfg_instance);
+
+	/*
+	 * for each regular log, the depended sptlrpc log name is
+	 * <fsname>-sptlrpc. multiple regular logs may share one sptlrpc log.
+	 */
+	ptr = strrchr(logname, '-');
+	/* NOTE(review): the same "too long" message is printed when the
+	 * name simply has no '-' separator. */
+	if (ptr == NULL || ptr - logname > 8) {
+		CERROR("logname %s is too long\n", logname);
+		RETURN(-EINVAL);
+	}
+
+	/* seclogname = "<fsname>" + "-sptlrpc" */
+	memcpy(seclogname, logname, ptr - logname);
+	strcpy(seclogname + (ptr - logname), "-sptlrpc");
+
+	/* Either path leaves us holding one reference on sptlrpc_cld. */
+	sptlrpc_cld = config_log_find(seclogname, NULL);
+	if (sptlrpc_cld == NULL) {
+		sptlrpc_cld = do_config_log_add(obd, seclogname,
+						CONFIG_T_SPTLRPC, NULL, NULL);
+		if (IS_ERR(sptlrpc_cld)) {
+			CERROR("can't create sptlrpc log: %s\n", seclogname);
+			RETURN(PTR_ERR(sptlrpc_cld));
+		}
+	}
+
+	cld = do_config_log_add(obd, logname, CONFIG_T_CONFIG, cfg, sb);
+	if (IS_ERR(cld)) {
+		CERROR("can't create log: %s\n", logname);
+		config_log_put(sptlrpc_cld);
+		RETURN(PTR_ERR(cld));
+	}
+
+	/* The reference on sptlrpc_cld now belongs to cld. */
+	cld->cld_sptlrpc = sptlrpc_cld;
+
+	LASSERT(lsi->lsi_lmd);
+	if (!(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOIR)) {
+		struct config_llog_data *recover_cld;
+		/* Cut "-sptlrpc" off again so seclogname is the fsname. */
+		*strrchr(seclogname, '-') = 0;
+		recover_cld = config_recover_log_add(obd, seclogname, cfg, sb);
+		if (IS_ERR(recover_cld)) {
+			/* Dropping cld also drops its sptlrpc reference. */
+			config_log_put(cld);
+			RETURN(PTR_ERR(recover_cld));
+		}
+		/* May be NULL: not every target gets a recovery log. */
+		cld->cld_recover = recover_cld;
+	}
+
+	RETURN(0);
+}
+
+DEFINE_MUTEX(llog_process_lock);
+
+/** Stop watching for updates on this log.
+ *
+ * Marks the log (and its recovery log, if any) as stopping, detaches and
+ * releases the recovery and sptlrpc logs, then drops both the lookup
+ * reference and the reference taken when the log was started.
+ *
+ * Returns -ENOENT when no matching log exists, 0 otherwise.
+ */
+static int config_log_end(char *logname, struct config_llog_instance *cfg)
+{
+	struct config_llog_data *cld;
+	struct config_llog_data *cld_sptlrpc = NULL;
+	struct config_llog_data *cld_recover = NULL;
+	int rc = 0;
+	ENTRY;
+
+	cld = config_log_find(logname, cfg);
+	if (cld == NULL)
+		RETURN(-ENOENT);
+
+	mutex_lock(&cld->cld_lock);
+	/*
+	 * if cld_stopping is set, it means we didn't start the log thus
+	 * not owning the start ref. this can happen after previous umount:
+	 * the cld still hanging there waiting for lock cancel, and we
+	 * remount again but failed in the middle and call log_end without
+	 * calling start_log.
+	 */
+	if (unlikely(cld->cld_stopping)) {
+		mutex_unlock(&cld->cld_lock);
+		/* drop the ref from the find */
+		config_log_put(cld);
+		RETURN(rc);
+	}
+
+	cld->cld_stopping = 1;
+
+	/* Detach the recovery log under cld_lock; it is released below. */
+	cld_recover = cld->cld_recover;
+	cld->cld_recover = NULL;
+	mutex_unlock(&cld->cld_lock);
+
+	if (cld_recover) {
+		mutex_lock(&cld_recover->cld_lock);
+		cld_recover->cld_stopping = 1;
+		mutex_unlock(&cld_recover->cld_lock);
+		config_log_put(cld_recover);
+	}
+
+	/* The sptlrpc pointer is detached under the list lock instead. */
+	spin_lock(&config_list_lock);
+	cld_sptlrpc = cld->cld_sptlrpc;
+	cld->cld_sptlrpc = NULL;
+	spin_unlock(&config_list_lock);
+
+	if (cld_sptlrpc)
+		config_log_put(cld_sptlrpc);
+
+	/* drop the ref from the find */
+	config_log_put(cld);
+	/* drop the start ref */
+	config_log_put(cld);
+
+	CDEBUG(D_MGC, "end config log %s (%d)\n", logname ? logname : "client",
+	       rc);
+	RETURN(rc);
+}
+
+/*
+ * seq_file show routine reporting the imperative recovery state.
+ * @data is the MGC obd_device.  Prints whether the import's connect data
+ * has the IMP_RECOV flag, followed by one line per config log that has a
+ * recovery log attached, with that recovery log's cfg_last_idx.
+ * Always returns 0.
+ */
+int lprocfs_mgc_rd_ir_state(struct seq_file *m, void *data)
+{
+	struct obd_device       *obd = data;
+	struct obd_import       *imp = obd->u.cli.cl_import;
+	struct obd_connect_data *ocd = &imp->imp_connect_data;
+	struct config_llog_data *cld;
+	ENTRY;
+
+	seq_printf(m, "imperative_recovery: %s\n",
+		      OCD_HAS_FLAG(ocd, IMP_RECOV) ? "ENABLED" : "DISABLED");
+	seq_printf(m, "client_state:\n");
+
+	/* NOTE(review): config_log_end() clears cld_recover under cld_lock,
+	 * not under config_list_lock - confirm this walk cannot race. */
+	spin_lock(&config_list_lock);
+	list_for_each_entry(cld, &config_llog_list, cld_list_chain) {
+		if (cld->cld_recover == NULL)
+			continue;
+		seq_printf(m,  "    - { client: %s, nidtbl_version: %u }\n",
+			       cld->cld_logname,
+			       cld->cld_recover->cld_cfg.cfg_last_idx);
+	}
+	spin_unlock(&config_list_lock);
+
+	RETURN(0);
+}
+
+/* reenqueue any lost locks */
+/* Bits of rq_state; rq_state is modified under config_list_lock. */
+#define RQ_RUNNING 0x1	/* set while mgc_requeue_thread() is running */
+#define RQ_NOW     0x2	/* a requeue was requested; wakes the thread */
+#define RQ_LATER   0x4	/* cleared by the thread each round */
+#define RQ_STOP    0x8	/* ask the thread to exit */
+static int		    rq_state = 0;
+/* The requeue thread sleeps here; woken on RQ_NOW / RQ_STOP. */
+static wait_queue_head_t	    rq_waitq;
+/* Completed by the requeue thread just before it returns. */
+static DECLARE_COMPLETION(rq_exit);
+
+/*
+ * Re-process one config log on behalf of the requeue thread, unless the
+ * MGC that owns it has no connections left.
+ */
+static void do_requeue(struct config_llog_data *cld)
+{
+	struct obd_device *mgc;
+	ENTRY;
+
+	LASSERT(atomic_read(&cld->cld_refcount) > 0);
+	mgc = cld->cld_mgcexp->exp_obd;
+
+	/* Do not run mgc_process_log on a disconnected export or an
+	   export which is being disconnected. Take the client
+	   semaphore to make the check non-racy. */
+	down_read(&mgc->u.cli.cl_sem);
+	if (mgc->u.cli.cl_conn_count == 0) {
+		CDEBUG(D_MGC, "disconnecting, won't update log %s\n",
+		       cld->cld_logname);
+	} else {
+		CDEBUG(D_MGC, "updating log %s\n", cld->cld_logname);
+		mgc_process_log(mgc, cld);
+	}
+	up_read(&mgc->u.cli.cl_sem);
+
+	EXIT;
+}
+
+/* this timeout represents how many seconds MGC should wait before
+ * requeue config and recover lock to the MGS. We need to randomize this
+ * in order to not flood the MGS.
+ */
+#define MGC_TIMEOUT_MIN_SECONDS   5
+#define MGC_TIMEOUT_RAND_CENTISEC 0x1ff /* ~500 */
+
+/*
+ * Body of the requeue thread.
+ *
+ * Each round it waits MGC_TIMEOUT_MIN_SECONDS plus a random number of
+ * centiseconds (or until RQ_STOP), then walks config_llog_list and, for
+ * every log flagged cld_lostlock, clears the flag, re-processes the log
+ * (skipped when stopping) and drops the reference taken by
+ * mgc_requeue_add().  It then sleeps until RQ_NOW or RQ_STOP is set.
+ * On RQ_STOP it clears RQ_RUNNING and completes rq_exit.
+ * @data is unused; the return value is always 0.
+ */
+static int mgc_requeue_thread(void *data)
+{
+	int rc = 0;
+	ENTRY;
+
+	CDEBUG(D_MGC, "Starting requeue thread\n");
+
+	/* Keep trying failed locks periodically */
+	spin_lock(&config_list_lock);
+	rq_state |= RQ_RUNNING;
+	/* config_list_lock is held at the top of every iteration. */
+	while (1) {
+		struct l_wait_info lwi;
+		struct config_llog_data *cld, *cld_prev;
+		int rand = cfs_rand() & MGC_TIMEOUT_RAND_CENTISEC;
+		/* Sampled once per round, before the timed wait below. */
+		int stopped = !!(rq_state & RQ_STOP);
+		int to;
+
+		/* Any new or requeued lostlocks will change the state */
+		rq_state &= ~(RQ_NOW | RQ_LATER);
+		spin_unlock(&config_list_lock);
+
+		/* Always wait a few seconds to allow the server who
+		   caused the lock revocation to finish its setup, plus some
+		   random so everyone doesn't try to reconnect at once. */
+		to = MGC_TIMEOUT_MIN_SECONDS * HZ;
+		to += rand * HZ / 100; /* rand is centi-seconds */
+		lwi = LWI_TIMEOUT(to, NULL, NULL);
+		l_wait_event(rq_waitq, rq_state & RQ_STOP, &lwi);
+
+		/*
+		 * iterate & processing through the list. for each cld, process
+		 * its depending sptlrpc cld firstly (if any) and then itself.
+		 *
+		 * it's guaranteed any item in the list must have
+		 * reference > 0; and if cld_lostlock is set, at
+		 * least one reference is taken by the previous enqueue.
+		 */
+		cld_prev = NULL;
+
+		spin_lock(&config_list_lock);
+		list_for_each_entry(cld, &config_llog_list,
+					cld_list_chain) {
+			if (!cld->cld_lostlock)
+				continue;
+
+			spin_unlock(&config_list_lock);
+
+			LASSERT(atomic_read(&cld->cld_refcount) > 0);
+
+			/* Whether we enqueued again or not in mgc_process_log,
+			 * we're done with the ref from the old enqueue */
+			/* The put is deferred by one entry: the current cld
+			 * keeps its reference until the iterator has moved
+			 * past it. */
+			if (cld_prev)
+				config_log_put(cld_prev);
+			cld_prev = cld;
+
+			cld->cld_lostlock = 0;
+			if (likely(!stopped))
+				do_requeue(cld);
+
+			spin_lock(&config_list_lock);
+		}
+		spin_unlock(&config_list_lock);
+		/* Release the reference of the last entry processed. */
+		if (cld_prev)
+			config_log_put(cld_prev);
+
+		/* break after scanning the list so that we can drop
+		 * refcount to losing lock clds */
+		if (unlikely(stopped)) {
+			/* Retake the lock: the code after the loop expects
+			 * config_list_lock to be held. */
+			spin_lock(&config_list_lock);
+			break;
+		}
+
+		/* Wait a bit to see if anyone else needs a requeue */
+		lwi = (struct l_wait_info) { 0 };
+		l_wait_event(rq_waitq, rq_state & (RQ_NOW | RQ_STOP),
+			     &lwi);
+		spin_lock(&config_list_lock);
+	}
+	/* spinlock and while guarantee RQ_NOW and RQ_LATER are not set */
+	rq_state &= ~RQ_RUNNING;
+	spin_unlock(&config_list_lock);
+
+	/* mgc_precleanup() waits on this completion. */
+	complete(&rq_exit);
+
+	CDEBUG(D_MGC, "Ending requeue thread\n");
+	RETURN(rc);
+}
+
+/* Flag a cld for requeueing and wake the requeue thread.
+   Nothing is done when the log is stopping or is already flagged.
+   Otherwise a reference is taken for the requeue thread; if the thread has
+   been told to stop, the flag is cleared and that reference is dropped
+   here instead. */
+static void mgc_requeue_add(struct config_llog_data *cld)
+{
+	ENTRY;
+
+	CDEBUG(D_INFO, "log %s: requeue (r=%d sp=%d st=%x)\n",
+	       cld->cld_logname, atomic_read(&cld->cld_refcount),
+	       cld->cld_stopping, rq_state);
+	LASSERT(atomic_read(&cld->cld_refcount) > 0);
+
+	mutex_lock(&cld->cld_lock);
+	if (cld->cld_stopping || cld->cld_lostlock) {
+		mutex_unlock(&cld->cld_lock);
+		RETURN_EXIT;
+	}
+	/* this refcount will be released in mgc_requeue_thread. */
+	config_log_get(cld);
+	cld->cld_lostlock = 1;
+	mutex_unlock(&cld->cld_lock);
+
+	/* Hold lock for rq_state */
+	spin_lock(&config_list_lock);
+	if (rq_state & RQ_STOP) {
+		spin_unlock(&config_list_lock);
+		/* NOTE(review): cld_lostlock is cleared here without
+		 * cld_lock held, unlike where it was set above. */
+		cld->cld_lostlock = 0;
+		config_log_put(cld);
+	} else {
+		rq_state |= RQ_NOW;
+		spin_unlock(&config_list_lock);
+		wake_up(&rq_waitq);
+	}
+	EXIT;
+}
+
+/********************** class fns **********************/
+
+/*
+ * Attach the MGC to the backing filesystem mounted at @mnt.
+ *
+ * Takes cl_mgc_sem, looks up the fsfilt operations for the superblock's
+ * fstype, sets up the lvfs run context and looks up the
+ * MOUNT_CONFIGS_DIR dentry.  On success cl_mgc_sem stays held and an obd
+ * reference ("mgc_fs") is taken; both are released by mgc_fs_cleanup().
+ *
+ * Returns 0 on success or a negative errno; on failure the semaphore is
+ * released and obd_fsops / cl_mgc_vfsmnt are left NULL.
+ */
+static int mgc_fs_setup(struct obd_device *obd, struct super_block *sb,
+			struct vfsmount *mnt)
+{
+	struct lvfs_run_ctxt saved;
+	struct lustre_sb_info *lsi = s2lsi(sb);
+	struct client_obd *cli = &obd->u.cli;
+	struct dentry *dentry;
+	char *label;
+	int err = 0;
+	ENTRY;
+
+	LASSERT(lsi);
+	LASSERT(lsi->lsi_srv_mnt == mnt);
+
+	/* The mgc fs exclusion sem. Only one fs can be setup at a time. */
+	down(&cli->cl_mgc_sem);
+
+	cfs_cleanup_group_info();
+
+	obd->obd_fsops = fsfilt_get_ops(lsi->lsi_fstype);
+	if (IS_ERR(obd->obd_fsops)) {
+		err = PTR_ERR(obd->obd_fsops);
+		/* Do not leave an ERR_PTR behind: mgc_fs_cleanup() only
+		 * tests obd_fsops against NULL. */
+		obd->obd_fsops = NULL;
+		up(&cli->cl_mgc_sem);
+		/* obd name first, then the fstype, as the format says */
+		CERROR("%s: No fstype %s: rc = %d\n", obd->obd_name,
+		       lsi->lsi_fstype, err);
+		RETURN(err);
+	}
+
+	cli->cl_mgc_vfsmnt = mnt;
+	err = fsfilt_setup(obd, mnt->mnt_sb);
+	if (err)
+		GOTO(err_ops, err);
+
+	OBD_SET_CTXT_MAGIC(&obd->obd_lvfs_ctxt);
+	obd->obd_lvfs_ctxt.pwdmnt = mnt;
+	obd->obd_lvfs_ctxt.pwd = mnt->mnt_root;
+	obd->obd_lvfs_ctxt.fs = get_ds();
+
+	/* The lookup runs inside the run context set up just above. */
+	push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+	dentry = ll_lookup_one_len(MOUNT_CONFIGS_DIR, cfs_fs_pwd(current->fs),
+				   strlen(MOUNT_CONFIGS_DIR));
+	pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+	if (IS_ERR(dentry)) {
+		err = PTR_ERR(dentry);
+		CERROR("cannot lookup %s directory: rc = %d\n",
+		       MOUNT_CONFIGS_DIR, err);
+		GOTO(err_ops, err);
+	}
+	cli->cl_mgc_configs_dir = dentry;
+
+	/* We take an obd ref to ensure that we can't get to mgc_cleanup
+	   without calling mgc_fs_cleanup first. */
+	class_incref(obd, "mgc_fs", obd);
+
+	label = fsfilt_get_label(obd, mnt->mnt_sb);
+	if (label)
+		CDEBUG(D_MGC, "MGC using disk labelled=%s\n", label);
+
+	/* We keep the cl_mgc_sem until mgc_fs_cleanup */
+	RETURN(0);
+
+err_ops:
+	fsfilt_put_ops(obd->obd_fsops);
+	obd->obd_fsops = NULL;
+	cli->cl_mgc_vfsmnt = NULL;
+	up(&cli->cl_mgc_sem);
+	RETURN(err);
+}
+
+/*
+ * Undo mgc_fs_setup(): release the configs directory dentry and the
+ * "mgc_fs" obd reference, put the fsfilt operations and release
+ * cl_mgc_sem.  Asserts that a filesystem is currently set up.
+ * Always returns 0.
+ */
+static int mgc_fs_cleanup(struct obd_device *obd)
+{
+	struct client_obd *cli = &obd->u.cli;
+	int rc = 0;
+	ENTRY;
+
+	LASSERT(cli->cl_mgc_vfsmnt != NULL);
+
+	if (cli->cl_mgc_configs_dir != NULL) {
+		struct lvfs_run_ctxt saved;
+		push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+		l_dput(cli->cl_mgc_configs_dir);
+		cli->cl_mgc_configs_dir = NULL;
+		pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+		/* Pairs with class_incref() in mgc_fs_setup() */
+		class_decref(obd, "mgc_fs", obd);
+	}
+
+	cli->cl_mgc_vfsmnt = NULL;
+	/* NOTE(review): obd_fsops is not reset to NULL after the put,
+	 * unlike the error path of mgc_fs_setup(). */
+	if (obd->obd_fsops)
+		fsfilt_put_ops(obd->obd_fsops);
+
+	/* Taken in mgc_fs_setup() and held since then. */
+	up(&cli->cl_mgc_sem);
+
+	RETURN(rc);
+}
+
+/* Counts MGCs: incremented in mgc_setup(), decremented in
+ * mgc_precleanup().  The requeue thread is started on 0 -> 1 and stopped
+ * when the count returns to 0. */
+static atomic_t mgc_count = ATOMIC_INIT(0);
+/*
+ * Pre-cleanup hook.  Nothing is done for OBD_CLEANUP_EARLY.  For
+ * OBD_CLEANUP_EXPORTS the requeue thread is stopped and waited for when
+ * this was the last MGC, then the client import and the llog contexts are
+ * cleaned up.  Returns the result of obd_llog_finish(), or 0.
+ */
+static int mgc_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage)
+{
+	int rc = 0;
+	ENTRY;
+
+	switch (stage) {
+	case OBD_CLEANUP_EARLY:
+		break;
+	case OBD_CLEANUP_EXPORTS:
+		if (atomic_dec_and_test(&mgc_count)) {
+			int running;
+			/* stop requeue thread */
+			spin_lock(&config_list_lock);
+			running = rq_state & RQ_RUNNING;
+			if (running)
+				rq_state |= RQ_STOP;
+			spin_unlock(&config_list_lock);
+			/* Only wait when a thread was actually running,
+			 * otherwise nobody would complete rq_exit. */
+			if (running) {
+				wake_up(&rq_waitq);
+				wait_for_completion(&rq_exit);
+			}
+		}
+		obd_cleanup_client_import(obd);
+		rc = obd_llog_finish(obd, 0);
+		if (rc != 0)
+			CERROR("failed to cleanup llogging subsystems\n");
+		break;
+	}
+	RETURN(rc);
+}
+
+/*
+ * Final cleanup of an MGC device: removes its proc entries, drops the
+ * ptlrpcd reference taken in mgc_setup() and cleans up the client obd.
+ * Asserts that mgc_fs_cleanup() has already run (cl_mgc_vfsmnt is NULL).
+ * Returns the result of client_obd_cleanup().
+ */
+static int mgc_cleanup(struct obd_device *obd)
+{
+	struct client_obd *cli = &obd->u.cli;
+	int rc;
+	ENTRY;
+
+	LASSERT(cli->cl_mgc_vfsmnt == NULL);
+
+	/* COMPAT_146 - old config logs may have added profiles we don't
+	   know about */
+	if (obd->obd_type->typ_refcnt <= 1)
+		/* Only for the last mgc */
+		class_del_profiles();
+
+	lprocfs_obd_cleanup(obd);
+	ptlrpcd_decref();
+
+	rc = client_obd_cleanup(obd);
+	RETURN(rc);
+}
+
+/*
+ * Set up an MGC device: take a ptlrpcd reference, set up the client obd,
+ * the llog contexts and the proc entries, and start the requeue thread
+ * when this is the first MGC.
+ *
+ * Returns 0 on success or a negative errno.  On failure the client obd is
+ * cleaned up (when it had been set up) and the ptlrpcd reference dropped.
+ * NOTE(review): the error path does not undo obd_llog_init() or
+ * lprocfs_obd_setup(); confirm whether the caller is expected to.
+ */
+static int mgc_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
+{
+	struct lprocfs_static_vars lvars;
+	struct task_struct *task;
+	int rc;
+	ENTRY;
+
+	ptlrpcd_addref();
+
+	rc = client_obd_setup(obd, lcfg);
+	if (rc)
+		GOTO(err_decref, rc);
+
+	rc = obd_llog_init(obd, &obd->obd_olg, obd, NULL);
+	if (rc) {
+		CERROR("failed to setup llogging subsystems\n");
+		GOTO(err_cleanup, rc);
+	}
+
+	lprocfs_mgc_init_vars(&lvars);
+	lprocfs_obd_setup(obd, lvars.obd_vars);
+	sptlrpc_lprocfs_cliobd_attach(obd);
+
+	if (atomic_inc_return(&mgc_count) == 1) {
+		rq_state = 0;
+		init_waitqueue_head(&rq_waitq);
+
+		/* start requeue thread */
+		/* Test the pointer itself with IS_ERR(): squeezing a valid
+		 * task_struct pointer into an int first can make it look
+		 * like an error value on 64-bit. */
+		task = kthread_run(mgc_requeue_thread, NULL,
+				   "ll_cfg_requeue");
+		if (IS_ERR(task)) {
+			rc = PTR_ERR(task);
+			CERROR("%s: Cannot start requeue thread (%d),"
+			       "no more log updates!\n",
+			       obd->obd_name, rc);
+			/* Undo the increment so that a later setup sees
+			 * the 0 -> 1 transition and retries the thread. */
+			atomic_dec(&mgc_count);
+			GOTO(err_cleanup, rc);
+		}
+	}
+
+	RETURN(0);
+
+err_cleanup:
+	client_obd_cleanup(obd);
+err_decref:
+	ptlrpcd_decref();
+	RETURN(rc);
+}
+
+/* based on ll_mdc_blocking_ast */
+/*
+ * LDLM callback for config locks.  @data is the config_llog_data the lock
+ * was enqueued for (may be NULL).
+ *
+ * LDLM_CB_BLOCKING:  cancel the lock asynchronously; returns the result of
+ *                    ldlm_cli_cancel().
+ * LDLM_CB_CANCELING: schedule the log for requeueing unless it is stopping
+ *                    or the MGC is disconnecting, then drop the reference
+ *                    that was held for this lock.
+ * Any other @flag is a bug.
+ */
+static int mgc_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
+			    void *data, int flag)
+{
+	struct lustre_handle lockh;
+	struct config_llog_data *cld = (struct config_llog_data *)data;
+	int rc = 0;
+	ENTRY;
+
+	switch (flag) {
+	case LDLM_CB_BLOCKING:
+		/* mgs wants the lock, give it up... */
+		LDLM_DEBUG(lock, "MGC blocking CB");
+		ldlm_lock2handle(lock, &lockh);
+		rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
+		break;
+	case LDLM_CB_CANCELING:
+		/* We've given up the lock, prepare ourselves to update. */
+		LDLM_DEBUG(lock, "MGC cancel CB");
+
+		CDEBUG(D_MGC, "Lock res "LPX64" (%.8s)\n",
+		       lock->l_resource->lr_name.name[0],
+		       (char *)&lock->l_resource->lr_name.name[0]);
+
+		if (!cld) {
+			CDEBUG(D_INFO, "missing data, won't requeue\n");
+			break;
+		}
+
+		/* held at mgc_process_log(). */
+		LASSERT(atomic_read(&cld->cld_refcount) > 0);
+		/* Are we done with this log? */
+		if (cld->cld_stopping) {
+			CDEBUG(D_MGC, "log %s: stopping, won't requeue\n",
+			       cld->cld_logname);
+			config_log_put(cld);
+			break;
+		}
+		/* Make sure not to re-enqueue when the mgc is stopping
+		   (we get called from client_disconnect_export) */
+		if (!lock->l_conn_export ||
+		    !lock->l_conn_export->exp_obd->u.cli.cl_conn_count) {
+			CDEBUG(D_MGC, "log %.8s: disconnecting, won't requeue\n",
+			       cld->cld_logname);
+			config_log_put(cld);
+			break;
+		}
+
+		/* Re-enqueue now */
+		/* mgc_requeue_add() takes its own reference when it queues
+		 * the log, so the lock's reference is dropped here. */
+		mgc_requeue_add(cld);
+		config_log_put(cld);
+		break;
+	default:
+		LBUG();
+	}
+
+	RETURN(rc);
+}
+
+/* Not sure where this should go... */
+#define  MGC_ENQUEUE_LIMIT 50
+#define  MGC_TARGET_REG_LIMIT 10
+#define  MGC_SEND_PARAM_LIMIT 10
+
+/*
+ * Send a parameter to the MGS.
+ *
+ * @msp is copied into an MGS_SET_INFO request; on success it is
+ * overwritten with the mgs_send_param carried in the reply.
+ *
+ * Returns 0 on success, -ENOMEM when the request or its buffer cannot be
+ * obtained, -EPROTO when the reply carries no parameter buffer, or the
+ * error from ptlrpc_queue_wait().
+ */
+static int mgc_set_mgs_param(struct obd_export *exp,
+			     struct mgs_send_param *msp)
+{
+	struct ptlrpc_request *req;
+	struct mgs_send_param *req_msp, *rep_msp;
+	int rc;
+	ENTRY;
+
+	req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp),
+					&RQF_MGS_SET_INFO, LUSTRE_MGS_VERSION,
+					MGS_SET_INFO);
+	if (!req)
+		RETURN(-ENOMEM);
+
+	req_msp = req_capsule_client_get(&req->rq_pill, &RMF_MGS_SEND_PARAM);
+	if (!req_msp) {
+		ptlrpc_req_finished(req);
+		RETURN(-ENOMEM);
+	}
+
+	memcpy(req_msp, msp, sizeof(*req_msp));
+	ptlrpc_request_set_replen(req);
+
+	/* Limit how long we will wait for the enqueue to complete */
+	req->rq_delay_limit = MGC_SEND_PARAM_LIMIT;
+	rc = ptlrpc_queue_wait(req);
+	if (!rc) {
+		rep_msp = req_capsule_server_get(&req->rq_pill,
+						 &RMF_MGS_SEND_PARAM);
+		/* Check the reply buffer just like the request buffer
+		 * above instead of handing a NULL pointer to memcpy(). */
+		if (rep_msp)
+			memcpy(msp, rep_msp, sizeof(*rep_msp));
+		else
+			rc = -EPROTO;
+	}
+
+	ptlrpc_req_finished(req);
+
+	RETURN(rc);
+}
+
+/* Take a config lock so we can get cancel notifications */
+/*
+ * @data is the config_llog_data to lock; its cld_resid names the resource.
+ * Only @exp, @type, @mode, @flags, @data and @lockh are used by the body:
+ * the blocking callback is always mgc_blocking_ast and the completion
+ * callback ldlm_completion_ast, whatever the caller passes.
+ * Returns -ENOMEM when the request cannot be allocated, otherwise the
+ * result of ldlm_cli_enqueue().
+ */
+static int mgc_enqueue(struct obd_export *exp, struct lov_stripe_md *lsm,
+		       __u32 type, ldlm_policy_data_t *policy, __u32 mode,
+		       __u64 *flags, void *bl_cb, void *cp_cb, void *gl_cb,
+		       void *data, __u32 lvb_len, void *lvb_swabber,
+		       struct lustre_handle *lockh)
+{
+	struct config_llog_data *cld = (struct config_llog_data *)data;
+	struct ldlm_enqueue_info einfo = { type, mode, mgc_blocking_ast,
+			 ldlm_completion_ast, NULL, NULL, NULL };
+	struct ptlrpc_request *req;
+	/* sptlrpc logs always use the short delay limit */
+	int short_limit = cld_is_sptlrpc(cld);
+	int rc;
+	ENTRY;
+
+	CDEBUG(D_MGC, "Enqueue for %s (res "LPX64")\n", cld->cld_logname,
+	       cld->cld_resid.name[0]);
+
+	/* We need a callback for every lockholder, so don't try to
+	   ldlm_lock_match (see rev 1.1.2.11.2.47) */
+	req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp),
+					&RQF_LDLM_ENQUEUE, LUSTRE_DLM_VERSION,
+					LDLM_ENQUEUE);
+	if (req == NULL)
+		RETURN(-ENOMEM);
+
+	/* No LVB is expected in the reply. */
+	req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, 0);
+	ptlrpc_request_set_replen(req);
+
+	/* check if this is server or client */
+	if (cld->cld_cfg.cfg_sb) {
+		struct lustre_sb_info *lsi = s2lsi(cld->cld_cfg.cfg_sb);
+		if (lsi && IS_SERVER(lsi))
+			short_limit = 1;
+	}
+	/* Limit how long we will wait for the enqueue to complete */
+	req->rq_delay_limit = short_limit ? 5 : MGC_ENQUEUE_LIMIT;
+	rc = ldlm_cli_enqueue(exp, &req, &einfo, &cld->cld_resid, NULL, flags,
+			      NULL, 0, LVB_T_NONE, lockh, 0);
+	/* A failed enqueue should still call the mgc_blocking_ast,
+	   where it will be requeued if needed ("grant failed"). */
+	ptlrpc_req_finished(req);
+	RETURN(rc);
+}
+
+/* Release one @mode reference on the lock behind @lockh.  @exp and @md are
+ * not used.  Always returns 0. */
+static int mgc_cancel(struct obd_export *exp, struct lov_stripe_md *md,
+		      __u32 mode, struct lustre_handle *lockh)
+{
+	ENTRY;
+
+	ldlm_lock_decref(lockh, mode);
+
+	RETURN(0);
+}
+
+/* Called when the import becomes active again (see mgc_import_event()):
+ * sets RQ_NOW and wakes the requeue thread.  The argument is not used. */
+static void mgc_notify_active(struct obd_device *unused)
+{
+	/* wakeup mgc_requeue_thread to requeue mgc lock */
+	spin_lock(&config_list_lock);
+	rq_state |= RQ_NOW;
+	spin_unlock(&config_list_lock);
+	wake_up(&rq_waitq);
+
+	/* TODO: Help the MGS rebuild nidtbl. -jay */
+}
+
+/*
+ * Send a target registration (MGS_TARGET_REG) to the MGS.
+ *
+ * @mti is copied into the request; on success it is overwritten with the
+ * mgs_target_info carried in the reply.
+ *
+ * Returns 0 on success, -ENOMEM when the request or its buffer cannot be
+ * obtained, -EPROTO when the reply carries no target info, or the error
+ * from ptlrpc_queue_wait().
+ */
+static int mgc_target_register(struct obd_export *exp,
+			       struct mgs_target_info *mti)
+{
+	struct ptlrpc_request *req;
+	struct mgs_target_info *req_mti, *rep_mti;
+	int rc;
+	ENTRY;
+
+	req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp),
+					&RQF_MGS_TARGET_REG, LUSTRE_MGS_VERSION,
+					MGS_TARGET_REG);
+	if (req == NULL)
+		RETURN(-ENOMEM);
+
+	req_mti = req_capsule_client_get(&req->rq_pill, &RMF_MGS_TARGET_INFO);
+	if (!req_mti) {
+		ptlrpc_req_finished(req);
+		RETURN(-ENOMEM);
+	}
+
+	memcpy(req_mti, mti, sizeof(*req_mti));
+	ptlrpc_request_set_replen(req);
+	CDEBUG(D_MGC, "register %s\n", mti->mti_svname);
+	/* Limit how long we will wait for the enqueue to complete */
+	req->rq_delay_limit = MGC_TARGET_REG_LIMIT;
+
+	rc = ptlrpc_queue_wait(req);
+	if (!rc) {
+		rep_mti = req_capsule_server_get(&req->rq_pill,
+						 &RMF_MGS_TARGET_INFO);
+		/* Check the reply buffer just like the request buffer
+		 * above instead of handing a NULL pointer to memcpy(). */
+		if (rep_mti) {
+			memcpy(mti, rep_mti, sizeof(*rep_mti));
+			CDEBUG(D_MGC, "register %s got index = %d\n",
+			       mti->mti_svname, mti->mti_stripe_index);
+		} else {
+			rc = -EPROTO;
+		}
+	}
+	ptlrpc_req_finished(req);
+
+	RETURN(rc);
+}
+
+/*
+ * Key/value control interface of the MGC.  Handled keys:
+ *
+ *  KEY_INIT_RECOV_BACKUP  val is an int; reconnects the import when it is
+ *                         neither FULL nor NEW, or when the value is > 1
+ *  KEY_REGISTER_TARGET    val is a struct mgs_target_info; registers it
+ *  KEY_SET_FS             val is a struct super_block; mgc_fs_setup()
+ *  KEY_CLEAR_FS           no value; mgc_fs_cleanup()
+ *  KEY_SET_INFO           val is a struct mgs_send_param; sent to the MGS
+ *  KEY_MGSSEC             val is a flavor name; sets or checks the
+ *                         sptlrpc flavor used towards the MGS
+ *
+ * Returns -EINVAL for an unknown key or a value of the wrong size,
+ * otherwise the result of the selected operation.  @env and @set are not
+ * used.
+ */
+int mgc_set_info_async(const struct lu_env *env, struct obd_export *exp,
+		       obd_count keylen, void *key, obd_count vallen,
+		       void *val, struct ptlrpc_request_set *set)
+{
+	int rc = -EINVAL;
+	ENTRY;
+
+	/* Turn off initial_recov after we try all backup servers once */
+	if (KEY_IS(KEY_INIT_RECOV_BACKUP)) {
+		struct obd_import *imp = class_exp2cliimp(exp);
+		int value;
+		if (vallen != sizeof(int))
+			RETURN(-EINVAL);
+		value = *(int *)val;
+		CDEBUG(D_MGC, "InitRecov %s %d/d%d:i%d:r%d:or%d:%s\n",
+		       imp->imp_obd->obd_name, value,
+		       imp->imp_deactive, imp->imp_invalid,
+		       imp->imp_replayable, imp->imp_obd->obd_replayable,
+		       ptlrpc_import_state_name(imp->imp_state));
+		/* Resurrect if we previously died */
+		if ((imp->imp_state != LUSTRE_IMP_FULL &&
+		     imp->imp_state != LUSTRE_IMP_NEW) || value > 1)
+			ptlrpc_reconnect_import(imp);
+		RETURN(0);
+	}
+	/* FIXME move this to mgc_process_config */
+	if (KEY_IS(KEY_REGISTER_TARGET)) {
+		struct mgs_target_info *mti;
+		if (vallen != sizeof(struct mgs_target_info))
+			RETURN(-EINVAL);
+		mti = (struct mgs_target_info *)val;
+		CDEBUG(D_MGC, "register_target %s %#x\n",
+		       mti->mti_svname, mti->mti_flags);
+		rc =  mgc_target_register(exp, mti);
+		RETURN(rc);
+	}
+	if (KEY_IS(KEY_SET_FS)) {
+		struct super_block *sb = (struct super_block *)val;
+		struct lustre_sb_info *lsi;
+		if (vallen != sizeof(struct super_block))
+			RETURN(-EINVAL);
+		lsi = s2lsi(sb);
+		rc = mgc_fs_setup(exp->exp_obd, sb, lsi->lsi_srv_mnt);
+		if (rc) {
+			CERROR("set_fs got %d\n", rc);
+		}
+		RETURN(rc);
+	}
+	if (KEY_IS(KEY_CLEAR_FS)) {
+		if (vallen != 0)
+			RETURN(-EINVAL);
+		rc = mgc_fs_cleanup(exp->exp_obd);
+		if (rc) {
+			CERROR("clear_fs got %d\n", rc);
+		}
+		RETURN(rc);
+	}
+	if (KEY_IS(KEY_SET_INFO)) {
+		struct mgs_send_param *msp;
+
+		/* NOTE(review): unlike the other keys, vallen is not
+		 * checked against sizeof(*msp) here. */
+		msp = (struct mgs_send_param *)val;
+		rc =  mgc_set_mgs_param(exp, msp);
+		RETURN(rc);
+	}
+	if (KEY_IS(KEY_MGSSEC)) {
+		struct client_obd     *cli = &exp->exp_obd->u.cli;
+		struct sptlrpc_flavor  flvr;
+
+		/*
+		 * An empty string means "use the current flavor"; if none
+		 * has been set yet, it is set to null.
+		 *
+		 * If a flavor has been set previously, the requested flavor
+		 * must match the existing one.
+		 */
+		if (vallen == 0) {
+			if (cli->cl_flvr_mgc.sf_rpc != SPTLRPC_FLVR_INVALID)
+				RETURN(0);
+			val = "null";
+			vallen = 4;
+		}
+
+		rc = sptlrpc_parse_flavor(val, &flvr);
+		if (rc) {
+			CERROR("invalid sptlrpc flavor %s to MGS\n",
+			       (char *) val);
+			RETURN(rc);
+		}
+
+		/*
+		 * the caller already holds a mutex
+		 */
+		if (cli->cl_flvr_mgc.sf_rpc == SPTLRPC_FLVR_INVALID) {
+			cli->cl_flvr_mgc = flvr;
+		} else if (memcmp(&cli->cl_flvr_mgc, &flvr,
+				  sizeof(flvr)) != 0) {
+			char    str[20];
+
+			sptlrpc_flavor2name(&cli->cl_flvr_mgc,
+					    str, sizeof(str));
+			LCONSOLE_ERROR("asking sptlrpc flavor %s to MGS but "
+				       "currently %s is in use\n",
+				       (char *) val, str);
+			rc = -EPERM;
+		}
+		RETURN(rc);
+	}
+
+	RETURN(rc);
+}
+
+/*
+ * Key/value query interface of the MGC.  Only KEY_CONN_DATA is handled:
+ * the import's connect data is copied into @val, which must be exactly
+ * sizeof(struct obd_connect_data) bytes according to *@vallen.
+ * Returns 0 on success, -EINVAL for any other key or a wrong size.
+ */
+static int mgc_get_info(const struct lu_env *env, struct obd_export *exp,
+			__u32 keylen, void *key, __u32 *vallen, void *val,
+			struct lov_stripe_md *unused)
+{
+	struct obd_import *imp;
+	struct obd_connect_data *data;
+
+	if (!KEY_IS(KEY_CONN_DATA))
+		return -EINVAL;
+
+	imp = class_exp2cliimp(exp);
+	data = val;
+	if (*vallen != sizeof(*data))
+		return -EINVAL;
+
+	*data = imp->imp_connect_data;
+	return 0;
+}
+
+/*
+ * Import event handler of the MGC.  Reacts to disconnect, invalidate and
+ * (re)activation; the remaining known events are ignored and an unknown
+ * event is a bug.  Always returns 0.
+ * NOTE(review): the function ends in RETURN() but has no matching ENTRY.
+ */
+static int mgc_import_event(struct obd_device *obd,
+			    struct obd_import *imp,
+			    enum obd_import_event event)
+{
+	int rc = 0;
+
+	LASSERT(imp->imp_obd == obd);
+	CDEBUG(D_MGC, "import event %#x\n", event);
+
+	switch (event) {
+	case IMP_EVENT_DISCON:
+		/* MGC imports should not wait for recovery */
+		if (OCD_HAS_FLAG(&imp->imp_connect_data, IMP_RECOV))
+			ptlrpc_pinger_ir_down();
+		break;
+	case IMP_EVENT_INACTIVE:
+		break;
+	case IMP_EVENT_INVALIDATE: {
+		struct ldlm_namespace *ns = obd->obd_namespace;
+		ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY);
+		break;
+	}
+	case IMP_EVENT_ACTIVE:
+		CDEBUG(D_INFO, "%s: Reactivating import\n", obd->obd_name);
+		/* Clearing obd_no_recov allows us to continue pinging */
+		obd->obd_no_recov = 0;
+		/* Wake the requeue thread so locks get enqueued again. */
+		mgc_notify_active(obd);
+		if (OCD_HAS_FLAG(&imp->imp_connect_data, IMP_RECOV))
+			ptlrpc_pinger_ir_up();
+		break;
+	case IMP_EVENT_OCD:
+		break;
+	case IMP_EVENT_DEACTIVATE:
+	case IMP_EVENT_ACTIVATE:
+		break;
+	default:
+		CERROR("Unknown import event %#x\n", event);
+		LBUG();
+	}
+	RETURN(rc);
+}
+
+/*
+ * Set up the LLOG_CONFIG_REPL_CTXT llog context of the MGC in \a olg (which
+ * must be the obd's own llog group), backed by llog_client_ops and target
+ * \a tgt, then connect its initiator.
+ *
+ * Returns 0 on success.  On failure any LLOG_CONFIG_ORIG_CTXT context found
+ * on \a obd is cleaned up and the error is returned (-ENODEV when the freshly
+ * set up context cannot be looked up again).
+ */
+static int mgc_llog_init(struct obd_device *obd, struct obd_llog_group *olg,
+			 struct obd_device *tgt, int *index)
+{
+	struct llog_ctxt *repl_ctxt;
+	struct llog_ctxt *orig_ctxt;
+	int rc;
+	ENTRY;
+
+	LASSERT(olg == &obd->obd_olg);
+
+	rc = llog_setup(NULL, obd, olg, LLOG_CONFIG_REPL_CTXT, tgt,
+			&llog_client_ops);
+	if (rc != 0)
+		GOTO(out, rc);
+
+	repl_ctxt = llog_group_get_ctxt(olg, LLOG_CONFIG_REPL_CTXT);
+	if (repl_ctxt == NULL)
+		GOTO(out, rc = -ENODEV);
+
+	llog_initiator_connect(repl_ctxt);
+	llog_ctxt_put(repl_ctxt);
+
+	RETURN(0);
+out:
+	orig_ctxt = llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT);
+	if (orig_ctxt != NULL)
+		llog_cleanup(NULL, orig_ctxt);
+	RETURN(rc);
+}
+
+/*
+ * Tear down the MGC llog contexts: LLOG_CONFIG_REPL_CTXT first, then
+ * LLOG_CONFIG_ORIG_CTXT.  A context that does not exist is skipped.
+ * \a count is unused.  Always returns 0.
+ */
+static int mgc_llog_finish(struct obd_device *obd, int count)
+{
+	const int ctxt_ids[] = { LLOG_CONFIG_REPL_CTXT, LLOG_CONFIG_ORIG_CTXT };
+	struct llog_ctxt *ctxt;
+	unsigned int i;
+
+	ENTRY;
+
+	for (i = 0; i < sizeof(ctxt_ids) / sizeof(ctxt_ids[0]); i++) {
+		ctxt = llog_get_context(obd, ctxt_ids[i]);
+		if (ctxt != NULL)
+			llog_cleanup(NULL, ctxt);
+	}
+	RETURN(0);
+}
+
+/* Number of pages in the bulk buffer used by mgc_process_recover_log(). */
+enum {
+	/* first read of a log: 2^20 bytes (1 MiB) worth of pages */
+	CONFIG_READ_NRPAGES_INIT = 1 << (20 - PAGE_CACHE_SHIFT),
+	/* later, incremental reads */
+	CONFIG_READ_NRPAGES      = 4
+};
+
+/**
+ * Apply one buffer of recover-log records (struct mgs_nidtbl_entry).
+ *
+ * For every record the name of the matching osc/mdc device is built from the
+ * log name, the target index and the local instance string.  If that device
+ * exists and has an import, the connection uuid matching the first nid of the
+ * record is looked up and an LCFG_PARAM
+ * "<osc|mdc>.import=connection=<uuid>::<instance>" is handed to
+ * class_process_config().
+ *
+ * \param mgc		MGC device, only used in messages
+ * \param cld		recover config log the records belong to
+ * \param max_version	highest version a record may carry
+ * \param data		buffer holding the records
+ * \param datalen	number of valid bytes in \a data
+ * \param mne_swab	true when each record must be byte swapped first
+ *
+ * \retval 0		all records consumed (devices that are not set up yet
+ *			are skipped silently)
+ * \retval -ENOMEM	allocation failure
+ * \retval -E2BIG	instance string does not fit into one page
+ * \retval -EINVAL	malformed, oversized, unsorted or too new record
+ * \retval other	result of the last client_import_find_conn() or
+ *			class_process_config() call
+ */
+static int mgc_apply_recover_logs(struct obd_device *mgc,
+				  struct config_llog_data *cld,
+				  __u64 max_version,
+				  void *data, int datalen, bool mne_swab)
+{
+	struct config_llog_instance *cfg = &cld->cld_cfg;
+	struct lustre_sb_info       *lsi = s2lsi(cfg->cfg_sb);
+	struct mgs_nidtbl_entry *entry;
+	struct lustre_cfg       *lcfg;
+	struct lustre_cfg_bufs   bufs;
+	u64   prev_version = 0;
+	char *inst;
+	char *buf;
+	int   bufsz;
+	int   pos;
+	int   rc  = 0;
+	int   off = 0;
+	ENTRY;
+
+	LASSERT(cfg->cfg_instance != NULL);
+	LASSERT(cfg->cfg_sb == cfg->cfg_instance);
+
+	/*
+	 * One page of scratch memory: the instance string sits at the start,
+	 * the rest (buf/bufsz) is reused for the device name and the
+	 * parameter string of every record.
+	 */
+	OBD_ALLOC(inst, PAGE_CACHE_SIZE);
+	if (inst == NULL)
+		RETURN(-ENOMEM);
+
+	if (!IS_SERVER(lsi)) {
+		pos = snprintf(inst, PAGE_CACHE_SIZE, "%p", cfg->cfg_instance);
+		if (pos >= PAGE_CACHE_SIZE) {
+			OBD_FREE(inst, PAGE_CACHE_SIZE);
+			/* RETURN, not return: keep ENTRY/RETURN balanced */
+			RETURN(-E2BIG);
+		}
+	} else {
+		LASSERT(IS_MDT(lsi));
+		rc = server_name2svname(lsi->lsi_svname, inst, NULL,
+					PAGE_CACHE_SIZE);
+		if (rc) {
+			OBD_FREE(inst, PAGE_CACHE_SIZE);
+			RETURN(-EINVAL);
+		}
+		pos = strlen(inst);
+	}
+
+	/* skip the terminating NUL of the instance string */
+	++pos;
+	buf   = inst + pos;
+	bufsz = PAGE_CACHE_SIZE - pos;
+
+	while (datalen > 0) {
+		int   entry_len = sizeof(*entry);
+		int   is_ost;
+		struct obd_device *obd;
+		char *obdname;
+		char *cname;
+		char *params;
+		char *uuid;
+
+		/* every "break" below reports a malformed record */
+		rc = -EINVAL;
+		if (datalen < sizeof(*entry))
+			break;
+
+		entry = (typeof(entry))(data + off);
+
+		/* sanity check */
+		if (entry->mne_nid_type != 0) /* only support type 0 for ipv4 */
+			break;
+		if (entry->mne_nid_count == 0) /* at least one nid entry */
+			break;
+		if (entry->mne_nid_size != sizeof(lnet_nid_t))
+			break;
+
+		entry_len += entry->mne_nid_count * entry->mne_nid_size;
+		if (datalen < entry_len) /* must have entry_len at least */
+			break;
+
+		/* Keep this swab for normal mixed endian handling. LU-1644 */
+		if (mne_swab)
+			lustre_swab_mgs_nidtbl_entry(entry);
+		if (entry->mne_length > PAGE_CACHE_SIZE) {
+			CERROR("MNE too large (%u)\n", entry->mne_length);
+			break;
+		}
+
+		if (entry->mne_length < entry_len)
+			break;
+
+		/*
+		 * Advance to the next record right away so that the
+		 * "continue" statements below cannot loop on this one.
+		 */
+		off     += entry->mne_length;
+		datalen -= entry->mne_length;
+		if (datalen < 0)
+			break;
+
+		if (entry->mne_version > max_version) {
+			CERROR("entry index(%lld) is over max_index(%lld)\n",
+			       entry->mne_version, max_version);
+			break;
+		}
+
+		if (prev_version >= entry->mne_version) {
+			CERROR("index unsorted, prev %lld, now %lld\n",
+			       prev_version, entry->mne_version);
+			break;
+		}
+		prev_version = entry->mne_version;
+
+		/*
+		 * Write a string with format "nid::instance" to
+		 * lustre/<osc|mdc>/<target>-<osc|mdc>-<instance>/import.
+		 */
+
+		is_ost = entry->mne_type == LDD_F_SV_TYPE_OST;
+		memset(buf, 0, bufsz);
+		obdname = buf;
+		pos = 0;
+
+		/* lustre-OST0001-osc-<instance #> */
+		strcpy(obdname, cld->cld_logname);
+		cname = strrchr(obdname, '-');
+		if (cname == NULL) {
+			CERROR("mgc %s: invalid logname %s\n",
+			       mgc->obd_name, obdname);
+			break;
+		}
+
+		/* cut the log name at its last '-' and append the target */
+		pos = cname - obdname;
+		obdname[pos] = 0;
+		pos += sprintf(obdname + pos, "-%s%04x",
+				  is_ost ? "OST" : "MDT", entry->mne_index);
+
+		/* was terminated by a comma operator; make it a statement */
+		cname = is_ost ? "osc" : "mdc";
+		pos += sprintf(obdname + pos, "-%s-%s", cname, inst);
+		lustre_cfg_bufs_reset(&bufs, obdname);
+
+		/* find the obd by obdname */
+		obd = class_name2obd(obdname);
+		if (obd == NULL) {
+			CDEBUG(D_INFO, "mgc %s: cannot find obdname %s\n",
+			       mgc->obd_name, obdname);
+			rc = 0;
+			/* this is a safe race, when the ost is starting up...*/
+			continue;
+		}
+
+		/* osc.import = "connection=<Conn UUID>::<target instance>" */
+		++pos;
+		params = buf + pos;
+		pos += sprintf(params, "%s.import=%s", cname, "connection=");
+		uuid = buf + pos;
+
+		down_read(&obd->u.cli.cl_sem);
+		if (obd->u.cli.cl_import == NULL) {
+			/* client does not connect to the OST yet */
+			up_read(&obd->u.cli.cl_sem);
+			rc = 0;
+			continue;
+		}
+
+		/* TODO: iterate all nids to find one */
+		/* find uuid by nid */
+		rc = client_import_find_conn(obd->u.cli.cl_import,
+					     entry->u.nids[0],
+					     (struct obd_uuid *)uuid);
+		up_read(&obd->u.cli.cl_sem);
+		if (rc < 0) {
+			CERROR("mgc: cannot find uuid by nid %s\n",
+			       libcfs_nid2str(entry->u.nids[0]));
+			break;
+		}
+
+		CDEBUG(D_INFO, "Find uuid %s by nid %s\n",
+		       uuid, libcfs_nid2str(entry->u.nids[0]));
+
+		pos += strlen(uuid);
+		pos += sprintf(buf + pos, "::%u", entry->mne_instance);
+		LASSERT(pos < bufsz);
+
+		lustre_cfg_bufs_set_string(&bufs, 1, params);
+
+		rc = -ENOMEM;
+		lcfg = lustre_cfg_new(LCFG_PARAM, &bufs);
+		if (lcfg == NULL) {
+			CERROR("mgc: cannot allocate memory\n");
+			break;
+		}
+
+		CDEBUG(D_INFO, "ir apply logs "LPD64"/"LPD64" for %s -> %s\n",
+		       prev_version, max_version, obdname, params);
+
+		rc = class_process_config(lcfg);
+		lustre_cfg_free(lcfg);
+		if (rc)
+			CDEBUG(D_INFO, "process config for %s error %d\n",
+			       obdname, rc);
+
+		/* continue, even one with error */
+	}
+
+	OBD_FREE(inst, PAGE_CACHE_SIZE);
+	RETURN(rc);
+}
+
+/**
+ * Fetch the recover log of \a cld from the MGS and apply it.
+ *
+ * Called when this client was notified by the MGS that a target restarted.
+ * MGS_CONFIG_READ RPCs are sent, each returning log records through a bulk
+ * transfer into locally allocated pages, until the MGS reports that the end
+ * of the log was reached.  Every page received is handed to
+ * mgc_apply_recover_logs().
+ *
+ * The caller must hold cld->cld_lock and \a cld must be a recover log.
+ *
+ * \retval 0		log read up to its end
+ * \retval -ENOMEM	allocation failure
+ * \retval -E2BIG	log name does not fit into the request
+ * \retval -EPROTO	reply carries no mgs_config_res
+ * \retval -EINVAL	inconsistent reply from the MGS
+ * \retval other	RPC or bulk unwrap error
+ */
+static int mgc_process_recover_log(struct obd_device *obd,
+				   struct config_llog_data *cld)
+{
+	struct ptlrpc_request *req = NULL;
+	struct config_llog_instance *cfg = &cld->cld_cfg;
+	struct mgs_config_body *body;
+	struct mgs_config_res  *res;
+	struct ptlrpc_bulk_desc *desc;
+	struct page **pages;
+	int nrpages;
+	bool eof = true;
+	bool mne_swab = false;
+	int i;
+	int ealen;
+	int rc;
+	ENTRY;
+
+	/* allocate buffer for bulk transfer.
+	 * if this is the first time for this mgs to read logs,
+	 * CONFIG_READ_NRPAGES_INIT will be used since it will read all logs
+	 * once; otherwise, it only reads increment of logs, this should be
+	 * small and CONFIG_READ_NRPAGES will be used.
+	 */
+	nrpages = CONFIG_READ_NRPAGES;
+	if (cfg->cfg_last_idx == 0) /* the first time */
+		nrpages = CONFIG_READ_NRPAGES_INIT;
+
+	OBD_ALLOC(pages, sizeof(*pages) * nrpages);
+	if (pages == NULL)
+		GOTO(out, rc = -ENOMEM);
+
+	for (i = 0; i < nrpages; i++) {
+		pages[i] = alloc_page(GFP_IOFS);
+		if (pages[i] == NULL)
+			GOTO(out, rc = -ENOMEM);
+	}
+
+again:
+	LASSERT(cld_is_recover(cld));
+	LASSERT(mutex_is_locked(&cld->cld_lock));
+	req = ptlrpc_request_alloc(class_exp2cliimp(cld->cld_mgcexp),
+				   &RQF_MGS_CONFIG_READ);
+	if (req == NULL)
+		GOTO(out, rc = -ENOMEM);
+
+	rc = ptlrpc_request_pack(req, LUSTRE_MGS_VERSION, MGS_CONFIG_READ);
+	if (rc)
+		GOTO(out, rc);
+
+	/* pack request */
+	body = req_capsule_client_get(&req->rq_pill, &RMF_MGS_CONFIG_BODY);
+	LASSERT(body != NULL);
+	LASSERT(sizeof(body->mcb_name) > strlen(cld->cld_logname));
+	if (strlcpy(body->mcb_name, cld->cld_logname, sizeof(body->mcb_name))
+	    >= sizeof(body->mcb_name))
+		GOTO(out, rc = -E2BIG);
+	body->mcb_offset = cfg->cfg_last_idx + 1;
+	body->mcb_type   = cld->cld_type;
+	body->mcb_bits   = PAGE_CACHE_SHIFT;
+	body->mcb_units  = nrpages;
+
+	/* allocate bulk transfer descriptor */
+	desc = ptlrpc_prep_bulk_imp(req, nrpages, 1, BULK_PUT_SINK,
+				    MGS_BULK_PORTAL);
+	if (desc == NULL)
+		GOTO(out, rc = -ENOMEM);
+
+	for (i = 0; i < nrpages; i++)
+		ptlrpc_prep_bulk_page_pin(desc, pages[i], 0, PAGE_CACHE_SIZE);
+
+	ptlrpc_request_set_replen(req);
+	rc = ptlrpc_queue_wait(req);
+	if (rc)
+		GOTO(out, rc);
+
+	res = req_capsule_server_get(&req->rq_pill, &RMF_MGS_CONFIG_RES);
+	/* do not trust the reply to carry the field before touching it */
+	if (res == NULL)
+		GOTO(out, rc = -EPROTO);
+	if (res->mcr_size < res->mcr_offset)
+		GOTO(out, rc = -EINVAL);
+
+	/* always update the index even though it might have errors with
+	 * handling the recover logs */
+	cfg->cfg_last_idx = res->mcr_offset;
+	eof = res->mcr_offset == res->mcr_size;
+
+	CDEBUG(D_INFO, "Latest version "LPD64", more %d.\n",
+	       res->mcr_offset, eof == false);
+
+	ealen = sptlrpc_cli_unwrap_bulk_read(req, req->rq_bulk, 0);
+	if (ealen < 0)
+		GOTO(out, rc = ealen);
+
+	if (ealen > nrpages << PAGE_CACHE_SHIFT)
+		GOTO(out, rc = -EINVAL);
+
+	if (ealen == 0) { /* no logs transferred */
+		if (!eof)
+			rc = -EINVAL;
+		GOTO(out, rc);
+	}
+
+	mne_swab = !!ptlrpc_rep_need_swab(req);
+#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 2, 50, 0)
+	/* This import flag means the server did an extra swab of IR MNE
+	 * records (fixed in LU-1252), reverse it here if needed. LU-1644 */
+	if (unlikely(req->rq_import->imp_need_mne_swab))
+		mne_swab = !mne_swab;
+#else
+#warning "LU-1644: Remove old OBD_CONNECT_MNE_SWAB fixup and imp_need_mne_swab"
+#endif
+
+	/* apply the received data page by page; a failing page ends this
+	 * batch but is only warned about, rc is left untouched */
+	for (i = 0; i < nrpages && ealen > 0; i++) {
+		int rc2;
+		void *ptr;
+
+		ptr = kmap(pages[i]);
+		rc2 = mgc_apply_recover_logs(obd, cld, res->mcr_offset, ptr,
+					     min_t(int, ealen, PAGE_CACHE_SIZE),
+					     mne_swab);
+		kunmap(pages[i]);
+		if (rc2 < 0) {
+			CWARN("Process recover log %s error %d\n",
+			      cld->cld_logname, rc2);
+			break;
+		}
+
+		ealen -= PAGE_CACHE_SIZE;
+	}
+
+out:
+	if (req)
+		ptlrpc_req_finished(req);
+
+	/* more records pending on the MGS: send another request, reusing
+	 * the pages allocated above */
+	if (rc == 0 && !eof)
+		goto again;
+
+	if (pages) {
+		/* the array was zero-filled by OBD_ALLOC, so the first NULL
+		 * slot marks where page allocation stopped */
+		for (i = 0; i < nrpages; i++) {
+			if (pages[i] == NULL)
+				break;
+			__free_page(pages[i]);
+		}
+		OBD_FREE(pages, sizeof(*pages) * nrpages);
+	}
+	RETURN(rc);
+}
+
+
+/*
+ * Parse the configuration llog named by \a cld through the MGC's
+ * LLOG_CONFIG_REPL_CTXT context.  The caller must hold cld->cld_lock.
+ *
+ * local_only means it cannot get remote llogs: a sptlrpc log is then left
+ * alone (returns 0) and any other log fails with -EIO, because there is no
+ * local copy at client side.
+ *
+ * For a sptlrpc log the parse is bracketed by
+ * sptlrpc_conf_log_update_begin()/_end() and followed by
+ * class_notify_sptlrpc_conf().
+ *
+ * Returns 0 on success, -EINVAL when the llog context is missing, -ENOMEM on
+ * allocation failure, -EIO as described above, or the result of
+ * class_config_parse_llog().
+ */
+static int mgc_process_cfg_log(struct obd_device *mgc,
+			       struct config_llog_data *cld,
+			       int local_only)
+{
+	struct llog_ctxt *ctxt, *lctxt = NULL;
+	struct lvfs_run_ctxt *saved_ctxt;
+	struct lustre_sb_info *lsi = NULL;
+	int rc = 0, must_pop = 0;
+	bool sptlrpc_started = false;
+
+	ENTRY;
+
+	LASSERT(cld);
+	LASSERT(mutex_is_locked(&cld->cld_lock));
+
+	/*
+	 * local copy of sptlrpc log is controlled elsewhere, don't try to
+	 * read it up here.
+	 */
+	if (cld_is_sptlrpc(cld) && local_only)
+		RETURN(0);
+
+	if (cld->cld_cfg.cfg_sb)
+		lsi = s2lsi(cld->cld_cfg.cfg_sb);
+
+	ctxt = llog_get_context(mgc, LLOG_CONFIG_REPL_CTXT);
+	if (!ctxt) {
+		CERROR("missing llog context\n");
+		RETURN(-EINVAL);
+	}
+
+	OBD_ALLOC_PTR(saved_ctxt);
+	if (saved_ctxt == NULL) {
+		/* drop the reference taken by llog_get_context() above */
+		llog_ctxt_put(ctxt);
+		RETURN(-ENOMEM);
+	}
+
+	lctxt = llog_get_context(mgc, LLOG_CONFIG_ORIG_CTXT);
+
+	if (local_only) { /* no local log at client side */
+		GOTO(out_pop, rc = -EIO);
+	}
+
+	if (cld_is_sptlrpc(cld)) {
+		sptlrpc_conf_log_update_begin(cld->cld_logname);
+		sptlrpc_started = true;
+	}
+
+	/* logname and instance info should be the same, so use our
+	   copy of the instance for the update.  The cfg_last_idx will
+	   be updated here. */
+	rc = class_config_parse_llog(NULL, ctxt, cld->cld_logname,
+				     &cld->cld_cfg);
+	EXIT;
+
+out_pop:
+	llog_ctxt_put(ctxt);
+	if (lctxt)
+		llog_ctxt_put(lctxt);
+	if (must_pop)
+		pop_ctxt(saved_ctxt, &mgc->obd_lvfs_ctxt, NULL);
+
+	OBD_FREE_PTR(saved_ctxt);
+	/*
+	 * update settings on existing OBDs. doing it inside
+	 * of llog_process_lock so no device is attaching/detaching
+	 * in parallel.
+	 * the logname must be <fsname>-sptlrpc
+	 */
+	if (sptlrpc_started) {
+		LASSERT(cld_is_sptlrpc(cld));
+		sptlrpc_conf_log_update_end(cld->cld_logname);
+		class_notify_sptlrpc_conf(cld->cld_logname,
+					  strlen(cld->cld_logname) -
+					  strlen("-sptlrpc"));
+	}
+
+	RETURN(rc);
+}
+
+/**
+ * Get a config log from the MGS and process it.
+ *
+ * Takes cld->cld_lock, tries to enqueue the config lock for \a cld and then
+ * processes the log: a recover log is only read when the lock was obtained,
+ * any other log is parsed by mgc_process_cfg_log() (in local-only mode when
+ * the lock could not be obtained).  The config lock is cancelled again before
+ * returning so that the MGS can revoke it.
+ *
+ * Returns 0 when the log is being stopped or was processed, otherwise the
+ * error of the processing step.
+ */
+int mgc_process_log(struct obd_device *mgc, struct config_llog_data *cld)
+{
+	struct lustre_handle lockh = { 0 };
+	__u64 flags = LDLM_FL_NO_LRU;
+	int rc = 0;
+	int lock_rc;
+	ENTRY;
+
+	LASSERT(cld);
+
+	/* Serialise processing of this log: several threads running
+	   process_log on the same log at once sounds like badness. */
+	mutex_lock(&cld->cld_lock);
+	if (cld->cld_stopping) {
+		mutex_unlock(&cld->cld_lock);
+		RETURN(0);
+	}
+
+	OBD_FAIL_TIMEOUT(OBD_FAIL_MGC_PAUSE_PROCESS_LOG, 20);
+
+	CDEBUG(D_MGC, "Process log %s:%p from %d\n", cld->cld_logname,
+	       cld->cld_cfg.cfg_instance, cld->cld_cfg.cfg_last_idx + 1);
+
+	/* Get the cfg lock on the llog */
+	lock_rc = mgc_enqueue(mgc->u.cli.cl_mgc_mgsexp, NULL, LDLM_PLAIN, NULL,
+			      LCK_CR, &flags, NULL, NULL, NULL,
+			      cld, 0, NULL, &lockh);
+	if (lock_rc != 0) {
+		CDEBUG(D_MGC, "Can't get cfg lock: %d\n", lock_rc);
+
+		/* mark cld_lostlock so that it will requeue
+		 * after MGC becomes available. */
+		cld->cld_lostlock = 1;
+		/* Get extra reference, it will be put in requeue thread */
+		config_log_get(cld);
+	} else {
+		/* Get the cld, it will be released in mgc_blocking_ast. */
+		config_log_get(cld);
+		rc = ldlm_lock_set_data(&lockh, (void *)cld);
+		LASSERT(rc == 0);
+	}
+
+	if (!cld_is_recover(cld))
+		rc = mgc_process_cfg_log(mgc, cld, lock_rc != 0);
+	else if (lock_rc == 0)
+		rc = mgc_process_recover_log(mgc, cld);
+	else
+		rc = 0; /* this is not a fatal error for recover log */
+
+	CDEBUG(D_MGC, "%s: configuration from log '%s' %sed (%d).\n",
+	       mgc->obd_name, cld->cld_logname, rc ? "fail" : "succeed", rc);
+
+	mutex_unlock(&cld->cld_lock);
+
+	/* Now drop the lock so MGS can revoke it */
+	if (lock_rc == 0) {
+		lock_rc = mgc_cancel(mgc->u.cli.cl_mgc_mgsexp, NULL,
+				     LCK_CR, &lockh);
+		if (lock_rc != 0)
+			CERROR("Can't drop cfg lock: %d\n", lock_rc);
+	}
+
+	RETURN(rc);
+}
+
+
+/** Called from lustre_process_log.
+ * LCFG_LOG_START gets the config log from the MGS, processes it to start
+ * any services, and adds it to the list logs to watch (follow).
+ *
+ * Other commands handled here:
+ * - LCFG_LOV_ADD_OBD (overloaded): register the target described by the
+ *   mgs_target_info in buffer 1 with the MGS;
+ * - LCFG_LOV_DEL_OBD: not implemented, returns -ENOSYS;
+ * - LCFG_SPTLRPC_CONF: passed on to sptlrpc_process_config();
+ * - LCFG_LOG_END: stop following the log named in buffer 1.
+ * Any other command returns -EINVAL.
+ *
+ * \a buf is the struct lustre_cfg to process; \a len is not used.
+ */
+static int mgc_process_config(struct obd_device *obd, obd_count len, void *buf)
+{
+	struct lustre_cfg *lcfg = buf;
+	struct config_llog_instance *cfg = NULL;
+	char *logname;
+	int rc = 0;
+	ENTRY;
+
+	switch(lcfg->lcfg_command) {
+	case LCFG_LOV_ADD_OBD: {
+		/* Overloading this cfg command: register a new target */
+		struct mgs_target_info *mti;
+
+		if (LUSTRE_CFG_BUFLEN(lcfg, 1) !=
+		    sizeof(struct mgs_target_info))
+			GOTO(out, rc = -EINVAL);
+
+		mti = (struct mgs_target_info *)lustre_cfg_buf(lcfg, 1);
+		CDEBUG(D_MGC, "add_target %s %#x\n",
+		       mti->mti_svname, mti->mti_flags);
+		rc = mgc_target_register(obd->u.cli.cl_mgc_mgsexp, mti);
+		break;
+	}
+	case LCFG_LOV_DEL_OBD:
+		/* Unregister has no meaning at the moment. */
+		CERROR("lov_del_obd unimplemented\n");
+		rc = -ENOSYS;
+		break;
+	case LCFG_SPTLRPC_CONF: {
+		rc = sptlrpc_process_config(lcfg);
+		break;
+	}
+	case LCFG_LOG_START: {
+		struct config_llog_data *cld;
+		struct super_block *sb;
+
+		/* buffers: 1 = log name, 2 = llog instance, 3 = super block */
+		logname = lustre_cfg_string(lcfg, 1);
+		cfg = (struct config_llog_instance *)lustre_cfg_buf(lcfg, 2);
+		sb = *(struct super_block **)lustre_cfg_buf(lcfg, 3);
+
+		CDEBUG(D_MGC, "parse_log %s from %d\n", logname,
+		       cfg->cfg_last_idx);
+
+		/* We're only called through here on the initial mount */
+		rc = config_log_add(obd, logname, cfg, sb);
+		if (rc)
+			break;
+		cld = config_log_find(logname, cfg);
+		if (cld == NULL) {
+			rc = -ENOENT;
+			break;
+		}
+
+		/* COMPAT_146 */
+		/* FIXME only set this for old logs!  Right now this forces
+		   us to always skip the "inside markers" check */
+		cld->cld_cfg.cfg_flags |= CFG_F_COMPAT146;
+
+		rc = mgc_process_log(obd, cld);
+		if (rc == 0 && cld->cld_recover != NULL) {
+			/* read the recover log only when the import has the
+			 * IMP_RECOV connect flag, otherwise detach it from
+			 * the cld and drop it */
+			if (OCD_HAS_FLAG(&obd->u.cli.cl_import->
+					 imp_connect_data, IMP_RECOV)) {
+				rc = mgc_process_log(obd, cld->cld_recover);
+			} else {
+				struct config_llog_data *cir = cld->cld_recover;
+				cld->cld_recover = NULL;
+				config_log_put(cir);
+			}
+			if (rc)
+				CERROR("Cannot process recover llog %d\n", rc);
+		}
+		/* release the cld obtained from config_log_find() above */
+		config_log_put(cld);
+
+		break;
+	}
+	case LCFG_LOG_END: {
+		logname = lustre_cfg_string(lcfg, 1);
+
+		/* the llog instance in buffer 2 is optional */
+		if (lcfg->lcfg_bufcount >= 2)
+			cfg = (struct config_llog_instance *)lustre_cfg_buf(
+				lcfg, 2);
+		rc = config_log_end(logname, cfg);
+		break;
+	}
+	default: {
+		CERROR("Unknown command: %d\n", lcfg->lcfg_command);
+		GOTO(out, rc = -EINVAL);
+
+	}
+	}
+out:
+	RETURN(rc);
+}
+
+/*
+ * obd method table of the MGC, registered under LUSTRE_MGC_NAME by
+ * mgc_init().  Connection handling uses the generic client_* helpers;
+ * the enqueue and iocontrol methods are left commented out.
+ */
+struct obd_ops mgc_obd_ops = {
+	.o_owner	= THIS_MODULE,
+	.o_setup	= mgc_setup,
+	.o_precleanup   = mgc_precleanup,
+	.o_cleanup      = mgc_cleanup,
+	.o_add_conn     = client_import_add_conn,
+	.o_del_conn     = client_import_del_conn,
+	.o_connect      = client_connect_import,
+	.o_disconnect   = client_disconnect_export,
+	//.o_enqueue      = mgc_enqueue,
+	.o_cancel       = mgc_cancel,
+	//.o_iocontrol    = mgc_iocontrol,
+	.o_set_info_async = mgc_set_info_async,
+	.o_get_info       = mgc_get_info,
+	.o_import_event = mgc_import_event,
+	.o_llog_init    = mgc_llog_init,
+	.o_llog_finish  = mgc_llog_finish,
+	.o_process_config = mgc_process_config,
+};
+
+/* Module entry point: register the MGC obd type; returns the result of
+ * class_register_type(). */
+int __init mgc_init(void)
+{
+	return class_register_type(&mgc_obd_ops, NULL, NULL,
+				   LUSTRE_MGC_NAME, NULL);
+}
+
+/* Module exit point: unregister the MGC obd type again. */
+static void /*__exit*/ mgc_exit(void)
+{
+	class_unregister_type(LUSTRE_MGC_NAME);
+}
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Lustre Management Client");
+MODULE_LICENSE("GPL");
+
+module_init(mgc_init);
+module_exit(mgc_exit);

diff --git a/drivers/staging/lustre/lustre/obdclass/Makefile b/drivers/staging/lustre/lustre/obdclass/Makefile
new file mode 100644
index 0000000..b80c13c
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/Makefile

@@ -0,0 +1,13 @@
+obj-$(CONFIG_LUSTRE_FS) += obdclass.o llog_test.o
+
+obdclass-y := linux/linux-module.o linux/linux-obdo.o linux/linux-sysctl.o \
+	      llog.o llog_cat.o llog_obd.o llog_swab.o class_obd.o debug.o \
+	      genops.o uuid.o llog_ioctl.o lprocfs_status.o		   \
+	      lprocfs_jobstats.o lustre_handles.o lustre_peer.o llog_osd.o \
+	      local_storage.o statfs_pack.o obdo.o obd_config.o obd_mount.o\
+	      mea.o lu_object.o dt_object.o capa.o cl_object.o   \
+	      cl_page.o cl_lock.o cl_io.o lu_ref.o acl.o idmap.o	   \
+	      lu_ucred.o
+
+
+ccflags-y := -I$(src)/../include

diff --git a/drivers/staging/lustre/lustre/obdclass/acl.c b/drivers/staging/lustre/lustre/obdclass/acl.c
new file mode 100644
index 0000000..c2a6702
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/acl.c

@@ -0,0 +1,546 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/acl.c
+ *
+ * Lustre Access Control List.
+ *
+ * Author: Fan Yong <fanyong@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+#include <lu_object.h>
+#include <lustre_acl.h>
+#include <lustre_eacl.h>
+#include <obd_support.h>
+
+#ifdef CONFIG_FS_POSIX_ACL
+
+#define CFS_ACL_XATTR_VERSION POSIX_ACL_XATTR_VERSION
+
+/* State of an extended ACL entry, kept in its e_stat field. */
+enum {
+	ES_UNK  = 0,    /* state unknown (not classified yet) */
+	ES_UNC  = 1,    /* ACL entry is not changed */
+	ES_MOD  = 2,    /* ACL entry is modified */
+	ES_ADD  = 3,    /* ACL entry is added */
+	ES_DEL  = 4     /* ACL entry is deleted */
+};
+
+/* Convert the extended ACL entry \a s from little-endian to CPU byte order
+ * and store it in \a d: 16-bit tag and perm, 32-bit id and stat. */
+static inline void lustre_ext_acl_le_to_cpu(ext_acl_xattr_entry *d,
+					    ext_acl_xattr_entry *s)
+{
+	d->e_tag	= le16_to_cpu(s->e_tag);
+	d->e_perm       = le16_to_cpu(s->e_perm);
+	d->e_id	 = le32_to_cpu(s->e_id);
+	d->e_stat       = le32_to_cpu(s->e_stat);
+}
+
+/* Convert the extended ACL entry \a s from CPU to little-endian byte order
+ * and store it in \a d: 16-bit tag and perm, 32-bit id and stat. */
+static inline void lustre_ext_acl_cpu_to_le(ext_acl_xattr_entry *d,
+					    ext_acl_xattr_entry *s)
+{
+	d->e_tag	= cpu_to_le16(s->e_tag);
+	d->e_perm       = cpu_to_le16(s->e_perm);
+	d->e_id	 = cpu_to_le32(s->e_id);
+	d->e_stat       = cpu_to_le32(s->e_stat);
+}
+
+/* Convert the posix ACL entry \a s from little-endian to CPU byte order and
+ * store it in \a d: 16-bit tag and perm, 32-bit id. */
+static inline void lustre_posix_acl_le_to_cpu(posix_acl_xattr_entry *d,
+					      posix_acl_xattr_entry *s)
+{
+	d->e_tag	= le16_to_cpu(s->e_tag);
+	d->e_perm       = le16_to_cpu(s->e_perm);
+	d->e_id	 = le32_to_cpu(s->e_id);
+}
+
+/* Convert the posix ACL entry \a s from CPU to little-endian byte order and
+ * store it in \a d: 16-bit tag and perm, 32-bit id.  \a d and \a s may be
+ * the same entry (lustre_acl_xattr_merge2posix() converts in place). */
+static inline void lustre_posix_acl_cpu_to_le(posix_acl_xattr_entry *d,
+					      posix_acl_xattr_entry *s)
+{
+	d->e_tag	= cpu_to_le16(s->e_tag);
+	d->e_perm       = cpu_to_le16(s->e_perm);
+	d->e_id	 = cpu_to_le32(s->e_id);
+}
+
+
+/*
+ * Shrink the posix ACL buffer *header from old_count to new_count entries.
+ *
+ * When there is nothing to shrink (new_count >= old_count) the buffer is left
+ * alone and its current size is returned.  Otherwise a smaller buffer is
+ * allocated, the leading part of the old one is copied over, the old buffer
+ * is freed and *header is redirected; the new size is returned.  Even for
+ * new_count == 0 the result is a header-only buffer, never NULL.
+ *
+ * Returns -ENOMEM (with *header untouched) when the allocation fails.
+ */
+static int lustre_posix_acl_xattr_reduce_space(posix_acl_xattr_header **header,
+					       int old_count, int new_count)
+{
+	posix_acl_xattr_header *shrunk;
+	int cur_size = CFS_ACL_XATTR_SIZE(old_count, posix_acl_xattr);
+	int want_size;
+
+	if (unlikely(new_count >= old_count))
+		return cur_size;
+
+	want_size = CFS_ACL_XATTR_SIZE(new_count, posix_acl_xattr);
+	OBD_ALLOC(shrunk, want_size);
+	if (unlikely(shrunk == NULL))
+		return -ENOMEM;
+
+	memcpy(shrunk, *header, want_size);
+	OBD_FREE(*header, cur_size);
+	*header = shrunk;
+	return want_size;
+}
+
+/*
+ * Shrink the extended ACL buffer *header, which was allocated for old_count
+ * entries, down to the number of entries recorded in its a_count field.
+ *
+ * Nothing happens when a_count is not smaller than old_count.  Otherwise a
+ * buffer of the exact size is allocated, filled from the old one, and the
+ * old buffer is freed.  Even for a_count == 0 the result is a header-only
+ * buffer, never NULL.
+ *
+ * Returns 0 on success, -ENOMEM (with *header untouched) on allocation
+ * failure.
+ */
+static int lustre_ext_acl_xattr_reduce_space(ext_acl_xattr_header **header,
+					     int old_count)
+{
+	ext_acl_xattr_header *shrunk;
+	int used_count = le32_to_cpu((*header)->a_count);
+	int used_size;
+
+	if (unlikely(used_count >= old_count))
+		return 0;
+
+	used_size = CFS_ACL_XATTR_SIZE(used_count, ext_acl_xattr);
+	OBD_ALLOC(shrunk, used_size);
+	if (unlikely(shrunk == NULL))
+		return -ENOMEM;
+
+	memcpy(shrunk, *header, used_size);
+	OBD_FREE(*header, CFS_ACL_XATTR_SIZE(old_count, ext_acl_xattr));
+	*header = shrunk;
+	return 0;
+}
+
+/*
+ * Build a new extended ACL from the posix ACL \a header of \a size bytes.
+ *
+ * Tag, permission and id of every posix entry are copied unchanged and the
+ * state of each new entry is set to ES_UNK.  A \a size of 0 produces an
+ * extended ACL without entries.
+ *
+ * Returns the newly allocated extended ACL, ERR_PTR(-EINVAL) for a negative
+ * \a size or ERR_PTR(-ENOMEM) when the allocation fails.
+ */
+ext_acl_xattr_header *
+lustre_posix_acl_xattr_2ext(posix_acl_xattr_header *header, int size)
+{
+	ext_acl_xattr_header *ext;
+	int nr, idx, ext_bytes;
+	ENTRY;
+
+	if (unlikely(size < 0))
+		RETURN(ERR_PTR(-EINVAL));
+
+	nr = size ? CFS_ACL_XATTR_COUNT(size, posix_acl_xattr) : 0;
+	ext_bytes = CFS_ACL_XATTR_SIZE(nr, ext_acl_xattr);
+	OBD_ALLOC(ext, ext_bytes);
+	if (unlikely(ext == NULL))
+		RETURN(ERR_PTR(-ENOMEM));
+
+	ext->a_count = cpu_to_le32(nr);
+	for (idx = 0; idx < nr; idx++) {
+		posix_acl_xattr_entry *src = &header->a_entries[idx];
+		ext_acl_xattr_entry *dst = &ext->a_entries[idx];
+
+		/* both formats are little-endian, no conversion needed */
+		dst->e_tag  = src->e_tag;
+		dst->e_perm = src->e_perm;
+		dst->e_id   = src->e_id;
+		dst->e_stat = cpu_to_le32(ES_UNK);
+	}
+
+	RETURN(ext);
+}
+EXPORT_SYMBOL(lustre_posix_acl_xattr_2ext);
+
+/*
+ * Filter out the "nobody" entries in the posix ACL.
+ *
+ * A copy of \a header (\a size bytes) is built without the ACL_USER entries
+ * whose id is NOBODY_UID and the ACL_GROUP entries whose id is NOBODY_GID,
+ * then shrunk to the number of entries kept and stored in \a *out.
+ *
+ * Returns the size in bytes of the filtered ACL on success, 0 when \a size
+ * is 0 (\a *out is not touched), or a negative errno: -EINVAL for a negative
+ * \a size, -ENOMEM on allocation failure, -EIO for an unknown tag or for a
+ * USER_OBJ/GROUP_OBJ/MASK/OTHER entry whose id is not ACL_UNDEFINED_ID.
+ */
+int lustre_posix_acl_xattr_filter(posix_acl_xattr_header *header, int size,
+				  posix_acl_xattr_header **out)
+{
+	int count, i, j, rc = 0;
+	__u32 id;
+	posix_acl_xattr_header *new;
+	ENTRY;
+
+	if (unlikely(size < 0))
+		RETURN(-EINVAL);
+	else if (!size)
+		RETURN(0);
+
+	OBD_ALLOC(new, size);
+	if (unlikely(new == NULL))
+		RETURN(-ENOMEM);
+
+	new->a_version = cpu_to_le32(CFS_ACL_XATTR_VERSION);
+	count = CFS_ACL_XATTR_COUNT(size, posix_acl_xattr);
+	/* i walks the input entries, j counts the entries kept */
+	for (i = 0, j = 0; i < count; i++) {
+		id = le32_to_cpu(header->a_entries[i].e_id);
+		switch (le16_to_cpu(header->a_entries[i].e_tag)) {
+		case ACL_USER_OBJ:
+		case ACL_GROUP_OBJ:
+		case ACL_MASK:
+		case ACL_OTHER:
+			if (id != ACL_UNDEFINED_ID)
+				GOTO(_out, rc = -EIO);
+
+			memcpy(&new->a_entries[j++], &header->a_entries[i],
+			       sizeof(posix_acl_xattr_entry));
+			break;
+		case ACL_USER:
+			if (id != NOBODY_UID)
+				memcpy(&new->a_entries[j++],
+				       &header->a_entries[i],
+				       sizeof(posix_acl_xattr_entry));
+			break;
+		case ACL_GROUP:
+			if (id != NOBODY_GID)
+				memcpy(&new->a_entries[j++],
+				       &header->a_entries[i],
+				       sizeof(posix_acl_xattr_entry));
+			break;
+		default:
+			GOTO(_out, rc = -EIO);
+		}
+	}
+
+	/* free unused space. */
+	rc = lustre_posix_acl_xattr_reduce_space(&new, count, j);
+	if (rc >= 0) {
+		size = rc;
+		*out = new;
+		rc = 0;
+	}
+	EXIT;
+
+_out:
+	/* rc is only non-zero on error; new still has its original size then */
+	if (rc) {
+		OBD_FREE(new, size);
+		size = rc;
+	}
+	return size;
+}
+EXPORT_SYMBOL(lustre_posix_acl_xattr_filter);
+
+/*
+ * Release the posix ACL space.
+ * \a size must be the size in bytes \a header was allocated with.
+ */
+void lustre_posix_acl_xattr_free(posix_acl_xattr_header *header, int size)
+{
+	OBD_FREE(header, size);
+}
+EXPORT_SYMBOL(lustre_posix_acl_xattr_free);
+
+/*
+ * Release the extended ACL space.
+ * The size of the buffer is derived from the entry count stored in the
+ * header itself.
+ */
+void lustre_ext_acl_xattr_free(ext_acl_xattr_header *header)
+{
+	int nr_entries = le32_to_cpu(header->a_count);
+
+	OBD_FREE(header, CFS_ACL_XATTR_SIZE(nr_entries, ext_acl_xattr));
+}
+EXPORT_SYMBOL(lustre_ext_acl_xattr_free);
+
+/*
+ * Find the entry of the extended ACL \a header that has the same tag and id
+ * as the posix entry \a entry (both compared as stored, without byte
+ * swapping).
+ *
+ * The search starts at index *pos, runs to the end of the table and then
+ * wraps around to cover the entries before *pos.  On a match *pos is moved
+ * to the slot following the match (0 after the last slot) and the matching
+ * entry is returned.  Returns NULL, leaving *pos alone, when nothing matches.
+ */
+static ext_acl_xattr_entry *
+lustre_ext_acl_xattr_search(ext_acl_xattr_header *header,
+			    posix_acl_xattr_entry *entry, int *pos)
+{
+	const int count = le32_to_cpu(header->a_count);
+	const int first = *pos;
+	int pass, idx, stop;
+
+	/* pass 0 scans [first, count), pass 1 scans [0, first) */
+	for (pass = 0; pass < 2; pass++) {
+		idx  = (pass == 0) ? first : 0;
+		stop = (pass == 0) ? count : first;
+
+		for (; idx < stop; idx++) {
+			ext_acl_xattr_entry *cand = &header->a_entries[idx];
+
+			if (cand->e_tag != entry->e_tag ||
+			    cand->e_id != entry->e_id)
+				continue;
+
+			*pos = (idx + 1 >= count) ? 0 : idx + 1;
+			return cand;
+		}
+	}
+
+	return NULL;
+}
+
+/*
+ * Merge the posix ACL and the extended ACL into new posix ACL.
+ *
+ * When the extended ACL has no ACL_MASK entry, or its mask is marked ES_DEL,
+ * the result holds only the non-deleted base entries (USER_OBJ, GROUP_OBJ,
+ * OTHER) of the extended ACL.  Otherwise the result is made of the entries
+ * of \a posix_header (\a size bytes) that have no counterpart in
+ * \a ext_header, followed by every entry of \a ext_header that is not marked
+ * ES_DEL.
+ *
+ * The new ACL is stored in \a *out.  Returns its size in bytes on success or
+ * a negative errno: -EINVAL for a negative \a size, -ENOMEM on allocation
+ * failure, -EIO for a malformed extended ACL.
+ */
+int lustre_acl_xattr_merge2posix(posix_acl_xattr_header *posix_header, int size,
+				 ext_acl_xattr_header *ext_header,
+				 posix_acl_xattr_header **out)
+{
+	int posix_count, posix_size, i, j;
+	int ext_count = le32_to_cpu(ext_header->a_count), pos = 0, rc = 0;
+	posix_acl_xattr_entry pe = {ACL_MASK, 0, ACL_UNDEFINED_ID};
+	posix_acl_xattr_header *new;
+	ext_acl_xattr_entry *ee, ae;
+	ENTRY;
+
+	/* the search compares raw (little-endian) fields */
+	lustre_posix_acl_cpu_to_le(&pe, &pe);
+	ee = lustre_ext_acl_xattr_search(ext_header, &pe, &pos);
+	if (ee == NULL || le32_to_cpu(ee->e_stat) == ES_DEL) {
+		/* there are only base ACL entries at most. */
+		posix_count = 3;
+		posix_size = CFS_ACL_XATTR_SIZE(posix_count, posix_acl_xattr);
+		OBD_ALLOC(new, posix_size);
+		if (unlikely(new == NULL))
+			RETURN(-ENOMEM);
+
+		new->a_version = cpu_to_le32(CFS_ACL_XATTR_VERSION);
+		for (i = 0, j = 0; i < ext_count; i++) {
+			lustre_ext_acl_le_to_cpu(&ae,
+						 &ext_header->a_entries[i]);
+			switch (ae.e_tag) {
+			case ACL_USER_OBJ:
+			case ACL_GROUP_OBJ:
+			case ACL_OTHER:
+				if (ae.e_id != ACL_UNDEFINED_ID)
+					GOTO(_out, rc = -EIO);
+
+				if (ae.e_stat != ES_DEL) {
+					/* duplicated base entries must not
+					 * overrun the three slots allocated */
+					if (j >= posix_count)
+						GOTO(_out, rc = -EIO);
+
+					new->a_entries[j].e_tag =
+						ext_header->a_entries[i].e_tag;
+					new->a_entries[j].e_perm =
+						ext_header->a_entries[i].e_perm;
+					new->a_entries[j++].e_id =
+						ext_header->a_entries[i].e_id;
+				}
+				break;
+			case ACL_MASK:
+			case ACL_USER:
+			case ACL_GROUP:
+				if (ae.e_stat == ES_DEL)
+					break;
+				/* fallthrough: without a mask, a live extended
+				 * entry is an error */
+			default:
+				GOTO(_out, rc = -EIO);
+			}
+		}
+	} else {
+		/* maybe there are valid ACL_USER or ACL_GROUP entries in the
+		 * original server-side ACL, they are regarded as ES_UNC stat.*/
+		int ori_posix_count;
+
+		if (unlikely(size < 0))
+			RETURN(-EINVAL);
+		else if (!size)
+			ori_posix_count = 0;
+		else
+			ori_posix_count =
+				CFS_ACL_XATTR_COUNT(size, posix_acl_xattr);
+		posix_count = ori_posix_count + ext_count;
+		posix_size =
+			CFS_ACL_XATTR_SIZE(posix_count, posix_acl_xattr);
+		OBD_ALLOC(new, posix_size);
+		if (unlikely(new == NULL))
+			RETURN(-ENOMEM);
+
+		new->a_version = cpu_to_le32(CFS_ACL_XATTR_VERSION);
+		/* 1. process the unchanged ACL entries
+		 *    in the original server-side ACL. */
+		pos = 0;
+		for (i = 0, j = 0; i < ori_posix_count; i++) {
+			ee = lustre_ext_acl_xattr_search(ext_header,
+					&posix_header->a_entries[i], &pos);
+			if (ee == NULL)
+				memcpy(&new->a_entries[j++],
+				       &posix_header->a_entries[i],
+				       sizeof(posix_acl_xattr_entry));
+		}
+
+		/* 2. process the non-deleted entries
+		 *    from client-side extended ACL. */
+		for (i = 0; i < ext_count; i++) {
+			/* e_stat is a 32-bit little-endian field */
+			if (le32_to_cpu(ext_header->a_entries[i].e_stat) !=
+			    ES_DEL) {
+				new->a_entries[j].e_tag =
+						ext_header->a_entries[i].e_tag;
+				new->a_entries[j].e_perm =
+						ext_header->a_entries[i].e_perm;
+				new->a_entries[j++].e_id =
+						ext_header->a_entries[i].e_id;
+			}
+		}
+	}
+
+	/* free unused space. */
+	rc = lustre_posix_acl_xattr_reduce_space(&new, posix_count, j);
+	if (rc >= 0) {
+		posix_size = rc;
+		*out = new;
+		rc = 0;
+	}
+	EXIT;
+
+_out:
+	/* rc is only non-zero on error; new still has its original size then */
+	if (rc) {
+		OBD_FREE(new, posix_size);
+		posix_size = rc;
+	}
+	return posix_size;
+}
+EXPORT_SYMBOL(lustre_acl_xattr_merge2posix);
+
+/*
+ * Merge the posix ACL and the extended ACL into new extended ACL.
+ *
+ * Every entry of \a posix_header (\a size bytes), except the "nobody" user
+ * and group entries, is copied into the result and classified against
+ * \a ext_header: ES_MOD when a matching entry exists with different
+ * permissions, ES_UNC when the permissions are equal, ES_ADD when there is
+ * no match.  The state is also written back into the matching entry of
+ * \a ext_header.  Afterwards every entry of \a ext_header whose state is
+ * ES_UNK (again except "nobody" entries) is appended as ES_DEL.
+ *
+ * Returns the newly allocated extended ACL, or ERR_PTR(-EINVAL) for a
+ * negative \a size, ERR_PTR(-ENOMEM) on allocation failure, ERR_PTR(-EIO)
+ * for an unknown tag or for a USER_OBJ/GROUP_OBJ/MASK/OTHER entry whose id
+ * is not ACL_UNDEFINED_ID.
+ */
+ext_acl_xattr_header *
+lustre_acl_xattr_merge2ext(posix_acl_xattr_header *posix_header, int size,
+			   ext_acl_xattr_header *ext_header)
+{
+	int ori_ext_count, posix_count, ext_count, ext_size;
+	int i, j, pos = 0, rc = 0;
+	posix_acl_xattr_entry pae;
+	ext_acl_xattr_header *new;
+	ext_acl_xattr_entry *ee, eae;
+	ENTRY;
+
+	if (unlikely(size < 0))
+		RETURN(ERR_PTR(-EINVAL));
+	else if (!size)
+		posix_count = 0;
+	else
+		posix_count = CFS_ACL_XATTR_COUNT(size, posix_acl_xattr);
+	ori_ext_count = le32_to_cpu(ext_header->a_count);
+	/* worst case: no posix entry matches any extended entry */
+	ext_count = posix_count + ori_ext_count;
+	ext_size = CFS_ACL_XATTR_SIZE(ext_count, ext_acl_xattr);
+
+	OBD_ALLOC(new, ext_size);
+	if (unlikely(new == NULL))
+		RETURN(ERR_PTR(-ENOMEM));
+
+	/* i walks the posix entries, j counts the entries emitted */
+	for (i = 0, j = 0; i < posix_count; i++) {
+		lustre_posix_acl_le_to_cpu(&pae, &posix_header->a_entries[i]);
+		switch (pae.e_tag) {
+		case ACL_USER_OBJ:
+		case ACL_GROUP_OBJ:
+		case ACL_MASK:
+		case ACL_OTHER:
+			if (pae.e_id != ACL_UNDEFINED_ID)
+				GOTO(out, rc = -EIO);
+			/* fallthrough: handled like an ACL_USER entry */
+		case ACL_USER:
+			/* ignore "nobody" entry. */
+			if (pae.e_id == NOBODY_UID)
+				break;
+
+			new->a_entries[j].e_tag =
+					posix_header->a_entries[i].e_tag;
+			new->a_entries[j].e_perm =
+					posix_header->a_entries[i].e_perm;
+			new->a_entries[j].e_id =
+					posix_header->a_entries[i].e_id;
+			ee = lustre_ext_acl_xattr_search(ext_header,
+					&posix_header->a_entries[i], &pos);
+			if (ee) {
+				if (posix_header->a_entries[i].e_perm !=
+								ee->e_perm)
+					/* entry modified. */
+					ee->e_stat =
+					new->a_entries[j++].e_stat =
+							cpu_to_le32(ES_MOD);
+				else
+					/* entry unchanged. */
+					ee->e_stat =
+					new->a_entries[j++].e_stat =
+							cpu_to_le32(ES_UNC);
+			} else {
+				/* new entry. */
+				new->a_entries[j++].e_stat =
+							cpu_to_le32(ES_ADD);
+			}
+			break;
+		case ACL_GROUP:
+			/* ignore "nobody" entry. */
+			if (pae.e_id == NOBODY_GID)
+				break;
+			new->a_entries[j].e_tag =
+					posix_header->a_entries[i].e_tag;
+			new->a_entries[j].e_perm =
+					posix_header->a_entries[i].e_perm;
+			new->a_entries[j].e_id =
+					posix_header->a_entries[i].e_id;
+			ee = lustre_ext_acl_xattr_search(ext_header,
+					&posix_header->a_entries[i], &pos);
+			if (ee) {
+				if (posix_header->a_entries[i].e_perm !=
+								ee->e_perm)
+					/* entry modified. */
+					ee->e_stat =
+					new->a_entries[j++].e_stat =
+							cpu_to_le32(ES_MOD);
+				else
+					/* entry unchanged. */
+					ee->e_stat =
+					new->a_entries[j++].e_stat =
+							cpu_to_le32(ES_UNC);
+			} else {
+				/* new entry. */
+				new->a_entries[j++].e_stat =
+							cpu_to_le32(ES_ADD);
+			}
+			break;
+		default:
+			GOTO(out, rc = -EIO);
+		}
+	}
+
+	/* process deleted entries. */
+	for (i = 0; i < ori_ext_count; i++) {
+		lustre_ext_acl_le_to_cpu(&eae, &ext_header->a_entries[i]);
+		if (eae.e_stat == ES_UNK) {
+			/* ignore "nobody" entry. */
+			if ((eae.e_tag == ACL_USER && eae.e_id == NOBODY_UID) ||
+			    (eae.e_tag == ACL_GROUP && eae.e_id == NOBODY_GID))
+				continue;
+
+			new->a_entries[j].e_tag =
+						ext_header->a_entries[i].e_tag;
+			new->a_entries[j].e_perm =
+						ext_header->a_entries[i].e_perm;
+			new->a_entries[j].e_id = ext_header->a_entries[i].e_id;
+			new->a_entries[j++].e_stat = cpu_to_le32(ES_DEL);
+		}
+	}
+
+	new->a_count = cpu_to_le32(j);
+	/* free unused space. */
+	rc = lustre_ext_acl_xattr_reduce_space(&new, ext_count);
+	EXIT;
+
+out:
+	/* rc is only non-zero on error; new still has its original size then */
+	if (rc) {
+		OBD_FREE(new, ext_size);
+		new = ERR_PTR(rc);
+	}
+	return new;
+}
+EXPORT_SYMBOL(lustre_acl_xattr_merge2ext);
+
+#endif

diff --git a/drivers/staging/lustre/lustre/obdclass/capa.c b/drivers/staging/lustre/lustre/obdclass/capa.c
new file mode 100644
index 0000000..3e532f5
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/capa.c

@@ -0,0 +1,401 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/capa.c
+ *
+ * Lustre Capability Hash Management
+ *
+ * Author: Lai Siyao<lsy@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+
+#include <linux/version.h>
+#include <linux/fs.h>
+#include <asm/unistd.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/init.h>
+
+#include <obd_class.h>
+#include <lustre_debug.h>
+#include <lustre/lustre_idl.h>
+
+#include <linux/list.h>
+#include <lustre_capa.h>
+
+#define NR_CAPAHASH 32 /* buckets initialised, walked and indexed in this file */
+#define CAPA_HASH_SIZE 3000	      /* for MDS & OSS; capa_add() prunes the
+				       * server list once the server-site
+				       * count exceeds this value */
+
+struct kmem_cache *capa_cachep = NULL; /* exported; not referenced again in
+					* this file - presumably the slab
+					* behind alloc_capa(), TODO confirm */
+
+/* lock for capa hash/capa_list/fo_capa_keys */
+DEFINE_SPINLOCK(capa_lock);
+
+struct list_head capa_list[CAPA_SITE_MAX]; /* one list per capa site; the
+					    * CAPA_SITE_SERVER list gets new
+					    * and looked-up entries at its
+					    * tail and is pruned from its
+					    * head */
+
+static struct capa_hmac_alg capa_hmac_algs[] = {
+	DEF_CAPA_HMAC_ALG("sha1", SHA1, 20, 20),
+}; /* indexed by capa_alg() in capa_hmac() */
+/* capa count, per site: bumped in capa_add(), dropped in capa_delete() */
+int capa_count[CAPA_SITE_MAX] = { 0, };
+
+EXPORT_SYMBOL(capa_cachep);
+EXPORT_SYMBOL(capa_list);
+EXPORT_SYMBOL(capa_lock);
+EXPORT_SYMBOL(capa_count);
+
+/*
+ * Allocate one page worth of hash heads and initialise the first
+ * NR_CAPAHASH of them.
+ *
+ * Returns the new table, or NULL when the allocation fails.
+ */
+struct hlist_head *init_capa_hash(void)
+{
+	struct hlist_head *hash;
+	int nr_hash;
+	int slot;
+
+	OBD_ALLOC(hash, PAGE_CACHE_SIZE);
+	if (hash == NULL)
+		return NULL;
+
+	/* the page must hold more heads than are actually used */
+	nr_hash = PAGE_CACHE_SIZE / sizeof(struct hlist_head);
+	LASSERT(nr_hash > NR_CAPAHASH);
+
+	for (slot = 0; slot < NR_CAPAHASH; slot++)
+		INIT_HLIST_HEAD(&hash[slot]);
+
+	return hash;
+}
+EXPORT_SYMBOL(init_capa_hash);
+
+/* Returns 1 when @ocapa belongs to the server capa site, 0 otherwise. */
+static inline int capa_on_server(struct obd_capa *ocapa)
+{
+	if (ocapa->c_site == CAPA_SITE_SERVER)
+		return 1;
+	return 0;
+}
+
+/*
+ * Unlink a server-side capa from its hash bucket and from its site list,
+ * decrement the per-site counter and drop one reference.
+ * Both callers in this file invoke it with capa_lock held.
+ */
+static inline void capa_delete(struct obd_capa *ocapa)
+{
+	LASSERT(capa_on_server(ocapa));
+
+	/* the two unlink operations do not depend on each other */
+	list_del_init(&ocapa->c_list);
+	hlist_del_init(&ocapa->u.tgt.c_hash);
+	capa_count[ocapa->c_site] -= 1;
+
+	/* release the ref when alloc */
+	capa_put(ocapa);
+}
+
+/*
+ * Delete every capa hashed in the NR_CAPAHASH buckets of @hash while
+ * holding capa_lock, then free the page-sized table itself.
+ */
+void cleanup_capa_hash(struct hlist_head *hash)
+{
+	struct hlist_head *bucket;
+	struct hlist_node *tmp;
+	struct obd_capa *oc;
+
+	spin_lock(&capa_lock);
+	for (bucket = hash; bucket < hash + NR_CAPAHASH; bucket++) {
+		/* _safe walk: capa_delete() unlinks the current entry */
+		hlist_for_each_entry_safe(oc, tmp, bucket, u.tgt.c_hash)
+			capa_delete(oc);
+	}
+	spin_unlock(&capa_lock);
+
+	OBD_FREE(hash, PAGE_CACHE_SIZE);
+}
+EXPORT_SYMBOL(cleanup_capa_hash);
+
+/*
+ * Map @fid onto a bucket index: (oid ^ ver) * (seq + 1), reduced modulo
+ * NR_CAPAHASH.
+ */
+static inline int capa_hashfn(struct lu_fid *fid)
+{
+	unsigned long mix = fid_oid(fid) ^ fid_ver(fid);
+	unsigned long seq = (unsigned long)(fid_seq(fid) + 1);
+
+	return mix * seq % NR_CAPAHASH;
+}
+
+/*
+ * True once the current time has passed "expiry minus two thirds of the
+ * capa timeout".  The renewal check here is earlier than the one on the
+ * client, to keep a client from renewing right after obtaining a capa.
+ */
+static inline int capa_is_to_expire(struct obd_capa *oc)
+{
+	cfs_time_t now = cfs_time_current();
+
+	if (cfs_time_before(cfs_time_sub(oc->c_expiry,
+				   cfs_time_seconds(oc->c_capa.lc_timeout)*2/3),
+			    now))
+		return 1;
+	return 0;
+}
+
+/*
+ * Search bucket @head for an entry whose capa matches @capa.
+ *
+ * With @alive set only the bytes before lc_keyid are compared and entries
+ * that are about to expire are skipped; otherwise the whole structure is
+ * compared.  Returns the entry, or NULL when nothing matches.  No
+ * reference is taken here.
+ */
+static struct obd_capa *find_capa(struct lustre_capa *capa,
+				  struct hlist_head *head, int alive)
+{
+	struct obd_capa *ocapa;
+	int len;
+
+	if (alive)
+		len = offsetof(struct lustre_capa, lc_keyid);
+	else
+		len = sizeof(*capa);
+
+	hlist_for_each_entry(ocapa, head, u.tgt.c_hash) {
+		if (memcmp(&ocapa->c_capa, capa, len) == 0 &&
+		    !(alive && capa_is_to_expire(ocapa))) {
+			LASSERT(capa_on_server(ocapa));
+
+			DEBUG_CAPA(D_SEC, &ocapa->c_capa, "found");
+			return ocapa;
+		}
+		/* mismatch, or a match that will expire soon: keep looking */
+	}
+
+	return NULL;
+}
+
+#define LRU_CAPA_DELETE_COUNT 12
+/*
+ * Examine at most LRU_CAPA_DELETE_COUNT entries from the front of @head and
+ * delete those whose reference count reads as zero.
+ *
+ * The walk also stops when it gets back to @head, so a list shorter than
+ * LRU_CAPA_DELETE_COUNT never has its head treated as an obd_capa.
+ * The only caller in this file invokes it with capa_lock held.
+ */
+static inline void capa_delete_lru(struct list_head *head)
+{
+	struct obd_capa *ocapa;
+	struct list_head *node = head->next;
+	int count = 0;
+
+	while (node != head && count++ < LRU_CAPA_DELETE_COUNT) {
+		ocapa = list_entry(node, struct obd_capa, c_list);
+		/* step first: capa_delete() unlinks ocapa from this list */
+		node = node->next;
+		if (atomic_read(&ocapa->c_refc))
+			continue;
+
+		DEBUG_CAPA(D_SEC, &ocapa->c_capa, "free lru");
+		capa_delete(ocapa);
+	}
+}
+
+/*
+ * Add @capa to @hash, or return the entry that is already there.
+ *
+ * A fresh obd_capa is allocated up front.  If an identical capa (full
+ * comparison) is already hashed, the fresh one is dropped and the existing
+ * entry is returned with an extra reference.  Otherwise the fresh entry is
+ * filled in, hashed, appended to the server list and returned with an extra
+ * reference; once the server-site count exceeds CAPA_HASH_SIZE the list is
+ * pruned.  Returns NULL when the allocation fails.
+ */
+struct obd_capa *capa_add(struct hlist_head *hash, struct lustre_capa *capa)
+{
+	struct hlist_head *head = hash + capa_hashfn(&capa->lc_fid);
+	struct list_head *list = &capa_list[CAPA_SITE_SERVER];
+	struct obd_capa *ocapa;
+	struct obd_capa *old;
+
+	ocapa = alloc_capa(CAPA_SITE_SERVER);
+	if (IS_ERR(ocapa))
+		return NULL;
+
+	spin_lock(&capa_lock);
+	old = find_capa(capa, head, 0);
+	if (old != NULL) {
+		/* already present: hand out the existing entry instead */
+		capa_get(old);
+		spin_unlock(&capa_lock);
+		capa_put(ocapa);
+		return old;
+	}
+
+	ocapa->c_capa = *capa;
+	set_capa_expiry(ocapa);
+	hlist_add_head(&ocapa->u.tgt.c_hash, head);
+	list_add_tail(&ocapa->c_list, list);
+	capa_get(ocapa);
+	capa_count[CAPA_SITE_SERVER]++;
+	if (capa_count[CAPA_SITE_SERVER] > CAPA_HASH_SIZE)
+		capa_delete_lru(list);
+	spin_unlock(&capa_lock);
+
+	return ocapa;
+}
+EXPORT_SYMBOL(capa_add);
+
+/*
+ * Look @capa up in @hash under capa_lock (see find_capa() for the meaning
+ * of @alive).  A hit is moved to the tail of the server list and returned
+ * with an extra reference; NULL is returned on a miss.
+ */
+struct obd_capa *capa_lookup(struct hlist_head *hash, struct lustre_capa *capa,
+			     int alive)
+{
+	struct hlist_head *bucket = hash + capa_hashfn(&capa->lc_fid);
+	struct obd_capa *found;
+
+	spin_lock(&capa_lock);
+	found = find_capa(capa, bucket, alive);
+	if (found != NULL) {
+		list_move_tail(&found->c_list,
+				   &capa_list[CAPA_SITE_SERVER]);
+		capa_get(found);
+	}
+	spin_unlock(&capa_lock);
+
+	return found;
+}
+EXPORT_SYMBOL(capa_lookup);
+
+/*
+ * Compute the HMAC of @capa with @key and store it in @hmac.
+ *
+ * Only the leading part of the capability, up to but not including
+ * lc_hmac, is fed to the hash.
+ *
+ * Returns 0 on success, -EFAULT when the capability names an algorithm
+ * other than CAPA_HMAC_ALG_SHA1, and -ENOMEM when the hash transform
+ * cannot be allocated.
+ */
+int capa_hmac(__u8 *hmac, struct lustre_capa *capa, __u8 *key)
+{
+	struct ll_crypto_hash *tfm;
+	struct capa_hmac_alg  *alg;
+	int keylen;
+	struct scatterlist sl;
+
+	if (capa_alg(capa) != CAPA_HMAC_ALG_SHA1) {
+		CERROR("unknown capability hmac algorithm!\n");
+		return -EFAULT;
+	}
+
+	alg = &capa_hmac_algs[capa_alg(capa)];
+
+	tfm = ll_crypto_alloc_hash(alg->ha_name, 0, 0);
+	/*
+	 * The allocator is defined outside this file; the block cipher
+	 * allocator used below in this file reports failure through
+	 * ERR_PTR(), so accept both NULL and an error pointer here.
+	 */
+	if (tfm == NULL || IS_ERR(tfm)) {
+		CERROR("crypto_alloc_tfm failed, check whether your kernel "
+		       "has crypto support!\n");
+		return -ENOMEM;
+	}
+	keylen = alg->ha_keylen;
+
+	/* sl lives on the stack: initialise it before attaching a page */
+	sg_init_table(&sl, 1);
+	sg_set_page(&sl, virt_to_page(capa),
+		    offsetof(struct lustre_capa, lc_hmac),
+		    (unsigned long)(capa) % PAGE_CACHE_SIZE);
+
+	ll_crypto_hmac(tfm, key, &keylen, &sl, sl.length, hmac);
+	ll_crypto_free_hash(tfm);
+
+	return 0;
+}
+EXPORT_SYMBOL(capa_hmac);
+
+/*
+ * AES-encrypt one 16-byte block.
+ *
+ * \param d       buffer passed as the first scatterlist of the cipher call
+ *                (presumably the destination - confirm against the
+ *                ll_crypto_blkcipher_encrypt() wrapper)
+ * \param s       buffer passed as the second scatterlist (presumably the
+ *                source)
+ * \param key     key material; only the cipher's minimum key size is used
+ * \param keylen  number of bytes available at \a key
+ *
+ * \retval 0      success
+ * \retval -ve    the transform could not be allocated, \a keylen is
+ *                negative or below the cipher's minimum key size, or
+ *                setkey/encrypt failed
+ */
+int capa_encrypt_id(__u32 *d, __u32 *s, __u8 *key, int keylen)
+{
+	struct ll_crypto_cipher *tfm;
+	struct scatterlist sd;
+	struct scatterlist ss;
+	struct blkcipher_desc desc;
+	unsigned int min;
+	int rc;
+	char alg[CRYPTO_MAX_ALG_NAME+1] = "aes";
+	ENTRY;
+
+	/* passing "aes" in a variable instead of a constant string keeps gcc
+	 * 4.3.2 happy */
+	tfm = ll_crypto_alloc_blkcipher(alg, 0, 0 );
+	if (IS_ERR(tfm)) {
+		CERROR("failed to load transform for aes\n");
+		RETURN(PTR_ERR(tfm));
+	}
+
+	min = ll_crypto_tfm_alg_min_keysize(tfm);
+	/* reject negative lengths explicitly: a plain signed/unsigned
+	 * comparison would turn them into huge values that pass */
+	if (keylen < 0 || (unsigned int)keylen < min) {
+		CERROR("keylen at least %u bits for aes\n", min * 8);
+		GOTO(out, rc = -EINVAL);
+	}
+
+	rc = ll_crypto_blkcipher_setkey(tfm, key, min);
+	if (rc) {
+		CERROR("failed to setting key for aes\n");
+		GOTO(out, rc);
+	}
+
+	/* both scatterlists live on the stack: initialise before use */
+	sg_init_table(&sd, 1);
+	sg_set_page(&sd, virt_to_page(d), 16,
+		    (unsigned long)(d) % PAGE_CACHE_SIZE);
+
+	sg_init_table(&ss, 1);
+	sg_set_page(&ss, virt_to_page(s), 16,
+		    (unsigned long)(s) % PAGE_CACHE_SIZE);
+	desc.tfm   = tfm;
+	desc.info  = NULL;
+	desc.flags = 0;
+	rc = ll_crypto_blkcipher_encrypt(&desc, &sd, &ss, 16);
+	if (rc) {
+		CERROR("failed to encrypt for aes\n");
+		GOTO(out, rc);
+	}
+
+	EXIT;
+
+out:
+	ll_crypto_free_blkcipher(tfm);
+	return rc;
+}
+EXPORT_SYMBOL(capa_encrypt_id);
+
+/*
+ * AES-decrypt one 16-byte block.
+ *
+ * \param d       buffer passed as the first scatterlist of the cipher call
+ *                (presumably the destination - confirm against the
+ *                ll_crypto_blkcipher_decrypt() wrapper)
+ * \param s       buffer passed as the second scatterlist (presumably the
+ *                source)
+ * \param key     key material; only the cipher's minimum key size is used
+ * \param keylen  number of bytes available at \a key
+ *
+ * \retval 0      success
+ * \retval -ve    the transform could not be allocated, \a keylen is
+ *                negative or below the cipher's minimum key size, or
+ *                setkey/decrypt failed
+ */
+int capa_decrypt_id(__u32 *d, __u32 *s, __u8 *key, int keylen)
+{
+	struct ll_crypto_cipher *tfm;
+	struct scatterlist sd;
+	struct scatterlist ss;
+	struct blkcipher_desc desc;
+	unsigned int min;
+	int rc;
+	char alg[CRYPTO_MAX_ALG_NAME+1] = "aes";
+	ENTRY;
+
+	/* passing "aes" in a variable instead of a constant string keeps gcc
+	 * 4.3.2 happy */
+	tfm = ll_crypto_alloc_blkcipher(alg, 0, 0 );
+	if (IS_ERR(tfm)) {
+		CERROR("failed to load transform for aes\n");
+		RETURN(PTR_ERR(tfm));
+	}
+
+	min = ll_crypto_tfm_alg_min_keysize(tfm);
+	/* reject negative lengths explicitly: a plain signed/unsigned
+	 * comparison would turn them into huge values that pass */
+	if (keylen < 0 || (unsigned int)keylen < min) {
+		CERROR("keylen at least %u bits for aes\n", min * 8);
+		GOTO(out, rc = -EINVAL);
+	}
+
+	rc = ll_crypto_blkcipher_setkey(tfm, key, min);
+	if (rc) {
+		CERROR("failed to setting key for aes\n");
+		GOTO(out, rc);
+	}
+
+	/* both scatterlists live on the stack: initialise before use */
+	sg_init_table(&sd, 1);
+	sg_set_page(&sd, virt_to_page(d), 16,
+		    (unsigned long)(d) % PAGE_CACHE_SIZE);
+
+	sg_init_table(&ss, 1);
+	sg_set_page(&ss, virt_to_page(s), 16,
+		    (unsigned long)(s) % PAGE_CACHE_SIZE);
+
+	desc.tfm   = tfm;
+	desc.info  = NULL;
+	desc.flags = 0;
+	rc = ll_crypto_blkcipher_decrypt(&desc, &sd, &ss, 16);
+	if (rc) {
+		CERROR("failed to decrypt for aes\n");
+		GOTO(out, rc);
+	}
+
+	EXIT;
+
+out:
+	ll_crypto_free_blkcipher(tfm);
+	return rc;
+}
+EXPORT_SYMBOL(capa_decrypt_id);
+
+/*
+ * Copy the lustre_capa embedded in @ocapa into the buffer at @capa while
+ * holding the capa's own c_lock.
+ */
+void capa_cpy(void *capa, struct obd_capa *ocapa)
+{
+	struct lustre_capa *dst = capa;
+
+	spin_lock(&ocapa->c_lock);
+	*dst = ocapa->c_capa;
+	spin_unlock(&ocapa->c_lock);
+}
+EXPORT_SYMBOL(capa_cpy);
+
+/*
+ * Emit a debug message built from the caller's printf-style @fmt and
+ * arguments, followed by a fixed dump of the fields of capability @c.
+ */
+void _debug_capa(struct lustre_capa *c,
+		 struct libcfs_debug_msg_data *msgdata,
+		 const char *fmt, ... )
+{
+	va_list ap;
+
+	va_start(ap, fmt);
+	libcfs_debug_vmsg2(msgdata, fmt, ap,
+			   " capability@%p fid "DFID" opc "LPX64" uid "LPU64
+			   " gid "LPU64" flags %u alg %d keyid %u timeout %u "
+			   "expiry %u\n", c, PFID(capa_fid(c)), capa_opc(c),
+			   capa_uid(c), capa_gid(c), capa_flags(c),
+			   capa_alg(c), capa_keyid(c), capa_timeout(c),
+			   capa_expiry(c));
+	va_end(ap);
+}
+EXPORT_SYMBOL(_debug_capa);

diff --git a/drivers/staging/lustre/lustre/obdclass/cl_internal.h b/drivers/staging/lustre/lustre/obdclass/cl_internal.h
new file mode 100644
index 0000000..7eb0ad7
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/cl_internal.h

@@ -0,0 +1,121 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Internal cl interfaces.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+#ifndef _CL_INTERNAL_H
+#define _CL_INTERNAL_H
+
+#define CLT_PVEC_SIZE (14) /* number of slots in cl_thread_info::clt_pvec */
+
+/**
+ * Possible levels of the nesting. Currently this is 2: there are "top"
+ * entities (files, extent locks), and "sub" entities (stripes and stripe
+ * locks). This is used only for debugging counters right now.
+ *
+ * CNL_NR is not a level itself; it is the number of levels and sizes
+ * cl_thread_info::clt_counters.
+ */
+enum clt_nesting_level {
+	CNL_TOP, /* "top" entities */
+	CNL_SUB, /* "sub" entities */
+	CNL_NR /* number of levels */
+};
+
+/**
+ * Counters used to check correctness of cl_lock interface usage.
+ *
+ * struct cl_thread_info keeps one instance per nesting level (see
+ * enum clt_nesting_level).
+ */
+struct cl_thread_counters {
+	/**
+	 * Number of outstanding calls to cl_lock_mutex_get() made by the
+	 * current thread. For debugging.
+	 */
+	int	   ctc_nr_locks_locked;
+	/** List of locked locks. */
+	struct lu_ref ctc_locks_locked;
+	/** Number of outstanding holds on locks. */
+	int	   ctc_nr_held;
+	/** Number of outstanding uses on locks. */
+	int	   ctc_nr_used;
+	/**
+	 * Number of held extent locks. cl_io_unlock() asserts that the
+	 * CNL_TOP instance of this counter is zero once it is done.
+	 */
+	int	   ctc_nr_locks_acquired;
+};
+
+/**
+ * Thread local state internal for generic cl-code.
+ *
+ * Obtained from an environment with cl_env_info().
+ */
+struct cl_thread_info {
+	/*
+	 * Common fields.
+	 */
+	struct cl_io	 clt_io;
+	struct cl_2queue     clt_queue;
+
+	/*
+	 * Fields used by cl_lock.c
+	 */
+	struct cl_lock_descr clt_descr;
+	struct cl_page_list  clt_list;
+	/**
+	 * Counters for every level of lock nesting, indexed by
+	 * enum clt_nesting_level.
+	 */
+	struct cl_thread_counters clt_counters[CNL_NR];
+	/** @} debugging (NOTE(review): no matching group opener is visible
+	 * in this structure) */
+
+	/*
+	 * Fields used by cl_page.c
+	 */
+	struct cl_page      *clt_pvec[CLT_PVEC_SIZE];
+
+	/*
+	 * Fields used by cl_io.c
+	 */
+	/**
+	 * Pointer to the topmost ongoing IO in this thread.
+	 *
+	 * Set by cl_io_init() (and by cl_io_sub_init() when still unset),
+	 * reset to NULL by cl_io_fini() for the io it points to, and tested
+	 * by cl_io_is_going().
+	 */
+	struct cl_io	*clt_current_io;
+	/**
+	 * Used for submitting a sync io.
+	 */
+	struct cl_sync_io    clt_anchor;
+	/**
+	 * Fields used by cl_lock_discard_pages().
+	 */
+	pgoff_t	      clt_next_index;
+	pgoff_t	      clt_fn_index; /* first non-overlapped index */
+};
+
+struct cl_thread_info *cl_env_info(const struct lu_env *env);
+
+#endif /* _CL_INTERNAL_H */

diff --git a/drivers/staging/lustre/lustre/obdclass/cl_io.c b/drivers/staging/lustre/lustre/obdclass/cl_io.c
new file mode 100644
index 0000000..75c9be8
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/cl_io.c

@@ -0,0 +1,1753 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Client IO.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre_fid.h>
+#include <linux/list.h>
+#include <cl_object.h>
+#include "cl_internal.h"
+
+/*****************************************************************************
+ *
+ * cl_io interface.
+ *
+ */
+
+/*
+ * Walk the layer slices attached to \a io through cl_io::ci_layers, in list
+ * order and in reverse list order respectively. The doc comments in this
+ * file call the forward direction "top-to-bottom".
+ *
+ * Both arguments are parenthesised so that any pointer expression can be
+ * passed as \a io.
+ */
+#define cl_io_for_each(slice, io) \
+	list_for_each_entry((slice), &(io)->ci_layers, cis_linkage)
+#define cl_io_for_each_reverse(slice, io)		 \
+	list_for_each_entry_reverse((slice), &(io)->ci_layers, cis_linkage)
+
+/* True when \a type lies in the range [CIT_READ, CIT_OP_NR). */
+static inline int cl_io_type_is_valid(enum cl_io_type type)
+{
+	if (type < CIT_READ)
+		return 0;
+	return type < CIT_OP_NR;
+}
+
+/* True when the type of \a io is valid and is not CIT_MISC. */
+static inline int cl_io_is_loopable(const struct cl_io *io)
+{
+	if (!cl_io_type_is_valid(io->ci_type))
+		return 0;
+	return io->ci_type != CIT_MISC;
+}
+
+/**
+ * Tells whether the thread info of \a env currently records an ongoing IO,
+ * i.e. whether its clt_current_io pointer is set.
+ */
+int cl_io_is_going(const struct lu_env *env)
+{
+	const struct cl_thread_info *info = cl_env_info(env);
+
+	return info->clt_current_io != NULL;
+}
+EXPORT_SYMBOL(cl_io_is_going);
+
+/**
+ * Invariant checked on entry to the exported cl_io_*() functions.
+ *
+ * An io may own pages only while it is ongoing (CIS_IO_GOING). A sub-io,
+ * i.e. one with a parent, is additionally allowed to still be in
+ * CIS_LOCKED state, because the top-io can already be in CIS_IO_GOING.
+ */
+static int cl_io_invariant(const struct cl_io *io)
+{
+	return ergo(io->ci_owned_nr > 0, io->ci_state == CIS_IO_GOING ||
+		    (io->ci_state == CIS_LOCKED && io->ci_parent != NULL));
+}
+
+/**
+ * Finalize \a io: detach its layer slices one by one, starting from the end
+ * of the layer list, calling cl_io_operations::cio_fini() for each layer
+ * that provides it (bottom-to-top).
+ *
+ * Afterwards the io is in CIS_FINI state and is no longer recorded as the
+ * current io of the thread.
+ */
+void cl_io_fini(const struct lu_env *env, struct cl_io *io)
+{
+	struct cl_io_slice    *slice;
+	struct cl_thread_info *info;
+
+	LINVRNT(cl_io_type_is_valid(io->ci_type));
+	LINVRNT(cl_io_invariant(io));
+	ENTRY;
+
+	for (;;) {
+		if (list_empty(&io->ci_layers))
+			break;
+
+		slice = list_entry(io->ci_layers.prev, struct cl_io_slice,
+				   cis_linkage);
+		list_del_init(&slice->cis_linkage);
+		if (slice->cis_iop->op[io->ci_type].cio_fini != NULL)
+			slice->cis_iop->op[io->ci_type].cio_fini(env, slice);
+		/*
+		 * Invalidate slice to catch use after free. This assumes that
+		 * slices are allocated within session and can be touched
+		 * after ->cio_fini() returns.
+		 */
+		slice->cis_io = NULL;
+	}
+
+	io->ci_state = CIS_FINI;
+
+	info = cl_env_info(env);
+	if (info->clt_current_io == io)
+		info->clt_current_io = NULL;
+
+	/* sanity check for layout change */
+	switch (io->ci_type) {
+	case CIT_FAULT:
+	case CIT_FSYNC:
+		LASSERT(!io->ci_need_restart);
+		break;
+	case CIT_SETATTR:
+	case CIT_MISC:
+		/* Check ignore layout change conf */
+		LASSERT(ergo(io->ci_ignore_layout || !io->ci_verify_layout,
+				!io->ci_need_restart));
+		break;
+	case CIT_READ:
+	case CIT_WRITE:
+		/* nothing to verify for read and write */
+		break;
+	default:
+		LBUG();
+	}
+	EXIT;
+}
+EXPORT_SYMBOL(cl_io_fini);
+
+/*
+ * Common part of cl_io_init() and cl_io_sub_init(): record the io type,
+ * reset the lock set and the layer list, then call the coo_io_init()
+ * method of every layer of \a obj that has one. The first non-zero return
+ * stops the walk and is returned; on success the io enters CIS_INIT.
+ */
+static int cl_io_init0(const struct lu_env *env, struct cl_io *io,
+		       enum cl_io_type iot, struct cl_object *obj)
+{
+	struct cl_object *scan;
+	int result = 0;
+
+	LINVRNT(io->ci_state == CIS_ZERO || io->ci_state == CIS_FINI);
+	LINVRNT(cl_io_type_is_valid(iot));
+	LINVRNT(cl_io_invariant(io));
+	ENTRY;
+
+	io->ci_type = iot;
+	INIT_LIST_HEAD(&io->ci_layers);
+	INIT_LIST_HEAD(&io->ci_lockset.cls_todo);
+	INIT_LIST_HEAD(&io->ci_lockset.cls_curr);
+	INIT_LIST_HEAD(&io->ci_lockset.cls_done);
+
+	cl_object_for_each(scan, obj) {
+		if (scan->co_ops->coo_io_init == NULL)
+			continue;
+		result = scan->co_ops->coo_io_init(env, scan, io);
+		if (result != 0)
+			break;
+	}
+
+	if (result == 0)
+		io->ci_state = CIS_INIT;
+	RETURN(result);
+}
+
+/**
+ * Initialize a sub-io by running the per-layer init methods of \a obj.
+ *
+ * If the thread has no current io recorded yet, \a io becomes the current
+ * one.
+ *
+ * \pre obj != cl_object_top(obj)
+ */
+int cl_io_sub_init(const struct lu_env *env, struct cl_io *io,
+		   enum cl_io_type iot, struct cl_object *obj)
+{
+	struct cl_thread_info *info;
+	int rc;
+
+	info = cl_env_info(env);
+	LASSERT(obj != cl_object_top(obj));
+
+	if (!info->clt_current_io)
+		info->clt_current_io = io;
+
+	rc = cl_io_init0(env, io, iot, obj);
+	return rc;
+}
+EXPORT_SYMBOL(cl_io_sub_init);
+
+/**
+ * Initialize a top-level \a io by running the per-layer init methods of
+ * \a obj, and record it as the current io of the thread.
+ *
+ * cl_io_fini() must be called afterwards whatever this function returned.
+ *
+ * \pre obj == cl_object_top(obj)
+ * \pre cl_io_type_is_valid(iot)
+ * \post cl_io_type_is_valid(io->ci_type) && io->ci_type == iot
+ */
+int cl_io_init(const struct lu_env *env, struct cl_io *io,
+	       enum cl_io_type iot, struct cl_object *obj)
+{
+	struct cl_thread_info *info;
+	int rc;
+
+	info = cl_env_info(env);
+	LASSERT(obj == cl_object_top(obj));
+	LASSERT(info->clt_current_io == NULL);
+
+	info->clt_current_io = io;
+
+	rc = cl_io_init0(env, io, iot, obj);
+	return rc;
+}
+EXPORT_SYMBOL(cl_io_init);
+
+/**
+ * Initialize a read or a write io covering \a count bytes starting at
+ * \a pos, on the object already stored in io->ci_obj.
+ *
+ * \pre iot == CIT_READ || iot == CIT_WRITE
+ */
+int cl_io_rw_init(const struct lu_env *env, struct cl_io *io,
+		  enum cl_io_type iot, loff_t pos, size_t count)
+{
+	int rc;
+
+	LINVRNT(iot == CIT_READ || iot == CIT_WRITE);
+	LINVRNT(io->ci_obj != NULL);
+	ENTRY;
+
+	LU_OBJECT_HEADER(D_VFSTRACE, env, &io->ci_obj->co_lu,
+			 "io range: %u ["LPU64", "LPU64") %u %u\n",
+			 iot, (__u64)pos, (__u64)pos + count,
+			 io->u.ci_rw.crw_nonblock, io->u.ci_wr.wr_append);
+
+	io->u.ci_rw.crw_count  = count;
+	io->u.ci_rw.crw_pos    = pos;
+
+	rc = cl_io_init(env, io, iot, io->ci_obj);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(cl_io_rw_init);
+
+/* Returns the fid of the object that lock description \a descr refers to. */
+static inline const struct lu_fid *
+cl_lock_descr_fid(const struct cl_lock_descr *descr)
+{
+	const struct cl_object *obj = descr->cld_obj;
+
+	return lu_object_fid(&obj->co_lu);
+}
+
+/*
+ * Ordering used for sorting lock descriptions: compare the object fids
+ * first and fall back to the start offsets when the fids are equal.
+ */
+static int cl_lock_descr_sort(const struct cl_lock_descr *d0,
+			      const struct cl_lock_descr *d1)
+{
+	int rc;
+
+	rc = lu_fid_cmp(cl_lock_descr_fid(d0), cl_lock_descr_fid(d1));
+	if (rc != 0)
+		return rc;
+
+	return __diff_normalize(d0->cld_start, d1->cld_start);
+}
+
+/*
+ * Compare two lock descriptions for the purpose of merging.
+ *
+ * \retval non-zero  the descriptions are on different objects (the result
+ *                   of lu_fid_cmp() is returned)
+ * \retval -1        same object, \a d0 ends before \a d1 starts
+ * \retval +1        same object, \a d0 starts after \a d1 ends
+ * \retval  0        same object and the two extents overlap
+ */
+static int cl_lock_descr_cmp(const struct cl_lock_descr *d0,
+			     const struct cl_lock_descr *d1)
+{
+	int ret;
+
+	ret = lu_fid_cmp(cl_lock_descr_fid(d0), cl_lock_descr_fid(d1));
+	if (ret)
+		return ret;
+	if (d0->cld_end < d1->cld_start)
+		return -1;
+	/*
+	 * Compare against the end of d1, not of d0: comparing d0 with
+	 * itself can never be true for a sane extent, which made disjoint
+	 * extents lying after d1 look like overlapping ones.
+	 */
+	if (d0->cld_start > d1->cld_end)
+		return 1;
+	return 0;
+}
+
+/*
+ * Widen \a d0 so that its extent also covers the extent of \a d1, and
+ * switch its mode to CLM_WRITE or CLM_GROUP when \a d1 has that mode and
+ * \a d0 does not.
+ */
+static void cl_lock_descr_merge(struct cl_lock_descr *d0,
+				const struct cl_lock_descr *d1)
+{
+	if (d1->cld_start < d0->cld_start)
+		d0->cld_start = d1->cld_start;
+	if (d1->cld_end > d0->cld_end)
+		d0->cld_end = d1->cld_end;
+
+	switch (d1->cld_mode) {
+	case CLM_WRITE:
+		if (d0->cld_mode != CLM_WRITE)
+			d0->cld_mode = CLM_WRITE;
+		break;
+	case CLM_GROUP:
+		if (d0->cld_mode != CLM_GROUP)
+			d0->cld_mode = CLM_GROUP;
+		break;
+	default:
+		/* any other mode of d1 leaves the mode of d0 alone */
+		break;
+	}
+}
+
+/*
+ * Sort locks in lexicographical order of their (fid, start-offset) pairs.
+ *
+ * Works on the cls_todo list of the io's lock set, ordering entries with
+ * cl_lock_descr_sort(). Passes over the list are repeated until one pass
+ * completes without moving any entry. Two entries that compare equal, or
+ * any comparison result other than -1/+1, trigger LBUG().
+ */
+static void cl_io_locks_sort(struct cl_io *io)
+{
+	int done = 0;
+
+	ENTRY;
+	/* hidden treasure: bubble sort for now. */
+	do {
+		struct cl_io_lock_link *curr;
+		struct cl_io_lock_link *prev;
+		struct cl_io_lock_link *temp;
+
+		done = 1; /* cleared again as soon as this pass moves an entry */
+		prev = NULL; /* the first entry of a pass has no predecessor */
+
+		list_for_each_entry_safe(curr, temp,
+					     &io->ci_lockset.cls_todo,
+					     cill_linkage) {
+			if (prev != NULL) {
+				switch (cl_lock_descr_sort(&prev->cill_descr,
+							  &curr->cill_descr)) {
+				case 0:
+					/*
+					 * IMPOSSIBLE: Identical locks are
+					 *	     already removed at
+					 *	     this point.
+					 *
+					 * Deliberately falls through to
+					 * LBUG() below.
+					 */
+				default:
+					LBUG();
+				case +1: /* prev sorts after curr: re-link
+					  * curr right in front of prev */
+					list_move_tail(&curr->cill_linkage,
+							   &prev->cill_linkage);
+					done = 0;
+					continue; /* don't change prev: it's
+						   * still "previous" */
+				case -1: /* already in order */
+					break;
+				}
+			}
+			prev = curr;
+		}
+	} while (!done);
+	EXIT;
+}
+
+/**
+ * Scan \a queue for a lock link whose description matches \a need
+ * according to cl_lock_descr_match().
+ *
+ * \retval +ve there is a matching lock in the \a queue
+ * \retval   0 there are no matching locks in the \a queue
+ */
+int cl_queue_match(const struct list_head *queue,
+		   const struct cl_lock_descr *need)
+{
+	struct cl_io_lock_link *scan;
+	int found = 0;
+
+	ENTRY;
+	list_for_each_entry(scan, queue, cill_linkage) {
+		if (cl_lock_descr_match(&scan->cill_descr, need)) {
+			found = +1;
+			break;
+		}
+	}
+	RETURN(found);
+}
+EXPORT_SYMBOL(cl_queue_match);
+
+/*
+ * Look in \a queue for the first lock link for which cl_lock_descr_cmp()
+ * against \a need returns 0, and merge \a need into its description.
+ *
+ * Returns +1 when a link was found and merged into, 0 otherwise.
+ */
+static int cl_queue_merge(const struct list_head *queue,
+			  const struct cl_lock_descr *need)
+{
+	struct cl_io_lock_link *scan;
+	int merged = 0;
+
+	ENTRY;
+	list_for_each_entry(scan, queue, cill_linkage) {
+		if (cl_lock_descr_cmp(&scan->cill_descr, need) == 0) {
+			cl_lock_descr_merge(&scan->cill_descr, need);
+			CDEBUG(D_VFSTRACE, "lock: %d: [%lu, %lu]\n",
+			       scan->cill_descr.cld_mode,
+			       scan->cill_descr.cld_start,
+			       scan->cill_descr.cld_end);
+			merged = +1;
+			break;
+		}
+	}
+	RETURN(merged);
+}
+
+/*
+ * True when \a need is matched by a lock on the cls_curr list or on the
+ * cls_done list of \a set. The cls_todo list is not consulted.
+ */
+static int cl_lockset_match(const struct cl_lockset *set,
+			    const struct cl_lock_descr *need)
+{
+	if (cl_queue_match(&set->cls_curr, need))
+		return 1;
+	return cl_queue_match(&set->cls_done, need) != 0;
+}
+
+/*
+ * Try to fold \a need into a lock that is still on the cls_todo list of
+ * \a set; failing that, report whether the current or done locks already
+ * match it. Returns non-zero when \a need needs no separate lock.
+ */
+static int cl_lockset_merge(const struct cl_lockset *set,
+			    const struct cl_lock_descr *need)
+{
+	if (cl_queue_merge(&set->cls_todo, need))
+		return 1;
+	return cl_lockset_match(set, need) != 0;
+}
+
+/*
+ * Request the lock described by \a link on behalf of \a io.
+ *
+ * On a successful request the link moves to the cls_curr list. Unless the
+ * description carries CEF_ASYNC, the lock is then waited for, and the link
+ * moves on to cls_done if the wait succeeds.
+ *
+ * Returns 0 on success, otherwise the error from the request or the wait.
+ */
+static int cl_lockset_lock_one(const struct lu_env *env,
+			       struct cl_io *io, struct cl_lockset *set,
+			       struct cl_io_lock_link *link)
+{
+	struct cl_lock *lock;
+	int	     result;
+
+	ENTRY;
+
+	lock = cl_lock_request(env, io, &link->cill_descr, "io", io);
+	if (IS_ERR(lock)) {
+		result = PTR_ERR(lock);
+		RETURN(result);
+	}
+
+	link->cill_lock = lock;
+	list_move(&link->cill_linkage, &set->cls_curr);
+
+	if (link->cill_descr.cld_enq_flags & CEF_ASYNC) {
+		/* asynchronous enqueue: this function does not wait */
+		result = 0;
+	} else {
+		result = cl_wait(env, lock);
+		if (result == 0)
+			list_move(&link->cill_linkage,
+				      &set->cls_done);
+	}
+	RETURN(result);
+}
+
+/*
+ * Dispose of a lock link: take it off whatever list it is on, release the
+ * lock attached to it (if any) and finally call the link's own cill_fini()
+ * method (if any).
+ */
+static void cl_lock_link_fini(const struct lu_env *env, struct cl_io *io,
+			      struct cl_io_lock_link *link)
+{
+	ENTRY;
+	list_del_init(&link->cill_linkage);
+
+	if (link->cill_lock != NULL) {
+		struct cl_lock *held = link->cill_lock;
+
+		cl_lock_release(env, held, "io", io);
+		link->cill_lock = NULL;
+	}
+
+	/* kept last: the callback used in this file frees the link */
+	if (link->cill_fini != NULL)
+		link->cill_fini(env, link);
+	EXIT;
+}
+
+/*
+ * Acquire all locks queued on the cls_todo list of \a set.
+ *
+ * Links already matched by a current or done lock are finalized instead of
+ * being requested. Once every remaining link has been requested without
+ * error, the locks still on cls_curr are waited for and moved to cls_done.
+ *
+ * Returns 0 on success or the first error encountered.
+ */
+static int cl_lockset_lock(const struct lu_env *env, struct cl_io *io,
+			   struct cl_lockset *set)
+{
+	struct cl_io_lock_link *link;
+	struct cl_io_lock_link *next;
+	int result = 0;
+
+	ENTRY;
+	list_for_each_entry_safe(link, next, &set->cls_todo, cill_linkage) {
+		if (cl_lockset_match(set, &link->cill_descr)) {
+			cl_lock_link_fini(env, io, link);
+			continue;
+		}
+		/* XXX some locking to guarantee that locks aren't
+		 * expanded in between. */
+		result = cl_lockset_lock_one(env, io, set, link);
+		if (result != 0)
+			break;
+	}
+	if (result != 0)
+		RETURN(result);
+
+	list_for_each_entry_safe(link, next, &set->cls_curr, cill_linkage) {
+		result = cl_wait(env, link->cill_lock);
+		if (result != 0)
+			break;
+		list_move(&link->cill_linkage, &set->cls_done);
+	}
+	RETURN(result);
+}
+
+/**
+ * Takes locks necessary for the current iteration of io.
+ *
+ * First every layer gets to queue the locks it needs through
+ * cl_io_operations::cio_lock(), called top-to-bottom. The queued locks are
+ * then sorted (to avoid dead-locks) and acquired. On any failure
+ * cl_io_unlock() is called; on success the io enters CIS_LOCKED state.
+ */
+int cl_io_lock(const struct lu_env *env, struct cl_io *io)
+{
+	const struct cl_io_slice *scan;
+	int result = 0;
+
+	LINVRNT(cl_io_is_loopable(io));
+	LINVRNT(io->ci_state == CIS_IT_STARTED);
+	LINVRNT(cl_io_invariant(io));
+
+	ENTRY;
+	cl_io_for_each(scan, io) {
+		if (scan->cis_iop->op[io->ci_type].cio_lock != NULL) {
+			result = scan->cis_iop->op[io->ci_type].cio_lock(env,
+									 scan);
+			if (result != 0)
+				break;
+		}
+	}
+
+	if (result == 0) {
+		cl_io_locks_sort(io);
+		result = cl_lockset_lock(env, io, &io->ci_lockset);
+	}
+
+	if (result == 0)
+		io->ci_state = CIS_LOCKED;
+	else
+		cl_io_unlock(env, io);
+	RETURN(result);
+}
+EXPORT_SYMBOL(cl_io_lock);
+
+/**
+ * Release locks taken by io.
+ *
+ * Finalizes every link on the todo, current and done lists of the lock set
+ * (locks on the done list get cl_unuse() first), then calls
+ * cl_io_operations::cio_unlock() on the layers in reverse order and puts
+ * the io into CIS_UNLOCKED state.
+ */
+void cl_io_unlock(const struct lu_env *env, struct cl_io *io)
+{
+	struct cl_lockset	*set = &io->ci_lockset;
+	struct cl_io_lock_link   *link;
+	struct cl_io_lock_link   *next;
+	const struct cl_io_slice *scan;
+
+	LASSERT(cl_io_is_loopable(io));
+	LASSERT(CIS_IT_STARTED <= io->ci_state && io->ci_state < CIS_UNLOCKED);
+	LINVRNT(cl_io_invariant(io));
+
+	ENTRY;
+
+	/* locks that were never requested */
+	list_for_each_entry_safe(link, next, &set->cls_todo, cill_linkage) {
+		cl_lock_link_fini(env, io, link);
+	}
+
+	/* locks that were requested but not moved to the done list */
+	list_for_each_entry_safe(link, next, &set->cls_curr, cill_linkage) {
+		cl_lock_link_fini(env, io, link);
+	}
+
+	/* locks on the done list */
+	list_for_each_entry_safe(link, next, &set->cls_done, cill_linkage) {
+		cl_unuse(env, link->cill_lock);
+		cl_lock_link_fini(env, io, link);
+	}
+
+	cl_io_for_each_reverse(scan, io) {
+		if (scan->cis_iop->op[io->ci_type].cio_unlock == NULL)
+			continue;
+		scan->cis_iop->op[io->ci_type].cio_unlock(env, scan);
+	}
+
+	io->ci_state = CIS_UNLOCKED;
+	LASSERT(!cl_env_info(env)->clt_counters[CNL_TOP].ctc_nr_locks_acquired);
+	EXIT;
+}
+EXPORT_SYMBOL(cl_io_unlock);
+
+/**
+ * Prepares next iteration of io.
+ *
+ * Calls cl_io_operations::cio_iter_init() top-to-bottom, stopping at the
+ * first layer that returns non-zero. This gives layers a chance to modify
+ * io parameters, e.g., so that lov can restrict io to a single stripe. On
+ * success the io enters CIS_IT_STARTED state.
+ */
+int cl_io_iter_init(const struct lu_env *env, struct cl_io *io)
+{
+	const struct cl_io_slice *scan;
+	int result = 0;
+
+	LINVRNT(cl_io_is_loopable(io));
+	LINVRNT(io->ci_state == CIS_INIT || io->ci_state == CIS_IT_ENDED);
+	LINVRNT(cl_io_invariant(io));
+
+	ENTRY;
+	cl_io_for_each(scan, io) {
+		if (scan->cis_iop->op[io->ci_type].cio_iter_init != NULL) {
+			result = scan->cis_iop->op[io->ci_type].cio_iter_init(
+								env, scan);
+			if (result != 0)
+				break;
+		}
+	}
+
+	if (result == 0)
+		io->ci_state = CIS_IT_STARTED;
+	RETURN(result);
+}
+EXPORT_SYMBOL(cl_io_iter_init);
+
+/**
+ * Finalizes io iteration.
+ *
+ * Calls cl_io_operations::cio_iter_fini() bottom-to-top and puts the io
+ * into CIS_IT_ENDED state.
+ */
+void cl_io_iter_fini(const struct lu_env *env, struct cl_io *io)
+{
+	const struct cl_io_slice *scan;
+
+	LINVRNT(cl_io_is_loopable(io));
+	LINVRNT(io->ci_state == CIS_UNLOCKED);
+	LINVRNT(cl_io_invariant(io));
+
+	ENTRY;
+	cl_io_for_each_reverse(scan, io) {
+		if (scan->cis_iop->op[io->ci_type].cio_iter_fini == NULL)
+			continue;
+		scan->cis_iop->op[io->ci_type].cio_iter_fini(env, scan);
+	}
+	io->ci_state = CIS_IT_ENDED;
+	EXIT;
+}
+EXPORT_SYMBOL(cl_io_iter_fini);
+
+/**
+ * Records that read or write io progressed \a nob bytes forward: the
+ * position grows and the remaining count shrinks by \a nob, then every
+ * layer providing cl_io_operations::cio_advance() is told, bottom-to-top.
+ */
+void cl_io_rw_advance(const struct lu_env *env, struct cl_io *io, size_t nob)
+{
+	const struct cl_io_slice *scan;
+
+	LINVRNT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE ||
+		nob == 0);
+	LINVRNT(cl_io_is_loopable(io));
+	LINVRNT(cl_io_invariant(io));
+
+	ENTRY;
+
+	io->u.ci_rw.crw_count -= nob;
+	io->u.ci_rw.crw_pos   += nob;
+
+	/* layers have to be notified. */
+	cl_io_for_each_reverse(scan, io) {
+		if (scan->cis_iop->op[io->ci_type].cio_advance == NULL)
+			continue;
+		scan->cis_iop->op[io->ci_type].cio_advance(env, scan, nob);
+	}
+	EXIT;
+}
+EXPORT_SYMBOL(cl_io_rw_advance);
+
+/**
+ * Adds a lock to a lockset.
+ *
+ * Returns +1, without queueing \a link, when the described lock could be
+ * merged into or is matched by a lock already in the lock set of \a io.
+ * Otherwise \a link is put on the todo list and 0 is returned.
+ */
+int cl_io_lock_add(const struct lu_env *env, struct cl_io *io,
+		   struct cl_io_lock_link *link)
+{
+	ENTRY;
+	if (cl_lockset_merge(&io->ci_lockset, &link->cill_descr))
+		RETURN(+1);
+
+	list_add(&link->cill_linkage, &io->ci_lockset.cls_todo);
+	RETURN(0);
+}
+EXPORT_SYMBOL(cl_io_lock_add);
+
+/* cill_fini() callback for links allocated by cl_io_lock_alloc_add(). */
+static void cl_free_io_lock_link(const struct lu_env *env,
+				 struct cl_io_lock_link *link)
+{
+	OBD_FREE_PTR(link);
+}
+
+/**
+ * Allocates new lock link, and uses it to add a lock to a lockset.
+ *
+ * The link carries a copy of \a descr. When cl_io_lock_add() reports that
+ * the lock was merged or matched, the fresh link is freed again at once.
+ *
+ * Returns the result of cl_io_lock_add(), or -ENOMEM when the link cannot
+ * be allocated.
+ */
+int cl_io_lock_alloc_add(const struct lu_env *env, struct cl_io *io,
+			 struct cl_lock_descr *descr)
+{
+	struct cl_io_lock_link *link;
+	int result;
+
+	ENTRY;
+	OBD_ALLOC_PTR(link);
+	if (link == NULL) {
+		result = -ENOMEM;
+		RETURN(result);
+	}
+
+	link->cill_fini      = cl_free_io_lock_link;
+	link->cill_descr     = *descr;
+
+	result = cl_io_lock_add(env, io, link);
+	if (result != 0) /* lock match */
+		link->cill_fini(env, link);
+
+	RETURN(result);
+}
+EXPORT_SYMBOL(cl_io_lock_alloc_add);
+
+/**
+ * Starts io by calling cl_io_operations::cio_start() top-to-bottom,
+ * stopping at the first layer that returns non-zero.
+ *
+ * The io is put into CIS_IO_GOING state before any layer is called.
+ * A positive result from a layer is reported to the caller as 0.
+ */
+int cl_io_start(const struct lu_env *env, struct cl_io *io)
+{
+	const struct cl_io_slice *scan;
+	int result = 0;
+
+	LINVRNT(cl_io_is_loopable(io));
+	LINVRNT(io->ci_state == CIS_LOCKED);
+	LINVRNT(cl_io_invariant(io));
+	ENTRY;
+
+	io->ci_state = CIS_IO_GOING;
+	cl_io_for_each(scan, io) {
+		if (scan->cis_iop->op[io->ci_type].cio_start != NULL) {
+			result = scan->cis_iop->op[io->ci_type].cio_start(env,
+									  scan);
+			if (result != 0)
+				break;
+		}
+	}
+
+	if (result > 0)
+		result = 0;
+	RETURN(result);
+}
+EXPORT_SYMBOL(cl_io_start);
+
+/**
+ * Wait until current io iteration is finished by calling
+ * cl_io_operations::cio_end() bottom-to-top.
+ *
+ * Layers without a cio_end() method for the io type are skipped. After all
+ * layers have been visited the io is moved to CIS_IO_FINISHED.
+ */
+void cl_io_end(const struct lu_env *env, struct cl_io *io)
+{
+	const struct cl_io_slice *scan;
+
+	LINVRNT(cl_io_is_loopable(io));
+	LINVRNT(io->ci_state == CIS_IO_GOING);
+	LINVRNT(cl_io_invariant(io));
+	ENTRY;
+
+	cl_io_for_each_reverse(scan, io) {
+		if (scan->cis_iop->op[io->ci_type].cio_end != NULL)
+			scan->cis_iop->op[io->ci_type].cio_end(env, scan);
+		/* TODO: error handling. */
+	}
+	io->ci_state = CIS_IO_FINISHED;
+	EXIT;
+}
+EXPORT_SYMBOL(cl_io_end);
+
+static const struct cl_page_slice *
+cl_io_slice_page(const struct cl_io_slice *ios, struct cl_page *page)
+{	/* slice of \a page for the device type of the object behind \a ios */
+	const struct cl_page_slice *slice;
+
+	slice = cl_page_at(page, ios->cis_obj->co_lu.lo_dev->ld_type);
+	LINVRNT(slice != NULL);
+	return slice;
+}
+
+/**
+ * True iff \a page is within \a io range.
+ *
+ * For CIT_READ and CIT_WRITE the byte extent of the page is tested against
+ * [crw_pos, crw_pos + crw_count); an append io accepts any page. For
+ * CIT_FAULT the page index has to be equal to ft_index. Any other io type
+ * ends in LBUG().
+ */
+static int cl_page_in_io(const struct cl_page *page, const struct cl_io *io)
+{
+	int     result = 1;
+	loff_t  start;
+	loff_t  end;
+	pgoff_t idx;
+
+	idx = page->cp_index;
+	switch (io->ci_type) {
+	case CIT_READ:
+	case CIT_WRITE:
+		/*
+		 * check that [start, end) and [pos, pos + count) extents
+		 * overlap.
+		 */
+		if (!cl_io_is_append(io)) {
+			const struct cl_io_rw_common *crw = &(io->u.ci_rw);
+			start = cl_offset(page->cp_obj, idx);
+			end   = cl_offset(page->cp_obj, idx + 1);
+			result = crw->crw_pos < end &&
+				 start < crw->crw_pos + crw->crw_count;
+		}
+		break;
+	case CIT_FAULT:
+		result = io->u.ci_fault.ft_index == idx;
+		break;
+	default:
+		LBUG();
+	}
+	return result;
+}
+
+/**
+ * Called by read io, when page has to be read from the server.
+ *
+ * The io's own ci_queue is (re-)initialized, every layer's cio_read_page()
+ * is called top-to-bottom until one returns non-zero, and on success the
+ * queue is submitted with cl_io_submit_rw(CRT_READ). Pages still left in
+ * c2_qin afterwards are disowned and the queue is finalized.
+ *
+ * \retval 0 on success, otherwise the first non-zero value returned by a
+ *	   layer or by cl_io_submit_rw().
+ *
+ * \see cl_io_operations::cio_read_page()
+ */
+int cl_io_read_page(const struct lu_env *env, struct cl_io *io,
+		    struct cl_page *page)
+{
+	const struct cl_io_slice *scan;
+	struct cl_2queue	 *queue;
+	int		       result = 0;
+
+	LINVRNT(io->ci_type == CIT_READ || io->ci_type == CIT_FAULT);
+	LINVRNT(cl_page_is_owned(page, io));
+	LINVRNT(io->ci_state == CIS_IO_GOING || io->ci_state == CIS_LOCKED);
+	LINVRNT(cl_page_in_io(page, io));
+	LINVRNT(cl_io_invariant(io));
+	ENTRY;
+
+	queue = &io->ci_queue;
+
+	cl_2queue_init(queue);
+	/*
+	 * ->cio_read_page() methods called in the loop below are supposed to
+	 * never block waiting for network (the only subtle point is the
+	 * creation of new pages for read-ahead that might result in cache
+	 * shrinking, but currently only clean pages are shrunk and this
+	 * requires no network io).
+	 *
+	 * Should this ever start blocking, a retry loop would be needed for
+	 * "parallel io" (see CLO_REPEAT loops in cl_lock.c).
+	 */
+	cl_io_for_each(scan, io) {
+		if (scan->cis_iop->cio_read_page != NULL) {
+			const struct cl_page_slice *slice;
+
+			slice = cl_io_slice_page(scan, page);
+			LINVRNT(slice != NULL);
+			result = scan->cis_iop->cio_read_page(env, scan, slice);
+			if (result != 0)
+				break;
+		}
+	}
+	if (result == 0)
+		result = cl_io_submit_rw(env, io, CRT_READ, queue);
+	/*
+	 * Unlock unsent pages in case of error.
+	 */
+	cl_page_list_disown(env, io, &queue->c2_qin);
+	cl_2queue_fini(env, queue);
+	RETURN(result);
+}
+EXPORT_SYMBOL(cl_io_read_page);
+
+/**
+ * Called by write io to prepare page to receive data from user buffer.
+ *
+ * Layers are visited bottom-to-top; \a from and \a to are handed to every
+ * cio_prepare_write() method unchanged. The walk stops at the first layer
+ * returning non-zero and that value is returned; 0 means every layer
+ * succeeded.
+ *
+ * \see cl_io_operations::cio_prepare_write()
+ */
+int cl_io_prepare_write(const struct lu_env *env, struct cl_io *io,
+			struct cl_page *page, unsigned from, unsigned to)
+{
+	const struct cl_io_slice *scan;
+	int result = 0;
+
+	LINVRNT(io->ci_type == CIT_WRITE);
+	LINVRNT(cl_page_is_owned(page, io));
+	LINVRNT(io->ci_state == CIS_IO_GOING || io->ci_state == CIS_LOCKED);
+	LINVRNT(cl_io_invariant(io));
+	LASSERT(cl_page_in_io(page, io));
+	ENTRY;
+
+	cl_io_for_each_reverse(scan, io) {
+		if (scan->cis_iop->cio_prepare_write != NULL) {
+			const struct cl_page_slice *slice;
+
+			slice = cl_io_slice_page(scan, page);
+			result = scan->cis_iop->cio_prepare_write(env, scan,
+								  slice,
+								  from, to);
+			if (result != 0)
+				break;
+		}
+	}
+	RETURN(result);
+}
+EXPORT_SYMBOL(cl_io_prepare_write);
+
+/**
+ * Called by write io after user data were copied into a page.
+ *
+ * Layers are visited top-to-bottom; \a from and \a to are handed to every
+ * cio_commit_write() method unchanged. The walk stops at the first layer
+ * returning non-zero. The result is expected to be 0 or negative.
+ *
+ * \see cl_io_operations::cio_commit_write()
+ */
+int cl_io_commit_write(const struct lu_env *env, struct cl_io *io,
+		       struct cl_page *page, unsigned from, unsigned to)
+{
+	const struct cl_io_slice *scan;
+	int result = 0;
+
+	LINVRNT(io->ci_type == CIT_WRITE);
+	LINVRNT(io->ci_state == CIS_IO_GOING || io->ci_state == CIS_LOCKED);
+	LINVRNT(cl_io_invariant(io));
+	/*
+	 * XXX Uh... not nice. Top level cl_io_commit_write() call (vvp->lov)
+	 * already called cl_page_cache_add(), moving page into CPS_CACHED
+	 * state. Better (and more general) way of dealing with such situation
+	 * is needed.
+	 */
+	LASSERT(cl_page_is_owned(page, io) || page->cp_parent != NULL);
+	LASSERT(cl_page_in_io(page, io));
+	ENTRY;
+
+	cl_io_for_each(scan, io) {
+		if (scan->cis_iop->cio_commit_write != NULL) {
+			const struct cl_page_slice *slice;
+
+			slice = cl_io_slice_page(scan, page);
+			result = scan->cis_iop->cio_commit_write(env, scan,
+								 slice,
+								 from, to);
+			if (result != 0)
+				break;
+		}
+	}
+	LINVRNT(result <= 0);
+	RETURN(result);
+}
+EXPORT_SYMBOL(cl_io_commit_write);
+
+/**
+ * Submits a list of pages for immediate io.
+ *
+ * After the function returns, the submitted pages have been moved to the
+ * queue->c2_qout queue, and queue->c2_qin contains both the pages that did
+ * not need to be submitted and the pages that failed to be submitted.
+ *
+ * Layers are visited top-to-bottom and the walk stops at the first
+ * cio_submit() method returning non-zero; in that case c2_qout is asserted
+ * to be empty.
+ *
+ * \returns 0 if at least one page was submitted, error code otherwise.
+ * \see cl_io_operations::cio_submit()
+ */
+int cl_io_submit_rw(const struct lu_env *env, struct cl_io *io,
+		    enum cl_req_type crt, struct cl_2queue *queue)
+{
+	const struct cl_io_slice *scan;
+	int result = 0;
+
+	LINVRNT(crt < ARRAY_SIZE(scan->cis_iop->req_op));
+	ENTRY;
+
+	cl_io_for_each(scan, io) {
+		if (scan->cis_iop->req_op[crt].cio_submit == NULL)
+			continue;
+		result = scan->cis_iop->req_op[crt].cio_submit(env, scan, crt,
+							       queue);
+		if (result != 0)
+			break;
+	}
+	/*
+	 * If ->cio_submit() failed, no pages were sent.
+	 */
+	LASSERT(ergo(result != 0, list_empty(&queue->c2_qout.pl_pages)));
+	RETURN(result);
+}
+EXPORT_SYMBOL(cl_io_submit_rw);
+
+/**
+ * Submit a sync_io and wait for the IO to be finished, or error happens.
+ * If \a timeout is zero, it means to wait for the IO unconditionally.
+ *
+ * Every page of queue->c2_qin is attached to the per-environment sync
+ * anchor before submission. The return value is the submission error, or
+ * the result of cl_sync_io_wait() when submission succeeded.
+ */
+int cl_io_submit_sync(const struct lu_env *env, struct cl_io *io,
+		      enum cl_req_type iot, struct cl_2queue *queue,
+		      long timeout)
+{
+	struct cl_sync_io *anchor = &cl_env_info(env)->clt_anchor;
+	struct cl_page *pg;
+	int rc;
+
+	cl_page_list_for_each(pg, &queue->c2_qin) {
+		LASSERT(pg->cp_sync_io == NULL);
+		pg->cp_sync_io = anchor;
+	}
+
+	cl_sync_io_init(anchor, queue->c2_qin.pl_nr);
+	rc = cl_io_submit_rw(env, io, iot, queue);
+	if (rc != 0) {
+		/* Nothing was sent: just detach the pages from the anchor. */
+		LASSERT(list_empty(&queue->c2_qout.pl_pages));
+		cl_page_list_for_each(pg, &queue->c2_qin)
+			pg->cp_sync_io = NULL;
+		return rc;
+	}
+
+	/*
+	 * If some pages weren't sent for any reason (e.g.,
+	 * read found up-to-date pages in the cache, or write found
+	 * clean pages), count them as completed to avoid infinite
+	 * wait.
+	 */
+	cl_page_list_for_each(pg, &queue->c2_qin) {
+		pg->cp_sync_io = NULL;
+		cl_sync_io_note(anchor, +1);
+	}
+
+	/* wait for the IO to be finished. */
+	return cl_sync_io_wait(env, io, &queue->c2_qout, anchor, timeout);
+}
+EXPORT_SYMBOL(cl_io_submit_sync);
+
+/**
+ * Cancel an IO which has been submitted by cl_io_submit_rw.
+ *
+ * Calls cl_page_cancel() on every page of \a queue; a failure on one page
+ * does not stop the walk.
+ *
+ * \retval 0 if every page was cancelled successfully, otherwise the first
+ *	   non-zero value returned by cl_page_cancel().
+ */
+int cl_io_cancel(const struct lu_env *env, struct cl_io *io,
+		 struct cl_page_list *queue)
+{
+	struct cl_page *page;
+	int result = 0;
+
+	CERROR("Canceling ongoing page transmission\n");
+	cl_page_list_for_each(page, queue) {
+		int rc;
+
+		LINVRNT(cl_page_in_io(page, io));
+		rc = cl_page_cancel(env, page);
+		/* remember the first failure only */
+		result = result ?: rc;
+	}
+	return result;
+}
+EXPORT_SYMBOL(cl_io_cancel);
+
+/**
+ * Main io loop.
+ *
+ * Pumps io through iterations calling
+ *
+ *    - cl_io_iter_init()
+ *
+ *    - cl_io_lock()
+ *
+ *    - cl_io_start()
+ *
+ *    - cl_io_end()
+ *
+ *    - cl_io_unlock()
+ *
+ *    - cl_io_iter_fini()
+ *
+ * repeatedly until there is no more io to do.
+ *
+ * An iteration is repeated while its result is 0 and a layer has set
+ * cl_io::ci_continue. After each locked iteration the io is advanced by the
+ * growth of cl_io::ci_nob during that iteration.
+ *
+ * \retval negative error code, or 0; when the loop itself ends with 0 the
+ *	   value of cl_io::ci_result is used, and positive values are
+ *	   reported as 0.
+ */
+int cl_io_loop(const struct lu_env *env, struct cl_io *io)
+{
+	int result   = 0;
+
+	LINVRNT(cl_io_is_loopable(io));
+	ENTRY;
+
+	do {
+		size_t nob;
+
+		io->ci_continue = 0;
+		result = cl_io_iter_init(env, io);
+		if (result == 0) {
+			nob    = io->ci_nob;
+			result = cl_io_lock(env, io);
+			if (result == 0) {
+				/*
+				 * Notify layers that locks have been taken,
+				 * and do actual i/o.
+				 *
+				 *   - llite: kms, short read;
+				 *   - llite: generic_file_read();
+				 */
+				result = cl_io_start(env, io);
+				/*
+				 * Send any remaining pending
+				 * io, etc.
+				 *
+				 *   - llite: ll_rw_stats_tally.
+				 */
+				cl_io_end(env, io);
+				cl_io_unlock(env, io);
+				cl_io_rw_advance(env, io, io->ci_nob - nob);
+			}
+		}
+		cl_io_iter_fini(env, io);
+	} while (result == 0 && io->ci_continue);
+	if (result == 0)
+		result = io->ci_result;
+	RETURN(result < 0 ? result : 0);
+}
+EXPORT_SYMBOL(cl_io_loop);
+
+/**
+ * Adds io slice to the cl_io.
+ *
+ * This is called by cl_object_operations::coo_io_init() methods to add a
+ * per-layer state to the io. New state is added at the end of
+ * cl_io::ci_layers list, that is, it is at the bottom of the stack.
+ *
+ * The slice must not be linked anywhere yet: its linkage has to be either
+ * all-NULL or an empty list head. The slice records \a io, \a obj and
+ * \a ops.
+ *
+ * \see cl_lock_slice_add(), cl_req_slice_add(), cl_page_slice_add()
+ */
+void cl_io_slice_add(struct cl_io *io, struct cl_io_slice *slice,
+		     struct cl_object *obj,
+		     const struct cl_io_operations *ops)
+{
+	struct list_head *linkage = &slice->cis_linkage;
+
+	LASSERT((linkage->prev == NULL && linkage->next == NULL) ||
+		list_empty(linkage));
+	ENTRY;
+
+	list_add_tail(linkage, &io->ci_layers);
+	slice->cis_io  = io;
+	slice->cis_obj = obj;
+	slice->cis_iop = ops;
+	EXIT;
+}
+EXPORT_SYMBOL(cl_io_slice_add);
+
+
+/**
+ * Initializes page list.
+ *
+ * The list starts empty with a zero page count; the calling task is
+ * recorded as its owner (pl_owner).
+ */
+void cl_page_list_init(struct cl_page_list *plist)
+{
+	ENTRY;
+	plist->pl_nr = 0;
+	INIT_LIST_HEAD(&plist->pl_pages);
+	plist->pl_owner = current;
+	EXIT;
+}
+EXPORT_SYMBOL(cl_page_list_init);
+
+/**
+ * Adds a page to a page list.
+ *
+ * The page must be owned and must not be on another list (cp_batch empty).
+ * Its cp_mutex is taken here, with lockdep switched off around the
+ * acquisition, and is not released by this function; cl_page_list_del() and
+ * cl_page_list_disown() release it. A "queue" lu_ref and a page reference
+ * are taken on behalf of the list.
+ */
+void cl_page_list_add(struct cl_page_list *plist, struct cl_page *page)
+{
+	ENTRY;
+	/* it would be better to check that page is owned by "current" io, but
+	 * it is not passed here. */
+	LASSERT(page->cp_owner != NULL);
+	LINVRNT(plist->pl_owner == current);
+
+	lockdep_off();
+	mutex_lock(&page->cp_mutex);
+	lockdep_on();
+	LASSERT(list_empty(&page->cp_batch));
+	list_add_tail(&page->cp_batch, &plist->pl_pages);
+	++plist->pl_nr;
+	page->cp_queue_ref = lu_ref_add(&page->cp_reference, "queue", plist);
+	cl_page_get(page);
+	EXIT;
+}
+EXPORT_SYMBOL(cl_page_list_add);
+
+/**
+ * Removes a page from a page list.
+ *
+ * Undoes cl_page_list_add(): the page is unlinked, its cp_mutex is released
+ * (lockdep switched off around the release), the page count is decremented
+ * and the "queue" lu_ref and the page reference are dropped.
+ */
+void cl_page_list_del(const struct lu_env *env,
+		      struct cl_page_list *plist, struct cl_page *page)
+{
+	LASSERT(plist->pl_nr > 0);
+	LINVRNT(plist->pl_owner == current);
+
+	ENTRY;
+	list_del_init(&page->cp_batch);
+	lockdep_off();
+	mutex_unlock(&page->cp_mutex);
+	lockdep_on();
+	--plist->pl_nr;
+	lu_ref_del_at(&page->cp_reference, page->cp_queue_ref, "queue", plist);
+	cl_page_put(env, page);
+	EXIT;
+}
+EXPORT_SYMBOL(cl_page_list_del);
+
+/**
+ * Moves a page from one page list to another.
+ *
+ * The page goes to the tail of \a dst. Only the list linkage, the two page
+ * counters and the scope of the "queue" lu_ref change; cp_mutex and the
+ * page reference are left as they are.
+ */
+void cl_page_list_move(struct cl_page_list *dst, struct cl_page_list *src,
+		       struct cl_page *page)
+{
+	LASSERT(src->pl_nr > 0);
+	LINVRNT(dst->pl_owner == current);
+	LINVRNT(src->pl_owner == current);
+
+	ENTRY;
+	list_move_tail(&page->cp_batch, &dst->pl_pages);
+	--src->pl_nr;
+	++dst->pl_nr;
+	lu_ref_set_at(&page->cp_reference,
+		      page->cp_queue_ref, "queue", src, dst);
+	EXIT;
+}
+EXPORT_SYMBOL(cl_page_list_move);
+
+/**
+ * splice the cl_page_list, just as list head does
+ *
+ * Every page of \a list is moved, one at a time and in order, to the tail
+ * of \a head with cl_page_list_move().
+ */
+void cl_page_list_splice(struct cl_page_list *list, struct cl_page_list *head)
+{
+	struct cl_page *page;
+	struct cl_page *tmp;
+
+	LINVRNT(list->pl_owner == current);
+	LINVRNT(head->pl_owner == current);
+
+	ENTRY;
+	cl_page_list_for_each_safe(page, tmp, list)
+		cl_page_list_move(head, list, page);
+	EXIT;
+}
+EXPORT_SYMBOL(cl_page_list_splice);
+
+void cl_page_disown0(const struct lu_env *env,
+		     struct cl_io *io, struct cl_page *pg);
+
+/**
+ * Disowns pages in a queue.
+ *
+ * Each page is unlinked from \a plist, its cp_mutex is released, it is
+ * disowned on behalf of \a io with cl_page_disown0(), and the "queue"
+ * lu_ref and page reference held for the list are dropped.
+ */
+void cl_page_list_disown(const struct lu_env *env,
+			 struct cl_io *io, struct cl_page_list *plist)
+{
+	struct cl_page *page;
+	struct cl_page *temp;
+
+	LINVRNT(plist->pl_owner == current);
+
+	ENTRY;
+	cl_page_list_for_each_safe(page, temp, plist) {
+		LASSERT(plist->pl_nr > 0);
+
+		list_del_init(&page->cp_batch);
+		lockdep_off();
+		mutex_unlock(&page->cp_mutex);
+		lockdep_on();
+		--plist->pl_nr;
+		/*
+		 * cl_page_disown0 rather than usual cl_page_disown() is used,
+		 * because pages are possibly in CPS_FREEING state already due
+		 * to the call to cl_page_list_discard().
+		 */
+		/*
+		 * XXX cl_page_disown0() will fail if page is not locked.
+		 */
+		cl_page_disown0(env, io, page);
+		lu_ref_del(&page->cp_reference, "queue", plist);
+		cl_page_put(env, page);
+	}
+	EXIT;
+}
+EXPORT_SYMBOL(cl_page_list_disown);
+
+/**
+ * Releases pages from queue.
+ *
+ * Every page is removed with cl_page_list_del(); the list is asserted to be
+ * empty afterwards.
+ */
+void cl_page_list_fini(const struct lu_env *env, struct cl_page_list *plist)
+{
+	struct cl_page *page;
+	struct cl_page *temp;
+
+	LINVRNT(plist->pl_owner == current);
+
+	ENTRY;
+	cl_page_list_for_each_safe(page, temp, plist)
+		cl_page_list_del(env, plist, page);
+	LASSERT(plist->pl_nr == 0);
+	EXIT;
+}
+EXPORT_SYMBOL(cl_page_list_fini);
+
+/**
+ * Owns all pages in a queue.
+ *
+ * Pages are expected in non-decreasing cp_index order (asserted). A page
+ * for which cl_page_own() fails is removed from the list.
+ *
+ * \retval the first non-zero cp_error found among the pages that were
+ *	   owned, 0 if there is none.
+ */
+int cl_page_list_own(const struct lu_env *env,
+		     struct cl_io *io, struct cl_page_list *plist)
+{
+	struct cl_page *page;
+	struct cl_page *temp;
+	pgoff_t index = 0;
+	int result;
+
+	LINVRNT(plist->pl_owner == current);
+
+	ENTRY;
+	result = 0;
+	cl_page_list_for_each_safe(page, temp, plist) {
+		LASSERT(index <= page->cp_index);
+		index = page->cp_index;
+		if (cl_page_own(env, io, page) == 0)
+			result = result ?: page->cp_error;
+		else
+			cl_page_list_del(env, plist, page);
+	}
+	RETURN(result);
+}
+EXPORT_SYMBOL(cl_page_list_own);
+
+/**
+ * Assumes all pages in a queue.
+ *
+ * Calls cl_page_assume() for \a io on every page of \a plist; the list
+ * itself is not modified.
+ */
+void cl_page_list_assume(const struct lu_env *env,
+			 struct cl_io *io, struct cl_page_list *plist)
+{
+	struct cl_page *page;
+
+	LINVRNT(plist->pl_owner == current);
+
+	cl_page_list_for_each(page, plist)
+		cl_page_assume(env, io, page);
+}
+EXPORT_SYMBOL(cl_page_list_assume);
+
+/**
+ * Discards all pages in a queue.
+ *
+ * Calls cl_page_discard() for \a io on every page of \a plist; the pages
+ * stay on the list.
+ */
+void cl_page_list_discard(const struct lu_env *env, struct cl_io *io,
+			  struct cl_page_list *plist)
+{
+	struct cl_page *page;
+
+	LINVRNT(plist->pl_owner == current);
+	ENTRY;
+	cl_page_list_for_each(page, plist)
+		cl_page_discard(env, io, page);
+	EXIT;
+}
+EXPORT_SYMBOL(cl_page_list_discard);
+
+/**
+ * Unmaps all pages in a queue from user virtual memory.
+ *
+ * Stops at the first page for which cl_page_unmap() returns non-zero; the
+ * remaining pages are not touched.
+ *
+ * \retval 0 if every page was unmapped, otherwise the first non-zero value
+ *	   returned by cl_page_unmap().
+ */
+int cl_page_list_unmap(const struct lu_env *env, struct cl_io *io,
+			struct cl_page_list *plist)
+{
+	struct cl_page *page;
+	int result;
+
+	LINVRNT(plist->pl_owner == current);
+	ENTRY;
+	result = 0;
+	cl_page_list_for_each(page, plist) {
+		result = cl_page_unmap(env, io, page);
+		if (result != 0)
+			break;
+	}
+	RETURN(result);
+}
+EXPORT_SYMBOL(cl_page_list_unmap);
+
+/**
+ * Initialize dual page queue.
+ *
+ * Both the incoming (c2_qin) and the outgoing (c2_qout) page lists are
+ * initialized as empty lists.
+ */
+void cl_2queue_init(struct cl_2queue *queue)
+{
+	ENTRY;
+	cl_page_list_init(&queue->c2_qin);
+	cl_page_list_init(&queue->c2_qout);
+	EXIT;
+}
+EXPORT_SYMBOL(cl_2queue_init);
+
+/**
+ * Add a page to the incoming page list of 2-queue.
+ *
+ * Thin wrapper around cl_page_list_add() on c2_qin.
+ */
+void cl_2queue_add(struct cl_2queue *queue, struct cl_page *page)
+{
+	ENTRY;
+	cl_page_list_add(&queue->c2_qin, page);
+	EXIT;
+}
+EXPORT_SYMBOL(cl_2queue_add);
+
+/**
+ * Disown pages in both lists of a 2-queue.
+ *
+ * c2_qin is processed first, then c2_qout.
+ */
+void cl_2queue_disown(const struct lu_env *env,
+		      struct cl_io *io, struct cl_2queue *queue)
+{
+	ENTRY;
+	cl_page_list_disown(env, io, &queue->c2_qin);
+	cl_page_list_disown(env, io, &queue->c2_qout);
+	EXIT;
+}
+EXPORT_SYMBOL(cl_2queue_disown);
+
+/**
+ * Discard (truncate) pages in both lists of a 2-queue.
+ *
+ * c2_qin is processed first, then c2_qout.
+ */
+void cl_2queue_discard(const struct lu_env *env,
+		       struct cl_io *io, struct cl_2queue *queue)
+{
+	ENTRY;
+	cl_page_list_discard(env, io, &queue->c2_qin);
+	cl_page_list_discard(env, io, &queue->c2_qout);
+	EXIT;
+}
+EXPORT_SYMBOL(cl_2queue_discard);
+
+/**
+ * Assume to own the pages in cl_2queue
+ *
+ * c2_qin is processed first, then c2_qout.
+ */
+void cl_2queue_assume(const struct lu_env *env,
+		      struct cl_io *io, struct cl_2queue *queue)
+{
+	cl_page_list_assume(env, io, &queue->c2_qin);
+	cl_page_list_assume(env, io, &queue->c2_qout);
+}
+EXPORT_SYMBOL(cl_2queue_assume);
+
+/**
+ * Finalize both page lists of a 2-queue.
+ *
+ * c2_qout is released first, then c2_qin.
+ */
+void cl_2queue_fini(const struct lu_env *env, struct cl_2queue *queue)
+{
+	ENTRY;
+	cl_page_list_fini(env, &queue->c2_qout);
+	cl_page_list_fini(env, &queue->c2_qin);
+	EXIT;
+}
+EXPORT_SYMBOL(cl_2queue_fini);
+
+/**
+ * Initialize a 2-queue to contain \a page in its incoming page list.
+ *
+ * Equivalent to cl_2queue_init() followed by cl_2queue_add().
+ */
+void cl_2queue_init_page(struct cl_2queue *queue, struct cl_page *page)
+{
+	ENTRY;
+	cl_2queue_init(queue);
+	cl_2queue_add(queue, page);
+	EXIT;
+}
+EXPORT_SYMBOL(cl_2queue_init_page);
+
+/**
+ * Returns top-level io.
+ *
+ * Follows cl_io::ci_parent links until an io without a parent is reached;
+ * an io that has no parent is returned as is.
+ *
+ * \see cl_object_top(), cl_page_top().
+ */
+struct cl_io *cl_io_top(struct cl_io *io)
+{
+	ENTRY;
+	while (io->ci_parent != NULL)
+		io = io->ci_parent;
+	RETURN(io);
+}
+EXPORT_SYMBOL(cl_io_top);
+
+/**
+ * Meant to print a human readable representation of \a io through
+ * \a printer, with \a cookie as its opaque argument.
+ *
+ * Currently an empty stub: nothing is printed and no argument is used.
+ */
+void cl_io_print(const struct lu_env *env, void *cookie,
+		 lu_printer_t printer, const struct cl_io *io)
+{
+}
+
+/**
+ * Adds request slice to the compound request.
+ *
+ * This is called by cl_device_operations::cdo_req_init() methods to add a
+ * per-layer state to the request. New state is added at the end of
+ * cl_req::crq_layers list, that is, it is at the bottom of the stack.
+ *
+ * The slice records \a dev, \a ops and \a req.
+ *
+ * \see cl_lock_slice_add(), cl_page_slice_add(), cl_io_slice_add()
+ */
+void cl_req_slice_add(struct cl_req *req, struct cl_req_slice *slice,
+		      struct cl_device *dev,
+		      const struct cl_req_operations *ops)
+{
+	ENTRY;
+	list_add_tail(&slice->crs_linkage, &req->crq_layers);
+	slice->crs_dev = dev;
+	slice->crs_ops = ops;
+	slice->crs_req = req;
+	EXIT;
+}
+EXPORT_SYMBOL(cl_req_slice_add);
+
+static void cl_req_free(const struct lu_env *env, struct cl_req *req)
+{	/* puts the objects recorded in crq_o[], then frees crq_o and \a req */
+	unsigned i;
+
+	LASSERT(list_empty(&req->crq_pages));
+	LASSERT(req->crq_nrpages == 0);
+	LINVRNT(list_empty(&req->crq_layers));
+	LINVRNT(equi(req->crq_nrobjs > 0, req->crq_o != NULL));
+	ENTRY;
+
+	if (req->crq_o != NULL) {
+		for (i = 0; i < req->crq_nrobjs; ++i) {
+			struct cl_object *obj = req->crq_o[i].ro_obj;
+			if (obj != NULL) { /* unused slots stay NULL */
+				lu_object_ref_del_at(&obj->co_lu,
+						     req->crq_o[i].ro_obj_ref,
+						     "cl_req", req);
+				cl_object_put(env, obj);
+			}
+		}
+		OBD_FREE(req->crq_o, req->crq_nrobjs * sizeof req->crq_o[0]);
+	}
+	OBD_FREE_PTR(req);
+	EXIT;
+}
+
+static int cl_req_init(const struct lu_env *env, struct cl_req *req,
+		       struct cl_page *page)
+{	/* runs cdo_req_init() of every layer's device, from the top page down */
+	struct cl_device     *dev;
+	struct cl_page_slice *slice;
+	int result;
+
+	ENTRY;
+	result = 0;
+	page = cl_page_top(page);
+	do {
+		list_for_each_entry(slice, &page->cp_layers, cpl_linkage) {
+			dev = lu2cl_dev(slice->cpl_obj->co_lu.lo_dev);
+			if (dev->cd_ops->cdo_req_init != NULL) {
+				result = dev->cd_ops->cdo_req_init(env,
+								   dev, req);
+				if (result != 0)
+					break;
+			}
+		}
+		page = page->cp_child; /* continue with the sub-page, if any */
+	} while (page != NULL && result == 0);
+	RETURN(result);
+}
+
+/**
+ * Invokes per-request transfer completion call-backs
+ * (cl_req_operations::cro_completion()) bottom-to-top.
+ *
+ * Each slice is unlinked from cl_req::crq_layers before its call-back (if
+ * any) is invoked with \a rc. Once the list is empty the request itself is
+ * released with cl_req_free(), so \a req must not be used afterwards.
+ */
+void cl_req_completion(const struct lu_env *env, struct cl_req *req, int rc)
+{
+	struct cl_req_slice *slice;
+
+	ENTRY;
+	/*
+	 * for the lack of list_for_each_entry_reverse_safe()...
+	 */
+	while (!list_empty(&req->crq_layers)) {
+		slice = list_entry(req->crq_layers.prev,
+				       struct cl_req_slice, crs_linkage);
+		list_del_init(&slice->crs_linkage);
+		if (slice->crs_ops->cro_completion != NULL)
+			slice->crs_ops->cro_completion(env, slice, rc);
+	}
+	cl_req_free(env, req);
+	EXIT;
+}
+EXPORT_SYMBOL(cl_req_completion);
+
+/**
+ * Allocates new transfer request.
+ *
+ * The request gets room for \a nr_objects object slots and is initialized
+ * by the layers of \a page through cl_req_init().
+ *
+ * \retval the new request on success
+ * \retval ERR_PTR(-ENOMEM) if the request or its object array cannot be
+ *	   allocated
+ * \retval ERR_PTR(rc) if layer initialization failed with \a rc; the
+ *	   partially built request has been released in that case
+ */
+struct cl_req *cl_req_alloc(const struct lu_env *env, struct cl_page *page,
+			    enum cl_req_type crt, int nr_objects)
+{
+	struct cl_req *req;
+
+	LINVRNT(nr_objects > 0);
+	ENTRY;
+
+	OBD_ALLOC_PTR(req);
+	if (req != NULL) {
+		int result;
+
+		/*
+		 * Set up the list heads before anything can fail: the error
+		 * path below goes through cl_req_completion() and
+		 * cl_req_free(), both of which walk or check crq_layers and
+		 * crq_pages.
+		 */
+		req->crq_type = crt;
+		INIT_LIST_HEAD(&req->crq_pages);
+		INIT_LIST_HEAD(&req->crq_layers);
+
+		OBD_ALLOC(req->crq_o, nr_objects * sizeof req->crq_o[0]);
+		if (req->crq_o != NULL) {
+			req->crq_nrobjs = nr_objects;
+			result = cl_req_init(env, req, page);
+		} else
+			result = -ENOMEM;
+		if (result != 0) {
+			cl_req_completion(env, req, result);
+			req = ERR_PTR(result);
+		}
+	} else
+		req = ERR_PTR(-ENOMEM);
+	RETURN(req);
+}
+EXPORT_SYMBOL(cl_req_alloc);
+
+/**
+ * Adds a page to a request.
+ *
+ * The top-level page of \a page is linked into cl_req::crq_pages and points
+ * back to \a req through cp_req. The top-level object of the page is
+ * recorded in the first free slot of cl_req::crq_o, unless it is already
+ * recorded there; a reference on the object is taken when a new slot is
+ * used. Running out of slots trips an assertion.
+ */
+void cl_req_page_add(const struct lu_env *env,
+		     struct cl_req *req, struct cl_page *page)
+{
+	struct cl_object  *obj;
+	struct cl_req_obj *rqo;
+	int i;
+
+	ENTRY;
+	page = cl_page_top(page);
+
+	LASSERT(list_empty(&page->cp_flight));
+	LASSERT(page->cp_req == NULL);
+
+	CL_PAGE_DEBUG(D_PAGE, env, page, "req %p, %d, %u\n",
+		      req, req->crq_type, req->crq_nrpages);
+
+	list_add_tail(&page->cp_flight, &req->crq_pages);
+	++req->crq_nrpages;
+	page->cp_req = req;
+	obj = cl_object_top(page->cp_obj);
+	/*
+	 * Bound the search by crq_nrobjs so that a request created with too
+	 * few slots is caught by the assertion below instead of reading
+	 * past the end of crq_o[].
+	 */
+	for (i = 0, rqo = req->crq_o;
+	     i < req->crq_nrobjs && obj != rqo->ro_obj; ++i, ++rqo) {
+		if (rqo->ro_obj == NULL) {
+			rqo->ro_obj = obj;
+			cl_object_get(obj);
+			rqo->ro_obj_ref = lu_object_ref_add(&obj->co_lu,
+							    "cl_req", req);
+			break;
+		}
+	}
+	LASSERT(i < req->crq_nrobjs);
+	EXIT;
+}
+EXPORT_SYMBOL(cl_req_page_add);
+
+/**
+ * Removes a page from a request.
+ *
+ * Operates on the top-level page of \a page: it is unlinked from the
+ * request's page list, the request's page count is decremented and the
+ * page's back pointer to the request is cleared.
+ */
+void cl_req_page_done(const struct lu_env *env, struct cl_page *page)
+{
+	struct cl_req *req;
+
+	ENTRY;
+	/*
+	 * cl_req_page_add() stores the request in the top-level page only,
+	 * so look it up after resolving the top-level page.
+	 */
+	page = cl_page_top(page);
+	req = page->cp_req;
+
+	LASSERT(!list_empty(&page->cp_flight));
+	LASSERT(req != NULL);
+	LASSERT(req->crq_nrpages > 0);
+
+	list_del_init(&page->cp_flight);
+	--req->crq_nrpages;
+	page->cp_req = NULL;
+	EXIT;
+}
+EXPORT_SYMBOL(cl_req_page_done);
+
+/**
+ * Notifies layers that request is about to depart by calling
+ * cl_req_operations::cro_prep() top-to-bottom.
+ *
+ * Every object slot of the request must be filled by now (asserted). The
+ * walk stops at the first cro_prep() method returning non-zero.
+ *
+ * \retval 0 if every layer succeeded, otherwise the first non-zero value
+ *	   returned by a layer.
+ */
+int cl_req_prep(const struct lu_env *env, struct cl_req *req)
+{
+	int i;
+	int result;
+	const struct cl_req_slice *slice;
+
+	ENTRY;
+	/*
+	 * Check that the caller of cl_req_alloc() didn't lie about the number
+	 * of objects.
+	 */
+	for (i = 0; i < req->crq_nrobjs; ++i)
+		LASSERT(req->crq_o[i].ro_obj != NULL);
+
+	result = 0;
+	list_for_each_entry(slice, &req->crq_layers, crs_linkage) {
+		if (slice->crs_ops->cro_prep != NULL) {
+			result = slice->crs_ops->cro_prep(env, slice);
+			if (result != 0)
+				break;
+		}
+	}
+	RETURN(result);
+}
+EXPORT_SYMBOL(cl_req_prep);
+
+/**
+ * Fills in attributes that are passed to server together with transfer. Only
+ * attributes from \a flags may be touched. This can be called multiple times
+ * for the same request.
+ *
+ * \a attr is indexed per object slot: for slot i every layer's
+ * cro_attr_set() method is called with attr + i. The first page of the
+ * request is used to find each layer's object. The request must contain at
+ * least one page (asserted).
+ */
+void cl_req_attr_set(const struct lu_env *env, struct cl_req *req,
+		     struct cl_req_attr *attr, obd_valid flags)
+{
+	const struct cl_req_slice *slice;
+	struct cl_page	    *page;
+	int i;
+
+	LASSERT(!list_empty(&req->crq_pages));
+	ENTRY;
+
+	/* Take any page to use as a model. */
+	page = list_entry(req->crq_pages.next, struct cl_page, cp_flight);
+
+	for (i = 0; i < req->crq_nrobjs; ++i) {
+		list_for_each_entry(slice, &req->crq_layers, crs_linkage) {
+			const struct cl_page_slice *scan;
+			const struct cl_object     *obj;
+
+			scan = cl_page_at(page,
+					  slice->crs_dev->cd_lu_dev.ld_type);
+			LASSERT(scan != NULL);
+			obj = scan->cpl_obj;
+			if (slice->crs_ops->cro_attr_set != NULL)
+				slice->crs_ops->cro_attr_set(env, slice, obj,
+							     attr + i, flags);
+		}
+	}
+	EXIT;
+}
+EXPORT_SYMBOL(cl_req_attr_set);
+
+/* XXX complete(), init_completion(), and wait_for_completion(), until they are
+ * implemented in libcfs. */
+# include <linux/sched.h>
+
+/**
+ * Initialize synchronous io wait anchor, for transfer of \a nrpages pages.
+ *
+ * csi_sync_nr starts at \a nrpages, csi_barrier at 1 when there is at least
+ * one page and 0 otherwise, and csi_sync_rc at 0.
+ */
+void cl_sync_io_init(struct cl_sync_io *anchor, int nrpages)
+{
+	ENTRY;
+	init_waitqueue_head(&anchor->csi_waitq);
+	atomic_set(&anchor->csi_sync_nr, nrpages);
+	atomic_set(&anchor->csi_barrier, nrpages > 0);
+	anchor->csi_sync_rc = 0;
+	EXIT;
+}
+EXPORT_SYMBOL(cl_sync_io_init);
+
+/**
+ * Wait until all transfer completes. Transfer completion routine has to call
+ * cl_sync_io_note() for every page.
+ *
+ * \a timeout is given in seconds and must not be negative. If the wait
+ * fails (negative result), the pages of \a queue are cancelled with
+ * cl_io_cancel() and the function waits again, this time with a zeroed
+ * l_wait_info, until csi_sync_nr drops to zero. Afterwards the pages of
+ * \a queue are assumed by \a io and the anchor is poisoned, so it has to be
+ * re-initialized before it is used again.
+ *
+ * \retval the negative wait error, or csi_sync_rc when the wait succeeded.
+ */
+int cl_sync_io_wait(const struct lu_env *env, struct cl_io *io,
+		    struct cl_page_list *queue, struct cl_sync_io *anchor,
+		    long timeout)
+{
+	struct l_wait_info lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(timeout),
+						  NULL, NULL, NULL);
+	int rc;
+	ENTRY;
+
+	LASSERT(timeout >= 0);
+
+	rc = l_wait_event(anchor->csi_waitq,
+			  atomic_read(&anchor->csi_sync_nr) == 0,
+			  &lwi);
+	if (rc < 0) {
+		CERROR("SYNC IO failed with error: %d, try to cancel "
+		       "%d remaining pages\n",
+		       rc, atomic_read(&anchor->csi_sync_nr));
+
+		(void)cl_io_cancel(env, io, queue);
+
+		lwi = (struct l_wait_info) { 0 };
+		(void)l_wait_event(anchor->csi_waitq,
+				   atomic_read(&anchor->csi_sync_nr) == 0,
+				   &lwi);
+	} else {
+		rc = anchor->csi_sync_rc;
+	}
+	LASSERT(atomic_read(&anchor->csi_sync_nr) == 0);
+	cl_page_list_assume(env, io, queue);
+
+	/* wait until cl_sync_io_note() has done wakeup */
+	while (unlikely(atomic_read(&anchor->csi_barrier) != 0)) {
+		cpu_relax();
+	}
+
+	POISON(anchor, 0x5a, sizeof *anchor);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(cl_sync_io_wait);
+
+/**
+ * Indicate that transfer of a single page completed.
+ *
+ * The first negative \a ioret is remembered in csi_sync_rc. The call that
+ * brings csi_sync_nr down to zero wakes up all waiters and then clears
+ * csi_barrier, which cl_sync_io_wait() spins on before poisoning the
+ * anchor.
+ */
+void cl_sync_io_note(struct cl_sync_io *anchor, int ioret)
+{
+	ENTRY;
+	if (anchor->csi_sync_rc == 0 && ioret < 0)
+		anchor->csi_sync_rc = ioret;
+	/*
+	 * Synchronous IO done without releasing page lock (e.g., as a part of
+	 * ->{prepare,commit}_write(). Completion is used to signal the end of
+	 * IO.
+	 */
+	LASSERT(atomic_read(&anchor->csi_sync_nr) > 0);
+	if (atomic_dec_and_test(&anchor->csi_sync_nr)) {
+		wake_up_all(&anchor->csi_waitq);
+		/* it's safe to nuke or reuse anchor now */
+		atomic_set(&anchor->csi_barrier, 0);
+	}
+	EXIT;
+}
+EXPORT_SYMBOL(cl_sync_io_note);

diff --git a/drivers/staging/lustre/lustre/obdclass/cl_lock.c b/drivers/staging/lustre/lustre/obdclass/cl_lock.c
new file mode 100644
index 0000000..d34e044
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/cl_lock.c

@@ -0,0 +1,2304 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Client Extent Lock.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre_fid.h>
+#include <linux/list.h>
+#include <cl_object.h>
+#include "cl_internal.h"
+
+/** Lockdep class assigned to every cl_lock::cll_guard mutex. */
+static struct lock_class_key cl_lock_guard_class;
+/** Slab cache from which struct cl_lock instances are allocated. */
+static struct kmem_cache *cl_lock_kmem;
+
+/** Slab cache descriptors of this file; the table ends with a NULL entry. */
+static struct lu_kmem_descr cl_lock_caches[] = {
+	{
+		.ckd_name  = "cl_lock_kmem",
+		.ckd_size  = sizeof(struct cl_lock),
+		.ckd_cache = &cl_lock_kmem
+	},
+	{ .ckd_cache = NULL }
+};
+
+/*
+ * Lock statistics hooks. In this file they expand to nothing, so every
+ * CS_LOCK_*() / CS_LOCKSTATE_*() use below is a no-op.
+ */
+#define CS_LOCK_INC(o, item)
+#define CS_LOCK_DEC(o, item)
+#define CS_LOCKSTATE_INC(o, state)
+#define CS_LOCKSTATE_DEC(o, state)
+
+/**
+ * Weak consistency check of the counters of \a lock. The caller is trusted
+ * to either own a reference to \a lock, or to otherwise make sure that
+ * \a lock cannot be freed.
+ *
+ * Checks that a lock in CLS_FREEING state has no holds, that
+ * cll_ref >= cll_holds >= cll_users, and that cll_holds, cll_users and
+ * cll_depth are not negative.
+ *
+ * \retval 1 all of the above is true
+ * \retval 0 otherwise
+ *
+ * \see cl_lock_invariant()
+ */
+static int cl_lock_invariant_trusted(const struct lu_env *env,
+				     const struct cl_lock *lock)
+{
+	if (!ergo(lock->cll_state == CLS_FREEING, lock->cll_holds == 0))
+		return 0;
+	if (atomic_read(&lock->cll_ref) < lock->cll_holds)
+		return 0;
+	if (lock->cll_holds < lock->cll_users)
+		return 0;
+	return lock->cll_holds >= 0 && lock->cll_users >= 0 &&
+	       lock->cll_depth >= 0;
+}
+
+/**
+ * Strong lock invariant: on top of cl_lock_invariant_trusted() the reference
+ * counter of \a lock has to be positive, i.e., somebody owns a reference.
+ *
+ * When the check fails and \a env is not NULL, the lock is dumped through
+ * CL_LOCK_DEBUG() at D_ERROR level.
+ *
+ * \retval 1 invariant holds
+ * \retval 0 invariant is broken
+ *
+ * \see cl_lock_invariant_trusted()
+ */
+static int cl_lock_invariant(const struct lu_env *env,
+			     const struct cl_lock *lock)
+{
+	if (atomic_read(&lock->cll_ref) > 0 &&
+	    cl_lock_invariant_trusted(env, lock))
+		return 1;
+
+	if (env != NULL)
+		CL_LOCK_DEBUG(D_ERROR, env, lock, "invariant broken");
+	return 0;
+}
+
+/**
+ * Returns lock "nesting": 0 for a top-lock and 1 for a sub-lock. The value
+ * is taken from the header of the object the lock is attached to.
+ */
+static enum clt_nesting_level cl_lock_nesting(const struct cl_lock *lock)
+{
+	struct cl_object_header *hdr;
+
+	hdr = cl_object_header(lock->cll_descr.cld_obj);
+	return hdr->coh_nesting;
+}
+
+/**
+ * Returns the per-thread counters, kept in the cl_thread_info of \a env,
+ * that correspond to the nesting level of \a lock.
+ */
+static struct cl_thread_counters *cl_lock_counters(const struct lu_env *env,
+						   const struct cl_lock *lock)
+{
+	struct cl_thread_info *info = cl_env_info(env);
+	enum clt_nesting_level nesting = cl_lock_nesting(lock);
+
+	/* nesting level indexes the counters array; it has to be in range */
+	LASSERT(nesting < ARRAY_SIZE(info->clt_counters));
+	return &info->clt_counters[nesting];
+}
+
+/**
+ * Emits a one-line debug record about \a lock at debug mask \a level.
+ *
+ * The record consists of \a prefix, the lock pointer, its reference count,
+ * guarding thread, mutex depth, state, error, holds, users and flags,
+ * followed by \a env, the nesting level of the lock's object, the number of
+ * cl_lock mutices held through \a env, and the call site (\a func, \a line).
+ */
+static void cl_lock_trace0(int level, const struct lu_env *env,
+			   const char *prefix, const struct cl_lock *lock,
+			   const char *func, const int line)
+{
+	struct cl_object_header *hdr;
+
+	hdr = cl_object_header(lock->cll_descr.cld_obj);
+	CDEBUG(level,
+	       "%s: %p@(%d %p %d %d %d %d %d %lx)"
+	       "(%p/%d/%d) at %s():%d\n",
+	       prefix, lock,
+	       atomic_read(&lock->cll_ref),
+	       lock->cll_guarder,
+	       lock->cll_depth,
+	       lock->cll_state,
+	       lock->cll_error,
+	       lock->cll_holds,
+	       lock->cll_users,
+	       lock->cll_flags,
+	       env, hdr->coh_nesting, cl_lock_nr_mutexed(env),
+	       func, line);
+}
+/** Wrapper around cl_lock_trace0() that supplies the call site. */
+#define cl_lock_trace(level, env, prefix, lock)			 \
+	cl_lock_trace0(level, env, prefix, lock, __FUNCTION__, __LINE__)
+
+/** Return address of the current function, as an unsigned long. */
+#define RETIP ((unsigned long)__builtin_return_address(0))
+
+#ifdef CONFIG_LOCKDEP
+/** Lockdep class key shared by all cl_lock instances. */
+static struct lock_class_key cl_lock_key;
+
+/** Assigns the cl_lock_key class, named "EXT", to \a lock. */
+static void cl_lock_lockdep_init(struct cl_lock *lock)
+{
+	lockdep_set_class_and_name(lock, &cl_lock_key, "EXT");
+}
+
+/**
+ * Counts one more acquired lock in the per-thread counters and reports the
+ * acquisition of \a lock to lockdep. \a enqflags is not used.
+ */
+static void cl_lock_lockdep_acquire(const struct lu_env *env,
+				    struct cl_lock *lock, __u32 enqflags)
+{
+	struct cl_thread_counters *counters = cl_lock_counters(env, lock);
+
+	counters->ctc_nr_locks_acquired++;
+	lock_map_acquire(&lock->dep_map);
+}
+
+/**
+ * Counterpart of cl_lock_lockdep_acquire(): decrements the per-thread
+ * counter and reports the release of \a lock to lockdep.
+ */
+static void cl_lock_lockdep_release(const struct lu_env *env,
+				    struct cl_lock *lock)
+{
+	struct cl_thread_counters *counters = cl_lock_counters(env, lock);
+
+	counters->ctc_nr_locks_acquired--;
+	lock_release(&lock->dep_map, 0, RETIP);
+}
+
+#else /* !CONFIG_LOCKDEP */
+
+/* Without lockdep all three hooks are empty. */
+static void cl_lock_lockdep_init(struct cl_lock *lock)
+{
+}
+
+static void cl_lock_lockdep_acquire(const struct lu_env *env,
+				    struct cl_lock *lock, __u32 enqflags)
+{
+}
+
+static void cl_lock_lockdep_release(const struct lu_env *env,
+				    struct cl_lock *lock)
+{
+}
+
+#endif /* !CONFIG_LOCKDEP */
+
+/**
+ * Adds lock slice to the compound lock.
+ *
+ * This is called by cl_object_operations::coo_lock_init() methods to add a
+ * per-layer state to the lock. New state is added at the end of
+ * cl_lock::cll_layers list, that is, it is at the bottom of the stack.
+ *
+ * \param lock  compound lock the slice becomes a part of
+ * \param slice per-layer state to be linked into \a lock
+ * \param obj   object recorded in cl_lock_slice::cls_obj
+ * \param ops   operation vector recorded in cl_lock_slice::cls_ops
+ *
+ * \see cl_req_slice_add(), cl_page_slice_add(), cl_io_slice_add()
+ */
+void cl_lock_slice_add(struct cl_lock *lock, struct cl_lock_slice *slice,
+		       struct cl_object *obj,
+		       const struct cl_lock_operations *ops)
+{
+	ENTRY;
+	slice->cls_lock = lock;
+	/* tail insertion: the new slice is the bottom-most layer so far */
+	list_add_tail(&slice->cls_linkage, &lock->cll_layers);
+	slice->cls_obj = obj;
+	slice->cls_ops = ops;
+	EXIT;
+}
+EXPORT_SYMBOL(cl_lock_slice_add);
+
+/**
+ * Returns true iff a lock with the mode \a has provides at least the same
+ * guarantees as a lock with the mode \a need.
+ *
+ * Modes are ordered CLM_PHANTOM < CLM_READ < CLM_WRITE < CLM_GROUP (checked
+ * at compile time). A non-group lock satisfies every mode that is not
+ * stronger than its own; a group lock satisfies group requests only.
+ */
+int cl_lock_mode_match(enum cl_lock_mode has, enum cl_lock_mode need)
+{
+	LINVRNT(need == CLM_READ || need == CLM_WRITE ||
+		need == CLM_PHANTOM || need == CLM_GROUP);
+	LINVRNT(has == CLM_READ || has == CLM_WRITE ||
+		has == CLM_PHANTOM || has == CLM_GROUP);
+	CLASSERT(CLM_PHANTOM < CLM_READ);
+	CLASSERT(CLM_READ < CLM_WRITE);
+	CLASSERT(CLM_WRITE < CLM_GROUP);
+
+	return has == CLM_GROUP ? need == has : need <= has;
+}
+EXPORT_SYMBOL(cl_lock_mode_match);
+
+/**
+ * Returns true iff extent portions of lock descriptions match: the extent of
+ * \a has covers the extent of \a need, the mode of \a has matches the mode
+ * of \a need (see cl_lock_mode_match()), and, for a group lock, group ids
+ * are equal.
+ */
+int cl_lock_ext_match(const struct cl_lock_descr *has,
+		      const struct cl_lock_descr *need)
+{
+	if (has->cld_start > need->cld_start || has->cld_end < need->cld_end)
+		return 0;
+	if (!cl_lock_mode_match(has->cld_mode, need->cld_mode))
+		return 0;
+	return has->cld_mode != CLM_GROUP || has->cld_gid == need->cld_gid;
+}
+EXPORT_SYMBOL(cl_lock_ext_match);
+
+/**
+ * Returns true iff a lock with the description \a has provides at least the
+ * same guarantees as a lock with the description \a need, that is, both
+ * descriptions are for the same object and cl_lock_ext_match() holds.
+ */
+int cl_lock_descr_match(const struct cl_lock_descr *has,
+			const struct cl_lock_descr *need)
+{
+	if (!cl_object_same(has->cld_obj, need->cld_obj))
+		return 0;
+	return cl_lock_ext_match(has, need) != 0;
+}
+EXPORT_SYMBOL(cl_lock_descr_match);
+
+/**
+ * Destroys \a lock: finalizes all of its layer slices, releases the reference
+ * to the lock's object and returns the memory to cl_lock_kmem.
+ *
+ * May sleep (see might_sleep() below). The lock mutex must not be owned by
+ * the current thread.
+ */
+static void cl_lock_free(const struct lu_env *env, struct cl_lock *lock)
+{
+	struct cl_object *obj = lock->cll_descr.cld_obj;
+
+	LINVRNT(!cl_lock_is_mutexed(lock));
+
+	ENTRY;
+	cl_lock_trace(D_DLMTRACE, env, "free lock", lock);
+	might_sleep();
+	/* unlink slices one by one, from the head of the list, and
+	 * finalize each of them */
+	while (!list_empty(&lock->cll_layers)) {
+		struct cl_lock_slice *slice;
+
+		slice = list_entry(lock->cll_layers.next,
+				       struct cl_lock_slice, cls_linkage);
+		list_del_init(lock->cll_layers.next);
+		slice->cls_ops->clo_fini(env, slice);
+	}
+	CS_LOCK_DEC(obj, total);
+	CS_LOCKSTATE_DEC(obj, lock->cll_state);
+	/* undo lu_object_ref_add() and cl_object_get() of cl_lock_alloc() */
+	lu_object_ref_del_at(&obj->co_lu, lock->cll_obj_ref, "cl_lock", lock);
+	cl_object_put(env, obj);
+	lu_ref_fini(&lock->cll_reference);
+	lu_ref_fini(&lock->cll_holders);
+	mutex_destroy(&lock->cll_guard);
+	OBD_SLAB_FREE_PTR(lock, cl_lock_kmem);
+	EXIT;
+}
+
+/**
+ * Releases a reference on a lock.
+ *
+ * When last reference is released, lock is returned to the cache, unless it
+ * is in cl_lock_state::CLS_FREEING state, in which case it is destroyed
+ * immediately.
+ *
+ * \see cl_object_put(), cl_page_put()
+ */
+void cl_lock_put(const struct lu_env *env, struct cl_lock *lock)
+{
+	struct cl_object	*obj;
+
+	LINVRNT(cl_lock_invariant(env, lock));
+	ENTRY;
+	/* sampled up front: \a lock may be freed below */
+	obj = lock->cll_descr.cld_obj;
+	LINVRNT(obj != NULL);
+
+	CDEBUG(D_TRACE, "releasing reference: %d %p %lu\n",
+	       atomic_read(&lock->cll_ref), lock, RETIP);
+
+	if (atomic_dec_and_test(&lock->cll_ref)) {
+		/* last reference: a lock in CLS_FREEING state is destroyed,
+		 * any other lock is left alone */
+		if (lock->cll_state == CLS_FREEING) {
+			LASSERT(list_empty(&lock->cll_linkage));
+			cl_lock_free(env, lock);
+		}
+		CS_LOCK_DEC(obj, busy);
+	}
+	EXIT;
+}
+EXPORT_SYMBOL(cl_lock_put);
+
+/**
+ * Acquires an additional reference to a lock.
+ *
+ * This can be called only by caller already possessing a reference to \a
+ * lock.
+ *
+ * \see cl_object_get(), cl_page_get()
+ */
+void cl_lock_get(struct cl_lock *lock)
+{
+	/* no environment here, so a broken invariant is not dumped by
+	 * cl_lock_invariant() */
+	LINVRNT(cl_lock_invariant(NULL, lock));
+	CDEBUG(D_TRACE, "acquiring reference: %d %p %lu\n",
+	       atomic_read(&lock->cll_ref), lock, RETIP);
+	atomic_inc(&lock->cll_ref);
+}
+EXPORT_SYMBOL(cl_lock_get);
+
+/**
+ * Acquires a reference to a lock.
+ *
+ * This is much like cl_lock_get(), except that this function can be used to
+ * acquire initial reference to the cached lock. Caller has to deal with all
+ * possible races. Use with care!
+ *
+ * Unlike cl_lock_get(), no invariant is checked here.
+ *
+ * \see cl_page_get_trust()
+ */
+void cl_lock_get_trust(struct cl_lock *lock)
+{
+	CDEBUG(D_TRACE, "acquiring trusted reference: %d %p %lu\n",
+	       atomic_read(&lock->cll_ref), lock, RETIP);
+	/* transition 0 -> 1: the lock becomes "busy" for the statistics */
+	if (atomic_inc_return(&lock->cll_ref) == 1)
+		CS_LOCK_INC(lock->cll_descr.cld_obj, busy);
+}
+EXPORT_SYMBOL(cl_lock_get_trust);
+
+/**
+ * Helper function destroying the lock that wasn't completely initialized.
+ *
+ * Other threads can acquire references to the top-lock through its
+ * sub-locks. Hence, it cannot be cl_lock_free()-ed immediately.
+ *
+ * The lock is cancelled and deleted under its mutex, and then the caller's
+ * reference is released.
+ */
+static void cl_lock_finish(const struct lu_env *env, struct cl_lock *lock)
+{
+	cl_lock_mutex_get(env, lock);
+	cl_lock_cancel(env, lock);
+	cl_lock_delete(env, lock);
+	cl_lock_mutex_put(env, lock);
+	/* the lock is freed here if this was the last reference */
+	cl_lock_put(env, lock);
+}
+
+/**
+ * Allocates a new lock with the description \a descr on the object \a obj
+ * and lets every layer of the object add its slice through
+ * cl_object_operations::coo_lock_init().
+ *
+ * The new lock is in CLS_NEW state, holds a reference to \a obj, and has a
+ * reference count of 1, owned by the caller.
+ *
+ * \retval ERR_PTR(-ENOMEM) allocation failed
+ * \retval ERR_PTR(err)     a layer failed to initialize its slice; the
+ *			    partially initialized lock was passed to
+ *			    cl_lock_finish()
+ */
+static struct cl_lock *cl_lock_alloc(const struct lu_env *env,
+				     struct cl_object *obj,
+				     const struct cl_io *io,
+				     const struct cl_lock_descr *descr)
+{
+	struct lu_object_header *hdr;
+	struct cl_object *layer;
+	struct cl_lock *lock;
+
+	ENTRY;
+	OBD_SLAB_ALLOC_PTR_GFP(lock, cl_lock_kmem, __GFP_IO);
+	if (lock == NULL) {
+		lock = ERR_PTR(-ENOMEM);
+		RETURN(lock);
+	}
+
+	atomic_set(&lock->cll_ref, 1);
+	lock->cll_descr = *descr;
+	lock->cll_state = CLS_NEW;
+	cl_object_get(obj);
+	lock->cll_obj_ref = lu_object_ref_add(&obj->co_lu, "cl_lock", lock);
+	INIT_LIST_HEAD(&lock->cll_layers);
+	INIT_LIST_HEAD(&lock->cll_linkage);
+	INIT_LIST_HEAD(&lock->cll_inclosure);
+	lu_ref_init(&lock->cll_reference);
+	lu_ref_init(&lock->cll_holders);
+	mutex_init(&lock->cll_guard);
+	lockdep_set_class(&lock->cll_guard, &cl_lock_guard_class);
+	init_waitqueue_head(&lock->cll_wq);
+	hdr = obj->co_lu.lo_header;
+	CS_LOCKSTATE_INC(obj, CLS_NEW);
+	CS_LOCK_INC(obj, total);
+	CS_LOCK_INC(obj, create);
+	cl_lock_lockdep_init(lock);
+
+	/* walk the layers of the object; stop at the first failure */
+	list_for_each_entry(layer, &hdr->loh_layers, co_lu.lo_linkage) {
+		int err;
+
+		err = layer->co_ops->coo_lock_init(env, layer, lock, io);
+		if (err == 0)
+			continue;
+
+		cl_lock_finish(env, lock);
+		lock = ERR_PTR(err);
+		break;
+	}
+	RETURN(lock);
+}
+
+/**
+ * Transfer the lock into INTRANSIT state and return the original state.
+ *
+ * The current thread is recorded as the intransit owner and an "intransit"
+ * hold is added to the lock.
+ *
+ * \pre  state: CLS_CACHED, CLS_HELD or CLS_ENQUEUED
+ * \post state: CLS_INTRANSIT
+ * \return the state the lock was in on entry
+ * \see CLS_INTRANSIT
+ */
+enum cl_lock_state cl_lock_intransit(const struct lu_env *env,
+				     struct cl_lock *lock)
+{
+	enum cl_lock_state state = lock->cll_state;
+
+	LASSERT(cl_lock_is_mutexed(lock));
+	LASSERT(state != CLS_INTRANSIT);
+	LASSERTF(state >= CLS_ENQUEUED && state <= CLS_CACHED,
+		 "Malformed lock state %d.\n", state);
+
+	cl_lock_state_set(env, lock, CLS_INTRANSIT);
+	lock->cll_intransit_owner = current;
+	/* released by cl_lock_extransit() */
+	cl_lock_hold_add(env, lock, "intransit", current);
+	return state;
+}
+EXPORT_SYMBOL(cl_lock_intransit);
+
+/**
+ * Exits the intransit state and sets the lock state to \a state, normally
+ * the value returned by the matching cl_lock_intransit() call. The
+ * "intransit" hold is released.
+ *
+ * \pre lock is mutexed, is in CLS_INTRANSIT state and the current thread is
+ *      its intransit owner
+ */
+void cl_lock_extransit(const struct lu_env *env, struct cl_lock *lock,
+		       enum cl_lock_state state)
+{
+	LASSERT(cl_lock_is_mutexed(lock));
+	LASSERT(lock->cll_state == CLS_INTRANSIT);
+	LASSERT(state != CLS_INTRANSIT);
+	LASSERT(lock->cll_intransit_owner == current);
+
+	lock->cll_intransit_owner = NULL;
+	cl_lock_state_set(env, lock, state);
+	/* drops the hold taken in cl_lock_intransit() */
+	cl_lock_unhold(env, lock, "intransit", current);
+}
+EXPORT_SYMBOL(cl_lock_extransit);
+
+/**
+ * Checks whether the lock is in the intransit state on behalf of another
+ * thread: returns true iff the state is CLS_INTRANSIT and the intransit
+ * owner is not the current thread.
+ *
+ * \pre cl_lock_is_mutexed(lock)
+ */
+int cl_lock_is_intransit(struct cl_lock *lock)
+{
+	LASSERT(cl_lock_is_mutexed(lock));
+	if (lock->cll_state != CLS_INTRANSIT)
+		return 0;
+	return lock->cll_intransit_owner != current;
+}
+EXPORT_SYMBOL(cl_lock_is_intransit);
+/**
+ * Returns true iff lock is "suitable" for given io. E.g., locks acquired by
+ * truncate and O_APPEND cannot be reused for read/non-append-write, as they
+ * cover multiple stripes and can trigger cascading timeouts.
+ *
+ * Every layer that implements cl_lock_operations::clo_fits_into() is asked;
+ * the first layer that refuses makes the result false.
+ */
+static int cl_lock_fits_into(const struct lu_env *env,
+			     const struct cl_lock *lock,
+			     const struct cl_lock_descr *need,
+			     const struct cl_io *io)
+{
+	const struct cl_lock_slice *layer;
+
+	LINVRNT(cl_lock_invariant_trusted(env, lock));
+	ENTRY;
+	list_for_each_entry(layer, &lock->cll_layers, cls_linkage) {
+		/* layers without the method have no objections */
+		if (layer->cls_ops->clo_fits_into == NULL)
+			continue;
+		if (!layer->cls_ops->clo_fits_into(env, layer, need, io))
+			RETURN(0);
+	}
+	RETURN(1);
+}
+
+/**
+ * Scans the list of locks of \a obj for the first lock that matches
+ * \a need, is not being freed, carries no error, is not cancelled and fits
+ * into \a io.
+ *
+ * \pre cl_object_header::coh_lock_guard of \a obj is held by the caller
+ *
+ * \retval NULL no suitable lock found
+ * \retval lock matching lock, with a reference acquired through
+ *	   cl_lock_get_trust()
+ */
+static struct cl_lock *cl_lock_lookup(const struct lu_env *env,
+				      struct cl_object *obj,
+				      const struct cl_io *io,
+				      const struct cl_lock_descr *need)
+{
+	struct cl_object_header *head;
+	struct cl_lock *lock;
+
+	ENTRY;
+
+	head = cl_object_header(obj);
+	LINVRNT(spin_is_locked(&head->coh_lock_guard));
+	CS_LOCK_INC(obj, lookup);
+	list_for_each_entry(lock, &head->coh_locks, cll_linkage) {
+		int matched;
+
+		matched = cl_lock_ext_match(&lock->cll_descr, need) &&
+			  lock->cll_state < CLS_FREEING &&
+			  lock->cll_error == 0 &&
+			  !(lock->cll_flags & CLF_CANCELLED) &&
+			  cl_lock_fits_into(env, lock, need, io);
+		CDEBUG(D_DLMTRACE, "has: "DDESCR"(%d) need: "DDESCR": %d\n",
+		       PDESCR(&lock->cll_descr), lock->cll_state, PDESCR(need),
+		       matched);
+		if (!matched)
+			continue;
+
+		cl_lock_get_trust(lock);
+		CS_LOCK_INC(obj, hit);
+		RETURN(lock);
+	}
+	RETURN(NULL);
+}
+
+/**
+ * Returns a lock matching description \a need.
+ *
+ * This is the main entry point into the cl_lock caching interface. The
+ * per-object list of locks is searched first, and a hit is returned right
+ * away. On a miss a new lock is allocated and, unless a matching lock was
+ * inserted by somebody else in the meantime, added to the list. In any case,
+ * the returned lock carries a reference owned by the caller.
+ *
+ * \retval ERR_PTR() allocation of a new lock failed, see cl_lock_alloc()
+ *
+ * \see cl_object_find(), cl_page_find()
+ */
+static struct cl_lock *cl_lock_find(const struct lu_env *env,
+				    const struct cl_io *io,
+				    const struct cl_lock_descr *need)
+{
+	struct cl_object *obj = need->cld_obj;
+	struct cl_object_header *head = cl_object_header(obj);
+	struct cl_lock *ghost;
+	struct cl_lock *lock;
+
+	ENTRY;
+
+	spin_lock(&head->coh_lock_guard);
+	lock = cl_lock_lookup(env, obj, io, need);
+	spin_unlock(&head->coh_lock_guard);
+	if (lock != NULL)
+		RETURN(lock);
+
+	lock = cl_lock_alloc(env, obj, io, need);
+	if (IS_ERR(lock))
+		RETURN(lock);
+
+	/* the guard was dropped for the allocation: look again */
+	spin_lock(&head->coh_lock_guard);
+	ghost = cl_lock_lookup(env, obj, io, need);
+	if (ghost != NULL) {
+		spin_unlock(&head->coh_lock_guard);
+		/*
+		 * Other threads can acquire references to the
+		 * top-lock through its sub-locks. Hence, it
+		 * cannot be cl_lock_free()-ed immediately.
+		 */
+		cl_lock_finish(env, lock);
+		RETURN(ghost);
+	}
+
+	list_add_tail(&lock->cll_linkage, &head->coh_locks);
+	spin_unlock(&head->coh_lock_guard);
+	CS_LOCK_INC(obj, busy);
+	RETURN(lock);
+}
+
+/**
+ * Returns existing lock matching given description. This is similar to
+ * cl_lock_find() except that no new lock is created, and returned lock is
+ * guaranteed to be in enum cl_lock_state::CLS_HELD state.
+ *
+ * On success a hold (\a scope, \a source) and a user are added to the lock.
+ *
+ * \retval NULL no matching lock was found, or the matching lock could not
+ *	      be brought into CLS_HELD state
+ */
+struct cl_lock *cl_lock_peek(const struct lu_env *env, const struct cl_io *io,
+			     const struct cl_lock_descr *need,
+			     const char *scope, const void *source)
+{
+	struct cl_object_header *head;
+	struct cl_object	*obj;
+	struct cl_lock	  *lock;
+
+	obj  = need->cld_obj;
+	head = cl_object_header(obj);
+
+	/* repeat the lookup for as long as the lock found turns out to be
+	 * in CLS_FREEING state once its mutex is taken */
+	do {
+		spin_lock(&head->coh_lock_guard);
+		lock = cl_lock_lookup(env, obj, io, need);
+		spin_unlock(&head->coh_lock_guard);
+		if (lock == NULL)
+			return NULL;
+
+		cl_lock_mutex_get(env, lock);
+		if (lock->cll_state == CLS_INTRANSIT)
+			/* Don't care return value. */
+			cl_lock_state_wait(env, lock);
+		if (lock->cll_state == CLS_FREEING) {
+			cl_lock_mutex_put(env, lock);
+			cl_lock_put(env, lock);
+			lock = NULL;
+		}
+	} while (lock == NULL);
+
+	cl_lock_hold_add(env, lock, scope, source);
+	cl_lock_user_add(env, lock);
+	/* result of cl_use_try() is not checked: the state is, below */
+	if (lock->cll_state == CLS_CACHED)
+		cl_use_try(env, lock, 1);
+	if (lock->cll_state == CLS_HELD) {
+		cl_lock_mutex_put(env, lock);
+		cl_lock_lockdep_acquire(env, lock, 0);
+		/* drop the reference acquired by cl_lock_lookup() */
+		cl_lock_put(env, lock);
+	} else {
+		/* not held: undo the user and the hold added above */
+		cl_unuse_try(env, lock);
+		cl_lock_unhold(env, lock, scope, source);
+		cl_lock_mutex_put(env, lock);
+		cl_lock_put(env, lock);
+		lock = NULL;
+	}
+
+	return lock;
+}
+EXPORT_SYMBOL(cl_lock_peek);
+
+/**
+ * Returns a slice within a lock, corresponding to the given layer in the
+ * device stack, i.e., the first slice whose object belongs to a device of
+ * the type \a dtype.
+ *
+ * \retval NULL \a lock has no slice for \a dtype
+ *
+ * \see cl_page_at()
+ */
+const struct cl_lock_slice *cl_lock_at(const struct cl_lock *lock,
+				       const struct lu_device_type *dtype)
+{
+	const struct cl_lock_slice *slice;
+
+	LINVRNT(cl_lock_invariant_trusted(NULL, lock));
+	ENTRY;
+
+	list_for_each_entry(slice, &lock->cll_layers, cls_linkage) {
+		const struct lu_device_type *type;
+
+		type = slice->cls_obj->co_lu.lo_dev->ld_type;
+		if (type == dtype)
+			RETURN(slice);
+	}
+	RETURN(NULL);
+}
+EXPORT_SYMBOL(cl_lock_at);
+
+/**
+ * Common tail of cl_lock_mutex_get() and cl_lock_mutex_try(): accounts for
+ * one more (possibly recursive) acquisition of the mutex of \a lock, both in
+ * the lock itself (cl_lock::cll_depth) and in the per-thread counters.
+ */
+static void cl_lock_mutex_tail(const struct lu_env *env, struct cl_lock *lock)
+{
+	struct cl_thread_counters *counters;
+
+	counters = cl_lock_counters(env, lock);
+	lock->cll_depth++;
+	counters->ctc_nr_locks_locked++;
+	/* removed by lu_ref_del() in cl_lock_mutex_put() */
+	lu_ref_add(&counters->ctc_locks_locked, "cll_guard", lock);
+	cl_lock_trace(D_TRACE, env, "got mutex", lock);
+}
+
+/**
+ * Locks cl_lock object.
+ *
+ * This is used to manipulate cl_lock fields, and to serialize state
+ * transitions in the lock state machine.
+ *
+ * The mutex is recursive: a thread that already owns it only has the
+ * nesting depth incremented.
+ *
+ * \post cl_lock_is_mutexed(lock)
+ *
+ * \see cl_lock_mutex_put()
+ */
+void cl_lock_mutex_get(const struct lu_env *env, struct cl_lock *lock)
+{
+	LINVRNT(cl_lock_invariant(env, lock));
+
+	if (lock->cll_guarder == current) {
+		/* recursive acquisition by the owner */
+		LINVRNT(cl_lock_is_mutexed(lock));
+		LINVRNT(lock->cll_depth > 0);
+	} else {
+		struct cl_object_header *hdr;
+		struct cl_thread_info   *info;
+		int i;
+
+		LINVRNT(lock->cll_guarder != current);
+		hdr = cl_object_header(lock->cll_descr.cld_obj);
+		/*
+		 * Check that mutices are taken in the bottom-to-top order.
+		 */
+		info = cl_env_info(env);
+		for (i = 0; i < hdr->coh_nesting; ++i)
+			LASSERT(info->clt_counters[i].ctc_nr_locks_locked == 0);
+		/* the nesting level doubles as the lockdep subclass */
+		mutex_lock_nested(&lock->cll_guard, hdr->coh_nesting);
+		lock->cll_guarder = current;
+		LINVRNT(lock->cll_depth == 0);
+	}
+	cl_lock_mutex_tail(env, lock);
+}
+EXPORT_SYMBOL(cl_lock_mutex_get);
+
+/**
+ * Try-locks cl_lock object. Like cl_lock_mutex_get(), this is recursive: a
+ * thread already owning the mutex always succeeds.
+ *
+ * \retval 0 \a lock was successfully locked
+ *
+ * \retval -EBUSY \a lock cannot be locked right now
+ *
+ * \post ergo(result == 0, cl_lock_is_mutexed(lock))
+ *
+ * \see cl_lock_mutex_get()
+ */
+int cl_lock_mutex_try(const struct lu_env *env, struct cl_lock *lock)
+{
+	LINVRNT(cl_lock_invariant_trusted(env, lock));
+	ENTRY;
+
+	if (lock->cll_guarder == current) {
+		/* already ours, only the depth grows */
+		LINVRNT(lock->cll_depth > 0);
+	} else {
+		if (!mutex_trylock(&lock->cll_guard))
+			RETURN(-EBUSY);
+		LINVRNT(lock->cll_depth == 0);
+		lock->cll_guarder = current;
+	}
+	cl_lock_mutex_tail(env, lock);
+	RETURN(0);
+}
+EXPORT_SYMBOL(cl_lock_mutex_try);
+
+/**
+ * Unlocks cl_lock object.
+ *
+ * Undoes one cl_lock_mutex_get() or successful cl_lock_mutex_try(). The
+ * underlying mutex is released only when the nesting depth drops to zero.
+ *
+ * \pre cl_lock_is_mutexed(lock)
+ *
+ * \see cl_lock_mutex_get()
+ */
+void cl_lock_mutex_put(const struct lu_env *env, struct cl_lock *lock)
+{
+	struct cl_thread_counters *counters;
+
+	LINVRNT(cl_lock_invariant(env, lock));
+	LINVRNT(cl_lock_is_mutexed(lock));
+	LINVRNT(lock->cll_guarder == current);
+	LINVRNT(lock->cll_depth > 0);
+
+	counters = cl_lock_counters(env, lock);
+	LINVRNT(counters->ctc_nr_locks_locked > 0);
+
+	cl_lock_trace(D_TRACE, env, "put mutex", lock);
+	lu_ref_del(&counters->ctc_locks_locked, "cll_guard", lock);
+	counters->ctc_nr_locks_locked--;
+	/* outermost put: give up ownership and release the mutex */
+	if (--lock->cll_depth == 0) {
+		lock->cll_guarder = NULL;
+		mutex_unlock(&lock->cll_guard);
+	}
+}
+EXPORT_SYMBOL(cl_lock_mutex_put);
+
+/**
+ * Returns true iff lock's mutex is owned by the current thread, that is,
+ * iff cl_lock::cll_guarder (set by cl_lock_mutex_get() and
+ * cl_lock_mutex_try(), reset by cl_lock_mutex_put()) is the current thread.
+ */
+int cl_lock_is_mutexed(struct cl_lock *lock)
+{
+	return lock->cll_guarder == current;
+}
+EXPORT_SYMBOL(cl_lock_is_mutexed);
+
+/**
+ * Returns number of cl_lock mutices held by the current thread (environment),
+ * summed over the counters of all nesting levels.
+ */
+int cl_lock_nr_mutexed(const struct lu_env *env)
+{
+	struct cl_thread_info *info = cl_env_info(env);
+	int total = 0;
+	int idx;
+
+	/*
+	 * NOTE: if summation across all nesting levels (currently 2) proves
+	 *       too expensive, a summary counter can be added to
+	 *       struct cl_thread_info.
+	 */
+	for (idx = 0; idx < ARRAY_SIZE(info->clt_counters); ++idx)
+		total += info->clt_counters[idx].ctc_nr_locks_locked;
+	return total;
+}
+EXPORT_SYMBOL(cl_lock_nr_mutexed);
+
+/**
+ * Cancels \a lock, unless it is cancelled already: sets CLF_CANCELLED and
+ * calls cl_lock_operations::clo_cancel() of every layer that has one,
+ * walking cl_lock::cll_layers in reverse order.
+ *
+ * \pre cl_lock_is_mutexed(lock)
+ */
+static void cl_lock_cancel0(const struct lu_env *env, struct cl_lock *lock)
+{
+	const struct cl_lock_slice *slice;
+
+	LINVRNT(cl_lock_is_mutexed(lock));
+	LINVRNT(cl_lock_invariant(env, lock));
+	ENTRY;
+	if (lock->cll_flags & CLF_CANCELLED) {
+		/* nothing to do, cancellation happens only once */
+		EXIT;
+		return;
+	}
+
+	lock->cll_flags |= CLF_CANCELLED;
+	list_for_each_entry_reverse(slice, &lock->cll_layers, cls_linkage) {
+		if (slice->cls_ops->clo_cancel == NULL)
+			continue;
+		slice->cls_ops->clo_cancel(env, slice);
+	}
+	EXIT;
+}
+
+/**
+ * Moves \a lock into CLS_FREEING state, unless it is there already: the lock
+ * is removed from the list of locks of its object and
+ * cl_lock_operations::clo_delete() is called for every layer that has one,
+ * walking cl_lock::cll_layers in reverse order.
+ *
+ * \pre cl_lock_is_mutexed(lock)
+ * \pre lock is not in CLS_INTRANSIT state
+ */
+static void cl_lock_delete0(const struct lu_env *env, struct cl_lock *lock)
+{
+	struct cl_object_header    *head;
+	const struct cl_lock_slice *slice;
+
+	LINVRNT(cl_lock_is_mutexed(lock));
+	LINVRNT(cl_lock_invariant(env, lock));
+
+	ENTRY;
+	if (lock->cll_state < CLS_FREEING) {
+		LASSERT(lock->cll_state != CLS_INTRANSIT);
+		cl_lock_state_set(env, lock, CLS_FREEING);
+
+		head = cl_object_header(lock->cll_descr.cld_obj);
+
+		spin_lock(&head->coh_lock_guard);
+		list_del_init(&lock->cll_linkage);
+		spin_unlock(&head->coh_lock_guard);
+
+		/*
+		 * From now on, no new references to this lock can be acquired
+		 * by cl_lock_lookup().
+		 */
+		list_for_each_entry_reverse(slice, &lock->cll_layers,
+						cls_linkage) {
+			if (slice->cls_ops->clo_delete != NULL)
+				slice->cls_ops->clo_delete(env, slice);
+		}
+		/*
+		 * From now on, no new references to this lock can be acquired
+		 * by layer-specific means (like a pointer from struct
+		 * ldlm_lock in osc, or a pointer from top-lock to sub-lock in
+		 * lov).
+		 *
+		 * Lock will be finally freed in cl_lock_put() when last of
+		 * existing references goes away.
+		 */
+	}
+	EXIT;
+}
+
+/**
+ * Mod(ifie)s cl_lock::cll_holds counter for a given lock by \a delta. Also,
+ * for a top-lock (nesting == CNL_TOP) accounts for this modification in the
+ * per-thread debugging counters. Sub-lock holds can be released by a thread
+ * different from one that acquired it, so they are not tracked per-thread.
+ */
+static void cl_lock_hold_mod(const struct lu_env *env, struct cl_lock *lock,
+			     int delta)
+{
+	struct cl_thread_counters *counters;
+
+	lock->cll_holds += delta;
+	if (cl_lock_nesting(lock) != CNL_TOP)
+		return;
+
+	counters = &cl_env_info(env)->clt_counters[CNL_TOP];
+	counters->ctc_nr_held += delta;
+	LASSERT(counters->ctc_nr_held >= 0);
+}
+
+/**
+ * Mod(ifie)s cl_lock::cll_users counter for a given lock by \a delta. For a
+ * top-lock the per-thread debugging counter is adjusted too, see
+ * cl_lock_hold_mod() for the explanation.
+ */
+static void cl_lock_used_mod(const struct lu_env *env, struct cl_lock *lock,
+			     int delta)
+{
+	struct cl_thread_counters *counters;
+
+	lock->cll_users += delta;
+	if (cl_lock_nesting(lock) != CNL_TOP)
+		return;
+
+	counters = &cl_env_info(env)->clt_counters[CNL_TOP];
+	counters->ctc_nr_used += delta;
+	LASSERT(counters->ctc_nr_used >= 0);
+}
+
+/**
+ * Releases one hold, identified by \a scope and \a source, on \a lock.
+ *
+ * When the last hold is released, a pending cancellation and/or deletion
+ * (CLF_CANCELPEND, CLF_DOOMED) is executed. Phantom locks, group locks and
+ * locks that are not in CLS_CACHED state are always cancelled and deleted
+ * at that point.
+ *
+ * \pre cl_lock_is_mutexed(lock)
+ * \pre lock->cll_holds > 0
+ */
+void cl_lock_hold_release(const struct lu_env *env, struct cl_lock *lock,
+			  const char *scope, const void *source)
+{
+	LINVRNT(cl_lock_is_mutexed(lock));
+	LINVRNT(cl_lock_invariant(env, lock));
+	LASSERT(lock->cll_holds > 0);
+
+	ENTRY;
+	cl_lock_trace(D_DLMTRACE, env, "hold release lock", lock);
+	lu_ref_del(&lock->cll_holders, scope, source);
+	cl_lock_hold_mod(env, lock, -1);
+	if (lock->cll_holds == 0) {
+		CL_LOCK_ASSERT(lock->cll_state != CLS_HELD, env, lock);
+		if (lock->cll_descr.cld_mode == CLM_PHANTOM ||
+		    lock->cll_descr.cld_mode == CLM_GROUP ||
+		    lock->cll_state != CLS_CACHED)
+			/*
+			 * If lock is still phantom or grouplock when user is
+			 * done with it, or if it is not in the cached
+			 * state---destroy the lock.
+			 */
+			lock->cll_flags |= CLF_CANCELPEND|CLF_DOOMED;
+		if (lock->cll_flags & CLF_CANCELPEND) {
+			lock->cll_flags &= ~CLF_CANCELPEND;
+			cl_lock_cancel0(env, lock);
+		}
+		if (lock->cll_flags & CLF_DOOMED) {
+			/* no longer doomed: it's dead... Jim. */
+			lock->cll_flags &= ~CLF_DOOMED;
+			cl_lock_delete0(env, lock);
+		}
+	}
+	EXIT;
+}
+EXPORT_SYMBOL(cl_lock_hold_release);
+
+/**
+ * Waits until lock state is changed.
+ *
+ * This function is called with cl_lock mutex locked, atomically releases
+ * mutex and goes to sleep, waiting for a lock state change (signaled by
+ * cl_lock_signal()), and re-acquires the mutex before return.
+ *
+ * This function is used to wait until lock state machine makes some progress
+ * and to emulate synchronous operations on top of asynchronous lock
+ * interface.
+ *
+ * If cl_lock::cll_error is already set on entry, it is returned right away
+ * and no wait is performed.
+ *
+ * \retval -ERESTARTSYS wait was interrupted by a signal, or the
+ *		OBD_FAIL_LOCK_STATE_WAIT_INTR check fired
+ *
+ * \retval 0 wait wasn't interrupted
+ *
+ * \retval lock->cll_error, when it is not 0 on entry
+ *
+ * \pre cl_lock_is_mutexed(lock)
+ * \pre the mutex is held non-recursively (cll_depth == 1)
+ *
+ * \see cl_lock_signal()
+ */
+int cl_lock_state_wait(const struct lu_env *env, struct cl_lock *lock)
+{
+	wait_queue_t waiter;
+	sigset_t blocked;
+	int result;
+
+	ENTRY;
+	LINVRNT(cl_lock_is_mutexed(lock));
+	LINVRNT(cl_lock_invariant(env, lock));
+	LASSERT(lock->cll_depth == 1);
+	LASSERT(lock->cll_state != CLS_FREEING); /* too late to wait */
+
+	cl_lock_trace(D_DLMTRACE, env, "state wait lock", lock);
+	result = lock->cll_error;
+	if (result == 0) {
+		/* To avoid being interrupted by the 'non-fatal' signals
+		 * (SIGCHLD, for instance), we'd block them temporarily.
+		 * LU-305 */
+		blocked = cfs_block_sigsinv(LUSTRE_FATAL_SIGS);
+
+		/* get on the wait queue before the mutex is dropped, so
+		 * that a wake-up issued in between is not lost */
+		init_waitqueue_entry_current(&waiter);
+		add_wait_queue(&lock->cll_wq, &waiter);
+		set_current_state(TASK_INTERRUPTIBLE);
+		cl_lock_mutex_put(env, lock);
+
+		LASSERT(cl_lock_nr_mutexed(env) == 0);
+
+		/* Returning ERESTARTSYS instead of EINTR so syscalls
+		 * can be restarted if signals are pending here */
+		result = -ERESTARTSYS;
+		if (likely(!OBD_FAIL_CHECK(OBD_FAIL_LOCK_STATE_WAIT_INTR))) {
+			waitq_wait(&waiter, TASK_INTERRUPTIBLE);
+			if (!cfs_signal_pending())
+				result = 0;
+		}
+
+		cl_lock_mutex_get(env, lock);
+		set_current_state(TASK_RUNNING);
+		remove_wait_queue(&lock->cll_wq, &waiter);
+
+		/* Restore old blocked signals */
+		cfs_restore_sigs(blocked);
+	}
+	RETURN(result);
+}
+EXPORT_SYMBOL(cl_lock_state_wait);
+
+/**
+ * Reports \a state to every layer that implements
+ * cl_lock_operations::clo_state(), in the order of cl_lock::cll_layers, and
+ * then wakes up all threads sleeping on cl_lock::cll_wq.
+ *
+ * \pre cl_lock_is_mutexed(lock)
+ */
+static void cl_lock_state_signal(const struct lu_env *env, struct cl_lock *lock,
+				 enum cl_lock_state state)
+{
+	const struct cl_lock_slice *layer;
+
+	ENTRY;
+	LINVRNT(cl_lock_is_mutexed(lock));
+	LINVRNT(cl_lock_invariant(env, lock));
+
+	list_for_each_entry(layer, &lock->cll_layers, cls_linkage) {
+		if (layer->cls_ops->clo_state == NULL)
+			continue;
+		layer->cls_ops->clo_state(env, layer, state);
+	}
+	wake_up_all(&lock->cll_wq);
+	EXIT;
+}
+
+/**
+ * Notifies waiters that lock state changed.
+ *
+ * Wakes up all waiters sleeping in cl_lock_state_wait(), also notifies all
+ * layers about state change by calling cl_lock_operations::clo_state()
+ * top-to-bottom. The state reported to the layers is the current
+ * cl_lock::cll_state.
+ */
+void cl_lock_signal(const struct lu_env *env, struct cl_lock *lock)
+{
+	ENTRY;
+	cl_lock_trace(D_DLMTRACE, env, "state signal lock", lock);
+	cl_lock_state_signal(env, lock, lock->cll_state);
+	EXIT;
+}
+EXPORT_SYMBOL(cl_lock_signal);
+
+/**
+ * Changes lock state.
+ *
+ * This function is invoked to notify layers that lock state changed, possible
+ * as a result of an asynchronous event such as call-back reception.
+ *
+ * Nothing is done when the lock is in the requested state already.
+ *
+ * \post lock->cll_state == state
+ *
+ * \see cl_lock_operations::clo_state()
+ */
+void cl_lock_state_set(const struct lu_env *env, struct cl_lock *lock,
+		       enum cl_lock_state state)
+{
+	ENTRY;
+	/*
+	 * Allowed transitions: forward (to a state that is not smaller),
+	 * from CLS_CACHED back to CLS_HELD, CLS_NEW or CLS_INTRANSIT, and
+	 * from CLS_INTRANSIT to anything.
+	 */
+	LASSERT(lock->cll_state <= state ||
+		(lock->cll_state == CLS_CACHED &&
+		 (state == CLS_HELD || /* lock found in cache */
+		  state == CLS_NEW  ||   /* sub-lock canceled */
+		  state == CLS_INTRANSIT)) ||
+		/* lock is in transit state */
+		lock->cll_state == CLS_INTRANSIT);
+
+	if (lock->cll_state != state) {
+		CS_LOCKSTATE_DEC(lock->cll_descr.cld_obj, lock->cll_state);
+		CS_LOCKSTATE_INC(lock->cll_descr.cld_obj, state);
+
+		/* layers and waiters are notified about the new state
+		 * before cll_state itself is updated */
+		cl_lock_state_signal(env, lock, state);
+		lock->cll_state = state;
+	}
+	EXIT;
+}
+EXPORT_SYMBOL(cl_lock_state_set);
+
+/**
+ * Calls cl_lock_operations::clo_unuse() of the layers of \a lock, walking
+ * cl_lock::cll_layers in reverse order and stopping at the first layer that
+ * returns non-zero. The whole pass is repeated for as long as the result is
+ * CLO_REPEAT.
+ *
+ * \pre cl_lock_is_mutexed(lock)
+ * \pre lock->cll_state == CLS_INTRANSIT
+ *
+ * \retval 0 all layers implementing clo_unuse() succeeded
+ * \retval other the non-zero result (different from CLO_REPEAT) of the
+ *	   layer that stopped the pass
+ */
+static int cl_unuse_try_internal(const struct lu_env *env, struct cl_lock *lock)
+{
+	const struct cl_lock_slice *slice;
+	int result;
+
+	do {
+		LINVRNT(cl_lock_is_mutexed(lock));
+		LINVRNT(cl_lock_invariant(env, lock));
+		LASSERT(lock->cll_state == CLS_INTRANSIT);
+
+		/* -ENOSYS survives only if no layer implements clo_unuse() */
+		result = -ENOSYS;
+		list_for_each_entry_reverse(slice, &lock->cll_layers,
+						cls_linkage) {
+			if (slice->cls_ops->clo_unuse != NULL) {
+				result = slice->cls_ops->clo_unuse(env, slice);
+				if (result != 0)
+					break;
+			}
+		}
+		LASSERT(result != -ENOSYS);
+	} while (result == CLO_REPEAT);
+
+	return result;
+}
+
+/**
+ * Yanks lock from the cache (cl_lock_state::CLS_CACHED state) by calling
+ * cl_lock_operations::clo_use() top-to-bottom to notify layers.
+ *
+ * If \a atomic is set and a layer fails, the lock is unused again through
+ * cl_unuse_try_internal(), so that the use operation either completes or
+ * is backed off.
+ *
+ * \pre lock->cll_state == CLS_CACHED
+ *
+ * \retval 0 success, the lock is in CLS_HELD state
+ * \retval CLO_REPEAT a layer returned -ESTALE; the lock was put into
+ *	   CLS_NEW state and the caller has to repeat
+ * \retval other cl_lock::cll_error, if set on entry, or the failure
+ *	   reported by a layer
+ */
+int cl_use_try(const struct lu_env *env, struct cl_lock *lock, int atomic)
+{
+	const struct cl_lock_slice *slice;
+	int result;
+	enum cl_lock_state state;
+
+	ENTRY;
+	cl_lock_trace(D_DLMTRACE, env, "use lock", lock);
+
+	LASSERT(lock->cll_state == CLS_CACHED);
+	if (lock->cll_error)
+		RETURN(lock->cll_error);
+
+	result = -ENOSYS;
+	/* state remembers where to return to if the use fails */
+	state = cl_lock_intransit(env, lock);
+	list_for_each_entry(slice, &lock->cll_layers, cls_linkage) {
+		if (slice->cls_ops->clo_use != NULL) {
+			result = slice->cls_ops->clo_use(env, slice);
+			if (result != 0)
+				break;
+		}
+	}
+	LASSERT(result != -ENOSYS);
+
+	LASSERTF(lock->cll_state == CLS_INTRANSIT, "Wrong state %d.\n",
+		 lock->cll_state);
+
+	if (result == 0) {
+		state = CLS_HELD;
+	} else {
+		if (result == -ESTALE) {
+			/*
+			 * ESTALE means sublock being cancelled
+			 * at this time, and set lock state to
+			 * be NEW here and ask the caller to repeat.
+			 */
+			state = CLS_NEW;
+			result = CLO_REPEAT;
+		}
+
+		/* @atomic means back-off-on-failure. */
+		if (atomic) {
+			int rc;
+			rc = cl_unuse_try_internal(env, lock);
+			/* Vet the results. */
+			/* an unuse error overrides a positive result only */
+			if (rc < 0 && result > 0)
+				result = rc;
+		}
+
+	}
+	cl_lock_extransit(env, lock, state);
+	RETURN(result);
+}
+EXPORT_SYMBOL(cl_use_try);
+
+/**
+ * Helper for cl_enqueue_try(): walks the layers of \a lock top-to-bottom and
+ * calls ->clo_enqueue() on each layer that implements it, stopping at the
+ * first layer that returns non-zero.
+ *
+ * At least one layer has to implement ->clo_enqueue() (asserted).
+ *
+ * \retval 0 all ->clo_enqueue() methods succeeded
+ * \retval other first non-zero value returned by a layer
+ */
+static int cl_enqueue_kick(const struct lu_env *env,
+			   struct cl_lock *lock,
+			   struct cl_io *io, __u32 flags)
+{
+	const struct cl_lock_slice *layer;
+	int rc = -ENOSYS;
+
+	ENTRY;
+	list_for_each_entry(layer, &lock->cll_layers, cls_linkage) {
+		if (layer->cls_ops->clo_enqueue == NULL)
+			continue;
+		rc = layer->cls_ops->clo_enqueue(env, layer, io, flags);
+		if (rc != 0)
+			break;
+	}
+	LASSERT(rc != -ENOSYS);
+	RETURN(rc);
+}
+
+/**
+ * Tries to enqueue a lock.
+ *
+ * This function is called repeatedly by cl_enqueue() until either lock is
+ * enqueued, or error occurs. This function does not block waiting for
+ * networking communication to complete.
+ *
+ * Each pass first returns lock->cll_error if it is set, and otherwise acts
+ * on the current lock state:
+ *  - CLS_NEW: moved to CLS_QUEUING, then handled as CLS_QUEUING;
+ *  - CLS_QUEUING: layers are kicked through cl_enqueue_kick(); on success
+ *    the lock is moved to CLS_ENQUEUED, unless its state changed meanwhile;
+ *  - CLS_INTRANSIT: CLO_WAIT is returned;
+ *  - CLS_CACHED: lock is yanked from the cache by cl_use_try() (non-atomic);
+ *  - CLS_ENQUEUED, CLS_HELD: nothing to do, 0 is returned;
+ *  - anything else (e.g. CLS_FREEING): LBUG().
+ * The pass is repeated for as long as the result is CLO_REPEAT.
+ *
+ * \pre cl_lock_is_mutexed(lock)
+ *
+ * \post ergo(result == 0, lock->cll_state == CLS_ENQUEUED ||
+ *			 lock->cll_state == CLS_HELD)
+ *
+ * \see cl_enqueue() cl_lock_operations::clo_enqueue()
+ * \see cl_lock_state::CLS_ENQUEUED
+ */
+int cl_enqueue_try(const struct lu_env *env, struct cl_lock *lock,
+		   struct cl_io *io, __u32 flags)
+{
+	int result;
+
+	ENTRY;
+	cl_lock_trace(D_DLMTRACE, env, "enqueue lock", lock);
+	do {
+		LINVRNT(cl_lock_is_mutexed(lock));
+
+		result = lock->cll_error;
+		if (result != 0)
+			break;
+
+		switch (lock->cll_state) {
+		case CLS_NEW:
+			cl_lock_state_set(env, lock, CLS_QUEUING);
+			/* fall-through */
+		case CLS_QUEUING:
+			/* kick layers. */
+			result = cl_enqueue_kick(env, lock, io, flags);
+			/* For AGL case, the cl_lock::cll_state may
+			 * become CLS_HELD already. */
+			if (result == 0 && lock->cll_state == CLS_QUEUING)
+				cl_lock_state_set(env, lock, CLS_ENQUEUED);
+			break;
+		case CLS_INTRANSIT:
+			LASSERT(cl_lock_is_intransit(lock));
+			result = CLO_WAIT;
+			break;
+		case CLS_CACHED:
+			/* yank lock from the cache. */
+			result = cl_use_try(env, lock, 0);
+			break;
+		case CLS_ENQUEUED:
+		case CLS_HELD:
+			result = 0;
+			break;
+		default:
+		case CLS_FREEING:
+			/*
+			 * impossible, only held locks with increased
+			 * ->cll_holds can be enqueued, and they cannot be
+			 * freed.
+			 */
+			LBUG();
+		}
+	} while (result == CLO_REPEAT);
+	RETURN(result);
+}
+EXPORT_SYMBOL(cl_enqueue_try);
+
+/**
+ * Cancel the conflicting lock found during previous enqueue.
+ *
+ * The conflicting lock is taken from lock->cll_conflict, which is reset to
+ * NULL. The mutex of \a lock is dropped before the mutex of the conflicting
+ * lock is taken, so no two lock mutices are held at once. The conflicting
+ * lock is cancelled and deleted, and the caller then waits in
+ * cl_lock_state_wait() until it reaches CLS_FREEING or the wait fails.
+ * Finally the "cancel-wait" lu_ref entry and a reference on the conflicting
+ * lock are released.
+ *
+ * \param keep_mutex when non-zero, the mutex of \a lock is re-acquired
+ *		   before returning; otherwise the function returns with it
+ *		   released.
+ *
+ * \pre cl_lock_is_mutexed(lock)
+ * \pre lock->cll_state == CLS_QUEUING
+ * \pre lock->cll_conflict != NULL
+ *
+ * \retval 0 conflicting lock has been canceled.
+ * \retval -ve error code.
+ */
+int cl_lock_enqueue_wait(const struct lu_env *env,
+			 struct cl_lock *lock,
+			 int keep_mutex)
+{
+	struct cl_lock  *conflict;
+	int	      rc = 0;
+	ENTRY;
+
+	LASSERT(cl_lock_is_mutexed(lock));
+	LASSERT(lock->cll_state == CLS_QUEUING);
+	LASSERT(lock->cll_conflict != NULL);
+
+	conflict = lock->cll_conflict;
+	lock->cll_conflict = NULL;
+
+	cl_lock_mutex_put(env, lock);
+	LASSERT(cl_lock_nr_mutexed(env) == 0);
+
+	cl_lock_mutex_get(env, conflict);
+	cl_lock_trace(D_DLMTRACE, env, "enqueue wait", conflict);
+	cl_lock_cancel(env, conflict);
+	cl_lock_delete(env, conflict);
+
+	while (conflict->cll_state != CLS_FREEING) {
+		rc = cl_lock_state_wait(env, conflict);
+		if (rc != 0)
+			break;
+	}
+	cl_lock_mutex_put(env, conflict);
+	lu_ref_del(&conflict->cll_reference, "cancel-wait", lock);
+	cl_lock_put(env, conflict);
+
+	if (keep_mutex)
+		cl_lock_mutex_get(env, lock);
+
+	LASSERT(rc <= 0);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(cl_lock_enqueue_wait);
+
+/**
+ * Enqueues \a lock whose mutex is already held by the caller.
+ *
+ * Registers a lock user and calls cl_enqueue_try() until it stops asking to
+ * wait. When cl_enqueue_try() returns CLO_WAIT, either the conflicting lock
+ * recorded in lock->cll_conflict is cancelled (cl_lock_enqueue_wait(), the
+ * mutex of \a lock being re-taken afterwards) or the caller sleeps in
+ * cl_lock_state_wait(); enqueue is retried if that wait returned 0.
+ *
+ * On any non-zero result the lock is unused again through cl_unuse_try().
+ *
+ * \pre cl_lock_is_mutexed(lock)
+ * \pre lock->cll_holds > 0
+ *
+ * \post ergo(result == 0 && !(enqflags & CEF_AGL),
+ *	    lock->cll_state == CLS_ENQUEUED || lock->cll_state == CLS_HELD)
+ */
+static int cl_enqueue_locked(const struct lu_env *env, struct cl_lock *lock,
+			     struct cl_io *io, __u32 enqflags)
+{
+	int rc;
+
+	ENTRY;
+
+	LINVRNT(cl_lock_is_mutexed(lock));
+	LINVRNT(cl_lock_invariant(env, lock));
+	LASSERT(lock->cll_holds > 0);
+
+	cl_lock_user_add(env, lock);
+	for (;;) {
+		rc = cl_enqueue_try(env, lock, io, enqflags);
+		if (rc != CLO_WAIT)
+			break;
+
+		/* get rid of the conflicting lock, or wait for a change */
+		if (lock->cll_conflict != NULL)
+			rc = cl_lock_enqueue_wait(env, lock, 1);
+		else
+			rc = cl_lock_state_wait(env, lock);
+		if (rc != 0)
+			break;
+	}
+	if (rc != 0)
+		cl_unuse_try(env, lock);
+	LASSERT(ergo(rc == 0 && !(enqflags & CEF_AGL),
+		     lock->cll_state == CLS_ENQUEUED ||
+		     lock->cll_state == CLS_HELD));
+	RETURN(rc);
+}
+
+/**
+ * Enqueues a lock.
+ *
+ * Wrapper around cl_enqueue_locked() that takes the lockdep annotation
+ * (cl_lock_lockdep_acquire()) and the lock mutex around it. The mutex is
+ * always released before returning; the lockdep annotation is released
+ * again only when the enqueue failed.
+ *
+ * \pre current thread or io owns a hold on lock.
+ *
+ * \post ergo(result == 0, lock->users increased)
+ * \post ergo(result == 0, lock->cll_state == CLS_ENQUEUED ||
+ *			 lock->cll_state == CLS_HELD)
+ *
+ * \retval 0 success
+ * \retval other result of cl_enqueue_locked()
+ */
+int cl_enqueue(const struct lu_env *env, struct cl_lock *lock,
+	       struct cl_io *io, __u32 enqflags)
+{
+	int result;
+
+	ENTRY;
+
+	cl_lock_lockdep_acquire(env, lock, enqflags);
+	cl_lock_mutex_get(env, lock);
+	result = cl_enqueue_locked(env, lock, io, enqflags);
+	cl_lock_mutex_put(env, lock);
+	if (result != 0)
+		cl_lock_lockdep_release(env, lock);
+	LASSERT(ergo(result == 0, lock->cll_state == CLS_ENQUEUED ||
+		     lock->cll_state == CLS_HELD));
+	RETURN(result);
+}
+EXPORT_SYMBOL(cl_enqueue);
+
+/**
+ * Tries to unlock a lock.
+ *
+ * This function is called to release underlying resource:
+ * 1. for top lock, the resource is sublocks it held;
+ * 2. for sublock, the resource is the reference to dlmlock.
+ *
+ * cl_unuse_try is a one-shot operation, so it must NOT return CLO_WAIT.
+ *
+ * One lock user is always dropped (cl_lock_user_del()). Layers are notified
+ * only when the caller is the last user and the lock is in CLS_HELD or
+ * CLS_ENQUEUED state; otherwise 0 is returned right after dropping the user.
+ * In the notifying case the lock is put in transit, ->clo_unuse() methods are
+ * called through cl_unuse_try_internal(), and the lock leaves transit into:
+ *  - CLS_CACHED, if it was CLS_HELD before and all layers succeeded;
+ *  - CLS_NEW in every other case (including unexpected errors).
+ *
+ * \retval 0 success, or -ESTALE from a layer (which is hidden), and
+ *	     lock->cll_error is not set
+ * \retval other unexpected error from a layer, or lock->cll_error
+ *
+ * \see cl_unuse() cl_lock_operations::clo_unuse()
+ * \see cl_lock_state::CLS_CACHED
+ */
+int cl_unuse_try(const struct lu_env *env, struct cl_lock *lock)
+{
+	int			 result;
+	enum cl_lock_state	  state = CLS_NEW;
+
+	ENTRY;
+	cl_lock_trace(D_DLMTRACE, env, "unuse lock", lock);
+
+	if (lock->cll_users > 1) {
+		cl_lock_user_del(env, lock);
+		RETURN(0);
+	}
+
+	/* Only if the lock is in CLS_HELD or CLS_ENQUEUED state, it can hold
+	 * underlying resources. */
+	if (!(lock->cll_state == CLS_HELD || lock->cll_state == CLS_ENQUEUED)) {
+		cl_lock_user_del(env, lock);
+		RETURN(0);
+	}
+
+	/*
+	 * New lock users (->cll_users) are not protecting unlocking
+	 * from proceeding. From this point, lock eventually reaches
+	 * CLS_CACHED, is reinitialized to CLS_NEW or fails into
+	 * CLS_FREEING.
+	 */
+	state = cl_lock_intransit(env, lock);
+
+	result = cl_unuse_try_internal(env, lock);
+	LASSERT(lock->cll_state == CLS_INTRANSIT);
+	LASSERT(result != CLO_WAIT);
+	cl_lock_user_del(env, lock);
+	if (result == 0 || result == -ESTALE) {
+		/*
+		 * Return lock back to the cache. This is the only
+		 * place where lock is moved into CLS_CACHED state.
+		 *
+		 * If one of ->clo_unuse() methods returned -ESTALE, lock
+		 * cannot be placed into cache and has to be
+		 * re-initialized. This happens e.g., when a sub-lock was
+		 * canceled while unlocking was in progress.
+		 */
+		if (state == CLS_HELD && result == 0)
+			state = CLS_CACHED;
+		else
+			state = CLS_NEW;
+		cl_lock_extransit(env, lock, state);
+
+		/*
+		 * Hide -ESTALE error.
+		 * If the lock is a glimpse lock, and it has multiple
+		 * stripes. Assuming that one of its sublock returned -ENAVAIL,
+		 * and other sublocks are matched write locks. In this case,
+		 * we can't set this lock to error because otherwise some of
+		 * its sublocks may not be canceled. This causes some dirty
+		 * pages won't be written to OSTs. -jay
+		 */
+		result = 0;
+	} else {
+		CERROR("result = %d, this is unlikely!\n", result);
+		state = CLS_NEW;
+		cl_lock_extransit(env, lock, state);
+	}
+	RETURN(result ?: lock->cll_error); /* own failure first, else cll_error */
+}
+EXPORT_SYMBOL(cl_unuse_try);
+
+/**
+ * Unuses \a lock through cl_unuse_try() on behalf of a caller that cannot
+ * propagate an error: a non-zero result is only reported with
+ * CL_LOCK_DEBUG() at D_ERROR level.
+ */
+static void cl_unuse_locked(const struct lu_env *env, struct cl_lock *lock)
+{
+	int rc;
+
+	ENTRY;
+	rc = cl_unuse_try(env, lock);
+	if (rc != 0)
+		CL_LOCK_DEBUG(D_ERROR, env, lock, "unuse return %d\n", rc);
+	EXIT;
+}
+
+/**
+ * Unlocks a lock.
+ *
+ * Takes the lock mutex, unuses the lock through cl_unuse_locked() (so a
+ * failure is only logged, never returned), drops the mutex and finally
+ * releases the lockdep annotation of the lock.
+ */
+void cl_unuse(const struct lu_env *env, struct cl_lock *lock)
+{
+	ENTRY;
+	cl_lock_mutex_get(env, lock);
+	cl_unuse_locked(env, lock);
+	cl_lock_mutex_put(env, lock);
+	cl_lock_lockdep_release(env, lock);
+	EXIT;
+}
+EXPORT_SYMBOL(cl_unuse);
+
+/**
+ * Tries to wait for a lock.
+ *
+ * This function is called repeatedly by cl_wait() until either lock is
+ * granted, or error occurs. This function does not block waiting for network
+ * communication to complete.
+ *
+ * Each pass:
+ *  - returns lock->cll_error if it is set;
+ *  - returns CLO_WAIT while the lock is in transit;
+ *  - returns 0 if the lock is already in CLS_HELD state;
+ *  - otherwise calls ->clo_wait() on the layers top-to-bottom, stopping at
+ *    the first non-zero result, and moves the lock to CLS_HELD when all of
+ *    them succeeded.
+ * The pass is repeated for as long as the result is CLO_REPEAT.
+ *
+ * \pre cl_lock_is_mutexed(lock)
+ * \pre lock->cll_users > 0 && lock->cll_holds > 0
+ * \pre lock is in CLS_QUEUING, CLS_ENQUEUED, CLS_HELD or CLS_INTRANSIT state
+ *
+ * \see cl_wait() cl_lock_operations::clo_wait()
+ * \see cl_lock_state::CLS_HELD
+ */
+int cl_wait_try(const struct lu_env *env, struct cl_lock *lock)
+{
+	const struct cl_lock_slice *slice;
+	int			 result;
+
+	ENTRY;
+	cl_lock_trace(D_DLMTRACE, env, "wait lock try", lock);
+	do {
+		LINVRNT(cl_lock_is_mutexed(lock));
+		LINVRNT(cl_lock_invariant(env, lock));
+		LASSERTF(lock->cll_state == CLS_QUEUING ||
+			 lock->cll_state == CLS_ENQUEUED ||
+			 lock->cll_state == CLS_HELD ||
+			 lock->cll_state == CLS_INTRANSIT,
+			 "lock state: %d\n", lock->cll_state);
+		LASSERT(lock->cll_users > 0);
+		LASSERT(lock->cll_holds > 0);
+
+		result = lock->cll_error;
+		if (result != 0)
+			break;
+
+		if (cl_lock_is_intransit(lock)) {
+			result = CLO_WAIT;
+			break;
+		}
+
+		if (lock->cll_state == CLS_HELD)
+			/* nothing to do */
+			break;
+
+		result = -ENOSYS;
+		list_for_each_entry(slice, &lock->cll_layers, cls_linkage) {
+			if (slice->cls_ops->clo_wait != NULL) {
+				result = slice->cls_ops->clo_wait(env, slice);
+				if (result != 0)
+					break;
+			}
+		}
+		LASSERT(result != -ENOSYS);
+		if (result == 0) {
+			LASSERT(lock->cll_state != CLS_INTRANSIT);
+			cl_lock_state_set(env, lock, CLS_HELD);
+		}
+	} while (result == CLO_REPEAT);
+	RETURN(result);
+}
+EXPORT_SYMBOL(cl_wait_try);
+
+/**
+ * Waits until enqueued lock is granted.
+ *
+ * Takes the lock mutex and calls cl_wait_try() repeatedly: whenever it
+ * answers CLO_WAIT the caller sleeps in cl_lock_state_wait() and retries if
+ * that wait returned 0. On a negative result the lock is unused through
+ * cl_unuse_try() and its lockdep annotation is released. The mutex is
+ * dropped before returning.
+ *
+ * \pre current thread or io owns a hold on the lock
+ * \pre lock->cll_state == CLS_ENQUEUED || lock->cll_state == CLS_HELD
+ *
+ * \post ergo(result == 0, lock->cll_state == CLS_HELD)
+ */
+int cl_wait(const struct lu_env *env, struct cl_lock *lock)
+{
+	int result;
+
+	ENTRY;
+	cl_lock_mutex_get(env, lock);
+
+	LINVRNT(cl_lock_invariant(env, lock));
+	LASSERTF(lock->cll_state == CLS_ENQUEUED || lock->cll_state == CLS_HELD,
+		 "Wrong state %d \n", lock->cll_state);
+	LASSERT(lock->cll_holds > 0);
+
+	do {
+		result = cl_wait_try(env, lock);
+		if (result == CLO_WAIT) {
+			result = cl_lock_state_wait(env, lock);
+			if (result == 0)
+				continue;
+		}
+		break;
+	} while (1);
+	if (result < 0) {
+		cl_unuse_try(env, lock);
+		cl_lock_lockdep_release(env, lock);
+	}
+	cl_lock_trace(D_DLMTRACE, env, "wait lock", lock);
+	cl_lock_mutex_put(env, lock);
+	LASSERT(ergo(result == 0, lock->cll_state == CLS_HELD));
+	RETURN(result);
+}
+EXPORT_SYMBOL(cl_wait);
+
+/**
+ * Estimates the value of a lock by summing what every layer reports from its
+ * cl_lock_operations::clo_weigh() method. Layers are visited in reverse
+ * order; layers without the method contribute nothing.
+ *
+ * If an addition wraps around, the running sum is clamped to ~0UL.
+ *
+ * \pre cl_lock_is_mutexed(lock)
+ */
+unsigned long cl_lock_weigh(const struct lu_env *env, struct cl_lock *lock)
+{
+	const struct cl_lock_slice *layer;
+	unsigned long total = 0;
+	unsigned long share;
+
+	ENTRY;
+	LINVRNT(cl_lock_is_mutexed(lock));
+	LINVRNT(cl_lock_invariant(env, lock));
+
+	list_for_each_entry_reverse(layer, &lock->cll_layers, cls_linkage) {
+		if (layer->cls_ops->clo_weigh == NULL)
+			continue;
+		share = layer->cls_ops->clo_weigh(env, layer);
+		total += share;
+		/* a sum smaller than its last addend means wrap-around */
+		if (total < share)
+			total = ~0UL;
+	}
+	RETURN(total);
+}
+EXPORT_SYMBOL(cl_lock_weigh);
+
+/**
+ * Notifies layers that lock description changed.
+ *
+ * The server can grant client a lock different from the one that was
+ * requested (e.g., larger in extent). Once the granted description is known,
+ * this function lets every layer implementing
+ * cl_lock_operations::clo_modify() adjust to it (layers are visited in
+ * reverse order) and then stores \a desc in the lock under
+ * cl_object_header::coh_lock_guard.
+ *
+ * The object of the lock must not change: desc->cld_obj has to be the object
+ * the lock already belongs to (asserted).
+ *
+ * \pre cl_lock_is_mutexed(lock)
+ *
+ * \retval 0 description was replaced
+ * \retval other first non-zero value returned by a ->clo_modify() method;
+ *	       the description is left untouched in that case
+ *
+ * \see cl_lock_operations::clo_modify()
+ */
+int cl_lock_modify(const struct lu_env *env, struct cl_lock *lock,
+		   const struct cl_lock_descr *desc)
+{
+	const struct cl_lock_slice *layer;
+	struct cl_object	   *obj = lock->cll_descr.cld_obj;
+	struct cl_object_header    *hdr = cl_object_header(obj);
+	int rc;
+
+	ENTRY;
+	cl_lock_trace(D_DLMTRACE, env, "modify lock", lock);
+	/* don't allow object to change */
+	LASSERT(obj == desc->cld_obj);
+	LINVRNT(cl_lock_is_mutexed(lock));
+	LINVRNT(cl_lock_invariant(env, lock));
+
+	list_for_each_entry_reverse(layer, &lock->cll_layers, cls_linkage) {
+		if (layer->cls_ops->clo_modify == NULL)
+			continue;
+		rc = layer->cls_ops->clo_modify(env, layer, desc);
+		if (rc != 0)
+			RETURN(rc);
+	}
+	CL_LOCK_DEBUG(D_DLMTRACE, env, lock, " -> "DDESCR"@"DFID"\n",
+		      PDESCR(desc), PFID(lu_object_fid(&desc->cld_obj->co_lu)));
+	/*
+	 * The description is simply overwritten in place. Were locks indexed
+	 * by extent and/or mode, that index would need updating here too.
+	 */
+	spin_lock(&hdr->coh_lock_guard);
+	lock->cll_descr = *desc;
+	spin_unlock(&hdr->coh_lock_guard);
+	RETURN(0);
+}
+EXPORT_SYMBOL(cl_lock_modify);
+
+/**
+ * Prepares an empty lock closure rooted at \a origin.
+ *
+ * \param origin lock the closure is built for; its mutex must be held
+ * \param wait   stored in cl_lock_closure::clc_wait; cl_lock_enclosure()
+ *	       consults it when a try-lock fails
+ *
+ * \see cl_lock_closure
+ */
+void cl_lock_closure_init(const struct lu_env *env,
+			  struct cl_lock_closure *closure,
+			  struct cl_lock *origin, int wait)
+{
+	LINVRNT(cl_lock_is_mutexed(origin));
+	LINVRNT(cl_lock_invariant(env, origin));
+
+	closure->clc_origin = origin;
+	closure->clc_wait   = wait;
+	closure->clc_nr     = 0;
+	INIT_LIST_HEAD(&closure->clc_list);
+}
+EXPORT_SYMBOL(cl_lock_closure_init);
+
+/**
+ * Builds a closure of \a lock.
+ *
+ * \a lock itself is added first (cl_lock_enclosure()); then the
+ * cl_lock_operations::clo_closure() method of each of its layers is called,
+ * stopping at the first non-zero result. Those methods may call
+ * cl_lock_closure_build() recursively and so pull further locks into the
+ * closure.
+ *
+ * On any failure everything collected so far is released through
+ * cl_lock_disclosure().
+ *
+ * \pre cl_lock_is_mutexed(closure->clc_origin)
+ *
+ * \retval 0 closure was built
+ * \retval other result of cl_lock_enclosure() or of a ->clo_closure() method
+ *
+ * \see cl_lock_closure
+ */
+int cl_lock_closure_build(const struct lu_env *env, struct cl_lock *lock,
+			  struct cl_lock_closure *closure)
+{
+	const struct cl_lock_slice *layer;
+	int rc;
+
+	ENTRY;
+	LINVRNT(cl_lock_is_mutexed(closure->clc_origin));
+	LINVRNT(cl_lock_invariant(env, closure->clc_origin));
+
+	rc = cl_lock_enclosure(env, lock, closure);
+	if (rc == 0) {
+		list_for_each_entry(layer, &lock->cll_layers, cls_linkage) {
+			if (layer->cls_ops->clo_closure == NULL)
+				continue;
+			rc = layer->cls_ops->clo_closure(env, layer, closure);
+			if (rc != 0)
+				break;
+		}
+	}
+	if (rc != 0)
+		cl_lock_disclosure(env, closure);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(cl_lock_closure_build);
+
+/**
+ * Adds new lock to a closure.
+ *
+ * Try-locks \a lock and if succeeded, adds it to the closure (never more than
+ * once). If try-lock failed, returns CLO_REPEAT, after optionally waiting
+ * until next try-lock is likely to succeed.
+ *
+ * A zero return from cl_lock_mutex_try() is treated as a successful try-lock.
+ * A lock added to the closure keeps its mutex and gets an extra reference
+ * (tracked under the "closure" lu_ref scope); if the lock was in the closure
+ * already, the mutex just taken is dropped again.
+ *
+ * When the try-lock fails the whole closure is released through
+ * cl_lock_disclosure(). If closure->clc_wait is set, the mutex of the
+ * closure origin is dropped, the caller blocks on the mutex of \a lock
+ * (taking and immediately releasing it), and then re-takes the origin mutex.
+ *
+ * \retval 0 \a lock is in the closure
+ * \retval CLO_REPEAT try-lock failed, closure was released
+ */
+int cl_lock_enclosure(const struct lu_env *env, struct cl_lock *lock,
+		      struct cl_lock_closure *closure)
+{
+	int result = 0;
+	ENTRY;
+	cl_lock_trace(D_DLMTRACE, env, "enclosure lock", lock);
+	if (!cl_lock_mutex_try(env, lock)) {
+		/*
+		 * If lock->cll_inclosure is not empty, lock is already in
+		 * this closure.
+		 */
+		if (list_empty(&lock->cll_inclosure)) {
+			cl_lock_get_trust(lock);
+			lu_ref_add(&lock->cll_reference, "closure", closure);
+			list_add(&lock->cll_inclosure, &closure->clc_list);
+			closure->clc_nr++;
+		} else
+			cl_lock_mutex_put(env, lock);
+		result = 0;
+	} else {
+		cl_lock_disclosure(env, closure);
+		if (closure->clc_wait) {
+			cl_lock_get_trust(lock);
+			lu_ref_add(&lock->cll_reference, "closure-w", closure);
+			cl_lock_mutex_put(env, closure->clc_origin);
+
+			LASSERT(cl_lock_nr_mutexed(env) == 0);
+			cl_lock_mutex_get(env, lock);
+			cl_lock_mutex_put(env, lock);
+
+			cl_lock_mutex_get(env, closure->clc_origin);
+			lu_ref_del(&lock->cll_reference, "closure-w", closure);
+			cl_lock_put(env, lock);
+		}
+		result = CLO_REPEAT;
+	}
+	RETURN(result);
+}
+EXPORT_SYMBOL(cl_lock_enclosure);
+
+/**
+ * Releases mutices of enclosed locks.
+ *
+ * Every lock on closure->clc_list is unlinked from the closure, its mutex is
+ * released, and the "closure" lu_ref entry and the lock reference taken when
+ * it was enclosed are dropped. The closure must be empty afterwards
+ * (asserted through cl_lock_closure::clc_nr).
+ */
+void cl_lock_disclosure(const struct lu_env *env,
+			struct cl_lock_closure *closure)
+{
+	struct cl_lock *member;
+	struct cl_lock *next;
+
+	cl_lock_trace(D_DLMTRACE, env, "disclosure lock", closure->clc_origin);
+	/* _safe flavour: each member is unlinked while the list is walked */
+	list_for_each_entry_safe(member, next, &closure->clc_list,
+				 cll_inclosure) {
+		list_del_init(&member->cll_inclosure);
+		cl_lock_mutex_put(env, member);
+		lu_ref_del(&member->cll_reference, "closure", closure);
+		cl_lock_put(env, member);
+		closure->clc_nr--;
+	}
+	LASSERT(closure->clc_nr == 0);
+}
+EXPORT_SYMBOL(cl_lock_disclosure);
+
+/**
+ * Finalizes a closure.
+ *
+ * Nothing is released here; the function only asserts that the closure is
+ * already empty, i.e. that cl_lock_closure::clc_nr is zero and
+ * cl_lock_closure::clc_list has no entries.
+ */
+void cl_lock_closure_fini(struct cl_lock_closure *closure)
+{
+	LASSERT(closure->clc_nr == 0);
+	LASSERT(list_empty(&closure->clc_list));
+}
+EXPORT_SYMBOL(cl_lock_closure_fini);
+
+/**
+ * Destroys this lock, now or later.
+ *
+ * Called once it has been decided that the lock must go away, e.g. when a
+ * blocking AST arrived for it or a fatal communication error happened. If
+ * nobody holds the lock it is destroyed at once by cl_lock_delete0(), which
+ * notifies layers (bottom-to-top). Otherwise the lock is only marked
+ * CLF_DOOMED and destruction is postponed until all holds are released.
+ *
+ * Caller must have a reference on this lock, so that a deleted lock does not
+ * linger in memory indefinitely because nobody calls cl_lock_put() to
+ * finish it.
+ *
+ * \pre atomic_read(&lock->cll_ref) > 0
+ * \pre ergo(cl_lock_nesting(lock) == CNL_TOP,
+ *	   cl_lock_nr_mutexed(env) == 1)
+ *      [i.e., if a top-lock is deleted, mutices of no other locks can be
+ *      held, as deletion of sub-locks might require releasing a top-lock
+ *      mutex]
+ *
+ * \see cl_lock_operations::clo_delete()
+ * \see cl_lock::cll_holds
+ */
+void cl_lock_delete(const struct lu_env *env, struct cl_lock *lock)
+{
+	LINVRNT(cl_lock_is_mutexed(lock));
+	LINVRNT(cl_lock_invariant(env, lock));
+	LASSERT(ergo(cl_lock_nesting(lock) == CNL_TOP,
+		     cl_lock_nr_mutexed(env) == 1));
+
+	ENTRY;
+	cl_lock_trace(D_DLMTRACE, env, "delete lock", lock);
+	if (lock->cll_holds != 0)
+		/* still held: just doom the lock for later destruction */
+		lock->cll_flags |= CLF_DOOMED;
+	else
+		cl_lock_delete0(env, lock);
+	EXIT;
+}
+EXPORT_SYMBOL(cl_lock_delete);
+
+/**
+ * Marks lock as irrecoverably failed, and marks it for destruction. This
+ * happens when, e.g., server fails to grant a lock to us, or networking
+ * time-out happens.
+ *
+ * Only the first non-zero \a error is recorded in cl_lock::cll_error; once
+ * an error is set, or when \a error is 0, the call does nothing. Recording
+ * the error also signals the lock (cl_lock_signal()), cancels it and
+ * deletes it.
+ *
+ * \pre atomic_read(&lock->cll_ref) > 0
+ * \pre cl_lock_is_mutexed(lock)
+ *
+ * \see cl_lock_delete()
+ * \see cl_lock::cll_holds
+ */
+void cl_lock_error(const struct lu_env *env, struct cl_lock *lock, int error)
+{
+	LINVRNT(cl_lock_is_mutexed(lock));
+	LINVRNT(cl_lock_invariant(env, lock));
+
+	ENTRY;
+	if (error == 0 || lock->cll_error != 0) {
+		EXIT;
+		return;
+	}
+
+	cl_lock_trace(D_DLMTRACE, env, "set lock error", lock);
+	lock->cll_error = error;
+	cl_lock_signal(env, lock);
+	cl_lock_cancel(env, lock);
+	cl_lock_delete(env, lock);
+	EXIT;
+}
+EXPORT_SYMBOL(cl_lock_error);
+
+/**
+ * Cancels this lock, now or later.
+ *
+ * If nobody holds the lock, cl_lock_cancel0() is called at once and layers
+ * are notified (bottom-to-top) that the lock is being cancelled. Otherwise
+ * the lock is only marked CLF_CANCELPEND and cancellation is postponed until
+ * all holds are released.
+ *
+ * Cancellation notification is delivered to layers at most once.
+ *
+ * \pre cl_lock_is_mutexed(lock)
+ *
+ * \see cl_lock_operations::clo_cancel()
+ * \see cl_lock::cll_holds
+ */
+void cl_lock_cancel(const struct lu_env *env, struct cl_lock *lock)
+{
+	LINVRNT(cl_lock_is_mutexed(lock));
+	LINVRNT(cl_lock_invariant(env, lock));
+
+	ENTRY;
+	cl_lock_trace(D_DLMTRACE, env, "cancel lock", lock);
+	if (lock->cll_holds != 0)
+		/* still held: remember that cancellation is pending */
+		lock->cll_flags |= CLF_CANCELPEND;
+	else
+		cl_lock_cancel0(env, lock);
+	EXIT;
+}
+EXPORT_SYMBOL(cl_lock_cancel);
+
+/**
+ * Finds an existing lock covering given index and optionally different from a
+ * given \a except lock.
+ *
+ * The lock list of \a obj (cl_object_header::coh_locks) is scanned under
+ * cl_object_header::coh_lock_guard. A lock matches when it is not \a except,
+ * is either a CLM_GROUP lock or matches the single-page CLM_READ extent
+ * [index, index] according to cl_lock_ext_match(), and its state satisfies
+ * CLS_HELD <= state < CLS_FREEING. The per-thread descriptor
+ * cl_env_info(env)->clt_descr is used as scratch space for the match.
+ *
+ * \param except  lock to skip, may be NULL
+ * \param pending when zero, locks flagged CLF_CANCELPEND are skipped
+ * \param canceld when zero, locks flagged CLF_CANCELLED are skipped
+ *
+ * \retval lock first matching lock, with a reference acquired through
+ *	      cl_lock_get_trust(); the caller has to release it
+ * \retval NULL no matching lock
+ */
+struct cl_lock *cl_lock_at_pgoff(const struct lu_env *env,
+				 struct cl_object *obj, pgoff_t index,
+				 struct cl_lock *except,
+				 int pending, int canceld)
+{
+	struct cl_object_header *head;
+	struct cl_lock	  *scan;
+	struct cl_lock	  *lock;
+	struct cl_lock_descr    *need;
+
+	ENTRY;
+
+	head = cl_object_header(obj);
+	need = &cl_env_info(env)->clt_descr;
+	lock = NULL;
+
+	need->cld_mode = CLM_READ; /* CLM_READ matches both READ & WRITE, but
+				    * not PHANTOM */
+	need->cld_start = need->cld_end = index;
+	need->cld_enq_flags = 0;
+
+	spin_lock(&head->coh_lock_guard);
+	/* It is fine to match any group lock since there could be only one
+	 * with a unique gid and it conflicts with all other lock modes too */
+	list_for_each_entry(scan, &head->coh_locks, cll_linkage) {
+		if (scan != except &&
+		    (scan->cll_descr.cld_mode == CLM_GROUP ||
+		    cl_lock_ext_match(&scan->cll_descr, need)) &&
+		    scan->cll_state >= CLS_HELD &&
+		    scan->cll_state < CLS_FREEING &&
+		    /*
+		     * This check is racy as the lock can be canceled right
+		     * after it is done, but this is fine, because page exists
+		     * already.
+		     */
+		    (canceld || !(scan->cll_flags & CLF_CANCELLED)) &&
+		    (pending || !(scan->cll_flags & CLF_CANCELPEND))) {
+			/* Don't increase cs_hit here since this
+			 * is just a helper function. */
+			cl_lock_get_trust(scan);
+			lock = scan;
+			break;
+		}
+	}
+	spin_unlock(&head->coh_lock_guard);
+	RETURN(lock);
+}
+EXPORT_SYMBOL(cl_lock_at_pgoff);
+
+/**
+ * Calculates the page offset at the layer of @lock.
+ *
+ * The page slice belonging to the device type of the lock's object is looked
+ * up with cl_page_at() (it must exist, asserted) and the index of the page
+ * that slice points to is returned.
+ *
+ * At the time of this writing, @page is top page and @lock is sub lock.
+ */
+static pgoff_t pgoff_at_lock(struct cl_page *page, struct cl_lock *lock)
+{
+	const struct cl_page_slice *slice;
+
+	slice = cl_page_at(page,
+			   lock->cll_descr.cld_obj->co_lu.lo_dev->ld_type);
+	LASSERT(slice != NULL);
+	return slice->cpl_page->cp_index;
+}
+
+/**
+ * Check if page @page is covered by an extra lock or discard it.
+ *
+ * Page callback used by cl_lock_discard_pages() for CLM_READ locks; @cbdata
+ * is the lock being processed. Pages whose index is below
+ * cl_thread_info::clt_fn_index are skipped. For any other page another lock
+ * covering it is looked up with cl_lock_at_pgoff() (cancel-pending locks
+ * accepted, cancelled ones not):
+ *  - if such a lock exists, clt_fn_index is advanced past its extent;
+ *  - otherwise the page is owned, unmapped, discarded and disowned; a page
+ *    that cannot be owned must be in CPS_FREEING state (asserted).
+ *
+ * cl_thread_info::clt_next_index is always set to the index following the
+ * page, and CLP_GANG_OKAY is always returned.
+ */
+static int check_and_discard_cb(const struct lu_env *env, struct cl_io *io,
+				struct cl_page *page, void *cbdata)
+{
+	struct cl_thread_info *info = cl_env_info(env);
+	struct cl_lock *lock = cbdata;
+	pgoff_t index = pgoff_at_lock(page, lock);
+
+	if (index >= info->clt_fn_index) {
+		struct cl_lock *tmp;
+
+		/* refresh non-overlapped index */
+		tmp = cl_lock_at_pgoff(env, lock->cll_descr.cld_obj, index,
+					lock, 1, 0);
+		if (tmp != NULL) {
+			/* Cache the first-non-overlapped index so as to skip
+			 * all pages within [index, clt_fn_index). This
+			 * is safe because if tmp lock is canceled, it will
+			 * discard these pages. */
+			info->clt_fn_index = tmp->cll_descr.cld_end + 1;
+			if (tmp->cll_descr.cld_end == CL_PAGE_EOF)
+				info->clt_fn_index = CL_PAGE_EOF;
+			cl_lock_put(env, tmp);
+		} else if (cl_page_own(env, io, page) == 0) {
+			/* discard the page */
+			cl_page_unmap(env, io, page);
+			cl_page_discard(env, io, page);
+			cl_page_disown(env, io, page);
+		} else {
+			LASSERT(page->cp_state == CPS_FREEING);
+		}
+	}
+
+	info->clt_next_index = index + 1;
+	return CLP_GANG_OKAY;
+}
+
+/**
+ * Page callback that discards @page unconditionally.
+ *
+ * Selected by cl_lock_discard_pages() for locks that are not CLM_READ;
+ * @cbdata is the lock being processed, whose mode must be at least
+ * CLM_WRITE (asserted). Cacheable pages are expected to be neither dirty nor
+ * under writeback by now (KLASSERT).
+ *
+ * cl_thread_info::clt_next_index is set to the index following the page.
+ * The page is then owned, unmapped, discarded and disowned; a page that
+ * cannot be owned must be in CPS_FREEING state (asserted).
+ *
+ * \retval CLP_GANG_OKAY always
+ */
+static int discard_cb(const struct lu_env *env, struct cl_io *io,
+		      struct cl_page *page, void *cbdata)
+{
+	struct cl_thread_info *info = cl_env_info(env);
+	struct cl_lock *lock = cbdata;
+
+	LASSERT(lock->cll_descr.cld_mode >= CLM_WRITE);
+	KLASSERT(ergo(page->cp_type == CPT_CACHEABLE,
+		      !PageWriteback(cl_page_vmpage(env, page))));
+	KLASSERT(ergo(page->cp_type == CPT_CACHEABLE,
+		      !PageDirty(cl_page_vmpage(env, page))));
+
+	info->clt_next_index = pgoff_at_lock(page, lock) + 1;
+	if (cl_page_own(env, io, page) != 0) {
+		/* ownership can only be refused for a page being freed */
+		LASSERT(page->cp_state == CPS_FREEING);
+		return CLP_GANG_OKAY;
+	}
+
+	cl_page_unmap(env, io, page);
+	cl_page_discard(env, io, page);
+	cl_page_disown(env, io, page);
+	return CLP_GANG_OKAY;
+}
+
+/**
+ * Discard pages protected by the given lock. This function traverses radix
+ * tree to find all covering pages and discard them. If a page is being covered
+ * by other locks, it should remain in cache.
+ *
+ * If error happens on any step, the process continues anyway (the reasoning
+ * behind this being that lock cancellation cannot be delayed indefinitely).
+ *
+ * The walk uses the per-thread io (cl_thread_info::clt_io), initialized as a
+ * CIT_MISC io on the top object with ci_ignore_layout set. CLM_READ locks
+ * are processed with check_and_discard_cb(), all other modes with
+ * discard_cb(). cl_page_gang_lookup() is restarted from
+ * cl_thread_info::clt_next_index until it returns CLP_GANG_OKAY or the end
+ * of the lock extent is passed, rescheduling on CLP_GANG_RESCHED.
+ *
+ * \retval 0 io was initialized and the walk was performed
+ * \retval other cl_io_init() failed, no page was examined; results of the
+ *	       page walk itself are not propagated
+ */
+int cl_lock_discard_pages(const struct lu_env *env, struct cl_lock *lock)
+{
+	struct cl_thread_info *info  = cl_env_info(env);
+	struct cl_io	  *io    = &info->clt_io;
+	struct cl_lock_descr  *descr = &lock->cll_descr;
+	cl_page_gang_cb_t      cb;
+	int res;
+	int result;
+
+	LINVRNT(cl_lock_invariant(env, lock));
+	ENTRY;
+
+	io->ci_obj = cl_object_top(descr->cld_obj);
+	io->ci_ignore_layout = 1;
+	result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
+	if (result != 0)
+		GOTO(out, result);
+
+	cb = descr->cld_mode == CLM_READ ? check_and_discard_cb : discard_cb;
+	info->clt_fn_index = info->clt_next_index = descr->cld_start;
+	do {
+		res = cl_page_gang_lookup(env, descr->cld_obj, io,
+					  info->clt_next_index, descr->cld_end,
+					  cb, (void *)lock);
+		if (info->clt_next_index > descr->cld_end)
+			break;
+
+		if (res == CLP_GANG_RESCHED)
+			cond_resched();
+	} while (res != CLP_GANG_OKAY);
+out:
+	cl_io_fini(env, io);
+	RETURN(result);
+}
+EXPORT_SYMBOL(cl_lock_discard_pages);
+
+/**
+ * Eliminate all locks for a given object.
+ *
+ * Caller has to guarantee that no lock is in active use.
+ *
+ * Locks are taken one by one from the head of cl_object_header::coh_locks
+ * until the list is empty. cl_object_header::coh_lock_guard is dropped while
+ * a lock is processed, a reference (lu_ref scope "prune") keeping the lock
+ * alive meanwhile. A lock that is not yet in CLS_FREEING state is optionally
+ * cancelled and then deleted; if it still has a user (at most one is
+ * allowed, asserted), the function first sleeps on cl_lock::cll_wq until
+ * cl_lock::cll_users drops to zero and then re-examines the lock.
+ *
+ * \param cancel when this is set, cl_locks_prune() cancels locks before
+ *	       destroying.
+ *
+ * \pre ergo(!cancel, object has no pages left)
+ */
+void cl_locks_prune(const struct lu_env *env, struct cl_object *obj, int cancel)
+{
+	struct cl_object_header *head;
+	struct cl_lock	  *lock;
+
+	ENTRY;
+	head = cl_object_header(obj);
+	/*
+	 * If locks are destroyed without cancellation, all pages must be
+	 * already destroyed (as otherwise they will be left unprotected).
+	 */
+	LASSERT(ergo(!cancel,
+		     head->coh_tree.rnode == NULL && head->coh_pages == 0));
+
+	spin_lock(&head->coh_lock_guard);
+	while (!list_empty(&head->coh_locks)) {
+		lock = container_of(head->coh_locks.next,
+				    struct cl_lock, cll_linkage);
+		cl_lock_get_trust(lock);
+		spin_unlock(&head->coh_lock_guard);
+		lu_ref_add(&lock->cll_reference, "prune", current);
+
+again:
+		cl_lock_mutex_get(env, lock);
+		if (lock->cll_state < CLS_FREEING) {
+			LASSERT(lock->cll_users <= 1);
+			if (unlikely(lock->cll_users == 1)) {
+				struct l_wait_info lwi = { 0 };
+
+				cl_lock_mutex_put(env, lock);
+				l_wait_event(lock->cll_wq,
+					     lock->cll_users == 0,
+					     &lwi);
+				goto again;
+			}
+
+			if (cancel)
+				cl_lock_cancel(env, lock);
+			cl_lock_delete(env, lock);
+		}
+		cl_lock_mutex_put(env, lock);
+		lu_ref_del(&lock->cll_reference, "prune", current);
+		cl_lock_put(env, lock);
+		spin_lock(&head->coh_lock_guard);
+	}
+	spin_unlock(&head->coh_lock_guard);
+	EXIT;
+}
+EXPORT_SYMBOL(cl_locks_prune);
+
+/**
+ * Finds a lock matching \a need and returns it with its mutex held and with
+ * a hold added.
+ *
+ * cl_lock_find() is retried until it either fails or yields a lock that is
+ * neither in CLS_FREEING state (or beyond) nor flagged CLF_CANCELLED;
+ * unsuitable locks are unlocked and put before the next attempt. The hold
+ * and the reference on the returned lock are registered in lu_ref under
+ * \a scope and \a source.
+ *
+ * \retval lock suitable lock; the caller owns its mutex, a hold and a
+ *	      reference
+ * \retval ERR_PTR error value returned by cl_lock_find()
+ */
+static struct cl_lock *cl_lock_hold_mutex(const struct lu_env *env,
+					  const struct cl_io *io,
+					  const struct cl_lock_descr *need,
+					  const char *scope, const void *source)
+{
+	struct cl_lock *lock;
+
+	ENTRY;
+
+	for (;;) {
+		lock = cl_lock_find(env, io, need);
+		if (IS_ERR(lock))
+			break;
+
+		cl_lock_mutex_get(env, lock);
+		if (lock->cll_state >= CLS_FREEING ||
+		    (lock->cll_flags & CLF_CANCELLED)) {
+			/* dying or cancelled lock: drop it and search again */
+			cl_lock_mutex_put(env, lock);
+			cl_lock_put(env, lock);
+			continue;
+		}
+
+		cl_lock_hold_mod(env, lock, +1);
+		lu_ref_add(&lock->cll_holders, scope, source);
+		lu_ref_add(&lock->cll_reference, scope, source);
+		break;
+	}
+	RETURN(lock);
+}
+
+/**
+ * Returns a lock matching \a need description with a reference and a hold on
+ * it.
+ *
+ * This is much like cl_lock_find(), except that cl_lock_hold() additionally
+ * guarantees that lock is not in the CLS_FREEING state on return. The lock
+ * is obtained through cl_lock_hold_mutex() and its mutex is released before
+ * returning.
+ *
+ * \retval lock lock with a hold and a reference, mutex not held
+ * \retval ERR_PTR error value passed through from cl_lock_hold_mutex()
+ */
+struct cl_lock *cl_lock_hold(const struct lu_env *env, const struct cl_io *io,
+			     const struct cl_lock_descr *need,
+			     const char *scope, const void *source)
+{
+	struct cl_lock *lock;
+
+	ENTRY;
+
+	lock = cl_lock_hold_mutex(env, io, need, scope, source);
+	if (IS_ERR(lock))
+		RETURN(lock);
+
+	cl_lock_mutex_put(env, lock);
+	RETURN(lock);
+}
+EXPORT_SYMBOL(cl_lock_hold);
+
+/**
+ * Main high-level entry point of cl_lock interface that finds existing or
+ * enqueues new lock matching given description.
+ *
+ * A lock is obtained with cl_lock_hold_mutex() and enqueued with
+ * cl_enqueue_locked(), using need->cld_enq_flags as enqueue flags. If the
+ * enqueued lock fits the request (cl_lock_fits_into()) and CEF_AGL is not
+ * set, the lock is returned with its mutex released and the lockdep
+ * annotation acquired. In every other case the lock is unused (when the
+ * enqueue succeeded), its hold and reference are released, and:
+ *  - if the enqueue succeeded but the lock does not fit, the whole procedure
+ *    is repeated;
+ *  - if the enqueue succeeded and CEF_AGL is set, NULL is returned;
+ *  - if the enqueue failed, the error is returned as ERR_PTR().
+ *
+ * \retval lock enqueued lock with a hold and a reference
+ * \retval NULL CEF_AGL request succeeded, no lock is handed out
+ * \retval ERR_PTR failure of cl_lock_hold_mutex() or of cl_enqueue_locked()
+ */
+struct cl_lock *cl_lock_request(const struct lu_env *env, struct cl_io *io,
+				const struct cl_lock_descr *need,
+				const char *scope, const void *source)
+{
+	struct cl_lock       *lock;
+	int		   rc;
+	__u32		 enqflags = need->cld_enq_flags;
+
+	ENTRY;
+	do {
+		lock = cl_lock_hold_mutex(env, io, need, scope, source);
+		if (IS_ERR(lock))
+			break;
+
+		rc = cl_enqueue_locked(env, lock, io, enqflags);
+		if (rc == 0) {
+			if (cl_lock_fits_into(env, lock, need, io)) {
+				if (!(enqflags & CEF_AGL)) {
+					cl_lock_mutex_put(env, lock);
+					cl_lock_lockdep_acquire(env, lock,
+								enqflags);
+					break;
+				}
+				rc = 1; /* AGL success, reported as NULL */
+			}
+			cl_unuse_locked(env, lock);
+		}
+		cl_lock_trace(D_DLMTRACE, env,
+			      rc <= 0 ? "enqueue failed" : "agl succeed", lock);
+		cl_lock_hold_release(env, lock, scope, source);
+		cl_lock_mutex_put(env, lock);
+		lu_ref_del(&lock->cll_reference, scope, source);
+		cl_lock_put(env, lock);
+		if (rc > 0) {
+			LASSERT(enqflags & CEF_AGL);
+			lock = NULL;
+		} else if (rc != 0) {
+			lock = ERR_PTR(rc);
+		}
+	} while (rc == 0);
+	RETURN(lock);
+}
+EXPORT_SYMBOL(cl_lock_request);
+
+/**
+ * Adds a hold to a known lock.
+ *
+ * Besides the hold, a lock reference is acquired (cl_lock_get()); both are
+ * registered in lu_ref under \a scope and \a source.
+ *
+ * \pre cl_lock_is_mutexed(lock)
+ * \pre lock->cll_state != CLS_FREEING
+ */
+void cl_lock_hold_add(const struct lu_env *env, struct cl_lock *lock,
+		      const char *scope, const void *source)
+{
+	LINVRNT(cl_lock_is_mutexed(lock));
+	LINVRNT(cl_lock_invariant(env, lock));
+	LASSERT(lock->cll_state != CLS_FREEING);
+
+	ENTRY;
+	cl_lock_hold_mod(env, lock, +1);
+	cl_lock_get(lock);
+	lu_ref_add(&lock->cll_holders, scope, source);
+	lu_ref_add(&lock->cll_reference, scope, source);
+	EXIT;
+}
+EXPORT_SYMBOL(cl_lock_hold_add);
+
+/**
+ * Releases a hold and a reference on a lock, on which caller acquired a
+ * mutex.
+ *
+ * The mutex is neither taken nor released here; cl_lock_release() is the
+ * variant that takes the mutex itself. \a scope and \a source identify the
+ * lu_ref entries to remove.
+ */
+void cl_lock_unhold(const struct lu_env *env, struct cl_lock *lock,
+		    const char *scope, const void *source)
+{
+	LINVRNT(cl_lock_invariant(env, lock));
+	ENTRY;
+	cl_lock_hold_release(env, lock, scope, source);
+	lu_ref_del(&lock->cll_reference, scope, source);
+	cl_lock_put(env, lock);
+	EXIT;
+}
+EXPORT_SYMBOL(cl_lock_unhold);
+
+/**
+ * Releases a hold and a reference on a lock, obtained by cl_lock_hold().
+ *
+ * The lock mutex is taken around the release of the hold and dropped again
+ * before the lu_ref entry and the lock reference are released. \a scope and
+ * \a source identify the lu_ref entries to remove.
+ */
+void cl_lock_release(const struct lu_env *env, struct cl_lock *lock,
+		     const char *scope, const void *source)
+{
+	LINVRNT(cl_lock_invariant(env, lock));
+	ENTRY;
+	cl_lock_trace(D_DLMTRACE, env, "release lock", lock);
+	cl_lock_mutex_get(env, lock);
+	cl_lock_hold_release(env, lock, scope, source);
+	cl_lock_mutex_put(env, lock);
+	lu_ref_del(&lock->cll_reference, scope, source);
+	cl_lock_put(env, lock);
+	EXIT;
+}
+EXPORT_SYMBOL(cl_lock_release);
+
+/**
+ * Registers one more user of \a lock by passing +1 to cl_lock_used_mod().
+ *
+ * \pre cl_lock_is_mutexed(lock)
+ *
+ * \see cl_lock_user_del()
+ */
+void cl_lock_user_add(const struct lu_env *env, struct cl_lock *lock)
+{
+	LINVRNT(cl_lock_is_mutexed(lock));
+	LINVRNT(cl_lock_invariant(env, lock));
+
+	ENTRY;
+	cl_lock_used_mod(env, lock, +1);
+	EXIT;
+}
+EXPORT_SYMBOL(cl_lock_user_add);
+
+/**
+ * Drops one user of \a lock by passing -1 to cl_lock_used_mod(). When
+ * cl_lock::cll_users is zero afterwards, every waiter on cl_lock::cll_wq is
+ * woken up.
+ *
+ * \pre cl_lock_is_mutexed(lock)
+ * \pre lock->cll_users > 0
+ *
+ * \see cl_lock_user_add()
+ */
+void cl_lock_user_del(const struct lu_env *env, struct cl_lock *lock)
+{
+	LINVRNT(cl_lock_is_mutexed(lock));
+	LINVRNT(cl_lock_invariant(env, lock));
+	LASSERT(lock->cll_users > 0);
+
+	ENTRY;
+	cl_lock_used_mod(env, lock, -1);
+	if (lock->cll_users == 0)
+		wake_up_all(&lock->cll_wq);
+	EXIT;
+}
+EXPORT_SYMBOL(cl_lock_user_del);
+
+/**
+ * Returns a one-letter name of a lock mode for use in debugging output.
+ *
+ * \retval "P", "R", "W", "G" for CLM_PHANTOM, CLM_READ, CLM_WRITE and
+ *	   CLM_GROUP respectively
+ * \retval "U" for any value that has no entry in the name table
+ */
+const char *cl_lock_mode_name(const enum cl_lock_mode mode)
+{
+	/* both the strings and the table itself are read-only */
+	static const char * const names[] = {
+		[CLM_PHANTOM] = "P",
+		[CLM_READ]    = "R",
+		[CLM_WRITE]   = "W",
+		[CLM_GROUP]   = "G"
+	};
+
+	/*
+	 * A single unsigned comparison rejects both negative and too large
+	 * values; the NULL test covers slots a designated initializer left
+	 * unset, so that a valid string is always returned.
+	 */
+	if ((unsigned int)mode < ARRAY_SIZE(names) && names[mode] != NULL)
+		return names[mode];
+	return "U";
+}
+EXPORT_SYMBOL(cl_lock_mode_name);
+
+/**
+ * Prints human readable representation of a lock description: the
+ * description itself (DDESCR format), followed by "@" and the fid of the
+ * object the description refers to.
+ *
+ * \param cookie  opaque argument passed through to \a printer
+ * \param printer output callback
+ */
+void cl_lock_descr_print(const struct lu_env *env, void *cookie,
+		       lu_printer_t printer,
+		       const struct cl_lock_descr *descr)
+{
+	const struct lu_fid *f = lu_object_fid(&descr->cld_obj->co_lu);
+
+	printer(env, cookie, DDESCR"@"DFID, PDESCR(descr), PFID(f));
+}
+EXPORT_SYMBOL(cl_lock_descr_print);
+
+/**
+ * Prints human readable representation of \a lock through \a printer.
+ *
+ * The first line shows the lock address, its reference count, state, error,
+ * number of holds, number of users and flags, followed by the lock
+ * description. One line per layer follows, carrying the name of the layer's
+ * device type, the slice address and whatever the layer's optional
+ * cl_lock_operations::clo_print() method adds.
+ *
+ * \param cookie  opaque argument passed through to \a printer
+ * \param printer output callback
+ */
+void cl_lock_print(const struct lu_env *env, void *cookie,
+		   lu_printer_t printer, const struct cl_lock *lock)
+{
+	const struct cl_lock_slice *layer;
+
+	printer(env, cookie, "lock@%p[%d %d %d %d %d %08lx] ",
+		lock, atomic_read(&lock->cll_ref),
+		lock->cll_state, lock->cll_error, lock->cll_holds,
+		lock->cll_users, lock->cll_flags);
+	cl_lock_descr_print(env, cookie, printer, &lock->cll_descr);
+	printer(env, cookie, " {\n");
+
+	list_for_each_entry(layer, &lock->cll_layers, cls_linkage) {
+		printer(env, cookie, "    %s@%p: ",
+			layer->cls_obj->co_lu.lo_dev->ld_type->ldt_name,
+			layer);
+		if (layer->cls_ops->clo_print != NULL)
+			layer->cls_ops->clo_print(env, cookie, printer, layer);
+		printer(env, cookie, "\n");
+	}
+	printer(env, cookie, "} lock@%p\n", lock);
+}
+EXPORT_SYMBOL(cl_lock_print);
+
+/**
+ * Set-up hook of the cl_lock code: hands the cl_lock_caches table to
+ * lu_kmem_init() and returns its result unchanged.
+ */
+int cl_lock_init(void)
+{
+	return lu_kmem_init(cl_lock_caches);
+}
+
+/**
+ * Tear-down counterpart of cl_lock_init(): hands the cl_lock_caches table to
+ * lu_kmem_fini().
+ */
+void cl_lock_fini(void)
+{
+	lu_kmem_fini(cl_lock_caches);
+}

diff --git a/drivers/staging/lustre/lustre/obdclass/cl_object.c b/drivers/staging/lustre/lustre/obdclass/cl_object.c
new file mode 100644
index 0000000..cdb5fba
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/cl_object.c

@@ -0,0 +1,1148 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Client Lustre Object.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+/*
+ * Locking.
+ *
+ *  i_mutex
+ *      PG_locked
+ *	  ->coh_page_guard
+ *	  ->coh_lock_guard
+ *	  ->coh_attr_guard
+ *	  ->ls_guard
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/libcfs/libcfs.h>
+/* class_put_type() */
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre_fid.h>
+#include <linux/list.h>
+#include <linux/libcfs/libcfs_hash.h> /* for cfs_hash stuff */
+#include <cl_object.h>
+#include "cl_internal.h"
+
+/** Slab cache for struct cl_env; see cl_env_new() and cl_object_caches. */
+static struct kmem_cache *cl_env_kmem;
+
+/** Lock class of cl_object_header::coh_page_guard */
+static struct lock_class_key cl_page_guard_class;
+/** Lock class of cl_object_header::coh_lock_guard */
+static struct lock_class_key cl_lock_guard_class;
+/** Lock class of cl_object_header::coh_attr_guard */
+static struct lock_class_key cl_attr_guard_class;
+
+/* Default context and session tags; cl_env_get() passes them to cl_env_new(). */
+extern __u32 lu_context_tags_default;
+extern __u32 lu_session_tags_default;
+/**
+ * Initialize cl_object_header.
+ *
+ * The embedded lu_object_header is initialized first; only when that
+ * succeeds are the guards, the page tree, the lock list and the page
+ * counters of \a h set up.
+ *
+ * \retval 0 on success, otherwise the result of lu_object_header_init()
+ */
+int cl_object_header_init(struct cl_object_header *h)
+{
+	int result;
+
+	ENTRY;
+	result = lu_object_header_init(&h->coh_lu);
+	if (result == 0) {
+		/* every guard is given its own lock class */
+		spin_lock_init(&h->coh_page_guard);
+		lockdep_set_class(&h->coh_page_guard, &cl_page_guard_class);
+		spin_lock_init(&h->coh_lock_guard);
+		lockdep_set_class(&h->coh_lock_guard, &cl_lock_guard_class);
+		spin_lock_init(&h->coh_attr_guard);
+		lockdep_set_class(&h->coh_attr_guard, &cl_attr_guard_class);
+
+		h->coh_pages = 0;
+		h->coh_page_bufsize = ALIGN(sizeof(struct cl_page), 8);
+		/* XXX hard coded GFP_* mask. */
+		INIT_RADIX_TREE(&h->coh_tree, GFP_ATOMIC);
+		INIT_LIST_HEAD(&h->coh_locks);
+	}
+	RETURN(result);
+}
+EXPORT_SYMBOL(cl_object_header_init);
+
+/**
+ * Finalize cl_object_header.
+ *
+ * Asserts that the lock list of \a h is empty and then finalizes the
+ * embedded lu_object_header.
+ */
+void cl_object_header_fini(struct cl_object_header *h)
+{
+	LASSERT(list_empty(&h->coh_locks));
+	lu_object_header_fini(&h->coh_lu);
+}
+EXPORT_SYMBOL(cl_object_header_fini);
+
+/**
+ * Looks up (or creates) the cl_object identified by \a fid on device \a cd.
+ *
+ * The lookup is delegated to lu_object_find_slice() with the lu-level part
+ * of configuration \a c; the result is converted back with lu2cl(). May
+ * sleep.
+ *
+ * \see lu_object_find(), cl_page_find(), cl_lock_find()
+ */
+struct cl_object *cl_object_find(const struct lu_env *env,
+				 struct cl_device *cd, const struct lu_fid *fid,
+				 const struct cl_object_conf *c)
+{
+	struct lu_object *found;
+
+	might_sleep();
+	found = lu_object_find_slice(env, cl2lu_dev(cd), fid, &c->coc_lu);
+	return lu2cl(found);
+}
+EXPORT_SYMBOL(cl_object_find);
+
+/**
+ * Releases a reference on \a o.
+ *
+ * When last reference is released object is returned to the cache, unless
+ * lu_object_header_flags::LU_OBJECT_HEARD_BANSHEE bit is set in its header.
+ *
+ * Implemented as lu_object_put() on the lu_object embedded in \a o.
+ *
+ * \see cl_page_put(), cl_lock_put().
+ */
+void cl_object_put(const struct lu_env *env, struct cl_object *o)
+{
+	lu_object_put(env, &o->co_lu);
+}
+EXPORT_SYMBOL(cl_object_put);
+
+/**
+ * Acquire an additional reference to the object \a o.
+ *
+ * This can only be used to acquire _additional_ reference, i.e., caller
+ * already has to possess at least one reference to \a o before calling this.
+ *
+ * Implemented as lu_object_get() on the lu_object embedded in \a o.
+ *
+ * \see cl_page_get(), cl_lock_get().
+ */
+void cl_object_get(struct cl_object *o)
+{
+	lu_object_get(&o->co_lu);
+}
+EXPORT_SYMBOL(cl_object_get);
+
+/**
+ * Returns the top-object for a given \a o.
+ *
+ * Follows the cl_object_header::coh_parent chain starting at the header of
+ * \a o until a header without a parent is reached, then returns the top
+ * lu_object of that header converted to a cl_object.
+ *
+ * \see cl_page_top(), cl_io_top()
+ */
+struct cl_object *cl_object_top(struct cl_object *o)
+{
+	struct cl_object_header *hdr;
+	struct cl_object *top;
+
+	/* climb to the header that has no parent */
+	for (hdr = cl_object_header(o); hdr->coh_parent != NULL;
+	     hdr = hdr->coh_parent)
+		;
+
+	top = lu2cl(lu_object_top(&hdr->coh_lu));
+	CDEBUG(D_TRACE, "%p -> %p\n", o, top);
+	return top;
+}
+EXPORT_SYMBOL(cl_object_top);
+
+/**
+ * Returns the spin-lock guarding data-attributes of object \a o, that is,
+ * cl_object_header::coh_attr_guard of the header of the top-object of \a o.
+ *
+ * \see cl_attr, cl_object_attr_lock(), cl_object_operations::coo_attr_get().
+ */
+static spinlock_t *cl_object_attr_guard(struct cl_object *o)
+{
+	struct cl_object_header *top_hdr;
+
+	top_hdr = cl_object_header(cl_object_top(o));
+	return &top_hdr->coh_attr_guard;
+}
+
+/**
+ * Locks data-attributes.
+ *
+ * Prevents data-attributes from changing, until lock is released by
+ * cl_object_attr_unlock(). This has to be called before calls to
+ * cl_object_attr_get(), cl_object_attr_set().
+ *
+ * The lock taken is the spin-lock returned by cl_object_attr_guard().
+ */
+void cl_object_attr_lock(struct cl_object *o)
+{
+	spin_lock(cl_object_attr_guard(o));
+}
+EXPORT_SYMBOL(cl_object_attr_lock);
+
+/**
+ * Releases data-attributes lock, acquired by cl_object_attr_lock().
+ *
+ * Unlocks the spin-lock returned by cl_object_attr_guard() for \a o.
+ */
+void cl_object_attr_unlock(struct cl_object *o)
+{
+	spin_unlock(cl_object_attr_guard(o));
+}
+EXPORT_SYMBOL(cl_object_attr_unlock);
+
+/**
+ * Collects data-attributes of object \a obj into \a attr.
+ *
+ * Walks the layers of the object in list order and calls
+ * cl_object_operations::coo_attr_get() on every layer that implements it.
+ * The walk stops at the first layer returning non-zero: a negative value is
+ * returned to the caller as is, a positive value is reported as 0.
+ *
+ * The attribute guard (see cl_object_attr_lock()) has to be held.
+ */
+int cl_object_attr_get(const struct lu_env *env, struct cl_object *obj,
+		       struct cl_attr *attr)
+{
+	struct lu_object_header *top;
+	int result = 0;
+
+	LASSERT(spin_is_locked(cl_object_attr_guard(obj)));
+	ENTRY;
+
+	top = obj->co_lu.lo_header;
+	list_for_each_entry(obj, &top->loh_layers, co_lu.lo_linkage) {
+		if (obj->co_ops->coo_attr_get == NULL)
+			continue;
+
+		result = obj->co_ops->coo_attr_get(env, obj, attr);
+		if (result == 0)
+			continue;
+
+		/* positive result only terminates the walk, it is no error */
+		if (result > 0)
+			result = 0;
+		break;
+	}
+	RETURN(result);
+}
+EXPORT_SYMBOL(cl_object_attr_get);
+
+/**
+ * Updates data-attributes of object \a obj from \a attr.
+ *
+ * Only attributes named in validness bit-mask \a v are to be updated. Walks
+ * the layers of the object in reverse list order and calls
+ * cl_object_operations::coo_attr_set() on every layer that implements it.
+ * The walk stops at the first layer returning non-zero: a negative value is
+ * returned to the caller as is, a positive value is reported as 0.
+ *
+ * The attribute guard (see cl_object_attr_lock()) has to be held.
+ */
+int cl_object_attr_set(const struct lu_env *env, struct cl_object *obj,
+		       const struct cl_attr *attr, unsigned v)
+{
+	struct lu_object_header *top;
+	int result = 0;
+
+	LASSERT(spin_is_locked(cl_object_attr_guard(obj)));
+	ENTRY;
+
+	top = obj->co_lu.lo_header;
+	list_for_each_entry_reverse(obj, &top->loh_layers,
+					co_lu.lo_linkage) {
+		if (obj->co_ops->coo_attr_set == NULL)
+			continue;
+
+		result = obj->co_ops->coo_attr_set(env, obj, attr, v);
+		if (result == 0)
+			continue;
+
+		/* positive result only terminates the walk, it is no error */
+		if (result > 0)
+			result = 0;
+		break;
+	}
+	RETURN(result);
+}
+EXPORT_SYMBOL(cl_object_attr_set);
+
+/**
+ * Notifies layers that glimpse AST was received.
+ *
+ * Layers are visited in reverse list order; each layer implementing
+ * cl_object_operations::coo_glimpse() is asked to fill \a lvb with the
+ * information that will be shipped back to the glimpse issuer. The first
+ * non-zero result stops the walk and is returned. The resulting \a lvb is
+ * logged in either case.
+ *
+ * \see cl_lock_operations::clo_glimpse()
+ */
+int cl_object_glimpse(const struct lu_env *env, struct cl_object *obj,
+		      struct ost_lvb *lvb)
+{
+	struct lu_object_header *top;
+	int result = 0;
+
+	ENTRY;
+	top = obj->co_lu.lo_header;
+	list_for_each_entry_reverse(obj, &top->loh_layers,
+					co_lu.lo_linkage) {
+		if (obj->co_ops->coo_glimpse == NULL)
+			continue;
+
+		result = obj->co_ops->coo_glimpse(env, obj, lvb);
+		if (result != 0)
+			break;
+	}
+	LU_OBJECT_HEADER(D_DLMTRACE, env, lu_object_top(top),
+			 "size: "LPU64" mtime: "LPU64" atime: "LPU64" "
+			 "ctime: "LPU64" blocks: "LPU64"\n",
+			 lvb->lvb_size, lvb->lvb_mtime, lvb->lvb_atime,
+			 lvb->lvb_ctime, lvb->lvb_blocks);
+	RETURN(result);
+}
+EXPORT_SYMBOL(cl_object_glimpse);
+
+/**
+ * Updates a configuration of an object \a obj.
+ *
+ * Passes \a conf to cl_object_operations::coo_conf_set() of every layer
+ * that implements it, in list order. The first non-zero result stops the
+ * walk and is returned; 0 is returned when no layer objects.
+ */
+int cl_conf_set(const struct lu_env *env, struct cl_object *obj,
+		const struct cl_object_conf *conf)
+{
+	struct lu_object_header *top;
+	int result = 0;
+
+	ENTRY;
+	top = obj->co_lu.lo_header;
+	list_for_each_entry(obj, &top->loh_layers, co_lu.lo_linkage) {
+		if (obj->co_ops->coo_conf_set == NULL)
+			continue;
+
+		result = obj->co_ops->coo_conf_set(env, obj, conf);
+		if (result != 0)
+			break;
+	}
+	RETURN(result);
+}
+EXPORT_SYMBOL(cl_conf_set);
+
+/**
+ * Helper function removing all object locks, and marking object for
+ * deletion. All object pages must have been deleted at this point; this is
+ * asserted on entry.
+ *
+ * This is called by cl_inode_fini() and lov_object_delete() to destroy top-
+ * and sub- objects respectively.
+ */
+void cl_object_kill(const struct lu_env *env, struct cl_object *obj)
+{
+	struct cl_object_header *hdr = cl_object_header(obj);
+
+	/* neither the page tree nor the page counter may be populated */
+	LASSERT(hdr->coh_tree.rnode == NULL);
+	LASSERT(hdr->coh_pages == 0);
+
+	set_bit(LU_OBJECT_HEARD_BANSHEE, &hdr->coh_lu.loh_flags);
+	/*
+	 * Destroy all locks. Object destruction (including cl_inode_fini())
+	 * cannot cancel the locks, because in the case of a local client,
+	 * where client and server share the same thread running
+	 * prune_icache(), this can dead-lock with ldlm_cancel_handler()
+	 * waiting on __wait_on_freeing_inode().
+	 */
+	cl_locks_prune(env, obj, 0);
+}
+EXPORT_SYMBOL(cl_object_kill);
+
+/**
+ * Prunes caches of pages and locks for this object.
+ *
+ * Pages are pruned first (cl_pages_prune()), then locks (cl_locks_prune()).
+ * Note that locks are pruned with a last argument of 1 here, whereas
+ * cl_object_kill() passes 0.
+ */
+void cl_object_prune(const struct lu_env *env, struct cl_object *obj)
+{
+	ENTRY;
+	cl_pages_prune(env, obj);
+	cl_locks_prune(env, obj, 1);
+	EXIT;
+}
+EXPORT_SYMBOL(cl_object_prune);
+
+/**
+ * Check if the object has locks.
+ *
+ * \retval 1 if the lock list in the header of \a obj is not empty
+ * \retval 0 if it is empty
+ */
+int cl_object_has_locks(struct cl_object *obj)
+{
+	struct cl_object_header *hdr = cl_object_header(obj);
+	int empty;
+
+	/* sample the list while holding coh_lock_guard */
+	spin_lock(&hdr->coh_lock_guard);
+	empty = list_empty(&hdr->coh_locks);
+	spin_unlock(&hdr->coh_lock_guard);
+
+	return !empty;
+}
+EXPORT_SYMBOL(cl_object_has_locks);
+
+/**
+ * Resets statistics \a cs: stores \a name (the pointer itself, no copy is
+ * made) and sets each of the CS_NR counters to 0.
+ */
+void cache_stats_init(struct cache_stats *cs, const char *name)
+{
+	int idx = 0;
+
+	cs->cs_name = name;
+	while (idx < CS_NR) {
+		atomic_set(&cs->cs_stats[idx], 0);
+		idx++;
+	}
+}
+
+/**
+ * Prints the counters of \a cs as one row into seq_file \a m.
+ *
+ * When \a h is non-zero, a header row with the column names (CS_NAMES) is
+ * printed first. The counter row is not terminated by a newline; that is
+ * left to the caller. Always returns 0.
+ */
+int cache_stats_print(const struct cache_stats *cs, struct seq_file *m, int h)
+{
+	int col;
+
+	/*
+	 *   lookup    hit    total  cached create
+	 * env: ...... ...... ...... ...... ......
+	 */
+	if (h != 0) {
+		const char *names[CS_NR] = CS_NAMES;
+
+		seq_printf(m, "%6s", " ");
+		for (col = 0; col < CS_NR; col++)
+			seq_printf(m, "%8s", names[col]);
+		seq_printf(m, "\n");
+	}
+
+	seq_printf(m, "%5.5s:", cs->cs_name);
+	for (col = 0; col < CS_NR; col++)
+		seq_printf(m, "%8u", atomic_read(&cs->cs_stats[col]));
+	return 0;
+}
+
+/**
+ * Initialize client site.
+ *
+ * Perform common initialization (lu_site_init()) and, if that succeeds,
+ * reset the page and lock cache statistics as well as every per-state page
+ * and lock counter of \a s.
+ *
+ * \retval 0 on success, otherwise the result of lu_site_init()
+ */
+int cl_site_init(struct cl_site *s, struct cl_device *d)
+{
+	int i;
+	int result;
+
+	result = lu_site_init(&s->cs_lu, &d->cd_lu_dev);
+	if (result == 0) {
+		cache_stats_init(&s->cs_pages, "pages");
+		cache_stats_init(&s->cs_locks, "locks");
+		/* every per-state counter has to be reset, so index by i */
+		for (i = 0; i < ARRAY_SIZE(s->cs_pages_state); ++i)
+			atomic_set(&s->cs_pages_state[i], 0);
+		for (i = 0; i < ARRAY_SIZE(s->cs_locks_state); ++i)
+			atomic_set(&s->cs_locks_state[i], 0);
+	}
+	return result;
+}
+EXPORT_SYMBOL(cl_site_init);
+
+/**
+ * Finalize client site. Dual to cl_site_init().
+ *
+ * Only the embedded lu_site needs finalization (lu_site_fini()).
+ */
+void cl_site_fini(struct cl_site *s)
+{
+	lu_site_fini(&s->cs_lu);
+}
+EXPORT_SYMBOL(cl_site_fini);
+
+/*
+ * Statistics row for client environments, printed under the name "envs" by
+ * cl_site_stats_print(). Unlike the per-site page and lock statistics this
+ * is a single file-wide instance.
+ */
+static struct cache_stats cl_env_stats = {
+	.cs_name    = "envs",
+	.cs_stats = { ATOMIC_INIT(0), }
+};
+
+/**
+ * Outputs client site statistical counters into seq_file \a m. Suitable for
+ * ll_rd_*()-style functions.
+ *
+ * Printed are, in order: generic lu_site statistics, the page cache row
+ * (with column header) followed by bracketed per-state page counts, the
+ * lock cache row followed by bracketed per-state lock counts, and the
+ * environment row. Always returns 0.
+ */
+int cl_site_stats_print(const struct cl_site *site, struct seq_file *m)
+{
+	/* one-letter tags of page states */
+	static const char *pstate[] = {
+		[CPS_CACHED]  = "c",
+		[CPS_OWNED]   = "o",
+		[CPS_PAGEOUT] = "w",
+		[CPS_PAGEIN]  = "r",
+		[CPS_FREEING] = "f"
+	};
+	/* one-letter tags of lock states */
+	static const char *lstate[] = {
+		[CLS_NEW]       = "n",
+		[CLS_QUEUING]   = "q",
+		[CLS_ENQUEUED]  = "e",
+		[CLS_HELD]      = "h",
+		[CLS_INTRANSIT] = "t",
+		[CLS_CACHED]    = "c",
+		[CLS_FREEING]   = "f"
+	};
+	int idx;
+
+/*
+       lookup    hit  total   busy create
+pages: ...... ...... ...... ...... ...... [...... ...... ...... ......]
+locks: ...... ...... ...... ...... ...... [...... ...... ...... ...... ......]
+  env: ...... ...... ...... ...... ......
+ */
+	lu_site_stats_print(&site->cs_lu, m);
+
+	/* page cache counters, then one "tag: count" pair per page state */
+	cache_stats_print(&site->cs_pages, m, 1);
+	seq_printf(m, " [");
+	for (idx = 0; idx < ARRAY_SIZE(site->cs_pages_state); ++idx)
+		seq_printf(m, "%s: %u ", pstate[idx],
+			   atomic_read(&site->cs_pages_state[idx]));
+	seq_printf(m, "]\n");
+
+	/* lock cache counters, then one "tag: count" pair per lock state */
+	cache_stats_print(&site->cs_locks, m, 0);
+	seq_printf(m, " [");
+	for (idx = 0; idx < ARRAY_SIZE(site->cs_locks_state); ++idx)
+		seq_printf(m, "%s: %u ", lstate[idx],
+			   atomic_read(&site->cs_locks_state[idx]));
+	seq_printf(m, "]\n");
+
+	cache_stats_print(&cl_env_stats, m, 0);
+	seq_printf(m, "\n");
+	return 0;
+}
+EXPORT_SYMBOL(cl_site_stats_print);
+
+/*****************************************************************************
+ *
+ * lu_env handling on client.
+ *
+ */
+
+/**
+ * The most efficient way is to store cl_env pointer in task specific
+ * structures. On Linux, it won't be easy to use task_struct->journal_info
+ * because Lustre code may call into other fs which has certain assumptions
+ * about journal_info. Currently the following fields in task_struct have
+ * been identified as usable for this purpose:
+ *  - cl_env: for liblustre.
+ *  - tux_info: only on RedHat kernel.
+ *  - ...
+ * \note As long as we use task_struct to store cl_env, we assume that once
+ * called into Lustre, we'll never call into the other part of the kernel
+ * which will use those fields in task_struct without explicitly exiting
+ * Lustre.
+ *
+ * If no space is available in task_struct, a hash will be used.
+ * bz20044, bz22683.
+ */
+
+/*
+ * Client environment: a lu_env together with its session context and the
+ * bookkeeping needed to associate it with a thread.
+ */
+struct cl_env {
+	/*
+	 * Sanity marker: set to &cl_env_init0 by cl_env_new() and checked by
+	 * assertions. Has to be the first field, see cl_env_peek().
+	 */
+	void	     *ce_magic;
+	/* The environment handed out to callers; see cl_env_container(). */
+	struct lu_env     ce_lu;
+	/* Session context, installed as ce_lu.le_ses by cl_env_new(). */
+	struct lu_context ce_ses;
+
+	/**
+	 * This allows cl_env to be entered into cl_env_hash which implements
+	 * the current thread -> client environment lookup.
+	 */
+	struct hlist_node  ce_node;
+	/**
+	 * Owner for the current cl_env.
+	 *
+	 * If LL_TASK_CL_ENV is defined, this points to the owning current,
+	 * only for debugging purposes;
+	 * otherwise hash is used, and this is the key for cfs_hash.
+	 * Now current thread pid is stored. Note using thread pointer would
+	 * lead to unbalanced hash because of its specific allocation locality
+	 * and could be varied for different platforms and OSes, even different
+	 * OS versions.
+	 */
+	void	     *ce_owner;
+
+	/*
+	 * Linkage into global list of all client environments. Used for
+	 * garbage collection.
+	 */
+	struct list_head	ce_linkage;
+	/*
+	 * Reference counter: set to 1 by cl_env_init0(), incremented by
+	 * cl_env_peek(), decremented by cl_env_put(), which destroys the
+	 * environment when the counter drops to 0.
+	 */
+	int	       ce_ref;
+	/*
+	 * Debugging field: address of the caller who made original
+	 * allocation.
+	 */
+	void	     *ce_debug;
+};
+
+/*
+ * Hooks for environment statistics. Both expand to nothing here, so uses
+ * of CL_ENV_INC()/CL_ENV_DEC() in this file are no-ops.
+ */
+#define CL_ENV_INC(counter)
+#define CL_ENV_DEC(counter)
+
+/*
+ * Puts a freshly allocated environment into use: checks that it is
+ * unreferenced, carries the magic and has neither debug cookie nor owner,
+ * then takes the first reference and records \a debug.
+ *
+ * The address of this function also serves as the cl_env::ce_magic value.
+ */
+static void cl_env_init0(struct cl_env *cle, void *debug)
+{
+	LASSERT(cle->ce_ref == 0);
+	LASSERT(cle->ce_magic == &cl_env_init0);
+	LASSERT(cle->ce_debug == NULL && cle->ce_owner == NULL);
+
+	cle->ce_ref = 1;
+	cle->ce_debug = debug;
+	CL_ENV_INC(busy);
+}
+
+
+/*
+ * Hash-table based association between a thread and its cl_env: the table
+ * is keyed by the pid of the owning thread (see cl_env_attach() and
+ * cl_env_fetch()).
+ */
+
+/* Created by cl_env_store_init(), released by cl_env_store_fini(). */
+static cfs_hash_t *cl_env_hash;
+
+/*
+ * Hash callback of cl_env_hash. The key is an integer stored in a pointer
+ * (see cl_env_attach()), so it is hashed as a number of pointer width.
+ */
+static unsigned cl_env_hops_hash(cfs_hash_t *lh,
+				 const void *key, unsigned mask)
+{
+#if BITS_PER_LONG == 64
+	return cfs_hash_u64_hash((__u64)key, mask);
+#else
+	return cfs_hash_u32_hash((__u32)key, mask);
+#endif
+}
+
+/*
+ * Maps hash linkage \a hn back to the cl_env containing it, checking the
+ * magic on the way.
+ */
+static void *cl_env_hops_obj(struct hlist_node *hn)
+{
+	struct cl_env *cle;
+
+	cle = hlist_entry(hn, struct cl_env, ce_node);
+	LASSERT(cle->ce_magic == &cl_env_init0);
+	return cle;
+}
+
+/*
+ * Key comparison callback: non-zero iff the environment linked through
+ * \a hn is owned by \a key. The environment must have an owner.
+ */
+static int cl_env_hops_keycmp(const void *key, struct hlist_node *hn)
+{
+	struct cl_env *cle;
+
+	cle = cl_env_hops_obj(hn);
+	LASSERT(cle->ce_owner != NULL);
+	return cle->ce_owner == key;
+}
+
+/*
+ * Get/put callback of cl_env_hash. No reference is taken or dropped; the
+ * only thing done is a magic check on the environment.
+ */
+static void cl_env_hops_noop(cfs_hash_t *hs, struct hlist_node *hn)
+{
+	struct cl_env *cle = hlist_entry(hn, struct cl_env, ce_node);
+	LASSERT(cle->ce_magic == &cl_env_init0);
+}
+
+/*
+ * Operations of cl_env_hash.
+ *
+ * NOTE(review): hs_key is served by cl_env_hops_obj(), i.e. it yields the
+ * cl_env itself rather than cl_env::ce_owner, which is what is hashed and
+ * compared elsewhere -- confirm that hs_key is never relied upon.
+ */
+static cfs_hash_ops_t cl_env_hops = {
+	.hs_hash	= cl_env_hops_hash,
+	.hs_key	 = cl_env_hops_obj,
+	.hs_keycmp      = cl_env_hops_keycmp,
+	.hs_object      = cl_env_hops_obj,
+	.hs_get	 = cl_env_hops_noop,
+	.hs_put_locked  = cl_env_hops_noop,
+};
+
+/*
+ * Looks up the environment attached to the current thread, using the pid
+ * of current as the key. Returns NULL when nothing is attached.
+ */
+static inline struct cl_env *cl_env_fetch(void)
+{
+	void *key = (void *) (long) current->pid;
+	struct cl_env *cle;
+
+	cle = cfs_hash_lookup(cl_env_hash, key);
+	LASSERT(ergo(cle, cle->ce_magic == &cl_env_init0));
+	return cle;
+}
+
+/*
+ * Attaches \a cle to the current thread: records the pid of current as the
+ * owner and enters the environment into cl_env_hash under that key. The
+ * environment must not have an owner yet. NULL \a cle is a no-op.
+ */
+static inline void cl_env_attach(struct cl_env *cle)
+{
+	int rc;
+
+	if (cle == NULL)
+		return;
+
+	LASSERT(cle->ce_owner == NULL);
+	cle->ce_owner = (void *) (long) current->pid;
+	rc = cfs_hash_add_unique(cl_env_hash, cle->ce_owner, &cle->ce_node);
+	LASSERT(rc == 0);
+}
+
+/*
+ * Detaches \a cle from the current thread: removes it from cl_env_hash and
+ * clears the owner. \a cle must be owned by the current thread, and the
+ * entry removed from the hash must be \a cle itself.
+ */
+static inline void cl_env_do_detach(struct cl_env *cle)
+{
+	void *cookie;
+
+	LASSERT(cle->ce_owner == (void *) (long) current->pid);
+	cookie = cfs_hash_del(cl_env_hash, cle->ce_owner,
+			      &cle->ce_node);
+	LASSERT(cookie == cle);
+	cle->ce_owner = NULL;
+}
+
+/*
+ * Creates cl_env_hash.
+ *
+ * \retval 0 on success
+ * \retval -ENOMEM if the hash could not be created
+ */
+static int cl_env_store_init(void)
+{
+	cl_env_hash = cfs_hash_create("cl_env",
+				      HASH_CL_ENV_BITS, HASH_CL_ENV_BITS,
+				      HASH_CL_ENV_BKT_BITS, 0,
+				      CFS_HASH_MIN_THETA,
+				      CFS_HASH_MAX_THETA,
+				      &cl_env_hops,
+				      CFS_HASH_RW_BKTLOCK);
+	if (cl_env_hash == NULL)
+		return -ENOMEM;
+	return 0;
+}
+
+/* Drops the reference on cl_env_hash. Dual to cl_env_store_init(). */
+static void cl_env_store_fini(void) {
+	cfs_hash_putref(cl_env_hash);
+}
+
+
+/*
+ * Detaches an environment from the current thread, if it is attached.
+ *
+ * With NULL \a cle the environment currently attached to the thread (if
+ * any) is used. Returns the environment that was considered, which may be
+ * NULL.
+ */
+static inline struct cl_env *cl_env_detach(struct cl_env *cle)
+{
+	/* no explicit environment: take the one of the current thread */
+	if (!cle)
+		cle = cl_env_fetch();
+
+	/* an environment without owner is not in the hash */
+	if (cle != NULL && cle->ce_owner != NULL)
+		cl_env_do_detach(cle);
+
+	return cle;
+}
+
+/*
+ * Allocates and initializes a new client environment.
+ *
+ * The lu_env is initialized with LCT_CL_THREAD plus \a ctx_tags, the
+ * session context with LCT_SESSION plus \a ses_tags; the session is entered
+ * and installed into the environment, and the first reference is taken.
+ * \a debug is remembered in cl_env::ce_debug.
+ *
+ * Returns the embedded lu_env, or ERR_PTR(-ENOMEM) on allocation failure,
+ * or ERR_PTR() of the failing initialization result; on failure nothing is
+ * left allocated.
+ */
+static struct lu_env *cl_env_new(__u32 ctx_tags, __u32 ses_tags, void *debug)
+{
+	struct lu_env *env;
+	struct cl_env *cle;
+	int rc;
+
+	OBD_SLAB_ALLOC_PTR_GFP(cle, cl_env_kmem, __GFP_IO);
+	if (cle == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	INIT_LIST_HEAD(&cle->ce_linkage);
+	cle->ce_magic = &cl_env_init0;
+	env = &cle->ce_lu;
+
+	rc = lu_env_init(env, LCT_CL_THREAD|ctx_tags);
+	if (rc != 0)
+		goto out_free;
+
+	rc = lu_context_init(&cle->ce_ses, LCT_SESSION | ses_tags);
+	if (rc != 0) {
+		/* undo lu_env_init() before releasing the memory */
+		lu_env_fini(env);
+		goto out_free;
+	}
+
+	lu_context_enter(&cle->ce_ses);
+	env->le_ses = &cle->ce_ses;
+	cl_env_init0(cle, debug);
+	CL_ENV_INC(create);
+	CL_ENV_INC(total);
+	return env;
+
+out_free:
+	OBD_SLAB_FREE_PTR(cle, cl_env_kmem);
+	return ERR_PTR(rc);
+}
+
+/*
+ * Destroys environment \a cle: finalizes the context of the embedded
+ * lu_env and the session context, then returns the memory to cl_env_kmem.
+ */
+static void cl_env_fini(struct cl_env *cle)
+{
+	CL_ENV_DEC(total);
+	lu_context_fini(&cle->ce_lu.le_ctx);
+	lu_context_fini(&cle->ce_ses);
+	OBD_SLAB_FREE_PTR(cle, cl_env_kmem);
+}
+
+/*
+ * Converts \a env back to the cl_env it is embedded in. \a env has to be
+ * the ce_lu member of a struct cl_env.
+ */
+static inline struct cl_env *cl_env_container(struct lu_env *env)
+{
+	return container_of(env, struct cl_env, ce_lu);
+}
+
+/**
+ * Returns the environment attached to the current thread, or NULL if there
+ * is none.
+ *
+ * When an environment is found, an additional reference is taken on it and
+ * the new reference count is stored in \a refcheck; otherwise \a refcheck
+ * is left untouched.
+ *
+ * \see cl_env_get(), cl_env_put()
+ */
+struct lu_env *cl_env_peek(int *refcheck)
+{
+	struct lu_env *env = NULL;
+	struct cl_env *cle;
+
+	CL_ENV_INC(lookup);
+
+	/* check that we don't go far from untrusted pointer */
+	CLASSERT(offsetof(struct cl_env, ce_magic) == 0);
+
+	cle = cl_env_fetch();
+	if (cle) {
+		CL_ENV_INC(hit);
+		*refcheck = ++cle->ce_ref;
+		env = &cle->ce_lu;
+	}
+	CDEBUG(D_OTHER, "%d@%p\n", cle ? cle->ce_ref : 0, cle);
+	return env;
+}
+EXPORT_SYMBOL(cl_env_peek);
+
+/**
+ * Returns lu_env: if there already is an environment associated with the
+ * current thread, it is returned (with an additional reference), otherwise
+ * a new environment is allocated with the default tags and attached to the
+ * current thread. On allocation failure an ERR_PTR() value is returned.
+ *
+ * \param refcheck pointer to a counter used to detect environment leaks. In
+ * the usual case cl_env_get() and cl_env_put() are called in the same lexical
+ * scope and pointer to the same integer is passed as \a refcheck. This is
+ * used to detect missed cl_env_put().
+ *
+ * \see cl_env_put()
+ */
+struct lu_env *cl_env_get(int *refcheck)
+{
+	struct lu_env *env;
+	struct cl_env *cle;
+
+	/* the thread may already have an environment */
+	env = cl_env_peek(refcheck);
+	if (env != NULL)
+		return env;
+
+	env = cl_env_new(lu_context_tags_default, lu_session_tags_default,
+			 __builtin_return_address(0));
+	if (IS_ERR(env))
+		return env;
+
+	cle = cl_env_container(env);
+	cl_env_attach(cle);
+	*refcheck = cle->ce_ref;
+	CDEBUG(D_OTHER, "%d@%p\n", cle->ce_ref, cle);
+	return env;
+}
+EXPORT_SYMBOL(cl_env_get);
+
+/**
+ * Forces an allocation of a fresh environment, using \a tags both as
+ * context and as session tags.
+ *
+ * The current thread must not have an environment attached. Unlike
+ * cl_env_get(), the new environment is not attached to the thread. On
+ * success the reference count is stored in \a refcheck; on failure an
+ * ERR_PTR() value is returned.
+ *
+ * \see cl_env_get()
+ */
+struct lu_env *cl_env_alloc(int *refcheck, __u32 tags)
+{
+	struct lu_env *env;
+	struct cl_env *cle;
+
+	LASSERT(cl_env_peek(refcheck) == NULL);
+	env = cl_env_new(tags, tags, __builtin_return_address(0));
+	if (IS_ERR(env))
+		return env;
+
+	cle = cl_env_container(env);
+	*refcheck = cle->ce_ref;
+	CDEBUG(D_OTHER, "%d@%p\n", cle->ce_ref, cle);
+	return env;
+}
+EXPORT_SYMBOL(cl_env_alloc);
+
+/*
+ * Exits both contexts of \a cle: the context of the embedded lu_env and
+ * the session context. The environment must already be detached (no owner).
+ */
+static void cl_env_exit(struct cl_env *cle)
+{
+	LASSERT(cle->ce_owner == NULL);
+	lu_context_exit(&cle->ce_lu.le_ctx);
+	lu_context_exit(&cle->ce_ses);
+}
+
+/**
+ * Release an environment.
+ *
+ * Decrement \a env reference counter. When counter drops to 0, nothing in
+ * this thread is using environment: it is detached from the thread, its
+ * contexts are exited and it is freed.
+ *
+ * \param refcheck if not NULL, has to point to the value that the matching
+ * cl_env_get() (or similar) call stored; this is asserted.
+ */
+void cl_env_put(struct lu_env *env, int *refcheck)
+{
+	struct cl_env *cle = cl_env_container(env);
+
+	LASSERT(cle->ce_ref > 0);
+	LASSERT(ergo(refcheck != NULL, cle->ce_ref == *refcheck));
+
+	CDEBUG(D_OTHER, "%d@%p\n", cle->ce_ref, cle);
+	if (--cle->ce_ref != 0)
+		return;
+
+	/* last reference is gone: tear the environment down */
+	CL_ENV_DEC(busy);
+	cl_env_detach(cle);
+	cle->ce_debug = NULL;
+	cl_env_exit(cle);
+	cl_env_fini(cle);
+}
+EXPORT_SYMBOL(cl_env_put);
+
+/**
+ * Declares a point of re-entrancy.
+ *
+ * Detaches the environment currently attached to the thread (if any) and
+ * returns it as an opaque cookie, to be handed to cl_env_reexit() later.
+ * The cookie is NULL when no environment was attached.
+ *
+ * \see cl_env_reexit()
+ */
+void *cl_env_reenter(void)
+{
+	return cl_env_detach(NULL);
+}
+EXPORT_SYMBOL(cl_env_reenter);
+
+/**
+ * Exits re-entrancy.
+ *
+ * Detaches whatever environment is attached to the current thread at this
+ * point, then re-attaches the environment saved in \a cookie by
+ * cl_env_reenter(). NULL \a cookie means there is nothing to re-attach.
+ */
+void cl_env_reexit(void *cookie)
+{
+	cl_env_detach(NULL);
+	cl_env_attach(cookie);
+}
+EXPORT_SYMBOL(cl_env_reexit);
+
+/**
+ * Setup user-supplied \a env as a current environment. This is to be used to
+ * guarantee that environment exists even when cl_env_get() fails. It is up
+ * to user to ensure proper concurrency control.
+ *
+ * \a env must already be referenced. It is attached to the current thread
+ * and an additional reference is taken, with the new reference count stored
+ * in \a refcheck.
+ *
+ * \see cl_env_unplant()
+ */
+void cl_env_implant(struct lu_env *env, int *refcheck)
+{
+	struct cl_env *cle = cl_env_container(env);
+
+	LASSERT(cle->ce_ref > 0);
+
+	cl_env_attach(cle);
+	/*
+	 * The environment has just been attached, so this finds it through
+	 * the per-thread lookup and only takes a reference; the returned
+	 * pointer is not needed.
+	 */
+	cl_env_get(refcheck);
+	CDEBUG(D_OTHER, "%d@%p\n", cle->ce_ref, cle);
+}
+EXPORT_SYMBOL(cl_env_implant);
+
+/**
+ * Detach environment installed earlier by cl_env_implant().
+ *
+ * \a env has to hold more than one reference (the one of the owner plus
+ * the one taken by cl_env_implant()). The environment is detached from the
+ * current thread and one reference is released, checked against
+ * \a refcheck.
+ */
+void cl_env_unplant(struct lu_env *env, int *refcheck)
+{
+	struct cl_env *cle = cl_env_container(env);
+
+	LASSERT(cle->ce_ref > 1);
+
+	CDEBUG(D_OTHER, "%d@%p\n", cle->ce_ref, cle);
+
+	cl_env_detach(cle);
+	cl_env_put(env, refcheck);
+}
+EXPORT_SYMBOL(cl_env_unplant);
+
+/**
+ * Returns an environment in which no IO is going on (as reported by
+ * cl_io_is_going()), recording in \a nest what cl_env_nested_put() needs to
+ * undo.
+ *
+ * If the environment of the current thread qualifies, it is returned with
+ * an additional reference. Otherwise that environment is set aside (saved
+ * in cl_env_nest::cen_cookie) and a new one is obtained through
+ * cl_env_get(); should that fail, the saved environment is restored and
+ * the ERR_PTR() value is returned.
+ */
+struct lu_env *cl_env_nested_get(struct cl_env_nest *nest)
+{
+	struct lu_env *env;
+
+	nest->cen_cookie = NULL;
+	env = cl_env_peek(&nest->cen_refcheck);
+	if (env != NULL) {
+		/* current environment is usable as is */
+		if (!cl_io_is_going(env))
+			return env;
+
+		/* drop the reference just taken and set the env aside */
+		cl_env_put(env, &nest->cen_refcheck);
+		nest->cen_cookie = cl_env_reenter();
+	}
+
+	env = cl_env_get(&nest->cen_refcheck);
+	if (!IS_ERR(env)) {
+		LASSERT(!cl_io_is_going(env));
+	} else {
+		cl_env_reexit(nest->cen_cookie);
+	}
+	return env;
+}
+EXPORT_SYMBOL(cl_env_nested_get);
+
+/**
+ * Releases environment \a env obtained by cl_env_nested_get() and restores
+ * the environment that was set aside in \a nest, if any.
+ */
+void cl_env_nested_put(struct cl_env_nest *nest, struct lu_env *env)
+{
+	cl_env_put(env, &nest->cen_refcheck);
+	cl_env_reexit(nest->cen_cookie);
+}
+EXPORT_SYMBOL(cl_env_nested_put);
+
+/**
+ * Converts struct cl_attr to struct ost_lvb.
+ *
+ * Copies size, mtime, atime, ctime and blocks from \a attr into \a lvb;
+ * other fields of \a lvb are not touched.
+ *
+ * \see cl_lvb2attr
+ */
+void cl_attr2lvb(struct ost_lvb *lvb, const struct cl_attr *attr)
+{
+	ENTRY;
+	lvb->lvb_size   = attr->cat_size;
+	lvb->lvb_mtime  = attr->cat_mtime;
+	lvb->lvb_atime  = attr->cat_atime;
+	lvb->lvb_ctime  = attr->cat_ctime;
+	lvb->lvb_blocks = attr->cat_blocks;
+	EXIT;
+}
+EXPORT_SYMBOL(cl_attr2lvb);
+
+/**
+ * Converts struct ost_lvb to struct cl_attr.
+ *
+ * Copies size, mtime, atime, ctime and blocks from \a lvb into \a attr;
+ * other fields of \a attr are not touched.
+ *
+ * \see cl_attr2lvb
+ */
+void cl_lvb2attr(struct cl_attr *attr, const struct ost_lvb *lvb)
+{
+	ENTRY;
+	attr->cat_size   = lvb->lvb_size;
+	attr->cat_mtime  = lvb->lvb_mtime;
+	attr->cat_atime  = lvb->lvb_atime;
+	attr->cat_ctime  = lvb->lvb_ctime;
+	attr->cat_blocks = lvb->lvb_blocks;
+	EXIT;
+}
+EXPORT_SYMBOL(cl_lvb2attr);
+
+/*****************************************************************************
+ *
+ * Temporary prototype thing: mirror obd-devices into cl devices.
+ *
+ */
+
+/**
+ * Allocates and initializes a device of type \a ldt on top of \a next.
+ *
+ * The device is allocated through lu_device_type_operations of \a ldt; if
+ * \a site is not NULL it is installed as the site of the new device before
+ * the device is initialized. On successful initialization a reference is
+ * taken on the device. If initialization fails the device is freed again.
+ *
+ * Returns the device converted with lu2cl_dev(); on failure this is the
+ * conversion of an ERR_PTR() value.
+ */
+struct cl_device *cl_type_setup(const struct lu_env *env, struct lu_site *site,
+				struct lu_device_type *ldt,
+				struct lu_device *next)
+{
+	const char       *typename;
+	struct lu_device *d;
+	int rc;
+
+	LASSERT(ldt != NULL);
+
+	typename = ldt->ldt_name;
+	d = ldt->ldt_ops->ldto_device_alloc(env, ldt, NULL);
+	if (IS_ERR(d)) {
+		CERROR("Cannot allocate device: '%s'\n", typename);
+		return lu2cl_dev(d);
+	}
+
+	if (site != NULL)
+		d->ld_site = site;
+
+	rc = ldt->ldt_ops->ldto_device_init(env, d, typename, next);
+	if (rc != 0) {
+		ldt->ldt_ops->ldto_device_free(env, d);
+		CERROR("can't init device '%s', %d\n", typename, rc);
+		d = ERR_PTR(rc);
+		return lu2cl_dev(d);
+	}
+
+	lu_device_get(d);
+	lu_ref_add(&d->ld_reference, "lu-stack", &lu_site_init);
+	return lu2cl_dev(d);
+}
+EXPORT_SYMBOL(cl_type_setup);
+
+/**
+ * Finalize device stack by calling lu_stack_fini() on the lu_device of
+ * \a cl.
+ */
+void cl_stack_fini(const struct lu_env *env, struct cl_device *cl)
+{
+	lu_stack_fini(env, cl2lu_dev(cl));
+}
+EXPORT_SYMBOL(cl_stack_fini);
+
+/*
+ * Initializers and finalizers of the lock and page parts; they are called
+ * from cl_global_init() and cl_global_fini().
+ */
+int  cl_lock_init(void);
+void cl_lock_fini(void);
+
+int  cl_page_init(void);
+void cl_page_fini(void);
+
+/* Forward declaration: cl_env_info() refers to the key before it is defined. */
+static struct lu_context_key cl_key;
+
+/**
+ * Returns the cl_thread_info stored under cl_key in the context of \a env.
+ */
+struct cl_thread_info *cl_env_info(const struct lu_env *env)
+{
+	return lu_context_key_get(&env->le_ctx, &cl_key);
+}
+
+/*
+ * defines cl0_key_{init,fini}() for struct cl_thread_info; they are wrapped
+ * by cl_key_init() and cl_key_fini() below
+ */
+LU_KEY_INIT_FINI(cl0, struct cl_thread_info);
+
+/*
+ * lct_init method of cl_key: obtains the per-context cl_thread_info from
+ * cl0_key_init() and, unless that failed, initializes the lu_ref of every
+ * counter set in it. An error from cl0_key_init() is passed through.
+ */
+static void *cl_key_init(const struct lu_context *ctx,
+			 struct lu_context_key *key)
+{
+	struct cl_thread_info *info = cl0_key_init(ctx, key);
+	int i;
+
+	if (IS_ERR(info))
+		return info;
+
+	for (i = 0; i < ARRAY_SIZE(info->clt_counters); ++i)
+		lu_ref_init(&info->clt_counters[i].ctc_locks_locked);
+	return info;
+}
+
+/*
+ * lct_fini method of cl_key: finalizes the lu_ref of every counter set in
+ * the cl_thread_info passed as \a data, then hands the value over to
+ * cl0_key_fini().
+ */
+static void cl_key_fini(const struct lu_context *ctx,
+			struct lu_context_key *key, void *data)
+{
+	struct cl_thread_info *info = data;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(info->clt_counters); ++i)
+		lu_ref_fini(&info->clt_counters[i].ctc_locks_locked);
+	cl0_key_fini(ctx, key, info);
+}
+
+/*
+ * lct_exit method of cl_key: checks that every counter in the
+ * cl_thread_info passed as \a data is back to zero when the context is
+ * exited, and re-initializes the lu_ref of each counter set.
+ */
+static void cl_key_exit(const struct lu_context *ctx,
+			struct lu_context_key *key, void *data)
+{
+	struct cl_thread_info *info = data;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(info->clt_counters); ++i) {
+		LASSERT(info->clt_counters[i].ctc_nr_held == 0);
+		LASSERT(info->clt_counters[i].ctc_nr_used == 0);
+		LASSERT(info->clt_counters[i].ctc_nr_locks_acquired == 0);
+		LASSERT(info->clt_counters[i].ctc_nr_locks_locked == 0);
+		/* fini followed by init leaves a fresh lu_ref behind */
+		lu_ref_fini(&info->clt_counters[i].ctc_locks_locked);
+		lu_ref_init(&info->clt_counters[i].ctc_locks_locked);
+	}
+}
+
+/*
+ * Context key under which cl_thread_info is kept, see cl_env_info(). It is
+ * tagged LCT_CL_THREAD, the tag cl_env_new() always adds to the context of
+ * a new environment. Registered by cl_global_init().
+ */
+static struct lu_context_key cl_key = {
+	.lct_tags = LCT_CL_THREAD,
+	.lct_init = cl_key_init,
+	.lct_fini = cl_key_fini,
+	.lct_exit = cl_key_exit
+};
+
+/*
+ * Descriptors of the kmem caches owned by this file, passed to
+ * lu_kmem_init() and lu_kmem_fini(). The only cache is the one for
+ * struct cl_env; the array ends with an entry whose ckd_cache is NULL.
+ */
+static struct lu_kmem_descr cl_object_caches[] = {
+	{
+		.ckd_cache = &cl_env_kmem,
+		.ckd_name  = "cl_env_kmem",
+		.ckd_size  = sizeof (struct cl_env)
+	},
+	{
+		.ckd_cache = NULL
+	}
+};
+
+/**
+ * Global initialization of cl-data. Create kmem caches, register
+ * lu_context_key's, etc.
+ *
+ * Steps are, in order: environment hash, kmem caches of this file, cl_key
+ * registration, lock part, page part. If a step fails, the steps already
+ * completed are undone in reverse order and the error is returned.
+ *
+ * \retval 0 on success, otherwise the result of the failing step
+ *
+ * \see cl_global_fini()
+ */
+int cl_global_init(void)
+{
+	int result;
+
+	result = cl_env_store_init();
+	if (result)
+		return result;
+
+	result = lu_kmem_init(cl_object_caches);
+	if (result)
+		goto out_store;
+
+	LU_CONTEXT_KEY_INIT(&cl_key);
+	result = lu_context_key_register(&cl_key);
+	if (result)
+		goto out_kmem;
+
+	result = cl_lock_init();
+	if (result)
+		goto out_context;
+
+	result = cl_page_init();
+	if (result)
+		goto out_lock;
+
+	return 0;
+	/* error unwinding: each label undoes the step preceding the failure */
+out_lock:
+	cl_lock_fini();
+out_context:
+	lu_context_key_degister(&cl_key);
+out_kmem:
+	lu_kmem_fini(cl_object_caches);
+out_store:
+	cl_env_store_fini();
+	return result;
+}
+
+/**
+ * Finalization of global cl-data. Dual to cl_global_init().
+ *
+ * NOTE(review): the lock part is finalized before the page part here, which
+ * is not the exact reverse of the order used by cl_global_init() -- confirm
+ * that the two are independent.
+ */
+void cl_global_fini(void)
+{
+	cl_lock_fini();
+	cl_page_fini();
+	lu_context_key_degister(&cl_key);
+	lu_kmem_fini(cl_object_caches);
+	cl_env_store_fini();
+}

diff --git a/drivers/staging/lustre/lustre/obdclass/cl_page.c b/drivers/staging/lustre/lustre/obdclass/cl_page.c
new file mode 100644
index 0000000..bb93359
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/cl_page.c

@@ -0,0 +1,1605 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Client Lustre Page.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/libcfs/libcfs.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <linux/list.h>
+
+#include <cl_object.h>
+#include "cl_internal.h"
+
+static void cl_page_delete0(const struct lu_env *env, struct cl_page *pg,
+			    int radix);
+
+/*
+ * Page assertion. When \a expr is false, the page is dumped through
+ * CL_PAGE_DEBUG() at D_ERROR level together with the text of the failed
+ * expression, and then LASSERT(0) is triggered.
+ */
+# define PASSERT(env, page, expr)				       \
+  do {								    \
+	  if (unlikely(!(expr))) {				      \
+		  CL_PAGE_DEBUG(D_ERROR, (env), (page), #expr "\n");    \
+		  LASSERT(0);					   \
+	  }							     \
+  } while (0)
+
+/*
+ * Page invariant check, compiled out in this build: the arguments appear
+ * only as sizeof operands, so they are parsed and type-checked by the
+ * compiler but never evaluated at run time.
+ */
+# define PINVRNT(env, page, exp) \
+	((void)sizeof(env), (void)sizeof(page), (void)sizeof !!(exp))
+
+/*
+ * Page statistics are disabled by default because of their huge performance
+ * penalty: all four accounting macros expand to nothing, so their arguments
+ * are never evaluated.
+ */
+#define CS_PAGE_INC(o, item)
+#define CS_PAGE_DEC(o, item)
+#define CS_PAGESTATE_INC(o, state)
+#define CS_PAGESTATE_DEC(o, state)
+
+/**
+ * Internal version of cl_page_top(). It may only be called when the page is
+ * known not to be freed concurrently, e.g., because the caller holds a
+ * reference on the page, holds the radix tree lock, or owns the page.
+ *
+ * Follows cl_page::cp_parent links upwards and returns the page that has no
+ * parent.
+ */
+static struct cl_page *cl_page_top_trusted(struct cl_page *page)
+{
+	struct cl_page *top;
+
+	for (top = page; top->cp_parent != NULL; top = top->cp_parent)
+		; /* climb to the root of the parent chain */
+	return top;
+}
+
+/**
+ * Internal version of cl_page_get().
+ *
+ * This function can be used to obtain initial reference to previously
+ * unreferenced cached object. It can be called only if concurrent page
+ * reclamation is somehow prevented, e.g., by locking page radix-tree
+ * (cl_object_header::hdr->coh_page_guard), or by keeping a lock on a VM page,
+ * associated with \a page.
+ *
+ * Asserts that the reference counter is already positive, then increments
+ * it atomically.
+ *
+ * Use with care! Not exported.
+ */
+static void cl_page_get_trust(struct cl_page *page)
+{
+	LASSERT(atomic_read(&page->cp_ref) > 0);
+	atomic_inc(&page->cp_ref);
+}
+
+/**
+ * Finds the slice of a page that belongs to the layer with device type
+ * \a dtype.
+ *
+ * The search starts at the top page of \a page and proceeds downwards along
+ * cl_page::cp_child, scanning the layers of every page on the way.
+ *
+ * \retval slice  first slice whose object's device has type \a dtype
+ * \retval NULL   no such slice in the whole page stack
+ *
+ * \see cl_lock_at()
+ */
+static const struct cl_page_slice *
+cl_page_at_trusted(const struct cl_page *page,
+		   const struct lu_device_type *dtype)
+{
+	const struct cl_page_slice *layer;
+	const struct cl_page *cur;
+	ENTRY;
+
+	for (cur = cl_page_top_trusted((struct cl_page *)page); cur != NULL;
+	     cur = cur->cp_child) {
+		list_for_each_entry(layer, &cur->cp_layers, cpl_linkage) {
+			if (layer->cpl_obj->co_lu.lo_dev->ld_type == dtype)
+				RETURN(layer);
+		}
+	}
+	RETURN(NULL);
+}
+
+/**
+ * Looks up the page with index \a index in the radix tree of \a hdr.
+ *
+ * On success a reference is acquired on the page before it is returned.
+ *
+ * Locking: must be called with cl_object_header::coh_page_guard held
+ * (asserted).
+ *
+ * \retval page  the page found, with an additional reference
+ * \retval NULL  no page at \a index
+ */
+struct cl_page *cl_page_lookup(struct cl_object_header *hdr, pgoff_t index)
+{
+	struct cl_page *found;
+
+	LASSERT(spin_is_locked(&hdr->coh_page_guard));
+
+	found = radix_tree_lookup(&hdr->coh_tree, index);
+	if (found == NULL)
+		return NULL;
+
+	cl_page_get_trust(found);
+	return found;
+}
+EXPORT_SYMBOL(cl_page_lookup);
+
+/**
+ * Walks the cached pages of \a obj with indices in [start, end] and invokes
+ * \a cb on each of them.
+ *
+ * Pages are gathered in batches of up to CLT_PVEC_SIZE by
+ * radix_tree_gang_lookup() under cl_object_header::coh_page_guard. Pages in
+ * CPS_FREEING state are skipped. For every remaining page the slice matching
+ * the device type of the top object is located, a reference is taken on that
+ * slice's page, and, after the spin-lock is dropped, the callback is called
+ * as (*cb)(env, io, page, cbdata). The reference is released after the
+ * callback returns.
+ *
+ * Gang tree lookup (radix_tree_gang_lookup()) optimization is absolutely
+ * crucial in the face of [offset, EOF] locks.
+ *
+ * \retval CLP_GANG_OKAY     the whole range was processed
+ * \retval CLP_GANG_RESCHED  gave up after a full batch because
+ *			     need_resched() was true; the caller should
+ *			     implement a retry logic
+ * \retval other	     the first value other than CLP_GANG_OKAY
+ *			     returned by \a cb; once that happens the
+ *			     callback is not invoked for further pages
+ */
+int cl_page_gang_lookup(const struct lu_env *env, struct cl_object *obj,
+			struct cl_io *io, pgoff_t start, pgoff_t end,
+			cl_page_gang_cb_t cb, void *cbdata)
+{
+	struct cl_object_header *hdr;
+	struct cl_page	  *page;
+	struct cl_page	 **pvec;
+	const struct cl_page_slice  *slice;
+	const struct lu_device_type *dtype;
+	pgoff_t		  idx;
+	unsigned int	     nr;
+	unsigned int	     i;
+	unsigned int	     j;
+	int		      res = CLP_GANG_OKAY;
+	int		      tree_lock = 1;
+	ENTRY;
+
+	idx = start;
+	hdr = cl_object_header(obj);
+	pvec = cl_env_info(env)->clt_pvec;
+	dtype = cl_object_top(obj)->co_lu.lo_dev->ld_type;
+	spin_lock(&hdr->coh_page_guard);
+	while ((nr = radix_tree_gang_lookup(&hdr->coh_tree, (void **)pvec,
+					    idx, CLT_PVEC_SIZE)) > 0) {
+		int end_of_region = 0;
+		/* the next batch starts right after the last page found */
+		idx = pvec[nr - 1]->cp_index + 1;
+		/* i scans the lookup result, j counts the pages kept; kept
+		 * pages are compacted to the front of pvec */
+		for (i = 0, j = 0; i < nr; ++i) {
+			page = pvec[i];
+			pvec[i] = NULL;
+
+			LASSERT(page->cp_type == CPT_CACHEABLE);
+			if (page->cp_index > end) {
+				end_of_region = 1;
+				break;
+			}
+			if (page->cp_state == CPS_FREEING)
+				continue;
+
+			slice = cl_page_at_trusted(page, dtype);
+			/*
+			 * Pages for lsm-less file has no underneath sub-page
+			 * for osc, in case of ...
+			 */
+			PASSERT(env, page, slice != NULL);
+
+			page = slice->cpl_page;
+			/*
+			 * Can safely call cl_page_get_trust() under
+			 * radix-tree spin-lock.
+			 *
+			 * XXX not true, because @page is from object another
+			 * than @hdr and protected by different tree lock.
+			 */
+			cl_page_get_trust(page);
+			lu_ref_add_atomic(&page->cp_reference,
+					  "gang_lookup", current);
+			pvec[j++] = page;
+		}
+
+		/*
+		 * Here a delicate locking dance is performed. Current thread
+		 * holds a reference to a page, but has to own it before it
+		 * can be placed into queue. Owning implies waiting, so
+		 * radix-tree lock is to be released. After a wait one has to
+		 * check that pages weren't truncated (cl_page_own() returns
+		 * error in the latter case).
+		 */
+		spin_unlock(&hdr->coh_page_guard);
+		tree_lock = 0;
+
+		for (i = 0; i < j; ++i) {
+			page = pvec[i];
+			/* after the first failure the callback is skipped,
+			 * but the references are still dropped */
+			if (res == CLP_GANG_OKAY)
+				res = (*cb)(env, io, page, cbdata);
+			lu_ref_del(&page->cp_reference,
+				   "gang_lookup", current);
+			cl_page_put(env, page);
+		}
+		/* a short batch means the tree has no more pages */
+		if (nr < CLT_PVEC_SIZE || end_of_region)
+			break;
+
+		if (res == CLP_GANG_OKAY && need_resched())
+			res = CLP_GANG_RESCHED;
+		if (res != CLP_GANG_OKAY)
+			break;
+
+		spin_lock(&hdr->coh_page_guard);
+		tree_lock = 1;
+	}
+	/* the lock is still held only when the loop ended on its condition */
+	if (tree_lock)
+		spin_unlock(&hdr->coh_page_guard);
+	RETURN(res);
+}
+EXPORT_SYMBOL(cl_page_gang_lookup);
+
+/**
+ * Destroys a page: finalizes and unlinks all of its layer slices, drops the
+ * page's reference on its object, and frees the page memory.
+ *
+ * The page must be off any batch list, have no owner, no request and no
+ * parent, and be in CPS_FREEING state (all asserted).
+ */
+static void cl_page_free(const struct lu_env *env, struct cl_page *page)
+{
+	struct cl_object *obj  = page->cp_obj;
+	/* read the allocation size now, before the object reference is
+	 * dropped below */
+	int pagesize = cl_object_header(obj)->coh_page_bufsize;
+
+	PASSERT(env, page, list_empty(&page->cp_batch));
+	PASSERT(env, page, page->cp_owner == NULL);
+	PASSERT(env, page, page->cp_req == NULL);
+	PASSERT(env, page, page->cp_parent == NULL);
+	PASSERT(env, page, page->cp_state == CPS_FREEING);
+
+	ENTRY;
+	might_sleep();
+	/* pop slices from the head of the list until it is empty */
+	while (!list_empty(&page->cp_layers)) {
+		struct cl_page_slice *slice;
+
+		slice = list_entry(page->cp_layers.next,
+				       struct cl_page_slice, cpl_linkage);
+		list_del_init(page->cp_layers.next);
+		slice->cpl_ops->cpo_fini(env, slice);
+	}
+	CS_PAGE_DEC(obj, total);
+	CS_PAGESTATE_DEC(obj, page->cp_state);
+	lu_object_ref_del_at(&obj->co_lu, page->cp_obj_ref, "cl_page", page);
+	cl_object_put(env, obj);
+	lu_ref_fini(&page->cp_reference);
+	OBD_FREE(page, pagesize);
+	EXIT;
+}
+
+/**
+ * Low-level page state setter. This is the only place in the code where the
+ * cl_page::cp_state field is written; no transition checks are made here.
+ */
+static inline void cl_page_state_set_trust(struct cl_page *page,
+					   enum cl_page_state state)
+{
+	/* cp_state is declared const; write it through a non-const alias. */
+	enum cl_page_state *slot = (enum cl_page_state *)&page->cp_state;
+
+	*slot = state;
+}
+
+/**
+ * Allocates a new cl_page for object \a o at index \a ind and lets every
+ * layer of the object initialize its part of the page.
+ *
+ * The page starts in CPS_CACHED state with one reference; a CPT_CACHEABLE
+ * page gets a second reference on behalf of the radix tree. The page also
+ * takes a reference on \a o.
+ *
+ * If a layer's coo_page_init() method fails, the partially constructed page
+ * is torn down with cl_page_delete0() + cl_page_free() and the error is
+ * returned.
+ *
+ * \retval page		    newly allocated page
+ * \retval ERR_PTR(-ENOMEM) allocation failure
+ * \retval ERR_PTR(rc)	    non-zero result of a layer's coo_page_init()
+ */
+static struct cl_page *cl_page_alloc(const struct lu_env *env,
+		struct cl_object *o, pgoff_t ind, struct page *vmpage,
+		enum cl_page_type type)
+{
+	struct cl_page	  *page;
+	struct cl_object	*layer;
+	struct lu_object_header *head;
+	int			 result = 0;
+
+	ENTRY;
+	OBD_ALLOC_GFP(page, cl_object_header(o)->coh_page_bufsize,
+			__GFP_IO);
+	if (page == NULL) {
+		page = ERR_PTR(-ENOMEM);
+		RETURN(page);
+	}
+
+	atomic_set(&page->cp_ref, 1);
+	if (type == CPT_CACHEABLE) /* for radix tree */
+		atomic_inc(&page->cp_ref);
+	page->cp_obj = o;
+	cl_object_get(o);
+	page->cp_obj_ref = lu_object_ref_add(&o->co_lu, "cl_page", page);
+	page->cp_index = ind;
+	cl_page_state_set_trust(page, CPS_CACHED);
+	page->cp_type = type;
+	INIT_LIST_HEAD(&page->cp_layers);
+	INIT_LIST_HEAD(&page->cp_batch);
+	INIT_LIST_HEAD(&page->cp_flight);
+	mutex_init(&page->cp_mutex);
+	lu_ref_init(&page->cp_reference);
+	head = o->co_lu.lo_header;
+	/*
+	 * Iterate with a dedicated cursor: re-using @o here would leave it
+	 * pointing at the list head's pseudo-entry after a full traversal,
+	 * and @o is still needed for the accounting below.
+	 */
+	list_for_each_entry(layer, &head->loh_layers, co_lu.lo_linkage) {
+		if (layer->co_ops->coo_page_init == NULL)
+			continue;
+		result = layer->co_ops->coo_page_init(env, layer,
+						      page, vmpage);
+		if (result != 0) {
+			cl_page_delete0(env, page, 0);
+			cl_page_free(env, page);
+			page = ERR_PTR(result);
+			break;
+		}
+	}
+	if (result == 0) {
+		CS_PAGE_INC(o, total);
+		CS_PAGE_INC(o, create);
+		/* the page was created in CPS_CACHED state; cl_page_free()
+		 * decrements the counter of the state the page dies in */
+		CS_PAGESTATE_INC(o, CPS_CACHED);
+	}
+	RETURN(page);
+}
+
+/**
+ * Returns a cl_page with index \a idx at the object \a o, and associated with
+ * the VM page \a vmpage.
+ *
+ * This is the main entry point into the cl_page caching interface. First, a
+ * cache (implemented as a per-object radix tree) is consulted. If page is
+ * found there, it is returned immediately. Otherwise new page is allocated
+ * and returned. In any case, additional reference to page is acquired.
+ *
+ * \param type    CPT_CACHEABLE or CPT_TRANSIENT (asserted). Only cacheable
+ *		  pages are looked up through \a vmpage and inserted into
+ *		  the radix tree; transient pages are always freshly
+ *		  allocated.
+ * \param parent  if not NULL, the new page is linked under it as its child.
+ *
+ * \retval page	    existing or newly created page
+ * \retval ERR_PTR  allocation failure from cl_page_alloc(), or the error
+ *		    returned by radix_tree_insert()
+ *
+ * NOTE(review): \a vmpage is dereferenced unconditionally by the CDEBUG()
+ * below, so callers apparently must always pass a non-NULL VM page --
+ * confirm.
+ *
+ * \see cl_object_find(), cl_lock_find()
+ */
+static struct cl_page *cl_page_find0(const struct lu_env *env,
+				     struct cl_object *o,
+				     pgoff_t idx, struct page *vmpage,
+				     enum cl_page_type type,
+				     struct cl_page *parent)
+{
+	struct cl_page	  *page = NULL;
+	struct cl_page	  *ghost = NULL;
+	struct cl_object_header *hdr;
+	int err;
+
+	LASSERT(type == CPT_CACHEABLE || type == CPT_TRANSIENT);
+	might_sleep();
+
+	ENTRY;
+
+	hdr = cl_object_header(o);
+	CS_PAGE_INC(o, lookup);
+
+	CDEBUG(D_PAGE, "%lu@"DFID" %p %lx %d\n",
+	       idx, PFID(&hdr->coh_lu.loh_fid), vmpage, vmpage->private, type);
+	/* fast path. */
+	if (type == CPT_CACHEABLE) {
+		/* vmpage lock is used to protect the child/parent
+		 * relationship */
+		KLASSERT(PageLocked(vmpage));
+		/*
+		 * cl_vmpage_page() can be called here without any locks as
+		 *
+		 *     - "vmpage" is locked (which prevents ->private from
+		 *       concurrent updates), and
+		 *
+		 *     - "o" cannot be destroyed while current thread holds a
+		 *       reference on it.
+		 */
+		page = cl_vmpage_page(vmpage, o);
+		PINVRNT(env, page,
+			ergo(page != NULL,
+			     cl_page_vmpage(env, page) == vmpage &&
+			     (void *)radix_tree_lookup(&hdr->coh_tree,
+						       idx) == page));
+	}
+
+	/* cache hit: cl_vmpage_page() already took the reference */
+	if (page != NULL) {
+		CS_PAGE_INC(o, hit);
+		RETURN(page);
+	}
+
+	/* allocate and initialize cl_page */
+	page = cl_page_alloc(env, o, idx, vmpage, type);
+	if (IS_ERR(page))
+		RETURN(page);
+
+	/* transient pages are not inserted into the radix tree */
+	if (type == CPT_TRANSIENT) {
+		if (parent) {
+			LASSERT(page->cp_parent == NULL);
+			page->cp_parent = parent;
+			parent->cp_child = page;
+		}
+		RETURN(page);
+	}
+
+	/*
+	 * XXX optimization: use radix_tree_preload() here, and change tree
+	 * gfp mask to GFP_KERNEL in cl_object_header_init().
+	 */
+	spin_lock(&hdr->coh_page_guard);
+	err = radix_tree_insert(&hdr->coh_tree, idx, page);
+	if (err != 0) {
+		/* remember the page that failed to insert so that it can be
+		 * destroyed after the spin-lock is released */
+		ghost = page;
+		/*
+		 * Noted by Jay: a lock on \a vmpage protects cl_page_find()
+		 * from this race, but
+		 *
+		 *     0. it's better to have cl_page interface "locally
+		 *     consistent" so that its correctness can be reasoned
+		 *     about without appealing to the (obscure world of) VM
+		 *     locking.
+		 *
+		 *     1. handling this race allows ->coh_tree to remain
+		 *     consistent even when VM locking is somehow busted,
+		 *     which is very useful during diagnosing and debugging.
+		 */
+		page = ERR_PTR(err);
+		CL_PAGE_DEBUG(D_ERROR, env, ghost,
+			      "fail to insert into radix tree: %d\n", err);
+	} else {
+		if (parent) {
+			LASSERT(page->cp_parent == NULL);
+			page->cp_parent = parent;
+			parent->cp_child = page;
+		}
+		hdr->coh_pages++;
+	}
+	spin_unlock(&hdr->coh_page_guard);
+
+	if (unlikely(ghost != NULL)) {
+		/* radix == 0: the ghost never made it into the tree */
+		cl_page_delete0(env, ghost, 0);
+		cl_page_free(env, ghost);
+	}
+	RETURN(page);
+}
+
+/**
+ * Finds or creates the page with index \a idx of object \a o, associated
+ * with \a vmpage. Equivalent to cl_page_find0() with no parent page.
+ *
+ * \see cl_page_find0()
+ */
+struct cl_page *cl_page_find(const struct lu_env *env, struct cl_object *o,
+			     pgoff_t idx, struct page *vmpage,
+			     enum cl_page_type type)
+{
+	return cl_page_find0(env, o, idx, vmpage, type, NULL);
+}
+EXPORT_SYMBOL(cl_page_find);
+
+
+/**
+ * Finds or creates a sub-page of \a parent in object \a o. The page type is
+ * taken from \a parent, and a newly created page is linked under \a parent.
+ *
+ * \see cl_page_find0()
+ */
+struct cl_page *cl_page_find_sub(const struct lu_env *env, struct cl_object *o,
+				 pgoff_t idx, struct page *vmpage,
+				 struct cl_page *parent)
+{
+	return cl_page_find0(env, o, idx, vmpage, parent->cp_type, parent);
+}
+EXPORT_SYMBOL(cl_page_find_sub);
+
+/**
+ * Checks structural consistency of a page. Returns non-zero iff all of the
+ * following hold:
+ *
+ *     - the page is in use (cl_page_in_use());
+ *
+ *     - parent and child links are mutually consistent, and linked pages
+ *       belong to different objects;
+ *
+ *     - owners of linked pages are related through cl_io::ci_parent;
+ *
+ *     - a cacheable page that is not being freed is either in its object's
+ *       radix tree, or has neither parent nor child.
+ *
+ * NOTE(review): the owner/child check dereferences child->cp_owner without
+ * testing it for NULL; presumably an owned page implies an owned child --
+ * confirm.
+ */
+static inline int cl_page_invariant(const struct cl_page *pg)
+{
+	struct cl_object_header *header;
+	struct cl_page	  *parent;
+	struct cl_page	  *child;
+	struct cl_io	    *owner;
+
+	/*
+	 * Page invariant is protected by a VM lock.
+	 */
+	LINVRNT(cl_page_is_vmlocked(NULL, pg));
+
+	header = cl_object_header(pg->cp_obj);
+	parent = pg->cp_parent;
+	child  = pg->cp_child;
+	owner  = pg->cp_owner;
+
+	return cl_page_in_use(pg) &&
+		ergo(parent != NULL, parent->cp_child == pg) &&
+		ergo(child != NULL, child->cp_parent == pg) &&
+		ergo(child != NULL, pg->cp_obj != child->cp_obj) &&
+		ergo(parent != NULL, pg->cp_obj != parent->cp_obj) &&
+		ergo(owner != NULL && parent != NULL,
+		     parent->cp_owner == pg->cp_owner->ci_parent) &&
+		ergo(owner != NULL && child != NULL,
+		     child->cp_owner->ci_parent == owner) &&
+		/*
+		 * Either page is early in initialization (has neither child
+		 * nor parent yet), or it is in the object radix tree.
+		 */
+		ergo(pg->cp_state < CPS_FREEING && pg->cp_type == CPT_CACHEABLE,
+		     (void *)radix_tree_lookup(&header->coh_tree,
+					       pg->cp_index) == pg ||
+		     (child == NULL && parent == NULL));
+}
+
+/**
+ * Moves \a page and every page below it (along cl_page::cp_child) into
+ * state \a state.
+ *
+ * The transition is validated against a static table of allowed
+ * [old][new] pairs. Every page in the chain must be in the same old state,
+ * and must have an owner exactly when the new state is CPS_OWNED (all
+ * asserted).
+ */
+static void cl_page_state_set0(const struct lu_env *env,
+			       struct cl_page *page, enum cl_page_state state)
+{
+	enum cl_page_state old;
+
+	/*
+	 * Matrix of allowed state transitions [old][new], for sanity
+	 * checking.
+	 */
+	static const int allowed_transitions[CPS_NR][CPS_NR] = {
+		[CPS_CACHED] = {
+			[CPS_CACHED]  = 0,
+			[CPS_OWNED]   = 1, /* io finds existing cached page */
+			[CPS_PAGEIN]  = 0,
+			[CPS_PAGEOUT] = 1, /* write-out from the cache */
+			[CPS_FREEING] = 1, /* eviction on the memory pressure */
+		},
+		[CPS_OWNED] = {
+			[CPS_CACHED]  = 1, /* release to the cache */
+			[CPS_OWNED]   = 0,
+			[CPS_PAGEIN]  = 1, /* start read immediately */
+			[CPS_PAGEOUT] = 1, /* start write immediately */
+			[CPS_FREEING] = 1, /* lock invalidation or truncate */
+		},
+		[CPS_PAGEIN] = {
+			[CPS_CACHED]  = 1, /* io completion */
+			[CPS_OWNED]   = 0,
+			[CPS_PAGEIN]  = 0,
+			[CPS_PAGEOUT] = 0,
+			[CPS_FREEING] = 0,
+		},
+		[CPS_PAGEOUT] = {
+			[CPS_CACHED]  = 1, /* io completion */
+			[CPS_OWNED]   = 0,
+			[CPS_PAGEIN]  = 0,
+			[CPS_PAGEOUT] = 0,
+			[CPS_FREEING] = 0,
+		},
+		/* CPS_FREEING is terminal: no transition leaves it */
+		[CPS_FREEING] = {
+			[CPS_CACHED]  = 0,
+			[CPS_OWNED]   = 0,
+			[CPS_PAGEIN]  = 0,
+			[CPS_PAGEOUT] = 0,
+			[CPS_FREEING] = 0,
+		}
+	};
+
+	ENTRY;
+	old = page->cp_state;
+	PASSERT(env, page, allowed_transitions[old][state]);
+	CL_PAGE_HEADER(D_TRACE, env, page, "%d -> %d\n", old, state);
+	/* apply the new state to the page and to all of its descendants */
+	for (; page != NULL; page = page->cp_child) {
+		PASSERT(env, page, page->cp_state == old);
+		PASSERT(env, page,
+			equi(state == CPS_OWNED, page->cp_owner != NULL));
+
+		CS_PAGESTATE_DEC(page->cp_obj, page->cp_state);
+		CS_PAGESTATE_INC(page->cp_obj, state);
+		cl_page_state_set_trust(page, state);
+	}
+	EXIT;
+}
+
+/**
+ * Changes the state of \a page to \a state; simply forwards to
+ * cl_page_state_set0().
+ */
+static void cl_page_state_set(const struct lu_env *env,
+			      struct cl_page *page, enum cl_page_state state)
+{
+	cl_page_state_set0(env, page, state);
+}
+
+/**
+ * Acquires an additional reference to a page.
+ *
+ * This can be called only by caller already possessing a reference to \a
+ * page. Implemented through cl_page_get_trust(), which asserts that the
+ * reference counter is already positive.
+ *
+ * \see cl_object_get(), cl_lock_get().
+ */
+void cl_page_get(struct cl_page *page)
+{
+	ENTRY;
+	cl_page_get_trust(page);
+	EXIT;
+}
+EXPORT_SYMBOL(cl_page_get);
+
+/**
+ * Releases a reference to a page.
+ *
+ * When the last reference is released the page must already be in
+ * cl_page_state::CPS_FREEING state, have no owner and be on no batch list
+ * (all asserted); it is then destroyed with cl_page_free().
+ *
+ * \see cl_object_put(), cl_lock_put().
+ */
+void cl_page_put(const struct lu_env *env, struct cl_page *page)
+{
+	/* a page that has a parent must hold more than one reference on
+	 * entry; a top page must hold at least one */
+	PASSERT(env, page, atomic_read(&page->cp_ref) > !!page->cp_parent);
+
+	ENTRY;
+	CL_PAGE_HEADER(D_TRACE, env, page, "%d\n",
+		       atomic_read(&page->cp_ref));
+
+	if (atomic_dec_and_test(&page->cp_ref)) {
+		LASSERT(page->cp_state == CPS_FREEING);
+
+		LASSERT(atomic_read(&page->cp_ref) == 0);
+		PASSERT(env, page, page->cp_owner == NULL);
+		PASSERT(env, page, list_empty(&page->cp_batch));
+		/*
+		 * Page is no longer reachable by other threads. Tear
+		 * it down.
+		 */
+		cl_page_free(env, page);
+	}
+
+	EXIT;
+}
+EXPORT_SYMBOL(cl_page_put);
+
+/**
+ * Returns a VM page associated with a given cl_page.
+ *
+ * Starting from the top page of \a page and moving down along
+ * cl_page::cp_child, the first slice that defines a ->cpo_vmpage() method is
+ * asked for the VM page. Some layer in the stack has to define the method;
+ * LBUG() is hit otherwise.
+ */
+struct page *cl_page_vmpage(const struct lu_env *env, struct cl_page *page)
+{
+	const struct cl_page_slice *slice;
+
+	/* pairs with the RETURN() below so that entry/exit tracing is
+	 * balanced, as in the other functions of this file */
+	ENTRY;
+
+	/*
+	 * Find uppermost layer with ->cpo_vmpage() method, and return its
+	 * result.
+	 */
+	page = cl_page_top(page);
+	do {
+		list_for_each_entry(slice, &page->cp_layers, cpl_linkage) {
+			if (slice->cpl_ops->cpo_vmpage != NULL)
+				RETURN(slice->cpl_ops->cpo_vmpage(env, slice));
+		}
+		page = page->cp_child;
+	} while (page != NULL);
+	LBUG(); /* ->cpo_vmpage() has to be defined somewhere in the stack */
+}
+EXPORT_SYMBOL(cl_page_vmpage);
+
+/**
+ * Maps a VM page back to the cl_page that belongs to object \a obj.
+ *
+ * The VM page must be locked (asserted). The page stack hanging off
+ * vmpage->private is searched top-down for a page whose object is the same
+ * as \a obj; a reference is acquired on the page found.
+ *
+ * \retval page  matching page, with an additional reference
+ * \retval NULL  \a vmpage has no cl_page attached, or none of the attached
+ *		 pages belongs to \a obj
+ */
+struct cl_page *cl_vmpage_page(struct page *vmpage, struct cl_object *obj)
+{
+	struct cl_page *page;
+
+	ENTRY;
+	KLASSERT(PageLocked(vmpage));
+
+	/*
+	 * NOTE: the page lock on "vmpage" is what keeps the data alive and
+	 *       race-free here. That works because object destruction has
+	 *       bottom-to-top pass.
+	 */
+
+	/*
+	 * The walk below assumes that ->private points to the top-most page.
+	 * This can be rectified easily.
+	 */
+	page = (struct cl_page *)vmpage->private;
+	if (page == NULL)
+		RETURN(NULL);
+
+	/* descend until a page of the requested object is met */
+	while (page != NULL && !cl_object_same(page->cp_obj, obj))
+		page = page->cp_child;
+
+	if (page != NULL)
+		cl_page_get_trust(page);
+	LASSERT(ergo(page, page->cp_type == CPT_CACHEABLE));
+	RETURN(page);
+}
+EXPORT_SYMBOL(cl_vmpage_page);
+
+/**
+ * Returns the top-page for a given page, i.e., the page reached by following
+ * cl_page::cp_parent until it is NULL. Exported wrapper around
+ * cl_page_top_trusted().
+ *
+ * \see cl_object_top(), cl_io_top()
+ */
+struct cl_page *cl_page_top(struct cl_page *page)
+{
+	return cl_page_top_trusted(page);
+}
+EXPORT_SYMBOL(cl_page_top);
+
+/**
+ * Returns the slice of \a page that corresponds to the layer with device
+ * type \a dtype, or NULL if there is none. Exported wrapper around
+ * cl_page_at_trusted().
+ */
+const struct cl_page_slice *cl_page_at(const struct cl_page *page,
+				       const struct lu_device_type *dtype)
+{
+	return cl_page_at_trusted(page, dtype);
+}
+EXPORT_SYMBOL(cl_page_at);
+
+/*
+ * Byte offset of method \a opname within struct cl_page_operations. The
+ * CL_PAGE_INV*() macros use this offset to fetch the method pointer from a
+ * slice's operation vector.
+ */
+#define CL_PAGE_OP(opname) offsetof(struct cl_page_operations, opname)
+
+/*
+ * Calls the method found at byte offset \a _op in the operation vector of
+ * every slice, going from the top page down along cl_page::cp_child and, in
+ * each page, through the layers in list order. \a _proto is the
+ * parenthesized parameter list of the method; extra arguments are passed
+ * after the env and the slice.
+ *
+ * Slices that do not define the method are skipped. Iteration stops at the
+ * first non-zero result. The macro evaluates to that result, except that a
+ * positive result is converted to 0.
+ */
+#define CL_PAGE_INVOKE(_env, _page, _op, _proto, ...)		   \
+({								      \
+	const struct lu_env	*__env  = (_env);		    \
+	struct cl_page	     *__page = (_page);		   \
+	const struct cl_page_slice *__scan;			     \
+	int			 __result;			   \
+	ptrdiff_t		   __op   = (_op);		     \
+	int		       (*__method)_proto;		    \
+									\
+	__result = 0;						   \
+	__page = cl_page_top(__page);				   \
+	do {							    \
+		list_for_each_entry(__scan, &__page->cp_layers,     \
+					cpl_linkage) {		  \
+			__method = *(void **)((char *)__scan->cpl_ops + \
+					      __op);		    \
+			if (__method != NULL) {			 \
+				__result = (*__method)(__env, __scan,   \
+						       ## __VA_ARGS__); \
+				if (__result != 0)		      \
+					break;			  \
+			}					       \
+		}						       \
+		__page = __page->cp_child;			      \
+	} while (__page != NULL && __result == 0);		      \
+	if (__result > 0)					       \
+		__result = 0;					   \
+	__result;						       \
+})
+
+/*
+ * Void version of CL_PAGE_INVOKE(): calls the method at byte offset \a _op
+ * on every slice that defines it, from the top page down along
+ * cl_page::cp_child, visiting the layers of each page in list order. The
+ * methods return nothing and the iteration is never cut short.
+ */
+#define CL_PAGE_INVOID(_env, _page, _op, _proto, ...)		   \
+do {								    \
+	const struct lu_env	*__env  = (_env);		    \
+	struct cl_page	     *__page = (_page);		   \
+	const struct cl_page_slice *__scan;			     \
+	ptrdiff_t		   __op   = (_op);		     \
+	void		      (*__method)_proto;		    \
+									\
+	__page = cl_page_top(__page);				   \
+	do {							    \
+		list_for_each_entry(__scan, &__page->cp_layers,     \
+					cpl_linkage) {		  \
+			__method = *(void **)((char *)__scan->cpl_ops + \
+					      __op);		    \
+			if (__method != NULL)			   \
+				(*__method)(__env, __scan,	      \
+					    ## __VA_ARGS__);	    \
+		}						       \
+		__page = __page->cp_child;			      \
+	} while (__page != NULL);				       \
+} while (0)
+
+/*
+ * Bottom-up version of CL_PAGE_INVOID(): first descends from \a _page to the
+ * bottom-most page along cl_page::cp_child, then walks back up along
+ * cl_page::cp_parent, visiting the layers of each page in reverse list
+ * order and calling the method at byte offset \a _op on every slice that
+ * defines it.
+ */
+#define CL_PAGE_INVOID_REVERSE(_env, _page, _op, _proto, ...)	       \
+do {									\
+	const struct lu_env	*__env  = (_env);			\
+	struct cl_page	     *__page = (_page);		       \
+	const struct cl_page_slice *__scan;				 \
+	ptrdiff_t		   __op   = (_op);			 \
+	void		      (*__method)_proto;			\
+									    \
+	/* get to the bottom page. */				       \
+	while (__page->cp_child != NULL)				    \
+		__page = __page->cp_child;				  \
+	do {								\
+		list_for_each_entry_reverse(__scan, &__page->cp_layers, \
+						cpl_linkage) {	      \
+			__method = *(void **)((char *)__scan->cpl_ops +     \
+					      __op);			\
+			if (__method != NULL)			       \
+				(*__method)(__env, __scan,		  \
+					    ## __VA_ARGS__);		\
+		}							   \
+		__page = __page->cp_parent;				 \
+	} while (__page != NULL);					   \
+} while (0)
+
+/**
+ * Calls the page method at offset \a op, with prototype
+ * (env, slice, io), on the layers of \a page top-to-bottom through
+ * CL_PAGE_INVOKE() and returns its result.
+ */
+static int cl_page_invoke(const struct lu_env *env,
+			  struct cl_io *io, struct cl_page *page, ptrdiff_t op)
+
+{
+	PINVRNT(env, page, cl_object_same(page->cp_obj, io->ci_obj));
+	ENTRY;
+	RETURN(CL_PAGE_INVOKE(env, page, op,
+			      (const struct lu_env *,
+			       const struct cl_page_slice *, struct cl_io *),
+			      io));
+}
+
+/**
+ * Calls the void page method at offset \a op, with prototype
+ * (env, slice, io), on the layers of \a page top-to-bottom through
+ * CL_PAGE_INVOID().
+ */
+static void cl_page_invoid(const struct lu_env *env,
+			   struct cl_io *io, struct cl_page *page, ptrdiff_t op)
+
+{
+	PINVRNT(env, page, cl_object_same(page->cp_obj, io->ci_obj));
+	ENTRY;
+	CL_PAGE_INVOID(env, page, op,
+		       (const struct lu_env *,
+			const struct cl_page_slice *, struct cl_io *), io);
+	EXIT;
+}
+
+/**
+ * Drops ownership information from the whole page stack: for every page from
+ * the top page downwards that has an owner, the owner's ci_owned_nr counter
+ * is decremented and the page's cp_owner and cp_task are reset to NULL.
+ */
+static void cl_page_owner_clear(struct cl_page *page)
+{
+	ENTRY;
+	page = cl_page_top(page);
+	while (page != NULL) {
+		if (page->cp_owner != NULL) {
+			struct cl_io *owner = page->cp_owner;
+
+			LASSERT(page->cp_owner->ci_owned_nr > 0);
+			owner->ci_owned_nr--;
+			page->cp_owner = NULL;
+			page->cp_task = NULL;
+		}
+		page = page->cp_child;
+	}
+	EXIT;
+}
+
+/**
+ * Accounts the page stack to its owners: every page from the top page
+ * downwards must already have an owner (asserted), whose ci_owned_nr
+ * counter is incremented.
+ */
+static void cl_page_owner_set(struct cl_page *page)
+{
+	ENTRY;
+	page = cl_page_top(page);
+	while (page != NULL) {
+		LASSERT(page->cp_owner != NULL);
+		page->cp_owner->ci_owned_nr++;
+		page = page->cp_child;
+	}
+	EXIT;
+}
+
+/**
+ * Releases ownership of \a pg without the precondition checks of
+ * cl_page_disown(): clears owner information, moves the page back to
+ * CPS_CACHED if it was in CPS_OWNED, and then calls ->cpo_disown() on all
+ * layers bottom-to-top.
+ *
+ * The state is sampled before the owner is cleared, so a page that is
+ * already in CPS_FREEING is left in that state.
+ */
+void cl_page_disown0(const struct lu_env *env,
+		     struct cl_io *io, struct cl_page *pg)
+{
+	enum cl_page_state state;
+
+	ENTRY;
+	state = pg->cp_state;
+	PINVRNT(env, pg, state == CPS_OWNED || state == CPS_FREEING);
+	PINVRNT(env, pg, cl_page_invariant(pg));
+	cl_page_owner_clear(pg);
+
+	if (state == CPS_OWNED)
+		cl_page_state_set(env, pg, CPS_CACHED);
+	/*
+	 * Completion call-backs are executed in the bottom-up order, so that
+	 * uppermost layer (llite), responsible for VFS/VM interaction runs
+	 * last and can release locks safely.
+	 */
+	CL_PAGE_INVOID_REVERSE(env, pg, CL_PAGE_OP(cpo_disown),
+			       (const struct lu_env *,
+				const struct cl_page_slice *, struct cl_io *),
+			       io);
+	EXIT;
+}
+
+/**
+ * Tells whether \a pg is currently owned by \a io: the page has to be in
+ * CPS_OWNED state with \a io recorded as its owner.
+ *
+ * \retval 1 the page is owned by \a io
+ * \retval 0 otherwise
+ */
+int cl_page_is_owned(const struct cl_page *pg, const struct cl_io *io)
+{
+	int owned;
+
+	LINVRNT(cl_object_same(pg->cp_obj, io->ci_obj));
+	ENTRY;
+	owned = (pg->cp_state == CPS_OWNED && pg->cp_owner == io);
+	RETURN(owned);
+}
+EXPORT_SYMBOL(cl_page_is_owned);
+
+/**
+ * Try to own a page by IO.
+ *
+ * Waits until page is in cl_page_state::CPS_CACHED state, and then switch it
+ * into cl_page_state::CPS_OWNED state.
+ *
+ * Both \a pg and \a io are replaced by their top-most counterparts before
+ * anything else is done.
+ *
+ * \param nonblock passed through to cl_page_operations::cpo_own()
+ *
+ * \pre  !cl_page_is_owned(pg, io)
+ * \post result == 0 iff cl_page_is_owned(pg, io)
+ *
+ * \retval 0   success
+ *
+ * \retval -ve failure, e.g., page was destroyed (and landed in
+ *	     cl_page_state::CPS_FREEING instead of cl_page_state::CPS_CACHED).
+ *	     or, page was owned by another thread, or in IO.
+ *
+ * \see cl_page_disown()
+ * \see cl_page_operations::cpo_own()
+ * \see cl_page_own_try()
+ * \see cl_page_own
+ */
+static int cl_page_own0(const struct lu_env *env, struct cl_io *io,
+			struct cl_page *pg, int nonblock)
+{
+	int result;
+
+	PINVRNT(env, pg, !cl_page_is_owned(pg, io));
+
+	ENTRY;
+	pg = cl_page_top(pg);
+	io = cl_io_top(io);
+
+	if (pg->cp_state == CPS_FREEING) {
+		result = -ENOENT;
+	} else {
+		result = CL_PAGE_INVOKE(env, pg, CL_PAGE_OP(cpo_own),
+					(const struct lu_env *,
+					 const struct cl_page_slice *,
+					 struct cl_io *, int),
+					io, nonblock);
+		if (result == 0) {
+			PASSERT(env, pg, pg->cp_owner == NULL);
+			PASSERT(env, pg, pg->cp_req == NULL);
+			pg->cp_owner = io;
+			pg->cp_task  = current;
+			cl_page_owner_set(pg);
+			/* the state is checked again: the page may have
+			 * moved to CPS_FREEING during the ->cpo_own() calls */
+			if (pg->cp_state != CPS_FREEING) {
+				cl_page_state_set(env, pg, CPS_OWNED);
+			} else {
+				cl_page_disown0(env, io, pg);
+				result = -ENOENT;
+			}
+		}
+	}
+	PINVRNT(env, pg, ergo(result == 0, cl_page_invariant(pg)));
+	RETURN(result);
+}
+
+/**
+ * Own a page, might be blocked.
+ *
+ * Calls cl_page_own0() with nonblock == 0 and returns its result.
+ *
+ * \see cl_page_own0()
+ */
+int cl_page_own(const struct lu_env *env, struct cl_io *io, struct cl_page *pg)
+{
+	return cl_page_own0(env, io, pg, 0);
+}
+EXPORT_SYMBOL(cl_page_own);
+
+/**
+ * Nonblock version of cl_page_own().
+ *
+ * Calls cl_page_own0() with nonblock == 1 and returns its result.
+ *
+ * \see cl_page_own0()
+ */
+int cl_page_own_try(const struct lu_env *env, struct cl_io *io,
+		    struct cl_page *pg)
+{
+	return cl_page_own0(env, io, pg, 1);
+}
+EXPORT_SYMBOL(cl_page_own_try);
+
+
+/**
+ * Assume page ownership.
+ *
+ * Called when page is already locked by the hosting VM.
+ *
+ * Works on the top page and top io: calls ->cpo_assume() on all layers
+ * top-to-bottom, records \a io and the current task as the owner, and moves
+ * the page into cl_page_state::CPS_OWNED.
+ *
+ * \pre !cl_page_is_owned(pg, io)
+ * \post cl_page_is_owned(pg, io)
+ *
+ * \see cl_page_operations::cpo_assume()
+ */
+void cl_page_assume(const struct lu_env *env,
+		    struct cl_io *io, struct cl_page *pg)
+{
+	PINVRNT(env, pg, cl_object_same(pg->cp_obj, io->ci_obj));
+
+	ENTRY;
+	pg = cl_page_top(pg);
+	io = cl_io_top(io);
+
+	cl_page_invoid(env, io, pg, CL_PAGE_OP(cpo_assume));
+	PASSERT(env, pg, pg->cp_owner == NULL);
+	pg->cp_owner = io;
+	pg->cp_task = current;
+	cl_page_owner_set(pg);
+	cl_page_state_set(env, pg, CPS_OWNED);
+	EXIT;
+}
+EXPORT_SYMBOL(cl_page_assume);
+
+/**
+ * Releases page ownership without unlocking the page.
+ *
+ * Moves page into cl_page_state::CPS_CACHED without releasing a lock on the
+ * underlying VM page (as VM is supposed to do this itself).
+ *
+ * Works on the top page and top io: owner information is cleared first,
+ * then the state is changed, and finally ->cpo_unassume() is called on all
+ * layers bottom-to-top.
+ *
+ * \pre   cl_page_is_owned(pg, io)
+ * \post !cl_page_is_owned(pg, io)
+ *
+ * \see cl_page_assume()
+ */
+void cl_page_unassume(const struct lu_env *env,
+		      struct cl_io *io, struct cl_page *pg)
+{
+	PINVRNT(env, pg, cl_page_is_owned(pg, io));
+	PINVRNT(env, pg, cl_page_invariant(pg));
+
+	ENTRY;
+	pg = cl_page_top(pg);
+	io = cl_io_top(io);
+	cl_page_owner_clear(pg);
+	cl_page_state_set(env, pg, CPS_CACHED);
+	CL_PAGE_INVOID_REVERSE(env, pg, CL_PAGE_OP(cpo_unassume),
+			       (const struct lu_env *,
+				const struct cl_page_slice *, struct cl_io *),
+			       io);
+	EXIT;
+}
+EXPORT_SYMBOL(cl_page_unassume);
+
+/**
+ * Releases page ownership.
+ *
+ * Moves page into cl_page_state::CPS_CACHED.
+ *
+ * Resolves \a pg and \a io to their top-most counterparts and delegates to
+ * cl_page_disown0().
+ *
+ * \pre   cl_page_is_owned(pg, io)
+ * \post !cl_page_is_owned(pg, io)
+ *
+ * \see cl_page_own()
+ * \see cl_page_operations::cpo_disown()
+ */
+void cl_page_disown(const struct lu_env *env,
+		    struct cl_io *io, struct cl_page *pg)
+{
+	PINVRNT(env, pg, cl_page_is_owned(pg, io));
+
+	ENTRY;
+	pg = cl_page_top(pg);
+	io = cl_io_top(io);
+	cl_page_disown0(env, io, pg);
+	EXIT;
+}
+EXPORT_SYMBOL(cl_page_disown);
+
+/**
+ * Called when page is to be removed from the object, e.g., as a result of
+ * truncate.
+ *
+ * Calls cl_page_operations::cpo_discard() top-to-bottom. This function
+ * itself does not change the page state or ownership.
+ *
+ * \pre cl_page_is_owned(pg, io)
+ *
+ * \see cl_page_operations::cpo_discard()
+ */
+void cl_page_discard(const struct lu_env *env,
+		     struct cl_io *io, struct cl_page *pg)
+{
+	PINVRNT(env, pg, cl_page_is_owned(pg, io));
+	PINVRNT(env, pg, cl_page_invariant(pg));
+
+	cl_page_invoid(env, io, pg, CL_PAGE_OP(cpo_discard));
+}
+EXPORT_SYMBOL(cl_page_discard);
+
+/**
+ * Version of cl_page_delete() that can be called for not fully constructed
+ * pages, e.g., in an error handling cl_page_find()->cl_page_delete0()
+ * path. Doesn't check page invariant.
+ *
+ * \param pg	 top page (asserted), not yet in CPS_FREEING (asserted)
+ * \param radix  non-zero if \a pg itself is in its object's radix tree and
+ *		 has to be removed from it; zero if only the pages below
+ *		 \a pg have to be removed
+ */
+static void cl_page_delete0(const struct lu_env *env, struct cl_page *pg,
+			    int radix)
+{
+	struct cl_page *tmp = pg;
+	ENTRY;
+
+	PASSERT(env, pg, pg == cl_page_top(pg));
+	PASSERT(env, pg, pg->cp_state != CPS_FREEING);
+
+	/*
+	 * Sever all ways to obtain new pointers to @pg.
+	 */
+	cl_page_owner_clear(pg);
+
+	/*
+	 * unexport the page firstly before freeing it so that
+	 * the page content is considered to be invalid.
+	 * We have to do this because a CPS_FREEING cl_page may
+	 * be NOT under the protection of a cl_lock.
+	 * Afterwards, if this page is found by other threads, then this
+	 * page will be forced to reread.
+	 */
+	cl_page_export(env, pg, 0);
+	cl_page_state_set0(env, pg, CPS_FREEING);
+
+	CL_PAGE_INVOID(env, pg, CL_PAGE_OP(cpo_delete),
+		       (const struct lu_env *, const struct cl_page_slice *));
+
+	if (tmp->cp_type == CPT_CACHEABLE) {
+		if (!radix)
+			/* !radix means that @pg is not yet in the radix tree,
+			 * skip removing it.
+			 */
+			tmp = pg->cp_child;
+		/* remove each remaining page of the stack from the radix
+		 * tree of its own object, and drop the tree's reference */
+		for (; tmp != NULL; tmp = tmp->cp_child) {
+			void		    *value;
+			struct cl_object_header *hdr;
+
+			hdr = cl_object_header(tmp->cp_obj);
+			spin_lock(&hdr->coh_page_guard);
+			value = radix_tree_delete(&hdr->coh_tree,
+						  tmp->cp_index);
+			PASSERT(env, tmp, value == tmp);
+			PASSERT(env, tmp, hdr->coh_pages > 0);
+			hdr->coh_pages--;
+			spin_unlock(&hdr->coh_page_guard);
+			cl_page_put(env, tmp);
+		}
+	}
+
+	EXIT;
+}
+
+/**
+ * Called when a decision is made to throw page out of memory.
+ *
+ * Notifies all layers about page destruction by calling
+ * cl_page_operations::cpo_delete() method top-to-bottom.
+ *
+ * Moves page into cl_page_state::CPS_FREEING state (this is the only place
+ * where transition to this state happens).
+ *
+ * Eliminates all avenues through which new references to the page can be
+ * obtained:
+ *
+ *     - removes page from the radix trees,
+ *
+ *     - breaks linkage from VM page to cl_page.
+ *
+ * Once page reaches cl_page_state::CPS_FREEING, all remaining references will
+ * drain after some time, at which point page will be recycled.
+ *
+ * Implemented as cl_page_delete0() with radix == 1, i.e., the top page is
+ * removed from its radix tree as well.
+ *
+ * \pre  pg == cl_page_top(pg)
+ * \pre  VM page is locked
+ * \post pg->cp_state == CPS_FREEING
+ *
+ * \see cl_page_operations::cpo_delete()
+ */
+void cl_page_delete(const struct lu_env *env, struct cl_page *pg)
+{
+	PINVRNT(env, pg, cl_page_invariant(pg));
+	ENTRY;
+	cl_page_delete0(env, pg, 1);
+	EXIT;
+}
+EXPORT_SYMBOL(cl_page_delete);
+
+/**
+ * Unmaps page from user virtual memory.
+ *
+ * Calls cl_page_operations::cpo_unmap() through all layers top-to-bottom. The
+ * layer responsible for VM interaction has to unmap page from user space
+ * virtual memory.
+ *
+ * \see cl_page_operations::cpo_unmap()
+ */
+int cl_page_unmap(const struct lu_env *env,
+		  struct cl_io *io, struct cl_page *pg)
+{
+	/* The caller must own \a pg on behalf of \a io. */
+	PINVRNT(env, pg, cl_page_is_owned(pg, io));
+	PINVRNT(env, pg, cl_page_invariant(pg));
+
+	/* The result of cl_page_invoke() is handed back unchanged. */
+	return cl_page_invoke(env, io, pg, CL_PAGE_OP(cpo_unmap));
+}
+EXPORT_SYMBOL(cl_page_unmap);
+
+/**
+ * Marks page up-to-date.
+ *
+ * Call cl_page_operations::cpo_export() through all layers top-to-bottom. The
+ * layer responsible for VM interaction has to mark/clear page as up-to-date
+ * by the \a uptodate argument.
+ *
+ * \see cl_page_operations::cpo_export()
+ */
+void cl_page_export(const struct lu_env *env, struct cl_page *pg, int uptodate)
+{
+	PINVRNT(env, pg, cl_page_invariant(pg));
+	/* \a uptodate is passed through as the extra cpo_export() argument. */
+	CL_PAGE_INVOID(env, pg, CL_PAGE_OP(cpo_export),
+		       (const struct lu_env *,
+			const struct cl_page_slice *, int), uptodate);
+}
+EXPORT_SYMBOL(cl_page_export);
+
+/**
+ * Returns true, iff \a pg is VM locked in a suitable sense by the calling
+ * thread.
+ */
+int cl_page_is_vmlocked(const struct lu_env *env, const struct cl_page *pg)
+{
+	int result;
+	const struct cl_page_slice *slice;
+
+	ENTRY;
+	/* const is cast away only to satisfy cl_page_top_trusted(). */
+	pg = cl_page_top_trusted((struct cl_page *)pg);
+	/* Only the first slice on cp_layers is consulted. */
+	slice = container_of(pg->cp_layers.next,
+			     const struct cl_page_slice, cpl_linkage);
+	PASSERT(env, pg, slice->cpl_ops->cpo_is_vmlocked != NULL);
+	/*
+	 * Call ->cpo_is_vmlocked() directly instead of going through
+	 * CL_PAGE_INVOKE(), because cl_page_is_vmlocked() is used by
+	 * cl_page_invariant().
+	 */
+	result = slice->cpl_ops->cpo_is_vmlocked(env, slice);
+	/* The method may answer only -EBUSY or -ENODATA. */
+	PASSERT(env, pg, result == -EBUSY || result == -ENODATA);
+	/* -EBUSY is reported to the caller as true (1), -ENODATA as 0. */
+	RETURN(result == -EBUSY);
+}
+EXPORT_SYMBOL(cl_page_is_vmlocked);
+
+/*
+ * Maps a transfer type onto the page state a page holds while that transfer
+ * is in flight: CPS_PAGEOUT for CRT_WRITE, CPS_PAGEIN for anything else.
+ */
+static enum cl_page_state cl_req_type_state(enum cl_req_type crt)
+{
+	enum cl_page_state io_state = CPS_PAGEIN;
+
+	ENTRY;
+	if (crt == CRT_WRITE)
+		io_state = CPS_PAGEOUT;
+	RETURN(io_state);
+}
+
+/*
+ * Switches \a pg into the IO state matching \a crt once it has been queued
+ * for transfer; the page's owner is cleared first.
+ */
+static void cl_page_io_start(const struct lu_env *env,
+			     struct cl_page *pg, enum cl_req_type crt)
+{
+	enum cl_page_state io_state;
+
+	ENTRY;
+	cl_page_owner_clear(pg);
+	io_state = cl_req_type_state(crt);
+	cl_page_state_set(env, pg, io_state);
+	EXIT;
+}
+
+/**
+ * Prepares page for immediate transfer. cl_page_operations::cpo_prep() is
+ * called top-to-bottom. Every layer either agrees to submit this page (by
+ * returning 0), or requests to omit this page (by returning -EALREADY). Layer
+ * handling interactions with the VM also has to inform VM that page is under
+ * transfer now.
+ */
+int cl_page_prep(const struct lu_env *env, struct cl_io *io,
+		 struct cl_page *pg, enum cl_req_type crt)
+{
+	int result;
+
+	PINVRNT(env, pg, cl_page_is_owned(pg, io));
+	PINVRNT(env, pg, cl_page_invariant(pg));
+	PINVRNT(env, pg, crt < CRT_NR);
+
+	/*
+	 * XXX this has to be called bottom-to-top, so that llite can set up
+	 * PG_writeback without risking other layers deciding to skip this
+	 * page.
+	 */
+	/* \a crt indexes the io[] operation table below, so bound it first. */
+	if (crt >= CRT_NR)
+		return -EINVAL;
+	result = cl_page_invoke(env, io, pg, CL_PAGE_OP(io[crt].cpo_prep));
+	/* The page enters its IO state only when the invocation returned 0. */
+	if (result == 0)
+		cl_page_io_start(env, pg, crt);
+
+	/*
+	 * For a cacheable page being written, success and the VM page having
+	 * writeback set must go together.
+	 */
+	KLASSERT(ergo(crt == CRT_WRITE && pg->cp_type == CPT_CACHEABLE,
+		      equi(result == 0,
+			   PageWriteback(cl_page_vmpage(env, pg)))));
+	CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", crt, result);
+	return result;
+}
+EXPORT_SYMBOL(cl_page_prep);
+
+/**
+ * Notify layers about transfer completion.
+ *
+ * Invoked by transfer sub-system (which is a part of osc) to notify layers
+ * that a transfer, of which this page is a part of has completed.
+ *
+ * Completion call-backs are executed in the bottom-up order, so that
+ * uppermost layer (llite), responsible for the VFS/VM interaction runs last
+ * and can release locks safely.
+ *
+ * \pre  pg->cp_state == CPS_PAGEIN || pg->cp_state == CPS_PAGEOUT
+ * \post pg->cp_state == CPS_CACHED
+ *
+ * \see cl_page_operations::cpo_completion()
+ */
+void cl_page_completion(const struct lu_env *env,
+			struct cl_page *pg, enum cl_req_type crt, int ioret)
+{
+	/* Sampled up front; re-checked against pg->cp_sync_io further down. */
+	struct cl_sync_io *anchor = pg->cp_sync_io;
+
+	PASSERT(env, pg, crt < CRT_NR);
+	/* cl_page::cp_req already cleared by the caller (osc_completion()) */
+	PASSERT(env, pg, pg->cp_req == NULL);
+	PASSERT(env, pg, pg->cp_state == cl_req_type_state(crt));
+
+	ENTRY;
+	CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", crt, ioret);
+	/* A successful read is recorded once in the page flags. */
+	if (crt == CRT_READ && ioret == 0) {
+		PASSERT(env, pg, !(pg->cp_flags & CPF_READ_COMPLETED));
+		pg->cp_flags |= CPF_READ_COMPLETED;
+	}
+
+	cl_page_state_set(env, pg, CPS_CACHED);
+	/*
+	 * \a crt indexes the io[] operation table below.
+	 * NOTE(review): this early return follows ENTRY without a matching
+	 * EXIT, and happens after the state was already set to CPS_CACHED.
+	 */
+	if (crt >= CRT_NR)
+		return;
+	CL_PAGE_INVOID_REVERSE(env, pg, CL_PAGE_OP(io[crt].cpo_completion),
+			       (const struct lu_env *,
+				const struct cl_page_slice *, int), ioret);
+	/* Detach the sync-io anchor from the page before dropping our ref. */
+	if (anchor) {
+		LASSERT(cl_page_is_vmlocked(env, pg));
+		LASSERT(pg->cp_sync_io == anchor);
+		pg->cp_sync_io = NULL;
+	}
+	/*
+	 * As page->cp_obj is pinned by a reference from page->cp_req, it is
+	 * safe to call cl_page_put() without risking object destruction in a
+	 * non-blocking context.
+	 */
+	cl_page_put(env, pg);
+
+	/* The anchor is notified last, after the page reference is dropped. */
+	if (anchor)
+		cl_sync_io_note(anchor, ioret);
+
+	EXIT;
+}
+EXPORT_SYMBOL(cl_page_completion);
+
+/**
+ * Notify layers that transfer formation engine decided to yank this page from
+ * the cache and to make it a part of a transfer.
+ *
+ * \pre  pg->cp_state == CPS_CACHED
+ * \post pg->cp_state == CPS_PAGEIN || pg->cp_state == CPS_PAGEOUT
+ *
+ * \see cl_page_operations::cpo_make_ready()
+ */
+int cl_page_make_ready(const struct lu_env *env, struct cl_page *pg,
+		       enum cl_req_type crt)
+{
+	int result;
+
+	PINVRNT(env, pg, crt < CRT_NR);
+
+	ENTRY;
+	/* \a crt indexes the io[] operation table below, so bound it first. */
+	if (crt >= CRT_NR)
+		RETURN(-EINVAL);
+	result = CL_PAGE_INVOKE(env, pg, CL_PAGE_OP(io[crt].cpo_make_ready),
+				(const struct lu_env *,
+				 const struct cl_page_slice *));
+	/* On success the page must still be cached; move it to its IO state. */
+	if (result == 0) {
+		PASSERT(env, pg, pg->cp_state == CPS_CACHED);
+		cl_page_io_start(env, pg, crt);
+	}
+	CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", crt, result);
+	RETURN(result);
+}
+EXPORT_SYMBOL(cl_page_make_ready);
+
+/**
+ * Notify layers that high level io decided to place this page into a cache
+ * for future transfer.
+ *
+ * The layer implementing transfer engine (osc) has to register this page in
+ * its queues.
+ *
+ * \pre  cl_page_is_owned(pg, io)
+ * \post cl_page_is_owned(pg, io)
+ *
+ * \see cl_page_operations::cpo_cache_add()
+ */
+int cl_page_cache_add(const struct lu_env *env, struct cl_io *io,
+		      struct cl_page *pg, enum cl_req_type crt)
+{
+	const struct cl_page_slice *layer;
+	int rc = 0;
+
+	PINVRNT(env, pg, crt < CRT_NR);
+	PINVRNT(env, pg, cl_page_is_owned(pg, io));
+	PINVRNT(env, pg, cl_page_invariant(pg));
+
+	ENTRY;
+
+	/* crt selects the entry of the io[] operation table used below. */
+	if (crt >= CRT_NR)
+		RETURN(-EINVAL);
+
+	/*
+	 * Walk the layers in list order; layers without a cpo_cache_add()
+	 * method are passed over and the first non-zero result ends the walk.
+	 */
+	list_for_each_entry(layer, &pg->cp_layers, cpl_linkage) {
+		if (layer->cpl_ops->io[crt].cpo_cache_add != NULL) {
+			rc = layer->cpl_ops->io[crt].cpo_cache_add(env, layer,
+								   io);
+			if (rc != 0)
+				break;
+		}
+	}
+	CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", crt, rc);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(cl_page_cache_add);
+
+/**
+ * Called if a page is being written back by kernel's intention.
+ *
+ * \pre  cl_page_is_owned(pg, io)
+ * \post ergo(result == 0, pg->cp_state == CPS_PAGEOUT)
+ *
+ * \see cl_page_operations::cpo_flush()
+ */
+int cl_page_flush(const struct lu_env *env, struct cl_io *io,
+		  struct cl_page *pg)
+{
+	int result;
+
+	PINVRNT(env, pg, cl_page_is_owned(pg, io));
+	PINVRNT(env, pg, cl_page_invariant(pg));
+
+	ENTRY;
+
+	/* The result of the cpo_flush() invocation is returned unchanged. */
+	result = cl_page_invoke(env, io, pg, CL_PAGE_OP(cpo_flush));
+
+	CL_PAGE_HEADER(D_TRACE, env, pg, "%d\n", result);
+	RETURN(result);
+}
+EXPORT_SYMBOL(cl_page_flush);
+
+/**
+ * Checks whether the page is protected by an extent lock of at least the
+ * required mode.
+ *
+ * \return the same as in cl_page_operations::cpo_is_under_lock() method.
+ * \see cl_page_operations::cpo_is_under_lock()
+ */
+int cl_page_is_under_lock(const struct lu_env *env, struct cl_io *io,
+			  struct cl_page *page)
+{
+	int rc;
+
+	PINVRNT(env, page, cl_page_invariant(page));
+
+	ENTRY;
+	rc = CL_PAGE_INVOKE(env, page, CL_PAGE_OP(cpo_is_under_lock),
+			    (const struct lu_env *,
+			     const struct cl_page_slice *, struct cl_io *),
+			    io);
+	/* A zero result is never a valid answer here. */
+	PASSERT(env, page, rc != 0);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(cl_page_is_under_lock);
+
+/*
+ * Per-page callback used by cl_pages_prune(): owns the page, unmaps it,
+ * discards it and releases ownership again. \a cbdata is unused. Always
+ * returns CLP_GANG_OKAY.
+ *
+ * NOTE(review): the return values of cl_page_own() and cl_page_unmap() are
+ * ignored -- confirm that ownership cannot fail for pages reached here.
+ */
+static int page_prune_cb(const struct lu_env *env, struct cl_io *io,
+			 struct cl_page *page, void *cbdata)
+{
+	cl_page_own(env, io, page);
+	cl_page_unmap(env, io, page);
+	cl_page_discard(env, io, page);
+	cl_page_disown(env, io, page);
+	return CLP_GANG_OKAY;
+}
+
+/**
+ * Purges all cached pages belonging to the object \a obj.
+ */
+int cl_pages_prune(const struct lu_env *env, struct cl_object *clobj)
+{
+	struct cl_thread_info   *info;
+	struct cl_object	*obj = cl_object_top(clobj);
+	struct cl_io	    *io;
+	int		      result;
+
+	ENTRY;
+	/* The io lives in the per-environment thread info, not on the stack. */
+	info  = cl_env_info(env);
+	io    = &info->clt_io;
+
+	/*
+	 * initialize the io. This is ugly since we never do IO in this
+	 * function, we just make cl_page_list functions happy. -jay
+	 */
+	io->ci_obj = obj;
+	io->ci_ignore_layout = 1;
+	result = cl_io_init(env, io, CIT_MISC, obj);
+	/* On init failure the io is finalized and io->ci_result returned. */
+	if (result != 0) {
+		cl_io_fini(env, io);
+		RETURN(io->ci_result);
+	}
+
+	/*
+	 * Repeat the lookup over [0, CL_PAGE_EOF] until it reports
+	 * CLP_GANG_OKAY, yielding the CPU whenever it asks to reschedule.
+	 */
+	do {
+		result = cl_page_gang_lookup(env, obj, io, 0, CL_PAGE_EOF,
+					     page_prune_cb, NULL);
+		if (result == CLP_GANG_RESCHED)
+			cond_resched();
+	} while (result != CLP_GANG_OKAY);
+
+	cl_io_fini(env, io);
+	/* At this point result is CLP_GANG_OKAY. */
+	RETURN(result);
+}
+EXPORT_SYMBOL(cl_pages_prune);
+
+/**
+ * Tells transfer engine that only part of a page is to be transmitted.
+ *
+ * \see cl_page_operations::cpo_clip()
+ */
+void cl_page_clip(const struct lu_env *env, struct cl_page *pg,
+		  int from, int to)
+{
+	PINVRNT(env, pg, cl_page_invariant(pg));
+
+	CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", from, to);
+	/* \a from and \a to are passed through to the cpo_clip() methods. */
+	CL_PAGE_INVOID(env, pg, CL_PAGE_OP(cpo_clip),
+		       (const struct lu_env *,
+			const struct cl_page_slice *,int, int),
+		       from, to);
+}
+EXPORT_SYMBOL(cl_page_clip);
+
+/**
+ * Prints a one-line, human readable summary of the fields of \a pg through
+ * \a printer, which receives \a cookie as its context argument.
+ *
+ * Fields appear in this order: page address, reference count, object, index,
+ * parent, child, state, error, type, owner, request and flags.
+ */
+void cl_page_header_print(const struct lu_env *env, void *cookie,
+			  lu_printer_t printer, const struct cl_page *pg)
+{
+	(*printer)(env, cookie,
+		   "page@%p[%d %p:%lu ^%p_%p %d %d %d %p %p %#x]\n",
+		   pg, atomic_read(&pg->cp_ref), pg->cp_obj,
+		   pg->cp_index, pg->cp_parent, pg->cp_child,
+		   pg->cp_state, pg->cp_error, pg->cp_type,
+		   pg->cp_owner, pg->cp_req, pg->cp_flags);
+}
+EXPORT_SYMBOL(cl_page_header_print);
+
+/**
+ * Dumps \a pg for debugging.
+ *
+ * Emits one header line for the page returned by cl_page_top() and for each
+ * page reachable from it through cp_child, then lets the cpo_print() layer
+ * methods add their own output, and finishes with an "end page" marker. All
+ * text goes through \a printer, which receives \a cookie as its context.
+ */
+void cl_page_print(const struct lu_env *env, void *cookie,
+		   lu_printer_t printer, const struct cl_page *pg)
+{
+	struct cl_page *cur = cl_page_top((struct cl_page *)pg);
+
+	while (cur != NULL) {
+		cl_page_header_print(env, cookie, printer, cur);
+		cur = cur->cp_child;
+	}
+	CL_PAGE_INVOKE(env, (struct cl_page *)pg, CL_PAGE_OP(cpo_print),
+		       (const struct lu_env *env,
+			const struct cl_page_slice *slice,
+			void *cookie, lu_printer_t p), cookie, printer);
+	(*printer)(env, cookie, "end page@%p\n", pg);
+}
+EXPORT_SYMBOL(cl_page_print);
+
+/**
+ * Cancel a page which is still in a transfer.
+ */
+int cl_page_cancel(const struct lu_env *env, struct cl_page *page)
+{
+	/* The result of the cpo_cancel() invocation is returned unchanged. */
+	return CL_PAGE_INVOKE(env, page, CL_PAGE_OP(cpo_cancel),
+			      (const struct lu_env *,
+			       const struct cl_page_slice *));
+}
+EXPORT_SYMBOL(cl_page_cancel);
+
+/**
+ * Converts a page index into a byte offset within object \a obj.
+ *
+ * \a obj is currently unused: the offset is \a idx shifted left by
+ * PAGE_CACHE_SHIFT.
+ */
+loff_t cl_offset(const struct cl_object *obj, pgoff_t idx)
+{
+	/*
+	 * XXX for now.
+	 */
+	/* Widen to loff_t before shifting. */
+	return (loff_t)idx << PAGE_CACHE_SHIFT;
+}
+EXPORT_SYMBOL(cl_offset);
+
+/**
+ * Converts a byte offset within object \a obj into a page index.
+ *
+ * \a obj is currently unused: the index is \a offset shifted right by
+ * PAGE_CACHE_SHIFT.
+ */
+pgoff_t cl_index(const struct cl_object *obj, loff_t offset)
+{
+	/*
+	 * XXX for now.
+	 */
+	return offset >> PAGE_CACHE_SHIFT;
+}
+EXPORT_SYMBOL(cl_index);
+
+/**
+ * Returns the page size in bytes, 1 << PAGE_CACHE_SHIFT. \a obj is currently
+ * unused.
+ */
+int cl_page_size(const struct cl_object *obj)
+{
+	return 1 << PAGE_CACHE_SHIFT;
+}
+EXPORT_SYMBOL(cl_page_size);
+
+/**
+ * Adds page slice to the compound page.
+ *
+ * This is called by cl_object_operations::coo_page_init() methods to add a
+ * per-layer state to the page. New state is added at the end of
+ * cl_page::cp_layers list, that is, it is at the bottom of the stack.
+ *
+ * \see cl_lock_slice_add(), cl_req_slice_add(), cl_io_slice_add()
+ */
+void cl_page_slice_add(struct cl_page *page, struct cl_page_slice *slice,
+		       struct cl_object *obj,
+		       const struct cl_page_operations *ops)
+{
+	ENTRY;
+	/* Appended at the tail, i.e. below every slice added before it. */
+	list_add_tail(&slice->cpl_linkage, &page->cp_layers);
+	/* Record the layer's object, its methods and the owning page. */
+	slice->cpl_obj  = obj;
+	slice->cpl_ops  = ops;
+	slice->cpl_page = page;
+	EXIT;
+}
+EXPORT_SYMBOL(cl_page_slice_add);
+
+/* Page-layer initialization hook; currently does nothing and returns 0. */
+int  cl_page_init(void)
+{
+	return 0;
+}
+
+/* Page-layer finalization hook; currently empty. */
+void cl_page_fini(void)
+{
+}

diff --git a/drivers/staging/lustre/lustre/obdclass/class_obd.c b/drivers/staging/lustre/lustre/obdclass/class_obd.c
new file mode 100644
index 0000000..af1c2d0
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/class_obd.c

@@ -0,0 +1,689 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+# include <asm/atomic.h>
+
+#include <obd_support.h>
+#include <obd_class.h>
+#include <linux/lnet/lnetctl.h>
+#include <lustre_debug.h>
+#include <lprocfs_status.h>
+#include <lustre/lustre_build_version.h>
+#include <linux/list.h>
+#include <cl_object.h>
+#include "llog_internal.h"
+
+
+/* Device table; init_obdclass() resets every slot to NULL. */
+struct obd_device *obd_devs[MAX_OBD_DEVICES];
+EXPORT_SYMBOL(obd_devs);
+/* List head initialized by init_obdclass(). */
+struct list_head obd_types;
+DEFINE_RWLOCK(obd_dev_lock);
+
+/*
+ * High-water marks for pages and bytes; raised by obd_update_maxusage() and
+ * always accessed under obd_updatemax_lock.
+ */
+__u64 obd_max_pages = 0;
+__u64 obd_max_alloc = 0;
+DEFINE_SPINLOCK(obd_updatemax_lock);
+
+/* The following are visible and mutable through /proc/sys/lustre/. */
+/* Compared against masked random values in obd_alloc_fail(). */
+unsigned int obd_alloc_fail_rate = 0;
+EXPORT_SYMBOL(obd_alloc_fail_rate);
+unsigned int obd_debug_peer_on_timeout;
+EXPORT_SYMBOL(obd_debug_peer_on_timeout);
+unsigned int obd_dump_on_timeout;
+EXPORT_SYMBOL(obd_dump_on_timeout);
+unsigned int obd_dump_on_eviction;
+EXPORT_SYMBOL(obd_dump_on_eviction);
+/* Static default only; init_obdclass() recomputes it from num_physpages. */
+unsigned int obd_max_dirty_pages = 256;
+EXPORT_SYMBOL(obd_max_dirty_pages);
+atomic_t obd_dirty_pages;
+EXPORT_SYMBOL(obd_dirty_pages);
+unsigned int obd_timeout = OBD_TIMEOUT_DEFAULT;   /* seconds */
+EXPORT_SYMBOL(obd_timeout);
+unsigned int ldlm_timeout = LDLM_TIMEOUT_DEFAULT; /* seconds */
+EXPORT_SYMBOL(ldlm_timeout);
+unsigned int obd_timeout_set;
+EXPORT_SYMBOL(obd_timeout_set);
+unsigned int ldlm_timeout_set;
+EXPORT_SYMBOL(ldlm_timeout_set);
+/* Adaptive timeout defs here instead of ptlrpc module for /proc/sys/ access */
+unsigned int at_min = 0;
+EXPORT_SYMBOL(at_min);
+unsigned int at_max = 600;
+EXPORT_SYMBOL(at_max);
+unsigned int at_history = 600;
+EXPORT_SYMBOL(at_history);
+int at_early_margin = 5;
+EXPORT_SYMBOL(at_early_margin);
+int at_extra = 30;
+EXPORT_SYMBOL(at_extra);
+
+atomic_t obd_dirty_transit_pages;
+EXPORT_SYMBOL(obd_dirty_transit_pages);
+
+/*
+ * Selects how lustre_get_jobid() builds a job id: JOBSTATS_DISABLE (the
+ * default) turns it off, JOBSTATS_PROCNAME_UID uses "comm.fsuid", and any
+ * other value is looked up as an environment variable name.
+ */
+char obd_jobid_var[JOBSTATS_JOBID_VAR_MAX_LEN + 1] = JOBSTATS_DISABLE;
+EXPORT_SYMBOL(obd_jobid_var);
+
+/* Get jobid of current process by reading the environment variable
+ * stored in between the "env_start" & "env_end" of task struct.
+ *
+ * TODO:
+ * It's better to cache the jobid for later use if there is any
+ * efficient way, the cl_env code probably could be reused for this
+ * purpose.
+ *
+ * If some job scheduler doesn't store jobid in the "env_start/end",
+ * then an upcall could be issued here to get the jobid by utilizing
+ * the userspace tools/api. Then, the jobid must be cached.
+ */
+/*
+ * \param jobid  output buffer of at least JOBSTATS_JOBID_SIZE bytes; it is
+ *		 zero-filled before anything else happens
+ *
+ * \retval 0 when jobstats is disabled, when the "procname.uid" form was
+ *	     written, or when the environment lookup succeeded
+ * \retval the error reported by cfs_get_environ() otherwise
+ */
+int lustre_get_jobid(char *jobid)
+{
+	int jobid_len = JOBSTATS_JOBID_SIZE;
+	int rc = 0;
+	ENTRY;
+
+	memset(jobid, 0, JOBSTATS_JOBID_SIZE);
+	/* Jobstats isn't enabled */
+	if (strcmp(obd_jobid_var, JOBSTATS_DISABLE) == 0)
+		RETURN(0);
+
+	/* Use process name + fsuid as jobid */
+	if (strcmp(obd_jobid_var, JOBSTATS_PROCNAME_UID) == 0) {
+		/*
+		 * NOTE(review): current_fsuid() is passed straight to %u;
+		 * confirm its type matches on every kernel configuration.
+		 */
+		snprintf(jobid, JOBSTATS_JOBID_SIZE, "%s.%u",
+			 current_comm(), current_fsuid());
+		RETURN(0);
+	}
+
+	/* Otherwise obd_jobid_var names an environment variable to read. */
+	rc = cfs_get_environ(obd_jobid_var, jobid, &jobid_len);
+	if (rc) {
+		if (rc == -EOVERFLOW) {
+			/* For the PBS_JOBID and LOADL_STEP_ID keys (which are
+			 * variable length strings instead of just numbers), it
+			 * might make sense to keep the unique parts for JobID,
+			 * instead of just returning an error.  That means a
+			 * larger temp buffer for cfs_get_environ(), then
+			 * truncating the string at some separator to fit into
+			 * the specified jobid_len.  Fix later if needed. */
+			/* The console error is emitted at most once. */
+			static bool printed;
+			if (unlikely(!printed)) {
+				LCONSOLE_ERROR_MSG(0x16b, "%s value too large "
+						   "for JobID buffer (%d)\n",
+						   obd_jobid_var, jobid_len);
+				printed = true;
+			}
+		} else {
+			/* These three errors are logged at D_INFO, the rest
+			 * at D_ERROR. */
+			CDEBUG((rc == -ENOENT || rc == -EINVAL ||
+				rc == -EDEADLK) ? D_INFO : D_ERROR,
+			       "Get jobid for (%s) failed: rc = %d\n",
+			       obd_jobid_var, rc);
+		}
+	}
+	RETURN(rc);
+}
+EXPORT_SYMBOL(lustre_get_jobid);
+
+/**
+ * Decides whether an allocation counts as failed and, if so, logs it.
+ *
+ * The allocation is treated as failed when \a ptr is NULL, or when a random
+ * value masked with OBD_ALLOC_FAIL_MASK falls below obd_alloc_fail_rate
+ * (a forced failure; the message is then prefixed with "force ").
+ *
+ * \param ptr   result of the allocation being checked
+ * \param name  name of the allocated object, for the log message
+ * \param type  allocation kind prefix, for the log message
+ * \param size  requested size in bytes
+ * \param file  source file of the allocation site
+ * \param line  source line of the allocation site
+ *
+ * \retval 1 the allocation failed or was forced to fail
+ * \retval 0 otherwise
+ */
+int obd_alloc_fail(const void *ptr, const char *name, const char *type,
+		   size_t size, const char *file, int line)
+{
+	if (ptr == NULL ||
+	    (cfs_rand() & OBD_ALLOC_FAIL_MASK) < obd_alloc_fail_rate) {
+		CERROR("%s%salloc of %s ("LPU64" bytes) failed at %s:%d\n",
+		       ptr ? "force " :"", type, name, (__u64)size, file,
+		       line);
+		/*
+		 * Arguments follow the format order: total bytes, total
+		 * pages, then those pages expressed in bytes.
+		 */
+		CERROR(LPU64" total bytes and "LPU64" total pages "
+		       "("LPU64" bytes) allocated by Lustre, "
+		       "%d total bytes by LNET\n",
+		       obd_memory_sum(),
+		       obd_pages_sum(),
+		       obd_pages_sum() << PAGE_CACHE_SHIFT,
+		       atomic_read(&libcfs_kmemory));
+		return 1;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(obd_alloc_fail);
+
+/* Zeroes \a conn and then copies the ioctl cookie from \a data into it. */
+static inline void obd_data2conn(struct lustre_handle *conn,
+				 struct obd_ioctl_data *data)
+{
+	memset(conn, 0, sizeof *conn);
+	conn->cookie = data->ioc_cookie;
+}
+
+/* Copies the cookie of \a conn back into the ioctl data \a data. */
+static inline void obd_conn2data(struct obd_ioctl_data *data,
+				 struct lustre_handle *conn)
+{
+	data->ioc_cookie = conn->cookie;
+}
+
+/*
+ * Resolves a device name to its device number.
+ *
+ * \param len   size of the \a name buffer, including the terminating NUL
+ * \param name  NUL-terminated device name
+ *
+ * \retval the value returned by class_name2dev() on success
+ * \retval -EINVAL if no name was passed, it is not NUL-terminated at
+ *	   name[len - 1], or class_name2dev() returned -1
+ */
+int class_resolve_dev_name(__u32 len, const char *name)
+{
+	int rc;
+	int dev;
+
+	ENTRY;
+	if (!len || !name) {
+		CERROR("No name passed,!\n");
+		GOTO(out, rc = -EINVAL);
+	}
+	/* The last byte of the buffer must be the terminator. */
+	if (name[len - 1] != 0) {
+		CERROR("Name not nul terminated!\n");
+		GOTO(out, rc = -EINVAL);
+	}
+
+	CDEBUG(D_IOCTL, "device name %s\n", name);
+	dev = class_name2dev(name);
+	if (dev == -1) {
+		CDEBUG(D_IOCTL, "No device for name %s!\n", name);
+		GOTO(out, rc = -EINVAL);
+	}
+
+	CDEBUG(D_IOCTL, "device name %s, dev %d\n", name, dev);
+	rc = dev;
+
+out:
+	RETURN(rc);
+}
+
+/*
+ * Handles an ioctl issued on the obdclass control device.
+ *
+ * \param cmd  ioctl command number
+ * \param arg  user-space address of the ioctl payload
+ *
+ * LIBCFS_IOC_DEBUG_MASK is served directly from a small fixed-size request.
+ * For every other command the payload is fetched with obd_ioctl_getdata()
+ * and released again at the "out" label. A first group of commands needs no
+ * device; all remaining commands are resolved to a device (by name when
+ * ioc_dev is OBD_DEV_BY_DEVNAME, by number otherwise), which must be set up
+ * and not stopping.
+ *
+ * \retval 0 on success
+ * \retval negative errno on failure; a failed copy to or from user space is
+ *	   reported as -EFAULT
+ */
+int class_handle_ioctl(unsigned int cmd, unsigned long arg)
+{
+	char *buf = NULL;
+	struct obd_ioctl_data *data;
+	struct obd_device *obd = NULL;
+	int err = 0, len = 0;
+	ENTRY;
+
+	/* only for debugging */
+	if (cmd == LIBCFS_IOC_DEBUG_MASK) {
+		struct libcfs_debug_ioctl_data debug_data;
+
+		/*
+		 * \a arg is a user-space address: copy the request in rather
+		 * than dereferencing the pointer directly.
+		 */
+		if (copy_from_user(&debug_data, (void *)arg,
+				   sizeof(debug_data)))
+			RETURN(-EFAULT);
+		libcfs_subsystem_debug = debug_data.subs;
+		libcfs_debug = debug_data.debug;
+		RETURN(0);
+	}
+
+	CDEBUG(D_IOCTL, "cmd = %x\n", cmd);
+	if (obd_ioctl_getdata(&buf, &len, (void *)arg)) {
+		CERROR("OBD ioctl: data error\n");
+		RETURN(-EINVAL);
+	}
+	data = (struct obd_ioctl_data *)buf;
+
+	/* Commands that do not operate on a particular device. */
+	switch (cmd) {
+	case OBD_IOC_PROCESS_CFG: {
+		struct lustre_cfg *lcfg;
+
+		if (!data->ioc_plen1 || !data->ioc_pbuf1) {
+			CERROR("No config buffer passed!\n");
+			GOTO(out, err = -EINVAL);
+		}
+		OBD_ALLOC(lcfg, data->ioc_plen1);
+		if (lcfg == NULL)
+			GOTO(out, err = -ENOMEM);
+		/*
+		 * copy_from_user() returns the number of bytes left
+		 * uncopied, which is not an errno: map it to -EFAULT.
+		 */
+		if (copy_from_user(lcfg, data->ioc_pbuf1, data->ioc_plen1))
+			err = -EFAULT;
+		if (!err)
+			err = lustre_cfg_sanity_check(lcfg, data->ioc_plen1);
+		if (!err)
+			err = class_process_config(lcfg);
+
+		OBD_FREE(lcfg, data->ioc_plen1);
+		GOTO(out, err);
+	}
+
+	case OBD_GET_VERSION:
+		if (!data->ioc_inlbuf1) {
+			CERROR("No buffer passed in ioctl\n");
+			GOTO(out, err = -EINVAL);
+		}
+
+		if (strlen(BUILD_VERSION) + 1 > data->ioc_inllen1) {
+			CERROR("ioctl buffer too small to hold version\n");
+			GOTO(out, err = -EINVAL);
+		}
+
+		memcpy(data->ioc_bulk, BUILD_VERSION,
+		       strlen(BUILD_VERSION) + 1);
+
+		err = obd_ioctl_popdata((void *)arg, data, len);
+		if (err)
+			err = -EFAULT;
+		GOTO(out, err);
+
+	case OBD_IOC_NAME2DEV: {
+		/* Resolve a device name.  This does not change the
+		 * currently selected device.
+		 */
+		int dev;
+
+		dev = class_resolve_dev_name(data->ioc_inllen1,
+					     data->ioc_inlbuf1);
+		data->ioc_dev = dev;
+		if (dev < 0)
+			GOTO(out, err = -EINVAL);
+
+		err = obd_ioctl_popdata((void *)arg, data, sizeof(*data));
+		if (err)
+			err = -EFAULT;
+		GOTO(out, err);
+	}
+
+	case OBD_IOC_UUID2DEV: {
+		/* Resolve a device uuid.  This does not change the
+		 * currently selected device.
+		 */
+		int dev;
+		struct obd_uuid uuid;
+
+		if (!data->ioc_inllen1 || !data->ioc_inlbuf1) {
+			CERROR("No UUID passed!\n");
+			GOTO(out, err = -EINVAL);
+		}
+		if (data->ioc_inlbuf1[data->ioc_inllen1 - 1] != 0) {
+			CERROR("UUID not NUL terminated!\n");
+			GOTO(out, err = -EINVAL);
+		}
+
+		CDEBUG(D_IOCTL, "device name %s\n", data->ioc_inlbuf1);
+		obd_str2uuid(&uuid, data->ioc_inlbuf1);
+		dev = class_uuid2dev(&uuid);
+		data->ioc_dev = dev;
+		if (dev == -1) {
+			CDEBUG(D_IOCTL, "No device for UUID %s!\n",
+			       data->ioc_inlbuf1);
+			GOTO(out, err = -EINVAL);
+		}
+
+		CDEBUG(D_IOCTL, "device name %s, dev %d\n", data->ioc_inlbuf1,
+		       dev);
+		err = obd_ioctl_popdata((void *)arg, data, sizeof(*data));
+		if (err)
+			err = -EFAULT;
+		GOTO(out, err);
+	}
+
+	case OBD_IOC_CLOSE_UUID: {
+		CDEBUG(D_IOCTL, "closing all connections to uuid %s (NOOP)\n",
+		       data->ioc_inlbuf1);
+		GOTO(out, err = 0);
+	}
+
+	case OBD_IOC_GETDEVICE: {
+		int     index = data->ioc_count;
+		char    *status, *str;
+
+		if (!data->ioc_inlbuf1) {
+			CERROR("No buffer passed in ioctl\n");
+			GOTO(out, err = -EINVAL);
+		}
+		if (data->ioc_inllen1 < 128) {
+			CERROR("ioctl buffer too small to hold version\n");
+			GOTO(out, err = -EINVAL);
+		}
+
+		obd = class_num2obd(index);
+		if (!obd)
+			GOTO(out, err = -ENOENT);
+
+		if (obd->obd_stopping)
+			status = "ST";
+		else if (obd->obd_set_up)
+			status = "UP";
+		else if (obd->obd_attached)
+			status = "AT";
+		else
+			status = "--";
+		str = (char *)data->ioc_bulk;
+		snprintf(str, len - sizeof(*data), "%3d %s %s %s %s %d",
+			 (int)index, status, obd->obd_type->typ_name,
+			 obd->obd_name, obd->obd_uuid.uuid,
+			 atomic_read(&obd->obd_refcount));
+		/*
+		 * Report a failed copy-out like the other commands do
+		 * instead of unconditionally returning success.
+		 */
+		err = obd_ioctl_popdata((void *)arg, data, len);
+		if (err)
+			err = -EFAULT;
+		GOTO(out, err);
+	}
+
+	}
+
+	/* Everything below operates on one device: look it up first. */
+	if (data->ioc_dev == OBD_DEV_BY_DEVNAME) {
+		if (data->ioc_inllen4 <= 0 || data->ioc_inlbuf4 == NULL)
+			GOTO(out, err = -EINVAL);
+		if (strnlen(data->ioc_inlbuf4, MAX_OBD_NAME) >= MAX_OBD_NAME)
+			GOTO(out, err = -EINVAL);
+		obd = class_name2obd(data->ioc_inlbuf4);
+	} else if (data->ioc_dev < class_devno_max()) {
+		obd = class_num2obd(data->ioc_dev);
+	} else {
+		CERROR("OBD ioctl: No device\n");
+		GOTO(out, err = -EINVAL);
+	}
+
+	if (obd == NULL) {
+		CERROR("OBD ioctl : No Device %d\n", data->ioc_dev);
+		GOTO(out, err = -EINVAL);
+	}
+	LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC);
+
+	if (!obd->obd_set_up || obd->obd_stopping) {
+		CERROR("OBD ioctl: device not setup %d \n", data->ioc_dev);
+		GOTO(out, err = -EINVAL);
+	}
+
+	switch(cmd) {
+	case OBD_IOC_NO_TRANSNO: {
+		if (!obd->obd_attached) {
+			CERROR("Device %d not attached\n", obd->obd_minor);
+			GOTO(out, err = -ENODEV);
+		}
+		CDEBUG(D_HA, "%s: disabling committed-transno notification\n",
+		       obd->obd_name);
+		obd->obd_no_transno = 1;
+		GOTO(out, err = 0);
+	}
+
+	default: {
+		/* Any other command is forwarded to the device itself. */
+		err = obd_iocontrol(cmd, obd->obd_self_export, len, data, NULL);
+		if (err)
+			GOTO(out, err);
+
+		err = obd_ioctl_popdata((void *)arg, data, len);
+		if (err)
+			err = -EFAULT;
+		GOTO(out, err);
+	}
+	}
+
+ out:
+	if (buf)
+		obd_ioctl_freedata(buf, len);
+	RETURN(err);
+} /* class_handle_ioctl */
+
+extern psdev_t obd_psdev;
+
+#define OBD_INIT_CHECK
+/*
+ * Start-up self test of 64-bit arithmetic and of the LPU64/LPD64/LPX64
+ * format macros, using OBD_OBJECT_EOF as the probe value.
+ *
+ * \retval 0 if every check passed
+ * \retval -EOVERFLOW returned immediately when a shift or do_div() check
+ *	   fails
+ * \retval -EINVAL if a formatting or page-mask check failed
+ *
+ * Checks that only assign to ret do not stop the function, so a later
+ * failure overwrites an earlier value of ret.
+ */
+int obd_init_checks(void)
+{
+	__u64 u64val, div64val;
+	char buf[64];
+	int len, ret = 0;
+
+	CDEBUG(D_INFO, "LPU64=%s, LPD64=%s, LPX64=%s\n", LPU64, LPD64, LPX64);
+
+	CDEBUG(D_INFO, "OBD_OBJECT_EOF = "LPX64"\n", (__u64)OBD_OBJECT_EOF);
+
+	/* The value must survive a round trip through a __u64 variable. */
+	u64val = OBD_OBJECT_EOF;
+	CDEBUG(D_INFO, "u64val OBD_OBJECT_EOF = "LPX64"\n", u64val);
+	if (u64val != OBD_OBJECT_EOF) {
+		CERROR("__u64 "LPX64"(%d) != 0xffffffffffffffff\n",
+		       u64val, (int)sizeof(u64val));
+		ret = -EINVAL;
+	}
+	/* Hex form is expected to be 18 characters long. */
+	len = snprintf(buf, sizeof(buf), LPX64, u64val);
+	if (len != 18) {
+		CWARN("LPX64 wrong length! strlen(%s)=%d != 18\n", buf, len);
+		ret = -EINVAL;
+	}
+
+	/*
+	 * NOTE(review): div64val is assigned here, but the check that follows
+	 * tests u64val again, repeating the first check above.
+	 */
+	div64val = OBD_OBJECT_EOF;
+	CDEBUG(D_INFO, "u64val OBD_OBJECT_EOF = "LPX64"\n", u64val);
+	if (u64val != OBD_OBJECT_EOF) {
+		CERROR("__u64 "LPX64"(%d) != 0xffffffffffffffff\n",
+		       u64val, (int)sizeof(u64val));
+		ret = -EOVERFLOW;
+	}
+	if (u64val >> 8 != OBD_OBJECT_EOF >> 8) {
+		CERROR("__u64 "LPX64"(%d) != 0xffffffffffffffff\n",
+		       u64val, (int)sizeof(u64val));
+		return -EOVERFLOW;
+	}
+	/* do_div() divides div64val in place; its result is the remainder. */
+	if (do_div(div64val, 256) != (u64val & 255)) {
+		CERROR("do_div("LPX64",256) != "LPU64"\n", u64val, u64val &255);
+		return -EOVERFLOW;
+	}
+	/* The quotient left in div64val must equal the shifted value. */
+	if (u64val >> 8 != div64val) {
+		CERROR("do_div("LPX64",256) "LPU64" != "LPU64"\n",
+		       u64val, div64val, u64val >> 8);
+		return -EOVERFLOW;
+	}
+	len = snprintf(buf, sizeof(buf), LPX64, u64val);
+	if (len != 18) {
+		CWARN("LPX64 wrong length! strlen(%s)=%d != 18\n", buf, len);
+		ret = -EINVAL;
+	}
+	/* Unsigned decimal form is expected to be 20 characters long. */
+	len = snprintf(buf, sizeof(buf), LPU64, u64val);
+	if (len != 20) {
+		CWARN("LPU64 wrong length! strlen(%s)=%d != 20\n", buf, len);
+		ret = -EINVAL;
+	}
+	/* Signed decimal form is expected to be 2 characters long. */
+	len = snprintf(buf, sizeof(buf), LPD64, u64val);
+	if (len != 2) {
+		CWARN("LPD64 wrong length! strlen(%s)=%d != 2\n", buf, len);
+		ret = -EINVAL;
+	}
+	/* Masking off the page bits must leave less than one page. */
+	if ((u64val & ~CFS_PAGE_MASK) >= PAGE_CACHE_SIZE) {
+		CWARN("mask failed: u64val "LPU64" >= "LPU64"\n", u64val,
+		      (__u64)PAGE_CACHE_SIZE);
+		ret = -EINVAL;
+	}
+
+	return ret;
+}
+
+extern spinlock_t obd_types_lock;
+extern int class_procfs_init(void);
+extern int class_procfs_clean(void);
+
+/*
+ * Module initialization for obdclass.
+ *
+ * Returns 0 on success or the error of the first step that failed.
+ *
+ * NOTE(review): none of the error returns below undoes the steps that
+ * already succeeded (for example misc_register()); confirm whether unwinding
+ * is needed on these paths.
+ */
+static int __init init_obdclass(void)
+{
+	int i, err;
+	int lustre_register_fs(void);
+
+	for (i = CAPA_SITE_CLIENT; i < CAPA_SITE_MAX; i++)
+		INIT_LIST_HEAD(&capa_list[i]);
+
+	LCONSOLE_INFO("Lustre: Build Version: "BUILD_VERSION"\n");
+
+	spin_lock_init(&obd_types_lock);
+	obd_zombie_impexp_init();
+#ifdef LPROCFS
+	obd_memory = lprocfs_alloc_stats(OBD_STATS_NUM,
+					 LPROCFS_STATS_FLAG_NONE |
+					 LPROCFS_STATS_FLAG_IRQ_SAFE);
+	if (obd_memory == NULL) {
+		CERROR("kmalloc of 'obd_memory' failed\n");
+		RETURN(-ENOMEM);
+	}
+
+	lprocfs_counter_init(obd_memory, OBD_MEMORY_STAT,
+			     LPROCFS_CNTR_AVGMINMAX,
+			     "memused", "bytes");
+	lprocfs_counter_init(obd_memory, OBD_MEMORY_PAGES_STAT,
+			     LPROCFS_CNTR_AVGMINMAX,
+			     "pagesused", "pages");
+#endif
+	/* Only -EOVERFLOW aborts loading; other results are ignored here. */
+	err = obd_init_checks();
+	if (err == -EOVERFLOW)
+		return err;
+
+	class_init_uuidlist();
+	err = class_handle_init();
+	if (err)
+		return err;
+
+	INIT_LIST_HEAD(&obd_types);
+
+	err = misc_register(&obd_psdev);
+	if (err) {
+		CERROR("cannot register %d err %d\n", OBD_DEV_MINOR, err);
+		return err;
+	}
+
+	/* This struct is already zeroed for us (static global) */
+	for (i = 0; i < class_devno_max(); i++)
+		obd_devs[i] = NULL;
+
+	/* Default the dirty page cache cap to 1/2 of system memory.
+	 * For clients with less memory, a larger fraction is needed
+	 * for other purposes (mostly for BGL). */
+	/* The threshold is 512MB of physical memory, expressed in pages. */
+	if (num_physpages <= 512 << (20 - PAGE_CACHE_SHIFT))
+		obd_max_dirty_pages = num_physpages / 4;
+	else
+		obd_max_dirty_pages = num_physpages / 2;
+
+	err = obd_init_caches();
+	if (err)
+		return err;
+	err = class_procfs_init();
+	if (err)
+		return err;
+
+	err = lu_global_init();
+	if (err)
+		return err;
+
+	err = cl_global_init();
+	if (err != 0)
+		return err;
+
+
+	err = llog_info_init();
+	if (err)
+		return err;
+
+	/* Last step: its result is the module's init result. */
+	err = lustre_register_fs();
+
+	return err;
+}
+
+/*
+ * Raises the recorded page and byte high-water marks to the current usage.
+ * The current totals are sampled before obd_updatemax_lock is taken; the
+ * comparison and update happen under the lock.
+ */
+void obd_update_maxusage(void)
+{
+	__u64 cur_pages = obd_pages_sum();
+	__u64 cur_bytes = obd_memory_sum();
+
+	spin_lock(&obd_updatemax_lock);
+	if (obd_max_pages < cur_pages)
+		obd_max_pages = cur_pages;
+	if (obd_max_alloc < cur_bytes)
+		obd_max_alloc = cur_bytes;
+	spin_unlock(&obd_updatemax_lock);
+}
+EXPORT_SYMBOL(obd_update_maxusage);
+
+#ifdef LPROCFS
+/* Returns obd_max_alloc, read under obd_updatemax_lock. */
+__u64 obd_memory_max(void)
+{
+	__u64 peak_bytes;
+
+	spin_lock(&obd_updatemax_lock);
+	peak_bytes = obd_max_alloc;
+	spin_unlock(&obd_updatemax_lock);
+
+	return peak_bytes;
+}
+EXPORT_SYMBOL(obd_memory_max);
+
+/* Returns obd_max_pages, read under obd_updatemax_lock. */
+__u64 obd_pages_max(void)
+{
+	__u64 peak_pages;
+
+	spin_lock(&obd_updatemax_lock);
+	peak_pages = obd_max_pages;
+	spin_unlock(&obd_updatemax_lock);
+
+	return peak_pages;
+}
+EXPORT_SYMBOL(obd_pages_max);
+#endif
+
+/* liblustre doesn't call cleanup_obdclass, apparently.  we carry on in this
+ * ifdef to the end of the file to cover module and versioning goo.*/
+/*
+ * Module exit for obdclass: unregisters the filesystem and the misc device,
+ * detaches devices that are still set up, shuts the sub-systems down and
+ * finally reports the peak and leaked memory/page counts.
+ */
+static void cleanup_obdclass(void)
+{
+	int i;
+	int lustre_unregister_fs(void);
+	__u64 memory_leaked, pages_leaked;
+	__u64 memory_max, pages_max;
+	ENTRY;
+
+	lustre_unregister_fs();
+
+	misc_deregister(&obd_psdev);
+	/* Detach every device that is set up and has a detach method. */
+	for (i = 0; i < class_devno_max(); i++) {
+		struct obd_device *obd = class_num2obd(i);
+		if (obd && obd->obd_set_up &&
+		    OBT(obd) && OBP(obd, detach)) {
+			/* XXX should this call generic detach otherwise? */
+			LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC);
+			OBP(obd, detach)(obd);
+		}
+	}
+	llog_info_fini();
+	cl_global_fini();
+	lu_global_fini();
+
+	obd_cleanup_caches();
+	obd_sysctl_clean();
+
+	class_procfs_clean();
+
+	class_handle_cleanup();
+	class_exit_uuidlist();
+	obd_zombie_impexp_stop();
+
+	/* Whatever is still accounted at this point is reported as leaked. */
+	memory_leaked = obd_memory_sum();
+	pages_leaked = obd_pages_sum();
+
+	memory_max = obd_memory_max();
+	pages_max = obd_pages_max();
+
+	/* The totals were sampled above, before the stats are freed. */
+	lprocfs_free_stats(&obd_memory);
+	/* A non-zero leak raises the message from D_INFO to D_ERROR. */
+	CDEBUG((memory_leaked) ? D_ERROR : D_INFO,
+	       "obd_memory max: "LPU64", leaked: "LPU64"\n",
+	       memory_max, memory_leaked);
+	CDEBUG((pages_leaked) ? D_ERROR : D_INFO,
+	       "obd_memory_pages max: "LPU64", leaked: "LPU64"\n",
+	       pages_max, pages_leaked);
+
+	EXIT;
+}
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Lustre Class Driver Build Version: " BUILD_VERSION);
+MODULE_LICENSE("GPL");
+
+cfs_module(obdclass, LUSTRE_VERSION_STRING, init_obdclass, cleanup_obdclass);

diff --git a/drivers/staging/lustre/lustre/obdclass/debug.c b/drivers/staging/lustre/lustre/obdclass/debug.c
new file mode 100644
index 0000000..15f71bb
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/debug.c

@@ -0,0 +1,124 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/debug.c
+ *
+ * Helper routines for dumping data structs for debugging.
+ */
+
+#define DEBUG_SUBSYSTEM D_OTHER
+
+
+#include <obd_ost.h>
+#include <obd_support.h>
+#include <lustre_debug.h>
+#include <lustre_net.h>
+
+/* Log the fields of the local niobuf \a nb at D_RPCTRACE level, followed by
+ * the index of its page (-1 when no page is attached). */
+void dump_lniobuf(struct niobuf_local *nb)
+{
+	long pg_index = -1;
+
+	if (nb->page)
+		pg_index = page_index(nb->page);
+
+	CDEBUG(D_RPCTRACE,
+	       "niobuf_local: file_offset="LPD64", len=%d, page=%p, rc=%d\n",
+	       nb->lnb_file_offset, nb->len, nb->page, nb->rc);
+	CDEBUG(D_RPCTRACE, "nb->page: index = %ld\n", pg_index);
+}
+EXPORT_SYMBOL(dump_lniobuf);
+
+/* Log a one-line summary of the stripe descriptor \a lsm (object id,
+ * maxbytes, magic, stripe size and count, reference count, layout generation
+ * and pool name) using debug mask \a level. */
+void dump_lsm(int level, struct lov_stripe_md *lsm)
+{
+	CDEBUG(level, "lsm %p, objid "DOSTID", maxbytes "LPX64", magic 0x%08X,"
+	       " stripe_size %u, stripe_count %u, refc: %d,"
+	       " layout_gen %u, pool ["LOV_POOLNAMEF"]\n",
+	       lsm,
+	       POSTID(&lsm->lsm_oi),
+	       lsm->lsm_maxbytes,
+	       lsm->lsm_magic,
+	       lsm->lsm_stripe_size,
+	       lsm->lsm_stripe_count,
+	       atomic_read(&lsm->lsm_refc),
+	       lsm->lsm_layout_gen,
+	       lsm->lsm_pool_name);
+}
+EXPORT_SYMBOL(dump_lsm);
+
+#define LPDS sizeof(__u64)
+/* Stamp the first and the last 2 * LPDS bytes of the \a len byte buffer at
+ * \a addr with the little-endian forms of \a off followed by \a id.
+ * Always returns 0. */
+int block_debug_setup(void *addr, int len, __u64 off, __u64 id)
+{
+	char *head = addr;
+	char *tail;
+	__u64 le_off, le_id;
+
+	LASSERT(addr);
+
+	le_off = cpu_to_le64 (off);
+	le_id = cpu_to_le64 (id);
+	tail = head + (len - LPDS - LPDS);
+
+	/* leading stamp first, trailing stamp last */
+	memcpy(head, (char *)&le_off, LPDS);
+	memcpy(head + LPDS, (char *)&le_id, LPDS);
+
+	memcpy(tail, (char *)&le_off, LPDS);
+	memcpy(tail + LPDS, (char *)&le_id, LPDS);
+
+	return 0;
+}
+EXPORT_SYMBOL(block_debug_setup);
+
+/* Compare the first and the last 2 * LPDS bytes of the \a end byte buffer at
+ * \a addr with \a off and \a id, i.e. the layout block_debug_setup() writes.
+ * Every mismatch is logged at D_ERROR with \a who as prefix.
+ *
+ * Returns 0 when all four fields match, -EINVAL otherwise. */
+int block_debug_check(char *who, void *addr, int end, __u64 off, __u64 id)
+{
+	char *head = addr;
+	char *tail;
+	__u64 ne_off;
+	int err = 0;
+
+	LASSERT(addr);
+
+	ne_off = le64_to_cpu (off);
+	id = le64_to_cpu (id);
+	tail = head + (end - LPDS - LPDS);
+
+	/* leading stamp */
+	if (memcmp(head, (char *)&ne_off, LPDS) != 0) {
+		CDEBUG(D_ERROR, "%s: id "LPX64" offset "LPU64" off: "LPX64" != "
+		       LPX64"\n", who, id, off, *(__u64 *)head, ne_off);
+		err = -EINVAL;
+	}
+	if (memcmp(head + LPDS, (char *)&id, LPDS) != 0) {
+		CDEBUG(D_ERROR, "%s: id "LPX64" offset "LPU64" id: "LPX64" != "LPX64"\n",
+		       who, id, off, *(__u64 *)(head + LPDS), id);
+		err = -EINVAL;
+	}
+
+	/* trailing stamp */
+	if (memcmp(tail, (char *)&ne_off, LPDS) != 0) {
+		CDEBUG(D_ERROR, "%s: id "LPX64" offset "LPU64" end off: "LPX64" != "
+		       LPX64"\n", who, id, off, *(__u64 *)tail, ne_off);
+		err = -EINVAL;
+	}
+	if (memcmp(tail + LPDS, (char *)&id, LPDS) != 0) {
+		CDEBUG(D_ERROR, "%s: id "LPX64" offset "LPU64" end id: "LPX64" != "
+		       LPX64"\n", who, id, off, *(__u64 *)(tail + LPDS), id);
+		err = -EINVAL;
+	}
+
+	return err;
+}
+EXPORT_SYMBOL(block_debug_check);
+#undef LPDS

diff --git a/drivers/staging/lustre/lustre/obdclass/dt_object.c b/drivers/staging/lustre/lustre/obdclass/dt_object.c
new file mode 100644
index 0000000..1c962dd
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/dt_object.c

@@ -0,0 +1,1055 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/dt_object.c
+ *
+ * Dt Object.
+ * Generic functions from dt_object.h
+ *
+ * Author: Nikita Danilov <nikita@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <obd.h>
+#include <dt_object.h>
+#include <linux/list.h>
+/* fid_be_to_cpu() */
+#include <lustre_fid.h>
+
+#include <lustre_quota.h>
+
+/* context key constructor/destructor: dt_global_key_init, dt_global_key_fini
+ * (presumably emitted by the two macros below for struct dt_thread_info;
+ * both names are installed in dt_key) */
+LU_KEY_INIT(dt_global, struct dt_thread_info);
+LU_KEY_FINI(dt_global, struct dt_thread_info);
+
+struct lu_context_key dt_key = { /* registered by dt_global_init() */
+	.lct_tags = LCT_MD_THREAD | LCT_DT_THREAD | LCT_MG_THREAD | LCT_LOCAL,
+	.lct_init = dt_global_key_init,
+	.lct_fini = dt_global_key_fini
+};
+EXPORT_SYMBOL(dt_key);
+
+/* Link \a cb at the head of the transaction call-back list of \a dev.
+ *
+ * No lock is necessary to protect the list, because call-backs
+ * are added during system startup. Please refer to "struct dt_device".
+ */
+void dt_txn_callback_add(struct dt_device *dev, struct dt_txn_callback *cb)
+{
+	struct list_head *callbacks = &dev->dd_txn_callbacks;
+
+	list_add(&cb->dtc_linkage, callbacks);
+}
+EXPORT_SYMBOL(dt_txn_callback_add);
+
+/* Unlink \a cb from the call-back list it is on and re-initialise its
+ * linkage. \a dev is not used. */
+void dt_txn_callback_del(struct dt_device *dev, struct dt_txn_callback *cb)
+{
+	struct list_head *link = &cb->dtc_linkage;
+
+	list_del_init(link);
+}
+EXPORT_SYMBOL(dt_txn_callback_del);
+
+/* Run the dtc_txn_start call-back of every entry registered on \a dev whose
+ * dtc_tag intersects the tags of \a env's context. Local transactions
+ * (th->th_local set) run no call-backs.
+ *
+ * Returns 0 if no call-back ran, otherwise the value returned by the last
+ * call-back invoked; the walk stops at the first negative value. */
+int dt_txn_hook_start(const struct lu_env *env,
+		      struct dt_device *dev, struct thandle *th)
+{
+	struct dt_txn_callback *hook;
+	int rc = 0;
+
+	if (th->th_local)
+		return 0;
+
+	list_for_each_entry(hook, &dev->dd_txn_callbacks, dtc_linkage) {
+		if (hook->dtc_txn_start != NULL &&
+		    (hook->dtc_tag & env->le_ctx.lc_tags)) {
+			rc = hook->dtc_txn_start(env, th, hook->dtc_cookie);
+			if (rc < 0)
+				return rc;
+		}
+	}
+	return rc;
+}
+EXPORT_SYMBOL(dt_txn_hook_start);
+
+/* Run the dtc_txn_stop call-back of every entry registered on the device of
+ * \a txn whose dtc_tag intersects the tags of \a env's context. Local
+ * transactions (txn->th_local set) run no call-backs.
+ *
+ * Returns 0 if no call-back ran, otherwise the value returned by the last
+ * call-back invoked; the walk stops at the first negative value. */
+int dt_txn_hook_stop(const struct lu_env *env, struct thandle *txn)
+{
+	struct dt_txn_callback *hook;
+	struct dt_device       *dev = txn->th_dev;
+	int		     rc = 0;
+
+	if (txn->th_local)
+		return 0;
+
+	list_for_each_entry(hook, &dev->dd_txn_callbacks, dtc_linkage) {
+		if (hook->dtc_txn_stop != NULL &&
+		    (hook->dtc_tag & env->le_ctx.lc_tags)) {
+			rc = hook->dtc_txn_stop(env, txn, hook->dtc_cookie);
+			if (rc < 0)
+				return rc;
+		}
+	}
+	return rc;
+}
+EXPORT_SYMBOL(dt_txn_hook_stop);
+
+/* Invoke the dtc_txn_commit call-back, where set, of every entry registered
+ * on the device of \a txn. Nothing is done for local transactions. Unlike the
+ * start/stop hooks, no tag filtering is applied here. */
+void dt_txn_hook_commit(struct thandle *txn)
+{
+	struct dt_txn_callback *hook;
+	struct list_head *callbacks;
+
+	if (txn->th_local)
+		return;
+
+	callbacks = &txn->th_dev->dd_txn_callbacks;
+	list_for_each_entry(hook, callbacks, dtc_linkage) {
+		if (hook->dtc_txn_commit == NULL)
+			continue;
+		hook->dtc_txn_commit(txn, hook->dtc_cookie);
+	}
+}
+EXPORT_SYMBOL(dt_txn_hook_commit);
+
+/* Initialise the (empty) transaction call-back list of \a dev, then hand the
+ * embedded lu_device to lu_device_init() and return its result. */
+int dt_device_init(struct dt_device *dev, struct lu_device_type *t)
+{
+	struct lu_device *lu_dev = &dev->dd_lu_dev;
+
+	INIT_LIST_HEAD(&dev->dd_txn_callbacks);
+	return lu_device_init(lu_dev, t);
+}
+EXPORT_SYMBOL(dt_device_init);
+
+/* Finalise the lu_device embedded in \a dev. */
+void dt_device_fini(struct dt_device *dev)
+{
+	struct lu_device *lu_dev = &dev->dd_lu_dev;
+
+	lu_device_fini(lu_dev);
+}
+EXPORT_SYMBOL(dt_device_fini);
+
+/* Initialise the lu_object embedded in \a obj with header \a h and device
+ * \a d; returns the result of lu_object_init(). */
+int dt_object_init(struct dt_object *obj,
+		   struct lu_object_header *h, struct lu_device *d)
+{
+	struct lu_object *lu_obj = &obj->do_lu;
+
+	return lu_object_init(lu_obj, h, d);
+}
+EXPORT_SYMBOL(dt_object_init);
+
+/* Finalise the lu_object embedded in \a obj. */
+void dt_object_fini(struct dt_object *obj)
+{
+	struct lu_object *lu_obj = &obj->do_lu;
+
+	lu_object_fini(lu_obj);
+}
+EXPORT_SYMBOL(dt_object_fini);
+
+/* Return 1 if \a obj has index operations, 0 otherwise. When it has none yet,
+ * the object is first asked to provide them for dt_directory_features; the
+ * return value of do_index_try() itself is ignored. */
+int dt_try_as_dir(const struct lu_env *env, struct dt_object *obj)
+{
+	if (obj->do_index_ops != NULL)
+		return 1;
+
+	obj->do_ops->do_index_try(env, obj, &dt_directory_features);
+	return obj->do_index_ops != NULL;
+}
+EXPORT_SYMBOL(dt_try_as_dir);
+
+/* Map the file-type bits of \a mode to a dt_format_type: directories,
+ * regular files and symlinks have their own value, while character/block
+ * devices, FIFOs and sockets all map to DFT_NODE. Any other type hits
+ * LBUG(). */
+enum dt_format_type dt_mode_to_dft(__u32 mode)
+{
+	const __u32 fmt = mode & S_IFMT;
+	enum dt_format_type result;
+
+	if (fmt == S_IFDIR)
+		result = DFT_DIR;
+	else if (fmt == S_IFREG)
+		result = DFT_REGULAR;
+	else if (fmt == S_IFLNK)
+		result = DFT_SYM;
+	else if (fmt == S_IFCHR || fmt == S_IFBLK ||
+		 fmt == S_IFIFO || fmt == S_IFSOCK)
+		result = DFT_NODE;
+	else
+		LBUG();
+
+	return result;
+}
+EXPORT_SYMBOL(dt_mode_to_dft);
+
+/**
+ * Look up the fid of the object named \a name in directory \a dir and store
+ * it in \a fid.
+ *
+ * \retval result of dt_lookup() when \a dir can be used as a directory
+ * \retval -ENOTDIR otherwise
+ */
+int dt_lookup_dir(const struct lu_env *env, struct dt_object *dir,
+		  const char *name, struct lu_fid *fid)
+{
+	if (!dt_try_as_dir(env, dir))
+		return -ENOTDIR;
+
+	return dt_lookup(env, dir, (struct dt_rec *)fid,
+			 (const struct dt_key *)name, BYPASS_CAPA);
+}
+EXPORT_SYMBOL(dt_lookup_dir);
+
+/**
+ * Find the object identified by \a fid through \a top_dev and return the
+ * layer of it that belongs to \a dev.
+ *
+ * This differs from dt_locate by taking top_dev as a parameter instead of
+ * using the one from lu_site.
+ *
+ * \retval the dt_object embedding the layer owned by \a dev; the object found
+ *	   by lu_object_find_at() is handed to the caller, who releases it
+ *	   with lu_object_put()
+ * \retval the error returned by lu_object_find_at(), as an error pointer
+ * \retval ERR_PTR(-ENOENT) when no layer of the object belongs to \a dev
+ */
+struct dt_object *dt_locate_at(const struct lu_env *env,
+			       struct dt_device *dev, const struct lu_fid *fid,
+			       struct lu_device *top_dev)
+{
+	struct lu_object *lo, *n;
+	ENTRY;
+
+	lo = lu_object_find_at(env, top_dev, fid, NULL);
+	if (IS_ERR(lo))
+		return (void *)lo;
+
+	LASSERT(lo != NULL);
+
+	list_for_each_entry(n, &lo->lo_header->loh_layers, lo_linkage) {
+		if (n->lo_dev == &dev->dd_lu_dev)
+			return container_of0(n, struct dt_object, do_lu);
+	}
+
+	/* no layer for this device: the caller only gets an error pointer and
+	 * cannot release the object, so drop it here instead of leaking it */
+	lu_object_put(env, lo);
+	return ERR_PTR(-ENOENT);
+}
+EXPORT_SYMBOL(dt_locate_at);
+
+/**
+ * Path-walk step: look up \a entry in the directory currently held in
+ * dfh->dfh_o (\a data is a struct dt_find_hint), release that directory and,
+ * on success, replace it with the object located for the fid just found.
+ *
+ * On a failed lookup dfh->dfh_o is left pointing at the directory that was
+ * just put; when dt_locate() fails it holds the error pointer.
+ *
+ * \retval 0 on success
+ * \retval the lookup error, or PTR_ERR() of the dt_locate() failure
+ */
+static int dt_find_entry(const struct lu_env *env, const char *entry, void *data)
+{
+	struct dt_find_hint *hint = data;
+	struct dt_device    *dev = hint->dfh_dt;
+	struct lu_fid	    *fid = hint->dfh_fid;
+	struct dt_object    *cur = hint->dfh_o;
+	int		     rc;
+
+	rc = dt_lookup_dir(env, cur, entry, fid);
+	lu_object_put(env, &cur->do_lu);
+	if (rc == 0) {
+		cur = dt_locate(env, dev, fid);
+		if (IS_ERR(cur))
+			rc = PTR_ERR(cur);
+	}
+
+	hint->dfh_o = cur;
+	return rc;
+}
+
+/**
+ * Abstract function which parses a path name: \a path is split at '/' (in
+ * place, by strsep()) and every non-empty component is fed to \a entry_func
+ * together with \a data.
+ *
+ * Parsing stops at the first non-zero value returned by \a entry_func, or at
+ * an empty component that has nothing left after it.
+ *
+ * \retval 0 if every component was accepted (or there was none)
+ * \retval the first non-zero value returned by \a entry_func
+ */
+int dt_path_parser(const struct lu_env *env,
+		   char *path, dt_entry_func_t entry_func,
+		   void *data)
+{
+	char *component;
+	int rc = 0;
+
+	while ((component = strsep(&path, "/")) != NULL) {
+		if (component[0] == '\0') {
+			/* empty component: done if the rest is empty too */
+			if (path == NULL || path[0] == '\0')
+				break;
+			continue;
+		}
+
+		rc = entry_func(env, component, data);
+		if (rc != 0)
+			break;
+	}
+
+	return rc;
+}
+
+/**
+ * Resolve \a path, starting from the root of \a dt, to a dt_object.
+ *
+ * The fid of each object visited is written to \a fid, so on success it holds
+ * the fid of the returned object. The path is copied (truncated to
+ * DT_MAX_PATH - 1 characters) into the per-thread buffer before parsing.
+ *
+ * \retval the object the path resolves to
+ * \retval an error pointer if getting the root, locating it or walking the
+ *	   path fails
+ */
+struct dt_object *
+dt_store_resolve(const struct lu_env *env, struct dt_device *dt,
+		 const char *path, struct lu_fid *fid)
+{
+	struct dt_thread_info *info = dt_info(env);
+	struct dt_find_hint   *hint = &info->dti_dfh;
+	char		      *scratch = info->dti_buf;
+	struct dt_object      *root;
+	int		       rc;
+
+	hint->dfh_dt = dt;
+	hint->dfh_fid = fid;
+
+	/* the parser splits the string in place, so work on a private,
+	 * always NUL-terminated copy */
+	strncpy(scratch, path, DT_MAX_PATH);
+	scratch[DT_MAX_PATH - 1] = '\0';
+
+	rc = dt->dd_ops->dt_root_get(env, dt, fid);
+	if (rc != 0)
+		return ERR_PTR(rc);
+
+	root = dt_locate(env, dt, fid);
+	if (IS_ERR(root))
+		return root;
+
+	hint->dfh_o = root;
+	rc = dt_path_parser(env, scratch, dt_find_entry, hint);
+	if (rc != 0)
+		return ERR_PTR(rc);
+
+	return hint->dfh_o;
+}
+EXPORT_SYMBOL(dt_store_resolve);
+
+/* Look up \a name in directory \a p, storing its fid in \a fid, and locate
+ * the corresponding object on \a dt.
+ * Returns the result of dt_locate(), or an error pointer carrying the lookup
+ * error. */
+static struct dt_object *dt_reg_open(const struct lu_env *env,
+				     struct dt_device *dt,
+				     struct dt_object *p,
+				     const char *name,
+				     struct lu_fid *fid)
+{
+	int rc;
+
+	rc = dt_lookup_dir(env, p, name, fid);
+	if (rc != 0)
+		return ERR_PTR(rc);
+
+	return dt_locate(env, dt, fid);
+}
+
+/**
+ * Open dt object named \a filename from \a dirname directory.
+ *      \param  dt      dt device
+ *      \param  fid     on success, object fid is stored in *fid
+ *
+ * The directory resolved for \a dirname is released before returning.
+ *
+ *      \retval the object found, or an error pointer if resolving the
+ *	      directory or opening the file fails
+ */
+struct dt_object *dt_store_open(const struct lu_env *env,
+				struct dt_device *dt,
+				const char *dirname,
+				const char *filename,
+				struct lu_fid *fid)
+{
+	struct dt_object *parent;
+	struct dt_object *child;
+
+	parent = dt_store_resolve(env, dt, dirname, fid);
+	if (IS_ERR(parent))
+		return parent;
+
+	child = dt_reg_open(env, dt, parent, filename, fid);
+	lu_object_put(env, &parent->do_lu);
+	return child;
+}
+EXPORT_SYMBOL(dt_store_open);
+
+/**
+ * Locate the object \a fid on \a dt and create it, with attributes \a at and
+ * format \a dof, if it does not exist yet.
+ *
+ * Creation runs in its own transaction started with dt_trans_start_local();
+ * existence is re-checked under the object write lock before creating.
+ *
+ * \retval the located (and possibly newly created) object
+ * \retval an error pointer on failure; the located object is put in that case
+ */
+struct dt_object *dt_find_or_create(const struct lu_env *env,
+				    struct dt_device *dt,
+				    const struct lu_fid *fid,
+				    struct dt_object_format *dof,
+				    struct lu_attr *at)
+{
+	struct dt_object *obj;
+	struct thandle *handle;
+	int rc;
+
+	ENTRY;
+
+	obj = dt_locate(env, dt, fid);
+	if (IS_ERR(obj))
+		RETURN(obj);
+
+	LASSERT(obj != NULL);
+	/* fast path: nothing to create */
+	if (dt_object_exists(obj))
+		RETURN(obj);
+
+	handle = dt_trans_create(env, dt);
+	if (IS_ERR(handle))
+		GOTO(out, rc = PTR_ERR(handle));
+
+	rc = dt_declare_create(env, obj, at, NULL, dof, handle);
+	if (rc)
+		GOTO(trans_stop, rc);
+
+	rc = dt_trans_start_local(env, dt, handle);
+	if (rc)
+		GOTO(trans_stop, rc);
+
+	dt_write_lock(env, obj, 0);
+	/* re-check now that the write lock is held */
+	if (dt_object_exists(obj))
+		GOTO(unlock, rc = 0);
+
+	CDEBUG(D_OTHER, "create new object "DFID"\n", PFID(fid));
+
+	rc = dt_create(env, obj, at, NULL, dof, handle);
+	if (rc)
+		GOTO(unlock, rc);
+	LASSERT(dt_object_exists(obj));
+unlock:
+	dt_write_unlock(env, obj);
+trans_stop:
+	dt_trans_stop(env, dt, handle);
+out:
+	if (rc) {
+		lu_object_put(env, &obj->do_lu);
+		RETURN(ERR_PTR(rc));
+	}
+	RETURN(obj);
+}
+EXPORT_SYMBOL(dt_find_or_create);
+
+/* dt class init function: initialise dt_key and register it.
+ * Returns the result of lu_context_key_register(). */
+int dt_global_init(void)
+{
+	LU_CONTEXT_KEY_INIT(&dt_key);
+	return lu_context_key_register(&dt_key);
+}
+
+/* dt class fini function: undo the dt_key registration done by
+ * dt_global_init(). */
+void dt_global_fini(void)
+{
+	struct lu_context_key *key = &dt_key;
+
+	lu_context_key_degister(key);
+}
+
+/**
+ * Generic read helper. The result of the object's dbo_read() method is
+ * returned as is; no check for partial reads is made here.
+ *
+ * \param env  lustre environment
+ * \param dt   object to be read
+ * \param buf  lu_buf to be filled, with buffer pointer and length
+ * \param pos position to start reading, updated as data is read
+ *
+ * \retval real size of data read
+ * \retval -ve errno on failure
+ */
+int dt_read(const struct lu_env *env, struct dt_object *dt,
+	    struct lu_buf *buf, loff_t *pos)
+{
+	int nread;
+
+	LASSERTF(dt != NULL, "dt is NULL when we want to read record\n");
+
+	nread = dt->do_body_ops->dbo_read(env, dt, buf, pos, BYPASS_CAPA);
+	return nread;
+}
+EXPORT_SYMBOL(dt_read);
+
+/**
+ * Read structures of fixed size from storage.  Unlike dt_read(), using
+ * dt_record_read() will return an error for partial reads.
+ *
+ * \param env  lustre environment
+ * \param dt   object to be read
+ * \param buf  lu_buf to be filled, with buffer pointer and length
+ * \param pos position to start reading, updated as data is read
+ *
+ * \retval 0 on successfully reading full buffer
+ * \retval -EFAULT on short read
+ * \retval -ve errno on failure
+ */
+int dt_record_read(const struct lu_env *env, struct dt_object *dt,
+		   struct lu_buf *buf, loff_t *pos)
+{
+	int rc;
+
+	LASSERTF(dt != NULL, "dt is NULL when we want to read record\n");
+
+	rc = dt->do_body_ops->dbo_read(env, dt, buf, pos, BYPASS_CAPA);
+
+	/* exactly the requested length is the only success case */
+	if (rc == buf->lb_len)
+		return 0;
+
+	return rc >= 0 ? -EFAULT : rc;
+}
+EXPORT_SYMBOL(dt_record_read);
+
+/**
+ * Write a fixed-size record through the object's dbo_write() method within
+ * transaction \a th. A write shorter than buf->lb_len is an error.
+ *
+ * \retval 0 when the full buffer was written
+ * \retval -EFAULT on short write
+ * \retval -ve errno on failure
+ */
+int dt_record_write(const struct lu_env *env, struct dt_object *dt,
+		    const struct lu_buf *buf, loff_t *pos, struct thandle *th)
+{
+	int rc;
+
+	LASSERTF(dt != NULL, "dt is NULL when we want to write record\n");
+	LASSERT(th != NULL);
+	LASSERT(dt->do_body_ops);
+	LASSERT(dt->do_body_ops->dbo_write);
+
+	rc = dt->do_body_ops->dbo_write(env, dt, buf, pos, th, BYPASS_CAPA, 1);
+
+	/* exactly the requested length is the only success case */
+	if (rc == buf->lb_len)
+		return 0;
+
+	return rc >= 0 ? -EFAULT : rc;
+}
+EXPORT_SYMBOL(dt_record_write);
+
+/* Declare, in transaction \a th, a later update of the XATTR_NAME_VERSION
+ * extended attribute of \a o. Only the size (sizeof(dt_obj_version_t)) is
+ * given at this stage; the buffer pointer is NULL.
+ * Returns the result of dt_declare_xattr_set(). */
+int dt_declare_version_set(const struct lu_env *env, struct dt_object *o,
+			   struct thandle *th)
+{
+	struct lu_buf size_only;
+
+	LASSERT(o);
+
+	size_only.lb_buf = NULL;
+	size_only.lb_len = sizeof(dt_obj_version_t);
+
+	return dt_declare_xattr_set(env, o, &size_only, XATTR_NAME_VERSION,
+				    0, th);
+}
+EXPORT_SYMBOL(dt_declare_version_set);
+
+/* Store \a version in the XATTR_NAME_VERSION extended attribute of \a o
+ * within transaction \a th. A failure is only logged (D_INODE); nothing is
+ * reported to the caller. */
+void dt_version_set(const struct lu_env *env, struct dt_object *o,
+		    dt_obj_version_t version, struct thandle *th)
+{
+	struct lu_buf value;
+	int rc;
+
+	LASSERT(o);
+
+	value.lb_buf = &version;
+	value.lb_len = sizeof(version);
+
+	rc = dt_xattr_set(env, o, &value, XATTR_NAME_VERSION, 0, th,
+			  BYPASS_CAPA);
+	if (rc < 0)
+		CDEBUG(D_INODE, "Can't set version, rc %d\n", rc);
+}
+EXPORT_SYMBOL(dt_version_set);
+
+/* Read the XATTR_NAME_VERSION extended attribute of \a o.
+ * Returns the stored version, or 0 (after a D_INODE log message) when
+ * dt_xattr_get() does not return exactly sizeof(version). */
+dt_obj_version_t dt_version_get(const struct lu_env *env, struct dt_object *o)
+{
+	dt_obj_version_t version;
+	struct lu_buf value;
+	int rc;
+
+	LASSERT(o);
+
+	value.lb_buf = &version;
+	value.lb_len = sizeof(version);
+
+	rc = dt_xattr_get(env, o, &value, XATTR_NAME_VERSION, BYPASS_CAPA);
+	if (rc == sizeof(version))
+		return version;
+
+	CDEBUG(D_INODE, "Can't get version, rc %d\n", rc);
+	return 0;
+}
+EXPORT_SYMBOL(dt_version_get);
+
+/* list of all supported index types */
+
+/* directories (defined without an initializer in this file, so every field
+ * is zero) */
+const struct dt_index_features dt_directory_features;
+EXPORT_SYMBOL(dt_directory_features);
+
+/* scrub iterator (likewise defined without an initializer) */
+const struct dt_index_features dt_otable_features;
+EXPORT_SYMBOL(dt_otable_features);
+
+/* lfsck: key is a struct lu_fid, record is a single byte; both fixed-size
+ * (min == max) */
+const struct dt_index_features dt_lfsck_features = {
+	.dif_flags		= DT_IND_UPDATE,
+	.dif_keysize_min	= sizeof(struct lu_fid),
+	.dif_keysize_max	= sizeof(struct lu_fid),
+	.dif_recsize_min	= sizeof(__u8),
+	.dif_recsize_max	= sizeof(__u8),
+	.dif_ptrsize		= 4
+};
+EXPORT_SYMBOL(dt_lfsck_features);
+
+/* accounting indexes: fixed-size key and record (min == max) */
+const struct dt_index_features dt_acct_features = {
+	.dif_flags		= DT_IND_UPDATE,
+	.dif_keysize_min	= sizeof(__u64), /* 64-bit uid/gid */
+	.dif_keysize_max	= sizeof(__u64), /* 64-bit uid/gid */
+	.dif_recsize_min	= sizeof(struct lquota_acct_rec), /* 16 bytes */
+	.dif_recsize_max	= sizeof(struct lquota_acct_rec), /* 16 bytes */
+	.dif_ptrsize		= 4
+};
+EXPORT_SYMBOL(dt_acct_features);
+
+/* global quota files; selected by dt_index_feat_select() for
+ * FID_SEQ_QUOTA_GLB */
+const struct dt_index_features dt_quota_glb_features = {
+	.dif_flags		= DT_IND_UPDATE,
+	/* a different key would have to be used for per-directory quota */
+	.dif_keysize_min	= sizeof(__u64), /* 64-bit uid/gid */
+	.dif_keysize_max	= sizeof(__u64), /* 64-bit uid/gid */
+	.dif_recsize_min	= sizeof(struct lquota_glb_rec), /* 32 bytes */
+	.dif_recsize_max	= sizeof(struct lquota_glb_rec), /* 32 bytes */
+	.dif_ptrsize		= 4
+};
+EXPORT_SYMBOL(dt_quota_glb_features);
+
+/* slave quota files; selected by dt_index_feat_select() for FID_SEQ_QUOTA */
+const struct dt_index_features dt_quota_slv_features = {
+	.dif_flags		= DT_IND_UPDATE,
+	/* a different key would have to be used for per-directory quota */
+	.dif_keysize_min	= sizeof(__u64), /* 64-bit uid/gid */
+	.dif_keysize_max	= sizeof(__u64), /* 64-bit uid/gid */
+	.dif_recsize_min	= sizeof(struct lquota_slv_rec), /* 8 bytes */
+	.dif_recsize_max	= sizeof(struct lquota_slv_rec), /* 8 bytes */
+	.dif_ptrsize		= 4
+};
+EXPORT_SYMBOL(dt_quota_slv_features);
+
+/* helper function returning what dt_index_features structure should be used
+ * based on the FID sequence. This is used by OBD_IDX_READ RPC
+ *
+ * \retval &dt_quota_glb_features / &dt_quota_slv_features for the global /
+ *	   slave quota sequences, provided \a mode is a regular file
+ *	   (-ENOENT otherwise)
+ * \retval &dt_directory_features for seq >= FID_SEQ_NORMAL, provided \a mode
+ *	   is a directory (-ENOTDIR otherwise)
+ * \retval ERR_PTR(-EOPNOTSUPP) for any other sequence */
+static inline const struct dt_index_features *dt_index_feat_select(__u64 seq,
+								   __u32 mode)
+{
+	if (seq == FID_SEQ_QUOTA_GLB) {
+		/* global quota index should be a regular file */
+		if (S_ISREG(mode))
+			return &dt_quota_glb_features;
+		return ERR_PTR(-ENOENT);
+	}
+
+	if (seq == FID_SEQ_QUOTA) {
+		/* quota slave index should be a regular file */
+		if (S_ISREG(mode))
+			return &dt_quota_slv_features;
+		return ERR_PTR(-ENOENT);
+	}
+
+	if (seq >= FID_SEQ_NORMAL) {
+		/* object is part of the namespace, verify that it is a
+		 * directory; sorry, we can only deal with directory */
+		if (S_ISDIR(mode))
+			return &dt_directory_features;
+		return ERR_PTR(-ENOTDIR);
+	}
+
+	return ERR_PTR(-EOPNOTSUPP);
+}
+
+/*
+ * Fill a lu_idxpage with key/record pairs read for transfer via OBD_IDX_READ
+ * RPC
+ *
+ * Each entry is laid out as [64-bit hash, unless II_FL_NOHASH is set] + key +
+ * record. ii->ii_hash_end is updated with the hash of every position visited
+ * and ii->ii_count is incremented once if at least one entry was stored and
+ * no error occurred.
+ *
+ * \param env - is the environment passed by the caller
+ * \param lp  - is a pointer to the lu_page to fill
+ * \param nob - is the maximum number of bytes that should be copied
+ * \param iops - is the index operation vector associated with the index object
+ * \param it   - is a pointer to the current iterator
+ * \param attr - is the index attribute to pass to iops->rec()
+ * \param arg  - is a pointer to the idx_info structure
+ *
+ * \retval 0 if the container is full (or the fail-check fired) and the
+ *	   iterator still has entries
+ * \retval +ve value returned by iops->next() when there are no more entries;
+ *	   ii->ii_hash_end is then set to II_END_OFF
+ * \retval -EINVAL if \a nob cannot hold even a single entry
+ * \retval -ve other error returned by iops->rec() or iops->next()
+ */
+static int dt_index_page_build(const struct lu_env *env, union lu_page *lp,
+			       int nob, const struct dt_it_ops *iops,
+			       struct dt_it *it, __u32 attr, void *arg)
+{
+	struct idx_info		*ii = (struct idx_info *)arg;
+	struct lu_idxpage	*lip = &lp->lp_idx;
+	char			*entry;
+	int			 rc, size;
+	ENTRY;
+
+	/* no support for variable key & record size for now */
+	LASSERT((ii->ii_flags & II_FL_VARKEY) == 0);
+	LASSERT((ii->ii_flags & II_FL_VARREC) == 0);
+
+	/* initialize the header of the new container */
+	memset(lip, 0, LIP_HDR_SIZE);
+	lip->lip_magic = LIP_MAGIC;
+	nob	   -= LIP_HDR_SIZE;
+
+	/* compute size needed to store a key/record pair */
+	size = ii->ii_recsize + ii->ii_keysize;
+	if ((ii->ii_flags & II_FL_NOHASH) == 0)
+		/* add hash if the client wants it */
+		size += sizeof(__u64);
+
+	entry = lip->lip_entries;
+	do {
+		char		*tmp_entry = entry;
+		struct dt_key	*key;
+		__u64		 hash;
+
+		/* fetch 64-bit hash value */
+		hash = iops->store(env, it);
+		ii->ii_hash_end = hash;
+
+		if (OBD_FAIL_CHECK(OBD_FAIL_OBD_IDX_READ_BREAK)) {
+			if (lip->lip_nr != 0)
+				GOTO(out, rc = 0);
+		}
+
+		if (nob < size) {
+			/* out of room: an error only if nothing fit at all */
+			if (lip->lip_nr == 0)
+				GOTO(out, rc = -EINVAL);
+			GOTO(out, rc = 0);
+		}
+
+		if ((ii->ii_flags & II_FL_NOHASH) == 0) {
+			/* client wants the 64-bit hash value associated with
+			 * each record */
+			memcpy(tmp_entry, &hash, sizeof(hash));
+			tmp_entry += sizeof(hash);
+		}
+
+		/* then the key value */
+		LASSERT(iops->key_size(env, it) == ii->ii_keysize);
+		key = iops->key(env, it);
+		memcpy(tmp_entry, key, ii->ii_keysize);
+		tmp_entry += ii->ii_keysize;
+
+		/* and finally the record; on -ESTALE the entry is not
+		 * committed and its slot is reused by the next one */
+		rc = iops->rec(env, it, (struct dt_rec *)tmp_entry, attr);
+		if (rc != -ESTALE) {
+			if (rc != 0)
+				GOTO(out, rc);
+
+			/* hash/key/record successfully copied! */
+			lip->lip_nr++;
+			if (unlikely(lip->lip_nr == 1 && ii->ii_count == 0))
+				ii->ii_hash_start = hash;
+			entry = tmp_entry + ii->ii_recsize;
+			nob -= size;
+		}
+
+		/* move on to the next record, retrying while iops->next()
+		 * reports -ESTALE */
+		do {
+			rc = iops->next(env, it);
+		} while (rc == -ESTALE);
+
+	} while (rc == 0);
+
+	GOTO(out, rc);
+out:
+	if (rc >= 0 && lip->lip_nr > 0)
+		/* one more container */
+		ii->ii_count++;
+	if (rc > 0)
+		/* no more entries */
+		ii->ii_hash_end = II_END_OFF;
+	return rc;
+}
+
+/*
+ * Walk index and fill lu_page containers with key/record pairs
+ *
+ * Every page of \a rdpg is mapped with kmap() while its LU_PAGE_COUNT
+ * containers are filled and unmapped again before moving on to the next one.
+ *
+ * \param env - is the environment passed by the caller
+ * \param obj - is the index object to parse
+ * \param rdpg - is the lu_rdpg descriptor associated with the transfer
+ * \param filler - is the callback function responsible for filling a lu_page
+ *		 with key/record pairs in the format wanted by the caller
+ * \param arg    - is an opaque argument passed to the filler function
+ *
+ * \retval sum (in bytes) of all filled lu_pages, capped at rdpg->rp_count
+ * \retval -EFAULT if rdpg->rp_count is zero
+ * \retval -ve errno on failure
+ */
+int dt_index_walk(const struct lu_env *env, struct dt_object *obj,
+		  const struct lu_rdpg *rdpg, dt_index_page_build_t filler,
+		  void *arg)
+{
+	struct dt_it		*it;
+	const struct dt_it_ops	*iops;
+	unsigned int		 pageidx, nob, nlupgs = 0;
+	int			 rc;
+	ENTRY;
+
+	LASSERT(rdpg->rp_pages != NULL);
+	LASSERT(obj->do_index_ops != NULL);
+
+	nob = rdpg->rp_count;
+	if (nob == 0)
+		RETURN(-EFAULT);
+
+	/* Iterate through index and fill containers from @rdpg */
+	iops = &obj->do_index_ops->dio_it;
+	LASSERT(iops != NULL);
+	it = iops->init(env, obj, rdpg->rp_attrs, BYPASS_CAPA);
+	if (IS_ERR(it))
+		RETURN(PTR_ERR(it));
+
+	rc = iops->load(env, it, rdpg->rp_hash);
+	if (rc == 0) {
+		/*
+		 * Iterator didn't find record with exactly the key requested.
+		 *
+		 * It is currently either
+		 *
+		 *     - positioned above record with key less than
+		 *     requested---skip it.
+		 *     - or not positioned at all (is in IAM_IT_SKEWED
+		 *     state)---position it on the next item.
+		 */
+		rc = iops->next(env, it);
+	} else if (rc > 0) {
+		rc = 0;
+	}
+
+	/* Fill containers one after the other. There might be multiple
+	 * containers per physical page.
+	 *
+	 * At this point and across for-loop:
+	 *  rc == 0 -> ok, proceed.
+	 *  rc >  0 -> end of index.
+	 *  rc <  0 -> error. */
+	for (pageidx = 0; rc == 0 && nob > 0; pageidx++) {
+		union lu_page	*lp;
+		int		 i;
+
+		LASSERT(pageidx < rdpg->rp_npages);
+		lp = kmap(rdpg->rp_pages[pageidx]);
+
+		/* fill lu pages */
+		for (i = 0; i < LU_PAGE_COUNT; i++, lp++, nob -= LU_PAGE_SIZE) {
+			rc = filler(env, lp, min_t(int, nob, LU_PAGE_SIZE),
+				    iops, it, rdpg->rp_attrs, arg);
+			if (rc < 0)
+				break;
+			/* one more lu_page */
+			nlupgs++;
+			if (rc > 0)
+				/* end of index */
+				break;
+		}
+		/* unmap the page mapped above: it is indexed by pageidx, not
+		 * by the container counter of the inner loop */
+		kunmap(rdpg->rp_pages[pageidx]);
+	}
+
+	iops->put(env, it);
+	iops->fini(env, it);
+
+	if (rc >= 0)
+		rc = min_t(unsigned int, nlupgs * LU_PAGE_SIZE, rdpg->rp_count);
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(dt_index_walk);
+
+/**
+ * Walk key/record pairs of an index and copy them into 4KB containers to be
+ * transferred over the network. This is the common handler for OBD_IDX_READ
+ * RPC processing.
+ *
+ * Only quota index files are served: FIDs in the normal sequence range are
+ * refused with -EOPNOTSUPP and any other non-quota FID with -EPERM. On
+ * return \a ii carries the supported flags, the key/record sizes, the object
+ * version and the hash range covered.
+ *
+ * \param env - is the environment passed by the caller
+ * \param dev - is the dt_device storing the index
+ * \param ii  - is the idx_info structure packed by the client in the
+ *	      OBD_IDX_READ request
+ * \param rdpg - is the lu_rdpg descriptor
+ *
+ * \retval on success, return sum (in bytes) of all filled containers
+ * \retval -EFAULT if rdpg->rp_count is zero or not a multiple of LU_PAGE_SIZE
+ * \retval appropriate error otherwise.
+ */
+int dt_index_read(const struct lu_env *env, struct dt_device *dev,
+		  struct idx_info *ii, const struct lu_rdpg *rdpg)
+{
+	const struct dt_index_features	*feat;
+	struct dt_object		*obj;
+	int				 rc;
+	ENTRY;
+
+	/* rp_count shouldn't be null and should be a multiple of the container
+	 * size; either violation alone makes the request invalid */
+	if (rdpg->rp_count <= 0 || (rdpg->rp_count & (LU_PAGE_SIZE - 1)) != 0)
+		RETURN(-EFAULT);
+
+	if (fid_seq(&ii->ii_fid) >= FID_SEQ_NORMAL)
+		/* we don't support directory transfer via OBD_IDX_READ for the
+		 * time being */
+		RETURN(-EOPNOTSUPP);
+
+	if (!fid_is_quota(&ii->ii_fid))
+		/* block access to all local files except quota files */
+		RETURN(-EPERM);
+
+	/* lookup index object subject to the transfer */
+	obj = dt_locate(env, dev, &ii->ii_fid);
+	if (IS_ERR(obj))
+		RETURN(PTR_ERR(obj));
+	if (dt_object_exists(obj) == 0)
+		GOTO(out, rc = -ENOENT);
+
+	/* fetch index features associated with index object */
+	feat = dt_index_feat_select(fid_seq(&ii->ii_fid),
+				    lu_object_attr(&obj->do_lu));
+	if (IS_ERR(feat))
+		GOTO(out, rc = PTR_ERR(feat));
+
+	/* load index feature if not done already */
+	if (obj->do_index_ops == NULL) {
+		rc = obj->do_ops->do_index_try(env, obj, feat);
+		if (rc)
+			GOTO(out, rc);
+	}
+
+	/* fill ii_flags with supported index features */
+	ii->ii_flags &= II_FL_NOHASH;
+
+	ii->ii_keysize = feat->dif_keysize_max;
+	if ((feat->dif_flags & DT_IND_VARKEY) != 0) {
+		/* key size is variable */
+		ii->ii_flags |= II_FL_VARKEY;
+		/* we don't support variable key size for the time being */
+		GOTO(out, rc = -EOPNOTSUPP);
+	}
+
+	ii->ii_recsize = feat->dif_recsize_max;
+	if ((feat->dif_flags & DT_IND_VARREC) != 0) {
+		/* record size is variable */
+		ii->ii_flags |= II_FL_VARREC;
+		/* we don't support variable record size for the time being */
+		GOTO(out, rc = -EOPNOTSUPP);
+	}
+
+	if ((feat->dif_flags & DT_IND_NONUNQ) != 0)
+		/* key isn't necessarily unique */
+		ii->ii_flags |= II_FL_NONUNQ;
+
+	dt_read_lock(env, obj, 0);
+	/* fetch object version before walking the index */
+	ii->ii_version = dt_version_get(env, obj);
+
+	/* walk the index and fill lu_idxpages with key/record pairs */
+	rc = dt_index_walk(env, obj, rdpg, dt_index_page_build, ii);
+	dt_read_unlock(env, obj);
+
+	if (rc == 0) {
+		/* index is empty */
+		LASSERT(ii->ii_count == 0);
+		ii->ii_hash_end = II_END_OFF;
+	}
+
+	GOTO(out, rc);
+out:
+	lu_object_put(env, &obj->do_lu);
+	return rc;
+}
+EXPORT_SYMBOL(dt_index_read);
+
+#ifdef LPROCFS
+
+/* procfs read handler: print the block size (os_bsize) that dt_statfs()
+ * reports for the dt_device passed in \a data.
+ * Returns the snprintf() result, or the dt_statfs() error; *eof is set only
+ * on success. \a start and \a off are not used. */
+int lprocfs_dt_rd_blksize(char *page, char **start, off_t off,
+			  int count, int *eof, void *data)
+{
+	struct dt_device *dev = data;
+	struct obd_statfs sfs;
+	int rc;
+
+	rc = dt_statfs(NULL, dev, &sfs);
+	if (rc != 0)
+		return rc;
+
+	*eof = 1;
+	return snprintf(page, count, "%u\n", (unsigned) sfs.os_bsize);
+}
+EXPORT_SYMBOL(lprocfs_dt_rd_blksize);
+
+/* procfs read handler: print the total block count (os_blocks) reported by
+ * dt_statfs() for the dt_device in \a data, scaled to kilobytes. The scaling
+ * doubles the count once per halving of (os_bsize >> 10) that stays non-zero,
+ * which equals os_bsize / 1024 only for power-of-two block sizes >= 1024.
+ * Returns the snprintf() result, or the dt_statfs() error; *eof is set only
+ * on success. */
+int lprocfs_dt_rd_kbytestotal(char *page, char **start, off_t off,
+			      int count, int *eof, void *data)
+{
+	struct dt_device *dev = data;
+	struct obd_statfs sfs;
+	__u32 blk_kb;
+	__u64 kbytes;
+	int rc;
+
+	rc = dt_statfs(NULL, dev, &sfs);
+	if (rc != 0)
+		return rc;
+
+	kbytes = sfs.os_blocks;
+	blk_kb = sfs.os_bsize >> 10;
+	for (blk_kb >>= 1; blk_kb != 0; blk_kb >>= 1)
+		kbytes <<= 1;
+
+	*eof = 1;
+	return snprintf(page, count, LPU64"\n", kbytes);
+}
+EXPORT_SYMBOL(lprocfs_dt_rd_kbytestotal);
+
+/* procfs read handler: print the free block count (os_bfree) reported by
+ * dt_statfs() for the dt_device in \a data, scaled to kilobytes. The scaling
+ * doubles the count once per halving of (os_bsize >> 10) that stays non-zero,
+ * which equals os_bsize / 1024 only for power-of-two block sizes >= 1024.
+ * Returns the snprintf() result, or the dt_statfs() error; *eof is set only
+ * on success. */
+int lprocfs_dt_rd_kbytesfree(char *page, char **start, off_t off,
+			     int count, int *eof, void *data)
+{
+	struct dt_device *dev = data;
+	struct obd_statfs sfs;
+	__u32 blk_kb;
+	__u64 kbytes;
+	int rc;
+
+	rc = dt_statfs(NULL, dev, &sfs);
+	if (rc != 0)
+		return rc;
+
+	kbytes = sfs.os_bfree;
+	blk_kb = sfs.os_bsize >> 10;
+	for (blk_kb >>= 1; blk_kb != 0; blk_kb >>= 1)
+		kbytes <<= 1;
+
+	*eof = 1;
+	return snprintf(page, count, LPU64"\n", kbytes);
+}
+EXPORT_SYMBOL(lprocfs_dt_rd_kbytesfree);
+
+/* procfs read handler: print the available block count (os_bavail) reported
+ * by dt_statfs() for the dt_device in \a data, scaled to kilobytes. The
+ * scaling doubles the count once per halving of (os_bsize >> 10) that stays
+ * non-zero, which equals os_bsize / 1024 only for power-of-two block sizes
+ * >= 1024.
+ * Returns the snprintf() result, or the dt_statfs() error; *eof is set only
+ * on success. */
+int lprocfs_dt_rd_kbytesavail(char *page, char **start, off_t off,
+			      int count, int *eof, void *data)
+{
+	struct dt_device *dev = data;
+	struct obd_statfs sfs;
+	__u32 blk_kb;
+	__u64 kbytes;
+	int rc;
+
+	rc = dt_statfs(NULL, dev, &sfs);
+	if (rc != 0)
+		return rc;
+
+	kbytes = sfs.os_bavail;
+	blk_kb = sfs.os_bsize >> 10;
+	for (blk_kb >>= 1; blk_kb != 0; blk_kb >>= 1)
+		kbytes <<= 1;
+
+	*eof = 1;
+	return snprintf(page, count, LPU64"\n", kbytes);
+}
+EXPORT_SYMBOL(lprocfs_dt_rd_kbytesavail);
+
+/* procfs read handler: print the os_files value that dt_statfs() reports for
+ * the dt_device passed in \a data.
+ * Returns the snprintf() result, or the dt_statfs() error; *eof is set only
+ * on success. */
+int lprocfs_dt_rd_filestotal(char *page, char **start, off_t off,
+			     int count, int *eof, void *data)
+{
+	struct dt_device *dev = data;
+	struct obd_statfs sfs;
+	int rc;
+
+	rc = dt_statfs(NULL, dev, &sfs);
+	if (rc != 0)
+		return rc;
+
+	*eof = 1;
+	return snprintf(page, count, LPU64"\n", sfs.os_files);
+}
+EXPORT_SYMBOL(lprocfs_dt_rd_filestotal);
+
+/* procfs read handler: print the os_ffree value that dt_statfs() reports for
+ * the dt_device passed in \a data.
+ * Returns the snprintf() result, or the dt_statfs() error; *eof is set only
+ * on success. */
+int lprocfs_dt_rd_filesfree(char *page, char **start, off_t off,
+			    int count, int *eof, void *data)
+{
+	struct dt_device *dev = data;
+	struct obd_statfs sfs;
+	int rc;
+
+	rc = dt_statfs(NULL, dev, &sfs);
+	if (rc != 0)
+		return rc;
+
+	*eof = 1;
+	return snprintf(page, count, LPU64"\n", sfs.os_ffree);
+}
+EXPORT_SYMBOL(lprocfs_dt_rd_filesfree);
+
+#endif /* LPROCFS */

diff --git a/drivers/staging/lustre/lustre/obdclass/genops.c b/drivers/staging/lustre/lustre/obdclass/genops.c
new file mode 100644
index 0000000..d96876e
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/genops.c

@@ -0,0 +1,1853 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/genops.c
+ *
+ * These are the only exported functions, they provide some generic
+ * infrastructure for managing object devices
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+#include <obd_ost.h>
+#include <obd_class.h>
+#include <lprocfs_status.h>
+
+/* List of registered obd types (defined elsewhere).  The class_*_type()
+ * functions in this file walk and modify it under obd_types_lock. */
+extern struct list_head obd_types;
+spinlock_t obd_types_lock;
+
+/* Slab caches created by obd_init_caches() and torn down by
+ * obd_cleanup_caches(). */
+struct kmem_cache *obd_device_cachep;
+struct kmem_cache *obdo_cachep;
+EXPORT_SYMBOL(obdo_cachep);
+struct kmem_cache *import_cachep;
+
+/* NOTE(review): presumably the queues of imports/exports waiting for
+ * deferred destruction, guarded by obd_zombie_impexp_lock -- confirm
+ * against the zombie handling code later in this file. */
+struct list_head      obd_zombie_imports;
+struct list_head      obd_zombie_exports;
+spinlock_t  obd_zombie_impexp_lock;
+static void obd_zombie_impexp_notify(void);
+/* called on the final class_export_put() / class_import_put() */
+static void obd_zombie_export_add(struct obd_export *exp);
+static void obd_zombie_import_add(struct obd_import *imp);
+static void print_export_data(struct obd_export *exp,
+			      const char *status, int locks);
+
+/* Hook used by class_export_destroy() and class_import_destroy() to
+ * release a ptlrpc connection.  It is not assigned in this part of the
+ * file; NOTE(review): presumably set by the ptlrpc module -- confirm. */
+int (*ptlrpc_put_connection_superhack)(struct ptlrpc_connection *c);
+EXPORT_SYMBOL(ptlrpc_put_connection_superhack);
+
+/*
+ * support functions: we could use inter-module communication, but this
+ * is more portable to other OS's
+ */
+/* Allocate an obd_device from obd_device_cachep and stamp its magic.
+ * Returns NULL when the slab allocation fails. */
+static struct obd_device *obd_device_alloc(void)
+{
+	struct obd_device *dev;
+
+	OBD_SLAB_ALLOC_PTR_GFP(dev, obd_device_cachep, __GFP_IO);
+	if (dev == NULL)
+		return NULL;
+
+	dev->obd_magic = OBD_DEVICE_MAGIC;
+	return dev;
+}
+
+/* Return an obd_device to obd_device_cachep.  The device must carry the
+ * expected magic, and a namespace that is still attached is treated as
+ * a fatal bug. */
+static void obd_device_free(struct obd_device *obd)
+{
+	LASSERT(obd != NULL);
+	LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC,
+		 "obd %p obd_magic %08x != %08x\n",
+		 obd, obd->obd_magic, OBD_DEVICE_MAGIC);
+
+	if (obd->obd_namespace) {
+		CERROR("obd %p: namespace %p was not properly cleaned up (obd_force=%d)!\n",
+		       obd, obd->obd_namespace, obd->obd_force);
+		LBUG();
+	}
+
+	lu_ref_fini(&obd->obd_reference);
+	OBD_SLAB_FREE_PTR(obd, obd_device_cachep);
+}
+
+/*
+ * Look up a registered obd type by exact name.
+ *
+ * Walks obd_types under obd_types_lock and returns the matching entry,
+ * or NULL when no registered type has that name.  No reference is taken
+ * on the returned type.
+ */
+struct obd_type *class_search_type(const char *name)
+{
+	struct obd_type *cur;
+	struct obd_type *found = NULL;
+
+	spin_lock(&obd_types_lock);
+	list_for_each_entry(cur, &obd_types, typ_chain) {
+		if (strcmp(cur->typ_name, name) == 0) {
+			found = cur;
+			break;
+		}
+	}
+	spin_unlock(&obd_types_lock);
+
+	return found;
+}
+EXPORT_SYMBOL(class_search_type);
+
+/*
+ * Find the obd type called @name and take a reference on it.
+ *
+ * If the type is not registered yet, try to load the module that
+ * provides it (after translating a few type names to module names) and
+ * search again.  On success typ_refcnt is bumped and a module reference
+ * is requested, both under obd_type_lock.  Returns NULL when the type
+ * cannot be found.
+ */
+struct obd_type *class_get_type(const char *name)
+{
+	struct obd_type *type;
+
+	type = class_search_type(name);
+	if (type == NULL) {
+		const char *modname = name;
+
+		/* the checks are applied in sequence to the translated name */
+		if (strcmp(modname, "obdfilter") == 0)
+			modname = "ofd";
+		if (strcmp(modname, LUSTRE_LWP_NAME) == 0)
+			modname = LUSTRE_OSP_NAME;
+		if (strncmp(modname, LUSTRE_MDS_NAME,
+			    strlen(LUSTRE_MDS_NAME)) == 0)
+			modname = LUSTRE_MDT_NAME;
+
+		if (request_module("%s", modname) != 0) {
+			LCONSOLE_ERROR_MSG(0x158, "Can't load module '%s'\n",
+					   modname);
+		} else {
+			CDEBUG(D_INFO, "Loaded module '%s'\n", modname);
+			type = class_search_type(name);
+		}
+	}
+
+	if (type == NULL)
+		return NULL;
+
+	spin_lock(&type->obd_type_lock);
+	type->typ_refcnt++;
+	/* NOTE(review): the try_module_get() result is ignored here */
+	try_module_get(type->typ_dt_ops->o_owner);
+	spin_unlock(&type->obd_type_lock);
+
+	return type;
+}
+EXPORT_SYMBOL(class_get_type);
+
+/* Release a type reference: decrement typ_refcnt and drop the module
+ * reference of the type's dt_ops owner, both under obd_type_lock. */
+void class_put_type(struct obd_type *type)
+{
+	LASSERT(type != NULL);
+
+	spin_lock(&type->obd_type_lock);
+	type->typ_refcnt--;
+	module_put(type->typ_dt_ops->o_owner);
+	spin_unlock(&type->obd_type_lock);
+}
+EXPORT_SYMBOL(class_put_type);
+
+#define CLASS_MAX_NAME 1024
+
+/*
+ * Register a new obd type.
+ *
+ * \param dt_ops  device operations; copied into the new type
+ * \param md_ops  metadata operations; optional, copied when non-NULL
+ * \param vars    lprocfs variables for the type's proc directory
+ * \param name    type name, shorter than CLASS_MAX_NAME
+ * \param ldt     optional lu_device_type, initialised with
+ *                lu_device_type_init() when non-NULL
+ *
+ * \retval 0        the type was added to obd_types
+ * \retval -EEXIST  a type with this name is already registered
+ * \retval -ENOMEM  allocation failure
+ * \retval other    error from lprocfs_register() or lu_device_type_init()
+ */
+int class_register_type(struct obd_ops *dt_ops, struct md_ops *md_ops,
+			struct lprocfs_vars *vars, const char *name,
+			struct lu_device_type *ldt)
+{
+	struct obd_type *type;
+	int rc = 0;
+	ENTRY;
+
+	/* sanity check */
+	LASSERT(strnlen(name, CLASS_MAX_NAME) < CLASS_MAX_NAME);
+
+	if (class_search_type(name)) {
+		CDEBUG(D_IOCTL, "Type %s already registered\n", name);
+		RETURN(-EEXIST);
+	}
+
+	rc = -ENOMEM;
+	OBD_ALLOC(type, sizeof(*type));
+	if (type == NULL)
+		RETURN(rc);
+
+	OBD_ALLOC_PTR(type->typ_dt_ops);
+	OBD_ALLOC_PTR(type->typ_md_ops);
+	OBD_ALLOC(type->typ_name, strlen(name) + 1);
+
+	if (type->typ_dt_ops == NULL ||
+	    type->typ_md_ops == NULL ||
+	    type->typ_name == NULL)
+		GOTO (failed, rc);
+
+	*(type->typ_dt_ops) = *dt_ops;
+	/* md_ops is optional */
+	if (md_ops)
+		*(type->typ_md_ops) = *md_ops;
+	strcpy(type->typ_name, name);
+	spin_lock_init(&type->obd_type_lock);
+
+#ifdef LPROCFS
+	type->typ_procroot = lprocfs_register(type->typ_name, proc_lustre_root,
+					      vars, type);
+	if (IS_ERR(type->typ_procroot)) {
+		rc = PTR_ERR(type->typ_procroot);
+		type->typ_procroot = NULL;
+		GOTO (failed, rc);
+	}
+#endif
+	if (ldt != NULL) {
+		type->typ_lu = ldt;
+		rc = lu_device_type_init(ldt);
+		if (rc != 0) {
+#ifdef LPROCFS
+			/* The proc directory was registered above; remove it
+			 * here, otherwise it would outlive the freed type. */
+			if (type->typ_procroot != NULL)
+				lprocfs_remove(&type->typ_procroot);
+#endif
+			GOTO (failed, rc);
+		}
+	}
+
+	spin_lock(&obd_types_lock);
+	list_add(&type->typ_chain, &obd_types);
+	spin_unlock(&obd_types_lock);
+
+	RETURN (0);
+
+ failed:
+	/* free whatever was allocated; any member may be NULL here */
+	if (type->typ_name != NULL)
+		OBD_FREE(type->typ_name, strlen(name) + 1);
+	if (type->typ_md_ops != NULL)
+		OBD_FREE_PTR(type->typ_md_ops);
+	if (type->typ_dt_ops != NULL)
+		OBD_FREE_PTR(type->typ_dt_ops);
+	OBD_FREE(type, sizeof(*type));
+	RETURN(rc);
+}
+EXPORT_SYMBOL(class_register_type);
+
+/*
+ * Unregister the obd type called @name.
+ *
+ * \retval 0        the type was unlinked from obd_types and freed
+ * \retval -EINVAL  no such type is registered
+ * \retval -EBUSY   the type still has references; its ops tables are
+ *                  freed but the type itself stays on the list
+ */
+int class_unregister_type(const char *name)
+{
+	struct obd_type *type = class_search_type(name);
+	ENTRY;
+
+	if (type == NULL) {
+		CERROR("unknown obd type\n");
+		RETURN(-EINVAL);
+	}
+
+	if (type->typ_refcnt != 0) {
+		CERROR("type %s has refcount (%d)\n", name, type->typ_refcnt);
+		/* This is a bad situation, let's make the best of it */
+		/* Remove ops, but leave the name for debugging */
+		/* NOTE(review): the type stays registered while typ_dt_ops
+		 * and typ_md_ops have just been freed; later users of this
+		 * type would touch them -- confirm this is acceptable. */
+		OBD_FREE_PTR(type->typ_dt_ops);
+		OBD_FREE_PTR(type->typ_md_ops);
+		RETURN(-EBUSY);
+	}
+
+	if (type->typ_procroot != NULL)
+		lprocfs_remove(&type->typ_procroot);
+
+	if (type->typ_lu != NULL)
+		lu_device_type_fini(type->typ_lu);
+
+	spin_lock(&obd_types_lock);
+	list_del(&type->typ_chain);
+	spin_unlock(&obd_types_lock);
+
+	/* @name compared equal to typ_name, so the lengths agree */
+	OBD_FREE(type->typ_name, strlen(name) + 1);
+	if (type->typ_dt_ops != NULL)
+		OBD_FREE_PTR(type->typ_dt_ops);
+	if (type->typ_md_ops != NULL)
+		OBD_FREE_PTR(type->typ_md_ops);
+	OBD_FREE(type, sizeof(*type));
+	RETURN(0);
+} /* class_unregister_type */
+EXPORT_SYMBOL(class_unregister_type);
+
+/**
+ * Create a new obd device.
+ *
+ * Find an empty slot in ::obd_devs[], create a new obd device in it.
+ * The whole table is scanned so that an existing device with the same
+ * name is detected even if it sits after the first free slot.
+ *
+ * \param[in] type_name obd device type string.
+ * \param[in] name      obd device name.
+ *
+ * \retval ERR_PTR(-EINVAL)    \a name is MAX_OBD_NAME bytes or longer
+ * \retval ERR_PTR(-ENODEV)    \a type_name is not a known type
+ * \retval ERR_PTR(-ENOMEM)    the device could not be allocated
+ * \retval ERR_PTR(-EEXIST)    a device called \a name already exists
+ * \retval ERR_PTR(-EOVERFLOW) no free slot is left in ::obd_devs[]
+ * \retval otherwise the newly created obd device pointer; it holds the
+ *	 type reference taken by class_get_type().
+ */
+struct obd_device *class_newdev(const char *type_name, const char *name)
+{
+	struct obd_device *result = NULL;
+	struct obd_device *newdev;
+	struct obd_type *type = NULL;
+	int i;
+	int new_obd_minor = 0;
+	ENTRY;
+
+	if (strlen(name) >= MAX_OBD_NAME) {
+		CERROR("name/uuid must be < %u bytes long\n", MAX_OBD_NAME);
+		RETURN(ERR_PTR(-EINVAL));
+	}
+
+	/* takes a type reference; dropped at out_type on failure */
+	type = class_get_type(type_name);
+	if (type == NULL){
+		CERROR("OBD: unknown type: %s\n", type_name);
+		RETURN(ERR_PTR(-ENODEV));
+	}
+
+	newdev = obd_device_alloc();
+	if (newdev == NULL)
+		GOTO(out_type, result = ERR_PTR(-ENOMEM));
+
+	LASSERT(newdev->obd_magic == OBD_DEVICE_MAGIC);
+
+	write_lock(&obd_dev_lock);
+	for (i = 0; i < class_devno_max(); i++) {
+		struct obd_device *obd = class_num2obd(i);
+
+		if (obd && (strcmp(name, obd->obd_name) == 0)) {
+			CERROR("Device %s already exists at %d, won't add\n",
+			       name, i);
+			/* a slot was already claimed earlier in this scan:
+			 * give it back before reporting the duplicate */
+			if (result) {
+				LASSERTF(result->obd_magic == OBD_DEVICE_MAGIC,
+					 "%p obd_magic %08x != %08x\n", result,
+					 result->obd_magic, OBD_DEVICE_MAGIC);
+				LASSERTF(result->obd_minor == new_obd_minor,
+					 "%p obd_minor %d != %d\n", result,
+					 result->obd_minor, new_obd_minor);
+
+				obd_devs[result->obd_minor] = NULL;
+				result->obd_name[0]='\0';
+			 }
+			result = ERR_PTR(-EEXIST);
+			break;
+		}
+		/* claim the first free slot, but keep scanning for a
+		 * duplicate name in the remaining slots */
+		if (!result && !obd) {
+			result = newdev;
+			result->obd_minor = i;
+			new_obd_minor = i;
+			result->obd_type = type;
+			strncpy(result->obd_name, name,
+				sizeof(result->obd_name) - 1);
+			obd_devs[i] = result;
+		}
+	}
+	write_unlock(&obd_dev_lock);
+
+	/* loop ran to completion without finding a free slot */
+	if (result == NULL && i >= class_devno_max()) {
+		CERROR("all %u OBD devices used, increase MAX_OBD_DEVICES\n",
+		       class_devno_max());
+		GOTO(out, result = ERR_PTR(-EOVERFLOW));
+	}
+
+	if (IS_ERR(result))
+		GOTO(out, result);
+
+	CDEBUG(D_IOCTL, "Adding new device %s (%p)\n",
+	       result->obd_name, result);
+
+	RETURN(result);
+out:
+	obd_device_free(newdev);
+out_type:
+	class_put_type(type);
+	return result;
+}
+
+/* Remove @obd from obd_devs[], free it and drop the reference it held
+ * on its obd type. */
+void class_release_dev(struct obd_device *obd)
+{
+	struct obd_type *type = obd->obd_type;
+
+	LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC,
+		 "%p obd_magic %08x != %08x\n",
+		 obd, obd->obd_magic, OBD_DEVICE_MAGIC);
+	LASSERTF(obd == obd_devs[obd->obd_minor],
+		 "obd %p != obd_devs[%d] %p\n",
+		 obd, obd->obd_minor, obd_devs[obd->obd_minor]);
+	LASSERT(type != NULL);
+
+	CDEBUG(D_INFO, "Release obd device %s at %d obd_type name =%s\n",
+	       obd->obd_name, obd->obd_minor, type->typ_name);
+
+	/* clear the table slot before the device memory goes away */
+	write_lock(&obd_dev_lock);
+	obd_devs[obd->obd_minor] = NULL;
+	write_unlock(&obd_dev_lock);
+
+	obd_device_free(obd);
+	class_put_type(type);
+}
+
+/*
+ * Translate a device name into its index in obd_devs[].
+ *
+ * Returns the index of the device called @name if it is attached, and
+ * -1 when @name is NULL, unknown, or names a device that has not
+ * finished attaching.
+ */
+int class_name2dev(const char *name)
+{
+	int devno = -1;
+	int i;
+
+	if (name == NULL)
+		return -1;
+
+	read_lock(&obd_dev_lock);
+	for (i = 0; i < class_devno_max(); i++) {
+		struct obd_device *obd = class_num2obd(i);
+
+		if (obd == NULL || strcmp(name, obd->obd_name) != 0)
+			continue;
+
+		/* Make sure we finished attaching before we give
+		   out any references */
+		LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC);
+		if (obd->obd_attached)
+			devno = i;
+		break;
+	}
+	read_unlock(&obd_dev_lock);
+
+	return devno;
+}
+EXPORT_SYMBOL(class_name2dev);
+
+/*
+ * Translate a device name into its obd_device.
+ *
+ * Returns NULL when class_name2dev() does not find an attached device
+ * called @name.  No reference is taken on the returned device.
+ */
+struct obd_device *class_name2obd(const char *name)
+{
+	int dev = class_name2dev(name);
+
+	/* valid indices are 0 .. class_devno_max() - 1 */
+	if (dev < 0 || dev >= class_devno_max())
+		return NULL;
+	return class_num2obd(dev);
+}
+EXPORT_SYMBOL(class_name2obd);
+
+/* Return the obd_devs[] index of the first device whose obd_uuid equals
+ * @uuid, or -1 if there is none. */
+int class_uuid2dev(struct obd_uuid *uuid)
+{
+	int devno = -1;
+	int i;
+
+	read_lock(&obd_dev_lock);
+	for (i = 0; i < class_devno_max(); i++) {
+		struct obd_device *obd = class_num2obd(i);
+
+		if (obd == NULL || !obd_uuid_equals(uuid, &obd->obd_uuid))
+			continue;
+
+		LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC);
+		devno = i;
+		break;
+	}
+	read_unlock(&obd_dev_lock);
+
+	return devno;
+}
+EXPORT_SYMBOL(class_uuid2dev);
+
+/* Return the device whose obd_uuid equals @uuid, or NULL if
+ * class_uuid2dev() finds none. */
+struct obd_device *class_uuid2obd(struct obd_uuid *uuid)
+{
+	int devno = class_uuid2dev(uuid);
+
+	return devno < 0 ? NULL : class_num2obd(devno);
+}
+EXPORT_SYMBOL(class_uuid2obd);
+
+/**
+ * Get obd device from ::obd_devs[]
+ *
+ * \param num [in] array index
+ *
+ * \retval NULL if \a num is outside 0 .. class_devno_max() - 1 or
+ *	 ::obd_devs[\a num] does not contain an obd device,
+ *	 otherwise return the obd device there.
+ */
+struct obd_device *class_num2obd(int num)
+{
+	struct obd_device *obd = NULL;
+
+	/* reject negative indices too: they would read before obd_devs[] */
+	if (num >= 0 && num < class_devno_max()) {
+		obd = obd_devs[num];
+		if (obd == NULL)
+			return NULL;
+
+		LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC,
+			 "%p obd_magic %08x != %08x\n",
+			 obd, obd->obd_magic, OBD_DEVICE_MAGIC);
+		LASSERTF(obd->obd_minor == num,
+			 "%p obd_minor %0d != %0d\n",
+			 obd, obd->obd_minor, num);
+	}
+
+	return obd;
+}
+EXPORT_SYMBOL(class_num2obd);
+
+/**
+ * Get obd devices count. Devices in any
+ *    state are counted
+ * \retval obd device count
+ */
+int get_devices_count(void)
+{
+	int index, max_index = class_devno_max(), dev_count = 0;
+
+	read_lock(&obd_dev_lock);
+	/* valid slots are 0 .. max_index - 1, as in the other scans */
+	for (index = 0; index < max_index; index++) {
+		struct obd_device *obd = class_num2obd(index);
+		if (obd != NULL)
+			dev_count++;
+	}
+	read_unlock(&obd_dev_lock);
+
+	return dev_count;
+}
+EXPORT_SYMBOL(get_devices_count);
+
+/* Print one console line per device in obd_devs[]: index, state code
+ * (ST stopping, UP set up, AT attached, -- otherwise), type name,
+ * device name, uuid and reference count. */
+void class_obd_list(void)
+{
+	int i;
+
+	read_lock(&obd_dev_lock);
+	for (i = 0; i < class_devno_max(); i++) {
+		struct obd_device *obd = class_num2obd(i);
+		char *state = "--";
+
+		if (obd == NULL)
+			continue;
+
+		/* most advanced state wins */
+		if (obd->obd_stopping)
+			state = "ST";
+		else if (obd->obd_set_up)
+			state = "UP";
+		else if (obd->obd_attached)
+			state = "AT";
+
+		LCONSOLE(D_CONFIG, "%3d %s %s %s %s %d\n",
+			 i, state, obd->obd_type->typ_name,
+			 obd->obd_name, obd->obd_uuid.uuid,
+			 atomic_read(&obd->obd_refcount));
+	}
+	read_unlock(&obd_dev_lock);
+}
+
+/* Find a client OBD connected to tgt_uuid.  Only devices whose type
+   name starts with typ_name are considered.  When grp_uuid is given the
+   device's own uuid must match it as well; otherwise the first client
+   of the target is returned.  Returns NULL if nothing matches. */
+struct obd_device * class_find_client_obd(struct obd_uuid *tgt_uuid,
+					  const char * typ_name,
+					  struct obd_uuid *grp_uuid)
+{
+	struct obd_device *found = NULL;
+	int i;
+
+	read_lock(&obd_dev_lock);
+	for (i = 0; i < class_devno_max(); i++) {
+		struct obd_device *obd = class_num2obd(i);
+
+		if (obd == NULL)
+			continue;
+		if (strncmp(obd->obd_type->typ_name, typ_name,
+			    strlen(typ_name)) != 0)
+			continue;
+		if (!obd_uuid_equals(tgt_uuid, &obd->u.cli.cl_target_uuid))
+			continue;
+		if (grp_uuid != NULL &&
+		    !obd_uuid_equals(grp_uuid, &obd->obd_uuid))
+			continue;
+
+		found = obd;
+		break;
+	}
+	read_unlock(&obd_dev_lock);
+
+	return found;
+}
+EXPORT_SYMBOL(class_find_client_obd);
+
+/* Iterate over the devices whose uuid equals grp_uuid.  The search
+   starts at index *next; when a device is found, the index following it
+   is stored back in *next.  With next == NULL the search starts at 0
+   and always yields the first match.  An out-of-range *next, or no
+   further match, returns NULL. */
+struct obd_device * class_devices_in_group(struct obd_uuid *grp_uuid, int *next)
+{
+	struct obd_device *match = NULL;
+	int i = 0;
+
+	if (next != NULL) {
+		if (*next < 0 || *next >= class_devno_max())
+			return NULL;
+		i = *next;
+	}
+
+	read_lock(&obd_dev_lock);
+	for (; i < class_devno_max(); i++) {
+		struct obd_device *obd = class_num2obd(i);
+
+		if (obd == NULL || !obd_uuid_equals(grp_uuid, &obd->obd_uuid))
+			continue;
+
+		if (next != NULL)
+			*next = i+1;
+		match = obd;
+		break;
+	}
+	read_unlock(&obd_dev_lock);
+
+	return match;
+}
+EXPORT_SYMBOL(class_devices_in_group);
+
+/**
+ * to notify sptlrpc log for \a fsname has changed, let every relevant OBD
+ * adjust sptlrpc settings accordingly.
+ *
+ * Every set-up, non-stopping mdc/osc/mdt/ost device whose name starts
+ * with the first \a namelen bytes of \a fsname receives a
+ * KEY_SPTLRPC_CONF obd_set_info_async() call.
+ *
+ * \retval 0 if every notification succeeded (or no device matched)
+ * \retval the first non-zero obd_set_info_async() result otherwise;
+ *	 the remaining devices are still notified.
+ */
+int class_notify_sptlrpc_conf(const char *fsname, int namelen)
+{
+	struct obd_device  *obd;
+	const char	 *type;
+	int		 i, rc = 0, rc2;
+
+	LASSERT(namelen > 0);
+
+	read_lock(&obd_dev_lock);
+	for (i = 0; i < class_devno_max(); i++) {
+		obd = class_num2obd(i);
+
+		if (obd == NULL || obd->obd_set_up == 0 || obd->obd_stopping)
+			continue;
+
+		/* only notify mdc, osc, mdt, ost */
+		type = obd->obd_type->typ_name;
+		if (strcmp(type, LUSTRE_MDC_NAME) != 0 &&
+		    strcmp(type, LUSTRE_OSC_NAME) != 0 &&
+		    strcmp(type, LUSTRE_MDT_NAME) != 0 &&
+		    strcmp(type, LUSTRE_OST_NAME) != 0)
+			continue;
+
+		if (strncmp(obd->obd_name, fsname, namelen))
+			continue;
+
+		/* take a device reference, then drop obd_dev_lock around
+		 * the notification; the lock is retaken before the scan
+		 * continues */
+		class_incref(obd, __FUNCTION__, obd);
+		read_unlock(&obd_dev_lock);
+		rc2 = obd_set_info_async(NULL, obd->obd_self_export,
+					 sizeof(KEY_SPTLRPC_CONF),
+					 KEY_SPTLRPC_CONF, 0, NULL, NULL);
+		/* remember only the first failure */
+		rc = rc ? rc : rc2;
+		class_decref(obd, __FUNCTION__, obd);
+		read_lock(&obd_dev_lock);
+	}
+	read_unlock(&obd_dev_lock);
+	return rc;
+}
+EXPORT_SYMBOL(class_notify_sptlrpc_conf);
+
+/* Destroy the slab cache *cachep, if any, and reset the pointer. */
+static void obd_destroy_one_cache(struct kmem_cache **cachep)
+{
+	if (*cachep) {
+		kmem_cache_destroy(*cachep);
+		*cachep = NULL;
+	}
+}
+
+/* Destroy every slab cache set up by obd_init_caches().  Caches that
+ * were never created are skipped, so this is also used to unwind a
+ * partially failed initialisation. */
+void obd_cleanup_caches(void)
+{
+	ENTRY;
+	obd_destroy_one_cache(&obd_device_cachep);
+	obd_destroy_one_cache(&obdo_cachep);
+	obd_destroy_one_cache(&import_cachep);
+	obd_destroy_one_cache(&capa_cachep);
+	EXIT;
+}
+
+/* Create the obd_device, obdo, import and capa slab caches.  Each cache
+ * pointer must be NULL on entry.  Returns 0 on success; on any failure
+ * the caches created so far are destroyed and -ENOMEM is returned. */
+int obd_init_caches(void)
+{
+	ENTRY;
+
+	LASSERT(obd_device_cachep == NULL);
+	obd_device_cachep = kmem_cache_create("ll_obd_dev_cache",
+					      sizeof(struct obd_device),
+					      0, 0, NULL);
+	if (obd_device_cachep == NULL)
+		GOTO(out, -ENOMEM);
+
+	LASSERT(obdo_cachep == NULL);
+	obdo_cachep = kmem_cache_create("ll_obdo_cache",
+					sizeof(struct obdo), 0, 0, NULL);
+	if (obdo_cachep == NULL)
+		GOTO(out, -ENOMEM);
+
+	LASSERT(import_cachep == NULL);
+	import_cachep = kmem_cache_create("ll_import_cache",
+					  sizeof(struct obd_import),
+					  0, 0, NULL);
+	if (import_cachep == NULL)
+		GOTO(out, -ENOMEM);
+
+	LASSERT(capa_cachep == NULL);
+	capa_cachep = kmem_cache_create("capa_cache",
+					sizeof(struct obd_capa), 0, 0, NULL);
+	if (capa_cachep == NULL)
+		GOTO(out, -ENOMEM);
+
+	RETURN(0);
+
+ out:
+	obd_cleanup_caches();
+	RETURN(-ENOMEM);
+}
+
+/* Map a connection handle to its export via class_handle2object().
+ * Returns NULL for a NULL handle and for the cookie value -1, which
+ * requests a new connection. */
+struct obd_export *class_conn2export(struct lustre_handle *conn)
+{
+	struct obd_export *exp;
+	ENTRY;
+
+	if (conn == NULL) {
+		CDEBUG(D_CACHE, "looking for null handle\n");
+		RETURN(NULL);
+	}
+
+	/* this means assign a new connection */
+	if (conn->cookie == -1) {
+		CDEBUG(D_CACHE, "want a new connection\n");
+		RETURN(NULL);
+	}
+
+	CDEBUG(D_INFO, "looking for export cookie "LPX64"\n", conn->cookie);
+	exp = class_handle2object(conn->cookie);
+	RETURN(exp);
+}
+EXPORT_SYMBOL(class_conn2export);
+
+/* Return the device an export belongs to; NULL-safe. */
+struct obd_device *class_exp2obd(struct obd_export *exp)
+{
+	return exp ? exp->exp_obd : NULL;
+}
+EXPORT_SYMBOL(class_exp2obd);
+
+/* Map a connection handle to the device of its export.  The export
+ * looked up with class_conn2export() is put again before returning.
+ * Returns NULL when no export is found. */
+struct obd_device *class_conn2obd(struct lustre_handle *conn)
+{
+	struct obd_export *exp = class_conn2export(conn);
+	struct obd_device *obd;
+
+	if (exp == NULL)
+		return NULL;
+
+	obd = exp->exp_obd;
+	class_export_put(exp);
+	return obd;
+}
+EXPORT_SYMBOL(class_conn2obd);
+
+/* Return the client import of the device behind @exp, or NULL if the
+ * export has no device.  @exp itself must not be NULL. */
+struct obd_import *class_exp2cliimp(struct obd_export *exp)
+{
+	struct obd_device *dev = exp->exp_obd;
+
+	return dev != NULL ? dev->u.cli.cl_import : NULL;
+}
+EXPORT_SYMBOL(class_exp2cliimp);
+
+/* Return the client import of the device that @conn resolves to, or
+ * NULL if class_conn2obd() finds no device. */
+struct obd_import *class_conn2cliimp(struct lustre_handle *conn)
+{
+	struct obd_device *dev = class_conn2obd(conn);
+
+	return dev != NULL ? dev->u.cli.cl_import : NULL;
+}
+EXPORT_SYMBOL(class_conn2cliimp);
+
+/* Export management functions */
+/*
+ * Final teardown of an export.  The export's refcount must already be
+ * zero and all of its reply/request lists must be empty (both are
+ * asserted).  Releases the connection, runs obd_destroy_export(), drops
+ * the "export" reference on the device and frees the export through
+ * OBD_FREE_RCU().
+ */
+static void class_export_destroy(struct obd_export *exp)
+{
+	struct obd_device *obd = exp->exp_obd;
+	ENTRY;
+
+	LASSERT_ATOMIC_ZERO(&exp->exp_refcount);
+	LASSERT(obd != NULL);
+
+	CDEBUG(D_IOCTL, "destroying export %p/%s for %s\n", exp,
+	       exp->exp_client_uuid.uuid, obd->obd_name);
+
+	/* "Local" exports (lctl, LOV->{mdc,osc}) have no connection. */
+	if (exp->exp_connection)
+		ptlrpc_put_connection_superhack(exp->exp_connection);
+
+	LASSERT(list_empty(&exp->exp_outstanding_replies));
+	LASSERT(list_empty(&exp->exp_uncommitted_replies));
+	LASSERT(list_empty(&exp->exp_req_replay_queue));
+	LASSERT(list_empty(&exp->exp_hp_rpcs));
+	obd_destroy_export(exp);
+	/* pairs with class_incref(obd, "export", ...) in class_new_export() */
+	class_decref(obd, "export", exp);
+
+	OBD_FREE_RCU(exp, sizeof(*exp), &exp->exp_handle);
+	EXIT;
+}
+
+/* hop_addref callback for export handles: takes an export reference.
+ * The void * argument is passed straight to class_export_get(). */
+static void export_handle_addref(void *export)
+{
+	class_export_get(export);
+}
+
+/* Handle operations registered for every export by class_new_export()
+ * through class_handle_hash(); no hop_free callback is provided. */
+static struct portals_handle_ops export_handle_ops = {
+	.hop_addref = export_handle_addref,
+	.hop_free   = NULL,
+};
+
+/* Take a reference on @exp and return it, so the call can be chained.
+ * @exp must not be NULL. */
+struct obd_export *class_export_get(struct obd_export *exp)
+{
+	atomic_inc(&exp->exp_refcount);
+	CDEBUG(D_INFO, "GETting export %p : new refcount %d\n", exp,
+	       atomic_read(&exp->exp_refcount));
+	return exp;
+}
+EXPORT_SYMBOL(class_export_get);
+
+/* Drop a reference on @exp.  On the final put the export's lprocfs
+ * state is cleaned up and the export is handed to
+ * obd_zombie_export_add(); the caller must not touch it afterwards. */
+void class_export_put(struct obd_export *exp)
+{
+	LASSERT(exp != NULL);
+	LASSERT_ATOMIC_GT_LT(&exp->exp_refcount, 0, LI_POISON);
+	CDEBUG(D_INFO, "PUTting export %p : new refcount %d\n", exp,
+	       atomic_read(&exp->exp_refcount) - 1);
+
+	if (!atomic_dec_and_test(&exp->exp_refcount))
+		return;
+
+	LASSERT(!list_empty(&exp->exp_obd_chain));
+	CDEBUG(D_IOCTL, "final put %p/%s\n",
+	       exp, exp->exp_client_uuid.uuid);
+
+	/* release nid stat reference */
+	lprocfs_exp_cleanup(exp);
+
+	obd_zombie_export_add(exp);
+}
+EXPORT_SYMBOL(class_export_put);
+
+/* Creates a new export, adds it to the hash table, and returns a
+ * pointer to it. The refcount is 2: one for the hash reference, and
+ * one for the pointer returned by this function.
+ *
+ * Returns ERR_PTR(-ENOMEM) if the export cannot be allocated,
+ * ERR_PTR(-ENODEV) if the device is stopping or has no uuid hash, and
+ * ERR_PTR(-EALREADY) if an export for cluuid is already hashed. */
+struct obd_export *class_new_export(struct obd_device *obd,
+				    struct obd_uuid *cluuid)
+{
+	struct obd_export *export;
+	cfs_hash_t *hash = NULL;
+	int rc = 0;
+	ENTRY;
+
+	OBD_ALLOC_PTR(export);
+	if (!export)
+		return ERR_PTR(-ENOMEM);
+
+	export->exp_conn_cnt = 0;
+	export->exp_lock_hash = NULL;
+	export->exp_flock_hash = NULL;
+	atomic_set(&export->exp_refcount, 2);
+	atomic_set(&export->exp_rpc_count, 0);
+	atomic_set(&export->exp_cb_count, 0);
+	atomic_set(&export->exp_locks_count, 0);
+#if LUSTRE_TRACKS_LOCK_EXP_REFS
+	INIT_LIST_HEAD(&export->exp_locks_list);
+	spin_lock_init(&export->exp_locks_list_guard);
+#endif
+	atomic_set(&export->exp_replay_count, 0);
+	export->exp_obd = obd;
+	INIT_LIST_HEAD(&export->exp_outstanding_replies);
+	spin_lock_init(&export->exp_uncommitted_replies_lock);
+	INIT_LIST_HEAD(&export->exp_uncommitted_replies);
+	INIT_LIST_HEAD(&export->exp_req_replay_queue);
+	INIT_LIST_HEAD(&export->exp_handle.h_link);
+	INIT_LIST_HEAD(&export->exp_hp_rpcs);
+	class_handle_hash(&export->exp_handle, &export_handle_ops);
+	export->exp_last_request_time = cfs_time_current_sec();
+	spin_lock_init(&export->exp_lock);
+	spin_lock_init(&export->exp_rpc_lock);
+	INIT_HLIST_NODE(&export->exp_uuid_hash);
+	INIT_HLIST_NODE(&export->exp_nid_hash);
+	spin_lock_init(&export->exp_bl_list_lock);
+	INIT_LIST_HEAD(&export->exp_bl_list);
+
+	export->exp_sp_peer = LUSTRE_SP_ANY;
+	export->exp_flvr.sf_rpc = SPTLRPC_FLVR_INVALID;
+	export->exp_client_uuid = *cluuid;
+	obd_init_export(export);
+
+	spin_lock(&obd->obd_dev_lock);
+	/* shouldn't happen, but might race */
+	if (obd->obd_stopping)
+		GOTO(exit_unlock, rc = -ENODEV);
+
+	/* hold the uuid hash across the unlocked insertion below */
+	hash = cfs_hash_getref(obd->obd_uuid_hash);
+	if (hash == NULL)
+		GOTO(exit_unlock, rc = -ENODEV);
+	spin_unlock(&obd->obd_dev_lock);
+
+	/* an export using the device's own uuid is not hashed */
+	if (!obd_uuid_equals(cluuid, &obd->obd_uuid)) {
+		rc = cfs_hash_add_unique(hash, cluuid, &export->exp_uuid_hash);
+		if (rc != 0) {
+			LCONSOLE_WARN("%s: denying duplicate export for %s, %d\n",
+				      obd->obd_name, cluuid->uuid, rc);
+			GOTO(exit_err, rc = -EALREADY);
+		}
+	}
+
+	/* obd_dev_lock was dropped above, so check obd_stopping again */
+	spin_lock(&obd->obd_dev_lock);
+	if (obd->obd_stopping) {
+		cfs_hash_del(hash, cluuid, &export->exp_uuid_hash);
+		GOTO(exit_unlock, rc = -ENODEV);
+	}
+
+	/* device reference dropped again in class_export_destroy() */
+	class_incref(obd, "export", export);
+	list_add(&export->exp_obd_chain, &export->exp_obd->obd_exports);
+	list_add_tail(&export->exp_obd_chain_timed,
+			  &export->exp_obd->obd_exports_timed);
+	export->exp_obd->obd_num_exports++;
+	spin_unlock(&obd->obd_dev_lock);
+	cfs_hash_putref(hash);
+	RETURN(export);
+
+exit_unlock:
+	spin_unlock(&obd->obd_dev_lock);
+exit_err:
+	/* hash is still NULL if the failure came before cfs_hash_getref() */
+	if (hash)
+		cfs_hash_putref(hash);
+	class_handle_unhash(&export->exp_handle);
+	LASSERT(hlist_unhashed(&export->exp_uuid_hash));
+	obd_destroy_export(export);
+	OBD_FREE_PTR(export);
+	return ERR_PTR(rc);
+}
+EXPORT_SYMBOL(class_new_export);
+
+/* Detach @exp from its device: unhash its handle, remove it from the
+ * uuid hash if it is hashed, move it to the device's unlinked-exports
+ * list and drop one export reference. */
+void class_unlink_export(struct obd_export *exp)
+{
+	struct obd_device *obd;
+
+	class_handle_unhash(&exp->exp_handle);
+
+	obd = exp->exp_obd;
+	spin_lock(&obd->obd_dev_lock);
+	/* delete an uuid-export hashitem from hashtables */
+	if (!hlist_unhashed(&exp->exp_uuid_hash))
+		cfs_hash_del(obd->obd_uuid_hash, &exp->exp_client_uuid,
+			     &exp->exp_uuid_hash);
+
+	list_move(&exp->exp_obd_chain, &obd->obd_unlinked_exports);
+	list_del_init(&exp->exp_obd_chain_timed);
+	obd->obd_num_exports--;
+	spin_unlock(&obd->obd_dev_lock);
+
+	class_export_put(exp);
+}
+EXPORT_SYMBOL(class_unlink_export);
+
+/* Import management functions */
+/* Final teardown of an import whose refcount is zero (asserted):
+ * release its connection and every entry on imp_conn_list, drop the
+ * "import" reference on the device and free the import through
+ * OBD_FREE_RCU(). */
+void class_import_destroy(struct obd_import *imp)
+{
+	struct list_head *conns = &imp->imp_conn_list;
+	ENTRY;
+
+	CDEBUG(D_IOCTL, "destroying import %p for %s\n", imp,
+		imp->imp_obd->obd_name);
+
+	LASSERT_ATOMIC_ZERO(&imp->imp_refcount);
+
+	ptlrpc_put_connection_superhack(imp->imp_connection);
+
+	/* pop and release connection entries until the list is empty */
+	while (!list_empty(conns)) {
+		struct obd_import_conn *conn;
+
+		conn = list_entry(conns->next, struct obd_import_conn,
+				  oic_item);
+		list_del_init(&conn->oic_item);
+		ptlrpc_put_connection_superhack(conn->oic_conn);
+		OBD_FREE(conn, sizeof(*conn));
+	}
+
+	LASSERT(imp->imp_sec == NULL);
+	class_decref(imp->imp_obd, "import", imp);
+	OBD_FREE_RCU(imp, sizeof(*imp), &imp->imp_handle);
+	EXIT;
+}
+
+/* hop_addref callback for import handles: takes an import reference.
+ * The void * argument is passed straight to class_import_get(). */
+static void import_handle_addref(void *import)
+{
+	class_import_get(import);
+}
+
+/* Handle operations registered for every import by class_new_import()
+ * through class_handle_hash(); no hop_free callback is provided. */
+static struct portals_handle_ops import_handle_ops = {
+	.hop_addref = import_handle_addref,
+	.hop_free   = NULL,
+};
+
+/* Take a reference on @import and return it, so the call can be
+ * chained.  @import and its imp_obd must not be NULL. */
+struct obd_import *class_import_get(struct obd_import *import)
+{
+	atomic_inc(&import->imp_refcount);
+	CDEBUG(D_INFO, "import %p refcount=%d obd=%s\n", import,
+	       atomic_read(&import->imp_refcount),
+	       import->imp_obd->obd_name);
+	return import;
+}
+EXPORT_SYMBOL(class_import_get);
+
+/*
+ * Drop a reference on @imp.  On the final put the import is handed to
+ * obd_zombie_import_add() for destruction.
+ *
+ * The import must not be dereferenced once this function has given up
+ * its reference: either this caller's final put or a concurrent final
+ * put by another holder may lead to the import being freed.  For that
+ * reason the refcount is only sanity-checked before the decrement.
+ */
+void class_import_put(struct obd_import *imp)
+{
+	ENTRY;
+
+	LASSERT(list_empty(&imp->imp_zombie_chain));
+	LASSERT_ATOMIC_GT_LT(&imp->imp_refcount, 0, LI_POISON);
+
+	CDEBUG(D_INFO, "import %p refcount=%d obd=%s\n", imp,
+	       atomic_read(&imp->imp_refcount) - 1,
+	       imp->imp_obd->obd_name);
+
+	if (atomic_dec_and_test(&imp->imp_refcount)) {
+		CDEBUG(D_INFO, "final put import %p\n", imp);
+		obd_zombie_import_add(imp);
+	}
+
+	EXIT;
+}
+EXPORT_SYMBOL(class_import_put);
+
+/* Initialise the adaptive-timeout state of an import: the net latency
+ * estimate starts at 0 and every per-portal service estimate starts at
+ * INITIAL_CONNECT_TIMEOUT with AT_FLG_NOHIST set. */
+static void init_imp_at(struct imp_at *at)
+{
+	int portal;
+
+	at_init(&at->iat_net_latency, 0, 0);
+
+	/* max service estimates are tracked on the server side, so
+	   don't use the AT history here, just use the last reported
+	   val. (But keep hist for proc histogram, worst_ever) */
+	for (portal = 0; portal < IMP_AT_MAX_PORTALS; portal++)
+		at_init(&at->iat_service_estimate[portal],
+			INITIAL_CONNECT_TIMEOUT, AT_FLG_NOHIST);
+}
+
+/*
+ * Allocate and initialise a new import for @obd.
+ *
+ * The import starts in state LUSTRE_IMP_NEW with a refcount of 2, takes
+ * an "import" reference on @obd and is registered with
+ * class_handle_hash().  Returns NULL if the allocation fails.
+ *
+ * NOTE(review): the two initial references presumably mirror
+ * class_new_export() (one for the handle hash, one for the caller) --
+ * confirm against the callers.
+ */
+struct obd_import *class_new_import(struct obd_device *obd)
+{
+	struct obd_import *imp;
+
+	OBD_ALLOC(imp, sizeof(*imp));
+	if (imp == NULL)
+		return NULL;
+
+	INIT_LIST_HEAD(&imp->imp_pinger_chain);
+	INIT_LIST_HEAD(&imp->imp_zombie_chain);
+	INIT_LIST_HEAD(&imp->imp_replay_list);
+	INIT_LIST_HEAD(&imp->imp_sending_list);
+	INIT_LIST_HEAD(&imp->imp_delayed_list);
+	spin_lock_init(&imp->imp_lock);
+	imp->imp_last_success_conn = 0;
+	imp->imp_state = LUSTRE_IMP_NEW;
+	/* device reference dropped again in class_import_destroy() */
+	imp->imp_obd = class_incref(obd, "import", imp);
+	mutex_init(&imp->imp_sec_mutex);
+	init_waitqueue_head(&imp->imp_recovery_waitq);
+
+	atomic_set(&imp->imp_refcount, 2);
+	atomic_set(&imp->imp_unregistering, 0);
+	atomic_set(&imp->imp_inflight, 0);
+	atomic_set(&imp->imp_replay_inflight, 0);
+	atomic_set(&imp->imp_inval_count, 0);
+	INIT_LIST_HEAD(&imp->imp_conn_list);
+	INIT_LIST_HEAD(&imp->imp_handle.h_link);
+	class_handle_hash(&imp->imp_handle, &import_handle_ops);
+	init_imp_at(&imp->imp_at);
+
+	/* the default magic is V2, will be used in connect RPC, and
+	 * then adjusted according to the flags in request/reply. */
+	imp->imp_msg_magic = LUSTRE_MSG_MAGIC_V2;
+
+	return imp;
+}
+EXPORT_SYMBOL(class_new_import);
+
+/* Start tearing down an import: remove its handle from the handle
+ * hash, bump imp_generation under imp_lock and drop one reference.
+ * @import must be neither NULL nor LP_POISON. */
+void class_destroy_import(struct obd_import *import)
+{
+	LASSERT(import != NULL);
+	LASSERT(import != LP_POISON);
+
+	class_handle_unhash(&import->imp_handle);
+
+	spin_lock(&import->imp_lock);
+	import->imp_generation++;
+	spin_unlock(&import->imp_lock);
+	class_import_put(import);
+}
+EXPORT_SYMBOL(class_destroy_import);
+
+#if LUSTRE_TRACKS_LOCK_EXP_REFS
+
+/* Record one more reference from @lock to @exp.  The first reference
+ * links the lock onto exp_locks_list and remembers @exp as the lock's
+ * target; a different, already recorded target only triggers a console
+ * warning.  Runs under exp_locks_list_guard. */
+void __class_export_add_lock_ref(struct obd_export *exp, struct ldlm_lock *lock)
+{
+	spin_lock(&exp->exp_locks_list_guard);
+
+	LASSERT(lock->l_exp_refs_nr >= 0);
+
+	if (lock->l_exp_refs_target != NULL &&
+	    lock->l_exp_refs_target != exp) {
+		LCONSOLE_WARN("setting export %p for lock %p which already has export %p\n",
+			      exp, lock, lock->l_exp_refs_target);
+	}
+
+	/* first reference: start tracking the lock on this export */
+	if (lock->l_exp_refs_nr == 0) {
+		list_add(&lock->l_exp_refs_link, &exp->exp_locks_list);
+		lock->l_exp_refs_target = exp;
+	}
+	lock->l_exp_refs_nr++;
+
+	CDEBUG(D_INFO, "lock = %p, export = %p, refs = %u\n",
+	       lock, exp, lock->l_exp_refs_nr);
+	spin_unlock(&exp->exp_locks_list_guard);
+}
+EXPORT_SYMBOL(__class_export_add_lock_ref);
+
+/* Drop one reference from @lock to @exp.  When the last reference goes
+ * away the lock is unlinked from the export's lock list and its target
+ * is cleared.  A target other than @exp only triggers a console
+ * warning.  Runs under exp_locks_list_guard. */
+void __class_export_del_lock_ref(struct obd_export *exp, struct ldlm_lock *lock)
+{
+	spin_lock(&exp->exp_locks_list_guard);
+
+	LASSERT(lock->l_exp_refs_nr > 0);
+	if (lock->l_exp_refs_target != exp) {
+		LCONSOLE_WARN("lock %p, "
+			      "mismatching export pointers: %p, %p\n",
+			      lock, lock->l_exp_refs_target, exp);
+	}
+
+	lock->l_exp_refs_nr--;
+	if (lock->l_exp_refs_nr == 0) {
+		/* last reference: stop tracking the lock */
+		list_del_init(&lock->l_exp_refs_link);
+		lock->l_exp_refs_target = NULL;
+	}
+
+	CDEBUG(D_INFO, "lock = %p, export = %p, refs = %u\n",
+	       lock, exp, lock->l_exp_refs_nr);
+	spin_unlock(&exp->exp_locks_list_guard);
+}
+EXPORT_SYMBOL(__class_export_del_lock_ref);
+#endif
+
+/* A connection defines an export context in which preallocation can
+   be managed.  A new export is created for cluuid on obd and its handle
+   cookie is stored in conn.  The pointer reference returned by
+   class_new_export() is released again, so the export refcount is 1
+   when this function returns.  Returns 0, or the class_new_export()
+   error code. */
+int class_connect(struct lustre_handle *conn, struct obd_device *obd,
+		  struct obd_uuid *cluuid)
+{
+	struct obd_export *exp;
+	LASSERT(conn != NULL);
+	LASSERT(obd != NULL);
+	LASSERT(cluuid != NULL);
+	ENTRY;
+
+	exp = class_new_export(obd, cluuid);
+	if (IS_ERR(exp))
+		RETURN(PTR_ERR(exp));
+
+	/* publish the handle, then give up the pointer reference */
+	conn->cookie = exp->exp_handle.h_cookie;
+	class_export_put(exp);
+
+	CDEBUG(D_IOCTL, "connect: client %s, cookie "LPX64"\n",
+	       cluuid->uuid, conn->cookie);
+	RETURN(0);
+}
+EXPORT_SYMBOL(class_connect);
+
+/* if export is involved in recovery then clean up related things
+ *
+ * Adjusts the device's recovery counters for an export that is going
+ * away: delayed clients, connected clients and stale clients (under
+ * obd_recovery_task_lock), then the request-replay and lock-replay
+ * client counters.  The export's own flags are cleared under exp_lock. */
+void class_export_recovery_cleanup(struct obd_export *exp)
+{
+	struct obd_device *obd = exp->exp_obd;
+
+	spin_lock(&obd->obd_recovery_task_lock);
+	if (exp->exp_delayed)
+		obd->obd_delayed_clients--;
+	if (obd->obd_recovering) {
+		if (exp->exp_in_recovery) {
+			/* exp_lock nests inside obd_recovery_task_lock here */
+			spin_lock(&exp->exp_lock);
+			exp->exp_in_recovery = 0;
+			spin_unlock(&exp->exp_lock);
+			LASSERT_ATOMIC_POS(&obd->obd_connected_clients);
+			atomic_dec(&obd->obd_connected_clients);
+		}
+
+		/* if called during recovery then should update
+		 * obd_stale_clients counter,
+		 * lightweight exports are not counted */
+		if (exp->exp_failed &&
+		    (exp_connect_flags(exp) & OBD_CONNECT_LIGHTWEIGHT) == 0)
+			exp->exp_obd->obd_stale_clients++;
+	}
+	spin_unlock(&obd->obd_recovery_task_lock);
+	/** Cleanup req replay fields */
+	if (exp->exp_req_replay_needed) {
+		spin_lock(&exp->exp_lock);
+		exp->exp_req_replay_needed = 0;
+		spin_unlock(&exp->exp_lock);
+		/* the counter must still account for this export */
+		LASSERT(atomic_read(&obd->obd_req_replay_clients));
+		atomic_dec(&obd->obd_req_replay_clients);
+	}
+	/** Cleanup lock replay data */
+	if (exp->exp_lock_replay_needed) {
+		spin_lock(&exp->exp_lock);
+		exp->exp_lock_replay_needed = 0;
+		spin_unlock(&exp->exp_lock);
+		/* the counter must still account for this export */
+		LASSERT(atomic_read(&obd->obd_lock_replay_clients));
+		atomic_dec(&obd->obd_lock_replay_clients);
+	}
+}
+
+/* This function removes 1-3 references from the export:
+ * 1 - for the export pointer passed in,
+ * and, if a disconnect is really needed:
+ * 2 - removing from hash
+ * 3 - in class_unlink_export
+ * The export pointer passed to this function can be destroyed.
+ *
+ * Returns -EINVAL for a NULL export and 0 otherwise, including the case
+ * where the export had already been marked disconnected. */
+int class_disconnect(struct obd_export *export)
+{
+	int already_disconnected;
+	ENTRY;
+
+	if (export == NULL) {
+		CWARN("attempting to free NULL export %p\n", export);
+		RETURN(-EINVAL);
+	}
+
+	/* Test-and-set exp_disconnected under exp_lock so that only the
+	 * first caller performs the actual teardown below. */
+	spin_lock(&export->exp_lock);
+	already_disconnected = export->exp_disconnected;
+	export->exp_disconnected = 1;
+	spin_unlock(&export->exp_lock);
+
+	/* class_cleanup(), abort_recovery(), and class_fail_export()
+	 * all end up in here, and if any of them race we shouldn't
+	 * call extra class_export_puts(). */
+	if (already_disconnected) {
+		LASSERT(hlist_unhashed(&export->exp_nid_hash));
+		GOTO(no_disconn, already_disconnected);
+	}
+
+	CDEBUG(D_IOCTL, "disconnect: cookie "LPX64"\n",
+	       export->exp_handle.h_cookie);
+
+	if (!hlist_unhashed(&export->exp_nid_hash))
+		cfs_hash_del(export->exp_obd->obd_nid_hash,
+			     &export->exp_connection->c_peer.nid,
+			     &export->exp_nid_hash);
+
+	class_export_recovery_cleanup(export);
+	class_unlink_export(export);
+no_disconn:
+	/* Drop the caller's reference on both paths. */
+	class_export_put(export);
+	RETURN(0);
+}
+EXPORT_SYMBOL(class_disconnect);
+
+/* Report whether an export is fully connected.
+ * Returns 1 when @exp is non-NULL and its exp_conn_cnt, sampled under
+ * exp_lock, is positive; returns 0 otherwise. */
+int class_connected_export(struct obd_export *exp)
+{
+	int is_connected;
+
+	if (exp == NULL)
+		return 0;
+
+	spin_lock(&exp->exp_lock);
+	is_connected = (exp->exp_conn_cnt > 0);
+	spin_unlock(&exp->exp_lock);
+
+	return is_connected;
+}
+EXPORT_SYMBOL(class_connected_export);
+
+/* Disconnect every export on @list, storing @flags in exp_flags first.
+ *
+ * Each iteration takes the first entry of the list and the loop ends
+ * only when the list is empty, so it relies on every export leaving the
+ * list: explicitly for an export whose client uuid equals the obd uuid,
+ * otherwise presumably as a side effect of obd_disconnect() (confirm
+ * against class_unlink_export()). */
+static void class_disconnect_export_list(struct list_head *list,
+					 enum obd_option flags)
+{
+	int rc;
+	struct obd_export *exp;
+	ENTRY;
+
+	/* It's possible that an export may disconnect itself, but
+	 * nothing else will be added to this list. */
+	while (!list_empty(list)) {
+		exp = list_entry(list->next, struct obd_export,
+				     exp_obd_chain);
+		/* need for safe call CDEBUG after obd_disconnect */
+		class_export_get(exp);
+
+		spin_lock(&exp->exp_lock);
+		exp->exp_flags = flags;
+		spin_unlock(&exp->exp_lock);
+
+		if (obd_uuid_equals(&exp->exp_client_uuid,
+				    &exp->exp_obd->obd_uuid)) {
+			CDEBUG(D_HA,
+			       "exp %p export uuid == obd uuid, don't discon\n",
+			       exp);
+			/* Need to delete this now so we don't end up pointing
+			 * to work_list later when this export is cleaned up. */
+			list_del_init(&exp->exp_obd_chain);
+			class_export_put(exp);
+			continue;
+		}
+
+		/* Second reference: the one obd_disconnect() is expected
+		 * to consume (see "release one export reference" below). */
+		class_export_get(exp);
+		CDEBUG(D_HA, "%s: disconnecting export at %s (%p), "
+		       "last request at "CFS_TIME_T"\n",
+		       exp->exp_obd->obd_name, obd_export_nid2str(exp),
+		       exp, exp->exp_last_request_time);
+		/* release one export reference anyway */
+		rc = obd_disconnect(exp);
+
+		CDEBUG(D_HA, "disconnected export at %s (%p): rc %d\n",
+		       obd_export_nid2str(exp), exp, rc);
+		class_export_put(exp);
+	}
+	EXIT;
+}
+
+/* Disconnect all exports of @obd.
+ *
+ * Both obd_exports and obd_delayed_exports are spliced onto a private
+ * work list while holding obd_dev_lock; the actual disconnects are then
+ * done without that lock, using the flags from exp_flags_from_obd(). */
+void class_disconnect_exports(struct obd_device *obd)
+{
+	struct list_head work_list;
+	ENTRY;
+
+	/* Move all of the exports from obd_exports to a work list, en masse. */
+	INIT_LIST_HEAD(&work_list);
+	spin_lock(&obd->obd_dev_lock);
+	list_splice_init(&obd->obd_exports, &work_list);
+	list_splice_init(&obd->obd_delayed_exports, &work_list);
+	spin_unlock(&obd->obd_dev_lock);
+
+	if (!list_empty(&work_list)) {
+		CDEBUG(D_HA, "OBD device %d (%p) has exports, "
+		       "disconnecting them\n", obd->obd_minor, obd);
+		class_disconnect_export_list(&work_list,
+					     exp_flags_from_obd(obd));
+	} else
+		CDEBUG(D_HA, "OBD device %d (%p) has no exports\n",
+		       obd->obd_minor, obd);
+	EXIT;
+}
+EXPORT_SYMBOL(class_disconnect_exports);
+
+/* Remove exports that have not completed recovery.
+ *
+ * Walks obd_exports under obd_dev_lock and moves onto a private work
+ * list every export that is not the self-export, has a last_rcvd slot
+ * (ted_lr_idx != -1), is not already failed, and for which
+ * @test_export returns zero.  Each selected export is marked
+ * exp_failed.  The collected exports are then disconnected with
+ * OBD_OPT_ABORT_RECOV added to the device's export flags.
+ *
+ * Note that @test_export is invoked with both obd_dev_lock and the
+ * export's exp_lock held.
+ */
+void class_disconnect_stale_exports(struct obd_device *obd,
+				    int (*test_export)(struct obd_export *))
+{
+	struct list_head work_list;
+	struct obd_export *exp, *n;
+	int evicted = 0;
+	ENTRY;
+
+	INIT_LIST_HEAD(&work_list);
+	spin_lock(&obd->obd_dev_lock);
+	list_for_each_entry_safe(exp, n, &obd->obd_exports,
+				     exp_obd_chain) {
+		/* don't count self-export as client */
+		if (obd_uuid_equals(&exp->exp_client_uuid,
+				    &exp->exp_obd->obd_uuid))
+			continue;
+
+		/* don't evict clients which have no slot in last_rcvd
+		 * (e.g. lightweight connection) */
+		if (exp->exp_target_data.ted_lr_idx == -1)
+			continue;
+
+		spin_lock(&exp->exp_lock);
+		if (exp->exp_failed || test_export(exp)) {
+			spin_unlock(&exp->exp_lock);
+			continue;
+		}
+		exp->exp_failed = 1;
+		spin_unlock(&exp->exp_lock);
+
+		list_move(&exp->exp_obd_chain, &work_list);
+		evicted++;
+		CDEBUG(D_HA, "%s: disconnect stale client %s@%s\n",
+		       obd->obd_name, exp->exp_client_uuid.uuid,
+		       exp->exp_connection == NULL ? "<unknown>" :
+		       libcfs_nid2str(exp->exp_connection->c_peer.nid));
+		print_export_data(exp, "EVICTING", 0);
+	}
+	spin_unlock(&obd->obd_dev_lock);
+
+	if (evicted)
+		LCONSOLE_WARN("%s: disconnecting %d stale clients\n",
+			      obd->obd_name, evicted);
+
+	class_disconnect_export_list(&work_list, exp_flags_from_obd(obd) |
+						 OBD_OPT_ABORT_RECOV);
+	EXIT;
+}
+EXPORT_SYMBOL(class_disconnect_stale_exports);
+
+/* Mark @exp as failed and disconnect it.
+ *
+ * exp_failed is test-and-set under exp_lock, so only the first caller
+ * proceeds to obd_disconnect(); later callers just log and return.
+ * If obd_dump_on_timeout is set, the debug log is dumped before the
+ * disconnect. */
+void class_fail_export(struct obd_export *exp)
+{
+	int rc, already_failed;
+
+	spin_lock(&exp->exp_lock);
+	already_failed = exp->exp_failed;
+	exp->exp_failed = 1;
+	spin_unlock(&exp->exp_lock);
+
+	if (already_failed) {
+		CDEBUG(D_HA, "disconnecting dead export %p/%s; skipping\n",
+		       exp, exp->exp_client_uuid.uuid);
+		return;
+	}
+
+	CDEBUG(D_HA, "disconnecting export %p/%s\n",
+	       exp, exp->exp_client_uuid.uuid);
+
+	if (obd_dump_on_timeout)
+		libcfs_debug_dumplog();
+
+	/* need for safe call CDEBUG after obd_disconnect */
+	class_export_get(exp);
+
+	/* Most callers into obd_disconnect are removing their own reference
+	 * (request, for example) in addition to the one from the hash table.
+	 * We don't have such a reference here, so make one. */
+	class_export_get(exp);
+	rc = obd_disconnect(exp);
+	if (rc)
+		CERROR("disconnecting export %p failed: %d\n", exp, rc);
+	else
+		CDEBUG(D_HA, "disconnected export %p/%s\n",
+		       exp, exp->exp_client_uuid.uuid);
+	class_export_put(exp);
+}
+EXPORT_SYMBOL(class_fail_export);
+
+/* Return a printable NID for the export's connection, or the literal
+ * "(no nid)" when the export has no connection attached. */
+char *obd_export_nid2str(struct obd_export *exp)
+{
+	if (exp->exp_connection == NULL)
+		return "(no nid)";
+
+	return libcfs_nid2str(exp->exp_connection->c_peer.nid);
+}
+EXPORT_SYMBOL(obd_export_nid2str);
+
+/* Evict every export of @obd whose connection NID matches @nid.
+ *
+ * @nid is parsed with libcfs_str2nid().  The device's NID hash is
+ * looked up repeatedly; each export found is failed via
+ * class_fail_export() and the loop stops once the lookup returns NULL.
+ * Nothing is done if the device is already stopping.
+ *
+ * Returns the number of exports evicted. */
+int obd_export_evict_by_nid(struct obd_device *obd, const char *nid)
+{
+	cfs_hash_t *nid_hash;
+	struct obd_export *doomed_exp = NULL;
+	int exports_evicted = 0;
+
+	lnet_nid_t nid_key = libcfs_str2nid((char *)nid);
+
+	spin_lock(&obd->obd_dev_lock);
+	/* umount has run already, so evict thread should leave
+	 * its task to umount thread now */
+	if (obd->obd_stopping) {
+		spin_unlock(&obd->obd_dev_lock);
+		return exports_evicted;
+	}
+	/* Take a reference on the hash while still under obd_dev_lock;
+	 * it is dropped with cfs_hash_putref() after the loop. */
+	nid_hash = obd->obd_nid_hash;
+	cfs_hash_getref(nid_hash);
+	spin_unlock(&obd->obd_dev_lock);
+
+	do {
+		doomed_exp = cfs_hash_lookup(nid_hash, &nid_key);
+		if (doomed_exp == NULL)
+			break;
+
+		LASSERTF(doomed_exp->exp_connection->c_peer.nid == nid_key,
+			 "nid %s found, wanted nid %s, requested nid %s\n",
+			 obd_export_nid2str(doomed_exp),
+			 libcfs_nid2str(nid_key), nid);
+		LASSERTF(doomed_exp != obd->obd_self_export,
+			 "self-export is hashed by NID?\n");
+		exports_evicted++;
+		LCONSOLE_WARN("%s: evicting %s (at %s) by administrative "
+			      "request\n", obd->obd_name,
+			      obd_uuid2str(&doomed_exp->exp_client_uuid),
+			      obd_export_nid2str(doomed_exp));
+		class_fail_export(doomed_exp);
+		class_export_put(doomed_exp);
+	} while (1);
+
+	cfs_hash_putref(nid_hash);
+
+	if (!exports_evicted)
+		CDEBUG(D_HA,"%s: can't disconnect NID '%s': no exports found\n",
+		       obd->obd_name, nid);
+	return exports_evicted;
+}
+EXPORT_SYMBOL(obd_export_evict_by_nid);
+
+/* Evict the export of @obd whose client uuid matches @uuid.
+ *
+ * Refuses to evict the device's own uuid and does nothing if the device
+ * is already stopping.  At most one export is looked up in the uuid
+ * hash and failed via class_fail_export().
+ *
+ * Returns the number of exports evicted (0 or 1). */
+int obd_export_evict_by_uuid(struct obd_device *obd, const char *uuid)
+{
+	cfs_hash_t *uuid_hash;
+	struct obd_export *doomed_exp = NULL;
+	struct obd_uuid doomed_uuid;
+	int exports_evicted = 0;
+
+	spin_lock(&obd->obd_dev_lock);
+	if (obd->obd_stopping) {
+		spin_unlock(&obd->obd_dev_lock);
+		return exports_evicted;
+	}
+	/* Hash reference taken under obd_dev_lock; every return path
+	 * below releases it with cfs_hash_putref(). */
+	uuid_hash = obd->obd_uuid_hash;
+	cfs_hash_getref(uuid_hash);
+	spin_unlock(&obd->obd_dev_lock);
+
+	obd_str2uuid(&doomed_uuid, uuid);
+	if (obd_uuid_equals(&doomed_uuid, &obd->obd_uuid)) {
+		CERROR("%s: can't evict myself\n", obd->obd_name);
+		cfs_hash_putref(uuid_hash);
+		return exports_evicted;
+	}
+
+	doomed_exp = cfs_hash_lookup(uuid_hash, &doomed_uuid);
+
+	if (doomed_exp == NULL) {
+		CERROR("%s: can't disconnect %s: no exports found\n",
+		       obd->obd_name, uuid);
+	} else {
+		CWARN("%s: evicting %s at adminstrative request\n",
+		       obd->obd_name, doomed_exp->exp_client_uuid.uuid);
+		class_fail_export(doomed_exp);
+		class_export_put(doomed_exp);
+		exports_evicted++;
+	}
+	cfs_hash_putref(uuid_hash);
+
+	return exports_evicted;
+}
+EXPORT_SYMBOL(obd_export_evict_by_uuid);
+
+#if LUSTRE_TRACKS_LOCK_EXP_REFS
+/* Optional hook invoked by print_export_data() for each export when a
+ * lock dump is requested; NULL means no hook is installed. */
+void (*class_export_dump_hook)(struct obd_export*) = NULL;
+EXPORT_SYMBOL(class_export_dump_hook);
+#endif
+
+/* Log a one-line D_HA summary of @exp, tagged with @status.
+ *
+ * The fields printed are, in order: obd name, status, export pointer,
+ * client uuid, NID, refcount, (rpc count, callback count, locks count),
+ * the disconnected/delayed/failed flags, the number of outstanding
+ * replies, the first outstanding reply pointer, "..." when there are
+ * more than three replies, and exp_last_committed.
+ *
+ * When LUSTRE_TRACKS_LOCK_EXP_REFS is enabled, @locks is non-zero and a
+ * dump hook is installed, the hook is called for the export as well. */
+static void print_export_data(struct obd_export *exp, const char *status,
+			      int locks)
+{
+	struct ptlrpc_reply_state *rs;
+	struct ptlrpc_reply_state *first_reply = NULL;
+	int nreplies = 0;
+
+	/* Count outstanding replies and remember the first one. */
+	spin_lock(&exp->exp_lock);
+	list_for_each_entry(rs, &exp->exp_outstanding_replies,
+				rs_exp_list) {
+		if (nreplies == 0)
+			first_reply = rs;
+		nreplies++;
+	}
+	spin_unlock(&exp->exp_lock);
+
+	CDEBUG(D_HA, "%s: %s %p %s %s %d (%d %d %d) %d %d %d %d: %p %s "LPU64"\n",
+	       exp->exp_obd->obd_name, status, exp, exp->exp_client_uuid.uuid,
+	       obd_export_nid2str(exp), atomic_read(&exp->exp_refcount),
+	       atomic_read(&exp->exp_rpc_count),
+	       atomic_read(&exp->exp_cb_count),
+	       atomic_read(&exp->exp_locks_count),
+	       exp->exp_disconnected, exp->exp_delayed, exp->exp_failed,
+	       nreplies, first_reply, nreplies > 3 ? "..." : "",
+	       exp->exp_last_committed);
+#if LUSTRE_TRACKS_LOCK_EXP_REFS
+	if (locks && class_export_dump_hook != NULL)
+		class_export_dump_hook(exp);
+#endif
+}
+
+/* Log every export of @obd through print_export_data().
+ *
+ * The active, unlinked and delayed lists are walked under obd_dev_lock
+ * and tagged "ACTIVE", "UNLINKED" and "DELAYED".  The global zombie
+ * export list is then walked under obd_zombie_impexp_lock and tagged
+ * "ZOMBIE"; that list is not filtered by @obd.  @locks is passed through
+ * to print_export_data(). */
+void dump_exports(struct obd_device *obd, int locks)
+{
+	struct obd_export *exp;
+
+	spin_lock(&obd->obd_dev_lock);
+	list_for_each_entry(exp, &obd->obd_exports, exp_obd_chain)
+		print_export_data(exp, "ACTIVE", locks);
+	list_for_each_entry(exp, &obd->obd_unlinked_exports, exp_obd_chain)
+		print_export_data(exp, "UNLINKED", locks);
+	list_for_each_entry(exp, &obd->obd_delayed_exports, exp_obd_chain)
+		print_export_data(exp, "DELAYED", locks);
+	spin_unlock(&obd->obd_dev_lock);
+	spin_lock(&obd_zombie_impexp_lock);
+	list_for_each_entry(exp, &obd_zombie_exports, exp_obd_chain)
+		print_export_data(exp, "ZOMBIE", locks);
+	spin_unlock(&obd_zombie_impexp_lock);
+}
+EXPORT_SYMBOL(dump_exports);
+
+/* Block until @obd has no unlinked exports left.
+ *
+ * Asserts that obd_exports is already empty, then polls
+ * obd_unlinked_exports, sleeping uninterruptibly between checks with
+ * obd_dev_lock released.  The sleep starts at 2 seconds and doubles
+ * after each round.  Once the sleep length exceeds 5 seconds a console
+ * warning is printed and the exports are dumped after each sleep. */
+void obd_exports_barrier(struct obd_device *obd)
+{
+	int waited = 2;
+	LASSERT(list_empty(&obd->obd_exports));
+	spin_lock(&obd->obd_dev_lock);
+	while (!list_empty(&obd->obd_unlinked_exports)) {
+		spin_unlock(&obd->obd_dev_lock);
+		schedule_timeout_and_set_state(TASK_UNINTERRUPTIBLE,
+						   cfs_time_seconds(waited));
+		if (waited > 5 && IS_PO2(waited)) {
+			LCONSOLE_WARN("%s is waiting for obd_unlinked_exports "
+				      "more than %d seconds. "
+				      "The obd refcount = %d. Is it stuck?\n",
+				      obd->obd_name, waited,
+				      atomic_read(&obd->obd_refcount));
+			dump_exports(obd, 1);
+		}
+		waited *= 2;
+		spin_lock(&obd->obd_dev_lock);
+	}
+	spin_unlock(&obd->obd_dev_lock);
+}
+EXPORT_SYMBOL(obd_exports_barrier);
+
+/* Total number of zombie imports and exports still to be destroyed.
+ * Every update in this file is made under obd_zombie_impexp_lock. */
+static int zombies_count = 0;
+
+/**
+ * kill zombie imports and exports
+ *
+ * Each pass removes at most one import and one export from the zombie
+ * lists under obd_zombie_impexp_lock, destroys them with the lock
+ * dropped, and decrements zombies_count for each.  The loop repeats,
+ * yielding with cond_resched(), until a pass finds both lists empty.
+ */
+void obd_zombie_impexp_cull(void)
+{
+	struct obd_import *import;
+	struct obd_export *export;
+	ENTRY;
+
+	do {
+		spin_lock(&obd_zombie_impexp_lock);
+
+		import = NULL;
+		if (!list_empty(&obd_zombie_imports)) {
+			import = list_entry(obd_zombie_imports.next,
+						struct obd_import,
+						imp_zombie_chain);
+			list_del_init(&import->imp_zombie_chain);
+		}
+
+		export = NULL;
+		if (!list_empty(&obd_zombie_exports)) {
+			export = list_entry(obd_zombie_exports.next,
+						struct obd_export,
+						exp_obd_chain);
+			list_del_init(&export->exp_obd_chain);
+		}
+
+		spin_unlock(&obd_zombie_impexp_lock);
+
+		if (import != NULL) {
+			class_import_destroy(import);
+			spin_lock(&obd_zombie_impexp_lock);
+			zombies_count--;
+			spin_unlock(&obd_zombie_impexp_lock);
+		}
+
+		if (export != NULL) {
+			class_export_destroy(export);
+			spin_lock(&obd_zombie_impexp_lock);
+			zombies_count--;
+			spin_unlock(&obd_zombie_impexp_lock);
+		}
+
+		cond_resched();
+	} while (import != NULL || export != NULL);
+	EXIT;
+}
+
+/* State of the zombie import/export destroy thread:
+ * start/stop completions signalled by the thread itself, a flags word,
+ * the wait queue shared by the thread and obd_zombie_barrier(), and the
+ * thread's pid (0 until the thread records it). */
+static struct completion	obd_zombie_start;
+static struct completion	obd_zombie_stop;
+static unsigned long		obd_zombie_flags;
+static wait_queue_head_t		obd_zombie_waitq;
+static pid_t			obd_zombie_pid;
+
+/* Used as the bit number passed to set_bit()/test_bit() on
+ * obd_zombie_flags, not as a mask. */
+enum {
+	OBD_ZOMBIE_STOP		= 0x0001,
+};
+
+/**
+ * check for work for kill zombie import/export thread.
+ *
+ * Note the inverted sense: returns non-zero when there is NOTHING to do
+ * (no zombies queued and no stop requested), and zero when the thread
+ * should wake up.  The caller negates the result in its wait condition.
+ * \a arg is unused.
+ */
+static int obd_zombie_impexp_check(void *arg)
+{
+	int rc;
+
+	spin_lock(&obd_zombie_impexp_lock);
+	rc = (zombies_count == 0) &&
+	     !test_bit(OBD_ZOMBIE_STOP, &obd_zombie_flags);
+	spin_unlock(&obd_zombie_impexp_lock);
+
+	RETURN(rc);
+}
+
+/**
+ * Add export to the obd_zombie thread and notify it.
+ *
+ * The export is first unlinked from its current device list under
+ * obd_dev_lock, then queued on obd_zombie_exports (bumping
+ * zombies_count) under obd_zombie_impexp_lock.  The two locks are never
+ * held at the same time here.
+ */
+static void obd_zombie_export_add(struct obd_export *exp) {
+	spin_lock(&exp->exp_obd->obd_dev_lock);
+	LASSERT(!list_empty(&exp->exp_obd_chain));
+	list_del_init(&exp->exp_obd_chain);
+	spin_unlock(&exp->exp_obd->obd_dev_lock);
+	spin_lock(&obd_zombie_impexp_lock);
+	zombies_count++;
+	list_add(&exp->exp_obd_chain, &obd_zombie_exports);
+	spin_unlock(&obd_zombie_impexp_lock);
+
+	obd_zombie_impexp_notify();
+}
+
+/**
+ * Add import to the obd_zombie thread and notify it.
+ *
+ * Asserts that the import has no security context (imp_sec) and no
+ * request pool (imp_rq_pool) left and is not already queued, then adds
+ * it to obd_zombie_imports and bumps zombies_count under
+ * obd_zombie_impexp_lock.
+ */
+static void obd_zombie_import_add(struct obd_import *imp) {
+	LASSERT(imp->imp_sec == NULL);
+	LASSERT(imp->imp_rq_pool == NULL);
+	spin_lock(&obd_zombie_impexp_lock);
+	LASSERT(list_empty(&imp->imp_zombie_chain));
+	zombies_count++;
+	list_add(&imp->imp_zombie_chain, &obd_zombie_imports);
+	spin_unlock(&obd_zombie_impexp_lock);
+
+	obd_zombie_impexp_notify();
+}
+
+/**
+ * notify import/export destroy thread about new zombie.
+ */
+static void obd_zombie_impexp_notify(void)
+{
+	/*
+	 * Make sure obd_zombie_impexp_thread gets this notification.
+	 * The wait queue is shared with obd_zombie_barrier(); if only one
+	 * waiter were woken, the barrier could swallow the notification,
+	 * go back to sleep, and a hang would ensue.  Hence wake_up_all().
+	 */
+	wake_up_all(&obd_zombie_waitq);
+}
+
+/**
+ * Tell whether the zombie queues are drained.
+ *
+ * Asserts that no stop has been requested, then samples zombies_count
+ * under obd_zombie_impexp_lock.  Returns 1 when it is zero, 0 otherwise.
+ */
+static int obd_zombie_is_idle(void)
+{
+	int idle;
+
+	LASSERT(!test_bit(OBD_ZOMBIE_STOP, &obd_zombie_flags));
+
+	spin_lock(&obd_zombie_impexp_lock);
+	idle = (zombies_count == 0);
+	spin_unlock(&obd_zombie_impexp_lock);
+
+	return idle;
+}
+
+/**
+ * wait until the obd_zombie import/export queues become empty
+ *
+ * Returns immediately when called from the zombie thread itself (the
+ * pid recorded in obd_zombie_pid), since that thread would otherwise
+ * wait on work only it can complete.
+ */
+void obd_zombie_barrier(void)
+{
+	struct l_wait_info lwi = { 0 };
+
+	if (obd_zombie_pid == current_pid())
+		/* don't wait for myself */
+		return;
+	l_wait_event(obd_zombie_waitq, obd_zombie_is_idle(), &lwi);
+}
+EXPORT_SYMBOL(obd_zombie_barrier);
+
+
+/**
+ * destroy zombie export/import thread.
+ *
+ * Signals obd_zombie_start, records its own pid in obd_zombie_pid, then
+ * loops until OBD_ZOMBIE_STOP is set: sleep on obd_zombie_waitq until
+ * there is work or a stop request, cull the zombie lists, and wake any
+ * barrier waiters.  Signals obd_zombie_stop on exit.  \a unused is
+ * ignored.
+ */
+static int obd_zombie_impexp_thread(void *unused)
+{
+	unshare_fs_struct();
+	complete(&obd_zombie_start);
+
+	obd_zombie_pid = current_pid();
+
+	while (!test_bit(OBD_ZOMBIE_STOP, &obd_zombie_flags)) {
+		struct l_wait_info lwi = { 0 };
+
+		/* obd_zombie_impexp_check() returns non-zero when idle,
+		 * hence the negation in the wake-up condition. */
+		l_wait_event(obd_zombie_waitq,
+			     !obd_zombie_impexp_check(NULL), &lwi);
+		obd_zombie_impexp_cull();
+
+		/*
+		 * Notify obd_zombie_barrier callers that queues
+		 * may be empty.
+		 */
+		wake_up(&obd_zombie_waitq);
+	}
+
+	complete(&obd_zombie_stop);
+
+	RETURN(0);
+}
+
+
+/**
+ * start destroy zombie import/export thread
+ *
+ * Initializes the zombie lists, lock, completions and wait queue, then
+ * starts the "obd_zombid" kthread and waits for it to signal
+ * obd_zombie_start.
+ *
+ * \retval 0 on success
+ * \retval negative errno from kthread_run() if the thread cannot start
+ */
+int obd_zombie_impexp_init(void)
+{
+	task_t *task;
+
+	INIT_LIST_HEAD(&obd_zombie_imports);
+	INIT_LIST_HEAD(&obd_zombie_exports);
+	spin_lock_init(&obd_zombie_impexp_lock);
+	init_completion(&obd_zombie_start);
+	init_completion(&obd_zombie_stop);
+	init_waitqueue_head(&obd_zombie_waitq);
+	obd_zombie_pid = 0;
+
+	task = kthread_run(obd_zombie_impexp_thread, NULL, "obd_zombid");
+	if (IS_ERR(task))
+		RETURN(PTR_ERR(task));
+
+	wait_for_completion(&obd_zombie_start);
+	RETURN(0);
+}
+/**
+ * stop destroy zombie import/export thread
+ *
+ * Sets OBD_ZOMBIE_STOP, wakes the thread, and blocks until the thread
+ * signals obd_zombie_stop.
+ */
+void obd_zombie_impexp_stop(void)
+{
+	set_bit(OBD_ZOMBIE_STOP, &obd_zombie_flags);
+	obd_zombie_impexp_notify();
+	wait_for_completion(&obd_zombie_stop);
+}
+
+/***** Kernel-userspace comm helpers *******/
+
+/* Compute the total size of a kuc message: the kuc_hdr header followed
+ * by payload_len bytes of payload. */
+int kuc_len(int payload_len)
+{
+	size_t total = sizeof(struct kuc_hdr) + payload_len;
+
+	return total;
+}
+EXPORT_SYMBOL(kuc_len);
+
+/* Map a payload pointer back to the kuc header that precedes it.
+ * The header's magic is asserted to be KUC_MAGIC.
+ * @param p Pointer to payload area
+ * @returns Pointer to kuc header
+ */
+struct kuc_hdr *kuc_ptr(void *p)
+{
+	struct kuc_hdr *lh = p;
+
+	/* the header sits immediately before the payload */
+	lh--;
+	LASSERT(lh->kuc_magic == KUC_MAGIC);
+
+	return lh;
+}
+EXPORT_SYMBOL(kuc_ptr);
+
+/* Check whether the memory just before a payload pointer carries a kuc
+ * header, i.e. whether its magic field equals KUC_MAGIC.
+ * @param p Pointer to payload area
+ * @returns 1 if the magic matches, 0 otherwise
+ */
+int kuc_ispayload(void *p)
+{
+	struct kuc_hdr *hdr = (struct kuc_hdr *)p;
+
+	hdr--;
+	return hdr->kuc_magic == KUC_MAGIC;
+}
+EXPORT_SYMBOL(kuc_ispayload);
+
+/* Alloc space for a message, and fill in header
+ *
+ * The allocation covers the kuc header plus @payload_len bytes.  The
+ * header's magic, transport, message type and total length (header
+ * included) are filled in.
+ *
+ * @return Pointer to payload area (just past the header), or
+ *	   ERR_PTR(-ENOMEM) if the allocation fails.  Callers must test
+ *	   with IS_ERR(), not against NULL.
+ */
+void *kuc_alloc(int payload_len, int transport, int type)
+{
+	struct kuc_hdr *lh;
+	int len = kuc_len(payload_len);
+
+	OBD_ALLOC(lh, len);
+	if (lh == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	lh->kuc_magic = KUC_MAGIC;
+	lh->kuc_transport = transport;
+	lh->kuc_msgtype = type;
+	lh->kuc_msglen = len;
+
+	return (void *)(lh + 1);
+}
+EXPORT_SYMBOL(kuc_alloc);
+
+/* Free a kuc message.  Takes pointer to payload area; the header is
+ * located with kuc_ptr(), which asserts on its magic.  The size freed is
+ * kuc_len(payload_len), so payload_len presumably has to match the value
+ * given to kuc_alloc() for this message (confirm against OBD_FREE). */
+inline void kuc_free(void *p, int payload_len)
+{
+	struct kuc_hdr *lh = kuc_ptr(p);
+	OBD_FREE(lh, kuc_len(payload_len));
+}
+EXPORT_SYMBOL(kuc_free);

diff --git a/drivers/staging/lustre/lustre/obdclass/idmap.c b/drivers/staging/lustre/lustre/obdclass/idmap.c
new file mode 100644
index 0000000..622f8d1
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/idmap.c

@@ -0,0 +1,474 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/idmap.c
+ *
+ * Lustre user identity mapping.
+ *
+ * Author: Fan Yong <fanyong@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+
+#include <lustre_idmap.h>
+#include <md_object.h>
+#include <obd_support.h>
+
+/* Take a reference on a group_info by incrementing its usage count. */
+#define lustre_get_group_info(group_info) do {	     \
+	atomic_inc(&(group_info)->usage);	      \
+} while (0)
+
+/* Drop a reference on a group_info; it is released with groups_free()
+ * when the usage count reaches zero. */
+#define lustre_put_group_info(group_info) do {	     \
+	if (atomic_dec_and_test(&(group_info)->usage)) \
+		groups_free(group_info);	       \
+} while (0)
+
+/*
+ * groups_search() is copied from linux kernel!
+ * A simple bsearch.
+ *
+ * Look for @grp in @group_info, which must already be sorted in the
+ * order produced by lustre_groups_sort() (plain gid_t comparison).
+ *
+ * Returns 1 if @grp is present, 0 if it is absent or @group_info is
+ * NULL.
+ *
+ * The gids are compared directly rather than through a signed
+ * difference: "int cmp = grp - gid" picks the wrong half whenever the
+ * two values are 2^31 or more apart, and that ordering disagrees with
+ * the one used when sorting.
+ */
+static int lustre_groups_search(group_info_t *group_info,
+				gid_t grp)
+{
+	int left, right;
+
+	if (!group_info)
+		return 0;
+
+	left = 0;
+	right = group_info->ngroups;
+	while (left < right) {
+		int mid = (left + right) / 2;
+		gid_t cur = CFS_GROUP_AT(group_info, mid);
+
+		if (grp > cur)
+			left = mid + 1;
+		else if (grp < cur)
+			right = mid;
+		else
+			return 1;
+	}
+	return 0;
+}
+
+/* Copy ginfo->ngroups gids from the flat array glist into the blocks of
+ * ginfo, CFS_NGROUPS_PER_BLOCK entries per block, stopping after
+ * ginfo->nblocks blocks or once all gids have been copied. */
+void lustre_groups_from_list(group_info_t *ginfo, gid_t *glist)
+{
+	int remaining = ginfo->ngroups;
+	int blk = 0;
+
+	while (blk < ginfo->nblocks && remaining > 0) {
+		int chunk = min(CFS_NGROUPS_PER_BLOCK, remaining);
+		int offset = blk * CFS_NGROUPS_PER_BLOCK;
+		int nbytes = chunk * sizeof(*glist);
+
+		memcpy(ginfo->blocks[blk], glist + offset, nbytes);
+		remaining -= chunk;
+		blk++;
+	}
+}
+EXPORT_SYMBOL(lustre_groups_from_list);
+
+/* groups_sort() is copied from linux kernel! */
+/* a simple shell-metzner sort */
+/* Sorts the gids of group_info in place into ascending order, using
+ * plain gid_t comparison.  The stride sequence is 1, 4, 13, ...
+ * (stride = 3 * stride + 1), starting from the largest value below
+ * ngroups and dividing by 3 after each pass. */
+void lustre_groups_sort(group_info_t *group_info)
+{
+	int base, max, stride;
+	int gidsetsize = group_info->ngroups;
+
+	for (stride = 1; stride < gidsetsize; stride = 3 * stride + 1)
+		; /* nothing */
+	stride /= 3;
+
+	while (stride) {
+		max = gidsetsize - stride;
+		for (base = 0; base < max; base++) {
+			int left = base;
+			int right = left + stride;
+			gid_t tmp = CFS_GROUP_AT(group_info, right);
+
+			/* insertion step: shift larger elements one
+			 * stride to the right until tmp fits */
+			while (left >= 0 &&
+			       CFS_GROUP_AT(group_info, left) > tmp) {
+				CFS_GROUP_AT(group_info, right) =
+				    CFS_GROUP_AT(group_info, left);
+				right = left;
+				left -= stride;
+			}
+			CFS_GROUP_AT(group_info, right) = tmp;
+		}
+		stride /= 3;
+	}
+}
+EXPORT_SYMBOL(lustre_groups_sort);
+
+/* Check whether gid grp belongs to the credential mu.
+ *
+ * Returns 1 if grp is the credential's fsgid, or one of its two
+ * uc_suppgids (only consulted when uc_ginfo is set, uc_identity is
+ * missing, or the credential is UCRED_OLD), or is found in the group
+ * list taken from uc_ginfo or else from uc_identity->mi_ginfo.
+ * Returns 0 otherwise, including when no group list is available. */
+int lustre_in_group_p(struct lu_ucred *mu, gid_t grp)
+{
+	group_info_t *ginfo;
+	int found;
+
+	if (grp == mu->uc_fsgid)
+		return 1;
+
+	if ((mu->uc_ginfo || !mu->uc_identity || mu->uc_valid == UCRED_OLD) &&
+	    (grp == mu->uc_suppgids[0] || grp == mu->uc_suppgids[1]))
+		return 1;
+
+	/* prefer the credential's own group list over the identity's */
+	if (mu->uc_ginfo)
+		ginfo = mu->uc_ginfo;
+	else if (mu->uc_identity)
+		ginfo = mu->uc_identity->mi_ginfo;
+	else
+		ginfo = NULL;
+
+	if (!ginfo)
+		return 0;
+
+	lustre_get_group_info(ginfo);
+	found = lustre_groups_search(ginfo, grp);
+	lustre_put_group_info(ginfo);
+
+	return found;
+}
+EXPORT_SYMBOL(lustre_in_group_p);
+
+/* One remote<->local identity mapping: a (remote uid, local uid,
+ * remote gid, local gid) quadruple.  Each entry carries four list links
+ * so it can sit in four hash tables at once, one keyed by each id. */
+struct lustre_idmap_entry {
+	struct list_head       lie_rmt_uid_hash; /* hashed as lie_rmt_uid; */
+	struct list_head       lie_lcl_uid_hash; /* hashed as lie_lcl_uid; */
+	struct list_head       lie_rmt_gid_hash; /* hashed as lie_rmt_gid; */
+	struct list_head       lie_lcl_gid_hash; /* hashed as lie_lcl_gid; */
+	uid_t	    lie_rmt_uid;      /* remote uid */
+	uid_t	    lie_lcl_uid;      /* local uid */
+	gid_t	    lie_rmt_gid;      /* remote gid */
+	gid_t	    lie_lcl_gid;      /* local gid */
+};
+
+/* Map an id to its hash bucket index by masking it with
+ * CFS_IDMAP_HASHSIZE - 1. */
+static inline __u32 lustre_idmap_hashfunc(__u32 id)
+{
+	const __u32 mask = CFS_IDMAP_HASHSIZE - 1;
+
+	return id & mask;
+}
+
+/* Allocate an idmap entry holding the given quadruple.  The entry is
+ * returned with all four list links initialized and not yet linked
+ * into any hash.  Returns NULL if the allocation fails. */
+static
+struct lustre_idmap_entry *idmap_entry_alloc(uid_t rmt_uid, uid_t lcl_uid,
+					     gid_t rmt_gid, gid_t lcl_gid)
+{
+	struct lustre_idmap_entry *e;
+
+	OBD_ALLOC_PTR(e);
+	if (!e)
+		return NULL;
+
+	/* record the mapping */
+	e->lie_rmt_uid = rmt_uid;
+	e->lie_lcl_uid = lcl_uid;
+	e->lie_rmt_gid = rmt_gid;
+	e->lie_lcl_gid = lcl_gid;
+
+	/* start out unhashed */
+	INIT_LIST_HEAD(&e->lie_rmt_uid_hash);
+	INIT_LIST_HEAD(&e->lie_lcl_uid_hash);
+	INIT_LIST_HEAD(&e->lie_rmt_gid_hash);
+	INIT_LIST_HEAD(&e->lie_lcl_gid_hash);
+
+	return e;
+}
+
+/* Unlink an entry from every hash list it is still on, then free it.
+ * No locking is done here; callers in this file hold lit_lock when the
+ * entry may be hashed. */
+static void idmap_entry_free(struct lustre_idmap_entry *e)
+{
+	if (!list_empty(&e->lie_rmt_uid_hash))
+		list_del(&e->lie_rmt_uid_hash);
+	if (!list_empty(&e->lie_lcl_uid_hash))
+		list_del(&e->lie_lcl_uid_hash);
+	if (!list_empty(&e->lie_rmt_gid_hash))
+		list_del(&e->lie_rmt_gid_hash);
+	if (!list_empty(&e->lie_lcl_gid_hash))
+		list_del(&e->lie_lcl_gid_hash);
+	OBD_FREE_PTR(e);
+}
+
+/*
+ * Search the table for an entry matching the full
+ * (rmt_uid, lcl_uid, rmt_gid, lcl_gid) quadruple.
+ *
+ * The remote-uid bucket is scanned first, then the remote-gid bucket.
+ * A remote id that is already mapped to a different local id is
+ * rejected as a 1:N mapping.  No locking is done here.
+ *
+ * return value
+ * NULL: not found entry
+ * ERR_PTR(-EACCES): found 1(remote):N(local) mapped entry
+ * others: found normal entry
+ */
+static
+struct lustre_idmap_entry *idmap_search_entry(struct lustre_idmap_table *t,
+					      uid_t rmt_uid, uid_t lcl_uid,
+					      gid_t rmt_gid, gid_t lcl_gid)
+{
+	struct list_head *head;
+	struct lustre_idmap_entry *e;
+
+	head = &t->lit_idmaps[RMT_UIDMAP_IDX][lustre_idmap_hashfunc(rmt_uid)];
+	list_for_each_entry(e, head, lie_rmt_uid_hash)
+		if (e->lie_rmt_uid == rmt_uid) {
+			if (e->lie_lcl_uid == lcl_uid) {
+				if (e->lie_rmt_gid == rmt_gid &&
+				    e->lie_lcl_gid == lcl_gid)
+					/* must be quaternion match */
+					return e;
+			} else {
+				/* 1:N uid mapping */
+				CERROR("rmt uid %u already be mapped to %u"
+				       " (new %u)\n", e->lie_rmt_uid,
+				       e->lie_lcl_uid, lcl_uid);
+				return ERR_PTR(-EACCES);
+			}
+		}
+
+	head = &t->lit_idmaps[RMT_GIDMAP_IDX][lustre_idmap_hashfunc(rmt_gid)];
+	list_for_each_entry(e, head, lie_rmt_gid_hash)
+		if (e->lie_rmt_gid == rmt_gid) {
+			if (e->lie_lcl_gid == lcl_gid) {
+				if (unlikely(e->lie_rmt_uid == rmt_uid &&
+				    e->lie_lcl_uid == lcl_uid))
+					/* after uid mapping search above,
+					 * we should never come here */
+					LBUG();
+			} else {
+				/* 1:N gid mapping */
+				CERROR("rmt gid %u already be mapped to %u"
+				       " (new %u)\n", e->lie_rmt_gid,
+				       e->lie_lcl_gid, lcl_gid);
+				return ERR_PTR(-EACCES);
+			}
+		}
+
+	return NULL;
+}
+
+/* Translate a uid using one uid hash table.
+ * With reverse == 0, hash is keyed by remote uid and the local uid is
+ * returned; otherwise hash is keyed by local uid and the remote uid is
+ * returned.  Returns CFS_IDMAP_NOTFOUND when no entry matches. */
+static __u32 idmap_lookup_uid(struct list_head *hash, int reverse,
+			      __u32 uid)
+{
+	struct list_head *bucket = &hash[lustre_idmap_hashfunc(uid)];
+	struct lustre_idmap_entry *ent;
+
+	if (reverse) {
+		list_for_each_entry(ent, bucket, lie_lcl_uid_hash)
+			if (ent->lie_lcl_uid == uid)
+				return ent->lie_rmt_uid;
+
+		return CFS_IDMAP_NOTFOUND;
+	}
+
+	list_for_each_entry(ent, bucket, lie_rmt_uid_hash)
+		if (ent->lie_rmt_uid == uid)
+			return ent->lie_lcl_uid;
+
+	return CFS_IDMAP_NOTFOUND;
+}
+
+/* Translate a gid using one gid hash table.
+ * With reverse == 0, hash is keyed by remote gid and the local gid is
+ * returned; otherwise hash is keyed by local gid and the remote gid is
+ * returned.  Returns CFS_IDMAP_NOTFOUND when no entry matches. */
+static __u32 idmap_lookup_gid(struct list_head *hash, int reverse, __u32 gid)
+{
+	struct list_head *bucket = &hash[lustre_idmap_hashfunc(gid)];
+	struct lustre_idmap_entry *ent;
+
+	if (reverse) {
+		list_for_each_entry(ent, bucket, lie_lcl_gid_hash)
+			if (ent->lie_lcl_gid == gid)
+				return ent->lie_rmt_gid;
+
+		return CFS_IDMAP_NOTFOUND;
+	}
+
+	list_for_each_entry(ent, bucket, lie_rmt_gid_hash)
+		if (ent->lie_rmt_gid == gid)
+			return ent->lie_lcl_gid;
+
+	return CFS_IDMAP_NOTFOUND;
+}
+
+/* Add the mapping (ruid -> luid, rgid -> lgid) to table t.
+ *
+ * The table is searched once under lit_lock; if the mapping is absent
+ * a new entry is allocated with the lock dropped, and the search is
+ * repeated under the lock before inserting, in case another thread
+ * added a matching or conflicting entry in between.
+ *
+ * Returns 0 on success or if the exact mapping already exists, -ENOMEM
+ * if the entry cannot be allocated, or the error from
+ * idmap_search_entry() (-EACCES for a 1:N conflict). */
+int lustre_idmap_add(struct lustre_idmap_table *t,
+		     uid_t ruid, uid_t luid,
+		     gid_t rgid, gid_t lgid)
+{
+	struct lustre_idmap_entry *e0, *e1;
+
+	LASSERT(t);
+
+	spin_lock(&t->lit_lock);
+	e0 = idmap_search_entry(t, ruid, luid, rgid, lgid);
+	spin_unlock(&t->lit_lock);
+	if (!e0) {
+		e0 = idmap_entry_alloc(ruid, luid, rgid, lgid);
+		if (!e0)
+			return -ENOMEM;
+
+		spin_lock(&t->lit_lock);
+		e1 = idmap_search_entry(t, ruid, luid, rgid, lgid);
+		if (e1 == NULL) {
+			list_add_tail(&e0->lie_rmt_uid_hash,
+					  &t->lit_idmaps[RMT_UIDMAP_IDX]
+					  [lustre_idmap_hashfunc(ruid)]);
+			list_add_tail(&e0->lie_lcl_uid_hash,
+					  &t->lit_idmaps[LCL_UIDMAP_IDX]
+					  [lustre_idmap_hashfunc(luid)]);
+			list_add_tail(&e0->lie_rmt_gid_hash,
+					  &t->lit_idmaps[RMT_GIDMAP_IDX]
+					  [lustre_idmap_hashfunc(rgid)]);
+			list_add_tail(&e0->lie_lcl_gid_hash,
+					  &t->lit_idmaps[LCL_GIDMAP_IDX]
+					  [lustre_idmap_hashfunc(lgid)]);
+		}
+		spin_unlock(&t->lit_lock);
+		/* Lost the race (or hit a conflict): e0 was never linked,
+		 * so it can be freed outside the lock. */
+		if (e1 != NULL) {
+			idmap_entry_free(e0);
+			if (IS_ERR(e1))
+				return PTR_ERR(e1);
+		}
+	} else if (IS_ERR(e0)) {
+		return PTR_ERR(e0);
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(lustre_idmap_add);
+
+/* Remove the exact mapping (ruid, luid, rgid, lgid) from table t.
+ * The search and the removal both happen under lit_lock.
+ * Returns 0 whether or not a matching entry existed, or the error
+ * reported by idmap_search_entry() for a conflicting mapping. */
+int lustre_idmap_del(struct lustre_idmap_table *t,
+		    uid_t ruid, uid_t luid,
+		    gid_t rgid, gid_t lgid)
+{
+	struct lustre_idmap_entry *found;
+	int err = 0;
+
+	LASSERT(t);
+
+	spin_lock(&t->lit_lock);
+	found = idmap_search_entry(t, ruid, luid, rgid, lgid);
+	if (!IS_ERR(found)) {
+		if (found != NULL)
+			idmap_entry_free(found);
+	} else {
+		err = PTR_ERR(found);
+	}
+	spin_unlock(&t->lit_lock);
+
+	return err;
+}
+EXPORT_SYMBOL(lustre_idmap_del);
+
+/* Translate a uid between its remote and local value.
+ *
+ * If mu is a UCRED_OLD or UCRED_NEW credential, its own original and
+ * mapped uid/fsuid pairs are tried first.  Otherwise, or if they do not
+ * match, table t is consulted under lit_lock.  reverse == 0 maps
+ * remote to local, non-zero maps local to remote.
+ *
+ * Returns the translated uid, or CFS_IDMAP_NOTFOUND when t is NULL or
+ * holds no matching entry. */
+int lustre_idmap_lookup_uid(struct lu_ucred *mu,
+			    struct lustre_idmap_table *t,
+			    int reverse, uid_t uid)
+{
+	struct list_head *hash;
+	uid_t mapped;
+
+	if (mu && (mu->uc_valid == UCRED_OLD || mu->uc_valid == UCRED_NEW)) {
+		if (reverse) {
+			if (uid == mu->uc_uid)
+				return mu->uc_o_uid;
+			if (uid == mu->uc_fsuid)
+				return mu->uc_o_fsuid;
+		} else {
+			if (uid == mu->uc_o_uid)
+				return mu->uc_uid;
+			if (uid == mu->uc_o_fsuid)
+				return mu->uc_fsuid;
+		}
+	}
+
+	if (t == NULL)
+		return CFS_IDMAP_NOTFOUND;
+
+	if (reverse)
+		hash = t->lit_idmaps[LCL_UIDMAP_IDX];
+	else
+		hash = t->lit_idmaps[RMT_UIDMAP_IDX];
+
+	spin_lock(&t->lit_lock);
+	mapped = idmap_lookup_uid(hash, reverse, uid);
+	spin_unlock(&t->lit_lock);
+
+	return mapped;
+}
+EXPORT_SYMBOL(lustre_idmap_lookup_uid);
+
+/* Translate a gid between its remote and local value.
+ *
+ * If mu is a UCRED_OLD or UCRED_NEW credential, its own original and
+ * mapped gid/fsgid pairs are tried first.  Otherwise, or if they do not
+ * match, table t is consulted under lit_lock.  reverse == 0 maps
+ * remote to local, non-zero maps local to remote.
+ *
+ * Returns the translated gid, or CFS_IDMAP_NOTFOUND when t is NULL or
+ * holds no matching entry. */
+int lustre_idmap_lookup_gid(struct lu_ucred *mu, struct lustre_idmap_table *t,
+			    int reverse, gid_t gid)
+{
+	struct list_head *hash;
+	gid_t mapped;
+
+	if (mu && (mu->uc_valid == UCRED_OLD || mu->uc_valid == UCRED_NEW)) {
+		if (reverse) {
+			if (gid == mu->uc_gid)
+				return mu->uc_o_gid;
+			if (gid == mu->uc_fsgid)
+				return mu->uc_o_fsgid;
+		} else {
+			if (gid == mu->uc_o_gid)
+				return mu->uc_gid;
+			if (gid == mu->uc_o_fsgid)
+				return mu->uc_fsgid;
+		}
+	}
+
+	if (t == NULL)
+		return CFS_IDMAP_NOTFOUND;
+
+	if (reverse)
+		hash = t->lit_idmaps[LCL_GIDMAP_IDX];
+	else
+		hash = t->lit_idmaps[RMT_GIDMAP_IDX];
+
+	spin_lock(&t->lit_lock);
+	mapped = idmap_lookup_gid(hash, reverse, gid);
+	spin_unlock(&t->lit_lock);
+
+	return mapped;
+}
+EXPORT_SYMBOL(lustre_idmap_lookup_gid);
+
+/* Allocate an empty idmap table: the lock is initialized and every
+ * bucket of every hash is set up as an empty list.
+ * Returns the table, or ERR_PTR(-ENOMEM) if the allocation fails. */
+struct lustre_idmap_table *lustre_idmap_init(void)
+{
+	struct lustre_idmap_table *t;
+	int map, slot;
+
+	OBD_ALLOC_PTR(t);
+	if (unlikely(!t))
+		return ERR_PTR(-ENOMEM);
+
+	spin_lock_init(&t->lit_lock);
+	for (map = 0; map < ARRAY_SIZE(t->lit_idmaps); map++) {
+		for (slot = 0; slot < ARRAY_SIZE(t->lit_idmaps[map]); slot++)
+			INIT_LIST_HEAD(&t->lit_idmaps[map][slot]);
+	}
+
+	return t;
+}
+EXPORT_SYMBOL(lustre_idmap_init);
+
+/* Destroy an idmap table.
+ * Every entry reachable through the remote-uid hash is freed under
+ * lit_lock (idmap_entry_free() also unlinks it from the other three
+ * hashes), then the table itself is freed. */
+void lustre_idmap_fini(struct lustre_idmap_table *t)
+{
+	struct list_head *buckets;
+	struct lustre_idmap_entry *ent;
+	int i;
+
+	LASSERT(t);
+
+	buckets = t->lit_idmaps[RMT_UIDMAP_IDX];
+	spin_lock(&t->lit_lock);
+	for (i = 0; i < CFS_IDMAP_HASHSIZE; i++) {
+		struct list_head *head = &buckets[i];
+
+		/* keep popping the first entry until the bucket drains */
+		while (!list_empty(head)) {
+			ent = list_entry(head->next,
+					 struct lustre_idmap_entry,
+					 lie_rmt_uid_hash);
+			idmap_entry_free(ent);
+		}
+	}
+	spin_unlock(&t->lit_lock);
+
+	OBD_FREE_PTR(t);
+}
+EXPORT_SYMBOL(lustre_idmap_fini);

diff --git a/drivers/staging/lustre/lustre/obdclass/linkea.c b/drivers/staging/lustre/lustre/obdclass/linkea.c
new file mode 100644
index 0000000..b5c19ac
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/linkea.c

@@ -0,0 +1,194 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2013, Intel Corporation.
+ * Use is subject to license terms.
+ *
+ * Author: Di Wang <di.wang@intel.com>
+ */
+
+#include <lustre/lustre_idl.h>
+#include <obd.h>
+#include <lustre_linkea.h>
+
+/**
+ * Attach \a buf to \a ldata and write an empty link EA header into it.
+ *
+ * The buffer is obtained through lu_buf_check_and_alloc() with a size of
+ * PAGE_CACHE_SIZE.  The header gets LINK_EA_MAGIC, a length equal to the
+ * header itself and a record count of zero.
+ *
+ * \retval 0 on success
+ * \retval -ENOMEM if the buffer has no backing memory
+ */
+int linkea_data_new(struct linkea_data *ldata, struct lu_buf *buf)
+{
+	struct link_ea_header *hdr;
+
+	ldata->ld_buf = lu_buf_check_and_alloc(buf, PAGE_CACHE_SIZE);
+	hdr = ldata->ld_buf->lb_buf;
+	if (hdr == NULL)
+		return -ENOMEM;
+
+	ldata->ld_leh = hdr;
+	hdr->leh_magic = LINK_EA_MAGIC;
+	hdr->leh_len = sizeof(struct link_ea_header);
+	hdr->leh_reccount = 0;
+	return 0;
+}
+EXPORT_SYMBOL(linkea_data_new);
+
+/**
+ * Validate the link EA header held in ldata->ld_buf and publish it as
+ * ldata->ld_leh.
+ *
+ * A header whose magic is the byte-swapped LINK_EA_MAGIC is converted in
+ * place (magic, record count and length); the records themselves are
+ * swabbed later by linkea_entry_unpack().
+ *
+ * \retval 0 header is valid and holds at least one record
+ * \retval -EINVAL magic does not match
+ * \retval -ENODATA header is valid but the record count is zero
+ */
+int linkea_init(struct linkea_data *ldata)
+{
+	struct link_ea_header *hdr;
+
+	LASSERT(ldata->ld_buf != NULL);
+	hdr = ldata->ld_buf->lb_buf;
+
+	if (hdr->leh_magic == __swab32(LINK_EA_MAGIC)) {
+		/* written with the opposite byte order: fix the header up */
+		hdr->leh_magic = LINK_EA_MAGIC;
+		hdr->leh_reccount = __swab32(hdr->leh_reccount);
+		hdr->leh_len = __swab64(hdr->leh_len);
+	} else if (hdr->leh_magic != LINK_EA_MAGIC) {
+		return -EINVAL;
+	}
+
+	if (hdr->leh_reccount == 0)
+		return -ENODATA;
+
+	ldata->ld_leh = hdr;
+	return 0;
+}
+EXPORT_SYMBOL(linkea_init);
+
+/**
+ * Serialise one link record into \a lee.
+ *
+ * Fields are copied byte-wise so that the record needs no particular
+ * alignment.  The parent fid is stored big-endian and the record length
+ * is stored as two bytes, most significant first.
+ *
+ * \retval length of the packed record (entry header plus name)
+ */
+static int linkea_entry_pack(struct link_ea_entry *lee,
+			     const struct lu_name *lname,
+			     const struct lu_fid *pfid)
+{
+	struct lu_fid be_fid;
+	int len;
+
+	fid_cpu_to_be(&be_fid, pfid);
+	/* fault injection: deliberately corrupt the stored fid version */
+	if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_LINKEA_CRASH))
+		be_fid.f_ver = ~0;
+
+	memcpy(&lee->lee_parent_fid, &be_fid, sizeof(be_fid));
+	memcpy(lee->lee_name, lname->ln_name, lname->ln_namelen);
+
+	len = sizeof(struct link_ea_entry) + lname->ln_namelen;
+	lee->lee_reclen[0] = (len >> 8) & 0xff;
+	lee->lee_reclen[1] = len & 0xff;
+
+	return len;
+}
+
+/**
+ * Decode one packed link record.
+ *
+ * \param lee    packed record
+ * \param reclen out: record length read from the two length bytes
+ * \param lname  out: name; ln_name points into \a lee, nothing is copied
+ * \param pfid   out: parent fid converted to CPU byte order
+ */
+void linkea_entry_unpack(const struct link_ea_entry *lee, int *reclen,
+			 struct lu_name *lname, struct lu_fid *pfid)
+{
+	int len = (lee->lee_reclen[0] << 8) | lee->lee_reclen[1];
+
+	*reclen = len;
+
+	memcpy(pfid, &lee->lee_parent_fid, sizeof(*pfid));
+	fid_be_to_cpu(pfid, pfid);
+
+	/* the name occupies whatever follows the fixed entry header */
+	lname->ln_name = lee->lee_name;
+	lname->ln_namelen = len - sizeof(struct link_ea_entry);
+}
+EXPORT_SYMBOL(linkea_entry_unpack);
+
+/**
+ * Append a (name, parent fid) record at the end of the link EA buffer.
+ *
+ * The buffer is grown with lu_buf_check_and_grow() when the new record
+ * does not fit.  The header pointer is re-read from the buffer before
+ * the record is packed.
+ *
+ * \retval 0 on success
+ * \retval -EINVAL if \a lname or \a pfid is NULL
+ * \retval -ENOMEM if the buffer could not be grown
+ **/
+int linkea_add_buf(struct linkea_data *ldata, const struct lu_name *lname,
+		   const struct lu_fid *pfid)
+{
+	LASSERT(ldata->ld_leh != NULL);
+
+	if (lname == NULL || pfid == NULL)
+		return -EINVAL;
+
+	ldata->ld_reclen = lname->ln_namelen + sizeof(struct link_ea_entry);
+
+	/* grow only when the record would overrun the current buffer */
+	if (ldata->ld_leh->leh_len + ldata->ld_reclen >
+	    ldata->ld_buf->lb_len &&
+	    lu_buf_check_and_grow(ldata->ld_buf,
+				  ldata->ld_leh->leh_len +
+				  ldata->ld_reclen) < 0)
+		return -ENOMEM;
+
+	ldata->ld_leh = ldata->ld_buf->lb_buf;
+	ldata->ld_lee = ldata->ld_buf->lb_buf + ldata->ld_leh->leh_len;
+	ldata->ld_reclen = linkea_entry_pack(ldata->ld_lee, lname, pfid);
+
+	ldata->ld_leh->leh_len += ldata->ld_reclen;
+	ldata->ld_leh->leh_reccount++;
+
+	CDEBUG(D_INODE, "New link_ea name '%.*s' is added\n",
+	       lname->ln_namelen, lname->ln_name);
+	return 0;
+}
+EXPORT_SYMBOL(linkea_add_buf);
+
+/**
+ * Remove the current record (ldata->ld_lee, ldata->ld_reclen bytes long)
+ * from the link EA buffer.
+ *
+ * The header count and length are reduced first; the bytes that followed
+ * the record are then moved down over it.  \a lname is used for the
+ * debug message only.
+ */
+void linkea_del_buf(struct linkea_data *ldata, const struct lu_name *lname)
+{
+	struct link_ea_header *hdr;
+	char *rec;
+
+	LASSERT(ldata->ld_leh != NULL && ldata->ld_lee != NULL);
+
+	hdr = ldata->ld_leh;
+	rec = (char *)ldata->ld_lee;
+
+	hdr->leh_reccount--;
+	hdr->leh_len -= ldata->ld_reclen;
+
+	/* tail size = new end of data minus start of the removed record */
+	memmove(rec, rec + ldata->ld_reclen,
+		(char *)hdr + hdr->leh_len - rec);
+
+	CDEBUG(D_INODE, "Old link_ea name '%.*s' is removed\n",
+	       lname->ln_namelen, lname->ln_name);
+}
+EXPORT_SYMBOL(linkea_del_buf);
+
+/**
+ * Search the link EA for a record matching both name and parent fid.
+ *
+ * On a match ldata->ld_lee and ldata->ld_reclen describe the record that
+ * was found; on a miss ldata->ld_lee is reset to NULL.
+ *
+ * \param ldata link data the search is done on
+ * \param lname name in the parent's directory entry pointing to this object
+ * \param pfid  parent fid the link is looked up for
+ *
+ * \retval 0 record found
+ * \retval -ENOENT no such link
+ */
+int linkea_links_find(struct linkea_data *ldata, const struct lu_name *lname,
+		      const struct lu_fid  *pfid)
+{
+	struct lu_name cur_name;
+	struct lu_fid  cur_fid;
+	int idx;
+
+	LASSERT(ldata->ld_leh != NULL);
+
+	/* the first record sits immediately behind the header */
+	ldata->ld_lee = (struct link_ea_entry *)(ldata->ld_leh + 1);
+
+	for (idx = 0; idx < ldata->ld_leh->leh_reccount; idx++) {
+		linkea_entry_unpack(ldata->ld_lee, &ldata->ld_reclen,
+				    &cur_name, &cur_fid);
+		if (cur_name.ln_namelen == lname->ln_namelen &&
+		    lu_fid_eq(&cur_fid, pfid) &&
+		    (strncmp(cur_name.ln_name, lname->ln_name,
+			     cur_name.ln_namelen) == 0))
+			return 0;
+
+		/* step over this record to the next one */
+		ldata->ld_lee = (struct link_ea_entry *)((char *)ldata->ld_lee +
+							 ldata->ld_reclen);
+	}
+
+	CDEBUG(D_INODE, "Old link_ea name '%.*s' not found\n",
+	       lname->ln_namelen, lname->ln_name);
+	ldata->ld_lee = NULL;
+	return -ENOENT;
+}
+EXPORT_SYMBOL(linkea_links_find);

diff --git a/drivers/staging/lustre/lustre/obdclass/linux/linux-module.c b/drivers/staging/lustre/lustre/obdclass/linux/linux-module.c
new file mode 100644
index 0000000..d2c3072
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/linux/linux-module.c

@@ -0,0 +1,408 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/linux/linux-module.c
+ *
+ * Object Devices Class Driver
+ * These are the only exported functions, they provide some generic
+ * infrastructure for managing object devices
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/major.h>
+#include <linux/sched.h>
+#include <linux/lp.h>
+#include <linux/slab.h>
+#include <linux/ioport.h>
+#include <linux/fcntl.h>
+#include <linux/delay.h>
+#include <linux/skbuff.h>
+#include <linux/proc_fs.h>
+#include <linux/fs.h>
+#include <linux/poll.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/highmem.h>
+#include <asm/io.h>
+#include <asm/ioctls.h>
+#include <asm/poll.h>
+#include <asm/uaccess.h>
+#include <linux/miscdevice.h>
+#include <linux/seq_file.h>
+
+#include <linux/libcfs/libcfs.h>
+#include <obd_support.h>
+#include <obd_class.h>
+#include <linux/lnet/lnetctl.h>
+#include <lprocfs_status.h>
+#include <lustre_ver.h>
+#include <lustre/lustre_build_version.h>
+
+int proc_version;
+
+/**
+ * Copy an obd ioctl request from user space into a freshly allocated
+ * kernel buffer and fix up its inline buffer pointers.
+ *
+ * The header is read first and checked for version and length; the whole
+ * request of hdr.ioc_len bytes is then read into *buf.  On success the
+ * caller owns *buf (of *len bytes) and has to release it.
+ *
+ * buffer MUST be at least the size of obd_ioctl_hdr
+ *
+ * \param buf out: allocated copy of the request
+ * \param len out: size of that copy
+ * \param arg user-space address of the request
+ *
+ * \retval 0 on success
+ * \retval -EFAULT the user buffer could not be read
+ * \retval -EINVAL bad version, bad length or malformed request
+ * \retval -ENOMEM the kernel copy could not be allocated
+ */
+int obd_ioctl_getdata(char **buf, int *len, void *arg)
+{
+	struct obd_ioctl_hdr hdr;
+	struct obd_ioctl_data *data;
+	int offset = 0;
+	ENTRY;
+
+	/* copy_from_user() returns the number of bytes NOT copied, which
+	 * must not leak out as a (positive) return code */
+	if (copy_from_user(&hdr, (void *)arg, sizeof(hdr)))
+		RETURN(-EFAULT);
+
+	if (hdr.ioc_version != OBD_IOCTL_VERSION) {
+		CERROR("Version mismatch kernel (%x) vs application (%x)\n",
+		       OBD_IOCTL_VERSION, hdr.ioc_version);
+		RETURN(-EINVAL);
+	}
+
+	if (hdr.ioc_len > OBD_MAX_IOCTL_BUFFER) {
+		CERROR("User buffer len %d exceeds %d max buffer\n",
+		       hdr.ioc_len, OBD_MAX_IOCTL_BUFFER);
+		RETURN(-EINVAL);
+	}
+
+	if (hdr.ioc_len < sizeof(struct obd_ioctl_data)) {
+		CERROR("User buffer too small for ioctl (%d)\n", hdr.ioc_len);
+		RETURN(-EINVAL);
+	}
+
+	/* When there are lots of processes calling vmalloc on multi-core
+	 * system, the high lock contention will hurt performance badly,
+	 * obdfilter-survey is an example, which relies on ioctl. So we'd
+	 * better avoid vmalloc on ioctl path. LU-66 */
+	OBD_ALLOC_LARGE(*buf, hdr.ioc_len);
+	if (*buf == NULL) {
+		CERROR("Cannot allocate control buffer of len %d\n",
+		       hdr.ioc_len);
+		RETURN(-ENOMEM);
+	}
+	*len = hdr.ioc_len;
+	data = (struct obd_ioctl_data *)*buf;
+
+	if (copy_from_user(*buf, (void *)arg, hdr.ioc_len)) {
+		OBD_FREE_LARGE(*buf, hdr.ioc_len);
+		RETURN(-EFAULT);
+	}
+
+	/* The request was fetched from user space twice; refuse it if the
+	 * length changed after the header had been validated */
+	if (data->ioc_len != hdr.ioc_len) {
+		CERROR("ioctl length changed between reads (%d vs %d)\n",
+		       data->ioc_len, hdr.ioc_len);
+		OBD_FREE_LARGE(*buf, hdr.ioc_len);
+		RETURN(-EINVAL);
+	}
+
+	if (obd_ioctl_is_invalid(data)) {
+		CERROR("ioctl not correctly formatted\n");
+		OBD_FREE_LARGE(*buf, hdr.ioc_len);
+		RETURN(-EINVAL);
+	}
+
+	/* Point each non-empty inline buffer at its slot inside ioc_bulk;
+	 * the slots are laid out back to back, each rounded up by
+	 * cfs_size_round() */
+	if (data->ioc_inllen1) {
+		data->ioc_inlbuf1 = &data->ioc_bulk[0];
+		offset += cfs_size_round(data->ioc_inllen1);
+	}
+
+	if (data->ioc_inllen2) {
+		data->ioc_inlbuf2 = &data->ioc_bulk[0] + offset;
+		offset += cfs_size_round(data->ioc_inllen2);
+	}
+
+	if (data->ioc_inllen3) {
+		data->ioc_inlbuf3 = &data->ioc_bulk[0] + offset;
+		offset += cfs_size_round(data->ioc_inllen3);
+	}
+
+	if (data->ioc_inllen4) {
+		data->ioc_inlbuf4 = &data->ioc_bulk[0] + offset;
+	}
+
+	EXIT;
+	return 0;
+}
+EXPORT_SYMBOL(obd_ioctl_getdata);
+
+/**
+ * Copy \a len bytes of ioctl reply \a data back to user address \a arg.
+ *
+ * \retval 0 on success
+ * \retval -EFAULT if copy_to_user() left any byte uncopied
+ */
+int obd_ioctl_popdata(void *arg, void *data, int len)
+{
+	if (copy_to_user(arg, data, len))
+		return -EFAULT;
+
+	return 0;
+}
+EXPORT_SYMBOL(obd_ioctl_popdata);
+
+/*
+ * opening /dev/obd
+ *
+ * Takes a module reference that obd_class_release() drops again.  The
+ * open is refused when the reference cannot be taken, so that release
+ * never puts a reference that open did not get.
+ */
+static int obd_class_open(struct inode * inode, struct file * file)
+{
+	ENTRY;
+
+	if (!try_module_get(THIS_MODULE))
+		RETURN(-ENODEV);
+	RETURN(0);
+}
+
+/*
+ * closing /dev/obd: give back the module reference taken at open time.
+ * Always reports success.
+ */
+static int obd_class_release(struct inode * inode, struct file * file)
+{
+	ENTRY;
+	module_put(THIS_MODULE);
+
+	RETURN(0);
+}
+
+/*
+ * ioctl entry point of /dev/obd.
+ *
+ * Callers without CFS_CAP_SYS_ADMIN are rejected with -EACCES, except
+ * for OBD_IOC_PING_TARGET (used by lfs check).  Commands in the 'T'
+ * (tty) ioctl group get -ENOTTY.  Everything else is handed to
+ * class_handle_ioctl() and its result is returned.
+ */
+static long obd_class_ioctl(struct file *filp, unsigned int cmd,
+			    unsigned long arg)
+{
+	int rc;
+	ENTRY;
+
+	/* Allow non-root access for OBD_IOC_PING_TARGET - used by lfs check */
+	if (!cfs_capable(CFS_CAP_SYS_ADMIN) && (cmd != OBD_IOC_PING_TARGET))
+		RETURN(-EACCES);
+
+	/* ignore all tty ioctls */
+	if ((cmd & 0xffffff00) == ((int)'T') << 8)
+		RETURN(-ENOTTY);
+
+	rc = class_handle_ioctl(cmd, (unsigned long)arg);
+	RETURN(rc);
+}
+
+/*
+ * File operations of the obd control device.  The table is never
+ * modified after build time, hence const.
+ */
+static const struct file_operations obd_psdev_fops = {
+	.owner		= THIS_MODULE,
+	.unlocked_ioctl	= obd_class_ioctl,
+	.open		= obd_class_open,
+	.release	= obd_class_release,
+};
+
+/*
+ * modules setup
+ *
+ * Device descriptor tying OBD_DEV_MINOR and OBD_DEV_NAME to
+ * obd_psdev_fops.
+ * NOTE(review): psdev_t is presumably the misc-device type and the
+ * registration happens outside this file - confirm.
+ */
+psdev_t obd_psdev = {
+	.minor = OBD_DEV_MINOR,
+	.name  = OBD_DEV_NAME,
+	.fops  = &obd_psdev_fops,
+};
+
+
+#ifdef LPROCFS
+/*
+ * seq_file show handler for the "version" entry: prints the Lustre
+ * version string, the fixed kernel flavour "patchless_client" and the
+ * build version.  Returns whatever seq_printf() returns.
+ */
+int obd_proc_version_seq_show(struct seq_file *m, void *v)
+{
+	int rc;
+
+	rc = seq_printf(m, "lustre: %s\nkernel: %s\nbuild:  %s\n",
+			LUSTRE_VERSION_STRING, "patchless_client",
+			BUILD_VERSION);
+	return rc;
+}
+LPROC_SEQ_FOPS_RO(obd_proc_version);
+
+/*
+ * seq_file show handler for the "pinger" entry: always reports "on".
+ */
+int obd_proc_pinger_seq_show(struct seq_file *m, void *v)
+{
+	int rc;
+
+	rc = seq_printf(m, "%s\n", "on");
+	return rc;
+}
+LPROC_SEQ_FOPS_RO(obd_proc_pinger);
+
+/*
+ * seq_file show handler for the "health_check" entry.
+ *
+ * Prints "LBUG" if libcfs_catastrophe is set, then runs
+ * obd_health_check() on every device that is attached, set up and not
+ * stopping.  obd_dev_lock is dropped around each check while a device
+ * reference is held.  One line is printed per unhealthy device, followed
+ * by "healthy" or "NOT HEALTHY".
+ */
+static int obd_proc_health_seq_show(struct seq_file *m, void *v)
+{
+	int unhealthy = 0;
+	int devno;
+
+	if (libcfs_catastrophe)
+		seq_printf(m, "LBUG\n");
+
+	read_lock(&obd_dev_lock);
+	for (devno = 0; devno < class_devno_max(); devno++) {
+		struct obd_device *obd = class_num2obd(devno);
+
+		if (obd == NULL || !obd->obd_attached || !obd->obd_set_up)
+			continue;
+
+		LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC);
+		if (obd->obd_stopping)
+			continue;
+
+		/* pin the device so the lock can be released for the check */
+		class_incref(obd, __FUNCTION__, current);
+		read_unlock(&obd_dev_lock);
+
+		if (obd_health_check(NULL, obd)) {
+			seq_printf(m, "device %s reported unhealthy\n",
+				      obd->obd_name);
+			unhealthy++;
+		}
+
+		class_decref(obd, __FUNCTION__, current);
+		read_lock(&obd_dev_lock);
+	}
+	read_unlock(&obd_dev_lock);
+
+	if (unhealthy != 0) {
+		seq_printf(m, "NOT HEALTHY\n");
+		return 0;
+	}
+
+	return seq_printf(m, "healthy\n");
+}
+LPROC_SEQ_FOPS_RO(obd_proc_health);
+
+/*
+ * seq_file show handler for the "jobid_var" entry: prints the current
+ * contents of obd_jobid_var followed by a newline.
+ */
+static int obd_proc_jobid_var_seq_show(struct seq_file *m, void *v)
+{
+	int rc;
+
+	rc = seq_printf(m, "%s\n", obd_jobid_var);
+	return rc;
+}
+
+/*
+ * Write handler for the "jobid_var" entry: replaces obd_jobid_var with
+ * the written text, dropping one trailing newline if present.
+ *
+ * \a buffer is the address handed to the file write operation, i.e. a
+ * user-space pointer, so it is fetched with copy_from_user() into a
+ * local buffer before it is looked at.  obd_jobid_var is only touched
+ * once the copy has succeeded.
+ *
+ * Returns \a count on success, -EINVAL for an empty or over-long write
+ * and -EFAULT if the user buffer cannot be read.
+ */
+static ssize_t obd_proc_jobid_var_seq_write(struct file *file, const char *buffer,
+					size_t count, loff_t *off)
+{
+	char tmp[JOBSTATS_JOBID_VAR_MAX_LEN + 1];
+
+	if (!count || count > JOBSTATS_JOBID_VAR_MAX_LEN)
+		return -EINVAL;
+
+	if (copy_from_user(tmp, buffer, count))
+		return -EFAULT;
+
+	memset(obd_jobid_var, 0, JOBSTATS_JOBID_VAR_MAX_LEN + 1);
+	/* Trim the trailing '\n' if any */
+	memcpy(obd_jobid_var, tmp, count - (tmp[count - 1] == '\n'));
+	return count;
+}
+LPROC_SEQ_FOPS(obd_proc_jobid_var);
+
+/* Root for /proc/fs/lustre; assigned by class_procfs_init() */
+struct proc_dir_entry *proc_lustre_root = NULL;
+EXPORT_SYMBOL(proc_lustre_root);
+
+/*
+ * Entries passed to lprocfs_register() for the "fs/lustre" root; each
+ * row pairs a file name with its file operations.  The all-zero row
+ * ends the list.
+ */
+struct lprocfs_vars lprocfs_base[] = {
+	{ "version", &obd_proc_version_fops },
+	{ "pinger", &obd_proc_pinger_fops },
+	{ "health_check", &obd_proc_health_fops },
+	{ "jobid_var", &obd_proc_jobid_var_fops },
+	{ 0 }
+};
+#else
+#define lprocfs_base NULL
+#endif /* LPROCFS */
+
+/*
+ * seq_file start: the iterator cookie is the position itself, which is
+ * used as a device number.  Returns NULL once the position reaches
+ * class_devno_max().
+ */
+static void *obd_device_list_seq_start(struct seq_file *p, loff_t *pos)
+{
+	return (*pos < class_devno_max()) ? pos : NULL;
+}
+
+/* seq_file stop: start/next take no lock or reference, nothing to undo */
+static void obd_device_list_seq_stop(struct seq_file *p, void *v)
+{
+}
+
+/*
+ * seq_file next: advance the position by one device number and stop
+ * (NULL) once it reaches class_devno_max().
+ */
+static void *obd_device_list_seq_next(struct seq_file *p, void *v, loff_t *pos)
+{
+	*pos += 1;
+
+	return (*pos < class_devno_max()) ? pos : NULL;
+}
+
+/*
+ * seq_file show: print one line for the device whose number is stored
+ * in \a v - index, two-letter state, type name, device name, uuid and
+ * reference count.  Unused device numbers print nothing.
+ *
+ * State is the first that applies of: ST (stopping), IN (inactive),
+ * UP (set up), AT (attached), "--" otherwise.
+ */
+static int obd_device_list_seq_show(struct seq_file *p, void *v)
+{
+	loff_t index = *(loff_t *)v;
+	struct obd_device *obd = class_num2obd((int)index);
+	const char *state = "--";
+
+	if (obd == NULL)
+		return 0;
+
+	LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC);
+
+	if (obd->obd_stopping)
+		state = "ST";
+	else if (obd->obd_inactive)
+		state = "IN";
+	else if (obd->obd_set_up)
+		state = "UP";
+	else if (obd->obd_attached)
+		state = "AT";
+
+	return seq_printf(p, "%3d %s %s %s %s %d\n",
+			  (int)index, state, obd->obd_type->typ_name,
+			  obd->obd_name, obd->obd_uuid.uuid,
+			  atomic_read(&obd->obd_refcount));
+}
+
+/* seq_file iterator over device numbers, installed by obd_device_list_open() */
+struct seq_operations obd_device_list_sops = {
+	.start = obd_device_list_seq_start,
+	.stop = obd_device_list_seq_stop,
+	.next = obd_device_list_seq_next,
+	.show = obd_device_list_seq_show,
+};
+
+/*
+ * open handler of the "devices" file: set up the seq_file with
+ * obd_device_list_sops and store the proc entry's data pointer in its
+ * private field.  Returns 0 or the seq_open() error.
+ */
+static int obd_device_list_open(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq;
+	int rc;
+
+	rc = seq_open(file, &obd_device_list_sops);
+	if (rc != 0)
+		return rc;
+
+	/* seq_open() left the seq_file in file->private_data */
+	seq = file->private_data;
+	seq->private = PDE_DATA(inode);
+
+	return 0;
+}
+
+/*
+ * File operations of the "devices" file created in class_procfs_init():
+ * a read-only seq_file using the generic seq_read/seq_lseek/seq_release
+ * helpers.
+ */
+struct file_operations obd_device_list_fops = {
+	.owner   = THIS_MODULE,
+	.open    = obd_device_list_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release,
+};
+
+/*
+ * Register the sysctl table and create the "fs/lustre" proc root with
+ * its "devices" file.
+ *
+ * Proc setup stays best effort: failures are logged and 0 is returned
+ * either way.  If lprocfs_register() hands back an error pointer,
+ * proc_lustre_root is reset to NULL so that neither the "devices"
+ * creation below nor class_procfs_clean() ever uses it as a directory.
+ */
+int class_procfs_init(void)
+{
+	int rc;
+	ENTRY;
+
+	obd_sysctl_init();
+	proc_lustre_root = lprocfs_register("fs/lustre", NULL,
+					    lprocfs_base, NULL);
+	if (IS_ERR(proc_lustre_root)) {
+		rc = PTR_ERR(proc_lustre_root);
+		proc_lustre_root = NULL;
+		CERROR("error registering /proc/fs/lustre: rc = %d\n", rc);
+		RETURN(0);
+	}
+
+	rc = lprocfs_seq_create(proc_lustre_root, "devices", 0444,
+				&obd_device_list_fops, NULL);
+	if (rc)
+		CERROR("error adding /proc/fs/lustre/devices file\n");
+	RETURN(0);
+}
+
+/*
+ * Remove the proc root if one is set; lprocfs_remove() is given the
+ * address of proc_lustre_root.  Always returns 0.
+ */
+int class_procfs_clean(void)
+{
+	ENTRY;
+
+	if (proc_lustre_root != NULL)
+		lprocfs_remove(&proc_lustre_root);
+
+	RETURN(0);
+}

diff --git a/drivers/staging/lustre/lustre/obdclass/linux/linux-obdo.c b/drivers/staging/lustre/lustre/obdclass/linux/linux-obdo.c
new file mode 100644
index 0000000..6ee3471
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/linux/linux-obdo.c

@@ -0,0 +1,222 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/linux/linux-obdo.c
+ *
+ * Object Devices Class Driver
+ * These are the only exported functions, they provide some generic
+ * infrastructure for managing object devices
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/module.h>
+#include <obd_class.h>
+#include <lustre/lustre_idl.h>
+
+#include <linux/fs.h>
+#include <linux/pagemap.h> /* for PAGE_CACHE_SIZE */
+
+/*
+ * Copy the attributes selected by \a valid (LA_* bits) from \a la into
+ * \a dst and OR the matching OBD_MD_FL* bits into dst->o_valid.
+ *
+ * LA_TYPE replaces only the S_IFMT part of o_mode and LA_MODE only the
+ * S_IALLUGO part; the other part of o_mode is preserved in each case.
+ *
+ * FIXME: Just copy from obdo_from_inode
+ */
+void obdo_from_la(struct obdo *dst, struct lu_attr *la, __u64 valid)
+{
+	obd_flag copied = 0;
+
+	/* timestamps */
+	if (valid & LA_ATIME) {
+		dst->o_atime = la->la_atime;
+		copied |= OBD_MD_FLATIME;
+	}
+	if (valid & LA_MTIME) {
+		dst->o_mtime = la->la_mtime;
+		copied |= OBD_MD_FLMTIME;
+	}
+	if (valid & LA_CTIME) {
+		dst->o_ctime = la->la_ctime;
+		copied |= OBD_MD_FLCTIME;
+	}
+
+	/* size and allocated space (x512 bytes) */
+	if (valid & LA_SIZE) {
+		dst->o_size = la->la_size;
+		copied |= OBD_MD_FLSIZE;
+	}
+	if (valid & LA_BLOCKS) {
+		dst->o_blocks = la->la_blocks;
+		copied |= OBD_MD_FLBLOCKS;
+	}
+
+	/* file type and permission bits share o_mode */
+	if (valid & LA_TYPE) {
+		dst->o_mode = (dst->o_mode & S_IALLUGO) |
+			      (la->la_mode & S_IFMT);
+		copied |= OBD_MD_FLTYPE;
+	}
+	if (valid & LA_MODE) {
+		dst->o_mode = (dst->o_mode & S_IFMT) |
+			      (la->la_mode & S_IALLUGO);
+		copied |= OBD_MD_FLMODE;
+	}
+
+	/* ownership */
+	if (valid & LA_UID) {
+		dst->o_uid = la->la_uid;
+		copied |= OBD_MD_FLUID;
+	}
+	if (valid & LA_GID) {
+		dst->o_gid = la->la_gid;
+		copied |= OBD_MD_FLGID;
+	}
+
+	dst->o_valid |= copied;
+}
+EXPORT_SYMBOL(obdo_from_la);
+
+/*
+ * Copy the attributes selected by \a valid (OBD_MD_FL* bits) from
+ * \a obdo into \a dst.  Only bits that are also set in obdo->o_valid
+ * are honoured.  dst->la_valid is overwritten (not OR-ed) with the LA_*
+ * bits of what was actually copied.
+ *
+ * OBD_MD_FLTYPE replaces only the S_IFMT part of la_mode and
+ * OBD_MD_FLMODE only the S_IALLUGO part.
+ *
+ * FIXME: Just copy from obdo_from_inode
+ */
+void la_from_obdo(struct lu_attr *dst, struct obdo *obdo, obd_flag valid)
+{
+	__u64 copied = 0;
+
+	/* ignore anything the obdo does not actually carry */
+	valid &= obdo->o_valid;
+
+	/* timestamps */
+	if (valid & OBD_MD_FLATIME) {
+		dst->la_atime = obdo->o_atime;
+		copied |= LA_ATIME;
+	}
+	if (valid & OBD_MD_FLMTIME) {
+		dst->la_mtime = obdo->o_mtime;
+		copied |= LA_MTIME;
+	}
+	if (valid & OBD_MD_FLCTIME) {
+		dst->la_ctime = obdo->o_ctime;
+		copied |= LA_CTIME;
+	}
+
+	/* size and allocated space */
+	if (valid & OBD_MD_FLSIZE) {
+		dst->la_size = obdo->o_size;
+		copied |= LA_SIZE;
+	}
+	if (valid & OBD_MD_FLBLOCKS) {
+		dst->la_blocks = obdo->o_blocks;
+		copied |= LA_BLOCKS;
+	}
+
+	/* file type and permission bits share la_mode */
+	if (valid & OBD_MD_FLTYPE) {
+		dst->la_mode = (dst->la_mode & S_IALLUGO) |
+			       (obdo->o_mode & S_IFMT);
+		copied |= LA_TYPE;
+	}
+	if (valid & OBD_MD_FLMODE) {
+		dst->la_mode = (dst->la_mode & S_IFMT) |
+			       (obdo->o_mode & S_IALLUGO);
+		copied |= LA_MODE;
+	}
+
+	/* ownership */
+	if (valid & OBD_MD_FLUID) {
+		dst->la_uid = obdo->o_uid;
+		copied |= LA_UID;
+	}
+	if (valid & OBD_MD_FLGID) {
+		dst->la_gid = obdo->o_gid;
+		copied |= LA_GID;
+	}
+
+	dst->la_valid = copied;
+}
+EXPORT_SYMBOL(la_from_obdo);
+
+/*
+ * Refresh inode \a dst from \a src, only ever moving values forward.
+ *
+ * Fields are considered only if selected by \a valid and present in
+ * src->o_valid.  Timestamps, the block size and the block count are
+ * updated only when the obdo value is larger than the inode's current
+ * one; the size is written unconditionally.  i_blkbits is never left
+ * below PAGE_CACHE_SHIFT.
+ */
+void obdo_refresh_inode(struct inode *dst, struct obdo *src, obd_flag valid)
+{
+	/* restrict the request to what the obdo actually carries */
+	valid &= src->o_valid;
+
+	if (valid & (OBD_MD_FLCTIME | OBD_MD_FLMTIME))
+		CDEBUG(D_INODE,
+		       "valid "LPX64", cur time %lu/%lu, new "LPU64"/"LPU64"\n",
+		       src->o_valid, LTIME_S(dst->i_mtime),
+		       LTIME_S(dst->i_ctime), src->o_mtime, src->o_ctime);
+
+	/* timestamps only move forward */
+	if (valid & OBD_MD_FLATIME && src->o_atime > LTIME_S(dst->i_atime))
+		LTIME_S(dst->i_atime) = src->o_atime;
+	if (valid & OBD_MD_FLMTIME && src->o_mtime > LTIME_S(dst->i_mtime))
+		LTIME_S(dst->i_mtime) = src->o_mtime;
+	if (valid & OBD_MD_FLCTIME && src->o_ctime > LTIME_S(dst->i_ctime))
+		LTIME_S(dst->i_ctime) = src->o_ctime;
+	if (valid & OBD_MD_FLSIZE)
+		i_size_write(dst, src->o_size);
+	/* optimum IO size */
+	if (valid & OBD_MD_FLBLKSZ && src->o_blksize > (1 << dst->i_blkbits))
+		dst->i_blkbits = ffs(src->o_blksize) - 1;
+
+	/* applied whether or not OBD_MD_FLBLKSZ was requested */
+	if (dst->i_blkbits < PAGE_CACHE_SHIFT)
+		dst->i_blkbits = PAGE_CACHE_SHIFT;
+
+	/* allocation of space */
+	if (valid & OBD_MD_FLBLOCKS && src->o_blocks > dst->i_blocks)
+		/*
+		 * XXX shouldn't overflow be checked here like in
+		 * obdo_to_inode().
+		 */
+		dst->i_blocks = src->o_blocks;
+}
+EXPORT_SYMBOL(obdo_refresh_inode);
+
+/*
+ * Copy the fields selected by \a valid (and present in src->o_valid)
+ * from \a src into inode \a dst.
+ *
+ * Unlike obdo_refresh_inode(), atime, mtime, blocks and block size are
+ * overwritten unconditionally; only ctime is kept monotonic.  Asking
+ * for the type, generation, fid, id or group bits trips the assertion.
+ */
+void obdo_to_inode(struct inode *dst, struct obdo *src, obd_flag valid)
+{
+	/* restrict the request to what the obdo actually carries */
+	valid &= src->o_valid;
+
+	LASSERTF(!(valid & (OBD_MD_FLTYPE | OBD_MD_FLGENER | OBD_MD_FLFID |
+			    OBD_MD_FLID | OBD_MD_FLGROUP)),
+		 "object "DOSTID", valid %x\n", POSTID(&src->o_oi), valid);
+
+	if (valid & (OBD_MD_FLCTIME | OBD_MD_FLMTIME))
+		CDEBUG(D_INODE,
+		       "valid "LPX64", cur time %lu/%lu, new "LPU64"/"LPU64"\n",
+		       src->o_valid, LTIME_S(dst->i_mtime),
+		       LTIME_S(dst->i_ctime), src->o_mtime, src->o_ctime);
+
+	if (valid & OBD_MD_FLATIME)
+		LTIME_S(dst->i_atime) = src->o_atime;
+	if (valid & OBD_MD_FLMTIME)
+		LTIME_S(dst->i_mtime) = src->o_mtime;
+	/* ctime is the only timestamp that is not allowed to go back */
+	if (valid & OBD_MD_FLCTIME && src->o_ctime > LTIME_S(dst->i_ctime))
+		LTIME_S(dst->i_ctime) = src->o_ctime;
+	if (valid & OBD_MD_FLSIZE)
+		i_size_write(dst, src->o_size);
+	if (valid & OBD_MD_FLBLOCKS) { /* allocation of space */
+		dst->i_blocks = src->o_blocks;
+		/* the stored value came out smaller than the source, i.e.
+		 * it was truncated: store -1 instead */
+		if (dst->i_blocks < src->o_blocks) /* overflow */
+			dst->i_blocks = -1;
+
+	}
+	if (valid & OBD_MD_FLBLKSZ)
+		dst->i_blkbits = ffs(src->o_blksize)-1;
+	/* keep the inode's file type, take every other mode bit from src */
+	if (valid & OBD_MD_FLMODE)
+		dst->i_mode = (dst->i_mode & S_IFMT) | (src->o_mode & ~S_IFMT);
+	if (valid & OBD_MD_FLUID)
+		dst->i_uid = src->o_uid;
+	if (valid & OBD_MD_FLGID)
+		dst->i_gid = src->o_gid;
+	if (valid & OBD_MD_FLFLAGS)
+		dst->i_flags = src->o_flags;
+}
+EXPORT_SYMBOL(obdo_to_inode);

diff --git a/drivers/staging/lustre/lustre/obdclass/linux/linux-sysctl.c b/drivers/staging/lustre/lustre/obdclass/linux/linux-sysctl.c
new file mode 100644
index 0000000..46aad68
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/linux/linux-sysctl.c

@@ -0,0 +1,445 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#include <linux/module.h>
+#include <linux/sysctl.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/sysctl.h>
+#include <linux/version.h>
+#include <linux/proc_fs.h>
+#include <linux/slab.h>
+#include <linux/stat.h>
+#include <linux/ctype.h>
+#include <asm/bitops.h>
+#include <asm/uaccess.h>
+#include <linux/utsname.h>
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <obd_support.h>
+#include <lprocfs_status.h>
+
+#ifdef CONFIG_SYSCTL
+/* Handle of the registered sysctl table; NULL while nothing is registered */
+ctl_table_header_t *obd_table_header = NULL;
+#endif
+
+
+#define OBD_SYSCTL 300
+
+/* Identifiers of the entries in obd_table[], passed to INIT_CTL_NAME() */
+enum {
+	OBD_TIMEOUT = 3,	/* RPC timeout before recovery/intr */
+	OBD_DUMP_ON_TIMEOUT,    /* dump kernel debug log upon timeout */
+	OBD_MEMUSED,	    /* bytes currently OBD_ALLOCated */
+	OBD_PAGESUSED,	  /* pages currently OBD_PAGE_ALLOCated */
+	OBD_MAXMEMUSED,	 /* maximum bytes OBD_ALLOCated concurrently */
+	OBD_MAXPAGESUSED,       /* maximum pages OBD_PAGE_ALLOCated concurrently */
+	OBD_SYNCFILTER,	 /* XXX temporary, as we play with sync osts.. */
+	OBD_LDLM_TIMEOUT,       /* LDLM timeout for ASTs before client eviction */
+	OBD_DUMP_ON_EVICTION,   /* dump kernel debug log upon eviction */
+	OBD_DEBUG_PEER_ON_TIMEOUT, /* dump peer debug when RPC times out */
+	OBD_ALLOC_FAIL_RATE,    /* memory allocation random failure rate */
+	OBD_MAX_DIRTY_PAGES,    /* maximum dirty pages */
+	OBD_AT_MIN,	     /* Adaptive timeouts params */
+	OBD_AT_MAX,
+	OBD_AT_EXTRA,
+	OBD_AT_EARLY_MARGIN,
+	OBD_AT_HISTORY,
+};
+
+
+/*
+ * Handler shared by the "timeout" and "ldlm_timeout" entries: lets
+ * ll_proc_dointvec() do the read or write, then makes sure ldlm_timeout
+ * stays below obd_timeout by resetting it to a third of obd_timeout
+ * (at least 1) when it is not.
+ */
+int LL_PROC_PROTO(proc_set_timeout)
+{
+	int rc = ll_proc_dointvec(table, write, filp, buffer, lenp, ppos);
+
+	if (ldlm_timeout >= obd_timeout)
+		ldlm_timeout = max(obd_timeout / 3, 1U);
+
+	return rc;
+}
+
+/*
+ * Read-only handler for "memused": prints obd_memory_sum() as a decimal
+ * line.  A zero-length request, or a read at a non-zero offset, returns
+ * no data; writes are rejected with -EINVAL.
+ */
+int LL_PROC_PROTO(proc_memory_alloc)
+{
+	char text[22];
+	int n;
+	DECLARE_LL_PROC_PPOS_DECL;
+
+	if (*lenp == 0 || (!write && *ppos != 0)) {
+		*lenp = 0;
+		return 0;
+	}
+	if (write)
+		return -EINVAL;
+
+	n = snprintf(text, sizeof(text), LPU64"\n", obd_memory_sum());
+	/* never hand out more than the caller asked for */
+	if (n > *lenp)
+		n = *lenp;
+	text[n] = '\0';
+	if (copy_to_user(buffer, text, n))
+		return -EFAULT;
+
+	*lenp = n;
+	*ppos += *lenp;
+	return 0;
+}
+
+/*
+ * Read-only handler for "pagesused": prints obd_pages_sum() as a decimal
+ * line.  A zero-length request, or a read at a non-zero offset, returns
+ * no data; writes are rejected with -EINVAL.
+ */
+int LL_PROC_PROTO(proc_pages_alloc)
+{
+	char text[22];
+	int n;
+	DECLARE_LL_PROC_PPOS_DECL;
+
+	if (*lenp == 0 || (!write && *ppos != 0)) {
+		*lenp = 0;
+		return 0;
+	}
+	if (write)
+		return -EINVAL;
+
+	n = snprintf(text, sizeof(text), LPU64"\n", obd_pages_sum());
+	/* never hand out more than the caller asked for */
+	if (n > *lenp)
+		n = *lenp;
+	text[n] = '\0';
+	if (copy_to_user(buffer, text, n))
+		return -EFAULT;
+
+	*lenp = n;
+	*ppos += *lenp;
+	return 0;
+}
+
+/*
+ * Read-only handler for "memused_max": prints obd_memory_max() as a
+ * decimal line.  A zero-length request, or a read at a non-zero offset,
+ * returns no data; writes are rejected with -EINVAL.
+ */
+int LL_PROC_PROTO(proc_mem_max)
+{
+	char text[22];
+	int n;
+	DECLARE_LL_PROC_PPOS_DECL;
+
+	if (*lenp == 0 || (!write && *ppos != 0)) {
+		*lenp = 0;
+		return 0;
+	}
+	if (write)
+		return -EINVAL;
+
+	n = snprintf(text, sizeof(text), LPU64"\n", obd_memory_max());
+	/* never hand out more than the caller asked for */
+	if (n > *lenp)
+		n = *lenp;
+	text[n] = '\0';
+	if (copy_to_user(buffer, text, n))
+		return -EFAULT;
+
+	*lenp = n;
+	*ppos += *lenp;
+	return 0;
+}
+
+/*
+ * Read-only handler for "pagesused_max": prints obd_pages_max() as a
+ * decimal line.  A zero-length request, or a read at a non-zero offset,
+ * returns no data; writes are rejected with -EINVAL.
+ */
+int LL_PROC_PROTO(proc_pages_max)
+{
+	char text[22];
+	int n;
+	DECLARE_LL_PROC_PPOS_DECL;
+
+	if (*lenp == 0 || (!write && *ppos != 0)) {
+		*lenp = 0;
+		return 0;
+	}
+	if (write)
+		return -EINVAL;
+
+	n = snprintf(text, sizeof(text), LPU64"\n", obd_pages_max());
+	/* never hand out more than the caller asked for */
+	if (n > *lenp)
+		n = *lenp;
+	text[n] = '\0';
+	if (copy_to_user(buffer, text, n))
+		return -EFAULT;
+
+	*lenp = n;
+	*ppos += *lenp;
+	return 0;
+}
+
+/*
+ * Handler for "max_dirty_mb".  The value is kept in pages in table->data
+ * and converted with a multiplier of 1 << (20 - PAGE_CACHE_SHIFT) by the
+ * lprocfs frac helpers.
+ *
+ * After a write obd_max_dirty_pages is clamped: values above 90% of
+ * num_physpages are cut back to that limit (with an error message) and
+ * values below the page equivalent of 4MB are raised to it.
+ */
+int LL_PROC_PROTO(proc_max_dirty_pages_in_mb)
+{
+	int rc = 0;
+	DECLARE_LL_PROC_PPOS_DECL;
+
+	if (!table->data || !table->maxlen || !*lenp || (*ppos && !write)) {
+		*lenp = 0;
+		return 0;
+	}
+
+	if (!write) {
+		char text[21];
+		int n;
+
+		n = lprocfs_read_frac_helper(text, sizeof(text),
+					     *(unsigned int*)table->data,
+					     1 << (20 - PAGE_CACHE_SHIFT));
+		if (n > *lenp)
+			n = *lenp;
+		text[n] = '\0';
+		if (copy_to_user(buffer, text, n))
+			return -EFAULT;
+		*lenp = n;
+	} else {
+		rc = lprocfs_write_frac_helper(buffer, *lenp,
+					       (unsigned int*)table->data,
+					       1 << (20 - PAGE_CACHE_SHIFT));
+		/* Don't allow them to let dirty pages exceed 90% of system
+		 * memory and set a hard minimum of 4MB. */
+		if (obd_max_dirty_pages > ((num_physpages / 10) * 9)) {
+			CERROR("Refusing to set max dirty pages to %u, which "
+			       "is more than 90%% of available RAM; setting "
+			       "to %lu\n", obd_max_dirty_pages,
+			       ((num_physpages / 10) * 9));
+			obd_max_dirty_pages = ((num_physpages / 10) * 9);
+		} else if (obd_max_dirty_pages < 4 << (20 - PAGE_CACHE_SHIFT)) {
+			obd_max_dirty_pages = 4 << (20 - PAGE_CACHE_SHIFT);
+		}
+	}
+
+	*ppos += *lenp;
+	return rc;
+}
+
+/*
+ * Handler for "alloc_fail_rate".  The value in table->data is read and
+ * written through the lprocfs frac helpers with OBD_ALLOC_FAIL_MULT as
+ * the multiplier.
+ */
+int LL_PROC_PROTO(proc_alloc_fail_rate)
+{
+	int rc = 0;
+	DECLARE_LL_PROC_PPOS_DECL;
+
+	if (!table->data || !table->maxlen || !*lenp || (*ppos && !write)) {
+		*lenp = 0;
+		return 0;
+	}
+
+	if (!write) {
+		char text[21];
+		int  n;
+
+		n = lprocfs_read_frac_helper(text, sizeof(text),
+					     *(unsigned int*)table->data,
+					     OBD_ALLOC_FAIL_MULT);
+		if (n > *lenp)
+			n = *lenp;
+		text[n] = '\0';
+		if (copy_to_user(buffer, text, n))
+			return -EFAULT;
+		*lenp = n;
+	} else {
+		rc = lprocfs_write_frac_helper(buffer, *lenp,
+					       (unsigned int*)table->data,
+					       OBD_ALLOC_FAIL_MULT);
+	}
+
+	*ppos += *lenp;
+	return rc;
+}
+
+/*
+ * Handlers for the adaptive timeout entries (at_min, at_max, at_extra,
+ * at_early_margin, at_history).  Each one simply forwards to
+ * ll_proc_dointvec() and returns its result.
+ */
+int LL_PROC_PROTO(proc_at_min)
+{
+	return ll_proc_dointvec(table, write, filp, buffer, lenp, ppos);
+}
+int LL_PROC_PROTO(proc_at_max)
+{
+	return ll_proc_dointvec(table, write, filp, buffer, lenp, ppos);
+}
+int LL_PROC_PROTO(proc_at_extra)
+{
+	return ll_proc_dointvec(table, write, filp, buffer, lenp, ppos);
+}
+int LL_PROC_PROTO(proc_at_early_margin)
+{
+	return ll_proc_dointvec(table, write, filp, buffer, lenp, ppos);
+}
+int LL_PROC_PROTO(proc_at_history)
+{
+	return ll_proc_dointvec(table, write, filp, buffer, lenp, ppos);
+}
+
+#ifdef CONFIG_SYSCTL
+/*
+ * Entries of the "lustre" sysctl directory (child of parent_table).
+ * Integer tunables are 0644 and point .data at their variable; the
+ * memory and page statistics are 0444, carry no .data and are produced
+ * entirely by their handlers.  The list ends with an INIT_CTL_NAME(0)
+ * entry.
+ */
+static ctl_table_t obd_table[] = {
+	{
+		INIT_CTL_NAME(OBD_TIMEOUT)
+		.procname = "timeout",
+		.data     = &obd_timeout,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_set_timeout
+	},
+	{
+		INIT_CTL_NAME(OBD_DEBUG_PEER_ON_TIMEOUT)
+		.procname = "debug_peer_on_timeout",
+		.data     = &obd_debug_peer_on_timeout,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec
+	},
+	{
+		INIT_CTL_NAME(OBD_DUMP_ON_TIMEOUT)
+		.procname = "dump_on_timeout",
+		.data     = &obd_dump_on_timeout,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec
+	},
+	{
+		INIT_CTL_NAME(OBD_DUMP_ON_EVICTION)
+		.procname = "dump_on_eviction",
+		.data     = &obd_dump_on_eviction,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec
+	},
+	{
+		INIT_CTL_NAME(OBD_MEMUSED)
+		.procname = "memused",
+		.data     = NULL,
+		.maxlen   = 0,
+		.mode     = 0444,
+		.proc_handler = &proc_memory_alloc
+	},
+	{
+		INIT_CTL_NAME(OBD_PAGESUSED)
+		.procname = "pagesused",
+		.data     = NULL,
+		.maxlen   = 0,
+		.mode     = 0444,
+		.proc_handler = &proc_pages_alloc
+	},
+	{
+		INIT_CTL_NAME(OBD_MAXMEMUSED)
+		.procname = "memused_max",
+		.data     = NULL,
+		.maxlen   = 0,
+		.mode     = 0444,
+		.proc_handler = &proc_mem_max
+	},
+	{
+		INIT_CTL_NAME(OBD_MAXPAGESUSED)
+		.procname = "pagesused_max",
+		.data     = NULL,
+		.maxlen   = 0,
+		.mode     = 0444,
+		.proc_handler = &proc_pages_max
+	},
+	{
+		INIT_CTL_NAME(OBD_LDLM_TIMEOUT)
+		.procname = "ldlm_timeout",
+		.data     = &ldlm_timeout,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_set_timeout
+	},
+	{
+		INIT_CTL_NAME(OBD_ALLOC_FAIL_RATE)
+		.procname = "alloc_fail_rate",
+		.data     = &obd_alloc_fail_rate,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_alloc_fail_rate
+	},
+	{
+		INIT_CTL_NAME(OBD_MAX_DIRTY_PAGES)
+		.procname = "max_dirty_mb",
+		.data     = &obd_max_dirty_pages,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_max_dirty_pages_in_mb
+	},
+	{
+		INIT_CTL_NAME(OBD_AT_MIN)
+		.procname = "at_min",
+		.data     = &at_min,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_at_min
+	},
+	{
+		INIT_CTL_NAME(OBD_AT_MAX)
+		.procname = "at_max",
+		.data     = &at_max,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_at_max
+	},
+	{
+		INIT_CTL_NAME(OBD_AT_EXTRA)
+		.procname = "at_extra",
+		.data     = &at_extra,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_at_extra
+	},
+	{
+		INIT_CTL_NAME(OBD_AT_EARLY_MARGIN)
+		.procname = "at_early_margin",
+		.data     = &at_early_margin,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_at_early_margin
+	},
+	{
+		INIT_CTL_NAME(OBD_AT_HISTORY)
+		.procname = "at_history",
+		.data     = &at_history,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_at_history
+	},
+	{       INIT_CTL_NAME(0)    }
+};
+
+/*
+ * Top-level sysctl table registered by obd_sysctl_init(): a single
+ * "lustre" directory (mode 0555) whose children are obd_table[].
+ */
+static ctl_table_t parent_table[] = {
+	{
+		INIT_CTL_NAME(OBD_SYSCTL)
+		.procname = "lustre",
+		.data     = NULL,
+		.maxlen   = 0,
+		.mode     = 0555,
+		.child    = obd_table
+	},
+	{       INIT_CTL_NAME(0)   }
+};
+#endif
+
+/*
+ * Register parent_table once; calling this again while a table header
+ * is already held does nothing.  Compiles to an empty function without
+ * CONFIG_SYSCTL.
+ */
+void obd_sysctl_init (void)
+{
+#ifdef CONFIG_SYSCTL
+	if (obd_table_header == NULL)
+		obd_table_header = cfs_register_sysctl_table(parent_table, 0);
+#endif
+}
+
+/*
+ * Unregister the sysctl table if one is registered and forget its
+ * header.  Compiles to an empty function without CONFIG_SYSCTL.
+ */
+void obd_sysctl_clean (void)
+{
+#ifdef CONFIG_SYSCTL
+	if (obd_table_header != NULL) {
+		unregister_sysctl_table(obd_table_header);
+		obd_table_header = NULL;
+	}
+#endif
+}

diff --git a/drivers/staging/lustre/lustre/obdclass/llog.c b/drivers/staging/lustre/lustre/obdclass/llog.c
new file mode 100644
index 0000000..b1d215e
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/llog.c

@@ -0,0 +1,966 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/llog.c
+ *
+ * OST<->MDS recovery logging infrastructure.
+ * Invariants in implementation:
+ * - we do not share logs among different OST<->MDS connections, so that
+ *   if an OST or MDS fails it need only look at log(s) relevant to itself
+ *
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ * Author: Alex Zhuravlev <bzzz@whamcloud.com>
+ * Author: Mikhail Pershin <tappro@whamcloud.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOG
+
+
+#include <obd_class.h>
+#include <lustre_log.h>
+#include "llog_internal.h"
+
+/*
+ * Allocate and initialise a fresh log or catalog handle.
+ * Used inside llog_open().
+ *
+ * Returns the new handle holding one reference, or ERR_PTR(-ENOMEM) when
+ * the allocation fails.  NULL is never returned, so callers must test the
+ * result with IS_ERR().
+ */
+struct llog_handle *llog_alloc_handle(void)
+{
+	struct llog_handle *handle;
+
+	OBD_ALLOC_PTR(handle);
+	if (!handle)
+		return ERR_PTR(-ENOMEM);
+
+	/* the creator owns the first reference */
+	atomic_set(&handle->lgh_refcount, 1);
+	INIT_LIST_HEAD(&handle->u.phd.phd_entry);
+	spin_lock_init(&handle->lgh_hdr_lock);
+	init_rwsem(&handle->lgh_lock);
+
+	return handle;
+}
+
+/*
+ * Release a llog handle together with its in-memory header, if one was
+ * attached by llog_init_handle().  Reached through llog_handle_put() when
+ * the last reference goes away, and directly from llog_open() on failure.
+ */
+void llog_free_handle(struct llog_handle *loghandle)
+{
+	struct llog_log_hdr *hdr;
+
+	LASSERT(loghandle != NULL);
+
+	hdr = loghandle->lgh_hdr;
+	/* no header: llog_init_handle() failed or was never called */
+	if (hdr != NULL) {
+		/* the handle must no longer be linked into any list */
+		if (hdr->llh_flags & LLOG_F_IS_PLAIN) {
+			LASSERT(list_empty(&loghandle->u.phd.phd_entry));
+		} else if (hdr->llh_flags & LLOG_F_IS_CAT) {
+			LASSERT(list_empty(&loghandle->u.chd.chd_head));
+		}
+		LASSERT(sizeof(*hdr) == LLOG_CHUNK_SIZE);
+		OBD_FREE(loghandle->lgh_hdr, LLOG_CHUNK_SIZE);
+	}
+	OBD_FREE_PTR(loghandle);
+}
+
+/* Take an additional reference on \a loghandle. */
+void llog_handle_get(struct llog_handle *loghandle)
+{
+	atomic_inc(&loghandle->lgh_refcount);
+}
+
+/*
+ * Drop one reference on \a loghandle; the handle is freed when the last
+ * reference is dropped.
+ */
+void llog_handle_put(struct llog_handle *loghandle)
+{
+	LASSERT(atomic_read(&loghandle->lgh_refcount) > 0);
+	if (!atomic_dec_and_test(&loghandle->lgh_refcount))
+		return;
+
+	llog_free_handle(loghandle);
+}
+
+/*
+ * Cancel the record with index \a index in the log \a loghandle by clearing
+ * its bit in the header bitmap and decrementing the record count.
+ *
+ * If the log is flagged LLOG_F_ZAP_WHEN_EMPTY, only the header record is
+ * left and the log has been filled up to its last index, the whole log is
+ * destroyed; otherwise the updated header is written back.
+ *
+ * When destroying the log or writing the header fails, the bitmap bit and
+ * the record count are restored before returning.
+ *
+ * returns negative on error; 0 if success; 1 if success & log destroyed
+ * (-EINVAL for index 0, which is the header; -ENOENT if the bit was
+ * already clear)
+ */
+int llog_cancel_rec(const struct lu_env *env, struct llog_handle *loghandle,
+		    int index)
+{
+	struct llog_log_hdr *llh = loghandle->lgh_hdr;
+	int rc = 0;
+	ENTRY;
+
+	CDEBUG(D_RPCTRACE, "Canceling %d in log "DOSTID"\n",
+	       index, POSTID(&loghandle->lgh_id.lgl_oi));
+
+	if (index == 0) {
+		CERROR("Can't cancel index 0 which is header\n");
+		RETURN(-EINVAL);
+	}
+
+	spin_lock(&loghandle->lgh_hdr_lock);
+	if (!ext2_clear_bit(index, llh->llh_bitmap)) {
+		spin_unlock(&loghandle->lgh_hdr_lock);
+		/* index is a signed int, print it as such */
+		CDEBUG(D_RPCTRACE, "Catalog index %d already clear?\n", index);
+		RETURN(-ENOENT);
+	}
+
+	llh->llh_count--;
+
+	/* llh_count == 1 means only the header record remains */
+	if ((llh->llh_flags & LLOG_F_ZAP_WHEN_EMPTY) &&
+	    (llh->llh_count == 1) &&
+	    (loghandle->lgh_last_idx == (LLOG_BITMAP_BYTES * 8) - 1)) {
+		/* drop the spinlock before calling into llog_destroy() */
+		spin_unlock(&loghandle->lgh_hdr_lock);
+		rc = llog_destroy(env, loghandle);
+		if (rc < 0) {
+			CERROR("%s: can't destroy empty llog #"DOSTID
+			       "#%08x: rc = %d\n",
+			       loghandle->lgh_ctxt->loc_obd->obd_name,
+			       POSTID(&loghandle->lgh_id.lgl_oi),
+			       loghandle->lgh_id.lgl_ogen, rc);
+			GOTO(out_err, rc);
+		}
+		RETURN(1);
+	}
+	spin_unlock(&loghandle->lgh_hdr_lock);
+
+	/* persist the updated bitmap and count */
+	rc = llog_write(env, loghandle, &llh->llh_hdr, NULL, 0, NULL, 0);
+	if (rc < 0) {
+		CERROR("%s: fail to write header for llog #"DOSTID
+		       "#%08x: rc = %d\n",
+		       loghandle->lgh_ctxt->loc_obd->obd_name,
+		       POSTID(&loghandle->lgh_id.lgl_oi),
+		       loghandle->lgh_id.lgl_ogen, rc);
+		GOTO(out_err, rc);
+	}
+	RETURN(0);
+out_err:
+	/* undo the in-memory cancellation */
+	spin_lock(&loghandle->lgh_hdr_lock);
+	ext2_set_bit(index, llh->llh_bitmap);
+	llh->llh_count++;
+	spin_unlock(&loghandle->lgh_hdr_lock);
+	/* pair the ENTRY above with RETURN on this exit path as well */
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_cancel_rec);
+
+/*
+ * Read the log header into handle->lgh_hdr through the lop_read_header
+ * operation of the handle.
+ *
+ * When the operation reports LLOG_EEMPTY, the in-memory header is
+ * initialised for a new log instead: the header itself becomes record 0,
+ * the record count is set to 1, the timestamp is set to the current time
+ * and, if \a uuid is given, it is copied into the header as target uuid.
+ * That case is reported as success.
+ *
+ * Returns 0 on success, -EOPNOTSUPP when the operation is missing, or the
+ * error from llog_handle2ops() / lop_read_header().
+ */
+static int llog_read_header(const struct lu_env *env,
+			    struct llog_handle *handle,
+			    struct obd_uuid *uuid)
+{
+	struct llog_operations *lop;
+	int rc;
+
+	/* RETURN() is used below, so open the trace scope with ENTRY */
+	ENTRY;
+
+	rc = llog_handle2ops(handle, &lop);
+	if (rc)
+		RETURN(rc);
+
+	if (lop->lop_read_header == NULL)
+		RETURN(-EOPNOTSUPP);
+
+	rc = lop->lop_read_header(env, handle);
+	if (rc == LLOG_EEMPTY) {
+		struct llog_log_hdr *llh = handle->lgh_hdr;
+
+		handle->lgh_last_idx = 0; /* header is record with index 0 */
+		llh->llh_count = 1;	 /* for the header record */
+		llh->llh_hdr.lrh_type = LLOG_HDR_MAGIC;
+		llh->llh_hdr.lrh_len = llh->llh_tail.lrt_len = LLOG_CHUNK_SIZE;
+		llh->llh_hdr.lrh_index = llh->llh_tail.lrt_index = 0;
+		llh->llh_timestamp = cfs_time_current_sec();
+		if (uuid)
+			memcpy(&llh->llh_tgtuuid, uuid,
+			       sizeof(llh->llh_tgtuuid));
+		llh->llh_bitmap_offset = offsetof(typeof(*llh), llh_bitmap);
+		/* bit 0 marks the header record as in use */
+		ext2_set_bit(0, llh->llh_bitmap);
+		rc = 0;
+	}
+	RETURN(rc);
+}
+
+/*
+ * Allocate the in-memory header for \a handle and fill it by reading the
+ * log header (see llog_read_header()).
+ *
+ * \a flags carries the expected log type (LLOG_F_IS_PLAIN or
+ * LLOG_F_IS_CAT).  After a successful read the type stored in the header
+ * must not contradict it; if the header has a type, that type is used from
+ * then on.  If \a uuid is given it must match the target uuid stored in
+ * the header.
+ *
+ * Returns 0 on success, in which case handle->lgh_hdr points to the
+ * header.  On any failure the header is freed and handle->lgh_hdr is
+ * reset to NULL.  Errors: -ENOMEM, -EINVAL (type mismatch or no type),
+ * -EEXIST (uuid mismatch), or the error from llog_read_header().
+ */
+int llog_init_handle(const struct lu_env *env, struct llog_handle *handle,
+		     int flags, struct obd_uuid *uuid)
+{
+	struct llog_log_hdr	*llh;
+	int			 rc;
+
+	ENTRY;
+	LASSERT(handle->lgh_hdr == NULL);
+
+	OBD_ALLOC_PTR(llh);
+	if (llh == NULL)
+		RETURN(-ENOMEM);
+	handle->lgh_hdr = llh;
+	/* first assign flags to use llog_client_ops */
+	llh->llh_flags = flags;
+	rc = llog_read_header(env, handle, uuid);
+	if (rc == 0) {
+		/* the caller's type must not contradict the header's type */
+		if (unlikely((llh->llh_flags & LLOG_F_IS_PLAIN &&
+			      flags & LLOG_F_IS_CAT) ||
+			     (llh->llh_flags & LLOG_F_IS_CAT &&
+			      flags & LLOG_F_IS_PLAIN))) {
+			CERROR("%s: llog type is %s but initializing %s\n",
+			       handle->lgh_ctxt->loc_obd->obd_name,
+			       llh->llh_flags & LLOG_F_IS_CAT ?
+			       "catalog" : "plain",
+			       flags & LLOG_F_IS_CAT ? "catalog" : "plain");
+			GOTO(out, rc = -EINVAL);
+		} else if (llh->llh_flags &
+			   (LLOG_F_IS_PLAIN | LLOG_F_IS_CAT)) {
+			/*
+			 * it is possible to open llog without specifying llog
+			 * type so it is taken from llh_flags
+			 */
+			flags = llh->llh_flags;
+		} else {
+			/* for some reason the llh_flags has no type set */
+			CERROR("llog type is not specified!\n");
+			GOTO(out, rc = -EINVAL);
+		}
+		if (unlikely(uuid &&
+			     !obd_uuid_equals(uuid, &llh->llh_tgtuuid))) {
+			CERROR("%s: llog uuid mismatch: %s/%s\n",
+			       handle->lgh_ctxt->loc_obd->obd_name,
+			       (char *)uuid->uuid,
+			       (char *)llh->llh_tgtuuid.uuid);
+			GOTO(out, rc = -EEXIST);
+		}
+	}
+	/*
+	 * NOTE(review): this part also runs when llog_read_header() failed;
+	 * in that case an rc from the read is replaced by -EINVAL if flags
+	 * carries neither type bit.
+	 */
+	if (flags & LLOG_F_IS_CAT) {
+		LASSERT(list_empty(&handle->u.chd.chd_head));
+		INIT_LIST_HEAD(&handle->u.chd.chd_head);
+		/* catalog records are llog_logid_rec entries */
+		llh->llh_size = sizeof(struct llog_logid_rec);
+	} else if (!(flags & LLOG_F_IS_PLAIN)) {
+		CERROR("%s: unknown flags: %#x (expected %#x or %#x)\n",
+		       handle->lgh_ctxt->loc_obd->obd_name,
+		       flags, LLOG_F_IS_CAT, LLOG_F_IS_PLAIN);
+		rc = -EINVAL;
+	}
+out:
+	/* on failure leave the handle without a header */
+	if (rc) {
+		OBD_FREE_PTR(llh);
+		handle->lgh_hdr = NULL;
+	}
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_init_handle);
+
+/*
+ * llog processing callback that appends a copy of \a rec to the log whose
+ * handle is passed in \a data.  \a llh (the log being read) is not used.
+ *
+ * The bytes following the record header are passed to llog_write() as a
+ * separate buffer, so the copied header's length is reduced by the size of
+ * the record header and tail.  Returns the result of llog_write().
+ */
+int llog_copy_handler(const struct lu_env *env,
+		      struct llog_handle *llh,
+		      struct llog_rec_hdr *rec,
+		      void *data)
+{
+	struct llog_handle	*dst = (struct llog_handle *)data;
+	char			*payload = (char *)(rec + 1);
+	struct lustre_cfg	*lcfg = (struct lustre_cfg *)payload;
+	struct llog_rec_hdr	 hdr_copy = *rec;
+	int			 rc = 0;
+	ENTRY;
+
+	/* Append all records */
+	hdr_copy.lrh_len -= sizeof(*rec) + sizeof(struct llog_rec_tail);
+	rc = llog_write(env, dst, &hdr_copy, NULL, 0,
+			(void *)payload, -1);
+
+	CDEBUG(D_INFO, "idx=%d, rc=%d, len=%d, cmd %x %s %s\n",
+	       rec->lrh_index, rc, rec->lrh_len, lcfg->lcfg_command,
+	       lustre_cfg_string(lcfg, 0), lustre_cfg_string(lcfg, 1));
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_copy_handler);
+
+/*
+ * Walk the records of the log described by \a arg (a struct
+ * llog_process_info) in increasing index order and invoke lpi_cb on every
+ * record whose bit is set in the header bitmap.
+ *
+ * The walk starts at index 1, or at lpcd_first_idx + 1 when catalog data
+ * is supplied, and ends at lpcd_last_idx when that is non-zero, otherwise
+ * at the last bit of the bitmap.
+ *
+ * Callback results: LLOG_PROC_BREAK stops the walk, LLOG_DEL_RECORD
+ * cancels the record and continues, any other non-zero value stops the
+ * walk with that value.
+ *
+ * The outcome is stored in lpi->lpi_rc; the function itself always
+ * returns 0.  When catalog data is supplied and the buffer allocation
+ * succeeded, lpcd_last_idx is overwritten on exit with the index of the
+ * last record the callback was invoked for.
+ */
+static int llog_process_thread(void *arg)
+{
+	struct llog_process_info	*lpi = arg;
+	struct llog_handle		*loghandle = lpi->lpi_loghandle;
+	struct llog_log_hdr		*llh = loghandle->lgh_hdr;
+	struct llog_process_cat_data	*cd  = lpi->lpi_catdata;
+	char				*buf;
+	__u64				 cur_offset = LLOG_CHUNK_SIZE;
+	__u64				 last_offset;
+	int				 rc = 0, index = 1, last_index;
+	int				 saved_index = 0;
+	int				 last_called_index = 0;
+
+	ENTRY;
+
+	LASSERT(llh);
+
+	OBD_ALLOC(buf, LLOG_CHUNK_SIZE);
+	if (!buf) {
+		lpi->lpi_rc = -ENOMEM;
+		RETURN(0);
+	}
+
+	/* establish the [index, last_index] range to walk */
+	if (cd != NULL) {
+		last_called_index = cd->lpcd_first_idx;
+		index = cd->lpcd_first_idx + 1;
+	}
+	if (cd != NULL && cd->lpcd_last_idx)
+		last_index = cd->lpcd_last_idx;
+	else
+		last_index = LLOG_BITMAP_BYTES * 8 - 1;
+
+	while (rc == 0) {
+		struct llog_rec_hdr *rec;
+
+		/* skip records not set in bitmap */
+		while (index <= last_index &&
+		       !ext2_test_bit(index, llh->llh_bitmap))
+			++index;
+
+		LASSERT(index <= last_index + 1);
+		if (index == last_index + 1)
+			break;
+repeat:
+		CDEBUG(D_OTHER, "index: %d last_index %d\n",
+		       index, last_index);
+
+		/* get the buf with our target record; avoid old garbage */
+		memset(buf, 0, LLOG_CHUNK_SIZE);
+		/* remember where this chunk starts for lgh_cur_offset below */
+		last_offset = cur_offset;
+		rc = llog_next_block(lpi->lpi_env, loghandle, &saved_index,
+				     index, &cur_offset, buf, LLOG_CHUNK_SIZE);
+		if (rc)
+			GOTO(out, rc);
+
+		/* NB: when rec->lrh_len is accessed it is already swabbed
+		 * since it is used at the "end" of the loop and the rec
+		 * swabbing is done at the beginning of the loop. */
+		for (rec = (struct llog_rec_hdr *)buf;
+		     (char *)rec < buf + LLOG_CHUNK_SIZE;
+		     rec = (struct llog_rec_hdr *)((char *)rec + rec->lrh_len)){
+
+			CDEBUG(D_OTHER, "processing rec 0x%p type %#x\n",
+			       rec, rec->lrh_type);
+
+			if (LLOG_REC_HDR_NEEDS_SWABBING(rec))
+				lustre_swab_llog_rec(rec);
+
+			CDEBUG(D_OTHER, "after swabbing, type=%#x idx=%d\n",
+			       rec->lrh_type, rec->lrh_index);
+
+			/* the buffer was zeroed, so index 0 means no record here */
+			if (rec->lrh_index == 0) {
+				/* probably another rec just got added? */
+				if (index <= loghandle->lgh_last_idx)
+					GOTO(repeat, rc = 0);
+				GOTO(out, rc = 0); /* no more records */
+			}
+			/* a bad length would break the loop increment above */
+			if (rec->lrh_len == 0 ||
+			    rec->lrh_len > LLOG_CHUNK_SIZE) {
+				CWARN("invalid length %d in llog record for "
+				      "index %d/%d\n", rec->lrh_len,
+				      rec->lrh_index, index);
+				GOTO(out, rc = -EINVAL);
+			}
+
+			if (rec->lrh_index < index) {
+				CDEBUG(D_OTHER, "skipping lrh_index %d\n",
+				       rec->lrh_index);
+				continue;
+			}
+
+			CDEBUG(D_OTHER,
+			       "lrh_index: %d lrh_len: %d (%d remains)\n",
+			       rec->lrh_index, rec->lrh_len,
+			       (int)(buf + LLOG_CHUNK_SIZE - (char *)rec));
+
+			/* record the current position in the handle */
+			loghandle->lgh_cur_idx = rec->lrh_index;
+			loghandle->lgh_cur_offset = (char *)rec - (char *)buf +
+						    last_offset;
+
+			/* if set, process the callback on this record */
+			if (ext2_test_bit(index, llh->llh_bitmap)) {
+				rc = lpi->lpi_cb(lpi->lpi_env, loghandle, rec,
+						 lpi->lpi_cbdata);
+				last_called_index = index;
+				if (rc == LLOG_PROC_BREAK) {
+					GOTO(out, rc);
+				} else if (rc == LLOG_DEL_RECORD) {
+					/* result of the cancel is not checked */
+					llog_cancel_rec(lpi->lpi_env,
+							loghandle,
+							rec->lrh_index);
+					rc = 0;
+				}
+				if (rc)
+					GOTO(out, rc);
+			} else {
+				CDEBUG(D_OTHER, "Skipped index %d\n", index);
+			}
+
+			/* next record, still in buffer? */
+			++index;
+			if (index > last_index)
+				GOTO(out, rc = 0);
+		}
+	}
+
+out:
+	/* report how far the callback got */
+	if (cd != NULL)
+		cd->lpcd_last_idx = last_called_index;
+
+	OBD_FREE(buf, LLOG_CHUNK_SIZE);
+	lpi->lpi_rc = rc;
+	return 0;
+}
+
+/*
+ * Thread entry point used by llog_process_or_fork() when processing is
+ * done in a separate kernel thread.  \a arg is the struct
+ * llog_process_info prepared by the caller.
+ *
+ * The thread sets up its own lu_env, runs llog_process_thread() and then
+ * signals lpi_completion.  The caller reads the outcome from lpi->lpi_rc,
+ * so a failure to set up the environment is stored there too.
+ */
+static int llog_process_thread_daemonize(void *arg)
+{
+	struct llog_process_info	*lpi = arg;
+	struct lu_env			 env;
+	int				 rc;
+
+	unshare_fs_struct();
+
+	/* client env has no keys, tags is just 0 */
+	rc = lu_env_init(&env, LCT_LOCAL | LCT_MG_THREAD);
+	if (rc) {
+		/*
+		 * llog_process_thread() never ran, so nothing has set
+		 * lpi_rc; without this the waiter would not see the error.
+		 */
+		lpi->lpi_rc = rc;
+		goto out;
+	}
+	lpi->lpi_env = &env;
+
+	rc = llog_process_thread(arg);
+
+	lu_env_fini(&env);
+out:
+	complete(&lpi->lpi_completion);
+	return rc;
+}
+
+/*
+ * Process the records of \a loghandle with callback \a cb, either in the
+ * calling context or, when \a fork is true, in a newly started kernel
+ * thread whose completion is waited for.
+ *
+ * \a data is passed through to the callback; \a catdata, if not NULL, is
+ * the struct llog_process_cat_data limiting the index range (see
+ * llog_process_thread()).
+ *
+ * Returns the processing result taken from lpi_rc, -ENOMEM if the
+ * bookkeeping structure cannot be allocated, or the error from starting
+ * the thread.
+ */
+int llog_process_or_fork(const struct lu_env *env,
+			 struct llog_handle *loghandle,
+			 llog_cb_t cb, void *data, void *catdata, bool fork)
+{
+	struct llog_process_info *lpi;
+	int		      rc;
+
+	ENTRY;
+
+	OBD_ALLOC_PTR(lpi);
+	if (lpi == NULL) {
+		CERROR("cannot alloc pointer\n");
+		RETURN(-ENOMEM);
+	}
+	lpi->lpi_loghandle = loghandle;
+	lpi->lpi_cb	= cb;
+	lpi->lpi_cbdata    = data;
+	lpi->lpi_catdata   = catdata;
+
+	if (fork) {
+		struct task_struct *task;
+
+		/* The new thread can't use parent env,
+		 * init the new one in llog_process_thread_daemonize. */
+		lpi->lpi_env = NULL;
+		init_completion(&lpi->lpi_completion);
+		/*
+		 * Test the returned pointer itself: squeezing PTR_ERR() of
+		 * a valid pointer into an int and checking it afterwards
+		 * can misreport success as an error.
+		 */
+		task = kthread_run(llog_process_thread_daemonize, lpi,
+				   "llog_process_thread");
+		if (IS_ERR(task)) {
+			rc = PTR_ERR(task);
+			CERROR("%s: cannot start thread: rc = %d\n",
+			       loghandle->lgh_ctxt->loc_obd->obd_name, rc);
+			OBD_FREE_PTR(lpi);
+			RETURN(rc);
+		}
+		wait_for_completion(&lpi->lpi_completion);
+	} else {
+		lpi->lpi_env = env;
+		llog_process_thread(lpi);
+	}
+	rc = lpi->lpi_rc;
+	OBD_FREE_PTR(lpi);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_process_or_fork);
+
+/*
+ * Process the records of \a loghandle with callback \a cb in a separate
+ * kernel thread; shorthand for llog_process_or_fork() with fork == true.
+ */
+int llog_process(const struct lu_env *env, struct llog_handle *loghandle,
+		 llog_cb_t cb, void *data, void *catdata)
+{
+	return llog_process_or_fork(env, loghandle, cb, data, catdata, true);
+}
+EXPORT_SYMBOL(llog_process);
+
+/*
+ * Return the record count stored in the header of \a loghandle, or 0 when
+ * there is no handle or the handle has no header.
+ */
+inline int llog_get_size(struct llog_handle *loghandle)
+{
+	if (loghandle == NULL || loghandle->lgh_hdr == NULL)
+		return 0;
+
+	return loghandle->lgh_hdr->llh_count;
+}
+EXPORT_SYMBOL(llog_get_size);
+
+/*
+ * Walk the records of \a loghandle in decreasing index order and invoke
+ * \a cb on every record whose bit is set in the header bitmap.
+ *
+ * The walk starts at lpcd_last_idx when catalog data is supplied and that
+ * field is non-zero, otherwise at the last bit of the bitmap.  It stops
+ * after index 1, or after lpcd_first_idx + 1 when catalog data is
+ * supplied.
+ *
+ * Callback results are handled as in forward processing: LLOG_PROC_BREAK
+ * stops the walk, LLOG_DEL_RECORD cancels the record and continues, any
+ * other non-zero value stops the walk with that value.
+ *
+ * Returns the final rc, or -ENOMEM if the chunk buffer cannot be
+ * allocated.
+ */
+int llog_reverse_process(const struct lu_env *env,
+			 struct llog_handle *loghandle, llog_cb_t cb,
+			 void *data, void *catdata)
+{
+	struct llog_log_hdr *llh = loghandle->lgh_hdr;
+	struct llog_process_cat_data *cd = catdata;
+	void *buf;
+	int rc = 0, first_index = 1, index, idx;
+	ENTRY;
+
+	OBD_ALLOC(buf, LLOG_CHUNK_SIZE);
+	if (!buf)
+		RETURN(-ENOMEM);
+
+	/* establish the [first_index, index] range to walk backwards */
+	if (cd != NULL)
+		first_index = cd->lpcd_first_idx + 1;
+	if (cd != NULL && cd->lpcd_last_idx)
+		index = cd->lpcd_last_idx;
+	else
+		index = LLOG_BITMAP_BYTES * 8 - 1;
+
+	while (rc == 0) {
+		struct llog_rec_hdr *rec;
+		struct llog_rec_tail *tail;
+
+		/* skip records not set in bitmap */
+		while (index >= first_index &&
+		       !ext2_test_bit(index, llh->llh_bitmap))
+			--index;
+
+		LASSERT(index >= first_index - 1);
+		if (index == first_index - 1)
+			break;
+
+		/* get the buf with our target record; avoid old garbage */
+		memset(buf, 0, LLOG_CHUNK_SIZE);
+		rc = llog_prev_block(env, loghandle, index, buf,
+				     LLOG_CHUNK_SIZE);
+		if (rc)
+			GOTO(out, rc);
+
+		/*
+		 * Step forward through the chunk until the record with the
+		 * wanted index is reached.
+		 * NOTE(review): the first record's header is read before any
+		 * swab check, and lrh_len is not validated here - confirm
+		 * that llog_prev_block() guarantees both.
+		 */
+		rec = buf;
+		idx = rec->lrh_index;
+		CDEBUG(D_RPCTRACE, "index %u : idx %u\n", index, idx);
+		while (idx < index) {
+			rec = (void *)rec + rec->lrh_len;
+			if (LLOG_REC_HDR_NEEDS_SWABBING(rec))
+				lustre_swab_llog_rec(rec);
+			idx ++;
+		}
+		LASSERT(idx == index);
+		/* the tail sits at the very end of the record */
+		tail = (void *)rec + rec->lrh_len - sizeof(*tail);
+
+		/* process records in buffer, starting where we found one */
+		while ((void *)tail > buf) {
+			if (tail->lrt_index == 0)
+				GOTO(out, rc = 0); /* no more records */
+
+			/* if set, process the callback on this record */
+			if (ext2_test_bit(index, llh->llh_bitmap)) {
+				/* locate the record start from its tail */
+				rec = (void *)tail - tail->lrt_len +
+				      sizeof(*tail);
+
+				rc = cb(env, loghandle, rec, data);
+				if (rc == LLOG_PROC_BREAK) {
+					GOTO(out, rc);
+				} else if (rc == LLOG_DEL_RECORD) {
+					/* result of the cancel is not checked */
+					llog_cancel_rec(env, loghandle,
+							tail->lrt_index);
+					rc = 0;
+				}
+				if (rc)
+					GOTO(out, rc);
+			}
+
+			/* previous record, still in buffer? */
+			--index;
+			if (index < first_index)
+				GOTO(out, rc = 0);
+			/* move to the tail of the preceding record */
+			tail = (void *)tail - tail->lrt_len;
+		}
+	}
+
+out:
+	if (buf)
+		OBD_FREE(buf, LLOG_CHUNK_SIZE);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_reverse_process);
+
+/**
+ * new llog API
+ *
+ * API functions:
+ *      llog_open - open llog, may not exist
+ *      llog_exist - check if llog exists
+ *      llog_close - close opened llog, pair for open, frees llog_handle
+ *      llog_declare_create - declare llog creation
+ *      llog_create - create new llog on disk, need transaction handle
+ *      llog_declare_write_rec - declaration of llog write
+ *      llog_write_rec - write llog record on disk, need transaction handle
+ *      llog_declare_add - declare llog catalog record addition
+ *      llog_add - add llog record in catalog, need transaction handle
+ */
+/*
+ * Ask the backend through lop_exist whether the log behind \a loghandle
+ * exists.  Returns the backend's answer, -EOPNOTSUPP when the operation
+ * is not provided, or the error from llog_handle2ops().
+ */
+int llog_exist(struct llog_handle *loghandle)
+{
+	struct llog_operations	*ops;
+	int			 ret;
+
+	ENTRY;
+
+	ret = llog_handle2ops(loghandle, &ops);
+	if (ret != 0)
+		RETURN(ret);
+	if (!ops->lop_exist)
+		RETURN(-EOPNOTSUPP);
+
+	ret = ops->lop_exist(loghandle);
+	RETURN(ret);
+}
+EXPORT_SYMBOL(llog_exist);
+
+/*
+ * Declare the creation of the log behind \a loghandle in transaction
+ * \a th by calling lop_declare_create.  CFS_CAP_SYS_RESOURCE is raised
+ * around the call if it was not raised already, and lowered again
+ * afterwards.
+ *
+ * Returns the backend result, -EOPNOTSUPP when the operation is not
+ * provided, or the error from llog_handle2ops().
+ */
+int llog_declare_create(const struct lu_env *env,
+			struct llog_handle *loghandle, struct thandle *th)
+{
+	struct llog_operations	*ops;
+	int			 had_cap, ret;
+
+	ENTRY;
+
+	ret = llog_handle2ops(loghandle, &ops);
+	if (ret != 0)
+		RETURN(ret);
+	if (!ops->lop_declare_create)
+		RETURN(-EOPNOTSUPP);
+
+	had_cap = cfs_cap_raised(CFS_CAP_SYS_RESOURCE);
+	if (had_cap == 0)
+		cfs_cap_raise(CFS_CAP_SYS_RESOURCE);
+	ret = ops->lop_declare_create(env, loghandle, th);
+	/* only lower the capability if it was raised here */
+	if (had_cap == 0)
+		cfs_cap_lower(CFS_CAP_SYS_RESOURCE);
+	RETURN(ret);
+}
+EXPORT_SYMBOL(llog_declare_create);
+
+/*
+ * Create the log behind \a handle within transaction \a th by calling
+ * lop_create.  CFS_CAP_SYS_RESOURCE is raised around the call if it was
+ * not raised already, and lowered again afterwards.
+ *
+ * Returns the backend result, -EOPNOTSUPP when the operation is not
+ * provided, or the error from llog_handle2ops().
+ */
+int llog_create(const struct lu_env *env, struct llog_handle *handle,
+		struct thandle *th)
+{
+	struct llog_operations	*ops;
+	int			 had_cap, ret;
+
+	ENTRY;
+
+	ret = llog_handle2ops(handle, &ops);
+	if (ret != 0)
+		RETURN(ret);
+	if (!ops->lop_create)
+		RETURN(-EOPNOTSUPP);
+
+	had_cap = cfs_cap_raised(CFS_CAP_SYS_RESOURCE);
+	if (had_cap == 0)
+		cfs_cap_raise(CFS_CAP_SYS_RESOURCE);
+	ret = ops->lop_create(env, handle, th);
+	/* only lower the capability if it was raised here */
+	if (had_cap == 0)
+		cfs_cap_lower(CFS_CAP_SYS_RESOURCE);
+	RETURN(ret);
+}
+EXPORT_SYMBOL(llog_create);
+
+/*
+ * Declare the write of record \a rec at index \a idx to the log behind
+ * \a handle in transaction \a th by calling lop_declare_write_rec.
+ * CFS_CAP_SYS_RESOURCE is raised around the call if it was not raised
+ * already, and lowered again afterwards.
+ *
+ * Returns the backend result, -EOPNOTSUPP when the operation is not
+ * provided, or the error from llog_handle2ops().
+ */
+int llog_declare_write_rec(const struct lu_env *env,
+			   struct llog_handle *handle,
+			   struct llog_rec_hdr *rec, int idx,
+			   struct thandle *th)
+{
+	struct llog_operations	*ops;
+	int			 had_cap, ret;
+
+	ENTRY;
+
+	ret = llog_handle2ops(handle, &ops);
+	if (ret != 0)
+		RETURN(ret);
+	LASSERT(ops);
+	if (!ops->lop_declare_write_rec)
+		RETURN(-EOPNOTSUPP);
+
+	had_cap = cfs_cap_raised(CFS_CAP_SYS_RESOURCE);
+	if (had_cap == 0)
+		cfs_cap_raise(CFS_CAP_SYS_RESOURCE);
+	ret = ops->lop_declare_write_rec(env, handle, rec, idx, th);
+	/* only lower the capability if it was raised here */
+	if (had_cap == 0)
+		cfs_cap_lower(CFS_CAP_SYS_RESOURCE);
+	RETURN(ret);
+}
+EXPORT_SYMBOL(llog_declare_write_rec);
+
+/*
+ * Write record \a rec to the log behind \a handle within transaction
+ * \a th by calling lop_write_rec.  \a logcookies, \a numcookies, \a buf
+ * and \a idx are handed to the backend unchanged.
+ *
+ * The total record length must already be rounded as cfs_size_round()
+ * would round it; when \a buf is given, the size of the record header and
+ * tail is added to lrh_len before that check.
+ *
+ * CFS_CAP_SYS_RESOURCE is raised around the backend call if it was not
+ * raised already, and lowered again afterwards.
+ *
+ * Returns the backend result, -EOPNOTSUPP when the operation is not
+ * provided, or the error from llog_handle2ops().
+ */
+int llog_write_rec(const struct lu_env *env, struct llog_handle *handle,
+		   struct llog_rec_hdr *rec, struct llog_cookie *logcookies,
+		   int numcookies, void *buf, int idx, struct thandle *th)
+{
+	struct llog_operations	*ops;
+	int			 had_cap, ret, buflen;
+
+	ENTRY;
+
+	ret = llog_handle2ops(handle, &ops);
+	if (ret != 0)
+		RETURN(ret);
+
+	LASSERT(ops);
+	if (!ops->lop_write_rec)
+		RETURN(-EOPNOTSUPP);
+
+	/* with a payload buffer, account for header and tail as well */
+	buflen = rec->lrh_len;
+	if (buf != NULL)
+		buflen += sizeof(struct llog_rec_hdr) +
+			  sizeof(struct llog_rec_tail);
+	LASSERT(cfs_size_round(buflen) == buflen);
+
+	had_cap = cfs_cap_raised(CFS_CAP_SYS_RESOURCE);
+	if (had_cap == 0)
+		cfs_cap_raise(CFS_CAP_SYS_RESOURCE);
+	ret = ops->lop_write_rec(env, handle, rec, logcookies, numcookies,
+				 buf, idx, th);
+	/* only lower the capability if it was raised here */
+	if (had_cap == 0)
+		cfs_cap_lower(CFS_CAP_SYS_RESOURCE);
+	RETURN(ret);
+}
+EXPORT_SYMBOL(llog_write_rec);
+
+/*
+ * Add record \a rec to the log behind \a lgh within transaction \a th by
+ * calling the handle's lop_add operation.  CFS_CAP_SYS_RESOURCE is raised
+ * around the call if it was not raised already, and lowered again
+ * afterwards.
+ *
+ * Returns the backend result, or -EOPNOTSUPP when the operation is not
+ * provided.
+ */
+int llog_add(const struct lu_env *env, struct llog_handle *lgh,
+	     struct llog_rec_hdr *rec, struct llog_cookie *logcookies,
+	     void *buf, struct thandle *th)
+{
+	int had_cap, ret;
+
+	ENTRY;
+
+	if (!lgh->lgh_logops->lop_add)
+		RETURN(-EOPNOTSUPP);
+
+	had_cap = cfs_cap_raised(CFS_CAP_SYS_RESOURCE);
+	if (had_cap == 0)
+		cfs_cap_raise(CFS_CAP_SYS_RESOURCE);
+	ret = lgh->lgh_logops->lop_add(env, lgh, rec, logcookies, buf, th);
+	/* only lower the capability if it was raised here */
+	if (had_cap == 0)
+		cfs_cap_lower(CFS_CAP_SYS_RESOURCE);
+	RETURN(ret);
+}
+EXPORT_SYMBOL(llog_add);
+
+/*
+ * Declare the addition of record \a rec to the log behind \a lgh in
+ * transaction \a th by calling the handle's lop_declare_add operation.
+ * CFS_CAP_SYS_RESOURCE is raised around the call if it was not raised
+ * already, and lowered again afterwards.
+ *
+ * Returns the backend result, or -EOPNOTSUPP when the operation is not
+ * provided.
+ */
+int llog_declare_add(const struct lu_env *env, struct llog_handle *lgh,
+		     struct llog_rec_hdr *rec, struct thandle *th)
+{
+	int had_cap, ret;
+
+	ENTRY;
+
+	if (!lgh->lgh_logops->lop_declare_add)
+		RETURN(-EOPNOTSUPP);
+
+	had_cap = cfs_cap_raised(CFS_CAP_SYS_RESOURCE);
+	if (had_cap == 0)
+		cfs_cap_raise(CFS_CAP_SYS_RESOURCE);
+	ret = lgh->lgh_logops->lop_declare_add(env, lgh, rec, th);
+	/* only lower the capability if it was raised here */
+	if (had_cap == 0)
+		cfs_cap_lower(CFS_CAP_SYS_RESOURCE);
+	RETURN(ret);
+}
+EXPORT_SYMBOL(llog_declare_add);
+
+/**
+ * Helper function to open llog or create it if doesn't exist.
+ * It hides all transaction handling from caller.
+ *
+ * The log is opened with LLOG_OPEN_NEW.  If llog_exist() returns non-zero
+ * nothing more is done.  Otherwise the log is created: inside a local
+ * transaction when the handle has a dt object, or directly (lvfs compat)
+ * when it has not.
+ *
+ * \param[out] res	opened handle on success; set to NULL when the
+ *			create step fails, because the handle has been
+ *			closed by then
+ *
+ * \retval 0		success, \a res is open
+ * \retval negative	error from open, transaction handling or create
+ */
+int llog_open_create(const struct lu_env *env, struct llog_ctxt *ctxt,
+		     struct llog_handle **res, struct llog_logid *logid,
+		     char *name)
+{
+	struct thandle	*th;
+	int		 rc;
+
+	ENTRY;
+
+	rc = llog_open(env, ctxt, res, logid, name, LLOG_OPEN_NEW);
+	if (rc)
+		RETURN(rc);
+
+	/*
+	 * NOTE(review): a negative return from llog_exist() is treated as
+	 * "exists" here as well - confirm that this is intended.
+	 */
+	if (llog_exist(*res))
+		RETURN(0);
+
+	if ((*res)->lgh_obj != NULL) {
+		struct dt_device *d;
+
+		d = lu2dt_dev((*res)->lgh_obj->do_lu.lo_dev);
+
+		th = dt_trans_create(env, d);
+		if (IS_ERR(th))
+			GOTO(out, rc = PTR_ERR(th));
+
+		rc = llog_declare_create(env, *res, th);
+		if (rc == 0) {
+			rc = dt_trans_start_local(env, d, th);
+			if (rc == 0)
+				rc = llog_create(env, *res, th);
+		}
+		/* the transaction is stopped whether or not create worked */
+		dt_trans_stop(env, d, th);
+	} else {
+		/* lvfs compat code */
+		LASSERT((*res)->lgh_file == NULL);
+		rc = llog_create(env, *res, NULL);
+	}
+out:
+	if (rc) {
+		llog_close(env, *res);
+		/* the handle reference is gone, do not hand it back */
+		*res = NULL;
+	}
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_open_create);
+
+/**
+ * Helper function to delete existent llog.
+ *
+ * The log identified by \a logid and/or \a name is opened with
+ * LLOG_OPEN_EXISTS, initialised as a plain log, destroyed and closed.
+ * When neither identifier is given there is nothing to do and 0 is
+ * returned.  The first error encountered is returned; an error from the
+ * final close is reported only if everything before it succeeded.
+ */
+int llog_erase(const struct lu_env *env, struct llog_ctxt *ctxt,
+	       struct llog_logid *logid, char *name)
+{
+	struct llog_handle	*handle;
+	int			 rc = 0, close_rc;
+
+	ENTRY;
+
+	/* nothing to erase */
+	if (logid == NULL && name == NULL)
+		RETURN(0);
+
+	rc = llog_open(env, ctxt, &handle, logid, name, LLOG_OPEN_EXISTS);
+	if (rc < 0)
+		RETURN(rc);
+
+	rc = llog_init_handle(env, handle, LLOG_F_IS_PLAIN, NULL);
+	if (!rc)
+		rc = llog_destroy(env, handle);
+
+	/* always close, but keep an earlier error in preference */
+	close_rc = llog_close(env, handle);
+	if (!rc)
+		rc = close_rc;
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_erase);
+
+/*
+ * Helper function for write record in llog.
+ * It hides all transaction handling from caller.
+ * Valid only with local llog.
+ *
+ * When the handle has a dt object, a transaction is created on its
+ * device, the write is declared, the transaction is started locally, the
+ * record is written under the write side of lgh_lock, and the
+ * transaction is stopped.  Without a dt object (lvfs compatibility) the
+ * record is written under the same lock with no transaction handle.
+ *
+ * Returns the result of the write, or the first error from the
+ * transaction set-up.
+ */
+int llog_write(const struct lu_env *env, struct llog_handle *loghandle,
+	       struct llog_rec_hdr *rec, struct llog_cookie *reccookie,
+	       int cookiecount, void *buf, int idx)
+{
+	int rc;
+
+	ENTRY;
+
+	LASSERT(loghandle);
+	LASSERT(loghandle->lgh_ctxt);
+
+	if (loghandle->lgh_obj != NULL) {
+		struct dt_device	*dt;
+		struct thandle		*th;
+
+		dt = lu2dt_dev(loghandle->lgh_obj->do_lu.lo_dev);
+
+		th = dt_trans_create(env, dt);
+		if (IS_ERR(th))
+			RETURN(PTR_ERR(th));
+
+		rc = llog_declare_write_rec(env, loghandle, rec, idx, th);
+		if (rc)
+			GOTO(out_trans, rc);
+
+		rc = dt_trans_start_local(env, dt, th);
+		if (rc)
+			GOTO(out_trans, rc);
+
+		/* the write itself is serialised by lgh_lock */
+		down_write(&loghandle->lgh_lock);
+		rc = llog_write_rec(env, loghandle, rec, reccookie,
+				    cookiecount, buf, idx, th);
+		up_write(&loghandle->lgh_lock);
+out_trans:
+		/* reached on success and on declare/start failure alike */
+		dt_trans_stop(env, dt, th);
+	} else { /* lvfs compatibility */
+		down_write(&loghandle->lgh_lock);
+		rc = llog_write_rec(env, loghandle, rec, reccookie,
+				    cookiecount, buf, idx, NULL);
+		up_write(&loghandle->lgh_lock);
+	}
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_write);
+
+/*
+ * Open the llog identified by \a logid and/or \a name in context \a ctxt.
+ *
+ * A new handle is allocated, bound to the context and its operations, and
+ * passed to the context's lop_open operation together with \a open_param.
+ * CFS_CAP_SYS_RESOURCE is raised around that call if it was not raised
+ * already, and lowered again afterwards.
+ *
+ * \param[out] lgh	the opened handle on success, NULL on any failure
+ *
+ * Returns 0 on success, -EOPNOTSUPP when the context has no lop_open,
+ * -ENOMEM when the handle cannot be allocated, or the error from
+ * lop_open.
+ */
+int llog_open(const struct lu_env *env, struct llog_ctxt *ctxt,
+	      struct llog_handle **lgh, struct llog_logid *logid,
+	      char *name, enum llog_open_param open_param)
+{
+	struct llog_handle	*handle;
+	int	 raised;
+	int	 rc;
+
+	ENTRY;
+
+	LASSERT(ctxt);
+	LASSERT(ctxt->loc_logops);
+
+	/* the caller sees NULL on every failure path below */
+	*lgh = NULL;
+
+	if (ctxt->loc_logops->lop_open == NULL)
+		RETURN(-EOPNOTSUPP);
+
+	/*
+	 * llog_alloc_handle() reports failure with ERR_PTR(), not NULL, so
+	 * a NULL test alone lets an error pointer through.  NULL is still
+	 * checked in case the allocator's convention changes.
+	 */
+	handle = llog_alloc_handle();
+	if (handle == NULL)
+		RETURN(-ENOMEM);
+	if (IS_ERR(handle))
+		RETURN(PTR_ERR(handle));
+
+	handle->lgh_ctxt = ctxt;
+	handle->lgh_logops = ctxt->loc_logops;
+	*lgh = handle;
+
+	raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE);
+	if (!raised)
+		cfs_cap_raise(CFS_CAP_SYS_RESOURCE);
+	rc = ctxt->loc_logops->lop_open(env, *lgh, logid, name, open_param);
+	/* only lower the capability if it was raised here */
+	if (!raised)
+		cfs_cap_lower(CFS_CAP_SYS_RESOURCE);
+	if (rc) {
+		llog_free_handle(*lgh);
+		*lgh = NULL;
+	}
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_open);
+
+/*
+ * Close \a loghandle: call the backend's lop_close and drop the reference
+ * held on the handle.  The reference is dropped even when the backend
+ * operation is missing or fails.
+ *
+ * Returns the backend result, -EOPNOTSUPP when the operation is not
+ * provided, or the error from llog_handle2ops().
+ */
+int llog_close(const struct lu_env *env, struct llog_handle *loghandle)
+{
+	struct llog_operations	*ops;
+	int			 ret;
+
+	ENTRY;
+
+	ret = llog_handle2ops(loghandle, &ops);
+	if (ret != 0)
+		GOTO(out, ret);
+	if (!ops->lop_close)
+		GOTO(out, ret = -EOPNOTSUPP);
+	ret = ops->lop_close(env, loghandle);
+out:
+	/* pairs with the reference taken when the handle was allocated */
+	llog_handle_put(loghandle);
+	RETURN(ret);
+}
+EXPORT_SYMBOL(llog_close);

diff --git a/drivers/staging/lustre/lustre/obdclass/llog_cat.c b/drivers/staging/lustre/lustre/obdclass/llog_cat.c
new file mode 100644
index 0000000..cf00b2f
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/llog_cat.c

@@ -0,0 +1,833 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/llog_cat.c
+ *
+ * OST<->MDS recovery logging infrastructure.
+ *
+ * Invariants in implementation:
+ * - we do not share logs among different OST<->MDS connections, so that
+ *   if an OST or MDS fails it need only look at log(s) relevant to itself
+ *
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ * Author: Alexey Zhuravlev <alexey.zhuravlev@intel.com>
+ * Author: Mikhail Pershin <mike.pershin@intel.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOG
+
+
+#include <obd_class.h>
+
+#include "llog_internal.h"
+
+/* Create a new plain log and register it in the catalog.
+ *
+ * Creates \a loghandle with llog_create(), initializes it as a plain log
+ * flagged LLOG_F_ZAP_WHEN_EMPTY, marks the catalog slot that follows
+ * cathandle->lgh_last_idx as used and writes a llog_logid_rec describing the
+ * new log into the catalog at that slot.
+ *
+ * Assumes caller has already pushed us into the kernel context and is locking.
+ *
+ * Returns 0 on success, and also when llog_create() reports -EEXIST (the log
+ * was already created, nothing else is done).  Returns -ENOSPC when the next
+ * catalog slot equals llh_cat_idx or when the OBD_FAIL_MDS_LLOG_CREATE_FAILED
+ * check fires; otherwise the error from llog_create(), llog_init_handle() or
+ * llog_write_rec().
+ */
+static int llog_cat_new_log(const struct lu_env *env,
+			    struct llog_handle *cathandle,
+			    struct llog_handle *loghandle,
+			    struct thandle *th)
+{
+
+	struct llog_log_hdr *llh;
+	struct llog_logid_rec rec = { { 0 }, };
+	int rc, index, bitmap_size;
+	ENTRY;
+
+	/* llh is the CATALOG header for the rest of this function */
+	llh = cathandle->lgh_hdr;
+	bitmap_size = LLOG_BITMAP_SIZE(llh);
+
+	/* candidate slot: the one after the last used slot, wrapping around */
+	index = (cathandle->lgh_last_idx + 1) % bitmap_size;
+
+	/* maximum number of available slots in catlog is bitmap_size - 2 */
+	if (llh->llh_cat_idx == index) {
+		CERROR("no free catalog slots for log...\n");
+		RETURN(-ENOSPC);
+	}
+
+	/* presumably a fault-injection hook for testing -- confirm */
+	if (OBD_FAIL_CHECK(OBD_FAIL_MDS_LLOG_CREATE_FAILED))
+		RETURN(-ENOSPC);
+
+	rc = llog_create(env, loghandle, th);
+	/* if llog is already created, no need to initialize it */
+	if (rc == -EEXIST) {
+		RETURN(0);
+	} else if (rc != 0) {
+		CERROR("%s: can't create new plain llog in catalog: rc = %d\n",
+		       loghandle->lgh_ctxt->loc_obd->obd_name, rc);
+		RETURN(rc);
+	}
+
+	rc = llog_init_handle(env, loghandle,
+			      LLOG_F_IS_PLAIN | LLOG_F_ZAP_WHEN_EMPTY,
+			      &cathandle->lgh_hdr->llh_tgtuuid);
+	if (rc)
+		GOTO(out_destroy, rc);
+
+	/* slot 0 is never handed out (presumably reserved for the log header
+	 * record -- confirm); a wrap lands on slot 1 instead */
+	if (index == 0)
+		index = 1;
+
+	/* NOTE(review): the header being modified belongs to the catalog, but
+	 * the spinlock taken is loghandle->lgh_hdr_lock -- confirm that this
+	 * is the intended lock. */
+	spin_lock(&loghandle->lgh_hdr_lock);
+	llh->llh_count++;
+	if (ext2_set_bit(index, llh->llh_bitmap)) {
+		CERROR("argh, index %u already set in log bitmap?\n",
+		       index);
+		spin_unlock(&loghandle->lgh_hdr_lock);
+		LBUG(); /* should never happen */
+	}
+	spin_unlock(&loghandle->lgh_hdr_lock);
+
+	cathandle->lgh_last_idx = index;
+	llh->llh_tail.lrt_index = index;
+
+	CDEBUG(D_RPCTRACE,"new recovery log "DOSTID":%x for index %u of catalog"
+	       DOSTID"\n", POSTID(&loghandle->lgh_id.lgl_oi),
+	       loghandle->lgh_id.lgl_ogen, index,
+	       POSTID(&cathandle->lgh_id.lgl_oi));
+	/* build the record for this log in the catalog */
+	rec.lid_hdr.lrh_len = sizeof(rec);
+	rec.lid_hdr.lrh_index = index;
+	rec.lid_hdr.lrh_type = LLOG_LOGID_MAGIC;
+	rec.lid_id = loghandle->lgh_id;
+	rec.lid_tail.lrt_len = sizeof(rec);
+	rec.lid_tail.lrt_index = index;
+
+	/* update the catalog: header and record */
+	rc = llog_write_rec(env, cathandle, &rec.lid_hdr,
+			    &loghandle->u.phd.phd_cookie, 1, NULL, index, th);
+	if (rc < 0)
+		GOTO(out_destroy, rc);
+
+	/* remember in the plain log header which catalog slot refers to it */
+	loghandle->lgh_hdr->llh_cat_idx = index;
+	RETURN(0);
+out_destroy:
+	/* NOTE(review): when reached after the bitmap update above, the
+	 * llh_count/bitmap/lgh_last_idx changes are not rolled back here. */
+	llog_destroy(env, loghandle);
+	RETURN(rc);
+}
+
+/* Find or open the plain log identified by \a logid inside a catalog.
+ *
+ * The catalog's list of open plain handles is searched first under the
+ * catalog lgh_lock.  If no handle matches, the log is opened with
+ * LLOG_OPEN_EXISTS, initialized as a plain log and added to that list.
+ *
+ * Assumes caller has already pushed us into the kernel context.
+ *
+ * This takes extra reference on llog_handle via llog_handle_get() and require
+ * this reference to be put by caller using llog_handle_put().  Every
+ * down_write() in this function is paired with an up_write() before return,
+ * so no lgh_lock is held when it returns.
+ *
+ * Returns 0 with *res set on success; -EBADF if \a cathandle is NULL; or the
+ * negative error from llog_open()/llog_init_handle(), in which case *res is
+ * left untouched.
+ */
+int llog_cat_id2handle(const struct lu_env *env, struct llog_handle *cathandle,
+		       struct llog_handle **res, struct llog_logid *logid)
+{
+	struct llog_handle	*loghandle;
+	int			 rc = 0;
+
+	ENTRY;
+
+	if (cathandle == NULL)
+		RETURN(-EBADF);
+
+	/* first look for an already-open handle with the same id and seq */
+	down_write(&cathandle->lgh_lock);
+	list_for_each_entry(loghandle, &cathandle->u.chd.chd_head,
+				u.phd.phd_entry) {
+		struct llog_logid *cgl = &loghandle->lgh_id;
+
+		if (ostid_id(&cgl->lgl_oi) == ostid_id(&logid->lgl_oi) &&
+		    ostid_seq(&cgl->lgl_oi) == ostid_seq(&logid->lgl_oi)) {
+			/* same object but different generation: report it and
+			 * keep scanning the rest of the list */
+			if (cgl->lgl_ogen != logid->lgl_ogen) {
+				CERROR("%s: log "DOSTID" generation %x != %x\n",
+				       loghandle->lgh_ctxt->loc_obd->obd_name,
+				       POSTID(&logid->lgl_oi), cgl->lgl_ogen,
+				       logid->lgl_ogen);
+				continue;
+			}
+			loghandle->u.phd.phd_cat_handle = cathandle;
+			up_write(&cathandle->lgh_lock);
+			GOTO(out, rc = 0);
+		}
+	}
+	up_write(&cathandle->lgh_lock);
+
+	/* not cached: open the existing log.  The catalog lock is not held
+	 * across the open.
+	 * NOTE(review): two callers racing here could presumably both open
+	 * and list the same log -- confirm callers serialize. */
+	rc = llog_open(env, cathandle->lgh_ctxt, &loghandle, logid, NULL,
+		       LLOG_OPEN_EXISTS);
+	if (rc < 0) {
+		CERROR("%s: error opening log id "DOSTID":%x: rc = %d\n",
+		       cathandle->lgh_ctxt->loc_obd->obd_name,
+		       POSTID(&logid->lgl_oi), logid->lgl_ogen, rc);
+		RETURN(rc);
+	}
+
+	rc = llog_init_handle(env, loghandle, LLOG_F_IS_PLAIN, NULL);
+	if (rc < 0) {
+		llog_close(env, loghandle);
+		loghandle = NULL;
+		RETURN(rc);
+	}
+
+	down_write(&cathandle->lgh_lock);
+	list_add(&loghandle->u.phd.phd_entry, &cathandle->u.chd.chd_head);
+	up_write(&cathandle->lgh_lock);
+
+	/* link the plain handle back to its catalog and rebuild its cookie
+	 * from the catalog id and the slot stored in the plain log header */
+	loghandle->u.phd.phd_cat_handle = cathandle;
+	loghandle->u.phd.phd_cookie.lgc_lgl = cathandle->lgh_id;
+	loghandle->u.phd.phd_cookie.lgc_index =
+				loghandle->lgh_hdr->llh_cat_idx;
+	EXIT;
+out:
+	/* both the cached and the freshly opened path take the extra
+	 * reference that the caller must put */
+	llog_handle_get(loghandle);
+	*res = loghandle;
+	return 0;
+}
+
+/**
+ * Close a catalog handle together with every plain log handle linked on it.
+ *
+ * Each plain handle on cathandle->u.chd.chd_head is unlinked and closed.  A
+ * plain log that has a backing object, is flagged LLOG_F_ZAP_WHEN_EMPTY and
+ * whose header count equals 1 is destroyed first, and its catalog slot is
+ * cleaned up through llog_cat_cleanup().
+ *
+ * \retval result of llog_close() on the catalog handle itself
+ */
+int llog_cat_close(const struct lu_env *env, struct llog_handle *cathandle)
+{
+	struct llog_handle	*plain, *next;
+	int			 rc;
+
+	ENTRY;
+
+	list_for_each_entry_safe(plain, next, &cathandle->u.chd.chd_head,
+				 u.phd.phd_entry) {
+		struct llog_log_hdr	*hdr;
+
+		/* unlink open-not-created llogs */
+		list_del_init(&plain->u.phd.phd_entry);
+
+		hdr = plain->lgh_hdr;
+		if (plain->lgh_obj != NULL && hdr != NULL &&
+		    (hdr->llh_flags & LLOG_F_ZAP_WHEN_EMPTY) &&
+		    hdr->llh_count == 1) {
+			rc = llog_destroy(env, plain);
+			if (rc)
+				CERROR("%s: failure destroying log during "
+				       "cleanup: rc = %d\n",
+				       plain->lgh_ctxt->loc_obd->obd_name,
+				       rc);
+
+			/* drop the catalog entry even if destroy failed */
+			llog_cat_cleanup(env, cathandle, NULL,
+					 plain->u.phd.phd_cookie.lgc_index);
+		}
+		llog_close(env, plain);
+	}
+
+	/* if handle was stored in ctxt, remove it too */
+	if (cathandle->lgh_ctxt->loc_handle == cathandle)
+		cathandle->lgh_ctxt->loc_handle = NULL;
+
+	rc = llog_close(env, cathandle);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_cat_close);
+
+/**
+ * lockdep markers for nested struct llog_handle::lgh_lock locking.
+ *
+ * Used as the subclass argument of down_read_nested()/down_write_nested()
+ * when a catalog lgh_lock and a plain log lgh_lock are held together.
+ */
+enum {
+	LLOGH_CAT,	/* subclass for a catalog handle's lgh_lock */
+	LLOGH_LOG	/* subclass for a plain log handle's lgh_lock */
+};
+
+/** Return the currently active plain log handle of a catalog.
+ *
+ * If the current log still has free slots (lgh_last_idx is below
+ * LLOG_BITMAP_SIZE - 1) or has no header yet, it is returned.  Otherwise
+ * the catalog's pre-declared chd_next_log becomes the current log and is
+ * returned.
+ *
+ * The check is done first under the catalog read lock and then repeated
+ * under the catalog write lock, because the state may change between
+ * dropping one and taking the other.
+ *
+ * Assumes caller has already pushed us into the kernel context and is locking.
+ *
+ * \param th	unused here; kept for interface symmetry with the callers
+ *
+ * NOTE: loghandle is write-locked upon successful return; the catalog lock
+ * is always released before returning.
+ */
+static struct llog_handle *llog_cat_current_log(struct llog_handle *cathandle,
+						struct thandle *th)
+{
+	struct llog_handle *loghandle = NULL;
+	ENTRY;
+
+	/* fast path: only a read lock on the catalog */
+	down_read_nested(&cathandle->lgh_lock, LLOGH_CAT);
+	loghandle = cathandle->u.chd.chd_current_log;
+	if (loghandle) {
+		struct llog_log_hdr *llh;
+
+		down_write_nested(&loghandle->lgh_lock, LLOGH_LOG);
+		llh = loghandle->lgh_hdr;
+		if (llh == NULL ||
+		    loghandle->lgh_last_idx < LLOG_BITMAP_SIZE(llh) - 1) {
+			up_read(&cathandle->lgh_lock);
+			RETURN(loghandle);
+		} else {
+			up_write(&loghandle->lgh_lock);
+		}
+	}
+	up_read(&cathandle->lgh_lock);
+
+	/* time to use next log */
+
+	/* first, we have to make sure the state hasn't changed */
+	down_write_nested(&cathandle->lgh_lock, LLOGH_CAT);
+	loghandle = cathandle->u.chd.chd_current_log;
+	if (loghandle) {
+		struct llog_log_hdr *llh;
+
+		down_write_nested(&loghandle->lgh_lock, LLOGH_LOG);
+		llh = loghandle->lgh_hdr;
+		LASSERT(llh);
+		if (loghandle->lgh_last_idx < LLOG_BITMAP_SIZE(llh) - 1) {
+			up_write(&cathandle->lgh_lock);
+			RETURN(loghandle);
+		} else {
+			up_write(&loghandle->lgh_lock);
+		}
+	}
+
+	CDEBUG(D_INODE, "use next log\n");
+
+	loghandle = cathandle->u.chd.chd_next_log;
+	/* Assert before the handle is dereferenced: taking lgh_lock on a NULL
+	 * handle would fault before the assertion could ever fire. */
+	LASSERT(loghandle);
+	cathandle->u.chd.chd_current_log = loghandle;
+	cathandle->u.chd.chd_next_log = NULL;
+	down_write_nested(&loghandle->lgh_lock, LLOGH_LOG);
+	up_write(&cathandle->lgh_lock);
+	RETURN(loghandle);
+}
+
+/* Add a single record to the recovery log(s) using a catalog.
+ *
+ * The record goes into the plain log returned by llog_cat_current_log(),
+ * which comes back write-locked.  If llog_exist() reports that this log has
+ * not been created yet, llog_cat_new_log() creates it and registers it in
+ * the catalog first.  If the write fails with -ENOSPC, the whole sequence is
+ * retried once with whatever llog_cat_current_log() returns next.
+ *
+ * Returns the result of the last llog_write_rec() call, or the negative
+ * error from llog_cat_new_log().
+ *
+ * Assumes caller has already pushed us into the kernel context.
+ */
+int llog_cat_add_rec(const struct lu_env *env, struct llog_handle *cathandle,
+		     struct llog_rec_hdr *rec, struct llog_cookie *reccookie,
+		     void *buf, struct thandle *th)
+{
+	struct llog_handle *loghandle;
+	int rc;
+	ENTRY;
+
+	LASSERT(rec->lrh_len <= LLOG_CHUNK_SIZE);
+	loghandle = llog_cat_current_log(cathandle, th);
+	LASSERT(!IS_ERR(loghandle));
+
+	/* loghandle is already locked by llog_cat_current_log() for us */
+	if (!llog_exist(loghandle)) {
+		rc = llog_cat_new_log(env, cathandle, loghandle, th);
+		if (rc < 0) {
+			up_write(&loghandle->lgh_lock);
+			RETURN(rc);
+		}
+	}
+	/* now let's try to add the record */
+	rc = llog_write_rec(env, loghandle, rec, reccookie, 1, buf, -1, th);
+	/* -ENOSPC triggers the retry below, so it is logged at D_HA rather
+	 * than D_ERROR */
+	if (rc < 0)
+		CDEBUG_LIMIT(rc == -ENOSPC ? D_HA : D_ERROR,
+			     "llog_write_rec %d: lh=%p\n", rc, loghandle);
+	up_write(&loghandle->lgh_lock);
+	if (rc == -ENOSPC) {
+		/* try to use next log */
+		loghandle = llog_cat_current_log(cathandle, th);
+		LASSERT(!IS_ERR(loghandle));
+		/* new llog can be created concurrently */
+		if (!llog_exist(loghandle)) {
+			rc = llog_cat_new_log(env, cathandle, loghandle, th);
+			if (rc < 0) {
+				up_write(&loghandle->lgh_lock);
+				RETURN(rc);
+			}
+		}
+		/* now let's try to add the record */
+		rc = llog_write_rec(env, loghandle, rec, reccookie, 1, buf,
+				    -1, th);
+		/* second failure is final: always reported as an error */
+		if (rc < 0)
+			CERROR("llog_write_rec %d: lh=%p\n", rc, loghandle);
+		up_write(&loghandle->lgh_lock);
+	}
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_cat_add_rec);
+
+/* Declare, in transaction \a th, the updates needed to add \a rec through
+ * the catalog.
+ *
+ * If the catalog has no current plain log, one is opened with LLOG_OPEN_NEW
+ * and linked on the catalog's list; otherwise, if it has no "next" plain log,
+ * one is opened the same way.  Creation is then declared for any of the two
+ * logs that does not exist yet (together with a catalog record write), and
+ * the write of \a rec is declared for the current log and the next log.
+ *
+ * Returns 0 on success, or the error from llog_open(),
+ * llog_declare_create() or the current log's llog_declare_write_rec().
+ */
+int llog_cat_declare_add_rec(const struct lu_env *env,
+			     struct llog_handle *cathandle,
+			     struct llog_rec_hdr *rec, struct thandle *th)
+{
+	struct llog_handle	*loghandle, *next;
+	int			 rc = 0;
+
+	ENTRY;
+
+	/* each pointer is tested without the lock first and re-tested after
+	 * taking the catalog write lock */
+	if (cathandle->u.chd.chd_current_log == NULL) {
+		/* declare new plain llog */
+		down_write(&cathandle->lgh_lock);
+		if (cathandle->u.chd.chd_current_log == NULL) {
+			rc = llog_open(env, cathandle->lgh_ctxt, &loghandle,
+				       NULL, NULL, LLOG_OPEN_NEW);
+			if (rc == 0) {
+				cathandle->u.chd.chd_current_log = loghandle;
+				list_add_tail(&loghandle->u.phd.phd_entry,
+						  &cathandle->u.chd.chd_head);
+			}
+		}
+		up_write(&cathandle->lgh_lock);
+	} else if (cathandle->u.chd.chd_next_log == NULL) {
+		/* declare next plain llog */
+		down_write(&cathandle->lgh_lock);
+		if (cathandle->u.chd.chd_next_log == NULL) {
+			rc = llog_open(env, cathandle->lgh_ctxt, &loghandle,
+				       NULL, NULL, LLOG_OPEN_NEW);
+			if (rc == 0) {
+				cathandle->u.chd.chd_next_log = loghandle;
+				list_add_tail(&loghandle->u.phd.phd_entry,
+						  &cathandle->u.chd.chd_head);
+			}
+		}
+		up_write(&cathandle->lgh_lock);
+	}
+	if (rc)
+		GOTO(out, rc);
+
+	if (!llog_exist(cathandle->u.chd.chd_current_log)) {
+		rc = llog_declare_create(env, cathandle->u.chd.chd_current_log,
+					 th);
+		if (rc)
+			GOTO(out, rc);
+		/* the new log also needs a record in the catalog itself.
+		 * NOTE(review): the return value of this declare is ignored. */
+		llog_declare_write_rec(env, cathandle, NULL, -1, th);
+	}
+	/* declare records in the llogs */
+	rc = llog_declare_write_rec(env, cathandle->u.chd.chd_current_log,
+				    rec, -1, th);
+	if (rc)
+		GOTO(out, rc);
+
+	/* the record may end up in the next log if the current one fills up,
+	 * so declare the same updates against it as well */
+	next = cathandle->u.chd.chd_next_log;
+	if (next) {
+		if (!llog_exist(next)) {
+			rc = llog_declare_create(env, next, th);
+			llog_declare_write_rec(env, cathandle, NULL, -1, th);
+		}
+		/* NOTE(review): results of the llog_declare_write_rec() calls
+		 * in this branch are dropped; only the llog_declare_create()
+		 * result reaches the caller. */
+		llog_declare_write_rec(env, next, rec, -1, th);
+	}
+out:
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_cat_declare_add_rec);
+
+/**
+ * Add one record through a catalog, wrapping declare + write in a local
+ * transaction when the catalog is backed by an object.
+ *
+ * With cathandle->lgh_obj set, a transaction is created on the dt device
+ * found in the export's obd_lvfs_ctxt, the update is declared, the
+ * transaction is started, the record is added and the transaction is
+ * stopped.  Without lgh_obj (lvfs compatibility) the declare and add steps
+ * run with a NULL transaction handle.
+ *
+ * \retval 0 or positive	result of llog_cat_add_rec()
+ * \retval negative		error from any of the steps above
+ */
+int llog_cat_add(const struct lu_env *env, struct llog_handle *cathandle,
+		 struct llog_rec_hdr *rec, struct llog_cookie *reccookie,
+		 void *buf)
+{
+	struct llog_ctxt	*ctxt = cathandle->lgh_ctxt;
+	struct dt_device	*dt;
+	struct thandle		*th;
+	int			 rc;
+
+	LASSERT(ctxt);
+	LASSERT(ctxt->loc_exp);
+
+	if (cathandle->lgh_obj == NULL) {
+		/* lvfs compat code: no transaction handle is involved */
+		LASSERT(cathandle->lgh_file != NULL);
+		rc = llog_cat_declare_add_rec(env, cathandle, rec, NULL);
+		if (rc == 0)
+			rc = llog_cat_add_rec(env, cathandle, rec, reccookie,
+					      buf, NULL);
+		RETURN(rc);
+	}
+
+	dt = ctxt->loc_exp->exp_obd->obd_lvfs_ctxt.dt;
+	LASSERT(dt);
+
+	th = dt_trans_create(env, dt);
+	if (IS_ERR(th))
+		RETURN(PTR_ERR(th));
+
+	rc = llog_cat_declare_add_rec(env, cathandle, rec, th);
+	if (rc)
+		GOTO(out_trans, rc);
+
+	rc = dt_trans_start_local(env, dt, th);
+	if (rc)
+		GOTO(out_trans, rc);
+
+	rc = llog_cat_add_rec(env, cathandle, rec, reccookie, buf, th);
+out_trans:
+	/* the transaction is stopped on success and on every failure after
+	 * it was created */
+	dt_trans_stop(env, dt, th);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_cat_add);
+
+/* For each cookie in the cookie array, we clear the log in-use bit and either:
+ * - the log is empty, so mark it free in the catalog header and delete it
+ * - the log is not empty, just write out the log header
+ *
+ * The cookies may be in different log files, so we need to get new logs
+ * each time.
+ *
+ * Processing continues past individual failures.  The return value is 0 only
+ * if every cookie was handled without error; otherwise it is the most recent
+ * error seen, except that -ENOENT from llog_cancel_rec() never replaces an
+ * earlier error.  A failure on one cookie is no longer forgotten when a
+ * later cookie succeeds.
+ *
+ * Assumes caller has already pushed us into the kernel context.
+ */
+int llog_cat_cancel_records(const struct lu_env *env,
+			    struct llog_handle *cathandle, int count,
+			    struct llog_cookie *cookies)
+{
+	int i, index, rc = 0, failed = 0;
+
+	ENTRY;
+
+	for (i = 0; i < count; i++, cookies++) {
+		struct llog_handle	*loghandle;
+		struct llog_logid	*lgl = &cookies->lgc_lgl;
+		int			 lrc;
+
+		/* use the per-cookie lrc so that a successful lookup does not
+		 * reset an error recorded for a previous cookie */
+		lrc = llog_cat_id2handle(env, cathandle, &loghandle, lgl);
+		if (lrc) {
+			CERROR("%s: cannot find handle for llog "DOSTID": %d\n",
+			       cathandle->lgh_ctxt->loc_obd->obd_name,
+			       POSTID(&lgl->lgl_oi), lrc);
+			failed++;
+			rc = lrc;
+			continue;
+		}
+
+		lrc = llog_cancel_rec(env, loghandle, cookies->lgc_index);
+		if (lrc == 1) {	  /* log has been destroyed */
+			index = loghandle->u.phd.phd_cookie.lgc_index;
+			lrc = llog_cat_cleanup(env, cathandle, loghandle,
+					       index);
+			/* a clean catalog update must not hide older errors */
+			if (lrc != 0)
+				rc = lrc;
+		} else if (lrc == -ENOENT) {
+			if (rc == 0) /* ENOENT shouldn't rewrite any error */
+				rc = lrc;
+		} else if (lrc < 0) {
+			failed++;
+			rc = lrc;
+		}
+		/* drop the reference taken by llog_cat_id2handle() */
+		llog_handle_put(loghandle);
+	}
+	if (rc)
+		CERROR("%s: fail to cancel %d of %d llog-records: rc = %d\n",
+		       cathandle->lgh_ctxt->loc_obd->obd_name, failed, count,
+		       rc);
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_cat_cancel_records);
+
+/* Catalog-processing callback: process the plain log that one catalog
+ * record refers to.
+ *
+ * \a data is a struct llog_process_data carrying the user callback, its
+ * argument and the start position.  Catalog records with an index below
+ * lpd_startcat are skipped.  The first processed log starts at lpd_startidx
+ * when that is positive; lpd_startidx is then reset so that later logs are
+ * processed from the beginning.
+ *
+ * The reference taken by llog_cat_id2handle() is dropped on every path,
+ * including the skip path.
+ *
+ * Returns -EINVAL for a record that is not LLOG_LOGID_MAGIC, the error from
+ * llog_cat_id2handle(), 0 for a skipped record, or the result of
+ * llog_process_or_fork() on the plain log.
+ */
+int llog_cat_process_cb(const struct lu_env *env, struct llog_handle *cat_llh,
+			struct llog_rec_hdr *rec, void *data)
+{
+	struct llog_process_data *d = data;
+	struct llog_logid_rec *lir = (struct llog_logid_rec *)rec;
+	struct llog_handle *llh;
+	int rc;
+
+	ENTRY;
+	if (rec->lrh_type != LLOG_LOGID_MAGIC) {
+		CERROR("invalid record in catalog\n");
+		RETURN(-EINVAL);
+	}
+	CDEBUG(D_HA, "processing log "DOSTID":%x at index %u of catalog "
+	       DOSTID"\n", POSTID(&lir->lid_id.lgl_oi), lir->lid_id.lgl_ogen,
+	       rec->lrh_index, POSTID(&cat_llh->lgh_id.lgl_oi));
+
+	rc = llog_cat_id2handle(env, cat_llh, &llh, &lir->lid_id);
+	if (rc) {
+		CERROR("%s: cannot find handle for llog "DOSTID": %d\n",
+		       cat_llh->lgh_ctxt->loc_obd->obd_name,
+		       POSTID(&lir->lid_id.lgl_oi), rc);
+		RETURN(rc);
+	}
+
+	if (rec->lrh_index < d->lpd_startcat)
+		/* Skip processing of the logs until startcat; the handle
+		 * reference obtained above still has to be released */
+		GOTO(out, rc = 0);
+
+	if (d->lpd_startidx > 0) {
+		struct llog_process_cat_data cd;
+
+		cd.lpcd_first_idx = d->lpd_startidx;
+		cd.lpcd_last_idx = 0;
+		rc = llog_process_or_fork(env, llh, d->lpd_cb, d->lpd_data,
+					  &cd, false);
+		/* Continue processing the next log from idx 0 */
+		d->lpd_startidx = 0;
+	} else {
+		rc = llog_process_or_fork(env, llh, d->lpd_cb, d->lpd_data,
+					  NULL, false);
+	}
+out:
+	llog_handle_put(llh);
+
+	RETURN(rc);
+}
+
+/**
+ * Run \a cb over every record of every plain log referenced by a catalog.
+ *
+ * The catalog itself is walked with llog_cat_process_cb().  When the
+ * catalog's first index is above its last index (the used range wraps past
+ * index zero) the walk is done in two passes: from llh_cat_idx to the end,
+ * then from 0 to lgh_last_idx.
+ *
+ * \param startcat	catalog index below which records are skipped
+ * \param startidx	starting record index for the first processed log
+ * \param fork		passed through to llog_process_or_fork()
+ *
+ * \retval result of the (last) llog_process_or_fork() call
+ */
+int llog_cat_process_or_fork(const struct lu_env *env,
+			     struct llog_handle *cat_llh,
+			     llog_cb_t cb, void *data, int startcat,
+			     int startidx, bool fork)
+{
+	struct llog_log_hdr		*cat_hdr = cat_llh->lgh_hdr;
+	struct llog_process_cat_data	 range;
+	struct llog_process_data	 pd;
+	int				 rc;
+	ENTRY;
+
+	LASSERT(cat_hdr->llh_flags & LLOG_F_IS_CAT);
+	pd.lpd_cb = cb;
+	pd.lpd_data = data;
+	pd.lpd_startcat = startcat;
+	pd.lpd_startidx = startidx;
+
+	if (cat_hdr->llh_cat_idx <= cat_llh->lgh_last_idx) {
+		/* contiguous range: a single pass over the whole catalog */
+		rc = llog_process_or_fork(env, cat_llh, llog_cat_process_cb,
+					  &pd, NULL, fork);
+		RETURN(rc);
+	}
+
+	CWARN("catlog "DOSTID" crosses index zero\n",
+	      POSTID(&cat_llh->lgh_id.lgl_oi));
+
+	/* first pass: from the first used slot up to the end of the catalog */
+	range.lpcd_first_idx = cat_hdr->llh_cat_idx;
+	range.lpcd_last_idx = 0;
+	rc = llog_process_or_fork(env, cat_llh, llog_cat_process_cb,
+				  &pd, &range, fork);
+	if (rc != 0)
+		RETURN(rc);
+
+	/* second pass: from the start of the catalog up to the last slot */
+	range.lpcd_first_idx = 0;
+	range.lpcd_last_idx = cat_llh->lgh_last_idx;
+	rc = llog_process_or_fork(env, cat_llh, llog_cat_process_cb,
+				  &pd, &range, fork);
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_cat_process_or_fork);
+
+/**
+ * Process a catalog in the calling context.
+ *
+ * Thin wrapper around llog_cat_process_or_fork() with \a fork = false.
+ */
+int llog_cat_process(const struct lu_env *env, struct llog_handle *cat_llh,
+		     llog_cb_t cb, void *data, int startcat, int startidx)
+{
+	int rc;
+
+	rc = llog_cat_process_or_fork(env, cat_llh, cb, data, startcat,
+				      startidx, false);
+	return rc;
+}
+EXPORT_SYMBOL(llog_cat_process);
+
+/**
+ * Catalog callback for reverse processing: run the user callback over the
+ * plain log named by one catalog record, last record first.
+ *
+ * \a data is a struct llog_process_data; only lpd_cb and lpd_data are used.
+ * The handle reference taken by llog_cat_id2handle() is dropped before
+ * returning.
+ *
+ * \retval -EINVAL	the record is not a LLOG_LOGID_MAGIC record
+ * \retval other	error from llog_cat_id2handle() or the result of
+ *			llog_reverse_process()
+ */
+static int llog_cat_reverse_process_cb(const struct lu_env *env,
+				       struct llog_handle *cat_llh,
+				       struct llog_rec_hdr *rec, void *data)
+{
+	struct llog_logid_rec		*id_rec = (struct llog_logid_rec *)rec;
+	struct llog_process_data	*pd = data;
+	struct llog_handle		*plain;
+	int				 rc;
+
+	if (le32_to_cpu(rec->lrh_type) != LLOG_LOGID_MAGIC) {
+		CERROR("invalid record in catalog\n");
+		RETURN(-EINVAL);
+	}
+	CDEBUG(D_HA, "processing log "DOSTID":%x at index %u of catalog "
+	       DOSTID"\n", POSTID(&id_rec->lid_id.lgl_oi),
+	       id_rec->lid_id.lgl_ogen, le32_to_cpu(rec->lrh_index),
+	       POSTID(&cat_llh->lgh_id.lgl_oi));
+
+	rc = llog_cat_id2handle(env, cat_llh, &plain, &id_rec->lid_id);
+	if (rc != 0) {
+		CERROR("%s: cannot find handle for llog "DOSTID": %d\n",
+		       cat_llh->lgh_ctxt->loc_obd->obd_name,
+		       POSTID(&id_rec->lid_id.lgl_oi), rc);
+		RETURN(rc);
+	}
+
+	rc = llog_reverse_process(env, plain, pd->lpd_cb, pd->lpd_data, NULL);
+
+	/* release the reference obtained from llog_cat_id2handle() */
+	llog_handle_put(plain);
+	RETURN(rc);
+}
+
+/**
+ * Run \a cb over the plain logs of a catalog in reverse order.
+ *
+ * The catalog is walked with llog_cat_reverse_process_cb().  When the
+ * catalog's first index is above its last index (the used range wraps past
+ * index zero) two passes are made: first the range 0..lgh_last_idx, then the
+ * range starting at llh_cat_idx.
+ *
+ * \retval result of the (last) llog_reverse_process() call
+ */
+int llog_cat_reverse_process(const struct lu_env *env,
+			     struct llog_handle *cat_llh,
+			     llog_cb_t cb, void *data)
+{
+	struct llog_log_hdr		*cat_hdr = cat_llh->lgh_hdr;
+	struct llog_process_cat_data	 range;
+	struct llog_process_data	 pd;
+	int				 rc;
+	ENTRY;
+
+	LASSERT(cat_hdr->llh_flags & LLOG_F_IS_CAT);
+	pd.lpd_cb = cb;
+	pd.lpd_data = data;
+
+	if (cat_hdr->llh_cat_idx <= cat_llh->lgh_last_idx) {
+		/* contiguous range: one reverse pass over the whole catalog */
+		rc = llog_reverse_process(env, cat_llh,
+					  llog_cat_reverse_process_cb,
+					  &pd, NULL);
+		RETURN(rc);
+	}
+
+	CWARN("catalog "DOSTID" crosses index zero\n",
+	      POSTID(&cat_llh->lgh_id.lgl_oi));
+
+	/* newest part first: slots 0..lgh_last_idx */
+	range.lpcd_first_idx = 0;
+	range.lpcd_last_idx = cat_llh->lgh_last_idx;
+	rc = llog_reverse_process(env, cat_llh, llog_cat_reverse_process_cb,
+				  &pd, &range);
+	if (rc != 0)
+		RETURN(rc);
+
+	/* then the older part, starting at the first used slot */
+	range.lpcd_first_idx = le32_to_cpu(cat_hdr->llh_cat_idx);
+	range.lpcd_last_idx = 0;
+	rc = llog_reverse_process(env, cat_llh, llog_cat_reverse_process_cb,
+				  &pd, &range);
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_cat_reverse_process);
+
+/* Advance the catalog's first-index marker (llh_cat_idx) after the slot
+ * \a index has been released.
+ *
+ * Nothing happens unless \a index is the slot immediately after the current
+ * llh_cat_idx.  In that case llh_cat_idx moves forward by one, and then
+ * keeps moving forward over the following slots whose bitmap bit is clear,
+ * stopping at the first set bit or when the walk reaches lgh_last_idx.
+ *
+ * Always returns 0.
+ */
+int llog_cat_set_first_idx(struct llog_handle *cathandle, int index)
+{
+	struct llog_log_hdr *llh = cathandle->lgh_hdr;
+	int i, bitmap_size, idx;
+	ENTRY;
+
+	bitmap_size = LLOG_BITMAP_SIZE(llh);
+	if (llh->llh_cat_idx == (index - 1)) {
+		/* the released slot was the oldest one: step past it */
+		idx = llh->llh_cat_idx + 1;
+		llh->llh_cat_idx = idx;
+		if (idx == cathandle->lgh_last_idx)
+			goto out;
+		/* walk the slots after \a index, wrapping modulo the bitmap
+		 * size, until the last used slot is reached */
+		for (i = (index + 1) % bitmap_size;
+		     i != cathandle->lgh_last_idx;
+		     i = (i + 1) % bitmap_size) {
+			if (!ext2_test_bit(i, llh->llh_bitmap)) {
+				/* free slot: the first index can skip it.
+				 * NOTE(review): this increment is not reduced
+				 * modulo bitmap_size; wrapping is only handled
+				 * by the i == 0 branch below -- confirm. */
+				idx = llh->llh_cat_idx + 1;
+				llh->llh_cat_idx = idx;
+			} else if (i == 0) {
+				/* bit 0 is set and the walk wrapped around:
+				 * restart the first index at 0 */
+				llh->llh_cat_idx = 0;
+			} else {
+				/* slot still in use: stop here */
+				break;
+			}
+		}
+out:
+		CDEBUG(D_RPCTRACE, "set catlog "DOSTID" first idx %u\n",
+		       POSTID(&cathandle->lgh_id.lgl_oi), llh->llh_cat_idx);
+	}
+
+	RETURN(0);
+}
+
+/**
+ * Remove the traces of a deleted plain llog from its catalog.
+ *
+ * When \a loghandle is given it is detached from the catalog (cleared as the
+ * current log if it was one, unlinked from the handle list) and closed.
+ * The catalog's first index is then updated and the catalog record at
+ * \a index is cancelled.
+ *
+ * \param loghandle	plain log handle to detach and close, or NULL when
+ *			only the catalog entry has to go
+ * \param index		catalog slot of the plain log; must be non-zero
+ *
+ * \retval result of llog_cancel_rec() on the catalog
+ */
+int llog_cat_cleanup(const struct lu_env *env, struct llog_handle *cathandle,
+		     struct llog_handle *loghandle, int index)
+{
+	int rc;
+
+	LASSERT(index);
+
+	if (loghandle != NULL) {
+		/* detach the destroyed llog from the catalog under its lock */
+		down_write(&cathandle->lgh_lock);
+		if (cathandle->u.chd.chd_current_log == loghandle)
+			cathandle->u.chd.chd_current_log = NULL;
+		list_del_init(&loghandle->u.phd.phd_entry);
+		up_write(&cathandle->lgh_lock);
+
+		LASSERT(index == loghandle->u.phd.phd_cookie.lgc_index);
+
+		/* llog was opened and keep in a list, close it now */
+		llog_close(env, loghandle);
+	}
+
+	/* remove plain llog entry from catalog by index */
+	llog_cat_set_first_idx(cathandle, index);
+	rc = llog_cancel_rec(env, cathandle, index);
+	if (rc != 0)
+		return rc;
+
+	CDEBUG(D_HA, "cancel plain log at index"
+	       " %u of catalog "DOSTID"\n",
+	       index, POSTID(&cathandle->lgh_id.lgl_oi));
+	return 0;
+}
+
+/* Catalog-processing callback that removes empty plain logs.
+ *
+ * For the plain log named by catalog record \a rec:
+ * - if its handle cannot be obtained with -ENOENT or -ESTALE, the catalog
+ *   entry at rec->lrh_index is cleaned up;
+ * - if the log is flagged LLOG_F_ZAP_WHEN_EMPTY and its header count is 1,
+ *   the log is destroyed and its catalog entry cleaned up.
+ *
+ * \a data is not used.
+ *
+ * Returns -EINVAL for a record that is not LLOG_LOGID_MAGIC, the error from
+ * llog_cat_id2handle(), or otherwise the result of llog_destroy() (0 when no
+ * destroy was attempted).
+ */
+int cat_cancel_cb(const struct lu_env *env, struct llog_handle *cathandle,
+		  struct llog_rec_hdr *rec, void *data)
+{
+	struct llog_logid_rec	*lir = (struct llog_logid_rec *)rec;
+	struct llog_handle	*loghandle;
+	struct llog_log_hdr	*llh;
+	int			 rc;
+
+	ENTRY;
+
+	if (rec->lrh_type != LLOG_LOGID_MAGIC) {
+		CERROR("invalid record in catalog\n");
+		RETURN(-EINVAL);
+	}
+
+	CDEBUG(D_HA, "processing log "DOSTID":%x at index %u of catalog "
+	       DOSTID"\n", POSTID(&lir->lid_id.lgl_oi), lir->lid_id.lgl_ogen,
+	       rec->lrh_index, POSTID(&cathandle->lgh_id.lgl_oi));
+
+	rc = llog_cat_id2handle(env, cathandle, &loghandle, &lir->lid_id);
+	if (rc) {
+		CERROR("%s: cannot find handle for llog "DOSTID": %d\n",
+		       cathandle->lgh_ctxt->loc_obd->obd_name,
+		       POSTID(&lir->lid_id.lgl_oi), rc);
+		/* the plain log is gone: drop the dangling catalog entry.
+		 * The original error is still returned to the caller. */
+		if (rc == -ENOENT || rc == -ESTALE) {
+			/* remove index from catalog */
+			llog_cat_cleanup(env, cathandle, NULL, rec->lrh_index);
+		}
+		RETURN(rc);
+	}
+
+	llh = loghandle->lgh_hdr;
+	/* a count of 1 is treated as "empty" here (presumably only the
+	 * header record remains -- confirm) */
+	if ((llh->llh_flags & LLOG_F_ZAP_WHEN_EMPTY) &&
+	    (llh->llh_count == 1)) {
+		rc = llog_destroy(env, loghandle);
+		if (rc)
+			CERROR("%s: fail to destroy empty log: rc = %d\n",
+			       loghandle->lgh_ctxt->loc_obd->obd_name, rc);
+
+		/* the catalog entry is removed even when destroy failed;
+		 * llog_cat_cleanup() also closes the handle it is given */
+		llog_cat_cleanup(env, cathandle, loghandle,
+				 loghandle->u.phd.phd_cookie.lgc_index);
+	}
+	/* drop the reference taken by llog_cat_id2handle() */
+	llog_handle_put(loghandle);
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(cat_cancel_cb);
+
+/**
+ * Initialize \a llh as a catalog and sweep it with cat_cancel_cb().
+ *
+ * A failure of the sweep is only logged; it is not reported to the caller.
+ *
+ * \retval 0		the handle was initialized (whatever the sweep did)
+ * \retval other	error from llog_init_handle()
+ */
+int llog_cat_init_and_process(const struct lu_env *env,
+			      struct llog_handle *llh)
+{
+	int rc = llog_init_handle(env, llh, LLOG_F_IS_CAT, NULL);
+
+	if (rc != 0)
+		RETURN(rc);
+
+	rc = llog_process_or_fork(env, llh, cat_cancel_cb, NULL, NULL, false);
+	if (rc != 0)
+		CERROR("%s: llog_process() with cat_cancel_cb failed: rc = "
+		       "%d\n", llh->lgh_ctxt->loc_obd->obd_name, rc);
+
+	/* best effort: a failed sweep does not fail the initialization */
+	RETURN(0);
+}
+EXPORT_SYMBOL(llog_cat_init_and_process);

diff --git a/drivers/staging/lustre/lustre/obdclass/llog_internal.h b/drivers/staging/lustre/lustre/obdclass/llog_internal.h
new file mode 100644
index 0000000..539e1d4
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/llog_internal.h

@@ -0,0 +1,98 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LLOG_INTERNAL_H__
+#define __LLOG_INTERNAL_H__
+
+#include <lustre_log.h>
+
+/*
+ * Bundle of arguments and result for one log-processing run.
+ *
+ * The fields mirror the parameters of llog_process_or_fork(); presumably the
+ * structure is used to hand them to a separate processing thread and to get
+ * the result back -- confirm against llog.c.
+ */
+struct llog_process_info {
+	struct llog_handle *lpi_loghandle;	/* log to process */
+	llog_cb_t	   lpi_cb;		/* per-record callback */
+	void	       *lpi_cbdata;		/* argument for lpi_cb */
+	void	       *lpi_catdata;		/* catalog range data, may be NULL */
+	int		 lpi_rc;		/* result of the run */
+	struct completion	lpi_completion;	/* presumably signalled when the
+						 * run has finished -- confirm */
+	const struct lu_env	*lpi_env;	/* environment for the run */
+
+};
+
+/*
+ * Per-context llog data, looked up through llog_thread_key by llog_info().
+ *
+ * The members look like scratch variables for the llog code (attributes,
+ * fid, object format, buffer descriptor, offset, record header and tail) --
+ * TODO confirm against their users.
+ */
+struct llog_thread_info {
+	struct lu_attr			 lgi_attr;
+	struct lu_fid			 lgi_fid;
+	struct dt_object_format		 lgi_dof;
+	struct lu_buf			 lgi_buf;
+	loff_t				 lgi_off;
+	struct llog_rec_hdr		 lgi_lrh;
+	struct llog_rec_tail		 lgi_tail;
+};
+
+extern struct lu_context_key llog_thread_key;
+
+/**
+ * Fetch the llog_thread_info stored under llog_thread_key in the context of
+ * \a env.  Asserts that the value is present.
+ */
+static inline struct llog_thread_info *llog_info(const struct lu_env *env)
+{
+	struct llog_thread_info *info = lu_context_key_get(&env->le_ctx,
+							   &llog_thread_key);
+
+	LASSERT(info);
+	return info;
+}
+
+/*
+ * Fill \a logid for an lvfs-backed llog: the object id is set to the llog
+ * sequence with \a ino as the id, and the generation is set to \a gen.
+ */
+static inline void
+lustre_build_llog_lvfs_oid(struct llog_logid *logid, __u64 ino, __u32 gen)
+{
+	/* the sequence is set before the id (ostid_set_id() presumably
+	 * depends on the sequence already being in place -- confirm) */
+	ostid_set_seq_llog(&logid->lgl_oi);
+	ostid_set_id(&logid->lgl_oi, ino);
+	logid->lgl_ogen = gen;
+}
+
+/* set-up / tear-down of the llog per-context data (presumably registers and
+ * unregisters llog_thread_key -- confirm) */
+int llog_info_init(void);
+void llog_info_fini(void);
+
+/* take / drop a reference on a log handle */
+void llog_handle_get(struct llog_handle *loghandle);
+void llog_handle_put(struct llog_handle *loghandle);
+/* find or open the plain log \a logid of a catalog; on success *res carries
+ * an extra reference that the caller drops with llog_handle_put() */
+int llog_cat_id2handle(const struct lu_env *env, struct llog_handle *cathandle,
+		       struct llog_handle **res, struct llog_logid *logid);
+/* config-log helpers implemented outside llog_cat.c */
+int class_config_dump_handler(const struct lu_env *env,
+			      struct llog_handle *handle,
+			      struct llog_rec_hdr *rec, void *data);
+int class_config_parse_rec(struct llog_rec_hdr *rec, char *buf, int size);
+/* run \a cb over the records of one log; \a catdata optionally limits the
+ * index range, \a fork is forwarded by the catalog code */
+int llog_process_or_fork(const struct lu_env *env,
+			 struct llog_handle *loghandle,
+			 llog_cb_t cb, void *data, void *catdata, bool fork);
+/* remove a deleted plain log (\a loghandle may be NULL) from its catalog */
+int llog_cat_cleanup(const struct lu_env *env, struct llog_handle *cathandle,
+		     struct llog_handle *loghandle, int index);
+#endif

diff --git a/drivers/staging/lustre/lustre/obdclass/llog_ioctl.c b/drivers/staging/lustre/lustre/obdclass/llog_ioctl.c
new file mode 100644
index 0000000..0732874
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/llog_ioctl.c

@@ -0,0 +1,427 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LOG
+
+#include <obd_class.h>
+#include <lustre_log.h>
+#include "llog_internal.h"
+
+/**
+ * Parse a log identifier of the form "#<id>#<seq>#<gen>" into \a logid.
+ *
+ * <id> and <seq> are read with simple_strtoull() using base 0, <gen> with
+ * simple_strtoul() using base 16.  The two inner '#' separators of \a str
+ * are overwritten with NUL bytes while parsing.
+ *
+ * \param[out]    logid	log id to fill in
+ * \param[in,out] str	string to parse; modified in place
+ * \param[in]     len	length of \a str as supplied by the caller
+ *
+ * \retval 0		on success
+ * \retval -EINVAL	if \a str does not have the expected layout
+ */
+static int str2logid(struct llog_logid *logid, char *str, int len)
+{
+	char	*cur = str;
+	char	*sep;
+	char	*stop;
+	__u64	 id;
+	__u64	 seq;
+
+	ENTRY;
+	/* the string has to open with a '#' */
+	if (cur[0] != '#')
+		RETURN(-EINVAL);
+
+	/* first field: object id, terminated by the next '#' */
+	cur += 1;
+	if (cur - str >= len - 1)
+		RETURN(-EINVAL);
+	sep = strchr(cur, '#');
+	if (sep == NULL || sep == cur)
+		RETURN(-EINVAL);
+
+	sep[0] = '\0';
+	id = simple_strtoull(cur, &stop, 0);
+	if (stop != sep)
+		RETURN(-EINVAL);
+
+	/* second field: sequence, terminated by the next '#' */
+	cur = sep + 1;
+	if (cur - str >= len - 1)
+		RETURN(-EINVAL);
+	sep = strchr(cur, '#');
+	if (sep == NULL || sep == cur)
+		RETURN(-EINVAL);
+
+	sep[0] = '\0';
+	seq = simple_strtoull(cur, &stop, 0);
+	if (stop != sep)
+		RETURN(-EINVAL);
+
+	ostid_set_seq(&logid->lgl_oi, seq);
+	ostid_set_id(&logid->lgl_oi, id);
+
+	/* last field: hexadecimal generation, running to the end of string */
+	cur = sep + 1;
+	if (cur - str >= len - 1)
+		RETURN(-EINVAL);
+	logid->lgl_ogen = simple_strtoul(cur, &stop, 16);
+	if (stop[0] != '\0')
+		RETURN(-EINVAL);
+
+	RETURN(0);
+}
+
+/**
+ * llog_process() callback for OBD_IOC_LLOG_CHECK: write one status line
+ * per record into the ioctl bulk buffer.
+ *
+ * The output cursor, the remaining space and the [from, to] index window
+ * live in function-static variables.  They are (re)initialised when \a data
+ * is non-NULL and its ioc_inllen1 is > 0; ioc_inllen1 is then zeroed so that
+ * later calls with the same \a data keep the existing state.  Because of the
+ * static state this callback is not reentrant.
+ *
+ * For catalog logs each record is resolved to a plain log, which is then
+ * processed recursively with this same callback and NULL data (i.e. reusing
+ * the static state).
+ *
+ * \retval 0		record skipped or reported
+ * \retval -LLOG_EEMPTY	index is past "to", or the output buffer is full
+ * \retval -EINVAL	"from"/"to" strings are not plain numbers
+ * \retval -EOPNOTSUPP	catalog handle has no llog context
+ * \retval negative	error from llog_cat_id2handle() or llog_process()
+ */
+static int llog_check_cb(const struct lu_env *env, struct llog_handle *handle,
+			 struct llog_rec_hdr *rec, void *data)
+{
+	struct obd_ioctl_data *ioc_data = (struct obd_ioctl_data *)data;
+	static int l, remains, from, to;
+	static char *out;
+	char *endp;
+	int cur_index, rc = 0;
+
+	ENTRY;
+
+	/* first call for this ioctl: set up the static output state */
+	if (ioc_data && ioc_data->ioc_inllen1 > 0) {
+		l = 0;
+		remains = ioc_data->ioc_inllen4 +
+			cfs_size_round(ioc_data->ioc_inllen1) +
+			cfs_size_round(ioc_data->ioc_inllen2) +
+			cfs_size_round(ioc_data->ioc_inllen3);
+		from = simple_strtol(ioc_data->ioc_inlbuf2, &endp, 0);
+		if (*endp != '\0')
+			RETURN(-EINVAL);
+		to = simple_strtol(ioc_data->ioc_inlbuf3, &endp, 0);
+		if (*endp != '\0')
+			RETURN(-EINVAL);
+		/* mark the state as initialised for subsequent records */
+		ioc_data->ioc_inllen1 = 0;
+		out = ioc_data->ioc_bulk;
+	}
+
+	/* honour the requested index window; to <= 0 means no upper bound */
+	cur_index = rec->lrh_index;
+	if (cur_index < from)
+		RETURN(0);
+	if (to > 0 && cur_index > to)
+		RETURN(-LLOG_EEMPTY);
+
+	if (handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT) {
+		struct llog_logid_rec	*lir = (struct llog_logid_rec *)rec;
+		struct llog_handle	*loghandle;
+
+		/*
+		 * NOTE(review): the "failed" line is formatted at the current
+		 * cursor but out/remains are not advanced here, and the record
+		 * is still treated as a logid record below -- confirm intent.
+		 */
+		if (rec->lrh_type != LLOG_LOGID_MAGIC) {
+			l = snprintf(out, remains, "[index]: %05d  [type]: "
+				     "%02x  [len]: %04d failed\n",
+				     cur_index, rec->lrh_type,
+				     rec->lrh_len);
+		}
+		if (handle->lgh_ctxt == NULL)
+			RETURN(-EOPNOTSUPP);
+		rc = llog_cat_id2handle(env, handle, &loghandle, &lir->lid_id);
+		if (rc) {
+			CDEBUG(D_IOCTL, "cannot find log #"DOSTID"#%08x\n",
+			       POSTID(&lir->lid_id.lgl_oi),
+			       lir->lid_id.lgl_ogen);
+			RETURN(rc);
+		}
+		/* NULL data: the nested run keeps using the static state */
+		rc = llog_process(env, loghandle, llog_check_cb, NULL, NULL);
+		llog_handle_put(loghandle);
+	} else {
+		bool ok;
+
+		/* a record is "ok" when its type is one of the known ones */
+		switch (rec->lrh_type) {
+		case OST_SZ_REC:
+		case MDS_UNLINK_REC:
+		case MDS_UNLINK64_REC:
+		case MDS_SETATTR64_REC:
+		case OBD_CFG_REC:
+		case LLOG_GEN_REC:
+		case LLOG_HDR_MAGIC:
+			ok = true;
+			break;
+		default:
+			ok = false;
+		}
+
+		l = snprintf(out, remains, "[index]: %05d  [type]: "
+			     "%02x  [len]: %04d %s\n",
+			     cur_index, rec->lrh_type, rec->lrh_len,
+			     ok ? "ok" : "failed");
+		out += l;
+		remains -= l;
+		/* NOTE(review): lgh_ctxt is not NULL-checked on this path */
+		if (remains <= 0) {
+			CERROR("%s: no space to print log records\n",
+			       handle->lgh_ctxt->loc_obd->obd_name);
+			RETURN(-LLOG_EEMPTY);
+		}
+	}
+	RETURN(rc);
+}
+
+/**
+ * llog_process() callback for OBD_IOC_LLOG_PRINT: write a textual form of
+ * each record into the ioctl bulk buffer.
+ *
+ * As in llog_check_cb(), the output cursor, remaining space and the
+ * [from, to] index window are function-static.  They are initialised when
+ * \a data is non-NULL with ioc_inllen1 > 0, after which ioc_inllen1 is
+ * zeroed.  The callback is therefore not reentrant.
+ *
+ * Catalog records are printed as "[index] [logid]" lines, OBD_CFG_REC
+ * records are formatted by class_config_parse_rec(), and every other record
+ * is printed as an "[index] [type] [len]" line.
+ *
+ * \retval 0		record skipped or printed
+ * \retval -LLOG_EEMPTY	index is past "to", or the output buffer is full
+ * \retval -EINVAL	bad "from"/"to" string, or a non-logid catalog record
+ * \retval negative	error from class_config_parse_rec()
+ */
+static int llog_print_cb(const struct lu_env *env, struct llog_handle *handle,
+			 struct llog_rec_hdr *rec, void *data)
+{
+	struct obd_ioctl_data *ioc_data = (struct obd_ioctl_data *)data;
+	static int l, remains, from, to;
+	static char *out;
+	char *endp;
+	int cur_index;
+
+	ENTRY;
+	/* first call for this ioctl: set up the static output state */
+	if (ioc_data != NULL && ioc_data->ioc_inllen1 > 0) {
+		l = 0;
+		remains = ioc_data->ioc_inllen4 +
+			cfs_size_round(ioc_data->ioc_inllen1) +
+			cfs_size_round(ioc_data->ioc_inllen2) +
+			cfs_size_round(ioc_data->ioc_inllen3);
+		from = simple_strtol(ioc_data->ioc_inlbuf2, &endp, 0);
+		if (*endp != '\0')
+			RETURN(-EINVAL);
+		to = simple_strtol(ioc_data->ioc_inlbuf3, &endp, 0);
+		if (*endp != '\0')
+			RETURN(-EINVAL);
+		out = ioc_data->ioc_bulk;
+		/* mark the state as initialised for subsequent records */
+		ioc_data->ioc_inllen1 = 0;
+	}
+
+	/* honour the requested index window; to <= 0 means no upper bound */
+	cur_index = rec->lrh_index;
+	if (cur_index < from)
+		RETURN(0);
+	if (to > 0 && cur_index > to)
+		RETURN(-LLOG_EEMPTY);
+
+	if (handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT) {
+		struct llog_logid_rec *lir = (struct llog_logid_rec *)rec;
+
+		if (rec->lrh_type != LLOG_LOGID_MAGIC) {
+			CERROR("invalid record in catalog\n");
+			RETURN(-EINVAL);
+		}
+
+		l = snprintf(out, remains,
+			     "[index]: %05d  [logid]: #"DOSTID"#%08x\n",
+			     cur_index, POSTID(&lir->lid_id.lgl_oi),
+			     lir->lid_id.lgl_ogen);
+	} else if (rec->lrh_type == OBD_CFG_REC) {
+		int rc;
+
+		/* a non-negative return value is used as the length written */
+		rc = class_config_parse_rec(rec, out, remains);
+		if (rc < 0)
+			RETURN(rc);
+		l = rc;
+	} else {
+		l = snprintf(out, remains,
+			     "[index]: %05d  [type]: %02x  [len]: %04d\n",
+			     cur_index, rec->lrh_type, rec->lrh_len);
+	}
+	/* advance the shared cursor past what was just written */
+	out += l;
+	remains -= l;
+	if (remains <= 0) {
+		CERROR("not enough space for print log records\n");
+		RETURN(-LLOG_EEMPTY);
+	}
+
+	RETURN(0);
+}
+/**
+ * Destroy the plain log identified by \a logid and, when that succeeds,
+ * clean up its entry in the catalog \a cat.
+ *
+ * \retval 0		log destroyed
+ * \retval -ENOENT	the log could not be looked up in the catalog
+ * \retval negative	error returned by llog_destroy()
+ */
+static int llog_remove_log(const struct lu_env *env, struct llog_handle *cat,
+			   struct llog_logid *logid)
+{
+	struct llog_handle	*plain;
+	int			 rc;
+
+	ENTRY;
+
+	/* any lookup failure is reported to the caller as -ENOENT */
+	rc = llog_cat_id2handle(env, cat, &plain, logid);
+	if (rc != 0) {
+		CDEBUG(D_IOCTL, "cannot find log #"DOSTID"#%08x\n",
+		       POSTID(&logid->lgl_oi), logid->lgl_ogen);
+		RETURN(-ENOENT);
+	}
+
+	rc = llog_destroy(env, plain);
+	if (rc != 0) {
+		CDEBUG(D_IOCTL, "cannot destroy log\n");
+		GOTO(out, rc);
+	}
+
+	/* the result of the catalog cleanup is deliberately not propagated */
+	llog_cat_cleanup(env, cat, plain, plain->u.phd.phd_cookie.lgc_index);
+out:
+	llog_handle_put(plain);
+	RETURN(rc);
+}
+
+/**
+ * llog_process() callback that removes the plain log referenced by one
+ * catalog record.
+ *
+ * \retval -EINVAL	\a rec is not a LLOG_LOGID_MAGIC record
+ * \retval other	result of llog_remove_log()
+ */
+static int llog_delete_cb(const struct lu_env *env, struct llog_handle *handle,
+			  struct llog_rec_hdr *rec, void *data)
+{
+	struct llog_logid_rec	*lir;
+	int			 rc;
+
+	ENTRY;
+	/* only logid records carry a log id that can be removed */
+	if (rec->lrh_type != LLOG_LOGID_MAGIC)
+		RETURN(-EINVAL);
+
+	lir = (struct llog_logid_rec *)rec;
+	rc = llog_remove_log(env, handle, &lir->lid_id);
+
+	RETURN(rc);
+}
+
+
+/**
+ * Execute an llog ioctl (info, check, print, cancel or remove) on one log.
+ *
+ * The log is selected by data->ioc_inlbuf1: "#<id>#<seq>#<gen>" opens it
+ * by log id (parsed by str2logid()), "$<name>" opens it by name.  The log
+ * is opened with LLOG_OPEN_EXISTS, initialised, operated on according to
+ * \a cmd, and closed again before returning.
+ *
+ * \param[in] env	execution environment
+ * \param[in] ctxt	llog context the log belongs to
+ * \param[in] cmd	one of the OBD_IOC_LLOG_* commands
+ * \param[in] data	ioctl data; inline buffers carry the arguments and
+ *			ioc_bulk receives textual output
+ *
+ * \retval 0		success
+ * \retval -EINVAL	missing or malformed argument, or the log is neither
+ *			plain nor a catalog
+ * \retval -ENOENT	llog_init_handle() failed
+ * \retval -ENOSPC	OBD_IOC_LLOG_INFO output did not fit
+ * \retval -ENOTTY	unknown \a cmd, or catalog cancel without a log id
+ * \retval negative	error from the underlying llog operation
+ */
+int llog_ioctl(const struct lu_env *env, struct llog_ctxt *ctxt, int cmd,
+	       struct obd_ioctl_data *data)
+{
+	struct llog_logid	 logid;
+	int			 rc = 0;
+	struct llog_handle	*handle = NULL;
+
+	ENTRY;
+
+	/* the log selector is mandatory; do not dereference a NULL buffer */
+	if (data->ioc_inlbuf1 == NULL)
+		RETURN(-EINVAL);
+
+	if (*data->ioc_inlbuf1 == '#') {
+		rc = str2logid(&logid, data->ioc_inlbuf1, data->ioc_inllen1);
+		if (rc)
+			RETURN(rc);
+		rc = llog_open(env, ctxt, &handle, &logid, NULL,
+			       LLOG_OPEN_EXISTS);
+		if (rc)
+			RETURN(rc);
+	} else if (*data->ioc_inlbuf1 == '$') {
+		char *name = data->ioc_inlbuf1 + 1;
+
+		rc = llog_open(env, ctxt, &handle, NULL, name,
+			       LLOG_OPEN_EXISTS);
+		if (rc)
+			RETURN(rc);
+	} else {
+		RETURN(-EINVAL);
+	}
+
+	/* any initialisation failure is reported as -ENOENT */
+	rc = llog_init_handle(env, handle, 0, NULL);
+	if (rc)
+		GOTO(out_close, rc = -ENOENT);
+
+	switch (cmd) {
+	case OBD_IOC_LLOG_INFO: {
+		int	 l;
+		int	 remains = data->ioc_inllen2 +
+				   cfs_size_round(data->ioc_inllen1);
+		char	*out = data->ioc_bulk;
+
+		l = snprintf(out, remains,
+			     "logid:	    #"DOSTID"#%08x\n"
+			     "flags:	    %x (%s)\n"
+			     "records count:    %d\n"
+			     "last index:       %d\n",
+			     POSTID(&handle->lgh_id.lgl_oi),
+			     handle->lgh_id.lgl_ogen,
+			     handle->lgh_hdr->llh_flags,
+			     handle->lgh_hdr->llh_flags &
+			     LLOG_F_IS_CAT ? "cat" : "plain",
+			     handle->lgh_hdr->llh_count,
+			     handle->lgh_last_idx);
+		out += l;
+		remains -= l;
+		if (remains <= 0) {
+			CERROR("%s: not enough space for log header info\n",
+			       ctxt->loc_obd->obd_name);
+			rc = -ENOSPC;
+		}
+		break;
+	}
+	case OBD_IOC_LLOG_CHECK:
+		LASSERT(data->ioc_inllen1 > 0);
+		rc = llog_process(env, handle, llog_check_cb, data, NULL);
+		/* -LLOG_EEMPTY only signals that the callback stopped early */
+		if (rc == -LLOG_EEMPTY)
+			rc = 0;
+		else if (rc)
+			GOTO(out_close, rc);
+		break;
+	case OBD_IOC_LLOG_PRINT:
+		LASSERT(data->ioc_inllen1 > 0);
+		rc = llog_process(env, handle, llog_print_cb, data, NULL);
+		/* -LLOG_EEMPTY only signals that the callback stopped early */
+		if (rc == -LLOG_EEMPTY)
+			rc = 0;
+		else if (rc)
+			GOTO(out_close, rc);
+		break;
+	case OBD_IOC_LLOG_CANCEL: {
+		struct llog_cookie cookie;
+		struct llog_logid plain;
+		char *endp;
+
+		/* the record index is mandatory for cancel */
+		if (data->ioc_inlbuf3 == NULL)
+			GOTO(out_close, rc = -EINVAL);
+
+		cookie.lgc_index = simple_strtoul(data->ioc_inlbuf3, &endp, 0);
+		if (*endp != '\0')
+			GOTO(out_close, rc = -EINVAL);
+
+		if (handle->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN) {
+			/*
+			 * NOTE(review): NULL is passed as the env here while
+			 * every other call in this function passes env --
+			 * confirm whether this is intentional.
+			 */
+			rc = llog_cancel_rec(NULL, handle, cookie.lgc_index);
+			GOTO(out_close, rc);
+		} else if (!(handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT)) {
+			GOTO(out_close, rc = -EINVAL);
+		}
+
+		if (data->ioc_inlbuf2 == NULL) /* catalog but no logid */
+			GOTO(out_close, rc = -ENOTTY);
+
+		rc = str2logid(&plain, data->ioc_inlbuf2, data->ioc_inllen2);
+		if (rc)
+			GOTO(out_close, rc);
+		cookie.lgc_lgl = plain;
+		rc = llog_cat_cancel_records(env, handle, 1, &cookie);
+		if (rc)
+			GOTO(out_close, rc);
+		break;
+	}
+	case OBD_IOC_LLOG_REMOVE: {
+		struct llog_logid plain;
+
+		if (handle->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN) {
+			rc = llog_destroy(env, handle);
+			GOTO(out_close, rc);
+		} else if (!(handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT)) {
+			GOTO(out_close, rc = -EINVAL);
+		}
+
+		/* ioc_inlbuf2 is a pointer: test it against NULL, not "> 0" */
+		if (data->ioc_inlbuf2 != NULL) {
+			/* remove the indicated log from the catalog */
+			rc = str2logid(&plain, data->ioc_inlbuf2,
+				       data->ioc_inllen2);
+			if (rc)
+				GOTO(out_close, rc);
+			rc = llog_remove_log(env, handle, &plain);
+		} else {
+			/* remove all the logs of the catalog */
+			rc = llog_process(env, handle, llog_delete_cb, NULL,
+					  NULL);
+			if (rc)
+				GOTO(out_close, rc);
+		}
+		break;
+	}
+	default:
+		CERROR("%s: Unknown ioctl cmd %#x\n",
+		       ctxt->loc_obd->obd_name, cmd);
+		GOTO(out_close, rc = -ENOTTY);
+	}
+
+out_close:
+	/* catalogs need llog_cat_close(); everything else llog_close() */
+	if (handle->lgh_hdr &&
+	    handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT)
+		llog_cat_close(env, handle);
+	else
+		llog_close(env, handle);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_ioctl);

diff --git a/drivers/staging/lustre/lustre/obdclass/llog_lvfs.c b/drivers/staging/lustre/lustre/obdclass/llog_lvfs.c
new file mode 100644
index 0000000..7e12dc6
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/llog_lvfs.c

@@ -0,0 +1,862 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/llog_lvfs.c
+ *
+ * OST<->MDS recovery logging infrastructure.
+ * Invariants in implementation:
+ * - we do not share logs among different OST<->MDS connections, so that
+ *   if an OST or MDS fails it need only look at log(s) relevant to itself
+ *
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOG
+
+
+#include <obd.h>
+#include <obd_class.h>
+#include <lustre_log.h>
+#include <obd_ost.h>
+#include <linux/list.h>
+#include <lvfs.h>
+#include <lustre_fsfilt.h>
+#include <lustre_disk.h>
+#include "llog_internal.h"
+
+#if  defined(LLOG_LVFS)
+
+/**
+ * Write a padding record of \a len bytes with index \a index at the
+ * current position of \a file.
+ *
+ * Only the record header and the record tail are written; the file
+ * position is moved over the space in between.
+ *
+ * \retval 0		on success
+ * \retval other	value returned by fsfilt_write_record() on failure
+ */
+static int llog_lvfs_pad(struct obd_device *obd, struct l_file *file,
+				int len, int index)
+{
+	struct llog_rec_hdr rec = { 0 };
+	struct llog_rec_tail tail;
+	int rc;
+	ENTRY;
+
+	LASSERT(len >= LLOG_MIN_REC_SIZE && (len & 0x7) == 0);
+
+	rec.lrh_len = len;
+	rec.lrh_index = index;
+	rec.lrh_type = LLOG_PAD_MAGIC;
+	/* the tail mirrors the length and index stored in the header */
+	tail.lrt_len = rec.lrh_len;
+	tail.lrt_index = rec.lrh_index;
+
+	rc = fsfilt_write_record(obd, file, &rec, sizeof(rec), &file->f_pos, 0);
+	if (rc) {
+		CERROR("error writing padding record: rc %d\n", rc);
+		RETURN(rc);
+	}
+
+	/* skip the unused body so the tail lands at the end of the record */
+	file->f_pos += len - sizeof(rec) - sizeof(tail);
+	rc = fsfilt_write_record(obd, file, &tail, sizeof(tail),&file->f_pos,0);
+	if (rc)
+		CERROR("error writing padding record: rc %d\n", rc);
+
+	RETURN(rc);
+}
+
+/**
+ * Write one log record to \a file at offset \a off.
+ *
+ * Without \a buf, \a rec is written as is, using rec->lrh_len bytes.  With
+ * \a buf, rec->lrh_len is taken as the payload length: it is rewritten to
+ * the full record size and the header, the payload and a freshly built tail
+ * are written one after the other.
+ *
+ * On exit the file position is put back to its value on entry if that
+ * value lies beyond the position reached by the writes.
+ *
+ * \retval 0		on success
+ * \retval other	value returned by fsfilt_write_record() on failure
+ */
+static int llog_lvfs_write_blob(struct obd_device *obd, struct l_file *file,
+				struct llog_rec_hdr *rec, void *buf, loff_t off)
+{
+	struct llog_rec_tail tail;
+	loff_t prev_pos = file->f_pos;
+	int payload = rec->lrh_len;
+	int rc;
+
+	ENTRY;
+
+	file->f_pos = off;
+
+	if (payload == 0)
+		CWARN("0-length record\n");
+
+	if (buf == NULL) {
+		/* the record already is a complete header + body + tail */
+		rc = fsfilt_write_record(obd, file, rec, payload,&file->f_pos,0);
+		if (rc) {
+			CERROR("error writing log record: rc %d\n", rc);
+			goto out;
+		}
+		GOTO(out, rc = 0);
+	}
+
+	/* separate payload: wrap it in a header and a tail */
+	rec->lrh_len = sizeof(*rec) + payload + sizeof(tail);
+	rc = fsfilt_write_record(obd, file, rec, sizeof(*rec), &file->f_pos, 0);
+	if (rc) {
+		CERROR("error writing log hdr: rc %d\n", rc);
+		goto out;
+	}
+
+	rc = fsfilt_write_record(obd, file, buf, payload, &file->f_pos, 0);
+	if (rc) {
+		CERROR("error writing log buffer: rc %d\n", rc);
+		goto out;
+	}
+
+	tail.lrt_len = rec->lrh_len;
+	tail.lrt_index = rec->lrh_index;
+	rc = fsfilt_write_record(obd, file, &tail, sizeof(tail), &file->f_pos, 0);
+	if (rc) {
+		CERROR("error writing log tail: rc %d\n", rc);
+		goto out;
+	}
+
+	rc = 0;
+ out:
+	/* never leave the position before where it was on entry */
+	if (file->f_pos < prev_pos)
+		file->f_pos = prev_pos;
+	LASSERT(rc <= 0);
+	RETURN(rc);
+}
+
+/**
+ * Read \a size bytes from \a file at offset \a off into \a buf.
+ *
+ * The read goes through a local copy of the offset, so neither \a off nor
+ * the file position is updated by this function itself.
+ *
+ * \retval 0		on success
+ * \retval other	value returned by fsfilt_read_record() on failure
+ */
+static int llog_lvfs_read_blob(struct obd_device *obd, struct l_file *file,
+				void *buf, int size, loff_t off)
+{
+	loff_t pos = off;
+	int rc;
+	ENTRY;
+
+	rc = fsfilt_read_record(obd, file, buf, size, &pos);
+	if (rc == 0)
+		RETURN(0);
+
+	CERROR("error reading log record: rc %d\n", rc);
+	RETURN(rc);
+}
+
+/**
+ * Read the log header of \a handle from its file into handle->lgh_hdr,
+ * byte-swapping it if needed, and validate its magic and size.
+ *
+ * Except for the empty-file case, lgh_last_idx is then taken from the
+ * header tail and the file position is moved to the end of the file, even
+ * when reading or validation failed.
+ *
+ * \retval 0		header read and valid
+ * \retval LLOG_EEMPTY	(positive) the file is empty, nothing was read
+ * \retval -EIO		bad header magic or header length
+ * \retval other	error from llog_lvfs_read_blob()
+ */
+static int llog_lvfs_read_header(const struct lu_env *env,
+				 struct llog_handle *handle)
+{
+	struct llog_rec_hdr *llh_hdr;
+	struct obd_device *obd;
+	int rc;
+	ENTRY;
+
+	LASSERT(sizeof(*handle->lgh_hdr) == LLOG_CHUNK_SIZE);
+
+	obd = handle->lgh_ctxt->loc_exp->exp_obd;
+
+	if (i_size_read(handle->lgh_file->f_dentry->d_inode) == 0) {
+		CDEBUG(D_HA, "not reading header from 0-byte log\n");
+		RETURN(LLOG_EEMPTY);
+	}
+
+	rc = llog_lvfs_read_blob(obd, handle->lgh_file, handle->lgh_hdr,
+				 LLOG_CHUNK_SIZE, 0);
+	if (rc == 0) {
+		llh_hdr = &handle->lgh_hdr->llh_hdr;
+
+		if (LLOG_REC_HDR_NEEDS_SWABBING(llh_hdr))
+			lustre_swab_llog_hdr(handle->lgh_hdr);
+
+		/* the header must carry the right magic and the chunk size */
+		if (llh_hdr->lrh_type != LLOG_HDR_MAGIC) {
+			CERROR("bad log %.*s header magic: %#x (expected %#x)\n",
+			       handle->lgh_file->f_dentry->d_name.len,
+			       handle->lgh_file->f_dentry->d_name.name,
+			       llh_hdr->lrh_type, LLOG_HDR_MAGIC);
+			rc = -EIO;
+		} else if (llh_hdr->lrh_len != LLOG_CHUNK_SIZE) {
+			CERROR("incorrectly sized log %.*s header: %#x "
+			       "(expected %#x)\n",
+			       handle->lgh_file->f_dentry->d_name.len,
+			       handle->lgh_file->f_dentry->d_name.name,
+			       llh_hdr->lrh_len, LLOG_CHUNK_SIZE);
+			CERROR("you may need to re-run lconf --write_conf.\n");
+			rc = -EIO;
+		}
+	} else {
+		CERROR("error reading log header from %.*s\n",
+		       handle->lgh_file->f_dentry->d_name.len,
+		       handle->lgh_file->f_dentry->d_name.name);
+	}
+
+	/* done on the error paths above as well */
+	handle->lgh_last_idx = handle->lgh_hdr->llh_tail.lrt_index;
+	handle->lgh_file->f_pos = i_size_read(handle->lgh_file->f_dentry->d_inode);
+
+	RETURN(rc);
+}
+
+/*
+ * Write the record \a rec (optionally with a separate payload \a buf) to
+ * the log behind \a loghandle.
+ *
+ * Appends if idx == -1, otherwise overwrites record idx; with idx == 0
+ * only the log header is rewritten.
+ *
+ * Returns negative on error; 0 on success when reccookie is NULL; 1 on
+ * success when reccookie is set (the cookie is then filled in).  In append
+ * mode a successfully written LLOG_GEN_REC also returns 1.
+ * Returns -E2BIG if the record does not fit into one chunk, -EINVAL on a
+ * record size mismatch, -EFAULT on a modify index mismatch and -ENOSPC when
+ * the log bitmap is full.
+ */
+static int llog_lvfs_write_rec(const struct lu_env *env,
+			       struct llog_handle *loghandle,
+			       struct llog_rec_hdr *rec,
+			       struct llog_cookie *reccookie, int cookiecount,
+			       void *buf, int idx, struct thandle *th)
+{
+	struct llog_log_hdr *llh;
+	int reclen = rec->lrh_len, index, rc;
+	struct llog_rec_tail *lrt;
+	struct obd_device *obd;
+	struct file *file;
+	size_t left;
+	ENTRY;
+
+	llh = loghandle->lgh_hdr;
+	file = loghandle->lgh_file;
+	obd = loghandle->lgh_ctxt->loc_exp->exp_obd;
+
+	/* record length should not be bigger than LLOG_CHUNK_SIZE */
+	if (buf)
+		rc = (reclen > LLOG_CHUNK_SIZE - sizeof(struct llog_rec_hdr) -
+		      sizeof(struct llog_rec_tail)) ? -E2BIG : 0;
+	else
+		rc = (reclen > LLOG_CHUNK_SIZE) ? -E2BIG : 0;
+	if (rc)
+		RETURN(rc);
+
+	if (buf)
+		/* write_blob adds header and tail to lrh_len. */
+		reclen = sizeof(*rec) + rec->lrh_len +
+			 sizeof(struct llog_rec_tail);
+
+	/* overwrite mode: rewrite the header and, unless idx == 0, one record */
+	if (idx != -1) {
+		loff_t saved_offset;
+
+		/* no header: only allowed to insert record 1 */
+		if (idx != 1 && !i_size_read(file->f_dentry->d_inode)) {
+			CERROR("idx != -1 in empty log\n");
+			LBUG();
+		}
+
+		if (idx && llh->llh_size && llh->llh_size != rec->lrh_len)
+			RETURN(-EINVAL);
+
+		/* these two mismatches are only reported, not treated as fatal */
+		if (!ext2_test_bit(idx, llh->llh_bitmap))
+			CERROR("Modify unset record %u\n", idx);
+		if (idx != rec->lrh_index)
+			CERROR("Index mismatch %d %u\n", idx, rec->lrh_index);
+
+		rc = llog_lvfs_write_blob(obd, file, &llh->llh_hdr, NULL, 0);
+		/* we are done if we only write the header or on error */
+		if (rc || idx == 0)
+			RETURN(rc);
+
+		if (buf) {
+			/* We assume that caller has set lgh_cur_* */
+			saved_offset = loghandle->lgh_cur_offset;
+			CDEBUG(D_OTHER,
+			       "modify record "DOSTID": idx:%d/%u/%d, len:%u "
+			       "offset %llu\n",
+			       POSTID(&loghandle->lgh_id.lgl_oi), idx, rec->lrh_index,
+			       loghandle->lgh_cur_idx, rec->lrh_len,
+			       (long long)(saved_offset - sizeof(*llh)));
+			if (rec->lrh_index != loghandle->lgh_cur_idx) {
+				CERROR("modify idx mismatch %u/%d\n",
+				       idx, loghandle->lgh_cur_idx);
+				RETURN(-EFAULT);
+			}
+		} else {
+			/* Assumes constant lrh_len */
+			saved_offset = sizeof(*llh) + (idx - 1) * reclen;
+		}
+
+		rc = llog_lvfs_write_blob(obd, file, rec, buf, saved_offset);
+		if (rc == 0 && reccookie) {
+			reccookie->lgc_lgl = loghandle->lgh_id;
+			reccookie->lgc_index = idx;
+			rc = 1;
+		}
+		RETURN(rc);
+	}
+
+	/* Make sure that records don't cross a chunk boundary, so we can
+	 * process them page-at-a-time if needed.  If it will cross a chunk
+	 * boundary, write in a fake (but referenced) entry to pad the chunk.
+	 *
+	 * We know that llog_current_log() will return a loghandle that is
+	 * big enough to hold reclen, so all we care about is padding here.
+	 */
+	left = LLOG_CHUNK_SIZE - (file->f_pos & (LLOG_CHUNK_SIZE - 1));
+
+	/* NOTE: padding is a record, but no bit is set */
+	if (left != 0 && left != reclen &&
+	    left < (reclen + LLOG_MIN_REC_SIZE)) {
+		 index = loghandle->lgh_last_idx + 1;
+		 rc = llog_lvfs_pad(obd, file, left, index);
+		 if (rc)
+			 RETURN(rc);
+		 loghandle->lgh_last_idx++; /*for pad rec*/
+	 }
+	 /* if it's the last idx in log file, then return -ENOSPC */
+	 if (loghandle->lgh_last_idx >= LLOG_BITMAP_SIZE(llh) - 1)
+		 RETURN(-ENOSPC);
+	loghandle->lgh_last_idx++;
+	index = loghandle->lgh_last_idx;
+	LASSERT(index < LLOG_BITMAP_SIZE(llh));
+	rec->lrh_index = index;
+	/* without a separate payload the tail lives at the end of rec itself */
+	if (buf == NULL) {
+		lrt = (struct llog_rec_tail *)
+			((char *)rec + rec->lrh_len - sizeof(*lrt));
+		lrt->lrt_len = rec->lrh_len;
+		lrt->lrt_index = rec->lrh_index;
+	}
+	/*The caller should make sure only 1 process access the lgh_last_idx,
+	 *Otherwise it might hit the assert.*/
+	LASSERT(index < LLOG_BITMAP_SIZE(llh));
+	/* the bitmap bit and the record count are updated under lgh_hdr_lock */
+	spin_lock(&loghandle->lgh_hdr_lock);
+	if (ext2_set_bit(index, llh->llh_bitmap)) {
+		CERROR("argh, index %u already set in log bitmap?\n", index);
+		spin_unlock(&loghandle->lgh_hdr_lock);
+		LBUG(); /* should never happen */
+	}
+	llh->llh_count++;
+	spin_unlock(&loghandle->lgh_hdr_lock);
+	llh->llh_tail.lrt_index = index;
+
+	/* write the updated log header first, then append the record */
+	rc = llog_lvfs_write_blob(obd, file, &llh->llh_hdr, NULL, 0);
+	if (rc)
+		RETURN(rc);
+
+	rc = llog_lvfs_write_blob(obd, file, rec, buf, file->f_pos);
+	if (rc)
+		RETURN(rc);
+
+	CDEBUG(D_RPCTRACE, "added record "DOSTID": idx: %u, %u \n",
+	       POSTID(&loghandle->lgh_id.lgl_oi), index, rec->lrh_len);
+	/* fill in the cookie; its subsystem depends on the record type */
+	if (rc == 0 && reccookie) {
+		reccookie->lgc_lgl = loghandle->lgh_id;
+		reccookie->lgc_index = index;
+		if ((rec->lrh_type == MDS_UNLINK_REC) ||
+		    (rec->lrh_type == MDS_SETATTR64_REC))
+			reccookie->lgc_subsys = LLOG_MDS_OST_ORIG_CTXT;
+		else if (rec->lrh_type == OST_SZ_REC)
+			reccookie->lgc_subsys = LLOG_SIZE_ORIG_CTXT;
+		else
+			reccookie->lgc_subsys = -1;
+		rc = 1;
+	}
+	if (rc == 0 && rec->lrh_type == LLOG_GEN_REC)
+		rc = 1;
+
+	RETURN(rc);
+}
+
+/*
+ * Advance *off towards record index "goal" without reading the log.
+ *
+ * At least as many bytes can be skipped as the records between curr and
+ * goal would occupy if every one of them had the minimum record size.  The
+ * result is rounded down to a chunk boundary.  If the real records are
+ * larger than the minimum the offset is simply not far enough along, and
+ * the caller skips some more records.
+ *
+ * *off is left untouched when goal <= curr.
+ */
+
+static void llog_skip_over(__u64 *off, int curr, int goal)
+{
+	if (goal > curr)
+		*off = (*off + (goal-curr-1) * LLOG_MIN_REC_SIZE) &
+			~(LLOG_CHUNK_SIZE - 1);
+}
+
+
+/*
+ * Read forward through the log until a block that ends at or after record
+ * index next_idx has been read into buf.
+ *
+ * sets:
+ *  - cur_offset to the furthest point read in the log file
+ *  - cur_idx to the log index preceding cur_offset
+ *
+ * len must be a non-zero multiple of LLOG_CHUNK_SIZE.  Each read goes up to
+ * the next chunk boundary; the unread remainder of buf is zero-filled.
+ *
+ * returns 0 when the block was found or end of file was hit on a read,
+ * -EINVAL on a bad len, a too-short block or a zero tail index,
+ * -ENOENT if the block read starts beyond next_idx,
+ * -EIO if the end of the file is reached without finding the block,
+ * or the negative error from fsfilt_read_record().
+ */
+static int llog_lvfs_next_block(const struct lu_env *env,
+				struct llog_handle *loghandle, int *cur_idx,
+				int next_idx, __u64 *cur_offset, void *buf,
+				int len)
+{
+	int rc;
+	ENTRY;
+
+	if (len == 0 || len & (LLOG_CHUNK_SIZE - 1))
+		RETURN(-EINVAL);
+
+	CDEBUG(D_OTHER, "looking for log index %u (cur idx %u off "LPU64")\n",
+	       next_idx, *cur_idx, *cur_offset);
+
+	while (*cur_offset < i_size_read(loghandle->lgh_file->f_dentry->d_inode)) {
+		struct llog_rec_hdr *rec, *last_rec;
+		struct llog_rec_tail *tail;
+		loff_t ppos;
+		int llen;
+
+		/* jump over chunks that cannot contain next_idx yet */
+		llog_skip_over(cur_offset, *cur_idx, next_idx);
+
+		/* read up to next LLOG_CHUNK_SIZE block */
+		ppos = *cur_offset;
+		llen = LLOG_CHUNK_SIZE - (*cur_offset & (LLOG_CHUNK_SIZE - 1));
+		rc = fsfilt_read_record(loghandle->lgh_ctxt->loc_exp->exp_obd,
+					loghandle->lgh_file, buf, llen,
+					cur_offset);
+		if (rc < 0) {
+			CERROR("Cant read llog block at log id "DOSTID
+			       "/%u offset "LPU64"\n",
+			       POSTID(&loghandle->lgh_id.lgl_oi),
+			       loghandle->lgh_id.lgl_ogen,
+			       *cur_offset);
+			RETURN(rc);
+		}
+
+		/* put number of bytes read into rc to make code simpler */
+		rc = *cur_offset - ppos;
+		if (rc < len) {
+			/* signal the end of the valid buffer to llog_process */
+			memset(buf + rc, 0, len - rc);
+		}
+
+		if (rc == 0) /* end of file, nothing to do */
+			RETURN(0);
+
+		if (rc < sizeof(*tail)) {
+			CERROR("Invalid llog block at log id "DOSTID"/%u offset"
+			       LPU64"\n", POSTID(&loghandle->lgh_id.lgl_oi),
+			       loghandle->lgh_id.lgl_ogen, *cur_offset);
+			RETURN(-EINVAL);
+		}
+
+		rec = buf;
+		if (LLOG_REC_HDR_NEEDS_SWABBING(rec))
+			lustre_swab_llog_rec(rec);
+
+		/* the tail of the last record sits at the end of the data read */
+		tail = (struct llog_rec_tail *)(buf + rc -
+						sizeof(struct llog_rec_tail));
+
+		/* get the last record in block */
+		/* NOTE(review): lrt_len goes through le32_to_cpu() while
+		 * lrt_index below is used as-is -- confirm this is intended */
+		last_rec = (struct llog_rec_hdr *)(buf + rc -
+						   le32_to_cpu(tail->lrt_len));
+
+		if (LLOG_REC_HDR_NEEDS_SWABBING(last_rec))
+			lustre_swab_llog_rec(last_rec);
+		LASSERT(last_rec->lrh_index == tail->lrt_index);
+
+		*cur_idx = tail->lrt_index;
+
+		/* this shouldn't happen */
+		if (tail->lrt_index == 0) {
+			CERROR("Invalid llog tail at log id "DOSTID"/%u offset "
+			       LPU64"\n", POSTID(&loghandle->lgh_id.lgl_oi),
+			       loghandle->lgh_id.lgl_ogen, *cur_offset);
+			RETURN(-EINVAL);
+		}
+		/* the wanted record lies further on: read the next block */
+		if (tail->lrt_index < next_idx)
+			continue;
+
+		/* sanity check that the start of the new buffer is no farther
+		 * than the record that we wanted.  This shouldn't happen. */
+		if (rec->lrh_index > next_idx) {
+			CERROR("missed desired record? %u > %u\n",
+			       rec->lrh_index, next_idx);
+			RETURN(-ENOENT);
+		}
+		RETURN(0);
+	}
+	RETURN(-EIO);
+}
+
+/*
+ * Read into buf a block of the log that ends at or after record index
+ * prev_idx, scanning forward from just past the first chunk of the file.
+ *
+ * len must be a non-zero multiple of LLOG_CHUNK_SIZE; each read asks for
+ * len bytes.  Unlike llog_lvfs_next_block(), nothing is reported back about
+ * the offset or index reached, and buf is not zero-filled after a short
+ * read.
+ *
+ * returns 0 when the block was found or end of file was hit on a read,
+ * -EINVAL on a bad len, a too-short block or a zero tail index,
+ * -ENOENT if the block read starts beyond prev_idx,
+ * -EIO if the end of the file is reached without finding the block,
+ * or the negative error from fsfilt_read_record().
+ */
+static int llog_lvfs_prev_block(const struct lu_env *env,
+				struct llog_handle *loghandle,
+				int prev_idx, void *buf, int len)
+{
+	__u64 cur_offset;
+	int rc;
+	ENTRY;
+
+	if (len == 0 || len & (LLOG_CHUNK_SIZE - 1))
+		RETURN(-EINVAL);
+
+	CDEBUG(D_OTHER, "looking for log index %u\n", prev_idx);
+
+	/* start after the first chunk, then skip ahead as far as is safe */
+	cur_offset = LLOG_CHUNK_SIZE;
+	llog_skip_over(&cur_offset, 0, prev_idx);
+
+	while (cur_offset < i_size_read(loghandle->lgh_file->f_dentry->d_inode)) {
+		struct llog_rec_hdr *rec, *last_rec;
+		struct llog_rec_tail *tail;
+		loff_t ppos = cur_offset;
+
+		rc = fsfilt_read_record(loghandle->lgh_ctxt->loc_exp->exp_obd,
+					loghandle->lgh_file, buf, len,
+					&cur_offset);
+		if (rc < 0) {
+			CERROR("Cant read llog block at log id "DOSTID
+			       "/%u offset "LPU64"\n",
+			       POSTID(&loghandle->lgh_id.lgl_oi),
+			       loghandle->lgh_id.lgl_ogen,
+			       cur_offset);
+			RETURN(rc);
+		}
+
+		/* put number of bytes read into rc to make code simpler */
+		rc = cur_offset - ppos;
+
+		if (rc == 0) /* end of file, nothing to do */
+			RETURN(0);
+
+		if (rc < sizeof(*tail)) {
+			CERROR("Invalid llog block at log id "DOSTID"/%u offset"
+			       LPU64"\n", POSTID(&loghandle->lgh_id.lgl_oi),
+			       loghandle->lgh_id.lgl_ogen, cur_offset);
+			RETURN(-EINVAL);
+		}
+
+		rec = buf;
+		if (LLOG_REC_HDR_NEEDS_SWABBING(rec))
+			lustre_swab_llog_rec(rec);
+
+		/* the tail of the last record sits at the end of the data read */
+		tail = (struct llog_rec_tail *)(buf + rc -
+						sizeof(struct llog_rec_tail));
+
+		/* get the last record in block */
+		last_rec = (struct llog_rec_hdr *)(buf + rc -
+						   le32_to_cpu(tail->lrt_len));
+
+		if (LLOG_REC_HDR_NEEDS_SWABBING(last_rec))
+			lustre_swab_llog_rec(last_rec);
+		LASSERT(last_rec->lrh_index == tail->lrt_index);
+
+		/* this shouldn't happen */
+		if (tail->lrt_index == 0) {
+			CERROR("Invalid llog tail at log id "DOSTID"/%u offset"
+			       LPU64"\n", POSTID(&loghandle->lgh_id.lgl_oi),
+			       loghandle->lgh_id.lgl_ogen, cur_offset);
+			RETURN(-EINVAL);
+		}
+		/* the wanted record lies further on: read the next block */
+		if (tail->lrt_index < prev_idx)
+			continue;
+
+		/* sanity check that the start of the new buffer is no farther
+		 * than the record that we wanted.  This shouldn't happen. */
+		if (rec->lrh_index > prev_idx) {
+			CERROR("missed desired record? %u > %u\n",
+			       rec->lrh_index, prev_idx);
+			RETURN(-ENOENT);
+		}
+		RETURN(0);
+	}
+	RETURN(-EIO);
+}
+
+/**
+ * Open the file "<dir>/<name>" with the given \a flags and \a mode.
+ *
+ * The path is assembled in a temporary PATH_MAX buffer that is released
+ * before returning.  An open failure other than -ENOENT is logged.
+ *
+ * \retval file pointer		on success
+ * \retval ERR_PTR(-ENOMEM)	the path buffer could not be allocated
+ * \retval ERR_PTR(-ENAMETOOLONG) the path does not fit in the buffer
+ * \retval ERR_PTR(other)	error from l_filp_open()
+ */
+static struct file *llog_filp_open(char *dir, char *name, int flags, int mode)
+{
+	struct file *result;
+	char *logname;
+	int n;
+
+	OBD_ALLOC(logname, PATH_MAX);
+	if (logname == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	n = snprintf(logname, PATH_MAX, "%s/%s", dir, name);
+	if (n < PATH_MAX - 1) {
+		result = l_filp_open(logname, flags, mode);
+		/* a missing file is an expected outcome and is not logged */
+		if (IS_ERR(result) && PTR_ERR(result) != -ENOENT)
+			CERROR("logfile creation %s: %ld\n", logname,
+			       PTR_ERR(result));
+	} else {
+		result = ERR_PTR(-ENAMETOOLONG);
+	}
+
+	OBD_FREE(logname, PATH_MAX);
+	return result;
+}
+
+/**
+ * Open the backing file of an llog, either by log id or by name.
+ *
+ * With \a logid the object is looked up through obd_lvfs_fid2dentry() and
+ * opened read-write.  With \a name the file is opened under
+ * MOUNT_CONFIGS_DIR; if it does not exist and \a open_param is
+ * LLOG_OPEN_NEW, only the name is recorded in handle->lgh_name and no file
+ * is opened.  With neither, \a open_param must be LLOG_OPEN_NEW and
+ * handle->lgh_file is set to NULL.
+ *
+ * \retval 0		success (handle->lgh_file may be NULL for a new log)
+ * \retval -ENOENT	the log does not exist and no new log was requested
+ * \retval -ENOMEM	the log name could not be duplicated
+ * \retval negative	error from the lookup or open
+ */
+static int llog_lvfs_open(const struct lu_env *env,  struct llog_handle *handle,
+			  struct llog_logid *logid, char *name,
+			  enum llog_open_param open_param)
+{
+	struct llog_ctxt	*ctxt;
+	struct l_dentry		*dchild = NULL;
+	struct obd_device	*obd;
+	int			 rc = 0;
+
+	ENTRY;
+
+	/* check the handle before it is dereferenced, not after */
+	LASSERT(handle);
+	ctxt = handle->lgh_ctxt;
+	LASSERT(ctxt);
+	LASSERT(ctxt->loc_exp);
+	LASSERT(ctxt->loc_exp->exp_obd);
+	obd = ctxt->loc_exp->exp_obd;
+
+	if (logid != NULL) {
+		dchild = obd_lvfs_fid2dentry(ctxt->loc_exp, &logid->lgl_oi,
+					     logid->lgl_ogen);
+		if (IS_ERR(dchild)) {
+			rc = PTR_ERR(dchild);
+			CERROR("%s: error looking up logfile #"DOSTID "#%08x:"
+			       " rc = %d\n", ctxt->loc_obd->obd_name,
+			       POSTID(&logid->lgl_oi), logid->lgl_ogen, rc);
+			GOTO(out, rc);
+		}
+		/* a dentry without an inode means the log does not exist */
+		if (dchild->d_inode == NULL) {
+			l_dput(dchild);
+			rc = -ENOENT;
+			CERROR("%s: nonexistent llog #"DOSTID"#%08x:"
+			       "rc = %d\n", ctxt->loc_obd->obd_name,
+			       POSTID(&logid->lgl_oi), logid->lgl_ogen, rc);
+			GOTO(out, rc);
+		}
+		handle->lgh_file = l_dentry_open(&obd->obd_lvfs_ctxt, dchild,
+						 O_RDWR | O_LARGEFILE);
+		l_dput(dchild);
+		if (IS_ERR(handle->lgh_file)) {
+			rc = PTR_ERR(handle->lgh_file);
+			handle->lgh_file = NULL;
+			CERROR("%s: error opening llog #"DOSTID"#%08x:"
+			       "rc = %d\n", ctxt->loc_obd->obd_name,
+			       POSTID(&logid->lgl_oi), logid->lgl_ogen, rc);
+			GOTO(out, rc);
+		}
+		handle->lgh_id = *logid;
+	} else if (name) {
+		handle->lgh_file = llog_filp_open(MOUNT_CONFIGS_DIR, name,
+						  O_RDWR | O_LARGEFILE, 0644);
+		if (IS_ERR(handle->lgh_file)) {
+			rc = PTR_ERR(handle->lgh_file);
+			handle->lgh_file = NULL;
+			if (rc == -ENOENT && open_param == LLOG_OPEN_NEW) {
+				/* remember the name so the log can be
+				 * created under it later */
+				OBD_ALLOC(handle->lgh_name, strlen(name) + 1);
+				if (handle->lgh_name)
+					strcpy(handle->lgh_name, name);
+				else
+					GOTO(out, rc = -ENOMEM);
+				rc = 0;
+			} else {
+				GOTO(out, rc);
+			}
+		} else {
+			lustre_build_llog_lvfs_oid(&handle->lgh_id,
+			    handle->lgh_file->f_dentry->d_inode->i_ino,
+			    handle->lgh_file->f_dentry->d_inode->i_generation);
+		}
+	} else {
+		LASSERTF(open_param == LLOG_OPEN_NEW, "%#x\n", open_param);
+		handle->lgh_file = NULL;
+	}
+
+	/* No new llog is expected but doesn't exist */
+	if (open_param != LLOG_OPEN_NEW && handle->lgh_file == NULL)
+		GOTO(out_name, rc = -ENOENT);
+
+	RETURN(0);
+out_name:
+	/*
+	 * Size the free from the stored string itself (name may be NULL on
+	 * this path) and clear the pointer so no dangling lgh_name is left
+	 * behind, matching llog_lvfs_close().
+	 */
+	if (handle->lgh_name != NULL) {
+		OBD_FREE(handle->lgh_name, strlen(handle->lgh_name) + 1);
+		handle->lgh_name = NULL;
+	}
+out:
+	RETURN(rc);
+}
+
+/**
+ * Tell whether \a handle has a backing file attached.
+ *
+ * \retval 1	handle->lgh_file is set
+ * \retval 0	handle->lgh_file is NULL
+ */
+static int llog_lvfs_exist(struct llog_handle *handle)
+{
+	return handle->lgh_file != NULL;
+}
+
+/* This is a callback from the llog_* functions.
+ * Assumes caller has already pushed us into the kernel context.
+ *
+ * Create the backing file for a handle that has none yet and store it in
+ * handle->lgh_file together with the resulting log id in handle->lgh_id.
+ * If handle->lgh_name is set the file is created by name under
+ * MOUNT_CONFIGS_DIR; otherwise a new object is created through obd_create()
+ * and opened through its dentry.
+ *
+ * Returns 0 on success, -ENOMEM if the obdo cannot be allocated, or the
+ * error from the file/object creation, lookup or open. */
+static int llog_lvfs_create(const struct lu_env *env,
+			    struct llog_handle *handle,
+			    struct thandle *th)
+{
+	struct llog_ctxt	*ctxt = handle->lgh_ctxt;
+	struct obd_device	*obd;
+	struct l_dentry		*dchild = NULL;
+	struct file		*file;
+	struct obdo		*oa = NULL;
+	int			 rc = 0;
+	int			 open_flags = O_RDWR | O_CREAT | O_LARGEFILE;
+
+	ENTRY;
+
+	LASSERT(ctxt);
+	LASSERT(ctxt->loc_exp);
+	obd = ctxt->loc_exp->exp_obd;
+	LASSERT(handle->lgh_file == NULL);
+
+	if (handle->lgh_name) {
+		/* named log: create the file, derive the id from its inode */
+		file = llog_filp_open(MOUNT_CONFIGS_DIR, handle->lgh_name,
+				      open_flags, 0644);
+		if (IS_ERR(file))
+			RETURN(PTR_ERR(file));
+
+		lustre_build_llog_lvfs_oid(&handle->lgh_id,
+				file->f_dentry->d_inode->i_ino,
+				file->f_dentry->d_inode->i_generation);
+		handle->lgh_file = file;
+	} else {
+		/* unnamed log: have the obd create an object for it */
+		OBDO_ALLOC(oa);
+		if (oa == NULL)
+			RETURN(-ENOMEM);
+
+		ostid_set_seq_llog(&oa->o_oi);
+		oa->o_valid = OBD_MD_FLGENER | OBD_MD_FLGROUP;
+
+		rc = obd_create(NULL, ctxt->loc_exp, oa, NULL, NULL);
+		if (rc)
+			GOTO(out, rc);
+
+		/* FIXME: rationalize the misuse of o_generation in
+		 *	this API along with mds_obd_{create,destroy}.
+		 *	Hopefully it is only an internal API issue. */
+		/* the macro stays defined for the rest of the file */
+#define o_generation o_parent_oid
+		dchild = obd_lvfs_fid2dentry(ctxt->loc_exp, &oa->o_oi,
+					     oa->o_generation);
+		if (IS_ERR(dchild))
+			GOTO(out, rc = PTR_ERR(dchild));
+
+		file = l_dentry_open(&obd->obd_lvfs_ctxt, dchild, open_flags);
+		l_dput(dchild);
+		if (IS_ERR(file))
+			GOTO(out, rc = PTR_ERR(file));
+		handle->lgh_id.lgl_oi = oa->o_oi;
+		handle->lgh_id.lgl_ogen = oa->o_generation;
+		handle->lgh_file = file;
+		/* reached on success and on failure: oa is always released */
+out:
+		OBDO_FREE(oa);
+	}
+	RETURN(rc);
+}
+
+/*
+ * lop_close callback: close the file behind @handle and drop its name.
+ *
+ * A handle without a file is left untouched and 0 is returned.  Otherwise
+ * the file is closed with filp_close(), handle->lgh_file is cleared and the
+ * lgh_name buffer (if any) is freed.  The result of filp_close() is
+ * returned; a failure is logged but does not stop the cleanup.
+ * @env is not used.
+ */
+static int llog_lvfs_close(const struct lu_env *env,
+			   struct llog_handle *handle)
+{
+	struct file	*filp;
+	int		 rc;
+
+	ENTRY;
+
+	filp = handle->lgh_file;
+	if (!filp)
+		RETURN(0);
+
+	rc = filp_close(filp, 0);
+	if (rc != 0)
+		CERROR("%s: error closing llog #"DOSTID"#%08x: "
+		       "rc = %d\n", handle->lgh_ctxt->loc_obd->obd_name,
+		       POSTID(&handle->lgh_id.lgl_oi),
+		       handle->lgh_id.lgl_ogen, rc);
+
+	/* the handle is detached from the file even if the close failed */
+	handle->lgh_file = NULL;
+
+	if (handle->lgh_name != NULL) {
+		OBD_FREE(handle->lgh_name, strlen(handle->lgh_name) + 1);
+		handle->lgh_name = NULL;
+	}
+
+	RETURN(rc);
+}
+
+/*
+ * lop_destroy callback: close the llog behind @handle and remove its
+ * backing object.
+ *
+ * The handle must have a file attached (asserted).  Two cases:
+ *  - the file's parent directory is named MOUNT_CONFIGS_DIR: the log is
+ *    closed and, if the close succeeded, unlinked with ll_vfs_unlink()
+ *    under the parent's i_mutex, all inside the obd's lvfs run context;
+ *  - otherwise: the log is closed and the object is destroyed with
+ *    obd_destroy() inside an fsfilt FSFILT_OP_UNLINK transaction.
+ *
+ * Returns 0 on success or the error of the first failing step; a commit
+ * error is reported only if obd_destroy() itself succeeded.
+ */
+static int llog_lvfs_destroy(const struct lu_env *env,
+			     struct llog_handle *handle)
+{
+	struct dentry *fdentry;
+	struct obdo *oa;
+	struct obd_device *obd = handle->lgh_ctxt->loc_exp->exp_obd;
+	char *dir;
+	void *th;
+	struct inode *inode;
+	int rc, rc1;
+	ENTRY;
+
+	dir = MOUNT_CONFIGS_DIR;
+
+	LASSERT(handle->lgh_file);
+	fdentry = handle->lgh_file->f_dentry;
+	/* note: this is the inode of the PARENT directory */
+	inode = fdentry->d_parent->d_inode;
+	if (strcmp(fdentry->d_parent->d_name.name, dir) == 0) {
+		struct lvfs_run_ctxt saved;
+		struct vfsmount *mnt = mntget(handle->lgh_file->f_vfsmnt);
+
+		push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+		/* keep our own dentry reference across the close below */
+		dget(fdentry);
+		rc = llog_lvfs_close(env, handle);
+		if (rc == 0) {
+			mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
+			rc = ll_vfs_unlink(inode, fdentry, mnt);
+			mutex_unlock(&inode->i_mutex);
+		}
+		mntput(mnt);
+
+		dput(fdentry);
+		pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+		RETURN(rc);
+	}
+
+	OBDO_ALLOC(oa);
+	if (oa == NULL)
+		RETURN(-ENOMEM);
+
+	/* identify the object to destroy before the handle is closed */
+	oa->o_oi = handle->lgh_id.lgl_oi;
+	/* "o_generation" is still the alias for o_parent_oid that was
+	 * #defined in llog_lvfs_create(); the alias ends right below */
+	oa->o_generation = handle->lgh_id.lgl_ogen;
+#undef o_generation
+	oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP | OBD_MD_FLGENER;
+
+	rc = llog_lvfs_close(env, handle);
+	if (rc)
+		GOTO(out, rc);
+
+	/* NOTE(review): unlike the branch above, no extra dentry reference is
+	 * held here, yet the parent inode is used after the file was closed -
+	 * confirm that the parent is guaranteed to stay alive. */
+	th = fsfilt_start_log(obd, inode, FSFILT_OP_UNLINK, NULL, 1);
+	if (IS_ERR(th)) {
+		CERROR("fsfilt_start failed: %ld\n", PTR_ERR(th));
+		GOTO(out, rc = PTR_ERR(th));
+	}
+
+	rc = obd_destroy(NULL, handle->lgh_ctxt->loc_exp, oa,
+			 NULL, NULL, NULL, NULL);
+
+	/* always commit; the destroy error takes precedence over rc1 */
+	rc1 = fsfilt_commit(obd, inode, th, 0);
+	if (rc == 0 && rc1 != 0)
+		rc = rc1;
+ out:
+	OBDO_FREE(oa);
+	RETURN(rc);
+}
+
+/*
+ * lop_declare_create callback for the lvfs backend.
+ * Nothing is declared here: all arguments are ignored and 0 is returned.
+ */
+static int llog_lvfs_declare_create(const struct lu_env *env,
+				    struct llog_handle *res,
+				    struct thandle *th)
+{
+	return 0;
+}
+
+/*
+ * lop_declare_write_rec callback for the lvfs backend.
+ * Nothing is declared here: all arguments are ignored and 0 is returned.
+ */
+static int llog_lvfs_declare_write_rec(const struct lu_env *env,
+				       struct llog_handle *loghandle,
+				       struct llog_rec_hdr *rec,
+				       int idx, struct thandle *th)
+{
+	return 0;
+}
+
+/*
+ * Operation table of the lvfs llog backend: wires the llog_lvfs_*
+ * functions of this file into the generic llog method slots.  Slots not
+ * listed here are left zero-initialized.
+ */
+struct llog_operations llog_lvfs_ops = {
+	.lop_write_rec		= llog_lvfs_write_rec,
+	.lop_next_block		= llog_lvfs_next_block,
+	.lop_prev_block		= llog_lvfs_prev_block,
+	.lop_read_header	= llog_lvfs_read_header,
+	.lop_create		= llog_lvfs_create,
+	.lop_destroy		= llog_lvfs_destroy,
+	.lop_close		= llog_lvfs_close,
+	.lop_open		= llog_lvfs_open,
+	.lop_exist		= llog_lvfs_exist,
+	.lop_declare_create	= llog_lvfs_declare_create,
+	.lop_declare_write_rec	= llog_lvfs_declare_write_rec,
+};
+EXPORT_SYMBOL(llog_lvfs_ops);
+#else /* !__KERNEL__ */
+/* outside the kernel the table exists but provides no operations */
+struct llog_operations llog_lvfs_ops = {};
+#endif

diff --git a/drivers/staging/lustre/lustre/obdclass/llog_obd.c b/drivers/staging/lustre/lustre/obdclass/llog_obd.c
new file mode 100644
index 0000000..7e22907
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/llog_obd.c

@@ -0,0 +1,319 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LOG
+
+
+#include <obd_class.h>
+#include <lustre_log.h>
+#include "llog_internal.h"
+
+/* helper functions for calling the llog obd methods */
+/*
+ * Allocate a new llog context bound to @obd.
+ * The context starts with a reference count of 1.  Returns the new context
+ * or NULL if the allocation failed.
+ */
+static struct llog_ctxt *llog_new_ctxt(struct obd_device *obd)
+{
+	struct llog_ctxt *ctxt;
+
+	OBD_ALLOC_PTR(ctxt);
+	if (ctxt != NULL) {
+		ctxt->loc_obd = obd;
+		atomic_set(&ctxt->loc_refcount, 1);
+	}
+
+	return ctxt;
+}
+
+/*
+ * Release everything @ctxt holds and free it: the export and the import
+ * attached to the context (when present) are put, then the context memory
+ * itself is freed.
+ */
+static void llog_ctxt_destroy(struct llog_ctxt *ctxt)
+{
+	struct obd_export *exp = ctxt->loc_exp;
+	struct obd_import *imp;
+
+	if (exp != NULL) {
+		class_export_put(exp);
+		ctxt->loc_exp = NULL;
+	}
+
+	imp = ctxt->loc_imp;
+	if (imp != NULL) {
+		class_import_put(imp);
+		ctxt->loc_imp = NULL;
+	}
+
+	OBD_FREE_PTR(ctxt);
+}
+
+/*
+ * Drop one reference on @ctxt.
+ *
+ * If other references remain, nothing else happens and 0 is returned.
+ * When the last reference goes away the context is removed from its llog
+ * group, the backend cleanup method (if any) is called, the context is
+ * destroyed and waiters on the group's wait queue are woken up.  In that
+ * case the result of the cleanup method (0 if there is none) is returned.
+ */
+int __llog_ctxt_put(const struct lu_env *env, struct llog_ctxt *ctxt)
+{
+	struct obd_llog_group *olg = ctxt->loc_olg;
+	struct obd_device *obd;
+	int last_ref;
+	int rc = 0;
+
+	/* the slot is cleared under the same lock that covers the final
+	 * decrement of the reference count */
+	spin_lock(&olg->olg_lock);
+	last_ref = atomic_dec_and_test(&ctxt->loc_refcount);
+	if (last_ref)
+		olg->olg_ctxts[ctxt->loc_idx] = NULL;
+	spin_unlock(&olg->olg_lock);
+
+	if (!last_ref)
+		return 0;
+
+	obd = ctxt->loc_obd;
+	/* sync with llog ctxt user thread */
+	spin_lock(&obd->obd_dev_lock);
+	spin_unlock(&obd->obd_dev_lock);
+
+	/* obd->obd_starting is needed for the case of cleanup
+	 * in error case while obd is starting up. */
+	LASSERTF(obd->obd_starting == 1 ||
+		 obd->obd_stopping == 1 || obd->obd_set_up == 0,
+		 "wrong obd state: %d/%d/%d\n", !!obd->obd_starting,
+		 !!obd->obd_stopping, !!obd->obd_set_up);
+
+	/* let the backend clean up before the ctxt memory goes away */
+	if (CTXTP(ctxt, cleanup))
+		rc = CTXTP(ctxt, cleanup)(env, ctxt);
+
+	llog_ctxt_destroy(ctxt);
+	wake_up(&olg->olg_waitq);
+
+	return rc;
+}
+EXPORT_SYMBOL(__llog_ctxt_put);
+
+/*
+ * Tear down @ctxt and wait until its slot in the llog group is empty.
+ *
+ * The caller's reference is dropped first, then one more reference is
+ * dropped through __llog_ctxt_put() in an attempt to free the context.
+ * The function then waits on the group's wait queue until
+ * llog_group_ctxt_null() reports the slot as empty.  Returns the result of
+ * __llog_ctxt_put().
+ */
+int llog_cleanup(const struct lu_env *env, struct llog_ctxt *ctxt)
+{
+	struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL);
+	struct obd_llog_group *olg;
+	int idx;
+	int rc;
+
+	ENTRY;
+
+	LASSERT(ctxt != NULL);
+	LASSERT(ctxt != LP_POISON);
+
+	/* remember group and slot now: ctxt may be freed by
+	 * __llog_ctxt_put() below */
+	idx = ctxt->loc_idx;
+	olg = ctxt->loc_olg;
+	LASSERT(olg != NULL);
+	LASSERT(olg != LP_POISON);
+
+	/* Balance the ctxt get when calling llog_cleanup() */
+	LASSERT(atomic_read(&ctxt->loc_refcount) < LI_POISON);
+	LASSERT(atomic_read(&ctxt->loc_refcount) > 1);
+	llog_ctxt_put(ctxt);
+
+	/* Try to free the ctxt. */
+	rc = __llog_ctxt_put(env, ctxt);
+	if (rc != 0)
+		CERROR("Error %d while cleaning up ctxt %p\n",
+		       rc, ctxt);
+
+	l_wait_event(olg->olg_waitq,
+		     llog_group_ctxt_null(olg, idx), &lwi);
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_cleanup);
+
+/*
+ * Create an llog context for @obd and install it in slot @index of @olg.
+ *
+ * The new context uses @op as its operation table and holds the self
+ * export of @disk_obd, obtained with class_export_get() (it is put again by
+ * llog_ctxt_destroy()).  The context is flagged as uninitialized until the
+ * backend's lop_setup method, if there is one, has succeeded.
+ *
+ * Returns:
+ *   -EINVAL  if @index is outside [0, LLOG_MAX_CTXTS)
+ *   -ENOMEM  if the context cannot be allocated
+ *   0        on success, and also when llog_group_set_ctxt() reports
+ *            -EEXIST because the slot is already populated
+ *   other    error from llog_group_set_ctxt() or from lop_setup
+ */
+int llog_setup(const struct lu_env *env, struct obd_device *obd,
+	       struct obd_llog_group *olg, int index,
+	       struct obd_device *disk_obd, struct llog_operations *op)
+{
+	struct llog_ctxt *ctxt;
+	int rc = 0;
+	ENTRY;
+
+	if (index < 0 || index >= LLOG_MAX_CTXTS)
+		RETURN(-EINVAL);
+
+	LASSERT(olg != NULL);
+
+	ctxt = llog_new_ctxt(obd);
+	if (!ctxt)
+		RETURN(-ENOMEM);
+
+	/* loc_obd was already set by llog_new_ctxt(); this store is
+	 * redundant but harmless */
+	ctxt->loc_obd = obd;
+	ctxt->loc_olg = olg;
+	ctxt->loc_idx = index;
+	ctxt->loc_logops = op;
+	mutex_init(&ctxt->loc_mutex);
+	ctxt->loc_exp = class_export_get(disk_obd->obd_self_export);
+	ctxt->loc_flags = LLOG_CTXT_FLAG_UNINITIALIZED;
+
+	rc = llog_group_set_ctxt(olg, ctxt, index);
+	if (rc) {
+		/* our freshly built ctxt was not installed: drop it */
+		llog_ctxt_destroy(ctxt);
+		if (rc == -EEXIST) {
+			/* from here on ctxt refers to the context that is
+			 * already installed in the slot, not to ours */
+			ctxt = llog_group_get_ctxt(olg, index);
+			if (ctxt) {
+				/*
+				 * mds_lov_update_desc() might call here multiple
+				 * times. So if the llog is already set up then
+				 * don't do it again.
+				 */
+				CDEBUG(D_CONFIG, "obd %s ctxt %d already set up\n",
+				       obd->obd_name, index);
+				LASSERT(ctxt->loc_olg == olg);
+				LASSERT(ctxt->loc_obd == obd);
+				LASSERT(ctxt->loc_exp == disk_obd->obd_self_export);
+				LASSERT(ctxt->loc_logops == op);
+				llog_ctxt_put(ctxt);
+			}
+			rc = 0;
+		}
+		RETURN(rc);
+	}
+
+	if (op->lop_setup) {
+		/* fault-injection point: pretend the backend setup failed */
+		if (OBD_FAIL_CHECK(OBD_FAIL_OBD_LLOG_SETUP))
+			rc = -EOPNOTSUPP;
+		else
+			rc = op->lop_setup(env, obd, olg, index, disk_obd);
+	}
+
+	if (rc) {
+		CERROR("%s: ctxt %d lop_setup=%p failed: rc = %d\n",
+		       obd->obd_name, index, op->lop_setup, rc);
+		/* undo the installation done above before freeing */
+		llog_group_clear_ctxt(olg, index);
+		llog_ctxt_destroy(ctxt);
+	} else {
+		CDEBUG(D_CONFIG, "obd %s ctxt %d is initialized\n",
+		       obd->obd_name, index);
+		ctxt->loc_flags &= ~LLOG_CTXT_FLAG_UNINITIALIZED;
+	}
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_setup);
+
+/*
+ * Invoke the backend's sync method on @ctxt.
+ * A NULL @ctxt or a backend without a sync method is not an error: 0 is
+ * returned in both cases.  Otherwise the result of the method is returned.
+ */
+int llog_sync(struct llog_ctxt *ctxt, struct obd_export *exp, int flags)
+{
+	int rc;
+
+	ENTRY;
+
+	if (ctxt == NULL)
+		RETURN(0);
+
+	if (CTXTP(ctxt, sync) == NULL)
+		rc = 0;
+	else
+		rc = CTXTP(ctxt, sync)(ctxt, exp, flags);
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_sync);
+
+/*
+ * Add a record through the backend's obd_add method of @ctxt.
+ *
+ * CFS_CAP_SYS_RESOURCE is raised for the duration of the call unless it
+ * was already raised, and lowered again afterwards in that case.
+ *
+ * Returns -ENODEV if @ctxt is NULL, -ENXIO if the context is still flagged
+ * as uninitialized, otherwise whatever the backend method returns
+ * (CTXT_CHECK_OP is given -EOPNOTSUPP for a missing method).
+ */
+int llog_obd_add(const struct lu_env *env, struct llog_ctxt *ctxt,
+		 struct llog_rec_hdr *rec, struct lov_stripe_md *lsm,
+		 struct llog_cookie *logcookies, int numcookies)
+{
+	int had_cap;
+	int rc;
+
+	ENTRY;
+
+	if (ctxt == NULL) {
+		CERROR("No ctxt\n");
+		RETURN(-ENODEV);
+	}
+
+	if ((ctxt->loc_flags & LLOG_CTXT_FLAG_UNINITIALIZED) != 0)
+		RETURN(-ENXIO);
+
+	CTXT_CHECK_OP(ctxt, obd_add, -EOPNOTSUPP);
+
+	/* only touch the capability if we are the ones raising it */
+	had_cap = cfs_cap_raised(CFS_CAP_SYS_RESOURCE);
+	if (!had_cap)
+		cfs_cap_raise(CFS_CAP_SYS_RESOURCE);
+
+	rc = CTXTP(ctxt, obd_add)(env, ctxt, rec, lsm, logcookies,
+				  numcookies);
+
+	if (!had_cap)
+		cfs_cap_lower(CFS_CAP_SYS_RESOURCE);
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_obd_add);
+
+/*
+ * Cancel @count records described by @cookies through the backend's
+ * cancel method of @ctxt.
+ * Returns -ENODEV if @ctxt is NULL, otherwise whatever the backend method
+ * returns (CTXT_CHECK_OP is given -EOPNOTSUPP for a missing method).
+ */
+int llog_cancel(const struct lu_env *env, struct llog_ctxt *ctxt,
+		struct lov_stripe_md *lsm, int count,
+		struct llog_cookie *cookies, int flags)
+{
+	ENTRY;
+
+	if (ctxt == NULL) {
+		CERROR("No ctxt\n");
+		RETURN(-ENODEV);
+	}
+
+	CTXT_CHECK_OP(ctxt, cancel, -EOPNOTSUPP);
+
+	RETURN(CTXTP(ctxt, cancel)(env, ctxt, lsm, count, cookies, flags));
+}
+EXPORT_SYMBOL(llog_cancel);
+
+/*
+ * Dispatch to the llog_init method of @obd and return its result.
+ * OBD_CHECK_DT_OP is given 0 as the value to use when the method is not
+ * available; the per-method call counter is bumped before the call.
+ */
+int obd_llog_init(struct obd_device *obd, struct obd_llog_group *olg,
+		  struct obd_device *disk_obd, int *index)
+{
+	ENTRY;
+
+	OBD_CHECK_DT_OP(obd, llog_init, 0);
+	OBD_COUNTER_INCREMENT(obd, llog_init);
+
+	RETURN(OBP(obd, llog_init)(obd, olg, disk_obd, index));
+}
+EXPORT_SYMBOL(obd_llog_init);
+
+/*
+ * Dispatch to the llog_finish method of @obd and return its result.
+ * OBD_CHECK_DT_OP is given 0 as the value to use when the method is not
+ * available; the per-method call counter is bumped before the call.
+ */
+int obd_llog_finish(struct obd_device *obd, int count)
+{
+	ENTRY;
+
+	OBD_CHECK_DT_OP(obd, llog_finish, 0);
+	OBD_COUNTER_INCREMENT(obd, llog_finish);
+
+	RETURN(OBP(obd, llog_finish)(obd, count));
+}
+EXPORT_SYMBOL(obd_llog_finish);
+
+/*
+ * Per-thread llog scratch data (struct llog_thread_info) is attached to
+ * the lu context through the key defined below.
+ */
+/* context key constructor/destructor: llog_key_init, llog_key_fini */
+LU_KEY_INIT_FINI(llog, struct llog_thread_info);
+/* context key: llog_thread_key */
+LU_CONTEXT_KEY_DEFINE(llog, LCT_MD_THREAD | LCT_MG_THREAD | LCT_LOCAL);
+/* generic key initializer: llog_key_init_generic, used by llog_info_init() */
+LU_KEY_INIT_GENERIC(llog);
+EXPORT_SYMBOL(llog_thread_key);
+
+/*
+ * Initialize and register the llog thread context key.
+ *
+ * Returns 0 on success or the error reported by lu_context_key_register().
+ * The registration result used to be discarded, which made a failed
+ * registration look like a successful initialization to the caller.
+ */
+int llog_info_init(void)
+{
+	llog_key_init_generic(&llog_thread_key, NULL);
+
+	return lu_context_key_register(&llog_thread_key);
+}
+
+/*
+ * Counterpart of llog_info_init(): remove the llog thread context key
+ * registration.
+ */
+void llog_info_fini(void)
+{
+	lu_context_key_degister(&llog_thread_key);
+}

diff --git a/drivers/staging/lustre/lustre/obdclass/llog_osd.c b/drivers/staging/lustre/lustre/obdclass/llog_osd.c
new file mode 100644
index 0000000..6dbd21a
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/llog_osd.c

@@ -0,0 +1,1323 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/llog_osd.c - low level llog routines on top of OSD API
+ *
+ * Author: Alexey Zhuravlev <alexey.zhuravlev@intel.com>
+ * Author: Mikhail Pershin <mike.pershin@intel.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOG
+
+#ifndef EXPORT_SYMTAB
+#define EXPORT_SYMTAB
+#endif
+
+#include <obd.h>
+#include <obd_class.h>
+#include <lustre_fid.h>
+#include <dt_object.h>
+
+#include "llog_internal.h"
+#include "local_storage.h"
+
+/*
+ * - multi-chunks or big-declaration approach
+ * - use unique sequence instead of llog sb tracking unique ids
+ * - re-use existing environment
+ * - named llog support (can be used for testing only at the present)
+ * - llog_origin_connect() work with OSD API
+ */
+
+/*
+ * Declare the creation of a new llog object @o in storage @los as part of
+ * transaction @th.  The object is described as a regular file with mode
+ * S_IRUGO | S_IWUSR, using the per-thread attr/format scratch buffers.
+ * Returns the result of local_object_declare_create().
+ */
+static int llog_osd_declare_new_object(const struct lu_env *env,
+				       struct local_oid_storage *los,
+				       struct dt_object *o,
+				       struct thandle *th)
+{
+	struct llog_thread_info *info = llog_info(env);
+
+	/* object format: regular file */
+	info->lgi_dof.dof_type = dt_mode_to_dft(S_IFREG);
+
+	/* only the mode is passed as a valid attribute */
+	info->lgi_attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR;
+	info->lgi_attr.la_valid = LA_MODE;
+
+	return local_object_declare_create(env, los, o, &info->lgi_attr,
+					   &info->lgi_dof, th);
+}
+
+/*
+ * Create a new llog object @o in storage @los as part of transaction @th.
+ * The object is described as a regular file with mode S_IRUGO | S_IWUSR,
+ * using the per-thread attr/format scratch buffers.
+ * Returns the result of local_object_create().
+ */
+static int llog_osd_create_new_object(const struct lu_env *env,
+				      struct local_oid_storage *los,
+				      struct dt_object *o,
+				      struct thandle *th)
+{
+	struct llog_thread_info *info = llog_info(env);
+
+	/* object format: regular file */
+	info->lgi_dof.dof_type = dt_mode_to_dft(S_IFREG);
+
+	/* only the mode is passed as a valid attribute */
+	info->lgi_attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR;
+	info->lgi_attr.la_valid = LA_MODE;
+
+	return local_object_create(env, los, o, &info->lgi_attr,
+				   &info->lgi_dof, th);
+}
+
+/*
+ * Write a padding record of @len bytes with index @index into @o.
+ *
+ * Only the record header (at *@off) and the record tail (at the end of the
+ * @len-byte area) are written; the bytes in between are skipped.  Both
+ * writes happen under the object's write lock.  *@off is advanced by the
+ * writes and by the skip.  Returns 0 or the error of the failing write.
+ */
+static int llog_osd_pad(const struct lu_env *env, struct dt_object *o,
+			loff_t *off, int len, int index, struct thandle *th)
+{
+	struct llog_thread_info	*lgi = llog_info(env);
+	struct llog_rec_hdr	*hdr = &lgi->lgi_lrh;
+	struct llog_rec_tail	*tail = &lgi->lgi_tail;
+	int			 rc;
+
+	ENTRY;
+
+	LASSERT(th);
+	LASSERT(off);
+	LASSERT(len >= LLOG_MIN_REC_SIZE && (len & 0x7) == 0);
+
+	/* header and tail carry the same length and index */
+	hdr->lrh_len = len;
+	hdr->lrh_index = index;
+	hdr->lrh_type = LLOG_PAD_MAGIC;
+	tail->lrt_len = hdr->lrh_len;
+	tail->lrt_index = hdr->lrh_index;
+
+	/* first the header ... */
+	lgi->lgi_buf.lb_buf = hdr;
+	lgi->lgi_buf.lb_len = sizeof(*hdr);
+	dt_write_lock(env, o, 0);
+	rc = dt_record_write(env, o, &lgi->lgi_buf, off, th);
+	if (rc) {
+		CERROR("%s: error writing padding record: rc = %d\n",
+		       o->do_lu.lo_dev->ld_obd->obd_name, rc);
+		GOTO(out, rc);
+	}
+
+	/* ... then jump over the unused middle and write the tail */
+	lgi->lgi_buf.lb_buf = tail;
+	lgi->lgi_buf.lb_len = sizeof(*tail);
+	*off += len - sizeof(*hdr) - sizeof(*tail);
+	rc = dt_record_write(env, o, &lgi->lgi_buf, off, th);
+	if (rc)
+		CERROR("%s: error writing padding record: rc = %d\n",
+		       o->do_lu.lo_dev->ld_obd->obd_name, rc);
+out:
+	dt_write_unlock(env, o);
+	RETURN(rc);
+}
+
+/*
+ * Write one llog record into @o at *@off within transaction @th.
+ *
+ * Two modes, selected by @buf:
+ *  - @buf == NULL: @rec is a complete record of rec->lrh_len bytes and is
+ *    written with a single dt_record_write(); the object write lock is not
+ *    taken in this function for that case.
+ *  - @buf != NULL: @rec is only the header and @buf holds a payload whose
+ *    length is rec->lrh_len on entry.  rec->lrh_len is rewritten to the
+ *    full on-disk length (header + payload + tail) and header, payload and
+ *    a generated tail are written as three consecutive writes under the
+ *    object's write lock.
+ *
+ * If any write fails, the object is punched from the offset *@off had on
+ * entry to the end of the file and its size attribute is set back to that
+ * offset.  Returns 0 on success or the error of the failing write.
+ */
+static int llog_osd_write_blob(const struct lu_env *env, struct dt_object *o,
+			       struct llog_rec_hdr *rec, void *buf,
+			       loff_t *off, struct thandle *th)
+{
+	struct llog_thread_info	*lgi = llog_info(env);
+	/* captured before lrh_len is rewritten in the buf case below */
+	int			 buflen = rec->lrh_len;
+	int			 rc;
+
+	ENTRY;
+
+	LASSERT(env);
+	LASSERT(o);
+
+	if (buflen == 0)
+		CWARN("0-length record\n");
+
+	CDEBUG(D_OTHER, "write blob with type %x, buf %p/%u at off %llu\n",
+	       rec->lrh_type, buf, buflen, *off);
+
+	/* remember the starting offset for the rollback at "out" */
+	lgi->lgi_attr.la_valid = LA_SIZE;
+	lgi->lgi_attr.la_size = *off;
+
+	if (!buf) {
+		lgi->lgi_buf.lb_len = buflen;
+		lgi->lgi_buf.lb_buf = rec;
+		rc = dt_record_write(env, o, &lgi->lgi_buf, off, th);
+		if (rc)
+			CERROR("%s: error writing log record: rc = %d\n",
+			       o->do_lu.lo_dev->ld_obd->obd_name, rc);
+		GOTO(out, rc);
+	}
+
+	/* the buf case */
+	/* protect the following 3 writes from concurrent read */
+	dt_write_lock(env, o, 0);
+	rec->lrh_len = sizeof(*rec) + buflen + sizeof(lgi->lgi_tail);
+	lgi->lgi_buf.lb_len = sizeof(*rec);
+	lgi->lgi_buf.lb_buf = rec;
+	rc = dt_record_write(env, o, &lgi->lgi_buf, off, th);
+	if (rc) {
+		CERROR("%s: error writing log hdr: rc = %d\n",
+		       o->do_lu.lo_dev->ld_obd->obd_name, rc);
+		GOTO(out_unlock, rc);
+	}
+
+	lgi->lgi_buf.lb_len = buflen;
+	lgi->lgi_buf.lb_buf = buf;
+	rc = dt_record_write(env, o, &lgi->lgi_buf, off, th);
+	if (rc) {
+		CERROR("%s: error writing log buffer: rc = %d\n",
+		       o->do_lu.lo_dev->ld_obd->obd_name,  rc);
+		GOTO(out_unlock, rc);
+	}
+
+	/* the tail repeats the full length and the index of the header */
+	lgi->lgi_tail.lrt_len = rec->lrh_len;
+	lgi->lgi_tail.lrt_index = rec->lrh_index;
+	lgi->lgi_buf.lb_len = sizeof(lgi->lgi_tail);
+	lgi->lgi_buf.lb_buf = &lgi->lgi_tail;
+	rc = dt_record_write(env, o, &lgi->lgi_buf, off, th);
+	if (rc)
+		CERROR("%s: error writing log tail: rc = %d\n",
+		       o->do_lu.lo_dev->ld_obd->obd_name, rc);
+
+out_unlock:
+	dt_write_unlock(env, o);
+
+out:
+	/* cleanup the content written above */
+	if (rc) {
+		dt_punch(env, o, lgi->lgi_attr.la_size, OBD_OBJECT_EOF, th,
+			 BYPASS_CAPA);
+		dt_attr_set(env, o, &lgi->lgi_attr, th, BYPASS_CAPA);
+	}
+
+	RETURN(rc);
+}
+
+/*
+ * Read the llog header of @handle from its backing object into
+ * handle->lgh_hdr and validate it.
+ *
+ * The header occupies the first LLOG_CHUNK_SIZE bytes of the object.  It is
+ * byte-swapped when LLOG_REC_HDR_NEEDS_SWABBING() says so, and its magic
+ * and length are checked.  On success handle->lgh_last_idx is set from the
+ * index stored in the header's tail.
+ *
+ * Returns 0 on success, LLOG_EEMPTY if the object has size 0, -EIO if the
+ * magic or the header length is wrong, or the error from dt_attr_get() /
+ * dt_record_read().
+ */
+static int llog_osd_read_header(const struct lu_env *env,
+				struct llog_handle *handle)
+{
+	struct llog_rec_hdr	*llh_hdr;
+	struct dt_object	*o;
+	struct llog_thread_info	*lgi;
+	int			 rc;
+
+	ENTRY;
+
+	/* the in-memory header must be exactly one chunk */
+	LASSERT(sizeof(*handle->lgh_hdr) == LLOG_CHUNK_SIZE);
+
+	o = handle->lgh_obj;
+	LASSERT(o);
+
+	lgi = llog_info(env);
+
+	rc = dt_attr_get(env, o, &lgi->lgi_attr, NULL);
+	if (rc)
+		RETURN(rc);
+
+	LASSERT(lgi->lgi_attr.la_valid & LA_SIZE);
+
+	if (lgi->lgi_attr.la_size == 0) {
+		CDEBUG(D_HA, "not reading header from 0-byte log\n");
+		RETURN(LLOG_EEMPTY);
+	}
+
+	/* read one chunk from the very start of the object */
+	lgi->lgi_off = 0;
+	lgi->lgi_buf.lb_buf = handle->lgh_hdr;
+	lgi->lgi_buf.lb_len = LLOG_CHUNK_SIZE;
+
+	rc = dt_record_read(env, o, &lgi->lgi_buf, &lgi->lgi_off);
+	if (rc) {
+		CERROR("%s: error reading log header from "DFID": rc = %d\n",
+		       o->do_lu.lo_dev->ld_obd->obd_name,
+		       PFID(lu_object_fid(&o->do_lu)), rc);
+		RETURN(rc);
+	}
+
+	llh_hdr = &handle->lgh_hdr->llh_hdr;
+	if (LLOG_REC_HDR_NEEDS_SWABBING(llh_hdr))
+		lustre_swab_llog_hdr(handle->lgh_hdr);
+
+	/* sanity checks are done on the (possibly swabbed) header */
+	if (llh_hdr->lrh_type != LLOG_HDR_MAGIC) {
+		CERROR("%s: bad log %s "DFID" header magic: %#x "
+		       "(expected %#x)\n", o->do_lu.lo_dev->ld_obd->obd_name,
+		       handle->lgh_name ? handle->lgh_name : "",
+		       PFID(lu_object_fid(&o->do_lu)),
+		       llh_hdr->lrh_type, LLOG_HDR_MAGIC);
+		RETURN(-EIO);
+	} else if (llh_hdr->lrh_len != LLOG_CHUNK_SIZE) {
+		CERROR("%s: incorrectly sized log %s "DFID" header: "
+		       "%#x (expected %#x)\n"
+		       "you may need to re-run lconf --write_conf.\n",
+		       o->do_lu.lo_dev->ld_obd->obd_name,
+		       handle->lgh_name ? handle->lgh_name : "",
+		       PFID(lu_object_fid(&o->do_lu)),
+		       llh_hdr->lrh_len, LLOG_CHUNK_SIZE);
+		RETURN(-EIO);
+	}
+
+	handle->lgh_last_idx = handle->lgh_hdr->llh_tail.lrt_index;
+
+	RETURN(0);
+}
+
+/*
+ * Declare, in transaction @th, the updates needed to write a record to
+ * @loghandle.
+ *
+ * A rewrite of the llog header (sizeof(struct llog_log_hdr) bytes at
+ * offset 0) is always declared.  For @idx == 0 that is all.  Otherwise, if
+ * the object already exists, a punch from its current size to the end of
+ * the file is declared, and finally a 32 KiB write at the current size
+ * (offset 0 when the object does not exist yet) is declared.
+ *
+ * @rec is not examined.  Returns 0 or the error of the failing step.
+ */
+static int llog_osd_declare_write_rec(const struct lu_env *env,
+				      struct llog_handle *loghandle,
+				      struct llog_rec_hdr *rec,
+				      int idx, struct thandle *th)
+{
+	struct llog_thread_info	*lgi = llog_info(env);
+	struct dt_object	*o;
+	int			 rc;
+
+	ENTRY;
+
+	LASSERT(env);
+	LASSERT(th);
+	LASSERT(loghandle);
+
+	o = loghandle->lgh_obj;
+	LASSERT(o);
+
+	/* each time we update header */
+	rc = dt_declare_record_write(env, o, sizeof(struct llog_log_hdr), 0,
+				     th);
+	if (rc || idx == 0) /* if error or just header */
+		RETURN(rc);
+
+	if (dt_object_exists(o)) {
+		rc = dt_attr_get(env, o, &lgi->lgi_attr, BYPASS_CAPA);
+		/* lgi_off is assigned before rc is checked; the value is
+		 * only used further down when rc == 0 */
+		lgi->lgi_off = lgi->lgi_attr.la_size;
+		LASSERT(ergo(rc == 0, lgi->lgi_attr.la_valid & LA_SIZE));
+		if (rc)
+			RETURN(rc);
+
+		/* matches the punch llog_osd_write_blob() does on error */
+		rc = dt_declare_punch(env, o, lgi->lgi_off, OBD_OBJECT_EOF, th);
+		if (rc)
+			RETURN(rc);
+	} else {
+		lgi->lgi_off = 0;
+	}
+
+	/* XXX: implement declared window or multi-chunks approach */
+	rc = dt_declare_record_write(env, o, 32 * 1024, lgi->lgi_off, th);
+
+	RETURN(rc);
+}
+
+/*
+ * Write record @rec (optionally with separate payload @buf) to @loghandle.
+ *
+ * Appends if @idx == -1, otherwise overwrites record @idx (@idx == 0 means
+ * "rewrite the llog header only").
+ *
+ * In append mode the record gets the next free index, the bit for that
+ * index is set in the header bitmap, the header is rewritten and the record
+ * is written at the end of the object.  A padding record is inserted first
+ * when the new record would otherwise cross a chunk boundary.  If writing
+ * fails, the bitmap bit, the record count and the tail index are rolled
+ * back and the header is written again.
+ *
+ * Returns a negative value on error (-E2BIG if the record does not fit in
+ * a chunk, -EINVAL on a record size mismatch, -EFAULT on a cursor/index
+ * mismatch, -ENOSPC if the llog is full, or the error of a failing I/O
+ * step); 0 on success when @reccookie is NULL; 1 on success when
+ * @reccookie was filled in.  @cookiecount is not used.
+ */
+static int llog_osd_write_rec(const struct lu_env *env,
+			      struct llog_handle *loghandle,
+			      struct llog_rec_hdr *rec,
+			      struct llog_cookie *reccookie, int cookiecount,
+			      void *buf, int idx, struct thandle *th)
+{
+	struct llog_thread_info	*lgi = llog_info(env);
+	struct llog_log_hdr	*llh;
+	int			 reclen = rec->lrh_len;
+	int			 index, rc, old_tail_idx;
+	struct llog_rec_tail	*lrt;
+	struct dt_object	*o;
+	size_t			 left;
+
+	ENTRY;
+
+	LASSERT(env);
+	llh = loghandle->lgh_hdr;
+	LASSERT(llh);
+	o = loghandle->lgh_obj;
+	LASSERT(o);
+	LASSERT(th);
+
+	CDEBUG(D_OTHER, "new record %x to "DFID"\n",
+	       rec->lrh_type, PFID(lu_object_fid(&o->do_lu)));
+
+	/* record length should not bigger than LLOG_CHUNK_SIZE */
+	/* with a separate payload, room for header and tail is reserved */
+	if (buf)
+		rc = (reclen > LLOG_CHUNK_SIZE - sizeof(struct llog_rec_hdr) -
+		      sizeof(struct llog_rec_tail)) ? -E2BIG : 0;
+	else
+		rc = (reclen > LLOG_CHUNK_SIZE) ? -E2BIG : 0;
+	if (rc)
+		RETURN(rc);
+
+	rc = dt_attr_get(env, o, &lgi->lgi_attr, NULL);
+	if (rc)
+		RETURN(rc);
+
+	if (buf)
+		/* write_blob adds header and tail to lrh_len. */
+		reclen = sizeof(*rec) + rec->lrh_len +
+			 sizeof(struct llog_rec_tail);
+
+	/* ---- overwrite mode: modify an existing record or the header ---- */
+	if (idx != -1) {
+		/* no header: only allowed to insert record 1 */
+		if (idx != 1 && lgi->lgi_attr.la_size == 0)
+			LBUG();
+
+		if (idx && llh->llh_size && llh->llh_size != rec->lrh_len)
+			RETURN(-EINVAL);
+
+		/* the next two checks only log, they do not abort */
+		if (!ext2_test_bit(idx, llh->llh_bitmap))
+			CERROR("%s: modify unset record %u\n",
+			       o->do_lu.lo_dev->ld_obd->obd_name, idx);
+		if (idx != rec->lrh_index)
+			CERROR("%s: index mismatch %d %u\n",
+			       o->do_lu.lo_dev->ld_obd->obd_name, idx,
+			       rec->lrh_index);
+
+		/* the header is rewritten first, at offset 0 */
+		lgi->lgi_off = 0;
+		rc = llog_osd_write_blob(env, o, &llh->llh_hdr, NULL,
+					 &lgi->lgi_off, th);
+		/* we are done if we only write the header or on error */
+		if (rc || idx == 0)
+			RETURN(rc);
+
+		if (buf) {
+			/* We assume that caller has set lgh_cur_* */
+			lgi->lgi_off = loghandle->lgh_cur_offset;
+			CDEBUG(D_OTHER,
+			       "modify record "DOSTID": idx:%d/%u/%d, len:%u "
+			       "offset %llu\n",
+			       POSTID(&loghandle->lgh_id.lgl_oi), idx,
+			       rec->lrh_index,
+			       loghandle->lgh_cur_idx, rec->lrh_len,
+			       (long long)(lgi->lgi_off - sizeof(*llh)));
+			/* NOTE(review): the check compares rec->lrh_index
+			 * but the message prints idx - confirm which value
+			 * is meant to be reported. */
+			if (rec->lrh_index != loghandle->lgh_cur_idx) {
+				CERROR("%s: modify idx mismatch %u/%d\n",
+				       o->do_lu.lo_dev->ld_obd->obd_name, idx,
+				       loghandle->lgh_cur_idx);
+				RETURN(-EFAULT);
+			}
+		} else {
+			/* Assumes constant lrh_len */
+			lgi->lgi_off = sizeof(*llh) + (idx - 1) * reclen;
+		}
+
+		rc = llog_osd_write_blob(env, o, rec, buf, &lgi->lgi_off, th);
+		if (rc == 0 && reccookie) {
+			reccookie->lgc_lgl = loghandle->lgh_id;
+			reccookie->lgc_index = idx;
+			rc = 1;
+		}
+		RETURN(rc);
+	}
+
+	/* ---- append mode ---- */
+
+	/* Make sure that records don't cross a chunk boundary, so we can
+	 * process them page-at-a-time if needed.  If it will cross a chunk
+	 * boundary, write in a fake (but referenced) entry to pad the chunk.
+	 *
+	 * We know that llog_current_log() will return a loghandle that is
+	 * big enough to hold reclen, so all we care about is padding here.
+	 */
+	LASSERT(lgi->lgi_attr.la_valid & LA_SIZE);
+	lgi->lgi_off = lgi->lgi_attr.la_size;
+	/* bytes remaining in the current chunk */
+	left = LLOG_CHUNK_SIZE - (lgi->lgi_off & (LLOG_CHUNK_SIZE - 1));
+	/* NOTE: padding is a record, but no bit is set */
+	if (left != 0 && left != reclen &&
+	    left < (reclen + LLOG_MIN_REC_SIZE)) {
+		index = loghandle->lgh_last_idx + 1;
+		rc = llog_osd_pad(env, o, &lgi->lgi_off, left, index, th);
+		if (rc)
+			RETURN(rc);
+		loghandle->lgh_last_idx++; /*for pad rec*/
+	}
+	/* if it's the last idx in log file, then return -ENOSPC */
+	if (loghandle->lgh_last_idx >= LLOG_BITMAP_SIZE(llh) - 1)
+		RETURN(-ENOSPC);
+
+	loghandle->lgh_last_idx++;
+	index = loghandle->lgh_last_idx;
+	LASSERT(index < LLOG_BITMAP_SIZE(llh));
+	rec->lrh_index = index;
+	if (buf == NULL) {
+		/* a complete record carries its own tail: fill it in */
+		lrt = (struct llog_rec_tail *)((char *)rec + rec->lrh_len -
+					       sizeof(*lrt));
+		lrt->lrt_len = rec->lrh_len;
+		lrt->lrt_index = rec->lrh_index;
+	}
+	/* The caller should make sure only 1 process access the lgh_last_idx,
+	 * Otherwise it might hit the assert.*/
+	LASSERT(index < LLOG_BITMAP_SIZE(llh));
+	spin_lock(&loghandle->lgh_hdr_lock);
+	if (ext2_set_bit(index, llh->llh_bitmap)) {
+		CERROR("%s: index %u already set in log bitmap\n",
+		       o->do_lu.lo_dev->ld_obd->obd_name, index);
+		spin_unlock(&loghandle->lgh_hdr_lock);
+		LBUG(); /* should never happen */
+	}
+	llh->llh_count++;
+	spin_unlock(&loghandle->lgh_hdr_lock);
+	/* saved so that the error path below can restore it */
+	old_tail_idx = llh->llh_tail.lrt_index;
+	llh->llh_tail.lrt_index = index;
+
+	/* header first ... */
+	lgi->lgi_off = 0;
+	rc = llog_osd_write_blob(env, o, &llh->llh_hdr, NULL, &lgi->lgi_off,
+				 th);
+	if (rc)
+		GOTO(out, rc);
+
+	/* ... then the record itself at the current end of the object */
+	rc = dt_attr_get(env, o, &lgi->lgi_attr, NULL);
+	if (rc)
+		GOTO(out, rc);
+
+	LASSERT(lgi->lgi_attr.la_valid & LA_SIZE);
+	lgi->lgi_off = lgi->lgi_attr.la_size;
+
+	rc = llog_osd_write_blob(env, o, rec, buf, &lgi->lgi_off, th);
+
+out:
+	/* cleanup llog for error case */
+	if (rc) {
+		spin_lock(&loghandle->lgh_hdr_lock);
+		ext2_clear_bit(index, llh->llh_bitmap);
+		llh->llh_count--;
+		spin_unlock(&loghandle->lgh_hdr_lock);
+
+		/* restore the header */
+		/* only the record's own index is given back; an index taken
+		 * by a padding record above stays consumed */
+		loghandle->lgh_last_idx--;
+		llh->llh_tail.lrt_index = old_tail_idx;
+		lgi->lgi_off = 0;
+		/* best effort: the result of this rewrite is not checked */
+		llog_osd_write_blob(env, o, &llh->llh_hdr, NULL,
+				    &lgi->lgi_off, th);
+	}
+
+	/* note: this message is emitted on the error path as well */
+	CDEBUG(D_RPCTRACE, "added record "DOSTID": idx: %u, %u\n",
+	       POSTID(&loghandle->lgh_id.lgl_oi), index, rec->lrh_len);
+	if (rc == 0 && reccookie) {
+		reccookie->lgc_lgl = loghandle->lgh_id;
+		reccookie->lgc_index = index;
+		/* the cookie subsystem is derived from the record type */
+		if ((rec->lrh_type == MDS_UNLINK_REC) ||
+		    (rec->lrh_type == MDS_SETATTR64_REC))
+			reccookie->lgc_subsys = LLOG_MDS_OST_ORIG_CTXT;
+		else if (rec->lrh_type == OST_SZ_REC)
+			reccookie->lgc_subsys = LLOG_SIZE_ORIG_CTXT;
+		else
+			reccookie->lgc_subsys = -1;
+		rc = 1;
+	}
+	RETURN(rc);
+}
+
+/*
+ * Advance *@off towards record @goal, given that record @curr is the one
+ * currently reached.
+ *
+ * Every record is at least LLOG_MIN_REC_SIZE bytes long, so that many bytes
+ * per skipped record can safely be jumped over.  The result is rounded down
+ * to the start of a chunk.  If the records turn out to be larger than the
+ * minimum we simply have not skipped far enough and the caller keeps
+ * reading.  Nothing is changed when @goal is not beyond @curr.
+ */
+static void llog_skip_over(__u64 *off, int curr, int goal)
+{
+	__u64 pos;
+
+	if (goal > curr) {
+		pos = *off + (goal - curr - 1) * LLOG_MIN_REC_SIZE;
+		*off = pos & ~(LLOG_CHUNK_SIZE - 1);
+	}
+}
+
+/*
+ * Read the llog block that contains record @next_idx into @buf.
+ *
+ * @len must be a non-zero multiple of LLOG_CHUNK_SIZE.  Blocks are read
+ * one chunk at a time starting at *@cur_offset until a block is found
+ * whose last record index is not below @next_idx.
+ *
+ * sets:
+ *  - cur_offset to the furthest point read in the log file
+ *  - cur_idx to the log index preceding cur_offset
+ *
+ * Returns 0 when the wanted block is in @buf (and also when a read hit the
+ * end of the file), -EINVAL for a bad @len or an invalid block/tail,
+ * -ENOENT if the block that was read starts beyond @next_idx, -EIO if the
+ * end of the object is reached without finding the record, or the error
+ * from dt_attr_get() / dt_read().
+ */
+static int llog_osd_next_block(const struct lu_env *env,
+			       struct llog_handle *loghandle, int *cur_idx,
+			       int next_idx, __u64 *cur_offset, void *buf,
+			       int len)
+{
+	struct llog_thread_info	*lgi = llog_info(env);
+	struct dt_object	*o;
+	struct dt_device	*dt;
+	int			 rc;
+
+	ENTRY;
+
+	LASSERT(env);
+	LASSERT(lgi);
+
+	if (len == 0 || len & (LLOG_CHUNK_SIZE - 1))
+		RETURN(-EINVAL);
+
+	CDEBUG(D_OTHER, "looking for log index %u (cur idx %u off "LPU64")\n",
+	       next_idx, *cur_idx, *cur_offset);
+
+	LASSERT(loghandle);
+	LASSERT(loghandle->lgh_ctxt);
+
+	o = loghandle->lgh_obj;
+	LASSERT(o);
+	LASSERT(dt_object_exists(o));
+	dt = lu2dt_dev(o->do_lu.lo_dev);
+	LASSERT(dt);
+
+	rc = dt_attr_get(env, o, &lgi->lgi_attr, BYPASS_CAPA);
+	if (rc)
+		GOTO(out, rc);
+
+	while (*cur_offset < lgi->lgi_attr.la_size) {
+		struct llog_rec_hdr	*rec, *last_rec;
+		struct llog_rec_tail	*tail;
+
+		/* jump ahead as far as the minimum record size allows */
+		llog_skip_over(cur_offset, *cur_idx, next_idx);
+
+		/* read up to next LLOG_CHUNK_SIZE block */
+		lgi->lgi_buf.lb_len = LLOG_CHUNK_SIZE -
+				      (*cur_offset & (LLOG_CHUNK_SIZE - 1));
+		lgi->lgi_buf.lb_buf = buf;
+
+		/* Note: read lock is not needed around la_size get above at
+		 * the time of dt_attr_get(). There are only two cases that
+		 * matter. Either la_size == cur_offset, in which case the
+		 * entire read is skipped, or la_size > cur_offset and the loop
+		 * is entered and this thread is blocked at dt_read_lock()
+		 * until the write is completed. When the write completes, then
+		 * the dt_read() will be done with the full length, and will
+		 * get the full data.
+		 */
+		dt_read_lock(env, o, 0);
+		rc = dt_read(env, o, &lgi->lgi_buf, cur_offset);
+		dt_read_unlock(env, o);
+		if (rc < 0) {
+			CERROR("%s: can't read llog block from log "DFID
+			       " offset "LPU64": rc = %d\n",
+			       o->do_lu.lo_dev->ld_obd->obd_name,
+			       PFID(lu_object_fid(&o->do_lu)), *cur_offset,
+			       rc);
+			GOTO(out, rc);
+		}
+
+		/* from here on rc is the number of bytes read */
+		if (rc < len) {
+			/* signal the end of the valid buffer to
+			 * llog_process */
+			memset(buf + rc, 0, len - rc);
+		}
+
+		if (rc == 0) /* end of file, nothing to do */
+			GOTO(out, rc);
+
+		if (rc < sizeof(*tail)) {
+			CERROR("%s: invalid llog block at log id "DOSTID"/%u "
+			       "offset "LPU64"\n",
+			       o->do_lu.lo_dev->ld_obd->obd_name,
+			       POSTID(&loghandle->lgh_id.lgl_oi),
+			       loghandle->lgh_id.lgl_ogen, *cur_offset);
+			GOTO(out, rc = -EINVAL);
+		}
+
+		/* the first record of the block sits at the start of buf */
+		rec = buf;
+		if (LLOG_REC_HDR_NEEDS_SWABBING(rec))
+			lustre_swab_llog_rec(rec);
+
+		/* the tail of the last record ends the data just read */
+		tail = (struct llog_rec_tail *)((char *)buf + rc -
+						sizeof(struct llog_rec_tail));
+		/* get the last record in block */
+		/* NOTE(review): lrt_len is taken from the data just read and
+		 * is not range-checked against rc before being used as an
+		 * offset - confirm callers can rely on it being sane. */
+		last_rec = (struct llog_rec_hdr *)((char *)buf + rc -
+						   le32_to_cpu(tail->lrt_len));
+
+		if (LLOG_REC_HDR_NEEDS_SWABBING(last_rec))
+			lustre_swab_llog_rec(last_rec);
+		LASSERT(last_rec->lrh_index == tail->lrt_index);
+
+		*cur_idx = tail->lrt_index;
+
+		/* this shouldn't happen */
+		if (tail->lrt_index == 0) {
+			CERROR("%s: invalid llog tail at log id "DOSTID"/%u "
+			       "offset "LPU64"\n",
+			       o->do_lu.lo_dev->ld_obd->obd_name,
+			       POSTID(&loghandle->lgh_id.lgl_oi),
+			       loghandle->lgh_id.lgl_ogen, *cur_offset);
+			GOTO(out, rc = -EINVAL);
+		}
+		/* wanted record lies in a later block: keep reading */
+		if (tail->lrt_index < next_idx)
+			continue;
+
+		/* sanity check that the start of the new buffer is no farther
+		 * than the record that we wanted.  This shouldn't happen. */
+		if (rec->lrh_index > next_idx) {
+			CERROR("%s: missed desired record? %u > %u\n",
+			       o->do_lu.lo_dev->ld_obd->obd_name,
+			       rec->lrh_index, next_idx);
+			GOTO(out, rc = -ENOENT);
+		}
+		GOTO(out, rc = 0);
+	}
+	/* ran off the end of the object without finding next_idx */
+	GOTO(out, rc = -EIO);
+out:
+	return rc;
+}
+/*
+ * Read the llog block that contains the record with index prev_idx into buf.
+ *
+ * len is the number of bytes requested per read and must be a non-zero
+ * multiple of LLOG_CHUNK_SIZE, otherwise -EINVAL is returned.  Reading starts
+ * at the offset produced by llog_skip_over() from LLOG_CHUNK_SIZE and goes on
+ * while that offset is below the object size reported by dt_attr_get().
+ *
+ * Returns 0 once a block has been read whose tail index is >= prev_idx and
+ * whose first record index is <= prev_idx (0 is also returned when dt_read()
+ * reports end of file), -EINVAL for a block too short to hold a tail or with
+ * a zero tail index, -ENOENT if the block found starts past prev_idx, -EIO if
+ * the scan ends without finding such a block, or the negative error from
+ * dt_attr_get()/dt_read().
+ */
+static int llog_osd_prev_block(const struct lu_env *env,
+			       struct llog_handle *loghandle,
+			       int prev_idx, void *buf, int len)
+{
+	struct llog_thread_info	*lgi = llog_info(env);
+	struct dt_object	*o;
+	struct dt_device	*dt;
+	loff_t			 cur_offset;
+	int			 rc;
+
+	ENTRY;
+
+	if (len == 0 || len & (LLOG_CHUNK_SIZE - 1))
+		RETURN(-EINVAL);
+
+	CDEBUG(D_OTHER, "looking for log index %u\n", prev_idx);
+
+	LASSERT(loghandle);
+	LASSERT(loghandle->lgh_ctxt);
+
+	o = loghandle->lgh_obj;
+	LASSERT(o);
+	LASSERT(dt_object_exists(o));
+	dt = lu2dt_dev(o->do_lu.lo_dev);
+	LASSERT(dt);
+
+	cur_offset = LLOG_CHUNK_SIZE; /* NOTE(review): presumably skips the
+				       * llog header chunk - confirm */
+	llog_skip_over(&cur_offset, 0, prev_idx);
+
+	rc = dt_attr_get(env, o, &lgi->lgi_attr, BYPASS_CAPA);
+	if (rc)
+		GOTO(out, rc);
+
+	while (cur_offset < lgi->lgi_attr.la_size) {
+		struct llog_rec_hdr	*rec, *last_rec;
+		struct llog_rec_tail	*tail;
+
+		lgi->lgi_buf.lb_len = len;
+		lgi->lgi_buf.lb_buf = buf;
+		/* It is OK to have locking around dt_read() only, see
+		 * comment in llog_osd_next_block for details
+		 */
+		dt_read_lock(env, o, 0);
+		rc = dt_read(env, o, &lgi->lgi_buf, &cur_offset);
+		dt_read_unlock(env, o);
+		if (rc < 0) {
+			CERROR("%s: can't read llog block from log "DFID
+			       " offset "LPU64": rc = %d\n",
+			       o->do_lu.lo_dev->ld_obd->obd_name,
+			       PFID(lu_object_fid(&o->do_lu)), cur_offset, rc);
+			GOTO(out, rc);
+		}
+
+		if (rc == 0) /* end of file, nothing to do */
+			GOTO(out, rc);
+
+		if (rc < sizeof(*tail)) {
+			CERROR("%s: invalid llog block at log id "DOSTID"/%u "
+			       "offset "LPU64"\n",
+			       o->do_lu.lo_dev->ld_obd->obd_name,
+			       POSTID(&loghandle->lgh_id.lgl_oi),
+			       loghandle->lgh_id.lgl_ogen, cur_offset);
+			GOTO(out, rc = -EINVAL);
+		}
+
+		rec = buf; /* first record of the block just read */
+		if (LLOG_REC_HDR_NEEDS_SWABBING(rec))
+			lustre_swab_llog_rec(rec);
+
+		tail = (struct llog_rec_tail *)((char *)buf + rc -
+						sizeof(struct llog_rec_tail));
+		/* get the last record in block; its start is found by stepping
+		 * back lrt_len bytes from the end of the data read.
+		 * NOTE(review): lrt_len is not checked against rc here */
+		last_rec = (struct llog_rec_hdr *)((char *)buf + rc -
+						   le32_to_cpu(tail->lrt_len));
+
+		if (LLOG_REC_HDR_NEEDS_SWABBING(last_rec))
+			lustre_swab_llog_rec(last_rec);
+		LASSERT(last_rec->lrh_index == tail->lrt_index);
+
+		/* this shouldn't happen */
+		if (tail->lrt_index == 0) {
+			CERROR("%s: invalid llog tail at log id "DOSTID"/%u "
+			       "offset "LPU64"\n",
+			       o->do_lu.lo_dev->ld_obd->obd_name,
+			       POSTID(&loghandle->lgh_id.lgl_oi),
+			       loghandle->lgh_id.lgl_ogen, cur_offset);
+			GOTO(out, rc = -EINVAL);
+		}
+		if (tail->lrt_index < prev_idx)
+			continue; /* block ends before prev_idx: read again */
+
+		/* sanity check that the start of the new buffer is no farther
+		 * than the record that we wanted.  This shouldn't happen. */
+		if (rec->lrh_index > prev_idx) {
+			CERROR("%s: missed desired record? %u > %u\n",
+			       o->do_lu.lo_dev->ld_obd->obd_name,
+			       rec->lrh_index, prev_idx);
+			GOTO(out, rc = -ENOENT);
+		}
+		GOTO(out, rc = 0);
+	}
+	GOTO(out, rc = -EIO);
+out:
+	return rc;
+}
+
+/*
+ * Return a referenced llog directory object for ctxt.
+ *
+ * If the context has a directory (ctxt->loc_dir) a new reference is taken on
+ * it; otherwise the root of the context's dt device is looked up and
+ * located.  Returns ERR_PTR() if the root FID cannot be obtained, or
+ * whatever dt_locate() returns.
+ */
+struct dt_object *llog_osd_dir_get(const struct lu_env *env,
+				   struct llog_ctxt *ctxt)
+{
+	struct dt_thread_info	*info = dt_info(env);
+	struct dt_device	*dev = ctxt->loc_exp->exp_obd->obd_lvfs_ctxt.dt;
+	int			 err;
+
+	/* context already has a directory: just grab a reference */
+	if (ctxt->loc_dir != NULL) {
+		lu_object_get(&ctxt->loc_dir->do_lu);
+		return ctxt->loc_dir;
+	}
+
+	/* no directory configured: fall back to the device root */
+	err = dt_root_get(env, dev, &info->dti_fid);
+	if (err)
+		return ERR_PTR(err);
+
+	return dt_locate(env, dev, &info->dti_fid);
+}
+/*
+ * Bind an llog handle to its backing dt object.
+ *
+ * The object FID is taken from logid when one is given; otherwise it comes
+ * from looking name up in the llog directory (a new FID is generated when
+ * that lookup returns -ENOENT and open_param is LLOG_OPEN_NEW); with neither
+ * logid nor name a new FID is generated and LLOG_OPEN_NEW is asserted.
+ *
+ * On success handle->lgh_id, handle->lgh_obj and handle->private_data (the
+ * local_oid_storage found for the sequence) are set, and handle->lgh_name
+ * holds a copy of name when the name branch was taken.
+ *
+ * Returns -ENOENT if the object does not exist and open_param is not
+ * LLOG_OPEN_NEW, -ENOMEM if the name copy cannot be allocated, or the error
+ * of the failing helper.  Error paths taken after the storage lookup drop
+ * the storage with dt_los_put().
+ */
+static int llog_osd_open(const struct lu_env *env, struct llog_handle *handle,
+			 struct llog_logid *logid, char *name,
+			 enum llog_open_param open_param)
+{
+	struct llog_thread_info		*lgi = llog_info(env);
+	struct llog_ctxt		*ctxt = handle->lgh_ctxt;
+	struct dt_object		*o;
+	struct dt_device		*dt;
+	struct ls_device		*ls;
+	struct local_oid_storage	*los;
+	int				 rc = 0;
+
+	ENTRY;
+
+	LASSERT(env);
+	LASSERT(ctxt);
+	LASSERT(ctxt->loc_exp);
+	LASSERT(ctxt->loc_exp->exp_obd);
+	dt = ctxt->loc_exp->exp_obd->obd_lvfs_ctxt.dt;
+	LASSERT(dt);
+
+	ls = ls_device_get(dt);
+	if (IS_ERR(ls))
+		RETURN(PTR_ERR(ls));
+
+	mutex_lock(&ls->ls_los_mutex);
+	los = dt_los_find(ls, name != NULL ? FID_SEQ_LLOG_NAME : FID_SEQ_LLOG);
+	mutex_unlock(&ls->ls_los_mutex);
+	LASSERT(los);
+	ls_device_put(env, ls); /* NOTE(review): ls is passed to ls_locate()
+				 * below after this put - confirm that the
+				 * reference held via los keeps it valid */
+
+	LASSERT(handle);
+
+	if (logid != NULL) {
+		logid_to_fid(logid, &lgi->lgi_fid);
+	} else if (name) {
+		struct dt_object *llog_dir;
+
+		llog_dir = llog_osd_dir_get(env, ctxt);
+		if (IS_ERR(llog_dir))
+			GOTO(out, rc = PTR_ERR(llog_dir));
+		dt_read_lock(env, llog_dir, 0);
+		rc = dt_lookup_dir(env, llog_dir, name, &lgi->lgi_fid);
+		dt_read_unlock(env, llog_dir);
+		lu_object_put(env, &llog_dir->do_lu);
+		if (rc == -ENOENT && open_param == LLOG_OPEN_NEW) {
+			/* generate fid for new llog */
+			rc = local_object_fid_generate(env, los,
+						       &lgi->lgi_fid);
+		}
+		if (rc < 0)
+			GOTO(out, rc);
+		OBD_ALLOC(handle->lgh_name, strlen(name) + 1);
+		if (handle->lgh_name)
+			strcpy(handle->lgh_name, name);
+		else
+			GOTO(out, rc = -ENOMEM);
+	} else {
+		LASSERTF(open_param & LLOG_OPEN_NEW, "%#x\n", open_param);
+		/* generate fid for new llog */
+		rc = local_object_fid_generate(env, los, &lgi->lgi_fid);
+		if (rc < 0)
+			GOTO(out, rc);
+	}
+
+	o = ls_locate(env, ls, &lgi->lgi_fid);
+	if (IS_ERR(o))
+		GOTO(out_name, rc = PTR_ERR(o));
+
+	/* No new llog is expected but doesn't exist */
+	if (open_param != LLOG_OPEN_NEW && !dt_object_exists(o))
+		GOTO(out_put, rc = -ENOENT);
+
+	fid_to_logid(&lgi->lgi_fid, &handle->lgh_id);
+	handle->lgh_obj = o;
+	handle->private_data = los; /* los reference now belongs to handle */
+	LASSERT(handle->lgh_ctxt);
+
+	RETURN(rc);
+
+out_put:
+	lu_object_put(env, &o->do_lu);
+out_name:
+	if (handle->lgh_name != NULL)
+		OBD_FREE(handle->lgh_name, strlen(name) + 1);
+out:
+	dt_los_put(los);
+	RETURN(rc);
+}
+
+/*
+ * Tell whether the llog behind handle is usable: its object must exist and
+ * must not be marked as dying.  Returns 1 if so, 0 otherwise.
+ */
+static int llog_osd_exist(struct llog_handle *handle)
+{
+	struct dt_object *obj;
+
+	LASSERT(handle->lgh_obj);
+	obj = handle->lgh_obj;
+
+	if (!dt_object_exists(obj))
+		return 0;
+
+	return !lu_object_is_dying(obj->do_lu.lo_header);
+}
+
+/*
+ * Declare in transaction th everything llog_osd_create() will need for the
+ * llog of res: the new object, a first write of LLOG_CHUNK_SIZE bytes at
+ * offset 0 and, for a named llog, the insertion of its name into the llog
+ * directory.  Returns 0 if the object already exists or all declarations
+ * succeed, otherwise the first error met.
+ */
+static int llog_osd_declare_create(const struct lu_env *env,
+				   struct llog_handle *res, struct thandle *th)
+{
+	struct llog_thread_info		*lgi = llog_info(env);
+	struct local_oid_storage	*los;
+	struct dt_object		*llog_dir;
+	struct dt_object		*o;
+	int				 rc;
+
+	ENTRY;
+
+	LASSERT(res->lgh_obj);
+	LASSERT(th);
+
+	o = res->lgh_obj;
+	/* object can be created by another thread: nothing to declare then */
+	if (dt_object_exists(o))
+		RETURN(0);
+
+	los = res->private_data;
+	LASSERT(los);
+
+	rc = llog_osd_declare_new_object(env, los, o, th);
+	if (rc == 0)
+		rc = dt_declare_record_write(env, o, LLOG_CHUNK_SIZE, 0, th);
+	if (rc != 0)
+		RETURN(rc);
+
+	/* an unnamed llog has no directory entry to declare */
+	if (res->lgh_name == NULL)
+		RETURN(0);
+
+	llog_dir = llog_osd_dir_get(env, res->lgh_ctxt);
+	if (IS_ERR(llog_dir))
+		RETURN(PTR_ERR(llog_dir));
+
+	logid_to_fid(&res->lgh_id, &lgi->lgi_fid);
+	rc = dt_declare_insert(env, llog_dir,
+			       (struct dt_rec *)&lgi->lgi_fid,
+			       (struct dt_key *)res->lgh_name, th);
+	lu_object_put(env, &llog_dir->do_lu);
+	if (rc != 0)
+		CERROR("%s: can't declare named llog %s: rc = %d\n",
+		       o->do_lu.lo_dev->ld_obd->obd_name,
+		       res->lgh_name, rc);
+
+	RETURN(rc);
+}
+
+/*
+ * Callback from the llog_* functions; the caller is expected to have pushed
+ * us into the kernel context already.
+ *
+ * Creates the llog object of res inside transaction th and, for a named
+ * llog, inserts the name into the llog directory.  Returns -EEXIST when the
+ * object is already there, 0 on success, or the error of the failing step.
+ */
+static int llog_osd_create(const struct lu_env *env, struct llog_handle *res,
+			   struct thandle *th)
+{
+	struct llog_thread_info		*lgi = llog_info(env);
+	struct local_oid_storage	*los;
+	struct dt_object		*llog_dir;
+	struct dt_object		*o;
+	int				 rc;
+
+	ENTRY;
+
+	LASSERT(env);
+	o = res->lgh_obj;
+	LASSERT(o);
+
+	/* cheap unlocked test first: llog can be already created */
+	if (dt_object_exists(o))
+		RETURN(-EEXIST);
+
+	los = res->private_data;
+	LASSERT(los);
+
+	/* test again under the write lock before really creating it */
+	dt_write_lock(env, o, 0);
+	rc = dt_object_exists(o) ? -EEXIST :
+	     llog_osd_create_new_object(env, los, o, th);
+	dt_write_unlock(env, o);
+	if (rc != 0)
+		RETURN(rc);
+
+	/* only named llogs get an entry in the llog directory */
+	if (res->lgh_name == NULL)
+		RETURN(0);
+
+	llog_dir = llog_osd_dir_get(env, res->lgh_ctxt);
+	if (IS_ERR(llog_dir))
+		RETURN(PTR_ERR(llog_dir));
+
+	logid_to_fid(&res->lgh_id, &lgi->lgi_fid);
+	dt_read_lock(env, llog_dir, 0);
+	rc = dt_insert(env, llog_dir,
+		       (struct dt_rec *)&lgi->lgi_fid,
+		       (struct dt_key *)res->lgh_name,
+		       th, BYPASS_CAPA, 1);
+	dt_read_unlock(env, llog_dir);
+	lu_object_put(env, &llog_dir->do_lu);
+	if (rc != 0)
+		CERROR("%s: can't create named llog %s: rc = %d\n",
+		       o->do_lu.lo_dev->ld_obd->obd_name,
+		       res->lgh_name, rc);
+
+	RETURN(rc);
+}
+
+/*
+ * Release what the handle holds: the reference on the llog object, the
+ * local_oid_storage kept in private_data and the copy of the llog name, if
+ * there is one.  Always returns 0.
+ */
+static int llog_osd_close(const struct lu_env *env, struct llog_handle *handle)
+{
+	struct local_oid_storage	*los;
+
+	ENTRY;
+
+	LASSERT(handle->lgh_obj);
+
+	/* object reference first ... */
+	lu_object_put(env, &handle->lgh_obj->do_lu);
+
+	/* ... then the storage the handle was opened with ... */
+	los = handle->private_data;
+	LASSERT(los);
+	dt_los_put(los);
+
+	/* ... and finally the name, allocated with its trailing NUL */
+	if (handle->lgh_name != NULL)
+		OBD_FREE(handle->lgh_name, strlen(handle->lgh_name) + 1);
+
+	RETURN(0);
+}
+
+/*
+ * Destroy the llog object attached to loghandle.
+ *
+ * Runs one local transaction on the llog's dt device which, provided the
+ * object still exists, removes the llog name from the llog directory (named
+ * llogs only), drops a reference with dt_ref_del() and destroys the object.
+ *
+ * Returns 0 on success or the negative errno of the failing step (directory
+ * lookup, declarations, transaction start, dt_delete() or dt_destroy()).
+ */
+static int llog_osd_destroy(const struct lu_env *env,
+			    struct llog_handle *loghandle)
+{
+	struct llog_ctxt	*ctxt;
+	struct dt_object	*o, *llog_dir = NULL;
+	struct dt_device	*d;
+	struct thandle		*th;
+	char			*name = NULL;
+	int			 rc;
+
+	ENTRY;
+
+	ctxt = loghandle->lgh_ctxt;
+	LASSERT(ctxt);
+
+	o = loghandle->lgh_obj;
+	LASSERT(o);
+
+	d = lu2dt_dev(o->do_lu.lo_dev);
+	LASSERT(d);
+	LASSERT(d == ctxt->loc_exp->exp_obd->obd_lvfs_ctxt.dt);
+
+	th = dt_trans_create(env, d);
+	if (IS_ERR(th))
+		RETURN(PTR_ERR(th));
+
+	if (loghandle->lgh_name) {
+		llog_dir = llog_osd_dir_get(env, ctxt);
+		if (IS_ERR(llog_dir)) {
+			rc = PTR_ERR(llog_dir);
+			/* no directory was obtained: make sure the cleanup
+			 * below does not lu_object_put() an error pointer */
+			llog_dir = NULL;
+			GOTO(out_trans, rc);
+		}
+
+		name = loghandle->lgh_name;
+		rc = dt_declare_delete(env, llog_dir,
+				       (struct dt_key *)name, th);
+		if (rc)
+			GOTO(out_trans, rc);
+	}
+
+	dt_declare_ref_del(env, o, th);
+
+	rc = dt_declare_destroy(env, o, th);
+	if (rc)
+		GOTO(out_trans, rc);
+
+	rc = dt_trans_start_local(env, d, th);
+	if (rc)
+		GOTO(out_trans, rc);
+
+	dt_write_lock(env, o, 0);
+	/* the object may be gone already; then there is nothing to undo */
+	if (dt_object_exists(o)) {
+		if (name) {
+			dt_read_lock(env, llog_dir, 0);
+			rc = dt_delete(env, llog_dir,
+				       (struct dt_key *) name,
+				       th, BYPASS_CAPA);
+			dt_read_unlock(env, llog_dir);
+			if (rc) {
+				CERROR("%s: can't remove llog %s: rc = %d\n",
+				       o->do_lu.lo_dev->ld_obd->obd_name,
+				       name, rc);
+				GOTO(out_unlock, rc);
+			}
+		}
+		dt_ref_del(env, o, th);
+		rc = dt_destroy(env, o, th);
+		if (rc)
+			GOTO(out_unlock, rc);
+	}
+out_unlock:
+	dt_write_unlock(env, o);
+out_trans:
+	dt_trans_stop(env, d, th);
+	if (llog_dir != NULL)
+		lu_object_put(env, &llog_dir->do_lu);
+	RETURN(rc);
+}
+
+/*
+ * Set up the on-disk side of an llog context: initialize one local OID
+ * storage for the FID_SEQ_LLOG sequence and one for FID_SEQ_LLOG_NAME on the
+ * dt device of disk_obd, so that FIDs for new llogs can be generated.
+ *
+ * The context olg->olg_ctxts[ctxt_idx] must already exist; the reference
+ * taken on it here is dropped again on every return path.
+ *
+ * Returns the result of the last local_oid_storage_init() call made: a
+ * negative errno if the first one fails, otherwise that of the second.
+ */
+static int llog_osd_setup(const struct lu_env *env, struct obd_device *obd,
+			  struct obd_llog_group *olg, int ctxt_idx,
+			  struct obd_device *disk_obd)
+{
+	struct local_oid_storage	*los;
+	struct llog_thread_info		*lgi = llog_info(env);
+	struct llog_ctxt		*ctxt;
+	int				 rc = 0;
+
+	ENTRY;
+
+	LASSERT(obd);
+	LASSERT(olg->olg_ctxts[ctxt_idx]);
+
+	ctxt = llog_ctxt_get(olg->olg_ctxts[ctxt_idx]);
+	LASSERT(ctxt);
+
+	/* initialize data allowing to generate new fids,
+	 * literally we need a sequence */
+	lgi->lgi_fid.f_seq = FID_SEQ_LLOG;
+	lgi->lgi_fid.f_oid = 1;
+	lgi->lgi_fid.f_ver = 0;
+	rc = local_oid_storage_init(env, disk_obd->obd_lvfs_ctxt.dt,
+				    &lgi->lgi_fid, &los);
+	if (rc < 0)
+		GOTO(out, rc); /* must still drop the ctxt reference */
+
+	/* same again for the sequence used by named llogs */
+	lgi->lgi_fid.f_seq = FID_SEQ_LLOG_NAME;
+	lgi->lgi_fid.f_oid = 1;
+	lgi->lgi_fid.f_ver = 0;
+	rc = local_oid_storage_init(env, disk_obd->obd_lvfs_ctxt.dt,
+				    &lgi->lgi_fid, &los);
+out:
+	llog_ctxt_put(ctxt);
+	return rc;
+}
+/*
+ * Tear down the local OID storages of the dt device behind ctxt: the ones
+ * for FID_SEQ_LLOG and FID_SEQ_LLOG_NAME are looked up under ls_los_mutex
+ * and each one found is put and then finalized.
+ *
+ * Returns 0, or the error from ls_device_get().
+ */
+static int llog_osd_cleanup(const struct lu_env *env, struct llog_ctxt *ctxt)
+{
+	struct dt_device		*dt;
+	struct ls_device		*ls;
+	struct local_oid_storage	*los, *nlos;
+
+	LASSERT(ctxt->loc_exp->exp_obd);
+	dt = ctxt->loc_exp->exp_obd->obd_lvfs_ctxt.dt;
+	ls = ls_device_get(dt);
+	if (IS_ERR(ls))
+		RETURN(PTR_ERR(ls));
+
+	mutex_lock(&ls->ls_los_mutex);
+	los = dt_los_find(ls, FID_SEQ_LLOG);
+	nlos = dt_los_find(ls, FID_SEQ_LLOG_NAME);
+	mutex_unlock(&ls->ls_los_mutex);
+	if (los != NULL) {
+		dt_los_put(los); /* presumably undoes the reference taken by
+				  * dt_los_find() above - TODO confirm */
+		local_oid_storage_fini(env, los);
+	}
+	if (nlos != NULL) {
+		dt_los_put(nlos);
+		local_oid_storage_fini(env, nlos);
+	}
+	ls_device_put(env, ls);
+	return 0;
+}
+/*
+ * llog operation table wiring the generic llog callbacks to the
+ * llog_osd_* implementations of this file.  Exported for use by other
+ * modules.
+ */
+struct llog_operations llog_osd_ops = {
+	.lop_next_block		= llog_osd_next_block,
+	.lop_prev_block		= llog_osd_prev_block,
+	.lop_read_header	= llog_osd_read_header,
+	.lop_destroy		= llog_osd_destroy,
+	.lop_setup		= llog_osd_setup,
+	.lop_cleanup		= llog_osd_cleanup,
+	.lop_open		= llog_osd_open,
+	.lop_exist		= llog_osd_exist,
+	.lop_declare_create	= llog_osd_declare_create,
+	.lop_create		= llog_osd_create,
+	.lop_declare_write_rec	= llog_osd_declare_write_rec,
+	.lop_write_rec		= llog_osd_write_rec,
+	.lop_close		= llog_osd_close,
+};
+EXPORT_SYMBOL(llog_osd_ops);
+
+/*
+ * Read catalog IDs from the CATALOGS object of device d.
+ *
+ * The CATALOGS object is created (empty) first if it does not exist yet.
+ * With idarray == NULL nothing is read and the number of llog_catid entries
+ * currently stored in the file is returned.  Otherwise count entries
+ * starting at slot idx are copied into idarray; the part of that range lying
+ * beyond the end of the file is left zero-filled.
+ *
+ * Returns 0 (or the entry count, see above) on success and a negative errno
+ * on failure; -ENOENT is used when CATALOGS is not a regular file.
+ */
+int llog_osd_get_cat_list(const struct lu_env *env, struct dt_device *d,
+			  int idx, int count, struct llog_catid *idarray)
+{
+	struct llog_thread_info	*lgi = llog_info(env);
+	struct dt_object	*o = NULL;
+	struct thandle		*th;
+	int			 rc, size;
+
+	ENTRY;
+
+	LASSERT(d);
+
+	size = sizeof(*idarray) * count;
+	lgi->lgi_off = idx *  sizeof(*idarray);
+
+	lu_local_obj_fid(&lgi->lgi_fid, LLOG_CATALOGS_OID);
+
+	o = dt_locate(env, d, &lgi->lgi_fid);
+	if (IS_ERR(o))
+		RETURN(PTR_ERR(o));
+
+	if (!dt_object_exists(o)) {
+		th = dt_trans_create(env, d);
+		if (IS_ERR(th))
+			GOTO(out, rc = PTR_ERR(th));
+
+		lgi->lgi_attr.la_valid = LA_MODE;
+		lgi->lgi_attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR;
+		lgi->lgi_dof.dof_type = dt_mode_to_dft(S_IFREG);
+
+		rc = dt_declare_create(env, o, &lgi->lgi_attr, NULL,
+				       &lgi->lgi_dof, th);
+		if (rc)
+			GOTO(out_trans, rc);
+
+		rc = dt_trans_start_local(env, d, th);
+		if (rc)
+			GOTO(out_trans, rc);
+
+		/* somebody else may have created it in the meantime */
+		dt_write_lock(env, o, 0);
+		if (!dt_object_exists(o))
+			rc = dt_create(env, o, &lgi->lgi_attr, NULL,
+				       &lgi->lgi_dof, th);
+		dt_write_unlock(env, o);
+out_trans:
+		dt_trans_stop(env, d, th);
+		if (rc)
+			GOTO(out, rc);
+	}
+
+	rc = dt_attr_get(env, o, &lgi->lgi_attr, BYPASS_CAPA);
+	if (rc)
+		GOTO(out, rc);
+
+	if (!S_ISREG(lgi->lgi_attr.la_mode)) {
+		CERROR("%s: CATALOGS is not a regular file!: mode = %o\n",
+		       o->do_lu.lo_dev->ld_obd->obd_name,
+		       lgi->lgi_attr.la_mode);
+		GOTO(out, rc = -ENOENT);
+	}
+
+	CDEBUG(D_CONFIG, "cat list: disk size=%d, read=%d\n",
+	       (int)lgi->lgi_attr.la_size, size);
+
+	/* return just number of llogs */
+	if (idarray == NULL) {
+		rc = lgi->lgi_attr.la_size / sizeof(*idarray);
+		GOTO(out, rc);
+	}
+
+	/* read for new ost index or for empty file: nothing is stored at or
+	 * past the requested offset, so hand back the zeroed array */
+	memset(idarray, 0, size);
+	if (lgi->lgi_attr.la_size <= lgi->lgi_off)
+		GOTO(out, rc = 0);
+	/* the file ends inside the requested range: read only what is there
+	 * and leave the remainder of idarray zeroed */
+	if (lgi->lgi_attr.la_size < lgi->lgi_off + size)
+		size = lgi->lgi_attr.la_size - lgi->lgi_off;
+
+	lgi->lgi_buf.lb_buf = idarray;
+	lgi->lgi_buf.lb_len = size;
+	rc = dt_record_read(env, o, &lgi->lgi_buf, &lgi->lgi_off);
+	if (rc) {
+		CERROR("%s: error reading CATALOGS: rc = %d\n",
+		       o->do_lu.lo_dev->ld_obd->obd_name,  rc);
+		GOTO(out, rc);
+	}
+
+	EXIT;
+out:
+	lu_object_put(env, &o->do_lu);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_osd_get_cat_list);
+
+/*
+ * Write count catalog IDs from idarray into the CATALOGS object of device d,
+ * starting at slot idx, inside a local transaction.
+ *
+ * Returns 0 when count is 0 or the write succeeds, -ENOENT if CATALOGS does
+ * not exist or is not a regular file, otherwise the negative errno of the
+ * failing step.
+ */
+int llog_osd_put_cat_list(const struct lu_env *env, struct dt_device *d,
+			  int idx, int count, struct llog_catid *idarray)
+{
+	struct llog_thread_info	*lgi = llog_info(env);
+	struct dt_object	*o = NULL;
+	struct thandle		*th;
+	int			 rc, size;
+
+	if (!count)
+		RETURN(0);
+
+	LASSERT(d);
+
+	size = sizeof(*idarray) * count;
+	lgi->lgi_off = idx * sizeof(*idarray);
+
+	lu_local_obj_fid(&lgi->lgi_fid, LLOG_CATALOGS_OID);
+
+	o = dt_locate(env, d, &lgi->lgi_fid);
+	if (IS_ERR(o))
+		RETURN(PTR_ERR(o));
+
+	if (!dt_object_exists(o))
+		GOTO(out, rc = -ENOENT);
+
+	rc = dt_attr_get(env, o, &lgi->lgi_attr, BYPASS_CAPA);
+	if (rc)
+		GOTO(out, rc);
+
+	if (!S_ISREG(lgi->lgi_attr.la_mode)) {
+		CERROR("%s: CATALOGS is not a regular file!: mode = %o\n",
+		       o->do_lu.lo_dev->ld_obd->obd_name,
+		       lgi->lgi_attr.la_mode);
+		GOTO(out, rc = -ENOENT);
+	}
+
+	th = dt_trans_create(env, d);
+	if (IS_ERR(th))
+		GOTO(out, rc = PTR_ERR(th));
+
+	/* from here on the transaction handle exists and every exit has to
+	 * go through out_trans so that it is stopped */
+	rc = dt_declare_record_write(env, o, size, lgi->lgi_off, th);
+	if (rc)
+		GOTO(out_trans, rc);
+
+	rc = dt_trans_start_local(env, d, th);
+	if (rc)
+		GOTO(out_trans, rc);
+
+	lgi->lgi_buf.lb_buf = idarray;
+	lgi->lgi_buf.lb_len = size;
+	rc = dt_record_write(env, o, &lgi->lgi_buf, &lgi->lgi_off, th);
+	if (rc)
+		CDEBUG(D_INODE, "error writeing CATALOGS: rc = %d\n", rc);
+out_trans:
+	dt_trans_stop(env, d, th);
+out:
+	lu_object_put(env, &o->do_lu);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_osd_put_cat_list);

diff --git a/drivers/staging/lustre/lustre/obdclass/llog_swab.c b/drivers/staging/lustre/lustre/obdclass/llog_swab.c
new file mode 100644
index 0000000..dedfecf
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/llog_swab.c

@@ -0,0 +1,407 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/llog_swab.c
+ *
+ * Swabbing of llog datatypes (from disk or over the wire).
+ *
+ * Author: jacob berkman  <jacob@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOG
+
+
+#include <lustre_log.h>
+/*
+ * Dump the address and every field of a llogd_body at D_OTHER debug level.
+ */
+static void print_llogd_body(struct llogd_body *d)
+{
+	CDEBUG(D_OTHER, "llogd body: %p\n", d);
+	CDEBUG(D_OTHER, "\tlgd_logid.lgl_oi: "DOSTID"\n",
+	       POSTID(&d->lgd_logid.lgl_oi));
+	CDEBUG(D_OTHER, "\tlgd_logid.lgl_ogen: %#x\n", d->lgd_logid.lgl_ogen);
+	CDEBUG(D_OTHER, "\tlgd_ctxt_idx: %#x\n", d->lgd_ctxt_idx);
+	CDEBUG(D_OTHER, "\tlgd_llh_flags: %#x\n", d->lgd_llh_flags);
+	CDEBUG(D_OTHER, "\tlgd_index: %#x\n", d->lgd_index);
+	CDEBUG(D_OTHER, "\tlgd_saved_index: %#x\n", d->lgd_saved_index);
+	CDEBUG(D_OTHER, "\tlgd_len: %#x\n", d->lgd_len);
+	CDEBUG(D_OTHER, "\tlgd_cur_offset: "LPX64"\n", d->lgd_cur_offset);
+}
+
+/* Byte-swap, in place, the sequence, object id and version of a lu_fid. */
+void lustre_swab_lu_fid(struct lu_fid *fid)
+{
+	__swab64s(&fid->f_seq);
+	__swab32s(&fid->f_oid);
+	__swab32s(&fid->f_ver);
+}
+EXPORT_SYMBOL(lustre_swab_lu_fid);
+
+/*
+ * Byte-swap an ost_id in place.  Which layout is swabbed depends on the
+ * fid_seq_is_mdt0() test of the sequence as it is before swabbing: the
+ * id/seq pair when the test is true, the embedded lu_fid otherwise.
+ */
+void lustre_swab_ost_id(struct ost_id *oid)
+{
+	if (!fid_seq_is_mdt0(oid->oi.oi_seq)) {
+		lustre_swab_lu_fid(&oid->oi_fid);
+		return;
+	}
+
+	__swab64s(&oid->oi.oi_id);
+	__swab64s(&oid->oi.oi_seq);
+}
+EXPORT_SYMBOL(lustre_swab_ost_id);
+
+/*
+ * Byte-swap an llog_logid in place: the id and sequence of its object id
+ * (always as a plain id/seq pair) and the generation.
+ */
+void lustre_swab_llog_id(struct llog_logid *log_id)
+{
+	struct ost_id *oi = &log_id->lgl_oi;
+
+	__swab64s(&oi->oi.oi_id);
+	__swab64s(&oi->oi.oi_seq);
+	__swab32s(&log_id->lgl_ogen);
+}
+EXPORT_SYMBOL(lustre_swab_llog_id);
+
+/*
+ * Byte-swap every field of a llogd_body in place.  The body is dumped to the
+ * debug log both before and after the conversion.
+ */
+void lustre_swab_llogd_body(struct llogd_body *d)
+{
+	ENTRY;
+
+	print_llogd_body(d);		/* as received */
+
+	lustre_swab_llog_id(&d->lgd_logid);
+	__swab32s(&d->lgd_ctxt_idx);
+	__swab32s(&d->lgd_llh_flags);
+	__swab32s(&d->lgd_index);
+	__swab32s(&d->lgd_saved_index);
+	__swab32s(&d->lgd_len);
+	__swab64s(&d->lgd_cur_offset);
+
+	print_llogd_body(d);		/* after swabbing */
+
+	EXIT;
+}
+EXPORT_SYMBOL(lustre_swab_llogd_body);
+
+/*
+ * Byte-swap a llogd_conn_body in place: both generation counters, the log
+ * id and the context index.
+ */
+void lustre_swab_llogd_conn_body(struct llogd_conn_body *d)
+{
+	__swab64s(&d->lgdc_gen.mnt_cnt);
+	__swab64s(&d->lgdc_gen.conn_cnt);
+	lustre_swab_llog_id(&d->lgdc_logid);
+	__swab32s(&d->lgdc_ctxt_idx);
+}
+EXPORT_SYMBOL(lustre_swab_llogd_conn_body);
+
+/* Byte-swap, in place, the id, generation and type of an ll_fid. */
+void lustre_swab_ll_fid(struct ll_fid *fid)
+{
+	__swab64s(&fid->id);
+	__swab32s(&fid->generation);
+	__swab32s(&fid->f_type);
+}
+EXPORT_SYMBOL(lustre_swab_ll_fid);
+
+/* Byte-swap, in place, the start, end, index and flags of a lu_seq_range. */
+void lustre_swab_lu_seq_range(struct lu_seq_range *range)
+{
+	__swab64s(&range->lsr_start);
+	__swab64s(&range->lsr_end);
+	__swab32s(&range->lsr_index);
+	__swab32s(&range->lsr_flags);
+}
+EXPORT_SYMBOL(lustre_swab_lu_seq_range);
+/*
+ * Byte-swap one llog record in place.
+ *
+ * The common record header is swabbed first; the record type read from the
+ * swabbed header then selects which payload fields are converted.  For the
+ * types that set tail, the record tail (length and index) is swabbed last.
+ * OBD_CFG_REC and LLOG_PAD_MAGIC records get only their header swabbed here,
+ * and an unknown type is reported with CERROR() and otherwise left alone.
+ */
+void lustre_swab_llog_rec(struct llog_rec_hdr *rec)
+{
+	struct llog_rec_tail *tail = NULL;
+
+	__swab32s(&rec->lrh_len);
+	__swab32s(&rec->lrh_index);
+	__swab32s(&rec->lrh_type);
+	__swab32s(&rec->lrh_id);
+
+	switch (rec->lrh_type) { /* lrh_type is in host order from here on */
+	case OST_SZ_REC:
+	{
+		struct llog_size_change_rec *lsc =
+			(struct llog_size_change_rec *)rec;
+
+		lustre_swab_ll_fid(&lsc->lsc_fid);
+		__swab32s(&lsc->lsc_ioepoch);
+		tail = &lsc->lsc_tail;
+		break;
+	}
+	case MDS_UNLINK_REC:
+	{
+		struct llog_unlink_rec *lur = (struct llog_unlink_rec *)rec;
+
+		__swab64s(&lur->lur_oid);
+		__swab32s(&lur->lur_oseq);
+		__swab32s(&lur->lur_count);
+		tail = &lur->lur_tail;
+		break;
+	}
+	case MDS_UNLINK64_REC:
+	{
+		struct llog_unlink64_rec *lur =
+			(struct llog_unlink64_rec *)rec;
+
+		lustre_swab_lu_fid(&lur->lur_fid);
+		__swab32s(&lur->lur_count);
+		tail = &lur->lur_tail;
+		break;
+	}
+	case CHANGELOG_REC:
+	{
+		struct llog_changelog_rec *cr = (struct llog_changelog_rec*)rec;
+
+		__swab16s(&cr->cr.cr_namelen);
+		__swab16s(&cr->cr.cr_flags);
+		__swab32s(&cr->cr.cr_type);
+		__swab64s(&cr->cr.cr_index);
+		__swab64s(&cr->cr.cr_prev);
+		__swab64s(&cr->cr.cr_time);
+		lustre_swab_lu_fid(&cr->cr.cr_tfid);
+		lustre_swab_lu_fid(&cr->cr.cr_pfid);
+		if (CHANGELOG_REC_EXTENDED(&cr->cr)) { /* evaluated on the
+							* fields swabbed above;
+							* presumably keyed off
+							* cr_flags - confirm */
+			struct llog_changelog_ext_rec *ext =
+				(struct llog_changelog_ext_rec *)rec;
+
+			lustre_swab_lu_fid(&ext->cr.cr_sfid);
+			lustre_swab_lu_fid(&ext->cr.cr_spfid);
+			tail = &ext->cr_tail;
+		} else {
+			tail = &cr->cr_tail;
+		}
+		break;
+	}
+	case CHANGELOG_USER_REC:
+	{
+		struct llog_changelog_user_rec *cur =
+			(struct llog_changelog_user_rec*)rec;
+
+		__swab32s(&cur->cur_id);
+		__swab64s(&cur->cur_endrec);
+		tail = &cur->cur_tail;
+		break;
+	}
+
+	case MDS_SETATTR64_REC:
+	{
+		struct llog_setattr64_rec *lsr =
+			(struct llog_setattr64_rec *)rec;
+
+		lustre_swab_ost_id(&lsr->lsr_oi);
+		__swab32s(&lsr->lsr_uid);
+		__swab32s(&lsr->lsr_uid_h);
+		__swab32s(&lsr->lsr_gid);
+		__swab32s(&lsr->lsr_gid_h);
+		tail = &lsr->lsr_tail;
+		break;
+	}
+	case OBD_CFG_REC:
+		/* these are swabbed as they are consumed */
+		break;
+	case LLOG_HDR_MAGIC:
+	{
+		struct llog_log_hdr *llh = (struct llog_log_hdr *)rec;
+
+		__swab64s(&llh->llh_timestamp);
+		__swab32s(&llh->llh_count);
+		__swab32s(&llh->llh_bitmap_offset);
+		__swab32s(&llh->llh_flags);
+		__swab32s(&llh->llh_size);
+		__swab32s(&llh->llh_cat_idx);
+		tail = &llh->llh_tail;
+		break;
+	}
+	case LLOG_LOGID_MAGIC:
+	{
+		struct llog_logid_rec *lid = (struct llog_logid_rec *)rec;
+
+		lustre_swab_llog_id(&lid->lid_id);
+		tail = &lid->lid_tail;
+		break;
+	}
+	case LLOG_GEN_REC:
+	{
+		struct llog_gen_rec *lgr = (struct llog_gen_rec *)rec;
+
+		__swab64s(&lgr->lgr_gen.mnt_cnt);
+		__swab64s(&lgr->lgr_gen.conn_cnt);
+		tail = &lgr->lgr_tail;
+		break;
+	}
+	case LLOG_PAD_MAGIC:
+		break; /* header only; tail stays NULL and is not swabbed */
+	default:
+		CERROR("Unknown llog rec type %#x swabbing rec %p\n",
+		       rec->lrh_type, rec);
+	}
+
+	if (tail) { /* only set by the record types handled in full above */
+		__swab32s(&tail->lrt_len);
+		__swab32s(&tail->lrt_index);
+	}
+}
+EXPORT_SYMBOL(lustre_swab_llog_rec);
+/*
+ * Dump the address and the fields of an llog header (record header, header
+ * body and tail) at D_OTHER debug level.
+ */
+static void print_llog_hdr(struct llog_log_hdr *h)
+{
+	CDEBUG(D_OTHER, "llog header: %p\n", h);
+	CDEBUG(D_OTHER, "\tllh_hdr.lrh_index: %#x\n", h->llh_hdr.lrh_index);
+	CDEBUG(D_OTHER, "\tllh_hdr.lrh_len: %#x\n", h->llh_hdr.lrh_len);
+	CDEBUG(D_OTHER, "\tllh_hdr.lrh_type: %#x\n", h->llh_hdr.lrh_type);
+	CDEBUG(D_OTHER, "\tllh_timestamp: "LPX64"\n", h->llh_timestamp);
+	CDEBUG(D_OTHER, "\tllh_count: %#x\n", h->llh_count);
+	CDEBUG(D_OTHER, "\tllh_bitmap_offset: %#x\n", h->llh_bitmap_offset);
+	CDEBUG(D_OTHER, "\tllh_flags: %#x\n", h->llh_flags);
+	CDEBUG(D_OTHER, "\tllh_size: %#x\n", h->llh_size);
+	CDEBUG(D_OTHER, "\tllh_cat_idx: %#x\n", h->llh_cat_idx);
+	CDEBUG(D_OTHER, "\tllh_tail.lrt_index: %#x\n", h->llh_tail.lrt_index);
+	CDEBUG(D_OTHER, "\tllh_tail.lrt_len: %#x\n", h->llh_tail.lrt_len);
+}
+
+/*
+ * Byte-swap an llog header in place by handing its embedded record header
+ * to lustre_swab_llog_rec().  The header is dumped to the debug log before
+ * and after the conversion.
+ */
+void lustre_swab_llog_hdr(struct llog_log_hdr *h)
+{
+	ENTRY;
+
+	print_llog_hdr(h);		/* as received */
+	lustre_swab_llog_rec(&h->llh_hdr);
+	print_llog_hdr(h);		/* after swabbing */
+
+	EXIT;
+}
+EXPORT_SYMBOL(lustre_swab_llog_hdr);
+
+/*
+ * Dump the fixed fields of a lustre_cfg and, when lcfg_bufcount is below
+ * LUSTRE_CFG_MAX_BUFCOUNT, the length of each of its buffers at D_OTHER
+ * debug level.  Does nothing when D_OTHER debugging is switched off.
+ */
+static void print_lustre_cfg(struct lustre_cfg *lcfg)
+{
+	int i;
+	ENTRY;
+
+	if (!(libcfs_debug & D_OTHER)) { /* don't loop on nothing */
+		/* pair the ENTRY above on this path as well */
+		EXIT;
+		return;
+	}
+	CDEBUG(D_OTHER, "lustre_cfg: %p\n", lcfg);
+	CDEBUG(D_OTHER, "\tlcfg->lcfg_version: %#x\n", lcfg->lcfg_version);
+
+	CDEBUG(D_OTHER, "\tlcfg->lcfg_command: %#x\n", lcfg->lcfg_command);
+	CDEBUG(D_OTHER, "\tlcfg->lcfg_num: %#x\n", lcfg->lcfg_num);
+	CDEBUG(D_OTHER, "\tlcfg->lcfg_flags: %#x\n", lcfg->lcfg_flags);
+	CDEBUG(D_OTHER, "\tlcfg->lcfg_nid: %s\n", libcfs_nid2str(lcfg->lcfg_nid));
+
+	CDEBUG(D_OTHER, "\tlcfg->lcfg_bufcount: %d\n", lcfg->lcfg_bufcount);
+	/* an out-of-range count means the lengths array cannot be trusted */
+	if (lcfg->lcfg_bufcount < LUSTRE_CFG_MAX_BUFCOUNT)
+		for (i = 0; i < lcfg->lcfg_bufcount; i++)
+			CDEBUG(D_OTHER, "\tlcfg->lcfg_buflens[%d]: %d\n",
+			       i, lcfg->lcfg_buflens[i]);
+	EXIT;
+}
+
+/*
+ * Byte-swap a lustre_cfg in place.  The version is converted first; if it
+ * then differs from LUSTRE_CFG_VERSION an error is logged and the rest of
+ * the structure is left as it is.  Otherwise the remaining fixed fields and
+ * up to LUSTRE_CFG_MAX_BUFCOUNT buffer lengths are converted and the result
+ * is dumped to the debug log.
+ */
+void lustre_swab_lustre_cfg(struct lustre_cfg *lcfg)
+{
+	int i;
+	ENTRY;
+
+	__swab32s(&lcfg->lcfg_version);
+
+	if (lcfg->lcfg_version == LUSTRE_CFG_VERSION) {
+		__swab32s(&lcfg->lcfg_command);
+		__swab32s(&lcfg->lcfg_num);
+		__swab32s(&lcfg->lcfg_flags);
+		__swab64s(&lcfg->lcfg_nid);
+		__swab32s(&lcfg->lcfg_bufcount);
+		/* never walk past the end of the lengths array */
+		for (i = 0;
+		     i < lcfg->lcfg_bufcount && i < LUSTRE_CFG_MAX_BUFCOUNT;
+		     i++)
+			__swab32s(&lcfg->lcfg_buflens[i]);
+
+		print_lustre_cfg(lcfg);
+	} else {
+		CERROR("not swabbing lustre_cfg version %#x (expecting %#x)\n",
+		       lcfg->lcfg_version, LUSTRE_CFG_VERSION);
+	}
+
+	EXIT;
+}
+EXPORT_SYMBOL(lustre_swab_lustre_cfg);
+
+/* used only for compatibility with old on-disk cfg_marker data: this is the
+ * layout with 32-bit create/cancel timestamps that lustre_swab_cfg_marker()
+ * converts to struct cfg_marker */
+struct cfg_marker32 {
+	__u32   cm_step;
+	__u32   cm_flags;
+	__u32   cm_vers;
+	__u32   padding;
+	__u32   cm_createtime;
+	__u32   cm_canceltime;
+	char    cm_tgtname[MTI_NAME_MAXLEN];
+	char    cm_comment[MTI_NAME_MAXLEN];
+};
+/*
+ * MTI_NAME_MAXLEN less the size difference between the two marker layouts;
+ * used as the number of cm_comment bytes kept when converting an old marker.
+ */
+#define MTI_NAMELEN32    (MTI_NAME_MAXLEN - \
+	(sizeof(struct cfg_marker) - sizeof(struct cfg_marker32)))
+/*
+ * Convert a cfg_marker to the native layout and byte order, in place.
+ *
+ * When swab is non-zero the 32-bit cm_step, cm_flags and cm_vers fields are
+ * byte-swapped.  If size equals sizeof(struct cfg_marker32) the buffer holds
+ * the old layout with 32-bit timestamps: the comment and target name are
+ * moved to their native offsets (the comment is cut to MTI_NAMELEN32 bytes
+ * and NUL-terminated) and the timestamps are swabbed if requested and then
+ * stored in the native fields.  For any other size the 64-bit timestamps
+ * are swabbed when swab is set.
+ */
+void lustre_swab_cfg_marker(struct cfg_marker *marker, int swab, int size)
+{
+	struct cfg_marker32 *cm32 = (struct cfg_marker32*)marker;
+	ENTRY;
+
+	if (swab) {
+		__swab32s(&marker->cm_step);
+		__swab32s(&marker->cm_flags);
+		__swab32s(&marker->cm_vers);
+	}
+	if (size == sizeof(*cm32)) {
+		__u32 createtime, canceltime;
+		/* There was a problem with the original declaration of
+		 * cfg_marker on 32-bit systems because it used time_t as
+		 * a wire protocol structure, and didn't verify this in
+		 * wirecheck.  We now have to convert the offsets of the
+		 * later fields in order to work on 32- and 64-bit systems.
+		 *
+		 * Fortunately, the cm_comment field has no functional use
+		 * so can be sacrificed when converting the timestamp size.
+		 *
+		 * Overwrite fields from the end first, so they are not
+		 * clobbered, and use memmove() instead of memcpy() because
+		 * the source and target buffers overlap.  bug 16771 */
+		createtime = cm32->cm_createtime; /* saved before the moves */
+		canceltime = cm32->cm_canceltime; /* below can overwrite them */
+		memmove(marker->cm_comment, cm32->cm_comment, MTI_NAMELEN32);
+		marker->cm_comment[MTI_NAMELEN32 - 1] = '\0';
+		memmove(marker->cm_tgtname, cm32->cm_tgtname,
+			sizeof(marker->cm_tgtname));
+		if (swab) {
+			__swab32s(&createtime);
+			__swab32s(&canceltime);
+		}
+		marker->cm_createtime = createtime;
+		marker->cm_canceltime = canceltime;
+		CDEBUG(D_CONFIG, "Find old cfg_marker(Srv32b,Clt64b) "
+		       "for target %s, converting\n",
+		       marker->cm_tgtname);
+	} else if (swab) {
+		__swab64s(&marker->cm_createtime);
+		__swab64s(&marker->cm_canceltime);
+	}
+
+	EXIT;
+	return;
+}
+EXPORT_SYMBOL(lustre_swab_cfg_marker);

diff --git a/drivers/staging/lustre/lustre/obdclass/llog_test.c b/drivers/staging/lustre/lustre/obdclass/llog_test.c
new file mode 100644
index 0000000..d397f78
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/llog_test.c

@@ -0,0 +1,1087 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/llog_test.c
+ *
+ * Author: Phil Schwan <phil@clusterfs.com>
+ * Author: Mikhail Pershin <mike.pershin@intel.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/module.h>
+#include <linux/init.h>
+
+#include <obd_class.h>
+#include <lustre_fid.h>
+#include <lustre_log.h>
+
+/* This is slightly more than the number of records that can fit into a
+ * single llog file, because the llog_log_header takes up some of the
+ * space in the first block that cannot be used for the bitmap.
+ *
+ * Test 4d writes this many records through the catalog, and the cancel
+ * callback used by test 5c stops once it has cancelled this many. */
+#define LLOG_TEST_RECNUM  (LLOG_CHUNK_SIZE * 8)
+
+static int llog_test_rand; /* set from cfs_rand() in llog_test_setup(); test log names are derived from it */
+static struct obd_uuid uuid = { .uuid = "test_uuid" }; /* handed to llog_init_handle() by the tests */
+static struct llog_logid cat_logid; /* id of the catalog created in test 4, reopened in test 5 */
+
+struct llog_mini_rec { /* record with only a header and a tail; tests 4 and 5 size it as LLOG_MIN_REC_SIZE */
+	struct llog_rec_hdr     lmr_hdr;
+	struct llog_rec_tail    lmr_tail;
+} __attribute__((packed));
+
+/*
+ * Sanity-check an llog handle against the record count a test expects.
+ *
+ * Counts the bits set in the header bitmap and compares both that count and
+ * the header's llh_count with @num_recs, then makes sure lgh_last_idx is not
+ * below the highest set bit.  @test only prefixes the error messages.
+ *
+ * Returns 0 when everything matches and -ERANGE otherwise.
+ */
+static int verify_handle(char *test, struct llog_handle *llh, int num_recs)
+{
+	int bit;
+	int top_bit = 0;
+	int set_bits = 0;
+
+	/* walk the whole bitmap, remembering the last bit that is set */
+	for (bit = 0; bit < LLOG_BITMAP_BYTES * 8; bit++) {
+		if (!ext2_test_bit(bit, llh->lgh_hdr->llh_bitmap))
+			continue;
+		top_bit = bit;
+		set_bits++;
+	}
+
+	if (set_bits != num_recs) {
+		CERROR("%s: expected %d active recs after write, found %d\n",
+		       test, num_recs, set_bits);
+		RETURN(-ERANGE);
+	}
+
+	if (llh->lgh_hdr->llh_count != num_recs) {
+		CERROR("%s: handle->count is %d, expected %d after write\n",
+		       test, llh->lgh_hdr->llh_count, num_recs);
+		RETURN(-ERANGE);
+	}
+
+	if (llh->lgh_last_idx < top_bit) {
+		CERROR("%s: handle->last_idx is %d, expected %d after write\n",
+		       test, llh->lgh_last_idx, top_bit);
+		RETURN(-ERANGE);
+	}
+
+	RETURN(0);
+}
+
+/* Test named-log create/open, close.
+ *
+ * 1a creates the log called @name through the LLOG_TEST_ORIG_CTXT context of
+ * @obd, initialises it as a plain llog and expects verify_handle() to find
+ * exactly one record in it.  1b closes the log again.
+ *
+ * Returns 0 on success or the first error seen; a close failure is only
+ * returned when nothing failed before it. */
+static int llog_test_1(const struct lu_env *env,
+		       struct obd_device *obd, char *name)
+{
+	struct llog_handle	*llh;
+	struct llog_ctxt	*ctxt;
+	int rc;
+	int rc2;
+
+	ENTRY;
+
+	CWARN("1a: create a log with name: %s\n", name);
+	ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT);
+	LASSERT(ctxt);
+
+	rc = llog_open_create(env, ctxt, &llh, NULL, name);
+	if (rc) {
+		CERROR("1a: llog_create with name %s failed: %d\n", name, rc);
+		GOTO(out, rc);
+	}
+	rc = llog_init_handle(env, llh, LLOG_F_IS_PLAIN, &uuid);
+	if (rc) {
+		CERROR("1a: can't init llog handle: %d\n", rc);
+		GOTO(out_close, rc);
+	}
+
+	rc = verify_handle("1", llh, 1); /* result is carried in rc across the close below */
+
+	CWARN("1b: close newly-created log\n");
+out_close:
+	rc2 = llog_close(env, llh);
+	if (rc2) {
+		CERROR("1b: close log %s failed: %d\n", name, rc2);
+		if (rc == 0)
+			rc = rc2;
+	}
+out:
+	llog_ctxt_put(ctxt);
+	RETURN(rc);
+}
+
+/* Test named-log reopen; returns opened log on success.
+ *
+ * 2a reopens the existing log @name, initialises it as a plain llog and
+ * checks that it holds one record.  On success the open handle is left in
+ * *@llh for the caller; if a step after the open fails, the handle is
+ * closed here.  Steps 2b-2d are currently unreachable, see the XXX note
+ * in the body.
+ *
+ * Returns 0 on success or the error of the failing step. */
+static int llog_test_2(const struct lu_env *env, struct obd_device *obd,
+		       char *name, struct llog_handle **llh)
+{
+	struct llog_ctxt	*ctxt;
+	struct llog_handle	*loghandle;
+	struct llog_logid	 logid;
+	int			 rc;
+
+	ENTRY;
+
+	CWARN("2a: re-open a log with name: %s\n", name);
+	ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT);
+	LASSERT(ctxt);
+
+	rc = llog_open(env, ctxt, llh, NULL, name, LLOG_OPEN_EXISTS);
+	if (rc) {
+		CERROR("2a: re-open log with name %s failed: %d\n", name, rc);
+		GOTO(out_put, rc);
+	}
+
+	rc = llog_init_handle(env, *llh, LLOG_F_IS_PLAIN, &uuid);
+	if (rc) {
+		CERROR("2a: can't init llog handle: %d\n", rc);
+		GOTO(out_close_llh, rc);
+	}
+
+	rc = verify_handle("2", *llh, 1);
+	if (rc)
+		GOTO(out_close_llh, rc);
+
+	/* XXX: there is known issue with tests 2b, MGS is not able to create
+	 * anonymous llog, exit now to allow following tests run.
+	 * It is fixed in upcoming llog over OSD code */
+	GOTO(out_put, rc); /* unconditional: steps 2b-2d below never execute */
+
+	CWARN("2b: create a log without specified NAME & LOGID\n");
+	rc = llog_open_create(env, ctxt, &loghandle, NULL, NULL);
+	if (rc) {
+		CERROR("2b: create log failed\n");
+		GOTO(out_close_llh, rc);
+	}
+	rc = llog_init_handle(env, loghandle, LLOG_F_IS_PLAIN, &uuid);
+	if (rc) {
+		CERROR("2b: can't init llog handle: %d\n", rc);
+		GOTO(out_close, rc);
+	}
+
+	logid = loghandle->lgh_id; /* remember the id so the log can be reopened by it */
+	llog_close(env, loghandle);
+
+	CWARN("2c: re-open the log by LOGID\n");
+	rc = llog_open(env, ctxt, &loghandle, &logid, NULL, LLOG_OPEN_EXISTS);
+	if (rc) {
+		CERROR("2c: re-open log by LOGID failed\n");
+		GOTO(out_close_llh, rc);
+	}
+
+	rc = llog_init_handle(env, loghandle, LLOG_F_IS_PLAIN, &uuid);
+	if (rc) {
+		CERROR("2c: can't init llog handle: %d\n", rc);
+		GOTO(out_close, rc);
+	}
+
+	CWARN("2b: destroy this log\n");
+	rc = llog_destroy(env, loghandle);
+	if (rc)
+		CERROR("2d: destroy log failed\n");
+out_close:
+	llog_close(env, loghandle);
+out_close_llh:
+	if (rc) /* keep *llh open for the caller on success */
+		llog_close(env, *llh);
+out_put:
+	llog_ctxt_put(ctxt);
+
+	RETURN(rc);
+}
+
+/* Test record writing, single and in bulk.
+ *
+ * Writes records to the already open plain log @llh and, after every step,
+ * checks the handle with verify_handle() against a running record count:
+ * 3a one llog_gen_rec, 3b ten OBD_CFG_REC records with 8-byte buffers,
+ * 3c 1000 more llog_gen_rec records, 3d alternating 24- and 32-byte
+ * records until llog_write() reports -ENOSPC.
+ *
+ * Returns 0 on success, the error of the first failing step, or -EINVAL
+ * when 3d never runs into -ENOSPC. */
+static int llog_test_3(const struct lu_env *env, struct obd_device *obd,
+		       struct llog_handle *llh)
+{
+	struct llog_gen_rec	 lgr;
+	int			 rc, i;
+	int			 num_recs = 1; /* 1 for the header */
+
+	ENTRY;
+
+	lgr.lgr_hdr.lrh_len = lgr.lgr_tail.lrt_len = sizeof(lgr);
+	lgr.lgr_hdr.lrh_type = LLOG_GEN_REC;
+
+	CWARN("3a: write one create_rec\n");
+	rc = llog_write(env, llh,  &lgr.lgr_hdr, NULL, 0, NULL, -1);
+	num_recs++; /* counted before rc is checked; a failure returns right below */
+	if (rc < 0) {
+		CERROR("3a: write one log record failed: %d\n", rc);
+		RETURN(rc);
+	}
+
+	rc = verify_handle("3a", llh, num_recs);
+	if (rc)
+		RETURN(rc);
+
+	CWARN("3b: write 10 cfg log records with 8 bytes bufs\n");
+	for (i = 0; i < 10; i++) {
+		struct llog_rec_hdr	hdr;
+		char			buf[8];
+
+		hdr.lrh_len = 8;
+		hdr.lrh_type = OBD_CFG_REC;
+		memset(buf, 0, sizeof buf);
+		rc = llog_write(env, llh, &hdr, NULL, 0, buf, -1);
+		if (rc < 0) {
+			CERROR("3b: write 10 records failed at #%d: %d\n",
+			       i + 1, rc);
+			RETURN(rc);
+		}
+		num_recs++;
+	}
+
+	rc = verify_handle("3b", llh, num_recs);
+	if (rc)
+		RETURN(rc);
+
+	CWARN("3c: write 1000 more log records\n");
+	for (i = 0; i < 1000; i++) {
+		rc = llog_write(env, llh, &lgr.lgr_hdr, NULL, 0, NULL, -1);
+		if (rc < 0) {
+			CERROR("3c: write 1000 records failed at #%d: %d\n",
+			       i + 1, rc);
+			RETURN(rc);
+		}
+		num_recs++;
+	}
+
+	rc = verify_handle("3c", llh, num_recs);
+	if (rc)
+		RETURN(rc);
+
+	CWARN("3d: write log more than BITMAP_SIZE, return -ENOSPC\n");
+	for (i = 0; i < LLOG_BITMAP_SIZE(llh->lgh_hdr) + 1; i++) {
+		struct llog_rec_hdr	hdr;
+		char			buf_even[24];
+		char			buf_odd[32];
+
+		memset(buf_odd, 0, sizeof buf_odd);
+		memset(buf_even, 0, sizeof buf_even);
+		if ((i % 2) == 0) {
+			hdr.lrh_len = 24;
+			hdr.lrh_type = OBD_CFG_REC;
+			rc = llog_write(env, llh, &hdr, NULL, 0, buf_even, -1);
+		} else {
+			hdr.lrh_len = 32;
+			hdr.lrh_type = OBD_CFG_REC;
+			rc = llog_write(env, llh, &hdr, NULL, 0, buf_odd, -1);
+		}
+		if (rc == -ENOSPC) { /* the expected way out of this loop */
+			break;
+		} else if (rc < 0) {
+			CERROR("3d: write recs failed at #%d: %d\n",
+			       i + 1, rc);
+			RETURN(rc);
+		}
+		num_recs++;
+	}
+	if (rc != -ENOSPC) {
+		CWARN("3d: write record more than BITMAP size!\n");
+		RETURN(-EINVAL);
+	}
+	CWARN("3d: wrote %d more records before end of llog is reached\n",
+	      num_recs); /* num_recs is the running total of all steps, header included */
+
+	rc = verify_handle("3d", llh, num_recs);
+
+	RETURN(rc);
+}
+
+/* Test catalogue additions.
+ *
+ * 4a creates a catalog named after llog_test_rand + 1 and saves its id in
+ *    cat_logid so that test 5 can reopen it.
+ * 4b adds one minimal record through the catalog and keeps its cookie.
+ * 4c cancels that record again using the cookie.
+ * 4d adds LLOG_TEST_RECNUM minimal records and expects a new plain llog to
+ *    show up in the catalog.
+ * 4e adds five large records, one record per block.
+ * 4f closes the catalog.
+ *
+ * Returns 0 on success and a negative errno on failure; a close error is
+ * only returned when nothing failed before it. */
+static int llog_test_4(const struct lu_env *env, struct obd_device *obd)
+{
+	struct llog_handle	*cath;
+	char			 name[10];
+	int			 rc, rc2, i, buflen;
+	struct llog_mini_rec	 lmr;
+	struct llog_cookie	 cookie;
+	struct llog_ctxt	*ctxt;
+	int			 num_recs = 0;
+	char			*buf;
+	struct llog_rec_hdr	 rec;
+
+	ENTRY;
+
+	ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT);
+	LASSERT(ctxt);
+
+	lmr.lmr_hdr.lrh_len = lmr.lmr_tail.lrt_len = LLOG_MIN_REC_SIZE;
+	lmr.lmr_hdr.lrh_type = 0xf00f00;
+
+	sprintf(name, "%x", llog_test_rand + 1);
+	CWARN("4a: create a catalog log with name: %s\n", name);
+	rc = llog_open_create(env, ctxt, &cath, NULL, name);
+	if (rc) {
+		CERROR("4a: llog_create with name %s failed: %d\n", name, rc);
+		GOTO(ctxt_release, rc);
+	}
+	rc = llog_init_handle(env, cath, LLOG_F_IS_CAT, &uuid);
+	if (rc) {
+		CERROR("4a: can't init llog handle: %d\n", rc);
+		GOTO(out, rc);
+	}
+
+	num_recs++;
+	cat_logid = cath->lgh_id;
+
+	CWARN("4b: write 1 record into the catalog\n");
+	rc = llog_cat_add(env, cath, &lmr.lmr_hdr, &cookie, NULL);
+	if (rc != 1) {
+		CERROR("4b: write 1 catalog record failed at: %d\n", rc);
+		/* anything but 1 is a failure here; do not let an
+		 * unexpected 0 or positive value reach the caller, which
+		 * only treats non-zero as an error */
+		if (rc >= 0)
+			rc = -EINVAL;
+		GOTO(out, rc);
+	}
+	num_recs++;
+	rc = verify_handle("4b", cath, 2);
+	if (rc)
+		GOTO(out, rc);
+
+	rc = verify_handle("4b", cath->u.chd.chd_current_log, num_recs);
+	if (rc)
+		GOTO(out, rc);
+
+	CWARN("4c: cancel 1 log record\n");
+	rc = llog_cat_cancel_records(env, cath, 1, &cookie);
+	if (rc) {
+		CERROR("4c: cancel 1 catalog based record failed: %d\n", rc);
+		GOTO(out, rc);
+	}
+	num_recs--;
+
+	rc = verify_handle("4c", cath->u.chd.chd_current_log, num_recs);
+	if (rc)
+		GOTO(out, rc);
+
+	CWARN("4d: write %d more log records\n", LLOG_TEST_RECNUM);
+	for (i = 0; i < LLOG_TEST_RECNUM; i++) {
+		rc = llog_cat_add(env, cath, &lmr.lmr_hdr, NULL, NULL);
+		if (rc) {
+			CERROR("4d: write %d records failed at #%d: %d\n",
+			       LLOG_TEST_RECNUM, i + 1, rc);
+			GOTO(out, rc);
+		}
+		num_recs++;
+	}
+
+	/* make sure new plain llog appears */
+	rc = verify_handle("4d", cath, 3);
+	if (rc)
+		GOTO(out, rc);
+
+	CWARN("4e: add 5 large records, one record per block\n");
+	/* payload sized so that header + payload + tail is one chunk */
+	buflen = LLOG_CHUNK_SIZE - sizeof(struct llog_rec_hdr) -
+		 sizeof(struct llog_rec_tail);
+	OBD_ALLOC(buf, buflen);
+	if (buf == NULL)
+		GOTO(out, rc = -ENOMEM);
+	for (i = 0; i < 5; i++) {
+		rec.lrh_len = buflen;
+		rec.lrh_type = OBD_CFG_REC;
+		rc = llog_cat_add(env, cath, &rec, NULL, buf);
+		if (rc) {
+			CERROR("4e: write 5 records failed at #%d: %d\n",
+			       i + 1, rc);
+			GOTO(out_free, rc);
+		}
+		num_recs++;
+	}
+out_free:
+	OBD_FREE(buf, buflen);
+out:
+	CWARN("4f: put newly-created catalog\n");
+	rc2 = llog_cat_close(env, cath);
+	if (rc2) {
+		CERROR("4: close log %s failed: %d\n", name, rc2);
+		if (rc == 0)
+			rc = rc2;
+	}
+ctxt_release:
+	llog_ctxt_put(ctxt);
+	RETURN(rc);
+}
+
+/* number of catalog entries seen by cat_print_cb(); reset by the caller */
+static int cat_counter;
+
+/*
+ * Record callback for walking a catalog: every record has to be a
+ * LLOG_LOGID_MAGIC entry.  Each one is printed together with the fid of the
+ * log it was found in and counted in cat_counter.
+ *
+ * Returns 0, or -EINVAL for a record of any other type.
+ */
+static int cat_print_cb(const struct lu_env *env, struct llog_handle *llh,
+			struct llog_rec_hdr *rec, void *data)
+{
+	struct lu_fid		 entry_fid = {0};
+	struct llog_logid_rec	*entry;
+
+	if (rec->lrh_type != LLOG_LOGID_MAGIC) {
+		CERROR("invalid record in catalog\n");
+		RETURN(-EINVAL);
+	}
+
+	/* the record is known to be a logid record from here on */
+	entry = (struct llog_logid_rec *)rec;
+	logid_to_fid(&entry->lid_id, &entry_fid);
+
+	CWARN("seeing record at index %d - "DFID" in log "DFID"\n",
+	      rec->lrh_index, PFID(&entry_fid),
+	      PFID(lu_object_fid(&llh->lgh_obj->do_lu)));
+
+	cat_counter += 1;
+
+	RETURN(0);
+}
+
+/* number of records seen by the plain-log callbacks; set by the caller */
+static int plain_counter;
+
+/*
+ * Record callback for walking a plain log: logs the index of each record
+ * and the fid derived from the log id, and counts it in plain_counter.
+ *
+ * Returns 0, or -EINVAL when the log header lacks LLOG_F_IS_PLAIN.
+ */
+static int plain_print_cb(const struct lu_env *env, struct llog_handle *llh,
+			  struct llog_rec_hdr *rec, void *data)
+{
+	struct lu_fid log_fid = {0};
+	int is_plain = (llh->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN) != 0;
+
+	if (!is_plain) {
+		CERROR("log is not plain\n");
+		RETURN(-EINVAL);
+	}
+
+	logid_to_fid(&llh->lgh_id, &log_fid);
+
+	CDEBUG(D_INFO, "seeing record at index %d in log "DFID"\n",
+	       rec->lrh_index, PFID(&log_fid));
+
+	plain_counter += 1;
+
+	RETURN(0);
+}
+
+/* number of records cancelled by llog_cancel_rec_cb(); reset by the caller */
+static int cancel_count;
+
+/*
+ * Record callback that cancels every record it is handed, through the
+ * catalog the plain log belongs to.  The result of the cancel call itself
+ * is not checked.
+ *
+ * Returns -EINVAL when the log is not plain, -LLOG_EEMPTY once
+ * LLOG_TEST_RECNUM records have been cancelled, and 0 otherwise.
+ */
+static int llog_cancel_rec_cb(const struct lu_env *env,
+			      struct llog_handle *llh,
+			      struct llog_rec_hdr *rec, void *data)
+{
+	struct llog_cookie target;
+	int ret = 0;
+
+	if (!(llh->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN)) {
+		CERROR("log is not plain\n");
+		RETURN(-EINVAL);
+	}
+
+	/* a cookie is just the log id plus the index of the record in it */
+	target.lgc_index = rec->lrh_index;
+	target.lgc_lgl = llh->lgh_id;
+
+	llog_cat_cancel_records(env, llh->u.phd.phd_cat_handle, 1, &target);
+	cancel_count += 1;
+	if (cancel_count == LLOG_TEST_RECNUM)
+		ret = -LLOG_EEMPTY;
+	RETURN(ret);
+}
+
+/* Test log and catalogue processing.
+ *
+ * Works on the catalog left behind by test 4, reopened through cat_logid:
+ * 5a reopen it, 5b expect 2 catalog entries, 5c cancel LLOG_TEST_RECNUM
+ * records and expect 1 entry to remain, 5d add one more record, 5e/5f walk
+ * the plain records forwards and backwards expecting 6 each time, 5g close.
+ *
+ * Returns 0 on success and non-zero on failure; a close error is only
+ * returned when nothing failed before it. */
+static int llog_test_5(const struct lu_env *env, struct obd_device *obd)
+{
+	struct llog_handle	*llh = NULL;
+	char			 name[10];
+	int			 rc, rc2;
+	struct llog_mini_rec	 lmr;
+	struct llog_ctxt	*ctxt;
+
+	ENTRY;
+
+	ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT);
+	LASSERT(ctxt);
+
+	/* Same name test 4 created the catalog under.  It is only printed
+	 * in the 5g error message, but must not be left uninitialised. */
+	sprintf(name, "%x", llog_test_rand + 1);
+
+	lmr.lmr_hdr.lrh_len = lmr.lmr_tail.lrt_len = LLOG_MIN_REC_SIZE;
+	lmr.lmr_hdr.lrh_type = 0xf00f00;
+
+	CWARN("5a: re-open catalog by id\n");
+	rc = llog_open(env, ctxt, &llh, &cat_logid, NULL, LLOG_OPEN_EXISTS);
+	if (rc) {
+		CERROR("5a: llog_create with logid failed: %d\n", rc);
+		GOTO(out_put, rc);
+	}
+
+	rc = llog_init_handle(env, llh, LLOG_F_IS_CAT, &uuid);
+	if (rc) {
+		CERROR("5a: can't init llog handle: %d\n", rc);
+		GOTO(out, rc);
+	}
+
+	CWARN("5b: print the catalog entries.. we expect 2\n");
+	cat_counter = 0;
+	rc = llog_process(env, llh, cat_print_cb, "test 5", NULL);
+	if (rc) {
+		CERROR("5b: process with cat_print_cb failed: %d\n", rc);
+		GOTO(out, rc);
+	}
+	if (cat_counter != 2) {
+		CERROR("5b: %d entries in catalog\n", cat_counter);
+		GOTO(out, rc = -EINVAL);
+	}
+
+	CWARN("5c: Cancel %d records, see one log zapped\n", LLOG_TEST_RECNUM);
+	cancel_count = 0;
+	rc = llog_cat_process(env, llh, llog_cancel_rec_cb, "foobar", 0, 0);
+	if (rc != -LLOG_EEMPTY) {
+		CERROR("5c: process with cat_cancel_cb failed: %d\n", rc);
+		/* the callback must end the walk with -LLOG_EEMPTY; make
+		 * sure a plain 0 is not reported as a pass */
+		if (rc >= 0)
+			rc = -EINVAL;
+		GOTO(out, rc);
+	}
+
+	CWARN("5c: print the catalog entries.. we expect 1\n");
+	cat_counter = 0;
+	rc = llog_process(env, llh, cat_print_cb, "test 5", NULL);
+	if (rc) {
+		CERROR("5c: process with cat_print_cb failed: %d\n", rc);
+		GOTO(out, rc);
+	}
+	if (cat_counter != 1) {
+		CERROR("5c: %d entries in catalog\n", cat_counter);
+		GOTO(out, rc = -EINVAL);
+	}
+
+	CWARN("5d: add 1 record to the log with many canceled empty pages\n");
+	rc = llog_cat_add(env, llh, &lmr.lmr_hdr, NULL, NULL);
+	if (rc) {
+		CERROR("5d: add record to the log with many canceled empty "
+		       "pages failed\n");
+		GOTO(out, rc);
+	}
+
+	CWARN("5e: print plain log entries.. expect 6\n");
+	plain_counter = 0;
+	rc = llog_cat_process(env, llh, plain_print_cb, "foobar", 0, 0);
+	if (rc) {
+		CERROR("5e: process with plain_print_cb failed: %d\n", rc);
+		GOTO(out, rc);
+	}
+	if (plain_counter != 6) {
+		CERROR("5e: found %d records\n", plain_counter);
+		GOTO(out, rc = -EINVAL);
+	}
+
+	CWARN("5f: print plain log entries reversely.. expect 6\n");
+	plain_counter = 0;
+	rc = llog_cat_reverse_process(env, llh, plain_print_cb, "foobar");
+	if (rc) {
+		CERROR("5f: reversely process with plain_print_cb failed:"
+		       "%d\n", rc);
+		GOTO(out, rc);
+	}
+	if (plain_counter != 6) {
+		CERROR("5f: found %d records\n", plain_counter);
+		GOTO(out, rc = -EINVAL);
+	}
+
+out:
+	CWARN("5g: close re-opened catalog\n");
+	rc2 = llog_cat_close(env, llh);
+	if (rc2) {
+		CERROR("5g: close log %s failed: %d\n", name, rc2);
+		if (rc == 0)
+			rc = rc2;
+	}
+out_put:
+	llog_ctxt_put(ctxt);
+
+	RETURN(rc);
+}
+
+/* Test client api; open log by name and process.
+ *
+ * 6a finds the MGC client device connected to the obd behind the test
+ *    context, checks that connecting to it again yields -EALREADY, and
+ *    opens the log @name through the MGC's LLOG_CONFIG_REPL_CTXT context.
+ * 6b walks the log forwards, 6c backwards; after each walk the number of
+ *    records seen (plus one for the header) is checked with
+ *    verify_handle().
+ *
+ * Returns 0 on success and a negative errno on failure; a close error is
+ * only returned when nothing failed before it. */
+static int llog_test_6(const struct lu_env *env, struct obd_device *obd,
+		       char *name)
+{
+	struct obd_device	*mgc_obd;
+	struct llog_ctxt	*ctxt;
+	struct obd_uuid		*mgs_uuid;
+	struct obd_export	*exp;
+	/* deliberately shadows the file-level uuid for the connect attempt */
+	struct obd_uuid		 uuid = { "LLOG_TEST6_UUID" };
+	struct llog_handle	*llh = NULL;
+	struct llog_ctxt	*nctxt;
+	int			 rc, rc2;
+
+	ENTRY;
+
+	ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT);
+	LASSERT(ctxt);
+	mgs_uuid = &ctxt->loc_exp->exp_obd->obd_uuid;
+
+	CWARN("6a: re-open log %s using client API\n", name);
+	mgc_obd = class_find_client_obd(mgs_uuid, LUSTRE_MGC_NAME, NULL);
+	if (mgc_obd == NULL) {
+		CERROR("6a: no MGC devices connected to %s found.\n",
+		       mgs_uuid->uuid);
+		GOTO(ctxt_release, rc = -ENOENT);
+	}
+
+	rc = obd_connect(NULL, &exp, mgc_obd, &uuid,
+			 NULL /* obd_connect_data */, NULL);
+	if (rc != -EALREADY) {
+		CERROR("6a: connect on connected MGC (%s) failed to return"
+		       " -EALREADY\n", mgc_obd->obd_name);
+		/* an unexpected successful connect must be undone */
+		if (rc == 0)
+			obd_disconnect(exp);
+		GOTO(ctxt_release, rc = -EINVAL);
+	}
+
+	nctxt = llog_get_context(mgc_obd, LLOG_CONFIG_REPL_CTXT);
+	rc = llog_open(env, nctxt, &llh, NULL, name, LLOG_OPEN_EXISTS);
+	if (rc) {
+		CERROR("6a: llog_open failed %d\n", rc);
+		GOTO(nctxt_put, rc);
+	}
+
+	rc = llog_init_handle(env, llh, LLOG_F_IS_PLAIN, NULL);
+	if (rc) {
+		CERROR("6a: llog_init_handle failed %d\n", rc);
+		GOTO(parse_out, rc);
+	}
+
+	plain_counter = 1; /* llog header is first record */
+	CWARN("6b: process log %s using client API\n", name);
+	rc = llog_process(env, llh, plain_print_cb, NULL, NULL);
+	if (rc) {
+		/* stop here so that the verify below cannot hide the error */
+		CERROR("6b: llog_process failed %d\n", rc);
+		GOTO(parse_out, rc);
+	}
+	CWARN("6b: processed %d records\n", plain_counter);
+
+	rc = verify_handle("6b", llh, plain_counter);
+	if (rc)
+		GOTO(parse_out, rc);
+
+	plain_counter = 1; /* llog header is first record */
+	CWARN("6c: process log %s reversely using client API\n", name);
+	rc = llog_reverse_process(env, llh, plain_print_cb, NULL, NULL);
+	if (rc) {
+		/* same as 6b: do not let verify_handle() overwrite rc */
+		CERROR("6c: llog_reverse_process failed %d\n", rc);
+		GOTO(parse_out, rc);
+	}
+	CWARN("6c: processed %d records\n", plain_counter);
+
+	rc = verify_handle("6c", llh, plain_counter);
+	if (rc)
+		GOTO(parse_out, rc);
+
+parse_out:
+	rc2 = llog_close(env, llh);
+	if (rc2) {
+		CERROR("6: llog_close failed: rc = %d\n", rc2);
+		if (rc == 0)
+			rc = rc2;
+	}
+nctxt_put:
+	llog_ctxt_put(nctxt);
+ctxt_release:
+	llog_ctxt_put(ctxt);
+	RETURN(rc);
+}
+
+static union { /* scratch record for test 7: llog_test_7() fills in one member, llog_test_7_sub() writes it via .lrh */
+	struct llog_rec_hdr		lrh;   /* common header */
+	struct llog_logid_rec		llr;   /* LLOG_LOGID_MAGIC */
+	struct llog_unlink64_rec	lur;   /* MDS_UNLINK64_REC */
+	struct llog_setattr64_rec	lsr64; /* MDS_SETATTR64_REC */
+	struct llog_size_change_rec	lscr;  /* OST_SZ_REC */
+	struct llog_changelog_rec	lcr;   /* CHANGELOG_REC */
+	struct llog_changelog_user_rec	lcur;  /* CHANGELOG_USER_REC */
+	struct llog_gen_rec		lgr;   /* LLOG_GEN_REC */
+} llog_records;
+
+/*
+ * Record callback for the forward walk in test 7: logs type and index of
+ * the record together with the fid derived from the log id, and counts the
+ * record in plain_counter.  Always returns 0.
+ */
+static int test_7_print_cb(const struct lu_env *env, struct llog_handle *llh,
+			   struct llog_rec_hdr *rec, void *data)
+{
+	struct lu_fid log_fid = {0};
+
+	logid_to_fid(&llh->lgh_id, &log_fid);
+
+	CDEBUG(D_OTHER, "record type %#x at index %d in log "DFID"\n",
+	       rec->lrh_type, rec->lrh_index, PFID(&log_fid));
+
+	plain_counter += 1;
+	return 0;
+}
+
+/*
+ * Record callback for the reverse walk in test 7: counts the record in
+ * plain_counter and asks for it to be deleted.
+ */
+static int test_7_cancel_cb(const struct lu_env *env, struct llog_handle *llh,
+			    struct llog_rec_hdr *rec, void *data)
+{
+	plain_counter += 1;
+
+	/* test LLOG_DEL_RECORD is working */
+	return LLOG_DEL_RECORD;
+}
+
+/*
+ * Fill, walk and empty one plain llog with the record that the caller has
+ * prepared in llog_records.
+ *
+ * Creates an unnamed log on @ctxt with LLOG_F_ZAP_WHEN_EMPTY set, writes the
+ * record until llog_write() reports -ENOSPC, checks the resulting count,
+ * walks the log forwards while counting, then walks it backwards with a
+ * callback that deletes every record.  Both walks have to see the same
+ * number of records and the log must no longer exist afterwards.
+ *
+ * Returns 0 on success and a negative errno on failure.  The log is always
+ * closed, and additionally destroyed when the test failed.
+ */
+static int llog_test_7_sub(const struct lu_env *env, struct llog_ctxt *ctxt)
+{
+	struct llog_handle	*llh;
+	int			 rc = 0, i, process_count;
+	int			 num_recs = 0;
+
+	ENTRY;
+
+	rc = llog_open_create(env, ctxt, &llh, NULL, NULL);
+	if (rc) {
+		CERROR("7_sub: create log failed\n");
+		RETURN(rc);
+	}
+
+	rc = llog_init_handle(env, llh,
+			      LLOG_F_IS_PLAIN | LLOG_F_ZAP_WHEN_EMPTY,
+			      &uuid);
+	if (rc) {
+		CERROR("7_sub: can't init llog handle: %d\n", rc);
+		GOTO(out_close, rc);
+	}
+	for (i = 0; i < LLOG_BITMAP_SIZE(llh->lgh_hdr); i++) {
+		rc = llog_write(env, llh, &llog_records.lrh, NULL, 0,
+				NULL, -1);
+		if (rc == -ENOSPC) {
+			break;
+		} else if (rc < 0) {
+			CERROR("7_sub: write recs failed at #%d: %d\n",
+			       i + 1, rc);
+			GOTO(out_close, rc);
+		}
+		num_recs++;
+	}
+	/* the loop is only allowed to end because the log filled up */
+	if (rc != -ENOSPC) {
+		CWARN("7_sub: write record more than BITMAP size!\n");
+		GOTO(out_close, rc = -EINVAL);
+	}
+
+	/* + 1 for the log header record */
+	rc = verify_handle("7_sub", llh, num_recs + 1);
+	if (rc) {
+		CERROR("7_sub: verify handle failed: %d\n", rc);
+		GOTO(out_close, rc);
+	}
+	if (num_recs < LLOG_BITMAP_SIZE(llh->lgh_hdr) - 1)
+		CWARN("7_sub: records are not aligned, written %d from %u\n",
+		      num_recs, LLOG_BITMAP_SIZE(llh->lgh_hdr) - 1);
+
+	plain_counter = 0;
+	rc = llog_process(env, llh, test_7_print_cb, "test 7", NULL);
+	if (rc) {
+		CERROR("7_sub: llog process failed: %d\n", rc);
+		GOTO(out_close, rc);
+	}
+	process_count = plain_counter;
+	if (process_count != num_recs) {
+		CERROR("7_sub: processed %d records from %d total\n",
+		       process_count, num_recs);
+		GOTO(out_close, rc = -EINVAL);
+	}
+
+	plain_counter = 0;
+	rc = llog_reverse_process(env, llh, test_7_cancel_cb, "test 7", NULL);
+	if (rc) {
+		CERROR("7_sub: reverse llog process failed: %d\n", rc);
+		GOTO(out_close, rc);
+	}
+	if (process_count != plain_counter) {
+		CERROR("7_sub: Reverse/direct processing found different "
+		       "number of records: %d/%d\n",
+		       plain_counter, process_count);
+		GOTO(out_close, rc = -EINVAL);
+	}
+	/* every record was deleted, so ZAP_WHEN_EMPTY should have fired */
+	if (llog_exist(llh)) {
+		CERROR("7_sub: llog exists but should be zapped\n");
+		GOTO(out_close, rc = -EEXIST);
+	}
+
+	rc = verify_handle("7_sub", llh, 1);
+out_close:
+	if (rc)
+		llog_destroy(env, llh);
+	llog_close(env, llh);
+	RETURN(rc);
+}
+
+/* Test all llog records writing and processing.
+ *
+ * For each record type in the llog_records union (7a-7g) the header length,
+ * tail length and record type are filled in and llog_test_7_sub() is run
+ * with that record.  The first failing sub-test ends the run.
+ *
+ * Returns 0 on success or the error of the failing sub-test. */
+static int llog_test_7(const struct lu_env *env, struct obd_device *obd)
+{
+	struct llog_ctxt	*ctxt;
+	int			 rc;
+
+	ENTRY;
+
+	ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT);
+	LASSERT(ctxt);
+
+	CWARN("7a: test llog_logid_rec\n");
+	llog_records.llr.lid_hdr.lrh_len = sizeof(llog_records.llr);
+	llog_records.llr.lid_tail.lrt_len = sizeof(llog_records.llr);
+	llog_records.llr.lid_hdr.lrh_type = LLOG_LOGID_MAGIC;
+
+	rc = llog_test_7_sub(env, ctxt);
+	if (rc) {
+		CERROR("7a: llog_logid_rec test failed\n");
+		GOTO(out, rc);
+	}
+
+	CWARN("7b: test llog_unlink64_rec\n");
+	llog_records.lur.lur_hdr.lrh_len = sizeof(llog_records.lur);
+	llog_records.lur.lur_tail.lrt_len = sizeof(llog_records.lur);
+	llog_records.lur.lur_hdr.lrh_type = MDS_UNLINK64_REC;
+
+	rc = llog_test_7_sub(env, ctxt);
+	if (rc) {
+		CERROR("7b: llog_unlink_rec test failed\n");
+		GOTO(out, rc);
+	}
+
+	CWARN("7c: test llog_setattr64_rec\n");
+	llog_records.lsr64.lsr_hdr.lrh_len = sizeof(llog_records.lsr64);
+	llog_records.lsr64.lsr_tail.lrt_len = sizeof(llog_records.lsr64);
+	llog_records.lsr64.lsr_hdr.lrh_type = MDS_SETATTR64_REC;
+
+	rc = llog_test_7_sub(env, ctxt);
+	if (rc) {
+		CERROR("7c: llog_setattr64_rec test failed\n");
+		GOTO(out, rc);
+	}
+
+	CWARN("7d: test llog_size_change_rec\n");
+	llog_records.lscr.lsc_hdr.lrh_len = sizeof(llog_records.lscr);
+	llog_records.lscr.lsc_tail.lrt_len = sizeof(llog_records.lscr);
+	llog_records.lscr.lsc_hdr.lrh_type = OST_SZ_REC;
+
+	rc = llog_test_7_sub(env, ctxt);
+	if (rc) {
+		CERROR("7d: llog_size_change_rec test failed\n");
+		GOTO(out, rc);
+	}
+
+	CWARN("7e: test llog_changelog_rec\n");
+	llog_records.lcr.cr_hdr.lrh_len = sizeof(llog_records.lcr);
+	llog_records.lcr.cr_tail.lrt_len = sizeof(llog_records.lcr);
+	llog_records.lcr.cr_hdr.lrh_type = CHANGELOG_REC;
+
+	rc = llog_test_7_sub(env, ctxt);
+	if (rc) {
+		CERROR("7e: llog_changelog_rec test failed\n");
+		GOTO(out, rc);
+	}
+
+	CWARN("7f: test llog_changelog_user_rec\n");
+	llog_records.lcur.cur_hdr.lrh_len = sizeof(llog_records.lcur);
+	llog_records.lcur.cur_tail.lrt_len = sizeof(llog_records.lcur);
+	llog_records.lcur.cur_hdr.lrh_type = CHANGELOG_USER_REC;
+
+	rc = llog_test_7_sub(env, ctxt);
+	if (rc) {
+		CERROR("7f: llog_changelog_user_rec test failed\n");
+		GOTO(out, rc);
+	}
+
+	CWARN("7g: test llog_gen_rec\n");
+	llog_records.lgr.lgr_hdr.lrh_len = sizeof(llog_records.lgr);
+	llog_records.lgr.lgr_tail.lrt_len = sizeof(llog_records.lgr);
+	llog_records.lgr.lgr_hdr.lrh_type = LLOG_GEN_REC;
+
+	rc = llog_test_7_sub(env, ctxt);
+	if (rc) {
+		CERROR("7g: llog_gen_rec test failed\n");
+		GOTO(out, rc);
+	}
+out:
+	llog_ctxt_put(ctxt);
+	RETURN(rc);
+}
+
+/* -------------------------------------------------------------------------
+ * Tests above, boring obd functions below
+ * ------------------------------------------------------------------------- */
+/*
+ * Run tests 1 to 7 in order against @obd and stop at the first failure.
+ *
+ * The log used by tests 1-3 and 6 is named after llog_test_rand.  Test 2
+ * leaves it open in llh; from test 3 onwards every exit path goes through
+ * "cleanup", which destroys and closes that log.  Failures of tests 1 and 2
+ * skip that step because no handle is held then.
+ *
+ * Returns 0 when all tests passed, otherwise the first test error, or the
+ * llog_destroy() error if only the cleanup failed.
+ */
+static int llog_run_tests(const struct lu_env *env, struct obd_device *obd)
+{
+	struct llog_handle	*llh = NULL;
+	struct llog_ctxt	*ctxt;
+	int			 rc, err;
+	char			 name[10];
+
+	ENTRY;
+	ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT);
+	LASSERT(ctxt);
+
+	sprintf(name, "%x", llog_test_rand);
+
+	rc = llog_test_1(env, obd, name);
+	if (rc)
+		GOTO(cleanup_ctxt, rc);
+
+	rc = llog_test_2(env, obd, name, &llh);
+	if (rc)
+		GOTO(cleanup_ctxt, rc);
+
+	rc = llog_test_3(env, obd, llh);
+	if (rc)
+		GOTO(cleanup, rc);
+
+	rc = llog_test_4(env, obd);
+	if (rc)
+		GOTO(cleanup, rc);
+
+	rc = llog_test_5(env, obd);
+	if (rc)
+		GOTO(cleanup, rc);
+
+	rc = llog_test_6(env, obd, name);
+	if (rc)
+		GOTO(cleanup, rc);
+
+	rc = llog_test_7(env, obd);
+	if (rc)
+		GOTO(cleanup, rc);
+
+cleanup:
+	err = llog_destroy(env, llh);
+	if (err)
+		CERROR("cleanup: llog_destroy failed: %d\n", err);
+	llog_close(env, llh);
+	if (rc == 0)
+		rc = err;
+cleanup_ctxt:
+	llog_ctxt_put(ctxt);
+	/* pair the ENTRY above with RETURN like the rest of this file */
+	RETURN(rc);
+}
+
+#ifdef LPROCFS
+static struct lprocfs_vars lprocfs_llog_test_obd_vars[] = { {0} }; /* holds only a zeroed entry */
+static struct lprocfs_vars lprocfs_llog_test_module_vars[] = { {0} }; /* holds only a zeroed entry */
+static void lprocfs_llog_test_init_vars(struct lprocfs_static_vars *lvars) /* point @lvars at the two tables above */
+{
+    lvars->module_vars  = lprocfs_llog_test_module_vars;
+    lvars->obd_vars     = lprocfs_llog_test_obd_vars;
+}
+#endif
+/*
+ * o_cleanup method of the llog_test obd type; llog_test_setup() also calls
+ * it when the test run fails.
+ *
+ * Looks up the obd behind the dt device that llog_test_setup() stored in
+ * obd_lvfs_ctxt and passes its LLOG_TEST_ORIG_CTXT context to
+ * llog_cleanup().  Returns the error of lu_env_init() or llog_cleanup().
+ */
+static int llog_test_cleanup(struct obd_device *obd)
+{
+	struct obd_device	*tgt;
+	struct lu_env		 env;
+	int			 rc;
+
+	ENTRY;
+
+	rc = lu_env_init(&env, LCT_LOCAL | LCT_MG_THREAD);
+	if (rc)
+		RETURN(rc);
+
+	tgt = obd->obd_lvfs_ctxt.dt->dd_lu_dev.ld_obd; /* NOTE(review): relies on setup having stored dt - confirm it cannot be called earlier */
+	rc = llog_cleanup(&env, llog_get_context(tgt, LLOG_TEST_ORIG_CTXT));
+	if (rc)
+		CERROR("failed to llog_test_llog_finish: %d\n", rc);
+	lu_env_fini(&env);
+	RETURN(rc);
+}
+/*
+ * o_setup method of the llog_test obd type: runs the whole test suite
+ * against the target device named in buffer 1 of @lcfg.
+ *
+ * Validates the configuration, looks up the target obd, prepares an lu_env
+ * with a session context, sets up a LLOG_TEST_ORIG_CTXT llog context on the
+ * target that reuses loc_dir of its LLOG_CONFIG_ORIG_CTXT context, seeds
+ * llog_test_rand and calls llog_run_tests().  When the tests fail the test
+ * context is torn down again through llog_test_cleanup().
+ *
+ * Returns 0 on success, -EINVAL for a bad configuration or target, or the
+ * error of the failing step.
+ */
+static int llog_test_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
+{
+	struct obd_device	*tgt;
+	struct llog_ctxt	*ctxt;
+	struct dt_object	*o;
+	struct lu_env		 env;
+	struct lu_context	 test_session;
+	int			 rc;
+
+	ENTRY;
+
+	if (lcfg->lcfg_bufcount < 2) {
+		CERROR("requires a TARGET OBD name\n");
+		RETURN(-EINVAL);
+	}
+
+	if (lcfg->lcfg_buflens[1] < 1) {
+		CERROR("requires a TARGET OBD name\n");
+		RETURN(-EINVAL);
+	}
+
+	/* disk obd */
+	tgt = class_name2obd(lustre_cfg_string(lcfg, 1));
+	if (!tgt || !tgt->obd_attached || !tgt->obd_set_up) {
+		CERROR("target device not attached or not set up (%s)\n",
+		       lustre_cfg_string(lcfg, 1));
+		RETURN(-EINVAL);
+	}
+
+	rc = lu_env_init(&env, LCT_LOCAL | LCT_MG_THREAD);
+	if (rc)
+		RETURN(rc);
+
+	rc = lu_context_init(&test_session, LCT_SESSION);
+	if (rc)
+		GOTO(cleanup_env, rc);
+	test_session.lc_thread = (struct ptlrpc_thread *)current;
+	lu_context_enter(&test_session);
+	env.le_ses = &test_session; /* attach the session to the env handed to the tests */
+
+	CWARN("Setup llog-test device over %s device\n",
+	      lustre_cfg_string(lcfg, 1));
+
+	OBD_SET_CTXT_MAGIC(&obd->obd_lvfs_ctxt);
+	obd->obd_lvfs_ctxt.dt = lu2dt_dev(tgt->obd_lu_dev); /* read back by llog_test_cleanup() to find tgt */
+
+	rc = llog_setup(&env, tgt, &tgt->obd_olg, LLOG_TEST_ORIG_CTXT, tgt,
+			&llog_osd_ops);
+	if (rc)
+		GOTO(cleanup_session, rc);
+
+	/* use MGS llog dir for tests */
+	ctxt = llog_get_context(tgt, LLOG_CONFIG_ORIG_CTXT);
+	LASSERT(ctxt);
+	o = ctxt->loc_dir;
+	llog_ctxt_put(ctxt);
+
+	ctxt = llog_get_context(tgt, LLOG_TEST_ORIG_CTXT);
+	LASSERT(ctxt);
+	ctxt->loc_dir = o;
+	llog_ctxt_put(ctxt);
+
+	llog_test_rand = cfs_rand(); /* the test log names are derived from this value */
+
+	rc = llog_run_tests(&env, tgt);
+	if (rc)
+		llog_test_cleanup(obd);
+cleanup_session:
+	lu_context_exit(&test_session);
+	lu_context_fini(&test_session);
+cleanup_env:
+	lu_env_fini(&env);
+	RETURN(rc);
+}
+
+static struct obd_ops llog_obd_ops = { /* methods registered for the "llog_test" obd type */
+	.o_owner       = THIS_MODULE,
+	.o_setup       = llog_test_setup, /* runs the tests */
+	.o_cleanup     = llog_test_cleanup,
+};
+/*
+ * Module init: register the "llog_test" obd type with llog_obd_ops and the
+ * module-level lprocfs variables.  Returns what class_register_type()
+ * returns.
+ */
+static int __init llog_test_init(void)
+{
+	struct lprocfs_static_vars lvars;
+
+	lprocfs_llog_test_init_vars(&lvars); /* NOTE(review): only defined in this file under LPROCFS - confirm a stub exists otherwise */
+	return class_register_type(&llog_obd_ops, NULL,
+				   lvars.module_vars, "llog_test", NULL);
+}
+/*
+ * Module exit: unregister the "llog_test" obd type registered by
+ * llog_test_init().
+ */
+static void __exit llog_test_exit(void)
+{
+	class_unregister_type("llog_test");
+}
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("llog test module");
+MODULE_LICENSE("GPL");
+
+module_init(llog_test_init); /* registers the obd type at module load */
+module_exit(llog_test_exit); /* unregisters it at module unload */

diff --git a/drivers/staging/lustre/lustre/obdclass/local_storage.c b/drivers/staging/lustre/lustre/obdclass/local_storage.c
new file mode 100644
index 0000000..3be35a8
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/local_storage.c

@@ -0,0 +1,903 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License version 2 for more details.  A copy is
+ * included in the COPYING file that accompanied this code.
+
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * lustre/obdclass/local_storage.c
+ *
+ * Local storage for file/objects with fid generation. Works on top of OSD.
+ *
+ * Author: Mikhail Pershin <mike.pershin@intel.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include "local_storage.h"
+
+/* All initialized local storages on this node are linked on this list (via
+ * ls_device::ls_linkage).  ls_find_dev(), ls_device_get() and ls_device_put()
+ * take ls_list_mutex around their accesses to the list. */
+static LIST_HEAD(ls_list_head);
+static DEFINE_MUTEX(ls_list_mutex);
+
+/*
+ * loo_object_init method of the local storage layer: have the underlying OSD
+ * device allocate its part of the object (sharing the header of \a o) and
+ * link it to \a o with lu_object_add().
+ *
+ * Returns 0 on success or -ENOMEM when the OSD device returned no object.
+ */
+static int ls_object_init(const struct lu_env *env, struct lu_object *o,
+			  const struct lu_object_conf *unused)
+{
+	struct ls_device	*lsdev;
+	struct lu_device	*osd_lu;
+	struct lu_object	*slice;
+	int			 rc = 0;
+
+	ENTRY;
+
+	lsdev = container_of0(o->lo_dev, struct ls_device,
+			      ls_top_dev.dd_lu_dev);
+	osd_lu = &lsdev->ls_osd->dd_lu_dev;
+
+	slice = osd_lu->ld_ops->ldo_object_alloc(env, o->lo_header, osd_lu);
+	if (slice != NULL)
+		lu_object_add(o, slice);
+	else
+		rc = -ENOMEM;
+
+	RETURN(rc);
+}
+
+/*
+ * loo_object_free method of the local storage layer: finalize the dt_object
+ * and the object header, then free the ls_object that contains \a o.
+ */
+static void ls_object_free(const struct lu_env *env, struct lu_object *o)
+{
+	struct lu_object_header	*hdr = o->lo_header;
+	struct ls_object	*lso = lu2ls_obj(o);
+
+	dt_object_fini(&lso->ls_obj);
+	lu_object_header_fini(hdr);
+	OBD_FREE_PTR(lso);
+}
+
+/* lu_object methods of local storage objects; ls_object_alloc() installs
+ * this table in every object it creates. */
+struct lu_object_operations ls_lu_obj_ops = {
+	.loo_object_init  = ls_object_init,
+	.loo_object_free  = ls_object_free,
+};
+
+/*
+ * ldo_object_alloc method of the local storage device: allocate an ls_object,
+ * initialize its embedded header and dt_object on device \a d, make the
+ * object the top of that header and install ls_lu_obj_ops.
+ *
+ * \a _h must be NULL, the header embedded in the new ls_object is used.
+ *
+ * Returns the new lu_object, or NULL if the allocation failed.
+ */
+struct lu_object *ls_object_alloc(const struct lu_env *env,
+				  const struct lu_object_header *_h,
+				  struct lu_device *d)
+{
+	struct ls_object	*lso;
+	struct lu_object_header	*hdr;
+	struct lu_object	*top;
+
+	LASSERT(_h == NULL);
+
+	OBD_ALLOC_PTR(lso);
+	if (lso == NULL)
+		return NULL;
+
+	hdr = &lso->ls_header;
+	top = &lso->ls_obj.do_lu;
+
+	lu_object_header_init(hdr);
+	dt_object_init(&lso->ls_obj, hdr, d);
+	lu_object_add_top(hdr, top);
+
+	top->lo_ops = &ls_lu_obj_ops;
+
+	return top;
+}
+
+/* lu_device methods of the local storage device; ls_device_get() installs
+ * this table on every device it creates. */
+static struct lu_device_operations ls_lu_dev_ops = {
+	.ldo_object_alloc =	ls_object_alloc
+};
+
+/*
+ * Walk ls_list_head looking for the local storage built on top of OSD device
+ * \a dev.  A match is returned with ls_refcount incremented; NULL is returned
+ * when no entry matches.  The callers in this file hold ls_list_mutex.
+ */
+static struct ls_device *__ls_find_dev(struct dt_device *dev)
+{
+	struct ls_device *pos;
+
+	list_for_each_entry(pos, &ls_list_head, ls_linkage) {
+		if (pos->ls_osd != dev)
+			continue;
+
+		atomic_inc(&pos->ls_refcount);
+		return pos;
+	}
+
+	return NULL;
+}
+
+/*
+ * Locked wrapper around __ls_find_dev(): look up the local storage of \a dev
+ * under ls_list_mutex.  Returns the device with an extra reference, or NULL.
+ */
+struct ls_device *ls_find_dev(struct dt_device *dev)
+{
+	struct ls_device *found;
+
+	mutex_lock(&ls_list_mutex);
+	found = __ls_find_dev(dev);
+	mutex_unlock(&ls_list_mutex);
+
+	return found;
+}
+
+/* Device type methods of local storage: neither a start nor a stop hook is
+ * provided. */
+static struct lu_device_type_operations ls_device_type_ops = {
+	.ldto_start = NULL,
+	.ldto_stop  = NULL,
+};
+
+/* Device type "local_storage"; ls_device_get() passes it to lu_device_init()
+ * for every local storage device. */
+static struct lu_device_type ls_lu_type = {
+	.ldt_name = "local_storage",
+	.ldt_ops  = &ls_device_type_ops,
+};
+
+/*
+ * Get the local storage device built on top of OSD device \a dev, creating
+ * and linking a new one into ls_list_head if none exists yet.  Lookup and
+ * creation both happen under ls_list_mutex.
+ *
+ * Returns the device with a reference held for the caller (a new device
+ * starts with ls_refcount == 1), or ERR_PTR(-ENOMEM) if allocation failed.
+ */
+struct ls_device *ls_device_get(struct dt_device *dev)
+{
+	struct ls_device *ls;
+	struct lu_device *top;
+
+	ENTRY;
+
+	mutex_lock(&ls_list_mutex);
+
+	ls = __ls_find_dev(dev);
+	if (ls != NULL)
+		GOTO(out_ls, ls);
+
+	/* no local storage for this OSD yet, build one */
+	OBD_ALLOC_PTR(ls);
+	if (ls == NULL)
+		GOTO(out_ls, ls = ERR_PTR(-ENOMEM));
+
+	ls->ls_osd = dev;
+	atomic_set(&ls->ls_refcount, 1);
+	mutex_init(&ls->ls_los_mutex);
+	INIT_LIST_HEAD(&ls->ls_los_list);
+
+	/* the top device shares the site of the OSD device */
+	LASSERT(dev->dd_lu_dev.ld_site);
+	top = &ls->ls_top_dev.dd_lu_dev;
+	lu_device_init(top, &ls_lu_type);
+	top->ld_site = dev->dd_lu_dev.ld_site;
+	top->ld_ops = &ls_lu_dev_ops;
+
+	/* publish the new device */
+	list_add(&ls->ls_linkage, &ls_list_head);
+out_ls:
+	mutex_unlock(&ls_list_mutex);
+	RETURN(ls);
+}
+
+/*
+ * Drop one reference on \a ls.  When the last reference goes away the device
+ * is unlinked from ls_list_head, the site is purged, the top device is
+ * finalized and \a ls is freed.
+ *
+ * \param env	execution environment, must not be NULL
+ * \param ls	local storage device obtained from ls_device_get() or
+ *		ls_find_dev()
+ */
+void ls_device_put(const struct lu_env *env, struct ls_device *ls)
+{
+	LASSERT(env);
+
+	/* The transition to zero has to happen under ls_list_mutex, because
+	 * __ls_find_dev() takes new references under that mutex.  Dropping
+	 * the count to zero before taking the mutex would let a concurrent
+	 * lookup revive the device and drop it again, so that two threads
+	 * both observe a zero count and the second one touches freed memory.
+	 * atomic_dec_and_mutex_lock() returns non-zero with the mutex held
+	 * only for the caller that really dropped the last reference. */
+	if (!atomic_dec_and_mutex_lock(&ls->ls_refcount, &ls_list_mutex))
+		return;
+
+	LASSERT(list_empty(&ls->ls_los_list));
+	list_del(&ls->ls_linkage);
+	lu_site_purge(env, ls->ls_top_dev.dd_lu_dev.ld_site, ~0);
+	lu_device_fini(&ls->ls_top_dev.dd_lu_dev);
+	OBD_FREE_PTR(ls);
+	mutex_unlock(&ls_list_mutex);
+}
+
+/**
+ * Generate the next FID of a local OID storage.
+ *
+ * The sequence of \a los is combined with the next OID (los_last_oid is
+ * incremented under los_id_lock) and a zero version.  To keep FIDs unique
+ * across reboots the latest OID is stored together with the object creation,
+ * see local_object_create().
+ *
+ * Always returns 0.
+ */
+int local_object_fid_generate(const struct lu_env *env,
+			      struct local_oid_storage *los,
+			      struct lu_fid *fid)
+{
+	LASSERT(los->los_dev);
+	LASSERT(los->los_obj);
+
+	mutex_lock(&los->los_id_lock);
+	los->los_last_oid++;
+	fid->f_oid = los->los_last_oid;
+	fid->f_seq = los->los_seq;
+	fid->f_ver = 0;
+	mutex_unlock(&los->los_id_lock);
+
+	return 0;
+}
+
+/*
+ * Declare in transaction \a th everything local_object_create() is going to
+ * do: the update of the OID counter file (only when \a los is given), the
+ * creation of \a o and an LMA xattr of sizeof(dti->dti_lma) bytes.
+ *
+ * Returns 0 on success or the error of the first declaration that failed.
+ */
+int local_object_declare_create(const struct lu_env *env,
+				struct local_oid_storage *los,
+				struct dt_object *o, struct lu_attr *attr,
+				struct dt_object_format *dof,
+				struct thandle *th)
+{
+	struct dt_thread_info	*info = dt_info(env);
+	int			 rc;
+
+	ENTRY;
+
+	if (los != NULL) {
+		/* the fid generation file is rewritten on every create */
+		LASSERT(dt_object_exists(los->los_obj));
+		rc = dt_declare_record_write(env, los->los_obj,
+					     sizeof(struct los_ondisk), 0, th);
+		if (rc)
+			RETURN(rc);
+	}
+
+	rc = dt_declare_create(env, o, attr, NULL, dof, th);
+	if (rc == 0) {
+		info->dti_lb.lb_len = sizeof(info->dti_lma);
+		info->dti_lb.lb_buf = NULL;
+		rc = dt_declare_xattr_set(env, o, &info->dti_lb,
+					  XATTR_NAME_LMA, 0, th);
+	}
+
+	RETURN(rc);
+}
+
+/*
+ * Create object \a o in transaction \a th and, when \a los is given, write
+ * the current los_last_oid (little-endian, 64 bit) at offset 0 of the OID
+ * counter object so that the last used OID is known after a reboot.
+ *
+ * Returns 0 on success, the dt_create() error, or the dt_record_write()
+ * error.
+ */
+int local_object_create(const struct lu_env *env,
+			struct local_oid_storage *los,
+			struct dt_object *o, struct lu_attr *attr,
+			struct dt_object_format *dof, struct thandle *th)
+{
+	struct dt_thread_info	*info = dt_info(env);
+	obd_id			 oid_le;
+	int			 rc;
+
+	ENTRY;
+
+	rc = dt_create(env, o, attr, NULL, dof, th);
+	if (rc != 0 || los == NULL)
+		RETURN(rc);
+
+	LASSERT(los->los_obj);
+	LASSERT(dt_object_exists(los->los_obj));
+
+	/* Many threads can update the counter; serialize them here to avoid
+	 * the race where one thread takes the value first but writes it
+	 * last. */
+	mutex_lock(&los->los_id_lock);
+
+	oid_le = cpu_to_le64(los->los_last_oid);
+
+	info->dti_lb.lb_buf = &oid_le;
+	info->dti_lb.lb_len = sizeof(oid_le);
+	info->dti_off = 0;
+	rc = dt_record_write(env, los->los_obj, &info->dti_lb, &info->dti_off,
+			     th);
+
+	mutex_unlock(&los->los_id_lock);
+
+	RETURN(rc);
+}
+
+/*
+ * Create local named object (file, directory or index) in parent directory.
+ *
+ * The object \a fid is located through \a ls, created in a local transaction
+ * on ls->ls_osd with the attributes \a attr and the format \a dof, and linked
+ * into \a parent under \a name.  For a directory (dof->dof_type == DFT_DIR)
+ * the "." and ".." entries are inserted as well and the link counts of the
+ * new directory and of the parent are raised.
+ *
+ * \param los	OID storage whose counter file is updated together with the
+ *		create, or NULL when the FID was not generated from a storage
+ *
+ * Returns the object on success (also when the object turned out to exist
+ * once the write lock was taken), ERR_PTR(-EEXIST) if it already existed at
+ * entry, or another ERR_PTR() on failure.  On failure the reference obtained
+ * from ls_locate() is dropped.
+ */
+struct dt_object *__local_file_create(const struct lu_env *env,
+				      const struct lu_fid *fid,
+				      struct local_oid_storage *los,
+				      struct ls_device *ls,
+				      struct dt_object *parent,
+				      const char *name, struct lu_attr *attr,
+				      struct dt_object_format *dof)
+{
+	struct dt_object	*dto;
+	struct thandle		*th;
+	int			 rc;
+
+	/* pairs with the RETURN()s below */
+	ENTRY;
+
+	dto = ls_locate(env, ls, fid);
+	if (unlikely(IS_ERR(dto)))
+		RETURN(dto);
+
+	LASSERT(dto != NULL);
+	if (dt_object_exists(dto))
+		GOTO(out, rc = -EEXIST);
+
+	th = dt_trans_create(env, ls->ls_osd);
+	if (IS_ERR(th))
+		GOTO(out, rc = PTR_ERR(th));
+
+	rc = local_object_declare_create(env, los, dto, attr, dof, th);
+	if (rc)
+		GOTO(trans_stop, rc);
+
+	/* The object type is taken from the format that is handed to the
+	 * create calls, not from a separate copy in the thread info. */
+	if (dof->dof_type == DFT_DIR) {
+		/* NOTE(review): the results of these two declarations are
+		 * not checked, as before - confirm that is intended */
+		dt_declare_ref_add(env, dto, th);
+		dt_declare_ref_add(env, parent, th);
+	}
+
+	rc = dt_declare_insert(env, parent, (void *)fid, (void *)name, th);
+	if (rc)
+		GOTO(trans_stop, rc);
+
+	rc = dt_trans_start_local(env, ls->ls_osd, th);
+	if (rc)
+		GOTO(trans_stop, rc);
+
+	dt_write_lock(env, dto, 0);
+	/* re-check under the lock, the object may have been created since
+	 * the unlocked check above */
+	if (dt_object_exists(dto))
+		GOTO(unlock, rc = 0);
+
+	CDEBUG(D_OTHER, "create new object "DFID"\n",
+	       PFID(lu_object_fid(&dto->do_lu)));
+	rc = local_object_create(env, los, dto, attr, dof, th);
+	if (rc)
+		GOTO(unlock, rc);
+	LASSERT(dt_object_exists(dto));
+
+	if (dof->dof_type == DFT_DIR) {
+		if (!dt_try_as_dir(env, dto))
+			GOTO(destroy, rc = -ENOTDIR);
+		/* Add "." and ".." for newly created dir */
+		rc = dt_insert(env, dto, (void *)fid, (void *)".", th,
+			       BYPASS_CAPA, 1);
+		if (rc)
+			GOTO(destroy, rc);
+		dt_ref_add(env, dto, th);
+		rc = dt_insert(env, dto, (void *)lu_object_fid(&parent->do_lu),
+			       (void *)"..", th, BYPASS_CAPA, 1);
+		if (rc)
+			GOTO(destroy, rc);
+	}
+
+	dt_write_lock(env, parent, 0);
+	rc = dt_insert(env, parent, (const struct dt_rec *)fid,
+		       (const struct dt_key *)name, th, BYPASS_CAPA, 1);
+	/* account the new subdirectory in the parent only if its name was
+	 * really inserted, otherwise the parent link count would stay one
+	 * too high after the failed create */
+	if (rc == 0 && dof->dof_type == DFT_DIR)
+		dt_ref_add(env, parent, th);
+	dt_write_unlock(env, parent);
+destroy:
+	if (rc)
+		dt_destroy(env, dto, th);
+unlock:
+	dt_write_unlock(env, dto);
+trans_stop:
+	dt_trans_stop(env, ls->ls_osd, th);
+out:
+	if (rc) {
+		lu_object_put_nocache(env, &dto->do_lu);
+		dto = ERR_PTR(rc);
+	}
+	RETURN(dto);
+}
+
+/*
+ * Look up a local named file or directory in the parent directory and create
+ * it if it does not exist.
+ *
+ * An existing name is opened through the local storage device of \a los.  A
+ * missing name gets a FID generated from \a los and is created with \a mode;
+ * the object type is derived from the S_IFMT bits of \a mode.
+ *
+ * Returns the object or an ERR_PTR().
+ */
+struct dt_object *local_file_find_or_create(const struct lu_env *env,
+					    struct local_oid_storage *los,
+					    struct dt_object *parent,
+					    const char *name, __u32 mode)
+{
+	struct dt_thread_info	*info = dt_info(env);
+	struct dt_object	*obj;
+	int			 rc;
+
+	LASSERT(parent);
+
+	rc = dt_lookup_dir(env, parent, name, &info->dti_fid);
+	switch (rc) {
+	case 0:
+		/* the name exists, open the object it points to */
+		obj = ls_locate(env, dt2ls_dev(los->los_dev), &info->dti_fid);
+		break;
+	case -ENOENT:
+		rc = local_object_fid_generate(env, los, &info->dti_fid);
+		if (rc < 0) {
+			obj = ERR_PTR(rc);
+			break;
+		}
+		/* describe the new object and create it */
+		info->dti_attr.la_valid	= LA_MODE;
+		info->dti_attr.la_mode	= mode;
+		info->dti_dof.dof_type	= dt_mode_to_dft(mode & S_IFMT);
+		obj = __local_file_create(env, &info->dti_fid, los,
+					  dt2ls_dev(los->los_dev),
+					  parent, name, &info->dti_attr,
+					  &info->dti_dof);
+		break;
+	default:
+		obj = ERR_PTR(rc);
+		break;
+	}
+
+	return obj;
+}
+EXPORT_SYMBOL(local_file_find_or_create);
+
+/*
+ * Look up \a name in \a parent and create it with the caller supplied \a fid
+ * if it does not exist.
+ *
+ * An existing name is opened with dt_locate() on \a dt using the FID found
+ * in the directory.  A missing name is created through a temporary reference
+ * on the local storage of \a dt, without an OID storage; the new object is
+ * then released and opened again with dt_locate() on \a dt.
+ *
+ * Returns the object or an ERR_PTR().
+ */
+struct dt_object *local_file_find_or_create_with_fid(const struct lu_env *env,
+						     struct dt_device *dt,
+						     const struct lu_fid *fid,
+						     struct dt_object *parent,
+						     const char *name,
+						     __u32 mode)
+{
+	struct dt_thread_info	*info = dt_info(env);
+	struct ls_device	*ls;
+	struct dt_object	*obj;
+	int			 rc;
+
+	LASSERT(parent);
+
+	rc = dt_lookup_dir(env, parent, name, &info->dti_fid);
+	if (rc == 0)
+		return dt_locate(env, dt, &info->dti_fid);
+	if (rc != -ENOENT)
+		return ERR_PTR(rc);
+
+	ls = ls_device_get(dt);
+	if (IS_ERR(ls))
+		return ERR_PTR(PTR_ERR(ls));
+
+	/* describe the new object and create it */
+	info->dti_attr.la_valid	= LA_MODE;
+	info->dti_attr.la_mode	= mode;
+	info->dti_dof.dof_type	= dt_mode_to_dft(mode & S_IFMT);
+	obj = __local_file_create(env, fid, NULL, ls, parent, name,
+				  &info->dti_attr, &info->dti_dof);
+	if (!IS_ERR(obj)) {
+		/* ls_device_put() will finalize the ls device, so the
+		 * object has to be opened in the other device stack */
+		info->dti_fid = obj->do_lu.lo_header->loh_fid;
+		lu_object_put_nocache(env, &obj->do_lu);
+		obj = dt_locate(env, dt, &info->dti_fid);
+	}
+	ls_device_put(env, ls);
+
+	return obj;
+}
+EXPORT_SYMBOL(local_file_find_or_create_with_fid);
+
+/*
+ * Look up a local named index file in the parent directory and create it if
+ * it does not exist.
+ *
+ * An existing name is opened through the local storage device of \a los.  A
+ * missing name gets a FID generated from \a los and is created as a
+ * DFT_INDEX object with the index features \a ft.
+ *
+ * Returns the object or an ERR_PTR().
+ */
+struct dt_object *local_index_find_or_create(const struct lu_env *env,
+					     struct local_oid_storage *los,
+					     struct dt_object *parent,
+					     const char *name, __u32 mode,
+					     const struct dt_index_features *ft)
+{
+	struct dt_thread_info	*info = dt_info(env);
+	int			 rc;
+
+	LASSERT(parent);
+
+	rc = dt_lookup_dir(env, parent, name, &info->dti_fid);
+	if (rc == 0)
+		/* the name exists, open the object it points to */
+		return ls_locate(env, dt2ls_dev(los->los_dev),
+				 &info->dti_fid);
+	if (rc != -ENOENT)
+		return ERR_PTR(rc);
+
+	rc = local_object_fid_generate(env, los, &info->dti_fid);
+	if (rc < 0)
+		return ERR_PTR(rc);
+
+	/* describe the new index and create it */
+	info->dti_attr.la_valid		= LA_MODE;
+	info->dti_attr.la_mode		= mode;
+	info->dti_dof.dof_type		= DFT_INDEX;
+	info->dti_dof.u.dof_idx.di_feat	= ft;
+
+	return __local_file_create(env, &info->dti_fid, los,
+				   dt2ls_dev(los->los_dev), parent, name,
+				   &info->dti_attr, &info->dti_dof);
+}
+EXPORT_SYMBOL(local_index_find_or_create);
+
+/*
+ * Look up index \a name in \a parent and create it with the caller supplied
+ * \a fid if it does not exist.
+ *
+ * An existing name must already point to \a fid, otherwise
+ * ERR_PTR(-EINVAL) is returned.  A missing name is created as a DFT_INDEX
+ * object with the features \a ft through a temporary reference on the local
+ * storage of \a dt; the new object is then released and opened again with
+ * dt_locate() on \a dt.
+ *
+ * Returns the object or an ERR_PTR().
+ */
+struct dt_object *
+local_index_find_or_create_with_fid(const struct lu_env *env,
+				    struct dt_device *dt,
+				    const struct lu_fid *fid,
+				    struct dt_object *parent,
+				    const char *name, __u32 mode,
+				    const struct dt_index_features *ft)
+{
+	struct dt_thread_info	*info = dt_info(env);
+	struct ls_device	*ls;
+	struct dt_object	*obj;
+	int			 rc;
+
+	LASSERT(parent);
+
+	rc = dt_lookup_dir(env, parent, name, &info->dti_fid);
+	switch (rc) {
+	case 0:
+		/* the name exists, it has to refer to the requested FID */
+		if (lu_fid_eq(fid, &info->dti_fid))
+			obj = dt_locate(env, dt, fid);
+		else
+			obj = ERR_PTR(-EINVAL);
+		break;
+	case -ENOENT:
+		ls = ls_device_get(dt);
+		if (IS_ERR(ls)) {
+			obj = ERR_PTR(PTR_ERR(ls));
+			break;
+		}
+		/* describe the new index and create it */
+		info->dti_attr.la_valid		= LA_MODE;
+		info->dti_attr.la_mode		= mode;
+		info->dti_dof.dof_type		= DFT_INDEX;
+		info->dti_dof.u.dof_idx.di_feat	= ft;
+		obj = __local_file_create(env, fid, NULL, ls, parent, name,
+					  &info->dti_attr, &info->dti_dof);
+		if (!IS_ERR(obj)) {
+			/* ls_device_put() will finalize the ls device, so
+			 * the object has to be opened in the other device
+			 * stack */
+			info->dti_fid = obj->do_lu.lo_header->loh_fid;
+			lu_object_put_nocache(env, &obj->do_lu);
+			obj = dt_locate(env, dt, &info->dti_fid);
+		}
+		ls_device_put(env, ls);
+		break;
+	default:
+		obj = ERR_PTR(rc);
+		break;
+	}
+
+	return obj;
+}
+EXPORT_SYMBOL(local_index_find_or_create_with_fid);
+
+/*
+ * Declare in transaction \a th the steps of local_object_unlink(): removal of
+ * \a name from parent \a p, one reference drop on child \a c and the destroy
+ * of \a c.  The declarations stop at the first one that fails.
+ *
+ * Returns a negative error from the failing declaration, otherwise the
+ * result of dt_declare_destroy().
+ */
+static int local_object_declare_unlink(const struct lu_env *env,
+				       struct dt_device *dt,
+				       struct dt_object *p,
+				       struct dt_object *c, const char *name,
+				       struct thandle *th)
+{
+	int rc;
+
+	rc = dt_declare_delete(env, p, (const struct dt_key *)name, th);
+	if (rc >= 0)
+		rc = dt_declare_ref_del(env, c, th);
+	if (rc >= 0)
+		rc = dt_declare_destroy(env, c, th);
+
+	return rc;
+}
+
+/*
+ * Remove \a name from directory \a parent and destroy the object it refers
+ * to, all in one local transaction on \a dt.
+ *
+ * Returns 0 if the object was unlinked or if \a name does not exist, and a
+ * negative error otherwise.  If dropping the object reference fails, the
+ * name is put back into \a parent and the error of the failed reference drop
+ * is returned, so that 0 is never reported for an object that is still
+ * there.
+ */
+int local_object_unlink(const struct lu_env *env, struct dt_device *dt,
+			struct dt_object *parent, const char *name)
+{
+	struct dt_thread_info	*dti = dt_info(env);
+	struct dt_object	*dto;
+	struct thandle		*th;
+	int			 rc;
+
+	ENTRY;
+
+	rc = dt_lookup_dir(env, parent, name, &dti->dti_fid);
+	if (rc == -ENOENT)
+		RETURN(0);
+	else if (rc < 0)
+		RETURN(rc);
+
+	dto = dt_locate(env, dt, &dti->dti_fid);
+	if (unlikely(IS_ERR(dto)))
+		RETURN(PTR_ERR(dto));
+
+	th = dt_trans_create(env, dt);
+	if (IS_ERR(th))
+		GOTO(out, rc = PTR_ERR(th));
+
+	rc = local_object_declare_unlink(env, dt, parent, dto, name, th);
+	if (rc < 0)
+		GOTO(stop, rc);
+
+	rc = dt_trans_start_local(env, dt, th);
+	if (rc < 0)
+		GOTO(stop, rc);
+
+	dt_write_lock(env, dto, 0);
+	rc = dt_delete(env, parent, (struct dt_key *)name, th, BYPASS_CAPA);
+	if (rc < 0)
+		GOTO(unlock, rc);
+
+	rc = dt_ref_del(env, dto, th);
+	if (rc < 0) {
+		int rc2;
+
+		/* Roll back the name removal, but keep reporting the
+		 * original failure: a successful rollback must not turn
+		 * the failed unlink into a success. */
+		rc2 = dt_insert(env, parent,
+				(const struct dt_rec *)&dti->dti_fid,
+				(const struct dt_key *)name, th, BYPASS_CAPA,
+				1);
+		if (rc2 < 0)
+			CERROR("cannot restore entry '%s' after failed unlink: rc = %d\n",
+			       name, rc2);
+		GOTO(unlock, rc);
+	}
+
+	rc = dt_destroy(env, dto, th);
+unlock:
+	dt_write_unlock(env, dto);
+stop:
+	dt_trans_stop(env, dt, th);
+out:
+	lu_object_put_nocache(env, &dto->do_lu);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(local_object_unlink);
+
+/*
+ * Search the OID storages of \a ls for the one that serves sequence \a seq.
+ * A match is returned with los_refcount incremented; NULL is returned when
+ * there is none.  local_oid_storage_init() calls this with ls_los_mutex
+ * held.
+ */
+struct local_oid_storage *dt_los_find(struct ls_device *ls, __u64 seq)
+{
+	struct local_oid_storage *pos;
+
+	list_for_each_entry(pos, &ls->ls_los_list, los_list) {
+		if (pos->los_seq != seq)
+			continue;
+
+		atomic_inc(&pos->los_refcount);
+		return pos;
+	}
+
+	return NULL;
+}
+
+/*
+ * Drop a reference taken with dt_los_find().  This must never be the last
+ * reference: only local_oid_storage_fini() is supposed to bring the count
+ * down to zero, so reaching zero here is treated as a bug.
+ */
+void dt_los_put(struct local_oid_storage *los)
+{
+	int was_last = atomic_dec_and_test(&los->los_refcount);
+
+	if (was_last)
+		LBUG();
+}
+
+/*
+ * After the Lustre 2.3 release there may be an old file that stores the last
+ * generated FID.  If such a file exists, its content has to be read.
+ *
+ * The file is looked up by name ("seq-<seq>-lastid") in the root directory of
+ * \a dev; for FID_SEQ_LLOG it is also tried by the FID {FID_SEQ_LLOG, 1, 0}.
+ *
+ * \param first_oid	set to the next OID stored in the old file when one
+ *			was found and read; left untouched otherwise
+ *
+ * Returns 0 if there is no old file or it was read successfully, -EINVAL if
+ * the file does not carry LOS_MAGIC, or another negative error.
+ */
+int lastid_compat_check(const struct lu_env *env, struct dt_device *dev,
+			__u64 lastid_seq, __u32 *first_oid, struct ls_device *ls)
+{
+	struct dt_thread_info	*dti = dt_info(env);
+	struct dt_object	*root = NULL;
+	struct los_ondisk	 losd;
+	struct dt_object	*o = NULL;
+	int			 rc = 0;
+
+	rc = dt_root_get(env, dev, &dti->dti_fid);
+	if (rc)
+		return rc;
+
+	root = ls_locate(env, ls, &dti->dti_fid);
+	if (IS_ERR(root))
+		return PTR_ERR(root);
+
+	/* find old last_id file */
+	snprintf(dti->dti_buf, sizeof(dti->dti_buf), "seq-"LPX64"-lastid",
+		 lastid_seq);
+	rc = dt_lookup_dir(env, root, dti->dti_buf, &dti->dti_fid);
+	lu_object_put_nocache(env, &root->do_lu);
+	if (rc == -ENOENT) {
+		/* old llog lastid accessed by FID only */
+		if (lastid_seq != FID_SEQ_LLOG)
+			return 0;
+		dti->dti_fid.f_seq = FID_SEQ_LLOG;
+		dti->dti_fid.f_oid = 1;
+		dti->dti_fid.f_ver = 0;
+		o = ls_locate(env, ls, &dti->dti_fid);
+		if (IS_ERR(o))
+			return PTR_ERR(o);
+
+		if (!dt_object_exists(o)) {
+			lu_object_put_nocache(env, &o->do_lu);
+			return 0;
+		}
+		CDEBUG(D_INFO, "Found old llog lastid file\n");
+	} else if (rc < 0) {
+		return rc;
+	} else {
+		CDEBUG(D_INFO, "Found old lastid file for sequence "LPX64"\n",
+		       lastid_seq);
+		o = ls_locate(env, ls, &dti->dti_fid);
+		if (IS_ERR(o))
+			return PTR_ERR(o);
+	}
+	/* let's read seq-NNNNNN-lastid file value */
+	LASSERT(dt_object_exists(o));
+	dti->dti_off = 0;
+	dti->dti_lb.lb_buf = &losd;
+	dti->dti_lb.lb_len = sizeof(losd);
+	dt_read_lock(env, o, 0);
+	rc = dt_record_read(env, o, &dti->dti_lb, &dti->dti_off);
+	dt_read_unlock(env, o);
+	if (rc == 0 && le32_to_cpu(losd.lso_magic) != LOS_MAGIC) {
+		CERROR("%s: wrong content of seq-"LPX64"-lastid file, magic %x\n",
+		       o->do_lu.lo_dev->ld_obd->obd_name, lastid_seq,
+		       le32_to_cpu(losd.lso_magic));
+		rc = -EINVAL;
+	} else if (rc < 0) {
+		CERROR("%s: failed to read seq-"LPX64"-lastid: rc = %d\n",
+		       o->do_lu.lo_dev->ld_obd->obd_name, lastid_seq, rc);
+	} else {
+		*first_oid = le32_to_cpu(losd.lso_next_oid);
+	}
+	/* Release the object only now: the error messages above still
+	 * dereference it, so it must not be put before they are printed. */
+	lu_object_put_nocache(env, &o->do_lu);
+	return rc;
+}
+
+/**
+ * Initialize local OID storage for required sequence.
+ * That may be needed for services that use local files and require
+ * dynamic OID allocation for them.
+ *
+ * For each sequence there is a counter object (FID {seq,
+ * LUSTRE_FID_LASTID_OID, 0}) that holds the last OID used for locally created
+ * files of that sequence.
+ *
+ * It is used now by llog subsystem and MGS for NID tables
+ *
+ * The function takes first_fid to find or create the counter object.
+ * All dynamic fids will be generated with the same sequence and incremented
+ * OIDs
+ *
+ * Returned local_oid_storage is in-memory representation of OID storage
+ *
+ * \param los	set to the storage of the sequence (an existing one gets an
+ *		additional reference) or to NULL on failure
+ *
+ * \retval 0 on success, negative error otherwise
+ */
+int local_oid_storage_init(const struct lu_env *env, struct dt_device *dev,
+			   const struct lu_fid *first_fid,
+			   struct local_oid_storage **los)
+{
+	struct dt_thread_info	*dti = dt_info(env);
+	struct ls_device	*ls;
+	obd_id			 lastid = 0;
+	struct dt_object	*o = NULL;
+	struct thandle		*th;
+	__u32			 first_oid = fid_oid(first_fid);
+	/* cleared once this call has written the counter itself */
+	int			 need_read = 1;
+	int			 rc = 0;
+
+	ENTRY;
+
+	ls = ls_device_get(dev);
+	if (IS_ERR(ls))
+		RETURN(PTR_ERR(ls));
+
+	mutex_lock(&ls->ls_los_mutex);
+	*los = dt_los_find(ls, fid_seq(first_fid));
+	if (*los != NULL)
+		GOTO(out, rc = 0);
+
+	/* not found, then create */
+	OBD_ALLOC_PTR(*los);
+	if (*los == NULL)
+		GOTO(out, rc = -ENOMEM);
+
+	atomic_set(&(*los)->los_refcount, 1);
+	mutex_init(&(*los)->los_id_lock);
+	(*los)->los_dev = &ls->ls_top_dev;
+	/* the storage keeps its own reference on the device */
+	atomic_inc(&ls->ls_refcount);
+	list_add(&(*los)->los_list, &ls->ls_los_list);
+
+	/* Use {seq, 0, 0} to create the LAST_ID file for every
+	 * sequence.  OIDs start at LUSTRE_FID_INIT_OID.
+	 */
+	dti->dti_fid.f_seq = fid_seq(first_fid);
+	dti->dti_fid.f_oid = LUSTRE_FID_LASTID_OID;
+	dti->dti_fid.f_ver = 0;
+	o = ls_locate(env, ls, &dti->dti_fid);
+	if (IS_ERR(o))
+		GOTO(out_los, rc = PTR_ERR(o));
+
+	if (!dt_object_exists(o)) {
+		rc = lastid_compat_check(env, dev, fid_seq(first_fid),
+					 &first_oid, ls);
+		if (rc < 0)
+			GOTO(out_los, rc);
+
+		th = dt_trans_create(env, dev);
+		if (IS_ERR(th))
+			GOTO(out_los, rc = PTR_ERR(th));
+
+		dti->dti_attr.la_valid = LA_MODE | LA_TYPE;
+		dti->dti_attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR;
+		dti->dti_dof.dof_type = dt_mode_to_dft(S_IFREG);
+
+		rc = dt_declare_create(env, o, &dti->dti_attr, NULL,
+				       &dti->dti_dof, th);
+		if (rc)
+			GOTO(out_trans, rc);
+
+		rc = dt_declare_record_write(env, o, sizeof(lastid), 0, th);
+		if (rc)
+			GOTO(out_trans, rc);
+
+		rc = dt_trans_start_local(env, dev, th);
+		if (rc)
+			GOTO(out_trans, rc);
+
+		dt_write_lock(env, o, 0);
+		/* The object appeared since the unlocked check above.  The
+		 * counter was not written here, so it is read back below
+		 * instead of using a value that was never set. */
+		if (dt_object_exists(o))
+			GOTO(out_lock, rc = 0);
+
+		rc = dt_create(env, o, &dti->dti_attr, NULL, &dti->dti_dof,
+			       th);
+		if (rc)
+			GOTO(out_lock, rc);
+
+		lastid = cpu_to_le64(first_oid);
+
+		dti->dti_off = 0;
+		dti->dti_lb.lb_buf = &lastid;
+		dti->dti_lb.lb_len = sizeof(lastid);
+		rc = dt_record_write(env, o, &dti->dti_lb, &dti->dti_off, th);
+		if (rc)
+			GOTO(out_lock, rc);
+		/* lastid holds exactly what is on disk now */
+		need_read = 0;
+out_lock:
+		dt_write_unlock(env, o);
+out_trans:
+		dt_trans_stop(env, dev, th);
+	}
+
+	if (rc == 0 && need_read) {
+		/* the counter object already exists, load its value */
+		dti->dti_off = 0;
+		dti->dti_lb.lb_buf = &lastid;
+		dti->dti_lb.lb_len = sizeof(lastid);
+		dt_read_lock(env, o, 0);
+		rc = dt_record_read(env, o, &dti->dti_lb, &dti->dti_off);
+		dt_read_unlock(env, o);
+		if (rc == 0 && le64_to_cpu(lastid) > OBIF_MAX_OID) {
+			CERROR("%s: bad oid "LPU64" is read from LAST_ID\n",
+			       o->do_lu.lo_dev->ld_obd->obd_name,
+			       le64_to_cpu(lastid));
+			rc = -EINVAL;
+		}
+	}
+out_los:
+	if (rc != 0) {
+		/* undo the registration of the half-built storage */
+		list_del(&(*los)->los_list);
+		atomic_dec(&ls->ls_refcount);
+		OBD_FREE_PTR(*los);
+		*los = NULL;
+		if (o != NULL && !IS_ERR(o))
+			lu_object_put_nocache(env, &o->do_lu);
+	} else {
+		(*los)->los_seq = fid_seq(first_fid);
+		(*los)->los_last_oid = le64_to_cpu(lastid);
+		(*los)->los_obj = o;
+		/* read value should not be less than initial one */
+		LASSERTF((*los)->los_last_oid >= first_oid, "%u < %u\n",
+			 (*los)->los_last_oid, first_oid);
+	}
+out:
+	mutex_unlock(&ls->ls_los_mutex);
+	/* drop the reference taken by ls_device_get() above */
+	ls_device_put(env, ls);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(local_oid_storage_init);
+
+/*
+ * Drop one reference on the OID storage \a los.  The last reference releases
+ * the counter object, unlinks \a los from its local storage device, frees it
+ * and drops the device reference that the storage was holding.
+ *
+ * \param env	execution environment; must not be NULL when the last
+ *		reference is dropped
+ */
+void local_oid_storage_fini(const struct lu_env *env,
+			    struct local_oid_storage *los)
+{
+	struct ls_device *ls;
+
+	LASSERT(los->los_dev);
+	ls = dt2ls_dev(los->los_dev);
+
+	/* The transition to zero has to happen under ls_los_mutex, because
+	 * dt_los_find() takes new references under that mutex.  Otherwise a
+	 * concurrent lookup could revive the storage after its count hit
+	 * zero, and the storage could be released twice or the device
+	 * reference dropped without the storage being freed.
+	 * atomic_dec_and_mutex_lock() returns non-zero with the mutex held
+	 * only for the caller that really dropped the last reference. */
+	if (!atomic_dec_and_mutex_lock(&los->los_refcount,
+				       &ls->ls_los_mutex))
+		return;
+
+	LASSERT(env);
+	if (los->los_obj)
+		lu_object_put_nocache(env, &los->los_obj->do_lu);
+	list_del(&los->los_list);
+	OBD_FREE_PTR(los);
+	mutex_unlock(&ls->ls_los_mutex);
+
+	/* pairs with the atomic_inc() in local_oid_storage_init() */
+	ls_device_put(env, ls);
+}
+EXPORT_SYMBOL(local_oid_storage_fini);

diff --git a/drivers/staging/lustre/lustre/obdclass/local_storage.h b/drivers/staging/lustre/lustre/obdclass/local_storage.h
new file mode 100644
index 0000000..d553c37
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/local_storage.h

@@ -0,0 +1,88 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License version 2 for more details.  A copy is
+ * included in the COPYING file that accompanied this code.
+
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * lustre/obdclass/local_storage.h
+ *
+ * Local storage for file/objects with fid generation. Works on top of OSD.
+ *
+ * Author: Mikhail Pershin <mike.pershin@intel.com>
+ */
+
+#include <dt_object.h>
+#include <obd.h>
+#include <lustre_fid.h>
+#include <lustre_disk.h>
+
+/* Local storage device: a dt_device placed on top of one OSD device, shared
+ * by reference counting between all users of that OSD. */
+struct ls_device {
+	/* the local storage device itself; dt2ls_dev() maps it back to the
+	 * enclosing ls_device */
+	struct dt_device	 ls_top_dev;
+	/* all initialized ls_devices on this node are linked by this */
+	struct list_head		 ls_linkage;
+	/* how many handles reference this local storage */
+	atomic_t		 ls_refcount;
+	/* underlying OSD device */
+	struct dt_device	*ls_osd;
+	/* list of all local OID storages */
+	struct list_head		 ls_los_list;
+	/* taken around the accesses to ls_los_list in local_storage.c */
+	struct mutex		 ls_los_mutex;
+};
+
+/* Map the dt_device embedded as ls_top_dev back to its ls_device. */
+static inline struct ls_device *dt2ls_dev(struct dt_device *d)
+{
+	struct ls_device *ls = container_of0(d, struct ls_device, ls_top_dev);
+
+	return ls;
+}
+
+/* Local storage object: a dt_object together with the object header it is
+ * initialized with in ls_object_alloc(). */
+struct ls_object {
+	/* header embedded in the object itself */
+	struct lu_object_header	 ls_header;
+	/* the dt_object; lu2ls_obj() maps its do_lu back to the ls_object */
+	struct dt_object	 ls_obj;
+};
+
+/* Map the lu_object embedded as ls_obj.do_lu back to its ls_object. */
+static inline struct ls_object *lu2ls_obj(struct lu_object *o)
+{
+	struct ls_object *lso = container_of0(o, struct ls_object,
+					      ls_obj.do_lu);
+
+	return lso;
+}
+
+/*
+ * Locate the object \a fid through the local storage \a ls: dt_locate_at()
+ * is called on the OSD device with the local storage top device as the
+ * device to locate the object at.
+ */
+static inline struct dt_object *ls_locate(const struct lu_env *env,
+					  struct ls_device *ls,
+					  const struct lu_fid *fid)
+{
+	struct lu_device *top = &ls->ls_top_dev.dd_lu_dev;
+
+	return dt_locate_at(env, ls->ls_osd, fid, top);
+}
+
+/* find or create the local storage on top of OSD device 'dev'; the result
+ * carries a reference that is dropped with ls_device_put() */
+struct ls_device *ls_device_get(struct dt_device *dev);
+void ls_device_put(const struct lu_env *env, struct ls_device *ls);
+/* look up the OID storage of sequence 'seq' on 'ls'; a match is returned
+ * with an extra reference that is dropped with dt_los_put() */
+struct local_oid_storage *dt_los_find(struct ls_device *ls, __u64 seq);
+void dt_los_put(struct local_oid_storage *los);
+
+/* Lustre 2.3 on-disk structure describing the storage of local object OIDs;
+ * the structure is to be used with any sequence managed by the
+ * local object library.
+ * Obsolete since 2.4 but kept for compatibility reasons,
+ * see lastid_compat_check() in obdclass/local_storage.c */
+struct los_ondisk {
+	/* little-endian on disk, must decode to LOS_MAGIC */
+	__u32 lso_magic;
+	/* little-endian on disk, lastid_compat_check() returns it as the
+	 * first OID to use */
+	__u32 lso_next_oid;
+};
+
+#define LOS_MAGIC	0xdecafbee

diff --git a/drivers/staging/lustre/lustre/obdclass/lprocfs_jobstats.c b/drivers/staging/lustre/lustre/obdclass/lprocfs_jobstats.c
new file mode 100644
index 0000000..e2d57fe
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/lprocfs_jobstats.c

@@ -0,0 +1,562 @@
+/* GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ * Use is subject to license terms.
+ *
+ * Author: Niu Yawei <niu@whamcloud.com>
+ */
+/*
+ * lustre/obdclass/lprocfs_jobstats.c
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+#define DEBUG_SUBSYSTEM S_CLASS
+
+
+#include <obd_class.h>
+#include <lprocfs_status.h>
+#include <lustre/lustre_idl.h>
+
+#if defined(LPROCFS)
+
+/*
+ * JobID formats & JobID environment variable names for supported
+ * job schedulers:
+ *
+ * SLURM:
+ *   JobID format:  32 bit integer.
+ *   JobID env var: SLURM_JOB_ID.
+ * SGE:
+ *   JobID format:  Decimal integer range to 99999.
+ *   JobID env var: JOB_ID.
+ * LSF:
+ *   JobID format:  6 digit integer by default (up to 999999), can be
+ *		  increased to 10 digit (up to 2147483646).
+ *   JobID env var: LSB_JOBID.
+ * Loadleveler:
+ *   JobID format:  String of machine_name.cluster_id.process_id, for
+ *		  example: fr2n02.32.0
+ *   JobID env var: LOADL_STEP_ID.
+ * PBS:
+ *   JobID format:  String of sequence_number[.server_name][@server].
+ *   JobID env var: PBS_JOBID.
+ * Maui/MOAB:
+ *   JobID format:  Same as PBS.
+ *   JobID env var: Same as PBS.
+ */
+
+/* Statistics record kept for one job ID. */
+struct job_stat {
+	/* hash linkage; the hash ops recover the job_stat from this node */
+	struct hlist_node      js_hash;
+	/* linkage on obd_job_stats::ojs_list, changed under ojs_lock */
+	struct list_head	    js_list;
+	/* reference count; job_free() runs when it drops to zero */
+	atomic_t	  js_refcount;
+	/* NUL-terminated job ID, used as the hash key */
+	char		  js_jobid[JOBSTATS_JOBID_SIZE];
+	/* set at allocation and on every logged event */
+	time_t		js_timestamp; /* seconds */
+	/* counters for this job */
+	struct lprocfs_stats *js_stats;
+	/* back pointer to the owning per-device job statistics */
+	struct obd_job_stats *js_jobstats;
+};
+
+/* hs_hash method: djb2 hash over the NUL-terminated job ID key. */
+static unsigned job_stat_hash(cfs_hash_t *hs, const void *key, unsigned mask)
+{
+	const char *jobid = key;
+
+	return cfs_hash_djb2_hash(jobid, strlen(jobid), mask);
+}
+
+/* hs_key method: the key of a hashed job_stat is its job ID string. */
+static void *job_stat_key(struct hlist_node *hnode)
+{
+	return hlist_entry(hnode, struct job_stat, js_hash)->js_jobid;
+}
+
+/* hs_keycmp method: non-zero when key equals the job ID of the entry. */
+static int job_stat_keycmp(const void *key, struct hlist_node *hnode)
+{
+	struct job_stat *entry = hlist_entry(hnode, struct job_stat, js_hash);
+
+	/* equal length plus equal prefix of that length == string equality */
+	return strcmp(entry->js_jobid, key) == 0;
+}
+
+/* hs_object method: return the job_stat that embeds this hash node. */
+static void *job_stat_object(struct hlist_node *hnode)
+{
+	struct job_stat *entry = hlist_entry(hnode, struct job_stat, js_hash);
+
+	return entry;
+}
+
+/* hs_get method: take one reference on the hashed job_stat. */
+static void job_stat_get(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+	atomic_inc(&hlist_entry(hnode, struct job_stat, js_hash)->js_refcount);
+}
+
+/* Unlink an unreferenced job_stat from its owner's list and free it,
+ * together with its counters. */
+static void job_free(struct job_stat *job)
+{
+	struct obd_job_stats *owner;
+
+	LASSERT(atomic_read(&job->js_refcount) == 0);
+	LASSERT(job->js_jobstats);
+	owner = job->js_jobstats;
+
+	/* js_list is only modified under the owner's ojs_lock */
+	write_lock(&owner->ojs_lock);
+	list_del_init(&job->js_list);
+	write_unlock(&owner->ojs_lock);
+
+	lprocfs_free_stats(&job->js_stats);
+	OBD_FREE_PTR(job);
+}
+
+/* Drop one reference; the last reference frees the job_stat. */
+static void job_putref(struct job_stat *job)
+{
+	LASSERT(atomic_read(&job->js_refcount) > 0);
+	if (!atomic_dec_and_test(&job->js_refcount))
+		return;
+
+	job_free(job);
+}
+
+/* hs_put_locked method: drop one reference on the hashed job_stat. */
+static void job_stat_put_locked(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+	job_putref(hlist_entry(hnode, struct job_stat, js_hash));
+}
+
+/* hs_exit method: only reports an error, because the hash is expected
+ * to have been emptied (see lprocfs_job_stats_fini()) before it is
+ * destroyed. */
+static void job_stat_exit(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+	/* terminate the message so it does not run into the next log line */
+	CERROR("Should not have any items!\n");
+}
+
+/* Hash operations for the per-device job statistics hash; passed to
+ * cfs_hash_create() by lprocfs_job_stats_init(). */
+static cfs_hash_ops_t job_stats_hash_ops = {
+	.hs_hash       = job_stat_hash,
+	.hs_key	= job_stat_key,
+	.hs_keycmp     = job_stat_keycmp,
+	.hs_object     = job_stat_object,
+	.hs_get	= job_stat_get,
+	.hs_put_locked = job_stat_put_locked,
+	.hs_exit       = job_stat_exit,
+};
+
+/* Hash iterator: delete the entry when its timestamp is older than the
+ * time_t cutoff that data points to; a cutoff of 0 deletes every entry.
+ * Always returns 0. */
+static int job_iter_callback(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+			     struct hlist_node *hnode, void *data)
+{
+	const time_t *cutoff = data;
+	struct job_stat *entry = hlist_entry(hnode, struct job_stat, js_hash);
+
+	if (*cutoff == 0 || entry->js_timestamp < *cutoff)
+		cfs_hash_bd_del_locked(hs, bd, hnode);
+
+	return 0;
+}
+
+/* Remove job entries not updated within the last ojs_cleanup_interval
+ * seconds.  Does nothing when the interval is 0.  Unless force is set,
+ * the scan is skipped until a full interval has passed since the last
+ * cleanup. */
+static void lprocfs_job_cleanup(struct obd_job_stats *stats, bool force)
+{
+	time_t now;
+	time_t cutoff;
+
+	if (stats->ojs_cleanup_interval == 0)
+		return;
+
+	now = cfs_time_current_sec();
+	if (!force) {
+		if (now < stats->ojs_last_cleanup +
+			  stats->ojs_cleanup_interval)
+			return;
+	}
+
+	cutoff = now - stats->ojs_cleanup_interval;
+	cfs_hash_for_each_safe(stats->ojs_hash, job_iter_callback, &cutoff);
+	stats->ojs_last_cleanup = cfs_time_current_sec();
+}
+
+/*
+ * Allocate and initialise the statistics record for jobid, owned by jobs.
+ *
+ * The counters are allocated with jobs->ojs_cntr_num entries and set up
+ * by jobs->ojs_cntr_init_fn.  The record is returned with a reference
+ * count of 1 and is not yet hashed or linked on any list.
+ *
+ * Returns NULL when either allocation fails.
+ */
+static struct job_stat *job_alloc(char *jobid, struct obd_job_stats *jobs)
+{
+	struct job_stat *job;
+
+	LASSERT(jobs->ojs_cntr_num && jobs->ojs_cntr_init_fn);
+
+	OBD_ALLOC_PTR(job);
+	if (job == NULL)
+		return NULL;
+
+	job->js_stats = lprocfs_alloc_stats(jobs->ojs_cntr_num, 0);
+	if (job->js_stats == NULL) {
+		OBD_FREE_PTR(job);
+		return NULL;
+	}
+
+	jobs->ojs_cntr_init_fn(job->js_stats);
+
+	/* jobid is a NUL-terminated string that may be shorter than
+	 * JOBSTATS_JOBID_SIZE; a bounded string copy avoids reading past
+	 * its terminator and always leaves js_jobid terminated. */
+	strlcpy(job->js_jobid, jobid, sizeof(job->js_jobid));
+	job->js_timestamp = cfs_time_current_sec();
+	job->js_jobstats = jobs;
+	INIT_HLIST_NODE(&job->js_hash);
+	INIT_LIST_HEAD(&job->js_list);
+	atomic_set(&job->js_refcount, 1);
+
+	return job;
+}
+
+/*
+ * Add amount to counter number event of the job identified by jobid on
+ * obd, creating the job's statistics record on first use.
+ *
+ * Returns 0 on success, -EINVAL when jobid is NULL, empty or does not
+ * fit in JOBSTATS_JOBID_SIZE, and -ENOMEM when a new record cannot be
+ * allocated.
+ */
+int lprocfs_job_stats_log(struct obd_device *obd, char *jobid,
+			  int event, long amount)
+{
+	struct obd_job_stats *stats = &obd->u.obt.obt_jobstats;
+	struct job_stat *job, *job2;
+	ENTRY;
+
+	LASSERT(stats && stats->ojs_hash);
+
+	/* expire stale entries first; rate limited by the cleanup interval */
+	lprocfs_job_cleanup(stats, false);
+
+	if (!jobid || !strlen(jobid))
+		RETURN(-EINVAL);
+
+	if (strlen(jobid) >= JOBSTATS_JOBID_SIZE) {
+		CERROR("Invalid jobid size (%lu), expect(%d)\n",
+		       (unsigned long)strlen(jobid) + 1, JOBSTATS_JOBID_SIZE);
+		RETURN(-EINVAL);
+	}
+
+	job = cfs_hash_lookup(stats->ojs_hash, jobid);
+	if (job)
+		goto found;
+
+	job = job_alloc(jobid, stats);
+	if (job == NULL)
+		RETURN(-ENOMEM);
+
+	/* returns a different record when another thread hashed the same
+	 * job ID first */
+	job2 = cfs_hash_findadd_unique(stats->ojs_hash, job->js_jobid,
+				       &job->js_hash);
+	if (job2 != job) {
+		/* lost the race: drop our new record and use the winner */
+		job_putref(job);
+		job = job2;
+		/* We cannot LASSERT(!list_empty(&job->js_list)) here,
+		 * since we just lost the race for inserting "job" into the
+		 * ojs_list, and some other thread is doing it _right_now_.
+		 * Instead, be content the other thread is doing this, since
+		 * "job2" was initialized in job_alloc() already. LU-2163 */
+	} else {
+		LASSERT(list_empty(&job->js_list));
+		write_lock(&stats->ojs_lock);
+		list_add_tail(&job->js_list, &stats->ojs_list);
+		write_unlock(&stats->ojs_lock);
+	}
+
+found:
+	LASSERT(stats == job->js_jobstats);
+	LASSERT(stats->ojs_cntr_num > event);
+	/* refresh the timestamp that the cleanup scan compares against */
+	job->js_timestamp = cfs_time_current_sec();
+	lprocfs_counter_add(job->js_stats, event, amount);
+
+	job_putref(job);
+	RETURN(0);
+}
+EXPORT_SYMBOL(lprocfs_job_stats_log);
+
+/* Delete every job entry and release the job statistics hash of obd.
+ * Does nothing when the hash was never created or is already gone. */
+void lprocfs_job_stats_fini(struct obd_device *obd)
+{
+	struct obd_job_stats *stats = &obd->u.obt.obt_jobstats;
+	time_t cutoff = 0;	/* 0 makes job_iter_callback delete all */
+
+	if (stats->ojs_hash != NULL) {
+		cfs_hash_for_each_safe(stats->ojs_hash, job_iter_callback,
+				       &cutoff);
+		cfs_hash_putref(stats->ojs_hash);
+		stats->ojs_hash = NULL;
+		LASSERT(list_empty(&stats->ojs_list));
+	}
+}
+EXPORT_SYMBOL(lprocfs_job_stats_fini);
+
+/* seq_file start: take ojs_lock for reading (released in the stop
+ * method) and return the element at *pos.  Position 0 is the header
+ * token, position N > 0 the N-th job on ojs_list. */
+static void *lprocfs_jobstats_seq_start(struct seq_file *p, loff_t *pos)
+{
+	struct obd_job_stats *stats = p->private;
+	struct job_stat *job;
+	loff_t remaining;
+
+	read_lock(&stats->ojs_lock);
+	if (*pos == 0)
+		return SEQ_START_TOKEN;
+
+	remaining = *pos - 1;
+	list_for_each_entry(job, &stats->ojs_list, js_list) {
+		if (remaining == 0)
+			return job;
+		remaining--;
+	}
+	return NULL;
+}
+
+/* seq_file stop: release the read lock taken by the start method. */
+static void lprocfs_jobstats_seq_stop(struct seq_file *p, void *v)
+{
+	struct obd_job_stats *owner = p->private;
+
+	read_unlock(&owner->ojs_lock);
+}
+
+/* seq_file next: advance *pos and return the job following v on
+ * ojs_list (the first job when v is the header token), or NULL at the
+ * end of the list. */
+static void *lprocfs_jobstats_seq_next(struct seq_file *p, void *v, loff_t *pos)
+{
+	struct obd_job_stats *stats = p->private;
+	struct list_head *cur;
+
+	++*pos;
+	if (v == SEQ_START_TOKEN)
+		cur = &stats->ojs_list;
+	else
+		cur = &((struct job_stat *)v)->js_list;
+
+	if (cur->next == &stats->ojs_list)
+		return NULL;
+
+	return list_entry(cur->next, struct job_stat, js_list);
+}
+
+/*
+ * Example of output on MDT:
+ *
+ * job_stats:
+ * - job_id:	test_id.222.25844
+ *   snapshot_time: 1322494486
+ *   open:	  { samples:	       3, unit: reqs }
+ *   close:	 { samples:	       3, unit: reqs }
+ *   mknod:	 { samples:	       0, unit: reqs }
+ *   link:	  { samples:	       0, unit: reqs }
+ *   unlink:	{ samples:	       0, unit: reqs }
+ *   mkdir:	 { samples:	       0, unit: reqs }
+ *   rmdir:	 { samples:	       0, unit: reqs }
+ *   rename:	{ samples:	       1, unit: reqs }
+ *   getattr:       { samples:	       7, unit: reqs }
+ *   setattr:       { samples:	       0, unit: reqs }
+ *   getxattr:      { samples:	       0, unit: reqs }
+ *   setxattr:      { samples:	       0, unit: reqs }
+ *   statfs:	{ samples:	       0, unit: reqs }
+ *   sync:	  { samples:	       0, unit: reqs }
+ *
+ * Example of output on OST:
+ *
+ * job_stats:
+ * - job_id:	 4854
+ *   snapshot_time: 1322494602
+ *   read:	  { samples:  0, unit: bytes, min:  0, max:  0, sum:  0 }
+ *   write:	 { samples:  1, unit: bytes, min: 10, max: 10, sum: 10 }
+ *   setattr:       { samples:  0, unit: reqs }
+ *   punch:	 { samples:  0, unit: reqs }
+ *   sync:	  { samples:  0, unit: reqs }
+ */
+
+/* Blank padding consumed through "%.*s" in lprocfs_jobstats_seq_show().
+ * It must consist of spaces only and hold at least 15 of them, the
+ * largest precision width() can return for a field width of 15. */
+static const char spaces[] = "                    ";
+
+/* Number of padding characters needed to fill a column of len
+ * characters after str; at most 15 characters of str are counted. */
+static inline int width(const char *str, int len)
+{
+	int used = min((int)strlen(str), 15);
+
+	return len - used;
+}
+
+/*
+ * seq_file show: print the "job_stats:" header for the start token, or
+ * one job entry (job ID, timestamp and one line per counter) otherwise.
+ * Always returns 0.
+ */
+static int lprocfs_jobstats_seq_show(struct seq_file *p, void *v)
+{
+	struct job_stat			*job = v;
+	struct lprocfs_stats		*s;
+	struct lprocfs_counter		ret;
+	struct lprocfs_counter_header	*cntr_header;
+	int				i;
+
+	if (v == SEQ_START_TOKEN) {
+		seq_printf(p, "job_stats:\n");
+		return 0;
+	}
+
+	seq_printf(p, "- %-16s %s\n", "job_id:", job->js_jobid);
+	seq_printf(p, "  %-16s %ld\n", "snapshot_time:", job->js_timestamp);
+
+	s = job->js_stats;
+	for (i = 0; i < s->ls_num; i++) {
+		cntr_header = &s->ls_cnt_header[i];
+		/* sum the per-cpu values of counter i into ret */
+		lprocfs_stats_collect(s, i, &ret);
+
+		seq_printf(p, "  %s:%.*s { samples: %11"LPF64"u",
+			   cntr_header->lc_name,
+			   width(cntr_header->lc_name, 15), spaces,
+			   ret.lc_count);
+		if (cntr_header->lc_units[0] != '\0')
+			seq_printf(p, ", unit: %5s", cntr_header->lc_units);
+
+		/* min/max/sum are reported as 0 until a sample exists */
+		if (cntr_header->lc_config & LPROCFS_CNTR_AVGMINMAX) {
+			seq_printf(p, ", min:%8"LPF64"u, max:%8"LPF64"u,"
+				   " sum:%16"LPF64"u",
+				   ret.lc_count ? ret.lc_min : 0,
+				   ret.lc_count ? ret.lc_max : 0,
+				   ret.lc_count ? ret.lc_sum : 0);
+		}
+		if (cntr_header->lc_config & LPROCFS_CNTR_STDDEV) {
+			seq_printf(p, ", sumsq: %18"LPF64"u",
+				   ret.lc_count ? ret.lc_sumsquare : 0);
+		}
+
+		seq_printf(p, " }\n");
+
+	}
+	return 0;
+}
+
+/* seq_file iterator used for the job_stats proc file. */
+struct seq_operations lprocfs_jobstats_seq_sops = {
+	.start = lprocfs_jobstats_seq_start,
+	.stop  = lprocfs_jobstats_seq_stop,
+	.next  = lprocfs_jobstats_seq_next,
+	.show  = lprocfs_jobstats_seq_show,
+};
+
+/* Open the job_stats file: set up the seq_file and store the proc
+ * entry's data pointer as its private data.  Returns the seq_open()
+ * result. */
+static int lprocfs_jobstats_seq_open(struct inode *inode, struct file *file)
+{
+	int rc = seq_open(file, &lprocfs_jobstats_seq_sops);
+
+	if (rc == 0) {
+		struct seq_file *seq = file->private_data;
+
+		seq->private = PDE_DATA(inode);
+	}
+	return rc;
+}
+
+/*
+ * Write handler of the job_stats file.
+ *
+ * Input starting with "clear" removes every job entry; any other input
+ * is taken as a job ID (one trailing '\n' is trimmed) and only that
+ * entry is removed.
+ *
+ * Returns len on success, -EFAULT when the user buffer cannot be read,
+ * and -EINVAL for empty input, a job ID that does not fit in
+ * JOBSTATS_JOBID_SIZE, or a job ID with no entry.
+ *
+ * NOTE(review): buf is a user-space address (this is a file_operations
+ * .write method) although the prototype lacks the __user annotation.
+ */
+static ssize_t lprocfs_jobstats_seq_write(struct file *file, const char *buf,
+					  size_t len, loff_t *off)
+{
+	struct seq_file *seq = file->private_data;
+	struct obd_job_stats *stats = seq->private;
+	char jobid[JOBSTATS_JOBID_SIZE];
+	size_t copied;
+	int all = 0;
+	struct job_stat *job;
+
+	if (len == 0)
+		return -EINVAL;
+
+	/* Never dereference buf directly: bring at most one job ID worth
+	 * of data into a zeroed kernel buffer first. */
+	copied = min_t(size_t, len, sizeof(jobid) - 1);
+	memset(jobid, 0, sizeof(jobid));
+	if (copy_from_user(jobid, buf, copied))
+		return -EFAULT;
+
+	if (copied >= strlen("clear") &&
+	    !memcmp(jobid, "clear", strlen("clear"))) {
+		all = 1;
+	} else if (len < JOBSTATS_JOBID_SIZE) {
+		/* Trim '\n' if any */
+		if (jobid[len - 1] == '\n')
+			jobid[len - 1] = '\0';
+	} else {
+		return -EINVAL;
+	}
+
+	LASSERT(stats->ojs_hash);
+	if (all) {
+		time_t oldest = 0;
+		cfs_hash_for_each_safe(stats->ojs_hash, job_iter_callback,
+				       &oldest);
+		return len;
+	}
+
+	if (!strlen(jobid))
+		return -EINVAL;
+
+	job = cfs_hash_lookup(stats->ojs_hash, jobid);
+	if (!job)
+		return -EINVAL;
+
+	cfs_hash_del_key(stats->ojs_hash, jobid);
+
+	/* drop the reference taken by the lookup above */
+	job_putref(job);
+	return len;
+}
+
+/* File operations of the job_stats proc file: reads go through the
+ * seq_file iterator, writes clear all jobs or a single job. */
+struct file_operations lprocfs_jobstats_seq_fops = {
+	.owner   = THIS_MODULE,
+	.open    = lprocfs_jobstats_seq_open,
+	.read    = seq_read,
+	.write   = lprocfs_jobstats_seq_write,
+	.llseek  = seq_lseek,
+	.release = lprocfs_seq_release,
+};
+
+/*
+ * Set up job statistics for an MDT or OST device and create its
+ * "job_stats" proc file.
+ *
+ * cntr_num and init_fn are stored and later used to allocate and
+ * initialise the counters of each job.
+ *
+ * Returns 0 on success, -EINVAL for any other device type, and -ENOMEM
+ * when the hash or the proc entry cannot be created.  On failure no
+ * hash is left behind.
+ */
+int lprocfs_job_stats_init(struct obd_device *obd, int cntr_num,
+			   cntr_init_callback init_fn)
+{
+	struct proc_dir_entry *entry;
+	struct obd_job_stats *stats;
+	ENTRY;
+
+	LASSERT(obd->obd_proc_entry != NULL);
+	LASSERT(obd->obd_type->typ_name);
+
+	if (strcmp(obd->obd_type->typ_name, LUSTRE_MDT_NAME) &&
+	    strcmp(obd->obd_type->typ_name, LUSTRE_OST_NAME)) {
+		CERROR("Invalid obd device type.\n");
+		RETURN(-EINVAL);
+	}
+	stats = &obd->u.obt.obt_jobstats;
+
+	LASSERT(stats->ojs_hash == NULL);
+	stats->ojs_hash = cfs_hash_create("JOB_STATS",
+					  HASH_JOB_STATS_CUR_BITS,
+					  HASH_JOB_STATS_MAX_BITS,
+					  HASH_JOB_STATS_BKT_BITS, 0,
+					  CFS_HASH_MIN_THETA,
+					  CFS_HASH_MAX_THETA,
+					  &job_stats_hash_ops,
+					  CFS_HASH_DEFAULT);
+	if (stats->ojs_hash == NULL)
+		RETURN(-ENOMEM);
+
+	INIT_LIST_HEAD(&stats->ojs_list);
+	rwlock_init(&stats->ojs_lock);
+	stats->ojs_cntr_num = cntr_num;
+	stats->ojs_cntr_init_fn = init_fn;
+	stats->ojs_cleanup_interval = 600; /* 10 mins by default */
+	stats->ojs_last_cleanup = cfs_time_current_sec();
+
+	entry = proc_create_data("job_stats", 0644, obd->obd_proc_entry,
+				 &lprocfs_jobstats_seq_fops, stats);
+	if (entry == NULL) {
+		/* Release the hash created above and reset ojs_hash to
+		 * NULL, so the hash is not leaked and a later init or
+		 * fini call starts from a clean state. */
+		lprocfs_job_stats_fini(obd);
+		RETURN(-ENOMEM);
+	}
+
+	RETURN(0);
+}
+EXPORT_SYMBOL(lprocfs_job_stats_init);
+
+/* Print the job statistics cleanup interval of the obd_device passed
+ * in data. */
+int lprocfs_rd_job_interval(struct seq_file *m, void *data)
+{
+	struct obd_device *obd = data;
+
+	LASSERT(obd != NULL);
+	return seq_printf(m, "%d\n",
+			  obd->u.obt.obt_jobstats.ojs_cleanup_interval);
+}
+EXPORT_SYMBOL(lprocfs_rd_job_interval);
+
+/* Parse a new cleanup interval from buffer, store it and force an
+ * immediate cleanup pass.  Returns count on success or the non-zero
+ * result of lprocfs_write_helper(). */
+int lprocfs_wr_job_interval(struct file *file, const char *buffer,
+			    unsigned long count, void *data)
+{
+	struct obd_device *obd = data;
+	struct obd_job_stats *stats;
+	int val;
+	int rc;
+
+	LASSERT(obd != NULL);
+
+	rc = lprocfs_write_helper(buffer, count, &val);
+	if (rc != 0)
+		return rc;
+
+	stats = &obd->u.obt.obt_jobstats;
+	stats->ojs_cleanup_interval = val;
+	lprocfs_job_cleanup(stats, true);
+
+	return count;
+}
+EXPORT_SYMBOL(lprocfs_wr_job_interval);
+
+#endif /* LPROCFS*/

diff --git a/drivers/staging/lustre/lustre/obdclass/lprocfs_status.c b/drivers/staging/lustre/lustre/obdclass/lprocfs_status.c
new file mode 100644
index 0000000..3b157f8
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/lprocfs_status.c

@@ -0,0 +1,1985 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/lprocfs_status.c
+ *
+ * Author: Hariharan Thantry <thantry@users.sourceforge.net>
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+
+#include <obd_class.h>
+#include <lprocfs_status.h>
+#include <lustre/lustre_idl.h>
+#include <linux/seq_file.h>
+
+#if defined(LPROCFS)
+
+/* Module parameter (mode 0644); when set, lprocfs stats are not
+ * allocated per cpu, per the parameter description below. */
+static int lprocfs_no_percpu_stats = 0;
+CFS_MODULE_PARM(lprocfs_no_percpu_stats, "i", int, 0644,
+		"Do not alloc percpu data for lprocfs stats");
+
+/* Size of the scratch string buffers used by the helpers in this file */
+#define MAX_STRING_SIZE 128
+
+/* Exported wrapper around single_release(). */
+int lprocfs_single_release(struct inode *inode, struct file *file)
+{
+	int rc = single_release(inode, file);
+
+	return rc;
+}
+EXPORT_SYMBOL(lprocfs_single_release);
+
+/* Exported wrapper around seq_release(). */
+int lprocfs_seq_release(struct inode *inode, struct file *file)
+{
+	int rc = seq_release(inode, file);
+
+	return rc;
+}
+EXPORT_SYMBOL(lprocfs_seq_release);
+
+/* lprocfs API calls */
+
+/* Create proc file name under root with the given fops and data.  The
+ * mode is derived from fops: readable when it has a read method,
+ * owner-writable when it has a write method.
+ * Returns the new entry, ERR_PTR(-EINVAL) for a NULL root, name or
+ * fops, or ERR_PTR(-ENOMEM) when the entry cannot be created. */
+proc_dir_entry_t *lprocfs_add_simple(struct proc_dir_entry *root,
+				     char *name, void *data,
+				     struct file_operations *fops)
+{
+	proc_dir_entry_t *entry;
+	mode_t mode;
+
+	if (root == NULL || name == NULL || fops == NULL)
+		return ERR_PTR(-EINVAL);
+
+	mode = fops->read ? 0444 : 0;
+	if (fops->write)
+		mode |= 0200;
+
+	entry = proc_create_data(name, mode, root, fops, data);
+	if (entry != NULL)
+		return entry;
+
+	CERROR("LprocFS: No memory to create /proc entry %s", name);
+	return ERR_PTR(-ENOMEM);
+}
+EXPORT_SYMBOL(lprocfs_add_simple);
+
+/* Create symlink name under parent whose target is built from the
+ * printf-style format, limited to MAX_STRING_SIZE bytes.
+ * Returns the new entry, or NULL on bad arguments, allocation failure
+ * or when the link cannot be created. */
+struct proc_dir_entry *lprocfs_add_symlink(const char *name,
+			struct proc_dir_entry *parent, const char *format, ...)
+{
+	struct proc_dir_entry *link;
+	va_list args;
+	char *dest;
+
+	if (parent == NULL || format == NULL)
+		return NULL;
+
+	OBD_ALLOC_WAIT(dest, MAX_STRING_SIZE + 1);
+	if (dest == NULL)
+		return NULL;
+
+	va_start(args, format);
+	vsnprintf(dest, MAX_STRING_SIZE, format, args);
+	va_end(args);
+
+	link = proc_symlink(name, parent, dest);
+	if (link == NULL)
+		CERROR("LprocFS: Could not create symbolic link from %s to %s",
+			name, dest);
+
+	/* the formatted target is only needed for the call above */
+	OBD_FREE(dest, MAX_STRING_SIZE + 1);
+	return link;
+}
+EXPORT_SYMBOL(lprocfs_add_symlink);
+
+/* Empty file operations used for entries that supply none of their own */
+static struct file_operations lprocfs_generic_fops = { };
+
+/**
+ * Add /proc entries.
+ *
+ * \param root [in]  The parent proc entry under which entries are added.
+ * \param list [in]  Array of proc entries to add, ended by a NULL name.
+ * \param data [in]  Default argument handed to an entry's read/write
+ *		   routines when the entry carries no data of its own.
+ *
+ * \retval 0   on success
+ *	 < 0 on error
+ */
+int lprocfs_add_vars(struct proc_dir_entry *root, struct lprocfs_vars *list,
+		     void *data)
+{
+	struct lprocfs_vars *var;
+
+	if (root == NULL || list == NULL)
+		return -EINVAL;
+
+	for (var = list; var->name != NULL; var++) {
+		struct proc_dir_entry *proc;
+		mode_t mode = 0;
+
+		/* an explicit mode wins; otherwise derive it from fops */
+		if (var->proc_mode != 0000) {
+			mode = var->proc_mode;
+		} else if (var->fops) {
+			if (var->fops->read)
+				mode = 0444;
+			if (var->fops->write)
+				mode |= 0200;
+		}
+
+		proc = proc_create_data(var->name, mode, root,
+					var->fops ?: &lprocfs_generic_fops,
+					var->data ?: data);
+		if (proc == NULL)
+			return -ENOMEM;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(lprocfs_add_vars);
+
+/* Remove the proc entry *rooth and reset the caller's pointer to NULL. */
+void lprocfs_remove(struct proc_dir_entry **rooth)
+{
+	struct proc_dir_entry *victim = *rooth;
+
+	proc_remove(victim);
+	*rooth = NULL;
+}
+EXPORT_SYMBOL(lprocfs_remove);
+
+/* Remove the entry called name from below parent, which must not be
+ * NULL. */
+void lprocfs_remove_proc_entry(const char *name, struct proc_dir_entry *parent)
+{
+	LASSERT(parent != NULL);
+
+	remove_proc_entry(name, parent);
+}
+EXPORT_SYMBOL(lprocfs_remove_proc_entry);
+
+/* Create directory name under parent and populate it from list.
+ * Returns the new directory, NULL when proc_mkdir() fails, or an
+ * ERR_PTR when adding the entries fails (the directory is removed
+ * again in that case).
+ * NOTE(review): callers have to handle both NULL and ERR_PTR results. */
+struct proc_dir_entry *lprocfs_register(const char *name,
+					struct proc_dir_entry *parent,
+					struct lprocfs_vars *list, void *data)
+{
+	struct proc_dir_entry *dir = proc_mkdir(name, parent);
+	int rc;
+
+	if (dir == NULL || list == NULL)
+		return dir;
+
+	rc = lprocfs_add_vars(dir, list, data);
+	if (rc == 0)
+		return dir;
+
+	lprocfs_remove(&dir);
+	return ERR_PTR(rc);
+}
+EXPORT_SYMBOL(lprocfs_register);
+
+/* Generic callbacks */
+/* Print the unsigned int that data points to. */
+int lprocfs_rd_uint(struct seq_file *m, void *data)
+{
+	unsigned int *val = data;
+
+	return seq_printf(m, "%u\n", *val);
+}
+EXPORT_SYMBOL(lprocfs_rd_uint);
+
+/*
+ * Parse an unsigned integer (any base accepted by simple_strtoul() with
+ * base 0) from the user buffer and store it in the unsigned int that
+ * data points to.
+ *
+ * Only the first MAX_STRING_SIZE bytes of the input are examined.
+ * Returns count on success, -EFAULT when the user buffer cannot be read
+ * and -EINVAL when no number could be parsed.
+ */
+int lprocfs_wr_uint(struct file *file, const char __user *buffer,
+		    unsigned long count, void *data)
+{
+	unsigned *p = data;
+	char dummy[MAX_STRING_SIZE + 1], *end;
+	unsigned long tmp;
+	unsigned long len = min_t(unsigned long, count, MAX_STRING_SIZE);
+
+	/* Copy no more than the caller supplied: reading a fixed
+	 * MAX_STRING_SIZE bytes would run past a shorter user buffer. */
+	if (copy_from_user(dummy, buffer, len))
+		return -EFAULT;
+	dummy[len] = '\0';
+
+	tmp = simple_strtoul(dummy, &end, 0);
+	if (dummy == end)
+		return -EINVAL;
+
+	*p = (unsigned int)tmp;
+	return count;
+}
+EXPORT_SYMBOL(lprocfs_wr_uint);
+
+/* Print the __u64 that data points to. */
+int lprocfs_rd_u64(struct seq_file *m, void *data)
+{
+	__u64 *val = data;
+
+	return seq_printf(m, LPU64"\n", *val);
+}
+EXPORT_SYMBOL(lprocfs_rd_u64);
+
+/* Print the current value of the atomic_t that data points to. */
+int lprocfs_rd_atomic(struct seq_file *m, void *data)
+{
+	atomic_t *atom = data;
+	int val;
+
+	LASSERT(atom != NULL);
+	val = atomic_read(atom);
+	return seq_printf(m, "%d\n", val);
+}
+EXPORT_SYMBOL(lprocfs_rd_atomic);
+
+/* Parse an integer from buffer and store it in the atomic_t that data
+ * points to.  Returns count on success, the negative result of
+ * lprocfs_write_helper() on a parse error, or -ERANGE when the value is
+ * not positive. */
+int lprocfs_wr_atomic(struct file *file, const char __user *buffer,
+		      unsigned long count, void *data)
+{
+	atomic_t *atm = data;
+	int val = 0;
+	int rc = lprocfs_write_helper(buffer, count, &val);
+
+	if (rc < 0)
+		return rc;
+	if (val < 1)
+		return -ERANGE;
+
+	atomic_set(atm, val);
+	return count;
+}
+EXPORT_SYMBOL(lprocfs_wr_atomic);
+
+/* Print the UUID of the obd_device passed in data. */
+int lprocfs_rd_uuid(struct seq_file *m, void *data)
+{
+	struct obd_device *obd = data;
+	int rc;
+
+	LASSERT(obd != NULL);
+	rc = seq_printf(m, "%s\n", obd->obd_uuid.uuid);
+	return rc;
+}
+EXPORT_SYMBOL(lprocfs_rd_uuid);
+
+/* Print the name of the obd_device passed in data. */
+int lprocfs_rd_name(struct seq_file *m, void *data)
+{
+	struct obd_device *dev = data;
+	int rc;
+
+	LASSERT(dev != NULL);
+	rc = seq_printf(m, "%s\n", dev->obd_name);
+	return rc;
+}
+EXPORT_SYMBOL(lprocfs_rd_name);
+
+/* Print the block size reported by obd_statfs() for the device in
+ * data.  Returns the obd_statfs() error, or the seq_printf() result. */
+int lprocfs_rd_blksize(struct seq_file *m, void *data)
+{
+	struct obd_device *obd = data;
+	struct obd_statfs  osfs;
+	int rc;
+
+	rc = obd_statfs(NULL, obd->obd_self_export, &osfs,
+			cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+			OBD_STATFS_NODELAY);
+	if (rc != 0)
+		return rc;
+
+	return seq_printf(m, "%u\n", osfs.os_bsize);
+}
+EXPORT_SYMBOL(lprocfs_rd_blksize);
+
+/* Print the total size in KiB of the device in data: os_blocks scaled
+ * by the block size.  Returns the obd_statfs() error, or the
+ * seq_printf() result. */
+int lprocfs_rd_kbytestotal(struct seq_file *m, void *data)
+{
+	struct obd_device *obd = data;
+	struct obd_statfs  osfs;
+	__u32 blk_size;
+	__u64 result;
+	int rc;
+
+	rc = obd_statfs(NULL, obd->obd_self_export, &osfs,
+			cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+			OBD_STATFS_NODELAY);
+	if (rc != 0)
+		return rc;
+
+	/* double the count once per power of two in (block size / 1KiB) */
+	result = osfs.os_blocks;
+	blk_size = osfs.os_bsize >> 10;
+	blk_size >>= 1;
+	while (blk_size != 0) {
+		result <<= 1;
+		blk_size >>= 1;
+	}
+
+	return seq_printf(m, LPU64"\n", result);
+}
+EXPORT_SYMBOL(lprocfs_rd_kbytestotal);
+
+/* Print the free space in KiB of the device in data: os_bfree scaled
+ * by the block size.  Returns the obd_statfs() error, or the
+ * seq_printf() result. */
+int lprocfs_rd_kbytesfree(struct seq_file *m, void *data)
+{
+	struct obd_device *obd = data;
+	struct obd_statfs  osfs;
+	__u32 blk_size;
+	__u64 result;
+	int rc;
+
+	rc = obd_statfs(NULL, obd->obd_self_export, &osfs,
+			cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+			OBD_STATFS_NODELAY);
+	if (rc != 0)
+		return rc;
+
+	/* double the count once per power of two in (block size / 1KiB) */
+	result = osfs.os_bfree;
+	blk_size = osfs.os_bsize >> 10;
+	blk_size >>= 1;
+	while (blk_size != 0) {
+		result <<= 1;
+		blk_size >>= 1;
+	}
+
+	return seq_printf(m, LPU64"\n", result);
+}
+EXPORT_SYMBOL(lprocfs_rd_kbytesfree);
+
+/* Print the available space in KiB of the device in data: os_bavail
+ * scaled by the block size.  Returns the obd_statfs() error, or the
+ * seq_printf() result. */
+int lprocfs_rd_kbytesavail(struct seq_file *m, void *data)
+{
+	struct obd_device *obd = data;
+	struct obd_statfs  osfs;
+	__u32 blk_size;
+	__u64 result;
+	int rc;
+
+	rc = obd_statfs(NULL, obd->obd_self_export, &osfs,
+			cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+			OBD_STATFS_NODELAY);
+	if (rc != 0)
+		return rc;
+
+	/* double the count once per power of two in (block size / 1KiB) */
+	result = osfs.os_bavail;
+	blk_size = osfs.os_bsize >> 10;
+	blk_size >>= 1;
+	while (blk_size != 0) {
+		result <<= 1;
+		blk_size >>= 1;
+	}
+
+	return seq_printf(m, LPU64"\n", result);
+}
+EXPORT_SYMBOL(lprocfs_rd_kbytesavail);
+
+/* Print os_files as reported by obd_statfs() for the device in data.
+ * Returns the obd_statfs() error, or the seq_printf() result. */
+int lprocfs_rd_filestotal(struct seq_file *m, void *data)
+{
+	struct obd_device *obd = data;
+	struct obd_statfs  osfs;
+	int rc;
+
+	rc = obd_statfs(NULL, obd->obd_self_export, &osfs,
+			cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+			OBD_STATFS_NODELAY);
+	if (rc != 0)
+		return rc;
+
+	return seq_printf(m, LPU64"\n", osfs.os_files);
+}
+EXPORT_SYMBOL(lprocfs_rd_filestotal);
+
+/* Print os_ffree as reported by obd_statfs() for the device in data.
+ * Returns the obd_statfs() error, or the seq_printf() result. */
+int lprocfs_rd_filesfree(struct seq_file *m, void *data)
+{
+	struct obd_device *obd = data;
+	struct obd_statfs  osfs;
+	int rc;
+
+	rc = obd_statfs(NULL, obd->obd_self_export, &osfs,
+			cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+			OBD_STATFS_NODELAY);
+	if (rc != 0)
+		return rc;
+
+	return seq_printf(m, LPU64"\n", osfs.os_ffree);
+}
+EXPORT_SYMBOL(lprocfs_rd_filesfree);
+
+/* Print the target of the client obd in data together with the state
+ * name of its import, followed by "DEACTIVATED" when the import is
+ * deactivated.  The import is accessed between LPROCFS_CLIMP_CHECK()
+ * and LPROCFS_CLIMP_EXIT(). */
+int lprocfs_rd_server_uuid(struct seq_file *m, void *data)
+{
+	struct obd_device *obd = data;
+	struct obd_import *imp;
+	char *imp_state_name;
+	int rc;
+
+	LASSERT(obd != NULL);
+	LPROCFS_CLIMP_CHECK(obd);
+
+	imp = obd->u.cli.cl_import;
+	imp_state_name = ptlrpc_import_state_name(imp->imp_state);
+	rc = seq_printf(m, "%s\t%s%s\n", obd2cli_tgt(obd), imp_state_name,
+			imp->imp_deactive ? "\tDEACTIVATED" : "");
+
+	LPROCFS_CLIMP_EXIT(obd);
+	return rc;
+}
+EXPORT_SYMBOL(lprocfs_rd_server_uuid);
+
+/* Print the remote UUID of the current connection of the client obd in
+ * data, or "<none>" when there is no import or no connection.  The
+ * import is accessed between LPROCFS_CLIMP_CHECK() and
+ * LPROCFS_CLIMP_EXIT(). */
+int lprocfs_rd_conn_uuid(struct seq_file *m, void *data)
+{
+	struct obd_device *obd = data;
+	struct obd_import *imp;
+	struct ptlrpc_connection *conn = NULL;
+	int rc = 0;
+
+	LASSERT(obd != NULL);
+
+	LPROCFS_CLIMP_CHECK(obd);
+	/* test the import before looking at its connection, not after */
+	imp = obd->u.cli.cl_import;
+	if (imp != NULL)
+		conn = imp->imp_connection;
+
+	if (conn != NULL)
+		rc = seq_printf(m, "%s\n", conn->c_remote_uuid.uuid);
+	else
+		rc = seq_printf(m, "%s\n", "<none>");
+
+	LPROCFS_CLIMP_EXIT(obd);
+	return rc;
+}
+EXPORT_SYMBOL(lprocfs_rd_conn_uuid);
+
+/**
+ * Add up the per-cpu values of counter \a idx of \a stats into \a cnt.
+ *
+ * \a cnt is zeroed first.  Count, sum and sum of squares are added up;
+ * min and max are the extremes over all per-cpu slots that exist.  When
+ * \a stats is NULL only lc_count is set, to 1.
+ */
+void lprocfs_stats_collect(struct lprocfs_stats *stats, int idx,
+			   struct lprocfs_counter *cnt)
+{
+	unsigned int			num_entry;
+	struct lprocfs_counter		*percpu_cntr;
+	unsigned int			i;
+	unsigned long			flags = 0;
+
+	memset(cnt, 0, sizeof(*cnt));
+
+	if (stats == NULL) {
+		/* set count to 1 to avoid divide-by-zero errs in callers */
+		cnt->lc_count = 1;
+		return;
+	}
+
+	cnt->lc_min = LC_MIN_INIT;
+
+	num_entry = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU, &flags);
+
+	for (i = 0; i < num_entry; i++) {
+		/* slots that were never allocated hold no data */
+		if (stats->ls_percpu[i] == NULL)
+			continue;
+		percpu_cntr = lprocfs_stats_counter_get(stats, i, idx);
+
+		cnt->lc_count += percpu_cntr->lc_count;
+		cnt->lc_sum += percpu_cntr->lc_sum;
+		if (percpu_cntr->lc_min < cnt->lc_min)
+			cnt->lc_min = percpu_cntr->lc_min;
+		if (percpu_cntr->lc_max > cnt->lc_max)
+			cnt->lc_max = percpu_cntr->lc_max;
+		cnt->lc_sumsquare += percpu_cntr->lc_sumsquare;
+	}
+
+	lprocfs_stats_unlock(stats, LPROCFS_GET_NUM_CPU, &flags);
+}
+EXPORT_SYMBOL(lprocfs_stats_collect);
+
+/**
+ * Append the names of the currently set import flags to m, separated
+ * by ", ".
+ *
+ * flag2str() prints one flag when it is set and then clears \a first,
+ * so that exactly the flags after the first printed one are preceded
+ * by the separator.
+ */
+#define flag2str(flag, first)						\
+	do {								\
+		if (imp->imp_##flag) {					\
+			seq_printf(m, "%s" #flag, first ? "" : ", ");	\
+			first = false;					\
+		}							\
+	} while (0)
+static int obd_import_flags2str(struct obd_import *imp, struct seq_file *m)
+{
+	bool first = true;
+
+	if (imp->imp_obd->obd_no_recov) {
+		seq_printf(m, "no_recov");
+		first = false;
+	}
+
+	flag2str(invalid, first);
+	flag2str(deactive, first);
+	flag2str(replayable, first);
+	flag2str(pingable, first);
+	return 0;
+}
+#undef flag2str
+
+/* Names of the connect flags, NULL terminated.  Entry i names bit i of
+ * the flags word: obd_connect_seq_flags2str() and
+ * obd_connect_flags2str() walk the table while shifting a one-bit mask
+ * left by one position per entry. */
+static const char *obd_connect_names[] = {
+	"read_only",
+	"lov_index",
+	"unused",
+	"write_grant",
+	"server_lock",
+	"version",
+	"request_portal",
+	"acl",
+	"xattr",
+	"create_on_write",
+	"truncate_lock",
+	"initial_transno",
+	"inode_bit_locks",
+	"join_file(obsolete)",
+	"getattr_by_fid",
+	"no_oh_for_devices",
+	"remote_client",
+	"remote_client_by_force",
+	"max_byte_per_rpc",
+	"64bit_qdata",
+	"mds_capability",
+	"oss_capability",
+	"early_lock_cancel",
+	"som",
+	"adaptive_timeouts",
+	"lru_resize",
+	"mds_mds_connection",
+	"real_conn",
+	"change_qunit_size",
+	"alt_checksum_algorithm",
+	"fid_is_enabled",
+	"version_recovery",
+	"pools",
+	"grant_shrink",
+	"skip_orphan",
+	"large_ea",
+	"full20",
+	"layout_lock",
+	"64bithash",
+	"object_max_bytes",
+	"imp_recov",
+	"jobstats",
+	"umask",
+	"einprogress",
+	"grant_param",
+	"flock_owner",
+	"lvb_type",
+	"nanoseconds_times",
+	"lightweight_conn",
+	"short_io",
+	"pingless",
+	"unknown",
+	NULL
+};
+
+/* Print the names of all bits set in flags to m, separated by sep.
+ * Bits beyond the end of obd_connect_names are reported together as
+ * "unknown flags" followed by their value.  As in
+ * obd_connect_flags2str(), the separator goes between items and never
+ * before the first one. */
+static void obd_connect_seq_flags2str(struct seq_file *m, __u64 flags, char *sep)
+{
+	__u64 mask = 1;
+	int i;
+	bool first = true;
+
+	for (i = 0; obd_connect_names[i] != NULL; i++, mask <<= 1) {
+		if (flags & mask) {
+			seq_printf(m, "%s%s",
+					first ? "" : sep, obd_connect_names[i]);
+			first = false;
+		}
+	}
+	/* mask now selects the first bit that has no name */
+	if (flags & ~(mask - 1))
+		seq_printf(m, "%sunknown flags "LPX64,
+				first ? "" : sep, flags & ~(mask - 1));
+}
+
+/*
+ * Write the names of all bits set in flags to page, separated by sep.
+ * Bits beyond the end of obd_connect_names are reported together as
+ * "unknown flags" followed by their value.
+ *
+ * At most count bytes of page are used, including the terminating NUL;
+ * output that does not fit is truncated.  Returns the number of
+ * characters stored in page, excluding the NUL.
+ */
+int obd_connect_flags2str(char *page, int count, __u64 flags, char *sep)
+{
+	__u64 mask = 1;
+	int i, ret = 0;
+
+	if (count <= 0)
+		return 0;
+
+	/* scnprintf() returns what was actually stored, so ret can never
+	 * pass count and "count - ret" can never turn negative. */
+	for (i = 0; obd_connect_names[i] != NULL; i++, mask <<= 1) {
+		if (flags & mask)
+			ret += scnprintf(page + ret, count - ret, "%s%s",
+					 ret ? sep : "", obd_connect_names[i]);
+	}
+	if (flags & ~(mask - 1))
+		ret += scnprintf(page + ret, count - ret,
+				 "%sunknown flags "LPX64,
+				 ret ? sep : "", flags & ~(mask - 1));
+	return ret;
+}
+EXPORT_SYMBOL(obd_connect_flags2str);
+
+/*
+ * Print a multi-section report about the import of the client obd in
+ * data: name, target, state, flags and connection details, followed by
+ * RPC, service estimate, transaction and data rate sections when the
+ * device has service statistics.  Always returns 0 once the import
+ * check has passed.
+ */
+int lprocfs_rd_import(struct seq_file *m, void *data)
+{
+	struct lprocfs_counter		ret;
+	struct lprocfs_counter_header	*header;
+	struct obd_device		*obd	= (struct obd_device *)data;
+	struct obd_import		*imp;
+	struct obd_import_conn		*conn;
+	int				j;
+	int				k;
+	int				rw	= 0;
+
+	LASSERT(obd != NULL);
+	LPROCFS_CLIMP_CHECK(obd);
+	imp = obd->u.cli.cl_import;
+
+	seq_printf(m,
+		     "import:\n"
+		     "    name: %s\n"
+		     "    target: %s\n"
+		     "    state: %s\n"
+		     "    instance: %u\n"
+		     "    connect_flags: [",
+		     obd->obd_name,
+		     obd2cli_tgt(obd),
+		     ptlrpc_import_state_name(imp->imp_state),
+		     imp->imp_connect_data.ocd_instance);
+	obd_connect_seq_flags2str(m, imp->imp_connect_data.ocd_connect_flags, ", ");
+	seq_printf(m,
+		      "]\n"
+		      "    import_flags: [");
+	obd_import_flags2str(imp, m);
+
+	seq_printf(m,
+		      "]\n"
+		      "    connection:\n"
+		      "       failover_nids: [");
+	/* the connection list and current connection are read under imp_lock */
+	spin_lock(&imp->imp_lock);
+	j = 0;
+	list_for_each_entry(conn, &imp->imp_conn_list, oic_item) {
+		seq_printf(m, "%s%s", j ? ", " : "",
+			   libcfs_nid2str(conn->oic_conn->c_peer.nid));
+		j++;
+	}
+	seq_printf(m,
+		      "]\n"
+		      "       current_connection: %s\n"
+		      "       connection_attempts: %u\n"
+		      "       generation: %u\n"
+		      "       in-progress_invalidations: %u\n",
+		      imp->imp_connection == NULL ? "<none>" :
+			      libcfs_nid2str(imp->imp_connection->c_peer.nid),
+		      imp->imp_conn_cnt,
+		      imp->imp_generation,
+		      atomic_read(&imp->imp_inval_count));
+	spin_unlock(&imp->imp_lock);
+
+	/* the remaining sections all depend on the service statistics */
+	if (obd->obd_svc_stats == NULL)
+		goto out_climp;
+
+	header = &obd->obd_svc_stats->ls_cnt_header[PTLRPC_REQWAIT_CNTR];
+	lprocfs_stats_collect(obd->obd_svc_stats, PTLRPC_REQWAIT_CNTR, &ret);
+	/* turn the collected sum into an average, kept in ret.lc_sum */
+	if (ret.lc_count != 0) {
+		/* first argument to do_div MUST be __u64 */
+		__u64 sum = ret.lc_sum;
+		do_div(sum, ret.lc_count);
+		ret.lc_sum = sum;
+	} else
+		ret.lc_sum = 0;
+	seq_printf(m,
+		      "    rpcs:\n"
+		      "       inflight: %u\n"
+		      "       unregistering: %u\n"
+		      "       timeouts: %u\n"
+		      "       avg_waittime: "LPU64" %s\n",
+		      atomic_read(&imp->imp_inflight),
+		      atomic_read(&imp->imp_unregistering),
+		      atomic_read(&imp->imp_timeouts),
+		      ret.lc_sum, header->lc_units);
+
+	/* k: largest service estimate over the portals in use; the scan
+	 * stops at the first portal slot that is 0 */
+	k = 0;
+	for(j = 0; j < IMP_AT_MAX_PORTALS; j++) {
+		if (imp->imp_at.iat_portal[j] == 0)
+			break;
+		k = max_t(unsigned int, k,
+			  at_get(&imp->imp_at.iat_service_estimate[j]));
+	}
+	seq_printf(m,
+		      "    service_estimates:\n"
+		      "       services: %u sec\n"
+		      "       network: %u sec\n",
+		      k,
+		      at_get(&imp->imp_at.iat_net_latency));
+
+	seq_printf(m,
+		      "    transactions:\n"
+		      "       last_replay: "LPU64"\n"
+		      "       peer_committed: "LPU64"\n"
+		      "       last_checked: "LPU64"\n",
+		      imp->imp_last_replay_transno,
+		      imp->imp_peer_committed_transno,
+		      imp->imp_last_transno_checked);
+
+	/* avg data rates */
+	/* rw == 0 reports reads, rw == 1 reports writes */
+	for (rw = 0; rw <= 1; rw++) {
+		lprocfs_stats_collect(obd->obd_svc_stats,
+				      PTLRPC_LAST_CNTR + BRW_READ_BYTES + rw,
+				      &ret);
+		if (ret.lc_sum > 0 && ret.lc_count > 0) {
+			/* first argument to do_div MUST be __u64 */
+			__u64 sum = ret.lc_sum;
+			do_div(sum, ret.lc_count);
+			ret.lc_sum = sum;
+			seq_printf(m,
+				      "    %s_data_averages:\n"
+				      "       bytes_per_rpc: "LPU64"\n",
+				      rw ? "write" : "read",
+				      ret.lc_sum);
+		}
+		/* NOTE(review): k is the bytes-per-RPC average only when
+		 * the branch above ran; otherwise it is the raw lc_sum */
+		k = (int)ret.lc_sum;
+		j = opcode_offset(OST_READ + rw) + EXTRA_MAX_OPCODES;
+		header = &obd->obd_svc_stats->ls_cnt_header[j];
+		lprocfs_stats_collect(obd->obd_svc_stats, j, &ret);
+		if (ret.lc_sum > 0 && ret.lc_count != 0) {
+			/* first argument to do_div MUST be __u64 */
+			__u64 sum = ret.lc_sum;
+			do_div(sum, ret.lc_count);
+			ret.lc_sum = sum;
+			seq_printf(m,
+				      "       %s_per_rpc: "LPU64"\n",
+				      header->lc_units, ret.lc_sum);
+			/* j is reused as the per-RPC average just printed */
+			j = (int)ret.lc_sum;
+			if (j > 0)
+				seq_printf(m,
+					      "       MB_per_sec: %u.%.02u\n",
+					      k / j, (100 * k / j) % 100);
+		}
+	}
+
+out_climp:
+	LPROCFS_CLIMP_EXIT(obd);
+	return 0;
+}
+EXPORT_SYMBOL(lprocfs_rd_import);
+
+/*
+ * seq_file show routine: print the current state of the client import of
+ * the obd device passed in \a data, followed by the entries of its state
+ * history ring, walked from imp_state_hist_idx onwards.  Ring slots whose
+ * ish_state is 0 are not printed.
+ *
+ * Returns 0 after the output has been emitted.
+ * NOTE(review): LPROCFS_CLIMP_CHECK/LPROCFS_CLIMP_EXIT are macros defined
+ * elsewhere; the CHECK macro may leave the function on its own -- confirm.
+ */
+int lprocfs_rd_state(struct seq_file *m, void *data)
+{
+	struct obd_device *obd = data;
+	struct obd_import *imp;
+	int start, off;
+
+	LASSERT(obd != NULL);
+	LPROCFS_CLIMP_CHECK(obd);
+	imp = obd->u.cli.cl_import;
+
+	seq_printf(m, "current_state: %s\n",
+		   ptlrpc_import_state_name(imp->imp_state));
+	seq_printf(m, "state_history:\n");
+
+	start = imp->imp_state_hist_idx;
+	for (off = 0; off < IMP_STATE_HIST_LEN; off++) {
+		struct import_state_hist *ish;
+
+		ish = &imp->imp_state_hist[(start + off) % IMP_STATE_HIST_LEN];
+		if (ish->ish_state != 0)
+			seq_printf(m, " - ["CFS_TIME_T", %s]\n",
+				   ish->ish_time,
+				   ptlrpc_import_state_name(ish->ish_state));
+	}
+
+	LPROCFS_CLIMP_EXIT(obd);
+	return 0;
+}
+EXPORT_SYMBOL(lprocfs_rd_state);
+
+/*
+ * Print the AT_BINS entries of at->at_hist on a single line, each one
+ * formatted as "%3u ", and finish the line with a newline.
+ * Always returns 0.
+ */
+int lprocfs_at_hist_helper(struct seq_file *m, struct adaptive_timeout *at)
+{
+	int bin = 0;
+
+	while (bin < AT_BINS) {
+		seq_printf(m, "%3u ", at->at_hist[bin]);
+		bin++;
+	}
+	seq_printf(m, "\n");
+
+	return 0;
+}
+EXPORT_SYMBOL(lprocfs_at_hist_helper);
+
+/*
+ * See also ptlrpc_lprocfs_rd_timeouts
+ *
+ * seq_file show routine for the adaptive-timeout data of a client import.
+ * Prints, in order:
+ *   - the time of the last reply and how long ago that was;
+ *   - current and worst-ever network latency estimate plus its history;
+ *   - the same for each service portal, stopping at the first portal
+ *     slot whose number is 0.
+ *
+ * \a data is the struct obd_device owning the import.  Returns 0.
+ */
+int lprocfs_rd_timeouts(struct seq_file *m, void *data)
+{
+	struct obd_device *obd = data;
+	struct obd_import *imp;
+	struct adaptive_timeout *at;
+	struct dhms ts;
+	unsigned int cur, worst;
+	time_t now, worstt;
+	int i;
+
+	LASSERT(obd != NULL);
+	LPROCFS_CLIMP_CHECK(obd);
+	imp = obd->u.cli.cl_import;
+
+	now = cfs_time_current_sec();
+
+	/* Some network health info for kicks */
+	s2dhms(&ts, now - imp->imp_last_reply_time);
+	seq_printf(m, "%-10s : %ld, "DHMS_FMT" ago\n",
+		   "last reply", imp->imp_last_reply_time, DHMS_VARS(&ts));
+
+	/* network latency estimate */
+	at = &imp->imp_at.iat_net_latency;
+	cur = at_get(at);
+	worst = at->at_worst_ever;
+	worstt = at->at_worst_time;
+	s2dhms(&ts, now - worstt);
+	seq_printf(m, "%-10s : cur %3u  worst %3u (at %ld, "DHMS_FMT" ago) ",
+		   "network", cur, worst, worstt, DHMS_VARS(&ts));
+	lprocfs_at_hist_helper(m, at);
+
+	/* per-portal service estimates; a portal number of 0 ends the list */
+	for (i = 0;
+	     i < IMP_AT_MAX_PORTALS && imp->imp_at.iat_portal[i] != 0;
+	     i++) {
+		at = &imp->imp_at.iat_service_estimate[i];
+		cur = at_get(at);
+		worst = at->at_worst_ever;
+		worstt = at->at_worst_time;
+		s2dhms(&ts, now - worstt);
+		seq_printf(m, "portal %-2d  : cur %3u  worst %3u (at %ld, "
+			   DHMS_FMT" ago) ", imp->imp_at.iat_portal[i],
+			   cur, worst, worstt, DHMS_VARS(&ts));
+		lprocfs_at_hist_helper(m, at);
+	}
+
+	LPROCFS_CLIMP_EXIT(obd);
+	return 0;
+}
+EXPORT_SYMBOL(lprocfs_rd_timeouts);
+
+/*
+ * seq_file show routine: print the connect flags of the client import of
+ * the obd device in \a data, first as one raw "flags=" value and then
+ * decoded by obd_connect_seq_flags2str() with "\n" as the separator.
+ * Returns 0.
+ */
+int lprocfs_rd_connect_flags(struct seq_file *m, void *data)
+{
+	struct obd_device *obd = data;
+	struct obd_import *imp;
+	__u64 flags;
+
+	LPROCFS_CLIMP_CHECK(obd);
+	imp = obd->u.cli.cl_import;
+	flags = imp->imp_connect_data.ocd_connect_flags;
+
+	seq_printf(m, "flags="LPX64"\n", flags);
+	obd_connect_seq_flags2str(m, flags, "\n");
+	seq_printf(m, "\n");
+
+	LPROCFS_CLIMP_EXIT(obd);
+	return 0;
+}
+EXPORT_SYMBOL(lprocfs_rd_connect_flags);
+
+/*
+ * seq_file show routine: print obd->obd_num_exports as an unsigned decimal
+ * followed by a newline.  \a data is the struct obd_device (must not be
+ * NULL).  Returns whatever seq_printf() returns.
+ */
+int lprocfs_rd_num_exports(struct seq_file *m, void *data)
+{
+	struct obd_device *obd = data;
+	int rc;
+
+	LASSERT(obd != NULL);
+
+	rc = seq_printf(m, "%u\n", obd->obd_num_exports);
+	return rc;
+}
+EXPORT_SYMBOL(lprocfs_rd_num_exports);
+
+/*
+ * seq_file show routine: print the typ_refcnt field of the struct obd_type
+ * passed in \a data (must not be NULL) as a decimal followed by a newline.
+ * Returns whatever seq_printf() returns.
+ */
+int lprocfs_rd_numrefs(struct seq_file *m, void *data)
+{
+	struct obd_type *type = data;
+	int rc;
+
+	LASSERT(type != NULL);
+
+	rc = seq_printf(m, "%d\n", type->typ_refcnt);
+	return rc;
+}
+EXPORT_SYMBOL(lprocfs_rd_numrefs);
+
+/*
+ * Register a proc directory named after the device under the procroot of
+ * its obd type and populate it from \a list, passing \a obd as the data.
+ *
+ * On success obd->obd_proc_entry holds the new directory and 0 is
+ * returned.  If lprocfs_register() hands back an error pointer, the error
+ * is logged, obd->obd_proc_entry is reset to NULL and the decoded error
+ * code is returned.
+ */
+int lprocfs_obd_setup(struct obd_device *obd, struct lprocfs_vars *list)
+{
+	int rc;
+
+	LASSERT(obd != NULL);
+	LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC);
+	LASSERT(obd->obd_type->typ_procroot != NULL);
+
+	obd->obd_proc_entry = lprocfs_register(obd->obd_name,
+					       obd->obd_type->typ_procroot,
+					       list, obd);
+	if (!IS_ERR(obd->obd_proc_entry))
+		return 0;
+
+	rc = PTR_ERR(obd->obd_proc_entry);
+	CERROR("error %d setting up lprocfs for %s\n", rc, obd->obd_name);
+	obd->obd_proc_entry = NULL;
+
+	return rc;
+}
+EXPORT_SYMBOL(lprocfs_obd_setup);
+
+/*
+ * Remove the proc entries of \a obd: first the exports directory, then the
+ * device directory itself.  Each pointer is cleared after removal.
+ *
+ * Returns -EINVAL when \a obd is NULL, 0 otherwise.
+ */
+int lprocfs_obd_cleanup(struct obd_device *obd)
+{
+	if (obd == NULL)
+		return -EINVAL;
+
+	/* Should be no exports left */
+	if (obd->obd_proc_exports_entry != NULL) {
+		lprocfs_remove(&obd->obd_proc_exports_entry);
+		obd->obd_proc_exports_entry = NULL;
+	}
+
+	if (obd->obd_proc_entry != NULL) {
+		lprocfs_remove(&obd->obd_proc_entry);
+		obd->obd_proc_entry = NULL;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(lprocfs_obd_cleanup);
+
+/*
+ * Release everything hanging off one per-NID statistics object: its proc
+ * directory, its two counter sets (nid_stats and nid_ldlm_stats) and
+ * finally the object itself.  Asserts that no export reference is left
+ * (nid_exp_ref_count == 0).
+ */
+static void lprocfs_free_client_stats(struct nid_stat *client_stat)
+{
+	CDEBUG(D_CONFIG, "stat %p - data %p/%p\n", client_stat,
+	       client_stat->nid_proc, client_stat->nid_stats);
+
+	LASSERTF(atomic_read(&client_stat->nid_exp_ref_count) == 0,
+		 "nid %s:count %d\n", libcfs_nid2str(client_stat->nid),
+		 atomic_read(&client_stat->nid_exp_ref_count));
+
+	if (client_stat->nid_proc != NULL)
+		lprocfs_remove(&client_stat->nid_proc);
+	if (client_stat->nid_stats != NULL)
+		lprocfs_free_stats(&client_stat->nid_stats);
+	if (client_stat->nid_ldlm_stats != NULL)
+		lprocfs_free_stats(&client_stat->nid_ldlm_stats);
+
+	OBD_FREE_PTR(client_stat);
+}
+
+/*
+ * Drain obd->obd_nid_stats: every nid_stat on the list is unlinked,
+ * deleted from obd_nid_stats_hash and freed.
+ *
+ * The list exists in addition to the hash because the hash is torn down
+ * too early; no locking is taken here because all clients are gone by the
+ * time this runs (per the original author's notes).
+ */
+void lprocfs_free_per_client_stats(struct obd_device *obd)
+{
+	cfs_hash_t *hash = obd->obd_nid_stats_hash;
+	struct list_head *head = &obd->obd_nid_stats;
+	struct nid_stat *stat;
+	ENTRY;
+
+	while (!list_empty(head)) {
+		stat = list_entry(head->next, struct nid_stat, nid_list);
+
+		list_del_init(&stat->nid_list);
+		cfs_hash_del(hash, &stat->nid, &stat->nid_hash);
+		lprocfs_free_client_stats(stat);
+	}
+
+	EXIT;
+}
+EXPORT_SYMBOL(lprocfs_free_per_client_stats);
+
+/*
+ * Allocate a statistics object with room for \a num counters.
+ *
+ * \a flags selects the layout.  LPROCFS_STATS_FLAG_NOPERCPU (forced when
+ * lprocfs_no_percpu_stats is non-zero) gives a single counter set that is
+ * allocated right away.  Otherwise there is one slot per possible CPU;
+ * those slots are filled here only for LPROCFS_STATS_FLAG_IRQ_SAFE and are
+ * left NULL in the remaining case.
+ *
+ * Returns the new object, or NULL when \a num is 0 or an allocation
+ * fails (everything allocated so far is released again).
+ */
+struct lprocfs_stats *lprocfs_alloc_stats(unsigned int num,
+					  enum lprocfs_stats_flags flags)
+{
+	struct lprocfs_stats	*stats;
+	unsigned int		num_entry;
+	int			i;
+
+	if (num == 0)
+		return NULL;
+
+	if (lprocfs_no_percpu_stats != 0)
+		flags |= LPROCFS_STATS_FLAG_NOPERCPU;
+
+	num_entry = (flags & LPROCFS_STATS_FLAG_NOPERCPU) ?
+		    1 : num_possible_cpus();
+
+	/* the object ends in an array of num_entry per-cpu pointers */
+	LIBCFS_ALLOC(stats, offsetof(typeof(*stats), ls_percpu[num_entry]));
+	if (stats == NULL)
+		return NULL;
+
+	stats->ls_num = num;
+	stats->ls_flags = flags;
+	spin_lock_init(&stats->ls_lock);
+
+	/* one header per counter */
+	LIBCFS_ALLOC(stats->ls_cnt_header,
+		     stats->ls_num * sizeof(struct lprocfs_counter_header));
+	if (stats->ls_cnt_header == NULL)
+		goto fail;
+
+	if ((flags & LPROCFS_STATS_FLAG_NOPERCPU) != 0) {
+		/* a single set of counters shared by everybody */
+		unsigned int percpusize = lprocfs_stats_counter_size(stats);
+
+		LIBCFS_ALLOC_ATOMIC(stats->ls_percpu[0], percpusize);
+		if (stats->ls_percpu[0] == NULL)
+			goto fail;
+		stats->ls_biggest_alloc_num = 1;
+	} else if ((flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0) {
+		/* alloc all percpu data, currently only obd_memory use this */
+		for (i = 0; i < num_entry; ++i) {
+			if (lprocfs_stats_alloc_one(stats, i) < 0)
+				goto fail;
+		}
+	}
+
+	return stats;
+
+fail:
+	lprocfs_free_stats(&stats);
+	return NULL;
+}
+EXPORT_SYMBOL(lprocfs_alloc_stats);
+
+/*
+ * Free the statistics object *statsh and set *statsh to NULL.
+ *
+ * Does nothing (and leaves *statsh alone) when the object is NULL or has
+ * ls_num == 0.  Releases every non-NULL per-cpu counter set, the counter
+ * headers if present, and then the object itself.
+ */
+void lprocfs_free_stats(struct lprocfs_stats **statsh)
+{
+	struct lprocfs_stats *stats = *statsh;
+	unsigned int num_entry, percpusize, cpu;
+
+	if (stats == NULL || stats->ls_num == 0)
+		return;
+	*statsh = NULL;
+
+	num_entry = (stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) ?
+		    1 : num_possible_cpus();
+	percpusize = lprocfs_stats_counter_size(stats);
+
+	for (cpu = 0; cpu < num_entry; cpu++) {
+		if (stats->ls_percpu[cpu] != NULL)
+			LIBCFS_FREE(stats->ls_percpu[cpu], percpusize);
+	}
+
+	if (stats->ls_cnt_header != NULL) {
+		LIBCFS_FREE(stats->ls_cnt_header, stats->ls_num *
+					sizeof(struct lprocfs_counter_header));
+	}
+
+	LIBCFS_FREE(stats, offsetof(typeof(*stats), ls_percpu[num_entry]));
+}
+EXPORT_SYMBOL(lprocfs_free_stats);
+
+/*
+ * Reset every counter of \a stats to its initial value.
+ *
+ * For each allocated per-cpu counter set, every counter gets count, max,
+ * sum and sumsquare zeroed and min set to LC_MIN_INIT; lc_sum_irq is also
+ * zeroed when the object was created with LPROCFS_STATS_FLAG_IRQ_SAFE.
+ * The walk happens between lprocfs_stats_lock() and
+ * lprocfs_stats_unlock() with LPROCFS_GET_NUM_CPU; the lock call also
+ * provides the number of per-cpu slots to visit.
+ */
+void lprocfs_clear_stats(struct lprocfs_stats *stats)
+{
+	struct lprocfs_counter		*percpu_cntr;
+	unsigned long			flags = 0;
+	unsigned int			num_entry;
+	unsigned int			i;
+	int				j;
+
+	num_entry = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU, &flags);
+
+	for (i = 0; i < num_entry; i++) {
+		/* slots that were never allocated have nothing to clear */
+		if (stats->ls_percpu[i] == NULL)
+			continue;
+		for (j = 0; j < stats->ls_num; j++) {
+			percpu_cntr = lprocfs_stats_counter_get(stats, i, j);
+			percpu_cntr->lc_count		= 0;
+			percpu_cntr->lc_min		= LC_MIN_INIT;
+			percpu_cntr->lc_max		= 0;
+			percpu_cntr->lc_sumsquare	= 0;
+			percpu_cntr->lc_sum		= 0;
+			if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE)
+				percpu_cntr->lc_sum_irq	= 0;
+		}
+	}
+
+	lprocfs_stats_unlock(stats, LPROCFS_GET_NUM_CPU, &flags);
+}
+EXPORT_SYMBOL(lprocfs_clear_stats);
+
+/*
+ * Write handler of a stats file: whatever is written, all counters of the
+ * statistics object attached to the seq_file are cleared.  The user
+ * buffer is never read; the full \a len is reported as consumed.
+ */
+static ssize_t lprocfs_stats_seq_write(struct file *file,
+				       const char __user *buf,
+				       size_t len, loff_t *off)
+{
+	struct seq_file *seq = file->private_data;
+
+	lprocfs_clear_stats(seq->private);
+
+	return len;
+}
+
+/*
+ * seq_file start callback.  The iterator is the position itself: returns
+ * \a pos while *pos is a valid counter index (below ls_num), NULL once
+ * the end has been reached.
+ */
+static void *lprocfs_stats_seq_start(struct seq_file *p, loff_t *pos)
+{
+	struct lprocfs_stats *stats = p->private;
+
+	if (*pos >= stats->ls_num)
+		return NULL;
+
+	return pos;
+}
+
+/* seq_file stop callback: the start callback acquires nothing to release. */
+static void lprocfs_stats_seq_stop(struct seq_file *p, void *v)
+{
+	/* intentionally empty */
+}
+
+/*
+ * seq_file next callback: advance the position by one and let the start
+ * callback decide whether it is still inside the counter range.
+ */
+static void *lprocfs_stats_seq_next(struct seq_file *p, void *v, loff_t *pos)
+{
+	*pos += 1;
+
+	return lprocfs_stats_seq_start(p, pos);
+}
+
+/*
+ * seq file export of one lprocfs counter
+ *
+ * \a v points at the loff_t index of the counter to print.  For index 0 a
+ * "snapshot_time" line is emitted first.  A counter whose collected
+ * lc_count is 0 produces no line at all.  Otherwise the line holds name,
+ * sample count and units, followed by min/max/sum when the counter is
+ * configured with LPROCFS_CNTR_AVGMINMAX, and by the sum of squares when
+ * it additionally has LPROCFS_CNTR_STDDEV.
+ *
+ * Returns 0 on success or the negative value of the first failing
+ * seq_printf().
+ */
+static int lprocfs_stats_seq_show(struct seq_file *p, void *v)
+{
+	struct lprocfs_stats		*stats = p->private;
+	struct lprocfs_counter_header	*hdr;
+	struct lprocfs_counter		 ctr;
+	loff_t				*pos = v;
+	int				 idx = *pos;
+	int				 rc;
+
+	if (idx == 0) {
+		struct timeval now;
+
+		do_gettimeofday(&now);
+		rc = seq_printf(p, "%-25s %lu.%lu secs.usecs\n",
+				"snapshot_time", now.tv_sec, now.tv_usec);
+		if (rc < 0)
+			return rc;
+	}
+
+	hdr = &stats->ls_cnt_header[idx];
+	lprocfs_stats_collect(stats, idx, &ctr);
+
+	/* counters without samples are left out of the listing */
+	if (ctr.lc_count == 0)
+		return 0;
+
+	rc = seq_printf(p, "%-25s "LPD64" samples [%s]", hdr->lc_name,
+			ctr.lc_count, hdr->lc_units);
+	if (rc < 0)
+		return rc;
+
+	if ((hdr->lc_config & LPROCFS_CNTR_AVGMINMAX) && (ctr.lc_count > 0)) {
+		rc = seq_printf(p, " "LPD64" "LPD64" "LPD64,
+				ctr.lc_min, ctr.lc_max, ctr.lc_sum);
+		if (rc < 0)
+			return rc;
+
+		if (hdr->lc_config & LPROCFS_CNTR_STDDEV) {
+			rc = seq_printf(p, " "LPD64, ctr.lc_sumsquare);
+			if (rc < 0)
+				return rc;
+		}
+	}
+
+	rc = seq_printf(p, "\n");
+	if (rc < 0)
+		return rc;
+
+	return 0;
+}
+
+/* Iterator callbacks that present a stats object one counter per record. */
+struct seq_operations lprocfs_stats_seq_sops = {
+	.start	= lprocfs_stats_seq_start,
+	.next	= lprocfs_stats_seq_next,
+	.show	= lprocfs_stats_seq_show,
+	.stop	= lprocfs_stats_seq_stop,
+};
+
+/*
+ * Open handler of a stats file: set up the seq_file with the stats
+ * iterator and attach the proc entry's private data (PDE_DATA) to it as
+ * seq->private.  Returns 0 on success or the seq_open() error.
+ */
+static int lprocfs_stats_seq_open(struct inode *inode, struct file *file)
+{
+	int rc = seq_open(file, &lprocfs_stats_seq_sops);
+
+	if (rc == 0) {
+		struct seq_file *seq = file->private_data;
+
+		seq->private = PDE_DATA(inode);
+	}
+
+	return rc;
+}
+
+/*
+ * File operations of a stats proc file: reads go through the seq_file
+ * iterator, any write clears the counters.
+ */
+struct file_operations lprocfs_stats_seq_fops = {
+	.owner   = THIS_MODULE,
+	.open    = lprocfs_stats_seq_open,
+	.release = lprocfs_seq_release,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.write   = lprocfs_stats_seq_write,
+};
+
+/*
+ * Create the proc file \a name (mode 0644) under \a root, backed by
+ * lprocfs_stats_seq_fops with \a stats as its private data.
+ *
+ * \a root must not be NULL.  Returns 0 on success, -ENOMEM when the entry
+ * cannot be created.
+ */
+int lprocfs_register_stats(struct proc_dir_entry *root, const char *name,
+			   struct lprocfs_stats *stats)
+{
+	LASSERT(root != NULL);
+
+	if (proc_create_data(name, 0644, root,
+			     &lprocfs_stats_seq_fops, stats) == NULL)
+		return -ENOMEM;
+
+	return 0;
+}
+EXPORT_SYMBOL(lprocfs_register_stats);
+
+/*
+ * Set up counter \a index of \a stats.
+ *
+ * Stores \a conf, \a name and \a units in the counter's header (the two
+ * strings are kept by pointer, not copied) and resets the counter in every
+ * per-cpu set that is currently allocated: count, max, sum and sumsquare
+ * become 0, min becomes LC_MIN_INIT, and lc_sum_irq is zeroed for
+ * LPROCFS_STATS_FLAG_IRQ_SAFE objects.
+ *
+ * NOTE(review): the header assertion tests the address of an array
+ * element, so it can only fire for index 0 with a NULL ls_cnt_header.
+ */
+void lprocfs_counter_init(struct lprocfs_stats *stats, int index,
+			  unsigned conf, const char *name, const char *units)
+{
+	struct lprocfs_counter_header	*header;
+	struct lprocfs_counter		*cntr;
+	unsigned long			 flags = 0;
+	unsigned int			 num_cpu;
+	unsigned int			 cpu;
+
+	LASSERT(stats != NULL);
+
+	header = &stats->ls_cnt_header[index];
+	LASSERTF(header != NULL, "Failed to allocate stats header:[%d]%s/%s\n",
+		 index, name, units);
+
+	header->lc_config = conf;
+	header->lc_name   = name;
+	header->lc_units  = units;
+
+	num_cpu = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU, &flags);
+	for (cpu = 0; cpu < num_cpu; cpu++) {
+		if (stats->ls_percpu[cpu] != NULL) {
+			cntr = lprocfs_stats_counter_get(stats, cpu, index);
+			cntr->lc_count		= 0;
+			cntr->lc_min		= LC_MIN_INIT;
+			cntr->lc_max		= 0;
+			cntr->lc_sumsquare	= 0;
+			cntr->lc_sum		= 0;
+			if ((stats->ls_flags &
+			     LPROCFS_STATS_FLAG_IRQ_SAFE) != 0)
+				cntr->lc_sum_irq = 0;
+		}
+	}
+	lprocfs_stats_unlock(stats, LPROCFS_GET_NUM_CPU, &flags);
+}
+EXPORT_SYMBOL(lprocfs_counter_init);
+
+/*
+ * Initialise the counter that belongs to obd operation \a op: its index is
+ * \a base plus OBD_COUNTER_OFFSET(op), its name is the operation name and
+ * its unit is "reqs".  Asserts that the index lies inside \a stats.
+ * \a base and \a stats are parenthesised so that any expression may be
+ * passed for them.
+ */
+#define LPROCFS_OBD_OP_INIT(base, stats, op)				\
+do {									\
+	unsigned int coffset = (base) + OBD_COUNTER_OFFSET(op);		\
+	LASSERT(coffset < (stats)->ls_num);				\
+	lprocfs_counter_init((stats), coffset, 0, #op, "reqs");		\
+} while (0)
+
+/*
+ * Initialise one "reqs" counter in \a stats for every obd operation listed
+ * below, via LPROCFS_OBD_OP_INIT.  Each counter sits at index
+ * \a num_private_stats + OBD_COUNTER_OFFSET(op) and is named after the
+ * operation.
+ *
+ * The list has to name every operation that has a counter slot:
+ * lprocfs_alloc_obd_stats() asserts afterwards that no slot from
+ * num_private_stats onwards was left without a name.
+ */
+void lprocfs_init_ops_stats(int num_private_stats, struct lprocfs_stats *stats)
+{
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, iocontrol);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, get_info);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, set_info_async);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, attach);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, detach);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, setup);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, precleanup);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, cleanup);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, process_config);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, postrecov);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, add_conn);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, del_conn);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, connect);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, reconnect);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, disconnect);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, fid_init);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, fid_fini);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, fid_alloc);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, statfs);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, statfs_async);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, packmd);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, unpackmd);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, preallocate);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, precreate);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, create);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, create_async);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, destroy);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, setattr);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, setattr_async);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, getattr);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, getattr_async);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, brw);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, merge_lvb);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, adjust_kms);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, punch);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, sync);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, migrate);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, copy);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, iterate);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, preprw);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, commitrw);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, enqueue);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, change_cbdata);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, find_cbdata);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, cancel);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, cancel_unused);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, init_export);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, destroy_export);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, extent_calc);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, llog_init);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, llog_connect);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, llog_finish);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, pin);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, unpin);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, import_event);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, notify);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, health_check);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, get_uuid);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, quotacheck);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, quotactl);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, ping);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_new);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_rem);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_add);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_del);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, getref);
+	LPROCFS_OBD_OP_INIT(num_private_stats, stats, putref);
+}
+EXPORT_SYMBOL(lprocfs_init_ops_stats);
+
+/*
+ * Allocate the obd operation statistics of \a obd and publish them as the
+ * proc file "stats" in the device's proc directory.
+ *
+ * The object holds \a num_private_stats private counters followed by one
+ * counter per pointer-sized member of the obd operations table, minus one
+ * for o_owner.  On success obd->obd_stats and obd->obd_cntr_base are set.
+ *
+ * Preconditions (asserted): no stats allocated yet, the proc directory
+ * exists, obd_cntr_base is 0.
+ *
+ * Returns 0 on success, -ENOMEM if allocation fails, or the negative
+ * result of lprocfs_register_stats() (the stats are freed again).
+ */
+int lprocfs_alloc_obd_stats(struct obd_device *obd, unsigned num_private_stats)
+{
+	struct lprocfs_stats *stats;
+	unsigned int num_stats;
+	int rc, i;
+
+	LASSERT(obd->obd_stats == NULL);
+	LASSERT(obd->obd_proc_entry != NULL);
+	LASSERT(obd->obd_cntr_base == 0);
+
+	num_stats = ((int)sizeof(*obd->obd_type->typ_dt_ops) / sizeof(void *)) +
+		num_private_stats - 1 /* o_owner */;
+
+	stats = lprocfs_alloc_stats(num_stats, 0);
+	if (stats == NULL)
+		return -ENOMEM;
+
+	lprocfs_init_ops_stats(num_private_stats, stats);
+
+	/*
+	 * If this LBUGs, it is likely that an obd operation was added to
+	 * struct obd_ops in <obd.h>, and that the corresponding line item
+	 * LPROCFS_OBD_OP_INIT(.., .., opname) is missing from the list in
+	 * lprocfs_init_ops_stats().
+	 */
+	for (i = num_private_stats; i < num_stats; i++)
+		LASSERTF(stats->ls_cnt_header[i].lc_name != NULL,
+			 "Missing obd_stat initializer obd_op "
+			 "operation at offset %d.\n", i - num_private_stats);
+
+	rc = lprocfs_register_stats(obd->obd_proc_entry, "stats", stats);
+	if (rc < 0) {
+		lprocfs_free_stats(&stats);
+		return rc;
+	}
+
+	obd->obd_stats = stats;
+	obd->obd_cntr_base = num_private_stats;
+
+	return rc;
+}
+EXPORT_SYMBOL(lprocfs_alloc_obd_stats);
+
+/*
+ * Release the obd operation statistics of \a obd, if any.
+ *
+ * Besides clearing obd->obd_stats this also resets obd->obd_cntr_base to
+ * 0, mirroring lprocfs_free_md_stats().  lprocfs_alloc_obd_stats() asserts
+ * that obd_cntr_base is 0, so without the reset a later re-allocation
+ * with private counters would trip that assertion.
+ */
+void lprocfs_free_obd_stats(struct obd_device *obd)
+{
+	struct lprocfs_stats *stats = obd->obd_stats;
+
+	if (stats != NULL) {
+		obd->obd_stats = NULL;
+		obd->obd_cntr_base = 0;
+		lprocfs_free_stats(&stats);
+	}
+}
+EXPORT_SYMBOL(lprocfs_free_obd_stats);
+
+/*
+ * Initialise the counter that belongs to metadata operation \a op: its
+ * index is \a base plus MD_COUNTER_OFFSET(op), its name is the operation
+ * name and its unit is "reqs".  Asserts that the index lies inside
+ * \a stats.  \a base and \a stats are parenthesised so that any
+ * expression may be passed for them.
+ */
+#define LPROCFS_MD_OP_INIT(base, stats, op)				\
+do {									\
+	unsigned int coffset = (base) + MD_COUNTER_OFFSET(op);		\
+	LASSERT(coffset < (stats)->ls_num);				\
+	lprocfs_counter_init((stats), coffset, 0, #op, "reqs");		\
+} while (0)
+
+/*
+ * Initialise one "reqs" counter in \a stats for every metadata operation
+ * listed below, via LPROCFS_MD_OP_INIT.  Each counter sits at index
+ * \a num_private_stats + MD_COUNTER_OFFSET(op) and is named after the
+ * operation.
+ *
+ * lprocfs_alloc_md_stats() sizes the object from
+ * MD_COUNTER_OFFSET(revalidate_lock) and LBUGs if any slot from
+ * num_private_stats onwards was left without a name, so this list has to
+ * cover every slot up to and including revalidate_lock.
+ */
+void lprocfs_init_mps_stats(int num_private_stats, struct lprocfs_stats *stats)
+{
+	LPROCFS_MD_OP_INIT(num_private_stats, stats, getstatus);
+	LPROCFS_MD_OP_INIT(num_private_stats, stats, null_inode);
+	LPROCFS_MD_OP_INIT(num_private_stats, stats, find_cbdata);
+	LPROCFS_MD_OP_INIT(num_private_stats, stats, close);
+	LPROCFS_MD_OP_INIT(num_private_stats, stats, create);
+	LPROCFS_MD_OP_INIT(num_private_stats, stats, done_writing);
+	LPROCFS_MD_OP_INIT(num_private_stats, stats, enqueue);
+	LPROCFS_MD_OP_INIT(num_private_stats, stats, getattr);
+	LPROCFS_MD_OP_INIT(num_private_stats, stats, getattr_name);
+	LPROCFS_MD_OP_INIT(num_private_stats, stats, intent_lock);
+	LPROCFS_MD_OP_INIT(num_private_stats, stats, link);
+	LPROCFS_MD_OP_INIT(num_private_stats, stats, rename);
+	LPROCFS_MD_OP_INIT(num_private_stats, stats, is_subdir);
+	LPROCFS_MD_OP_INIT(num_private_stats, stats, setattr);
+	LPROCFS_MD_OP_INIT(num_private_stats, stats, sync);
+	LPROCFS_MD_OP_INIT(num_private_stats, stats, readpage);
+	LPROCFS_MD_OP_INIT(num_private_stats, stats, unlink);
+	LPROCFS_MD_OP_INIT(num_private_stats, stats, setxattr);
+	LPROCFS_MD_OP_INIT(num_private_stats, stats, getxattr);
+	LPROCFS_MD_OP_INIT(num_private_stats, stats, init_ea_size);
+	LPROCFS_MD_OP_INIT(num_private_stats, stats, get_lustre_md);
+	LPROCFS_MD_OP_INIT(num_private_stats, stats, free_lustre_md);
+	LPROCFS_MD_OP_INIT(num_private_stats, stats, set_open_replay_data);
+	LPROCFS_MD_OP_INIT(num_private_stats, stats, clear_open_replay_data);
+	LPROCFS_MD_OP_INIT(num_private_stats, stats, set_lock_data);
+	LPROCFS_MD_OP_INIT(num_private_stats, stats, lock_match);
+	LPROCFS_MD_OP_INIT(num_private_stats, stats, cancel_unused);
+	LPROCFS_MD_OP_INIT(num_private_stats, stats, renew_capa);
+	LPROCFS_MD_OP_INIT(num_private_stats, stats, unpack_capa);
+	LPROCFS_MD_OP_INIT(num_private_stats, stats, get_remote_perm);
+	LPROCFS_MD_OP_INIT(num_private_stats, stats, intent_getattr_async);
+	LPROCFS_MD_OP_INIT(num_private_stats, stats, revalidate_lock);
+}
+EXPORT_SYMBOL(lprocfs_init_mps_stats);
+
+/*
+ * Allocate the metadata operation statistics of \a obd and publish them as
+ * the proc file "md_stats" in the device's proc directory.
+ *
+ * The object holds \a num_private_stats private counters followed by one
+ * counter per metadata operation, up to and including revalidate_lock.
+ * On success obd->md_stats and obd->md_cntr_base are set.
+ *
+ * Preconditions (asserted): no md stats allocated yet, the proc directory
+ * exists, md_cntr_base is 0.
+ *
+ * Returns 0 on success, -ENOMEM if allocation fails, or the negative
+ * result of lprocfs_register_stats() (the stats are freed again).
+ */
+int lprocfs_alloc_md_stats(struct obd_device *obd,
+			   unsigned num_private_stats)
+{
+	struct lprocfs_stats *stats;
+	unsigned int num_stats;
+	int rc, i;
+
+	LASSERT(obd->md_stats == NULL);
+	LASSERT(obd->obd_proc_entry != NULL);
+	LASSERT(obd->md_cntr_base == 0);
+
+	num_stats = 1 + MD_COUNTER_OFFSET(revalidate_lock) +
+		    num_private_stats;
+
+	stats = lprocfs_alloc_stats(num_stats, 0);
+	if (stats == NULL)
+		return -ENOMEM;
+
+	lprocfs_init_mps_stats(num_private_stats, stats);
+
+	/* every operation slot must have been given a name by now */
+	for (i = num_private_stats; i < num_stats; i++) {
+		if (stats->ls_cnt_header[i].lc_name != NULL)
+			continue;
+
+		CERROR("Missing md_stat initializer md_op "
+		       "operation at offset %d. Aborting.\n",
+		       i - num_private_stats);
+		LBUG();
+	}
+
+	rc = lprocfs_register_stats(obd->obd_proc_entry, "md_stats", stats);
+	if (rc < 0) {
+		lprocfs_free_stats(&stats);
+		return rc;
+	}
+
+	obd->md_stats = stats;
+	obd->md_cntr_base = num_private_stats;
+
+	return rc;
+}
+EXPORT_SYMBOL(lprocfs_alloc_md_stats);
+
+/*
+ * Release the metadata operation statistics of \a obd, if any.  The
+ * device fields md_stats and md_cntr_base are reset before the object is
+ * freed.
+ */
+void lprocfs_free_md_stats(struct obd_device *obd)
+{
+	struct lprocfs_stats *stats = obd->md_stats;
+
+	if (stats == NULL)
+		return;
+
+	obd->md_stats = NULL;
+	obd->md_cntr_base = 0;
+	lprocfs_free_stats(&stats);
+}
+EXPORT_SYMBOL(lprocfs_free_md_stats);
+
+/*
+ * Initialise the six LDLM request counters of \a ldlm_stats.  Each counter
+ * is indexed by its opcode relative to LDLM_FIRST_OPC, uses the unit
+ * "reqs" and no extra configuration flags.
+ */
+void lprocfs_init_ldlm_stats(struct lprocfs_stats *ldlm_stats)
+{
+	const struct {
+		int		 opc;
+		const char	*name;
+	} ops[] = {
+		{ LDLM_ENQUEUE,		"ldlm_enqueue" },
+		{ LDLM_CONVERT,		"ldlm_convert" },
+		{ LDLM_CANCEL,		"ldlm_cancel" },
+		{ LDLM_BL_CALLBACK,	"ldlm_bl_callback" },
+		{ LDLM_CP_CALLBACK,	"ldlm_cp_callback" },
+		{ LDLM_GL_CALLBACK,	"ldlm_gl_callback" },
+	};
+	unsigned int i;
+
+	for (i = 0; i < sizeof(ops) / sizeof(ops[0]); i++)
+		lprocfs_counter_init(ldlm_stats,
+				     ops[i].opc - LDLM_FIRST_OPC,
+				     0, ops[i].name, "reqs");
+}
+EXPORT_SYMBOL(lprocfs_init_ldlm_stats);
+
+/*
+ * Hash iteration callback: print the client UUID of the export stored in
+ * \a hnode, one per line, into the seq_file passed as \a data.  Exports
+ * without per-NID stats attached are skipped.  Always returns 0.
+ */
+int lprocfs_exp_print_uuid(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+			   struct hlist_node *hnode, void *data)
+{
+	struct obd_export *exp = cfs_hash_object(hs, hnode);
+	struct seq_file *m = data;
+
+	if (exp->exp_nid_stats == NULL)
+		return 0;
+
+	seq_printf(m, "%s\n", obd_uuid2str(&exp->exp_client_uuid));
+	return 0;
+}
+
+/*
+ * seq_file show routine of the per-NID "uuid" file: runs
+ * lprocfs_exp_print_uuid() over the entries of the device's NID hash that
+ * are keyed by this nid_stat's NID.  Returns 0.
+ */
+static int
+lproc_exp_uuid_seq_show(struct seq_file *m, void *unused)
+{
+	struct nid_stat *stats = m->private;
+
+	cfs_hash_for_each_key(stats->nid_obd->obd_nid_hash, &stats->nid,
+			      lprocfs_exp_print_uuid, m);
+
+	return 0;
+}
+
+LPROC_SEQ_FOPS_RO(lproc_exp_uuid);
+
+/*
+ * Context handed to lprocfs_exp_print_hash() while iterating exports.
+ * Field order matters: lproc_exp_hash_seq_show() initialises it
+ * positionally.
+ */
+struct exp_hash_cb_data {
+	struct seq_file *m;	/* output the callback prints into */
+	bool		first;	/* true until the header has been printed */
+};
+
+/*
+ * Hash iteration callback: for an export that has a lock hash, print hash
+ * debug information into the seq_file carried by \a cb_data.  The debug
+ * header is printed once, before the first such export.  Always returns 0.
+ *
+ * NOTE(review): the table handed to cfs_hash_debug_str() is \a hs, the
+ * hash being iterated, not exp->exp_lock_hash that was just tested --
+ * confirm this is intended.
+ */
+int lprocfs_exp_print_hash(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+			   struct hlist_node *hnode, void *cb_data)
+{
+	struct exp_hash_cb_data *ctx = cb_data;
+	struct obd_export *exp = cfs_hash_object(hs, hnode);
+
+	if (exp->exp_lock_hash == NULL)
+		return 0;
+
+	if (ctx->first) {
+		cfs_hash_debug_header(ctx->m);
+		ctx->first = false;
+	}
+	cfs_hash_debug_str(hs, ctx->m);
+
+	return 0;
+}
+
+/*
+ * seq_file show routine of the per-NID "hash" file: runs
+ * lprocfs_exp_print_hash() over the entries of the device's NID hash that
+ * are keyed by this nid_stat's NID.  Returns 0.
+ */
+static int
+lproc_exp_hash_seq_show(struct seq_file *m, void *unused)
+{
+	struct nid_stat *stats = m->private;
+	struct exp_hash_cb_data cb_data = {
+		.m	= m,
+		.first	= true,
+	};
+
+	cfs_hash_for_each_key(stats->nid_obd->obd_nid_hash, &stats->nid,
+			      lprocfs_exp_print_hash, &cb_data);
+
+	return 0;
+}
+
+LPROC_SEQ_FOPS_RO(lproc_exp_hash);
+
+/*
+ * seq_file show routine of the "clear" file: prints a one-line usage hint.
+ * Returns whatever seq_printf() returns.
+ */
+int lprocfs_nid_stats_clear_read(struct seq_file *m, void *data)
+{
+	const char *hint =
+		"Write into this file to clear all nid stats and "
+		"stale nid entries";
+
+	return seq_printf(m, "%s\n", hint);
+}
+EXPORT_SYMBOL(lprocfs_nid_stats_clear_read);
+
+/*
+ * Callback for cfs_hash_cond_del() over the per-NID stats hash.
+ *
+ * \a obj is a struct nid_stat, \a data the list head collecting entries
+ * to free.  An entry whose export reference count is exactly 1 (held only
+ * by the hash) is moved onto that list under obd_nid_lock and 1 is
+ * returned.  Any other entry merely has its nid_stats counters cleared
+ * and 0 is returned.
+ */
+static int lprocfs_nid_stats_clear_write_cb(void *obj, void *data)
+{
+	struct nid_stat *stat = obj;
+	ENTRY;
+
+	CDEBUG(D_INFO,"refcnt %d\n", atomic_read(&stat->nid_exp_ref_count));
+
+	if (atomic_read(&stat->nid_exp_ref_count) != 1) {
+		/* still referenced by an export - only clear the data */
+		if (stat->nid_stats != NULL)
+			lprocfs_clear_stats(stat->nid_stats);
+		RETURN(0);
+	}
+
+	/* object has only hash references. */
+	spin_lock(&stat->nid_obd->obd_nid_lock);
+	list_move(&stat->nid_list, data);
+	spin_unlock(&stat->nid_obd->obd_nid_lock);
+
+	RETURN(1);
+}
+
+/*
+ * Write handler of the "clear" file.  \a data is the struct obd_device.
+ *
+ * Runs lprocfs_nid_stats_clear_write_cb() over the per-NID stats hash,
+ * which clears live entries and collects the stale ones on a local list;
+ * the collected entries are then freed.  The written buffer is not
+ * examined; \a count is returned.
+ */
+int lprocfs_nid_stats_clear_write(struct file *file, const char *buffer,
+				  unsigned long count, void *data)
+{
+	struct obd_device *obd = data;
+	struct nid_stat *stale;
+	LIST_HEAD(free_list);
+
+	cfs_hash_cond_del(obd->obd_nid_stats_hash,
+			  lprocfs_nid_stats_clear_write_cb, &free_list);
+
+	for (;;) {
+		if (list_empty(&free_list))
+			break;
+
+		stale = list_entry(free_list.next, struct nid_stat, nid_list);
+		list_del_init(&stale->nid_list);
+		lprocfs_free_client_stats(stale);
+	}
+
+	return count;
+}
+EXPORT_SYMBOL(lprocfs_nid_stats_clear_write);
+
+/*
+ * Attach per-NID statistics to export \a exp for client \a nid.
+ *
+ * A new nid_stat is allocated and offered to the device's
+ * obd_nid_stats_hash.  If the hash already holds an entry for this NID,
+ * that entry is attached to the export and the new one is dropped.
+ * Otherwise a proc directory named after the NID is created under the
+ * exports directory, with the files "uuid" and "hash", and the new entry
+ * is linked onto obd->obd_nid_stats.
+ *
+ * \a newnid is set to 1 only when a new entry was created, 0 otherwise.
+ *
+ * Returns:
+ *   0          on success, or when \a nid is NULL or LNET_NID_ANY
+ *              (nothing is done in that case);
+ *   -EINVAL    when the export, its device, the exports proc directory or
+ *              the stats hash is missing;
+ *   -EALREADY  when an entry for this NID existed and was attached;
+ *   -ENOMEM    on allocation failure or when the directory could not be
+ *              created;
+ *   other negative values decoded from an error pointer returned while
+ *   creating the directory or its files.
+ */
+int lprocfs_exp_setup(struct obd_export *exp, lnet_nid_t *nid, int *newnid)
+{
+	struct nid_stat *new_stat, *old_stat;
+	struct obd_device *obd = NULL;
+	proc_dir_entry_t *entry;
+	char *buffer = NULL;
+	int rc = 0;
+	ENTRY;
+
+	*newnid = 0;
+
+	if (!exp || !exp->exp_obd || !exp->exp_obd->obd_proc_exports_entry ||
+	    !exp->exp_obd->obd_nid_stats_hash)
+		RETURN(-EINVAL);
+
+	/* not test against zero because eric say:
+	 * You may only test nid against another nid, or LNET_NID_ANY.
+	 * Anything else is nonsense.*/
+	if (!nid || *nid == LNET_NID_ANY)
+		RETURN(0);
+
+	obd = exp->exp_obd;
+
+	CDEBUG(D_CONFIG, "using hash %p\n", obd->obd_nid_stats_hash);
+
+	OBD_ALLOC_PTR(new_stat);
+	if (new_stat == NULL)
+		RETURN(-ENOMEM);
+
+	new_stat->nid	       = *nid;
+	new_stat->nid_obd	   = exp->exp_obd;
+	/* we need set default refcount to 1 to balance obd_disconnect */
+	atomic_set(&new_stat->nid_exp_ref_count, 1);
+
+	old_stat = cfs_hash_findadd_unique(obd->obd_nid_stats_hash,
+					   nid, &new_stat->nid_hash);
+	CDEBUG(D_INFO, "Found stats %p for nid %s - ref %d\n",
+	       old_stat, libcfs_nid2str(*nid),
+	       atomic_read(&new_stat->nid_exp_ref_count));
+
+	/* We need to release old stats because lprocfs_exp_cleanup() hasn't
+	 * been and will never be called. */
+	if (exp->exp_nid_stats) {
+		nidstat_putref(exp->exp_nid_stats);
+		exp->exp_nid_stats = NULL;
+	}
+
+	/* Return -EALREADY here so that we know that the /proc
+	 * entry already has been created */
+	if (old_stat != new_stat) {
+		exp->exp_nid_stats = old_stat;
+		GOTO(destroy_new, rc = -EALREADY);
+	}
+	/* not found - create */
+	OBD_ALLOC(buffer, LNET_NIDSTR_SIZE);
+	if (buffer == NULL)
+		GOTO(destroy_new, rc = -ENOMEM);
+
+	memcpy(buffer, libcfs_nid2str(*nid), LNET_NIDSTR_SIZE);
+	new_stat->nid_proc = lprocfs_register(buffer,
+					      obd->obd_proc_exports_entry,
+					      NULL, NULL);
+	OBD_FREE(buffer, LNET_NIDSTR_SIZE);
+
+	/*
+	 * lprocfs_obd_setup() treats the result of lprocfs_register() as an
+	 * error pointer, so handle that encoding here as well as plain NULL.
+	 * An error pointer must not stay in nid_proc: the cleanup path
+	 * removes any non-NULL nid_proc.
+	 */
+	if (IS_ERR(new_stat->nid_proc)) {
+		rc = PTR_ERR(new_stat->nid_proc);
+		new_stat->nid_proc = NULL;
+	} else if (new_stat->nid_proc == NULL) {
+		rc = -ENOMEM;
+	}
+	if (rc != 0) {
+		CERROR("Error making export directory for nid %s\n",
+		       libcfs_nid2str(*nid));
+		GOTO(destroy_new_ns, rc);
+	}
+
+	entry = lprocfs_add_simple(new_stat->nid_proc, "uuid",
+				   new_stat, &lproc_exp_uuid_fops);
+	if (IS_ERR(entry)) {
+		CWARN("Error adding the NID stats file\n");
+		rc = PTR_ERR(entry);
+		GOTO(destroy_new_ns, rc);
+	}
+
+	entry = lprocfs_add_simple(new_stat->nid_proc, "hash",
+				   new_stat, &lproc_exp_hash_fops);
+	if (IS_ERR(entry)) {
+		CWARN("Error adding the hash file\n");
+		rc = PTR_ERR(entry);
+		GOTO(destroy_new_ns, rc);
+	}
+
+	exp->exp_nid_stats = new_stat;
+	*newnid = 1;
+	/* protect competitive add to list, not need locking on destroy */
+	spin_lock(&obd->obd_nid_lock);
+	list_add(&new_stat->nid_list, &obd->obd_nid_stats);
+	spin_unlock(&obd->obd_nid_lock);
+
+	RETURN(rc);
+
+destroy_new_ns:
+	/* undo the directory creation and the hash insertion */
+	if (new_stat->nid_proc != NULL)
+		lprocfs_remove(&new_stat->nid_proc);
+	cfs_hash_del(obd->obd_nid_stats_hash, nid, &new_stat->nid_hash);
+
+destroy_new:
+	/* drop the initial reference and free the unused entry */
+	nidstat_putref(new_stat);
+	OBD_FREE_PTR(new_stat);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(lprocfs_exp_setup);
+
+/*
+ * Detach the per-NID statistics from export \a exp: drop the export's
+ * reference on them and clear exp->exp_nid_stats.  Does nothing when the
+ * export has no stats or no device.  Always returns 0.
+ */
+int lprocfs_exp_cleanup(struct obd_export *exp)
+{
+	struct nid_stat *stat = exp->exp_nid_stats;
+
+	if (stat == NULL || exp->exp_obd == NULL)
+		RETURN(0);
+
+	nidstat_putref(stat);
+	exp->exp_nid_stats = NULL;
+
+	return 0;
+}
+EXPORT_SYMBOL(lprocfs_exp_cleanup);
+
+/*
+ * Parse a decimal integer from the user buffer into *val.  This is
+ * lprocfs_write_frac_helper() with a multiplier of 1; its return value is
+ * passed through.
+ */
+int lprocfs_write_helper(const char *buffer, unsigned long count,
+			 int *val)
+{
+	const int mult = 1;
+
+	return lprocfs_write_frac_helper(buffer, count, val, mult);
+}
+EXPORT_SYMBOL(lprocfs_write_helper);
+
+/*
+ * Parse a decimal number with an optional sign and optional fraction from
+ * the user buffer and store it, scaled by \a mult, in *val.
+ *
+ * At most 19 bytes are accepted.  A leading '-' negates the multiplier.
+ * The integer part is multiplied by \a mult; if a '.' follows, at most
+ * five characters after it are considered, and the fraction digits are
+ * scaled by \a mult and divided by 10^(number of digits) before being
+ * added.
+ *
+ * Returns 0 on success, -EINVAL when \a count exceeds 19 or no digits are
+ * found, -EFAULT when the copy from user space fails.
+ *
+ * NOTE(review): *val is assigned before the "no digits" check, so it is
+ * modified even when -EINVAL is returned for that reason.
+ */
+int lprocfs_write_frac_helper(const char *buffer, unsigned long count,
+			      int *val, int mult)
+{
+	char kernbuf[20];
+	char *cur = kernbuf;
+	char *end;
+	char *p;
+	int frac;
+	int scale;
+
+	if (count >= sizeof(kernbuf))
+		return -EINVAL;
+
+	if (copy_from_user(kernbuf, buffer, count))
+		return -EFAULT;
+	kernbuf[count] = '\0';
+
+	if (kernbuf[0] == '-') {
+		mult = -mult;
+		cur++;
+	}
+
+	*val = (int)simple_strtoul(cur, &end, 10) * mult;
+	if (end == cur)
+		return -EINVAL;
+
+	/* no fractional part: done */
+	if (end == NULL || *end != '.')
+		return 0;
+
+	cur = end + 1;
+	if (strlen(cur) > 5)
+		cur[5] = '\0'; /*only allow 5bits fractional*/
+
+	frac = (int)simple_strtoul(cur, &end, 10) * mult;
+	if (end > cur) {
+		/* scale = 10^(number of fraction digits parsed) */
+		scale = 1;
+		for (p = cur; p < end; p++)
+			scale *= 10;
+
+		*val += frac / scale;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(lprocfs_write_frac_helper);
+
+/*
+ * Format val / mult into \a buffer as a decimal number with a short
+ * fractional part, followed by a newline.
+ *
+ * \a count must be at least 10, otherwise -EINVAL is returned.  On success
+ * the return value is the number of bytes stored, including the trailing
+ * newline; no NUL terminator is stored after the newline.
+ *
+ * NOTE(review): snprintf() returns the length that would have been written,
+ * so after the second snprintf() prtn may be larger than the number of bytes
+ * actually stored; the trimming loop below appears to rely on this being
+ * harmless - confirm.  A non-positive remainder (val % mult <= 0) prints no
+ * fractional part.
+ */
+int lprocfs_read_frac_helper(char *buffer, unsigned long count, long val,
+			     int mult)
+{
+	long decimal_val, frac_val;
+	int prtn;
+
+	if (count < 10)
+		return -EINVAL;
+
+	/* integer part first */
+	decimal_val = val / mult;
+	prtn = snprintf(buffer, count, "%ld", decimal_val);
+	frac_val = val % mult;
+
+	if (prtn < (count - 4) && frac_val > 0) {
+		long temp_frac;
+		int i, temp_mult = 1, frac_bits = 0;
+
+		temp_frac = frac_val * 10;
+		buffer[prtn++] = '.';
+		/* emit leading zeros of the fraction (at most two) */
+		while (frac_bits < 2 && (temp_frac / mult) < 1 ) {
+			/* only reserved 2 bits fraction */
+			buffer[prtn++] ='0';
+			temp_frac *= 10;
+			frac_bits++;
+		}
+		/*
+		 * Need to think these cases :
+		 *      1. #echo x.00 > /proc/xxx       output result : x
+		 *      2. #echo x.0x > /proc/xxx       output result : x.0x
+		 *      3. #echo x.x0 > /proc/xxx       output result : x.x
+		 *      4. #echo x.xx > /proc/xxx       output result : x.xx
+		 *      Only reserved 2 bits fraction.
+		 */
+		for (i = 0; i < (5 - prtn); i++)
+			temp_mult *= 10;
+
+		frac_bits = min((int)count - prtn, 3 - frac_bits);
+		prtn += snprintf(buffer + prtn, frac_bits, "%ld",
+				 frac_val * temp_mult / mult);
+
+		/* walk back over anything that is not a digit 1-9 */
+		prtn--;
+		while(buffer[prtn] < '1' || buffer[prtn] > '9') {
+			prtn--;
+			if (buffer[prtn] == '.') {
+				prtn--;
+				break;
+			}
+		}
+		prtn++;
+	}
+	buffer[prtn++] ='\n';
+	return prtn;
+}
+EXPORT_SYMBOL(lprocfs_read_frac_helper);
+
+/*
+ * Print val / mult to the seq_file \a m as a decimal number with at most two
+ * fractional digits, followed by a newline.
+ *
+ * The fraction is truncated (not rounded) to hundredths and a trailing zero
+ * is dropped, so with mult == 100: 150 -> "1.5", 105 -> "1.05", 100 -> "1".
+ * A non-positive remainder (val % mult <= 0) prints no fractional part.
+ *
+ * Always returns 0.
+ */
+int lprocfs_seq_read_frac_helper(struct seq_file *m, long val, int mult)
+{
+	long decimal_val, frac_val;
+
+	decimal_val = val / mult;
+	seq_printf(m, "%ld", decimal_val);
+	frac_val = val % mult;
+
+	/* scale the remainder to hundredths */
+	if (frac_val > 0) {
+		frac_val *= 100;
+		frac_val /= mult;
+	}
+	if (frac_val > 0) {
+		/* Three cases: x0, xx, 0x */
+		if ((frac_val % 10) != 0)
+			/* zero-pad so that 0x prints as ".0x", not ".x" */
+			seq_printf(m, ".%02ld", frac_val);
+		else
+			seq_printf(m, ".%ld", frac_val / 10);
+	}
+
+	seq_printf(m, "\n");
+	return 0;
+}
+EXPORT_SYMBOL(lprocfs_seq_read_frac_helper);
+
+/*
+ * Parse a 64-bit value from the user buffer into \a val.
+ *
+ * Shorthand for lprocfs_write_frac_u64_helper() with a multiplier of 1; the
+ * return value is whatever that helper returns.
+ */
+int lprocfs_write_u64_helper(const char *buffer, unsigned long count,__u64 *val)
+{
+	return lprocfs_write_frac_u64_helper(buffer, count, val, 1);
+}
+EXPORT_SYMBOL(lprocfs_write_u64_helper);
+
+/*
+ * Parse a decimal number with an optional fraction and an optional binary
+ * unit suffix from the user buffer and store the scaled result in \a val.
+ *
+ * Accepted form: [-]digits[.digits][k|m|g|t|p] (suffix is case-insensitive).
+ * A unit suffix replaces the magnitude of \a mult with 2^10 .. 2^50; without
+ * a suffix the value is scaled by \a mult.  A leading '-' (or a negative
+ * \a mult) negates the result, which is stored in two's complement form.
+ * Only the first nine fractional digits contribute; further digits are
+ * skipped.
+ *
+ * Returns 0 on success, -EINVAL if the input is longer than
+ * sizeof(kernbuf) - 1 (21) bytes or does not start with a digit, -EFAULT if
+ * the user buffer cannot be copied.
+ *
+ * NOTE(review): fraction * multiplier is still computed in 64 bits, so a long
+ * fraction combined with a 't'/'p' suffix can overflow.
+ */
+int lprocfs_write_frac_u64_helper(const char *buffer, unsigned long count,
+			      __u64 *val, int mult)
+{
+	char kernbuf[22], *end, *pbuf;
+	__u64 whole, frac = 0, units, umult, result;
+	unsigned frac_d = 1;
+	int negative;
+
+	if (count > (sizeof(kernbuf) - 1))
+		return -EINVAL;
+
+	if (copy_from_user(kernbuf, buffer, count))
+		return -EFAULT;
+
+	kernbuf[count] = '\0';
+	pbuf = kernbuf;
+	if (*pbuf == '-') {
+		mult = -mult;
+		pbuf++;
+	}
+
+	whole = simple_strtoull(pbuf, &end, 10);
+	if (pbuf == end)
+		return -EINVAL;
+
+	if (*end == '.') {
+		int digits = 0;
+
+		/*
+		 * frac_d is the divisor handed to do_div() and must fit in
+		 * 32 bits: 10^9 does, 10^10 does not.  Accumulate at most
+		 * nine digits and skip the rest, without cutting the string
+		 * so that a trailing unit suffix is still seen.
+		 */
+		for (pbuf = end + 1; isdigit(*pbuf); pbuf++) {
+			if (digits < 9) {
+				frac = frac * 10 + (*pbuf - '0');
+				frac_d *= 10;
+				digits++;
+			}
+		}
+		end = pbuf;
+	}
+
+	units = 1;
+	switch (*end) {
+	case 'p': case 'P':
+		units <<= 10;
+		/* fallthrough */
+	case 't': case 'T':
+		units <<= 10;
+		/* fallthrough */
+	case 'g': case 'G':
+		units <<= 10;
+		/* fallthrough */
+	case 'm': case 'M':
+		units <<= 10;
+		/* fallthrough */
+	case 'k': case 'K':
+		units <<= 10;
+		break;
+	default:
+		break;
+	}
+
+	/*
+	 * Keep the sign separately and do the arithmetic on magnitudes in
+	 * 64 bits: storing 2^40 or 2^50 in an int would truncate, and a
+	 * negative multiplier would corrupt the unsigned division below.
+	 */
+	negative = mult < 0;
+	umult = negative ? -(__u64)mult : (__u64)mult;
+	/* Specified units override the multiplier */
+	if (units > 1)
+		umult = units;
+
+	frac *= umult;
+	do_div(frac, frac_d);
+	result = whole * umult + frac;
+	*val = negative ? -result : result;
+	return 0;
+}
+EXPORT_SYMBOL(lprocfs_write_frac_u64_helper);
+
+/*
+ * Locate the first occurrence of the NUL-terminated string \a s2 within the
+ * first \a len bytes of \a s1.  An empty \a s2 matches at \a s1.
+ * Returns a pointer to the match, or NULL if there is none.
+ */
+static char *lprocfs_strnstr(const char *s1, const char *s2, size_t len)
+{
+	const size_t needle_len = strlen(s2);
+	const char *pos;
+
+	if (needle_len == 0)
+		return (char *)s1;
+
+	/* stop once fewer than needle_len bytes remain */
+	for (pos = s1; len >= needle_len; pos++, len--) {
+		if (memcmp(pos, s2, needle_len) == 0)
+			return (char *)pos;
+	}
+	return NULL;
+}
+
+/**
+ * Look for \a name within the first \a *count bytes of \a buffer.
+ *
+ * When found, skip the name and any white space following it, and return a
+ * pointer to the run of alphanumeric characters that comes next; \a *count is
+ * set to the length of that run.  When \a name is not found, \a buffer is
+ * returned and \a *count is left unchanged.
+ */
+char *lprocfs_find_named_value(const char *buffer, const char *name,
+				unsigned long *count)
+{
+	const char *limit = buffer + *count;
+	char *start;
+	char *cur;
+
+	/* there is no strnstr() in rhel5 and ubuntu kernels */
+	cur = lprocfs_strnstr(buffer, name, *count);
+	if (cur == NULL)
+		return (char *)buffer;
+
+	/* step over the name itself, then over the separator */
+	cur += strlen(name);
+	while (cur < limit && isspace(*cur))
+		cur++;
+
+	/* measure the value */
+	start = cur;
+	while (cur < limit && isalnum(*cur))
+		cur++;
+
+	*count = cur - start;
+	return start;
+}
+EXPORT_SYMBOL(lprocfs_find_named_value);
+
+/*
+ * Create the proc entry \a name under \a parent, served by \a seq_fops with
+ * \a data attached.  Asserts that the entry has write permission bits if and
+ * only if \a seq_fops provides a write method.
+ *
+ * Returns 0 on success, -ENOMEM if the entry could not be created.
+ */
+int lprocfs_seq_create(proc_dir_entry_t *parent,
+		       const char *name,
+		       mode_t mode,
+		       const struct file_operations *seq_fops,
+		       void *data)
+{
+	ENTRY;
+
+	/* Disallow secretly (un)writable entries. */
+	LASSERT((seq_fops->write == NULL) == ((mode & 0222) == 0));
+
+	if (proc_create_data(name, mode, parent, seq_fops, data) == NULL)
+		RETURN(-ENOMEM);
+
+	RETURN(0);
+}
+EXPORT_SYMBOL(lprocfs_seq_create);
+
+/*
+ * Create a seq_file proc entry under the proc directory of \a dev
+ * (dev->obd_proc_entry).  Arguments and return value are those of
+ * lprocfs_seq_create().
+ */
+int lprocfs_obd_seq_create(struct obd_device *dev,
+			   const char *name,
+			   mode_t mode,
+			   const struct file_operations *seq_fops,
+			   void *data)
+{
+	return (lprocfs_seq_create(dev->obd_proc_entry, name,
+				   mode, seq_fops, data));
+}
+EXPORT_SYMBOL(lprocfs_obd_seq_create);
+
+/*
+ * Count one sample in bucket \a value of histogram \a oh.  Values at or
+ * beyond OBD_HIST_MAX are counted in the last bucket.  The increment is done
+ * under oh->oh_lock.
+ */
+void lprocfs_oh_tally(struct obd_histogram *oh, unsigned int value)
+{
+	unsigned int bucket = value < OBD_HIST_MAX ? value : OBD_HIST_MAX - 1;
+
+	spin_lock(&oh->oh_lock);
+	oh->oh_buckets[bucket]++;
+	spin_unlock(&oh->oh_lock);
+}
+EXPORT_SYMBOL(lprocfs_oh_tally);
+
+/*
+ * Count one sample of \a value in the log2 bucket of histogram \a oh: the
+ * bucket index is the smallest val with 2^val >= value (0 for values 0 and
+ * 1).  lprocfs_oh_tally() clamps the index to the last bucket.
+ */
+void lprocfs_oh_tally_log2(struct obd_histogram *oh, unsigned int value)
+{
+	unsigned int val;
+
+	/*
+	 * Shift a 64-bit one: "1 << val" on a plain int is undefined once
+	 * val reaches 31, which happens for any value above 2^30.  With a
+	 * 64-bit operand the loop ends by val == 32 at the latest, because
+	 * 2^32 exceeds every unsigned int.
+	 */
+	for (val = 0; (val <= OBD_HIST_MAX) && ((1ULL << val) < value); val++)
+		;
+
+	lprocfs_oh_tally(oh, val);
+}
+EXPORT_SYMBOL(lprocfs_oh_tally_log2);
+
+/*
+ * Return the total number of samples in histogram \a oh, i.e. the sum of all
+ * OBD_HIST_MAX buckets.  The buckets are read without taking oh->oh_lock.
+ */
+unsigned long lprocfs_oh_sum(struct obd_histogram *oh)
+{
+	unsigned long total = 0;
+	int idx = 0;
+
+	while (idx < OBD_HIST_MAX)
+		total += oh->oh_buckets[idx++];
+
+	return total;
+}
+EXPORT_SYMBOL(lprocfs_oh_sum);
+
+/*
+ * Reset every bucket of histogram \a oh to zero, under oh->oh_lock.
+ */
+void lprocfs_oh_clear(struct obd_histogram *oh)
+{
+	spin_lock(&oh->oh_lock);
+	memset(oh->oh_buckets, 0, sizeof(oh->oh_buckets));
+	spin_unlock(&oh->oh_lock);
+}
+EXPORT_SYMBOL(lprocfs_oh_clear);
+
+/*
+ * seq_file show method printing cl_max_pages_per_rpc of a client obd.
+ *
+ * \a data is used as a struct obd_device pointer whose u.cli member is the
+ * client_obd.  The value is read and printed while holding
+ * cl_loi_list_lock.  Returns the result of seq_printf().
+ */
+int lprocfs_obd_rd_max_pages_per_rpc(struct seq_file *m, void *data)
+{
+	struct obd_device *dev = data;
+	struct client_obd *cli = &dev->u.cli;
+	int rc;
+
+	client_obd_list_lock(&cli->cl_loi_list_lock);
+	rc = seq_printf(m, "%d\n", cli->cl_max_pages_per_rpc);
+	client_obd_list_unlock(&cli->cl_loi_list_lock);
+	return rc;
+}
+EXPORT_SYMBOL(lprocfs_obd_rd_max_pages_per_rpc);
+
+#endif /* LPROCFS*/

diff --git a/drivers/staging/lustre/lustre/obdclass/lu_object.c b/drivers/staging/lustre/lustre/obdclass/lu_object.c
new file mode 100644
index 0000000..fdf0ed3
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/lu_object.c

@@ -0,0 +1,2185 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/lu_object.c
+ *
+ * Lustre Object.
+ * These are the only exported functions, they provide some generic
+ * infrastructure for managing object devices
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/libcfs/libcfs.h>
+
+# include <linux/module.h>
+
+/* hash_long() */
+#include <linux/libcfs/libcfs_hash.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre_disk.h>
+#include <lustre_fid.h>
+#include <lu_object.h>
+#include <lu_ref.h>
+#include <linux/list.h>
+
+static void lu_object_free(const struct lu_env *env, struct lu_object *o);
+
+/**
+ * Decrease reference counter on object. If last reference is freed, return
+ * object to the cache, unless lu_object_is_dying(o) holds. In the latter
+ * case, free object immediately.
+ *
+ * Objects with a zero fid are not kept in the site hash or LRU; their last
+ * reference frees them directly.
+ */
+void lu_object_put(const struct lu_env *env, struct lu_object *o)
+{
+	struct lu_site_bkt_data *bkt;
+	struct lu_object_header *top;
+	struct lu_site	  *site;
+	struct lu_object	*orig;
+	cfs_hash_bd_t	    bd;
+	const struct lu_fid     *fid;
+
+	top  = o->lo_header;
+	site = o->lo_dev->ld_site;
+	/* o is reused as a loop cursor below; orig keeps the caller's slice */
+	orig = o;
+
+	/*
+	 * till we have full fids-on-OST implemented anonymous objects
+	 * are possible in OSP. such an object isn't listed in the site
+	 * so we should not remove it from the site.
+	 */
+	fid = lu_object_fid(o);
+	if (fid_is_zero(fid)) {
+		LASSERT(top->loh_hash.next == NULL
+			&& top->loh_hash.pprev == NULL);
+		LASSERT(list_empty(&top->loh_lru));
+		if (!atomic_dec_and_test(&top->loh_ref))
+			return;
+		list_for_each_entry_reverse(o, &top->loh_layers, lo_linkage) {
+			if (o->lo_ops->loo_object_release != NULL)
+				o->lo_ops->loo_object_release(env, o);
+		}
+		lu_object_free(env, orig);
+		return;
+	}
+
+	cfs_hash_bd_get(site->ls_obj_hash, &top->loh_fid, &bd);
+	bkt = cfs_hash_bd_extra_get(site->ls_obj_hash, &bd);
+
+	/*
+	 * Drop one reference. A zero return means other references remain
+	 * and nothing more is done here; otherwise the code below runs with
+	 * the bucket locked and releases it with cfs_hash_bd_unlock().
+	 */
+	if (!cfs_hash_bd_dec_and_lock(site->ls_obj_hash, &bd, &top->loh_ref)) {
+		if (lu_object_is_dying(top)) {
+
+			/*
+			 * somebody may be waiting for this, currently only
+			 * used for cl_object, see cl_object_put_last().
+			 */
+			wake_up_all(&bkt->lsb_marche_funebre);
+		}
+		return;
+	}
+
+	LASSERT(bkt->lsb_busy > 0);
+	bkt->lsb_busy--;
+	/*
+	 * When last reference is released, iterate over object
+	 * layers, and notify them that object is no longer busy.
+	 */
+	list_for_each_entry_reverse(o, &top->loh_layers, lo_linkage) {
+		if (o->lo_ops->loo_object_release != NULL)
+			o->lo_ops->loo_object_release(env, o);
+	}
+
+	/* a live object goes to the tail of the bucket LRU and stays cached */
+	if (!lu_object_is_dying(top)) {
+		LASSERT(list_empty(&top->loh_lru));
+		list_add_tail(&top->loh_lru, &bkt->lsb_lru);
+		cfs_hash_bd_unlock(site->ls_obj_hash, &bd, 1);
+		return;
+	}
+
+	/*
+	 * If object is dying (will not be cached), removed it
+	 * from hash table and LRU.
+	 *
+	 * This is done with hash table and LRU lists locked. As the only
+	 * way to acquire first reference to previously unreferenced
+	 * object is through hash-table lookup (lu_object_find()),
+	 * or LRU scanning (lu_site_purge()), that are done under hash-table
+	 * and LRU lock, no race with concurrent object lookup is possible
+	 * and we can safely destroy object below.
+	 */
+	if (!test_and_set_bit(LU_OBJECT_UNHASHED, &top->loh_flags))
+		cfs_hash_bd_del_locked(site->ls_obj_hash, &bd, &top->loh_hash);
+	cfs_hash_bd_unlock(site->ls_obj_hash, &bd, 1);
+	/*
+	 * Object was already removed from hash and lru above, can
+	 * kill it.
+	 */
+	lu_object_free(env, orig);
+}
+EXPORT_SYMBOL(lu_object_put);
+
+/**
+ * Release a reference on \a o after flagging its header with
+ * LU_OBJECT_HEARD_BANSHEE, so that the object is not kept in the cache.
+ * This is a temporary solution for multi-site objects whose layering is not
+ * constant.
+ */
+void lu_object_put_nocache(const struct lu_env *env, struct lu_object *o)
+{
+	struct lu_object_header *hdr = o->lo_header;
+
+	set_bit(LU_OBJECT_HEARD_BANSHEE, &hdr->loh_flags);
+	lu_object_put(env, o);
+}
+EXPORT_SYMBOL(lu_object_put_nocache);
+
+/**
+ * Kill the object and take it out of LRU cache.
+ * Currently used by client code for layout change.
+ *
+ * The header is flagged LU_OBJECT_HEARD_BANSHEE unconditionally. Removal
+ * from the LRU and the hash table happens only for the caller that is first
+ * to set LU_OBJECT_UNHASHED, and is done under the bucket lock.
+ */
+void lu_object_unhash(const struct lu_env *env, struct lu_object *o)
+{
+	struct lu_object_header *top;
+
+	top = o->lo_header;
+	set_bit(LU_OBJECT_HEARD_BANSHEE, &top->loh_flags);
+	if (!test_and_set_bit(LU_OBJECT_UNHASHED, &top->loh_flags)) {
+		cfs_hash_t *obj_hash = o->lo_dev->ld_site->ls_obj_hash;
+		cfs_hash_bd_t bd;
+
+		cfs_hash_bd_get_and_lock(obj_hash, &top->loh_fid, &bd, 1);
+		list_del_init(&top->loh_lru);
+		cfs_hash_bd_del_locked(obj_hash, &bd, &top->loh_hash);
+		cfs_hash_bd_unlock(obj_hash, &bd, 1);
+	}
+}
+EXPORT_SYMBOL(lu_object_unhash);
+
+/**
+ * Allocate new object.
+ *
+ * This follows object creation protocol, described in the comment within
+ * struct lu_device_operations definition.
+ *
+ * \retval top-level slice of the new object on success
+ * \retval ERR_PTR(-ENOMEM) if ->ldo_object_alloc() returned NULL
+ * \retval ERR_PTR(errno) returned by ->ldo_object_alloc(), or built from the
+ *	 non-zero result of ->loo_object_init() / ->loo_object_start(); in the
+ *	 latter two cases the partially built object has been freed
+ */
+static struct lu_object *lu_object_alloc(const struct lu_env *env,
+					 struct lu_device *dev,
+					 const struct lu_fid *f,
+					 const struct lu_object_conf *conf)
+{
+	struct lu_object *scan;
+	struct lu_object *top;
+	struct list_head *layers;
+	int clean;
+	int result;
+	ENTRY;
+
+	/*
+	 * Create top-level object slice. This will also create
+	 * lu_object_header.
+	 */
+	top = dev->ld_ops->ldo_object_alloc(env, NULL, dev);
+	if (top == NULL)
+		RETURN(ERR_PTR(-ENOMEM));
+	if (IS_ERR(top))
+		RETURN(top);
+	/*
+	 * This is the only place where object fid is assigned. It's constant
+	 * after this point.
+	 */
+	top->lo_header->loh_fid = *f;
+	layers = &top->lo_header->loh_layers;
+	do {
+		/*
+		 * Call ->loo_object_init() repeatedly, until no more new
+		 * object slices are created.
+		 */
+		clean = 1;
+		list_for_each_entry(scan, layers, lo_linkage) {
+			/* LU_OBJECT_ALLOCATED marks slices already set up */
+			if (scan->lo_flags & LU_OBJECT_ALLOCATED)
+				continue;
+			clean = 0;
+			scan->lo_header = top->lo_header;
+			result = scan->lo_ops->loo_object_init(env, scan, conf);
+			if (result != 0) {
+				lu_object_free(env, top);
+				RETURN(ERR_PTR(result));
+			}
+			scan->lo_flags |= LU_OBJECT_ALLOCATED;
+		}
+	} while (!clean);
+
+	/* start the layers in reverse list order */
+	list_for_each_entry_reverse(scan, layers, lo_linkage) {
+		if (scan->lo_ops->loo_object_start != NULL) {
+			result = scan->lo_ops->loo_object_start(env, scan);
+			if (result != 0) {
+				lu_object_free(env, top);
+				RETURN(ERR_PTR(result));
+			}
+		}
+	}
+
+	lprocfs_counter_incr(dev->ld_site->ls_stats, LU_SS_CREATED);
+	RETURN(top);
+}
+
+/**
+ * Free an object.
+ *
+ * Calls ->loo_object_delete() on every layer, then ->loo_object_free() on
+ * every layer, and finally wakes up any waiters on the object's bucket wait
+ * queue (lsb_marche_funebre).
+ */
+static void lu_object_free(const struct lu_env *env, struct lu_object *o)
+{
+	struct lu_site_bkt_data *bkt;
+	struct lu_site	  *site;
+	struct lu_object	*scan;
+	struct list_head	      *layers;
+	struct list_head	       splice;
+
+	/* everything needed from the header is fetched before it is freed */
+	site   = o->lo_dev->ld_site;
+	layers = &o->lo_header->loh_layers;
+	bkt    = lu_site_bkt_from_fid(site, &o->lo_header->loh_fid);
+	/*
+	 * First call ->loo_object_delete() method to release all resources.
+	 */
+	list_for_each_entry_reverse(scan, layers, lo_linkage) {
+		if (scan->lo_ops->loo_object_delete != NULL)
+			scan->lo_ops->loo_object_delete(env, scan);
+	}
+
+	/*
+	 * Then, splice object layers into stand-alone list, and call
+	 * ->loo_object_free() on all layers to free memory. Splice is
+	 * necessary, because lu_object_header is freed together with the
+	 * top-level slice.
+	 */
+	INIT_LIST_HEAD(&splice);
+	list_splice_init(layers, &splice);
+	while (!list_empty(&splice)) {
+		/*
+		 * Free layers in bottom-to-top order, so that object header
+		 * lives as long as possible and ->loo_object_free() methods
+		 * can look at its contents.
+		 */
+		o = container_of0(splice.prev, struct lu_object, lo_linkage);
+		list_del_init(&o->lo_linkage);
+		LASSERT(o->lo_ops->loo_object_free != NULL);
+		o->lo_ops->loo_object_free(env, o);
+	}
+
+	if (waitqueue_active(&bkt->lsb_marche_funebre))
+		wake_up_all(&bkt->lsb_marche_funebre);
+}
+
+/**
+ * Free \a nr objects from the cold end of the site LRU list.
+ *
+ * Passing ~0 as \a nr removes the limit. The scan starts at the bucket saved
+ * in s->ls_purge_start and takes at most nr / CFS_HASH_NBKT + 1 objects from
+ * each bucket per pass.
+ *
+ * \retval the part of \a nr that was not used up (0 when \a nr objects were
+ *	 freed); 0 as well when OBD_FAIL_OBD_NO_LRU is set
+ */
+int lu_site_purge(const struct lu_env *env, struct lu_site *s, int nr)
+{
+	struct lu_object_header *h;
+	struct lu_object_header *temp;
+	struct lu_site_bkt_data *bkt;
+	cfs_hash_bd_t	    bd;
+	cfs_hash_bd_t	    bd2;
+	struct list_head	       dispose;
+	int		      did_sth;
+	int		      start;
+	int		      count;
+	int		      bnr;
+	int		      i;
+
+	if (OBD_FAIL_CHECK(OBD_FAIL_OBD_NO_LRU))
+		RETURN(0);
+
+	INIT_LIST_HEAD(&dispose);
+	/*
+	 * Under LRU list lock, scan LRU list and move unreferenced objects to
+	 * the dispose list, removing them from LRU and hash table.
+	 */
+	start = s->ls_purge_start;
+	/* per-bucket quota; -1 means no per-bucket limit */
+	bnr = (nr == ~0) ? -1 : nr / CFS_HASH_NBKT(s->ls_obj_hash) + 1;
+ again:
+	did_sth = 0;
+	cfs_hash_for_each_bucket(s->ls_obj_hash, &bd, i) {
+		/* skip the buckets before the starting point */
+		if (i < start)
+			continue;
+		count = bnr;
+		cfs_hash_bd_lock(s->ls_obj_hash, &bd, 1);
+		bkt = cfs_hash_bd_extra_get(s->ls_obj_hash, &bd);
+
+		list_for_each_entry_safe(h, temp, &bkt->lsb_lru, loh_lru) {
+			LASSERT(atomic_read(&h->loh_ref) == 0);
+
+			cfs_hash_bd_get(s->ls_obj_hash, &h->loh_fid, &bd2);
+			LASSERT(bd.bd_bucket == bd2.bd_bucket);
+
+			cfs_hash_bd_del_locked(s->ls_obj_hash,
+					       &bd2, &h->loh_hash);
+			list_move(&h->loh_lru, &dispose);
+			if (did_sth == 0)
+				did_sth = 1;
+
+			if (nr != ~0 && --nr == 0)
+				break;
+
+			if (count > 0 && --count == 0)
+				break;
+
+		}
+		cfs_hash_bd_unlock(s->ls_obj_hash, &bd, 1);
+		cond_resched();
+		/*
+		 * Free everything on the dispose list. This is safe against
+		 * races due to the reasons described in lu_object_put().
+		 */
+		while (!list_empty(&dispose)) {
+			h = container_of0(dispose.next,
+					  struct lu_object_header, loh_lru);
+			list_del_init(&h->loh_lru);
+			lu_object_free(env, lu_object_top(h));
+			lprocfs_counter_incr(s->ls_stats, LU_SS_LRU_PURGED);
+		}
+
+		if (nr == 0)
+			break;
+	}
+
+	/* quota not used up and this pass freed something: wrap around */
+	if (nr != 0 && did_sth && start != 0) {
+		start = 0; /* restart from the first bucket */
+		goto again;
+	}
+	/* race on s->ls_purge_start, but nobody cares */
+	s->ls_purge_start = i % CFS_HASH_NBKT(s->ls_obj_hash);
+
+	return nr;
+}
+EXPORT_SYMBOL(lu_site_purge);
+
+/*
+ * Object printing.
+ *
+ * Code below has to jump through certain hoops to output object description
+ * into libcfs_debug_msg-based log. The problem is that lu_object_print()
+ * composes object description from strings that are parts of _lines_ of
+ * output (i.e., strings that are not terminated by newline). This doesn't fit
+ * very well into libcfs_debug_msg() interface that assumes that each message
+ * supplied to it is a self-contained output line.
+ *
+ * To work around this, strings are collected in a temporary buffer
+ * (implemented as a value of lu_cdebug_key key), until terminating newline
+ * character is detected.
+ *
+ */
+
+enum {
+	/**
+	 * Maximal line size, in bytes, of the buffer in which
+	 * lu_cdebug_printer() collects one output line.
+	 *
+	 * XXX overflow is not handled correctly.
+	 */
+	LU_CDEBUG_LINE = 512
+};
+
+/**
+ * Per-context value of lu_global_key.
+ */
+struct lu_cdebug_data {
+	/**
+	 * Temporary buffer holding the part of the current line that has
+	 * been printed so far, as a NUL-terminated string.
+	 */
+	char lck_area[LU_CDEBUG_LINE];
+};
+
+/* context key constructor/destructor: lu_global_key_init, lu_global_key_fini */
+LU_KEY_INIT_FINI(lu_global, struct lu_cdebug_data);
+
+/**
+ * Key, holding temporary buffer. This key is registered very early by
+ * lu_global_init().
+ *
+ * It is tagged for MD, DT, MG and CL thread contexts.
+ */
+struct lu_context_key lu_global_key = {
+	.lct_tags = LCT_MD_THREAD | LCT_DT_THREAD |
+		    LCT_MG_THREAD | LCT_CL_THREAD,
+	.lct_init = lu_global_key_init,
+	.lct_fini = lu_global_key_fini
+};
+
+/**
+ * Printer function emitting messages through libcfs_debug_msg().
+ *
+ * Output is accumulated in the lu_global_key buffer of \a env until a chunk
+ * whose \a format ends in a newline arrives; the collected line is then
+ * passed to libcfs_debug_msg() (if cfs_cdebug_show() allows it for the
+ * message mask and subsystem) and the buffer is reset. Text that does not
+ * fit into the remaining space of the buffer is truncated.
+ *
+ * \a cookie is used as a struct libcfs_debug_msg_data pointer.
+ *
+ * \retval 0 always
+ */
+int lu_cdebug_printer(const struct lu_env *env,
+		      void *cookie, const char *format, ...)
+{
+	struct libcfs_debug_msg_data *msgdata = cookie;
+	struct lu_cdebug_data	*key;
+	size_t fmt_len;
+	int used;
+	int complete;
+	va_list args;
+
+	va_start(args, format);
+
+	key = lu_context_key_get(&env->le_ctx, &lu_global_key);
+	LASSERT(key != NULL);
+
+	used = strlen(key->lck_area);
+	/* an empty format must not be indexed at format[-1] */
+	fmt_len = strlen(format);
+	complete = fmt_len > 0 && format[fmt_len - 1] == '\n';
+	/*
+	 * Append new chunk to the buffer.
+	 */
+	vsnprintf(key->lck_area + used,
+		  ARRAY_SIZE(key->lck_area) - used, format, args);
+	if (complete) {
+		if (cfs_cdebug_show(msgdata->msg_mask, msgdata->msg_subsys))
+			libcfs_debug_msg(msgdata, "%s", key->lck_area);
+		/* start the next line with an empty buffer */
+		key->lck_area[0] = 0;
+	}
+	va_end(args);
+	return 0;
+}
+EXPORT_SYMBOL(lu_cdebug_printer);
+
+/**
+ * Print object header \a hdr through \a printer: its address, flags,
+ * reference count and fid, followed by " hash" when it is hashed, " lru"
+ * when it is on an LRU list and " exist" when LOHA_EXISTS is set.
+ */
+void lu_object_header_print(const struct lu_env *env, void *cookie,
+			    lu_printer_t printer,
+			    const struct lu_object_header *hdr)
+{
+	const char *hashed;
+	const char *on_lru;
+	const char *exists;
+
+	hashed = hlist_unhashed(&hdr->loh_hash) ? "" : " hash";
+	on_lru = list_empty((struct list_head *)&hdr->loh_lru) ? "" : " lru";
+	exists = (hdr->loh_attr & LOHA_EXISTS) ? " exist" : "";
+
+	(*printer)(env, cookie, "header@%p[%#lx, %d, "DFID"%s%s%s]",
+		   hdr, hdr->loh_flags, atomic_read(&hdr->loh_ref),
+		   PFID(&hdr->loh_fid), hashed, on_lru, exists);
+}
+EXPORT_SYMBOL(lu_object_header_print);
+
+/**
+ * Print human readable representation of the \a o to the \a printer: the
+ * object header, then one line per layer (indented by the layer depth and
+ * carrying the device type name, the layer address and whatever the layer's
+ * own ->loo_object_print() adds), then a closing line.
+ */
+void lu_object_print(const struct lu_env *env, void *cookie,
+		     lu_printer_t printer, const struct lu_object *o)
+{
+	static const char ruler[] = "........................................";
+	struct lu_object_header *hdr = o->lo_header;
+	const struct lu_object *layer;
+
+	lu_object_header_print(env, cookie, printer, hdr);
+	(*printer)(env, cookie, "{ \n");
+
+	list_for_each_entry(layer, &hdr->loh_layers, lo_linkage) {
+		int indent = layer->lo_depth + 4;
+
+		/*
+		 * print `.' \a indent times followed by type name and address
+		 */
+		(*printer)(env, cookie, "%*.*s%s@%p", indent, indent, ruler,
+			   layer->lo_dev->ld_type->ldt_name, layer);
+		if (layer->lo_ops->loo_object_print != NULL)
+			layer->lo_ops->loo_object_print(env, cookie,
+							printer, layer);
+		(*printer)(env, cookie, "\n");
+	}
+
+	(*printer)(env, cookie, "} header@%p\n", hdr);
+}
+EXPORT_SYMBOL(lu_object_print);
+
+/**
+ * Check object consistency.
+ *
+ * \retval 1 every layer that has a ->loo_object_invariant() method passes it
+ * \retval 0 some layer's ->loo_object_invariant() returned zero
+ */
+int lu_object_invariant(const struct lu_object *o)
+{
+	struct lu_object_header *hdr = o->lo_header;
+	const struct lu_object *layer;
+
+	list_for_each_entry(layer, &hdr->loh_layers, lo_linkage) {
+		/* layers without the method are not checked */
+		if (layer->lo_ops->loo_object_invariant == NULL)
+			continue;
+		if (!layer->lo_ops->loo_object_invariant(layer))
+			return 0;
+	}
+	return 1;
+}
+EXPORT_SYMBOL(lu_object_invariant);
+
+/*
+ * Look up fid \a f in bucket \a bd of the object hash of site \a s.
+ *
+ * \a version carries the bucket version seen by the previous call: if the
+ * bucket version still equals it, the lookup is skipped and NULL is
+ * returned; otherwise \a *version is updated to the current bucket version.
+ *
+ * \retval NULL lookup skipped, or no object with this fid in the bucket
+ * \retval top-level slice of the found object, with a reference taken and
+ *	 the object removed from the LRU
+ * \retval ERR_PTR(-EAGAIN) the found object is dying; \a waiter has been
+ *	 added to the bucket wait queue and the task state set to
+ *	 TASK_UNINTERRUPTIBLE
+ *
+ * NOTE(review): the callers visible in this file invoke it with the bucket
+ * locked - presumably a requirement of cfs_hash_bd_peek_locked(); confirm.
+ */
+static struct lu_object *htable_lookup(struct lu_site *s,
+				       cfs_hash_bd_t *bd,
+				       const struct lu_fid *f,
+				       wait_queue_t *waiter,
+				       __u64 *version)
+{
+	struct lu_site_bkt_data *bkt;
+	struct lu_object_header *h;
+	struct hlist_node	*hnode;
+	__u64  ver = cfs_hash_bd_version_get(bd);
+
+	/* bucket version unchanged since the previous call: skip the search */
+	if (*version == ver)
+		return NULL;
+
+	*version = ver;
+	bkt = cfs_hash_bd_extra_get(s->ls_obj_hash, bd);
+	/* cfs_hash_bd_peek_locked is a somehow "internal" function
+	 * of cfs_hash, it doesn't add refcount on object. */
+	hnode = cfs_hash_bd_peek_locked(s->ls_obj_hash, bd, (void *)f);
+	if (hnode == NULL) {
+		lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_MISS);
+		return NULL;
+	}
+
+	h = container_of0(hnode, struct lu_object_header, loh_hash);
+	if (likely(!lu_object_is_dying(h))) {
+		/* take the reference here, since the peek above took none */
+		cfs_hash_get(s->ls_obj_hash, hnode);
+		lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_HIT);
+		list_del_init(&h->loh_lru);
+		return lu_object_top(h);
+	}
+
+	/*
+	 * Lookup found an object that is being destroyed. This object cannot
+	 * be returned (to assure that references to dying objects are
+	 * eventually drained), and moreover, lookup has to wait until the
+	 * object is freed.
+	 */
+
+	init_waitqueue_entry_current(waiter);
+	add_wait_queue(&bkt->lsb_marche_funebre, waiter);
+	set_current_state(TASK_UNINTERRUPTIBLE);
+	lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_DEATH_RACE);
+	return ERR_PTR(-EAGAIN);
+}
+
+/**
+ * Search cache for an object with the fid \a f. If such object is found,
+ * return it. Otherwise, create new object, insert it into cache and return
+ * it. In any case, additional reference is acquired on the returned object.
+ *
+ * The lookup is done by lu_object_find_at() on the top-level device of the
+ * site \a dev belongs to (dev->ld_site->ls_top_dev), not on \a dev itself.
+ */
+struct lu_object *lu_object_find(const struct lu_env *env,
+				 struct lu_device *dev, const struct lu_fid *f,
+				 const struct lu_object_conf *conf)
+{
+	return lu_object_find_at(env, dev->ld_site->ls_top_dev, f, conf);
+}
+EXPORT_SYMBOL(lu_object_find);
+
+/*
+ * Allocate an object for fid \a f and insert it into the site hash without
+ * looking the fid up first. Used by lu_object_find_try() when the
+ * configuration carries LOC_F_NEW.
+ *
+ * Returns the new object, or the ERR_PTR() produced by lu_object_alloc().
+ */
+static struct lu_object *lu_object_new(const struct lu_env *env,
+				       struct lu_device *dev,
+				       const struct lu_fid *f,
+				       const struct lu_object_conf *conf)
+{
+	struct lu_object	*o;
+	cfs_hash_t	      *hs;
+	cfs_hash_bd_t	    bd;
+	struct lu_site_bkt_data *bkt;
+
+	o = lu_object_alloc(env, dev, f, conf);
+	if (unlikely(IS_ERR(o)))
+		return o;
+
+	hs = dev->ld_site->ls_obj_hash;
+	cfs_hash_bd_get_and_lock(hs, (void *)f, &bd, 1);
+	bkt = cfs_hash_bd_extra_get(hs, &bd);
+	cfs_hash_bd_add_locked(hs, &bd, &o->lo_header->loh_hash);
+	/* the new object is counted as busy in its bucket */
+	bkt->lsb_busy++;
+	cfs_hash_bd_unlock(hs, &bd, 1);
+	return o;
+}
+
+/**
+ * Core logic of lu_object_find*() functions.
+ *
+ * \retval object found in, or newly inserted into, the site hash
+ * \retval ERR_PTR(-EAGAIN) a dying object with this fid was found; \a waiter
+ *	 has been queued by htable_lookup()
+ * \retval ERR_PTR(errno) object allocation failed
+ */
+static struct lu_object *lu_object_find_try(const struct lu_env *env,
+					    struct lu_device *dev,
+					    const struct lu_fid *f,
+					    const struct lu_object_conf *conf,
+					    wait_queue_t *waiter)
+{
+	struct lu_object      *o;
+	struct lu_object      *shadow;
+	struct lu_site	*s;
+	cfs_hash_t	    *hs;
+	cfs_hash_bd_t	  bd;
+	__u64		  version = 0;
+
+	/*
+	 * This uses standard index maintenance protocol:
+	 *
+	 *     - search index under lock, and return object if found;
+	 *     - otherwise, unlock index, allocate new object;
+	 *     - lock index and search again;
+	 *     - if nothing is found (usual case), insert newly created
+	 *       object into index;
+	 *     - otherwise (race: other thread inserted object), free
+	 *       object just allocated.
+	 *     - unlock index;
+	 *     - return object.
+	 *
+	 * For "LOC_F_NEW" case, we are sure the object is new established.
+	 * It is unnecessary to perform lookup-alloc-lookup-insert, instead,
+	 * just alloc and insert directly.
+	 *
+	 * If dying object is found during index search, add @waiter to the
+	 * site wait-queue and return ERR_PTR(-EAGAIN).
+	 */
+	if (conf != NULL && conf->loc_flags & LOC_F_NEW)
+		return lu_object_new(env, dev, f, conf);
+
+	s  = dev->ld_site;
+	hs = s->ls_obj_hash;
+	cfs_hash_bd_get_and_lock(hs, (void *)f, &bd, 1);
+	o = htable_lookup(s, &bd, f, waiter, &version);
+	cfs_hash_bd_unlock(hs, &bd, 1);
+	/* either a cached object or ERR_PTR(-EAGAIN) */
+	if (o != NULL)
+		return o;
+
+	/*
+	 * Allocate new object. This may result in rather complicated
+	 * operations, including fld queries, inode loading, etc.
+	 */
+	o = lu_object_alloc(env, dev, f, conf);
+	if (unlikely(IS_ERR(o)))
+		return o;
+
+	LASSERT(lu_fid_eq(lu_object_fid(o), f));
+
+	cfs_hash_bd_lock(hs, &bd, 1);
+
+	/*
+	 * Second search; htable_lookup() skips it and returns NULL when the
+	 * bucket version is still the one recorded by the first search.
+	 */
+	shadow = htable_lookup(s, &bd, f, waiter, &version);
+	if (likely(shadow == NULL)) {
+		struct lu_site_bkt_data *bkt;
+
+		bkt = cfs_hash_bd_extra_get(hs, &bd);
+		cfs_hash_bd_add_locked(hs, &bd, &o->lo_header->loh_hash);
+		bkt->lsb_busy++;
+		cfs_hash_bd_unlock(hs, &bd, 1);
+		return o;
+	}
+
+	/* lost the race: drop our copy and hand back the other result */
+	lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_RACE);
+	cfs_hash_bd_unlock(hs, &bd, 1);
+	lu_object_free(env, o);
+	return shadow;
+}
+
+/**
+ * Much like lu_object_find(), but top level device of object is specifically
+ * \a dev rather than top level device of the site. This interface allows
+ * objects of different "stacking" to be created within the same site.
+ *
+ * Retries lu_object_find_try() for as long as it returns ERR_PTR(-EAGAIN),
+ * sleeping on the bucket wait queue between attempts. Any other result,
+ * object or error pointer, is returned to the caller.
+ */
+struct lu_object *lu_object_find_at(const struct lu_env *env,
+				    struct lu_device *dev,
+				    const struct lu_fid *f,
+				    const struct lu_object_conf *conf)
+{
+	struct lu_site_bkt_data *bkt;
+	struct lu_object	*obj;
+	wait_queue_t	   wait;
+
+	while (1) {
+		obj = lu_object_find_try(env, dev, f, conf, &wait);
+		if (obj != ERR_PTR(-EAGAIN))
+			return obj;
+		/*
+		 * lu_object_find_try() already added waiter into the
+		 * wait queue.
+		 */
+		waitq_wait(&wait, TASK_UNINTERRUPTIBLE);
+		/* take the waiter off the queue again before retrying */
+		bkt = lu_site_bkt_from_fid(dev->ld_site, (void *)f);
+		remove_wait_queue(&bkt->lsb_marche_funebre, &wait);
+	}
+}
+EXPORT_SYMBOL(lu_object_find_at);
+
+/**
+ * Find object with given fid, and return its slice belonging to given device.
+ *
+ * \retval slice of the object belonging to the device type of \a dev; the
+ *	 reference acquired by lu_object_find() is kept
+ * \retval ERR_PTR(errno) returned by lu_object_find()
+ * \retval ERR_PTR(-ENOENT) the object has no slice for the device type of
+ *	 \a dev; the reference has been dropped
+ */
+struct lu_object *lu_object_find_slice(const struct lu_env *env,
+				       struct lu_device *dev,
+				       const struct lu_fid *f,
+				       const struct lu_object_conf *conf)
+{
+	struct lu_object *top;
+	struct lu_object *obj;
+
+	top = lu_object_find(env, dev, f, conf);
+	if (IS_ERR(top))
+		return top;
+
+	obj = lu_object_locate(top->lo_header, dev->ld_type);
+	if (obj == NULL) {
+		/*
+		 * Report a missing slice as an error pointer: NULL is not
+		 * caught by IS_ERR() and would be dereferenced by callers
+		 * that only check for error pointers.
+		 */
+		lu_object_put(env, top);
+		obj = ERR_PTR(-ENOENT);
+	}
+	return obj;
+}
+EXPORT_SYMBOL(lu_object_find_slice);
+
+/**
+ * Global list of all device types.
+ */
+static LIST_HEAD(lu_device_types);
+
+/*
+ * Initialize device type \a ldt: run its optional ->ldto_init() method and,
+ * if that succeeds (or is absent), add the type to the global list of
+ * device types.
+ *
+ * Returns 0 on success, otherwise the non-zero result of ->ldto_init(); in
+ * that case the type is not added to the list.
+ */
+int lu_device_type_init(struct lu_device_type *ldt)
+{
+	int rc;
+
+	INIT_LIST_HEAD(&ldt->ldt_linkage);
+	if (ldt->ldt_ops->ldto_init != NULL) {
+		rc = ldt->ldt_ops->ldto_init(ldt);
+		if (rc != 0)
+			return rc;
+	}
+
+	list_add(&ldt->ldt_linkage, &lu_device_types);
+	return 0;
+}
+EXPORT_SYMBOL(lu_device_type_init);
+
+/*
+ * Finalize device type \a ldt: unlink it from the global list of device
+ * types, then run its optional ->ldto_fini() method.
+ */
+void lu_device_type_fini(struct lu_device_type *ldt)
+{
+	list_del_init(&ldt->ldt_linkage);
+	if (ldt->ldt_ops->ldto_fini)
+		ldt->ldt_ops->ldto_fini(ldt);
+}
+EXPORT_SYMBOL(lu_device_type_fini);
+
+/*
+ * Call the optional ->ldto_stop() method of every registered device type
+ * that currently has no devices (ldt_device_nr == 0).
+ */
+void lu_types_stop(void)
+{
+	struct lu_device_type *ldt;
+
+	list_for_each_entry(ldt, &lu_device_types, ldt_linkage) {
+		/* types that still have devices are left alone */
+		if (ldt->ldt_device_nr != 0)
+			continue;
+		if (ldt->ldt_ops->ldto_stop != NULL)
+			ldt->ldt_ops->ldto_stop(ldt);
+	}
+}
+EXPORT_SYMBOL(lu_types_stop);
+
+/**
+ * Global list of all sites on this node
+ */
+static LIST_HEAD(lu_sites);
+static DEFINE_MUTEX(lu_sites_guard);
+
+/**
+ * Global environment used by site shrinker.
+ */
+static struct lu_env lu_shrink_env;
+
+/*
+ * Arguments handed by lu_site_print() to lu_site_obj_print() through the
+ * hash iterator's opaque data pointer.
+ */
+struct lu_site_print_arg {
+	struct lu_env   *lsp_env;	/* environment passed to the printer */
+	void	    *lsp_cookie;	/* opaque cookie passed to the printer */
+	lu_printer_t     lsp_printer;	/* printer callback */
+};
+
+/*
+ * Hash iterator callback printing one object: the whole object when its
+ * header has layers, the bare header otherwise. \a data is the
+ * struct lu_site_print_arg set up by lu_site_print(). Always returns 0.
+ */
+static int
+lu_site_obj_print(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+		  struct hlist_node *hnode, void *data)
+{
+	struct lu_site_print_arg *arg = data;
+	struct lu_object_header  *hdr;
+
+	hdr = hlist_entry(hnode, struct lu_object_header, loh_hash);
+	if (list_empty(&hdr->loh_layers)) {
+		lu_object_header_print(arg->lsp_env, arg->lsp_cookie,
+				       arg->lsp_printer, hdr);
+		return 0;
+	}
+
+	lu_object_print(arg->lsp_env, arg->lsp_cookie,
+			arg->lsp_printer, lu_object_top(hdr));
+	return 0;
+}
+
+/**
+ * Print all objects in \a s, by running lu_site_obj_print() over every entry
+ * of the site's object hash with \a env, \a cookie and \a printer.
+ */
+void lu_site_print(const struct lu_env *env, struct lu_site *s, void *cookie,
+		   lu_printer_t printer)
+{
+	struct lu_site_print_arg arg;
+
+	/* the iterator argument is not const-qualified, hence the cast */
+	arg.lsp_env     = (struct lu_env *)env;
+	arg.lsp_cookie  = cookie;
+	arg.lsp_printer = printer;
+
+	cfs_hash_for_each(s->ls_obj_hash, lu_site_obj_print, &arg);
+}
+EXPORT_SYMBOL(lu_site_print);
+
+/*
+ * Bounds for the lu_cache_percent module parameter: lu_htable_order()
+ * replaces 0 and anything above LU_CACHE_PERCENT_MAX with
+ * LU_CACHE_PERCENT_DEFAULT.
+ */
+enum {
+	LU_CACHE_PERCENT_MAX     = 50,
+	LU_CACHE_PERCENT_DEFAULT = 20
+};
+
+/*
+ * NOTE(review): the variable is unsigned int while the parameter type given
+ * to CFS_MODULE_PARM() is int - confirm against the macro definition whether
+ * this should be uint.
+ */
+static unsigned int lu_cache_percent = LU_CACHE_PERCENT_DEFAULT;
+CFS_MODULE_PARM(lu_cache_percent, "i", int, 0644,
+		"Percentage of memory to be used as lu_object cache");
+
+/**
+ * Return desired hash table order.
+ *
+ * The order is the smallest \a bits >= 1 such that 2^bits is not below the
+ * number of 1K objects fitting into lu_cache_percent percent of physical
+ * memory. The result is capped at 31.
+ */
+static int lu_htable_order(void)
+{
+	/* largest order returned; keeps the shift below well defined */
+	const int order_max = 31;
+	unsigned long cache_size;
+	int bits;
+
+	/*
+	 * Calculate hash table size, assuming that we want reasonable
+	 * performance when 20% of total memory is occupied by cache of
+	 * lu_objects.
+	 *
+	 * Size of lu_object is (arbitrary) taken as 1K (together with inode).
+	 */
+	cache_size = num_physpages;
+
+#if BITS_PER_LONG == 32
+	/*
+	 * limit hashtable size for lowmem systems to low RAM: 3/4 of 1GB
+	 * worth of pages. The shift must be parenthesized, "*" and "/" bind
+	 * tighter than "<<".
+	 */
+	if (cache_size > 1 << (30 - PAGE_CACHE_SHIFT))
+		cache_size = (1 << (30 - PAGE_CACHE_SHIFT)) * 3 / 4;
+#endif
+
+	/* clear off unreasonable cache setting. */
+	if (lu_cache_percent == 0 || lu_cache_percent > LU_CACHE_PERCENT_MAX) {
+		CWARN("obdclass: invalid lu_cache_percent: %u, it must be in"
+		      " the range of (0, %u]. Will use default value: %u.\n",
+		      lu_cache_percent, LU_CACHE_PERCENT_MAX,
+		      LU_CACHE_PERCENT_DEFAULT);
+
+		lu_cache_percent = LU_CACHE_PERCENT_DEFAULT;
+	}
+	/* pages -> number of 1K objects in the configured share of memory */
+	cache_size = cache_size / 100 * lu_cache_percent *
+		(PAGE_CACHE_SIZE / 1024);
+
+	/* unsigned long shift: "1 << bits" on an int overflows at bit 31 */
+	for (bits = 1; bits < order_max && (1UL << bits) < cache_size; ++bits) {
+		;
+	}
+	return bits;
+}
+
+/*
+ * cfs_hash ->hs_hash callback: compute the hash of the lu_fid pointed to by
+ * \a key for table \a hs and return it masked with \a mask.
+ */
+static unsigned lu_obj_hop_hash(cfs_hash_t *hs,
+				const void *key, unsigned mask)
+{
+	struct lu_fid  *fid = (struct lu_fid *)key;
+	__u32	   hash;
+
+	/* start from the 32-bit flattened fid */
+	hash = fid_flatten32(fid);
+	hash += (hash >> 4) + (hash << 12); /* mixing oid and seq */
+	hash = cfs_hash_long(hash, hs->hs_bkt_bits);
+
+	/* give me another random factor */
+	hash -= cfs_hash_long((unsigned long)hs, fid_oid(fid) % 11 + 3);
+
+	/*
+	 * Move the value computed so far up by (hs_cur_bits - hs_bkt_bits)
+	 * and fill the low bits from (seq + oid) masked to the bucket count.
+	 */
+	hash <<= hs->hs_cur_bits - hs->hs_bkt_bits;
+	hash |= (fid_seq(fid) + fid_oid(fid)) & (CFS_HASH_NBKT(hs) - 1);
+
+	return hash & mask;
+}
+
+/* cfs_hash ->hs_object callback: header that embeds hash node \a hnode. */
+static void *lu_obj_hop_object(struct hlist_node *hnode)
+{
+	struct lu_object_header *hdr;
+
+	hdr = hlist_entry(hnode, struct lu_object_header, loh_hash);
+	return hdr;
+}
+
+/* cfs_hash ->hs_key callback: the fid stored in the header owning \a hnode. */
+static void *lu_obj_hop_key(struct hlist_node *hnode)
+{
+	return &hlist_entry(hnode, struct lu_object_header, loh_hash)->loh_fid;
+}
+
+/*
+ * cfs_hash ->hs_keycmp callback: result of lu_fid_eq() between the fid of the
+ * header owning \a hnode and the fid pointed to by \a key.
+ */
+static int lu_obj_hop_keycmp(const void *key, struct hlist_node *hnode)
+{
+	const struct lu_fid *wanted = (struct lu_fid *)key;
+	struct lu_object_header *hdr =
+		hlist_entry(hnode, struct lu_object_header, loh_hash);
+
+	return lu_fid_eq(&hdr->loh_fid, wanted);
+}
+
+/*
+ * cfs_hash ->hs_get callback: take a reference on the header owning \a hnode.
+ * When the reference count becomes 1, the busy counter of the bucket the
+ * header's fid maps to is bumped.
+ */
+static void lu_obj_hop_get(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+	struct lu_object_header *hdr;
+	struct lu_site_bkt_data *bkt_data;
+	cfs_hash_bd_t bucket;
+
+	hdr = hlist_entry(hnode, struct lu_object_header, loh_hash);
+	if (atomic_add_return(1, &hdr->loh_ref) != 1)
+		return;
+
+	cfs_hash_bd_get(hs, &hdr->loh_fid, &bucket);
+	bkt_data = cfs_hash_bd_extra_get(hs, &bucket);
+	bkt_data->lsb_busy++;
+}
+
+/* cfs_hash ->hs_put_locked callback: unconditionally LBUG()s. */
+static void lu_obj_hop_put_locked(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+	/* this callback is never expected to be invoked */
+	LBUG();
+}
+
+/* Callback table passed to cfs_hash_create() by lu_site_init(). */
+cfs_hash_ops_t lu_site_hash_ops = {
+	.hs_hash       = lu_obj_hop_hash,
+	.hs_key        = lu_obj_hop_key,
+	.hs_keycmp     = lu_obj_hop_keycmp,
+	.hs_object     = lu_obj_hop_object,
+	.hs_get        = lu_obj_hop_get,
+	.hs_put_locked = lu_obj_hop_put_locked,
+};
+
+/*
+ * Link device \a d onto the device list of site \a s under s->ls_ld_lock.
+ * Nothing is done when d->ld_linkage is already on a list.
+ */
+void lu_dev_add_linkage(struct lu_site *s, struct lu_device *d)
+{
+	struct list_head *link = &d->ld_linkage;
+
+	spin_lock(&s->ls_ld_lock);
+	if (list_empty(link)) {
+		list_add(link, &s->ls_ld_linkage);
+	}
+	spin_unlock(&s->ls_ld_lock);
+}
+EXPORT_SYMBOL(lu_dev_add_linkage);
+
+/*
+ * Unlink device \a d from its device list and re-initialize its linkage,
+ * under s->ls_ld_lock.
+ */
+void lu_dev_del_linkage(struct lu_site *s, struct lu_device *d)
+{
+	struct list_head *link = &d->ld_linkage;
+
+	spin_lock(&s->ls_ld_lock);
+	list_del_init(link);
+	spin_unlock(&s->ls_ld_lock);
+}
+EXPORT_SYMBOL(lu_dev_del_linkage);
+
+/* Lower and upper bound for the order of the per-site object hash table. */
+#define LU_SITE_BITS_MIN    12
+#define LU_SITE_BITS_MAX    24
+/**
+ * total 256 buckets, we don't want too many buckets because:
+ * - consume too much memory
+ * - avoid unbalanced LRU list
+ */
+#define LU_SITE_BKT_BITS    8
+
+/**
+ * Initialize site \a s, with \a top as the top level device.
+ *
+ * Zeroes \a s, creates the object hash table (retrying with smaller orders
+ * down to LU_SITE_BITS_MIN when creation fails), initializes the per-bucket
+ * data, allocates and names the statistics counters, and finally links
+ * \a top to the site, taking a device reference on it.
+ *
+ * \retval 0        on success
+ * \retval -ENOMEM  when the hash table or the statistics cannot be allocated
+ */
+int lu_site_init(struct lu_site *s, struct lu_device *top)
+{
+	struct lu_site_bkt_data *bkt;
+	cfs_hash_bd_t bd;
+	char name[16];
+	int bits;
+	int i;
+	ENTRY;
+
+	memset(s, 0, sizeof *s);
+	bits = lu_htable_order();
+	/* bound the write by the real buffer size, not a magic number */
+	snprintf(name, sizeof(name), "lu_site_%s", top->ld_type->ldt_name);
+	for (bits = min(max(LU_SITE_BITS_MIN, bits), LU_SITE_BITS_MAX);
+	     bits >= LU_SITE_BITS_MIN; bits--) {
+		s->ls_obj_hash = cfs_hash_create(name, bits, bits,
+						 bits - LU_SITE_BKT_BITS,
+						 sizeof(*bkt), 0, 0,
+						 &lu_site_hash_ops,
+						 CFS_HASH_SPIN_BKTLOCK |
+						 CFS_HASH_NO_ITEMREF |
+						 CFS_HASH_DEPTH |
+						 CFS_HASH_ASSERT_EMPTY);
+		if (s->ls_obj_hash != NULL)
+			break;
+	}
+
+	/* every exit below uses RETURN() to stay paired with ENTRY above */
+	if (s->ls_obj_hash == NULL) {
+		CERROR("failed to create lu_site hash with bits: %d\n", bits);
+		RETURN(-ENOMEM);
+	}
+
+	cfs_hash_for_each_bucket(s->ls_obj_hash, &bd, i) {
+		bkt = cfs_hash_bd_extra_get(s->ls_obj_hash, &bd);
+		INIT_LIST_HEAD(&bkt->lsb_lru);
+		init_waitqueue_head(&bkt->lsb_marche_funebre);
+	}
+
+	s->ls_stats = lprocfs_alloc_stats(LU_SS_LAST_STAT, 0);
+	if (s->ls_stats == NULL) {
+		/* undo the hash table creation before failing */
+		cfs_hash_putref(s->ls_obj_hash);
+		s->ls_obj_hash = NULL;
+		RETURN(-ENOMEM);
+	}
+
+	lprocfs_counter_init(s->ls_stats, LU_SS_CREATED,
+			     0, "created", "created");
+	lprocfs_counter_init(s->ls_stats, LU_SS_CACHE_HIT,
+			     0, "cache_hit", "cache_hit");
+	lprocfs_counter_init(s->ls_stats, LU_SS_CACHE_MISS,
+			     0, "cache_miss", "cache_miss");
+	lprocfs_counter_init(s->ls_stats, LU_SS_CACHE_RACE,
+			     0, "cache_race", "cache_race");
+	lprocfs_counter_init(s->ls_stats, LU_SS_CACHE_DEATH_RACE,
+			     0, "cache_death_race", "cache_death_race");
+	lprocfs_counter_init(s->ls_stats, LU_SS_LRU_PURGED,
+			     0, "lru_purged", "lru_purged");
+
+	INIT_LIST_HEAD(&s->ls_linkage);
+	s->ls_top_dev = top;
+	top->ld_site = s;
+	lu_device_get(top);
+	lu_ref_add(&top->ld_reference, "site-top", s);
+
+	/* the device list and its lock must exist before top is linked */
+	INIT_LIST_HEAD(&s->ls_ld_linkage);
+	spin_lock_init(&s->ls_ld_lock);
+
+	lu_dev_add_linkage(s, top);
+
+	RETURN(0);
+}
+EXPORT_SYMBOL(lu_site_init);
+
+/**
+ * Tear down site \a s: unlink it from the site list, drop the object hash
+ * table, detach and release the top device, and free the statistics.
+ */
+void lu_site_fini(struct lu_site *s)
+{
+	struct lu_device *top;
+
+	mutex_lock(&lu_sites_guard);
+	list_del_init(&s->ls_linkage);
+	mutex_unlock(&lu_sites_guard);
+
+	if (s->ls_obj_hash != NULL) {
+		cfs_hash_putref(s->ls_obj_hash);
+		s->ls_obj_hash = NULL;
+	}
+
+	top = s->ls_top_dev;
+	if (top != NULL) {
+		top->ld_site = NULL;
+		lu_ref_del(&top->ld_reference, "site-top", s);
+		lu_device_put(top);
+		s->ls_top_dev = NULL;
+	}
+
+	if (s->ls_stats != NULL) {
+		lprocfs_free_stats(&s->ls_stats);
+	}
+}
+EXPORT_SYMBOL(lu_site_fini);
+
+/**
+ * Called when initialization of stack for this site is completed.
+ *
+ * Refills the shrinker environment context and, if that succeeds, adds \a s
+ * to the global site list. Both steps run under lu_sites_guard.
+ *
+ * \retval the result of lu_context_refill()
+ */
+int lu_site_init_finish(struct lu_site *s)
+{
+	int rc;
+
+	mutex_lock(&lu_sites_guard);
+	rc = lu_context_refill(&lu_shrink_env.le_ctx);
+	if (rc == 0) {
+		list_add(&s->ls_linkage, &lu_sites);
+	}
+	mutex_unlock(&lu_sites_guard);
+
+	return rc;
+}
+EXPORT_SYMBOL(lu_site_init_finish);
+
+/**
+ * Take one more reference on device \a d (atomic increment of d->ld_ref).
+ */
+void lu_device_get(struct lu_device *d)
+{
+	atomic_t *refs = &d->ld_ref;
+
+	atomic_inc(refs);
+}
+EXPORT_SYMBOL(lu_device_get);
+
+/**
+ * Drop one reference on device \a d. The reference count is asserted to be
+ * positive before it is decremented.
+ */
+void lu_device_put(struct lu_device *d)
+{
+	atomic_t *refs = &d->ld_ref;
+
+	LASSERT(atomic_read(refs) > 0);
+	atomic_dec(refs);
+}
+EXPORT_SYMBOL(lu_device_put);
+
+/**
+ * Set up device \a d as an instance of type \a t.
+ *
+ * The per-type device counter is bumped first; for the first device of the
+ * type the optional ldto_start() method is called. \a d is then zeroed and
+ * its reference count, type, lu_ref and linkage are initialized.
+ *
+ * \retval 0 always
+ */
+int lu_device_init(struct lu_device *d, struct lu_device_type *t)
+{
+	int first_of_type = (t->ldt_device_nr == 0);
+
+	t->ldt_device_nr++;
+	if (first_of_type && t->ldt_ops->ldto_start != NULL) {
+		t->ldt_ops->ldto_start(t);
+	}
+
+	memset(d, 0, sizeof(*d));
+	atomic_set(&d->ld_ref, 0);
+	d->ld_type = t;
+	lu_ref_init(&d->ld_reference);
+	INIT_LIST_HEAD(&d->ld_linkage);
+
+	return 0;
+}
+EXPORT_SYMBOL(lu_device_init);
+
+/**
+ * Finalize device \a d.
+ *
+ * Breaks the link between \a d and its obd device (if any), finalizes the
+ * device lu_ref, asserts that no references remain, and decrements the
+ * per-type device counter, calling the optional ldto_stop() method when the
+ * last device of the type goes away.
+ */
+void lu_device_fini(struct lu_device *d)
+{
+	struct lu_device_type *t;
+
+	t = d->ld_type;
+	if (d->ld_obd != NULL) {
+		d->ld_obd->obd_lu_dev = NULL;
+		d->ld_obd = NULL;
+	}
+
+	lu_ref_fini(&d->ld_reference);
+	/*
+	 * atomic_read() yields a signed int: print it with %d so that an
+	 * underflowed (negative) count is reported as such.
+	 */
+	LASSERTF(atomic_read(&d->ld_ref) == 0,
+		 "Refcount is %d\n", atomic_read(&d->ld_ref));
+	LASSERT(t->ldt_device_nr > 0);
+	if (--t->ldt_device_nr == 0 && t->ldt_ops->ldto_stop != NULL)
+		t->ldt_ops->ldto_stop(t);
+}
+EXPORT_SYMBOL(lu_device_fini);
+
+/**
+ * Set up object \a o as a slice of compound object \a h created by device
+ * \a d. A reference on \a d is taken and recorded in o->lo_dev_ref.
+ *
+ * \retval 0 always
+ */
+int lu_object_init(struct lu_object *o,
+		   struct lu_object_header *h, struct lu_device *d)
+{
+	memset(o, 0, sizeof(*o));
+
+	o->lo_header = h;
+	o->lo_dev = d;
+
+	/* pin the device for as long as the object refers to it */
+	lu_device_get(d);
+	o->lo_dev_ref = lu_ref_add(&d->ld_reference, "lu_object", o);
+
+	INIT_LIST_HEAD(&o->lo_linkage);
+
+	return 0;
+}
+EXPORT_SYMBOL(lu_object_init);
+
+/**
+ * Finalize object \a o: assert it is not linked into a layer list and, when
+ * it still points to a device, drop the reference taken by lu_object_init().
+ */
+void lu_object_fini(struct lu_object *o)
+{
+	struct lu_device *owner = o->lo_dev;
+
+	LASSERT(list_empty(&o->lo_linkage));
+
+	if (owner == NULL)
+		return;
+
+	lu_ref_del_at(&owner->ld_reference,
+		      o->lo_dev_ref , "lu_object", o);
+	lu_device_put(owner);
+	o->lo_dev = NULL;
+}
+EXPORT_SYMBOL(lu_object_fini);
+
+/**
+ * Make object \a o the first layer of compound object \a h.
+ *
+ * This is typically called by the ->ldo_object_alloc() method of top-level
+ * device.
+ */
+void lu_object_add_top(struct lu_object_header *h, struct lu_object *o)
+{
+	struct list_head *layers = &h->loh_layers;
+
+	list_move(&o->lo_linkage, layers);
+}
+EXPORT_SYMBOL(lu_object_add_top);
+
+/**
+ * Insert object \a o into a compound object's layer list, right after
+ * \a before.
+ *
+ * This is typically called by the ->ldo_object_alloc() method of \a
+ * before->lo_dev.
+ */
+void lu_object_add(struct lu_object *before, struct lu_object *o)
+{
+	struct list_head *anchor = &before->lo_linkage;
+
+	list_move(&o->lo_linkage, anchor);
+}
+EXPORT_SYMBOL(lu_object_add);
+
+/**
+ * Initialize compound object header \a h: zero it, set its reference count
+ * to 1 and initialize its hash node, LRU linkage, layer list and lu_ref.
+ *
+ * \retval 0 always
+ */
+int lu_object_header_init(struct lu_object_header *h)
+{
+	memset(h, 0, sizeof(*h));
+
+	atomic_set(&h->loh_ref, 1);
+
+	INIT_HLIST_NODE(&h->loh_hash);
+	INIT_LIST_HEAD(&h->loh_lru);
+	INIT_LIST_HEAD(&h->loh_layers);
+
+	lu_ref_init(&h->loh_reference);
+
+	return 0;
+}
+EXPORT_SYMBOL(lu_object_header_init);
+
+/**
+ * Finalize compound object header \a h. The header must have no layers, must
+ * be off the LRU list and must be unhashed; all three are asserted.
+ */
+void lu_object_header_fini(struct lu_object_header *h)
+{
+	/* nothing may still refer to the header through its lists */
+	LASSERT(list_empty(&h->loh_layers));
+	LASSERT(list_empty(&h->loh_lru));
+	LASSERT(hlist_unhashed(&h->loh_hash));
+
+	lu_ref_fini(&h->loh_reference);
+}
+EXPORT_SYMBOL(lu_object_header_fini);
+
+/**
+ * Find the slice of compound object \a h whose device is of type \a dtype.
+ *
+ * \retval the first matching layer, or NULL when no layer matches
+ */
+struct lu_object *lu_object_locate(struct lu_object_header *h,
+				   const struct lu_device_type *dtype)
+{
+	struct lu_object *found = NULL;
+	struct lu_object *layer;
+
+	list_for_each_entry(layer, &h->loh_layers, lo_linkage) {
+		if (layer->lo_dev->ld_type == dtype) {
+			found = layer;
+			break;
+		}
+	}
+	return found;
+}
+EXPORT_SYMBOL(lu_object_locate);
+
+
+
+/**
+ * Finalize and free devices in the device stack.
+ *
+ * The object cache of the site is purged, then every device reachable from
+ * \a top through lu_device_type_operations::ldto_device_fini() is finalized
+ * and released. After a second purge every device reachable through
+ * lu_device_type_operations::ldto_device_free() is freed, dropping the
+ * reference on its obd type where one is set.
+ */
+void lu_stack_fini(const struct lu_env *env, struct lu_device *top)
+{
+	struct lu_site *site = top->ld_site;
+	struct lu_device *dev;
+	struct lu_device *following;
+
+	lu_site_purge(env, site, ~0);
+
+	dev = top;
+	while (dev != NULL) {
+		following = dev->ld_type->ldt_ops->ldto_device_fini(env, dev);
+		lu_ref_del(&dev->ld_reference, "lu-stack", &lu_site_init);
+		lu_device_put(dev);
+		dev = following;
+	}
+
+	/* purge again. */
+	lu_site_purge(env, site, ~0);
+
+	dev = top;
+	while (dev != NULL) {
+		const struct lu_device_type *dev_type = dev->ld_type;
+		struct obd_type *obd_type;
+
+		following = dev_type->ldt_ops->ldto_device_free(env, dev);
+		obd_type = dev_type->ldt_obd_type;
+		if (obd_type != NULL) {
+			obd_type->typ_refcnt--;
+			class_put_type(obd_type);
+		}
+		dev = following;
+	}
+}
+EXPORT_SYMBOL(lu_stack_fini);
+
+enum {
+	/**
+	 * Maximal number of tld slots.
+	 */
+	LU_CONTEXT_KEY_NR = 40
+};
+
+/* Table of registered context keys, indexed by lu_context_key::lct_index. */
+static struct lu_context_key *lu_keys[LU_CONTEXT_KEY_NR] = { NULL, };
+
+/* Taken around updates of lu_keys[] and of the remembered-context list. */
+static DEFINE_SPINLOCK(lu_keys_guard);
+
+/**
+ * Global counter incremented whenever key is registered, unregistered,
+ * revived or quiesced. This is used to avoid unnecessary calls to
+ * lu_context_refill(). No locking is provided, as initialization and shutdown
+ * are supposed to be externally serialized.
+ */
+static unsigned key_set_version = 0;
+
+/**
+ * Register new key.
+ *
+ * The key is placed into the first free slot of lu_keys[]; its index and use
+ * count are set and key_set_version is bumped.
+ *
+ * \retval 0        on success
+ * \retval -ENFILE  when every slot is taken
+ */
+int lu_context_key_register(struct lu_context_key *key)
+{
+	int rc;
+	int slot;
+
+	LASSERT(key->lct_init != NULL);
+	LASSERT(key->lct_fini != NULL);
+	LASSERT(key->lct_tags != 0);
+	LASSERT(key->lct_owner != NULL);
+
+	spin_lock(&lu_keys_guard);
+
+	/* look for the first unused slot */
+	for (slot = 0; slot < ARRAY_SIZE(lu_keys); ++slot) {
+		if (lu_keys[slot] == NULL)
+			break;
+	}
+
+	if (slot < ARRAY_SIZE(lu_keys)) {
+		key->lct_index = slot;
+		atomic_set(&key->lct_used, 1);
+		lu_keys[slot] = key;
+		lu_ref_init(&key->lct_reference);
+		++key_set_version;
+		rc = 0;
+	} else {
+		rc = -ENFILE;
+	}
+
+	spin_unlock(&lu_keys_guard);
+	return rc;
+}
+EXPORT_SYMBOL(lu_context_key_register);
+
+/*
+ * Destroy the value stored in slot \a index of context \a ctx, if there is
+ * one: call the key's lct_fini(), drop the key use count and, unless the
+ * context is tagged LCT_NOREF, the reference on the module owning the key.
+ */
+static void key_fini(struct lu_context *ctx, int index)
+{
+	struct lu_context_key *key;
+
+	if (ctx->lc_value == NULL || ctx->lc_value[index] == NULL)
+		return;
+
+	key = lu_keys[index];
+	LASSERT(key != NULL);
+	LASSERT(key->lct_fini != NULL);
+	LASSERT(atomic_read(&key->lct_used) > 1);
+
+	key->lct_fini(ctx, key, ctx->lc_value[index]);
+	lu_ref_del(&key->lct_reference, "ctx", ctx);
+	atomic_dec(&key->lct_used);
+
+	LASSERT(key->lct_owner != NULL);
+	if (!(ctx->lc_tags & LCT_NOREF)) {
+#ifdef CONFIG_MODULE_UNLOAD
+		LINVRNT(module_refcount(key->lct_owner) > 0);
+#endif
+		module_put(key->lct_owner);
+	}
+	ctx->lc_value[index] = NULL;
+}
+
+/**
+ * Deregister key.
+ *
+ * The key is quiesced, its value in the shrinker environment is destroyed
+ * and its slot in lu_keys[] is cleared. Afterwards the key is asserted to
+ * have no remaining instances.
+ */
+void lu_context_key_degister(struct lu_context_key *key)
+{
+	LASSERT(atomic_read(&key->lct_used) >= 1);
+	LINVRNT(0 <= key->lct_index && key->lct_index < ARRAY_SIZE(lu_keys));
+
+	lu_context_key_quiesce(key);
+	++key_set_version;
+
+	spin_lock(&lu_keys_guard);
+	key_fini(&lu_shrink_env.le_ctx, key->lct_index);
+	if (lu_keys[key->lct_index] != NULL) {
+		lu_keys[key->lct_index] = NULL;
+		lu_ref_fini(&key->lct_reference);
+	}
+	spin_unlock(&lu_keys_guard);
+
+	LASSERTF(atomic_read(&key->lct_used) == 1,
+		 "key has instances: %d\n",
+		 atomic_read(&key->lct_used));
+}
+EXPORT_SYMBOL(lu_context_key_degister);
+
+/**
+ * Register a NULL-terminated list of keys. This has to be called after all
+ * keys have been initialized by a call to LU_CONTEXT_KEY_INIT().
+ *
+ * When registering one key fails, the keys registered before it by this call
+ * are deregistered again.
+ *
+ * \retval 0 on success, otherwise the error of the failed registration
+ */
+int lu_context_key_register_many(struct lu_context_key *k, ...)
+{
+	struct lu_context_key *cur = k;
+	va_list ap;
+	int rc;
+
+	va_start(ap, k);
+	for (;;) {
+		rc = lu_context_key_register(cur);
+		if (rc)
+			break;
+		cur = va_arg(ap, struct lu_context_key *);
+		if (cur == NULL)
+			break;
+	}
+	va_end(ap);
+
+	if (rc == 0)
+		return rc;
+
+	/* roll back: walk the list again up to the key that failed */
+	va_start(ap, k);
+	while (k != cur) {
+		lu_context_key_degister(k);
+		k = va_arg(ap, struct lu_context_key *);
+	}
+	va_end(ap);
+
+	return rc;
+}
+EXPORT_SYMBOL(lu_context_key_register_many);
+
+/**
+ * De-register a NULL-terminated list of keys. This is a dual to
+ * lu_context_key_register_many().
+ */
+void lu_context_key_degister_many(struct lu_context_key *k, ...)
+{
+	va_list ap;
+
+	va_start(ap, k);
+	for (;;) {
+		lu_context_key_degister(k);
+		k = va_arg(ap, struct lu_context_key*);
+		if (k == NULL)
+			break;
+	}
+	va_end(ap);
+}
+EXPORT_SYMBOL(lu_context_key_degister_many);
+
+/**
+ * Revive a NULL-terminated list of keys.
+ */
+void lu_context_key_revive_many(struct lu_context_key *k, ...)
+{
+	va_list ap;
+
+	va_start(ap, k);
+	for (;;) {
+		lu_context_key_revive(k);
+		k = va_arg(ap, struct lu_context_key*);
+		if (k == NULL)
+			break;
+	}
+	va_end(ap);
+}
+EXPORT_SYMBOL(lu_context_key_revive_many);
+
+/**
+ * Quiesce a NULL-terminated list of keys.
+ */
+void lu_context_key_quiesce_many(struct lu_context_key *k, ...)
+{
+	va_list ap;
+
+	va_start(ap, k);
+	for (;;) {
+		lu_context_key_quiesce(k);
+		k = va_arg(ap, struct lu_context_key*);
+		if (k == NULL)
+			break;
+	}
+	va_end(ap);
+}
+EXPORT_SYMBOL(lu_context_key_quiesce_many);
+
+/**
+ * Look up the value that context \a ctx holds for key \a key.
+ *
+ * The context has to be entered and \a key has to be the key registered at
+ * its index; both are checked.
+ */
+void *lu_context_key_get(const struct lu_context *ctx,
+			 const struct lu_context_key *key)
+{
+	void *value;
+
+	LINVRNT(ctx->lc_state == LCS_ENTERED);
+	LINVRNT(0 <= key->lct_index && key->lct_index < ARRAY_SIZE(lu_keys));
+	LASSERT(lu_keys[key->lct_index] == key);
+
+	value = ctx->lc_value[key->lct_index];
+	return value;
+}
+EXPORT_SYMBOL(lu_context_key_get);
+
+/**
+ * List of remembered contexts: contexts initialized with the LCT_REMEMBER
+ * tag are linked here through lu_context::lc_remember, so that
+ * lu_context_key_quiesce() can destroy a key's value in each of them.
+ * Additions, removals and walks of the list are done under lu_keys_guard.
+ */
+static LIST_HEAD(lu_context_remembered);
+
+/**
+ * Destroy \a key in all remembered contexts. This is used to destroy key
+ * values in "shared" contexts (like service threads), when a module owning
+ * the key is about to be unloaded.
+ *
+ * Nothing is done when the key is already marked LCT_QUIESCENT.
+ */
+void lu_context_key_quiesce(struct lu_context_key *key)
+{
+	struct lu_context *remembered;
+
+	if (key->lct_tags & LCT_QUIESCENT)
+		return;
+
+	/*
+	 * XXX layering violation.
+	 */
+	key->lct_tags |= LCT_QUIESCENT;
+	/*
+	 * XXX memory barrier has to go here.
+	 */
+	spin_lock(&lu_keys_guard);
+	list_for_each_entry(remembered, &lu_context_remembered, lc_remember) {
+		key_fini(remembered, key->lct_index);
+	}
+	spin_unlock(&lu_keys_guard);
+	++key_set_version;
+}
+EXPORT_SYMBOL(lu_context_key_quiesce);
+
+/**
+ * Clear the LCT_QUIESCENT tag of \a key and bump key_set_version, so that
+ * contexts create values for the key again on their next refill.
+ */
+void lu_context_key_revive(struct lu_context_key *key)
+{
+	key->lct_tags = key->lct_tags & ~LCT_QUIESCENT;
+	++key_set_version;
+}
+EXPORT_SYMBOL(lu_context_key_revive);
+
+/*
+ * Destroy every key value held by \a ctx and free the value array itself.
+ * A context without a value array is left untouched.
+ */
+static void keys_fini(struct lu_context *ctx)
+{
+	int slot;
+
+	if (ctx->lc_value != NULL) {
+		for (slot = 0; slot < ARRAY_SIZE(lu_keys); ++slot) {
+			key_fini(ctx, slot);
+		}
+
+		OBD_FREE(ctx->lc_value,
+			 ARRAY_SIZE(lu_keys) * sizeof ctx->lc_value[0]);
+		ctx->lc_value = NULL;
+	}
+}
+
+/*
+ * Create values in \a ctx for every registered key that has no value yet,
+ * shares at least one tag with the context and is not quiescent.
+ *
+ * Returns 0 on success, or the error encoded in the pointer returned by a
+ * key's lct_init() method. On error, values created so far are left in
+ * place.
+ */
+static int keys_fill(struct lu_context *ctx)
+{
+	int i;
+
+	LINVRNT(ctx->lc_value != NULL);
+	for (i = 0; i < ARRAY_SIZE(lu_keys); ++i) {
+		struct lu_context_key *key;
+
+		key = lu_keys[i];
+		if (ctx->lc_value[i] == NULL && key != NULL &&
+		    (key->lct_tags & ctx->lc_tags) &&
+		    /*
+		     * Don't create values for a LCT_QUIESCENT key, as this
+		     * will pin module owning a key.
+		     */
+		    !(key->lct_tags & LCT_QUIESCENT)) {
+			void *value;
+
+			LINVRNT(key->lct_init != NULL);
+			LINVRNT(key->lct_index == i);
+
+			value = key->lct_init(ctx, key);
+			if (unlikely(IS_ERR(value)))
+				return PTR_ERR(value);
+
+			LASSERT(key->lct_owner != NULL);
+			/*
+			 * NOTE(review): the result of try_module_get() is
+			 * ignored here, while key_fini() calls module_put()
+			 * unconditionally for such contexts -- confirm the
+			 * get cannot fail at this point.
+			 */
+			if (!(ctx->lc_tags & LCT_NOREF))
+				try_module_get(key->lct_owner);
+			lu_ref_add_atomic(&key->lct_reference, "ctx", ctx);
+			atomic_inc(&key->lct_used);
+			/*
+			 * This is the only place in the code, where an
+			 * element of ctx->lc_value[] array is set to non-NULL
+			 * value.
+			 */
+			ctx->lc_value[i] = value;
+			/* remember that lu_context_exit() has work to do */
+			if (key->lct_exit != NULL)
+				ctx->lc_tags |= LCT_HAS_EXIT;
+		}
+		/* record the key set this context has been filled against */
+		ctx->lc_version = key_set_version;
+	}
+	return 0;
+}
+
+/*
+ * Allocate the value array of \a ctx and fill it.
+ *
+ * Returns -ENOMEM when the allocation fails, otherwise the result of
+ * keys_fill().
+ */
+static int keys_init(struct lu_context *ctx)
+{
+	OBD_ALLOC(ctx->lc_value, ARRAY_SIZE(lu_keys) * sizeof ctx->lc_value[0]);
+	if (unlikely(ctx->lc_value == NULL))
+		return -ENOMEM;
+
+	return keys_fill(ctx);
+}
+
+/**
+ * Initialize context data-structure. Create values for all keys.
+ *
+ * A context tagged LCT_REMEMBER is also added to the list of remembered
+ * contexts. When creating the key values fails, the context is finalized
+ * again before the error is returned.
+ *
+ * \retval 0 on success, otherwise the error from keys_init()
+ */
+int lu_context_init(struct lu_context *ctx, __u32 tags)
+{
+	int rc;
+
+	memset(ctx, 0, sizeof(*ctx));
+	ctx->lc_state = LCS_INITIALIZED;
+	ctx->lc_tags = tags;
+
+	if (!(tags & LCT_REMEMBER)) {
+		INIT_LIST_HEAD(&ctx->lc_remember);
+	} else {
+		spin_lock(&lu_keys_guard);
+		list_add(&ctx->lc_remember, &lu_context_remembered);
+		spin_unlock(&lu_keys_guard);
+	}
+
+	rc = keys_init(ctx);
+	if (rc == 0)
+		return 0;
+
+	lu_context_fini(ctx);
+	return rc;
+}
+EXPORT_SYMBOL(lu_context_init);
+
+/**
+ * Finalize context data-structure. Destroy key values.
+ *
+ * For a remembered context the values are destroyed and the context is
+ * unlinked under lu_keys_guard.
+ */
+void lu_context_fini(struct lu_context *ctx)
+{
+	LINVRNT(ctx->lc_state == LCS_INITIALIZED || ctx->lc_state == LCS_LEFT);
+	ctx->lc_state = LCS_FINALIZED;
+
+	if (ctx->lc_tags & LCT_REMEMBER) {
+		/* could race with key degister */
+		spin_lock(&lu_keys_guard);
+		keys_fini(ctx);
+		list_del_init(&ctx->lc_remember);
+		spin_unlock(&lu_keys_guard);
+	} else {
+		LASSERT(list_empty(&ctx->lc_remember));
+		keys_fini(ctx);
+	}
+}
+EXPORT_SYMBOL(lu_context_fini);
+
+/**
+ * Called before entering context: moves \a ctx from the initialized or left
+ * state into the entered state.
+ */
+void lu_context_enter(struct lu_context *ctx)
+{
+	LINVRNT(ctx->lc_state == LCS_INITIALIZED ||
+		ctx->lc_state == LCS_LEFT);
+
+	ctx->lc_state = LCS_ENTERED;
+}
+EXPORT_SYMBOL(lu_context_enter);
+
+/**
+ * Called after exiting from \a ctx.
+ *
+ * Marks the context as left and, when it is tagged LCT_HAS_EXIT, calls the
+ * lct_exit() method of every key that has a value in the context and
+ * provides that method.
+ */
+void lu_context_exit(struct lu_context *ctx)
+{
+	int slot;
+
+	LINVRNT(ctx->lc_state == LCS_ENTERED);
+	ctx->lc_state = LCS_LEFT;
+
+	if (!(ctx->lc_tags & LCT_HAS_EXIT) || ctx->lc_value == NULL)
+		return;
+
+	for (slot = 0; slot < ARRAY_SIZE(lu_keys); ++slot) {
+		struct lu_context_key *key;
+
+		if (ctx->lc_value[slot] == NULL)
+			continue;
+
+		key = lu_keys[slot];
+		LASSERT(key != NULL);
+		if (key->lct_exit != NULL)
+			key->lct_exit(ctx, key, ctx->lc_value[slot]);
+	}
+}
+EXPORT_SYMBOL(lu_context_exit);
+
+/**
+ * Allocate for context all missing keys that were registered after context
+ * creation. key_set_version is only changed in rare cases when modules
+ * are loaded and removed.
+ *
+ * \retval 0 when the context is already up to date, otherwise the result of
+ *	     keys_fill()
+ */
+int lu_context_refill(struct lu_context *ctx)
+{
+	if (likely(ctx->lc_version == key_set_version))
+		return 0;
+
+	return keys_fill(ctx);
+}
+EXPORT_SYMBOL(lu_context_refill);
+
+/**
+ * lu_context_tags_default/lu_session_tags_default will be updated if new
+ * types of obd are added. Currently, this is only used on the client side,
+ * specifically for the echo device client; for other stacks (like ptlrpc
+ * threads), contexts are predefined when the lu_device types are registered,
+ * during the module probe phase.
+ */
+__u32 lu_context_tags_default = 0;
+__u32 lu_session_tags_default = 0;
+
+/* Add \a tags to the default context tags and bump key_set_version. */
+void lu_context_tags_update(__u32 tags)
+{
+	spin_lock(&lu_keys_guard);
+	lu_context_tags_default = lu_context_tags_default | tags;
+	++key_set_version;
+	spin_unlock(&lu_keys_guard);
+}
+EXPORT_SYMBOL(lu_context_tags_update);
+
+/* Remove \a tags from the default context tags and bump key_set_version. */
+void lu_context_tags_clear(__u32 tags)
+{
+	spin_lock(&lu_keys_guard);
+	lu_context_tags_default = lu_context_tags_default & ~tags;
+	++key_set_version;
+	spin_unlock(&lu_keys_guard);
+}
+EXPORT_SYMBOL(lu_context_tags_clear);
+
+/* Add \a tags to the default session tags and bump key_set_version. */
+void lu_session_tags_update(__u32 tags)
+{
+	spin_lock(&lu_keys_guard);
+	lu_session_tags_default = lu_session_tags_default | tags;
+	++key_set_version;
+	spin_unlock(&lu_keys_guard);
+}
+EXPORT_SYMBOL(lu_session_tags_update);
+
+/* Remove \a tags from the default session tags and bump key_set_version. */
+void lu_session_tags_clear(__u32 tags)
+{
+	spin_lock(&lu_keys_guard);
+	lu_session_tags_default = lu_session_tags_default & ~tags;
+	++key_set_version;
+	spin_unlock(&lu_keys_guard);
+}
+EXPORT_SYMBOL(lu_session_tags_clear);
+
+/**
+ * Initialize environment \a env with context tags \a tags. The session is
+ * cleared; on success the context is entered before returning.
+ *
+ * \retval the result of lu_context_init()
+ */
+int lu_env_init(struct lu_env *env, __u32 tags)
+{
+	int rc;
+
+	env->le_ses = NULL;
+
+	rc = lu_context_init(&env->le_ctx, tags);
+	if (unlikely(rc != 0))
+		return rc;
+
+	lu_context_enter(&env->le_ctx);
+	return rc;
+}
+EXPORT_SYMBOL(lu_env_init);
+
+/**
+ * Dual to lu_env_init(): exit and finalize the context of \a env and clear
+ * its session pointer.
+ */
+void lu_env_fini(struct lu_env *env)
+{
+	struct lu_context *ctx = &env->le_ctx;
+
+	lu_context_exit(ctx);
+	lu_context_fini(ctx);
+	env->le_ses = NULL;
+}
+EXPORT_SYMBOL(lu_env_fini);
+
+/**
+ * Refill the context of \a env and then, when that succeeded and a session
+ * is attached, the session.
+ *
+ * \retval 0 on success, otherwise the first refill error
+ */
+int lu_env_refill(struct lu_env *env)
+{
+	int rc;
+
+	rc = lu_context_refill(&env->le_ctx);
+	if (rc != 0 || env->le_ses == NULL)
+		return rc;
+
+	return lu_context_refill(env->le_ses);
+}
+EXPORT_SYMBOL(lu_env_refill);
+
+/**
+ * Currently, this API will only be used by the echo client.
+ * The echo client and the normal lustre client share the same cl_env cache,
+ * so the echo client needs to refresh the env context after it gets one from
+ * the cache, especially when both clients co-exist on the same node.
+ *
+ * Tags missing from the context (\a ctags) or the session (\a stags) are
+ * added and the respective version is reset, so that the refill below
+ * revisits the keys.
+ *
+ * \retval the result of lu_env_refill()
+ */
+int lu_env_refill_by_tags(struct lu_env *env, __u32 ctags,
+			  __u32 stags)
+{
+	struct lu_context *ctx = &env->le_ctx;
+
+	if ((ctx->lc_tags & ctags) != ctags) {
+		ctx->lc_version = 0;
+		ctx->lc_tags |= ctags;
+	}
+
+	if (env->le_ses && (env->le_ses->lc_tags & stags) != stags) {
+		struct lu_context *ses = env->le_ses;
+
+		ses->lc_version = 0;
+		ses->lc_tags |= stags;
+	}
+
+	return lu_env_refill(env);
+}
+EXPORT_SYMBOL(lu_env_refill_by_tags);
+
+/* Shrinker set up by lu_global_init() and removed by lu_global_fini(). */
+static struct shrinker *lu_site_shrinker = NULL;
+
+/* Per-site totals accumulated over all hash buckets by lu_site_stats_get(). */
+typedef struct lu_site_stats{
+	/* number of non-empty hash chains (only counted on request) */
+	unsigned	lss_populated;
+	/* largest per-bucket value of cfs_hash_bd_depmax_get() */
+	unsigned	lss_max_search;
+	/* sum of the per-bucket entry counts */
+	unsigned	lss_total;
+	/* sum of the per-bucket lsb_busy counters */
+	unsigned	lss_busy;
+} lu_site_stats_t;
+
+/*
+ * Accumulate statistics of hash table \a hs into \a stats, which the caller
+ * is expected to have initialized. Each bucket is examined under its bucket
+ * lock. Non-empty hash chains are only counted when \a populated is
+ * non-zero.
+ */
+static void lu_site_stats_get(cfs_hash_t *hs,
+			      lu_site_stats_t *stats, int populated)
+{
+	cfs_hash_bd_t bd;
+	int idx;
+
+	cfs_hash_for_each_bucket(hs, &bd, idx) {
+		struct lu_site_bkt_data *bkt_data;
+		struct hlist_head *chain;
+
+		bkt_data = cfs_hash_bd_extra_get(hs, &bd);
+
+		cfs_hash_bd_lock(hs, &bd, 1);
+		stats->lss_busy += bkt_data->lsb_busy;
+		stats->lss_total += cfs_hash_bd_count_get(&bd);
+		stats->lss_max_search = max((int)stats->lss_max_search,
+					    cfs_hash_bd_depmax_get(&bd));
+		if (populated) {
+			cfs_hash_bd_for_each_hlist(hs, &bd, chain) {
+				if (!hlist_empty(chain))
+					stats->lss_populated++;
+			}
+		}
+		cfs_hash_bd_unlock(hs, &bd, 1);
+	}
+}
+
+
+/*
+ * There exists a potential lock inversion deadlock scenario when using
+ * Lustre on top of ZFS. This occurs between one of ZFS's
+ * buf_hash_table.ht_lock's, and Lustre's lu_sites_guard lock. Essentially,
+ * thread A will take the lu_sites_guard lock and sleep on the ht_lock,
+ * while thread B will take the ht_lock and sleep on the lu_sites_guard
+ * lock. Obviously neither thread will wake and drop their respective hold
+ * on their lock.
+ *
+ * To prevent this from happening we must ensure the lu_sites_guard lock is
+ * not taken while down this code path. ZFS reliably does not set the
+ * __GFP_FS bit in its code paths, so this can be used to determine if it
+ * is safe to take the lu_sites_guard lock.
+ *
+ * Ideally we should accurately return the remaining number of cached
+ * objects without taking the  lu_sites_guard lock, but this is not
+ * possible in the current implementation.
+ */
+static int lu_cache_shrink(SHRINKER_ARGS(sc, nr_to_scan, gfp_mask))
+{
+	lu_site_stats_t stats;
+	struct lu_site *s;
+	struct lu_site *tmp;
+	/* running total of (total - busy) objects over the visited sites */
+	int cached = 0;
+	/* what is left of the scan request after each lu_site_purge() */
+	int remain = shrink_param(sc, nr_to_scan);
+	/* sites that were purged, parked here until the walk is over */
+	LIST_HEAD(splice);
+
+	if (!(shrink_param(sc, gfp_mask) & __GFP_FS)) {
+		if (remain != 0)
+			return -1;
+		else
+			/* We must not take the lu_sites_guard lock when
+			 * __GFP_FS is *not* set because of the deadlock
+			 * possibility detailed above. Additionally,
+			 * since we cannot determine the number of
+			 * objects in the cache without taking this
+			 * lock, we're in a particularly tough spot. As
+			 * a result, we'll just lie and say our cache is
+			 * empty. This _should_ be ok, as we can't
+			 * reclaim objects when __GFP_FS is *not* set
+			 * anyways.
+			 */
+			return 0;
+	}
+
+	CDEBUG(D_INODE, "Shrink %d objects\n", remain);
+
+	mutex_lock(&lu_sites_guard);
+	list_for_each_entry_safe(s, tmp, &lu_sites, ls_linkage) {
+		/* a zero nr_to_scan is a pure query: count, do not purge */
+		if (shrink_param(sc, nr_to_scan) != 0) {
+			remain = lu_site_purge(&lu_shrink_env, s, remain);
+			/*
+			 * Move just shrunk site to the tail of site list to
+			 * assure shrinking fairness.
+			 */
+			list_move_tail(&s->ls_linkage, &splice);
+		}
+
+		memset(&stats, 0, sizeof(stats));
+		lu_site_stats_get(s->ls_obj_hash, &stats, 0);
+		cached += stats.lss_total - stats.lss_busy;
+		/* stop once a scan request has been fully served */
+		if (shrink_param(sc, nr_to_scan) && remain <= 0)
+			break;
+	}
+	/* put the purged sites back, behind the sites not visited */
+	list_splice(&splice, lu_sites.prev);
+	mutex_unlock(&lu_sites_guard);
+
+	/* scale the reported count by sysctl_vfs_cache_pressure percent */
+	cached = (cached / 100) * sysctl_vfs_cache_pressure;
+	if (shrink_param(sc, nr_to_scan) == 0)
+		CDEBUG(D_INODE, "%d objects cached\n", cached);
+	return cached;
+}
+
+/*
+ * Debugging stuff.
+ */
+
+/**
+ * Environment to be used in debugger, contains all tags.
+ *
+ * NOTE(review): nothing in the visible part of this file initializes or
+ * references this variable -- confirm where it is set up.
+ */
+struct lu_env lu_debugging_env;
+
+/**
+ * Debugging printer function using printk(): formats \a format with the
+ * variable arguments through vprintk(). \a env and \a unused are ignored.
+ *
+ * \retval 0 always
+ */
+int lu_printk_printer(const struct lu_env *env,
+		      void *unused, const char *format, ...)
+{
+	va_list ap;
+
+	va_start(ap, format);
+	vprintk(format, ap);
+	va_end(ap);
+
+	return 0;
+}
+
+/**
+ * Initialization of global lu_* data.
+ *
+ * Sets up, in order: the lu_ref globals, the global context key, the
+ * shrinker environment and the cache shrinker. When a step fails, the steps
+ * already completed are undone before the error is returned, so that a
+ * failed call leaves nothing behind.
+ *
+ * \retval 0        on success
+ * \retval -ENOMEM  when the shrinker cannot be registered
+ * \retval other    error of the step that failed
+ */
+int lu_global_init(void)
+{
+	int result;
+
+	CDEBUG(D_INFO, "Lustre LU module (%p).\n", &lu_keys);
+
+	result = lu_ref_global_init();
+	if (result != 0)
+		return result;
+
+	LU_CONTEXT_KEY_INIT(&lu_global_key);
+	result = lu_context_key_register(&lu_global_key);
+	if (result != 0)
+		goto out_ref;
+
+	/*
+	 * At this level, we don't know what tags are needed, so allocate them
+	 * conservatively. This should not be too bad, because this
+	 * environment is global.
+	 */
+	mutex_lock(&lu_sites_guard);
+	result = lu_env_init(&lu_shrink_env, LCT_SHRINKER);
+	mutex_unlock(&lu_sites_guard);
+	if (result != 0)
+		/* lu_context_init() already finalized the failed context */
+		goto out_key;
+
+	/*
+	 * seeks estimation: 3 seeks to read a record from oi, one to read
+	 * inode, one for ea. Unfortunately setting this high value results in
+	 * lu_object/inode cache consuming all the memory.
+	 */
+	lu_site_shrinker = set_shrinker(DEFAULT_SEEKS, lu_cache_shrink);
+	if (lu_site_shrinker == NULL) {
+		result = -ENOMEM;
+		goto out_env;
+	}
+
+	return 0;
+
+out_env:
+	/*
+	 * Same order as lu_global_fini(): de-register the key first, because
+	 * it has a value in the shrinker environment, then tear the
+	 * environment down.
+	 */
+	lu_context_key_degister(&lu_global_key);
+	mutex_lock(&lu_sites_guard);
+	lu_env_fini(&lu_shrink_env);
+	mutex_unlock(&lu_sites_guard);
+	goto out_ref;
+out_key:
+	lu_context_key_degister(&lu_global_key);
+out_ref:
+	lu_ref_global_fini();
+	return result;
+}
+
+/**
+ * Dual to lu_global_init(): removes the shrinker (when one was set up),
+ * de-registers the global key, finalizes the shrinker environment and the
+ * lu_ref globals.
+ */
+void lu_global_fini(void)
+{
+	struct shrinker *shrinker = lu_site_shrinker;
+
+	if (shrinker != NULL) {
+		remove_shrinker(shrinker);
+		lu_site_shrinker = NULL;
+	}
+
+	lu_context_key_degister(&lu_global_key);
+
+	/*
+	 * The shrinker environment holds a value of lu_global_key, hence it
+	 * is torn down only _after_ the key has been de-registered.
+	 */
+	mutex_lock(&lu_sites_guard);
+	lu_env_fini(&lu_shrink_env);
+	mutex_unlock(&lu_sites_guard);
+
+	lu_ref_global_fini();
+}
+
+/*
+ * Return the collected lc_count of counter \a idx in \a stats, truncated to
+ * 32 bits. Without LPROCFS the result is always 0.
+ */
+static __u32 ls_stats_read(struct lprocfs_stats *stats, int idx)
+{
+	__u32 count = 0;
+#ifdef LPROCFS
+	struct lprocfs_counter collected;
+
+	lprocfs_stats_collect(stats, idx, &collected);
+	count = (__u32)collected.lc_count;
+#endif
+	return count;
+}
+
+/**
+ * Print the statistical counters of site \a s into seq_file \a m as a single
+ * line: busy/total objects, populated/total hash chains, maximal search
+ * depth, followed by the created, cache_hit, cache_miss, cache_race,
+ * cache_death_race and lru_purged counters.
+ *
+ * \retval the result of seq_printf()
+ */
+int lu_site_stats_print(const struct lu_site *s, struct seq_file *m)
+{
+	lu_site_stats_t totals;
+	__u32 created, hit, miss, race, death_race, purged;
+
+	memset(&totals, 0, sizeof(totals));
+	lu_site_stats_get(s->ls_obj_hash, &totals, 1);
+
+	created    = ls_stats_read(s->ls_stats, LU_SS_CREATED);
+	hit        = ls_stats_read(s->ls_stats, LU_SS_CACHE_HIT);
+	miss       = ls_stats_read(s->ls_stats, LU_SS_CACHE_MISS);
+	race       = ls_stats_read(s->ls_stats, LU_SS_CACHE_RACE);
+	death_race = ls_stats_read(s->ls_stats, LU_SS_CACHE_DEATH_RACE);
+	purged     = ls_stats_read(s->ls_stats, LU_SS_LRU_PURGED);
+
+	return seq_printf(m, "%d/%d %d/%d %d %d %d %d %d %d %d\n",
+			totals.lss_busy,
+			totals.lss_total,
+			totals.lss_populated,
+			CFS_HASH_NHLIST(s->ls_obj_hash),
+			totals.lss_max_search,
+			created, hit, miss, race, death_race, purged);
+}
+EXPORT_SYMBOL(lu_site_stats_print);
+
+/**
+ * Create all kmem slab caches described by the array \a caches, which is
+ * terminated by an entry with a NULL ckd_cache.
+ *
+ * \retval 0        on success
+ * \retval -ENOMEM  when a cache cannot be created; lu_kmem_fini() has then
+ *		    been called on \a caches
+ */
+int lu_kmem_init(struct lu_kmem_descr *caches)
+{
+	struct lu_kmem_descr *descr;
+
+	for (descr = caches; descr->ckd_cache != NULL; ++descr) {
+		*descr->ckd_cache = kmem_cache_create(descr->ckd_name,
+						      descr->ckd_size,
+						      0, 0, NULL);
+		if (*descr->ckd_cache != NULL)
+			continue;
+
+		/* free all previously allocated caches */
+		lu_kmem_fini(caches);
+		return -ENOMEM;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(lu_kmem_init);
+
+/**
+ * Destroy all kmem slab caches described by \a caches that have been
+ * created, resetting their pointers to NULL. Dual to lu_kmem_init().
+ */
+void lu_kmem_fini(struct lu_kmem_descr *caches)
+{
+	while (caches->ckd_cache != NULL) {
+		if (*caches->ckd_cache != NULL) {
+			kmem_cache_destroy(*caches->ckd_cache);
+			*caches->ckd_cache = NULL;
+		}
+		++caches;
+	}
+}
+EXPORT_SYMBOL(lu_kmem_fini);
+
+/**
+ * Temporary solution to be able to assign fid in ->do_create()
+ * till we have fully-functional OST fids
+ *
+ * The header of \a o must still carry a zero fid. Under the lock of the
+ * bucket \a fid maps to, \a fid is asserted not to be hashed yet, stored in
+ * the header, the header is added to the site hash and the bucket's busy
+ * counter is incremented.
+ */
+void lu_object_assign_fid(const struct lu_env *env, struct lu_object *o,
+			  const struct lu_fid *fid)
+{
+	struct lu_object_header	*hdr = o->lo_header;
+	struct lu_site		*site = o->lo_dev->ld_site;
+	cfs_hash_t		*hash = site->ls_obj_hash;
+	struct lu_site_bkt_data	*bkt_data;
+	struct lu_object	*existing;
+	wait_queue_t		 waiter;
+	cfs_hash_bd_t		 bd;
+	__u64			 version = 0;
+
+	LASSERT(fid_is_zero(&hdr->loh_fid));
+
+	cfs_hash_bd_get_and_lock(hash, (void *)fid, &bd, 1);
+	existing = htable_lookup(site, &bd, fid, &waiter, &version);
+	/* supposed to be unique */
+	LASSERT(existing == NULL);
+
+	hdr->loh_fid = *fid;
+	bkt_data = cfs_hash_bd_extra_get(hash, &bd);
+	cfs_hash_bd_add_locked(hash, &bd, &hdr->loh_hash);
+	bkt_data->lsb_busy++;
+	cfs_hash_bd_unlock(hash, &bd, 1);
+}
+EXPORT_SYMBOL(lu_object_assign_fid);
+
+/**
+ * Allocate an object with a zero (not yet assigned) fid.
+ * XXX: temporary solution to be able to assign fid in ->do_create()
+ *      till we have fully-functional OST fids
+ *
+ * \retval the result of lu_object_alloc()
+ */
+struct lu_object *lu_object_anon(const struct lu_env *env,
+				 struct lu_device *dev,
+				 const struct lu_object_conf *conf)
+{
+	struct lu_fid zero_fid;
+
+	fid_zero(&zero_fid);
+	return lu_object_alloc(env, dev, &zero_fid, conf);
+}
+EXPORT_SYMBOL(lu_object_anon);
+
+/* Exported empty buffer: NULL data pointer and zero length. */
+struct lu_buf LU_BUF_NULL = {
+	.lb_buf = NULL,
+	.lb_len = 0
+};
+EXPORT_SYMBOL(LU_BUF_NULL);
+
+/**
+ * Release the memory held by \a buf, if any, and reset \a buf to the empty
+ * state (NULL pointer, zero length).
+ */
+void lu_buf_free(struct lu_buf *buf)
+{
+	LASSERT(buf);
+
+	if (!buf->lb_buf)
+		return;
+
+	LASSERT(buf->lb_len > 0);
+	OBD_FREE_LARGE(buf->lb_buf, buf->lb_len);
+	buf->lb_buf = NULL;
+	buf->lb_len = 0;
+}
+EXPORT_SYMBOL(lu_buf_free);
+
+/**
+ * Allocate \a size bytes for the empty buffer \a buf. The length is only
+ * recorded when the allocation succeeded; on failure \a buf stays empty.
+ */
+void lu_buf_alloc(struct lu_buf *buf, int size)
+{
+	LASSERT(buf);
+	LASSERT(buf->lb_buf == NULL);
+	LASSERT(buf->lb_len == 0);
+
+	OBD_ALLOC_LARGE(buf->lb_buf, size);
+	if (likely(buf->lb_buf)) {
+		buf->lb_len = size;
+	}
+}
+EXPORT_SYMBOL(lu_buf_alloc);
+
+/**
+ * Replace the memory of \a buf with a fresh allocation of \a size bytes.
+ * The old memory is freed first, so previous contents are not preserved.
+ */
+void lu_buf_realloc(struct lu_buf *buf, int size)
+{
+	/* drop the old memory, then start over from an empty buffer */
+	lu_buf_free(buf);
+	lu_buf_alloc(buf, size);
+}
+EXPORT_SYMBOL(lu_buf_realloc);
+
+/**
+ * Make sure \a buf can hold \a len bytes: an empty buffer is allocated, a
+ * non-empty buffer shorter than \a len is re-allocated (its contents are
+ * not preserved).
+ *
+ * \retval \a buf; the caller has to check buf->lb_buf for allocation failure
+ */
+struct lu_buf *lu_buf_check_and_alloc(struct lu_buf *buf, int len)
+{
+	if (buf->lb_buf == NULL) {
+		if (buf->lb_len == 0)
+			lu_buf_alloc(buf, len);
+	}
+
+	if (buf->lb_buf != NULL && len > buf->lb_len) {
+		lu_buf_realloc(buf, len);
+	}
+
+	return buf;
+}
+EXPORT_SYMBOL(lu_buf_check_and_alloc);
+
+/**
+ * Grow \a buf to \a len bytes when it is shorter than that.
+ * Old data is copied into the new memory before the old memory is freed;
+ * on error the old buffer remains unchanged.
+ * \retval 0 or -ENOMEM
+ */
+int lu_buf_check_and_grow(struct lu_buf *buf, int len)
+{
+	char *grown;
+
+	if (buf->lb_len >= len)
+		return 0;
+
+	OBD_ALLOC_LARGE(grown, len);
+	if (grown == NULL)
+		return -ENOMEM;
+
+	/* carry the old contents over, then release the old memory */
+	if (buf->lb_buf != NULL) {
+		memcpy(grown, buf->lb_buf, buf->lb_len);
+		OBD_FREE_LARGE(buf->lb_buf, buf->lb_len);
+	}
+
+	buf->lb_buf = grown;
+	buf->lb_len = len;
+
+	return 0;
+}
+EXPORT_SYMBOL(lu_buf_check_and_grow);

diff --git a/drivers/staging/lustre/lustre/obdclass/lu_ref.c b/drivers/staging/lustre/lustre/obdclass/lu_ref.c
new file mode 100644
index 0000000..23a76f1
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/lu_ref.c

@@ -0,0 +1,50 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/lu_ref.c
+ *
+ * Lustre reference.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+# include <linux/libcfs/libcfs.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lu_ref.h>

diff --git a/drivers/staging/lustre/lustre/obdclass/lu_ucred.c b/drivers/staging/lustre/lustre/obdclass/lu_ucred.c
new file mode 100644
index 0000000..229db6c
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/lu_ucred.c

@@ -0,0 +1,107 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/lu_ucred.c
+ *
+ * Lustre Object.
+ * These are the only exported functions, they provide some generic
+ * infrastructure for managing object devices
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/libcfs/libcfs.h>
+#include <obd_support.h>
+#include <lu_object.h>
+#include <md_object.h>
+
+/* context key constructor/destructor: lu_ucred_key_init, lu_ucred_key_fini */
+LU_KEY_INIT_FINI(lu_ucred, struct lu_ucred);
+
+/*
+ * Context key under which a struct lu_ucred is looked up in a session
+ * context (see lu_ucred()).  Tagged LCT_SESSION and wired to the
+ * lu_ucred_key_init/lu_ucred_key_fini constructor and destructor.
+ */
+static struct lu_context_key lu_ucred_key = {
+	.lct_tags = LCT_SESSION,
+	.lct_init = lu_ucred_key_init,
+	.lct_fini = lu_ucred_key_fini
+};
+
+/**
+ * Look up the ucred stored in the session context of \a env.
+ *
+ * \retval NULL when \a env has no session context; otherwise whatever
+ *	   lu_context_key_get() yields for lu_ucred_key on that session.
+ */
+struct lu_ucred *lu_ucred(const struct lu_env *env)
+{
+	return env->le_ses ? lu_context_key_get(env->le_ses, &lu_ucred_key) :
+			     NULL;
+}
+EXPORT_SYMBOL(lu_ucred);
+
+/**
+ * Like lu_ucred(), but only hand the ucred out when it is marked valid,
+ * i.e. its uc_valid is UCRED_OLD or UCRED_NEW.
+ *
+ * \retval NULL when there is no ucred or it is in any other state.
+ */
+struct lu_ucred *lu_ucred_check(const struct lu_env *env)
+{
+	struct lu_ucred *uc = lu_ucred(env);
+
+	if (uc == NULL)
+		return NULL;
+
+	if (uc->uc_valid == UCRED_OLD || uc->uc_valid == UCRED_NEW)
+		return uc;
+
+	return NULL;
+}
+EXPORT_SYMBOL(lu_ucred_check);
+
+/**
+ * Get ucred key, which must exist and must be properly initialized.
+ * Assert otherwise.
+ *
+ * Thin wrapper around lu_ucred_check() that turns its NULL result into an
+ * assertion failure, so the returned pointer is never NULL.
+ */
+struct lu_ucred *lu_ucred_assert(const struct lu_env *env)
+{
+	struct lu_ucred *uc = lu_ucred_check(env);
+	LASSERT(uc != NULL);
+	return uc;
+}
+EXPORT_SYMBOL(lu_ucred_assert);
+
+/**
+ * Initialise lu_ucred_key and register it as a context key.
+ *
+ * \return the result of lu_context_key_register().
+ */
+int lu_ucred_global_init(void)
+{
+	LU_CONTEXT_KEY_INIT(&lu_ucred_key);
+	return lu_context_key_register(&lu_ucred_key);
+}
+
+/** Undo lu_ucred_global_init(): drop the registration of lu_ucred_key. */
+void lu_ucred_global_fini(void)
+{
+	lu_context_key_degister(&lu_ucred_key);
+}

diff --git a/drivers/staging/lustre/lustre/obdclass/lustre_handles.c b/drivers/staging/lustre/lustre/obdclass/lustre_handles.c
new file mode 100644
index 0000000..69d6499
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/lustre_handles.c

@@ -0,0 +1,263 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/lustre_handles.c
+ *
+ * Author: Phil Schwan <phil@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <obd_support.h>
+#include <lustre_handles.h>
+#include <lustre_lib.h>
+
+
+/*
+ * Cookie generator state: handle_base is the last cookie handed out and is
+ * advanced by HANDLE_INCR under handle_base_lock (see class_handle_hash()).
+ */
+static __u64 handle_base;
+#define HANDLE_INCR 7
+static spinlock_t handle_base_lock;
+
+/*
+ * Hash table of live handles: an array of HANDLE_HASH_SIZE buckets that is
+ * allocated in class_handle_init().  Each bucket has its own lock and list.
+ */
+static struct handle_bucket {
+	spinlock_t	lock;
+	struct list_head	head;
+} *handle_hash;
+
+/* A cookie maps to bucket number (cookie & HANDLE_HASH_MASK). */
+#define HANDLE_HASH_SIZE (1 << 16)
+#define HANDLE_HASH_MASK (HANDLE_HASH_SIZE - 1)
+
+/*
+ * Generate a unique 64bit cookie (hash) for a handle and insert it into
+ * global (per-node) hash-table.
+ *
+ * \param h   - handle to publish; must not be linked into a bucket yet
+ * \param ops - operations vector, stored in h->h_ops
+ *
+ * The cookie is the next value of handle_base, stepped by HANDLE_INCR under
+ * handle_base_lock; the bucket is chosen by the low bits of the cookie.
+ */
+void class_handle_hash(struct portals_handle *h,
+		       struct portals_handle_ops *ops)
+{
+	struct handle_bucket *bucket;
+	ENTRY;
+
+	LASSERT(h != NULL);
+	LASSERT(list_empty(&h->h_link));
+
+	/*
+	 * This is fast, but simplistic cookie generation algorithm, it will
+	 * need a re-do at some point in the future for security.
+	 */
+	spin_lock(&handle_base_lock);
+	handle_base += HANDLE_INCR;
+
+	if (unlikely(handle_base == 0)) {
+		/*
+		 * Cookie of zero is "dangerous", because in many places it's
+		 * assumed that 0 means "unassigned" handle, not bound to any
+		 * object.
+		 */
+		CWARN("The universe has been exhausted: cookie wrap-around.\n");
+		handle_base += HANDLE_INCR;
+	}
+	h->h_cookie = handle_base;
+	spin_unlock(&handle_base_lock);
+
+	h->h_ops = ops;
+	spin_lock_init(&h->h_lock);
+
+	/* publish the handle: link it and mark it as present (h_in) */
+	bucket = &handle_hash[h->h_cookie & HANDLE_HASH_MASK];
+	spin_lock(&bucket->lock);
+	list_add_rcu(&h->h_link, &bucket->head);
+	h->h_in = 1;
+	spin_unlock(&bucket->lock);
+
+	CDEBUG(D_INFO, "added object %p with handle "LPX64" to hash\n",
+	       h, h->h_cookie);
+	EXIT;
+}
+EXPORT_SYMBOL(class_handle_hash);
+
+/*
+ * Unlink \a h from its bucket list.  Both callers in this file take the
+ * bucket lock before calling.
+ *
+ * An empty h_link is reported as an error; a handle whose h_in flag is
+ * already clear is silently left alone.  h_in is cleared under h->h_lock
+ * before the entry is unlinked.
+ */
+static void class_handle_unhash_nolock(struct portals_handle *h)
+{
+	if (list_empty(&h->h_link)) {
+		CERROR("removing an already-removed handle ("LPX64")\n",
+		       h->h_cookie);
+		return;
+	}
+
+	CDEBUG(D_INFO, "removing object %p with handle "LPX64" from hash\n",
+	       h, h->h_cookie);
+
+	spin_lock(&h->h_lock);
+	if (h->h_in == 0) {
+		spin_unlock(&h->h_lock);
+		return;
+	}
+	h->h_in = 0;
+	spin_unlock(&h->h_lock);
+	list_del_rcu(&h->h_link);
+}
+
+/* Remove \a h from the hash table, taking the lock of its bucket. */
+void class_handle_unhash(struct portals_handle *h)
+{
+	struct handle_bucket *bucket;
+
+	bucket = &handle_hash[h->h_cookie & HANDLE_HASH_MASK];
+
+	spin_lock(&bucket->lock);
+	class_handle_unhash_nolock(h);
+	spin_unlock(&bucket->lock);
+}
+EXPORT_SYMBOL(class_handle_unhash);
+
+/*
+ * Put \a h back into the hash table under the cookie it already has and
+ * mark it as present again.  No new cookie is generated.
+ */
+void class_handle_hash_back(struct portals_handle *h)
+{
+	struct handle_bucket *bucket;
+	ENTRY;
+
+	bucket = &handle_hash[h->h_cookie & HANDLE_HASH_MASK];
+
+	spin_lock(&bucket->lock);
+	list_add_rcu(&h->h_link, &bucket->head);
+	h->h_in = 1;
+	spin_unlock(&bucket->lock);
+
+	EXIT;
+}
+EXPORT_SYMBOL(class_handle_hash_back);
+
+/*
+ * Find the handle registered under \a cookie.
+ *
+ * On a match whose h_in flag is set, a reference is taken through
+ * h_ops->hop_addref() and the handle pointer is returned.  NULL is returned
+ * when no handle has this cookie or the matching one is being removed.
+ */
+void *class_handle2object(__u64 cookie)
+{
+	struct handle_bucket *bucket;
+	struct portals_handle *h;
+	void *retval = NULL;
+	ENTRY;
+
+	LASSERT(handle_hash != NULL);
+
+	/* Be careful when you want to change this code. See the
+	 * rcu_read_lock() definition on top this file. - jxiong
+	 * NOTE(review): no such definition is present in this file. */
+	bucket = handle_hash + (cookie & HANDLE_HASH_MASK);
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(h, &bucket->head, h_link) {
+		if (h->h_cookie != cookie)
+			continue;
+
+		/* h_lock serialises the h_in test against the unhash path */
+		spin_lock(&h->h_lock);
+		if (likely(h->h_in != 0)) {
+			h->h_ops->hop_addref(h);
+			retval = h;
+		}
+		spin_unlock(&h->h_lock);
+		break;
+	}
+	rcu_read_unlock();
+
+	RETURN(retval);
+}
+EXPORT_SYMBOL(class_handle2object);
+
+/*
+ * RCU callback that frees the object a handle is embedded in.
+ *
+ * The handle is recovered from \a rcu with RCU2HANDLE(); h_cookie is then
+ * interpreted as the address of the object and h_size as its size.  The
+ * object is released with h_ops->hop_free() when provided, else OBD_FREE().
+ *
+ * NOTE(review): presumably whoever queues this callback stores the object
+ * address in h_cookie beforehand - confirm against the caller.
+ */
+void class_handle_free_cb(cfs_rcu_head_t *rcu)
+{
+	struct portals_handle *h = RCU2HANDLE(rcu);
+	void *ptr = (void *)(unsigned long)h->h_cookie;
+
+	if (h->h_ops->hop_free != NULL)
+		h->h_ops->hop_free(ptr, h->h_size);
+	else
+		OBD_FREE(ptr, h->h_size);
+}
+EXPORT_SYMBOL(class_handle_free_cb);
+
+/**
+ * Allocate and initialise the global handle hash table and pick a random
+ * starting value for the cookie generator.
+ *
+ * \retval 0		success
+ * \retval -ENOMEM	the bucket array could not be allocated
+ */
+int class_handle_init(void)
+{
+	struct handle_bucket *bucket;
+	struct timeval tv;
+	int seed[2];
+	int i;
+
+	LASSERT(handle_hash == NULL);
+
+	OBD_ALLOC_LARGE(handle_hash, sizeof(*bucket) * HANDLE_HASH_SIZE);
+	if (handle_hash == NULL)
+		return -ENOMEM;
+
+	spin_lock_init(&handle_base_lock);
+	/*
+	 * Walk the table forwards by index.  A backwards pointer walk that
+	 * stops on "bucket >= handle_hash" has to step the pointer one
+	 * element below the start of the array, which is undefined in C.
+	 */
+	for (i = 0; i < HANDLE_HASH_SIZE; i++) {
+		bucket = &handle_hash[i];
+		INIT_LIST_HEAD(&bucket->head);
+		spin_lock_init(&bucket->lock);
+	}
+
+	/** bug 21430: add randomness to the initial base */
+	cfs_get_random_bytes(seed, sizeof(seed));
+	do_gettimeofday(&tv);
+	cfs_srand(tv.tv_sec ^ seed[0], tv.tv_usec ^ seed[1]);
+
+	cfs_get_random_bytes(&handle_base, sizeof(handle_base));
+	LASSERT(handle_base != 0ULL);
+
+	return 0;
+}
+
+/*
+ * Forcibly unhash every handle that is still registered, logging each one.
+ *
+ * \return the number of handles that were found in the table.
+ *
+ * NOTE(review): entries are unlinked while their list is being walked;
+ * presumably this is safe because list_del_rcu() keeps the forward pointer
+ * of the removed entry intact - confirm.
+ */
+static int cleanup_all_handles(void)
+{
+	int rc;
+	int i;
+
+	for (rc = i = 0; i < HANDLE_HASH_SIZE; i++) {
+		struct portals_handle *h;
+
+		spin_lock(&handle_hash[i].lock);
+		list_for_each_entry_rcu(h, &(handle_hash[i].head), h_link) {
+			CERROR("force clean handle "LPX64" addr %p ops %p\n",
+			       h->h_cookie, h, h->h_ops);
+
+			class_handle_unhash_nolock(h);
+			rc++;
+		}
+		spin_unlock(&handle_hash[i].lock);
+	}
+
+	return rc;
+}
+
+/*
+ * Tear down the handle hash table: unhash whatever is still registered,
+ * free the bucket array and report how many handles had been left behind.
+ */
+void class_handle_cleanup(void)
+{
+	int count;
+	LASSERT(handle_hash != NULL);
+
+	count = cleanup_all_handles();
+
+	OBD_FREE_LARGE(handle_hash, sizeof(*handle_hash) * HANDLE_HASH_SIZE);
+	handle_hash = NULL;
+
+	/* a non-zero count means some handles were never unhashed */
+	if (count != 0)
+		CERROR("handle_count at cleanup: %d\n", count);
+}

diff --git a/drivers/staging/lustre/lustre/obdclass/lustre_peer.c b/drivers/staging/lustre/lustre/obdclass/lustre_peer.c
new file mode 100644
index 0000000..2fa2589
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/lustre_peer.c

@@ -0,0 +1,218 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+
+#include <obd.h>
+#include <obd_support.h>
+#include <obd_class.h>
+#include <lustre_lib.h>
+#include <lustre_ha.h>
+#include <lustre_net.h>
+#include <lprocfs_status.h>
+
+#define NIDS_MAX	32
+
+/* One known uuid together with the NIDs that have been added for it. */
+struct uuid_nid_data {
+	struct list_head       un_list;	/* linkage in g_uuid_list */
+	struct obd_uuid  un_uuid;	/* the uuid itself */
+	int	      un_nid_count;	/* used entries in un_nids[] */
+	lnet_nid_t       un_nids[NIDS_MAX];	/* NIDs, in order of addition */
+};
+
+/* FIXME: This should probably become more elegant than a global linked list */
+static struct list_head	g_uuid_list;
+static spinlock_t	g_uuid_lock;
+
+/* Set up the (empty) global uuid list and the lock protecting it. */
+void class_init_uuidlist(void)
+{
+	INIT_LIST_HEAD(&g_uuid_list);
+	spin_lock_init(&g_uuid_lock);
+}
+
+/* Drop every entry of the global uuid list. */
+void class_exit_uuidlist(void)
+{
+	/* delete all */
+	class_del_uuid(NULL);
+}
+
+/**
+ * Look up the \a index-th NID registered for \a uuid.
+ *
+ * \param uuid	   - uuid string of the peer
+ * \param peer_nid - receives the NID on success, untouched otherwise
+ * \param index	   - position in the NID list of \a uuid
+ *
+ * \retval 0	   \a *peer_nid has been set
+ * \retval -ENOENT \a uuid is unknown or \a index is outside its NID list
+ */
+int lustre_uuid_to_peer(const char *uuid, lnet_nid_t *peer_nid, int index)
+{
+	struct uuid_nid_data *data;
+	struct obd_uuid tmp;
+	int rc = -ENOENT;
+
+	obd_str2uuid(&tmp, uuid);
+	spin_lock(&g_uuid_lock);
+	list_for_each_entry(data, &g_uuid_list, un_list) {
+		if (obd_uuid_equals(&data->un_uuid, &tmp)) {
+			/* reject negative indices too: they would read
+			 * in front of un_nids[] */
+			if (index < 0 || index >= data->un_nid_count)
+				break;
+
+			rc = 0;
+			*peer_nid = data->un_nids[index];
+			break;
+		}
+	}
+	spin_unlock(&g_uuid_lock);
+	return rc;
+}
+EXPORT_SYMBOL(lustre_uuid_to_peer);
+
+/**
+ * Add a nid to a niduuid.  Multiple nids can be added to a single uuid;
+ * LNET will choose the best one.
+ *
+ * A new list entry is created when \a uuid is not known yet; otherwise
+ * \a nid is appended to the existing entry unless it is already there.
+ *
+ * \retval 0		\a nid is now recorded for \a uuid
+ * \retval -EOVERFLOW	\a uuid is longer than UUID_MAX - 1 characters
+ * \retval -ENOMEM	out of memory
+ */
+int class_add_uuid(const char *uuid, __u64 nid)
+{
+	struct uuid_nid_data *data, *entry;
+	int found = 0;
+	int nid_count = 0;
+
+	LASSERT(nid != 0);  /* valid newconfig NID is never zero */
+
+	if (strlen(uuid) > UUID_MAX - 1)
+		return -EOVERFLOW;
+
+	/* allocated up front so that nothing sleeps under the spinlock */
+	OBD_ALLOC_PTR(data);
+	if (data == NULL)
+		return -ENOMEM;
+
+	obd_str2uuid(&data->un_uuid, uuid);
+	data->un_nids[0] = nid;
+	data->un_nid_count = 1;
+
+	spin_lock(&g_uuid_lock);
+	list_for_each_entry(entry, &g_uuid_list, un_list) {
+		if (obd_uuid_equals(&entry->un_uuid, &data->un_uuid)) {
+			int i;
+
+			found = 1;
+			for (i = 0; i < entry->un_nid_count; i++)
+				if (nid == entry->un_nids[i])
+					break;
+
+			if (i == entry->un_nid_count) {
+				LASSERT(entry->un_nid_count < NIDS_MAX);
+				entry->un_nids[entry->un_nid_count++] = nid;
+			}
+			/*
+			 * Sample the count for the debug message while the
+			 * lock is held: once it is dropped the entry may be
+			 * freed by a concurrent class_del_uuid().
+			 */
+			nid_count = entry->un_nid_count;
+			break;
+		}
+	}
+	if (!found)
+		list_add(&data->un_list, &g_uuid_list);
+	spin_unlock(&g_uuid_lock);
+
+	if (found) {
+		CDEBUG(D_INFO, "found uuid %s %s cnt=%d\n", uuid,
+		       libcfs_nid2str(nid), nid_count);
+		/* the preallocated entry was not needed */
+		OBD_FREE(data, sizeof(*data));
+	} else {
+		CDEBUG(D_INFO, "add uuid %s %s\n", uuid, libcfs_nid2str(nid));
+	}
+	return 0;
+}
+EXPORT_SYMBOL(class_add_uuid);
+
+/*
+ * Delete the nids for one uuid if specified, otherwise delete all.
+ *
+ * Matching entries are first moved to a private list under g_uuid_lock and
+ * only freed after the lock has been released.
+ *
+ * Returns 0 on success, -EINVAL when a specific \a uuid was requested but
+ * is not in the list.
+ */
+int class_del_uuid(const char *uuid)
+{
+	LIST_HEAD(deathrow);
+	struct uuid_nid_data *data;
+
+	spin_lock(&g_uuid_lock);
+	if (uuid == NULL) {
+		/* no uuid given: take the whole list */
+		list_splice_init(&g_uuid_list, &deathrow);
+	} else {
+		struct obd_uuid tmp;
+
+		obd_str2uuid(&tmp, uuid);
+		list_for_each_entry(data, &g_uuid_list, un_list) {
+			if (!obd_uuid_equals(&data->un_uuid, &tmp))
+				continue;
+
+			list_move(&data->un_list, &deathrow);
+			break;
+		}
+	}
+	spin_unlock(&g_uuid_lock);
+
+	if (uuid != NULL && list_empty(&deathrow)) {
+		CDEBUG(D_INFO, "Try to delete a non-existent uuid %s\n", uuid);
+		return -EINVAL;
+	}
+
+	/* free the collected entries outside of the lock */
+	while (!list_empty(&deathrow)) {
+		data = list_entry(deathrow.next,
+				  struct uuid_nid_data, un_list);
+		list_del(&data->un_list);
+
+		CDEBUG(D_INFO, "del uuid %s %s/%d\n",
+		       obd_uuid2str(&data->un_uuid),
+		       libcfs_nid2str(data->un_nids[0]),
+		       data->un_nid_count);
+
+		OBD_FREE(data, sizeof(*data));
+	}
+
+	return 0;
+}
+
+/*
+ * Check if @nid exists in nid list of @uuid.
+ *
+ * Returns 1 when @uuid is known and @nid is one of its nids, 0 otherwise.
+ */
+int class_check_uuid(struct obd_uuid *uuid, __u64 nid)
+{
+	struct uuid_nid_data *entry;
+	int found = 0;
+	ENTRY;
+
+	CDEBUG(D_INFO, "check if uuid %s has %s.\n",
+	       obd_uuid2str(uuid), libcfs_nid2str(nid));
+
+	spin_lock(&g_uuid_lock);
+	list_for_each_entry(entry, &g_uuid_list, un_list) {
+		int i;
+
+		if (obd_uuid_equals(&entry->un_uuid, uuid)) {
+			/* only this entry can match: scan it, then stop */
+			for (i = 0; !found && i < entry->un_nid_count; i++)
+				found = (entry->un_nids[i] == nid);
+			break;
+		}
+	}
+	spin_unlock(&g_uuid_lock);
+	RETURN(found);
+}
+EXPORT_SYMBOL(class_check_uuid);

diff --git a/drivers/staging/lustre/lustre/obdclass/md_attrs.c b/drivers/staging/lustre/lustre/obdclass/md_attrs.c
new file mode 100644
index 0000000..b71344a
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/md_attrs.c

@@ -0,0 +1,202 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2012, Intel Corporation.
+ * Use is subject to license terms.
+ *
+ * Author: Johann Lombardi <johann.lombardi@intel.com>
+ */
+
+#include <lustre/lustre_idl.h>
+#include <obd.h>
+#include <md_object.h>
+
+/**
+ * Initialize new \a lma. Only fid is stored.
+ *
+ * \param lma - is the new LMA structure to be initialized
+ * \param fid - is the FID of the object this LMA belongs to
+ * \param incompat - features that MDS must understand to access object
+ *
+ * The compat flags are cleared.
+ */
+void lustre_lma_init(struct lustre_mdt_attrs *lma, const struct lu_fid *fid,
+		     __u32 incompat)
+{
+	lma->lma_compat   = 0;
+	lma->lma_incompat = incompat;
+	lma->lma_self_fid = *fid;
+
+	/* If a field is added in struct lustre_mdt_attrs, zero it explicitly
+	 * and change the test below. */
+	LASSERT(sizeof(*lma) ==
+		(offsetof(struct lustre_mdt_attrs, lma_self_fid) +
+		 sizeof(lma->lma_self_fid)));
+}
+EXPORT_SYMBOL(lustre_lma_init);
+
+/**
+ * Swab, if needed, LMA structure which is stored on-disk in little-endian order.
+ *
+ * \param lma - is a pointer to the LMA structure to be swabbed.
+ *
+ * The structure is converted in place; nothing is done on a little-endian
+ * CPU.
+ */
+void lustre_lma_swab(struct lustre_mdt_attrs *lma)
+{
+	/* Use LUSTRE_MSG_MAGIC to detect local endianness. */
+	if (LUSTRE_MSG_MAGIC != cpu_to_le32(LUSTRE_MSG_MAGIC)) {
+		__swab32s(&lma->lma_compat);
+		__swab32s(&lma->lma_incompat);
+		lustre_swab_lu_fid(&lma->lma_self_fid);
+	}
+}
+EXPORT_SYMBOL(lustre_lma_swab);
+
+/**
+ * Swab, if needed, SOM structure which is stored on-disk in little-endian
+ * order.
+ *
+ * \param attrs - is a pointer to the SOM structure to be swabbed.
+ *
+ * The structure is converted in place; nothing is done on a little-endian
+ * CPU.
+ */
+void lustre_som_swab(struct som_attrs *attrs)
+{
+	/* Use LUSTRE_MSG_MAGIC to detect local endianness. */
+	if (LUSTRE_MSG_MAGIC != cpu_to_le32(LUSTRE_MSG_MAGIC)) {
+		__swab32s(&attrs->som_compat);
+		__swab32s(&attrs->som_incompat);
+		__swab64s(&attrs->som_ioepoch);
+		__swab64s(&attrs->som_size);
+		__swab64s(&attrs->som_blocks);
+		__swab64s(&attrs->som_mountid);
+	}
+}
+EXPORT_SYMBOL(lustre_som_swab);
+
+/**
+ * Swab and extract SOM attributes from on-disk xattr.
+ *
+ * \param buf - is a buffer containing the on-disk SOM extended attribute;
+ *		it is byte-swapped in place when needed.
+ * \param rc  - is the result of fetching the SOM xattr into \a buf:
+ *		a negative errno, or 0 when nothing was read.
+ * \param msd - is the md_som_data structure where to extract SOM attributes.
+ *
+ * \retval 0		\a msd has been filled in
+ * \retval -ENODATA	no SOM attributes, or they carry incompatible
+ *			feature bits outside SOM_INCOMPAT_SUPP
+ * \retval negative	\a rc itself, for any other negative \a rc
+ *
+ * NOTE(review): a positive \a rc is never compared with
+ * sizeof(struct som_attrs); presumably callers guarantee that \a buf holds
+ * a complete structure - confirm.
+ */
+int lustre_buf2som(void *buf, int rc, struct md_som_data *msd)
+{
+	struct som_attrs *attrs = (struct som_attrs *)buf;
+	ENTRY;
+
+	if (rc == 0 ||  rc == -ENODATA)
+		/* no SOM attributes */
+		RETURN(-ENODATA);
+
+	if (rc < 0)
+		/* error hit while fetching xattr */
+		RETURN(rc);
+
+	/* check SOM compatibility */
+	/* done before swabbing, hence the mask is put in on-disk order */
+	if (attrs->som_incompat & ~cpu_to_le32(SOM_INCOMPAT_SUPP))
+		RETURN(-ENODATA);
+
+	/* unpack SOM attributes */
+	lustre_som_swab(attrs);
+
+	/* fill in-memory msd structure */
+	msd->msd_compat   = attrs->som_compat;
+	msd->msd_incompat = attrs->som_incompat;
+	msd->msd_ioepoch  = attrs->som_ioepoch;
+	msd->msd_size     = attrs->som_size;
+	msd->msd_blocks   = attrs->som_blocks;
+	msd->msd_mountid  = attrs->som_mountid;
+
+	RETURN(0);
+}
+EXPORT_SYMBOL(lustre_buf2som);
+
+/**
+ * Swab, if needed, HSM structure which is stored on-disk in little-endian
+ * order.
+ *
+ * \param attrs - is a pointer to the HSM structure to be swabbed.
+ *
+ * The structure is converted in place; nothing is done on a little-endian
+ * CPU.
+ */
+void lustre_hsm_swab(struct hsm_attrs *attrs)
+{
+	/* Use LUSTRE_MSG_MAGIC to detect local endianness. */
+	if (LUSTRE_MSG_MAGIC != cpu_to_le32(LUSTRE_MSG_MAGIC)) {
+		__swab32s(&attrs->hsm_compat);
+		__swab32s(&attrs->hsm_flags);
+		__swab64s(&attrs->hsm_arch_id);
+		__swab64s(&attrs->hsm_arch_ver);
+	}
+}
+EXPORT_SYMBOL(lustre_hsm_swab);
+
+/**
+ * Swab and extract HSM attributes from on-disk xattr.
+ *
+ * \param buf - is a buffer containing the on-disk HSM extended attribute;
+ *		it is byte-swapped in place when needed.
+ * \param rc  - is the result of fetching the HSM xattr into \a buf:
+ *		a negative errno, or 0 when nothing was read.
+ * \param mh  - is the md_hsm structure where to extract HSM attributes.
+ *
+ * \retval 0		\a mh has been filled in
+ * \retval -ENODATA	no HSM attributes
+ * \retval negative	\a rc itself, for any other negative \a rc
+ *
+ * NOTE(review): a positive \a rc is never compared with
+ * sizeof(struct hsm_attrs); presumably callers guarantee that \a buf holds
+ * a complete structure - confirm.
+ */
+int lustre_buf2hsm(void *buf, int rc, struct md_hsm *mh)
+{
+	struct hsm_attrs *attrs = (struct hsm_attrs *)buf;
+	ENTRY;
+
+	if (rc == 0 ||  rc == -ENODATA)
+		/* no HSM attributes */
+		RETURN(-ENODATA);
+
+	if (rc < 0)
+		/* error hit while fetching xattr */
+		RETURN(rc);
+
+	/* unpack HSM attributes */
+	lustre_hsm_swab(attrs);
+
+	/* fill md_hsm structure */
+	mh->mh_compat   = attrs->hsm_compat;
+	mh->mh_flags    = attrs->hsm_flags;
+	mh->mh_arch_id  = attrs->hsm_arch_id;
+	mh->mh_arch_ver = attrs->hsm_arch_ver;
+
+	RETURN(0);
+}
+EXPORT_SYMBOL(lustre_buf2hsm);
+
+/**
+ * Pack HSM attributes.
+ *
+ * \param buf - is the output buffer where to pack the on-disk HSM xattr.
+ * \param mh  - is the md_hsm structure to pack.
+ *
+ * The fields of \a mh are copied into \a buf, which is then converted to
+ * the on-disk byte order with lustre_hsm_swab().
+ */
+void lustre_hsm2buf(void *buf, struct md_hsm *mh)
+{
+	struct hsm_attrs *attrs = (struct hsm_attrs *)buf;
+	ENTRY;
+
+	/* copy HSM attributes */
+	attrs->hsm_compat   = mh->mh_compat;
+	attrs->hsm_flags    = mh->mh_flags;
+	attrs->hsm_arch_id  = mh->mh_arch_id;
+	attrs->hsm_arch_ver = mh->mh_arch_ver;
+
+	/* pack xattr */
+	lustre_hsm_swab(attrs);
+
+	/* pair the ENTRY above so that call tracing stays balanced */
+	EXIT;
+}
+EXPORT_SYMBOL(lustre_hsm2buf);

diff --git a/drivers/staging/lustre/lustre/obdclass/mea.c b/drivers/staging/lustre/lustre/obdclass/mea.c
new file mode 100644
index 0000000..c4f0dbc
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/mea.c

@@ -0,0 +1,112 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+#include <obd_class.h>
+#include <linux/kmod.h>   /* for request_module() */
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <lprocfs_status.h>
+#include <lustre/lustre_idl.h>
+
+/*
+ * Hash on the last character of \a name only: its value modulo \a count.
+ * Warns when that character is NUL, which suggests a wrong \a namelen.
+ */
+static int mea_last_char_hash(int count, char *name, int namelen)
+{
+	unsigned int last = name[namelen - 1];
+
+	if (last == 0)
+		CWARN("looks like wrong len is passed\n");
+
+	return last % count;
+}
+
+/*
+ * Hash on every character of \a name: the sum of the first \a namelen
+ * characters, accumulated as unsigned int, modulo \a count.
+ */
+static int mea_all_chars_hash(int count, char *name, int namelen)
+{
+	unsigned int sum = 0;
+	int i;
+
+	for (i = namelen - 1; i >= 0; i--)
+		sum += name[i];
+
+	return sum % count;
+}
+
+/**
+ * Map a file name to a stripe index in the range [0, \a count).
+ *
+ * \param hashtype - MEA_MAGIC_* value selecting the hash function
+ * \param count    - number of stripes
+ * \param name     - file name, not necessarily NUL-terminated
+ * \param namelen  - length of \a name, must be positive
+ *
+ * A name recognised by filename_is_volatile() that carries an in-range
+ * index maps to that index.  Otherwise the name is hashed; unsupported and
+ * unknown hash types are reported and map to index 0.
+ */
+int raw_name2idx(int hashtype, int count, const char *name, int namelen)
+{
+	unsigned int	c = 0;
+	int		idx;
+
+	LASSERT(namelen > 0);
+
+	/* a usable index embedded in a volatile name wins over hashing */
+	if (filename_is_volatile(name, namelen, &idx) &&
+	    (idx >= 0) && (idx < count))
+		return idx;
+
+	/*
+	 * With at most one stripe index 0 is the only answer.  This test
+	 * must also cover volatile names whose index was rejected above:
+	 * hashing with count == 0 would divide by zero.
+	 */
+	if (count <= 1)
+		return 0;
+
+	switch (hashtype) {
+	case MEA_MAGIC_LAST_CHAR:
+		c = mea_last_char_hash(count, (char *)name, namelen);
+		break;
+	case MEA_MAGIC_ALL_CHARS:
+		c = mea_all_chars_hash(count, (char *)name, namelen);
+		break;
+	case MEA_MAGIC_HASH_SEGMENT:
+		CERROR("Unsupported hash type MEA_MAGIC_HASH_SEGMENT\n");
+		break;
+	default:
+		CERROR("Unknown hash type 0x%x\n", hashtype);
+	}
+
+	LASSERT(c < count);
+	return c;
+}
+EXPORT_SYMBOL(raw_name2idx);
+
+/**
+ * Map \a name to a stripe index of \a mea.
+ *
+ * Wrapper around raw_name2idx() that takes the hash type from
+ * mea->mea_magic and the stripe count from mea->mea_count.  \a mea must be
+ * non-NULL with a non-zero stripe count (asserted).
+ */
+int mea_name2idx(struct lmv_stripe_md *mea, const char *name, int namelen)
+{
+	unsigned int c;
+
+	LASSERT(mea && mea->mea_count);
+
+	c = raw_name2idx(mea->mea_magic, mea->mea_count, name, namelen);
+
+	LASSERT(c < mea->mea_count);
+	return c;
+}
+EXPORT_SYMBOL(mea_name2idx);

diff --git a/drivers/staging/lustre/lustre/obdclass/obd_config.c b/drivers/staging/lustre/lustre/obdclass/obd_config.c
new file mode 100644
index 0000000..bbf06d0
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/obd_config.c

@@ -0,0 +1,1904 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/obd_config.c
+ *
+ * Config API
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+#include <obd_class.h>
+#include <linux/string.h>
+#include <lustre_log.h>
+#include <lprocfs_status.h>
+#include <lustre_param.h>
+
+#include "llog_internal.h"
+
+static cfs_hash_ops_t uuid_hash_ops;
+static cfs_hash_ops_t nid_hash_ops;
+static cfs_hash_ops_t nid_stat_hash_ops;
+
+/*********** string parsing utils *********/
+
+/* Look for the first occurrence of \a key anywhere in \a buf.
+ * Returns 0 if it is found (and, when \a valp is non-NULL, points *valp at
+ * the first character after the key); returns 1 if \a buf is NULL or the
+ * key does not occur. */
+int class_find_param(char *buf, char *key, char **valp)
+{
+	char *hit = buf ? strstr(buf, key) : NULL;
+
+	if (hit == NULL)
+		return 1;
+
+	if (valp != NULL)
+		*valp = hit + strlen(key);
+	return 0;
+}
+EXPORT_SYMBOL(class_find_param);
+
+/**
+ * Look \a param up in \a ptr, a table mapping old proc parameter names to
+ * new ones, and return the matching entry if \a param is an old name.
+ *
+ * Only the name part of \a param is compared: everything before the first
+ * '=', or the whole string when there is no '='.  The table is scanned until
+ * an entry whose old_param is NULL.
+ *
+ * \param param			proc parameter
+ * \param ptr			table of old-to-new parameter mappings
+ *
+ * \retval valid-pointer	the cfg_interop_param entry whose old_param
+ *				equals the name part of \a param
+ * \retval NULL			\a param or \a ptr is NULL,
+ *				or no entry matches
+ */
+struct cfg_interop_param *class_find_old_param(const char *param,
+					       struct cfg_interop_param *ptr)
+{
+	const char *eq;
+	int name_len;
+
+	if (param == NULL || ptr == NULL)
+		RETURN(NULL);
+
+	eq = strchr(param, '=');
+	if (eq != NULL)
+		name_len = eq - param;
+	else
+		name_len = strlen(param);
+
+	for (; ptr->old_param != NULL; ptr++) {
+		/* lengths must agree so that a mere prefix does not match */
+		if (name_len == strlen(ptr->old_param) &&
+		    strncmp(param, ptr->old_param, name_len) == 0)
+			RETURN(ptr);
+	}
+
+	RETURN(NULL);
+}
+EXPORT_SYMBOL(class_find_old_param);
+
+/**
+ * Finds a parameter in \a params and copies it to \a copy.
+ *
+ * Leading spaces are skipped. Next space or end of string is the
+ * parameter terminator with the exception that spaces inside single or double
+ * quotes get included into a parameter. The parameter is copied into \a copy
+ * which has to be allocated big enough by a caller, quotes are stripped in
+ * the copy and the copy is terminated by 0.
+ *
+ * On return \a params is set to next parameter or to NULL if last
+ * parameter is returned.  On -EINVAL \a params is left unchanged and
+ * \a copy is not NUL-terminated.
+ *
+ * \retval 0 if parameter is returned in \a copy
+ * \retval 1 if only spaces (or nothing) remain in \a params
+ * \retval -EINVAL if an unbalanced quote is found
+ */
+int class_get_next_param(char **params, char *copy)
+{
+	char *q1, *q2, *str;
+	int len;
+
+	str = *params;
+	while (*str == ' ')
+		str++;
+
+	if (*str == '\0') {
+		*params = NULL;
+		return 1;
+	}
+
+	/* every path out of this loop returns; there is no fall-through */
+	for (;;) {
+		q1 = strpbrk(str, " '\"");
+		if (q1 == NULL) {
+			/* no more separators or quotes: take the rest */
+			len = strlen(str);
+			memcpy(copy, str, len);
+			copy[len] = '\0';
+			*params = NULL;
+			return 0;
+		}
+		len = q1 - str;
+		if (*q1 == ' ') {
+			/* unquoted space ends the parameter */
+			memcpy(copy, str, len);
+			copy[len] = '\0';
+			*params = str + len;
+			return 0;
+		}
+
+		/* copy the text in front of the opening quote */
+		memcpy(copy, str, len);
+		copy += len;
+
+		/* search for the matching closing quote */
+		str = q1 + 1;
+		q2 = strchr(str, *q1);
+		if (q2 == NULL) {
+			CERROR("Unbalanced quote in parameters: \"%s\"\n",
+			       *params);
+			return -EINVAL;
+		}
+		/* copy the quoted text without the quotes and carry on */
+		len = q2 - str;
+		memcpy(copy, str, len);
+		copy += len;
+		str = q2 + 1;
+	}
+}
+EXPORT_SYMBOL(class_get_next_param);
+
+/* Returns 0 if \a buf starts with \a key, else 1 (also 1 if \a buf is NULL).
+ * On a match, and when \a valp is non-NULL, *valp points to the first char
+ * after the key.
+ *
+ * strncmp() is used rather than memcmp() so the comparison stops at the
+ * terminating NUL of a \a buf that is shorter than \a key instead of reading
+ * past the end of the string. */
+int class_match_param(char *buf, char *key, char **valp)
+{
+	size_t keylen;
+
+	if (!buf)
+		return 1;
+
+	keylen = strlen(key);
+	if (strncmp(buf, key, keylen) != 0)
+		return 1;
+
+	if (valp)
+		*valp = buf + keylen;
+
+	return 0;
+}
+EXPORT_SYMBOL(class_match_param);
+
+/*
+ * Convert \a buf with libcfs_str2nid() and store the result in \a value,
+ * which must point to an lnet_nid_t.  Returns 0 on success or -EINVAL when
+ * the conversion yields LNET_NID_ANY; the console error is suppressed when
+ * \a quiet is non-zero.
+ */
+static int parse_nid(char *buf, void *value, int quiet)
+{
+	lnet_nid_t *out = value;
+
+	*out = libcfs_str2nid(buf);
+	if (*out == LNET_NID_ANY) {
+		if (!quiet)
+			LCONSOLE_ERROR_MSG(0x159, "Can't parse NID '%s'\n", buf);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * Convert \a buf with libcfs_str2net() and store the result in \a value,
+ * which must point to a __u32.  The result is not validated; this always
+ * returns 0.
+ */
+static int parse_net(char *buf, void *value)
+{
+	__u32 *out = value;
+
+	*out = libcfs_str2net(buf);
+	CDEBUG(D_INFO, "Net %s\n", libcfs_net2str(*out));
+
+	return 0;
+}
+
+/* Operation codes telling class_parse_value() which parser to apply. */
+enum {
+	CLASS_PARSE_NID = 1,	/* token is a NID, handled by parse_nid() */
+	CLASS_PARSE_NET,	/* token is a network, handled by parse_net() */
+};
+
+/*
+ * Parse the first token of \a buf as a NID or a network, selected by \a opc.
+ *
+ * Leading ',' and ':' characters are skipped.  The token runs up to the next
+ * ',', ':', ' ' or '/' (or the end of the string); that byte is overwritten
+ * with NUL while the parser runs and is restored afterwards, so \a buf must
+ * be writable.
+ *
+ * Returns:
+ *   0     token parsed; if \a endh is non-NULL, *endh is set to the
+ *         separator (or terminating NUL) that ended the token
+ *   1     \a buf is NULL, or no token is found at the current position
+ *   other the non-zero code returned by the parser (*endh is not touched)
+ */
+static int class_parse_value(char *buf, int opc, void *value, char **endh,
+			     int quiet)
+{
+	char *endp;
+	char  tmp;
+	int   rc = 0;
+
+	if (!buf)
+		return 1;
+	while (*buf == ',' || *buf == ':')
+		buf++;
+	if (*buf == ' ' || *buf == '/' || *buf == '\0')
+		return 1;
+
+	/* nid separators or end of nids */
+	endp = strpbrk(buf, ",: /");
+	if (endp == NULL)
+		endp = buf + strlen(buf);
+
+	/* temporarily terminate the token in place */
+	tmp = *endp;
+	*endp = '\0';
+	switch (opc) {
+	default:
+		/* NOTE(review): no break here; presumably LBUG() does not
+		 * return, otherwise this falls into the NID case - confirm */
+		LBUG();
+	case CLASS_PARSE_NID:
+		rc = parse_nid(buf, value, quiet);
+		break;
+	case CLASS_PARSE_NET:
+		rc = parse_net(buf, value);
+		break;
+	}
+	/* put the separator back before reporting anything */
+	*endp = tmp;
+	if (rc != 0)
+		return rc;
+	if (endh)
+		*endh = endp;
+	return 0;
+}
+
+/* Parse the next NID token of \a buf into \a nid; parse failures are
+ * reported on the console.  See class_parse_value() for return values. */
+int class_parse_nid(char *buf, lnet_nid_t *nid, char **endh)
+{
+	const int quiet = 0;
+
+	return class_parse_value(buf, CLASS_PARSE_NID, nid, endh, quiet);
+}
+EXPORT_SYMBOL(class_parse_nid);
+
+/* Same as class_parse_nid() but without the console message when the
+ * token cannot be parsed as a NID. */
+int class_parse_nid_quiet(char *buf, lnet_nid_t *nid, char **endh)
+{
+	const int quiet = 1;
+
+	return class_parse_value(buf, CLASS_PARSE_NID, nid, endh, quiet);
+}
+EXPORT_SYMBOL(class_parse_nid_quiet);
+
+/* Parse the next network token of \a buf into \a net.
+ * See class_parse_value() for return values. */
+int class_parse_net(char *buf, __u32 *net, char **endh)
+{
+	const int quiet = 0;
+
+	return class_parse_value(buf, CLASS_PARSE_NET, net, endh, quiet);
+}
+EXPORT_SYMBOL(class_parse_net);
+
+/* Check whether \a nid is listed after any occurrence of \a key in \a buf.
+ *  1 - key found and one of the NIDs following it equals \a nid
+ *  0 - key found, but none of the NIDs following it match
+ * -1 - \a buf does not contain \a key
+ */
+int class_match_nid(char *buf, char *key, lnet_nid_t nid)
+{
+	lnet_nid_t cur;
+	int   key_seen = 0;
+
+	while (class_find_param(buf, key, &buf) == 0) {
+		key_seen = 1;
+		/* walk the NID list that follows this occurrence of the key */
+		while (class_parse_nid(buf, &cur, &buf) == 0) {
+			if (cur == nid)
+				return 1;
+		}
+	}
+
+	return key_seen ? 0 : -1;
+}
+EXPORT_SYMBOL(class_match_nid);
+
+/* Check whether \a net is listed after any occurrence of \a key in \a buf.
+ *  1 - key found and one of the networks following it equals \a net
+ *  0 - key found, but none of the networks following it match
+ * -1 - \a buf does not contain \a key
+ */
+int class_match_net(char *buf, char *key, __u32 net)
+{
+	__u32 cur;
+	int   key_seen = 0;
+
+	while (class_find_param(buf, key, &buf) == 0) {
+		key_seen = 1;
+		/* walk the network list that follows this occurrence of the key */
+		while (class_parse_net(buf, &cur, &buf) == 0) {
+			if (cur == net)
+				return 1;
+		}
+	}
+
+	return key_seen ? 0 : -1;
+}
+EXPORT_SYMBOL(class_match_net);
+
+/********************** class fns **********************/
+
+/**
+ * Create a new obd device and set the type, name and uuid.  If successful,
+ * the new device can be accessed by either name or uuid.
+ *
+ * \param lcfg	config record; buffer 0 holds the device name, buffer 1 the
+ *		type name and buffer 2 the UUID string
+ *
+ * \retval 0		device created, initialised and marked attached with
+ *			a reference count of 1
+ * \retval -EINVAL	a buffer is missing, the UUID is too long, or the
+ *			type's attach method returned an error
+ * \retval negative	error code returned by class_newdev()
+ *
+ * On failure after class_newdev() succeeded, the device is handed back with
+ * class_release_dev().
+ */
+int class_attach(struct lustre_cfg *lcfg)
+{
+	struct obd_device *obd = NULL;
+	char *typename, *name, *uuid;
+	int rc, len;
+	ENTRY;
+
+	if (!LUSTRE_CFG_BUFLEN(lcfg, 1)) {
+		CERROR("No type passed!\n");
+		RETURN(-EINVAL);
+	}
+	typename = lustre_cfg_string(lcfg, 1);
+
+	if (!LUSTRE_CFG_BUFLEN(lcfg, 0)) {
+		CERROR("No name passed!\n");
+		RETURN(-EINVAL);
+	}
+	name = lustre_cfg_string(lcfg, 0);
+
+	if (!LUSTRE_CFG_BUFLEN(lcfg, 2)) {
+		CERROR("No UUID passed!\n");
+		RETURN(-EINVAL);
+	}
+	uuid = lustre_cfg_string(lcfg, 2);
+
+	CDEBUG(D_IOCTL, "attach type %s name: %s uuid: %s\n",
+	       MKSTR(typename), MKSTR(name), MKSTR(uuid));
+
+	obd = class_newdev(typename, name);
+	if (IS_ERR(obd)) {
+		/* Already exists or out of obds */
+		rc = PTR_ERR(obd);
+		/* clear obd so the out: path does not release an error pointer */
+		obd = NULL;
+		CERROR("Cannot create device %s of type %s : %d\n",
+		       name, typename, rc);
+		GOTO(out, rc);
+	}
+	LASSERTF(obd != NULL, "Cannot get obd device %s of type %s\n",
+		 name, typename);
+	LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC,
+		 "obd %p obd_magic %08X != %08X\n",
+		 obd, obd->obd_magic, OBD_DEVICE_MAGIC);
+	LASSERTF(strncmp(obd->obd_name, name, strlen(name)) == 0,
+		 "%p obd_name %s != %s\n", obd, obd->obd_name, name);
+
+	rwlock_init(&obd->obd_pool_lock);
+	obd->obd_pool_limit = 0;
+	obd->obd_pool_slv = 0;
+
+	/* export lists and the locks protecting device state */
+	INIT_LIST_HEAD(&obd->obd_exports);
+	INIT_LIST_HEAD(&obd->obd_unlinked_exports);
+	INIT_LIST_HEAD(&obd->obd_delayed_exports);
+	INIT_LIST_HEAD(&obd->obd_exports_timed);
+	INIT_LIST_HEAD(&obd->obd_nid_stats);
+	spin_lock_init(&obd->obd_nid_lock);
+	spin_lock_init(&obd->obd_dev_lock);
+	mutex_init(&obd->obd_dev_mutex);
+	spin_lock_init(&obd->obd_osfs_lock);
+	/* obd->obd_osfs_age must be set to a value in the distant
+	 * past to guarantee a fresh statfs is fetched on mount. */
+	obd->obd_osfs_age = cfs_time_shift_64(-1000);
+
+	/* XXX belongs in setup not attach  */
+	init_rwsem(&obd->obd_observer_link_sem);
+	/* recovery data */
+	cfs_init_timer(&obd->obd_recovery_timer);
+	spin_lock_init(&obd->obd_recovery_task_lock);
+	init_waitqueue_head(&obd->obd_next_transno_waitq);
+	init_waitqueue_head(&obd->obd_evict_inprogress_waitq);
+	INIT_LIST_HEAD(&obd->obd_req_replay_queue);
+	INIT_LIST_HEAD(&obd->obd_lock_replay_queue);
+	INIT_LIST_HEAD(&obd->obd_final_req_queue);
+	INIT_LIST_HEAD(&obd->obd_evict_list);
+
+	llog_group_init(&obd->obd_olg, FID_SEQ_LLOG);
+
+	obd->obd_conn_inprogress = 0;
+
+	len = strlen(uuid);
+	if (len >= sizeof(obd->obd_uuid)) {
+		CERROR("uuid must be < %d bytes long\n",
+		       (int)sizeof(obd->obd_uuid));
+		GOTO(out, rc = -EINVAL);
+	}
+	/* only len bytes are copied, no terminator; presumably obd_uuid was
+	 * zeroed when the device was allocated - TODO confirm */
+	memcpy(obd->obd_uuid.uuid, uuid, len);
+
+	/* do the attach */
+	if (OBP(obd, attach)) {
+		rc = OBP(obd,attach)(obd, sizeof *lcfg, lcfg);
+		/* the method's own error code is replaced by -EINVAL */
+		if (rc)
+			GOTO(out, rc = -EINVAL);
+	}
+
+	/* Detach drops this */
+	spin_lock(&obd->obd_dev_lock);
+	atomic_set(&obd->obd_refcount, 1);
+	spin_unlock(&obd->obd_dev_lock);
+	lu_ref_init(&obd->obd_reference);
+	lu_ref_add(&obd->obd_reference, "attach", obd);
+
+	obd->obd_attached = 1;
+	CDEBUG(D_IOCTL, "OBD: dev %d attached type %s with refcount %d\n",
+	       obd->obd_minor, typename, atomic_read(&obd->obd_refcount));
+	RETURN(0);
+ out:
+	/* obd is NULL here when class_newdev() itself failed */
+	if (obd != NULL) {
+		class_release_dev(obd);
+	}
+	return rc;
+}
+EXPORT_SYMBOL(class_attach);
+
+/** Create hashes, self-export, and call type-specific setup.
+ * Setup is effectively the "start this obd" call.
+ *
+ * \param obd	attached device to start
+ * \param lcfg	config record passed through to obd_setup()
+ *
+ * \retval 0		device set up; a "setup" reference has been taken
+ * \retval -ENODEV	device is not attached
+ * \retval -EEXIST	device is already set up, or a setup is in progress
+ * \retval -ENOMEM	one of the three hashes could not be created
+ * \retval negative	error from class_new_export() or obd_setup()
+ *
+ * On failure the hashes (and, if obd_setup() failed, the self export) are
+ * torn down again and obd_starting is cleared.
+ */
+int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
+{
+	int err = 0;
+	struct obd_export *exp;
+	ENTRY;
+
+	LASSERT(obd != NULL);
+	LASSERTF(obd == class_num2obd(obd->obd_minor),
+		 "obd %p != obd_devs[%d] %p\n",
+		 obd, obd->obd_minor, class_num2obd(obd->obd_minor));
+	LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC,
+		 "obd %p obd_magic %08x != %08x\n",
+		 obd, obd->obd_magic, OBD_DEVICE_MAGIC);
+
+	/* have we attached a type to this device? */
+	if (!obd->obd_attached) {
+		CERROR("Device %d not attached\n", obd->obd_minor);
+		RETURN(-ENODEV);
+	}
+
+	if (obd->obd_set_up) {
+		CERROR("Device %d already setup (type %s)\n",
+		       obd->obd_minor, obd->obd_type->typ_name);
+		RETURN(-EEXIST);
+	}
+
+	/* is someone else setting us up right now? (attach inits spinlock) */
+	spin_lock(&obd->obd_dev_lock);
+	if (obd->obd_starting) {
+		spin_unlock(&obd->obd_dev_lock);
+		CERROR("Device %d setup in progress (type %s)\n",
+		       obd->obd_minor, obd->obd_type->typ_name);
+		RETURN(-EEXIST);
+	}
+	/* just leave this on forever.  I can't use obd_set_up here because
+	   other fns check that status, and we're not actually set up yet. */
+	obd->obd_starting = 1;
+	/* start from NULL so the error path can tell which hashes exist */
+	obd->obd_uuid_hash = NULL;
+	obd->obd_nid_hash = NULL;
+	obd->obd_nid_stats_hash = NULL;
+	spin_unlock(&obd->obd_dev_lock);
+
+	/* create an uuid-export lustre hash */
+	obd->obd_uuid_hash = cfs_hash_create("UUID_HASH",
+					     HASH_UUID_CUR_BITS,
+					     HASH_UUID_MAX_BITS,
+					     HASH_UUID_BKT_BITS, 0,
+					     CFS_HASH_MIN_THETA,
+					     CFS_HASH_MAX_THETA,
+					     &uuid_hash_ops, CFS_HASH_DEFAULT);
+	if (!obd->obd_uuid_hash)
+		GOTO(err_hash, err = -ENOMEM);
+
+	/* create a nid-export lustre hash */
+	obd->obd_nid_hash = cfs_hash_create("NID_HASH",
+					    HASH_NID_CUR_BITS,
+					    HASH_NID_MAX_BITS,
+					    HASH_NID_BKT_BITS, 0,
+					    CFS_HASH_MIN_THETA,
+					    CFS_HASH_MAX_THETA,
+					    &nid_hash_ops, CFS_HASH_DEFAULT);
+	if (!obd->obd_nid_hash)
+		GOTO(err_hash, err = -ENOMEM);
+
+	/* create a nid-stats lustre hash */
+	obd->obd_nid_stats_hash = cfs_hash_create("NID_STATS",
+						  HASH_NID_STATS_CUR_BITS,
+						  HASH_NID_STATS_MAX_BITS,
+						  HASH_NID_STATS_BKT_BITS, 0,
+						  CFS_HASH_MIN_THETA,
+						  CFS_HASH_MAX_THETA,
+						  &nid_stat_hash_ops, CFS_HASH_DEFAULT);
+	if (!obd->obd_nid_stats_hash)
+		GOTO(err_hash, err = -ENOMEM);
+
+	/* the device's export to itself, keyed by its own uuid */
+	exp = class_new_export(obd, &obd->obd_uuid);
+	if (IS_ERR(exp))
+		GOTO(err_hash, err = PTR_ERR(exp));
+
+	obd->obd_self_export = exp;
+	/* take the self export off the timed-exports chain */
+	list_del_init(&exp->exp_obd_chain_timed);
+	class_export_put(exp);
+
+	err = obd_setup(obd, lcfg);
+	if (err)
+		GOTO(err_exp, err);
+
+	obd->obd_set_up = 1;
+
+	spin_lock(&obd->obd_dev_lock);
+	/* cleanup drops this */
+	class_incref(obd, "setup", obd);
+	spin_unlock(&obd->obd_dev_lock);
+
+	CDEBUG(D_IOCTL, "finished setup of obd %s (uuid %s)\n",
+	       obd->obd_name, obd->obd_uuid.uuid);
+
+	RETURN(0);
+err_exp:
+	if (obd->obd_self_export) {
+		class_unlink_export(obd->obd_self_export);
+		obd->obd_self_export = NULL;
+	}
+err_hash:
+	/* release whichever hashes were created before the failure */
+	if (obd->obd_uuid_hash) {
+		cfs_hash_putref(obd->obd_uuid_hash);
+		obd->obd_uuid_hash = NULL;
+	}
+	if (obd->obd_nid_hash) {
+		cfs_hash_putref(obd->obd_nid_hash);
+		obd->obd_nid_hash = NULL;
+	}
+	if (obd->obd_nid_stats_hash) {
+		cfs_hash_putref(obd->obd_nid_stats_hash);
+		obd->obd_nid_stats_hash = NULL;
+	}
+	/* NOTE(review): cleared without obd_dev_lock, unlike where it is set */
+	obd->obd_starting = 0;
+	CERROR("setup %s failed (%d)\n", obd->obd_name, err);
+	return err;
+}
+EXPORT_SYMBOL(class_setup);
+
+/** Detach a device that is no longer set up and drop the reference that
+ * attaching took on it.
+ *
+ * \retval 0		the attached flag was cleared and the "attach"
+ *			reference dropped
+ * \retval -EBUSY	the device is still set up
+ * \retval -ENODEV	the device is not attached
+ */
+int class_detach(struct obd_device *obd, struct lustre_cfg *lcfg)
+{
+	int attached;
+	ENTRY;
+
+	if (obd->obd_set_up) {
+		CERROR("OBD device %d still set up\n", obd->obd_minor);
+		RETURN(-EBUSY);
+	}
+
+	/* test and clear the attached flag under the device lock */
+	spin_lock(&obd->obd_dev_lock);
+	attached = obd->obd_attached;
+	if (attached)
+		obd->obd_attached = 0;
+	spin_unlock(&obd->obd_dev_lock);
+
+	if (!attached) {
+		CERROR("OBD device %d not attached\n", obd->obd_minor);
+		RETURN(-ENODEV);
+	}
+
+	CDEBUG(D_IOCTL, "detach on obd %s (uuid %s)\n",
+	       obd->obd_name, obd->obd_uuid.uuid);
+
+	class_decref(obd, "attach", obd);
+	RETURN(0);
+}
+EXPORT_SYMBOL(class_detach);
+
+/** Start shutting down the obd.  There may be in-progress ops when
+ * this is called.  We tell them to start shutting down with a call
+ * to class_disconnect_exports().
+ *
+ * \param obd	device to stop
+ * \param lcfg	config record; buffer 1, if present, is a string of flag
+ *		characters: 'F' sets obd_force, 'A' marks the device as
+ *		failing over
+ *
+ * \retval 0		cleanup started (a precleanup error is only logged)
+ * \retval -ENODEV	device is not set up, or is already stopping
+ */
+int class_cleanup(struct obd_device *obd, struct lustre_cfg *lcfg)
+{
+	int err = 0;
+	char *flag;
+	ENTRY;
+
+	OBD_RACE(OBD_FAIL_LDLM_RECOV_CLIENTS);
+
+	if (!obd->obd_set_up) {
+		CERROR("Device %d not setup\n", obd->obd_minor);
+		RETURN(-ENODEV);
+	}
+
+	spin_lock(&obd->obd_dev_lock);
+	if (obd->obd_stopping) {
+		spin_unlock(&obd->obd_dev_lock);
+		CERROR("OBD %d already stopping\n", obd->obd_minor);
+		RETURN(-ENODEV);
+	}
+	/* Leave this on forever */
+	obd->obd_stopping = 1;
+
+	/* wait for already-arrived-connections to finish. */
+	while (obd->obd_conn_inprogress > 0) {
+		/* drop the lock while yielding so the counter can change */
+		spin_unlock(&obd->obd_dev_lock);
+
+		cond_resched();
+
+		spin_lock(&obd->obd_dev_lock);
+	}
+	spin_unlock(&obd->obd_dev_lock);
+
+	/* apply each flag character from buffer 1 */
+	if (lcfg->lcfg_bufcount >= 2 && LUSTRE_CFG_BUFLEN(lcfg, 1) > 0) {
+		for (flag = lustre_cfg_string(lcfg, 1); *flag != 0; flag++)
+			switch (*flag) {
+			case 'F':
+				obd->obd_force = 1;
+				break;
+			case 'A':
+				LCONSOLE_WARN("Failing over %s\n",
+					      obd->obd_name);
+				obd->obd_fail = 1;
+				obd->obd_no_transno = 1;
+				obd->obd_no_recov = 1;
+				/* issue a sync if the type has an iocontrol
+				 * method; its result is not checked */
+				if (OBP(obd, iocontrol)) {
+					obd_iocontrol(OBD_IOC_SYNC,
+						      obd->obd_self_export,
+						      0, NULL, NULL);
+				}
+				break;
+			default:
+				CERROR("Unrecognised flag '%c'\n", *flag);
+			}
+	}
+
+	LASSERT(obd->obd_self_export);
+
+	/* The three references that should be remaining are the
+	 * obd_self_export and the attach and setup references. */
+	if (atomic_read(&obd->obd_refcount) > 3) {
+		/* refcount - 3 might be the number of real exports
+		   (excluding self export). But class_incref is called
+		   by other things as well, so don't count on it. */
+		CDEBUG(D_IOCTL, "%s: forcing exports to disconnect: %d\n",
+		       obd->obd_name, atomic_read(&obd->obd_refcount) - 3);
+		dump_exports(obd, 0);
+		class_disconnect_exports(obd);
+	}
+
+	/* Precleanup, we must make sure all exports get destroyed. */
+	err = obd_precleanup(obd, OBD_CLEANUP_EXPORTS);
+	/* a failure is logged but does not stop the cleanup */
+	if (err)
+		CERROR("Precleanup %s returned %d\n",
+		       obd->obd_name, err);
+
+	/* destroy an uuid-export hash body */
+	if (obd->obd_uuid_hash) {
+		cfs_hash_putref(obd->obd_uuid_hash);
+		obd->obd_uuid_hash = NULL;
+	}
+
+	/* destroy a nid-export hash body */
+	if (obd->obd_nid_hash) {
+		cfs_hash_putref(obd->obd_nid_hash);
+		obd->obd_nid_hash = NULL;
+	}
+
+	/* destroy a nid-stats hash body */
+	if (obd->obd_nid_stats_hash) {
+		cfs_hash_putref(obd->obd_nid_stats_hash);
+		obd->obd_nid_stats_hash = NULL;
+	}
+
+	/* drop the reference class_setup() took */
+	class_decref(obd, "setup", obd);
+	obd->obd_set_up = 0;
+
+	RETURN(0);
+}
+EXPORT_SYMBOL(class_cleanup);
+
+/*
+ * Take a reference on \a obd: record it in the lu_ref tracker under
+ * \a scope / \a source, increment obd_refcount and log the new count.
+ * Returns \a obd so that calls can be chained.
+ */
+struct obd_device *class_incref(struct obd_device *obd,
+				const char *scope, const void *source)
+{
+	lu_ref_add_atomic(&obd->obd_reference, scope, source);
+	atomic_inc(&obd->obd_refcount);
+	CDEBUG(D_INFO, "incref %s (%p) now %d\n", obd->obd_name, obd,
+	       atomic_read(&obd->obd_refcount));
+
+	return obd;
+}
+EXPORT_SYMBOL(class_incref);
+
+/*
+ * Drop a reference on \a obd that was taken under \a scope / \a source.
+ *
+ * When the count falls to 1 on a stopping device, the remaining reference
+ * is treated as the self export's: the export is unlinked, which leads back
+ * into this function.  When the count reaches 0 the device is cleaned up
+ * (only if it was stopping), its type's detach method is called if present,
+ * and it is released with class_release_dev().
+ */
+void class_decref(struct obd_device *obd, const char *scope, const void *source)
+{
+	int err;
+	int refs;
+
+	/* decrement and sample the count in one locked section */
+	spin_lock(&obd->obd_dev_lock);
+	atomic_dec(&obd->obd_refcount);
+	refs = atomic_read(&obd->obd_refcount);
+	spin_unlock(&obd->obd_dev_lock);
+	lu_ref_del(&obd->obd_reference, scope, source);
+
+	CDEBUG(D_INFO, "Decref %s (%p) now %d\n", obd->obd_name, obd, refs);
+
+	if ((refs == 1) && obd->obd_stopping) {
+		/* All exports have been destroyed; there should
+		   be no more in-progress ops by this point.*/
+
+		spin_lock(&obd->obd_self_export->exp_lock);
+		obd->obd_self_export->exp_flags |= exp_flags_from_obd(obd);
+		spin_unlock(&obd->obd_self_export->exp_lock);
+
+		/* note that we'll recurse into class_decref again */
+		class_unlink_export(obd->obd_self_export);
+		return;
+	}
+
+	if (refs == 0) {
+		CDEBUG(D_CONFIG, "finishing cleanup of obd %s (%s)\n",
+		       obd->obd_name, obd->obd_uuid.uuid);
+		LASSERT(!obd->obd_attached);
+		if (obd->obd_stopping) {
+			/* If we're not stopping, we were never set up */
+			err = obd_cleanup(obd);
+			if (err)
+				CERROR("Cleanup %s returned %d\n",
+				       obd->obd_name, err);
+		}
+		/* errors here are logged only; the release still happens */
+		if (OBP(obd, detach)) {
+			err = OBP(obd, detach)(obd);
+			if (err)
+				CERROR("Detach returned %d\n", err);
+		}
+		class_release_dev(obd);
+	}
+}
+EXPORT_SYMBOL(class_decref);
+
+/** Add a failover nid location to a client device's import.
+ *
+ * Buffer 1 of \a lcfg carries the connection UUID; lcfg_num is passed on to
+ * obd_add_conn().  Only MDC, OSC, OSP, LWP and MGC devices are accepted.
+ *
+ * \retval -EINVAL	bad UUID buffer length, device of another type, or
+ *			the device has no import yet
+ * \retval other	result of obd_add_conn()
+ */
+int class_add_conn(struct obd_device *obd, struct lustre_cfg *lcfg)
+{
+	struct obd_import *imp;
+	struct obd_uuid uuid;
+	const char *type;
+	int rc;
+	ENTRY;
+
+	if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1 ||
+	    LUSTRE_CFG_BUFLEN(lcfg, 1) > sizeof(struct obd_uuid)) {
+		CERROR("invalid conn_uuid\n");
+		RETURN(-EINVAL);
+	}
+
+	type = obd->obd_type->typ_name;
+	if (strcmp(type, LUSTRE_MDC_NAME) != 0 &&
+	    strcmp(type, LUSTRE_OSC_NAME) != 0 &&
+	    strcmp(type, LUSTRE_OSP_NAME) != 0 &&
+	    strcmp(type, LUSTRE_LWP_NAME) != 0 &&
+	    strcmp(type, LUSTRE_MGC_NAME) != 0) {
+		CERROR("can't add connection on non-client dev\n");
+		RETURN(-EINVAL);
+	}
+
+	imp = obd->u.cli.cl_import;
+	if (imp == NULL) {
+		CERROR("try to add conn on immature client dev\n");
+		RETURN(-EINVAL);
+	}
+
+	obd_str2uuid(&uuid, lustre_cfg_string(lcfg, 1));
+	rc = obd_add_conn(imp, &uuid, lcfg->lcfg_num);
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(class_add_conn);
+
+/** Remove a failover nid location from a client device's import.
+ *
+ * Buffer 1 of \a lcfg carries the connection UUID.  Only MDC and OSC
+ * devices are accepted.
+ *
+ * \retval -EINVAL	bad UUID buffer length, device of another type, or
+ *			the device has no import yet
+ * \retval other	result of obd_del_conn()
+ */
+int class_del_conn(struct obd_device *obd, struct lustre_cfg *lcfg)
+{
+	struct obd_import *imp;
+	struct obd_uuid uuid;
+	const char *type;
+	int rc;
+	ENTRY;
+
+	if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1 ||
+	    LUSTRE_CFG_BUFLEN(lcfg, 1) > sizeof(struct obd_uuid)) {
+		CERROR("invalid conn_uuid\n");
+		RETURN(-EINVAL);
+	}
+
+	type = obd->obd_type->typ_name;
+	if (strcmp(type, LUSTRE_MDC_NAME) != 0 &&
+	    strcmp(type, LUSTRE_OSC_NAME) != 0) {
+		CERROR("can't del connection on non-client dev\n");
+		RETURN(-EINVAL);
+	}
+
+	imp = obd->u.cli.cl_import;
+	if (imp == NULL) {
+		CERROR("try to del conn on immature client dev\n");
+		RETURN(-EINVAL);
+	}
+
+	obd_str2uuid(&uuid, lustre_cfg_string(lcfg, 1));
+	rc = obd_del_conn(imp, &uuid);
+
+	RETURN(rc);
+}
+
+LIST_HEAD(lustre_profile_list);
+
+/*
+ * Return the entry of lustre_profile_list whose lp_profile equals \a prof,
+ * or NULL if there is none.
+ */
+struct lustre_profile *class_get_profile(const char * prof)
+{
+	struct lustre_profile *cur;
+
+	ENTRY;
+	list_for_each_entry(cur, &lustre_profile_list, lp_list) {
+		if (strcmp(cur->lp_profile, prof) == 0)
+			RETURN(cur);
+	}
+	RETURN(NULL);
+}
+EXPORT_SYMBOL(class_get_profile);
+
+/** Create a named "profile".
+ * This defines the mdc and osc names to use for a client.
+ * This also is used to define the lov to be used by a mdt.
+ *
+ * Each string is copied into newly allocated storage.  Each *len argument
+ * must equal strlen() of its string plus one (asserted); \a mdc is only
+ * looked at when \a mdclen is greater than 0.  The new profile is added to
+ * lustre_profile_list.
+ *
+ * \retval 0		profile added
+ * \retval -ENOMEM	an allocation failed; nothing is left allocated
+ */
+int class_add_profile(int proflen, char *prof, int osclen, char *osc,
+		      int mdclen, char *mdc)
+{
+	struct lustre_profile *lprof;
+	int err = 0;
+	ENTRY;
+
+	CDEBUG(D_CONFIG, "Add profile %s\n", prof);
+
+	OBD_ALLOC(lprof, sizeof(*lprof));
+	if (lprof == NULL)
+		RETURN(-ENOMEM);
+	INIT_LIST_HEAD(&lprof->lp_list);
+
+	LASSERT(proflen == (strlen(prof) + 1));
+	OBD_ALLOC(lprof->lp_profile, proflen);
+	if (lprof->lp_profile == NULL)
+		GOTO(out, err = -ENOMEM);
+	memcpy(lprof->lp_profile, prof, proflen);
+
+	LASSERT(osclen == (strlen(osc) + 1));
+	OBD_ALLOC(lprof->lp_dt, osclen);
+	if (lprof->lp_dt == NULL)
+		GOTO(out, err = -ENOMEM);
+	memcpy(lprof->lp_dt, osc, osclen);
+
+	/* the md name is optional */
+	if (mdclen > 0) {
+		LASSERT(mdclen == (strlen(mdc) + 1));
+		OBD_ALLOC(lprof->lp_md, mdclen);
+		if (lprof->lp_md == NULL)
+			GOTO(out, err = -ENOMEM);
+		memcpy(lprof->lp_md, mdc, mdclen);
+	}
+
+	list_add(&lprof->lp_list, &lustre_profile_list);
+	RETURN(err);
+
+out:
+	/* free whatever was allocated; the checks presumably rely on
+	 * OBD_ALLOC() zeroing lprof so unset members read as NULL - confirm */
+	if (lprof->lp_md)
+		OBD_FREE(lprof->lp_md, mdclen);
+	if (lprof->lp_dt)
+		OBD_FREE(lprof->lp_dt, osclen);
+	if (lprof->lp_profile)
+		OBD_FREE(lprof->lp_profile, proflen);
+	OBD_FREE(lprof, sizeof(*lprof));
+	RETURN(err);
+}
+
+/*
+ * Remove the profile named \a prof from lustre_profile_list and free it
+ * together with its strings.  Does nothing if no such profile exists.
+ */
+void class_del_profile(const char *prof)
+{
+	struct lustre_profile *lprof;
+	ENTRY;
+
+	CDEBUG(D_CONFIG, "Del profile %s\n", prof);
+
+	lprof = class_get_profile(prof);
+	if (lprof) {
+		list_del(&lprof->lp_list);
+		/* sizes are recomputed from the strings themselves */
+		OBD_FREE(lprof->lp_profile, strlen(lprof->lp_profile) + 1);
+		OBD_FREE(lprof->lp_dt, strlen(lprof->lp_dt) + 1);
+		/* lp_md is optional */
+		if (lprof->lp_md)
+			OBD_FREE(lprof->lp_md, strlen(lprof->lp_md) + 1);
+		OBD_FREE(lprof, sizeof *lprof);
+	}
+	EXIT;
+}
+EXPORT_SYMBOL(class_del_profile);
+
+/* COMPAT_146 */
+/*
+ * Remove and free every profile on lustre_profile_list, including each
+ * profile's strings.
+ */
+void class_del_profiles(void)
+{
+	struct lustre_profile *lprof, *n;
+	ENTRY;
+
+	/* _safe variant: entries are unlinked and freed while iterating */
+	list_for_each_entry_safe(lprof, n, &lustre_profile_list, lp_list) {
+		list_del(&lprof->lp_list);
+		OBD_FREE(lprof->lp_profile, strlen(lprof->lp_profile) + 1);
+		OBD_FREE(lprof->lp_dt, strlen(lprof->lp_dt) + 1);
+		/* lp_md is optional */
+		if (lprof->lp_md)
+			OBD_FREE(lprof->lp_md, strlen(lprof->lp_md) + 1);
+		OBD_FREE(lprof, sizeof *lprof);
+	}
+	EXIT;
+}
+EXPORT_SYMBOL(class_del_profiles);
+
+/*
+ * Set one global tunable.  \a ptr is the parameter text, matched by its
+ * leading key.  The at_* tunables take \a val; the jobid variable instead
+ * takes the string in buffer 2 of \a lcfg.
+ *
+ * Returns 0 if the key was recognised, -EINVAL otherwise.
+ */
+static int class_set_global(char *ptr, int val, struct lustre_cfg *lcfg)
+{
+	ENTRY;
+	if (class_match_param(ptr, PARAM_AT_MIN, NULL) == 0)
+		at_min = val;
+	else if (class_match_param(ptr, PARAM_AT_MAX, NULL) == 0)
+		at_max = val;
+	else if (class_match_param(ptr, PARAM_AT_EXTRA, NULL) == 0)
+		at_extra = val;
+	else if (class_match_param(ptr, PARAM_AT_EARLY_MARGIN, NULL) == 0)
+		at_early_margin = val;
+	else if (class_match_param(ptr, PARAM_AT_HISTORY, NULL) == 0)
+		at_history = val;
+	else if (class_match_param(ptr, PARAM_JOBID_VAR, NULL) == 0)
+		/* string-valued: val is not used for this parameter */
+		strlcpy(obd_jobid_var, lustre_cfg_string(lcfg, 2),
+			JOBSTATS_JOBID_VAR_MAX_LEN + 1);
+	else
+		RETURN(-EINVAL);
+
+	/* val is logged for the jobid case too, although it was not applied */
+	CDEBUG(D_IOCTL, "global %s = %d\n", ptr, val);
+	RETURN(0);
+}
+
+
+/* We can't call ll_process_config or lquota_process_config directly because
+ * they live in modules that must be loaded after this one.  These hooks
+ * start out NULL, are filled in through the lustre_register_*_process_config()
+ * functions, and are checked for NULL before class_process_config() calls
+ * them. */
+static int (*client_process_config)(struct lustre_cfg *lcfg) = NULL;
+static int (*quota_process_config)(struct lustre_cfg *lcfg) = NULL;
+
+/* Install \a cpc as the handler class_process_config() uses for llite
+ * parameters; any previously registered handler is replaced. */
+void lustre_register_client_process_config(int (*cpc)(struct lustre_cfg *lcfg))
+{
+	client_process_config = cpc;
+}
+EXPORT_SYMBOL(lustre_register_client_process_config);
+
+/**
+ * Rename the proc parameter in \a cfg with a new name \a new_name.
+ *
+ * The parameter is read from buffer 1 of \a cfg.  Its name part (everything
+ * before the first '=') is replaced by \a new_name, any "=value" part is
+ * kept, and a new config record is built with the other buffers taken from
+ * \a cfg.  \a cfg itself is not modified.
+ *
+ * \param cfg	   config structure which contains the proc parameter
+ * \param new_name new name of the proc parameter
+ *
+ * \retval valid-pointer    pointer to the newly-allocated config structure
+ *			    which contains the renamed proc parameter
+ * \retval ERR_PTR(-EINVAL) if \a cfg or \a new_name is NULL, or \a cfg does
+ *			    not contain a proc parameter
+ * \retval ERR_PTR(-ENOMEM) if memory allocation failure occurs
+ */
+struct lustre_cfg *lustre_cfg_rename(struct lustre_cfg *cfg,
+				     const char *new_name)
+{
+	struct lustre_cfg_bufs	*bufs = NULL;
+	struct lustre_cfg	*new_cfg = NULL;
+	char			*param = NULL;
+	char			*new_param = NULL;
+	char			*value = NULL;
+	int			 name_len = 0;
+	int			 new_len = 0;
+	ENTRY;
+
+	if (cfg == NULL || new_name == NULL)
+		RETURN(ERR_PTR(-EINVAL));
+
+	param = lustre_cfg_string(cfg, 1);
+	if (param == NULL)
+		RETURN(ERR_PTR(-EINVAL));
+
+	/* value points at the '=' itself, so it includes the separator */
+	value = strchr(param, '=');
+	if (value == NULL)
+		name_len = strlen(param);
+	else
+		name_len = value - param;
+
+	/* size of the old buffer with the old name swapped for the new one */
+	new_len = LUSTRE_CFG_BUFLEN(cfg, 1) + strlen(new_name) - name_len;
+
+	OBD_ALLOC(new_param, new_len);
+	if (new_param == NULL)
+		RETURN(ERR_PTR(-ENOMEM));
+
+	strcpy(new_param, new_name);
+	if (value != NULL)
+		strcat(new_param, value);
+
+	OBD_ALLOC_PTR(bufs);
+	if (bufs == NULL) {
+		OBD_FREE(new_param, new_len);
+		RETURN(ERR_PTR(-ENOMEM));
+	}
+
+	/* start from the buffers of cfg, then override buffer 1 */
+	lustre_cfg_bufs_reset(bufs, NULL);
+	lustre_cfg_bufs_init(bufs, cfg);
+	lustre_cfg_bufs_set_string(bufs, 1, new_param);
+
+	new_cfg = lustre_cfg_new(cfg->lcfg_command, bufs);
+
+	/* the temporaries are freed before new_cfg is checked, so presumably
+	 * lustre_cfg_new() copies the buffer contents - confirm */
+	OBD_FREE(new_param, new_len);
+	OBD_FREE_PTR(bufs);
+	if (new_cfg == NULL)
+		RETURN(ERR_PTR(-ENOMEM));
+
+	/* carry over the header fields of the original record */
+	new_cfg->lcfg_num = cfg->lcfg_num;
+	new_cfg->lcfg_flags = cfg->lcfg_flags;
+	new_cfg->lcfg_nid = cfg->lcfg_nid;
+	new_cfg->lcfg_nal = cfg->lcfg_nal;
+
+	RETURN(new_cfg);
+}
+EXPORT_SYMBOL(lustre_cfg_rename);
+
+/* Install \a qpc as the handler class_process_config() uses for quota
+ * parameters; any previously registered handler is replaced. */
+void lustre_register_quota_process_config(int (*qpc)(struct lustre_cfg *lcfg))
+{
+	quota_process_config = qpc;
+}
+EXPORT_SYMBOL(lustre_register_quota_process_config);
+
+/** Process configuration commands given in lustre_cfg form.
+ * These may come from direct calls (e.g. class_manual_cleanup)
+ * or processing the config llog, or ioctl from lctl.
+ *
+ * Commands are handled in two stages: first those that need no device,
+ * then, after the device named in buffer 0 has been looked up, those that
+ * act on a device.  Commands not handled by name are passed to
+ * obd_process_config().
+ *
+ * \retval 0		success, or an error that was deliberately ignored:
+ *			several device commands reset the result to 0, and
+ *			any negative result is dropped when the command does
+ *			not carry the LCFG_REQUIRED bit
+ * \retval -EINVAL	a device command was given but no such device exists
+ * \retval other	result of the individual command handler
+ */
+int class_process_config(struct lustre_cfg *lcfg)
+{
+	struct obd_device *obd;
+	int err;
+
+	LASSERT(lcfg && !IS_ERR(lcfg));
+	CDEBUG(D_IOCTL, "processing cmd: %x\n", lcfg->lcfg_command);
+
+	/* Commands that don't need a device */
+	switch(lcfg->lcfg_command) {
+	case LCFG_ATTACH: {
+		err = class_attach(lcfg);
+		GOTO(out, err);
+	}
+	case LCFG_ADD_UUID: {
+		CDEBUG(D_IOCTL, "adding mapping from uuid %s to nid "LPX64
+		       " (%s)\n", lustre_cfg_string(lcfg, 1),
+		       lcfg->lcfg_nid, libcfs_nid2str(lcfg->lcfg_nid));
+
+		err = class_add_uuid(lustre_cfg_string(lcfg, 1), lcfg->lcfg_nid);
+		GOTO(out, err);
+	}
+	case LCFG_DEL_UUID: {
+		CDEBUG(D_IOCTL, "removing mappings for uuid %s\n",
+		       (lcfg->lcfg_bufcount < 2 || LUSTRE_CFG_BUFLEN(lcfg, 1) == 0)
+		       ? "<all uuids>" : lustre_cfg_string(lcfg, 1));
+
+		err = class_del_uuid(lustre_cfg_string(lcfg, 1));
+		GOTO(out, err);
+	}
+	case LCFG_MOUNTOPT: {
+		CDEBUG(D_IOCTL, "mountopt: profile %s osc %s mdc %s\n",
+		       lustre_cfg_string(lcfg, 1),
+		       lustre_cfg_string(lcfg, 2),
+		       lustre_cfg_string(lcfg, 3));
+		/* set these mount options somewhere, so ll_fill_super
+		 * can find them. */
+		err = class_add_profile(LUSTRE_CFG_BUFLEN(lcfg, 1),
+					lustre_cfg_string(lcfg, 1),
+					LUSTRE_CFG_BUFLEN(lcfg, 2),
+					lustre_cfg_string(lcfg, 2),
+					LUSTRE_CFG_BUFLEN(lcfg, 3),
+					lustre_cfg_string(lcfg, 3));
+		GOTO(out, err);
+	}
+	case LCFG_DEL_MOUNTOPT: {
+		CDEBUG(D_IOCTL, "mountopt: profile %s\n",
+		       lustre_cfg_string(lcfg, 1));
+		class_del_profile(lustre_cfg_string(lcfg, 1));
+		GOTO(out, err = 0);
+	}
+	case LCFG_SET_TIMEOUT: {
+		CDEBUG(D_IOCTL, "changing lustre timeout from %d to %d\n",
+		       obd_timeout, lcfg->lcfg_num);
+		/* never allow a timeout below 1 */
+		obd_timeout = max(lcfg->lcfg_num, 1U);
+		obd_timeout_set = 1;
+		GOTO(out, err = 0);
+	}
+	case LCFG_SET_LDLM_TIMEOUT: {
+		CDEBUG(D_IOCTL, "changing lustre ldlm_timeout from %d to %d\n",
+		       ldlm_timeout, lcfg->lcfg_num);
+		ldlm_timeout = max(lcfg->lcfg_num, 1U);
+		/* keep ldlm_timeout below obd_timeout: fall back to a third */
+		if (ldlm_timeout >= obd_timeout)
+			ldlm_timeout = max(obd_timeout / 3, 1U);
+		ldlm_timeout_set = 1;
+		GOTO(out, err = 0);
+	}
+	case LCFG_SET_UPCALL: {
+		LCONSOLE_ERROR_MSG(0x15a, "recovery upcall is deprecated\n");
+		/* COMPAT_146 Don't fail on old configs */
+		GOTO(out, err = 0);
+	}
+	case LCFG_MARKER: {
+		struct cfg_marker *marker;
+		/* markers are only logged here */
+		marker = lustre_cfg_buf(lcfg, 1);
+		CDEBUG(D_IOCTL, "marker %d (%#x) %.16s %s\n", marker->cm_step,
+		       marker->cm_flags, marker->cm_tgtname, marker->cm_comment);
+		GOTO(out, err = 0);
+	}
+	case LCFG_PARAM: {
+		char *tmp;
+		/* llite has no obd */
+		if ((class_match_param(lustre_cfg_string(lcfg, 1),
+				       PARAM_LLITE, 0) == 0) &&
+		    client_process_config) {
+			err = (*client_process_config)(lcfg);
+			GOTO(out, err);
+		} else if ((class_match_param(lustre_cfg_string(lcfg, 1),
+					      PARAM_SYS, &tmp) == 0)) {
+			/* Global param settings */
+			err = class_set_global(tmp, lcfg->lcfg_num, lcfg);
+			/*
+			 * Client or server should not fail to mount if
+			 * it hits an unknown configuration parameter.
+			 */
+			if (err != 0)
+				CWARN("Ignoring unknown param %s\n", tmp);
+
+			GOTO(out, err = 0);
+		} else if ((class_match_param(lustre_cfg_string(lcfg, 1),
+					      PARAM_QUOTA, &tmp) == 0) &&
+			   quota_process_config) {
+			err = (*quota_process_config)(lcfg);
+			GOTO(out, err);
+		}
+		/* Any other parameter leaves this switch and is handled as a
+		 * device command below. */
+		break;
+	}
+	}
+
+	/* Commands that require a device */
+	obd = class_name2obd(lustre_cfg_string(lcfg, 0));
+	if (obd == NULL) {
+		if (!LUSTRE_CFG_BUFLEN(lcfg, 0))
+			CERROR("this lcfg command requires a device name\n");
+		else
+			CERROR("no device for: %s\n",
+			       lustre_cfg_string(lcfg, 0));
+
+		GOTO(out, err = -EINVAL);
+	}
+
+	/* NOTE(review): every case below except LCFG_SETUP and the default
+	 * resets err to 0 after the call, so the handler's result is not
+	 * reported to the caller - looks deliberate, confirm */
+	switch(lcfg->lcfg_command) {
+	case LCFG_SETUP: {
+		err = class_setup(obd, lcfg);
+		GOTO(out, err);
+	}
+	case LCFG_DETACH: {
+		err = class_detach(obd, lcfg);
+		GOTO(out, err = 0);
+	}
+	case LCFG_CLEANUP: {
+		err = class_cleanup(obd, lcfg);
+		GOTO(out, err = 0);
+	}
+	case LCFG_ADD_CONN: {
+		err = class_add_conn(obd, lcfg);
+		GOTO(out, err = 0);
+	}
+	case LCFG_DEL_CONN: {
+		err = class_del_conn(obd, lcfg);
+		GOTO(out, err = 0);
+	}
+	case LCFG_POOL_NEW: {
+		err = obd_pool_new(obd, lustre_cfg_string(lcfg, 2));
+		GOTO(out, err = 0);
+		break;
+	}
+	case LCFG_POOL_ADD: {
+		err = obd_pool_add(obd, lustre_cfg_string(lcfg, 2),
+				   lustre_cfg_string(lcfg, 3));
+		GOTO(out, err = 0);
+		break;
+	}
+	case LCFG_POOL_REM: {
+		err = obd_pool_rem(obd, lustre_cfg_string(lcfg, 2),
+				   lustre_cfg_string(lcfg, 3));
+		GOTO(out, err = 0);
+		break;
+	}
+	case LCFG_POOL_DEL: {
+		err = obd_pool_del(obd, lustre_cfg_string(lcfg, 2));
+		GOTO(out, err = 0);
+		break;
+	}
+	default: {
+		/* everything else goes to the device type's own handler */
+		err = obd_process_config(obd, sizeof(*lcfg), lcfg);
+		GOTO(out, err);
+
+	}
+	}
+out:
+	/* failures of commands without LCFG_REQUIRED are warned about only */
+	if ((err < 0) && !(lcfg->lcfg_command & LCFG_REQUIRED)) {
+		CWARN("Ignoring error %d on optional command %#x\n", err,
+		      lcfg->lcfg_command);
+		err = 0;
+	}
+	return err;
+}
+EXPORT_SYMBOL(class_process_config);
+
+/**
+ * Apply "key=value" settings carried in an LCFG_PARAM record to proc entries.
+ *
+ * Buffers 1..lcfg_bufcount-1 of \a lcfg are each treated as one parameter
+ * string.  Each string is handed to class_match_param() to strip \a prefix,
+ * split at the first '=', and the key is looked up by name in \a lvars.
+ * When the matching entry has a write handler, the value is passed to it
+ * through a stack-allocated fake file whose seq_file private pointer is
+ * \a data.
+ *
+ * \param prefix  parameter prefix to strip from each key
+ * \param lvars   table of proc variables, terminated by an entry whose
+ *                name is NULL
+ * \param lcfg    configuration record; must be an LCFG_PARAM command
+ * \param data    stored in the fake seq_file's private field
+ *
+ * \retval -EINVAL  \a lcfg is not an LCFG_PARAM command
+ * \retval -ENOSYS  an unmatched key contains a '.' before its '='
+ * \retval 0        otherwise, when no parameter was skipped as unknown
+ * \retval >0       number of parameters skipped because they were unknown
+ *
+ * Errors returned by a write handler (or -EROFS for an entry without one)
+ * are logged and then discarded; they are not returned to the caller.
+ */
+int class_process_proc_param(char *prefix, struct lprocfs_vars *lvars,
+			     struct lustre_cfg *lcfg, void *data)
+{
+	struct lprocfs_vars *var;
+	struct file fakefile;
+	struct seq_file fake_seqfile;
+	char *key, *sval;
+	int i, keylen, vallen;
+	int matched = 0, j = 0;
+	int rc = 0;
+	int skip = 0;
+	ENTRY;
+
+	if (lcfg->lcfg_command != LCFG_PARAM) {
+		CERROR("Unknown command: %d\n", lcfg->lcfg_command);
+		RETURN(-EINVAL);
+	}
+
+	/* fake a seq file so that var->fops->write can work... */
+	/* only private_data / private are initialised; the other fields of
+	 * both stack objects are left untouched */
+	fakefile.private_data = &fake_seqfile;
+	fake_seqfile.private = data;
+	/* e.g. tunefs.lustre --param mdt.group_upcall=foo /r/tmp/lustre-mdt
+	   or   lctl conf_param lustre-MDT0000.mdt.group_upcall=bar
+	   or   lctl conf_param lustre-OST0000.osc.max_dirty_mb=36 */
+	for (i = 1; i < lcfg->lcfg_bufcount; i++) {
+		key = lustre_cfg_buf(lcfg, i);
+		/* Strip off prefix */
+		class_match_param(key, prefix, &key);
+		sval = strchr(key, '=');
+		/* reject both "key" (no '=') and "key=" (empty value) */
+		if (!sval || (*(sval + 1) == 0)) {
+			CERROR("Can't parse param %s (missing '=')\n", key);
+			/* rc = -EINVAL;	continue parsing other params */
+			continue;
+		}
+		/* key is not NUL-terminated at '='; keylen bounds it instead */
+		keylen = sval - key;
+		sval++;
+		vallen = strlen(sval);
+		matched = 0;
+		j = 0;
+		/* Search proc entries */
+		while (lvars[j].name) {
+			var = &lvars[j];
+			/* the length comparison rejects entries whose name is
+			 * only a prefix of the key */
+			if (class_match_param(key, (char *)var->name, 0) == 0 &&
+			    keylen == strlen(var->name)) {
+				matched++;
+				/* default result for entries with no write
+				 * handler */
+				rc = -EROFS;
+				if (var->fops && var->fops->write) {
+					mm_segment_t oldfs;
+					/* sval is a kernel buffer: widen the
+					 * address limit around the call and
+					 * restore it afterwards */
+					oldfs = get_fs();
+					set_fs(KERNEL_DS);
+					rc = (var->fops->write)(&fakefile, sval,
+								vallen, NULL);
+					set_fs(oldfs);
+				}
+				break;
+			}
+			j++;
+		}
+		if (!matched) {
+			/* If the prefix doesn't match, return error so we
+			   can pass it down the stack */
+			if (strnchr(key, keylen, '.'))
+			    RETURN(-ENOSYS);
+			CERROR("%s: unknown param %s\n",
+			       (char *)lustre_cfg_string(lcfg, 0), key);
+			/* rc = -EINVAL;	continue parsing other params */
+			skip++;
+		} else if (rc < 0) {
+			/* write failures are reported but not propagated */
+			CERROR("writing proc entry %s err %d\n",
+			       var->name, rc);
+			rc = 0;
+		} else {
+			CDEBUG(D_CONFIG, "%s.%.*s: Set parameter %.*s=%s\n",
+					 lustre_cfg_string(lcfg, 0),
+					 (int)strlen(prefix) - 1, prefix,
+					 (int)(sval - key - 1), key, sval);
+		}
+	}
+
+	/* a positive rc is a leftover write-handler result, not an error */
+	if (rc > 0)
+		rc = 0;
+	/* report how many unknown parameters were skipped */
+	if (!rc && skip)
+		rc = skip;
+	RETURN(rc);
+}
+EXPORT_SYMBOL(class_process_proc_param);
+
+/* Declared here, defined elsewhere; class_config_llog_handler() calls it with
+ * a marker's target name to decide whether to flag the record group as
+ * excluded. */
+extern int lustre_check_exclusion(struct super_block *sb, char *svname);
+
+/** Parse a configuration llog record, doing various manipulations on it
+ * for various reasons (modifications for compatibility, skipping obsolete
+ * records, changing uuids, etc.), then pass the resulting record to
+ * class_process_config().
+ *
+ * \param env     not used by this function
+ * \param handle  llog handle; used only to name the obd in the error message
+ * \param rec     record header; the lustre_cfg payload follows it directly
+ * \param data    struct config_llog_instance tracking marker/skip state
+ *
+ * \retval 0        record processed, skipped, or of an unknown record type
+ * \retval -ENOMEM  the instance name could not be allocated
+ * \retval other    result of lustre_cfg_sanity_check() or
+ *                  class_process_config()
+ */
+int class_config_llog_handler(const struct lu_env *env,
+			      struct llog_handle *handle,
+			      struct llog_rec_hdr *rec, void *data)
+{
+	struct config_llog_instance *clli = data;
+	int cfg_len = rec->lrh_len;
+	char *cfg_buf = (char*) (rec + 1);
+	int rc = 0;
+	ENTRY;
+
+	//class_config_dump_handler(handle, rec, data);
+
+	switch (rec->lrh_type) {
+	case OBD_CFG_REC: {
+		struct lustre_cfg *lcfg, *lcfg_new;
+		struct lustre_cfg_bufs bufs;
+		char *inst_name = NULL;
+		int inst_len = 0;
+		int inst = 0, swab = 0;
+
+		lcfg = (struct lustre_cfg *)cfg_buf;
+		/* a byte-swapped version field means the record was written
+		 * with the other endianness; swab it in place */
+		if (lcfg->lcfg_version == __swab32(LUSTRE_CFG_VERSION)) {
+			lustre_swab_lustre_cfg(lcfg);
+			swab = 1;
+		}
+
+		rc = lustre_cfg_sanity_check(cfg_buf, cfg_len);
+		if (rc)
+			GOTO(out, rc);
+
+		/* Figure out config state info */
+		/* NOTE(review): clli is dereferenced from here on without a
+		 * NULL check, although later code tests "clli &&" - confirm
+		 * whether data can ever be NULL. */
+		if (lcfg->lcfg_command == LCFG_MARKER) {
+			struct cfg_marker *marker = lustre_cfg_buf(lcfg, 1);
+			lustre_swab_cfg_marker(marker, swab,
+					       LUSTRE_CFG_BUFLEN(lcfg, 1));
+			CDEBUG(D_CONFIG, "Marker, inst_flg=%#x mark_flg=%#x\n",
+			       clli->cfg_flags, marker->cm_flags);
+			if (marker->cm_flags & CM_START) {
+				/* all previous flags off */
+				clli->cfg_flags = CFG_F_MARKER;
+				if (marker->cm_flags & CM_SKIP) {
+					clli->cfg_flags |= CFG_F_SKIP;
+					CDEBUG(D_CONFIG, "SKIP #%d\n",
+					       marker->cm_step);
+				} else if ((marker->cm_flags & CM_EXCLUDE) ||
+					   (clli->cfg_sb &&
+					    lustre_check_exclusion(clli->cfg_sb,
+							 marker->cm_tgtname))) {
+					clli->cfg_flags |= CFG_F_EXCLUDE;
+					CDEBUG(D_CONFIG, "EXCLUDE %d\n",
+					       marker->cm_step);
+				}
+			} else if (marker->cm_flags & CM_END) {
+				clli->cfg_flags = 0;
+			}
+		}
+		/* A config command without a start marker before it is
+		   illegal (post 146) */
+		if (!(clli->cfg_flags & CFG_F_COMPAT146) &&
+		    !(clli->cfg_flags & CFG_F_MARKER) &&
+		    (lcfg->lcfg_command != LCFG_MARKER)) {
+			CWARN("Config not inside markers, ignoring! "
+			      "(inst: %p, uuid: %s, flags: %#x)\n",
+			      clli->cfg_instance,
+			      clli->cfg_uuid.uuid, clli->cfg_flags);
+			clli->cfg_flags |= CFG_F_SKIP;
+		}
+		if (clli->cfg_flags & CFG_F_SKIP) {
+			CDEBUG(D_CONFIG, "skipping %#x\n",
+			       clli->cfg_flags);
+			rc = 0;
+			/* No processing! */
+			break;
+		}
+
+		/*
+		 * For interoperability between 1.8 and 2.0,
+		 * rename "mds" obd device type to "mdt".
+		 */
+		{
+			char *typename = lustre_cfg_string(lcfg, 1);
+			char *index = lustre_cfg_string(lcfg, 2);
+
+			/* both fix-ups below edit the record buffer in place */
+			if ((lcfg->lcfg_command == LCFG_ATTACH && typename &&
+			     strcmp(typename, "mds") == 0)) {
+				CWARN("For 1.8 interoperability, rename obd "
+				       "type from mds to mdt\n");
+				typename[2] = 't';
+			}
+			if ((lcfg->lcfg_command == LCFG_SETUP && index &&
+			     strcmp(index, "type") == 0)) {
+				CDEBUG(D_INFO, "For 1.8 interoperability, "
+				       "set this index to '0'\n");
+				index[0] = '0';
+				index[1] = 0;
+			}
+		}
+
+
+		if ((clli->cfg_flags & CFG_F_EXCLUDE) &&
+		    (lcfg->lcfg_command == LCFG_LOV_ADD_OBD))
+			/* Add inactive instead */
+			lcfg->lcfg_command = LCFG_LOV_ADD_INA;
+
+		lustre_cfg_bufs_init(&bufs, lcfg);
+
+		/* with an instance pointer set, buffer 0 is replaced by
+		 * "<original name>-<instance pointer>" */
+		if (clli && clli->cfg_instance &&
+		    LUSTRE_CFG_BUFLEN(lcfg, 0) > 0){
+			inst = 1;
+			inst_len = LUSTRE_CFG_BUFLEN(lcfg, 0) +
+				   sizeof(clli->cfg_instance) * 2 + 4;
+			OBD_ALLOC(inst_name, inst_len);
+			if (inst_name == NULL)
+				GOTO(out, rc = -ENOMEM);
+			sprintf(inst_name, "%s-%p",
+				lustre_cfg_string(lcfg, 0),
+				clli->cfg_instance);
+			lustre_cfg_bufs_set_string(&bufs, 0, inst_name);
+			CDEBUG(D_CONFIG, "cmd %x, instance name: %s\n",
+			       lcfg->lcfg_command, inst_name);
+		}
+
+		/* we override the llog's uuid for clients, to ensure they
+		are unique */
+		if (clli && clli->cfg_instance != NULL &&
+		    lcfg->lcfg_command == LCFG_ATTACH) {
+			lustre_cfg_bufs_set_string(&bufs, 2,
+						   clli->cfg_uuid.uuid);
+		}
+		/*
+		 * sptlrpc config record, we expect 2 data segments:
+		 *  [0]: fs_name/target_name,
+		 *  [1]: rule string
+		 * moving them to index [1] and [2], and insert MGC's
+		 * obdname at index [0].
+		 */
+		if (clli && clli->cfg_instance == NULL &&
+		    lcfg->lcfg_command == LCFG_SPTLRPC_CONF) {
+			/* shift from the top down so [1] is copied to [2]
+			 * before [0] overwrites [1] */
+			lustre_cfg_bufs_set(&bufs, 2, bufs.lcfg_buf[1],
+					    bufs.lcfg_buflen[1]);
+			lustre_cfg_bufs_set(&bufs, 1, bufs.lcfg_buf[0],
+					    bufs.lcfg_buflen[0]);
+			lustre_cfg_bufs_set_string(&bufs, 0,
+						   clli->cfg_obdname);
+		}
+
+		/* NOTE(review): the result of lustre_cfg_new() is used below
+		 * without any check - confirm how it reports allocation
+		 * failure and whether inst_name needs freeing on that path. */
+		lcfg_new = lustre_cfg_new(lcfg->lcfg_command, &bufs);
+
+		lcfg_new->lcfg_num   = lcfg->lcfg_num;
+		lcfg_new->lcfg_flags = lcfg->lcfg_flags;
+
+		/* XXX Hack to try to remain binary compatible with
+		 * pre-newconfig logs */
+		if (lcfg->lcfg_nal != 0 &&      /* pre-newconfig log? */
+		    (lcfg->lcfg_nid >> 32) == 0) {
+			__u32 addr = (__u32)(lcfg->lcfg_nid & 0xffffffff);
+
+			lcfg_new->lcfg_nid =
+				LNET_MKNID(LNET_MKNET(lcfg->lcfg_nal, 0), addr);
+			CWARN("Converted pre-newconfig NAL %d NID %x to %s\n",
+			      lcfg->lcfg_nal, addr,
+			      libcfs_nid2str(lcfg_new->lcfg_nid));
+		} else {
+			lcfg_new->lcfg_nid = lcfg->lcfg_nid;
+		}
+
+		lcfg_new->lcfg_nal = 0; /* illegal value for obsolete field */
+
+		rc = class_process_config(lcfg_new);
+		lustre_cfg_free(lcfg_new);
+
+		if (inst)
+			OBD_FREE(inst_name, inst_len);
+		break;
+	}
+	default:
+		/* unknown record types are logged; rc is left at 0 */
+		CERROR("Unknown llog record type %#x encountered\n",
+		       rec->lrh_type);
+		break;
+	}
+out:
+	if (rc) {
+		CERROR("%s: cfg command failed: rc = %d\n",
+		       handle->lgh_ctxt->loc_obd->obd_name, rc);
+		/* dump the offending record to the console */
+		class_config_dump_handler(NULL, handle, rec, data);
+	}
+	RETURN(rc);
+}
+EXPORT_SYMBOL(class_config_llog_handler);
+
+/**
+ * Open the existing llog \a name and run a per-record callback over it.
+ *
+ * With \a cfg set, processing starts from cfg->cfg_last_idx using
+ * cfg->cfg_callback (which must not be NULL), and cfg->cfg_last_idx is
+ * updated from the index reported by llog_process().  Without \a cfg,
+ * class_config_llog_handler() is used as the callback.
+ *
+ * \retval 0 on success, otherwise the error from llog_open(),
+ *           llog_init_handle() or llog_process()
+ */
+int class_config_parse_llog(const struct lu_env *env, struct llog_ctxt *ctxt,
+			    char *name, struct config_llog_instance *cfg)
+{
+	struct llog_process_cat_data	 range = {0, 0};
+	llog_cb_t			 cb = class_config_llog_handler;
+	struct llog_handle		*log;
+	int				 rc;
+	ENTRY;
+
+	CDEBUG(D_INFO, "looking up llog %s\n", name);
+	rc = llog_open(env, ctxt, &log, NULL, name, LLOG_OPEN_EXISTS);
+	if (rc != 0)
+		RETURN(rc);
+
+	rc = llog_init_handle(env, log, LLOG_F_IS_PLAIN, NULL);
+	if (rc != 0)
+		GOTO(parse_out, rc);
+
+	if (cfg != NULL) {
+		/* resume after the last record handled for this instance */
+		range.lpcd_first_idx = cfg->cfg_last_idx;
+		cb = cfg->cfg_callback;
+		LASSERT(cb != NULL);
+	}
+
+	/* a last index of 0 is passed so the whole remainder is processed */
+	range.lpcd_last_idx = 0;
+
+	rc = llog_process(env, log, cb, cfg, &range);
+
+	CDEBUG(D_CONFIG, "Processed log %s gen %d-%d (rc=%d)\n", name,
+	       range.lpcd_first_idx + 1, range.lpcd_last_idx, rc);
+	if (cfg != NULL)
+		cfg->cfg_last_idx = range.lpcd_last_idx;
+
+parse_out:
+	llog_close(env, log);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(class_config_parse_llog);
+
+/**
+ * parse config record and output dump in supplied buffer.
+ * This is separated from class_config_dump_handler() to use
+ * for ioctl needs as well
+ *
+ * Output is appended field by field with snprintf().  As soon as one field
+ * does not fit, formatting stops: the write position is never advanced past
+ * the end of \a buf and no negative length is ever handed to snprintf().
+ *
+ * \param rec   llog record header; must be of type OBD_CFG_REC, with the
+ *              lustre_cfg payload directly behind it
+ * \param buf   output buffer
+ * \param size  size of \a buf in bytes
+ *
+ * \retval <0       error from lustre_cfg_sanity_check()
+ * \retval < size   number of bytes written (excluding the trailing NUL)
+ * \retval >= size  output was truncated
+ */
+int class_config_parse_rec(struct llog_rec_hdr *rec, char *buf, int size)
+{
+	struct lustre_cfg	*lcfg = (struct lustre_cfg *)(rec + 1);
+	int			 off = 0;
+	int			 rc = 0;
+
+	ENTRY;
+
+	LASSERT(rec->lrh_type == OBD_CFG_REC);
+	rc = lustre_cfg_sanity_check(lcfg, rec->lrh_len);
+	if (rc < 0)
+		RETURN(rc);
+
+	/* snprintf() returns the length it wanted to write, so off >= size
+	 * after a call means the buffer is full; stop there rather than let
+	 * the remaining length go negative on the next call */
+	off += snprintf(buf + off, size - off, "cmd=%05x ", lcfg->lcfg_command);
+	if (off >= size)
+		GOTO(out, rc = off);
+
+	if (lcfg->lcfg_flags) {
+		off += snprintf(buf + off, size - off, "flags=%#08x ",
+				lcfg->lcfg_flags);
+		if (off >= size)
+			GOTO(out, rc = off);
+	}
+
+	if (lcfg->lcfg_num) {
+		off += snprintf(buf + off, size - off, "num=%#08x ",
+				lcfg->lcfg_num);
+		if (off >= size)
+			GOTO(out, rc = off);
+	}
+
+	if (lcfg->lcfg_nid) {
+		off += snprintf(buf + off, size - off,
+				"nid=%s("LPX64")\n     ",
+				libcfs_nid2str(lcfg->lcfg_nid),
+				lcfg->lcfg_nid);
+		if (off >= size)
+			GOTO(out, rc = off);
+	}
+
+	if (lcfg->lcfg_command == LCFG_MARKER) {
+		struct cfg_marker *marker = lustre_cfg_buf(lcfg, 1);
+
+		off += snprintf(buf + off, size - off, "marker=%d(%#x)%s '%s'",
+				marker->cm_step, marker->cm_flags,
+				marker->cm_tgtname, marker->cm_comment);
+	} else {
+		int i;
+
+		for (i = 0; i <  lcfg->lcfg_bufcount; i++) {
+			off += snprintf(buf + off, size - off, "%d:%s  ", i,
+					lustre_cfg_string(lcfg, i));
+			if (off >= size)
+				GOTO(out, rc = off);
+		}
+	}
+out:
+	/* return consumed bytes */
+	rc = off;
+	RETURN(rc);
+}
+
+/**
+ * llog callback that prints one record to the console.
+ *
+ * OBD_CFG_REC records are formatted with class_config_parse_rec() into a
+ * temporary 256-byte buffer; any other record type is reported as
+ * unhandled.  \a env, \a handle and \a data are not used.
+ *
+ * \retval 0        record printed
+ * \retval -ENOMEM  temporary buffer could not be allocated
+ * \retval -EINVAL  record is not an OBD_CFG_REC
+ */
+int class_config_dump_handler(const struct lu_env *env,
+			      struct llog_handle *handle,
+			      struct llog_rec_hdr *rec, void *data)
+{
+	enum { DUMP_BUFLEN = 256 };
+	char	*line;
+	int	 rc;
+
+	ENTRY;
+
+	OBD_ALLOC(line, DUMP_BUFLEN);
+	if (!line)
+		RETURN(-ENOMEM);
+
+	if (rec->lrh_type != OBD_CFG_REC) {
+		LCONSOLE(D_WARNING, "unhandled lrh_type: %#x\n", rec->lrh_type);
+		rc = -EINVAL;
+	} else {
+		/* the formatter's return value is deliberately not checked */
+		class_config_parse_rec(rec, line, DUMP_BUFLEN);
+		LCONSOLE(D_WARNING, "   %s\n", line);
+		rc = 0;
+	}
+
+	OBD_FREE(line, DUMP_BUFLEN);
+	RETURN(rc);
+}
+
+/**
+ * Print every record of the existing config llog \a name to the console,
+ * using class_config_dump_handler() as the per-record callback.
+ *
+ * \retval 0 on success, otherwise the error from llog_open(),
+ *           llog_init_handle() or llog_process()
+ */
+int class_config_dump_llog(const struct lu_env *env, struct llog_ctxt *ctxt,
+			   char *name, struct config_llog_instance *cfg)
+{
+	struct llog_handle	*log;
+	int			 rc;
+
+	ENTRY;
+
+	LCONSOLE_INFO("Dumping config log %s\n", name);
+
+	rc = llog_open(env, ctxt, &log, NULL, name, LLOG_OPEN_EXISTS);
+	if (rc != 0)
+		RETURN(rc);
+
+	rc = llog_init_handle(env, log, LLOG_F_IS_PLAIN, NULL);
+	if (rc != 0)
+		GOTO(parse_out, rc);
+
+	rc = llog_process(env, log, class_config_dump_handler, cfg, NULL);
+
+parse_out:
+	/* the closing banner is printed on the init-failure path as well */
+	llog_close(env, log);
+
+	LCONSOLE_INFO("End config log %s\n", name);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(class_config_dump_llog);
+
+/** Run LCFG_CLEANUP and then LCFG_DETACH on \a obd.
+ * "Manual" only in the sense that the lcfg commands are built here rather
+ * than read from a config log.
+ *
+ * The flag string passed in buffer 1 gets an 'F' when obd_force is set and
+ * an 'A' when obd_fail is set.
+ *
+ * \retval -EALREADY  \a obd is NULL
+ * \retval -ENOMEM    lustre_cfg_new() returned NULL
+ * \retval other      result of the cleanup, or of the detach if cleanup
+ *                    succeeded
+ */
+int class_manual_cleanup(struct obd_device *obd)
+{
+	struct lustre_cfg_bufs  bufs;
+	struct lustre_cfg      *cmd;
+	char		    flags[3] = "";
+	int		     nflags = 0;
+	int		     rc;
+	ENTRY;
+
+	if (obd == NULL) {
+		CERROR("empty cleanup\n");
+		RETURN(-EALREADY);
+	}
+
+	/* at most two flag characters plus the terminating NUL */
+	if (obd->obd_force)
+		flags[nflags++] = 'F';
+	if (obd->obd_fail)
+		flags[nflags++] = 'A';
+	flags[nflags] = '\0';
+
+	CDEBUG(D_CONFIG, "Manual cleanup of %s (flags='%s')\n",
+	       obd->obd_name, flags);
+
+	lustre_cfg_bufs_reset(&bufs, obd->obd_name);
+	lustre_cfg_bufs_set_string(&bufs, 1, flags);
+	cmd = lustre_cfg_new(LCFG_CLEANUP, &bufs);
+	if (cmd == NULL)
+		RETURN(-ENOMEM);
+
+	rc = class_process_config(cmd);
+	if (rc != 0) {
+		CERROR("cleanup failed %d: %s\n", rc, obd->obd_name);
+		GOTO(out, rc);
+	}
+
+	/* the lcfg is almost the same for both ops */
+	cmd->lcfg_command = LCFG_DETACH;
+	rc = class_process_config(cmd);
+	if (rc != 0)
+		CERROR("detach failed %d: %s\n", rc, obd->obd_name);
+out:
+	lustre_cfg_free(cmd);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(class_manual_cleanup);
+
+/*
+ * uuid<->export lustre hash operations
+ */
+
+/* Hash callback: djb2 hash over the full uuid array of the obd_uuid key */
+static unsigned
+uuid_hash(cfs_hash_t *hs, const void *key, unsigned mask)
+{
+	const struct obd_uuid *id = key;
+
+	return cfs_hash_djb2_hash(id->uuid, sizeof(id->uuid), mask);
+}
+
+/* Key callback: the key of an export in this hash is its client uuid */
+static void *
+uuid_key(struct hlist_node *hnode)
+{
+	return &hlist_entry(hnode, struct obd_export,
+			    exp_uuid_hash)->exp_client_uuid;
+}
+
+/*
+ * Compare callback: non-zero when \a key equals the export's client uuid
+ * and the export is not marked failed.
+ *
+ * NOTE: It is impossible to find an export that is in failed
+ *       state with this function
+ */
+static int
+uuid_keycmp(const void *key, struct hlist_node *hnode)
+{
+	struct obd_export *export;
+
+	LASSERT(key);
+	export = hlist_entry(hnode, struct obd_export, exp_uuid_hash);
+
+	if (!obd_uuid_equals(key, &export->exp_client_uuid))
+		return 0;
+
+	return !export->exp_failed;
+}
+
+/* Object callback: map a uuid-hash node back to its containing export */
+static void *
+uuid_export_object(struct hlist_node *hnode)
+{
+	struct obd_export *export;
+
+	export = hlist_entry(hnode, struct obd_export, exp_uuid_hash);
+	return export;
+}
+
+/* Get callback: take a reference on the export that owns \a hnode */
+static void
+uuid_export_get(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+	class_export_get(hlist_entry(hnode, struct obd_export, exp_uuid_hash));
+}
+
+/* Put callback: drop a reference on the export that owns \a hnode */
+static void
+uuid_export_put_locked(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+	class_export_put(hlist_entry(hnode, struct obd_export, exp_uuid_hash));
+}
+
+/* Callback table for the uuid<->export hash; entries are linked through
+ * obd_export::exp_uuid_hash and keyed by exp_client_uuid. */
+static cfs_hash_ops_t uuid_hash_ops = {
+	.hs_hash	= uuid_hash,
+	.hs_key	 = uuid_key,
+	.hs_keycmp      = uuid_keycmp,
+	.hs_object      = uuid_export_object,
+	.hs_get	 = uuid_export_get,
+	.hs_put_locked  = uuid_export_put_locked,
+};
+
+
+/*
+ * nid<->export hash operations
+ */
+
+/* Hash callback: djb2 hash over the sizeof(lnet_nid_t) bytes at \a key */
+static unsigned
+nid_hash(cfs_hash_t *hs, const void *key, unsigned mask)
+{
+	unsigned bucket = cfs_hash_djb2_hash(key, sizeof(lnet_nid_t), mask);
+
+	return bucket;
+}
+
+/* Key callback: the key of an export in this hash is the nid of its
+ * connection's peer.  Uses a plain return, like uuid_key(), since the
+ * function has no matching ENTRY. */
+static void *
+nid_key(struct hlist_node *hnode)
+{
+	struct obd_export *exp;
+
+	exp = hlist_entry(hnode, struct obd_export, exp_nid_hash);
+
+	return &exp->exp_connection->c_peer.nid;
+}
+
+/*
+ * Compare callback: non-zero when \a key equals the nid of the export's
+ * connection peer and the export is not marked failed.  Uses a plain
+ * return, like uuid_keycmp(), since the function has no matching ENTRY.
+ *
+ * NOTE: It is impossible to find an export that is in failed
+ *       state with this function
+ */
+static int
+nid_kepcmp(const void *key, struct hlist_node *hnode)
+{
+	struct obd_export *exp;
+
+	LASSERT(key);
+	exp = hlist_entry(hnode, struct obd_export, exp_nid_hash);
+
+	return exp->exp_connection->c_peer.nid == *(lnet_nid_t *)key &&
+	       !exp->exp_failed;
+}
+
+/* Object callback: map a nid-hash node back to its containing export */
+static void *
+nid_export_object(struct hlist_node *hnode)
+{
+	struct obd_export *export;
+
+	export = hlist_entry(hnode, struct obd_export, exp_nid_hash);
+	return export;
+}
+
+/* Get callback: take a reference on the export that owns \a hnode */
+static void
+nid_export_get(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+	class_export_get(hlist_entry(hnode, struct obd_export, exp_nid_hash));
+}
+
+/* Put callback: drop a reference on the export that owns \a hnode */
+static void
+nid_export_put_locked(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+	class_export_put(hlist_entry(hnode, struct obd_export, exp_nid_hash));
+}
+
+/* Callback table for the nid<->export hash; entries are linked through
+ * obd_export::exp_nid_hash and keyed by the connection peer's nid. */
+static cfs_hash_ops_t nid_hash_ops = {
+	.hs_hash	= nid_hash,
+	.hs_key	 = nid_key,
+	.hs_keycmp      = nid_kepcmp,
+	.hs_object      = nid_export_object,
+	.hs_get	 = nid_export_get,
+	.hs_put_locked  = nid_export_put_locked,
+};
+
+
+/*
+ * nid<->nidstats hash operations
+ */
+
+/* Key callback: the key of a nid_stat entry is its nid field */
+static void *
+nidstats_key(struct hlist_node *hnode)
+{
+	return &hlist_entry(hnode, struct nid_stat, nid_hash)->nid;
+}
+
+/* Compare callback: non-zero when the entry's nid equals the nid at \a key */
+static int
+nidstats_keycmp(const void *key, struct hlist_node *hnode)
+{
+	lnet_nid_t *have = nidstats_key(hnode);
+	lnet_nid_t *want = (lnet_nid_t *)key;
+
+	return *have == *want;
+}
+
+/* Object callback: map a hash node back to its containing nid_stat */
+static void *
+nidstats_object(struct hlist_node *hnode)
+{
+	struct nid_stat *stat;
+
+	stat = hlist_entry(hnode, struct nid_stat, nid_hash);
+	return stat;
+}
+
+/* Get callback: take a reference on the nid_stat that owns \a hnode */
+static void
+nidstats_get(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+	struct nid_stat *stat = hlist_entry(hnode, struct nid_stat, nid_hash);
+
+	nidstat_getref(stat);
+}
+
+/* Put callback: drop a reference on the nid_stat that owns \a hnode */
+static void
+nidstats_put_locked(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+	struct nid_stat *stat = hlist_entry(hnode, struct nid_stat, nid_hash);
+
+	nidstat_putref(stat);
+}
+
+/* Callback table for the nid<->nidstats hash; it reuses nid_hash() for
+ * hashing and links entries through nid_stat::nid_hash. */
+static cfs_hash_ops_t nid_stat_hash_ops = {
+	.hs_hash	= nid_hash,
+	.hs_key	 = nidstats_key,
+	.hs_keycmp      = nidstats_keycmp,
+	.hs_object      = nidstats_object,
+	.hs_get	 = nidstats_get,
+	.hs_put_locked  = nidstats_put_locked,
+};

diff --git a/drivers/staging/lustre/lustre/obdclass/obd_mount.c b/drivers/staging/lustre/lustre/obdclass/obd_mount.c
new file mode 100644
index 0000000..99adad9
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/obd_mount.c

@@ -0,0 +1,1321 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/obd_mount.c
+ *
+ * Client mount routines
+ *
+ * Author: Nathan Rutman <nathan@clusterfs.com>
+ */
+
+
+/* presumably read by the debug macros to tag this file's messages -
+ * TODO confirm */
+#define DEBUG_SUBSYSTEM S_CLASS
+/* debug mask passed to CDEBUG() for mount-related messages in this file */
+#define D_MOUNT (D_SUPER|D_CONFIG/*|D_WARNING */)
+/* alias for CDEBUG */
+#define PRINT_CMD CDEBUG
+
+#include <obd.h>
+#include <lvfs.h>
+#include <lustre_fsfilt.h>
+#include <obd_class.h>
+#include <lustre/lustre_user.h>
+#include <linux/version.h>
+#include <lustre_log.h>
+#include <lustre_disk.h>
+#include <lustre_param.h>
+
+/* File-local hook taking a super block and a vfsmount; presumably set by
+ * the client module at registration time - TODO confirm against the rest
+ * of the file. */
+static int (*client_fill_super)(struct super_block *sb,
+				struct vfsmount *mnt);
+
+/* File-local hook taking a super block; presumably the matching teardown
+ * callback - TODO confirm against the rest of the file. */
+static void (*kill_super_cb)(struct super_block *sb);
+
+/**************** config llog ********************/
+
+/** Get a config log from the MGS and process it.
+ * This func is called for both clients and servers.
+ * Continue to process new statements appended to the logs
+ * (whenever the config lock is revoked) until lustre_end_log
+ * is called.
+ * @param sb The superblock is used by the MGC to write to the local copy of
+ *   the config log
+ * @param logname The name of the llog to replicate from the MGS
+ * @param cfg Since the same mgc may be used to follow multiple config logs
+ *   (e.g. ost1, ost2, client), the config_llog_instance keeps the state for
+ *   this log, and is added to the mgc's list of logs to follow.
+ * @return 0 on success, -ENOMEM if the buffer descriptor cannot be
+ *   allocated, otherwise the result of obd_process_config() on the MGC.
+ *
+ * Both the MGC stored in the super block info and \a cfg are asserted to be
+ * non-NULL.
+ */
+int lustre_process_log(struct super_block *sb, char *logname,
+		     struct config_llog_instance *cfg)
+{
+	struct lustre_cfg *lcfg;
+	struct lustre_cfg_bufs *bufs;
+	struct lustre_sb_info *lsi = s2lsi(sb);
+	struct obd_device *mgc = lsi->lsi_mgc;
+	int rc;
+	ENTRY;
+
+	LASSERT(mgc);
+	LASSERT(cfg);
+
+	OBD_ALLOC_PTR(bufs);
+	if (bufs == NULL)
+		RETURN(-ENOMEM);
+
+	/* mgc_process_config */
+	/* buffer layout: [0] MGC name, [1] log name, [2] cfg instance,
+	 * [3] the super block pointer itself */
+	lustre_cfg_bufs_reset(bufs, mgc->obd_name);
+	lustre_cfg_bufs_set_string(bufs, 1, logname);
+	lustre_cfg_bufs_set(bufs, 2, cfg, sizeof(*cfg));
+	lustre_cfg_bufs_set(bufs, 3, &sb, sizeof(sb));
+	/* NOTE(review): the result of lustre_cfg_new() is passed on without
+	 * any check - confirm how it reports allocation failure. */
+	lcfg = lustre_cfg_new(LCFG_LOG_START, bufs);
+	rc = obd_process_config(mgc, sizeof(*lcfg), lcfg);
+	lustre_cfg_free(lcfg);
+
+	OBD_FREE_PTR(bufs);
+
+	/* a space was missing between the log name and "failed" */
+	if (rc == -EINVAL)
+		LCONSOLE_ERROR_MSG(0x15b, "%s: The configuration from log '%s' "
+				   "failed from the MGS (%d).  Make sure this "
+				   "client and the MGS are running compatible "
+				   "versions of Lustre.\n",
+				   mgc->obd_name, logname, rc);
+
+	/* the generic message is printed for every failure, -EINVAL included */
+	if (rc)
+		LCONSOLE_ERROR_MSG(0x15c, "%s: The configuration from log '%s' "
+				   "failed (%d). This may be the result of "
+				   "communication errors between this node and "
+				   "the MGS, a bad configuration, or other "
+				   "errors. See the syslog for more "
+				   "information.\n", mgc->obd_name, logname,
+				   rc);
+
+	/* class_obd_list(); */
+	RETURN(rc);
+}
+EXPORT_SYMBOL(lustre_process_log);
+
+/* Stop watching this config log for updates.
+ *
+ * Sends an LCFG_LOG_END command carrying the log name (and, when given, a
+ * copy of \a cfg) to the MGC recorded in the super block info.
+ * Returns -ENOENT when no MGC is recorded, otherwise the result of
+ * obd_process_config().
+ */
+int lustre_end_log(struct super_block *sb, char *logname,
+		       struct config_llog_instance *cfg)
+{
+	struct lustre_sb_info *lsi = s2lsi(sb);
+	struct obd_device *mgc = lsi->lsi_mgc;
+	struct lustre_cfg_bufs end_bufs;
+	struct lustre_cfg *end_cmd;
+	int rc;
+	ENTRY;
+
+	if (mgc == NULL)
+		RETURN(-ENOENT);
+
+	/* mgc_process_config */
+	lustre_cfg_bufs_reset(&end_bufs, mgc->obd_name);
+	lustre_cfg_bufs_set_string(&end_bufs, 1, logname);
+	if (cfg != NULL)
+		lustre_cfg_bufs_set(&end_bufs, 2, cfg, sizeof(*cfg));
+
+	end_cmd = lustre_cfg_new(LCFG_LOG_END, &end_bufs);
+	rc = obd_process_config(mgc, sizeof(*end_cmd), end_cmd);
+	lustre_cfg_free(end_cmd);
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(lustre_end_log);
+
+/**************** obd start *******************/
+
+/** lustre_cfg_bufs are a holdover from 1.4; we can still set these up from
+ * lctl (and do for echo cli/srv).
+ *
+ * Build a lustre_cfg for \a cmd with \a cfgname in buffer 0 and the non-NULL
+ * strings \a s1..\a s4 in buffers 1..4, set its nid to \a nid, run it through
+ * class_process_config() and free it again.
+ *
+ * \retval result of class_process_config()
+ *
+ * NOTE(review): the result of lustre_cfg_new() is dereferenced without any
+ * check - confirm how it reports allocation failure.
+ */
+int do_lcfg(char *cfgname, lnet_nid_t nid, int cmd,
+	    char *s1, char *s2, char *s3, char *s4)
+{
+	char *strs[] = { s1, s2, s3, s4 };
+	struct lustre_cfg_bufs bufs;
+	struct lustre_cfg *lcfg;
+	int slot;
+	int rc;
+
+	CDEBUG(D_TRACE, "lcfg %s %#x %s %s %s %s\n", cfgname,
+	       cmd, s1, s2, s3, s4);
+
+	lustre_cfg_bufs_reset(&bufs, cfgname);
+	/* string k goes to buffer k; NULL strings are skipped */
+	for (slot = 1; slot <= 4; slot++) {
+		if (strs[slot - 1] != NULL)
+			lustre_cfg_bufs_set_string(&bufs, slot,
+						   strs[slot - 1]);
+	}
+
+	lcfg = lustre_cfg_new(cmd, &bufs);
+	lcfg->lcfg_nid = nid;
+	rc = class_process_config(lcfg);
+	lustre_cfg_free(lcfg);
+
+	return rc;
+}
+EXPORT_SYMBOL(do_lcfg);
+
+/** Attach and set up an obd by issuing LCFG_ATTACH followed by LCFG_SETUP
+ * through do_lcfg().  These commands in turn call obd type-specific methods.
+ *
+ * If setup fails, the obd is detached again before returning.
+ *
+ * \retval 0 on success, otherwise the error of the step that failed
+ */
+int lustre_start_simple(char *obdname, char *type, char *uuid,
+			char *s1, char *s2, char *s3, char *s4)
+{
+	int err;
+
+	CDEBUG(D_MOUNT, "Starting obd %s (typ=%s)\n", obdname, type);
+
+	err = do_lcfg(obdname, 0, LCFG_ATTACH, type, uuid, NULL, NULL);
+	if (err != 0) {
+		CERROR("%s attach error %d\n", obdname, err);
+		return err;
+	}
+
+	err = do_lcfg(obdname, 0, LCFG_SETUP, s1, s2, s3, s4);
+	if (err == 0)
+		return 0;
+
+	/* setup failed: undo the attach; the detach result is ignored */
+	CERROR("%s setup error %d\n", obdname, err);
+	do_lcfg(obdname, 0, LCFG_DETACH, NULL, NULL, NULL, NULL);
+	return err;
+}
+
+/* Held by lustre_start_mgc() while it looks up or creates an MGC and by
+ * lustre_stop_mgc() while it drops the MGC reference count. */
+DEFINE_MUTEX(mgc_start_lock);
+
+/** Set up a mgc obd to process startup logs
+ *
+ * The MGC is named after the first usable MGS nid.  If an MGC with that name
+ * already exists and is not stopping, it is re-used and its reference count
+ * is bumped; otherwise a new MGC is attached, set up, given its primary and
+ * failover nids, and connected to the MGS.  On the paths that reach the
+ * "out" label the MGC is recorded in the super block info.
+ *
+ * \param sb [in] super block of the mgc obd
+ *
+ * \retval 0 success, otherwise error code
+ */
+int lustre_start_mgc(struct super_block *sb)
+{
+	struct obd_connect_data *data = NULL;
+	struct lustre_sb_info *lsi = s2lsi(sb);
+	struct obd_device *obd;
+	struct obd_export *exp;
+	struct obd_uuid *uuid;
+	class_uuid_t uuidc;
+	lnet_nid_t nid;
+	char *mgcname = NULL, *niduuid = NULL, *mgssec = NULL;
+	char *ptr;
+	int recov_bk;
+	int rc = 0, i = 0, j, len;
+	ENTRY;
+
+	LASSERT(lsi->lsi_lmd);
+
+	/* Find the first non-lo MGS nid for our MGC name */
+	if (IS_SERVER(lsi)) {
+		/* mount -o mgsnode=nid */
+		ptr = lsi->lsi_lmd->lmd_mgs;
+		if (lsi->lsi_lmd->lmd_mgs &&
+		    (class_parse_nid(lsi->lsi_lmd->lmd_mgs, &nid, &ptr) == 0)) {
+			i++;
+		} else if (IS_MGS(lsi)) {
+			lnet_process_id_t id;
+			/* NOTE(review): i is incremented by the loop condition
+			 * even when no non-loopback nid is found, so the
+			 * i == 0 check below may not catch that case and nid
+			 * may be left unset - confirm. */
+			while ((rc = LNetGetId(i++, &id)) != -ENOENT) {
+				if (LNET_NETTYP(LNET_NIDNET(id.nid)) == LOLND)
+					continue;
+				nid = id.nid;
+				i++;
+				break;
+			}
+		}
+	} else { /* client */
+		/* Use nids from mount line: uml1,1@elan:uml2,2@elan:/lustre */
+		ptr = lsi->lsi_lmd->lmd_dev;
+		if (class_parse_nid(ptr, &nid, &ptr) == 0)
+			i++;
+	}
+	if (i == 0) {
+		CERROR("No valid MGS nids found.\n");
+		RETURN(-EINVAL);
+	}
+
+	mutex_lock(&mgc_start_lock);
+
+	/* niduuid is "<mgcname>_<hex index>", hence the two extra bytes */
+	len = strlen(LUSTRE_MGC_OBDNAME) + strlen(libcfs_nid2str(nid)) + 1;
+	OBD_ALLOC(mgcname, len);
+	OBD_ALLOC(niduuid, len + 2);
+	if (!mgcname || !niduuid)
+		GOTO(out_free, rc = -ENOMEM);
+	sprintf(mgcname, "%s%s", LUSTRE_MGC_OBDNAME, libcfs_nid2str(nid));
+
+	mgssec = lsi->lsi_lmd->lmd_mgssec ? lsi->lsi_lmd->lmd_mgssec : "";
+
+	OBD_ALLOC_PTR(data);
+	if (data == NULL)
+		GOTO(out_free, rc = -ENOMEM);
+
+	obd = class_name2obd(mgcname);
+	if (obd && !obd->obd_stopping) {
+		rc = obd_set_info_async(NULL, obd->obd_self_export,
+					strlen(KEY_MGSSEC), KEY_MGSSEC,
+					strlen(mgssec), mgssec, NULL);
+		if (rc)
+			GOTO(out_free, rc);
+
+		/* Re-using an existing MGC */
+		atomic_inc(&obd->u.cli.cl_mgc_refcount);
+
+		/* IR compatibility check, only for clients */
+		if (lmd_is_client(lsi->lsi_lmd)) {
+			int has_ir;
+			int vallen = sizeof(*data);
+			__u32 *flags = &lsi->lsi_lmd->lmd_flags;
+
+			rc = obd_get_info(NULL, obd->obd_self_export,
+					  strlen(KEY_CONN_DATA), KEY_CONN_DATA,
+					  &vallen, data, NULL);
+			LASSERT(rc == 0);
+			has_ir = OCD_HAS_FLAG(data, IMP_RECOV);
+			/* mismatch between the MGC's IR state and the mount
+			 * option: the existing MGC wins */
+			if (has_ir ^ !(*flags & LMD_FLG_NOIR)) {
+				/* LMD_FLG_NOIR is for test purpose only */
+				LCONSOLE_WARN(
+				    "Trying to mount a client with IR setting "
+				    "not compatible with current mgc. "
+				    "Force to use current mgc setting that is "
+				    "IR %s.\n",
+				    has_ir ? "enabled" : "disabled");
+				if (has_ir)
+					*flags &= ~LMD_FLG_NOIR;
+				else
+					*flags |= LMD_FLG_NOIR;
+			}
+		}
+
+		recov_bk = 0;
+		/* If we are restarting the MGS, don't try to keep the MGC's
+		   old connection, or registration will fail. */
+		if (IS_MGS(lsi)) {
+			CDEBUG(D_MOUNT, "New MGS with live MGC\n");
+			recov_bk = 1;
+		}
+
+		/* Try all connections, but only once (again).
+		   We don't want to block another target from starting
+		   (using its local copy of the log), but we do want to connect
+		   if at all possible. */
+		recov_bk++;
+		CDEBUG(D_MOUNT, "%s: Set MGC reconnect %d\n", mgcname,recov_bk);
+		rc = obd_set_info_async(NULL, obd->obd_self_export,
+					sizeof(KEY_INIT_RECOV_BACKUP),
+					KEY_INIT_RECOV_BACKUP,
+					sizeof(recov_bk), &recov_bk, NULL);
+		/* a failure to set the reconnect hint is ignored */
+		GOTO(out, rc = 0);
+	}
+
+	CDEBUG(D_MOUNT, "Start MGC '%s'\n", mgcname);
+
+	/* Add the primary nids for the MGS */
+	i = 0;
+	sprintf(niduuid, "%s_%x", mgcname, i);
+	if (IS_SERVER(lsi)) {
+		ptr = lsi->lsi_lmd->lmd_mgs;
+		if (IS_MGS(lsi)) {
+			/* Use local nids (including LO) */
+			lnet_process_id_t id;
+			while ((rc = LNetGetId(i++, &id)) != -ENOENT) {
+				rc = do_lcfg(mgcname, id.nid,
+					     LCFG_ADD_UUID, niduuid, 0,0,0);
+			}
+		} else {
+			/* Use mgsnode= nids */
+			/* mount -o mgsnode=nid */
+			if (lsi->lsi_lmd->lmd_mgs) {
+				ptr = lsi->lsi_lmd->lmd_mgs;
+			} else if (class_find_param(ptr, PARAM_MGSNODE,
+						    &ptr) != 0) {
+				CERROR("No MGS nids given.\n");
+				GOTO(out_free, rc = -EINVAL);
+			}
+			while (class_parse_nid(ptr, &nid, &ptr) == 0) {
+				rc = do_lcfg(mgcname, nid,
+					     LCFG_ADD_UUID, niduuid, 0,0,0);
+				i++;
+			}
+		}
+	} else { /* client */
+		/* Use nids from mount line: uml1,1@elan:uml2,2@elan:/lustre */
+		ptr = lsi->lsi_lmd->lmd_dev;
+		while (class_parse_nid(ptr, &nid, &ptr) == 0) {
+			rc = do_lcfg(mgcname, nid,
+				     LCFG_ADD_UUID, niduuid, 0,0,0);
+			i++;
+			/* Stop at the first failover nid */
+			if (*ptr == ':')
+				break;
+		}
+	}
+	if (i == 0) {
+		CERROR("No valid MGS nids found.\n");
+		GOTO(out_free, rc = -EINVAL);
+	}
+	lsi->lsi_lmd->lmd_mgs_failnodes = 1;
+
+	/* Random uuid for MGC allows easier reconnects */
+	OBD_ALLOC_PTR(uuid);
+	/* the uuid is written to right away, so bail out if the
+	 * allocation failed */
+	if (uuid == NULL)
+		GOTO(out_free, rc = -ENOMEM);
+	ll_generate_random_uuid(uuidc);
+	class_uuid_unparse(uuidc, uuid);
+
+	/* Start the MGC */
+	rc = lustre_start_simple(mgcname, LUSTRE_MGC_NAME,
+				 (char *)uuid->uuid, LUSTRE_MGS_OBDNAME,
+				 niduuid, 0, 0);
+	OBD_FREE_PTR(uuid);
+	if (rc)
+		GOTO(out_free, rc);
+
+	/* Add any failover MGS nids */
+	i = 1;
+	while (ptr && ((*ptr == ':' ||
+	       class_find_param(ptr, PARAM_MGSNODE, &ptr) == 0))) {
+		/* New failover node */
+		sprintf(niduuid, "%s_%x", mgcname, i);
+		j = 0;
+		while (class_parse_nid_quiet(ptr, &nid, &ptr) == 0) {
+			j++;
+			rc = do_lcfg(mgcname, nid,
+				     LCFG_ADD_UUID, niduuid, 0,0,0);
+			if (*ptr == ':')
+				break;
+		}
+		if (j > 0) {
+			rc = do_lcfg(mgcname, 0, LCFG_ADD_CONN,
+				     niduuid, 0, 0, 0);
+			i++;
+		} else {
+			/* at ":/fsname" */
+			break;
+		}
+	}
+	lsi->lsi_lmd->lmd_mgs_failnodes = i;
+
+	obd = class_name2obd(mgcname);
+	if (!obd) {
+		CERROR("Can't find mgcobd %s\n", mgcname);
+		GOTO(out_free, rc = -ENOTCONN);
+	}
+
+	rc = obd_set_info_async(NULL, obd->obd_self_export,
+				strlen(KEY_MGSSEC), KEY_MGSSEC,
+				strlen(mgssec), mgssec, NULL);
+	if (rc)
+		GOTO(out_free, rc);
+
+	/* Keep a refcount of servers/clients who started with "mount",
+	   so we know when we can get rid of the mgc. */
+	atomic_set(&obd->u.cli.cl_mgc_refcount, 1);
+
+	/* Try all connections, but only once. */
+	recov_bk = 1;
+	rc = obd_set_info_async(NULL, obd->obd_self_export,
+				sizeof(KEY_INIT_RECOV_BACKUP),
+				KEY_INIT_RECOV_BACKUP,
+				sizeof(recov_bk), &recov_bk, NULL);
+	if (rc)
+		/* nonfatal */
+		CWARN("can't set %s %d\n", KEY_INIT_RECOV_BACKUP, rc);
+
+	/* We connect to the MGS at setup, and don't disconnect until cleanup */
+	data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_AT |
+				  OBD_CONNECT_FULL20 | OBD_CONNECT_IMP_RECOV |
+				  OBD_CONNECT_LVB_TYPE;
+
+#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 2, 50, 0)
+	data->ocd_connect_flags |= OBD_CONNECT_MNE_SWAB;
+#else
+#warning "LU-1644: Remove old OBD_CONNECT_MNE_SWAB fixup and imp_need_mne_swab"
+#endif
+
+	if (lmd_is_client(lsi->lsi_lmd) &&
+	    lsi->lsi_lmd->lmd_flags & LMD_FLG_NOIR)
+		data->ocd_connect_flags &= ~OBD_CONNECT_IMP_RECOV;
+	data->ocd_version = LUSTRE_VERSION_CODE;
+	rc = obd_connect(NULL, &exp, obd, &(obd->obd_uuid), data, NULL);
+	if (rc) {
+		CERROR("connect failed %d\n", rc);
+		/* the MGC is still recorded in the sb on this path */
+		GOTO(out, rc);
+	}
+
+	obd->u.cli.cl_mgc_mgsexp = exp;
+
+out:
+	/* Keep the mgc info in the sb. Note that many lsi's can point
+	   to the same mgc.*/
+	lsi->lsi_mgc = obd;
+out_free:
+	mutex_unlock(&mgc_start_lock);
+
+	if (data)
+		OBD_FREE_PTR(data);
+	if (mgcname)
+		OBD_FREE(mgcname, len);
+	if (niduuid)
+		OBD_FREE(niduuid, len + 2);
+	RETURN(rc);
+}
+
+/**
+ * Drop one mount reference on the MGC attached to @sb and, when it was the
+ * last one, disconnect and clean up the MGC and its per-failnode nid uuids.
+ *
+ * @param sb superblock whose lsi points at the MGC
+ * @return 0 on success, -ENOENT if there is no lsi or no MGC, -EBUSY if
+ *	   other users still hold references, -ENOMEM if the uuid name
+ *	   buffer could not be allocated, or the error returned by
+ *	   class_manual_cleanup() / the last do_lcfg() call.
+ */
+static int lustre_stop_mgc(struct super_block *sb)
+{
+	struct lustre_sb_info *lsi = s2lsi(sb);
+	struct obd_device *obd;
+	char *niduuid = NULL, *ptr = NULL;
+	int i, rc = 0, len = 0;
+	ENTRY;
+
+	if (!lsi)
+		RETURN(-ENOENT);
+	obd = lsi->lsi_mgc;
+	if (!obd)
+		RETURN(-ENOENT);
+	lsi->lsi_mgc = NULL;
+
+	mutex_lock(&mgc_start_lock);
+	LASSERT(atomic_read(&obd->u.cli.cl_mgc_refcount) > 0);
+	if (!atomic_dec_and_test(&obd->u.cli.cl_mgc_refcount)) {
+		/* This is not fatal, every client that stops
+		   will call in here. */
+		CDEBUG(D_MOUNT, "mgc still has %d references.\n",
+		       atomic_read(&obd->u.cli.cl_mgc_refcount));
+		GOTO(out, rc = -EBUSY);
+	}
+
+	/* The MGC has no recoverable data in any case.
+	 * force shutdown set in umount_begin */
+	obd->obd_no_recov = 1;
+
+	if (obd->u.cli.cl_mgc_mgsexp) {
+		/* An error is not fatal, if we are unable to send the
+		   disconnect mgs ping evictor cleans up the export */
+		rc = obd_disconnect(obd->u.cli.cl_mgc_mgsexp);
+		if (rc)
+			CDEBUG(D_MOUNT, "disconnect failed %d\n", rc);
+	}
+
+	/* Save the obdname for cleaning the nid uuids, which are
+	   obdname_XX.  The name must be copied before the obd is cleaned
+	   up; 6 extra bytes hold "_" + up to 4 hex digits + NUL. */
+	len = strlen(obd->obd_name) + 6;
+	OBD_ALLOC(niduuid, len);
+	if (niduuid) {
+		strcpy(niduuid, obd->obd_name);
+		ptr = niduuid + strlen(niduuid);
+	}
+
+	rc = class_manual_cleanup(obd);
+	if (rc)
+		GOTO(out, rc);
+
+	/* Clean the nid uuids */
+	if (!niduuid)
+		GOTO(out, rc = -ENOMEM);
+
+	for (i = 0; i < lsi->lsi_lmd->lmd_mgs_failnodes; i++) {
+		/* bounded: never write past the suffix area reserved above */
+		snprintf(ptr, len - (int)(ptr - niduuid), "_%x", i);
+		rc = do_lcfg(LUSTRE_MGC_OBDNAME, 0, LCFG_DEL_UUID,
+			     niduuid, 0, 0, 0);
+		if (rc)
+			CERROR("del MGC UUID %s failed: rc = %d\n",
+			       niduuid, rc);
+	}
+out:
+	if (niduuid)
+		OBD_FREE(niduuid, len);
+
+	/* class_import_put will get rid of the additional connections */
+	mutex_unlock(&mgc_start_lock);
+	RETURN(rc);
+}
+
+/***************** lustre superblock **************/
+
+/**
+ * Allocate a lustre_sb_info together with its mount data and hang it off @sb.
+ *
+ * @return the new lsi, holding one mount reference for the setup path, or
+ *	   NULL if either allocation fails.
+ */
+struct lustre_sb_info *lustre_init_lsi(struct super_block *sb)
+{
+	struct lustre_sb_info *lsi;
+	struct lustre_mount_data *lmd;
+	ENTRY;
+
+	OBD_ALLOC_PTR(lsi);
+	if (lsi == NULL)
+		RETURN(NULL);
+
+	OBD_ALLOC_PTR(lsi->lsi_lmd);
+	if (lsi->lsi_lmd == NULL) {
+		/* roll back the first allocation */
+		OBD_FREE_PTR(lsi);
+		RETURN(NULL);
+	}
+	lmd = lsi->lsi_lmd;
+
+	/* Default umount style */
+	lsi->lsi_flags = LSI_UMOUNT_FAILOVER;
+	/* we take 1 extra ref for our setup */
+	atomic_set(&lsi->lsi_mounts, 1);
+
+	lmd->lmd_recovery_time_hard = 0;
+	lmd->lmd_recovery_time_soft = 0;
+	lmd->lmd_exclude_count = 0;
+
+	s2lsi_nocast(sb) = lsi;
+
+	RETURN(lsi);
+}
+
+/**
+ * Free the lsi attached to @sb along with everything its mount data owns,
+ * then clear the superblock's lsi pointer.
+ *
+ * Asserts that an lsi exists, that its mount count has already dropped to
+ * zero and that lsi_llsbi is NULL.  Always returns 0.
+ */
+static int lustre_free_lsi(struct super_block *sb)
+{
+	struct lustre_sb_info *lsi = s2lsi(sb);
+	ENTRY;
+
+	LASSERT(lsi != NULL);
+	CDEBUG(D_MOUNT, "Freeing lsi %p\n", lsi);
+
+	/* someone didn't call server_put_mount. */
+	LASSERT(atomic_read(&lsi->lsi_mounts) == 0);
+
+	if (lsi->lsi_lmd != NULL) {
+		/* String members are freed with strlen() + 1 as their size,
+		 * so each must still be a NUL-terminated string here. */
+		if (lsi->lsi_lmd->lmd_dev != NULL)
+			OBD_FREE(lsi->lsi_lmd->lmd_dev,
+				 strlen(lsi->lsi_lmd->lmd_dev) + 1);
+		if (lsi->lsi_lmd->lmd_profile != NULL)
+			OBD_FREE(lsi->lsi_lmd->lmd_profile,
+				 strlen(lsi->lsi_lmd->lmd_profile) + 1);
+		if (lsi->lsi_lmd->lmd_mgssec != NULL)
+			OBD_FREE(lsi->lsi_lmd->lmd_mgssec,
+				 strlen(lsi->lsi_lmd->lmd_mgssec) + 1);
+		if (lsi->lsi_lmd->lmd_opts != NULL)
+			OBD_FREE(lsi->lsi_lmd->lmd_opts,
+				 strlen(lsi->lsi_lmd->lmd_opts) + 1);
+		/* the exclusion array is sized by its element count */
+		if (lsi->lsi_lmd->lmd_exclude_count)
+			OBD_FREE(lsi->lsi_lmd->lmd_exclude,
+				 sizeof(lsi->lsi_lmd->lmd_exclude[0]) *
+				 lsi->lsi_lmd->lmd_exclude_count);
+		if (lsi->lsi_lmd->lmd_mgs != NULL)
+			OBD_FREE(lsi->lsi_lmd->lmd_mgs,
+				 strlen(lsi->lsi_lmd->lmd_mgs) + 1);
+		if (lsi->lsi_lmd->lmd_osd_type != NULL)
+			OBD_FREE(lsi->lsi_lmd->lmd_osd_type,
+				 strlen(lsi->lsi_lmd->lmd_osd_type) + 1);
+		/* fixed 4096-byte buffer; must match the size used where
+		 * lmd_params is allocated (lmd_parse) */
+		if (lsi->lsi_lmd->lmd_params != NULL)
+			OBD_FREE(lsi->lsi_lmd->lmd_params, 4096);
+
+		OBD_FREE(lsi->lsi_lmd, sizeof(*lsi->lsi_lmd));
+	}
+
+	LASSERT(lsi->lsi_llsbi == NULL);
+	OBD_FREE(lsi, sizeof(*lsi));
+	/* no dangling lsi pointer may remain on the superblock */
+	s2lsi_nocast(sb) = NULL;
+
+	RETURN(0);
+}
+
+/* The lsi has one reference for every server that is using the disk -
+   e.g. MDT, MGS, and potentially MGC.
+   Drops one reference; returns 1 when that was the last one and the lsi
+   was freed, 0 otherwise. */
+int lustre_put_lsi(struct super_block *sb)
+{
+	struct lustre_sb_info *lsi = s2lsi(sb);
+	ENTRY;
+
+	LASSERT(lsi != NULL);
+
+	CDEBUG(D_MOUNT, "put %p %d\n", sb, atomic_read(&lsi->lsi_mounts));
+
+	/* other users remain: nothing more to do */
+	if (!atomic_dec_and_test(&lsi->lsi_mounts))
+		RETURN(0);
+
+	if (IS_SERVER(lsi) && lsi->lsi_osd_exp) {
+		obd_disconnect(lsi->lsi_osd_exp);
+		/* wait till OSD is gone */
+		obd_zombie_barrier();
+	}
+	lustre_free_lsi(sb);
+	RETURN(1);
+}
+
+/** Get the fsname ("lustre") from the server name ("lustre-OST003F").
+ * The fsname is everything before the last '-' (or, when there is no '-',
+ * the last ':').  A name ending in "-" LUSTRE_MDC_NAME is split at the
+ * previous separator instead.
+ * @param [in] svname server name including type and index
+ * @param [out] fsname Buffer to copy filesystem name prefix into, may be
+ *  NULL.  Must have at least 'strlen(fsname) + 1' chars.
+ * @param [out] endptr if endptr isn't NULL it is set to end of fsname
+ *  (i.e. it points at the separator inside svname)
+ * rc < 0  on error (-EINVAL when no usable separator is found)
+ */
+int server_name2fsname(const char *svname, char *fsname, const char **endptr)
+{
+	const char *dash = strrchr(svname, '-');
+
+	if (!dash) {
+		dash = strrchr(svname, ':');
+		if (!dash)
+			return -EINVAL;
+	}
+
+	/* interpret <fsname>-MDTXXXXX-mdc as mdt, the better way is to pass
+	 * in the fsname, then determine the server index */
+	if (!strcmp(LUSTRE_MDC_NAME, dash + 1)) {
+		/* Nothing precedes the separator: stepping back would move
+		 * the pointer before svname and yield a negative length. */
+		if (dash == svname)
+			return -EINVAL;
+		dash--;
+		for (; dash > svname && *dash != '-' && *dash != ':'; dash--)
+			;
+		if (dash == svname)
+			return -EINVAL;
+	}
+
+	if (fsname != NULL) {
+		/* exactly dash - svname non-NUL bytes precede the separator */
+		memcpy(fsname, svname, dash - svname);
+		fsname[dash - svname] = '\0';
+	}
+
+	if (endptr != NULL)
+		*endptr = dash;
+
+	return 0;
+}
+EXPORT_SYMBOL(server_name2fsname);
+
+/**
+ * Get service name (svname) from string
+ * @param [in] label name of the form <fsname>-<svname>
+ * @param [out] svname buffer receiving the part after the separator
+ * @param [out] endptr accepted for interface symmetry; this function does
+ *  not write through it
+ * @param [in] svsize size of the svname buffer in bytes
+ * rc < 0 on error: -EINVAL if label cannot be split at a '-',
+ * -E2BIG if the service name does not fit into svsize bytes
+ */
+int server_name2svname(const char *label, char *svname, const char **endptr,
+		       size_t svsize)
+{
+	int rc;
+	const char *dash;
+
+	/* We use server_name2fsname() just for parsing */
+	rc = server_name2fsname(label, NULL, &dash);
+	if (rc != 0)
+		return rc;
+
+	/* a ':' separator is not acceptable here */
+	if (*dash != '-')
+		return -EINVAL;
+
+	if (strlcpy(svname, dash + 1, svsize) >= svsize)
+		return -E2BIG;
+
+	return 0;
+}
+EXPORT_SYMBOL(server_name2svname);
+
+
+/* Get the index from the obd name.
+   rc = server type (LDD_F_SV_TYPE_MDT or LDD_F_SV_TYPE_OST, with
+	LDD_F_SV_ALL or'ed in for a "...all" name), or
+   rc < 0  on error
+   *idx is set to the hexadecimal index following the type; it is left
+   untouched for an "all" name.
+   if endptr isn't NULL it is set to end of name */
+int server_name2index(const char *svname, __u32 *idx, const char **endptr)
+{
+	unsigned long index;
+	int rc;
+	const char *dash;
+
+	/* We use server_name2fsname() just for parsing */
+	rc = server_name2fsname(svname, NULL, &dash);
+	if (rc != 0)
+		return rc;
+
+	if (*dash != '-')
+		return -EINVAL;
+
+	dash++;
+
+	if (strncmp(dash, "MDT", 3) == 0)
+		rc = LDD_F_SV_TYPE_MDT;
+	else if (strncmp(dash, "OST", 3) == 0)
+		rc = LDD_F_SV_TYPE_OST;
+	else
+		return -EINVAL;
+
+	dash += 3;
+
+	if (strcmp(dash, "all") == 0) {
+		/* Report the end of the name on this path too, so callers
+		 * that continue parsing from *endptr never read an
+		 * uninitialized pointer. */
+		if (endptr != NULL)
+			*endptr = dash + 3;
+		return rc | LDD_F_SV_ALL;
+	}
+
+	index = simple_strtoul(dash, (char **)endptr, 16);
+	*idx = index;
+
+	return rc;
+}
+EXPORT_SYMBOL(server_name2index);
+
+/*************** mount common between server and client ***************/
+
+/* Common umount: release this superblock's MGC reference, then its lsi
+   reference.  Returns the lustre_stop_mgc() result (0, -ENOENT, -EBUSY or
+   a fatal error, in which case the lsi is not released). */
+int lustre_common_put_super(struct super_block *sb)
+{
+	int rc;
+	ENTRY;
+
+	CDEBUG(D_MOUNT, "dropping sb %p\n", sb);
+
+	/* Drop a ref to the MGC */
+	rc = lustre_stop_mgc(sb);
+	switch (rc) {
+	case 0:
+	case -ENOENT:
+		break;
+	case -EBUSY:
+		/* BUSY just means that there's some other obd that
+		   needs the mgc.  Let him clean it up. */
+		CDEBUG(D_MOUNT, "MGC still in use\n");
+		break;
+	default:
+		CERROR("Can't stop MGC: %d\n", rc);
+		RETURN(rc);
+	}
+
+	/* Drop a ref to the mounted disk */
+	lustre_put_lsi(sb);
+	lu_types_stop();
+	RETURN(rc);
+}
+EXPORT_SYMBOL(lustre_common_put_super);
+
+/* Dump the parsed mount data through PRINT_CMD at the D_MOUNT level. */
+static void lmd_print(struct lustre_mount_data *lmd)
+{
+	int slot = 0;
+
+	PRINT_CMD(D_MOUNT, "  mount data:\n");
+	/* the profile line is only printed for client mounts */
+	if (lmd_is_client(lmd))
+		PRINT_CMD(D_MOUNT, "profile: %s\n", lmd->lmd_profile);
+	PRINT_CMD(D_MOUNT, "device:  %s\n", lmd->lmd_dev);
+	PRINT_CMD(D_MOUNT, "flags:   %x\n", lmd->lmd_flags);
+
+	if (lmd->lmd_opts != NULL)
+		PRINT_CMD(D_MOUNT, "options: %s\n", lmd->lmd_opts);
+
+	if (lmd->lmd_recovery_time_soft != 0)
+		PRINT_CMD(D_MOUNT, "recovery time soft: %d\n",
+			  lmd->lmd_recovery_time_soft);
+
+	if (lmd->lmd_recovery_time_hard != 0)
+		PRINT_CMD(D_MOUNT, "recovery time hard: %d\n",
+			  lmd->lmd_recovery_time_hard);
+
+	while (slot < lmd->lmd_exclude_count) {
+		PRINT_CMD(D_MOUNT, "exclude %d:  OST%04x\n", slot,
+			  lmd->lmd_exclude[slot]);
+		slot++;
+	}
+}
+
+/* Is this server on the exclusion list?
+   Returns 1 if @svname names an OST whose index is in the mount's exclude
+   list, 0 otherwise (including for anything that is not a plain OST). */
+int lustre_check_exclusion(struct super_block *sb, char *svname)
+{
+	struct lustre_sb_info *lsi = s2lsi(sb);
+	struct lustre_mount_data *lmd = lsi->lsi_lmd;
+	__u32 index;
+	int pos = 0;
+	ENTRY;
+
+	/* Only exclude OSTs */
+	if (server_name2index(svname, &index, NULL) != LDD_F_SV_TYPE_OST)
+		RETURN(0);
+
+	CDEBUG(D_MOUNT, "Check exclusion %s (%d) in %d of %s\n", svname,
+	       index, lmd->lmd_exclude_count, lmd->lmd_dev);
+
+	while (pos < lmd->lmd_exclude_count) {
+		if (lmd->lmd_exclude[pos] == index) {
+			CWARN("Excluding %s (on exclusion list)\n", svname);
+			RETURN(1);
+		}
+		pos++;
+	}
+	RETURN(0);
+}
+
+/* mount -v  -o exclude=lustre-OST0001:lustre-OST0002 -t lustre ...
+ *
+ * Parse an OST exclusion list and append the indices to lmd->lmd_exclude.
+ * @ptr points at the '=' of the option.  Entries that are not plain OST
+ * names are ignored.  Entries from an earlier exclude= option are kept.
+ * Returns 0 on success, -ENOMEM on allocation failure, or the negative
+ * error from server_name2index() for an unparsable name (entries parsed
+ * before the bad one are still recorded).
+ */
+static int lmd_make_exclusion(struct lustre_mount_data *lmd, const char *ptr)
+{
+	const char *s1 = ptr, *s2;
+	__u32 index, *exclude_list, *merged;
+	int rc = 0, devmax, count = 0, total;
+	ENTRY;
+
+	/* The shortest an ost name can be is 8 chars: -OST0000.
+	   We don't actually know the fsname at this time, so in fact
+	   a user could specify any fsname. */
+	devmax = strlen(ptr) / 8 + 1;
+
+	/* temp storage until we figure out how many we have */
+	OBD_ALLOC(exclude_list, sizeof(index) * devmax);
+	if (!exclude_list)
+		RETURN(-ENOMEM);
+
+	/* we enter this fn pointing at the '=' */
+	while (*s1 && *s1 != ' ' && *s1 != ',') {
+		s1++;
+		s2 = NULL;
+		rc = server_name2index(s1, &index, &s2);
+		if (rc < 0) {
+			CERROR("Can't parse server name '%s'\n", s1);
+			break;
+		}
+		/* The temp array is indexed by a local counter, so entries
+		 * from an earlier exclude= option cannot push writes past
+		 * devmax. */
+		if (rc == LDD_F_SV_TYPE_OST)
+			exclude_list[count++] = index;
+		else
+			CDEBUG(D_MOUNT, "ignoring exclude %.7s\n", s1);
+		/* the parser did not report where the name ends: stop */
+		if (s2 == NULL)
+			break;
+		s1 = s2;
+		/* now we are pointing at ':' (next exclude)
+		   or ',' (end of excludes) */
+		if (count >= devmax)
+			break;
+	}
+	if (rc >= 0) /* non-err */
+		rc = 0;
+
+	if (count > 0) {
+		total = lmd->lmd_exclude_count + count;
+		/* permanent, freed in lustre_free_lsi */
+		OBD_ALLOC(merged, sizeof(index) * total);
+		if (merged) {
+			if (lmd->lmd_exclude_count) {
+				/* carry over and release the previous list */
+				memcpy(merged, lmd->lmd_exclude,
+				       sizeof(index) * lmd->lmd_exclude_count);
+				OBD_FREE(lmd->lmd_exclude, sizeof(index) *
+					 lmd->lmd_exclude_count);
+			}
+			memcpy(merged + lmd->lmd_exclude_count, exclude_list,
+			       sizeof(index) * count);
+			lmd->lmd_exclude = merged;
+			lmd->lmd_exclude_count = total;
+		} else {
+			/* any previous list is left untouched */
+			rc = -ENOMEM;
+		}
+	}
+	OBD_FREE(exclude_list, sizeof(index) * devmax);
+	RETURN(rc);
+}
+
+/* Store the value of a "mgssec=" option in lmd->lmd_mgssec, replacing any
+ * earlier value.  The value ends at the next ',' or at end of string.
+ * Returns 0, or -ENOMEM if the copy cannot be allocated. */
+static int lmd_parse_mgssec(struct lustre_mount_data *lmd, char *ptr)
+{
+	char   *comma;
+	int     vlen;
+
+	/* drop a value left by a previous mgssec= option */
+	if (lmd->lmd_mgssec != NULL) {
+		OBD_FREE(lmd->lmd_mgssec, strlen(lmd->lmd_mgssec) + 1);
+		lmd->lmd_mgssec = NULL;
+	}
+
+	comma = strchr(ptr, ',');
+	if (comma != NULL)
+		vlen = comma - ptr;
+	else
+		vlen = strlen(ptr);
+
+	OBD_ALLOC(lmd->lmd_mgssec, vlen + 1);
+	if (lmd->lmd_mgssec == NULL)
+		return -ENOMEM;
+
+	memcpy(lmd->lmd_mgssec, ptr, vlen);
+	lmd->lmd_mgssec[vlen] = '\0';
+	return 0;
+}
+
+/* Copy an option value (up to the next ',' or end of string) into a newly
+ * allocated string stored in *handle, freeing whatever *handle held before.
+ * Returns 0, -EINVAL for a NULL argument, or -ENOMEM. */
+static int lmd_parse_string(char **handle, char *ptr)
+{
+	char   *comma;
+	int     vlen;
+
+	if (handle == NULL || ptr == NULL)
+		return -EINVAL;
+
+	/* release the previous value, if any */
+	if (*handle != NULL) {
+		OBD_FREE(*handle, strlen(*handle) + 1);
+		*handle = NULL;
+	}
+
+	comma = strchr(ptr, ',');
+	if (comma != NULL)
+		vlen = comma - ptr;
+	else
+		vlen = strlen(ptr);
+
+	OBD_ALLOC(*handle, vlen + 1);
+	if (*handle == NULL)
+		return -ENOMEM;
+
+	memcpy(*handle, ptr, vlen);
+	(*handle)[vlen] = '\0';
+
+	return 0;
+}
+
+/* Collect multiple values for mgsnid specifiers.
+ * Consumes the nid list starting at *ptr, appends it to lmd->lmd_mgs
+ * (joined to any earlier list with ':') and advances *ptr past it.
+ * Returns 0, -EINVAL if no nid could be parsed, or -ENOMEM. */
+static int lmd_parse_mgs(struct lustre_mount_data *lmd, char **ptr)
+{
+	lnet_nid_t nid;
+	char *end = *ptr;
+	char *mgsnid;
+	int   addlen;
+	int   prevlen = 0;
+
+	/* Find end of nidlist: keep parsing until a nid is rejected */
+	for (;;) {
+		if (class_parse_nid_quiet(end, &nid, &end) != 0)
+			break;
+	}
+	addlen = end - *ptr;
+	if (addlen == 0) {
+		LCONSOLE_ERROR_MSG(0x159, "Can't parse NID '%s'\n", *ptr);
+		return -EINVAL;
+	}
+
+	/* prevlen counts the old string including its terminating NUL */
+	if (lmd->lmd_mgs != NULL)
+		prevlen = strlen(lmd->lmd_mgs) + 1;
+
+	OBD_ALLOC(mgsnid, prevlen + addlen + 1);
+	if (mgsnid == NULL)
+		return -ENOMEM;
+
+	if (lmd->lmd_mgs != NULL) {
+		/* Multiple mgsnid= are taken to mean failover locations;
+		 * the old NUL is turned into the ':' separator */
+		memcpy(mgsnid, lmd->lmd_mgs, prevlen);
+		mgsnid[prevlen - 1] = ':';
+		OBD_FREE(lmd->lmd_mgs, prevlen);
+	}
+	memcpy(mgsnid + prevlen, *ptr, addlen);
+	mgsnid[prevlen + addlen] = '\0';
+
+	lmd->lmd_mgs = mgsnid;
+	*ptr = end;
+
+	return 0;
+}
+
+/** Parse mount line options
+ * e.g. mount -v -t lustre -o abort_recov uml1:uml2:/lustre-client /mnt/lustre
+ * dev is passed as device=uml1:/lustre by mount.lustre
+ *
+ * Options understood here are recorded in @lmd and removed from @options
+ * in place; whatever remains is saved in lmd->lmd_opts.  "device=" must be
+ * the last option.
+ *
+ * @param options writable, NUL-terminated option string (modified)
+ * @param lmd     mount data to fill in
+ * @return 0 on success, -EINVAL for missing/old/bad options, -ENOMEM on
+ *	   allocation failure.  Buffers already attached to @lmd are not
+ *	   released here on failure; lustre_free_lsi() frees them.
+ */
+static int lmd_parse(char *options, struct lustre_mount_data *lmd)
+{
+	char *s1, *s2, *devname = NULL;
+	struct lustre_mount_data *raw = (struct lustre_mount_data *)options;
+	int rc = 0;
+	int optlen;
+	ENTRY;
+
+	LASSERT(lmd);
+	if (!options) {
+		LCONSOLE_ERROR_MSG(0x162, "Missing mount data: check that "
+				   "/sbin/mount.lustre is installed.\n");
+		RETURN(-EINVAL);
+	}
+
+	/* Options should be a string - try to detect old lmd data */
+	if ((raw->lmd_magic & 0xffffff00) == (LMD_MAGIC & 0xffffff00)) {
+		LCONSOLE_ERROR_MSG(0x163, "You're using an old version of "
+				   "/sbin/mount.lustre.  Please install "
+				   "version %s\n", LUSTRE_VERSION_STRING);
+		RETURN(-EINVAL);
+	}
+	lmd->lmd_magic = LMD_MAGIC;
+
+	/* fixed-size buffer; lustre_free_lsi frees it with the same size */
+	OBD_ALLOC(lmd->lmd_params, 4096);
+	if (lmd->lmd_params == NULL)
+		RETURN(-ENOMEM);
+	lmd->lmd_params[0] = '\0';
+
+	/* Set default flags here */
+
+	s1 = options;
+	while (*s1) {
+		int clear = 0;
+		int time_min = OBD_RECOVERY_TIME_MIN;
+
+		/* Skip whitespace and extra commas */
+		while (*s1 == ' ' || *s1 == ',')
+			s1++;
+
+		/* Client options are parsed in ll_options: eg. flock,
+		   user_xattr, acl */
+
+		/* Parse non-ldiskfs options here. Rather than modifying
+		   ldiskfs, we just zero these out here */
+		if (strncmp(s1, "abort_recov", 11) == 0) {
+			lmd->lmd_flags |= LMD_FLG_ABORT_RECOV;
+			clear++;
+		} else if (strncmp(s1, "recovery_time_soft=", 19) == 0) {
+			lmd->lmd_recovery_time_soft = max_t(int,
+				simple_strtoul(s1 + 19, NULL, 10), time_min);
+			clear++;
+		} else if (strncmp(s1, "recovery_time_hard=", 19) == 0) {
+			lmd->lmd_recovery_time_hard = max_t(int,
+				simple_strtoul(s1 + 19, NULL, 10), time_min);
+			clear++;
+		} else if (strncmp(s1, "noir", 4) == 0) {
+			lmd->lmd_flags |= LMD_FLG_NOIR; /* test purpose only. */
+			clear++;
+		} else if (strncmp(s1, "nosvc", 5) == 0) {
+			lmd->lmd_flags |= LMD_FLG_NOSVC;
+			clear++;
+		} else if (strncmp(s1, "nomgs", 5) == 0) {
+			lmd->lmd_flags |= LMD_FLG_NOMGS;
+			clear++;
+		} else if (strncmp(s1, "noscrub", 7) == 0) {
+			lmd->lmd_flags |= LMD_FLG_NOSCRUB;
+			clear++;
+		} else if (strncmp(s1, PARAM_MGSNODE,
+				   sizeof(PARAM_MGSNODE) - 1) == 0) {
+			s2 = s1 + sizeof(PARAM_MGSNODE) - 1;
+			/* Assume the next mount opt is the first
+			   invalid nid we get to. */
+			rc = lmd_parse_mgs(lmd, &s2);
+			if (rc)
+				goto invalid;
+			clear++;
+		} else if (strncmp(s1, "writeconf", 9) == 0) {
+			lmd->lmd_flags |= LMD_FLG_WRITECONF;
+			clear++;
+		} else if (strncmp(s1, "update", 6) == 0) {
+			lmd->lmd_flags |= LMD_FLG_UPDATE;
+			clear++;
+		} else if (strncmp(s1, "virgin", 6) == 0) {
+			lmd->lmd_flags |= LMD_FLG_VIRGIN;
+			clear++;
+		} else if (strncmp(s1, "noprimnode", 10) == 0) {
+			lmd->lmd_flags |= LMD_FLG_NO_PRIMNODE;
+			clear++;
+		} else if (strncmp(s1, "mgssec=", 7) == 0) {
+			rc = lmd_parse_mgssec(lmd, s1 + 7);
+			if (rc)
+				goto invalid;
+			clear++;
+		/* ost exclusion list */
+		} else if (strncmp(s1, "exclude=", 8) == 0) {
+			/* s1 + 7: the callee expects to start at the '=' */
+			rc = lmd_make_exclusion(lmd, s1 + 7);
+			if (rc)
+				goto invalid;
+			clear++;
+		} else if (strncmp(s1, "mgs", 3) == 0) {
+			/* We are an MGS */
+			lmd->lmd_flags |= LMD_FLG_MGS;
+			clear++;
+		} else if (strncmp(s1, "svname=", 7) == 0) {
+			rc = lmd_parse_string(&lmd->lmd_profile, s1 + 7);
+			if (rc)
+				goto invalid;
+			clear++;
+		} else if (strncmp(s1, "param=", 6) == 0) {
+			int length;
+			char *tail = strchr(s1 + 6, ',');
+			if (tail == NULL)
+				length = strlen(s1);
+			else
+				length = tail - s1;
+			length -= 6;
+			/* The value, the separating blank and the NUL must
+			 * all fit into the 4096-byte lmd_params buffer. */
+			if (strlen(lmd->lmd_params) + length + 2 > 4096) {
+				CERROR("param= options exceed %d bytes\n",
+				       4096);
+				goto invalid;
+			}
+			strncat(lmd->lmd_params, s1 + 6, length);
+			strcat(lmd->lmd_params, " ");
+			clear++;
+		} else if (strncmp(s1, "osd=", 4) == 0) {
+			rc = lmd_parse_string(&lmd->lmd_osd_type, s1 + 4);
+			if (rc)
+				goto invalid;
+			clear++;
+		}
+		/* Linux 2.4 doesn't pass the device, so we stuck it at the
+		   end of the options. */
+		else if (strncmp(s1, "device=", 7) == 0) {
+			devname = s1 + 7;
+			/* terminate options right before device.  device
+			   must be the last one. */
+			*s1 = '\0';
+			break;
+		}
+
+		/* Find next opt */
+		s2 = strchr(s1, ',');
+		if (s2 == NULL) {
+			if (clear)
+				*s1 = '\0';
+			break;
+		}
+		s2++;
+		/* a consumed option is cut out by shifting the rest down */
+		if (clear)
+			memmove(s1, s2, strlen(s2) + 1);
+		else
+			s1 = s2;
+	}
+
+	if (!devname) {
+		LCONSOLE_ERROR_MSG(0x164, "Can't find the device name "
+				   "(need mount option 'device=...')\n");
+		goto invalid;
+	}
+
+	s1 = strstr(devname, ":/");
+	if (s1) {
+		++s1;
+		lmd->lmd_flags |= LMD_FLG_CLIENT;
+		/* Remove leading /s from fsname */
+		while (*++s1 == '/') ;
+		/* a profile set by svname= would otherwise be leaked */
+		if (lmd->lmd_profile != NULL) {
+			OBD_FREE(lmd->lmd_profile,
+				 strlen(lmd->lmd_profile) + 1);
+			lmd->lmd_profile = NULL;
+		}
+		/* Freed in lustre_free_lsi; +8 covers "-client" and NUL */
+		OBD_ALLOC(lmd->lmd_profile, strlen(s1) + 8);
+		if (!lmd->lmd_profile)
+			RETURN(-ENOMEM);
+		sprintf(lmd->lmd_profile, "%s-client", s1);
+	}
+
+	/* Freed in lustre_free_lsi */
+	OBD_ALLOC(lmd->lmd_dev, strlen(devname) + 1);
+	if (!lmd->lmd_dev)
+		RETURN(-ENOMEM);
+	strcpy(lmd->lmd_dev, devname);
+
+	/* Save mount options: trim trailing separators first.  Working with
+	 * a length avoids forming a pointer before the start of an empty
+	 * option string. */
+	optlen = strlen(options);
+	while (optlen > 0 &&
+	       (options[optlen - 1] == ',' || options[optlen - 1] == ' '))
+		options[--optlen] = 0;
+	if (*options != 0) {
+		/* Freed in lustre_free_lsi */
+		OBD_ALLOC(lmd->lmd_opts, strlen(options) + 1);
+		if (!lmd->lmd_opts)
+			RETURN(-ENOMEM);
+		strcpy(lmd->lmd_opts, options);
+	}
+
+	lmd_print(lmd);
+	lmd->lmd_magic = LMD_MAGIC;
+
+	RETURN(rc);
+
+invalid:
+	CERROR("Bad mount options %s\n", options);
+	RETURN(-EINVAL);
+}
+
+/* Bundle handed to lustre_fill_super() as its opaque data argument. */
+struct lustre_mount_data2 {
+	void *lmd2_data;		/* mount option data; parsed as a string by lmd_parse() */
+	struct vfsmount *lmd2_mnt;	/* passed on to the client fill_super callback */
+};
+
+/** This is the entry point for the mount call into Lustre.
+ * This is called when a server or client is mounted,
+ * and this is where we start setting things up.
+ * @param sb     superblock being set up; an lsi is allocated and attached
+ * @param data   pointer to a struct lustre_mount_data2 carrying the mount
+ *               options (e.g. -o flock,abort_recov)
+ * @param silent not used by this function
+ * @return 0 on success, negative errno on failure
+ */
+int lustre_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct lustre_mount_data *lmd;
+	struct lustre_mount_data2 *lmd2 = data;
+	struct lustre_sb_info *lsi;
+	int rc;
+	ENTRY;
+
+	CDEBUG(D_MOUNT|D_VFSTRACE, "VFS Op: sb %p\n", sb);
+
+	lsi = lustre_init_lsi(sb);
+	if (!lsi)
+		RETURN(-ENOMEM);
+	lmd = lsi->lsi_lmd;
+
+	/*
+	 * Disable lockdep during mount, because mount locking patterns are
+	 * `special'.
+	 */
+	lockdep_off();
+
+	/*
+	 * LU-639: the obd cleanup of last mount may not finish yet, wait here.
+	 */
+	obd_zombie_barrier();
+
+	/* Figure out the lmd from the mount options.  Any parse failure is
+	 * reported as -EINVAL; the specific error code is not propagated. */
+	if (lmd_parse((char *)(lmd2->lmd2_data), lmd)) {
+		/* drops the setup reference taken by lustre_init_lsi() */
+		lustre_put_lsi(sb);
+		GOTO(out, rc = -EINVAL);
+	}
+
+	if (lmd_is_client(lmd)) {
+		CDEBUG(D_MOUNT, "Mounting client %s\n", lmd->lmd_profile);
+		if (!client_fill_super) {
+			LCONSOLE_ERROR_MSG(0x165, "Nothing registered for "
+					   "client mount! Is the 'lustre' "
+					   "module loaded?\n");
+			lustre_put_lsi(sb);
+			rc = -ENODEV;
+		} else {
+			rc = lustre_start_mgc(sb);
+			if (rc) {
+				lustre_put_lsi(sb);
+				GOTO(out, rc);
+			}
+			/* Connect and start */
+			/* (should always be ll_fill_super) */
+			rc = (*client_fill_super)(sb, lmd2->lmd2_mnt);
+			/* c_f_s will call lustre_common_put_super on failure */
+		}
+	} else {
+		/* NOTE(review): the lsi is not released on this path --
+		 * confirm it is freed elsewhere */
+		CERROR("This is client-side-only module, "
+		       "cannot handle server mount.\n");
+		rc = -EINVAL;
+	}
+
+	/* If error happens in fill_super() call, @lsi will be killed there.
+	 * This is why we do not put it here. */
+	GOTO(out, rc);
+out:
+	if (rc) {
+		/* lmd may already have been freed together with the lsi, so
+		 * it is only dereferenced while the sb still has an lsi */
+		CERROR("Unable to mount %s (%d)\n",
+		       s2lsi(sb) ? lmd->lmd_dev : "", rc);
+	} else {
+		CDEBUG(D_SUPER, "Mount %s complete\n",
+		       lmd->lmd_dev);
+	}
+	lockdep_on();
+	return rc;
+}
+
+
+/* We can't call ll_fill_super by name because it lives in a module that
+   must be loaded after this one.  Instead that module registers its
+   fill_super routine here; lustre_fill_super() calls it for client mounts
+   and fails with -ENODEV while none is registered. */
+void lustre_register_client_fill_super(int (*cfs)(struct super_block *sb,
+						  struct vfsmount *mnt))
+{
+	client_fill_super = cfs;
+}
+EXPORT_SYMBOL(lustre_register_client_fill_super);
+
+/* Register the callback that lustre_kill_super() invokes for non-server
+   superblocks before the generic anonymous-super teardown. */
+void lustre_register_kill_super_cb(void (*cfs)(struct super_block *sb))
+{
+	kill_super_cb = cfs;
+}
+EXPORT_SYMBOL(lustre_register_kill_super_cb);
+
+/***************** FS registration ******************/
+/* .mount handler: wrap the raw option data in a lustre_mount_data2 and let
+   mount_nodev() set up the superblock through lustre_fill_super(). */
+struct dentry *lustre_mount(struct file_system_type *fs_type, int flags,
+				const char *devname, void *data)
+{
+	struct lustre_mount_data2 lmd2 = {
+		.lmd2_data = data,
+		.lmd2_mnt  = NULL,
+	};
+
+	return mount_nodev(fs_type, flags, &lmd2, lustre_fill_super);
+}
+
+/* .kill_sb handler: give the registered callback a chance to clean up a
+   non-server superblock, then perform the generic anonymous-super kill. */
+void lustre_kill_super(struct super_block *sb)
+{
+	struct lustre_sb_info *lsi = s2lsi(sb);
+
+	if (lsi != NULL && kill_super_cb != NULL && !IS_SERVER(lsi))
+		kill_super_cb(sb);
+
+	kill_anon_super(sb);
+}
+
+/** Register the "lustre" fs type
+ *
+ * Mounting goes through lustre_mount(), teardown through
+ * lustre_kill_super().
+ * NOTE(review): FS_REQUIRES_DEV is set although lustre_mount() uses
+ * mount_nodev() -- confirm this combination is intended.
+ */
+struct file_system_type lustre_fs_type = {
+	.owner	= THIS_MODULE,
+	.name	 = "lustre",
+	.mount	= lustre_mount,
+	.kill_sb      = lustre_kill_super,
+	.fs_flags     = FS_BINARY_MOUNTDATA | FS_REQUIRES_DEV |
+			FS_HAS_FIEMAP | FS_RENAME_DOES_D_MOVE,
+};
+
+/* Register lustre_fs_type with the VFS; returns register_filesystem()'s
+   result. */
+int lustre_register_fs(void)
+{
+	return register_filesystem(&lustre_fs_type);
+}
+
+/* Remove lustre_fs_type from the VFS; returns unregister_filesystem()'s
+   result. */
+int lustre_unregister_fs(void)
+{
+	return unregister_filesystem(&lustre_fs_type);
+}

diff --git a/drivers/staging/lustre/lustre/obdclass/obdo.c b/drivers/staging/lustre/lustre/obdclass/obdo.c
new file mode 100644
index 0000000..01a0e1f
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/obdo.c

@@ -0,0 +1,362 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/obdo.c
+ *
+ * Object Devices Class Driver
+ * These are the only exported functions, they provide some generic
+ * infrastructure for managing object devices
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <obd_class.h>
+#include <lustre/lustre_idl.h>
+
+/* Record the parent FID (seq, oid, ver) in @dst and flag both
+   OBD_MD_FLGENER and OBD_MD_FLFID as valid. */
+void obdo_set_parent_fid(struct obdo *dst, const struct lu_fid *parent)
+{
+	dst->o_parent_seq = fid_seq(parent);
+	dst->o_parent_oid = fid_oid(parent);
+	dst->o_parent_ver = fid_ver(parent);
+
+	dst->o_valid |= (OBD_MD_FLGENER | OBD_MD_FLFID);
+}
+EXPORT_SYMBOL(obdo_set_parent_fid);
+
+/* Copy the inode attributes selected by @valid from @src into @dst and OR
+   the flags of the attributes actually copied into dst->o_valid.
+   Only the OBD_MD_FL* bits tested below are honoured; other bits in @valid
+   are ignored.
+   WARNING: the file systems must take care not to tinker with
+   attributes they don't manage (such as blocks). */
+void obdo_from_inode(struct obdo *dst, struct inode *src, obd_flag valid)
+{
+	/* accumulates the flags of the fields copied below */
+	obd_flag newvalid = 0;
+
+	if (valid & (OBD_MD_FLCTIME | OBD_MD_FLMTIME))
+		CDEBUG(D_INODE, "valid %x, new time %lu/%lu\n",
+		       valid, LTIME_S(src->i_mtime),
+		       LTIME_S(src->i_ctime));
+
+	if (valid & OBD_MD_FLATIME) {
+		dst->o_atime = LTIME_S(src->i_atime);
+		newvalid |= OBD_MD_FLATIME;
+	}
+	if (valid & OBD_MD_FLMTIME) {
+		dst->o_mtime = LTIME_S(src->i_mtime);
+		newvalid |= OBD_MD_FLMTIME;
+	}
+	if (valid & OBD_MD_FLCTIME) {
+		dst->o_ctime = LTIME_S(src->i_ctime);
+		newvalid |= OBD_MD_FLCTIME;
+	}
+	if (valid & OBD_MD_FLSIZE) {
+		dst->o_size = i_size_read(src);
+		newvalid |= OBD_MD_FLSIZE;
+	}
+	if (valid & OBD_MD_FLBLOCKS) {  /* allocation of space (x512 bytes) */
+		dst->o_blocks = src->i_blocks;
+		newvalid |= OBD_MD_FLBLOCKS;
+	}
+	if (valid & OBD_MD_FLBLKSZ) {   /* optimal block size */
+		dst->o_blksize = ll_inode_blksize(src);
+		newvalid |= OBD_MD_FLBLKSZ;
+	}
+	if (valid & OBD_MD_FLTYPE) {
+		/* keep dst's S_IALLUGO bits, take the S_IFMT bits from src */
+		dst->o_mode = (dst->o_mode & S_IALLUGO) |
+			      (src->i_mode & S_IFMT);
+		newvalid |= OBD_MD_FLTYPE;
+	}
+	if (valid & OBD_MD_FLMODE) {
+		/* keep dst's S_IFMT bits, take the S_IALLUGO bits from src */
+		dst->o_mode = (dst->o_mode & S_IFMT) |
+			      (src->i_mode & S_IALLUGO);
+		newvalid |= OBD_MD_FLMODE;
+	}
+	if (valid & OBD_MD_FLUID) {
+		dst->o_uid = src->i_uid;
+		newvalid |= OBD_MD_FLUID;
+	}
+	if (valid & OBD_MD_FLGID) {
+		dst->o_gid = src->i_gid;
+		newvalid |= OBD_MD_FLGID;
+	}
+	if (valid & OBD_MD_FLFLAGS) {
+		dst->o_flags = ll_inode_flags(src);
+		newvalid |= OBD_MD_FLFLAGS;
+	}
+	dst->o_valid |= newvalid;
+}
+EXPORT_SYMBOL(obdo_from_inode);
+
+/* Copy the metadata fields selected by @valid from @src to @dst.
+   Note that all of @valid is OR'ed into dst->o_valid at the end, including
+   bits for which no field is copied here. */
+void obdo_cpy_md(struct obdo *dst, struct obdo *src, obd_flag valid)
+{
+	CDEBUG(D_INODE, "src obdo "DOSTID" valid "LPX64", dst obdo "DOSTID"\n",
+	       POSTID(&src->o_oi), src->o_valid, POSTID(&dst->o_oi));
+	if (valid & OBD_MD_FLATIME)
+		dst->o_atime = src->o_atime;
+	if (valid & OBD_MD_FLMTIME)
+		dst->o_mtime = src->o_mtime;
+	if (valid & OBD_MD_FLCTIME)
+		dst->o_ctime = src->o_ctime;
+	if (valid & OBD_MD_FLSIZE)
+		dst->o_size = src->o_size;
+	if (valid & OBD_MD_FLBLOCKS) /* allocation of space */
+		dst->o_blocks = src->o_blocks;
+	if (valid & OBD_MD_FLBLKSZ)
+		dst->o_blksize = src->o_blksize;
+	/* the S_IFMT bits and the remaining mode bits are copied separately */
+	if (valid & OBD_MD_FLTYPE)
+		dst->o_mode = (dst->o_mode & ~S_IFMT) | (src->o_mode & S_IFMT);
+	if (valid & OBD_MD_FLMODE)
+		dst->o_mode = (dst->o_mode & S_IFMT) | (src->o_mode & ~S_IFMT);
+	if (valid & OBD_MD_FLUID)
+		dst->o_uid = src->o_uid;
+	if (valid & OBD_MD_FLGID)
+		dst->o_gid = src->o_gid;
+	if (valid & OBD_MD_FLFLAGS)
+		dst->o_flags = src->o_flags;
+	/* the parent FID is split: seq/ver under FLFID, oid under FLGENER */
+	if (valid & OBD_MD_FLFID) {
+		dst->o_parent_seq = src->o_parent_seq;
+		dst->o_parent_ver = src->o_parent_ver;
+	}
+	if (valid & OBD_MD_FLGENER)
+		dst->o_parent_oid = src->o_parent_oid;
+	if (valid & OBD_MD_FLHANDLE)
+		dst->o_handle = src->o_handle;
+	if (valid & OBD_MD_FLCOOKIE)
+		dst->o_lcookie = src->o_lcookie;
+
+	dst->o_valid |= valid;
+}
+EXPORT_SYMBOL(obdo_cpy_md);
+
+/* Compare the fields of @dst and @src selected by @compare.
+   Returns 0 when every selected field matches, 1 as soon as one differs. */
+int obdo_cmp_md(struct obdo *dst, struct obdo *src, obd_flag compare)
+{
+	if ((compare & OBD_MD_FLATIME) && dst->o_atime != src->o_atime)
+		return 1;
+	if ((compare & OBD_MD_FLMTIME) && dst->o_mtime != src->o_mtime)
+		return 1;
+	if ((compare & OBD_MD_FLCTIME) && dst->o_ctime != src->o_ctime)
+		return 1;
+	if ((compare & OBD_MD_FLSIZE) && dst->o_size != src->o_size)
+		return 1;
+	/* allocation of space */
+	if ((compare & OBD_MD_FLBLOCKS) && dst->o_blocks != src->o_blocks)
+		return 1;
+	if ((compare & OBD_MD_FLBLKSZ) && dst->o_blksize != src->o_blksize)
+		return 1;
+	/* file type bits and the remaining mode bits are checked apart */
+	if ((compare & OBD_MD_FLTYPE) &&
+	    ((dst->o_mode ^ src->o_mode) & S_IFMT) != 0)
+		return 1;
+	if ((compare & OBD_MD_FLMODE) &&
+	    ((dst->o_mode ^ src->o_mode) & ~S_IFMT) != 0)
+		return 1;
+	if ((compare & OBD_MD_FLUID) && dst->o_uid != src->o_uid)
+		return 1;
+	if ((compare & OBD_MD_FLGID) && dst->o_gid != src->o_gid)
+		return 1;
+	if ((compare & OBD_MD_FLFLAGS) && dst->o_flags != src->o_flags)
+		return 1;
+	if ((compare & OBD_MD_FLNLINK) && dst->o_nlink != src->o_nlink)
+		return 1;
+	if ((compare & OBD_MD_FLFID) &&
+	    (dst->o_parent_seq != src->o_parent_seq ||
+	     dst->o_parent_ver != src->o_parent_ver))
+		return 1;
+	if ((compare & OBD_MD_FLGENER) &&
+	    dst->o_parent_oid != src->o_parent_oid)
+		return 1;
+	/* XXX Don't know if these should be included here - wasn't previously
+	if ( compare & OBD_MD_FLINLINE )
+		res = (res || memcmp(dst->o_inline, src->o_inline));
+	*/
+	return 0;
+}
+EXPORT_SYMBOL(obdo_cmp_md);
+
+/* Fill @ioobj from @oa: copy the object id and reset ioo_max_brw to 0. */
+void obdo_to_ioobj(struct obdo *oa, struct obd_ioobj *ioobj)
+{
+	ioobj->ioo_oid = oa->o_oi;
+	/* without OBD_MD_FLGROUP set in o_valid, the sequence of the copied
+	 * id is overwritten via ostid_set_seq_mdt0() */
+	if (unlikely(!(oa->o_valid & OBD_MD_FLGROUP)))
+		ostid_set_seq_mdt0(&ioobj->ioo_oid);
+
+	/* Since 2.4 this does not contain o_mode in the low 16 bits.
+	 * Instead, it holds (bd_md_max_brw - 1) for multi-bulk BRW RPCs */
+	ioobj->ioo_max_brw = 0;
+}
+EXPORT_SYMBOL(obdo_to_ioobj);
+
+/* Copy the attributes selected by @ia_valid (ATTR_* bits) from @attr into
+   @oa and set the matching OBD_MD_FL* bits in oa->o_valid. */
+void obdo_from_iattr(struct obdo *oa, struct iattr *attr, unsigned int ia_valid)
+{
+	if (ia_valid & ATTR_ATIME) {
+		oa->o_atime = LTIME_S(attr->ia_atime);
+		oa->o_valid |= OBD_MD_FLATIME;
+	}
+	if (ia_valid & ATTR_MTIME) {
+		oa->o_mtime = LTIME_S(attr->ia_mtime);
+		oa->o_valid |= OBD_MD_FLMTIME;
+	}
+	if (ia_valid & ATTR_CTIME) {
+		oa->o_ctime = LTIME_S(attr->ia_ctime);
+		oa->o_valid |= OBD_MD_FLCTIME;
+	}
+	if (ia_valid & ATTR_SIZE) {
+		oa->o_size = attr->ia_size;
+		oa->o_valid |= OBD_MD_FLSIZE;
+	}
+	if (ia_valid & ATTR_MODE) {
+		oa->o_mode = attr->ia_mode;
+		oa->o_valid |= OBD_MD_FLTYPE | OBD_MD_FLMODE;
+		/* Strip setgid unless the caller is in group oa->o_gid or
+		 * has CFS_CAP_FSETID.  Note that o_gid is read here before
+		 * the ATTR_GID branch below may update it. */
+		if (!current_is_in_group(oa->o_gid) &&
+		    !cfs_capable(CFS_CAP_FSETID))
+			oa->o_mode &= ~S_ISGID;
+	}
+	if (ia_valid & ATTR_UID) {
+		oa->o_uid = attr->ia_uid;
+		oa->o_valid |= OBD_MD_FLUID;
+	}
+	if (ia_valid & ATTR_GID) {
+		oa->o_gid = attr->ia_gid;
+		oa->o_valid |= OBD_MD_FLGID;
+	}
+}
+EXPORT_SYMBOL(obdo_from_iattr);
+
+/* Fill @attr from @oa for the fields that are both requested in @valid and
+   marked valid in oa->o_valid.  attr->ia_valid is reset and rebuilt from
+   the fields actually copied. */
+void iattr_from_obdo(struct iattr *attr, struct obdo *oa, obd_flag valid)
+{
+	/* only fields the obdo itself marks as valid are considered */
+	valid &= oa->o_valid;
+
+	if (valid & (OBD_MD_FLCTIME | OBD_MD_FLMTIME))
+		CDEBUG(D_INODE, "valid "LPX64", new time "LPU64"/"LPU64"\n",
+		       oa->o_valid, oa->o_mtime, oa->o_ctime);
+
+	attr->ia_valid = 0;
+	if (valid & OBD_MD_FLATIME) {
+		LTIME_S(attr->ia_atime) = oa->o_atime;
+		attr->ia_valid |= ATTR_ATIME;
+	}
+	if (valid & OBD_MD_FLMTIME) {
+		LTIME_S(attr->ia_mtime) = oa->o_mtime;
+		attr->ia_valid |= ATTR_MTIME;
+	}
+	if (valid & OBD_MD_FLCTIME) {
+		LTIME_S(attr->ia_ctime) = oa->o_ctime;
+		attr->ia_valid |= ATTR_CTIME;
+	}
+	if (valid & OBD_MD_FLSIZE) {
+		attr->ia_size = oa->o_size;
+		attr->ia_valid |= ATTR_SIZE;
+	}
+#if 0   /* you shouldn't be able to change a file's type with setattr */
+	if (valid & OBD_MD_FLTYPE) {
+		attr->ia_mode = (attr->ia_mode & ~S_IFMT)|(oa->o_mode & S_IFMT);
+		attr->ia_valid |= ATTR_MODE;
+	}
+#endif
+	if (valid & OBD_MD_FLMODE) {
+		/* keep the S_IFMT bits already in attr, take the rest from oa */
+		attr->ia_mode = (attr->ia_mode & S_IFMT)|(oa->o_mode & ~S_IFMT);
+		attr->ia_valid |= ATTR_MODE;
+		/* strip setgid unless the caller is in group oa->o_gid or
+		 * has CFS_CAP_FSETID */
+		if (!current_is_in_group(oa->o_gid) &&
+		    !cfs_capable(CFS_CAP_FSETID))
+			attr->ia_mode &= ~S_ISGID;
+	}
+	if (valid & OBD_MD_FLUID) {
+		attr->ia_uid = oa->o_uid;
+		attr->ia_valid |= ATTR_UID;
+	}
+	if (valid & OBD_MD_FLGID) {
+		attr->ia_gid = oa->o_gid;
+		attr->ia_valid |= ATTR_GID;
+	}
+}
+EXPORT_SYMBOL(iattr_from_obdo);
+
+/* Fill op_data's attribute block from @oa: the plain iattr fields via
+   iattr_from_obdo(), plus block count and attribute flags when requested
+   in @valid. */
+void md_from_obdo(struct md_op_data *op_data, struct obdo *oa, obd_flag valid)
+{
+	struct iattr *attr = &op_data->op_attr;
+
+	iattr_from_obdo(attr, oa, valid);
+
+	if (valid & OBD_MD_FLBLOCKS) {
+		op_data->op_attr_blocks = oa->o_blocks;
+		attr->ia_valid |= ATTR_BLOCKS;
+	}
+
+	if (valid & OBD_MD_FLFLAGS) {
+		struct ll_iattr *ll_attr = (struct ll_iattr *)attr;
+
+		ll_attr->ia_attr_flags = oa->o_flags;
+		attr->ia_valid |= ATTR_ATTR_FLAG;
+	}
+}
+EXPORT_SYMBOL(md_from_obdo);
+
+/* Fill @oa from op_data's attribute block: the plain iattr fields via
+   obdo_from_iattr(), plus block count and attribute flags when the
+   matching ATTR_* bits are set in @valid. */
+void obdo_from_md(struct obdo *oa, struct md_op_data *op_data,
+		  unsigned int valid)
+{
+	struct iattr *attr = &op_data->op_attr;
+
+	obdo_from_iattr(oa, attr, valid);
+
+	if (valid & ATTR_BLOCKS) {
+		oa->o_blocks = op_data->op_attr_blocks;
+		oa->o_valid |= OBD_MD_FLBLOCKS;
+	}
+
+	if (valid & ATTR_ATTR_FLAG) {
+		struct ll_iattr *ll_attr = (struct ll_iattr *)attr;
+
+		oa->o_flags = ll_attr->ia_attr_flags;
+		oa->o_valid |= OBD_MD_FLFLAGS;
+	}
+}
+EXPORT_SYMBOL(obdo_from_md);
+
+/* Convert the listed fields of @sobdo from CPU to little-endian byte
+   order, storing them in @dobdo.  Fields not listed are not touched. */
+void obdo_cpu_to_le(struct obdo *dobdo, struct obdo *sobdo)
+{
+	/* fields converted with the 64-bit helper */
+	dobdo->o_valid  = cpu_to_le64(sobdo->o_valid);
+	dobdo->o_size   = cpu_to_le64(sobdo->o_size);
+	dobdo->o_blocks = cpu_to_le64(sobdo->o_blocks);
+	dobdo->o_atime  = cpu_to_le64(sobdo->o_atime);
+	dobdo->o_mtime  = cpu_to_le64(sobdo->o_mtime);
+	dobdo->o_ctime  = cpu_to_le64(sobdo->o_ctime);
+
+	/* fields converted with the 32-bit helper */
+	dobdo->o_blksize = cpu_to_le32(sobdo->o_blksize);
+	dobdo->o_mode    = cpu_to_le32(sobdo->o_mode);
+	dobdo->o_uid     = cpu_to_le32(sobdo->o_uid);
+	dobdo->o_gid     = cpu_to_le32(sobdo->o_gid);
+	dobdo->o_flags   = cpu_to_le32(sobdo->o_flags);
+	dobdo->o_nlink   = cpu_to_le32(sobdo->o_nlink);
+}
+EXPORT_SYMBOL(obdo_cpu_to_le);
+
+/* Convert the listed fields of @sobdo from little-endian to CPU byte
+   order, storing them in @dobdo.  Fields not listed are not touched. */
+void obdo_le_to_cpu(struct obdo *dobdo, struct obdo *sobdo)
+{
+	/* fields converted with the 64-bit helper */
+	dobdo->o_valid  = le64_to_cpu(sobdo->o_valid);
+	dobdo->o_size   = le64_to_cpu(sobdo->o_size);
+	dobdo->o_blocks = le64_to_cpu(sobdo->o_blocks);
+	dobdo->o_atime  = le64_to_cpu(sobdo->o_atime);
+	dobdo->o_mtime  = le64_to_cpu(sobdo->o_mtime);
+	dobdo->o_ctime  = le64_to_cpu(sobdo->o_ctime);
+
+	/* fields converted with the 32-bit helper */
+	dobdo->o_blksize = le32_to_cpu(sobdo->o_blksize);
+	dobdo->o_mode    = le32_to_cpu(sobdo->o_mode);
+	dobdo->o_uid     = le32_to_cpu(sobdo->o_uid);
+	dobdo->o_gid     = le32_to_cpu(sobdo->o_gid);
+	dobdo->o_flags   = le32_to_cpu(sobdo->o_flags);
+	dobdo->o_nlink   = le32_to_cpu(sobdo->o_nlink);
+}
+EXPORT_SYMBOL(obdo_le_to_cpu);

diff --git a/drivers/staging/lustre/lustre/obdclass/statfs_pack.c b/drivers/staging/lustre/lustre/obdclass/statfs_pack.c
new file mode 100644
index 0000000..c3b7a78
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/statfs_pack.c

@@ -0,0 +1,75 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/statfs_pack.c
+ *
+ * (Un)packing of OST/MDS requests
+ *
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+
+#include <lustre_export.h>
+#include <lustre_net.h>
+#include <obd_support.h>
+#include <obd_class.h>
+
+/*
+ * Translate a kernel kstatfs (@sfs) into an obd_statfs (@osfs).
+ * @osfs is zeroed first, so any field not copied below ends up as 0.
+ */
+void statfs_pack(struct obd_statfs *osfs, struct kstatfs *sfs)
+{
+	memset(osfs, 0, sizeof(*osfs));
+
+	/* filesystem identity and geometry */
+	osfs->os_type = sfs->f_type;
+	osfs->os_bsize = sfs->f_bsize;
+	osfs->os_namelen = sfs->f_namelen;
+
+	/* block counters */
+	osfs->os_blocks = sfs->f_blocks;
+	osfs->os_bfree = sfs->f_bfree;
+	osfs->os_bavail = sfs->f_bavail;
+
+	/* file counters */
+	osfs->os_files = sfs->f_files;
+	osfs->os_ffree = sfs->f_ffree;
+}
+EXPORT_SYMBOL(statfs_pack);
+
+/*
+ * Translate an obd_statfs (@osfs) back into a kernel kstatfs (@sfs).
+ * @sfs is zeroed first, so any field not copied below ends up as 0.
+ */
+void statfs_unpack(struct kstatfs *sfs, struct obd_statfs *osfs)
+{
+	memset(sfs, 0, sizeof(*sfs));
+
+	/* filesystem identity and geometry */
+	sfs->f_type = osfs->os_type;
+	sfs->f_bsize = osfs->os_bsize;
+	sfs->f_namelen = osfs->os_namelen;
+
+	/* block counters */
+	sfs->f_blocks = osfs->os_blocks;
+	sfs->f_bfree = osfs->os_bfree;
+	sfs->f_bavail = osfs->os_bavail;
+
+	/* file counters */
+	sfs->f_files = osfs->os_files;
+	sfs->f_ffree = osfs->os_ffree;
+}
+EXPORT_SYMBOL(statfs_unpack);

diff --git a/drivers/staging/lustre/lustre/obdclass/uuid.c b/drivers/staging/lustre/lustre/obdclass/uuid.c
new file mode 100644
index 0000000..af5f27f
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/uuid.c

@@ -0,0 +1,82 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/uuid.c
+ *
+ * Public include file for the UUID library
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+# include <linux/libcfs/libcfs.h>
+
+#include <obd_support.h>
+#include <obd_class.h>
+
+
+/*
+ * Read @nob bytes from *@ptr, most significant byte first, and return
+ * them as one value.  *@ptr is advanced past the bytes that were read.
+ * @nob must not exceed sizeof(__u32).
+ */
+static inline __u32 consume(int nob, __u8 **ptr)
+{
+	__u32 value = 0;
+
+	LASSERT(nob <= sizeof value);
+
+	while (nob > 0) {
+		value = (value << 8) | **ptr;
+		(*ptr)++;
+		nob--;
+	}
+	return value;
+}
+
+/* Fill @val with sizeof(val) bytes taken from *@ptr via consume(). */
+#define CONSUME(val, ptr) (val) = consume(sizeof(val), (ptr))
+
+/*
+ * Split the raw uuid bytes @in into @nr 16-bit words stored in @uu.
+ * The words are filled from index @nr - 1 down to 0 while the input is
+ * read front to back.
+ */
+static void uuid_unpack(class_uuid_t in, __u16 *uu, int nr)
+{
+	__u8 *cursor = in;
+	int idx;
+
+	LASSERT(nr * sizeof *uu == sizeof(class_uuid_t));
+
+	for (idx = nr - 1; idx >= 0; idx--)
+		CONSUME(uu[idx], &cursor);
+}
+
+/*
+ * Format the binary uuid @uu as hexadecimal text in out->uuid, using
+ * the xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx layout.
+ */
+void class_uuid_unparse(class_uuid_t uu, struct obd_uuid *out)
+{
+	/* the uuid viewed as eight 16-bit words */
+	__u16 words[sizeof(class_uuid_t) / sizeof(__u16)];
+
+	CLASSERT(ARRAY_SIZE(words) == 8);
+
+	uuid_unpack(uu, words, ARRAY_SIZE(words));
+
+	sprintf(out->uuid, "%04x%04x-%04x-%04x-%04x-%04x%04x%04x",
+		words[0], words[1],
+		words[2],
+		words[3],
+		words[4],
+		words[5], words[6], words[7]);
+}
+EXPORT_SYMBOL(class_uuid_unparse);

diff --git a/drivers/staging/lustre/lustre/obdecho/Makefile b/drivers/staging/lustre/lustre/obdecho/Makefile
new file mode 100644
index 0000000..4c48e24
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdecho/Makefile

@@ -0,0 +1,5 @@
+# Build obdecho.o from echo_client.o and lproc_echo.o when CONFIG_LUSTRE_FS
+# is set; ../include (relative to this directory) is added to the header path.
+obj-$(CONFIG_LUSTRE_FS) += obdecho.o
+obdecho-y := echo_client.o lproc_echo.o
+
+
+ccflags-y := -I$(src)/../include

diff --git a/drivers/staging/lustre/lustre/obdecho/echo.c b/drivers/staging/lustre/lustre/obdecho/echo.c
new file mode 100644
index 0000000..9e64939
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdecho/echo.c

@@ -0,0 +1,679 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2010, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdecho/echo.c
+ *
+ * Author: Peter Braam <braam@clusterfs.com>
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_ECHO
+
+#include <obd_support.h>
+#include <obd_class.h>
+#include <lustre_debug.h>
+#include <lustre_dlm.h>
+#include <lprocfs_status.h>
+
+#include "echo_internal.h"
+
+/* The echo objid needs to be below 2^32, because regular FID numbers are
+ * limited to 2^32 objects in f_oid for the FID_SEQ_ECHO range. b=23335 */
+#define ECHO_INIT_OID	0x10000000ULL
+#define ECHO_HANDLE_MAGIC    0xabcd0123fedc9876ULL
+
+#define ECHO_PERSISTENT_PAGES (ECHO_PERSISTENT_SIZE >> PAGE_CACHE_SHIFT)
+static struct page *echo_persistent_pages[ECHO_PERSISTENT_PAGES];
+
+/* Indices of the per-device statistics counters kept in obd_stats. */
+enum {
+	LPROC_ECHO_READ_BYTES	= 1,
+	LPROC_ECHO_WRITE_BYTES	= 2,
+	/* one past the last counter; passed to lprocfs_alloc_obd_stats() */
+	LPROC_ECHO_LAST		= LPROC_ECHO_WRITE_BYTES + 1
+};
+
+/*
+ * Connect a client to the echo device.
+ *
+ * The requested connect flags in @data are masked down to
+ * ECHO_CONNECT_SUPPORTED, then a connection is created with
+ * class_connect() and the resulting export is returned through @exp.
+ *
+ * Returns 0 on success or the class_connect() error code.
+ */
+static int echo_connect(const struct lu_env *env,
+			struct obd_export **exp, struct obd_device *obd,
+			struct obd_uuid *cluuid, struct obd_connect_data *data,
+			void *localdata)
+{
+	struct lustre_handle conn = { 0 };
+	int rc;
+
+	data->ocd_connect_flags &= ECHO_CONNECT_SUPPORTED;
+
+	rc = class_connect(&conn, obd, cluuid);
+	if (rc == 0) {
+		*exp = class_conn2export(&conn);
+		return 0;
+	}
+
+	CERROR("can't connect %d\n", rc);
+	return rc;
+}
+
+/* Disconnect @exp (must be non-NULL) via server_disconnect_export(). */
+static int echo_disconnect(struct obd_export *exp)
+{
+	LASSERT (exp != NULL);
+
+	return server_disconnect_export(exp);
+}
+
+/* Export initialisation hook: delegates to ldlm_init_export(). */
+static int echo_init_export(struct obd_export *exp)
+{
+	return ldlm_init_export(exp);
+}
+
+/*
+ * Export teardown hook: runs target_destroy_export() followed by
+ * ldlm_destroy_export() on @exp.  Always returns 0.
+ */
+static int echo_destroy_export(struct obd_export *exp)
+{
+	ENTRY;
+
+	target_destroy_export(exp);
+	ldlm_destroy_export(exp);
+
+	RETURN(0);
+}
+
+/*
+ * Hand out the next echo object id: bump eo_lastino under eo_lock and
+ * return the new value.
+ */
+static __u64 echo_next_id(struct obd_device *obddev)
+{
+	obd_id next;
+
+	spin_lock(&obddev->u.echo.eo_lock);
+	obddev->u.echo.eo_lastino++;
+	next = obddev->u.echo.eo_lastino;
+	spin_unlock(&obddev->u.echo.eo_lock);
+
+	return next;
+}
+
+/*
+ * Create an echo object: validate @oa and assign it a fresh object id.
+ *
+ * @oa must carry a file type in o_mode (some S_IFMT bit set) and have
+ * OBD_MD_FLTYPE set in o_valid.  On success the object id is set to the
+ * echo sequence plus the next id from echo_next_id(), and o_valid is
+ * replaced by OBD_MD_FLID.
+ *
+ * Returns 0 on success, -EINVAL if the export has no device or
+ * OBD_MD_FLTYPE is missing, -ENOENT if o_mode has no file type.
+ */
+static int echo_create(const struct lu_env *env, struct obd_export *exp,
+		       struct obdo *oa, struct lov_stripe_md **ea,
+		       struct obd_trans_info *oti)
+{
+	struct obd_device *obd = class_exp2obd(exp);
+
+	if (!obd) {
+		CERROR("invalid client cookie "LPX64"\n",
+		       exp->exp_handle.h_cookie);
+		return -EINVAL;
+	}
+
+	/* S_IFMT is a bit mask: it must be tested with a bitwise AND.  A
+	 * logical AND would accept any non-zero mode, typed or not. */
+	if (!(oa->o_mode & S_IFMT)) {
+		CERROR("echo obd: no type!\n");
+		return -ENOENT;
+	}
+
+	if (!(oa->o_valid & OBD_MD_FLTYPE)) {
+		CERROR("invalid o_valid "LPX64"\n", oa->o_valid);
+		return -EINVAL;
+	}
+
+	ostid_set_seq_echo(&oa->o_oi);
+	ostid_set_id(&oa->o_oi, echo_next_id(obd));
+	oa->o_valid = OBD_MD_FLID;
+
+	return 0;
+}
+
+/*
+ * Destroy an echo object.  Nothing is actually released; the request
+ * is only validated.
+ *
+ * Returns 0 if @oa names an id in [ECHO_INIT_OID, eo_lastino] and has
+ * OBD_MD_FLID set, -EINVAL otherwise (including a missing device).
+ */
+static int echo_destroy(const struct lu_env *env, struct obd_export *exp,
+			struct obdo *oa, struct lov_stripe_md *ea,
+			struct obd_trans_info *oti, struct obd_export *md_exp,
+			void *capa)
+{
+	struct obd_device *obd = class_exp2obd(exp);
+
+	ENTRY;
+	if (obd == NULL) {
+		CERROR("invalid client cookie "LPX64"\n",
+		       exp->exp_handle.h_cookie);
+		RETURN(-EINVAL);
+	}
+
+	if ((oa->o_valid & OBD_MD_FLID) == 0) {
+		CERROR("obdo missing FLID valid flag: "LPX64"\n", oa->o_valid);
+		RETURN(-EINVAL);
+	}
+
+	/* the id must lie within the range handed out so far */
+	if (ostid_id(&oa->o_oi) > obd->u.echo.eo_lastino ||
+	    ostid_id(&oa->o_oi) < ECHO_INIT_OID) {
+		CERROR("bad destroy objid: "DOSTID"\n", POSTID(&oa->o_oi));
+		RETURN(-EINVAL);
+	}
+
+	RETURN(0);
+}
+
+/*
+ * Return the attributes last stored on the echo device.
+ *
+ * The fields selected by the caller's o_valid are copied from the
+ * device-wide eo_oa into oinfo->oi_oa; the object id the caller asked
+ * about is then restored under the echo sequence.
+ *
+ * Returns 0 on success, -EINVAL if the export has no device or
+ * OBD_MD_FLID is not set.
+ */
+static int echo_getattr(const struct lu_env *env, struct obd_export *exp,
+			struct obd_info *oinfo)
+{
+	struct obd_device *obd = class_exp2obd(exp);
+	struct obdo *oa = oinfo->oi_oa;
+	obd_id id = ostid_id(&oa->o_oi);
+
+	ENTRY;
+	if (obd == NULL) {
+		CERROR("invalid client cookie "LPX64"\n",
+		       exp->exp_handle.h_cookie);
+		RETURN(-EINVAL);
+	}
+
+	if ((oa->o_valid & OBD_MD_FLID) == 0) {
+		CERROR("obdo missing FLID valid flag: "LPX64"\n",
+		       oa->o_valid);
+		RETURN(-EINVAL);
+	}
+
+	obdo_cpy_md(oa, &obd->u.echo.eo_oa, oa->o_valid);
+	ostid_set_seq_echo(&oa->o_oi);
+	ostid_set_id(&oa->o_oi, id);
+
+	RETURN(0);
+}
+
+/*
+ * Store the caller's attributes on the echo device.
+ *
+ * The whole obdo is copied into the device-wide eo_oa.  When bit 2 of
+ * the object id is set, a reference is taken on the device's NL lock
+ * and the lock is recorded in @oti so the reply gets ACKed.
+ *
+ * Returns 0 on success, -EINVAL if the export has no device or
+ * OBD_MD_FLID is not set.
+ */
+static int echo_setattr(const struct lu_env *env, struct obd_export *exp,
+			struct obd_info *oinfo, struct obd_trans_info *oti)
+{
+	struct obd_device *obd = class_exp2obd(exp);
+
+	ENTRY;
+	if (obd == NULL) {
+		CERROR("invalid client cookie "LPX64"\n",
+		       exp->exp_handle.h_cookie);
+		RETURN(-EINVAL);
+	}
+
+	if ((oinfo->oi_oa->o_valid & OBD_MD_FLID) == 0) {
+		CERROR("obdo missing FLID valid flag: "LPX64"\n",
+		       oinfo->oi_oa->o_valid);
+		RETURN(-EINVAL);
+	}
+
+	memcpy(&obd->u.echo.eo_oa, oinfo->oi_oa, sizeof(*oinfo->oi_oa));
+
+	if ((ostid_id(&oinfo->oi_oa->o_oi) & 4) != 0) {
+		/* Save lock to force ACKed reply */
+		ldlm_lock_addref(&obd->u.echo.eo_nl_lock, LCK_NL);
+		oti->oti_ack_locks[0].mode = LCK_NL;
+		oti->oti_ack_locks[0].lock = obd->u.echo.eo_nl_lock;
+	}
+
+	RETURN(0);
+}
+
+/*
+ * Stamp debug patterns into @page, one OBD_ECHO_BLOCK_SIZE block at a
+ * time, starting at the in-page position of @offset and covering @len
+ * bytes.
+ *
+ * For reads each block is stamped with its offset and @id; otherwise a
+ * fixed 0xecc0... filler is written.  A @len that is not a multiple of
+ * the block size is reported but still processed.
+ */
+static void
+echo_page_debug_setup(struct page *page, int rw, obd_id id,
+		      __u64 offset, int len)
+{
+	int   page_offset = offset & ~CFS_PAGE_MASK;
+	char *addr	= ((char *)kmap(page)) + page_offset;
+
+	if (len % OBD_ECHO_BLOCK_SIZE != 0)
+		CERROR("Unexpected block size %d\n", len);
+
+	for (; len > 0; len -= OBD_ECHO_BLOCK_SIZE) {
+		if (rw & OBD_BRW_READ)
+			block_debug_setup(addr, OBD_ECHO_BLOCK_SIZE,
+					  offset, id);
+		else
+			block_debug_setup(addr, OBD_ECHO_BLOCK_SIZE,
+					  0xecc0ecc0ecc0ecc0ULL,
+					  0xecc0ecc0ecc0ecc0ULL);
+
+		addr   += OBD_ECHO_BLOCK_SIZE;
+		offset += OBD_ECHO_BLOCK_SIZE;
+	}
+
+	kunmap(page);
+}
+
+/*
+ * Verify the debug patterns in @page, one OBD_ECHO_BLOCK_SIZE block at
+ * a time, starting at the in-page position of @offset and covering
+ * @len bytes.
+ *
+ * Every block is checked even after a failure.  Returns 0 if all
+ * blocks pass, otherwise the first non-zero block_debug_check() result.
+ */
+static int
+echo_page_debug_check(struct page *page, obd_id id,
+		      __u64 offset, int len)
+{
+	int   page_offset = offset & ~CFS_PAGE_MASK;
+	char *addr	= ((char *)kmap(page)) + page_offset;
+	int   first_err   = 0;
+	int   blk_rc;
+
+	if (len % OBD_ECHO_BLOCK_SIZE != 0)
+		CERROR("Unexpected block size %d\n", len);
+
+	for (; len > 0; len -= OBD_ECHO_BLOCK_SIZE) {
+		blk_rc = block_debug_check("echo", addr, OBD_ECHO_BLOCK_SIZE,
+					   offset, id);
+
+		/* remember only the first failure */
+		if (first_err == 0 && blk_rc != 0)
+			first_err = blk_rc;
+
+		addr   += OBD_ECHO_BLOCK_SIZE;
+		offset += OBD_ECHO_BLOCK_SIZE;
+	}
+
+	kunmap(page);
+
+	return first_err;
+}
+
+/* This allows us to verify that desc_private is passed unmolested */
+#define DESC_PRIV 0x10293847
+
+/*
+ * Expand one remote niobuf @nb into page-sized local niobufs starting
+ * at @lb.
+ *
+ * For every page touched by [nb->offset, nb->offset + nb->len) one local
+ * buffer is filled in and given a page: a page from
+ * echo_persistent_pages[] (with an extra reference) when the object is
+ * the persistent object and the offset is within the persistent range,
+ * otherwise a freshly allocated page.  *@pages is incremented and *@left
+ * decremented once per local buffer produced.
+ *
+ * Returns 0 on success, -EINVAL when *@left runs out, -ENOMEM when a
+ * page cannot be allocated.  On failure the buffers already filled in
+ * are left for the caller to clean up.
+ */
+static int echo_map_nb_to_lb(struct obdo *oa, struct obd_ioobj *obj,
+			     struct niobuf_remote *nb, int *pages,
+			     struct niobuf_local *lb, int cmd, int *left)
+{
+	/* odd object ids allocate with GFP_HIGHUSER, even ones with GFP_IOFS */
+	int gfp_mask = (ostid_id(&obj->ioo_oid) & 1) ?
+			GFP_HIGHUSER : GFP_IOFS;
+	int ispersistent = ostid_id(&obj->ioo_oid) == ECHO_PERSISTENT_OBJID;
+	/* debug patterns are only written for non-persistent objects whose
+	 * obdo explicitly requests OBD_FL_DEBUG_CHECK */
+	int debug_setup = (!ispersistent &&
+			   (oa->o_valid & OBD_MD_FLFLAGS) != 0 &&
+			   (oa->o_flags & OBD_FL_DEBUG_CHECK) != 0);
+	struct niobuf_local *res = lb;
+	obd_off offset = nb->offset;
+	int len = nb->len;
+
+	while (len > 0) {
+		/* bytes from offset to the end of its page, capped at len */
+		int plen = PAGE_CACHE_SIZE - (offset & (PAGE_CACHE_SIZE-1));
+		if (len < plen)
+			plen = len;
+
+		/* check for local buf overflow */
+		if (*left == 0)
+			return -EINVAL;
+
+		res->lnb_file_offset = offset;
+		res->len = plen;
+		LASSERT((res->lnb_file_offset & ~CFS_PAGE_MASK) + res->len <=
+			PAGE_CACHE_SIZE);
+
+		if (ispersistent &&
+		    ((res->lnb_file_offset >> PAGE_CACHE_SHIFT) <
+		      ECHO_PERSISTENT_PAGES)) {
+			res->page =
+				echo_persistent_pages[res->lnb_file_offset >>
+						      PAGE_CACHE_SHIFT];
+			/* Take extra ref so __free_pages() can be called OK */
+			get_page (res->page);
+		} else {
+			OBD_PAGE_ALLOC(res->page, gfp_mask);
+			if (res->page == NULL) {
+				CERROR("can't get page for id " DOSTID"\n",
+				       POSTID(&obj->ioo_oid));
+				return -ENOMEM;
+			}
+		}
+
+		CDEBUG(D_PAGE, "$$$$ get page %p @ "LPU64" for %d\n",
+		       res->page, res->lnb_file_offset, res->len);
+
+		/* for reads, report the whole buffer as transferred */
+		if (cmd & OBD_BRW_READ)
+			res->rc = res->len;
+
+		if (debug_setup)
+			echo_page_debug_setup(res->page, cmd,
+					      ostid_id(&obj->ioo_oid),
+					      res->lnb_file_offset, res->len);
+
+		offset += plen;
+		len -= plen;
+		res++;
+
+		(*left)--;
+		(*pages)++;
+	}
+
+	return 0;
+}
+
+/*
+ * Release the local niobufs that back one remote niobuf @rb.
+ *
+ * The number of pages is derived from the page span of
+ * [rb->offset, rb->offset + rb->len).  Each page is optionally verified
+ * with echo_page_debug_check() (when @verify is set) and then freed;
+ * *@pgs is advanced once per page processed.
+ *
+ * Returns 0 on success, the first verification error if any page fails
+ * the check, or -EFAULT as soon as a NULL page is found (the remaining
+ * pages are then left untouched).
+ */
+static int echo_finalize_lb(struct obdo *oa, struct obd_ioobj *obj,
+			    struct niobuf_remote *rb, int *pgs,
+			    struct niobuf_local *lb, int verify)
+{
+	struct niobuf_local *res = lb;
+	/* first page index and one-past-last page index covered by @rb */
+	obd_off start  = rb->offset >> PAGE_CACHE_SHIFT;
+	obd_off end    = (rb->offset + rb->len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	int     count  = (int)(end - start);
+	int     rc     = 0;
+	int     i;
+
+	for (i = 0; i < count; i++, (*pgs) ++, res++) {
+		struct page *page = res->page;
+		void       *addr;
+
+		if (page == NULL) {
+			CERROR("null page objid "LPU64":%p, buf %d/%d\n",
+			       ostid_id(&obj->ioo_oid), page, i,
+			       obj->ioo_bufcnt);
+			return -EFAULT;
+		}
+
+		/* the mapping is only used for the debug message below */
+		addr = kmap(page);
+
+		CDEBUG(D_PAGE, "$$$$ use page %p, addr %p@"LPU64"\n",
+		       res->page, addr, res->lnb_file_offset);
+
+		if (verify) {
+			int vrc = echo_page_debug_check(page,
+							ostid_id(&obj->ioo_oid),
+							res->lnb_file_offset,
+							res->len);
+			/* check all the pages always */
+			if (vrc != 0 && rc == 0)
+				rc = vrc;
+		}
+
+		kunmap(page);
+		/* NB see comment above regarding persistent pages */
+		OBD_PAGE_FREE(page);
+	}
+
+	return rc;
+}
+
+/*
+ * Prepare a bulk read or write: turn the remote niobufs @nb of the
+ * @objcount objects in @obj into local niobufs in @res, each backed by
+ * a page.
+ *
+ * On entry *@pages is the capacity of @res; on success it is the number
+ * of local buffers filled in.  The filled-in pages are added to the
+ * device's eo_prep counter and the transferred bytes to the read or
+ * write statistics counter.
+ *
+ * Returns 0 on success, -EINVAL if the export has no device, or the
+ * error from echo_map_nb_to_lb(), in which case every page set up so
+ * far is released again.
+ */
+static int echo_preprw(const struct lu_env *env, int cmd,
+		       struct obd_export *export, struct obdo *oa,
+		       int objcount, struct obd_ioobj *obj,
+		       struct niobuf_remote *nb, int *pages,
+		       struct niobuf_local *res, struct obd_trans_info *oti,
+		       struct lustre_capa *unused)
+{
+	struct obd_device *obd;
+	int tot_bytes = 0;
+	int rc = 0;
+	int i, left;
+	ENTRY;
+
+	obd = export->exp_obd;
+	if (obd == NULL)
+		RETURN(-EINVAL);
+
+	/* Temp fix to stop falling foul of osc_announce_cached() */
+	oa->o_valid &= ~(OBD_MD_FLBLOCKS | OBD_MD_FLGRANT);
+
+	memset(res, 0, sizeof(*res) * *pages);
+
+	CDEBUG(D_PAGE, "%s %d obdos with %d IOs\n",
+	       cmd == OBD_BRW_READ ? "reading" : "writing", objcount, *pages);
+
+	/* marker checked again in echo_commitrw() */
+	if (oti)
+		oti->oti_handle = (void *)DESC_PRIV;
+
+	/* *pages switches meaning here: capacity in, count out */
+	left = *pages;
+	*pages = 0;
+
+	for (i = 0; i < objcount; i++, obj++) {
+		int j;
+
+		for (j = 0 ; j < obj->ioo_bufcnt ; j++, nb++) {
+
+			rc = echo_map_nb_to_lb(oa, obj, nb, pages,
+					       res + *pages, cmd, &left);
+			if (rc)
+				GOTO(preprw_cleanup, rc);
+
+			tot_bytes += nb->len;
+		}
+	}
+
+	atomic_add(*pages, &obd->u.echo.eo_prep);
+
+	if (cmd & OBD_BRW_READ)
+		lprocfs_counter_add(obd->obd_stats, LPROC_ECHO_READ_BYTES,
+				    tot_bytes);
+	else
+		lprocfs_counter_add(obd->obd_stats, LPROC_ECHO_WRITE_BYTES,
+				    tot_bytes);
+
+	CDEBUG(D_PAGE, "%d pages allocated after prep\n",
+	       atomic_read(&obd->u.echo.eo_prep));
+
+	RETURN(0);
+
+preprw_cleanup:
+	/* It is possible that we would rather handle errors by allowing
+	 * any already-set-up pages to complete, rather than tearing them
+	 * all down again.  I believe that this is what the in-kernel
+	 * prep/commit operations do.
+	 */
+	CERROR("cleaning up %u pages (%d obdos)\n", *pages, objcount);
+	for (i = 0; i < *pages; i++) {
+		/* The pages are not kmapped at this point (the debug setup
+		 * helper maps and unmaps internally), and they have not been
+		 * added to eo_prep yet, so neither kunmap() nor atomic_dec()
+		 * belongs here.
+		 *
+		 * NB if this is a persistent page, __free_pages will just
+		 * lose the extra ref taken in echo_map_nb_to_lb() */
+		OBD_PAGE_FREE(res[i].page);
+		res[i].page = NULL;
+	}
+
+	return rc;
+}
+
+/*
+ * Complete a bulk read or write started by echo_preprw(): optionally
+ * verify and then release every page referenced by @res.
+ *
+ * @rc is the status of the transfer so far; a non-zero value skips
+ * verification and goes straight to releasing the pages.  The eo_prep
+ * counter is reduced by the number of pages released.
+ *
+ * Returns @rc, the first verification error, -EFAULT if a NULL page is
+ * met while finalizing, or -EINVAL for a missing device or a NULL @res
+ * with a non-zero @niocount.
+ */
+static int echo_commitrw(const struct lu_env *env, int cmd,
+			 struct obd_export *export, struct obdo *oa,
+			 int objcount, struct obd_ioobj *obj,
+			 struct niobuf_remote *rb, int niocount,
+			 struct niobuf_local *res, struct obd_trans_info *oti,
+			 int rc)
+{
+	struct obd_device *obd;
+	int pgs = 0;
+	int i;
+	ENTRY;
+
+	obd = export->exp_obd;
+	if (obd == NULL)
+		RETURN(-EINVAL);
+
+	/* NOTE(review): this jump happens before the res == NULL check
+	 * below, and the cleanup loop dereferences res - confirm callers
+	 * never pass rc != 0 together with a NULL res. */
+	if (rc)
+		GOTO(commitrw_cleanup, rc);
+
+	if ((cmd & OBD_BRW_RWMASK) == OBD_BRW_READ) {
+		CDEBUG(D_PAGE, "reading %d obdos with %d IOs\n",
+		       objcount, niocount);
+	} else {
+		CDEBUG(D_PAGE, "writing %d obdos with %d IOs\n",
+		       objcount, niocount);
+	}
+
+	if (niocount && res == NULL) {
+		CERROR("NULL res niobuf with niocount %d\n", niocount);
+		RETURN(-EINVAL);
+	}
+
+	/* the marker stored by echo_preprw() must have survived */
+	LASSERT(oti == NULL || oti->oti_handle == (void *)DESC_PRIV);
+
+	for (i = 0; i < objcount; i++, obj++) {
+		/* same conditions under which echo_map_nb_to_lb() wrote the
+		 * debug patterns, and only while no error has been seen */
+		int verify = (rc == 0 &&
+			     ostid_id(&obj->ioo_oid) != ECHO_PERSISTENT_OBJID &&
+			      (oa->o_valid & OBD_MD_FLFLAGS) != 0 &&
+			      (oa->o_flags & OBD_FL_DEBUG_CHECK) != 0);
+		int j;
+
+		for (j = 0 ; j < obj->ioo_bufcnt ; j++, rb++) {
+			int vrc = echo_finalize_lb(oa, obj, rb, &pgs, &res[pgs],
+						   verify);
+			if (vrc == 0)
+				continue;
+
+			/* a NULL page aborts; other errors are remembered */
+			if (vrc == -EFAULT)
+				GOTO(commitrw_cleanup, rc = vrc);
+
+			if (rc == 0)
+				rc = vrc;
+		}
+
+	}
+
+	atomic_sub(pgs, &obd->u.echo.eo_prep);
+
+	CDEBUG(D_PAGE, "%d pages remain after commit\n",
+	       atomic_read(&obd->u.echo.eo_prep));
+	RETURN(rc);
+
+commitrw_cleanup:
+	/* account for the pages already released by echo_finalize_lb() */
+	atomic_sub(pgs, &obd->u.echo.eo_prep);
+
+	/* NOTE(review): the count printed here is niocount - pgs - 1 while
+	 * the loop below visits niocount - pgs entries - confirm intent. */
+	CERROR("cleaning up %d pages (%d obdos)\n",
+	       niocount - pgs - 1, objcount);
+
+	while (pgs < niocount) {
+		struct page *page = res[pgs++].page;
+
+		if (page == NULL)
+			continue;
+
+		/* NB see comment above regarding persistent pages */
+		OBD_PAGE_FREE(page);
+		atomic_dec(&obd->u.echo.eo_prep);
+	}
+	return rc;
+}
+
+/*
+ * Set up an echo target device.
+ *
+ * Initialises the object id allocator, creates the "echotgt-<uuid>"
+ * LDLM namespace, takes the local NL lock kept in eo_nl_lock, registers
+ * the procfs entries and byte counters (best effort: failures there are
+ * not treated as errors) and initialises the LDLM callback client.
+ *
+ * Returns 0 on success, -ENOMEM if the namespace cannot be created.
+ */
+static int echo_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
+{
+	struct lprocfs_static_vars lvars;
+	int			rc;
+	__u64		      lock_flags = 0;
+	struct ldlm_res_id	 res_id = {.name = {1}};
+	char		       ns_name[48];
+	ENTRY;
+
+	obd->u.echo.eo_obt.obt_magic = OBT_MAGIC;
+	spin_lock_init(&obd->u.echo.eo_lock);
+	obd->u.echo.eo_lastino = ECHO_INIT_OID;
+
+	/* bounded write: never run past the fixed-size name buffer */
+	snprintf(ns_name, sizeof(ns_name), "echotgt-%s", obd->obd_uuid.uuid);
+	obd->obd_namespace = ldlm_namespace_new(obd, ns_name,
+						LDLM_NAMESPACE_SERVER,
+						LDLM_NAMESPACE_MODEST,
+						LDLM_NS_TYPE_OST);
+	if (obd->obd_namespace == NULL) {
+		LBUG();
+		RETURN(-ENOMEM);
+	}
+
+	rc = ldlm_cli_enqueue_local(obd->obd_namespace, &res_id, LDLM_PLAIN,
+				    NULL, LCK_NL, &lock_flags, NULL,
+				    ldlm_completion_ast, NULL, NULL, 0,
+				    LVB_T_NONE, NULL, &obd->u.echo.eo_nl_lock);
+	LASSERT (rc == ELDLM_OK);
+
+	/* the counters are only initialised if both procfs steps succeed */
+	lprocfs_echo_init_vars(&lvars);
+	if (lprocfs_obd_setup(obd, lvars.obd_vars) == 0 &&
+	    lprocfs_alloc_obd_stats(obd, LPROC_ECHO_LAST) == 0) {
+		lprocfs_counter_init(obd->obd_stats, LPROC_ECHO_READ_BYTES,
+				     LPROCFS_CNTR_AVGMINMAX,
+				     "read_bytes", "bytes");
+		lprocfs_counter_init(obd->obd_stats, LPROC_ECHO_WRITE_BYTES,
+				     LPROCFS_CNTR_AVGMINMAX,
+				     "write_bytes", "bytes");
+	}
+
+	ptlrpc_init_client (LDLM_CB_REQUEST_PORTAL, LDLM_CB_REPLY_PORTAL,
+			    "echo_ldlm_cb_client", &obd->obd_ldlm_client);
+	RETURN(0);
+}
+
+/*
+ * Tear down an echo target device: remove the procfs entries and
+ * statistics, drop the reference on the NL lock, free the LDLM
+ * namespace and report any pages still accounted in eo_prep.
+ * Always returns 0.
+ */
+static int echo_cleanup(struct obd_device *obd)
+{
+	int leaked;
+	ENTRY;
+
+	lprocfs_obd_cleanup(obd);
+	lprocfs_free_obd_stats(obd);
+
+	ldlm_lock_decref(&obd->u.echo.eo_nl_lock, LCK_NL);
+
+	/* XXX Bug 3413; wait for a bit to ensure the BL callback has
+	 * happened before calling ldlm_namespace_free() */
+	schedule_timeout_and_set_state(TASK_UNINTERRUPTIBLE, cfs_time_seconds(1));
+
+	ldlm_namespace_free(obd->obd_namespace, NULL, obd->obd_force);
+	obd->obd_namespace = NULL;
+
+	/* a non-zero eo_prep means the prep and commit counts did not balance */
+	leaked = atomic_read(&obd->u.echo.eo_prep);
+	if (leaked != 0)
+		CERROR("%d prep/commitrw pages leaked\n", leaked);
+
+	RETURN(0);
+}
+
+/* OBD method table wiring the echo target handlers defined in this file. */
+struct obd_ops echo_obd_ops = {
+	.o_owner		= THIS_MODULE,
+	.o_connect		= echo_connect,
+	.o_disconnect		= echo_disconnect,
+	.o_init_export		= echo_init_export,
+	.o_destroy_export	= echo_destroy_export,
+	.o_create		= echo_create,
+	.o_destroy		= echo_destroy,
+	.o_getattr		= echo_getattr,
+	.o_setattr		= echo_setattr,
+	.o_preprw		= echo_preprw,
+	.o_commitrw		= echo_commitrw,
+	.o_setup		= echo_setup,
+	.o_cleanup		= echo_cleanup,
+};
+
+/*
+ * Free every page held in echo_persistent_pages[] and clear the slot.
+ * Empty slots are skipped, so this is safe on a partially filled array.
+ */
+void echo_persistent_pages_fini(void)
+{
+	int slot;
+
+	for (slot = 0; slot < ECHO_PERSISTENT_PAGES; slot++) {
+		if (echo_persistent_pages[slot] == NULL)
+			continue;
+
+		OBD_PAGE_FREE(echo_persistent_pages[slot]);
+		echo_persistent_pages[slot] = NULL;
+	}
+}
+
+/*
+ * Allocate and zero all ECHO_PERSISTENT_PAGES pages.
+ *
+ * The first half of the array is allocated with GFP_IOFS, the second
+ * half with GFP_HIGHUSER.  Returns 0 on success; on allocation failure
+ * everything allocated so far is released and -ENOMEM is returned.
+ */
+int echo_persistent_pages_init(void)
+{
+	int slot;
+
+	for (slot = 0; slot < ECHO_PERSISTENT_PAGES; slot++) {
+		struct page *pg;
+		int gfp_mask;
+
+		if (slot < ECHO_PERSISTENT_PAGES/2)
+			gfp_mask = GFP_IOFS;
+		else
+			gfp_mask = GFP_HIGHUSER;
+
+		OBD_PAGE_ALLOC(pg, gfp_mask);
+		if (pg == NULL) {
+			echo_persistent_pages_fini();
+			return -ENOMEM;
+		}
+
+		/* map the page just long enough to clear it */
+		memset(kmap(pg), 0, PAGE_CACHE_SIZE);
+		kunmap(pg);
+
+		echo_persistent_pages[slot] = pg;
+	}
+
+	return 0;
+}

diff --git a/drivers/staging/lustre/lustre/obdecho/echo_client.c b/drivers/staging/lustre/lustre/obdecho/echo_client.c
new file mode 100644
index 0000000..184195f
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdecho/echo_client.c

@@ -0,0 +1,3223 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_ECHO
+#include <linux/libcfs/libcfs.h>
+
+#include <obd.h>
+#include <obd_support.h>
+#include <obd_class.h>
+#include <lustre_debug.h>
+#include <lprocfs_status.h>
+#include <cl_object.h>
+#include <lustre_fid.h>
+#include <lustre_acl.h>
+#include <lustre_net.h>
+#include <obd_lov.h>
+
+#include "echo_internal.h"
+
+/** \defgroup echo_client Echo Client
+ * @{
+ */
+
+/* Echo client device; embeds the cl_device it is reached through. */
+struct echo_device {
+	struct cl_device	 ed_cl;
+	struct echo_client_obd	*ed_ec;
+
+	struct cl_site		 ed_site_myself;
+	struct cl_site		*ed_site;
+	struct lu_device	*ed_next;
+	int			 ed_next_islov;
+	int			 ed_next_ismd;
+	struct lu_client_seq	*ed_cl_seq;
+};
+
+/* Echo client object; embeds its cl_object and cl_object_header. */
+struct echo_object {
+	struct cl_object	 eo_cl;
+	struct cl_object_header	 eo_hdr;
+
+	struct echo_device	*eo_dev;
+	struct list_head	 eo_obj_chain;
+	struct lov_stripe_md	*eo_lsm;
+	/* decremented in echo_page_fini() */
+	atomic_t		 eo_npages;
+	int			 eo_deleted;
+};
+
+/* Object configuration: a cl_object_conf plus a stripe md pointer slot. */
+struct echo_object_conf {
+	struct cl_object_conf	  eoc_cl;
+	struct lov_stripe_md	**eoc_md;
+};
+
+/* Echo layer's slice of a cl_page. */
+struct echo_page {
+	struct cl_page_slice	 ep_cl;
+	/* taken in echo_page_own(), dropped in echo_page_disown() */
+	struct mutex		 ep_lock;
+	/* VM page behind this slice; released in echo_page_fini() */
+	struct page		*ep_vmpage;
+};
+
+/* Echo layer's slice of a cl_lock. */
+struct echo_lock {
+	struct cl_lock_slice	 el_cl;
+	/* must be an empty list by the time the lock is finalized */
+	struct list_head	 el_chain;
+	struct echo_object	*el_object;
+	__u64			 el_cookie;
+	atomic_t		 el_refcount;
+};
+
+/* Echo layer's slice of a cl_io; it carries no state of its own. */
+struct echo_io {
+	struct cl_io_slice     ei_cl;
+};
+
+#if 0
+struct echo_req {
+	struct cl_req_slice er_cl;
+};
+#endif
+
+static int echo_client_setup(const struct lu_env *env,
+			     struct obd_device *obddev,
+			     struct lustre_cfg *lcfg);
+static int echo_client_cleanup(struct obd_device *obddev);
+
+
+/** \defgroup echo_helpers Helper functions
+ * @{
+ */
+/* Map a cl_device embedded as ed_cl back to its enclosing echo_device. */
+static inline struct echo_device *cl2echo_dev(const struct cl_device *dev)
+{
+	return container_of0(dev, struct echo_device, ed_cl);
+}
+
+/* Return the cl_device embedded in echo_device @d. */
+static inline struct cl_device *echo_dev2cl(struct echo_device *d)
+{
+	return &d->ed_cl;
+}
+
+/* Resolve the echo_device behind obd->obd_lu_dev. */
+static inline struct echo_device *obd2echo_dev(const struct obd_device *obd)
+{
+	return cl2echo_dev(lu2cl_dev(obd->obd_lu_dev));
+}
+
+/* Return the cl_object embedded in echo_object @eco. */
+static inline struct cl_object *echo_obj2cl(struct echo_object *eco)
+{
+	return &eco->eo_cl;
+}
+
+/* Map a cl_object embedded as eo_cl back to its enclosing echo_object. */
+static inline struct echo_object *cl2echo_obj(const struct cl_object *o)
+{
+	return container_of(o, struct echo_object, eo_cl);
+}
+
+/* Map a page slice embedded as ep_cl back to its enclosing echo_page. */
+static inline struct echo_page *cl2echo_page(const struct cl_page_slice *s)
+{
+	return container_of(s, struct echo_page, ep_cl);
+}
+
+/* Map a lock slice embedded as el_cl back to its enclosing echo_lock. */
+static inline struct echo_lock *cl2echo_lock(const struct cl_lock_slice *s)
+{
+	return container_of(s, struct echo_lock, el_cl);
+}
+
+/* Return the cl_lock recorded in the echo lock's slice (cls_lock). */
+static inline struct cl_lock *echo_lock2cl(const struct echo_lock *ecl)
+{
+	return ecl->el_cl.cls_lock;
+}
+
+static struct lu_context_key echo_thread_key;
+/*
+ * Fetch the echo per-thread info stored under echo_thread_key in the
+ * context of @env.  The value must exist; a NULL result is asserted.
+ */
+static inline struct echo_thread_info *echo_env_info(const struct lu_env *env)
+{
+	struct echo_thread_info *info =
+		lu_context_key_get(&env->le_ctx, &echo_thread_key);
+
+	LASSERT(info != NULL);
+	return info;
+}
+
+/* Map a cl_object_conf embedded as eoc_cl back to its echo_object_conf. */
+static inline
+struct echo_object_conf *cl2echo_conf(const struct cl_object_conf *c)
+{
+	return container_of(c, struct echo_object_conf, eoc_cl);
+}
+
+/** @} echo_helpers */
+
+static struct echo_object *cl_echo_object_find(struct echo_device *d,
+					       struct lov_stripe_md **lsm);
+static int cl_echo_object_put(struct echo_object *eco);
+static int cl_echo_enqueue   (struct echo_object *eco, obd_off start,
+			      obd_off end, int mode, __u64 *cookie);
+static int cl_echo_cancel    (struct echo_device *d, __u64 cookie);
+static int cl_echo_object_brw(struct echo_object *eco, int rw, obd_off offset,
+			      struct page **pages, int npages, int async);
+
+static struct echo_thread_info *echo_env_info(const struct lu_env *env);
+
+/* Per-thread scratch state, looked up through echo_env_info(). */
+struct echo_thread_info {
+	struct echo_object_conf	 eti_conf;
+	struct lustre_md	 eti_md;
+
+	struct cl_2queue	 eti_queue;
+	struct cl_io		 eti_io;
+	struct cl_lock_descr	 eti_descr;
+	struct lu_fid		 eti_fid;
+	struct lu_fid		 eti_fid2;
+	struct md_op_spec	 eti_spec;
+	struct lov_mds_md_v3	 eti_lmm;
+	struct lov_user_md_v3	 eti_lum;
+	struct md_attr		 eti_ma;
+	struct lu_name		 eti_lname;
+	/* per-thread values, can be re-used */
+	void			*eti_big_lmm;
+	int			 eti_big_lmmsize;
+	char			 eti_name[20];
+	struct lu_buf		 eti_buf;
+	char			 eti_xattr_buf[LUSTRE_POSIX_ACL_MAX_SIZE];
+};
+
+/* No session used right now; the single member is only a placeholder. */
+struct echo_session_info {
+	unsigned long dummy;
+};
+
+static struct kmem_cache *echo_lock_kmem;
+static struct kmem_cache *echo_object_kmem;
+static struct kmem_cache *echo_thread_kmem;
+static struct kmem_cache *echo_session_kmem;
+//static struct kmem_cache *echo_req_kmem;
+
+/*
+ * Descriptors for the slab caches used by the echo client, one entry
+ * per cache pointer above; the list ends with a NULL ckd_cache entry.
+ */
+static struct lu_kmem_descr echo_caches[] = {
+	{
+		.ckd_cache = &echo_lock_kmem,
+		.ckd_name  = "echo_lock_kmem",
+		.ckd_size  = sizeof(struct echo_lock),
+	},
+	{
+		.ckd_cache = &echo_object_kmem,
+		.ckd_name  = "echo_object_kmem",
+		.ckd_size  = sizeof(struct echo_object),
+	},
+	{
+		.ckd_cache = &echo_thread_kmem,
+		.ckd_name  = "echo_thread_kmem",
+		.ckd_size  = sizeof(struct echo_thread_info),
+	},
+	{
+		.ckd_cache = &echo_session_kmem,
+		.ckd_name  = "echo_session_kmem",
+		.ckd_size  = sizeof(struct echo_session_info),
+	},
+#if 0
+	{
+		.ckd_cache = &echo_req_kmem,
+		.ckd_name  = "echo_req_kmem",
+		.ckd_size  = sizeof (struct echo_req)
+	},
+#endif
+	{
+		/* terminator */
+		.ckd_cache = NULL,
+	}
+};
+
+/** \defgroup echo_page Page operations
+ *
+ * Echo page operations.
+ *
+ * @{
+ */
+/* cpo_vmpage: return the VM page recorded in the echo page slice. */
+static struct page *echo_page_vmpage(const struct lu_env *env,
+				    const struct cl_page_slice *slice)
+{
+	return cl2echo_page(slice)->ep_vmpage;
+}
+
+/*
+ * cpo_own: take ownership of the page by acquiring ep_lock.
+ *
+ * With @nonblock set the lock is only tried; -EAGAIN is returned if it
+ * is already held.  Otherwise the call blocks until the lock is taken.
+ * Returns 0 once the lock is held.
+ */
+static int echo_page_own(const struct lu_env *env,
+			 const struct cl_page_slice *slice,
+			 struct cl_io *io, int nonblock)
+{
+	struct echo_page *ep = cl2echo_page(slice);
+
+	if (nonblock)
+		return mutex_trylock(&ep->ep_lock) ? 0 : -EAGAIN;
+
+	mutex_lock(&ep->ep_lock);
+	return 0;
+}
+
+/* cpo_disown: give up ownership by releasing ep_lock, which must be held. */
+static void echo_page_disown(const struct lu_env *env,
+			     const struct cl_page_slice *slice,
+			     struct cl_io *io)
+{
+	struct mutex *lock = &cl2echo_page(slice)->ep_lock;
+
+	LASSERT(mutex_is_locked(lock));
+	mutex_unlock(lock);
+}
+
+/* cpo_discard: discard the page by deleting its cl_page. */
+static void echo_page_discard(const struct lu_env *env,
+			      const struct cl_page_slice *slice,
+			      struct cl_io *unused)
+{
+	cl_page_delete(env, slice->cpl_page);
+}
+
+/*
+ * cpo_is_vmlocked: report whether ep_lock is currently held.
+ * Returns -EBUSY when it is and -ENODATA when it is not.
+ */
+static int echo_page_is_vmlocked(const struct lu_env *env,
+				 const struct cl_page_slice *slice)
+{
+	struct echo_page *ep = cl2echo_page(slice);
+
+	return mutex_is_locked(&ep->ep_lock) ? -EBUSY : -ENODATA;
+}
+
+/*
+ * cpo_completion: only asserts that the page has a cp_sync_io attached;
+ * @ioret is not used.
+ */
+static void echo_page_completion(const struct lu_env *env,
+				 const struct cl_page_slice *slice,
+				 int ioret)
+{
+	LASSERT(slice->cpl_page->cp_sync_io != NULL);
+}
+
+/*
+ * cpo_fini: finalize the echo page slice.  Drops the owning object's
+ * page count and releases the reference on the underlying VM page.
+ */
+static void echo_page_fini(const struct lu_env *env,
+			   struct cl_page_slice *slice)
+{
+	struct echo_page   *ep  = cl2echo_page(slice);
+	struct echo_object *eco = cl2echo_obj(slice->cpl_obj);
+	ENTRY;
+
+	atomic_dec(&eco->eo_npages);
+	page_cache_release(ep->ep_vmpage);
+	EXIT;
+}
+
+/* cpo_prep: nothing to prepare for echo pages; always returns 0. */
+static int echo_page_prep(const struct lu_env *env,
+			  const struct cl_page_slice *slice,
+			  struct cl_io *unused)
+{
+	return 0;
+}
+
+/*
+ * cpo_print: emit a one-line description of the echo page through
+ * @printer: the slice address, whether ep_lock is held, and the VM page.
+ * Always returns 0.
+ */
+static int echo_page_print(const struct lu_env *env,
+			   const struct cl_page_slice *slice,
+			   void *cookie, lu_printer_t printer)
+{
+	struct echo_page *ep = cl2echo_page(slice);
+	int locked = mutex_is_locked(&ep->ep_lock);
+
+	printer(env, cookie, LUSTRE_ECHO_CLIENT_NAME"-page@%p %d vm@%p\n",
+		ep, locked, ep->ep_vmpage);
+	return 0;
+}
+
+/*
+ * Page method table attached to every echo page slice by echo_page_init().
+ * Read and write transfers share the same prep/completion handlers.
+ */
+static const struct cl_page_operations echo_page_ops = {
+	.cpo_own	   = echo_page_own,
+	.cpo_disown	= echo_page_disown,
+	.cpo_discard       = echo_page_discard,
+	.cpo_vmpage	= echo_page_vmpage,
+	.cpo_fini	  = echo_page_fini,
+	.cpo_print	 = echo_page_print,
+	.cpo_is_vmlocked   = echo_page_is_vmlocked,
+	.io = {
+		[CRT_READ] = {
+			.cpo_prep	= echo_page_prep,
+			.cpo_completion  = echo_page_completion,
+		},
+		[CRT_WRITE] = {
+			.cpo_prep	= echo_page_prep,
+			.cpo_completion  = echo_page_completion,
+		}
+	}
+};
+/** @} echo_page */
+
+/** \defgroup echo_lock Locking
+ *
+ * echo lock operations
+ *
+ * @{
+ */
+/**
+ * cl_lock_operations::clo_fini callback.
+ *
+ * Frees the echo lock slice back to echo_lock_kmem.  The slice must no
+ * longer be linked on any el_chain list.
+ */
+static void echo_lock_fini(const struct lu_env *env,
+			   struct cl_lock_slice *slice)
+{
+	struct echo_lock *el = cl2echo_lock(slice);
+
+	LASSERT(list_empty(&el->el_chain));
+	OBD_SLAB_FREE_PTR(el, echo_lock_kmem);
+}
+
+/**
+ * cl_lock_operations::clo_delete callback.
+ *
+ * Performs no work beyond asserting that the lock has already been
+ * unlinked from its el_chain list.
+ */
+static void echo_lock_delete(const struct lu_env *env,
+			     const struct cl_lock_slice *slice)
+{
+	struct echo_lock *el = cl2echo_lock(slice);
+
+	LASSERT(list_empty(&el->el_chain));
+}
+
+/**
+ * cl_lock_operations::clo_fits_into callback.
+ *
+ * The echo layer places no restriction of its own: every lock is reported
+ * as fitting the requested description.
+ *
+ * \retval 1 always
+ */
+static int echo_lock_fits_into(const struct lu_env *env,
+			       const struct cl_lock_slice *slice,
+			       const struct cl_lock_descr *need,
+			       const struct cl_io *unused)
+{
+	return 1;
+}
+
+/* Lock method table attached to every echo lock slice by echo_lock_init(). */
+static struct cl_lock_operations echo_lock_ops = {
+	.clo_fini      = echo_lock_fini,
+	.clo_delete    = echo_lock_delete,
+	.clo_fits_into = echo_lock_fits_into
+};
+
+/** @} echo_lock */
+
+/** \defgroup echo_cl_ops cl_object operations
+ *
+ * operations for cl_object
+ *
+ * @{
+ */
+/**
+ * cl_object_operations::coo_page_init callback.
+ *
+ * Initializes the echo slice of \a page: takes a reference on \a vmpage,
+ * records it, initializes the per-page mutex, registers the slice with
+ * echo_page_ops and bumps the object's page counter.
+ *
+ * \retval 0 always
+ */
+static int echo_page_init(const struct lu_env *env, struct cl_object *obj,
+			struct cl_page *page, struct page *vmpage)
+{
+	struct echo_object *eco = cl2echo_obj(obj);
+	struct echo_page *ep = cl_object_page_slice(obj, page);
+	ENTRY;
+
+	/* the slice must be fully set up before it becomes visible */
+	mutex_init(&ep->ep_lock);
+	page_cache_get(vmpage);
+	ep->ep_vmpage = vmpage;
+
+	cl_page_slice_add(page, &ep->ep_cl, obj, &echo_page_ops);
+	atomic_inc(&eco->eo_npages);
+	RETURN(0);
+}
+
+/**
+ * cl_object_operations::coo_io_init callback.
+ *
+ * The echo layer keeps no per-io state, so there is nothing to set up.
+ *
+ * \retval 0 always
+ */
+static int echo_io_init(const struct lu_env *env, struct cl_object *obj,
+			struct cl_io *io)
+{
+	return 0;
+}
+
+/**
+ * cl_object_operations::coo_lock_init callback.
+ *
+ * Allocates an echo lock slice from echo_lock_kmem, adds it to \a lock
+ * with echo_lock_ops, and initializes its object back-pointer, chain
+ * linkage and reference count.
+ *
+ * \retval 0       success
+ * \retval -ENOMEM slice allocation failed
+ */
+static int echo_lock_init(const struct lu_env *env,
+			  struct cl_object *obj, struct cl_lock *lock,
+			  const struct cl_io *unused)
+{
+	struct echo_lock *el;
+	ENTRY;
+
+	OBD_SLAB_ALLOC_PTR_GFP(el, echo_lock_kmem, __GFP_IO);
+	if (el == NULL)
+		RETURN(-ENOMEM);
+
+	cl_lock_slice_add(lock, &el->el_cl, obj, &echo_lock_ops);
+	el->el_object = cl2echo_obj(obj);
+	INIT_LIST_HEAD(&el->el_chain);
+	atomic_set(&el->el_refcount, 0);
+	RETURN(0);
+}
+
+/**
+ * cl_object_operations::coo_conf_set callback.
+ *
+ * Configuration changes are accepted and ignored.
+ *
+ * \retval 0 always
+ */
+static int echo_conf_set(const struct lu_env *env, struct cl_object *obj,
+			 const struct cl_object_conf *conf)
+{
+	return 0;
+}
+
+/* cl_object method table installed on echo objects by echo_object_alloc(). */
+static const struct cl_object_operations echo_cl_obj_ops = {
+	.coo_page_init = echo_page_init,
+	.coo_lock_init = echo_lock_init,
+	.coo_io_init   = echo_io_init,
+	.coo_conf_set  = echo_conf_set
+};
+/** @} echo_cl_ops */
+
+/** \defgroup echo_lu_ops lu_object operations
+ *
+ * operations for echo lu object.
+ *
+ * @{
+ */
+/**
+ * lu_object_operations::loo_object_init callback.
+ *
+ * If the echo device is stacked on another lu_device (ed_next), allocates
+ * the lower-layer object and links it below \a obj.  For a non-metadata
+ * target the stripe descriptor passed in through the object configuration
+ * is taken over by the echo object.  Finally the object is put on the
+ * client's ec_objects list.
+ *
+ * \retval 0       success
+ * \retval -ENOMEM the lower layer failed to allocate its object
+ */
+static int echo_object_init(const struct lu_env *env, struct lu_object *obj,
+			    const struct lu_object_conf *conf)
+{
+	struct echo_device *ed	 = cl2echo_dev(lu2cl_dev(obj->lo_dev));
+	struct echo_client_obd *ec     = ed->ed_ec;
+	struct echo_object *eco	= cl2echo_obj(lu2cl(obj));
+	ENTRY;
+
+	if (ed->ed_next) {
+		struct lu_object  *below;
+		struct lu_device  *under;
+
+		under = ed->ed_next;
+		below = under->ld_ops->ldo_object_alloc(env, obj->lo_header,
+							under);
+		if (below == NULL)
+			RETURN(-ENOMEM);
+		lu_object_add(obj, below);
+	}
+
+	if (!ed->ed_next_ismd) {
+		const struct cl_object_conf *cconf = lu2cl_conf(conf);
+		struct echo_object_conf *econf = cl2echo_conf(cconf);
+
+		LASSERT(econf->eoc_md);
+		/* ownership of the lsm moves from the caller to the object */
+		eco->eo_lsm = *econf->eoc_md;
+		/* clear the lsm pointer so that it won't get freed. */
+		*econf->eoc_md = NULL;
+	} else {
+		eco->eo_lsm = NULL;
+	}
+
+	eco->eo_dev = ed;
+	atomic_set(&eco->eo_npages, 0);
+	cl_object_page_init(lu2cl(obj), sizeof(struct echo_page));
+
+	/* make the object visible to echo_device_free() */
+	spin_lock(&ec->ec_lock);
+	list_add_tail(&eco->eo_obj_chain, &ec->ec_objects);
+	spin_unlock(&ec->ec_lock);
+
+	RETURN(0);
+}
+
+/**
+ * Allocate an in-memory stripe descriptor for the echo client.
+ *
+ * Taken from osc_unpackmd().  When the echo device is stacked on another
+ * device (ed_next != NULL) the request is delegated to obd_alloc_memmd()
+ * on the client export; otherwise a single-stripe descriptor and its one
+ * lov_oinfo are allocated and initialized here.
+ *
+ * \param ed   echo device
+ * \param lsmp in/out: must point to NULL on entry for the local path;
+ *             receives the new descriptor
+ *
+ * \retval >0      size of the allocated descriptor (local path)
+ * \retval -ENOMEM allocation failure (local path)
+ * \retval other   whatever obd_alloc_memmd() returns (delegated path)
+ */
+static int echo_alloc_memmd(struct echo_device *ed,
+			    struct lov_stripe_md **lsmp)
+{
+	int lsm_size;
+
+	ENTRY;
+
+	/* If export is lov/osc then use their obd method.  Leave through
+	 * RETURN() so that the ENTRY trace above stays balanced. */
+	if (ed->ed_next != NULL)
+		RETURN(obd_alloc_memmd(ed->ed_ec->ec_exp, lsmp));
+	/* OFD has no unpackmd method, do everything here */
+	lsm_size = lov_stripe_md_size(1);
+
+	LASSERT(*lsmp == NULL);
+	OBD_ALLOC(*lsmp, lsm_size);
+	if (*lsmp == NULL)
+		RETURN(-ENOMEM);
+
+	OBD_ALLOC((*lsmp)->lsm_oinfo[0], sizeof(struct lov_oinfo));
+	if ((*lsmp)->lsm_oinfo[0] == NULL) {
+		OBD_FREE(*lsmp, lsm_size);
+		RETURN(-ENOMEM);
+	}
+
+	loi_init((*lsmp)->lsm_oinfo[0]);
+	(*lsmp)->lsm_maxbytes = LUSTRE_STRIPE_MAXBYTES;
+	ostid_set_seq_echo(&(*lsmp)->lsm_oi);
+
+	RETURN(lsm_size);
+}
+
+/**
+ * Release a stripe descriptor obtained from echo_alloc_memmd().
+ *
+ * When the echo device is stacked on another device the request is
+ * delegated to obd_free_memmd() on the client export; otherwise the single
+ * lov_oinfo and the descriptor itself are freed here and \a *lsmp is reset
+ * to NULL.
+ *
+ * \retval 0     local path, always
+ * \retval other whatever obd_free_memmd() returns (delegated path)
+ */
+static int echo_free_memmd(struct echo_device *ed, struct lov_stripe_md **lsmp)
+{
+	int lsm_size;
+
+	ENTRY;
+
+	/* If export is lov/osc then use their obd method.  Leave through
+	 * RETURN() so that the ENTRY trace above stays balanced. */
+	if (ed->ed_next != NULL)
+		RETURN(obd_free_memmd(ed->ed_ec->ec_exp, lsmp));
+	/* OFD has no unpackmd method, do everything here */
+	lsm_size = lov_stripe_md_size(1);
+
+	LASSERT(*lsmp != NULL);
+	OBD_FREE((*lsmp)->lsm_oinfo[0], sizeof(struct lov_oinfo));
+	OBD_FREE(*lsmp, lsm_size);
+	*lsmp = NULL;
+	RETURN(0);
+}
+
+/**
+ * lu_object_operations::loo_object_free callback.
+ *
+ * Requires that no pages remain attached (eo_npages == 0).  Unlinks the
+ * object from the client's ec_objects list, finalizes the lu_object and
+ * its header, releases the stripe descriptor if one is owned, and frees
+ * the echo object itself.
+ */
+static void echo_object_free(const struct lu_env *env, struct lu_object *obj)
+{
+	struct echo_object *eco    = cl2echo_obj(lu2cl(obj));
+	struct echo_client_obd *ec = eco->eo_dev->ed_ec;
+	ENTRY;
+
+	LASSERT(atomic_read(&eco->eo_npages) == 0);
+
+	spin_lock(&ec->ec_lock);
+	list_del_init(&eco->eo_obj_chain);
+	spin_unlock(&ec->ec_lock);
+
+	lu_object_fini(obj);
+	lu_object_header_fini(obj->lo_header);
+
+	/* eo_lsm is NULL for metadata targets, see echo_object_init() */
+	if (eco->eo_lsm)
+		echo_free_memmd(eco->eo_dev, &eco->eo_lsm);
+	OBD_SLAB_FREE_PTR(eco, echo_object_kmem);
+	EXIT;
+}
+
+/**
+ * lu_object_operations::loo_object_print callback.
+ *
+ * Prints the address of the echo object through \a p.
+ *
+ * \retval the value returned by the printer
+ */
+static int echo_object_print(const struct lu_env *env, void *cookie,
+			    lu_printer_t p, const struct lu_object *o)
+{
+	struct echo_object *eco = cl2echo_obj(lu2cl(o));
+
+	return p(env, cookie, "echoclient-object@%p", eco);
+}
+
+/*
+ * lu_object method table installed on echo objects by echo_object_alloc().
+ * Delete, release and invariant hooks are intentionally left unset.
+ */
+static const struct lu_object_operations echo_lu_obj_ops = {
+	.loo_object_init      = echo_object_init,
+	.loo_object_delete    = NULL,
+	.loo_object_release   = NULL,
+	.loo_object_free      = echo_object_free,
+	.loo_object_print     = echo_object_print,
+	.loo_object_invariant = NULL
+};
+/** @} echo_lu_ops */
+
+/** \defgroup echo_lu_dev_ops  lu_device operations
+ *
+ * Operations for echo lu device.
+ *
+ * @{
+ */
+/**
+ * lu_device_operations::ldo_object_alloc callback.
+ *
+ * Allocates an echo object from echo_object_kmem, initializes its embedded
+ * cl_object header, places the object at the top of the header's layer
+ * stack and installs the echo method tables.  The echo device is always
+ * the top device, so \a hdr must be NULL.
+ *
+ * \retval the new lu_object, or NULL when the allocation fails
+ */
+static struct lu_object *echo_object_alloc(const struct lu_env *env,
+					   const struct lu_object_header *hdr,
+					   struct lu_device *dev)
+{
+	struct cl_object_header *coh;
+	struct echo_object *eco;
+	struct lu_object *obj = NULL;
+	ENTRY;
+
+	/* we're the top dev. */
+	LASSERT(hdr == NULL);
+	OBD_SLAB_ALLOC_PTR_GFP(eco, echo_object_kmem, __GFP_IO);
+	if (eco == NULL)
+		RETURN(obj);
+
+	coh = &eco->eo_hdr;
+	obj = &echo_obj2cl(eco)->co_lu;
+
+	cl_object_header_init(coh);
+	lu_object_init(obj, &coh->coh_lu, dev);
+	lu_object_add_top(&coh->coh_lu, obj);
+
+	eco->eo_cl.co_ops = &echo_cl_obj_ops;
+	obj->lo_ops       = &echo_lu_obj_ops;
+	RETURN(obj);
+}
+
+/* lu_device method table; only object allocation is provided. */
+static struct lu_device_operations echo_device_lu_ops = {
+	.ldo_object_alloc   = echo_object_alloc,
+};
+
+/** @} echo_lu_dev_ops */
+
+/* cl_device method table; the echo device defines no cl_device methods. */
+static struct cl_device_operations echo_device_cl_ops = {
+};
+
+/** \defgroup echo_init Setup and teardown
+ *
+ * Init and fini functions for echo client.
+ *
+ * @{
+ */
+/**
+ * Initialize the echo device's private site (ed_site_myself) and publish
+ * it through ed->ed_site.
+ *
+ * On failure nothing is left initialized: ed->ed_site stays unset, so
+ * echo_site_fini() would not release a half-built site; it therefore has
+ * to be torn down here.
+ *
+ * \retval 0     success, ed->ed_site points to the initialized site
+ * \retval other error from cl_site_init() or lu_site_init_finish()
+ */
+static int echo_site_init(const struct lu_env *env, struct echo_device *ed)
+{
+	struct cl_site *site = &ed->ed_site_myself;
+	int rc;
+
+	/* initialize site */
+	rc = cl_site_init(site, &ed->ed_cl);
+	if (rc) {
+		CERROR("Cannot initilize site for echo client(%d)\n", rc);
+		return rc;
+	}
+
+	rc = lu_site_init_finish(&site->cs_lu);
+	if (rc) {
+		/* undo cl_site_init(); no caller can do it for us */
+		cl_site_fini(site);
+		return rc;
+	}
+
+	ed->ed_site = site;
+	return 0;
+}
+
+/**
+ * Release the site published in ed->ed_site, if any.
+ *
+ * The site is only finalized with cl_site_fini() when the echo device is
+ * not stacked on a metadata target; in either case ed->ed_site is cleared.
+ */
+static void echo_site_fini(const struct lu_env *env, struct echo_device *ed)
+{
+	if (!ed->ed_site)
+		return;
+
+	if (!ed->ed_next_ismd)
+		cl_site_fini(ed->ed_site);
+	ed->ed_site = NULL;
+}
+
+/**
+ * lu_context_key::lct_init for the per-thread echo state.
+ *
+ * \retval a new echo_thread_info from echo_thread_kmem
+ * \retval ERR_PTR(-ENOMEM) when the allocation fails
+ */
+static void *echo_thread_key_init(const struct lu_context *ctx,
+			  struct lu_context_key *key)
+{
+	struct echo_thread_info *info;
+
+	OBD_SLAB_ALLOC_PTR_GFP(info, echo_thread_kmem, __GFP_IO);
+	if (info != NULL)
+		return info;
+
+	return ERR_PTR(-ENOMEM);
+}
+
+/**
+ * lu_context_key::lct_fini for the per-thread echo state.
+ *
+ * Returns \a data (an echo_thread_info) to echo_thread_kmem.
+ */
+static void echo_thread_key_fini(const struct lu_context *ctx,
+			 struct lu_context_key *key, void *data)
+{
+	struct echo_thread_info *info = data;
+	OBD_SLAB_FREE_PTR(info, echo_thread_kmem);
+}
+
+/**
+ * lu_context_key::lct_exit for the per-thread echo state; nothing to do.
+ */
+static void echo_thread_key_exit(const struct lu_context *ctx,
+			 struct lu_context_key *key, void *data)
+{
+}
+
+/* Context key for per-thread echo state (struct echo_thread_info). */
+static struct lu_context_key echo_thread_key = {
+	.lct_tags = LCT_CL_THREAD,
+	.lct_init = echo_thread_key_init,
+	.lct_fini = echo_thread_key_fini,
+	.lct_exit = echo_thread_key_exit
+};
+
+/**
+ * lu_context_key::lct_init for the per-session echo state.
+ *
+ * \retval a new echo_session_info from echo_session_kmem
+ * \retval ERR_PTR(-ENOMEM) when the allocation fails
+ */
+static void *echo_session_key_init(const struct lu_context *ctx,
+				  struct lu_context_key *key)
+{
+	struct echo_session_info *session;
+
+	OBD_SLAB_ALLOC_PTR_GFP(session, echo_session_kmem, __GFP_IO);
+	if (session != NULL)
+		return session;
+
+	return ERR_PTR(-ENOMEM);
+}
+
+/**
+ * lu_context_key::lct_fini for the per-session echo state.
+ *
+ * Returns \a data (an echo_session_info) to echo_session_kmem.
+ */
+static void echo_session_key_fini(const struct lu_context *ctx,
+				 struct lu_context_key *key, void *data)
+{
+	struct echo_session_info *session = data;
+	OBD_SLAB_FREE_PTR(session, echo_session_kmem);
+}
+
+/**
+ * lu_context_key::lct_exit for the per-session echo state; nothing to do.
+ */
+static void echo_session_key_exit(const struct lu_context *ctx,
+				 struct lu_context_key *key, void *data)
+{
+}
+
+/* Context key for per-session echo state (struct echo_session_info). */
+static struct lu_context_key echo_session_key = {
+	.lct_tags = LCT_SESSION,
+	.lct_init = echo_session_key_init,
+	.lct_fini = echo_session_key_fini,
+	.lct_exit = echo_session_key_exit
+};
+
+/* NOTE(review): presumably expands to the echo_type_init/fini/start/stop
+ * helpers referenced by echo_device_type_ops, registering both context
+ * keys - confirm against the macro definition. */
+LU_TYPE_INIT_FINI(echo, &echo_thread_key, &echo_session_key);
+
+/* Sequence width assigned to the echo client's sequence manager. */
+#define ECHO_SEQ_WIDTH 0xffffffff
+/**
+ * Allocate and initialize the client-side sequence manager (ed_cl_seq)
+ * used by the echo device.
+ *
+ * \param ed       echo device
+ * \param obd_name obd name, used to build the "srv-<name>" prefix
+ * \param ss       sequence server site whose ss_server_seq is handed to
+ *                 seq_client_init()
+ *
+ * \retval 0       success
+ * \retval -ENOMEM allocation failure
+ * \retval other   error from seq_client_init()
+ *
+ * On any failure ed->ed_cl_seq is freed and reset to NULL.
+ */
+static int echo_fid_init(struct echo_device *ed, char *obd_name,
+			 struct seq_server_site *ss)
+{
+	char *prefix;
+	int rc;
+	ENTRY;
+
+	OBD_ALLOC_PTR(ed->ed_cl_seq);
+	if (ed->ed_cl_seq == NULL)
+		RETURN(-ENOMEM);
+
+	/* room for the "srv-" prefix plus the terminating NUL */
+	OBD_ALLOC(prefix, MAX_OBD_NAME + 5);
+	if (prefix == NULL)
+		GOTO(out_free_seq, rc = -ENOMEM);
+
+	snprintf(prefix, MAX_OBD_NAME + 5, "srv-%s", obd_name);
+
+	/* Init client side sequence-manager */
+	rc = seq_client_init(ed->ed_cl_seq, NULL,
+			     LUSTRE_SEQ_METADATA,
+			     prefix, ss->ss_server_seq);
+	/* the width is set before rc is checked; on failure the structure
+	 * is freed right below anyway */
+	ed->ed_cl_seq->lcs_width = ECHO_SEQ_WIDTH;
+	OBD_FREE(prefix, MAX_OBD_NAME + 5);
+	if (rc)
+		GOTO(out_free_seq, rc);
+
+	RETURN(0);
+
+out_free_seq:
+	OBD_FREE_PTR(ed->ed_cl_seq);
+	ed->ed_cl_seq = NULL;
+	RETURN(rc);
+}
+
+/**
+ * Tear down the client-side sequence manager set up by echo_fid_init().
+ *
+ * A no-op when ed_cl_seq was never allocated.
+ *
+ * \retval 0 always
+ */
+static int echo_fid_fini(struct obd_device *obddev)
+{
+	struct echo_device *ed = obd2echo_dev(obddev);
+	ENTRY;
+
+	if (ed->ed_cl_seq == NULL)
+		RETURN(0);
+
+	seq_client_fini(ed->ed_cl_seq);
+	OBD_FREE_PTR(ed->ed_cl_seq);
+	ed->ed_cl_seq = NULL;
+
+	RETURN(0);
+}
+
+/**
+ * lu_device_type_operations::ldto_device_alloc callback.
+ *
+ * Allocates an echo device and stacks it on the target device named by
+ * configuration string 1 (string 0 names the echo obd itself).  Three
+ * kinds of target are handled:
+ *  - an MDT: the echo device borrows the MDS stack's site and sets up a
+ *    FID sequence client; string 2 names the lu device type to sit on;
+ *  - a cl device (lov or osc): the lower device is attached to the echo
+ *    site and initialized;
+ *  - anything else (asserted to be an OST): no lower lu device is used.
+ *
+ * The \a cleanup counter records how far setup progressed so that the
+ * error path at "out" can unwind exactly the completed stages; the switch
+ * cases fall through on purpose.
+ *
+ * \retval the embedded lu_device on success
+ * \retval ERR_PTR(rc) on failure
+ */
+static struct lu_device *echo_device_alloc(const struct lu_env *env,
+					   struct lu_device_type *t,
+					   struct lustre_cfg *cfg)
+{
+	struct lu_device   *next;
+	struct echo_device *ed;
+	struct cl_device   *cd;
+	struct obd_device  *obd = NULL; /* to keep compiler happy */
+	struct obd_device  *tgt;
+	const char *tgt_type_name;
+	int rc;
+	int cleanup = 0;
+	ENTRY;
+
+	OBD_ALLOC_PTR(ed);
+	if (ed == NULL)
+		GOTO(out, rc = -ENOMEM);
+
+	cleanup = 1;
+	cd = &ed->ed_cl;
+	rc = cl_device_init(cd, t);
+	if (rc)
+		GOTO(out, rc);
+
+	cd->cd_lu_dev.ld_ops = &echo_device_lu_ops;
+	cd->cd_ops = &echo_device_cl_ops;
+
+	cleanup = 2;
+	obd = class_name2obd(lustre_cfg_string(cfg, 0));
+	LASSERT(obd != NULL);
+	LASSERT(env != NULL);
+
+	tgt = class_name2obd(lustre_cfg_string(cfg, 1));
+	if (tgt == NULL) {
+		CERROR("Can not find tgt device %s\n",
+			lustre_cfg_string(cfg, 1));
+		GOTO(out, rc = -ENODEV);
+	}
+
+	next = tgt->obd_lu_dev;
+	if (!strcmp(tgt->obd_type->typ_name, LUSTRE_MDT_NAME)) {
+		ed->ed_next_ismd = 1;
+	} else {
+		/* only non-metadata targets get a private site */
+		ed->ed_next_ismd = 0;
+		rc = echo_site_init(env, ed);
+		if (rc)
+			GOTO(out, rc);
+	}
+	cleanup = 3;
+
+	rc = echo_client_setup(env, obd, cfg);
+	if (rc)
+		GOTO(out, rc);
+
+	ed->ed_ec = &obd->u.echo_client;
+	cleanup = 4;
+
+	if (ed->ed_next_ismd) {
+		/* Supposed to connect to some metadata layer */
+		struct lu_site *ls;
+		struct lu_device *ld;
+		int    found = 0;
+
+		if (next == NULL) {
+			CERROR("%s is not lu device type!\n",
+			       lustre_cfg_string(cfg, 1));
+			GOTO(out, rc = -EINVAL);
+		}
+
+		tgt_type_name = lustre_cfg_string(cfg, 2);
+		if (!tgt_type_name) {
+			CERROR("%s no type name for echo %s setup\n",
+				lustre_cfg_string(cfg, 1),
+				tgt->obd_type->typ_name);
+			GOTO(out, rc = -EINVAL);
+		}
+
+		ls = next->ld_site;
+
+		/* look up the device of the requested type in the MDS stack */
+		spin_lock(&ls->ls_ld_lock);
+		list_for_each_entry(ld, &ls->ls_ld_linkage, ld_linkage) {
+			if (strcmp(ld->ld_type->ldt_name, tgt_type_name) == 0) {
+				found = 1;
+				break;
+			}
+		}
+		spin_unlock(&ls->ls_ld_lock);
+
+		if (found == 0) {
+			CERROR("%s is not lu device type!\n",
+			       lustre_cfg_string(cfg, 1));
+			GOTO(out, rc = -EINVAL);
+		}
+
+		next = ld;
+		/* For MD echo client, it will use the site in MDS stack */
+		ed->ed_site_myself.cs_lu = *ls;
+		ed->ed_site = &ed->ed_site_myself;
+		ed->ed_cl.cd_lu_dev.ld_site = &ed->ed_site_myself.cs_lu;
+		rc = echo_fid_init(ed, obd->obd_name, lu_site2seq(ls));
+		if (rc) {
+			CERROR("echo fid init error %d\n", rc);
+			GOTO(out, rc);
+		}
+	} else {
+		 /* if echo client is to be stacked upon ost device, the next is
+		  * NULL since ost is not a clio device so far */
+		if (next != NULL && !lu_device_is_cl(next))
+			next = NULL;
+
+		tgt_type_name = tgt->obd_type->typ_name;
+		if (next != NULL) {
+			LASSERT(next != NULL);
+			if (next->ld_site != NULL)
+				GOTO(out, rc = -EBUSY);
+
+			/* NOTE(review): if ldto_device_init() below fails,
+			 * next->ld_site is left pointing into ed, which the
+			 * error path frees - confirm whether it needs to be
+			 * reset. */
+			next->ld_site = &ed->ed_site->cs_lu;
+			rc = next->ld_type->ldt_ops->ldto_device_init(env, next,
+						     next->ld_type->ldt_name,
+						     NULL);
+			if (rc)
+				GOTO(out, rc);
+
+			/* Tricky case, I have to determine the obd type since
+			 * CLIO uses the different parameters to initialize
+			 * objects for lov & osc. */
+			if (strcmp(tgt_type_name, LUSTRE_LOV_NAME) == 0)
+				ed->ed_next_islov = 1;
+			else
+				LASSERT(strcmp(tgt_type_name,
+					       LUSTRE_OSC_NAME) == 0);
+		} else
+			LASSERT(strcmp(tgt_type_name, LUSTRE_OST_NAME) == 0);
+	}
+
+	ed->ed_next = next;
+	RETURN(&cd->cd_lu_dev);
+out:
+	/* unwind in reverse order of setup; every case falls through */
+	switch(cleanup) {
+	case 4: {
+		int rc2;
+		rc2 = echo_client_cleanup(obd);
+		if (rc2)
+			CERROR("Cleanup obd device %s error(%d)\n",
+			       obd->obd_name, rc2);
+	}
+
+	case 3:
+		echo_site_fini(env, ed);
+	case 2:
+		cl_device_fini(&ed->ed_cl);
+	case 1:
+		OBD_FREE_PTR(ed);
+	case 0:
+	default:
+		break;
+	}
+	return(ERR_PTR(rc));
+}
+
+/**
+ * lu_device_type_operations::ldto_device_init callback.
+ *
+ * Must never be called for the echo device: hitting it is a bug (LBUG).
+ */
+static int echo_device_init(const struct lu_env *env, struct lu_device *d,
+			  const char *name, struct lu_device *next)
+{
+	LBUG();
+	return 0;
+}
+
+/**
+ * lu_device_type_operations::ldto_device_fini callback.
+ *
+ * Unless the echo device sits on a metadata target, walks down the chain
+ * of lower devices calling each one's ldto_device_fini(); each call
+ * returns the next device to finalize.
+ *
+ * \retval NULL always
+ */
+static struct lu_device *echo_device_fini(const struct lu_env *env,
+					  struct lu_device *d)
+{
+	struct echo_device *ed = cl2echo_dev(lu2cl_dev(d));
+	struct lu_device *next = ed->ed_next;
+
+	while (next && !ed->ed_next_ismd)
+		next = next->ld_type->ldt_ops->ldto_device_fini(env, next);
+	return NULL;
+}
+
+/**
+ * Drop one echo-client use of a lock.
+ *
+ * A temporary reference keeps the cl_lock alive across the sequence: the
+ * lock is unused and the "ec enqueue" hold taken at enqueue time is
+ * released.  When no other cookie holder remains (\a still_used == 0) the
+ * lock is additionally cancelled and deleted under its mutex.
+ *
+ * \param still_used non-zero when other users still reference the lock
+ */
+static void echo_lock_release(const struct lu_env *env,
+			      struct echo_lock *ecl,
+			      int still_used)
+{
+	struct cl_lock *clk = echo_lock2cl(ecl);
+
+	cl_lock_get(clk);
+	cl_unuse(env, clk);
+	cl_lock_release(env, clk, "ec enqueue", ecl->el_object);
+	if (!still_used) {
+		cl_lock_mutex_get(env, clk);
+		cl_lock_cancel(env, clk);
+		cl_lock_delete(env, clk);
+		cl_lock_mutex_put(env, clk);
+	}
+	cl_lock_put(env, clk);
+}
+
+/**
+ * lu_device_type_operations::ldto_device_free callback.
+ *
+ * Purges the site, marks every remaining object as deleted, and then
+ * polls once a second (purging again each time) until the client's object
+ * list is empty.  After that the client obd and FID state are cleaned up,
+ * lower devices are freed (unless the target is a metadata device), and
+ * the site, cl_device and the echo device itself are released.
+ *
+ * \retval NULL always
+ */
+static struct lu_device *echo_device_free(const struct lu_env *env,
+					  struct lu_device *d)
+{
+	struct echo_device     *ed   = cl2echo_dev(lu2cl_dev(d));
+	struct echo_client_obd *ec   = ed->ed_ec;
+	struct echo_object     *eco;
+	struct lu_device       *next = ed->ed_next;
+
+	CDEBUG(D_INFO, "echo device:%p is going to be freed, next = %p\n",
+	       ed, next);
+
+	lu_site_purge(env, &ed->ed_site->cs_lu, -1);
+
+	/* check if there are objects still alive.
+	 * It shouldn't have any object because lu_site_purge would cleanup
+	 * all of cached objects. Anyway, probably the echo device is being
+	 * parallelly accessed.
+	 */
+	spin_lock(&ec->ec_lock);
+	list_for_each_entry(eco, &ec->ec_objects, eo_obj_chain)
+		eco->eo_deleted = 1;
+	spin_unlock(&ec->ec_lock);
+
+	/* purge again */
+	lu_site_purge(env, &ed->ed_site->cs_lu, -1);
+
+	CDEBUG(D_INFO,
+	       "Waiting for the reference of echo object to be dropped\n");
+
+	/* Wait for the last reference to be dropped. */
+	spin_lock(&ec->ec_lock);
+	while (!list_empty(&ec->ec_objects)) {
+		/* the spinlock is dropped while sleeping and purging */
+		spin_unlock(&ec->ec_lock);
+		CERROR("echo_client still has objects at cleanup time, "
+		       "wait for 1 second\n");
+		schedule_timeout_and_set_state(TASK_UNINTERRUPTIBLE,
+						   cfs_time_seconds(1));
+		lu_site_purge(env, &ed->ed_site->cs_lu, -1);
+		spin_lock(&ec->ec_lock);
+	}
+	spin_unlock(&ec->ec_lock);
+
+	LASSERT(list_empty(&ec->ec_locks));
+
+	CDEBUG(D_INFO, "No object exists, exiting...\n");
+
+	echo_client_cleanup(d->ld_obd);
+	echo_fid_fini(d->ld_obd);
+	while (next && !ed->ed_next_ismd)
+		next = next->ld_type->ldt_ops->ldto_device_free(env, next);
+
+	LASSERT(ed->ed_site == lu2cl_site(d->ld_site));
+	echo_site_fini(env, ed);
+	cl_device_fini(&ed->ed_cl);
+	OBD_FREE_PTR(ed);
+
+	return NULL;
+}
+
+/* Device-type method table for the echo client lu_device_type. */
+static const struct lu_device_type_operations echo_device_type_ops = {
+	.ldto_init = echo_type_init,
+	.ldto_fini = echo_type_fini,
+
+	.ldto_start = echo_type_start,
+	.ldto_stop  = echo_type_stop,
+
+	.ldto_device_alloc = echo_device_alloc,
+	.ldto_device_free  = echo_device_free,
+	.ldto_device_init  = echo_device_init,
+	.ldto_device_fini  = echo_device_fini
+};
+
+/* The echo client device type: a cl device named LUSTRE_ECHO_CLIENT_NAME. */
+static struct lu_device_type echo_device_type = {
+	.ldt_tags     = LU_DEVICE_CL,
+	.ldt_name     = LUSTRE_ECHO_CLIENT_NAME,
+	.ldt_ops      = &echo_device_type_ops,
+	.ldt_ctx_tags = LCT_CL_THREAD | LCT_MD_THREAD | LCT_DT_THREAD,
+};
+/** @} echo_init */
+
+/** \defgroup echo_exports Exported operations
+ *
+ * exporting functions to echo client
+ *
+ * @{
+ */
+
+/* Interfaces to echo client obd device */
+/**
+ * Find (or create) the echo object described by the stripe descriptor
+ * \a *lsmp.
+ *
+ * The descriptor must carry a non-zero object id in the echo sequence.
+ * It is handed to object initialization through the per-thread
+ * echo_object_conf (see echo_object_init(), which may take ownership and
+ * clear \a *lsmp).
+ *
+ * \retval the echo object on success
+ * \retval ERR_PTR(-ENODEV) the obd is stopping
+ * \retval ERR_PTR(-EAGAIN) the object found is already marked deleted
+ * \retval ERR_PTR(other)   environment, FID conversion or lookup error
+ */
+static struct echo_object *cl_echo_object_find(struct echo_device *d,
+					       struct lov_stripe_md **lsmp)
+{
+	struct lu_env *env;
+	struct echo_thread_info *info;
+	struct echo_object_conf *conf;
+	struct lov_stripe_md    *lsm;
+	struct echo_object *eco;
+	struct cl_object   *obj;
+	struct lu_fid *fid;
+	int refcheck;
+	int rc;
+	ENTRY;
+
+	LASSERT(lsmp);
+	lsm = *lsmp;
+	LASSERT(lsm);
+	LASSERTF(ostid_id(&lsm->lsm_oi) != 0, DOSTID"\n", POSTID(&lsm->lsm_oi));
+	LASSERTF(ostid_seq(&lsm->lsm_oi) == FID_SEQ_ECHO, DOSTID"\n",
+		 POSTID(&lsm->lsm_oi));
+
+	/* Never return an object if the obd is to be freed. */
+	if (echo_dev2cl(d)->cd_lu_dev.ld_obd->obd_stopping)
+		RETURN(ERR_PTR(-ENODEV));
+
+	env = cl_env_get(&refcheck);
+	if (IS_ERR(env))
+		RETURN((void *)env);
+
+	info = echo_env_info(env);
+	conf = &info->eti_conf;
+	if (d->ed_next) {
+		/* the lower layer expects a different configuration payload
+		 * depending on whether it is a lov or not */
+		if (!d->ed_next_islov) {
+			struct lov_oinfo *oinfo = lsm->lsm_oinfo[0];
+			LASSERT(oinfo != NULL);
+			oinfo->loi_oi = lsm->lsm_oi;
+			conf->eoc_cl.u.coc_oinfo = oinfo;
+		} else {
+			struct lustre_md *md;
+			md = &info->eti_md;
+			memset(md, 0, sizeof *md);
+			md->lsm = lsm;
+			conf->eoc_cl.u.coc_md = md;
+		}
+	}
+	conf->eoc_md = lsmp;
+
+	fid  = &info->eti_fid;
+	rc = ostid_to_fid(fid, &lsm->lsm_oi, 0);
+	if (rc != 0)
+		GOTO(out, eco = ERR_PTR(rc));
+
+	/* In the function below, .hs_keycmp resolves to
+	 * lu_obj_hop_keycmp() */
+	/* coverity[overrun-buffer-val] */
+	obj = cl_object_find(env, echo_dev2cl(d), fid, &conf->eoc_cl);
+	if (IS_ERR(obj))
+		GOTO(out, eco = (void*)obj);
+
+	eco = cl2echo_obj(obj);
+	if (eco->eo_deleted) {
+		/* the device is being torn down: drop our reference */
+		cl_object_put(env, obj);
+		eco = ERR_PTR(-EAGAIN);
+	}
+
+out:
+	cl_env_put(env, &refcheck);
+	RETURN(eco);
+}
+
+/**
+ * Drop a reference on an echo object.
+ *
+ * When the object has been marked deleted, LU_OBJECT_HEARD_BANSHEE is set
+ * on its header before the reference is put.
+ *
+ * \retval 0     success
+ * \retval other error obtaining an environment
+ */
+static int cl_echo_object_put(struct echo_object *eco)
+{
+	struct lu_env *env;
+	struct cl_object *obj = echo_obj2cl(eco);
+	int refcheck;
+	ENTRY;
+
+	env = cl_env_get(&refcheck);
+	if (IS_ERR(env))
+		RETURN(PTR_ERR(env));
+
+	/* an external function to kill an object? */
+	if (eco->eo_deleted) {
+		struct lu_object_header *loh = obj->co_lu.lo_header;
+		LASSERT(&eco->eo_hdr == luh2coh(loh));
+		set_bit(LU_OBJECT_HEARD_BANSHEE, &loh->loh_flags);
+	}
+
+	cl_object_put(env, obj);
+	cl_env_put(env, &refcheck);
+	RETURN(0);
+}
+
+/**
+ * Request and wait for a cl_lock covering the byte range [start, end] of
+ * \a eco, using the per-thread cl_io of \a env.
+ *
+ * On success the echo lock is linked on the client's ec_locks list (on
+ * first use it is also given a fresh cookie), its reference count is
+ * bumped and the cookie is returned through \a cookie.
+ *
+ * \param mode     LCK_PW requests a write lock, anything else a read lock
+ * \param cookie   out: handle to pass to cl_echo_cancel0()
+ * \param enqflags enqueue flags stored in the lock description
+ *
+ * \retval 0       lock granted
+ * \retval -ENOMEM cl_lock_request() returned NULL
+ * \retval other   error from cl_wait()
+ */
+static int cl_echo_enqueue0(struct lu_env *env, struct echo_object *eco,
+			    obd_off start, obd_off end, int mode,
+			    __u64 *cookie , __u32 enqflags)
+{
+	struct cl_io *io;
+	struct cl_lock *lck;
+	struct cl_object *obj;
+	struct cl_lock_descr *descr;
+	struct echo_thread_info *info;
+	int rc = -ENOMEM;
+	ENTRY;
+
+	info = echo_env_info(env);
+	io = &info->eti_io;
+	descr = &info->eti_descr;
+	obj = echo_obj2cl(eco);
+
+	descr->cld_obj   = obj;
+	descr->cld_start = cl_index(obj, start);
+	descr->cld_end   = cl_index(obj, end);
+	descr->cld_mode  = mode == LCK_PW ? CLM_WRITE : CLM_READ;
+	descr->cld_enq_flags = enqflags;
+	io->ci_obj = obj;
+
+	/* NOTE(review): only NULL is treated as failure here; confirm that
+	 * cl_lock_request() cannot return an ERR_PTR() value. */
+	lck = cl_lock_request(env, io, descr, "ec enqueue", eco);
+	if (lck) {
+		struct echo_client_obd *ec = eco->eo_dev->ed_ec;
+		struct echo_lock *el;
+
+		rc = cl_wait(env, lck);
+		if (rc == 0) {
+			el = cl2echo_lock(cl_lock_at(lck, &echo_device_type));
+			spin_lock(&ec->ec_lock);
+			if (list_empty(&el->el_chain)) {
+				list_add(&el->el_chain, &ec->ec_locks);
+				el->el_cookie = ++ec->ec_unique;
+			}
+			atomic_inc(&el->el_refcount);
+			*cookie = el->el_cookie;
+			spin_unlock(&ec->ec_lock);
+		} else {
+			/* release with the same scope/source pair the lock
+			 * was requested with ("ec enqueue", eco), matching
+			 * what echo_lock_release() does on the success path */
+			cl_lock_release(env, lck, "ec enqueue", eco);
+		}
+	}
+	RETURN(rc);
+}
+
+/**
+ * Enqueue a lock on [start, end] of \a eco from outside an existing io.
+ *
+ * Sets up a CIT_MISC cl_io on the per-thread state (ignoring layout),
+ * delegates to cl_echo_enqueue0() with no enqueue flags, and finalizes
+ * the io again.
+ *
+ * \param cookie out: handle identifying the granted lock
+ *
+ * \retval 0     lock granted
+ * \retval other error from cl_env_get(), cl_io_init() or the enqueue
+ */
+static int cl_echo_enqueue(struct echo_object *eco, obd_off start, obd_off end,
+			   int mode, __u64 *cookie)
+{
+	struct echo_thread_info *info;
+	struct lu_env *env;
+	struct cl_io *io;
+	int refcheck;
+	int result;
+	ENTRY;
+
+	env = cl_env_get(&refcheck);
+	if (IS_ERR(env))
+		RETURN(PTR_ERR(env));
+
+	info = echo_env_info(env);
+	io = &info->eti_io;
+
+	io->ci_ignore_layout = 1;
+	result = cl_io_init(env, io, CIT_MISC, echo_obj2cl(eco));
+	if (result < 0)
+		GOTO(out, result);
+	LASSERT(result == 0);
+
+	result = cl_echo_enqueue0(env, eco, start, end, mode, cookie, 0);
+	cl_io_fini(env, io);
+
+	EXIT;
+out:
+	cl_env_put(env, &refcheck);
+	return result;
+}
+
+/**
+ * Cancel one use of the lock identified by \a cookie.
+ *
+ * Looks the cookie up on the client's ec_locks list under ec_lock and
+ * drops one reference; the lock is unlinked from the list when that was
+ * the last one.  The cl_lock itself is released outside the spinlock.
+ *
+ * \retval 0       lock found and released
+ * \retval -ENOENT no lock with this cookie is on the list
+ */
+static int cl_echo_cancel0(struct lu_env *env, struct echo_device *ed,
+			   __u64 cookie)
+{
+	struct echo_client_obd *ec = ed->ed_ec;
+	struct echo_lock *match = NULL;
+	struct echo_lock *ecl;
+	int still_used = 0;
+	ENTRY;
+
+	LASSERT(ec != NULL);
+	spin_lock(&ec->ec_lock);
+	list_for_each_entry(ecl, &ec->ec_locks, el_chain) {
+		CDEBUG(D_INFO, "ecl: %p, cookie: "LPX64"\n", ecl, ecl->el_cookie);
+		if (ecl->el_cookie != cookie)
+			continue;
+
+		match = ecl;
+		if (atomic_dec_and_test(&ecl->el_refcount))
+			list_del_init(&ecl->el_chain);
+		else
+			still_used = 1;
+		break;
+	}
+	spin_unlock(&ec->ec_lock);
+
+	if (match == NULL)
+		RETURN(-ENOENT);
+
+	echo_lock_release(env, match, still_used);
+	RETURN(0);
+}
+
+/**
+ * Cancel the lock identified by \a cookie using a freshly obtained
+ * environment; thin wrapper around cl_echo_cancel0().
+ *
+ * \retval 0       lock found and released
+ * \retval -ENOENT unknown cookie
+ * \retval other   error obtaining an environment
+ */
+static int cl_echo_cancel(struct echo_device *ed, __u64 cookie)
+{
+	struct lu_env *env;
+	int refcheck;
+	int rc;
+	ENTRY;
+
+	env = cl_env_get(&refcheck);
+	if (IS_ERR(env))
+		RETURN(PTR_ERR(env));
+
+	rc = cl_echo_cancel0(env, ed, cookie);
+
+	cl_env_put(env, &refcheck);
+	RETURN(rc);
+}
+
+/**
+ * Queue every page of the incoming queue (c2_qin) for asynchronous
+ * write-out via cl_page_cache_add().
+ *
+ * All pages are attempted even after a failure; the request type argument
+ * is ignored and CRT_WRITE is always used.
+ *
+ * \retval 0     every page was added
+ * \retval other the first error returned by cl_page_cache_add()
+ */
+static int cl_echo_async_brw(const struct lu_env *env, struct cl_io *io,
+			     enum cl_req_type unused, struct cl_2queue *queue)
+{
+	struct cl_page *pg;
+	struct cl_page *next;
+	int first_err = 0;
+	ENTRY;
+
+	cl_page_list_for_each_safe(pg, next, &queue->c2_qin) {
+		int rc = cl_page_cache_add(env, io, pg, CRT_WRITE);
+
+		/* remember only the first failure */
+		if (rc != 0 && first_err == 0)
+			first_err = rc;
+	}
+	RETURN(first_err);
+}
+
+/**
+ * Read or write \a npages pages of \a eco starting at byte \a offset
+ * through the client I/O stack.
+ *
+ * The range is locked with cl_echo_enqueue0() (CEF_NEVER), each VM page
+ * is wrapped in a transient cl_page, owned and queued, and the queue is
+ * then either added to the cache (asynchronous writes) or submitted
+ * synchronously.  The lock is cancelled and the queue discarded, disowned
+ * and finalized before returning.
+ *
+ * \param rw     READ for a read, anything else for a write
+ * \param offset page-aligned byte offset
+ * \param async  request asynchronous submission; only honoured for writes
+ *
+ * \retval 0     success
+ * \retval other error from environment/io setup, locking, page setup or
+ *               the transfer itself
+ */
+static int cl_echo_object_brw(struct echo_object *eco, int rw, obd_off offset,
+			      struct page **pages, int npages, int async)
+{
+	struct lu_env	   *env;
+	struct echo_thread_info *info;
+	struct cl_object	*obj = echo_obj2cl(eco);
+	struct echo_device      *ed  = eco->eo_dev;
+	struct cl_2queue	*queue;
+	struct cl_io	    *io;
+	struct cl_page	  *clp;
+	struct lustre_handle    lh = { 0 };
+	int page_size = cl_page_size(obj);
+	int refcheck;
+	int rc;
+	int i;
+	ENTRY;
+
+	LASSERT((offset & ~CFS_PAGE_MASK) == 0);
+	LASSERT(ed->ed_next != NULL);
+	env = cl_env_get(&refcheck);
+	if (IS_ERR(env))
+		RETURN(PTR_ERR(env));
+
+	info    = echo_env_info(env);
+	io      = &info->eti_io;
+	queue   = &info->eti_queue;
+
+	cl_2queue_init(queue);
+
+	io->ci_ignore_layout = 1;
+	rc = cl_io_init(env, io, CIT_MISC, obj);
+	if (rc < 0)
+		GOTO(out, rc);
+	LASSERT(rc == 0);
+
+
+	rc = cl_echo_enqueue0(env, eco, offset,
+			      offset + npages * PAGE_CACHE_SIZE - 1,
+			      rw == READ ? LCK_PR : LCK_PW, &lh.cookie,
+			      CEF_NEVER);
+	if (rc < 0)
+		GOTO(error_lock, rc);
+
+	for (i = 0; i < npages; i++) {
+		LASSERT(pages[i]);
+		clp = cl_page_find(env, obj, cl_index(obj, offset),
+				   pages[i], CPT_TRANSIENT);
+		if (IS_ERR(clp)) {
+			rc = PTR_ERR(clp);
+			break;
+		}
+		LASSERT(clp->cp_type == CPT_TRANSIENT);
+
+		rc = cl_page_own(env, io, clp);
+		if (rc) {
+			LASSERT(clp->cp_state == CPS_FREEING);
+			cl_page_put(env, clp);
+			break;
+		}
+
+		cl_2queue_add(queue, clp);
+
+		/* drop the reference count for cl_page_find, so that the page
+		 * will be freed in cl_2queue_fini. */
+		cl_page_put(env, clp);
+		cl_page_clip(env, clp, 0, page_size);
+
+		offset += page_size;
+	}
+
+	if (rc == 0) {
+		enum cl_req_type typ = rw == READ ? CRT_READ : CRT_WRITE;
+
+		/* asynchronous submission is only supported for writes */
+		async = async && (typ == CRT_WRITE);
+		if (async)
+			rc = cl_echo_async_brw(env, io, typ, queue);
+		else
+			rc = cl_io_submit_sync(env, io, typ, queue, 0);
+		/* note: the message says "write" for reads as well */
+		CDEBUG(D_INFO, "echo_client %s write returns %d\n",
+		       async ? "async" : "sync", rc);
+	}
+
+	cl_echo_cancel0(env, ed, lh.cookie);
+	EXIT;
+error_lock:
+	cl_2queue_discard(env, io, queue);
+	cl_2queue_disown(env, io, queue);
+	cl_2queue_fini(env, queue);
+	cl_io_fini(env, io);
+out:
+	cl_env_put(env, &refcheck);
+	return rc;
+}
+/** @} echo_exports */
+
+
+/* NOTE(review): presumably the most recently assigned echo object id;
+ * its users are outside this part of the file - confirm. */
+static obd_id last_object_id;
+
+/**
+ * Copy an in-kernel stripe descriptor out to a user-space buffer.
+ *
+ * \param lsm      kernel stripe descriptor to export
+ * \param _ulsm    user-space destination, laid out as a lov_stripe_md
+ * \param ulsm_nob size in bytes of the user buffer
+ *
+ * \retval 0       success
+ * \retval -EINVAL user buffer too small for lsm_stripe_count entries
+ * \retval -EFAULT a copy to user space failed
+ */
+static int
+echo_copyout_lsm (struct lov_stripe_md *lsm, void *_ulsm, int ulsm_nob)
+{
+	struct lov_stripe_md *ulsm = _ulsm;
+	int nob, i;
+
+	nob = offsetof (struct lov_stripe_md, lsm_oinfo[lsm->lsm_stripe_count]);
+	if (nob > ulsm_nob)
+		return (-EINVAL);
+
+	/* Copy the whole descriptor, mirroring echo_copyin_lsm().
+	 * sizeof(ulsm) is only the size of a pointer and would truncate
+	 * the copy. */
+	if (copy_to_user (ulsm, lsm, sizeof(*ulsm)))
+		return (-EFAULT);
+
+	/* NOTE(review): ulsm->lsm_oinfo[i] is fetched through a user-space
+	 * pointer without get_user() - confirm this is intended. */
+	for (i = 0; i < lsm->lsm_stripe_count; i++) {
+		if (copy_to_user (ulsm->lsm_oinfo[i], lsm->lsm_oinfo[i],
+				      sizeof(lsm->lsm_oinfo[0])))
+			return (-EFAULT);
+	}
+	return 0;
+}
+
+/**
+ * Copy a stripe descriptor in from user space and sanity-check it.
+ *
+ * The descriptor is rejected when it names more stripes than the client
+ * allows (ec_nstripes), has the wrong magic, has a stripe size that is
+ * not page aligned, or when stripe size times stripe count exceeds ~0UL.
+ *
+ * \param lsm      kernel descriptor to fill
+ * \param ulsm     user-space source
+ * \param ulsm_nob size in bytes of the user buffer
+ *
+ * \retval 0       success
+ * \retval -EINVAL buffer too small or descriptor failed validation
+ * \retval -EFAULT a copy from user space failed
+ */
+static int
+echo_copyin_lsm (struct echo_device *ed, struct lov_stripe_md *lsm,
+		 void *ulsm, int ulsm_nob)
+{
+	struct echo_client_obd *ec = ed->ed_ec;
+	int		     i;
+
+	/* NOTE(review): signed/unsigned comparison - a negative ulsm_nob
+	 * converts to a large unsigned value and passes this check. */
+	if (ulsm_nob < sizeof (*lsm))
+		return (-EINVAL);
+
+	if (copy_from_user (lsm, ulsm, sizeof (*lsm)))
+		return (-EFAULT);
+
+	if (lsm->lsm_stripe_count > ec->ec_nstripes ||
+	    lsm->lsm_magic != LOV_MAGIC ||
+	    (lsm->lsm_stripe_size & (~CFS_PAGE_MASK)) != 0 ||
+	    ((__u64)lsm->lsm_stripe_size * lsm->lsm_stripe_count > ~0UL))
+		return (-EINVAL);
+
+
+	/* NOTE(review): the per-stripe source pointers are read directly
+	 * from the user-space structure - confirm this is intended. */
+	for (i = 0; i < lsm->lsm_stripe_count; i++) {
+		if (copy_from_user(lsm->lsm_oinfo[i],
+				       ((struct lov_stripe_md *)ulsm)-> \
+				       lsm_oinfo[i],
+				       sizeof(lsm->lsm_oinfo[0])))
+			return (-EFAULT);
+	}
+	return (0);
+}
+
+/**
+ * Format \a id as a decimal string into \a name and point \a lname at it.
+ *
+ * The caller must supply a buffer large enough for the formatted number
+ * and its terminating NUL; no bound is checked here.
+ */
+static inline void echo_md_build_name(struct lu_name *lname, char *name,
+				      __u64 id)
+{
+	/* sprintf() returns the number of characters written, which is
+	 * the length of the resulting string */
+	lname->ln_namelen = sprintf(name, LPU64, id);
+	lname->ln_name = name;
+}
+
+/**
+ * Fetch the LOV extended attribute of \a o into the per-thread "big lmm"
+ * buffer, growing that buffer when it is too small.
+ *
+ * Similar to mdt_attr_get_complex.  The attribute size is queried first
+ * with a null buffer; the buffer is then reallocated to the next power of
+ * two if needed and the attribute is read for real.
+ *
+ * \retval 0       success; ma_lmm/ma_lmm_size describe the attribute and
+ *                 MA_LOV is set in ma_valid
+ * \retval -ENOMEM the buffer could not be grown
+ * \retval other   negative error from mo_xattr_get()
+ */
+static int echo_big_lmm_get(const struct lu_env *env, struct md_object *o,
+			    struct md_attr *ma)
+{
+	struct echo_thread_info	*info = echo_env_info(env);
+	int			 rc;
+
+	ENTRY;
+
+	LASSERT(ma->ma_lmm_size > 0);
+
+	/* size probe: a null buffer makes the call report the needed size */
+	rc = mo_xattr_get(env, o, &LU_BUF_NULL, XATTR_NAME_LOV);
+	if (rc < 0)
+		RETURN(rc);
+
+	/* big_lmm may need to be grown */
+	if (info->eti_big_lmmsize < rc) {
+		int size = size_roundup_power2(rc);
+
+		if (info->eti_big_lmmsize > 0) {
+			/* free old buffer */
+			LASSERT(info->eti_big_lmm);
+			OBD_FREE_LARGE(info->eti_big_lmm,
+				       info->eti_big_lmmsize);
+			info->eti_big_lmm = NULL;
+			info->eti_big_lmmsize = 0;
+		}
+
+		OBD_ALLOC_LARGE(info->eti_big_lmm, size);
+		if (info->eti_big_lmm == NULL)
+			RETURN(-ENOMEM);
+		info->eti_big_lmmsize = size;
+	}
+	LASSERT(info->eti_big_lmmsize >= rc);
+
+	info->eti_buf.lb_buf = info->eti_big_lmm;
+	info->eti_buf.lb_len = info->eti_big_lmmsize;
+	rc = mo_xattr_get(env, o, &info->eti_buf, XATTR_NAME_LOV);
+	if (rc < 0)
+		RETURN(rc);
+
+	ma->ma_valid |= MA_LOV;
+	ma->ma_lmm = info->eti_big_lmm;
+	ma->ma_lmm_size = rc;
+
+	RETURN(0);
+}
+
+/**
+ * Collect the attributes requested in ma->ma_need for the object \a next.
+ *
+ * Handles, in order: inode attributes (MA_INODE), the LOV extended
+ * attribute for regular files and directories (MA_LOV, falling back to
+ * echo_big_lmm_get() when the caller's buffer is too small), and - when
+ * POSIX ACLs are configured - the default ACL of directories
+ * (MA_ACL_DEF).  ma->ma_valid reports what was actually obtained and
+ * ma->ma_need is restored before returning.
+ *
+ * \retval 0     success (missing LOV/ACL attributes are not an error)
+ * \retval other error from the attribute getters
+ */
+int echo_attr_get_complex(const struct lu_env *env, struct md_object *next,
+			  struct md_attr *ma)
+{
+	struct echo_thread_info	*info = echo_env_info(env);
+	struct lu_buf		*buf = &info->eti_buf;
+	umode_t		 mode = lu_object_attr(&next->mo_lu);
+	int			 need = ma->ma_need;
+	int			 rc = 0, rc2;
+
+	ENTRY;
+
+	ma->ma_valid = 0;
+
+	if (need & MA_INODE) {
+		/* ma_need is narrowed for this call and restored at "out" */
+		ma->ma_need = MA_INODE;
+		rc = mo_attr_get(env, next, ma);
+		if (rc)
+			GOTO(out, rc);
+		ma->ma_valid |= MA_INODE;
+	}
+
+	if (need & MA_LOV) {
+		if (S_ISREG(mode) || S_ISDIR(mode)) {
+			LASSERT(ma->ma_lmm_size > 0);
+			buf->lb_buf = ma->ma_lmm;
+			buf->lb_len = ma->ma_lmm_size;
+			rc2 = mo_xattr_get(env, next, buf, XATTR_NAME_LOV);
+			if (rc2 > 0) {
+				ma->ma_lmm_size = rc2;
+				ma->ma_valid |= MA_LOV;
+			} else if (rc2 == -ENODATA) {
+				/* no LOV EA */
+				ma->ma_lmm_size = 0;
+			} else if (rc2 == -ERANGE) {
+				/* caller's buffer too small: use the big one */
+				rc2 = echo_big_lmm_get(env, next, ma);
+				if (rc2 < 0)
+					GOTO(out, rc = rc2);
+			} else {
+				GOTO(out, rc = rc2);
+			}
+		}
+	}
+
+#ifdef CONFIG_FS_POSIX_ACL
+	if (need & MA_ACL_DEF && S_ISDIR(mode)) {
+		buf->lb_buf = ma->ma_acl;
+		buf->lb_len = ma->ma_acl_size;
+		rc2 = mo_xattr_get(env, next, buf, XATTR_NAME_ACL_DEFAULT);
+		if (rc2 > 0) {
+			ma->ma_acl_size = rc2;
+			ma->ma_valid |= MA_ACL_DEF;
+		} else if (rc2 == -ENODATA) {
+			/* no ACLs */
+			ma->ma_acl_size = 0;
+		} else {
+			GOTO(out, rc = rc2);
+		}
+	}
+#endif
+out:
+	ma->ma_need = need;
+	CDEBUG(D_INODE, "after getattr rc = %d, ma_valid = "LPX64" ma_lmm=%p\n",
+	       rc, ma->ma_valid, ma->ma_lmm);
+	RETURN(rc);
+}
+
+/**
+ * Create one metadata object named \a lname with FID \a fid under
+ * \a parent.
+ *
+ * The name is looked up first and must not exist.  The echo-layer object
+ * for \a fid is then instantiated (LOC_F_NEW), its slice in the next
+ * device's layer is located, and mdo_create() is called on it.
+ *
+ * Every exit goes through RETURN() so that the ENTRY trace stays balanced
+ * on the early-exit and error paths as well.
+ *
+ * \retval 0       object created
+ * \retval -EEXIST the name already exists under \a parent
+ * \retval -EINVAL the object has no slice in the next device's layer
+ * \retval other   error from mdo_lookup(), lu_object_find_at() or
+ *                 mdo_create()
+ */
+static int
+echo_md_create_internal(const struct lu_env *env, struct echo_device *ed,
+			struct md_object *parent, struct lu_fid *fid,
+			struct lu_name *lname, struct md_op_spec *spec,
+			struct md_attr *ma)
+{
+	struct lu_object	*ec_child, *child;
+	struct lu_device	*ld = ed->ed_next;
+	struct echo_thread_info *info = echo_env_info(env);
+	struct lu_fid		*fid2 = &info->eti_fid2;
+	struct lu_object_conf    conf = { .loc_flags = LOC_F_NEW };
+	int			 rc;
+
+	ENTRY;
+
+	rc = mdo_lookup(env, parent, lname, fid2, spec);
+	if (rc == 0)
+		RETURN(-EEXIST);
+	else if (rc != -ENOENT)
+		RETURN(rc);
+
+	ec_child = lu_object_find_at(env, &ed->ed_cl.cd_lu_dev,
+				     fid, &conf);
+	if (IS_ERR(ec_child)) {
+		CERROR("Can not find the child "DFID": rc = %ld\n", PFID(fid),
+			PTR_ERR(ec_child));
+		RETURN(PTR_ERR(ec_child));
+	}
+
+	child = lu_object_locate(ec_child->lo_header, ld->ld_type);
+	if (child == NULL) {
+		CERROR("Can not locate the child "DFID"\n", PFID(fid));
+		GOTO(out_put, rc = -EINVAL);
+	}
+
+	CDEBUG(D_RPCTRACE, "Start creating object "DFID" %s %p\n",
+	       PFID(lu_object_fid(&parent->mo_lu)), lname->ln_name, parent);
+
+	/*
+	 * Do not perform lookup sanity check. We know that name does not exist.
+	 */
+	spec->sp_cr_lookup = 0;
+	rc = mdo_create(env, parent, lname, lu2md(child), spec, ma);
+	if (rc) {
+		CERROR("Can not create child "DFID": rc = %d\n", PFID(fid), rc);
+		GOTO(out_put, rc);
+	}
+	CDEBUG(D_RPCTRACE, "End creating object "DFID" %s %p rc  = %d\n",
+	       PFID(lu_object_fid(&parent->mo_lu)), lname->ln_name, parent, rc);
+out_put:
+	/* drop the reference taken by lu_object_find_at() */
+	lu_object_put(env, ec_child);
+	RETURN(rc);
+}
+
+/*
+ * Point @ma at the per-thread LOV EA buffer that fits device @ld.
+ *
+ * When the device type is named LUSTRE_MDD_NAME the big buffer
+ * (eti_big_lmm / eti_big_lmmsize) is used and must already have a
+ * non-zero size; every other device type gets the embedded eti_lmm.
+ * Always returns 0.
+ */
+static int echo_set_lmm_size(const struct lu_env *env, struct lu_device *ld,
+			     struct md_attr *ma)
+{
+	struct echo_thread_info *eti = echo_env_info(env);
+	int is_mdd = (strcmp(ld->ld_type->ldt_name, LUSTRE_MDD_NAME) == 0);
+
+	if (is_mdd) {
+		LASSERT(eti->eti_big_lmmsize);
+		ma->ma_lmm = eti->eti_big_lmm;
+		ma->ma_lmm_size = eti->eti_big_lmmsize;
+		return 0;
+	}
+
+	ma->ma_lmm = (void *)&eti->eti_lmm;
+	ma->ma_lmm_size = sizeof(eti->eti_lmm);
+	return 0;
+}
+
+/*
+ * Create MD objects below @ec_parent.
+ *
+ * With a non-NULL @name exactly one object of that name is created for
+ * @fid.  Otherwise @count objects are created, each named from the
+ * running @id via echo_md_build_name(); @id and fid->f_oid advance by one
+ * after every success and the first failure stops the run.
+ *
+ * A non-zero @stripe_count requests FMODE_WRITE and attaches an LOV EA
+ * buffer; unless it is -1, a LOV_USER_MAGIC_V3 layout carrying
+ * @stripe_count and @stripe_offset is passed as creation EA as well.
+ *
+ * Returns 0 on success, -1 when @ec_parent is NULL, -ENXIO when the
+ * parent has no slice for the next device, or the creation error.
+ */
+static int echo_create_md_object(const struct lu_env *env,
+				 struct echo_device *ed,
+				 struct lu_object *ec_parent,
+				 struct lu_fid *fid,
+				 char *name, int namelen,
+				 __u64 id, __u32 mode, int count,
+				 int stripe_count, int stripe_offset)
+{
+	struct echo_thread_info *eti = echo_env_info(env);
+	struct lu_device	*next = ed->ed_next;
+	struct md_op_spec	*spec = &eti->eti_spec;
+	struct lu_name		*lname = &eti->eti_lname;
+	struct md_attr		*ma = &eti->eti_ma;
+	struct lu_object	*parent;
+	int			 rc = 0;
+	int			 n;
+
+	ENTRY;
+
+	if (ec_parent == NULL)
+		return -1;
+	parent = lu_object_locate(ec_parent->lo_header, next->ld_type);
+	if (parent == NULL)
+		RETURN(-ENXIO);
+
+	memset(spec, 0, sizeof(*spec));
+	memset(ma, 0, sizeof(*ma));
+
+	/* Any striping request needs write mode plus room for the LOV EA. */
+	if (stripe_count != 0) {
+		spec->sp_cr_flags |= FMODE_WRITE;
+		echo_set_lmm_size(env, next, ma);
+	}
+
+	/* An explicit stripe count additionally ships a user layout. */
+	if (stripe_count != 0 && stripe_count != -1) {
+		struct lov_user_md_v3 *layout = &eti->eti_lum;
+
+		layout->lmm_magic = LOV_USER_MAGIC_V3;
+		layout->lmm_stripe_count = stripe_count;
+		layout->lmm_stripe_offset = stripe_offset;
+		layout->lmm_pattern = 0;
+		spec->u.sp_ea.eadata = layout;
+		spec->u.sp_ea.eadatalen = sizeof(*layout);
+		spec->sp_cr_flags |= MDS_OPEN_HAS_EA;
+	}
+
+	ma->ma_attr.la_valid = LA_CTIME | LA_MODE;
+	ma->ma_attr.la_mode = mode;
+	ma->ma_attr.la_ctime = cfs_time_current_64();
+
+	/* Named request: one object, no id sequence. */
+	if (name != NULL) {
+		lname->ln_name = name;
+		lname->ln_namelen = namelen;
+		rc = echo_md_create_internal(env, ed, lu2md(parent), fid, lname,
+					     spec, ma);
+		RETURN(rc);
+	}
+
+	/* Id-sequenced request: the step expression only runs on success. */
+	for (n = 0; n < count; n++, id++, fid->f_oid++) {
+		echo_md_build_name(lname, eti->eti_name, id);
+
+		rc = echo_md_create_internal(env, ed, lu2md(parent), fid, lname,
+					     spec, ma);
+		if (rc != 0) {
+			CERROR("Can not create child %s: rc = %d\n",
+				eti->eti_name, rc);
+			break;
+		}
+	}
+
+	RETURN(rc);
+}
+
+/*
+ * Look up @lname under @parent and return the echo-level object for it.
+ *
+ * The FID found by mdo_lookup() is stored in the per-thread eti_fid and
+ * then resolved on the echo device with lu_object_find_at().
+ *
+ * Returns whatever lu_object_find_at() yields, or ERR_PTR(rc) when the
+ * lookup itself fails.  A non-error result is released by the callers in
+ * this file with lu_object_put().
+ */
+static struct lu_object *echo_md_lookup(const struct lu_env *env,
+					struct echo_device *ed,
+					struct md_object *parent,
+					struct lu_name *lname)
+{
+	struct echo_thread_info *info = echo_env_info(env);
+	struct lu_fid	   *fid = &info->eti_fid;
+	struct lu_object	*child;
+	int    rc;
+	ENTRY;
+
+	/*
+	 * Print the parent's own FID: @fid is only the output slot of the
+	 * lookup below and still holds whatever a previous call left there.
+	 */
+	CDEBUG(D_INFO, "lookup %s in parent "DFID" %p\n", lname->ln_name,
+	       PFID(lu_object_fid(&parent->mo_lu)), parent);
+	rc = mdo_lookup(env, parent, lname, fid, NULL);
+	if (rc) {
+		CERROR("lookup %s: rc = %d\n", lname->ln_name, rc);
+		RETURN(ERR_PTR(rc));
+	}
+
+	/* In the function below, .hs_keycmp resolves to
+	 * lu_obj_hop_keycmp() */
+	/* coverity[overrun-buffer-val] */
+	child = lu_object_find_at(env, &ed->ed_cl.cd_lu_dev, fid, NULL);
+
+	RETURN(child);
+}
+
+/*
+ * Set a test extended attribute on @count children of @ec_parent.
+ *
+ * Children are addressed by names built from the running @id.  For each
+ * one the per-thread eti_xattr_buf is stored under the attribute name
+ * XATTR_USER_PREFIX ".test1" with LU_XATTR_CREATE.  The run stops at the
+ * first failure.
+ *
+ * Returns the last result of mo_xattr_set() (0 if @count <= 0), -1 when
+ * @ec_parent is NULL, -ENXIO / -EINVAL when a slice for the next device
+ * is missing, or the lookup error.
+ */
+static int echo_setattr_object(const struct lu_env *env,
+			       struct echo_device *ed,
+			       struct lu_object *ec_parent,
+			       __u64 id, int count)
+{
+	struct echo_thread_info *eti = echo_env_info(env);
+	struct lu_device	*next = ed->ed_next;
+	struct lu_name		*lname = &eti->eti_lname;
+	struct lu_buf		*xbuf = &eti->eti_buf;
+	char			*scratch = eti->eti_name;
+	struct lu_object	*parent;
+	int			 rc = 0;
+	int			 n;
+
+	ENTRY;
+
+	if (ec_parent == NULL)
+		return -1;
+	parent = lu_object_locate(ec_parent->lo_header, next->ld_type);
+	if (parent == NULL)
+		RETURN(-ENXIO);
+
+	for (n = 0; n < count; n++) {
+		struct lu_object *top;
+		struct lu_object *slice;
+
+		echo_md_build_name(lname, scratch, id);
+
+		top = echo_md_lookup(env, ed, lu2md(parent), lname);
+		if (IS_ERR(top)) {
+			CERROR("Can't find child %s: rc = %ld\n",
+				lname->ln_name, PTR_ERR(top));
+			RETURN(PTR_ERR(top));
+		}
+
+		slice = lu_object_locate(top->lo_header, next->ld_type);
+		if (slice == NULL) {
+			CERROR("Can not locate the child %s\n", lname->ln_name);
+			lu_object_put(env, top);
+			rc = -EINVAL;
+			break;
+		}
+
+		CDEBUG(D_RPCTRACE, "Start setattr object "DFID"\n",
+		       PFID(lu_object_fid(slice)));
+
+		xbuf->lb_buf = eti->eti_xattr_buf;
+		xbuf->lb_len = sizeof(eti->eti_xattr_buf);
+
+		/* The scratch name buffer is reused for the xattr name. */
+		sprintf(scratch, "%s.test1", XATTR_USER_PREFIX);
+		rc = mo_xattr_set(env, lu2md(slice), xbuf, scratch,
+				  LU_XATTR_CREATE);
+		if (rc < 0) {
+			CERROR("Can not setattr child "DFID": rc = %d\n",
+				PFID(lu_object_fid(slice)), rc);
+		} else {
+			CDEBUG(D_RPCTRACE, "End setattr object "DFID"\n",
+			       PFID(lu_object_fid(slice)));
+			id++;
+		}
+
+		/* The reference goes away on success and failure alike. */
+		lu_object_put(env, top);
+		if (rc < 0)
+			break;
+	}
+	RETURN(rc);
+}
+
+/*
+ * Fetch attributes of @count children of @ec_parent.
+ *
+ * Children are addressed by names built from the running @id.  Each one
+ * is queried through echo_attr_get_complex() with MA_INODE, MA_LOV,
+ * MA_PFID, MA_HSM and MA_ACL_DEF requested; eti_xattr_buf serves as the
+ * ACL buffer.  The run stops at the first failure.
+ *
+ * Returns the last getattr result (0 if @count <= 0), -1 when
+ * @ec_parent is NULL, -ENXIO / -EINVAL when a slice for the next device
+ * is missing, or the lookup error.
+ */
+static int echo_getattr_object(const struct lu_env *env,
+			       struct echo_device *ed,
+			       struct lu_object *ec_parent,
+			       __u64 id, int count)
+{
+	struct echo_thread_info *eti = echo_env_info(env);
+	struct lu_device	*next = ed->ed_next;
+	struct lu_name		*lname = &eti->eti_lname;
+	struct md_attr		*ma = &eti->eti_ma;
+	char			*scratch = eti->eti_name;
+	struct lu_object	*parent;
+	int			 rc = 0;
+	int			 n;
+
+	ENTRY;
+
+	if (ec_parent == NULL)
+		return -1;
+	parent = lu_object_locate(ec_parent->lo_header, next->ld_type);
+	if (parent == NULL)
+		RETURN(-ENXIO);
+
+	memset(ma, 0, sizeof(*ma));
+	ma->ma_need |= MA_INODE | MA_LOV | MA_PFID | MA_HSM | MA_ACL_DEF;
+	ma->ma_acl_size = sizeof(eti->eti_xattr_buf);
+	ma->ma_acl = eti->eti_xattr_buf;
+
+	for (n = 0; n < count; n++) {
+		struct lu_object *top;
+		struct lu_object *slice;
+
+		/* Start every child with a clean result mask and buffer. */
+		ma->ma_valid = 0;
+		echo_md_build_name(lname, scratch, id);
+		echo_set_lmm_size(env, next, ma);
+
+		top = echo_md_lookup(env, ed, lu2md(parent), lname);
+		if (IS_ERR(top)) {
+			CERROR("Can't find child %s: rc = %ld\n",
+			       lname->ln_name, PTR_ERR(top));
+			RETURN(PTR_ERR(top));
+		}
+
+		slice = lu_object_locate(top->lo_header, next->ld_type);
+		if (slice == NULL) {
+			CERROR("Can not locate the child %s\n", lname->ln_name);
+			lu_object_put(env, top);
+			RETURN(-EINVAL);
+		}
+
+		CDEBUG(D_RPCTRACE, "Start getattr object "DFID"\n",
+		       PFID(lu_object_fid(slice)));
+		rc = echo_attr_get_complex(env, lu2md(slice), ma);
+		if (rc != 0) {
+			CERROR("Can not getattr child "DFID": rc = %d\n",
+				PFID(lu_object_fid(slice)), rc);
+		} else {
+			CDEBUG(D_RPCTRACE, "End getattr object "DFID"\n",
+			       PFID(lu_object_fid(slice)));
+			id++;
+		}
+
+		/* The reference goes away on success and failure alike. */
+		lu_object_put(env, top);
+		if (rc != 0)
+			break;
+	}
+
+	RETURN(rc);
+}
+
+/*
+ * Look up @count names, built from the running @id, under @ec_parent.
+ *
+ * Only mdo_lookup() is issued; the resulting FID lands in the per-thread
+ * eti_fid and no object reference is taken.  The run stops at the first
+ * failure.
+ *
+ * Returns 0 when every lookup succeeds, -1 when @ec_parent is NULL,
+ * -ENXIO when the parent has no slice for the next device, or the
+ * lookup error.
+ */
+static int echo_lookup_object(const struct lu_env *env,
+			      struct echo_device *ed,
+			      struct lu_object *ec_parent,
+			      __u64 id, int count)
+{
+	struct echo_thread_info *eti = echo_env_info(env);
+	struct lu_device	*next = ed->ed_next;
+	struct lu_name		*lname = &eti->eti_lname;
+	struct lu_fid		*found = &eti->eti_fid;
+	char			*scratch = eti->eti_name;
+	struct lu_object	*parent;
+	int			 rc = 0;
+	int			 n;
+
+	if (ec_parent == NULL)
+		return -1;
+	parent = lu_object_locate(ec_parent->lo_header, next->ld_type);
+	if (parent == NULL)
+		return -ENXIO;
+
+	/* The id only advances after a successful lookup. */
+	for (n = 0; n < count; n++, id++) {
+		echo_md_build_name(lname, scratch, id);
+
+		CDEBUG(D_RPCTRACE, "Start lookup object "DFID" %s %p\n",
+		       PFID(lu_object_fid(parent)), lname->ln_name, parent);
+
+		rc = mdo_lookup(env, lu2md(parent), lname, found, NULL);
+		if (rc != 0) {
+			CERROR("Can not lookup child %s: rc = %d\n", scratch,
+			       rc);
+			break;
+		}
+		CDEBUG(D_RPCTRACE, "End lookup object "DFID" %s %p\n",
+		       PFID(lu_object_fid(parent)), lname->ln_name, parent);
+	}
+	return rc;
+}
+
+/*
+ * Unlink the entry @lname from @parent.
+ *
+ * The child is resolved through echo_md_lookup(), its slice for the next
+ * device is located and mdo_unlink() is issued on it.  The reference
+ * obtained by the lookup is dropped before returning.
+ *
+ * Returns 0 on success, -EINVAL when the slice is missing, or the
+ * lookup / unlink error.
+ */
+static int echo_md_destroy_internal(const struct lu_env *env,
+				    struct echo_device *ed,
+				    struct md_object *parent,
+				    struct lu_name *lname,
+				    struct md_attr *ma)
+{
+	struct lu_device	*next = ed->ed_next;
+	struct lu_object	*top;
+	struct lu_object	*slice;
+	int			 rc;
+
+	ENTRY;
+
+	top = echo_md_lookup(env, ed, parent, lname);
+	if (IS_ERR(top)) {
+		CERROR("Can't find child %s: rc = %ld\n", lname->ln_name,
+			PTR_ERR(top));
+		RETURN(PTR_ERR(top));
+	}
+
+	slice = lu_object_locate(top->lo_header, next->ld_type);
+	if (slice == NULL) {
+		CERROR("Can not locate the child %s\n", lname->ln_name);
+		GOTO(out_put, rc = -EINVAL);
+	}
+
+	CDEBUG(D_RPCTRACE, "Start destroy object "DFID" %s %p\n",
+	       PFID(lu_object_fid(&parent->mo_lu)), lname->ln_name, parent);
+
+	rc = mdo_unlink(env, parent, lu2md(slice), lname, ma, 0);
+	if (rc != 0) {
+		CERROR("Can not unlink child %s: rc = %d\n",
+			lname->ln_name, rc);
+		GOTO(out_put, rc);
+	}
+	CDEBUG(D_RPCTRACE, "End destroy object "DFID" %s %p\n",
+	       PFID(lu_object_fid(&parent->mo_lu)), lname->ln_name, parent);
+out_put:
+	/* Release the reference taken by the lookup. */
+	lu_object_put(env, top);
+	return rc;
+}
+
+/*
+ * Unlink MD objects below @ec_parent.
+ *
+ * With a non-NULL @name exactly that entry is removed.  Otherwise @count
+ * entries are removed, each named from the running @id via
+ * echo_md_build_name(); the first failure stops the run.
+ *
+ * Returns 0 on success, -EINVAL when @ec_parent is NULL or has no slice
+ * for the next device, or the error from echo_md_destroy_internal().
+ */
+static int echo_destroy_object(const struct lu_env *env,
+			       struct echo_device *ed,
+			       struct lu_object *ec_parent,
+			       char *name, int namelen,
+			       __u64 id, __u32 mode,
+			       int count)
+{
+	struct echo_thread_info *info = echo_env_info(env);
+	struct lu_name	  *lname = &info->eti_lname;
+	struct md_attr	  *ma = &info->eti_ma;
+	struct lu_device	*ld = ed->ed_next;
+	struct lu_object	*parent;
+	int		      rc = 0;
+	int		      i;
+	ENTRY;
+
+	/* Same guard as the sibling helpers: never dereference a NULL parent. */
+	if (ec_parent == NULL)
+		RETURN(-EINVAL);
+
+	parent = lu_object_locate(ec_parent->lo_header, ld->ld_type);
+	if (parent == NULL)
+		RETURN(-EINVAL);
+
+	memset(ma, 0, sizeof(*ma));
+	ma->ma_attr.la_mode = mode;
+	ma->ma_attr.la_valid = LA_CTIME;
+	ma->ma_attr.la_ctime = cfs_time_current_64();
+	ma->ma_need = MA_INODE;
+	ma->ma_valid = 0;
+
+	/* Named request: remove that single entry. */
+	if (name != NULL) {
+		lname->ln_name = name;
+		lname->ln_namelen = namelen;
+		rc = echo_md_destroy_internal(env, ed, lu2md(parent), lname,
+					      ma);
+		RETURN(rc);
+	}
+
+	/* Id-sequenced request: remove @count generated names. */
+	for (i = 0; i < count; i++) {
+		char *tmp_name = info->eti_name;
+
+		ma->ma_valid = 0;
+		echo_md_build_name(lname, tmp_name, id);
+
+		rc = echo_md_destroy_internal(env, ed, lu2md(parent), lname,
+					      ma);
+		if (rc) {
+			/*
+			 * @name is always NULL on this path; report the
+			 * generated name instead.
+			 */
+			CERROR("Can not unlink child %s: rc = %d\n",
+			       tmp_name, rc);
+			break;
+		}
+		id++;
+	}
+
+	RETURN(rc);
+}
+
+/*
+ * Resolve a '/'-separated @path, starting from the root reported by the
+ * next MD device, to an echo-level object.
+ *
+ * @path is tokenised in place with strsep(), so the caller's buffer is
+ * modified.  Empty components (leading or repeated '/') are skipped.
+ * @path_len is not used.
+ *
+ * Returns the object for the final component (or the root when @path has
+ * no non-empty component), which the caller releases with
+ * lu_object_put(); ERR_PTR() on failure, in which case no reference is
+ * left held.
+ */
+static struct lu_object *echo_resolve_path(const struct lu_env *env,
+					   struct echo_device *ed, char *path,
+					   int path_len)
+{
+	struct lu_device	*ld = ed->ed_next;
+	struct md_device	*md = lu2md_dev(ld);
+	struct echo_thread_info *info = echo_env_info(env);
+	struct lu_fid	   *fid = &info->eti_fid;
+	struct lu_name	  *lname = &info->eti_lname;
+	struct lu_object	*parent = NULL;
+	struct lu_object	*child = NULL;
+	int rc = 0;
+	ENTRY;
+
+	/*Only support MDD layer right now*/
+	rc = md->md_ops->mdo_root_get(env, md, fid);
+	if (rc) {
+		CERROR("get root error: rc = %d\n", rc);
+		RETURN(ERR_PTR(rc));
+	}
+
+	/* In the function below, .hs_keycmp resolves to
+	 * lu_obj_hop_keycmp() */
+	/* coverity[overrun-buffer-val] */
+	parent = lu_object_find_at(env, &ed->ed_cl.cd_lu_dev, fid, NULL);
+	if (IS_ERR(parent)) {
+		CERROR("Can not find the parent "DFID": rc = %ld\n",
+			PFID(fid), PTR_ERR(parent));
+		RETURN(parent);
+	}
+
+	while (1) {
+		struct lu_object *ld_parent;
+		char *e;
+
+		e = strsep(&path, "/");
+		if (e == NULL)
+			break;
+
+		/* Empty component: stop at end of input, otherwise skip it. */
+		if (e[0] == 0) {
+			if (!path || path[0] == '\0')
+				break;
+			continue;
+		}
+
+		lname->ln_name = e;
+		lname->ln_namelen = strlen(e);
+
+		ld_parent = lu_object_locate(parent->lo_header, ld->ld_type);
+		if (ld_parent == NULL) {
+			lu_object_put(env, parent);
+			rc = -EINVAL;
+			break;
+		}
+
+		child = echo_md_lookup(env, ed, lu2md(ld_parent), lname);
+		if (IS_ERR(child)) {
+			rc = (int)PTR_ERR(child);
+			/*
+			 * Log while the parent reference is still held:
+			 * ld_parent is part of that object and must not be
+			 * touched once the reference is gone.
+			 */
+			CERROR("lookup %s under parent "DFID": rc = %d\n",
+				lname->ln_name, PFID(lu_object_fid(ld_parent)),
+				rc);
+			lu_object_put(env, parent);
+			break;
+		}
+
+		/* Step down: the child becomes the parent of the next round. */
+		lu_object_put(env, parent);
+		parent = child;
+	}
+	if (rc)
+		RETURN(ERR_PTR(rc));
+
+	RETURN(parent);
+}
+
+/*
+ * Fill the ucred attached to @env from the calling task.
+ *
+ * The record is marked UCRED_INVALID while it is being written and
+ * UCRED_NEW once complete.  Both supplementary gid slots are set to -1,
+ * the uid/gid/fsuid/fsgid pairs mirror the current task, and filesystem
+ * capabilities are masked out when the fsuid is non-zero.
+ */
+static void echo_ucred_init(struct lu_env *env)
+{
+	struct lu_ucred *uc = lu_ucred(env);
+
+	uc->uc_valid = UCRED_INVALID;
+
+	uc->uc_suppgids[0] = -1;
+	uc->uc_suppgids[1] = -1;
+
+	/* Original ("o_") and effective fields start out identical. */
+	uc->uc_o_uid = current_uid();
+	uc->uc_uid = uc->uc_o_uid;
+	uc->uc_o_gid = current_gid();
+	uc->uc_gid = uc->uc_o_gid;
+	uc->uc_o_fsuid = current_fsuid();
+	uc->uc_fsuid = uc->uc_o_fsuid;
+	uc->uc_o_fsgid = current_fsgid();
+	uc->uc_fsgid = uc->uc_o_fsgid;
+	uc->uc_cap = cfs_curproc_cap_pack();
+
+	/* remove fs privilege for non-root user. */
+	if (uc->uc_fsuid)
+		uc->uc_cap &= ~CFS_CAP_FS_MASK;
+
+	uc->uc_valid = UCRED_NEW;
+}
+
+/* Put the ucred attached to @env back into the UCRED_INIT state. */
+static void echo_ucred_fini(struct lu_env *env)
+{
+	lu_ucred(env)->uc_valid = UCRED_INIT;
+}
+
+/* Context and session tags handed to lu_env_refill_by_tags() for MD echo work. */
+#define ECHO_MD_CTX_TAG (LCT_REMEMBER | LCT_MD_THREAD)
+#define ECHO_MD_SES_TAG (LCT_REMEMBER | LCT_SESSION)
+/*
+ * Execute one MD echo @command on objects below the directory @path.
+ *
+ * @path is resolved with echo_resolve_path(); an optional object name of
+ * data->ioc_plen2 bytes is copied in from the user pointer
+ * data->ioc_pbuf2.  The command is then dispatched to the matching
+ * create / destroy / lookup / getattr / setattr helper with @id and
+ * @count.
+ *
+ * Only next devices whose type is named LUSTRE_MDD_NAME are accepted.
+ * Returns 0 or a negative errno.
+ */
+static int echo_md_handler(struct echo_device *ed, int command,
+			   char *path, int path_len, __u64 id, int count,
+			   struct obd_ioctl_data *data)
+{
+	struct echo_thread_info *info;
+	struct lu_device      *ld = ed->ed_next;
+	struct lu_env	 *env;
+	int		    refcheck;
+	struct lu_object      *parent;
+	char		  *name = NULL;
+	int		    namelen = data->ioc_plen2;
+	int		    rc = 0;
+	ENTRY;
+
+	if (ld == NULL) {
+		CERROR("MD echo client is not being initialized properly\n");
+		RETURN(-EINVAL);
+	}
+
+	if (strcmp(ld->ld_type->ldt_name, LUSTRE_MDD_NAME)) {
+		CERROR("Only support MDD layer right now!\n");
+		RETURN(-EINVAL);
+	}
+
+	env = cl_env_get(&refcheck);
+	if (IS_ERR(env))
+		RETURN(PTR_ERR(env));
+
+	rc = lu_env_refill_by_tags(env, ECHO_MD_CTX_TAG, ECHO_MD_SES_TAG);
+	if (rc != 0)
+		GOTO(out_env, rc);
+
+	/* init big_lmm buffer */
+	info = echo_env_info(env);
+	LASSERT(info->eti_big_lmm == NULL);
+	OBD_ALLOC_LARGE(info->eti_big_lmm, MIN_MD_SIZE);
+	if (info->eti_big_lmm == NULL)
+		GOTO(out_env, rc = -ENOMEM);
+	info->eti_big_lmmsize = MIN_MD_SIZE;
+
+	/*
+	 * echo_resolve_path() splits @path in place with strsep(), so the
+	 * error message below shows the buffer only up to the first
+	 * separator that was overwritten.
+	 */
+	parent = echo_resolve_path(env, ed, path, path_len);
+	if (IS_ERR(parent)) {
+		CERROR("Can not resolve the path %s: rc = %ld\n", path,
+			PTR_ERR(parent));
+		GOTO(out_free, rc = PTR_ERR(parent));
+	}
+
+	/*
+	 * Optional object name from user space.  One extra byte is allocated
+	 * beyond the copied length.
+	 * NOTE(review): presumably OBD_ALLOC zero-fills, which would keep the
+	 * name NUL-terminated - confirm.
+	 */
+	if (namelen > 0) {
+		OBD_ALLOC(name, namelen + 1);
+		if (name == NULL)
+			GOTO(out_put, rc = -ENOMEM);
+		if (copy_from_user(name, data->ioc_pbuf2, namelen))
+			GOTO(out_name, rc = -EFAULT);
+	}
+
+	/* Credentials are valid only for the duration of the dispatch. */
+	echo_ucred_init(env);
+
+	switch (command) {
+	case ECHO_MD_CREATE:
+	case ECHO_MD_MKDIR: {
+		/* This inner "info" shadows the function-level one; both come
+		 * from echo_env_info(env). */
+		struct echo_thread_info *info = echo_env_info(env);
+		__u32 mode = data->ioc_obdo2.o_mode;
+		struct lu_fid *fid = &info->eti_fid;
+		int stripe_count = (int)data->ioc_obdo2.o_misc;
+		int stripe_index = (int)data->ioc_obdo2.o_stripe_idx;
+
+		/* The FID of the first new object comes from ioc_obdo1. */
+		rc = ostid_to_fid(fid, &data->ioc_obdo1.o_oi, 0);
+		if (rc != 0)
+			break;
+
+		/* In the function below, .hs_keycmp resolves to
+		 * lu_obj_hop_keycmp() */
+		/* coverity[overrun-buffer-val] */
+		rc = echo_create_md_object(env, ed, parent, fid, name, namelen,
+					   id, mode, count, stripe_count,
+					   stripe_index);
+		break;
+	}
+	case ECHO_MD_DESTROY:
+	case ECHO_MD_RMDIR: {
+		__u32 mode = data->ioc_obdo2.o_mode;
+
+		rc = echo_destroy_object(env, ed, parent, name, namelen,
+					 id, mode, count);
+		break;
+	}
+	case ECHO_MD_LOOKUP:
+		rc = echo_lookup_object(env, ed, parent, id, count);
+		break;
+	case ECHO_MD_GETATTR:
+		rc = echo_getattr_object(env, ed, parent, id, count);
+		break;
+	case ECHO_MD_SETATTR:
+		rc = echo_setattr_object(env, ed, parent, id, count);
+		break;
+	default:
+		CERROR("unknown command %d\n", command);
+		rc = -EINVAL;
+		break;
+	}
+	echo_ucred_fini(env);
+
+	/* Cleanup labels undo the setup steps in reverse order. */
+out_name:
+	if (name != NULL)
+		OBD_FREE(name, namelen + 1);
+out_put:
+	lu_object_put(env, parent);
+out_free:
+	LASSERT(info->eti_big_lmm);
+	OBD_FREE_LARGE(info->eti_big_lmm, info->eti_big_lmmsize);
+	info->eti_big_lmm = NULL;
+	info->eti_big_lmmsize = 0;
+out_env:
+	cl_env_put(env, &refcheck);
+	return rc;
+}
+
+/*
+ * Create an echo object described by @oa.
+ *
+ * A stripe descriptor is allocated and, when @ulsm is non-NULL, filled
+ * from it via echo_copyin_lsm().  With @on_target set the object is
+ * created through obd_create() on ec->ec_exp.  The resulting object id
+ * is stored back into @oa (OBD_MD_FLID is set), and the matching echo
+ * object is looked up once and released again.
+ *
+ * Returns 0 on success or a negative errno; if a later step fails after
+ * obd_create() succeeded, the object is destroyed again.
+ */
+static int echo_create_object(const struct lu_env *env, struct echo_device *ed,
+			      int on_target, struct obdo *oa, void *ulsm,
+			      int ulsm_nob, struct obd_trans_info *oti)
+{
+	struct echo_object     *eco;
+	struct echo_client_obd *ec = ed->ed_ec;
+	struct lov_stripe_md   *lsm = NULL;
+	int		     rc;
+	int		     created = 0;
+	ENTRY;
+
+	/* An explicit id is mandatory for on-target creation and whenever
+	 * ec_nstripes is non-zero. */
+	if ((oa->o_valid & OBD_MD_FLID) == 0 && /* no obj id */
+	    (on_target ||		       /* set_stripe */
+	     ec->ec_nstripes != 0)) {	   /* LOV */
+		CERROR ("No valid oid\n");
+		RETURN(-EINVAL);
+	}
+
+	rc = echo_alloc_memmd(ed, &lsm);
+	if (rc < 0) {
+		CERROR("Cannot allocate md: rc = %d\n", rc);
+		GOTO(failed, rc);
+	}
+
+	if (ulsm != NULL) {
+		int i, idx;
+
+		rc = echo_copyin_lsm (ed, lsm, ulsm, ulsm_nob);
+		if (rc != 0)
+			GOTO(failed, rc);
+
+		/* Fill in defaults for fields the caller left at zero. */
+		if (lsm->lsm_stripe_count == 0)
+			lsm->lsm_stripe_count = ec->ec_nstripes;
+
+		if (lsm->lsm_stripe_size == 0)
+			lsm->lsm_stripe_size = PAGE_CACHE_SIZE;
+
+		/* Random starting index for the stripe-to-OST assignment. */
+		idx = cfs_rand();
+
+		/* setup stripes: indices + default ids if required */
+		for (i = 0; i < lsm->lsm_stripe_count; i++) {
+			if (ostid_id(&lsm->lsm_oinfo[i]->loi_oi) == 0)
+				lsm->lsm_oinfo[i]->loi_oi = lsm->lsm_oi;
+
+			/* NOTE(review): divides by ec_nstripes; presumably
+			 * non-zero whenever a ulsm is supplied - confirm. */
+			lsm->lsm_oinfo[i]->loi_ost_idx =
+				(idx + i) % ec->ec_nstripes;
+		}
+	}
+
+	/* setup object ID here for !on_target and LOV hint */
+	if (oa->o_valid & OBD_MD_FLID) {
+		LASSERT(oa->o_valid & OBD_MD_FLGROUP);
+		lsm->lsm_oi = oa->o_oi;
+	}
+
+	/* Still no id: take the next value of the last_object_id counter. */
+	if (ostid_id(&lsm->lsm_oi) == 0)
+		ostid_set_id(&lsm->lsm_oi, ++last_object_id);
+
+	rc = 0;
+	if (on_target) {
+		/* Only echo objects are allowed to be created */
+		LASSERT((oa->o_valid & OBD_MD_FLGROUP) &&
+			(ostid_seq(&oa->o_oi) == FID_SEQ_ECHO));
+		rc = obd_create(env, ec->ec_exp, oa, &lsm, oti);
+		if (rc != 0) {
+			CERROR("Cannot create objects: rc = %d\n", rc);
+			GOTO(failed, rc);
+		}
+		/* Remember that a failure from here on needs obd_destroy(). */
+		created = 1;
+	}
+
+	/* See what object ID we were given */
+	oa->o_oi = lsm->lsm_oi;
+	oa->o_valid |= OBD_MD_FLID;
+
+	/* Look the echo object up once and drop the reference right away. */
+	eco = cl_echo_object_find(ed, &lsm);
+	if (IS_ERR(eco))
+		GOTO(failed, rc = PTR_ERR(eco));
+	cl_echo_object_put(eco);
+
+	CDEBUG(D_INFO, "oa oid "DOSTID"\n", POSTID(&oa->o_oi));
+	EXIT;
+
+	/* Reached on success as well; rc tells the two cases apart. */
+ failed:
+	if (created && rc)
+		obd_destroy(env, ec->ec_exp, oa, lsm, oti, NULL, NULL);
+	/* lsm is passed by address above, so it is re-checked before freeing. */
+	if (lsm)
+		echo_free_memmd(ed, &lsm);
+	if (rc)
+		CERROR("create object failed with: rc = %d\n", rc);
+	return (rc);
+}
+
+/*
+ * Obtain the echo object identified by @oa and store it in *@ecop.
+ *
+ * Object id 0 and a missing OBD_MD_FLID are rejected with -EINVAL.  A
+ * temporary stripe descriptor carrying the id (switched to the echo
+ * sequence when OBD_MD_FLGROUP is not set) is used for the lookup and
+ * freed afterwards if it is still set.
+ *
+ * Returns 0 with *@ecop filled in, or a negative errno with *@ecop left
+ * untouched.
+ */
+static int echo_get_object(struct echo_object **ecop, struct echo_device *ed,
+			   struct obdo *oa)
+{
+	struct lov_stripe_md	*md = NULL;
+	struct echo_object	*found;
+	int			 rc;
+	ENTRY;
+
+	if (!(oa->o_valid & OBD_MD_FLID) || ostid_id(&oa->o_oi) == 0) {
+		/* disallow use of object id 0 */
+		CERROR ("No valid oid\n");
+		RETURN(-EINVAL);
+	}
+
+	rc = echo_alloc_memmd(ed, &md);
+	if (rc < 0)
+		RETURN(rc);
+
+	md->lsm_oi = oa->o_oi;
+	if ((oa->o_valid & OBD_MD_FLGROUP) == 0)
+		ostid_set_seq_echo(&md->lsm_oi);
+
+	found = cl_echo_object_find(ed, &md);
+	if (IS_ERR(found)) {
+		rc = PTR_ERR(found);
+	} else {
+		rc = 0;
+		*ecop = found;
+	}
+
+	/* Only free the descriptor if the lookup left it set. */
+	if (md != NULL)
+		echo_free_memmd(ed, &md);
+	RETURN(rc);
+}
+
+/*
+ * Release a reference obtained with echo_get_object().
+ *
+ * A non-zero result from cl_echo_object_put() is only reported; there is
+ * nothing the caller could do about it.
+ */
+static void echo_put_object(struct echo_object *eco)
+{
+	/* The message ends with a newline so it does not run into the next
+	 * log record. */
+	if (cl_echo_object_put(eco))
+		CERROR("echo client: drop an object failed\n");
+}
+
+/*
+ * Translate a file offset into the per-stripe offset and stripe object id.
+ *
+ * On entry *@offp is an offset into the striped file.  On return *@offp
+ * is the offset inside the stripe object that holds the byte and *@idp
+ * is that object's id.  Both are left untouched when @lsm has at most
+ * one stripe.
+ */
+static void
+echo_get_stripe_off_id (struct lov_stripe_md *lsm, obd_off *offp, obd_id *idp)
+{
+	unsigned long per_stripe;
+	unsigned long nstripes;
+	unsigned long span;
+	unsigned long in_span;
+	int	      which;
+	obd_off       rounds;
+
+	if (lsm->lsm_stripe_count <= 1)
+		return;
+
+	per_stripe = lsm->lsm_stripe_size;
+	nstripes   = lsm->lsm_stripe_count;
+
+	/* One span is a full pass over every stripe. */
+	span = per_stripe * nstripes;
+
+	/*
+	 * do_div() divides in place: afterwards "rounds" holds the number of
+	 * whole spans and the remainder, the position inside the current
+	 * span, is returned.
+	 */
+	rounds  = *offp;
+	in_span = do_div (rounds, span);
+
+	which = in_span / per_stripe;
+
+	*idp = ostid_id(&lsm->lsm_oinfo[which]->loi_oi);
+	*offp = rounds * per_stripe + in_span % per_stripe;
+}
+
+/*
+ * Stamp every OBD_ECHO_BLOCK_SIZE block of @page with a debug pattern.
+ *
+ * For writes each block is stamped with its stripe-relative offset and
+ * stripe object id; for any other @rw value a fixed poison value is used
+ * for both.  @count must be exactly one page.
+ */
+static void
+echo_client_page_debug_setup(struct lov_stripe_md *lsm,
+			     struct page *page, int rw, obd_id id,
+			     obd_off offset, obd_off count)
+{
+	char	*base;
+	int	 pos;
+
+	/* no partial pages on the client */
+	LASSERT(count == PAGE_CACHE_SIZE);
+
+	base = kmap(page);
+
+	pos = 0;
+	while (pos < PAGE_CACHE_SIZE) {
+		/* Poison by default; writes replace it with real values. */
+		obd_off blk_off = 0xdeadbeef00c0ffeeULL;
+		obd_id  blk_id = 0xdeadbeef00c0ffeeULL;
+
+		if (rw == OBD_BRW_WRITE) {
+			blk_off = offset + pos;
+			blk_id = id;
+			echo_get_stripe_off_id(lsm, &blk_off, &blk_id);
+		}
+
+		block_debug_setup(base + pos, OBD_ECHO_BLOCK_SIZE,
+				  blk_off, blk_id);
+		pos += OBD_ECHO_BLOCK_SIZE;
+	}
+
+	kunmap(page);
+}
+
+/*
+ * Verify the debug pattern in every OBD_ECHO_BLOCK_SIZE block of @page.
+ *
+ * Each block is checked against the stripe-relative offset and stripe
+ * object id expected for @id / @offset.  All blocks are checked even
+ * after a mismatch.  @count must be exactly one page.
+ *
+ * Returns 0 when every block matches, otherwise the result of the last
+ * failing block_debug_check() call.
+ */
+static int echo_client_page_debug_check(struct lov_stripe_md *lsm,
+					struct page *page, obd_id id,
+					obd_off offset, obd_off count)
+{
+	char	*base;
+	int	 pos = 0;
+	int	 rc = 0;
+
+	/* no partial pages on the client */
+	LASSERT(count == PAGE_CACHE_SIZE);
+
+	base = kmap(page);
+
+	while (pos < PAGE_CACHE_SIZE) {
+		obd_off blk_off = offset + pos;
+		obd_id  blk_id = id;
+		int	blk_rc;
+
+		echo_get_stripe_off_id (lsm, &blk_off, &blk_id);
+
+		blk_rc = block_debug_check("test_brw",
+					   base + pos, OBD_ECHO_BLOCK_SIZE,
+					   blk_off, blk_id);
+		if (blk_rc != 0) {
+			CERROR ("Error in echo object "LPX64"\n", id);
+			rc = blk_rc;
+		}
+
+		pos += OBD_ECHO_BLOCK_SIZE;
+	}
+
+	kunmap(page);
+	return rc;
+}
+
+/*
+ * Bulk read or write of @count bytes at @offset on @eco using pages
+ * allocated here.
+ *
+ * @count must be a non-zero multiple of the page size.  One page per
+ * page-sized chunk is allocated, optionally stamped with a debug
+ * pattern, and the whole set is handed to cl_echo_object_brw().  After a
+ * successful read the pattern is verified when checking is enabled.
+ * @oti is not used.
+ *
+ * Returns 0, -EINVAL for a bad @count, -ENOMEM, or the brw / verify
+ * error.
+ */
+static int echo_client_kbrw(struct echo_device *ed, int rw, struct obdo *oa,
+			    struct echo_object *eco, obd_off offset,
+			    obd_size count, int async,
+			    struct obd_trans_info *oti)
+{
+	struct lov_stripe_md   *lsm = eco->eo_lsm;
+	obd_count	       npages;
+	struct brw_page	*pga;
+	struct brw_page	*pgp;
+	struct page	    **pages;
+	obd_off		 off;
+	int		     i;
+	int		     rc;
+	int		     verify;
+	int		     gfp_mask;
+	int		     brw_flags = 0;
+	ENTRY;
+
+	/* Pattern checking needs OBD_FL_DEBUG_CHECK and is never applied to
+	 * the ECHO_PERSISTENT_OBJID object. */
+	verify = (ostid_id(&oa->o_oi) != ECHO_PERSISTENT_OBJID &&
+		  (oa->o_valid & OBD_MD_FLFLAGS) != 0 &&
+		  (oa->o_flags & OBD_FL_DEBUG_CHECK) != 0);
+
+	/* Bit 1 of the object id selects the allocation mask. */
+	gfp_mask = ((ostid_id(&oa->o_oi) & 2) == 0) ? GFP_IOFS : GFP_HIGHUSER;
+
+	LASSERT(rw == OBD_BRW_WRITE || rw == OBD_BRW_READ);
+	LASSERT(lsm != NULL);
+	LASSERT(ostid_id(&lsm->lsm_oi) == ostid_id(&oa->o_oi));
+
+	if (count <= 0 ||
+	    (count & (~CFS_PAGE_MASK)) != 0)
+		RETURN(-EINVAL);
+
+	/* XXX think again with misaligned I/O */
+	npages = count >> PAGE_CACHE_SHIFT;
+
+	if (rw == OBD_BRW_WRITE)
+		brw_flags = OBD_BRW_ASYNC;
+
+	OBD_ALLOC(pga, npages * sizeof(*pga));
+	if (pga == NULL)
+		RETURN(-ENOMEM);
+
+	OBD_ALLOC(pages, npages * sizeof(*pages));
+	if (pages == NULL) {
+		OBD_FREE(pga, npages * sizeof(*pga));
+		RETURN(-ENOMEM);
+	}
+
+	for (i = 0, pgp = pga, off = offset;
+	     i < npages;
+	     i++, pgp++, off += PAGE_CACHE_SIZE) {
+
+		/* NOTE(review): presumably OBD_ALLOC zero-fills, so every
+		 * slot starts NULL and cleanup can skip unused ones. */
+		LASSERT (pgp->pg == NULL);      /* for cleanup */
+
+		/* rc is preset so the bare "goto out" reports -ENOMEM. */
+		rc = -ENOMEM;
+		OBD_PAGE_ALLOC(pgp->pg, gfp_mask);
+		if (pgp->pg == NULL)
+			goto out;
+
+		pages[i] = pgp->pg;
+		pgp->count = PAGE_CACHE_SIZE;
+		pgp->off = off;
+		pgp->flag = brw_flags;
+
+		if (verify)
+			echo_client_page_debug_setup(lsm, pgp->pg, rw,
+						     ostid_id(&oa->o_oi), off,
+						     pgp->count);
+	}
+
+	/* brw mode can only be used at client */
+	LASSERT(ed->ed_next != NULL);
+	rc = cl_echo_object_brw(eco, rw, offset, pages, npages, async);
+
+ out:
+	/* Only the data of a successful read is worth checking. */
+	if (rc != 0 || rw != OBD_BRW_READ)
+		verify = 0;
+
+	for (i = 0, pgp = pga; i < npages; i++, pgp++) {
+		/* Slots after a failed page allocation were never filled. */
+		if (pgp->pg == NULL)
+			continue;
+
+		if (verify) {
+			int vrc;
+			vrc = echo_client_page_debug_check(lsm, pgp->pg,
+							   ostid_id(&oa->o_oi),
+							   pgp->off, pgp->count);
+			/* Keep the first verification failure. */
+			if (vrc != 0 && rc == 0)
+				rc = vrc;
+		}
+		OBD_PAGE_FREE(pgp->pg);
+	}
+	OBD_FREE(pga, npages * sizeof(*pga));
+	OBD_FREE(pages, npages * sizeof(*pages));
+	RETURN(rc);
+}
+
+/*
+ * Read or write @count bytes at @offset through the obd_preprw() /
+ * obd_commitrw() pair on @exp, in batches of at most @batch bytes.
+ *
+ * @count must be a non-zero multiple of the page size and @batch must
+ * cover at least one page.  When debug checking is enabled in @oa, pages
+ * are stamped before a write and checked after a read (the check result
+ * itself is not propagated).  After every batch @oti is cleared and the
+ * env context is exited and re-entered.
+ *
+ * Returns 0, -EINVAL for bad arguments, -ENOMEM, or the first error from
+ * obd_preprw() / obd_commitrw().
+ */
+static int echo_client_prep_commit(const struct lu_env *env,
+				   struct obd_export *exp, int rw,
+				   struct obdo *oa, struct echo_object *eco,
+				   obd_off offset, obd_size count,
+				   obd_size batch, struct obd_trans_info *oti,
+				   int async)
+{
+	struct lov_stripe_md *lsm = eco->eo_lsm;
+	struct obd_ioobj ioo;
+	struct niobuf_local *lnb;
+	struct niobuf_remote *rnb;
+	obd_off off;
+	obd_size npages, tot_pages, nalloc;
+	int i, ret = 0, brw_flags = 0;
+
+	ENTRY;
+
+	if (count <= 0 || (count & (~CFS_PAGE_MASK)) != 0 ||
+	    (lsm != NULL && ostid_id(&lsm->lsm_oi) != ostid_id(&oa->o_oi)))
+		RETURN(-EINVAL);
+
+	npages = batch >> PAGE_CACHE_SHIFT;
+	tot_pages = count >> PAGE_CACHE_SHIFT;
+
+	/* A batch below one page would never advance the loop below. */
+	if (npages == 0)
+		RETURN(-EINVAL);
+
+	/*
+	 * npages shrinks for the final, shorter batch, so remember how many
+	 * entries were allocated: the buffers must be freed with the very
+	 * size they were allocated with.
+	 */
+	nalloc = npages;
+
+	OBD_ALLOC(lnb, nalloc * sizeof(struct niobuf_local));
+	OBD_ALLOC(rnb, nalloc * sizeof(struct niobuf_remote));
+
+	if (lnb == NULL || rnb == NULL)
+		GOTO(out, ret = -ENOMEM);
+
+	if (rw == OBD_BRW_WRITE && async)
+		brw_flags |= OBD_BRW_ASYNC;
+
+	obdo_to_ioobj(oa, &ioo);
+
+	off = offset;
+
+	for(; tot_pages; tot_pages -= npages) {
+		int lpages;
+
+		/* The last batch may be shorter than the others. */
+		if (tot_pages < npages)
+			npages = tot_pages;
+
+		for (i = 0; i < npages; i++, off += PAGE_CACHE_SIZE) {
+			rnb[i].offset = off;
+			rnb[i].len = PAGE_CACHE_SIZE;
+			rnb[i].flags = brw_flags;
+		}
+
+		ioo.ioo_bufcnt = npages;
+		oti->oti_transno = 0;
+
+		lpages = npages;
+		ret = obd_preprw(env, rw, exp, oa, 1, &ioo, rnb, &lpages,
+				 lnb, oti, NULL);
+		if (ret != 0)
+			GOTO(out, ret);
+		LASSERT(lpages == npages);
+
+		for (i = 0; i < lpages; i++) {
+			struct page *page = lnb[i].page;
+
+			/* read past eof? */
+			if (page == NULL && lnb[i].rc == 0)
+				continue;
+
+			if (async)
+				lnb[i].flags |= OBD_BRW_ASYNC;
+
+			/* Skip pattern handling unless debug checking is on. */
+			if (ostid_id(&oa->o_oi) == ECHO_PERSISTENT_OBJID ||
+			    (oa->o_valid & OBD_MD_FLFLAGS) == 0 ||
+			    (oa->o_flags & OBD_FL_DEBUG_CHECK) == 0)
+				continue;
+
+			if (rw == OBD_BRW_WRITE)
+				echo_client_page_debug_setup(lsm, page, rw,
+							    ostid_id(&oa->o_oi),
+							     rnb[i].offset,
+							     rnb[i].len);
+			else
+				echo_client_page_debug_check(lsm, page,
+							    ostid_id(&oa->o_oi),
+							     rnb[i].offset,
+							     rnb[i].len);
+		}
+
+		ret = obd_commitrw(env, rw, exp, oa, 1, &ioo,
+				   rnb, npages, lnb, oti, ret);
+		if (ret != 0)
+			GOTO(out, ret);
+
+		/* Reset oti otherwise it would confuse ldiskfs. */
+		memset(oti, 0, sizeof(*oti));
+
+		/* Reuse env context. */
+		lu_context_exit((struct lu_context *)&env->le_ctx);
+		lu_context_enter((struct lu_context *)&env->le_ctx);
+	}
+
+out:
+	if (lnb)
+		OBD_FREE(lnb, nalloc * sizeof(struct niobuf_local));
+	if (rnb)
+		OBD_FREE(rnb, nalloc * sizeof(struct niobuf_remote));
+	RETURN(ret);
+}
+
+/*
+ * Handle a bulk read/write ioctl for the object named in data->ioc_obdo1.
+ *
+ * The test mode travels in data->ioc_pbuf1: 1 and 2 go through
+ * echo_client_kbrw() (mode 1 being synchronous), 3 goes through
+ * echo_client_prep_commit().  Without a next device mode 3 is forced and
+ * the batch size defaults to the full count.  The batch size is capped
+ * at PTLRPC_MAX_BRW_SIZE.
+ *
+ * Returns 0 or a negative errno; -EINVAL for an unknown test mode.
+ */
+static int echo_client_brw_ioctl(const struct lu_env *env, int rw,
+				 struct obd_export *exp,
+				 struct obd_ioctl_data *data,
+				 struct obd_trans_info *dummy_oti)
+{
+	struct obd_device	*obd = class_exp2obd(exp);
+	struct echo_device	*ed = obd2echo_dev(obd);
+	struct echo_client_obd	*ec = ed->ed_ec;
+	struct obdo		*oa = &data->ioc_obdo1;
+	struct echo_object	*eco;
+	long			 mode;
+	int			 async;
+	int			 rc;
+	ENTRY;
+
+	LASSERT(oa->o_valid & OBD_MD_FLGROUP);
+
+	rc = echo_get_object(&eco, ed, oa);
+	if (rc)
+		RETURN(rc);
+
+	oa->o_valid &= ~OBD_MD_FLHANDLE;
+
+	/* OFD/obdfilter works only via prep/commit */
+	mode = (long)data->ioc_pbuf1;
+	/* Only the requested mode 1 is synchronous, even if forced to 3. */
+	async = (mode == 1) ? 0 : 1;
+
+	if (ed->ed_next == NULL && mode != 3) {
+		mode = 3;
+		data->ioc_plen1 = data->ioc_count;
+	}
+
+	/* Truncate batch size to maximum */
+	if (data->ioc_plen1 > PTLRPC_MAX_BRW_SIZE)
+		data->ioc_plen1 = PTLRPC_MAX_BRW_SIZE;
+
+	if (mode == 1 || mode == 2)
+		rc = echo_client_kbrw(ed, rw, oa,
+				      eco, data->ioc_offset,
+				      data->ioc_count, async, dummy_oti);
+	else if (mode == 3)
+		rc = echo_client_prep_commit(env, ec->ec_exp, rw, oa,
+					     eco, data->ioc_offset,
+					     data->ioc_count, data->ioc_plen1,
+					     dummy_oti, async);
+	else
+		rc = -EINVAL;
+
+	echo_put_object(eco);
+	RETURN(rc);
+}
+
+/*
+ * Enqueue an extent lock of @mode on the object named in @oa.
+ *
+ * The extent starts at @offset and spans @nob bytes; @nob == 0 means "up
+ * to the maximum offset".  Both values must be page aligned and @mode
+ * must be LCK_PR or LCK_PW.  On success the lock cookie is stored in
+ * oa->o_handle and OBD_MD_FLHANDLE is set.
+ *
+ * Returns 0, -EOPNOTSUPP without a next device, -EINVAL for bad
+ * arguments, or the error from the object lookup / enqueue.
+ */
+static int
+echo_client_enqueue(struct obd_export *exp, struct obdo *oa,
+		    int mode, obd_off offset, obd_size nob)
+{
+	struct echo_device	*ed = obd2echo_dev(exp->exp_obd);
+	struct lustre_handle	*handle = &oa->o_handle;
+	struct echo_object	*eco;
+	obd_off			 last;
+	int			 rc;
+	ENTRY;
+
+	if (ed->ed_next == NULL)
+		RETURN(-EOPNOTSUPP);
+
+	if (mode != LCK_PR && mode != LCK_PW)
+		RETURN(-EINVAL);
+
+	/* Either value having a sub-page bit set is an alignment error. */
+	if (((offset | nob) & (~CFS_PAGE_MASK)) != 0)
+		RETURN(-EINVAL);
+
+	rc = echo_get_object (&eco, ed, oa);
+	if (rc != 0)
+		RETURN(rc);
+
+	if (nob == 0)
+		last = (obd_off) -1;
+	else
+		last = offset + nob - 1;
+
+	rc = cl_echo_enqueue(eco, offset, last, mode, &handle->cookie);
+	if (rc == 0) {
+		oa->o_valid |= OBD_MD_FLHANDLE;
+		CDEBUG(D_INFO, "Cookie is "LPX64"\n", handle->cookie);
+	}
+	echo_put_object(eco);
+	RETURN(rc);
+}
+
+/*
+ * Cancel the lock whose cookie is stored in oa->o_handle.
+ *
+ * Returns -EINVAL when @oa carries no handle (OBD_MD_FLHANDLE unset),
+ * otherwise the result of cl_echo_cancel().
+ */
+static int
+echo_client_cancel(struct obd_export *exp, struct obdo *oa)
+{
+	struct echo_device *ed = obd2echo_dev(exp->exp_obd);
+
+	if (!(oa->o_valid & OBD_MD_FLHANDLE))
+		return -EINVAL;
+
+	CDEBUG(D_INFO, "Cookie is "LPX64"\n", oa->o_handle.cookie);
+	return cl_echo_cancel(ed, oa->o_handle.cookie);
+}
+
+static int
+echo_client_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
+		      void *karg, void *uarg)
+{
+	struct obd_device      *obd = exp->exp_obd;
+	struct echo_device     *ed = obd2echo_dev(obd);
+	struct echo_client_obd *ec = ed->ed_ec;
+	struct echo_object     *eco;
+	struct obd_ioctl_data  *data = karg;
+	struct obd_trans_info   dummy_oti;
+	struct lu_env	  *env;
+	struct oti_req_ack_lock *ack_lock;
+	struct obdo	    *oa;
+	struct lu_fid	   fid;
+	int		     rw = OBD_BRW_READ;
+	int		     rc = 0;
+	int		     i;
+	ENTRY;
+
+	memset(&dummy_oti, 0, sizeof(dummy_oti));
+
+	oa = &data->ioc_obdo1;
+	if (!(oa->o_valid & OBD_MD_FLGROUP)) {
+		oa->o_valid |= OBD_MD_FLGROUP;
+		ostid_set_seq_echo(&oa->o_oi);
+	}
+
+	/* This FID is unpacked just for validation at this point */
+	rc = ostid_to_fid(&fid, &oa->o_oi, 0);
+	if (rc < 0)
+		RETURN(rc);
+
+	OBD_ALLOC_PTR(env);
+	if (env == NULL)
+		RETURN(-ENOMEM);
+
+	rc = lu_env_init(env, LCT_DT_THREAD);
+	if (rc)
+		GOTO(out, rc = -ENOMEM);
+
+	switch (cmd) {
+	case OBD_IOC_CREATE:		    /* may create echo object */
+		if (!cfs_capable(CFS_CAP_SYS_ADMIN))
+			GOTO (out, rc = -EPERM);
+
+		rc = echo_create_object(env, ed, 1, oa, data->ioc_pbuf1,
+					data->ioc_plen1, &dummy_oti);
+		GOTO(out, rc);
+
+	case OBD_IOC_ECHO_MD: {
+		int count;
+		int cmd;
+		char *dir = NULL;
+		int dirlen;
+		__u64 id;
+
+		if (!cfs_capable(CFS_CAP_SYS_ADMIN))
+			GOTO(out, rc = -EPERM);
+
+		count = data->ioc_count;
+		cmd = data->ioc_command;
+
+		id = ostid_id(&data->ioc_obdo2.o_oi);
+
+		dirlen = data->ioc_plen1;
+		OBD_ALLOC(dir, dirlen + 1);
+		if (dir == NULL)
+			GOTO(out, rc = -ENOMEM);
+
+		if (copy_from_user(dir, data->ioc_pbuf1, dirlen)) {
+			OBD_FREE(dir, data->ioc_plen1 + 1);
+			GOTO(out, rc = -EFAULT);
+		}
+
+		rc = echo_md_handler(ed, cmd, dir, dirlen, id, count, data);
+		OBD_FREE(dir, dirlen + 1);
+		GOTO(out, rc);
+	}
+	case OBD_IOC_ECHO_ALLOC_SEQ: {
+		struct lu_env   *cl_env;
+		int	      refcheck;
+		__u64	    seq;
+		int	      max_count;
+
+		if (!cfs_capable(CFS_CAP_SYS_ADMIN))
+			GOTO(out, rc = -EPERM);
+
+		cl_env = cl_env_get(&refcheck);
+		if (IS_ERR(cl_env))
+			GOTO(out, rc = PTR_ERR(cl_env));
+
+		rc = lu_env_refill_by_tags(cl_env, ECHO_MD_CTX_TAG,
+					    ECHO_MD_SES_TAG);
+		if (rc != 0) {
+			cl_env_put(cl_env, &refcheck);
+			GOTO(out, rc);
+		}
+
+		rc = seq_client_get_seq(cl_env, ed->ed_cl_seq, &seq);
+		cl_env_put(cl_env, &refcheck);
+		if (rc < 0) {
+			CERROR("%s: Can not alloc seq: rc = %d\n",
+			       obd->obd_name, rc);
+			GOTO(out, rc);
+		}
+
+		if (copy_to_user(data->ioc_pbuf1, &seq, data->ioc_plen1))
+			return -EFAULT;
+
+		max_count = LUSTRE_METADATA_SEQ_MAX_WIDTH;
+		if (copy_to_user(data->ioc_pbuf2, &max_count,
+				     data->ioc_plen2))
+			return -EFAULT;
+		GOTO(out, rc);
+	}
+	case OBD_IOC_DESTROY:
+		if (!cfs_capable(CFS_CAP_SYS_ADMIN))
+			GOTO (out, rc = -EPERM);
+
+		rc = echo_get_object(&eco, ed, oa);
+		if (rc == 0) {
+			rc = obd_destroy(env, ec->ec_exp, oa, eco->eo_lsm,
+					 &dummy_oti, NULL, NULL);
+			if (rc == 0)
+				eco->eo_deleted = 1;
+			echo_put_object(eco);
+		}
+		GOTO(out, rc);
+
+	case OBD_IOC_GETATTR:
+		rc = echo_get_object(&eco, ed, oa);
+		if (rc == 0) {
+			struct obd_info oinfo = { { { 0 } } };
+			oinfo.oi_md = eco->eo_lsm;
+			oinfo.oi_oa = oa;
+			rc = obd_getattr(env, ec->ec_exp, &oinfo);
+			echo_put_object(eco);
+		}
+		GOTO(out, rc);
+
+	case OBD_IOC_SETATTR:
+		if (!cfs_capable(CFS_CAP_SYS_ADMIN))
+			GOTO (out, rc = -EPERM);
+
+		rc = echo_get_object(&eco, ed, oa);
+		if (rc == 0) {
+			struct obd_info oinfo = { { { 0 } } };
+			oinfo.oi_oa = oa;
+			oinfo.oi_md = eco->eo_lsm;
+
+			rc = obd_setattr(env, ec->ec_exp, &oinfo, NULL);
+			echo_put_object(eco);
+		}
+		GOTO(out, rc);
+
+	case OBD_IOC_BRW_WRITE:
+		if (!cfs_capable(CFS_CAP_SYS_ADMIN))
+			GOTO (out, rc = -EPERM);
+
+		rw = OBD_BRW_WRITE;
+		/* fall through */
+	case OBD_IOC_BRW_READ:
+		rc = echo_client_brw_ioctl(env, rw, exp, data, &dummy_oti);
+		GOTO(out, rc);
+
+	case ECHO_IOC_GET_STRIPE:
+		rc = echo_get_object(&eco, ed, oa);
+		if (rc == 0) {
+			rc = echo_copyout_lsm(eco->eo_lsm, data->ioc_pbuf1,
+					      data->ioc_plen1);
+			echo_put_object(eco);
+		}
+		GOTO(out, rc);
+
+	case ECHO_IOC_SET_STRIPE:
+		if (!cfs_capable(CFS_CAP_SYS_ADMIN))
+			GOTO (out, rc = -EPERM);
+
+		if (data->ioc_pbuf1 == NULL) {  /* unset */
+			rc = echo_get_object(&eco, ed, oa);
+			if (rc == 0) {
+				eco->eo_deleted = 1;
+				echo_put_object(eco);
+			}
+		} else {
+			rc = echo_create_object(env, ed, 0, oa,
+						data->ioc_pbuf1,
+						data->ioc_plen1, &dummy_oti);
+		}
+		GOTO (out, rc);
+
+	case ECHO_IOC_ENQUEUE:
+		if (!cfs_capable(CFS_CAP_SYS_ADMIN))
+			GOTO (out, rc = -EPERM);
+
+		rc = echo_client_enqueue(exp, oa,
+					 data->ioc_conn1, /* lock mode */
+					 data->ioc_offset,
+					 data->ioc_count);/*extent*/
+		GOTO (out, rc);
+
+	case ECHO_IOC_CANCEL:
+		rc = echo_client_cancel(exp, oa);
+		GOTO (out, rc);
+
+	default:
+		CERROR ("echo_ioctl(): unrecognised ioctl %#x\n", cmd);
+		GOTO (out, rc = -ENOTTY);
+	}
+
+	EXIT;
+out:
+	lu_env_fini(env);
+	OBD_FREE_PTR(env);
+
+	/* XXX this should be in a helper also called by target_send_reply */
+	for (ack_lock = dummy_oti.oti_ack_locks, i = 0; i < 4;
+	     i++, ack_lock++) {
+		if (!ack_lock->mode)
+			break;
+		ldlm_lock_decref(&ack_lock->lock, ack_lock->mode);
+	}
+
+	return rc;
+}
+
+/**
+ * Set up an echo client and connect it to the target OBD named in \a lcfg.
+ *
+ * \param env     environment passed through to obd_connect()
+ * \param obddev  echo client device being configured
+ * \param lcfg    configuration record; buffer 1 carries the target OBD name
+ *
+ * \retval 0        success; when the target is an MDT only the context and
+ *                  session tags are updated and no connection is made
+ * \retval -EINVAL  no target name given, or target not attached / set up
+ * \retval -ENOMEM  the connect data could not be allocated
+ * \retval other    whatever obd_connect() returned
+ *
+ * Every exit goes through RETURN() so that it pairs with ENTRY.
+ */
+static int echo_client_setup(const struct lu_env *env,
+			     struct obd_device *obddev, struct lustre_cfg *lcfg)
+{
+	struct echo_client_obd *ec = &obddev->u.echo_client;
+	struct obd_device *tgt;
+	struct obd_uuid echo_uuid = { "ECHO_UUID" };
+	struct obd_connect_data *ocd = NULL;
+	int rc;
+	ENTRY;
+
+	if (lcfg->lcfg_bufcount < 2 || LUSTRE_CFG_BUFLEN(lcfg, 1) < 1) {
+		CERROR("requires a TARGET OBD name\n");
+		RETURN(-EINVAL);
+	}
+
+	tgt = class_name2obd(lustre_cfg_string(lcfg, 1));
+	if (!tgt || !tgt->obd_attached || !tgt->obd_set_up) {
+		CERROR("device not attached or not set up (%s)\n",
+		       lustre_cfg_string(lcfg, 1));
+		RETURN(-EINVAL);
+	}
+
+	spin_lock_init(&ec->ec_lock);
+	INIT_LIST_HEAD (&ec->ec_objects);
+	INIT_LIST_HEAD (&ec->ec_locks);
+	ec->ec_unique = 0;
+	ec->ec_nstripes = 0;
+
+	/* metadata target: only the tags are needed, no export is created */
+	if (!strcmp(tgt->obd_type->typ_name, LUSTRE_MDT_NAME)) {
+		lu_context_tags_update(ECHO_MD_CTX_TAG);
+		lu_session_tags_update(ECHO_MD_SES_TAG);
+		RETURN(0);
+	}
+
+	OBD_ALLOC(ocd, sizeof(*ocd));
+	if (ocd == NULL) {
+		CERROR("Can't alloc ocd connecting to %s\n",
+		       lustre_cfg_string(lcfg, 1));
+		RETURN(-ENOMEM);
+	}
+
+	ocd->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_REQPORTAL |
+				 OBD_CONNECT_BRW_SIZE |
+				 OBD_CONNECT_GRANT | OBD_CONNECT_FULL20 |
+				 OBD_CONNECT_64BITHASH | OBD_CONNECT_LVB_TYPE |
+				 OBD_CONNECT_FID;
+	ocd->ocd_brw_size = DT_MAX_BRW_SIZE;
+	ocd->ocd_version = LUSTRE_VERSION_CODE;
+	ocd->ocd_group = FID_SEQ_ECHO;
+
+	rc = obd_connect(env, &ec->ec_exp, tgt, &echo_uuid, ocd, NULL);
+	if (rc == 0) {
+		/* Turn off pinger because it connects to tgt obd directly. */
+		spin_lock(&tgt->obd_dev_lock);
+		list_del_init(&ec->ec_exp->exp_obd_chain_timed);
+		spin_unlock(&tgt->obd_dev_lock);
+	}
+
+	/* the connect data is only needed for the duration of the call */
+	OBD_FREE(ocd, sizeof(*ocd));
+
+	if (rc != 0) {
+		CERROR("fail to connect to device %s\n",
+		       lustre_cfg_string(lcfg, 1));
+	}
+
+	RETURN(rc);
+}
+
+/**
+ * Undo echo_client_setup() for \a obddev.
+ *
+ * Returns 0 straight away when there is no echo device, or when the device
+ * sits on a metadata target (after clearing the MD context/session tags).
+ * Otherwise refuses with -EBUSY while exports remain, and finally
+ * disconnects the export held in ec_exp, returning obd_disconnect()'s
+ * result.
+ */
+static int echo_client_cleanup(struct obd_device *obddev)
+{
+	struct echo_client_obd *client = &obddev->u.echo_client;
+	struct echo_device *edev = obd2echo_dev(obddev);
+	int err;
+	ENTRY;
+
+	/* nothing to do without an echo device */
+	if (edev == NULL)
+		RETURN(0);
+
+	/* metadata echo client: only the tags were taken at setup time */
+	if (edev->ed_next_ismd) {
+		lu_context_tags_clear(ECHO_MD_CTX_TAG);
+		lu_session_tags_clear(ECHO_MD_SES_TAG);
+		RETURN(0);
+	}
+
+	if (!list_empty(&obddev->obd_exports)) {
+		CERROR("still has clients!\n");
+		RETURN(-EBUSY);
+	}
+
+	LASSERT(atomic_read(&client->ec_exp->exp_refcount) > 0);
+
+	err = obd_disconnect(client->ec_exp);
+	if (err != 0) {
+		CERROR("fail to disconnect device: %d\n", err);
+	}
+
+	RETURN(err);
+}
+
+/**
+ * Connect a client to the echo client device \a src.
+ *
+ * On success the export that belongs to the new connection handle is stored
+ * in \a *exp.  On failure \a *exp is left untouched and the error from
+ * class_connect() is returned.  \a env, \a data and \a localdata are unused.
+ */
+static int echo_client_connect(const struct lu_env *env,
+			       struct obd_export **exp,
+			       struct obd_device *src, struct obd_uuid *cluuid,
+			       struct obd_connect_data *data, void *localdata)
+{
+	struct lustre_handle handle = { 0 };
+	int err;
+
+	ENTRY;
+	err = class_connect(&handle, src, cluuid);
+	if (err != 0)
+		RETURN(err);
+
+	*exp = class_conn2export(&handle);
+	RETURN(0);
+}
+
+/**
+ * Disconnect the export \a exp from the echo client.
+ *
+ * Returns -EINVAL for a NULL export, otherwise the result of
+ * class_disconnect().  The two "#if 0" regions hold disabled code that
+ * walked the export's lock list and cancelled each lock; it is not
+ * compiled.
+ */
+static int echo_client_disconnect(struct obd_export *exp)
+{
+#if 0
+	struct obd_device      *obd;
+	struct echo_client_obd *ec;
+	struct ec_lock	 *ecl;
+#endif
+	int		     rc;
+	ENTRY;
+
+	if (exp == NULL)
+		GOTO(out, rc = -EINVAL);
+
+#if 0
+	obd = exp->exp_obd;
+	ec = &obd->u.echo_client;
+
+	/* no more contention on export's lock list */
+	while (!list_empty (&exp->exp_ec_data.eced_locks)) {
+		ecl = list_entry (exp->exp_ec_data.eced_locks.next,
+				      struct ec_lock, ecl_exp_chain);
+		list_del (&ecl->ecl_exp_chain);
+
+		rc = obd_cancel(ec->ec_exp, ecl->ecl_object->eco_lsm,
+				 ecl->ecl_mode, &ecl->ecl_lock_handle);
+
+		CDEBUG (D_INFO, "Cancel lock on object "LPX64" on disconnect "
+			"(%d)\n", ecl->ecl_object->eco_id, rc);
+
+		echo_put_object (ecl->ecl_object);
+		OBD_FREE (ecl, sizeof (*ecl));
+	}
+#endif
+
+	/* the only live work: drop the connection itself */
+	rc = class_disconnect(exp);
+	GOTO(out, rc);
+ out:
+	return rc;
+}
+
+/*
+ * OBD method table registered for the echo client type by
+ * echo_client_init().  The setup/cleanup entries are compiled out with
+ * "#if 0", so only iocontrol, connect and disconnect are wired up here.
+ */
+static struct obd_ops echo_client_obd_ops = {
+	.o_owner       = THIS_MODULE,
+
+#if 0
+	.o_setup       = echo_client_setup,
+	.o_cleanup     = echo_client_cleanup,
+#endif
+
+	.o_iocontrol   = echo_client_iocontrol,
+	.o_connect     = echo_client_connect,
+	.o_disconnect  = echo_client_disconnect
+};
+
+/**
+ * Register the echo client OBD type.
+ *
+ * Creates the echo slab caches first, then registers the type with its
+ * module-level proc variables.  If registration fails the caches are
+ * released again.  Returns 0 on success or the first error encountered.
+ */
+int echo_client_init(void)
+{
+	struct lprocfs_static_vars lvars = { 0 };
+	int err;
+
+	lprocfs_echo_init_vars(&lvars);
+
+	err = lu_kmem_init(echo_caches);
+	if (err != 0)
+		return err;
+
+	err = class_register_type(&echo_client_obd_ops, NULL,
+				  lvars.module_vars,
+				  LUSTRE_ECHO_CLIENT_NAME,
+				  &echo_device_type);
+	if (err != 0)
+		lu_kmem_fini(echo_caches);
+
+	return err;
+}
+
+/**
+ * Counterpart of echo_client_init(): unregister the echo client OBD type,
+ * then release the echo slab caches.
+ */
+void echo_client_exit(void)
+{
+	class_unregister_type(LUSTRE_ECHO_CLIENT_NAME);
+	lu_kmem_fini(echo_caches);
+}
+
+/**
+ * Module entry point: announce the driver, check that a page is a whole
+ * number of echo verification blocks, and register the echo client.
+ *
+ * The proc variables are looked up inside echo_client_init(); the local
+ * copy that used to be filled here was never read and has been dropped.
+ *
+ * \retval 0 on success, otherwise the error from echo_client_init()
+ */
+static int __init obdecho_init(void)
+{
+	int rc;
+
+	ENTRY;
+	LCONSOLE_INFO("Echo OBD driver; http://www.lustre.org/\n");
+
+	LASSERT(PAGE_CACHE_SIZE % OBD_ECHO_BLOCK_SIZE == 0);
+
+	rc = echo_client_init();
+
+	RETURN(rc);
+}
+
+/**
+ * Module exit point: unregister the echo client.  The __exit annotation is
+ * deliberately left commented out in the signature.
+ */
+static void /*__exit*/ obdecho_exit(void)
+{
+	echo_client_exit();
+
+}
+
+/* Module metadata. */
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Lustre Testing Echo OBD driver");
+MODULE_LICENSE("GPL");
+
+/* Hook obdecho_init()/obdecho_exit() up as the module's init and exit. */
+cfs_module(obdecho, LUSTRE_VERSION_STRING, obdecho_init, obdecho_exit);
+
+/** @} echo_client */

diff --git a/drivers/staging/lustre/lustre/obdecho/echo_internal.h b/drivers/staging/lustre/lustre/obdecho/echo_internal.h
new file mode 100644
index 0000000..8e9dbc2
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdecho/echo_internal.h

@@ -0,0 +1,47 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Whamcloud, Inc.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdecho/echo_internal.h
+ */
+
+#ifndef _ECHO_INTERNAL_H
+#define _ECHO_INTERNAL_H
+
+/*
+ * The persistent object (i.e. the one that actually stores data):
+ * its object id and its size, 1 << 20 bytes (1 MiB).
+ */
+#define ECHO_PERSISTENT_OBJID    1ULL
+#define ECHO_PERSISTENT_SIZE     ((__u64)(1<<20))
+
+/* Block size used for data verification: 4 << 10 bytes (4 KiB). */
+#define OBD_ECHO_BLOCK_SIZE	(4<<10)
+
+
+#endif

diff --git a/drivers/staging/lustre/lustre/obdecho/lproc_echo.c b/drivers/staging/lustre/lustre/obdecho/lproc_echo.c
new file mode 100644
index 0000000..b9abac1
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdecho/lproc_echo.c

@@ -0,0 +1,57 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+#define DEBUG_SUBSYSTEM S_ECHO
+
+#include <lprocfs_status.h>
+#include <obd_class.h>
+
+#ifdef LPROCFS
+/* Per-device proc entries for the echo OBD; the table ends with { 0 }. */
+LPROC_SEQ_FOPS_RO_TYPE(echo, uuid);
+static struct lprocfs_vars lprocfs_echo_obd_vars[] = {
+	{ "uuid",	 &echo_uuid_fops,	0, 0 },
+	{ 0 }
+};
+
+/* Module-wide proc entries for the echo OBD; the table ends with { 0 }. */
+LPROC_SEQ_FOPS_RO_TYPE(echo, numrefs);
+static struct lprocfs_vars lprocfs_echo_module_vars[] = {
+	{ "num_refs",     &echo_numrefs_fops,     0, 0 },
+	{ 0 }
+};
+
+/* Point \a lvars at the echo per-device and module-wide proc tables. */
+void lprocfs_echo_init_vars(struct lprocfs_static_vars *lvars)
+{
+	lvars->obd_vars = lprocfs_echo_obd_vars;
+	lvars->module_vars = lprocfs_echo_module_vars;
+}
+#endif /* LPROCFS */

diff --git a/drivers/staging/lustre/lustre/osc/Makefile b/drivers/staging/lustre/lustre/osc/Makefile
new file mode 100644
index 0000000..bbd2f77
--- /dev/null
+++ b/drivers/staging/lustre/lustre/osc/Makefile

@@ -0,0 +1,7 @@
+# Build osc.o according to CONFIG_LUSTRE_FS.
+obj-$(CONFIG_LUSTRE_FS) += osc.o
+# Objects that make up osc.o.
+osc-y := osc_request.o lproc_osc.o osc_dev.o osc_object.o \
+	 osc_page.o osc_lock.o osc_io.o osc_quota.o osc_cache.o
+
+
+
+# Add the shared ../include directory to the header search path.
+ccflags-y := -I$(src)/../include

diff --git a/drivers/staging/lustre/lustre/osc/lproc_osc.c b/drivers/staging/lustre/lustre/osc/lproc_osc.c
new file mode 100644
index 0000000..198cf3b
--- /dev/null
+++ b/drivers/staging/lustre/lustre/osc/lproc_osc.c

@@ -0,0 +1,728 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/version.h>
+#include <asm/statfs.h>
+#include <obd_cksum.h>
+#include <obd_class.h>
+#include <lprocfs_status.h>
+#include <linux/seq_file.h>
+#include "osc_internal.h"
+
+#ifdef LPROCFS
+/* Show "1" when the import is active and "0" when it is deactivated. */
+static int osc_active_seq_show(struct seq_file *m, void *v)
+{
+	struct obd_device *obd = m->private;
+	int ret;
+
+	LPROCFS_CLIMP_CHECK(obd);
+	ret = seq_printf(m, "%d\n", !obd->u.cli.cl_import->imp_deactive);
+	LPROCFS_CLIMP_EXIT(obd);
+
+	return ret;
+}
+
+/*
+ * Activate (1) or deactivate (0) the import.  Any other value is rejected
+ * with -ERANGE.  A request matching the current state is only logged.  The
+ * result of ptlrpc_set_import_active() is not reported to the writer; the
+ * full count is returned either way.
+ */
+static ssize_t osc_active_seq_write(struct file *file, const char *buffer,
+				    size_t count, loff_t *off)
+{
+	struct seq_file *sf = file->private_data;
+	struct obd_device *obd = sf->private;
+	int want, rc;
+
+	rc = lprocfs_write_helper(buffer, count, &want);
+	if (rc)
+		return rc;
+	if (want != 0 && want != 1)
+		return -ERANGE;
+
+	/* imp_deactive has the opposite sense of the value written */
+	if (obd->u.cli.cl_import->imp_deactive != want) {
+		CDEBUG(D_CONFIG, "activate %d: ignoring repeat request\n", want);
+	} else {
+		rc = ptlrpc_set_import_active(obd->u.cli.cl_import, want);
+	}
+
+	return count;
+}
+LPROC_SEQ_FOPS(osc_active);
+
+/* Show cl_max_rpcs_in_flight, sampled under cl_loi_list_lock. */
+static int osc_max_rpcs_in_flight_seq_show(struct seq_file *m, void *v)
+{
+	struct obd_device *obd = m->private;
+	struct client_obd *client = &obd->u.cli;
+	int ret;
+
+	client_obd_list_lock(&client->cl_loi_list_lock);
+	ret = seq_printf(m, "%u\n", client->cl_max_rpcs_in_flight);
+	client_obd_list_unlock(&client->cl_loi_list_lock);
+
+	return ret;
+}
+
+/*
+ * Set the maximum number of RPCs in flight.
+ *
+ * Accepts values from 1 to OSC_MAX_RIF_MAX, anything else is -ERANGE.
+ * When the limit grows and the import has a request pool, the pool is
+ * populated with the difference before the new limit is stored under
+ * cl_loi_list_lock.  Returns \a count on success.
+ *
+ * The request pool is looked up only after LPROCFS_CLIMP_CHECK(), the
+ * import guard used throughout this file (NOTE(review): presumably it
+ * returns early when cl_import is not usable -- confirm against
+ * lprocfs_status.h), instead of dereferencing cl_import in the
+ * declarations before the guard has run.
+ */
+static ssize_t osc_max_rpcs_in_flight_seq_write(struct file *file,
+			const char *buffer, size_t count, loff_t *off)
+{
+	struct obd_device *dev = ((struct seq_file *)file->private_data)->private;
+	struct client_obd *cli = &dev->u.cli;
+	struct ptlrpc_request_pool *pool;
+	int val, rc;
+
+	rc = lprocfs_write_helper(buffer, count, &val);
+	if (rc)
+		return rc;
+
+	if (val < 1 || val > OSC_MAX_RIF_MAX)
+		return -ERANGE;
+
+	LPROCFS_CLIMP_CHECK(dev);
+	pool = cli->cl_import->imp_rq_pool;
+	/* grow the pool by the number of extra RPCs now allowed */
+	if (pool && val > cli->cl_max_rpcs_in_flight)
+		pool->prp_populate(pool, val-cli->cl_max_rpcs_in_flight);
+
+	client_obd_list_lock(&cli->cl_loi_list_lock);
+	cli->cl_max_rpcs_in_flight = val;
+	client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+	LPROCFS_CLIMP_EXIT(dev);
+	return count;
+}
+LPROC_SEQ_FOPS(osc_max_rpcs_in_flight);
+
+/*
+ * Show cl_dirty_max scaled by 1 << 20 (i.e. in megabytes).  The value is
+ * sampled under cl_loi_list_lock and formatted after the lock is dropped.
+ */
+static int osc_max_dirty_mb_seq_show(struct seq_file *m, void *v)
+{
+	struct obd_device *obd = m->private;
+	struct client_obd *client = &obd->u.cli;
+	long dirty_max;
+
+	client_obd_list_lock(&client->cl_loi_list_lock);
+	dirty_max = client->cl_dirty_max;
+	client_obd_list_unlock(&client->cl_loi_list_lock);
+
+	return lprocfs_seq_read_frac_helper(m, dirty_max, 1 << 20);
+}
+
+/*
+ * Set the dirty-cache limit from a value given in megabytes.
+ *
+ * The input is parsed into pages.  It must be positive, at most
+ * OSC_MAX_DIRTY_MB_MAX megabytes and at most a quarter of physical memory,
+ * otherwise -ERANGE is returned.  The new limit is stored in bytes under
+ * cl_loi_list_lock and cache waiters are woken.  Returns \a count on
+ * success.
+ */
+static ssize_t osc_max_dirty_mb_seq_write(struct file *file, const char *buffer,
+				      size_t count, loff_t *off)
+{
+	struct seq_file *sf = file->private_data;
+	struct obd_device *obd = sf->private;
+	struct client_obd *client = &obd->u.cli;
+	int pages_per_mb = 1 << (20 - PAGE_CACHE_SHIFT);
+	int npages, err;
+
+	err = lprocfs_write_frac_helper(buffer, count, &npages, pages_per_mb);
+	if (err)
+		return err;
+
+	if (npages <= 0)
+		return -ERANGE;
+	if (npages > OSC_MAX_DIRTY_MB_MAX << (20 - PAGE_CACHE_SHIFT))
+		return -ERANGE;
+	if (npages > num_physpages / 4) /* 1/4 of RAM */
+		return -ERANGE;
+
+	client_obd_list_lock(&client->cl_loi_list_lock);
+	client->cl_dirty_max = (obd_count)(npages << PAGE_CACHE_SHIFT);
+	osc_wake_cache_waiters(client);
+	client_obd_list_unlock(&client->cl_loi_list_lock);
+
+	return count;
+}
+LPROC_SEQ_FOPS(osc_max_dirty_mb);
+
+/*
+ * Show LRU usage: "used_mb" is the sum of the in-list and busy counters
+ * shifted down to megabytes, "busy_cnt" is the raw busy counter.
+ */
+static int osc_cached_mb_seq_show(struct seq_file *m, void *v)
+{
+	struct obd_device *obd = m->private;
+	struct client_obd *client = &obd->u.cli;
+	int mb_shift = 20 - PAGE_CACHE_SHIFT;
+
+	return seq_printf(m,
+			  "used_mb: %d\n"
+			  "busy_cnt: %d\n",
+			  (atomic_read(&client->cl_lru_in_list) +
+			   atomic_read(&client->cl_lru_busy)) >> mb_shift,
+			  atomic_read(&client->cl_lru_busy));
+}
+
+/*
+ * Shrink the number of cached pages to the given amount (in megabytes).
+ *
+ * The input may be prefixed with "used_mb:".  A negative target is
+ * -ERANGE.  If the LRU list currently holds more pages than requested,
+ * the surplus is handed to osc_lru_shrink(), whose result is ignored.
+ * Returns the count as left by lprocfs_find_named_value().
+ */
+static ssize_t osc_cached_mb_seq_write(struct file *file, const char *buffer,
+				   size_t count, loff_t *off)
+{
+	struct seq_file *sf = file->private_data;
+	struct obd_device *obd = sf->private;
+	struct client_obd *client = &obd->u.cli;
+	int pages_per_mb = 1 << (20 - PAGE_CACHE_SHIFT);
+	int target, surplus, err;
+
+	buffer = lprocfs_find_named_value(buffer, "used_mb:", &count);
+	err = lprocfs_write_frac_helper(buffer, count, &target, pages_per_mb);
+	if (err)
+		return err;
+
+	if (target < 0)
+		return -ERANGE;
+
+	surplus = atomic_read(&client->cl_lru_in_list) - target;
+	if (surplus > 0)
+		(void)osc_lru_shrink(client, surplus);
+
+	return count;
+}
+LPROC_SEQ_FOPS(osc_cached_mb);
+
+/* Show cl_dirty, sampled under cl_loi_list_lock. */
+static int osc_cur_dirty_bytes_seq_show(struct seq_file *m, void *v)
+{
+	struct obd_device *obd = m->private;
+	struct client_obd *client = &obd->u.cli;
+	int ret;
+
+	client_obd_list_lock(&client->cl_loi_list_lock);
+	ret = seq_printf(m, "%lu\n", client->cl_dirty);
+	client_obd_list_unlock(&client->cl_loi_list_lock);
+
+	return ret;
+}
+LPROC_SEQ_FOPS_RO(osc_cur_dirty_bytes);
+
+/* Show cl_avail_grant, sampled under cl_loi_list_lock. */
+static int osc_cur_grant_bytes_seq_show(struct seq_file *m, void *v)
+{
+	struct obd_device *obd = m->private;
+	struct client_obd *client = &obd->u.cli;
+	int ret;
+
+	client_obd_list_lock(&client->cl_loi_list_lock);
+	ret = seq_printf(m, "%lu\n", client->cl_avail_grant);
+	client_obd_list_unlock(&client->cl_loi_list_lock);
+
+	return ret;
+}
+
+/*
+ * Shrink the available grant to the value written.
+ *
+ * Returns 0 without doing anything when there is no device or when the
+ * value is not below the current cl_avail_grant (this entry only shrinks).
+ * Otherwise, if the import is in LUSTRE_IMP_FULL state, the shrink is
+ * issued via osc_shrink_grant_to_target().  Returns its error if any, and
+ * \a count otherwise.
+ */
+static ssize_t osc_cur_grant_bytes_seq_write(struct file *file, const char *buffer,
+				  size_t count, loff_t *off)
+{
+	struct seq_file *sf = file->private_data;
+	struct obd_device *obd = sf->private;
+	struct client_obd *client;
+	__u64 target;
+	int shrinking;
+	int err;
+
+	if (obd == NULL)
+		return 0;
+	client = &obd->u.cli;
+
+	err = lprocfs_write_u64_helper(buffer, count, &target);
+	if (err)
+		return err;
+
+	/* this is only for shrinking grant */
+	client_obd_list_lock(&client->cl_loi_list_lock);
+	shrinking = target < client->cl_avail_grant;
+	client_obd_list_unlock(&client->cl_loi_list_lock);
+	if (!shrinking)
+		return 0;
+
+	LPROCFS_CLIMP_CHECK(obd);
+	if (client->cl_import->imp_state == LUSTRE_IMP_FULL)
+		err = osc_shrink_grant_to_target(client, target);
+	LPROCFS_CLIMP_EXIT(obd);
+
+	if (err)
+		return err;
+	return count;
+}
+LPROC_SEQ_FOPS(osc_cur_grant_bytes);
+
+/* Show cl_lost_grant, sampled under cl_loi_list_lock. */
+static int osc_cur_lost_grant_bytes_seq_show(struct seq_file *m, void *v)
+{
+	struct obd_device *obd = m->private;
+	struct client_obd *client = &obd->u.cli;
+	int ret;
+
+	client_obd_list_lock(&client->cl_loi_list_lock);
+	ret = seq_printf(m, "%lu\n", client->cl_lost_grant);
+	client_obd_list_unlock(&client->cl_loi_list_lock);
+
+	return ret;
+}
+LPROC_SEQ_FOPS_RO(osc_cur_lost_grant_bytes);
+
+/* Show cl_grant_shrink_interval; prints nothing when there is no device. */
+static int osc_grant_shrink_interval_seq_show(struct seq_file *m, void *v)
+{
+	struct obd_device *dev = m->private;
+
+	if (dev != NULL)
+		return seq_printf(m, "%d\n",
+				  dev->u.cli.cl_grant_shrink_interval);
+
+	return 0;
+}
+
+/*
+ * Set cl_grant_shrink_interval.  The value must be strictly positive,
+ * otherwise -ERANGE.  Returns 0 when there is no device and \a count on
+ * success.
+ */
+static ssize_t osc_grant_shrink_interval_seq_write(struct file *file,
+				const char *buffer, size_t count, loff_t *off)
+{
+	struct seq_file *sf = file->private_data;
+	struct obd_device *dev = sf->private;
+	int interval, err;
+
+	if (dev == NULL)
+		return 0;
+
+	err = lprocfs_write_helper(buffer, count, &interval);
+	if (err)
+		return err;
+
+	if (interval <= 0)
+		return -ERANGE;
+
+	dev->u.cli.cl_grant_shrink_interval = interval;
+	return count;
+}
+LPROC_SEQ_FOPS(osc_grant_shrink_interval);
+
+/* Show 1 when cl_checksum is set and 0 otherwise; nothing without a device. */
+static int osc_checksum_seq_show(struct seq_file *m, void *v)
+{
+	struct obd_device *dev = m->private;
+	int enabled;
+
+	if (dev == NULL)
+		return 0;
+
+	enabled = dev->u.cli.cl_checksum ? 1 : 0;
+	return seq_printf(m, "%d\n", enabled);
+}
+
+/*
+ * Turn cl_checksum on for any non-zero value and off for zero.
+ * Returns 0 when there is no device and \a count on success.
+ */
+static ssize_t osc_checksum_seq_write(struct file *file, const char *buffer,
+			   size_t count, loff_t *off)
+{
+	struct seq_file *sf = file->private_data;
+	struct obd_device *dev = sf->private;
+	int flag, err;
+
+	if (dev == NULL)
+		return 0;
+
+	err = lprocfs_write_helper(buffer, count, &flag);
+	if (err)
+		return err;
+
+	dev->u.cli.cl_checksum = (flag ? 1 : 0);
+	return count;
+}
+LPROC_SEQ_FOPS(osc_checksum);
+
+/*
+ * List the names of all supported checksum types on one line, with the
+ * type currently selected wrapped in square brackets.
+ */
+static int osc_checksum_type_seq_show(struct seq_file *m, void *v)
+{
+	struct obd_device *dev = m->private;
+	int idx;
+	DECLARE_CKSUM_NAME;
+
+	if (dev == NULL)
+		return 0;
+
+	for (idx = 0; idx < ARRAY_SIZE(cksum_name); idx++) {
+		/* only types present in the supported mask are listed */
+		if ((dev->u.cli.cl_supp_cksum_types & (1 << idx)) != 0) {
+			if (dev->u.cli.cl_cksum_type != (1 << idx))
+				seq_printf(m, "%s ", cksum_name[idx]);
+			else
+				seq_printf(m, "[%s] ", cksum_name[idx]);
+		}
+	}
+	seq_printf(m, "\n");
+
+	return 0;
+}
+
+/*
+ * Select the checksum type by name.
+ *
+ * The name is copied from user space into a 10-byte buffer; longer input
+ * is -EINVAL and a failed copy is -EFAULT.  One trailing newline is
+ * stripped.  The name must match a supported type exactly, otherwise
+ * -EINVAL.  Returns 0 when there is no device and \a count on success.
+ */
+static ssize_t osc_checksum_type_seq_write(struct file *file, const char *buffer,
+				size_t count, loff_t *off)
+{
+	struct seq_file *sf = file->private_data;
+	struct obd_device *dev = sf->private;
+	int idx;
+	DECLARE_CKSUM_NAME;
+	char name[10];
+	size_t end;
+
+	if (dev == NULL)
+		return 0;
+
+	/* leave room for the terminating NUL */
+	if (count >= sizeof(name))
+		return -EINVAL;
+	if (copy_from_user(name, buffer, count))
+		return -EFAULT;
+
+	/* terminate over a trailing newline if there is one */
+	end = count;
+	if (count > 0 && name[count - 1] == '\n')
+		end = count - 1;
+	name[end] = '\0';
+
+	for (idx = 0; idx < ARRAY_SIZE(cksum_name); idx++) {
+		if ((dev->u.cli.cl_supp_cksum_types & (1 << idx)) != 0 &&
+		    strcmp(name, cksum_name[idx]) == 0) {
+			dev->u.cli.cl_cksum_type = 1 << idx;
+			return count;
+		}
+	}
+
+	return -EINVAL;
+}
+LPROC_SEQ_FOPS(osc_checksum_type);
+
+/* Show the current value of the cl_resends counter. */
+static int osc_resend_count_seq_show(struct seq_file *m, void *v)
+{
+	struct obd_device *dev = m->private;
+	struct client_obd *client = &dev->u.cli;
+
+	return seq_printf(m, "%u\n", atomic_read(&client->cl_resends));
+}
+
+/*
+ * Set the cl_resends counter.  Negative values are rejected with -EINVAL.
+ * Returns \a count on success.
+ */
+static ssize_t osc_resend_count_seq_write(struct file *file, const char *buffer,
+			       size_t count, loff_t *off)
+{
+	struct seq_file *sf = file->private_data;
+	struct obd_device *dev = sf->private;
+	int resends, err;
+
+	err = lprocfs_write_helper(buffer, count, &resends);
+	if (err)
+		return err;
+
+	if (resends < 0)
+		return -EINVAL;
+
+	atomic_set(&dev->u.cli.cl_resends, resends);
+	return count;
+}
+LPROC_SEQ_FOPS(osc_resend_count);
+
+/* Show od_contention_time of the OSC device behind this proc entry. */
+static int osc_contention_seconds_seq_show(struct seq_file *m, void *v)
+{
+	struct osc_device *osc = obd2osc_dev(m->private);
+
+	return seq_printf(m, "%u\n", osc->od_contention_time);
+}
+
+/*
+ * Parse the written value straight into od_contention_time.  Returns the
+ * parser's error if it fails, otherwise \a count.
+ */
+static ssize_t osc_contention_seconds_seq_write(struct file *file, const char *buffer,
+				     size_t count, loff_t *off)
+{
+	struct seq_file *sf = file->private_data;
+	struct obd_device *dev = sf->private;
+	struct osc_device *osc = obd2osc_dev(dev);
+	int err;
+
+	err = lprocfs_write_helper(buffer, count, &osc->od_contention_time);
+	if (err)
+		return err;
+
+	return count;
+}
+LPROC_SEQ_FOPS(osc_contention_seconds);
+
+/* Show od_lockless_truncate of the OSC device behind this proc entry. */
+static int osc_lockless_truncate_seq_show(struct seq_file *m, void *v)
+{
+	struct osc_device *osc = obd2osc_dev(m->private);
+
+	return seq_printf(m, "%u\n", osc->od_lockless_truncate);
+}
+
+/*
+ * Parse the written value straight into od_lockless_truncate.  Returns the
+ * parser's error if it fails, otherwise \a count.
+ */
+static ssize_t osc_lockless_truncate_seq_write(struct file *file, const char *buffer,
+				    size_t count, loff_t *off)
+{
+	struct seq_file *sf = file->private_data;
+	struct obd_device *dev = sf->private;
+	struct osc_device *osc = obd2osc_dev(dev);
+	int err;
+
+	err = lprocfs_write_helper(buffer, count, &osc->od_lockless_truncate);
+	if (err)
+		return err;
+
+	return count;
+}
+LPROC_SEQ_FOPS(osc_lockless_truncate);
+
+/* Show the current value of the cl_destroy_in_flight counter. */
+static int osc_destroys_in_flight_seq_show(struct seq_file *m, void *v)
+{
+	struct obd_device *dev = m->private;
+	struct client_obd *client = &dev->u.cli;
+
+	return seq_printf(m, "%u\n",
+			  atomic_read(&client->cl_destroy_in_flight));
+}
+LPROC_SEQ_FOPS_RO(osc_destroys_in_flight);
+
+/* Show max_pages_per_rpc by forwarding to the generic lprocfs helper. */
+static int osc_obd_max_pages_per_rpc_seq_show(struct seq_file *m, void *v)
+{
+	return lprocfs_obd_rd_max_pages_per_rpc(m, m->private);
+}
+
+/*
+ * Set cl_max_pages_per_rpc.
+ *
+ * A value of ONE_MB_BRW_SIZE or more is taken to be in bytes and converted
+ * to pages.  The page count is rounded up to a whole number of chunks and
+ * must end up non-zero and no larger than the connection's ocd_brw_size in
+ * pages, otherwise -ERANGE.  Returns \a count on success.
+ */
+static ssize_t osc_obd_max_pages_per_rpc_seq_write(struct file *file,
+				const char *buffer, size_t count, loff_t *off)
+{
+	struct obd_device *dev = ((struct seq_file *)file->private_data)->private;
+	struct client_obd *cli = &dev->u.cli;
+	/* address only; ocd is not read until after LPROCFS_CLIMP_CHECK() */
+	struct obd_connect_data *ocd = &cli->cl_import->imp_connect_data;
+	int chunk_mask, rc;
+	__u64 val;
+
+	rc = lprocfs_write_u64_helper(buffer, count, &val);
+	if (rc)
+		return rc;
+
+	/* if the max_pages is specified in bytes, convert to pages */
+	if (val >= ONE_MB_BRW_SIZE)
+		val >>= PAGE_CACHE_SHIFT;
+
+	LPROCFS_CLIMP_CHECK(dev);
+
+	/* mask that clears the page bits below one chunk */
+	chunk_mask = ~((1 << (cli->cl_chunkbits - PAGE_CACHE_SHIFT)) - 1);
+	/* max_pages_per_rpc must be chunk aligned */
+	val = (val + ~chunk_mask) & chunk_mask;
+	if (val == 0 || val > ocd->ocd_brw_size >> PAGE_CACHE_SHIFT) {
+		/* every exit after the check must pair with the EXIT macro */
+		LPROCFS_CLIMP_EXIT(dev);
+		return -ERANGE;
+	}
+	client_obd_list_lock(&cli->cl_loi_list_lock);
+	cli->cl_max_pages_per_rpc = val;
+	client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+	LPROCFS_CLIMP_EXIT(dev);
+	return count;
+}
+LPROC_SEQ_FOPS(osc_obd_max_pages_per_rpc);
+
+/*
+ * File operations generated from the shared lprocfs macros.  Each
+ * invocation provides the osc_<name>_fops object referenced from
+ * lprocfs_osc_obd_vars below.  Going by the macro names, the first group
+ * is read-only, "ping" is write-only, and the last group is read-write.
+ */
+LPROC_SEQ_FOPS_RO_TYPE(osc, uuid);
+LPROC_SEQ_FOPS_RO_TYPE(osc, connect_flags);
+LPROC_SEQ_FOPS_RO_TYPE(osc, blksize);
+LPROC_SEQ_FOPS_RO_TYPE(osc, kbytestotal);
+LPROC_SEQ_FOPS_RO_TYPE(osc, kbytesfree);
+LPROC_SEQ_FOPS_RO_TYPE(osc, kbytesavail);
+LPROC_SEQ_FOPS_RO_TYPE(osc, filestotal);
+LPROC_SEQ_FOPS_RO_TYPE(osc, filesfree);
+LPROC_SEQ_FOPS_RO_TYPE(osc, server_uuid);
+LPROC_SEQ_FOPS_RO_TYPE(osc, conn_uuid);
+LPROC_SEQ_FOPS_RO_TYPE(osc, timeouts);
+LPROC_SEQ_FOPS_RO_TYPE(osc, state);
+
+LPROC_SEQ_FOPS_WR_ONLY(osc, ping);
+
+LPROC_SEQ_FOPS_RW_TYPE(osc, import);
+LPROC_SEQ_FOPS_RW_TYPE(osc, pinger_recov);
+
+/*
+ * Per-device proc entries of an OSC: entry name followed by the file
+ * operations that implement it.  "ping" additionally carries an explicit
+ * 0222 value in the last field.  The table ends with { 0 }.
+ */
+static struct lprocfs_vars lprocfs_osc_obd_vars[] = {
+	{ "uuid",	     &osc_uuid_fops,	0, 0 },
+	{ "ping",	     &osc_ping_fops,    0, 0222 },
+	{ "connect_flags",   &osc_connect_flags_fops, 0, 0 },
+	{ "blocksize",       &osc_blksize_fops,     0, 0 },
+	{ "kbytestotal",     &osc_kbytestotal_fops, 0, 0 },
+	{ "kbytesfree",      &osc_kbytesfree_fops,  0, 0 },
+	{ "kbytesavail",     &osc_kbytesavail_fops, 0, 0 },
+	{ "filestotal",      &osc_filestotal_fops,  0, 0 },
+	{ "filesfree",       &osc_filesfree_fops,   0, 0 },
+	//{ "filegroups",      lprocfs_rd_filegroups,  0, 0 },
+	{ "ost_server_uuid", &osc_server_uuid_fops, 0, 0 },
+	{ "ost_conn_uuid",   &osc_conn_uuid_fops, 0, 0 },
+	{ "active",	     &osc_active_fops, 0 },
+	{ "max_pages_per_rpc", &osc_obd_max_pages_per_rpc_fops, 0 },
+	{ "max_rpcs_in_flight", &osc_max_rpcs_in_flight_fops, 0 },
+	{ "destroys_in_flight", &osc_destroys_in_flight_fops, 0, 0 },
+	{ "max_dirty_mb",    &osc_max_dirty_mb_fops, 0 },
+	{ "osc_cached_mb",   &osc_cached_mb_fops, 0 },
+	{ "cur_dirty_bytes", &osc_cur_dirty_bytes_fops, 0, 0 },
+	{ "cur_grant_bytes", &osc_cur_grant_bytes_fops, 0 },
+	{ "cur_lost_grant_bytes", &osc_cur_lost_grant_bytes_fops, 0, 0},
+	{ "grant_shrink_interval", &osc_grant_shrink_interval_fops, 0 },
+	{ "checksums",       &osc_checksum_fops, 0 },
+	{ "checksum_type",   &osc_checksum_type_fops, 0 },
+	{ "resend_count",    &osc_resend_count_fops, 0},
+	{ "timeouts",	     &osc_timeouts_fops, 0, 0 },
+	{ "contention_seconds", &osc_contention_seconds_fops, 0 },
+	{ "lockless_truncate",  &osc_lockless_truncate_fops, 0 },
+	{ "import",		&osc_import_fops, 0 },
+	{ "state",		&osc_state_fops, 0, 0 },
+	{ "pinger_recov",	&osc_pinger_recov_fops, 0 },
+	{ 0 }
+};
+
+/* Module-wide proc entries of the OSC; the table ends with { 0 }. */
+LPROC_SEQ_FOPS_RO_TYPE(osc, numrefs);
+static struct lprocfs_vars lprocfs_osc_module_vars[] = {
+	{ "num_refs",	&osc_numrefs_fops,     0, 0 },
+	{ 0 }
+};
+
+/*
+ * Integer percentage of a relative to b, or 0 when b is 0.  The arguments
+ * are parenthesized so compound expressions expand correctly.
+ */
+#define pct(a, b) ((b) ? (a) * 100 / (b) : 0)
+
+/*
+ * Print RPC statistics for one OSC: a timestamp, the in-flight and pending
+ * counters, and three read/write histograms (pages per RPC, RPCs in
+ * flight, and offset).  Each histogram row shows the bucket count, its
+ * percentage of the total and the cumulative percentage.  All of it is
+ * printed while holding cl_loi_list_lock.  Always returns 0.
+ */
+static int osc_rpc_stats_seq_show(struct seq_file *seq, void *v)
+{
+	struct timeval now;
+	struct obd_device *dev = seq->private;
+	struct client_obd *cli = &dev->u.cli;
+	unsigned long read_tot = 0, write_tot = 0, read_cum, write_cum;
+	int i;
+
+	do_gettimeofday(&now);
+
+	client_obd_list_lock(&cli->cl_loi_list_lock);
+
+	seq_printf(seq, "snapshot_time:	 %lu.%lu (secs.usecs)\n",
+		   now.tv_sec, now.tv_usec);
+	seq_printf(seq, "read RPCs in flight:  %d\n",
+		   cli->cl_r_in_flight);
+	seq_printf(seq, "write RPCs in flight: %d\n",
+		   cli->cl_w_in_flight);
+	seq_printf(seq, "pending write pages:  %d\n",
+		   atomic_read(&cli->cl_pending_w_pages));
+	seq_printf(seq, "pending read pages:   %d\n",
+		   atomic_read(&cli->cl_pending_r_pages));
+
+	/* histogram 1: pages per RPC, row label is 1 << bucket index */
+	seq_printf(seq, "\n\t\t\tread\t\t\twrite\n");
+	seq_printf(seq, "pages per rpc	 rpcs   %% cum %% |");
+	seq_printf(seq, "       rpcs   %% cum %%\n");
+
+	read_tot = lprocfs_oh_sum(&cli->cl_read_page_hist);
+	write_tot = lprocfs_oh_sum(&cli->cl_write_page_hist);
+
+	read_cum = 0;
+	write_cum = 0;
+	for (i = 0; i < OBD_HIST_MAX; i++) {
+		unsigned long r = cli->cl_read_page_hist.oh_buckets[i];
+		unsigned long w = cli->cl_write_page_hist.oh_buckets[i];
+		read_cum += r;
+		write_cum += w;
+		seq_printf(seq, "%d:\t\t%10lu %3lu %3lu   | %10lu %3lu %3lu\n",
+				 1 << i, r, pct(r, read_tot),
+				 pct(read_cum, read_tot), w,
+				 pct(w, write_tot),
+				 pct(write_cum, write_tot));
+		/* stop once every sample has been accounted for */
+		if (read_cum == read_tot && write_cum == write_tot)
+			break;
+	}
+
+	/* histogram 2: RPCs in flight, row label is the bucket index */
+	seq_printf(seq, "\n\t\t\tread\t\t\twrite\n");
+	seq_printf(seq, "rpcs in flight	rpcs   %% cum %% |");
+	seq_printf(seq, "       rpcs   %% cum %%\n");
+
+	read_tot = lprocfs_oh_sum(&cli->cl_read_rpc_hist);
+	write_tot = lprocfs_oh_sum(&cli->cl_write_rpc_hist);
+
+	read_cum = 0;
+	write_cum = 0;
+	for (i = 0; i < OBD_HIST_MAX; i++) {
+		unsigned long r = cli->cl_read_rpc_hist.oh_buckets[i];
+		unsigned long w = cli->cl_write_rpc_hist.oh_buckets[i];
+		read_cum += r;
+		write_cum += w;
+		seq_printf(seq, "%d:\t\t%10lu %3lu %3lu   | %10lu %3lu %3lu\n",
+				 i, r, pct(r, read_tot),
+				 pct(read_cum, read_tot), w,
+				 pct(w, write_tot),
+				 pct(write_cum, write_tot));
+		if (read_cum == read_tot && write_cum == write_tot)
+			break;
+	}
+
+	/* histogram 3: offset, row label is 0 and then 1 << (index - 1) */
+	seq_printf(seq, "\n\t\t\tread\t\t\twrite\n");
+	seq_printf(seq, "offset		rpcs   %% cum %% |");
+	seq_printf(seq, "       rpcs   %% cum %%\n");
+
+	read_tot = lprocfs_oh_sum(&cli->cl_read_offset_hist);
+	write_tot = lprocfs_oh_sum(&cli->cl_write_offset_hist);
+
+	read_cum = 0;
+	write_cum = 0;
+	for (i = 0; i < OBD_HIST_MAX; i++) {
+		unsigned long r = cli->cl_read_offset_hist.oh_buckets[i];
+		unsigned long w = cli->cl_write_offset_hist.oh_buckets[i];
+		read_cum += r;
+		write_cum += w;
+		seq_printf(seq, "%d:\t\t%10lu %3lu %3lu   | %10lu %3lu %3lu\n",
+			   (i == 0) ? 0 : 1 << (i - 1),
+			   r, pct(r, read_tot), pct(read_cum, read_tot),
+			   w, pct(w, write_tot), pct(write_cum, write_tot));
+		if (read_cum == read_tot && write_cum == write_tot)
+			break;
+	}
+
+	client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+	return 0;
+}
+#undef pct
+
+/*
+ * Any write clears all six RPC histograms (RPCs in flight, pages per RPC
+ * and offset, each for read and write).  The written data is ignored and
+ * \a len is returned.
+ */
+static ssize_t osc_rpc_stats_seq_write(struct file *file, const char *buf,
+				       size_t len, loff_t *off)
+{
+	struct seq_file *sf = file->private_data;
+	struct obd_device *obd = sf->private;
+	struct client_obd *client = &obd->u.cli;
+
+	lprocfs_oh_clear(&client->cl_read_rpc_hist);
+	lprocfs_oh_clear(&client->cl_write_rpc_hist);
+
+	lprocfs_oh_clear(&client->cl_read_page_hist);
+	lprocfs_oh_clear(&client->cl_write_page_hist);
+
+	lprocfs_oh_clear(&client->cl_read_offset_hist);
+	lprocfs_oh_clear(&client->cl_write_offset_hist);
+
+	return len;
+}
+
+LPROC_SEQ_FOPS(osc_rpc_stats);
+
+/*
+ * Print a timestamp followed by the lockless write, read and truncate
+ * counters kept in the OSC device's od_stats.  Always returns 0.
+ */
+static int osc_stats_seq_show(struct seq_file *seq, void *v)
+{
+	struct obd_device *obd = seq->private;
+	struct osc_stats *st = &obd2osc_dev(obd)->od_stats;
+	struct timeval tv;
+
+	do_gettimeofday(&tv);
+
+	seq_printf(seq, "snapshot_time:	 %lu.%lu (secs.usecs)\n",
+		   tv.tv_sec, tv.tv_usec);
+	seq_printf(seq, "lockless_write_bytes\t\t"LPU64"\n",
+		   st->os_lockless_writes);
+	seq_printf(seq, "lockless_read_bytes\t\t"LPU64"\n",
+		   st->os_lockless_reads);
+	seq_printf(seq, "lockless_truncate\t\t"LPU64"\n",
+		   st->os_lockless_truncates);
+
+	return 0;
+}
+
+/*
+ * Any write zeroes the OSC device's od_stats.  The written data is ignored
+ * and \a len is returned.
+ */
+static ssize_t osc_stats_seq_write(struct file *file, const char *buf,
+				   size_t len, loff_t *off)
+{
+	struct seq_file *sf = file->private_data;
+	struct osc_device *osc = obd2osc_dev(sf->private);
+
+	memset(&osc->od_stats, 0, sizeof(osc->od_stats));
+
+	return len;
+}
+
+LPROC_SEQ_FOPS(osc_stats);
+
+/*
+ * Create the "osc_stats" and "rpc_stats" proc files (mode 0644) for
+ * \a dev.  "rpc_stats" is only attempted when "osc_stats" was created.
+ * Returns 0 on success or the first error.
+ */
+int lproc_osc_attach_seqstat(struct obd_device *dev)
+{
+	int err;
+
+	err = lprocfs_seq_create(dev->obd_proc_entry, "osc_stats", 0644,
+				 &osc_stats_fops, dev);
+	if (err != 0)
+		return err;
+
+	return lprocfs_obd_seq_create(dev, "rpc_stats", 0644,
+				      &osc_rpc_stats_fops, dev);
+}
+
+/* Point \a lvars at the OSC per-device and module-wide proc tables. */
+void lprocfs_osc_init_vars(struct lprocfs_static_vars *lvars)
+{
+	lvars->obd_vars = lprocfs_osc_obd_vars;
+	lvars->module_vars = lprocfs_osc_module_vars;
+}
+#endif /* LPROCFS */

diff --git a/drivers/staging/lustre/lustre/osc/osc_cache.c b/drivers/staging/lustre/lustre/osc/osc_cache.c
new file mode 100644
index 0000000..0a0ec6f
--- /dev/null
+++ b/drivers/staging/lustre/lustre/osc/osc_cache.c

@@ -0,0 +1,2916 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ *
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * osc cache management.
+ *
+ * Author: Jinshan Xiong <jinshan.xiong@whamcloud.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_OSC
+
+#include "osc_cl_internal.h"
+#include "osc_internal.h"
+
+static int extent_debug; /* set non-zero to enable the costlier page-list and overlap checks */
+
+static void osc_update_pending(struct osc_object *obj, int cmd, int delta);
+static int osc_extent_wait(const struct lu_env *env, struct osc_extent *ext,
+			   int state);
+static void osc_ap_completion(const struct lu_env *env, struct client_obd *cli,
+			      struct osc_async_page *oap, int sent, int rc);
+static int osc_make_ready(const struct lu_env *env, struct osc_async_page *oap,
+			  int cmd);
+static int osc_refresh_count(const struct lu_env *env,
+			     struct osc_async_page *oap, int cmd);
+static int osc_io_unplug_async(const struct lu_env *env,
+			       struct client_obd *cli, struct osc_object *osc);
+static void osc_free_grant(struct client_obd *cli, unsigned int nr_pages,
+			   unsigned int lost_grant);
+
+static void osc_extent_tree_dump0(int level, struct osc_object *obj,
+				  const char *func, int line);
+#define osc_extent_tree_dump(lvl, obj) \
+	osc_extent_tree_dump0(lvl, obj, __func__, __LINE__)
+
+/** \addtogroup osc
+ *  @{
+ */
+
+/* ------------------ osc extent ------------------ */
+/* Render the flag bits of @ext as a NUL-terminated string in @flags and
+ * return @flags. The first character is 'r' or 'w' depending on oe_rw; each
+ * further flag that is set appends one letter. At most eight letters plus
+ * the terminator are stored, so @flags must have room for nine bytes. */
+static inline char *ext_flags(struct osc_extent *ext, char *flags)
+{
+	int n = 0;
+
+	flags[n++] = ext->oe_rw ? 'r' : 'w';
+	if (ext->oe_intree)
+		flags[n++] = 'i';
+	if (ext->oe_srvlock)
+		flags[n++] = 's';
+	if (ext->oe_hp)
+		flags[n++] = 'h';
+	if (ext->oe_urgent)
+		flags[n++] = 'u';
+	if (ext->oe_memalloc)
+		flags[n++] = 'm';
+	if (ext->oe_trunc_pending)
+		flags[n++] = 't';
+	if (ext->oe_fsync_wait)
+		flags[n++] = 'Y';
+	flags[n] = 0;
+
+	return flags;
+}
+
+/* One-character summary of a list for debug output: '-' when @list is
+ * empty, '+' when it has at least one entry. */
+static inline char list_empty_marker(struct list_head *list)
+{
+	if (list_empty(list))
+		return '-';
+
+	return '+';
+}
+
+/* printf-style format (EXTSTR) and matching argument list (EXTPARA) for an
+ * extent's page range, printed as "[start -> end/max_end]". */
+#define EXTSTR       "[%lu -> %lu/%lu]"
+#define EXTPARA(ext) (ext)->oe_start, (ext)->oe_end, (ext)->oe_max_end
+/* Printable names indexed by oe_state; the list is NULL-terminated. */
+static const char *oes_strings[] = {
+	"inv", "active", "cache", "locking", "lockdone", "rpc", "trunc", NULL };
+
+/*
+ * Log one line describing @extent at debug level @lvl via CDEBUG, followed
+ * by the caller's own @fmt and arguments. @extent is evaluated once.
+ *
+ * NOTE(review): oe_state is used to index oes_strings[] without a range
+ * check, and osc_extent_sanity_check0() invokes this macro after detecting
+ * oe_state >= OES_STATE_MAX -- confirm the table is large enough for that.
+ */
+#define OSC_EXTENT_DUMP(lvl, extent, fmt, ...) do {			      \
+	struct osc_extent *__ext = (extent);				      \
+	char __buf[16];							      \
+									      \
+	CDEBUG(lvl,							      \
+		"extent %p@{" EXTSTR ", "				      \
+		"[%d|%d|%c|%s|%s|%p], [%d|%d|%c|%c|%p|%u|%p]} " fmt,	      \
+		/* ----- part 0: address and page range ----- */	      \
+		__ext, EXTPARA(__ext),					      \
+		/* ----- part 1: refc|users|on-list|state|flags|obj ----- */  \
+		atomic_read(&__ext->oe_refc),			      \
+		atomic_read(&__ext->oe_users),			      \
+		list_empty_marker(&__ext->oe_link),			      \
+		oes_strings[__ext->oe_state], ext_flags(__ext, __buf),	      \
+		__ext->oe_obj,						      \
+		/* ----- part 2: grants|pages|has-pages|waiters|... ----- */  \
+		__ext->oe_grants, __ext->oe_nr_pages,			      \
+		list_empty_marker(&__ext->oe_pages),			      \
+		waitqueue_active(&__ext->oe_waitq) ? '+' : '-',		      \
+		__ext->oe_osclock, __ext->oe_mppr, __ext->oe_owner,	      \
+		/* ----- part 3: arguments for the caller's fmt ----- */      \
+		## __VA_ARGS__);					      \
+} while (0)
+
+/*
+ * Extent-aware assertion: when @expr is false, dump @ext (with the caller's
+ * message) and the whole extent tree of its object at D_ERROR, then trip
+ * LASSERT(). Note that @expr is evaluated a second time by the LASSERT.
+ */
+#undef EASSERTF
+#define EASSERTF(expr, ext, fmt, args...) do {				\
+	if (!(expr)) {							\
+		OSC_EXTENT_DUMP(D_ERROR, (ext), fmt, ##args);		\
+		osc_extent_tree_dump(D_ERROR, (ext)->oe_obj);		\
+		LASSERT(expr);						\
+	}								\
+} while (0)
+
+/* EASSERTF() without an extra message: only a newline ends the dump line. */
+#undef EASSERT
+#define EASSERT(expr, ext) EASSERTF(expr, ext, "\n")
+
+/* Map an rbtree node to the osc_extent embedding it; NULL maps to NULL. */
+static inline struct osc_extent *rb_extent(struct rb_node *n)
+{
+	return n != NULL ? container_of(n, struct osc_extent, oe_node) : NULL;
+}
+
+/* Return the extent following @ext in tree order, or NULL when @ext is NULL
+ * or is the last one. A non-NULL @ext must currently be in the tree. */
+static inline struct osc_extent *next_extent(struct osc_extent *ext)
+{
+	struct rb_node *succ;
+
+	if (ext == NULL)
+		return NULL;
+
+	LASSERT(ext->oe_intree);
+	succ = rb_next(&ext->oe_node);
+
+	return rb_extent(succ);
+}
+
+/* Return the extent preceding @ext in tree order, or NULL when @ext is NULL
+ * or is the first one. A non-NULL @ext must currently be in the tree. */
+static inline struct osc_extent *prev_extent(struct osc_extent *ext)
+{
+	struct rb_node *pred;
+
+	if (ext == NULL)
+		return NULL;
+
+	LASSERT(ext->oe_intree);
+	pred = rb_prev(&ext->oe_node);
+
+	return rb_extent(pred);
+}
+
+/* Return the first extent in @obj's tree, or NULL when the tree is empty. */
+static inline struct osc_extent *first_extent(struct osc_object *obj)
+{
+	struct rb_node *leftmost = rb_first(&obj->oo_root);
+
+	return rb_extent(leftmost);
+}
+
+/* object must be locked by caller.
+ *
+ * Check the internal consistency of @ext. Returns 0 when every check
+ * passes; otherwise returns a positive code naming the first check that
+ * failed (9, 10, 20, ... 120) after dumping the extent at D_ERROR together
+ * with @func and @line, which identify the call site.
+ *
+ * The page list is walked only when extent_debug is set and the extent is
+ * not past OES_CACHE. */
+static int osc_extent_sanity_check0(struct osc_extent *ext,
+				    const char *func, const int line)
+{
+	struct osc_object *obj = ext->oe_obj;
+	struct osc_async_page *oap;
+	int page_count;
+	int rc = 0;
+
+	if (!osc_object_is_locked(obj))
+		GOTO(out, rc = 9);
+
+	if (ext->oe_state >= OES_STATE_MAX)
+		GOTO(out, rc = 10);
+
+	if (atomic_read(&ext->oe_refc) <= 0)
+		GOTO(out, rc = 20);
+
+	if (atomic_read(&ext->oe_refc) < atomic_read(&ext->oe_users))
+		GOTO(out, rc = 30);
+
+	switch (ext->oe_state) {
+	case OES_INV:
+		/* an invalid extent must not own pages; nothing else applies */
+		if (ext->oe_nr_pages > 0 || !list_empty(&ext->oe_pages))
+			GOTO(out, rc = 35);
+		GOTO(out, rc = 0);
+		break;
+	case OES_ACTIVE:
+		if (atomic_read(&ext->oe_users) == 0)
+			GOTO(out, rc = 40);
+		if (ext->oe_hp)
+			GOTO(out, rc = 50);
+		if (ext->oe_fsync_wait && !ext->oe_urgent)
+			GOTO(out, rc = 55);
+		break;
+	case OES_CACHE:
+		if (ext->oe_grants == 0)
+			GOTO(out, rc = 60);
+		if (ext->oe_fsync_wait && !ext->oe_urgent && !ext->oe_hp)
+			GOTO(out, rc = 65);
+		/* no break here: execution continues into the users check
+		 * below. NOTE(review): looks intentional -- confirm. */
+	default:
+		if (atomic_read(&ext->oe_users) > 0)
+			GOTO(out, rc = 70);
+	}
+
+	if (ext->oe_max_end < ext->oe_end || ext->oe_end < ext->oe_start)
+		GOTO(out, rc = 80);
+
+	if (ext->oe_osclock == NULL && ext->oe_grants > 0)
+		GOTO(out, rc = 90);
+
+	if (ext->oe_osclock) {
+		/* the lock must cover the whole [oe_start, oe_max_end] range */
+		struct cl_lock_descr *descr;
+		descr = &ext->oe_osclock->cll_descr;
+		if (!(descr->cld_start <= ext->oe_start &&
+		      descr->cld_end >= ext->oe_max_end))
+			GOTO(out, rc = 100);
+	}
+
+	if (ext->oe_nr_pages > ext->oe_mppr)
+		GOTO(out, rc = 105);
+
+	/* Do not verify page list if extent is in RPC. This is because an
+	 * in-RPC extent is supposed to be exclusively accessible w/o lock. */
+	if (ext->oe_state > OES_CACHE)
+		GOTO(out, rc = 0);
+
+	if (!extent_debug)
+		GOTO(out, rc = 0);
+
+	/* every page must lie inside the extent and the count must match */
+	page_count = 0;
+	list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) {
+		pgoff_t index = oap2cl_page(oap)->cp_index;
+		++page_count;
+		if (index > ext->oe_end || index < ext->oe_start)
+			GOTO(out, rc = 110);
+	}
+	if (page_count != ext->oe_nr_pages)
+		GOTO(out, rc = 120);
+
+out:
+	if (rc != 0)
+		OSC_EXTENT_DUMP(D_ERROR, ext,
+				"%s:%d sanity check %p failed with rc = %d\n",
+				func, line, ext, rc);
+	return rc;
+}
+
+/* Run the sanity check, tagging it with the caller's function and line.
+ * The caller must already hold the object lock. */
+#define sanity_check_nolock(ext) \
+	osc_extent_sanity_check0(ext, __func__, __LINE__)
+
+/* Same check for callers that do not hold the object lock: the lock is
+ * taken around the check and released afterwards. Evaluates to the check's
+ * return code. Note that @ext is expanded more than once. */
+#define sanity_check(ext) ({						   \
+	int __res;							     \
+	osc_object_lock((ext)->oe_obj);					\
+	__res = sanity_check_nolock(ext);				      \
+	osc_object_unlock((ext)->oe_obj);				      \
+	__res;								 \
+})
+
+
+/**
+ * Debug-only consistency check: report whether any other extent in the tree
+ * of @obj shares at least one page index with @ext.
+ *
+ * Returns 1 on overlap and 0 otherwise. The tree is scanned only when
+ * extent_debug is set; without it 0 is returned. The caller must hold the
+ * object lock.
+ */
+static int osc_extent_is_overlapped(struct osc_object *obj,
+				    struct osc_extent *ext)
+{
+	struct osc_extent *it;
+
+	LASSERT(osc_object_is_locked(obj));
+
+	if (!extent_debug)
+		return 0;
+
+	it = first_extent(obj);
+	while (it != NULL) {
+		/* ranges intersect unless one ends before the other starts */
+		if (it != ext &&
+		    !(it->oe_end < ext->oe_start ||
+		      it->oe_start > ext->oe_end))
+			return 1;
+		it = next_extent(it);
+	}
+
+	return 0;
+}
+
+/* Move @ext to @state and wake every waiter on its wait queue.
+ * The object lock must be held and @state must be a valid OES_* value. */
+static void osc_extent_state_set(struct osc_extent *ext, int state)
+{
+	LASSERT(osc_object_is_locked(ext->oe_obj));
+	LASSERT(state >= OES_INV && state < OES_STATE_MAX);
+
+	/* Never try to sanity check a state changing extent :-) */
+	/* LASSERT(sanity_check_nolock(ext) == 0); */
+
+	/* TODO: validate the state machine */
+	ext->oe_state = state;
+	wake_up_all(&ext->oe_waitq);
+}
+
+/* Allocate an extent for @obj from osc_extent_kmem. The new extent holds
+ * one reference, has no users, is in state OES_INV, is on no list and has
+ * no lock attached. Returns NULL when the allocation fails.
+ *
+ * NOTE(review): other fields (oe_intree, oe_grants, flag bits, ...) are not
+ * set here; presumably the allocation macro zero-fills -- confirm. */
+static struct osc_extent *osc_extent_alloc(struct osc_object *obj)
+{
+	struct osc_extent *ext;
+
+	OBD_SLAB_ALLOC_PTR_GFP(ext, osc_extent_kmem, GFP_IOFS);
+	if (ext == NULL)
+		return NULL;
+
+	RB_CLEAR_NODE(&ext->oe_node);
+	ext->oe_obj = obj;
+	atomic_set(&ext->oe_refc, 1);
+	atomic_set(&ext->oe_users, 0);
+	INIT_LIST_HEAD(&ext->oe_link);
+	ext->oe_state = OES_INV;
+	INIT_LIST_HEAD(&ext->oe_pages);
+	init_waitqueue_head(&ext->oe_waitq);
+	ext->oe_osclock = NULL;
+
+	return ext;
+}
+
+/* Return @ext to osc_extent_kmem. No state is checked here; see
+ * osc_extent_put() for the conditions asserted before freeing. */
+static void osc_extent_free(struct osc_extent *ext)
+{
+	OBD_SLAB_FREE_PTR(ext, osc_extent_kmem);
+}
+
+/* Take one more reference on @ext and return @ext for call chaining. */
+static struct osc_extent *osc_extent_get(struct osc_extent *ext)
+{
+	LASSERT(atomic_read(&ext->oe_refc) >= 0);
+	atomic_inc(&ext->oe_refc);
+	return ext;
+}
+
+/* Drop one reference on @ext. When it was the last one, the extent must be
+ * off any list, have no users, be in OES_INV and be out of the tree; its
+ * lock reference (if any) is released with @env and the extent is freed. */
+static void osc_extent_put(const struct lu_env *env, struct osc_extent *ext)
+{
+	LASSERT(atomic_read(&ext->oe_refc) > 0);
+	if (atomic_dec_and_test(&ext->oe_refc)) {
+		LASSERT(list_empty(&ext->oe_link));
+		LASSERT(atomic_read(&ext->oe_users) == 0);
+		LASSERT(ext->oe_state == OES_INV);
+		LASSERT(!ext->oe_intree);
+
+		if (ext->oe_osclock) {
+			cl_lock_put(env, ext->oe_osclock);
+			ext->oe_osclock = NULL;
+		}
+		osc_extent_free(ext);
+	}
+}
+
+/**
+ * osc_extent_put_trust() is a special version of osc_extent_put() for use
+ * when it is known that the caller is not the last user, so the extent can
+ * never be freed here. This works around callers that have no lu_env
+ * available ;-). The object lock must be held.
+ */
+static void osc_extent_put_trust(struct osc_extent *ext)
+{
+	LASSERT(atomic_read(&ext->oe_refc) > 1);
+	LASSERT(osc_object_is_locked(ext->oe_obj));
+	atomic_dec(&ext->oe_refc);
+}
+
+/**
+ * Look up page offset @index in the extent tree of @obj.
+ *
+ * Returns the extent whose [oe_start, oe_end] range contains @index. When
+ * there is none, returns the last extent visited on the way down that ends
+ * before @index, or NULL if no such extent was seen. No reference is taken.
+ * The caller must hold the object lock.
+ */
+static struct osc_extent *osc_extent_search(struct osc_object *obj,
+					    pgoff_t index)
+{
+	struct rb_node    *node = obj->oo_root.rb_node;
+	struct osc_extent *best = NULL;
+
+	LASSERT(osc_object_is_locked(obj));
+	while (node != NULL) {
+		struct osc_extent *here = rb_extent(node);
+
+		if (index >= here->oe_start && index <= here->oe_end)
+			return here;
+
+		if (index < here->oe_start) {
+			node = node->rb_left;
+		} else {
+			/* @here ends before @index: remember it, go right */
+			best = here;
+			node = node->rb_right;
+		}
+	}
+
+	return best;
+}
+
+/*
+ * Return the extent covering @index with an extra reference taken, or NULL
+ * when no extent covers it. The caller must hold the object lock.
+ */
+static struct osc_extent *osc_extent_lookup(struct osc_object *obj,
+					    pgoff_t index)
+{
+	struct osc_extent *hit = osc_extent_search(obj, index);
+
+	if (hit == NULL)
+		return NULL;
+
+	/* the search may hand back a preceding extent instead of a match */
+	if (index < hit->oe_start || index > hit->oe_end)
+		return NULL;
+
+	return osc_extent_get(hit);
+}
+
+/* caller must have held object lock.
+ *
+ * Link @ext into the extent tree of @obj, ordered by page range. The tree
+ * takes its own reference on @ext and oe_intree is set. @ext must belong to
+ * @obj, must not already be in the tree and must not intersect any extent
+ * already there (an intersection trips EASSERTF). */
+static void osc_extent_insert(struct osc_object *obj, struct osc_extent *ext)
+{
+	struct rb_node   **n      = &obj->oo_root.rb_node;
+	struct rb_node    *parent = NULL;
+	struct osc_extent *tmp;
+
+	LASSERT(ext->oe_intree == 0);
+	LASSERT(ext->oe_obj == obj);
+	LASSERT(osc_object_is_locked(obj));
+	while (*n != NULL) {
+		tmp = rb_extent(*n);
+		parent = *n;
+
+		if (ext->oe_end < tmp->oe_start)
+			n = &(*n)->rb_left;
+		else if (ext->oe_start > tmp->oe_end)
+			n = &(*n)->rb_right;
+		else
+			/* NOTE(review): n is not advanced on this path, so
+			 * the loop only ends if the assertion is fatal --
+			 * confirm LASSERT never returns. */
+			EASSERTF(0, tmp, EXTSTR, EXTPARA(ext));
+	}
+	rb_link_node(&ext->oe_node, parent, n);
+	rb_insert_color(&ext->oe_node, &obj->oo_root);
+	osc_extent_get(ext);
+	ext->oe_intree = 1;
+}
+
+/* caller must have held object lock.
+ *
+ * Unlink @ext from its object's extent tree and drop the tree's reference.
+ * Does nothing when @ext is not in the tree. */
+static void osc_extent_erase(struct osc_extent *ext)
+{
+	struct osc_object *obj = ext->oe_obj;
+	LASSERT(osc_object_is_locked(obj));
+	if (ext->oe_intree) {
+		rb_erase(&ext->oe_node, &obj->oo_root);
+		ext->oe_intree = 0;
+		/* rbtree held a refcount */
+		osc_extent_put_trust(ext);
+	}
+}
+
+/* Register one more user of @ext and return it with an extra reference.
+ * A cached extent becomes OES_ACTIVE again and its pages are subtracted
+ * from the object's pending-write accounting. The extent is also taken off
+ * whatever list oe_link was on. The object lock must be held and @ext must
+ * be in OES_ACTIVE or OES_CACHE. */
+static struct osc_extent *osc_extent_hold(struct osc_extent *ext)
+{
+	struct osc_object *obj = ext->oe_obj;
+
+	LASSERT(osc_object_is_locked(obj));
+	LASSERT(ext->oe_state == OES_ACTIVE || ext->oe_state == OES_CACHE);
+	if (ext->oe_state == OES_CACHE) {
+		osc_extent_state_set(ext, OES_ACTIVE);
+		osc_update_pending(obj, OBD_BRW_WRITE, -ext->oe_nr_pages);
+	}
+	atomic_inc(&ext->oe_users);
+	list_del_init(&ext->oe_link);
+	return osc_extent_get(ext);
+}
+
+/* Take a page-less extent out of the tree and off its list, and mark it
+ * OES_INV (which also wakes any waiters). The object lock must be held.
+ * The caller must hold a reference besides the tree's own, because
+ * osc_extent_erase() drops the tree reference with osc_extent_put_trust(). */
+static void __osc_extent_remove(struct osc_extent *ext)
+{
+	LASSERT(osc_object_is_locked(ext->oe_obj));
+	LASSERT(list_empty(&ext->oe_pages));
+	osc_extent_erase(ext);
+	list_del_init(&ext->oe_link);
+	osc_extent_state_set(ext, OES_INV);
+	OSC_EXTENT_DUMP(D_CACHE, ext, "destroyed.\n");
+}
+
+/* Locked wrapper: run __osc_extent_remove() on @ext under the object lock. */
+static void osc_extent_remove(struct osc_extent *ext)
+{
+	struct osc_object *obj = ext->oe_obj;
+
+	osc_object_lock(obj);
+	__osc_extent_remove(ext);
+	osc_object_unlock(obj);
+}
+
+/**
+ * This function is used to merge extents to get better performance. It checks
+ * if @cur and @victim are contiguous at chunk level and, if so, folds
+ * @victim's range, grants, pages and urgent/memalloc bits into @cur and
+ * removes @victim from the tree.
+ *
+ * @cur must be in OES_CACHE and the object lock must be held.
+ *
+ * Returns 0 on success, -EINVAL when @victim is NULL, -EBUSY when @victim
+ * is not in OES_CACHE or has an fsync waiter, and -ERANGE when the two
+ * extents have different oe_max_end or are not in adjacent chunks.
+ */
+static int osc_extent_merge(const struct lu_env *env, struct osc_extent *cur,
+			    struct osc_extent *victim)
+{
+	struct osc_object *obj = cur->oe_obj;
+	pgoff_t chunk_start;
+	pgoff_t chunk_end;
+	int ppc_bits;
+
+	LASSERT(cur->oe_state == OES_CACHE);
+	LASSERT(osc_object_is_locked(obj));
+	if (victim == NULL)
+		return -EINVAL;
+
+	if (victim->oe_state != OES_CACHE || victim->oe_fsync_wait)
+		return -EBUSY;
+
+	if (cur->oe_max_end != victim->oe_max_end)
+		return -ERANGE;
+
+	LASSERT(cur->oe_osclock == victim->oe_osclock);
+	ppc_bits = osc_cli(obj)->cl_chunkbits - PAGE_CACHE_SHIFT;
+	chunk_start = cur->oe_start >> ppc_bits;
+	chunk_end   = cur->oe_end   >> ppc_bits;
+	/* victim must end in the chunk right before cur, or start in the
+	 * chunk right after it */
+	if (chunk_start   != (victim->oe_end >> ppc_bits) + 1 &&
+	    chunk_end + 1 != victim->oe_start >> ppc_bits)
+		return -ERANGE;
+
+	OSC_EXTENT_DUMP(D_CACHE, victim, "will be merged by %p.\n", cur);
+
+	cur->oe_start     = min(cur->oe_start, victim->oe_start);
+	cur->oe_end       = max(cur->oe_end,   victim->oe_end);
+	cur->oe_grants   += victim->oe_grants;
+	cur->oe_nr_pages += victim->oe_nr_pages;
+	/* only the following bits are needed to merge */
+	cur->oe_urgent   |= victim->oe_urgent;
+	cur->oe_memalloc |= victim->oe_memalloc;
+	list_splice_init(&victim->oe_pages, &cur->oe_pages);
+	list_del_init(&victim->oe_link);
+	victim->oe_nr_pages = 0;
+
+	/* hold a temporary reference so that dropping the tree's reference
+	 * inside __osc_extent_remove() is never the last put */
+	osc_extent_get(victim);
+	__osc_extent_remove(victim);
+	osc_extent_put(env, victim);
+
+	OSC_EXTENT_DUMP(D_CACHE, cur, "after merging %p.\n", victim);
+	return 0;
+}
+
+/**
+ * Drop user count of osc_extent, and unplug IO asynchronously.
+ *
+ * When the last user goes away the extent leaves OES_ACTIVE: it becomes
+ * OES_TRUNC if a truncate is pending, otherwise OES_CACHE, in which case its
+ * pages are added to the pending-write accounting, a merge with both tree
+ * neighbours is attempted and an urgent extent is queued on oo_urgent_exts.
+ * The caller's reference on @ext is dropped in every case.
+ *
+ * Always returns 0.
+ */
+int osc_extent_release(const struct lu_env *env, struct osc_extent *ext)
+{
+	struct osc_object *obj = ext->oe_obj;
+	int rc = 0;
+	ENTRY;
+
+	LASSERT(atomic_read(&ext->oe_users) > 0);
+	LASSERT(sanity_check(ext) == 0);
+	LASSERT(ext->oe_grants > 0);
+
+	/* the object lock is taken only when the user count reaches zero */
+	if (atomic_dec_and_lock(&ext->oe_users, &obj->oo_lock)) {
+		LASSERT(ext->oe_state == OES_ACTIVE);
+		if (ext->oe_trunc_pending) {
+			/* a truncate process is waiting for this extent.
+			 * This may happen due to a race, check
+			 * osc_cache_truncate_start(). */
+			osc_extent_state_set(ext, OES_TRUNC);
+			ext->oe_trunc_pending = 0;
+		} else {
+			osc_extent_state_set(ext, OES_CACHE);
+			osc_update_pending(obj, OBD_BRW_WRITE,
+					   ext->oe_nr_pages);
+
+			/* try to merge the previous and next extent. */
+			osc_extent_merge(env, ext, prev_extent(ext));
+			osc_extent_merge(env, ext, next_extent(ext));
+
+			if (ext->oe_urgent)
+				list_move_tail(&ext->oe_link,
+						   &obj->oo_urgent_exts);
+		}
+		osc_object_unlock(obj);
+
+		osc_io_unplug_async(env, osc_cli(obj), obj);
+	}
+	osc_extent_put(env, ext);
+	RETURN(rc);
+}
+
+/* Non-zero when the page ranges of @ex1 and @ex2 share at least one index. */
+static inline int overlapped(struct osc_extent *ex1, struct osc_extent *ex2)
+{
+	return ex1->oe_end >= ex2->oe_start && ex2->oe_end >= ex1->oe_start;
+}
+
+/**
+ * Find or create an extent which includes @index, core function to manage
+ * extent tree.
+ *
+ * A candidate one-chunk extent is built first. The tree is then scanned
+ * under the object lock for an extent covered by the same lock that either
+ * already contains the candidate or can be grown by one chunk (front or
+ * rear) to contain it; failing that, the candidate itself is inserted.
+ * If the containing extent is busy (past OES_CACHE or waited on by fsync),
+ * the function waits for it to become OES_INV and restarts the scan.
+ *
+ * @grants is in/out: the grant consumed by a new chunk or a new extent is
+ * subtracted from it, and the extent tax is given back when a rear merge
+ * absorbs the following extent.
+ *
+ * Returns the extent, held through osc_extent_hold(), or an ERR_PTR():
+ * -ENOMEM when the candidate cannot be allocated, or the error returned by
+ * osc_extent_wait().
+ */
+struct osc_extent *osc_extent_find(const struct lu_env *env,
+				   struct osc_object *obj, pgoff_t index,
+				   int *grants)
+
+{
+	struct client_obd *cli = osc_cli(obj);
+	struct cl_lock    *lock;
+	struct osc_extent *cur;
+	struct osc_extent *ext;
+	struct osc_extent *conflict = NULL;
+	struct osc_extent *found = NULL;
+	pgoff_t    chunk;
+	pgoff_t    max_end;
+	int	max_pages; /* max_pages_per_rpc */
+	int	chunksize;
+	int	ppc_bits; /* pages per chunk bits */
+	int	chunk_mask;
+	int	rc;
+	ENTRY;
+
+	cur = osc_extent_alloc(obj);
+	if (cur == NULL)
+		RETURN(ERR_PTR(-ENOMEM));
+
+	lock = cl_lock_at_pgoff(env, osc2cl(obj), index, NULL, 1, 0);
+	LASSERT(lock != NULL);
+	LASSERT(lock->cll_descr.cld_mode >= CLM_WRITE);
+
+	LASSERT(cli->cl_chunkbits >= PAGE_CACHE_SHIFT);
+	ppc_bits   = cli->cl_chunkbits - PAGE_CACHE_SHIFT;
+	chunk_mask = ~((1 << ppc_bits) - 1);
+	chunksize  = 1 << cli->cl_chunkbits;
+	chunk      = index >> ppc_bits;
+
+	/* align end to rpc edge, rpc size may not be a power 2 integer. */
+	max_pages = cli->cl_max_pages_per_rpc;
+	LASSERT((max_pages & ~chunk_mask) == 0);
+	max_end = index - (index % max_pages) + max_pages - 1;
+	max_end = min_t(pgoff_t, max_end, lock->cll_descr.cld_end);
+
+	/* initialize new extent by parameters so far */
+	cur->oe_max_end = max_end;
+	cur->oe_start   = index & chunk_mask;
+	cur->oe_end     = ((index + ~chunk_mask + 1) & chunk_mask) - 1;
+	if (cur->oe_start < lock->cll_descr.cld_start)
+		cur->oe_start = lock->cll_descr.cld_start;
+	if (cur->oe_end > max_end)
+		cur->oe_end = max_end;
+	/* the lock reference is now owned by cur; osc_extent_put() releases
+	 * it when cur is freed */
+	cur->oe_osclock = lock;
+	cur->oe_grants  = 0;
+	cur->oe_mppr    = max_pages;
+
+	/* grants has been allocated by caller */
+	LASSERTF(*grants >= chunksize + cli->cl_extent_tax,
+		 "%u/%u/%u.\n", *grants, chunksize, cli->cl_extent_tax);
+	LASSERTF((max_end - cur->oe_start) < max_pages, EXTSTR, EXTPARA(cur));
+
+restart:
+	osc_object_lock(obj);
+	ext = osc_extent_search(obj, cur->oe_start);
+	if (ext == NULL)
+		ext = first_extent(obj);
+	while (ext != NULL) {
+		loff_t ext_chk_start = ext->oe_start >> ppc_bits;
+		loff_t ext_chk_end   = ext->oe_end   >> ppc_bits;
+
+		LASSERT(sanity_check_nolock(ext) == 0);
+		if (chunk > ext_chk_end + 1)
+			break;
+
+		/* if covering by different locks, no chance to match */
+		if (lock != ext->oe_osclock) {
+			EASSERTF(!overlapped(ext, cur), ext,
+				 EXTSTR, EXTPARA(cur));
+
+			ext = next_extent(ext);
+			continue;
+		}
+
+		/* discontiguous chunks? */
+		if (chunk + 1 < ext_chk_start) {
+			ext = next_extent(ext);
+			continue;
+		}
+
+		/* ok, from now on, ext and cur have these attrs:
+		 * 1. covered by the same lock
+		 * 2. contiguous at chunk level or overlapping. */
+
+		if (overlapped(ext, cur)) {
+			/* cur is the minimum unit, so overlapping means
+			 * full contain. */
+			EASSERTF((ext->oe_start <= cur->oe_start &&
+				  ext->oe_end >= cur->oe_end),
+				 ext, EXTSTR, EXTPARA(cur));
+
+			if (ext->oe_state > OES_CACHE || ext->oe_fsync_wait) {
+				/* for simplicity, we wait for this extent to
+				 * finish before going forward. */
+				conflict = osc_extent_get(ext);
+				break;
+			}
+
+			found = osc_extent_hold(ext);
+			break;
+		}
+
+		/* non-overlapped extent */
+		if (ext->oe_state != OES_CACHE || ext->oe_fsync_wait) {
+			/* we can't do anything for a non OES_CACHE extent, or
+			 * if there is someone waiting for this extent to be
+			 * flushed, try next one. */
+			ext = next_extent(ext);
+			continue;
+		}
+
+		/* check if they belong to the same rpc slot before trying to
+		 * merge. the extents are not overlapped and contiguous at
+		 * chunk level to get here. */
+		if (ext->oe_max_end != max_end) {
+			/* if they don't belong to the same RPC slot or
+			 * max_pages_per_rpc has ever changed, do not merge. */
+			ext = next_extent(ext);
+			continue;
+		}
+
+		/* it's required that an extent must be contiguous at chunk
+		 * level so that we know the whole extent is covered by grant
+		 * (the pages in the extent are NOT required to be contiguous).
+		 * Otherwise, it would be too difficult to know which
+		 * chunks have grants allocated. */
+
+		/* try to do front merge - extend ext's start */
+		if (chunk + 1 == ext_chk_start) {
+			/* ext must be chunk size aligned */
+			EASSERT((ext->oe_start & ~chunk_mask) == 0, ext);
+
+			/* pull ext's start back to cover cur */
+			ext->oe_start   = cur->oe_start;
+			ext->oe_grants += chunksize;
+			*grants -= chunksize;
+
+			found = osc_extent_hold(ext);
+		} else if (chunk == ext_chk_end + 1) {
+			/* rear merge */
+			ext->oe_end     = cur->oe_end;
+			ext->oe_grants += chunksize;
+			*grants -= chunksize;
+
+			/* try to merge with the next one because we just fill
+			 * in a gap */
+			if (osc_extent_merge(env, ext, next_extent(ext)) == 0)
+				/* we can save extent tax from next extent */
+				*grants += cli->cl_extent_tax;
+
+			found = osc_extent_hold(ext);
+		}
+		if (found != NULL)
+			break;
+
+		ext = next_extent(ext);
+	}
+
+	osc_extent_tree_dump(D_CACHE, obj);
+	if (found != NULL) {
+		LASSERT(conflict == NULL);
+		if (!IS_ERR(found)) {
+			LASSERT(found->oe_osclock == cur->oe_osclock);
+			OSC_EXTENT_DUMP(D_CACHE, found,
+					"found caching ext for %lu.\n", index);
+		}
+	} else if (conflict == NULL) {
+		/* create a new extent */
+		EASSERT(osc_extent_is_overlapped(obj, cur) == 0, cur);
+		cur->oe_grants = chunksize + cli->cl_extent_tax;
+		*grants -= cur->oe_grants;
+		LASSERT(*grants >= 0);
+
+		/* state is set directly: cur is not yet visible to anyone */
+		cur->oe_state = OES_CACHE;
+		found = osc_extent_hold(cur);
+		osc_extent_insert(obj, cur);
+		OSC_EXTENT_DUMP(D_CACHE, cur, "add into tree %lu/%lu.\n",
+				index, lock->cll_descr.cld_end);
+	}
+	osc_object_unlock(obj);
+
+	if (conflict != NULL) {
+		LASSERT(found == NULL);
+
+		/* waiting for IO to finish. Please notice that it's impossible
+		 * to be an OES_TRUNC extent. */
+		rc = osc_extent_wait(env, conflict, OES_INV);
+		osc_extent_put(env, conflict);
+		conflict = NULL;
+		if (rc < 0)
+			GOTO(out, found = ERR_PTR(rc));
+
+		goto restart;
+	}
+	EXIT;
+
+out:
+	/* drop the allocation reference; cur survives only if it was
+	 * inserted into the tree above */
+	osc_extent_put(env, cur);
+	LASSERT(*grants >= 0);
+	return found;
+}
+
+/**
+ * Called when IO is finished to an extent.
+ *
+ * Every page of @ext is taken off the RPC and pending lists and completed
+ * through osc_ap_completion() with @sent and @rc. The grant held by the
+ * extent is then released, the extent is removed from the tree and the
+ * reference taken for the RPC is dropped.
+ *
+ * \param sent  non-zero if the RPC was actually sent; when zero the whole
+ *              extent grant is reported as lost
+ * \param rc    result of the IO; stored in oe_rc (or the page count when 0)
+ *
+ * Always returns 0.
+ */
+int osc_extent_finish(const struct lu_env *env, struct osc_extent *ext,
+		      int sent, int rc)
+{
+	struct client_obd *cli = osc_cli(ext->oe_obj);
+	struct osc_async_page *oap;
+	struct osc_async_page *tmp;
+	int nr_pages = ext->oe_nr_pages;
+	int lost_grant = 0;
+	int blocksize = cli->cl_import->imp_obd->obd_osfs.os_bsize ? : 4096;
+	__u64 last_off = 0;
+	int last_count = -1;
+	int last_page_off = 0;
+	ENTRY;
+
+	OSC_EXTENT_DUMP(D_CACHE, ext, "extent finished.\n");
+
+	ext->oe_rc = rc ?: ext->oe_nr_pages;
+	EASSERT(ergo(rc == 0, ext->oe_state == OES_RPC), ext);
+	list_for_each_entry_safe(oap, tmp, &ext->oe_pages,
+				     oap_pending_item) {
+		list_del_init(&oap->oap_rpc_item);
+		list_del_init(&oap->oap_pending_item);
+		/* remember the geometry of the page with the highest object
+		 * offset now: @oap must not be touched once the loop is over,
+		 * and the page may be gone after osc_ap_completion() */
+		if (last_off <= oap->oap_obj_off) {
+			last_off = oap->oap_obj_off;
+			last_count = oap->oap_count;
+			last_page_off = oap->oap_page_off;
+		}
+
+		--ext->oe_nr_pages;
+		osc_ap_completion(env, cli, oap, sent, rc);
+	}
+	EASSERT(ext->oe_nr_pages == 0, ext);
+
+	if (!sent) {
+		lost_grant = ext->oe_grants;
+	} else if (blocksize < PAGE_CACHE_SIZE &&
+		   last_count != PAGE_CACHE_SIZE) {
+		/* For short writes we shouldn't count parts of pages that
+		 * span a whole chunk on the OST side, or our accounting goes
+		 * wrong.  Should match the code in filter_grant_check.
+		 *
+		 * Use the values captured for the last page; the loop cursor
+		 * no longer refers to a valid osc_async_page here. */
+		int offset = last_page_off & ~CFS_PAGE_MASK;
+		int count = last_count + (offset & (blocksize - 1));
+		int end = (offset + last_count) & (blocksize - 1);
+		if (end)
+			count += blocksize - end;
+
+		lost_grant = PAGE_CACHE_SIZE - count;
+	}
+	if (ext->oe_grants > 0)
+		osc_free_grant(cli, nr_pages, lost_grant);
+
+	osc_extent_remove(ext);
+	/* put the refcount for RPC */
+	osc_extent_put(env, ext);
+	RETURN(0);
+}
+
+/* Wait condition for osc_extent_wait(): sample oe_state under the object
+ * lock and return non-zero when it equals @state. */
+static int extent_wait_cb(struct osc_extent *ext, int state)
+{
+	struct osc_object *obj = ext->oe_obj;
+	int reached;
+
+	osc_object_lock(obj);
+	reached = (ext->oe_state == state);
+	osc_object_unlock(obj);
+
+	return reached;
+}
+
+/**
+ * Wait for the extent's state to become @state.
+ *
+ * When waiting for OES_INV on an extent that is neither urgent nor high
+ * priority, the extent is first marked urgent; a cached extent is also
+ * held and released again so that the release path queues it and unplugs
+ * IO. The first wait times out after 600 seconds, logs an error and is
+ * followed by a second wait without a timeout.
+ *
+ * Returns the l_wait_event() result; when that is 0 and the extent recorded
+ * a negative oe_rc, oe_rc is returned instead.
+ */
+static int osc_extent_wait(const struct lu_env *env, struct osc_extent *ext,
+			   int state)
+{
+	struct osc_object *obj = ext->oe_obj;
+	struct l_wait_info lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(600), NULL,
+						  LWI_ON_SIGNAL_NOOP, NULL);
+	int rc = 0;
+	ENTRY;
+
+	osc_object_lock(obj);
+	LASSERT(sanity_check_nolock(ext) == 0);
+	/* `Kick' this extent only if the caller is waiting for it to be
+	 * written out. */
+	if (state == OES_INV && !ext->oe_urgent && !ext->oe_hp) {
+		if (ext->oe_state == OES_ACTIVE) {
+			ext->oe_urgent = 1;
+		} else if (ext->oe_state == OES_CACHE) {
+			ext->oe_urgent = 1;
+			osc_extent_hold(ext);
+			/* rc == 1 only flags that a release is owed below */
+			rc = 1;
+		}
+	}
+	osc_object_unlock(obj);
+	if (rc == 1)
+		osc_extent_release(env, ext);
+
+	/* wait for the extent until its state becomes @state */
+	rc = l_wait_event(ext->oe_waitq, extent_wait_cb(ext, state), &lwi);
+	if (rc == -ETIMEDOUT) {
+		OSC_EXTENT_DUMP(D_ERROR, ext,
+			"%s: wait ext to %d timedout, recovery in progress?\n",
+			osc_export(obj)->exp_obd->obd_name, state);
+
+		lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL);
+		rc = l_wait_event(ext->oe_waitq, extent_wait_cb(ext, state),
+				  &lwi);
+	}
+	if (rc == 0 && ext->oe_rc < 0)
+		rc = ext->oe_rc;
+	RETURN(rc);
+}
+
+/**
+ * Discard the pages of @ext that lie beyond the truncation point and give
+ * back the grant that is no longer needed.
+ *
+ * Pages with index below @trunc_index are kept. The page at @trunc_index
+ * itself is kept only when @partial is true; every other page is discarded.
+ * If no page remains, the whole extent grant is released; otherwise the
+ * extent is shrunk to end at the last chunk that still holds pages and the
+ * grant of the chunks cut off is released.
+ *
+ * @ext must be in OES_TRUNC and must not be urgent.
+ *
+ * Returns 0 on success or the negative error from cl_io_init().
+ */
+static int osc_extent_truncate(struct osc_extent *ext, pgoff_t trunc_index,
+				bool partial)
+{
+	struct cl_env_nest     nest;
+	struct lu_env	 *env;
+	struct cl_io	  *io;
+	struct osc_object     *obj = ext->oe_obj;
+	struct client_obd     *cli = osc_cli(obj);
+	struct osc_async_page *oap;
+	struct osc_async_page *tmp;
+	int		    pages_in_chunk = 0;
+	int		    ppc_bits    = cli->cl_chunkbits - PAGE_CACHE_SHIFT;
+	__u64		  trunc_chunk = trunc_index >> ppc_bits;
+	int		    grants   = 0;
+	int		    nr_pages = 0;
+	int		    rc       = 0;
+	ENTRY;
+
+	LASSERT(sanity_check(ext) == 0);
+	LASSERT(ext->oe_state == OES_TRUNC);
+	LASSERT(!ext->oe_urgent);
+
+	/* Request new lu_env.
+	 * We can't use that env from osc_cache_truncate_start() because
+	 * it's from lov_io_sub and not fully initialized. */
+	env = cl_env_nested_get(&nest);
+	io  = &osc_env_info(env)->oti_io;
+	io->ci_obj = cl_object_top(osc2cl(obj));
+	rc = cl_io_init(env, io, CIT_MISC, io->ci_obj);
+	if (rc < 0)
+		GOTO(out, rc);
+
+	/* discard all pages with index greater than trunc_index */
+	list_for_each_entry_safe(oap, tmp, &ext->oe_pages,
+				     oap_pending_item) {
+		struct cl_page  *sub  = oap2cl_page(oap);
+		struct cl_page  *page = cl_page_top(sub);
+
+		LASSERT(list_empty(&oap->oap_rpc_item));
+
+		/* only discard the pages with their index greater than
+		 * trunc_index, and ... */
+		if (sub->cp_index < trunc_index ||
+		    (sub->cp_index == trunc_index && partial)) {
+			/* accounting how many pages remaining in the chunk
+			 * so that we can calculate grants correctly. */
+			if (sub->cp_index >> ppc_bits == trunc_chunk)
+				++pages_in_chunk;
+			continue;
+		}
+
+		list_del_init(&oap->oap_pending_item);
+
+		cl_page_get(page);
+		lu_ref_add(&page->cp_reference, "truncate", current);
+
+		if (cl_page_own(env, io, page) == 0) {
+			cl_page_unmap(env, io, page);
+			cl_page_discard(env, io, page);
+			cl_page_disown(env, io, page);
+		} else {
+			/* failing to own the page is treated as fatal */
+			LASSERT(page->cp_state == CPS_FREEING);
+			LASSERT(0);
+		}
+
+		lu_ref_del(&page->cp_reference, "truncate", current);
+		cl_page_put(env, page);
+
+		--ext->oe_nr_pages;
+		++nr_pages;
+	}
+	EASSERTF(ergo(ext->oe_start >= trunc_index + !!partial,
+		      ext->oe_nr_pages == 0),
+		ext, "trunc_index %lu, partial %d\n", trunc_index, partial);
+
+	osc_object_lock(obj);
+	if (ext->oe_nr_pages == 0) {
+		LASSERT(pages_in_chunk == 0);
+		grants = ext->oe_grants;
+		ext->oe_grants = 0;
+	} else { /* calculate how many grants we can free */
+		int     chunks = (ext->oe_end >> ppc_bits) - trunc_chunk;
+		pgoff_t last_index;
+
+
+		/* if there is no pages in this chunk, we can also free grants
+		 * for the last chunk */
+		if (pages_in_chunk == 0) {
+			/* if this is the 1st chunk and no pages in this chunk,
+			 * ext->oe_nr_pages must be zero, so we should be in
+			 * the other if-clause. */
+			LASSERT(trunc_chunk > 0);
+			--trunc_chunk;
+			++chunks;
+		}
+
+		/* this is what we can free from this extent */
+		grants	  = chunks << cli->cl_chunkbits;
+		ext->oe_grants -= grants;
+		last_index      = ((trunc_chunk + 1) << ppc_bits) - 1;
+		ext->oe_end     = min(last_index, ext->oe_max_end);
+		LASSERT(ext->oe_end >= ext->oe_start);
+		LASSERT(ext->oe_grants > 0);
+	}
+	osc_object_unlock(obj);
+
+	if (grants > 0 || nr_pages > 0)
+		osc_free_grant(cli, nr_pages, grants);
+
+out:
+	/* NOTE(review): cl_io_fini() also runs when cl_io_init() failed --
+	 * confirm that is permitted. */
+	cl_io_fini(env, io);
+	cl_env_nested_put(&nest, env);
+	RETURN(rc);
+}
+
+/**
+ * This function is used to make the extent prepared for transfer.
+ * A race with flushing page - ll_writepage() has to be handled cautiously.
+ *
+ * Every page not yet ASYNC_READY is made ready; the byte count of the page
+ * with the highest object offset is refreshed through osc_refresh_count()
+ * while all other pages are counted up to the end of the page. Finally the
+ * extent is moved to OES_RPC and a reference is taken on behalf of the RPC.
+ *
+ * @ext must be in OES_LOCKING with an owner set, and must contain at least
+ * one page. Always returns 0.
+ */
+static int osc_extent_make_ready(const struct lu_env *env,
+				 struct osc_extent *ext)
+{
+	struct osc_async_page *oap;
+	struct osc_async_page *last = NULL;
+	struct osc_object *obj = ext->oe_obj;
+	int page_count = 0;
+	int rc;
+	ENTRY;
+
+	/* we're going to grab page lock, so object lock must not be taken. */
+	LASSERT(sanity_check(ext) == 0);
+	/* in locking state, any process should not touch this extent. */
+	EASSERT(ext->oe_state == OES_LOCKING, ext);
+	EASSERT(ext->oe_owner != NULL, ext);
+
+	OSC_EXTENT_DUMP(D_CACHE, ext, "make ready\n");
+
+	list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) {
+		++page_count;
+		/* track the page with the highest object offset */
+		if (last == NULL || last->oap_obj_off < oap->oap_obj_off)
+			last = oap;
+
+		/* checking ASYNC_READY is race safe */
+		if ((oap->oap_async_flags & ASYNC_READY) != 0)
+			continue;
+
+		rc = osc_make_ready(env, oap, OBD_BRW_WRITE);
+		switch (rc) {
+		case 0:
+			spin_lock(&oap->oap_lock);
+			oap->oap_async_flags |= ASYNC_READY;
+			spin_unlock(&oap->oap_lock);
+			break;
+		case -EALREADY:
+			LASSERT((oap->oap_async_flags & ASYNC_READY) != 0);
+			break;
+		default:
+			LASSERTF(0, "unknown return code: %d\n", rc);
+		}
+	}
+
+	LASSERT(page_count == ext->oe_nr_pages);
+	LASSERT(last != NULL);
+	/* the last page is the only one we need to refresh its count by
+	 * the size of file. */
+	if (!(last->oap_async_flags & ASYNC_COUNT_STABLE)) {
+		last->oap_count = osc_refresh_count(env, last, OBD_BRW_WRITE);
+		LASSERT(last->oap_count > 0);
+		LASSERT(last->oap_page_off + last->oap_count <= PAGE_CACHE_SIZE);
+		last->oap_async_flags |= ASYNC_COUNT_STABLE;
+	}
+
+	/* for the rest of pages, we don't need to call osc_refresh_count()
+	 * because it's known they are not the last page */
+	list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) {
+		if (!(oap->oap_async_flags & ASYNC_COUNT_STABLE)) {
+			oap->oap_count = PAGE_CACHE_SIZE - oap->oap_page_off;
+			oap->oap_async_flags |= ASYNC_COUNT_STABLE;
+		}
+	}
+
+	osc_object_lock(obj);
+	osc_extent_state_set(ext, OES_RPC);
+	osc_object_unlock(obj);
+	/* get a refcount for RPC. */
+	osc_extent_get(ext);
+
+	RETURN(0);
+}
+
+/**
+ * Quick and simple version of osc_extent_find(). This function is frequently
+ * called to expand the extent for the same IO. To expand the extent, the
+ * page index must be in the same or next chunk of ext->oe_end.
+ *
+ * @index must lie within [oe_start, oe_max_end]. When the extent is grown,
+ * one chunk of grant is moved from *@grants to the extent.
+ *
+ * Returns 0 when @index is covered (already, or after growing by one
+ * chunk), -ERANGE when @index is more than one chunk past the current end,
+ * and -EAGAIN when growing would run into the next extent in the tree.
+ */
+static int osc_extent_expand(struct osc_extent *ext, pgoff_t index, int *grants)
+{
+	struct osc_object *obj = ext->oe_obj;
+	struct client_obd *cli = osc_cli(obj);
+	struct osc_extent *next;
+	int ppc_bits = cli->cl_chunkbits - PAGE_CACHE_SHIFT;
+	pgoff_t chunk = index >> ppc_bits;
+	pgoff_t end_chunk;
+	pgoff_t end_index;
+	int chunksize = 1 << cli->cl_chunkbits;
+	int rc = 0;
+	ENTRY;
+
+	LASSERT(ext->oe_max_end >= index && ext->oe_start <= index);
+	osc_object_lock(obj);
+	LASSERT(sanity_check_nolock(ext) == 0);
+	end_chunk = ext->oe_end >> ppc_bits;
+	if (chunk > end_chunk + 1)
+		GOTO(out, rc = -ERANGE);
+
+	/* @index already falls inside the chunks owned by the extent */
+	if (end_chunk >= chunk)
+		GOTO(out, rc = 0);
+
+	LASSERT(end_chunk + 1 == chunk);
+	/* try to expand this extent to cover @index */
+	end_index = min(ext->oe_max_end, ((chunk + 1) << ppc_bits) - 1);
+
+	next = next_extent(ext);
+	if (next != NULL && next->oe_start <= end_index)
+		/* complex mode - overlapped with the next extent,
+		 * this case will be handled by osc_extent_find() */
+		GOTO(out, rc = -EAGAIN);
+
+	ext->oe_end = end_index;
+	ext->oe_grants += chunksize;
+	*grants -= chunksize;
+	LASSERT(*grants >= 0);
+	EASSERTF(osc_extent_is_overlapped(obj, ext) == 0, ext,
+		 "overlapped after expanding for %lu.\n", index);
+	EXIT;
+
+out:
+	osc_object_unlock(obj);
+	RETURN(rc);
+}
+/*
+ * Debug helper: log every extent of @obj with debug mask @level. The extents
+ * reachable through first_extent()/next_extent() are dumped first, followed
+ * by the oo_hp_exts, oo_urgent_exts and oo_reading_exts lists, each numbered
+ * from 1. @func and @line identify the call site in the header message.
+ *
+ * The object lock calls are commented out below, so this function performs
+ * no locking of its own.
+ */
+static void osc_extent_tree_dump0(int level, struct osc_object *obj,
+				  const char *func, int line)
+{
+	struct osc_extent *ext;
+	int cnt;
+
+	CDEBUG(level, "Dump object %p extents at %s:%d, mppr: %u.\n",
+	       obj, func, line, osc_cli(obj)->cl_max_pages_per_rpc);
+
+	/* osc_object_lock(obj); */
+	cnt = 1;
+	for (ext = first_extent(obj); ext != NULL; ext = next_extent(ext))
+		OSC_EXTENT_DUMP(level, ext, "in tree %d.\n", cnt++);
+
+	cnt = 1;
+	list_for_each_entry(ext, &obj->oo_hp_exts, oe_link)
+		OSC_EXTENT_DUMP(level, ext, "hp %d.\n", cnt++);
+
+	cnt = 1;
+	list_for_each_entry(ext, &obj->oo_urgent_exts, oe_link)
+		OSC_EXTENT_DUMP(level, ext, "urgent %d.\n", cnt++);
+
+	cnt = 1;
+	list_for_each_entry(ext, &obj->oo_reading_exts, oe_link)
+		OSC_EXTENT_DUMP(level, ext, "reading %d.\n", cnt++);
+	/* osc_object_unlock(obj); */
+}
+
+/* ------------------ osc extent end ------------------ */
+
+/* Return 1 when @osc is linked through oo_ready_item or oo_hp_ready_item,
+ * 0 when it is linked through neither. */
+static inline int osc_is_ready(struct osc_object *osc)
+{
+	if (!list_empty(&osc->oo_ready_item))
+		return 1;
+
+	return !list_empty(&osc->oo_hp_ready_item);
+}
+/*
+ * Emit one D_CACHE debug line describing the IO state of object OSC: the
+ * osc_is_ready() result, list_empty_marker() for its two ready items and for
+ * the hp/urgent/reading extent lists, and the oo_nr_writes / oo_nr_reads
+ * counters. STR (a string literal, pasted onto the format) and any extra
+ * args are appended to the message.
+ */
+#define OSC_IO_DEBUG(OSC, STR, args...)					       \
+	CDEBUG(D_CACHE, "obj %p ready %d|%c|%c wr %d|%c|%c rd %d|%c " STR,     \
+	       (OSC), osc_is_ready(OSC),				       \
+	       list_empty_marker(&(OSC)->oo_hp_ready_item),		       \
+	       list_empty_marker(&(OSC)->oo_ready_item),		       \
+	       atomic_read(&(OSC)->oo_nr_writes),			       \
+	       list_empty_marker(&(OSC)->oo_hp_exts),			       \
+	       list_empty_marker(&(OSC)->oo_urgent_exts),		       \
+	       atomic_read(&(OSC)->oo_nr_reads),			       \
+	       list_empty_marker(&(OSC)->oo_reading_exts),		       \
+	       ##args)
+/*
+ * Prepare the page behind @oap for write-out by calling
+ * cl_page_make_ready() on its top-level cl_page. On success the osc_page's
+ * submit time is stamped with the current time.
+ *
+ * @cmd must be OBD_BRW_WRITE (asserted). Returns whatever
+ * cl_page_make_ready() returned.
+ */
+static int osc_make_ready(const struct lu_env *env, struct osc_async_page *oap,
+			  int cmd)
+{
+	struct osc_page *opg  = oap2osc_page(oap);
+	struct cl_page  *page = cl_page_top(oap2cl_page(oap));
+	int result;
+
+	LASSERT(cmd == OBD_BRW_WRITE); /* no cached reads */
+
+	ENTRY;
+	result = cl_page_make_ready(env, page, CRT_WRITE);
+	if (result == 0)
+		opg->ops_submit_time = cfs_time_current();
+	RETURN(result);
+}
+/*
+ * Work out how many bytes of the page behind @oap are to be written, based
+ * on the object's cat_kms attribute, which is fetched under the attr lock.
+ *
+ * Returns a negative value if cl_object_attr_get() fails, 0 if the page
+ * starts at or beyond kms, kms % PAGE_CACHE_SIZE if kms falls inside the
+ * page, and PAGE_CACHE_SIZE otherwise. @cmd must not have OBD_BRW_READ set
+ * (asserted).
+ */
+static int osc_refresh_count(const struct lu_env *env,
+			     struct osc_async_page *oap, int cmd)
+{
+	struct osc_page  *opg = oap2osc_page(oap);
+	struct cl_page   *page = oap2cl_page(oap);
+	struct cl_object *obj;
+	struct cl_attr   *attr = &osc_env_info(env)->oti_attr;
+
+	int result;
+	loff_t kms;
+
+	/* readpage queues with _COUNT_STABLE, shouldn't get here. */
+	LASSERT(!(cmd & OBD_BRW_READ));
+	LASSERT(opg != NULL);
+	obj = opg->ops_cl.cpl_obj;
+
+	cl_object_attr_lock(obj);
+	result = cl_object_attr_get(env, obj, attr);
+	cl_object_attr_unlock(obj);
+	if (result < 0)
+		return result;
+	kms = attr->cat_kms;
+	if (cl_offset(obj, page->cp_index) >= kms)
+		/* catch race with truncate */
+		return 0;
+	else if (cl_offset(obj, page->cp_index + 1) > kms)
+		/* catch sub-page write at end of file */
+		return kms % PAGE_CACHE_SIZE;
+	else
+		return PAGE_CACHE_SIZE;
+}
+/*
+ * Finish the transfer of the page behind @oap with result @rc.
+ *
+ * In order: OBD_BRW_NOQUOTA is masked out of @cmd and the page state is
+ * checked against it; the page is detached from its cl_req (if it has one);
+ * the transfer pin is cleared; the page is unlinked from the in-flight list
+ * under oo_seatbelt; lockless read/write byte statistics are updated when
+ * @rc is 0 and OBD_BRW_SRVLOCK is set; finally the "transfer" reference is
+ * dropped and cl_page_completion() is called. Always returns 0.
+ */
+static int osc_completion(const struct lu_env *env, struct osc_async_page *oap,
+			  int cmd, int rc)
+{
+	struct osc_page   *opg  = oap2osc_page(oap);
+	struct cl_page    *page = cl_page_top(oap2cl_page(oap));
+	struct osc_object *obj  = cl2osc(opg->ops_cl.cpl_obj);
+	enum cl_req_type   crt;
+	int srvlock;
+
+	ENTRY;
+
+	cmd &= ~OBD_BRW_NOQUOTA;
+	LASSERT(equi(page->cp_state == CPS_PAGEIN,  cmd == OBD_BRW_READ));
+	LASSERT(equi(page->cp_state == CPS_PAGEOUT, cmd == OBD_BRW_WRITE));
+	LASSERT(opg->ops_transfer_pinned);
+
+	/*
+	 * page->cp_req can be NULL if io submission failed before
+	 * cl_req was allocated.
+	 */
+	if (page->cp_req != NULL)
+		cl_req_page_done(env, page);
+	LASSERT(page->cp_req == NULL);
+
+	crt = cmd == OBD_BRW_READ ? CRT_READ : CRT_WRITE;
+	/* Clear opg->ops_transfer_pinned before VM lock is released. */
+	opg->ops_transfer_pinned = 0;
+
+	spin_lock(&obj->oo_seatbelt);
+	LASSERT(opg->ops_submitter != NULL);
+	LASSERT(!list_empty(&opg->ops_inflight));
+	list_del_init(&opg->ops_inflight);
+	opg->ops_submitter = NULL;
+	spin_unlock(&obj->oo_seatbelt);
+
+	opg->ops_submit_time = 0;
+	srvlock = oap->oap_brw_flags & OBD_BRW_SRVLOCK;
+
+	/* statistic */
+	if (rc == 0 && srvlock) {
+		struct lu_device *ld    = opg->ops_cl.cpl_obj->co_lu.lo_dev;
+		struct osc_stats *stats = &lu2osc_dev(ld)->od_stats;
+		int bytes = oap->oap_count;
+
+		if (crt == CRT_READ)
+			stats->os_lockless_reads += bytes;
+		else
+			stats->os_lockless_writes += bytes;
+	}
+
+	/*
+	 * This has to be the last operation with the page, as locks are
+	 * released in cl_page_completion() and nothing except for the
+	 * reference counter protects page from concurrent reclaim.
+	 */
+	lu_ref_del(&page->cp_reference, "transfer", page);
+
+	cl_page_completion(env, page, crt, rc);
+
+	RETURN(0);
+}
+/*
+ * Emit one D_CACHE debug line with the grant/dirty accounting of @cli: the
+ * obd name, cl_dirty/cl_dirty_max, the global dirty page count and its
+ * limit, cl_lost_grant, cl_avail_grant, cl_reserved_grant and
+ * cl_w_in_flight. @fmt (a string literal, pasted onto the format) and any
+ * extra args are appended. @cli is evaluated once.
+ */
+#define OSC_DUMP_GRANT(cli, fmt, args...) do {				      \
+	struct client_obd *__tmp = (cli);				      \
+	CDEBUG(D_CACHE, "%s: { dirty: %ld/%ld dirty_pages: %d/%d "	      \
+	       "dropped: %ld avail: %ld, reserved: %ld, flight: %d } " fmt,   \
+	       __tmp->cl_import->imp_obd->obd_name,			      \
+	       __tmp->cl_dirty, __tmp->cl_dirty_max,			      \
+	       atomic_read(&obd_dirty_pages), obd_max_dirty_pages,	      \
+	       __tmp->cl_lost_grant, __tmp->cl_avail_grant,		      \
+	       __tmp->cl_reserved_grant, __tmp->cl_w_in_flight, ##args);      \
+} while (0)
+
+/* caller must hold loi_list_lock (asserted below).
+ *
+ * Account one page of dirty data against @cli: increments the global
+ * obd_dirty_pages counter, adds PAGE_CACHE_SIZE to cli->cl_dirty and marks
+ * @pga with OBD_BRW_FROM_GRANT, which must not be set yet. Finishes by
+ * calling osc_update_next_shrink(). */
+static void osc_consume_write_grant(struct client_obd *cli,
+				    struct brw_page *pga)
+{
+	LASSERT(spin_is_locked(&cli->cl_loi_list_lock.lock));
+	LASSERT(!(pga->flag & OBD_BRW_FROM_GRANT));
+	atomic_inc(&obd_dirty_pages);
+	cli->cl_dirty += PAGE_CACHE_SIZE;
+	pga->flag |= OBD_BRW_FROM_GRANT;
+	CDEBUG(D_CACHE, "using %lu grant credits for brw %p page %p\n",
+	       PAGE_CACHE_SIZE, pga, pga->pg);
+	osc_update_next_shrink(cli);
+}
+
+/* Undo osc_consume_write_grant() for @pga once its brw has completed.
+ * The loi lock must be held by the caller (asserted).
+ *
+ * Nothing is done unless @pga carries OBD_BRW_FROM_GRANT. Otherwise that
+ * flag is cleared and one page is removed from the global and per-client
+ * dirty accounting; if OBD_BRW_NOCACHE is set too, it is cleared and the
+ * transit accounting is reduced by one page as well. */
+static void osc_release_write_grant(struct client_obd *cli,
+				    struct brw_page *pga)
+{
+	ENTRY;
+
+	LASSERT(spin_is_locked(&cli->cl_loi_list_lock.lock));
+	if ((pga->flag & OBD_BRW_FROM_GRANT) != 0) {
+		pga->flag &= ~OBD_BRW_FROM_GRANT;
+		atomic_dec(&obd_dirty_pages);
+		cli->cl_dirty -= PAGE_CACHE_SIZE;
+
+		if ((pga->flag & OBD_BRW_NOCACHE) != 0) {
+			pga->flag &= ~OBD_BRW_NOCACHE;
+			atomic_dec(&obd_dirty_transit_pages);
+			cli->cl_dirty_transit -= PAGE_CACHE_SIZE;
+		}
+	}
+	EXIT;
+}
+
+/**
+ * Move @bytes of grant from cl_avail_grant to cl_reserved_grant.
+ *
+ * Reserving up front means enough grant is already set aside before a
+ * critical section is entered, so there is no need to sleep with the object
+ * lock held.
+ *
+ * client_obd_list_lock held by caller
+ *
+ * \retval 0        the grant was reserved
+ * \retval -EDQUOT  less than @bytes of grant is available; nothing changed
+ */
+static int osc_reserve_grant(struct client_obd *cli, unsigned int bytes)
+{
+	if (cli->cl_avail_grant < bytes)
+		return -EDQUOT;
+
+	cli->cl_avail_grant    -= bytes;
+	cli->cl_reserved_grant += bytes;
+	return 0;
+}
+
+/*
+ * Drop a reservation of @reserved bytes and hand back the @unused part.
+ *
+ * At most @reserved bytes return to cl_avail_grant; anything beyond that is
+ * booked as cl_lost_grant. No locking is done here.
+ */
+static void __osc_unreserve_grant(struct client_obd *cli,
+				  unsigned int reserved, unsigned int unused)
+{
+	unsigned int to_avail = unused;
+	unsigned int to_lost = 0;
+
+	/* Getting back more than was reserved is normal: when adding a chunk
+	 * merges two extents one extent tax is saved, and if the tax exceeds
+	 * a chunk even more grant is saved. */
+	if (unused > reserved) {
+		to_avail = reserved;
+		to_lost = unused - reserved;
+	}
+
+	cli->cl_reserved_grant -= reserved;
+	cli->cl_avail_grant += to_avail;
+	if (to_lost != 0)
+		cli->cl_lost_grant += to_lost;
+}
+/*
+ * Locked wrapper around __osc_unreserve_grant(): takes the loi list lock,
+ * releases the reservation and, if any grant came back unused, wakes the
+ * cache waiters before dropping the lock again.
+ */
+void osc_unreserve_grant(struct client_obd *cli,
+			 unsigned int reserved, unsigned int unused)
+{
+	client_obd_list_lock(&cli->cl_loi_list_lock);
+	__osc_unreserve_grant(cli, reserved, unused);
+	if (unused > 0)
+		osc_wake_cache_waiters(cli);
+	client_obd_list_unlock(&cli->cl_loi_list_lock);
+}
+
+/**
+ * Free grant after IO is finished or canceled.
+ *
+ * @lost_grant is used to remember how many grants we have allocated but not
+ * used, we should return these grants to OST. There're two cases where grants
+ * can be lost:
+ * 1. truncate;
+ * 2. blocksize at OST is less than PAGE_CACHE_SIZE and a partial page was
+ *    written. In this case OST may use less chunks to serve this partial
+ *    write. OSTs don't actually know the page size on the client side. so
+ *    clients have to calculate lost grant by the blocksize on the OST.
+ *    See filter_grant_check() for details.
+ *
+ * Under the loi list lock (taken here) this also removes @nr_pages from the
+ * global and per-client dirty accounting, may move one chunk plus extent tax
+ * from cl_lost_grant back to cl_avail_grant when available grant is below
+ * that amount, and wakes the cache waiters. The closing debug message reads
+ * the counters after the lock has been dropped.
+ */
+static void osc_free_grant(struct client_obd *cli, unsigned int nr_pages,
+			   unsigned int lost_grant)
+{
+	int grant = (1 << cli->cl_chunkbits) + cli->cl_extent_tax;
+
+	client_obd_list_lock(&cli->cl_loi_list_lock);
+	atomic_sub(nr_pages, &obd_dirty_pages);
+	cli->cl_dirty -= nr_pages << PAGE_CACHE_SHIFT;
+	cli->cl_lost_grant += lost_grant;
+	if (cli->cl_avail_grant < grant && cli->cl_lost_grant >= grant) {
+		/* borrow some grant from truncate to avoid the case that
+		 * truncate uses up all avail grant */
+		cli->cl_lost_grant -= grant;
+		cli->cl_avail_grant += grant;
+	}
+	osc_wake_cache_waiters(cli);
+	client_obd_list_unlock(&cli->cl_loi_list_lock);
+	CDEBUG(D_CACHE, "lost %u grant: %lu avail: %lu dirty: %lu\n",
+	       lost_grant, cli->cl_lost_grant,
+	       cli->cl_avail_grant, cli->cl_dirty);
+}
+
+/**
+ * The companion to osc_enter_cache(), called when @oap is no longer part of
+ * the dirty accounting due to error.
+ *
+ * Takes the loi list lock around osc_release_write_grant() on the page's
+ * brw_page, so the caller must not already hold it.
+ */
+static void osc_exit_cache(struct client_obd *cli, struct osc_async_page *oap)
+{
+	client_obd_list_lock(&cli->cl_loi_list_lock);
+	osc_release_write_grant(cli, &oap->oap_brw_page);
+	client_obd_list_unlock(&cli->cl_loi_list_lock);
+}
+
+/**
+ * Non-blocking counterpart of osc_enter_cache(): take @bytes of grant and
+ * account one dirty page for @oap, but only if both the grant and the dirty
+ * page room are available right now.
+ *
+ * When @transient is non-zero the page is additionally counted as dirty
+ * transit and flagged OBD_BRW_NOCACHE.
+ *
+ * \retval 1 grant consumed and page accounted
+ * \retval 0 not enough grant, or a dirty limit would be exceeded (any
+ *           reservation made here has been returned)
+ */
+static int osc_enter_cache_try(struct client_obd *cli,
+			       struct osc_async_page *oap,
+			       int bytes, int transient)
+{
+	OSC_DUMP_GRANT(cli, "need:%d.\n", bytes);
+
+	if (osc_reserve_grant(cli, bytes) < 0)
+		return 0;
+
+	/* over the per-client or the global dirty limit: give the grant back */
+	if (cli->cl_dirty + PAGE_CACHE_SIZE > cli->cl_dirty_max ||
+	    atomic_read(&obd_dirty_pages) + 1 > obd_max_dirty_pages) {
+		__osc_unreserve_grant(cli, bytes, bytes);
+		return 0;
+	}
+
+	osc_consume_write_grant(cli, &oap->oap_brw_page);
+	if (transient) {
+		cli->cl_dirty_transit += PAGE_CACHE_SIZE;
+		atomic_inc(&obd_dirty_transit_pages);
+		oap->oap_brw_flags |= OBD_BRW_NOCACHE;
+	}
+	return 1;
+}
+/*
+ * Wait condition used by osc_enter_cache(): returns non-zero once @ocw is no
+ * longer linked on a waiter list (its ocw_entry is empty). The check is made
+ * under the loi list lock.
+ */
+static int ocw_granted(struct client_obd *cli, struct osc_cache_waiter *ocw)
+{
+	int rc;
+	client_obd_list_lock(&cli->cl_loi_list_lock);
+	rc = list_empty(&ocw->ocw_entry);
+	client_obd_list_unlock(&cli->cl_loi_list_lock);
+	return rc;
+}
+
+/**
+ * The main entry to reserve dirty page accounting. Usually the grant reserved
+ * in this function will be freed in bulk in osc_free_grant() unless it fails
+ * to add osc cache, in that case, it will be freed in osc_exit_cache().
+ *
+ * The process will be put into sleep if it's already run out of grant.
+ *
+ * The loi list lock is taken here and dropped while sleeping; it is not held
+ * on return.
+ *
+ * \param env   execution environment, passed on to osc_io_unplug_async()
+ * \param cli   client whose grant and dirty accounting is used
+ * \param oap   page to account
+ * \param bytes amount of grant needed
+ *
+ * \retval 0        grant consumed and page accounted
+ * \retval -EDQUOT  the caller should fall back to sync io: the fail-check
+ *                  fired, cl_dirty_max is below one page, a force_sync flag
+ *                  is set, or nothing is dirty or in flight any more and
+ *                  grant is still unavailable
+ * \retval other    negative value from l_wait_event() when the wait was
+ *                  interrupted, or the ocw_rc set by the waker
+ */
+static int osc_enter_cache(const struct lu_env *env, struct client_obd *cli,
+			   struct osc_async_page *oap, int bytes)
+{
+	struct osc_object *osc = oap->oap_obj;
+	struct lov_oinfo  *loi = osc->oo_oinfo;
+	struct osc_cache_waiter ocw;
+	struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL);
+	int rc = -EDQUOT;
+	ENTRY;
+
+	OSC_DUMP_GRANT(cli, "need:%d.\n", bytes);
+
+	client_obd_list_lock(&cli->cl_loi_list_lock);
+
+	/* force the caller to try sync io.  this can jump the list
+	 * of queued writes and create a discontiguous rpc stream */
+	if (OBD_FAIL_CHECK(OBD_FAIL_OSC_NO_GRANT) ||
+	    cli->cl_dirty_max < PAGE_CACHE_SIZE     ||
+	    cli->cl_ar.ar_force_sync || loi->loi_ar.ar_force_sync)
+		GOTO(out, rc = -EDQUOT);
+
+	/* Hopefully normal case - cache space and write credits available */
+	if (osc_enter_cache_try(cli, oap, bytes, 0))
+		GOTO(out, rc = 0);
+
+	/* We can get here for two reasons: too many dirty pages in cache, or
+	 * run out of grants. In both cases we should write dirty pages out.
+	 * Adding a cache waiter will trigger urgent write-out no matter what
+	 * RPC size will be.
+	 * The exiting condition is no avail grants and no dirty pages caching,
+	 * that really means there is no space on the OST. */
+	init_waitqueue_head(&ocw.ocw_waitq);
+	ocw.ocw_oap   = oap;
+	ocw.ocw_grant = bytes;
+	while (cli->cl_dirty > 0 || cli->cl_w_in_flight > 0) {
+		list_add_tail(&ocw.ocw_entry, &cli->cl_cache_waiters);
+		ocw.ocw_rc = 0;
+		client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+		osc_io_unplug_async(env, cli, NULL);
+
+		CDEBUG(D_CACHE, "%s: sleeping for cache space @ %p for %p\n",
+		       cli->cl_import->imp_obd->obd_name, &ocw, oap);
+
+		rc = l_wait_event(ocw.ocw_waitq, ocw_granted(cli, &ocw), &lwi);
+
+		client_obd_list_lock(&cli->cl_loi_list_lock);
+
+		/* l_wait_event is interrupted by signal */
+		if (rc < 0) {
+			list_del_init(&ocw.ocw_entry);
+			GOTO(out, rc);
+		}
+
+		LASSERT(list_empty(&ocw.ocw_entry));
+		rc = ocw.ocw_rc;
+
+		if (rc != -EDQUOT)
+			GOTO(out, rc);
+		if (osc_enter_cache_try(cli, oap, bytes, 0))
+			GOTO(out, rc = 0);
+	}
+	EXIT;
+out:
+	client_obd_list_unlock(&cli->cl_loi_list_lock);
+	OSC_DUMP_GRANT(cli, "returned %d.\n", rc);
+	RETURN(rc);
+}
+
+/* caller must hold loi_list_lock
+ *
+ * Unlink and wake every waiter queued on cli->cl_cache_waiters. For each
+ * waiter ocw_rc is set to 0 when osc_enter_cache_try() managed to consume
+ * grant on its behalf, and to -EDQUOT when there is no dirty page room or
+ * the attempt failed. Waiters are woken either way. */
+void osc_wake_cache_waiters(struct client_obd *cli)
+{
+	struct list_head *l, *tmp;
+	struct osc_cache_waiter *ocw;
+
+	ENTRY;
+	list_for_each_safe(l, tmp, &cli->cl_cache_waiters) {
+		ocw = list_entry(l, struct osc_cache_waiter, ocw_entry);
+		list_del_init(&ocw->ocw_entry);
+
+		ocw->ocw_rc = -EDQUOT;
+		/* we can't dirty more */
+		if ((cli->cl_dirty + PAGE_CACHE_SIZE > cli->cl_dirty_max) ||
+		    (atomic_read(&obd_dirty_pages) + 1 >
+		     obd_max_dirty_pages)) {
+			CDEBUG(D_CACHE, "no dirty room: dirty: %ld "
+			       "osc max %ld, sys max %d\n", cli->cl_dirty,
+			       cli->cl_dirty_max, obd_max_dirty_pages);
+			goto wakeup;
+		}
+
+		ocw->ocw_rc = 0;
+		if (!osc_enter_cache_try(cli, ocw->ocw_oap, ocw->ocw_grant, 0))
+			ocw->ocw_rc = -EDQUOT;
+
+wakeup:
+		CDEBUG(D_CACHE, "wake up %p for oap %p, avail grant %ld, %d\n",
+		       ocw, ocw->ocw_oap, cli->cl_avail_grant, ocw->ocw_rc);
+
+		wake_up(&ocw->ocw_waitq);
+	}
+
+	EXIT;
+}
+/*
+ * Return non-zero when the number of RPCs in flight for @cli has reached
+ * cl_max_rpcs_in_flight plus an allowance of one.
+ *
+ * NOTE(review): as written the allowance is granted when oo_hp_exts is
+ * EMPTY (!!list_empty()), i.e. objects WITHOUT high-priority extents get
+ * the extra slot. If the intent is to let high-priority RPCs exceed the
+ * limit by one, the test looks inverted -- confirm before changing.
+ */
+static int osc_max_rpc_in_flight(struct client_obd *cli, struct osc_object *osc)
+{
+	int hprpc = !!list_empty(&osc->oo_hp_exts);
+	return rpcs_in_flight(cli) >= cli->cl_max_rpcs_in_flight + hprpc;
+}
+
+/* Decide whether @osc has pending work of type @cmd (write when
+ * OBD_BRW_WRITE is set, read otherwise) that should be sent as an RPC now.
+ * This is used by osc_check_rpcs() and __osc_list_maint() to quickly find
+ * objects that are ready to send an RPC. Returns 1 or 0. */
+static int osc_makes_rpc(struct client_obd *cli, struct osc_object *osc,
+			 int cmd)
+{
+	int invalid_import = 0;
+	ENTRY;
+
+	/* if we have an invalid import we want to drain the queued pages
+	 * by forcing them through rpcs that immediately fail and complete
+	 * the pages.  recovery relies on this to empty the queued pages
+	 * before canceling the locks and evicting down the llite pages */
+	if ((cli->cl_import == NULL || cli->cl_import->imp_invalid))
+		invalid_import = 1;
+
+	if (cmd & OBD_BRW_WRITE) {
+		if (atomic_read(&osc->oo_nr_writes) == 0)
+			RETURN(0);
+		if (invalid_import) {
+			CDEBUG(D_CACHE, "invalid import forcing RPC\n");
+			RETURN(1);
+		}
+		if (!list_empty(&osc->oo_hp_exts)) {
+			CDEBUG(D_CACHE, "high prio request forcing RPC\n");
+			RETURN(1);
+		}
+		if (!list_empty(&osc->oo_urgent_exts)) {
+			CDEBUG(D_CACHE, "urgent request forcing RPC\n");
+			RETURN(1);
+		}
+		/* trigger a write rpc stream as long as there are dirtiers
+		 * waiting for space.  as they're waiting, they're not going to
+		 * create more pages to coalesce with what's waiting.. */
+		if (!list_empty(&cli->cl_cache_waiters)) {
+			CDEBUG(D_CACHE, "cache waiters forcing RPC\n");
+			RETURN(1);
+		}
+		if (atomic_read(&osc->oo_nr_writes) >=
+		    cli->cl_max_pages_per_rpc)
+			RETURN(1);
+	} else {
+		if (atomic_read(&osc->oo_nr_reads) == 0)
+			RETURN(0);
+		if (invalid_import) {
+			CDEBUG(D_CACHE, "invalid import forcing RPC\n");
+			RETURN(1);
+		}
+		/* all read are urgent. */
+		if (!list_empty(&osc->oo_reading_exts))
+			RETURN(1);
+	}
+
+	RETURN(0);
+}
+/*
+ * Add @delta (which may be negative) to the pending page counters for @cmd:
+ * oo_nr_writes and cl_pending_w_pages when OBD_BRW_WRITE is set, otherwise
+ * oo_nr_reads and cl_pending_r_pages. The per-object counter is asserted to
+ * stay non-negative.
+ */
+static void osc_update_pending(struct osc_object *obj, int cmd, int delta)
+{
+	struct client_obd *cli = osc_cli(obj);
+	if (cmd & OBD_BRW_WRITE) {
+		atomic_add(delta, &obj->oo_nr_writes);
+		atomic_add(delta, &cli->cl_pending_w_pages);
+		LASSERT(atomic_read(&obj->oo_nr_writes) >= 0);
+	} else {
+		atomic_add(delta, &obj->oo_nr_reads);
+		atomic_add(delta, &cli->cl_pending_r_pages);
+		LASSERT(atomic_read(&obj->oo_nr_reads) >= 0);
+	}
+	OSC_IO_DEBUG(obj, "update pending cmd %d delta %d.\n", cmd, delta);
+}
+/* Return non-zero when @obj has at least one extent on its oo_hp_exts
+ * list. */
+static int osc_makes_hprpc(struct osc_object *obj)
+{
+	return !list_empty(&obj->oo_hp_exts);
+}
+
+/* Make the membership of @item on @list match @should_be_on: append it to
+ * the tail if it is unlinked but should be on, unlink it if it is linked
+ * but should not be; otherwise leave it alone. */
+static void on_list(struct list_head *item, struct list_head *list, int should_be_on)
+{
+	const int linked = !list_empty(item);
+
+	if (should_be_on) {
+		if (!linked)
+			list_add_tail(item, list);
+	} else if (linked) {
+		list_del_init(item);
+	}
+}
+
+/* Bring @osc's membership of the client-wide lists in line with its current
+ * state so that objects with work to send can be found quickly:
+ *  - with high-priority extents it goes on the hp ready list only;
+ *  - otherwise it goes on the ready list iff osc_makes_rpc() says a write
+ *    or a read RPC should be made;
+ *  - it is on the write/read list iff it has pending writes/reads.
+ * No locking is done here. Returns osc_is_ready(). */
+static int __osc_list_maint(struct client_obd *cli, struct osc_object *osc)
+{
+	int want_ready;
+
+	if (!osc_makes_hprpc(osc)) {
+		on_list(&osc->oo_hp_ready_item, &cli->cl_loi_hp_ready_list, 0);
+		want_ready = osc_makes_rpc(cli, osc, OBD_BRW_WRITE) ||
+			     osc_makes_rpc(cli, osc, OBD_BRW_READ);
+		on_list(&osc->oo_ready_item, &cli->cl_loi_ready_list,
+			want_ready);
+	} else {
+		/* HP rpc */
+		on_list(&osc->oo_ready_item, &cli->cl_loi_ready_list, 0);
+		on_list(&osc->oo_hp_ready_item, &cli->cl_loi_hp_ready_list, 1);
+	}
+
+	on_list(&osc->oo_write_item, &cli->cl_loi_write_list,
+		atomic_read(&osc->oo_nr_writes) > 0);
+	on_list(&osc->oo_read_item, &cli->cl_loi_read_list,
+		atomic_read(&osc->oo_nr_reads) > 0);
+
+	return osc_is_ready(osc);
+}
+/* Locked wrapper: run __osc_list_maint() under the loi list lock and return
+ * its result (non-zero when @osc is ready). */
+static int osc_list_maint(struct client_obd *cli, struct osc_object *osc)
+{
+	int is_ready;
+
+	client_obd_list_lock(&cli->cl_loi_list_lock);
+	is_ready = __osc_list_maint(cli, osc);
+	client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+	return is_ready;
+}
+
+/* Propagate async writeback errors back up to the application.
+ *
+ * When an async write fails (@rc != 0) the first error code is remembered
+ * for a later fsync, force_sync is switched on so that future rpcs are sync
+ * (letting the app see the error and stop queueing pages whose writeback
+ * will fail), and ar_min_xid is set to the next xid sample.
+ *
+ * On success force_sync is switched off again once @xid has reached
+ * ar_min_xid. */
+static void osc_process_ar(struct osc_async_rc *ar, __u64 xid,
+			   int rc)
+{
+	if (rc == 0) {
+		if (ar->ar_force_sync && xid >= ar->ar_min_xid)
+			ar->ar_force_sync = 0;
+		return;
+	}
+
+	/* keep only the first error */
+	if (ar->ar_rc == 0)
+		ar->ar_rc = rc;
+
+	ar->ar_force_sync = 1;
+	ar->ar_min_xid = ptlrpc_sample_next_xid();
+}
+
+
+/* Complete one async page after its transfer: drop the request reference
+ * held in oap_request (remembering its xid), reset the async flags and the
+ * interrupted mark, feed the result of a write that had a request into the
+ * per-client and per-object async error state, then call osc_completion().
+ *
+ * NOTE(review): this comment used to say the function must be called
+ * holding the loi list lock, but the body takes cl_loi_list_lock itself
+ * around osc_process_ar(), so callers presumably must NOT hold it --
+ * confirm against the callers. */
+static void osc_ap_completion(const struct lu_env *env, struct client_obd *cli,
+			      struct osc_async_page *oap, int sent, int rc)
+{
+	struct osc_object *osc = oap->oap_obj;
+	struct lov_oinfo  *loi = osc->oo_oinfo;
+	__u64 xid = 0;
+
+	ENTRY;
+	if (oap->oap_request != NULL) {
+		xid = ptlrpc_req_xid(oap->oap_request);
+		ptlrpc_req_finished(oap->oap_request);
+		oap->oap_request = NULL;
+	}
+
+	/* As the transfer for this page is being done, clear the flags */
+	spin_lock(&oap->oap_lock);
+	oap->oap_async_flags = 0;
+	spin_unlock(&oap->oap_lock);
+	oap->oap_interrupted = 0;
+
+	if (oap->oap_cmd & OBD_BRW_WRITE && xid > 0) {
+		client_obd_list_lock(&cli->cl_loi_list_lock);
+		osc_process_ar(&cli->cl_ar, xid, rc);
+		osc_process_ar(&loi->loi_ar, xid, rc);
+		client_obd_list_unlock(&cli->cl_loi_list_lock);
+	}
+
+	rc = osc_completion(env, oap, oap->oap_cmd, rc);
+	if (rc)
+		CERROR("completion on oap %p obj %p returns %d.\n",
+		       oap, osc, rc);
+
+	EXIT;
+}
+
+/**
+ * Try to add extent to one RPC. We need to think about the following things:
+ * - # of pages must not be over max_pages_per_rpc
+ * - extent must be compatible with previous ones
+ *
+ * \param cli       client (not used by the checks below)
+ * \param ext       candidate extent, in state OES_CACHE or OES_LOCK_DONE
+ * \param rpclist   extents collected so far for this RPC
+ * \param pc        in/out: pages collected so far
+ * \param max_pages in/out: page limit; raised to ext->oe_mppr if that is
+ *                  larger, even when the extent ends up rejected
+ *
+ * Compatibility (same oe_srvlock, and both or neither holding grants) is
+ * only checked against the first extent on @rpclist because of the
+ * unconditional break in the loop.
+ *
+ * \retval 1 @ext was moved to the tail of @rpclist and is now owned by the
+ *           current task
+ * \retval 0 @ext does not fit or is incompatible; nothing was moved
+ */
+static int try_to_add_extent_for_io(struct client_obd *cli,
+				    struct osc_extent *ext, struct list_head *rpclist,
+				    int *pc, unsigned int *max_pages)
+{
+	struct osc_extent *tmp;
+	ENTRY;
+
+	EASSERT((ext->oe_state == OES_CACHE || ext->oe_state == OES_LOCK_DONE),
+		ext);
+
+	*max_pages = max(ext->oe_mppr, *max_pages);
+	if (*pc + ext->oe_nr_pages > *max_pages)
+		RETURN(0);
+
+	list_for_each_entry(tmp, rpclist, oe_link) {
+		EASSERT(tmp->oe_owner == current, tmp);
+#if 0
+		if (overlapped(tmp, ext)) {
+			OSC_EXTENT_DUMP(D_ERROR, tmp, "overlapped %p.\n", ext);
+			EASSERT(0, ext);
+		}
+#endif
+
+		if (tmp->oe_srvlock != ext->oe_srvlock ||
+		    !tmp->oe_grants != !ext->oe_grants)
+			RETURN(0);
+
+		/* remove break for strict check */
+		break;
+	}
+
+	*pc += ext->oe_nr_pages;
+	list_move_tail(&ext->oe_link, rpclist);
+	ext->oe_owner = current;
+	RETURN(1);
+}
+
+/**
+ * In order to prevent multiple ptlrpcd from breaking contiguous extents,
+ * get_write_extents() takes all appropriate extents atomically (the object
+ * lock must be held, which is asserted).
+ *
+ * The following policy is used to collect extents for IO:
+ * 1. Add as many HP extents as possible;
+ * 2. Add the first urgent extent in urgent extent list and take it out of
+ *    urgent list;
+ * 3. Add subsequent extents of this urgent extent;
+ * 4. If urgent list is not empty, goto 2;
+ * 5. Traverse the extent tree from the 1st extent;
+ * 6. Above steps exit if there is no space in this RPC.
+ *
+ * \param obj     object whose extents are collected
+ * \param rpclist list the chosen extents are moved onto
+ *
+ * \return the number of pages in the extents placed on @rpclist
+ */
+static int get_write_extents(struct osc_object *obj, struct list_head *rpclist)
+{
+	struct client_obd *cli = osc_cli(obj);
+	struct osc_extent *ext;
+	int page_count = 0;
+	unsigned int max_pages = cli->cl_max_pages_per_rpc;
+
+	LASSERT(osc_object_is_locked(obj));
+	while (!list_empty(&obj->oo_hp_exts)) {
+		ext = list_entry(obj->oo_hp_exts.next, struct osc_extent,
+				     oe_link);
+		LASSERT(ext->oe_state == OES_CACHE);
+		if (!try_to_add_extent_for_io(cli, ext, rpclist, &page_count,
+					      &max_pages))
+			return page_count;
+		EASSERT(ext->oe_nr_pages <= max_pages, ext);
+	}
+	if (page_count == max_pages)
+		return page_count;
+
+	while (!list_empty(&obj->oo_urgent_exts)) {
+		ext = list_entry(obj->oo_urgent_exts.next,
+				     struct osc_extent, oe_link);
+		if (!try_to_add_extent_for_io(cli, ext, rpclist, &page_count,
+					      &max_pages))
+			return page_count;
+
+		if (!ext->oe_intree)
+			continue;
+
+		while ((ext = next_extent(ext)) != NULL) {
+			if ((ext->oe_state != OES_CACHE) ||
+			    (!list_empty(&ext->oe_link) &&
+			     ext->oe_owner != NULL))
+				continue;
+
+			if (!try_to_add_extent_for_io(cli, ext, rpclist,
+						      &page_count, &max_pages))
+				return page_count;
+		}
+	}
+	if (page_count == max_pages)
+		return page_count;
+
+	ext = first_extent(obj);
+	while (ext != NULL) {
+		if ((ext->oe_state != OES_CACHE) ||
+		    /* this extent may be already in current rpclist */
+		    (!list_empty(&ext->oe_link) && ext->oe_owner != NULL)) {
+			ext = next_extent(ext);
+			continue;
+		}
+
+		if (!try_to_add_extent_for_io(cli, ext, rpclist, &page_count,
+					      &max_pages))
+			return page_count;
+
+		ext = next_extent(ext);
+	}
+	return page_count;
+}
+/*
+ * Collect write extents of @osc with get_write_extents() and send them as
+ * one write RPC.
+ *
+ * Called with the object lock held (asserted). When extents were collected
+ * the lock is dropped while pages are made ready and the RPC is built, and
+ * re-taken before returning. Extents that fail osc_extent_make_ready() are
+ * finished with that error and left out of the RPC.
+ *
+ * Returns 0 if nothing was collected; otherwise the result of
+ * osc_build_rpc(), or the last osc_extent_make_ready() result if no extent
+ * was left to send.
+ */
+static int
+osc_send_write_rpc(const struct lu_env *env, struct client_obd *cli,
+		   struct osc_object *osc, pdl_policy_t pol)
+{
+	LIST_HEAD(rpclist);
+	struct osc_extent *ext;
+	struct osc_extent *tmp;
+	struct osc_extent *first = NULL;
+	obd_count page_count = 0;
+	int srvlock = 0;
+	int rc = 0;
+	ENTRY;
+
+	LASSERT(osc_object_is_locked(osc));
+
+	page_count = get_write_extents(osc, &rpclist);
+	LASSERT(equi(page_count == 0, list_empty(&rpclist)));
+
+	if (list_empty(&rpclist))
+		RETURN(0);
+
+	osc_update_pending(osc, OBD_BRW_WRITE, -page_count);
+
+	list_for_each_entry(ext, &rpclist, oe_link) {
+		LASSERT(ext->oe_state == OES_CACHE ||
+			ext->oe_state == OES_LOCK_DONE);
+		if (ext->oe_state == OES_CACHE)
+			osc_extent_state_set(ext, OES_LOCKING);
+		else
+			osc_extent_state_set(ext, OES_RPC);
+	}
+
+	/* we're going to grab page lock, so release object lock because
+	 * lock order is page lock -> object lock. */
+	osc_object_unlock(osc);
+
+	list_for_each_entry_safe(ext, tmp, &rpclist, oe_link) {
+		if (ext->oe_state == OES_LOCKING) {
+			rc = osc_extent_make_ready(env, ext);
+			if (unlikely(rc < 0)) {
+				list_del_init(&ext->oe_link);
+				osc_extent_finish(env, ext, 0, rc);
+				continue;
+			}
+		}
+		if (first == NULL) {
+			first = ext;
+			srvlock = ext->oe_srvlock;
+		} else {
+			LASSERT(srvlock == ext->oe_srvlock);
+		}
+	}
+
+	if (!list_empty(&rpclist)) {
+		LASSERT(page_count > 0);
+		rc = osc_build_rpc(env, cli, &rpclist, OBD_BRW_WRITE, pol);
+		LASSERT(list_empty(&rpclist));
+	}
+
+	osc_object_lock(osc);
+	RETURN(rc);
+}
+
+/**
+ * Collect extents from the object's reading list and send them as one read
+ * RPC.
+ *
+ * Called with the object lock held (asserted); the lock is dropped around
+ * osc_build_rpc() and re-taken before returning.
+ *
+ * \param env execution environment
+ * \param cli client the RPC is built for
+ * \param osc object whose oo_reading_exts list is drained
+ * \param pol policy passed through to osc_build_rpc()
+ *
+ * \return zero if no extent was collected.
+ * \return otherwise the value returned by osc_build_rpc().
+ */
+static int
+osc_send_read_rpc(const struct lu_env *env, struct client_obd *cli,
+		  struct osc_object *osc, pdl_policy_t pol)
+{
+	struct osc_extent *ext;
+	struct osc_extent *next;
+	LIST_HEAD(rpclist);
+	int page_count = 0;
+	unsigned int max_pages = cli->cl_max_pages_per_rpc;
+	int rc = 0;
+	ENTRY;
+
+	LASSERT(osc_object_is_locked(osc));
+	list_for_each_entry_safe(ext, next,
+				     &osc->oo_reading_exts, oe_link) {
+		EASSERT(ext->oe_state == OES_LOCK_DONE, ext);
+		if (!try_to_add_extent_for_io(cli, ext, &rpclist, &page_count,
+					      &max_pages))
+			break;
+		osc_extent_state_set(ext, OES_RPC);
+		EASSERT(ext->oe_nr_pages <= max_pages, ext);
+	}
+	LASSERT(page_count <= max_pages);
+
+	osc_update_pending(osc, OBD_BRW_READ, -page_count);
+
+	if (!list_empty(&rpclist)) {
+		osc_object_unlock(osc);
+
+		LASSERT(page_count > 0);
+		rc = osc_build_rpc(env, cli, &rpclist, OBD_BRW_READ, pol);
+		LASSERT(list_empty(&rpclist));
+
+		osc_object_lock(osc);
+	}
+	RETURN(rc);
+}
+/*
+ * Unlink the first entry of @list and evaluate to the struct osc_object that
+ * embeds it as member oo_<item>. The macro does not check for an empty
+ * list; callers test list_empty() first.
+ */
+#define list_to_obj(list, item) ({					      \
+	struct list_head *__tmp = (list)->next;				      \
+	list_del_init(__tmp);					      \
+	list_entry(__tmp, struct osc_object, oo_##item);		      \
+})
+
+/* Pick the next object osc_check_rpcs() should work on, unlinking it from
+ * the client list it was found on. The lists are kept up to date by
+ * __osc_list_maint(), which relies on osc_makes_rpc(). Returns NULL when
+ * there is no candidate. */
+static struct osc_object *osc_next_obj(struct client_obd *cli)
+{
+	struct osc_object *found = NULL;
+	ENTRY;
+
+	if (!list_empty(&cli->cl_loi_hp_ready_list)) {
+		/* Objects that have blocked locks come first so that they
+		 * are flushed quickly and other clients can get the lock. */
+		found = list_to_obj(&cli->cl_loi_hp_ready_list, hp_ready_item);
+	} else if (!list_empty(&cli->cl_loi_ready_list)) {
+		/* Next, objects with pages ready to be stuffed into RPCs. */
+		found = list_to_obj(&cli->cl_loi_ready_list, ready_item);
+	} else if (!list_empty(&cli->cl_cache_waiters) &&
+		   !list_empty(&cli->cl_loi_write_list)) {
+		/* With cache waiters around, hand out every object with
+		 * queued writes. This matters when many small files have
+		 * filled up the cache without being fired into rpcs because
+		 * they don't pass the nr_pending/object threshold. */
+		found = list_to_obj(&cli->cl_loi_write_list, write_item);
+	} else if (cli->cl_import == NULL || cli->cl_import->imp_invalid) {
+		/* With an invalid import, hand out all queued objects so
+		 * that they get flushed. */
+		if (!list_empty(&cli->cl_loi_write_list))
+			found = list_to_obj(&cli->cl_loi_write_list,
+					    write_item);
+		else if (!list_empty(&cli->cl_loi_read_list))
+			found = list_to_obj(&cli->cl_loi_read_list,
+					    read_item);
+	}
+
+	RETURN(found);
+}
+
+/* called with the loi list lock held
+ *
+ * Walk the objects handed out by osc_next_obj() and send write and then
+ * read RPCs for each one, until no candidate is left or the in-flight limit
+ * is reached.
+ *
+ * The loi list lock is dropped while an object is being processed and
+ * re-taken before the next one is picked, so it is held again on return.
+ * A reference on the object is held across the unlocked section. Send
+ * failures are logged and do not stop the loop. */
+static void osc_check_rpcs(const struct lu_env *env, struct client_obd *cli,
+			   pdl_policy_t pol)
+{
+	struct osc_object *osc;
+	int rc = 0;
+	ENTRY;
+
+	while ((osc = osc_next_obj(cli)) != NULL) {
+		struct cl_object *obj = osc2cl(osc);
+		struct lu_ref_link *link;
+
+		OSC_IO_DEBUG(osc, "%lu in flight\n", rpcs_in_flight(cli));
+
+		if (osc_max_rpc_in_flight(cli, osc)) {
+			/* osc_next_obj() unlinked the object; put it back
+			 * on the lists it belongs to before giving up. */
+			__osc_list_maint(cli, osc);
+			break;
+		}
+
+		cl_object_get(obj);
+		client_obd_list_unlock(&cli->cl_loi_list_lock);
+		link = lu_object_ref_add(&obj->co_lu, "check", current);
+
+		/* attempt some read/write balancing by alternating between
+		 * reads and writes in an object.  The makes_rpc checks here
+		 * would be redundant if we were getting read/write work items
+		 * instead of objects.  we don't want send_oap_rpc to drain a
+		 * partial read pending queue when we're given this object to
+		 * do io on writes while there are cache waiters */
+		osc_object_lock(osc);
+		if (osc_makes_rpc(cli, osc, OBD_BRW_WRITE)) {
+			rc = osc_send_write_rpc(env, cli, osc, pol);
+			if (rc < 0) {
+				CERROR("Write request failed with %d\n", rc);
+
+				/* osc_send_write_rpc failed, mostly because of
+				 * memory pressure.
+				 *
+				 * It can't break here, because if:
+				 *  - a page was submitted by osc_io_submit, so
+				 *    page locked;
+				 *  - no request in flight
+				 *  - no subsequent request
+				 * The system will be in live-lock state,
+				 * because there is no chance to call
+				 * osc_io_unplug() and osc_check_rpcs() any
+				 * more. pdflush can't help in this case,
+				 * because it might be blocked at grabbing
+				 * the page lock as we mentioned.
+				 *
+				 * Anyway, continue to drain pages. */
+				/* break; */
+			}
+		}
+		if (osc_makes_rpc(cli, osc, OBD_BRW_READ)) {
+			rc = osc_send_read_rpc(env, cli, osc, pol);
+			if (rc < 0)
+				CERROR("Read request failed with %d\n", rc);
+		}
+		osc_object_unlock(osc);
+
+		osc_list_maint(cli, osc);
+		lu_object_ref_del_at(&obj->co_lu, link, "check", current);
+		cl_object_put(env, obj);
+
+		client_obd_list_lock(&cli->cl_loi_list_lock);
+	}
+
+	/* pair the ENTRY above so the function's exit is traced too */
+	EXIT;
+}
+
+/*
+ * Common body of osc_io_unplug() and osc_io_unplug_async().
+ *
+ * If @osc is given, its list membership is refreshed first and nothing more
+ * happens when it turns out not to be ready. Otherwise RPCs are either
+ * checked and sent right here (@async == 0) or the client's writeback work
+ * is queued to ptlrpcd (@async != 0).
+ *
+ * Returns 0 in the synchronous case, otherwise the result of
+ * ptlrpcd_queue_work().
+ */
+static int osc_io_unplug0(const struct lu_env *env, struct client_obd *cli,
+			  struct osc_object *osc, pdl_policy_t pol, int async)
+{
+	if (osc != NULL && osc_list_maint(cli, osc) == 0)
+		return 0;
+
+	if (async) {
+		CDEBUG(D_CACHE, "Queue writeback work for client %p.\n", cli);
+		LASSERT(cli->cl_writeback_work != NULL);
+		return ptlrpcd_queue_work(cli->cl_writeback_work);
+	}
+
+	/* disable osc_lru_shrink() temporarily to avoid
+	 * potential stack overrun problem. LU-2859 */
+	atomic_inc(&cli->cl_lru_shrinkers);
+	client_obd_list_lock(&cli->cl_loi_list_lock);
+	osc_check_rpcs(env, cli, pol);
+	client_obd_list_unlock(&cli->cl_loi_list_lock);
+	atomic_dec(&cli->cl_lru_shrinkers);
+
+	return 0;
+}
+
+/**
+ * Deferred flavour of osc_io_unplug(): calls osc_io_unplug0() with its
+ * async argument set and returns its result.
+ */
+static int osc_io_unplug_async(const struct lu_env *env,
+				struct client_obd *cli, struct osc_object *osc)
+{
+	const int async = 1;
+
+	/* XXX: the policy value passed here is of no use actually. */
+	return osc_io_unplug0(env, cli, osc, PDL_POLICY_ROUND, async);
+}
+
+/**
+ * Inline flavour of unplug: calls osc_io_unplug0() with its async argument
+ * cleared and deliberately discards the result.
+ */
+void osc_io_unplug(const struct lu_env *env, struct client_obd *cli,
+		   struct osc_object *osc, pdl_policy_t pol)
+{
+	int ignored;
+
+	ignored = osc_io_unplug0(env, cli, osc, pol, 0);
+	(void)ignored;
+}
+
+/**
+ * Initialize the osc_async_page embedded in ops (ops->ops_oap) for page.
+ *
+ * When page is NULL nothing is initialized and the size of
+ * struct osc_async_page, rounded by cfs_size_round(), is returned instead.
+ * Otherwise the magic, owning client, object, page and object offset are
+ * recorded, both list heads and the spinlock are initialized, and 0 is
+ * returned.
+ */
+int osc_prep_async_page(struct osc_object *osc, struct osc_page *ops,
+			struct page *page, loff_t offset)
+{
+	struct obd_export     *export = osc_export(osc);
+	struct osc_async_page *apage  = &ops->ops_oap;
+	ENTRY;
+
+	if (page == NULL)
+		return cfs_size_round(sizeof(*apage));
+
+	apage->oap_magic   = OAP_MAGIC;
+	apage->oap_obj     = osc;
+	apage->oap_cli     = &export->exp_obd->u.cli;
+	apage->oap_page    = page;
+	apage->oap_obj_off = offset;
+	/* no offset bit outside CFS_PAGE_MASK may be set
+	 * (presumably: the offset must be page aligned) */
+	LASSERT(!(offset & ~CFS_PAGE_MASK));
+
+	if (!client_is_remote(export)) {
+		if (cfs_capable(CFS_CAP_SYS_RESOURCE))
+			apage->oap_brw_flags = OBD_BRW_NOQUOTA;
+	}
+
+	spin_lock_init(&apage->oap_lock);
+	INIT_LIST_HEAD(&apage->oap_rpc_item);
+	INIT_LIST_HEAD(&apage->oap_pending_item);
+
+	CDEBUG(D_INFO, "oap %p page %p obj off "LPU64"\n",
+	       apage, page, apage->oap_obj_off);
+	RETURN(0);
+}
+/**
+ * Queue one page for asynchronous write by attaching it to a write extent
+ * of its osc object.
+ *
+ * The page goes into the IO's active extent (oio->oi_active) when that
+ * extent covers the page index or can be expanded to cover it; otherwise
+ * the active extent is released and another one is obtained from
+ * osc_extent_find() and installed as the new active extent. The io
+ * argument is not referenced in this function.
+ *
+ * Returns 0 on success, or:
+ *   -EINVAL  the osc_async_page magic does not match OAP_MAGIC;
+ *   -EIO     the client has no import, or the import is invalid;
+ *   -EBUSY   the page is already linked on a pending or rpc list;
+ *   -EDQUOT  osc_quota_chkdq() reported NO_QUOTA for the owner/group;
+ *   or the error returned by cl_object_attr_get(), osc_enter_cache() or
+ *   osc_extent_find().
+ */
+int osc_queue_async_io(const struct lu_env *env, struct cl_io *io,
+		       struct osc_page *ops)
+{
+	struct osc_io *oio = osc_env_io(env);
+	struct osc_extent     *ext = NULL;
+	struct osc_async_page *oap = &ops->ops_oap;
+	struct client_obd     *cli = oap->oap_cli;
+	struct osc_object     *osc = oap->oap_obj;
+	pgoff_t index;
+	int    grants = 0;	/* grant handed to osc_enter_cache*(); 0 if none */
+	int    brw_flags = OBD_BRW_ASYNC;
+	int    cmd = OBD_BRW_WRITE;
+	int    need_release = 0;	/* set when the active extent must be given up */
+	int    rc = 0;
+	ENTRY;
+
+	if (oap->oap_magic != OAP_MAGIC)
+		RETURN(-EINVAL);
+
+	if (cli->cl_import == NULL || cli->cl_import->imp_invalid)
+		RETURN(-EIO);
+
+	if (!list_empty(&oap->oap_pending_item) ||
+	    !list_empty(&oap->oap_rpc_item))
+		RETURN(-EBUSY);
+
+	/* Set the OBD_BRW_SRVLOCK before the page is queued. */
+	brw_flags |= ops->ops_srvlock ? OBD_BRW_SRVLOCK : 0;
+	if (!client_is_remote(osc_export(osc)) &&
+	    cfs_capable(CFS_CAP_SYS_RESOURCE)) {
+		brw_flags |= OBD_BRW_NOQUOTA;
+		cmd |= OBD_BRW_NOQUOTA;
+	}
+
+	/* check if the file's owner/group is over quota; skipped when
+	 * OBD_BRW_NOQUOTA was set above */
+	if (!(cmd & OBD_BRW_NOQUOTA)) {
+		struct cl_object *obj;
+		struct cl_attr   *attr;
+		unsigned int qid[MAXQUOTAS];
+
+		obj = cl_object_top(&osc->oo_cl);
+		attr = &osc_env_info(env)->oti_attr;
+
+		cl_object_attr_lock(obj);
+		rc = cl_object_attr_get(env, obj, attr);
+		cl_object_attr_unlock(obj);
+
+		qid[USRQUOTA] = attr->cat_uid;
+		qid[GRPQUOTA] = attr->cat_gid;
+		if (rc == 0 && osc_quota_chkdq(cli, qid) == NO_QUOTA)
+			rc = -EDQUOT;
+		if (rc)
+			RETURN(rc);
+	}
+
+	oap->oap_cmd = cmd;
+	oap->oap_page_off = ops->ops_from;
+	oap->oap_count = ops->ops_to - ops->ops_from;
+	oap->oap_async_flags = 0;
+	oap->oap_brw_flags = brw_flags;
+
+	OSC_IO_DEBUG(osc, "oap %p page %p added for cmd %d\n",
+		     oap, oap->oap_page, oap->oap_cmd & OBD_BRW_RWMASK);
+
+	index = oap2cl_page(oap)->cp_index;
+
+	/* Add this page into extent by the following steps:
+	 * 1. if there exists an active extent for this IO, mostly this page
+	 *    can be added to the active extent and sometimes we need to
+	 *    expand extent to accommodate this page;
+	 * 2. otherwise, a new extent will be allocated. */
+
+	ext = oio->oi_active;
+	if (ext != NULL && ext->oe_start <= index && ext->oe_max_end >= index) {
+		/* one chunk plus extent overhead must be enough to write this
+		 * page */
+		grants = (1 << cli->cl_chunkbits) + cli->cl_extent_tax;
+		if (ext->oe_end >= index)
+			grants = 0;
+
+		/* grants is 0 here when the index already lies within
+		 * [oe_start, oe_end]: in that case it doesn't need any
+		 * grant to dirty this page */
+		client_obd_list_lock(&cli->cl_loi_list_lock);
+		rc = osc_enter_cache_try(cli, oap, grants, 0);
+		client_obd_list_unlock(&cli->cl_loi_list_lock);
+		if (rc == 0) { /* try failed */
+			grants = 0;
+			need_release = 1;
+		} else if (ext->oe_end < index) {
+			int tmp = grants;
+			/* try to expand this extent */
+			rc = osc_extent_expand(ext, index, &tmp);
+			if (rc < 0) {
+				need_release = 1;
+				/* don't free reserved grant: it is reused
+				 * below when looking for a new extent */
+			} else {
+				OSC_EXTENT_DUMP(D_CACHE, ext,
+						"expanded for %lu.\n", index);
+				osc_unreserve_grant(cli, grants, tmp);
+				grants = 0;
+			}
+		}
+		rc = 0;
+	} else if (ext != NULL) {
+		/* index is located outside of active extent */
+		need_release = 1;
+	}
+	if (need_release) {
+		osc_extent_release(env, ext);
+		oio->oi_active = NULL;
+		ext = NULL;
+	}
+
+	if (ext == NULL) {
+		int tmp = (1 << cli->cl_chunkbits) + cli->cl_extent_tax;
+
+		/* try to find new extent to cover this page */
+		LASSERT(oio->oi_active == NULL);
+		/* we may have allocated grant for this page if we failed
+		 * to expand the previous active extent. */
+		LASSERT(ergo(grants > 0, grants >= tmp));
+
+		rc = 0;
+		if (grants == 0) {
+			/* we haven't allocated grant for this page. */
+			rc = osc_enter_cache(env, cli, oap, tmp);
+			if (rc == 0)
+				grants = tmp;
+		}
+
+		tmp = grants;
+		if (rc == 0) {
+			ext = osc_extent_find(env, osc, index, &tmp);
+			if (IS_ERR(ext)) {
+				LASSERT(tmp == grants);
+				osc_exit_cache(cli, oap);	/* undo the cache entry made above */
+				rc = PTR_ERR(ext);
+				ext = NULL;
+			} else {
+				oio->oi_active = ext;
+			}
+		}
+		if (grants > 0)
+			osc_unreserve_grant(cli, grants, tmp);
+	}
+
+	LASSERT(ergo(rc == 0, ext != NULL));
+	if (ext != NULL) {
+		EASSERTF(ext->oe_end >= index && ext->oe_start <= index,
+			 ext, "index = %lu.\n", index);
+		LASSERT((oap->oap_brw_flags & OBD_BRW_FROM_GRANT) != 0);
+
+		osc_object_lock(osc);
+		if (ext->oe_nr_pages == 0)	/* first page decides the extent's srvlock mode */
+			ext->oe_srvlock = ops->ops_srvlock;
+		else
+			LASSERT(ext->oe_srvlock == ops->ops_srvlock);
+		++ext->oe_nr_pages;
+		list_add_tail(&oap->oap_pending_item, &ext->oe_pages);
+		osc_object_unlock(osc);
+	}
+	RETURN(rc);
+}
+
+/**
+ * Check whether a page may be taken out of the osc cache.
+ *
+ * Returns -EBUSY when the page is linked on an rpc list, or when it is
+ * linked on a pending list and osc_extent_lookup() finds an extent at its
+ * index that is not in OES_TRUNC state. Returns 0 otherwise. Nothing is
+ * unlinked here; the only side effect is the lookup reference, which is
+ * dropped again before returning.
+ */
+int osc_teardown_async_page(const struct lu_env *env,
+			    struct osc_object *obj, struct osc_page *ops)
+{
+	struct osc_async_page *oap   = &ops->ops_oap;
+	struct osc_extent     *found = NULL;
+	int result = 0;
+	ENTRY;
+
+	LASSERT(oap->oap_magic == OAP_MAGIC);
+
+	CDEBUG(D_INFO, "teardown oap %p page %p at index %lu.\n",
+	       oap, ops, oap2cl_page(oap)->cp_index);
+
+	osc_object_lock(obj);
+	if (list_empty(&oap->oap_rpc_item)) {
+		if (!list_empty(&oap->oap_pending_item)) {
+			found = osc_extent_lookup(obj,
+						  oap2cl_page(oap)->cp_index);
+			/* Only pages of a truncated extent may be taken out,
+			 * see osc_extent_truncate() and
+			 * osc_cache_truncate_start(). */
+			if (found != NULL && found->oe_state != OES_TRUNC) {
+				OSC_EXTENT_DUMP(D_ERROR, found,
+						"trunc at %lu.\n",
+						oap2cl_page(oap)->cp_index);
+				result = -EBUSY;
+			}
+		}
+	} else {
+		CDEBUG(D_CACHE, "oap %p is not in cache.\n", oap);
+		result = -EBUSY;
+	}
+	osc_object_unlock(obj);
+
+	if (found != NULL)
+		osc_extent_put(env, found);
+	RETURN(result);
+}
+
+/**
+ * This is called when a page is picked up by kernel to write out.
+ *
+ * We should find out the corresponding extent and add the whole extent
+ * into urgent list. The extent may be being truncated or used, handle it
+ * carefully.
+ *
+ * Returns 0 once the page has been flagged ASYNC_READY|ASYNC_URGENT and
+ * its extent marked urgent; -EAGAIN when the extent is in OES_LOCKING or
+ * OES_TRUNC state; or the non-zero result of cl_page_prep(). A page that
+ * is not covered by any extent, or whose extent is in OES_RPC or
+ * OES_LOCK_DONE state, trips an assertion.
+ */
+int osc_flush_async_page(const struct lu_env *env, struct cl_io *io,
+			 struct osc_page *ops)
+{
+	struct osc_extent *ext   = NULL;
+	struct osc_object *obj   = cl2osc(ops->ops_cl.cpl_obj);
+	struct cl_page    *cp    = ops->ops_cl.cpl_page;
+	pgoff_t	    index = cp->cp_index;
+	struct osc_async_page *oap = &ops->ops_oap;
+	bool unplug = false;	/* queue writeback work after dropping the lock */
+	int rc = 0;
+	ENTRY;
+
+	osc_object_lock(obj);
+	ext = osc_extent_lookup(obj, index);
+	if (ext == NULL) {
+		osc_extent_tree_dump(D_ERROR, obj);
+		LASSERTF(0, "page index %lu is NOT covered.\n", index);
+	}
+
+	switch (ext->oe_state) {
+	case OES_RPC:
+	case OES_LOCK_DONE:
+		CL_PAGE_DEBUG(D_ERROR, env, cl_page_top(cp),
+			      "flush an in-rpc page?\n");
+		LASSERT(0);
+		break;
+	case OES_LOCKING:
+		/* If we know this extent is being written out, we should abort
+		 * so that the writer can make this page ready. Otherwise, there
+		 * exists a deadlock problem because other process can wait for
+		 * page writeback bit holding page lock; and meanwhile in
+		 * vvp_page_make_ready(), we need to grab page lock before
+		 * really sending the RPC.
+		 * Falls through to the OES_TRUNC case to return -EAGAIN. */
+	case OES_TRUNC:
+		/* race with truncate, page will be redirtied */
+		GOTO(out, rc = -EAGAIN);
+	default:
+		break;
+	}
+
+	rc = cl_page_prep(env, io, cl_page_top(cp), CRT_WRITE);
+	if (rc)
+		GOTO(out, rc);
+
+	spin_lock(&oap->oap_lock);
+	oap->oap_async_flags |= ASYNC_READY|ASYNC_URGENT;
+	spin_unlock(&oap->oap_lock);
+
+	if (memory_pressure_get())
+		ext->oe_memalloc = 1;
+
+	ext->oe_urgent = 1;
+	if (ext->oe_state == OES_CACHE) {
+		OSC_EXTENT_DUMP(D_CACHE, ext,
+				"flush page %p make it urgent.\n", oap);
+		if (list_empty(&ext->oe_link))	/* not queued on any list yet */
+			list_add_tail(&ext->oe_link, &obj->oo_urgent_exts);
+		unplug = true;
+	}
+	rc = 0;
+	EXIT;
+
+out:
+	osc_object_unlock(obj);
+	osc_extent_put(env, ext);	/* drop the reference taken by the lookup */
+	if (unplug)
+		osc_io_unplug_async(env, osc_cli(obj), obj);
+	return rc;
+}
+
+/**
+ * This is called when a sync waiter receives an interruption.  Its job is to
+ * get the caller woken as soon as possible.  If its page hasn't been put in an
+ * rpc yet it can dequeue immediately.  Otherwise it has to mark the rpc as
+ * desiring interruption which will forcefully complete the rpc once the rpc
+ * has timed out.
+ *
+ * Returns 0 when the covering extent was still queued on the object's
+ * urgent list (for a write) or reading list (for a read) and has been
+ * finished with -EINTR. Returns -EBUSY otherwise, including the case where
+ * the request held in oap->oap_request was marked interrupted.
+ */
+int osc_cancel_async_page(const struct lu_env *env, struct osc_page *ops)
+{
+	struct osc_async_page *oap = &ops->ops_oap;
+	struct osc_object     *obj = oap->oap_obj;
+	struct client_obd     *cli = osc_cli(obj);
+	struct osc_extent     *ext;
+	struct osc_extent     *found = NULL;
+	struct list_head	    *plist;
+	pgoff_t index = oap2cl_page(oap)->cp_index;
+	int     rc = -EBUSY;
+	int     cmd;
+	ENTRY;
+
+	LASSERT(!oap->oap_interrupted);
+	oap->oap_interrupted = 1;
+
+	/* Find out the caching extent */
+	osc_object_lock(obj);
+	if (oap->oap_cmd & OBD_BRW_WRITE) {
+		plist = &obj->oo_urgent_exts;
+		cmd   = OBD_BRW_WRITE;
+	} else {
+		plist = &obj->oo_reading_exts;
+		cmd   = OBD_BRW_READ;
+	}
+	list_for_each_entry(ext, plist, oe_link) {
+		if (ext->oe_start <= index && ext->oe_end >= index) {
+			LASSERT(ext->oe_state == OES_LOCK_DONE);
+			/* For OES_LOCK_DONE state extent, it has already held
+			 * a refcount for RPC. */
+			found = osc_extent_get(ext);
+			break;
+		}
+	}
+	if (found != NULL) {
+		list_del_init(&found->oe_link);
+		osc_update_pending(obj, cmd, -found->oe_nr_pages);
+		osc_object_unlock(obj);
+
+		osc_extent_finish(env, found, 0, -EINTR);
+		osc_extent_put(env, found);	/* pairs with osc_extent_get() above */
+		rc = 0;
+	} else {
+		osc_object_unlock(obj);
+		/* ok, it's been put in an rpc. only one oap gets a request
+		 * reference */
+		if (oap->oap_request != NULL) {
+			ptlrpc_mark_interrupted(oap->oap_request);
+			ptlrpcd_wake(oap->oap_request);
+			ptlrpc_req_finished(oap->oap_request);
+			oap->oap_request = NULL;
+		}
+	}
+
+	osc_list_maint(cli, obj);
+	RETURN(rc);
+}
+
+/**
+ * Wrap the pages on list into one freshly allocated urgent extent, queue
+ * that extent on the object and unplug the client right away.
+ *
+ * The extent spans the lowest to the highest page index found on list and
+ * is put in OES_LOCK_DONE state. It is queued on oo_urgent_exts when cmd
+ * has OBD_BRW_WRITE set and on oo_reading_exts otherwise.
+ *
+ * Returns 0 on success. When no extent can be allocated every page on
+ * list is unlinked and completed with -ENOMEM, and -ENOMEM is returned.
+ */
+int osc_queue_sync_pages(const struct lu_env *env, struct osc_object *obj,
+			 struct list_head *list, int cmd, int brw_flags)
+{
+	struct client_obd     *cli = osc_cli(obj);
+	struct osc_async_page *oap;
+	struct osc_async_page *next;
+	struct osc_extent     *ext;
+	struct list_head      *queue;
+	pgoff_t first    = CL_PAGE_EOF;
+	pgoff_t last     = 0;
+	int     nr_pages = 0;
+	int     mppr     = cli->cl_max_pages_per_rpc;
+	int     rw;
+	ENTRY;
+
+	/* one pass over the pages: index range, count and pages-per-rpc */
+	list_for_each_entry(oap, list, oap_pending_item) {
+		pgoff_t idx = oap2cl_page(oap)->cp_index;
+
+		if (idx > last)
+			last = idx;
+		if (idx < first)
+			first = idx;
+		nr_pages++;
+		/* double mppr each time the page count exceeds it */
+		if (nr_pages > mppr)
+			mppr <<= 1;
+	}
+
+	ext = osc_extent_alloc(obj);
+	if (ext == NULL) {
+		list_for_each_entry_safe(oap, next, list, oap_pending_item) {
+			list_del_init(&oap->oap_pending_item);
+			osc_ap_completion(env, cli, oap, 0, -ENOMEM);
+		}
+		RETURN(-ENOMEM);
+	}
+
+	ext->oe_rw       = !!(cmd & OBD_BRW_READ);
+	ext->oe_urgent   = 1;
+	ext->oe_start    = first;
+	ext->oe_max_end  = last;
+	ext->oe_end      = ext->oe_max_end;
+	ext->oe_obj      = obj;
+	ext->oe_srvlock  = !!(brw_flags & OBD_BRW_SRVLOCK);
+	ext->oe_nr_pages = nr_pages;
+	ext->oe_mppr     = mppr;
+	list_splice_init(list, &ext->oe_pages);
+
+	if (cmd & OBD_BRW_WRITE) {
+		queue = &obj->oo_urgent_exts;
+		rw    = OBD_BRW_WRITE;
+	} else {
+		queue = &obj->oo_reading_exts;
+		rw    = OBD_BRW_READ;
+	}
+
+	osc_object_lock(obj);
+	/* The initial refcount is reused for the RPC, so it is not dropped */
+	osc_extent_state_set(ext, OES_LOCK_DONE);
+	list_add_tail(&ext->oe_link, queue);
+	osc_update_pending(obj, rw, nr_pages);
+	osc_object_unlock(obj);
+
+	osc_io_unplug(env, cli, obj, PDL_POLICY_ROUND);
+	RETURN(0);
+}
+
+/**
+ * Called by osc_io_setattr_start() to freeze and destroy covering extents.
+ *
+ * Starting at the page index that corresponds to size, every extent is
+ * either moved to OES_TRUNC state (if it was in OES_CACHE) or flagged
+ * oe_trunc_pending (if it is still OES_ACTIVE) and collected on a local
+ * list. The scan stops at the first extent that is beyond OES_CACHE state
+ * or urgent. The collected extents are then truncated outside the object
+ * lock; an extent left with no pages is removed, while a partially
+ * truncated one is kept referenced in oio->oi_trunc. If the scan stopped
+ * at a busy extent, that extent is waited for and the scan restarts.
+ *
+ * Returns 0, or the first negative result of osc_extent_truncate().
+ */
+int osc_cache_truncate_start(const struct lu_env *env, struct osc_io *oio,
+			     struct osc_object *obj, __u64 size)
+{
+	struct client_obd *cli = osc_cli(obj);
+	struct osc_extent *ext;
+	struct osc_extent *waiting = NULL;	/* busy extent the scan stopped at */
+	pgoff_t index;
+	LIST_HEAD(list);
+	int result = 0;
+	bool partial;	/* true when size falls inside page "index" */
+	ENTRY;
+
+	/* pages with index greater or equal to index will be truncated. */
+	index = cl_index(osc2cl(obj), size);
+	partial = size > cl_offset(osc2cl(obj), index);
+
+again:
+	osc_object_lock(obj);
+	ext = osc_extent_search(obj, index);
+	if (ext == NULL)
+		ext = first_extent(obj);
+	else if (ext->oe_end < index)
+		ext = next_extent(ext);
+	while (ext != NULL) {
+		EASSERT(ext->oe_state != OES_TRUNC, ext);
+
+		if (ext->oe_state > OES_CACHE || ext->oe_urgent) {
+			/* if ext is in urgent state, it means there must exist
+			 * a page already having been flushed by write_page().
+			 * We have to wait for this extent because we can't
+			 * truncate that page. */
+			LASSERT(!ext->oe_hp);
+			OSC_EXTENT_DUMP(D_CACHE, ext,
+					"waiting for busy extent\n");
+			waiting = osc_extent_get(ext);
+			break;
+		}
+
+		OSC_EXTENT_DUMP(D_CACHE, ext, "try to trunc:"LPU64".\n", size);
+
+		osc_extent_get(ext);	/* dropped in the truncate loop below */
+		if (ext->oe_state == OES_ACTIVE) {
+			/* though we grab inode mutex for write path, but we
+			 * release it before releasing extent(in osc_io_end()),
+			 * so there is a race window that an extent is still
+			 * in OES_ACTIVE when truncate starts. */
+			LASSERT(!ext->oe_trunc_pending);
+			ext->oe_trunc_pending = 1;
+		} else {
+			EASSERT(ext->oe_state == OES_CACHE, ext);
+			osc_extent_state_set(ext, OES_TRUNC);
+			osc_update_pending(obj, OBD_BRW_WRITE,
+					   -ext->oe_nr_pages);
+		}
+		EASSERT(list_empty(&ext->oe_link), ext);
+		list_add_tail(&ext->oe_link, &list);
+
+		ext = next_extent(ext);
+	}
+	osc_object_unlock(obj);
+
+	osc_list_maint(cli, obj);
+
+	while (!list_empty(&list)) {
+		int rc;
+
+		ext = list_entry(list.next, struct osc_extent, oe_link);
+		list_del_init(&ext->oe_link);
+
+		/* extent may be in OES_ACTIVE state because inode mutex
+		 * is released before osc_io_end() in file write case */
+		if (ext->oe_state != OES_TRUNC)
+			osc_extent_wait(env, ext, OES_TRUNC);
+
+		rc = osc_extent_truncate(ext, index, partial);
+		if (rc < 0) {
+			if (result == 0)	/* remember only the first error */
+				result = rc;
+
+			OSC_EXTENT_DUMP(D_ERROR, ext,
+					"truncate error %d\n", rc);
+		} else if (ext->oe_nr_pages == 0) {
+			osc_extent_remove(ext);
+		} else {
+			/* this must be an overlapped extent which means only
+			 * part of pages in this extent have been truncated.
+			 */
+			EASSERTF(ext->oe_start <= index, ext,
+				 "trunc index = %lu/%d.\n", index, partial);
+			/* fix index to skip this partially truncated extent */
+			index = ext->oe_end + 1;
+			partial = false;
+
+			/* we need to hold this extent in OES_TRUNC state so
+			 * that no writeback will happen. This is to avoid
+			 * BUG 17397. The reference taken here is dropped by
+			 * osc_cache_truncate_end(). */
+			LASSERT(oio->oi_trunc == NULL);
+			oio->oi_trunc = osc_extent_get(ext);
+			OSC_EXTENT_DUMP(D_CACHE, ext,
+					"trunc at "LPU64"\n", size);
+		}
+		osc_extent_put(env, ext);
+	}
+	if (waiting != NULL) {
+		int rc;
+
+		/* ignore the result of osc_extent_wait: the write initiator
+		 * should take care of it. */
+		rc = osc_extent_wait(env, waiting, OES_INV);
+		if (rc < 0)
+			OSC_EXTENT_DUMP(D_CACHE, waiting, "error: %d.\n", rc);
+
+		osc_extent_put(env, waiting);
+		waiting = NULL;
+		goto again;
+	}
+	RETURN(result);
+}
+
+/**
+ * Called after osc_io_setattr_end() to give the extent held in
+ * oio->oi_trunc back to the cache.
+ *
+ * oio->oi_trunc is always cleared. If it held an extent, that extent goes
+ * from OES_TRUNC back to OES_CACHE, its pages are counted as pending
+ * writes again and the reference held through oi_trunc is dropped. An
+ * extent flagged oe_fsync_wait is additionally made urgent and writeback
+ * work is queued for it.
+ */
+void osc_cache_truncate_end(const struct lu_env *env, struct osc_io *oio,
+			    struct osc_object *obj)
+{
+	struct osc_extent *ext = oio->oi_trunc;
+	bool kick = false;
+
+	oio->oi_trunc = NULL;
+	if (ext == NULL)
+		return;
+
+	EASSERT(ext->oe_nr_pages > 0, ext);
+	EASSERT(ext->oe_state == OES_TRUNC, ext);
+	EASSERT(!ext->oe_urgent, ext);
+
+	OSC_EXTENT_DUMP(D_CACHE, ext, "trunc -> cache.\n");
+
+	osc_object_lock(obj);
+	osc_extent_state_set(ext, OES_CACHE);
+	/* somebody asked for this extent to be synced while it was being
+	 * truncated: push it out now */
+	if (ext->oe_fsync_wait && !ext->oe_urgent) {
+		ext->oe_urgent = 1;
+		list_move_tail(&ext->oe_link, &obj->oo_urgent_exts);
+		kick = true;
+	}
+	osc_update_pending(obj, OBD_BRW_WRITE, ext->oe_nr_pages);
+	osc_object_unlock(obj);
+
+	osc_extent_put(env, ext);
+
+	if (kick)
+		osc_io_unplug_async(env, osc_cli(obj), obj);
+}
+
+/**
+ * Wait for extents in a specific range to be written out.
+ * The caller must have called osc_cache_writeback_range() to issue IO
+ * otherwise it will take a long time for this function to finish.
+ *
+ * Caller must hold inode_mutex , or cancel exclusive dlm lock so that
+ * nobody else can dirty this range of file while we're waiting for
+ * extents to be written.
+ *
+ * Only extents flagged oe_fsync_wait are waited for; the others are
+ * skipped. After each wait the scan restarts from the page index just past
+ * the extent that was waited for.
+ *
+ * Returns 0, or the first non-zero result of osc_extent_wait().
+ */
+int osc_cache_wait_range(const struct lu_env *env, struct osc_object *obj,
+			 pgoff_t start, pgoff_t end)
+{
+	struct osc_extent *ext;
+	pgoff_t index = start;	/* where the next scan resumes */
+	int     result = 0;
+	ENTRY;
+
+again:
+	osc_object_lock(obj);
+	ext = osc_extent_search(obj, index);
+	if (ext == NULL)
+		ext = first_extent(obj);
+	else if (ext->oe_end < index)
+		ext = next_extent(ext);
+	while (ext != NULL) {
+		int rc;
+
+		if (ext->oe_start > end)
+			break;
+
+		if (!ext->oe_fsync_wait) {
+			ext = next_extent(ext);
+			continue;
+		}
+
+		EASSERT(ergo(ext->oe_state == OES_CACHE,
+			     ext->oe_hp || ext->oe_urgent), ext);
+		EASSERT(ergo(ext->oe_state == OES_ACTIVE,
+			     !ext->oe_hp && ext->oe_urgent), ext);
+
+		index = ext->oe_end + 1;
+		osc_extent_get(ext);	/* keep ext alive across the unlocked wait */
+		osc_object_unlock(obj);
+
+		rc = osc_extent_wait(env, ext, OES_INV);
+		if (result == 0)
+			result = rc;
+		osc_extent_put(env, ext);
+		goto again;
+	}
+	osc_object_unlock(obj);
+
+	OSC_IO_DEBUG(obj, "sync file range.\n");
+	RETURN(result);
+}
+
+/**
+ * Called to write out a range of osc object.
+ *
+ * @hp     : should be set if this is caused by lock cancel;
+ * @discard: is set if dirty pages should be dropped - file will be deleted or
+ *	   truncated; this implies that no extent is discarded only
+ *	   partially.
+ *
+ * Every extent overlapping [start, end] is flagged oe_fsync_wait. Extents
+ * in OES_CACHE state are either queued for writeback (hp or urgent list)
+ * or, when discarding, completed without being sent. When hp or discard is
+ * set the function also waits through osc_cache_wait_range().
+ *
+ * Return how many pages will be issued (the page count of the OES_CACHE
+ * extents found), or error code if error occurred.
+ */
+int osc_cache_writeback_range(const struct lu_env *env, struct osc_object *obj,
+			      pgoff_t start, pgoff_t end, int hp, int discard)
+{
+	struct osc_extent *ext;
+	LIST_HEAD(discard_list);
+	bool unplug = false;
+	int result = 0;
+	ENTRY;
+
+	osc_object_lock(obj);
+	ext = osc_extent_search(obj, start);
+	if (ext == NULL)
+		ext = first_extent(obj);
+	else if (ext->oe_end < start)
+		ext = next_extent(ext);
+	while (ext != NULL) {
+		if (ext->oe_start > end)
+			break;
+
+		ext->oe_fsync_wait = 1;	/* osc_cache_wait_range() waits only for flagged extents */
+		switch (ext->oe_state) {
+		case OES_CACHE:
+			result += ext->oe_nr_pages;
+			if (!discard) {
+				struct list_head *list = NULL;
+				if (hp) {
+					EASSERT(!ext->oe_hp, ext);
+					ext->oe_hp = 1;
+					list = &obj->oo_hp_exts;
+				} else if (!ext->oe_urgent) {
+					ext->oe_urgent = 1;
+					list = &obj->oo_urgent_exts;
+				}
+				if (list != NULL)	/* NULL: already urgent, leave it where it is */
+					list_move_tail(&ext->oe_link, list);
+				unplug = true;
+			} else {
+				/* the only discarder is lock cancelling, so
+				 * [start, end] must contain this extent */
+				EASSERT(ext->oe_start >= start &&
+					ext->oe_max_end <= end, ext);
+				osc_extent_state_set(ext, OES_LOCKING);
+				ext->oe_owner = current;
+				list_move_tail(&ext->oe_link,
+						   &discard_list);
+				osc_update_pending(obj, OBD_BRW_WRITE,
+						   -ext->oe_nr_pages);
+			}
+			break;
+		case OES_ACTIVE:
+			/* It's pretty bad to wait for ACTIVE extents, because
+			 * we don't know how long we will wait for it to be
+			 * flushed since it may be blocked at awaiting more
+			 * grants. We do this for the correctness of fsync. */
+			LASSERT(hp == 0 && discard == 0);
+			ext->oe_urgent = 1;
+			break;
+		case OES_TRUNC:
+			/* this extent is being truncated, can't do anything
+			 * for it now. it will be set to urgent after truncate
+			 * is finished in osc_cache_truncate_end().
+			 * Falls through to the default case. */
+		default:
+			break;
+		}
+		ext = next_extent(ext);
+	}
+	osc_object_unlock(obj);
+
+	LASSERT(ergo(!discard, list_empty(&discard_list)));
+	if (!list_empty(&discard_list)) {
+		struct osc_extent *tmp;
+		int rc;
+
+		osc_list_maint(osc_cli(obj), obj);
+		list_for_each_entry_safe(ext, tmp, &discard_list, oe_link) {
+			list_del_init(&ext->oe_link);
+			EASSERT(ext->oe_state == OES_LOCKING, ext);
+
+			/* Discard caching pages. We don't actually write this
+			 * extent out but we complete it as if we did. */
+			rc = osc_extent_make_ready(env, ext);
+			if (unlikely(rc < 0)) {
+				OSC_EXTENT_DUMP(D_ERROR, ext,
+						"make_ready returned %d\n", rc);
+				if (result >= 0)	/* keep the first error only */
+					result = rc;
+			}
+
+			/* finish the extent as if the pages were sent */
+			osc_extent_finish(env, ext, 0, 0);
+		}
+	}
+
+	if (unplug)
+		osc_io_unplug(env, osc_cli(obj), obj, PDL_POLICY_ROUND);
+
+	if (hp || discard) {
+		int rc;
+		rc = osc_cache_wait_range(env, obj, start, end);
+		if (result >= 0 && rc < 0)
+			result = rc;
+	}
+
+	OSC_IO_DEBUG(obj, "cache page out.\n");
+	RETURN(result);
+}
+
+/** @} osc */

diff --git a/drivers/staging/lustre/lustre/osc/osc_cl_internal.h b/drivers/staging/lustre/lustre/osc/osc_cl_internal.h
new file mode 100644
index 0000000..158e8ff
--- /dev/null
+++ b/drivers/staging/lustre/lustre/osc/osc_cl_internal.h

@@ -0,0 +1,677 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Internal interfaces of OSC layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ *   Author: Jinshan Xiong <jinshan.xiong@whamcloud.com>
+ */
+
+#ifndef OSC_CL_INTERNAL_H
+#define OSC_CL_INTERNAL_H
+
+# include <linux/libcfs/libcfs.h>
+
+#include <obd.h>
+/* osc_build_res_name() */
+#include <obd_ost.h>
+#include <cl_object.h>
+#include <lclient.h>
+#include "osc_internal.h"
+
+/** \defgroup osc osc
+ *  @{
+ */
+
+struct osc_extent;
+
+/**
+ * State maintained by osc layer for each IO context.
+ */
+struct osc_io {
+	/** super class */
+	struct cl_io_slice oi_cl;
+	/** true if this io is lockless. */
+	int		oi_lockless;
+	/** active extent: we know how many bytes are going to be written,
+	 * so having an active extent will prevent it from being fragmented.
+	 * Installed and replaced by osc_queue_async_io(). */
+	struct osc_extent *oi_active;
+	/** partially truncated extent, we need to hold this extent to prevent
+	 * page writeback from happening. Set by osc_cache_truncate_start()
+	 * and given back by osc_cache_truncate_end(). */
+	struct osc_extent *oi_trunc;
+
+	struct obd_info    oi_info;
+	struct obdo	oi_oa;
+	struct osc_async_cbargs {	/* NOTE(review): looks like completion state for an async request of this IO - confirm against its users */
+		bool		  opc_rpc_sent;
+		int	       opc_rc;
+		struct completion	opc_sync;
+	} oi_cbarg;
+};
+
+/**
+ * State of transfer for osc.
+ */
+struct osc_req {
+	struct cl_req_slice    or_cl;	/* presumably the super class slice, like osc_io::oi_cl - confirm */
+};
+
+/**
+ * State maintained by osc layer for the duration of a system call.
+ */
+struct osc_session {
+	struct osc_io       os_io;	/* the osc_io state embedded in the session */
+};
+
+#define OTI_PVEC_SIZE 64	/* number of slots in osc_thread_info::oti_pvec */
+struct osc_thread_info {	/* NOTE(review): presumably per-thread scratch space obtained through osc_env_info() - confirm */
+	struct ldlm_res_id      oti_resname;
+	ldlm_policy_data_t      oti_policy;
+	struct cl_lock_descr    oti_descr;
+	struct cl_attr	  oti_attr;
+	struct lustre_handle    oti_handle;
+	struct cl_page_list     oti_plist;
+	struct cl_io		oti_io;
+	struct cl_page	       *oti_pvec[OTI_PVEC_SIZE];
+};
+
+struct osc_object {	/* osc layer object; oo_cl is the embedded generic cl_object */
+	struct cl_object   oo_cl;
+	struct lov_oinfo  *oo_oinfo;
+	/**
+	 * True if locking against this stripe got -EUSERS.
+	 */
+	int		oo_contended;
+	cfs_time_t	 oo_contention_time;
+	/**
+	 * List of pages in transfer.
+	 */
+	struct list_head	 oo_inflight[CRT_NR];
+	/**
+	 * Lock, protecting ccc_object::cob_inflight, because a seat-belt is
+	 * locked during take-off and landing.
+	 * NOTE(review): this structure has no cob_inflight member;
+	 * presumably oo_inflight above is what is meant - confirm.
+	 */
+	spinlock_t	   oo_seatbelt;
+
+	/**
+	 * used by the osc to keep track of what objects to build into rpcs.
+	 * Protected by client_obd::cl_loi_list_lock.
+	 */
+	struct list_head	   oo_ready_item;
+	struct list_head	   oo_hp_ready_item;
+	struct list_head	   oo_write_item;
+	struct list_head	   oo_read_item;
+
+	/**
+	 * Red-black tree of extents, used to manage (async) dirty pages.
+	 */
+	struct rb_root       oo_root;
+	/**
+	 * Manage write(dirty) extents.
+	 */
+	struct list_head	   oo_hp_exts; /* list of hp extents */
+	struct list_head	   oo_urgent_exts; /* list of writeback extents */
+	struct list_head	   oo_rpc_exts;
+
+	struct list_head	   oo_reading_exts;	/* read extents, queued by osc_queue_sync_pages() */
+
+	atomic_t	 oo_nr_reads;	/* presumably pending read pages, see osc_update_pending() - confirm */
+	atomic_t	 oo_nr_writes;	/* presumably pending write pages, see osc_update_pending() - confirm */
+
+	/** Protect extent tree. Will be used to protect
+	 * oo_{read|write}_pages soon. Taken and released through
+	 * osc_object_lock() and osc_object_unlock(). */
+	spinlock_t	    oo_lock;
+};
+
+/** Acquire the oo_lock spinlock of the given object. */
+static inline void osc_object_lock(struct osc_object *obj)
+{
+	spinlock_t *lock = &obj->oo_lock;
+
+	spin_lock(lock);
+}
+
+/**
+ * Try to acquire the oo_lock spinlock of the given object without
+ * spinning; returns whatever spin_trylock() returns.
+ */
+static inline int osc_object_trylock(struct osc_object *obj)
+{
+	spinlock_t *lock = &obj->oo_lock;
+
+	return spin_trylock(lock);
+}
+
+/** Release the oo_lock spinlock of the given object. */
+static inline void osc_object_unlock(struct osc_object *obj)
+{
+	spinlock_t *lock = &obj->oo_lock;
+
+	spin_unlock(lock);
+}
+
+/**
+ * Report whether the oo_lock spinlock of the given object is currently
+ * held; returns whatever spin_is_locked() returns.
+ */
+static inline int osc_object_is_locked(struct osc_object *obj)
+{
+	spinlock_t *lock = &obj->oo_lock;
+
+	return spin_is_locked(lock);
+}
+
+/*
+ * Lock "micro-states" for osc layer.
+ * The current state of a lock is kept in osc_lock::ols_state.
+ */
+enum osc_lock_state {
+	OLS_NEW,
+	OLS_ENQUEUED,
+	OLS_UPCALL_RECEIVED,
+	OLS_GRANTED,
+	OLS_RELEASED,
+	OLS_BLOCKED,
+	OLS_CANCELLED
+};
+
+/**
+ * osc-private state of cl_lock.
+ *
+ * Interaction with DLM.
+ *
+ * CLIO enqueues all DLM locks through ptlrpcd (that is, in "async" mode).
+ *
+ * Once receive upcall is invoked, osc_lock remembers a handle of DLM lock in
+ * osc_lock::ols_handle and a pointer to that lock in osc_lock::ols_lock.
+ *
+ * This pointer is protected through a reference, acquired by
+ * osc_lock_upcall0(). Also, an additional reference is acquired by
+ * ldlm_lock_addref() call protecting the lock from cancellation, until
+ * osc_lock_unuse() releases it.
+ *
+ * Below is a description of how lock references are acquired and released
+ * inside of DLM.
+ *
+ * - When new lock is created and enqueued to the server (ldlm_cli_enqueue())
+ *      - ldlm_lock_create()
+ *	  - ldlm_lock_new(): initializes a lock with 2 references. One for
+ *	    the caller (released when reply from the server is received, or on
+ *	    error), and another for the hash table.
+ *      - ldlm_lock_addref_internal(): protects the lock from cancellation.
+ *
+ * - When reply is received from the server (osc_enqueue_interpret())
+ *      - ldlm_cli_enqueue_fini()
+ *	  - LDLM_LOCK_PUT(): releases caller reference acquired by
+ *	    ldlm_lock_new().
+ *	  - if (rc != 0)
+ *		ldlm_lock_decref(): error case: matches ldlm_cli_enqueue().
+ *      - ldlm_lock_decref(): for async locks, matches ldlm_cli_enqueue().
+ *
+ * - When lock is being cancelled (ldlm_lock_cancel())
+ *      - ldlm_lock_destroy()
+ *	  - LDLM_LOCK_PUT(): releases hash-table reference acquired by
+ *	    ldlm_lock_new().
+ *
+ * osc_lock is detached from ldlm_lock by osc_lock_detach() that is called
+ * either when lock is cancelled (osc_lock_blocking()), or when the lock is
+ * deleted without cancellation (e.g., from cl_locks_prune()). In the latter
+ * case ldlm lock remains in memory, and can be re-attached to osc_lock in the
+ * future.
+ */
+struct osc_lock {
+	struct cl_lock_slice     ols_cl;
+	/** underlying DLM lock */
+	struct ldlm_lock	*ols_lock;
+	/** lock value block */
+	struct ost_lvb	   ols_lvb;
+	/** DLM flags with which osc_lock::ols_lock was enqueued */
+	__u64		    ols_flags;
+	/** osc_lock::ols_lock handle */
+	struct lustre_handle     ols_handle;
+	struct ldlm_enqueue_info ols_einfo;
+	enum osc_lock_state      ols_state;
+
+	/**
+	 * How many pages are using this lock for io, currently only used by
+	 * read-ahead. If non-zero, the underlying dlm lock won't be cancelled
+	 * during recovery to avoid deadlock. see bz16774.
+	 *
+	 * \see osc_page::ops_lock
+	 * \see osc_page_addref_lock(), osc_page_putref_lock()
+	 */
+	atomic_t	     ols_pageref;
+
+	/**
+	 * true, if ldlm_lock_addref() was called against
+	 * osc_lock::ols_lock. This is used for sanity checking.
+	 *
+	 * \see osc_lock::ols_has_ref
+	 */
+	unsigned		  ols_hold :1,
+	/**
+	 * this is much like osc_lock::ols_hold, except that this bit is
+	 * cleared _after_ the reference is released in osc_lock_unuse().
+	 * This fine distinction is needed because:
+	 *
+	 *     - if ldlm lock still has a reference, osc_ast_data_get() needs
+	 *       to return associated cl_lock (so that a flag is needed that is
+	 *       cleared after ldlm_lock_decref() returned), and
+	 *
+	 *     - ldlm_lock_decref() can invoke blocking ast (for a
+	 *       LDLM_FL_CBPENDING lock), and osc_lock functions like
+	 *       osc_lock_cancel() called from there need to know whether to
+	 *       release lock reference (so that a flag is needed that is
+	 *       cleared before ldlm_lock_decref() is called).
+	 */
+				 ols_has_ref:1,
+	/**
+	 * inherit the lockless attribute from top level cl_io.
+	 * If true, osc_lock_enqueue is able to tolerate the -EUSERS error.
+	 */
+				 ols_locklessable:1,
+	/**
+	 * set by osc_lock_use() to wait until blocking AST enters into
+	 * osc_ldlm_blocking_ast0(), so that cl_lock mutex can be used for
+	 * further synchronization.
+	 */
+				 ols_ast_wait:1,
+	/**
+	 * Set if the data of this lock has been flushed to server side.
+	 */
+				 ols_flush:1,
+	/**
+	 * if set, the osc_lock is a glimpse lock. For glimpse locks, we treat
+	 * the EVAVAIL error as tolerable, this will make upper logic happy
+	 * to wait for the glimpse locks to all OSTs to be completed.
+	 * (NOTE(review): "EVAVAIL" is presumably a typo for ENAVAIL - confirm
+	 * against the enqueue upcall code.)
+	 * Glimpse lock converts to normal lock if the server lock is
+	 * granted.
+	 * Glimpse lock should be destroyed immediately after use.
+	 */
+				 ols_glimpse:1,
+	/**
+	 * For async glimpse lock.
+	 */
+				 ols_agl:1;
+	/**
+	 * IO that owns this lock. This field is used for a dead-lock
+	 * avoidance by osc_lock_enqueue_wait().
+	 *
+	 * XXX: unfortunately, the owner of a osc_lock is not unique,
+	 * the lock may have multiple users, if the lock is granted and
+	 * then matched.
+	 */
+	struct osc_io	   *ols_owner;
+};
+
+
+/**
+ * Page state private for osc layer.
+ */
+struct osc_page {
+	struct cl_page_slice  ops_cl;
+	/**
+	 * Page queues used by osc to detect when RPC can be formed.
+	 */
+	struct osc_async_page ops_oap;
+	/**
+	 * An offset within page from which next transfer starts. This is used
+	 * by cl_page_clip() to submit partial page transfers.
+	 */
+	int		   ops_from;
+	/**
+	 * An offset within page at which next transfer ends.
+	 *
+	 * \see osc_page::ops_from.
+	 */
+	int		   ops_to;
+	/**
+	 * Boolean, true iff page is under transfer. Used for sanity checking.
+	 */
+	unsigned	      ops_transfer_pinned:1,
+	/**
+	 * True for a `temporary page' created by read-ahead code, probably
+	 * outside of any DLM lock.
+	 */
+			      ops_temp:1,
+	/**
+	 * Set if the page is in the LRU (see ops_lru below).
+	 */
+			      ops_in_lru:1,
+	/**
+	 * Set if the page must be transferred with OBD_BRW_SRVLOCK.
+	 */
+			      ops_srvlock:1;
+	union {
+		/**
+		 * lru page list. ops_inflight and ops_lru are exclusive so
+		 * that they can share the same data.
+		 */
+		struct list_head	      ops_lru;
+		/**
+		 * Linkage into a per-osc_object list of pages in flight. For
+		 * debugging.
+		 */
+		struct list_head	    ops_inflight;
+	};
+	/**
+	 * Thread that submitted this page for transfer. For debugging.
+	 */
+	task_t	   *ops_submitter;
+	/**
+	 * Submit time - the time when the page is starting RPC. For debugging.
+	 */
+	cfs_time_t	    ops_submit_time;
+
+	/**
+	 * A lock of which we hold a reference covers this page. Only used by
+	 * read-ahead: for a readahead page, we hold its covering lock to
+	 * prevent it from being canceled during recovery.
+	 *
+	 * \see osc_lock::ols_pageref
+	 * \see osc_page_addref_lock(), osc_page_putref_lock().
+	 */
+	struct cl_lock       *ops_lock;
+};
+
+extern struct kmem_cache *osc_lock_kmem;
+extern struct kmem_cache *osc_object_kmem;
+extern struct kmem_cache *osc_thread_kmem;
+extern struct kmem_cache *osc_session_kmem;
+extern struct kmem_cache *osc_req_kmem;
+extern struct kmem_cache *osc_extent_kmem;
+
+extern struct lu_device_type osc_device_type;
+extern struct lu_context_key osc_key;
+extern struct lu_context_key osc_session_key;
+
+#define OSC_FLAGS (ASYNC_URGENT|ASYNC_READY)
+
+int osc_lock_init(const struct lu_env *env,
+		  struct cl_object *obj, struct cl_lock *lock,
+		  const struct cl_io *io);
+int osc_io_init  (const struct lu_env *env,
+		  struct cl_object *obj, struct cl_io *io);
+int osc_req_init (const struct lu_env *env, struct cl_device *dev,
+		  struct cl_req *req);
+struct lu_object *osc_object_alloc(const struct lu_env *env,
+				   const struct lu_object_header *hdr,
+				   struct lu_device *dev);
+int osc_page_init(const struct lu_env *env, struct cl_object *obj,
+		  struct cl_page *page, struct page *vmpage);
+
+void osc_index2policy  (ldlm_policy_data_t *policy, const struct cl_object *obj,
+			pgoff_t start, pgoff_t end);
+int  osc_lvb_print     (const struct lu_env *env, void *cookie,
+			lu_printer_t p, const struct ost_lvb *lvb);
+
+void osc_page_submit(const struct lu_env *env, struct osc_page *opg,
+		     enum cl_req_type crt, int brw_flags);
+int osc_cancel_async_page(const struct lu_env *env, struct osc_page *ops);
+int osc_set_async_flags(struct osc_object *obj, struct osc_page *opg,
+			obd_flag async_flags);
+int osc_prep_async_page(struct osc_object *osc, struct osc_page *ops,
+			struct page *page, loff_t offset);
+int osc_queue_async_io(const struct lu_env *env, struct cl_io *io,
+		       struct osc_page *ops);
+int osc_teardown_async_page(const struct lu_env *env, struct osc_object *obj,
+			    struct osc_page *ops);
+int osc_flush_async_page(const struct lu_env *env, struct cl_io *io,
+			 struct osc_page *ops);
+int osc_queue_sync_pages(const struct lu_env *env, struct osc_object *obj,
+			 struct list_head *list, int cmd, int brw_flags);
+int osc_cache_truncate_start(const struct lu_env *env, struct osc_io *oio,
+			     struct osc_object *obj, __u64 size);
+void osc_cache_truncate_end(const struct lu_env *env, struct osc_io *oio,
+			    struct osc_object *obj);
+int osc_cache_writeback_range(const struct lu_env *env, struct osc_object *obj,
+			      pgoff_t start, pgoff_t end, int hp, int discard);
+int osc_cache_wait_range(const struct lu_env *env, struct osc_object *obj,
+			 pgoff_t start, pgoff_t end);
+void osc_io_unplug(const struct lu_env *env, struct client_obd *cli,
+		   struct osc_object *osc, pdl_policy_t pol);
+
+void osc_object_set_contended  (struct osc_object *obj);
+void osc_object_clear_contended(struct osc_object *obj);
+int  osc_object_is_contended   (struct osc_object *obj);
+
+int  osc_lock_is_lockless      (const struct osc_lock *olck);
+
+/*****************************************************************************
+ *
+ * Accessors.
+ *
+ */
+
+/**
+ * Returns the osc per-thread info stored under osc_key in the context of
+ * \a env. Asserts that the value is present.
+ */
+static inline struct osc_thread_info *osc_env_info(const struct lu_env *env)
+{
+	struct osc_thread_info *info = lu_context_key_get(&env->le_ctx,
+							  &osc_key);
+
+	LASSERT(info != NULL);
+	return info;
+}
+
+/**
+ * Returns the osc session data stored under osc_session_key in the session
+ * context of \a env. Asserts that the value is present.
+ */
+static inline struct osc_session *osc_env_session(const struct lu_env *env)
+{
+	struct osc_session *ses = lu_context_key_get(env->le_ses,
+						     &osc_session_key);
+
+	LASSERT(ses != NULL);
+	return ses;
+}
+
+/** Returns the osc_io embedded in the osc session of \a env. */
+static inline struct osc_io *osc_env_io(const struct lu_env *env)
+{
+	struct osc_session *session = osc_env_session(env);
+
+	return &session->os_io;
+}
+
+/** Returns non-zero iff the device of \a obj has the osc device type. */
+static inline int osc_is_object(const struct lu_object *obj)
+{
+	const struct lu_device *dev = obj->lo_dev;
+
+	return dev->ld_type == &osc_device_type;
+}
+
+/**
+ * Converts a lu_device embedded in an osc_device (od_cl.cd_lu_dev) back to
+ * the enclosing osc_device.
+ */
+static inline struct osc_device *lu2osc_dev(const struct lu_device *d)
+{
+	struct osc_device *od;
+
+	LINVRNT(d->ld_type == &osc_device_type);
+	od = container_of0(d, struct osc_device, od_cl.cd_lu_dev);
+	return od;
+}
+
+/** Returns the export (od_exp) of the osc device that \a obj belongs to. */
+static inline struct obd_export *osc_export(const struct osc_object *obj)
+{
+	const struct lu_device *lud = obj->oo_cl.co_lu.lo_dev;
+
+	return lu2osc_dev(lud)->od_exp;
+}
+
+/** Returns the client_obd of the obd device behind the export of \a obj. */
+static inline struct client_obd *osc_cli(const struct osc_object *obj)
+{
+	struct obd_export *exp = osc_export(obj);
+
+	return &exp->exp_obd->u.cli;
+}
+
+/** Converts a cl_object embedded in an osc_object (oo_cl) to its container. */
+static inline struct osc_object *cl2osc(const struct cl_object *obj)
+{
+	struct osc_object *osc;
+
+	LINVRNT(osc_is_object(&obj->co_lu));
+	osc = container_of0(obj, struct osc_object, oo_cl);
+	return osc;
+}
+
+/**
+ * Returns the cl_object embedded in \a obj. The const qualifier of \a obj is
+ * dropped on the returned pointer.
+ */
+static inline struct cl_object *osc2cl(const struct osc_object *obj)
+{
+	const struct cl_object *clobj = &obj->oo_cl;
+
+	return (struct cl_object *)clobj;
+}
+
+/**
+ * Maps a cl_lock mode to the matching ldlm mode:
+ * CLM_READ -> LCK_PR, CLM_WRITE -> LCK_PW, anything else -> LCK_GROUP.
+ * Only CLM_READ, CLM_WRITE and CLM_GROUP pass the assertion.
+ */
+static inline ldlm_mode_t osc_cl_lock2ldlm(enum cl_lock_mode mode)
+{
+	LASSERT(mode == CLM_READ || mode == CLM_WRITE || mode == CLM_GROUP);
+	switch (mode) {
+	case CLM_READ:
+		return LCK_PR;
+	case CLM_WRITE:
+		return LCK_PW;
+	default:
+		return LCK_GROUP;
+	}
+}
+
+/**
+ * Maps an ldlm mode to the matching cl_lock mode:
+ * LCK_PR -> CLM_READ, LCK_PW -> CLM_WRITE, anything else -> CLM_GROUP.
+ * Only LCK_PR, LCK_PW and LCK_GROUP pass the assertion.
+ */
+static inline enum cl_lock_mode osc_ldlm2cl_lock(ldlm_mode_t mode)
+{
+	LASSERT(mode == LCK_PR || mode == LCK_PW || mode == LCK_GROUP);
+	switch (mode) {
+	case LCK_PR:
+		return CLM_READ;
+	case LCK_PW:
+		return CLM_WRITE;
+	default:
+		return CLM_GROUP;
+	}
+}
+
+/** Converts a page slice embedded in an osc_page (ops_cl) to its container. */
+static inline struct osc_page *cl2osc_page(const struct cl_page_slice *slice)
+{
+	struct osc_page *opg;
+
+	LINVRNT(osc_is_object(&slice->cpl_obj->co_lu));
+	opg = container_of0(slice, struct osc_page, ops_cl);
+	return opg;
+}
+
+/** Converts an async page embedded in an osc_page (ops_oap) to its container. */
+static inline struct osc_page *oap2osc(struct osc_async_page *oap)
+{
+	struct osc_page *opg;
+
+	opg = container_of0(oap, struct osc_page, ops_oap);
+	return opg;
+}
+
+/** Returns the cl_page recorded in the slice of the osc_page that owns \a oap. */
+static inline struct cl_page *oap2cl_page(struct osc_async_page *oap)
+{
+	struct osc_page *opg = oap2osc(oap);
+
+	return opg->ops_cl.cpl_page;
+}
+
+/**
+ * Converts an async page to the enclosing osc_page using plain
+ * container_of() (oap2osc() uses container_of0() instead).
+ */
+static inline struct osc_page *oap2osc_page(struct osc_async_page *oap)
+{
+	/* container_of() already yields a struct osc_page pointer */
+	return container_of(oap, struct osc_page, ops_oap);
+}
+
+/** Converts a lock slice embedded in an osc_lock (ols_cl) to its container. */
+static inline struct osc_lock *cl2osc_lock(const struct cl_lock_slice *slice)
+{
+	struct osc_lock *olck;
+
+	LINVRNT(osc_is_object(&slice->cls_obj->co_lu));
+	olck = container_of0(slice, struct osc_lock, ols_cl);
+	return olck;
+}
+
+/** Returns the osc_lock behind the osc-layer slice of \a lock. */
+static inline struct osc_lock *osc_lock_at(const struct cl_lock *lock)
+{
+	const struct cl_lock_slice *slice;
+
+	slice = cl_lock_at(lock, &osc_device_type);
+	return cl2osc_lock(slice);
+}
+
+/**
+ * Returns 1 iff \a oio is lockless and its top-level io does not have
+ * ci_no_srvlock set; 0 otherwise.
+ */
+static inline int osc_io_srvlock(struct osc_io *oio)
+{
+	if (!oio->oi_lockless)
+		return 0;
+
+	return !oio->oi_cl.cis_io->ci_no_srvlock;
+}
+
+enum osc_extent_state {
+	OES_INV       = 0, /**< extent is just initialized or destroyed */
+	OES_ACTIVE    = 1, /**< process is using this extent */
+	OES_CACHE     = 2, /**< extent is ready for IO */
+	OES_LOCKING   = 3, /**< locking page to prepare IO */
+	OES_LOCK_DONE = 4, /**< locking finished, ready to send */
+	OES_RPC       = 5, /**< in RPC */
+	OES_TRUNC     = 6, /**< being truncated */
+	OES_STATE_MAX      /**< one past the last state; not a state itself */
+};
+
+/**
+ * osc_extent data to manage dirty pages.
+ * osc_extent has the following attributes:
+ * 1. all pages in the same extent must be in one RPC in write back;
+ * 2. # of pages must be less than max_pages_per_rpc - implied by 1;
+ * 3. must be covered by only 1 osc_lock;
+ * 4. exclusive. It's impossible to have overlapped osc_extent.
+ *
+ * The lifetime of an extent is from when the 1st page is dirtied to when
+ * all pages inside it are written out.
+ *
+ * LOCKING ORDER
+ * =============
+ * page lock -> client_obd_list_lock -> object lock(osc_object::oo_lock)
+ */
+struct osc_extent {
+	/** red-black tree node */
+	struct rb_node     oe_node;
+	/** osc_object of this extent */
+	struct osc_object *oe_obj;
+	/** refcount, removed from red-black tree if reaches zero. */
+	atomic_t       oe_refc;
+	/** busy if non-zero */
+	atomic_t       oe_users;
+	/** link list of osc_object's oo_{hp|urgent|locking}_exts. */
+	struct list_head	 oe_link;
+	/** state of this extent (presumably an enum osc_extent_state value) */
+	unsigned int       oe_state;
+	/** flags for this extent. */
+	unsigned int       oe_intree:1,
+	/** 0 is write, 1 is read */
+			   oe_rw:1,
+			   oe_srvlock:1,
+			   oe_memalloc:1,
+	/** an ACTIVE extent is going to be truncated, so when this extent
+	 * is released, it will turn into TRUNC state instead of CACHE. */
+			   oe_trunc_pending:1,
+	/** this extent should be written asap and someone may wait for the
+	 * write to finish. This bit is usually set along with urgent if
+	 * the extent was in CACHE state.
+	 * An fsync_wait extent can't be merged because the new extent region
+	 * may exceed the fsync range. */
+			   oe_fsync_wait:1,
+	/** covering lock is being canceled */
+			   oe_hp:1,
+	/** this extent should be written back asap. set if one of pages is
+	 * called by page WB daemon, or sync write or reading requests. */
+			   oe_urgent:1;
+	/** how many grants allocated for this extent.
+	 *  Grant allocated for this extent. There is no grant allocated
+	 *  for reading extents and sync write extents. */
+	unsigned int       oe_grants;
+	/** # of dirty pages in this extent */
+	unsigned int       oe_nr_pages;
+	/** list of pending oap pages. Pages in this list are NOT sorted. */
+	struct list_head	 oe_pages;
+	/** Since an extent has to be written out atomically, this is used to
+	 * remember the next page that needs to be locked to write this extent
+	 * out. Not used right now.
+	 */
+	struct osc_page   *oe_next_page;
+	/** start and end index of this extent, include start and end
+	 * themselves. Page offset here is the page index of osc_pages.
+	 * oe_start is used as the key for the red-black tree. */
+	pgoff_t	    oe_start;
+	pgoff_t	    oe_end;
+	/** maximum ending index of this extent, this is limited by
+	 * max_pages_per_rpc, lock extent and chunk size. */
+	pgoff_t	    oe_max_end;
+	/** waitqueue - for those who want to be notified if this extent's
+	 * state has changed. */
+	wait_queue_head_t	oe_waitq;
+	/** lock covering this extent */
+	struct cl_lock    *oe_osclock;
+	/** terminator of this extent. Must be non-NULL if this extent is in
+	 * IO. */
+	task_t	*oe_owner;
+	/** return value of writeback. If somebody is waiting for this extent,
+	 * this value can be known by outside world. */
+	int		oe_rc;
+	/** max pages per rpc when this extent was created */
+	unsigned int       oe_mppr;
+};
+
+int osc_extent_finish(const struct lu_env *env, struct osc_extent *ext,
+		      int sent, int rc);
+int osc_extent_release(const struct lu_env *env, struct osc_extent *ext);
+
+/** @} osc */
+
+#endif /* OSC_CL_INTERNAL_H */

diff --git a/drivers/staging/lustre/lustre/osc/osc_dev.c b/drivers/staging/lustre/lustre/osc/osc_dev.c
new file mode 100644
index 0000000..4208ddf
--- /dev/null
+++ b/drivers/staging/lustre/lustre/osc/osc_dev.c

@@ -0,0 +1,261 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_device, cl_req for OSC layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_OSC
+
+/* class_name2obd() */
+#include <obd_class.h>
+
+#include "osc_cl_internal.h"
+
+/** \addtogroup osc
+ * @{
+ */
+
+struct kmem_cache *osc_lock_kmem;
+struct kmem_cache *osc_object_kmem;
+struct kmem_cache *osc_thread_kmem;
+struct kmem_cache *osc_session_kmem;
+struct kmem_cache *osc_req_kmem;
+struct kmem_cache *osc_extent_kmem;
+struct kmem_cache *osc_quota_kmem;
+
+/*
+ * Descriptors (cache pointer, name, object size) for the slab caches used
+ * by the osc layer. The final entry has a NULL .ckd_cache.
+ */
+struct lu_kmem_descr osc_caches[] = {
+	{
+		.ckd_cache = &osc_lock_kmem,
+		.ckd_name  = "osc_lock_kmem",
+		.ckd_size  = sizeof(struct osc_lock),
+	},
+	{
+		.ckd_cache = &osc_object_kmem,
+		.ckd_name  = "osc_object_kmem",
+		.ckd_size  = sizeof(struct osc_object),
+	},
+	{
+		.ckd_cache = &osc_thread_kmem,
+		.ckd_name  = "osc_thread_kmem",
+		.ckd_size  = sizeof(struct osc_thread_info),
+	},
+	{
+		.ckd_cache = &osc_session_kmem,
+		.ckd_name  = "osc_session_kmem",
+		.ckd_size  = sizeof(struct osc_session),
+	},
+	{
+		.ckd_cache = &osc_req_kmem,
+		.ckd_name  = "osc_req_kmem",
+		.ckd_size  = sizeof(struct osc_req),
+	},
+	{
+		.ckd_cache = &osc_extent_kmem,
+		.ckd_name  = "osc_extent_kmem",
+		.ckd_size  = sizeof(struct osc_extent),
+	},
+	{
+		.ckd_cache = &osc_quota_kmem,
+		.ckd_name  = "osc_quota_kmem",
+		.ckd_size  = sizeof(struct osc_quota_info),
+	},
+	{
+		/* presumably the end-of-table marker - confirm with users */
+		.ckd_cache = NULL,
+	},
+};
+
+struct lock_class_key osc_ast_guard_class;
+
+/*****************************************************************************
+ *
+ * Type conversions.
+ *
+ */
+
+/* Returns the lu_device embedded in the cl_device of \a osc. */
+static struct lu_device *osc2lu_dev(struct osc_device *osc)
+{
+	struct cl_device *cl = &osc->od_cl;
+
+	return &cl->cd_lu_dev;
+}
+
+/*****************************************************************************
+ *
+ * Osc device and device type functions.
+ *
+ */
+
+/*
+ * lct_init callback of osc_key: allocates an osc_thread_info from
+ * osc_thread_kmem. Returns the new object, or ERR_PTR(-ENOMEM) when the
+ * allocation fails.
+ *
+ * NOTE(review): a bare __GFP_IO is an unusual allocation mask; confirm
+ * whether GFP_NOFS was intended.
+ */
+static void *osc_key_init(const struct lu_context *ctx,
+			 struct lu_context_key *key)
+{
+	struct osc_thread_info *info;
+
+	OBD_SLAB_ALLOC_PTR_GFP(info, osc_thread_kmem, __GFP_IO);
+
+	return info != NULL ? (void *)info : ERR_PTR(-ENOMEM);
+}
+
+/*
+ * lct_fini callback of osc_key: returns the osc_thread_info passed as
+ * \a data to osc_thread_kmem.
+ */
+static void osc_key_fini(const struct lu_context *ctx,
+			 struct lu_context_key *key, void *data)
+{
+	struct osc_thread_info *info;
+
+	info = data;
+	OBD_SLAB_FREE_PTR(info, osc_thread_kmem);
+}
+
+struct lu_context_key osc_key = {
+	.lct_tags = LCT_CL_THREAD,
+	.lct_init = osc_key_init,
+	.lct_fini = osc_key_fini
+};
+
+/*
+ * lct_init callback of osc_session_key: allocates an osc_session from
+ * osc_session_kmem. Returns the new object, or ERR_PTR(-ENOMEM) when the
+ * allocation fails.
+ *
+ * NOTE(review): a bare __GFP_IO is an unusual allocation mask; confirm
+ * whether GFP_NOFS was intended.
+ */
+static void *osc_session_init(const struct lu_context *ctx,
+			      struct lu_context_key *key)
+{
+	struct osc_session *info;
+
+	OBD_SLAB_ALLOC_PTR_GFP(info, osc_session_kmem, __GFP_IO);
+
+	return info != NULL ? (void *)info : ERR_PTR(-ENOMEM);
+}
+
+/*
+ * lct_fini callback of osc_session_key: returns the osc_session passed as
+ * \a data to osc_session_kmem.
+ */
+static void osc_session_fini(const struct lu_context *ctx,
+			     struct lu_context_key *key, void *data)
+{
+	struct osc_session *info;
+
+	info = data;
+	OBD_SLAB_FREE_PTR(info, osc_session_kmem);
+}
+
+struct lu_context_key osc_session_key = {
+	.lct_tags = LCT_SESSION,
+	.lct_init = osc_session_init,
+	.lct_fini = osc_session_fini
+};
+
+/* type constructor/destructor: osc_type_{init,fini,start,stop}(). */
+LU_TYPE_INIT_FINI(osc, &osc_key, &osc_session_key);
+
+/*
+ * ldo_process_config method of the osc device: forwards \a cfg to
+ * osc_process_config_base() for the obd behind \a d and returns its result.
+ */
+static int osc_cl_process_config(const struct lu_env *env,
+				 struct lu_device *d, struct lustre_cfg *cfg)
+{
+	int rc;
+	ENTRY;
+
+	rc = osc_process_config_base(d->ld_obd, cfg);
+	RETURN(rc);
+}
+
+static const struct lu_device_operations osc_lu_ops = {
+	.ldo_object_alloc      = osc_object_alloc,
+	.ldo_process_config    = osc_cl_process_config,
+	.ldo_recovery_complete = NULL
+};
+
+static const struct cl_device_operations osc_cl_ops = {
+	.cdo_req_init = osc_req_init
+};
+
+/*
+ * ldto_device_init method of the osc device type. There is nothing to set
+ * up here, so it always returns 0.
+ *
+ * A plain return is used because this function has no ENTRY marker for a
+ * RETURN() to pair with.
+ */
+static int osc_device_init(const struct lu_env *env, struct lu_device *d,
+			   const char *name, struct lu_device *next)
+{
+	return 0;
+}
+
+/*
+ * ldto_device_fini method of the osc device type. Does no work and always
+ * returns NULL.
+ */
+static struct lu_device *osc_device_fini(const struct lu_env *env,
+					 struct lu_device *d)
+{
+	/* pointer return type: use NULL rather than the integer 0 */
+	return NULL;
+}
+
+/*
+ * ldto_device_free method of the osc device type: finalizes the cl_device
+ * part of \a d and frees the enclosing osc_device. Always returns NULL.
+ */
+static struct lu_device *osc_device_free(const struct lu_env *env,
+					 struct lu_device *d)
+{
+	struct osc_device *od;
+
+	/* resolve the container before the embedded device is finalized */
+	od = lu2osc_dev(d);
+
+	cl_device_fini(lu2cl_dev(d));
+	OBD_FREE_PTR(od);
+
+	return NULL;
+}
+
+/*
+ * ldto_device_alloc method of the osc device type: allocates an osc_device,
+ * initializes its cl_device for type \a t, installs the osc operation
+ * vectors and sets up the OSC obd named by string 0 of \a cfg.
+ *
+ * Returns the embedded lu_device on success, ERR_PTR(-ENOMEM) if the
+ * osc_device cannot be allocated, or ERR_PTR(rc) if osc_setup() fails (the
+ * device is freed before returning). Asserts that the named obd exists.
+ *
+ * Plain returns are used because this function has no ENTRY marker for a
+ * RETURN() to pair with.
+ */
+static struct lu_device *osc_device_alloc(const struct lu_env *env,
+					  struct lu_device_type *t,
+					  struct lustre_cfg *cfg)
+{
+	struct lu_device *d;
+	struct osc_device *od;
+	struct obd_device *obd;
+	int rc;
+
+	OBD_ALLOC_PTR(od);
+	if (od == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	cl_device_init(&od->od_cl, t);
+	d = osc2lu_dev(od);
+	d->ld_ops = &osc_lu_ops;
+	od->od_cl.cd_ops = &osc_cl_ops;
+
+	/* Setup OSC OBD */
+	obd = class_name2obd(lustre_cfg_string(cfg, 0));
+	LASSERT(obd != NULL);
+	rc = osc_setup(obd, cfg);
+	if (rc) {
+		/* undo cl_device_init() and release the allocation */
+		osc_device_free(env, d);
+		return ERR_PTR(rc);
+	}
+	od->od_exp = obd->obd_self_export;
+	return d;
+}
+
+static const struct lu_device_type_operations osc_device_type_ops = {
+	.ldto_init = osc_type_init,
+	.ldto_fini = osc_type_fini,
+
+	.ldto_start = osc_type_start,
+	.ldto_stop  = osc_type_stop,
+
+	.ldto_device_alloc = osc_device_alloc,
+	.ldto_device_free  = osc_device_free,
+
+	.ldto_device_init    = osc_device_init,
+	.ldto_device_fini    = osc_device_fini
+};
+
+struct lu_device_type osc_device_type = {
+	.ldt_tags     = LU_DEVICE_CL,
+	.ldt_name     = LUSTRE_OSC_NAME,
+	.ldt_ops      = &osc_device_type_ops,
+	.ldt_ctx_tags = LCT_CL_THREAD
+};
+
+/** @} osc */

diff --git a/drivers/staging/lustre/lustre/osc/osc_internal.h b/drivers/staging/lustre/lustre/osc/osc_internal.h
new file mode 100644
index 0000000..efc5db4
--- /dev/null
+++ b/drivers/staging/lustre/lustre/osc/osc_internal.h

@@ -0,0 +1,208 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef OSC_INTERNAL_H
+#define OSC_INTERNAL_H
+
+#define OAP_MAGIC 8675309
+
+struct lu_env;
+
+enum async_flags {
+	ASYNC_READY = 0x1, /* ap_make_ready will not be called before this
+			      page is added to an rpc */
+	ASYNC_URGENT = 0x2, /* page must be put into an RPC before return */
+	ASYNC_COUNT_STABLE = 0x4, /* ap_refresh_count will not be called
+				     to give the caller a chance to update
+				     or cancel the size of the io */
+	ASYNC_HP = 0x10, /* NOTE(review): undocumented; presumably marks a
+			    high-priority page - confirm. Bit 0x8 is not
+			    assigned in this enum. */
+};
+
+struct osc_async_page {
+	int		     oap_magic;
+	unsigned short	  oap_cmd;
+	unsigned short	  oap_interrupted:1;
+
+	struct list_head	      oap_pending_item;
+	struct list_head	      oap_rpc_item;
+
+	obd_off		 oap_obj_off;
+	unsigned		oap_page_off;
+	enum async_flags	oap_async_flags;
+
+	struct brw_page	 oap_brw_page;
+
+	struct ptlrpc_request   *oap_request;
+	struct client_obd       *oap_cli;
+	struct osc_object       *oap_obj;
+
+	struct ldlm_lock	*oap_ldlm_lock;
+	spinlock_t		 oap_lock;
+};
+
+#define oap_page	oap_brw_page.pg
+#define oap_count       oap_brw_page.count
+#define oap_brw_flags   oap_brw_page.flag
+
+struct osc_cache_waiter {
+	struct list_head	      ocw_entry;
+	wait_queue_head_t	     ocw_waitq;
+	struct osc_async_page  *ocw_oap;
+	int		     ocw_grant;
+	int		     ocw_rc;
+};
+
+int osc_create(const struct lu_env *env, struct obd_export *exp,
+	       struct obdo *oa, struct lov_stripe_md **ea,
+	       struct obd_trans_info *oti);
+int osc_real_create(struct obd_export *exp, struct obdo *oa,
+		    struct lov_stripe_md **ea, struct obd_trans_info *oti);
+void osc_wake_cache_waiters(struct client_obd *cli);
+int osc_shrink_grant_to_target(struct client_obd *cli, __u64 target_bytes);
+void osc_update_next_shrink(struct client_obd *cli);
+
+/*
+ * cl integration.
+ */
+#include <cl_object.h>
+
+extern struct ptlrpc_request_set *PTLRPCD_SET;
+
+int osc_enqueue_base(struct obd_export *exp, struct ldlm_res_id *res_id,
+		     __u64 *flags, ldlm_policy_data_t *policy,
+		     struct ost_lvb *lvb, int kms_valid,
+		     obd_enqueue_update_f upcall,
+		     void *cookie, struct ldlm_enqueue_info *einfo,
+		     struct lustre_handle *lockh,
+		     struct ptlrpc_request_set *rqset, int async, int agl);
+int osc_cancel_base(struct lustre_handle *lockh, __u32 mode);
+
+int osc_match_base(struct obd_export *exp, struct ldlm_res_id *res_id,
+		   __u32 type, ldlm_policy_data_t *policy, __u32 mode,
+		   int *flags, void *data, struct lustre_handle *lockh,
+		   int unref);
+
+int osc_setattr_async_base(struct obd_export *exp, struct obd_info *oinfo,
+			   struct obd_trans_info *oti,
+			   obd_enqueue_update_f upcall, void *cookie,
+			   struct ptlrpc_request_set *rqset);
+int osc_punch_base(struct obd_export *exp, struct obd_info *oinfo,
+		   obd_enqueue_update_f upcall, void *cookie,
+		   struct ptlrpc_request_set *rqset);
+int osc_sync_base(struct obd_export *exp, struct obd_info *oinfo,
+		  obd_enqueue_update_f upcall, void *cookie,
+		  struct ptlrpc_request_set *rqset);
+
+int osc_process_config_base(struct obd_device *obd, struct lustre_cfg *cfg);
+int osc_build_rpc(const struct lu_env *env, struct client_obd *cli,
+		  struct list_head *ext_list, int cmd, pdl_policy_t p);
+int osc_lru_shrink(struct client_obd *cli, int target);
+
+extern spinlock_t osc_ast_guard;
+
+int osc_cleanup(struct obd_device *obd);
+int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg);
+
+#ifdef LPROCFS
+int lproc_osc_attach_seqstat(struct obd_device *dev);
+void lprocfs_osc_init_vars(struct lprocfs_static_vars *lvars);
+#else
+static inline int lproc_osc_attach_seqstat(struct obd_device *dev) {return 0;}
+static inline void lprocfs_osc_init_vars(struct lprocfs_static_vars *lvars)
+{
+	memset(lvars, 0, sizeof(*lvars));
+}
+#endif
+
+extern struct lu_device_type osc_device_type;
+
+/*
+ * Returns 1 if \a rc is one of -EIO, -EROFS, -ENOMEM, -EAGAIN or
+ * -EINPROGRESS, and 0 for any other value.
+ */
+static inline int osc_recoverable_error(int rc)
+{
+	switch (rc) {
+	case -EIO:
+	case -EROFS:
+	case -ENOMEM:
+	case -EAGAIN:
+	case -EINPROGRESS:
+		return 1;
+	default:
+		return 0;
+	}
+}
+
+/* Returns the sum of the read and write in-flight counters of \a cli. */
+static inline unsigned long rpcs_in_flight(struct client_obd *cli)
+{
+	const unsigned long total = cli->cl_r_in_flight + cli->cl_w_in_flight;
+
+	return total;
+}
+
+#ifndef min_t
+#define min_t(type,x,y) \
+	({ type __x = (x); type __y = (y); __x < __y ? __x: __y; })
+#endif
+
+struct osc_device {
+	struct cl_device    od_cl;   /* embedded cl_device; see lu2osc_dev() */
+	struct obd_export  *od_exp;
+
+	/* Write stats are actually protected by client_obd's lock. */
+	struct osc_stats {
+		uint64_t     os_lockless_writes;	  /* by bytes */
+		uint64_t     os_lockless_reads;	   /* by bytes */
+		uint64_t     os_lockless_truncates;       /* by times */
+	} od_stats;
+
+	/* configuration item(s) */
+	int		 od_contention_time;
+	int		 od_lockless_truncate;
+};
+
+/* Returns the osc_device that embeds the lu_device recorded in \a d. */
+static inline struct osc_device *obd2osc_dev(const struct obd_device *d)
+{
+	struct osc_device *od;
+
+	od = container_of0(d->obd_lu_dev, struct osc_device, od_cl.cd_lu_dev);
+	return od;
+}
+
+int osc_dlm_lock_pageref(struct ldlm_lock *dlm);
+
+extern struct kmem_cache *osc_quota_kmem;
+struct osc_quota_info {
+	/** linkage for quota hash table */
+	struct hlist_node oqi_hash;
+	obd_uid	  oqi_id;
+};
+int osc_quota_setup(struct obd_device *obd);
+int osc_quota_cleanup(struct obd_device *obd);
+int osc_quota_setdq(struct client_obd *cli, const unsigned int qid[],
+		    obd_flag valid, obd_flag flags);
+int osc_quota_chkdq(struct client_obd *cli, const unsigned int qid[]);
+int osc_quotactl(struct obd_device *unused, struct obd_export *exp,
+		 struct obd_quotactl *oqctl);
+int osc_quotacheck(struct obd_device *unused, struct obd_export *exp,
+		   struct obd_quotactl *oqctl);
+int osc_quota_poll_check(struct obd_export *exp, struct if_quotacheck *qchk);
+
+#endif /* OSC_INTERNAL_H */

diff --git a/drivers/staging/lustre/lustre/osc/osc_io.c b/drivers/staging/lustre/lustre/osc/osc_io.c
new file mode 100644
index 0000000..1b27704
--- /dev/null
+++ b/drivers/staging/lustre/lustre/osc/osc_io.c

@@ -0,0 +1,836 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_io for OSC layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ *   Author: Jinshan Xiong <jinshan.xiong@whamcloud.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_OSC
+
+#include "osc_cl_internal.h"
+
+/** \addtogroup osc
+ *  @{
+ */
+
+/*****************************************************************************
+ *
+ * Type conversions.
+ *
+ */
+
+/* Converts a req slice embedded in an osc_req (or_cl) to its container. */
+static struct osc_req *cl2osc_req(const struct cl_req_slice *slice)
+{
+	struct osc_req *req;
+
+	LINVRNT(slice->crs_dev->cd_lu_dev.ld_type == &osc_device_type);
+	req = container_of0(slice, struct osc_req, or_cl);
+	return req;
+}
+
+/*
+ * Converts an io slice embedded in an osc_io (oi_cl) to its container. The
+ * invariant check expects the result to be the osc_io of \a env.
+ */
+static struct osc_io *cl2osc_io(const struct lu_env *env,
+				const struct cl_io_slice *slice)
+{
+	struct osc_io *oio;
+
+	oio = container_of0(slice, struct osc_io, oi_cl);
+	LINVRNT(oio == osc_env_io(env));
+
+	return oio;
+}
+
+/*
+ * Returns the osc_page behind the osc-layer slice of \a page. Asserts that
+ * the page has such a slice.
+ */
+static struct osc_page *osc_cl_page_osc(struct cl_page *page)
+{
+	const struct cl_page_slice *slice = cl_page_at(page, &osc_device_type);
+
+	LASSERT(slice != NULL);
+	return cl2osc_page(slice);
+}
+
+
+/*****************************************************************************
+ *
+ * io operations.
+ *
+ */
+
+/* Finalizer for an osc io slice; intentionally does nothing. */
+static void osc_io_fini(const struct lu_env *env, const struct cl_io_slice *io)
+{
+	/* empty */
+}
+
+/**
+ * An implementation of cl_io_operations::cio_io_submit() method for osc
+ * layer. Iterates over pages in the in-queue and prepares each for io by
+ * calling cl_page_prep(). A prepared page is moved to the out-queue, set up
+ * through osc_page_submit() and collected on a local list, which is handed
+ * to osc_queue_sync_pages() in batches of cl_max_pages_per_rpc pages.
+ *
+ * A page whose oap is still linked on a pending or rpc list stops the
+ * iteration with -EBUSY. -EALREADY from cl_page_prep() is not treated as an
+ * error: the page is skipped and stays in the in-queue.
+ *
+ * \retval 0 if the out-queue is not empty on exit
+ * \retval otherwise the result of the last step executed
+ */
+static int osc_io_submit(const struct lu_env *env,
+			 const struct cl_io_slice *ios,
+			 enum cl_req_type crt, struct cl_2queue *queue)
+{
+	struct cl_page    *page;
+	struct cl_page    *tmp;
+	struct client_obd *cli  = NULL;
+	struct osc_object *osc  = NULL; /* to keep gcc happy */
+	struct osc_page   *opg;
+	struct cl_io      *io;
+	LIST_HEAD     (list);
+
+	struct cl_page_list *qin      = &queue->c2_qin;
+	struct cl_page_list *qout     = &queue->c2_qout;
+	int queued = 0;
+	int result = 0;
+	int cmd;
+	int brw_flags;
+	int max_pages;
+
+	LASSERT(qin->pl_nr > 0);
+
+	CDEBUG(D_CACHE, "%d %d\n", qin->pl_nr, crt);
+
+	osc = cl2osc(ios->cis_obj);
+	cli = osc_cli(osc);
+	max_pages = cli->cl_max_pages_per_rpc;
+
+	cmd = crt == CRT_WRITE ? OBD_BRW_WRITE : OBD_BRW_READ;
+	brw_flags = osc_io_srvlock(cl2osc_io(env, ios)) ? OBD_BRW_SRVLOCK : 0;
+
+	/*
+	 * NOTE: here @page is a top-level page. This is done to avoid
+	 *       creation of sub-page-list.
+	 */
+	cl_page_list_for_each_safe(page, tmp, qin) {
+		struct osc_async_page *oap;
+
+		/* Top level IO. */
+		io = page->cp_owner;
+		LASSERT(io != NULL);
+
+		opg = osc_cl_page_osc(page);
+		oap = &opg->ops_oap;
+		LASSERT(osc == oap->oap_obj);
+
+		if (!list_empty(&oap->oap_pending_item) ||
+		    !list_empty(&oap->oap_rpc_item)) {
+			CDEBUG(D_CACHE, "Busy oap %p page %p for submit.\n",
+			       oap, opg);
+			result = -EBUSY;
+			break;
+		}
+
+		result = cl_page_prep(env, io, page, crt);
+		if (result != 0) {
+			LASSERT(result < 0);
+			if (result != -EALREADY)
+				break;
+			/*
+			 * Handle -EALREADY error: for read case, the page is
+			 * already in UPTODATE state; for write, the page
+			 * is not dirty.
+			 */
+			result = 0;
+			continue;
+		}
+
+		cl_page_list_move(qout, qin, page);
+		oap->oap_async_flags = ASYNC_URGENT|ASYNC_READY;
+		oap->oap_async_flags |= ASYNC_COUNT_STABLE;
+
+		osc_page_submit(env, opg, crt, brw_flags);
+		list_add_tail(&oap->oap_pending_item, &list);
+		if (++queued == max_pages) { /* full batch: queue it now */
+			queued = 0;
+			result = osc_queue_sync_pages(env, osc, &list, cmd,
+						      brw_flags);
+			if (result < 0)
+				break;
+		}
+	}
+
+	if (queued > 0) /* partial batch; also runs after a break above and
+			 * then overwrites the error stored in result */
+		result = osc_queue_sync_pages(env, osc, &list, cmd, brw_flags);
+
+	CDEBUG(D_INFO, "%d/%d %d\n", qin->pl_nr, qout->pl_nr, result);
+	return qout->pl_nr > 0 ? 0 : result;
+}
+
+/*
+ * Computes the offset reached by byte \a to of page \a idx in \a obj and,
+ * under the object attribute lock, raises the stripe KMS and/or size to it
+ * when it exceeds the currently recorded values.
+ */
+static void osc_page_touch_at(const struct lu_env *env,
+			      struct cl_object *obj, pgoff_t idx, unsigned to)
+{
+	struct lov_oinfo *oinfo = cl2osc(obj)->oo_oinfo;
+	struct cl_attr   *attr  = &osc_env_info(env)->oti_attr;
+	/* offset within stripe */
+	__u64 new_kms = cl_offset(obj, idx) + to;
+	int valid = 0;
+
+	cl_object_attr_lock(obj);
+	/*
+	 * XXX old code used
+	 *
+	 *	 ll_inode_size_lock(inode, 0); lov_stripe_lock(lsm);
+	 *
+	 * here
+	 */
+	CDEBUG(D_INODE, "stripe KMS %sincreasing "LPU64"->"LPU64" "LPU64"\n",
+	       new_kms > oinfo->loi_kms ? "" : "not ", oinfo->loi_kms, new_kms,
+	       oinfo->loi_lvb.lvb_size);
+
+	if (new_kms > oinfo->loi_kms) {
+		attr->cat_kms = new_kms;
+		valid |= CAT_KMS;
+	}
+	if (new_kms > oinfo->loi_lvb.lvb_size) {
+		attr->cat_size = new_kms;
+		valid |= CAT_SIZE;
+	}
+
+	/* called even when no attribute changed (valid == 0) */
+	cl_object_attr_set(env, obj, attr, valid);
+	cl_object_attr_unlock(obj);
+}
+
+/**
+ * Called when a page is accessed in a way that would create it if it were
+ * missing (a hole in the file, or an access beyond the current file size),
+ * e.g. from the ->commit_write() and ->nopage() methods.
+ *
+ * Expands the stripe KMS if necessary: this is a thin wrapper that resolves
+ * the object and page index of \a opage and defers to osc_page_touch_at().
+ */
+static void osc_page_touch(const struct lu_env *env,
+			   struct osc_page *opage, unsigned to)
+{
+	osc_page_touch_at(env, opage->ops_cl.cpl_obj,
+			  opage->ops_cl.cpl_page->cp_index, to);
+}
+
+/**
+ * Implements cl_io_operations::cio_prepare_write() method for osc layer.
+ *
+ * \retval -EIO transfer initiated against this osc will most likely fail
+ * \retval 0    transfer initiated against this osc will most likely succeed.
+ *
+ * The check exists to hand an error back to the caller right away when the
+ * import is deactivated. The import may of course be deactivated later,
+ * while pages dirtied by this IO still sit in the cache; that case is
+ * irrelevant here because the application would still see the error if it
+ * called fsync. Many applications skip fsync for performance reasons though,
+ * so -EIO is returned at write time to notify them.
+ */
+static int osc_io_prepare_write(const struct lu_env *env,
+				const struct cl_io_slice *ios,
+				const struct cl_page_slice *slice,
+				unsigned from, unsigned to)
+{
+	struct osc_device *dev = lu2osc_dev(slice->cpl_obj->co_lu.lo_dev);
+	struct obd_import *imp = class_exp2cliimp(dev->od_exp);
+	struct osc_io     *oio = cl2osc_io(env, ios);
+	ENTRY;
+
+	/*
+	 * This implements OBD_BRW_CHECK logic from old client.
+	 */
+	if (imp == NULL || imp->imp_invalid)
+		RETURN(-EIO);
+
+	if (oio->oi_lockless)
+		/* this page contains `invalid' data, but who cares?
+		 * nobody can access the invalid data.
+		 * in osc_io_commit_write(), we're going to write exact
+		 * [from, to) bytes of this page to OST. -jay */
+		cl_page_export(env, slice->cpl_page, 1);
+
+	RETURN(0);
+}
+
+/**
+ * Implements cl_io_operations::cio_commit_write() method for osc layer.
+ *
+ * Touches the page so the stripe KMS covers the first \a to bytes, sets
+ * OBD_BRW_NOQUOTA on the page's async descriptor when the export is not a
+ * remote client and the caller has CFS_CAP_SYS_RESOURCE, and for lockless IO
+ * clips the page to [from, to).
+ *
+ * \retval 0 always
+ */
+static int osc_io_commit_write(const struct lu_env *env,
+			       const struct cl_io_slice *ios,
+			       const struct cl_page_slice *slice,
+			       unsigned from, unsigned to)
+{
+	struct osc_io	      *oio = cl2osc_io(env, ios);
+	struct osc_page       *opg = cl2osc_page(slice);
+	struct osc_object     *obj = cl2osc(opg->ops_cl.cpl_obj);
+	struct osc_async_page *oap = &opg->ops_oap;
+	int no_quota;
+	ENTRY;
+
+	LASSERT(to > 0);
+	/*
+	 * XXX instead of calling osc_page_touch() here and in
+	 * osc_io_fault_start() it might be more logical to introduce
+	 * cl_page_touch() method, that generic cl_io_commit_write() and page
+	 * fault code calls.
+	 */
+	osc_page_touch(env, cl2osc_page(slice), to);
+
+	no_quota = !client_is_remote(osc_export(obj)) &&
+		   cfs_capable(CFS_CAP_SYS_RESOURCE);
+	if (no_quota)
+		oap->oap_brw_flags |= OBD_BRW_NOQUOTA;
+
+	/* see osc_io_prepare_write() for lockless io handling. */
+	if (oio->oi_lockless)
+		cl_page_clip(env, slice->cpl_page, from, to);
+
+	RETURN(0);
+}
+
+/**
+ * cio_start method for CIT_FAULT IO at the osc layer.
+ *
+ * \retval 0 always
+ */
+static int osc_io_fault_start(const struct lu_env *env,
+			      const struct cl_io_slice *ios)
+{
+	struct cl_io       *io  = ios->cis_io;
+	struct cl_fault_io *fio = &io->u.ci_fault;
+	ENTRY;
+
+	CDEBUG(D_INFO, "%lu %d %d\n",
+	       fio->ft_index, fio->ft_writable, fio->ft_nob);
+
+	/*
+	 * If mapping is writeable, adjust kms to cover this page,
+	 * but do not extend kms beyond actual file size.
+	 * See bug 10919.
+	 */
+	if (fio->ft_writable)
+		osc_page_touch_at(env, ios->cis_obj,
+				  fio->ft_index, fio->ft_nob);
+
+	RETURN(0);
+}
+
+/**
+ * Completion callback handed to the asynchronous punch/setattr/sync helpers.
+ *
+ * \param a   the struct osc_async_cbargs registered with the request
+ * \param rc  status to publish in osc_async_cbargs::opc_rc
+ *
+ * Stores \a rc and completes opc_sync so that a thread blocked in
+ * wait_for_completion() on it can proceed.
+ *
+ * \retval 0 always
+ */
+static int osc_async_upcall(void *a, int rc)
+{
+	struct osc_async_cbargs *cbargs = (struct osc_async_cbargs *)a;
+
+	cbargs->opc_rc = rc;
+	complete(&cbargs->opc_sync);
+
+	return 0;
+}
+
+/**
+ * Gang-lookup callback used by osc_trunc_check(): checks that there are no
+ * pages being written in the extent being truncated.
+ *
+ * Purely diagnostic. It reports a page that still has a pending write and
+ * notes a page whose VM page is locked, but never stops the scan.
+ *
+ * \param cbdata  points to the __u64 passed in by osc_trunc_check()
+ *
+ * \retval CLP_GANG_OKAY always
+ */
+static int trunc_check_cb(const struct lu_env *env, struct cl_io *io,
+			  struct cl_page *page, void *cbdata)
+{
+	const struct cl_page_slice *slice = cl_page_at(page, &osc_device_type);
+	__u64 trunc_size = *(__u64 *)cbdata;
+	struct osc_async_page *oap;
+	struct osc_page *opg;
+	struct page *vmpage;
+
+	LASSERT(slice != NULL);
+	opg = cl2osc_page(slice);
+	oap = &opg->ops_oap;
+
+	if ((oap->oap_cmd & OBD_BRW_WRITE) &&
+	    !list_empty(&oap->oap_pending_item))
+		CL_PAGE_DEBUG(D_ERROR, env, page, "exists " LPU64 "/%s.\n",
+				trunc_size, current->comm);
+
+	vmpage = cl_page_vmpage(env, page);
+	if (PageLocked(vmpage))
+		CDEBUG(D_CACHE, "page %p index %lu locked for %d.\n",
+		       opg, page->cp_index,
+		       (oap->oap_cmd & OBD_BRW_RWMASK));
+
+	return CLP_GANG_OKAY;
+}
+
+/**
+ * Runs trunc_check_cb() over every cached page of the object behind \a oio
+ * that lies entirely beyond \a size, up to CL_PAGE_EOF.
+ */
+static void osc_trunc_check(const struct lu_env *env, struct cl_io *io,
+			    struct osc_io *oio, __u64 size)
+{
+	struct cl_object *clob  = oio->oi_cl.cis_obj;
+	pgoff_t		  first = cl_index(clob, size);
+
+	/*
+	 * When \a size falls inside page `first', that page starts below
+	 * the new size, so the scan begins with the following page.
+	 */
+	if (cl_offset(clob, first) < size)
+		first++;
+
+	/*
+	 * Complain if there are pages in the truncated region.
+	 */
+	cl_page_gang_lookup(env, clob, io, first, CL_PAGE_EOF,
+			    trunc_check_cb, (void *)&size);
+}
+
+/**
+ * cio_start method for CIT_SETATTR IO at the osc layer.
+ *
+ * Steps, in order:
+ *  - for a truncate, start truncating cached pages beyond the new size;
+ *  - unless the IO is lockless, update the cached object attributes (size,
+ *    kms and whichever of mtime/atime/ctime the caller set) under the
+ *    object attribute lock;
+ *  - fill oio->oi_oa and fire either a punch (ATTR_SIZE) or a setattr
+ *    request, with osc_async_upcall() as the completion callback.
+ *
+ * osc_io_setattr_end() waits for the request when cbargs->opc_rpc_sent is
+ * set.
+ *
+ * \retval 0  request queued
+ * \retval other  first non-zero result of the steps above
+ */
+static int osc_io_setattr_start(const struct lu_env *env,
+				const struct cl_io_slice *slice)
+{
+	struct cl_io	    *io     = slice->cis_io;
+	struct osc_io	   *oio    = cl2osc_io(env, slice);
+	struct cl_object	*obj    = slice->cis_obj;
+	struct lov_oinfo	*loi    = cl2osc(obj)->oo_oinfo;
+	struct cl_attr	  *attr   = &osc_env_info(env)->oti_attr;
+	struct obdo	     *oa     = &oio->oi_oa;
+	struct osc_async_cbargs *cbargs = &oio->oi_cbarg;
+	__u64		    size   = io->u.ci_setattr.sa_attr.lvb_size;
+	unsigned int	     ia_valid = io->u.ci_setattr.sa_valid;
+	int		      result = 0;
+	struct obd_info	  oinfo = { { { 0 } } };
+
+	/* truncate cache dirty pages first */
+	if (cl_io_is_trunc(io))
+		result = osc_cache_truncate_start(env, oio, cl2osc(obj), size);
+
+	if (result == 0 && oio->oi_lockless == 0) {
+		cl_object_attr_lock(obj);
+		result = cl_object_attr_get(env, obj, attr);
+		if (result == 0) {
+			struct ost_lvb *lvb = &io->u.ci_setattr.sa_attr;
+			unsigned int cl_valid = 0;
+
+			/* only push the attributes the caller asked for */
+			if (ia_valid & ATTR_SIZE) {
+				attr->cat_size = attr->cat_kms = size;
+				cl_valid = (CAT_SIZE | CAT_KMS);
+			}
+			if (ia_valid & ATTR_MTIME_SET) {
+				attr->cat_mtime = lvb->lvb_mtime;
+				cl_valid |= CAT_MTIME;
+			}
+			if (ia_valid & ATTR_ATIME_SET) {
+				attr->cat_atime = lvb->lvb_atime;
+				cl_valid |= CAT_ATIME;
+			}
+			if (ia_valid & ATTR_CTIME_SET) {
+				attr->cat_ctime = lvb->lvb_ctime;
+				cl_valid |= CAT_CTIME;
+			}
+			result = cl_object_attr_set(env, obj, attr, cl_valid);
+		}
+		cl_object_attr_unlock(obj);
+	}
+	memset(oa, 0, sizeof(*oa));
+	if (result == 0) {
+		/*
+		 * NOTE(review): in the lockless case \a attr was not loaded
+		 * by cl_object_attr_get() above, so the three timestamps
+		 * copied here come from whatever the per-env scratch
+		 * attribute currently holds --- confirm this is intended.
+		 */
+		oa->o_oi = loi->loi_oi;
+		oa->o_mtime = attr->cat_mtime;
+		oa->o_atime = attr->cat_atime;
+		oa->o_ctime = attr->cat_ctime;
+		oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP | OBD_MD_FLATIME |
+			OBD_MD_FLCTIME | OBD_MD_FLMTIME;
+		if (ia_valid & ATTR_SIZE) {
+			/* punch range is [size, OBD_OBJECT_EOF] */
+			oa->o_size = size;
+			oa->o_blocks = OBD_OBJECT_EOF;
+			oa->o_valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS;
+
+			if (oio->oi_lockless) {
+				oa->o_flags = OBD_FL_SRVLOCK;
+				oa->o_valid |= OBD_MD_FLFLAGS;
+			}
+		} else {
+			/* lockless IO is only expected for size changes */
+			LASSERT(oio->oi_lockless == 0);
+		}
+
+		oinfo.oi_oa = oa;
+		oinfo.oi_capa = io->u.ci_setattr.sa_capa;
+		/* must be ready before the upcall can possibly run */
+		init_completion(&cbargs->opc_sync);
+
+		if (ia_valid & ATTR_SIZE)
+			result = osc_punch_base(osc_export(cl2osc(obj)),
+						&oinfo, osc_async_upcall,
+						cbargs, PTLRPCD_SET);
+		else
+			result = osc_setattr_async_base(osc_export(cl2osc(obj)),
+							&oinfo, NULL,
+							osc_async_upcall,
+							cbargs, PTLRPCD_SET);
+		/* tells osc_io_setattr_end() whether there is a reply to wait for */
+		cbargs->opc_rpc_sent = result == 0;
+	}
+	return result;
+}
+
+/**
+ * cio_end method for CIT_SETATTR IO at the osc layer.
+ *
+ * Waits for the request queued by osc_io_setattr_start() (if one was sent)
+ * and publishes its status in cl_io::ci_result, counts successful lockless
+ * truncates, and for truncates finishes the cache truncation that was
+ * started in osc_io_setattr_start().
+ */
+static void osc_io_setattr_end(const struct lu_env *env,
+			       const struct cl_io_slice *slice)
+{
+	struct cl_io		*io   = slice->cis_io;
+	struct osc_io		*oio  = cl2osc_io(env, slice);
+	struct cl_object	*obj  = slice->cis_obj;
+	struct osc_async_cbargs *args = &oio->oi_cbarg;
+	int rc = 0;
+
+	if (args->opc_rpc_sent) {
+		wait_for_completion(&args->opc_sync);
+		io->ci_result = args->opc_rc;
+		rc = io->ci_result;
+	}
+
+	if (rc == 0 && oio->oi_lockless) {
+		/* lockless truncate */
+		struct osc_device *dev = lu2osc_dev(obj->co_lu.lo_dev);
+
+		LASSERT(cl_io_is_trunc(io));
+		/* XXX: Need a lock. */
+		dev->od_stats.os_lockless_truncates++;
+	}
+
+	if (!cl_io_is_trunc(io))
+		return;
+
+	osc_trunc_check(env, io, oio, io->u.ci_setattr.sa_attr.lvb_size);
+	if (oio->oi_trunc != NULL) {
+		osc_cache_truncate_end(env, oio, cl2osc(obj));
+		oio->oi_trunc = NULL;
+	}
+}
+
+/**
+ * cio_start method for CIT_READ IO at the osc layer.
+ *
+ * Unless the IO is lockless, stamps the cached access time of the object
+ * with the current time, under the object attribute lock.
+ *
+ * \retval 0      lockless IO, or attributes updated
+ * \retval other  result of cl_object_attr_get() or cl_object_attr_set()
+ */
+static int osc_io_read_start(const struct lu_env *env,
+			     const struct cl_io_slice *slice)
+{
+	struct osc_io    *oio  = cl2osc_io(env, slice);
+	struct cl_object *obj  = slice->cis_obj;
+	struct cl_attr   *attr = &osc_env_info(env)->oti_attr;
+	int rc;
+	ENTRY;
+
+	if (oio->oi_lockless != 0)
+		RETURN(0);
+
+	cl_object_attr_lock(obj);
+	rc = cl_object_attr_get(env, obj, attr);
+	if (rc == 0) {
+		attr->cat_atime = LTIME_S(CFS_CURRENT_TIME);
+		rc = cl_object_attr_set(env, obj, attr, CAT_ATIME);
+	}
+	cl_object_attr_unlock(obj);
+
+	RETURN(rc);
+}
+
+/**
+ * cio_start method for CIT_WRITE IO at the osc layer.
+ *
+ * Unless the IO is lockless, stamps the cached modification and change
+ * times of the object with the current time, under the object attribute
+ * lock.
+ *
+ * \retval 0      lockless IO, or attributes updated
+ * \retval other  result of cl_object_attr_get() or cl_object_attr_set()
+ */
+static int osc_io_write_start(const struct lu_env *env,
+			      const struct cl_io_slice *slice)
+{
+	struct osc_io    *oio  = cl2osc_io(env, slice);
+	struct cl_object *obj  = slice->cis_obj;
+	struct cl_attr   *attr = &osc_env_info(env)->oti_attr;
+	int rc;
+	ENTRY;
+
+	if (oio->oi_lockless != 0)
+		RETURN(0);
+
+	/* fault-injection hook, evaluated before the lock is taken */
+	OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_DELAY_SETTIME, 1);
+	cl_object_attr_lock(obj);
+	rc = cl_object_attr_get(env, obj, attr);
+	if (rc == 0) {
+		attr->cat_ctime = LTIME_S(CFS_CURRENT_TIME);
+		attr->cat_mtime = attr->cat_ctime;
+		rc = cl_object_attr_set(env, obj, attr,
+					CAT_MTIME | CAT_CTIME);
+	}
+	cl_object_attr_unlock(obj);
+
+	RETURN(rc);
+}
+
+/**
+ * Queues a sync request for the byte range [fi_start, fi_end] of \a obj.
+ *
+ * The request is handed to osc_sync_base() with osc_async_upcall() as its
+ * callback; that callback stores the status in oi_cbarg.opc_rc and
+ * completes oi_cbarg.opc_sync. Whether the request was accepted is recorded
+ * in oi_cbarg.opc_rpc_sent, so that osc_io_fsync_end() only waits when a
+ * completion can actually arrive (same protocol as
+ * osc_io_setattr_start()/osc_io_setattr_end()).
+ *
+ * \retval 0      request queued
+ * \retval other  non-zero value returned by osc_sync_base(); presumably no
+ *		  upcall will follow in that case
+ */
+static int osc_fsync_ost(const struct lu_env *env, struct osc_object *obj,
+			 struct cl_fsync_io *fio)
+{
+	struct osc_io    *oio   = osc_env_io(env);
+	struct obdo      *oa    = &oio->oi_oa;
+	struct obd_info  *oinfo = &oio->oi_info;
+	struct lov_oinfo *loi   = obj->oo_oinfo;
+	struct osc_async_cbargs *cbargs = &oio->oi_cbarg;
+	int rc = 0;
+	ENTRY;
+
+	memset(oa, 0, sizeof(*oa));
+	oa->o_oi = loi->loi_oi;
+	oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
+
+	/* reuse size and blocks for start and end of sync range */
+	oa->o_size = fio->fi_start;
+	oa->o_blocks = fio->fi_end;
+	oa->o_valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS;
+
+	obdo_set_parent_fid(oa, fio->fi_fid);
+
+	memset(oinfo, 0, sizeof(*oinfo));
+	oinfo->oi_oa = oa;
+	oinfo->oi_capa = fio->fi_capa;
+	/* must be ready before the upcall can possibly run */
+	init_completion(&cbargs->opc_sync);
+
+	rc = osc_sync_base(osc_export(obj), oinfo, osc_async_upcall, cbargs,
+			   PTLRPCD_SET);
+	/* tells osc_io_fsync_end() whether there is a reply to wait for */
+	cbargs->opc_rpc_sent = rc == 0;
+	RETURN(rc);
+}
+
+/**
+ * cio_start method for CIT_FSYNC IO at the osc layer.
+ *
+ * Starts write-back of the cached range (discarding instead of writing for
+ * CL_FSYNC_DISCARD) and adds the number of pages reported by
+ * osc_cache_writeback_range() to fi_nr_written. For CL_FSYNC_ALL it then
+ * waits for that write-back and queues the sync request through
+ * osc_fsync_ost().
+ *
+ * \retval 0      everything started successfully
+ * \retval other  the first error encountered
+ */
+static int osc_io_fsync_start(const struct lu_env *env,
+			      const struct cl_io_slice *slice)
+{
+	struct cl_io       *io  = slice->cis_io;
+	struct cl_fsync_io *fio = &io->u.ci_fsync;
+	struct cl_object   *obj = slice->cis_obj;
+	struct osc_object  *osc = cl2osc(obj);
+	pgoff_t start  = cl_index(obj, fio->fi_start);
+	pgoff_t end    = cl_index(obj, fio->fi_end);
+	int     result = 0;
+	ENTRY;
+
+	if (fio->fi_end == OBD_OBJECT_EOF)
+		end = CL_PAGE_EOF;
+
+	result = osc_cache_writeback_range(env, osc, start, end, 0,
+					   fio->fi_mode == CL_FSYNC_DISCARD);
+	if (result > 0) {
+		/* a positive result is a page count, not an error */
+		fio->fi_nr_written += result;
+		result = 0;
+	}
+	if (fio->fi_mode == CL_FSYNC_ALL) {
+		int rc;
+
+		/* we have to wait for writeback to finish before we can
+		 * send OST_SYNC RPC. This is bad because it causes extents
+		 * to be written osc by osc. However, we usually start
+		 * writeback before CL_FSYNC_ALL so this won't have any real
+		 * problem. */
+		rc = osc_cache_wait_range(env, osc, start, end);
+		if (result == 0)
+			result = rc;
+		rc = osc_fsync_ost(env, osc, fio);
+		if (result == 0)
+			result = rc;
+	}
+
+	RETURN(result);
+}
+
+/**
+ * cio_end method for CIT_FSYNC IO at the osc layer.
+ *
+ * For CL_FSYNC_LOCAL waits for write-back of the range; for CL_FSYNC_ALL
+ * waits for the sync request queued by osc_fsync_ost(), provided it was
+ * actually queued. The outcome is stored in cl_io::ci_result.
+ */
+static void osc_io_fsync_end(const struct lu_env *env,
+			     const struct cl_io_slice *slice)
+{
+	struct cl_fsync_io *fio = &slice->cis_io->u.ci_fsync;
+	struct cl_object   *obj = slice->cis_obj;
+	pgoff_t start = cl_index(obj, fio->fi_start);
+	pgoff_t end   = cl_index(obj, fio->fi_end);
+	int result = 0;
+
+	/* same end-of-object translation as in osc_io_fsync_start() */
+	if (fio->fi_end == OBD_OBJECT_EOF)
+		end = CL_PAGE_EOF;
+
+	if (fio->fi_mode == CL_FSYNC_LOCAL) {
+		result = osc_cache_wait_range(env, cl2osc(obj), start, end);
+	} else if (fio->fi_mode == CL_FSYNC_ALL) {
+		struct osc_io	   *oio    = cl2osc_io(env, slice);
+		struct osc_async_cbargs *cbargs = &oio->oi_cbarg;
+
+		/*
+		 * Wait only if osc_fsync_ost() managed to queue the request;
+		 * otherwise nothing would ever complete opc_sync and this
+		 * thread would block forever.
+		 */
+		if (cbargs->opc_rpc_sent) {
+			wait_for_completion(&cbargs->opc_sync);
+			result = cbargs->opc_rc;
+		}
+	}
+	slice->cis_io->ci_result = result;
+}
+
+/**
+ * cio_end method shared by CIT_WRITE and CIT_FAULT IO: releases the extent
+ * still referenced by osc_io::oi_active, if any, and clears the pointer.
+ */
+static void osc_io_end(const struct lu_env *env,
+		       const struct cl_io_slice *slice)
+{
+	struct osc_io *oio = cl2osc_io(env, slice);
+
+	if (oio->oi_active == NULL)
+		return;
+
+	osc_extent_release(env, oio->oi_active);
+	oio->oi_active = NULL;
+}
+
+/**
+ * IO method table attached to the osc IO slice by osc_io_init().
+ *
+ * .op is indexed by IO type and .req_op by transfer type; entries that are
+ * left out stay zero-initialized.
+ */
+static const struct cl_io_operations osc_io_ops = {
+	.op = {
+		[CIT_READ] = {
+			.cio_start = osc_io_read_start,
+			.cio_fini  = osc_io_fini,
+		},
+		[CIT_WRITE] = {
+			.cio_start = osc_io_write_start,
+			.cio_end   = osc_io_end,
+			.cio_fini  = osc_io_fini,
+		},
+		/* note: no cio_fini for setattr */
+		[CIT_SETATTR] = {
+			.cio_start = osc_io_setattr_start,
+			.cio_end   = osc_io_setattr_end,
+		},
+		[CIT_FAULT] = {
+			.cio_start = osc_io_fault_start,
+			.cio_end   = osc_io_end,
+			.cio_fini  = osc_io_fini,
+		},
+		[CIT_FSYNC] = {
+			.cio_start = osc_io_fsync_start,
+			.cio_end   = osc_io_fsync_end,
+			.cio_fini  = osc_io_fini,
+		},
+		[CIT_MISC] = {
+			.cio_fini  = osc_io_fini,
+		},
+	},
+	/* reads and writes are submitted through the same entry point */
+	.req_op = {
+		[CRT_READ] = {
+			.cio_submit = osc_io_submit,
+		},
+		[CRT_WRITE] = {
+			.cio_submit = osc_io_submit,
+		},
+	},
+	.cio_prepare_write = osc_io_prepare_write,
+	.cio_commit_write  = osc_io_commit_write,
+};
+
+/*****************************************************************************
+ *
+ * Transfer operations.
+ *
+ */
+
+/**
+ * Implementation of struct cl_req_operations::cro_prep() for osc layer.
+ * There is nothing to prepare here.
+ *
+ * \retval 0 always
+ */
+static int osc_req_prep(const struct lu_env *env,
+			const struct cl_req_slice *slice)
+{
+	return 0;
+}
+
+/**
+ * Implementation of struct cl_req_operations::cro_completion() for osc
+ * layer: returns the osc_req that embeds \a slice to osc_req_kmem. \a ioret
+ * is not looked at.
+ */
+static void osc_req_completion(const struct lu_env *env,
+			       const struct cl_req_slice *slice, int ioret)
+{
+	struct osc_req *or = cl2osc_req(slice);
+
+	OBD_SLAB_FREE_PTR(or, osc_req_kmem);
+}
+
+/**
+ * Implementation of struct cl_req_operations::cro_attr_set() for osc
+ * layer. osc is responsible for struct obdo::o_id and struct obdo::o_seq
+ * fields.
+ *
+ * For every bit set in \a flags the matching field of attr->cra_oa is
+ * filled in and the bit is added to o_valid: times come from the cached
+ * lvb, id and sequence from the object id. For OBD_MD_FLHANDLE the remote
+ * handle of a DLM lock covering one of the request's pages is used; a page
+ * without any covering lock is treated as fatal.
+ */
+static void osc_req_attr_set(const struct lu_env *env,
+			     const struct cl_req_slice *slice,
+			     const struct cl_object *obj,
+			     struct cl_req_attr *attr, obd_valid flags)
+{
+	struct lov_oinfo *oinfo = cl2osc(obj)->oo_oinfo;
+	struct ost_lvb   *lvb   = &oinfo->loi_lvb;
+	struct obdo      *oa    = attr->cra_oa;
+	struct cl_req    *clerq;
+	struct cl_page   *apage; /* _some_ page in @clerq */
+	struct cl_lock   *lock;  /* _some_ lock protecting @apage */
+	struct osc_lock  *olck;
+	struct osc_page  *opg;
+
+	if (flags & OBD_MD_FLMTIME) {
+		oa->o_mtime = lvb->lvb_mtime;
+		oa->o_valid |= OBD_MD_FLMTIME;
+	}
+	if (flags & OBD_MD_FLATIME) {
+		oa->o_atime = lvb->lvb_atime;
+		oa->o_valid |= OBD_MD_FLATIME;
+	}
+	if (flags & OBD_MD_FLCTIME) {
+		oa->o_ctime = lvb->lvb_ctime;
+		oa->o_valid |= OBD_MD_FLCTIME;
+	}
+	if (flags & OBD_MD_FLGROUP) {
+		ostid_set_seq(&oa->o_oi, ostid_seq(&oinfo->loi_oi));
+		oa->o_valid |= OBD_MD_FLGROUP;
+	}
+	if (flags & OBD_MD_FLID) {
+		ostid_set_id(&oa->o_oi, ostid_id(&oinfo->loi_oi));
+		oa->o_valid |= OBD_MD_FLID;
+	}
+
+	/* everything below is only about the lock handle */
+	if (!(flags & OBD_MD_FLHANDLE))
+		return;
+
+	clerq = slice->crs_req;
+	LASSERT(!list_empty(&clerq->crq_pages));
+	apage = container_of(clerq->crq_pages.next,
+			     struct cl_page, cp_flight);
+	opg = osc_cl_page_osc(apage);
+	apage = opg->ops_cl.cpl_page; /* now apage is a sub-page */
+	lock = cl_lock_at_page(env, apage->cp_obj, apage, NULL, 1, 1);
+	if (lock == NULL) {
+		struct cl_object_header *head;
+		struct cl_lock	  *scan;
+
+		/* dump every lock on the object and the page, then die */
+		head = cl_object_header(apage->cp_obj);
+		list_for_each_entry(scan, &head->coh_locks,
+					cll_linkage)
+			CL_LOCK_DEBUG(D_ERROR, env, scan,
+				      "no cover page!\n");
+		CL_PAGE_DEBUG(D_ERROR, env, apage,
+			      "dump uncover page!\n");
+		libcfs_debug_dumpstack(NULL);
+		LBUG();
+	}
+
+	olck = osc_lock_at(lock);
+	LASSERT(olck != NULL);
+	LASSERT(ergo(opg->ops_srvlock, olck->ols_lock == NULL));
+	/* check for lockless io. */
+	if (olck->ols_lock != NULL) {
+		oa->o_handle = olck->ols_lock->l_remote_handle;
+		oa->o_valid |= OBD_MD_FLHANDLE;
+	}
+	cl_lock_put(env, lock);
+}
+
+/** Transfer request method table installed by osc_req_init(). */
+static const struct cl_req_operations osc_req_ops = {
+	.cro_prep	= osc_req_prep,
+	.cro_attr_set	= osc_req_attr_set,
+	.cro_completion	= osc_req_completion,
+};
+
+
+/**
+ * Attaches the per-environment osc_io to \a io as the osc layer's slice for
+ * \a obj, using osc_io_ops as its method table.
+ *
+ * \retval 0 always
+ */
+int osc_io_init(const struct lu_env *env,
+		struct cl_object *obj, struct cl_io *io)
+{
+	struct osc_io *oio;
+
+	oio = osc_env_io(env);
+	/* presumably resets the state left over from a previous IO */
+	CL_IO_SLICE_CLEAN(oio, oi_cl);
+	cl_io_slice_add(io, &oio->oi_cl, obj, &osc_io_ops);
+
+	return 0;
+}
+
+/**
+ * Allocates an osc_req from osc_req_kmem and attaches it to \a req as the
+ * osc layer's slice. The slice is freed again in osc_req_completion().
+ *
+ * \retval 0        slice added
+ * \retval -ENOMEM  allocation failed
+ */
+int osc_req_init(const struct lu_env *env, struct cl_device *dev,
+		 struct cl_req *req)
+{
+	struct osc_req *or;
+
+	OBD_SLAB_ALLOC_PTR_GFP(or, osc_req_kmem, __GFP_IO);
+	if (or == NULL)
+		return -ENOMEM;
+
+	cl_req_slice_add(req, &or->or_cl, dev, &osc_req_ops);
+	return 0;
+}
+
+/** @} osc */

diff --git a/drivers/staging/lustre/lustre/osc/osc_lock.c b/drivers/staging/lustre/lustre/osc/osc_lock.c
new file mode 100644
index 0000000..640bc3d
--- /dev/null
+++ b/drivers/staging/lustre/lustre/osc/osc_lock.c

@@ -0,0 +1,1663 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_lock for OSC layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_OSC
+
+# include <linux/libcfs/libcfs.h>
+/* fid_build_reg_res_name() */
+#include <lustre_fid.h>
+
+#include "osc_cl_internal.h"
+
+/** \addtogroup osc
+ *  @{
+ */
+
+#define _PAGEREF_MAGIC  (-10000000)
+
+/*****************************************************************************
+ *
+ * Type conversions.
+ *
+ */
+
+static const struct cl_lock_operations osc_lock_ops;
+static const struct cl_lock_operations osc_lock_lockless_ops;
+static void osc_lock_to_lockless(const struct lu_env *env,
+				 struct osc_lock *ols, int force);
+static int osc_lock_has_pages(struct osc_lock *olck);
+
+/**
+ * Tells whether \a olck currently uses the lockless method table.
+ *
+ * \retval 1 the slice's operations are osc_lock_lockless_ops
+ * \retval 0 otherwise
+ */
+int osc_lock_is_lockless(const struct osc_lock *olck)
+{
+	const struct cl_lock_operations *ops = olck->ols_cl.cls_ops;
+
+	return ops == &osc_lock_lockless_ops;
+}
+
+/**
+ * Looks up the ldlm lock behind \a handle and returns a weak pointer to it.
+ *
+ * The reference obtained by the lookup is dropped before returning, so the
+ * result is not protected from concurrent reclaim and must never be
+ * dereferenced; it is only good for pointer comparisons. Helper for
+ * osc_lock_invariant().
+ *
+ * \retval NULL the handle does not resolve to a lock
+ */
+static struct ldlm_lock *osc_handle_ptr(struct lustre_handle *handle)
+{
+	struct ldlm_lock *lock = ldlm_handle2lock(handle);
+
+	if (lock == NULL)
+		return NULL;
+
+	LDLM_LOCK_PUT(lock);
+	return lock;
+}
+
+/**
+ * Invariant that has to be true all of the time.
+ *
+ * Cross-checks the osc_lock state (->ols_state, ->ols_hold), the cached DLM
+ * lock pointer (->ols_lock) and the DLM handle (->ols_handle) against each
+ * other. Returns non-zero when the lock looks consistent.
+ *
+ * NOTE(review): assuming ergo(a, b) is the usual implication (!a || b), the
+ * first clause is already true for every lock that is not lockless, so with
+ * "||" the remaining checks are only evaluated for a lockless lock that
+ * fails the first clause --- confirm that "||" rather than "&&" is intended.
+ */
+static int osc_lock_invariant(struct osc_lock *ols)
+{
+	/* weak pointer: compared below, never dereferenced */
+	struct ldlm_lock *lock	= osc_handle_ptr(&ols->ols_handle);
+	struct ldlm_lock *olock       = ols->ols_lock;
+	int	       handle_used = lustre_handle_is_used(&ols->ols_handle);
+
+	return
+		ergo(osc_lock_is_lockless(ols),
+		     ols->ols_locklessable && ols->ols_lock == NULL)  ||
+		(ergo(olock != NULL, handle_used) &&
+		 ergo(olock != NULL,
+		      olock->l_handle.h_cookie == ols->ols_handle.cookie) &&
+		 /*
+		  * Check that ->ols_handle and ->ols_lock are consistent, but
+		  * take into account that they are set at the different time.
+		  */
+		 ergo(handle_used,
+		      ergo(lock != NULL && olock != NULL, lock == olock) &&
+		      ergo(lock == NULL, olock == NULL)) &&
+		 ergo(ols->ols_state == OLS_CANCELLED,
+		      olock == NULL && !handle_used) &&
+		 /*
+		  * DLM lock is destroyed only after we have seen cancellation
+		  * ast.
+		  */
+		 ergo(olock != NULL && ols->ols_state < OLS_CANCELLED,
+		      !olock->l_destroyed) &&
+		 ergo(ols->ols_state == OLS_GRANTED,
+		      olock != NULL &&
+		      olock->l_req_mode == olock->l_granted_mode &&
+		      ols->ols_hold));
+}
+
+/*****************************************************************************
+ *
+ * Lock operations.
+ *
+ */
+
+/**
+ * Breaks a link between osc_lock and dlm_lock.
+ *
+ * Does nothing when no DLM lock is attached. Otherwise both directions of
+ * the link (->ols_lock and dlmlock->l_ast_data) and the handle cookie are
+ * cleared under osc_ast_guard, the KMS is recomputed if the DLM lock was
+ * granted, and the DLM lock reference owned by the osc_lock is released.
+ */
+static void osc_lock_detach(const struct lu_env *env, struct osc_lock *olck)
+{
+	struct ldlm_lock *dlmlock;
+
+	spin_lock(&osc_ast_guard);
+	dlmlock = olck->ols_lock;
+	if (dlmlock == NULL) {
+		/* nothing attached */
+		spin_unlock(&osc_ast_guard);
+		return;
+	}
+
+	olck->ols_lock = NULL;
+	/* wb(); --- for all who checks (ols->ols_lock != NULL) before
+	 * call to osc_lock_detach() */
+	dlmlock->l_ast_data = NULL;
+	olck->ols_handle.cookie = 0ULL;
+	spin_unlock(&osc_ast_guard);
+
+	lock_res_and_lock(dlmlock);
+	/* only a granted lock can have contributed to the KMS */
+	if (dlmlock->l_granted_mode == dlmlock->l_req_mode) {
+		struct cl_object *obj = olck->ols_cl.cls_obj;
+		struct cl_attr *attr  = &osc_env_info(env)->oti_attr;
+		__u64 old_kms;
+
+		cl_object_attr_lock(obj);
+		/* Must get the value under the lock to avoid possible races. */
+		old_kms = cl2osc(obj)->oo_oinfo->loi_kms;
+		/* Update the kms. Need to loop all granted locks.
+		 * Not a problem for the client */
+		attr->cat_kms = ldlm_extent_shift_kms(dlmlock, old_kms);
+
+		cl_object_attr_set(env, obj, attr, CAT_KMS);
+		cl_object_attr_unlock(obj);
+	}
+	unlock_res_and_lock(dlmlock);
+
+	/* release a reference taken in osc_lock_upcall0(). */
+	LASSERT(olck->ols_has_ref);
+	lu_ref_del(&dlmlock->l_reference, "osc_lock", olck);
+	LDLM_LOCK_RELEASE(dlmlock);
+	olck->ols_has_ref = 0;
+}
+
+/**
+ * Drops the hold recorded in \a ols->ols_hold, if there is one, by clearing
+ * the flag and calling osc_cancel_base() on the lock's handle and mode.
+ *
+ * \retval 0      no hold was in place
+ * \retval other  result of osc_cancel_base()
+ */
+static int osc_lock_unhold(struct osc_lock *ols)
+{
+	if (!ols->ols_hold)
+		return 0;
+
+	ols->ols_hold = 0;
+	return osc_cancel_base(&ols->ols_handle, ols->ols_einfo.ei_mode);
+}
+
+/**
+ * Releases the use of an osc lock, depending on the state it is in:
+ *
+ *  - OLS_NEW: nothing to undo (only expected for AGL locks);
+ *  - OLS_UPCALL_RECEIVED / OLS_ENQUEUED: drop the hold (if any), detach
+ *    the DLM lock and go back to OLS_NEW;
+ *  - OLS_GRANTED: move to OLS_RELEASED and drop the hold.
+ *
+ * Any other state is a bug.
+ *
+ * \retval 0      nothing to cancel
+ * \retval other  result of osc_lock_unhold() for a granted lock
+ */
+static int osc_lock_unuse(const struct lu_env *env,
+			  const struct cl_lock_slice *slice)
+{
+	struct osc_lock *ols = cl2osc_lock(slice);
+
+	LINVRNT(osc_lock_invariant(ols));
+
+	switch (ols->ols_state) {
+	case OLS_NEW:
+		LASSERT(!ols->ols_hold);
+		LASSERT(ols->ols_agl);
+		return 0;
+	case OLS_UPCALL_RECEIVED:
+		osc_lock_unhold(ols);
+		/* fallthrough: the rest is shared with OLS_ENQUEUED */
+	case OLS_ENQUEUED:
+		LASSERT(!ols->ols_hold);
+		osc_lock_detach(env, ols);
+		ols->ols_state = OLS_NEW;
+		return 0;
+	case OLS_GRANTED:
+		LASSERT(!ols->ols_glimpse);
+		LASSERT(ols->ols_hold);
+		/*
+		 * Move lock into OLS_RELEASED state before calling
+		 * osc_cancel_base() so that possible synchronous cancellation
+		 * (that always happens e.g., for liblustre) sees that lock is
+		 * released.
+		 */
+		ols->ols_state = OLS_RELEASED;
+		return osc_lock_unhold(ols);
+	default:
+		CERROR("Impossible state: %d\n", ols->ols_state);
+		LBUG();
+	}
+}
+
+/**
+ * Finalizes the osc slice of a cl_lock: drops a hold that may still be in
+ * place, checks that no DLM lock is attached and that the page reference
+ * counter is either zero or _PAGEREF_MAGIC, then frees the osc_lock back to
+ * osc_lock_kmem.
+ */
+static void osc_lock_fini(const struct lu_env *env,
+			  struct cl_lock_slice *slice)
+{
+	struct osc_lock  *ols = cl2osc_lock(slice);
+
+	LINVRNT(osc_lock_invariant(ols));
+	/*
+	 * ->ols_hold can still be true at this point if, for example, a
+	 * thread that requested a lock was killed (and released a reference
+	 * to the lock), before reply from a server was received. In this case
+	 * lock is destroyed immediately after upcall.
+	 */
+	osc_lock_unhold(ols);
+	LASSERT(ols->ols_lock == NULL);
+	LASSERT(atomic_read(&ols->ols_pageref) == 0 ||
+		atomic_read(&ols->ols_pageref) == _PAGEREF_MAGIC);
+
+	OBD_SLAB_FREE_PTR(ols, osc_lock_kmem);
+}
+
+/**
+ * Fills \a policy from the descriptor of \a lock: the extent is derived
+ * from the descriptor's object and [cld_start, cld_end] page range by
+ * osc_index2policy(), and the group id is copied from cld_gid.
+ */
+static void osc_lock_build_policy(const struct lu_env *env,
+				  const struct cl_lock *lock,
+				  ldlm_policy_data_t *policy)
+{
+	const struct cl_lock_descr *descr = &lock->cll_descr;
+
+	osc_index2policy(policy, descr->cld_obj,
+			 descr->cld_start, descr->cld_end);
+	policy->l_extent.gid = descr->cld_gid;
+}
+
+/**
+ * Translates cl_lock enqueue flags into the matching LDLM flag bits:
+ *
+ *   CEF_NONBLOCK     -> LDLM_FL_BLOCK_NOWAIT
+ *   CEF_ASYNC        -> LDLM_FL_HAS_INTENT
+ *   CEF_DISCARD_DATA -> LDLM_AST_DISCARD_DATA
+ *
+ * \a enqflags must not contain bits outside of CEF_MASK.
+ */
+static __u64 osc_enq2ldlm_flags(__u32 enqflags)
+{
+	__u64 ldlm_flags = 0;
+
+	LASSERT((enqflags & ~CEF_MASK) == 0);
+
+	if (enqflags & CEF_NONBLOCK)
+		ldlm_flags |= LDLM_FL_BLOCK_NOWAIT;
+
+	if (enqflags & CEF_ASYNC)
+		ldlm_flags |= LDLM_FL_HAS_INTENT;
+
+	if (enqflags & CEF_DISCARD_DATA)
+		ldlm_flags |= LDLM_AST_DISCARD_DATA;
+
+	return ldlm_flags;
+}
+
+/**
+ * Global spin-lock protecting consistency of ldlm_lock::l_ast_data
+ * pointers. Initialized in osc_init().
+ *
+ * In this file it is always taken innermost, i.e. after
+ * lock_res_and_lock() where both are held.
+ */
+spinlock_t osc_ast_guard;
+
+/**
+ * Returns the osc_lock stored in \a dlm_lock->l_ast_data, with a reference
+ * taken on its cl_lock (and a matching "ast" lu_ref), or NULL when there is
+ * no osc_lock attached or it may no longer be used.
+ *
+ * A non-NULL result has to be handed back through osc_ast_data_put().
+ */
+static struct osc_lock *osc_ast_data_get(struct ldlm_lock *dlm_lock)
+{
+	struct osc_lock *olck;
+
+	lock_res_and_lock(dlm_lock);
+	spin_lock(&osc_ast_guard);
+	olck = dlm_lock->l_ast_data;
+	if (olck != NULL) {
+		struct cl_lock *lock = olck->ols_cl.cls_lock;
+		/*
+		 * If osc_lock holds a reference on ldlm lock, return it even
+		 * when cl_lock is in CLS_FREEING state. This way
+		 *
+		 *	 osc_ast_data_get(dlmlock) == NULL
+		 *
+		 * guarantees that all osc references on dlmlock were
+		 * released. osc_dlm_blocking_ast0() relies on that.
+		 */
+		if (lock->cll_state < CLS_FREEING || olck->ols_has_ref) {
+			cl_lock_get_trust(lock);
+			lu_ref_add_atomic(&lock->cll_reference,
+					  "ast", current);
+		} else
+			olck = NULL;
+	}
+	spin_unlock(&osc_ast_guard);
+	unlock_res_and_lock(dlm_lock);
+	return olck;
+}
+
+/**
+ * Counterpart of osc_ast_data_get(): removes the "ast" lu_ref and drops
+ * the cl_lock reference of \a olck.
+ */
+static void osc_ast_data_put(const struct lu_env *env, struct osc_lock *olck)
+{
+	struct cl_lock *lock = olck->ols_cl.cls_lock;
+
+	lu_ref_del(&lock->cll_reference, "ast", current);
+	cl_lock_put(env, lock);
+}
+
+/**
+ * Updates object attributes from a lock value block (lvb) received together
+ * with the DLM lock reply from the server. Copy of osc_update_enqueue()
+ * logic.
+ *
+ * This can be optimized to not update attributes when lock is a result of a
+ * local match.
+ *
+ * Called under lock and resource spin-locks.
+ *
+ * \param rc  enqueue result. With 0 the lvb is re-read from the DLM lock
+ *	      and the KMS may be extended; with -ENAVAIL on a glimpse lock
+ *	      the attributes are updated but the KMS is left alone; with any
+ *	      other value nothing is updated.
+ *
+ * Nothing is done at all unless LDLM_FL_LVB_READY is set in ->ols_flags.
+ */
+static void osc_lock_lvb_update(const struct lu_env *env, struct osc_lock *olck,
+				int rc)
+{
+	struct ost_lvb    *lvb;
+	struct cl_object  *obj;
+	struct lov_oinfo  *oinfo;
+	struct cl_attr    *attr;
+	unsigned	   valid;
+
+	ENTRY;
+
+	if (!(olck->ols_flags & LDLM_FL_LVB_READY))
+		RETURN_EXIT;
+
+	lvb   = &olck->ols_lvb;
+	obj   = olck->ols_cl.cls_obj;
+	oinfo = cl2osc(obj)->oo_oinfo;
+	attr  = &osc_env_info(env)->oti_attr;
+	valid = CAT_BLOCKS | CAT_ATIME | CAT_CTIME | CAT_MTIME | CAT_SIZE;
+	/*
+	 * NOTE(review): \a attr is filled from ->ols_lvb here, before the
+	 * lvb is re-read from the DLM lock in the rc == 0 branch below ---
+	 * confirm the two copies are expected to match.
+	 */
+	cl_lvb2attr(attr, lvb);
+
+	cl_object_attr_lock(obj);
+	if (rc == 0) {
+		struct ldlm_lock  *dlmlock;
+		__u64 size;
+
+		dlmlock = olck->ols_lock;
+		LASSERT(dlmlock != NULL);
+
+		/* re-grab LVB from a dlm lock under DLM spin-locks. */
+		*lvb = *(struct ost_lvb *)dlmlock->l_lvb_data;
+		size = lvb->lvb_size;
+		/* Extend KMS up to the end of this lock and no further
+		 * A lock on [x,y] means a KMS of up to y + 1 bytes! */
+		if (size > dlmlock->l_policy_data.l_extent.end)
+			size = dlmlock->l_policy_data.l_extent.end + 1;
+		/* the KMS is never moved backwards here */
+		if (size >= oinfo->loi_kms) {
+			LDLM_DEBUG(dlmlock, "lock acquired, setting rss="LPU64
+				   ", kms="LPU64, lvb->lvb_size, size);
+			valid |= CAT_KMS;
+			attr->cat_kms = size;
+		} else {
+			LDLM_DEBUG(dlmlock, "lock acquired, setting rss="
+				   LPU64"; leaving kms="LPU64", end="LPU64,
+				   lvb->lvb_size, oinfo->loi_kms,
+				   dlmlock->l_policy_data.l_extent.end);
+		}
+		ldlm_lock_allow_match_locked(dlmlock);
+	} else if (rc == -ENAVAIL && olck->ols_glimpse) {
+		CDEBUG(D_INODE, "glimpsed, setting rss="LPU64"; leaving"
+		       " kms="LPU64"\n", lvb->lvb_size, oinfo->loi_kms);
+	} else
+		valid = 0;
+
+	if (valid != 0)
+		cl_object_attr_set(env, obj, attr, valid);
+
+	cl_object_attr_unlock(obj);
+
+	EXIT;
+}
+
+/**
+ * Called when a lock is granted, from an upcall (when server returned a
+ * granted lock), or from completion AST, when server returned a blocked lock.
+ *
+ * Called under lock and resource spin-locks, that are released temporarily
+ * here.
+ *
+ * Does nothing if the osc lock is already in OLS_GRANTED state or beyond.
+ * Otherwise it records the granted mode, page extent and group id in a
+ * descriptor, switches to OLS_GRANTED, refreshes the object attributes from
+ * the lvb and finally passes the descriptor to cl_lock_modify() and wakes
+ * up waiters with cl_lock_signal().
+ */
+static void osc_lock_granted(const struct lu_env *env, struct osc_lock *olck,
+			     struct ldlm_lock *dlmlock, int rc)
+{
+	struct ldlm_extent   *ext;
+	struct cl_lock       *lock;
+	struct cl_lock_descr *descr;
+
+	LASSERT(dlmlock->l_granted_mode == dlmlock->l_req_mode);
+
+	ENTRY;
+	if (olck->ols_state < OLS_GRANTED) {
+		lock  = olck->ols_cl.cls_lock;
+		ext   = &dlmlock->l_policy_data.l_extent;
+		/* per-environment scratch descriptor */
+		descr = &osc_env_info(env)->oti_descr;
+		descr->cld_obj = lock->cll_descr.cld_obj;
+
+		/* XXX check that ->l_granted_mode is valid. */
+		descr->cld_mode  = osc_ldlm2cl_lock(dlmlock->l_granted_mode);
+		descr->cld_start = cl_index(descr->cld_obj, ext->start);
+		descr->cld_end   = cl_index(descr->cld_obj, ext->end);
+		descr->cld_gid   = ext->gid;
+		/*
+		 * tell upper layers the extent of the lock that was actually
+		 * granted
+		 */
+		olck->ols_state = OLS_GRANTED;
+		osc_lock_lvb_update(env, olck, rc);
+
+		/* release DLM spin-locks to allow cl_lock_{modify,signal}()
+		 * to take a semaphore on a parent lock. This is safe, because
+		 * spin-locks are needed to protect consistency of
+		 * dlmlock->l_*_mode and LVB, and we have finished processing
+		 * them. */
+		unlock_res_and_lock(dlmlock);
+		cl_lock_modify(env, lock, descr);
+		cl_lock_signal(env, lock);
+		LINVRNT(osc_lock_invariant(olck));
+		/* the caller expects the DLM spin-locks to be held on return */
+		lock_res_and_lock(dlmlock);
+	}
+	EXIT;
+}
+
+/**
+ * Success half of the enqueue upcall: binds the DLM lock identified by
+ * ->ols_handle to \a olck.
+ *
+ * The DLM lock is looked up from the handle and stored in ->ols_lock; if it
+ * is already granted, osc_lock_granted() finishes the job right away. The
+ * osc_lock then takes a hold on the DLM lock (->ols_hold) and keeps the
+ * reference returned by the lookup (->ols_has_ref) until
+ * osc_lock_detach().
+ */
+static void osc_lock_upcall0(const struct lu_env *env, struct osc_lock *olck)
+
+{
+	struct ldlm_lock *dlmlock;
+
+	ENTRY;
+
+	dlmlock = ldlm_handle2lock_long(&olck->ols_handle, 0);
+	LASSERT(dlmlock != NULL);
+
+	lock_res_and_lock(dlmlock);
+	spin_lock(&osc_ast_guard);
+	/* l_ast_data must already point back at this osc_lock */
+	LASSERT(dlmlock->l_ast_data == olck);
+	LASSERT(olck->ols_lock == NULL);
+	olck->ols_lock = dlmlock;
+	spin_unlock(&osc_ast_guard);
+
+	/*
+	 * Lock might be not yet granted. In this case, completion ast
+	 * (osc_ldlm_completion_ast()) comes later and finishes lock
+	 * granting.
+	 */
+	if (dlmlock->l_granted_mode == dlmlock->l_req_mode)
+		osc_lock_granted(env, olck, dlmlock, 0);
+	unlock_res_and_lock(dlmlock);
+
+	/*
+	 * osc_enqueue_interpret() decrefs asynchronous locks, counter
+	 * this.
+	 */
+	ldlm_lock_addref(&olck->ols_handle, olck->ols_einfo.ei_mode);
+	olck->ols_hold = 1;
+
+	/* lock reference taken by ldlm_handle2lock_long() is owned by
+	 * osc_lock and released in osc_lock_detach() */
+	lu_ref_add(&dlmlock->l_reference, "osc_lock", olck);
+	olck->ols_has_ref = 1;
+}
+
+/**
+ * Lock upcall function that is executed either when a reply to ENQUEUE rpc is
+ * received from a server, or after osc_enqueue_base() matched a local DLM
+ * lock.
+ *
+ * The whole body runs in a nested environment and under the cl_lock mutex.
+ * On failure the DLM lock (if the handle still resolves) is disconnected from
+ * the osc_lock and failed for matching; on success osc_lock_upcall0() binds
+ * the two together. Finally the "upcall" hold, the lu_ref and the cl_lock
+ * reference associated with the cookie are dropped.
+ *
+ * \param cookie  the osc_lock that was passed to osc_enqueue_base() as upcall
+ *		  cookie
+ * \param errcode enqueue result; converted with ldlm_error2errno() when the
+ *		  osc_lock is in OLS_ENQUEUED state, replaced by -EIO when it
+ *		  is in OLS_CANCELLED state
+ *
+ * \return \a errcode, unchanged
+ */
+static int osc_lock_upcall(void *cookie, int errcode)
+{
+	struct osc_lock	 *olck  = cookie;
+	struct cl_lock_slice    *slice = &olck->ols_cl;
+	struct cl_lock	  *lock  = slice->cls_lock;
+	struct lu_env	   *env;
+	struct cl_env_nest       nest;
+
+	ENTRY;
+	env = cl_env_nested_get(&nest);
+	if (!IS_ERR(env)) {
+		int rc;
+
+		cl_lock_mutex_get(env, lock);
+
+		LASSERT(lock->cll_state >= CLS_QUEUING);
+		if (olck->ols_state == OLS_ENQUEUED) {
+			olck->ols_state = OLS_UPCALL_RECEIVED;
+			rc = ldlm_error2errno(errcode);
+		} else if (olck->ols_state == OLS_CANCELLED) {
+			rc = -EIO;
+		} else {
+			CERROR("Impossible state: %d\n", olck->ols_state);
+			LBUG();
+		}
+		if (rc) {
+			struct ldlm_lock *dlmlock;
+
+			dlmlock = ldlm_handle2lock(&olck->ols_handle);
+			if (dlmlock != NULL) {
+				lock_res_and_lock(dlmlock);
+				spin_lock(&osc_ast_guard);
+				LASSERT(olck->ols_lock == NULL);
+				dlmlock->l_ast_data = NULL;
+				olck->ols_handle.cookie = 0ULL;
+				spin_unlock(&osc_ast_guard);
+				ldlm_lock_fail_match_locked(dlmlock);
+				unlock_res_and_lock(dlmlock);
+				LDLM_LOCK_PUT(dlmlock);
+			}
+		} else {
+			if (olck->ols_glimpse)
+				olck->ols_glimpse = 0;
+			osc_lock_upcall0(env, olck);
+		}
+
+		/* Error handling, some errors are tolerable: -EUSERS on a
+		 * locklessable lock and -ENAVAIL on a glimpse lock are both
+		 * converted into success below. */
+		if (olck->ols_locklessable && rc == -EUSERS) {
+			/* This is a tolerable error, turn this lock into
+			 * lockless lock.
+			 */
+			osc_object_set_contended(cl2osc(slice->cls_obj));
+			LASSERT(slice->cls_ops == &osc_lock_ops);
+
+			/* Change this lock to ldlmlock-less lock. */
+			osc_lock_to_lockless(env, olck, 1);
+			olck->ols_state = OLS_GRANTED;
+			rc = 0;
+		} else if (olck->ols_glimpse && rc == -ENAVAIL) {
+			osc_lock_lvb_update(env, olck, rc);
+			cl_lock_delete(env, lock);
+			/* Hide the error. */
+			rc = 0;
+		}
+
+		if (rc == 0) {
+			/* For the AGL case, the RPC sponsor may exit the
+			 * cl_lock processing without wait() having been called
+			 * before the related OSC lock upcall(). So update the
+			 * lock status according to the enqueue result inside
+			 * the AGL upcall(). CLF_FROM_UPCALL is set only for the
+			 * duration of the cl_wait_try() call. */
+			if (olck->ols_agl) {
+				lock->cll_flags |= CLF_FROM_UPCALL;
+				cl_wait_try(env, lock);
+				lock->cll_flags &= ~CLF_FROM_UPCALL;
+				if (!olck->ols_glimpse)
+					olck->ols_agl = 0;
+			}
+			cl_lock_signal(env, lock);
+			/* del user for lock upcall cookie */
+			cl_unuse_try(env, lock);
+		} else {
+			/* del user for lock upcall cookie, then record the
+			 * failure on the cl_lock */
+			cl_lock_user_del(env, lock);
+			cl_lock_error(env, lock, rc);
+		}
+
+		/* release cookie reference, acquired by osc_lock_enqueue() */
+		cl_lock_hold_release(env, lock, "upcall", lock);
+		cl_lock_mutex_put(env, lock);
+
+		lu_ref_del(&lock->cll_reference, "upcall", lock);
+		/* This may be the last reference, so must be called after
+		 * cl_lock_mutex_put(). */
+		cl_lock_put(env, lock);
+
+		cl_env_nested_put(&nest, env);
+	} else {
+		/* should never happen, similar to osc_ldlm_blocking_ast(). */
+		LBUG();
+	}
+	RETURN(errcode);
+}
+
+/**
+ * Shared body of the blocking/cancellation ast handling for an osc_lock that
+ * is bound to \a dlmlock.
+ *
+ * Drops the hold on the DLM lock, optionally marks the osc_lock as
+ * OLS_BLOCKED, and then cancels and deletes the cl_lock.
+ *
+ * \param env      environment used for the cl_lock calls
+ * \param dlmlock  DLM lock the ast was delivered for; must equal
+ *		   olck->ols_lock
+ * \param olck     osc lock attached to \a dlmlock; must not be lockless
+ * \param blocking non-zero when entered for a blocking (as opposed to
+ *		   cancellation) ast
+ */
+static void osc_lock_blocking(const struct lu_env *env,
+			      struct ldlm_lock *dlmlock,
+			      struct osc_lock *olck, int blocking)
+{
+	struct cl_lock *cl = olck->ols_cl.cls_lock;
+
+	CLASSERT(OLS_BLOCKED < OLS_CANCELLED);
+	LASSERT(olck->ols_lock == dlmlock);
+	LASSERT(!osc_lock_is_lockless(olck));
+
+	/*
+	 * The lock can still be addref-ed at this point, for instance when
+	 * a blocking ast arrives for a lock that failed.
+	 */
+	osc_lock_unhold(olck);
+
+	if (blocking && olck->ols_state < OLS_BLOCKED) {
+		/*
+		 * Switch to OLS_BLOCKED before cancelling: the cancel
+		 * re-enters osc_lock_blocking() recursively, with the state
+		 * set to OLS_CANCELLED.
+		 */
+		olck->ols_state = OLS_BLOCKED;
+	}
+
+	/*
+	 * Whatever way the blocking ast was entered (see the use cases
+	 * listed above osc_ldlm_blocking_ast()), the lock is cancelled and
+	 * destroyed at least once. Both calls below are idempotent.
+	 */
+	cl_lock_cancel(env, cl);
+	cl_lock_delete(env, cl);
+}
+
+/**
+ * Worker for osc_ldlm_blocking_ast() that copes with the cl_lock and
+ * ldlm_lock caches disagreeing with each other.
+ *
+ * \param env     (nested) environment to run in
+ * \param dlmlock DLM lock the ast was delivered for
+ * \param data    expected to equal the osc_lock when one is still attached
+ * \param flag    LDLM_CB_BLOCKING or LDLM_CB_CANCELING
+ *
+ * \return 0 when no explicit DLM cancel was needed, otherwise the result of
+ *	   ldlm_cli_cancel()
+ */
+static int osc_dlm_blocking_ast0(const struct lu_env *env,
+				 struct ldlm_lock *dlmlock,
+				 void *data, int flag)
+{
+	struct lustre_handle *lockh;
+	struct osc_lock *olck;
+	int need_cancel = 0;
+
+	LASSERT(flag == LDLM_CB_BLOCKING || flag == LDLM_CB_CANCELING);
+
+	olck = osc_ast_data_get(dlmlock);
+	if (olck == NULL) {
+		/*
+		 * The DLM lock exists without a cl_lock attached. This is
+		 * a `normal' race: memory pressure can remove a cl_object
+		 * and its cl_lock's together with all pages.
+		 */
+		need_cancel = (flag == LDLM_CB_BLOCKING);
+	} else {
+		struct cl_lock *lock = olck->ols_cl.cls_lock;
+
+		cl_lock_mutex_get(env, lock);
+		LINVRNT(osc_lock_invariant(olck));
+		if (olck->ols_ast_wait) {
+			/* wake up osc_lock_use() */
+			cl_lock_signal(env, lock);
+			olck->ols_ast_wait = 0;
+		}
+		/*
+		 * While this thread slept on the lock mutex the lock may
+		 * have been canceled; olck itself is pinned in memory.
+		 */
+		if (olck != dlmlock->l_ast_data) {
+			need_cancel = 1;
+		} else {
+			/*
+			 * NOTE: DLM also sends blocking AST's for failed
+			 *       locks (still in a pre-OLS_GRANTED state).
+			 *       Those must be canceled as well, otherwise
+			 *       the DLM lock is never destroyed and stays
+			 *       in memory.
+			 *
+			 *       An alternative would be to call
+			 *       ldlm_cli_cancel() directly here for
+			 *       osc_locks with ols_state < OLS_GRANTED,
+			 *       keeping the invariant that ->clo_cancel()
+			 *       is only called for granted locks.
+			 */
+			LASSERT(data == olck);
+			osc_lock_blocking(env, dlmlock, olck,
+					  flag == LDLM_CB_BLOCKING);
+		}
+		cl_lock_mutex_put(env, lock);
+		osc_ast_data_put(env, olck);
+	}
+
+	if (!need_cancel)
+		return 0;
+
+	lockh = &osc_env_info(env)->oti_handle;
+	ldlm_lock2handle(dlmlock, lockh);
+	return ldlm_cli_cancel(lockh, LCF_ASYNC);
+}
+
+/**
+ * Blocking ast, installed as ldlm_lock::l_blocking_ast() for client extent
+ * locks. ldlm invokes it when the dlm lock either stands in the way of some
+ * other lock or is being canceled.
+ *
+ * The control flow is tricky because ldlm delivers blocking and cancellation
+ * ast's through this one call-back.
+ *
+ * \param dlmlock lock the ast occurred for.
+ *
+ * \param new description of the conflicting lock, for a blocking ast.
+ *
+ * \param data value of dlmlock->l_ast_data
+ *
+ * \param flag LDLM_CB_BLOCKING or LDLM_CB_CANCELING; tells blocking and
+ *	     cancellation ast's apart.
+ *
+ * Possible use cases:
+ *
+ *     - ldlm calls dlmlock->l_blocking_ast(..., LDLM_CB_CANCELING) to cancel
+ *       the lock because of lock lru pressure, or because the user
+ *       explicitly asked for locks to be purged.
+ *
+ *     - ldlm calls dlmlock->l_blocking_ast(..., LDLM_CB_BLOCKING) to tell us
+ *       that dlmlock conflicts with a lock some client is enqueuing. The
+ *       lock is canceled:
+ *
+ *	   - cl_lock_cancel() is called. osc_lock_cancel() calls
+ *	     ldlm_cli_cancel(), which in turn calls
+ *
+ *		  dlmlock->l_blocking_ast(..., LDLM_CB_CANCELING)
+ *
+ *	     and so re-enters osc_ldlm_blocking_ast() recursively.
+ *
+ *     - the client cancels the lock voluntarily (e.g., as a part of early
+ *       cancellation):
+ *
+ *	   cl_lock_cancel()->
+ *	     osc_lock_cancel()->
+ *	       ldlm_cli_cancel()->
+ *		 dlmlock->l_blocking_ast(..., LDLM_CB_CANCELING)
+ *
+ * \return 0 on success or when the worker reported -ENODATA, otherwise the
+ *	   negative error (which is also logged)
+ */
+static int osc_ldlm_blocking_ast(struct ldlm_lock *dlmlock,
+				 struct ldlm_lock_desc *new, void *data,
+				 int flag)
+{
+	struct cl_env_nest nest;
+	struct lu_env     *env;
+	int		rc;
+
+	/*
+	 * We may be running inside an outer IO, e.g.,
+	 *
+	 *     cl_enqueue()->...
+	 *       ->osc_enqueue_base()->...
+	 *	 ->ldlm_prep_elc_req()->...
+	 *	   ->ldlm_cancel_callback()->...
+	 *	     ->osc_ldlm_blocking_ast()
+	 *
+	 * so a fresh environment is created to leave the outer context
+	 * uncorrupted.
+	 */
+	env = cl_env_nested_get(&nest);
+	if (IS_ERR(env)) {
+		rc = PTR_ERR(env);
+		/*
+		 * XXX This should never happen, as cl_lock is
+		 * stuck. Pre-allocated environment a la vvp_inode_fini_env
+		 * should be used.
+		 */
+		LBUG();
+	} else {
+		rc = osc_dlm_blocking_ast0(env, dlmlock, data, flag);
+		cl_env_nested_put(&nest, env);
+	}
+
+	if (rc == -ENODATA)
+		rc = 0;
+	else if (rc != 0)
+		CERROR("BAST failed: %d\n", rc);
+	return rc;
+}
+
+/**
+ * Completion ast for osc extent locks.
+ *
+ * Runs the generic ldlm_completion_ast_async() first and then, in a nested
+ * environment, copies the LVB into the attached osc_lock and finishes
+ * granting when the osc_lock is already bound to \a dlmlock.
+ *
+ * \param dlmlock DLM lock the completion arrived for
+ * \param flags   passed through to ldlm_completion_ast_async()
+ * \param data    passed through to ldlm_completion_ast_async()
+ *
+ * \return the ldlm_completion_ast_async() result when it is non-zero;
+ *	   otherwise 0, -ELDLM_NO_LOCK_DATA when no osc_lock is attached, or
+ *	   the error from obtaining the environment
+ */
+static int osc_ldlm_completion_ast(struct ldlm_lock *dlmlock,
+				   __u64 flags, void *data)
+{
+	struct cl_env_nest nest;
+	struct lu_env     *env;
+	struct osc_lock   *olck;
+	int rc;
+	int dlmrc;
+
+	/* the dlm part of the work comes first */
+	dlmrc = ldlm_completion_ast_async(dlmlock, flags, data);
+
+	/* after that, the cl_lock is notified */
+	env = cl_env_nested_get(&nest);
+	if (IS_ERR(env)) {
+		rc = PTR_ERR(env);
+		return dlmrc ?: rc;
+	}
+
+	olck = osc_ast_data_get(dlmlock);
+	if (olck == NULL) {
+		rc = -ELDLM_NO_LOCK_DATA;
+	} else {
+		struct cl_lock *lock = olck->ols_cl.cls_lock;
+
+		cl_lock_mutex_get(env, lock);
+		/*
+		 * ldlm_handle_cp_callback() copied the LVB from the request
+		 * into lock->l_lvb_data; keep a copy in the osc_lock.
+		 */
+		LASSERT(dlmlock->l_lvb_data != NULL);
+		lock_res_and_lock(dlmlock);
+		olck->ols_lvb = *(struct ost_lvb *)dlmlock->l_lvb_data;
+		/*
+		 * A NULL ols_lock means that the upcall (osc_lock_upcall())
+		 * hasn't been called yet. Nothing is done in that case: the
+		 * upcall will bind olck to dlmlock and signal the waiters.
+		 *
+		 * This keeps the invariant that osc_lock and ldlm_lock are
+		 * always bound while osc_lock is in OLS_GRANTED state.
+		 */
+		if (olck->ols_lock != NULL &&
+		    dlmlock->l_granted_mode == dlmlock->l_req_mode)
+			osc_lock_granted(env, olck, dlmlock, dlmrc);
+		unlock_res_and_lock(dlmlock);
+
+		if (dlmrc != 0) {
+			CL_LOCK_DEBUG(D_ERROR, env, lock,
+				      "dlmlock returned %d\n", dlmrc);
+			cl_lock_error(env, lock, dlmrc);
+		}
+		cl_lock_mutex_put(env, lock);
+		osc_ast_data_put(env, olck);
+		rc = 0;
+	}
+	cl_env_nested_put(&nest, env);
+
+	return dlmrc ?: rc;
+}
+
+/**
+ * Glimpse ast for osc extent locks: packs an LVB reply for the glimpse
+ * callback request and fills it through cl_object_glimpse().
+ *
+ * \param dlmlock DLM lock being glimpsed
+ * \param data    the LDLM_GL_CALLBACK ptlrpc request
+ *
+ * \return 0 or a negative error; the same value is stored in
+ *	   req->rq_status
+ */
+static int osc_ldlm_glimpse_ast(struct ldlm_lock *dlmlock, void *data)
+{
+	struct ptlrpc_request *req = data;
+	struct cl_env_nest     nest;
+	struct lu_env	 *env;
+	struct osc_lock       *olck;
+	int		    rc;
+
+	LASSERT(lustre_msg_get_opc(req->rq_reqmsg) == LDLM_GL_CALLBACK);
+
+	env = cl_env_nested_get(&nest);
+	if (IS_ERR(env)) {
+		rc = PTR_ERR(env);
+		goto out;
+	}
+
+	/* osc_ast_data_get() must come after the environment is allocated:
+	 * it takes a reference on a lock, and that reference can only be
+	 * dropped inside an environment.
+	 */
+	olck = osc_ast_data_get(dlmlock);
+	if (olck == NULL) {
+		/*
+		 * These errors are normal races; ptlrpc_error() is not
+		 * called so that the console isn't flooded with messages.
+		 */
+		lustre_pack_reply(req, 1, NULL, NULL);
+		rc = -ELDLM_NO_LOCK_DATA;
+	} else {
+		struct cl_lock     *lock = olck->ols_cl.cls_lock;
+		struct req_capsule *cap  = &req->rq_pill;
+
+		/* The cl_lock mutex is deliberately not taken for glimpse;
+		 * see LU-1274 for details.
+		 * It's okay for the cl_lock to be cancelled meanwhile,
+		 * because the server can handle this race; see
+		 * ldlm_server_glimpse_ast() for details. */
+		req_capsule_extend(cap, &RQF_LDLM_GL_CALLBACK);
+		req_capsule_set_size(cap, &RMF_DLM_LVB, RCL_SERVER,
+				     sizeof(struct ost_lvb));
+		rc = req_capsule_server_pack(cap);
+		if (rc == 0) {
+			struct ost_lvb   *lvb;
+			struct cl_object *obj;
+
+			lvb = req_capsule_server_get(cap, &RMF_DLM_LVB);
+			obj = lock->cll_descr.cld_obj;
+			rc = cl_object_glimpse(env, obj, lvb);
+		}
+		if (!exp_connect_lvb_type(req->rq_export))
+			req_capsule_shrink(cap, &RMF_DLM_LVB,
+					   sizeof(struct ost_lvb_v1),
+					   RCL_SERVER);
+		osc_ast_data_put(env, olck);
+	}
+	cl_env_nested_put(&nest, env);
+
+out:
+	req->rq_status = rc;
+	return rc;
+}
+
+/**
+ * Weight of a lock slice: the coh_pages counter of the header of the object
+ * the slice belongs to.
+ */
+static unsigned long osc_lock_weigh(const struct lu_env *env,
+				    const struct cl_lock_slice *slice)
+{
+	const struct cl_object_header *hdr = cl_object_header(slice->cls_obj);
+
+	/*
+	 * coh_page_guard is not taken: the exact number of pages is of no
+	 * interest here.
+	 */
+	return hdr->coh_pages;
+}
+
+/**
+ * Weighs a dlm lock for early cancellation.
+ *
+ * XXX: the result ought to be the number of pages covered by \a dlmlock.
+ *
+ * \return 0 when no environment could be obtained or no osc_lock is attached,
+ *	   otherwise the value of cl_lock_weigh() for the attached cl_lock
+ */
+static unsigned long osc_ldlm_weigh_ast(struct ldlm_lock *dlmlock)
+{
+	struct cl_env_nest  nest;
+	struct lu_env      *env;
+	struct osc_lock    *olck;
+	struct cl_lock     *cl;
+	unsigned long       weight;
+	ENTRY;
+
+	might_sleep();
+	/*
+	 * The calling context is complex: we can get here because a lock is
+	 * being cancelled, or because of user input, so a new environment is
+	 * made. Using the upper context is probably safe implementation-wise
+	 * since cl_lock_put doesn't modify environment variables. But in
+	 * case of ..
+	 */
+	env = cl_env_nested_get(&nest);
+	if (IS_ERR(env))
+		/* Most likely out of memory; lean towards eliminating this
+		 * lock */
+		RETURN(0);
+
+	LASSERT(dlmlock->l_resource->lr_type == LDLM_EXTENT);
+	olck = osc_ast_data_get(dlmlock);
+	if (olck == NULL)
+		/* Memory pressure destroyed the cl_lock. Giving this kind
+		 * of lock a lower cost is the more reasonable choice.
+		 */
+		GOTO(out, weight = 0);
+
+	cl = olck->ols_cl.cls_lock;
+	cl_lock_mutex_get(env, cl);
+	weight = cl_lock_weigh(env, cl);
+	cl_lock_mutex_put(env, cl);
+	osc_ast_data_put(env, olck);
+	EXIT;
+
+out:
+	cl_env_nested_put(&nest, env);
+	return weight;
+}
+
+/**
+ * Fills \a einfo for enqueuing the extent lock described by \a clock.
+ *
+ * \param env   unused here
+ * \param clock cl_lock whose mode is translated into the ldlm mode
+ * \param lock  osc lock stored as call-back data (goes into ->l_ast_data)
+ * \param einfo enqueue info to initialize
+ */
+static void osc_lock_build_einfo(const struct lu_env *env,
+				 const struct cl_lock *clock,
+				 struct osc_lock *lock,
+				 struct ldlm_enqueue_info *einfo)
+{
+	enum cl_lock_mode mode = clock->cll_descr.cld_mode;
+
+	/*
+	 * Glimpse locks are all enqueued in read mode for now. Later on the
+	 * client might prefer an LCK_PW lock for a glimpse on a file that is
+	 * opened for write.
+	 */
+	if (mode == CLM_PHANTOM)
+		mode = CLM_READ;
+
+	einfo->ei_type = LDLM_EXTENT;
+	einfo->ei_mode = osc_cl_lock2ldlm(mode);
+
+	/* ast call-backs */
+	einfo->ei_cb_bl = osc_ldlm_blocking_ast;
+	einfo->ei_cb_cp = osc_ldlm_completion_ast;
+	einfo->ei_cb_gl = osc_ldlm_glimpse_ast;
+	einfo->ei_cb_wg = osc_ldlm_weigh_ast;
+
+	/* value to be put into ->l_ast_data */
+	einfo->ei_cbdata = lock;
+}
+
+/**
+ * Decides whether the lock should become a lockless lock and, if so,
+ * switches its slice operations to osc_lock_lockless_ops.
+ *
+ * Things that are checked:
+ * - whether the lock explicitly requires a non-lockless lock;
+ * - the lock request type of the io, ci_lockreq;
+ * - the enqueue rpc sent to the ost makes the further decision;
+ * - lockless truncate gets special treatment.
+ *
+ *  More policy could be added here, e.g., never doing lockless-io for large
+ *  extents.
+ *
+ * \param env   environment; its io is consulted when \a force is zero
+ * \param ols   osc lock in OLS_NEW or OLS_UPCALL_RECEIVED state
+ * \param force non-zero to convert unconditionally
+ */
+static void osc_lock_to_lockless(const struct lu_env *env,
+				 struct osc_lock *ols, int force)
+{
+	struct cl_lock_slice *slice = &ols->ols_cl;
+	int go_lockless;
+
+	LASSERT(ols->ols_state == OLS_NEW ||
+		ols->ols_state == OLS_UPCALL_RECEIVED);
+
+	if (force) {
+		go_lockless = 1;
+	} else {
+		struct cl_io *io = osc_env_io(env)->oi_cl.cis_io;
+		struct cl_object *obj = slice->cls_obj;
+		struct osc_object *oob = cl2osc(obj);
+		const struct osc_device *osd = lu2osc_dev(obj->co_lu.lo_dev);
+		struct obd_connect_data *ocd;
+
+		LASSERT(io->ci_lockreq == CILR_MANDATORY ||
+			io->ci_lockreq == CILR_MAYBE ||
+			io->ci_lockreq == CILR_NEVER);
+
+		ocd = &class_exp2cliimp(osc_export(oob))->imp_connect_data;
+		ols->ols_locklessable = (io->ci_type != CIT_SETATTR) &&
+				(io->ci_lockreq == CILR_MAYBE) &&
+				(ocd->ocd_connect_flags & OBD_CONNECT_SRVLOCK);
+
+		go_lockless =
+		    io->ci_lockreq == CILR_NEVER ||
+		    /* lockless IO */
+		    (ols->ols_locklessable && osc_object_is_contended(oob)) ||
+		    /* lockless truncate */
+		    (cl_io_is_trunc(io) &&
+		     (ocd->ocd_connect_flags & OBD_CONNECT_TRUNCLOCK) &&
+		     osd->od_lockless_truncate);
+	}
+
+	if (go_lockless) {
+		ols->ols_locklessable = 1;
+		slice->cls_ops = &osc_lock_lockless_ops;
+	}
+	LASSERT(ergo(ols->ols_glimpse, !osc_lock_is_lockless(ols)));
+}
+
+/**
+ * Tells whether the lock being enqueued (\a qing) is compatible with an
+ * already queued one (\a qed).
+ *
+ * \return 1 when \a qed is a glimpse lock that either reached
+ *	   OLS_UPCALL_RECEIVED or faces a CLM_READ \a qing, or when both
+ *	   locks are CLM_READ; 0 otherwise
+ */
+static int osc_lock_compatible(const struct osc_lock *qing,
+			       const struct osc_lock *qed)
+{
+	const enum cl_lock_mode qing_mode =
+		qing->ols_cl.cls_lock->cll_descr.cld_mode;
+
+	if (qed->ols_glimpse) {
+		if (qed->ols_state >= OLS_UPCALL_RECEIVED)
+			return 1;
+		if (qing_mode == CLM_READ)
+			return 1;
+	}
+
+	if (qing_mode != CLM_READ)
+		return 0;
+	return qed->ols_cl.cls_lock->cll_descr.cld_mode == CLM_READ;
+}
+
+/**
+ * Cancel all conflicting locks and wait for them to be destroyed.
+ *
+ * This function is used for two purposes:
+ *
+ *     - early cancel all conflicting locks before starting IO, and
+ *
+ *     - guarantee that pages added to the page cache by lockless IO are never
+ *       covered by locks other than lockless IO lock, and, hence, are not
+ *       visible to other threads.
+ *
+ * The object's lock list is scanned, under coh_lock_guard, up to \a olck's
+ * own cl_lock; the first overlapping, living, non-group lock that is not
+ * compatible (or any such lock when \a olck is lockless) is the conflict.
+ *
+ * \param env  environment used to release the conflict reference
+ * \param olck osc lock about to be enqueued; its cl_lock must be mutexed
+ *
+ * \retval 0	no conflict, a glimpse lock, or a group lock that is sent
+ *		  to the server regardless of the conflict
+ * \retval CLO_WAIT a conflicting lock was stored in lock->cll_conflict
+ */
+static int osc_lock_enqueue_wait(const struct lu_env *env,
+				 const struct osc_lock *olck)
+{
+	struct cl_lock		*lock = olck->ols_cl.cls_lock;
+	struct cl_lock_descr	*descr = &lock->cll_descr;
+	struct cl_object_header	*hdr = cl_object_header(descr->cld_obj);
+	struct cl_lock		*scan;
+	struct cl_lock		*conflict = NULL;
+	int			 lockless = osc_lock_is_lockless(olck);
+	int			 rc = 0;
+	ENTRY;
+
+	LASSERT(cl_lock_is_mutexed(lock));
+
+	/* make it enqueue anyway for glimpse lock, because we actually
+	 * don't need to cancel any conflicting locks. */
+	if (olck->ols_glimpse)
+		/* leave through RETURN() so that it pairs with ENTRY */
+		RETURN(0);
+
+	spin_lock(&hdr->coh_lock_guard);
+	list_for_each_entry(scan, &hdr->coh_locks, cll_linkage) {
+		struct cl_lock_descr *cld = &scan->cll_descr;
+		const struct osc_lock *scan_ols;
+
+		/* only locks queued ahead of ours are of interest */
+		if (scan == lock)
+			break;
+
+		if (scan->cll_state < CLS_QUEUING ||
+		    scan->cll_state == CLS_FREEING ||
+		    cld->cld_start > descr->cld_end ||
+		    cld->cld_end < descr->cld_start)
+			continue;
+
+		/* overlapped and living locks. */
+
+		/* We're not supposed to give up group lock. */
+		if (scan->cll_descr.cld_mode == CLM_GROUP) {
+			LASSERT(descr->cld_mode != CLM_GROUP ||
+				descr->cld_gid != scan->cll_descr.cld_gid);
+			continue;
+		}
+
+		scan_ols = osc_lock_at(scan);
+
+		/* We need to cancel the compatible locks if we're enqueuing
+		 * a lockless lock, for example:
+		 * imagine that client has PR lock on [0, 1000], and thread T0
+		 * is doing lockless IO in [500, 1500] region. Concurrent
+		 * thread T1 can see lockless data in [500, 1000], which is
+		 * wrong, because these data are possibly stale. */
+		if (!lockless && osc_lock_compatible(olck, scan_ols))
+			continue;
+
+		/* pin the conflict before coh_lock_guard is dropped */
+		cl_lock_get_trust(scan);
+		conflict = scan;
+		break;
+	}
+	spin_unlock(&hdr->coh_lock_guard);
+
+	if (conflict) {
+		if (lock->cll_descr.cld_mode == CLM_GROUP) {
+			/* we want a group lock but a previous lock request
+			 * conflicts, we do not wait but return 0 so the
+			 * request is sent to the server
+			 */
+			CDEBUG(D_DLMTRACE, "group lock %p is conflicted "
+					   "with %p, no wait, send to server\n",
+			       lock, conflict);
+			cl_lock_put(env, conflict);
+			rc = 0;
+		} else {
+			CDEBUG(D_DLMTRACE, "lock %p is conflicted with %p, "
+					   "will wait\n",
+			       lock, conflict);
+			LASSERT(lock->cll_conflict == NULL);
+			lu_ref_add(&conflict->cll_reference, "cancel-wait",
+				   lock);
+			lock->cll_conflict = conflict;
+			rc = CLO_WAIT;
+		}
+	}
+	RETURN(rc);
+}
+
+/**
+ * Implementation of cl_lock_operations::clo_enqueue() method for osc
+ * layer. This initiates ldlm enqueue:
+ *
+ *     - cancels conflicting locks early (osc_lock_enqueue_wait());
+ *
+ *     - calls osc_enqueue_base() to do actual enqueue.
+ *
+ * osc_enqueue_base() is supplied with an upcall function that is executed
+ * when lock is received either after a local cached ldlm lock is matched, or
+ * when a reply from the server is received.
+ *
+ * This function does not wait for the network communication to complete.
+ *
+ * A lockless lock is not sent anywhere: it goes straight to OLS_GRANTED and
+ * records the io of \a env as its owner.
+ *
+ * \param env      environment; its osc_thread_info provides the scratch
+ *		   resource name and policy
+ * \param slice    osc slice of the lock; the lock must be mutexed and the
+ *		   osc_lock must be in OLS_NEW state
+ * \param unused   not used
+ * \param enqflags not read by this function
+ *
+ * \return the osc_lock_enqueue_wait() result when that is non-zero;
+ *	   otherwise the osc_enqueue_base() result, where -ECANCELED is
+ *	   turned into 0 after putting the lock back into OLS_NEW
+ */
+static int osc_lock_enqueue(const struct lu_env *env,
+			    const struct cl_lock_slice *slice,
+			    struct cl_io *unused, __u32 enqflags)
+{
+	struct osc_lock	  *ols     = cl2osc_lock(slice);
+	struct cl_lock	   *lock    = ols->ols_cl.cls_lock;
+	int result;
+	ENTRY;
+
+	LASSERT(cl_lock_is_mutexed(lock));
+	LASSERTF(ols->ols_state == OLS_NEW,
+		 "Impossible state: %d\n", ols->ols_state);
+
+	LASSERTF(ergo(ols->ols_glimpse, lock->cll_descr.cld_mode <= CLM_READ),
+		"lock = %p, ols = %p\n", lock, ols);
+
+	result = osc_lock_enqueue_wait(env, ols);
+	if (result == 0) {
+		if (!osc_lock_is_lockless(ols)) {
+			struct osc_object	*obj = cl2osc(slice->cls_obj);
+			struct osc_thread_info   *info = osc_env_info(env);
+			struct ldlm_res_id       *resname = &info->oti_resname;
+			ldlm_policy_data_t       *policy = &info->oti_policy;
+			struct ldlm_enqueue_info *einfo = &ols->ols_einfo;
+
+			/* lock will be passed as upcall cookie,
+			 * hold ref to prevent to be released. */
+			cl_lock_hold_add(env, lock, "upcall", lock);
+			/* a user for lock also */
+			cl_lock_user_add(env, lock);
+			ols->ols_state = OLS_ENQUEUED;
+
+			/*
+			 * XXX: this is possible blocking point as
+			 * ldlm_lock_match(LDLM_FL_LVB_READY) waits for
+			 * LDLM_CP_CALLBACK.
+			 */
+			ostid_build_res_name(&obj->oo_oinfo->loi_oi, resname);
+			osc_lock_build_policy(env, lock, policy);
+			result = osc_enqueue_base(osc_export(obj), resname,
+					  &ols->ols_flags, policy,
+					  &ols->ols_lvb,
+					  obj->oo_oinfo->loi_kms_valid,
+					  osc_lock_upcall,
+					  ols, einfo, &ols->ols_handle,
+					  PTLRPCD_SET, 1, ols->ols_agl);
+			if (result != 0) {
+				cl_lock_user_del(env, lock);
+				cl_lock_unhold(env, lock, "upcall", lock);
+				if (unlikely(result == -ECANCELED)) {
+					ols->ols_state = OLS_NEW;
+					result = 0;
+				}
+			}
+		} else {
+			ols->ols_state = OLS_GRANTED;
+			ols->ols_owner = osc_env_io(env);
+		}
+	}
+	LASSERT(ergo(ols->ols_glimpse, !osc_lock_is_lockless(ols)));
+	RETURN(result);
+}
+
+/**
+ * Wait method of osc_lock_ops (installed as .clo_wait).
+ *
+ * \return 0 when the lock is granted or a glimpse lock has its LVB ready;
+ *	   CLO_WAIT while the lock is not granted yet; CLO_REENQUEUED after a
+ *	   successful re-enqueue of an AGL lock; -ENAVAIL when called from
+ *	   the enqueue upcall for an AGL glimpse without a ready LVB;
+ *	   otherwise the lock's cll_error or the re-enqueue error
+ */
+static int osc_lock_wait(const struct lu_env *env,
+			 const struct cl_lock_slice *slice)
+{
+	struct osc_lock *olck = cl2osc_lock(slice);
+	struct cl_lock  *lock = olck->ols_cl.cls_lock;
+	int err;
+
+	LINVRNT(osc_lock_invariant(olck));
+
+	if (olck->ols_glimpse && olck->ols_state >= OLS_UPCALL_RECEIVED) {
+		if (olck->ols_flags & LDLM_FL_LVB_READY)
+			return 0;
+
+		if (!olck->ols_agl) {
+			LASSERT(lock->cll_error);
+			return lock->cll_error;
+		}
+
+		/* We come from the enqueue RPC reply upcall, which only
+		 * wants the state updated. Do not re-enqueue. */
+		if (lock->cll_flags & CLF_FROM_UPCALL)
+			return -ENAVAIL;
+
+		olck->ols_state = OLS_NEW;
+	}
+
+	if (olck->ols_state == OLS_NEW) {
+		LASSERT(olck->ols_agl);
+		olck->ols_agl = 0;
+		err = osc_lock_enqueue(env, slice, NULL, CEF_ASYNC | CEF_MUST);
+		return err != 0 ? err : CLO_REENQUEUED;
+	}
+
+	LASSERT(equi(olck->ols_state >= OLS_UPCALL_RECEIVED &&
+		     lock->cll_error == 0, olck->ols_lock != NULL));
+
+	err = lock->cll_error;
+	if (err)
+		return err;
+	return olck->ols_state >= OLS_GRANTED ? 0 : CLO_WAIT;
+}
+
+/**
+ * cl_lock_operations::clo_use() method for the osc layer: pins a cached
+ * lock.
+ *
+ * \retval 0	the dlm lock was addref-ed; the osc lock is OLS_GRANTED
+ * \retval CLO_WAIT the addref was refused; ols_ast_wait is set so that
+ *		  osc_dlm_blocking_ast0() signals the lock
+ */
+static int osc_lock_use(const struct lu_env *env,
+			const struct cl_lock_slice *slice)
+{
+	struct osc_lock *olck = cl2osc_lock(slice);
+	struct cl_lock *lock;
+
+	LASSERT(!olck->ols_hold);
+
+	/*
+	 * LDLM_FL_CBPENDING is checked and, unless it is set, the lock is
+	 * addref-ed in one atomic step, which guards against a concurrent
+	 * blocking ast.
+	 */
+	if (ldlm_lock_addref_try(&olck->ols_handle,
+				 olck->ols_einfo.ei_mode) == 0) {
+		olck->ols_hold = 1;
+		olck->ols_state = OLS_GRANTED;
+		return 0;
+	}
+
+	/*
+	 * The lock is in the middle of being cancelled somewhere within
+	 * ldlm_handle_bl_callback(): LDLM_FL_CBPENDING is set already, yet
+	 * osc_ldlm_blocking_ast() hasn't got hold of the cl_lock mutex.
+	 */
+	lock = slice->cls_lock;
+	LASSERT(lock->cll_state == CLS_INTRANSIT);
+	LASSERT(lock->cll_users > 0);
+	/* flag for osc_dlm_blocking_ast0() telling it to signal the lock */
+	olck->ols_ast_wait = 1;
+	return CLO_WAIT;
+}
+
+/**
+ * Writes back (or discards) the cached range of a lock in CLM_WRITE mode or
+ * above and then discards the lock's pages, all in a nested environment.
+ *
+ * \param ols     osc lock to flush
+ * \param discard passed to osc_cache_writeback_range(); also selects the
+ *		  wording of the debug message
+ *
+ * \return 0 on success (ols_flush is set in that case), negative error
+ *	   otherwise
+ */
+static int osc_lock_flush(struct osc_lock *ols, int discard)
+{
+	struct cl_lock     *lock = ols->ols_cl.cls_lock;
+	struct cl_env_nest  nest;
+	struct lu_env      *env;
+	int		 result = 0;
+	ENTRY;
+
+	env = cl_env_nested_get(&nest);
+	if (IS_ERR(env)) {
+		result = PTR_ERR(env);
+	} else {
+		struct osc_object    *obj   = cl2osc(ols->ols_cl.cls_obj);
+		struct cl_lock_descr *descr = &lock->cll_descr;
+		int discard_rc;
+
+		if (descr->cld_mode >= CLM_WRITE) {
+			result = osc_cache_writeback_range(env, obj,
+							   descr->cld_start,
+							   descr->cld_end,
+							   1, discard);
+			LDLM_DEBUG(ols->ols_lock,
+				"lock %p: %d pages were %s.\n", lock, result,
+				discard ? "discarded" : "written");
+			/* a positive value is not an error */
+			if (result > 0)
+				result = 0;
+		}
+
+		/* a write-back failure takes precedence over this one */
+		discard_rc = cl_lock_discard_pages(env, lock);
+		if (result == 0 && discard_rc < 0)
+			result = discard_rc;
+
+		cl_env_nested_put(&nest, env);
+	}
+
+	if (result == 0) {
+		ols->ols_flush = 1;
+		LINVRNT(!osc_lock_has_pages(ols));
+	}
+	RETURN(result);
+}
+
+/**
+ * Implements cl_lock_operations::clo_cancel() method for osc layer. This is
+ * called (as part of cl_lock_cancel()) when lock is canceled either
+ * voluntarily (LRU pressure, early cancellation, umount, etc.) or due to the
+ * conflict with some other lock somewhere in the cluster. This function does
+ * the following:
+ *
+ *     - invalidates all pages protected by this lock (after sending dirty
+ *       ones to the server, as necessary);
+ *
+ *     - decref's underlying ldlm lock;
+ *
+ *     - cancels ldlm lock (ldlm_cli_cancel()).
+ *
+ * Failures are only logged, the method has no return value. Flush and cancel
+ * failures are reported separately so that a successful ldlm_cli_cancel()
+ * cannot hide an earlier flush error.
+ *
+ * \param env   environment for the cl_lock calls
+ * \param slice osc slice of the lock being cancelled; the lock must be
+ *		mutexed
+ */
+static void osc_lock_cancel(const struct lu_env *env,
+			    const struct cl_lock_slice *slice)
+{
+	struct cl_lock   *lock    = slice->cls_lock;
+	struct osc_lock  *olck    = cl2osc_lock(slice);
+	struct ldlm_lock *dlmlock = olck->ols_lock;
+	int	       result;
+	int	       discard;
+
+	LASSERT(cl_lock_is_mutexed(lock));
+	LINVRNT(osc_lock_invariant(olck));
+
+	if (dlmlock != NULL) {
+		int do_cancel;
+
+		discard = !!(dlmlock->l_flags & LDLM_FL_DISCARD_DATA);
+		if (olck->ols_state >= OLS_GRANTED) {
+			result = osc_lock_flush(olck, discard);
+			/* report now: result is reused for the cancel */
+			if (result < 0)
+				CL_LOCK_DEBUG(D_ERROR, env, lock,
+					      "lock %p flush failure with error(%d)\n",
+					      lock, result);
+		}
+		osc_lock_unhold(olck);
+
+		lock_res_and_lock(dlmlock);
+		/* Now that we're the only user of dlm read/write reference,
+		 * mostly the ->l_readers + ->l_writers should be zero.
+		 * However, there is a corner case.
+		 * See bug 18829 for details.*/
+		do_cancel = (dlmlock->l_readers == 0 &&
+			     dlmlock->l_writers == 0);
+		dlmlock->l_flags |= LDLM_FL_CBPENDING;
+		unlock_res_and_lock(dlmlock);
+		if (do_cancel) {
+			result = ldlm_cli_cancel(&olck->ols_handle, LCF_ASYNC);
+			if (result < 0)
+				CL_LOCK_DEBUG(D_ERROR, env, lock,
+					      "lock %p cancel failure with error(%d)\n",
+					      lock, result);
+		}
+	}
+	olck->ols_state = OLS_CANCELLED;
+	olck->ols_flags &= ~LDLM_FL_LVB_READY;
+	osc_lock_detach(env, olck);
+}
+
+static int osc_lock_has_pages(struct osc_lock *olck)
+{
+	return 0; /* stub: unconditionally returns 0, \a olck is not examined */
+}
+
+/**
+ * Delete method of osc_lock_ops (installed as .clo_delete).
+ *
+ * A glimpse lock is expected to hold neither a dlm reference nor a dlm lock,
+ * so there is nothing to undo for it. Any other lock is un-held and detached
+ * from its dlm lock.
+ */
+static void osc_lock_delete(const struct lu_env *env,
+			    const struct cl_lock_slice *slice)
+{
+	struct osc_lock *olck = cl2osc_lock(slice);
+
+	if (!olck->ols_glimpse) {
+		LINVRNT(osc_lock_invariant(olck));
+		LINVRNT(!osc_lock_has_pages(olck));
+
+		osc_lock_unhold(olck);
+		osc_lock_detach(env, olck);
+		return;
+	}
+
+	LASSERT(!olck->ols_hold);
+	LASSERT(!olck->ols_lock);
+}
+
+/**
+ * cl_lock_operations::clo_state() method for the osc layer.
+ *
+ * Keeps osc_lock::ols_owner up to date: it is set to the io of \a env when
+ * the lock moves into CLS_HELD from another state, and cleared on a change
+ * to any state other than CLS_HELD.
+ *
+ * The assumption is that a lock always enters CLS_HELD in the same IO
+ * context as the one that requested it. That should be fine, since a context
+ * is by definition shared by all activity belonging to the same high-level
+ * IO.
+ */
+static void osc_lock_state(const struct lu_env *env,
+			   const struct cl_lock_slice *slice,
+			   enum cl_lock_state state)
+{
+	struct osc_lock *olck = cl2osc_lock(slice);
+	struct osc_io *oio;
+
+	/*
+	 * XXX the lock can be in use by several io contexts at once.
+	 */
+	LINVRNT(osc_lock_invariant(olck));
+
+	if (state != CLS_HELD) {
+		olck->ols_owner = NULL;
+		return;
+	}
+
+	/* already held: the owner stays as it is */
+	if (slice->cls_lock->cll_state == CLS_HELD)
+		return;
+
+	oio = osc_env_io(env);
+	LASSERT(olck->ols_owner == NULL);
+	olck->ols_owner = oio;
+}
+
+/**
+ * Print method shared by both osc lock operation tables: emits the dlm lock
+ * pointer, flags, handle cookie, state and owner, followed by the LVB.
+ *
+ * \return always 0
+ */
+static int osc_lock_print(const struct lu_env *env, void *cookie,
+			  lu_printer_t p, const struct cl_lock_slice *slice)
+{
+	struct osc_lock *ols = cl2osc_lock(slice);
+
+	/*
+	 * XXX the ldlm lock and einfo deserve proper printing.
+	 */
+	p(env, cookie, "%p %#16llx "LPX64" %d %p ",
+	  ols->ols_lock, ols->ols_flags, ols->ols_handle.cookie,
+	  ols->ols_state, ols->ols_owner);
+	osc_lvb_print(env, cookie, p, &ols->ols_lvb);
+
+	return 0;
+}
+
+/**
+ * Fits-into method of osc_lock_ops (installed as .clo_fits_into): decides
+ * whether this lock may be matched for the request \a need.
+ *
+ * \return 1 when the lock can be used, 0 otherwise
+ */
+static int osc_lock_fits_into(const struct lu_env *env,
+			      const struct cl_lock_slice *slice,
+			      const struct cl_lock_descr *need,
+			      const struct cl_io *io)
+{
+	struct osc_lock *ols = cl2osc_lock(slice);
+
+	if (need->cld_enq_flags & CEF_NEVER)
+		return 0;
+
+	if (ols->ols_state >= OLS_CANCELLED)
+		return 0;
+
+	if (need->cld_mode == CLM_PHANTOM) {
+		if (ols->ols_agl)
+			return ols->ols_state <= OLS_RELEASED;
+
+		/*
+		 * Note: a lock that is merely QUEUED must not be matched
+		 * here, as that can deadlock.
+		 * In read_process,
+		 * P1: enqueues a read lock, sublock1 is created
+		 * P2: enqueues a write lock, sublock2 is created (it
+		 *     conflicts with sublock1).
+		 * P1: the read lock is granted.
+		 * P1: enqueues a glimpse lock (while holding
+		 *     sublock1_read), matches sublock2 and waits for
+		 *     sublock2 to be granted. That never happens, because
+		 *     P1 will not release sublock1. Bang!
+		 */
+		return ols->ols_state >= OLS_GRANTED &&
+		       ols->ols_state <= OLS_RELEASED;
+	}
+
+	/*
+	 * A lock that was never enqueued can't be matched for CEF_MUST:
+	 * it is the enqueue process that brings in the information needed
+	 * to determine things such as lockless, CEF_MUST, etc.
+	 */
+	if ((need->cld_enq_flags & CEF_MUST) &&
+	    ols->ols_state < OLS_UPCALL_RECEIVED &&
+	    ols->ols_locklessable)
+		return 0;
+
+	return 1;
+}
+
+/** Lock operations installed on osc lock slices by osc_lock_init(). */
+static const struct cl_lock_operations osc_lock_ops = {
+	.clo_enqueue   = osc_lock_enqueue,
+	.clo_wait      = osc_lock_wait,
+	.clo_use       = osc_lock_use,
+	.clo_unuse     = osc_lock_unuse,
+	.clo_state     = osc_lock_state,
+	.clo_fits_into = osc_lock_fits_into,
+	.clo_cancel    = osc_lock_cancel,
+	.clo_delete    = osc_lock_delete,
+	.clo_weigh     = osc_lock_weigh,
+	.clo_print     = osc_lock_print,
+	.clo_fini      = osc_lock_fini,
+};
+
+/**
+ * Unuse method of osc_lock_lockless_ops: a lockless lock is cancelled and
+ * deleted as soon as it is no longer used.
+ *
+ * \return always 0
+ */
+static int osc_lock_lockless_unuse(const struct lu_env *env,
+				   const struct cl_lock_slice *slice)
+{
+	struct osc_lock *olck = cl2osc_lock(slice);
+	struct cl_lock *cl = slice->cls_lock;
+
+	LASSERT(olck->ols_state == OLS_GRANTED);
+	LINVRNT(osc_lock_invariant(olck));
+
+	cl_lock_cancel(env, cl);
+	cl_lock_delete(env, cl);
+
+	return 0;
+}
+
+/**
+ * Cancel method of osc_lock_lockless_ops: flushes the lock (without
+ * discarding) and marks it OLS_CANCELLED. A flush failure is only logged.
+ */
+static void osc_lock_lockless_cancel(const struct lu_env *env,
+				     const struct cl_lock_slice *slice)
+{
+	struct osc_lock *ols = cl2osc_lock(slice);
+	int rc = osc_lock_flush(ols, 0);
+
+	if (rc != 0)
+		CERROR("Pages for lockless lock %p were not purged(%d)\n",
+		       ols, rc);
+
+	ols->ols_state = OLS_CANCELLED;
+}
+
+/**
+ * Wait method of osc_lock_lockless_ops.
+ *
+ * \return the cll_error of the cl_lock that owns the slice
+ */
+static int osc_lock_lockless_wait(const struct lu_env *env,
+				  const struct cl_lock_slice *slice)
+{
+	struct osc_lock *olck = cl2osc_lock(slice);
+
+	LINVRNT(osc_lock_invariant(olck));
+	LASSERT(olck->ols_state >= OLS_UPCALL_RECEIVED);
+
+	return olck->ols_cl.cls_lock->cll_error;
+}
+
+/**
+ * State method of osc_lock_lockless_ops: on a change to CLS_HELD records the
+ * io of \a env as the owner of the lock. Other states are ignored.
+ */
+static void osc_lock_lockless_state(const struct lu_env *env,
+				    const struct cl_lock_slice *slice,
+				    enum cl_lock_state state)
+{
+	struct osc_lock *olck = cl2osc_lock(slice);
+	struct osc_io *oio;
+
+	LINVRNT(osc_lock_invariant(olck));
+	if (state != CLS_HELD)
+		return;
+
+	oio = osc_env_io(env);
+	LASSERT(ergo(olck->ols_owner, olck->ols_owner == oio));
+	olck->ols_owner = oio;
+
+	/* when the lock is for the io's host object, the io itself is
+	 * marked lockless */
+	if (cl_object_same(oio->oi_cl.cis_obj, slice->cls_obj))
+		oio->oi_lockless = 1;
+}
+
+/**
+ * Fits-into method of osc_lock_lockless_ops.
+ *
+ * \return non-zero only for a CEF_NEVER request made from the io that owns
+ *	   the lock
+ */
+static int osc_lock_lockless_fits_into(const struct lu_env *env,
+				       const struct cl_lock_slice *slice,
+				       const struct cl_lock_descr *need,
+				       const struct cl_io *io)
+{
+	struct osc_lock *olck = cl2osc_lock(slice);
+
+	/* only the owning io may use a lockless lock. b22147 */
+	if (need->cld_enq_flags & CEF_NEVER)
+		return (olck->ols_owner == osc_env_io(env));
+
+	return 0;
+}
+
+/** Lock operations that osc_lock_to_lockless() switches a slice to. */
+static const struct cl_lock_operations osc_lock_lockless_ops = {
+	.clo_enqueue   = osc_lock_enqueue,
+	.clo_wait      = osc_lock_lockless_wait,
+	.clo_unuse     = osc_lock_lockless_unuse,
+	.clo_state     = osc_lock_lockless_state,
+	.clo_fits_into = osc_lock_lockless_fits_into,
+	.clo_cancel    = osc_lock_lockless_cancel,
+	.clo_print     = osc_lock_print,
+	.clo_fini      = osc_lock_fini,
+};
+
+/**
+ * Allocates the osc slice of \a lock and adds it to the lock.
+ *
+ * The slice starts in OLS_NEW state with osc_lock_ops; depending on the
+ * enqueue flags it may be turned into a lockless lock right away.
+ *
+ * \param env    environment, passed on to the helpers
+ * \param obj    object the slice is added for
+ * \param lock   cl_lock being initialized; its cld_enq_flags are consulted
+ * \param unused not used
+ *
+ * \retval 0       success
+ * \retval -ENOMEM the slice could not be allocated
+ */
+int osc_lock_init(const struct lu_env *env,
+		  struct cl_object *obj, struct cl_lock *lock,
+		  const struct cl_io *unused)
+{
+	struct osc_lock *clk;
+	__u32 enqflags;
+
+	OBD_SLAB_ALLOC_PTR_GFP(clk, osc_lock_kmem, __GFP_IO);
+	if (clk == NULL)
+		return -ENOMEM;
+
+	enqflags = lock->cll_descr.cld_enq_flags;
+
+	osc_lock_build_einfo(env, lock, clk, &clk->ols_einfo);
+	atomic_set(&clk->ols_pageref, 0);
+	clk->ols_state = OLS_NEW;
+
+	clk->ols_flags = osc_enq2ldlm_flags(enqflags);
+	clk->ols_agl = !!(enqflags & CEF_AGL);
+	if (clk->ols_agl)
+		clk->ols_flags |= LDLM_FL_BLOCK_NOWAIT;
+	if (clk->ols_flags & LDLM_FL_HAS_INTENT)
+		clk->ols_glimpse = 1;
+
+	cl_lock_slice_add(lock, &clk->ols_cl, obj, &osc_lock_ops);
+
+	/* unless CEF_MUST is given, attempt the lockless conversion */
+	if (!(enqflags & CEF_MUST))
+		osc_lock_to_lockless(env, clk, (enqflags & CEF_NEVER));
+	if (clk->ols_locklessable && !(enqflags & CEF_DISCARD_DATA))
+		clk->ols_flags |= LDLM_FL_DENY_ON_CONTENTION;
+
+	LDLM_DEBUG_NOLOCK("lock %p, osc lock %p, flags %llx\n",
+			lock, clk, clk->ols_flags);
+
+	return 0;
+}
+
+/**
+ * Adds _PAGEREF_MAGIC to the page reference counter of the osc_lock attached
+ * to \a dlm, under osc_ast_guard.
+ *
+ * \retval 0 no osc_lock is attached, or the counter equals _PAGEREF_MAGIC
+ *	     after the addition (the magic stays added)
+ * \retval 1 the counter held some other value; the magic was subtracted
+ *	     again
+ */
+int osc_dlm_lock_pageref(struct ldlm_lock *dlm)
+{
+	struct osc_lock *olock;
+	int	      busy = 0;
+
+	spin_lock(&osc_ast_guard);
+	olock = dlm->l_ast_data;
+	/*
+	 * A very rare race with osc_page_addref_lock() exists. It is
+	 * harmless: at worst a lock that could have been cancelled is not
+	 * cancelled.
+	 */
+	if (olock != NULL) {
+		if (atomic_add_return(_PAGEREF_MAGIC,
+				      &olock->ols_pageref) != _PAGEREF_MAGIC) {
+			atomic_sub(_PAGEREF_MAGIC, &olock->ols_pageref);
+			busy = 1;
+		}
+	}
+	spin_unlock(&osc_ast_guard);
+
+	return busy;
+}
+
+/** @} osc */

diff --git a/drivers/staging/lustre/lustre/osc/osc_object.c b/drivers/staging/lustre/lustre/osc/osc_object.c
new file mode 100644
index 0000000..ca94e63
--- /dev/null
+++ b/drivers/staging/lustre/lustre/osc/osc_object.c

@@ -0,0 +1,275 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_object for OSC layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_OSC
+
+#include "osc_cl_internal.h"
+
+/** \addtogroup osc
+ *  @{
+ */
+
+/*****************************************************************************
+ *
+ * Type conversions.
+ *
+ */
+
+/* Return the lu_object embedded in the cl_object of \a osc. */
+static struct lu_object *osc2lu(struct osc_object *osc)
+{
+	return &osc->oo_cl.co_lu;
+}
+
+/*
+ * Return the osc_object that embeds \a obj. LINVRNT checks that \a obj
+ * really is an OSC object before the container is computed.
+ */
+static struct osc_object *lu2osc(const struct lu_object *obj)
+{
+	LINVRNT(osc_is_object(obj));
+	return container_of0(obj, struct osc_object, oo_cl.co_lu);
+}
+
+/*****************************************************************************
+ *
+ * Object operations.
+ *
+ */
+
+/**
+ * lu_object_operations::loo_object_init() method of the OSC layer.
+ *
+ * Records the lov_oinfo handed over in the object configuration and puts
+ * every list, the extent tree root, the counters and the locks of the new
+ * object into their empty state. Always returns 0.
+ */
+static int osc_object_init(const struct lu_env *env, struct lu_object *obj,
+			   const struct lu_object_conf *conf)
+{
+	struct osc_object *oo = lu2osc(obj);
+	const struct cl_object_conf *cconf = lu2cl_conf(conf);
+	int crt;
+
+	oo->oo_oinfo = cconf->u.coc_oinfo;
+
+	/* in-flight page tracking, one list per request type */
+	spin_lock_init(&oo->oo_seatbelt);
+	for (crt = 0; crt < CRT_NR; crt++) {
+		INIT_LIST_HEAD(&oo->oo_inflight[crt]);
+	}
+
+	/* linkage of the object into the client's ready/read/write lists */
+	INIT_LIST_HEAD(&oo->oo_ready_item);
+	INIT_LIST_HEAD(&oo->oo_hp_ready_item);
+	INIT_LIST_HEAD(&oo->oo_write_item);
+	INIT_LIST_HEAD(&oo->oo_read_item);
+
+	/* extent tree and extent lists */
+	oo->oo_root.rb_node = NULL;
+	INIT_LIST_HEAD(&oo->oo_hp_exts);
+	INIT_LIST_HEAD(&oo->oo_urgent_exts);
+	INIT_LIST_HEAD(&oo->oo_rpc_exts);
+	INIT_LIST_HEAD(&oo->oo_reading_exts);
+	atomic_set(&oo->oo_nr_reads, 0);
+	atomic_set(&oo->oo_nr_writes, 0);
+	spin_lock_init(&oo->oo_lock);
+
+	cl_object_page_init(lu2cl(obj), sizeof(struct osc_page));
+
+	return 0;
+}
+
+/**
+ * lu_object_operations::loo_object_free() method of the OSC layer.
+ *
+ * Asserts that the object no longer has in-flight pages, list linkage,
+ * extents or pending reads/writes, then finalizes the lu_object and
+ * returns the osc_object to its slab.
+ */
+static void osc_object_free(const struct lu_env *env, struct lu_object *obj)
+{
+	struct osc_object *oo = lu2osc(obj);
+	int crt;
+
+	for (crt = 0; crt < CRT_NR; crt++) {
+		LASSERT(list_empty(&oo->oo_inflight[crt]));
+	}
+
+	LASSERT(list_empty(&oo->oo_ready_item));
+	LASSERT(list_empty(&oo->oo_hp_ready_item));
+	LASSERT(list_empty(&oo->oo_write_item));
+	LASSERT(list_empty(&oo->oo_read_item));
+
+	LASSERT(oo->oo_root.rb_node == NULL);
+	LASSERT(list_empty(&oo->oo_hp_exts));
+	LASSERT(list_empty(&oo->oo_urgent_exts));
+	LASSERT(list_empty(&oo->oo_rpc_exts));
+	LASSERT(list_empty(&oo->oo_reading_exts));
+	LASSERT(atomic_read(&oo->oo_nr_reads) == 0);
+	LASSERT(atomic_read(&oo->oo_nr_writes) == 0);
+
+	lu_object_fini(obj);
+	OBD_SLAB_FREE_PTR(oo, osc_object_kmem);
+}
+
+/**
+ * Print size, mtime, atime, ctime and blocks of \a lvb through printer
+ * \a p and return whatever the printer returns.
+ */
+int osc_lvb_print(const struct lu_env *env, void *cookie,
+		  lu_printer_t p, const struct ost_lvb *lvb)
+{
+	return p(env, cookie,
+		 "size: "LPU64" mtime: "LPU64" atime: "LPU64" ctime: "LPU64
+		 " blocks: "LPU64,
+		 lvb->lvb_size, lvb->lvb_mtime, lvb->lvb_atime,
+		 lvb->lvb_ctime, lvb->lvb_blocks);
+}
+
+/**
+ * lu_object_operations::loo_object_print() method of the OSC layer.
+ *
+ * Prints the object id, OST index and generation, the known minimum size
+ * and the async error state kept in the lov_oinfo, followed by the lvb.
+ * Always returns 0.
+ */
+static int osc_object_print(const struct lu_env *env, void *cookie,
+			    lu_printer_t p, const struct lu_object *obj)
+{
+	struct lov_oinfo *oinfo = lu2osc(obj)->oo_oinfo;
+
+	p(env, cookie,
+	  "id: "DOSTID" idx: %d gen: %d kms_valid: %u kms "LPU64
+	  " rc: %d force_sync: %d min_xid: "LPU64" ",
+	  POSTID(&oinfo->loi_oi), oinfo->loi_ost_idx, oinfo->loi_ost_gen,
+	  oinfo->loi_kms_valid, oinfo->loi_kms,
+	  oinfo->loi_ar.ar_rc, oinfo->loi_ar.ar_force_sync,
+	  oinfo->loi_ar.ar_min_xid);
+	osc_lvb_print(env, cookie, p, &oinfo->loi_lvb);
+	return 0;
+}
+
+
+/**
+ * cl_object_operations::coo_attr_get() method of the OSC layer.
+ *
+ * Fills \a attr from the lvb cached in the lov_oinfo. cat_kms is taken
+ * from loi_kms only when loi_kms_valid is set, otherwise it is 0.
+ * Always returns 0.
+ */
+static int osc_attr_get(const struct lu_env *env, struct cl_object *obj,
+			struct cl_attr *attr)
+{
+	struct osc_object *osc = cl2osc(obj);
+	struct lov_oinfo *loi = osc->oo_oinfo;
+
+	cl_lvb2attr(attr, &loi->loi_lvb);
+	if (loi->loi_kms_valid)
+		attr->cat_kms = loi->loi_kms;
+	else
+		attr->cat_kms = 0;
+	return 0;
+}
+
+/**
+ * cl_object_operations::coo_attr_set() method of the OSC layer.
+ *
+ * Copies the attributes selected by the CAT_* bits in \a valid from
+ * \a attr into the lvb cached in the lov_oinfo; CAT_KMS updates the known
+ * minimum size through loi_kms_set(). Always returns 0.
+ */
+int osc_attr_set(const struct lu_env *env, struct cl_object *obj,
+		 const struct cl_attr *attr, unsigned valid)
+{
+	struct lov_oinfo *oinfo = cl2osc(obj)->oo_oinfo;
+	struct ost_lvb   *lvb   = &oinfo->loi_lvb;
+
+	if (valid & CAT_SIZE)
+		lvb->lvb_size = attr->cat_size;
+	if (valid & CAT_MTIME)
+		lvb->lvb_mtime = attr->cat_mtime;
+	if (valid & CAT_ATIME)
+		lvb->lvb_atime = attr->cat_atime;
+	if (valid & CAT_CTIME)
+		lvb->lvb_ctime = attr->cat_ctime;
+	if (valid & CAT_BLOCKS)
+		lvb->lvb_blocks = attr->cat_blocks;
+	if (valid & CAT_KMS) {
+		/* keep a space before "to" so the two values stay apart */
+		CDEBUG(D_CACHE, "set kms from "LPU64" to "LPU64"\n",
+		       oinfo->loi_kms, (__u64)attr->cat_kms);
+		loi_kms_set(oinfo, attr->cat_kms);
+	}
+	return 0;
+}
+
+/**
+ * cl_object_operations::coo_glimpse() method of the OSC layer.
+ *
+ * Reports loi_kms as the size and the cached lvb_blocks as the block
+ * count in \a lvb. Note that loi_kms is used without consulting
+ * loi_kms_valid. Always returns 0.
+ */
+static int osc_object_glimpse(const struct lu_env *env,
+			      const struct cl_object *obj, struct ost_lvb *lvb)
+{
+	struct lov_oinfo *oinfo = cl2osc(obj)->oo_oinfo;
+
+	ENTRY;
+	lvb->lvb_size   = oinfo->loi_kms;
+	lvb->lvb_blocks = oinfo->loi_lvb.lvb_blocks;
+	RETURN(0);
+}
+
+
+/*
+ * Mark \a obj as contended and record the current time as the start of
+ * the contention period. No memory barrier separates the two stores (the
+ * mb() below is commented out).
+ */
+void osc_object_set_contended(struct osc_object *obj)
+{
+	obj->oo_contention_time = cfs_time_current();
+	/* mb(); */
+	obj->oo_contended = 1;
+}
+
+/* Clear the contended mark of \a obj; oo_contention_time is left as is. */
+void osc_object_clear_contended(struct osc_object *obj)
+{
+	obj->oo_contended = 0;
+}
+
+/**
+ * Tell whether \a obj is currently considered contended.
+ *
+ * \retval 1 fault injection OBD_FAIL_OSC_OBJECT_CONTENTION is active, or
+ *	     the object is marked contended and the device's
+ *	     od_contention_time has not yet elapsed since the mark was set
+ * \retval 0 the object is not marked, or the mark has expired (in which
+ *	     case it is cleared here)
+ */
+int osc_object_is_contended(struct osc_object *obj)
+{
+	struct osc_device *dev  = lu2osc_dev(obj->oo_cl.co_lu.lo_dev);
+	int osc_contention_time = dev->od_contention_time;
+	cfs_time_t cur_time     = cfs_time_current();
+	cfs_time_t retry_time;
+
+	if (OBD_FAIL_CHECK(OBD_FAIL_OSC_OBJECT_CONTENTION))
+		return 1;
+
+	if (!obj->oo_contended)
+		return 0;
+
+	/*
+	 * This logic is copied from ll_file_is_contended.
+	 */
+	retry_time = cfs_time_add(obj->oo_contention_time,
+				  cfs_time_seconds(osc_contention_time));
+	if (cfs_time_after(cur_time, retry_time)) {
+		/* the contention period is over */
+		osc_object_clear_contended(obj);
+		return 0;
+	}
+	return 1;
+}
+
+/* cl_object method table installed on OSC objects by osc_object_alloc(). */
+static const struct cl_object_operations osc_ops = {
+	.coo_page_init = osc_page_init,
+	.coo_lock_init = osc_lock_init,
+	.coo_io_init   = osc_io_init,
+	.coo_attr_get  = osc_attr_get,
+	.coo_attr_set  = osc_attr_set,
+	.coo_glimpse   = osc_object_glimpse
+};
+
+/*
+ * lu_object method table installed on OSC objects by osc_object_alloc().
+ * The delete, release and invariant hooks are not provided.
+ */
+static const struct lu_object_operations osc_lu_obj_ops = {
+	.loo_object_init      = osc_object_init,
+	.loo_object_delete    = NULL,
+	.loo_object_release   = NULL,
+	.loo_object_free      = osc_object_free,
+	.loo_object_print     = osc_object_print,
+	.loo_object_invariant = NULL
+};
+
+/**
+ * Allocate an osc_object from its slab and set up the embedded lu_object.
+ *
+ * \retval the lu_object embedded in the new osc_object
+ * \retval NULL if the allocation failed
+ */
+struct lu_object *osc_object_alloc(const struct lu_env *env,
+				   const struct lu_object_header *unused,
+				   struct lu_device *dev)
+{
+	struct osc_object *osc;
+	struct lu_object  *obj;
+
+	OBD_SLAB_ALLOC_PTR_GFP(osc, osc_object_kmem, __GFP_IO);
+	if (osc == NULL)
+		return NULL;
+
+	obj = osc2lu(osc);
+	lu_object_init(obj, NULL, dev);
+	osc->oo_cl.co_ops = &osc_ops;
+	obj->lo_ops = &osc_lu_obj_ops;
+	return obj;
+}
+
+/** @} osc */

diff --git a/drivers/staging/lustre/lustre/osc/osc_page.c b/drivers/staging/lustre/lustre/osc/osc_page.c
new file mode 100644
index 0000000..baba959
--- /dev/null
+++ b/drivers/staging/lustre/lustre/osc/osc_page.c

@@ -0,0 +1,927 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_page for OSC layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_OSC
+
+#include "osc_cl_internal.h"
+
+/*
+ * Forward declarations of the LRU helpers defined at the end of this file;
+ * the page operations below use them before their definitions.
+ */
+static void osc_lru_del(struct client_obd *cli, struct osc_page *opg, bool del);
+static void osc_lru_add(struct client_obd *cli, struct osc_page *opg);
+static int osc_lru_reserve(const struct lu_env *env, struct osc_object *obj,
+			   struct osc_page *opg);
+
+/** \addtogroup osc
+ *  @{
+ */
+
+/*
+ * osc_page_protected() is compiled out because it may sleep while
+ * client_obd_list_lock is held:
+ * client_obd_list_lock -> osc_ap_completion -> osc_completion ->
+ *   -> osc_page_protected -> osc_page_is_dlocked -> osc_match_base
+ *   -> ldlm_lock_match -> sptlrpc_import_check_ctx -> sleep.
+ */
+#if 0
+/*
+ * (Compiled out.) Look for a DLM extent lock covering the single page of
+ * \a opg. The lookup is a test-only match (LDLM_FL_TEST_LOCK) for the
+ * requested mode or LCK_PW; when \a pending is set, locks with a pending
+ * callback are matched as well. Returns the result of osc_match_base().
+ */
+static int osc_page_is_dlocked(const struct lu_env *env,
+			       const struct osc_page *opg,
+			       enum cl_lock_mode mode, int pending, int unref)
+{
+	struct cl_page	 *page;
+	struct osc_object      *obj;
+	struct osc_thread_info *info;
+	struct ldlm_res_id     *resname;
+	struct lustre_handle   *lockh;
+	ldlm_policy_data_t     *policy;
+	ldlm_mode_t	     dlmmode;
+	int		     flags;
+
+	might_sleep();
+
+	/* scratch buffers come from the per-thread OSC environment */
+	info = osc_env_info(env);
+	resname = &info->oti_resname;
+	policy = &info->oti_policy;
+	lockh = &info->oti_handle;
+	page = opg->ops_cl.cpl_page;
+	obj = cl2osc(opg->ops_cl.cpl_obj);
+
+	flags = LDLM_FL_TEST_LOCK | LDLM_FL_BLOCK_GRANTED;
+	if (pending)
+		flags |= LDLM_FL_CBPENDING;
+
+	dlmmode = osc_cl_lock2ldlm(mode) | LCK_PW;
+	osc_lock_build_res(env, obj, resname);
+	osc_index2policy(policy, page->cp_obj, page->cp_index, page->cp_index);
+	return osc_match_base(osc_export(obj), resname, LDLM_EXTENT, policy,
+			      dlmmode, &flags, NULL, lockh, unref);
+}
+
+/**
+ * Checks an invariant that a page in the cache is covered by a lock, as
+ * needed.
+ */
+static int osc_page_protected(const struct lu_env *env,
+			      const struct osc_page *opg,
+			      enum cl_lock_mode mode, int unref)
+{
+	struct cl_object_header *hdr;
+	struct cl_lock	  *scan;
+	struct cl_page	  *page;
+	struct cl_lock_descr    *descr;
+	int result;
+
+	LINVRNT(!opg->ops_temp);
+
+	page = opg->ops_cl.cpl_page;
+	if (page->cp_owner != NULL &&
+	    cl_io_top(page->cp_owner)->ci_lockreq == CILR_NEVER) {
+		/*
+		 * If IO is done without locks (liblustre, or lloop), lock is
+		 * not required.
+		 */
+		result = 1;
+	} else {
+		/* otherwise check for a DLM lock */
+		result = osc_page_is_dlocked(env, opg, mode, 1, unref);
+	}
+	if (result == 0) {
+		/* maybe this page is a part of a lockless io? */
+		hdr = cl_object_header(opg->ops_cl.cpl_obj);
+		descr = &osc_env_info(env)->oti_descr;
+		descr->cld_mode = mode;
+		descr->cld_start = page->cp_index;
+		descr->cld_end   = page->cp_index;
+		spin_lock(&hdr->coh_lock_guard);
+		list_for_each_entry(scan, &hdr->coh_locks, cll_linkage) {
+			/*
+			 * Lock-less sub-lock has to be either in HELD state
+			 * (when io is actively going on), or in CACHED state,
+			 * when top-lock is being unlocked:
+			 * cl_io_unlock()->cl_unuse()->...->lov_lock_unuse().
+			 */
+			if ((scan->cll_state == CLS_HELD ||
+			     scan->cll_state == CLS_CACHED) &&
+			    cl_lock_ext_match(&scan->cll_descr, descr)) {
+				struct osc_lock *olck;
+
+				/* only the first matching lock is examined */
+				olck = osc_lock_at(scan);
+				result = osc_lock_is_lockless(olck);
+				break;
+			}
+		}
+		spin_unlock(&hdr->coh_lock_guard);
+	}
+	return result;
+}
+#else
+/*
+ * Stub used while the real check is compiled out: every page is reported
+ * as protected, so the LINVRNT() callers never trip.
+ */
+static int osc_page_protected(const struct lu_env *env,
+			      const struct osc_page *opg,
+			      enum cl_lock_mode mode, int unref)
+{
+	return 1;
+}
+#endif
+
+/*****************************************************************************
+ *
+ * Page operations.
+ *
+ */
+/*
+ * cl_page_operations::cpo_fini() method: nothing is released here, the
+ * function only asserts that the page no longer references a lock.
+ */
+static void osc_page_fini(const struct lu_env *env,
+			  struct cl_page_slice *slice)
+{
+	struct osc_page *opg = cl2osc_page(slice);
+	CDEBUG(D_TRACE, "%p\n", opg);
+	LASSERT(opg->ops_lock == NULL);
+}
+
+/*
+ * Pin the top page of \a opg for the duration of a transfer: take a page
+ * reference, record it in the lu_ref under \a label and set
+ * ops_transfer_pinned. The page must not already be pinned.
+ *
+ * Callers in this file pass labels such as "transfer\0cache"; read as a C
+ * string these all equal "transfer", the label osc_page_transfer_put()
+ * removes.
+ */
+static void osc_page_transfer_get(struct osc_page *opg, const char *label)
+{
+	struct cl_page *page = cl_page_top(opg->ops_cl.cpl_page);
+
+	LASSERT(!opg->ops_transfer_pinned);
+	cl_page_get(page);
+	lu_ref_add_atomic(&page->cp_reference, label, page);
+	opg->ops_transfer_pinned = 1;
+}
+
+/*
+ * Undo osc_page_transfer_get(): if the page is pinned for a transfer,
+ * drop the "transfer" lu_ref entry, clear the pinned flag and release the
+ * page reference. Does nothing when the page is not pinned.
+ */
+static void osc_page_transfer_put(const struct lu_env *env,
+				  struct osc_page *opg)
+{
+	struct cl_page *top = cl_page_top(opg->ops_cl.cpl_page);
+
+	if (!opg->ops_transfer_pinned)
+		return;
+
+	lu_ref_del(&top->cp_reference, "transfer", top);
+	opg->ops_transfer_pinned = 0;
+	cl_page_put(env, top);
+}
+
+/**
+ * This is called once for every page when it is submitted for a transfer
+ * either opportunistic (osc_page_cache_add()), or immediate
+ * (osc_page_submit()).
+ */
+static void osc_page_transfer_add(const struct lu_env *env,
+				  struct osc_page *opg, enum cl_req_type crt)
+{
+	struct osc_object *obj = cl2osc(opg->ops_cl.cpl_obj);
+
+	/* ops_lru and ops_inflight share the same field, so take it from LRU
+	 * first and then use it as inflight. */
+	osc_lru_del(osc_cli(obj), opg, false);
+
+	/* oo_seatbelt guards the per-type in-flight lists and ops_submitter */
+	spin_lock(&obj->oo_seatbelt);
+	list_add(&opg->ops_inflight, &obj->oo_inflight[crt]);
+	opg->ops_submitter = current;
+	spin_unlock(&obj->oo_seatbelt);
+}
+
+/**
+ * cl_page_operations::io[CRT_WRITE].cpo_cache_add() method.
+ *
+ * Pins the page, queues it for asynchronous write-out and, on success,
+ * adds it to the object's in-flight write list. On failure the pin is
+ * dropped again. Returns the result of osc_queue_async_io().
+ */
+static int osc_page_cache_add(const struct lu_env *env,
+			      const struct cl_page_slice *slice,
+			      struct cl_io *io)
+{
+	struct osc_io   *oio = osc_env_io(env);
+	struct osc_page *opg = cl2osc_page(slice);
+	int rc;
+	ENTRY;
+
+	LINVRNT(osc_page_protected(env, opg, CLM_WRITE, 0));
+
+	osc_page_transfer_get(opg, "transfer\0cache");
+	rc = osc_queue_async_io(env, io, opg);
+	if (rc == 0)
+		osc_page_transfer_add(env, opg, CRT_WRITE);
+	else
+		osc_page_transfer_put(env, opg);
+
+	/* for sync write, kernel will wait for this page to be flushed before
+	 * osc_io_end() is called, so release it earlier.
+	 * for mkwrite(), it's known there is no further pages. */
+	if ((cl_io_is_sync_write(io) || cl_io_is_mkwrite(io)) &&
+	    oio->oi_active != NULL) {
+		osc_extent_release(env, oio->oi_active);
+		oio->oi_active = NULL;
+	}
+
+	RETURN(rc);
+}
+
+/**
+ * Fill \a policy with the byte extent covering pages [\a start, \a end]
+ * of \a obj: from the offset of page \a start up to the last byte before
+ * the offset of page \a end + 1. All other policy fields are zeroed.
+ */
+void osc_index2policy(ldlm_policy_data_t *policy, const struct cl_object *obj,
+		      pgoff_t start, pgoff_t end)
+{
+	memset(policy, 0, sizeof(*policy));
+
+	policy->l_extent.start = cl_offset(obj, start);
+	policy->l_extent.end = cl_offset(obj, end + 1) - 1;
+}
+
+/**
+ * Make \a opg reference \a lock.
+ *
+ * The page reference counter of the OSC lock slice is incremented; if the
+ * result is not positive the increment is undone and the lock is not
+ * referenced. Otherwise a cl_lock reference is taken and stored in
+ * ops_lock.
+ *
+ * \retval 0        the page now references the lock
+ * \retval -ENODATA the lock could not be referenced
+ */
+static int osc_page_addref_lock(const struct lu_env *env,
+				struct osc_page *opg,
+				struct cl_lock *lock)
+{
+	struct osc_lock *ols;
+
+	LASSERT(opg->ops_lock == NULL);
+
+	ols = osc_lock_at(lock);
+	if (atomic_inc_return(&ols->ols_pageref) <= 0) {
+		atomic_dec(&ols->ols_pageref);
+		return -ENODATA;
+	}
+
+	cl_lock_get(lock);
+	opg->ops_lock = lock;
+	return 0;
+}
+
+/*
+ * Undo osc_page_addref_lock(): drop the page reference counted in the OSC
+ * lock slice, clear ops_lock and release the cl_lock reference. The page
+ * must currently reference a lock.
+ */
+static void osc_page_putref_lock(const struct lu_env *env,
+				 struct osc_page *opg)
+{
+	struct cl_lock  *lock = opg->ops_lock;
+	struct osc_lock *olock;
+
+	LASSERT(lock != NULL);
+	olock = osc_lock_at(lock);
+
+	atomic_dec(&olock->ols_pageref);
+	opg->ops_lock = NULL;
+
+	cl_lock_put(env, lock);
+}
+
+/**
+ * cl_page_operations::cpo_is_under_lock() method.
+ *
+ * \retval -EBUSY   a lock covering the page was found and the page now
+ *		    holds a reference on it (see osc_page_addref_lock())
+ * \retval -ENODATA no lock was found, or it could not be referenced
+ */
+static int osc_page_is_under_lock(const struct lu_env *env,
+				  const struct cl_page_slice *slice,
+				  struct cl_io *unused)
+{
+	struct cl_lock *lock;
+	int	     result = -ENODATA;
+
+	ENTRY;
+	lock = cl_lock_at_page(env, slice->cpl_obj, slice->cpl_page,
+			       NULL, 1, 0);
+	if (lock != NULL) {
+		if (osc_page_addref_lock(env, cl2osc_page(slice), lock) == 0)
+			result = -EBUSY;
+		/* drop the lookup reference; the page keeps its own, if any */
+		cl_lock_put(env, lock);
+	}
+	RETURN(result);
+}
+
+/*
+ * cl_page_operations::cpo_disown() method: release the lock reference
+ * held by the page, if it still has one.
+ */
+static void osc_page_disown(const struct lu_env *env,
+			    const struct cl_page_slice *slice,
+			    struct cl_io *io)
+{
+	struct osc_page *opg = cl2osc_page(slice);
+
+	if (unlikely(opg->ops_lock))
+		osc_page_putref_lock(env, opg);
+}
+
+/*
+ * Read completion: release the lock reference held by the page (the
+ * common case) and put the page on the client's LRU list. \a ioret is
+ * not examined.
+ */
+static void osc_page_completion_read(const struct lu_env *env,
+				     const struct cl_page_slice *slice,
+				     int ioret)
+{
+	struct osc_page   *opg = cl2osc_page(slice);
+	struct osc_object *obj = cl2osc(opg->ops_cl.cpl_obj);
+
+	if (likely(opg->ops_lock))
+		osc_page_putref_lock(env, opg);
+	osc_lru_add(osc_cli(obj), opg);
+}
+
+/*
+ * Write completion: put the page on the client's LRU list. \a ioret is
+ * not examined.
+ */
+static void osc_page_completion_write(const struct lu_env *env,
+				      const struct cl_page_slice *slice,
+				      int ioret)
+{
+	struct osc_page   *opg = cl2osc_page(slice);
+	struct osc_object *obj = cl2osc(slice->cpl_obj);
+
+	osc_lru_add(osc_cli(obj), opg);
+}
+
+/*
+ * Installed as the cache_add method for reads in osc_page_ops: adding a
+ * page to the cache for a read is not expected, so reaching this function
+ * is treated as a bug.
+ */
+static int osc_page_fail(const struct lu_env *env,
+			 const struct cl_page_slice *slice,
+			 struct cl_io *unused)
+{
+	/*
+	 * Cached read?
+	 */
+	LBUG();
+	return 0;
+}
+
+
+/* Debug helper: "-" for an empty list, "+" for a non-empty one. */
+static const char *osc_list(struct list_head *head)
+{
+	if (list_empty(head))
+		return "-";
+	return "+";
+}
+
+/*
+ * Time elapsed since the page was submitted (see osc_page_submit()), or 0
+ * if no submit time has been recorded.
+ */
+static inline cfs_time_t osc_submit_duration(struct osc_page *opg)
+{
+	return opg->ops_submit_time == 0 ?
+	       0 : cfs_time_current() - opg->ops_submit_time;
+}
+
+/**
+ * cl_page_operations::cpo_print() method.
+ *
+ * Emits one line with five numbered groups: (1) async page state,
+ * (2) transfer geometry, flags and related pointers, (3) in-flight and
+ * submission state of the osc_page, (4) RPC/grant counters and list state
+ * of the client, (5) list state and read/write counters of the object.
+ * Returns whatever \a printer returns.
+ */
+static int osc_page_print(const struct lu_env *env,
+			  const struct cl_page_slice *slice,
+			  void *cookie, lu_printer_t printer)
+{
+	struct osc_page       *opg = cl2osc_page(slice);
+	struct osc_async_page *oap = &opg->ops_oap;
+	struct osc_object     *obj = cl2osc(slice->cpl_obj);
+	struct client_obd     *cli = &osc_export(obj)->exp_obd->u.cli;
+
+	return (*printer)(env, cookie, LUSTRE_OSC_NAME"-page@%p: "
+			  "1< %#x %d %u %s %s > "
+			  "2< "LPU64" %u %u %#x %#x | %p %p %p > "
+			  "3< %s %p %d %lu %d > "
+			  "4< %d %d %d %lu %s | %s %s %s %s > "
+			  "5< %s %s %s %s | %d %s | %d %s %s>\n",
+			  opg,
+			  /* 1 */
+			  oap->oap_magic, oap->oap_cmd,
+			  oap->oap_interrupted,
+			  osc_list(&oap->oap_pending_item),
+			  osc_list(&oap->oap_rpc_item),
+			  /* 2 */
+			  oap->oap_obj_off, oap->oap_page_off, oap->oap_count,
+			  oap->oap_async_flags, oap->oap_brw_flags,
+			  oap->oap_request, oap->oap_cli, obj,
+			  /* 3 */
+			  osc_list(&opg->ops_inflight),
+			  opg->ops_submitter, opg->ops_transfer_pinned,
+			  osc_submit_duration(opg), opg->ops_srvlock,
+			  /* 4 */
+			  cli->cl_r_in_flight, cli->cl_w_in_flight,
+			  cli->cl_max_rpcs_in_flight,
+			  cli->cl_avail_grant,
+			  osc_list(&cli->cl_cache_waiters),
+			  osc_list(&cli->cl_loi_ready_list),
+			  osc_list(&cli->cl_loi_hp_ready_list),
+			  osc_list(&cli->cl_loi_write_list),
+			  osc_list(&cli->cl_loi_read_list),
+			  /* 5 */
+			  osc_list(&obj->oo_ready_item),
+			  osc_list(&obj->oo_hp_ready_item),
+			  osc_list(&obj->oo_write_item),
+			  osc_list(&obj->oo_read_item),
+			  atomic_read(&obj->oo_nr_reads),
+			  osc_list(&obj->oo_reading_exts),
+			  atomic_read(&obj->oo_nr_writes),
+			  osc_list(&obj->oo_hp_exts),
+			  osc_list(&obj->oo_urgent_exts));
+}
+
+/**
+ * cl_page_operations::cpo_delete() method.
+ *
+ * Drops the transfer pin, tears down the async page (a failure here is
+ * fatal), unlinks the page from the object's in-flight list if it was
+ * submitted, and finally removes it from the LRU accounting.
+ */
+static void osc_page_delete(const struct lu_env *env,
+			    const struct cl_page_slice *slice)
+{
+	struct osc_page   *opg = cl2osc_page(slice);
+	struct osc_object *obj = cl2osc(opg->ops_cl.cpl_obj);
+	int rc;
+
+	LINVRNT(opg->ops_temp || osc_page_protected(env, opg, CLM_READ, 1));
+
+	ENTRY;
+	CDEBUG(D_TRACE, "%p\n", opg);
+	osc_page_transfer_put(env, opg);
+	rc = osc_teardown_async_page(env, obj, opg);
+	if (rc) {
+		CL_PAGE_DEBUG(D_ERROR, env, cl_page_top(slice->cpl_page),
+			      "Trying to teardown failed: %d\n", rc);
+		LASSERT(0);
+	}
+
+	spin_lock(&obj->oo_seatbelt);
+	if (opg->ops_submitter != NULL) {
+		/* a submitter implies the page is on an in-flight list */
+		LASSERT(!list_empty(&opg->ops_inflight));
+		list_del_init(&opg->ops_inflight);
+		opg->ops_submitter = NULL;
+	}
+	spin_unlock(&obj->oo_seatbelt);
+
+	/* del == true: the page is going away, give its LRU slot back */
+	osc_lru_del(osc_cli(obj), opg, true);
+	EXIT;
+}
+
+/**
+ * cl_page_operations::cpo_clip() method: remember the [\a from, \a to)
+ * byte range of the page that takes part in the transfer and mark the
+ * async page's count as stable.
+ */
+void osc_page_clip(const struct lu_env *env, const struct cl_page_slice *slice,
+		   int from, int to)
+{
+	struct osc_page       *opg = cl2osc_page(slice);
+	struct osc_async_page *oap = &opg->ops_oap;
+
+	LINVRNT(osc_page_protected(env, opg, CLM_READ, 0));
+
+	opg->ops_from = from;
+	opg->ops_to   = to;
+	/* oap_async_flags is modified under oap_lock */
+	spin_lock(&oap->oap_lock);
+	oap->oap_async_flags |= ASYNC_COUNT_STABLE;
+	spin_unlock(&oap->oap_lock);
+}
+
+/**
+ * cl_page_operations::cpo_cancel() method.
+ *
+ * If the page is pinned for a transfer, ask osc_cancel_async_page() to
+ * cancel it and return its result; otherwise return 0. A successful
+ * return implies the page is no longer pinned.
+ */
+static int osc_page_cancel(const struct lu_env *env,
+			   const struct cl_page_slice *slice)
+{
+	struct osc_page *opg = cl2osc_page(slice);
+	int rc = 0;
+
+	LINVRNT(osc_page_protected(env, opg, CLM_READ, 0));
+
+	/* Check if the transferring against this page
+	 * is completed, or not even queued. */
+	if (opg->ops_transfer_pinned)
+		/* FIXME: may not be interrupted.. */
+		rc = osc_cancel_async_page(env, opg);
+	LASSERT(ergo(rc == 0, opg->ops_transfer_pinned == 0));
+	return rc;
+}
+
+/*
+ * cl_page_operations::cpo_flush() method: hand the page over to
+ * osc_flush_async_page() and return its result.
+ */
+static int osc_page_flush(const struct lu_env *env,
+			  const struct cl_page_slice *slice,
+			  struct cl_io *io)
+{
+	struct osc_page *opg = cl2osc_page(slice);
+	int rc;
+	ENTRY;
+
+	rc = osc_flush_async_page(env, io, opg);
+
+	RETURN(rc);
+}
+
+/*
+ * cl_page method table attached to every OSC page slice by
+ * osc_page_init(). Adding a page to the cache is only supported for
+ * writes; the read slot points at osc_page_fail().
+ */
+static const struct cl_page_operations osc_page_ops = {
+	.cpo_fini	  = osc_page_fini,
+	.cpo_print	 = osc_page_print,
+	.cpo_delete	= osc_page_delete,
+	.cpo_is_under_lock = osc_page_is_under_lock,
+	.cpo_disown	= osc_page_disown,
+	.io = {
+		[CRT_READ] = {
+			.cpo_cache_add  = osc_page_fail,
+			.cpo_completion = osc_page_completion_read
+		},
+		[CRT_WRITE] = {
+			.cpo_cache_add  = osc_page_cache_add,
+			.cpo_completion = osc_page_completion_write
+		}
+	},
+	.cpo_clip	   = osc_page_clip,
+	.cpo_cancel	 = osc_page_cancel,
+	.cpo_flush	  = osc_page_flush
+};
+
+/**
+ * cl_object_operations::coo_page_init() method of the OSC layer.
+ *
+ * Initializes the OSC slice of \a page: the transfer range defaults to
+ * the whole page, the async page is prepared and, on success, the slice is
+ * added to the page. Cacheable pages additionally reserve an LRU slot.
+ *
+ * \retval 0 on success
+ * \retval the non-zero result of osc_prep_async_page() or
+ *	   osc_lru_reserve() otherwise
+ */
+int osc_page_init(const struct lu_env *env, struct cl_object *obj,
+		struct cl_page *page, struct page *vmpage)
+{
+	struct osc_object *osc = cl2osc(obj);
+	struct osc_page   *opg = cl_object_page_slice(obj, page);
+	int result;
+
+	opg->ops_from = 0;
+	opg->ops_to   = PAGE_CACHE_SIZE;
+
+	result = osc_prep_async_page(osc, opg, vmpage,
+					cl_offset(obj, page->cp_index));
+	if (result == 0) {
+		struct osc_io *oio = osc_env_io(env);
+		opg->ops_srvlock = osc_io_srvlock(oio);
+		cl_page_slice_add(page, &opg->ops_cl, obj,
+				&osc_page_ops);
+	}
+	/*
+	 * Cannot assert osc_page_protected() here as read-ahead
+	 * creates temporary pages outside of a lock.
+	 */
+	/* ops_inflight and ops_lru are the same field, but it doesn't
+	 * hurt to initialize it twice :-) */
+	INIT_LIST_HEAD(&opg->ops_inflight);
+	INIT_LIST_HEAD(&opg->ops_lru);
+
+	/* reserve an LRU space for this page */
+	if (page->cp_type == CPT_CACHEABLE && result == 0)
+		result = osc_lru_reserve(env, osc, opg);
+
+	return result;
+}
+
+/**
+ * Helper function called by osc_io_submit() for every page in an immediate
+ * transfer (i.e., transferred synchronously).
+ */
+void osc_page_submit(const struct lu_env *env, struct osc_page *opg,
+		     enum cl_req_type crt, int brw_flags)
+{
+	struct osc_async_page *oap = &opg->ops_oap;
+	struct osc_object     *obj = oap->oap_obj;
+
+	LINVRNT(osc_page_protected(env, opg,
+				   crt == CRT_WRITE ? CLM_WRITE : CLM_READ, 1));
+
+	/* the page must be a valid, ready async page with a fixed range */
+	LASSERTF(oap->oap_magic == OAP_MAGIC, "Bad oap magic: oap %p, "
+		 "magic 0x%x\n", oap, oap->oap_magic);
+	LASSERT(oap->oap_async_flags & ASYNC_READY);
+	LASSERT(oap->oap_async_flags & ASYNC_COUNT_STABLE);
+
+	/* describe the transfer using the range set by osc_page_clip() */
+	oap->oap_cmd       = crt == CRT_WRITE ? OBD_BRW_WRITE : OBD_BRW_READ;
+	oap->oap_page_off  = opg->ops_from;
+	oap->oap_count     = opg->ops_to - opg->ops_from;
+	oap->oap_brw_flags = OBD_BRW_SYNC | brw_flags;
+
+	/* local clients holding CFS_CAP_SYS_RESOURCE are flagged NOQUOTA */
+	if (!client_is_remote(osc_export(obj)) &&
+			cfs_capable(CFS_CAP_SYS_RESOURCE)) {
+		oap->oap_brw_flags |= OBD_BRW_NOQUOTA;
+		oap->oap_cmd |= OBD_BRW_NOQUOTA;
+	}
+
+	opg->ops_submit_time = cfs_time_current();
+	osc_page_transfer_get(opg, "transfer\0imm");
+	osc_page_transfer_add(env, opg, crt);
+}
+
+/* --------------- LRU page management ------------------ */
+
+/* OSC is a natural place to manage LRU pages as applications are specialized
+ * to write OSC by OSC. Ideally, if one OSC is used more frequently it should
+ * occupy more LRU slots. On the other hand, we should avoid using up all LRU
+ * slots (client_obd::cl_lru_left) otherwise process has to be put into sleep
+ * for free LRU slots - this will be very bad so the algorithm requires each
+ * OSC to free slots voluntarily to maintain a reasonable number of free slots
+ * at any time.
+ */
+
+/* Threads waiting for a free LRU slot sleep here (see osc_lru_reserve()). */
+static CFS_DECL_WAITQ(osc_lru_waitq);
+/* Number of threads currently waiting on osc_lru_waitq. */
+static atomic_t osc_lru_waiters = ATOMIC_INIT(0);
+/* LRU pages are freed in batch mode. OSC should at least free this
+ * number of pages to avoid running out of LRU budget, and.. */
+static const int lru_shrink_min = 2 << (20 - PAGE_CACHE_SHIFT);  /* 2M */
+/* free this number at most otherwise it will take too long to finish. */
+static const int lru_shrink_max = 32 << (20 - PAGE_CACHE_SHIFT); /* 32M */
+
+/* Check if we can free LRU slots from this OSC. If there exists LRU waiters,
+ * we should free slots aggressively. In this way, slots are freed in a steady
+ * step to maintain fairness among OSCs.
+ *
+ * Return how many LRU pages should be freed. */
+static int osc_cache_too_much(struct client_obd *cli)
+{
+	struct cl_client_cache *cache = cli->cl_cache;
+	/* never propose more than half of the pages on this OSC's LRU */
+	int pages = atomic_read(&cli->cl_lru_in_list) >> 1;
+
+	if (atomic_read(&osc_lru_waiters) > 0 &&
+	    atomic_read(cli->cl_lru_left) < lru_shrink_max)
+		/* drop lru pages aggressively */
+		return min(pages, lru_shrink_max);
+
+	/* if it's going to run out LRU slots, we should free some, but not
+	 * too much to maintain fairness among OSCs. */
+	if (atomic_read(cli->cl_lru_left) < cache->ccc_lru_max >> 4) {
+		unsigned long tmp;
+
+		/* equal share of the LRU budget per cache user */
+		tmp = cache->ccc_lru_max / atomic_read(&cache->ccc_users);
+		if (pages > tmp)
+			return min(pages, lru_shrink_max);
+
+		return pages > lru_shrink_min ? lru_shrink_min : 0;
+	}
+
+	return 0;
+}
+
+/* Return how many pages are not discarded in @pvec. */
+static int discard_pagevec(const struct lu_env *env, struct cl_io *io,
+			   struct cl_page **pvec, int max_index)
+{
+	int count;
+	int i;
+
+	/* count is the number of pages actually discarded */
+	for (count = 0, i = 0; i < max_index; i++) {
+		struct cl_page *page = pvec[i];
+		if (cl_page_own_try(env, io, page) == 0) {
+			/* free LRU page only if nobody is using it.
+			 * This check is necessary to avoid freeing the pages
+			 * having already been removed from LRU and pinned
+			 * for IO. */
+			if (!cl_page_in_use(page)) {
+				cl_page_unmap(env, io, page);
+				cl_page_discard(env, io, page);
+				++count;
+			}
+			cl_page_disown(env, io, page);
+		}
+		/* drop the reference and clear the slot in every case */
+		cl_page_put(env, page);
+		pvec[i] = NULL;
+	}
+	return max_index - count;
+}
+
+/**
+ * Drop @target of pages from LRU at most.
+ */
+int osc_lru_shrink(struct client_obd *cli, int target)
+{
+	struct cl_env_nest nest;
+	struct lu_env *env;
+	struct cl_io *io;
+	struct cl_object *clobj = NULL;
+	struct cl_page **pvec;
+	struct osc_page *opg;
+	int maxscan = 0;
+	int count = 0;
+	int index = 0;
+	int rc = 0;
+	ENTRY;
+
+	LASSERT(atomic_read(&cli->cl_lru_in_list) >= 0);
+	if (atomic_read(&cli->cl_lru_in_list) == 0 || target <= 0)
+		RETURN(0);
+
+	env = cl_env_nested_get(&nest);
+	if (IS_ERR(env))
+		RETURN(PTR_ERR(env));
+
+	/* page vector and io are scratch space from the thread environment */
+	pvec = osc_env_info(env)->oti_pvec;
+	io = &osc_env_info(env)->oti_io;
+
+	client_obd_list_lock(&cli->cl_lru_list_lock);
+	atomic_inc(&cli->cl_lru_shrinkers);
+	/* look at no more than twice the target, bounded by the list length */
+	maxscan = min(target << 1, atomic_read(&cli->cl_lru_in_list));
+	while (!list_empty(&cli->cl_lru_list)) {
+		struct cl_page *page;
+
+		if (--maxscan < 0)
+			break;
+
+		opg = list_entry(cli->cl_lru_list.next, struct osc_page,
+				     ops_lru);
+		page = cl_page_top(opg->ops_cl.cpl_page);
+		if (cl_page_in_use_noref(page)) {
+			/* busy page: rotate it to the tail and move on */
+			list_move_tail(&opg->ops_lru, &cli->cl_lru_list);
+			continue;
+		}
+
+		LASSERT(page->cp_obj != NULL);
+		if (clobj != page->cp_obj) {
+			/*
+			 * The page belongs to a different object than the one
+			 * the io is set up for: flush what was gathered for
+			 * the previous object and start an io on the new one.
+			 * The list lock is dropped meanwhile, so the same
+			 * page is examined again afterwards.
+			 */
+			struct cl_object *tmp = page->cp_obj;
+
+			cl_object_get(tmp);
+			client_obd_list_unlock(&cli->cl_lru_list_lock);
+
+			if (clobj != NULL) {
+				count -= discard_pagevec(env, io, pvec, index);
+				index = 0;
+
+				cl_io_fini(env, io);
+				cl_object_put(env, clobj);
+				clobj = NULL;
+			}
+
+			clobj = tmp;
+			io->ci_obj = clobj;
+			io->ci_ignore_layout = 1;
+			rc = cl_io_init(env, io, CIT_MISC, clobj);
+
+			client_obd_list_lock(&cli->cl_lru_list_lock);
+
+			if (rc != 0)
+				break;
+
+			/* this iteration did not consume a scan slot */
+			++maxscan;
+			continue;
+		}
+
+		/* move this page to the end of list as it will be discarded
+		 * soon. The page will be finally removed from LRU list in
+		 * osc_page_delete().  */
+		list_move_tail(&opg->ops_lru, &cli->cl_lru_list);
+
+		/* it's okay to grab a refcount here w/o holding lock because
+		 * it has to grab cl_lru_list_lock to delete the page. */
+		cl_page_get(page);
+		pvec[index++] = page;
+		if (++count >= target)
+			break;
+
+		if (unlikely(index == OTI_PVEC_SIZE)) {
+			/* the vector is full: discard it without the lock */
+			client_obd_list_unlock(&cli->cl_lru_list_lock);
+			count -= discard_pagevec(env, io, pvec, index);
+			index = 0;
+
+			client_obd_list_lock(&cli->cl_lru_list_lock);
+		}
+	}
+	client_obd_list_unlock(&cli->cl_lru_list_lock);
+
+	if (clobj != NULL) {
+		/* discard the remainder and finish the last io */
+		count -= discard_pagevec(env, io, pvec, index);
+
+		cl_io_fini(env, io);
+		cl_object_put(env, clobj);
+	}
+	cl_env_nested_put(&nest, env);
+
+	atomic_dec(&cli->cl_lru_shrinkers);
+	/* number of pages discarded, else the last cl_io_init() result */
+	RETURN(count > 0 ? count : rc);
+}
+
+/*
+ * Put \a opg on the LRU list of \a cli once its transfer has completed.
+ * Pages that never reserved an LRU slot (ops_in_lru unset) are ignored.
+ * If other threads are waiting for LRU slots, shrink this OSC's LRU and
+ * wake them up.
+ */
+static void osc_lru_add(struct client_obd *cli, struct osc_page *opg)
+{
+	bool wakeup = false;
+
+	if (!opg->ops_in_lru)
+		return;
+
+	/* the page stops being busy whether or not it gets linked below */
+	atomic_dec(&cli->cl_lru_busy);
+	client_obd_list_lock(&cli->cl_lru_list_lock);
+	if (list_empty(&opg->ops_lru)) {
+		list_move_tail(&opg->ops_lru, &cli->cl_lru_list);
+		atomic_inc_return(&cli->cl_lru_in_list);
+		wakeup = atomic_read(&osc_lru_waiters) > 0;
+	}
+	client_obd_list_unlock(&cli->cl_lru_list_lock);
+
+	if (wakeup) {
+		osc_lru_shrink(cli, osc_cache_too_much(cli));
+		wake_up_all(&osc_lru_waitq);
+	}
+}
+
+/* Delete a page from the LRU list. A page can be deleted from the LRU list
+ * for two reasons: it was redirtied, or it was deleted from the page cache. */
+static void osc_lru_del(struct client_obd *cli, struct osc_page *opg, bool del)
+{
+	if (opg->ops_in_lru) {
+		client_obd_list_lock(&cli->cl_lru_list_lock);
+		if (!list_empty(&opg->ops_lru)) {
+			/* on the LRU list: unlink it; when the page is only
+			 * being reused (!del) it becomes busy again */
+			LASSERT(atomic_read(&cli->cl_lru_in_list) > 0);
+			list_del_init(&opg->ops_lru);
+			atomic_dec(&cli->cl_lru_in_list);
+			if (!del)
+				atomic_inc(&cli->cl_lru_busy);
+		} else if (del) {
+			/* not on the list, so it was accounted as busy */
+			LASSERT(atomic_read(&cli->cl_lru_busy) > 0);
+			atomic_dec(&cli->cl_lru_busy);
+		}
+		client_obd_list_unlock(&cli->cl_lru_list_lock);
+		if (del) {
+			/* the slot reserved for this page is free again */
+			atomic_inc(cli->cl_lru_left);
+			/* this is a great place to release more LRU pages if
+			 * this osc occupies too many LRU pages and kernel is
+			 * stealing one of them.
+			 * cl_lru_shrinkers is to avoid recursive call in case
+			 * we're already in the context of osc_lru_shrink(). */
+			if (atomic_read(&cli->cl_lru_shrinkers) == 0 &&
+			    !memory_pressure_get())
+				osc_lru_shrink(cli, osc_cache_too_much(cli));
+			wake_up(&osc_lru_waitq);
+		}
+	} else {
+		LASSERT(list_empty(&opg->ops_lru));
+	}
+}
+
+/* Half of the pages on the LRU list of \a cli, capped at lru_shrink_max. */
+static inline int max_to_shrink(struct client_obd *cli)
+{
+	return min(atomic_read(&cli->cl_lru_in_list) >> 1, lru_shrink_max);
+}
+
+/**
+ * Try to free LRU slots, first from \a cli itself and, if that frees
+ * nothing, from the other client_obds sharing the same cache.
+ *
+ * Returns the first non-zero result of osc_lru_shrink(), or the last
+ * result if every attempt returned 0.
+ */
+static int osc_lru_reclaim(struct client_obd *cli)
+{
+	struct cl_client_cache *cache = cli->cl_cache;
+	int max_scans;
+	int rc;
+
+	LASSERT(cache != NULL);
+	LASSERT(!list_empty(&cache->ccc_lru));
+
+	rc = osc_lru_shrink(cli, lru_shrink_min);
+	if (rc != 0) {
+		CDEBUG(D_CACHE, "%s: Free %d pages from own LRU: %p.\n",
+			cli->cl_import->imp_obd->obd_name, rc, cli);
+		return rc;
+	}
+
+	CDEBUG(D_CACHE, "%s: cli %p no free slots, pages: %d, busy: %d.\n",
+		cli->cl_import->imp_obd->obd_name, cli,
+		atomic_read(&cli->cl_lru_in_list),
+		atomic_read(&cli->cl_lru_busy));
+
+	/* Reclaim LRU slots from other client_obd as it can't free enough
+	 * from its own. This should rarely happen. */
+	spin_lock(&cache->ccc_lru_lock);
+	cache->ccc_lru_shrinkers++;
+	/* move ourselves to the tail so the scan starts with the others */
+	list_move_tail(&cli->cl_lru_osc, &cache->ccc_lru);
+
+	max_scans = atomic_read(&cache->ccc_users);
+	while (--max_scans > 0 && !list_empty(&cache->ccc_lru)) {
+		/* NOTE: cli is reused for the client_obd being scanned */
+		cli = list_entry(cache->ccc_lru.next, struct client_obd,
+					cl_lru_osc);
+
+		CDEBUG(D_CACHE, "%s: cli %p LRU pages: %d, busy: %d.\n",
+			cli->cl_import->imp_obd->obd_name, cli,
+			atomic_read(&cli->cl_lru_in_list),
+			atomic_read(&cli->cl_lru_busy));
+
+		list_move_tail(&cli->cl_lru_osc, &cache->ccc_lru);
+		if (atomic_read(&cli->cl_lru_in_list) > 0) {
+			/* shrinking is done without ccc_lru_lock held */
+			spin_unlock(&cache->ccc_lru_lock);
+
+			rc = osc_lru_shrink(cli, max_to_shrink(cli));
+			spin_lock(&cache->ccc_lru_lock);
+			if (rc != 0)
+				break;
+		}
+	}
+	spin_unlock(&cache->ccc_lru_lock);
+
+	/* cli here is the last client_obd scanned, not the caller's */
+	CDEBUG(D_CACHE, "%s: cli %p freed %d pages.\n",
+		cli->cl_import->imp_obd->obd_name, cli, rc);
+	return rc;
+}
+
+/**
+ * Reserve one LRU slot for \a opg.
+ *
+ * Does nothing when the client has no cache. Otherwise a slot is taken
+ * from cl_lru_left; while none is available the function reclaims slots
+ * and, failing that, sleeps until a slot is freed or the LRU list of this
+ * client changes.
+ *
+ * \retval 0   a slot was reserved (or no cache is in use); in the former
+ *	       case the page is accounted as busy and marked ops_in_lru
+ * \retval < 0 reclaiming failed or the wait returned an error
+ */
+static int osc_lru_reserve(const struct lu_env *env, struct osc_object *obj,
+			   struct osc_page *opg)
+{
+	struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL);
+	struct client_obd *cli = osc_cli(obj);
+	int rc = 0;
+	ENTRY;
+
+	if (cli->cl_cache == NULL) /* shall not be in LRU */
+		RETURN(0);
+
+	LASSERT(atomic_read(cli->cl_lru_left) >= 0);
+	/* presumably decrements cl_lru_left unless it is already 0 */
+	while (!cfs_atomic_add_unless(cli->cl_lru_left, -1, 0)) {
+		int gen;
+
+		/* run out of LRU spaces, try to drop some by itself */
+		rc = osc_lru_reclaim(cli);
+		if (rc < 0)
+			break;
+		if (rc > 0)
+			continue;
+
+		cond_resched();
+
+		/* slowest case, all of caching pages are busy, notifying
+		 * other OSCs that we're lack of LRU slots. */
+		atomic_inc(&osc_lru_waiters);
+
+		/* wake up when a slot is free or our LRU list has changed */
+		gen = atomic_read(&cli->cl_lru_in_list);
+		rc = l_wait_event(osc_lru_waitq,
+				atomic_read(cli->cl_lru_left) > 0 ||
+				(atomic_read(&cli->cl_lru_in_list) > 0 &&
+				 gen != atomic_read(&cli->cl_lru_in_list)),
+				&lwi);
+
+		atomic_dec(&osc_lru_waiters);
+		if (rc < 0)
+			break;
+	}
+
+	if (rc >= 0) {
+		atomic_inc(&cli->cl_lru_busy);
+		opg->ops_in_lru = 1;
+		rc = 0;
+	}
+
+	RETURN(rc);
+}
+
+/** @} osc */

diff --git a/drivers/staging/lustre/lustre/osc/osc_quota.c b/drivers/staging/lustre/lustre/osc/osc_quota.c
new file mode 100644
index 0000000..69caab7
--- /dev/null
+++ b/drivers/staging/lustre/lustre/osc/osc_quota.c

@@ -0,0 +1,332 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ *
+ * Code originally extracted from quota directory
+ */
+
+#include <obd_ost.h>
+#include "osc_internal.h"
+
+/* Allocate a quota-info entry from osc_quota_kmem and record \a id in it.
+ * Returns NULL when the slab allocation fails. */
+static inline struct osc_quota_info *osc_oqi_alloc(obd_uid id)
+{
+	struct osc_quota_info *entry;
+
+	OBD_SLAB_ALLOC_PTR(entry, osc_quota_kmem);
+	if (entry == NULL)
+		return NULL;
+
+	entry->oqi_id = id;
+	return entry;
+}
+
+/**
+ * Check whether any ID in \a qid is currently recorded in the client's
+ * per-type quota hash tables.
+ *
+ * \param cli	client obd whose cl_quota_hash[] tables are searched
+ * \param qid	IDs indexed by quota type; MAXQUOTAS entries are read
+ *
+ * \retval NO_QUOTA	an entry was found for one of the IDs
+ * \retval QUOTA_OK	none of the IDs is present
+ */
+int osc_quota_chkdq(struct client_obd *cli, const unsigned int qid[])
+{
+	int type;
+	ENTRY;
+
+	for (type = 0; type < MAXQUOTAS; type++) {
+		struct osc_quota_info *oqi;
+
+		oqi = cfs_hash_lookup(cli->cl_quota_hash[type], &qid[type]);
+		if (oqi) {
+			obd_uid id = oqi->oqi_id;
+
+			LASSERTF(id == qid[type],
+				 "The ids don't match %u != %u\n",
+				 id, qid[type]);
+
+			/* the slot is busy, the user is about to run out of
+			 * quota space on this OST */
+			CDEBUG(D_QUOTA, "chkdq found noquota for %s %u\n",
+			       type == USRQUOTA ? "user" : "group", qid[type]);
+			RETURN(NO_QUOTA);
+		}
+	}
+
+	RETURN(QUOTA_OK);
+}
+
+/* Map a quota type to the matching OBD_MD_FL*QUOTA / OBD_FL_NO_*QUOTA bit.
+ * Any type other than USRQUOTA selects the group variant. The argument is
+ * parenthesized so that compound expressions expand safely. */
+#define MD_QUOTA_FLAG(type) (((type) == USRQUOTA) ? OBD_MD_FLUSRQUOTA \
+						   : OBD_MD_FLGRPQUOTA)
+#define FL_QUOTA_FLAG(type) (((type) == USRQUOTA) ? OBD_FL_NO_USRQUOTA \
+						   : OBD_FL_NO_GRPQUOTA)
+
+/**
+ * Update the client's per-type quota hash tables from a set of flags.
+ *
+ * For every quota type whose MD_QUOTA_FLAG() bit is set in \a valid: if the
+ * FL_QUOTA_FLAG() bit is set in \a flags the ID is inserted into the hash
+ * (unless already present), otherwise it is removed (if present).
+ *
+ * \param cli	client obd owning cl_quota_hash[]
+ * \param qid	IDs indexed by quota type
+ * \param valid	selects which quota types are processed
+ * \param flags	per-type "no quota" bits
+ *
+ * \retval 0		success, or nothing to do
+ * \retval -ENOMEM	a new entry could not be allocated; remaining types
+ *			are not processed
+ */
+int osc_quota_setdq(struct client_obd *cli, const unsigned int qid[],
+		    obd_flag valid, obd_flag flags)
+{
+	int type;
+	int rc = 0;
+	ENTRY;
+
+	if ((valid & (OBD_MD_FLUSRQUOTA | OBD_MD_FLGRPQUOTA)) == 0)
+		RETURN(0);
+
+	for (type = 0; type < MAXQUOTAS; type++) {
+		struct osc_quota_info *oqi;
+
+		if ((valid & MD_QUOTA_FLAG(type)) == 0)
+			continue;
+
+		/* lookup the ID in the per-type hash table */
+		oqi = cfs_hash_lookup(cli->cl_quota_hash[type], &qid[type]);
+		if ((flags & FL_QUOTA_FLAG(type)) != 0) {
+			/* This ID is getting close to its quota limit, let's
+			 * switch to sync I/O */
+			if (oqi != NULL)
+				continue;
+
+			oqi = osc_oqi_alloc(qid[type]);
+			if (oqi == NULL) {
+				rc = -ENOMEM;
+				break;
+			}
+
+			rc = cfs_hash_add_unique(cli->cl_quota_hash[type],
+						 &qid[type], &oqi->oqi_hash);
+			/* race with others? */
+			if (rc == -EALREADY) {
+				/* someone else inserted it first: not an
+				 * error, just drop our unused entry */
+				rc = 0;
+				OBD_SLAB_FREE_PTR(oqi, osc_quota_kmem);
+			}
+
+			CDEBUG(D_QUOTA, "%s: setdq to insert for %s %d (%d)\n",
+			       cli->cl_import->imp_obd->obd_name,
+			       type == USRQUOTA ? "user" : "group",
+			       qid[type], rc);
+		} else {
+			/* This ID is now off the hook, let's remove it from
+			 * the hash table */
+			if (oqi == NULL)
+				continue;
+
+			/* the entry may already be gone by now, hence the
+			 * NULL check on the returned object */
+			oqi = cfs_hash_del_key(cli->cl_quota_hash[type],
+					       &qid[type]);
+			if (oqi)
+				OBD_SLAB_FREE_PTR(oqi, osc_quota_kmem);
+
+			/* oqi is only printed as a pointer value below */
+			CDEBUG(D_QUOTA, "%s: setdq to remove for %s %d (%p)\n",
+			       cli->cl_import->imp_obd->obd_name,
+			       type == USRQUOTA ? "user" : "group",
+			       qid[type], oqi);
+		}
+	}
+
+	RETURN(rc);
+}
+
+/*
+ * Hash operations for uid/gid <-> osc_quota_info
+ */
+/* Hash callback: the key is a pointer to a 32-bit ID, hashed with
+ * cfs_hash_u32_hash() under \a mask. */
+static unsigned
+oqi_hashfn(cfs_hash_t *hs, const void *key, unsigned mask)
+{
+	__u32 id = *((__u32 *)key);
+
+	return cfs_hash_u32_hash(id, mask);
+}
+
+/* Key-compare callback: non-zero when the ID pointed to by \a key equals
+ * the oqi_id of the entry embedding \a hnode. */
+static int
+oqi_keycmp(const void *key, struct hlist_node *hnode)
+{
+	struct osc_quota_info *entry;
+
+	LASSERT(key != NULL);
+	entry = hlist_entry(hnode, struct osc_quota_info, oqi_hash);
+
+	return *((obd_uid *)key) == entry->oqi_id;
+}
+
+/* Key callback: return the address of the ID stored in the entry that
+ * embeds \a hnode. */
+static void *
+oqi_key(struct hlist_node *hnode)
+{
+	struct osc_quota_info *entry =
+		hlist_entry(hnode, struct osc_quota_info, oqi_hash);
+
+	return &entry->oqi_id;
+}
+
+/* Object callback: map a hash node back to its osc_quota_info. */
+static void *
+oqi_object(struct hlist_node *hnode)
+{
+	struct osc_quota_info *entry;
+
+	entry = hlist_entry(hnode, struct osc_quota_info, oqi_hash);
+	return entry;
+}
+
+/* "get" callback for quota_hash_ops: intentionally empty, so no reference
+ * counting is performed on osc_quota_info entries. */
+static void
+oqi_get(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+}
+
+/* "put" callback for quota_hash_ops: intentionally empty, the counterpart
+ * of the equally empty oqi_get(). */
+static void
+oqi_put_locked(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+}
+
+/* Exit callback: give the entry embedding \a hnode back to the
+ * osc_quota_kmem slab. */
+static void
+oqi_exit(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+	struct osc_quota_info *entry =
+		hlist_entry(hnode, struct osc_quota_info, oqi_hash);
+
+	OBD_SLAB_FREE_PTR(entry, osc_quota_kmem);
+}
+
+/* Sizing parameters handed to cfs_hash_create() in osc_quota_setup(), in
+ * the order CUR, MAX, BKT. */
+#define HASH_QUOTA_BKT_BITS 5
+#define HASH_QUOTA_CUR_BITS 5
+#define HASH_QUOTA_MAX_BITS 15
+
+/* Callback table used for every per-type quota hash created in
+ * osc_quota_setup(). The get/put callbacks are empty; entries are freed
+ * through oqi_exit(). */
+static cfs_hash_ops_t quota_hash_ops = {
+	.hs_hash	= oqi_hashfn,
+	.hs_keycmp	= oqi_keycmp,
+	.hs_key		= oqi_key,
+	.hs_object	= oqi_object,
+	.hs_get		= oqi_get,
+	.hs_put_locked	= oqi_put_locked,
+	.hs_exit	= oqi_exit,
+};
+
+/**
+ * Create one quota hash table per quota type for the client of \a obd.
+ *
+ * \retval 0		all MAXQUOTAS tables were created
+ * \retval -ENOMEM	a table could not be created; the tables created
+ *			before it have had their reference dropped
+ */
+int osc_quota_setup(struct obd_device *obd)
+{
+	struct client_obd *cli = &obd->u.cli;
+	int created, type;
+	ENTRY;
+
+	for (type = 0; type < MAXQUOTAS; type++) {
+		cli->cl_quota_hash[type] = cfs_hash_create("QUOTA_HASH",
+							   HASH_QUOTA_CUR_BITS,
+							   HASH_QUOTA_MAX_BITS,
+							   HASH_QUOTA_BKT_BITS,
+							   0,
+							   CFS_HASH_MIN_THETA,
+							   CFS_HASH_MAX_THETA,
+							   &quota_hash_ops,
+							   CFS_HASH_DEFAULT);
+		if (cli->cl_quota_hash[type] == NULL)
+			goto unwind;
+	}
+
+	RETURN(0);
+
+unwind:
+	/* 'type' is the index that failed: release everything before it */
+	for (created = 0; created < type; created++)
+		cfs_hash_putref(cli->cl_quota_hash[created]);
+
+	RETURN(-ENOMEM);
+}
+
+/* Drop the reference on every per-type quota hash table of \a obd's
+ * client. Always returns 0. */
+int osc_quota_cleanup(struct obd_device *obd)
+{
+	struct client_obd *cli = &obd->u.cli;
+	int type = 0;
+	ENTRY;
+
+	while (type < MAXQUOTAS) {
+		cfs_hash_putref(cli->cl_quota_hash[type]);
+		type++;
+	}
+
+	RETURN(0);
+}
+
+/**
+ * Send an OST_QUOTACTL request carrying \a oqctl and wait for the reply.
+ *
+ * When a reply message is present and can be unpacked, its contents are
+ * copied back into \a oqctl even if the RPC itself returned an error.
+ *
+ * \retval 0		success
+ * \retval -ENOMEM	the request could not be allocated
+ * \retval -EPROTO	the RPC succeeded but the reply could not be unpacked
+ * \retval other	error returned by ptlrpc_queue_wait()
+ */
+int osc_quotactl(struct obd_device *unused, struct obd_export *exp,
+		 struct obd_quotactl *oqctl)
+{
+	struct ptlrpc_request *req;
+	struct obd_quotactl *reply = NULL;
+	struct obd_quotactl *oqc;
+	int rc;
+	ENTRY;
+
+	req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp),
+					&RQF_OST_QUOTACTL, LUSTRE_OST_VERSION,
+					OST_QUOTACTL);
+	if (req == NULL)
+		RETURN(-ENOMEM);
+
+	oqc = req_capsule_client_get(&req->rq_pill, &RMF_OBD_QUOTACTL);
+	*oqc = *oqctl;
+
+	ptlrpc_request_set_replen(req);
+	ptlrpc_at_set_req_timeout(req);
+	req->rq_no_resend = 1;
+
+	rc = ptlrpc_queue_wait(req);
+	if (rc)
+		CERROR("ptlrpc_queue_wait failed, rc: %d\n", rc);
+
+	/* only look into the reply capsule when a reply message exists */
+	if (req->rq_repmsg)
+		reply = req_capsule_server_get(&req->rq_pill,
+					       &RMF_OBD_QUOTACTL);
+
+	if (reply) {
+		*oqctl = *reply;
+	} else if (!rc) {
+		CERROR ("Can't unpack obd_quotactl\n");
+		rc = -EPROTO;
+	}
+	ptlrpc_req_finished(req);
+
+	RETURN(rc);
+}
+
+/**
+ * Send an OST_QUOTACHECK request carrying \a oqctl and wait for it.
+ *
+ * cli->cl_qchk_stat is set to -ENODATA before the RPC is queued and is
+ * overwritten with the RPC error if the wait fails; it is left at -ENODATA
+ * when the RPC succeeds.
+ *
+ * \retval 0		RPC completed successfully
+ * \retval -ENOMEM	the request could not be allocated
+ * \retval other	error returned by ptlrpc_queue_wait()
+ */
+int osc_quotacheck(struct obd_device *unused, struct obd_export *exp,
+		   struct obd_quotactl *oqctl)
+{
+	struct client_obd       *cli = &exp->exp_obd->u.cli;
+	struct ptlrpc_request   *req;
+	struct obd_quotactl     *body;
+	int		      rc;
+	ENTRY;
+
+	req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp),
+					&RQF_OST_QUOTACHECK, LUSTRE_OST_VERSION,
+					OST_QUOTACHECK);
+	if (req == NULL)
+		RETURN(-ENOMEM);
+
+	body = req_capsule_client_get(&req->rq_pill, &RMF_OBD_QUOTACTL);
+	*body = *oqctl;
+
+	ptlrpc_request_set_replen(req);
+
+	/* the next poll will find -ENODATA, that means quotacheck is
+	 * going on */
+	cli->cl_qchk_stat = -ENODATA;
+	rc = ptlrpc_queue_wait(req);
+	if (rc)
+		cli->cl_qchk_stat = rc;
+	ptlrpc_req_finished(req);
+	RETURN(rc);
+}
+
+/**
+ * Report the recorded quotacheck status of the client behind \a exp.
+ *
+ * Fills \a qchk with the client's target UUID and the OST type name, then
+ * returns cl_qchk_stat, with CL_NOT_QUOTACHECKED translated to -EINTR.
+ *
+ * NOTE(review): only strlen(LUSTRE_OST_NAME) bytes are copied, i.e. no
+ * terminating NUL; presumably the caller zeroes qchk->obd_type beforehand,
+ * confirm.
+ */
+int osc_quota_poll_check(struct obd_export *exp, struct if_quotacheck *qchk)
+{
+	struct client_obd *cli = &exp->exp_obd->u.cli;
+	int stat;
+	ENTRY;
+
+	qchk->obd_uuid = cli->cl_target_uuid;
+	memcpy(qchk->obd_type, LUSTRE_OST_NAME, strlen(LUSTRE_OST_NAME));
+
+	stat = cli->cl_qchk_stat;
+	/* the client is not the previous one */
+	RETURN(stat == CL_NOT_QUOTACHECKED ? -EINTR : stat);
+}

diff --git a/drivers/staging/lustre/lustre/osc/osc_request.c b/drivers/staging/lustre/lustre/osc/osc_request.c
new file mode 100644
index 0000000..53d6a35
--- /dev/null
+++ b/drivers/staging/lustre/lustre/osc/osc_request.c

@@ -0,0 +1,3708 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_OSC
+
+#include <linux/libcfs/libcfs.h>
+
+
+#include <lustre_dlm.h>
+#include <lustre_net.h>
+#include <lustre/lustre_user.h>
+#include <obd_cksum.h>
+#include <obd_ost.h>
+#include <obd_lov.h>
+
+#ifdef  __CYGWIN__
+# include <ctype.h>
+#endif
+
+#include <lustre_ha.h>
+#include <lprocfs_status.h>
+#include <lustre_log.h>
+#include <lustre_debug.h>
+#include <lustre_param.h>
+#include <lustre_fid.h>
+#include "osc_internal.h"
+#include "osc_cl_internal.h"
+
+static void osc_release_ppga(struct brw_page **ppga, obd_count count);
+static int brw_interpret(const struct lu_env *env,
+			 struct ptlrpc_request *req, void *data, int rc);
+int osc_cleanup(struct obd_device *obd);
+
+/*
+ * Pack OSC object metadata for disk storage (LE byte order).
+ *
+ * - lmmp == NULL: only report the packed size.
+ * - *lmmp set and lsm == NULL: free *lmmp, clear it and return 0.
+ * - otherwise: allocate *lmmp if needed and, when lsm is given, store its
+ *   object id in little-endian form.
+ *
+ * Returns the packed size, 0 after a free, -EBADF when lsm carries a zero
+ * object id, or -ENOMEM when the allocation fails.
+ */
+static int osc_packmd(struct obd_export *exp, struct lov_mds_md **lmmp,
+		      struct lov_stripe_md *lsm)
+{
+	int lmm_size = sizeof(**lmmp);
+	ENTRY;
+
+	if (lmmp == NULL)
+		RETURN(lmm_size);
+
+	if (*lmmp != NULL && lsm == NULL) {
+		OBD_FREE(*lmmp, lmm_size);
+		*lmmp = NULL;
+		RETURN(0);
+	}
+
+	if (unlikely(lsm != NULL && ostid_id(&lsm->lsm_oi) == 0))
+		RETURN(-EBADF);
+
+	if (*lmmp == NULL) {
+		OBD_ALLOC(*lmmp, lmm_size);
+		if (*lmmp == NULL)
+			RETURN(-ENOMEM);
+	}
+
+	if (lsm != NULL)
+		ostid_cpu_to_le(&lsm->lsm_oi, &(*lmmp)->lmm_oi);
+
+	RETURN(lmm_size);
+}
+
+/*
+ * Unpack OSC object metadata from disk storage (LE byte order).
+ *
+ * - lsmp == NULL: only report the size of a single-stripe lov_stripe_md
+ *   (after validating lmm, if given).
+ * - *lsmp set and lmm == NULL: free *lsmp and its lsm_oinfo[0], clear
+ *   *lsmp and return 0.
+ * - otherwise: allocate *lsmp if needed, copy the object id from lmm when
+ *   given, and set lsm_maxbytes from the import's connect data or the
+ *   default.
+ *
+ * Returns the stripe md size, 0 after a free, -EINVAL for a short or
+ * zero-id lmm, -EBADF when an existing *lsmp has a zero object id, or
+ * -ENOMEM on allocation failure (in which case *lsmp is left NULL).
+ */
+static int osc_unpackmd(struct obd_export *exp, struct lov_stripe_md **lsmp,
+			struct lov_mds_md *lmm, int lmm_bytes)
+{
+	int lsm_size;
+	struct obd_import *imp = class_exp2cliimp(exp);
+	ENTRY;
+
+	if (lmm != NULL) {
+		/* compare as signed: a negative lmm_bytes would otherwise be
+		 * converted to a huge unsigned value and pass the check */
+		if (lmm_bytes < (int)sizeof(*lmm)) {
+			CERROR("%s: lov_mds_md too small: %d, need %d\n",
+			       exp->exp_obd->obd_name, lmm_bytes,
+			       (int)sizeof(*lmm));
+			RETURN(-EINVAL);
+		}
+		/* XXX LOV_MAGIC etc check? */
+
+		if (unlikely(ostid_id(&lmm->lmm_oi) == 0)) {
+			CERROR("%s: zero lmm_object_id: rc = %d\n",
+			       exp->exp_obd->obd_name, -EINVAL);
+			RETURN(-EINVAL);
+		}
+	}
+
+	lsm_size = lov_stripe_md_size(1);
+	if (lsmp == NULL)
+		RETURN(lsm_size);
+
+	if (*lsmp != NULL && lmm == NULL) {
+		OBD_FREE((*lsmp)->lsm_oinfo[0], sizeof(struct lov_oinfo));
+		OBD_FREE(*lsmp, lsm_size);
+		*lsmp = NULL;
+		RETURN(0);
+	}
+
+	if (*lsmp == NULL) {
+		OBD_ALLOC(*lsmp, lsm_size);
+		if (unlikely(*lsmp == NULL))
+			RETURN(-ENOMEM);
+		OBD_ALLOC((*lsmp)->lsm_oinfo[0], sizeof(struct lov_oinfo));
+		if (unlikely((*lsmp)->lsm_oinfo[0] == NULL)) {
+			OBD_FREE(*lsmp, lsm_size);
+			/* never hand a freed pointer back to the caller */
+			*lsmp = NULL;
+			RETURN(-ENOMEM);
+		}
+		loi_init((*lsmp)->lsm_oinfo[0]);
+	} else if (unlikely(ostid_id(&(*lsmp)->lsm_oi) == 0)) {
+		RETURN(-EBADF);
+	}
+
+	if (lmm != NULL)
+		/* XXX zero *lsmp? */
+		ostid_le_to_cpu(&lmm->lmm_oi, &(*lsmp)->lsm_oi);
+
+	if (imp != NULL &&
+	    (imp->imp_connect_data.ocd_connect_flags & OBD_CONNECT_MAXBYTES))
+		(*lsmp)->lsm_maxbytes = imp->imp_connect_data.ocd_maxbytes;
+	else
+		(*lsmp)->lsm_maxbytes = LUSTRE_STRIPE_MAXBYTES;
+
+	RETURN(lsm_size);
+}
+
+/* Copy the capability \a capa (if any) into the RMF_CAPA1 field of \a req
+ * and mark it valid in \a body. Does nothing when \a capa is NULL. */
+static inline void osc_pack_capa(struct ptlrpc_request *req,
+				 struct ost_body *body, void *capa)
+{
+	struct lustre_capa *c;
+
+	if (capa == NULL)
+		return;
+
+	c = req_capsule_client_get(&req->rq_pill, &RMF_CAPA1);
+	LASSERT(c);
+	capa_cpy(c, (struct obd_capa *)capa);
+	body->oa.o_valid |= OBD_MD_FLOSSCAPA;
+	DEBUG_CAPA(D_SEC, c, "pack");
+}
+
+/* Fill the RMF_OST_BODY field of \a req from \a oinfo: the obdo is
+ * converted to wire form using the import's connect data, then the
+ * capability (if any) is packed via osc_pack_capa(). */
+static inline void osc_pack_req_body(struct ptlrpc_request *req,
+				     struct obd_info *oinfo)
+{
+	struct ost_body *body;
+
+	body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
+	LASSERT(body);
+
+	lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa,
+			     oinfo->oi_oa);
+	osc_pack_capa(req, body, oinfo->oi_capa);
+}
+
+/* Shrink the client-side size of \a field to 0 when no capability will be
+ * sent; with a capability the size is left untouched. */
+static inline void osc_set_capa_size(struct ptlrpc_request *req,
+				     const struct req_msg_field *field,
+				     struct obd_capa *oc)
+{
+	/* it is already calculated as sizeof struct obd_capa */
+	if (oc != NULL)
+		return;
+
+	req_capsule_set_size(&req->rq_pill, field, RCL_CLIENT, 0);
+}
+
+/**
+ * Reply callback for the asynchronous getattr RPC.
+ *
+ * On success the reply obdo is copied into the caller's obdo and the block
+ * size is forced to DT_MAX_BRW_SIZE. If the reply body cannot be unpacked,
+ * o_valid is cleared and rc becomes -EPROTO. In every case the result is
+ * passed to the oi_cb_up callback, whose return value is returned.
+ */
+static int osc_getattr_interpret(const struct lu_env *env,
+				 struct ptlrpc_request *req,
+				 struct osc_async_args *aa, int rc)
+{
+	struct ost_body *body;
+	ENTRY;
+
+	if (rc != 0)
+		GOTO(out, rc);
+
+	body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
+	if (body == NULL) {
+		CDEBUG(D_INFO, "can't unpack ost_body\n");
+		rc = -EPROTO;
+		aa->aa_oi->oi_oa->o_valid = 0;
+	} else {
+		CDEBUG(D_INODE, "mode: %o\n", body->oa.o_mode);
+		lustre_get_wire_obdo(&req->rq_import->imp_connect_data,
+				     aa->aa_oi->oi_oa, &body->oa);
+
+		/* This should really be sent by the OST */
+		aa->aa_oi->oi_oa->o_blksize = DT_MAX_BRW_SIZE;
+		aa->aa_oi->oi_oa->o_valid |= OBD_MD_FLBLKSZ;
+	}
+out:
+	rc = aa->aa_oi->oi_cb_up(aa->aa_oi, rc);
+	RETURN(rc);
+}
+
+/**
+ * Build an OST_GETATTR request for \a oinfo and add it to \a set; the reply
+ * is handled by osc_getattr_interpret().
+ *
+ * \retval 0		request added to the set
+ * \retval -ENOMEM	request allocation failed
+ * \retval other	error from ptlrpc_request_pack()
+ */
+static int osc_getattr_async(struct obd_export *exp, struct obd_info *oinfo,
+			     struct ptlrpc_request_set *set)
+{
+	struct ptlrpc_request *req;
+	struct osc_async_args *aa;
+	int		    rc;
+	ENTRY;
+
+	req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_GETATTR);
+	if (req == NULL)
+		RETURN(-ENOMEM);
+
+	/* the capa field size must be settled before the request is packed */
+	osc_set_capa_size(req, &RMF_CAPA1, oinfo->oi_capa);
+	rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_GETATTR);
+	if (rc) {
+		ptlrpc_request_free(req);
+		RETURN(rc);
+	}
+
+	osc_pack_req_body(req, oinfo);
+
+	ptlrpc_request_set_replen(req);
+	req->rq_interpret_reply = (ptlrpc_interpterer_t)osc_getattr_interpret;
+
+	/* the callback argument lives inside the request's async args */
+	CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
+	aa = ptlrpc_req_async_args(req);
+	aa->aa_oi = oinfo;
+
+	ptlrpc_set_add_req(set, req);
+	RETURN(0);
+}
+
+/**
+ * Synchronous getattr: send OST_GETATTR for \a oinfo, wait for the reply
+ * and copy the returned attributes into oinfo->oi_oa. The block size is
+ * set from cli_brw_size() and flagged valid.
+ *
+ * \retval 0		success
+ * \retval -ENOMEM	request allocation failed
+ * \retval -EPROTO	the reply body could not be unpacked
+ * \retval other	error from ptlrpc_request_pack()/ptlrpc_queue_wait()
+ */
+static int osc_getattr(const struct lu_env *env, struct obd_export *exp,
+		       struct obd_info *oinfo)
+{
+	struct ptlrpc_request *req;
+	struct ost_body       *body;
+	int		    rc;
+	ENTRY;
+
+	req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_GETATTR);
+	if (req == NULL)
+		RETURN(-ENOMEM);
+
+	osc_set_capa_size(req, &RMF_CAPA1, oinfo->oi_capa);
+	rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_GETATTR);
+	if (rc) {
+		ptlrpc_request_free(req);
+		RETURN(rc);
+	}
+
+	osc_pack_req_body(req, oinfo);
+
+	ptlrpc_request_set_replen(req);
+
+	rc = ptlrpc_queue_wait(req);
+	if (rc)
+		GOTO(out, rc);
+
+	body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
+	if (body == NULL)
+		GOTO(out, rc = -EPROTO);
+
+	CDEBUG(D_INODE, "mode: %o\n", body->oa.o_mode);
+	lustre_get_wire_obdo(&req->rq_import->imp_connect_data, oinfo->oi_oa,
+			     &body->oa);
+
+	oinfo->oi_oa->o_blksize = cli_brw_size(exp->exp_obd);
+	oinfo->oi_oa->o_valid |= OBD_MD_FLBLKSZ;
+
+	/* success path traces its exit here; the shared tail below uses a
+	 * plain return */
+	EXIT;
+ out:
+	ptlrpc_req_finished(req);
+	return rc;
+}
+
+/**
+ * Synchronous setattr: send OST_SETATTR for \a oinfo, wait for the reply
+ * and copy the returned attributes back into oinfo->oi_oa.
+ *
+ * The caller must have OBD_MD_FLGROUP set in oinfo->oi_oa->o_valid.
+ *
+ * \retval 0		success
+ * \retval -ENOMEM	request allocation failed
+ * \retval -EPROTO	the reply body could not be unpacked
+ * \retval other	error from ptlrpc_request_pack()/ptlrpc_queue_wait()
+ */
+static int osc_setattr(const struct lu_env *env, struct obd_export *exp,
+		       struct obd_info *oinfo, struct obd_trans_info *oti)
+{
+	struct ptlrpc_request *req;
+	struct ost_body       *body;
+	int		    rc;
+	ENTRY;
+
+	LASSERT(oinfo->oi_oa->o_valid & OBD_MD_FLGROUP);
+
+	req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_SETATTR);
+	if (req == NULL)
+		RETURN(-ENOMEM);
+
+	osc_set_capa_size(req, &RMF_CAPA1, oinfo->oi_capa);
+	rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SETATTR);
+	if (rc) {
+		ptlrpc_request_free(req);
+		RETURN(rc);
+	}
+
+	osc_pack_req_body(req, oinfo);
+
+	ptlrpc_request_set_replen(req);
+
+	rc = ptlrpc_queue_wait(req);
+	if (rc)
+		GOTO(out, rc);
+
+	body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
+	if (body == NULL)
+		GOTO(out, rc = -EPROTO);
+
+	lustre_get_wire_obdo(&req->rq_import->imp_connect_data, oinfo->oi_oa,
+			     &body->oa);
+
+	/* no EXIT here: the RETURN() below already traces the function exit,
+	 * so an extra EXIT would log it twice on the success path */
+out:
+	ptlrpc_req_finished(req);
+	RETURN(rc);
+}
+
+/**
+ * Reply callback shared by the asynchronous setattr and punch RPCs.
+ *
+ * On success the reply obdo is copied into sa->sa_oa; if the reply body
+ * cannot be unpacked rc becomes -EPROTO. The result is always passed to
+ * sa->sa_upcall together with sa->sa_cookie, and the upcall's return value
+ * is returned.
+ */
+static int osc_setattr_interpret(const struct lu_env *env,
+				 struct ptlrpc_request *req,
+				 struct osc_setattr_args *sa, int rc)
+{
+	struct ost_body *body;
+	ENTRY;
+
+	if (rc != 0)
+		GOTO(out, rc);
+
+	body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
+	if (body == NULL)
+		GOTO(out, rc = -EPROTO);
+
+	lustre_get_wire_obdo(&req->rq_import->imp_connect_data, sa->sa_oa,
+			     &body->oa);
+out:
+	rc = sa->sa_upcall(sa->sa_cookie, rc);
+	RETURN(rc);
+}
+
+/**
+ * Build and queue an asynchronous OST_SETATTR request.
+ *
+ * \param oti	 optional; when given and OBD_MD_FLCOOKIE is set in the
+ *		 obdo, the first log cookie is copied into o_lcookie
+ * \param upcall completion callback (only used when \a rqset is non-NULL)
+ * \param cookie argument passed to \a upcall
+ * \param rqset	 NULL: fire and forget through ptlrpcd, no reply handler;
+ *		 PTLRPCD_SET: handled by ptlrpcd with
+ *		 osc_setattr_interpret(); otherwise the request is added to
+ *		 this set
+ *
+ * \retval 0		request queued
+ * \retval -ENOMEM	request allocation failed
+ * \retval other	error from ptlrpc_request_pack()
+ */
+int osc_setattr_async_base(struct obd_export *exp, struct obd_info *oinfo,
+			   struct obd_trans_info *oti,
+			   obd_enqueue_update_f upcall, void *cookie,
+			   struct ptlrpc_request_set *rqset)
+{
+	struct ptlrpc_request   *req;
+	struct osc_setattr_args *sa;
+	int		      rc;
+	ENTRY;
+
+	req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_SETATTR);
+	if (req == NULL)
+		RETURN(-ENOMEM);
+
+	osc_set_capa_size(req, &RMF_CAPA1, oinfo->oi_capa);
+	rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SETATTR);
+	if (rc) {
+		ptlrpc_request_free(req);
+		RETURN(rc);
+	}
+
+	/* must happen before the obdo is copied into the request body */
+	if (oti && oinfo->oi_oa->o_valid & OBD_MD_FLCOOKIE)
+		oinfo->oi_oa->o_lcookie = *oti->oti_logcookies;
+
+	osc_pack_req_body(req, oinfo);
+
+	ptlrpc_request_set_replen(req);
+
+	/* do mds to ost setattr asynchronously */
+	if (!rqset) {
+		/* Do not wait for response. */
+		ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1);
+	} else {
+		req->rq_interpret_reply =
+			(ptlrpc_interpterer_t)osc_setattr_interpret;
+
+		/* the callback arguments live inside the request */
+		CLASSERT (sizeof(*sa) <= sizeof(req->rq_async_args));
+		sa = ptlrpc_req_async_args(req);
+		sa->sa_oa = oinfo->oi_oa;
+		sa->sa_upcall = upcall;
+		sa->sa_cookie = cookie;
+
+		if (rqset == PTLRPCD_SET)
+			ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1);
+		else
+			ptlrpc_set_add_req(rqset, req);
+	}
+
+	RETURN(0);
+}
+
+/* Asynchronous setattr entry point: delegates to osc_setattr_async_base()
+ * with oinfo's own oi_cb_up as the upcall and oinfo as its cookie. */
+static int osc_setattr_async(struct obd_export *exp, struct obd_info *oinfo,
+			     struct obd_trans_info *oti,
+			     struct ptlrpc_request_set *rqset)
+{
+	int rc;
+
+	rc = osc_setattr_async_base(exp, oinfo, oti, oinfo->oi_cb_up, oinfo,
+				    rqset);
+	return rc;
+}
+
+/**
+ * Send a synchronous OST_CREATE request for \a oa.
+ *
+ * \param oa	object attributes; updated from the reply on success
+ * \param ea	in/out stripe md; when *ea is NULL one is allocated here and
+ *		returned through *ea on success
+ * \param oti	optional; receives the reply transno and, when
+ *		OBD_MD_FLCOOKIE is valid, the log cookie
+ *
+ * \retval 0		success
+ * \retval -ENOMEM	request allocation failed
+ * \retval -EPROTO	the reply body could not be unpacked
+ * \retval other	error from obd_alloc_memmd(), ptlrpc_request_pack()
+ *			or ptlrpc_queue_wait()
+ */
+int osc_real_create(struct obd_export *exp, struct obdo *oa,
+		    struct lov_stripe_md **ea, struct obd_trans_info *oti)
+{
+	struct ptlrpc_request *req;
+	struct ost_body       *body;
+	struct lov_stripe_md  *lsm;
+	int		    rc;
+	ENTRY;
+
+	LASSERT(oa);
+	LASSERT(ea);
+
+	lsm = *ea;
+	if (!lsm) {
+		rc = obd_alloc_memmd(exp, &lsm);
+		if (rc < 0)
+			RETURN(rc);
+	}
+
+	req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_CREATE);
+	if (req == NULL)
+		GOTO(out, rc = -ENOMEM);
+
+	rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_CREATE);
+	if (rc) {
+		ptlrpc_request_free(req);
+		GOTO(out, rc);
+	}
+
+	body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
+	LASSERT(body);
+
+	lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa);
+
+	ptlrpc_request_set_replen(req);
+
+	if ((oa->o_valid & OBD_MD_FLFLAGS) &&
+	    oa->o_flags == OBD_FL_DELORPHAN) {
+		DEBUG_REQ(D_HA, req,
+			  "delorphan from OST integration");
+		/* Don't resend the delorphan req */
+		req->rq_no_resend = req->rq_no_delay = 1;
+	}
+
+	rc = ptlrpc_queue_wait(req);
+	if (rc)
+		GOTO(out_req, rc);
+
+	body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
+	if (body == NULL)
+		GOTO(out_req, rc = -EPROTO);
+
+	CDEBUG(D_INFO, "oa flags %x\n", oa->o_flags);
+	lustre_get_wire_obdo(&req->rq_import->imp_connect_data, oa, &body->oa);
+
+	oa->o_blksize = cli_brw_size(exp->exp_obd);
+	oa->o_valid |= OBD_MD_FLBLKSZ;
+
+	/* XXX LOV STACKING: the lsm that is passed to us from LOV does not
+	 * have valid lsm_oinfo data structs, so don't go touching that.
+	 * This needs to be fixed in a big way.
+	 */
+	lsm->lsm_oi = oa->o_oi;
+	/* publish the (possibly locally allocated) lsm to the caller */
+	*ea = lsm;
+
+	if (oti != NULL) {
+		oti->oti_transno = lustre_msg_get_transno(req->rq_repmsg);
+
+		if (oa->o_valid & OBD_MD_FLCOOKIE) {
+			if (!oti->oti_logcookies)
+				oti_alloc_cookies(oti, 1);
+			*oti->oti_logcookies = oa->o_lcookie;
+		}
+	}
+
+	CDEBUG(D_HA, "transno: "LPD64"\n",
+	       lustre_msg_get_transno(req->rq_repmsg));
+out_req:
+	ptlrpc_req_finished(req);
+out:
+	/* on error, free the lsm only if it was allocated here, i.e. the
+	 * caller did not supply one */
+	if (rc && !*ea)
+		obd_free_memmd(exp, &lsm);
+	RETURN(rc);
+}
+
+/**
+ * Build and queue an asynchronous OST_PUNCH request for \a oinfo. The reply
+ * is handled by osc_setattr_interpret(), which calls \a upcall with
+ * \a cookie.
+ *
+ * \param rqset	PTLRPCD_SET to hand the request to ptlrpcd, otherwise the
+ *		request set it is added to
+ *
+ * NOTE(review): unlike osc_setattr_async_base(), a NULL \a rqset is not
+ * special-cased and would be passed to ptlrpc_set_add_req(); presumably
+ * callers never pass NULL, confirm.
+ *
+ * \retval 0		request queued
+ * \retval -ENOMEM	request allocation failed
+ * \retval other	error from ptlrpc_request_pack()
+ */
+int osc_punch_base(struct obd_export *exp, struct obd_info *oinfo,
+		   obd_enqueue_update_f upcall, void *cookie,
+		   struct ptlrpc_request_set *rqset)
+{
+	struct ptlrpc_request   *req;
+	struct osc_setattr_args *sa;
+	struct ost_body	 *body;
+	int		      rc;
+	ENTRY;
+
+	req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_PUNCH);
+	if (req == NULL)
+		RETURN(-ENOMEM);
+
+	osc_set_capa_size(req, &RMF_CAPA1, oinfo->oi_capa);
+	rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_PUNCH);
+	if (rc) {
+		ptlrpc_request_free(req);
+		RETURN(rc);
+	}
+	req->rq_request_portal = OST_IO_PORTAL; /* bug 7198 */
+	ptlrpc_at_set_req_timeout(req);
+
+	body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
+	LASSERT(body);
+	lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa,
+			     oinfo->oi_oa);
+	osc_pack_capa(req, body, oinfo->oi_capa);
+
+	ptlrpc_request_set_replen(req);
+
+	req->rq_interpret_reply = (ptlrpc_interpterer_t)osc_setattr_interpret;
+	/* the callback arguments live inside the request */
+	CLASSERT (sizeof(*sa) <= sizeof(req->rq_async_args));
+	sa = ptlrpc_req_async_args(req);
+	sa->sa_oa     = oinfo->oi_oa;
+	sa->sa_upcall = upcall;
+	sa->sa_cookie = cookie;
+	if (rqset == PTLRPCD_SET)
+		ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1);
+	else
+		ptlrpc_set_add_req(rqset, req);
+
+	RETURN(0);
+}
+
+/* Punch entry point: the extent from oinfo->oi_policy is carried in the
+ * obdo's o_size (start) and o_blocks (end) fields, then the request is
+ * queued through osc_punch_base() with oinfo's own callback. */
+static int osc_punch(const struct lu_env *env, struct obd_export *exp,
+		     struct obd_info *oinfo, struct obd_trans_info *oti,
+		     struct ptlrpc_request_set *rqset)
+{
+	struct obdo *oa = oinfo->oi_oa;
+
+	oa->o_size = oinfo->oi_policy.l_extent.start;
+	oa->o_blocks = oinfo->oi_policy.l_extent.end;
+	oa->o_valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS;
+
+	return osc_punch_base(exp, oinfo, oinfo->oi_cb_up, oinfo, rqset);
+}
+
+/**
+ * Reply callback for the asynchronous sync RPC.
+ *
+ * On success the reply obdo is copied verbatim over the caller's obdo; if
+ * the reply body cannot be unpacked rc becomes -EPROTO. The result is
+ * always passed to fa->fa_upcall with fa->fa_cookie, and the upcall's
+ * return value is returned.
+ */
+static int osc_sync_interpret(const struct lu_env *env,
+			      struct ptlrpc_request *req,
+			      void *arg, int rc)
+{
+	struct osc_fsync_args *fa = arg;
+	struct ost_body *body;
+	ENTRY;
+
+	if (rc)
+		GOTO(out, rc);
+
+	body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
+	if (body == NULL) {
+		CERROR ("can't unpack ost_body\n");
+		GOTO(out, rc = -EPROTO);
+	}
+
+	*fa->fa_oi->oi_oa = body->oa;
+out:
+	rc = fa->fa_upcall(fa->fa_cookie, rc);
+	RETURN(rc);
+}
+
+/**
+ * Build and queue an asynchronous OST_SYNC request for \a oinfo. The reply
+ * is handled by osc_sync_interpret(), which calls \a upcall with \a cookie.
+ *
+ * \param rqset	PTLRPCD_SET to hand the request to ptlrpcd, otherwise the
+ *		request set it is added to
+ *
+ * \retval 0		request queued
+ * \retval -ENOMEM	request allocation failed
+ * \retval other	error from ptlrpc_request_pack()
+ */
+int osc_sync_base(struct obd_export *exp, struct obd_info *oinfo,
+		  obd_enqueue_update_f upcall, void *cookie,
+		  struct ptlrpc_request_set *rqset)
+{
+	struct ptlrpc_request *req;
+	struct ost_body       *body;
+	struct osc_fsync_args *fa;
+	int		    rc;
+	ENTRY;
+
+	req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_SYNC);
+	if (req == NULL)
+		RETURN(-ENOMEM);
+
+	osc_set_capa_size(req, &RMF_CAPA1, oinfo->oi_capa);
+	rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SYNC);
+	if (rc) {
+		ptlrpc_request_free(req);
+		RETURN(rc);
+	}
+
+	/* overload the size and blocks fields in the oa with start/end */
+	body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
+	LASSERT(body);
+	lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa,
+			     oinfo->oi_oa);
+	osc_pack_capa(req, body, oinfo->oi_capa);
+
+	ptlrpc_request_set_replen(req);
+	req->rq_interpret_reply = osc_sync_interpret;
+
+	/* the callback arguments live inside the request */
+	CLASSERT(sizeof(*fa) <= sizeof(req->rq_async_args));
+	fa = ptlrpc_req_async_args(req);
+	fa->fa_oi = oinfo;
+	fa->fa_upcall = upcall;
+	fa->fa_cookie = cookie;
+
+	if (rqset == PTLRPCD_SET)
+		ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1);
+	else
+		ptlrpc_set_add_req(rqset, req);
+
+	RETURN (0);
+}
+
+/* Sync entry point: the [start, end] range is carried in the obdo's o_size
+ * and o_blocks fields, then the request is queued through osc_sync_base()
+ * with oinfo's own callback. Returns -EINVAL when oinfo has no obdo. */
+static int osc_sync(const struct lu_env *env, struct obd_export *exp,
+		    struct obd_info *oinfo, obd_size start, obd_size end,
+		    struct ptlrpc_request_set *set)
+{
+	struct obdo *oa;
+	ENTRY;
+
+	oa = oinfo->oi_oa;
+	if (oa == NULL) {
+		CDEBUG(D_INFO, "oa NULL\n");
+		RETURN(-EINVAL);
+	}
+
+	oa->o_size = start;
+	oa->o_blocks = end;
+	oa->o_valid |= (OBD_MD_FLSIZE | OBD_MD_FLBLOCKS);
+
+	RETURN(osc_sync_base(exp, oinfo, oinfo->oi_cb_up, oinfo, set));
+}
+
+/* Find and locally cancel the locks matched by @mode in the resource named
+ * after the object id in @oa. Found locks are added to the @cancels list.
+ * Returns the number of locks added to @cancels, or 0 when nothing was
+ * cancelled (ELC disabled through procfs, or no such resource). */
+static int osc_resource_get_unused(struct obd_export *exp, struct obdo *oa,
+				   struct list_head *cancels,
+				   ldlm_mode_t mode, int lock_flags)
+{
+	struct ldlm_namespace *ns = exp->exp_obd->obd_namespace;
+	struct ldlm_res_id res_id;
+	struct ldlm_resource *res;
+	int count;
+	ENTRY;
+
+	/* Return, i.e. cancel nothing, only if ELC is supported (flag in
+	 * export) but disabled through procfs (flag in NS).
+	 *
+	 * This distinguishes from a case when ELC is not supported originally,
+	 * when we still want to cancel locks in advance and just cancel them
+	 * locally, without sending any RPC. */
+	if (exp_connect_cancelset(exp) && !ns_connect_cancelset(ns))
+		RETURN(0);
+
+	ostid_build_res_name(&oa->o_oi, &res_id);
+	res = ldlm_resource_get(ns, NULL, &res_id, 0, 0);
+	if (res == NULL)
+		RETURN(0);
+
+	LDLM_RESOURCE_ADDREF(res);
+	count = ldlm_cancel_resource_local(res, cancels, NULL, mode,
+					   lock_flags, 0, NULL);
+	LDLM_RESOURCE_DELREF(res);
+	/* drop the reference taken by ldlm_resource_get() above */
+	ldlm_resource_putref(res);
+	RETURN(count);
+}
+
+/* Reply callback for destroy RPCs: one destroy is no longer in flight, so
+ * decrement the counter and wake up waiters on cl_destroy_waitq. The RPC
+ * result \a rc is ignored; always returns 0. */
+static int osc_destroy_interpret(const struct lu_env *env,
+				 struct ptlrpc_request *req, void *data,
+				 int rc)
+{
+	struct client_obd *cli = &req->rq_import->imp_obd->u.cli;
+
+	atomic_dec(&cli->cl_destroy_in_flight);
+	wake_up(&cli->cl_destroy_waitq);
+	return 0;
+}
+
+/* Try to account for one more in-flight destroy RPC.
+ *
+ * Returns 1 with cl_destroy_in_flight incremented when the new count is
+ * within cl_max_rpcs_in_flight. Otherwise the increment is undone and 0 is
+ * returned; if the count dropped below the limit in between, waiters on
+ * cl_destroy_waitq are woken so that one of them can retry. */
+static int osc_can_send_destroy(struct client_obd *cli)
+{
+	if (atomic_inc_return(&cli->cl_destroy_in_flight) <=
+	    cli->cl_max_rpcs_in_flight) {
+		/* The destroy request can be sent */
+		return 1;
+	}
+	if (atomic_dec_return(&cli->cl_destroy_in_flight) <
+	    cli->cl_max_rpcs_in_flight) {
+		/*
+		 * The counter has been modified between the two atomic
+		 * operations.
+		 */
+		wake_up(&cli->cl_destroy_waitq);
+	}
+	return 0;
+}
+
+/**
+ * Object creation entry point.
+ *
+ * The request is forwarded to osc_real_create() when the obdo asks for
+ * object re-creation (OBD_FL_RECREATE_OBJS) or when its sequence is not an
+ * MDT sequence. Any other request is unexpected and triggers LBUG().
+ */
+int osc_create(const struct lu_env *env, struct obd_export *exp,
+	       struct obdo *oa, struct lov_stripe_md **ea,
+	       struct obd_trans_info *oti)
+{
+	int rc = 0;
+	ENTRY;
+
+	LASSERT(oa);
+	LASSERT(ea);
+	LASSERT(oa->o_valid & OBD_MD_FLGROUP);
+
+	/* the sequence test is only evaluated when this is not a recreate */
+	if (((oa->o_valid & OBD_MD_FLFLAGS) &&
+	     oa->o_flags == OBD_FL_RECREATE_OBJS) ||
+	    !fid_seq_is_mdt(ostid_seq(&oa->o_oi)))
+		RETURN(osc_real_create(exp, oa, ea, oti));
+
+	/* we should not get here anymore */
+	LBUG();
+
+	RETURN(rc);
+}
+
+/* Destroy requests can be async always on the client, and we don't even really
+ * care about the return code since the client cannot do anything at all about
+ * a destroy failure.
+ * When the MDS is unlinking a filename, it saves the file objects into a
+ * recovery llog, and these object records are cancelled when the OST reports
+ * they were destroyed and sync'd to disk (i.e. transaction committed).
+ * If the client dies, or the OST is down when the object should be destroyed,
+ * the records are not cancelled, and when the OST reconnects to the MDS next,
+ * it will retrieve the llog unlink logs and then sends the log cancellation
+ * cookies to the MDS after committing destroy transactions.
+ *
+ * Returns 0 once the request has been handed to ptlrpcd, -EINVAL when \a oa
+ * is NULL, -ENOMEM when the request cannot be allocated, or the error from
+ * ldlm_prep_elc_req(). */
+static int osc_destroy(const struct lu_env *env, struct obd_export *exp,
+		       struct obdo *oa, struct lov_stripe_md *ea,
+		       struct obd_trans_info *oti, struct obd_export *md_export,
+		       void *capa)
+{
+	struct client_obd     *cli = &exp->exp_obd->u.cli;
+	struct ptlrpc_request *req;
+	struct ost_body       *body;
+	LIST_HEAD(cancels);
+	int rc, count;
+	ENTRY;
+
+	if (!oa) {
+		CDEBUG(D_INFO, "oa NULL\n");
+		RETURN(-EINVAL);
+	}
+
+	/* collect PW locks on the object so they can be cancelled together
+	 * with the destroy request */
+	count = osc_resource_get_unused(exp, oa, &cancels, LCK_PW,
+					LDLM_FL_DISCARD_DATA);
+
+	req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_DESTROY);
+	if (req == NULL) {
+		ldlm_lock_list_put(&cancels, l_bl_ast, count);
+		RETURN(-ENOMEM);
+	}
+
+	osc_set_capa_size(req, &RMF_CAPA1, (struct obd_capa *)capa);
+	/* NOTE(review): the cancels list is not released here on failure;
+	 * presumably ldlm_prep_elc_req() does that itself, confirm. */
+	rc = ldlm_prep_elc_req(exp, req, LUSTRE_OST_VERSION, OST_DESTROY,
+			       0, &cancels, count);
+	if (rc) {
+		ptlrpc_request_free(req);
+		RETURN(rc);
+	}
+
+	req->rq_request_portal = OST_IO_PORTAL; /* bug 7198 */
+	ptlrpc_at_set_req_timeout(req);
+
+	if (oti != NULL && oa->o_valid & OBD_MD_FLCOOKIE)
+		oa->o_lcookie = *oti->oti_logcookies;
+	body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
+	LASSERT(body);
+	lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa);
+
+	osc_pack_capa(req, body, (struct obd_capa *)capa);
+	ptlrpc_request_set_replen(req);
+
+	/* If osc_destroy is for destroying the unlink orphan,
+	 * sent from MDT to OST, which should not be blocked here,
+	 * because the process might be triggered by ptlrpcd, and
+	 * it is not good to block ptlrpcd thread (b=16006)*/
+	if (!(oa->o_flags & OBD_FL_DELORPHAN)) {
+		req->rq_interpret_reply = osc_destroy_interpret;
+		if (!osc_can_send_destroy(cli)) {
+			struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP,
+							  NULL);
+
+			/*
+			 * Wait until the number of on-going destroy RPCs drops
+			 * under max_rpc_in_flight
+			 */
+			l_wait_event_exclusive(cli->cl_destroy_waitq,
+					       osc_can_send_destroy(cli), &lwi);
+		}
+	}
+
+	/* Do not wait for response */
+	ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1);
+	RETURN(0);
+}
+
+/**
+ * Fill the cache/grant accounting fields of \a oa (o_dirty, o_undirty,
+ * o_grant, o_dropped) from the client's state, under cl_loi_list_lock, and
+ * mark OBD_MD_FLBLOCKS | OBD_MD_FLGRANT valid.
+ *
+ * o_undirty is reported as 0 whenever one of the sanity checks on the
+ * dirty counters fails. cl_lost_grant is reset to 0 after being reported.
+ *
+ * \a writing_bytes is not used by this function.
+ */
+static void osc_announce_cached(struct client_obd *cli, struct obdo *oa,
+				long writing_bytes)
+{
+	obd_flag bits = OBD_MD_FLBLOCKS|OBD_MD_FLGRANT;
+
+	/* these fields must not have been filled in by anyone else */
+	LASSERT(!(oa->o_valid & bits));
+
+	oa->o_valid |= bits;
+	client_obd_list_lock(&cli->cl_loi_list_lock);
+	oa->o_dirty = cli->cl_dirty;
+	if (unlikely(cli->cl_dirty - cli->cl_dirty_transit >
+		     cli->cl_dirty_max)) {
+		CERROR("dirty %lu - %lu > dirty_max %lu\n",
+		       cli->cl_dirty, cli->cl_dirty_transit, cli->cl_dirty_max);
+		oa->o_undirty = 0;
+	} else if (unlikely(atomic_read(&obd_dirty_pages) -
+			    atomic_read(&obd_dirty_transit_pages) >
+			    (long)(obd_max_dirty_pages + 1))) {
+		/* The atomic_read() allowing the atomic_inc() are
+		 * not covered by a lock thus they may safely race and trip
+		 * this CERROR() unless we add in a small fudge factor (+1). */
+		CERROR("dirty %d - %d > system dirty_max %d\n",
+		       atomic_read(&obd_dirty_pages),
+		       atomic_read(&obd_dirty_transit_pages),
+		       obd_max_dirty_pages);
+		oa->o_undirty = 0;
+	} else if (unlikely(cli->cl_dirty_max - cli->cl_dirty > 0x7fffffff)) {
+		CERROR("dirty %lu - dirty_max %lu too big???\n",
+		       cli->cl_dirty, cli->cl_dirty_max);
+		oa->o_undirty = 0;
+	} else {
+		/* bytes needed to keep (max_rpcs_in_flight + 1) full-sized
+		 * RPCs going */
+		long max_in_flight = (cli->cl_max_pages_per_rpc <<
+				      PAGE_CACHE_SHIFT)*
+				     (cli->cl_max_rpcs_in_flight + 1);
+		oa->o_undirty = max(cli->cl_dirty_max, max_in_flight);
+	}
+	oa->o_grant = cli->cl_avail_grant + cli->cl_reserved_grant;
+	oa->o_dropped = cli->cl_lost_grant;
+	cli->cl_lost_grant = 0;
+	client_obd_list_unlock(&cli->cl_loi_list_lock);
+	CDEBUG(D_CACHE,"dirty: "LPU64" undirty: %u dropped %u grant: "LPU64"\n",
+	       oa->o_dirty, oa->o_undirty, oa->o_dropped, oa->o_grant);
+
+}
+
+/*
+ * Recompute cl_next_shrink_grant from cl_grant_shrink_interval using
+ * cfs_time_shift() and log the new value.
+ */
+void osc_update_next_shrink(struct client_obd *cli)
+{
+	cfs_time_t when = cfs_time_shift(cli->cl_grant_shrink_interval);
+
+	cli->cl_next_shrink_grant = when;
+	CDEBUG(D_CACHE, "next time %ld to shrink grant \n",
+	       cli->cl_next_shrink_grant);
+}
+
+/* Add @grant to cl_avail_grant while holding cl_loi_list_lock. */
+static void __osc_update_grant(struct client_obd *cli, obd_size grant)
+{
+	client_obd_list_lock(&cli->cl_loi_list_lock);
+	cli->cl_avail_grant = cli->cl_avail_grant + grant;
+	client_obd_list_unlock(&cli->cl_loi_list_lock);
+}
+
+/*
+ * Credit the grant carried in a reply @body to the client, but only when
+ * the reply marks o_grant as valid (OBD_MD_FLGRANT).
+ */
+static void osc_update_grant(struct client_obd *cli, struct ost_body *body)
+{
+	struct obdo *reply_oa = &body->oa;
+
+	if (!(reply_oa->o_valid & OBD_MD_FLGRANT))
+		return;
+
+	CDEBUG(D_CACHE, "got "LPU64" extra grant\n", reply_oa->o_grant);
+	__osc_update_grant(cli, reply_oa->o_grant);
+}
+
+static int osc_set_info_async(const struct lu_env *env, struct obd_export *exp,
+			      obd_count keylen, void *key, obd_count vallen,
+			      void *val, struct ptlrpc_request_set *set);
+
+/*
+ * Reply callback for a grant-shrink request.  @aa is a struct osc_grant_args
+ * whose aa_oa holds the grant that was offered back.
+ *
+ * On failure (@rc != 0) that grant is re-added to the client; on success any
+ * grant carried in the reply body is credited instead.  The obdo is freed in
+ * both cases and @rc is returned unchanged.
+ */
+static int osc_shrink_grant_interpret(const struct lu_env *env,
+				      struct ptlrpc_request *req,
+				      void *aa, int rc)
+{
+	struct osc_grant_args *args = aa;
+	struct obdo *oa = args->aa_oa;
+	struct client_obd *cli = &req->rq_import->imp_obd->u.cli;
+	struct ost_body *body;
+
+	if (rc != 0) {
+		/* the request failed, so take the offered grant back */
+		__osc_update_grant(cli, oa->o_grant);
+		GOTO(out, rc);
+	}
+
+	body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
+	LASSERT(body);
+	osc_update_grant(cli, body);
+
+out:
+	OBDO_FREE(oa);
+	return rc;
+}
+
+/*
+ * Move a quarter of the available grant out of cl_avail_grant into
+ * oa->o_grant, flag @oa with OBD_FL_SHRINK_GRANT and recompute the next
+ * shrink time.
+ */
+static void osc_shrink_grant_local(struct client_obd *cli, struct obdo *oa)
+{
+	client_obd_list_lock(&cli->cl_loi_list_lock);
+	oa->o_grant = cli->cl_avail_grant / 4;
+	cli->cl_avail_grant = cli->cl_avail_grant - oa->o_grant;
+	client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+	/* start from clean flags if o_flags was not marked valid yet */
+	if ((oa->o_valid & OBD_MD_FLFLAGS) == 0) {
+		oa->o_flags = 0;
+		oa->o_valid |= OBD_MD_FLFLAGS;
+	}
+	oa->o_flags |= OBD_FL_SHRINK_GRANT;
+
+	osc_update_next_shrink(cli);
+}
+
+/*
+ * Shrink the current grant in one of two steps: from some large amount down
+ * to enough for a full set of in-flight RPCs (plus one), or, if we are
+ * already at or below that level, down to enough for a single RPC.  This
+ * avoids holding more grant than needed without shrinking it piecemeal.
+ *
+ * Returns the result of osc_shrink_grant_to_target().
+ */
+static int osc_shrink_grant(struct client_obd *cli)
+{
+	__u64 target_bytes;
+
+	target_bytes = (cli->cl_max_rpcs_in_flight + 1) *
+		       (cli->cl_max_pages_per_rpc << PAGE_CACHE_SHIFT);
+
+	client_obd_list_lock(&cli->cl_loi_list_lock);
+	if (!(cli->cl_avail_grant > target_bytes)) {
+		/* already at the in-flight level: drop to a single RPC */
+		target_bytes = cli->cl_max_pages_per_rpc << PAGE_CACHE_SHIFT;
+	}
+	client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+	return osc_shrink_grant_to_target(cli, target_bytes);
+}
+
+/*
+ * Reduce cl_avail_grant to @target_bytes and hand the difference back through
+ * a KEY_GRANT_SHRINK osc_set_info_async() call.
+ *
+ * @target_bytes is raised to at least one RPC's worth of bytes
+ * (cl_max_pages_per_rpc << PAGE_CACHE_SHIFT).  Nothing is sent when the
+ * available grant does not exceed the target.
+ *
+ * Returns 0 on success or when there is nothing to shrink, -ENOMEM if the
+ * request body cannot be allocated, or the osc_set_info_async() error, in
+ * which case the offered grant is added back to cl_avail_grant.
+ */
+int osc_shrink_grant_to_target(struct client_obd *cli, __u64 target_bytes)
+{
+	int			rc = 0;
+	struct ost_body	*body;
+	ENTRY;
+
+	client_obd_list_lock(&cli->cl_loi_list_lock);
+	/* Don't shrink if we are already at or below the desired limit.
+	 * We don't want to shrink below a single RPC, as that will negatively
+	 * impact block allocation and long-term performance. */
+	if (target_bytes < cli->cl_max_pages_per_rpc << PAGE_CACHE_SHIFT)
+		target_bytes = cli->cl_max_pages_per_rpc << PAGE_CACHE_SHIFT;
+
+	if (target_bytes >= cli->cl_avail_grant) {
+		client_obd_list_unlock(&cli->cl_loi_list_lock);
+		RETURN(0);
+	}
+	client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+	OBD_ALLOC_PTR(body);
+	if (!body)
+		RETURN(-ENOMEM);
+
+	osc_announce_cached(cli, &body->oa, 0);
+
+	/* NOTE(review): cl_loi_list_lock was released after the check above;
+	 * confirm cl_avail_grant cannot drop below target_bytes before it is
+	 * retaken here, otherwise the subtraction below would wrap. */
+	client_obd_list_lock(&cli->cl_loi_list_lock);
+	body->oa.o_grant = cli->cl_avail_grant - target_bytes;
+	cli->cl_avail_grant = target_bytes;
+	client_obd_list_unlock(&cli->cl_loi_list_lock);
+	if (!(body->oa.o_valid & OBD_MD_FLFLAGS)) {
+		body->oa.o_valid |= OBD_MD_FLFLAGS;
+		body->oa.o_flags = 0;
+	}
+	body->oa.o_flags |= OBD_FL_SHRINK_GRANT;
+	osc_update_next_shrink(cli);
+
+	rc = osc_set_info_async(NULL, cli->cl_import->imp_obd->obd_self_export,
+				sizeof(KEY_GRANT_SHRINK), KEY_GRANT_SHRINK,
+				sizeof(*body), body, NULL);
+	/* the shrink could not be sent: give the grant back to ourselves */
+	if (rc != 0)
+		__osc_update_grant(cli, body->oa.o_grant);
+	OBD_FREE_PTR(body);
+	RETURN(rc);
+}
+
+/*
+ * Decide whether @client should offer grant back now.
+ *
+ * Returns 1 only when the connection has OBD_CONNECT_GRANT_SHRINK, the next
+ * shrink time (less 5 ticks) has been reached, the import is in
+ * LUSTRE_IMP_FULL state and more than one RPC's worth of grant is available.
+ * If the time has been reached but the other conditions fail, the next
+ * shrink time is recomputed.  Returns 0 otherwise.
+ */
+static int osc_should_shrink_grant(struct client_obd *client)
+{
+	cfs_time_t now = cfs_time_current();
+	cfs_time_t deadline = client->cl_next_shrink_grant;
+	int brw_size;
+
+	if (!(client->cl_import->imp_connect_data.ocd_connect_flags &
+	      OBD_CONNECT_GRANT_SHRINK))
+		return 0;
+
+	if (!cfs_time_aftereq(now, deadline - 5 * CFS_TICK))
+		return 0;
+
+	/* Get the current RPC size directly, instead of going via:
+	 * cli_brw_size(obd->u.cli.cl_import->imp_obd->obd_self_export)
+	 * Keep comment here so that it can be found by searching. */
+	brw_size = client->cl_max_pages_per_rpc << PAGE_CACHE_SHIFT;
+
+	if (client->cl_import->imp_state == LUSTRE_IMP_FULL &&
+	    client->cl_avail_grant > brw_size)
+		return 1;
+
+	osc_update_next_shrink(client);
+	return 0;
+}
+
+/*
+ * Timeout callback: walk every client on @item's ti_obd_list and shrink the
+ * grant of those for which osc_should_shrink_grant() says so.  @data is not
+ * used.  Always returns 0.
+ */
+static int osc_grant_shrink_grant_cb(struct timeout_item *item, void *data)
+{
+	struct client_obd *client;
+
+	list_for_each_entry(client, &item->ti_obd_list,
+				cl_grant_shrink_list) {
+		if (!osc_should_shrink_grant(client))
+			continue;
+		osc_shrink_grant(client);
+	}
+	return 0;
+}
+
+/*
+ * Register @client for TIMEOUT_GRANT callbacks at its grant shrink interval
+ * and, on success, compute its first shrink time.
+ *
+ * Returns 0 on success or the ptlrpc_add_timeout_client() error.
+ */
+static int osc_add_shrink_grant(struct client_obd *client)
+{
+	int rc = ptlrpc_add_timeout_client(client->cl_grant_shrink_interval,
+					   TIMEOUT_GRANT,
+					   osc_grant_shrink_grant_cb, NULL,
+					   &client->cl_grant_shrink_list);
+
+	if (rc == 0) {
+		CDEBUG(D_CACHE, "add grant client %s \n",
+		       client->cl_import->imp_obd->obd_name);
+		osc_update_next_shrink(client);
+		return 0;
+	}
+
+	CERROR("add grant client %s error %d\n",
+		client->cl_import->imp_obd->obd_name, rc);
+	return rc;
+}
+
+/*
+ * Remove @client from the TIMEOUT_GRANT callback list.  Returns whatever
+ * ptlrpc_del_timeout_client() returns.
+ */
+static int osc_del_shrink_grant(struct client_obd *client)
+{
+	int rc;
+
+	rc = ptlrpc_del_timeout_client(&client->cl_grant_shrink_list,
+				       TIMEOUT_GRANT);
+	return rc;
+}
+
+/*
+ * Initialise the client's grant state from the connect data @ocd: set
+ * cl_avail_grant and cl_chunkbits under cl_loi_list_lock, then register the
+ * client for periodic grant shrinking if the connection supports it and the
+ * client is not on a shrink list yet.
+ */
+static void osc_init_grant(struct client_obd *cli, struct obd_connect_data *ocd)
+{
+	/*
+	 * ocd_grant is the total grant amount we expect to hold: if we've
+	 * been evicted, it's the new avail_grant amount, cl_dirty will drop
+	 * to 0 as inflight RPCs fail out; otherwise, it's avail_grant + dirty.
+	 *
+	 * race is tolerable here: if we're evicted, but imp_state already
+	 * left EVICTED state, then cl_dirty must be 0 already.
+	 */
+	client_obd_list_lock(&cli->cl_loi_list_lock);
+	if (cli->cl_import->imp_state == LUSTRE_IMP_EVICTED)
+		cli->cl_avail_grant = ocd->ocd_grant;
+	else
+		cli->cl_avail_grant = ocd->ocd_grant - cli->cl_dirty;
+
+	if (cli->cl_avail_grant < 0) {
+		CWARN("%s: available grant < 0: avail/ocd/dirty %ld/%u/%ld\n",
+		      cli->cl_import->imp_obd->obd_name, cli->cl_avail_grant,
+		      ocd->ocd_grant, cli->cl_dirty);
+		/* workaround for servers which do not have the patch from
+		 * LU-2679 */
+		cli->cl_avail_grant = ocd->ocd_grant;
+	}
+
+	/* determine the appropriate chunk size used by osc_extent:
+	 * never smaller than a page */
+	cli->cl_chunkbits = max_t(int, PAGE_CACHE_SHIFT, ocd->ocd_blocksize);
+	client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+	CDEBUG(D_CACHE, "%s, setting cl_avail_grant: %ld cl_lost_grant: %ld."
+		"chunk bits: %d.\n", cli->cl_import->imp_obd->obd_name,
+		cli->cl_avail_grant, cli->cl_lost_grant, cli->cl_chunkbits);
+
+	/* list_empty() guards against registering the same client twice */
+	if (ocd->ocd_connect_flags & OBD_CONNECT_GRANT_SHRINK &&
+	    list_empty(&cli->cl_grant_shrink_list))
+		osc_add_shrink_grant(cli);
+}
+
+/*
+ * Zero-fill everything past the @nob_read bytes that were actually read.
+ *
+ * We assume a short read means this OSC read beyond the end of a stripe
+ * object: lustre is reading a sparse file via the LOV and _knows_ it is
+ * reading inside the file, but this stripe was never written at or beyond
+ * this offset.
+ */
+static void handle_short_read(int nob_read, obd_count page_count,
+			      struct brw_page **pga)
+{
+	struct brw_page **cur = pga;
+	char *kaddr;
+
+	/* step over the pages that were read completely */
+	for (; nob_read > 0; cur++, page_count--) {
+		struct brw_page *pg = *cur;
+
+		LASSERT (page_count > 0);
+
+		if (pg->count > nob_read) {
+			/* EOF inside this page: clear only its tail */
+			kaddr = kmap(pg->pg) + (pg->off & ~CFS_PAGE_MASK);
+			memset(kaddr + nob_read, 0, pg->count - nob_read);
+			kunmap(pg->pg);
+			cur++;
+			page_count--;
+			break;
+		}
+
+		nob_read -= pg->count;
+	}
+
+	/* every remaining page received no data at all */
+	for (; page_count > 0; page_count--, cur++) {
+		struct brw_page *pg = *cur;
+
+		kaddr = kmap(pg->pg) + (pg->off & ~CFS_PAGE_MASK);
+		memset(kaddr, 0, pg->count);
+		kunmap(pg->pg);
+	}
+}
+
+/*
+ * Validate the per-niobuf return codes of a BRW_WRITE reply.
+ *
+ * Returns the first negative remote rc found, -EPROTO if the rc vector is
+ * missing/short, contains a positive value, or the number of bytes
+ * transferred differs from @requested_nob, and 0 otherwise.
+ * @page_count and @pga are not used.
+ */
+static int check_write_rcs(struct ptlrpc_request *req,
+			   int requested_nob, int niocount,
+			   obd_count page_count, struct brw_page **pga)
+{
+	__u32   *remote_rcs;
+	int     idx;
+
+	remote_rcs = req_capsule_server_sized_get(&req->rq_pill, &RMF_RCS,
+						  sizeof(*remote_rcs) *
+						  niocount);
+	if (remote_rcs == NULL) {
+		CDEBUG(D_INFO, "Missing/short RC vector on BRW_WRITE reply\n");
+		return -EPROTO;
+	}
+
+	/* any niobuf in error fails the whole write */
+	for (idx = 0; idx < niocount; idx++) {
+		__u32 remote_rc = remote_rcs[idx];
+
+		if ((int)remote_rc < 0)
+			return remote_rc;
+
+		if (remote_rc == 0)
+			continue;
+
+		CDEBUG(D_INFO, "rc[%d] invalid (%d) req %p\n",
+			idx, remote_rc, req);
+		return -EPROTO;
+	}
+
+	if (req->rq_bulk->bd_nob_transferred == requested_nob)
+		return 0;
+
+	CERROR("Unexpected # bytes transferred: %d (requested %d)\n",
+	       req->rq_bulk->bd_nob_transferred, requested_nob);
+	return -EPROTO;
+}
+
+/*
+ * Return non-zero when @p2 may share a remote niobuf with @p1: both pages
+ * carry identical flags and @p2 starts exactly where @p1 ends.
+ *
+ * Pages with different flags are never merged; a warning is printed when
+ * the difference lies outside the flags known to be safe to combine.
+ */
+static inline int can_merge_pages(struct brw_page *p1, struct brw_page *p2)
+{
+	unsigned mask;
+
+	if (p1->flag == p2->flag)
+		return (p1->off + p1->count == p2->off);
+
+	mask = ~(OBD_BRW_FROM_GRANT| OBD_BRW_NOCACHE|
+		 OBD_BRW_SYNC|OBD_BRW_ASYNC|OBD_BRW_NOQUOTA);
+
+	/* warn if we try to combine flags that we don't know to be
+	 * safe to combine */
+	if (unlikely((p1->flag & mask) != (p2->flag & mask))) {
+		CWARN("Saw flags 0x%x and 0x%x in the same brw, please "
+		      "report this at http://bugs.whamcloud.com/\n",
+		      p1->flag, p2->flag);
+	}
+
+	return 0;
+}
+
+/*
+ * Compute the bulk checksum over the first @nob bytes of the @pg_count pages
+ * in @pga, using the hash algorithm that cksum_obd2cfs() maps @cksum_type to.
+ *
+ * @opc selects the fault-injection behaviour: for OST_READ the first page
+ * may be corrupted before hashing (OBD_FAIL_OSC_CHECKSUM_RECEIVE); for
+ * OST_WRITE the computed checksum may be bumped by one
+ * (OBD_FAIL_OSC_CHECKSUM_SEND).
+ *
+ * Returns the checksum.  If the hash cannot be initialised, PTR_ERR() of the
+ * descriptor is returned through the unsigned return type.  If finalising
+ * the hash fails the checksum value is based on 0.
+ */
+static obd_count osc_checksum_bulk(int nob, obd_count pg_count,
+				   struct brw_page **pga, int opc,
+				   cksum_type_t cksum_type)
+{
+	/* start from 0 so that a failing cfs_crypto_hash_final(), which
+	 * leaves cksum unwritten, cannot make us return an indeterminate
+	 * value */
+	__u32				cksum = 0;
+	int				i = 0;
+	struct cfs_crypto_hash_desc	*hdesc;
+	unsigned int			bufsize;
+	int				err;
+	unsigned char			cfs_alg = cksum_obd2cfs(cksum_type);
+
+	LASSERT(pg_count > 0);
+
+	hdesc = cfs_crypto_hash_init(cfs_alg, NULL, 0);
+	if (IS_ERR(hdesc)) {
+		CERROR("Unable to initialize checksum hash %s\n",
+		       cfs_crypto_hash_name(cfs_alg));
+		return PTR_ERR(hdesc);
+	}
+
+	while (nob > 0 && pg_count > 0) {
+		/* only hash the part of the last page that holds data */
+		int count = pga[i]->count > nob ? nob : pga[i]->count;
+
+		/* corrupt the data before we compute the checksum, to
+		 * simulate an OST->client data error */
+		if (i == 0 && opc == OST_READ &&
+		    OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_RECEIVE)) {
+			unsigned char *ptr = kmap(pga[i]->pg);
+			int off = pga[i]->off & ~CFS_PAGE_MASK;
+			memcpy(ptr + off, "bad1", min(4, nob));
+			kunmap(pga[i]->pg);
+		}
+		cfs_crypto_hash_update_page(hdesc, pga[i]->pg,
+				  pga[i]->off & ~CFS_PAGE_MASK,
+				  count);
+		LL_CDEBUG_PAGE(D_PAGE, pga[i]->pg, "off %d\n",
+			       (int)(pga[i]->off & ~CFS_PAGE_MASK));
+
+		nob -= pga[i]->count;
+		pg_count--;
+		i++;
+	}
+
+	bufsize = 4;
+	err = cfs_crypto_hash_final(hdesc, (unsigned char *)&cksum, &bufsize);
+
+	/* NOTE(review): the second call with NULL arguments appears to exist
+	 * only to release hdesc after a failed final - confirm against
+	 * cfs_crypto_hash_final() */
+	if (err)
+		cfs_crypto_hash_final(hdesc, NULL, NULL);
+
+	/* For sending we only compute the wrong checksum instead
+	 * of corrupting the data so it is still correct on a redo */
+	if (opc == OST_WRITE && OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_SEND))
+		cksum++;
+
+	return cksum;
+}
+
+/*
+ * Allocate and pack an OST_WRITE or OST_READ bulk request for the
+ * @page_count pages in @pga and return it through @reqp.
+ *
+ * Consecutive pages accepted by can_merge_pages() share one remote niobuf,
+ * while every page is added to the bulk descriptor individually.  When
+ * checksums are enabled and the RPC flavor does not already cover bulk data,
+ * a write carries the checksum of the data and a read sets the checksum
+ * flags in the request.  The request's async args are filled in for later
+ * use when the reply is processed.
+ *
+ * @cmd:     OBD_BRW_WRITE set selects a write, otherwise a read
+ * @oa:      attributes to send; also modified and stored in the async args
+ * @lsm:     not referenced in this function
+ * @ocapa:   capability to pack; an extra reference is taken only when both
+ *           @ocapa and @reserve are set
+ * @resend:  non-zero marks the request with OBD_FL_RECOV_RESEND
+ *
+ * Returns 0 on success, -ENOMEM or -EINVAL (including the fault-injection
+ * paths), or the ptlrpc_request_pack() error.
+ */
+static int osc_brw_prep_request(int cmd, struct client_obd *cli,struct obdo *oa,
+				struct lov_stripe_md *lsm, obd_count page_count,
+				struct brw_page **pga,
+				struct ptlrpc_request **reqp,
+				struct obd_capa *ocapa, int reserve,
+				int resend)
+{
+	struct ptlrpc_request   *req;
+	struct ptlrpc_bulk_desc *desc;
+	struct ost_body	 *body;
+	struct obd_ioobj	*ioobj;
+	struct niobuf_remote    *niobuf;
+	int niocount, i, requested_nob, opc, rc;
+	struct osc_brw_async_args *aa;
+	struct req_capsule      *pill;
+	struct brw_page *pg_prev;
+
+	ENTRY;
+	if (OBD_FAIL_CHECK(OBD_FAIL_OSC_BRW_PREP_REQ))
+		RETURN(-ENOMEM); /* Recoverable */
+	if (OBD_FAIL_CHECK(OBD_FAIL_OSC_BRW_PREP_REQ2))
+		RETURN(-EINVAL); /* Fatal */
+
+	/* writes allocate from the import's request pool, reads do not */
+	if ((cmd & OBD_BRW_WRITE) != 0) {
+		opc = OST_WRITE;
+		req = ptlrpc_request_alloc_pool(cli->cl_import,
+						cli->cl_import->imp_rq_pool,
+						&RQF_OST_BRW_WRITE);
+	} else {
+		opc = OST_READ;
+		req = ptlrpc_request_alloc(cli->cl_import, &RQF_OST_BRW_READ);
+	}
+	if (req == NULL)
+		RETURN(-ENOMEM);
+
+	/* count the remote niobufs: one per run of mergeable pages */
+	for (niocount = i = 1; i < page_count; i++) {
+		if (!can_merge_pages(pga[i - 1], pga[i]))
+			niocount++;
+	}
+
+	pill = &req->rq_pill;
+	req_capsule_set_size(pill, &RMF_OBD_IOOBJ, RCL_CLIENT,
+			     sizeof(*ioobj));
+	req_capsule_set_size(pill, &RMF_NIOBUF_REMOTE, RCL_CLIENT,
+			     niocount * sizeof(*niobuf));
+	osc_set_capa_size(req, &RMF_CAPA1, ocapa);
+
+	rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, opc);
+	if (rc) {
+		ptlrpc_request_free(req);
+		RETURN(rc);
+	}
+	req->rq_request_portal = OST_IO_PORTAL; /* bug 7198 */
+	ptlrpc_at_set_req_timeout(req);
+	/* ask ptlrpc not to resend on EINPROGRESS since BRWs have their own
+	 * retry logic */
+	req->rq_no_retry_einprogress = 1;
+
+	desc = ptlrpc_prep_bulk_imp(req, page_count,
+		cli->cl_import->imp_connect_data.ocd_brw_size >> LNET_MTU_BITS,
+		opc == OST_WRITE ? BULK_GET_SOURCE : BULK_PUT_SINK,
+		OST_BULK_PORTAL);
+
+	if (desc == NULL)
+		GOTO(out, rc = -ENOMEM);
+	/* NB request now owns desc and will free it when it gets freed */
+
+	body = req_capsule_client_get(pill, &RMF_OST_BODY);
+	ioobj = req_capsule_client_get(pill, &RMF_OBD_IOOBJ);
+	niobuf = req_capsule_client_get(pill, &RMF_NIOBUF_REMOTE);
+	LASSERT(body != NULL && ioobj != NULL && niobuf != NULL);
+
+	lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa);
+
+	obdo_to_ioobj(oa, ioobj);
+	ioobj->ioo_bufcnt = niocount;
+	/* The high bits of ioo_max_brw tells server _maximum_ number of bulks
+	 * that might be send for this request.  The actual number is decided
+	 * when the RPC is finally sent in ptlrpc_register_bulk(). It sends
+	 * "max - 1" for old client compatibility sending "0", and also so the
+	 * the actual maximum is a power-of-two number, not one less. LU-1431 */
+	ioobj_max_brw_set(ioobj, desc->bd_md_max_brw);
+	osc_pack_capa(req, body, ocapa);
+	LASSERT(page_count > 0);
+	pg_prev = pga[0];
+	/* add each page to the bulk and build the remote niobuf array;
+	 * niobuf is stepped back whenever a page merges into the previous
+	 * entry, so it ends up exactly niocount entries past the start */
+	for (requested_nob = i = 0; i < page_count; i++, niobuf++) {
+		struct brw_page *pg = pga[i];
+		int poff = pg->off & ~CFS_PAGE_MASK;
+
+		LASSERT(pg->count > 0);
+		/* make sure there is no gap in the middle of page array */
+		LASSERTF(page_count == 1 ||
+			 (ergo(i == 0, poff + pg->count == PAGE_CACHE_SIZE) &&
+			  ergo(i > 0 && i < page_count - 1,
+			       poff == 0 && pg->count == PAGE_CACHE_SIZE)   &&
+			  ergo(i == page_count - 1, poff == 0)),
+			 "i: %d/%d pg: %p off: "LPU64", count: %u\n",
+			 i, page_count, pg, pg->off, pg->count);
+		LASSERTF(i == 0 || pg->off > pg_prev->off,
+			 "i %d p_c %u pg %p [pri %lu ind %lu] off "LPU64
+			 " prev_pg %p [pri %lu ind %lu] off "LPU64"\n",
+			 i, page_count,
+			 pg->pg, page_private(pg->pg), pg->pg->index, pg->off,
+			 pg_prev->pg, page_private(pg_prev->pg),
+			 pg_prev->pg->index, pg_prev->off);
+		LASSERT((pga[0]->flag & OBD_BRW_SRVLOCK) ==
+			(pg->flag & OBD_BRW_SRVLOCK));
+
+		ptlrpc_prep_bulk_page_pin(desc, pg->pg, poff, pg->count);
+		requested_nob += pg->count;
+
+		if (i > 0 && can_merge_pages(pg_prev, pg)) {
+			niobuf--;
+			niobuf->len += pg->count;
+		} else {
+			niobuf->offset = pg->off;
+			niobuf->len    = pg->count;
+			niobuf->flags  = pg->flag;
+		}
+		pg_prev = pg;
+	}
+
+	LASSERTF((void *)(niobuf - niocount) ==
+		req_capsule_client_get(&req->rq_pill, &RMF_NIOBUF_REMOTE),
+		"want %p - real %p\n", req_capsule_client_get(&req->rq_pill,
+		&RMF_NIOBUF_REMOTE), (void *)(niobuf - niocount));
+
+	osc_announce_cached(cli, &body->oa, opc == OST_WRITE ? requested_nob:0);
+	if (resend) {
+		if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0) {
+			body->oa.o_valid |= OBD_MD_FLFLAGS;
+			body->oa.o_flags = 0;
+		}
+		body->oa.o_flags |= OBD_FL_RECOV_RESEND;
+	}
+
+	/* piggy-back a grant shrink on this request when one is due */
+	if (osc_should_shrink_grant(cli))
+		osc_shrink_grant_local(cli, &body->oa);
+
+	/* size[REQ_REC_OFF] still sizeof (*body) */
+	if (opc == OST_WRITE) {
+		if (cli->cl_checksum &&
+		    !sptlrpc_flavor_has_bulk(&req->rq_flvr)) {
+			/* store cl_cksum_type in a local variable since
+			 * it can be changed via lprocfs */
+			cksum_type_t cksum_type = cli->cl_cksum_type;
+
+			if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0) {
+				oa->o_flags &= OBD_FL_LOCAL_MASK;
+				body->oa.o_flags = 0;
+			}
+			body->oa.o_flags |= cksum_type_pack(cksum_type);
+			body->oa.o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS;
+			body->oa.o_cksum = osc_checksum_bulk(requested_nob,
+							     page_count, pga,
+							     OST_WRITE,
+							     cksum_type);
+			CDEBUG(D_PAGE, "checksum at write origin: %x\n",
+			       body->oa.o_cksum);
+			/* save this in 'oa', too, for later checking */
+			oa->o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS;
+			oa->o_flags |= cksum_type_pack(cksum_type);
+		} else {
+			/* clear out the checksum flag, in case this is a
+			 * resend but cl_checksum is no longer set. b=11238 */
+			oa->o_valid &= ~OBD_MD_FLCKSUM;
+		}
+		oa->o_cksum = body->oa.o_cksum;
+		/* 1 RC per niobuf */
+		req_capsule_set_size(pill, &RMF_RCS, RCL_SERVER,
+				     sizeof(__u32) * niocount);
+	} else {
+		if (cli->cl_checksum &&
+		    !sptlrpc_flavor_has_bulk(&req->rq_flvr)) {
+			if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0)
+				body->oa.o_flags = 0;
+			body->oa.o_flags |= cksum_type_pack(cli->cl_cksum_type);
+			body->oa.o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS;
+		}
+	}
+	ptlrpc_request_set_replen(req);
+
+	/* the async args live inside the request itself */
+	CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
+	aa = ptlrpc_req_async_args(req);
+	aa->aa_oa = oa;
+	aa->aa_requested_nob = requested_nob;
+	aa->aa_nio_count = niocount;
+	aa->aa_page_count = page_count;
+	aa->aa_resends = 0;
+	aa->aa_ppga = pga;
+	aa->aa_cli = cli;
+	INIT_LIST_HEAD(&aa->aa_oaps);
+	if (ocapa && reserve)
+		aa->aa_ocapa = capa_get(ocapa);
+
+	*reqp = req;
+	RETURN(0);
+
+ out:
+	ptlrpc_req_finished(req);
+	RETURN(rc);
+}
+
+/*
+ * Compare the checksum the client sent with a write (@client_cksum) against
+ * the one the server reported (@server_cksum).
+ *
+ * On a mismatch the data is checksummed again with the checksum type found
+ * in the reply @oa, and the result is used to describe the most likely
+ * cause on the console.
+ *
+ * Returns 0 when the checksums agree, 1 otherwise.
+ */
+static int check_write_checksum(struct obdo *oa, const lnet_process_id_t *peer,
+				__u32 client_cksum, __u32 server_cksum, int nob,
+				obd_count page_count, struct brw_page **pga,
+				cksum_type_t client_cksum_type)
+{
+	cksum_type_t reply_type;
+	struct brw_page *last_pg;
+	__u32 recomputed;
+	int has_fid;
+	char *msg;
+
+	if (server_cksum == client_cksum) {
+		CDEBUG(D_PAGE, "checksum %x confirmed\n", client_cksum);
+		return 0;
+	}
+
+	reply_type = cksum_type_unpack(oa->o_valid & OBD_MD_FLFLAGS ?
+				       oa->o_flags : 0);
+	recomputed = osc_checksum_bulk(nob, page_count, pga, OST_WRITE,
+				       reply_type);
+
+	if (reply_type != client_cksum_type)
+		msg = "the server did not use the checksum type specified in "
+		      "the original request - likely a protocol problem";
+	else if (recomputed == server_cksum)
+		msg = "changed on the client after we checksummed it - "
+		      "likely false positive due to mmap IO (bug 11742)";
+	else if (recomputed == client_cksum)
+		msg = "changed in transit before arrival at OST";
+	else
+		msg = "changed in transit AND doesn't match the original - "
+		      "likely false positive due to mmap IO (bug 11742)";
+
+	/* the parent FID fields are only meaningful with OBD_MD_FLFID */
+	has_fid = (oa->o_valid & OBD_MD_FLFID) != 0;
+	last_pg = pga[page_count-1];
+
+	LCONSOLE_ERROR_MSG(0x132, "BAD WRITE CHECKSUM: %s: from %s inode "DFID
+			   " object "DOSTID" extent ["LPU64"-"LPU64"]\n",
+			   msg, libcfs_nid2str(peer->nid),
+			   has_fid ? oa->o_parent_seq : (__u64)0,
+			   has_fid ? oa->o_parent_oid : 0,
+			   has_fid ? oa->o_parent_ver : 0,
+			   POSTID(&oa->o_oi), pga[0]->off,
+			   last_pg->off + last_pg->count - 1);
+	CERROR("original client csum %x (type %x), server csum %x (type %x), "
+	       "client csum now %x\n", client_cksum, client_cksum_type,
+	       server_cksum, reply_type, recomputed);
+	return 1;
+}
+
+/*
+ * Check the reply of a finished BRW request and update client state from it.
+ *
+ * Note rc enters this function as number of bytes transferred (or a
+ * negative error).
+ *
+ * Quota flags and grant carried in the reply are applied first.  For writes
+ * the write checksum and the per-niobuf return codes are verified; for
+ * reads the transfer size is validated, a short read is zero-filled and the
+ * read checksum, if the server sent one, is verified.  On success the reply
+ * obdo is copied back into aa->aa_oa.
+ *
+ * Returns 0 or a positive value on success, -EAGAIN when the bulk should be
+ * retried (unwrap failure or checksum mismatch), -EPROTO on a malformed
+ * reply, or the incoming negative @rc.
+ */
+static int osc_brw_fini_request(struct ptlrpc_request *req, int rc)
+{
+	struct osc_brw_async_args *aa = (void *)&req->rq_async_args;
+	const lnet_process_id_t *peer =
+			&req->rq_import->imp_connection->c_peer;
+	struct client_obd *cli = aa->aa_cli;
+	struct ost_body *body;
+	__u32 client_cksum = 0;
+	ENTRY;
+
+	/* -EDQUOT replies still carry a body worth looking at */
+	if (rc < 0 && rc != -EDQUOT) {
+		DEBUG_REQ(D_INFO, req, "Failed request with rc = %d\n", rc);
+		RETURN(rc);
+	}
+
+	LASSERTF(req->rq_repmsg != NULL, "rc = %d\n", rc);
+	body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
+	if (body == NULL) {
+		DEBUG_REQ(D_INFO, req, "Can't unpack body\n");
+		RETURN(-EPROTO);
+	}
+
+	/* set/clear over quota flag for a uid/gid */
+	if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE &&
+	    body->oa.o_valid & (OBD_MD_FLUSRQUOTA | OBD_MD_FLGRPQUOTA)) {
+		unsigned int qid[MAXQUOTAS] = { body->oa.o_uid, body->oa.o_gid };
+
+		CDEBUG(D_QUOTA, "setdq for [%u %u] with valid "LPX64", flags %x\n",
+		       body->oa.o_uid, body->oa.o_gid, body->oa.o_valid,
+		       body->oa.o_flags);
+		osc_quota_setdq(cli, qid, body->oa.o_valid, body->oa.o_flags);
+	}
+
+	osc_update_grant(cli, body);
+
+	if (rc < 0)
+		RETURN(rc);
+
+	if (aa->aa_oa->o_valid & OBD_MD_FLCKSUM)
+		client_cksum = aa->aa_oa->o_cksum; /* save for later */
+
+	if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE) {
+		if (rc > 0) {
+			CERROR("Unexpected +ve rc %d\n", rc);
+			RETURN(-EPROTO);
+		}
+		LASSERT(req->rq_bulk->bd_nob == aa->aa_requested_nob);
+
+		if (sptlrpc_cli_unwrap_bulk_write(req, req->rq_bulk))
+			RETURN(-EAGAIN);
+
+		if ((aa->aa_oa->o_valid & OBD_MD_FLCKSUM) && client_cksum &&
+		    check_write_checksum(&body->oa, peer, client_cksum,
+					 body->oa.o_cksum, aa->aa_requested_nob,
+					 aa->aa_page_count, aa->aa_ppga,
+					 cksum_type_unpack(aa->aa_oa->o_flags)))
+			RETURN(-EAGAIN);
+
+		rc = check_write_rcs(req, aa->aa_requested_nob,aa->aa_nio_count,
+				     aa->aa_page_count, aa->aa_ppga);
+		GOTO(out, rc);
+	}
+
+	/* The rest of this function executes only for OST_READs */
+
+	/* if unwrap_bulk failed, return -EAGAIN to retry */
+	rc = sptlrpc_cli_unwrap_bulk_read(req, req->rq_bulk, rc);
+	if (rc < 0)
+		GOTO(out, rc = -EAGAIN);
+
+	if (rc > aa->aa_requested_nob) {
+		CERROR("Unexpected rc %d (%d requested)\n", rc,
+		       aa->aa_requested_nob);
+		RETURN(-EPROTO);
+	}
+
+	if (rc != req->rq_bulk->bd_nob_transferred) {
+		CERROR ("Unexpected rc %d (%d transferred)\n",
+			rc, req->rq_bulk->bd_nob_transferred);
+		/* leave through RETURN like every other exit, so that the
+		 * ENTRY above is always paired */
+		RETURN(-EPROTO);
+	}
+
+	if (rc < aa->aa_requested_nob)
+		handle_short_read(rc, aa->aa_page_count, aa->aa_ppga);
+
+	if (body->oa.o_valid & OBD_MD_FLCKSUM) {
+		static int cksum_counter;
+		__u32      server_cksum = body->oa.o_cksum;
+		char      *via;
+		char      *router;
+		cksum_type_t cksum_type;
+
+		cksum_type = cksum_type_unpack(body->oa.o_valid &OBD_MD_FLFLAGS?
+					       body->oa.o_flags : 0);
+		client_cksum = osc_checksum_bulk(rc, aa->aa_page_count,
+						 aa->aa_ppga, OST_READ,
+						 cksum_type);
+
+		/* name the bulk sender too when it is not the peer itself */
+		if (peer->nid == req->rq_bulk->bd_sender) {
+			via = router = "";
+		} else {
+			via = " via ";
+			router = libcfs_nid2str(req->rq_bulk->bd_sender);
+		}
+
+		if (server_cksum == ~0 && rc > 0) {
+			CERROR("Protocol error: server %s set the 'checksum' "
+			       "bit, but didn't send a checksum.  Not fatal, "
+			       "but please notify on http://bugs.whamcloud.com/\n",
+			       libcfs_nid2str(peer->nid));
+		} else if (server_cksum != client_cksum) {
+			LCONSOLE_ERROR_MSG(0x133, "%s: BAD READ CHECKSUM: from "
+					   "%s%s%s inode "DFID" object "DOSTID
+					   " extent ["LPU64"-"LPU64"]\n",
+					   req->rq_import->imp_obd->obd_name,
+					   libcfs_nid2str(peer->nid),
+					   via, router,
+					   body->oa.o_valid & OBD_MD_FLFID ?
+						body->oa.o_parent_seq : (__u64)0,
+					   body->oa.o_valid & OBD_MD_FLFID ?
+						body->oa.o_parent_oid : 0,
+					   body->oa.o_valid & OBD_MD_FLFID ?
+						body->oa.o_parent_ver : 0,
+					   POSTID(&body->oa.o_oi),
+					   aa->aa_ppga[0]->off,
+					   aa->aa_ppga[aa->aa_page_count-1]->off +
+					   aa->aa_ppga[aa->aa_page_count-1]->count -
+									1);
+			CERROR("client %x, server %x, cksum_type %x\n",
+			       client_cksum, server_cksum, cksum_type);
+			cksum_counter = 0;
+			aa->aa_oa->o_cksum = client_cksum;
+			rc = -EAGAIN;
+		} else {
+			cksum_counter++;
+			CDEBUG(D_PAGE, "checksum %x confirmed\n", client_cksum);
+			rc = 0;
+		}
+	} else if (unlikely(client_cksum)) {
+		static int cksum_missed;
+
+		cksum_missed++;
+		/* complain only when the miss count is a power of two */
+		if ((cksum_missed & (-cksum_missed)) == cksum_missed)
+			CERROR("Checksum %u requested from %s but not sent\n",
+			       cksum_missed, libcfs_nid2str(peer->nid));
+	} else {
+		rc = 0;
+	}
+out:
+	if (rc >= 0)
+		lustre_get_wire_obdo(&req->rq_import->imp_connect_data,
+				     aa->aa_oa, &body->oa);
+
+	RETURN(rc);
+}
+
+/*
+ * Send one bulk read/write for @page_count pages of @pga and wait for it,
+ * rebuilding and resending the request on recoverable errors.
+ *
+ * A request that timed out with rq_resend set is rebuilt immediately.  Any
+ * other recoverable error is retried after a delay that grows with the
+ * number of resends, unless the resend limit is exceeded (never for
+ * -EINPROGRESS) or the import generation has changed since the first
+ * attempt.
+ *
+ * Returns 0 on success or a negative error; -EAGAIN and -EINPROGRESS are
+ * reported to the caller as -EIO.
+ */
+static int osc_brw_internal(int cmd, struct obd_export *exp, struct obdo *oa,
+			    struct lov_stripe_md *lsm,
+			    obd_count page_count, struct brw_page **pga,
+			    struct obd_capa *ocapa)
+{
+	struct ptlrpc_request *req;
+	int		    rc;
+	wait_queue_head_t	    waitq;
+	int		    generation, resends = 0;
+	struct l_wait_info     lwi;
+
+	ENTRY;
+
+	init_waitqueue_head(&waitq);
+	/* remember the generation so a resend across an eviction is caught */
+	generation = exp->exp_obd->u.cli.cl_import->imp_generation;
+
+restart_bulk:
+	rc = osc_brw_prep_request(cmd, &exp->exp_obd->u.cli, oa, lsm,
+				  page_count, pga, &req, ocapa, 0, resends);
+	/* leave through RETURN, like the exit at the bottom, so that the
+	 * ENTRY above is always paired */
+	if (rc != 0)
+		RETURN(rc);
+
+	if (resends) {
+		req->rq_generation_set = 1;
+		req->rq_import_generation = generation;
+		req->rq_sent = cfs_time_current_sec() + resends;
+	}
+
+	rc = ptlrpc_queue_wait(req);
+
+	if (rc == -ETIMEDOUT && req->rq_resend) {
+		DEBUG_REQ(D_HA, req,  "BULK TIMEOUT");
+		ptlrpc_req_finished(req);
+		goto restart_bulk;
+	}
+
+	rc = osc_brw_fini_request(req, rc);
+
+	ptlrpc_req_finished(req);
+	/* When server return -EINPROGRESS, client should always retry
+	 * regardless of the number of times the bulk was resent already.*/
+	if (osc_recoverable_error(rc)) {
+		resends++;
+		if (rc != -EINPROGRESS &&
+		    !client_should_resend(resends, &exp->exp_obd->u.cli)) {
+			CERROR("%s: too many resend retries for object: "
+			       ""DOSTID", rc = %d.\n", exp->exp_obd->obd_name,
+			       POSTID(&oa->o_oi), rc);
+			goto out;
+		}
+		if (generation !=
+		    exp->exp_obd->u.cli.cl_import->imp_generation) {
+			CDEBUG(D_HA, "%s: resend cross eviction for object: "
+			       ""DOSTID", rc = %d.\n", exp->exp_obd->obd_name,
+			       POSTID(&oa->o_oi), rc);
+			goto out;
+		}
+
+		/* nothing ever wakes waitq and the condition is constant 0,
+		 * so this only sleeps until the timeout or an interrupt */
+		lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(resends), NULL, NULL,
+				       NULL);
+		l_wait_event(waitq, 0, &lwi);
+
+		goto restart_bulk;
+	}
+out:
+	if (rc == -EAGAIN || rc == -EINPROGRESS)
+		rc = -EIO;
+	RETURN (rc);
+}
+
+/*
+ * Build a replacement for the BRW @request that failed with recoverable
+ * error @rc and queue it with ptlrpcd_add_req().
+ *
+ * The new request is prepared from the oa, page array and capability held in
+ * @aa.  It then takes over @aa's oap and extent lists, the capability
+ * reference, the interpret callback and the import generation of the old
+ * request.
+ *
+ * Returns 0 once the new request is queued, the osc_brw_prep_request() error
+ * if it cannot be built, or -EINTR if a page attached to the old request has
+ * been interrupted.
+ */
+static int osc_brw_redo_request(struct ptlrpc_request *request,
+				struct osc_brw_async_args *aa, int rc)
+{
+	struct ptlrpc_request *new_req;
+	struct osc_brw_async_args *new_aa;
+	struct osc_async_page *oap;
+	ENTRY;
+
+	DEBUG_REQ(rc == -EINPROGRESS ? D_RPCTRACE : D_ERROR, request,
+		  "redo for recoverable error %d", rc);
+
+	rc = osc_brw_prep_request(lustre_msg_get_opc(request->rq_reqmsg) ==
+					OST_WRITE ? OBD_BRW_WRITE :OBD_BRW_READ,
+				  aa->aa_cli, aa->aa_oa,
+				  NULL /* lsm unused by osc currently */,
+				  aa->aa_page_count, aa->aa_ppga,
+				  &new_req, aa->aa_ocapa, 0, 1);
+	if (rc)
+		RETURN(rc);
+
+	/* give up on the redo if any page of the old request was interrupted */
+	list_for_each_entry(oap, &aa->aa_oaps, oap_rpc_item) {
+		if (oap->oap_request != NULL) {
+			LASSERTF(request == oap->oap_request,
+				 "request %p != oap_request %p\n",
+				 request, oap->oap_request);
+			if (oap->oap_interrupted) {
+				ptlrpc_req_finished(new_req);
+				RETURN(-EINTR);
+			}
+		}
+	}
+	/* New request takes over pga and oaps from old request.
+	 * Note that copying a list_head doesn't work, need to move it... */
+	aa->aa_resends++;
+	new_req->rq_interpret_reply = request->rq_interpret_reply;
+	new_req->rq_async_args = request->rq_async_args;
+	/* cap resend delay to the current request timeout, this is similar to
+	 * what ptlrpc does (see after_reply()) */
+	if (aa->aa_resends > new_req->rq_timeout)
+		new_req->rq_sent = cfs_time_current_sec() + new_req->rq_timeout;
+	else
+		new_req->rq_sent = cfs_time_current_sec() + aa->aa_resends;
+	new_req->rq_generation_set = 1;
+	new_req->rq_import_generation = request->rq_import_generation;
+
+	new_aa = ptlrpc_req_async_args(new_req);
+
+	/* the list heads copied with rq_async_args above are re-initialised
+	 * here before the old lists are spliced onto them */
+	INIT_LIST_HEAD(&new_aa->aa_oaps);
+	list_splice_init(&aa->aa_oaps, &new_aa->aa_oaps);
+	INIT_LIST_HEAD(&new_aa->aa_exts);
+	list_splice_init(&aa->aa_exts, &new_aa->aa_exts);
+	new_aa->aa_resends = aa->aa_resends;
+
+	/* repoint each page from the old request to the new one, swapping
+	 * the request reference it holds */
+	list_for_each_entry(oap, &new_aa->aa_oaps, oap_rpc_item) {
+		if (oap->oap_request) {
+			ptlrpc_req_finished(oap->oap_request);
+			oap->oap_request = ptlrpc_request_addref(new_req);
+		}
+	}
+
+	/* hand the capability over to the new request's args */
+	new_aa->aa_ocapa = aa->aa_ocapa;
+	aa->aa_ocapa = NULL;
+
+	/* XXX: This code will run into problem if we're going to support
+	 * to add a series of BRW RPCs into a self-defined ptlrpc_request_set
+	 * and wait for all of them to be finished. We should inherit request
+	 * set from old request. */
+	ptlrpcd_add_req(new_req, PDL_POLICY_SAME, -1);
+
+	DEBUG_REQ(D_INFO, new_req, "new request");
+	RETURN(0);
+}
+
+/*
+ * Sort @array of @num pages by ascending file offset, so that disk allocation
+ * on the target happens in offset order.
+ *
+ * This is a plain shellsort: an insertion sort over elements that are a
+ * stride apart, with the stride shrinking (3x+1 sequence) until it reaches
+ * 1 and the array is sorted.  It is good enough for our small page arrays
+ * and needs no allocation.
+ */
+static void sort_brw_pages(struct brw_page **array, int num)
+{
+	struct brw_page *held;
+	int gap, pos, hole;
+
+	if (num == 1)
+		return;
+
+	/* find the first stride of the sequence that is not below num */
+	gap = 1;
+	while (gap < num)
+		gap = (gap * 3) + 1;
+
+	do {
+		gap /= 3;
+		for (pos = gap; pos < num; pos++) {
+			held = array[pos];
+			/* shift larger elements up by one stride */
+			for (hole = pos;
+			     hole >= gap &&
+			     array[hole - gap]->off > held->off;
+			     hole -= gap)
+				array[hole] = array[hole - gap];
+			array[hole] = held;
+		}
+	} while (gap > 1);
+}
+
+/*
+ * Return how many of the first @pages entries of @pg can be sent without a
+ * gap: counting stops after a page that does not end on a page boundary, or
+ * before a page that does not start on one.  At least 1 is returned.
+ */
+static obd_count max_unfragmented_pages(struct brw_page **pg, obd_count pages)
+{
+	int run = 1;
+	int poff;
+
+	LASSERT (pages > 0);
+	poff = pg[0]->off & ~CFS_PAGE_MASK;
+
+	/* pg[run - 1] is the last page accepted, pg[run] the candidate */
+	while (--pages != 0) {
+		if (poff + pg[run - 1]->count < PAGE_CACHE_SIZE)
+			break;		/* doesn't end on page boundary */
+
+		poff = pg[run]->off & ~CFS_PAGE_MASK;
+		if (poff != 0)
+			break;		/* doesn't start on page boundary */
+
+		run++;
+	}
+
+	return run;
+}
+
+/*
+ * Allocate an array of @count pointers, one to each element of @pga.
+ * Returns NULL if the allocation fails.  Release the array with
+ * osc_release_ppga().
+ */
+static struct brw_page **osc_build_ppga(struct brw_page *pga, obd_count count)
+{
+	struct brw_page **ppga;
+	int i;
+
+	OBD_ALLOC(ppga, sizeof(*ppga) * count);
+	if (ppga != NULL) {
+		for (i = 0; i < count; i++)
+			ppga[i] = &pga[i];
+	}
+
+	return ppga;
+}
+
+/*
+ * Free a pointer array of @count entries obtained from osc_build_ppga().
+ * @ppga must not be NULL.
+ */
+static void osc_release_ppga(struct brw_page **ppga, obd_count count)
+{
+	LASSERT(ppga != NULL);
+
+	OBD_FREE(ppga, sizeof(*ppga) * count);
+}
+
+/*
+ * Bulk read/write of the @page_count pages described by @pga.
+ *
+ * With OBD_BRW_CHECK set in @cmd nothing is transferred: the call only
+ * reports -EIO if the import is invalid and 0 otherwise.  Otherwise the pages
+ * are sorted by offset and passed to osc_brw_internal() in chunks of at most
+ * cl_max_pages_per_rpc pages, each chunk further limited by
+ * max_unfragmented_pages().  The first failing chunk stops the loop.
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure, or the error of the
+ * failing chunk.  @oti is not referenced here.
+ */
+static int osc_brw(int cmd, struct obd_export *exp, struct obd_info *oinfo,
+		   obd_count page_count, struct brw_page *pga,
+		   struct obd_trans_info *oti)
+{
+	struct obdo *saved_oa = NULL;
+	struct brw_page **ppga, **orig;
+	struct obd_import *imp = class_exp2cliimp(exp);
+	struct client_obd *cli;
+	int rc, page_count_orig;
+	ENTRY;
+
+	LASSERT((imp != NULL) && (imp->imp_obd != NULL));
+	cli = &imp->imp_obd->u.cli;
+
+	if (cmd & OBD_BRW_CHECK) {
+		/* The caller just wants to know if there's a chance that this
+		 * I/O can succeed */
+
+		if (imp->imp_invalid)
+			RETURN(-EIO);
+		RETURN(0);
+	}
+
+	/* test_brw with a failed create can trip this, maybe others. */
+	LASSERT(cli->cl_max_pages_per_rpc);
+
+	rc = 0;
+
+	/* ppga advances through the array below; orig and page_count_orig
+	 * are kept so the whole array can be freed at the end */
+	orig = ppga = osc_build_ppga(pga, page_count);
+	if (ppga == NULL)
+		RETURN(-ENOMEM);
+	page_count_orig = page_count;
+
+	sort_brw_pages(ppga, page_count);
+	while (page_count) {
+		obd_count pages_per_brw;
+
+		if (page_count > cli->cl_max_pages_per_rpc)
+			pages_per_brw = cli->cl_max_pages_per_rpc;
+		else
+			pages_per_brw = page_count;
+
+		pages_per_brw = max_unfragmented_pages(ppga, pages_per_brw);
+
+		if (saved_oa != NULL) {
+			/* restore previously saved oa */
+			*oinfo->oi_oa = *saved_oa;
+		} else if (page_count > pages_per_brw) {
+			/* save a copy of oa (brw will clobber it); only
+			 * needed when more than one chunk will be sent */
+			OBDO_ALLOC(saved_oa);
+			if (saved_oa == NULL)
+				GOTO(out, rc = -ENOMEM);
+			*saved_oa = *oinfo->oi_oa;
+		}
+
+		rc = osc_brw_internal(cmd, exp, oinfo->oi_oa, oinfo->oi_md,
+				      pages_per_brw, ppga, oinfo->oi_capa);
+
+		if (rc != 0)
+			break;
+
+		page_count -= pages_per_brw;
+		ppga += pages_per_brw;
+	}
+
+out:
+	osc_release_ppga(orig, page_count_orig);
+
+	if (saved_oa != NULL)
+		OBDO_FREE(saved_oa);
+
+	RETURN(rc);
+}
+
+static int brw_interpret(const struct lu_env *env,
+			 struct ptlrpc_request *req, void *data, int rc)
+{	/*
+	 * Reply-interpret callback for BRW requests (osc_build_rpc() stores
+	 * it in req->rq_interpret_reply).  @data is the request's
+	 * osc_brw_async_args and @rc the incoming status.
+	 *
+	 * The request is finished with osc_brw_fini_request(); a recoverable
+	 * error may be redone via osc_brw_redo_request(), in which case 0 is
+	 * returned early if the redo returned 0.  Otherwise every extent on
+	 * aa_exts is finished with the final status, attributes flagged in
+	 * aa_oa->o_valid are copied to the cl_object on success, per-request
+	 * resources are released, the in-flight counter is decremented and
+	 * more I/O is unplugged.  Returns the final status.
+	 */
+	struct osc_brw_async_args *aa = data;
+	struct osc_extent *ext;
+	struct osc_extent *tmp;
+	struct cl_object  *obj = NULL;	/* set only when rc == 0, see below */
+	struct client_obd *cli = aa->aa_cli;
+	ENTRY;
+
+	rc = osc_brw_fini_request(req, rc);
+	CDEBUG(D_INODE, "request %p aa %p rc %d\n", req, aa, rc);
+	/* When server return -EINPROGRESS, client should always retry
+	 * regardless of the number of times the bulk was resent already. */
+	if (osc_recoverable_error(rc)) {
+		if (req->rq_import_generation !=
+		    req->rq_import->imp_generation) {
+			CDEBUG(D_HA, "%s: resend cross eviction for object: "
+			       ""DOSTID", rc = %d.\n",
+			       req->rq_import->imp_obd->obd_name,
+			       POSTID(&aa->aa_oa->o_oi), rc);
+		} else if (rc == -EINPROGRESS ||
+		    client_should_resend(aa->aa_resends, aa->aa_cli)) {
+			rc = osc_brw_redo_request(req, aa, rc);
+		} else {
+			CERROR("%s: too many resent retries for object: "
+			       ""LPU64":"LPU64", rc = %d.\n",
+			       req->rq_import->imp_obd->obd_name,
+			       POSTID(&aa->aa_oa->o_oi), rc);
+		}
+
+		if (rc == 0)	/* only the redo branch can have cleared rc */
+			RETURN(0);
+		else if (rc == -EAGAIN || rc == -EINPROGRESS)
+			rc = -EIO;	/* report remaining retry codes as -EIO */
+	}
+
+	if (aa->aa_ocapa) {
+		capa_put(aa->aa_ocapa);
+		aa->aa_ocapa = NULL;
+	}
+
+	list_for_each_entry_safe(ext, tmp, &aa->aa_exts, oe_link) {
+		if (obj == NULL && rc == 0) {	/* first extent, success only */
+			obj = osc2cl(ext->oe_obj);
+			cl_object_get(obj);	/* dropped by cl_object_put() below */
+		}
+
+		list_del_init(&ext->oe_link);
+		osc_extent_finish(env, ext, 1, rc);
+	}
+	LASSERT(list_empty(&aa->aa_exts));
+	LASSERT(list_empty(&aa->aa_oaps));
+
+	if (obj != NULL) {
+		struct obdo *oa = aa->aa_oa;
+		struct cl_attr *attr  = &osc_env_info(env)->oti_attr;
+		unsigned long valid = 0;	/* CAT_* bits for the fields copied below */
+
+		LASSERT(rc == 0);
+		if (oa->o_valid & OBD_MD_FLBLOCKS) {
+			attr->cat_blocks = oa->o_blocks;
+			valid |= CAT_BLOCKS;
+		}
+		if (oa->o_valid & OBD_MD_FLMTIME) {
+			attr->cat_mtime = oa->o_mtime;
+			valid |= CAT_MTIME;
+		}
+		if (oa->o_valid & OBD_MD_FLATIME) {
+			attr->cat_atime = oa->o_atime;
+			valid |= CAT_ATIME;
+		}
+		if (oa->o_valid & OBD_MD_FLCTIME) {
+			attr->cat_ctime = oa->o_ctime;
+			valid |= CAT_CTIME;
+		}
+		if (valid != 0) {
+			cl_object_attr_lock(obj);
+			cl_object_attr_set(env, obj, attr, valid);
+			cl_object_attr_unlock(obj);
+		}
+		cl_object_put(env, obj);
+	}
+	OBDO_FREE(aa->aa_oa);
+
+	cl_req_completion(env, aa->aa_clerq, rc < 0 ? rc :
+			  req->rq_bulk->bd_nob_transferred);
+	osc_release_ppga(aa->aa_ppga, aa->aa_page_count);
+	ptlrpc_lprocfs_brw(req, req->rq_bulk->bd_nob_transferred);
+
+	client_obd_list_lock(&cli->cl_loi_list_lock);
+	/* We need to decrement before osc_ap_completion->osc_wake_cache_waiters
+	 * is called so we know whether to go to sync BRWs or wait for more
+	 * RPCs to complete */
+	if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE)
+		cli->cl_w_in_flight--;
+	else
+		cli->cl_r_in_flight--;
+	osc_wake_cache_waiters(cli);
+	client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+	osc_io_unplug(env, cli, NULL, PDL_POLICY_SAME);
+	RETURN(rc);
+}
+
+/**
+ * Build an RPC by the list of extent @ext_list. The caller must ensure
+ * that the total pages in this list are NOT over max pages per RPC.
+ * Extents in the list must be in OES_RPC state.
+ *
+ * @cmd selects the request type: CRT_WRITE when OBD_BRW_WRITE is set,
+ * CRT_READ otherwise.  @pol is passed through to ptlrpcd_add_req().
+ *
+ * On success the extents are moved from @ext_list to the request's async
+ * args, brw_interpret() is installed as the reply callback, the request is
+ * handed to ptlrpcd and 0 is returned.  On failure every extent still on
+ * @ext_list is finished with the error via osc_extent_finish() and the
+ * error code is returned.
+ */
+int osc_build_rpc(const struct lu_env *env, struct client_obd *cli,
+		  struct list_head *ext_list, int cmd, pdl_policy_t pol)
+{
+	struct ptlrpc_request		*req = NULL;
+	struct osc_extent		*ext;
+	struct brw_page			**pga = NULL;
+	struct osc_brw_async_args	*aa = NULL;
+	struct obdo			*oa = NULL;
+	struct osc_async_page		*oap;
+	struct osc_async_page		*tmp;
+	struct cl_req			*clerq = NULL;
+	enum cl_req_type		crt = (cmd & OBD_BRW_WRITE) ? CRT_WRITE :
+								      CRT_READ;
+	struct ldlm_lock		*lock = NULL;
+	struct cl_req_attr		*crattr = NULL;
+	obd_off				starting_offset = OBD_OBJECT_EOF;
+	obd_off				ending_offset = 0;
+	int				mpflag = 0;
+	int				mem_tight = 0;
+	int				page_count = 0;
+	int				i;
+	int				rc;
+	LIST_HEAD(rpc_list);	/* every page of every extent, in list order */
+
+	ENTRY;
+	LASSERT(!list_empty(ext_list));
+
+	/* add pages into rpc_list to build BRW rpc */
+	list_for_each_entry(ext, ext_list, oe_link) {
+		LASSERT(ext->oe_state == OES_RPC);
+		mem_tight |= ext->oe_memalloc;
+		list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) {
+			++page_count;
+			list_add_tail(&oap->oap_rpc_item, &rpc_list);
+			if (starting_offset > oap->oap_obj_off)
+				starting_offset = oap->oap_obj_off;
+			else	/* not the lowest page so far: must start at offset 0 */
+				LASSERT(oap->oap_page_off == 0);
+			if (ending_offset < oap->oap_obj_off + oap->oap_count)
+				ending_offset = oap->oap_obj_off +
+						oap->oap_count;
+			else	/* not the highest page so far: must fill its page */
+				LASSERT(oap->oap_page_off + oap->oap_count ==
+					PAGE_CACHE_SIZE);
+		}
+	}
+
+	if (mem_tight)
+		mpflag = cfs_memory_pressure_get_and_set();	/* restored at out: */
+
+	OBD_ALLOC(crattr, sizeof(*crattr));
+	if (crattr == NULL)
+		GOTO(out, rc = -ENOMEM);
+
+	OBD_ALLOC(pga, sizeof(*pga) * page_count);
+	if (pga == NULL)
+		GOTO(out, rc = -ENOMEM);
+
+	OBDO_ALLOC(oa);
+	if (oa == NULL)
+		GOTO(out, rc = -ENOMEM);
+
+	i = 0;
+	list_for_each_entry(oap, &rpc_list, oap_rpc_item) {
+		struct cl_page *page = oap2cl_page(oap);
+		if (clerq == NULL) {	/* first page: create the cl_req */
+			clerq = cl_req_alloc(env, page, crt,
+					     1 /* only 1-object rpcs for now */);
+			if (IS_ERR(clerq))
+				GOTO(out, rc = PTR_ERR(clerq));
+			lock = oap->oap_ldlm_lock;
+		}
+		if (mem_tight)
+			oap->oap_brw_flags |= OBD_BRW_MEMALLOC;
+		pga[i] = &oap->oap_brw_page;
+		pga[i]->off = oap->oap_obj_off + oap->oap_page_off;
+		CDEBUG(0, "put page %p index %lu oap %p flg %x to pga\n",
+		       pga[i]->pg, page_index(oap->oap_page), oap,
+		       pga[i]->flag);
+		i++;
+		cl_req_page_add(env, clerq, page);
+	}
+
+	/* always get the data for the obdo for the rpc */
+	LASSERT(clerq != NULL);
+	crattr->cra_oa = oa;
+	cl_req_attr_set(env, clerq, crattr, ~0ULL);
+	if (lock) {
+		oa->o_handle = lock->l_remote_handle;
+		oa->o_valid |= OBD_MD_FLHANDLE;
+	}
+
+	rc = cl_req_prep(env, clerq);
+	if (rc != 0) {
+		CERROR("cl_req_prep failed: %d\n", rc);
+		GOTO(out, rc);
+	}
+
+	sort_brw_pages(pga, page_count);
+	rc = osc_brw_prep_request(cmd, cli, oa, NULL, page_count,
+			pga, &req, crattr->cra_capa, 1, 0);
+	if (rc != 0) {
+		CERROR("prep_req failed: %d\n", rc);
+		GOTO(out, rc);
+	}
+
+	req->rq_interpret_reply = brw_interpret;
+
+	if (mem_tight != 0)
+		req->rq_memalloc = 1;
+
+	/* Need to update the timestamps after the request is built in case
+	 * we race with setattr (locally or in queue at OST).  If OST gets
+	 * later setattr before earlier BRW (as determined by the request xid),
+	 * the OST will not use BRW timestamps.  Sadly, there is no obvious
+	 * way to do this in a single call.  bug 10150 */
+	cl_req_attr_set(env, clerq, crattr,
+			OBD_MD_FLMTIME|OBD_MD_FLCTIME|OBD_MD_FLATIME);
+
+	lustre_msg_set_jobid(req->rq_reqmsg, crattr->cra_jobid);
+
+	CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
+	aa = ptlrpc_req_async_args(req);
+	INIT_LIST_HEAD(&aa->aa_oaps);
+	list_splice_init(&rpc_list, &aa->aa_oaps);	/* pages now belong to the request */
+	INIT_LIST_HEAD(&aa->aa_exts);
+	list_splice_init(ext_list, &aa->aa_exts);	/* and so do the extents */
+	aa->aa_clerq = clerq;
+
+	/* queued sync pages can be torn down while the pages
+	 * were between the pending list and the rpc */
+	tmp = NULL;
+	list_for_each_entry(oap, &aa->aa_oaps, oap_rpc_item) {
+		/* only one oap gets a request reference */
+		if (tmp == NULL)
+			tmp = oap;
+		if (oap->oap_interrupted && !req->rq_intr) {
+			CDEBUG(D_INODE, "oap %p in req %p interrupted\n",
+					oap, req);
+			ptlrpc_mark_interrupted(req);
+		}
+	}
+	if (tmp != NULL)
+		tmp->oap_request = ptlrpc_request_addref(req);
+
+	client_obd_list_lock(&cli->cl_loi_list_lock);
+	starting_offset >>= PAGE_CACHE_SHIFT;	/* bytes -> page index for the histograms */
+	if (cmd == OBD_BRW_READ) {
+		cli->cl_r_in_flight++;
+		lprocfs_oh_tally_log2(&cli->cl_read_page_hist, page_count);
+		lprocfs_oh_tally(&cli->cl_read_rpc_hist, cli->cl_r_in_flight);
+		lprocfs_oh_tally_log2(&cli->cl_read_offset_hist,
+				      starting_offset + 1);
+	} else {
+		cli->cl_w_in_flight++;
+		lprocfs_oh_tally_log2(&cli->cl_write_page_hist, page_count);
+		lprocfs_oh_tally(&cli->cl_write_rpc_hist, cli->cl_w_in_flight);
+		lprocfs_oh_tally_log2(&cli->cl_write_offset_hist,
+				      starting_offset + 1);
+	}
+	client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+	DEBUG_REQ(D_INODE, req, "%d pages, aa %p. now %dr/%dw in flight",
+		  page_count, aa, cli->cl_r_in_flight,
+		  cli->cl_w_in_flight);
+
+	/* XXX: Maybe the caller can check the RPC bulk descriptor to
+	 * see which CPU/NUMA node the majority of pages were allocated
+	 * on, and try to assign the async RPC to the CPU core
+	 * (PDL_POLICY_PREFERRED) to reduce cross-CPU memory traffic.
+	 *
+	 * But on the other hand, we expect that multiple ptlrpcd
+	 * threads and the initial write sponsor can run in parallel,
+	 * especially when data checksum is enabled, which is CPU-bound
+	 * operation and single ptlrpcd thread cannot process in time.
+	 * So more ptlrpcd threads sharing BRW load
+	 * (with PDL_POLICY_ROUND) seems better.
+	 */
+	ptlrpcd_add_req(req, pol, -1);
+	rc = 0;
+	EXIT;
+
+out:	/* reached on success (rc == 0) and on every failure path */
+	if (mem_tight != 0)
+		cfs_memory_pressure_restore(mpflag);
+
+	if (crattr != NULL) {
+		capa_put(crattr->cra_capa);
+		OBD_FREE(crattr, sizeof(*crattr));
+	}
+
+	if (rc != 0) {
+		LASSERT(req == NULL);
+
+		if (oa)
+			OBDO_FREE(oa);
+		if (pga)
+			OBD_FREE(pga, sizeof(*pga) * page_count);
+		/* this should happen rarely and is pretty bad, it makes the
+		 * pending list not follow the dirty order */
+		while (!list_empty(ext_list)) {
+			ext = list_entry(ext_list->next, struct osc_extent,
+					     oe_link);
+			list_del_init(&ext->oe_link);
+			osc_extent_finish(env, ext, 0, rc);
+		}
+		if (clerq && !IS_ERR(clerq))	/* clerq may hold an ERR_PTR from cl_req_alloc() */
+			cl_req_completion(env, clerq, rc);
+	}
+	RETURN(rc);
+}
+
+/*
+ * Install einfo->ei_cbdata as the AST data of @lock if the lock has none.
+ * Returns 1 when the lock's AST data equals einfo->ei_cbdata afterwards,
+ * 0 when the lock carries different data.  The lock's callbacks and
+ * resource type are asserted to match @einfo.
+ */
+static int osc_set_lock_data_with_check(struct ldlm_lock *lock,
+					struct ldlm_enqueue_info *einfo)
+{
+	void *cbdata = einfo->ei_cbdata;
+	int matched;
+
+	LASSERT(lock != NULL);
+	LASSERT(lock->l_blocking_ast == einfo->ei_cb_bl);
+	LASSERT(lock->l_resource->lr_type == einfo->ei_type);
+	LASSERT(lock->l_completion_ast == einfo->ei_cb_cp);
+	LASSERT(lock->l_glimpse_ast == einfo->ei_cb_gl);
+
+	lock_res_and_lock(lock);
+	spin_lock(&osc_ast_guard);
+
+	/* claim the slot only when nobody owns it yet */
+	if (lock->l_ast_data == NULL)
+		lock->l_ast_data = cbdata;
+	matched = (lock->l_ast_data == cbdata) ? 1 : 0;
+
+	spin_unlock(&osc_ast_guard);
+	unlock_res_and_lock(lock);
+
+	return matched;
+}
+
+/*
+ * Handle-based wrapper around osc_set_lock_data_with_check(): resolves
+ * @lockh to a lock, runs the check and drops the lock reference again.
+ * Returns 0 (after logging an error) when the handle cannot be resolved.
+ */
+static int osc_set_data_with_check(struct lustre_handle *lockh,
+				   struct ldlm_enqueue_info *einfo)
+{
+	struct ldlm_lock *lock = ldlm_handle2lock(lockh);
+	int set;
+
+	if (lock == NULL) {
+		CERROR("lockh %p, data %p - client evicted?\n",
+		       lockh, einfo->ei_cbdata);
+		return 0;
+	}
+
+	set = osc_set_lock_data_with_check(lock, einfo);
+	LDLM_LOCK_PUT(lock);
+	return set;
+}
+
+/*
+ * Run the iterator @replace (with argument @data) over the locks of the
+ * resource named after lsm->lsm_oi.  The iteration result is ignored;
+ * always returns 0.
+ */
+static int osc_change_cbdata(struct obd_export *exp, struct lov_stripe_md *lsm,
+			     ldlm_iterator_t replace, void *data)
+{
+	struct obd_device *obd = class_exp2obd(exp);
+	struct ldlm_res_id name;
+
+	ostid_build_res_name(&lsm->lsm_oi, &name);
+	(void)ldlm_resource_iterate(obd->obd_namespace, &name, replace,
+				    data);
+
+	return 0;
+}
+
+/* Look for an ldlm lock of the inode in this osc by running the iterator
+ * @replace over the resource named after lsm->lsm_oi.
+ * return 0    nothing found (iterator ran to completion)
+ *	1    found one (iterator asked to stop)
+ *      other  value passed through from ldlm_resource_iterate() */
+static int osc_find_cbdata(struct obd_export *exp, struct lov_stripe_md *lsm,
+			   ldlm_iterator_t replace, void *data)
+{
+	struct obd_device *obd = class_exp2obd(exp);
+	struct ldlm_res_id name;
+	int iter_rc;
+
+	ostid_build_res_name(&lsm->lsm_oi, &name);
+	iter_rc = ldlm_resource_iterate(obd->obd_namespace, &name, replace,
+					data);
+
+	if (iter_rc == LDLM_ITER_STOP)
+		return 1;
+
+	return (iter_rc == LDLM_ITER_CONTINUE) ? 0 : iter_rc;
+}
+
+/*
+ * Common tail of a lock enqueue: for an aborted intent enqueue take the
+ * status from the server reply (lock_policy_res1, when non-zero), mark the
+ * LVB ready in *@flags when appropriate, then invoke @upcall with @cookie
+ * and the resulting status.  Returns whatever the upcall returns.
+ */
+static int osc_enqueue_fini(struct ptlrpc_request *req, struct ost_lvb *lvb,
+			    obd_enqueue_update_f upcall, void *cookie,
+			    __u64 *flags, int agl, int rc)
+{
+	int intent = *flags & LDLM_FL_HAS_INTENT;
+	ENTRY;
+
+	/* The request was created before ldlm_cli_enqueue call. */
+	if (intent && rc == ELDLM_LOCK_ABORTED) {
+		struct ldlm_reply *reply;
+
+		reply = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP);
+		LASSERT(reply != NULL);
+		if (reply->lock_policy_res1)
+			rc = reply->lock_policy_res1;
+	}
+
+	/* LVB is usable on success, and on an aborted non-AGL intent */
+	if ((rc == 0) ||
+	    (intent != 0 && rc == ELDLM_LOCK_ABORTED && agl == 0)) {
+		*flags |= LDLM_FL_LVB_READY;
+		CDEBUG(D_INODE,"got kms "LPU64" blocks "LPU64" mtime "LPU64"\n",
+		       lvb->lvb_size, lvb->lvb_blocks, lvb->lvb_mtime);
+	}
+
+	/* Call the update callback. */
+	rc = (*upcall)(cookie, rc);
+	RETURN(rc);
+}
+
+static int osc_enqueue_interpret(const struct lu_env *env,
+				 struct ptlrpc_request *req,
+				 struct osc_enqueue_args *aa, int rc)
+{	/*
+	 * Reply-interpret callback for asynchronous enqueues queued by
+	 * osc_enqueue_base().  Completes the enqueue through
+	 * ldlm_cli_enqueue_fini(), runs the caller's upcall through
+	 * osc_enqueue_fini() and then drops the lock references held for the
+	 * duration of the request.  Returns the status produced by
+	 * osc_enqueue_fini().
+	 */
+	struct ldlm_lock *lock;
+	struct lustre_handle handle;
+	__u32 mode;
+	struct ost_lvb *lvb;
+	__u32 lvb_len;
+	__u64 *flags = aa->oa_flags;
+
+	/* Make a local copy of a lock handle and a mode, because aa->oa_*
+	 * might be freed anytime after lock upcall has been called. */
+	lustre_handle_copy(&handle, aa->oa_lockh);
+	mode = aa->oa_ei->ei_mode;
+
+	/* ldlm_cli_enqueue is holding a reference on the lock, so it must
+	 * be valid. */
+	lock = ldlm_handle2lock(&handle);
+
+	/* Take an additional reference so that a blocking AST that
+	 * ldlm_cli_enqueue_fini() might post for a failed lock, is guaranteed
+	 * to arrive after an upcall has been executed by
+	 * osc_enqueue_fini(). */
+	ldlm_lock_addref(&handle, mode);
+
+	/* Let CP AST to grant the lock first. */
+	OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_ENQ_RACE, 1);
+
+	if (aa->oa_agl && rc == ELDLM_LOCK_ABORTED) {
+		lvb = NULL;	/* aborted AGL: pass no LVB buffer to the fini call */
+		lvb_len = 0;
+	} else {
+		lvb = aa->oa_lvb;
+		lvb_len = sizeof(*aa->oa_lvb);
+	}
+
+	/* Complete obtaining the lock procedure. */
+	rc = ldlm_cli_enqueue_fini(aa->oa_exp, req, aa->oa_ei->ei_type, 1,
+				   mode, flags, lvb, lvb_len, &handle, rc);
+	/* Complete osc stuff. */
+	rc = osc_enqueue_fini(req, aa->oa_lvb, aa->oa_upcall, aa->oa_cookie,
+			      flags, aa->oa_agl, rc);
+
+	OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_CANCEL_RACE, 10);
+
+	/* Release the lock for async request. */
+	if (lustre_handle_is_used(&handle) && rc == ELDLM_OK)
+		/*
+		 * Releases a reference taken by ldlm_cli_enqueue(), if it is
+		 * not already released by
+		 * ldlm_cli_enqueue_fini()->failed_lock_cleanup()
+		 */
+		ldlm_lock_decref(&handle, mode);
+
+	LASSERTF(lock != NULL, "lockh %p, req %p, aa %p - client evicted?\n",
+		 aa->oa_lockh, req, aa);
+	ldlm_lock_decref(&handle, mode);	/* pairs with the addref taken above */
+	LDLM_LOCK_PUT(lock);	/* pairs with ldlm_handle2lock() above */
+	return rc;
+}
+
+/*
+ * Fold the result of an enqueue into @loi.  On ELDLM_OK the LVB is copied
+ * and the known minimum size (kms) is raised if the lock allows it; on an
+ * aborted intent enqueue only the LVB is copied.  In both cases the lock is
+ * made matchable; for any other status the lock (if it can still be
+ * resolved from @lov_lockhp) is marked as failed for matching.
+ */
+void osc_update_enqueue(struct lustre_handle *lov_lockhp,
+			struct lov_oinfo *loi, int flags,
+			struct ost_lvb *lvb, __u32 mode, int rc)
+{
+	struct ldlm_lock *lock = ldlm_handle2lock(lov_lockhp);
+
+	if (rc == ELDLM_OK) {
+		__u64 kms;
+
+		LASSERT(lock != NULL);
+		loi->loi_lvb = *lvb;
+		kms = loi->loi_lvb.lvb_size;
+		/* Extend KMS up to the end of this lock and no further
+		 * A lock on [x,y] means a KMS of up to y + 1 bytes! */
+		if (kms > lock->l_policy_data.l_extent.end)
+			kms = lock->l_policy_data.l_extent.end + 1;
+		if (kms < loi->loi_kms) {
+			LDLM_DEBUG(lock, "lock acquired, setting rss="
+				   LPU64"; leaving kms="LPU64", end="LPU64,
+				   loi->loi_lvb.lvb_size, loi->loi_kms,
+				   lock->l_policy_data.l_extent.end);
+		} else {
+			LDLM_DEBUG(lock, "lock acquired, setting rss="LPU64
+				   ", kms="LPU64, loi->loi_lvb.lvb_size, kms);
+			loi_kms_set(loi, kms);
+		}
+		ldlm_lock_allow_match(lock);
+	} else if (rc == ELDLM_LOCK_ABORTED && (flags & LDLM_FL_HAS_INTENT)) {
+		LASSERT(lock != NULL);
+		loi->loi_lvb = *lvb;
+		ldlm_lock_allow_match(lock);
+		CDEBUG(D_INODE, "glimpsed, setting rss="LPU64"; leaving"
+		       " kms="LPU64"\n", loi->loi_lvb.lvb_size, loi->loi_kms);
+		/* a glimpse counts as success from here on */
+		rc = ELDLM_OK;
+	}
+
+	if (lock == NULL)
+		return;
+
+	if (rc != ELDLM_OK)
+		ldlm_lock_fail_match(lock);
+
+	LDLM_LOCK_PUT(lock);
+}
+EXPORT_SYMBOL(osc_update_enqueue);
+
+struct ptlrpc_request_set *PTLRPCD_SET = (void *)1;	/* sentinel, not a real set:
+							 * osc_enqueue_base() compares
+							 * rqset against it and then
+							 * uses ptlrpcd_add_req() */
+
+/* When enqueuing asynchronously, locks are not ordered, we can obtain a lock
+ * from the 2nd OSC before a lock from the 1st one. This does not deadlock with
+ * other synchronous requests, however keeping some locks and trying to obtain
+ * others may take a considerable amount of time in a case of ost failure; and
+ * when other sync requests do not get released lock from a client, the client
+ * is excluded from the cluster -- such scenarios make the life difficult, so
+ * release locks just after they are obtained.
+ *
+ * Flow: the extent in @policy is widened to page boundaries.  If @kms_valid
+ * is set, an already granted lock is searched for with ldlm_lock_match();
+ * when one is found and its AST data can be set from @einfo, @upcall is
+ * called with ELDLM_OK and ELDLM_OK is returned.  For AGL (@agl != 0) a
+ * matched lock whose LVB is not ready yields -ECANCELED.  Otherwise a new
+ * lock is enqueued with ldlm_cli_enqueue(); with a non-NULL @rqset the
+ * request is queued (on ptlrpcd when @rqset is PTLRPCD_SET) and completed
+ * later by osc_enqueue_interpret(), without it osc_enqueue_fini() is run
+ * before returning. */
+int osc_enqueue_base(struct obd_export *exp, struct ldlm_res_id *res_id,
+		     __u64 *flags, ldlm_policy_data_t *policy,
+		     struct ost_lvb *lvb, int kms_valid,
+		     obd_enqueue_update_f upcall, void *cookie,
+		     struct ldlm_enqueue_info *einfo,
+		     struct lustre_handle *lockh,
+		     struct ptlrpc_request_set *rqset, int async, int agl)
+{
+	struct obd_device *obd = exp->exp_obd;
+	struct ptlrpc_request *req = NULL;
+	int intent = *flags & LDLM_FL_HAS_INTENT;
+	int match_lvb = (agl != 0 ? 0 : LDLM_FL_LVB_READY);	/* AGL also matches locks without a ready LVB */
+	ldlm_mode_t mode;
+	int rc;
+	ENTRY;
+
+	/* Filesystem lock extents are extended to page boundaries so that
+	 * dealing with the page cache is a little smoother.  */
+	policy->l_extent.start -= policy->l_extent.start & ~CFS_PAGE_MASK;
+	policy->l_extent.end |= ~CFS_PAGE_MASK;
+
+	/*
+	 * kms is not valid when either object is completely fresh (so that no
+	 * locks are cached), or object was evicted. In the latter case cached
+	 * lock cannot be used, because it would prime inode state with
+	 * potentially stale LVB.
+	 */
+	if (!kms_valid)
+		goto no_match;
+
+	/* Next, search for already existing extent locks that will cover us */
+	/* If we're trying to read, we also search for an existing PW lock.  The
+	 * VFS and page cache already protect us locally, so lots of readers/
+	 * writers can share a single PW lock.
+	 *
+	 * There are problems with conversion deadlocks, so instead of
+	 * converting a read lock to a write lock, we'll just enqueue a new
+	 * one.
+	 *
+	 * At some point we should cancel the read lock instead of making them
+	 * send us a blocking callback, but there are problems with canceling
+	 * locks out from other users right now, too. */
+	mode = einfo->ei_mode;
+	if (einfo->ei_mode == LCK_PR)
+		mode |= LCK_PW;
+	mode = ldlm_lock_match(obd->obd_namespace, *flags | match_lvb, res_id,
+			       einfo->ei_type, policy, mode, lockh, 0);
+	if (mode) {	/* non-zero: mode of the lock that matched */
+		struct ldlm_lock *matched = ldlm_handle2lock(lockh);
+
+		if ((agl != 0) && !(matched->l_flags & LDLM_FL_LVB_READY)) {
+			/* For AGL, if enqueue RPC is sent but the lock is not
+			 * granted, then skip to process this stripe.
+			 * Return -ECANCELED to tell the caller. */
+			ldlm_lock_decref(lockh, mode);
+			LDLM_LOCK_PUT(matched);
+			RETURN(-ECANCELED);
+		} else if (osc_set_lock_data_with_check(matched, einfo)) {
+			*flags |= LDLM_FL_LVB_READY;
+			/* addref the lock only if not async requests and PW
+			 * lock is matched whereas we asked for PR. */
+			if (!rqset && einfo->ei_mode != mode)
+				ldlm_lock_addref(lockh, LCK_PR);
+			if (intent) {
+				/* I would like to be able to ASSERT here that
+				 * rss <= kms, but I can't, for reasons which
+				 * are explained in lov_enqueue() */
+			}
+
+			/* We already have a lock, and it's referenced.
+			 *
+			 * At this point, the cl_lock::cll_state is CLS_QUEUING,
+			 * AGL upcall may change it to CLS_HELD directly. */
+			(*upcall)(cookie, ELDLM_OK);
+
+			if (einfo->ei_mode != mode)
+				ldlm_lock_decref(lockh, LCK_PW);
+			else if (rqset)
+				/* For async requests, decref the lock. */
+				ldlm_lock_decref(lockh, einfo->ei_mode);
+			LDLM_LOCK_PUT(matched);
+			RETURN(ELDLM_OK);
+		} else {	/* lock carries other AST data: drop it and enqueue anew */
+			ldlm_lock_decref(lockh, mode);
+			LDLM_LOCK_PUT(matched);
+		}
+	}
+
+ no_match:
+	if (intent) {
+		LIST_HEAD(cancels);
+		req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+					   &RQF_LDLM_ENQUEUE_LVB);
+		if (req == NULL)
+			RETURN(-ENOMEM);
+
+		rc = ldlm_prep_enqueue_req(exp, req, &cancels, 0);
+		if (rc) {
+			ptlrpc_request_free(req);
+			RETURN(rc);
+		}
+
+		req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER,
+				     sizeof *lvb);
+		ptlrpc_request_set_replen(req);
+	}
+
+	/* users of osc_enqueue() can pass this flag for ldlm_lock_match() */
+	*flags &= ~LDLM_FL_BLOCK_GRANTED;
+
+	rc = ldlm_cli_enqueue(exp, &req, einfo, res_id, policy, flags, lvb,
+			      sizeof(*lvb), LVB_T_OST, lockh, async);
+	if (rqset) {
+		if (!rc) {
+			struct osc_enqueue_args *aa;
+			CLASSERT (sizeof(*aa) <= sizeof(req->rq_async_args));
+			aa = ptlrpc_req_async_args(req);	/* read back by osc_enqueue_interpret() */
+			aa->oa_ei = einfo;
+			aa->oa_exp = exp;
+			aa->oa_flags  = flags;
+			aa->oa_upcall = upcall;
+			aa->oa_cookie = cookie;
+			aa->oa_lvb    = lvb;
+			aa->oa_lockh  = lockh;
+			aa->oa_agl    = !!agl;
+
+			req->rq_interpret_reply =
+				(ptlrpc_interpterer_t)osc_enqueue_interpret;
+			if (rqset == PTLRPCD_SET)
+				ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1);
+			else
+				ptlrpc_set_add_req(rqset, req);
+		} else if (intent) {	/* enqueue failed: release the request allocated above */
+			ptlrpc_req_finished(req);
+		}
+		RETURN(rc);
+	}
+
+	rc = osc_enqueue_fini(req, lvb, upcall, cookie, flags, agl, rc);
+	if (intent)
+		ptlrpc_req_finished(req);
+
+	RETURN(rc);
+}
+
+/*
+ * obd_info based front end to osc_enqueue_base(): builds the resource name
+ * from the stripe md in @oinfo and enqueues on its first (index 0) stripe
+ * entry.  The enqueue is asynchronous exactly when @rqset is non-NULL; AGL
+ * is never requested.  Returns the osc_enqueue_base() result.
+ */
+static int osc_enqueue(struct obd_export *exp, struct obd_info *oinfo,
+		       struct ldlm_enqueue_info *einfo,
+		       struct ptlrpc_request_set *rqset)
+{
+	struct ldlm_res_id name;
+	int async = (rqset != NULL);
+	int rc;
+	ENTRY;
+
+	ostid_build_res_name(&oinfo->oi_md->lsm_oi, &name);
+
+	rc = osc_enqueue_base(exp, &name, &oinfo->oi_flags, &oinfo->oi_policy,
+			      &oinfo->oi_md->lsm_oinfo[0]->loi_lvb,
+			      oinfo->oi_md->lsm_oinfo[0]->loi_kms_valid,
+			      oinfo->oi_cb_up, oinfo, einfo, oinfo->oi_lockh,
+			      rqset, async, 0);
+	RETURN(rc);
+}
+
+/*
+ * Look for an already granted lock covering @policy (widened to page
+ * boundaries) on @res_id.  A request for LCK_PR also accepts a LCK_PW
+ * lock; unless LDLM_FL_TEST_LOCK is set, the reference on a matched PW
+ * lock is then exchanged for a PR reference.  When @data is non-NULL the
+ * lock's AST data must be settable from it, otherwise the match is dropped
+ * and 0 is returned.
+ *
+ * Returns the matched lock mode, 0 for no usable match, or -EIO when the
+ * OBD_FAIL_OSC_MATCH fail check fires.
+ */
+int osc_match_base(struct obd_export *exp, struct ldlm_res_id *res_id,
+		   __u32 type, ldlm_policy_data_t *policy, __u32 mode,
+		   int *flags, void *data, struct lustre_handle *lockh,
+		   int unref)
+{
+	struct obd_device *obd = exp->exp_obd;
+	int lflags = *flags;
+	ldlm_mode_t wanted;
+	ldlm_mode_t found;
+	ENTRY;
+
+	if (OBD_FAIL_CHECK(OBD_FAIL_OSC_MATCH))
+		RETURN(-EIO);
+
+	/* Filesystem lock extents are extended to page boundaries so that
+	 * dealing with the page cache is a little smoother */
+	policy->l_extent.start -= policy->l_extent.start & ~CFS_PAGE_MASK;
+	policy->l_extent.end |= ~CFS_PAGE_MASK;
+
+	/* Next, search for already existing extent locks that will cover us */
+	/* If we're trying to read, we also search for an existing PW lock.  The
+	 * VFS and page cache already protect us locally, so lots of readers/
+	 * writers can share a single PW lock. */
+	wanted = mode;
+	if (mode == LCK_PR)
+		wanted |= LCK_PW;
+
+	found = ldlm_lock_match(obd->obd_namespace, lflags,
+				res_id, type, policy, wanted, lockh, unref);
+	if (!found)
+		RETURN(found);
+
+	if (data != NULL && !osc_set_data_with_check(lockh, data)) {
+		/* lock belongs to someone else: give the reference back */
+		if (!(lflags & LDLM_FL_TEST_LOCK))
+			ldlm_lock_decref(lockh, found);
+		RETURN(0);
+	}
+
+	if (!(lflags & LDLM_FL_TEST_LOCK) && mode != found) {
+		ldlm_lock_addref(lockh, LCK_PR);
+		ldlm_lock_decref(lockh, LCK_PW);
+	}
+	RETURN(found);
+}
+
+/*
+ * Drop one @mode reference on the lock behind @lockh.  Group locks are
+ * additionally cancelled when the reference is dropped.  Always returns 0.
+ */
+int osc_cancel_base(struct lustre_handle *lockh, __u32 mode)
+{
+	ENTRY;
+
+	if (unlikely(mode == LCK_GROUP)) {
+		ldlm_lock_decref_and_cancel(lockh, mode);
+		RETURN(0);
+	}
+
+	ldlm_lock_decref(lockh, mode);
+	RETURN(0);
+}
+
+/*
+ * Forwards to osc_cancel_base(); @exp and @md are not used.
+ */
+static int osc_cancel(struct obd_export *exp, struct lov_stripe_md *md,
+		      __u32 mode, struct lustre_handle *lockh)
+{
+	int rc;
+	ENTRY;
+
+	rc = osc_cancel_base(lockh, mode);
+	RETURN(rc);
+}
+
+/*
+ * Call ldlm_cli_cancel_unused() on this device's namespace.  With a
+ * non-NULL @lsm the call is limited to the resource named after
+ * lsm->lsm_oi; with NULL no resource id is passed.
+ */
+static int osc_cancel_unused(struct obd_export *exp,
+			     struct lov_stripe_md *lsm,
+			     ldlm_cancel_flags_t flags,
+			     void *opaque)
+{
+	struct obd_device *obd = class_exp2obd(exp);
+	struct ldlm_res_id name;
+	struct ldlm_res_id *target;
+
+	if (lsm == NULL) {
+		target = NULL;
+	} else {
+		ostid_build_res_name(&lsm->lsm_oi, &name);
+		target = &name;
+	}
+
+	return ldlm_cli_cancel_unused(obd->obd_namespace, target, flags,
+				      opaque);
+}
+
+/*
+ * Reply-interpret callback for the asynchronous statfs request: copies the
+ * obd_statfs from the reply into oi_osfs and reports the status through
+ * the oi_cb_up callback.  -EBADR is returned as is without calling the
+ * callback; -ENOTCONN/-EAGAIN are reported as success when
+ * OBD_STATFS_NODELAY is set.
+ */
+static int osc_statfs_interpret(const struct lu_env *env,
+				struct ptlrpc_request *req,
+				struct osc_async_args *aa, int rc)
+{
+	struct obd_statfs *reply;
+	ENTRY;
+
+	/* The request has in fact never been sent
+	 * due to issues at a higher level (LOV).
+	 * Exit immediately since the caller is
+	 * aware of the problem and takes care
+	 * of the clean up */
+	if (rc == -EBADR)
+		RETURN(rc);
+
+	if ((aa->aa_oi->oi_flags & OBD_STATFS_NODELAY) &&
+	    (rc == -ENOTCONN || rc == -EAGAIN))
+		GOTO(out, rc = 0);
+
+	if (rc != 0)
+		GOTO(out, rc);
+
+	reply = req_capsule_server_get(&req->rq_pill, &RMF_OBD_STATFS);
+	if (reply == NULL)
+		GOTO(out, rc = -EPROTO);
+
+	*aa->aa_oi->oi_osfs = *reply;
+out:
+	rc = aa->aa_oi->oi_cb_up(aa->aa_oi, rc);
+	RETURN(rc);
+}
+
+/*
+ * Build an OST_STATFS request and add it to @rqset; the reply is handled
+ * by osc_statfs_interpret() with @oinfo as its context.  @max_age is not
+ * used.  Returns 0 once the request is queued, -ENOMEM or the packing
+ * error otherwise.
+ */
+static int osc_statfs_async(struct obd_export *exp,
+			    struct obd_info *oinfo, __u64 max_age,
+			    struct ptlrpc_request_set *rqset)
+{
+	struct obd_device *obd = class_exp2obd(exp);
+	struct osc_async_args *args;
+	struct ptlrpc_request *req;
+	int rc;
+	ENTRY;
+
+	/* We could possibly pass max_age in the request (as an absolute
+	 * timestamp or a "seconds.usec ago") so the target can avoid doing
+	 * extra calls into the filesystem if that isn't necessary (e.g.
+	 * during mount that would help a bit).  Having relative timestamps
+	 * is not so great if request processing is slow, while absolute
+	 * timestamps are not ideal because they need time synchronization. */
+	req = ptlrpc_request_alloc(obd->u.cli.cl_import, &RQF_OST_STATFS);
+	if (req == NULL)
+		RETURN(-ENOMEM);
+
+	rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_STATFS);
+	if (rc != 0) {
+		ptlrpc_request_free(req);
+		RETURN(rc);
+	}
+
+	ptlrpc_request_set_replen(req);
+	req->rq_request_portal = OST_CREATE_PORTAL;
+	ptlrpc_at_set_req_timeout(req);
+
+	if (oinfo->oi_flags & OBD_STATFS_NODELAY) {
+		/* procfs requests not want stat in wait for avoid deadlock */
+		req->rq_no_delay = 1;
+		req->rq_no_resend = 1;
+	}
+
+	CLASSERT(sizeof(*args) <= sizeof(req->rq_async_args));
+	args = ptlrpc_req_async_args(req);
+	args->aa_oi = oinfo;
+	req->rq_interpret_reply = (ptlrpc_interpterer_t)osc_statfs_interpret;
+
+	ptlrpc_set_add_req(rqset, req);
+	RETURN(0);
+}
+
+/*
+ * Synchronous statfs: send an OST_STATFS request, wait for the reply and
+ * copy the returned obd_statfs into @osfs.  @env and @max_age are not used.
+ * Returns 0 on success, -ENODEV when the device has no import, -ENOMEM,
+ * -EPROTO when the reply carries no statfs data, or the request error.
+ */
+static int osc_statfs(const struct lu_env *env, struct obd_export *exp,
+		      struct obd_statfs *osfs, __u64 max_age, __u32 flags)
+{
+	struct obd_device     *obd = class_exp2obd(exp);
+	struct obd_import     *import = NULL;
+	struct ptlrpc_request *req;
+	struct obd_statfs     *reply;
+	int rc;
+	ENTRY;
+
+	/*Since the request might also come from lprocfs, so we need
+	 *sync this with client_disconnect_export Bug15684*/
+	down_read(&obd->u.cli.cl_sem);
+	if (obd->u.cli.cl_import != NULL)
+		import = class_import_get(obd->u.cli.cl_import);
+	up_read(&obd->u.cli.cl_sem);
+	if (import == NULL)
+		RETURN(-ENODEV);
+
+	/* We could possibly pass max_age in the request (as an absolute
+	 * timestamp or a "seconds.usec ago") so the target can avoid doing
+	 * extra calls into the filesystem if that isn't necessary (e.g.
+	 * during mount that would help a bit).  Having relative timestamps
+	 * is not so great if request processing is slow, while absolute
+	 * timestamps are not ideal because they need time synchronization. */
+	req = ptlrpc_request_alloc(import, &RQF_OST_STATFS);
+
+	/* the import reference was only needed for the allocation */
+	class_import_put(import);
+
+	if (req == NULL)
+		RETURN(-ENOMEM);
+
+	rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_STATFS);
+	if (rc != 0) {
+		ptlrpc_request_free(req);
+		RETURN(rc);
+	}
+
+	ptlrpc_request_set_replen(req);
+	req->rq_request_portal = OST_CREATE_PORTAL;
+	ptlrpc_at_set_req_timeout(req);
+
+	if (flags & OBD_STATFS_NODELAY) {
+		/* procfs requests not want stat in wait for avoid deadlock */
+		req->rq_no_delay = 1;
+		req->rq_no_resend = 1;
+	}
+
+	rc = ptlrpc_queue_wait(req);
+	if (rc != 0)
+		GOTO(out, rc);
+
+	reply = req_capsule_server_get(&req->rq_pill, &RMF_OBD_STATFS);
+	if (reply == NULL)
+		GOTO(out, rc = -EPROTO);
+
+	*osfs = *reply;
+
+	EXIT;
+out:
+	ptlrpc_req_finished(req);
+	return rc;
+}
+
+/* Retrieve object striping information.
+ *
+ * @lmmu is a pointer to an in-core struct with lmm_ost_count indicating
+ * the maximum number of OST indices which will fit in the user buffer.
+ * lmm_magic must be LOV_MAGIC (we only use 1 slot here).
+ *
+ * The header is read from the user buffer @lump, the object id from @lsm
+ * and a stripe count of 1 are filled in, and the result is copied back to
+ * @lump.  Returns 0 on success, -ENODATA when @lsm is NULL, -EFAULT when a
+ * user copy fails, -EINVAL for an unknown magic and -ENOMEM when the
+ * temporary buffer cannot be allocated.
+ */
+static int osc_getstripe(struct lov_stripe_md *lsm, struct lov_user_md *lump)
+{
+	/* we use lov_user_md_v3 because it is larger than lov_user_md_v1 */
+	struct lov_user_md_v3 lum, *lumk;
+	struct lov_user_ost_data_v1 *lmm_objects;
+	int rc = 0, lum_size;
+	ENTRY;
+
+	if (!lsm)
+		RETURN(-ENODATA);
+
+	/* Only a v1-sized header is read into lum below, but the
+	 * zero-stripe path copies lov_mds_md_size(0, magic) bytes of lum
+	 * back to user space.  Zero the whole struct first so that none of
+	 * those bytes can be uninitialised kernel stack. */
+	memset(&lum, 0, sizeof(lum));
+
+	/* we only need the header part from user space to get lmm_magic and
+	 * lmm_stripe_count, (the header part is common to v1 and v3) */
+	lum_size = sizeof(struct lov_user_md_v1);
+	if (copy_from_user(&lum, lump, lum_size))
+		RETURN(-EFAULT);
+
+	if ((lum.lmm_magic != LOV_USER_MAGIC_V1) &&
+	    (lum.lmm_magic != LOV_USER_MAGIC_V3))
+		RETURN(-EINVAL);
+
+	/* lov_user_md_vX and lov_mds_md_vX must have the same size */
+	LASSERT(sizeof(struct lov_user_md_v1) == sizeof(struct lov_mds_md_v1));
+	LASSERT(sizeof(struct lov_user_md_v3) == sizeof(struct lov_mds_md_v3));
+	LASSERT(sizeof(lum.lmm_objects[0]) == sizeof(lumk->lmm_objects[0]));
+
+	/* we can use lov_mds_md_size() to compute lum_size
+	 * because lov_user_md_vX and lov_mds_md_vX have the same size */
+	if (lum.lmm_stripe_count > 0) {
+		lum_size = lov_mds_md_size(lum.lmm_stripe_count, lum.lmm_magic);
+		OBD_ALLOC(lumk, lum_size);
+		if (!lumk)
+			RETURN(-ENOMEM);
+
+		/* the object array sits at a different offset in v1 and v3 */
+		if (lum.lmm_magic == LOV_USER_MAGIC_V1)
+			lmm_objects =
+			    &(((struct lov_user_md_v1 *)lumk)->lmm_objects[0]);
+		else
+			lmm_objects = &(lumk->lmm_objects[0]);
+		lmm_objects->l_ost_oi = lsm->lsm_oi;
+	} else {
+		/* no room for objects requested: answer from the stack copy */
+		lum_size = lov_mds_md_size(0, lum.lmm_magic);
+		lumk = &lum;
+	}
+
+	lumk->lmm_oi = lsm->lsm_oi;
+	lumk->lmm_stripe_count = 1;
+
+	if (copy_to_user(lump, lumk, lum_size))
+		rc = -EFAULT;
+
+	/* free only the heap buffer, never the on-stack lum */
+	if (lumk != &lum)
+		OBD_FREE(lumk, lum_size);
+
+	RETURN(rc);
+}
+
+
+/*
+ * ioctl dispatcher for the OSC device behind @exp.  @karg is the kernel
+ * copy of the argument and @uarg the user pointer.  A module reference is
+ * held for the duration of the call.  Unknown commands yield -ENOTTY;
+ * otherwise the result of the selected handler is returned.
+ */
+static int osc_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
+			 void *karg, void *uarg)
+{
+	struct obd_device *obd = exp->exp_obd;
+	struct obd_ioctl_data *data = karg;
+	int err = 0;
+	ENTRY;
+
+	if (!try_module_get(THIS_MODULE)) {
+		/* terminate the message with a newline like every other
+		 * CERROR in this file */
+		CERROR("Can't get module. Is it alive?\n");
+		return -EINVAL;
+	}
+	switch (cmd) {
+	case OBD_IOC_LOV_GET_CONFIG: {
+		char *buf;
+		struct lov_desc *desc;
+		struct obd_uuid uuid;
+
+		/* re-read the ioctl data from user space into a fresh buffer */
+		buf = NULL;
+		len = 0;
+		if (obd_ioctl_getdata(&buf, &len, (void *)uarg))
+			GOTO(out, err = -EINVAL);
+
+		data = (struct obd_ioctl_data *)buf;
+
+		/* inline buffer 1 must hold a lov_desc, buffer 2 a uuid */
+		if (sizeof(*desc) > data->ioc_inllen1) {
+			obd_ioctl_freedata(buf, len);
+			GOTO(out, err = -EINVAL);
+		}
+
+		if (data->ioc_inllen2 < sizeof(uuid)) {
+			obd_ioctl_freedata(buf, len);
+			GOTO(out, err = -EINVAL);
+		}
+
+		/* describe this OSC as a one-target configuration */
+		desc = (struct lov_desc *)data->ioc_inlbuf1;
+		desc->ld_tgt_count = 1;
+		desc->ld_active_tgt_count = 1;
+		desc->ld_default_stripe_count = 1;
+		desc->ld_default_stripe_size = 0;
+		desc->ld_default_stripe_offset = 0;
+		desc->ld_pattern = 0;
+		memcpy(&desc->ld_uuid, &obd->obd_uuid, sizeof(uuid));
+
+		memcpy(data->ioc_inlbuf2, &obd->obd_uuid, sizeof(uuid));
+
+		err = copy_to_user((void *)uarg, buf, len);
+		if (err)
+			err = -EFAULT;
+		obd_ioctl_freedata(buf, len);
+		GOTO(out, err);
+	}
+	case LL_IOC_LOV_SETSTRIPE:
+		err = obd_alloc_memmd(exp, karg);
+		if (err > 0)	/* positive results are reported as success */
+			err = 0;
+		GOTO(out, err);
+	case LL_IOC_LOV_GETSTRIPE:
+		err = osc_getstripe(karg, uarg);
+		GOTO(out, err);
+	case OBD_IOC_CLIENT_RECOVER:
+		err = ptlrpc_recover_import(obd->u.cli.cl_import,
+					    data->ioc_inlbuf1, 0);
+		if (err > 0)	/* positive results are reported as success */
+			err = 0;
+		GOTO(out, err);
+	case IOC_OSC_SET_ACTIVE:
+		err = ptlrpc_set_import_active(obd->u.cli.cl_import,
+					       data->ioc_offset);
+		GOTO(out, err);
+	case OBD_IOC_POLL_QUOTACHECK:
+		err = osc_quota_poll_check(exp, (struct if_quotacheck *)karg);
+		GOTO(out, err);
+	case OBD_IOC_PING_TARGET:
+		err = ptlrpc_obd_ping(obd);
+		GOTO(out, err);
+	default:
+		CDEBUG(D_INODE, "unrecognised ioctl %#x by %s\n",
+		       cmd, current_comm());
+		GOTO(out, err = -ENOTTY);
+	}
+out:
+	module_put(THIS_MODULE);
+	return err;
+}
+/**
+ * Answer a get_info query on an OSC export.
+ *
+ * Three keys are recognised:
+ *  - KEY_LOCK_TO_STRIPE: answered locally, the stripe index is always 0;
+ *  - KEY_LAST_ID: an OST_GET_INFO request is sent and the obd_id found in
+ *    the reply is stored in \a val;
+ *  - KEY_FIEMAP: an OST_GET_INFO request carrying the fiemap key and the
+ *    caller's buffer is sent and the reply is copied back over \a val.
+ *
+ * \param env    not referenced in this function body
+ * \param exp    export whose import the requests are sent on
+ * \param keylen length of \a key in bytes
+ * \param key    key naming the requested information
+ * \param vallen in: size of \a val (KEY_FIEMAP); out: size of the answer
+ *		 (KEY_LOCK_TO_STRIPE)
+ * \param val    buffer receiving the answer
+ * \param lsm    not referenced in this function body
+ *
+ * \retval 0	    on success
+ * \retval -EFAULT \a vallen or \a val is NULL
+ * \retval -ENOMEM the request could not be allocated
+ * \retval -EPROTO the reply does not contain the expected field
+ * \retval -EINVAL \a key is none of the keys above
+ * \retval other   error from packing or sending the request
+ */
+static int osc_get_info(const struct lu_env *env, struct obd_export *exp,
+			obd_count keylen, void *key, __u32 *vallen, void *val,
+			struct lov_stripe_md *lsm)
+{
+	ENTRY;
+	if (!vallen || !val)
+		RETURN(-EFAULT);
+
+	if (KEY_IS(KEY_LOCK_TO_STRIPE)) {
+		__u32 *stripe = val;
+		*vallen = sizeof(*stripe);
+		*stripe = 0;
+		RETURN(0);
+	} else if (KEY_IS(KEY_LAST_ID)) {
+		struct ptlrpc_request *req;
+		obd_id		*reply;
+		char		  *tmp;
+		int		    rc;
+
+		req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+					   &RQF_OST_GET_INFO_LAST_ID);
+		if (req == NULL)
+			RETURN(-ENOMEM);
+
+		req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_KEY,
+				     RCL_CLIENT, keylen);
+		rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_GET_INFO);
+		if (rc) {
+			ptlrpc_request_free(req);
+			RETURN(rc);
+		}
+
+		tmp = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_KEY);
+		memcpy(tmp, key, keylen);
+
+		req->rq_no_delay = req->rq_no_resend = 1;
+		ptlrpc_request_set_replen(req);
+		rc = ptlrpc_queue_wait(req);
+		if (rc)
+			GOTO(out, rc);
+
+		reply = req_capsule_server_get(&req->rq_pill, &RMF_OBD_ID);
+		if (reply == NULL)
+			GOTO(out, rc = -EPROTO);
+
+		*((obd_id *)val) = *reply;
+	out:
+		ptlrpc_req_finished(req);
+		RETURN(rc);
+	} else if (KEY_IS(KEY_FIEMAP)) {
+		struct ll_fiemap_info_key *fm_key =
+				(struct ll_fiemap_info_key *)key;
+		struct ldlm_res_id	 res_id;
+		ldlm_policy_data_t	 policy;
+		struct lustre_handle	 lockh;
+		ldlm_mode_t		 mode = 0;
+		struct ptlrpc_request	*req;
+		struct ll_user_fiemap	*reply;
+		char			*tmp;
+		int			 rc;
+
+		if (!(fm_key->fiemap.fm_flags & FIEMAP_FLAG_SYNC))
+			goto skip_locking;
+
+		policy.l_extent.start = fm_key->fiemap.fm_start &
+						CFS_PAGE_MASK;
+
+		if (OBD_OBJECT_EOF - fm_key->fiemap.fm_length <=
+		    fm_key->fiemap.fm_start + PAGE_CACHE_SIZE - 1)
+			policy.l_extent.end = OBD_OBJECT_EOF;
+		else
+			policy.l_extent.end = (fm_key->fiemap.fm_start +
+				fm_key->fiemap.fm_length +
+				PAGE_CACHE_SIZE - 1) & CFS_PAGE_MASK;
+
+		ostid_build_res_name(&fm_key->oa.o_oi, &res_id);
+		mode = ldlm_lock_match(exp->exp_obd->obd_namespace,
+				       LDLM_FL_BLOCK_GRANTED |
+				       LDLM_FL_LVB_READY,
+				       &res_id, LDLM_EXTENT, &policy,
+				       LCK_PR | LCK_PW, &lockh, 0);
+		if (mode) { /* lock is cached on client; end up holding a PR
+			     * reference, which is what drop_lock releases */
+			if (mode != LCK_PR) {
+				ldlm_lock_addref(&lockh, LCK_PR);
+				ldlm_lock_decref(&lockh, LCK_PW);
+			}
+		} else { /* no cached lock, needs acquire lock on server side */
+			fm_key->oa.o_valid |= OBD_MD_FLFLAGS;
+			fm_key->oa.o_flags |= OBD_FL_SRVLOCK;
+		}
+
+skip_locking:
+		req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+					   &RQF_OST_GET_INFO_FIEMAP);
+		if (req == NULL)
+			GOTO(drop_lock, rc = -ENOMEM);
+
+		req_capsule_set_size(&req->rq_pill, &RMF_FIEMAP_KEY,
+				     RCL_CLIENT, keylen);
+		req_capsule_set_size(&req->rq_pill, &RMF_FIEMAP_VAL,
+				     RCL_CLIENT, *vallen);
+		req_capsule_set_size(&req->rq_pill, &RMF_FIEMAP_VAL,
+				     RCL_SERVER, *vallen);
+
+		rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_GET_INFO);
+		if (rc) {
+			ptlrpc_request_free(req);
+			GOTO(drop_lock, rc);
+		}
+
+		tmp = req_capsule_client_get(&req->rq_pill, &RMF_FIEMAP_KEY);
+		memcpy(tmp, key, keylen);
+		tmp = req_capsule_client_get(&req->rq_pill, &RMF_FIEMAP_VAL);
+		memcpy(tmp, val, *vallen);
+
+		ptlrpc_request_set_replen(req);
+		rc = ptlrpc_queue_wait(req);
+		if (rc)
+			GOTO(fini_req, rc);
+
+		reply = req_capsule_server_get(&req->rq_pill, &RMF_FIEMAP_VAL);
+		if (reply == NULL)
+			GOTO(fini_req, rc = -EPROTO);
+
+		memcpy(val, reply, *vallen);
+fini_req:
+		ptlrpc_req_finished(req);
+drop_lock:
+		if (mode) /* non-zero only if a cached lock was matched */
+			ldlm_lock_decref(&lockh, LCK_PR);
+		RETURN(rc);
+	}
+
+	RETURN(-EINVAL);
+}
+/**
+ * Handle a set_info request on an OSC export.
+ *
+ * KEY_CHECKSUM, KEY_SPTLRPC_CONF, KEY_FLUSH_CTX, KEY_CACHE_SET and
+ * KEY_CACHE_LRU_SHRINK are handled locally and return at once.  Any other
+ * key is packed into an OST_SET_INFO request: a KEY_GRANT_SHRINK request is
+ * handed to ptlrpcd, everything else is added to \a set.
+ *
+ * \param env    not referenced in this function body
+ * \param exp    export the request is addressed to
+ * \param keylen length of \a key in bytes
+ * \param key    key naming the setting
+ * \param vallen length of \a val in bytes
+ * \param val    value; for KEY_CACHE_LRU_SHRINK it is an int target which is
+ *		 reduced by the return value of osc_lru_shrink(), called with
+ *		 the smaller of that target and half of cl_lru_in_list
+ * \param set    request set the RPC is added to; may be NULL only for
+ *		 KEY_GRANT_SHRINK and the locally handled keys
+ *
+ * \retval 0	    on success (for RPCs: the request has been queued)
+ * \retval -EINVAL \a vallen is wrong for KEY_CHECKSUM, or \a set is NULL for
+ *		    a key that needs it
+ * \retval -ENOMEM the request or the obdo could not be allocated
+ * \retval other   error from ptlrpc_request_pack()
+ */
+static int osc_set_info_async(const struct lu_env *env, struct obd_export *exp,
+			      obd_count keylen, void *key, obd_count vallen,
+			      void *val, struct ptlrpc_request_set *set)
+{
+	struct ptlrpc_request *req;
+	struct obd_device     *obd = exp->exp_obd;
+	struct obd_import     *imp = class_exp2cliimp(exp);
+	char		  *tmp;
+	int		    rc;
+	ENTRY;
+
+	OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_SHUTDOWN, 10);
+
+	if (KEY_IS(KEY_CHECKSUM)) {
+		if (vallen != sizeof(int))
+			RETURN(-EINVAL);
+		exp->exp_obd->u.cli.cl_checksum = (*(int *)val) ? 1 : 0;
+		RETURN(0);
+	}
+
+	if (KEY_IS(KEY_SPTLRPC_CONF)) {
+		sptlrpc_conf_client_adapt(obd);
+		RETURN(0);
+	}
+
+	if (KEY_IS(KEY_FLUSH_CTX)) {
+		sptlrpc_import_flush_my_ctx(imp);
+		RETURN(0);
+	}
+
+	if (KEY_IS(KEY_CACHE_SET)) {
+		struct client_obd *cli = &obd->u.cli;
+
+		LASSERT(cli->cl_cache == NULL); /* only once */
+		cli->cl_cache = (struct cl_client_cache *)val;
+		atomic_inc(&cli->cl_cache->ccc_users);
+		cli->cl_lru_left = &cli->cl_cache->ccc_lru_left;
+
+		/* add this osc into entity list */
+		LASSERT(list_empty(&cli->cl_lru_osc));
+		spin_lock(&cli->cl_cache->ccc_lru_lock);
+		list_add(&cli->cl_lru_osc, &cli->cl_cache->ccc_lru);
+		spin_unlock(&cli->cl_cache->ccc_lru_lock);
+
+		RETURN(0);
+	}
+
+	if (KEY_IS(KEY_CACHE_LRU_SHRINK)) {
+		struct client_obd *cli = &obd->u.cli;
+		int nr = atomic_read(&cli->cl_lru_in_list) >> 1;
+		int target = *(int *)val;
+
+		nr = osc_lru_shrink(cli, min(nr, target));
+		*(int *)val -= nr;
+		RETURN(0);
+	}
+
+	if (!set && !KEY_IS(KEY_GRANT_SHRINK))
+		RETURN(-EINVAL);
+
+	/* We pass all other commands directly to OST. Since nobody calls osc
+	   methods directly and everybody is supposed to go through LOV, we
+	   assume lov checked invalid values for us.
+	   The only recognised values so far are evict_by_nid and mds_conn.
+	   Even if something bad goes through, we'd get a -EINVAL from OST
+	   anyway. */
+
+	req = ptlrpc_request_alloc(imp, KEY_IS(KEY_GRANT_SHRINK) ?
+						&RQF_OST_SET_GRANT_INFO :
+						&RQF_OBD_SET_INFO);
+	if (req == NULL)
+		RETURN(-ENOMEM);
+
+	req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_KEY,
+			     RCL_CLIENT, keylen);
+	if (!KEY_IS(KEY_GRANT_SHRINK))
+		req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_VAL,
+				     RCL_CLIENT, vallen);
+	rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SET_INFO);
+	if (rc) {
+		ptlrpc_request_free(req);
+		RETURN(rc);
+	}
+
+	tmp = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_KEY);
+	memcpy(tmp, key, keylen);
+	tmp = req_capsule_client_get(&req->rq_pill, KEY_IS(KEY_GRANT_SHRINK) ?
+							&RMF_OST_BODY :
+							&RMF_SETINFO_VAL);
+	memcpy(tmp, val, vallen);
+
+	if (KEY_IS(KEY_GRANT_SHRINK)) {
+		struct osc_grant_args *aa;
+		struct obdo *oa;
+
+		CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
+		aa = ptlrpc_req_async_args(req);
+		OBDO_ALLOC(oa);
+		if (!oa) {
+			ptlrpc_req_finished(req);
+			RETURN(-ENOMEM);
+		}
+		*oa = ((struct ost_body *)val)->oa; /* private copy for the
+						     * reply interpreter */
+		aa->aa_oa = oa;
+		req->rq_interpret_reply = osc_shrink_grant_interpret;
+	}
+
+	ptlrpc_request_set_replen(req);
+	if (!KEY_IS(KEY_GRANT_SHRINK)) {
+		LASSERT(set != NULL);
+		ptlrpc_set_add_req(set, req);
+		ptlrpc_check_set(NULL, set);
+	} else
+		ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1);
+
+	RETURN(0);
+}
+
+/**
+ * llog initialisation hook of the OSC.  It calls LBUG() unconditionally;
+ * none of the parameters is used.
+ */
+static int osc_llog_init(struct obd_device *obd, struct obd_llog_group *olg,
+			 struct obd_device *disk_obd, int *index)
+{
+	/* this code is not supposed to be used with LOD/OSP
+	 * to be removed soon */
+	LBUG();
+	return 0;
+}
+/**
+ * Clean up the LLOG_MDS_OST_ORIG_CTXT and LLOG_SIZE_REPL_CTXT llog contexts
+ * of \a obd, if they exist.  The catalog handle of the former is closed
+ * before the context is cleaned up.
+ *
+ * \param obd   device whose llog contexts are cleaned up
+ * \param count not referenced in this function body
+ *
+ * \retval 0 always
+ */
+static int osc_llog_finish(struct obd_device *obd, int count)
+{
+	struct llog_ctxt *ctxt;
+
+	ENTRY;
+
+	ctxt = llog_get_context(obd, LLOG_MDS_OST_ORIG_CTXT);
+	if (ctxt) {
+		llog_cat_close(NULL, ctxt->loc_handle);
+		llog_cleanup(NULL, ctxt);
+	}
+
+	ctxt = llog_get_context(obd, LLOG_SIZE_REPL_CTXT);
+	if (ctxt)
+		llog_cleanup(NULL, ctxt);
+	RETURN(0);
+}
+/**
+ * Reconnect hook of the OSC: refresh the grant carried in the connect data.
+ *
+ * When \a data is set and has OBD_CONNECT_GRANT, ocd_grant becomes
+ * cl_avail_grant + cl_dirty, or 2 * cli_brw_size() if that sum is zero, and
+ * cl_lost_grant is reset to zero.  Both happen under cl_loi_list_lock.
+ *
+ * \param env       not referenced in this function body
+ * \param exp       not referenced in this function body
+ * \param obd       device being reconnected
+ * \param cluuid    not referenced in this function body
+ * \param data      connect data to update, may be NULL
+ * \param localdata not referenced in this function body
+ *
+ * \retval 0 always
+ */
+static int osc_reconnect(const struct lu_env *env,
+			 struct obd_export *exp, struct obd_device *obd,
+			 struct obd_uuid *cluuid,
+			 struct obd_connect_data *data,
+			 void *localdata)
+{
+	struct client_obd *cli = &obd->u.cli;
+
+	if (data != NULL && (data->ocd_connect_flags & OBD_CONNECT_GRANT)) {
+		long lost_grant;
+
+		client_obd_list_lock(&cli->cl_loi_list_lock);
+		data->ocd_grant = (cli->cl_avail_grant + cli->cl_dirty) ?:
+				2 * cli_brw_size(obd);
+		lost_grant = cli->cl_lost_grant;
+		cli->cl_lost_grant = 0;
+		client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+		CDEBUG(D_RPCTRACE, "ocd_connect_flags: "LPX64" ocd_version: %d"
+		       " ocd_grant: %d, lost: %ld.\n", data->ocd_connect_flags,
+		       data->ocd_version, data->ocd_grant, lost_grant);
+	}
+
+	RETURN(0);
+}
+/**
+ * Disconnect hook of the OSC.
+ *
+ * If the device has an LLOG_SIZE_REPL_CTXT context and this is its only
+ * connection, the context is synced first.  The export is then disconnected
+ * and, once the import is gone, the client is taken off the grant shrink
+ * list (see the comment in the body for why this is done last).
+ *
+ * \param exp export to disconnect
+ *
+ * \retval the return value of client_disconnect_export()
+ */
+static int osc_disconnect(struct obd_export *exp)
+{
+	struct obd_device *obd = class_exp2obd(exp);
+	struct llog_ctxt  *ctxt;
+	int rc;
+
+	ctxt = llog_get_context(obd, LLOG_SIZE_REPL_CTXT);
+	if (ctxt) {
+		if (obd->u.cli.cl_conn_count == 1) {
+			/* Flush any remaining cancel messages out to the
+			 * target */
+			llog_sync(ctxt, exp, 0);
+		}
+		llog_ctxt_put(ctxt);
+	} else {
+		CDEBUG(D_HA, "No LLOG_SIZE_REPL_CTXT found in obd %p\n",
+		       obd);
+	}
+
+	rc = client_disconnect_export(exp);
+	/**
+	 * Initially we put del_shrink_grant before disconnect_export, but it
+	 * causes the following problem if setup (connect) and cleanup
+	 * (disconnect) are tangled together.
+	 *      connect p1		     disconnect p2
+	 *   ptlrpc_connect_import
+	 *     ...............	       class_manual_cleanup
+	 *				     osc_disconnect
+	 *				     del_shrink_grant
+	 *   ptlrpc_connect_interrupt
+	 *     init_grant_shrink
+	 *   add this client to shrink list
+	 *				      cleanup_osc
+	 * Bang! pinger trigger the shrink.
+	 * So the osc should be disconnected from the shrink list, after we
+	 * are sure the import has been destroyed. BUG18662
+	 */
+	if (obd->u.cli.cl_import == NULL)
+		osc_del_shrink_grant(&obd->u.cli);
+	return rc;
+}
+/**
+ * React to a state change of the OSC import.
+ *
+ *  - IMP_EVENT_DISCON: cl_avail_grant and cl_lost_grant are zeroed under
+ *    cl_loi_list_lock;
+ *  - IMP_EVENT_INVALIDATE: queued I/O is unplugged and the namespace is
+ *    cleaned up with LDLM_FL_LOCAL_ONLY;
+ *  - IMP_EVENT_OCD: the grant is initialised from the connect data when
+ *    OBD_CONNECT_GRANT is set, the request portal is switched when
+ *    OBD_CONNECT_REQPORTAL is set, and the observer is notified;
+ *  - IMP_EVENT_INACTIVE, _ACTIVE, _DEACTIVATE, _ACTIVATE: the observer is
+ *    notified with the matching OBD_NOTIFY_* event.
+ *
+ * Any other event triggers LBUG().
+ *
+ * \param obd   device owning the import; must equal imp->imp_obd
+ * \param imp   import the event refers to
+ * \param event event to handle
+ *
+ * \retval 0     for IMP_EVENT_DISCON and on success
+ * \retval other error from obd_notify_observer() or cl_env_get()
+ */
+static int osc_import_event(struct obd_device *obd,
+			    struct obd_import *imp,
+			    enum obd_import_event event)
+{
+	struct client_obd *cli;
+	int rc = 0;
+
+	ENTRY;
+	LASSERT(imp->imp_obd == obd);
+
+	switch (event) {
+	case IMP_EVENT_DISCON: {
+		cli = &obd->u.cli;
+		client_obd_list_lock(&cli->cl_loi_list_lock);
+		cli->cl_avail_grant = 0;
+		cli->cl_lost_grant = 0;
+		client_obd_list_unlock(&cli->cl_loi_list_lock);
+		break;
+	}
+	case IMP_EVENT_INACTIVE: {
+		rc = obd_notify_observer(obd, obd, OBD_NOTIFY_INACTIVE, NULL);
+		break;
+	}
+	case IMP_EVENT_INVALIDATE: {
+		struct ldlm_namespace *ns = obd->obd_namespace;
+		struct lu_env	 *env;
+		int		    refcheck;
+
+		env = cl_env_get(&refcheck);
+		if (!IS_ERR(env)) {
+			/* Reset grants */
+			cli = &obd->u.cli;
+			/* all pages go to failing rpcs due to the invalid
+			 * import */
+			osc_io_unplug(env, cli, NULL, PDL_POLICY_ROUND);
+
+			ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY);
+			cl_env_put(env, &refcheck);
+		} else
+			rc = PTR_ERR(env);
+		break;
+	}
+	case IMP_EVENT_ACTIVE: {
+		rc = obd_notify_observer(obd, obd, OBD_NOTIFY_ACTIVE, NULL);
+		break;
+	}
+	case IMP_EVENT_OCD: {
+		struct obd_connect_data *ocd = &imp->imp_connect_data;
+
+		if (ocd->ocd_connect_flags & OBD_CONNECT_GRANT)
+			osc_init_grant(&obd->u.cli, ocd);
+
+		/* See bug 7198 */
+		if (ocd->ocd_connect_flags & OBD_CONNECT_REQPORTAL)
+			imp->imp_client->cli_request_portal =OST_REQUEST_PORTAL;
+
+		rc = obd_notify_observer(obd, obd, OBD_NOTIFY_OCD, NULL);
+		break;
+	}
+	case IMP_EVENT_DEACTIVATE: {
+		rc = obd_notify_observer(obd, obd, OBD_NOTIFY_DEACTIVATE, NULL);
+		break;
+	}
+	case IMP_EVENT_ACTIVATE: {
+		rc = obd_notify_observer(obd, obd, OBD_NOTIFY_ACTIVATE, NULL);
+		break;
+	}
+	default:
+		CERROR("Unknown import event %d\n", event);
+		LBUG();
+	}
+	RETURN(rc);
+}
+
+/**
+ * Determine whether the lock can be canceled before replaying the lock
+ * during recovery, see bug16774 for detailed information.
+ *
+ * osc_setup() registers this function with ns_register_cancel().  The
+ * resource of \a lock must be locked by the caller; check_res_locked()
+ * verifies it.
+ *
+ * \param lock lock to examine
+ *
+ * \retval zero the lock can't be canceled
+ * \retval other ok to cancel
+ */
+static int osc_cancel_for_recovery(struct ldlm_lock *lock)
+{
+	check_res_locked(lock->l_resource);
+
+	/*
+	 * Cancel all unused extent lock in granted mode LCK_PR or LCK_CR.
+	 *
+	 * XXX as a future improvement, we can also cancel unused write lock
+	 * if it doesn't have dirty data and active mmaps.
+	 */
+	if (lock->l_resource->lr_type == LDLM_EXTENT &&
+	    (lock->l_granted_mode == LCK_PR ||
+	     lock->l_granted_mode == LCK_CR) &&
+	    (osc_dlm_lock_pageref(lock) == 0))
+		RETURN(1);
+
+	RETURN(0);
+}
+/**
+ * Writeback work callback; osc_setup() hands it to ptlrpcd_alloc_work().
+ * It unplugs the I/O queued on the client obd passed in \a data.
+ *
+ * \param env  environment passed on to osc_io_unplug()
+ * \param data the struct client_obd to unplug
+ *
+ * \retval 0 always
+ */
+static int brw_queue_work(const struct lu_env *env, void *data)
+{
+	struct client_obd *cli = data;
+
+	CDEBUG(D_CACHE, "Run writeback work for client obd %p.\n", cli);
+
+	osc_io_unplug(env, cli, NULL, PDL_POLICY_SAME);
+	RETURN(0);
+}
+
+/**
+ * Set up an OSC device.
+ *
+ * Takes a ptlrpcd reference, runs the generic client setup, allocates the
+ * writeback work item, sets up the quota cache, registers the procfs
+ * entries, creates the request pool of the import and registers
+ * osc_cancel_for_recovery() with the namespace.
+ *
+ * \param obd  device being set up
+ * \param lcfg configuration record passed on to client_obd_setup()
+ *
+ * \retval 0     on success
+ * \retval other error from ptlrpcd_addref(), client_obd_setup(),
+ *		  ptlrpcd_alloc_work() or osc_quota_setup(); the steps that
+ *		  had succeeded before the failure are undone
+ */
+int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
+{
+	struct lprocfs_static_vars lvars = { 0 };
+	struct client_obd	  *cli = &obd->u.cli;
+	void		       *handler;
+	int			rc;
+	ENTRY;
+
+	rc = ptlrpcd_addref();
+	if (rc)
+		RETURN(rc);
+
+	rc = client_obd_setup(obd, lcfg);
+	if (rc)
+		GOTO(out_ptlrpcd, rc);
+
+	handler = ptlrpcd_alloc_work(cli->cl_import, brw_queue_work, cli);
+	if (IS_ERR(handler))
+		GOTO(out_client_setup, rc = PTR_ERR(handler));
+	cli->cl_writeback_work = handler;
+
+	rc = osc_quota_setup(obd);
+	if (rc)
+		GOTO(out_ptlrpcd_work, rc);
+
+	cli->cl_grant_shrink_interval = GRANT_SHRINK_INTERVAL;
+	lprocfs_osc_init_vars(&lvars);
+	/* a procfs failure is not fatal: the device is set up without the
+	 * extra entries */
+	if (lprocfs_obd_setup(obd, lvars.obd_vars) == 0) {
+		lproc_osc_attach_seqstat(obd);
+		sptlrpc_lprocfs_cliobd_attach(obd);
+		ptlrpc_lprocfs_register_obd(obd);
+	}
+
+	/* We need to allocate a few requests more, because
+	 * brw_interpret tries to create new requests before freeing
+	 * previous ones, Ideally we want to have 2x max_rpcs_in_flight
+	 * reserved, but I'm afraid that might be too much wasted RAM
+	 * in fact, so 2 is just my guess and still should work. */
+	cli->cl_import->imp_rq_pool =
+		ptlrpc_init_rq_pool(cli->cl_max_rpcs_in_flight + 2,
+				    OST_MAXREQSIZE,
+				    ptlrpc_add_rqs_to_pool);
+
+	INIT_LIST_HEAD(&cli->cl_grant_shrink_list);
+	ns_register_cancel(obd->obd_namespace, osc_cancel_for_recovery);
+	RETURN(rc);
+
+out_ptlrpcd_work:
+	ptlrpcd_destroy_work(handler);
+	/* do not leave a pointer to the destroyed work item behind:
+	 * osc_precleanup() destroys whatever cl_writeback_work points to */
+	cli->cl_writeback_work = NULL;
+out_client_setup:
+	client_obd_cleanup(obd);
+out_ptlrpcd:
+	ptlrpcd_decref();
+	RETURN(rc);
+}
+/**
+ * Pre-cleanup hook of the OSC, called once per cleanup stage.
+ *
+ *  - OBD_CLEANUP_EARLY: the import is deactivated and marked as not
+ *    pingable;
+ *  - OBD_CLEANUP_EXPORTS: after waiting on the zombie barrier, the writeback
+ *    work item is destroyed, the client import and the procfs entries are
+ *    cleaned up and the llog subsystem is finished.
+ *
+ * Other stages are ignored.
+ *
+ * \param obd   device being cleaned up
+ * \param stage cleanup stage
+ *
+ * \retval 0     on success, and for stages that are not handled
+ * \retval other error from obd_llog_finish()
+ */
+static int osc_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage)
+{
+	int rc = 0;
+	ENTRY;
+
+	switch (stage) {
+	case OBD_CLEANUP_EARLY: {
+		struct obd_import *imp;
+		imp = obd->u.cli.cl_import;
+		CDEBUG(D_HA, "Deactivating import %s\n", obd->obd_name);
+		/* ptlrpc_abort_inflight to stop an mds_lov_synchronize */
+		ptlrpc_deactivate_import(imp);
+		spin_lock(&imp->imp_lock);
+		imp->imp_pingable = 0;
+		spin_unlock(&imp->imp_lock);
+		break;
+	}
+	case OBD_CLEANUP_EXPORTS: {
+		struct client_obd *cli = &obd->u.cli;
+		/* LU-464
+		 * for echo client, export may be on zombie list, wait for
+		 * zombie thread to cull it, because cli.cl_import will be
+		 * cleared in client_disconnect_export():
+		 *   class_export_destroy() -> obd_cleanup() ->
+		 *   echo_device_free() -> echo_client_cleanup() ->
+		 *   obd_disconnect() -> osc_disconnect() ->
+		 *   client_disconnect_export()
+		 */
+		obd_zombie_barrier();
+		if (cli->cl_writeback_work) {
+			ptlrpcd_destroy_work(cli->cl_writeback_work);
+			cli->cl_writeback_work = NULL;
+		}
+		obd_cleanup_client_import(obd);
+		ptlrpc_lprocfs_unregister_obd(obd);
+		lprocfs_obd_cleanup(obd);
+		rc = obd_llog_finish(obd, 0);
+		if (rc != 0)
+			CERROR("failed to cleanup llogging subsystems\n");
+		break;
+		}
+	}
+	RETURN(rc);
+}
+/**
+ * Final cleanup of an OSC device.
+ *
+ * Detaches the device from the shared client cache, if one was attached,
+ * frees the quota cache, runs the generic client cleanup and drops the
+ * ptlrpcd reference taken in osc_setup().
+ *
+ * \param obd device being cleaned up
+ *
+ * \retval the return value of client_obd_cleanup()
+ */
+int osc_cleanup(struct obd_device *obd)
+{
+	struct client_obd *cli = &obd->u.cli;
+	int rc;
+
+	ENTRY;
+
+	/* lru cleanup */
+	if (cli->cl_cache != NULL) {
+		LASSERT(atomic_read(&cli->cl_cache->ccc_users) > 0);
+		spin_lock(&cli->cl_cache->ccc_lru_lock);
+		list_del_init(&cli->cl_lru_osc);
+		spin_unlock(&cli->cl_cache->ccc_lru_lock);
+		cli->cl_lru_left = NULL;
+		atomic_dec(&cli->cl_cache->ccc_users);
+		cli->cl_cache = NULL;
+	}
+
+	/* free memory of osc quota cache */
+	osc_quota_cleanup(obd);
+
+	rc = client_obd_cleanup(obd);
+
+	ptlrpcd_decref();
+	RETURN(rc);
+}
+/**
+ * Apply a configuration record to an OSC device.  Every command is passed
+ * to class_process_proc_param() with the OSC procfs variables.
+ *
+ * \param obd  device the record applies to
+ * \param lcfg configuration record
+ *
+ * \retval 0        on success; positive results of
+ *		     class_process_proc_param() are mapped to 0
+ * \retval negative error from class_process_proc_param()
+ */
+int osc_process_config_base(struct obd_device *obd, struct lustre_cfg *lcfg)
+{
+	struct lprocfs_static_vars lvars = { 0 };
+	int rc = 0;
+
+	lprocfs_osc_init_vars(&lvars);
+
+	switch (lcfg->lcfg_command) {
+	default:
+		rc = class_process_proc_param(PARAM_OSC, lvars.obd_vars,
+					      lcfg, obd);
+		if (rc > 0)
+			rc = 0;
+		break;
+	}
+
+	return(rc);
+}
+/* o_process_config method: forwards \a buf to osc_process_config_base() as
+ * the configuration record; \a len is not used. */
+static int osc_process_config(struct obd_device *obd, obd_count len, void *buf)
+{
+	return osc_process_config_base(obd, buf);
+}
+/* Method table of the OSC obd type; osc_init() passes it to
+ * class_register_type(). */
+struct obd_ops osc_obd_ops = {
+	.o_owner		= THIS_MODULE,
+	.o_setup		= osc_setup,
+	.o_precleanup	   = osc_precleanup,
+	.o_cleanup	      = osc_cleanup,
+	.o_add_conn	     = client_import_add_conn,
+	.o_del_conn	     = client_import_del_conn,
+	.o_connect	      = client_connect_import,
+	.o_reconnect	    = osc_reconnect,
+	.o_disconnect	   = osc_disconnect,
+	.o_statfs	       = osc_statfs,
+	.o_statfs_async	 = osc_statfs_async,
+	.o_packmd	       = osc_packmd,
+	.o_unpackmd	     = osc_unpackmd,
+	.o_create	       = osc_create,
+	.o_destroy	      = osc_destroy,
+	.o_getattr	      = osc_getattr,
+	.o_getattr_async	= osc_getattr_async,
+	.o_setattr	      = osc_setattr,
+	.o_setattr_async	= osc_setattr_async,
+	.o_brw		  = osc_brw,
+	.o_punch		= osc_punch,
+	.o_sync		 = osc_sync,
+	.o_enqueue	      = osc_enqueue,
+	.o_change_cbdata	= osc_change_cbdata,
+	.o_find_cbdata	  = osc_find_cbdata,
+	.o_cancel	       = osc_cancel,
+	.o_cancel_unused	= osc_cancel_unused,
+	.o_iocontrol	    = osc_iocontrol,
+	.o_get_info	     = osc_get_info,
+	.o_set_info_async       = osc_set_info_async,
+	.o_import_event	 = osc_import_event,
+	.o_llog_init	    = osc_llog_init,
+	.o_llog_finish	  = osc_llog_finish,
+	.o_process_config       = osc_process_config,
+	.o_quotactl	     = osc_quotactl,
+	.o_quotacheck	   = osc_quotacheck,
+};
+
+extern struct lu_kmem_descr osc_caches[];
+extern spinlock_t osc_ast_guard;
+extern struct lock_class_key osc_ast_guard_class;
+
+/**
+ * Module initialisation: create the OSC kmem caches, register the OSC obd
+ * type and initialise osc_ast_guard.
+ *
+ * \retval 0     on success
+ * \retval other error from lu_kmem_init() or class_register_type(); the
+ *		  caches are released again if the registration fails
+ */
+int __init osc_init(void)
+{
+	struct lprocfs_static_vars lvars = { 0 };
+	int rc;
+	ENTRY;
+
+	/* print an address of _any_ initialized kernel symbol from this
+	 * module, to allow debugging with gdb that doesn't support data
+	 * symbols from modules.*/
+	CDEBUG(D_INFO, "Lustre OSC module (%p).\n", &osc_caches);
+
+	/* stop here if the caches cannot be created; the result used to be
+	 * overwritten by the registration below without being looked at */
+	rc = lu_kmem_init(osc_caches);
+	if (rc)
+		RETURN(rc);
+
+	lprocfs_osc_init_vars(&lvars);
+
+	rc = class_register_type(&osc_obd_ops, NULL, lvars.module_vars,
+				 LUSTRE_OSC_NAME, &osc_device_type);
+	if (rc) {
+		lu_kmem_fini(osc_caches);
+		RETURN(rc);
+	}
+
+	spin_lock_init(&osc_ast_guard);
+	lockdep_set_class(&osc_ast_guard, &osc_ast_guard_class);
+
+	RETURN(rc);
+}
+/* Module exit: unregister the OSC obd type, then call lu_kmem_fini() on
+ * osc_caches, i.e. the reverse order of osc_init(). */
+static void /*__exit*/ osc_exit(void)
+{
+	class_unregister_type(LUSTRE_OSC_NAME);
+	lu_kmem_fini(osc_caches);
+}
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Lustre Object Storage Client (OSC)");
+MODULE_LICENSE("GPL");
+
+cfs_module(osc, LUSTRE_VERSION_STRING, osc_init, osc_exit);

diff --git a/drivers/staging/lustre/lustre/ptlrpc/Makefile b/drivers/staging/lustre/lustre/ptlrpc/Makefile
new file mode 100644
index 0000000..983eb66
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/Makefile

@@ -0,0 +1,23 @@
+obj-$(CONFIG_LUSTRE_FS) += ptlrpc.o
+LDLM := ../../lustre/ldlm/
+
+ldlm_objs := $(LDLM)l_lock.o $(LDLM)ldlm_lock.o
+ldlm_objs += $(LDLM)ldlm_resource.o $(LDLM)ldlm_lib.o
+ldlm_objs += $(LDLM)ldlm_plain.o $(LDLM)ldlm_extent.o
+ldlm_objs += $(LDLM)ldlm_request.o $(LDLM)ldlm_lockd.o
+ldlm_objs += $(LDLM)ldlm_flock.o $(LDLM)ldlm_inodebits.o
+ldlm_objs += $(LDLM)ldlm_pool.o
+ldlm_objs += $(LDLM)interval_tree.o
+ptlrpc_objs := client.o recover.o connection.o niobuf.o pack_generic.o
+ptlrpc_objs += events.o ptlrpc_module.o service.o pinger.o
+ptlrpc_objs += llog_net.o llog_client.o llog_server.o import.o ptlrpcd.o
+ptlrpc_objs += pers.o lproc_ptlrpc.o wiretest.o layout.o
+ptlrpc_objs += sec.o sec_bulk.o sec_gc.o sec_config.o sec_lproc.o
+ptlrpc_objs += sec_null.o sec_plain.o nrs.o nrs_fifo.o
+
+ptlrpc-y := $(ldlm_objs) $(ptlrpc_objs)
+
+obj-$(CONFIG_PTLRPC_GSS) += gss/
+
+
+ccflags-y := -I$(src)/../include

diff --git a/drivers/staging/lustre/lustre/ptlrpc/client.c b/drivers/staging/lustre/lustre/ptlrpc/client.c
new file mode 100644
index 0000000..22f7e65
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/client.c

@@ -0,0 +1,3059 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+/** Implementation of client-side PortalRPC interfaces */
+
+#define DEBUG_SUBSYSTEM S_RPC
+
+#include <obd_support.h>
+#include <obd_class.h>
+#include <lustre_lib.h>
+#include <lustre_ha.h>
+#include <lustre_import.h>
+#include <lustre_req_layout.h>
+
+#include "ptlrpc_internal.h"
+
+static int ptlrpc_send_new_req(struct ptlrpc_request *req);
+
+/**
+ * Initialize passed in client structure \a cl.
+ *
+ * \param req_portal stored as the request portal of \a cl
+ * \param rep_portal stored as the reply portal of \a cl
+ * \param name       stored as the name of \a cl; the pointer is kept, the
+ *		     string is not copied
+ * \param cl	     client structure to fill in
+ */
+void ptlrpc_init_client(int req_portal, int rep_portal, char *name,
+			struct ptlrpc_client *cl)
+{
+	cl->cli_request_portal = req_portal;
+	cl->cli_reply_portal   = rep_portal;
+	cl->cli_name	   = name;
+}
+EXPORT_SYMBOL(ptlrpc_init_client);
+
+/**
+ * Return PortalRPC connection for remote uuid \a uuid
+ *
+ * The uuid is resolved to a peer and a local nid with ptlrpc_uuid_to_peer()
+ * and the connection is obtained from ptlrpc_connection_get().  When a
+ * connection is returned, \a uuid is copied into its c_remote_uuid.
+ *
+ * \param uuid uuid of the remote target
+ *
+ * \retval the connection, or NULL if the peer cannot be found or
+ *	   ptlrpc_connection_get() returns NULL
+ */
+struct ptlrpc_connection *ptlrpc_uuid_to_connection(struct obd_uuid *uuid)
+{
+	struct ptlrpc_connection *c;
+	lnet_nid_t		self;
+	lnet_process_id_t	 peer;
+	int		       err;
+
+	/* ptlrpc_uuid_to_peer() initializes its 2nd parameter
+	 * before accessing its values. */
+	/* coverity[uninit_use_in_call] */
+	err = ptlrpc_uuid_to_peer(uuid, &peer, &self);
+	if (err != 0) {
+		CNETERR("cannot find peer %s!\n", uuid->uuid);
+		return NULL;
+	}
+
+	c = ptlrpc_connection_get(peer, self, uuid);
+	if (c) {
+		memcpy(c->c_remote_uuid.uuid,
+		       uuid->uuid, sizeof(c->c_remote_uuid.uuid));
+	}
+
+	CDEBUG(D_INFO, "%s -> %p\n", uuid->uuid, c);
+
+	return c;
+}
+EXPORT_SYMBOL(ptlrpc_uuid_to_connection);
+
+/**
+ * Allocate and initialize new bulk descriptor on the sender.
+ * Returns pointer to the descriptor or NULL on error.
+ *
+ * \param npages  number of bd_iov entries the descriptor has room for
+ * \param max_brw must be > 0; the descriptor's bd_md_max_brw is the smaller
+ *		  of this value and PTLRPC_BULK_OPS_COUNT
+ * \param type    stored as bd_type
+ * \param portal  stored as bd_portal
+ */
+struct ptlrpc_bulk_desc *ptlrpc_new_bulk(unsigned npages, unsigned max_brw,
+					 unsigned type, unsigned portal)
+{
+	struct ptlrpc_bulk_desc *desc;
+	int i;
+
+	OBD_ALLOC(desc, offsetof(struct ptlrpc_bulk_desc, bd_iov[npages]));
+	if (!desc)
+		return NULL;
+
+	spin_lock_init(&desc->bd_lock);
+	init_waitqueue_head(&desc->bd_waitq);
+	desc->bd_max_iov = npages;
+	desc->bd_iov_count = 0;
+	desc->bd_portal = portal;
+	desc->bd_type = type;
+	desc->bd_md_count = 0;
+	LASSERT(max_brw > 0);
+	desc->bd_md_max_brw = min(max_brw, PTLRPC_BULK_OPS_COUNT);
+	/* PTLRPC_BULK_OPS_COUNT is the compile-time transfer limit for this
+	 * node. Negotiated ocd_brw_size will always be <= this number. */
+	for (i = 0; i < PTLRPC_BULK_OPS_COUNT; i++)
+		LNetInvalidateHandle(&desc->bd_mds[i]);
+
+	return desc;
+}
+
+/**
+ * Prepare bulk descriptor for specified outgoing request \a req that
+ * can fit \a npages * pages. \a type is bulk type. \a portal is where
+ * the bulk to be sent. Used on client-side.
+ * Returns pointer to newly allocated initialized bulk descriptor or NULL on
+ * error.
+ *
+ * \a type must be BULK_PUT_SINK or BULK_GET_SOURCE.  The descriptor takes a
+ * reference on the import of \a req, uses client_bulk_callback as its
+ * callback and is attached to \a req as rq_bulk.
+ */
+struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_imp(struct ptlrpc_request *req,
+					      unsigned npages, unsigned max_brw,
+					      unsigned type, unsigned portal)
+{
+	struct obd_import *imp = req->rq_import;
+	struct ptlrpc_bulk_desc *desc;
+
+	ENTRY;
+	LASSERT(type == BULK_PUT_SINK || type == BULK_GET_SOURCE);
+	desc = ptlrpc_new_bulk(npages, max_brw, type, portal);
+	if (desc == NULL)
+		RETURN(NULL);
+
+	desc->bd_import_generation = req->rq_import_generation;
+	desc->bd_import = class_import_get(imp);
+	desc->bd_req = req;
+
+	desc->bd_cbid.cbid_fn  = client_bulk_callback;
+	desc->bd_cbid.cbid_arg = desc;
+
+	/* This makes req own desc, and free it when she frees herself */
+	req->rq_bulk = desc;
+
+	return desc;
+}
+EXPORT_SYMBOL(ptlrpc_prep_bulk_imp);
+
+/**
+ * Add a page \a page to the bulk descriptor \a desc.
+ * Data to transfer in the page starts at offset \a pageoffset and
+ * amount of data to transfer from the page is \a len
+ *
+ * The descriptor must still have a free bd_iov slot and the range must lie
+ * within one page: \a pageoffset >= 0, \a len > 0 and
+ * \a pageoffset + \a len <= PAGE_CACHE_SIZE.  bd_nob grows by \a len.
+ *
+ * \param pin when non-zero, page_cache_get() is called on \a page before it
+ *	      is added
+ */
+void __ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc,
+			     struct page *page, int pageoffset, int len, int pin)
+{
+	LASSERT(desc->bd_iov_count < desc->bd_max_iov);
+	LASSERT(page != NULL);
+	LASSERT(pageoffset >= 0);
+	LASSERT(len > 0);
+	LASSERT(pageoffset + len <= PAGE_CACHE_SIZE);
+
+	desc->bd_nob += len;
+
+	if (pin)
+		page_cache_get(page);
+
+	ptlrpc_add_bulk_page(desc, page, pageoffset, len);
+}
+EXPORT_SYMBOL(__ptlrpc_prep_bulk_page);
+
+/**
+ * Uninitialize and free bulk descriptor \a desc.
+ * Works on bulk descriptors both from server and client side.
+ *
+ * The descriptor must not have been freed already, must have no
+ * outstanding memory descriptors (bd_md_count == 0) and must have exactly
+ * one of bd_export and bd_import set; the reference on that one is dropped.
+ *
+ * \param unpin when non-zero, page_cache_release() is called on every page
+ *		in bd_iov before the descriptor is freed
+ */
+void __ptlrpc_free_bulk(struct ptlrpc_bulk_desc *desc, int unpin)
+{
+	int i;
+	ENTRY;
+
+	LASSERT(desc != NULL);
+	LASSERT(desc->bd_iov_count != LI_POISON); /* not freed already */
+	LASSERT(desc->bd_md_count == 0);	 /* network hands off */
+	LASSERT((desc->bd_export != NULL) ^ (desc->bd_import != NULL));
+
+	sptlrpc_enc_pool_put_pages(desc);
+
+	if (desc->bd_export)
+		class_export_put(desc->bd_export);
+	else
+		class_import_put(desc->bd_import);
+
+	if (unpin) {
+		for (i = 0; i < desc->bd_iov_count ; i++)
+			page_cache_release(desc->bd_iov[i].kiov_page);
+	}
+
+	OBD_FREE(desc, offsetof(struct ptlrpc_bulk_desc,
+				bd_iov[desc->bd_max_iov]));
+	EXIT;
+}
+EXPORT_SYMBOL(__ptlrpc_free_bulk);
+
+/**
+ * Set server timelimit for this req, i.e. how long are we willing to wait
+ * for reply before timing out this request.
+ *
+ * With AT_OFF the timeout is derived from obd_timeout; otherwise it is
+ * derived from the service estimate kept in the import for the request
+ * portal of \a req.  The result is stored in rq_timeout and written into
+ * the request message.  \a req must have an import.
+ */
+void ptlrpc_at_set_req_timeout(struct ptlrpc_request *req)
+{
+	__u32 serv_est;
+	int idx;
+	struct imp_at *at;
+
+	LASSERT(req->rq_import);
+
+	if (AT_OFF) {
+		/* non-AT settings */
+		/**
+		 * \a imp_server_timeout means this is reverse import and
+		 * we send (currently only) ASTs to the client and cannot afford
+		 * to wait too long for the reply, otherwise the other client
+		 * (because of which we are sending this request) would
+		 * timeout waiting for us
+		 */
+		req->rq_timeout = req->rq_import->imp_server_timeout ?
+				  obd_timeout / 2 : obd_timeout;
+	} else {
+		at = &req->rq_import->imp_at;
+		idx = import_at_get_index(req->rq_import,
+					  req->rq_request_portal);
+		serv_est = at_get(&at->iat_service_estimate[idx]);
+		req->rq_timeout = at_est2timeout(serv_est);
+	}
+	/* We could get even fancier here, using history to predict increased
+	   loading... */
+
+	/* Let the server know what this RPC timeout is by putting it in the
+	   reqmsg*/
+	lustre_msg_set_timeout(req->rq_reqmsg, req->rq_timeout);
+}
+EXPORT_SYMBOL(ptlrpc_at_set_req_timeout);
+
+/* Adjust max service estimate based on server value
+ *
+ * \a serv_est is fed into the service estimate kept in the import of \a req
+ * for the request portal of \a req.  A debug message is logged when
+ * at_measured() returns a non-zero previous value.  \a req must have an
+ * import. */
+static void ptlrpc_at_adj_service(struct ptlrpc_request *req,
+				  unsigned int serv_est)
+{
+	int idx;
+	unsigned int oldse;
+	struct imp_at *at;
+
+	LASSERT(req->rq_import);
+	at = &req->rq_import->imp_at;
+
+	idx = import_at_get_index(req->rq_import, req->rq_request_portal);
+	/* max service estimates are tracked on the server side,
+	   so just keep minimal history here */
+	oldse = at_measured(&at->iat_service_estimate[idx], serv_est);
+	if (oldse != 0)
+		CDEBUG(D_ADAPTTO, "The RPC service estimate for %s ptl %d "
+		       "has changed from %d to %d\n",
+		       req->rq_import->imp_obd->obd_name,req->rq_request_portal,
+		       oldse, at_get(&at->iat_service_estimate[idx]));
+}
+
+/* Expected network latency per remote node (secs)
+ *
+ * Returns 0 when AT_OFF is set, otherwise the current value of the net
+ * latency estimate kept in the import of \a req. */
+int ptlrpc_at_get_net_latency(struct ptlrpc_request *req)
+{
+	return AT_OFF ? 0 : at_get(&req->rq_import->imp_at.iat_net_latency);
+}
+
+/* Adjust expected network latency
+ *
+ * The sample is the time elapsed since rq_sent minus \a service_time,
+ * clamped at zero, plus one.  A warning is logged when \a service_time
+ * exceeds the elapsed time by more than 3, and a debug message when
+ * at_measured() returns a non-zero previous value.  \a req must have an
+ * import. */
+static void ptlrpc_at_adj_net_latency(struct ptlrpc_request *req,
+				      unsigned int service_time)
+{
+	unsigned int nl, oldnl;
+	struct imp_at *at;
+	time_t now = cfs_time_current_sec();
+
+	LASSERT(req->rq_import);
+	at = &req->rq_import->imp_at;
+
+	/* Network latency is total time less server processing time */
+	nl = max_t(int, now - req->rq_sent - service_time, 0) +1/*st rounding*/;
+	if (service_time > now - req->rq_sent + 3 /* bz16408 */)
+		CWARN("Reported service time %u > total measured time "
+		      CFS_DURATION_T"\n", service_time,
+		      cfs_time_sub(now, req->rq_sent));
+
+	oldnl = at_measured(&at->iat_net_latency, nl);
+	if (oldnl != 0)
+		CDEBUG(D_ADAPTTO, "The network latency for %s (nid %s) "
+		       "has changed from %d to %d\n",
+		       req->rq_import->imp_obd->obd_name,
+		       obd_uuid2str(
+			       &req->rq_import->imp_connection->c_remote_uuid),
+		       oldnl, at_get(&at->iat_net_latency));
+}
+/**
+ * Unpack the reply of \a req.
+ *
+ * The reply message itself is unpacked only when the RPC flavor of \a req
+ * does not use the null security policy; the ptlrpc body is unpacked in
+ * either case.
+ *
+ * \retval 0	    on success
+ * \retval -EPROTO either unpacking step failed
+ */
+static int unpack_reply(struct ptlrpc_request *req)
+{
+	int rc;
+
+	if (SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) != SPTLRPC_POLICY_NULL) {
+		rc = ptlrpc_unpack_rep_msg(req, req->rq_replen);
+		if (rc) {
+			DEBUG_REQ(D_ERROR, req, "unpack_rep failed: %d", rc);
+			return(-EPROTO);
+		}
+	}
+
+	rc = lustre_unpack_rep_ptlrpc_body(req, MSG_PTLRPC_BODY_OFF);
+	if (rc) {
+		DEBUG_REQ(D_ERROR, req, "unpack ptlrpc body failed: %d", rc);
+		return(-EPROTO);
+	}
+	return 0;
+}
+
+/**
+ * Handle an early reply message, called with the rq_lock held.
+ * If anything goes wrong just ignore it - same as if it never happened
+ *
+ * rq_lock is dropped while the early reply is unwrapped and unpacked and is
+ * taken again before returning, on every path.  On success the service
+ * estimate and the net latency are updated from the early reply, the
+ * request timeout is recomputed and rq_deadline is moved to the current
+ * time plus rq_timeout plus the expected net latency.
+ *
+ * \retval 0     the early reply was processed
+ * \retval other error from unwrapping or unpacking the early reply
+ */
+static int ptlrpc_at_recv_early_reply(struct ptlrpc_request *req)
+{
+	struct ptlrpc_request *early_req;
+	time_t		 olddl;
+	int		    rc;
+	ENTRY;
+
+	req->rq_early = 0;
+	spin_unlock(&req->rq_lock);
+
+	rc = sptlrpc_cli_unwrap_early_reply(req, &early_req);
+	if (rc) {
+		spin_lock(&req->rq_lock);
+		RETURN(rc);
+	}
+
+	rc = unpack_reply(early_req);
+	if (rc == 0) {
+		/* Expecting to increase the service time estimate here */
+		ptlrpc_at_adj_service(req,
+			lustre_msg_get_timeout(early_req->rq_repmsg));
+		ptlrpc_at_adj_net_latency(req,
+			lustre_msg_get_service_time(early_req->rq_repmsg));
+	}
+
+	sptlrpc_cli_finish_early_reply(early_req);
+
+	if (rc != 0) {
+		spin_lock(&req->rq_lock);
+		RETURN(rc);
+	}
+
+	/* Adjust the local timeout for this req */
+	ptlrpc_at_set_req_timeout(req);
+
+	spin_lock(&req->rq_lock);
+	olddl = req->rq_deadline;
+	/* server assumes it now has rq_timeout from when it sent the
+	 * early reply, so client should give it at least that long. */
+	req->rq_deadline = cfs_time_current_sec() + req->rq_timeout +
+			   ptlrpc_at_get_net_latency(req);
+
+	DEBUG_REQ(D_ADAPTTO, req,
+		  "Early reply #%d, new deadline in "CFS_DURATION_T"s "
+		  "("CFS_DURATION_T"s)", req->rq_early_count,
+		  cfs_time_sub(req->rq_deadline, cfs_time_current_sec()),
+		  cfs_time_sub(req->rq_deadline, olddl));
+
+	RETURN(rc);
+}
+
+/**
+ * Tear down request pool \a pool.
+ * Every request still sitting on the pool list is released together with
+ * its request buffer, then the pool descriptor itself is freed.
+ */
+void ptlrpc_free_rq_pool(struct ptlrpc_request_pool *pool)
+{
+	struct ptlrpc_request *req;
+
+	LASSERT(pool != NULL);
+
+	spin_lock(&pool->prp_lock);
+	while (!list_empty(&pool->prp_req_list)) {
+		/* always consume the current head of the list */
+		req = list_entry(pool->prp_req_list.next,
+				 struct ptlrpc_request, rq_list);
+		list_del(&req->rq_list);
+		LASSERT(req->rq_reqbuf);
+		LASSERT(req->rq_reqbuf_len == pool->prp_rq_size);
+		OBD_FREE_LARGE(req->rq_reqbuf, pool->prp_rq_size);
+		OBD_FREE(req, sizeof(*req));
+	}
+	spin_unlock(&pool->prp_lock);
+
+	OBD_FREE(pool, sizeof(*pool));
+}
+EXPORT_SYMBOL(ptlrpc_free_rq_pool);
+
+/**
+ * Allocate, initialize and append up to \a num_rq requests to pool \a pool.
+ *
+ * The per-request buffer size is prp_rq_size rounded up to a power of two;
+ * the rounded value is stored back into the pool. The function stops
+ * quietly at the first allocation failure, keeping whatever was added
+ * before it.
+ */
+void ptlrpc_add_rqs_to_pool(struct ptlrpc_request_pool *pool, int num_rq)
+{
+	int size;
+	int i;
+
+	for (size = 1; size < pool->prp_rq_size; size <<= 1)
+		; /* find the smallest power of two >= prp_rq_size */
+
+	LASSERTF(list_empty(&pool->prp_req_list) ||
+		 size == pool->prp_rq_size,
+		 "Trying to change pool size with nonempty pool "
+		 "from %d to %d bytes\n", pool->prp_rq_size, size);
+
+	spin_lock(&pool->prp_lock);
+	pool->prp_rq_size = size;
+	spin_unlock(&pool->prp_lock);
+
+	for (i = 0; i < num_rq; i++) {
+		struct ptlrpc_request *req;
+		struct lustre_msg *msg;
+
+		/* prp_lock is not held across the allocations */
+		OBD_ALLOC(req, sizeof(struct ptlrpc_request));
+		if (req == NULL)
+			return;
+
+		OBD_ALLOC_LARGE(msg, size);
+		if (msg == NULL) {
+			OBD_FREE(req, sizeof(struct ptlrpc_request));
+			return;
+		}
+
+		req->rq_pool = pool;
+		req->rq_reqbuf = msg;
+		req->rq_reqbuf_len = size;
+
+		spin_lock(&pool->prp_lock);
+		list_add_tail(&req->rq_list, &pool->prp_req_list);
+		spin_unlock(&pool->prp_lock);
+	}
+}
+EXPORT_SYMBOL(ptlrpc_add_rqs_to_pool);
+
+/**
+ * Create and initialize a new request pool:
+ * \a num_rq - initial number of requests to create for the pool
+ * \a msgsize - maximum message size possible for requests in this pool;
+ *		SPTLRPC_MAX_PAYLOAD is added on top of it
+ * \a populate_pool - function called here to fill the pool, and stored in
+ *		    the pool for later use
+ * Returns pointer to newly created pool, or NULL if the pool could not be
+ * allocated or \a populate_pool did not add a single request.
+ */
+struct ptlrpc_request_pool *
+ptlrpc_init_rq_pool(int num_rq, int msgsize,
+		    void (*populate_pool)(struct ptlrpc_request_pool *, int))
+{
+	struct ptlrpc_request_pool *pool;
+
+	OBD_ALLOC(pool, sizeof(*pool));
+	if (pool == NULL)
+		return NULL;
+
+	INIT_LIST_HEAD(&pool->prp_req_list);
+	spin_lock_init(&pool->prp_lock);
+	pool->prp_populate = populate_pool;
+	pool->prp_rq_size = msgsize + SPTLRPC_MAX_PAYLOAD;
+
+	populate_pool(pool, num_rq);
+
+	if (!list_empty(&pool->prp_req_list))
+		return pool;
+
+	/* have not allocated a single request for the pool */
+	OBD_FREE(pool, sizeof(*pool));
+	return NULL;
+}
+EXPORT_SYMBOL(ptlrpc_init_rq_pool);
+
+/**
+ * Fetches one request from pool \a pool.
+ *
+ * The request structure is wiped before it is handed out; only its request
+ * buffer, the buffer length and the pool back-pointer are restored, and
+ * rq_list is left as an empty list head.
+ * Returns the request, or NULL when \a pool is NULL or currently empty.
+ */
+static struct ptlrpc_request *
+ptlrpc_prep_req_from_pool(struct ptlrpc_request_pool *pool)
+{
+	struct ptlrpc_request *request;
+	struct lustre_msg *reqbuf;
+
+	if (!pool)
+		return NULL;
+
+	spin_lock(&pool->prp_lock);
+
+	/* See if we have anything in a pool, and bail out if nothing,
+	 * in writeout path, where this matters, this is safe to do, because
+	 * nothing is lost in this case, and when some in-flight requests
+	 * complete, this code will be called again. */
+	if (unlikely(list_empty(&pool->prp_req_list))) {
+		spin_unlock(&pool->prp_lock);
+		return NULL;
+	}
+
+	request = list_entry(pool->prp_req_list.next, struct ptlrpc_request,
+				 rq_list);
+	list_del_init(&request->rq_list);
+	spin_unlock(&pool->prp_lock);
+
+	LASSERT(request->rq_reqbuf);
+	LASSERT(request->rq_pool);
+
+	reqbuf = request->rq_reqbuf;
+	memset(request, 0, sizeof(*request));
+	request->rq_reqbuf = reqbuf;
+	request->rq_reqbuf_len = pool->prp_rq_size;
+	request->rq_pool = pool;
+	/* The memset above turned rq_list into {NULL, NULL}, which is not
+	 * an empty list head. Re-initialize it so that the request can be
+	 * given back through ptlrpc_request_free() - whose pool path
+	 * asserts list_empty(&request->rq_list) - even if packing fails
+	 * before the request's list heads are set up. */
+	INIT_LIST_HEAD(&request->rq_list);
+
+	return request;
+}
+
+/**
+ * Hand \a request back to the pool recorded in its rq_pool field by
+ * appending it to the pool's request list under prp_lock.
+ * The request must not be linked on any list through rq_list and must not
+ * be receiving a reply.
+ */
+static void __ptlrpc_free_req_to_pool(struct ptlrpc_request *request)
+{
+	struct ptlrpc_request_pool *owner = request->rq_pool;
+
+	spin_lock(&owner->prp_lock);
+
+	LASSERT(list_empty(&request->rq_list));
+	LASSERT(!request->rq_receiving_reply);
+	list_add_tail(&request->rq_list, &owner->prp_req_list);
+
+	spin_unlock(&owner->prp_lock);
+}
+
+/**
+ * Pack \a count buffers (sizes in \a lengths, optional contents in \a bufs)
+ * into \a request and initialize the client-side request state.
+ *
+ * If \a ctx is given a reference on it is taken for the request, otherwise
+ * the security context is looked up with sptlrpc_req_get_ctx(). After the
+ * message is packed, the callbacks, phases, portals, timeout, list heads,
+ * wait queues, xid and the initial reference count are set up.
+ *
+ * Returns 0 on success or the error from the failing step. On failure the
+ * import reference in request->rq_import is dropped (and, if packing
+ * failed, so is the security context); the request itself is not freed
+ * here.
+ */
+static int __ptlrpc_request_bufs_pack(struct ptlrpc_request *request,
+				      __u32 version, int opcode,
+				      int count, __u32 *lengths, char **bufs,
+				      struct ptlrpc_cli_ctx *ctx)
+{
+	struct obd_import  *imp = request->rq_import;
+	int		 rc;
+	ENTRY;
+
+	if (unlikely(ctx))
+		request->rq_cli_ctx = sptlrpc_cli_ctx_get(ctx);
+	else {
+		rc = sptlrpc_req_get_ctx(request);
+		if (rc)
+			GOTO(out_free, rc);
+	}
+
+	sptlrpc_req_set_flavor(request, opcode);
+
+	rc = lustre_pack_request(request, imp->imp_msg_magic, count,
+				 lengths, bufs);
+	if (rc) {
+		/* a pooled request is not expected to fail packing */
+		LASSERT(!request->rq_pool);
+		GOTO(out_ctx, rc);
+	}
+
+	lustre_msg_add_version(request->rq_reqmsg, version);
+	request->rq_send_state = LUSTRE_IMP_FULL;
+	request->rq_type = PTL_RPC_MSG_REQUEST;
+	request->rq_export = NULL;
+
+	request->rq_req_cbid.cbid_fn  = request_out_callback;
+	request->rq_req_cbid.cbid_arg = request;
+
+	request->rq_reply_cbid.cbid_fn  = reply_in_callback;
+	request->rq_reply_cbid.cbid_arg = request;
+
+	request->rq_reply_deadline = 0;
+	request->rq_phase = RQ_PHASE_NEW;
+	request->rq_next_phase = RQ_PHASE_UNDEFINED;
+
+	request->rq_request_portal = imp->imp_client->cli_request_portal;
+	request->rq_reply_portal = imp->imp_client->cli_reply_portal;
+
+	ptlrpc_at_set_req_timeout(request);
+
+	spin_lock_init(&request->rq_lock);
+	INIT_LIST_HEAD(&request->rq_list);
+	INIT_LIST_HEAD(&request->rq_timed_list);
+	INIT_LIST_HEAD(&request->rq_replay_list);
+	INIT_LIST_HEAD(&request->rq_ctx_chain);
+	INIT_LIST_HEAD(&request->rq_set_chain);
+	INIT_LIST_HEAD(&request->rq_history_list);
+	INIT_LIST_HEAD(&request->rq_exp_list);
+	init_waitqueue_head(&request->rq_reply_waitq);
+	init_waitqueue_head(&request->rq_set_waitq);
+	request->rq_xid = ptlrpc_next_xid();
+	atomic_set(&request->rq_refcount, 1);
+
+	lustre_msg_set_opc(request->rq_reqmsg, opcode);
+
+	RETURN(0);
+out_ctx:
+	sptlrpc_cli_ctx_put(request->rq_cli_ctx, 1);
+out_free:
+	class_import_put(imp);
+	/* leave through RETURN() so that the ENTRY above is balanced */
+	RETURN(rc);
+}
+
+/**
+ * Pack \a request using the client-side buffer layout described by its
+ * capsule (rq_pill): the number of filled sizes and the size array are
+ * taken from the capsule and handed to __ptlrpc_request_bufs_pack()
+ * together with \a bufs and \a ctx.
+ * Returns whatever __ptlrpc_request_bufs_pack() returns.
+ */
+int ptlrpc_request_bufs_pack(struct ptlrpc_request *request,
+			     __u32 version, int opcode, char **bufs,
+			     struct ptlrpc_cli_ctx *ctx)
+{
+	struct req_capsule *pill = &request->rq_pill;
+	int nr_bufs;
+
+	nr_bufs = req_capsule_filled_sizes(pill, RCL_CLIENT);
+
+	return __ptlrpc_request_bufs_pack(request, version, opcode, nr_bufs,
+					  pill->rc_area[RCL_CLIENT],
+					  bufs, ctx);
+}
+EXPORT_SYMBOL(ptlrpc_request_bufs_pack);
+
+/**
+ * Pack request buffers for network transfer, performing the encryption
+ * steps where required.
+ * Returns 0 on success or the error from ptlrpc_request_bufs_pack().
+ */
+int ptlrpc_request_pack(struct ptlrpc_request *request,
+			__u32 version, int opcode)
+{
+	int rc = ptlrpc_request_bufs_pack(request, version, opcode, NULL, NULL);
+
+	if (rc != 0)
+		return rc;
+
+	switch (opcode) {
+	case LDLM_BL_CALLBACK:
+	case LDLM_CP_CALLBACK:
+	case LDLM_GL_CALLBACK:
+		/* Some old 1.8 clients (< 1.8.7) LASSERT that the size of
+		 * the ptlrpc_body sent by the server equals their local
+		 * ptlrpc_body size, so the old ptlrpc_body has to be sent
+		 * to stay interoperable with them. These three callbacks
+		 * are the only server->client RPCs so far.
+		 *
+		 * XXX This should be removed whenever interoperability
+		 *     with these old clients is dropped. */
+		req_capsule_shrink(&request->rq_pill, &RMF_PTLRPC_BODY,
+				   sizeof(struct ptlrpc_body_v2), RCL_CLIENT);
+		break;
+	default:
+		break;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(ptlrpc_request_pack);
+
+/**
+ * Allocation helper: obtain a request for import \a imp, taken from
+ * \a pool when one is given and has a request available, otherwise freshly
+ * allocated.
+ * On success the request holds a reference on \a imp in rq_import.
+ * Returns the request, or NULL (after logging an error) when no memory is
+ * available.
+ */
+static inline
+struct ptlrpc_request *__ptlrpc_request_alloc(struct obd_import *imp,
+					      struct ptlrpc_request_pool *pool)
+{
+	struct ptlrpc_request *request;
+
+	request = pool ? ptlrpc_prep_req_from_pool(pool) : NULL;
+	if (!request)
+		OBD_ALLOC_PTR(request);
+
+	if (request == NULL) {
+		CERROR("request allocation out of memory\n");
+		return NULL;
+	}
+
+	/* sanity checks on the import before taking a reference on it */
+	LASSERTF((unsigned long)imp > 0x1000, "%p", imp);
+	LASSERT(imp != LP_POISON);
+	LASSERTF((unsigned long)imp->imp_client > 0x1000, "%p",
+		imp->imp_client);
+	LASSERT(imp->imp_client != LP_POISON);
+
+	request->rq_import = class_import_get(imp);
+
+	return request;
+}
+
+/**
+ * Common part of request creation.
+ * Gets a request structure from __ptlrpc_request_alloc() and sets up its
+ * capsule (rq_pill) for the client side according to template \a format.
+ * Returns allocated request structure pointer or NULL on error.
+ */
+static struct ptlrpc_request *
+ptlrpc_request_alloc_internal(struct obd_import *imp,
+			      struct ptlrpc_request_pool * pool,
+			      const struct req_format *format)
+{
+	struct ptlrpc_request *request = __ptlrpc_request_alloc(imp, pool);
+
+	if (request != NULL) {
+		req_capsule_init(&request->rq_pill, request, RCL_CLIENT);
+		req_capsule_set(&request->rq_pill, format);
+	}
+
+	return request;
+}
+
+/**
+ * Allocate a new request structure for import \a imp, without using a
+ * request pool, and set up its buffer structure according to capsule
+ * template \a format.
+ * Returns the request or NULL on error.
+ */
+struct ptlrpc_request *ptlrpc_request_alloc(struct obd_import *imp,
+					    const struct req_format *format)
+{
+	struct ptlrpc_request_pool *no_pool = NULL;
+
+	return ptlrpc_request_alloc_internal(imp, no_pool, format);
+}
+EXPORT_SYMBOL(ptlrpc_request_alloc);
+
+/**
+ * Pool-aware variant of ptlrpc_request_alloc(): the request for import
+ * \a imp is taken from \a pool when possible, and its buffer structure is
+ * set up according to capsule template \a format.
+ * Returns the request or NULL on error.
+ */
+struct ptlrpc_request *ptlrpc_request_alloc_pool(struct obd_import *imp,
+					    struct ptlrpc_request_pool * pool,
+					    const struct req_format *format)
+{
+	struct ptlrpc_request *request;
+
+	request = ptlrpc_request_alloc_internal(imp, pool, format);
+	return request;
+}
+EXPORT_SYMBOL(ptlrpc_request_alloc_pool);
+
+/**
+ * Dispose of \a request.
+ * A request that came from a pool (rq_pool set) goes back to that pool;
+ * any other request has its structure freed.
+ */
+void ptlrpc_request_free(struct ptlrpc_request *request)
+{
+	if (request->rq_pool == NULL) {
+		OBD_FREE_PTR(request);
+		return;
+	}
+
+	__ptlrpc_free_req_to_pool(request);
+}
+EXPORT_SYMBOL(ptlrpc_request_free);
+
+/**
+ * Allocate a new request for operation \a opcode and immediately pack it
+ * for network transfer.
+ * Only used for simple requests like OBD_PING where the only important
+ * part of the request is the operation itself.
+ * Returns allocated request or NULL on error; a request that fails to pack
+ * is released before returning.
+ */
+struct ptlrpc_request *ptlrpc_request_alloc_pack(struct obd_import *imp,
+						const struct req_format *format,
+						__u32 version, int opcode)
+{
+	struct ptlrpc_request *req;
+
+	req = ptlrpc_request_alloc(imp, format);
+	if (req == NULL)
+		return NULL;
+
+	if (ptlrpc_request_pack(req, version, opcode) != 0) {
+		ptlrpc_request_free(req);
+		return NULL;
+	}
+
+	return req;
+}
+EXPORT_SYMBOL(ptlrpc_request_alloc_pack);
+
+/**
+ * Prepare a request (fetched from pool \a pool if not NULL) on import
+ * \a imp for operation \a opcode. The request will contain \a count
+ * buffers whose sizes are given in array \a lengths and whose contents are
+ * provided through \a bufs.
+ * Returns prepared request structure pointer or NULL on error; a request
+ * that fails to pack is released before returning.
+ */
+struct ptlrpc_request *
+ptlrpc_prep_req_pool(struct obd_import *imp,
+		     __u32 version, int opcode,
+		     int count, __u32 *lengths, char **bufs,
+		     struct ptlrpc_request_pool *pool)
+{
+	struct ptlrpc_request *request;
+
+	request = __ptlrpc_request_alloc(imp, pool);
+	if (request == NULL)
+		return NULL;
+
+	if (__ptlrpc_request_bufs_pack(request, version, opcode, count,
+				       lengths, bufs, NULL) != 0) {
+		ptlrpc_request_free(request);
+		return NULL;
+	}
+
+	return request;
+}
+EXPORT_SYMBOL(ptlrpc_prep_req_pool);
+
+/**
+ * Pool-less form of ptlrpc_prep_req_pool(): identical arguments, with a
+ * NULL pool.
+ */
+struct ptlrpc_request *
+ptlrpc_prep_req(struct obd_import *imp, __u32 version, int opcode, int count,
+		__u32 *lengths, char **bufs)
+{
+	struct ptlrpc_request_pool *no_pool = NULL;
+
+	return ptlrpc_prep_req_pool(imp, version, opcode, count, lengths, bufs,
+				    no_pool);
+}
+EXPORT_SYMBOL(ptlrpc_prep_req);
+
+/**
+ * Allocate a request set and put it into its initial state: one reference,
+ * empty request/new-request/callback lists, zeroed counters, no producer
+ * and no limit (UINT_MAX) on the number of requests in flight.
+ * Returns a pointer to the newly allocated set structure or NULL on error.
+ */
+struct ptlrpc_request_set *ptlrpc_prep_set(void)
+{
+	struct ptlrpc_request_set *set;
+
+	ENTRY;
+	OBD_ALLOC(set, sizeof *set);
+	if (set == NULL)
+		RETURN(NULL);
+
+	/* reference and counters */
+	atomic_set(&set->set_refcount, 1);
+	atomic_set(&set->set_new_count, 0);
+	atomic_set(&set->set_remaining, 0);
+	set->set_rc = 0;
+
+	/* lists, their lock and the wait queue */
+	INIT_LIST_HEAD(&set->set_requests);
+	INIT_LIST_HEAD(&set->set_new_requests);
+	INIT_LIST_HEAD(&set->set_cblist);
+	spin_lock_init(&set->set_new_req_lock);
+	init_waitqueue_head(&set->set_waitq);
+
+	/* no flow control by default */
+	set->set_max_inflight = UINT_MAX;
+	set->set_producer = NULL;
+	set->set_producer_arg = NULL;
+
+	RETURN(set);
+}
+EXPORT_SYMBOL(ptlrpc_prep_set);
+
+/**
+ * Allocate and initialize a new request set with the flow control
+ * extension, which limits the number of requests in flight for the whole
+ * set. \a func is the callback that generates requests, \a arg is stored
+ * as its argument and \a max is stored as the in-flight limit.
+ * Returns a pointer to the newly allocated set structure or NULL on error.
+ */
+struct ptlrpc_request_set *ptlrpc_prep_fcset(int max, set_producer_func func,
+					     void *arg)
+
+{
+	struct ptlrpc_request_set *set = ptlrpc_prep_set();
+
+	if (set != NULL) {
+		set->set_producer_arg = arg;
+		set->set_producer = func;
+		set->set_max_inflight = max;
+	}
+
+	RETURN(set);
+}
+EXPORT_SYMBOL(ptlrpc_prep_fcset);
+
+/**
+ * Wind down and release a request set previously allocated with
+ * ptlrpc_prep_set.
+ * All requests on the set must be either completed or still new; each one
+ * is unlinked from the set and has its reference dropped.
+ * Requests that were never sent are treated as if they failed in flight:
+ * their completion handler is called with -EBADR.
+ */
+void ptlrpc_set_destroy(struct ptlrpc_request_set *set)
+{
+	struct ptlrpc_request *req;
+	struct ptlrpc_request *next;
+	int expected_phase;
+	int n = 0;
+	ENTRY;
+
+	/* Requests on the set should either all be completed, or all be new */
+	if (atomic_read(&set->set_remaining) == 0)
+		expected_phase = RQ_PHASE_COMPLETE;
+	else
+		expected_phase = RQ_PHASE_NEW;
+
+	list_for_each_entry(req, &set->set_requests, rq_set_chain) {
+		LASSERT(req->rq_phase == expected_phase);
+		n++;
+	}
+
+	LASSERTF(atomic_read(&set->set_remaining) == 0 ||
+		 atomic_read(&set->set_remaining) == n, "%d / %d\n",
+		 atomic_read(&set->set_remaining), n);
+
+	list_for_each_entry_safe(req, next, &set->set_requests, rq_set_chain) {
+		list_del_init(&req->rq_set_chain);
+
+		LASSERT(req->rq_phase == expected_phase);
+
+		if (req->rq_phase == RQ_PHASE_NEW) {
+			/* never sent: report an error to the interpreter */
+			ptlrpc_req_interpret(NULL, req, -EBADR);
+			atomic_dec(&set->set_remaining);
+		}
+
+		/* detach the request from the set */
+		spin_lock(&req->rq_lock);
+		req->rq_set = NULL;
+		req->rq_invalid_rqset = 0;
+		spin_unlock(&req->rq_lock);
+
+		ptlrpc_req_finished(req);
+	}
+
+	LASSERT(atomic_read(&set->set_remaining) == 0);
+
+	ptlrpc_reqset_put(set);
+	EXIT;
+}
+EXPORT_SYMBOL(ptlrpc_set_destroy);
+
+/**
+ * Register callback \a fn on \a set.
+ * The callback is meant to run once all requests on this set are
+ * completed and will be passed \a data as its argument.
+ * Returns 0 on success or -ENOMEM if the callback record cannot be
+ * allocated.
+ */
+int ptlrpc_set_add_cb(struct ptlrpc_request_set *set,
+		      set_interpreter_func fn, void *data)
+{
+	struct ptlrpc_set_cbdata *cbdata;
+
+	OBD_ALLOC_PTR(cbdata);
+	if (!cbdata)
+		RETURN(-ENOMEM);
+
+	cbdata->psc_data = data;
+	cbdata->psc_interpret = fn;
+
+	/* callbacks are kept in registration order */
+	list_add_tail(&cbdata->psc_item, &set->set_cblist);
+
+	RETURN(0);
+}
+EXPORT_SYMBOL(ptlrpc_set_add_cb);
+
+/**
+ * Add a new request to the general purpose request set.
+ * The caller's reference on \a req is taken over by the set.
+ * When the set has a producer callback, the request is sent right away.
+ */
+void ptlrpc_set_add_req(struct ptlrpc_request_set *set,
+			struct ptlrpc_request *req)
+{
+	LASSERT(list_empty(&req->rq_set_chain));
+
+	/* The set takes over the caller's request reference */
+	list_add_tail(&req->rq_set_chain, &set->set_requests);
+	req->rq_set = set;
+	atomic_inc(&set->set_remaining);
+	req->rq_queued_time = cfs_time_current();
+
+	if (req->rq_reqmsg)
+		lustre_msg_set_jobid(req->rq_reqmsg, NULL);
+
+	if (set->set_producer == NULL)
+		return;
+
+	/* If the request set has a producer callback, the RPC must be
+	 * sent straight away */
+	ptlrpc_send_new_req(req);
+}
+EXPORT_SYMBOL(ptlrpc_set_add_req);
+
+/**
+ * Queue \a req on the new-request list of the set owned by \a pc, a set
+ * that is served by a dedicated thread, and wake that thread up.
+ * Currently only used for ptlrpcd.
+ * The caller's reference on \a req is taken over by the set.
+ */
+void ptlrpc_set_add_new_req(struct ptlrpcd_ctl *pc,
+			   struct ptlrpc_request *req)
+{
+	struct ptlrpc_request_set *set = pc->pc_set;
+	int queued;
+	int idx;
+
+	LASSERT(req->rq_set == NULL);
+	LASSERT(test_bit(LIOD_STOP, &pc->pc_flags) == 0);
+
+	spin_lock(&set->set_new_req_lock);
+	/* The set takes over the caller's request reference. */
+	req->rq_set = set;
+	req->rq_queued_time = cfs_time_current();
+	list_add_tail(&req->rq_set_chain, &set->set_new_requests);
+	queued = atomic_inc_return(&set->set_new_count);
+	spin_unlock(&set->set_new_req_lock);
+
+	/* Only need to call wakeup once for the first entry. */
+	if (queued != 1)
+		return;
+
+	wake_up(&set->set_waitq);
+
+	/* XXX: It may be unnecessary to wake up all the partners. But to
+	 *      guarantee the async RPC can be processed ASAP, we have
+	 *      no better choice. It may be fixed in the future. */
+	for (idx = 0; idx < pc->pc_npartners; idx++)
+		wake_up(&pc->pc_partners[idx]->pc_set->set_waitq);
+}
+EXPORT_SYMBOL(ptlrpc_set_add_new_req);
+
+/**
+ * Based on the current state of the import, determine if the request
+ * can be sent, is an error, or should be delayed.
+ *
+ * Returns true if this request should be delayed. If false, and
+ * *status is set, then the request can not be sent and *status is the
+ * error code.  If false and status is 0, then request can be sent.
+ *
+ * *status is always reset to 0 first; the error codes stored here are
+ * -EIO, -ESHUTDOWN and -EWOULDBLOCK. The checks are evaluated in order and
+ * the first matching one decides the outcome.
+ *
+ * The imp->imp_lock must be held.
+ */
+static int ptlrpc_import_delay_req(struct obd_import *imp,
+				   struct ptlrpc_request *req, int *status)
+{
+	int delay = 0;
+	ENTRY;
+
+	LASSERT (status != NULL);
+	*status = 0;
+
+	if (req->rq_ctx_init || req->rq_ctx_fini) {
+		/* always allow ctx init/fini rpc go through */
+	} else if (imp->imp_state == LUSTRE_IMP_NEW) {
+		DEBUG_REQ(D_ERROR, req, "Uninitialized import.");
+		*status = -EIO;
+	} else if (imp->imp_state == LUSTRE_IMP_CLOSED) {
+		/* pings may safely race with umount, so they are logged at
+		 * D_HA instead of D_ERROR */
+		DEBUG_REQ(lustre_msg_get_opc(req->rq_reqmsg) == OBD_PING ?
+			  D_HA : D_ERROR, req, "IMP_CLOSED ");
+		*status = -EIO;
+	} else if (ptlrpc_send_limit_expired(req)) {
+		/* probably doesn't need to be a D_ERROR after initial testing */
+		DEBUG_REQ(D_ERROR, req, "send limit expired ");
+		*status = -EIO;
+	} else if (req->rq_send_state == LUSTRE_IMP_CONNECTING &&
+		   imp->imp_state == LUSTRE_IMP_CONNECTING) {
+		/* allow CONNECT even if import is invalid */ ;
+		if (atomic_read(&imp->imp_inval_count) != 0) {
+			DEBUG_REQ(D_ERROR, req, "invalidate in flight");
+			*status = -EIO;
+		}
+	} else if (imp->imp_invalid || imp->imp_obd->obd_no_recov) {
+		if (!imp->imp_deactive)
+			DEBUG_REQ(D_NET, req, "IMP_INVALID");
+		*status = -ESHUTDOWN; /* bz 12940 */
+	} else if (req->rq_import_generation != imp->imp_generation) {
+		DEBUG_REQ(D_ERROR, req, "req wrong generation:");
+		*status = -EIO;
+	} else if (req->rq_send_state != imp->imp_state) {
+		/* invalidate in progress - any requests should be dropped */
+		if (atomic_read(&imp->imp_inval_count) != 0) {
+			DEBUG_REQ(D_ERROR, req, "invalidate in flight");
+			*status = -EIO;
+		} else if (imp->imp_dlm_fake || req->rq_no_delay) {
+			*status = -EWOULDBLOCK;
+		} else if (req->rq_allow_replay &&
+			  (imp->imp_state == LUSTRE_IMP_REPLAY ||
+			   imp->imp_state == LUSTRE_IMP_REPLAY_LOCKS ||
+			   imp->imp_state == LUSTRE_IMP_REPLAY_WAIT ||
+			   imp->imp_state == LUSTRE_IMP_RECOVER)) {
+			DEBUG_REQ(D_HA, req, "allow during recovery.\n");
+		} else {
+			delay = 1; /* the only case that delays the request */
+		}
+	}
+
+	RETURN(delay);
+}
+
+/**
+ * Decide whether the error message regarding request \a req should be
+ * printed to the console.
+ * The decision is based on the request opcode, its timed-out flag, the
+ * reply status and whether the import's remote handle is in use.
+ * Returns 1 to print the error on the system console or 0 if not.
+ */
+static int ptlrpc_console_allow(struct ptlrpc_request *req)
+{
+	__u32 opc;
+	int err;
+
+	LASSERT(req->rq_reqmsg != NULL);
+	opc = lustre_msg_get_opc(req->rq_reqmsg);
+
+	/* No errors are suppressed for the initial connection on an import */
+	if (!lustre_handle_is_used(&req->rq_import->imp_remote_handle))
+		return 1;
+
+	/* Only particular reconnect errors, which are to be expected, are
+	 * candidates for suppression */
+	if (opc != OST_CONNECT && opc != MDS_CONNECT && opc != MGS_CONNECT)
+		return 1;
+
+	/* Suppress timed out reconnect requests */
+	if (req->rq_timedout)
+		return 0;
+
+	/* Suppress unavailable/again reconnect requests */
+	err = lustre_msg_get_status(req->rq_repmsg);
+	if (err == -ENODEV || err == -EAGAIN)
+		return 0;
+
+	return 1;
+}
+
+/**
+ * Check request processing status as reported in the reply of \a req.
+ * For a reply of type PTL_RPC_MSG_ERR the failure may be reported on the
+ * console and the result is the reply status if negative, else -EINVAL.
+ * For any other reply type the reply status is returned unchanged.
+ */
+static int ptlrpc_check_status(struct ptlrpc_request *req)
+{
+	struct obd_import *imp;
+	__u32 opc;
+	int err;
+	ENTRY;
+
+	err = lustre_msg_get_status(req->rq_repmsg);
+	if (lustre_msg_get_type(req->rq_repmsg) != PTL_RPC_MSG_ERR) {
+		/* XXX: a positive status should be translated from net to
+		 * host */
+		if (err != 0) {
+			DEBUG_REQ(D_INFO, req, "status is %d", err);
+		}
+		RETURN(err);
+	}
+
+	imp = req->rq_import;
+	opc = lustre_msg_get_opc(req->rq_reqmsg);
+	if (ptlrpc_console_allow(req))
+		LCONSOLE_ERROR_MSG(0x011, "%s: Communicating with %s,"
+				   " operation %s failed with %d.\n",
+				   imp->imp_obd->obd_name,
+				   libcfs_nid2str(
+				   imp->imp_connection->c_peer.nid),
+				   ll_opcode2str(opc), err);
+
+	RETURN(err < 0 ? err : -EINVAL);
+}
+
+/**
+ * Save pre-versions of objects into the request for replay.
+ * The versions are read from the server reply of \a req and copied into
+ * its request message. Nothing is done when the request message carries
+ * the MSG_REPLAY flag.
+ * Used for VBR.
+ */
+static void ptlrpc_save_versions(struct ptlrpc_request *req)
+{
+	struct lustre_msg *repmsg = req->rq_repmsg;
+	struct lustre_msg *reqmsg = req->rq_reqmsg;
+	__u64 *versions = lustre_msg_get_versions(repmsg);
+	ENTRY;
+
+	if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY) {
+		/* pair the ENTRY above on this early exit as well */
+		EXIT;
+		return;
+	}
+
+	LASSERT(versions);
+	lustre_msg_set_versions(reqmsg, versions);
+	CDEBUG(D_INFO, "Client save versions ["LPX64"/"LPX64"]\n",
+	       versions[0], versions[1]);
+
+	EXIT;
+}
+
+/**
+ * Callback function called when client receives RPC reply for \a req.
+ * Returns 0 on success or error code.
+ * The return value would be assigned to req->rq_status by the caller
+ * as request processing status.
+ * This function also decides if the request needs to be saved for later replay.
+ *
+ * Note that 0 is also returned when the request is only flagged for resend
+ * (rq_resend set): after a truncated reply, when the security layer asks
+ * for it, or on -EINPROGRESS.
+ */
+static int after_reply(struct ptlrpc_request *req)
+{
+	struct obd_import *imp = req->rq_import;
+	struct obd_device *obd = req->rq_import->imp_obd;
+	int rc;
+	struct timeval work_start;
+	long timediff;
+	ENTRY;
+
+	LASSERT(obd != NULL);
+	/* repbuf must be unlinked */
+	LASSERT(!req->rq_receiving_reply && !req->rq_must_unlink);
+
+	if (req->rq_reply_truncate) {
+		if (ptlrpc_no_resend(req)) {
+			DEBUG_REQ(D_ERROR, req, "reply buffer overflow,"
+				  " expected: %d, actual size: %d",
+				  req->rq_nob_received, req->rq_repbuf_len);
+			RETURN(-EOVERFLOW);
+		}
+
+		sptlrpc_cli_free_repbuf(req);
+		/* Pass the required reply buffer size (include
+		 * space for early reply).
+		 * NB: no need to roundup because alloc_repbuf
+		 * will roundup it */
+		req->rq_replen       = req->rq_nob_received;
+		req->rq_nob_received = 0;
+		req->rq_resend       = 1;
+		RETURN(0);
+	}
+
+	/*
+	 * NB Until this point, the whole of the incoming message,
+	 * including buflens, status etc is in the sender's byte order.
+	 */
+	rc = sptlrpc_cli_unwrap_reply(req);
+	if (rc) {
+		DEBUG_REQ(D_ERROR, req, "unwrap reply failed (%d):", rc);
+		RETURN(rc);
+	}
+
+	/*
+	 * Security layer unwrap might ask resend this request.
+	 */
+	if (req->rq_resend)
+		RETURN(0);
+
+	rc = unpack_reply(req);
+	if (rc)
+		RETURN(rc);
+
+	/* retry indefinitely on EINPROGRESS, unless resending is not
+	 * allowed for this request or rq_no_retry_einprogress is set */
+	if (lustre_msg_get_status(req->rq_repmsg) == -EINPROGRESS &&
+	    ptlrpc_no_resend(req) == 0 && !req->rq_no_retry_einprogress) {
+		time_t	now = cfs_time_current_sec();
+
+		DEBUG_REQ(D_RPCTRACE, req, "Resending request on EINPROGRESS");
+		req->rq_resend = 1;
+		req->rq_nr_resend++;
+
+		/* allocate new xid to avoid reply reconstruction */
+		if (!req->rq_bulk) {
+			/* new xid is already allocated for bulk in
+			 * ptlrpc_check_set() */
+			req->rq_xid = ptlrpc_next_xid();
+			DEBUG_REQ(D_RPCTRACE, req, "Allocating new xid for "
+				  "resend on EINPROGRESS");
+		}
+
+		/* Readjust the timeout for current conditions */
+		ptlrpc_at_set_req_timeout(req);
+		/* delay resend to give a chance to the server to get ready.
+		 * The delay is increased by 1s on every resend and is capped to
+		 * the current request timeout (i.e. obd_timeout if AT is off,
+		 * or AT service time x 125% + 5s, see at_est2timeout) */
+		if (req->rq_nr_resend > req->rq_timeout)
+			req->rq_sent = now + req->rq_timeout;
+		else
+			req->rq_sent = now + req->rq_nr_resend;
+
+		RETURN(0);
+	}
+
+	do_gettimeofday(&work_start);
+	timediff = cfs_timeval_sub(&work_start, &req->rq_arrival_time, NULL);
+	if (obd->obd_svc_stats != NULL) {
+		lprocfs_counter_add(obd->obd_svc_stats, PTLRPC_REQWAIT_CNTR,
+				    timediff);
+		ptlrpc_lprocfs_rpc_sent(req, timediff);
+	}
+
+	if (lustre_msg_get_type(req->rq_repmsg) != PTL_RPC_MSG_REPLY &&
+	    lustre_msg_get_type(req->rq_repmsg) != PTL_RPC_MSG_ERR) {
+		DEBUG_REQ(D_ERROR, req, "invalid packet received (type=%u)",
+			  lustre_msg_get_type(req->rq_repmsg));
+		RETURN(-EPROTO);
+	}
+
+	if (lustre_msg_get_opc(req->rq_reqmsg) != OBD_PING)
+		CFS_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_PAUSE_REP, cfs_fail_val);
+	ptlrpc_at_adj_service(req, lustre_msg_get_timeout(req->rq_repmsg));
+	ptlrpc_at_adj_net_latency(req,
+				  lustre_msg_get_service_time(req->rq_repmsg));
+
+	rc = ptlrpc_check_status(req);
+	imp->imp_connect_error = rc; /* recorded for success and failure */
+
+	if (rc) {
+		/*
+		 * Either we've been evicted, or the server has failed for
+		 * some reason. Try to reconnect, and if that fails, punt to
+		 * the upcall.
+		 */
+		if (ll_rpc_recoverable_error(rc)) {
+			if (req->rq_send_state != LUSTRE_IMP_FULL ||
+			    imp->imp_obd->obd_no_recov || imp->imp_dlm_fake) {
+				RETURN(rc);
+			}
+			ptlrpc_request_handle_notconn(req);
+			RETURN(rc);
+		}
+	} else {
+		/*
+		 * Let's look if server sent slv. Do it only for RPC with
+		 * rc == 0.
+		 */
+		ldlm_cli_update_pool(req);
+	}
+
+	/*
+	 * Store transno in reqmsg for replay.
+	 */
+	if (!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY)) {
+		req->rq_transno = lustre_msg_get_transno(req->rq_repmsg);
+		lustre_msg_set_transno(req->rq_reqmsg, req->rq_transno);
+	}
+
+	if (imp->imp_replayable) {
+		spin_lock(&imp->imp_lock);
+		/*
+		 * No point in adding already-committed requests to the replay
+		 * list, we will just remove them immediately. b=9829
+		 */
+		if (req->rq_transno != 0 &&
+		    (req->rq_transno >
+		     lustre_msg_get_last_committed(req->rq_repmsg) ||
+		     req->rq_replay)) {
+			/** version recovery */
+			ptlrpc_save_versions(req);
+			ptlrpc_retain_replayable_request(req, imp);
+		} else if (req->rq_commit_cb != NULL) {
+			/* imp_lock is released around the commit callback */
+			spin_unlock(&imp->imp_lock);
+			req->rq_commit_cb(req);
+			spin_lock(&imp->imp_lock);
+		}
+
+		/*
+		 * Replay-enabled imports return commit-status information.
+		 */
+		if (lustre_msg_get_last_committed(req->rq_repmsg)) {
+			imp->imp_peer_committed_transno =
+				lustre_msg_get_last_committed(req->rq_repmsg);
+		}
+
+		ptlrpc_free_committed(imp);
+
+		if (!list_empty(&imp->imp_replay_list)) {
+			struct ptlrpc_request *last;
+
+			last = list_entry(imp->imp_replay_list.prev,
+					      struct ptlrpc_request,
+					      rq_replay_list);
+			/*
+			 * Requests with rq_replay stay on the list even if no
+			 * commit is expected.
+			 */
+			if (last->rq_transno > imp->imp_peer_committed_transno)
+				ptlrpc_pinger_commit_expected(imp);
+		}
+
+		spin_unlock(&imp->imp_lock);
+	}
+
+	RETURN(rc);
+}
+
+/**
+ * Helper function to send request \a req over the network for the first time
+ * Also adjusts request phase.
+ * Returns 0 on success or error code.
+ *
+ * 0 is also returned when nothing is sent yet: rq_sent lies in the future,
+ * the request was put on the import's delayed list, or it has to wait for
+ * its security context (rq_wait_ctx set). 1 is returned when refreshing
+ * the security context failed and rq_err is set.
+ */
+static int ptlrpc_send_new_req(struct ptlrpc_request *req)
+{
+	struct obd_import     *imp = req->rq_import;
+	int rc;
+	ENTRY;
+
+	LASSERT(req->rq_phase == RQ_PHASE_NEW);
+	if (req->rq_sent && (req->rq_sent > cfs_time_current_sec()) &&
+	    (!req->rq_generation_set ||
+	     req->rq_import_generation == imp->imp_generation))
+		RETURN (0);
+
+	ptlrpc_rqphase_move(req, RQ_PHASE_RPC);
+
+	spin_lock(&imp->imp_lock);
+
+	if (!req->rq_generation_set)
+		req->rq_import_generation = imp->imp_generation;
+
+	if (ptlrpc_import_delay_req(imp, req, &rc)) {
+		spin_lock(&req->rq_lock);
+		req->rq_waiting = 1;
+		spin_unlock(&req->rq_lock);
+
+		DEBUG_REQ(D_HA, req, "req from PID %d waiting for recovery: "
+			  "(%s != %s)", lustre_msg_get_status(req->rq_reqmsg),
+			  ptlrpc_import_state_name(req->rq_send_state),
+			  ptlrpc_import_state_name(imp->imp_state));
+		LASSERT(list_empty(&req->rq_list));
+		list_add_tail(&req->rq_list, &imp->imp_delayed_list);
+		atomic_inc(&req->rq_import->imp_inflight);
+		spin_unlock(&imp->imp_lock);
+		RETURN(0);
+	}
+
+	if (rc != 0) { /* not delayed, but the import refuses the request */
+		spin_unlock(&imp->imp_lock);
+		req->rq_status = rc;
+		ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
+		RETURN(rc);
+	}
+
+	LASSERT(list_empty(&req->rq_list));
+	list_add_tail(&req->rq_list, &imp->imp_sending_list);
+	atomic_inc(&req->rq_import->imp_inflight);
+	spin_unlock(&imp->imp_lock);
+
+	lustre_msg_set_status(req->rq_reqmsg, current_pid());
+
+	rc = sptlrpc_req_refresh_ctx(req, -1);
+	if (rc) {
+		if (req->rq_err) {
+			req->rq_status = rc;
+			RETURN(1);
+		} else {
+			req->rq_wait_ctx = 1;
+			RETURN(0);
+		}
+	}
+
+	CDEBUG(D_RPCTRACE, "Sending RPC pname:cluuid:pid:xid:nid:opc"
+	       " %s:%s:%d:"LPU64":%s:%d\n", current_comm(),
+	       imp->imp_obd->obd_uuid.uuid,
+	       lustre_msg_get_status(req->rq_reqmsg), req->rq_xid,
+	       libcfs_nid2str(imp->imp_connection->c_peer.nid),
+	       lustre_msg_get_opc(req->rq_reqmsg));
+
+	rc = ptl_send_rpc(req, 0);
+	if (rc) {
+		DEBUG_REQ(D_HA, req, "send failed (%d); expect timeout", rc);
+		req->rq_net_err = 1;
+		RETURN(rc);
+	}
+	RETURN(0);
+}
+
+/*
+ * Call the set's producer callback until set_remaining reaches
+ * set_max_inflight.  Returns how many requests were added to the set, or 0
+ * once the producer reports -ENOENT (in which case the producer and its
+ * argument are cleared from the set).
+ */
+static inline int ptlrpc_set_producer(struct ptlrpc_request_set *set)
+{
+	int before, err;
+	ENTRY;
+
+	LASSERT(set->set_producer != NULL);
+
+	before = atomic_read(&set->set_remaining);
+
+	for (;;) {
+		/* stop at the maximum number of RPCs in flight for this set */
+		if (atomic_read(&set->set_remaining) >= set->set_max_inflight)
+			break;
+
+		err = set->set_producer(set, set->set_producer_arg);
+		if (err != -ENOENT)
+			continue;
+
+		/* the producer has no more RPCs to offer */
+		set->set_producer     = NULL;
+		set->set_producer_arg = NULL;
+		RETURN(0);
+	}
+
+	RETURN((atomic_read(&set->set_remaining) - before));
+}
+
+/**
+ * Drive every request in \a set one step through its state machine: send
+ * requests still in RQ_PHASE_NEW, resend timed-out/delayed ones, process
+ * replies and bulk completion, and call ptlrpc_req_interpret() for requests
+ * that reach RQ_PHASE_INTERPRET.
+ * (It is possible to get fewer replies than requests sent, e.g. due to
+ * timed-out requests or requests that we had trouble sending out.)
+ *
+ * Returns non-zero when no request remains outstanding in the set
+ * (set_remaining == 0) or when the caller should recalculate its timeout
+ * (a first send or context refresh failed, a request was resent, or the
+ * producer added requests); returns 0 otherwise.
+ */
+int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set)
+{
+	struct list_head *tmp, *next;
+	int force_timer_recalc = 0;
+	ENTRY;
+
+	if (atomic_read(&set->set_remaining) == 0)
+		RETURN(1);
+
+	list_for_each_safe(tmp, next, &set->set_requests) {
+		struct ptlrpc_request *req =
+			list_entry(tmp, struct ptlrpc_request,
+				       rq_set_chain);
+		struct obd_import *imp = req->rq_import;
+		int unregistered = 0;
+		int rc = 0;
+
+		if (req->rq_phase == RQ_PHASE_NEW &&
+		    ptlrpc_send_new_req(req)) {
+			force_timer_recalc = 1;
+		}
+
+		/* delayed send - skip */
+		if (req->rq_phase == RQ_PHASE_NEW && req->rq_sent)
+			continue;
+
+		/* delayed resend - skip */
+		if (req->rq_phase == RQ_PHASE_RPC && req->rq_resend &&
+		    req->rq_sent > cfs_time_current_sec())
+			continue;
+
+		if (!(req->rq_phase == RQ_PHASE_RPC ||
+		      req->rq_phase == RQ_PHASE_BULK ||
+		      req->rq_phase == RQ_PHASE_INTERPRET ||
+		      req->rq_phase == RQ_PHASE_UNREGISTERING ||
+		      req->rq_phase == RQ_PHASE_COMPLETE)) {
+			DEBUG_REQ(D_ERROR, req, "bad phase %x", req->rq_phase);
+			LBUG();
+		}
+
+		if (req->rq_phase == RQ_PHASE_UNREGISTERING) {
+			LASSERT(req->rq_next_phase != req->rq_phase);
+			LASSERT(req->rq_next_phase != RQ_PHASE_UNDEFINED);
+
+			/*
+			 * Skip processing until reply is unlinked. We
+			 * can't return to pool before that and we can't
+			 * call interpret before that. We need to make
+			 * sure that all rdma transfers finished and will
+			 * not corrupt any data.
+			 */
+			if (ptlrpc_client_recv_or_unlink(req) ||
+			    ptlrpc_client_bulk_active(req))
+				continue;
+
+			/*
+			 * Turn fail_loc off to prevent it from looping
+			 * forever.
+			 */
+			if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK)) {
+				OBD_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK,
+						     OBD_FAIL_ONCE);
+			}
+			if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK)) {
+				OBD_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK,
+						     OBD_FAIL_ONCE);
+			}
+
+			/*
+			 * Move to next phase if reply was successfully
+			 * unlinked.
+			 */
+			ptlrpc_rqphase_move(req, req->rq_next_phase);
+		}
+
+		if (req->rq_phase == RQ_PHASE_COMPLETE)
+			continue;
+
+		if (req->rq_phase == RQ_PHASE_INTERPRET)
+			GOTO(interpret, req->rq_status);
+
+		/*
+		 * Note that this also will start async reply unlink.
+		 */
+		if (req->rq_net_err && !req->rq_timedout) {
+			ptlrpc_expire_one_request(req, 1);
+
+			/*
+			 * Check if we still need to wait for unlink.
+			 */
+			if (ptlrpc_client_recv_or_unlink(req) ||
+			    ptlrpc_client_bulk_active(req))
+				continue;
+			/* If there is no need to resend, fail it now. */
+			if (req->rq_no_resend) {
+				if (req->rq_status == 0)
+					req->rq_status = -EIO;
+				ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
+				GOTO(interpret, req->rq_status);
+			} else {
+				continue;
+			}
+		}
+
+		if (req->rq_err) {
+			spin_lock(&req->rq_lock);
+			req->rq_replied = 0;
+			spin_unlock(&req->rq_lock);
+			if (req->rq_status == 0)
+				req->rq_status = -EIO;
+			ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
+			GOTO(interpret, req->rq_status);
+		}
+
+		/* ptlrpc_set_wait->l_wait_event sets lwi_allow_intr
+		 * so it sets rq_intr regardless of individual rpc
+		 * timeouts. The synchronous IO waiting path sets
+		 * rq_intr irrespective of whether ptlrpcd
+		 * has seen a timeout.  Our policy is to only interpret
+		 * interrupted rpcs after they have timed out, so we
+		 * need to enforce that here.
+		 */
+
+		if (req->rq_intr && (req->rq_timedout || req->rq_waiting ||
+				     req->rq_wait_ctx)) {
+			req->rq_status = -EINTR;
+			ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
+			GOTO(interpret, req->rq_status);
+		}
+
+		if (req->rq_phase == RQ_PHASE_RPC) {
+			if (req->rq_timedout || req->rq_resend ||
+			    req->rq_waiting || req->rq_wait_ctx) {
+				int status;
+
+				if (!ptlrpc_unregister_reply(req, 1))
+					continue;
+
+				spin_lock(&imp->imp_lock);
+				if (ptlrpc_import_delay_req(imp, req, &status)){
+					/* put on delay list - only if we wait
+					 * recovery finished - before send */
+					list_del_init(&req->rq_list);
+					list_add_tail(&req->rq_list,
+							  &imp->
+							  imp_delayed_list);
+					spin_unlock(&imp->imp_lock);
+					continue;
+				}
+
+				if (status != 0)  {
+					req->rq_status = status;
+					ptlrpc_rqphase_move(req,
+						RQ_PHASE_INTERPRET);
+					spin_unlock(&imp->imp_lock);
+					GOTO(interpret, req->rq_status);
+				}
+				if (ptlrpc_no_resend(req) &&
+				    !req->rq_wait_ctx) {
+					req->rq_status = -ENOTCONN;
+					ptlrpc_rqphase_move(req,
+							    RQ_PHASE_INTERPRET);
+					spin_unlock(&imp->imp_lock);
+					GOTO(interpret, req->rq_status);
+				}
+
+				list_del_init(&req->rq_list);
+				list_add_tail(&req->rq_list,
+						  &imp->imp_sending_list);
+
+				spin_unlock(&imp->imp_lock);
+
+				spin_lock(&req->rq_lock);
+				req->rq_waiting = 0;
+				spin_unlock(&req->rq_lock);
+
+				if (req->rq_timedout || req->rq_resend) {
+					/* This is re-sending anyways,
+					 * let's mark req as resend. */
+					spin_lock(&req->rq_lock);
+					req->rq_resend = 1;
+					spin_unlock(&req->rq_lock);
+					if (req->rq_bulk) {
+						__u64 old_xid;
+
+						if (!ptlrpc_unregister_bulk(req, 1))
+							continue;
+
+						/* ensure previous bulk fails */
+						old_xid = req->rq_xid;
+						req->rq_xid = ptlrpc_next_xid();
+						CDEBUG(D_HA, "resend bulk "
+						       "old x"LPU64
+						       " new x"LPU64"\n",
+						       old_xid, req->rq_xid);
+					}
+				}
+				/*
+				 * rq_wait_ctx is only touched by ptlrpcd,
+				 * so no lock is needed here.
+				 */
+				status = sptlrpc_req_refresh_ctx(req, -1);
+				if (status) {
+					if (req->rq_err) {
+						req->rq_status = status;
+						spin_lock(&req->rq_lock);
+						req->rq_wait_ctx = 0;
+						spin_unlock(&req->rq_lock);
+						force_timer_recalc = 1;
+					} else {
+						spin_lock(&req->rq_lock);
+						req->rq_wait_ctx = 1;
+						spin_unlock(&req->rq_lock);
+					}
+
+					continue;
+				} else {
+					spin_lock(&req->rq_lock);
+					req->rq_wait_ctx = 0;
+					spin_unlock(&req->rq_lock);
+				}
+
+				rc = ptl_send_rpc(req, 0);
+				if (rc) {
+					DEBUG_REQ(D_HA, req,
+						  "send failed: rc = %d", rc);
+					force_timer_recalc = 1;
+					spin_lock(&req->rq_lock);
+					req->rq_net_err = 1;
+					spin_unlock(&req->rq_lock);
+				}
+				/* need to reset the timeout */
+				force_timer_recalc = 1;
+			}
+
+			spin_lock(&req->rq_lock);
+
+			if (ptlrpc_client_early(req)) {
+				ptlrpc_at_recv_early_reply(req);
+				spin_unlock(&req->rq_lock);
+				continue;
+			}
+
+			/* Still waiting for a reply? */
+			if (ptlrpc_client_recv(req)) {
+				spin_unlock(&req->rq_lock);
+				continue;
+			}
+
+			/* Did we actually receive a reply? */
+			if (!ptlrpc_client_replied(req)) {
+				spin_unlock(&req->rq_lock);
+				continue;
+			}
+
+			spin_unlock(&req->rq_lock);
+
+			/* unlink from net because we are going to
+			 * swab in-place of reply buffer */
+			unregistered = ptlrpc_unregister_reply(req, 1);
+			if (!unregistered)
+				continue;
+
+			req->rq_status = after_reply(req);
+			if (req->rq_resend)
+				continue;
+
+			/* If there is no bulk associated with this request,
+			 * then we're done and should let the interpreter
+			 * process the reply. Similarly if the RPC returned
+			 * an error, and therefore the bulk will never arrive.
+			 */
+			if (req->rq_bulk == NULL || req->rq_status < 0) {
+				ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
+				GOTO(interpret, req->rq_status);
+			}
+
+			ptlrpc_rqphase_move(req, RQ_PHASE_BULK);
+		}
+
+		LASSERT(req->rq_phase == RQ_PHASE_BULK);
+		if (ptlrpc_client_bulk_active(req))
+			continue;
+
+		if (req->rq_bulk->bd_failure) {
+			/* The RPC reply arrived OK, but the bulk screwed
+			 * up!  Dead weird since the server told us the RPC
+			 * was good after getting the REPLY for her GET or
+			 * the ACK for her PUT. */
+			DEBUG_REQ(D_ERROR, req, "bulk transfer failed");
+			req->rq_status = -EIO;
+		}
+
+		ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
+
+	interpret:
+		LASSERT(req->rq_phase == RQ_PHASE_INTERPRET);
+
+		/* This moves to "unregistering" phase we need to wait for
+		 * reply unlink. */
+		if (!unregistered && !ptlrpc_unregister_reply(req, 1)) {
+			/* start async bulk unlink too */
+			ptlrpc_unregister_bulk(req, 1);
+			continue;
+		}
+
+		if (!ptlrpc_unregister_bulk(req, 1))
+			continue;
+
+		/* When calling interpret receiving already should be
+		 * finished. */
+		LASSERT(!req->rq_receiving_reply);
+
+		ptlrpc_req_interpret(env, req, req->rq_status);
+
+		ptlrpc_rqphase_move(req, RQ_PHASE_COMPLETE);
+
+		CDEBUG(req->rq_reqmsg != NULL ? D_RPCTRACE : 0,
+			"Completed RPC pname:cluuid:pid:xid:nid:"
+			"opc %s:%s:%d:"LPU64":%s:%d\n",
+			current_comm(), imp->imp_obd->obd_uuid.uuid,
+			lustre_msg_get_status(req->rq_reqmsg), req->rq_xid,
+			libcfs_nid2str(imp->imp_connection->c_peer.nid),
+			lustre_msg_get_opc(req->rq_reqmsg));
+
+		spin_lock(&imp->imp_lock);
+		/* Request already may be not on sending or delaying list. This
+		 * may happen in the case of marking it erroneous for the case
+		 * ptlrpc_import_delay_req(req, status) find it impossible to
+		 * allow sending this rpc and returns *status != 0. */
+		if (!list_empty(&req->rq_list)) {
+			list_del_init(&req->rq_list);
+			atomic_dec(&imp->imp_inflight);
+		}
+		spin_unlock(&imp->imp_lock);
+
+		atomic_dec(&set->set_remaining);
+		wake_up_all(&imp->imp_recovery_waitq);
+
+		if (set->set_producer) {
+			/* produce a new request if possible */
+			if (ptlrpc_set_producer(set) > 0)
+				force_timer_recalc = 1;
+
+			/* free the request that has just been completed
+			 * in order not to pollute set->set_requests */
+			list_del_init(&req->rq_set_chain);
+			spin_lock(&req->rq_lock);
+			req->rq_set = NULL;
+			req->rq_invalid_rqset = 0;
+			spin_unlock(&req->rq_lock);
+
+			/* record rq_status to compute the final status later */
+			if (req->rq_status != 0)
+				set->set_rc = req->rq_status;
+			ptlrpc_req_finished(req);
+		}
+	}
+
+	/* If we hit an error, we want to recover promptly. */
+	RETURN(atomic_read(&set->set_remaining) == 0 || force_timer_recalc);
+}
+EXPORT_SYMBOL(ptlrpc_check_set);
+
+/**
+ * Time out request \a req. If \a async_unlink is set, do not wait
+ * until LNet actually confirms network buffer unlinking.
+ * Marks the request rq_timedout, unregisters its reply and bulk buffers and,
+ * when the request has an import, bumps imp_timeouts; unless one of the
+ * early-return cases below applies, the import is then failed with
+ * ptlrpc_fail_import().
+ * Return 1 if we should give up further retrying attempts or 0 otherwise.
+ */
+int ptlrpc_expire_one_request(struct ptlrpc_request *req, int async_unlink)
+{
+	struct obd_import *imp = req->rq_import;
+	int rc = 0;
+	ENTRY;
+
+	spin_lock(&req->rq_lock);
+	req->rq_timedout = 1;
+	spin_unlock(&req->rq_lock);
+
+	DEBUG_REQ(D_WARNING, req, "Request sent has %s: [sent "CFS_DURATION_T
+		  "/real "CFS_DURATION_T"]",
+		  req->rq_net_err ? "failed due to network error" :
+		     ((req->rq_real_sent == 0 ||
+		       cfs_time_before(req->rq_real_sent, req->rq_sent) ||
+		       cfs_time_aftereq(req->rq_real_sent, req->rq_deadline)) ?
+		      "timed out for sent delay" : "timed out for slow reply"),
+		  req->rq_sent, req->rq_real_sent);
+
+	if (imp != NULL && obd_debug_peer_on_timeout)
+		LNetCtl(IOC_LIBCFS_DEBUG_PEER, &imp->imp_connection->c_peer);
+
+	ptlrpc_unregister_reply(req, async_unlink);
+	ptlrpc_unregister_bulk(req, async_unlink);
+
+	if (obd_dump_on_timeout)
+		libcfs_debug_dumplog();
+
+	if (imp == NULL) {
+		DEBUG_REQ(D_HA, req, "NULL import: already cleaned up?");
+		RETURN(1);
+	}
+
+	atomic_inc(&imp->imp_timeouts);
+
+	/* The DLM server doesn't want recovery run on its imports. */
+	if (imp->imp_dlm_fake)
+		RETURN(1);
+
+	/* If this request is for recovery or other primordial tasks,
+	 * then error it out here. */
+	if (req->rq_ctx_init || req->rq_ctx_fini ||
+	    req->rq_send_state != LUSTRE_IMP_FULL ||
+	    imp->imp_obd->obd_no_recov) {
+		DEBUG_REQ(D_RPCTRACE, req, "err -110, sent_state=%s (now=%s)",
+			  ptlrpc_import_state_name(req->rq_send_state),
+			  ptlrpc_import_state_name(imp->imp_state));
+		spin_lock(&req->rq_lock);
+		req->rq_status = -ETIMEDOUT;
+		req->rq_err = 1;
+		spin_unlock(&req->rq_lock);
+		RETURN(1);
+	}
+
+	/* if a request can't be resent we can't wait for an answer after
+	   the timeout */
+	if (ptlrpc_no_resend(req)) {
+		DEBUG_REQ(D_RPCTRACE, req, "TIMEOUT-NORESEND:");
+		rc = 1;
+	}
+
+	ptlrpc_fail_import(imp, lustre_msg_get_conn_cnt(req->rq_reqmsg));
+
+	RETURN(rc);
+}
+
+/**
+ * Timeout callback used when waiting on a request set with l_wait_event;
+ * \a data is the set.  Expires every in-flight request in the set whose
+ * deadline has passed and that has not been timed out already.
+ * Always returns 1.
+ */
+int ptlrpc_expired_set(void *data)
+{
+	struct ptlrpc_request_set *set = data;
+	struct list_head *pos;
+	time_t cur = cfs_time_current_sec();
+	ENTRY;
+
+	LASSERT(set != NULL);
+
+	/* a timeout fired: work out which requests it applies to */
+	list_for_each(pos, &set->set_requests) {
+		struct ptlrpc_request *req;
+		int in_flight;
+
+		req = list_entry(pos, struct ptlrpc_request, rq_set_chain);
+
+		/* a request waiting for its context must not be expired */
+		if (req->rq_wait_ctx)
+			continue;
+
+		/* only bulk transfers, or RPCs that are neither waiting
+		 * nor scheduled for resend, count as in flight */
+		if (req->rq_phase == RQ_PHASE_BULK)
+			in_flight = 1;
+		else if (req->rq_phase == RQ_PHASE_RPC)
+			in_flight = !req->rq_waiting && !req->rq_resend;
+		else
+			in_flight = 0;
+		if (!in_flight)
+			continue;
+
+		/* already dealt with */
+		if (req->rq_timedout)
+			continue;
+		/* not expired */
+		if (req->rq_deadline > cur)
+			continue;
+
+		/* expire asynchronously so the ptlrpcd thread is not
+		 * blocked */
+		ptlrpc_expire_one_request(req, 1);
+	}
+
+	/* always break out of the sleep so the waiter can recalculate the
+	 * timeout, or enable interrupts if everyone's timed out */
+	RETURN(1);
+}
+EXPORT_SYMBOL(ptlrpc_expired_set);
+
+/**
+ * Sets the rq_intr flag in \a req while holding the request's rq_lock.
+ * ptlrpc_check_set() acts on rq_intr only once the request has also timed
+ * out, is waiting, or is waiting for a security context.
+ */
+void ptlrpc_mark_interrupted(struct ptlrpc_request *req)
+{
+	spin_lock(&req->rq_lock);
+	req->rq_intr = 1;
+	spin_unlock(&req->rq_lock);
+}
+EXPORT_SYMBOL(ptlrpc_mark_interrupted);
+
+/**
+ * Callback for l_wait_event for interruptible waits; \a data is the
+ * request set.  Flags as interrupted every request of the set that is in
+ * RQ_PHASE_RPC or RQ_PHASE_UNREGISTERING; requests in any other phase are
+ * left untouched.
+ */
+void ptlrpc_interrupted_set(void *data)
+{
+	struct ptlrpc_request_set *set = data;
+	struct list_head *pos;
+
+	LASSERT(set != NULL);
+	CDEBUG(D_RPCTRACE, "INTERRUPTED SET %p\n", set);
+
+	list_for_each(pos, &set->set_requests) {
+		struct ptlrpc_request *req;
+
+		req = list_entry(pos, struct ptlrpc_request, rq_set_chain);
+
+		if (req->rq_phase == RQ_PHASE_RPC ||
+		    req->rq_phase == RQ_PHASE_UNREGISTERING)
+			ptlrpc_mark_interrupted(req);
+	}
+}
+EXPORT_SYMBOL(ptlrpc_interrupted_set);
+
+/**
+ * Get the smallest timeout in the set; this does NOT set a timeout.
+ *
+ * Only requests that are in flight (RQ_PHASE_RPC and not waiting,
+ * RQ_PHASE_BULK, or RQ_PHASE_NEW), not yet timed out and not waiting for a
+ * security context are considered.
+ *
+ * Returns 0 when no such request exists, 1 when at least one deadline has
+ * already passed, otherwise the number of seconds until the earliest
+ * deadline.
+ */
+int ptlrpc_set_next_timeout(struct ptlrpc_request_set *set)
+{
+	struct list_head	    *tmp;
+	time_t		 now = cfs_time_current_sec();
+	int		    timeout = 0;
+	struct ptlrpc_request *req;
+	/* time_t, not int: rq_sent is compared against time_t values, and
+	 * storing it in an int would truncate the absolute time */
+	time_t		 deadline;
+	ENTRY;
+
+	SIGNAL_MASK_ASSERT(); /* XXX BUG 1511 */
+
+	list_for_each(tmp, &set->set_requests) {
+		req = list_entry(tmp, struct ptlrpc_request, rq_set_chain);
+
+		/*
+		 * Request in-flight?
+		 */
+		if (!(((req->rq_phase == RQ_PHASE_RPC) && !req->rq_waiting) ||
+		      (req->rq_phase == RQ_PHASE_BULK) ||
+		      (req->rq_phase == RQ_PHASE_NEW)))
+			continue;
+
+		/*
+		 * Already timed out.
+		 */
+		if (req->rq_timedout)
+			continue;
+
+		/*
+		 * Waiting for ctx.
+		 */
+		if (req->rq_wait_ctx)
+			continue;
+
+		/* new and to-be-resent requests wake us at their send time;
+		 * everything else at send time plus the request timeout */
+		if (req->rq_phase == RQ_PHASE_NEW)
+			deadline = req->rq_sent;
+		else if (req->rq_phase == RQ_PHASE_RPC && req->rq_resend)
+			deadline = req->rq_sent;
+		else
+			deadline = req->rq_sent + req->rq_timeout;
+
+		if (deadline <= now)    /* actually expired already */
+			timeout = 1;    /* ASAP */
+		else if (timeout == 0 || timeout > deadline - now)
+			timeout = deadline - now;
+	}
+	RETURN(timeout);
+}
+EXPORT_SYMBOL(ptlrpc_set_next_timeout);
+
+/**
+ * Send all unsent requests from the set and then wait until all
+ * requests in the set complete (either get a reply, timeout, get an
+ * error or otherwise be interrupted).
+ * The status handed to the interpret callback(s) is set_rc, overridden by
+ * the rq_status of any request left in the set that is non-zero.
+ * Returns 0 if the set is empty; otherwise the value returned by the set's
+ * set_interpret callback or, when none is registered, that status, or the
+ * first error from a set_cblist callback if that status was 0.
+ */
+int ptlrpc_set_wait(struct ptlrpc_request_set *set)
+{
+	struct list_head	    *tmp;
+	struct ptlrpc_request *req;
+	struct l_wait_info     lwi;
+	int		    rc, timeout;
+	ENTRY;
+
+	if (set->set_producer)
+		(void)ptlrpc_set_producer(set);
+	else
+		list_for_each(tmp, &set->set_requests) {
+			req = list_entry(tmp, struct ptlrpc_request,
+					     rq_set_chain);
+			if (req->rq_phase == RQ_PHASE_NEW)
+				(void)ptlrpc_send_new_req(req);
+		}
+
+	if (list_empty(&set->set_requests))
+		RETURN(0);
+
+	do {
+		timeout = ptlrpc_set_next_timeout(set);
+
+		/* wait until all complete, interrupted, or an in-flight
+		 * req times out */
+		CDEBUG(D_RPCTRACE, "set %p going to sleep for %d seconds\n",
+		       set, timeout);
+
+		if (timeout == 0 && !cfs_signal_pending())
+			/*
+			 * No requests are in-flight (either timed out
+			 * or delayed), so we can allow interrupts.
+			 * We still want to block for a limited time,
+			 * so we allow interrupts during the timeout.
+			 */
+			lwi = LWI_TIMEOUT_INTR_ALL(cfs_time_seconds(1),
+						   ptlrpc_expired_set,
+						   ptlrpc_interrupted_set, set);
+		else
+			/*
+			 * At least one request is in flight, so no
+			 * interrupts are allowed. Wait until all
+			 * complete, or an in-flight req times out.
+			 */
+			lwi = LWI_TIMEOUT(cfs_time_seconds(timeout? timeout : 1),
+					  ptlrpc_expired_set, set);
+
+		rc = l_wait_event(set->set_waitq, ptlrpc_check_set(NULL, set), &lwi);
+
+		/* LU-769 - if we ignored the signal because it was already
+		 * pending when we started, we need to handle it now or we risk
+		 * it being ignored forever */
+		if (rc == -ETIMEDOUT && !lwi.lwi_allow_intr &&
+		    cfs_signal_pending()) {
+			sigset_t blocked_sigs =
+					   cfs_block_sigsinv(LUSTRE_FATAL_SIGS);
+
+			/* In fact we only interrupt for the "fatal" signals
+			 * like SIGINT or SIGKILL. We still ignore less
+			 * important signals since ptlrpc set is not easily
+			 * reentrant from userspace again */
+			if (cfs_signal_pending())
+				ptlrpc_interrupted_set(set);
+			cfs_restore_sigs(blocked_sigs);
+		}
+
+		LASSERT(rc == 0 || rc == -EINTR || rc == -ETIMEDOUT);
+
+		/* -EINTR => all requests have been flagged rq_intr so next
+		 * check completes.
+		 * -ETIMEDOUT => someone timed out.  When all reqs have
+		 * timed out, signals are enabled allowing completion with
+		 * EINTR.
+		 * I don't really care if we go once more round the loop in
+		 * the error cases -eeb. */
+		if (rc == 0 && atomic_read(&set->set_remaining) == 0) {
+			list_for_each(tmp, &set->set_requests) {
+				req = list_entry(tmp, struct ptlrpc_request,
+						     rq_set_chain);
+				spin_lock(&req->rq_lock);
+				req->rq_invalid_rqset = 1;
+				spin_unlock(&req->rq_lock);
+			}
+		}
+	} while (rc != 0 || atomic_read(&set->set_remaining) != 0);
+
+	LASSERT(atomic_read(&set->set_remaining) == 0);
+
+	rc = set->set_rc; /* rq_status of already freed requests if any */
+	list_for_each(tmp, &set->set_requests) {
+		req = list_entry(tmp, struct ptlrpc_request, rq_set_chain);
+
+		LASSERT(req->rq_phase == RQ_PHASE_COMPLETE);
+		if (req->rq_status != 0)
+			rc = req->rq_status;
+	}
+
+	if (set->set_interpret != NULL) {
+		int (*interpreter)(struct ptlrpc_request_set *set,void *,int) =
+			set->set_interpret;
+		rc = interpreter (set, set->set_arg, rc);
+	} else {
+		struct ptlrpc_set_cbdata *cbdata, *n;
+		int err;
+
+		list_for_each_entry_safe(cbdata, n,
+					 &set->set_cblist, psc_item) {
+			list_del_init(&cbdata->psc_item);
+			err = cbdata->psc_interpret(set, cbdata->psc_data, rc);
+			if (err && !rc)
+				rc = err;
+			OBD_FREE_PTR(cbdata);
+		}
+	}
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(ptlrpc_set_wait);
+
+/**
+ * Helper function for request freeing.
+ * Called when request count reached zero and request needs to be freed.
+ * Removes the request from the import replay list, releases its reply and
+ * request buffers, drops the export/import references and the security
+ * context, then returns the request to its pool or frees it.
+ * The request must already be off rq_list, rq_set_chain and rq_exp_list and
+ * must not have rq_replay set (asserted below).
+ * If \a locked is set, that means caller is already holding import imp_lock
+ * and so we no longer need to reobtain it (for certain lists manipulations)
+ */
+static void __ptlrpc_free_req(struct ptlrpc_request *request, int locked)
+{
+	ENTRY;
+	if (request == NULL) {
+		EXIT;
+		return;
+	}
+
+	LASSERTF(!request->rq_receiving_reply, "req %p\n", request);
+	LASSERTF(request->rq_rqbd == NULL, "req %p\n",request);/* client-side */
+	LASSERTF(list_empty(&request->rq_list), "req %p\n", request);
+	LASSERTF(list_empty(&request->rq_set_chain), "req %p\n", request);
+	LASSERTF(list_empty(&request->rq_exp_list), "req %p\n", request);
+	LASSERTF(!request->rq_replay, "req %p\n", request);
+
+	req_capsule_fini(&request->rq_pill);
+
+	/* We must take it off the imp_replay_list first.  Otherwise, we'll set
+	 * request->rq_reqmsg to NULL while osc_close is dereferencing it. */
+	if (request->rq_import != NULL) {
+		if (!locked)
+			spin_lock(&request->rq_import->imp_lock);
+		list_del_init(&request->rq_replay_list);
+		if (!locked)
+			spin_unlock(&request->rq_import->imp_lock);
+	}
+	LASSERTF(list_empty(&request->rq_replay_list), "req %p\n", request);
+
+	if (atomic_read(&request->rq_refcount) != 0) {
+		DEBUG_REQ(D_ERROR, request,
+			  "freeing request with nonzero refcount");
+		LBUG();
+	}
+
+	if (request->rq_repbuf != NULL)
+		sptlrpc_cli_free_repbuf(request);
+	if (request->rq_export != NULL) {
+		class_export_put(request->rq_export);
+		request->rq_export = NULL;
+	}
+	if (request->rq_import != NULL) {
+		class_import_put(request->rq_import);
+		request->rq_import = NULL;
+	}
+	if (request->rq_bulk != NULL)
+		ptlrpc_free_bulk_pin(request->rq_bulk);
+
+	if (request->rq_reqbuf != NULL || request->rq_clrbuf != NULL)
+		sptlrpc_cli_free_reqbuf(request);
+
+	if (request->rq_cli_ctx)
+		sptlrpc_req_put_ctx(request, !locked);
+
+	if (request->rq_pool)
+		__ptlrpc_free_req_to_pool(request);
+	else
+		OBD_FREE(request, sizeof(*request));
+	EXIT;
+}
+
+static int __ptlrpc_req_finished(struct ptlrpc_request *request, int locked);
+/**
+ * Drop one request reference. Must be called with import imp_lock held.
+ * When reference count drops to zero, request is freed.
+ */
+void ptlrpc_req_finished_with_imp_lock(struct ptlrpc_request *request)
+{
+	/* assert_spin_locked() rather than LASSERT(spin_is_locked()):
+	 * spin_is_locked() is constant 0 on uniprocessor builds without
+	 * spinlock debugging, which would make the assertion always fire. */
+	assert_spin_locked(&request->rq_import->imp_lock);
+	(void)__ptlrpc_req_finished(request, 1);
+}
+EXPORT_SYMBOL(ptlrpc_req_finished_with_imp_lock);
+
+/**
+ * Common implementation for dropping one reference on \a request.
+ * \a locked tells the free path that the caller already holds the import's
+ * imp_lock.
+ * Returns 1 when \a request is NULL, poisoned, or was freed because this was
+ * the last reference; returns 0 when other references remain.
+ */
+static int __ptlrpc_req_finished(struct ptlrpc_request *request, int locked)
+{
+	ENTRY;
+	if (request == NULL)
+		RETURN(1);
+
+	if (request == LP_POISON ||
+	    request->rq_reqmsg == LP_POISON) {
+		CERROR("dereferencing freed request (bug 575)\n");
+		LBUG();
+		RETURN(1);
+	}
+
+	DEBUG_REQ(D_INFO, request, "refcount now %u",
+		  atomic_read(&request->rq_refcount) - 1);
+
+	/* someone else still holds a reference: nothing more to do */
+	if (!atomic_dec_and_test(&request->rq_refcount))
+		RETURN(0);
+
+	__ptlrpc_free_req(request, locked);
+	RETURN(1);
+}
+
+/**
+ * Drop one reference on \a request for callers that do not hold the
+ * import's imp_lock; the request is freed once the last reference is gone.
+ */
+void ptlrpc_req_finished(struct ptlrpc_request *request)
+{
+	(void)__ptlrpc_req_finished(request, 0);
+}
+EXPORT_SYMBOL(ptlrpc_req_finished);
+
+/**
+ * Accessor: returns the rq_xid field of \a request.
+ */
+__u64 ptlrpc_req_xid(struct ptlrpc_request *request)
+{
+	__u64 xid = request->rq_xid;
+
+	return xid;
+}
+EXPORT_SYMBOL(ptlrpc_req_xid);
+
+/**
+ * Disengage the client's reply buffer from the network
+ * NB does _NOT_ unregister any client-side bulk.
+ * IDEMPOTENT, but _not_ safe against concurrent callers.
+ * The request owner (i.e. the thread doing the I/O) must call...
+ * Returns 1 once the reply buffer is no longer receiving or awaiting unlink
+ * (immediately, or after waiting when \a async is 0), or 0 if \a async is
+ * set and the unlink has not completed yet; in that case the request has
+ * been moved to RQ_PHASE_UNREGISTERING.
+ */
+int ptlrpc_unregister_reply(struct ptlrpc_request *request, int async)
+{
+	int		rc;
+	wait_queue_head_t       *wq;
+	struct l_wait_info lwi;
+
+	/*
+	 * Might sleep.
+	 */
+	LASSERT(!in_interrupt());
+
+	/*
+	 * Let's setup deadline for reply unlink.
+	 */
+	if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) &&
+	    async && request->rq_reply_deadline == 0)
+		request->rq_reply_deadline = cfs_time_current_sec()+LONG_UNLINK;
+
+	/*
+	 * Nothing left to do.
+	 */
+	if (!ptlrpc_client_recv_or_unlink(request))
+		RETURN(1);
+
+	LNetMDUnlink(request->rq_reply_md_h);
+
+	/*
+	 * Let's check it once again.
+	 */
+	if (!ptlrpc_client_recv_or_unlink(request))
+		RETURN(1);
+
+	/*
+	 * Move to "Unregistering" phase as reply was not unlinked yet.
+	 */
+	ptlrpc_rqphase_move(request, RQ_PHASE_UNREGISTERING);
+
+	/*
+	 * Do not wait for unlink to finish.
+	 */
+	if (async)
+		RETURN(0);
+
+	/*
+	 * We have to l_wait_event() whatever the result, to give liblustre
+	 * a chance to run reply_in_callback(), and to make sure we've
+	 * unlinked before returning a req to the pool.
+	 */
+	if (request->rq_set != NULL)
+		wq = &request->rq_set->set_waitq;
+	else
+		wq = &request->rq_reply_waitq;
+
+	for (;;) {
+		/* Network access will complete in finite time but the HUGE
+		 * timeout lets us CWARN for visibility of sluggish NALs */
+		lwi = LWI_TIMEOUT_INTERVAL(cfs_time_seconds(LONG_UNLINK),
+					   cfs_time_seconds(1), NULL, NULL);
+		rc = l_wait_event(*wq, !ptlrpc_client_recv_or_unlink(request),
+				  &lwi);
+		if (rc == 0) {
+			ptlrpc_rqphase_move(request, request->rq_next_phase);
+			RETURN(1);
+		}
+
+		LASSERT(rc == -ETIMEDOUT);
+		DEBUG_REQ(D_WARNING, request, "Unexpectedly long timeout "
+			  "rvcng=%d unlnk=%d", request->rq_receiving_reply,
+			  request->rq_must_unlink);
+	}
+	RETURN(0); /* not reached: the loop above only exits via RETURN(1) */
+}
+EXPORT_SYMBOL(ptlrpc_unregister_reply);
+
+/**
+ * Iterates through replay_list on import and prunes
+ * all requests that have transno smaller than last_committed for the
+ * import and don't have rq_replay set.  Requests from an older import
+ * generation are pruned regardless of rq_replay or transno.
+ * Since requests are sorted in transno order, stops when meeting the first
+ * transno bigger than last_committed.
+ * The scan is skipped when neither the committed transno nor the import
+ * generation changed since the previous call.
+ * caller must hold imp->imp_lock
+ */
+void ptlrpc_free_committed(struct obd_import *imp)
+{
+	struct list_head *tmp, *saved;
+	struct ptlrpc_request *req;
+	struct ptlrpc_request *last_req = NULL; /* temporary fire escape */
+	ENTRY;
+
+	LASSERT(imp != NULL);
+
+	/* assert_spin_locked() rather than LASSERT(spin_is_locked()):
+	 * spin_is_locked() is constant 0 on uniprocessor builds without
+	 * spinlock debugging, which would make the assertion always fire. */
+	assert_spin_locked(&imp->imp_lock);
+
+	if (imp->imp_peer_committed_transno == imp->imp_last_transno_checked &&
+	    imp->imp_generation == imp->imp_last_generation_checked) {
+		CDEBUG(D_INFO, "%s: skip recheck: last_committed "LPU64"\n",
+		       imp->imp_obd->obd_name, imp->imp_peer_committed_transno);
+		EXIT;
+		return;
+	}
+	CDEBUG(D_RPCTRACE, "%s: committing for last_committed "LPU64" gen %d\n",
+	       imp->imp_obd->obd_name, imp->imp_peer_committed_transno,
+	       imp->imp_generation);
+	imp->imp_last_transno_checked = imp->imp_peer_committed_transno;
+	imp->imp_last_generation_checked = imp->imp_generation;
+
+	list_for_each_safe(tmp, saved, &imp->imp_replay_list) {
+		req = list_entry(tmp, struct ptlrpc_request,
+				     rq_replay_list);
+
+		/* XXX ok to remove when 1357 resolved - rread 05/29/03  */
+		LASSERT(req != last_req);
+		last_req = req;
+
+		if (req->rq_transno == 0) {
+			DEBUG_REQ(D_EMERG, req, "zero transno during replay");
+			LBUG();
+		}
+		if (req->rq_import_generation < imp->imp_generation) {
+			DEBUG_REQ(D_RPCTRACE, req, "free request with old gen");
+			GOTO(free_req, 0);
+		}
+
+		if (req->rq_replay) {
+			DEBUG_REQ(D_RPCTRACE, req, "keeping (FL_REPLAY)");
+			continue;
+		}
+
+		/* not yet committed */
+		if (req->rq_transno > imp->imp_peer_committed_transno) {
+			DEBUG_REQ(D_RPCTRACE, req, "stopping search");
+			break;
+		}
+
+		DEBUG_REQ(D_INFO, req, "commit (last_committed "LPU64")",
+			  imp->imp_peer_committed_transno);
+free_req:
+		spin_lock(&req->rq_lock);
+		req->rq_replay = 0;
+		spin_unlock(&req->rq_lock);
+		if (req->rq_commit_cb != NULL)
+			req->rq_commit_cb(req);
+		list_del_init(&req->rq_replay_list);
+		/* imp_lock is held, so use the "locked" variant */
+		__ptlrpc_req_finished(req, 1);
+	}
+
+	EXIT;
+	return;
+}
+
+/**
+ * Client-side cleanup hook for import \a imp.
+ * Currently a no-op: the body contains nothing but the ENTRY/EXIT macros.
+ */
+void ptlrpc_cleanup_client(struct obd_import *imp)
+{
+	ENTRY;
+	EXIT;
+}
+EXPORT_SYMBOL(ptlrpc_cleanup_client);
+
+/**
+ * Mark a previously sent request \a req for resending: its message handle
+ * is zeroed, rq_status becomes -EAGAIN, rq_resend is set and the
+ * rq_net_err / rq_timedout flags are cleared under rq_lock.
+ * A request carrying bulk data is given a new xid (to avoid problems with
+ * lost replies and therefore several transfers landing into the same buffer
+ * from different sending attempts).
+ */
+void ptlrpc_resend_req(struct ptlrpc_request *req)
+{
+	struct lustre_handle zero_handle = { 0 };
+	__u64 prev_xid;
+
+	DEBUG_REQ(D_HA, req, "going to resend");
+	lustre_msg_set_handle(req->rq_reqmsg, &zero_handle);
+	req->rq_status = -EAGAIN;
+
+	spin_lock(&req->rq_lock);
+	req->rq_resend = 1;
+	req->rq_net_err = 0;
+	req->rq_timedout = 0;
+	if (req->rq_bulk != NULL) {
+		/* a fresh xid ensures the previous bulk fails */
+		prev_xid = req->rq_xid;
+		req->rq_xid = ptlrpc_next_xid();
+		CDEBUG(D_HA, "resend bulk old x"LPU64" new x"LPU64"\n",
+		       prev_xid, req->rq_xid);
+	}
+	ptlrpc_client_wake_req(req);
+	spin_unlock(&req->rq_lock);
+}
+EXPORT_SYMBOL(ptlrpc_resend_req);
+
+/* Flag request \a req for restart: rq_status becomes -ERESTARTSYS, then
+ * under rq_lock rq_restart is set, rq_timedout is cleared and
+ * ptlrpc_client_wake_req() is called.
+ * XXX: this function is reportedly unused.
+ * NOTE(review): the original note also called rq_status unused, but
+ * rq_status is read and written throughout this file; it possibly meant
+ * rq_restart - confirm. */
+void ptlrpc_restart_req(struct ptlrpc_request *req)
+{
+	DEBUG_REQ(D_HA, req, "restarting (possibly-)completed request");
+	req->rq_status = -ERESTARTSYS;
+
+	spin_lock(&req->rq_lock);
+	req->rq_restart = 1;
+	req->rq_timedout = 0;
+	ptlrpc_client_wake_req(req);
+	spin_unlock(&req->rq_lock);
+}
+EXPORT_SYMBOL(ptlrpc_restart_req);
+
+/**
+ * Grab additional reference on a request \a req by incrementing its
+ * rq_refcount.  Returns \a req itself.
+ * The matching release is ptlrpc_req_finished(), which decrements the same
+ * counter.
+ */
+struct ptlrpc_request *ptlrpc_request_addref(struct ptlrpc_request *req)
+{
+	ENTRY;
+	atomic_inc(&req->rq_refcount);
+	RETURN(req);
+}
+EXPORT_SYMBOL(ptlrpc_request_addref);
+
+/**
+ * Add a request to import replay_list, keeping the list sorted by
+ * ascending transno (and by ascending xid among equal transnos).
+ * A request that is already on a replay list only has its MSG_RESENT flag
+ * cleared; otherwise MSG_REPLAY is set and an extra reference is taken on
+ * the request for the list.
+ * Must be called under imp_lock
+ */
+void ptlrpc_retain_replayable_request(struct ptlrpc_request *req,
+				      struct obd_import *imp)
+{
+	struct list_head *tmp;
+
+	/* assert_spin_locked() rather than LASSERT(spin_is_locked()):
+	 * spin_is_locked() is constant 0 on uniprocessor builds without
+	 * spinlock debugging, which would make the assertion always fire. */
+	assert_spin_locked(&imp->imp_lock);
+
+	if (req->rq_transno == 0) {
+		DEBUG_REQ(D_EMERG, req, "saving request with zero transno");
+		LBUG();
+	}
+
+	/* clear this for new requests that were resent as well
+	   as resent replayed requests. */
+	lustre_msg_clear_flags(req->rq_reqmsg, MSG_RESENT);
+
+	/* don't re-add requests that have been replayed */
+	if (!list_empty(&req->rq_replay_list))
+		return;
+
+	lustre_msg_add_flags(req->rq_reqmsg, MSG_REPLAY);
+
+	LASSERT(imp->imp_replayable);
+	/* Balanced in ptlrpc_free_committed, usually. */
+	ptlrpc_request_addref(req);
+	/* walk backwards: the insertion point is usually near the tail */
+	list_for_each_prev(tmp, &imp->imp_replay_list) {
+		struct ptlrpc_request *iter =
+			list_entry(tmp, struct ptlrpc_request,
+				       rq_replay_list);
+
+		/* We may have duplicate transnos if we create and then
+		 * open a file, or for closes retained if to match creating
+		 * opens, so use req->rq_xid as a secondary key.
+		 * (See bugs 684, 685, and 428.)
+		 * XXX no longer needed, but all opens need transnos!
+		 */
+		if (iter->rq_transno > req->rq_transno)
+			continue;
+
+		if (iter->rq_transno == req->rq_transno) {
+			LASSERT(iter->rq_xid != req->rq_xid);
+			if (iter->rq_xid > req->rq_xid)
+				continue;
+		}
+
+		/* iter sorts before req: insert right after it */
+		list_add(&req->rq_replay_list, &iter->rq_replay_list);
+		return;
+	}
+
+	/* nothing on the list sorts before req: it becomes the new head */
+	list_add(&req->rq_replay_list, &imp->imp_replay_list);
+}
+EXPORT_SYMBOL(ptlrpc_retain_replayable_request);
+
+/**
+ * Send request and wait until it completes.
+ * Returns request processing status.
+ *
+ * A temporary request set is allocated to carry \a req; the value
+ * returned is that of ptlrpc_set_wait() on it, or -ENOMEM when the set
+ * cannot be allocated.  \a req must not already belong to a set and must
+ * not be receiving a reply.
+ */
+int ptlrpc_queue_wait(struct ptlrpc_request *req)
+{
+	struct ptlrpc_request_set *set;
+	int rc;
+	ENTRY;
+
+	LASSERT(req->rq_set == NULL);
+	LASSERT(!req->rq_receiving_reply);
+
+	set = ptlrpc_prep_set();
+	if (set == NULL) {
+		/* terminate the message so it does not run into the next
+		 * line of console/debug output */
+		CERROR("Unable to allocate ptlrpc set.\n");
+		RETURN(-ENOMEM);
+	}
+
+	/* for distributed debugging */
+	lustre_msg_set_status(req->rq_reqmsg, current_pid());
+
+	/* add a ref for the set (see comment in ptlrpc_set_add_req) */
+	ptlrpc_request_addref(req);
+	ptlrpc_set_add_req(set, req);
+	rc = ptlrpc_set_wait(set);
+	ptlrpc_set_destroy(set);
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(ptlrpc_queue_wait);
+
+/**
+ * State saved in the request's async args by ptlrpc_replay_req() and
+ * consumed by ptlrpc_replay_interpret() once the replay has completed.
+ */
+struct ptlrpc_replay_async_args {
+	/* rq_send_state of the request before it was switched to replay */
+	int praa_old_state;
+	/* status of the previous reply message, if there was one */
+	int praa_old_status;
+};
+
+/**
+ * Callback used for replayed requests reply processing.
+ * In case of a successful reply, calls the registered request replay
+ * callback.  In case of error, restarts the replay process.
+ *
+ * \a data points to the struct ptlrpc_replay_async_args filled in by
+ * ptlrpc_replay_req().  Returns 0 when recovery may continue; any
+ * non-zero result also triggers ptlrpc_connect_import() on the import.
+ */
+static int ptlrpc_replay_interpret(const struct lu_env *env,
+				   struct ptlrpc_request *req,
+				   void * data, int rc)
+{
+	struct ptlrpc_replay_async_args *aa = data;
+	struct obd_import *imp = req->rq_import;
+
+	ENTRY;
+	/* balances the increment done in ptlrpc_replay_req() */
+	atomic_dec(&imp->imp_replay_inflight);
+
+	if (!ptlrpc_client_replied(req)) {
+		CERROR("request replay timed out, restarting recovery\n");
+		GOTO(out, rc = -ETIMEDOUT);
+	}
+
+	if (lustre_msg_get_type(req->rq_repmsg) == PTL_RPC_MSG_ERR &&
+	    (lustre_msg_get_status(req->rq_repmsg) == -ENOTCONN ||
+	     lustre_msg_get_status(req->rq_repmsg) == -ENODEV))
+		GOTO(out, rc = lustre_msg_get_status(req->rq_repmsg));
+
+	/** VBR: check version failure */
+	if (lustre_msg_get_status(req->rq_repmsg) == -EOVERFLOW) {
+		/** replay was failed due to version mismatch */
+		/* no trailing newline here, like every other DEBUG_REQ */
+		DEBUG_REQ(D_WARNING, req, "Version mismatch during replay");
+		spin_lock(&imp->imp_lock);
+		imp->imp_vbr_failed = 1;
+		imp->imp_no_lock_replay = 1;
+		spin_unlock(&imp->imp_lock);
+		lustre_msg_set_status(req->rq_repmsg, aa->praa_old_status);
+	} else {
+		/** The transno had better not change over replay. */
+		LASSERTF(lustre_msg_get_transno(req->rq_reqmsg) ==
+			 lustre_msg_get_transno(req->rq_repmsg) ||
+			 lustre_msg_get_transno(req->rq_repmsg) == 0,
+			 LPX64"/"LPX64"\n",
+			 lustre_msg_get_transno(req->rq_reqmsg),
+			 lustre_msg_get_transno(req->rq_repmsg));
+	}
+
+	spin_lock(&imp->imp_lock);
+	/** if replays by version then gap occur on server, no trust to locks */
+	if (lustre_msg_get_flags(req->rq_repmsg) & MSG_VERSION_REPLAY)
+		imp->imp_no_lock_replay = 1;
+	imp->imp_last_replay_transno = lustre_msg_get_transno(req->rq_reqmsg);
+	spin_unlock(&imp->imp_lock);
+	LASSERT(imp->imp_last_replay_transno);
+
+	/* transaction number shouldn't be bigger than the latest replayed */
+	if (req->rq_transno > lustre_msg_get_transno(req->rq_reqmsg)) {
+		DEBUG_REQ(D_ERROR, req,
+			  "Reported transno "LPU64" is bigger than the "
+			  "replayed one: "LPU64, req->rq_transno,
+			  lustre_msg_get_transno(req->rq_reqmsg));
+		GOTO(out, rc = -EINVAL);
+	}
+
+	DEBUG_REQ(D_HA, req, "got rep");
+
+	/* let the callback do fixups, possibly including in the request */
+	if (req->rq_replay_cb)
+		req->rq_replay_cb(req);
+
+	if (ptlrpc_client_replied(req) &&
+	    lustre_msg_get_status(req->rq_repmsg) != aa->praa_old_status) {
+		DEBUG_REQ(D_ERROR, req, "status %d, old was %d",
+			  lustre_msg_get_status(req->rq_repmsg),
+			  aa->praa_old_status);
+	} else {
+		/* Put it back for re-replay. */
+		lustre_msg_set_status(req->rq_repmsg, aa->praa_old_status);
+	}
+
+	/*
+	 * Errors while replay can set transno to 0, but
+	 * imp_last_replay_transno shouldn't be set to 0 anyway
+	 */
+	if (req->rq_transno == 0)
+		CERROR("Transno is 0 during replay!\n");
+
+	/* continue with recovery */
+	rc = ptlrpc_import_recovery_state_machine(imp);
+ out:
+	/* restore the send state saved by ptlrpc_replay_req() */
+	req->rq_send_state = aa->praa_old_state;
+
+	if (rc != 0)
+		/* this replay failed, so restart recovery */
+		ptlrpc_connect_import(imp);
+
+	RETURN(rc);
+}
+
+/**
+ * Prepares and queues request for replay.
+ * Adds it to ptlrpcd queue for actual sending.
+ * Returns 0 on success.
+ *
+ * The import of \a req must be in LUSTRE_IMP_REPLAY state.  The previous
+ * send state and, when a reply message exists, its status are stashed in
+ * the request's async args so that ptlrpc_replay_interpret() can restore
+ * them when the replay completes.
+ */
+int ptlrpc_replay_req(struct ptlrpc_request *req)
+{
+	struct ptlrpc_replay_async_args *aa;
+	ENTRY;
+
+	LASSERT(req->rq_import->imp_state == LUSTRE_IMP_REPLAY);
+
+	/* both sizes are compile-time constants, so check at build time
+	 * (as ptlrpcd_alloc_work() does) rather than on every call */
+	CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
+	aa = ptlrpc_req_async_args(req);
+	memset(aa, 0, sizeof *aa);
+
+	/* Prepare request to be resent with ptlrpcd */
+	aa->praa_old_state = req->rq_send_state;
+	req->rq_send_state = LUSTRE_IMP_REPLAY;
+	req->rq_phase = RQ_PHASE_NEW;
+	req->rq_next_phase = RQ_PHASE_UNDEFINED;
+	if (req->rq_repmsg)
+		aa->praa_old_status = lustre_msg_get_status(req->rq_repmsg);
+	req->rq_status = 0;
+	req->rq_interpret_reply = ptlrpc_replay_interpret;
+	/* Readjust the timeout for current conditions */
+	ptlrpc_at_set_req_timeout(req);
+
+	/* Tell server the net_latency, so the server can calculate how long
+	 * it should wait for next replay */
+	lustre_msg_set_service_time(req->rq_reqmsg,
+				    ptlrpc_at_get_net_latency(req));
+	DEBUG_REQ(D_HA, req, "REPLAY");
+
+	/* decremented again in ptlrpc_replay_interpret() */
+	atomic_inc(&req->rq_import->imp_replay_inflight);
+	ptlrpc_request_addref(req); /* ptlrpcd needs a ref */
+
+	ptlrpcd_add_req(req, PDL_POLICY_LOCAL, -1);
+	RETURN(0);
+}
+EXPORT_SYMBOL(ptlrpc_replay_req);
+
+/**
+ * Aborts all in-flight request on import \a imp sending and delayed lists
+ *
+ * Only requests whose rq_import_generation is older than the import's
+ * current imp_generation are touched: they get rq_err set, rq_status set
+ * to -EIO and ptlrpc_client_wake_req() called on them.  For a replayable
+ * import ptlrpc_free_committed() is run afterwards.  The whole operation
+ * runs under imp_lock.
+ */
+void ptlrpc_abort_inflight(struct obd_import *imp)
+{
+	struct list_head *tmp, *n;
+	ENTRY;
+
+	/* Make sure that no new requests get processed for this import.
+	 * ptlrpc_{queue,set}_wait must (and does) hold imp_lock while testing
+	 * this flag and then putting requests on sending_list or delayed_list.
+	 */
+	spin_lock(&imp->imp_lock);
+
+	/* XXX locking?  Maybe we should remove each request with the list
+	 * locked?  Also, how do we know if the requests on the list are
+	 * being freed at this time?
+	 */
+	list_for_each_safe(tmp, n, &imp->imp_sending_list) {
+		struct ptlrpc_request *req =
+			list_entry(tmp, struct ptlrpc_request, rq_list);
+
+		DEBUG_REQ(D_RPCTRACE, req, "inflight");
+
+		spin_lock(&req->rq_lock);
+		/* requests of the current generation are left alone */
+		if (req->rq_import_generation < imp->imp_generation) {
+			req->rq_err = 1;
+			req->rq_status = -EIO;
+			ptlrpc_client_wake_req(req);
+		}
+		spin_unlock(&req->rq_lock);
+	}
+
+	/* same treatment for requests parked on the delayed list */
+	list_for_each_safe(tmp, n, &imp->imp_delayed_list) {
+		struct ptlrpc_request *req =
+			list_entry(tmp, struct ptlrpc_request, rq_list);
+
+		DEBUG_REQ(D_RPCTRACE, req, "aborting waiting req");
+
+		spin_lock(&req->rq_lock);
+		if (req->rq_import_generation < imp->imp_generation) {
+			req->rq_err = 1;
+			req->rq_status = -EIO;
+			ptlrpc_client_wake_req(req);
+		}
+		spin_unlock(&req->rq_lock);
+	}
+
+	/* Last chance to free reqs left on the replay list, but we
+	 * will still leak reqs that haven't committed.  */
+	if (imp->imp_replayable)
+		ptlrpc_free_committed(imp);
+
+	spin_unlock(&imp->imp_lock);
+
+	EXIT;
+}
+EXPORT_SYMBOL(ptlrpc_abort_inflight);
+
+/**
+ * Abort all uncompleted requests in request set \a set
+ *
+ * Every request of the set that is in RQ_PHASE_RPC is flagged with rq_err,
+ * gets rq_status = -EINTR and has ptlrpc_client_wake_req() called on it,
+ * all under its own rq_lock.  Requests in any other phase are untouched.
+ */
+void ptlrpc_abort_set(struct ptlrpc_request_set *set)
+{
+	struct list_head *cur, *next;
+
+	LASSERT(set != NULL);
+
+	list_for_each_safe(cur, next, &set->set_requests) {
+		struct ptlrpc_request *req;
+
+		req = list_entry(cur, struct ptlrpc_request, rq_set_chain);
+
+		spin_lock(&req->rq_lock);
+		if (req->rq_phase == RQ_PHASE_RPC) {
+			req->rq_err = 1;
+			req->rq_status = -EINTR;
+			ptlrpc_client_wake_req(req);
+		}
+		spin_unlock(&req->rq_lock);
+	}
+}
+
+/* Most recently handed out XID; advanced by ptlrpc_next_xid(). */
+static __u64 ptlrpc_last_xid;
+/* Serializes updates of ptlrpc_last_xid; set up in ptlrpc_init_xid(). */
+static spinlock_t ptlrpc_last_xid_lock;
+
+/**
+ * Initialize the XID for the node.  This is common among all requests on
+ * this node, and only requires the property that it is monotonically
+ * increasing.  It does not need to be sequential.  Since this is also used
+ * as the RDMA match bits, it is important that a single client NOT have
+ * the same match bits for two different in-flight requests, hence we do
+ * NOT want to have an XID per target or similar.
+ *
+ * To avoid an unlikely collision between match bits after a client reboot
+ * (which would deliver old data into the wrong RDMA buffer) initialize
+ * the XID based on the current time, assuming a maximum RPC rate of 1M RPC/s.
+ * If the time is clearly incorrect, we instead use a 62-bit random number.
+ * In the worst case the random number will overflow 1M RPCs per second in
+ * 9133 years, or permutations thereof.
+ */
+/* 2^30 seconds since the epoch, i.e. a date early in 2004; any clock
+ * reading below this is treated as "clearly incorrect" */
+#define YEAR_2004 (1ULL << 30)
+void ptlrpc_init_xid(void)
+{
+	time_t now = cfs_time_current_sec();
+
+	spin_lock_init(&ptlrpc_last_xid_lock);
+	if (now < YEAR_2004) {
+		/* random start value: drop two bits, then force bit 61 on */
+		cfs_get_random_bytes(&ptlrpc_last_xid, sizeof(ptlrpc_last_xid));
+		ptlrpc_last_xid >>= 2;
+		ptlrpc_last_xid |= (1ULL << 61);
+	} else {
+		/* 2^20 XIDs per second of wall-clock time (~1M RPC/s) */
+		ptlrpc_last_xid = (__u64)now << 20;
+	}
+
+	/* Need to always be aligned to a power-of-two for multi-bulk BRW */
+	CLASSERT((PTLRPC_BULK_OPS_COUNT & (PTLRPC_BULK_OPS_COUNT - 1)) == 0);
+	ptlrpc_last_xid &= PTLRPC_BULK_OPS_MASK;
+}
+
+/**
+ * Increase xid and returns resulting new value to the caller.
+ *
+ * Multi-bulk BRW RPCs consume multiple XIDs for each bulk transfer, starting
+ * at the returned xid, up to xid + PTLRPC_BULK_OPS_COUNT - 1. The BRW RPC
+ * itself uses the last bulk xid needed, so the server can determine the
+ * number of bulk transfers from the RPC XID and a bitmask.  The starting
+ * xid must align to a power-of-two value.
+ *
+ * This is assumed to be true due to the initial ptlrpc_last_xid
+ * value also being initialized to a power-of-two value. LU-1431
+ */
+__u64 ptlrpc_next_xid(void)
+{
+	__u64 xid;
+
+	/* advance the shared counter by a whole bulk-ops window and hand
+	 * the new value out, all under the xid lock */
+	spin_lock(&ptlrpc_last_xid_lock);
+	ptlrpc_last_xid += PTLRPC_BULK_OPS_COUNT;
+	xid = ptlrpc_last_xid;
+	spin_unlock(&ptlrpc_last_xid_lock);
+
+	return xid;
+}
+EXPORT_SYMBOL(ptlrpc_next_xid);
+
+/**
+ * Get a glimpse at what next xid value might have been.
+ * Returns possible next xid.
+ *
+ * The shared counter is only read, never advanced, so the value may be
+ * stale by the time the caller looks at it.
+ */
+__u64 ptlrpc_sample_next_xid(void)
+{
+	__u64 last;
+
+#if BITS_PER_LONG == 32
+	/* need to avoid possible word tearing on 32-bit systems */
+	spin_lock(&ptlrpc_last_xid_lock);
+	last = ptlrpc_last_xid;
+	spin_unlock(&ptlrpc_last_xid_lock);
+#else
+	/* No need to lock, since returned value is racy anyways */
+	last = ptlrpc_last_xid;
+#endif
+
+	return last + PTLRPC_BULK_OPS_COUNT;
+}
+EXPORT_SYMBOL(ptlrpc_sample_next_xid);
+
+/**
+ * Functions for operating ptlrpc workers.
+ *
+ * A ptlrpc work is a function which will be running inside ptlrpc context.
+ * The callback shouldn't sleep otherwise it will block that ptlrpcd thread.
+ *
+ * 1. after a work is created, it can be used many times, that is:
+ *	 handler = ptlrpcd_alloc_work();
+ *	 ptlrpcd_queue_work();
+ *
+ *    queue it again when necessary:
+ *	 ptlrpcd_queue_work();
+ *	 ptlrpcd_destroy_work();
+ * 2. ptlrpcd_queue_work() can be called by multiple processes meanwhile, but
+ *    it will only be queued once in any time. Also as its name implies, it may
+ *    have delay before it really runs by ptlrpcd thread.
+ */
+/**
+ * Per-work data kept in the async args of the request that backs a
+ * ptlrpc work item; filled in by ptlrpcd_alloc_work() and read back by
+ * work_interpreter().
+ */
+struct ptlrpc_work_async_args {
+	/* always PTLRPC_WORK_MAGIC; asserted before the callback is run */
+	__u64   magic;
+	/* work function supplied by the creator of the work item */
+	int   (*cb)(const struct lu_env *, void *);
+	/* opaque argument handed to cb */
+	void   *cbdata;
+};
+
+#define PTLRPC_WORK_MAGIC 0x6655436b676f4f44ULL /* magic code */
+
+/**
+ * Reply-interpret callback installed on work requests by
+ * ptlrpcd_alloc_work().
+ *
+ * \a data is the request's struct ptlrpc_work_async_args.  After checking
+ * the magic and the callback pointer it runs the work callback with
+ * \a env and the stored cbdata and returns the callback's result.
+ * \a req and \a rc are not used.
+ */
+static int work_interpreter(const struct lu_env *env,
+			    struct ptlrpc_request *req, void *data, int rc)
+{
+	struct ptlrpc_work_async_args *arg = data;
+
+	LASSERT(arg->magic == PTLRPC_WORK_MAGIC);
+	LASSERT(arg->cb != NULL);
+
+	return arg->cb(env, arg->cbdata);
+}
+
+/**
+ * Create a work for ptlrpc.
+ *
+ * Allocates a request that never goes on the wire and only serves to get
+ * \a cb called with \a cbdata through work_interpreter().
+ *
+ * \param imp     import the work request is attached to (a reference is
+ *                obtained with class_import_get())
+ * \param cb      work function; must not be NULL
+ * \param cbdata  opaque argument passed to \a cb
+ *
+ * Returns the opaque work handle (the request pointer) on success,
+ * ERR_PTR(-EINVAL) if \a cb is NULL, or ERR_PTR(-ENOMEM) if the request
+ * cannot be allocated.  May sleep.
+ */
+void *ptlrpcd_alloc_work(struct obd_import *imp,
+			 int (*cb)(const struct lu_env *, void *), void *cbdata)
+{
+	struct ptlrpc_request	 *req = NULL;
+	struct ptlrpc_work_async_args *args;
+	ENTRY;
+
+	might_sleep();
+
+	if (cb == NULL)
+		RETURN(ERR_PTR(-EINVAL));
+
+	/* copy some code from deprecated fakereq. */
+	OBD_ALLOC_PTR(req);
+	if (req == NULL) {
+		CERROR("ptlrpc: run out of memory!\n");
+		RETURN(ERR_PTR(-ENOMEM));
+	}
+
+	req->rq_send_state = LUSTRE_IMP_FULL;
+	req->rq_type = PTL_RPC_MSG_REQUEST;
+	req->rq_import = class_import_get(imp);
+	req->rq_export = NULL;
+	req->rq_interpret_reply = work_interpreter;
+	/* don't want reply */
+	req->rq_receiving_reply = 0;
+	req->rq_must_unlink = 0;
+	req->rq_no_delay = req->rq_no_resend = 1;
+
+	spin_lock_init(&req->rq_lock);
+	INIT_LIST_HEAD(&req->rq_list);
+	INIT_LIST_HEAD(&req->rq_replay_list);
+	INIT_LIST_HEAD(&req->rq_set_chain);
+	INIT_LIST_HEAD(&req->rq_history_list);
+	INIT_LIST_HEAD(&req->rq_exp_list);
+	init_waitqueue_head(&req->rq_reply_waitq);
+	init_waitqueue_head(&req->rq_set_waitq);
+	/* the single reference is the creator's; ptlrpcd_queue_work()
+	 * relies on this count to tell whether the work is queued */
+	atomic_set(&req->rq_refcount, 1);
+
+	CLASSERT (sizeof(*args) <= sizeof(req->rq_async_args));
+	args = ptlrpc_req_async_args(req);
+	args->magic  = PTLRPC_WORK_MAGIC;
+	args->cb     = cb;
+	args->cbdata = cbdata;
+
+	RETURN(req);
+}
+EXPORT_SYMBOL(ptlrpcd_alloc_work);
+
+/**
+ * Release a work handle obtained from ptlrpcd_alloc_work().
+ * A NULL \a handler is accepted and ignored.
+ */
+void ptlrpcd_destroy_work(void *handler)
+{
+	struct ptlrpc_request *work_req = handler;
+
+	if (work_req == NULL)
+		return;
+
+	ptlrpc_req_finished(work_req);
+}
+EXPORT_SYMBOL(ptlrpcd_destroy_work);
+
+/**
+ * Queue the work identified by \a handler (as returned by
+ * ptlrpcd_alloc_work()) to ptlrpcd.
+ *
+ * Returns 0 once the work request has been handed to ptlrpcd_add_req(),
+ * or -EBUSY if the request's refcount shows it is already queued.
+ */
+int ptlrpcd_queue_work(void *handler)
+{
+	struct ptlrpc_request *req = handler;
+
+	/*
+	 * Check if the req is already being queued.
+	 *
+	 * Here comes a trick: it lacks a way of checking if a req is being
+	 * processed reliably in ptlrpc. Here I have to use refcount of req
+	 * for this purpose. This is okay because the caller should use this
+	 * req as opaque data. - Jinshan
+	 */
+	LASSERT(atomic_read(&req->rq_refcount) > 0);
+	if (atomic_read(&req->rq_refcount) > 1)
+		return -EBUSY;
+
+	/* the check above is not atomic with the increment: if another
+	 * caller got in between, undo our increment and back off */
+	if (atomic_inc_return(&req->rq_refcount) > 2) { /* race */
+		atomic_dec(&req->rq_refcount);
+		return -EBUSY;
+	}
+
+	/* re-initialize the req */
+	req->rq_timeout	= obd_timeout;
+	req->rq_sent	   = cfs_time_current_sec();
+	req->rq_deadline       = req->rq_sent + req->rq_timeout;
+	req->rq_reply_deadline = req->rq_deadline;
+	/* start directly in the interpret phase: nothing is sent */
+	req->rq_phase	  = RQ_PHASE_INTERPRET;
+	req->rq_next_phase     = RQ_PHASE_COMPLETE;
+	req->rq_xid	    = ptlrpc_next_xid();
+	req->rq_import_generation = req->rq_import->imp_generation;
+
+	ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1);
+	return 0;
+}
+EXPORT_SYMBOL(ptlrpcd_queue_work);

diff --git a/drivers/staging/lustre/lustre/ptlrpc/connection.c b/drivers/staging/lustre/lustre/ptlrpc/connection.c
new file mode 100644
index 0000000..a0757f3
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/connection.c

@@ -0,0 +1,248 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+#include <obd_support.h>
+#include <obd_class.h>
+#include <lustre_net.h>
+
+#include "ptlrpc_internal.h"
+
+/* Hash of all known connections, keyed by peer process id; created in
+ * ptlrpc_connection_init() and released in ptlrpc_connection_fini(). */
+static cfs_hash_t *conn_hash = NULL;
+/* Operations table for conn_hash; defined at the end of this file. */
+static cfs_hash_ops_t conn_hash_ops;
+
+/**
+ * Look up the connection to \a peer in the connection hash, creating and
+ * inserting a new one if none exists yet.
+ *
+ * \param peer  process id of the remote end; used as the hash key
+ * \param self  local NID, recorded in a newly created connection
+ * \param uuid  optional remote UUID, copied into a newly created
+ *              connection when not NULL
+ *
+ * Returns the connection, or NULL if a new one could not be allocated.
+ */
+struct ptlrpc_connection *
+ptlrpc_connection_get(lnet_process_id_t peer, lnet_nid_t self,
+		      struct obd_uuid *uuid)
+{
+	struct ptlrpc_connection *conn, *conn2;
+	ENTRY;
+
+	/* fast path: the connection is already cached */
+	conn = cfs_hash_lookup(conn_hash, &peer);
+	if (conn)
+		GOTO(out, conn);
+
+	OBD_ALLOC_PTR(conn);
+	if (!conn)
+		RETURN(NULL);
+
+	conn->c_peer = peer;
+	conn->c_self = self;
+	INIT_HLIST_NODE(&conn->c_hash);
+	atomic_set(&conn->c_refcount, 1);
+	if (uuid)
+		obd_str2uuid(&conn->c_remote_uuid, uuid->uuid);
+
+	/*
+	 * Add the newly created conn to the hash, on key collision we
+	 * lost a racing addition and must destroy our newly allocated
+	 * connection.  The object which exists in the hash will be
+	 * returned and may be compared against our object.
+	 */
+	/* In the function below, .hs_keycmp resolves to
+	 * conn_keycmp() */
+	/* coverity[overrun-buffer-val] */
+	conn2 = cfs_hash_findadd_unique(conn_hash, &peer, &conn->c_hash);
+	if (conn != conn2) {
+		OBD_FREE_PTR(conn);
+		conn = conn2;
+	}
+	EXIT;
+out:
+	CDEBUG(D_INFO, "conn=%p refcount %d to %s\n",
+	       conn, atomic_read(&conn->c_refcount),
+	       libcfs_nid2str(conn->c_peer.nid));
+	return conn;
+}
+EXPORT_SYMBOL(ptlrpc_connection_get);
+
+/**
+ * Drop one reference on \a conn; a NULL \a conn is accepted.
+ *
+ * Returns 1 if the reference count dropped to 1 as a result of this
+ * call, and 0 otherwise (including for a NULL \a conn).
+ */
+int ptlrpc_connection_put(struct ptlrpc_connection *conn)
+{
+	int dropped_to_one;
+	ENTRY;
+
+	if (conn == NULL)
+		RETURN(0);
+
+	LASSERT(atomic_read(&conn->c_refcount) > 1);
+
+	/*
+	 * The connection stays in the hash table and is not freed even
+	 * when the last caller releases its reference: it is kept cached
+	 * in case it is needed again.
+	 *
+	 * Deallocating it and later creating a new connection again
+	 * would be wasteful. This way we also avoid expensive locking
+	 * to protect things from a get/put race where a cached
+	 * connection that was just found is freed by
+	 * ptlrpc_connection_put().
+	 *
+	 * It is freed later, at module unload time, when the
+	 * ptlrpc_connection_fini()->lh_exit->conn_exit() path is called.
+	 */
+	dropped_to_one = (atomic_dec_return(&conn->c_refcount) == 1);
+
+	CDEBUG(D_INFO, "PUT conn=%p refcount %d to %s\n",
+	       conn, atomic_read(&conn->c_refcount),
+	       libcfs_nid2str(conn->c_peer.nid));
+
+	RETURN(dropped_to_one);
+}
+EXPORT_SYMBOL(ptlrpc_connection_put);
+
+/**
+ * Take an additional reference on \a conn.
+ *
+ * Increments c_refcount, logs the new count and returns \a conn itself.
+ * \a conn must not be NULL.
+ */
+struct ptlrpc_connection *
+ptlrpc_connection_addref(struct ptlrpc_connection *conn)
+{
+	ENTRY;
+
+	atomic_inc(&conn->c_refcount);
+	CDEBUG(D_INFO, "conn=%p refcount %d to %s\n",
+	       conn, atomic_read(&conn->c_refcount),
+	       libcfs_nid2str(conn->c_peer.nid));
+
+	RETURN(conn);
+}
+EXPORT_SYMBOL(ptlrpc_connection_addref);
+
+/**
+ * Create the global connection hash.
+ * Returns 0 on success or -ENOMEM if the hash cannot be created.
+ */
+int ptlrpc_connection_init(void)
+{
+	int rc = 0;
+	ENTRY;
+
+	conn_hash = cfs_hash_create("CONN_HASH",
+				    HASH_CONN_CUR_BITS,
+				    HASH_CONN_MAX_BITS,
+				    HASH_CONN_BKT_BITS, 0,
+				    CFS_HASH_MIN_THETA,
+				    CFS_HASH_MAX_THETA,
+				    &conn_hash_ops, CFS_HASH_DEFAULT);
+	if (conn_hash == NULL)
+		rc = -ENOMEM;
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(ptlrpc_connection_init);
+
+/**
+ * Drop the reference on the global connection hash that was obtained in
+ * ptlrpc_connection_init().
+ */
+void ptlrpc_connection_fini(void)
+{
+	ENTRY;
+
+	cfs_hash_putref(conn_hash);
+
+	EXIT;
+}
+EXPORT_SYMBOL(ptlrpc_connection_fini);
+
+/*
+ * Hash operations for net_peer<->connection
+ */
+
+/* Hash callback: djb2 hash over the raw bytes of the lnet_process_id_t
+ * that \a key points to, reduced with \a mask.  \a hs is unused. */
+static unsigned
+conn_hashfn(cfs_hash_t *hs, const void *key, unsigned mask)
+{
+	return cfs_hash_djb2_hash(key, sizeof(lnet_process_id_t), mask);
+}
+
+/* Key-compare callback: returns non-zero when the lnet_process_id_t at
+ * \a key has the same nid and pid as the peer of the connection that
+ * embeds \a hnode, and 0 otherwise. */
+static int
+conn_keycmp(const void *key, struct hlist_node *hnode)
+{
+	const lnet_process_id_t *wanted = key;
+	struct ptlrpc_connection *conn;
+
+	LASSERT(key != NULL);
+	conn = hlist_entry(hnode, struct ptlrpc_connection, c_hash);
+
+	if (wanted->nid != conn->c_peer.nid)
+		return 0;
+
+	return wanted->pid == conn->c_peer.pid;
+}
+
+/* Key callback: the hash key of a connection is its c_peer field. */
+static void *
+conn_key(struct hlist_node *hnode)
+{
+	return &hlist_entry(hnode, struct ptlrpc_connection, c_hash)->c_peer;
+}
+
+/* Object callback: map a hash node back to the struct ptlrpc_connection
+ * that embeds it through its c_hash member. */
+static void *
+conn_object(struct hlist_node *hnode)
+{
+	return hlist_entry(hnode, struct ptlrpc_connection, c_hash);
+}
+
+/* Get callback: take one reference on the connection embedding \a hnode.
+ * \a hs is unused. */
+static void
+conn_get(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+	struct ptlrpc_connection *conn =
+		hlist_entry(hnode, struct ptlrpc_connection, c_hash);
+
+	atomic_inc(&conn->c_refcount);
+}
+
+/* Put callback: drop one reference on the connection embedding \a hnode.
+ * The connection is never freed here, whatever the resulting count.
+ * \a hs is unused. */
+static void
+conn_put_locked(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+	struct ptlrpc_connection *conn =
+		hlist_entry(hnode, struct ptlrpc_connection, c_hash);
+
+	atomic_dec(&conn->c_refcount);
+}
+
+/* Exit callback: free the connection embedding \a hnode.  Asserts that
+ * no references are left on it.  \a hs is unused. */
+static void
+conn_exit(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+	struct ptlrpc_connection *conn;
+
+	conn = hlist_entry(hnode, struct ptlrpc_connection, c_hash);
+	/*
+	 * Nothing should be left. Connection user put it and
+	 * connection also was deleted from table by this time
+	 * so we should have 0 refs.
+	 */
+	LASSERTF(atomic_read(&conn->c_refcount) == 0,
+		 "Busy connection with %d refs\n",
+		 atomic_read(&conn->c_refcount));
+	OBD_FREE_PTR(conn);
+}
+
+/* Callback table handed to cfs_hash_create() for the connection hash. */
+static cfs_hash_ops_t conn_hash_ops = {
+	.hs_hash	= conn_hashfn,
+	.hs_keycmp      = conn_keycmp,
+	.hs_key	 = conn_key,
+	.hs_object      = conn_object,
+	.hs_get	 = conn_get,
+	.hs_put_locked  = conn_put_locked,
+	.hs_exit	= conn_exit,
+};

diff --git a/drivers/staging/lustre/lustre/ptlrpc/events.c b/drivers/staging/lustre/lustre/ptlrpc/events.c
new file mode 100644
index 0000000..0264c10
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/events.c

@@ -0,0 +1,595 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+
+# include <linux/libcfs/libcfs.h>
+# ifdef __mips64__
+#  include <linux/kernel.h>
+# endif
+
+#include <obd_class.h>
+#include <lustre_net.h>
+#include <lustre_sec.h>
+#include "ptlrpc_internal.h"
+
+lnet_handle_eq_t   ptlrpc_eq_h;
+
+/*
+ *  Client's outgoing request callback
+ *
+ *  Handles LNET_EVENT_SEND and LNET_EVENT_UNLINK events; the MD is
+ *  expected to be unlinked in both cases.  The request is found through
+ *  the ptlrpc_cb_id stored in the MD's user pointer.  A failed or unlinked
+ *  send is reported to the request by setting rq_net_err and waking it.
+ */
+void request_out_callback(lnet_event_t *ev)
+{
+	struct ptlrpc_cb_id   *cbid = ev->md.user_ptr;
+	struct ptlrpc_request *req = cbid->cbid_arg;
+	ENTRY;
+
+	LASSERT (ev->type == LNET_EVENT_SEND ||
+		 ev->type == LNET_EVENT_UNLINK);
+	LASSERT (ev->unlinked);
+
+	DEBUG_REQ(D_NET, req, "type %d, status %d", ev->type, ev->status);
+
+	sptlrpc_request_out_callback(req);
+	/* record when the send event actually arrived */
+	req->rq_real_sent = cfs_time_current_sec();
+
+	if (ev->type == LNET_EVENT_UNLINK || ev->status != 0) {
+
+		/* Failed send: make it seem like the reply timed out, just
+		 * like failing sends in client.c does currently...  */
+
+		spin_lock(&req->rq_lock);
+		req->rq_net_err = 1;
+		spin_unlock(&req->rq_lock);
+
+		ptlrpc_client_wake_req(req);
+	}
+
+	/* NOTE(review): presumably drops the reference held for the
+	 * in-flight send - confirm against the send path */
+	ptlrpc_req_finished(req);
+
+	EXIT;
+}
+
+/*
+ * Client's incoming reply callback
+ *
+ * Handles LNET_EVENT_PUT and LNET_EVENT_UNLINK events on the reply buffer
+ * of the request found through the MD's user pointer.  Under rq_lock it
+ * classifies the event as a failure, an unlink, a truncated reply, an
+ * early reply or the real reply, updates the request's reply state
+ * accordingly and finally wakes the request.
+ */
+void reply_in_callback(lnet_event_t *ev)
+{
+	struct ptlrpc_cb_id   *cbid = ev->md.user_ptr;
+	struct ptlrpc_request *req = cbid->cbid_arg;
+	ENTRY;
+
+	DEBUG_REQ(D_NET, req, "type %d, status %d", ev->type, ev->status);
+
+	LASSERT (ev->type == LNET_EVENT_PUT || ev->type == LNET_EVENT_UNLINK);
+	LASSERT (ev->md.start == req->rq_repbuf);
+	LASSERT (ev->offset + ev->mlength <= req->rq_repbuf_len);
+	/* We've set LNET_MD_MANAGE_REMOTE for all outgoing requests
+	   for adaptive timeouts' early reply. */
+	LASSERT((ev->md.options & LNET_MD_MANAGE_REMOTE) != 0);
+
+	spin_lock(&req->rq_lock);
+
+	/* start from "not receiving"; the early-reply branch below turns
+	 * rq_receiving_reply back on */
+	req->rq_receiving_reply = 0;
+	req->rq_early = 0;
+	if (ev->unlinked)
+		req->rq_must_unlink = 0;
+
+	/* failed event: nothing to record, just wake the request */
+	if (ev->status)
+		goto out_wake;
+
+	if (ev->type == LNET_EVENT_UNLINK) {
+		LASSERT(ev->unlinked);
+		DEBUG_REQ(D_NET, req, "unlink");
+		goto out_wake;
+	}
+
+	/* fewer bytes were stored than the sender transmitted: flag the
+	 * reply as truncated and record the size that would be needed */
+	if (ev->mlength < ev->rlength ) {
+		CDEBUG(D_RPCTRACE, "truncate req %p rpc %d - %d+%d\n", req,
+		       req->rq_replen, ev->rlength, ev->offset);
+		req->rq_reply_truncate = 1;
+		req->rq_replied = 1;
+		req->rq_status = -EOVERFLOW;
+		req->rq_nob_received = ev->rlength + ev->offset;
+		goto out_wake;
+	}
+
+	/* a message at offset 0 on a request that advertised AT support
+	 * is treated as an early reply, anything else as the real reply */
+	if ((ev->offset == 0) &&
+	    ((lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT))) {
+		/* Early reply */
+		DEBUG_REQ(D_ADAPTTO, req,
+			  "Early reply received: mlen=%u offset=%d replen=%d "
+			  "replied=%d unlinked=%d", ev->mlength, ev->offset,
+			  req->rq_replen, req->rq_replied, ev->unlinked);
+
+		req->rq_early_count++; /* number received, client side */
+
+		if (req->rq_replied)   /* already got the real reply */
+			goto out_wake;
+
+		req->rq_early = 1;
+		req->rq_reply_off = ev->offset;
+		req->rq_nob_received = ev->mlength;
+		/* And we're still receiving */
+		req->rq_receiving_reply = 1;
+	} else {
+		/* Real reply */
+		req->rq_rep_swab_mask = 0;
+		req->rq_replied = 1;
+		req->rq_reply_off = ev->offset;
+		req->rq_nob_received = ev->mlength;
+		/* LNetMDUnlink can't be called under the LNET_LOCK,
+		   so we must unlink in ptlrpc_unregister_reply */
+		DEBUG_REQ(D_INFO, req,
+			  "reply in flags=%x mlen=%u offset=%d replen=%d",
+			  lustre_msg_get_flags(req->rq_reqmsg),
+			  ev->mlength, ev->offset, req->rq_replen);
+	}
+
+	/* only reached for an early or real reply that was accepted */
+	req->rq_import->imp_last_reply_time = cfs_time_current_sec();
+
+out_wake:
+	/* NB don't unlock till after wakeup; req can disappear under us
+	 * since we don't have our own ref */
+	ptlrpc_client_wake_req(req);
+	spin_unlock(&req->rq_lock);
+	EXIT;
+}
+
+/*
+ * Client's bulk has been written/read
+ *
+ * Called once per bulk MD of the descriptor found through the MD's user
+ * pointer.  Accounts the transferred bytes on success, flags a network
+ * error on the owning request otherwise, and wakes the request once the
+ * descriptor's last outstanding MD has completed.
+ */
+void client_bulk_callback (lnet_event_t *ev)
+{
+	struct ptlrpc_cb_id     *cbid = ev->md.user_ptr;
+	struct ptlrpc_bulk_desc *desc = cbid->cbid_arg;
+	struct ptlrpc_request   *req;
+	ENTRY;
+
+	LASSERT ((desc->bd_type == BULK_PUT_SINK &&
+		  ev->type == LNET_EVENT_PUT) ||
+		 (desc->bd_type == BULK_GET_SOURCE &&
+		  ev->type == LNET_EVENT_GET) ||
+		 ev->type == LNET_EVENT_UNLINK);
+	LASSERT (ev->unlinked);
+
+	/* fault injection: turn the event into an -EIO failure */
+	if (CFS_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_CLIENT_BULK_CB, CFS_FAIL_ONCE))
+		ev->status = -EIO;
+
+	if (CFS_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2,CFS_FAIL_ONCE))
+		ev->status = -EIO;
+
+	CDEBUG((ev->status == 0) ? D_NET : D_ERROR,
+	       "event type %d, status %d, desc %p\n",
+	       ev->type, ev->status, desc);
+
+	spin_lock(&desc->bd_lock);
+	req = desc->bd_req;
+	/* one fewer MD outstanding for this descriptor */
+	LASSERT(desc->bd_md_count > 0);
+	desc->bd_md_count--;
+
+	if (ev->type != LNET_EVENT_UNLINK && ev->status == 0) {
+		desc->bd_nob_transferred += ev->mlength;
+		desc->bd_sender = ev->sender;
+	} else {
+		/* start reconnect and resend if network error hit */
+		spin_lock(&req->rq_lock);
+		req->rq_net_err = 1;
+		spin_unlock(&req->rq_lock);
+	}
+
+	if (ev->status != 0)
+		desc->bd_failure = 1;
+
+	/* NB don't unlock till after wakeup; desc can disappear under us
+	 * otherwise */
+	if (desc->bd_md_count == 0)
+		ptlrpc_client_wake_req(desc->bd_req);
+
+	spin_unlock(&desc->bd_lock);
+	EXIT;
+}
+
+/*
+ * We will have percpt request history list for ptlrpc service in upcoming
+ * patches because we don't want to be serialized by current per-service
+ * history operations. So we require history ID can (somehow) show arriving
+ * order w/o grabbing global lock, and user can sort them in userspace.
+ *
+ * This is how we generate history ID for ptlrpc_request:
+ * ----------------------------------------------------
+ * |  32 bits  |  16 bits  | (16 - X)bits  |  X bits  |
+ * ----------------------------------------------------
+ * |  seconds  | usec / 16 |   sequence    | CPT id   |
+ * ----------------------------------------------------
+ *
+ * it might not be precise but should be good enough.
+ */
+
+/* number of low-order history-ID bits taken from the service's
+ * srv_cpt_bits setting */
+#define REQS_CPT_BITS(svcpt)	((svcpt)->scp_service->srv_cpt_bits)
+
+/* bit positions of the seconds and usec/16 fields in a history ID */
+#define REQS_SEC_SHIFT		32
+#define REQS_USEC_SHIFT		16
+/* the sequence field starts right above the CPT id bits */
+#define REQS_SEQ_SHIFT(svcpt)	REQS_CPT_BITS(svcpt)
+
+/*
+ * Assign a history sequence ID to \a req and append the request to the
+ * history list of service partition \a svcpt.
+ *
+ * The ID is built from the request's arrival time and the partition's CPT
+ * id.  If that value is not greater than the last ID handed out by this
+ * partition, the last ID is bumped by one sequence step and used instead.
+ */
+static void ptlrpc_req_add_history(struct ptlrpc_service_part *svcpt,
+				   struct ptlrpc_request *req)
+{
+	__u64	sec = req->rq_arrival_time.tv_sec;
+	__u32	usec = req->rq_arrival_time.tv_usec >> 4; /* usec / 16 */
+	__u64	new_seq;
+
+	/* set sequence ID for request and add it to history list,
+	 * it must be called with hold svcpt::scp_lock */
+
+	/* a negative scp_cpt contributes 0 to the CPT id field */
+	new_seq = (sec << REQS_SEC_SHIFT) |
+		  (usec << REQS_USEC_SHIFT) |
+		  (svcpt->scp_cpt < 0 ? 0 : svcpt->scp_cpt);
+
+	if (new_seq > svcpt->scp_hist_seq) {
+		/* This handles the initial case of scp_hist_seq == 0 or
+		 * we just jumped into a new time window */
+		svcpt->scp_hist_seq = new_seq;
+	} else {
+		LASSERT(REQS_SEQ_SHIFT(svcpt) < REQS_USEC_SHIFT);
+		/* NB: increase sequence number in current usec bucket,
+		 * however, it's possible that we used up all bits for
+		 * sequence and jumped into the next usec bucket (future time),
+		 * then we hope there will be less RPCs per bucket at some
+		 * point, and sequence will catch up again */
+		svcpt->scp_hist_seq += (1U << REQS_SEQ_SHIFT(svcpt));
+		new_seq = svcpt->scp_hist_seq;
+	}
+
+	req->rq_history_seq = new_seq;
+
+	list_add_tail(&req->rq_history_list, &svcpt->scp_hist_reqs);
+}
+
+/*
+ * Server's incoming request callback
+ *
+ * Called with an LNET_EVENT_PUT or LNET_EVENT_UNLINK event for a posted
+ * request buffer (rqbd).  Picks a request descriptor for the event (the one
+ * embedded in the rqbd when the buffer has just been unlinked, a freshly
+ * allocated one otherwise), fills it in from the event and then, under
+ * scp_lock, adds it to the request history and to scp_req_incoming and
+ * wakes up scp_waitq.
+ *
+ * A failed PUT on a still-linked buffer, or a failed descriptor allocation,
+ * drops the event without queueing anything.
+ */
+void request_in_callback(lnet_event_t *ev)
+{
+	struct ptlrpc_cb_id		  *cbid = ev->md.user_ptr;
+	struct ptlrpc_request_buffer_desc *rqbd = cbid->cbid_arg;
+	struct ptlrpc_service_part	  *svcpt = rqbd->rqbd_svcpt;
+	struct ptlrpc_service	     *service = svcpt->scp_service;
+	struct ptlrpc_request	     *req;
+	ENTRY;
+
+	LASSERT (ev->type == LNET_EVENT_PUT ||
+		 ev->type == LNET_EVENT_UNLINK);
+	/* the received message must lie entirely inside the rqbd buffer */
+	LASSERT ((char *)ev->md.start >= rqbd->rqbd_buffer);
+	LASSERT ((char *)ev->md.start + ev->offset + ev->mlength <=
+		 rqbd->rqbd_buffer + service->srv_buf_size);
+
+	CDEBUG((ev->status == 0) ? D_NET : D_ERROR,
+	       "event type %d, status %d, service %s\n",
+	       ev->type, ev->status, service->srv_name);
+
+	if (ev->unlinked) {
+		/* If this is the last request message to fit in the
+		 * request buffer we can use the request object embedded in
+		 * rqbd.  Note that if we failed to allocate a request,
+		 * we'd have to re-post the rqbd, which we can't do in this
+		 * context. */
+		req = &rqbd->rqbd_req;
+		memset(req, 0, sizeof (*req));
+	} else {
+		LASSERT (ev->type == LNET_EVENT_PUT);
+		if (ev->status != 0) {
+			/* We moaned above already... */
+			EXIT;
+			return;
+		}
+		OBD_ALLOC_GFP(req, sizeof(*req), ALLOC_ATOMIC_TRY);
+		if (req == NULL) {
+			CERROR("Can't allocate incoming request descriptor: "
+			       "Dropping %s RPC from %s\n",
+			       service->srv_name,
+			       libcfs_id2str(ev->initiator));
+			EXIT;
+			return;
+		}
+	}
+
+	/* NB we ABSOLUTELY RELY on req being zeroed, so pointers are NULL,
+	 * flags are reset and scalars are zero.  We only set the message
+	 * size to non-zero if this was a successful receive. */
+	req->rq_xid = ev->match_bits;
+	req->rq_reqbuf = ev->md.start + ev->offset;
+	if (ev->type == LNET_EVENT_PUT && ev->status == 0)
+		req->rq_reqdata_len = ev->mlength;
+	do_gettimeofday(&req->rq_arrival_time);
+	req->rq_peer = ev->initiator;
+	req->rq_self = ev->target.nid;
+	req->rq_rqbd = rqbd;
+	req->rq_phase = RQ_PHASE_NEW;
+	spin_lock_init(&req->rq_lock);
+	INIT_LIST_HEAD(&req->rq_timed_list);
+	INIT_LIST_HEAD(&req->rq_exp_list);
+	atomic_set(&req->rq_refcount, 1);
+	if (ev->type == LNET_EVENT_PUT)
+		CDEBUG(D_INFO, "incoming req@%p x"LPU64" msgsize %u\n",
+		       req, req->rq_xid, ev->mlength);
+
+	CDEBUG(D_RPCTRACE, "peer: %s\n", libcfs_id2str(req->rq_peer));
+
+	spin_lock(&svcpt->scp_lock);
+
+	ptlrpc_req_add_history(svcpt, req);
+
+	if (ev->unlinked) {
+		svcpt->scp_nrqbds_posted--;
+		CDEBUG(D_INFO, "Buffer complete: %d buffers still posted\n",
+		       svcpt->scp_nrqbds_posted);
+
+		/* Normally, don't complain about 0 buffers posted; LNET won't
+		 * drop incoming reqs since we set the portal lazy */
+		if (test_req_buffer_pressure &&
+		    ev->type != LNET_EVENT_UNLINK &&
+		    svcpt->scp_nrqbds_posted == 0)
+			CWARN("All %s request buffers busy\n",
+			      service->srv_name);
+
+		/* req takes over the network's ref on rqbd */
+	} else {
+		/* req takes a ref on rqbd */
+		rqbd->rqbd_refcount++;
+	}
+
+	list_add_tail(&req->rq_list, &svcpt->scp_req_incoming);
+	svcpt->scp_nreqs_incoming++;
+
+	/* NB everything can disappear under us once the request
+	 * has been queued and we unlock, so do the wake now... */
+	wake_up(&svcpt->scp_waitq);
+
+	spin_unlock(&svcpt->scp_lock);
+	EXIT;
+}
+
+/*
+ *  Server's outgoing reply callback
+ *
+ *  Handles SEND, ACK and UNLINK events for a reply state.  "Easy" replies
+ *  just drop the network's reference.  "Difficult" replies are taken off
+ *  the net on the final (unlinked) event and, when eligible, scheduled for
+ *  further handling.
+ */
+void reply_out_callback(lnet_event_t *ev)
+{
+	struct ptlrpc_cb_id	   *cbid = ev->md.user_ptr;
+	struct ptlrpc_reply_state  *rs = cbid->cbid_arg;
+	struct ptlrpc_service_part *svcpt = rs->rs_svcpt;
+	ENTRY;
+
+	LASSERT (ev->type == LNET_EVENT_SEND ||
+		 ev->type == LNET_EVENT_ACK ||
+		 ev->type == LNET_EVENT_UNLINK);
+
+	if (!rs->rs_difficult) {
+		/* Nothing more happens to an 'easy' reply, so the net's
+		 * ref on 'rs' is released right here */
+		LASSERT (ev->unlinked);
+		ptlrpc_rs_decref(rs);
+		EXIT;
+		return;
+	}
+
+	LASSERT (rs->rs_on_net);
+
+	if (!ev->unlinked) {
+		/* more network callbacks to come for this reply */
+		EXIT;
+		return;
+	}
+
+	/* Final network callback.  The net's ref on 'rs' is kept until
+	 * ptlrpc_handle_rs() has finished with it */
+	spin_lock(&svcpt->scp_rep_lock);
+	spin_lock(&rs->rs_lock);
+
+	rs->rs_on_net = 0;
+	/* skip scheduling only for a no-ack reply whose transno has not
+	 * been committed yet */
+	if (!(rs->rs_no_ack &&
+	      rs->rs_transno > rs->rs_export->exp_obd->obd_last_committed))
+		ptlrpc_schedule_difficult_reply(rs);
+
+	spin_unlock(&rs->rs_lock);
+	spin_unlock(&svcpt->scp_rep_lock);
+	EXIT;
+}
+
+
+/*
+ * Event-queue handler registered through LNetEQAlloc() in ptlrpc_ni_init().
+ * Every event carries a ptlrpc_cb_id in md.user_ptr; after sanity checks the
+ * event is forwarded to the callback stored in that cbid.
+ */
+static void ptlrpc_master_callback(lnet_event_t *ev)
+{
+	struct ptlrpc_cb_id *cbid = ev->md.user_ptr;
+	void (*callback)(lnet_event_t *ev) = cbid->cbid_fn;
+
+	/* Honestly, it's best to find out early. */
+	/* i.e. trip on a poisoned cbid argument or an unexpected callback
+	 * pointer here rather than somewhere inside the callback */
+	LASSERT (cbid->cbid_arg != LP_POISON);
+	LASSERT (callback == request_out_callback ||
+		 callback == reply_in_callback ||
+		 callback == client_bulk_callback ||
+		 callback == request_in_callback ||
+		 callback == reply_out_callback
+		 );
+
+	callback (ev);
+}
+
+/*
+ * Pick the "closest" peer NID registered for \a uuid.
+ *
+ * Walks every NID lustre_uuid_to_peer() yields for the UUID and asks
+ * LNetDist() for its distance and order.  A distance of 0 means the peer is
+ * local and the loopback NID is used immediately; otherwise the candidate
+ * with the smallest distance (ties broken by the smaller order) wins.
+ *
+ * On success peer->nid and *self are filled in and 0 is returned; -ENOENT is
+ * returned when no usable NID was found.  peer->pid is always set.
+ */
+int ptlrpc_uuid_to_peer (struct obd_uuid *uuid,
+			 lnet_process_id_t *peer, lnet_nid_t *self)
+{
+	lnet_nid_t	dst_nid;
+	lnet_nid_t	src_nid;
+	__u32		order;
+	__u32		best_order = 0;
+	int		dist;
+	int		best_dist = 0;
+	int		compat;
+	int		idx;
+	int		rc = -ENOENT;
+
+	compat = LNetCtl(IOC_LIBCFS_PORTALS_COMPATIBILITY, NULL);
+
+	peer->pid = LUSTRE_SRV_LNET_PID;
+
+	for (idx = 0;
+	     lustre_uuid_to_peer(uuid->uuid, &dst_nid, idx) == 0;
+	     idx++) {
+		dist = LNetDist(dst_nid, &src_nid, &order);
+		if (dist < 0)
+			continue;
+
+		if (dist == 0) {
+			/* the peer is this node: go through the loopback LND */
+			peer->nid = *self = LNET_MKNID(LNET_MKNET(LOLND, 0), 0);
+			rc = 0;
+			break;
+		}
+
+		/* already have a candidate that is at least as good? */
+		if (rc >= 0 && dist >= best_dist &&
+		    (dist != best_dist || order >= best_order))
+			continue;
+
+		best_dist = dist;
+		best_order = order;
+
+		if (compat > 1) {
+			/* Strong portals compatibility: clear the NET part
+			 * of both nids so that new config logs / a (new)
+			 * lconf configuration can still reach old servers. */
+			dst_nid = LNET_MKNID(0, LNET_NIDADDR(dst_nid));
+			src_nid = LNET_MKNID(0, LNET_NIDADDR(src_nid));
+		}
+		peer->nid = dst_nid;
+		*self = src_nid;
+		rc = 0;
+	}
+
+	CDEBUG(D_NET,"%s->%s\n", uuid->uuid, libcfs_id2str(*peer));
+	return rc;
+}
+
+/*
+ * Tear down the ptlrpc event queue and the LNet network interface.
+ *
+ * LNetEQFree() is retried every two seconds for as long as it reports
+ * -EBUSY, because messages may still be in flight with pending events
+ * (the fire-and-forget ones: client requests and "non-difficult" server
+ * replies).  Any result other than 0 or -EBUSY is treated as a bug.
+ */
+void ptlrpc_ni_fini(void)
+{
+	wait_queue_head_t	waitq;
+	struct l_wait_info	lwi;
+	int			attempt = 0;
+	int			rc;
+
+	for (;;) {
+		rc = LNetEQFree(ptlrpc_eq_h);
+		if (rc != -EBUSY) {
+			if (rc != 0)
+				LBUG();
+
+			/* event queue is gone, release the interface too */
+			LNetNIFini();
+			return;
+		}
+
+		/* stay quiet on the first busy result, warn afterwards */
+		if (attempt != 0)
+			CWARN("Event queue still busy\n");
+
+		/* Sleep for two seconds on a private wait queue whose
+		 * condition never becomes true, then try again */
+		init_waitqueue_head(&waitq);
+		lwi = LWI_TIMEOUT(cfs_time_seconds(2), NULL, NULL);
+		l_wait_event(waitq, 0, &lwi);
+
+		attempt++;
+	}
+}
+
+/* Return the LNet pid used by ptlrpc: always LUSTRE_SRV_LNET_PID. */
+lnet_pid_t ptl_get_pid(void)
+{
+	return LUSTRE_SRV_LNET_PID;
+}
+
+/*
+ * Bring up the LNet network interface and allocate the ptlrpc event queue.
+ *
+ * Returns 0 on success, -ENOENT if the network interface could not be
+ * initialised and -ENOMEM if the event queue could not be allocated (the
+ * interface is shut down again in that case).
+ */
+int ptlrpc_ni_init(void)
+{
+	lnet_pid_t	pid = ptl_get_pid();
+	int		rc;
+
+	CDEBUG(D_NET, "My pid is: %x\n", pid);
+
+	/* No limits are passed down yet */
+	rc = LNetNIInit(pid);
+	if (rc < 0) {
+		CDEBUG (D_NET, "Can't init network interface: %d\n", rc);
+		return (-ENOENT);
+	}
+
+	/* CAVEAT EMPTOR: how portals events are processed is _radically_
+	 * different depending on... */
+	/* Kernel LNet invokes our master callback for every new event, so
+	 * each event is guaranteed to arrive via callback; an EQ size of 0
+	 * avoids the overhead of serializing enqueue/dequeue operations
+	 * inside LNet. */
+	rc = LNetEQAlloc(0, ptlrpc_master_callback, &ptlrpc_eq_h);
+	if (rc != 0) {
+		CERROR ("Failed to allocate event queue: %d\n", rc);
+		LNetNIFini();
+		return (-ENOMEM);
+	}
+
+	return 0;
+}
+
+
+/*
+ * Initialise the network layer and take a reference on ptlrpcd.
+ *
+ * Returns 0 on success, -EIO if ptlrpc_ni_init() failed, or the error from
+ * ptlrpcd_addref() (after undoing the network initialisation).
+ */
+int ptlrpc_init_portals(void)
+{
+	int   rc;
+
+	if (ptlrpc_ni_init() != 0) {
+		CERROR("network initialisation failed\n");
+		return -EIO;
+	}
+
+	rc = ptlrpcd_addref();
+	if (rc != 0) {
+		CERROR("rpcd initialisation failed\n");
+		ptlrpc_ni_fini();
+	}
+
+	return rc;
+}
+
+/*
+ * Counterpart of ptlrpc_init_portals(): drops the ptlrpcd reference and then
+ * shuts the network layer down through ptlrpc_ni_fini().
+ */
+void ptlrpc_exit_portals(void)
+{
+	ptlrpcd_decref();
+	ptlrpc_ni_fini();
+}

diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/Makefile b/drivers/staging/lustre/lustre/ptlrpc/gss/Makefile
new file mode 100644
index 0000000..8cdfbee
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/gss/Makefile

@@ -0,0 +1,8 @@
+# Build the ptlrpc_gss object according to the value of CONFIG_LUSTRE_FS.
+obj-$(CONFIG_LUSTRE_FS) := ptlrpc_gss.o
+
+# Objects that make up ptlrpc_gss.
+ptlrpc_gss-y := sec_gss.o gss_bulk.o gss_cli_upcall.o gss_svc_upcall.o	\
+		gss_rawobj.o lproc_gss.o gss_generic_token.o		\
+		gss_mech_switch.o gss_krb5_mech.o
+
+
+# Add the include/ directory one level above this one to the header path.
+ccflags-y := -I$(src)/../include

diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/gss_api.h b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_api.h
new file mode 100644
index 0000000..feac6048
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_api.h

@@ -0,0 +1,179 @@
+/*
+ * Modifications for Lustre
+ *
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+/*
+ * Somewhat simplified version of the gss api.
+ *
+ * Dug Song <dugsong@monkey.org>
+ * Andy Adamson <andros@umich.edu>
+ * Bruce Fields <bfields@umich.edu>
+ * Copyright (c) 2000 The Regents of the University of Michigan
+ *
+ */
+
+#ifndef __PTLRPC_GSS_GSS_API_H_
+#define __PTLRPC_GSS_GSS_API_H_
+
+struct gss_api_mech;
+
+/* The mechanism-independent gss-api context: */
+struct gss_ctx {
+	/* mechanism descriptor this context is associated with */
+	struct gss_api_mech    *mech_type;
+	/* untyped pointer; presumably the mechanism's private context
+	 * state -- confirm against the mechanism implementation */
+	void		   *internal_ctx_id;
+};
+
+/*
+ * "Null" values for GSS arguments.
+ *
+ * rawobj_t is an aggregate (it is accessed through .data and .len), and C
+ * does not allow casting a scalar to an aggregate type, so the buffer and
+ * OID variants are zero-initialized compound literals rather than casts
+ * of 0.
+ */
+#define GSS_C_NO_BUFFER	 ((rawobj_t) { 0 })
+#define GSS_C_NO_CONTEXT	((struct gss_ctx *) 0)
+#define GSS_C_NULL_OID	  ((rawobj_t) { 0 })
+
+/*
+ * gss-api prototypes; note that these are somewhat simplified versions of
+ * the prototypes specified in RFC 2744.
+ *
+ * All of them except lgss_display() return a __u32 GSS major status; the
+ * callers in gss_bulk.c treat GSS_S_COMPLETE as success.  Each lgss_*
+ * function has a same-named gss_* member in struct gss_api_ops below.
+ */
+__u32 lgss_import_sec_context(
+		rawobj_t		*input_token,
+		struct gss_api_mech     *mech,
+		struct gss_ctx	 **ctx);
+__u32 lgss_copy_reverse_context(
+		struct gss_ctx	  *ctx,
+		struct gss_ctx	 **ctx_new);
+__u32 lgss_inquire_context(
+		struct gss_ctx	  *ctx,
+		unsigned long	   *endtime);
+/* msgs/msgcnt and iovs/iovcnt are two (array, count) pairs of input */
+__u32 lgss_get_mic(
+		struct gss_ctx	  *ctx,
+		int		      msgcnt,
+		rawobj_t		*msgs,
+		int		      iovcnt,
+		lnet_kiov_t	     *iovs,
+		rawobj_t		*mic_token);
+/* same argument layout as lgss_get_mic() */
+__u32 lgss_verify_mic(
+		struct gss_ctx	  *ctx,
+		int		      msgcnt,
+		rawobj_t		*msgs,
+		int		      iovcnt,
+		lnet_kiov_t	     *iovs,
+		rawobj_t		*mic_token);
+__u32 lgss_wrap(
+		struct gss_ctx	  *ctx,
+		rawobj_t		*gsshdr,
+		rawobj_t		*msg,
+		int		      msg_buflen,
+		rawobj_t		*out_token);
+__u32 lgss_unwrap(
+		struct gss_ctx	  *ctx,
+		rawobj_t		*gsshdr,
+		rawobj_t		*token,
+		rawobj_t		*out_msg);
+/* bulk variants operate on a whole bulk descriptor */
+__u32 lgss_prep_bulk(
+		struct gss_ctx	  *gctx,
+		struct ptlrpc_bulk_desc *desc);
+__u32 lgss_wrap_bulk(
+		struct gss_ctx	  *gctx,
+		struct ptlrpc_bulk_desc *desc,
+		rawobj_t		*token,
+		int		      adj_nob);
+__u32 lgss_unwrap_bulk(
+		struct gss_ctx	  *gctx,
+		struct ptlrpc_bulk_desc *desc,
+		rawobj_t		*token,
+		int		      adj_nob);
+__u32 lgss_delete_sec_context(
+		struct gss_ctx	 **ctx);
+/* the only prototype here that returns int rather than a __u32 status */
+int lgss_display(
+		struct gss_ctx	  *ctx,
+		char		    *buf,
+		int		      bufsize);
+
+/*
+ * Description of one subflavor; struct gss_api_mech points at these through
+ * gm_sfs, alongside a gm_sf_num count.
+ * NOTE(review): the field meanings below are read off the names only --
+ * confirm against the mechanism definitions.
+ */
+struct subflavor_desc {
+	__u32	   sf_subflavor;	/* presumably the subflavor id */
+	__u32	   sf_qop;		/* presumably quality of protection */
+	__u32	   sf_service;		/* presumably the service level */
+	char	   *sf_name;		/* presumably a printable name */
+};
+
+/* Each mechanism is described by the following struct: */
+struct gss_api_mech {
+	struct list_head	      gm_list;	/* list linkage */
+	module_t	   *gm_owner;		/* module providing the mech */
+	char		   *gm_name;		/* mechanism name */
+	rawobj_t		gm_oid;		/* mechanism OID */
+	atomic_t	    gm_count;		/* presumably a use count --
+						 * TODO confirm */
+	struct gss_api_ops     *gm_ops;		/* operations table */
+	int		     gm_sf_num;		/* presumably the number of
+						 * entries in gm_sfs */
+	struct subflavor_desc  *gm_sfs;		/* subflavor descriptions */
+};
+
+/*
+ * and must provide the following operations:
+ *
+ * Each member shares its name (minus the "l" prefix) with one of the lgss_*
+ * prototypes above.  The signatures differ in three places: the import and
+ * copy-reverse operations take plain struct gss_ctx pointers where the lgss_*
+ * versions take pointers to pointers, import has no mechanism argument, and
+ * gss_delete_sec_context takes an untyped pointer and returns nothing.
+ */
+struct gss_api_ops {
+	__u32 (*gss_import_sec_context)(
+			rawobj_t	       *input_token,
+			struct gss_ctx	 *ctx);
+	__u32 (*gss_copy_reverse_context)(
+			struct gss_ctx	 *ctx,
+			struct gss_ctx	 *ctx_new);
+	__u32 (*gss_inquire_context)(
+			struct gss_ctx	 *ctx,
+			unsigned long	  *endtime);
+	__u32 (*gss_get_mic)(
+			struct gss_ctx	 *ctx,
+			int		     msgcnt,
+			rawobj_t	       *msgs,
+			int		     iovcnt,
+			lnet_kiov_t	    *iovs,
+			rawobj_t	       *mic_token);
+	__u32 (*gss_verify_mic)(
+			struct gss_ctx	 *ctx,
+			int		     msgcnt,
+			rawobj_t	       *msgs,
+			int		     iovcnt,
+			lnet_kiov_t	    *iovs,
+			rawobj_t	       *mic_token);
+	__u32 (*gss_wrap)(
+			struct gss_ctx	 *ctx,
+			rawobj_t	       *gsshdr,
+			rawobj_t	       *msg,
+			int		     msg_buflen,
+			rawobj_t	       *out_token);
+	__u32 (*gss_unwrap)(
+			struct gss_ctx	 *ctx,
+			rawobj_t	       *gsshdr,
+			rawobj_t	       *token,
+			rawobj_t	       *out_msg);
+	__u32 (*gss_prep_bulk)(
+			struct gss_ctx	 *gctx,
+			struct ptlrpc_bulk_desc *desc);
+	__u32 (*gss_wrap_bulk)(
+			struct gss_ctx	 *gctx,
+			struct ptlrpc_bulk_desc *desc,
+			rawobj_t	       *token,
+			int		     adj_nob);
+	__u32 (*gss_unwrap_bulk)(
+			struct gss_ctx	 *gctx,
+			struct ptlrpc_bulk_desc *desc,
+			rawobj_t	       *token,
+			int		     adj_nob);
+	/* NOTE(review): takes void *, presumably the mechanism-private
+	 * internal_ctx_id rather than the struct gss_ctx -- confirm */
+	void (*gss_delete_sec_context)(
+			void		   *ctx);
+	int  (*gss_display)(
+			struct gss_ctx	 *ctx,
+			char		   *buf,
+			int		     bufsize);
+};
+
+/* Add / remove a mechanism description to / from the set of known ones. */
+int lgss_mech_register(struct gss_api_mech *mech);
+void lgss_mech_unregister(struct gss_api_mech *mech);
+
+/* Look a mechanism up by OID, by name, or by subflavor. */
+struct gss_api_mech * lgss_OID_to_mech(rawobj_t *oid);
+struct gss_api_mech * lgss_name_to_mech(char *name);
+struct gss_api_mech * lgss_subflavor_to_mech(__u32 subflavor);
+
+/* Get/put pair on a mechanism; presumably reference counting through
+ * gm_count -- TODO confirm against the implementation. */
+struct gss_api_mech * lgss_mech_get(struct gss_api_mech *mech);
+void lgss_mech_put(struct gss_api_mech *mech);
+
+#endif /* __PTLRPC_GSS_GSS_API_H_ */

diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/gss_asn1.h b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_asn1.h
new file mode 100644
index 0000000..c70eb00
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_asn1.h

@@ -0,0 +1,84 @@
+/*
+ * Modifications for Lustre
+ *
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+/*
+ *  minimal asn1 for generic encoding/decoding of gss tokens
+ *
+ *  Adapted from MIT Kerberos 5-1.2.1 lib/include/krb5.h,
+ *  lib/gssapi/krb5/gssapiP_krb5.h, and others
+ *
+ *  Copyright (c) 2000 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Andy Adamson   <andros@umich.edu>
+ */
+
+/*
+ * Copyright 1995 by the Massachusetts Institute of Technology.
+ * All Rights Reserved.
+ *
+ * Export of this software from the United States of America may
+ *   require a specific license from the United States Government.
+ *   It is the responsibility of any person or organization contemplating
+ *   export to obtain such a license before exporting.
+ *
+ * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and
+ * distribute this software and its documentation for any purpose and
+ * without fee is hereby granted, provided that the above copyright
+ * notice appear in all copies and that both that copyright notice and
+ * this permission notice appear in supporting documentation, and that
+ * the name of M.I.T. not be used in advertising or publicity pertaining
+ * to distribution of the software without specific, written prior
+ * permission.  Furthermore if you modify this software you must label
+ * your software as modified software and not distribute it in such a
+ * fashion that it might be confused with the original M.I.T. software.
+ * M.I.T. makes no representations about the suitability of
+ * this software for any purpose.  It is provided "as is" without express
+ * or implied warranty.
+ *
+ */
+
+/* Include guard, following the naming used by gss_api.h next to this file. */
+#ifndef __PTLRPC_GSS_GSS_ASN1_H_
+#define __PTLRPC_GSS_GSS_ASN1_H_
+
+#define SIZEOF_INT 4
+
+/* from gssapi_err_generic.h */
+#define G_BAD_SERVICE_NAME		       (-2045022976L)
+#define G_BAD_STRING_UID			 (-2045022975L)
+#define G_NOUSER				 (-2045022974L)
+#define G_VALIDATE_FAILED			(-2045022973L)
+#define G_BUFFER_ALLOC			   (-2045022972L)
+#define G_BAD_MSG_CTX			    (-2045022971L)
+#define G_WRONG_SIZE			     (-2045022970L)
+#define G_BAD_USAGE			      (-2045022969L)
+#define G_UNKNOWN_QOP			    (-2045022968L)
+#define G_NO_HOSTNAME			    (-2045022967L)
+#define G_BAD_HOSTNAME			   (-2045022966L)
+#define G_WRONG_MECH			     (-2045022965L)
+#define G_BAD_TOK_HEADER			 (-2045022964L)
+#define G_BAD_DIRECTION			  (-2045022963L)
+#define G_TOK_TRUNC			      (-2045022962L)
+#define G_REFLECT				(-2045022961L)
+#define G_WRONG_TOKID			    (-2045022960L)
+
+/*
+ * Two OIDs are equal when they have the same length and identical bytes.
+ * Both arguments are pointers to objects with ->len and ->data members and
+ * are evaluated more than once, so they must be free of side effects.
+ */
+#define g_OID_equal(o1,o2) \
+   (((o1)->len == (o2)->len) && \
+    (memcmp((o1)->data,(o2)->data,(int) (o1)->len) == 0))
+
+/* Helpers for the generic token header that carries a mechanism OID. */
+__u32 g_verify_token_header(rawobj_t *mech,
+			    int *body_size,
+			    unsigned char **buf_in,
+			    int toksize);
+
+__u32 g_get_mech_oid(rawobj_t *mech,
+		     rawobj_t *in_buf);
+
+int g_token_size(rawobj_t *mech,
+		 unsigned int body_size);
+
+void g_make_token_header(rawobj_t *mech,
+			 int body_size,
+			 unsigned char **buf);
+
+#endif /* __PTLRPC_GSS_GSS_ASN1_H_ */

diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/gss_bulk.c b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_bulk.c
new file mode 100644
index 0000000..ed95bbb
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_bulk.c

@@ -0,0 +1,512 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/gss/gss_bulk.c
+ *
+ * Author: Eric Mei <eric.mei@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/dcache.h>
+#include <linux/fs.h>
+#include <linux/mutex.h>
+#include <linux/crypto.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_net.h>
+#include <lustre_import.h>
+#include <lustre_sec.h>
+
+#include "gss_err.h"
+#include "gss_internal.h"
+#include "gss_api.h"
+
+/*
+ * Client side: fill in the bulk security descriptor (bsd) of an outgoing
+ * request and protect the bulk pages as the bulk service level requires.
+ *
+ * The bsd lives in the last or second-to-last segment of the request
+ * message, depending on the RPC service level.  For a bulk read only
+ * privacy mode needs work (receive pages are prepared); for a bulk write
+ * the pages are signed (integrity) or encrypted (privacy) and the resulting
+ * token is stored in the bsd.
+ *
+ * Returns 0 on success or when there is nothing to do, -EACCES when the GSS
+ * mechanism fails, or the error returned by gss_cli_prep_bulk() /
+ * sptlrpc_enc_pool_get_pages().
+ */
+int gss_cli_ctx_wrap_bulk(struct ptlrpc_cli_ctx *ctx,
+			  struct ptlrpc_request *req,
+			  struct ptlrpc_bulk_desc *desc)
+{
+	struct gss_cli_ctx	      *gctx;
+	struct lustre_msg	       *msg;
+	struct ptlrpc_bulk_sec_desc     *bsd;
+	rawobj_t			 token;
+	__u32			    maj;
+	int			      offset;
+	int			      rc;
+	ENTRY;
+
+	LASSERT(req->rq_pack_bulk);
+	LASSERT(req->rq_bulk_read || req->rq_bulk_write);
+
+	gctx = container_of(ctx, struct gss_cli_ctx, gc_base);
+	LASSERT(gctx->gc_mechctx);
+
+	/* locate the message and segment holding the bsd */
+	switch (SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc)) {
+	case SPTLRPC_SVC_NULL:
+		LASSERT(req->rq_reqbuf->lm_bufcount >= 3);
+		msg = req->rq_reqbuf;
+		offset = msg->lm_bufcount - 1;
+		break;
+	case SPTLRPC_SVC_AUTH:
+	case SPTLRPC_SVC_INTG:
+		LASSERT(req->rq_reqbuf->lm_bufcount >= 4);
+		msg = req->rq_reqbuf;
+		offset = msg->lm_bufcount - 2;
+		break;
+	case SPTLRPC_SVC_PRIV:
+		LASSERT(req->rq_clrbuf->lm_bufcount >= 2);
+		msg = req->rq_clrbuf;
+		offset = msg->lm_bufcount - 1;
+		break;
+	default:
+		LBUG();
+	}
+
+	bsd = lustre_msg_buf(msg, offset, sizeof(*bsd));
+	/* assert rather than oops on a missing descriptor, as the unwrap
+	 * path does for its descriptors */
+	LASSERT(bsd);
+	bsd->bsd_version = 0;
+	bsd->bsd_flags = 0;
+	bsd->bsd_type = SPTLRPC_BULK_DEFAULT;
+	bsd->bsd_svc = SPTLRPC_FLVR_BULK_SVC(req->rq_flvr.sf_rpc);
+
+	if (bsd->bsd_svc == SPTLRPC_BULK_SVC_NULL)
+		RETURN(0);
+
+	LASSERT(bsd->bsd_svc == SPTLRPC_BULK_SVC_INTG ||
+		bsd->bsd_svc == SPTLRPC_BULK_SVC_PRIV);
+
+	if (req->rq_bulk_read) {
+		/*
+		 * bulk read: prepare receiving pages only for privacy mode.
+		 */
+		if (bsd->bsd_svc == SPTLRPC_BULK_SVC_PRIV)
+			RETURN(gss_cli_prep_bulk(req, desc));
+	} else {
+		/*
+		 * bulk write: sign or encrypt bulk pages.
+		 */
+		bsd->bsd_nob = desc->bd_nob;
+
+		if (bsd->bsd_svc == SPTLRPC_BULK_SVC_INTG) {
+			/* integrity mode */
+			/* the token occupies whatever follows the fixed
+			 * part of the bsd in its segment */
+			token.data = bsd->bsd_data;
+			token.len = lustre_msg_buflen(msg, offset) -
+				    sizeof(*bsd);
+
+			maj = lgss_get_mic(gctx->gc_mechctx, 0, NULL,
+					   desc->bd_iov_count, desc->bd_iov,
+					   &token);
+			if (maj != GSS_S_COMPLETE) {
+				CWARN("failed to sign bulk data: %x\n", maj);
+				RETURN(-EACCES);
+			}
+		} else {
+			/* privacy mode */
+			if (desc->bd_iov_count == 0)
+				RETURN(0);
+
+			rc = sptlrpc_enc_pool_get_pages(desc);
+			if (rc) {
+				CERROR("bulk write: failed to allocate "
+				       "encryption pages: %d\n", rc);
+				RETURN(rc);
+			}
+
+			token.data = bsd->bsd_data;
+			token.len = lustre_msg_buflen(msg, offset) -
+				    sizeof(*bsd);
+
+			maj = lgss_wrap_bulk(gctx->gc_mechctx, desc, &token, 0);
+			if (maj != GSS_S_COMPLETE) {
+				CWARN("fail to encrypt bulk data: %x\n", maj);
+				RETURN(-EACCES);
+			}
+		}
+	}
+
+	RETURN(0);
+}
+
+/*
+ * Client side: check the bulk security descriptor returned by the server
+ * against the one that was sent and, for bulk reads, verify or decrypt the
+ * received pages.
+ *
+ * bsdr is the descriptor in the request message, bsdv the one in the reply
+ * message; their segment positions depend on the RPC service level.
+ *
+ * Returns 0 on success, -EPROTO when the two descriptors disagree, -EIO when
+ * the server flagged a bulk write failure, and -EACCES when the GSS
+ * mechanism fails to verify or decrypt a bulk read.
+ */
+int gss_cli_ctx_unwrap_bulk(struct ptlrpc_cli_ctx *ctx,
+			    struct ptlrpc_request *req,
+			    struct ptlrpc_bulk_desc *desc)
+{
+	struct gss_cli_ctx	      *gctx;
+	struct lustre_msg	       *rmsg, *vmsg;
+	struct ptlrpc_bulk_sec_desc     *bsdr, *bsdv;
+	rawobj_t			 token;
+	__u32			    maj;
+	int			      roff, voff;
+	ENTRY;
+
+	LASSERT(req->rq_pack_bulk);
+	LASSERT(req->rq_bulk_read || req->rq_bulk_write);
+
+	/* NB: each message pointer is asserted non-NULL before lm_bufcount
+	 * is read from it to compute the segment offset */
+	switch (SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc)) {
+	case SPTLRPC_SVC_NULL:
+		vmsg = req->rq_repdata;
+		LASSERT(vmsg && vmsg->lm_bufcount >= 3);
+		voff = vmsg->lm_bufcount - 1;
+
+		rmsg = req->rq_reqbuf;
+		LASSERT(rmsg && rmsg->lm_bufcount >= 3);
+		roff = rmsg->lm_bufcount - 1; /* last segment */
+		break;
+	case SPTLRPC_SVC_AUTH:
+	case SPTLRPC_SVC_INTG:
+		vmsg = req->rq_repdata;
+		LASSERT(vmsg && vmsg->lm_bufcount >= 4);
+		voff = vmsg->lm_bufcount - 2;
+
+		rmsg = req->rq_reqbuf;
+		LASSERT(rmsg && rmsg->lm_bufcount >= 4);
+		roff = rmsg->lm_bufcount - 2; /* second last segment */
+		break;
+	case SPTLRPC_SVC_PRIV:
+		vmsg = req->rq_repdata;
+		LASSERT(vmsg && vmsg->lm_bufcount >= 2);
+		voff = vmsg->lm_bufcount - 1;
+
+		rmsg = req->rq_clrbuf;
+		LASSERT(rmsg && rmsg->lm_bufcount >= 2);
+		roff = rmsg->lm_bufcount - 1; /* last segment */
+		break;
+	default:
+		LBUG();
+	}
+
+	bsdr = lustre_msg_buf(rmsg, roff, sizeof(*bsdr));
+	bsdv = lustre_msg_buf(vmsg, voff, sizeof(*bsdv));
+	LASSERT(bsdr && bsdv);
+
+	if (bsdr->bsd_version != bsdv->bsd_version ||
+	    bsdr->bsd_type != bsdv->bsd_type ||
+	    bsdr->bsd_svc != bsdv->bsd_svc) {
+		CERROR("bulk security descriptor mismatch: "
+		       "(%u,%u,%u) != (%u,%u,%u)\n",
+		       bsdr->bsd_version, bsdr->bsd_type, bsdr->bsd_svc,
+		       bsdv->bsd_version, bsdv->bsd_type, bsdv->bsd_svc);
+		RETURN(-EPROTO);
+	}
+
+	LASSERT(bsdv->bsd_svc == SPTLRPC_BULK_SVC_NULL ||
+		bsdv->bsd_svc == SPTLRPC_BULK_SVC_INTG ||
+		bsdv->bsd_svc == SPTLRPC_BULK_SVC_PRIV);
+
+	/*
+	 * in privacy mode if return success, make sure bd_nob_transferred
+	 * is the actual size of the clear text, otherwise upper layer
+	 * may be surprised.
+	 */
+	if (req->rq_bulk_write) {
+		if (bsdv->bsd_flags & BSD_FL_ERR) {
+			CERROR("server reported bulk i/o failure\n");
+			RETURN(-EIO);
+		}
+
+		if (bsdv->bsd_svc == SPTLRPC_BULK_SVC_PRIV)
+			desc->bd_nob_transferred = desc->bd_nob;
+	} else {
+		/*
+		 * bulk read, upon return success, bd_nob_transferred is
+		 * the size of plain text actually received.
+		 */
+		gctx = container_of(ctx, struct gss_cli_ctx, gc_base);
+		LASSERT(gctx->gc_mechctx);
+
+		if (bsdv->bsd_svc == SPTLRPC_BULK_SVC_INTG) {
+			int i, nob;
+
+			/* fix the actual data size */
+			/* i.e. trim the page lengths so that they add up to
+			 * no more than bd_nob_transferred */
+			for (i = 0, nob = 0; i < desc->bd_iov_count; i++) {
+				if (desc->bd_iov[i].kiov_len + nob >
+				    desc->bd_nob_transferred) {
+					desc->bd_iov[i].kiov_len =
+						desc->bd_nob_transferred - nob;
+				}
+				nob += desc->bd_iov[i].kiov_len;
+			}
+
+			token.data = bsdv->bsd_data;
+			token.len = lustre_msg_buflen(vmsg, voff) -
+				    sizeof(*bsdv);
+
+			maj = lgss_verify_mic(gctx->gc_mechctx, 0, NULL,
+					      desc->bd_iov_count, desc->bd_iov,
+					      &token);
+			if (maj != GSS_S_COMPLETE) {
+				CERROR("failed to verify bulk read: %x\n", maj);
+				RETURN(-EACCES);
+			}
+		} else if (bsdv->bsd_svc == SPTLRPC_BULK_SVC_PRIV) {
+			desc->bd_nob = bsdv->bsd_nob;
+			if (desc->bd_nob == 0)
+				RETURN(0);
+
+			/* the token sits in the reply descriptor, so size
+			 * it from bsdv (bsdr has the same type) */
+			token.data = bsdv->bsd_data;
+			token.len = lustre_msg_buflen(vmsg, voff) -
+				    sizeof(*bsdv);
+
+			maj = lgss_unwrap_bulk(gctx->gc_mechctx, desc,
+					       &token, 1);
+			if (maj != GSS_S_COMPLETE) {
+				CERROR("failed to decrypt bulk read: %x\n",
+				       maj);
+				RETURN(-EACCES);
+			}
+
+			desc->bd_nob_transferred = desc->bd_nob;
+		}
+	}
+
+	RETURN(0);
+}
+
+/*
+ * Common helper for the client and server prepare paths: obtain encryption
+ * pool pages for \a desc and let the mechanism prepare the bulk.
+ *
+ * Returns 0 on success or when the descriptor has no pages, the error from
+ * sptlrpc_enc_pool_get_pages(), or -EACCES if the mechanism refuses.
+ */
+static int gss_prep_bulk(struct ptlrpc_bulk_desc *desc,
+			 struct gss_ctx *mechctx)
+{
+	int     err = 0;
+
+	if (desc->bd_iov_count != 0) {
+		err = sptlrpc_enc_pool_get_pages(desc);
+		if (err == 0 &&
+		    lgss_prep_bulk(mechctx, desc) != GSS_S_COMPLETE)
+			err = -EACCES;
+	}
+
+	return err;
+}
+
+/*
+ * Client side: prepare the receiving pages of a bulk read.  Only privacy
+ * mode needs any preparation; every other bulk service level is a no-op.
+ *
+ * Returns 0 on success (or nothing to do), otherwise the error from
+ * gss_prep_bulk().
+ */
+int gss_cli_prep_bulk(struct ptlrpc_request *req,
+		      struct ptlrpc_bulk_desc *desc)
+{
+	int	     rc = 0;
+	ENTRY;
+
+	LASSERT(req->rq_cli_ctx);
+	LASSERT(req->rq_pack_bulk);
+	LASSERT(req->rq_bulk_read);
+
+	if (SPTLRPC_FLVR_BULK_SVC(req->rq_flvr.sf_rpc) ==
+	    SPTLRPC_BULK_SVC_PRIV) {
+		rc = gss_prep_bulk(desc,
+				   ctx2gctx(req->rq_cli_ctx)->gc_mechctx);
+		if (rc != 0)
+			CERROR("bulk read: failed to prepare encryption "
+			       "pages: %d\n", rc);
+	}
+
+	RETURN(rc);
+}
+
+/*
+ * Server side: prepare the receiving pages of a bulk write.  As on the
+ * client, only privacy mode (as recorded in the request's bulk security
+ * descriptor) needs any preparation.
+ *
+ * Returns 0 on success (or nothing to do), otherwise the error from
+ * gss_prep_bulk().
+ */
+int gss_svc_prep_bulk(struct ptlrpc_request *req,
+		      struct ptlrpc_bulk_desc *desc)
+{
+	struct gss_svc_reqctx	*grctx;
+	int			   rc = 0;
+	ENTRY;
+
+	LASSERT(req->rq_svc_ctx);
+	LASSERT(req->rq_pack_bulk);
+	LASSERT(req->rq_bulk_write);
+
+	grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx);
+	LASSERT(grctx->src_reqbsd);
+	LASSERT(grctx->src_repbsd);
+	LASSERT(grctx->src_ctx);
+	LASSERT(grctx->src_ctx->gsc_mechctx);
+
+	if (grctx->src_reqbsd->bsd_svc == SPTLRPC_BULK_SVC_PRIV) {
+		rc = gss_prep_bulk(desc, grctx->src_ctx->gsc_mechctx);
+		if (rc != 0)
+			CERROR("bulk write: failed to prepare encryption "
+			       "pages: %d\n", rc);
+	}
+
+	RETURN(rc);
+}
+
+/*
+ * Server side: verify (integrity) or decrypt (privacy) the pages received
+ * for a bulk write, and initialise the reply bulk security descriptor.
+ *
+ * On any failure BSD_FL_ERR is set in the reply descriptor before returning.
+ * Returns 0 on success, -EACCES when the GSS mechanism rejects the data and
+ * -EPROTO when the byte count announced by the client does not match the
+ * prepared descriptor.
+ */
+int gss_svc_unwrap_bulk(struct ptlrpc_request *req,
+			struct ptlrpc_bulk_desc *desc)
+{
+	struct gss_svc_reqctx	*grctx;
+	struct ptlrpc_bulk_sec_desc  *req_bsd, *rep_bsd;
+	rawobj_t		      tok;
+	__u32			 major;
+	ENTRY;
+
+	LASSERT(req->rq_svc_ctx);
+	LASSERT(req->rq_pack_bulk);
+	LASSERT(req->rq_bulk_write);
+
+	grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx);
+
+	LASSERT(grctx->src_reqbsd);
+	LASSERT(grctx->src_repbsd);
+	LASSERT(grctx->src_ctx);
+	LASSERT(grctx->src_ctx->gsc_mechctx);
+
+	req_bsd = grctx->src_reqbsd;
+	rep_bsd = grctx->src_repbsd;
+
+	/* the request descriptor was sanity checked while unpacking; the
+	 * reply descriptor mirrors its service level */
+	rep_bsd->bsd_version = 0;
+	rep_bsd->bsd_type = SPTLRPC_BULK_DEFAULT;
+	rep_bsd->bsd_svc = req_bsd->bsd_svc;
+	rep_bsd->bsd_flags = 0;
+
+	if (rep_bsd->bsd_svc == SPTLRPC_BULK_SVC_INTG) {
+		/* integrity: check the client's signature over the pages */
+		tok.data = req_bsd->bsd_data;
+		tok.len = grctx->src_reqbsd_size - sizeof(*req_bsd);
+
+		major = lgss_verify_mic(grctx->src_ctx->gsc_mechctx, 0, NULL,
+					desc->bd_iov_count, desc->bd_iov,
+					&tok);
+		if (major != GSS_S_COMPLETE) {
+			rep_bsd->bsd_flags |= BSD_FL_ERR;
+			CERROR("failed to verify bulk signature: %x\n", major);
+			RETURN(-EACCES);
+		}
+	} else if (rep_bsd->bsd_svc == SPTLRPC_BULK_SVC_PRIV) {
+		/* privacy: sizes must agree before anything is decrypted */
+		if (req_bsd->bsd_nob != desc->bd_nob) {
+			rep_bsd->bsd_flags |= BSD_FL_ERR;
+			CERROR("prepared nob %d doesn't match the actual "
+			       "nob %d\n", desc->bd_nob, req_bsd->bsd_nob);
+			RETURN(-EPROTO);
+		}
+
+		if (desc->bd_iov_count == 0) {
+			LASSERT(desc->bd_nob == 0);
+			RETURN(0);
+		}
+
+		tok.data = req_bsd->bsd_data;
+		tok.len = grctx->src_reqbsd_size - sizeof(*req_bsd);
+
+		major = lgss_unwrap_bulk(grctx->src_ctx->gsc_mechctx,
+					 desc, &tok, 0);
+		if (major != GSS_S_COMPLETE) {
+			rep_bsd->bsd_flags |= BSD_FL_ERR;
+			CERROR("failed decrypt bulk data: %x\n", major);
+			RETURN(-EACCES);
+		}
+	}
+
+	RETURN(0);
+}
+
+/*
+ * Server side: sign (integrity) or encrypt (privacy) the pages of a bulk
+ * read before they are sent, storing the resulting token in the reply bulk
+ * security descriptor.
+ *
+ * On any failure BSD_FL_ERR is set in the reply descriptor before returning.
+ * Returns 0 on success, -EACCES when the GSS mechanism fails, or the error
+ * from sptlrpc_enc_pool_get_pages().
+ */
+int gss_svc_wrap_bulk(struct ptlrpc_request *req,
+		      struct ptlrpc_bulk_desc *desc)
+{
+	struct gss_svc_reqctx	*grctx;
+	struct ptlrpc_bulk_sec_desc  *req_bsd, *rep_bsd;
+	rawobj_t		      tok;
+	__u32			 major;
+	int			   err;
+	ENTRY;
+
+	LASSERT(req->rq_svc_ctx);
+	LASSERT(req->rq_pack_bulk);
+	LASSERT(req->rq_bulk_read);
+
+	grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx);
+
+	LASSERT(grctx->src_reqbsd);
+	LASSERT(grctx->src_repbsd);
+	LASSERT(grctx->src_ctx);
+	LASSERT(grctx->src_ctx->gsc_mechctx);
+
+	req_bsd = grctx->src_reqbsd;
+	rep_bsd = grctx->src_repbsd;
+
+	/* the request descriptor was sanity checked while unpacking; the
+	 * reply descriptor mirrors its service level */
+	rep_bsd->bsd_version = 0;
+	rep_bsd->bsd_type = SPTLRPC_BULK_DEFAULT;
+	rep_bsd->bsd_svc = req_bsd->bsd_svc;
+	rep_bsd->bsd_flags = 0;
+
+	if (rep_bsd->bsd_svc == SPTLRPC_BULK_SVC_INTG) {
+		/* integrity: the signature goes into the reply descriptor */
+		tok.data = rep_bsd->bsd_data;
+		tok.len = grctx->src_repbsd_size - sizeof(*rep_bsd);
+
+		major = lgss_get_mic(grctx->src_ctx->gsc_mechctx, 0, NULL,
+				     desc->bd_iov_count, desc->bd_iov, &tok);
+		if (major != GSS_S_COMPLETE) {
+			rep_bsd->bsd_flags |= BSD_FL_ERR;
+			CERROR("failed to sign bulk data: %x\n", major);
+			RETURN(-EACCES);
+		}
+	} else if (rep_bsd->bsd_svc == SPTLRPC_BULK_SVC_PRIV) {
+		/* privacy: announce the size, then encrypt if there is data */
+		rep_bsd->bsd_nob = desc->bd_nob;
+
+		if (desc->bd_iov_count == 0) {
+			LASSERT(desc->bd_nob == 0);
+			RETURN(0);
+		}
+
+		err = sptlrpc_enc_pool_get_pages(desc);
+		if (err) {
+			rep_bsd->bsd_flags |= BSD_FL_ERR;
+			CERROR("bulk read: failed to allocate encryption "
+			       "pages: %d\n", err);
+			RETURN(err);
+		}
+
+		tok.data = rep_bsd->bsd_data;
+		tok.len = grctx->src_repbsd_size - sizeof(*rep_bsd);
+
+		major = lgss_wrap_bulk(grctx->src_ctx->gsc_mechctx,
+				       desc, &tok, 1);
+		if (major != GSS_S_COMPLETE) {
+			rep_bsd->bsd_flags |= BSD_FL_ERR;
+			CERROR("failed to encrypt bulk data: %x\n", major);
+			RETURN(-EACCES);
+		}
+	}
+
+	RETURN(0);
+}

diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/gss_cli_upcall.c b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_cli_upcall.c
new file mode 100644
index 0000000..142c789
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_cli_upcall.c

@@ -0,0 +1,447 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/gss/gss_cli_upcall.c
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/dcache.h>
+#include <linux/fs.h>
+#include <linux/mutex.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_net.h>
+#include <lustre_import.h>
+#include <lustre_sec.h>
+
+#include "gss_err.h"
+#include "gss_internal.h"
+#include "gss_api.h"
+
+/**********************************************
+ * gss context init/fini helper	       *
+ **********************************************/
+
+/*
+ * Fill in the buffers of a context-init request.
+ *
+ * Buffer 0 receives the gss header.  If the request packs a user
+ * descriptor it is written at buffer 2 with the given uid/gid, and the
+ * security payload then lives in the following buffer.  The payload is
+ * built from: the lustre service type, the target uuid, the reverse
+ * context handle and finally the negotiation token, which is copied from
+ * the user-space pointer "token".  The payload buffer is shrunk to the
+ * size actually used.
+ *
+ * Returns 0 on success, -EINVAL if token_size is negative or does not fit
+ * into the remaining payload space, -EFAULT if the token can't be copied.
+ */
+static
+int ctx_init_pack_request(struct obd_import *imp,
+			  struct ptlrpc_request *req,
+			  int lustre_srv,
+			  uid_t uid, gid_t gid,
+			  long token_size,
+			  char __user *token)
+{
+	struct lustre_msg       *msg = req->rq_reqbuf;
+	struct gss_sec	  *gsec;
+	struct gss_header       *ghdr;
+	struct ptlrpc_user_desc *pud;
+	__u32		   *p, size, offset = 2;
+	rawobj_t		 obj;
+
+	LASSERT(msg->lm_bufcount <= 4);
+	LASSERT(req->rq_cli_ctx);
+	LASSERT(req->rq_cli_ctx->cc_sec);
+
+	/* gss hdr */
+	ghdr = lustre_msg_buf(msg, 0, sizeof(*ghdr));
+	ghdr->gh_version = PTLRPC_GSS_VERSION;
+	ghdr->gh_sp = (__u8) imp->imp_sec->ps_part;
+	ghdr->gh_flags = 0;
+	ghdr->gh_proc = PTLRPC_GSS_PROC_INIT;
+	ghdr->gh_seq = 0;
+	ghdr->gh_svc = SPTLRPC_SVC_NULL;
+	ghdr->gh_handle.len = 0;
+
+	/* fix the user desc */
+	if (req->rq_pack_udesc) {
+		ghdr->gh_flags |= LUSTRE_GSS_PACK_USER;
+
+		pud = lustre_msg_buf(msg, offset, sizeof(*pud));
+		LASSERT(pud);
+		pud->pud_uid = pud->pud_fsuid = uid;
+		pud->pud_gid = pud->pud_fsgid = gid;
+		pud->pud_cap = 0;
+		pud->pud_ngroups = 0;
+		offset++;
+	}
+
+	/* security payload */
+	p = lustre_msg_buf(msg, offset, 0);
+	size = msg->lm_buflens[offset];
+	LASSERT(p);
+
+	/* 1. lustre svc type */
+	LASSERT(size > 4);
+	*p++ = cpu_to_le32(lustre_srv);
+	size -= 4;
+
+	/* 2. target uuid */
+	obj.len = strlen(imp->imp_obd->u.cli.cl_target_uuid.uuid) + 1;
+	obj.data = imp->imp_obd->u.cli.cl_target_uuid.uuid;
+	if (rawobj_serialize(&obj, &p, &size))
+		LBUG();
+
+	/* 3. reverse context handle. actually only needed by root user,
+	 *    but we send it anyway. */
+	gsec = sec2gsec(req->rq_cli_ctx->cc_sec);
+	obj.len = sizeof(gsec->gs_rvs_hdl);
+	obj.data = (__u8 *) &gsec->gs_rvs_hdl;
+	if (rawobj_serialize(&obj, &p, &size))
+		LBUG();
+
+	/* 4. now the token.  token_size is supplied by user space, so a bad
+	 *    value must be refused with an error rather than asserted on.
+	 *    A negative value is checked explicitly because it would wrap
+	 *    when mixed with the unsigned sizes below. */
+	if (token_size < 0 || size < sizeof(__u32) ||
+	    (unsigned long) token_size > size - sizeof(__u32)) {
+		CERROR("invalid token size %ld, only %u bytes available\n",
+		       token_size, size);
+		return -EINVAL;
+	}
+	*p++ = cpu_to_le32(((__u32) token_size));
+	if (copy_from_user(p, token, token_size)) {
+		CERROR("can't copy token\n");
+		return -EFAULT;
+	}
+	size -= sizeof(__u32) + cfs_size_round4(token_size);
+
+	req->rq_reqdata_len = lustre_shrink_msg(req->rq_reqbuf, offset,
+						msg->lm_buflens[offset] - size, 0);
+	return 0;
+}
+
+/*
+ * Copy the interesting parts of a context-init reply to the user buffer
+ * "outbuf" (of "outlen" bytes).  The data written is, in order:
+ *   - four 32-bit words: a status word (always 0 here), gh_major,
+ *     gh_minor and gh_seqwin from the gss reply header;
+ *   - the context handle: 32-bit length followed by the handle bytes,
+ *     padded to a multiple of 4;
+ *   - the out token (reply buffer 2): 32-bit length followed by the
+ *     token bytes, padded to a multiple of 4.
+ *
+ * Returns the number of bytes written to outbuf, -EPROTO if the reply
+ * is malformed, or -EFAULT if outbuf is too small or can't be written.
+ */
+static
+int ctx_init_parse_reply(struct lustre_msg *msg, int swabbed,
+			 char __user *outbuf, long outlen)
+{
+	struct gss_rep_header   *rephdr;
+	__u32		    len, padded;
+	__u32		    status = 0, copied = 0;
+
+	if (msg->lm_bufcount != 3) {
+		CERROR("unexpected bufcount %u\n", msg->lm_bufcount);
+		return -EPROTO;
+	}
+
+	rephdr = (struct gss_rep_header *) gss_swab_header(msg, 0, swabbed);
+	if (!rephdr) {
+		CERROR("unable to extract gss reply header\n");
+		return -EPROTO;
+	}
+
+	if (rephdr->gh_version != PTLRPC_GSS_VERSION) {
+		CERROR("invalid gss version %u\n", rephdr->gh_version);
+		return -EPROTO;
+	}
+
+	if (outlen < (4 + 2) * 4 + cfs_size_round4(rephdr->gh_handle.len) +
+		     cfs_size_round4(msg->lm_buflens[2])) {
+		CERROR("output buffer size %ld too small\n", outlen);
+		return -EFAULT;
+	}
+
+	/* fixed part: status, major, minor, seqwin; stop at first failure */
+	if (copy_to_user(outbuf, &status, 4) ||
+	    copy_to_user(outbuf + 4, &rephdr->gh_major, 4) ||
+	    copy_to_user(outbuf + 8, &rephdr->gh_minor, 4) ||
+	    copy_to_user(outbuf + 12, &rephdr->gh_seqwin, 4))
+		return -EFAULT;
+	outbuf += 4 * 4;
+	copied += 4 * 4;
+
+	/* handle */
+	len = rephdr->gh_handle.len;
+	padded = (len + 3) & ~ 3;
+	if (copy_to_user(outbuf, &len, 4))
+		return -EFAULT;
+	outbuf += 4;
+	if (copy_to_user(outbuf, (char *) rephdr->gh_handle.data, padded))
+		return -EFAULT;
+	outbuf += padded;
+	copied += 4 + padded;
+
+	/* out token */
+	len = msg->lm_buflens[2];
+	padded = (len + 3) & ~ 3;
+	if (copy_to_user(outbuf, &len, 4))
+		return -EFAULT;
+	outbuf += 4;
+	if (copy_to_user(outbuf, lustre_msg_buf(msg, 2, 0), padded))
+		return -EFAULT;
+	outbuf += padded;
+	copied += 4 + padded;
+
+	return copied;
+}
+
+/* XXX move to where lgssd could see */
+/*
+ * Parameter block exchanged with user space by gss_do_ctx_init_rpc().
+ * As used there:
+ *   version          must equal GSSD_INTERFACE_VERSION
+ *   secid            compared against the ps_id of the request's security
+ *   uuid             user pointer to the name of the client obd device
+ *   lustre_svc       packed as the service type word of the init request
+ *   uid, gid         written into the request's user descriptor
+ *   send_token(_size) user buffer holding the token to send, and its size
+ *   reply_buf(_size) user buffer that receives the parsed reply, and its
+ *                    size; the pointer is an input, the contents are output
+ *   status           0 or a negative errno describing the negotiation
+ *   reply_length     number of bytes written to reply_buf on success
+ */
+struct lgssd_ioctl_param {
+	int	     version;	/* in   */
+	int	     secid;	  /* in   */
+	char	   *uuid;	   /* in   */
+	int	     lustre_svc;     /* in   */
+	uid_t	   uid;	    /* in   */
+	gid_t	   gid;	    /* in   */
+	long	    send_token_size;/* in   */
+	char	   *send_token;     /* in   */
+	long	    reply_buf_size; /* in   */
+	char	   *reply_buf;      /* in   */
+	long	    status;	 /* out  */
+	long	    reply_length;   /* out  */
+};
+
+/*
+ * Perform one context-init rpc on behalf of the user-space gss daemon.
+ *
+ * "buffer" points to a struct lgssd_ioctl_param in user space and "count"
+ * must be exactly its size.  The named obd must be a set-up, not stopping
+ * MDC/OSC/MGC device with an active import.  The token supplied in the
+ * parameter block is sent in a SEC_CTX_INIT request; the parsed reply is
+ * copied to param.reply_buf, and the parameter block (with status and
+ * reply_length filled in) is copied back to "buffer".
+ *
+ * Returns a negative errno if the parameters are rejected before any rpc
+ * is prepared.  Otherwise returns 0 if the parameter block was copied
+ * back (the negotiation result is then in param.status) or -EFAULT if
+ * that copy failed.
+ */
+int gss_do_ctx_init_rpc(__user char *buffer, unsigned long count)
+{
+	struct obd_import	*imp;
+	struct ptlrpc_request    *req;
+	struct lgssd_ioctl_param  param;
+	struct obd_device	*obd;
+	char		      obdname[64];
+	long		      namelen;
+	long		      lsize;
+	int		       rc;
+
+	if (count != sizeof(param)) {
+		CERROR("ioctl size %lu, expect %lu, please check lgss_keyring "
+		       "version\n", count, (unsigned long) sizeof(param));
+		RETURN(-EINVAL);
+	}
+	if (copy_from_user(&param, buffer, sizeof(param))) {
+		CERROR("failed copy data from lgssd\n");
+		RETURN(-EFAULT);
+	}
+
+	if (param.version != GSSD_INTERFACE_VERSION) {
+		CERROR("gssd interface version %d (expect %d)\n",
+			param.version, GSSD_INTERFACE_VERSION);
+		RETURN(-EINVAL);
+	}
+
+	/* take name */
+	namelen = strncpy_from_user(obdname, param.uuid, sizeof(obdname));
+	if (namelen <= 0) {
+		CERROR("Invalid obdname pointer\n");
+		RETURN(-EFAULT);
+	}
+	/* strncpy_from_user() returns the full count, and leaves the buffer
+	 * without a terminating NUL, when the string doesn't fit: refuse
+	 * such a name instead of using an unterminated string below. */
+	if (namelen >= (long) sizeof(obdname)) {
+		CERROR("obdname too long\n");
+		RETURN(-EINVAL);
+	}
+
+	obd = class_name2obd(obdname);
+	if (!obd) {
+		CERROR("no such obd %s\n", obdname);
+		RETURN(-EINVAL);
+	}
+
+	if (unlikely(!obd->obd_set_up)) {
+		CERROR("obd %s not setup\n", obdname);
+		RETURN(-EINVAL);
+	}
+
+	spin_lock(&obd->obd_dev_lock);
+	if (obd->obd_stopping) {
+		CERROR("obd %s has stopped\n", obdname);
+		spin_unlock(&obd->obd_dev_lock);
+		RETURN(-EINVAL);
+	}
+
+	if (strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) &&
+	    strcmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME) &&
+	    strcmp(obd->obd_type->typ_name, LUSTRE_MGC_NAME)) {
+		CERROR("obd %s is not a client device\n", obdname);
+		spin_unlock(&obd->obd_dev_lock);
+		RETURN(-EINVAL);
+	}
+	spin_unlock(&obd->obd_dev_lock);
+
+	/* take a reference on the import while holding cl_sem */
+	down_read(&obd->u.cli.cl_sem);
+	if (obd->u.cli.cl_import == NULL) {
+		CERROR("obd %s: import has gone\n", obd->obd_name);
+		up_read(&obd->u.cli.cl_sem);
+		RETURN(-EINVAL);
+	}
+	imp = class_import_get(obd->u.cli.cl_import);
+	up_read(&obd->u.cli.cl_sem);
+
+	if (imp->imp_deactive) {
+		CERROR("import has been deactivated\n");
+		class_import_put(imp);
+		RETURN(-EINVAL);
+	}
+
+	/* from here on every outcome is reported through param.status */
+	req = ptlrpc_request_alloc_pack(imp, &RQF_SEC_CTX, LUSTRE_OBD_VERSION,
+					SEC_CTX_INIT);
+	if (req == NULL) {
+		param.status = -ENOMEM;
+		goto out_copy;
+	}
+
+	if (req->rq_cli_ctx->cc_sec->ps_id != param.secid) {
+		CWARN("original secid %d, now has changed to %d, "
+		      "cancel this negotiation\n", param.secid,
+		      req->rq_cli_ctx->cc_sec->ps_id);
+		param.status = -EINVAL;
+		goto out_copy;
+	}
+
+	/* get token */
+	rc = ctx_init_pack_request(imp, req,
+				   param.lustre_svc,
+				   param.uid, param.gid,
+				   param.send_token_size,
+				   param.send_token);
+	if (rc) {
+		param.status = rc;
+		goto out_copy;
+	}
+
+	ptlrpc_request_set_replen(req);
+
+	rc = ptlrpc_queue_wait(req);
+	if (rc) {
+		/* If any _real_ denial be made, we expect server return
+		 * -EACCES reply or return success but indicate gss error
+		 * inside reply message. All other errors are treated as
+		 * timeout, caller might try the negotiation repeatedly,
+		 * leave recovery decisions to general ptlrpc layer.
+		 *
+		 * FIXME maybe some other error code shouldn't be treated
+		 * as timeout. */
+		param.status = rc;
+		if (rc != -EACCES)
+			param.status = -ETIMEDOUT;
+		goto out_copy;
+	}
+
+	LASSERT(req->rq_repdata);
+	lsize = ctx_init_parse_reply(req->rq_repdata,
+				     ptlrpc_rep_need_swab(req),
+				     param.reply_buf, param.reply_buf_size);
+	if (lsize < 0) {
+		param.status = (int) lsize;
+		goto out_copy;
+	}
+
+	param.status = 0;
+	param.reply_length = lsize;
+
+out_copy:
+	if (copy_to_user(buffer, &param, sizeof(param)))
+		rc = -EFAULT;
+	else
+		rc = 0;
+
+	class_import_put(imp);
+	ptlrpc_req_finished(req);
+	RETURN(rc);
+}
+
+/*
+ * Tell the peer to destroy the gss context "gctx" by sending a
+ * SEC_CTX_FINI request.
+ *
+ * Nothing is sent (and 0 is returned) if the context is in error or not
+ * uptodate.  Otherwise the request is handed to ptl_send_rpc() and our
+ * reference on it is dropped without waiting for a reply here.  Failures
+ * are only logged; the context is destroyed locally in any case.
+ *
+ * Returns 0 on success or a negative errno if the request could not be
+ * prepared or sent.
+ */
+int gss_do_ctx_fini_rpc(struct gss_cli_ctx *gctx)
+{
+	struct ptlrpc_cli_ctx   *ctx = &gctx->gc_base;
+	struct obd_import       *imp = ctx->cc_sec->ps_import;
+	struct ptlrpc_request   *req;
+	struct ptlrpc_user_desc *pud;
+	int		      rc;
+	ENTRY;
+
+	LASSERT(atomic_read(&ctx->cc_refcount) > 0);
+
+	if (cli_ctx_is_error(ctx) || !cli_ctx_is_uptodate(ctx)) {
+		CDEBUG(D_SEC, "ctx %p(%u->%s) not uptodate, "
+		       "don't send destroy rpc\n", ctx,
+		       ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec));
+		RETURN(0);
+	}
+
+	might_sleep();
+
+	CWARN("%s ctx %p idx "LPX64" (%u->%s)\n",
+	      sec_is_reverse(ctx->cc_sec) ?
+	      "server finishing reverse" : "client finishing forward",
+	      ctx, gss_handle_to_u64(&gctx->gc_handle),
+	      ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec));
+
+	gctx->gc_proc = PTLRPC_GSS_PROC_DESTROY;
+
+	req = ptlrpc_request_alloc(imp, &RQF_SEC_CTX);
+	if (req == NULL) {
+		CWARN("ctx %p(%u): fail to prepare rpc, destroy locally\n",
+		      ctx, ctx->cc_vcred.vc_uid);
+		GOTO(out, rc = -ENOMEM);
+	}
+
+	rc = ptlrpc_request_bufs_pack(req, LUSTRE_OBD_VERSION, SEC_CTX_FINI,
+				      NULL, ctx);
+	if (rc) {
+		/* The request has just been freed: skip out_ref, which
+		 * would call ptlrpc_req_finished() on it a second time. */
+		ptlrpc_request_free(req);
+		GOTO(out, rc);
+	}
+
+	/* fix the user desc */
+	if (req->rq_pack_udesc) {
+		/* we rely the fact that this request is in AUTH mode,
+		 * and user_desc at offset 2. */
+		pud = lustre_msg_buf(req->rq_reqbuf, 2, sizeof(*pud));
+		LASSERT(pud);
+		pud->pud_uid = pud->pud_fsuid = ctx->cc_vcred.vc_uid;
+		pud->pud_gid = pud->pud_fsgid = ctx->cc_vcred.vc_gid;
+		pud->pud_cap = 0;
+		pud->pud_ngroups = 0;
+	}
+
+	req->rq_phase = RQ_PHASE_RPC;
+	rc = ptl_send_rpc(req, 1);
+	if (rc)
+		CWARN("ctx %p(%u->%s): rpc error %d, destroy locally\n", ctx,
+		      ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec), rc);
+
+out_ref:
+	ptlrpc_req_finished(req);
+out:
+	RETURN(rc);
+}
+
+/*
+ * Initialization hook for the client upcall code.  There is currently
+ * nothing to set up, so it always returns 0.
+ */
+int __init gss_init_cli_upcall(void)
+{
+	return 0;
+}
+
+/*
+ * Cleanup counterpart of gss_init_cli_upcall().  Intentionally empty:
+ * the init hook acquires nothing that would need to be released.
+ */
+void __exit gss_exit_cli_upcall(void)
+{
+}

diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/gss_err.h b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_err.h
new file mode 100644
index 0000000..1342579
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_err.h

@@ -0,0 +1,193 @@
+/*
+ * Modifications for Lustre
+ *
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+/*
+ *  Adapted from MIT Kerberos 5-1.2.1 include/gssapi/gssapi.h
+ *
+ *  Copyright (c) 2002 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Andy Adamson   <andros@umich.edu>
+ */
+
+/*
+ * Copyright 1993 by OpenVision Technologies, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software
+ * and its documentation for any purpose is hereby granted without fee,
+ * provided that the above copyright notice appears in all copies and
+ * that both that copyright notice and this permission notice appear in
+ * supporting documentation, and that the name of OpenVision not be used
+ * in advertising or publicity pertaining to distribution of the software
+ * without specific, written prior permission. OpenVision makes no
+ * representations about the suitability of this software for any
+ * purpose.  It is provided "as is" without express or implied warranty.
+ *
+ * OPENVISION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL OPENVISION BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
+ * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
+ * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+ * PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef __PTLRPC_GSS_GSS_ERR_H_
+#define __PTLRPC_GSS_GSS_ERR_H_
+
+typedef unsigned int OM_uint32;
+
+/*
+ * Flag bits for context-level services.
+ */
+#define GSS_C_DELEG_FLAG	(1)
+#define GSS_C_MUTUAL_FLAG       (2)
+#define GSS_C_REPLAY_FLAG       (4)
+#define GSS_C_SEQUENCE_FLAG     (8)
+#define GSS_C_CONF_FLAG	 (16)
+#define GSS_C_INTEG_FLAG	(32)
+#define GSS_C_ANON_FLAG	 (64)
+#define GSS_C_PROT_READY_FLAG   (128)
+#define GSS_C_TRANS_FLAG	(256)
+
+/*
+ * Credential usage options
+ */
+#define GSS_C_BOTH	      (0)
+#define GSS_C_INITIATE	  (1)
+#define GSS_C_ACCEPT	    (2)
+
+/*
+ * Status code types for gss_display_status
+ */
+#define GSS_C_GSS_CODE	  (1)
+#define GSS_C_MECH_CODE	 (2)
+
+
+/*
+ * Define the default Quality of Protection for per-message services.  Note
+ * that an implementation that offers multiple levels of QOP may either reserve
+ * a value (for example zero, as assumed here) to mean "default protection", or
+ * alternatively may simply equate GSS_C_QOP_DEFAULT to a specific explicit
+ * QOP value.  However a value of 0 should always be interpreted by a GSSAPI
+ * implementation as a request for the default protection level.
+ */
+#define GSS_C_QOP_DEFAULT       (0)
+
+/*
+ * Expiration time of 2^32-1 seconds means infinite lifetime for a
+ * credential or security context
+ */
+#define GSS_C_INDEFINITE	((OM_uint32) 0xfffffffful)
+
+
+/* Major status codes */
+
+#define GSS_S_COMPLETE	  (0)
+
+/*
+ * Some "helper" definitions to make the status code macros obvious.
+ *
+ * A status word is split into three fields.  Each *_OFFSET is the bit
+ * position of a field and each *_MASK is the (unshifted) width of that
+ * field.  The masks are written in octal: 0377 is 0xff (8 bits) and
+ * 0177777 is 0xffff (16 bits).
+ */
+#define GSS_C_CALLING_ERROR_OFFSET      (24)
+#define GSS_C_ROUTINE_ERROR_OFFSET      (16)
+#define GSS_C_SUPPLEMENTARY_OFFSET      (0)
+#define GSS_C_CALLING_ERROR_MASK	((OM_uint32) 0377ul)
+#define GSS_C_ROUTINE_ERROR_MASK	((OM_uint32) 0377ul)
+#define GSS_C_SUPPLEMENTARY_MASK	((OM_uint32) 0177777ul)
+
+/*
+ * The macros that test status codes for error conditions.  Note that the
+ * GSS_ERROR() macro has changed slightly from the V1 GSSAPI so that it now
+ * evaluates its argument only once.
+ */
+#define GSS_CALLING_ERROR(x) \
+  ((x) & (GSS_C_CALLING_ERROR_MASK << GSS_C_CALLING_ERROR_OFFSET))
+#define GSS_ROUTINE_ERROR(x) \
+  ((x) & (GSS_C_ROUTINE_ERROR_MASK << GSS_C_ROUTINE_ERROR_OFFSET))
+#define GSS_SUPPLEMENTARY_INFO(x) \
+  ((x) & (GSS_C_SUPPLEMENTARY_MASK << GSS_C_SUPPLEMENTARY_OFFSET))
+#define GSS_ERROR(x) \
+  ((x) & ((GSS_C_CALLING_ERROR_MASK << GSS_C_CALLING_ERROR_OFFSET) | \
+	  (GSS_C_ROUTINE_ERROR_MASK << GSS_C_ROUTINE_ERROR_OFFSET)))
+
+/*
+ * Now the actual status code definitions
+ */
+
+/*
+ * Calling errors:
+ */
+#define GSS_S_CALL_INACCESSIBLE_READ \
+	(((OM_uint32) 1ul) << GSS_C_CALLING_ERROR_OFFSET)
+#define GSS_S_CALL_INACCESSIBLE_WRITE \
+	(((OM_uint32) 2ul) << GSS_C_CALLING_ERROR_OFFSET)
+#define GSS_S_CALL_BAD_STRUCTURE \
+	(((OM_uint32) 3ul) << GSS_C_CALLING_ERROR_OFFSET)
+
+/*
+ * Routine errors:
+ */
+#define GSS_S_BAD_MECH \
+	(((OM_uint32) 1ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_BAD_NAME \
+	(((OM_uint32) 2ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_BAD_NAMETYPE \
+	(((OM_uint32) 3ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_BAD_BINDINGS \
+	(((OM_uint32) 4ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_BAD_STATUS \
+	(((OM_uint32) 5ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_BAD_SIG \
+	(((OM_uint32) 6ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_NO_CRED \
+	(((OM_uint32) 7ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_NO_CONTEXT \
+	(((OM_uint32) 8ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_DEFECTIVE_TOKEN \
+	(((OM_uint32) 9ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_DEFECTIVE_CREDENTIAL \
+	(((OM_uint32) 10ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_CREDENTIALS_EXPIRED \
+	(((OM_uint32) 11ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_CONTEXT_EXPIRED \
+	(((OM_uint32) 12ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_FAILURE \
+	(((OM_uint32) 13ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_BAD_QOP \
+	(((OM_uint32) 14ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_UNAUTHORIZED \
+	(((OM_uint32) 15ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_UNAVAILABLE \
+	(((OM_uint32) 16ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_DUPLICATE_ELEMENT \
+	(((OM_uint32) 17ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_NAME_NOT_MN \
+	(((OM_uint32) 18ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+
+/*
+ * Supplementary info bits:
+ */
+#define GSS_S_CONTINUE_NEEDED   (1 << (GSS_C_SUPPLEMENTARY_OFFSET + 0))
+#define GSS_S_DUPLICATE_TOKEN   (1 << (GSS_C_SUPPLEMENTARY_OFFSET + 1))
+#define GSS_S_OLD_TOKEN	 (1 << (GSS_C_SUPPLEMENTARY_OFFSET + 2))
+#define GSS_S_UNSEQ_TOKEN       (1 << (GSS_C_SUPPLEMENTARY_OFFSET + 3))
+#define GSS_S_GAP_TOKEN	 (1 << (GSS_C_SUPPLEMENTARY_OFFSET + 4))
+
+/* XXXX these are not part of the GSSAPI C bindings!  (but should be) */
+
+#define GSS_CALLING_ERROR_FIELD(x) \
+	(((x) >> GSS_C_CALLING_ERROR_OFFSET) & GSS_C_CALLING_ERROR_MASK)
+#define GSS_ROUTINE_ERROR_FIELD(x) \
+	(((x) >> GSS_C_ROUTINE_ERROR_OFFSET) & GSS_C_ROUTINE_ERROR_MASK)
+#define GSS_SUPPLEMENTARY_INFO_FIELD(x) \
+	(((x) >> GSS_C_SUPPLEMENTARY_OFFSET) & GSS_C_SUPPLEMENTARY_MASK)
+
+/* XXXX This is a necessary evil until the spec is fixed */
+#define GSS_S_CRED_UNAVAIL GSS_S_FAILURE
+
+#endif /* __PTLRPC_GSS_GSS_ERR_H_ */

diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/gss_generic_token.c b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_generic_token.c
new file mode 100644
index 0000000..20b1638
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_generic_token.c

@@ -0,0 +1,285 @@
+/*
+ * Modifications for Lustre
+ *
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+/*
+ *  linux/net/sunrpc/gss_generic_token.c
+ *
+ *  Adapted from MIT Kerberos 5-1.2.1 lib/gssapi/generic/util_token.c
+ *
+ *  Copyright (c) 2000 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Andy Adamson   <andros@umich.edu>
+ */
+
+/*
+ * Copyright 1993 by OpenVision Technologies, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software
+ * and its documentation for any purpose is hereby granted without fee,
+ * provided that the above copyright notice appears in all copies and
+ * that both that copyright notice and this permission notice appear in
+ * supporting documentation, and that the name of OpenVision not be used
+ * in advertising or publicity pertaining to distribution of the software
+ * without specific, written prior permission. OpenVision makes no
+ * representations about the suitability of this software for any
+ * purpose.  It is provided "as is" without express or implied warranty.
+ *
+ * OPENVISION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL OPENVISION BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
+ * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
+ * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+ * PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/mutex.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_net.h>
+#include <lustre_import.h>
+#include <lustre_sec.h>
+
+#include "gss_err.h"
+#include "gss_internal.h"
+#include "gss_api.h"
+#include "gss_krb5.h"
+#include "gss_asn1.h"
+
+
+/* TWRITE_STR from gssapiP_generic.h */
+#define TWRITE_STR(ptr, str, len) \
+	memcpy((ptr), (char *) (str), (len)); \
+	(ptr) += (len);
+
+/* XXXX this code currently makes the assumption that a mech oid will
+   never be longer than 127 bytes.  This assumption is not inherent in
+   the interfaces, so the code can be fixed if the OSI namespace
+   balloons unexpectedly. */
+
+/* Each token looks like this:
+
+0x60				tag for APPLICATION 0, SEQUENCE
+					(constructed, definite-length)
+	<length>		possible multiple bytes, need to parse/generate
+	0x06			tag for OBJECT IDENTIFIER
+		<moid_length>	compile-time constant string (assume 1 byte)
+		<moid_bytes>	compile-time constant string
+	<inner_bytes>		the ANY containing the application token
+					bytes 0,1 are the token type
+					bytes 2,n are the token data
+
+For the purposes of this abstraction, the token "header" consists of
+the sequence tag and length octets, the mech OID DER encoding, and the
+first two inner bytes, which indicate the token type.  The token
+"body" consists of everything else.
+
+*/
+
+/*
+ * Number of bytes needed to DER-encode "length": one byte for the short
+ * form (< 128), otherwise one prefix byte plus as many bytes as the
+ * value needs.
+ */
+static
+int der_length_size(int length)
+{
+	if (length < (1 << 7))
+		return 1;
+	if (length < (1 << 8))
+		return 2;
+#if (SIZEOF_INT == 2)
+	return 3;
+#else
+	if (length < (1 << 16))
+		return 3;
+	if (length < (1 << 24))
+		return 4;
+	return 5;
+#endif
+}
+
+/*
+ * DER-encode "length" at *buf and advance *buf past the bytes written.
+ * Values below 128 take a single byte; larger values are written as a
+ * prefix byte (der_length_size(length) + 127) followed by the value
+ * bytes, most significant first.
+ */
+static
+void der_write_length(unsigned char **buf, int length)
+{
+	unsigned char *out = *buf;
+
+	/* short form */
+	if (length < (1 << 7)) {
+		*out++ = (unsigned char) length;
+		*buf = out;
+		return;
+	}
+
+	/* long form: prefix byte, then only the bytes that are needed */
+	*out++ = (unsigned char) (der_length_size(length) + 127);
+#if (SIZEOF_INT > 2)
+	if (length >= (1 << 24))
+		*out++ = (unsigned char) (length >> 24);
+	if (length >= (1 << 16))
+		*out++ = (unsigned char) ((length >> 16) & 0xff);
+#endif
+	if (length >= (1 << 8))
+		*out++ = (unsigned char) ((length >> 8) & 0xff);
+	*out++ = (unsigned char) (length & 0xff);
+
+	*buf = out;
+}
+
+/*
+ * Decode a DER length field found at *buf.
+ *
+ * Returns the decoded length, or < 0 on failure.  *buf is advanced and
+ * *bufsize decremented for every byte consumed; note that the first
+ * byte stays consumed even when a long-form length is then rejected.
+ */
+static
+int der_read_length(unsigned char **buf, int *bufsize)
+{
+	unsigned char first;
+	unsigned char nbytes;
+	int value = 0;
+
+	if (*bufsize < 1)
+		return -1;
+
+	first = *(*buf)++;
+	(*bufsize)--;
+
+	/* short form: the byte itself is the length */
+	if (!(first & 0x80))
+		return first;
+
+	/* long form: the low 7 bits count the length bytes that follow */
+	nbytes = first & 0x7f;
+	if (nbytes > ((*bufsize) - 1))
+		return -1;
+	if (nbytes > SIZEOF_INT)
+		return -1;
+
+	while (nbytes) {
+		value = (value << 8) + (*(*buf)++);
+		(*bufsize)--;
+		nbytes--;
+	}
+
+	return value;
+}
+
+/*
+ * Total size of a token for mechanism oid "mech" carrying "body_size"
+ * bytes of body: the sequence tag, the DER length field, and the
+ * sequence contents (4 header bytes + oid + body).
+ */
+int g_token_size(rawobj_t *mech, unsigned int body_size)
+{
+	/* size of the sequence contents -- NEED overflow check */
+	unsigned int seq_size = body_size + 4 + (int) mech->len;
+
+	return (1 + der_length_size(seq_size) + seq_size);
+}
+
+/*
+ * Write the token header at *buf: the 0x60 sequence tag, the DER length
+ * of the sequence contents, the 0x06 oid tag, the one-byte oid length
+ * and the oid bytes.  The caller guarantees the buffer is large enough.
+ * On return *buf points just past the header.
+ */
+void g_make_token_header(rawobj_t *mech, int body_size, unsigned char **buf)
+{
+	unsigned char *out = *buf;
+
+	*out++ = 0x60;
+	der_write_length(&out, 4 + mech->len + body_size);
+	*out++ = 0x06;
+	*out++ = (unsigned char) mech->len;
+
+	/* mechanism oid bytes */
+	memcpy(out, (char *) mech->data, ((int) mech->len));
+	out += ((int) mech->len);
+
+	*buf = out;
+}
+
+/*
+ * Parse and verify the header of the token at *buf_in, "toksize" bytes
+ * long, against the expected mechanism oid "mech".
+ *
+ * On success returns 0, stores the position reached after the oid in
+ * *buf_in and the number of remaining bytes in *body_size.  Returns
+ * G_BAD_TOK_HEADER if the header is malformed or truncated, and
+ * G_WRONG_MECH if the header is well formed but names another
+ * mechanism.  *buf_in and *body_size are left untouched on error.
+ */
+__u32 g_verify_token_header(rawobj_t *mech, int *body_size,
+			    unsigned char **buf_in, int toksize)
+{
+	unsigned char *cur = *buf_in;
+	rawobj_t tok_oid;
+	int seqsize;
+	int wrong_mech = 0;
+
+	/* sequence tag */
+	toksize -= 1;
+	if (toksize < 0)
+		return (G_BAD_TOK_HEADER);
+	if (*cur++ != 0x60)
+		return (G_BAD_TOK_HEADER);
+
+	/* sequence length must cover exactly the rest of the token */
+	seqsize = der_read_length(&cur, &toksize);
+	if (seqsize < 0)
+		return(G_BAD_TOK_HEADER);
+	if (seqsize != toksize)
+		return (G_BAD_TOK_HEADER);
+
+	/* oid tag */
+	toksize -= 1;
+	if (toksize < 0)
+		return (G_BAD_TOK_HEADER);
+	if (*cur++ != 0x06)
+		return (G_BAD_TOK_HEADER);
+
+	/* oid length (single byte) and oid bytes */
+	toksize -= 1;
+	if (toksize < 0)
+		return (G_BAD_TOK_HEADER);
+	tok_oid.len = *cur++;
+
+	toksize -= tok_oid.len;
+	if (toksize < 0)
+		return (G_BAD_TOK_HEADER);
+	tok_oid.data = cur;
+	cur += tok_oid.len;
+
+	if (!g_OID_equal(&tok_oid, mech))
+		wrong_mech = G_WRONG_MECH;
+
+	/* G_WRONG_MECH is not returned immediately because it's more
+	 * important to return G_BAD_TOK_HEADER if the token header is
+	 * in fact bad
+	 */
+	toksize -= 2;
+	if (toksize < 0)
+		return (G_BAD_TOK_HEADER);
+
+	if (wrong_mech)
+		return (wrong_mech);
+
+	*buf_in = cur;
+	*body_size = toksize;
+	return (0);
+}
+
+/*
+ * Extract the mechanism oid from the token held in "in_buf".
+ *
+ * On success returns 0 with mech->len set and mech->data pointing to a
+ * freshly allocated copy of the oid bytes.  Returns G_BAD_TOK_HEADER if
+ * the token header is malformed or truncated, and G_BUFFER_ALLOC if the
+ * copy can't be allocated.  mech->len may already have been overwritten
+ * when an error is returned.
+ */
+__u32 g_get_mech_oid(rawobj_t *mech, rawobj_t *in_buf)
+{
+	unsigned char *cur = in_buf->data;
+	int remain = in_buf->len;
+
+	/* sequence tag */
+	remain -= 1;
+	if (remain < 0)
+		return (G_BAD_TOK_HEADER);
+	if (*cur++ != 0x60)
+		return (G_BAD_TOK_HEADER);
+
+	/* sequence length: only needs to be decodable here */
+	if (der_read_length(&cur, &remain) < 0)
+		return (G_BAD_TOK_HEADER);
+
+	/* oid tag */
+	remain -= 1;
+	if (remain < 0)
+		return (G_BAD_TOK_HEADER);
+	if (*cur++ != 0x06)
+		return (G_BAD_TOK_HEADER);
+
+	/* oid length (single byte) */
+	remain -= 1;
+	if (remain < 0)
+		return (G_BAD_TOK_HEADER);
+	mech->len = *cur++;
+
+	/* the oid bytes must all be present before they are copied */
+	remain -= mech->len;
+	if (remain < 0)
+		return (G_BAD_TOK_HEADER);
+	OBD_ALLOC_LARGE(mech->data, mech->len);
+	if (!mech->data)
+		return (G_BUFFER_ALLOC);
+	memcpy(mech->data, cur, mech->len);
+
+	return 0;
+}

diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/gss_internal.h b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_internal.h
new file mode 100644
index 0000000..cbfc47c
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_internal.h

@@ -0,0 +1,526 @@
+/*
+ * Modified from NFSv4 project for Lustre
+ *
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+#ifndef __PTLRPC_GSS_GSS_INTERNAL_H_
+#define __PTLRPC_GSS_GSS_INTERNAL_H_
+
+#include <lustre_sec.h>
+
+/*
+ * rawobj stuff
+ */
+/* object whose payload bytes follow the length field inline */
+typedef struct netobj_s {
+	__u32	   len;		/* presumably the byte count of data[] - confirm */
+	__u8	    data[0];	/* zero-length trailing array, payload starts here */
+} netobj_t;
+
+/* compound literal for a netobj_t whose len is 0 */
+#define NETOBJ_EMPTY    ((netobj_t) { 0 })
+
+/* length plus pointer to an out-of-line byte buffer */
+typedef struct rawobj_s {
+	__u32	   len;		/* presumably the byte count behind data - confirm */
+	__u8	   *data;	/* separately stored bytes, NULL in RAWOBJ_EMPTY */
+} rawobj_t;
+
+/* compound literal for a rawobj_t with len 0 and a NULL data pointer */
+#define RAWOBJ_EMPTY    ((rawobj_t) { 0, NULL })
+
+/*
+ * buffer descriptor carrying an offset/length pair next to the buffer size.
+ * NOTE(review): field meanings below are inferred from the names only,
+ * no user of this type is visible here - confirm against the callers.
+ */
+typedef struct rawobj_buf_s {
+	__u32	   dataoff;	/* presumably offset of the data inside buf */
+	__u32	   datalen;	/* presumably length of the data */
+	__u32	   buflen;	/* presumably total size of buf */
+	__u8	   *buf;
+} rawobj_buf_t;
+
+/*
+ * rawobj_t/netobj_t helpers. Only the prototypes live in this header;
+ * the implementations are elsewhere.
+ */
+int rawobj_empty(rawobj_t *obj);
+int rawobj_alloc(rawobj_t *obj, char *buf, int len);
+void rawobj_free(rawobj_t *obj);
+int rawobj_equal(rawobj_t *a, rawobj_t *b);
+int rawobj_dup(rawobj_t *dest, rawobj_t *src);
+/* the serialize/extract family takes a cursor (*buf) and remaining size */
+int rawobj_serialize(rawobj_t *obj, __u32 **buf, __u32 *buflen);
+int rawobj_extract(rawobj_t *obj, __u32 **buf, __u32 *buflen);
+int rawobj_extract_alloc(rawobj_t *obj, __u32 **buf, __u32 *buflen);
+int rawobj_extract_local(rawobj_t *obj, __u32 **buf, __u32 *buflen);
+int rawobj_extract_local_alloc(rawobj_t *obj, __u32 **buf, __u32 *buflen);
+int rawobj_from_netobj(rawobj_t *rawobj, netobj_t *netobj);
+int rawobj_from_netobj_alloc(rawobj_t *obj, netobj_t *netobj);
+
+int buffer_extract_bytes(const void **buf, __u32 *buflen,
+			 void *res, __u32 reslen);
+
+/*
+ * several timeout values. for the client refresh upcall timeout we use
+ * the default of the pipefs implementation.
+ */
+#define __TIMEOUT_DELTA		 (10)
+
+/*
+ * obd_timeout reduced by __TIMEOUT_DELTA, except that an obd_timeout
+ * below the delta yields the delta itself.
+ * NOTE(review): for obd_timeout in [delta, 2 * delta) the result is smaller
+ * than the delta (0 when they are equal) - confirm this is intended.
+ */
+#define GSS_SECINIT_RPC_TIMEOUT					 \
+	(obd_timeout < __TIMEOUT_DELTA ?				\
+	 __TIMEOUT_DELTA : obd_timeout - __TIMEOUT_DELTA)
+
+#define GSS_SECFINI_RPC_TIMEOUT	 (__TIMEOUT_DELTA)
+#define GSS_SECSVC_UPCALL_TIMEOUT       (GSS_SECINIT_RPC_TIMEOUT)
+
+/*
+ * default gc interval
+ */
+#define GSS_GC_INTERVAL		 (60 * 60) /* 60 minutes */
+
+/*
+ * Pull a context expiry forward by __TIMEOUT_DELTA.
+ *
+ * The expiry is returned untouched for reverse contexts
+ * (PTLRPC_SEC_FL_REVERSE set in sec_flags) and when it lies less than
+ * __TIMEOUT_DELTA ahead of the current time.
+ */
+static inline
+unsigned long gss_round_ctx_expiry(unsigned long expiry,
+				   unsigned long sec_flags)
+{
+	unsigned long rounded = expiry;
+
+	if (!(sec_flags & PTLRPC_SEC_FL_REVERSE) &&
+	    expiry >= get_seconds() + __TIMEOUT_DELTA)
+		rounded = expiry - __TIMEOUT_DELTA;
+
+	return rounded;
+}
+
+/*
+ * Max encryption element in block cipher algorithms.
+ */
+#define GSS_MAX_CIPHER_BLOCK	       (16)
+
+/*
+ * XXX make it visible of kernel and lgssd/lsvcgssd
+ */
+#define GSSD_INTERFACE_VERSION	  (1)
+
+/* NOTE(review): presumably the value carried in gss_header::gh_version */
+#define PTLRPC_GSS_VERSION	      (1)
+
+
+/* procedure codes; NOTE(review): presumably stored in gh_proc - confirm */
+enum ptlrpc_gss_proc {
+	PTLRPC_GSS_PROC_DATA	    = 0,
+	PTLRPC_GSS_PROC_INIT	    = 1,
+	PTLRPC_GSS_PROC_CONTINUE_INIT   = 2,
+	PTLRPC_GSS_PROC_DESTROY	 = 3,
+	PTLRPC_GSS_PROC_ERR	     = 4,
+};
+
+/* target service kinds, returned by import_to_gss_svc() */
+enum ptlrpc_gss_tgt {
+	LUSTRE_GSS_TGT_MGS	      = 0,
+	LUSTRE_GSS_TGT_MDS	      = 1,
+	LUSTRE_GSS_TGT_OSS	      = 2,
+};
+
+/* values are distinct bits, so they can be or-ed together */
+enum ptlrpc_gss_header_flags {
+	LUSTRE_GSS_PACK_BULK	    = 1,
+	LUSTRE_GSS_PACK_USER	    = 2,
+};
+
+/*
+ * Translate the obd type name behind an import into the matching
+ * LUSTRE_GSS_TGT_* value. Any type other than MGC/MDC/OSC hits LBUG().
+ */
+static inline
+__u32 import_to_gss_svc(struct obd_import *imp)
+{
+	const char *type_name = imp->imp_obd->obd_type->typ_name;
+	__u32 tgt = 0;
+
+	if (strcmp(type_name, LUSTRE_MGC_NAME) == 0)
+		tgt = LUSTRE_GSS_TGT_MGS;
+	else if (strcmp(type_name, LUSTRE_MDC_NAME) == 0)
+		tgt = LUSTRE_GSS_TGT_MDS;
+	else if (strcmp(type_name, LUSTRE_OSC_NAME) == 0)
+		tgt = LUSTRE_GSS_TGT_OSS;
+	else
+		LBUG();
+
+	return tgt;
+}
+
+/*
+ * the following 3 headers must have the same size and field offsets:
+ * each one is 2 x __u8, 1 x __u16, 7 x __u32 followed by a netobj_t.
+ * do not reorder or resize fields in one without the others.
+ */
+
+/* request header */
+struct gss_header {
+	__u8		    gh_version;     /* gss version */
+	__u8		    gh_sp;	  /* sec part */
+	__u16		   gh_pad0;
+	__u32		   gh_flags;       /* wrap flags */
+	__u32		   gh_proc;	/* proc */
+	__u32		   gh_seq;	 /* sequence */
+	__u32		   gh_svc;	 /* service */
+	__u32		   gh_pad1;
+	__u32		   gh_pad2;
+	__u32		   gh_pad3;
+	netobj_t		gh_handle;      /* context handle */
+};
+
+/* reply header: gh_major/gh_minor/gh_seqwin take the seq/svc/pad1 slots */
+struct gss_rep_header {
+	__u8		    gh_version;
+	__u8		    gh_sp;
+	__u16		   gh_pad0;
+	__u32		   gh_flags;
+	__u32		   gh_proc;
+	__u32		   gh_major;
+	__u32		   gh_minor;
+	__u32		   gh_seqwin;
+	__u32		   gh_pad2;
+	__u32		   gh_pad3;
+	netobj_t		gh_handle;
+};
+
+/* error header: gh_major/gh_minor take the seq/svc slots */
+struct gss_err_header {
+	__u8		    gh_version;
+	__u8		    gh_sp;
+	__u16		   gh_pad0;
+	__u32		   gh_flags;
+	__u32		   gh_proc;
+	__u32		   gh_major;
+	__u32		   gh_minor;
+	__u32		   gh_pad1;
+	__u32		   gh_pad2;
+	__u32		   gh_pad3;
+	netobj_t		gh_handle;
+};
+
+/*
+ * part of the wire context information sent from the client, which is
+ * saved and used later by the server.
+ */
+struct gss_wire_ctx {
+	__u32		   gw_flags;
+	__u32		   gw_proc;
+	__u32		   gw_seq;
+	__u32		   gw_svc;
+	rawobj_t		gw_handle;
+};
+
+/* handle size in bytes; gss_handle_to_u64() relies on it being 8 */
+#define PTLRPC_GSS_MAX_HANDLE_SIZE      (8)
+/* fixed header plus room for a maximum-size handle */
+#define PTLRPC_GSS_HEADER_SIZE	  (sizeof(struct gss_header) + \
+					 PTLRPC_GSS_MAX_HANDLE_SIZE)
+
+
+/*
+ * Return the 8 handle bytes as a __u64 in host byte order, or (__u64)-1
+ * when the handle length is not PTLRPC_GSS_MAX_HANDLE_SIZE.
+ *
+ * handle->data is a plain byte pointer with no alignment guarantee, so
+ * the bytes are copied out instead of dereferencing it as a __u64 pointer.
+ */
+static inline __u64 gss_handle_to_u64(rawobj_t *handle)
+{
+	__u64 val;
+
+	if (handle->len != PTLRPC_GSS_MAX_HANDLE_SIZE)
+		return -1;
+
+	memcpy(&val, handle->data, sizeof(val));
+	return val;
+}
+
+/* sequence window sizes, in sequence numbers (one bit each in the bitmaps) */
+#define GSS_SEQ_WIN		     (2048)
+#define GSS_SEQ_WIN_MAIN		GSS_SEQ_WIN
+#define GSS_SEQ_WIN_BACK		(128)
+/* three quarters of the main window */
+#define GSS_SEQ_REPACK_THRESHOLD	(GSS_SEQ_WIN_MAIN / 2 + \
+					 GSS_SEQ_WIN_MAIN / 4)
+
+/* per-context sequence tracking state */
+struct gss_svc_seq_data {
+	spinlock_t		ssd_lock;
+	/*
+	 * highest sequence number seen so far, for main and back window
+	 */
+	__u32		   ssd_max_main;
+	__u32		   ssd_max_back;
+	/*
+	 * main and back window
+	 * for i such that ssd_max - GSS_SEQ_WIN < i <= ssd_max, the i-th bit
+	 * of ssd_win is nonzero iff sequence number i has been seen already.
+	 */
+	unsigned long	   ssd_win_main[GSS_SEQ_WIN_MAIN/BITS_PER_LONG];
+	unsigned long	   ssd_win_back[GSS_SEQ_WIN_BACK/BITS_PER_LONG];
+};
+
+/* server side gss context */
+struct gss_svc_ctx {
+	struct gss_ctx	 *gsc_mechctx;
+	struct gss_svc_seq_data gsc_seqdata;
+	/* NOTE(review): "rvs" presumably stands for reverse - confirm */
+	rawobj_t		gsc_rvs_hdl;
+	__u32		   gsc_rvs_seq;
+	uid_t		   gsc_uid;
+	gid_t		   gsc_gid;
+	uid_t		   gsc_mapped_uid;
+	/* one-bit flags */
+	unsigned int	    gsc_usr_root:1,
+				gsc_usr_mds:1,
+				gsc_usr_oss:1,
+				gsc_remote:1,
+				gsc_reverse:1;
+};
+
+/*
+ * per-request server context. src_base is the embedded generic part that
+ * gss_svc_ctx2reqctx() maps back to this structure via container_of().
+ */
+struct gss_svc_reqctx {
+	struct ptlrpc_svc_ctx	   src_base;
+	/*
+	 * context
+	 */
+	struct gss_wire_ctx	     src_wirectx;
+	struct gss_svc_ctx	     *src_ctx;
+	/*
+	 * record place of bulk_sec_desc in request/reply buffer
+	 */
+	struct ptlrpc_bulk_sec_desc    *src_reqbsd;
+	int			     src_reqbsd_size;
+	struct ptlrpc_bulk_sec_desc    *src_repbsd;
+	int			     src_repbsd_size;
+	/*
+	 * flags
+	 */
+	unsigned int		    src_init:1,
+					src_init_continue:1,
+					src_err_notify:1;
+	int			     src_reserve_len;
+};
+
+/*
+ * client side gss context. gc_base is the embedded generic context that
+ * ctx2gctx() maps back to this structure.
+ */
+struct gss_cli_ctx {
+	struct ptlrpc_cli_ctx   gc_base;
+	__u32		   gc_flavor;
+	__u32		   gc_proc;
+	__u32		   gc_win;
+	atomic_t	    gc_seq;
+	rawobj_t		gc_handle;
+	struct gss_ctx	 *gc_mechctx;
+	/* handle for the buddy svc ctx */
+	rawobj_t		gc_svc_handle;
+};
+
+/* keyring flavour of the client context: adds the bound key and a timer */
+struct gss_cli_ctx_keyring {
+	struct gss_cli_ctx      gck_base;
+	struct key	     *gck_key;		/* bound key, NULL when unbound */
+	struct timer_list      *gck_timer;	/* separately allocated timer */
+};
+
+/* common gss sec; gs_base is what sec2gsec() maps back from */
+struct gss_sec {
+	struct ptlrpc_sec	gs_base;
+	struct gss_api_mech	*gs_mech;
+	spinlock_t		gs_lock;
+	__u64			gs_rvs_hdl;
+};
+
+/* pipefs flavour: common part followed by an inline hash table */
+struct gss_sec_pipefs {
+	struct gss_sec	  gsp_base;
+	int		     gsp_chash_size;  /* must be 2^n */
+	struct hlist_head	gsp_chash[0];
+};
+
+/*
+ * FIXME cleanup the keyring upcall mutexes
+ */
+#define HAVE_KEYRING_UPCALL_SERIALIZED  1
+
+/* keyring flavour of the sec */
+struct gss_sec_keyring {
+	struct gss_sec	  gsk_base;
+	/*
+	 * all contexts listed here. access is protected by sec spinlock.
+	 */
+	struct hlist_head	gsk_clist;
+	/*
+	 * specially point to root ctx (only one at a time). access is
+	 * protected by sec spinlock.
+	 */
+	struct ptlrpc_cli_ctx  *gsk_root_ctx;
+	/*
+	 * specially serialize upcalls for root context.
+	 */
+	struct mutex			gsk_root_uc_lock;
+
+#ifdef HAVE_KEYRING_UPCALL_SERIALIZED
+	struct mutex		gsk_uc_lock;	/* serialize upcalls */
+#endif
+};
+
+/* map an embedded ptlrpc_cli_ctx (gc_base) back to its gss_cli_ctx */
+static inline struct gss_cli_ctx *ctx2gctx(struct ptlrpc_cli_ctx *ctx)
+{
+	struct gss_cli_ctx *gctx;
+
+	gctx = container_of(ctx, struct gss_cli_ctx, gc_base);
+	return gctx;
+}
+
+/* map a ptlrpc_cli_ctx back to the enclosing gss_cli_ctx_keyring */
+static inline
+struct gss_cli_ctx_keyring *ctx2gctx_keyring(struct ptlrpc_cli_ctx *ctx)
+{
+	struct gss_cli_ctx *gctx = ctx2gctx(ctx);
+
+	return container_of(gctx, struct gss_cli_ctx_keyring, gck_base);
+}
+
+/* map an embedded ptlrpc_sec (gs_base) back to its gss_sec */
+static inline struct gss_sec *sec2gsec(struct ptlrpc_sec *sec)
+{
+	struct gss_sec *gsec;
+
+	gsec = container_of(sec, struct gss_sec, gs_base);
+	return gsec;
+}
+
+/* map a ptlrpc_sec back to the enclosing gss_sec_pipefs */
+static inline struct gss_sec_pipefs *sec2gsec_pipefs(struct ptlrpc_sec *sec)
+{
+	struct gss_sec *gsec = sec2gsec(sec);
+
+	return container_of(gsec, struct gss_sec_pipefs, gsp_base);
+}
+
+/* map a ptlrpc_sec back to the enclosing gss_sec_keyring */
+static inline struct gss_sec_keyring *sec2gsec_keyring(struct ptlrpc_sec *sec)
+{
+	struct gss_sec *gsec = sec2gsec(sec);
+
+	return container_of(gsec, struct gss_sec_keyring, gsk_base);
+}
+
+
+#define GSS_CTX_INIT_MAX_LEN	    (1024)
+
+/*
+ * This is only guaranteed to be enough for the current krb5 des-cbc-crc.
+ * We might adjust this when a new enc type or mech is added.
+ */
+#define GSS_PRIVBUF_PREFIX_LEN	 (32)
+#define GSS_PRIVBUF_SUFFIX_LEN	 (32)
+
+/* map an embedded ptlrpc_svc_ctx (src_base) back to its gss_svc_reqctx */
+static inline
+struct gss_svc_reqctx *gss_svc_ctx2reqctx(struct ptlrpc_svc_ctx *ctx)
+{
+	struct gss_svc_reqctx *grctx;
+
+	LASSERT(ctx);
+	grctx = container_of(ctx, struct gss_svc_reqctx, src_base);
+
+	return grctx;
+}
+
+/* fetch the gss_svc_ctx referenced by the request context around ctx */
+static inline
+struct gss_svc_ctx *gss_svc_ctx2gssctx(struct ptlrpc_svc_ctx *ctx)
+{
+	struct gss_svc_reqctx *grctx;
+
+	LASSERT(ctx);
+	grctx = gss_svc_ctx2reqctx(ctx);
+
+	return grctx->src_ctx;
+}
+
+/* sec_gss.c */
+
+/* client context operations */
+int gss_cli_ctx_match(struct ptlrpc_cli_ctx *ctx, struct vfs_cred *vcred);
+int gss_cli_ctx_display(struct ptlrpc_cli_ctx *ctx, char *buf, int bufsize);
+int gss_cli_ctx_sign(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req);
+int gss_cli_ctx_verify(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req);
+int gss_cli_ctx_seal(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req);
+int gss_cli_ctx_unseal(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req);
+
+/* request/reply buffer management */
+int  gss_sec_install_rctx(struct obd_import *imp, struct ptlrpc_sec *sec,
+			  struct ptlrpc_cli_ctx *ctx);
+int  gss_alloc_reqbuf(struct ptlrpc_sec *sec, struct ptlrpc_request *req,
+		      int msgsize);
+void gss_free_reqbuf(struct ptlrpc_sec *sec, struct ptlrpc_request *req);
+int  gss_alloc_repbuf(struct ptlrpc_sec *sec, struct ptlrpc_request *req,
+		      int msgsize);
+void gss_free_repbuf(struct ptlrpc_sec *sec, struct ptlrpc_request *req);
+int  gss_enlarge_reqbuf(struct ptlrpc_sec *sec, struct ptlrpc_request *req,
+			int segment, int newsize);
+
+/* server side entry points */
+int  gss_svc_accept(struct ptlrpc_sec_policy *policy,
+		    struct ptlrpc_request *req);
+void gss_svc_invalidate_ctx(struct ptlrpc_svc_ctx *svc_ctx);
+int  gss_svc_alloc_rs(struct ptlrpc_request *req, int msglen);
+int  gss_svc_authorize(struct ptlrpc_request *req);
+void gss_svc_free_rs(struct ptlrpc_reply_state *rs);
+void gss_svc_free_ctx(struct ptlrpc_svc_ctx *ctx);
+
+int cli_ctx_expire(struct ptlrpc_cli_ctx *ctx);
+int cli_ctx_check_death(struct ptlrpc_cli_ctx *ctx);
+
+int gss_copy_rvc_cli_ctx(struct ptlrpc_cli_ctx *cli_ctx,
+			 struct ptlrpc_svc_ctx *svc_ctx);
+
+struct gss_header *gss_swab_header(struct lustre_msg *msg, int segment,
+				   int swabbed);
+netobj_t *gss_swab_netobj(struct lustre_msg *msg, int segment);
+
+void gss_cli_ctx_uptodate(struct gss_cli_ctx *gctx);
+int gss_pack_err_notify(struct ptlrpc_request *req, __u32 major, __u32 minor);
+int gss_check_seq_num(struct gss_svc_seq_data *sd, __u32 seq_num, int set);
+
+/* pieces shared by the sec flavours (keyring, pipefs) */
+int gss_sec_create_common(struct gss_sec *gsec,
+			  struct ptlrpc_sec_policy *policy,
+			  struct obd_import *imp,
+			  struct ptlrpc_svc_ctx *ctx,
+			  struct sptlrpc_flavor *sf);
+void gss_sec_destroy_common(struct gss_sec *gsec);
+void gss_sec_kill(struct ptlrpc_sec *sec);
+
+int gss_cli_ctx_init_common(struct ptlrpc_sec *sec,
+			    struct ptlrpc_cli_ctx *ctx,
+			    struct ptlrpc_ctx_ops *ctxops,
+			    struct vfs_cred *vcred);
+int gss_cli_ctx_fini_common(struct ptlrpc_sec *sec,
+			    struct ptlrpc_cli_ctx *ctx);
+
+void gss_cli_ctx_flags2str(unsigned long flags, char *buf, int bufsize);
+
+/* gss_keyring.c */
+int  __init gss_init_keyring(void);
+void __exit gss_exit_keyring(void);
+
+/* gss_pipefs.c */
+int  __init gss_init_pipefs(void);
+void __exit gss_exit_pipefs(void);
+
+/* gss_bulk.c: bulk descriptor handling, client side then server side */
+int gss_cli_prep_bulk(struct ptlrpc_request *req,
+		      struct ptlrpc_bulk_desc *desc);
+int gss_cli_ctx_wrap_bulk(struct ptlrpc_cli_ctx *ctx,
+			  struct ptlrpc_request *req,
+			  struct ptlrpc_bulk_desc *desc);
+int gss_cli_ctx_unwrap_bulk(struct ptlrpc_cli_ctx *ctx,
+			    struct ptlrpc_request *req,
+			    struct ptlrpc_bulk_desc *desc);
+int gss_svc_prep_bulk(struct ptlrpc_request *req,
+		      struct ptlrpc_bulk_desc *desc);
+int gss_svc_unwrap_bulk(struct ptlrpc_request *req,
+			struct ptlrpc_bulk_desc *desc);
+int gss_svc_wrap_bulk(struct ptlrpc_request *req,
+		      struct ptlrpc_bulk_desc *desc);
+
+/*
+ * gss_mech_switch.c
+ * NOTE(review): the same two functions are declared a second time near the
+ * end of this header under "gss_krb5_mech.c", there with __init/__exit -
+ * one of the two declarations is presumably redundant.
+ */
+int init_kerberos_module(void);
+void cleanup_kerberos_module(void);
+
+/* gss_generic_token.c */
+int g_token_size(rawobj_t *mech, unsigned int body_size);
+void g_make_token_header(rawobj_t *mech, int body_size, unsigned char **buf);
+__u32 g_verify_token_header(rawobj_t *mech, int *body_size,
+			    unsigned char **buf_in, int toksize);
+
+
+/* gss_cli_upcall.c */
+int gss_do_ctx_init_rpc(char *buffer, unsigned long count);
+int gss_do_ctx_fini_rpc(struct gss_cli_ctx *gctx);
+
+int  __init gss_init_cli_upcall(void);
+void __exit gss_exit_cli_upcall(void);
+
+/* gss_svc_upcall.c */
+__u64 gss_get_next_ctx_index(void);
+int gss_svc_upcall_install_rvs_ctx(struct obd_import *imp,
+				   struct gss_sec *gsec,
+				   struct gss_cli_ctx *gctx);
+int gss_svc_upcall_expire_rvs_ctx(rawobj_t *handle);
+int gss_svc_upcall_dup_handle(rawobj_t *handle, struct gss_svc_ctx *ctx);
+int gss_svc_upcall_update_sequence(rawobj_t *handle, __u32 seq);
+int gss_svc_upcall_handle_init(struct ptlrpc_request *req,
+			       struct gss_svc_reqctx *grctx,
+			       struct gss_wire_ctx *gw,
+			       struct obd_device *target,
+			       __u32 lustre_svc,
+			       rawobj_t *rvs_hdl,
+			       rawobj_t *in_token);
+struct gss_svc_ctx *gss_svc_upcall_get_ctx(struct ptlrpc_request *req,
+					   struct gss_wire_ctx *gw);
+void gss_svc_upcall_put_ctx(struct gss_svc_ctx *ctx);
+void gss_svc_upcall_destroy_ctx(struct gss_svc_ctx *ctx);
+
+int  __init gss_init_svc_upcall(void);
+void __exit gss_exit_svc_upcall(void);
+
+/* lproc_gss.c */
+void gss_stat_oos_record_cli(int behind);
+void gss_stat_oos_record_svc(int phase, int replay);
+
+int  __init gss_init_lproc(void);
+void __exit gss_exit_lproc(void);
+
+/*
+ * gss_krb5_mech.c
+ * NOTE(review): also declared, without __init/__exit, earlier in this
+ * header under "gss_mech_switch.c".
+ */
+int __init init_kerberos_module(void);
+void __exit cleanup_kerberos_module(void);
+
+
+/*
+ * debug helper: print size bytes starting at ptr to the console as one
+ * lowercase hex string, labelled with name. Needs a temporary buffer of
+ * 2 * size + 1 bytes; if that cannot be allocated only an error is logged.
+ */
+static inline
+void __dbg_memdump(char *name, void *ptr, int size)
+{
+	int hexlen = size + size;
+	int bufsize = hexlen + 1;
+	char *bytes = (char *) ptr;
+	char *buf;
+	int idx;
+
+	OBD_ALLOC(buf, bufsize);
+	if (!buf) {
+		CDEBUG(D_ERROR, "DUMP ERROR: can't alloc %d bytes\n", bufsize);
+		return;
+	}
+
+	/* two hex digits per input byte */
+	for (idx = 0; idx < size; idx++)
+		sprintf(buf + 2 * idx, "%02x", (__u8) bytes[idx]);
+	buf[hexlen] = '\0';
+
+	LCONSOLE_INFO("DUMP %s@%p(%d): %s\n", name, ptr, size, buf);
+	OBD_FREE(buf, bufsize);
+}
+
+#endif /* __PTLRPC_GSS_GSS_INTERNAL_H_ */

diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/gss_keyring.c b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_keyring.c
new file mode 100644
index 0000000..bb571ae
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_keyring.c

@@ -0,0 +1,1424 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/gss/gss_keyring.c
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/dcache.h>
+#include <linux/fs.h>
+#include <linux/crypto.h>
+#include <linux/key.h>
+#include <linux/keyctl.h>
+#include <linux/key-type.h>
+#include <linux/mutex.h>
+#include <asm/atomic.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_sec.h>
+#include <lustre_net.h>
+#include <lustre_import.h>
+
+#include "gss_err.h"
+#include "gss_internal.h"
+#include "gss_api.h"
+
+/*
+ * file-scope objects declared ahead of their use: gss_policy_keyring and
+ * gss_keyring_ctxops are referenced by gss_sec_create_kr() and
+ * ctx_create_kr() below.
+ */
+static struct ptlrpc_sec_policy gss_policy_keyring;
+static struct ptlrpc_ctx_ops gss_keyring_ctxops;
+static struct key_type gss_key_type;
+
+/* forward declaration, called from gss_sec_create_kr() */
+static int sec_install_rctx_kr(struct ptlrpc_sec *sec,
+			       struct ptlrpc_svc_ctx *svc_ctx);
+
+/*
+ * the timeout is only for the case that the upcall child process dies
+ * abnormally. in any other case it should finally update the kernel key.
+ * the value is twice obd_timeout; ctx_create_kr() adds it to the current
+ * time in seconds.
+ *
+ * FIXME we'd better incorporate the client & server side upcall timeouts
+ * into the framework of Adaptive Timeouts, but we need to figure out how to
+ * make sure that the kernel knows whether the upcall process is in progress
+ * or died unexpectedly.
+ */
+#define KEYRING_UPCALL_TIMEOUT  (obd_timeout + obd_timeout)
+
+/****************************************
+ * internal helpers		     *
+ ****************************************/
+
+/*
+ * debug aid: CWARN the identity of task tsk and of its parent together
+ * with the serial numbers of the keyrings reachable from its credentials
+ * (0 is printed for a missing keyring) and its jit_keyring setting.
+ * expands to a bare { } block and evaluates tsk many times, so pass a
+ * plain variable. relies on key_cred()/key_tgcred() defined further down.
+ */
+#define DUMP_PROCESS_KEYRINGS(tsk)					\
+{									\
+	CWARN("DUMP PK: %s[%u,%u/%u](<-%s[%u,%u/%u]): "			\
+	      "a %d, t %d, p %d, s %d, u %d, us %d, df %d\n",		\
+	      tsk->comm, tsk->pid, tsk->uid, tsk->fsuid,		\
+	      tsk->parent->comm, tsk->parent->pid,			\
+	      tsk->parent->uid, tsk->parent->fsuid,			\
+	      tsk->request_key_auth ?					\
+	      tsk->request_key_auth->serial : 0,			\
+	      key_cred(tsk)->thread_keyring ?				\
+	      key_cred(tsk)->thread_keyring->serial : 0,		\
+	      key_tgcred(tsk)->process_keyring ?			\
+	      key_tgcred(tsk)->process_keyring->serial : 0,		\
+	      key_tgcred(tsk)->session_keyring ?			\
+	      key_tgcred(tsk)->session_keyring->serial : 0,		\
+	      key_cred(tsk)->user->uid_keyring ?			\
+	      key_cred(tsk)->user->uid_keyring->serial : 0,		\
+	      key_cred(tsk)->user->session_keyring ?			\
+	      key_cred(tsk)->user->session_keyring->serial : 0,		\
+	      key_cred(tsk)->jit_keyring				\
+	     );								\
+}
+
+/*
+ * debug aid: CWARN address, serial, usage count, uid/gid and description
+ * ("n/a" when there is none) of a key. expands to a bare { } block and
+ * evaluates key several times.
+ */
+#define DUMP_KEY(key)						   \
+{								       \
+	CWARN("DUMP KEY: %p(%d) ref %d u%u/g%u desc %s\n",	      \
+	      key, key->serial, atomic_read(&key->usage),	       \
+	      key->uid, key->gid,				       \
+	      key->description ? key->description : "n/a"	       \
+	     );							 \
+}
+
+/* shorthands for a task's credentials and the tgcred hanging off them */
+#define key_cred(tsk)   ((tsk)->cred)
+#define key_tgcred(tsk) ((tsk)->cred->tgcred)
+
+/*
+ * take the mutex that serializes upcalls of this sec; compiles to nothing
+ * unless HAVE_KEYRING_UPCALL_SERIALIZED is defined.
+ */
+static inline void keyring_upcall_lock(struct gss_sec_keyring *gsec_kr)
+{
+#ifdef HAVE_KEYRING_UPCALL_SERIALIZED
+	struct mutex *uc_lock = &gsec_kr->gsk_uc_lock;
+
+	mutex_lock(uc_lock);
+#endif
+}
+
+/*
+ * release the mutex taken by keyring_upcall_lock(); compiles to nothing
+ * unless HAVE_KEYRING_UPCALL_SERIALIZED is defined.
+ */
+static inline void keyring_upcall_unlock(struct gss_sec_keyring *gsec_kr)
+{
+#ifdef HAVE_KEYRING_UPCALL_SERIALIZED
+	struct mutex *uc_lock = &gsec_kr->gsk_uc_lock;
+
+	mutex_unlock(uc_lock);
+#endif
+}
+
+/* flag a key as revoked by setting KEY_FLAG_REVOKED; takes no lock itself */
+static inline void key_revoke_locked(struct key *key)
+{
+	unsigned long *key_flags = &key->flags;
+
+	set_bit(KEY_FLAG_REVOKED, key_flags);
+}
+
+/*
+ * timer callback armed by ctx_start_timer_kr(): data is the ctx pointer.
+ * expires the context and revokes the key bound to it.
+ */
+static void ctx_upcall_timeout_kr(unsigned long data)
+{
+	struct ptlrpc_cli_ctx      *ctx;
+	struct gss_cli_ctx_keyring *gctx_kr;
+	struct key                 *key;
+
+	ctx = (struct ptlrpc_cli_ctx *) data;
+	gctx_kr = ctx2gctx_keyring(ctx);
+	key = gctx_kr->gck_key;
+
+	CWARN("ctx %p, key %p\n", ctx, key);
+
+	LASSERT(key);
+
+	cli_ctx_expire(ctx);
+	key_revoke_locked(key);
+}
+
+/*
+ * arm the context's timer to fire ctx_upcall_timeout_kr() after timeout
+ * seconds. the timer must have been allocated already.
+ */
+static
+void ctx_start_timer_kr(struct ptlrpc_cli_ctx *ctx, long timeout)
+{
+	struct timer_list *timer = ctx2gctx_keyring(ctx)->gck_timer;
+	unsigned long      expires;
+
+	LASSERT(timer);
+
+	CDEBUG(D_SEC, "ctx %p: start timer %lds\n", ctx, timeout);
+	/* seconds -> absolute expiry in jiffies */
+	expires = timeout * HZ + cfs_time_current();
+
+	init_timer(timer);
+	timer->function = ctx_upcall_timeout_kr;
+	timer->data = (unsigned long) ctx;
+	timer->expires = expires;
+
+	add_timer(timer);
+}
+
+/*
+ * stop and free the context's timer, if it still has one.
+ * the caller must make sure there is no race with other threads.
+ */
+static
+void ctx_clear_timer_kr(struct ptlrpc_cli_ctx *ctx)
+{
+	struct gss_cli_ctx_keyring *gctx_kr = ctx2gctx_keyring(ctx);
+	struct timer_list	  *timer;
+
+	timer = gctx_kr->gck_timer;
+	if (!timer)
+		return;
+
+	CDEBUG(D_SEC, "ctx %p, key %p\n", ctx, gctx_kr->gck_key);
+
+	/* detach first, then wait for a running handler before freeing */
+	gctx_kr->gck_timer = NULL;
+	del_singleshot_timer_sync(timer);
+	OBD_FREE_PTR(timer);
+}
+
+/*
+ * allocate a keyring client context together with its timer and run the
+ * common initialization on it.
+ *
+ * returns the embedded ptlrpc_cli_ctx holding one extra reference for the
+ * caller, or NULL if an allocation or the common init failed.
+ */
+static
+struct ptlrpc_cli_ctx *ctx_create_kr(struct ptlrpc_sec *sec,
+				     struct vfs_cred *vcred)
+{
+	struct gss_cli_ctx_keyring *gctx_kr;
+	struct ptlrpc_cli_ctx      *ctx;
+
+	OBD_ALLOC_PTR(gctx_kr);
+	if (!gctx_kr)
+		return NULL;
+
+	OBD_ALLOC_PTR(gctx_kr->gck_timer);
+	if (!gctx_kr->gck_timer)
+		goto out_free_ctx;
+	init_timer(gctx_kr->gck_timer);
+
+	ctx = &gctx_kr->gck_base.gc_base;
+	if (gss_cli_ctx_init_common(sec, ctx, &gss_keyring_ctxops, vcred) != 0)
+		goto out_free_timer;
+
+	ctx->cc_expire = cfs_time_current_sec() + KEYRING_UPCALL_TIMEOUT;
+	clear_bit(PTLRPC_CTX_NEW_BIT, &ctx->cc_flags);
+	atomic_inc(&ctx->cc_refcount); /* for the caller */
+
+	return ctx;
+
+out_free_timer:
+	OBD_FREE_PTR(gctx_kr->gck_timer);
+out_free_ctx:
+	OBD_FREE_PTR(gctx_kr);
+	return NULL;
+}
+
+/*
+ * final teardown of a keyring context that is neither cached nor bound to
+ * a key any more. the memory and the sec's context count/reference are
+ * only released when gss_cli_ctx_fini_common() returns 0.
+ */
+static void ctx_destroy_kr(struct ptlrpc_cli_ctx *ctx)
+{
+	struct gss_cli_ctx_keyring *gctx_kr = ctx2gctx_keyring(ctx);
+	struct ptlrpc_sec	  *sec = ctx->cc_sec;
+
+	CDEBUG(D_SEC, "destroying ctx %p\n", ctx);
+
+	/* at this time the association with key has been broken. */
+	LASSERT(sec);
+	LASSERT(atomic_read(&sec->ps_refcount) > 0);
+	LASSERT(atomic_read(&sec->ps_nctx) > 0);
+	LASSERT(test_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags) == 0);
+	LASSERT(gctx_kr->gck_key == NULL);
+
+	ctx_clear_timer_kr(ctx);
+	LASSERT(gctx_kr->gck_timer == NULL);
+
+	if (gss_cli_ctx_fini_common(sec, ctx) == 0) {
+		OBD_FREE_PTR(gctx_kr);
+
+		atomic_dec(&sec->ps_nctx);
+		sptlrpc_sec_put(sec);
+	}
+}
+
+/*
+ * release a context whose refcount dropped to zero: destroy it right away
+ * when sync is set, otherwise take a reference again and hand the context
+ * to sptlrpc_gc_add_ctx().
+ */
+static void ctx_release_kr(struct ptlrpc_cli_ctx *ctx, int sync)
+{
+	if (!sync) {
+		atomic_inc(&ctx->cc_refcount);
+		sptlrpc_gc_add_ctx(ctx);
+		return;
+	}
+
+	ctx_destroy_kr(ctx);
+}
+
+/* drop one reference; the last one triggers ctx_release_kr() */
+static void ctx_put_kr(struct ptlrpc_cli_ctx *ctx, int sync)
+{
+	LASSERT(atomic_read(&ctx->cc_refcount) > 0);
+
+	if (!atomic_dec_and_test(&ctx->cc_refcount))
+		return;
+
+	ctx_release_kr(ctx, sync);
+}
+
+/*
+ * key <-> ctx association and rules:
+ * - ctx might not bind with any key
+ * - key/ctx binding is protected by key semaphore (if the key present)
+ * - key and ctx each take a reference of the other
+ * - ctx enlist/unlist is protected by ctx spinlock
+ * - never enlist a ctx after it's been unlisted
+ * - whoever do enlist should also do bind, lock key before enlist:
+ *   - lock key -> lock ctx -> enlist -> unlock ctx -> bind -> unlock key
+ * - whoever do unlist should also do unbind:
+ *   - lock key -> lock ctx -> unlist -> unlock ctx -> unbind -> unlock key
+ *   - lock ctx -> unlist -> unlock ctx -> lock key -> unbind -> unlock key
+ */
+
+/* take the spinlock only when condition is non-zero */
+static inline void spin_lock_if(spinlock_t *lock, int condition)
+{
+	if (!condition)
+		return;
+
+	spin_lock(lock);
+}
+
+/* release the spinlock only when condition is non-zero */
+static inline void spin_unlock_if(spinlock_t *lock, int condition)
+{
+	if (!condition)
+		return;
+
+	spin_unlock(lock);
+}
+
+/*
+ * put a context on the sec's context list, taking a reference for the
+ * list and marking the context cached. with is_root set it also becomes
+ * the sec's root context. pass locked != 0 when sec->ps_lock is already
+ * held by the caller.
+ */
+static void ctx_enlist_kr(struct ptlrpc_cli_ctx *ctx, int is_root, int locked)
+{
+	struct ptlrpc_sec      *sec = ctx->cc_sec;
+	struct gss_sec_keyring *gsec_kr = sec2gsec_keyring(sec);
+	const int               need_lock = !locked;
+
+	LASSERT(!test_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags));
+	LASSERT(atomic_read(&ctx->cc_refcount) > 0);
+
+	if (need_lock)
+		spin_lock(&sec->ps_lock);
+
+	atomic_inc(&ctx->cc_refcount);
+	set_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags);
+	hlist_add_head(&ctx->cc_cache, &gsec_kr->gsk_clist);
+	if (is_root)
+		gsec_kr->gsk_root_ctx = ctx;
+
+	if (need_lock)
+		spin_unlock(&sec->ps_lock);
+}
+
+/*
+ * take a context off the sec's context list and drop the list's reference.
+ *
+ * after this returns the caller must not touch ctx again unless it holds
+ * a reference of its own, because the context might have been freed.
+ * pass locked != 0 when sec->ps_lock is already held by the caller.
+ *
+ * returns non-zero if this call did the unlisting, 0 if the cached bit
+ * was already clear.
+ */
+static int ctx_unlist_kr(struct ptlrpc_cli_ctx *ctx, int locked)
+{
+	struct ptlrpc_sec       *sec = ctx->cc_sec;
+	struct gss_sec_keyring  *gsec_kr = sec2gsec_keyring(sec);
+	const int                need_lock = !locked;
+
+	/* cached bit already gone: whoever cleared it finishes the job */
+	if (!test_and_clear_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags))
+		return 0;
+
+	/* drop ref inside spin lock to prevent race with other operations */
+	if (need_lock)
+		spin_lock(&sec->ps_lock);
+
+	if (gsec_kr->gsk_root_ctx == ctx)
+		gsec_kr->gsk_root_ctx = NULL;
+	hlist_del_init(&ctx->cc_cache);
+	atomic_dec(&ctx->cc_refcount);
+
+	if (need_lock)
+		spin_unlock(&sec->ps_lock);
+
+	return 1;
+}
+
+/*
+ * couple a key and a ctx: each takes a reference on the other and stores
+ * a pointer to it. neither may be bound already.
+ * caller must hold the write lock of the key, as well as a ref on both
+ * key and ctx.
+ */
+static void bind_key_ctx(struct key *key, struct ptlrpc_cli_ctx *ctx)
+{
+	struct gss_cli_ctx_keyring *gctx_kr = ctx2gctx_keyring(ctx);
+
+	LASSERT(atomic_read(&ctx->cc_refcount) > 0);
+	LASSERT(atomic_read(&key->usage) > 0);
+	LASSERT(ctx2gctx_keyring(ctx)->gck_key == NULL);
+	LASSERT(key->payload.data == NULL);
+
+	/* at this time context may or may not in list. */
+	key_get(key);
+	atomic_inc(&ctx->cc_refcount);
+
+	gctx_kr->gck_key = key;
+	key->payload.data = ctx;
+}
+
+/*
+ * unbind a key and a ctx: revoke the key, clear the pointers each holds to
+ * the other, stop the ctx timer and drop the two references taken by
+ * bind_key_ctx(). the ctx must already be off the list (cached bit clear).
+ * caller must hold write lock, as well as a ref of the key.
+ */
+static void unbind_key_ctx(struct key *key, struct ptlrpc_cli_ctx *ctx)
+{
+	LASSERT(key->payload.data == ctx);
+	LASSERT(test_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags) == 0);
+
+	/* must revoke the key, or others may treat it as newly created */
+	key_revoke_locked(key);
+
+	key->payload.data = NULL;
+	ctx2gctx_keyring(ctx)->gck_key = NULL;
+
+	/* once ctx get split from key, the timer is meaningless */
+	ctx_clear_timer_kr(ctx);
+
+	/* synchronous put: this may destroy the ctx right here */
+	ctx_put_kr(ctx, 1);
+	key_put(key);
+}
+
+/*
+ * given a ctx, unbind it from its coupled key, if any.
+ * unbind can only happen once, so there is no worry about the key being
+ * released by someone else. a temporary key reference is held across the
+ * unbind so the key outlives the release of its write lock.
+ */
+static void unbind_ctx_kr(struct ptlrpc_cli_ctx *ctx)
+{
+	struct key      *key = ctx2gctx_keyring(ctx)->gck_key;
+
+	if (key == NULL)
+		return;
+
+	LASSERT(key->payload.data == ctx);
+
+	key_get(key);
+	down_write(&key->sem);
+	unbind_key_ctx(key, ctx);
+	up_write(&key->sem);
+	key_put(key);
+}
+
+/*
+ * given a key, unbind it from the ctx stored in its payload, if any.
+ * caller must hold write lock, as well as a ref of the key.
+ */
+static void unbind_key_locked(struct key *key)
+{
+	struct ptlrpc_cli_ctx   *ctx = key->payload.data;
+
+	if (ctx == NULL)
+		return;
+
+	unbind_key_ctx(key, ctx);
+}
+
+/*
+ * unlist a ctx and, only if this call did the unlisting, unbind it from
+ * its coupled key.
+ */
+static void kill_ctx_kr(struct ptlrpc_cli_ctx *ctx)
+{
+	if (!ctx_unlist_kr(ctx, 0))
+		return;
+
+	unbind_ctx_kr(ctx);
+}
+
+/*
+ * given a key, unlist the coupled ctx (if any) and, only if this call did
+ * the unlisting, unbind key and ctx.
+ * caller must hold write lock, as well as a ref of the key.
+ */
+static void kill_key_locked(struct key *key)
+{
+	struct ptlrpc_cli_ctx *ctx = key->payload.data;
+
+	if (ctx == NULL)
+		return;
+
+	if (ctx_unlist_kr(ctx, 0))
+		unbind_key_locked(key);
+}
+
+/*
+ * release every context on freelist: each one is removed from the list,
+ * its waiters are woken, it is unbound from its key and the reference that
+ * the list held is dropped asynchronously.
+ * caller should hold one ref on contexts in freelist.
+ */
+static void dispose_ctx_list_kr(struct hlist_head *freelist)
+{
+	struct hlist_node      *next;
+	struct ptlrpc_cli_ctx  *ctx;
+	struct gss_cli_ctx     *gctx;
+
+	/* _safe variant: the current entry is unlinked inside the loop */
+	hlist_for_each_entry_safe(ctx, next, freelist, cc_cache) {
+		hlist_del_init(&ctx->cc_cache);
+
+		/* reverse ctx: update current seq to buddy svcctx if exist.
+		 * ideally this should be done at gss_cli_ctx_finalize(), but
+		 * the ctx destroy could be delayed by:
+		 *  1) ctx still has reference;
+		 *  2) ctx destroy is asynchronous;
+		 * and reverse import call inval_all_ctx() require this be done
+		 *_immediately_ otherwise newly created reverse ctx might copy
+		 * the very old sequence number from svcctx. */
+		gctx = ctx2gctx(ctx);
+		if (!rawobj_empty(&gctx->gc_svc_handle) &&
+		    sec_is_reverse(gctx->gc_base.cc_sec)) {
+			gss_svc_upcall_update_sequence(&gctx->gc_svc_handle,
+					(__u32) atomic_read(&gctx->gc_seq));
+		}
+
+		/* we need to wakeup waiting reqs here. the context might
+		 * be forced released before upcall finished, then the
+		 * late-arrived downcall can't find the ctx even. */
+		sptlrpc_cli_ctx_wakeup(ctx);
+
+		unbind_ctx_kr(ctx);
+		/* sync == 0: a final release goes through sptlrpc_gc_add_ctx() */
+		ctx_put_kr(ctx, 0);
+	}
+}
+
+/*
+ * lookup a root context directly in a sec, return root ctx with a
+ * reference taken or NULL. the whole lookup runs under sec->ps_lock.
+ */
+static
+struct ptlrpc_cli_ctx * sec_lookup_root_ctx_kr(struct ptlrpc_sec *sec)
+{
+	struct gss_sec_keyring  *gsec_kr = sec2gsec_keyring(sec);
+	struct ptlrpc_cli_ctx   *ctx = NULL;
+
+	spin_lock(&sec->ps_lock);
+
+	ctx = gsec_kr->gsk_root_ctx;
+
+	/* only a reverse sec falls back to scanning the context list */
+	if (ctx == NULL && unlikely(sec_is_reverse(sec))) {
+		struct ptlrpc_cli_ctx  *tmp;
+
+		/* reverse ctx, search root ctx in list, choose the one
+		 * with shortest expire time, which is most possibly have
+		 * an established peer ctx at client side. */
+		hlist_for_each_entry(tmp, &gsec_kr->gsk_clist, cc_cache) {
+			/* a candidate with cc_expire == 0 is always replaced
+			 * by the next entry */
+			if (ctx == NULL || ctx->cc_expire == 0 ||
+			    ctx->cc_expire > tmp->cc_expire) {
+				ctx = tmp;
+				/* promote to be root_ctx */
+				gsec_kr->gsk_root_ctx = ctx;
+			}
+		}
+	}
+
+	if (ctx) {
+		LASSERT(atomic_read(&ctx->cc_refcount) > 0);
+		LASSERT(!hlist_empty(&gsec_kr->gsk_clist));
+		/* reference handed to the caller */
+		atomic_inc(&ctx->cc_refcount);
+	}
+
+	spin_unlock(&sec->ps_lock);
+
+	return ctx;
+}
+
+/* grace period, in seconds, left to the contexts already on the list */
+#define RVS_CTX_EXPIRE_NICE    (10)
+
+/*
+ * install new_ctx into a reverse sec.
+ *
+ * every context already on the list gets its expiry capped at
+ * now + RVS_CTX_EXPIRE_NICE and is flagged cc_early_expire. new_ctx is
+ * then enlisted, becoming the root context only if the sec has none, and
+ * bound to key when one is given. all of it happens under sec->ps_lock.
+ */
+static
+void rvs_sec_install_root_ctx_kr(struct ptlrpc_sec *sec,
+				 struct ptlrpc_cli_ctx *new_ctx,
+				 struct key *key)
+{
+	struct gss_sec_keyring *gsec_kr = sec2gsec_keyring(sec);
+	struct ptlrpc_cli_ctx  *ctx;
+	cfs_time_t	      now;
+	ENTRY;
+
+	LASSERT(sec_is_reverse(sec));
+
+	spin_lock(&sec->ps_lock);
+
+	now = cfs_time_current_sec();
+
+	/* set all existing ctxs short expiry */
+	hlist_for_each_entry(ctx, &gsec_kr->gsk_clist, cc_cache) {
+		if (ctx->cc_expire > now + RVS_CTX_EXPIRE_NICE) {
+			ctx->cc_early_expire = 1;
+			ctx->cc_expire = now + RVS_CTX_EXPIRE_NICE;
+		}
+	}
+
+	/* if there's root_ctx there, instead obsolete the current
+	 * immediately, we leave it continue operating for a little while.
+	 * hopefully when the first backward rpc with newest ctx send out,
+	 * the client side already have the peer ctx well established. */
+	ctx_enlist_kr(new_ctx, gsec_kr->gsk_root_ctx ? 0 : 1, 1);
+
+	if (key)
+		bind_key_ctx(key, new_ctx);
+
+	spin_unlock(&sec->ps_lock);
+
+	/* pair the ENTRY trace above */
+	EXIT;
+}
+
+/*
+ * format the key description "<uid>@<sec id in hex>" into buf.
+ *
+ * the result is always NUL terminated and is truncated if bufsize is too
+ * small. nothing is written for a bufsize of zero or less, where the
+ * forced terminator would otherwise land in front of the buffer.
+ */
+static void construct_key_desc(void *buf, int bufsize,
+			       struct ptlrpc_sec *sec, uid_t uid)
+{
+	if (bufsize <= 0)
+		return;
+
+	snprintf(buf, bufsize, "%d@%x", uid, sec->ps_id);
+	((char *)buf)[bufsize - 1] = '\0';
+}
+
+/****************************************
+ * sec apis			     *
+ ****************************************/
+
+/*
+ * create a keyring sec: allocate it, set up its list and mutexes, run the
+ * common gss sec creation and, when a server context is given, install
+ * the reverse context from it.
+ *
+ * returns the embedded ptlrpc_sec, or NULL on any failure.
+ */
+static
+struct ptlrpc_sec * gss_sec_create_kr(struct obd_import *imp,
+				      struct ptlrpc_svc_ctx *svcctx,
+				      struct sptlrpc_flavor *sf)
+{
+	struct gss_sec_keyring  *gsec_kr;
+	struct ptlrpc_sec       *sec;
+	ENTRY;
+
+	OBD_ALLOC(gsec_kr, sizeof(*gsec_kr));
+	if (!gsec_kr)
+		RETURN(NULL);
+
+	INIT_HLIST_HEAD(&gsec_kr->gsk_clist);
+	gsec_kr->gsk_root_ctx = NULL;
+	mutex_init(&gsec_kr->gsk_root_uc_lock);
+#ifdef HAVE_KEYRING_UPCALL_SERIALIZED
+	mutex_init(&gsec_kr->gsk_uc_lock);
+#endif
+
+	if (gss_sec_create_common(&gsec_kr->gsk_base, &gss_policy_keyring,
+				  imp, svcctx, sf) != 0)
+		goto err_free;
+
+	sec = &gsec_kr->gsk_base.gs_base;
+	if (svcctx != NULL && sec_install_rctx_kr(sec, svcctx) != 0)
+		goto err_destroy;
+
+	RETURN(sec);
+
+err_destroy:
+	gss_sec_destroy_common(&gsec_kr->gsk_base);
+err_free:
+	OBD_FREE(gsec_kr, sizeof(*gsec_kr));
+	RETURN(NULL);
+}
+
+/*
+ * Tear down a keyring backed gss sec and free it.
+ *
+ * The context list must already be empty and no root context may be cached.
+ */
+static
+void gss_sec_destroy_kr(struct ptlrpc_sec *sec)
+{
+	struct gss_sec_keyring *kr_sec = sec2gsec_keyring(sec);
+	struct gss_sec	       *base = sec2gsec(sec);
+
+	CDEBUG(D_SEC, "destroy %s@%p\n", sec->ps_policy->sp_name, sec);
+
+	LASSERT(hlist_empty(&kr_sec->gsk_clist));
+	LASSERT(kr_sec->gsk_root_ctx == NULL);
+
+	gss_sec_destroy_common(base);
+
+	OBD_FREE(kr_sec, sizeof(*kr_sec));
+}
+
+/*
+ * Tell whether a request should be served by the root context.
+ *
+ * A ROOTONLY sec treats every caller as root; otherwise only a real uid of 0
+ * counts (euid/fsuid being 0 are handled as setuid scenarios).
+ * Returns 1 for root, 0 otherwise.
+ */
+static inline int user_is_root(struct ptlrpc_sec *sec, struct vfs_cred *vcred)
+{
+	return sec_is_rootonly(sec) || vcred->vc_uid == 0;
+}
+
+/*
+ * unlink request key from its ring, which is linked during request_key().
+ * sadly, we have to 'guess' which keyring it's linked to.
+ *
+ * The guess follows the current task's jit_keyring setting: starting from
+ * the selected keyring, the first keyring that exists is used, trying the
+ * thread, process, session and finally the user session keyring in turn.
+ * An explicit user keyring setting is used as is; the group keyring and
+ * unknown settings are a bug.
+ *
+ * FIXME this code is fragile, depend on how request_key_link() is implemented.
+ */
+static void request_key_unlink(struct key *key)
+{
+	struct task_struct *tsk = current;
+	struct key *ring;
+
+	switch (key_cred(tsk)->jit_keyring) {
+	case KEY_REQKEY_DEFL_DEFAULT:
+	case KEY_REQKEY_DEFL_THREAD_KEYRING:
+		ring = key_get(key_cred(tsk)->thread_keyring);
+		if (ring)
+			break;
+		/* fall through: no thread keyring, try the process one */
+	case KEY_REQKEY_DEFL_PROCESS_KEYRING:
+		ring = key_get(key_tgcred(tsk)->process_keyring);
+		if (ring)
+			break;
+		/* fall through: no process keyring, try the session one */
+	case KEY_REQKEY_DEFL_SESSION_KEYRING:
+		rcu_read_lock();
+		ring = key_get(rcu_dereference(key_tgcred(tsk)
+					       ->session_keyring));
+		rcu_read_unlock();
+		if (ring)
+			break;
+		/* fall through: no session keyring, use the user session one */
+	case KEY_REQKEY_DEFL_USER_SESSION_KEYRING:
+		ring = key_get(key_cred(tsk)->user->session_keyring);
+		break;
+	case KEY_REQKEY_DEFL_USER_KEYRING:
+		ring = key_get(key_cred(tsk)->user->uid_keyring);
+		break;
+	case KEY_REQKEY_DEFL_GROUP_KEYRING:
+	default:
+		LBUG();
+	}
+
+	/* a keyring must have been found; drop the ref taken by key_get() */
+	LASSERT(ring);
+	key_unlink(ring, key);
+	key_put(ring);
+}
+
+static
+struct ptlrpc_cli_ctx * gss_sec_lookup_ctx_kr(struct ptlrpc_sec *sec,
+					      struct vfs_cred *vcred,
+					      int create, int remove_dead)
+{
+	struct obd_import       *imp = sec->ps_import;
+	struct gss_sec_keyring  *gsec_kr = sec2gsec_keyring(sec);
+	struct ptlrpc_cli_ctx   *ctx = NULL;
+	unsigned int	     is_root = 0, create_new = 0;
+	struct key	      *key;
+	char		     desc[24];
+	char		    *coinfo;
+	int		      coinfo_size;
+	char		    *co_flags = "";
+	ENTRY;
+
+	LASSERT(imp != NULL);
+
+	is_root = user_is_root(sec, vcred);
+
+	/* a little bit optimization for root context */
+	if (is_root) {
+		ctx = sec_lookup_root_ctx_kr(sec);
+		/*
+		 * Only lookup directly for REVERSE sec, which should
+		 * always succeed.
+		 */
+		if (ctx || sec_is_reverse(sec))
+			RETURN(ctx);
+	}
+
+	LASSERT(create != 0);
+
+	/* for root context, obtain lock and check again, this time hold
+	 * the root upcall lock, make sure nobody else populated new root
+	 * context after last check. */
+	if (is_root) {
+		mutex_lock(&gsec_kr->gsk_root_uc_lock);
+
+		ctx = sec_lookup_root_ctx_kr(sec);
+		if (ctx)
+			goto out;
+
+		/* update reverse handle for root user */
+		sec2gsec(sec)->gs_rvs_hdl = gss_get_next_ctx_index();
+
+		switch (sec->ps_part) {
+		case LUSTRE_SP_MDT:
+			co_flags = "m";
+			break;
+		case LUSTRE_SP_OST:
+			co_flags = "o";
+			break;
+		case LUSTRE_SP_MGC:
+			co_flags = "rmo";
+			break;
+		case LUSTRE_SP_CLI:
+			co_flags = "r";
+			break;
+		case LUSTRE_SP_MGS:
+		default:
+			LBUG();
+		}
+	}
+
+	/* in case of setuid, key will be constructed as owner of fsuid/fsgid,
+	 * but we do authentication based on real uid/gid. the key permission
+	 * bits will be exactly as POS_ALL, so only processes who subscribed
+	 * this key could have the access, although the quota might be counted
+	 * on others (fsuid/fsgid).
+	 *
+	 * keyring will use fsuid/fsgid as upcall parameters, so we have to
+	 * encode real uid/gid into callout info.
+	 */
+
+	construct_key_desc(desc, sizeof(desc), sec, vcred->vc_uid);
+
+	/* callout info format:
+	 * secid:mech:uid:gid:flags:svc_type:peer_nid:target_uuid
+	 */
+	coinfo_size = sizeof(struct obd_uuid) + MAX_OBD_NAME + 64;
+	OBD_ALLOC(coinfo, coinfo_size);
+	if (coinfo == NULL)
+		goto out;
+
+	snprintf(coinfo, coinfo_size, "%d:%s:%u:%u:%s:%d:"LPX64":%s",
+		 sec->ps_id, sec2gsec(sec)->gs_mech->gm_name,
+		 vcred->vc_uid, vcred->vc_gid,
+		 co_flags, import_to_gss_svc(imp),
+		 imp->imp_connection->c_peer.nid, imp->imp_obd->obd_name);
+
+	CDEBUG(D_SEC, "requesting key for %s\n", desc);
+
+	keyring_upcall_lock(gsec_kr);
+	key = request_key(&gss_key_type, desc, coinfo);
+	keyring_upcall_unlock(gsec_kr);
+
+	OBD_FREE(coinfo, coinfo_size);
+
+	if (IS_ERR(key)) {
+		CERROR("failed request key: %ld\n", PTR_ERR(key));
+		goto out;
+	}
+	CDEBUG(D_SEC, "obtained key %08x for %s\n", key->serial, desc);
+
+	/* once payload.data points to a ctx, it never changes until
+	 * we de-associate them; but a parallel request_key() may return
+	 * a key with payload.data == NULL at the same time. so we still
+	 * need the write lock of key->sem to serialize them. */
+	down_write(&key->sem);
+
+	if (likely(key->payload.data != NULL)) {
+		ctx = key->payload.data;
+
+		LASSERT(atomic_read(&ctx->cc_refcount) >= 1);
+		LASSERT(ctx2gctx_keyring(ctx)->gck_key == key);
+		LASSERT(atomic_read(&key->usage) >= 2);
+
+		/* simply take a ref and return. it's upper layer's
+		 * responsibility to detect & replace dead ctx. */
+		atomic_inc(&ctx->cc_refcount);
+	} else {
+		/* pre initialization with a cli_ctx. this can't be done in
+		 * key_instantiate() because we don't have enough information
+		 * there. */
+		ctx = ctx_create_kr(sec, vcred);
+		if (ctx != NULL) {
+			ctx_enlist_kr(ctx, is_root, 0);
+			bind_key_ctx(key, ctx);
+
+			ctx_start_timer_kr(ctx, KEYRING_UPCALL_TIMEOUT);
+
+			CDEBUG(D_SEC, "installed key %p <-> ctx %p (sec %p)\n",
+			       key, ctx, sec);
+		} else {
+			/* we'd prefer to call key_revoke(), but we would rather
+			 * revoke it while key->sem is still write locked. */
+			key_revoke_locked(key);
+		}
+
+		create_new = 1;
+	}
+
+	up_write(&key->sem);
+
+	if (is_root && create_new)
+		request_key_unlink(key);
+
+	key_put(key);
+out:
+	if (is_root)
+		mutex_unlock(&gsec_kr->gsk_root_uc_lock);
+	RETURN(ctx);
+}
+
+/*
+ * release_ctx method of the keyring sec: hand a context whose refcount has
+ * dropped to zero over to ctx_release_kr(). The sec itself must still be
+ * referenced. @sync is passed through unchanged.
+ */
+static
+void gss_sec_release_ctx_kr(struct ptlrpc_sec *sec,
+			    struct ptlrpc_cli_ctx *ctx,
+			    int sync)
+{
+	LASSERT(atomic_read(&sec->ps_refcount) > 0);
+	LASSERT(atomic_read(&ctx->cc_refcount) == 0);
+	ctx_release_kr(ctx, sync);
+}
+
+/*
+ * Flush the context of a normal user.
+ *
+ * Such contexts are found through the keyring itself: a key is requested
+ * with the description built from @sec and @uid, killed and revoked, and
+ * this is repeated until the request fails. Reverse and rootonly secs are
+ * left alone. @grace and @force are not used here.
+ *
+ * Note here we suppose only to flush _my_ context.
+ */
+static
+void flush_user_ctx_cache_kr(struct ptlrpc_sec *sec,
+			     uid_t uid,
+			     int grace, int force)
+{
+	char		 desc[24];
+	struct key	*found;
+
+	/* nothing to do for reverse or rootonly sec */
+	if (sec_is_reverse(sec) || sec_is_rootonly(sec))
+		return;
+
+	construct_key_desc(desc, sizeof(desc), sec, uid);
+
+	/* normally a single valid key exists; keep asking until the request
+	 * fails so that any unexpected extra keys are handled as well */
+	while (!IS_ERR(found = request_key(&gss_key_type, desc, NULL))) {
+		down_write(&found->sem);
+
+		kill_key_locked(found);
+
+		/* kill_key_locked() usually revokes the key already; revoke
+		 * it again in case the key was not well coupled with a
+		 * context */
+		key_revoke_locked(found);
+
+		up_write(&found->sem);
+
+		key_put(found);
+	}
+
+	CDEBUG(D_SEC, "No more key found for current user\n");
+}
+
+/*
+ * flush context of root or all, we iterate through the list.
+ *
+ * @uid:   only contexts of this uid are flushed; -1 matches every context
+ * @grace: when zero, the UPTODATE flag is cleared as well
+ * @force: when zero, contexts holding extra references are skipped
+ *
+ * Matching contexts are marked dead and unlisted under ps_lock; those that
+ * were actually unlisted are collected on a private list and disposed of
+ * after the lock has been dropped.
+ */
+static
+void flush_spec_ctx_cache_kr(struct ptlrpc_sec *sec,
+			     uid_t uid,
+			     int grace, int force)
+{
+	struct gss_sec_keyring *gsec_kr;
+	struct hlist_head	freelist = HLIST_HEAD_INIT;
+	struct hlist_node      *next;
+	struct ptlrpc_cli_ctx  *ctx;
+	ENTRY;
+
+	gsec_kr = sec2gsec_keyring(sec);
+
+	spin_lock(&sec->ps_lock);
+	hlist_for_each_entry_safe(ctx, next,
+				      &gsec_kr->gsk_clist, cc_cache) {
+		LASSERT(atomic_read(&ctx->cc_refcount) > 0);
+
+		if (uid != -1 && uid != ctx->cc_vcred.vc_uid)
+			continue;
+
+		/* at this moment there's at least 2 base reference:
+		 * key association and in-list. */
+		if (atomic_read(&ctx->cc_refcount) > 2) {
+			if (!force)
+				continue;
+			CWARN("flush busy ctx %p(%u->%s, extra ref %d)\n",
+			      ctx, ctx->cc_vcred.vc_uid,
+			      sec2target_str(ctx->cc_sec),
+			      atomic_read(&ctx->cc_refcount) - 2);
+		}
+
+		set_bit(PTLRPC_CTX_DEAD_BIT, &ctx->cc_flags);
+		if (!grace)
+			clear_bit(PTLRPC_CTX_UPTODATE_BIT, &ctx->cc_flags);
+
+		/* temporary reference, held while the ctx is unlisted */
+		atomic_inc(&ctx->cc_refcount);
+
+		if (ctx_unlist_kr(ctx, 1)) {
+			/* unlisted: the temporary reference travels with the
+			 * ctx onto the dispose list */
+			hlist_add_head(&ctx->cc_cache, &freelist);
+		} else {
+			/* not unlisted: give the temporary reference back */
+			LASSERT(atomic_read(&ctx->cc_refcount) >= 2);
+			atomic_dec(&ctx->cc_refcount);
+		}
+	}
+	spin_unlock(&sec->ps_lock);
+
+	dispose_ctx_list_kr(&freelist);
+	EXIT;
+}
+
+/*
+ * flush_ctx_cache method of the keyring sec.
+ *
+ * A uid of -1 (all users) or 0 (root) is served by walking the context
+ * list; any other uid is flushed through the keyring. Always returns 0.
+ */
+static
+int gss_sec_flush_ctx_cache_kr(struct ptlrpc_sec *sec,
+			       uid_t uid, int grace, int force)
+{
+	ENTRY;
+
+	CDEBUG(D_SEC, "sec %p(%d, nctx %d), uid %d, grace %d, force %d\n",
+	       sec, atomic_read(&sec->ps_refcount),
+	       atomic_read(&sec->ps_nctx),
+	       uid, grace, force);
+
+	if (uid == -1 || uid == 0)
+		flush_spec_ctx_cache_kr(sec, uid, grace, force);
+	else
+		flush_user_ctx_cache_kr(sec, uid, grace, force);
+
+	RETURN(0);
+}
+
+/*
+ * gc_ctx method of the keyring sec: walk the context list under ps_lock,
+ * unlist every context that cli_ctx_check_death() reports as dead, and
+ * dispose of the unlisted contexts after the lock has been dropped.
+ */
+static
+void gss_sec_gc_ctx_kr(struct ptlrpc_sec *sec)
+{
+	struct gss_sec_keyring *gsec_kr = sec2gsec_keyring(sec);
+	struct hlist_head	freelist = HLIST_HEAD_INIT;
+	struct hlist_node      *next;
+	struct ptlrpc_cli_ctx  *ctx;
+	ENTRY;
+
+	CWARN("running gc\n");
+
+	spin_lock(&sec->ps_lock);
+	hlist_for_each_entry_safe(ctx, next,
+				      &gsec_kr->gsk_clist, cc_cache) {
+		LASSERT(atomic_read(&ctx->cc_refcount) > 0);
+
+		/* temporary reference, held while the ctx is examined */
+		atomic_inc(&ctx->cc_refcount);
+
+		if (cli_ctx_check_death(ctx) && ctx_unlist_kr(ctx, 1)) {
+			/* dead and unlisted: the temporary reference travels
+			 * with the ctx onto the dispose list */
+			hlist_add_head(&ctx->cc_cache, &freelist);
+			CWARN("unhashed ctx %p\n", ctx);
+		} else {
+			/* kept in the list: give the reference back */
+			LASSERT(atomic_read(&ctx->cc_refcount) >= 2);
+			atomic_dec(&ctx->cc_refcount);
+		}
+	}
+	spin_unlock(&sec->ps_lock);
+
+	dispose_ctx_list_kr(&freelist);
+	EXIT;
+	return;
+}
+
+/*
+ * display method of the keyring sec: print one line per cached context
+ * into @seq, covering uid, refcount, expiry, flags, sequence state, the
+ * bound key, both handles and the mechanism description.
+ * The list is walked under ps_lock. Always returns 0.
+ */
+static
+int gss_sec_display_kr(struct ptlrpc_sec *sec, struct seq_file *seq)
+{
+	struct gss_sec_keyring *kr_sec = sec2gsec_keyring(sec);
+	struct ptlrpc_cli_ctx  *ctx;
+	time_t			now = cfs_time_current_sec();
+	ENTRY;
+
+	spin_lock(&sec->ps_lock);
+	/* nothing is removed while printing, the plain iterator is enough */
+	hlist_for_each_entry(ctx, &kr_sec->gsk_clist, cc_cache) {
+		struct gss_cli_ctx *gctx = ctx2gctx(ctx);
+		struct key	   *key = ctx2gctx_keyring(ctx)->gck_key;
+		char		    flags_str[40];
+		char		    mech[40];
+
+		gss_cli_ctx_flags2str(ctx->cc_flags,
+				      flags_str, sizeof(flags_str));
+
+		if (gctx->gc_mechctx == NULL)
+			snprintf(mech, sizeof(mech), "N/A");
+		else
+			lgss_display(gctx->gc_mechctx, mech, sizeof(mech));
+		mech[sizeof(mech) - 1] = '\0';
+
+		seq_printf(seq, "%p: uid %u, ref %d, expire %ld(%+ld), fl %s, "
+			   "seq %d, win %u, key %08x(ref %d), "
+			   "hdl "LPX64":"LPX64", mech: %s\n",
+			   ctx, ctx->cc_vcred.vc_uid,
+			   atomic_read(&ctx->cc_refcount),
+			   ctx->cc_expire,
+			   ctx->cc_expire ?  ctx->cc_expire - now : 0,
+			   flags_str,
+			   atomic_read(&gctx->gc_seq),
+			   gctx->gc_win,
+			   key ? key->serial : 0,
+			   key ? atomic_read(&key->usage) : 0,
+			   gss_handle_to_u64(&gctx->gc_handle),
+			   gss_handle_to_u64(&gctx->gc_svc_handle),
+			   mech);
+	}
+	spin_unlock(&sec->ps_lock);
+
+	RETURN(0);
+}
+
+/****************************************
+ * cli_ctx apis			 *
+ ****************************************/
+
+/*
+ * refresh method of the keyring context ops. Nothing is done here and 0 is
+ * always returned: the upcall is already on the way.
+ */
+static
+int gss_cli_ctx_refresh_kr(struct ptlrpc_cli_ctx *ctx)
+{
+	/* upcall is already on the way */
+	return 0;
+}
+
+/*
+ * validate method of the keyring context ops.
+ *
+ * A context found dead is killed on the spot. Returns 0 only when the
+ * context is alive and ready, 1 otherwise.
+ */
+static
+int gss_cli_ctx_validate_kr(struct ptlrpc_cli_ctx *ctx)
+{
+	LASSERT(atomic_read(&ctx->cc_refcount) > 0);
+	LASSERT(ctx->cc_sec);
+
+	if (cli_ctx_check_death(ctx)) {
+		kill_ctx_kr(ctx);
+		return 1;
+	}
+
+	return cli_ctx_is_ready(ctx) ? 0 : 1;
+}
+
+/*
+ * die method of the keyring context ops: expire the context, then kill it.
+ * @grace is not used by this implementation.
+ */
+static
+void gss_cli_ctx_die_kr(struct ptlrpc_cli_ctx *ctx, int grace)
+{
+	LASSERT(atomic_read(&ctx->cc_refcount) > 0);
+	LASSERT(ctx->cc_sec);
+
+	cli_ctx_expire(ctx);
+	kill_ctx_kr(ctx);
+}
+
+/****************************************
+ * (reverse) service		    *
+ ****************************************/
+
+/*
+ * reverse context could have nothing to do with keyrings. here we still keep
+ * the version which bind to a key, for future reference.
+ */
+#define HAVE_REVERSE_CTX_NOKEY
+
+
+/*
+ * Build a reverse root context for @sec out of @svc_ctx and install it.
+ *
+ * A client context is created with uid/gid 0, filled from the service
+ * context and installed as root context without a key. The local reference
+ * is dropped in every case.
+ *
+ * Returns 0 on success, -ENOMEM if the context cannot be created, or the
+ * error of gss_copy_rvc_cli_ctx().
+ */
+static
+int sec_install_rctx_kr(struct ptlrpc_sec *sec,
+			struct ptlrpc_svc_ctx *svc_ctx)
+{
+	struct vfs_cred		 root_cred = { 0, 0 };
+	struct ptlrpc_cli_ctx	*rctx;
+	int			 rc;
+
+	LASSERT(sec);
+	LASSERT(svc_ctx);
+
+	rctx = ctx_create_kr(sec, &root_cred);
+	if (rctx == NULL)
+		return -ENOMEM;
+
+	rc = gss_copy_rvc_cli_ctx(rctx, svc_ctx);
+	if (rc)
+		CERROR("failed copy reverse cli ctx: %d\n", rc);
+	else
+		rvs_sec_install_root_ctx_kr(sec, rctx, NULL);
+
+	ctx_put_kr(rctx, 1);
+
+	return rc;
+}
+
+
+/****************************************
+ * service apis			 *
+ ****************************************/
+
+/*
+ * accept method of the keyring service ops: forwards the request to
+ * gss_svc_accept() on behalf of the keyring policy and returns its result.
+ */
+static
+int gss_svc_accept_kr(struct ptlrpc_request *req)
+{
+	return gss_svc_accept(&gss_policy_keyring, req);
+}
+
+/*
+ * install_rctx method of the keyring service ops: install a reverse context
+ * built from @svc_ctx into the sec currently attached to @imp.
+ * Returns the result of sec_install_rctx_kr().
+ */
+static
+int gss_svc_install_rctx_kr(struct obd_import *imp,
+			    struct ptlrpc_svc_ctx *svc_ctx)
+{
+	struct ptlrpc_sec *imp_sec = sptlrpc_import_sec_ref(imp);
+	int		   ret;
+
+	LASSERT(imp_sec);
+
+	ret = sec_install_rctx_kr(imp_sec, svc_ctx);
+	/* drop the reference taken by sptlrpc_import_sec_ref() */
+	sptlrpc_sec_put(imp_sec);
+
+	return ret;
+}
+
+/****************************************
+ * key apis			     *
+ ****************************************/
+
+/*
+ * Key type ->instantiate() method.
+ *
+ * The key is created without payload: any supplied data, or a key that
+ * already carries a payload, is rejected with -EINVAL. The key is then
+ * linked to the session keyring of the current task.
+ *
+ * Returns 0 on success or the error of key_link().
+ */
+static
+int gss_kt_instantiate(struct key *key, const void *data, size_t datalen)
+{
+	int	     rc;
+	ENTRY;
+
+	if (data != NULL || datalen != 0) {
+		CERROR("invalid: data %p, len %lu\n",
+		       data, (unsigned long)datalen);
+		RETURN(-EINVAL);
+	}
+
+	if (key->payload.data != NULL) {
+		CERROR("key already have payload\n");
+		RETURN(-EINVAL);
+	}
+
+	/* link the key to session keyring, so following context negotiation
+	 * rpc fired from user space could find this key. This will be unlinked
+	 * automatically when upcall processes die.
+	 *
+	 * we can't do this through keyctl from userspace, because the upcall
+	 * might be neither possessor nor owner of the key (setuid).
+	 *
+	 * the session keyring is created upon upcall, and don't change all
+	 * the way until upcall finished, so rcu lock is not needed here.
+	 */
+	LASSERT(key_tgcred(current)->session_keyring);
+
+	lockdep_off();
+	rc = key_link(key_tgcred(current)->session_keyring, key);
+	lockdep_on();
+	if (unlikely(rc)) {
+		CERROR("failed to link key %08x to keyring %08x: %d\n",
+		       key->serial,
+		       key_tgcred(current)->session_keyring->serial, rc);
+		RETURN(rc);
+	}
+
+	CDEBUG(D_SEC, "key %p instantiated, ctx %p\n", key, key->payload.data);
+	RETURN(0);
+}
+
+/*
+ * Key type ->update() method: feed the outcome of the userspace context
+ * negotiation into the context bound to @key.
+ *
+ * called with key semaphore write locked. it means we can operate
+ * on the context without fear of losing refcount.
+ *
+ * The payload starts with the sequence window. A zero window is followed by
+ * the rpc and gss error codes of a failed negotiation; otherwise the context
+ * handle and the mechanism context to import follow.
+ *
+ * Returns -EINVAL for an empty payload, and -EAGAIN (or the error of
+ * key_validate()) while no context is bound to the key yet. Once a context
+ * is bound 0 is returned in every case: a failure is recorded on the
+ * context itself instead of being reported to userspace.
+ */
+static
+int gss_kt_update(struct key *key, const void *data, size_t datalen)
+{
+	struct ptlrpc_cli_ctx   *ctx = key->payload.data;
+	struct gss_cli_ctx      *gctx;
+	rawobj_t		 tmpobj = RAWOBJ_EMPTY;
+	__u32			 datalen32 = (__u32) datalen;
+	int			 rc;
+	ENTRY;
+
+	if (data == NULL || datalen == 0) {
+		CWARN("invalid: data %p, len %lu\n",
+		      data, (unsigned long)datalen);
+		RETURN(-EINVAL);
+	}
+
+	/* if upcall finished negotiation too fast (mostly likely because
+	 * of local error happened) and call kt_update(), the ctx
+	 * might be still NULL. but the key will finally be associate
+	 * with a context, or be revoked. if key status is fine, return
+	 * -EAGAIN to allow userspace sleep a while and call again. */
+	if (ctx == NULL) {
+		CDEBUG(D_SEC, "update too soon: key %p(%x) flags %lx\n",
+		      key, key->serial, key->flags);
+
+		rc = key_validate(key);
+		if (rc == 0)
+			RETURN(-EAGAIN);
+		else
+			RETURN(rc);
+	}
+
+	LASSERT(atomic_read(&ctx->cc_refcount) > 0);
+	LASSERT(ctx->cc_sec);
+
+	ctx_clear_timer_kr(ctx);
+
+	/* don't proceed if already refreshed */
+	if (cli_ctx_is_refreshed(ctx)) {
+		CWARN("ctx already done refresh\n");
+		RETURN(0);
+	}
+
+	/* hold the ctx while it is being filled in; dropped before return */
+	sptlrpc_cli_ctx_get(ctx);
+	gctx = ctx2gctx(ctx);
+
+	rc = buffer_extract_bytes(&data, &datalen32, &gctx->gc_win,
+				  sizeof(gctx->gc_win));
+	if (rc) {
+		CERROR("failed extract seq_win\n");
+		goto out;
+	}
+
+	if (gctx->gc_win == 0) {
+		/* zero window: negotiation failed, error codes follow */
+		__u32   nego_rpc_err, nego_gss_err;
+
+		rc = buffer_extract_bytes(&data, &datalen32, &nego_rpc_err,
+					  sizeof(nego_rpc_err));
+		if (rc) {
+			CERROR("failed to extract rpc rc\n");
+			goto out;
+		}
+
+		rc = buffer_extract_bytes(&data, &datalen32, &nego_gss_err,
+					  sizeof(nego_gss_err));
+		if (rc) {
+			CERROR("failed to extract gss rc\n");
+			goto out;
+		}
+
+		CERROR("negotiation: rpc err %d, gss err %x\n",
+		       nego_rpc_err, nego_gss_err);
+
+		rc = nego_rpc_err ? nego_rpc_err : -EACCES;
+	} else {
+		rc = rawobj_extract_local_alloc(&gctx->gc_handle,
+						(__u32 **) &data, &datalen32);
+		if (rc) {
+			CERROR("failed extract handle\n");
+			goto out;
+		}
+
+		rc = rawobj_extract_local(&tmpobj, (__u32 **) &data,&datalen32);
+		if (rc) {
+			CERROR("failed extract mech\n");
+			goto out;
+		}
+
+		rc = lgss_import_sec_context(&tmpobj,
+					     sec2gsec(ctx->cc_sec)->gs_mech,
+					     &gctx->gc_mechctx);
+		if (rc != GSS_S_COMPLETE)
+			CERROR("failed import context\n");
+		else
+			rc = 0;
+	}
+out:
+	/* we don't care what current status of this ctx, even someone else
+	 * is operating on the ctx at the same time. we just add up our own
+	 * opinions here. */
+	if (rc == 0) {
+		gss_cli_ctx_uptodate(gctx);
+	} else {
+		/* this will also revoke the key. has to be done before
+		 * wakeup waiters otherwise they can find the stale key */
+		kill_key_locked(key);
+
+		cli_ctx_expire(ctx);
+
+		if (rc != -ERESTART)
+			set_bit(PTLRPC_CTX_ERROR_BIT, &ctx->cc_flags);
+	}
+
+	/* let user space think it's a success */
+	sptlrpc_cli_ctx_put(ctx, 1);
+	RETURN(0);
+}
+
+/*
+ * Key type ->match() method: a key matches when its description is exactly
+ * the string passed in @desc. Returns 1 on match, 0 otherwise.
+ */
+static
+int gss_kt_match(const struct key *key, const void *desc)
+{
+	const char *wanted = desc;
+
+	return !strcmp(key->description, wanted);
+}
+
+/*
+ * Key type ->destroy() method. Nothing is freed here: the key must no
+ * longer point to a context by the time it is destroyed.
+ */
+static
+void gss_kt_destroy(struct key *key)
+{
+	ENTRY;
+	LASSERT(key->payload.data == NULL);
+	CDEBUG(D_SEC, "destroy key %p\n", key);
+	EXIT;
+}
+
+/*
+ * Key type ->describe() method: print the key description into @s, or
+ * "[null]" when the key has none.
+ */
+static
+void gss_kt_describe(const struct key *key, struct seq_file *s)
+{
+	seq_puts(s, key->description != NULL ? key->description : "[null]");
+}
+
+static struct key_type gss_key_type =
+{
+	.name	   = "lgssc",
+	.def_datalen    = 0,
+	.instantiate    = gss_kt_instantiate,
+	.update	 = gss_kt_update,
+	.match	  = gss_kt_match,
+	.destroy	= gss_kt_destroy,
+	.describe       = gss_kt_describe,
+};
+
+/****************************************
+ * lustre gss keyring policy	    *
+ ****************************************/
+
+static struct ptlrpc_ctx_ops gss_keyring_ctxops = {
+	.match		  = gss_cli_ctx_match,
+	.refresh		= gss_cli_ctx_refresh_kr,
+	.validate	       = gss_cli_ctx_validate_kr,
+	.die		    = gss_cli_ctx_die_kr,
+	.sign		   = gss_cli_ctx_sign,
+	.verify		 = gss_cli_ctx_verify,
+	.seal		   = gss_cli_ctx_seal,
+	.unseal		 = gss_cli_ctx_unseal,
+	.wrap_bulk	      = gss_cli_ctx_wrap_bulk,
+	.unwrap_bulk	    = gss_cli_ctx_unwrap_bulk,
+};
+
+static struct ptlrpc_sec_cops gss_sec_keyring_cops = {
+	.create_sec	     = gss_sec_create_kr,
+	.destroy_sec	    = gss_sec_destroy_kr,
+	.kill_sec	       = gss_sec_kill,
+	.lookup_ctx	     = gss_sec_lookup_ctx_kr,
+	.release_ctx	    = gss_sec_release_ctx_kr,
+	.flush_ctx_cache	= gss_sec_flush_ctx_cache_kr,
+	.gc_ctx		 = gss_sec_gc_ctx_kr,
+	.install_rctx	   = gss_sec_install_rctx,
+	.alloc_reqbuf	   = gss_alloc_reqbuf,
+	.free_reqbuf	    = gss_free_reqbuf,
+	.alloc_repbuf	   = gss_alloc_repbuf,
+	.free_repbuf	    = gss_free_repbuf,
+	.enlarge_reqbuf	 = gss_enlarge_reqbuf,
+	.display		= gss_sec_display_kr,
+};
+
+static struct ptlrpc_sec_sops gss_sec_keyring_sops = {
+	.accept		 = gss_svc_accept_kr,
+	.invalidate_ctx	 = gss_svc_invalidate_ctx,
+	.alloc_rs	       = gss_svc_alloc_rs,
+	.authorize	      = gss_svc_authorize,
+	.free_rs		= gss_svc_free_rs,
+	.free_ctx	       = gss_svc_free_ctx,
+	.prep_bulk	      = gss_svc_prep_bulk,
+	.unwrap_bulk	    = gss_svc_unwrap_bulk,
+	.wrap_bulk	      = gss_svc_wrap_bulk,
+	.install_rctx	   = gss_svc_install_rctx_kr,
+};
+
+static struct ptlrpc_sec_policy gss_policy_keyring = {
+	.sp_owner	       = THIS_MODULE,
+	.sp_name		= "gss.keyring",
+	.sp_policy	      = SPTLRPC_POLICY_GSS,
+	.sp_cops		= &gss_sec_keyring_cops,
+	.sp_sops		= &gss_sec_keyring_sops,
+};
+
+
+/*
+ * Init hook: register the "lgssc" key type, then the gss.keyring policy.
+ * The key type is unregistered again if the policy cannot be registered.
+ * Returns 0 on success or the error of the registration that failed.
+ */
+int __init gss_init_keyring(void)
+{
+	int rc;
+
+	rc = register_key_type(&gss_key_type);
+	if (rc) {
+		CERROR("failed to register keyring type: %d\n", rc);
+		return rc;
+	}
+
+	rc = sptlrpc_register_policy(&gss_policy_keyring);
+	if (rc) {
+		/* report this failure like the key type one above */
+		CERROR("failed to register keyring policy: %d\n", rc);
+		unregister_key_type(&gss_key_type);
+		return rc;
+	}
+
+	return 0;
+}
+
+/*
+ * Exit hook: undo gss_init_keyring() in reverse order, so that the policy,
+ * whose context lookup requests keys of this type, is gone before the key
+ * type itself is unregistered.
+ */
+void __exit gss_exit_keyring(void)
+{
+	sptlrpc_unregister_policy(&gss_policy_keyring);
+	unregister_key_type(&gss_key_type);
+}

diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/gss_krb5.h b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_krb5.h
new file mode 100644
index 0000000..676d4b9
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_krb5.h

@@ -0,0 +1,163 @@
+/*
+ * Modifications for Lustre
+ *
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+/*
+ *  linux/include/linux/sunrpc/gss_krb5_types.h
+ *
+ *  Adapted from MIT Kerberos 5-1.2.1 lib/include/krb5.h,
+ *  lib/gssapi/krb5/gssapiP_krb5.h, and others
+ *
+ *  Copyright (c) 2000 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Andy Adamson   <andros@umich.edu>
+ *  Bruce Fields   <bfields@umich.edu>
+ */
+
+/*
+ * Copyright 1995 by the Massachusetts Institute of Technology.
+ * All Rights Reserved.
+ *
+ * Export of this software from the United States of America may
+ *   require a specific license from the United States Government.
+ *   It is the responsibility of any person or organization contemplating
+ *   export to obtain such a license before exporting.
+ *
+ * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and
+ * distribute this software and its documentation for any purpose and
+ * without fee is hereby granted, provided that the above copyright
+ * notice appear in all copies and that both that copyright notice and
+ * this permission notice appear in supporting documentation, and that
+ * the name of M.I.T. not be used in advertising or publicity pertaining
+ * to distribution of the software without specific, written prior
+ * permission.  Furthermore if you modify this software you must label
+ * your software as modified software and not distribute it in such a
+ * fashion that it might be confused with the original M.I.T. software.
+ * M.I.T. makes no representations about the suitability of
+ * this software for any purpose.  It is provided "as is" without express
+ * or implied warranty.
+ *
+ */
+
+#ifndef PTLRPC_GSS_KRB5_H
+#define PTLRPC_GSS_KRB5_H
+
+/*
+ * RFC 4121
+ */
+
+#define KG_USAGE_ACCEPTOR_SEAL	  22
+#define KG_USAGE_ACCEPTOR_SIGN	  23
+#define KG_USAGE_INITIATOR_SEAL	 24
+#define KG_USAGE_INITIATOR_SIGN	 25
+
+#define KG_TOK_MIC_MSG		  0x0404
+#define KG_TOK_WRAP_MSG		 0x0504
+
+#define FLAG_SENDER_IS_ACCEPTOR	 0x01
+#define FLAG_WRAP_CONFIDENTIAL	  0x02
+#define FLAG_ACCEPTOR_SUBKEY	    0x04
+
+struct krb5_header {
+	__u16	   kh_tok_id;      /* token id */
+	__u8	    kh_flags;       /* acceptor flags */
+	__u8	    kh_filler;      /* 0xff */
+	__u16	   kh_ec;	  /* extra count */
+	__u16	   kh_rrc;	 /* right rotation count */
+	__u64	   kh_seq;	 /* sequence number */
+	__u8	    kh_cksum[0];    /* checksum */
+};
+
+struct krb5_keyblock {
+	rawobj_t		 kb_key;
+	struct ll_crypto_cipher *kb_tfm;
+};
+
+struct krb5_ctx {
+	unsigned int	    kc_initiate:1,
+				kc_cfx:1,
+				kc_seed_init:1,
+				kc_have_acceptor_subkey:1;
+	__s32		   kc_endtime;
+	__u8		    kc_seed[16];
+	__u64		   kc_seq_send;
+	__u64		   kc_seq_recv;
+	__u32		   kc_enctype;
+	struct krb5_keyblock    kc_keye;	/* encryption */
+	struct krb5_keyblock    kc_keyi;	/* integrity */
+	struct krb5_keyblock    kc_keyc;	/* checksum */
+	rawobj_t		kc_mech_used;
+};
+
+enum sgn_alg {
+	SGN_ALG_DES_MAC_MD5	   = 0x0000,
+	SGN_ALG_MD2_5		 = 0x0001,
+	SGN_ALG_DES_MAC	       = 0x0002,
+	SGN_ALG_3		     = 0x0003, /* not published */
+	SGN_ALG_HMAC_MD5	      = 0x0011, /* microsoft w2k; no support */
+	SGN_ALG_HMAC_SHA1_DES3_KD     = 0x0004
+};
+
+enum seal_alg {
+	SEAL_ALG_NONE		 = 0xffff,
+	SEAL_ALG_DES		  = 0x0000,
+	SEAL_ALG_1		    = 0x0001, /* not published */
+	SEAL_ALG_MICROSOFT_RC4	= 0x0010, /* microsoft w2k; no support */
+	SEAL_ALG_DES3KD	       = 0x0002
+};
+
+/* checksum type identifiers */
+#define CKSUMTYPE_CRC32		 0x0001
+#define CKSUMTYPE_RSA_MD4	       0x0002
+#define CKSUMTYPE_RSA_MD4_DES	   0x0003
+#define CKSUMTYPE_DESCBC		0x0004
+/* des-mac-k */
+/* rsa-md4-des-k */
+#define CKSUMTYPE_RSA_MD5	       0x0007
+#define CKSUMTYPE_RSA_MD5_DES	   0x0008
+#define CKSUMTYPE_NIST_SHA	      0x0009
+#define CKSUMTYPE_HMAC_SHA1_DES3	0x000c
+#define CKSUMTYPE_HMAC_SHA1_96_AES128   0x000f
+#define CKSUMTYPE_HMAC_SHA1_96_AES256   0x0010
+/* negative value: parenthesized so it expands as a single operand */
+#define CKSUMTYPE_HMAC_MD5_ARCFOUR      (-138)
+
+/* from gssapi_err_krb5.h */
+#define KG_CCACHE_NOMATCH			(39756032L)
+#define KG_KEYTAB_NOMATCH			(39756033L)
+#define KG_TGT_MISSING			   (39756034L)
+#define KG_NO_SUBKEY			     (39756035L)
+#define KG_CONTEXT_ESTABLISHED		   (39756036L)
+#define KG_BAD_SIGN_TYPE			 (39756037L)
+#define KG_BAD_LENGTH			    (39756038L)
+#define KG_CTX_INCOMPLETE			(39756039L)
+#define KG_CONTEXT			       (39756040L)
+#define KG_CRED				  (39756041L)
+#define KG_ENC_DESC			      (39756042L)
+#define KG_BAD_SEQ			       (39756043L)
+#define KG_EMPTY_CCACHE			  (39756044L)
+#define KG_NO_CTYPES			     (39756045L)
+
+/* per Kerberos v5 protocol spec crypto types from the wire.
+ * these get mapped to linux kernel crypto routines.
+ */
+#define ENCTYPE_NULL	    0x0000
+#define ENCTYPE_DES_CBC_CRC     0x0001	/* DES cbc mode with CRC-32 */
+#define ENCTYPE_DES_CBC_MD4     0x0002	/* DES cbc mode with RSA-MD4 */
+#define ENCTYPE_DES_CBC_MD5     0x0003	/* DES cbc mode with RSA-MD5 */
+#define ENCTYPE_DES_CBC_RAW     0x0004	/* DES cbc mode raw */
+/* XXX deprecated? */
+#define ENCTYPE_DES3_CBC_SHA    0x0005	/* DES-3 cbc mode with NIST-SHA */
+#define ENCTYPE_DES3_CBC_RAW    0x0006	/* DES-3 cbc mode raw */
+#define ENCTYPE_DES_HMAC_SHA1   0x0008
+#define ENCTYPE_DES3_CBC_SHA1   0x0010
+#define ENCTYPE_AES128_CTS_HMAC_SHA1_96 0x0011
+#define ENCTYPE_AES256_CTS_HMAC_SHA1_96 0x0012
+#define ENCTYPE_ARCFOUR_HMAC    0x0017
+#define ENCTYPE_ARCFOUR_HMAC_EXP 0x0018
+#define ENCTYPE_UNKNOWN	 0x01ff
+
+#endif /* PTLRPC_GSS_KRB5_H */

diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/gss_krb5_mech.c b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_krb5_mech.c
new file mode 100644
index 0000000..4b28931
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_krb5_mech.c

@@ -0,0 +1,1786 @@
+/*
+ * Modifications for Lustre
+ *
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+/*
+ *  linux/net/sunrpc/gss_krb5_mech.c
+ *  linux/net/sunrpc/gss_krb5_crypto.c
+ *  linux/net/sunrpc/gss_krb5_seal.c
+ *  linux/net/sunrpc/gss_krb5_seqnum.c
+ *  linux/net/sunrpc/gss_krb5_unseal.c
+ *
+ *  Copyright (c) 2001 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Andy Adamson <andros@umich.edu>
+ *  J. Bruce Fields <bfields@umich.edu>
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *  3. Neither the name of the University nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/crypto.h>
+#include <linux/mutex.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_net.h>
+#include <lustre_import.h>
+#include <lustre_sec.h>
+
+#include "gss_err.h"
+#include "gss_internal.h"
+#include "gss_api.h"
+#include "gss_asn1.h"
+#include "gss_krb5.h"
+
+/*
+ * File-wide lock taken by fill_krb5_header() around the post-increment
+ * of kc_seq_send; one lock is shared by every krb5 context.
+ * NOTE(review): no initialization of this lock is visible in this part
+ * of the file -- presumably spin_lock_init() runs in the module init
+ * path; confirm.
+ */
+static spinlock_t krb5_seq_lock;
+
+/*
+ * Static description of one supported Kerberos encryption type; the
+ * enctypes[] table below holds one of these per supported enctype number.
+ */
+struct krb5_enctype {
+	char	   *ke_dispname;	    /* name returned by enctype2str() */
+	char	   *ke_enc_name;	    /* linux tfm name of the cipher */
+	char	   *ke_hash_name;	   /* linux tfm name of the hash */
+	int	     ke_enc_mode;	    /* linux tfm mode */
+	int	     ke_hash_size;	   /* checksum size; 0 marks an unused slot */
+	int	     ke_conf_size;	   /* confounder size */
+	unsigned int    ke_hash_hmac:1;	 /* is hmac? */
+};
+
+/*
+ * Supported encryption types, indexed by Kerberos enctype number.  Slots
+ * that are not listed here stay zero-filled.
+ *
+ * NOTE: for aes128-cts and aes256-cts the MIT implementation uses CTS
+ * encryption.  We currently use plain CBC with padding instead, because
+ * linux doesn't support CTS yet.  This needs to be fixed in the future.
+ */
+static struct krb5_enctype enctypes[] = {
+	[ENCTYPE_DES_CBC_RAW] = {		/* des-cbc-md5 */
+		.ke_dispname	= "des-cbc-md5",
+		.ke_enc_name	= "cbc(des)",
+		.ke_hash_name	= "md5",
+		.ke_enc_mode	= 0,
+		.ke_hash_size	= 16,
+		.ke_conf_size	= 8,
+		.ke_hash_hmac	= 0,
+	},
+	[ENCTYPE_DES3_CBC_RAW] = {		/* des3-hmac-sha1 */
+		.ke_dispname	= "des3-hmac-sha1",
+		.ke_enc_name	= "cbc(des3_ede)",
+		.ke_hash_name	= "hmac(sha1)",
+		.ke_enc_mode	= 0,
+		.ke_hash_size	= 20,
+		.ke_conf_size	= 8,
+		.ke_hash_hmac	= 1,
+	},
+	[ENCTYPE_AES128_CTS_HMAC_SHA1_96] = {	/* aes128-cts */
+		.ke_dispname	= "aes128-cts-hmac-sha1-96",
+		.ke_enc_name	= "cbc(aes)",
+		.ke_hash_name	= "hmac(sha1)",
+		.ke_enc_mode	= 0,
+		.ke_hash_size	= 12,
+		.ke_conf_size	= 16,
+		.ke_hash_hmac	= 1,
+	},
+	[ENCTYPE_AES256_CTS_HMAC_SHA1_96] = {	/* aes256-cts */
+		.ke_dispname	= "aes256-cts-hmac-sha1-96",
+		.ke_enc_name	= "cbc(aes)",
+		.ke_hash_name	= "hmac(sha1)",
+		.ke_enc_mode	= 0,
+		.ke_hash_size	= 12,
+		.ke_conf_size	= 16,
+		.ke_hash_hmac	= 1,
+	},
+	[ENCTYPE_ARCFOUR_HMAC] = {		/* arcfour-hmac-md5 */
+		.ke_dispname	= "arcfour-hmac-md5",
+		.ke_enc_name	= "ecb(arc4)",
+		.ke_hash_name	= "hmac(md5)",
+		.ke_enc_mode	= 0,
+		.ke_hash_size	= 16,
+		.ke_conf_size	= 8,
+		.ke_hash_hmac	= 1,
+	},
+};
+
+/*
+ * Number of slots in enctypes[].  The expansion is parenthesized so the
+ * macro acts as a single operand wherever it is used, and the element
+ * size is taken from the array itself.
+ */
+#define MAX_ENCTYPES    (sizeof(enctypes) / sizeof(enctypes[0]))
+
+/*
+ * Map an enctype number to its display name.  Numbers outside the table,
+ * and table slots without a name, yield "unknown".
+ */
+static const char * enctype2str(__u32 enctype)
+{
+	const char *name = NULL;
+
+	if (enctype < MAX_ENCTYPES)
+		name = enctypes[enctype].ke_dispname;
+
+	return name ? name : "unknown";
+}
+
+/*
+ * Allocate the block cipher @alg_name (mode @alg_mode) for @kb and
+ * program it with the key bytes already stored in kb->kb_key.
+ *
+ * Returns 0 on success and -1 on failure.  When the allocation fails,
+ * kb->kb_tfm is reset to NULL: keyblock_free() only tests the pointer
+ * against NULL, so leaving the ERR_PTR() value in place would make the
+ * cleanup path hand an error pointer to ll_crypto_free_blkcipher().
+ * When only the setkey step fails, the tfm stays attached to @kb and is
+ * released later by keyblock_free().
+ */
+static
+int keyblock_init(struct krb5_keyblock *kb, char *alg_name, int alg_mode)
+{
+	kb->kb_tfm = ll_crypto_alloc_blkcipher(alg_name, alg_mode, 0);
+	if (IS_ERR(kb->kb_tfm)) {
+		CERROR("failed to alloc tfm: %s, mode %d\n",
+		       alg_name, alg_mode);
+		/* never leave an error pointer behind for keyblock_free() */
+		kb->kb_tfm = NULL;
+		return -1;
+	}
+
+	if (ll_crypto_blkcipher_setkey(kb->kb_tfm, kb->kb_key.data, kb->kb_key.len)) {
+		CERROR("failed to set %s key, len %d\n",
+		       alg_name, kb->kb_key.len);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Set up the cipher transforms a context needs for its enctype.
+ *
+ * Returns 0 on success, -1 if the enctype is not in enctypes[] (or its
+ * slot has a zero hash size) or if a keyblock could not be initialized.
+ */
+static
+int krb5_init_keys(struct krb5_ctx *kctx)
+{
+	struct krb5_enctype *ke = NULL;
+
+	if (kctx->kc_enctype < MAX_ENCTYPES)
+		ke = &enctypes[kctx->kc_enctype];
+
+	if (ke == NULL || ke->ke_hash_size == 0) {
+		CERROR("unsupported enctype %x\n", kctx->kc_enctype);
+		return -1;
+	}
+
+	/* tfm arc4 is stateful, user should alloc-use-free by his own */
+	if (kctx->kc_enctype != ENCTYPE_ARCFOUR_HMAC) {
+		if (keyblock_init(&kctx->kc_keye,
+				  ke->ke_enc_name, ke->ke_enc_mode))
+			return -1;
+	}
+
+	/* tfm hmac is stateful, user should alloc-use-free by his own */
+	if (ke->ke_hash_hmac == 0) {
+		if (keyblock_init(&kctx->kc_keyi,
+				  ke->ke_enc_name, ke->ke_enc_mode))
+			return -1;
+		if (keyblock_init(&kctx->kc_keyc,
+				  ke->ke_enc_name, ke->ke_enc_mode))
+			return -1;
+	}
+
+	return 0;
+}
+
+/* Release the key bytes of @kb and, when one is attached, its tfm. */
+static
+void keyblock_free(struct krb5_keyblock *kb)
+{
+	rawobj_free(&kb->kb_key);
+
+	if (kb->kb_tfm == NULL)
+		return;
+
+	ll_crypto_free_blkcipher(kb->kb_tfm);
+}
+
+/*
+ * Duplicate only the key bytes of @kb into @new; new->kb_tfm is not
+ * touched.  Returns whatever rawobj_dup() returns.
+ */
+static
+int keyblock_dup(struct krb5_keyblock *new, struct krb5_keyblock *kb)
+{
+	int rc;
+
+	rc = rawobj_dup(&new->kb_key, &kb->kb_key);
+	return rc;
+}
+
+/*
+ * Copy @len bytes from the cursor *@ptr into @res and advance the cursor.
+ *
+ * @end points one past the last readable byte.  Returns 0 on success;
+ * returns -1, leaving *@ptr untouched, when @len is negative or more
+ * than the bytes remaining before @end.
+ *
+ * The check compares lengths rather than computing p + len first, so no
+ * out-of-range pointer is ever formed.
+ */
+static
+int get_bytes(char **ptr, const char *end, void *res, int len)
+{
+	char *p = *ptr;
+
+	if (len < 0 || end < p || len > end - p)
+		return -1;
+
+	memcpy(res, p, len);
+	*ptr = p + len;
+	return 0;
+}
+
+/*
+ * Read a length-prefixed object from the cursor *@ptr: a __u32 length
+ * followed by that many bytes.  The bytes are copied into a buffer
+ * allocated with OBD_ALLOC_LARGE() and stored in @res.
+ *
+ * Returns 0 on success (cursor advanced past the object); -1 if the
+ * buffer is too short or the allocation fails (cursor untouched).
+ */
+static
+int get_rawobj(char **ptr, const char *end, rawobj_t *res)
+{
+	char   *p = *ptr;
+	__u32   len;
+
+	if (get_bytes(&p, end, &len, sizeof(len)))
+		return -1;
+
+	/*
+	 * get_bytes() succeeded, so p <= end here; compare against the
+	 * remaining length instead of forming p + len up front.
+	 */
+	if (len > (size_t)(end - p))
+		return -1;
+
+	OBD_ALLOC_LARGE(res->data, len);
+	if (!res->data)
+		return -1;
+
+	res->len = len;
+	memcpy(res->data, p, len);
+	*ptr = p + len;
+	return 0;
+}
+
+/*
+ * Read @keysize raw key bytes from the cursor *@ptr into a newly
+ * allocated buffer and attach it to kb->kb_key.
+ *
+ * @keysize comes from the context blob being parsed, so it is checked
+ * against the bytes remaining before @end before anything is allocated.
+ * Returns 0 on success, -1 on a short buffer or allocation failure; @kb
+ * is only modified on success.
+ */
+static
+int get_keyblock(char **ptr, const char *end,
+		 struct krb5_keyblock *kb, __u32 keysize)
+{
+	char *buf;
+
+	/* don't allocate for a key that cannot fit in the input */
+	if (end < *ptr || keysize > (size_t)(end - *ptr))
+		return -1;
+
+	OBD_ALLOC_LARGE(buf, keysize);
+	if (buf == NULL)
+		return -1;
+
+	if (get_bytes(ptr, end, buf, keysize)) {
+		OBD_FREE_LARGE(buf, keysize);
+		return -1;
+	}
+
+	kb->kb_key.len = keysize;
+	kb->kb_key.data = buf;
+	return 0;
+}
+
+/*
+ * Release everything a krb5 context owns: the three keyblocks and the
+ * mechanism OID.  The krb5_ctx structure itself is left to the caller.
+ */
+static
+void delete_context_kerberos(struct krb5_ctx *kctx)
+{
+	keyblock_free(&kctx->kc_keye);
+	keyblock_free(&kctx->kc_keyi);
+	keyblock_free(&kctx->kc_keyc);
+
+	rawobj_free(&kctx->kc_mech_used);
+}
+
+/*
+ * Parse an old-style (rfc1964) context blob located in [p, end) into
+ * @kctx.  Fields are consumed strictly in the order they appear below,
+ * and the blob must be used up exactly.
+ *
+ * Returns 0 on success and GSS_S_FAILURE on any short read, mismatch or
+ * allocation failure.  Nothing is released here on failure: whatever was
+ * already attached to @kctx is left for the caller to free.
+ */
+static
+__u32 import_context_rfc1964(struct krb5_ctx *kctx, char *p, char *end)
+{
+	unsigned int    tmp_uint, keysize;
+
+	/* seed_init flag */
+	if (get_bytes(&p, end, &tmp_uint, sizeof(tmp_uint)))
+		goto out_err;
+	kctx->kc_seed_init = (tmp_uint != 0);
+
+	/* seed */
+	if (get_bytes(&p, end, kctx->kc_seed, sizeof(kctx->kc_seed)))
+		goto out_err;
+
+	/* sign/seal algorithm, not really used now: both values are read
+	 * and discarded */
+	if (get_bytes(&p, end, &tmp_uint, sizeof(tmp_uint)) ||
+	    get_bytes(&p, end, &tmp_uint, sizeof(tmp_uint)))
+		goto out_err;
+
+	/* end time */
+	if (get_bytes(&p, end, &kctx->kc_endtime, sizeof(kctx->kc_endtime)))
+		goto out_err;
+
+	/* seq send */
+	if (get_bytes(&p, end, &tmp_uint, sizeof(tmp_uint)))
+		goto out_err;
+	kctx->kc_seq_send = tmp_uint;
+
+	/* mech oid */
+	if (get_rawobj(&p, end, &kctx->kc_mech_used))
+		goto out_err;
+
+	/* old style enc/seq keys in format:
+	 *   - enctype (u32)
+	 *   - keysize (u32)
+	 *   - keydata
+	 * we decompose them to fit into the new context
+	 */
+
+	/* enc key */
+	if (get_bytes(&p, end, &kctx->kc_enctype, sizeof(kctx->kc_enctype)))
+		goto out_err;
+
+	if (get_bytes(&p, end, &keysize, sizeof(keysize)))
+		goto out_err;
+
+	if (get_keyblock(&p, end, &kctx->kc_keye, keysize))
+		goto out_err;
+
+	/* seq key: must repeat the enctype and key size of the enc key */
+	if (get_bytes(&p, end, &tmp_uint, sizeof(tmp_uint)) ||
+	    tmp_uint != kctx->kc_enctype)
+		goto out_err;
+
+	if (get_bytes(&p, end, &tmp_uint, sizeof(tmp_uint)) ||
+	    tmp_uint != keysize)
+		goto out_err;
+
+	if (get_keyblock(&p, end, &kctx->kc_keyc, keysize))
+		goto out_err;
+
+	/* old style fallback: the blob carries no separate integrity key,
+	 * so kc_keyi gets a copy of the seq key bytes */
+	if (keyblock_dup(&kctx->kc_keyi, &kctx->kc_keyc))
+		goto out_err;
+
+	/* trailing bytes are treated as an error */
+	if (p != end)
+		goto out_err;
+
+	CDEBUG(D_SEC, "succesfully imported rfc1964 context\n");
+	return 0;
+out_err:
+	return GSS_S_FAILURE;
+}
+
+/* Bits of the flags word carried in a version 2 context blob */
+#define KRB5_CTX_FLAG_INITIATOR		0x00000001
+#define KRB5_CTX_FLAG_CFX		0x00000002
+#define KRB5_CTX_FLAG_ACCEPTOR_SUBKEY	0x00000004
+
+/*
+ * Parse a version 2 (rfc4121) context blob located in [p, end) into
+ * @kctx.  Fields are consumed strictly in the order they appear below.
+ *
+ * Returns 0 on success and GSS_S_FAILURE on a short read, a key count
+ * other than 3, or an allocation failure.  Nothing is released here on
+ * failure: whatever was already attached to @kctx is left for the caller
+ * to free.  Unlike the rfc1964 parser, no check is made that the blob
+ * was consumed completely.
+ */
+static
+__u32 import_context_rfc4121(struct krb5_ctx *kctx, char *p, char *end)
+{
+	unsigned int    tmp_uint, keysize;
+
+	/* end time */
+	if (get_bytes(&p, end, &kctx->kc_endtime, sizeof(kctx->kc_endtime)))
+		goto out_err;
+
+	/* flags */
+	if (get_bytes(&p, end, &tmp_uint, sizeof(tmp_uint)))
+		goto out_err;
+
+	/* bits are only ever set here, never cleared
+	 * NOTE(review): presumably @kctx arrives zero-filled -- confirm */
+	if (tmp_uint & KRB5_CTX_FLAG_INITIATOR)
+		kctx->kc_initiate = 1;
+	if (tmp_uint & KRB5_CTX_FLAG_CFX)
+		kctx->kc_cfx = 1;
+	if (tmp_uint & KRB5_CTX_FLAG_ACCEPTOR_SUBKEY)
+		kctx->kc_have_acceptor_subkey = 1;
+
+	/* seq send */
+	if (get_bytes(&p, end, &kctx->kc_seq_send, sizeof(kctx->kc_seq_send)))
+		goto out_err;
+
+	/* enctype */
+	if (get_bytes(&p, end, &kctx->kc_enctype, sizeof(kctx->kc_enctype)))
+		goto out_err;
+
+	/* size of each key */
+	if (get_bytes(&p, end, &keysize, sizeof(keysize)))
+		goto out_err;
+
+	/* number of keys - should always be 3 */
+	if (get_bytes(&p, end, &tmp_uint, sizeof(tmp_uint)))
+		goto out_err;
+
+	if (tmp_uint != 3) {
+		CERROR("Invalid number of keys: %u\n", tmp_uint);
+		goto out_err;
+	}
+
+	/* ke */
+	if (get_keyblock(&p, end, &kctx->kc_keye, keysize))
+		goto out_err;
+	/* ki */
+	if (get_keyblock(&p, end, &kctx->kc_keyi, keysize))
+		goto out_err;
+	/* kc */
+	if (get_keyblock(&p, end, &kctx->kc_keyc, keysize))
+		goto out_err;
+
+	CDEBUG(D_SEC, "succesfully imported v2 context\n");
+	return 0;
+out_err:
+	return GSS_S_FAILURE;
+}
+
+/*
+ * Build a kernel krb5 context from the blob in @inbuf and attach it to
+ * @gctx.
+ *
+ * The blob starts with a version word.  Versions 0 and 1 use the
+ * rfc1964 layout (the version value doubles as the initiator flag);
+ * version 2 uses the rfc4121 layout; anything larger is rejected.
+ *
+ * The whole purpose here is to keep the user level gss context parsing
+ * from nfs-utils as unchanged as we can; it is not quite mature yet and
+ * many things are still unclear, like heimdal etc.
+ *
+ * Returns GSS_S_COMPLETE on success and GSS_S_FAILURE otherwise; on
+ * failure the partially built context is destroyed here.
+ */
+static
+__u32 gss_import_sec_context_kerberos(rawobj_t *inbuf,
+				      struct gss_ctx *gctx)
+{
+	struct krb5_ctx *kctx;
+	char	    *cur = (char *) inbuf->data;
+	char	    *end = (char *) (inbuf->data + inbuf->len);
+	unsigned int     version, rc;
+
+	if (get_bytes(&cur, end, &version, sizeof(version))) {
+		CERROR("Fail to read version\n");
+		return GSS_S_FAILURE;
+	}
+
+	/* versions 0, 1 and 2 are understood */
+	if (version > 2) {
+		CERROR("Invalid version %u\n", version);
+		return GSS_S_FAILURE;
+	}
+
+	OBD_ALLOC_PTR(kctx);
+	if (!kctx)
+		return GSS_S_FAILURE;
+
+	switch (version) {
+	case 0:
+	case 1:
+		kctx->kc_initiate = version;
+		rc = import_context_rfc1964(kctx, cur, end);
+		break;
+	default:
+		/* only version 2 can reach this point */
+		rc = import_context_rfc4121(kctx, cur, end);
+		break;
+	}
+
+	if (rc == 0)
+		rc = krb5_init_keys(kctx);
+
+	if (rc == 0) {
+		gctx->internal_ctx_id = kctx;
+		return GSS_S_COMPLETE;
+	}
+
+	delete_context_kerberos(kctx);
+	OBD_FREE_PTR(kctx);
+
+	return GSS_S_FAILURE;
+}
+
+/*
+ * Create the mirror image of the context held by @gctx and attach it to
+ * @gctx_new: the initiator role is inverted and the send/receive
+ * sequence numbers are swapped, while the seed, flags, end time, enctype,
+ * mech OID and key bytes are copied.  The copy gets its own transforms
+ * through krb5_init_keys().
+ *
+ * Returns GSS_S_COMPLETE on success; on any failure the half-built copy
+ * is destroyed and GSS_S_FAILURE is returned.
+ */
+static
+__u32 gss_copy_reverse_context_kerberos(struct gss_ctx *gctx,
+					struct gss_ctx *gctx_new)
+{
+	struct krb5_ctx *orig = gctx->internal_ctx_id;
+	struct krb5_ctx *rev;
+
+	OBD_ALLOC_PTR(rev);
+	if (!rev)
+		return GSS_S_FAILURE;
+
+	/* the role is inverted, sequence numbers trade places */
+	rev->kc_initiate = !orig->kc_initiate;
+	rev->kc_seq_send = orig->kc_seq_recv;
+	rev->kc_seq_recv = orig->kc_seq_send;
+
+	/* everything else is carried over unchanged */
+	rev->kc_cfx = orig->kc_cfx;
+	rev->kc_seed_init = orig->kc_seed_init;
+	rev->kc_have_acceptor_subkey = orig->kc_have_acceptor_subkey;
+	rev->kc_endtime = orig->kc_endtime;
+	rev->kc_enctype = orig->kc_enctype;
+	memcpy(rev->kc_seed, orig->kc_seed, sizeof(orig->kc_seed));
+
+	if (rawobj_dup(&rev->kc_mech_used, &orig->kc_mech_used) ||
+	    keyblock_dup(&rev->kc_keye, &orig->kc_keye) ||
+	    keyblock_dup(&rev->kc_keyi, &orig->kc_keyi) ||
+	    keyblock_dup(&rev->kc_keyc, &orig->kc_keyc) ||
+	    krb5_init_keys(rev)) {
+		delete_context_kerberos(rev);
+		OBD_FREE_PTR(rev);
+		return GSS_S_FAILURE;
+	}
+
+	gctx_new->internal_ctx_id = rev;
+	CDEBUG(D_SEC, "succesfully copied reverse context\n");
+	return GSS_S_COMPLETE;
+}
+
+/*
+ * Report the context end time through @endtime.  The stored value is
+ * narrowed to 32 bits before being widened to unsigned long.
+ * Always returns GSS_S_COMPLETE.
+ */
+static
+__u32 gss_inquire_context_kerberos(struct gss_ctx *gctx,
+				   unsigned long  *endtime)
+{
+	struct krb5_ctx *kctx = gctx->internal_ctx_id;
+	__u32		 expiry = (__u32) kctx->kc_endtime;
+
+	*endtime = (unsigned long) expiry;
+	return GSS_S_COMPLETE;
+}
+
+/*
+ * Destroy the krb5 context passed as an opaque pointer: release what it
+ * owns, then the structure itself.
+ */
+static
+void gss_delete_sec_context_kerberos(void *internal_ctx)
+{
+	struct krb5_ctx *kctx;
+
+	kctx = internal_ctx;
+	delete_context_kerberos(kctx);
+	OBD_FREE_PTR(kctx);
+}
+
+/*
+ * Make @sg a single-entry scatterlist describing the @len bytes at @ptr.
+ *
+ * Callers hand in scatterlist entries that live on the stack and were
+ * never initialized, so the entry is initialized here (sg_init_one())
+ * rather than only having its buffer set with sg_set_buf().
+ */
+static
+void buf_to_sg(struct scatterlist *sg, void *ptr, int len)
+{
+	sg_init_one(sg, ptr, len);
+}
+
+/*
+ * Encrypt (or, when @decrypt is non-zero, decrypt) @length bytes from
+ * @in into @out with @tfm.
+ *
+ * @iv may be NULL, in which case an all-zero IV is used; otherwise the
+ * tfm's IV size worth of bytes is copied from it.  The data is first
+ * placed in @out and then transformed in place.  @in and @out may be the
+ * same buffer (krb5_digest_norm() calls it that way), so the copy is
+ * skipped in that case instead of running memcpy() on identical
+ * pointers.
+ *
+ * Returns the cipher's result, or -EINVAL (as __u32) when @length is not
+ * a multiple of the block size or the IV size exceeds 16 bytes.
+ */
+static
+__u32 krb5_encrypt(struct ll_crypto_cipher *tfm,
+		   int decrypt,
+		   void * iv,
+		   void * in,
+		   void * out,
+		   int length)
+{
+	struct blkcipher_desc desc;
+	struct scatterlist    sg;
+	__u8 local_iv[16] = {0};
+
+	LASSERT(tfm);
+	desc.tfm  = tfm;
+	desc.info = local_iv;
+	desc.flags= 0;
+
+	if (length % ll_crypto_blkcipher_blocksize(tfm) != 0) {
+		CERROR("output length %d mismatch blocksize %d\n",
+		       length, ll_crypto_blkcipher_blocksize(tfm));
+		return -EINVAL;
+	}
+
+	if (ll_crypto_blkcipher_ivsize(tfm) > 16) {
+		CERROR("iv size too large %d\n", ll_crypto_blkcipher_ivsize(tfm));
+		return -EINVAL;
+	}
+
+	if (iv)
+		memcpy(local_iv, iv, ll_crypto_blkcipher_ivsize(tfm));
+
+	if (out != in)
+		memcpy(out, in, length);
+	buf_to_sg(&sg, out, length);
+
+	if (decrypt)
+		return ll_crypto_blkcipher_decrypt_iv(&desc, &sg, &sg, length);
+
+	return ll_crypto_blkcipher_encrypt_iv(&desc, &sg, &sg, length);
+}
+
+
+/*
+ * Compute a keyed (HMAC) digest with @tfm, keyed by @key, and store it
+ * in cksum->data (the caller provides the buffer).
+ *
+ * Data is hashed in this order: every non-empty rawobj of @msgs, every
+ * non-empty page fragment of @iovs, then the krb5 header if @khdr is not
+ * NULL.  Returns the result of ll_crypto_hash_final(); the results of
+ * the setkey, init and update steps are not checked.
+ */
+static inline
+int krb5_digest_hmac(struct ll_crypto_hash *tfm,
+		     rawobj_t *key,
+		     struct krb5_header *khdr,
+		     int msgcnt, rawobj_t *msgs,
+		     int iovcnt, lnet_kiov_t *iovs,
+		     rawobj_t *cksum)
+{
+	struct hash_desc   desc;
+	struct scatterlist sg[1];
+	int		i;
+
+	/*
+	 * sg lives on the stack: initialize it once so that sg_set_page()
+	 * below never operates on an uninitialized entry, even when no
+	 * rawobj was hashed before the page fragments.
+	 */
+	sg_init_table(sg, 1);
+
+	ll_crypto_hash_setkey(tfm, key->data, key->len);
+	desc.tfm  = tfm;
+	desc.flags= 0;
+
+	ll_crypto_hash_init(&desc);
+
+	for (i = 0; i < msgcnt; i++) {
+		if (msgs[i].len == 0)
+			continue;
+		buf_to_sg(sg, (char *) msgs[i].data, msgs[i].len);
+		ll_crypto_hash_update(&desc, sg, msgs[i].len);
+	}
+
+	for (i = 0; i < iovcnt; i++) {
+		if (iovs[i].kiov_len == 0)
+			continue;
+
+		sg_set_page(&sg[0], iovs[i].kiov_page, iovs[i].kiov_len,
+			    iovs[i].kiov_offset);
+		ll_crypto_hash_update(&desc, sg, iovs[i].kiov_len);
+	}
+
+	if (khdr) {
+		buf_to_sg(sg, (char *) khdr, sizeof(*khdr));
+		ll_crypto_hash_update(&desc, sg, sizeof(*khdr));
+	}
+
+	return ll_crypto_hash_final(&desc, cksum->data);
+}
+
+
+/*
+ * Compute a plain (unkeyed) digest with @tfm into cksum->data, then
+ * encrypt that digest in place with the cipher attached to @kb.
+ *
+ * Data is hashed in this order: every non-empty rawobj of @msgs, every
+ * non-empty page fragment of @iovs, then the krb5 header if @khdr is not
+ * NULL.  Returns the result of krb5_encrypt(); the results of the hash
+ * steps are not checked.
+ */
+static inline
+int krb5_digest_norm(struct ll_crypto_hash *tfm,
+		     struct krb5_keyblock *kb,
+		     struct krb5_header *khdr,
+		     int msgcnt, rawobj_t *msgs,
+		     int iovcnt, lnet_kiov_t *iovs,
+		     rawobj_t *cksum)
+{
+	struct hash_desc   desc;
+	struct scatterlist sg[1];
+	int		i;
+
+	LASSERT(kb->kb_tfm);
+
+	/*
+	 * sg lives on the stack: initialize it once so that sg_set_page()
+	 * below never operates on an uninitialized entry, even when no
+	 * rawobj was hashed before the page fragments.
+	 */
+	sg_init_table(sg, 1);
+
+	desc.tfm  = tfm;
+	desc.flags= 0;
+
+	ll_crypto_hash_init(&desc);
+
+	for (i = 0; i < msgcnt; i++) {
+		if (msgs[i].len == 0)
+			continue;
+		buf_to_sg(sg, (char *) msgs[i].data, msgs[i].len);
+		ll_crypto_hash_update(&desc, sg, msgs[i].len);
+	}
+
+	for (i = 0; i < iovcnt; i++) {
+		if (iovs[i].kiov_len == 0)
+			continue;
+
+		sg_set_page(&sg[0], iovs[i].kiov_page, iovs[i].kiov_len,
+			    iovs[i].kiov_offset);
+		ll_crypto_hash_update(&desc, sg, iovs[i].kiov_len);
+	}
+
+	if (khdr) {
+		buf_to_sg(sg, (char *) khdr, sizeof(*khdr));
+		ll_crypto_hash_update(&desc, sg, sizeof(*khdr));
+	}
+
+	ll_crypto_hash_final(&desc, cksum->data);
+
+	/* in == out: the digest is encrypted where it sits */
+	return krb5_encrypt(kb->kb_tfm, 0, NULL, cksum->data,
+			    cksum->data, cksum->len);
+}
+
+/*
+ * Compute a (keyed/keyless) checksum over the plain text with the krb5
+ * wire token header appended.
+ *
+ * The hash is selected by @enctype (an index into enctypes[], not range
+ * checked here).  On success cksum->data is a newly allocated buffer of
+ * cksum->len (digest size) bytes that the caller must release, and
+ * GSS_S_COMPLETE is returned.
+ *
+ * On failure GSS_S_FAILURE is returned.  If the buffer had already been
+ * allocated it is released here and @cksum is reset to empty, because
+ * callers bail out on failure without freeing it.  If the hash could not
+ * be allocated at all, @cksum is left untouched.
+ */
+static
+__s32 krb5_make_checksum(__u32 enctype,
+			 struct krb5_keyblock *kb,
+			 struct krb5_header *khdr,
+			 int msgcnt, rawobj_t *msgs,
+			 int iovcnt, lnet_kiov_t *iovs,
+			 rawobj_t *cksum)
+{
+	struct krb5_enctype   *ke = &enctypes[enctype];
+	struct ll_crypto_hash *tfm;
+	__u32		  code = GSS_S_FAILURE;
+	int		    rc;
+
+	/*
+	 * Allocation failure is checked with IS_ERR() as in
+	 * keyblock_init(); the NULL test is kept as well.
+	 */
+	tfm = ll_crypto_alloc_hash(ke->ke_hash_name, 0, 0);
+	if (tfm == NULL || IS_ERR(tfm)) {
+		CERROR("failed to alloc TFM: %s\n", ke->ke_hash_name);
+		return GSS_S_FAILURE;
+	}
+
+	cksum->len = ll_crypto_hash_digestsize(tfm);
+	OBD_ALLOC_LARGE(cksum->data, cksum->len);
+	if (!cksum->data) {
+		cksum->len = 0;
+		goto out_tfm;
+	}
+
+	if (ke->ke_hash_hmac)
+		rc = krb5_digest_hmac(tfm, &kb->kb_key,
+				      khdr, msgcnt, msgs, iovcnt, iovs, cksum);
+	else
+		rc = krb5_digest_norm(tfm, kb,
+				      khdr, msgcnt, msgs, iovcnt, iovs, cksum);
+
+	if (rc == 0) {
+		code = GSS_S_COMPLETE;
+	} else {
+		/* don't leak the buffer when digesting failed */
+		OBD_FREE_LARGE(cksum->data, cksum->len);
+		cksum->data = NULL;
+		cksum->len = 0;
+	}
+out_tfm:
+	ll_crypto_free_hash(tfm);
+	return code;
+}
+
+/*
+ * Fill in an outgoing krb5 token header.
+ *
+ * @privacy selects between a wrap token (confidential flag set, EC and
+ * RRC zero) and a MIC token (EC and RRC 0xffff).  The acceptor flag is
+ * set when this side is not the initiator.  The sequence number is taken
+ * from kc_seq_send, which is incremented under krb5_seq_lock.
+ */
+static void fill_krb5_header(struct krb5_ctx *kctx,
+			     struct krb5_header *khdr,
+			     int privacy)
+{
+	unsigned char acceptor_flag;
+	__u16	      tok_id, ec_rrc;
+
+	acceptor_flag = kctx->kc_initiate ? 0 : FLAG_SENDER_IS_ACCEPTOR;
+
+	if (privacy) {
+		tok_id = KG_TOK_WRAP_MSG;
+		ec_rrc = 0;
+		khdr->kh_flags = acceptor_flag | FLAG_WRAP_CONFIDENTIAL;
+	} else {
+		tok_id = KG_TOK_MIC_MSG;
+		ec_rrc = 0xffff;
+		khdr->kh_flags = acceptor_flag;
+	}
+
+	khdr->kh_tok_id = cpu_to_be16(tok_id);
+	khdr->kh_ec = cpu_to_be16(ec_rrc);
+	khdr->kh_rrc = cpu_to_be16(ec_rrc);
+	khdr->kh_filler = 0xff;
+
+	spin_lock(&krb5_seq_lock);
+	khdr->kh_seq = cpu_to_be64(kctx->kc_seq_send++);
+	spin_unlock(&krb5_seq_lock);
+}
+
+/*
+ * Sanity-check a received krb5 token header against what the peer should
+ * have sent: token id, direction flag, confidential flag (wrap tokens
+ * only), filler byte, and the EC/RRC fields.
+ *
+ * Returns GSS_S_COMPLETE when everything matches, GSS_S_BAD_SIG for a
+ * wrong direction or missing confidential flag, and
+ * GSS_S_DEFECTIVE_TOKEN for the other mismatches.  Checks run in the
+ * order listed and the first failure wins.
+ */
+static __u32 verify_krb5_header(struct krb5_ctx *kctx,
+				struct krb5_header *khdr,
+				int privacy)
+{
+	/* the peer is the acceptor exactly when we are the initiator */
+	unsigned char want_dir = kctx->kc_initiate ?
+				 FLAG_SENDER_IS_ACCEPTOR : 0;
+	__u16	      want_tok = privacy ? KG_TOK_WRAP_MSG : KG_TOK_MIC_MSG;
+	__u16	      want_ec_rrc = privacy ? 0x0 : 0xffff;
+
+	/* sanity checks */
+	if (be16_to_cpu(khdr->kh_tok_id) != want_tok) {
+		CERROR("bad token id\n");
+		return GSS_S_DEFECTIVE_TOKEN;
+	}
+
+	if ((khdr->kh_flags & FLAG_SENDER_IS_ACCEPTOR) != want_dir) {
+		CERROR("bad direction flag\n");
+		return GSS_S_BAD_SIG;
+	}
+
+	if (privacy && !(khdr->kh_flags & FLAG_WRAP_CONFIDENTIAL)) {
+		CERROR("missing confidential flag\n");
+		return GSS_S_BAD_SIG;
+	}
+
+	if (khdr->kh_filler != 0xff) {
+		CERROR("bad filler\n");
+		return GSS_S_DEFECTIVE_TOKEN;
+	}
+
+	if (be16_to_cpu(khdr->kh_ec) != want_ec_rrc ||
+	    be16_to_cpu(khdr->kh_rrc) != want_ec_rrc) {
+		CERROR("bad EC or RRC\n");
+		return GSS_S_DEFECTIVE_TOKEN;
+	}
+
+	return GSS_S_COMPLETE;
+}
+
+/*
+ * Produce a MIC token for the given messages and page fragments.
+ *
+ * The token buffer @token receives a krb5 header followed by the last
+ * ke_hash_size bytes of the checksum computed with kc_keyc; token->len
+ * is set to the number of bytes written.  The buffer must be large
+ * enough (asserted).
+ *
+ * Returns GSS_S_COMPLETE, or GSS_S_FAILURE if the checksum could not be
+ * computed.
+ */
+static
+__u32 gss_get_mic_kerberos(struct gss_ctx *gctx,
+			   int msgcnt,
+			   rawobj_t *msgs,
+			   int iovcnt,
+			   lnet_kiov_t *iovs,
+			   rawobj_t *token)
+{
+	struct krb5_ctx     *kctx = gctx->internal_ctx_id;
+	struct krb5_enctype *ke = &enctypes[kctx->kc_enctype];
+	struct krb5_header  *khdr = (struct krb5_header *) token->data;
+	rawobj_t	     cksum = RAWOBJ_EMPTY;
+	int		     hash_size = ke->ke_hash_size;
+
+	/* fill krb5 header */
+	LASSERT(token->len >= sizeof(*khdr));
+	fill_krb5_header(kctx, khdr, 0);
+
+	/* checksum */
+	if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyc,
+			       khdr, msgcnt, msgs, iovcnt, iovs, &cksum))
+		return GSS_S_FAILURE;
+
+	LASSERT(cksum.len >= hash_size);
+	LASSERT(token->len >= sizeof(*khdr) + hash_size);
+
+	/* only the tail of the checksum goes on the wire */
+	memcpy(khdr + 1, cksum.data + cksum.len - hash_size, hash_size);
+	token->len = sizeof(*khdr) + hash_size;
+
+	rawobj_free(&cksum);
+	return GSS_S_COMPLETE;
+}
+
+/*
+ * Verify a received MIC token against the given messages and page
+ * fragments.
+ *
+ * The token must hold a valid krb5 header followed by ke_hash_size
+ * checksum bytes; the checksum is recomputed with kc_keyc and its tail
+ * compared with the received one.
+ *
+ * Returns GSS_S_COMPLETE on a match, GSS_S_BAD_SIG on a mismatch, the
+ * header verdict for a bad header, GSS_S_DEFECTIVE_TOKEN when the token
+ * is shorter than a header, and GSS_S_FAILURE when it is too short for
+ * the checksum or the checksum cannot be computed.
+ */
+static
+__u32 gss_verify_mic_kerberos(struct gss_ctx *gctx,
+			      int msgcnt,
+			      rawobj_t *msgs,
+			      int iovcnt,
+			      lnet_kiov_t *iovs,
+			      rawobj_t *token)
+{
+	struct krb5_ctx     *kctx = gctx->internal_ctx_id;
+	struct krb5_enctype *ke = &enctypes[kctx->kc_enctype];
+	struct krb5_header  *khdr;
+	rawobj_t	     cksum = RAWOBJ_EMPTY;
+	__u32		     major;
+
+	if (token->len < sizeof(*khdr)) {
+		CERROR("short signature: %u\n", token->len);
+		return GSS_S_DEFECTIVE_TOKEN;
+	}
+
+	khdr = (struct krb5_header *) token->data;
+
+	major = verify_krb5_header(kctx, khdr, 0);
+	if (major != GSS_S_COMPLETE) {
+		CERROR("bad krb5 header\n");
+		return major;
+	}
+
+	if (token->len < sizeof(*khdr) + ke->ke_hash_size) {
+		CERROR("short signature: %u, require %d\n",
+		       token->len, (int) sizeof(*khdr) + ke->ke_hash_size);
+		return GSS_S_FAILURE;
+	}
+
+	if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyc,
+			       khdr, msgcnt, msgs, iovcnt, iovs, &cksum)) {
+		CERROR("failed to make checksum\n");
+		return GSS_S_FAILURE;
+	}
+
+	LASSERT(cksum.len >= ke->ke_hash_size);
+
+	/* major is GSS_S_COMPLETE here; downgrade it on a mismatch */
+	if (memcmp(khdr + 1, cksum.data + cksum.len - ke->ke_hash_size,
+		   ke->ke_hash_size) != 0) {
+		CERROR("checksum mismatch\n");
+		major = GSS_S_BAD_SIG;
+	}
+
+	rawobj_free(&cksum);
+	return major;
+}
+
+/*
+ * Pad @msg in place up to the next multiple of @blocksize.  Each padding
+ * byte holds the padding length.  The mask arithmetic treats @blocksize
+ * as a power of two.
+ *
+ * @msg_buflen is the capacity of the buffer behind msg->data.  Returns 0
+ * on success (including when no padding is needed) and -EINVAL when the
+ * padded message would not fit.
+ */
+static
+int add_padding(rawobj_t *msg, int msg_buflen, int blocksize)
+{
+	int mask = blocksize - 1;
+	int padding = (blocksize - (msg->len & mask)) & mask;
+
+	if (padding == 0)
+		return 0;
+
+	if (msg->len + padding > msg_buflen) {
+		CERROR("bufsize %u too small: datalen %u, padding %u\n",
+			msg_buflen, msg->len, padding);
+		return -EINVAL;
+	}
+
+	memset(msg->data + msg->len, padding, padding);
+	msg->len += padding;
+
+	return 0;
+}
+
+/*
+ * Encrypt (@enc != 0) or decrypt (@enc == 0) the @inobj_cnt objects in
+ * @inobjs one after another with @tfm, writing the results back to back
+ * into the buffer of @outobj.
+ *
+ * @mode_ecb selects the cipher calls without an IV; otherwise the *_iv
+ * variants are used with a single zero-initialized local_iv shared by
+ * every object.
+ * NOTE(review): presumably the cipher updates local_iv between calls so
+ * that chaining continues across objects -- confirm.
+ *
+ * On success outobj->len is set to the total number of bytes produced
+ * and 0 is returned; on a cipher error its code is returned and
+ * outobj->len is left unchanged.  The output buffer must be large enough
+ * for all objects (asserted per object).
+ */
+static
+int krb5_encrypt_rawobjs(struct ll_crypto_cipher *tfm,
+			 int mode_ecb,
+			 int inobj_cnt,
+			 rawobj_t *inobjs,
+			 rawobj_t *outobj,
+			 int enc)
+{
+	struct blkcipher_desc desc;
+	struct scatterlist    src, dst;
+	__u8		  local_iv[16] = {0}, *buf;
+	__u32		 datalen = 0;
+	int		   i, rc;
+	ENTRY;
+
+	/* buf is the write cursor inside the output buffer */
+	buf = outobj->data;
+	desc.tfm  = tfm;
+	desc.info = local_iv;
+	desc.flags = 0;
+
+	for (i = 0; i < inobj_cnt; i++) {
+		LASSERT(buf + inobjs[i].len <= outobj->data + outobj->len);
+
+		/* dst covers all the space left in the output buffer */
+		buf_to_sg(&src, inobjs[i].data, inobjs[i].len);
+		buf_to_sg(&dst, buf, outobj->len - datalen);
+
+		if (mode_ecb) {
+			if (enc)
+				rc = ll_crypto_blkcipher_encrypt(
+					&desc, &dst, &src, src.length);
+			else
+				rc = ll_crypto_blkcipher_decrypt(
+					&desc, &dst, &src, src.length);
+		} else {
+			if (enc)
+				rc = ll_crypto_blkcipher_encrypt_iv(
+					&desc, &dst, &src, src.length);
+			else
+				rc = ll_crypto_blkcipher_decrypt_iv(
+					&desc, &dst, &src, src.length);
+		}
+
+		if (rc) {
+			CERROR("encrypt error %d\n", rc);
+			RETURN(rc);
+		}
+
+		datalen += inobjs[i].len;
+		buf += inobjs[i].len;
+	}
+
+	outobj->len = datalen;
+	RETURN(0);
+}
+
+/*
+ * Encrypt a bulk descriptor: first the confounder, then every clear page
+ * of desc->bd_iov into the matching page of desc->bd_enc_iov, and
+ * finally the krb5 header.  All steps use the *_iv cipher calls with one
+ * shared, zero-initialized local_iv.
+ *
+ * @cipher must be exactly blocksize + sizeof(*khdr) bytes (asserted): it
+ * receives the encrypted confounder followed by the encrypted header.
+ * Each page length is rounded up to a multiple of the block size, and
+ * bd_enc_iov[] offset/length are updated to match.
+ *
+ * if adj_nob != 0, we adjust desc->bd_nob to the actual cipher text size.
+ *
+ * Returns 0 on success or the cipher's error code.
+ */
+static
+int krb5_encrypt_bulk(struct ll_crypto_cipher *tfm,
+		      struct krb5_header *khdr,
+		      char *confounder,
+		      struct ptlrpc_bulk_desc *desc,
+		      rawobj_t *cipher,
+		      int adj_nob)
+{
+	struct blkcipher_desc   ciph_desc;
+	__u8		    local_iv[16] = {0};
+	struct scatterlist      src, dst;
+	int		     blocksize, i, rc, nob = 0;
+
+	LASSERT(desc->bd_iov_count);
+	LASSERT(desc->bd_enc_iov);
+
+	blocksize = ll_crypto_blkcipher_blocksize(tfm);
+	LASSERT(blocksize > 1);
+	LASSERT(cipher->len == blocksize + sizeof(*khdr));
+
+	ciph_desc.tfm  = tfm;
+	ciph_desc.info = local_iv;
+	ciph_desc.flags = 0;
+
+	/* encrypt confounder */
+	buf_to_sg(&src, confounder, blocksize);
+	buf_to_sg(&dst, cipher->data, blocksize);
+
+	rc = ll_crypto_blkcipher_encrypt_iv(&ciph_desc, &dst, &src, blocksize);
+	if (rc) {
+		CERROR("error to encrypt confounder: %d\n", rc);
+		return rc;
+	}
+
+	/* encrypt clear pages */
+	for (i = 0; i < desc->bd_iov_count; i++) {
+		/* source length is kiov_len rounded up to the block size */
+		sg_set_page(&src, desc->bd_iov[i].kiov_page,
+			    (desc->bd_iov[i].kiov_len + blocksize - 1) &
+			    (~(blocksize - 1)),
+			    desc->bd_iov[i].kiov_offset);
+		if (adj_nob)
+			nob += src.length;
+		/* cipher text lands at the same offset of the enc page */
+		sg_set_page(&dst, desc->bd_enc_iov[i].kiov_page, src.length,
+			    src.offset);
+
+		desc->bd_enc_iov[i].kiov_offset = dst.offset;
+		desc->bd_enc_iov[i].kiov_len = dst.length;
+
+		rc = ll_crypto_blkcipher_encrypt_iv(&ciph_desc, &dst, &src,
+						    src.length);
+		if (rc) {
+			CERROR("error to encrypt page: %d\n", rc);
+			return rc;
+		}
+	}
+
+	/* encrypt krb5 header */
+	buf_to_sg(&src, khdr, sizeof(*khdr));
+	buf_to_sg(&dst, cipher->data + blocksize, sizeof(*khdr));
+
+	rc = ll_crypto_blkcipher_encrypt_iv(&ciph_desc,
+					    &dst, &src, sizeof(*khdr));
+	if (rc) {
+		CERROR("error to encrypt krb5 header: %d\n", rc);
+		return rc;
+	}
+
+	if (adj_nob)
+		desc->bd_nob = nob;
+
+	return 0;
+}
+
+/*
+ * Decrypt a bulk descriptor: first the confounder (into @plain), then
+ * every received cipher page of desc->bd_enc_iov into desc->bd_iov, and
+ * finally the trailing krb5 header, which must match @khdr.
+ *
+ * desc->bd_nob_transferred is the size of the cipher text received.
+ * desc->bd_nob is the size the plain text is supposed to have.
+ *
+ * if adj_nob != 0, we adjust each page's kiov_len to the actual
+ * plain text size.
+ * - for client read: we don't know the data size for each page, so
+ *   bd_iov[]->kiov_len is set to PAGE_SIZE, but the actual data received
+ *   might be smaller, so we need to adjust it according to
+ *   bd_enc_iov[]->kiov_len.  this means we DO NOT support the situation
+ *   where the server sends odd-sized data in a page which is not the
+ *   last one.
+ * - for server write: we know exactly the data size expected for each
+ *   page, thus kiov_len is accurate already, so we should not adjust it
+ *   at all.  and bd_enc_iov[]->kiov_len should be
+ *   round_up(bd_iov[]->kiov_len), which should have been done by
+ *   prep_bulk().
+ *
+ * Returns 0 on success; -EPROTO when the transferred size is not a
+ * multiple of the block size; -EFAULT on a misaligned page or a size
+ * mismatch; -EACCES when the decrypted header differs from @khdr; or the
+ * cipher's error code.
+ */
+static
+int krb5_decrypt_bulk(struct ll_crypto_cipher *tfm,
+		      struct krb5_header *khdr,
+		      struct ptlrpc_bulk_desc *desc,
+		      rawobj_t *cipher,
+		      rawobj_t *plain,
+		      int adj_nob)
+{
+	struct blkcipher_desc   ciph_desc;
+	__u8		    local_iv[16] = {0};
+	struct scatterlist      src, dst;
+	/* running totals: cipher text consumed / plain text produced */
+	int		     ct_nob = 0, pt_nob = 0;
+	int		     blocksize, i, rc;
+
+	LASSERT(desc->bd_iov_count);
+	LASSERT(desc->bd_enc_iov);
+	LASSERT(desc->bd_nob_transferred);
+
+	blocksize = ll_crypto_blkcipher_blocksize(tfm);
+	LASSERT(blocksize > 1);
+	LASSERT(cipher->len == blocksize + sizeof(*khdr));
+
+	ciph_desc.tfm  = tfm;
+	ciph_desc.info = local_iv;
+	ciph_desc.flags = 0;
+
+	if (desc->bd_nob_transferred % blocksize) {
+		CERROR("odd transferred nob: %d\n", desc->bd_nob_transferred);
+		return -EPROTO;
+	}
+
+	/* decrypt head (confounder) */
+	buf_to_sg(&src, cipher->data, blocksize);
+	buf_to_sg(&dst, plain->data, blocksize);
+
+	rc = ll_crypto_blkcipher_decrypt_iv(&ciph_desc, &dst, &src, blocksize);
+	if (rc) {
+		CERROR("error to decrypt confounder: %d\n", rc);
+		return rc;
+	}
+
+	for (i = 0; i < desc->bd_iov_count && ct_nob < desc->bd_nob_transferred;
+	     i++) {
+		if (desc->bd_enc_iov[i].kiov_offset % blocksize != 0 ||
+		    desc->bd_enc_iov[i].kiov_len % blocksize != 0) {
+			CERROR("page %d: odd offset %u len %u, blocksize %d\n",
+			       i, desc->bd_enc_iov[i].kiov_offset,
+			       desc->bd_enc_iov[i].kiov_len, blocksize);
+			return -EFAULT;
+		}
+
+		if (adj_nob) {
+			/* clamp the cipher page to what was transferred */
+			if (ct_nob + desc->bd_enc_iov[i].kiov_len >
+			    desc->bd_nob_transferred)
+				desc->bd_enc_iov[i].kiov_len =
+					desc->bd_nob_transferred - ct_nob;
+
+			/* clamp the plain page to the expected total */
+			desc->bd_iov[i].kiov_len = desc->bd_enc_iov[i].kiov_len;
+			if (pt_nob + desc->bd_enc_iov[i].kiov_len >desc->bd_nob)
+				desc->bd_iov[i].kiov_len = desc->bd_nob -pt_nob;
+		} else {
+			/* this should be guaranteed by LNET */
+			LASSERT(ct_nob + desc->bd_enc_iov[i].kiov_len <=
+				desc->bd_nob_transferred);
+			LASSERT(desc->bd_iov[i].kiov_len <=
+				desc->bd_enc_iov[i].kiov_len);
+		}
+
+		if (desc->bd_enc_iov[i].kiov_len == 0)
+			continue;
+
+		sg_set_page(&src, desc->bd_enc_iov[i].kiov_page,
+			    desc->bd_enc_iov[i].kiov_len,
+			    desc->bd_enc_iov[i].kiov_offset);
+		/*
+		 * A plain length that is a whole number of blocks is
+		 * decrypted straight into the destination page; otherwise
+		 * the cipher page is decrypted in place and the wanted
+		 * bytes are copied out below.
+		 */
+		dst = src;
+		if (desc->bd_iov[i].kiov_len % blocksize == 0)
+			sg_assign_page(&dst, desc->bd_iov[i].kiov_page);
+
+		rc = ll_crypto_blkcipher_decrypt_iv(&ciph_desc, &dst, &src,
+						    src.length);
+		if (rc) {
+			CERROR("error to decrypt page: %d\n", rc);
+			return rc;
+		}
+
+		if (desc->bd_iov[i].kiov_len % blocksize != 0) {
+			/* NOTE(review): the source address uses the bd_iov
+			 * offset, not the bd_enc_iov one -- presumably the
+			 * two are always equal; confirm */
+			memcpy(page_address(desc->bd_iov[i].kiov_page) +
+			       desc->bd_iov[i].kiov_offset,
+			       page_address(desc->bd_enc_iov[i].kiov_page) +
+			       desc->bd_iov[i].kiov_offset,
+			       desc->bd_iov[i].kiov_len);
+		}
+
+		ct_nob += desc->bd_enc_iov[i].kiov_len;
+		pt_nob += desc->bd_iov[i].kiov_len;
+	}
+
+	if (unlikely(ct_nob != desc->bd_nob_transferred)) {
+		CERROR("%d cipher text transferred but only %d decrypted\n",
+		       desc->bd_nob_transferred, ct_nob);
+		return -EFAULT;
+	}
+
+	if (unlikely(!adj_nob && pt_nob != desc->bd_nob)) {
+		CERROR("%d plain text expected but only %d received\n",
+		       desc->bd_nob, pt_nob);
+		return -EFAULT;
+	}
+
+	/* if needed, clear up the rest unused iovs */
+	if (adj_nob)
+		while (i < desc->bd_iov_count)
+			desc->bd_iov[i++].kiov_len = 0;
+
+	/* decrypt tail (krb5 header), in place inside @cipher */
+	buf_to_sg(&src, cipher->data + blocksize, sizeof(*khdr));
+	buf_to_sg(&dst, cipher->data + blocksize, sizeof(*khdr));
+
+	rc = ll_crypto_blkcipher_decrypt_iv(&ciph_desc,
+					    &dst, &src, sizeof(*khdr));
+	if (rc) {
+		CERROR("error to decrypt tail: %d\n", rc);
+		return rc;
+	}
+
+	if (memcmp(cipher->data + blocksize, khdr, sizeof(*khdr))) {
+		CERROR("krb5 header doesn't match\n");
+		return -EACCES;
+	}
+
+	return 0;
+}
+
+static
+__u32 gss_wrap_kerberos(struct gss_ctx *gctx,
+			rawobj_t *gsshdr,
+			rawobj_t *msg,
+			int msg_buflen,
+			rawobj_t *token)
+{
+	struct krb5_ctx     *kctx = gctx->internal_ctx_id;
+	struct krb5_enctype *ke = &enctypes[kctx->kc_enctype];
+	struct krb5_header  *khdr;
+	int		  blocksize;
+	rawobj_t	     cksum = RAWOBJ_EMPTY;
+	rawobj_t	     data_desc[3], cipher;
+	__u8		 conf[GSS_MAX_CIPHER_BLOCK];
+	int		  rc = 0;
+
+	LASSERT(ke);
+	LASSERT(ke->ke_conf_size <= GSS_MAX_CIPHER_BLOCK);
+	LASSERT(kctx->kc_keye.kb_tfm == NULL ||
+		ke->ke_conf_size >=
+		ll_crypto_blkcipher_blocksize(kctx->kc_keye.kb_tfm));
+
+	/*
+	 * final token format:
+	 * ---------------------------------------------------
+	 * | krb5 header | cipher text | checksum (16 bytes) |
+	 * ---------------------------------------------------
+	 */
+
+	/* fill krb5 header */
+	LASSERT(token->len >= sizeof(*khdr));
+	khdr = (struct krb5_header *) token->data;
+	fill_krb5_header(kctx, khdr, 1);
+
+	/* generate confounder */
+	cfs_get_random_bytes(conf, ke->ke_conf_size);
+
+	/* get encryption blocksize. note kc_keye might not associated with
+	 * a tfm, currently only for arcfour-hmac */
+	if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) {
+		LASSERT(kctx->kc_keye.kb_tfm == NULL);
+		blocksize = 1;
+	} else {
+		LASSERT(kctx->kc_keye.kb_tfm);
+		blocksize = ll_crypto_blkcipher_blocksize(kctx->kc_keye.kb_tfm);
+	}
+	LASSERT(blocksize <= ke->ke_conf_size);
+
+	/* padding the message */
+	if (add_padding(msg, msg_buflen, blocksize))
+		return GSS_S_FAILURE;
+
+	/*
+	 * clear text layout for checksum:
+	 * ------------------------------------------------------
+	 * | confounder | gss header | clear msgs | krb5 header |
+	 * ------------------------------------------------------
+	 */
+	data_desc[0].data = conf;
+	data_desc[0].len = ke->ke_conf_size;
+	data_desc[1].data = gsshdr->data;
+	data_desc[1].len = gsshdr->len;
+	data_desc[2].data = msg->data;
+	data_desc[2].len = msg->len;
+
+	/* compute checksum */
+	if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyi,
+			       khdr, 3, data_desc, 0, NULL, &cksum))
+		return GSS_S_FAILURE;
+	LASSERT(cksum.len >= ke->ke_hash_size);
+
+	/*
+	 * clear text layout for encryption:
+	 * -----------------------------------------
+	 * | confounder | clear msgs | krb5 header |
+	 * -----------------------------------------
+	 */
+	data_desc[0].data = conf;
+	data_desc[0].len = ke->ke_conf_size;
+	data_desc[1].data = msg->data;
+	data_desc[1].len = msg->len;
+	data_desc[2].data = (__u8 *) khdr;
+	data_desc[2].len = sizeof(*khdr);
+
+	/* cipher text will be directly inplace */
+	cipher.data = (__u8 *) (khdr + 1);
+	cipher.len = token->len - sizeof(*khdr);
+	LASSERT(cipher.len >= ke->ke_conf_size + msg->len + sizeof(*khdr));
+
+	if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) {
+		rawobj_t		 arc4_keye;
+		struct ll_crypto_cipher *arc4_tfm;
+
+		if (krb5_make_checksum(ENCTYPE_ARCFOUR_HMAC, &kctx->kc_keyi,
+				       NULL, 1, &cksum, 0, NULL, &arc4_keye)) {
+			CERROR("failed to obtain arc4 enc key\n");
+			GOTO(arc4_out, rc = -EACCES);
+		}
+
+		arc4_tfm = ll_crypto_alloc_blkcipher("ecb(arc4)", 0, 0);
+		if (IS_ERR(arc4_tfm)) {
+			CERROR("failed to alloc tfm arc4 in ECB mode\n");
+			GOTO(arc4_out_key, rc = -EACCES);
+		}
+
+		if (ll_crypto_blkcipher_setkey(arc4_tfm, arc4_keye.data,
+					       arc4_keye.len)) {
+			CERROR("failed to set arc4 key, len %d\n",
+			       arc4_keye.len);
+			GOTO(arc4_out_tfm, rc = -EACCES);
+		}
+
+		rc = krb5_encrypt_rawobjs(arc4_tfm, 1,
+					  3, data_desc, &cipher, 1);
+arc4_out_tfm:
+		ll_crypto_free_blkcipher(arc4_tfm);
+arc4_out_key:
+		rawobj_free(&arc4_keye);
+arc4_out:
+		do {} while(0); /* just to avoid compile warning */
+	} else {
+		rc = krb5_encrypt_rawobjs(kctx->kc_keye.kb_tfm, 0,
+					  3, data_desc, &cipher, 1);
+	}
+
+	if (rc != 0) {
+		rawobj_free(&cksum);
+		return GSS_S_FAILURE;
+	}
+
+	/* fill in checksum */
+	LASSERT(token->len >= sizeof(*khdr) + cipher.len + ke->ke_hash_size);
+	memcpy((char *)(khdr + 1) + cipher.len,
+	       cksum.data + cksum.len - ke->ke_hash_size,
+	       ke->ke_hash_size);
+	rawobj_free(&cksum);
+
+	/* final token length */
+	token->len = sizeof(*khdr) + cipher.len + ke->ke_hash_size;
+	return GSS_S_COMPLETE;
+}
+
+/*
+ * Prepare the encryption iovs of a bulk descriptor.
+ *
+ * For every page of @desc, verify that the clear-text offset is a multiple
+ * of the cipher block size, then copy that offset into the matching
+ * bd_enc_iov entry and set its length to the clear-text length rounded up
+ * to a whole number of cipher blocks.
+ *
+ * Returns GSS_S_COMPLETE on success, or GSS_S_FAILURE as soon as a page
+ * with an unaligned offset is found.
+ */
+static
+__u32 gss_prep_bulk_kerberos(struct gss_ctx *gctx,
+			     struct ptlrpc_bulk_desc *desc)
+{
+	struct krb5_ctx     *kctx = gctx->internal_ctx_id;
+	int		  blocksize, i;
+
+	LASSERT(desc->bd_iov_count);
+	LASSERT(desc->bd_enc_iov);
+	LASSERT(kctx->kc_keye.kb_tfm);
+
+	blocksize = ll_crypto_blkcipher_blocksize(kctx->kc_keye.kb_tfm);
+
+	for (i = 0; i < desc->bd_iov_count; i++) {
+		LASSERT(desc->bd_enc_iov[i].kiov_page);
+		/*
+		 * offset should always start at page boundary of either
+		 * client or server side.
+		 *
+		 * Test all low bits with (blocksize - 1); masking with
+		 * blocksize itself would only look at a single bit and
+		 * misjudge alignment.  The round-up below already relies on
+		 * blocksize being a power of two.
+		 */
+		if (desc->bd_iov[i].kiov_offset & (blocksize - 1)) {
+			CERROR("odd offset %d in page %d\n",
+			       desc->bd_iov[i].kiov_offset, i);
+			return GSS_S_FAILURE;
+		}
+
+		desc->bd_enc_iov[i].kiov_offset = desc->bd_iov[i].kiov_offset;
+		/* cipher text length: clear length rounded up to blocksize */
+		desc->bd_enc_iov[i].kiov_len = (desc->bd_iov[i].kiov_len +
+						blocksize - 1) & (~(blocksize - 1));
+	}
+
+	return GSS_S_COMPLETE;
+}
+
+/*
+ * Build the privacy token for a bulk transfer.
+ *
+ * Fills a krb5 header at the start of @token, computes a checksum over a
+ * random confounder, the clear pages of @desc and the header, encrypts via
+ * krb5_encrypt_bulk(), and appends the last ke_hash_size bytes of the
+ * checksum after the cipher text stored in @token.  On success token->len
+ * is set to the number of bytes actually used.
+ *
+ * @adj_nob is passed through unchanged to krb5_encrypt_bulk().
+ *
+ * Returns GSS_S_COMPLETE on success, GSS_S_FAILURE if checksumming or
+ * encryption fails.  The arcfour-hmac enctype hits LBUG() here.
+ */
+static
+__u32 gss_wrap_bulk_kerberos(struct gss_ctx *gctx,
+			     struct ptlrpc_bulk_desc *desc,
+			     rawobj_t *token, int adj_nob)
+{
+	struct krb5_ctx     *kctx = gctx->internal_ctx_id;
+	struct krb5_enctype *ke = &enctypes[kctx->kc_enctype];
+	struct krb5_header  *khdr;
+	int		  blocksize;
+	rawobj_t	     cksum = RAWOBJ_EMPTY;
+	rawobj_t	     data_desc[1], cipher;
+	__u8		 conf[GSS_MAX_CIPHER_BLOCK];
+	int		  rc = 0;
+
+	LASSERT(ke);
+	LASSERT(ke->ke_conf_size <= GSS_MAX_CIPHER_BLOCK);
+
+	/*
+	 * final token format:
+	 * --------------------------------------------------
+	 * | krb5 header | head/tail cipher text | checksum |
+	 * --------------------------------------------------
+	 */
+
+	/* fill krb5 header */
+	LASSERT(token->len >= sizeof(*khdr));
+	khdr = (struct krb5_header *) token->data;
+	fill_krb5_header(kctx, khdr, 1);
+
+	/* generate confounder */
+	cfs_get_random_bytes(conf, ke->ke_conf_size);
+
+	/* get encryption blocksize. note kc_keye might not associated with
+	 * a tfm, currently only for arcfour-hmac */
+	if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) {
+		LASSERT(kctx->kc_keye.kb_tfm == NULL);
+		blocksize = 1;
+	} else {
+		LASSERT(kctx->kc_keye.kb_tfm);
+		blocksize = ll_crypto_blkcipher_blocksize(kctx->kc_keye.kb_tfm);
+	}
+
+	/*
+	 * we assume the size of krb5_header (16 bytes) must be n * blocksize.
+	 * the bulk token size would be exactly (sizeof(krb5_header) +
+	 * blocksize + sizeof(krb5_header) + hashsize)
+	 */
+	LASSERT(blocksize <= ke->ke_conf_size);
+	LASSERT(sizeof(*khdr) >= blocksize && sizeof(*khdr) % blocksize == 0);
+	/* NOTE(review): the literal 16 stands where the comment above says
+	 * "hashsize"; the final size check further down uses
+	 * ke->ke_hash_size instead - confirm which is intended. */
+	LASSERT(token->len >= sizeof(*khdr) + blocksize + sizeof(*khdr) + 16);
+
+	/*
+	 * clear text layout for checksum:
+	 * ------------------------------------------
+	 * | confounder | clear pages | krb5 header |
+	 * ------------------------------------------
+	 */
+	data_desc[0].data = conf;
+	data_desc[0].len = ke->ke_conf_size;
+
+	/* compute checksum */
+	if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyi,
+			       khdr, 1, data_desc,
+			       desc->bd_iov_count, desc->bd_iov,
+			       &cksum))
+		return GSS_S_FAILURE;
+	LASSERT(cksum.len >= ke->ke_hash_size);
+
+	/*
+	 * clear text layout for encryption:
+	 * ------------------------------------------
+	 * | confounder | clear pages | krb5 header |
+	 * ------------------------------------------
+	 *	|	      |	     |
+	 *	----------  (cipher pages)   |
+	 * result token:   |		   |
+	 * -------------------------------------------
+	 * | krb5 header | cipher text | cipher text |
+	 * -------------------------------------------
+	 */
+	/* same values as assigned before the checksum; data_desc is not
+	 * passed to krb5_encrypt_bulk() below */
+	data_desc[0].data = conf;
+	data_desc[0].len = ke->ke_conf_size;
+
+	/* cipher text is written into the token right after the header */
+	cipher.data = (__u8 *) (khdr + 1);
+	cipher.len = blocksize + sizeof(*khdr);
+
+	if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) {
+		LBUG();
+		rc = 0;
+	} else {
+		rc = krb5_encrypt_bulk(kctx->kc_keye.kb_tfm, khdr,
+				       conf, desc, &cipher, adj_nob);
+	}
+
+	if (rc != 0) {
+		rawobj_free(&cksum);
+		return GSS_S_FAILURE;
+	}
+
+	/* fill in checksum */
+	LASSERT(token->len >= sizeof(*khdr) + cipher.len + ke->ke_hash_size);
+	/* only the trailing ke_hash_size bytes of the checksum are stored */
+	memcpy((char *)(khdr + 1) + cipher.len,
+	       cksum.data + cksum.len - ke->ke_hash_size,
+	       ke->ke_hash_size);
+	rawobj_free(&cksum);
+
+	/* final token length */
+	token->len = sizeof(*khdr) + cipher.len + ke->ke_hash_size;
+	return GSS_S_COMPLETE;
+}
+
+/*
+ * Decrypt and verify a privacy token produced by the wrap operation.
+ *
+ * @gsshdr: GSS header, included in the checksum but not in the token
+ * @token:  received token: krb5 header | cipher text | checksum
+ * @msg:    output buffer; on success receives the clear message and
+ *          msg->len is set to its length
+ *
+ * The cipher text is decrypted into a temporary buffer, the krb5 header
+ * copy at its tail is compared against the one at the start of the token,
+ * and the checksum over confounder, GSS header, clear message and krb5
+ * header is compared with the one stored at the end of the token.
+ *
+ * Returns GSS_S_COMPLETE on success, GSS_S_DEFECTIVE_TOKEN for malformed
+ * token sizes, the status from verify_krb5_header() for a bad header, and
+ * GSS_S_FAILURE for every other error.
+ */
+static
+__u32 gss_unwrap_kerberos(struct gss_ctx  *gctx,
+			  rawobj_t	*gsshdr,
+			  rawobj_t	*token,
+			  rawobj_t	*msg)
+{
+	struct krb5_ctx     *kctx = gctx->internal_ctx_id;
+	struct krb5_enctype *ke = &enctypes[kctx->kc_enctype];
+	struct krb5_header  *khdr;
+	unsigned char       *tmpbuf;
+	int		  blocksize, bodysize;
+	rawobj_t	     cksum = RAWOBJ_EMPTY;
+	rawobj_t	     cipher_in, plain_out;
+	rawobj_t	     hash_objs[3];
+	int		  rc = 0;
+	__u32		major;
+
+	LASSERT(ke);
+
+	if (token->len < sizeof(*khdr)) {
+		CERROR("short signature: %u\n", token->len);
+		return GSS_S_DEFECTIVE_TOKEN;
+	}
+
+	khdr = (struct krb5_header *) token->data;
+
+	major = verify_krb5_header(kctx, khdr, 1);
+	if (major != GSS_S_COMPLETE) {
+		CERROR("bad krb5 header\n");
+		return major;
+	}
+
+	/* block size */
+	if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) {
+		LASSERT(kctx->kc_keye.kb_tfm == NULL);
+		blocksize = 1;
+	} else {
+		LASSERT(kctx->kc_keye.kb_tfm);
+		blocksize = ll_crypto_blkcipher_blocksize(kctx->kc_keye.kb_tfm);
+	}
+
+	/* expected token layout:
+	 * ----------------------------------------
+	 * | krb5 header | cipher text | checksum |
+	 * ----------------------------------------
+	 */
+
+	/* the token must at least hold the header and the checksum,
+	 * otherwise the unsigned subtraction below would wrap around */
+	if (token->len < sizeof(*khdr) + ke->ke_hash_size) {
+		CERROR("short token size: %u\n", token->len);
+		return GSS_S_DEFECTIVE_TOKEN;
+	}
+
+	bodysize = token->len - sizeof(*khdr) - ke->ke_hash_size;
+
+	if (bodysize % blocksize) {
+		CERROR("odd bodysize %d\n", bodysize);
+		return GSS_S_DEFECTIVE_TOKEN;
+	}
+
+	/* body must hold confounder, trailing header and at least one byte
+	 * of message */
+	if (bodysize <= ke->ke_conf_size + sizeof(*khdr)) {
+		CERROR("incomplete token: bodysize %d\n", bodysize);
+		return GSS_S_DEFECTIVE_TOKEN;
+	}
+
+	if (msg->len < bodysize - ke->ke_conf_size - sizeof(*khdr)) {
+		/* report the same size the check above requires */
+		CERROR("buffer too small: %u, require %d\n",
+		       msg->len,
+		       (int)(bodysize - ke->ke_conf_size - sizeof(*khdr)));
+		return GSS_S_FAILURE;
+	}
+
+	/* decrypting */
+	OBD_ALLOC_LARGE(tmpbuf, bodysize);
+	if (!tmpbuf)
+		return GSS_S_FAILURE;
+
+	/* every "goto out_free" from here on reports GSS_S_FAILURE */
+	major = GSS_S_FAILURE;
+
+	cipher_in.data = (__u8 *) (khdr + 1);
+	cipher_in.len = bodysize;
+	plain_out.data = tmpbuf;
+	plain_out.len = bodysize;
+
+	if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) {
+		rawobj_t		 arc4_keye;
+		struct ll_crypto_cipher *arc4_tfm;
+
+		/* the arc4 key is derived from the checksum stored at the
+		 * end of the token; cksum only borrows token memory here */
+		cksum.data = token->data + token->len - ke->ke_hash_size;
+		cksum.len = ke->ke_hash_size;
+
+		if (krb5_make_checksum(ENCTYPE_ARCFOUR_HMAC, &kctx->kc_keyi,
+				       NULL, 1, &cksum, 0, NULL, &arc4_keye)) {
+			CERROR("failed to obtain arc4 enc key\n");
+			GOTO(arc4_out, rc = -EACCES);
+		}
+
+		arc4_tfm = ll_crypto_alloc_blkcipher("ecb(arc4)", 0, 0);
+		if (IS_ERR(arc4_tfm)) {
+			CERROR("failed to alloc tfm arc4 in ECB mode\n");
+			GOTO(arc4_out_key, rc = -EACCES);
+		}
+
+		if (ll_crypto_blkcipher_setkey(arc4_tfm,
+					 arc4_keye.data, arc4_keye.len)) {
+			CERROR("failed to set arc4 key, len %d\n",
+			       arc4_keye.len);
+			GOTO(arc4_out_tfm, rc = -EACCES);
+		}
+
+		rc = krb5_encrypt_rawobjs(arc4_tfm, 1,
+					  1, &cipher_in, &plain_out, 0);
+arc4_out_tfm:
+		ll_crypto_free_blkcipher(arc4_tfm);
+arc4_out_key:
+		rawobj_free(&arc4_keye);
+arc4_out:
+		/* drop the borrowed pointer so out_free does not free
+		 * token memory */
+		cksum = RAWOBJ_EMPTY;
+	} else {
+		rc = krb5_encrypt_rawobjs(kctx->kc_keye.kb_tfm, 0,
+					  1, &cipher_in, &plain_out, 0);
+	}
+
+	if (rc != 0) {
+		CERROR("error decrypt\n");
+		goto out_free;
+	}
+	LASSERT(plain_out.len == bodysize);
+
+	/* expected clear text layout:
+	 * -----------------------------------------
+	 * | confounder | clear msgs | krb5 header |
+	 * -----------------------------------------
+	 */
+
+	/* verify krb5 header in token is not modified */
+	if (memcmp(khdr, plain_out.data + plain_out.len - sizeof(*khdr),
+		   sizeof(*khdr))) {
+		CERROR("decrypted krb5 header mismatch\n");
+		goto out_free;
+	}
+
+	/* verify checksum, compose clear text as layout:
+	 * ------------------------------------------------------
+	 * | confounder | gss header | clear msgs | krb5 header |
+	 * ------------------------------------------------------
+	 */
+	hash_objs[0].len = ke->ke_conf_size;
+	hash_objs[0].data = plain_out.data;
+	hash_objs[1].len = gsshdr->len;
+	hash_objs[1].data = gsshdr->data;
+	hash_objs[2].len = plain_out.len - ke->ke_conf_size - sizeof(*khdr);
+	hash_objs[2].data = plain_out.data + ke->ke_conf_size;
+	if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyi,
+			       khdr, 3, hash_objs, 0, NULL, &cksum))
+		goto out_free;
+
+	LASSERT(cksum.len >= ke->ke_hash_size);
+	/* compare against the checksum stored right after the cipher text */
+	if (memcmp((char *)(khdr + 1) + bodysize,
+		   cksum.data + cksum.len - ke->ke_hash_size,
+		   ke->ke_hash_size)) {
+		CERROR("checksum mismatch\n");
+		goto out_free;
+	}
+
+	/* hand the clear message (without confounder and header) back */
+	msg->len =  bodysize - ke->ke_conf_size - sizeof(*khdr);
+	memcpy(msg->data, tmpbuf + ke->ke_conf_size, msg->len);
+
+	major = GSS_S_COMPLETE;
+out_free:
+	OBD_FREE_LARGE(tmpbuf, bodysize);
+	rawobj_free(&cksum);
+	return major;
+}
+
+/*
+ * Decrypt and verify the privacy token of a bulk transfer.
+ *
+ * Validates the krb5 header at the start of @token, decrypts through
+ * krb5_decrypt_bulk() with the head/tail cipher text of the token being
+ * decrypted in place, then recomputes the checksum over the decrypted
+ * confounder, the pages of @desc and the krb5 header and compares it with
+ * the checksum stored in the token.
+ *
+ * @adj_nob is passed through unchanged to krb5_decrypt_bulk().
+ *
+ * Returns GSS_S_COMPLETE on success, GSS_S_DEFECTIVE_TOKEN for a short
+ * token or a decryption error, GSS_S_FAILURE if the checksum cannot be
+ * computed, GSS_S_BAD_SIG on checksum mismatch, or the status from
+ * verify_krb5_header().  The arcfour-hmac enctype hits LBUG() here.
+ */
+static
+__u32 gss_unwrap_bulk_kerberos(struct gss_ctx *gctx,
+			       struct ptlrpc_bulk_desc *desc,
+			       rawobj_t *token, int adj_nob)
+{
+	struct krb5_ctx     *kctx = gctx->internal_ctx_id;
+	struct krb5_enctype *ke = &enctypes[kctx->kc_enctype];
+	struct krb5_header  *khdr;
+	int		  blocksize;
+	rawobj_t	     cksum = RAWOBJ_EMPTY;
+	rawobj_t	     cipher, plain;
+	rawobj_t	     data_desc[1];
+	int		  rc;
+	__u32		major;
+
+	LASSERT(ke);
+
+	if (token->len < sizeof(*khdr)) {
+		CERROR("short signature: %u\n", token->len);
+		return GSS_S_DEFECTIVE_TOKEN;
+	}
+
+	khdr = (struct krb5_header *) token->data;
+
+	major = verify_krb5_header(kctx, khdr, 1);
+	if (major != GSS_S_COMPLETE) {
+		CERROR("bad krb5 header\n");
+		return major;
+	}
+
+	/* block size */
+	if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) {
+		LASSERT(kctx->kc_keye.kb_tfm == NULL);
+		blocksize = 1;
+		LBUG();
+	} else {
+		LASSERT(kctx->kc_keye.kb_tfm);
+		blocksize = ll_crypto_blkcipher_blocksize(kctx->kc_keye.kb_tfm);
+	}
+	LASSERT(sizeof(*khdr) >= blocksize && sizeof(*khdr) % blocksize == 0);
+
+	/*
+	 * token format is expected as:
+	 * -----------------------------------------------
+	 * | krb5 header | head/tail cipher text | cksum |
+	 * -----------------------------------------------
+	 */
+	if (token->len < sizeof(*khdr) + blocksize + sizeof(*khdr) +
+			 ke->ke_hash_size) {
+		CERROR("short token size: %u\n", token->len);
+		return GSS_S_DEFECTIVE_TOKEN;
+	}
+
+	/* plain aliases cipher: the token's head/tail is decrypted in place */
+	cipher.data = (__u8 *) (khdr + 1);
+	cipher.len = blocksize + sizeof(*khdr);
+	plain.data = cipher.data;
+	plain.len = cipher.len;
+
+	rc = krb5_decrypt_bulk(kctx->kc_keye.kb_tfm, khdr,
+			       desc, &cipher, &plain, adj_nob);
+	if (rc)
+		return GSS_S_DEFECTIVE_TOKEN;
+
+	/*
+	 * verify checksum, compose clear text as layout:
+	 * ------------------------------------------
+	 * | confounder | clear pages | krb5 header |
+	 * ------------------------------------------
+	 */
+	/* the first blocksize bytes of the decrypted head are used as the
+	 * confounder */
+	data_desc[0].data = plain.data;
+	data_desc[0].len = blocksize;
+
+	if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyi,
+			       khdr, 1, data_desc,
+			       desc->bd_iov_count, desc->bd_iov,
+			       &cksum))
+		return GSS_S_FAILURE;
+	LASSERT(cksum.len >= ke->ke_hash_size);
+
+	/* the stored checksum follows the head/tail cipher text; only the
+	 * trailing ke_hash_size bytes of the computed one are compared */
+	if (memcmp(plain.data + blocksize + sizeof(*khdr),
+		   cksum.data + cksum.len - ke->ke_hash_size,
+		   ke->ke_hash_size)) {
+		CERROR("checksum mismatch\n");
+		rawobj_free(&cksum);
+		return GSS_S_BAD_SIG;
+	}
+
+	rawobj_free(&cksum);
+	return GSS_S_COMPLETE;
+}
+
+/*
+ * Format a short description of the context, "krb5 (<enctype name>)",
+ * into @buf (at most @bufsize bytes) and return snprintf()'s result.
+ */
+int gss_display_kerberos(struct gss_ctx *ctx, char *buf, int bufsize)
+{
+	struct krb5_ctx *krb = ctx->internal_ctx_id;
+
+	return snprintf(buf, bufsize, "krb5 (%s)",
+			enctype2str(krb->kc_enctype));
+}
+
+/*
+ * Operation table of the kerberos mechanism; referenced by
+ * gss_kerberos_mech.gm_ops and invoked through the lgss_*() dispatchers.
+ */
+static struct gss_api_ops gss_kerberos_ops = {
+	.gss_import_sec_context     = gss_import_sec_context_kerberos,
+	.gss_copy_reverse_context   = gss_copy_reverse_context_kerberos,
+	.gss_inquire_context	= gss_inquire_context_kerberos,
+	.gss_get_mic		= gss_get_mic_kerberos,
+	.gss_verify_mic	     = gss_verify_mic_kerberos,
+	.gss_wrap		   = gss_wrap_kerberos,
+	.gss_unwrap		 = gss_unwrap_kerberos,
+	.gss_prep_bulk	      = gss_prep_bulk_kerberos,
+	.gss_wrap_bulk	      = gss_wrap_bulk_kerberos,
+	.gss_unwrap_bulk	    = gss_unwrap_bulk_kerberos,
+	.gss_delete_sec_context     = gss_delete_sec_context_kerberos,
+	.gss_display		= gss_display_kerberos,
+};
+
+/*
+ * Subflavors offered by the kerberos mechanism, one entry per sptlrpc
+ * service level (null, auth, integrity, privacy).
+ */
+static struct subflavor_desc gss_kerberos_sfs[] = {
+	{
+		.sf_subflavor   = SPTLRPC_SUBFLVR_KRB5N,
+		.sf_qop	 = 0,
+		.sf_service     = SPTLRPC_SVC_NULL,
+		.sf_name	= "krb5n"
+	},
+	{
+		.sf_subflavor   = SPTLRPC_SUBFLVR_KRB5A,
+		.sf_qop	 = 0,
+		.sf_service     = SPTLRPC_SVC_AUTH,
+		.sf_name	= "krb5a"
+	},
+	{
+		.sf_subflavor   = SPTLRPC_SUBFLVR_KRB5I,
+		.sf_qop	 = 0,
+		.sf_service     = SPTLRPC_SVC_INTG,
+		.sf_name	= "krb5i"
+	},
+	{
+		.sf_subflavor   = SPTLRPC_SUBFLVR_KRB5P,
+		.sf_qop	 = 0,
+		.sf_service     = SPTLRPC_SVC_PRIV,
+		.sf_name	= "krb5p"
+	},
+};
+
+/*
+ * Descriptor of the kerberos mechanism handed to lgss_mech_register().
+ *
+ * currently we leave module owner NULL
+ */
+static struct gss_api_mech gss_kerberos_mech = {
+	.gm_owner       = NULL, /*THIS_MODULE, */
+	.gm_name	= "krb5",
+	.gm_oid	 = (rawobj_t)
+				{9, "\052\206\110\206\367\022\001\002\002"},
+	.gm_ops	 = &gss_kerberos_ops,
+	/* derived from the table so the count cannot drift from it */
+	.gm_sf_num      = sizeof(gss_kerberos_sfs) /
+			  sizeof(gss_kerberos_sfs[0]),
+	.gm_sfs	 = gss_kerberos_sfs,
+};
+
+/*
+ * Initialize the krb5 sequence lock and register the kerberos mechanism.
+ * Returns the result of lgss_mech_register().
+ */
+int __init init_kerberos_module(void)
+{
+	int rc;
+
+	spin_lock_init(&krb5_seq_lock);
+
+	rc = lgss_mech_register(&gss_kerberos_mech);
+	if (rc != 0)
+		CERROR("Failed to register kerberos gss mechanism!\n");
+
+	return rc;
+}
+
+/* Unregister the kerberos mechanism registered by init_kerberos_module(). */
+void __exit cleanup_kerberos_module(void)
+{
+	lgss_mech_unregister(&gss_kerberos_mech);
+}

diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/gss_mech_switch.c b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_mech_switch.c
new file mode 100644
index 0000000..8cdad80
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_mech_switch.c

@@ -0,0 +1,359 @@
+/*
+ * Modifications for Lustre
+ *
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+/*
+ *  linux/net/sunrpc/gss_mech_switch.c
+ *
+ *  Copyright (c) 2001 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  J. Bruce Fields   <bfields@umich.edu>
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *  3. Neither the name of the University nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/mutex.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_net.h>
+#include <lustre_import.h>
+#include <lustre_sec.h>
+
+#include "gss_err.h"
+#include "gss_internal.h"
+#include "gss_api.h"
+
+/* all registered gss mechanisms, linked through gss_api_mech.gm_list */
+static LIST_HEAD(registered_mechs);
+/* held around every walk and modification of registered_mechs below */
+static DEFINE_SPINLOCK(registered_mechs_lock);
+
+/*
+ * Add @gm to the list of registered mechanisms under
+ * registered_mechs_lock and log the registration.  Always returns 0.
+ */
+int lgss_mech_register(struct gss_api_mech *gm)
+{
+	spin_lock(&registered_mechs_lock);
+	list_add(&gm->gm_list, &registered_mechs);
+	spin_unlock(&registered_mechs_lock);
+	CWARN("Register %s mechanism\n", gm->gm_name);
+	return 0;
+}
+
+/*
+ * Remove @gm from the list of registered mechanisms under
+ * registered_mechs_lock and log the removal.
+ */
+void lgss_mech_unregister(struct gss_api_mech *gm)
+{
+	spin_lock(&registered_mechs_lock);
+	list_del(&gm->gm_list);
+	spin_unlock(&registered_mechs_lock);
+	CWARN("Unregister %s mechanism\n", gm->gm_name);
+}
+
+
+/*
+ * Take a reference on the module owning @gm via __module_get() and
+ * return @gm.  Paired with lgss_mech_put().
+ */
+struct gss_api_mech *lgss_mech_get(struct gss_api_mech *gm)
+{
+	__module_get(gm->gm_owner);
+	return gm;
+}
+
+/*
+ * Look up a registered mechanism by name.
+ *
+ * Returns the first mechanism whose gm_name equals @name and whose owner
+ * module reference could be taken with try_module_get(), or NULL if there
+ * is none.
+ */
+struct gss_api_mech *lgss_name_to_mech(char *name)
+{
+	struct gss_api_mech *cur;
+	struct gss_api_mech *found = NULL;
+
+	spin_lock(&registered_mechs_lock);
+	list_for_each_entry(cur, &registered_mechs, gm_list) {
+		if (strcmp(name, cur->gm_name) != 0)
+			continue;
+		/* name matches; use it only if the module can be pinned */
+		if (try_module_get(cur->gm_owner)) {
+			found = cur;
+			break;
+		}
+	}
+	spin_unlock(&registered_mechs_lock);
+
+	return found;
+}
+
+/* Return 1 if @subflavor is listed in gm->gm_sfs, 0 otherwise. */
+static inline
+int mech_supports_subflavor(struct gss_api_mech *gm, __u32 subflavor)
+{
+	int idx = 0;
+
+	while (idx < gm->gm_sf_num) {
+		if (gm->gm_sfs[idx].sf_subflavor == subflavor)
+			return 1;
+		idx++;
+	}
+
+	return 0;
+}
+
+/*
+ * Look up a registered mechanism that supports @subflavor.
+ *
+ * Returns the first such mechanism with a module reference held, or NULL
+ * if none is found.  References taken on non-matching mechanisms are
+ * dropped again before moving on.
+ */
+struct gss_api_mech *lgss_subflavor_to_mech(__u32 subflavor)
+{
+	struct gss_api_mech *cur;
+	struct gss_api_mech *found = NULL;
+
+	spin_lock(&registered_mechs_lock);
+	list_for_each_entry(cur, &registered_mechs, gm_list) {
+		if (!try_module_get(cur->gm_owner))
+			continue;
+		if (mech_supports_subflavor(cur, subflavor)) {
+			found = cur;
+			break;
+		}
+		/* not the one we want: give the reference back */
+		module_put(cur->gm_owner);
+	}
+	spin_unlock(&registered_mechs_lock);
+
+	return found;
+}
+
+/* Drop a reference on the module owning @gm via module_put(). */
+void lgss_mech_put(struct gss_api_mech *gm)
+{
+	module_put(gm->gm_owner);
+}
+
+/*
+ * Allocate a new gss context in *ctx_id, bind it to @mech (taking a
+ * reference through lgss_mech_get()) and let the mechanism import
+ * @input_token into it.
+ *
+ * Returns GSS_S_FAILURE if the allocation fails, otherwise the result of
+ * the mechanism's gss_import_sec_context operation.  This function does
+ * not release *ctx_id when that operation fails.
+ *
+ * The mech could probably be determined from the token instead, but it's just
+ * as easy for now to pass it in.
+ */
+__u32 lgss_import_sec_context(rawobj_t *input_token,
+			      struct gss_api_mech *mech,
+			      struct gss_ctx **ctx_id)
+{
+	/* validate the mechanism before lgss_mech_get() dereferences it */
+	LASSERT(mech);
+	LASSERT(mech->gm_ops);
+	LASSERT(mech->gm_ops->gss_import_sec_context);
+
+	OBD_ALLOC_PTR(*ctx_id);
+	if (*ctx_id == NULL)
+		return GSS_S_FAILURE;
+
+	(*ctx_id)->mech_type = lgss_mech_get(mech);
+
+	return mech->gm_ops->gss_import_sec_context(input_token, *ctx_id);
+}
+
+/*
+ * Allocate a new context in *ctx_id_new bound to the mechanism of @ctx_id
+ * and let that mechanism's gss_copy_reverse_context operation fill it.
+ *
+ * Returns GSS_S_FAILURE if the allocation fails, otherwise the status of
+ * the mechanism operation.  On any status other than GSS_S_COMPLETE the
+ * mechanism reference is dropped, the new context is freed and
+ * *ctx_id_new is set to NULL.
+ */
+__u32 lgss_copy_reverse_context(struct gss_ctx *ctx_id,
+				struct gss_ctx **ctx_id_new)
+{
+	struct gss_api_mech *mech = ctx_id->mech_type;
+	__u32 status;
+
+	LASSERT(mech);
+
+	OBD_ALLOC_PTR(*ctx_id_new);
+	if (!*ctx_id_new)
+		return GSS_S_FAILURE;
+
+	(*ctx_id_new)->mech_type = lgss_mech_get(mech);
+
+	LASSERT(mech->gm_ops);
+	LASSERT(mech->gm_ops->gss_copy_reverse_context);
+
+	status = mech->gm_ops->gss_copy_reverse_context(ctx_id, *ctx_id_new);
+	if (status == GSS_S_COMPLETE)
+		return status;
+
+	/* the copy failed: undo the reference and the allocation */
+	lgss_mech_put(mech);
+	OBD_FREE_PTR(*ctx_id_new);
+	*ctx_id_new = NULL;
+
+	return status;
+}
+
+/*
+ * Dispatch to the mechanism's gss_inquire_context operation.
+ *
+ * this interface is much simplified, currently we only need endtime.
+ */
+__u32 lgss_inquire_context(struct gss_ctx *context_handle,
+			   unsigned long  *endtime)
+{
+	struct gss_api_mech *mech;
+
+	LASSERT(context_handle);
+	LASSERT(context_handle->mech_type);
+	LASSERT(context_handle->mech_type->gm_ops);
+	LASSERT(context_handle->mech_type->gm_ops->gss_inquire_context);
+
+	mech = context_handle->mech_type;
+	return mech->gm_ops->gss_inquire_context(context_handle, endtime);
+}
+
+/*
+ * gss_get_mic: compute a mic over message and return mic_token.
+ * All arguments are forwarded unchanged to the mechanism's gss_get_mic
+ * operation, whose status is returned.
+ */
+__u32 lgss_get_mic(struct gss_ctx *context_handle,
+		   int msgcnt,
+		   rawobj_t *msg,
+		   int iovcnt,
+		   lnet_kiov_t *iovs,
+		   rawobj_t *mic_token)
+{
+	struct gss_api_mech *mech;
+
+	LASSERT(context_handle);
+	LASSERT(context_handle->mech_type);
+	LASSERT(context_handle->mech_type->gm_ops);
+	LASSERT(context_handle->mech_type->gm_ops->gss_get_mic);
+
+	mech = context_handle->mech_type;
+	return mech->gm_ops->gss_get_mic(context_handle, msgcnt, msg,
+					 iovcnt, iovs, mic_token);
+}
+
+/*
+ * gss_verify_mic: check whether the provided mic_token verifies message.
+ * All arguments are forwarded unchanged to the mechanism's gss_verify_mic
+ * operation, whose status is returned.
+ */
+__u32 lgss_verify_mic(struct gss_ctx *context_handle,
+		      int msgcnt,
+		      rawobj_t *msg,
+		      int iovcnt,
+		      lnet_kiov_t *iovs,
+		      rawobj_t *mic_token)
+{
+	struct gss_api_mech *mech;
+
+	LASSERT(context_handle);
+	LASSERT(context_handle->mech_type);
+	LASSERT(context_handle->mech_type->gm_ops);
+	LASSERT(context_handle->mech_type->gm_ops->gss_verify_mic);
+
+	mech = context_handle->mech_type;
+	return mech->gm_ops->gss_verify_mic(context_handle, msgcnt, msg,
+					    iovcnt, iovs, mic_token);
+}
+
+/*
+ * Dispatch to the mechanism's gss_wrap operation with all arguments
+ * forwarded unchanged; returns that operation's status.
+ */
+__u32 lgss_wrap(struct gss_ctx *context_handle,
+		rawobj_t *gsshdr,
+		rawobj_t *msg,
+		int msg_buflen,
+		rawobj_t *out_token)
+{
+	struct gss_api_mech *mech;
+
+	LASSERT(context_handle);
+	LASSERT(context_handle->mech_type);
+	LASSERT(context_handle->mech_type->gm_ops);
+	LASSERT(context_handle->mech_type->gm_ops->gss_wrap);
+
+	mech = context_handle->mech_type;
+	return mech->gm_ops->gss_wrap(context_handle, gsshdr, msg,
+				      msg_buflen, out_token);
+}
+
+/*
+ * Dispatch to the mechanism's gss_unwrap operation with all arguments
+ * forwarded unchanged; returns that operation's status.
+ */
+__u32 lgss_unwrap(struct gss_ctx *context_handle,
+		  rawobj_t *gsshdr,
+		  rawobj_t *token,
+		  rawobj_t *out_msg)
+{
+	struct gss_api_mech *mech;
+
+	LASSERT(context_handle);
+	LASSERT(context_handle->mech_type);
+	LASSERT(context_handle->mech_type->gm_ops);
+	LASSERT(context_handle->mech_type->gm_ops->gss_unwrap);
+
+	mech = context_handle->mech_type;
+	return mech->gm_ops->gss_unwrap(context_handle, gsshdr, token,
+					out_msg);
+}
+
+
+/*
+ * Dispatch to the mechanism's gss_prep_bulk operation for @desc; returns
+ * that operation's status.
+ */
+__u32 lgss_prep_bulk(struct gss_ctx *context_handle,
+		     struct ptlrpc_bulk_desc *desc)
+{
+	struct gss_api_mech *mech;
+
+	LASSERT(context_handle);
+	LASSERT(context_handle->mech_type);
+	LASSERT(context_handle->mech_type->gm_ops);
+	LASSERT(context_handle->mech_type->gm_ops->gss_prep_bulk);
+
+	mech = context_handle->mech_type;
+	return mech->gm_ops->gss_prep_bulk(context_handle, desc);
+}
+
+/*
+ * Dispatch to the mechanism's gss_wrap_bulk operation with all arguments
+ * forwarded unchanged; returns that operation's status.
+ */
+__u32 lgss_wrap_bulk(struct gss_ctx *context_handle,
+		     struct ptlrpc_bulk_desc *desc,
+		     rawobj_t *token,
+		     int adj_nob)
+{
+	struct gss_api_mech *mech;
+
+	LASSERT(context_handle);
+	LASSERT(context_handle->mech_type);
+	LASSERT(context_handle->mech_type->gm_ops);
+	LASSERT(context_handle->mech_type->gm_ops->gss_wrap_bulk);
+
+	mech = context_handle->mech_type;
+	return mech->gm_ops->gss_wrap_bulk(context_handle, desc, token,
+					   adj_nob);
+}
+
+/*
+ * Dispatch to the mechanism's gss_unwrap_bulk operation with all
+ * arguments forwarded unchanged; returns that operation's status.
+ */
+__u32 lgss_unwrap_bulk(struct gss_ctx *context_handle,
+		       struct ptlrpc_bulk_desc *desc,
+		       rawobj_t *token,
+		       int adj_nob)
+{
+	struct gss_api_mech *mech;
+
+	LASSERT(context_handle);
+	LASSERT(context_handle->mech_type);
+	LASSERT(context_handle->mech_type->gm_ops);
+	LASSERT(context_handle->mech_type->gm_ops->gss_unwrap_bulk);
+
+	mech = context_handle->mech_type;
+	return mech->gm_ops->gss_unwrap_bulk(context_handle, desc, token,
+					     adj_nob);
+}
+
+/*
+ * gss_delete_sec_context: free all resources associated with context_handle.
+ * Note this differs from the RFC 2744-specified prototype in that we don't
+ * bother returning an output token, since it would never be used anyway.
+ *
+ * Returns GSS_S_NO_CONTEXT if *context_handle is NULL.  Otherwise the
+ * mechanism-private context (if any) is deleted through the mechanism,
+ * the mechanism reference is dropped, the context is freed,
+ * *context_handle is set to NULL and GSS_S_COMPLETE is returned.
+ */
+__u32 lgss_delete_sec_context(struct gss_ctx **context_handle)
+{
+	struct gss_ctx *ctx = *context_handle;
+	struct gss_api_mech *mech;
+
+	CDEBUG(D_SEC, "deleting %p\n", ctx);
+
+	if (ctx == NULL)
+		return GSS_S_NO_CONTEXT;
+
+	mech = ctx->mech_type;
+	if (ctx->internal_ctx_id != NULL) {
+		/* a private context can only be torn down by its mechanism */
+		LASSERT(mech);
+		LASSERT(mech->gm_ops);
+		LASSERT(mech->gm_ops->gss_delete_sec_context);
+		mech->gm_ops->gss_delete_sec_context(ctx->internal_ctx_id);
+	}
+	if (mech != NULL)
+		lgss_mech_put(mech);
+
+	OBD_FREE_PTR(*context_handle);
+	*context_handle = NULL;
+
+	return GSS_S_COMPLETE;
+}
+
+/*
+ * Dispatch to the mechanism's gss_display operation to describe @ctx in
+ * @buf (of @bufsize bytes); returns that operation's result.
+ */
+int lgss_display(struct gss_ctx *ctx, char *buf, int bufsize)
+{
+	struct gss_api_mech *mech;
+
+	LASSERT(ctx);
+	LASSERT(ctx->mech_type);
+	LASSERT(ctx->mech_type->gm_ops);
+	LASSERT(ctx->mech_type->gm_ops->gss_display);
+
+	mech = ctx->mech_type;
+	return mech->gm_ops->gss_display(ctx, buf, bufsize);
+}

diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/gss_pipefs.c b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_pipefs.c
new file mode 100644
index 0000000..3df7257
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_pipefs.c

@@ -0,0 +1,1252 @@
+/*
+ * Modifications for Lustre
+ *
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+/*
+ * linux/net/sunrpc/auth_gss.c
+ *
+ * RPCSEC_GSS client authentication.
+ *
+ *  Copyright (c) 2000 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Dug Song       <dugsong@monkey.org>
+ *  Andy Adamson   <andros@umich.edu>
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *  3. Neither the name of the University nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/dcache.h>
+#include <linux/fs.h>
+#include <linux/mutex.h>
+#include <linux/crypto.h>
+#include <asm/atomic.h>
+struct rpc_clnt; /* for rpc_pipefs */
+#include <linux/sunrpc/rpc_pipe_fs.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_sec.h>
+#include <lustre_net.h>
+#include <lustre_import.h>
+
+#include "gss_err.h"
+#include "gss_internal.h"
+#include "gss_api.h"
+
+static struct ptlrpc_sec_policy gss_policy_pipefs;
+static struct ptlrpc_ctx_ops gss_pipefs_ctxops;
+
+static int gss_cli_ctx_refresh_pf(struct ptlrpc_cli_ctx *ctx);
+
+static int gss_sec_pipe_upcall_init(struct gss_sec *gsec)
+{
+	return 0;
+}
+
+static void gss_sec_pipe_upcall_fini(struct gss_sec *gsec)
+{
+}
+
+/****************************************
+ * internal context helpers	     *
+ ****************************************/
+
+static
+struct ptlrpc_cli_ctx *ctx_create_pf(struct ptlrpc_sec *sec,
+				     struct vfs_cred *vcred)
+{
+	struct gss_cli_ctx *gctx;
+	int		 rc;
+
+	OBD_ALLOC_PTR(gctx);
+	if (gctx == NULL)
+		return NULL;
+
+	rc = gss_cli_ctx_init_common(sec, &gctx->gc_base,
+				     &gss_pipefs_ctxops, vcred);
+	if (rc) {
+		OBD_FREE_PTR(gctx);
+		return NULL;
+	}
+
+	return &gctx->gc_base;
+}
+
+static
+void ctx_destroy_pf(struct ptlrpc_sec *sec, struct ptlrpc_cli_ctx *ctx)
+{
+	struct gss_cli_ctx *gctx = ctx2gctx(ctx);
+
+	if (gss_cli_ctx_fini_common(sec, ctx))
+		return;
+
+	OBD_FREE_PTR(gctx);
+
+	atomic_dec(&sec->ps_nctx);
+	sptlrpc_sec_put(sec);
+}
+
+static
+void ctx_enhash_pf(struct ptlrpc_cli_ctx *ctx, struct hlist_head *hash)
+{
+	set_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags);
+	atomic_inc(&ctx->cc_refcount);
+	hlist_add_head(&ctx->cc_cache, hash);
+}
+
+/*
+ * Unhash @ctx, dropping the cache's reference.  Caller must hold ps_lock.
+ */
+static
+void ctx_unhash_pf(struct ptlrpc_cli_ctx *ctx, struct hlist_head *freelist)
+{
+	/* not LASSERT(spin_is_locked()): that is constant 0 on UP builds */
+	assert_spin_locked(&ctx->cc_sec->ps_lock);
+	LASSERT(atomic_read(&ctx->cc_refcount) > 0);
+	LASSERT(test_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags));
+	LASSERT(!hlist_unhashed(&ctx->cc_cache));
+
+	clear_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags);
+	if (atomic_dec_and_test(&ctx->cc_refcount)) {
+		/* last reference gone: park it for the caller to destroy */
+		__hlist_del(&ctx->cc_cache);
+		hlist_add_head(&ctx->cc_cache, freelist);
+	} else {
+		hlist_del_init(&ctx->cc_cache);
+	}
+}
+
+/*
+ * return 1 if the context is dead.
+ */
+static
+int ctx_check_death_pf(struct ptlrpc_cli_ctx *ctx,
+		       struct hlist_head *freelist)
+{
+	if (cli_ctx_check_death(ctx)) {
+		if (freelist)
+			ctx_unhash_pf(ctx, freelist);
+		return 1;
+	}
+
+	return 0;
+}
+
+static inline
+int ctx_check_death_locked_pf(struct ptlrpc_cli_ctx *ctx,
+			      struct hlist_head *freelist)
+{
+	LASSERT(ctx->cc_sec);
+	LASSERT(atomic_read(&ctx->cc_refcount) > 0);
+	LASSERT(test_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags));
+
+	return ctx_check_death_pf(ctx, freelist);
+}
+
+static inline
+int ctx_match_pf(struct ptlrpc_cli_ctx *ctx, struct vfs_cred *vcred)
+{
+	/* small optimization for the null policy: no ->match means a match */
+	if (!ctx->cc_ops->match)
+		return 1;
+
+	return ctx->cc_ops->match(ctx, vcred);
+}
+
+static
+void ctx_list_destroy_pf(struct hlist_head *head)
+{
+	struct ptlrpc_cli_ctx *ctx;
+
+	while (!hlist_empty(head)) {
+		ctx = hlist_entry(head->first, struct ptlrpc_cli_ctx,
+				      cc_cache);
+
+		LASSERT(atomic_read(&ctx->cc_refcount) == 0);
+		LASSERT(test_bit(PTLRPC_CTX_CACHED_BIT,
+				     &ctx->cc_flags) == 0);
+
+		hlist_del_init(&ctx->cc_cache);
+		ctx_destroy_pf(ctx->cc_sec, ctx);
+	}
+}
+
+/****************************************
+ * context apis			 *
+ ****************************************/
+
+static
+int gss_cli_ctx_validate_pf(struct ptlrpc_cli_ctx *ctx)
+{
+	if (ctx_check_death_pf(ctx, NULL))
+		return 1;
+	if (cli_ctx_is_ready(ctx))
+		return 0;
+	return 1;
+}
+
+static
+void gss_cli_ctx_die_pf(struct ptlrpc_cli_ctx *ctx, int grace)
+{
+	LASSERT(ctx->cc_sec);
+	LASSERT(atomic_read(&ctx->cc_refcount) > 0);
+
+	cli_ctx_expire(ctx);
+
+	spin_lock(&ctx->cc_sec->ps_lock);
+
+	if (test_and_clear_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags)) {
+		LASSERT(!hlist_unhashed(&ctx->cc_cache));
+		LASSERT(atomic_read(&ctx->cc_refcount) > 1);
+
+		hlist_del_init(&ctx->cc_cache);
+		if (atomic_dec_and_test(&ctx->cc_refcount))
+			LBUG();
+	}
+
+	spin_unlock(&ctx->cc_sec->ps_lock);
+}
+
+/****************************************
+ * reverse context installation	 *
+ ****************************************/
+
+static inline
+unsigned int ctx_hash_index(int hashsize, __u64 key)
+{
+	return (unsigned int) (key & ((__u64) hashsize - 1));
+}
+
+static
+void gss_sec_ctx_replace_pf(struct gss_sec *gsec,
+			    struct ptlrpc_cli_ctx *new)
+{
+	struct gss_sec_pipefs *gsec_pf;
+	struct ptlrpc_cli_ctx *ctx;
+	struct hlist_node     *next;
+	HLIST_HEAD(freelist);
+	unsigned int hash;
+	ENTRY;
+
+	gsec_pf = container_of(gsec, struct gss_sec_pipefs, gsp_base);
+
+	hash = ctx_hash_index(gsec_pf->gsp_chash_size,
+			      (__u64) new->cc_vcred.vc_uid);
+	LASSERT(hash < gsec_pf->gsp_chash_size);
+
+	spin_lock(&gsec->gs_base.ps_lock);
+
+	hlist_for_each_entry_safe(ctx, next,
+				      &gsec_pf->gsp_chash[hash], cc_cache) {
+		if (!ctx_match_pf(ctx, &new->cc_vcred))
+			continue;
+
+		cli_ctx_expire(ctx);
+		ctx_unhash_pf(ctx, &freelist);
+		break;
+	}
+
+	ctx_enhash_pf(new, &gsec_pf->gsp_chash[hash]);
+
+	spin_unlock(&gsec->gs_base.ps_lock);
+
+	ctx_list_destroy_pf(&freelist);
+	EXIT;
+}
+
+static
+int gss_install_rvs_cli_ctx_pf(struct gss_sec *gsec,
+			       struct ptlrpc_svc_ctx *svc_ctx)
+{
+	struct vfs_cred	  vcred;
+	struct ptlrpc_cli_ctx   *cli_ctx;
+	int		      rc;
+	ENTRY;
+
+	vcred.vc_uid = 0;
+	vcred.vc_gid = 0;
+
+	cli_ctx = ctx_create_pf(&gsec->gs_base, &vcred);
+	if (!cli_ctx)
+		RETURN(-ENOMEM);
+
+	rc = gss_copy_rvc_cli_ctx(cli_ctx, svc_ctx);
+	if (rc) {
+		ctx_destroy_pf(cli_ctx->cc_sec, cli_ctx);
+		RETURN(rc);
+	}
+
+	gss_sec_ctx_replace_pf(gsec, cli_ctx);
+	RETURN(0);
+}
+
+static
+void gss_ctx_cache_gc_pf(struct gss_sec_pipefs *gsec_pf,
+			 struct hlist_head *freelist)
+{
+	struct ptlrpc_sec       *sec;
+	struct ptlrpc_cli_ctx   *ctx;
+	struct hlist_node       *next;
+	int i;
+	ENTRY;
+
+	sec = &gsec_pf->gsp_base.gs_base;
+
+	CDEBUG(D_SEC, "do gc on sec %s@%p\n", sec->ps_policy->sp_name, sec);
+
+	for (i = 0; i < gsec_pf->gsp_chash_size; i++) {
+		hlist_for_each_entry_safe(ctx, next,
+					      &gsec_pf->gsp_chash[i], cc_cache)
+			ctx_check_death_locked_pf(ctx, freelist);
+	}
+
+	sec->ps_gc_next = cfs_time_current_sec() + sec->ps_gc_interval;
+	EXIT;
+}
+
+static
+struct ptlrpc_sec* gss_sec_create_pf(struct obd_import *imp,
+				     struct ptlrpc_svc_ctx *ctx,
+				     struct sptlrpc_flavor *sf)
+{
+	struct gss_sec_pipefs   *gsec_pf;
+	int		      alloc_size, hash_size, i;
+	ENTRY;
+
+#define GSS_SEC_PIPEFS_CTX_HASH_SIZE    (32)
+
+	if (ctx ||
+	    sf->sf_flags & (PTLRPC_SEC_FL_ROOTONLY | PTLRPC_SEC_FL_REVERSE))
+		hash_size = 1;
+	else
+		hash_size = GSS_SEC_PIPEFS_CTX_HASH_SIZE;
+
+	alloc_size = sizeof(*gsec_pf) +
+		     sizeof(struct hlist_head) * hash_size;
+
+	OBD_ALLOC(gsec_pf, alloc_size);
+	if (!gsec_pf)
+		RETURN(NULL);
+
+	gsec_pf->gsp_chash_size = hash_size;
+	for (i = 0; i < hash_size; i++)
+		INIT_HLIST_HEAD(&gsec_pf->gsp_chash[i]);
+
+	if (gss_sec_create_common(&gsec_pf->gsp_base, &gss_policy_pipefs,
+				  imp, ctx, sf))
+		goto err_free;
+
+	if (ctx == NULL) {
+		if (gss_sec_pipe_upcall_init(&gsec_pf->gsp_base))
+			goto err_destroy;
+	} else {
+		if (gss_install_rvs_cli_ctx_pf(&gsec_pf->gsp_base, ctx))
+			goto err_destroy;
+	}
+
+	RETURN(&gsec_pf->gsp_base.gs_base);
+
+err_destroy:
+	gss_sec_destroy_common(&gsec_pf->gsp_base);
+err_free:
+	OBD_FREE(gsec_pf, alloc_size);
+	RETURN(NULL);
+}
+
+/* Tear down a pipefs gss sec and free the block gss_sec_create_pf() made. */
+static
+void gss_sec_destroy_pf(struct ptlrpc_sec *sec)
+{
+	struct gss_sec_pipefs   *gsec_pf;
+	struct gss_sec	  *gsec;
+	int		      alloc_size;
+
+	CWARN("destroy %s@%p\n", sec->ps_policy->sp_name, sec);
+	gsec = container_of(sec, struct gss_sec, gs_base);
+	gsec_pf = container_of(gsec, struct gss_sec_pipefs, gsp_base);
+	LASSERT(gsec_pf->gsp_chash);
+	LASSERT(gsec_pf->gsp_chash_size);
+
+	/* read the size now; nothing may touch gsec_pf once it is freed */
+	alloc_size = sizeof(*gsec_pf) +
+		     sizeof(struct hlist_head) * gsec_pf->gsp_chash_size;
+	gss_sec_pipe_upcall_fini(gsec);
+	gss_sec_destroy_common(gsec);
+	OBD_FREE(gsec_pf, alloc_size);
+}
+
+/* Look up (and optionally create) the cached context matching @vcred.
+ * Returns it with a reference held, or NULL.  Takes sec->ps_lock itself. */
+static
+struct ptlrpc_cli_ctx * gss_sec_lookup_ctx_pf(struct ptlrpc_sec *sec,
+					      struct vfs_cred *vcred,
+					      int create, int remove_dead)
+{
+	struct gss_sec	 *gsec;
+	struct gss_sec_pipefs  *gsec_pf;
+	struct ptlrpc_cli_ctx  *ctx = NULL, *new = NULL;
+	struct hlist_head       *hash_head;
+	struct hlist_node       *next;
+	HLIST_HEAD(freelist);
+	unsigned int	    hash, gc = 0, found = 0;
+	ENTRY;
+
+	might_sleep();
+	gsec = container_of(sec, struct gss_sec, gs_base);
+	gsec_pf = container_of(gsec, struct gss_sec_pipefs, gsp_base);
+	hash = ctx_hash_index(gsec_pf->gsp_chash_size,
+			      (__u64) vcred->vc_uid);
+	LASSERT(hash < gsec_pf->gsp_chash_size);
+	hash_head = &gsec_pf->gsp_chash[hash];
+
+retry:
+	spin_lock(&sec->ps_lock);
+
+	/* gc_next == 0 means never do gc */
+	if (remove_dead && sec->ps_gc_next &&
+	    cfs_time_after(cfs_time_current_sec(), sec->ps_gc_next)) {
+		gss_ctx_cache_gc_pf(gsec_pf, &freelist);
+		gc = 1;
+	}
+
+	hlist_for_each_entry_safe(ctx, next, hash_head, cc_cache) {
+		if (gc == 0 &&
+		    ctx_check_death_locked_pf(ctx,
+					      remove_dead ? &freelist : NULL))
+			continue;
+
+		if (ctx_match_pf(ctx, vcred)) {
+			found = 1;
+			break;
+		}
+	}
+
+	if (found) {
+		if (new && new != ctx) {
+			/* lost the race, just free it */
+			hlist_add_head(&new->cc_cache, &freelist);
+			new = NULL;
+		}
+
+		/* hot node, move to head */
+		if (hash_head->first != &ctx->cc_cache) {
+			__hlist_del(&ctx->cc_cache);
+			hlist_add_head(&ctx->cc_cache, hash_head);
+		}
+	} else {
+		/* no match: the loop cursor carries no usable context */
+		ctx = NULL;
+		/* don't allocate for reverse sec, but still drain freelist */
+		if (sec_is_reverse(sec))
+			goto out_unlock;
+
+		if (new) {
+			ctx_enhash_pf(new, hash_head);
+			ctx = new;
+		} else if (create) {
+			spin_unlock(&sec->ps_lock);
+			new = ctx_create_pf(sec, vcred);
+			if (new) {
+				clear_bit(PTLRPC_CTX_NEW_BIT, &new->cc_flags);
+				goto retry;
+			}
+			/* ENOMEM: retake the lock the common exit will drop */
+			spin_lock(&sec->ps_lock);
+		}
+	}
+
+	/* hold a ref */
+	if (ctx)
+		atomic_inc(&ctx->cc_refcount);
+out_unlock:
+	spin_unlock(&sec->ps_lock);
+
+	/* the allocator of the context must give the first push to refresh */
+	if (new) {
+		LASSERT(new == ctx);
+		gss_cli_ctx_refresh_pf(new);
+	}
+
+	ctx_list_destroy_pf(&freelist);
+	RETURN(ctx);
+}
+
+static
+void gss_sec_release_ctx_pf(struct ptlrpc_sec *sec,
+			    struct ptlrpc_cli_ctx *ctx,
+			    int sync)
+{
+	LASSERT(test_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags) == 0);
+	LASSERT(hlist_unhashed(&ctx->cc_cache));
+
+	/* if required async, we must clear the UPTODATE bit to prevent extra
+	 * rpcs during destroy procedure. */
+	if (!sync)
+		clear_bit(PTLRPC_CTX_UPTODATE_BIT, &ctx->cc_flags);
+
+	/* destroy this context */
+	ctx_destroy_pf(sec, ctx);
+}
+
+/*
+ * @uid: which user. "-1" means flush all.
+ * @grace: mark context DEAD, allow graceful destroy like notify
+ *	 server side, etc.
+ * @force: also flush busy entries.
+ *
+ * Returns the number of busy contexts encountered.
+ *
+ * In any case, never touch "eternal" contexts.
+ */
+static
+int gss_sec_flush_ctx_cache_pf(struct ptlrpc_sec *sec,
+			       uid_t uid,
+			       int grace, int force)
+{
+	struct gss_sec	  *gsec;
+	struct gss_sec_pipefs   *gsec_pf;
+	struct ptlrpc_cli_ctx   *ctx;
+	struct hlist_node       *next;
+	HLIST_HEAD(freelist);
+	int i, busy = 0;
+	ENTRY;
+
+	might_sleep_if(grace);
+
+	gsec = container_of(sec, struct gss_sec, gs_base);
+	gsec_pf = container_of(gsec, struct gss_sec_pipefs, gsp_base);
+
+	spin_lock(&sec->ps_lock);
+	for (i = 0; i < gsec_pf->gsp_chash_size; i++) {
+		hlist_for_each_entry_safe(ctx, next,
+					      &gsec_pf->gsp_chash[i],
+					      cc_cache) {
+			LASSERT(atomic_read(&ctx->cc_refcount) > 0);
+
+			if (uid != -1 && uid != ctx->cc_vcred.vc_uid)
+				continue;
+
+			if (atomic_read(&ctx->cc_refcount) > 1) {
+				busy++;
+				if (!force)
+					continue;
+
+				CWARN("flush busy(%d) ctx %p(%u->%s) by force, "
+				      "grace %d\n",
+				      atomic_read(&ctx->cc_refcount),
+				      ctx, ctx->cc_vcred.vc_uid,
+				      sec2target_str(ctx->cc_sec), grace);
+			}
+			ctx_unhash_pf(ctx, &freelist);
+
+			set_bit(PTLRPC_CTX_DEAD_BIT, &ctx->cc_flags);
+			if (!grace)
+				clear_bit(PTLRPC_CTX_UPTODATE_BIT,
+					  &ctx->cc_flags);
+		}
+	}
+	spin_unlock(&sec->ps_lock);
+
+	ctx_list_destroy_pf(&freelist);
+	RETURN(busy);
+}
+
+/****************************************
+ * service apis			 *
+ ****************************************/
+
+static
+int gss_svc_accept_pf(struct ptlrpc_request *req)
+{
+	return gss_svc_accept(&gss_policy_pipefs, req);
+}
+
+static
+int gss_svc_install_rctx_pf(struct obd_import *imp,
+			    struct ptlrpc_svc_ctx *ctx)
+{
+	struct ptlrpc_sec *sec;
+	int		rc;
+
+	sec = sptlrpc_import_sec_ref(imp);
+	LASSERT(sec);
+	rc = gss_install_rvs_cli_ctx_pf(sec2gsec(sec), ctx);
+
+	sptlrpc_sec_put(sec);
+	return rc;
+}
+
+/****************************************
+ * rpc_pipefs definitions	       *
+ ****************************************/
+
+#define LUSTRE_PIPE_ROOT	"/lustre"
+#define LUSTRE_PIPE_KRB5	LUSTRE_PIPE_ROOT"/krb5"
+
+struct gss_upcall_msg_data {
+	__u32			   gum_seq;
+	__u32			   gum_uid;
+	__u32			   gum_gid;
+	__u32			   gum_svc;	/* MDS/OSS... */
+	__u64			   gum_nid;	/* peer NID */
+	__u8			    gum_obd[64];    /* client obd name */
+};
+
+struct gss_upcall_msg {
+	struct rpc_pipe_msg	     gum_base;
+	atomic_t		    gum_refcount;
+	struct list_head		      gum_list;
+	__u32			   gum_mechidx;
+	struct gss_sec		 *gum_gsec;
+	struct gss_cli_ctx	     *gum_gctx;
+	struct gss_upcall_msg_data      gum_data;
+};
+
+static atomic_t upcall_seq = ATOMIC_INIT(0);
+
+static inline
+__u32 upcall_get_sequence(void)
+{
+	return (__u32) atomic_inc_return(&upcall_seq);
+}
+
+enum mech_idx_t {
+	MECH_KRB5   = 0,
+	MECH_MAX
+};
+
+static inline
+__u32 mech_name2idx(const char *name)
+{
+	LASSERT(!strcmp(name, "krb5"));
+	return MECH_KRB5;
+}
+
+/* pipefs dentries for each mechanisms */
+static struct dentry *de_pipes[MECH_MAX] = { NULL, };
+/* all upcall messages linked here */
+static struct list_head upcall_lists[MECH_MAX];
+/* and protected by this */
+static spinlock_t upcall_locks[MECH_MAX];
+
+static inline
+void upcall_list_lock(int idx)
+{
+	spin_lock(&upcall_locks[idx]);
+}
+
+static inline
+void upcall_list_unlock(int idx)
+{
+	spin_unlock(&upcall_locks[idx]);
+}
+
+static
+void upcall_msg_enlist(struct gss_upcall_msg *msg)
+{
+	__u32 idx = msg->gum_mechidx;
+
+	upcall_list_lock(idx);
+	list_add(&msg->gum_list, &upcall_lists[idx]);
+	upcall_list_unlock(idx);
+}
+
+static
+void upcall_msg_delist(struct gss_upcall_msg *msg)
+{
+	__u32 idx = msg->gum_mechidx;
+
+	upcall_list_lock(idx);
+	list_del_init(&msg->gum_list);
+	upcall_list_unlock(idx);
+}
+
+/****************************************
+ * rpc_pipefs upcall helpers	    *
+ ****************************************/
+
+static
+void gss_release_msg(struct gss_upcall_msg *gmsg)
+{
+	ENTRY;
+	LASSERT(atomic_read(&gmsg->gum_refcount) > 0);
+
+	if (!atomic_dec_and_test(&gmsg->gum_refcount)) {
+		EXIT;
+		return;
+	}
+
+	if (gmsg->gum_gctx) {
+		sptlrpc_cli_ctx_wakeup(&gmsg->gum_gctx->gc_base);
+		sptlrpc_cli_ctx_put(&gmsg->gum_gctx->gc_base, 1);
+		gmsg->gum_gctx = NULL;
+	}
+
+	LASSERT(list_empty(&gmsg->gum_list));
+	LASSERT(list_empty(&gmsg->gum_base.list));
+	OBD_FREE_PTR(gmsg);
+	EXIT;
+}
+
+/* Take @gmsg off its upcall list and drop the list's reference, if listed. */
+static
+void gss_unhash_msg_nolock(struct gss_upcall_msg *gmsg)
+{
+	__u32 idx = gmsg->gum_mechidx;
+
+	LASSERT(idx < MECH_MAX);
+	/* not LASSERT(spin_is_locked()): that is constant 0 on UP builds */
+	assert_spin_locked(&upcall_locks[idx]);
+	if (list_empty(&gmsg->gum_list))
+		return;
+	list_del_init(&gmsg->gum_list);
+	LASSERT(atomic_read(&gmsg->gum_refcount) > 1);
+	atomic_dec(&gmsg->gum_refcount);
+}
+
+static
+void gss_unhash_msg(struct gss_upcall_msg *gmsg)
+{
+	__u32 idx = gmsg->gum_mechidx;
+
+	LASSERT(idx < MECH_MAX);
+	upcall_list_lock(idx);
+	gss_unhash_msg_nolock(gmsg);
+	upcall_list_unlock(idx);
+}
+
+static
+void gss_msg_fail_ctx(struct gss_upcall_msg *gmsg)
+{
+	if (gmsg->gum_gctx) {
+		struct ptlrpc_cli_ctx *ctx = &gmsg->gum_gctx->gc_base;
+
+		LASSERT(atomic_read(&ctx->cc_refcount) > 0);
+		sptlrpc_cli_ctx_expire(ctx);
+		set_bit(PTLRPC_CTX_ERROR_BIT, &ctx->cc_flags);
+	}
+}
+
+static
+struct gss_upcall_msg * gss_find_upcall(__u32 mechidx, __u32 seq)
+{
+	struct gss_upcall_msg *gmsg;
+
+	upcall_list_lock(mechidx);
+	list_for_each_entry(gmsg, &upcall_lists[mechidx], gum_list) {
+		if (gmsg->gum_data.gum_seq != seq)
+			continue;
+
+		LASSERT(atomic_read(&gmsg->gum_refcount) > 0);
+		LASSERT(gmsg->gum_mechidx == mechidx);
+
+		atomic_inc(&gmsg->gum_refcount);
+		upcall_list_unlock(mechidx);
+		return gmsg;
+	}
+	upcall_list_unlock(mechidx);
+	return NULL;
+}
+
+static
+int simple_get_bytes(char **buf, __u32 *buflen, void *res, __u32 reslen)
+{
+	if (*buflen < reslen) {
+		CERROR("buflen %u < %u\n", *buflen, reslen);
+		return -EINVAL;
+	}
+
+	memcpy(res, *buf, reslen);
+	*buf += reslen;
+	*buflen -= reslen;
+	return 0;
+}
+
+/****************************************
+ * rpc_pipefs apis		      *
+ ****************************************/
+
+/* rpc_pipefs ->upcall: copy the unread part of @msg into the user buffer. */
+static
+ssize_t gss_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg,
+			char *dst, size_t buflen)
+{
+	char *data = (char *)msg->data + msg->copied;
+	size_t mlen = msg->len - msg->copied;	/* bytes still unread */
+	unsigned long left;			/* bytes NOT copied */
+	ENTRY;
+	if (mlen > buflen)
+		mlen = buflen;
+	left = copy_to_user(dst, data, mlen);
+	if (mlen != 0 && left == mlen) {	/* nothing was copied */
+		msg->errno = -EFAULT;
+		RETURN(-EFAULT);
+	}
+	mlen -= left;
+	msg->copied += mlen;
+	msg->errno = 0;
+	RETURN(mlen);
+}
+
+/* rpc_pipefs ->downcall: parse lgssd's reply to an upcall and update the
+ * matching context.  Always reports @mlen as consumed (see FIXME below). */
+static
+ssize_t gss_pipe_downcall(struct file *filp, const char *src, size_t mlen)
+{
+	struct rpc_inode	*rpci = RPC_I(filp->f_dentry->d_inode);
+	struct gss_upcall_msg   *gss_msg;
+	struct ptlrpc_cli_ctx   *ctx;
+	struct gss_cli_ctx      *gctx = NULL;
+	char		    *buf, *data;
+	__u32		    datalen;	/* simple_get_bytes() takes __u32 * */
+	int		      timeout, rc;
+	__u32		    mechidx, seq, gss_err;
+	ENTRY;
+
+	mechidx = (__u32) (long) rpci->private;
+	LASSERT(mechidx < MECH_MAX);
+
+	OBD_ALLOC(buf, mlen);
+	if (!buf)
+		RETURN(-ENOMEM);
+
+	if (copy_from_user(buf, src, mlen)) {
+		CERROR("failed copy user space data\n");
+		GOTO(out_free, rc = -EFAULT);
+	}
+	data = buf;
+	datalen = mlen;
+
+	/* data passed down format:
+	 *  - seq
+	 *  - timeout
+	 *  - gc_win / error
+	 *  - wire_ctx (rawobj)
+	 *  - mech_ctx (rawobj)
+	 */
+	if (simple_get_bytes(&data, &datalen, &seq, sizeof(seq))) {
+		CERROR("fail to get seq\n");
+		GOTO(out_free, rc = -EFAULT);
+	}
+
+	gss_msg = gss_find_upcall(mechidx, seq);
+	if (!gss_msg) {
+		CERROR("upcall %u has aborted earlier\n", seq);
+		GOTO(out_free, rc = -EINVAL);
+	}
+
+	gss_unhash_msg(gss_msg);
+	gctx = gss_msg->gum_gctx;
+	LASSERT(gctx);
+	LASSERT(atomic_read(&gctx->gc_base.cc_refcount) > 0);
+
+	/* timeout is not in use for now */
+	if (simple_get_bytes(&data, &datalen, &timeout, sizeof(timeout)))
+		GOTO(out_msg, rc = -EFAULT);
+
+	/* lgssd signal an error by gc_win == 0 */
+	if (simple_get_bytes(&data, &datalen, &gctx->gc_win,
+			     sizeof(gctx->gc_win)))
+		GOTO(out_msg, rc = -EFAULT);
+
+	if (gctx->gc_win == 0) {
+		/* followed by:
+		 * - rpc error
+		 * - gss error
+		 */
+		if (simple_get_bytes(&data, &datalen, &rc, sizeof(rc)))
+			GOTO(out_msg, rc = -EFAULT);
+		if (simple_get_bytes(&data, &datalen, &gss_err,sizeof(gss_err)))
+			GOTO(out_msg, rc = -EFAULT);
+
+		if (rc == 0 && gss_err == GSS_S_COMPLETE) {
+			CWARN("both rpc & gss error code not set\n");
+			rc = -EPERM;
+		}
+	} else {
+		rawobj_t tmpobj;
+
+		/* handle */
+		if (rawobj_extract_local(&tmpobj, (__u32 **) &data, &datalen))
+			GOTO(out_msg, rc = -EFAULT);
+		if (rawobj_dup(&gctx->gc_handle, &tmpobj))
+			GOTO(out_msg, rc = -ENOMEM);
+
+		/* mechctx */
+		if (rawobj_extract_local(&tmpobj, (__u32 **) &data, &datalen))
+			GOTO(out_msg, rc = -EFAULT);
+		gss_err = lgss_import_sec_context(&tmpobj,
+						  gss_msg->gum_gsec->gs_mech,
+						  &gctx->gc_mechctx);
+		rc = 0;
+	}
+
+	if (likely(rc == 0 && gss_err == GSS_S_COMPLETE)) {
+		gss_cli_ctx_uptodate(gctx);
+	} else {
+		ctx = &gctx->gc_base;
+		sptlrpc_cli_ctx_expire(ctx);
+		if (rc != -ERESTART || gss_err != GSS_S_COMPLETE)
+			set_bit(PTLRPC_CTX_ERROR_BIT, &ctx->cc_flags);
+
+		CERROR("refresh ctx %p(uid %d) failed: %d/0x%08x: %s\n",
+		       ctx, ctx->cc_vcred.vc_uid, rc, gss_err,
+		       test_bit(PTLRPC_CTX_ERROR_BIT, &ctx->cc_flags) ?
+		       "fatal error" : "non-fatal");
+	}
+
+out_msg:
+	gss_release_msg(gss_msg);
+
+out_free:
+	OBD_FREE(buf, mlen);
+	/* FIXME
+	 * hack pipefs: always return asked length, otherwise all following
+	 * downcalls might be messed up. */
+	rc = mlen;
+	RETURN(rc);
+}
+
+static
+void gss_pipe_destroy_msg(struct rpc_pipe_msg *msg)
+{
+	struct gss_upcall_msg	  *gmsg;
+	struct gss_upcall_msg_data     *gumd;
+	static cfs_time_t	       ratelimit = 0;
+	ENTRY;
+
+	LASSERT(list_empty(&msg->list));
+
+	/* normally errno is >= 0 */
+	if (msg->errno >= 0) {
+		EXIT;
+		return;
+	}
+
+	gmsg = container_of(msg, struct gss_upcall_msg, gum_base);
+	gumd = &gmsg->gum_data;
+	LASSERT(atomic_read(&gmsg->gum_refcount) > 0);
+
+	CERROR("failed msg %p (seq %u, uid %u, svc %u, nid "LPX64", obd %.*s): "
+	       "errno %d\n", msg, gumd->gum_seq, gumd->gum_uid, gumd->gum_svc,
+	       gumd->gum_nid, (int) sizeof(gumd->gum_obd),
+	       gumd->gum_obd, msg->errno);
+
+	atomic_inc(&gmsg->gum_refcount);
+	gss_unhash_msg(gmsg);
+	if (msg->errno == -ETIMEDOUT || msg->errno == -EPIPE) {
+		cfs_time_t now = cfs_time_current_sec();
+
+		if (cfs_time_after(now, ratelimit)) {
+			CWARN("upcall timed out, is lgssd running?\n");
+			ratelimit = now + 15;
+		}
+	}
+	gss_msg_fail_ctx(gmsg);
+	gss_release_msg(gmsg);
+	EXIT;
+}
+
+static
+void gss_pipe_release(struct inode *inode)
+{
+	struct rpc_inode *rpci = RPC_I(inode);
+	__u32	     idx;
+	ENTRY;
+
+	idx = (__u32) (long) rpci->private;
+	LASSERT(idx < MECH_MAX);
+
+	upcall_list_lock(idx);
+	while (!list_empty(&upcall_lists[idx])) {
+		struct gss_upcall_msg      *gmsg;
+		struct gss_upcall_msg_data *gumd;
+
+		gmsg = list_entry(upcall_lists[idx].next,
+				      struct gss_upcall_msg, gum_list);
+		gumd = &gmsg->gum_data;
+		LASSERT(list_empty(&gmsg->gum_base.list));
+
+		CERROR("failing remaining msg %p:seq %u, uid %u, svc %u, "
+		       "nid "LPX64", obd %.*s\n", gmsg,
+		       gumd->gum_seq, gumd->gum_uid, gumd->gum_svc,
+		       gumd->gum_nid, (int) sizeof(gumd->gum_obd),
+		       gumd->gum_obd);
+
+		gmsg->gum_base.errno = -EPIPE;
+		atomic_inc(&gmsg->gum_refcount);
+		gss_unhash_msg_nolock(gmsg);
+
+		gss_msg_fail_ctx(gmsg);
+
+		upcall_list_unlock(idx);
+		gss_release_msg(gmsg);
+		upcall_list_lock(idx);
+	}
+	upcall_list_unlock(idx);
+	EXIT;
+}
+
+static struct rpc_pipe_ops gss_upcall_ops = {
+	.upcall	 = gss_pipe_upcall,
+	.downcall       = gss_pipe_downcall,
+	.destroy_msg    = gss_pipe_destroy_msg,
+	.release_pipe   = gss_pipe_release,
+};
+
+/****************************************
+ * upcall helper functions	      *
+ ****************************************/
+
+/* Queue a context-refresh upcall to lgssd for @ctx.  Returns 0 or -errno. */
+static
+int gss_ctx_refresh_pf(struct ptlrpc_cli_ctx *ctx)
+{
+	struct obd_import	  *imp;
+	struct gss_sec	     *gsec;
+	struct gss_upcall_msg      *gmsg;
+	int			 rc = 0;
+	ENTRY;
+
+	might_sleep();
+	LASSERT(ctx->cc_sec);
+	LASSERT(ctx->cc_sec->ps_import);
+	LASSERT(ctx->cc_sec->ps_import->imp_obd);
+
+	imp = ctx->cc_sec->ps_import;
+	if (!imp->imp_connection) {
+		CERROR("import has no connection set\n");
+		RETURN(-EINVAL);
+	}
+
+	gsec = container_of(ctx->cc_sec, struct gss_sec, gs_base);
+	OBD_ALLOC_PTR(gmsg);
+	if (!gmsg)
+		RETURN(-ENOMEM);
+
+	/* initialize pipefs base msg */
+	INIT_LIST_HEAD(&gmsg->gum_base.list);
+	gmsg->gum_base.data = &gmsg->gum_data;
+	gmsg->gum_base.len = sizeof(gmsg->gum_data);
+	gmsg->gum_base.copied = 0;
+	gmsg->gum_base.errno = 0;
+
+	/* init upcall msg */
+	atomic_set(&gmsg->gum_refcount, 1);
+	gmsg->gum_mechidx = mech_name2idx(gsec->gs_mech->gm_name);
+	gmsg->gum_gsec = gsec;
+	gmsg->gum_gctx = container_of(sptlrpc_cli_ctx_get(ctx),
+				      struct gss_cli_ctx, gc_base);
+	gmsg->gum_data.gum_seq = upcall_get_sequence();
+	gmsg->gum_data.gum_uid = ctx->cc_vcred.vc_uid;
+	gmsg->gum_data.gum_gid = 0; /* not used for now */
+	gmsg->gum_data.gum_svc = import_to_gss_svc(imp);
+	gmsg->gum_data.gum_nid = imp->imp_connection->c_peer.nid;
+	strncpy(gmsg->gum_data.gum_obd, imp->imp_obd->obd_name,
+		sizeof(gmsg->gum_data.gum_obd));
+	/* strncpy() leaves a name that fills the buffer unterminated */
+	gmsg->gum_data.gum_obd[sizeof(gmsg->gum_data.gum_obd) - 1] = '\0';
+
+	/* This only could happen when sysadmin set it dead/expired
+	 * using lctl by force. */
+	if (ctx->cc_flags & PTLRPC_CTX_STATUS_MASK) {
+		CWARN("ctx %p(%u->%s) was set flags %lx unexpectedly\n",
+		      ctx, ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec),
+		      ctx->cc_flags);
+
+		LASSERT(!(ctx->cc_flags & PTLRPC_CTX_UPTODATE));
+		ctx->cc_flags |= PTLRPC_CTX_DEAD | PTLRPC_CTX_ERROR;
+		rc = -EIO;
+		goto err_free;
+	}
+
+	upcall_msg_enlist(gmsg);
+	rc = rpc_queue_upcall(de_pipes[gmsg->gum_mechidx]->d_inode,
+			      &gmsg->gum_base);
+	if (rc) {
+		CERROR("rpc_queue_upcall failed: %d\n", rc);
+		upcall_msg_delist(gmsg);
+		goto err_free;
+	}
+
+	RETURN(0);
+err_free:
+	/* the message never went out: drop the ctx reference it holds */
+	sptlrpc_cli_ctx_put(&gmsg->gum_gctx->gc_base, 1);
+	OBD_FREE_PTR(gmsg);
+	RETURN(rc);
+}
+
+static
+int gss_cli_ctx_refresh_pf(struct ptlrpc_cli_ctx *ctx)
+{
+	/* if we are refreshing for root, also update the reverse
+	 * handle index, do not confuse reverse contexts. */
+	if (ctx->cc_vcred.vc_uid == 0) {
+		struct gss_sec *gsec;
+
+		gsec = container_of(ctx->cc_sec, struct gss_sec, gs_base);
+		gsec->gs_rvs_hdl = gss_get_next_ctx_index();
+	}
+
+	return gss_ctx_refresh_pf(ctx);
+}
+
+/****************************************
+ * lustre gss pipefs policy	     *
+ ****************************************/
+
+static struct ptlrpc_ctx_ops gss_pipefs_ctxops = {
+	.match		  = gss_cli_ctx_match,
+	.refresh		= gss_cli_ctx_refresh_pf,
+	.validate	       = gss_cli_ctx_validate_pf,
+	.die		    = gss_cli_ctx_die_pf,
+	.sign		   = gss_cli_ctx_sign,
+	.verify		 = gss_cli_ctx_verify,
+	.seal		   = gss_cli_ctx_seal,
+	.unseal		 = gss_cli_ctx_unseal,
+	.wrap_bulk	      = gss_cli_ctx_wrap_bulk,
+	.unwrap_bulk	    = gss_cli_ctx_unwrap_bulk,
+};
+
+static struct ptlrpc_sec_cops gss_sec_pipefs_cops = {
+	.create_sec	     = gss_sec_create_pf,
+	.destroy_sec	    = gss_sec_destroy_pf,
+	.kill_sec	       = gss_sec_kill,
+	.lookup_ctx	     = gss_sec_lookup_ctx_pf,
+	.release_ctx	    = gss_sec_release_ctx_pf,
+	.flush_ctx_cache	= gss_sec_flush_ctx_cache_pf,
+	.install_rctx	   = gss_sec_install_rctx,
+	.alloc_reqbuf	   = gss_alloc_reqbuf,
+	.free_reqbuf	    = gss_free_reqbuf,
+	.alloc_repbuf	   = gss_alloc_repbuf,
+	.free_repbuf	    = gss_free_repbuf,
+	.enlarge_reqbuf	 = gss_enlarge_reqbuf,
+};
+
+static struct ptlrpc_sec_sops gss_sec_pipefs_sops = {
+	.accept		 = gss_svc_accept_pf,
+	.invalidate_ctx	 = gss_svc_invalidate_ctx,
+	.alloc_rs	       = gss_svc_alloc_rs,
+	.authorize	      = gss_svc_authorize,
+	.free_rs		= gss_svc_free_rs,
+	.free_ctx	       = gss_svc_free_ctx,
+	.unwrap_bulk	    = gss_svc_unwrap_bulk,
+	.wrap_bulk	      = gss_svc_wrap_bulk,
+	.install_rctx	   = gss_svc_install_rctx_pf,
+};
+
+static struct ptlrpc_sec_policy gss_policy_pipefs = {
+	.sp_owner	       = THIS_MODULE,
+	.sp_name		= "gss.pipefs",
+	.sp_policy	      = SPTLRPC_POLICY_GSS_PIPEFS,
+	.sp_cops		= &gss_sec_pipefs_cops,
+	.sp_sops		= &gss_sec_pipefs_sops,
+};
+
+/* Create the rpc_pipefs directory and krb5 pipe used for lgssd upcalls. */
+static
+int __init gss_init_pipefs_upcall(void)
+{
+	struct dentry   *de;
+	long		 rc;
+
+	/* pipe dir */
+	de = rpc_mkdir(LUSTRE_PIPE_ROOT, NULL);
+	if (IS_ERR(de) && PTR_ERR(de) != -EEXIST) {
+		CERROR("Failed to create gss pipe dir: %ld\n", PTR_ERR(de));
+		return PTR_ERR(de);
+	}
+	/* krb5 mechanism: init list/lock before the pipe can call back */
+	INIT_LIST_HEAD(&upcall_lists[MECH_KRB5]);
+	spin_lock_init(&upcall_locks[MECH_KRB5]);
+
+	/* FIXME hack pipefs: dput will sometimes cause oops during module
+	 * unload and lgssd close the pipe fds. */
+	de = rpc_mkpipe(LUSTRE_PIPE_KRB5, (void *) MECH_KRB5, &gss_upcall_ops,
+			RPC_PIPE_WAIT_FOR_OPEN);
+	if (!de || IS_ERR(de)) {
+		rc = de ? PTR_ERR(de) : -ENOMEM; /* PTR_ERR(NULL) == 0 */
+		CERROR("failed to make rpc_pipe %s: %ld\n",
+		       LUSTRE_PIPE_KRB5, rc);
+		rpc_rmdir(LUSTRE_PIPE_ROOT);
+		return rc;
+	}
+	de_pipes[MECH_KRB5] = de;
+	return 0;
+}
+
+/* Not __exit: the __init error path in gss_init_pipefs() calls this too. */
+static
+void gss_exit_pipefs_upcall(void)
+{
+	__u32   i;
+
+	for (i = 0; i < MECH_MAX; i++) {
+		LASSERT(list_empty(&upcall_lists[i]));
+		/* dput pipe dentry here might cause lgssd oops. */
+		de_pipes[i] = NULL;
+	}
+
+	rpc_unlink(LUSTRE_PIPE_KRB5);
+	rpc_rmdir(LUSTRE_PIPE_ROOT);
+}
+
+int __init gss_init_pipefs(void)
+{
+	int rc;
+
+	rc = gss_init_pipefs_upcall();
+	if (rc)
+		return rc;
+
+	rc = sptlrpc_register_policy(&gss_policy_pipefs);
+	if (rc) {
+		gss_exit_pipefs_upcall();
+		return rc;
+	}
+
+	return 0;
+}
+
+void __exit gss_exit_pipefs(void)
+{
+	sptlrpc_unregister_policy(&gss_policy_pipefs);	/* reverse of init */
+	gss_exit_pipefs_upcall();
+}

diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/gss_rawobj.c b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_rawobj.c
new file mode 100644
index 0000000..474ecf8
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_rawobj.c

@@ -0,0 +1,242 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/gss/gss_rawobj.c
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+
+#include <linux/mutex.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre_sec.h>
+
+#include "gss_internal.h"
+
+/*
+ * Tell whether @obj holds no data.  Asserts that the length and the data
+ * pointer agree with each other (both set or both clear).
+ *
+ * Returns 1 when the object is empty, 0 otherwise.
+ */
+int rawobj_empty(rawobj_t *obj)
+{
+	LASSERT(equi(obj->len, obj->data));
+
+	return !obj->len;
+}
+
+/*
+ * Fill @obj with a freshly allocated copy of the @len bytes at @buf.
+ *
+ * A zero @len yields an empty object (NULL data, no allocation).  On
+ * allocation failure @obj is left with length 0.
+ *
+ * Returns 0 on success, -ENOMEM if the allocation fails.
+ */
+int rawobj_alloc(rawobj_t *obj, char *buf, int len)
+{
+	LASSERT(obj);
+	LASSERT(len >= 0);
+
+	obj->len = len;
+	if (len == 0) {
+		obj->data = NULL;
+		return 0;
+	}
+
+	OBD_ALLOC_LARGE(obj->data, len);
+	if (!obj->data) {
+		obj->len = 0;
+		/* plain return: this function has no ENTRY for a RETURN()
+		 * to pair with */
+		return -ENOMEM;
+	}
+	memcpy(obj->data, buf, len);
+
+	return 0;
+}
+
+/*
+ * Release the buffer owned by @obj and reset it to the empty state.  An
+ * already empty object is accepted, but it must not carry a data pointer.
+ */
+void rawobj_free(rawobj_t *obj)
+{
+	LASSERT(obj);
+
+	if (obj->len == 0) {
+		LASSERT(!obj->data);
+		return;
+	}
+
+	LASSERT(obj->data);
+	OBD_FREE_LARGE(obj->data, obj->len);
+	obj->data = NULL;
+	obj->len = 0;
+}
+
+/*
+ * Compare two raw objects byte for byte.
+ *
+ * Returns 1 when both have the same length and identical content (two empty
+ * objects compare equal), 0 otherwise.
+ */
+int rawobj_equal(rawobj_t *a, rawobj_t *b)
+{
+	LASSERT(a && b);
+
+	if (a->len != b->len)
+		return 0;
+	if (a->len == 0)
+		return 1;
+
+	return memcmp(a->data, b->data, a->len) == 0;
+}
+
+/*
+ * Make @dest an independent copy of @src.  Whatever @dest held before is
+ * overwritten without being freed.
+ *
+ * Returns 0 on success, -ENOMEM if the copy cannot be allocated (in which
+ * case @dest is left with length 0).
+ */
+int rawobj_dup(rawobj_t *dest, rawobj_t *src)
+{
+	LASSERT(src && dest);
+
+	dest->len = src->len;
+	if (dest->len == 0) {
+		dest->data = NULL;
+		return 0;
+	}
+
+	OBD_ALLOC_LARGE(dest->data, dest->len);
+	if (!dest->data) {
+		dest->len = 0;
+		return -ENOMEM;
+	}
+	memcpy(dest->data, src->data, dest->len);
+
+	return 0;
+}
+
+/*
+ * Append @obj to the output cursor (*buf, *buflen): a little-endian 32-bit
+ * length word followed by the payload, with the cursor advanced past the
+ * payload rounded up to a multiple of four bytes.
+ *
+ * Returns 0 on success, -EINVAL when the remaining space is too small.
+ */
+int rawobj_serialize(rawobj_t *obj, __u32 **buf, __u32 *buflen)
+{
+	__u32 *out;
+	__u32 padded;
+	__u32 need;
+
+	LASSERT(obj);
+	LASSERT(buf);
+	LASSERT(buflen);
+
+	padded = cfs_size_round4(obj->len);
+	need = 4 + padded;
+
+	if (*buflen < need) {
+		CERROR("buflen %u <  %u\n", *buflen, need);
+		return -EINVAL;
+	}
+
+	out = *buf;
+	*out++ = cpu_to_le32(obj->len);
+	memcpy(out, obj->data, obj->len);
+
+	/* the cursor moves in 32-bit words, hence the shift */
+	*buf = out + (padded >> 2);
+	*buflen -= need;
+
+	return 0;
+}
+
+/*
+ * Pull one raw object off the input cursor (*buf, *buflen).
+ *
+ * The object is encoded as a 32-bit length word followed by the payload.
+ *
+ * @alloc: non-zero copies the payload into a new allocation owned by @obj;
+ *	   zero makes obj->data point straight into the input buffer.
+ * @local: non-zero means the length word is in host byte order and the
+ *	   payload is not padded; zero means the length word is little-endian
+ *	   and the payload is padded to a multiple of four bytes.
+ *
+ * On success the cursor is advanced past the object.
+ *
+ * Returns 0 on success, -EINVAL if the buffer is too short or the encoded
+ * length is not representable, -ENOMEM if the copy cannot be allocated.
+ */
+static int __rawobj_extract(rawobj_t *obj, __u32 **buf, __u32 *buflen,
+			    int alloc, int local)
+{
+	__u32 len;
+
+	if (*buflen < sizeof(__u32)) {
+		CERROR("buflen %u\n", *buflen);
+		return -EINVAL;
+	}
+
+	obj->len = *(*buf)++;
+	if (!local)
+		obj->len = le32_to_cpu(obj->len);
+	*buflen -= sizeof(__u32);
+
+	if (!obj->len) {
+		obj->data = NULL;
+		return 0;
+	}
+
+	len = local ? obj->len : cfs_size_round4(obj->len);
+	/* Rounding a length close to the 32-bit limit up to a multiple of
+	 * four wraps to a small value.  Such a value would slip through the
+	 * space check below while obj->len stays huge, so reject it. */
+	if (len < obj->len) {
+		CERROR("invalid object length %u\n", obj->len);
+		obj->len = 0;
+		return -EINVAL;
+	}
+	if (*buflen < len) {
+		CERROR("buflen %u < %u\n", *buflen, len);
+		obj->len = 0;
+		return -EINVAL;
+	}
+
+	if (!alloc)
+		obj->data = (__u8 *) *buf;
+	else {
+		OBD_ALLOC_LARGE(obj->data, obj->len);
+		if (!obj->data) {
+			CERROR("fail to alloc %u bytes\n", obj->len);
+			obj->len = 0;
+			return -ENOMEM;
+		}
+		memcpy(obj->data, *buf, obj->len);
+	}
+
+	/* advance in bytes: a local object need not be word aligned */
+	*((char **)buf) += len;
+	*buflen -= len;
+
+	return 0;
+}
+
+/*
+ * Extract a wire-format object (little-endian length, padded payload)
+ * without copying: obj->data ends up pointing into the input buffer.
+ */
+int rawobj_extract(rawobj_t *obj, __u32 **buf, __u32 *buflen)
+{
+	return __rawobj_extract(obj, buf, buflen, 0, 0);
+}
+
+/*
+ * Extract a wire-format object (little-endian length, padded payload) into
+ * a newly allocated buffer owned by @obj.
+ */
+int rawobj_extract_alloc(rawobj_t *obj, __u32 **buf, __u32 *buflen)
+{
+	return __rawobj_extract(obj, buf, buflen, 1, 0);
+}
+
+/*
+ * Extract a host-format object (native-endian length, unpadded payload)
+ * without copying: obj->data ends up pointing into the input buffer.
+ */
+int rawobj_extract_local(rawobj_t *obj, __u32 **buf, __u32 *buflen)
+{
+	return __rawobj_extract(obj, buf, buflen, 0, 1);
+}
+
+/*
+ * Extract a host-format object (native-endian length, unpadded payload)
+ * into a newly allocated buffer owned by @obj.
+ */
+int rawobj_extract_local_alloc(rawobj_t *obj, __u32 **buf, __u32 *buflen)
+{
+	return __rawobj_extract(obj, buf, buflen, 1, 1);
+}
+
+/*
+ * Make @rawobj refer to the same bytes as @netobj.  Nothing is copied, so
+ * @rawobj merely aliases @netobj's buffer.  Always returns 0.
+ */
+int rawobj_from_netobj(rawobj_t *rawobj, netobj_t *netobj)
+{
+	rawobj->len = netobj->len;
+	rawobj->data = netobj->data;
+	return 0;
+}
+
+/*
+ * Build @rawobj as a private copy of @netobj's bytes.  An empty @netobj
+ * gives an empty @rawobj.
+ *
+ * Returns 0 on success, -ENOMEM if the copy cannot be allocated (@rawobj is
+ * then left empty).
+ */
+int rawobj_from_netobj_alloc(rawobj_t *rawobj, netobj_t *netobj)
+{
+	rawobj->len = 0;
+	rawobj->data = NULL;
+
+	if (netobj->len != 0) {
+		OBD_ALLOC_LARGE(rawobj->data, netobj->len);
+		if (!rawobj->data)
+			return -ENOMEM;
+
+		memcpy(rawobj->data, netobj->data, netobj->len);
+		rawobj->len = netobj->len;
+	}
+
+	return 0;
+}
+
+/****************************************
+ * misc more			    *
+ ****************************************/
+
+/*
+ * Copy @reslen bytes from the input cursor (*buf, *buflen) into @res and
+ * advance the cursor by the same amount.
+ *
+ * Returns 0 on success, -EINVAL when fewer than @reslen bytes remain.
+ */
+int buffer_extract_bytes(const void **buf, __u32 *buflen,
+			 void *res, __u32 reslen)
+{
+	const char *src = *buf;
+
+	if (reslen > *buflen) {
+		CERROR("buflen %u < %u\n", *buflen, reslen);
+		return -EINVAL;
+	}
+
+	memcpy(res, src, reslen);
+	*buf = src + reslen;
+	*buflen -= reslen;
+
+	return 0;
+}

diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/gss_svc_upcall.c b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_svc_upcall.c
new file mode 100644
index 0000000..31b50ea
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_svc_upcall.c

@@ -0,0 +1,1099 @@
+/*
+ * Modifications for Lustre
+ *
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+/*
+ * Neil Brown <neilb@cse.unsw.edu.au>
+ * J. Bruce Fields <bfields@umich.edu>
+ * Andy Adamson <andros@umich.edu>
+ * Dug Song <dugsong@monkey.org>
+ *
+ * RPCSEC_GSS server authentication.
+ * This implements RPCSEC_GSS as defined in rfc2203 (rpcsec_gss) and rfc2078
+ * (gssapi)
+ *
+ * The RPCSEC_GSS involves three stages:
+ *  1/ context creation
+ *  2/ data exchange
+ *  3/ context destruction
+ *
+ * Context creation is handled largely by upcalls to user-space.
+ *  In particular, GSS_Accept_sec_context is handled by an upcall
+ * Data exchange is handled entirely within the kernel
+ *  In particular, GSS_GetMIC, GSS_VerifyMIC, GSS_Seal, GSS_Unseal are in-kernel.
+ * Context destruction is handled in-kernel
+ *  GSS_Delete_sec_context is in-kernel
+ *
+ * Context creation is initiated by a RPCSEC_GSS_INIT request arriving.
+ * The context handle and gss_token are used as a key into the rpcsec_init cache.
+ * The content of this cache includes some of the outputs of GSS_Accept_sec_context,
+ * being major_status, minor_status, context_handle, reply_token.
+ * These are sent back to the client.
+ * Sequence window management is handled by the kernel.  The window size is currently
+ * a compile time constant.
+ *
+ * When user-space is happy that a context is established, it places an entry
+ * in the rpcsec_context cache. The key for this cache is the context_handle.
+ * The content includes:
+ *   uid/gidlist - for determining access rights
+ *   mechanism type
+ *   mechanism specific information, such as a key
+ *
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/hash.h>
+#include <linux/mutex.h>
+#include <linux/sunrpc/cache.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_net.h>
+#include <lustre_import.h>
+#include <lustre_sec.h>
+
+#include "gss_err.h"
+#include "gss_internal.h"
+#include "gss_api.h"
+
+#define GSS_SVC_UPCALL_TIMEOUT  (20)
+
+/* counter behind gss_get_next_ctx_index() and the lock that serialises it;
+ * gss_init_svc_upcall() initialises the lock and seeds the counter with
+ * random bytes */
+static spinlock_t __ctx_index_lock;
+static __u64 __ctx_index;
+
+/*
+ * Hand out the current context index and advance the counter by one, under
+ * __ctx_index_lock.
+ */
+__u64 gss_get_next_ctx_index(void)
+{
+	__u64 idx;
+
+	spin_lock(&__ctx_index_lock);
+	idx = __ctx_index++;
+	spin_unlock(&__ctx_index_lock);
+
+	return idx;
+}
+
+/*
+ * Hash @length bytes at @buf down to a @bits wide value.
+ *
+ * Bytes are shifted into an unsigned long one at a time; each time a whole
+ * long's worth has been consumed the word is xor-ed into the running hash
+ * and re-mixed with cfs_hash_long().  After the last input byte one extra
+ * byte holding the (truncated) length is fed in, and that step always folds
+ * the final, possibly partial, word.  The top @bits bits of the result are
+ * returned.
+ */
+static inline unsigned long hash_mem(char *buf, int length, int bits)
+{
+	unsigned long hash = 0;
+	unsigned long l = 0;
+	int len = 0;
+	unsigned char c;
+
+	do {
+		if (len == length) {
+			/* input exhausted: append the length byte and set
+			 * len so that the increment below makes it 0, which
+			 * both triggers the fold and ends the loop */
+			c = (char) len;
+			len = -1;
+		} else
+			c = *buf++;
+
+		l = (l << 8) | c;
+		len++;
+
+		if ((len & (BITS_PER_LONG/8-1)) == 0)
+			hash = cfs_hash_long(hash^l, BITS_PER_LONG);
+	} while (len);
+
+	return hash >> (BITS_PER_LONG - bits);
+}
+
+/****************************************
+ * rsi cache			    *
+ ****************************************/
+
+#define RSI_HASHBITS    (6)
+#define RSI_HASHMAX     (1 << RSI_HASHBITS)
+#define RSI_HASHMASK    (RSI_HASHMAX - 1)
+
+/*
+ * One entry of the "auth.sptlrpc.init" cache: a context-init request that
+ * is handed to user space, plus the answer once it has come back.
+ */
+struct rsi {
+	/* generic cache bookkeeping; entries are looked up through this */
+	struct cache_head       h;
+	/* passed to user space as part of the upcall (see rsi_request()) */
+	__u32		   lustre_svc;
+	__u64		   nid;
+	/* woken by rsi_parse() when a downcall for this entry arrives */
+	wait_queue_head_t	     waitq;
+	/* lookup key: hashed by rsi_hash(), compared by __rsi_match() */
+	rawobj_t		in_handle, in_token;
+	/* result, installed by __rsi_update() */
+	rawobj_t		out_handle, out_token;
+	int		     major_status, minor_status;
+};
+
+static struct cache_head *rsi_table[RSI_HASHMAX];
+static struct cache_detail rsi_cache;
+static struct rsi *rsi_update(struct rsi *new, struct rsi *old);
+static struct rsi *rsi_lookup(struct rsi *item);
+
+/*
+ * Hash bucket for an rsi entry: the xor of the hashes of its two key
+ * fields, the incoming handle and the incoming token.
+ */
+static inline int rsi_hash(struct rsi *item)
+{
+	unsigned long handle_hash;
+	unsigned long token_hash;
+
+	handle_hash = hash_mem((char *)item->in_handle.data,
+			       item->in_handle.len, RSI_HASHBITS);
+	token_hash = hash_mem((char *)item->in_token.data,
+			      item->in_token.len, RSI_HASHBITS);
+
+	return handle_hash ^ token_hash;
+}
+
+/*
+ * Two rsi entries match when both the incoming handle and the incoming
+ * token are byte-for-byte equal.  Returns 1 on a match, 0 otherwise.
+ */
+static inline int __rsi_match(struct rsi *item, struct rsi *tmp)
+{
+	if (!rawobj_equal(&item->in_handle, &tmp->in_handle))
+		return 0;
+
+	return rawobj_equal(&item->in_token, &tmp->in_token) != 0;
+}
+
+/*
+ * Release the four raw objects held by @rsi.  The structure itself is not
+ * freed, so this also works on an on-stack rsi.
+ */
+static void rsi_free(struct rsi *rsi)
+{
+	rawobj_free(&rsi->in_handle);
+	rawobj_free(&rsi->in_token);
+	rawobj_free(&rsi->out_handle);
+	rawobj_free(&rsi->out_token);
+}
+
+/*
+ * Format the upcall line for one rsi entry into the buffer described by
+ * (@bpp, @blen).  The hex-encoded fields are, in order: lustre_svc, nid, a
+ * suggested context index, in_handle and in_token; the line is terminated
+ * with a newline.
+ */
+static void rsi_request(struct cache_detail *cd,
+			struct cache_head *h,
+			char **bpp, int *blen)
+{
+	struct rsi *rsi = container_of(h, struct rsi, h);
+	__u64 index = 0;
+
+	/* if in_handle is null, provide kernel suggestion */
+	if (rsi->in_handle.len == 0)
+		index = gss_get_next_ctx_index();
+
+	qword_addhex(bpp, blen, (char *) &rsi->lustre_svc,
+		     sizeof(rsi->lustre_svc));
+	qword_addhex(bpp, blen, (char *) &rsi->nid, sizeof(rsi->nid));
+	qword_addhex(bpp, blen, (char *) &index, sizeof(index));
+	qword_addhex(bpp, blen, rsi->in_handle.data, rsi->in_handle.len);
+	qword_addhex(bpp, blen, rsi->in_token.data, rsi->in_token.len);
+	/* overwrite the last character written with the line terminator */
+	(*bpp)[-1] = '\n';
+}
+
+/* cache_upcall hook of rsi_cache: send the entry to user space, formatted
+ * by rsi_request() */
+static int rsi_upcall(struct cache_detail *cd, struct cache_head *h)
+{
+	return sunrpc_cache_pipe_upcall(cd, h, rsi_request);
+}
+
+/*
+ * Initialise the key part of @new from @item.  The key buffers are moved,
+ * not copied: afterwards @item's in_handle and in_token are empty and @new
+ * owns them.  The result fields of @new start out empty.
+ */
+static inline void __rsi_init(struct rsi *new, struct rsi *item)
+{
+	new->out_handle = RAWOBJ_EMPTY;
+	new->out_token = RAWOBJ_EMPTY;
+
+	new->in_handle = item->in_handle;
+	item->in_handle = RAWOBJ_EMPTY;
+	new->in_token = item->in_token;
+	item->in_token = RAWOBJ_EMPTY;
+
+	new->lustre_svc = item->lustre_svc;
+	new->nid = item->nid;
+	init_waitqueue_head(&new->waitq);
+}
+
+/*
+ * Install the result part of @item into @new.  The out_handle and out_token
+ * buffers are moved (leaving @item's copies empty); the status codes are
+ * copied.  @new must not already carry a result.
+ */
+static inline void __rsi_update(struct rsi *new, struct rsi *item)
+{
+	LASSERT(new->out_handle.len == 0);
+	LASSERT(new->out_token.len == 0);
+
+	new->out_handle = item->out_handle;
+	item->out_handle = RAWOBJ_EMPTY;
+	new->out_token = item->out_token;
+	item->out_token = RAWOBJ_EMPTY;
+
+	new->major_status = item->major_status;
+	new->minor_status = item->minor_status;
+}
+
+/*
+ * cache_put hook of rsi_cache, called with the kref embedded in the entry's
+ * cache_head: free the entry's buffers and the entry itself.  The entry is
+ * expected to be off its hash chain by now.
+ */
+static void rsi_put(struct kref *ref)
+{
+	struct rsi *rsi = container_of(ref, struct rsi, h.ref);
+
+	LASSERT(rsi->h.next == NULL);
+	rsi_free(rsi);
+	OBD_FREE_PTR(rsi);
+}
+
+/* match hook of rsi_cache: compare the rsi entries embedding @a and @b */
+static int rsi_match(struct cache_head *a, struct cache_head *b)
+{
+	return __rsi_match(container_of(a, struct rsi, h),
+			   container_of(b, struct rsi, h));
+}
+
+/* init hook of rsi_cache: set up the key of the rsi embedding @cnew from
+ * the one embedding @citem */
+static void rsi_init(struct cache_head *cnew, struct cache_head *citem)
+{
+	__rsi_init(container_of(cnew, struct rsi, h),
+		   container_of(citem, struct rsi, h));
+}
+
+/* update hook of rsi_cache: install the result of the rsi embedding @citem
+ * into the one embedding @cnew */
+static void update_rsi(struct cache_head *cnew, struct cache_head *citem)
+{
+	__rsi_update(container_of(cnew, struct rsi, h),
+		     container_of(citem, struct rsi, h));
+}
+
+/* alloc hook of rsi_cache: allocate one rsi and return its embedded
+ * cache_head, or NULL when out of memory */
+static struct cache_head *rsi_alloc(void)
+{
+	struct rsi *entry;
+
+	OBD_ALLOC_PTR(entry);
+
+	return entry ? &entry->h : NULL;
+}
+
+/*
+ * cache_parse hook of rsi_cache: handle one downcall line.
+ *
+ * Fields are consumed in this order: in_handle, in_token, expiry, major
+ * status, minor status, out_handle, out_token.  The first two select the
+ * rsi entry; the remainder is installed into it via rsi_update().  Whenever
+ * an entry was found, threads sleeping on its waitq are woken, even if the
+ * rest of the message turned out to be malformed.
+ *
+ * Returns 0 on success, -EINVAL for a malformed message, -ENOMEM when an
+ * allocation or the cache lookup/update fails.
+ */
+static int rsi_parse(struct cache_detail *cd, char *mesg, int mlen)
+{
+	char	   *buf = mesg;
+	char	   *ep;
+	int	     len;
+	struct rsi      rsii, *rsip = NULL;
+	time_t	  expiry;
+	int	     status = -EINVAL;
+	ENTRY;
+
+	memset(&rsii, 0, sizeof(rsii));
+
+	/* handle */
+	len = qword_get(&mesg, buf, mlen);
+	if (len < 0)
+		goto out;
+	if (rawobj_alloc(&rsii.in_handle, buf, len)) {
+		status = -ENOMEM;
+		goto out;
+	}
+
+	/* token */
+	len = qword_get(&mesg, buf, mlen);
+	if (len < 0)
+		goto out;
+	if (rawobj_alloc(&rsii.in_token, buf, len)) {
+		status = -ENOMEM;
+		goto out;
+	}
+
+	rsip = rsi_lookup(&rsii);
+	if (!rsip) {
+		status = -ENOMEM;
+		goto out;
+	}
+
+	rsii.h.flags = 0;
+	/* expiry */
+	expiry = get_expiry(&mesg);
+	if (expiry == 0)
+		goto out;
+
+	len = qword_get(&mesg, buf, mlen);
+	if (len <= 0)
+		goto out;
+
+	/* major */
+	rsii.major_status = simple_strtol(buf, &ep, 10);
+	if (*ep)
+		goto out;
+
+	/* minor */
+	len = qword_get(&mesg, buf, mlen);
+	if (len <= 0)
+		goto out;
+	rsii.minor_status = simple_strtol(buf, &ep, 10);
+	if (*ep)
+		goto out;
+
+	/* out_handle */
+	len = qword_get(&mesg, buf, mlen);
+	if (len < 0)
+		goto out;
+	if (rawobj_alloc(&rsii.out_handle, buf, len)) {
+		status = -ENOMEM;
+		goto out;
+	}
+
+	/* out_token */
+	len = qword_get(&mesg, buf, mlen);
+	if (len < 0)
+		goto out;
+	if (rawobj_alloc(&rsii.out_token, buf, len)) {
+		status = -ENOMEM;
+		goto out;
+	}
+
+	rsii.h.expiry_time = expiry;
+	rsip = rsi_update(&rsii, rsip);
+	/* a NULL result means the update could not allocate */
+	status = rsip ? 0 : -ENOMEM;
+out:
+	rsi_free(&rsii);
+	/* status is deliberately not touched here, so a malformed message
+	 * seen before the lookup still reports -EINVAL */
+	if (rsip) {
+		wake_up_all(&rsip->waitq);
+		cache_put(&rsip->h, &rsi_cache);
+	}
+
+	if (status)
+		CERROR("rsi parse error %d\n", status);
+	RETURN(status);
+}
+
+/*
+ * Descriptor of the context-init cache ("auth.sptlrpc.init"): RSI_HASHMAX
+ * buckets in rsi_table, wired to the rsi_* hooks defined in this file.
+ */
+static struct cache_detail rsi_cache = {
+	.hash_size      = RSI_HASHMAX,
+	.hash_table     = rsi_table,
+	.name	   = "auth.sptlrpc.init",
+	.cache_put      = rsi_put,
+	.cache_upcall   = rsi_upcall,
+	.cache_parse    = rsi_parse,
+	.match	  = rsi_match,
+	.init	   = rsi_init,
+	.update	 = update_rsi,
+	.alloc	  = rsi_alloc,
+};
+
+/*
+ * Look @item up in rsi_cache by its key fields.  Returns the rsi wrapping
+ * whatever cache_head sunrpc_cache_lookup() hands back, or NULL if it
+ * returns none.
+ */
+static struct rsi *rsi_lookup(struct rsi *item)
+{
+	struct cache_head *found;
+
+	found = sunrpc_cache_lookup(&rsi_cache, &item->h, rsi_hash(item));
+
+	return found ? container_of(found, struct rsi, h) : NULL;
+}
+
+/*
+ * Push the content of @new into the cache entry @old through
+ * sunrpc_cache_update().  Returns the rsi wrapping the resulting cache_head,
+ * or NULL if none is returned.
+ */
+static struct rsi *rsi_update(struct rsi *new, struct rsi *old)
+{
+	struct cache_head *res;
+
+	res = sunrpc_cache_update(&rsi_cache, &new->h, &old->h,
+				  rsi_hash(new));
+
+	return res ? container_of(res, struct rsi, h) : NULL;
+}
+
+/****************************************
+ * rsc cache			    *
+ ****************************************/
+
+#define RSC_HASHBITS    (10)
+#define RSC_HASHMAX     (1 << RSC_HASHBITS)
+#define RSC_HASHMASK    (RSC_HASHMAX - 1)
+
+/*
+ * One entry of the "auth.sptlrpc.context" cache: an established server-side
+ * security context, keyed by its handle.
+ */
+struct rsc {
+	/* generic cache bookkeeping */
+	struct cache_head       h;
+	/* obd device this context belongs to; rsc_flush_target() matches it */
+	struct obd_device      *target;
+	/* lookup key: hashed by rsc_hash(), compared by __rsc_match() */
+	rawobj_t		handle;
+	/* the context itself; callers get a pointer to this member and the
+	 * rsc is recovered from it with container_of() */
+	struct gss_svc_ctx      ctx;
+};
+
+static struct cache_head *rsc_table[RSC_HASHMAX];
+static struct cache_detail rsc_cache;
+static struct rsc *rsc_update(struct rsc *new, struct rsc *old);
+static struct rsc *rsc_lookup(struct rsc *item);
+
+/*
+ * Release what @rsci owns: its handle, the reverse handle and the mechanism
+ * context.  The structure itself is not freed, so this also works on an
+ * on-stack rsc.
+ */
+static void rsc_free(struct rsc *rsci)
+{
+	rawobj_free(&rsci->handle);
+	rawobj_free(&rsci->ctx.gsc_rvs_hdl);
+	lgss_delete_sec_context(&rsci->ctx.gsc_mechctx);
+}
+
+/* hash bucket for an rsc entry, derived from the bytes of its handle */
+static inline int rsc_hash(struct rsc *rsci)
+{
+	return hash_mem((char *)rsci->handle.data,
+			rsci->handle.len, RSC_HASHBITS);
+}
+
+/* two rsc entries match when their handles are byte-for-byte equal */
+static inline int __rsc_match(struct rsc *new, struct rsc *tmp)
+{
+	return rawobj_equal(&new->handle, &tmp->handle);
+}
+
+/*
+ * Initialise the key part of @new from @tmp.  The handle buffer is moved,
+ * not copied (@tmp's handle is left empty); target and context of @new
+ * start out cleared.
+ */
+static inline void __rsc_init(struct rsc *new, struct rsc *tmp)
+{
+	new->handle = tmp->handle;
+	tmp->handle = RAWOBJ_EMPTY;
+
+	new->target = NULL;
+	memset(&new->ctx, 0, sizeof(new->ctx));
+	new->ctx.gsc_rvs_hdl = RAWOBJ_EMPTY;
+}
+
+/*
+ * Install the context of @tmp into @new.  The structure is copied wholesale,
+ * then the reverse handle and mechanism context are detached from @tmp so
+ * that @new becomes their only owner.  The sequence data is reset rather
+ * than inherited, and its lock is initialised.
+ */
+static inline void __rsc_update(struct rsc *new, struct rsc *tmp)
+{
+	new->ctx = tmp->ctx;
+	tmp->ctx.gsc_rvs_hdl = RAWOBJ_EMPTY;
+	tmp->ctx.gsc_mechctx = NULL;
+
+	memset(&new->ctx.gsc_seqdata, 0, sizeof(new->ctx.gsc_seqdata));
+	spin_lock_init(&new->ctx.gsc_seqdata.ssd_lock);
+}
+
+/*
+ * cache_put hook of rsc_cache, called with the kref embedded in the entry's
+ * cache_head: free the entry's resources and the entry itself.  The entry
+ * is expected to be off its hash chain by now.
+ */
+static void rsc_put(struct kref *ref)
+{
+	struct rsc *rsci = container_of(ref, struct rsc, h.ref);
+
+	LASSERT(rsci->h.next == NULL);
+	rsc_free(rsci);
+	OBD_FREE_PTR(rsci);
+}
+
+/* match hook of rsc_cache: compare the rsc entries embedding @a and @b */
+static int rsc_match(struct cache_head *a, struct cache_head *b)
+{
+	return __rsc_match(container_of(a, struct rsc, h),
+			   container_of(b, struct rsc, h));
+}
+
+/* init hook of rsc_cache: set up the key of the rsc embedding @cnew from
+ * the one embedding @ctmp */
+static void rsc_init(struct cache_head *cnew, struct cache_head *ctmp)
+{
+	__rsc_init(container_of(cnew, struct rsc, h),
+		   container_of(ctmp, struct rsc, h));
+}
+
+/* update hook of rsc_cache: install the context of the rsc embedding @ctmp
+ * into the one embedding @cnew */
+static void update_rsc(struct cache_head *cnew, struct cache_head *ctmp)
+{
+	__rsc_update(container_of(cnew, struct rsc, h),
+		     container_of(ctmp, struct rsc, h));
+}
+
+/* alloc hook of rsc_cache: allocate one rsc and return its embedded
+ * cache_head, or NULL when out of memory */
+static struct cache_head * rsc_alloc(void)
+{
+	struct rsc *entry;
+
+	OBD_ALLOC_PTR(entry);
+
+	return entry ? &entry->h : NULL;
+}
+
+/*
+ * cache_parse hook of rsc_cache: handle one downcall line describing a
+ * security context.
+ *
+ * Fields are consumed in this order: context handle, expiry, remote flag,
+ * root user flag, mds user flag, oss user flag, mapped uid, uid.  If the
+ * uid field is absent the entry is marked negative; otherwise gid, mechanism
+ * name and mechanism-specific context data follow, and the expiry is taken
+ * from the imported mechanism context rather than from the message.
+ *
+ * Returns 0 on success, -EINVAL for a malformed message, -EOPNOTSUPP for an
+ * unknown mechanism, -ENOMEM when an allocation or the cache lookup/update
+ * fails.
+ */
+static int rsc_parse(struct cache_detail *cd, char *mesg, int mlen)
+{
+	char		*buf = mesg;
+	int		  len, rv, tmp_int;
+	struct rsc	   rsci, *rscp = NULL;
+	time_t	       expiry;
+	int		  status = -EINVAL;
+	struct gss_api_mech *gm = NULL;
+
+	memset(&rsci, 0, sizeof(rsci));
+
+	/* context handle */
+	len = qword_get(&mesg, buf, mlen);
+	if (len < 0)
+		goto out;
+	if (rawobj_alloc(&rsci.handle, buf, len)) {
+		status = -ENOMEM;
+		goto out;
+	}
+
+	rsci.h.flags = 0;
+	/* expiry */
+	expiry = get_expiry(&mesg);
+	if (expiry == 0)
+		goto out;
+
+	/* remote flag */
+	rv = get_int(&mesg, &tmp_int);
+	if (rv) {
+		CERROR("fail to get remote flag\n");
+		goto out;
+	}
+	rsci.ctx.gsc_remote = (tmp_int != 0);
+
+	/* root user flag */
+	rv = get_int(&mesg, &tmp_int);
+	if (rv) {
+		CERROR("fail to get root user flag\n");
+		goto out;
+	}
+	rsci.ctx.gsc_usr_root = (tmp_int != 0);
+
+	/* mds user flag */
+	rv = get_int(&mesg, &tmp_int);
+	if (rv) {
+		CERROR("fail to get mds user flag\n");
+		goto out;
+	}
+	rsci.ctx.gsc_usr_mds = (tmp_int != 0);
+
+	/* oss user flag */
+	rv = get_int(&mesg, &tmp_int);
+	if (rv) {
+		CERROR("fail to get oss user flag\n");
+		goto out;
+	}
+	rsci.ctx.gsc_usr_oss = (tmp_int != 0);
+
+	/* mapped uid */
+	rv = get_int(&mesg, (int *) &rsci.ctx.gsc_mapped_uid);
+	if (rv) {
+		CERROR("fail to get mapped uid\n");
+		goto out;
+	}
+
+	rscp = rsc_lookup(&rsci);
+	if (!rscp) {
+		status = -ENOMEM;
+		goto out;
+	}
+
+	/* uid, or NEGATIVE */
+	rv = get_int(&mesg, (int *) &rsci.ctx.gsc_uid);
+	if (rv == -EINVAL)
+		goto out;
+	if (rv == -ENOENT) {
+		CERROR("NOENT? set rsc entry negative\n");
+		set_bit(CACHE_NEGATIVE, &rsci.h.flags);
+	} else {
+		rawobj_t tmp_buf;
+		unsigned long ctx_expiry;
+
+		/* gid */
+		if (get_int(&mesg, (int *) &rsci.ctx.gsc_gid))
+			goto out;
+
+		/* mech name */
+		len = qword_get(&mesg, buf, mlen);
+		if (len < 0)
+			goto out;
+		gm = lgss_name_to_mech(buf);
+		if (!gm) {
+			status = -EOPNOTSUPP;
+			goto out;
+		}
+
+		/* mech-specific data: */
+		len = qword_get(&mesg, buf, mlen);
+		if (len < 0)
+			goto out;
+
+		tmp_buf.len = len;
+		tmp_buf.data = (unsigned char *)buf;
+		if (lgss_import_sec_context(&tmp_buf, gm,
+					    &rsci.ctx.gsc_mechctx))
+			goto out;
+
+		/* currently the expiry time passed down from user-space
+		 * is invalid, here we retrieve it from mech. */
+		if (lgss_inquire_context(rsci.ctx.gsc_mechctx, &ctx_expiry)) {
+			CERROR("unable to get expire time, drop it\n");
+			goto out;
+		}
+		expiry = (time_t) ctx_expiry;
+	}
+
+	rsci.h.expiry_time = expiry;
+	rscp = rsc_update(&rsci, rscp);
+	/* a NULL result means the update could not allocate */
+	status = rscp ? 0 : -ENOMEM;
+out:
+	if (gm)
+		lgss_mech_put(gm);
+	rsc_free(&rsci);
+	/* status is deliberately not touched here, so a malformed message
+	 * seen before the lookup still reports -EINVAL */
+	if (rscp)
+		cache_put(&rscp->h, &rsc_cache);
+
+	if (status)
+		CERROR("parse rsc error %d\n", status);
+	return status;
+}
+
+/*
+ * Descriptor of the established-context cache ("auth.sptlrpc.context"):
+ * RSC_HASHMAX buckets in rsc_table, wired to the rsc_* hooks defined in this
+ * file.  Unlike rsi_cache it sets no cache_upcall hook.
+ */
+static struct cache_detail rsc_cache = {
+	.hash_size      = RSC_HASHMAX,
+	.hash_table     = rsc_table,
+	.name	   = "auth.sptlrpc.context",
+	.cache_put      = rsc_put,
+	.cache_parse    = rsc_parse,
+	.match	  = rsc_match,
+	.init	   = rsc_init,
+	.update	 = update_rsc,
+	.alloc	  = rsc_alloc,
+};
+
+/*
+ * Look @item up in rsc_cache by its handle.  Returns the rsc wrapping
+ * whatever cache_head sunrpc_cache_lookup() hands back, or NULL if it
+ * returns none.
+ */
+static struct rsc *rsc_lookup(struct rsc *item)
+{
+	struct cache_head *found;
+
+	found = sunrpc_cache_lookup(&rsc_cache, &item->h, rsc_hash(item));
+
+	return found ? container_of(found, struct rsc, h) : NULL;
+}
+
+/*
+ * Push the content of @new into the cache entry @old through
+ * sunrpc_cache_update().  Returns the rsc wrapping the resulting cache_head,
+ * or NULL if none is returned.
+ */
+static struct rsc *rsc_update(struct rsc *new, struct rsc *old)
+{
+	struct cache_head *res;
+
+	res = sunrpc_cache_update(&rsc_cache, &new->h, &old->h,
+				  rsc_hash(new));
+
+	return res ? container_of(res, struct rsc, h) : NULL;
+}
+
+/* drop one reference on a cache entry; a plain alias for cache_put() */
+#define COMPAT_RSC_PUT(item, cd)	cache_put((item), (cd))
+
+/****************************************
+ * rsc cache flush		      *
+ ****************************************/
+
+/* predicate used by rsc_flush(): non-zero selects the entry for removal */
+typedef int rsc_entry_match(struct rsc *rscp, long data);
+
+/*
+ * Remove from rsc_cache every entry for which @match(entry, @data) returns
+ * non-zero.  All hash chains are walked with the cache's hash_lock held for
+ * writing.
+ */
+static void rsc_flush(rsc_entry_match *match, long data)
+{
+	struct cache_head **ch;
+	struct rsc *rscp;
+	int n;
+	ENTRY;
+
+	write_lock(&rsc_cache.hash_lock);
+	for (n = 0; n < RSC_HASHMAX; n++) {
+		/* ch points at the link leading to the current entry, so an
+		 * entry can be unlinked without tracking its predecessor */
+		for (ch = &rsc_cache.hash_table[n]; *ch;) {
+			rscp = container_of(*ch, struct rsc, h);
+
+			if (!match(rscp, data)) {
+				ch = &((*ch)->next);
+				continue;
+			}
+
+			/* it seems simply set NEGATIVE doesn't work */
+			*ch = (*ch)->next;
+			rscp->h.next = NULL;
+			/* take a reference and drop one again right after
+			 * marking the entry negative */
+			cache_get(&rscp->h);
+			set_bit(CACHE_NEGATIVE, &rscp->h.flags);
+			COMPAT_RSC_PUT(&rscp->h, &rsc_cache);
+			rsc_cache.entries--;
+		}
+	}
+	write_unlock(&rsc_cache.hash_lock);
+	EXIT;
+}
+
+/*
+ * rsc_flush() predicate: select entries whose context uid equals @uid.  A
+ * @uid of -1 acts as a wildcard and selects every entry.
+ */
+static int match_uid(struct rsc *rscp, long uid)
+{
+	const int wanted = (int) uid;
+
+	if (wanted == -1)
+		return 1;
+
+	return ((int) rscp->ctx.gsc_uid == wanted);
+}
+
+/* rsc_flush() predicate: select entries bound to the obd device whose
+ * address was passed, cast to long, in @target */
+static int match_target(struct rsc *rscp, long target)
+{
+	return (rscp->target == (struct obd_device *) target);
+}
+
+/* flush all contexts belonging to @uid; -1 flushes every context and logs
+ * a warning first */
+static inline void rsc_flush_uid(int uid)
+{
+	if (uid == -1)
+		CWARN("flush all gss contexts...\n");
+
+	rsc_flush(match_uid, (long) uid);
+}
+
+/* flush all contexts bound to @target */
+static inline void rsc_flush_target(struct obd_device *target)
+{
+	rsc_flush(match_target, (long) target);
+}
+
+/* exported entry point: drop every cached server context bound to @target */
+void gss_secsvc_flush(struct obd_device *target)
+{
+	rsc_flush_target(target);
+}
+EXPORT_SYMBOL(gss_secsvc_flush);
+
+/*
+ * Find the rsc entry whose handle equals @handle.
+ *
+ * A temporary key is built from a copy of @handle, looked up and released
+ * again.  The entry is only returned if cache_check() reports 0 for it.
+ * Callers in this file release the returned entry with COMPAT_RSC_PUT().
+ *
+ * Returns the entry, or NULL if the handle cannot be copied, nothing is
+ * found, or the entry does not pass cache_check().
+ */
+static struct rsc *gss_svc_searchbyctx(rawobj_t *handle)
+{
+	struct rsc  rsci;
+	struct rsc *found;
+
+	memset(&rsci, 0, sizeof(rsci));
+	if (rawobj_dup(&rsci.handle, handle))
+		return NULL;
+
+	found = rsc_lookup(&rsci);
+	rsc_free(&rsci);
+	if (!found)
+		return NULL;
+	if (cache_check(&rsc_cache, &found->h, NULL))
+		return NULL;
+	return found;
+}
+
+/*
+ * Install a reverse service context for @imp into rsc_cache.
+ *
+ * The cache entry is keyed by the bytes of gsec->gs_rvs_hdl.  Its mechanism
+ * context is a reverse copy of gctx->gc_mechctx and its expiry is taken from
+ * that copy.  Exactly one of the mds/oss/root user flags is set, chosen by
+ * the obd type name of the import.  On success a copy of the entry's handle
+ * is stored in gctx->gc_svc_handle.
+ *
+ * Returns 0 on success, -ENOMEM or -EINVAL on failure.
+ */
+int gss_svc_upcall_install_rvs_ctx(struct obd_import *imp,
+				   struct gss_sec *gsec,
+				   struct gss_cli_ctx *gctx)
+{
+	struct rsc      rsci, *rscp = NULL;
+	unsigned long   ctx_expiry;
+	__u32	   major;
+	int	     rc;
+	ENTRY;
+
+	memset(&rsci, 0, sizeof(rsci));
+
+	if (rawobj_alloc(&rsci.handle, (char *) &gsec->gs_rvs_hdl,
+			 sizeof(gsec->gs_rvs_hdl)))
+		GOTO(out, rc = -ENOMEM);
+
+	rscp = rsc_lookup(&rsci);
+	if (rscp == NULL)
+		GOTO(out, rc = -ENOMEM);
+
+	major = lgss_copy_reverse_context(gctx->gc_mechctx,
+					  &rsci.ctx.gsc_mechctx);
+	if (major != GSS_S_COMPLETE)
+		GOTO(out, rc = -ENOMEM);
+
+	if (lgss_inquire_context(rsci.ctx.gsc_mechctx, &ctx_expiry)) {
+		CERROR("unable to get expire time, drop it\n");
+		GOTO(out, rc = -EINVAL);
+	}
+	rsci.h.expiry_time = (time_t) ctx_expiry;
+
+	if (strcmp(imp->imp_obd->obd_type->typ_name, LUSTRE_MDC_NAME) == 0)
+		rsci.ctx.gsc_usr_mds = 1;
+	else if (strcmp(imp->imp_obd->obd_type->typ_name, LUSTRE_OSC_NAME) == 0)
+		rsci.ctx.gsc_usr_oss = 1;
+	else
+		rsci.ctx.gsc_usr_root = 1;
+
+	rscp = rsc_update(&rsci, rscp);
+	if (rscp == NULL)
+		GOTO(out, rc = -ENOMEM);
+
+	rscp->target = imp->imp_obd;
+	if (rawobj_dup(&gctx->gc_svc_handle, &rscp->handle)) {
+		/* Without its copy of the handle the client context can
+		 * never refer to this entry again, so do not report
+		 * success.  Invalidate the entry the same way
+		 * gss_svc_upcall_destroy_ctx() does. */
+		set_bit(CACHE_NEGATIVE, &rscp->h.flags);
+		rscp->h.expiry_time = 1;
+		GOTO(out, rc = -ENOMEM);
+	}
+
+	CWARN("create reverse svc ctx %p to %s: idx "LPX64"\n",
+	      &rscp->ctx, obd2cli_tgt(imp->imp_obd), gsec->gs_rvs_hdl);
+	rc = 0;
+out:
+	if (rscp)
+		cache_put(&rscp->h, &rsc_cache);
+	rsc_free(&rsci);
+
+	if (rc)
+		CERROR("create reverse svc ctx: idx "LPX64", rc %d\n",
+		       gsec->gs_rvs_hdl, rc);
+	RETURN(rc);
+}
+
+/*
+ * Schedule the reverse context identified by @handle for expiry: if it is
+ * found, its expiry time is set to 20 seconds from now.  An unknown handle
+ * is silently ignored.  Always returns 0.
+ */
+int gss_svc_upcall_expire_rvs_ctx(rawobj_t *handle)
+{
+	const cfs_time_t	expire = 20;
+	struct rsc	     *rscp;
+
+	rscp = gss_svc_searchbyctx(handle);
+	if (rscp) {
+		CDEBUG(D_SEC, "reverse svcctx %p (rsc %p) expire soon\n",
+		       &rscp->ctx, rscp);
+
+		rscp->h.expiry_time = cfs_time_current_sec() + expire;
+		COMPAT_RSC_PUT(&rscp->h, &rsc_cache);
+	}
+	return 0;
+}
+
+/*
+ * Copy into @handle the handle of the rsc entry that embeds @ctx.  Returns
+ * the result of rawobj_dup(): 0 on success, -ENOMEM on allocation failure.
+ */
+int gss_svc_upcall_dup_handle(rawobj_t *handle, struct gss_svc_ctx *ctx)
+{
+	struct rsc *rscp = container_of(ctx, struct rsc, ctx);
+
+	return rawobj_dup(handle, &rscp->handle);
+}
+
+/*
+ * Record @seq + 1 as the reverse sequence number of the context identified
+ * by @handle.  An unknown handle is silently ignored.  Always returns 0.
+ */
+int gss_svc_upcall_update_sequence(rawobj_t *handle, __u32 seq)
+{
+	struct rsc	     *rscp;
+
+	rscp = gss_svc_searchbyctx(handle);
+	if (rscp) {
+		CDEBUG(D_SEC, "reverse svcctx %p (rsc %p) update seq to %u\n",
+		       &rscp->ctx, rscp, seq + 1);
+
+		rscp->ctx.gsc_rvs_seq = seq + 1;
+		COMPAT_RSC_PUT(&rscp->h, &rsc_cache);
+	}
+	return 0;
+}
+
+/*
+ * Defer callback that never defers: it always returns NULL.  The request
+ * handle built around it is what gss_svc_upcall_handle_init() passes to
+ * cache_check() for rsi_cache.
+ */
+static struct cache_deferred_req* cache_upcall_defer(struct cache_req *req)
+{
+	return NULL;
+}
+static struct cache_req cache_upcall_chandle = { cache_upcall_defer };
+
+/*
+ * Handle a context-init request on the server side.
+ *
+ * An rsi key is built from the wire context handle and @in_token and looked
+ * up in rsi_cache.  cache_check() is then called on the entry; if it returns
+ * -EAGAIN the first time, the thread sleeps for up to GSS_SVC_UPCALL_TIMEOUT
+ * seconds (unless the entry is already valid) and checks once more.  With a
+ * valid entry, the rsc named by its out_handle is looked up, bound to
+ * @target and given a copy of @rvs_hdl, and the reply (header carrying the
+ * status codes and out_handle, plus out_token) is packed into @req.
+ *
+ * Returns SECSVC_OK on success, SECSVC_COMPLETE when an error notification
+ * was packed for the client instead, and SECSVC_DROP otherwise.
+ */
+int gss_svc_upcall_handle_init(struct ptlrpc_request *req,
+			       struct gss_svc_reqctx *grctx,
+			       struct gss_wire_ctx *gw,
+			       struct obd_device *target,
+			       __u32 lustre_svc,
+			       rawobj_t *rvs_hdl,
+			       rawobj_t *in_token)
+{
+	struct ptlrpc_reply_state *rs;
+	struct rsc		*rsci = NULL;
+	struct rsi		*rsip = NULL, rsikey;
+	wait_queue_t	     wait;
+	int			replen = sizeof(struct ptlrpc_body);
+	struct gss_rep_header     *rephdr;
+	int			first_check = 1;
+	int			rc = SECSVC_DROP;
+	ENTRY;
+
+	memset(&rsikey, 0, sizeof(rsikey));
+	rsikey.lustre_svc = lustre_svc;
+	rsikey.nid = (__u64) req->rq_peer.nid;
+
+	/* duplicate context handle. for INIT it always 0 */
+	if (rawobj_dup(&rsikey.in_handle, &gw->gw_handle)) {
+		CERROR("fail to dup context handle\n");
+		GOTO(out, rc);
+	}
+
+	if (rawobj_dup(&rsikey.in_token, in_token)) {
+		CERROR("can't duplicate token\n");
+		rawobj_free(&rsikey.in_handle);
+		GOTO(out, rc);
+	}
+
+	rsip = rsi_lookup(&rsikey);
+	rsi_free(&rsikey);
+	if (!rsip) {
+		CERROR("error in rsi_lookup.\n");
+
+		if (!gss_pack_err_notify(req, GSS_S_FAILURE, 0))
+			rc = SECSVC_COMPLETE;
+
+		GOTO(out, rc);
+	}
+
+	cache_get(&rsip->h); /* take an extra ref */
+	/* NOTE(review): rsip may be an entry that already existed in the
+	 * cache; re-initialising its waitq here looks unsafe if another
+	 * thread is already queued on it -- confirm. */
+	init_waitqueue_head(&rsip->waitq);
+	init_waitqueue_entry_current(&wait);
+	add_wait_queue(&rsip->waitq, &wait);
+
+cache_check:
+	/* Note each time cache_check() will drop a reference if return
+	 * non-zero. We hold an extra reference on initial rsip, but must
+	 * take care of following calls. */
+	rc = cache_check(&rsi_cache, &rsip->h, &cache_upcall_chandle);
+	switch (rc) {
+	case -EAGAIN: {
+		int valid;
+
+		if (first_check) {
+			first_check = 0;
+
+			read_lock(&rsi_cache.hash_lock);
+			valid = test_bit(CACHE_VALID, &rsip->h.flags);
+			if (valid == 0)
+				set_current_state(TASK_INTERRUPTIBLE);
+			read_unlock(&rsi_cache.hash_lock);
+
+			if (valid == 0)
+				schedule_timeout(GSS_SVC_UPCALL_TIMEOUT *
+						     HZ);
+
+			/* re-take the reference the failed check dropped
+			 * before checking a second time */
+			cache_get(&rsip->h);
+			goto cache_check;
+		}
+		CWARN("waited %ds timeout, drop\n", GSS_SVC_UPCALL_TIMEOUT);
+		break;
+	}
+	case -ENOENT:
+		CWARN("cache_check return ENOENT, drop\n");
+		break;
+	case 0:
+		/* if not the first check, we have to release the extra
+		 * reference we just added on it. */
+		if (!first_check)
+			cache_put(&rsip->h, &rsi_cache);
+		CDEBUG(D_SEC, "cache_check is good\n");
+		break;
+	}
+
+	remove_wait_queue(&rsip->waitq, &wait);
+	cache_put(&rsip->h, &rsi_cache);
+
+	/* any non-zero result of cache_check(), including values not listed
+	 * in the switch above, drops the request */
+	if (rc)
+		GOTO(out, rc = SECSVC_DROP);
+
+	rc = SECSVC_DROP;
+	rsci = gss_svc_searchbyctx(&rsip->out_handle);
+	if (!rsci) {
+		CERROR("authentication failed\n");
+
+		if (!gss_pack_err_notify(req, GSS_S_FAILURE, 0))
+			rc = SECSVC_COMPLETE;
+
+		GOTO(out, rc);
+	} else {
+		/* this reference is the one that stays with
+		 * grctx->src_ctx; the one from the search is dropped at
+		 * out: */
+		cache_get(&rsci->h);
+		grctx->src_ctx = &rsci->ctx;
+	}
+
+	if (rawobj_dup(&rsci->ctx.gsc_rvs_hdl, rvs_hdl)) {
+		CERROR("failed duplicate reverse handle\n");
+		GOTO(out, rc);
+	}
+
+	rsci->target = target;
+
+	CDEBUG(D_SEC, "server create rsc %p(%u->%s)\n",
+	       rsci, rsci->ctx.gsc_uid, libcfs_nid2str(req->rq_peer.nid));
+
+	if (rsip->out_handle.len > PTLRPC_GSS_MAX_HANDLE_SIZE) {
+		CERROR("handle size %u too large\n", rsip->out_handle.len);
+		GOTO(out, rc = SECSVC_DROP);
+	}
+
+	grctx->src_init = 1;
+	grctx->src_reserve_len = cfs_size_round4(rsip->out_token.len);
+
+	rc = lustre_pack_reply_v2(req, 1, &replen, NULL, 0);
+	if (rc) {
+		CERROR("failed to pack reply: %d\n", rc);
+		GOTO(out, rc = SECSVC_DROP);
+	}
+
+	rs = req->rq_reply_state;
+	LASSERT(rs->rs_repbuf->lm_bufcount == 3);
+	LASSERT(rs->rs_repbuf->lm_buflens[0] >=
+		sizeof(*rephdr) + rsip->out_handle.len);
+	LASSERT(rs->rs_repbuf->lm_buflens[2] >= rsip->out_token.len);
+
+	/* buffer 0 carries the gss reply header including out_handle */
+	rephdr = lustre_msg_buf(rs->rs_repbuf, 0, 0);
+	rephdr->gh_version = PTLRPC_GSS_VERSION;
+	rephdr->gh_flags = 0;
+	rephdr->gh_proc = PTLRPC_GSS_PROC_ERR;
+	rephdr->gh_major = rsip->major_status;
+	rephdr->gh_minor = rsip->minor_status;
+	rephdr->gh_seqwin = GSS_SEQ_WIN;
+	rephdr->gh_handle.len = rsip->out_handle.len;
+	memcpy(rephdr->gh_handle.data, rsip->out_handle.data,
+	       rsip->out_handle.len);
+
+	/* buffer 2 carries out_token and is then shrunk to its exact size */
+	memcpy(lustre_msg_buf(rs->rs_repbuf, 2, 0), rsip->out_token.data,
+	       rsip->out_token.len);
+
+	rs->rs_repdata_len = lustre_shrink_msg(rs->rs_repbuf, 2,
+					       rsip->out_token.len, 0);
+
+	rc = SECSVC_OK;
+
+out:
+	/* it looks like here we should put rsip also, but this mess up
+	 * with NFS cache mgmt code... FIXME */
+#if 0
+	if (rsip)
+		rsi_put(&rsip->h, &rsi_cache);
+#endif
+
+	if (rsci) {
+		/* if anything went wrong, we don't keep the context too */
+		if (rc != SECSVC_OK)
+			set_bit(CACHE_NEGATIVE, &rsci->h.flags);
+		else
+			CDEBUG(D_SEC, "create rsc with idx "LPX64"\n",
+			       gss_handle_to_u64(&rsci->handle));
+
+		COMPAT_RSC_PUT(&rsci->h, &rsc_cache);
+	}
+	RETURN(rc);
+}
+
+/*
+ * Find the service context named by the handle in @gw.  Returns a pointer
+ * to the context embedded in the cache entry, or NULL (after logging the
+ * peer) if there is no usable entry.  The result is released with
+ * gss_svc_upcall_put_ctx().
+ */
+struct gss_svc_ctx *gss_svc_upcall_get_ctx(struct ptlrpc_request *req,
+					   struct gss_wire_ctx *gw)
+{
+	struct rsc *rsc;
+
+	rsc = gss_svc_searchbyctx(&gw->gw_handle);
+	if (!rsc) {
+		CWARN("Invalid gss ctx idx "LPX64" from %s\n",
+		      gss_handle_to_u64(&gw->gw_handle),
+		      libcfs_nid2str(req->rq_peer.nid));
+		return NULL;
+	}
+
+	return &rsc->ctx;
+}
+
+/* drop one reference on the rsc cache entry that embeds @ctx */
+void gss_svc_upcall_put_ctx(struct gss_svc_ctx *ctx)
+{
+	struct rsc *rsc = container_of(ctx, struct rsc, ctx);
+
+	COMPAT_RSC_PUT(&rsc->h, &rsc_cache);
+}
+
+/*
+ * Invalidate the rsc cache entry that embeds @ctx: it is marked negative
+ * and given an expiry time of 1.  No reference is dropped here.
+ */
+void gss_svc_upcall_destroy_ctx(struct gss_svc_ctx *ctx)
+{
+	struct rsc *rsc = container_of(ctx, struct rsc, ctx);
+
+	/* can't be found */
+	set_bit(CACHE_NEGATIVE, &rsc->h.flags);
+	/* to be removed at next scan */
+	rsc->h.expiry_time = 1;
+}
+
+/*
+ * Set up the server-side upcall machinery: initialise the context index
+ * lock, seed the context index with random bytes, register both caches and
+ * then wait briefly for a reader to open the init channel.  Always returns
+ * 0.
+ */
+int __init gss_init_svc_upcall(void)
+{
+	int     i;
+
+	spin_lock_init(&__ctx_index_lock);
+	/*
+	 * this helps reduce context index conflicts. after server reboot,
+	 * conflicting requests from clients might be filtered out by initial
+	 * sequence number checking, thus no chance to send error notification
+	 * back to clients.
+	 */
+	cfs_get_random_bytes(&__ctx_index, sizeof(__ctx_index));
+
+
+	cache_register(&rsi_cache);
+	cache_register(&rsc_cache);
+
+	/* FIXME this looks stupid. we intend to give lsvcgssd a chance to open
+	 * the init upcall channel, otherwise there's a big chance that the
+	 * first upcall is issued before the channel is opened, thus nfsv4
+	 * cache code will drop the request directly, which leads to
+	 * unnecessary recovery time. here we wait at maximum 1.5 seconds
+	 * (6 rounds of HZ / 4). */
+	for (i = 0; i < 6; i++) {
+		if (atomic_read(&rsi_cache.readers) > 0)
+			break;
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		LASSERT(HZ >= 4);
+		schedule_timeout(HZ / 4);
+	}
+
+	if (atomic_read(&rsi_cache.readers) == 0)
+		CWARN("Init channel is not opened by lsvcgssd, following "
+		      "request might be dropped until lsvcgssd is active\n");
+
+	return 0;
+}
+
+/* tear down both caches: each one is purged first and then unregistered */
+void __exit gss_exit_svc_upcall(void)
+{
+	cache_purge(&rsi_cache);
+	cache_unregister(&rsi_cache);
+
+	cache_purge(&rsc_cache);
+	cache_unregister(&rsc_cache);
+}

diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/lproc_gss.c b/drivers/staging/lustre/lustre/ptlrpc/gss/lproc_gss.c
new file mode 100644
index 0000000..3404000
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/gss/lproc_gss.c

@@ -0,0 +1,219 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/dcache.h>
+#include <linux/fs.h>
+#include <linux/mutex.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_net.h>
+#include <lustre_import.h>
+#include <lprocfs_status.h>
+#include <lustre_sec.h>
+
+#include "gss_err.h"
+#include "gss_internal.h"
+#include "gss_api.h"
+
+/* proc directories created by gss_init_lproc(), removed by gss_exit_lproc() */
+static struct proc_dir_entry *gss_proc_root = NULL;
+static struct proc_dir_entry *gss_proc_lk = NULL;
+
+/*
+ * statistic of "out-of-sequence-window"
+ *
+ * oos_lock is not statically initialized here; gss_init_lproc() calls
+ * spin_lock_init() on it.  It guards oos_cli_behind, the other counters
+ * are atomics.
+ */
+static struct {
+	spinlock_t  oos_lock;
+	atomic_t    oos_cli_count;       /* client occurrence */
+	int	     oos_cli_behind;      /* client max seqs behind */
+	atomic_t    oos_svc_replay[3];   /* server replay detected */
+	atomic_t    oos_svc_pass[3];     /* server verified ok */
+} gss_stat_oos = {
+	.oos_cli_count  = ATOMIC_INIT(0),
+	.oos_cli_behind = 0,
+	.oos_svc_replay = { ATOMIC_INIT(0), },
+	.oos_svc_pass   = { ATOMIC_INIT(0), },
+};
+
+/*
+ * Record one client-side "fell behind the sequence window" event.
+ * @behind: how many sequence numbers the request was behind; the largest
+ *          value seen so far is kept in oos_cli_behind.
+ */
+void gss_stat_oos_record_cli(int behind)
+{
+	atomic_inc(&gss_stat_oos.oos_cli_count);
+
+	/* the compare-and-update of the maximum is done under oos_lock */
+	spin_lock(&gss_stat_oos.oos_lock);
+	if (behind > gss_stat_oos.oos_cli_behind)
+		gss_stat_oos.oos_cli_behind = behind;
+	spin_unlock(&gss_stat_oos.oos_lock);
+}
+
+/*
+ * Count one server-side sequence check result.
+ * @phase:  checking phase, 0..2 (asserted)
+ * @replay: non-zero bumps the "replay detected" counter of that phase,
+ *          zero bumps its "verified ok" counter
+ */
+void gss_stat_oos_record_svc(int phase, int replay)
+{
+	atomic_t *counters;
+
+	LASSERT(phase >= 0 && phase <= 2);
+
+	counters = replay ? gss_stat_oos.oos_svc_replay :
+			    gss_stat_oos.oos_svc_pass;
+	atomic_inc(&counters[phase]);
+}
+
+/*
+ * seq_file show handler: prints the two window sizes followed by the
+ * gss_stat_oos counters.  Of the "verify ok" counters only phase 2 is
+ * printed.  oos_cli_behind is read here without taking oos_lock.
+ */
+static int gss_proc_oos_seq_show(struct seq_file *m, void *v)
+{
+	return seq_printf(m,
+			"seqwin:		%u\n"
+			"backwin:	       %u\n"
+			"client fall behind seqwin\n"
+			"  occurrence:	  %d\n"
+			"  max seq behind:      %d\n"
+			"server replay detected:\n"
+			"  phase 0:	     %d\n"
+			"  phase 1:	     %d\n"
+			"  phase 2:	     %d\n"
+			"server verify ok:\n"
+			"  phase 2:	     %d\n",
+			GSS_SEQ_WIN_MAIN,
+			GSS_SEQ_WIN_BACK,
+			atomic_read(&gss_stat_oos.oos_cli_count),
+			gss_stat_oos.oos_cli_behind,
+			atomic_read(&gss_stat_oos.oos_svc_replay[0]),
+			atomic_read(&gss_stat_oos.oos_svc_replay[1]),
+			atomic_read(&gss_stat_oos.oos_svc_replay[2]),
+			atomic_read(&gss_stat_oos.oos_svc_pass[2]));
+}
+/* generates gss_proc_oos_fops, referenced from gss_lprocfs_vars below */
+LPROC_SEQ_FOPS_RO(gss_proc_oos);
+
+/*
+ * Write handler of the "init_channel" proc file: passes the written data
+ * to gss_do_ctx_init_rpc().  Returns @count on success, otherwise the
+ * (negative) error from gss_do_ctx_init_rpc().
+ */
+static int gss_proc_write_secinit(struct file *file, const char *buffer,
+				  size_t count, off_t *off)
+{
+	int status = gss_do_ctx_init_rpc((char *) buffer, count);
+
+	if (status == 0)
+		return count;
+
+	/* any failure has to be reported as a negative value */
+	LASSERT(status < 0);
+	return status;
+}
+
+/* write-only file operations behind the "init_channel" entry */
+static const struct file_operations gss_proc_secinit = {
+	.write = gss_proc_write_secinit,
+};
+
+/* entries created under the "gss" proc directory by gss_init_lproc() */
+static struct lprocfs_vars gss_lprocfs_vars[] = {
+	{ "replays", &gss_proc_oos_fops },
+	{ "init_channel", &gss_proc_secinit, NULL, 0222 },
+	{ NULL }
+};
+
+/*
+ * for userspace helper lgss_keyring.
+ *
+ * debug_level: [0, 4], defined in utils/gss/lgss_utils.h
+ */
+/* current debug level; only changed by gss_lk_proc_dl_seq_write() */
+static int gss_lk_debug_level = 1;
+
+/* seq_file show handler: prints the current debug level on one line */
+static int gss_lk_proc_dl_seq_show(struct seq_file *m, void *v)
+{
+	return seq_printf(m, "%u\n", gss_lk_debug_level);
+}
+
+/*
+ * Write handler of "debug_level": parses an integer with
+ * lprocfs_write_helper() and stores it when it lies in [0, 4].
+ * Returns @count on success, the helper's error, or -ERANGE.
+ */
+static int gss_lk_proc_dl_seq_write(struct file *file, const char *buffer,
+				    size_t count, off_t *off)
+{
+	int     level;
+	int     rc = lprocfs_write_helper(buffer, count, &level);
+
+	if (rc < 0)
+		return rc;
+
+	if (!(level >= 0 && level <= 4))
+		return -ERANGE;
+
+	gss_lk_debug_level = level;
+	return count;
+}
+LPROC_SEQ_FOPS(gss_lk_proc_dl);
+
+/* entries created under "gss/lgss_keyring" by gss_init_lproc() */
+static struct lprocfs_vars gss_lk_lprocfs_vars[] = {
+	{ "debug_level", &gss_lk_proc_dl_fops },
+	{ NULL }
+};
+
+/*
+ * Remove the proc directories created by gss_init_lproc(), child first.
+ * Each pointer is checked for NULL, so this is also usable as the cleanup
+ * path of a partially completed gss_init_lproc().
+ */
+void gss_exit_lproc(void)
+{
+	if (gss_proc_lk) {
+		lprocfs_remove(&gss_proc_lk);
+		gss_proc_lk = NULL;
+	}
+
+	if (gss_proc_root) {
+		lprocfs_remove(&gss_proc_root);
+		gss_proc_root = NULL;
+	}
+}
+
+/*
+ * Create the "gss" proc directory under sptlrpc_proc_root and its
+ * "lgss_keyring" sub-directory, and initialize the statistics lock.
+ *
+ * Returns 0 on success.  On failure everything created so far is removed
+ * through gss_exit_lproc() and the negative errno carried by the failed
+ * lprocfs_register() call is returned.
+ */
+int gss_init_lproc(void)
+{
+	int     rc;
+
+	spin_lock_init(&gss_stat_oos.oos_lock);
+
+	gss_proc_root = lprocfs_register("gss", sptlrpc_proc_root,
+					 gss_lprocfs_vars, NULL);
+	if (IS_ERR(gss_proc_root)) {
+		/* fetch the errno before the pointer is reset: PTR_ERR() of
+		 * a pointer that is already NULL is 0, i.e. "success" */
+		rc = PTR_ERR(gss_proc_root);
+		gss_proc_root = NULL;
+		GOTO(err_out, rc);
+	}
+
+	gss_proc_lk = lprocfs_register("lgss_keyring", gss_proc_root,
+				       gss_lk_lprocfs_vars, NULL);
+	if (IS_ERR(gss_proc_lk)) {
+		/* the error lives in gss_proc_lk, not in gss_proc_root */
+		rc = PTR_ERR(gss_proc_lk);
+		gss_proc_lk = NULL;
+		GOTO(err_out, rc);
+	}
+
+	return 0;
+
+err_out:
+	CERROR("failed to initialize gss lproc entries: %d\n", rc);
+	gss_exit_lproc();
+	return rc;
+}

diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/sec_gss.c b/drivers/staging/lustre/lustre/ptlrpc/gss/sec_gss.c
new file mode 100644
index 0000000..ebca858
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/gss/sec_gss.c

@@ -0,0 +1,2916 @@
+/*
+ * Modifications for Lustre
+ *
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+/*
+ * linux/net/sunrpc/auth_gss.c
+ *
+ * RPCSEC_GSS client authentication.
+ *
+ *  Copyright (c) 2000 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Dug Song       <dugsong@monkey.org>
+ *  Andy Adamson   <andros@umich.edu>
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *  3. Neither the name of the University nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/dcache.h>
+#include <linux/fs.h>
+#include <linux/mutex.h>
+#include <asm/atomic.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <obd_cksum.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_net.h>
+#include <lustre_import.h>
+#include <lustre_sec.h>
+
+#include "gss_err.h"
+#include "gss_internal.h"
+#include "gss_api.h"
+
+#include <linux/crypto.h>
+#include <linux/crc32.h>
+
+/*
+ * early reply have fixed size, respectively in privacy and integrity mode.
+ * so we calculate them only once.
+ */
+static int gss_at_reply_off_integ;
+static int gss_at_reply_off_priv;
+
+
+/* index of the last buffer segment of @msg; @msg must have a segment */
+static inline int msg_last_segidx(struct lustre_msg *msg)
+{
+	LASSERT(msg->lm_bufcount > 0);
+
+	return msg->lm_bufcount - 1;
+}
+
+/* length of the last buffer segment of @msg */
+static inline int msg_last_seglen(struct lustre_msg *msg)
+{
+	int last = msg_last_segidx(msg);
+
+	return msg->lm_buflens[last];
+}
+
+/********************************************
+ * wire data swabber			*
+ ********************************************/
+
+/*
+ * Byte-swap, in place, the 32-bit fields of a gss header.  gh_version and
+ * gh_sp are left untouched.
+ */
+static
+void gss_header_swabber(struct gss_header *ghdr)
+{
+	__swab32s(&ghdr->gh_flags);
+	__swab32s(&ghdr->gh_proc);
+	__swab32s(&ghdr->gh_seq);
+	__swab32s(&ghdr->gh_svc);
+	__swab32s(&ghdr->gh_pad1);
+	__swab32s(&ghdr->gh_handle.len);
+}
+
+/*
+ * Locate the gss header in buffer @segment of @msg, byte-swap it in place
+ * when @swabbed is set, and check that the handle it announces fits into
+ * the segment.
+ *
+ * Returns the header, or NULL when lustre_msg_buf() yields no buffer or
+ * the handle length exceeds the segment.
+ */
+struct gss_header *gss_swab_header(struct lustre_msg *msg, int segment,
+				   int swabbed)
+{
+	struct gss_header *ghdr;
+	__u32		   buflen;
+
+	ghdr = lustre_msg_buf(msg, segment, sizeof(*ghdr));
+	if (ghdr == NULL)
+		return NULL;
+
+	if (swabbed)
+		gss_header_swabber(ghdr);
+
+	/* gh_handle.len comes off the wire.  Compare it against the room
+	 * left after the fixed header instead of adding it to
+	 * sizeof(*ghdr): where size_t is 32 bits the sum can wrap around
+	 * and let an oversized length slip through. */
+	buflen = msg->lm_buflens[segment];
+	if (buflen < sizeof(*ghdr) ||
+	    ghdr->gh_handle.len > buflen - sizeof(*ghdr)) {
+		CERROR("gss header has length %d, now %u received\n",
+		       (int) sizeof(*ghdr) + ghdr->gh_handle.len,
+		       msg->lm_buflens[segment]);
+		return NULL;
+	}
+
+	return ghdr;
+}
+
+/* netobj swab helpers: compiled out by the #if 0 below */
+#if 0
+static
+void gss_netobj_swabber(netobj_t *obj)
+{
+	__swab32s(&obj->len);
+}
+
+netobj_t *gss_swab_netobj(struct lustre_msg *msg, int segment)
+{
+	netobj_t  *obj;
+
+	obj = lustre_swab_buf(msg, segment, sizeof(*obj), gss_netobj_swabber);
+	if (obj && sizeof(*obj) + obj->len > msg->lm_buflens[segment]) {
+		CERROR("netobj require length %u but only %u received\n",
+		       (unsigned int) sizeof(*obj) + obj->len,
+		       msg->lm_buflens[segment]);
+		return NULL;
+	}
+
+	return obj;
+}
+#endif
+
+/*
+ * payload should be obtained from mechanism. but currently since we
+ * only support kerberos, we could simply use fixed value.
+ * krb5 "meta" data:
+ *  - krb5 header:      16
+ *  - krb5 checksum:    20
+ *
+ * for privacy mode, payload also include the cipher text which has the same
+ * size as plain text, plus possible confounder, padding both at maximum cipher
+ * block size.
+ */
+#define GSS_KRB5_INTEG_MAX_PAYLOAD      (40)
+
+/*
+ * Upper bound of the mechanism payload for a message of @msgsize bytes.
+ * @mechctx is not consulted: the values are fixed.
+ */
+static inline
+int gss_mech_payload(struct gss_ctx *mechctx, int msgsize, int privacy)
+{
+	int payload = GSS_KRB5_INTEG_MAX_PAYLOAD;
+
+	/* privacy mode adds three 16-byte allowances plus room for the
+	 * message itself */
+	if (privacy)
+		payload += 16 + 16 + 16 + msgsize;
+
+	return payload;
+}
+
+/*
+ * Fill in the gss header held in segment 0 of @msg and, unless @svc is
+ * SPTLRPC_SVC_NULL, generate a MIC into the last segment of @msg.
+ *
+ * @handle may be NULL, in which case an empty handle is written.
+ *
+ * Returns the value of lustre_msg_size_v2() (null mode) or of
+ * lustre_shrink_msg() after the MIC segment was shrunk to the real MIC
+ * length; gss_cli_ctx_sign() stores it as the request data length.
+ * Returns -EPERM when MIC generation fails.
+ */
+static int gss_sign_msg(struct lustre_msg *msg,
+			struct gss_ctx *mechctx,
+			enum lustre_sec_part sp,
+			__u32 flags, __u32 proc, __u32 seq, __u32 svc,
+			rawobj_t *handle)
+{
+	struct gss_header      *ghdr;
+	rawobj_t		text[4], mic;
+	int		     textcnt, max_textcnt, mic_idx;
+	__u32		   major;
+
+	LASSERT(msg->lm_bufcount >= 2);
+
+	/* gss hdr */
+	LASSERT(msg->lm_buflens[0] >=
+		sizeof(*ghdr) + (handle ? handle->len : 0));
+	ghdr = lustre_msg_buf(msg, 0, 0);
+
+	ghdr->gh_version = PTLRPC_GSS_VERSION;
+	ghdr->gh_sp = (__u8) sp;
+	ghdr->gh_flags = flags;
+	ghdr->gh_proc = proc;
+	ghdr->gh_seq = seq;
+	ghdr->gh_svc = svc;
+	if (!handle) {
+		/* fill in a fake one */
+		ghdr->gh_handle.len = 0;
+	} else {
+		ghdr->gh_handle.len = handle->len;
+		memcpy(ghdr->gh_handle.data, handle->data, handle->len);
+	}
+
+	/* no actual signature for null mode */
+	if (svc == SPTLRPC_SVC_NULL)
+		return lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens);
+
+	/* MIC: it lives in the last segment.  In auth mode only segment 0
+	 * is signed, otherwise every segment before the MIC one. */
+	mic_idx = msg_last_segidx(msg);
+	max_textcnt = (svc == SPTLRPC_SVC_AUTH) ? 1 : mic_idx;
+
+	/* NOTE(review): text[] has 4 slots and nothing here bounds
+	 * max_textcnt; presumably callers never build more than 5
+	 * segments - confirm. */
+	for (textcnt = 0; textcnt < max_textcnt; textcnt++) {
+		text[textcnt].len = msg->lm_buflens[textcnt];
+		text[textcnt].data = lustre_msg_buf(msg, textcnt, 0);
+	}
+
+	mic.len = msg->lm_buflens[mic_idx];
+	mic.data = lustre_msg_buf(msg, mic_idx, 0);
+
+	major = lgss_get_mic(mechctx, textcnt, text, 0, NULL, &mic);
+	if (major != GSS_S_COMPLETE) {
+		CERROR("fail to generate MIC: %08x\n", major);
+		return -EPERM;
+	}
+	LASSERT(mic.len <= msg->lm_buflens[mic_idx]);
+
+	return lustre_shrink_msg(msg, mic_idx, mic.len, 0);
+}
+
+/*
+ * Verify the MIC stored in the last segment of @msg against the preceding
+ * segments (only segment 0 in auth mode).  Null mode is accepted without
+ * any check.
+ *
+ * return gss error
+ */
+static
+__u32 gss_verify_msg(struct lustre_msg *msg,
+		     struct gss_ctx *mechctx,
+		     __u32 svc)
+{
+	rawobj_t	text[4], mic;
+	int	     textcnt, max_textcnt;
+	int	     mic_idx;
+	__u32	   major;
+
+	LASSERT(msg->lm_bufcount >= 2);
+
+	if (svc == SPTLRPC_SVC_NULL)
+		return GSS_S_COMPLETE;
+
+	mic_idx = msg_last_segidx(msg);
+	max_textcnt = (svc == SPTLRPC_SVC_AUTH) ? 1 : mic_idx;
+
+	/* NOTE(review): text[] has 4 slots and max_textcnt is not bounded
+	 * here; gss_cli_ctx_verify() rejects bufcount > 4 before calling,
+	 * other callers should be confirmed. */
+	for (textcnt = 0; textcnt < max_textcnt; textcnt++) {
+		text[textcnt].len = msg->lm_buflens[textcnt];
+		text[textcnt].data = lustre_msg_buf(msg, textcnt, 0);
+	}
+
+	mic.len = msg->lm_buflens[mic_idx];
+	mic.data = lustre_msg_buf(msg, mic_idx, 0);
+
+	major = lgss_verify_mic(mechctx, textcnt, text, 0, NULL, &mic);
+	if (major != GSS_S_COMPLETE)
+		CERROR("mic verify error: %08x\n", major);
+
+	return major;
+}
+
+/*
+ * Decrypt a two-segment sealed message in place: segment 0 is passed to
+ * lgss_unwrap() as the header, segment 1 as the token.  The clear text
+ * is unwrapped into a temporary buffer and then copied over the start of
+ * @msgbuf; its length is stored in *@msg_len.
+ *
+ * @msgbuf_len is only used to assert that the clear text fits.
+ *
+ * return gss error code (any unwrap failure is reported as GSS_S_FAILURE)
+ */
+static
+__u32 gss_unseal_msg(struct gss_ctx *mechctx,
+		   struct lustre_msg *msgbuf,
+		   int *msg_len, int msgbuf_len)
+{
+	rawobj_t		 clear_obj, hdrobj, token;
+	__u8		    *clear_buf;
+	int		      clear_buflen;
+	__u32		    major;
+	ENTRY;
+
+	if (msgbuf->lm_bufcount != 2) {
+		CERROR("invalid bufcount %d\n", msgbuf->lm_bufcount);
+		RETURN(GSS_S_FAILURE);
+	}
+
+	/* allocate a temporary clear text buffer, same sized as token,
+	 * we assume the final clear text size <= token size */
+	clear_buflen = lustre_msg_buflen(msgbuf, 1);
+	OBD_ALLOC_LARGE(clear_buf, clear_buflen);
+	if (!clear_buf)
+		RETURN(GSS_S_FAILURE);
+
+	/* buffer objects */
+	hdrobj.len = lustre_msg_buflen(msgbuf, 0);
+	hdrobj.data = lustre_msg_buf(msgbuf, 0, 0);
+	token.len = lustre_msg_buflen(msgbuf, 1);
+	token.data = lustre_msg_buf(msgbuf, 1, 0);
+	clear_obj.len = clear_buflen;
+	clear_obj.data = clear_buf;
+
+	major = lgss_unwrap(mechctx, &hdrobj, &token, &clear_obj);
+	if (major != GSS_S_COMPLETE) {
+		CERROR("unwrap message error: %08x\n", major);
+		GOTO(out_free, major = GSS_S_FAILURE);
+	}
+	LASSERT(clear_obj.len <= clear_buflen);
+	LASSERT(clear_obj.len <= msgbuf_len);
+
+	/* now the decrypted message: it overwrites @msgbuf, including the
+	 * header and token the pointers above referred to */
+	memcpy(msgbuf, clear_obj.data, clear_obj.len);
+	*msg_len = clear_obj.len;
+
+	major = GSS_S_COMPLETE;
+out_free:
+	/* the temporary buffer is released on success and failure alike */
+	OBD_FREE_LARGE(clear_buf, clear_buflen);
+	RETURN(major);
+}
+
+/********************************************
+ * gss client context manipulation helpers  *
+ ********************************************/
+
+/*
+ * Mark @ctx dead.  Returns 1 when this call did the marking (waiters are
+ * then woken up), 0 when the DEAD bit was already set.
+ */
+int cli_ctx_expire(struct ptlrpc_cli_ctx *ctx)
+{
+	LASSERT(atomic_read(&ctx->cc_refcount));
+
+	/* the DEAD bit was set before: nothing left to do */
+	if (test_and_set_bit(PTLRPC_CTX_DEAD_BIT, &ctx->cc_flags))
+		return 0;
+
+	/* an early-expired context keeps its UPTODATE bit */
+	if (!ctx->cc_early_expire)
+		clear_bit(PTLRPC_CTX_UPTODATE_BIT, &ctx->cc_flags);
+
+	CWARN("ctx %p(%u->%s) get expired: %lu(%+lds)\n",
+	      ctx, ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec),
+	      ctx->cc_expire,
+	      ctx->cc_expire == 0 ? 0 :
+	      cfs_time_sub(ctx->cc_expire, cfs_time_current_sec()));
+
+	sptlrpc_cli_ctx_wakeup(ctx);
+	return 1;
+}
+
+/*
+ * Check whether @ctx is dead, expiring it on the spot when its expiry
+ * time has been reached.
+ *
+ * return 1 if the context is dead, 0 otherwise.
+ */
+int cli_ctx_check_death(struct ptlrpc_cli_ctx *ctx)
+{
+	if (unlikely(cli_ctx_is_dead(ctx)))
+		return 1;
+
+	/* An expiry of 0 means "never expires" (a newly created gss context
+	 * still in its upcall may carry 0); otherwise the context is alive
+	 * as long as the expiry lies after the current time. */
+	if (ctx->cc_expire == 0 ||
+	    cfs_time_after(ctx->cc_expire, cfs_time_current_sec()))
+		return 0;
+
+	cli_ctx_expire(ctx);
+	return 1;
+}
+
+/*
+ * Mark a client context as up to date: take the expiry from the
+ * mechanism context, set the UPTODATE bit, log the event and wake up
+ * whoever waits on the context.  For a forward (non-reverse) context
+ * owned by uid 0 a reverse service context is installed as well.
+ */
+void gss_cli_ctx_uptodate(struct gss_cli_ctx *gctx)
+{
+	struct ptlrpc_cli_ctx  *ctx = &gctx->gc_base;
+	unsigned long	   ctx_expiry;
+
+	if (lgss_inquire_context(gctx->gc_mechctx, &ctx_expiry)) {
+		CERROR("ctx %p(%u): unable to inquire, expire it now\n",
+		       gctx, ctx->cc_vcred.vc_uid);
+		ctx_expiry = 1; /* make it expired now */
+	}
+
+	ctx->cc_expire = gss_round_ctx_expiry(ctx_expiry,
+					      ctx->cc_sec->ps_flvr.sf_flags);
+
+	/* At this point this ctx might have been marked as dead by
+	 * someone else, in which case nobody will make further use
+	 * of it. We don't care; marking it UPTODATE helps destroying
+	 * the server side context when this one gets destroyed. */
+	set_bit(PTLRPC_CTX_UPTODATE_BIT, &ctx->cc_flags);
+
+	if (sec_is_reverse(ctx->cc_sec)) {
+		CWARN("server installed reverse ctx %p idx "LPX64", "
+		      "expiry %lu(%+lds)\n", ctx,
+		      gss_handle_to_u64(&gctx->gc_handle),
+		      ctx->cc_expire, ctx->cc_expire - cfs_time_current_sec());
+	} else {
+		CWARN("client refreshed ctx %p idx "LPX64" (%u->%s), "
+		      "expiry %lu(%+lds)\n", ctx,
+		      gss_handle_to_u64(&gctx->gc_handle),
+		      ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec),
+		      ctx->cc_expire, ctx->cc_expire - cfs_time_current_sec());
+
+		/* install reverse svc ctx for root context */
+		if (ctx->cc_vcred.vc_uid == 0)
+			gss_sec_install_rctx(ctx->cc_sec->ps_import,
+					     ctx->cc_sec, ctx);
+	}
+
+	sptlrpc_cli_ctx_wakeup(ctx);
+}
+
+/*
+ * Release what a client context holds: the mechanism context, the
+ * service handle and the context handle.
+ */
+static void gss_cli_ctx_finalize(struct gss_cli_ctx *gctx)
+{
+	LASSERT(gctx->gc_base.cc_sec);
+
+	if (gctx->gc_mechctx) {
+		lgss_delete_sec_context(&gctx->gc_mechctx);
+		gctx->gc_mechctx = NULL;
+	}
+
+	if (!rawobj_empty(&gctx->gc_svc_handle)) {
+		/* forward ctx: mark buddy reverse svcctx soon-expire.
+		 * The handle is known to be non-empty here. */
+		if (!sec_is_reverse(gctx->gc_base.cc_sec))
+			gss_svc_upcall_expire_rvs_ctx(&gctx->gc_svc_handle);
+
+		rawobj_free(&gctx->gc_svc_handle);
+	}
+
+	rawobj_free(&gctx->gc_handle);
+}
+
+/*
+ * Based on sequence number algorithm as specified in RFC 2203.
+ *
+ * modified for our own problem: arriving request has valid sequence number,
+ * but unwrapping request might cost a long time, after that its sequence
+ * are not valid anymore (fall behind the window). It rarely happen, mostly
+ * under extreme load.
+ *
+ * note we should not check sequence before verify the integrity of incoming
+ * request, because just one attacking request with high sequence number might
+ * cause all following request be dropped.
+ *
+ * so here we use a multi-phase approach: prepare 2 sequence windows,
+ * "main window" for normal sequence and "back window" for fall behind sequence.
+ * and 3-phase checking mechanism:
+ *  0 - before integrity verification, perform a initial sequence checking in
+ *      main window, which only try and don't actually set any bits. if the
+ *      sequence is high above the window or fit in the window and the bit
+ *      is 0, then accept and proceed to integrity verification. otherwise
+ *      reject this sequence.
+ *  1 - after integrity verification, check in main window again. if this
+ *      sequence is high above the window or fit in the window and the bit
+ *      is 0, then set the bit and accept; if it fit in the window but bit
+ *      already set, then reject; if it fall behind the window, then proceed
+ *      to phase 2.
+ *  2 - check in back window. if it is high above the window or fit in the
+ *      window and the bit is 0, then set the bit and accept. otherwise reject.
+ *
+ * return value:
+ *   1: looks like a replay
+ *   0: is ok
+ *  -1: is a replay
+ *
+ * note phase 0 is necessary, because otherwise replay attacking request of
+ * sequence which between the 2 windows can't be detected.
+ *
+ * this mechanism can't totally solve the problem, but could help much less
+ * number of valid requests be dropped.
+ */
+static
+int gss_do_check_seq(unsigned long *window, __u32 win_size, __u32 *max_seq,
+		     __u32 seq_num, int phase)
+{
+	LASSERT(phase >= 0 && phase <= 2);
+
+	if (seq_num > *max_seq) {
+		/*
+		 * 1. high above the window
+		 */
+		/* phase 0 only probes: accept without touching the window */
+		if (phase == 0)
+			return 0;
+
+		if (seq_num >= *max_seq + win_size) {
+			/* jumped at least a whole window ahead: wipe the
+			 * bitmap (win_size is a bit count) */
+			memset(window, 0, win_size / 8);
+			*max_seq = seq_num;
+		} else {
+			/* slide forward one step at a time, clearing the
+			 * slot of every sequence number passed */
+			while(*max_seq < seq_num) {
+				(*max_seq)++;
+				__clear_bit((*max_seq) % win_size, window);
+			}
+		}
+		__set_bit(seq_num % win_size, window);
+	} else if (seq_num + win_size <= *max_seq) {
+		/*
+		 * 2. low behind the window
+		 */
+		if (phase == 0 || phase == 2)
+			goto replay;
+
+		/* phase 1: not decided here, the caller goes on to the
+		 * backup window when it sees 1 */
+		CWARN("seq %u is %u behind (size %d), check backup window\n",
+		      seq_num, *max_seq - win_size - seq_num, win_size);
+		return 1;
+	} else {
+		/*
+		 * 3. fit into the window
+		 */
+		switch (phase) {
+		case 0:
+			/* test only, the bit is left as it is */
+			if (test_bit(seq_num % win_size, window))
+				goto replay;
+			break;
+		case 1:
+		case 2:
+			/* claim the slot; it is a replay if already taken */
+		     if (__test_and_set_bit(seq_num % win_size, window))
+				goto replay;
+			break;
+		}
+	}
+
+	return 0;
+
+replay:
+	CERROR("seq %u (%s %s window) is a replay: max %u, winsize %d\n",
+	       seq_num,
+	       seq_num + win_size > *max_seq ? "in" : "behind",
+	       phase == 2 ? "backup " : "main",
+	       *max_seq, win_size);
+	return -1;
+}
+
+/*
+ * Based on sequence number algorithm as specified in RFC 2203.
+ *
+ * if @set == 0: initial check, don't set any bit in window
+ * if @set != 0: final check, set bit in window
+ */
+/*
+ * Runs under ssd->ssd_lock and records every outcome but a phase 1 pass
+ * in the out-of-sequence statistics.  Returns the result of the last
+ * gss_do_check_seq() call made: 0 when accepted, non-zero otherwise.
+ */
+int gss_check_seq_num(struct gss_svc_seq_data *ssd, __u32 seq_num, int set)
+{
+	int rc = 0;
+
+	spin_lock(&ssd->ssd_lock);
+
+	if (set == 0) {
+		/*
+		 * phase 0 testing
+		 */
+		rc = gss_do_check_seq(ssd->ssd_win_main, GSS_SEQ_WIN_MAIN,
+				      &ssd->ssd_max_main, seq_num, 0);
+		if (unlikely(rc))
+			gss_stat_oos_record_svc(0, 1);
+	} else {
+		/*
+		 * phase 1 checking main window
+		 */
+		rc = gss_do_check_seq(ssd->ssd_win_main, GSS_SEQ_WIN_MAIN,
+				      &ssd->ssd_max_main, seq_num, 1);
+		/* -1 (replay) and 0 (accepted) are final; only 1 (behind
+		 * the main window) continues with the backup window */
+		switch (rc) {
+		case -1:
+			gss_stat_oos_record_svc(1, 1);
+			/* fall through */
+		case 0:
+			goto exit;
+		}
+		/*
+		 * phase 2 checking back window
+		 */
+		rc = gss_do_check_seq(ssd->ssd_win_back, GSS_SEQ_WIN_BACK,
+				      &ssd->ssd_max_back, seq_num, 2);
+		if (rc)
+			gss_stat_oos_record_svc(2, 1);
+		else
+			gss_stat_oos_record_svc(2, 0);
+	}
+exit:
+	spin_unlock(&ssd->ssd_lock);
+	return rc;
+}
+
+/***************************************
+ * cred APIs			   *
+ ***************************************/
+
+/*
+ * Payload size for a client context; @ctx is not consulted, the value
+ * comes straight from gss_mech_payload() with a NULL mechanism context.
+ */
+static inline int gss_cli_payload(struct ptlrpc_cli_ctx *ctx,
+				  int msgsize, int privacy)
+{
+	return gss_mech_payload(NULL, msgsize, privacy);
+}
+
+/*
+ * Size of the bulk security descriptor for the given flavor.  The
+ * descriptor itself is always counted; the mechanism payload is added on
+ * top only for a non-read request or a read reply, according to the bulk
+ * service of @flvr.
+ */
+static int gss_cli_bulk_payload(struct ptlrpc_cli_ctx *ctx,
+				struct sptlrpc_flavor *flvr,
+				int reply, int read)
+{
+	int     size = sizeof(struct ptlrpc_bulk_sec_desc);
+
+	LASSERT(SPTLRPC_FLVR_BULK_TYPE(flvr->sf_rpc) == SPTLRPC_BULK_DEFAULT);
+
+	/* @reply and @read disagree: the bare descriptor is all there is */
+	if (!reply != !read)
+		return size;
+
+	switch (SPTLRPC_FLVR_BULK_SVC(flvr->sf_rpc)) {
+	case SPTLRPC_BULK_SVC_NULL:
+		break;
+	case SPTLRPC_BULK_SVC_INTG:
+		size += gss_cli_payload(ctx, 0, 0);
+		break;
+	case SPTLRPC_BULK_SVC_PRIV:
+		size += gss_cli_payload(ctx, 0, 1);
+		break;
+	case SPTLRPC_BULK_SVC_AUTH:
+	default:
+		LBUG();
+	}
+
+	return size;
+}
+
+/* a context matches a credential when their uids are equal */
+int gss_cli_ctx_match(struct ptlrpc_cli_ctx *ctx, struct vfs_cred *vcred)
+{
+	return (ctx->cc_vcred.vc_uid == vcred->vc_uid);
+}
+
+/*
+ * Append @name to the string in @buf without growing it past @bufsize
+ * bytes, terminator included.  The caller guarantees that @buf already
+ * holds a string shorter than @bufsize.
+ */
+static void gss_flags2str_append(char *buf, int bufsize, const char *name)
+{
+	/* strncat()'s count is the number of characters to append, not the
+	 * size of the destination: hand it the room that is left */
+	strncat(buf, name, bufsize - strlen(buf) - 1);
+}
+
+/*
+ * Render the PTLRPC_CTX_* bits set in @flags as a comma-separated list of
+ * names in @buf, or as "-" when none of the known bits is set.
+ *
+ * @buf has room for @bufsize bytes.  The output is truncated to fit and
+ * is always NUL-terminated; nothing is written when @bufsize <= 0.
+ */
+void gss_cli_ctx_flags2str(unsigned long flags, char *buf, int bufsize)
+{
+	size_t len;
+
+	if (bufsize <= 0)
+		return;
+
+	buf[0] = '\0';
+
+	if (flags & PTLRPC_CTX_NEW)
+		gss_flags2str_append(buf, bufsize, "new,");
+	if (flags & PTLRPC_CTX_UPTODATE)
+		gss_flags2str_append(buf, bufsize, "uptodate,");
+	if (flags & PTLRPC_CTX_DEAD)
+		gss_flags2str_append(buf, bufsize, "dead,");
+	if (flags & PTLRPC_CTX_ERROR)
+		gss_flags2str_append(buf, bufsize, "error,");
+	if (flags & PTLRPC_CTX_CACHED)
+		gss_flags2str_append(buf, bufsize, "cached,");
+	if (flags & PTLRPC_CTX_ETERNAL)
+		gss_flags2str_append(buf, bufsize, "eternal,");
+	if (buf[0] == '\0')
+		gss_flags2str_append(buf, bufsize, "-,");
+
+	/* drop the trailing separator; after a truncation the last
+	 * character may be part of a name and is then kept */
+	len = strlen(buf);
+	if (len > 0 && buf[len - 1] == ',')
+		buf[len - 1] = '\0';
+}
+
+/*
+ * Sign the request buffer of @req with the client context @ctx.
+ *
+ * Context negotiation requests (rq_ctx_init) are left untouched.
+ * Otherwise a fresh sequence number is taken and gss_sign_msg() is run;
+ * unless the service is null, the signing is redone with a new sequence
+ * number when too many sequence numbers were handed out in the meantime.
+ *
+ * Returns 0 on success, with rq_reqdata_len updated, or the negative
+ * error from gss_sign_msg().
+ */
+int gss_cli_ctx_sign(struct ptlrpc_cli_ctx *ctx,
+		     struct ptlrpc_request *req)
+{
+	struct gss_cli_ctx      *gctx = ctx2gctx(ctx);
+	__u32		    flags = 0, seq, svc;
+	int		      rc;
+	ENTRY;
+
+	LASSERT(req->rq_reqbuf);
+	LASSERT(req->rq_reqbuf->lm_bufcount >= 2);
+	LASSERT(req->rq_cli_ctx == ctx);
+
+	/* nothing to do for context negotiation RPCs */
+	if (req->rq_ctx_init)
+		RETURN(0);
+
+	svc = SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc);
+	if (req->rq_pack_bulk)
+		flags |= LUSTRE_GSS_PACK_BULK;
+	if (req->rq_pack_udesc)
+		flags |= LUSTRE_GSS_PACK_USER;
+
+redo:
+	seq = atomic_inc_return(&gctx->gc_seq);
+
+	rc = gss_sign_msg(req->rq_reqbuf, gctx->gc_mechctx,
+			  ctx->cc_sec->ps_part,
+			  flags, gctx->gc_proc, seq, svc,
+			  &gctx->gc_handle);
+	if (rc < 0)
+		RETURN(rc);
+
+	/* gss_sign_msg() might take a long time to finish, during which
+	 * more rpcs could be wrapped up and sent out. If we find too many
+	 * of them we should repack this rpc, because sending it too late
+	 * might make its sequence number fall behind the window on the
+	 * server and get it dropped. This also applies to
+	 * gss_cli_ctx_seal().
+	 *
+	 * Note: null mode doesn't check the sequence number. */
+	if (svc != SPTLRPC_SVC_NULL &&
+	    atomic_read(&gctx->gc_seq) - seq > GSS_SEQ_REPACK_THRESHOLD) {
+		int behind = atomic_read(&gctx->gc_seq) - seq;
+
+		gss_stat_oos_record_cli(behind);
+		CWARN("req %p: %u behind, retry signing\n", req, behind);
+		goto redo;
+	}
+
+	req->rq_reqdata_len = rc;
+	RETURN(0);
+}
+
+/*
+ * Handle a reply whose gss header carries PTLRPC_GSS_PROC_ERR.
+ *
+ * Returns -EINVAL for a context fini request or a reverse context,
+ * -EACCES for any major status other than GSS_S_NO_CONTEXT and
+ * GSS_S_BAD_SIG.  For those two the context is expired and replaced; the
+ * result of sptlrpc_req_replace_dead_ctx() is returned and, when that is
+ * 0, the request is flagged for resend.
+ */
+static
+int gss_cli_ctx_handle_err_notify(struct ptlrpc_cli_ctx *ctx,
+				  struct ptlrpc_request *req,
+				  struct gss_header *ghdr)
+{
+	struct gss_err_header *errhdr;
+	int rc;
+
+	LASSERT(ghdr->gh_proc == PTLRPC_GSS_PROC_ERR);
+
+	/* the same buffer is re-read through the error header layout */
+	errhdr = (struct gss_err_header *) ghdr;
+
+	CWARN("req x"LPU64"/t"LPU64", ctx %p idx "LPX64"(%u->%s): "
+	      "%sserver respond (%08x/%08x)\n",
+	      req->rq_xid, req->rq_transno, ctx,
+	      gss_handle_to_u64(&ctx2gctx(ctx)->gc_handle),
+	      ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec),
+	      sec_is_reverse(ctx->cc_sec) ? "reverse" : "",
+	      errhdr->gh_major, errhdr->gh_minor);
+
+	/* context fini rpc, let it fail */
+	if (req->rq_ctx_fini) {
+		CWARN("context fini rpc failed\n");
+		return -EINVAL;
+	}
+
+	/* reverse sec: just return the error, don't expire this ctx because
+	 * it's crucial to callback rpcs. Note that if the callback rpc
+	 * failed because of a bit flip during network transfer, the client
+	 * will be evicted directly, so to be more graceful we probably want
+	 * to let it retry a number of times. */
+	if (sec_is_reverse(ctx->cc_sec))
+		return -EINVAL;
+
+	if (errhdr->gh_major != GSS_S_NO_CONTEXT &&
+	    errhdr->gh_major != GSS_S_BAD_SIG)
+		return -EACCES;
+
+	/* A NO_CONTEXT from the server might be caused by context expiry
+	 * or server reboot/failover. We try to refresh a new ctx, which
+	 * is transparent to the upper layer.
+	 *
+	 * In some cases our gss handle may incidentally be identical to
+	 * another handle, since the handle itself is not fully random.
+	 * In the krb5 case GSS_S_BAD_SIG will be returned, maybe another
+	 * gss error for other mechanisms.
+	 *
+	 * If we add a new mechanism, make sure the correct error is
+	 * returned in this case. */
+	CWARN("%s: server might lost the context, retrying\n",
+	      errhdr->gh_major == GSS_S_NO_CONTEXT ?  "NO_CONTEXT" : "BAD_SIG");
+
+	sptlrpc_cli_ctx_expire(ctx);
+
+	/* We need to replace the ctx right here, otherwise during the
+	 * resend we'll hit the logic in sptlrpc_req_refresh_ctx() which
+	 * keeps the ctx with the RESEND flag, thus we'd never get rid of
+	 * this ctx. */
+	rc = sptlrpc_req_replace_dead_ctx(req);
+	if (rc == 0)
+		req->rq_resend = 1;
+
+	return rc;
+}
+
+/*
+ * Verify the reply held in req->rq_repdata against the client context
+ * @ctx and, on success, point rq_repmsg/rq_replen at segment 1 of it.
+ *
+ * Replies to context negotiation requests that are not early replies are
+ * accepted without any check.  An error notification from the server is
+ * passed on to gss_cli_ctx_handle_err_notify().
+ *
+ * Returns 0 on success, -EPROTO for a malformed or inconsistent reply,
+ * -EPERM when the MIC does not verify, or the error of the bulk
+ * descriptor unpacking / error notification handling.
+ */
+int gss_cli_ctx_verify(struct ptlrpc_cli_ctx *ctx,
+		       struct ptlrpc_request *req)
+{
+	struct gss_cli_ctx     *gctx;
+	struct gss_header      *ghdr, *reqhdr;
+	struct lustre_msg      *msg = req->rq_repdata;
+	__u32		   major;
+	int		     pack_bulk, swabbed, rc = 0;
+	ENTRY;
+
+	LASSERT(req->rq_cli_ctx == ctx);
+	LASSERT(msg);
+
+	gctx = container_of(ctx, struct gss_cli_ctx, gc_base);
+
+	/* special case for context negotiation, rq_repmsg/rq_replen actually
+	 * are not used currently. But an early reply is always treated
+	 * normally */
+	if (req->rq_ctx_init && !req->rq_early) {
+		req->rq_repmsg = lustre_msg_buf(msg, 1, 0);
+		req->rq_replen = msg->lm_buflens[1];
+		RETURN(0);
+	}
+
+	if (msg->lm_bufcount < 2 || msg->lm_bufcount > 4) {
+		CERROR("unexpected bufcount %u\n", msg->lm_bufcount);
+		RETURN(-EPROTO);
+	}
+
+	swabbed = ptlrpc_rep_need_swab(req);
+
+	ghdr = gss_swab_header(msg, 0, swabbed);
+	if (ghdr == NULL) {
+		CERROR("can't decode gss header\n");
+		RETURN(-EPROTO);
+	}
+
+	/* sanity checks */
+	/* NOTE(review): reqhdr is fetched from @msg, the reply, i.e. from
+	 * the very segment ghdr was taken from, so the checks against
+	 * reqhdr below compare the reply header with itself. Presumably
+	 * the request buffer was meant - confirm. */
+	reqhdr = lustre_msg_buf(msg, 0, sizeof(*reqhdr));
+	LASSERT(reqhdr);
+
+	if (ghdr->gh_version != reqhdr->gh_version) {
+		CERROR("gss version %u mismatch, expect %u\n",
+		       ghdr->gh_version, reqhdr->gh_version);
+		RETURN(-EPROTO);
+	}
+
+	switch (ghdr->gh_proc) {
+	case PTLRPC_GSS_PROC_DATA:
+		pack_bulk = ghdr->gh_flags & LUSTRE_GSS_PACK_BULK;
+
+		/* the bulk flag of a normal reply has to agree with what
+		 * the request asked for; early replies are exempt */
+		if (!req->rq_early && !equi(req->rq_pack_bulk == 1, pack_bulk)){
+			CERROR("%s bulk flag in reply\n",
+			       req->rq_pack_bulk ? "missing" : "unexpected");
+			RETURN(-EPROTO);
+		}
+
+		if (ghdr->gh_seq != reqhdr->gh_seq) {
+			CERROR("seqnum %u mismatch, expect %u\n",
+			       ghdr->gh_seq, reqhdr->gh_seq);
+			RETURN(-EPROTO);
+		}
+
+		if (ghdr->gh_svc != reqhdr->gh_svc) {
+			CERROR("svc %u mismatch, expect %u\n",
+			       ghdr->gh_svc, reqhdr->gh_svc);
+			RETURN(-EPROTO);
+		}
+
+		/* gss_swab_header() swabbed the header in place; swabbing
+		 * it a second time restores the bytes as received before
+		 * the MIC is verified over them */
+		if (swabbed)
+			gss_header_swabber(ghdr);
+
+		major = gss_verify_msg(msg, gctx->gc_mechctx, reqhdr->gh_svc);
+		if (major != GSS_S_COMPLETE) {
+			CERROR("failed to verify reply: %x\n", major);
+			RETURN(-EPERM);
+		}
+
+		/* a null-mode early reply carries no MIC; it is checked
+		 * with a crc32 over segment 1 instead */
+		if (req->rq_early && reqhdr->gh_svc == SPTLRPC_SVC_NULL) {
+			__u32 cksum;
+
+			/* NOTE(review): !(__u32) 0 evaluates to 1, not to
+			 * all ones. Whatever fills in lm_cksum has to use
+			 * the same seed - confirm before changing. */
+			cksum = crc32_le(!(__u32) 0,
+					 lustre_msg_buf(msg, 1, 0),
+					 lustre_msg_buflen(msg, 1));
+			if (cksum != msg->lm_cksum) {
+				CWARN("early reply checksum mismatch: "
+				      "%08x != %08x\n", cksum, msg->lm_cksum);
+				RETURN(-EPROTO);
+			}
+		}
+
+		if (pack_bulk) {
+			/* bulk checksum is right after the lustre msg */
+			if (msg->lm_bufcount < 3) {
+				CERROR("Invalid reply bufcount %u\n",
+				       msg->lm_bufcount);
+				RETURN(-EPROTO);
+			}
+
+			rc = bulk_sec_desc_unpack(msg, 2, swabbed);
+			if (rc) {
+				CERROR("unpack bulk desc: %d\n", rc);
+				RETURN(rc);
+			}
+		}
+
+		req->rq_repmsg = lustre_msg_buf(msg, 1, 0);
+		req->rq_replen = msg->lm_buflens[1];
+		break;
+	case PTLRPC_GSS_PROC_ERR:
+		/* an error notification is never valid as an early reply */
+		if (req->rq_early) {
+			CERROR("server return error with early reply\n");
+			rc = -EPROTO;
+		} else {
+			rc = gss_cli_ctx_handle_err_notify(ctx, req, ghdr);
+		}
+		break;
+	default:
+		CERROR("unknown gss proc %d\n", ghdr->gh_proc);
+		rc = -EPROTO;
+	}
+
+	RETURN(rc);
+}
+
+/*
+ * Seal (privacy-protect) an outgoing request.
+ *
+ * Builds a two-segment wire message in req->rq_reqbuf: segment 0 holds the
+ * gss header, segment 1 receives the token produced by lgss_wrap() from the
+ * clear-text message in req->rq_clrbuf.  On success the final wire length is
+ * stored in req->rq_reqdata_len.
+ *
+ * \retval 0		success
+ * \retval -ENOMEM	wire buffer could not be allocated (non-pool request)
+ * \retval -EPERM	lgss_wrap() failed; a non-pool wire buffer is freed
+ */
+int gss_cli_ctx_seal(struct ptlrpc_cli_ctx *ctx,
+		     struct ptlrpc_request *req)
+{
+	struct gss_cli_ctx      *gctx;
+	rawobj_t		 hdrobj, msgobj, token;
+	struct gss_header       *ghdr;
+	__u32		    buflens[2], major;
+	int		      wiresize, rc;
+	ENTRY;
+
+	LASSERT(req->rq_clrbuf);
+	LASSERT(req->rq_cli_ctx == ctx);
+	LASSERT(req->rq_reqlen);
+
+	gctx = container_of(ctx, struct gss_cli_ctx, gc_base);
+
+	/* final clear data length */
+	req->rq_clrdata_len = lustre_msg_size_v2(req->rq_clrbuf->lm_bufcount,
+						 req->rq_clrbuf->lm_buflens);
+
+	/* calculate wire data length */
+	buflens[0] = PTLRPC_GSS_HEADER_SIZE;
+	buflens[1] = gss_cli_payload(&gctx->gc_base, req->rq_clrdata_len, 1);
+	wiresize = lustre_msg_size_v2(2, buflens);
+
+	/* allocate wire buffer */
+	if (req->rq_pool) {
+		/* pre-allocated */
+		LASSERT(req->rq_reqbuf);
+		LASSERT(req->rq_reqbuf != req->rq_clrbuf);
+		LASSERT(req->rq_reqbuf_len >= wiresize);
+	} else {
+		OBD_ALLOC_LARGE(req->rq_reqbuf, wiresize);
+		if (!req->rq_reqbuf)
+			RETURN(-ENOMEM);
+		req->rq_reqbuf_len = wiresize;
+	}
+
+	lustre_init_msg_v2(req->rq_reqbuf, 2, buflens, NULL);
+	req->rq_reqbuf->lm_secflvr = req->rq_flvr.sf_rpc;
+
+	/* gss header */
+	ghdr = lustre_msg_buf(req->rq_reqbuf, 0, 0);
+	ghdr->gh_version = PTLRPC_GSS_VERSION;
+	ghdr->gh_sp = (__u8) ctx->cc_sec->ps_part;
+	ghdr->gh_flags = 0;
+	ghdr->gh_proc = gctx->gc_proc;
+	ghdr->gh_svc = SPTLRPC_SVC_PRIV;
+	ghdr->gh_handle.len = gctx->gc_handle.len;
+	memcpy(ghdr->gh_handle.data, gctx->gc_handle.data, gctx->gc_handle.len);
+	if (req->rq_pack_bulk)
+		ghdr->gh_flags |= LUSTRE_GSS_PACK_BULK;
+	if (req->rq_pack_udesc)
+		ghdr->gh_flags |= LUSTRE_GSS_PACK_USER;
+
+redo:
+	/* every (re)try takes exactly one fresh sequence number */
+	ghdr->gh_seq = atomic_inc_return(&gctx->gc_seq);
+
+	/* buffer objects */
+	hdrobj.len = PTLRPC_GSS_HEADER_SIZE;
+	hdrobj.data = (__u8 *) ghdr;
+	msgobj.len = req->rq_clrdata_len;
+	msgobj.data = (__u8 *) req->rq_clrbuf;
+	token.len = lustre_msg_buflen(req->rq_reqbuf, 1);
+	token.data = lustre_msg_buf(req->rq_reqbuf, 1, 0);
+
+	major = lgss_wrap(gctx->gc_mechctx, &hdrobj, &msgobj,
+			  req->rq_clrbuf_len, &token);
+	if (major != GSS_S_COMPLETE) {
+		CERROR("priv: wrap message error: %08x\n", major);
+		GOTO(err_free, rc = -EPERM);
+	}
+	LASSERT(token.len <= buflens[1]);
+
+	/* see the explanation in gss_cli_ctx_sign(): if the context sequence
+	 * counter advanced too far while we were wrapping, seal again with a
+	 * newer sequence number */
+	if (unlikely(atomic_read(&gctx->gc_seq) - ghdr->gh_seq >
+		     GSS_SEQ_REPACK_THRESHOLD)) {
+		int behind = atomic_read(&gctx->gc_seq) - ghdr->gh_seq;
+
+		gss_stat_oos_record_cli(behind);
+		CWARN("req %p: %u behind, retry sealing\n", req, behind);
+
+		/* the new sequence number is taken at "redo"; taking one
+		 * here as well would only waste a sequence number */
+		goto redo;
+	}
+
+	/* now set the final wire data length */
+	req->rq_reqdata_len = lustre_shrink_msg(req->rq_reqbuf, 1, token.len,0);
+	RETURN(0);
+
+err_free:
+	/* pool buffers are owned by the pool; only free what we allocated */
+	if (!req->rq_pool) {
+		OBD_FREE_LARGE(req->rq_reqbuf, req->rq_reqbuf_len);
+		req->rq_reqbuf = NULL;
+		req->rq_reqbuf_len = 0;
+	}
+	RETURN(rc);
+}
+
+/*
+ * Unseal (decrypt) a reply received for a privacy-mode request.
+ *
+ * Decodes and sanity-checks the gss header of req->rq_repdata, then acts on
+ * the header's gh_proc:
+ *  - PTLRPC_GSS_PROC_DATA: unseal the message with gss_unseal_msg(), unpack
+ *    it, optionally unpack the trailing bulk security descriptor, and point
+ *    req->rq_repmsg / req->rq_replen at segment 0 of the result.
+ *  - PTLRPC_GSS_PROC_ERR: hand the header to gss_cli_ctx_handle_err_notify()
+ *    (rejected with -EPROTO for early replies).
+ *
+ * Returns 0 on success, -EPROTO for malformed/unexpected replies, -EPERM if
+ * unsealing fails or gh_proc is unknown, or the result of the error-notify
+ * handler.
+ */
+int gss_cli_ctx_unseal(struct ptlrpc_cli_ctx *ctx,
+		       struct ptlrpc_request *req)
+{
+	struct gss_cli_ctx      *gctx;
+	struct gss_header       *ghdr;
+	struct lustre_msg       *msg = req->rq_repdata;
+	int		      msglen, pack_bulk, swabbed, rc;
+	__u32		    major;
+	ENTRY;
+
+	LASSERT(req->rq_cli_ctx == ctx);
+	LASSERT(req->rq_ctx_init == 0);
+	LASSERT(msg);
+
+	gctx = container_of(ctx, struct gss_cli_ctx, gc_base);
+	swabbed = ptlrpc_rep_need_swab(req);
+
+	ghdr = gss_swab_header(msg, 0, swabbed);
+	if (ghdr == NULL) {
+		CERROR("can't decode gss header\n");
+		RETURN(-EPROTO);
+	}
+
+	/* sanity checks */
+	if (ghdr->gh_version != PTLRPC_GSS_VERSION) {
+		CERROR("gss version %u mismatch, expect %u\n",
+		       ghdr->gh_version, PTLRPC_GSS_VERSION);
+		RETURN(-EPROTO);
+	}
+
+	switch (ghdr->gh_proc) {
+	case PTLRPC_GSS_PROC_DATA:
+		pack_bulk = ghdr->gh_flags & LUSTRE_GSS_PACK_BULK;
+
+		/* for non-early replies the bulk flag must match what the
+		 * request asked for */
+		if (!req->rq_early && !equi(req->rq_pack_bulk == 1, pack_bulk)){
+			CERROR("%s bulk flag in reply\n",
+			       req->rq_pack_bulk ? "missing" : "unexpected");
+			RETURN(-EPROTO);
+		}
+
+		/* NOTE(review): presumably swabs the header back to its
+		 * on-wire byte order so gss_unseal_msg() sees the bytes as
+		 * they were sent -- confirm against gss_unseal_msg() */
+		if (swabbed)
+			gss_header_swabber(ghdr);
+
+		/* use rq_repdata_len as buffer size, which assume unseal
+		 * doesn't need extra memory space. for precise control, we'd
+		 * better calculate out actual buffer size as
+		 * (repbuf_len - offset - repdata_len) */
+		major = gss_unseal_msg(gctx->gc_mechctx, msg,
+				       &msglen, req->rq_repdata_len);
+		if (major != GSS_S_COMPLETE) {
+			CERROR("failed to unwrap reply: %x\n", major);
+			rc = -EPERM;
+			break;
+		}
+
+		/* from here on "swabbed" describes the unsealed inner
+		 * message, not the outer wrapper */
+		swabbed = __lustre_unpack_msg(msg, msglen);
+		if (swabbed < 0) {
+			CERROR("Failed to unpack after decryption\n");
+			RETURN(-EPROTO);
+		}
+
+		if (msg->lm_bufcount < 1) {
+			CERROR("Invalid reply buffer: empty\n");
+			RETURN(-EPROTO);
+		}
+
+		if (pack_bulk) {
+			if (msg->lm_bufcount < 2) {
+				CERROR("bufcount %u: missing bulk sec desc\n",
+				       msg->lm_bufcount);
+				RETURN(-EPROTO);
+			}
+
+			/* bulk checksum is the last segment */
+			if (bulk_sec_desc_unpack(msg, msg->lm_bufcount - 1,
+						 swabbed))
+				RETURN(-EPROTO);
+		}
+
+		/* the embedded lustre message is segment 0 */
+		req->rq_repmsg = lustre_msg_buf(msg, 0, 0);
+		req->rq_replen = msg->lm_buflens[0];
+
+		rc = 0;
+		break;
+	case PTLRPC_GSS_PROC_ERR:
+		if (req->rq_early) {
+			CERROR("server return error with early reply\n");
+			rc = -EPROTO;
+		} else {
+			rc = gss_cli_ctx_handle_err_notify(ctx, req, ghdr);
+		}
+		break;
+	default:
+		CERROR("unexpected proc %d\n", ghdr->gh_proc);
+		rc = -EPERM;
+	}
+
+	RETURN(rc);
+}
+
+/*********************************************
+ * reverse context installation	      *
+ *********************************************/
+
+/* Install a reverse service context by delegating to the svc upcall layer. */
+static inline
+int gss_install_rvs_svc_ctx(struct obd_import *imp,
+			    struct gss_sec *gsec,
+			    struct gss_cli_ctx *gctx)
+{
+	int	rc;
+
+	rc = gss_svc_upcall_install_rvs_ctx(imp, gsec, gctx);
+	return rc;
+}
+
+/*********************************************
+ * GSS security APIs			 *
+ *********************************************/
+/*
+ * Common initialization of a gss_sec and its embedded ptlrpc_sec.
+ *
+ * Looks up the gss mechanism for the flavor in \a sf, takes a reference on
+ * \a imp, and fills in the base security fields.  A non-NULL \a svcctx marks
+ * a reverse sec, for which garbage collection is disabled.
+ *
+ * Returns 0 on success or -EOPNOTSUPP if no mechanism matches the flavor.
+ */
+int gss_sec_create_common(struct gss_sec *gsec,
+			  struct ptlrpc_sec_policy *policy,
+			  struct obd_import *imp,
+			  struct ptlrpc_svc_ctx *svcctx,
+			  struct sptlrpc_flavor *sf)
+{
+	struct ptlrpc_sec   *sec = &gsec->gs_base;
+
+	LASSERT(imp);
+	LASSERT(SPTLRPC_FLVR_POLICY(sf->sf_rpc) == SPTLRPC_POLICY_GSS);
+
+	gsec->gs_mech = lgss_subflavor_to_mech(
+				SPTLRPC_FLVR_BASE_SUB(sf->sf_rpc));
+	if (gsec->gs_mech == NULL) {
+		CERROR("gss backend 0x%x not found\n",
+		       SPTLRPC_FLVR_BASE_SUB(sf->sf_rpc));
+		return -EOPNOTSUPP;
+	}
+
+	/* gss-private part */
+	spin_lock_init(&gsec->gs_lock);
+	gsec->gs_rvs_hdl = 0ULL;
+
+	/* embedded ptlrpc_sec */
+	sec->ps_policy = policy;
+	atomic_set(&sec->ps_refcount, 0);
+	atomic_set(&sec->ps_nctx, 0);
+	sec->ps_id = sptlrpc_get_next_secid();
+	sec->ps_flvr = *sf;
+	sec->ps_import = class_import_get(imp);
+	spin_lock_init(&sec->ps_lock);
+	INIT_LIST_HEAD(&sec->ps_gc_list);
+
+	if (svcctx) {
+		LASSERT(sec_is_reverse(sec));
+
+		/* a reverse sec is never garbage collected */
+		sec->ps_gc_interval = 0;
+	} else {
+		sec->ps_gc_interval = GSS_GC_INTERVAL;
+	}
+
+	if (SPTLRPC_FLVR_BULK_SVC(sec->ps_flvr.sf_rpc) ==
+	    SPTLRPC_BULK_SVC_PRIV)
+		sptlrpc_enc_pool_add_user();
+
+	CDEBUG(D_SEC, "create %s%s@%p\n", (svcctx ? "reverse " : ""),
+	       policy->sp_name, gsec);
+	return 0;
+}
+
+/*
+ * Common teardown of a gss_sec: drop the mechanism and import references
+ * and, for bulk privacy flavors, unregister from the encryption pool.
+ * The sec must have no remaining references or contexts.
+ */
+void gss_sec_destroy_common(struct gss_sec *gsec)
+{
+	struct ptlrpc_sec      *sec = &gsec->gs_base;
+	ENTRY;
+
+	LASSERT(sec->ps_import);
+	LASSERT(atomic_read(&sec->ps_refcount) == 0);
+	LASSERT(atomic_read(&sec->ps_nctx) == 0);
+
+	if (gsec->gs_mech != NULL) {
+		lgss_mech_put(gsec->gs_mech);
+		gsec->gs_mech = NULL;
+	}
+
+	class_import_put(sec->ps_import);
+
+	if (SPTLRPC_FLVR_BULK_SVC(sec->ps_flvr.sf_rpc) ==
+	    SPTLRPC_BULK_SVC_PRIV)
+		sptlrpc_enc_pool_del_user();
+
+	EXIT;
+}
+
+/* Mark the security instance as dying by setting its ps_dying flag. */
+void gss_sec_kill(struct ptlrpc_sec *sec)
+{
+	sec->ps_dying = 1;
+}
+
+/*
+ * Common initialization of a client context: reset the gss sequence state,
+ * fill in the base ptlrpc_cli_ctx fields (flags start as PTLRPC_CTX_NEW) and
+ * account the context on its owning sec.  Always returns 0.
+ */
+int gss_cli_ctx_init_common(struct ptlrpc_sec *sec,
+			    struct ptlrpc_cli_ctx *ctx,
+			    struct ptlrpc_ctx_ops *ctxops,
+			    struct vfs_cred *vcred)
+{
+	struct gss_cli_ctx    *gss_ctx = ctx2gctx(ctx);
+
+	/* gss-specific state */
+	atomic_set(&gss_ctx->gc_seq, 0);
+	gss_ctx->gc_win = 0;
+
+	/* identity and ownership */
+	ctx->cc_sec = sec;
+	ctx->cc_ops = ctxops;
+	ctx->cc_vcred = *vcred;
+	ctx->cc_flags = PTLRPC_CTX_NEW;
+	ctx->cc_expire = 0;
+	atomic_set(&ctx->cc_refcount, 0);
+
+	/* locking and list linkage */
+	spin_lock_init(&ctx->cc_lock);
+	INIT_HLIST_NODE(&ctx->cc_cache);
+	INIT_LIST_HEAD(&ctx->cc_req_list);
+	INIT_LIST_HEAD(&ctx->cc_gc_chain);
+
+	/* reference on the owning sec, dropped when the ctx is destroyed */
+	atomic_inc(&sec->ps_refcount);
+	/* context counter, for statistics only */
+	atomic_inc(&sec->ps_nctx);
+
+	CDEBUG(D_SEC, "%s@%p: create ctx %p(%u->%s)\n",
+	       sec->ps_policy->sp_name, ctx->cc_sec,
+	       ctx, ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec));
+	return 0;
+}
+
+/*
+ * Common finalization of a client context whose refcount has reached zero.
+ *
+ * If the context still has a mechanism context, a context-fini RPC is issued
+ * and the gss state is finalized under a temporary reference.
+ *
+ * return value:
+ *   1: the context has been taken care of by someone else
+ *   0: proceed to really destroy the context locally
+ */
+int gss_cli_ctx_fini_common(struct ptlrpc_sec *sec,
+			    struct ptlrpc_cli_ctx *ctx)
+{
+	struct gss_cli_ctx *gctx = ctx2gctx(ctx);
+
+	LASSERT(atomic_read(&sec->ps_nctx) > 0);
+	LASSERT(atomic_read(&ctx->cc_refcount) == 0);
+	LASSERT(ctx->cc_sec == sec);
+
+	/*
+	 * remove UPTODATE flag of reverse ctx thus we won't send fini rpc,
+	 * this is to avoid potential problems of client side reverse svc ctx
+	 * being mis-destroyed in various recovery scenarios. anyway client can
+	 * manage its reverse ctx well by associating it with its buddy ctx.
+	 */
+	if (sec_is_reverse(sec))
+		ctx->cc_flags &= ~PTLRPC_CTX_UPTODATE;
+
+	if (gctx->gc_mechctx) {
+		/* the final context fini rpc will use this ctx too, and it's
+		 * asynchronous, finished by request_out_callback(). so we
+		 * take a reference here; whoever finally drops the refcount
+		 * to 0 is responsible for the rest of the destroy. */
+		atomic_inc(&ctx->cc_refcount);
+
+		gss_do_ctx_fini_rpc(gctx);
+		gss_cli_ctx_finalize(gctx);
+
+		/* someone else still holds a reference: they finish up */
+		if (!atomic_dec_and_test(&ctx->cc_refcount))
+			return 1;
+	}
+
+	if (sec_is_reverse(sec))
+		CWARN("reverse sec %p: destroy ctx %p\n",
+		      ctx->cc_sec, ctx);
+	else
+		CWARN("%s@%p: destroy ctx %p(%u->%s)\n",
+		      sec->ps_policy->sp_name, ctx->cc_sec,
+		      ctx, ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec));
+
+	return 0;
+}
+
+/*
+ * Allocate (or reuse the pre-allocated pool) request buffer for the
+ * null/auth/integrity service modes and lay out its segments.
+ *
+ * \a msgsize is the size of the embedded lustre message (segment 1).
+ * On success req->rq_reqbuf is initialized and req->rq_reqmsg points at
+ * segment 1.  Returns 0 or -ENOMEM.
+ */
+static
+int gss_alloc_reqbuf_intg(struct ptlrpc_sec *sec,
+			  struct ptlrpc_request *req,
+			  int svc, int msgsize)
+{
+	int		       bufsize, txtsize;
+	int		       bufcnt = 2;
+	__u32		     buflens[5];
+	ENTRY;
+
+	/*
+	 * on-wire data layout:
+	 * - gss header
+	 * - lustre message
+	 * - user descriptor (optional)
+	 * - bulk sec descriptor (optional)
+	 * - signature (optional)
+	 *   - svc == NULL: NULL
+	 *   - svc == AUTH: signature of gss header
+	 *   - svc == INTG: signature of all above
+	 *
+	 * if this is context negotiation, reserve fixed space
+	 * at the last (signature) segment regardless of svc mode.
+	 */
+
+	/* txtsize accumulates the number of bytes the signature covers */
+	buflens[0] = PTLRPC_GSS_HEADER_SIZE;
+	txtsize = buflens[0];
+
+	buflens[1] = msgsize;
+	if (svc == SPTLRPC_SVC_INTG)
+		txtsize += buflens[1];
+
+	if (req->rq_pack_udesc) {
+		buflens[bufcnt] = sptlrpc_current_user_desc_size();
+		if (svc == SPTLRPC_SVC_INTG)
+			txtsize += buflens[bufcnt];
+		bufcnt++;
+	}
+
+	if (req->rq_pack_bulk) {
+		buflens[bufcnt] = gss_cli_bulk_payload(req->rq_cli_ctx,
+						       &req->rq_flvr,
+						       0, req->rq_bulk_read);
+		if (svc == SPTLRPC_SVC_INTG)
+			txtsize += buflens[bufcnt];
+		bufcnt++;
+	}
+
+	/* last segment: context-init payload or signature, if any */
+	if (req->rq_ctx_init)
+		buflens[bufcnt++] = GSS_CTX_INIT_MAX_LEN;
+	else if (svc != SPTLRPC_SVC_NULL)
+		buflens[bufcnt++] = gss_cli_payload(req->rq_cli_ctx, txtsize,0);
+
+	bufsize = lustre_msg_size_v2(bufcnt, buflens);
+
+	if (!req->rq_reqbuf) {
+		bufsize = size_roundup_power2(bufsize);
+
+		OBD_ALLOC_LARGE(req->rq_reqbuf, bufsize);
+		if (!req->rq_reqbuf)
+			RETURN(-ENOMEM);
+
+		req->rq_reqbuf_len = bufsize;
+	} else {
+		/* an existing buffer must come from the request pool */
+		LASSERT(req->rq_pool);
+		LASSERT(req->rq_reqbuf_len >= bufsize);
+		memset(req->rq_reqbuf, 0, bufsize);
+	}
+
+	lustre_init_msg_v2(req->rq_reqbuf, bufcnt, buflens, NULL);
+	req->rq_reqbuf->lm_secflvr = req->rq_flvr.sf_rpc;
+
+	req->rq_reqmsg = lustre_msg_buf(req->rq_reqbuf, 1, msgsize);
+	LASSERT(req->rq_reqmsg);
+
+	/* pack user desc here, later we might leave current user's process */
+	if (req->rq_pack_udesc)
+		sptlrpc_pack_user_desc(req->rq_reqbuf, 2);
+
+	RETURN(0);
+}
+
+/*
+ * Allocate the clear-text request buffer for privacy mode.
+ *
+ * The clear buffer (req->rq_clrbuf) holds the lustre message plus optional
+ * user descriptor and bulk checksum.  For pool requests the clear buffer is
+ * placed inside the pre-allocated req->rq_reqbuf right after the space
+ * reserved for the wire message when both fit; otherwise it is allocated
+ * separately.  On success req->rq_reqmsg points at segment 0 of the clear
+ * buffer.  Returns 0 or -ENOMEM.
+ */
+static
+int gss_alloc_reqbuf_priv(struct ptlrpc_sec *sec,
+			  struct ptlrpc_request *req,
+			  int msgsize)
+{
+	__u32		     ibuflens[3], wbuflens[2];
+	int		       ibufcnt;
+	int		       clearsize, wiresize;
+	ENTRY;
+
+	LASSERT(req->rq_clrbuf == NULL);
+	LASSERT(req->rq_clrbuf_len == 0);
+
+	/* Inner (clear) buffers
+	 *  - lustre message
+	 *  - user descriptor (optional)
+	 *  - bulk checksum (optional)
+	 */
+	ibufcnt = 1;
+	ibuflens[0] = msgsize;
+
+	if (req->rq_pack_udesc)
+		ibuflens[ibufcnt++] = sptlrpc_current_user_desc_size();
+	if (req->rq_pack_bulk)
+		ibuflens[ibufcnt++] = gss_cli_bulk_payload(req->rq_cli_ctx,
+							   &req->rq_flvr, 0,
+							   req->rq_bulk_read);
+
+	clearsize = lustre_msg_size_v2(ibufcnt, ibuflens);
+	/* to allow append padding during encryption */
+	clearsize += GSS_MAX_CIPHER_BLOCK;
+
+	/* Wrapper (wire) buffers
+	 *  - gss header
+	 *  - cipher text
+	 */
+	wbuflens[0] = PTLRPC_GSS_HEADER_SIZE;
+	wbuflens[1] = gss_cli_payload(req->rq_cli_ctx, clearsize, 1);
+	wiresize = lustre_msg_size_v2(2, wbuflens);
+
+	if (req->rq_pool) {
+		/* rq_reqbuf is preallocated */
+		LASSERT(req->rq_reqbuf);
+		LASSERT(req->rq_reqbuf_len >= wiresize);
+
+		memset(req->rq_reqbuf, 0, req->rq_reqbuf_len);
+
+		/* if the pre-allocated buffer is big enough, we just pack
+		 * both clear buf & request buf in it, to avoid more alloc. */
+		if (clearsize + wiresize <= req->rq_reqbuf_len) {
+			req->rq_clrbuf =
+				(void *) (((char *) req->rq_reqbuf) + wiresize);
+		} else {
+			CWARN("pre-allocated buf size %d is not enough for "
+			      "both clear (%d) and cipher (%d) text, proceed "
+			      "with extra allocation\n", req->rq_reqbuf_len,
+			      clearsize, wiresize);
+		}
+	}
+
+	/* no room in the pool buffer (or no pool): allocate separately */
+	if (!req->rq_clrbuf) {
+		clearsize = size_roundup_power2(clearsize);
+
+		OBD_ALLOC_LARGE(req->rq_clrbuf, clearsize);
+		if (!req->rq_clrbuf)
+			RETURN(-ENOMEM);
+	}
+	req->rq_clrbuf_len = clearsize;
+
+	lustre_init_msg_v2(req->rq_clrbuf, ibufcnt, ibuflens, NULL);
+	req->rq_reqmsg = lustre_msg_buf(req->rq_clrbuf, 0, msgsize);
+
+	if (req->rq_pack_udesc)
+		sptlrpc_pack_user_desc(req->rq_clrbuf, 1);
+
+	RETURN(0);
+}
+
+/*
+ * Allocate the request buffer, dispatching on the service part of the rpc
+ * flavor: privacy uses the clear/cipher scheme, everything else the
+ * integrity layout.
+ *
+ * NOTE: any change of request buffer allocation should also consider
+ * changing enlarge_reqbuf() series functions.
+ */
+int gss_alloc_reqbuf(struct ptlrpc_sec *sec,
+		     struct ptlrpc_request *req,
+		     int msgsize)
+{
+	int     flavor_svc = SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc);
+
+	LASSERT(!req->rq_pack_bulk ||
+		(req->rq_bulk_read || req->rq_bulk_write));
+
+	if (flavor_svc == SPTLRPC_SVC_PRIV)
+		return gss_alloc_reqbuf_priv(sec, req, msgsize);
+
+	if (flavor_svc == SPTLRPC_SVC_NULL ||
+	    flavor_svc == SPTLRPC_SVC_AUTH ||
+	    flavor_svc == SPTLRPC_SVC_INTG)
+		return gss_alloc_reqbuf_intg(sec, req, flavor_svc, msgsize);
+
+	LASSERTF(0, "bad rpc flavor %x\n", req->rq_flvr.sf_rpc);
+	return 0;
+}
+
+/*
+ * Release the request buffers of \a req.
+ *
+ * The clear buffer is freed unless it lives inside a pool-owned
+ * req->rq_reqbuf; the wire buffer is freed only for non-pool requests.
+ */
+void gss_free_reqbuf(struct ptlrpc_sec *sec,
+		     struct ptlrpc_request *req)
+{
+	int     privacy;
+	ENTRY;
+
+	LASSERT(!req->rq_pool || req->rq_reqbuf);
+	privacy = SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc) == SPTLRPC_SVC_PRIV;
+
+	if (req->rq_clrbuf) {
+		/* only privacy mode ever sets up a clear buffer */
+		LASSERT(privacy);
+		LASSERT(req->rq_clrbuf_len);
+
+		/* free it unless it is carved out of the pool buffer */
+		if (req->rq_pool == NULL ||
+		    req->rq_clrbuf < req->rq_reqbuf ||
+		    (char *) req->rq_clrbuf >=
+		    (char *) req->rq_reqbuf + req->rq_reqbuf_len)
+			OBD_FREE_LARGE(req->rq_clrbuf, req->rq_clrbuf_len);
+
+		req->rq_clrbuf = NULL;
+		req->rq_clrbuf_len = 0;
+	}
+
+	if (!req->rq_pool && req->rq_reqbuf) {
+		LASSERT(req->rq_reqbuf_len);
+
+		OBD_FREE_LARGE(req->rq_reqbuf, req->rq_reqbuf_len);
+		req->rq_reqbuf = NULL;
+		req->rq_reqbuf_len = 0;
+	}
+
+	EXIT;
+}
+
+/*
+ * Allocate req->rq_repbuf with \a bufsize rounded up by
+ * size_roundup_power2() and record the rounded length.
+ * Returns 0 or -ENOMEM.
+ */
+static int do_alloc_repbuf(struct ptlrpc_request *req, int bufsize)
+{
+	int	rounded = size_roundup_power2(bufsize);
+
+	OBD_ALLOC_LARGE(req->rq_repbuf, rounded);
+	if (req->rq_repbuf == NULL)
+		return -ENOMEM;
+
+	req->rq_repbuf_len = rounded;
+	return 0;
+}
+
+/*
+ * Compute the reply buffer size for the null/auth/integrity service modes
+ * and allocate it through do_alloc_repbuf().
+ *
+ * \a msgsize is the expected size of the embedded lustre message.
+ * Returns the result of do_alloc_repbuf() (0 or -ENOMEM).
+ */
+static
+int gss_alloc_repbuf_intg(struct ptlrpc_sec *sec,
+			  struct ptlrpc_request *req,
+			  int svc, int msgsize)
+{
+	int	     txtsize;
+	__u32	   buflens[4];
+	int	     bufcnt = 2;
+	int	     alloc_size;
+
+	/*
+	 * on-wire data layout:
+	 * - gss header
+	 * - lustre message
+	 * - bulk sec descriptor (optional)
+	 * - signature (optional)
+	 *   - svc == NULL: NULL
+	 *   - svc == AUTH: signature of gss header
+	 *   - svc == INTG: signature of all above
+	 *
+	 * if this is context negotiation, reserve fixed space
+	 * at the last (signature) segment regardless of svc mode.
+	 */
+
+	/* txtsize accumulates the number of bytes the signature covers */
+	buflens[0] = PTLRPC_GSS_HEADER_SIZE;
+	txtsize = buflens[0];
+
+	buflens[1] = msgsize;
+	if (svc == SPTLRPC_SVC_INTG)
+		txtsize += buflens[1];
+
+	if (req->rq_pack_bulk) {
+		buflens[bufcnt] = gss_cli_bulk_payload(req->rq_cli_ctx,
+						       &req->rq_flvr,
+						       1, req->rq_bulk_read);
+		if (svc == SPTLRPC_SVC_INTG)
+			txtsize += buflens[bufcnt];
+		bufcnt++;
+	}
+
+	/* last segment: context-init payload or signature, if any */
+	if (req->rq_ctx_init)
+		buflens[bufcnt++] = GSS_CTX_INIT_MAX_LEN;
+	else if (svc != SPTLRPC_SVC_NULL)
+		buflens[bufcnt++] = gss_cli_payload(req->rq_cli_ctx, txtsize,0);
+
+	alloc_size = lustre_msg_size_v2(bufcnt, buflens);
+
+	/* add space for early reply */
+	alloc_size += gss_at_reply_off_integ;
+
+	return do_alloc_repbuf(req, alloc_size);
+}
+
+/*
+ * Compute the reply buffer size for privacy mode and allocate it through
+ * do_alloc_repbuf().
+ *
+ * The size of the clear-text reply (message plus optional bulk descriptor,
+ * plus cipher block padding) is computed first, then the size of the
+ * two-segment wrapper that carries it.  buflens[] is reused for both steps.
+ * Returns the result of do_alloc_repbuf() (0 or -ENOMEM).
+ */
+static
+int gss_alloc_repbuf_priv(struct ptlrpc_sec *sec,
+			  struct ptlrpc_request *req,
+			  int msgsize)
+{
+	int	     txtsize;
+	__u32	   buflens[2];
+	int	     bufcnt;
+	int	     alloc_size;
+
+	/* inner buffers */
+	bufcnt = 1;
+	buflens[0] = msgsize;
+
+	if (req->rq_pack_bulk)
+		buflens[bufcnt++] = gss_cli_bulk_payload(req->rq_cli_ctx,
+							 &req->rq_flvr,
+							 1, req->rq_bulk_read);
+	txtsize = lustre_msg_size_v2(bufcnt, buflens);
+	/* room for padding added during encryption */
+	txtsize += GSS_MAX_CIPHER_BLOCK;
+
+	/* wrapper buffers */
+	bufcnt = 2;
+	buflens[0] = PTLRPC_GSS_HEADER_SIZE;
+	buflens[1] = gss_cli_payload(req->rq_cli_ctx, txtsize, 1);
+
+	alloc_size = lustre_msg_size_v2(bufcnt, buflens);
+	/* add space for early reply */
+	alloc_size += gss_at_reply_off_priv;
+
+	return do_alloc_repbuf(req, alloc_size);
+}
+
+/*
+ * Allocate the reply buffer, dispatching on the service part of the rpc
+ * flavor.  \a msgsize is the expected size of the embedded lustre message.
+ *
+ * Returns the result of the mode-specific allocator (0 or -ENOMEM).
+ */
+int gss_alloc_repbuf(struct ptlrpc_sec *sec,
+		     struct ptlrpc_request *req,
+		     int msgsize)
+{
+	int     svc = SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc);
+	int     rc;
+	ENTRY;
+
+	LASSERT(!req->rq_pack_bulk ||
+		(req->rq_bulk_read || req->rq_bulk_write));
+
+	switch (svc) {
+	case SPTLRPC_SVC_NULL:
+	case SPTLRPC_SVC_AUTH:
+	case SPTLRPC_SVC_INTG:
+		rc = gss_alloc_repbuf_intg(sec, req, svc, msgsize);
+		break;
+	case SPTLRPC_SVC_PRIV:
+		rc = gss_alloc_repbuf_priv(sec, req, msgsize);
+		break;
+	default:
+		LASSERTF(0, "bad rpc flavor %x\n", req->rq_flvr.sf_rpc);
+		rc = 0;
+	}
+
+	/* leave through RETURN() so the ENTRY above is always balanced */
+	RETURN(rc);
+}
+
+/* Free the reply buffer of \a req and reset all reply buffer/data fields. */
+void gss_free_repbuf(struct ptlrpc_sec *sec,
+		     struct ptlrpc_request *req)
+{
+	/* the reply data fields describe the buffer that is about to go */
+	req->rq_repdata = NULL;
+	req->rq_repdata_len = 0;
+
+	OBD_FREE_LARGE(req->rq_repbuf, req->rq_repbuf_len);
+	req->rq_repbuf = NULL;
+	req->rq_repbuf_len = 0;
+}
+
+/*
+ * Return the total size \a msg would have if \a segment were grown to
+ * \a newsize.  The segment length is patched temporarily and restored
+ * before returning, so \a msg is unchanged on return.
+ */
+static int get_enlarged_msgsize(struct lustre_msg *msg,
+				int segment, int newsize)
+{
+	int old_len, total;
+
+	LASSERT(newsize >= msg->lm_buflens[segment]);
+
+	old_len = msg->lm_buflens[segment];
+
+	msg->lm_buflens[segment] = newsize;
+	total = lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens);
+	msg->lm_buflens[segment] = old_len;
+
+	return total;
+}
+
+/*
+ * Same as get_enlarged_msgsize() but for two segments grown at once.
+ * Both segment lengths are restored before returning.
+ */
+static int get_enlarged_msgsize2(struct lustre_msg *msg,
+				 int segment1, int newsize1,
+				 int segment2, int newsize2)
+{
+	int old_len1, old_len2, total;
+
+	LASSERT(newsize1 >= msg->lm_buflens[segment1]);
+	LASSERT(newsize2 >= msg->lm_buflens[segment2]);
+
+	/* remember the current lengths */
+	old_len1 = msg->lm_buflens[segment1];
+	old_len2 = msg->lm_buflens[segment2];
+
+	/* measure with the enlarged lengths in place */
+	msg->lm_buflens[segment1] = newsize1;
+	msg->lm_buflens[segment2] = newsize2;
+	total = lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens);
+
+	/* put the original lengths back */
+	msg->lm_buflens[segment1] = old_len1;
+	msg->lm_buflens[segment2] = old_len2;
+
+	return total;
+}
+
+/*
+ * Grow \a segment of the embedded request message to \a newsize for the
+ * null/auth/integrity layouts.
+ *
+ * Recomputes the embedded message size, the signature size (when a
+ * signature segment exists) and the wrapper size; reallocates
+ * req->rq_reqbuf if it is too small; then enlarges the segments in place.
+ * req->rq_reqlen is updated on success.  Returns 0 or -ENOMEM.
+ */
+static
+int gss_enlarge_reqbuf_intg(struct ptlrpc_sec *sec,
+			    struct ptlrpc_request *req,
+			    int svc,
+			    int segment, int newsize)
+{
+	struct lustre_msg      *newbuf;
+	int		     txtsize, sigsize = 0, i;
+	int		     newmsg_size, newbuf_size;
+
+	/*
+	 * gss header is at seg 0;
+	 * embedded msg is at seg 1;
+	 * signature (if any) is at the last seg
+	 */
+	LASSERT(req->rq_reqbuf);
+	LASSERT(req->rq_reqbuf_len > req->rq_reqlen);
+	LASSERT(req->rq_reqbuf->lm_bufcount >= 2);
+	LASSERT(lustre_msg_buf(req->rq_reqbuf, 1, 0) == req->rq_reqmsg);
+
+	/* 1. compute new embedded msg size */
+	newmsg_size = get_enlarged_msgsize(req->rq_reqmsg, segment, newsize);
+	LASSERT(newmsg_size >= req->rq_reqbuf->lm_buflens[1]);
+
+	/* 2. compute new wrapper msg size */
+	if (svc == SPTLRPC_SVC_NULL) {
+		/* no signature, get size directly */
+		newbuf_size = get_enlarged_msgsize(req->rq_reqbuf,
+						   1, newmsg_size);
+	} else {
+		txtsize = req->rq_reqbuf->lm_buflens[0];
+
+		if (svc == SPTLRPC_SVC_INTG) {
+			/* sum segments 1..last, then account for the growth
+			 * of the embedded message (seg 1) */
+			for (i = 1; i < req->rq_reqbuf->lm_bufcount; i++)
+				txtsize += req->rq_reqbuf->lm_buflens[i];
+			txtsize += newmsg_size - req->rq_reqbuf->lm_buflens[1];
+		}
+
+		sigsize = gss_cli_payload(req->rq_cli_ctx, txtsize, 0);
+		LASSERT(sigsize >= msg_last_seglen(req->rq_reqbuf));
+
+		newbuf_size = get_enlarged_msgsize2(
+					req->rq_reqbuf,
+					1, newmsg_size,
+					msg_last_segidx(req->rq_reqbuf),
+					sigsize);
+	}
+
+	/* request from pool should always have enough buffer */
+	LASSERT(!req->rq_pool || req->rq_reqbuf_len >= newbuf_size);
+
+	if (req->rq_reqbuf_len < newbuf_size) {
+		newbuf_size = size_roundup_power2(newbuf_size);
+
+		OBD_ALLOC_LARGE(newbuf, newbuf_size);
+		if (newbuf == NULL)
+			RETURN(-ENOMEM);
+
+		/* carry the old contents over, then swap buffers */
+		memcpy(newbuf, req->rq_reqbuf, req->rq_reqbuf_len);
+
+		OBD_FREE_LARGE(req->rq_reqbuf, req->rq_reqbuf_len);
+		req->rq_reqbuf = newbuf;
+		req->rq_reqbuf_len = newbuf_size;
+		req->rq_reqmsg = lustre_msg_buf(req->rq_reqbuf, 1, 0);
+	}
+
+	/* do enlargement, from wrapper to embedded, from end to begin */
+	if (svc != SPTLRPC_SVC_NULL)
+		_sptlrpc_enlarge_msg_inplace(req->rq_reqbuf,
+					     msg_last_segidx(req->rq_reqbuf),
+					     sigsize);
+
+	_sptlrpc_enlarge_msg_inplace(req->rq_reqbuf, 1, newmsg_size);
+	_sptlrpc_enlarge_msg_inplace(req->rq_reqmsg, segment, newsize);
+
+	req->rq_reqlen = newmsg_size;
+	RETURN(0);
+}
+
+/*
+ * Grow \a segment of the embedded request message to \a newsize for
+ * privacy mode.
+ *
+ * Only the clear buffer (req->rq_clrbuf) is enlarged here.  If the clear
+ * buffer lives inside the pool-owned req->rq_reqbuf it is either moved
+ * further back inside that buffer or split out into its own allocation.
+ * req->rq_reqlen is updated on success.  Returns 0 or -ENOMEM.
+ */
+static
+int gss_enlarge_reqbuf_priv(struct ptlrpc_sec *sec,
+			    struct ptlrpc_request *req,
+			    int segment, int newsize)
+{
+	struct lustre_msg      *newclrbuf;
+	int		     newmsg_size, newclrbuf_size, newcipbuf_size;
+	__u32		   buflens[3];
+
+	/*
+	 * embedded msg is at seg 0 of clear buffer;
+	 * cipher text is at seg 2 of cipher buffer;
+	 *
+	 * NOTE(review): the three-segment cipher buffer assumed here (and
+	 * by the lm_bufcount == 3 assertion and buflens[3] below) does not
+	 * match gss_alloc_reqbuf_priv()/gss_cli_ctx_seal() in this file,
+	 * which size and build a two-segment wire message -- confirm which
+	 * layout is current.
+	 */
+	LASSERT(req->rq_pool ||
+		(req->rq_reqbuf == NULL && req->rq_reqbuf_len == 0));
+	LASSERT(req->rq_reqbuf == NULL ||
+		(req->rq_pool && req->rq_reqbuf->lm_bufcount == 3));
+	LASSERT(req->rq_clrbuf);
+	LASSERT(req->rq_clrbuf_len > req->rq_reqlen);
+	LASSERT(lustre_msg_buf(req->rq_clrbuf, 0, 0) == req->rq_reqmsg);
+
+	/* compute new embedded msg size */
+	newmsg_size = get_enlarged_msgsize(req->rq_reqmsg, segment, newsize);
+
+	/* compute new clear buffer size */
+	newclrbuf_size = get_enlarged_msgsize(req->rq_clrbuf, 0, newmsg_size);
+	newclrbuf_size += GSS_MAX_CIPHER_BLOCK;
+
+	/* compute new cipher buffer size */
+	buflens[0] = PTLRPC_GSS_HEADER_SIZE;
+	buflens[1] = gss_cli_payload(req->rq_cli_ctx, buflens[0], 0);
+	buflens[2] = gss_cli_payload(req->rq_cli_ctx, newclrbuf_size, 1);
+	newcipbuf_size = lustre_msg_size_v2(3, buflens);
+
+	/* handle the case that we put both clear buf and cipher buf into
+	 * pre-allocated single buffer. */
+	if (unlikely(req->rq_pool) &&
+	    req->rq_clrbuf >= req->rq_reqbuf &&
+	    (char *) req->rq_clrbuf <
+	    (char *) req->rq_reqbuf + req->rq_reqbuf_len) {
+		/* it couldn't be better we still fit into the
+		 * pre-allocated buffer. */
+		if (newclrbuf_size + newcipbuf_size <= req->rq_reqbuf_len) {
+			void *src, *dst;
+
+			/* move clear text backward. */
+			src = req->rq_clrbuf;
+			dst = (char *) req->rq_reqbuf + newcipbuf_size;
+
+			/* regions may overlap, hence memmove */
+			memmove(dst, src, req->rq_clrbuf_len);
+
+			req->rq_clrbuf = (struct lustre_msg *) dst;
+			req->rq_clrbuf_len = newclrbuf_size;
+			req->rq_reqmsg = lustre_msg_buf(req->rq_clrbuf, 0, 0);
+		} else {
+			/* sadly we have to split out the clear buffer */
+			LASSERT(req->rq_reqbuf_len >= newcipbuf_size);
+			LASSERT(req->rq_clrbuf_len < newclrbuf_size);
+		}
+	}
+
+	if (req->rq_clrbuf_len < newclrbuf_size) {
+		newclrbuf_size = size_roundup_power2(newclrbuf_size);
+
+		OBD_ALLOC_LARGE(newclrbuf, newclrbuf_size);
+		if (newclrbuf == NULL)
+			RETURN(-ENOMEM);
+
+		memcpy(newclrbuf, req->rq_clrbuf, req->rq_clrbuf_len);
+
+		/* free the old clear buffer only if it was a separate
+		 * allocation, i.e. not carved out of rq_reqbuf */
+		if (req->rq_reqbuf == NULL ||
+		    req->rq_clrbuf < req->rq_reqbuf ||
+		    (char *) req->rq_clrbuf >=
+		    (char *) req->rq_reqbuf + req->rq_reqbuf_len) {
+			OBD_FREE_LARGE(req->rq_clrbuf, req->rq_clrbuf_len);
+		}
+
+		req->rq_clrbuf = newclrbuf;
+		req->rq_clrbuf_len = newclrbuf_size;
+		req->rq_reqmsg = lustre_msg_buf(req->rq_clrbuf, 0, 0);
+	}
+
+	/* enlarge the wrapper segment first, then the embedded segment */
+	_sptlrpc_enlarge_msg_inplace(req->rq_clrbuf, 0, newmsg_size);
+	_sptlrpc_enlarge_msg_inplace(req->rq_reqmsg, segment, newsize);
+	req->rq_reqlen = newmsg_size;
+
+	RETURN(0);
+}
+
+/*
+ * Grow \a segment of the request message to \a newsize, dispatching on the
+ * service part of the rpc flavor.  Not valid for context init/fini requests.
+ */
+int gss_enlarge_reqbuf(struct ptlrpc_sec *sec,
+		       struct ptlrpc_request *req,
+		       int segment, int newsize)
+{
+	int     flavor_svc = SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc);
+
+	LASSERT(!req->rq_ctx_init && !req->rq_ctx_fini);
+
+	if (flavor_svc == SPTLRPC_SVC_PRIV)
+		return gss_enlarge_reqbuf_priv(sec, req, segment, newsize);
+
+	if (flavor_svc == SPTLRPC_SVC_NULL ||
+	    flavor_svc == SPTLRPC_SVC_AUTH ||
+	    flavor_svc == SPTLRPC_SVC_INTG)
+		return gss_enlarge_reqbuf_intg(sec, req, flavor_svc,
+					       segment, newsize);
+
+	LASSERTF(0, "bad rpc flavor %x\n", req->rq_flvr.sf_rpc);
+	return 0;
+}
+
+/*
+ * Install the reverse service context for \a ctx on \a imp: recover the
+ * gss wrappers of \a sec and \a ctx and hand them to
+ * gss_install_rvs_svc_ctx(), whose result is returned.
+ */
+int gss_sec_install_rctx(struct obd_import *imp,
+			 struct ptlrpc_sec *sec,
+			 struct ptlrpc_cli_ctx *ctx)
+{
+	struct gss_sec     *gsec = container_of(sec, struct gss_sec, gs_base);
+	struct gss_cli_ctx *gctx = container_of(ctx, struct gss_cli_ctx,
+						gc_base);
+
+	return gss_install_rvs_svc_ctx(imp, gsec, gctx);
+}
+
+/********************************************
+ * server side API			  *
+ ********************************************/
+
+/*
+ * Return 1 if any of src_init, src_init_continue or src_err_notify is set
+ * on \a grctx, 0 otherwise.
+ */
+static inline
+int gss_svc_reqctx_is_special(struct gss_svc_reqctx *grctx)
+{
+	LASSERT(grctx);
+
+	if (grctx->src_init)
+		return 1;
+	if (grctx->src_init_continue)
+		return 1;
+
+	return grctx->src_err_notify ? 1 : 0;
+}
+
+/*
+ * Release a request context: put the service context (if one is attached),
+ * put the policy, then free the structure itself.
+ */
+static
+void gss_svc_reqctx_free(struct gss_svc_reqctx *grctx)
+{
+	if (grctx->src_ctx != NULL)
+		gss_svc_upcall_put_ctx(grctx->src_ctx);
+
+	sptlrpc_policy_put(grctx->src_base.sc_policy);
+
+	OBD_FREE_PTR(grctx);
+}
+
+/* Take an additional reference on \a grctx; the caller must already hold one. */
+static inline
+void gss_svc_reqctx_addref(struct gss_svc_reqctx *grctx)
+{
+	LASSERT(atomic_read(&grctx->src_base.sc_refcount) > 0);
+	atomic_inc(&grctx->src_base.sc_refcount);
+}
+
+/* Drop a reference on \a grctx, freeing it when the last one goes away. */
+static inline
+void gss_svc_reqctx_decref(struct gss_svc_reqctx *grctx)
+{
+	LASSERT(atomic_read(&grctx->src_base.sc_refcount) > 0);
+
+	if (!atomic_dec_and_test(&grctx->src_base.sc_refcount))
+		return;
+
+	gss_svc_reqctx_free(grctx);
+}
+
+/*
+ * Sign a server reply.
+ *
+ * Shrinks the embedded message segment to req->rq_replen if needed, signs
+ * rs->rs_repbuf with gss_sign_msg() using the request's wire sequence
+ * number, stores the resulting length in rs->rs_repdata_len and sets
+ * req->rq_reply_off.
+ *
+ * Returns 0 on success or the negative value returned by gss_sign_msg().
+ */
+static
+int gss_svc_sign(struct ptlrpc_request *req,
+		 struct ptlrpc_reply_state *rs,
+		 struct gss_svc_reqctx *grctx,
+		 __u32 svc)
+{
+	__u32   flags = 0;
+	int     rc;
+	ENTRY;
+
+	LASSERT(rs->rs_msg == lustre_msg_buf(rs->rs_repbuf, 1, 0));
+
+	/* embedded lustre_msg might have been shrunk */
+	if (req->rq_replen != rs->rs_repbuf->lm_buflens[1])
+		lustre_shrink_msg(rs->rs_repbuf, 1, req->rq_replen, 1);
+
+	if (req->rq_pack_bulk)
+		flags |= LUSTRE_GSS_PACK_BULK;
+
+	rc = gss_sign_msg(rs->rs_repbuf, grctx->src_ctx->gsc_mechctx,
+			  LUSTRE_SP_ANY, flags, PTLRPC_GSS_PROC_DATA,
+			  grctx->src_wirectx.gw_seq, svc, NULL);
+	if (rc < 0)
+		RETURN(rc);
+
+	/* a non-negative return is the length of the signed reply */
+	rs->rs_repdata_len = rc;
+
+	if (likely(req->rq_packed_final)) {
+		if (lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT)
+			req->rq_reply_off = gss_at_reply_off_integ;
+		else
+			req->rq_reply_off = 0;
+	} else {
+		/* not the final reply: with no signature (svc NULL) protect
+		 * the embedded message with a crc32 instead.
+		 * NOTE(review): "!(__u32) 0" evaluates to 1, so the seed is
+		 * 1; ~0 may have been intended, but the client-side
+		 * early-reply check in this file uses the same expression,
+		 * so both ends must be changed together if at all. */
+		if (svc == SPTLRPC_SVC_NULL)
+			rs->rs_repbuf->lm_cksum = crc32_le(!(__u32) 0,
+					lustre_msg_buf(rs->rs_repbuf, 1, 0),
+					lustre_msg_buflen(rs->rs_repbuf, 1));
+		req->rq_reply_off = 0;
+	}
+
+	RETURN(0);
+}
+
+/*
+ * Prepare a gss error-notify reply carrying \a major / \a minor.
+ *
+ * Marks the request context as an error notification, packs a reply with a
+ * single ptlrpc_body-sized buffer, and fills a gss_err_header into segment 0
+ * of the reply buffer with proc PTLRPC_GSS_PROC_ERR and an empty handle.
+ *
+ * Returns 0 on success or the error from lustre_pack_reply_v2().
+ */
+int gss_pack_err_notify(struct ptlrpc_request *req, __u32 major, __u32 minor)
+{
+	struct gss_svc_reqctx     *grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx);
+	struct ptlrpc_reply_state *rs;
+	struct gss_err_header     *ghdr;
+	int			replen = sizeof(struct ptlrpc_body);
+	int			rc;
+	ENTRY;
+
+	//if (OBD_FAIL_CHECK_ORSET(OBD_FAIL_SVCGSS_ERR_NOTIFY, OBD_FAIL_ONCE))
+	//      RETURN(-EINVAL);
+
+	grctx->src_err_notify = 1;
+	grctx->src_reserve_len = 0;
+
+	rc = lustre_pack_reply_v2(req, 1, &replen, NULL, 0);
+	if (rc) {
+		CERROR("could not pack reply, err %d\n", rc);
+		RETURN(rc);
+	}
+
+	/* gss hdr */
+	rs = req->rq_reply_state;
+	/* NOTE(review): the assertion checks the length of segment 1 while
+	 * the header is written into segment 0 just below -- confirm whether
+	 * lm_buflens[0] was intended */
+	LASSERT(rs->rs_repbuf->lm_buflens[1] >= sizeof(*ghdr));
+	ghdr = lustre_msg_buf(rs->rs_repbuf, 0, 0);
+	ghdr->gh_version = PTLRPC_GSS_VERSION;
+	ghdr->gh_flags = 0;
+	ghdr->gh_proc = PTLRPC_GSS_PROC_ERR;
+	ghdr->gh_major = major;
+	ghdr->gh_minor = minor;
+	ghdr->gh_handle.len = 0; /* fake context handle */
+
+	rs->rs_repdata_len = lustre_msg_size_v2(rs->rs_repbuf->lm_bufcount,
+						rs->rs_repbuf->lm_buflens);
+
+	CDEBUG(D_SEC, "prepare gss error notify(0x%x/0x%x) to %s\n",
+	       major, minor, libcfs_nid2str(req->rq_peer.nid));
+	RETURN(0);
+}
+
+/*
+ * Handle a gss context init request on the server side.
+ *
+ * Validates the wire context and buffer layout, extracts the lustre service
+ * type, target uuid, reverse handle and gss token from the last request
+ * segment, and passes them to gss_svc_upcall_handle_init().  On success the
+ * optional user descriptor is unpacked and req->rq_reqmsg / req->rq_reqlen
+ * are pointed at segment 1.
+ *
+ * All data parsed here comes from the peer and is treated as untrusted.
+ *
+ * Returns SECSVC_DROP for any malformed request, otherwise the result of
+ * gss_svc_upcall_handle_init().
+ */
+static
+int gss_svc_handle_init(struct ptlrpc_request *req,
+			struct gss_wire_ctx *gw)
+{
+	struct gss_svc_reqctx     *grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx);
+	struct lustre_msg	 *reqbuf = req->rq_reqbuf;
+	struct obd_uuid	   *uuid;
+	struct obd_device	 *target;
+	rawobj_t		   uuid_obj, rvs_hdl, in_token;
+	__u32		      lustre_svc;
+	__u32		     *secdata, seclen;
+	int			swabbed, rc;
+	ENTRY;
+
+	CDEBUG(D_SEC, "processing gss init(%d) request from %s\n", gw->gw_proc,
+	       libcfs_nid2str(req->rq_peer.nid));
+
+	req->rq_ctx_init = 1;
+
+	if (gw->gw_flags & LUSTRE_GSS_PACK_BULK) {
+		CERROR("unexpected bulk flag\n");
+		RETURN(SECSVC_DROP);
+	}
+
+	if (gw->gw_proc == PTLRPC_GSS_PROC_INIT && gw->gw_handle.len != 0) {
+		CERROR("proc %u: invalid handle length %u\n",
+		       gw->gw_proc, gw->gw_handle.len);
+		RETURN(SECSVC_DROP);
+	}
+
+	if (reqbuf->lm_bufcount < 3 || reqbuf->lm_bufcount > 4){
+		CERROR("Invalid bufcount %d\n", reqbuf->lm_bufcount);
+		RETURN(SECSVC_DROP);
+	}
+
+	swabbed = ptlrpc_req_need_swab(req);
+
+	/* ctx initiate payload is in last segment */
+	secdata = lustre_msg_buf(reqbuf, reqbuf->lm_bufcount - 1, 0);
+	seclen = reqbuf->lm_buflens[reqbuf->lm_bufcount - 1];
+
+	/* need at least the svc type plus one object length word */
+	if (seclen < 4 + 4) {
+		CERROR("sec size %d too small\n", seclen);
+		RETURN(SECSVC_DROP);
+	}
+
+	/* lustre svc type */
+	lustre_svc = le32_to_cpu(*secdata++);
+	seclen -= 4;
+
+	/* extract target uuid, note this code is somewhat fragile
+	 * because touched internal structure of obd_uuid */
+	if (rawobj_extract(&uuid_obj, &secdata, &seclen)) {
+		CERROR("failed to extract target uuid\n");
+		RETURN(SECSVC_DROP);
+	}
+
+	/* a zero-length uuid from the peer would make the forced
+	 * NUL-termination below index data[len - 1] with an underflowed
+	 * unsigned length, i.e. write far outside the request buffer */
+	if (uuid_obj.len == 0) {
+		CERROR("empty target uuid\n");
+		RETURN(SECSVC_DROP);
+	}
+	uuid_obj.data[uuid_obj.len - 1] = '\0';
+
+	uuid = (struct obd_uuid *) uuid_obj.data;
+	target = class_uuid2obd(uuid);
+	if (!target || target->obd_stopping || !target->obd_set_up) {
+		CERROR("target '%s' is not available for context init (%s)\n",
+		       uuid->uuid, target == NULL ? "no target" :
+		       (target->obd_stopping ? "stopping" : "not set up"));
+		RETURN(SECSVC_DROP);
+	}
+
+	/* extract reverse handle */
+	if (rawobj_extract(&rvs_hdl, &secdata, &seclen)) {
+		CERROR("failed extract reverse handle\n");
+		RETURN(SECSVC_DROP);
+	}
+
+	/* extract token */
+	if (rawobj_extract(&in_token, &secdata, &seclen)) {
+		CERROR("can't extract token\n");
+		RETURN(SECSVC_DROP);
+	}
+
+	rc = gss_svc_upcall_handle_init(req, grctx, gw, target, lustre_svc,
+					&rvs_hdl, &in_token);
+	if (rc != SECSVC_OK)
+		RETURN(rc);
+
+	if (grctx->src_ctx->gsc_usr_mds || grctx->src_ctx->gsc_usr_oss ||
+	    grctx->src_ctx->gsc_usr_root)
+		CWARN("create svc ctx %p: user from %s authenticated as %s\n",
+		      grctx->src_ctx, libcfs_nid2str(req->rq_peer.nid),
+		      grctx->src_ctx->gsc_usr_mds ? "mds" :
+			(grctx->src_ctx->gsc_usr_oss ? "oss" : "root"));
+	else
+		CWARN("create svc ctx %p: accept user %u from %s\n",
+		      grctx->src_ctx, grctx->src_ctx->gsc_uid,
+		      libcfs_nid2str(req->rq_peer.nid));
+
+	/* the user descriptor, when present, is segment 2 of a
+	 * four-segment request */
+	if (gw->gw_flags & LUSTRE_GSS_PACK_USER) {
+		if (reqbuf->lm_bufcount < 4) {
+			CERROR("missing user descriptor\n");
+			RETURN(SECSVC_DROP);
+		}
+		if (sptlrpc_unpack_user_desc(reqbuf, 2, swabbed)) {
+			CERROR("Mal-formed user descriptor\n");
+			RETURN(SECSVC_DROP);
+		}
+
+		req->rq_pack_udesc = 1;
+		req->rq_user_desc = lustre_msg_buf(reqbuf, 2, 0);
+	}
+
+	req->rq_reqmsg = lustre_msg_buf(reqbuf, 1, 0);
+	req->rq_reqlen = lustre_msg_buflen(reqbuf, 1);
+
+	RETURN(rc);
+}
+
+/*
+ * Verify a non-privacy request and locate the buffers embedded in it.
+ *
+ * last segment must be the gss signature.
+ *
+ * On entry *major is reset to GSS_S_COMPLETE. For SPTLRPC_SVC_NULL the
+ * sequence and signature checks are skipped entirely; otherwise the
+ * sequence number is checked before and (for non-reverse contexts) after
+ * gss_verify_msg().
+ *
+ * Returns 0 on success, with req->rq_reqmsg/rq_reqlen describing buffer 1,
+ * -EINVAL for a message with too few buffers or a malformed descriptor,
+ * and -EACCES when the sequence number is rejected or verification fails
+ * (*major then carries the gss status).
+ */
+static
+int gss_svc_verify_request(struct ptlrpc_request *req,
+			   struct gss_svc_reqctx *grctx,
+			   struct gss_wire_ctx *gw,
+			   __u32 *major)
+{
+	struct gss_svc_ctx *gctx = grctx->src_ctx;
+	struct lustre_msg  *msg = req->rq_reqbuf;
+	/* index of the first optional buffer: 0 = gss header, 1 = request */
+	int		 offset = 2;
+	int		 swabbed;
+	ENTRY;
+
+	*major = GSS_S_COMPLETE;
+
+	if (msg->lm_bufcount < 2) {
+		CERROR("Too few segments (%u) in request\n", msg->lm_bufcount);
+		RETURN(-EINVAL);
+	}
+
+	/* null service: nothing to verify, go straight to buffer parsing */
+	if (gw->gw_svc == SPTLRPC_SVC_NULL)
+		goto verified;
+
+	/* first sequence check, done before the signature is verified */
+	if (gss_check_seq_num(&gctx->gsc_seqdata, gw->gw_seq, 0)) {
+		CERROR("phase 0: discard replayed req: seq %u\n", gw->gw_seq);
+		*major = GSS_S_DUPLICATE_TOKEN;
+		RETURN(-EACCES);
+	}
+
+	*major = gss_verify_msg(msg, gctx->gsc_mechctx, gw->gw_svc);
+	if (*major != GSS_S_COMPLETE) {
+		CERROR("failed to verify request: %x\n", *major);
+		RETURN(-EACCES);
+	}
+
+	/* second sequence check; skipped when gsc_reverse is set */
+	if (gctx->gsc_reverse == 0 &&
+	    gss_check_seq_num(&gctx->gsc_seqdata, gw->gw_seq, 1)) {
+		CERROR("phase 1+: discard replayed req: seq %u\n", gw->gw_seq);
+		*major = GSS_S_DUPLICATE_TOKEN;
+		RETURN(-EACCES);
+	}
+
+verified:
+	swabbed = ptlrpc_req_need_swab(req);
+
+	/* user descriptor */
+	if (gw->gw_flags & LUSTRE_GSS_PACK_USER) {
+		if (msg->lm_bufcount < (offset + 1)) {
+			CERROR("no user desc included\n");
+			RETURN(-EINVAL);
+		}
+
+		if (sptlrpc_unpack_user_desc(msg, offset, swabbed)) {
+			CERROR("Mal-formed user descriptor\n");
+			RETURN(-EINVAL);
+		}
+
+		req->rq_pack_udesc = 1;
+		req->rq_user_desc = lustre_msg_buf(msg, offset, 0);
+		/* the bulk descriptor, if any, follows the user descriptor */
+		offset++;
+	}
+
+	/* check bulk_sec_desc data */
+	if (gw->gw_flags & LUSTRE_GSS_PACK_BULK) {
+		if (msg->lm_bufcount < (offset + 1)) {
+			CERROR("missing bulk sec descriptor\n");
+			RETURN(-EINVAL);
+		}
+
+		if (bulk_sec_desc_unpack(msg, offset, swabbed))
+			RETURN(-EINVAL);
+
+		req->rq_pack_bulk = 1;
+		grctx->src_reqbsd = lustre_msg_buf(msg, offset, 0);
+		grctx->src_reqbsd_size = lustre_msg_buflen(msg, offset);
+	}
+
+	/* the embedded request message is always buffer 1 */
+	req->rq_reqmsg = lustre_msg_buf(msg, 1, 0);
+	req->rq_reqlen = msg->lm_buflens[1];
+	RETURN(0);
+}
+
+/*
+ * Decrypt a privacy-mode request in place and locate its buffers.
+ *
+ * The sequence number is checked both before and after gss_unseal_msg().
+ * After a successful unseal the clear-text message is unpacked again and
+ * req->rq_reqdata_len is updated to the clear-text length.
+ *
+ * Returns 0 on success, with req->rq_reqmsg/rq_reqlen describing buffer 0
+ * of the decrypted message, -EACCES when the sequence number is rejected
+ * or unsealing fails (*major carries the gss status), and -EINVAL when
+ * the decrypted message is malformed.
+ */
+static
+int gss_svc_unseal_request(struct ptlrpc_request *req,
+			   struct gss_svc_reqctx *grctx,
+			   struct gss_wire_ctx *gw,
+			   __u32 *major)
+{
+	struct gss_svc_ctx *gctx = grctx->src_ctx;
+	struct lustre_msg  *msg = req->rq_reqbuf;
+	/* offset: index of the first optional buffer after the request */
+	int		 swabbed, msglen, offset = 1;
+	ENTRY;
+
+	if (gss_check_seq_num(&gctx->gsc_seqdata, gw->gw_seq, 0)) {
+		CERROR("phase 0: discard replayed req: seq %u\n", gw->gw_seq);
+		*major = GSS_S_DUPLICATE_TOKEN;
+		RETURN(-EACCES);
+	}
+
+	*major = gss_unseal_msg(gctx->gsc_mechctx, msg,
+			       &msglen, req->rq_reqdata_len);
+	if (*major != GSS_S_COMPLETE) {
+		CERROR("failed to unwrap request: %x\n", *major);
+		RETURN(-EACCES);
+	}
+
+	if (gss_check_seq_num(&gctx->gsc_seqdata, gw->gw_seq, 1)) {
+		CERROR("phase 1+: discard replayed req: seq %u\n", gw->gw_seq);
+		*major = GSS_S_DUPLICATE_TOKEN;
+		RETURN(-EACCES);
+	}
+
+	/* msg now holds clear text of msglen bytes; validate its layout */
+	swabbed = __lustre_unpack_msg(msg, msglen);
+	if (swabbed < 0) {
+		CERROR("Failed to unpack after decryption\n");
+		RETURN(-EINVAL);
+	}
+	req->rq_reqdata_len = msglen;
+
+	if (msg->lm_bufcount < 1) {
+		CERROR("Invalid buffer: is empty\n");
+		RETURN(-EINVAL);
+	}
+
+	if (gw->gw_flags & LUSTRE_GSS_PACK_USER) {
+		if (msg->lm_bufcount < offset + 1) {
+			CERROR("no user descriptor included\n");
+			RETURN(-EINVAL);
+		}
+
+		if (sptlrpc_unpack_user_desc(msg, offset, swabbed)) {
+			CERROR("Mal-formed user descriptor\n");
+			RETURN(-EINVAL);
+		}
+
+		req->rq_pack_udesc = 1;
+		req->rq_user_desc = lustre_msg_buf(msg, offset, 0);
+		/* the bulk descriptor, if any, follows the user descriptor */
+		offset++;
+	}
+
+	if (gw->gw_flags & LUSTRE_GSS_PACK_BULK) {
+		if (msg->lm_bufcount < offset + 1) {
+			CERROR("no bulk checksum included\n");
+			RETURN(-EINVAL);
+		}
+
+		if (bulk_sec_desc_unpack(msg, offset, swabbed))
+			RETURN(-EINVAL);
+
+		req->rq_pack_bulk = 1;
+		grctx->src_reqbsd = lustre_msg_buf(msg, offset, 0);
+		grctx->src_reqbsd_size = lustre_msg_buflen(msg, offset);
+	}
+
+	/* in the decrypted message the request itself is buffer 0 */
+	req->rq_reqmsg = lustre_msg_buf(req->rq_reqbuf, 0, 0);
+	req->rq_reqlen = req->rq_reqbuf->lm_buflens[0];
+	RETURN(0);
+}
+
+/*
+ * Handle a PTLRPC_GSS_PROC_DATA request: look up the server context named
+ * by the wire handle, then verify or unseal the request according to the
+ * service carried in the gss header.
+ *
+ * Returns SECSVC_OK when the request passed, SECSVC_COMPLETE when an error
+ * notification was packed for the client, and SECSVC_DROP otherwise.
+ */
+static
+int gss_svc_handle_data(struct ptlrpc_request *req,
+			struct gss_wire_ctx *gw)
+{
+	struct gss_svc_reqctx *grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx);
+	__u32		  major = 0;
+	int		    rc = 0;
+	ENTRY;
+
+	grctx->src_ctx = gss_svc_upcall_get_ctx(req, gw);
+	if (grctx->src_ctx == NULL) {
+		major = GSS_S_NO_CONTEXT;
+		goto error;
+	}
+
+	if (gw->gw_svc == SPTLRPC_SVC_NULL ||
+	    gw->gw_svc == SPTLRPC_SVC_AUTH ||
+	    gw->gw_svc == SPTLRPC_SVC_INTG) {
+		rc = gss_svc_verify_request(req, grctx, gw, &major);
+	} else if (gw->gw_svc == SPTLRPC_SVC_PRIV) {
+		rc = gss_svc_unseal_request(req, grctx, gw, &major);
+	} else {
+		CERROR("unsupported gss service %d\n", gw->gw_svc);
+		rc = -EINVAL;
+	}
+
+	if (rc == 0)
+		RETURN(SECSVC_OK);
+
+	CERROR("svc %u failed: major 0x%08x: req xid "LPU64" ctx %p idx "
+	       LPX64"(%u->%s)\n", gw->gw_svc, major, req->rq_xid,
+	       grctx->src_ctx, gss_handle_to_u64(&gw->gw_handle),
+	       grctx->src_ctx->gsc_uid, libcfs_nid2str(req->rq_peer.nid));
+error:
+	/* Only NO_CONTEXT and BAD_SIG are reported back to the client; they
+	 * might happen after a server reboot, and telling the client lets
+	 * it recover. Everything else is silently dropped. */
+	if (major != GSS_S_NO_CONTEXT && major != GSS_S_BAD_SIG)
+		RETURN(SECSVC_DROP);
+
+	if (gss_pack_err_notify(req, major, 0) != 0)
+		RETURN(SECSVC_DROP);
+
+	RETURN(SECSVC_COMPLETE);
+}
+
+/*
+ * Handle a PTLRPC_GSS_PROC_DESTROY request: the request is flagged as a
+ * context-fini with no reply, must use the integrity service and must
+ * verify before the server context is destroyed. An optional user
+ * descriptor is unpacked afterwards on a best-effort basis.
+ *
+ * Returns SECSVC_OK or SECSVC_DROP.
+ */
+static
+int gss_svc_handle_destroy(struct ptlrpc_request *req,
+			   struct gss_wire_ctx *gw)
+{
+	struct gss_svc_reqctx  *grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx);
+	__u32		   major;
+	ENTRY;
+
+	req->rq_ctx_fini = 1;
+	req->rq_no_reply = 1;
+
+	grctx->src_ctx = gss_svc_upcall_get_ctx(req, gw);
+	if (grctx->src_ctx == NULL) {
+		CDEBUG(D_SEC, "invalid gss context handle for destroy.\n");
+		RETURN(SECSVC_DROP);
+	}
+
+	if (gw->gw_svc != SPTLRPC_SVC_INTG) {
+		CERROR("svc %u is not supported in destroy.\n", gw->gw_svc);
+		RETURN(SECSVC_DROP);
+	}
+
+	if (gss_svc_verify_request(req, grctx, gw, &major) != 0)
+		RETURN(SECSVC_DROP);
+
+	CWARN("destroy svc ctx %p idx "LPX64" (%u->%s)\n",
+	      grctx->src_ctx, gss_handle_to_u64(&gw->gw_handle),
+	      grctx->src_ctx->gsc_uid, libcfs_nid2str(req->rq_peer.nid));
+
+	gss_svc_upcall_destroy_ctx(grctx->src_ctx);
+
+	/* no user descriptor packed: nothing more to do */
+	if ((gw->gw_flags & LUSTRE_GSS_PACK_USER) == 0)
+		RETURN(SECSVC_OK);
+
+	/* problems with the descriptor are logged but do not fail destroy */
+	if (req->rq_reqbuf->lm_bufcount < 4) {
+		CERROR("missing user descriptor, ignore it\n");
+		RETURN(SECSVC_OK);
+	}
+	if (sptlrpc_unpack_user_desc(req->rq_reqbuf, 2,
+				     ptlrpc_req_need_swab(req))) {
+		CERROR("Mal-formed user descriptor, ignore it\n");
+		RETURN(SECSVC_OK);
+	}
+
+	req->rq_pack_udesc = 1;
+	req->rq_user_desc = lustre_msg_buf(req->rq_reqbuf, 2, 0);
+
+	RETURN(SECSVC_OK);
+}
+
+/*
+ * Server-side entry point for an incoming gss-protected request.
+ *
+ * Decodes and sanity-checks the gss header in buffer 0, allocates the
+ * per-request context (holding a policy reference, refcount 1), saves the
+ * header fields into the wire context and dispatches on the gss procedure.
+ *
+ * Returns the SECSVC_* code produced by the handler. On SECSVC_OK the
+ * authentication attributes of the request are filled in from the server
+ * context; on SECSVC_DROP the per-request context is freed and
+ * req->rq_svc_ctx is cleared again.
+ */
+int gss_svc_accept(struct ptlrpc_sec_policy *policy, struct ptlrpc_request *req)
+{
+	struct gss_header      *ghdr;
+	struct gss_svc_reqctx  *grctx;
+	struct gss_wire_ctx    *gw;
+	int		     swabbed, rc;
+	ENTRY;
+
+	LASSERT(req->rq_reqbuf);
+	LASSERT(req->rq_svc_ctx == NULL);
+
+	if (req->rq_reqbuf->lm_bufcount < 2) {
+		CERROR("buf count only %d\n", req->rq_reqbuf->lm_bufcount);
+		RETURN(SECSVC_DROP);
+	}
+
+	swabbed = ptlrpc_req_need_swab(req);
+
+	ghdr = gss_swab_header(req->rq_reqbuf, 0, swabbed);
+	if (ghdr == NULL) {
+		CERROR("can't decode gss header\n");
+		RETURN(SECSVC_DROP);
+	}
+
+	/* sanity checks */
+	if (ghdr->gh_version != PTLRPC_GSS_VERSION) {
+		CERROR("gss version %u, expect %u\n", ghdr->gh_version,
+		       PTLRPC_GSS_VERSION);
+		RETURN(SECSVC_DROP);
+	}
+
+	req->rq_sp_from = ghdr->gh_sp;
+
+	/* alloc grctx data */
+	OBD_ALLOC_PTR(grctx);
+	if (!grctx)
+		RETURN(SECSVC_DROP);
+
+	grctx->src_base.sc_policy = sptlrpc_policy_get(policy);
+	atomic_set(&grctx->src_base.sc_refcount, 1);
+	req->rq_svc_ctx = &grctx->src_base;
+	gw = &grctx->src_wirectx;
+
+	/* save wire context while the header is still in host byte order */
+	gw->gw_flags = ghdr->gh_flags;
+	gw->gw_proc = ghdr->gh_proc;
+	gw->gw_seq = ghdr->gh_seq;
+	gw->gw_svc = ghdr->gh_svc;
+	rawobj_from_netobj(&gw->gw_handle, &ghdr->gh_handle);
+
+	/* keep original wire header which subject to checksum verification */
+	if (swabbed)
+		gss_header_swabber(ghdr);
+
+	/* Dispatch on the saved copy: once the header has been swabbed back
+	 * above, ghdr->gh_proc is no longer in host byte order for a
+	 * byte-swapped peer, whereas gw->gw_proc always is. */
+	switch (gw->gw_proc) {
+	case PTLRPC_GSS_PROC_INIT:
+	case PTLRPC_GSS_PROC_CONTINUE_INIT:
+		rc = gss_svc_handle_init(req, gw);
+		break;
+	case PTLRPC_GSS_PROC_DATA:
+		rc = gss_svc_handle_data(req, gw);
+		break;
+	case PTLRPC_GSS_PROC_DESTROY:
+		rc = gss_svc_handle_destroy(req, gw);
+		break;
+	default:
+		CERROR("unknown proc %u\n", gw->gw_proc);
+		rc = SECSVC_DROP;
+		break;
+	}
+
+	switch (rc) {
+	case SECSVC_OK:
+		LASSERT (grctx->src_ctx);
+
+		/* publish who the peer authenticated as */
+		req->rq_auth_gss = 1;
+		req->rq_auth_remote = grctx->src_ctx->gsc_remote;
+		req->rq_auth_usr_mdt = grctx->src_ctx->gsc_usr_mds;
+		req->rq_auth_usr_ost = grctx->src_ctx->gsc_usr_oss;
+		req->rq_auth_usr_root = grctx->src_ctx->gsc_usr_root;
+		req->rq_auth_uid = grctx->src_ctx->gsc_uid;
+		req->rq_auth_mapped_uid = grctx->src_ctx->gsc_mapped_uid;
+		break;
+	case SECSVC_COMPLETE:
+		break;
+	case SECSVC_DROP:
+		/* nothing will use the request context; release it now */
+		gss_svc_reqctx_free(grctx);
+		req->rq_svc_ctx = NULL;
+		break;
+	}
+
+	RETURN(rc);
+}
+
+/*
+ * Destroy the gss server context behind \a svc_ctx. A NULL \a svc_ctx is
+ * accepted and ignored.
+ */
+void gss_svc_invalidate_ctx(struct ptlrpc_svc_ctx *svc_ctx)
+{
+	struct gss_svc_reqctx  *grctx;
+	ENTRY;
+
+	if (svc_ctx != NULL) {
+		grctx = gss_svc_ctx2reqctx(svc_ctx);
+
+		CWARN("gss svc invalidate ctx %p(%u)\n",
+		      grctx->src_ctx, grctx->src_ctx->gsc_uid);
+		gss_svc_upcall_destroy_ctx(grctx->src_ctx);
+	}
+
+	EXIT;
+}
+
+/*
+ * Size of the gss payload to reserve in a reply.
+ *
+ * An early reply shares its context with the original request, so the
+ * special-context flags are ignored for it and it is sized like a normal
+ * reply. Only a final reply on a special context uses the pre-computed
+ * src_reserve_len.
+ */
+static inline
+int gss_svc_payload(struct gss_svc_reqctx *grctx, int early,
+		    int msgsize, int privacy)
+{
+	if (early != 0 || !gss_svc_reqctx_is_special(grctx))
+		return gss_mech_payload(NULL, msgsize, privacy);
+
+	return grctx->src_reserve_len;
+}
+
+/*
+ * Size of the bulk security descriptor buffer for a reply: the fixed
+ * descriptor plus, for bulk reads only, the mechanism payload required by
+ * the bulk service of \a flvr. \a gctx is not used.
+ */
+static int gss_svc_bulk_payload(struct gss_svc_ctx *gctx,
+				struct sptlrpc_flavor *flvr,
+				int read)
+{
+	int     size = sizeof(struct ptlrpc_bulk_sec_desc);
+
+	if (!read)
+		return size;
+
+	switch (SPTLRPC_FLVR_BULK_SVC(flvr->sf_rpc)) {
+	case SPTLRPC_BULK_SVC_NULL:
+		break;
+	case SPTLRPC_BULK_SVC_INTG:
+		size += gss_mech_payload(NULL, 0, 0);
+		break;
+	case SPTLRPC_BULK_SVC_PRIV:
+		size += gss_mech_payload(NULL, 0, 1);
+		break;
+	case SPTLRPC_BULK_SVC_AUTH:
+	default:
+		LBUG();
+	}
+
+	return size;
+}
+
+/*
+ * Allocate (or reuse a pre-allocated) reply state for \a req and lay out
+ * its reply buffer for a reply message of \a msglen bytes.
+ *
+ * Buffer layout:
+ *  - privacy:     clear-text inner message [reply, optional bulk desc],
+ *                 sized so the wrapped form [gss header, cipher text]
+ *                 fits in the same buffer;
+ *  - otherwise:   [gss header, reply, optional bulk desc, optional
+ *                 signature payload].
+ *
+ * Returns 0 on success, -EPROTO when a bulk security descriptor was packed
+ * on a non-bulk rpc, or -ENOMEM.
+ */
+int gss_svc_alloc_rs(struct ptlrpc_request *req, int msglen)
+{
+	struct gss_svc_reqctx       *grctx;
+	struct ptlrpc_reply_state   *rs;
+	int			  early, privacy, svc, bsd_off = 0;
+	__u32			ibuflens[2], buflens[4];
+	int			  ibufcnt = 0, bufcnt;
+	int			  txtsize, wmsg_size, rs_size;
+	ENTRY;
+
+	LASSERT(msglen % 8 == 0);
+
+	if (req->rq_pack_bulk && !req->rq_bulk_read && !req->rq_bulk_write) {
+		CERROR("client request bulk sec on non-bulk rpc\n");
+		RETURN(-EPROTO);
+	}
+
+	svc = SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc);
+	/* a reply packed before the final one is treated as an early reply */
+	early = (req->rq_packed_final == 0);
+
+	grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx);
+	/* a final reply on a special context is never sealed */
+	if (!early && gss_svc_reqctx_is_special(grctx))
+		privacy = 0;
+	else
+		privacy = (svc == SPTLRPC_SVC_PRIV);
+
+	if (privacy) {
+		/* inner clear buffers */
+		ibufcnt = 1;
+		ibuflens[0] = msglen;
+
+		if (req->rq_pack_bulk) {
+			LASSERT(grctx->src_reqbsd);
+
+			bsd_off = ibufcnt;
+			ibuflens[ibufcnt++] = gss_svc_bulk_payload(
+							grctx->src_ctx,
+							&req->rq_flvr,
+							req->rq_bulk_read);
+		}
+
+		/* clear-text size, plus GSS_MAX_CIPHER_BLOCK of extra room */
+		txtsize = lustre_msg_size_v2(ibufcnt, ibuflens);
+		txtsize += GSS_MAX_CIPHER_BLOCK;
+
+		/* wrapper buffer */
+		bufcnt = 2;
+		buflens[0] = PTLRPC_GSS_HEADER_SIZE;
+		buflens[1] = gss_svc_payload(grctx, early, txtsize, 1);
+	} else {
+		bufcnt = 2;
+		buflens[0] = PTLRPC_GSS_HEADER_SIZE;
+		buflens[1] = msglen;
+
+		/* txtsize accumulates the amount of data to be signed */
+		txtsize = buflens[0];
+		if (svc == SPTLRPC_SVC_INTG)
+			txtsize += buflens[1];
+
+		if (req->rq_pack_bulk) {
+			LASSERT(grctx->src_reqbsd);
+
+			bsd_off = bufcnt;
+			buflens[bufcnt] = gss_svc_bulk_payload(
+							grctx->src_ctx,
+							&req->rq_flvr,
+							req->rq_bulk_read);
+			if (svc == SPTLRPC_SVC_INTG)
+				txtsize += buflens[bufcnt];
+			bufcnt++;
+		}
+
+		/* trailing signature buffer, omitted only for a null-service
+		 * reply that is not a final reply on a special context */
+		if ((!early && gss_svc_reqctx_is_special(grctx)) ||
+		    svc != SPTLRPC_SVC_NULL)
+			buflens[bufcnt++] = gss_svc_payload(grctx, early,
+							    txtsize, 0);
+	}
+
+	wmsg_size = lustre_msg_size_v2(bufcnt, buflens);
+
+	/* the reply buffer lives directly behind the reply state */
+	rs_size = sizeof(*rs) + wmsg_size;
+	rs = req->rq_reply_state;
+
+	if (rs) {
+		/* pre-allocated */
+		LASSERT(rs->rs_size >= rs_size);
+	} else {
+		OBD_ALLOC_LARGE(rs, rs_size);
+		if (rs == NULL)
+			RETURN(-ENOMEM);
+
+		rs->rs_size = rs_size;
+	}
+
+	rs->rs_repbuf = (struct lustre_msg *) (rs + 1);
+	rs->rs_repbuf_len = wmsg_size;
+
+	/* initialize the buffer */
+	if (privacy) {
+		/* clear-text layout for now; gss_svc_seal() wraps it later */
+		lustre_init_msg_v2(rs->rs_repbuf, ibufcnt, ibuflens, NULL);
+		rs->rs_msg = lustre_msg_buf(rs->rs_repbuf, 0, msglen);
+	} else {
+		lustre_init_msg_v2(rs->rs_repbuf, bufcnt, buflens, NULL);
+		rs->rs_repbuf->lm_secflvr = req->rq_flvr.sf_rpc;
+
+		rs->rs_msg = lustre_msg_buf(rs->rs_repbuf, 1, 0);
+	}
+
+	/* bsd_off is non-zero exactly when a bulk descriptor was laid out */
+	if (bsd_off) {
+		grctx->src_repbsd = lustre_msg_buf(rs->rs_repbuf, bsd_off, 0);
+		grctx->src_repbsd_size = lustre_msg_buflen(rs->rs_repbuf,
+							   bsd_off);
+	}
+
+	/* the reply state holds its own reference on the request context */
+	gss_svc_reqctx_addref(grctx);
+	rs->rs_svc_ctx = req->rq_svc_ctx;
+
+	LASSERT(rs->rs_msg);
+	req->rq_reply_state = rs;
+	RETURN(0);
+}
+
+/*
+ * Encrypt the clear-text reply held in rs->rs_repbuf and rewrite the
+ * buffer in place as [gss header, cipher token].
+ *
+ * The gss header is staged at the tail of the reply buffer, the message is
+ * wrapped into a temporary token buffer, and then header and token are
+ * copied back over the start of the reply buffer. Afterwards rs->rs_msg,
+ * req->rq_repmsg and req->rq_replen are cleared so that any further
+ * clear-text access by upper layers is caught.
+ *
+ * Returns 0 on success, -ENOMEM if the token buffer cannot be allocated,
+ * or -EPERM if lgss_wrap() fails.
+ */
+static int gss_svc_seal(struct ptlrpc_request *req,
+			struct ptlrpc_reply_state *rs,
+			struct gss_svc_reqctx *grctx)
+{
+	struct gss_svc_ctx      *gctx = grctx->src_ctx;
+	rawobj_t		 hdrobj, msgobj, token;
+	struct gss_header       *ghdr;
+	__u8		    *token_buf;
+	int		      token_buflen;
+	__u32		    buflens[2], major;
+	int		      msglen, rc;
+	ENTRY;
+
+	/* get the clear data length. note the embedded lustre_msg might
+	 * have been shrunk */
+	if (req->rq_replen != lustre_msg_buflen(rs->rs_repbuf, 0))
+		msglen = lustre_shrink_msg(rs->rs_repbuf, 0, req->rq_replen, 1);
+	else
+		msglen = lustre_msg_size_v2(rs->rs_repbuf->lm_bufcount,
+					    rs->rs_repbuf->lm_buflens);
+
+	/* temporarily use tail of buffer to hold gss header data */
+	LASSERT(msglen + PTLRPC_GSS_HEADER_SIZE <= rs->rs_repbuf_len);
+	ghdr = (struct gss_header *) ((char *) rs->rs_repbuf +
+				rs->rs_repbuf_len - PTLRPC_GSS_HEADER_SIZE);
+	ghdr->gh_version = PTLRPC_GSS_VERSION;
+	ghdr->gh_sp = LUSTRE_SP_ANY;
+	ghdr->gh_flags = 0;
+	ghdr->gh_proc = PTLRPC_GSS_PROC_DATA;
+	/* the reply carries the sequence number of the request */
+	ghdr->gh_seq = grctx->src_wirectx.gw_seq;
+	ghdr->gh_svc = SPTLRPC_SVC_PRIV;
+	ghdr->gh_handle.len = 0;
+	if (req->rq_pack_bulk)
+		ghdr->gh_flags |= LUSTRE_GSS_PACK_BULK;
+
+	/* allocate temporary cipher buffer */
+	token_buflen = gss_mech_payload(gctx->gsc_mechctx, msglen, 1);
+	OBD_ALLOC_LARGE(token_buf, token_buflen);
+	if (token_buf == NULL)
+		RETURN(-ENOMEM);
+
+	hdrobj.len = PTLRPC_GSS_HEADER_SIZE;
+	hdrobj.data = (__u8 *) ghdr;
+	msgobj.len = msglen;
+	msgobj.data = (__u8 *) rs->rs_repbuf;
+	token.len = token_buflen;
+	token.data = token_buf;
+
+	major = lgss_wrap(gctx->gsc_mechctx, &hdrobj, &msgobj,
+			  rs->rs_repbuf_len - PTLRPC_GSS_HEADER_SIZE, &token);
+	if (major != GSS_S_COMPLETE) {
+		CERROR("wrap message error: %08x\n", major);
+		GOTO(out_free, rc = -EPERM);
+	}
+	LASSERT(token.len <= token_buflen);
+
+	/* we are about to overwrite the data at rs->rs_repbuf; nullify the
+	 * pointers into it so that further illegal usage is caught. */
+	if (req->rq_pack_bulk) {
+		grctx->src_repbsd = NULL;
+		grctx->src_repbsd_size = 0;
+	}
+
+	/* now fill the actual wire data
+	 * - gss header
+	 * - gss token
+	 */
+	buflens[0] = PTLRPC_GSS_HEADER_SIZE;
+	buflens[1] = token.len;
+
+	rs->rs_repdata_len = lustre_msg_size_v2(2, buflens);
+	LASSERT(rs->rs_repdata_len <= rs->rs_repbuf_len);
+
+	lustre_init_msg_v2(rs->rs_repbuf, 2, buflens, NULL);
+	rs->rs_repbuf->lm_secflvr = req->rq_flvr.sf_rpc;
+
+	/* move the staged header from the buffer tail into buffer 0 */
+	memcpy(lustre_msg_buf(rs->rs_repbuf, 0, 0), ghdr,
+	       PTLRPC_GSS_HEADER_SIZE);
+	memcpy(lustre_msg_buf(rs->rs_repbuf, 1, 0), token.data, token.len);
+
+	/* reply offset */
+	if (req->rq_packed_final &&
+	    (lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT))
+		req->rq_reply_off = gss_at_reply_off_priv;
+	else
+		req->rq_reply_off = 0;
+
+	/* to catch upper layer's further access */
+	rs->rs_msg = NULL;
+	req->rq_repmsg = NULL;
+	req->rq_replen = 0;
+
+	rc = 0;
+out_free:
+	/* the temporary token buffer is released on success and failure */
+	OBD_FREE_LARGE(token_buf, token_buflen);
+	RETURN(rc);
+}
+
+/*
+ * Protect the reply of \a req according to the gss service of the request:
+ * sign it for the null/auth/integrity services, seal it for privacy.
+ *
+ * A final reply on a special context has already been fully prepared
+ * (rs_repdata_len is asserted non-zero); only the reply offset is set.
+ *
+ * Returns 0 on success, -EINVAL for an unsupported procedure or service,
+ * or the error returned by the sign/seal step. The sign/seal result used
+ * to be overwritten with 0, so a reply that failed to be protected was
+ * reported as success.
+ * NOTE(review): gss_svc_seal() returns 0 or a negative errno;
+ * gss_svc_sign() is defined outside this view and is assumed to follow
+ * the same convention -- confirm.
+ */
+int gss_svc_authorize(struct ptlrpc_request *req)
+{
+	struct ptlrpc_reply_state *rs = req->rq_reply_state;
+	struct gss_svc_reqctx     *grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx);
+	struct gss_wire_ctx       *gw = &grctx->src_wirectx;
+	int			early, rc;
+	ENTRY;
+
+	early = (req->rq_packed_final == 0);
+
+	if (!early && gss_svc_reqctx_is_special(grctx)) {
+		LASSERT(rs->rs_repdata_len != 0);
+
+		req->rq_reply_off = gss_at_reply_off_integ;
+		RETURN(0);
+	}
+
+	/* early reply could happen in many cases */
+	if (!early &&
+	    gw->gw_proc != PTLRPC_GSS_PROC_DATA &&
+	    gw->gw_proc != PTLRPC_GSS_PROC_DESTROY) {
+		CERROR("proc %d not support\n", gw->gw_proc);
+		RETURN(-EINVAL);
+	}
+
+	LASSERT(grctx->src_ctx);
+
+	switch (gw->gw_svc) {
+	case SPTLRPC_SVC_NULL:
+	case SPTLRPC_SVC_AUTH:
+	case SPTLRPC_SVC_INTG:
+		rc = gss_svc_sign(req, rs, grctx, gw->gw_svc);
+		break;
+	case SPTLRPC_SVC_PRIV:
+		rc = gss_svc_seal(req, rs, grctx);
+		break;
+	default:
+		CERROR("Unknown service %d\n", gw->gw_svc);
+		rc = -EINVAL;
+		break;
+	}
+
+	/* propagate the sign/seal result instead of masking it */
+	RETURN(rc);
+}
+
+/*
+ * Release a reply state: drop the reference it holds on the gss request
+ * context and free its memory unless it was pre-allocated.
+ */
+void gss_svc_free_rs(struct ptlrpc_reply_state *rs)
+{
+	struct gss_svc_reqctx *reqctx;
+
+	LASSERT(rs->rs_svc_ctx);
+	reqctx = container_of(rs->rs_svc_ctx, struct gss_svc_reqctx, src_base);
+
+	gss_svc_reqctx_decref(reqctx);
+	rs->rs_svc_ctx = NULL;
+
+	/* pre-allocated reply states are not freed here */
+	if (rs->rs_prealloc)
+		return;
+
+	OBD_FREE_LARGE(rs, rs->rs_size);
+}
+
+/*
+ * Free the gss request context embedding \a ctx. The reference count must
+ * already have dropped to zero.
+ */
+void gss_svc_free_ctx(struct ptlrpc_svc_ctx *ctx)
+{
+	struct gss_svc_reqctx *reqctx;
+
+	LASSERT(atomic_read(&ctx->sc_refcount) == 0);
+
+	reqctx = gss_svc_ctx2reqctx(ctx);
+	gss_svc_reqctx_free(reqctx);
+}
+
+/*
+ * Initialise the reverse client context \a cli_ctx from the server
+ * context \a svc_ctx: duplicate the server handle, copy the mechanism
+ * context and duplicate the reverse handle, then mark the client context
+ * up to date.
+ *
+ * Returns 0 on success. Every failure is reported as -ENOMEM, after the
+ * resources acquired so far have been released.
+ */
+int gss_copy_rvc_cli_ctx(struct ptlrpc_cli_ctx *cli_ctx,
+			 struct ptlrpc_svc_ctx *svc_ctx)
+{
+	struct gss_cli_ctx     *cli_gctx = ctx2gctx(cli_ctx);
+	struct gss_svc_ctx     *svc_gctx = gss_svc_ctx2gssctx(svc_ctx);
+	struct gss_ctx	 *mechctx = NULL;
+
+	LASSERT(cli_gctx);
+	LASSERT(svc_gctx && svc_gctx->gsc_mechctx);
+
+	cli_gctx->gc_proc = PTLRPC_GSS_PROC_DATA;
+	cli_gctx->gc_win = GSS_SEQ_WIN;
+
+	/* The problem is that the reverse ctx might get lost in some
+	 * recovery situations, and the same svc_ctx will then be used to
+	 * re-create it. If a callback was sent out before that, a new
+	 * reverse ctx starting at sequence 0 would lead to future callback
+	 * rpcs being treated as replays.
+	 *
+	 * Each reverse root ctx records its latest sequence number on its
+	 * buddy svc ctx before being destroyed, so here we continue from it.
+	 */
+	atomic_set(&cli_gctx->gc_seq, svc_gctx->gsc_rvs_seq);
+
+	if (gss_svc_upcall_dup_handle(&cli_gctx->gc_svc_handle, svc_gctx)) {
+		CERROR("failed to dup svc handle\n");
+		goto err_out;
+	}
+
+	if (lgss_copy_reverse_context(svc_gctx->gsc_mechctx, &mechctx) !=
+	    GSS_S_COMPLETE) {
+		CERROR("failed to copy mech context\n");
+		goto err_svc_handle;
+	}
+
+	if (rawobj_dup(&cli_gctx->gc_handle, &svc_gctx->gsc_rvs_hdl)) {
+		CERROR("failed to dup reverse handle\n");
+		goto err_ctx;
+	}
+
+	/* everything acquired: hand the mech context to the client ctx */
+	cli_gctx->gc_mechctx = mechctx;
+	gss_cli_ctx_uptodate(cli_gctx);
+
+	return 0;
+
+	/* unwind in reverse order of acquisition */
+err_ctx:
+	lgss_delete_sec_context(&mechctx);
+err_svc_handle:
+	rawobj_free(&cli_gctx->gc_svc_handle);
+err_out:
+	return -ENOMEM;
+}
+
+/*
+ * Pre-compute gss_at_reply_off_integ and gss_at_reply_off_priv: the sizes
+ * of three-buffer messages built around an early reply of
+ * lustre_msg_early_size() bytes, for the signed and sealed cases.
+ */
+static void gss_init_at_reply_offset(void)
+{
+	__u32 integ_lens[3];
+	__u32 clear_lens[1];
+	__u32 priv_lens[3];
+	int clearsize;
+
+	/* signed: gss header, early reply, signature payload */
+	integ_lens[0] = PTLRPC_GSS_HEADER_SIZE;
+	integ_lens[1] = lustre_msg_early_size();
+	integ_lens[2] = gss_cli_payload(NULL, integ_lens[1], 0);
+	gss_at_reply_off_integ = lustre_msg_size_v2(3, integ_lens);
+
+	/* sealed: first size the clear-text message holding the early reply */
+	clear_lens[0] = lustre_msg_early_size();
+	clearsize = lustre_msg_size_v2(1, clear_lens);
+
+	priv_lens[0] = PTLRPC_GSS_HEADER_SIZE;
+	priv_lens[1] = gss_cli_payload(NULL, clearsize, 0);
+	priv_lens[2] = gss_cli_payload(NULL, clearsize, 1);
+	gss_at_reply_off_priv = lustre_msg_size_v2(3, priv_lens);
+}
+
+/*
+ * Module initialisation: bring up the gss sub-components one by one and
+ * undo the ones already initialised, in reverse order, if a later step
+ * fails. Returns 0 on success or the error code of the failing step.
+ */
+int __init sptlrpc_gss_init(void)
+{
+	int rc;
+
+	rc = gss_init_lproc();
+	if (rc)
+		return rc;
+
+	rc = gss_init_cli_upcall();
+	if (rc)
+		goto out_lproc;
+
+	rc = gss_init_svc_upcall();
+	if (rc)
+		goto out_cli_upcall;
+
+	rc = init_kerberos_module();
+	if (rc)
+		goto out_svc_upcall;
+
+	/* register the policy after all other stuff has been initialized,
+	 * because it might be used immediately after the registration.
+	 * NOTE(review): the registration itself is not visible here;
+	 * presumably it happens inside the keyring/pipefs init -- confirm. */
+
+	rc = gss_init_keyring();
+	if (rc)
+		goto out_kerberos;
+
+#ifdef HAVE_GSS_PIPEFS
+	rc = gss_init_pipefs();
+	if (rc)
+		goto out_keyring;
+#endif
+
+	gss_init_at_reply_offset();
+
+	return 0;
+
+	/* error unwinding: each label undoes the step before the failure */
+#ifdef HAVE_GSS_PIPEFS
+out_keyring:
+	gss_exit_keyring();
+#endif
+
+out_kerberos:
+	cleanup_kerberos_module();
+out_svc_upcall:
+	gss_exit_svc_upcall();
+out_cli_upcall:
+	gss_exit_cli_upcall();
+out_lproc:
+	gss_exit_lproc();
+	return rc;
+}
+
+/*
+ * Module teardown: shut down every component brought up by
+ * sptlrpc_gss_init().
+ * NOTE(review): the keyring is torn down before pipefs here, although init
+ * brings the keyring up first; confirm the two are independent.
+ */
+static void __exit sptlrpc_gss_exit(void)
+{
+	gss_exit_keyring();
+#ifdef HAVE_GSS_PIPEFS
+	gss_exit_pipefs();
+#endif
+	cleanup_kerberos_module();
+	gss_exit_svc_upcall();
+	gss_exit_cli_upcall();
+	gss_exit_lproc();
+}
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("GSS security policy for Lustre");
+MODULE_LICENSE("GPL");
+
+module_init(sptlrpc_gss_init);
+module_exit(sptlrpc_gss_exit);

diff --git a/drivers/staging/lustre/lustre/ptlrpc/import.c b/drivers/staging/lustre/lustre/ptlrpc/import.c
new file mode 100644
index 0000000..47a3c05
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/import.c

@@ -0,0 +1,1613 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/import.c
+ *
+ * Author: Mike Shaver <shaver@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+
+#include <obd_support.h>
+#include <lustre_ha.h>
+#include <lustre_net.h>
+#include <lustre_import.h>
+#include <lustre_export.h>
+#include <obd.h>
+#include <obd_cksum.h>
+#include <obd_class.h>
+
+#include "ptlrpc_internal.h"
+
+/*
+ * Per-request argument block for an asynchronous connect.
+ * NOTE(review): presumably consumed by ptlrpc_connect_interpret(), which
+ * is declared further down; the field descriptions below are inferred
+ * from the names only -- confirm against the users.
+ */
+struct ptlrpc_connect_async_args {
+	 /* presumably the peer's committed transno when connect was sent */
+	 __u64 pcaa_peer_committed;
+	/* presumably non-zero for the very first connect of the import */
+	int pcaa_initial_connect;
+};
+
+/**
+ * Set the state of import \a imp to \a state and log the transition,
+ * together with the current time, in the import's circular state history.
+ * Helper function; the caller must hold imp_lock.
+ */
+static void __import_set_state(struct obd_import *imp,
+			       enum lustre_imp_state state)
+{
+	typeof(&imp->imp_state_hist[0]) slot =
+		&imp->imp_state_hist[imp->imp_state_hist_idx];
+
+	imp->imp_state = state;
+
+	slot->ish_state = state;
+	slot->ish_time = cfs_time_current_sec();
+
+	/* advance to the next history slot, wrapping at the end */
+	imp->imp_state_hist_idx = (imp->imp_state_hist_idx + 1) %
+		IMP_STATE_HIST_LEN;
+}
+
+/*
+ * Change the state of import \a imp to \a state, logging the transition.
+ * A CLOSED import should remain so: the change is skipped for it.
+ * The caller must hold imp->imp_lock.
+ *
+ * The macro parameters are parenthesized so that any pointer expression
+ * can be passed as \a imp; note that \a imp is still evaluated more than
+ * once, so it must not have side effects.
+ */
+#define IMPORT_SET_STATE_NOLOCK(imp, state)				    \
+do {									   \
+	if ((imp)->imp_state != LUSTRE_IMP_CLOSED) {			     \
+	       CDEBUG(D_HA, "%p %s: changing import state from %s to %s\n",    \
+		      (imp), obd2cli_tgt((imp)->imp_obd),		  \
+		      ptlrpc_import_state_name((imp)->imp_state),	\
+		      ptlrpc_import_state_name(state));			\
+	       __import_set_state((imp), (state));			 \
+	}								      \
+} while (0)
+
+/* Same as IMPORT_SET_STATE_NOLOCK(), but takes imp->imp_lock itself. */
+#define IMPORT_SET_STATE(imp, state)					\
+do {									\
+	spin_lock(&(imp)->imp_lock);					\
+	IMPORT_SET_STATE_NOLOCK(imp, state);				\
+	spin_unlock(&(imp)->imp_lock);					\
+} while (0)
+
+
+static int ptlrpc_connect_interpret(const struct lu_env *env,
+				    struct ptlrpc_request *request,
+				    void * data, int rc);
+int ptlrpc_import_recovery_state_machine(struct obd_import *imp);
+
+/* This is the only function allowed to move an import out of the CLOSED
+ * state: it starts a new generation and resets the state to NEW.
+ *
+ * I would rather refcount the import and free it after disconnection, like
+ * we do with exports. To do that, the client_obd will need to save the
+ * peer info somewhere other than in the import, though. */
+int ptlrpc_init_import(struct obd_import *imp)
+{
+	spin_lock(&imp->imp_lock);
+	imp->imp_generation += 1;
+	imp->imp_state = LUSTRE_IMP_NEW;
+	spin_unlock(&imp->imp_lock);
+
+	return 0;
+}
+EXPORT_SYMBOL(ptlrpc_init_import);
+
+#define UUID_STR "_UUID"
+/*
+ * Locate the interesting part of \a uuid: skip \a prefix when it is
+ * non-NULL and \a uuid starts with it, and leave a trailing "_UUID"
+ * out of the reported length.
+ *
+ * \a uuid itself is not modified; *uuid_start points into it and
+ * *uuid_len is the number of characters to use from there.
+ */
+void deuuidify(char *uuid, const char *prefix, char **uuid_start, int *uuid_len)
+{
+	const size_t suffix_len = strlen(UUID_STR);
+	char *start = uuid;
+	int len;
+
+	if (prefix != NULL && strncmp(uuid, prefix, strlen(prefix)) == 0)
+		start = uuid + strlen(prefix);
+	*uuid_start = start;
+
+	len = strlen(start);
+	/* drop the suffix only when the remainder is long enough to end
+	 * with it and actually does */
+	if (!(len < suffix_len) &&
+	    strcmp(start + len - suffix_len, UUID_STR) == 0)
+		len -= suffix_len;
+
+	*uuid_len = len;
+}
+EXPORT_SYMBOL(deuuidify);
+
+/**
+ * Move a FULL import to the DISCON state and notify the obd device.
+ *
+ * Returns true if import was FULL, false if import was already not
+ * connected.
+ * @imp - import to be disconnected
+ * @conn_cnt - connection count (epoch) of the request that timed out
+ *	     and caused the disconnection.  In some cases, multiple
+ *	     inflight requests can fail to a single target (e.g. OST
+ *	     bulk requests) and if one has already caused a reconnection
+ *	     (increasing the import->conn_cnt) the older failure should
+ *	     not also cause a reconnection.  If zero it forces a reconnect.
+ */
+int ptlrpc_set_import_discon(struct obd_import *imp, __u32 conn_cnt)
+{
+	int rc = 0;
+
+	spin_lock(&imp->imp_lock);
+
+	/* act only on a FULL import, and only for the current connection
+	 * epoch unless the caller forces it with conn_cnt == 0 */
+	if (imp->imp_state == LUSTRE_IMP_FULL &&
+	    (conn_cnt == 0 || conn_cnt == imp->imp_conn_cnt)) {
+		char *target_start;
+		int   target_len;
+
+		/* target name without its "_UUID" suffix, for the console */
+		deuuidify(obd2cli_tgt(imp->imp_obd), NULL,
+			  &target_start, &target_len);
+
+		if (imp->imp_replayable) {
+			LCONSOLE_WARN("%s: Connection to %.*s (at %s) was "
+			       "lost; in progress operations using this "
+			       "service will wait for recovery to complete\n",
+			       imp->imp_obd->obd_name, target_len, target_start,
+			       libcfs_nid2str(imp->imp_connection->c_peer.nid));
+		} else {
+			LCONSOLE_ERROR_MSG(0x166, "%s: Connection to "
+			       "%.*s (at %s) was lost; in progress "
+			       "operations using this service will fail\n",
+			       imp->imp_obd->obd_name,
+			       target_len, target_start,
+			       libcfs_nid2str(imp->imp_connection->c_peer.nid));
+		}
+		ptlrpc_deactivate_timeouts(imp);
+		IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_DISCON);
+		spin_unlock(&imp->imp_lock);
+
+		/* the log dump and the event are issued without imp_lock */
+		if (obd_dump_on_timeout)
+			libcfs_debug_dumplog();
+
+		obd_import_event(imp->imp_obd, imp, IMP_EVENT_DISCON);
+		rc = 1;
+	} else {
+		spin_unlock(&imp->imp_lock);
+		/* debug only: the import fields are read after unlocking */
+		CDEBUG(D_HA, "%s: import %p already %s (conn %u, was %u): %s\n",
+		       imp->imp_client->cli_name, imp,
+		       (imp->imp_state == LUSTRE_IMP_FULL &&
+			imp->imp_conn_cnt > conn_cnt) ?
+		       "reconnected" : "not connected", imp->imp_conn_cnt,
+		       conn_cnt, ptlrpc_import_state_name(imp->imp_state));
+	}
+
+	return rc;
+}
+
+/*
+ * Mark the import invalid, start a new generation, then abort all inflight
+ * requests and send IMP_EVENT_INACTIVE to the obd device.
+ *
+ * Must be called with imp_lock held! The lock is released here, before the
+ * inflight requests are aborted and the event is delivered.
+ */
+static void ptlrpc_deactivate_and_unlock_import(struct obd_import *imp)
+{
+	ENTRY;
+	/* assert_spin_locked() rather than LASSERT(spin_is_locked()):
+	 * spin_is_locked() may be constant 0 on uniprocessor builds without
+	 * spinlock debugging, which would trip the assertion spuriously. */
+	assert_spin_locked(&imp->imp_lock);
+
+	CDEBUG(D_HA, "setting import %s INVALID\n", obd2cli_tgt(imp->imp_obd));
+	imp->imp_invalid = 1;
+	imp->imp_generation++;
+	spin_unlock(&imp->imp_lock);
+
+	ptlrpc_abort_inflight(imp);
+	obd_import_event(imp->imp_obd, imp, IMP_EVENT_INACTIVE);
+
+	EXIT;
+}
+
+/*
+ * This acts as a barrier; all existing requests are rejected, and
+ * no new requests will be accepted until the import is valid again.
+ *
+ * Takes imp_lock; the helper called below releases it again.
+ */
+void ptlrpc_deactivate_import(struct obd_import *imp)
+{
+	spin_lock(&imp->imp_lock);
+	/* drops imp_lock before returning */
+	ptlrpc_deactivate_and_unlock_import(imp);
+}
+EXPORT_SYMBOL(ptlrpc_deactivate_import);
+
+/*
+ * Number of seconds from \a now until request \a req reaches its deadline,
+ * or 0 if the request is in a phase that is not being waited on, has
+ * already timed out, or its deadline has passed.
+ *
+ * Requests in the NEW phase are measured against rq_sent, all others
+ * against rq_deadline.
+ */
+static unsigned int
+ptlrpc_inflight_deadline(struct ptlrpc_request *req, time_t now)
+{
+	long dl;
+
+	if (req->rq_phase == RQ_PHASE_RPC) {
+		/* an rpc still waiting to be sent does not count */
+		if (req->rq_waiting)
+			return 0;
+	} else if (req->rq_phase != RQ_PHASE_BULK &&
+		   req->rq_phase != RQ_PHASE_NEW) {
+		return 0;
+	}
+
+	if (req->rq_timedout)
+		return 0;
+
+	dl = (req->rq_phase == RQ_PHASE_NEW) ? req->rq_sent : req->rq_deadline;
+
+	return (dl > now) ? dl - now : 0;
+}
+
+/*
+ * Longest remaining time, in seconds, of any request on the sending list
+ * of \a imp; 0 when no request has time left. The list is walked under
+ * imp_lock.
+ */
+static unsigned int ptlrpc_inflight_timeout(struct obd_import *imp)
+{
+	time_t now = cfs_time_current_sec();
+	struct ptlrpc_request *req;
+	unsigned int longest = 0;
+
+	spin_lock(&imp->imp_lock);
+	list_for_each_entry(req, &imp->imp_sending_list, rq_list) {
+		unsigned int remaining = ptlrpc_inflight_deadline(req, now);
+
+		if (remaining > longest)
+			longest = remaining;
+	}
+	spin_unlock(&imp->imp_lock);
+
+	return longest;
+}
+
+/**
+ * This function will invalidate the import, if necessary, then block
+ * for all the RPC completions, and finally notify the obd to
+ * invalidate its state (ie cancel locks, clear pending requests,
+ * etc).
+ *
+ * imp_inval_count is raised for the duration of the call, and waiters on
+ * imp_recovery_waitq are woken once it has been dropped again.
+ */
+void ptlrpc_invalidate_import(struct obd_import *imp)
+{
+	struct list_head *tmp, *n;
+	struct ptlrpc_request *req;
+	struct l_wait_info lwi;
+	unsigned int timeout;
+	int rc;
+
+	atomic_inc(&imp->imp_inval_count);
+
+	if (!imp->imp_invalid || imp->imp_obd->obd_no_recov)
+		ptlrpc_deactivate_import(imp);
+
+	LASSERT(imp->imp_invalid);
+
+	/* Wait forever until inflight == 0. We really can't do it another
+	 * way because in some cases we need to wait for very long reply
+	 * unlink. We can't do anything before that because there is really
+	 * no guarantee that some rdma transfer is not in progress right now. */
+	do {
+		/* Calculate the timeout for this round from the longest
+		 * remaining inflight deadline plus one third; fall back to
+		 * obd_timeout when no request has any time left. */
+		if (!OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK)) {
+			timeout = ptlrpc_inflight_timeout(imp);
+			timeout += timeout / 3;
+
+			if (timeout == 0)
+				timeout = obd_timeout;
+		} else {
+			/* decrease the interval to increase race condition */
+			timeout = 1;
+		}
+
+		CDEBUG(D_RPCTRACE,"Sleeping %d sec for inflight to error out\n",
+		       timeout);
+
+		/* Wait for all requests to error out and call completion
+		 * callbacks. Cap it at obd_timeout -- these should all
+		 * have been locally cancelled by ptlrpc_abort_inflight. */
+		lwi = LWI_TIMEOUT_INTERVAL(
+			cfs_timeout_cap(cfs_time_seconds(timeout)),
+			(timeout > 1)?cfs_time_seconds(1):cfs_time_seconds(1)/2,
+			NULL, NULL);
+		rc = l_wait_event(imp->imp_recovery_waitq,
+				  (atomic_read(&imp->imp_inflight) == 0),
+				  &lwi);
+		/* non-zero rc: the wait ended without the condition holding */
+		if (rc) {
+			const char *cli_tgt = obd2cli_tgt(imp->imp_obd);
+
+			CERROR("%s: rc = %d waiting for callback (%d != 0)\n",
+			       cli_tgt, rc,
+			       atomic_read(&imp->imp_inflight));
+
+			spin_lock(&imp->imp_lock);
+			/* re-check under the lock; inflight may have drained */
+			if (atomic_read(&imp->imp_inflight) == 0) {
+				int count = atomic_read(&imp->imp_unregistering);
+
+				/* We know that "unregistering" rpcs only can
+				 * survive in sending or delaying lists (they
+				 * maybe waiting for long reply unlink in
+				 * sluggish nets). Let's check this. If there
+				 * is no inflight and unregistering != 0, this
+				 * is bug. */
+				LASSERTF(count == 0, "Some RPCs are still "
+					 "unregistering: %d\n", count);
+
+				/* Let's save one loop as soon as inflight have
+				 * dropped to zero. No new inflights possible at
+				 * this point. */
+				rc = 0;
+			} else {
+				/* log every request still queued, then loop */
+				list_for_each_safe(tmp, n,
+						       &imp->imp_sending_list) {
+					req = list_entry(tmp,
+							     struct ptlrpc_request,
+							     rq_list);
+					DEBUG_REQ(D_ERROR, req,
+						  "still on sending list");
+				}
+				list_for_each_safe(tmp, n,
+						       &imp->imp_delayed_list) {
+					req = list_entry(tmp,
+							     struct ptlrpc_request,
+							     rq_list);
+					DEBUG_REQ(D_ERROR, req,
+						  "still on delayed list");
+				}
+
+				CERROR("%s: RPCs in \"%s\" phase found (%d). "
+				       "Network is sluggish? Waiting them "
+				       "to error out.\n", cli_tgt,
+				       ptlrpc_phase2str(RQ_PHASE_UNREGISTERING),
+				       atomic_read(&imp->
+						       imp_unregistering));
+			}
+			spin_unlock(&imp->imp_lock);
+		  }
+	} while (rc != 0);
+
+	/*
+	 * Let's additionally check that no new rpcs added to import in
+	 * "invalidate" state.
+	 */
+	LASSERT(atomic_read(&imp->imp_inflight) == 0);
+	obd_import_event(imp->imp_obd, imp, IMP_EVENT_INVALIDATE);
+	sptlrpc_import_flush_all_ctx(imp);
+
+	atomic_dec(&imp->imp_inval_count);
+	wake_up_all(&imp->imp_recovery_waitq);
+}
+EXPORT_SYMBOL(ptlrpc_invalidate_import);
+
+/**
+ * Clear the imp_invalid flag on import \a imp.
+ *
+ * The flag is cleared and ptlrpc_activate_timeouts() is called while holding
+ * imp_lock; IMP_EVENT_ACTIVE is then sent to the owning obd after the lock
+ * has been released.
+ */
+void ptlrpc_activate_import(struct obd_import *imp)
+{
+	struct obd_device *owner;
+
+	/* Pick up the owning device before touching the import state. */
+	owner = imp->imp_obd;
+
+	spin_lock(&imp->imp_lock);
+	imp->imp_invalid = 0;
+	ptlrpc_activate_timeouts(imp);
+	spin_unlock(&imp->imp_lock);
+
+	/* The event is delivered outside of imp_lock. */
+	obd_import_event(owner, imp, IMP_EVENT_ACTIVE);
+}
+EXPORT_SYMBOL(ptlrpc_activate_import);
+
+/*
+ * Handle a failure on import imp. conn_cnt is handed unchanged to
+ * ptlrpc_set_import_discon(); when that call returns non-zero, a
+ * non-replayable import is deactivated, imp_force_verify is set and the
+ * pinger is woken up. Otherwise nothing else is done.
+ */
+void ptlrpc_fail_import(struct obd_import *imp, __u32 conn_cnt)
+{
+	ENTRY;
+
+	LASSERT(!imp->imp_dlm_fake);
+
+	/* Nothing further to do when set_import_discon returns zero. */
+	if (!ptlrpc_set_import_discon(imp, conn_cnt)) {
+		EXIT;
+		return;
+	}
+
+	if (!imp->imp_replayable) {
+		CDEBUG(D_HA, "import %s@%s for %s not replayable, "
+		       "auto-deactivating\n",
+		       obd2cli_tgt(imp->imp_obd),
+		       imp->imp_connection->c_remote_uuid.uuid,
+		       imp->imp_obd->obd_name);
+		ptlrpc_deactivate_import(imp);
+	}
+
+	CDEBUG(D_HA, "%s: waking up pinger\n", obd2cli_tgt(imp->imp_obd));
+
+	/* imp_force_verify is updated under imp_lock. */
+	spin_lock(&imp->imp_lock);
+	imp->imp_force_verify = 1;
+	spin_unlock(&imp->imp_lock);
+
+	ptlrpc_pinger_wake_up();
+	EXIT;
+}
+EXPORT_SYMBOL(ptlrpc_fail_import);
+
+/*
+ * Force import imp through a disconnect / invalidate / reconnect cycle.
+ *
+ * Always returns 0; the result of ptlrpc_recover_import() is not propagated.
+ */
+int ptlrpc_reconnect_import(struct obd_import *imp)
+{
+	ptlrpc_set_import_discon(imp, 0);
+
+	/* Force a new connect attempt */
+	ptlrpc_invalidate_import(imp);
+
+	/* Do a fresh connect next time by zeroing the handle */
+	ptlrpc_disconnect_import(imp, 1);
+
+	/* Wait for all invalidate calls to finish */
+	if (atomic_read(&imp->imp_inval_count) > 0) {
+		struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL);
+		int wait_rc;
+
+		wait_rc = l_wait_event(imp->imp_recovery_waitq,
+				       (atomic_read(&imp->imp_inval_count) == 0),
+				       &lwi);
+		if (wait_rc != 0)
+			CERROR("Interrupted, inval=%d\n",
+			       atomic_read(&imp->imp_inval_count));
+	}
+
+	/* Allow reconnect attempts */
+	imp->imp_obd->obd_no_recov = 0;
+
+	/* Remove 'invalid' flag */
+	ptlrpc_activate_import(imp);
+
+	/* Attempt a new connect */
+	ptlrpc_recover_import(imp, NULL, 0);
+
+	return 0;
+}
+EXPORT_SYMBOL(ptlrpc_reconnect_import);
+
+/**
+ * Connection on import \a imp is changed to another one (if more than one is
+ * present). We typically choose the connection that we have not tried to
+ * connect to for the longest time.
+ *
+ * The whole selection runs under imp_lock.
+ *
+ * Returns 0 on success, -EINVAL if the import has no connections at all.
+ */
+static int import_select_connection(struct obd_import *imp)
+{
+	struct obd_import_conn *imp_conn = NULL, *conn;
+	struct obd_export *dlmexp;
+	char *target_start;
+	int target_len, tried_all = 1;
+	ENTRY;
+
+	spin_lock(&imp->imp_lock);
+
+	if (list_empty(&imp->imp_conn_list)) {
+		CERROR("%s: no connections available\n",
+		       imp->imp_obd->obd_name);
+		spin_unlock(&imp->imp_lock);
+		RETURN(-EINVAL);
+	}
+
+	list_for_each_entry(conn, &imp->imp_conn_list, oic_item) {
+		CDEBUG(D_HA, "%s: connect to NID %s last attempt "LPU64"\n",
+		       imp->imp_obd->obd_name,
+		       libcfs_nid2str(conn->oic_conn->c_peer.nid),
+		       conn->oic_last_attempt);
+
+		/* If we have not tried this connection since
+		   the last successful attempt, go with this one */
+		if ((conn->oic_last_attempt == 0) ||
+		    cfs_time_beforeq_64(conn->oic_last_attempt,
+				       imp->imp_last_success_conn)) {
+			imp_conn = conn;
+			tried_all = 0;
+			break;
+		}
+
+		/* If all of the connections have already been tried
+		   since the last successful connection; just choose the
+		   least recently used */
+		if (!imp_conn)
+			imp_conn = conn;
+		else if (cfs_time_before_64(conn->oic_last_attempt,
+					    imp_conn->oic_last_attempt))
+			imp_conn = conn;
+	}
+
+	/* if not found, simply choose the current one; the current one is
+	 * also kept whenever imp_force_reconnect is set */
+	if (!imp_conn || imp->imp_force_reconnect) {
+		LASSERT(imp->imp_conn_current);
+		imp_conn = imp->imp_conn_current;
+		tried_all = 0;
+	}
+	LASSERT(imp_conn->oic_conn);
+
+	/* If we've tried everything, and we're back to the beginning of the
+	   list, increase our timeout and try again. It will be reset when
+	   we do finally connect. (FIXME: really we should wait for all network
+	   state associated with the last connection attempt to drain before
+	   trying to reconnect on it.) */
+	if (tried_all && (imp->imp_conn_list.next == &imp_conn->oic_item)) {
+		struct adaptive_timeout *at = &imp->imp_at.iat_net_latency;
+		/* Grow the net latency estimate by CONNECTION_SWITCH_INC,
+		 * clamped to CONNECTION_SWITCH_MAX. */
+		if (at_get(at) < CONNECTION_SWITCH_MAX) {
+			at_measured(at, at_get(at) + CONNECTION_SWITCH_INC);
+			if (at_get(at) > CONNECTION_SWITCH_MAX)
+				at_reset(at, CONNECTION_SWITCH_MAX);
+		}
+		LASSERT(imp_conn->oic_last_attempt);
+		CDEBUG(D_HA, "%s: tried all connections, increasing latency "
+			"to %ds\n", imp->imp_obd->obd_name, at_get(at));
+	}
+
+	/* Stamp the chosen connection with the time of this attempt. */
+	imp_conn->oic_last_attempt = cfs_time_current_64();
+
+	/* switch connection, don't mind if it's same as the current one */
+	if (imp->imp_connection)
+		ptlrpc_connection_put(imp->imp_connection);
+	imp->imp_connection = ptlrpc_connection_addref(imp_conn->oic_conn);
+
+	/* Point the export found through imp_dlm_handle at the same
+	 * connection, then drop the export reference again. */
+	dlmexp =  class_conn2export(&imp->imp_dlm_handle);
+	LASSERT(dlmexp != NULL);
+	if (dlmexp->exp_connection)
+		ptlrpc_connection_put(dlmexp->exp_connection);
+	dlmexp->exp_connection = ptlrpc_connection_addref(imp_conn->oic_conn);
+	class_export_put(dlmexp);
+
+	if (imp->imp_conn_current != imp_conn) {
+		/* Only log a change when there was a previous connection. */
+		if (imp->imp_conn_current) {
+			deuuidify(obd2cli_tgt(imp->imp_obd), NULL,
+				  &target_start, &target_len);
+
+			CDEBUG(D_HA, "%s: Connection changing to"
+			       " %.*s (at %s)\n",
+			       imp->imp_obd->obd_name,
+			       target_len, target_start,
+			       libcfs_nid2str(imp_conn->oic_conn->c_peer.nid));
+		}
+
+		imp->imp_conn_current = imp_conn;
+	}
+
+	CDEBUG(D_HA, "%s: import %p using connection %s/%s\n",
+	       imp->imp_obd->obd_name, imp, imp_conn->oic_uuid.uuid,
+	       libcfs_nid2str(imp_conn->oic_conn->c_peer.nid));
+
+	spin_unlock(&imp->imp_lock);
+
+	RETURN(0);
+}
+
+/*
+ * Fetch the transno of the first request on imp's replay list.
+ *
+ * Returns 0 (leaving *transno untouched) when the replay list is empty,
+ * 1 after storing the transno otherwise. A zero transno on the list is
+ * treated as a bug (LBUG).
+ *
+ * must be called under imp_lock
+ */
+static int ptlrpc_first_transno(struct obd_import *imp, __u64 *transno)
+{
+	struct ptlrpc_request *first;
+
+	if (list_empty(&imp->imp_replay_list))
+		return 0;
+
+	first = list_entry(imp->imp_replay_list.next, struct ptlrpc_request,
+			   rq_replay_list);
+
+	/* The value is stored before the sanity check, as callers see it. */
+	*transno = first->rq_transno;
+	if (first->rq_transno == 0) {
+		DEBUG_REQ(D_ERROR, first, "zero transno in replay");
+		LBUG();
+	}
+
+	return 1;
+}
+
+/**
+ * Attempt to (re)connect import \a imp. This includes all preparations,
+ * initializing CONNECT RPC request and passing it to ptlrpcd for
+ * actual sending.
+ *
+ * Returns 0 when the request was queued or the import is already in FULL
+ * state, -EINVAL for a closed import, -EALREADY when a connect is already
+ * in progress, or another error code from the preparation steps. On any
+ * failure after the state was switched to CONNECTING the import is put
+ * back into DISCON state.
+ */
+int ptlrpc_connect_import(struct obd_import *imp)
+{
+	struct obd_device *obd = imp->imp_obd;
+	int initial_connect = 0;
+	int set_transno = 0;
+	__u64 committed_before_reconnect = 0;
+	struct ptlrpc_request *request;
+	/* Buffers handed to ptlrpc_request_bufs_pack() below. */
+	char *bufs[] = { NULL,
+			 obd2cli_tgt(imp->imp_obd),
+			 obd->obd_uuid.uuid,
+			 (char *)&imp->imp_dlm_handle,
+			 (char *)&imp->imp_connect_data };
+	struct ptlrpc_connect_async_args *aa;
+	int rc;
+	ENTRY;
+
+	/* Refuse to start when the import is closed, already connected or
+	 * already connecting; the state is checked under imp_lock. */
+	spin_lock(&imp->imp_lock);
+	if (imp->imp_state == LUSTRE_IMP_CLOSED) {
+		spin_unlock(&imp->imp_lock);
+		CERROR("can't connect to a closed import\n");
+		RETURN(-EINVAL);
+	} else if (imp->imp_state == LUSTRE_IMP_FULL) {
+		spin_unlock(&imp->imp_lock);
+		CERROR("already connected\n");
+		RETURN(0);
+	} else if (imp->imp_state == LUSTRE_IMP_CONNECTING) {
+		spin_unlock(&imp->imp_lock);
+		CERROR("already connecting\n");
+		RETURN(-EALREADY);
+	}
+
+	IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_CONNECTING);
+
+	imp->imp_conn_cnt++;
+	imp->imp_resend_replay = 0;
+
+	/* An unused remote handle means this is the very first connect;
+	 * otherwise remember what the peer had committed before. */
+	if (!lustre_handle_is_used(&imp->imp_remote_handle))
+		initial_connect = 1;
+	else
+		committed_before_reconnect = imp->imp_peer_committed_transno;
+
+	/* Non-zero when the replay list is non-empty; ocd_transno then
+	 * holds the transno of its first request. */
+	set_transno = ptlrpc_first_transno(imp,
+					   &imp->imp_connect_data.ocd_transno);
+	spin_unlock(&imp->imp_lock);
+
+	rc = import_select_connection(imp);
+	if (rc)
+		GOTO(out, rc);
+
+	rc = sptlrpc_import_sec_adapt(imp, NULL, 0);
+	if (rc)
+		GOTO(out, rc);
+
+	/* Reset connect flags to the originally requested flags, in case
+	 * the server is updated on-the-fly we will get the new features. */
+	imp->imp_connect_data.ocd_connect_flags = imp->imp_connect_flags_orig;
+	/* Reset ocd_version each time so the server knows the exact versions */
+	imp->imp_connect_data.ocd_version = LUSTRE_VERSION_CODE;
+	imp->imp_msghdr_flags &= ~MSGHDR_AT_SUPPORT;
+	imp->imp_msghdr_flags &= ~MSGHDR_CKSUM_INCOMPAT18;
+
+	rc = obd_reconnect(NULL, imp->imp_obd->obd_self_export, obd,
+			   &obd->obd_uuid, &imp->imp_connect_data, NULL);
+	if (rc)
+		GOTO(out, rc);
+
+	request = ptlrpc_request_alloc(imp, &RQF_MDS_CONNECT);
+	if (request == NULL)
+		GOTO(out, rc = -ENOMEM);
+
+	rc = ptlrpc_request_bufs_pack(request, LUSTRE_OBD_VERSION,
+				      imp->imp_connect_op, bufs, NULL);
+	if (rc) {
+		/* Packing failed: release the request allocated above. */
+		ptlrpc_request_free(request);
+		GOTO(out, rc);
+	}
+
+	/* Report the rpc service time to the server so that it knows how long
+	 * to wait for clients to join recovery */
+	lustre_msg_set_service_time(request->rq_reqmsg,
+				    at_timeout2est(request->rq_timeout));
+
+	/* The amount of time we give the server to process the connect req.
+	 * import_select_connection will increase the net latency on
+	 * repeated reconnect attempts to cover slow networks.
+	 * We override/ignore the server rpc completion estimate here,
+	 * which may be large if this is a reconnect attempt */
+	request->rq_timeout = INITIAL_CONNECT_TIMEOUT;
+	lustre_msg_set_timeout(request->rq_reqmsg, request->rq_timeout);
+
+	lustre_msg_add_op_flags(request->rq_reqmsg, MSG_CONNECT_NEXT_VER);
+
+	request->rq_no_resend = request->rq_no_delay = 1;
+	request->rq_send_state = LUSTRE_IMP_CONNECTING;
+	/* Allow a slightly larger reply for future growth compatibility */
+	req_capsule_set_size(&request->rq_pill, &RMF_CONNECT_DATA, RCL_SERVER,
+			     sizeof(struct obd_connect_data)+16*sizeof(__u64));
+	ptlrpc_request_set_replen(request);
+	request->rq_interpret_reply = ptlrpc_connect_interpret;
+
+	/* The async args live inside the request; make sure they fit. */
+	CLASSERT(sizeof (*aa) <= sizeof (request->rq_async_args));
+	aa = ptlrpc_req_async_args(request);
+	memset(aa, 0, sizeof *aa);
+
+	aa->pcaa_peer_committed = committed_before_reconnect;
+	aa->pcaa_initial_connect = initial_connect;
+
+	if (aa->pcaa_initial_connect) {
+		/* An initial connect marks the import replayable up front;
+		 * ptlrpc_connect_interpret() sets the flag again from the
+		 * reply. */
+		spin_lock(&imp->imp_lock);
+		imp->imp_replayable = 1;
+		spin_unlock(&imp->imp_lock);
+		lustre_msg_add_op_flags(request->rq_reqmsg,
+					MSG_CONNECT_INITIAL);
+	}
+
+	if (set_transno)
+		lustre_msg_add_op_flags(request->rq_reqmsg,
+					MSG_CONNECT_TRANSNO);
+
+	DEBUG_REQ(D_RPCTRACE, request, "(re)connect request (timeout %d)",
+		  request->rq_timeout);
+	ptlrpcd_add_req(request, PDL_POLICY_ROUND, -1);
+	rc = 0;
+out:
+	/* Any failure leaves the import disconnected. */
+	if (rc != 0) {
+		IMPORT_SET_STATE(imp, LUSTRE_IMP_DISCON);
+	}
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(ptlrpc_connect_import);
+
+/*
+ * Wake the pinger if imp_force_verify is set on imp. The flag is sampled
+ * under imp_lock; the wake-up itself happens after the lock is released.
+ */
+static void ptlrpc_maybe_ping_import_soon(struct obd_import *imp)
+{
+	int wake;
+
+	spin_lock(&imp->imp_lock);
+	wake = (imp->imp_force_verify != 0);
+	spin_unlock(&imp->imp_lock);
+
+	if (!wake)
+		return;
+
+	ptlrpc_pinger_wake_up();
+}
+
+/* Return 1 when rc is -EBUSY or -EAGAIN, 0 for any other value. */
+static int ptlrpc_busy_reconnect(int rc)
+{
+	switch (rc) {
+	case -EBUSY:
+	case -EAGAIN:
+		return 1;
+	default:
+		return 0;
+	}
+}
+
+/**
+ * interpret_reply callback for connect RPCs.
+ * Looks into returned status of connect operation and decides
+ * what to do with the import - i.e enter recovery, promote it to
+ * full state for normal operations or disconnect it due to an error.
+ *
+ * \param env      not used by this function
+ * \param request  the completed connect request
+ * \param data     the request's struct ptlrpc_connect_async_args
+ * \param rc       status of the connect RPC
+ *
+ * Returns 0 if the import was closed meanwhile or a new connect attempt was
+ * started after -ENOTCONN from the recovery state machine, -EPROTO on a
+ * protocol error, otherwise the final value of \a rc.
+ */
+static int ptlrpc_connect_interpret(const struct lu_env *env,
+				    struct ptlrpc_request *request,
+				    void *data, int rc)
+{
+	struct ptlrpc_connect_async_args *aa = data;
+	struct obd_import *imp = request->rq_import;
+	struct client_obd *cli = &imp->imp_obd->u.cli;
+	struct lustre_handle old_hdl;
+	__u64 old_connect_flags;
+	__u64 new_connect_flags;
+	int msg_flags;
+	struct obd_connect_data *ocd;
+	struct obd_export *exp;
+	int ret;
+	ENTRY;
+
+	spin_lock(&imp->imp_lock);
+	if (imp->imp_state == LUSTRE_IMP_CLOSED) {
+		imp->imp_connect_tried = 1;
+		spin_unlock(&imp->imp_lock);
+		RETURN(0);
+	}
+
+	if (rc) {
+		/* if this reconnect to busy export - not need select new target
+		 * for connecting*/
+		imp->imp_force_reconnect = ptlrpc_busy_reconnect(rc);
+		spin_unlock(&imp->imp_lock);
+		ptlrpc_maybe_ping_import_soon(imp);
+		GOTO(out, rc);
+	}
+	spin_unlock(&imp->imp_lock);
+
+	LASSERT(imp->imp_conn_current);
+
+	msg_flags = lustre_msg_get_op_flags(request->rq_repmsg);
+
+	ret = req_capsule_get_size(&request->rq_pill, &RMF_CONNECT_DATA,
+				   RCL_SERVER);
+	/* server replied obd_connect_data is always bigger */
+	ocd = req_capsule_server_sized_get(&request->rq_pill,
+					   &RMF_CONNECT_DATA, ret);
+
+	if (ocd == NULL) {
+		CERROR("%s: no connect data from server\n",
+		       imp->imp_obd->obd_name);
+		rc = -EPROTO;
+		GOTO(out, rc);
+	}
+
+	spin_lock(&imp->imp_lock);
+
+	/* All imports are pingable */
+	imp->imp_pingable = 1;
+	imp->imp_force_reconnect = 0;
+	imp->imp_force_verify = 0;
+
+	imp->imp_connect_data = *ocd;
+
+	CDEBUG(D_HA, "%s: connect to target with instance %u\n",
+	       imp->imp_obd->obd_name, ocd->ocd_instance);
+	exp = class_conn2export(&imp->imp_dlm_handle);
+
+	spin_unlock(&imp->imp_lock);
+
+	/* check that server granted subset of flags we asked for. */
+	if ((ocd->ocd_connect_flags & imp->imp_connect_flags_orig) !=
+	    ocd->ocd_connect_flags) {
+		CERROR("%s: Server didn't granted asked subset of flags: "
+		       "asked="LPX64" grranted="LPX64"\n",
+		       imp->imp_obd->obd_name,imp->imp_connect_flags_orig,
+		       ocd->ocd_connect_flags);
+		/* Do not leak the export reference obtained from
+		 * class_conn2export() on this error path. */
+		if (exp)
+			class_export_put(exp);
+		GOTO(out, rc = -EPROTO);
+	}
+
+	if (!exp) {
+		/* This could happen if export is cleaned during the
+		   connect attempt */
+		CERROR("%s: missing export after connect\n",
+		       imp->imp_obd->obd_name);
+		GOTO(out, rc = -ENODEV);
+	}
+	old_connect_flags = exp_connect_flags(exp);
+	exp->exp_connect_data = *ocd;
+	imp->imp_obd->obd_self_export->exp_connect_data = *ocd;
+	/* Sample the updated flags while the reference is still held; exp
+	 * must not be touched once it has been put. */
+	new_connect_flags = exp_connect_flags(exp);
+	class_export_put(exp);
+
+	obd_import_event(imp->imp_obd, imp, IMP_EVENT_OCD);
+
+	if (aa->pcaa_initial_connect) {
+		spin_lock(&imp->imp_lock);
+		if (msg_flags & MSG_CONNECT_REPLAYABLE) {
+			imp->imp_replayable = 1;
+			spin_unlock(&imp->imp_lock);
+			CDEBUG(D_HA, "connected to replayable target: %s\n",
+			       obd2cli_tgt(imp->imp_obd));
+		} else {
+			imp->imp_replayable = 0;
+			spin_unlock(&imp->imp_lock);
+		}
+
+		/* if applies, adjust the imp->imp_msg_magic here
+		 * according to reply flags */
+
+		imp->imp_remote_handle =
+				*lustre_msg_get_handle(request->rq_repmsg);
+
+		/* Initial connects are allowed for clients with non-random
+		 * uuids when servers are in recovery.  Simply signal the
+		 * servers replay is complete and wait in REPLAY_WAIT. */
+		if (msg_flags & MSG_CONNECT_RECOVERING) {
+			CDEBUG(D_HA, "connect to %s during recovery\n",
+			       obd2cli_tgt(imp->imp_obd));
+			IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY_LOCKS);
+		} else {
+			IMPORT_SET_STATE(imp, LUSTRE_IMP_FULL);
+			ptlrpc_activate_import(imp);
+		}
+
+		GOTO(finish, rc = 0);
+	}
+
+	/* Determine what recovery state to move the import to. */
+	if (MSG_CONNECT_RECONNECT & msg_flags) {
+		/* An all-zero handle in the reply is a failed reconnect. */
+		memset(&old_hdl, 0, sizeof(old_hdl));
+		if (!memcmp(&old_hdl, lustre_msg_get_handle(request->rq_repmsg),
+			    sizeof (old_hdl))) {
+			LCONSOLE_WARN("Reconnect to %s (at @%s) failed due "
+				      "bad handle "LPX64"\n",
+				      obd2cli_tgt(imp->imp_obd),
+				      imp->imp_connection->c_remote_uuid.uuid,
+				      imp->imp_dlm_handle.cookie);
+			GOTO(out, rc = -ENOTCONN);
+		}
+
+		if (memcmp(&imp->imp_remote_handle,
+			   lustre_msg_get_handle(request->rq_repmsg),
+			   sizeof(imp->imp_remote_handle))) {
+			int level = msg_flags & MSG_CONNECT_RECOVERING ?
+				D_HA : D_WARNING;
+
+			/* Bug 16611/14775: if server handle have changed,
+			 * that means some sort of disconnection happened.
+			 * If the server is not in recovery, that also means it
+			 * already erased all of our state because of previous
+			 * eviction. If it is in recovery - we are safe to
+			 * participate since we can reestablish all of our state
+			 * with server again */
+			if ((MSG_CONNECT_RECOVERING & msg_flags)) {
+				CDEBUG(level,"%s@%s changed server handle from "
+				       LPX64" to "LPX64
+				       " but is still in recovery\n",
+				       obd2cli_tgt(imp->imp_obd),
+				       imp->imp_connection->c_remote_uuid.uuid,
+				       imp->imp_remote_handle.cookie,
+				       lustre_msg_get_handle(
+				       request->rq_repmsg)->cookie);
+			} else {
+				LCONSOLE_WARN("Evicted from %s (at %s) "
+					      "after server handle changed from "
+					      LPX64" to "LPX64"\n",
+					      obd2cli_tgt(imp->imp_obd),
+					      imp->imp_connection-> \
+					      c_remote_uuid.uuid,
+					      imp->imp_remote_handle.cookie,
+					      lustre_msg_get_handle(
+					      request->rq_repmsg)->cookie);
+			}
+
+
+			imp->imp_remote_handle =
+				     *lustre_msg_get_handle(request->rq_repmsg);
+
+			if (!(MSG_CONNECT_RECOVERING & msg_flags)) {
+				IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED);
+				GOTO(finish, rc = 0);
+			}
+
+		} else {
+			CDEBUG(D_HA, "reconnected to %s@%s after partition\n",
+			       obd2cli_tgt(imp->imp_obd),
+			       imp->imp_connection->c_remote_uuid.uuid);
+		}
+
+		if (imp->imp_invalid) {
+			CDEBUG(D_HA, "%s: reconnected but import is invalid; "
+			       "marking evicted\n", imp->imp_obd->obd_name);
+			IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED);
+		} else if (MSG_CONNECT_RECOVERING & msg_flags) {
+			CDEBUG(D_HA, "%s: reconnected to %s during replay\n",
+			       imp->imp_obd->obd_name,
+			       obd2cli_tgt(imp->imp_obd));
+
+			spin_lock(&imp->imp_lock);
+			imp->imp_resend_replay = 1;
+			spin_unlock(&imp->imp_lock);
+
+			IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY);
+		} else {
+			IMPORT_SET_STATE(imp, LUSTRE_IMP_RECOVER);
+		}
+	} else if ((MSG_CONNECT_RECOVERING & msg_flags) && !imp->imp_invalid) {
+		LASSERT(imp->imp_replayable);
+		imp->imp_remote_handle =
+				*lustre_msg_get_handle(request->rq_repmsg);
+		imp->imp_last_replay_transno = 0;
+		IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY);
+	} else {
+		DEBUG_REQ(D_HA, request, "%s: evicting (reconnect/recover flags"
+			  " not set: %x)", imp->imp_obd->obd_name, msg_flags);
+		imp->imp_remote_handle =
+				*lustre_msg_get_handle(request->rq_repmsg);
+		IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED);
+	}
+
+	/* Sanity checks for a reconnected import. */
+	if (!(imp->imp_replayable) != !(msg_flags & MSG_CONNECT_REPLAYABLE)) {
+		CERROR("imp_replayable flag does not match server "
+		       "after reconnect. We should LBUG right here.\n");
+	}
+
+	if (lustre_msg_get_last_committed(request->rq_repmsg) > 0 &&
+	    lustre_msg_get_last_committed(request->rq_repmsg) <
+	    aa->pcaa_peer_committed) {
+		CERROR("%s went back in time (transno "LPD64
+		       " was previously committed, server now claims "LPD64
+		       ")!  See https://bugzilla.lustre.org/show_bug.cgi?"
+		       "id=9646\n",
+		       obd2cli_tgt(imp->imp_obd), aa->pcaa_peer_committed,
+		       lustre_msg_get_last_committed(request->rq_repmsg));
+	}
+
+finish:
+	/* Drive the import according to the state chosen above. */
+	rc = ptlrpc_import_recovery_state_machine(imp);
+	if (rc != 0) {
+		if (rc == -ENOTCONN) {
+			CDEBUG(D_HA, "evicted/aborted by %s@%s during recovery;"
+			       "invalidating and reconnecting\n",
+			       obd2cli_tgt(imp->imp_obd),
+			       imp->imp_connection->c_remote_uuid.uuid);
+			ptlrpc_connect_import(imp);
+			imp->imp_connect_tried = 1;
+			RETURN(0);
+		}
+	} else {
+
+		/* Move the connection that worked to the head of the
+		 * connection list and remember when it was last tried. */
+		spin_lock(&imp->imp_lock);
+		list_del(&imp->imp_conn_current->oic_item);
+		list_add(&imp->imp_conn_current->oic_item,
+			     &imp->imp_conn_list);
+		imp->imp_last_success_conn =
+			imp->imp_conn_current->oic_last_attempt;
+
+		spin_unlock(&imp->imp_lock);
+
+		if (!ocd->ocd_ibits_known &&
+		    ocd->ocd_connect_flags & OBD_CONNECT_IBITS)
+			CERROR("Inodebits aware server returned zero compatible"
+			       " bits?\n");
+
+		if ((ocd->ocd_connect_flags & OBD_CONNECT_VERSION) &&
+		    (ocd->ocd_version > LUSTRE_VERSION_CODE +
+					LUSTRE_VERSION_OFFSET_WARN ||
+		     ocd->ocd_version < LUSTRE_VERSION_CODE -
+					LUSTRE_VERSION_OFFSET_WARN)) {
+			/* Sigh, some compilers do not like #ifdef in the middle
+			   of macro arguments */
+			const char *older = "older. Consider upgrading server "
+					    "or downgrading client";
+			const char *newer = "newer than client version. "
+					    "Consider upgrading client";
+
+			LCONSOLE_WARN("Server %s version (%d.%d.%d.%d) "
+				      "is much %s (%s)\n",
+				      obd2cli_tgt(imp->imp_obd),
+				      OBD_OCD_VERSION_MAJOR(ocd->ocd_version),
+				      OBD_OCD_VERSION_MINOR(ocd->ocd_version),
+				      OBD_OCD_VERSION_PATCH(ocd->ocd_version),
+				      OBD_OCD_VERSION_FIX(ocd->ocd_version),
+				      ocd->ocd_version > LUSTRE_VERSION_CODE ?
+				      newer : older, LUSTRE_VERSION_STRING);
+		}
+
+#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 2, 50, 0)
+		/* Check if server has LU-1252 fix applied to not always swab
+		 * the IR MNE entries. Do this only once per connection.  This
+		 * fixup is version-limited, because we don't want to carry the
+		 * OBD_CONNECT_MNE_SWAB flag around forever, just so long as we
+		 * need interop with unpatched 2.2 servers.  For newer servers,
+		 * the client will do MNE swabbing only as needed.  LU-1644 */
+		if (unlikely((ocd->ocd_connect_flags & OBD_CONNECT_VERSION) &&
+			     !(ocd->ocd_connect_flags & OBD_CONNECT_MNE_SWAB) &&
+			     OBD_OCD_VERSION_MAJOR(ocd->ocd_version) == 2 &&
+			     OBD_OCD_VERSION_MINOR(ocd->ocd_version) == 2 &&
+			     OBD_OCD_VERSION_PATCH(ocd->ocd_version) < 55 &&
+			     strcmp(imp->imp_obd->obd_type->typ_name,
+				    LUSTRE_MGC_NAME) == 0))
+			imp->imp_need_mne_swab = 1;
+		else /* clear if server was upgraded since last connect */
+			imp->imp_need_mne_swab = 0;
+#else
+#warning "LU-1644: Remove old OBD_CONNECT_MNE_SWAB fixup and imp_need_mne_swab"
+#endif
+
+		if (ocd->ocd_connect_flags & OBD_CONNECT_CKSUM) {
+			/* We sent to the server ocd_cksum_types with bits set
+			 * for algorithms we understand. The server masked off
+			 * the checksum types it doesn't support */
+			if ((ocd->ocd_cksum_types &
+			     cksum_types_supported_client()) == 0) {
+				LCONSOLE_WARN("The negotiation of the checksum "
+					      "alogrithm to use with server %s "
+					      "failed (%x/%x), disabling "
+					      "checksums\n",
+					      obd2cli_tgt(imp->imp_obd),
+					      ocd->ocd_cksum_types,
+					      cksum_types_supported_client());
+				cli->cl_checksum = 0;
+				cli->cl_supp_cksum_types = OBD_CKSUM_ADLER;
+			} else {
+				cli->cl_supp_cksum_types = ocd->ocd_cksum_types;
+			}
+		} else {
+			/* The server does not support OBD_CONNECT_CKSUM.
+			 * Enforce ADLER for backward compatibility*/
+			cli->cl_supp_cksum_types = OBD_CKSUM_ADLER;
+		}
+		cli->cl_cksum_type =cksum_type_select(cli->cl_supp_cksum_types);
+
+		/* Never exceed the bulk size the server reported; without
+		 * OBD_CONNECT_BRW_SIZE, MDS and MGS connections use a single
+		 * page per RPC. */
+		if (ocd->ocd_connect_flags & OBD_CONNECT_BRW_SIZE)
+			cli->cl_max_pages_per_rpc =
+				min(ocd->ocd_brw_size >> PAGE_CACHE_SHIFT,
+				    cli->cl_max_pages_per_rpc);
+		else if (imp->imp_connect_op == MDS_CONNECT ||
+			 imp->imp_connect_op == MGS_CONNECT)
+			cli->cl_max_pages_per_rpc = 1;
+
+		/* Reset ns_connect_flags only for initial connect. It might be
+		 * changed in while using FS and if we reset it in reconnect
+		 * this leads to losing user settings done before such as
+		 * disable lru_resize, etc.
+		 * The comparison uses the flag value sampled before the
+		 * export reference was dropped. */
+		if (old_connect_flags != new_connect_flags ||
+		    aa->pcaa_initial_connect) {
+			CDEBUG(D_HA, "%s: Resetting ns_connect_flags to server "
+			       "flags: "LPX64"\n", imp->imp_obd->obd_name,
+			      ocd->ocd_connect_flags);
+			imp->imp_obd->obd_namespace->ns_connect_flags =
+				ocd->ocd_connect_flags;
+			imp->imp_obd->obd_namespace->ns_orig_connect_flags =
+				ocd->ocd_connect_flags;
+		}
+
+		if ((ocd->ocd_connect_flags & OBD_CONNECT_AT) &&
+		    (imp->imp_msg_magic == LUSTRE_MSG_MAGIC_V2))
+			/* We need a per-message support flag, because
+			   a. we don't know if the incoming connect reply
+			      supports AT or not (in reply_in_callback)
+			      until we unpack it.
+			   b. failovered server means export and flags are gone
+			      (in ptlrpc_send_reply).
+			   Can only be set when we know AT is supported at
+			   both ends */
+			imp->imp_msghdr_flags |= MSGHDR_AT_SUPPORT;
+		else
+			imp->imp_msghdr_flags &= ~MSGHDR_AT_SUPPORT;
+
+		if ((ocd->ocd_connect_flags & OBD_CONNECT_FULL20) &&
+		    (imp->imp_msg_magic == LUSTRE_MSG_MAGIC_V2))
+			imp->imp_msghdr_flags |= MSGHDR_CKSUM_INCOMPAT18;
+		else
+			imp->imp_msghdr_flags &= ~MSGHDR_CKSUM_INCOMPAT18;
+
+		LASSERT((cli->cl_max_pages_per_rpc <= PTLRPC_MAX_BRW_PAGES) &&
+			(cli->cl_max_pages_per_rpc > 0));
+	}
+
+out:
+	imp->imp_connect_tried = 1;
+
+	if (rc != 0) {
+		IMPORT_SET_STATE(imp, LUSTRE_IMP_DISCON);
+		if (rc == -EACCES) {
+			/*
+			 * Give up trying to reconnect
+			 * EACCES means client has no permission for connection
+			 */
+			imp->imp_obd->obd_no_recov = 1;
+			ptlrpc_deactivate_import(imp);
+		}
+
+		if (rc == -EPROTO) {
+			/* Looked up again here: the outer ocd may be NULL
+			 * on this path. */
+			struct obd_connect_data *ocd;
+
+			/* reply message might not be ready */
+			if (request->rq_repmsg == NULL)
+				RETURN(-EPROTO);
+
+			ocd = req_capsule_server_get(&request->rq_pill,
+						     &RMF_CONNECT_DATA);
+			if (ocd &&
+			    (ocd->ocd_connect_flags & OBD_CONNECT_VERSION) &&
+			    (ocd->ocd_version != LUSTRE_VERSION_CODE)) {
+			   /* Actually servers are only supposed to refuse
+			      connection from liblustre clients, so we should
+			      never see this from VFS context */
+				LCONSOLE_ERROR_MSG(0x16a, "Server %s version "
+					"(%d.%d.%d.%d)"
+					" refused connection from this client "
+					"with an incompatible version (%s).  "
+					"Client must be recompiled\n",
+					obd2cli_tgt(imp->imp_obd),
+					OBD_OCD_VERSION_MAJOR(ocd->ocd_version),
+					OBD_OCD_VERSION_MINOR(ocd->ocd_version),
+					OBD_OCD_VERSION_PATCH(ocd->ocd_version),
+					OBD_OCD_VERSION_FIX(ocd->ocd_version),
+					LUSTRE_VERSION_STRING);
+				ptlrpc_deactivate_import(imp);
+				IMPORT_SET_STATE(imp, LUSTRE_IMP_CLOSED);
+			}
+			RETURN(-EPROTO);
+		}
+
+		ptlrpc_maybe_ping_import_soon(imp);
+
+		CDEBUG(D_HA, "recovery of %s on %s failed (%d)\n",
+		       obd2cli_tgt(imp->imp_obd),
+		       (char *)imp->imp_connection->c_remote_uuid.uuid, rc);
+	}
+
+	wake_up_all(&imp->imp_recovery_waitq);
+	RETURN(rc);
+}
+
+/**
+ * interpret callback for "completed replay" RPCs.
+ *
+ * Drops the import's replay-inflight count, then either advances the
+ * recovery state machine (request succeeded and imp_vbr_failed is not set)
+ * or starts a new connect. Always returns 0.
+ * \see signal_completed_replay
+ */
+static int completed_replay_interpret(const struct lu_env *env,
+				      struct ptlrpc_request *req,
+				      void * data, int rc)
+{
+	struct obd_import *imp;
+
+	ENTRY;
+	imp = req->rq_import;
+	atomic_dec(&imp->imp_replay_inflight);
+
+	/* Success path: let the state machine take the next step. */
+	if (req->rq_status == 0 && !imp->imp_vbr_failed) {
+		ptlrpc_import_recovery_state_machine(imp);
+		RETURN(0);
+	}
+
+	if (imp->imp_vbr_failed) {
+		CDEBUG(D_WARNING,
+		       "%s: version recovery fails, reconnecting\n",
+		       imp->imp_obd->obd_name);
+	} else {
+		CDEBUG(D_HA, "%s: LAST_REPLAY message error: %d, "
+			     "reconnecting\n",
+		       imp->imp_obd->obd_name, req->rq_status);
+	}
+	ptlrpc_connect_import(imp);
+
+	RETURN(0);
+}
+
+/**
+ * Let server know that we have no requests to replay anymore.
+ * Achieved by just sending a PING request flagged with
+ * MSG_LOCK_REPLAY_DONE | MSG_REQ_REPLAY_DONE.
+ *
+ * Returns 0 once the request is handed to ptlrpcd (or when the
+ * OBD_FAIL_PTLRPC_FINISH_REPLAY fail check fires), -ENOMEM if the request
+ * cannot be allocated.
+ */
+static int signal_completed_replay(struct obd_import *imp)
+{
+	struct ptlrpc_request *ping;
+	ENTRY;
+
+	if (unlikely(OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_FINISH_REPLAY)))
+		RETURN(0);
+
+	/* Exactly one replay-related request may be in flight here. */
+	LASSERT(atomic_read(&imp->imp_replay_inflight) == 0);
+	atomic_inc(&imp->imp_replay_inflight);
+
+	ping = ptlrpc_request_alloc_pack(imp, &RQF_OBD_PING,
+					 LUSTRE_OBD_VERSION, OBD_PING);
+	if (ping == NULL) {
+		/* Undo the accounting done above. */
+		atomic_dec(&imp->imp_replay_inflight);
+		RETURN(-ENOMEM);
+	}
+
+	ptlrpc_request_set_replen(ping);
+	ping->rq_send_state = LUSTRE_IMP_REPLAY_WAIT;
+	lustre_msg_add_flags(ping->rq_reqmsg,
+			     MSG_LOCK_REPLAY_DONE | MSG_REQ_REPLAY_DONE);
+	/* With AT_OFF set the request timeout is tripled. */
+	if (AT_OFF)
+		ping->rq_timeout *= 3;
+	ping->rq_interpret_reply = completed_replay_interpret;
+
+	ptlrpcd_add_req(ping, PDL_POLICY_ROUND, -1);
+	RETURN(0);
+}
+
+/**
+ * In kernel code all import invalidation happens in its own
+ * separate thread, so that whatever application happened to encounter
+ * a problem could still be killed or otherwise continue.
+ *
+ * \a data is the struct obd_import to invalidate. The import reference is
+ * released with class_import_put() before the thread returns. Always
+ * returns 0.
+ */
+static int ptlrpc_invalidate_import_thread(void *data)
+{
+	struct obd_import *imp;
+
+	ENTRY;
+
+	imp = data;
+	unshare_fs_struct();
+
+	CDEBUG(D_HA, "thread invalidate import %s to %s@%s\n",
+	       imp->imp_obd->obd_name, obd2cli_tgt(imp->imp_obd),
+	       imp->imp_connection->c_remote_uuid.uuid);
+
+	ptlrpc_invalidate_import(imp);
+
+	if (obd_dump_on_eviction) {
+		CERROR("dump the log upon eviction\n");
+		libcfs_debug_dumplog();
+	}
+
+	/* Invalidation is done: continue recovery from the RECOVER state. */
+	IMPORT_SET_STATE(imp, LUSTRE_IMP_RECOVER);
+	ptlrpc_import_recovery_state_machine(imp);
+
+	class_import_put(imp);
+	RETURN(0);
+}
+
+/**
+ * This is the state machine for client-side recovery on import.
+ *
+ * Typically we have two possible paths. If we came to server and it is not
+ * in recovery, we just enter IMP_EVICTED state, invalidate our import
+ * state and reconnect from scratch.
+ * If we came to server that is in recovery, we enter IMP_REPLAY import state.
+ * We go through our list of requests to replay and send them to server one by
+ * one.
+ * After sending all request from the list we change import state to
+ * IMP_REPLAY_LOCKS and re-request all the locks we believe we have from server
+ * and also all the locks we don't yet have and wait for server to grant us.
+ * After that we send a special "replay completed" request and change import
+ * state to IMP_REPLAY_WAIT.
+ * Upon receiving reply to that "replay completed" RPC we enter IMP_RECOVER
+ * state and resend all requests from sending list.
+ * After that we promote import to FULL state and send all delayed requests
+ * and import is fully operational after that.
+ *
+ */
+int ptlrpc_import_recovery_state_machine(struct obd_import *imp)
+{
+	int rc = 0;
+	int inflight;
+	char *target_start;
+	int target_len;
+
+	ENTRY;
+	if (imp->imp_state == LUSTRE_IMP_EVICTED) {
+		deuuidify(obd2cli_tgt(imp->imp_obd), NULL,
+			  &target_start, &target_len);
+		/* Don't care about MGC eviction */
+		if (strcmp(imp->imp_obd->obd_type->typ_name,
+			   LUSTRE_MGC_NAME) != 0) {
+			LCONSOLE_ERROR_MSG(0x167, "%s: This client was evicted "
+					   "by %.*s; in progress operations "
+					   "using this service will fail.\n",
+					   imp->imp_obd->obd_name, target_len,
+					   target_start);
+		}
+		CDEBUG(D_HA, "evicted from %s@%s; invalidating\n",
+		       obd2cli_tgt(imp->imp_obd),
+		       imp->imp_connection->c_remote_uuid.uuid);
+		/* reset vbr_failed flag upon eviction */
+		spin_lock(&imp->imp_lock);
+		imp->imp_vbr_failed = 0;
+		spin_unlock(&imp->imp_lock);
+
+		{
+		task_t *task;
+		/* bug 17802:  XXX client_disconnect_export vs connect request
+		 * race. if client will evicted at this time, we start
+		 * invalidate thread without reference to import and import can
+		 * be freed at same time. */
+		class_import_get(imp);
+		task = kthread_run(ptlrpc_invalidate_import_thread, imp,
+				     "ll_imp_inval");
+		if (IS_ERR(task)) {
+			class_import_put(imp);
+			CERROR("error starting invalidate thread: %d\n", rc);
+			rc = PTR_ERR(task);
+		} else {
+			rc = 0;
+		}
+		RETURN(rc);
+		}
+	}
+
+	if (imp->imp_state == LUSTRE_IMP_REPLAY) {
+		CDEBUG(D_HA, "replay requested by %s\n",
+		       obd2cli_tgt(imp->imp_obd));
+		rc = ptlrpc_replay_next(imp, &inflight);
+		if (inflight == 0 &&
+		    atomic_read(&imp->imp_replay_inflight) == 0) {
+			IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY_LOCKS);
+			rc = ldlm_replay_locks(imp);
+			if (rc)
+				GOTO(out, rc);
+		}
+		rc = 0;
+	}
+
+	if (imp->imp_state == LUSTRE_IMP_REPLAY_LOCKS) {
+		if (atomic_read(&imp->imp_replay_inflight) == 0) {
+			IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY_WAIT);
+			rc = signal_completed_replay(imp);
+			if (rc)
+				GOTO(out, rc);
+		}
+
+	}
+
+	if (imp->imp_state == LUSTRE_IMP_REPLAY_WAIT) {
+		if (atomic_read(&imp->imp_replay_inflight) == 0) {
+			IMPORT_SET_STATE(imp, LUSTRE_IMP_RECOVER);
+		}
+	}
+
+	if (imp->imp_state == LUSTRE_IMP_RECOVER) {
+		CDEBUG(D_HA, "reconnected to %s@%s\n",
+		       obd2cli_tgt(imp->imp_obd),
+		       imp->imp_connection->c_remote_uuid.uuid);
+
+		rc = ptlrpc_resend(imp);
+		if (rc)
+			GOTO(out, rc);
+		IMPORT_SET_STATE(imp, LUSTRE_IMP_FULL);
+		ptlrpc_activate_import(imp);
+
+		deuuidify(obd2cli_tgt(imp->imp_obd), NULL,
+			  &target_start, &target_len);
+		LCONSOLE_INFO("%s: Connection restored to %.*s (at %s)\n",
+			      imp->imp_obd->obd_name,
+			      target_len, target_start,
+			      libcfs_nid2str(imp->imp_connection->c_peer.nid));
+	}
+
+	if (imp->imp_state == LUSTRE_IMP_FULL) {
+		wake_up_all(&imp->imp_recovery_waitq);
+		ptlrpc_wake_delayed(imp);
+	}
+
+out:
+	RETURN(rc);
+}
+
+/**
+ * Disconnect import \a imp from its target.
+ *
+ * Unless the obd is being forced (imp->imp_obd->obd_force), this waits a
+ * bounded time for any recovery in progress to finish and, if the import is
+ * then in LUSTRE_IMP_FULL state, sends a DISCONNECT RPC matching
+ * imp->imp_connect_op and waits for it.
+ *
+ * In every case except an unknown connect opcode, the import state is finally
+ * set to LUSTRE_IMP_DISCON (when \a noclose is non-zero) or LUSTRE_IMP_CLOSED
+ * and imp_remote_handle is zeroed, under imp_lock.
+ *
+ * Returns -EINVAL for an unknown imp_connect_op (state is left untouched);
+ * otherwise rc, which holds the result of ptlrpc_queue_wait() when the RPC
+ * was sent.
+ */
+int ptlrpc_disconnect_import(struct obd_import *imp, int noclose)
+{
+	struct ptlrpc_request *req;
+	int rq_opc, rc = 0;
+	int nowait = imp->imp_obd->obd_force;
+	ENTRY;
+
+	/* forced: send nothing, only update the local import state */
+	if (nowait)
+		GOTO(set_state, rc);
+
+	switch (imp->imp_connect_op) {
+	case OST_CONNECT: rq_opc = OST_DISCONNECT; break;
+	case MDS_CONNECT: rq_opc = MDS_DISCONNECT; break;
+	case MGS_CONNECT: rq_opc = MGS_DISCONNECT; break;
+	default:
+		CERROR("don't know how to disconnect from %s (connect_op %d)\n",
+		       obd2cli_tgt(imp->imp_obd), imp->imp_connect_op);
+		RETURN(-EINVAL);
+	}
+
+	/* recovery in progress: wait for it to finish, but not forever */
+	if (ptlrpc_import_in_recovery(imp)) {
+		struct l_wait_info lwi;
+		cfs_duration_t timeout;
+
+
+		/* without adaptive timeouts use obd_timeout (halved when
+		 * imp_server_timeout is set); otherwise use the service
+		 * estimate kept for this import's request portal */
+		if (AT_OFF) {
+			if (imp->imp_server_timeout)
+				timeout = cfs_time_seconds(obd_timeout / 2);
+			else
+				timeout = cfs_time_seconds(obd_timeout);
+		} else {
+			int idx = import_at_get_index(imp,
+				imp->imp_client->cli_request_portal);
+			timeout = cfs_time_seconds(
+				at_get(&imp->imp_at.iat_service_estimate[idx]));
+		}
+
+		lwi = LWI_TIMEOUT_INTR(cfs_timeout_cap(timeout),
+				       back_to_sleep, LWI_ON_SIGNAL_NOOP, NULL);
+		/* NOTE(review): rc from the wait is only returned if no RPC
+		 * is sent below; a timeout here does not abort the
+		 * disconnect - confirm that is intended. */
+		rc = l_wait_event(imp->imp_recovery_waitq,
+				  !ptlrpc_import_in_recovery(imp), &lwi);
+
+	}
+
+	/* Not FULL: skip the RPC.  This jumps to "out" with imp_lock still
+	 * held, which is why "out" sits after the spin_lock() at set_state. */
+	spin_lock(&imp->imp_lock);
+	if (imp->imp_state != LUSTRE_IMP_FULL)
+		GOTO(out, 0);
+
+	spin_unlock(&imp->imp_lock);
+
+	/* NOTE(review): the RQF_MDS_DISCONNECT format is used for all three
+	 * opcodes; a NULL req (allocation failure) is not reported and rc is
+	 * left unchanged in that case. */
+	req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_DISCONNECT,
+					LUSTRE_OBD_VERSION, rq_opc);
+	if (req) {
+		/* We are disconnecting, do not retry a failed DISCONNECT rpc if
+		 * it fails.  We can get through the above with a down server
+		 * if the client doesn't know the server is gone yet. */
+		req->rq_no_resend = 1;
+
+		/* We want client umounts to happen quickly, no matter the
+		   server state... */
+		req->rq_timeout = min_t(int, req->rq_timeout,
+					INITIAL_CONNECT_TIMEOUT);
+
+		IMPORT_SET_STATE(imp, LUSTRE_IMP_CONNECTING);
+		req->rq_send_state =  LUSTRE_IMP_CONNECTING;
+		ptlrpc_request_set_replen(req);
+		rc = ptlrpc_queue_wait(req);
+		ptlrpc_req_finished(req);
+	}
+
+set_state:
+	spin_lock(&imp->imp_lock);
+out:
+	/* imp_lock is held on every path reaching this point */
+	if (noclose)
+		IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_DISCON);
+	else
+		IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_CLOSED);
+	memset(&imp->imp_remote_handle, 0, sizeof(imp->imp_remote_handle));
+	spin_unlock(&imp->imp_lock);
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(ptlrpc_disconnect_import);
+
+/**
+ * Tear down import \a imp without talking to the server: under imp_lock the
+ * state is set to LUSTRE_IMP_CLOSED and imp_generation is incremented, then
+ * ptlrpc_abort_inflight() is called on the import after the lock is dropped.
+ */
+void ptlrpc_cleanup_imp(struct obd_import *imp)
+{
+	ENTRY;
+
+	spin_lock(&imp->imp_lock);
+	IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_CLOSED);
+	imp->imp_generation++;
+	spin_unlock(&imp->imp_lock);
+	ptlrpc_abort_inflight(imp);
+
+	EXIT;
+}
+EXPORT_SYMBOL(ptlrpc_cleanup_imp);
+
+/* Adaptive Timeout utils */
+extern unsigned int at_min, at_max, at_history;
+
+/**
+ * Feed a new measurement \a val into the adaptive timeout \a at.
+ *
+ * Bin into timeslices using AT_BINS bins.
+ * This gives us a max of the last binlimit*AT_BINS secs without the storage,
+ * but still smoothing out a return to normalcy from a slow response.
+ * (E.g. remember the maximum latency in each minute of the last 4 minutes.)
+ *
+ * \a at must not be NULL.  A \a val of 0 is ignored.  The resulting
+ * at_current is clamped to at_max (when at_max > 0) and raised to at least
+ * at_min.
+ *
+ * Returns the previous at_current value if this call changed it, and 0
+ * otherwise (including when \a val is 0).
+ */
+int at_measured(struct adaptive_timeout *at, unsigned int val)
+{
+	unsigned int old;
+	time_t now = cfs_time_current_sec();
+	time_t binlimit = max_t(time_t, at_history / AT_BINS, 1);
+
+	/* check the pointer before it is dereferenced for the first time */
+	LASSERT(at);
+	old = at->at_current;
+	CDEBUG(D_OTHER, "add %u to %p time=%lu v=%u (%u %u %u %u)\n",
+	       val, at, now - at->at_binstart, at->at_current,
+	       at->at_hist[0], at->at_hist[1], at->at_hist[2], at->at_hist[3]);
+
+	if (val == 0)
+		/* 0's don't count, because we never want our timeout to
+		   drop to 0, and because 0 could mean an error */
+		return 0;
+
+	spin_lock(&at->at_lock);
+
+	if (unlikely(at->at_binstart == 0)) {
+		/* Special case to remove default from history */
+		at->at_current = val;
+		at->at_worst_ever = val;
+		at->at_worst_time = now;
+		at->at_hist[0] = val;
+		at->at_binstart = now;
+	} else if (now - at->at_binstart < binlimit) {
+		/* in bin 0 */
+		at->at_hist[0] = max(val, at->at_hist[0]);
+		at->at_current = max(val, at->at_current);
+	} else {
+		int i, shift;
+		unsigned int maxv = val;
+		/* move bins over: walk from the oldest bin down so each
+		 * source slot is read before it is overwritten */
+		shift = (now - at->at_binstart) / binlimit;
+		LASSERT(shift > 0);
+		for (i = AT_BINS - 1; i >= 0; i--) {
+			if (i >= shift) {
+				at->at_hist[i] = at->at_hist[i - shift];
+				maxv = max(maxv, at->at_hist[i]);
+			} else {
+				at->at_hist[i] = 0;
+			}
+		}
+		at->at_hist[0] = val;
+		at->at_current = maxv;
+		at->at_binstart += shift * binlimit;
+	}
+
+	if (at->at_current > at->at_worst_ever) {
+		at->at_worst_ever = at->at_current;
+		at->at_worst_time = now;
+	}
+
+	if (at->at_flags & AT_FLG_NOHIST)
+		/* Only keep last reported val; keeping the rest of the history
+		   for proc only */
+		at->at_current = val;
+
+	if (at_max > 0)
+		at->at_current =  min(at->at_current, at_max);
+	at->at_current =  max(at->at_current, at_min);
+
+	if (at->at_current != old)
+		CDEBUG(D_OTHER, "AT %p change: old=%u new=%u delta=%d "
+		       "(val=%u) hist %u %u %u %u\n", at,
+		       old, at->at_current, at->at_current - old, val,
+		       at->at_hist[0], at->at_hist[1], at->at_hist[2],
+		       at->at_hist[3]);
+
+	/* if we changed, report the old value */
+	old = (at->at_current != old) ? old : 0;
+
+	spin_unlock(&at->at_lock);
+	return old;
+}
+
+/*
+ * Return the slot of imp->imp_at.iat_portal[] that holds \a portal.  If the
+ * portal is not recorded yet, claim the first unused (zero) slot for it.
+ * Asserts if the table is full.
+ */
+int import_at_get_index(struct obd_import *imp, int portal)
+{
+	struct imp_at *iat = &imp->imp_at;
+	int idx = 0;
+
+	/* First pass without the lock: stop at a match or at a free slot */
+	while (idx < IMP_AT_MAX_PORTALS) {
+		if (iat->iat_portal[idx] == portal)
+			return idx;
+		if (iat->iat_portal[idx] == 0)
+			break;
+		idx++;
+	}
+
+	/* No match so far; resume the scan from the same slot, now locked,
+	 * since the table may have gained entries in the meantime */
+	spin_lock(&imp->imp_lock);
+
+	while (idx < IMP_AT_MAX_PORTALS &&
+	       iat->iat_portal[idx] != portal &&
+	       iat->iat_portal[idx] != 0)
+		idx++;
+
+	/* Not enough portals? */
+	LASSERT(idx < IMP_AT_MAX_PORTALS);
+
+	/* Stopped on a free slot rather than on a match: take it */
+	if (iat->iat_portal[idx] != portal)
+		iat->iat_portal[idx] = portal;
+
+	spin_unlock(&imp->imp_lock);
+	return idx;
+}

diff --git a/drivers/staging/lustre/lustre/ptlrpc/layout.c b/drivers/staging/lustre/lustre/ptlrpc/layout.c
new file mode 100644
index 0000000..2f55ce2
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/layout.c

@@ -0,0 +1,2396 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/layout.c
+ *
+ * Lustre Metadata Target (mdt) request handler
+ *
+ * Author: Nikita Danilov <nikita@clusterfs.com>
+ */
+/*
+ * This file contains the "capsule/pill" abstraction layered above PTLRPC.
+ *
+ * Every struct ptlrpc_request contains a "pill", which points to a description
+ * of the format that the request conforms to.
+ */
+
+#if !defined(__REQ_LAYOUT_USER__)
+
+#define DEBUG_SUBSYSTEM S_RPC
+
+#include <linux/module.h>
+
+/* LUSTRE_VERSION_CODE */
+#include <lustre_ver.h>
+
+#include <obd_support.h>
+/* lustre_swab_mdt_body */
+#include <lustre/lustre_idl.h>
+/* obd2cli_tgt() (required by DEBUG_REQ()) */
+#include <obd.h>
+
+/* __REQ_LAYOUT_USER__ */
+#endif
+/* struct ptlrpc_request, lustre_msg* */
+#include <lustre_req_layout.h>
+#include <lustre_update.h>
+#include <lustre_acl.h>
+#include <lustre_debug.h>
+
+/*
+ * RQFs (see below) refer to two struct req_msg_field arrays describing the
+ * client request and server reply, respectively.
+ *
+ * Each array below is a list of pointers to RMF_* field descriptors and
+ * starts with &RMF_PTLRPC_BODY.
+ */
+/* empty set of fields... for suitable definition of emptiness. */
+static const struct req_msg_field *empty[] = {
+	&RMF_PTLRPC_BODY
+};
+
+static const struct req_msg_field *mgs_target_info_only[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_MGS_TARGET_INFO
+};
+
+static const struct req_msg_field *mgs_set_info[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_MGS_SEND_PARAM
+};
+
+static const struct req_msg_field *mgs_config_read_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_MGS_CONFIG_BODY
+};
+
+static const struct req_msg_field *mgs_config_read_server[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_MGS_CONFIG_RES
+};
+
+static const struct req_msg_field *log_cancel_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_LOGCOOKIES
+};
+
+static const struct req_msg_field *mdt_body_only[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_MDT_BODY
+};
+
+/* mdt_body_only[] plus one capability field */
+static const struct req_msg_field *mdt_body_capa[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_MDT_BODY,
+	&RMF_CAPA1
+};
+
+static const struct req_msg_field *quotactl_only[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_OBD_QUOTACTL
+};
+
+static const struct req_msg_field *quota_body_only[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_QUOTA_BODY
+};
+
+static const struct req_msg_field *ldlm_intent_quota_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_DLM_REQ,
+	&RMF_LDLM_INTENT,
+	&RMF_QUOTA_BODY
+};
+
+static const struct req_msg_field *ldlm_intent_quota_server[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_DLM_REP,
+	&RMF_DLM_LVB,
+	&RMF_QUOTA_BODY
+};
+
+static const struct req_msg_field *mdt_close_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_MDT_EPOCH,
+	&RMF_REC_REINT,
+	&RMF_CAPA1
+};
+
+static const struct req_msg_field *obd_statfs_server[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_OBD_STATFS
+};
+
+static const struct req_msg_field *seq_query_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_SEQ_OPC,
+	&RMF_SEQ_RANGE
+};
+
+static const struct req_msg_field *seq_query_server[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_SEQ_RANGE
+};
+
+static const struct req_msg_field *fld_query_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_FLD_OPC,
+	&RMF_FLD_MDFLD
+};
+
+static const struct req_msg_field *fld_query_server[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_FLD_MDFLD
+};
+
+/* mdt_body_capa[] followed by a name string */
+static const struct req_msg_field *mds_getattr_name_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_MDT_BODY,
+	&RMF_CAPA1,
+	&RMF_NAME
+};
+
+/*
+ * Field lists for the mds_reint_* formats.  All client lists start with
+ * &RMF_PTLRPC_BODY, &RMF_REC_REINT.
+ */
+static const struct req_msg_field *mds_reint_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_REC_REINT
+};
+
+static const struct req_msg_field *mds_reint_create_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_REC_REINT,
+	&RMF_CAPA1,
+	&RMF_NAME
+};
+
+static const struct req_msg_field *mds_reint_create_slave_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_REC_REINT,
+	&RMF_CAPA1,
+	&RMF_NAME,
+	&RMF_EADATA,
+	&RMF_DLM_REQ
+};
+
+/* Same field list as mds_reint_create_slave_client[]. */
+static const struct req_msg_field *mds_reint_create_rmt_acl_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_REC_REINT,
+	&RMF_CAPA1,
+	&RMF_NAME,
+	&RMF_EADATA,
+	&RMF_DLM_REQ
+};
+
+/* As mds_reint_create_slave_client[], with RMF_SYMTGT in place of RMF_EADATA */
+static const struct req_msg_field *mds_reint_create_sym_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_REC_REINT,
+	&RMF_CAPA1,
+	&RMF_NAME,
+	&RMF_SYMTGT,
+	&RMF_DLM_REQ
+};
+
+static const struct req_msg_field *mds_reint_open_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_REC_REINT,
+	&RMF_CAPA1,
+	&RMF_CAPA2,
+	&RMF_NAME,
+	&RMF_EADATA
+};
+
+static const struct req_msg_field *mds_reint_open_server[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_MDT_BODY,
+	&RMF_MDT_MD,
+	&RMF_ACL,
+	&RMF_CAPA1,
+	&RMF_CAPA2
+};
+
+static const struct req_msg_field *mds_reint_unlink_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_REC_REINT,
+	&RMF_CAPA1,
+	&RMF_NAME,
+	&RMF_DLM_REQ
+};
+
+static const struct req_msg_field *mds_reint_link_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_REC_REINT,
+	&RMF_CAPA1,
+	&RMF_CAPA2,
+	&RMF_NAME,
+	&RMF_DLM_REQ
+};
+
+/* As mds_reint_link_client[], with RMF_SYMTGT inserted after RMF_NAME */
+static const struct req_msg_field *mds_reint_rename_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_REC_REINT,
+	&RMF_CAPA1,
+	&RMF_CAPA2,
+	&RMF_NAME,
+	&RMF_SYMTGT,
+	&RMF_DLM_REQ
+};
+
+/* As mds_reint_open_server[], with RMF_LOGCOOKIES in place of RMF_ACL */
+static const struct req_msg_field *mds_last_unlink_server[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_MDT_BODY,
+	&RMF_MDT_MD,
+	&RMF_LOGCOOKIES,
+	&RMF_CAPA1,
+	&RMF_CAPA2
+};
+
+static const struct req_msg_field *mds_reint_setattr_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_REC_REINT,
+	&RMF_CAPA1,
+	&RMF_MDT_EPOCH,
+	&RMF_EADATA,
+	&RMF_LOGCOOKIES,
+	&RMF_DLM_REQ
+};
+
+static const struct req_msg_field *mds_reint_setxattr_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_REC_REINT,
+	&RMF_CAPA1,
+	&RMF_NAME,
+	&RMF_EADATA
+};
+
+static const struct req_msg_field *mdt_swap_layouts[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_MDT_BODY,
+	&RMF_SWAP_LAYOUTS,
+	&RMF_CAPA1,
+	&RMF_CAPA2,
+	&RMF_DLM_REQ
+};
+
+/* obd_connect_*: field lists for the connect request and its reply */
+static const struct req_msg_field *obd_connect_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_TGTUUID,
+	&RMF_CLUUID,
+	&RMF_CONN,
+	&RMF_CONNECT_DATA
+};
+
+static const struct req_msg_field *obd_connect_server[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_CONNECT_DATA
+};
+
+static const struct req_msg_field *obd_set_info_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_SETINFO_KEY,
+	&RMF_SETINFO_VAL
+};
+
+/* As obd_set_info_client[], with RMF_OST_BODY in place of RMF_SETINFO_VAL */
+static const struct req_msg_field *ost_grant_shrink_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_SETINFO_KEY,
+	&RMF_OST_BODY
+};
+
+static const struct req_msg_field *mds_getinfo_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_GETINFO_KEY,
+	&RMF_GETINFO_VALLEN
+};
+
+static const struct req_msg_field *mds_getinfo_server[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_GETINFO_VAL,
+};
+
+/* ldlm_*: field lists for lock enqueue and lock callbacks */
+static const struct req_msg_field *ldlm_enqueue_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_DLM_REQ
+};
+
+static const struct req_msg_field *ldlm_enqueue_server[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_DLM_REP
+};
+
+/* ldlm_enqueue_server[] plus a trailing RMF_DLM_LVB */
+static const struct req_msg_field *ldlm_enqueue_lvb_server[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_DLM_REP,
+	&RMF_DLM_LVB
+};
+
+static const struct req_msg_field *ldlm_cp_callback_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_DLM_REQ,
+	&RMF_DLM_LVB
+};
+
+static const struct req_msg_field *ldlm_gl_callback_desc_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_DLM_REQ,
+	&RMF_DLM_GL_DESC
+};
+
+static const struct req_msg_field *ldlm_gl_callback_server[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_DLM_LVB
+};
+
+/*
+ * ldlm_intent_*: every client list starts with &RMF_PTLRPC_BODY,
+ * &RMF_DLM_REQ, &RMF_LDLM_INTENT; every server list starts with
+ * &RMF_PTLRPC_BODY, &RMF_DLM_REP.
+ */
+static const struct req_msg_field *ldlm_intent_basic_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_DLM_REQ,
+	&RMF_LDLM_INTENT,
+};
+
+static const struct req_msg_field *ldlm_intent_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_DLM_REQ,
+	&RMF_LDLM_INTENT,
+	&RMF_REC_REINT
+};
+
+static const struct req_msg_field *ldlm_intent_server[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_DLM_REP,
+	&RMF_MDT_BODY,
+	&RMF_MDT_MD,
+	&RMF_ACL
+};
+
+static const struct req_msg_field *ldlm_intent_layout_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_DLM_REQ,
+	&RMF_LDLM_INTENT,
+	&RMF_LAYOUT_INTENT,
+	&RMF_EADATA /* for new layout to be set up */
+};
+/* ldlm_intent_server[] plus two capability fields */
+static const struct req_msg_field *ldlm_intent_open_server[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_DLM_REP,
+	&RMF_MDT_BODY,
+	&RMF_MDT_MD,
+	&RMF_ACL,
+	&RMF_CAPA1,
+	&RMF_CAPA2
+};
+
+static const struct req_msg_field *ldlm_intent_getattr_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_DLM_REQ,
+	&RMF_LDLM_INTENT,
+	&RMF_MDT_BODY,     /* coincides with mds_getattr_name_client[] */
+	&RMF_CAPA1,
+	&RMF_NAME
+};
+
+/* ldlm_intent_server[] plus one capability field */
+static const struct req_msg_field *ldlm_intent_getattr_server[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_DLM_REP,
+	&RMF_MDT_BODY,
+	&RMF_MDT_MD,
+	&RMF_ACL,
+	&RMF_CAPA1
+};
+
+/*
+ * NOTE(review): unlike mds_reint_create_client[], which ends at RMF_NAME,
+ * this list carries a trailing RMF_EADATA.
+ */
+static const struct req_msg_field *ldlm_intent_create_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_DLM_REQ,
+	&RMF_LDLM_INTENT,
+	&RMF_REC_REINT,    /* coincides with mds_reint_create_client[] */
+	&RMF_CAPA1,
+	&RMF_NAME,
+	&RMF_EADATA
+};
+
+static const struct req_msg_field *ldlm_intent_open_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_DLM_REQ,
+	&RMF_LDLM_INTENT,
+	&RMF_REC_REINT,    /* coincides with mds_reint_open_client[] */
+	&RMF_CAPA1,
+	&RMF_CAPA2,
+	&RMF_NAME,
+	&RMF_EADATA
+};
+
+/*
+ * NOTE(review): mds_reint_unlink_client[] ends with RMF_DLM_REQ, which this
+ * list does not repeat at the end.
+ */
+static const struct req_msg_field *ldlm_intent_unlink_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_DLM_REQ,
+	&RMF_LDLM_INTENT,
+	&RMF_REC_REINT,    /* coincides with mds_reint_unlink_client[] */
+	&RMF_CAPA1,
+	&RMF_NAME
+};
+
+/* mds_getxattr_*: field lists for the getxattr request and its reply */
+static const struct req_msg_field *mds_getxattr_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_MDT_BODY,
+	&RMF_CAPA1,
+	&RMF_NAME,
+	&RMF_EADATA
+};
+
+static const struct req_msg_field *mds_getxattr_server[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_MDT_BODY,
+	&RMF_EADATA
+};
+
+static const struct req_msg_field *mds_getattr_server[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_MDT_BODY,
+	&RMF_MDT_MD,
+	&RMF_ACL,
+	&RMF_CAPA1,
+	&RMF_CAPA2
+};
+
+/* Same field list as mds_getattr_server[]. */
+static const struct req_msg_field *mds_setattr_server[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_MDT_BODY,
+	&RMF_MDT_MD,
+	&RMF_ACL,
+	&RMF_CAPA1,
+	&RMF_CAPA2
+};
+
+static const struct req_msg_field *mds_update_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_UPDATE,
+};
+
+static const struct req_msg_field *mds_update_server[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_UPDATE_REPLY,
+};
+
+/* llog*: field lists built around RMF_LLOGD_BODY and related descriptors */
+static const struct req_msg_field *llog_origin_handle_create_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_LLOGD_BODY,
+	&RMF_NAME
+};
+
+static const struct req_msg_field *llogd_body_only[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_LLOGD_BODY
+};
+
+static const struct req_msg_field *llog_log_hdr_only[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_LLOG_LOG_HDR
+};
+
+static const struct req_msg_field *llogd_conn_body_only[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_LLOGD_CONN_BODY
+};
+
+static const struct req_msg_field *llog_origin_handle_next_block_server[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_LLOGD_BODY,
+	&RMF_EADATA
+};
+
+static const struct req_msg_field *obd_idx_read_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_IDX_INFO
+};
+
+/* Same field list as obd_idx_read_client[]. */
+static const struct req_msg_field *obd_idx_read_server[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_IDX_INFO
+};
+
+/* ost_*: field lists built around RMF_OST_BODY and the OST get_info keys */
+static const struct req_msg_field *ost_body_only[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_OST_BODY
+};
+
+static const struct req_msg_field *ost_body_capa[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_OST_BODY,
+	&RMF_CAPA1
+};
+
+static const struct req_msg_field *ost_destroy_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_OST_BODY,
+	&RMF_DLM_REQ,
+	&RMF_CAPA1
+};
+
+
+static const struct req_msg_field *ost_brw_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_OST_BODY,
+	&RMF_OBD_IOOBJ,
+	&RMF_NIOBUF_REMOTE,
+	&RMF_CAPA1
+};
+
+/* Same field list as ost_body_only[]. */
+static const struct req_msg_field *ost_brw_read_server[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_OST_BODY
+};
+
+static const struct req_msg_field *ost_brw_write_server[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_OST_BODY,
+	&RMF_RCS
+};
+
+static const struct req_msg_field *ost_get_info_generic_server[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_GENERIC_DATA,
+};
+
+static const struct req_msg_field *ost_get_info_generic_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_SETINFO_KEY
+};
+
+static const struct req_msg_field *ost_get_last_id_server[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_OBD_ID
+};
+
+static const struct req_msg_field *ost_get_last_fid_server[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_FID,
+};
+
+static const struct req_msg_field *ost_get_fiemap_client[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_FIEMAP_KEY,
+	&RMF_FIEMAP_VAL
+};
+
+static const struct req_msg_field *ost_get_fiemap_server[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_FIEMAP_VAL
+};
+
+/*
+ * mdt_hsm_*: every list starts with &RMF_PTLRPC_BODY, &RMF_MDT_BODY.
+ */
+static const struct req_msg_field *mdt_hsm_progress[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_MDT_BODY,
+	&RMF_MDS_HSM_PROGRESS,
+};
+
+static const struct req_msg_field *mdt_hsm_ct_register[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_MDT_BODY,
+	&RMF_MDS_HSM_ARCHIVE,
+};
+
+/* Carries nothing beyond the common RMF_PTLRPC_BODY, RMF_MDT_BODY pair. */
+static const struct req_msg_field *mdt_hsm_ct_unregister[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_MDT_BODY,
+};
+
+static const struct req_msg_field *mdt_hsm_action_server[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_MDT_BODY,
+	&RMF_MDS_HSM_CURRENT_ACTION,
+};
+
+static const struct req_msg_field *mdt_hsm_state_get_server[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_MDT_BODY,
+	&RMF_HSM_USER_STATE,
+};
+
+static const struct req_msg_field *mdt_hsm_state_set[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_MDT_BODY,
+	&RMF_CAPA1,
+	&RMF_HSM_STATE_SET,
+};
+
+static const struct req_msg_field *mdt_hsm_request[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_MDT_BODY,
+	&RMF_MDS_HSM_REQUEST,
+	&RMF_MDS_HSM_USER_ITEM,
+	&RMF_GENERIC_DATA,
+};
+
+/*
+ * Table of pointers to the RQF_* request formats.
+ *
+ * ARRAY_SIZE(req_formats) is used as the first dimension of
+ * req_msg_field::rmf_offset, so this table has to be defined before
+ * struct req_msg_field and adding an entry here enlarges every field
+ * descriptor.
+ */
+static struct req_format *req_formats[] = {
+	&RQF_OBD_PING,
+	&RQF_OBD_SET_INFO,
+	&RQF_OBD_IDX_READ,
+	&RQF_SEC_CTX,
+	&RQF_MGS_TARGET_REG,
+	&RQF_MGS_SET_INFO,
+	&RQF_MGS_CONFIG_READ,
+	&RQF_SEQ_QUERY,
+	&RQF_FLD_QUERY,
+	&RQF_MDS_CONNECT,
+	&RQF_MDS_DISCONNECT,
+	&RQF_MDS_GET_INFO,
+	&RQF_MDS_GETSTATUS,
+	&RQF_MDS_STATFS,
+	&RQF_MDS_GETATTR,
+	&RQF_MDS_GETATTR_NAME,
+	&RQF_MDS_GETXATTR,
+	&RQF_MDS_SYNC,
+	&RQF_MDS_CLOSE,
+	&RQF_MDS_PIN,
+	&RQF_MDS_UNPIN,
+	&RQF_MDS_READPAGE,
+	&RQF_MDS_WRITEPAGE,
+	&RQF_MDS_IS_SUBDIR,
+	&RQF_MDS_DONE_WRITING,
+	&RQF_MDS_REINT,
+	&RQF_MDS_REINT_CREATE,
+	&RQF_MDS_REINT_CREATE_RMT_ACL,
+	&RQF_MDS_REINT_CREATE_SLAVE,
+	&RQF_MDS_REINT_CREATE_SYM,
+	&RQF_MDS_REINT_OPEN,
+	&RQF_MDS_REINT_UNLINK,
+	&RQF_MDS_REINT_LINK,
+	&RQF_MDS_REINT_RENAME,
+	&RQF_MDS_REINT_SETATTR,
+	&RQF_MDS_REINT_SETXATTR,
+	&RQF_MDS_QUOTACHECK,
+	&RQF_MDS_QUOTACTL,
+	&RQF_MDS_HSM_PROGRESS,
+	&RQF_MDS_HSM_CT_REGISTER,
+	&RQF_MDS_HSM_CT_UNREGISTER,
+	&RQF_MDS_HSM_STATE_GET,
+	&RQF_MDS_HSM_STATE_SET,
+	&RQF_MDS_HSM_ACTION,
+	&RQF_MDS_HSM_REQUEST,
+	&RQF_MDS_SWAP_LAYOUTS,
+	&RQF_UPDATE_OBJ,
+	&RQF_QC_CALLBACK,
+	&RQF_OST_CONNECT,
+	&RQF_OST_DISCONNECT,
+	&RQF_OST_QUOTACHECK,
+	&RQF_OST_QUOTACTL,
+	&RQF_OST_GETATTR,
+	&RQF_OST_SETATTR,
+	&RQF_OST_CREATE,
+	&RQF_OST_PUNCH,
+	&RQF_OST_SYNC,
+	&RQF_OST_DESTROY,
+	&RQF_OST_BRW_READ,
+	&RQF_OST_BRW_WRITE,
+	&RQF_OST_STATFS,
+	&RQF_OST_SET_GRANT_INFO,
+	&RQF_OST_GET_INFO_GENERIC,
+	&RQF_OST_GET_INFO_LAST_ID,
+	&RQF_OST_GET_INFO_LAST_FID,
+	&RQF_OST_SET_INFO_LAST_FID,
+	&RQF_OST_GET_INFO_FIEMAP,
+	&RQF_LDLM_ENQUEUE,
+	&RQF_LDLM_ENQUEUE_LVB,
+	&RQF_LDLM_CONVERT,
+	&RQF_LDLM_CANCEL,
+	&RQF_LDLM_CALLBACK,
+	&RQF_LDLM_CP_CALLBACK,
+	&RQF_LDLM_BL_CALLBACK,
+	&RQF_LDLM_GL_CALLBACK,
+	&RQF_LDLM_GL_DESC_CALLBACK,
+	&RQF_LDLM_INTENT,
+	&RQF_LDLM_INTENT_BASIC,
+	&RQF_LDLM_INTENT_LAYOUT,
+	&RQF_LDLM_INTENT_GETATTR,
+	&RQF_LDLM_INTENT_OPEN,
+	&RQF_LDLM_INTENT_CREATE,
+	&RQF_LDLM_INTENT_UNLINK,
+	&RQF_LDLM_INTENT_QUOTA,
+	&RQF_QUOTA_DQACQ,
+	&RQF_LOG_CANCEL,
+	&RQF_LLOG_ORIGIN_HANDLE_CREATE,
+	&RQF_LLOG_ORIGIN_HANDLE_DESTROY,
+	&RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK,
+	&RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK,
+	&RQF_LLOG_ORIGIN_HANDLE_READ_HEADER,
+	&RQF_LLOG_ORIGIN_CONNECT
+};
+
+/**
+ * Descriptor of one message field (one RMF_* object).
+ */
+struct req_msg_field {
+	/** Bitmask of enum rmf_flags values. */
+	const __u32 rmf_flags;
+	/** Name string given to DEFINE_MSGF(). */
+	const char  *rmf_name;
+	/**
+	 * Field length. (-1) means "variable length".  If the
+	 * \a RMF_F_STRUCT_ARRAY flag is set the field is also variable-length,
+	 * but the actual size must be a whole multiple of \a rmf_size.
+	 */
+	const int   rmf_size;
+	/** Optional callback (may be NULL); presumably byte-swaps the field
+	 *  in place - TODO confirm against the callers. */
+	void	(*rmf_swabber)(void *);
+	/** Optional callback (may be NULL); presumably prints the field for
+	 *  debugging - TODO confirm against the callers. */
+	void	(*rmf_dumper)(void *);
+	/** One row per entry of req_formats[], RCL_NR columns.  Not set by
+	 *  DEFINE_MSGF(), so it starts out zeroed. */
+	int	 rmf_offset[ARRAY_SIZE(req_formats)][RCL_NR];
+};
+
+/**
+ * Flag bits stored in req_msg_field::rmf_flags; values may be OR-ed together.
+ */
+enum rmf_flags {
+	/**
+	 * The field is a string, must be NUL-terminated.
+	 */
+	RMF_F_STRING = 1 << 0,
+	/**
+	 * The field's buffer size need not match the declared \a rmf_size.
+	 */
+	RMF_F_NO_SIZE_CHECK = 1 << 1,
+	/**
+	 * The field's buffer size must be a whole multiple of the declared \a
+	 * rmf_size and the \a rmf_swabber function must work on the declared \a
+	 * rmf_size worth of bytes.
+	 */
+	RMF_F_STRUCT_ARRAY = 1 << 2
+};
+
+struct req_capsule;
+
+/*
+ * Request fields.
+ *
+ * DEFINE_MSGF() expands to a designated initializer for struct req_msg_field.
+ * The swabber and dumper arguments are cast to void (*)(void *), so typed
+ * callbacks (or NULL) can be passed directly.  rmf_offset is not named in the
+ * initializer and is therefore zero-initialized.
+ */
+#define DEFINE_MSGF(name, flags, size, swabber, dumper) {       \
+	.rmf_name    = (name),				  \
+	.rmf_flags   = (flags),				 \
+	.rmf_size    = (size),				  \
+	.rmf_swabber = (void (*)(void*))(swabber),	      \
+	.rmf_dumper  = (void (*)(void*))(dumper)		\
+}
+
+/*
+ * Exported field descriptors.  Arguments to DEFINE_MSGF() are: name, flags,
+ * size, swabber, dumper.  A size of -1 marks a variable-length field; every
+ * descriptor below passes NULL as the dumper.
+ */
+struct req_msg_field RMF_GENERIC_DATA =
+	DEFINE_MSGF("generic_data", 0,
+		    -1, NULL, NULL);
+EXPORT_SYMBOL(RMF_GENERIC_DATA);
+
+struct req_msg_field RMF_MGS_TARGET_INFO =
+	DEFINE_MSGF("mgs_target_info", 0,
+		    sizeof(struct mgs_target_info),
+		    lustre_swab_mgs_target_info, NULL);
+EXPORT_SYMBOL(RMF_MGS_TARGET_INFO);
+
+struct req_msg_field RMF_MGS_SEND_PARAM =
+	DEFINE_MSGF("mgs_send_param", 0,
+		    sizeof(struct mgs_send_param),
+		    NULL, NULL);
+EXPORT_SYMBOL(RMF_MGS_SEND_PARAM);
+
+struct req_msg_field RMF_MGS_CONFIG_BODY =
+	DEFINE_MSGF("mgs_config_read request", 0,
+		    sizeof(struct mgs_config_body),
+		    lustre_swab_mgs_config_body, NULL);
+EXPORT_SYMBOL(RMF_MGS_CONFIG_BODY);
+
+/* NOTE(review): the name string below ends with a trailing space. */
+struct req_msg_field RMF_MGS_CONFIG_RES =
+	DEFINE_MSGF("mgs_config_read reply ", 0,
+		    sizeof(struct mgs_config_res),
+		    lustre_swab_mgs_config_res, NULL);
+EXPORT_SYMBOL(RMF_MGS_CONFIG_RES);
+
+struct req_msg_field RMF_U32 =
+	DEFINE_MSGF("generic u32", 0,
+		    sizeof(__u32), lustre_swab_generic_32s, NULL);
+EXPORT_SYMBOL(RMF_U32);
+
+struct req_msg_field RMF_SETINFO_VAL =
+	DEFINE_MSGF("setinfo_val", 0, -1, NULL, NULL);
+EXPORT_SYMBOL(RMF_SETINFO_VAL);
+
+struct req_msg_field RMF_GETINFO_KEY =
+	DEFINE_MSGF("getinfo_key", 0, -1, NULL, NULL);
+EXPORT_SYMBOL(RMF_GETINFO_KEY);
+
+struct req_msg_field RMF_GETINFO_VALLEN =
+	DEFINE_MSGF("getinfo_vallen", 0,
+		    sizeof(__u32), lustre_swab_generic_32s, NULL);
+EXPORT_SYMBOL(RMF_GETINFO_VALLEN);
+
+struct req_msg_field RMF_GETINFO_VAL =
+	DEFINE_MSGF("getinfo_val", 0, -1, NULL, NULL);
+EXPORT_SYMBOL(RMF_GETINFO_VAL);
+
+struct req_msg_field RMF_SEQ_OPC =
+	DEFINE_MSGF("seq_query_opc", 0,
+		    sizeof(__u32), lustre_swab_generic_32s, NULL);
+EXPORT_SYMBOL(RMF_SEQ_OPC);
+
+struct req_msg_field RMF_SEQ_RANGE =
+	DEFINE_MSGF("seq_query_range", 0,
+		    sizeof(struct lu_seq_range),
+		    lustre_swab_lu_seq_range, NULL);
+EXPORT_SYMBOL(RMF_SEQ_RANGE);
+
+struct req_msg_field RMF_FLD_OPC =
+	DEFINE_MSGF("fld_query_opc", 0,
+		    sizeof(__u32), lustre_swab_generic_32s, NULL);
+EXPORT_SYMBOL(RMF_FLD_OPC);
+
+/* Same size and swabber (struct lu_seq_range) as RMF_SEQ_RANGE. */
+struct req_msg_field RMF_FLD_MDFLD =
+	DEFINE_MSGF("fld_query_mdfld", 0,
+		    sizeof(struct lu_seq_range),
+		    lustre_swab_lu_seq_range, NULL);
+EXPORT_SYMBOL(RMF_FLD_MDFLD);
+
+struct req_msg_field RMF_MDT_BODY =
+	DEFINE_MSGF("mdt_body", 0,
+		    sizeof(struct mdt_body), lustre_swab_mdt_body, NULL);
+EXPORT_SYMBOL(RMF_MDT_BODY);
+
+struct req_msg_field RMF_OBD_QUOTACTL =
+	DEFINE_MSGF("obd_quotactl", 0,
+		    sizeof(struct obd_quotactl),
+		    lustre_swab_obd_quotactl, NULL);
+EXPORT_SYMBOL(RMF_OBD_QUOTACTL);
+
+struct req_msg_field RMF_QUOTA_BODY =
+	DEFINE_MSGF("quota_body", 0,
+		    sizeof(struct quota_body), lustre_swab_quota_body, NULL);
+EXPORT_SYMBOL(RMF_QUOTA_BODY);
+
+struct req_msg_field RMF_MDT_EPOCH =
+	DEFINE_MSGF("mdt_ioepoch", 0,
+		    sizeof(struct mdt_ioepoch), lustre_swab_mdt_ioepoch, NULL);
+EXPORT_SYMBOL(RMF_MDT_EPOCH);
+
+struct req_msg_field RMF_PTLRPC_BODY =
+	DEFINE_MSGF("ptlrpc_body", 0,
+		    sizeof(struct ptlrpc_body), lustre_swab_ptlrpc_body, NULL);
+EXPORT_SYMBOL(RMF_PTLRPC_BODY);
+
+struct req_msg_field RMF_OBD_STATFS =
+	DEFINE_MSGF("obd_statfs", 0,
+		    sizeof(struct obd_statfs), lustre_swab_obd_statfs, NULL);
+EXPORT_SYMBOL(RMF_OBD_STATFS);
+
+struct req_msg_field RMF_SETINFO_KEY =
+	DEFINE_MSGF("setinfo_key", 0, -1, NULL, NULL);
+EXPORT_SYMBOL(RMF_SETINFO_KEY);
+
+struct req_msg_field RMF_NAME =
+	DEFINE_MSGF("name", RMF_F_STRING, -1, NULL, NULL);
+EXPORT_SYMBOL(RMF_NAME);
+
+struct req_msg_field RMF_SYMTGT =
+	DEFINE_MSGF("symtgt", RMF_F_STRING, -1, NULL, NULL);
+EXPORT_SYMBOL(RMF_SYMTGT);
+
+/* String fields whose declared size is sizeof(struct obd_uuid) - 1. */
+struct req_msg_field RMF_TGTUUID =
+	DEFINE_MSGF("tgtuuid", RMF_F_STRING, sizeof(struct obd_uuid) - 1, NULL,
+	NULL);
+EXPORT_SYMBOL(RMF_TGTUUID);
+
+struct req_msg_field RMF_CLUUID =
+	DEFINE_MSGF("cluuid", RMF_F_STRING, sizeof(struct obd_uuid) - 1, NULL,
+	NULL);
+EXPORT_SYMBOL(RMF_CLUUID);
+
+struct req_msg_field RMF_STRING =
+	DEFINE_MSGF("string", RMF_F_STRING, -1, NULL, NULL);
+EXPORT_SYMBOL(RMF_STRING);
+
+struct req_msg_field RMF_LLOGD_BODY =
+	DEFINE_MSGF("llogd_body", 0,
+		    sizeof(struct llogd_body), lustre_swab_llogd_body, NULL);
+EXPORT_SYMBOL(RMF_LLOGD_BODY);
+
+struct req_msg_field RMF_LLOG_LOG_HDR =
+	DEFINE_MSGF("llog_log_hdr", 0,
+		    sizeof(struct llog_log_hdr), lustre_swab_llog_hdr, NULL);
+EXPORT_SYMBOL(RMF_LLOG_LOG_HDR);
+
+struct req_msg_field RMF_LLOGD_CONN_BODY =
+	DEFINE_MSGF("llogd_conn_body", 0,
+		    sizeof(struct llogd_conn_body),
+		    lustre_swab_llogd_conn_body, NULL);
+EXPORT_SYMBOL(RMF_LLOGD_CONN_BODY);
+
+/*
+ * connection handle received in MDS_CONNECT request.
+ *
+ * No swabbing needed because struct lustre_handle contains only a 64-bit cookie
+ * that the client does not interpret at all.
+ */
+struct req_msg_field RMF_CONN =
+	DEFINE_MSGF("conn", 0, sizeof(struct lustre_handle), NULL, NULL);
+EXPORT_SYMBOL(RMF_CONN);
+
+struct req_msg_field RMF_CONNECT_DATA =
+	DEFINE_MSGF("cdata",
+		    RMF_F_NO_SIZE_CHECK /* we allow extra space for interop */,
+#if LUSTRE_VERSION_CODE > OBD_OCD_VERSION(2, 7, 50, 0)
+		    sizeof(struct obd_connect_data),
+#else
+/* For interoperability with 1.8 and 2.0 clients/servers.
+ * The RPC verification code allows larger RPC buffers, but not
+ * smaller buffers.  Until we no longer need to keep compatibility
+ * with older servers/clients we can only check that the buffer
+ * size is at least as large as obd_connect_data_v1.  That is not
+ * not in itself harmful, since the chance of just corrupting this
+ * field is low.  See JIRA LU-16 for details. */
+		    sizeof(struct obd_connect_data_v1),
+#endif
+		    lustre_swab_connect, NULL);
+EXPORT_SYMBOL(RMF_CONNECT_DATA);
+
+/*
+ * LDLM and MDT message fields.  Judging by the call sites, each DEFINE_MSGF
+ * takes (name, flags, size, swabber, dumper) -- the macro itself is defined
+ * outside this view, TODO confirm.  A size of -1 marks a field whose length
+ * must be supplied at run time (see req_capsule_filled_sizes()).
+ */
+struct req_msg_field RMF_DLM_REQ =
+	DEFINE_MSGF("dlm_req", RMF_F_NO_SIZE_CHECK /* ldlm_request_bufsize */,
+		    sizeof(struct ldlm_request),
+		    lustre_swab_ldlm_request, NULL);
+EXPORT_SYMBOL(RMF_DLM_REQ);
+
+struct req_msg_field RMF_DLM_REP =
+	DEFINE_MSGF("dlm_rep", 0,
+		    sizeof(struct ldlm_reply), lustre_swab_ldlm_reply, NULL);
+EXPORT_SYMBOL(RMF_DLM_REP);
+
+struct req_msg_field RMF_LDLM_INTENT =
+	DEFINE_MSGF("ldlm_intent", 0,
+		    sizeof(struct ldlm_intent), lustre_swab_ldlm_intent, NULL);
+EXPORT_SYMBOL(RMF_LDLM_INTENT);
+
+/* Run-time sized (-1), with no swabber and no dumper registered here. */
+struct req_msg_field RMF_DLM_LVB =
+	DEFINE_MSGF("dlm_lvb", 0, -1, NULL, NULL);
+EXPORT_SYMBOL(RMF_DLM_LVB);
+
+struct req_msg_field RMF_DLM_GL_DESC =
+	DEFINE_MSGF("dlm_gl_desc", 0, sizeof(union ldlm_gl_desc),
+		    lustre_swab_gl_desc, NULL);
+EXPORT_SYMBOL(RMF_DLM_GL_DESC);
+
+/* Declared size is MIN_MD_SIZE with RMF_F_NO_SIZE_CHECK; not swabbed here. */
+struct req_msg_field RMF_MDT_MD =
+	DEFINE_MSGF("mdt_md", RMF_F_NO_SIZE_CHECK, MIN_MD_SIZE, NULL, NULL);
+EXPORT_SYMBOL(RMF_MDT_MD);
+
+struct req_msg_field RMF_REC_REINT =
+	DEFINE_MSGF("rec_reint", 0, sizeof(struct mdt_rec_reint),
+		    lustre_swab_mdt_rec_reint, NULL);
+EXPORT_SYMBOL(RMF_REC_REINT);
+
+/* FIXME: this length should be defined as a macro */
+struct req_msg_field RMF_EADATA = DEFINE_MSGF("eadata", 0, -1,
+						    NULL, NULL);
+EXPORT_SYMBOL(RMF_EADATA);
+
+struct req_msg_field RMF_ACL =
+	DEFINE_MSGF("acl", RMF_F_NO_SIZE_CHECK,
+		    LUSTRE_POSIX_ACL_MAX_SIZE, NULL, NULL);
+EXPORT_SYMBOL(RMF_ACL);
+
+/* FIXME: this should be made to use RMF_F_STRUCT_ARRAY */
+struct req_msg_field RMF_LOGCOOKIES =
+	DEFINE_MSGF("logcookies", RMF_F_NO_SIZE_CHECK /* multiple cookies */,
+		    sizeof(struct llog_cookie), NULL, NULL);
+EXPORT_SYMBOL(RMF_LOGCOOKIES);
+
+/*
+ * RMF_CAPA1 and RMF_CAPA2 are two distinct fields with identical layout.
+ * NOTE(review): both are named "capa", so messages that print rmf_name
+ * cannot tell them apart -- confirm whether that is intended.
+ */
+struct req_msg_field RMF_CAPA1 =
+	DEFINE_MSGF("capa", 0, sizeof(struct lustre_capa),
+		    lustre_swab_lustre_capa, NULL);
+EXPORT_SYMBOL(RMF_CAPA1);
+
+struct req_msg_field RMF_CAPA2 =
+	DEFINE_MSGF("capa", 0, sizeof(struct lustre_capa),
+		    lustre_swab_lustre_capa, NULL);
+EXPORT_SYMBOL(RMF_CAPA2);
+
+struct req_msg_field RMF_LAYOUT_INTENT =
+	DEFINE_MSGF("layout_intent", 0,
+		    sizeof(struct layout_intent), lustre_swab_layout_intent,
+		    NULL);
+EXPORT_SYMBOL(RMF_LAYOUT_INTENT);
+
+/*
+ * OST request field.
+ *
+ * These are the fields in this part of the file that register a dumper
+ * (last DEFINE_MSGF argument), so __req_capsule_dump() can print them.
+ */
+struct req_msg_field RMF_OST_BODY =
+	DEFINE_MSGF("ost_body", 0,
+		    sizeof(struct ost_body), lustre_swab_ost_body, dump_ost_body);
+EXPORT_SYMBOL(RMF_OST_BODY);
+
+/*
+ * Array of struct obd_ioobj (RMF_F_STRUCT_ARRAY): the buffer length must be a
+ * multiple of the element size and every element is swabbed individually.
+ */
+struct req_msg_field RMF_OBD_IOOBJ =
+	DEFINE_MSGF("obd_ioobj", RMF_F_STRUCT_ARRAY,
+		    sizeof(struct obd_ioobj), lustre_swab_obd_ioobj, dump_ioo);
+EXPORT_SYMBOL(RMF_OBD_IOOBJ);
+
+/* Array of struct niobuf_remote, handled like RMF_OBD_IOOBJ. */
+struct req_msg_field RMF_NIOBUF_REMOTE =
+	DEFINE_MSGF("niobuf_remote", RMF_F_STRUCT_ARRAY,
+		    sizeof(struct niobuf_remote), lustre_swab_niobuf_remote,
+		    dump_rniobuf);
+EXPORT_SYMBOL(RMF_NIOBUF_REMOTE);
+
+/*
+ * Array of 32-bit values (RMF_F_STRUCT_ARRAY of __u32), swabbed one element
+ * at a time with lustre_swab_generic_32s() and dumped with dump_rcs().
+ *
+ * The field carries its own name, "niobuf_rcs", instead of reusing
+ * "niobuf_remote": assertions and error messages print rmf_name, and sharing
+ * the name with RMF_NIOBUF_REMOTE made those messages ambiguous.
+ */
+struct req_msg_field RMF_RCS =
+	DEFINE_MSGF("niobuf_rcs", RMF_F_STRUCT_ARRAY, sizeof(__u32),
+		    lustre_swab_generic_32s, dump_rcs);
+EXPORT_SYMBOL(RMF_RCS);
+
+/* A single obd_id, swabbed with lustre_swab_ost_last_id(). */
+struct req_msg_field RMF_OBD_ID =
+	DEFINE_MSGF("obd_id", 0,
+		    sizeof(obd_id), lustre_swab_ost_last_id, NULL);
+EXPORT_SYMBOL(RMF_OBD_ID);
+
+struct req_msg_field RMF_FID =
+	DEFINE_MSGF("fid", 0,
+		    sizeof(struct lu_fid), lustre_swab_lu_fid, NULL);
+EXPORT_SYMBOL(RMF_FID);
+
+struct req_msg_field RMF_OST_ID =
+	DEFINE_MSGF("ost_id", 0,
+		    sizeof(struct ost_id), lustre_swab_ost_id, NULL);
+EXPORT_SYMBOL(RMF_OST_ID);
+
+/*
+ * FIEMAP key and value.  The key has a fixed size (struct
+ * ll_fiemap_info_key); the value is sized at run time (-1).  Both are named
+ * "fiemap" and both use lustre_swab_fiemap().
+ */
+struct req_msg_field RMF_FIEMAP_KEY =
+	DEFINE_MSGF("fiemap", 0, sizeof(struct ll_fiemap_info_key),
+		    lustre_swab_fiemap, NULL);
+EXPORT_SYMBOL(RMF_FIEMAP_KEY);
+
+struct req_msg_field RMF_FIEMAP_VAL =
+	DEFINE_MSGF("fiemap", 0, -1, lustre_swab_fiemap, NULL);
+EXPORT_SYMBOL(RMF_FIEMAP_VAL);
+
+struct req_msg_field RMF_IDX_INFO =
+	DEFINE_MSGF("idx_info", 0, sizeof(struct idx_info),
+		    lustre_swab_idx_info, NULL);
+EXPORT_SYMBOL(RMF_IDX_INFO);
+/* HSM fields: each is a fixed-size structure with its own swabber. */
+struct req_msg_field RMF_HSM_USER_STATE =
+	DEFINE_MSGF("hsm_user_state", 0, sizeof(struct hsm_user_state),
+		    lustre_swab_hsm_user_state, NULL);
+EXPORT_SYMBOL(RMF_HSM_USER_STATE);
+
+struct req_msg_field RMF_HSM_STATE_SET =
+	DEFINE_MSGF("hsm_state_set", 0, sizeof(struct hsm_state_set),
+		    lustre_swab_hsm_state_set, NULL);
+EXPORT_SYMBOL(RMF_HSM_STATE_SET);
+
+struct req_msg_field RMF_MDS_HSM_PROGRESS =
+	DEFINE_MSGF("hsm_progress", 0, sizeof(struct hsm_progress_kernel),
+		    lustre_swab_hsm_progress_kernel, NULL);
+EXPORT_SYMBOL(RMF_MDS_HSM_PROGRESS);
+
+struct req_msg_field RMF_MDS_HSM_CURRENT_ACTION =
+	DEFINE_MSGF("hsm_current_action", 0, sizeof(struct hsm_current_action),
+		    lustre_swab_hsm_current_action, NULL);
+EXPORT_SYMBOL(RMF_MDS_HSM_CURRENT_ACTION);
+
+/* Array of struct hsm_user_item (RMF_F_STRUCT_ARRAY). */
+struct req_msg_field RMF_MDS_HSM_USER_ITEM =
+	DEFINE_MSGF("hsm_user_item", RMF_F_STRUCT_ARRAY,
+		    sizeof(struct hsm_user_item), lustre_swab_hsm_user_item,
+		    NULL);
+EXPORT_SYMBOL(RMF_MDS_HSM_USER_ITEM);
+
+/* A single __u32, swabbed with lustre_swab_generic_32s(). */
+struct req_msg_field RMF_MDS_HSM_ARCHIVE =
+	DEFINE_MSGF("hsm_archive", 0,
+		    sizeof(__u32), lustre_swab_generic_32s, NULL);
+EXPORT_SYMBOL(RMF_MDS_HSM_ARCHIVE);
+
+struct req_msg_field RMF_MDS_HSM_REQUEST =
+	DEFINE_MSGF("hsm_request", 0, sizeof(struct hsm_request),
+		    lustre_swab_hsm_request, NULL);
+EXPORT_SYMBOL(RMF_MDS_HSM_REQUEST);
+
+/* Update request/reply buffers: run-time sized (-1), each with a swabber. */
+struct req_msg_field RMF_UPDATE = DEFINE_MSGF("update", 0, -1,
+					      lustre_swab_update_buf, NULL);
+EXPORT_SYMBOL(RMF_UPDATE);
+
+struct req_msg_field RMF_UPDATE_REPLY = DEFINE_MSGF("update_reply", 0, -1,
+						lustre_swab_update_reply_buf,
+						    NULL);
+EXPORT_SYMBOL(RMF_UPDATE_REPLY);
+
+struct req_msg_field RMF_SWAP_LAYOUTS =
+	DEFINE_MSGF("swap_layouts", 0, sizeof(struct  mdc_swap_layouts),
+		    lustre_swab_swap_layouts, NULL);
+EXPORT_SYMBOL(RMF_SWAP_LAYOUTS);
+/*
+ * Request formats.
+ */
+
+/*
+ * Description of one RPC format: a printable name, the format's index in the
+ * req_formats[] table, and, for each location (RCL_CLIENT / RCL_SERVER), the
+ * ordered list of message fields that make up the message.
+ */
+struct req_format {
+	/* Name used in debug and error messages. */
+	const char *rf_name;
+	/* Index into req_formats[]; assigned by req_layout_init(). */
+	int	 rf_idx;
+	/* Per-location field list: nr entries pointed to by d. */
+	struct {
+		int			  nr;
+		const struct req_msg_field **d;
+	} rf_fields[RCL_NR];
+};
+
+/*
+ * Initializer for a struct req_format with explicit field counts.  rf_idx is
+ * not set here; req_layout_init() fills it in.
+ */
+#define DEFINE_REQ_FMT(name, client, client_nr, server, server_nr) {    \
+	.rf_name   = name,					      \
+	.rf_fields = {						  \
+		[RCL_CLIENT] = {					\
+			.nr = client_nr,				\
+			.d  = client				    \
+		},						      \
+		[RCL_SERVER] = {					\
+			.nr = server_nr,				\
+			.d  = server				    \
+		}						       \
+	}							       \
+}
+
+/*
+ * Same as DEFINE_REQ_FMT(), with the counts taken from ARRAY_SIZE() of the
+ * two field tables; both arguments must therefore be real arrays, not
+ * pointers.
+ */
+#define DEFINE_REQ_FMT0(name, client, server)				  \
+DEFINE_REQ_FMT(name, client, ARRAY_SIZE(client), server, ARRAY_SIZE(server))
+
+/*
+ * Generic OBD, security, MGS, SEQ/FLD, llog-cancel and quota formats.
+ *
+ * In every DEFINE_REQ_FMT0(name, client, server) below, the second argument is
+ * the field table for RCL_CLIENT and the third the table for RCL_SERVER; the
+ * tables themselves are defined earlier in the file, outside this view.
+ */
+struct req_format RQF_OBD_PING =
+	DEFINE_REQ_FMT0("OBD_PING", empty, empty);
+EXPORT_SYMBOL(RQF_OBD_PING);
+
+struct req_format RQF_OBD_SET_INFO =
+	DEFINE_REQ_FMT0("OBD_SET_INFO", obd_set_info_client, empty);
+EXPORT_SYMBOL(RQF_OBD_SET_INFO);
+
+/* Read index file through the network */
+struct req_format RQF_OBD_IDX_READ =
+	DEFINE_REQ_FMT0("OBD_IDX_READ",
+			obd_idx_read_client, obd_idx_read_server);
+EXPORT_SYMBOL(RQF_OBD_IDX_READ);
+
+struct req_format RQF_SEC_CTX =
+	DEFINE_REQ_FMT0("SEC_CTX", empty, empty);
+EXPORT_SYMBOL(RQF_SEC_CTX);
+
+struct req_format RQF_MGS_TARGET_REG =
+	DEFINE_REQ_FMT0("MGS_TARGET_REG", mgs_target_info_only,
+			 mgs_target_info_only);
+EXPORT_SYMBOL(RQF_MGS_TARGET_REG);
+
+struct req_format RQF_MGS_SET_INFO =
+	DEFINE_REQ_FMT0("MGS_SET_INFO", mgs_set_info,
+			 mgs_set_info);
+EXPORT_SYMBOL(RQF_MGS_SET_INFO);
+
+struct req_format RQF_MGS_CONFIG_READ =
+	DEFINE_REQ_FMT0("MGS_CONFIG_READ", mgs_config_read_client,
+			 mgs_config_read_server);
+EXPORT_SYMBOL(RQF_MGS_CONFIG_READ);
+
+struct req_format RQF_SEQ_QUERY =
+	DEFINE_REQ_FMT0("SEQ_QUERY", seq_query_client, seq_query_server);
+EXPORT_SYMBOL(RQF_SEQ_QUERY);
+
+struct req_format RQF_FLD_QUERY =
+	DEFINE_REQ_FMT0("FLD_QUERY", fld_query_client, fld_query_server);
+EXPORT_SYMBOL(RQF_FLD_QUERY);
+
+/* Note: the format name string is "OBD_LOG_CANCEL", not "LOG_CANCEL". */
+struct req_format RQF_LOG_CANCEL =
+	DEFINE_REQ_FMT0("OBD_LOG_CANCEL", log_cancel_client, empty);
+EXPORT_SYMBOL(RQF_LOG_CANCEL);
+
+struct req_format RQF_MDS_QUOTACHECK =
+	DEFINE_REQ_FMT0("MDS_QUOTACHECK", quotactl_only, empty);
+EXPORT_SYMBOL(RQF_MDS_QUOTACHECK);
+
+struct req_format RQF_OST_QUOTACHECK =
+	DEFINE_REQ_FMT0("OST_QUOTACHECK", quotactl_only, empty);
+EXPORT_SYMBOL(RQF_OST_QUOTACHECK);
+
+struct req_format RQF_MDS_QUOTACTL =
+	DEFINE_REQ_FMT0("MDS_QUOTACTL", quotactl_only, quotactl_only);
+EXPORT_SYMBOL(RQF_MDS_QUOTACTL);
+
+struct req_format RQF_OST_QUOTACTL =
+	DEFINE_REQ_FMT0("OST_QUOTACTL", quotactl_only, quotactl_only);
+EXPORT_SYMBOL(RQF_OST_QUOTACTL);
+
+struct req_format RQF_QC_CALLBACK =
+	DEFINE_REQ_FMT0("QC_CALLBACK", quotactl_only, empty);
+EXPORT_SYMBOL(RQF_QC_CALLBACK);
+
+struct req_format RQF_QUOTA_DQACQ =
+	DEFINE_REQ_FMT0("QUOTA_DQACQ", quota_body_only, quota_body_only);
+EXPORT_SYMBOL(RQF_QUOTA_DQACQ);
+
+struct req_format RQF_LDLM_INTENT_QUOTA =
+	DEFINE_REQ_FMT0("LDLM_INTENT_QUOTA",
+			ldlm_intent_quota_client,
+			ldlm_intent_quota_server);
+EXPORT_SYMBOL(RQF_LDLM_INTENT_QUOTA);
+
+/*
+ * MDS formats: status/attribute queries, the MDS_REINT family, connection
+ * management and object update.  Second macro argument is the RCL_CLIENT
+ * field table, third is the RCL_SERVER field table.
+ */
+struct req_format RQF_MDS_GETSTATUS =
+	DEFINE_REQ_FMT0("MDS_GETSTATUS", mdt_body_only, mdt_body_capa);
+EXPORT_SYMBOL(RQF_MDS_GETSTATUS);
+
+struct req_format RQF_MDS_STATFS =
+	DEFINE_REQ_FMT0("MDS_STATFS", empty, obd_statfs_server);
+EXPORT_SYMBOL(RQF_MDS_STATFS);
+
+struct req_format RQF_MDS_SYNC =
+	DEFINE_REQ_FMT0("MDS_SYNC", mdt_body_capa, mdt_body_only);
+EXPORT_SYMBOL(RQF_MDS_SYNC);
+
+struct req_format RQF_MDS_GETATTR =
+	DEFINE_REQ_FMT0("MDS_GETATTR", mdt_body_capa, mds_getattr_server);
+EXPORT_SYMBOL(RQF_MDS_GETATTR);
+
+struct req_format RQF_MDS_GETXATTR =
+	DEFINE_REQ_FMT0("MDS_GETXATTR",
+			mds_getxattr_client, mds_getxattr_server);
+EXPORT_SYMBOL(RQF_MDS_GETXATTR);
+
+struct req_format RQF_MDS_GETATTR_NAME =
+	DEFINE_REQ_FMT0("MDS_GETATTR_NAME",
+			mds_getattr_name_client, mds_getattr_server);
+EXPORT_SYMBOL(RQF_MDS_GETATTR_NAME);
+
+struct req_format RQF_MDS_REINT =
+	DEFINE_REQ_FMT0("MDS_REINT", mds_reint_client, mdt_body_only);
+EXPORT_SYMBOL(RQF_MDS_REINT);
+
+struct req_format RQF_MDS_REINT_CREATE =
+	DEFINE_REQ_FMT0("MDS_REINT_CREATE",
+			mds_reint_create_client, mdt_body_capa);
+EXPORT_SYMBOL(RQF_MDS_REINT_CREATE);
+
+struct req_format RQF_MDS_REINT_CREATE_RMT_ACL =
+	DEFINE_REQ_FMT0("MDS_REINT_CREATE_RMT_ACL",
+			mds_reint_create_rmt_acl_client, mdt_body_capa);
+EXPORT_SYMBOL(RQF_MDS_REINT_CREATE_RMT_ACL);
+
+/*
+ * NOTE(review): the symbol says CREATE_SLAVE but the name string is
+ * "MDS_REINT_CREATE_EA" -- confirm which one is current.
+ */
+struct req_format RQF_MDS_REINT_CREATE_SLAVE =
+	DEFINE_REQ_FMT0("MDS_REINT_CREATE_EA",
+			mds_reint_create_slave_client, mdt_body_capa);
+EXPORT_SYMBOL(RQF_MDS_REINT_CREATE_SLAVE);
+
+struct req_format RQF_MDS_REINT_CREATE_SYM =
+	DEFINE_REQ_FMT0("MDS_REINT_CREATE_SYM",
+			mds_reint_create_sym_client, mdt_body_capa);
+EXPORT_SYMBOL(RQF_MDS_REINT_CREATE_SYM);
+
+struct req_format RQF_MDS_REINT_OPEN =
+	DEFINE_REQ_FMT0("MDS_REINT_OPEN",
+			mds_reint_open_client, mds_reint_open_server);
+EXPORT_SYMBOL(RQF_MDS_REINT_OPEN);
+
+struct req_format RQF_MDS_REINT_UNLINK =
+	DEFINE_REQ_FMT0("MDS_REINT_UNLINK", mds_reint_unlink_client,
+			mds_last_unlink_server);
+EXPORT_SYMBOL(RQF_MDS_REINT_UNLINK);
+
+struct req_format RQF_MDS_REINT_LINK =
+	DEFINE_REQ_FMT0("MDS_REINT_LINK",
+			mds_reint_link_client, mdt_body_only);
+EXPORT_SYMBOL(RQF_MDS_REINT_LINK);
+
+struct req_format RQF_MDS_REINT_RENAME =
+	DEFINE_REQ_FMT0("MDS_REINT_RENAME", mds_reint_rename_client,
+			mds_last_unlink_server);
+EXPORT_SYMBOL(RQF_MDS_REINT_RENAME);
+
+struct req_format RQF_MDS_REINT_SETATTR =
+	DEFINE_REQ_FMT0("MDS_REINT_SETATTR",
+			mds_reint_setattr_client, mds_setattr_server);
+EXPORT_SYMBOL(RQF_MDS_REINT_SETATTR);
+
+struct req_format RQF_MDS_REINT_SETXATTR =
+	DEFINE_REQ_FMT0("MDS_REINT_SETXATTR",
+			mds_reint_setxattr_client, mdt_body_only);
+EXPORT_SYMBOL(RQF_MDS_REINT_SETXATTR);
+
+struct req_format RQF_MDS_CONNECT =
+	DEFINE_REQ_FMT0("MDS_CONNECT",
+			obd_connect_client, obd_connect_server);
+EXPORT_SYMBOL(RQF_MDS_CONNECT);
+
+struct req_format RQF_MDS_DISCONNECT =
+	DEFINE_REQ_FMT0("MDS_DISCONNECT", empty, empty);
+EXPORT_SYMBOL(RQF_MDS_DISCONNECT);
+
+struct req_format RQF_MDS_GET_INFO =
+	DEFINE_REQ_FMT0("MDS_GET_INFO", mds_getinfo_client,
+			mds_getinfo_server);
+EXPORT_SYMBOL(RQF_MDS_GET_INFO);
+
+struct req_format RQF_UPDATE_OBJ =
+	DEFINE_REQ_FMT0("OBJECT_UPDATE_OBJ", mds_update_client,
+			mds_update_server);
+EXPORT_SYMBOL(RQF_UPDATE_OBJ);
+
+/*
+ * LDLM enqueue, convert, cancel, callback and basic intent formats.  Most of
+ * them share ldlm_enqueue_client as the RCL_CLIENT field table.
+ */
+struct req_format RQF_LDLM_ENQUEUE =
+	DEFINE_REQ_FMT0("LDLM_ENQUEUE",
+			ldlm_enqueue_client, ldlm_enqueue_lvb_server);
+EXPORT_SYMBOL(RQF_LDLM_ENQUEUE);
+
+/* Uses the same client and server tables as RQF_LDLM_ENQUEUE above. */
+struct req_format RQF_LDLM_ENQUEUE_LVB =
+	DEFINE_REQ_FMT0("LDLM_ENQUEUE_LVB",
+			ldlm_enqueue_client, ldlm_enqueue_lvb_server);
+EXPORT_SYMBOL(RQF_LDLM_ENQUEUE_LVB);
+
+struct req_format RQF_LDLM_CONVERT =
+	DEFINE_REQ_FMT0("LDLM_CONVERT",
+			ldlm_enqueue_client, ldlm_enqueue_server);
+EXPORT_SYMBOL(RQF_LDLM_CONVERT);
+
+struct req_format RQF_LDLM_CANCEL =
+	DEFINE_REQ_FMT0("LDLM_CANCEL", ldlm_enqueue_client, empty);
+EXPORT_SYMBOL(RQF_LDLM_CANCEL);
+
+struct req_format RQF_LDLM_CALLBACK =
+	DEFINE_REQ_FMT0("LDLM_CALLBACK", ldlm_enqueue_client, empty);
+EXPORT_SYMBOL(RQF_LDLM_CALLBACK);
+
+struct req_format RQF_LDLM_CP_CALLBACK =
+	DEFINE_REQ_FMT0("LDLM_CP_CALLBACK", ldlm_cp_callback_client, empty);
+EXPORT_SYMBOL(RQF_LDLM_CP_CALLBACK);
+
+struct req_format RQF_LDLM_BL_CALLBACK =
+	DEFINE_REQ_FMT0("LDLM_BL_CALLBACK", ldlm_enqueue_client, empty);
+EXPORT_SYMBOL(RQF_LDLM_BL_CALLBACK);
+
+struct req_format RQF_LDLM_GL_CALLBACK =
+	DEFINE_REQ_FMT0("LDLM_GL_CALLBACK", ldlm_enqueue_client,
+			ldlm_gl_callback_server);
+EXPORT_SYMBOL(RQF_LDLM_GL_CALLBACK);
+
+/*
+ * NOTE(review): this format has the same name string, "LDLM_GL_CALLBACK", as
+ * RQF_LDLM_GL_CALLBACK above, so the two are indistinguishable in messages
+ * that print rf_name -- confirm whether that is intended.
+ */
+struct req_format RQF_LDLM_GL_DESC_CALLBACK =
+	DEFINE_REQ_FMT0("LDLM_GL_CALLBACK", ldlm_gl_callback_desc_client,
+			ldlm_gl_callback_server);
+EXPORT_SYMBOL(RQF_LDLM_GL_DESC_CALLBACK);
+
+struct req_format RQF_LDLM_INTENT_BASIC =
+	DEFINE_REQ_FMT0("LDLM_INTENT_BASIC",
+			ldlm_intent_basic_client, ldlm_enqueue_lvb_server);
+EXPORT_SYMBOL(RQF_LDLM_INTENT_BASIC);
+
+struct req_format RQF_LDLM_INTENT =
+	DEFINE_REQ_FMT0("LDLM_INTENT",
+			ldlm_intent_client, ldlm_intent_server);
+EXPORT_SYMBOL(RQF_LDLM_INTENT);
+
+/*
+ * Layout intent: the RCL_CLIENT side uses ldlm_intent_layout_client and the
+ * RCL_SERVER side uses ldlm_enqueue_lvb_server.
+ *
+ * The name carries no trailing blank, so it prints cleanly in the quoted
+ * format names used by the diagnostics in this file (e.g. "in format `%s'").
+ */
+struct req_format RQF_LDLM_INTENT_LAYOUT =
+	DEFINE_REQ_FMT0("LDLM_INTENT_LAYOUT",
+			ldlm_intent_layout_client, ldlm_enqueue_lvb_server);
+EXPORT_SYMBOL(RQF_LDLM_INTENT_LAYOUT);
+
+/*
+ * Per-operation LDLM intent formats.  GETATTR and CREATE share
+ * ldlm_intent_getattr_server as the RCL_SERVER table; UNLINK uses the generic
+ * ldlm_intent_server table.
+ */
+struct req_format RQF_LDLM_INTENT_GETATTR =
+	DEFINE_REQ_FMT0("LDLM_INTENT_GETATTR",
+			ldlm_intent_getattr_client, ldlm_intent_getattr_server);
+EXPORT_SYMBOL(RQF_LDLM_INTENT_GETATTR);
+
+struct req_format RQF_LDLM_INTENT_OPEN =
+	DEFINE_REQ_FMT0("LDLM_INTENT_OPEN",
+			ldlm_intent_open_client, ldlm_intent_open_server);
+EXPORT_SYMBOL(RQF_LDLM_INTENT_OPEN);
+
+struct req_format RQF_LDLM_INTENT_CREATE =
+	DEFINE_REQ_FMT0("LDLM_INTENT_CREATE",
+			ldlm_intent_create_client, ldlm_intent_getattr_server);
+EXPORT_SYMBOL(RQF_LDLM_INTENT_CREATE);
+
+struct req_format RQF_LDLM_INTENT_UNLINK =
+	DEFINE_REQ_FMT0("LDLM_INTENT_UNLINK",
+			ldlm_intent_unlink_client, ldlm_intent_server);
+EXPORT_SYMBOL(RQF_LDLM_INTENT_UNLINK);
+
+/*
+ * MDS close, pin, page I/O, HSM and layout-swap formats.  Second macro
+ * argument is the RCL_CLIENT field table, third is the RCL_SERVER table.
+ */
+struct req_format RQF_MDS_CLOSE =
+	DEFINE_REQ_FMT0("MDS_CLOSE",
+			mdt_close_client, mds_last_unlink_server);
+EXPORT_SYMBOL(RQF_MDS_CLOSE);
+
+struct req_format RQF_MDS_PIN =
+	DEFINE_REQ_FMT0("MDS_PIN",
+			mdt_body_capa, mdt_body_only);
+EXPORT_SYMBOL(RQF_MDS_PIN);
+
+struct req_format RQF_MDS_UNPIN =
+	DEFINE_REQ_FMT0("MDS_UNPIN", mdt_body_only, empty);
+EXPORT_SYMBOL(RQF_MDS_UNPIN);
+
+struct req_format RQF_MDS_DONE_WRITING =
+	DEFINE_REQ_FMT0("MDS_DONE_WRITING",
+			mdt_close_client, mdt_body_only);
+EXPORT_SYMBOL(RQF_MDS_DONE_WRITING);
+
+struct req_format RQF_MDS_READPAGE =
+	DEFINE_REQ_FMT0("MDS_READPAGE",
+			mdt_body_capa, mdt_body_only);
+EXPORT_SYMBOL(RQF_MDS_READPAGE);
+
+struct req_format RQF_MDS_HSM_ACTION =
+	DEFINE_REQ_FMT0("MDS_HSM_ACTION", mdt_body_capa, mdt_hsm_action_server);
+EXPORT_SYMBOL(RQF_MDS_HSM_ACTION);
+
+struct req_format RQF_MDS_HSM_PROGRESS =
+	DEFINE_REQ_FMT0("MDS_HSM_PROGRESS", mdt_hsm_progress, empty);
+EXPORT_SYMBOL(RQF_MDS_HSM_PROGRESS);
+
+struct req_format RQF_MDS_HSM_CT_REGISTER =
+	DEFINE_REQ_FMT0("MDS_HSM_CT_REGISTER", mdt_hsm_ct_register, empty);
+EXPORT_SYMBOL(RQF_MDS_HSM_CT_REGISTER);
+
+struct req_format RQF_MDS_HSM_CT_UNREGISTER =
+	DEFINE_REQ_FMT0("MDS_HSM_CT_UNREGISTER", mdt_hsm_ct_unregister, empty);
+EXPORT_SYMBOL(RQF_MDS_HSM_CT_UNREGISTER);
+
+struct req_format RQF_MDS_HSM_STATE_GET =
+	DEFINE_REQ_FMT0("MDS_HSM_STATE_GET",
+			mdt_body_capa, mdt_hsm_state_get_server);
+EXPORT_SYMBOL(RQF_MDS_HSM_STATE_GET);
+
+struct req_format RQF_MDS_HSM_STATE_SET =
+	DEFINE_REQ_FMT0("MDS_HSM_STATE_SET", mdt_hsm_state_set, empty);
+EXPORT_SYMBOL(RQF_MDS_HSM_STATE_SET);
+
+struct req_format RQF_MDS_HSM_REQUEST =
+	DEFINE_REQ_FMT0("MDS_HSM_REQUEST", mdt_hsm_request, empty);
+EXPORT_SYMBOL(RQF_MDS_HSM_REQUEST);
+
+struct req_format RQF_MDS_SWAP_LAYOUTS =
+	DEFINE_REQ_FMT0("MDS_SWAP_LAYOUTS",
+			mdt_swap_layouts, empty);
+EXPORT_SYMBOL(RQF_MDS_SWAP_LAYOUTS);
+
+/* This is for split */
+struct req_format RQF_MDS_WRITEPAGE =
+	DEFINE_REQ_FMT0("MDS_WRITEPAGE",
+			mdt_body_capa, mdt_body_only);
+EXPORT_SYMBOL(RQF_MDS_WRITEPAGE);
+
+struct req_format RQF_MDS_IS_SUBDIR =
+	DEFINE_REQ_FMT0("MDS_IS_SUBDIR",
+			mdt_body_only, mdt_body_only);
+EXPORT_SYMBOL(RQF_MDS_IS_SUBDIR);
+
+/*
+ * LLOG origin formats.  Most use llogd_body_only on the RCL_CLIENT side; the
+ * RCL_SERVER side varies with the operation.
+ */
+struct req_format RQF_LLOG_ORIGIN_HANDLE_CREATE =
+	DEFINE_REQ_FMT0("LLOG_ORIGIN_HANDLE_CREATE",
+			llog_origin_handle_create_client, llogd_body_only);
+EXPORT_SYMBOL(RQF_LLOG_ORIGIN_HANDLE_CREATE);
+
+struct req_format RQF_LLOG_ORIGIN_HANDLE_DESTROY =
+	DEFINE_REQ_FMT0("LLOG_ORIGIN_HANDLE_DESTROY",
+			llogd_body_only, llogd_body_only);
+EXPORT_SYMBOL(RQF_LLOG_ORIGIN_HANDLE_DESTROY);
+
+struct req_format RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK =
+	DEFINE_REQ_FMT0("LLOG_ORIGIN_HANDLE_NEXT_BLOCK",
+			llogd_body_only, llog_origin_handle_next_block_server);
+EXPORT_SYMBOL(RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK);
+
+/* PREV_BLOCK reuses the NEXT_BLOCK server-side field table. */
+struct req_format RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK =
+	DEFINE_REQ_FMT0("LLOG_ORIGIN_HANDLE_PREV_BLOCK",
+			llogd_body_only, llog_origin_handle_next_block_server);
+EXPORT_SYMBOL(RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK);
+
+struct req_format RQF_LLOG_ORIGIN_HANDLE_READ_HEADER =
+	DEFINE_REQ_FMT0("LLOG_ORIGIN_HANDLE_READ_HEADER",
+			llogd_body_only, llog_log_hdr_only);
+EXPORT_SYMBOL(RQF_LLOG_ORIGIN_HANDLE_READ_HEADER);
+
+struct req_format RQF_LLOG_ORIGIN_CONNECT =
+	DEFINE_REQ_FMT0("LLOG_ORIGIN_CONNECT", llogd_conn_body_only, empty);
+EXPORT_SYMBOL(RQF_LLOG_ORIGIN_CONNECT);
+
+/*
+ * OST formats: connection management, attribute and object operations, bulk
+ * read/write, statfs and the get/set-info variants.  Second macro argument
+ * is the RCL_CLIENT field table, third is the RCL_SERVER table.
+ */
+struct req_format RQF_OST_CONNECT =
+	DEFINE_REQ_FMT0("OST_CONNECT",
+			obd_connect_client, obd_connect_server);
+EXPORT_SYMBOL(RQF_OST_CONNECT);
+
+struct req_format RQF_OST_DISCONNECT =
+	DEFINE_REQ_FMT0("OST_DISCONNECT", empty, empty);
+EXPORT_SYMBOL(RQF_OST_DISCONNECT);
+
+struct req_format RQF_OST_GETATTR =
+	DEFINE_REQ_FMT0("OST_GETATTR", ost_body_capa, ost_body_only);
+EXPORT_SYMBOL(RQF_OST_GETATTR);
+
+struct req_format RQF_OST_SETATTR =
+	DEFINE_REQ_FMT0("OST_SETATTR", ost_body_capa, ost_body_only);
+EXPORT_SYMBOL(RQF_OST_SETATTR);
+
+struct req_format RQF_OST_CREATE =
+	DEFINE_REQ_FMT0("OST_CREATE", ost_body_only, ost_body_only);
+EXPORT_SYMBOL(RQF_OST_CREATE);
+
+struct req_format RQF_OST_PUNCH =
+	DEFINE_REQ_FMT0("OST_PUNCH", ost_body_capa, ost_body_only);
+EXPORT_SYMBOL(RQF_OST_PUNCH);
+
+struct req_format RQF_OST_SYNC =
+	DEFINE_REQ_FMT0("OST_SYNC", ost_body_capa, ost_body_only);
+EXPORT_SYMBOL(RQF_OST_SYNC);
+
+struct req_format RQF_OST_DESTROY =
+	DEFINE_REQ_FMT0("OST_DESTROY", ost_destroy_client, ost_body_only);
+EXPORT_SYMBOL(RQF_OST_DESTROY);
+
+/* Bulk read and write share the client table but differ on the server side. */
+struct req_format RQF_OST_BRW_READ =
+	DEFINE_REQ_FMT0("OST_BRW_READ", ost_brw_client, ost_brw_read_server);
+EXPORT_SYMBOL(RQF_OST_BRW_READ);
+
+struct req_format RQF_OST_BRW_WRITE =
+	DEFINE_REQ_FMT0("OST_BRW_WRITE", ost_brw_client, ost_brw_write_server);
+EXPORT_SYMBOL(RQF_OST_BRW_WRITE);
+
+struct req_format RQF_OST_STATFS =
+	DEFINE_REQ_FMT0("OST_STATFS", empty, obd_statfs_server);
+EXPORT_SYMBOL(RQF_OST_STATFS);
+
+struct req_format RQF_OST_SET_GRANT_INFO =
+	DEFINE_REQ_FMT0("OST_SET_GRANT_INFO", ost_grant_shrink_client,
+			 ost_body_only);
+EXPORT_SYMBOL(RQF_OST_SET_GRANT_INFO);
+
+/* Note: the name string is "OST_GET_INFO", without a _GENERIC suffix. */
+struct req_format RQF_OST_GET_INFO_GENERIC =
+	DEFINE_REQ_FMT0("OST_GET_INFO", ost_get_info_generic_client,
+					ost_get_info_generic_server);
+EXPORT_SYMBOL(RQF_OST_GET_INFO_GENERIC);
+
+struct req_format RQF_OST_GET_INFO_LAST_ID =
+	DEFINE_REQ_FMT0("OST_GET_INFO_LAST_ID", ost_get_info_generic_client,
+						ost_get_last_id_server);
+EXPORT_SYMBOL(RQF_OST_GET_INFO_LAST_ID);
+
+/* The LAST_FID get/set formats both use obd_set_info_client as client table. */
+struct req_format RQF_OST_GET_INFO_LAST_FID =
+	DEFINE_REQ_FMT0("OST_GET_INFO_LAST_FID", obd_set_info_client,
+						 ost_get_last_fid_server);
+EXPORT_SYMBOL(RQF_OST_GET_INFO_LAST_FID);
+
+struct req_format RQF_OST_SET_INFO_LAST_FID =
+	DEFINE_REQ_FMT0("OST_SET_INFO_LAST_FID", obd_set_info_client,
+						 empty);
+EXPORT_SYMBOL(RQF_OST_SET_INFO_LAST_FID);
+
+struct req_format RQF_OST_GET_INFO_FIEMAP =
+	DEFINE_REQ_FMT0("OST_GET_INFO_FIEMAP", ost_get_fiemap_client,
+					       ost_get_fiemap_server);
+EXPORT_SYMBOL(RQF_OST_GET_INFO_FIEMAP);
+
+#if !defined(__REQ_LAYOUT_USER__)
+
+/*
+ * Convenience macro: the j-th message field of format \a fmt at location
+ * \a i (RCL_CLIENT or RCL_SERVER).  No bounds checking is done on i or j.
+ */
+#define FMT_FIELD(fmt, i, j) (fmt)->rf_fields[(i)].d[(j)]
+
+/**
+ * Set up the capsule abstraction.
+ *
+ * Walks every entry of req_formats[], records the entry's position in its
+ * \a rf_idx, and for each field of each location stores "position + 1" in the
+ * field's \a rmf_offset slot for that format and location.  A slot that stays
+ * 0 therefore means "this field is not part of this format/location".
+ *
+ * Always returns 0.
+ */
+int req_layout_init(void)
+{
+	int i, j, k;
+
+	for (i = 0; i < ARRAY_SIZE(req_formats); ++i) {
+		struct req_format *rf = req_formats[i];
+
+		rf->rf_idx = i;
+
+		for (j = 0; j < RCL_NR; ++j) {
+			LASSERT(rf->rf_fields[j].nr <= REQ_MAX_FIELD_NR);
+
+			for (k = 0; k < rf->rf_fields[j].nr; ++k) {
+				/* The tables are const; the offsets are not. */
+				struct req_msg_field *field =
+					(struct req_msg_field *)
+						rf->rf_fields[j].d[k];
+
+				/* Array fields need a positive element size. */
+				LASSERT(!(field->rmf_flags & RMF_F_STRUCT_ARRAY)
+					|| field->rmf_size > 0);
+				/* A field may appear once per format/location. */
+				LASSERT(field->rmf_offset[i][j] == 0);
+
+				/* Stored 1-based so that 0 flags "unused". */
+				field->rmf_offset[i][j] = k + 1;
+			}
+		}
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(req_layout_init);
+
+/**
+ * Counterpart of req_layout_init().  Currently does nothing: the body is
+ * empty and nothing is released here.
+ */
+void req_layout_fini(void)
+{
+}
+EXPORT_SYMBOL(req_layout_fini);
+
+/**
+ * Marks every expected field size of a \a pill (\a rc_area) as unset (-1),
+ * for both the client and the server location.
+ *
+ * Actual/expected field sizes are set elsewhere in functions in this file:
+ * req_capsule_init(), req_capsule_server_pack(), req_capsule_set_size() and
+ * req_capsule_msg_size().  The \a rc_area information is used by
+ * ptlrpc_request_set_replen().
+ */
+void req_capsule_init_area(struct req_capsule *pill)
+{
+	const int nr_slots = ARRAY_SIZE(pill->rc_area[RCL_CLIENT]);
+	int slot;
+
+	for (slot = 0; slot < nr_slots; slot++) {
+		pill->rc_area[RCL_SERVER][slot] = -1;
+		pill->rc_area[RCL_CLIENT][slot] = -1;
+	}
+}
+EXPORT_SYMBOL(req_capsule_init_area);
+
+/**
+ * Initialize a pill.
+ *
+ * \a location says which side the caller runs on: RCL_CLIENT or RCL_SERVER.
+ * The pill is zeroed, bound to \a req and \a location, and all of its
+ * expected field sizes are reset via req_capsule_init_area().
+ */
+void req_capsule_init(struct req_capsule *pill,
+		      struct ptlrpc_request *req,
+		      enum req_location location)
+{
+	int embedded;
+
+	LASSERT(location == RCL_SERVER || location == RCL_CLIENT);
+
+	/*
+	 * Capsules currently always live inside a ptlrpc_request, but that
+	 * is not relied upon: req is only touched when it is non-NULL and
+	 * pill really is the capsule embedded in it.
+	 */
+	embedded = req != NULL && pill == &req->rq_pill;
+
+	/*
+	 * rq_pill_init makes a second initialization of the same embedded
+	 * pill a no-op.  That can happen in the OST paths when the
+	 * high-priority RPC queue is peeked at before ost_handle() handles
+	 * the RPC.
+	 */
+	if (embedded && req->rq_pill_init)
+		return;
+
+	memset(pill, 0, sizeof(*pill));
+	pill->rc_loc = location;
+	pill->rc_req = req;
+	req_capsule_init_area(pill);
+
+	if (embedded)
+		req->rq_pill_init = 1;
+}
+EXPORT_SYMBOL(req_capsule_init);
+
+/**
+ * Counterpart of req_capsule_init().  Currently does nothing: the body is
+ * empty and \a pill is left untouched.
+ */
+void req_capsule_fini(struct req_capsule *pill)
+{
+}
+EXPORT_SYMBOL(req_capsule_fini);
+
+/**
+ * Returns non-zero when \a fmt is a registered format, i.e. its \a rf_idx is
+ * a valid index into req_formats[] and that slot points back at \a fmt.
+ */
+static int __req_format_is_sane(const struct req_format *fmt)
+{
+	if (fmt->rf_idx < 0)
+		return 0;
+	if (fmt->rf_idx >= ARRAY_SIZE(req_formats))
+		return 0;
+
+	return req_formats[fmt->rf_idx] == fmt;
+}
+
+/**
+ * Picks the message of the \a pill's request that belongs to \a loc: the
+ * request message for RCL_CLIENT, the reply message for anything else.
+ */
+static struct lustre_msg *__req_msg(const struct req_capsule *pill,
+				    enum req_location loc)
+{
+	struct ptlrpc_request *req = pill->rc_req;
+
+	if (loc == RCL_CLIENT)
+		return req->rq_reqmsg;
+
+	return req->rq_repmsg;
+}
+
+/**
+ * Set the format (\a fmt) of a \a pill; format changes are not allowed here
+ * (see req_capsule_extend()).
+ *
+ * Asserts that the pill either has no format yet or already has exactly
+ * \a fmt, and that \a fmt is a registered format (__req_format_is_sane()).
+ */
+void req_capsule_set(struct req_capsule *pill, const struct req_format *fmt)
+{
+	/* Setting the same format again is allowed; switching is not. */
+	LASSERT(pill->rc_fmt == NULL || pill->rc_fmt == fmt);
+	LASSERT(__req_format_is_sane(fmt));
+
+	pill->rc_fmt = fmt;
+}
+EXPORT_SYMBOL(req_capsule_set);
+
+/**
+ * Fills in any parts of the \a rc_area of a \a pill that haven't been filled in
+ * yet.
+ *
+ * \a rc_area is an array of REQ_MAX_FIELD_NR elements, used to store sizes of
+ * variable-sized fields.  Missing sizes are taken from the declared
+ * \a rmf_size of the corresponding RMF in the \a pill's \a rc_fmt.
+ *
+ * Returns the number of leading fields at \a loc whose size is known: either
+ * the full field count, or the index of the first field whose size is still
+ * -1 after consulting \a rmf_size.
+ */
+int req_capsule_filled_sizes(struct req_capsule *pill,
+			   enum req_location loc)
+{
+	const struct req_format *fmt = pill->rc_fmt;
+	int i;
+
+	LASSERT(fmt != NULL);
+
+	for (i = 0; i < fmt->rf_fields[loc].nr; ++i) {
+		/* Size already provided by the caller. */
+		if (pill->rc_area[loc][i] != -1)
+			continue;
+
+		/* Fall back to the size declared for the field. */
+		pill->rc_area[loc][i] = fmt->rf_fields[loc].d[i]->rmf_size;
+		if (pill->rc_area[loc][i] != -1)
+			continue;
+
+		/*
+		 * No declared size either: stop here and leave this and the
+		 * following fields out.
+		 *
+		 * If this LASSERT() trips then you're missing a call to
+		 * req_capsule_set_size().
+		 */
+		LASSERT(loc != RCL_SERVER);
+		break;
+	}
+
+	return i;
+}
+EXPORT_SYMBOL(req_capsule_filled_sizes);
+
+/**
+ * Capsule equivalent of lustre_pack_request() and lustre_pack_reply().
+ *
+ * Completes the server-side \a rc_area of the \a pill with
+ * req_capsule_filled_sizes() and then packs the reply with
+ * lustre_pack_reply() using those sizes.  Sizes set earlier through
+ * req_capsule_set_size() are kept.
+ *
+ * Returns the result of lustre_pack_reply(); a non-zero result is also
+ * reported through DEBUG_REQ().
+ */
+int req_capsule_server_pack(struct req_capsule *pill)
+{
+	const struct req_format *fmt;
+	int nr_bufs;
+	int rc;
+
+	LASSERT(pill->rc_loc == RCL_SERVER);
+	fmt = pill->rc_fmt;
+	LASSERT(fmt != NULL);
+
+	nr_bufs = req_capsule_filled_sizes(pill, RCL_SERVER);
+	rc = lustre_pack_reply(pill->rc_req, nr_bufs,
+			       pill->rc_area[RCL_SERVER], NULL);
+	if (rc == 0)
+		return 0;
+
+	DEBUG_REQ(D_ERROR, pill->rc_req,
+		  "Cannot pack %d fields in format `%s': ",
+		  nr_bufs, fmt->rf_name);
+	return rc;
+}
+EXPORT_SYMBOL(req_capsule_server_pack);
+
+/**
+ * Returns the index of the PTLRPC request or reply (\a loc) buffer of a
+ * \a pill that holds the given RMF (\a field).
+ *
+ * The index is looked up in the field's \a rmf_offset table, where it is
+ * stored 1-based (see req_layout_init()); a stored 0 means the field does not
+ * belong to the pill's format at \a loc and trips the first assertion.
+ */
+static int __req_capsule_offset(const struct req_capsule *pill,
+				const struct req_msg_field *field,
+				enum req_location loc)
+{
+	int offset = field->rmf_offset[pill->rc_fmt->rf_idx][loc];
+
+	LASSERTF(offset > 0, "%s:%s, off=%d, loc=%d\n",
+		 pill->rc_fmt->rf_name, field->rmf_name, offset, loc);
+
+	/* Convert from the stored 1-based value to a buffer index. */
+	--offset;
+
+	LASSERT(0 <= offset && offset < REQ_MAX_FIELD_NR);
+	return offset;
+}
+
+/**
+ * Helper for __req_capsule_get(); swabs value / array of values and/or dumps
+ * them if desired.
+ *
+ * \param pill    capsule the buffer belongs to
+ * \param field   RMF describing the buffer (flags, element size, default
+ *                swabber, optional dumper)
+ * \param loc     RCL_CLIENT or RCL_SERVER
+ * \param offset  index of the buffer inside the message
+ * \param value   start of the buffer; may be NULL, in which case nothing is
+ *                swabbed
+ * \param len     buffer length in bytes; only used for array fields
+ * \param dump    non-zero to call the field's dumper (when it has one) before
+ *                and after swabbing
+ * \param swabber swab routine to use; NULL selects field->rmf_swabber
+ *
+ * Swabbing happens only when ptlrpc_buf_need_swab() reports that the buffer
+ * still needs it; afterwards the buffer is marked with
+ * ptlrpc_buf_set_swabbed().
+ */
+static
+void
+swabber_dumper_helper(struct req_capsule *pill,
+		      const struct req_msg_field *field,
+		      enum req_location loc,
+		      int offset,
+		      void *value, int len, int dump, void (*swabber)( void *))
+{
+	void    *p;
+	int     i;
+	int     n;
+	int     do_swab;
+	int     inout = loc == RCL_CLIENT;
+
+	swabber = swabber ?: field->rmf_swabber;
+
+	if (ptlrpc_buf_need_swab(pill->rc_req, inout, offset) &&
+	    swabber != NULL && value != NULL)
+		do_swab = 1;
+	else
+		do_swab = 0;
+
+	if (!(field->rmf_flags & RMF_F_STRUCT_ARRAY)) {
+		/* Single value: dump, swab once, dump again. */
+		if (dump && field->rmf_dumper) {
+			CDEBUG(D_RPCTRACE, "Dump of %sfield %s follows\n",
+			       do_swab ? "unswabbed " : "", field->rmf_name);
+			field->rmf_dumper(value);
+		}
+		if (!do_swab)
+			return;
+		swabber(value);
+		ptlrpc_buf_set_swabbed(pill->rc_req, inout, offset);
+		/*
+		 * Same guard as for the first dump: a field without a dumper
+		 * must not lead to a call through a NULL pointer.
+		 */
+		if (dump && field->rmf_dumper) {
+			CDEBUG(D_RPCTRACE, "Dump of swabbed field %s "
+			       "follows\n", field->rmf_name);
+			field->rmf_dumper(value);
+		}
+
+		return;
+	}
+
+	/*
+	 * We're swabbing an array; swabber() swabs a single array element, so
+	 * swab every element.
+	 */
+	LASSERT((len % field->rmf_size) == 0);
+	for (p = value, i = 0, n = len / field->rmf_size;
+	     i < n;
+	     i++, p += field->rmf_size) {
+		if (dump && field->rmf_dumper) {
+			CDEBUG(D_RPCTRACE, "Dump of %sarray field %s, "
+			       "element %d follows\n",
+			       do_swab ? "unswabbed " : "", field->rmf_name, i);
+			field->rmf_dumper(p);
+		}
+		if (!do_swab)
+			continue;
+		swabber(p);
+		if (dump && field->rmf_dumper) {
+			CDEBUG(D_RPCTRACE, "Dump of swabbed array field %s, "
+			       "element %d follows\n", field->rmf_name, i);
+			/* Dump the element just swabbed, not element 0. */
+			field->rmf_dumper(p);
+		}
+	}
+	/* The whole buffer is flagged once, after all elements are done. */
+	if (do_swab)
+		ptlrpc_buf_set_swabbed(pill->rc_req, inout, offset);
+}
+
+/**
+ * Returns the pointer to a PTLRPC request or reply (\a loc) buffer of a \a pill
+ * corresponding to the given RMF (\a field).
+ *
+ * The buffer will be swabbed using the given \a swabber.  If \a swabber == NULL
+ * then the \a rmf_swabber from the RMF will be used.  Soon there will be no
+ * calls to __req_capsule_get() with a non-NULL \a swabber; \a swabber will then
+ * be removed.  Fields with the \a RMF_F_STRUCT_ARRAY flag set will have each
+ * element of the array swabbed.
+ *
+ * If \a dump is non-zero the field's dumper, when it has one, is invoked as
+ * well (see swabber_dumper_helper()).
+ *
+ * Returns NULL when an array buffer's length is not a multiple of the element
+ * size, or when the underlying getter rejects the buffer; both cases are
+ * logged.
+ */
+static void *__req_capsule_get(struct req_capsule *pill,
+			       const struct req_msg_field *field,
+			       enum req_location loc,
+			       void (*swabber)( void *),
+			       int dump)
+{
+	const struct req_format *fmt;
+	struct lustre_msg       *msg;
+	void		    *value;
+	int		      len;
+	int		      offset;
+
+	/* Buffer accessor; chosen below depending on RMF_F_STRING. */
+	void *(*getter)(struct lustre_msg *m, int n, int minlen);
+
+	/* Printable location names for the error message below. */
+	static const char *rcl_names[RCL_NR] = {
+		[RCL_CLIENT] = "client",
+		[RCL_SERVER] = "server"
+	};
+
+	LASSERT(pill != NULL);
+	LASSERT(pill != LP_POISON);
+	fmt = pill->rc_fmt;
+	LASSERT(fmt != NULL);
+	LASSERT(fmt != LP_POISON);
+	LASSERT(__req_format_is_sane(fmt));
+
+	/* Asserts that the field belongs to this format at this location. */
+	offset = __req_capsule_offset(pill, field, loc);
+
+	msg = __req_msg(pill, loc);
+	LASSERT(msg != NULL);
+
+	getter = (field->rmf_flags & RMF_F_STRING) ?
+		(typeof(getter))lustre_msg_string : lustre_msg_buf;
+
+	/*
+	 * Work out the length handed to the getter: the actual buffer length
+	 * for arrays, otherwise the size recorded in rc_area, otherwise the
+	 * declared field size (a negative declared size becomes 0).
+	 */
+	if (field->rmf_flags & RMF_F_STRUCT_ARRAY) {
+		/*
+		 * We've already asserted that field->rmf_size > 0 in
+		 * req_layout_init().
+		 */
+		len = lustre_msg_buflen(msg, offset);
+		if ((len % field->rmf_size) != 0) {
+			CERROR("%s: array field size mismatch "
+			       "%d modulo %d != 0 (%d)\n",
+			       field->rmf_name, len, field->rmf_size, loc);
+			return NULL;
+		}
+	} else if (pill->rc_area[loc][offset] != -1) {
+		len = pill->rc_area[loc][offset];
+	} else {
+		len = max(field->rmf_size, 0);
+	}
+	value = getter(msg, offset, len);
+
+	if (value == NULL) {
+		DEBUG_REQ(D_ERROR, pill->rc_req,
+			  "Wrong buffer for field `%s' (%d of %d) "
+			  "in format `%s': %d vs. %d (%s)\n",
+			  field->rmf_name, offset, lustre_msg_bufcount(msg),
+			  fmt->rf_name, lustre_msg_buflen(msg, offset), len,
+			  rcl_names[loc]);
+	} else {
+		/* Swab (and optionally dump) the buffer in place. */
+		swabber_dumper_helper(pill, field, loc, offset, value, len,
+				      dump, swabber);
+	}
+
+	return value;
+}
+
+/**
+ * Dump a request and/or reply
+ *
+ * Walks all fields of the \a pill's format at \a loc.  Fields that have a
+ * dumper are fetched through __req_capsule_get() with dumping enabled; for
+ * the others only the name and size are logged.
+ */
+void __req_capsule_dump(struct req_capsule *pill, enum req_location loc)
+{
+	const struct    req_format *fmt;
+	const struct    req_msg_field *field;
+	int	     len;
+	int	     i;
+
+	fmt = pill->rc_fmt;
+
+	DEBUG_REQ(D_RPCTRACE, pill->rc_req, "BEGIN REQ CAPSULE DUMP\n");
+	for (i = 0; i < fmt->rf_fields[loc].nr; ++i) {
+		field = FMT_FIELD(fmt, loc, i);
+		if (field->rmf_dumper == NULL) {
+			/*
+			 * FIXME Add a default hex dumper for fields that don't
+			 * have a specific dumper
+			 */
+			len = req_capsule_get_size(pill, field, loc);
+			/* The two literals are joined; keep the space. */
+			CDEBUG(D_RPCTRACE, "Field %s has no dumper function; "
+			       "field size is %d\n", field->rmf_name, len);
+		} else {
+			/* It's the dumping side-effect that we're interested in */
+			(void) __req_capsule_get(pill, field, loc, NULL, 1);
+		}
+	}
+	CDEBUG(D_RPCTRACE, "END REQ CAPSULE DUMP\n");
+}
+
+/**
+ * Dump a request: calls __req_capsule_dump() on the RCL_CLIENT side of
+ * \a pill.
+ */
+void req_capsule_client_dump(struct req_capsule *pill)
+{
+	__req_capsule_dump(pill, RCL_CLIENT);
+}
+EXPORT_SYMBOL(req_capsule_client_dump);
+
+/**
+ * Dump a reply: calls __req_capsule_dump() on the RCL_SERVER side of
+ * \a pill.
+ */
+void req_capsule_server_dump(struct req_capsule *pill)
+{
+	__req_capsule_dump(pill, RCL_SERVER);
+}
+EXPORT_SYMBOL(req_capsule_server_dump);
+
+/**
+ * Trivial wrapper around __req_capsule_get(), that returns the PTLRPC request
+ * buffer corresponding to the given RMF (\a field) of a \a pill.
+ *
+ * No swabber is supplied (NULL) and no dump is requested (0).  The result is
+ * whatever __req_capsule_get() returns, which is NULL when the buffer could
+ * not be obtained.
+ */
+void *req_capsule_client_get(struct req_capsule *pill,
+			     const struct req_msg_field *field)
+{
+	return __req_capsule_get(pill, field, RCL_CLIENT, NULL, 0);
+}
+EXPORT_SYMBOL(req_capsule_client_get);
+
+/**
+ * Same as req_capsule_client_get(), but with a \a swabber argument that is
+ * handed through to __req_capsule_get() unchanged.
+ *
+ * Currently unused; will be removed when req_capsule_server_swab_get() is
+ * unused too.
+ */
+void *req_capsule_client_swab_get(struct req_capsule *pill,
+				  const struct req_msg_field *field,
+				  void *swabber)
+{
+	return __req_capsule_get(pill, field, RCL_CLIENT, swabber, 0);
+}
+EXPORT_SYMBOL(req_capsule_client_swab_get);
+
+/**
+ * Utility that combines req_capsule_set_size() and req_capsule_client_get().
+ *
+ * First the \a pill's request \a field's size is set (\a rc_area) using
+ * req_capsule_set_size() with the given \a len.  Then the actual buffer is
+ * fetched with __req_capsule_get() for RCL_CLIENT (no swabber, no dump) and
+ * returned.
+ */
+void *req_capsule_client_sized_get(struct req_capsule *pill,
+				   const struct req_msg_field *field,
+				   int len)
+{
+	/* the size must be recorded before the buffer is looked up */
+	req_capsule_set_size(pill, field, RCL_CLIENT, len);
+	return __req_capsule_get(pill, field, RCL_CLIENT, NULL, 0);
+}
+EXPORT_SYMBOL(req_capsule_client_sized_get);
+
+/**
+ * Trivial wrapper around __req_capsule_get(), that returns the PTLRPC reply
+ * buffer corresponding to the given RMF (\a field) of a \a pill.
+ *
+ * No swabber is supplied (NULL) and no dump is requested (0).  The result is
+ * whatever __req_capsule_get() returns, which is NULL when the buffer could
+ * not be obtained.
+ */
+void *req_capsule_server_get(struct req_capsule *pill,
+			     const struct req_msg_field *field)
+{
+	return __req_capsule_get(pill, field, RCL_SERVER, NULL, 0);
+}
+EXPORT_SYMBOL(req_capsule_server_get);
+
+/**
+ * Same as req_capsule_server_get(), but with a \a swabber argument that is
+ * handed through to __req_capsule_get() unchanged.
+ *
+ * Ideally all swabbing should be done pursuant to RMF definitions, with no
+ * swabbing done outside this capsule abstraction.
+ */
+void *req_capsule_server_swab_get(struct req_capsule *pill,
+				  const struct req_msg_field *field,
+				  void *swabber)
+{
+	return __req_capsule_get(pill, field, RCL_SERVER, swabber, 0);
+}
+EXPORT_SYMBOL(req_capsule_server_swab_get);
+
+/**
+ * Utility that combines req_capsule_set_size() and req_capsule_server_get().
+ *
+ * First the \a pill's reply \a field's size is set (\a rc_area) using
+ * req_capsule_set_size() with the given \a len.  Then the actual buffer is
+ * fetched with __req_capsule_get() for RCL_SERVER (no swabber, no dump) and
+ * returned.
+ */
+void *req_capsule_server_sized_get(struct req_capsule *pill,
+				   const struct req_msg_field *field,
+				   int len)
+{
+	/* the size must be recorded before the buffer is looked up */
+	req_capsule_set_size(pill, field, RCL_SERVER, len);
+	return __req_capsule_get(pill, field, RCL_SERVER, NULL, 0);
+}
+EXPORT_SYMBOL(req_capsule_server_sized_get);
+
+/**
+ * Same as req_capsule_server_sized_get(), but with a \a swabber argument.
+ *
+ * The reply \a field's size is first set to \a len with
+ * req_capsule_set_size(); the buffer is then fetched with
+ * __req_capsule_get() for RCL_SERVER, passing \a swabber through.
+ */
+void *req_capsule_server_sized_swab_get(struct req_capsule *pill,
+					const struct req_msg_field *field,
+					int len, void *swabber)
+{
+	req_capsule_set_size(pill, field, RCL_SERVER, len);
+	return __req_capsule_get(pill, field, RCL_SERVER, swabber, 0);
+}
+EXPORT_SYMBOL(req_capsule_server_sized_swab_get);
+
+/**
+ * Returns the buffer of a \a pill corresponding to the given \a field from the
+ * request (if the caller is executing on the server-side) or reply (if the
+ * caller is executing on the client-side).
+ *
+ * This function is convenient for use in code that could be executed on the
+ * client and server alike.
+ */
+const void *req_capsule_other_get(struct req_capsule *pill,
+				  const struct req_msg_field *field)
+{
+	/*
+	 * rc_loc ^ 1 flips the low bit of the pill's own location.  This
+	 * presumably relies on RCL_CLIENT and RCL_SERVER being 0 and 1 -
+	 * TODO confirm against the enum definition.
+	 */
+	return __req_capsule_get(pill, field, pill->rc_loc ^ 1, NULL, 0);
+}
+EXPORT_SYMBOL(req_capsule_other_get);
+
+/**
+ * Record \a size as the length of the PTLRPC request/reply (\a loc) buffer
+ * backing the given \a field of the given \a pill.
+ *
+ * This function must be used when constructing variable sized fields of a
+ * request or reply.  The value is stored in the pill's rc_area, at the
+ * field's offset for \a loc.
+ *
+ * A positive \a size that differs from a fixed rmf_size is sanity checked
+ * first, unless the field carries RMF_F_NO_SIZE_CHECK: struct arrays must be
+ * a whole multiple of rmf_size, any other field must not be smaller than
+ * rmf_size.  A violation is logged and then runs into LBUG().
+ */
+void req_capsule_set_size(struct req_capsule *pill,
+			  const struct req_msg_field *field,
+			  enum req_location loc, int size)
+{
+	int fixed;
+	int checked;
+
+	LASSERT(loc == RCL_SERVER || loc == RCL_CLIENT);
+
+	fixed = field->rmf_size;
+	checked = size > 0 && size != fixed && fixed != -1 &&
+		  !(field->rmf_flags & RMF_F_NO_SIZE_CHECK);
+
+	if (checked && (field->rmf_flags & RMF_F_STRUCT_ARRAY)) {
+		/* arrays: only a whole number of elements is acceptable */
+		if (size % fixed != 0) {
+			CERROR("%s: array field size mismatch "
+			       "%d %% %d != 0 (%d)\n",
+			       field->rmf_name, size, fixed, loc);
+			LBUG();
+		}
+	} else if (checked && size < fixed) {
+		/* plain fields: may grow, but never shrink below rmf_size */
+		CERROR("%s: field size mismatch %d != %d (%d)\n",
+		       field->rmf_name, size, fixed, loc);
+		LBUG();
+	}
+
+	pill->rc_area[loc][__req_capsule_offset(pill, field, loc)] = size;
+}
+EXPORT_SYMBOL(req_capsule_set_size);
+
+/**
+ * Return the actual PTLRPC buffer length of a request or reply (\a loc)
+ * for the given \a pill's given \a field.
+ *
+ * The length is read from the message itself via lustre_msg_buflen(), at the
+ * field's offset for \a loc.
+ *
+ * NB: this function doesn't correspond with req_capsule_set_size(), which
+ * actually sets the size in pill.rc_area[loc][offset], but this function
+ * returns the message buflen[offset], maybe we should use another name.
+ */
+int req_capsule_get_size(const struct req_capsule *pill,
+			 const struct req_msg_field *field,
+			 enum req_location loc)
+{
+	LASSERT(loc == RCL_SERVER || loc == RCL_CLIENT);
+
+	return lustre_msg_buflen(__req_msg(pill, loc),
+				 __req_capsule_offset(pill, field, loc));
+}
+EXPORT_SYMBOL(req_capsule_get_size);
+
+/**
+ * Wrapper around lustre_msg_size() that returns the PTLRPC size needed for the
+ * given \a pill's request or reply (\a loc) given the field size recorded in
+ * the \a pill's rc_area.
+ *
+ * The message magic is taken from the import of the pill's request, and the
+ * number of buffers from the pill's format for \a loc.
+ *
+ * See also req_capsule_set_size().
+ */
+int req_capsule_msg_size(struct req_capsule *pill, enum req_location loc)
+{
+	return lustre_msg_size(pill->rc_req->rq_import->imp_msg_magic,
+			       pill->rc_fmt->rf_fields[loc].nr,
+			       pill->rc_area[loc]);
+}
+
+/**
+ * Compute the size of a PTLRPC request or reply (\a loc) from an RQF (\a fmt)
+ * alone, whereas req_capsule_msg_size() works from the sizes recorded in a
+ * pill's \a rc_area.
+ *
+ * The result is the header size reported by lustre_msg_hdr_size() plus the
+ * rounded rmf_size of every field whose rmf_size is not -1.  A negative
+ * header size is returned to the caller as is.
+ *
+ * This function should not be used for formats which contain variable size
+ * fields.
+ */
+int req_capsule_fmt_size(__u32 magic, const struct req_format *fmt,
+			 enum req_location loc)
+{
+	int total;
+	int idx;
+
+	/*
+	 * This function should probably LASSERT() that fmt has no fields with
+	 * RMF_F_STRUCT_ARRAY in rmf_flags, since we can't know here how many
+	 * elements in the array there will ultimately be, but then, we could
+	 * assume that there will be at least one element, and that's just what
+	 * we do.
+	 */
+	total = lustre_msg_hdr_size(magic, fmt->rf_fields[loc].nr);
+	if (total < 0)
+		return total;
+
+	for (idx = 0; idx < fmt->rf_fields[loc].nr; ++idx) {
+		/* fields with rmf_size == -1 contribute nothing */
+		if (fmt->rf_fields[loc].d[idx]->rmf_size == -1)
+			continue;
+		total += cfs_size_round(fmt->rf_fields[loc].d[idx]->rmf_size);
+	}
+
+	return total;
+}
+
+/**
+ * Changes the format of an RPC.
+ *
+ * The pill must already have been initialized, which means that it already has
+ * a request format.  The new format \a fmt must be an extension of the pill's
+ * old format.  Specifically: the new format must have at least as many request
+ * and reply fields as the old one, and all fields shared by the old and new
+ * format must be at least as large in the new format.
+ *
+ * The new format's fields may be of different "type" than the old format, but
+ * only for fields that are "opaque" blobs: fields which a) have no
+ * \a rmf_swabber, b) have \a rmf_flags == 0 or RMF_F_NO_SIZE_CHECK, and c)
+ * have \a rmf_size == -1 or \a rmf_flags == RMF_F_NO_SIZE_CHECK.  For example,
+ * OBD_SET_INFO has a key field and an opaque value field that gets interpreted
+ * according to the key field.  When the value, according to the key, contains a
+ * structure (or array thereof) to be swabbed, the format should be changed to
+ * one where the value field has \a rmf_size/rmf_flags/rmf_swabber set
+ * accordingly.
+ *
+ * All of the above is enforced with LASSERT(); on success the pill's rc_fmt
+ * is simply replaced by \a fmt.
+ */
+void req_capsule_extend(struct req_capsule *pill, const struct req_format *fmt)
+{
+	int i;
+	int j;
+
+	const struct req_format *old;
+
+	LASSERT(pill->rc_fmt != NULL);
+	LASSERT(__req_format_is_sane(fmt));
+
+	old = pill->rc_fmt;
+	/*
+	 * Sanity checking...
+	 */
+	for (i = 0; i < RCL_NR; ++i) {
+		LASSERT(fmt->rf_fields[i].nr >= old->rf_fields[i].nr);
+		/* every old field except the last must be unchanged or opaque */
+		for (j = 0; j < old->rf_fields[i].nr - 1; ++j) {
+			const struct req_msg_field *ofield = FMT_FIELD(old, i, j);
+
+			/* "opaque" fields can be transmogrified */
+			if (ofield->rmf_swabber == NULL &&
+			    (ofield->rmf_flags & ~RMF_F_NO_SIZE_CHECK) == 0 &&
+			    (ofield->rmf_size == -1 ||
+			    ofield->rmf_flags == RMF_F_NO_SIZE_CHECK))
+				continue;
+			LASSERT(FMT_FIELD(fmt, i, j) == FMT_FIELD(old, i, j));
+		}
+		/*
+		 * Last field in old format can be shorter than in new.
+		 *
+		 * j is left at the index of the old format's last field by
+		 * the loop above.  NOTE(review): with an old format that has
+		 * no fields for this location j stays 0 and field 0 is still
+		 * looked up - presumably every format has at least one field
+		 * per location; confirm.
+		 */
+		LASSERT(FMT_FIELD(fmt, i, j)->rmf_size >=
+			FMT_FIELD(old, i, j)->rmf_size);
+	}
+
+	pill->rc_fmt = fmt;
+}
+EXPORT_SYMBOL(req_capsule_extend);
+
+/**
+ * This function returns a non-zero value if the given \a field is present in
+ * the format (\a rc_fmt) of \a pill's PTLRPC request or reply (\a loc), else it
+ * returns 0.
+ *
+ * The value returned is the field's rmf_offset entry for the pill's format
+ * index (rf_idx) and \a loc.
+ */
+int req_capsule_has_field(const struct req_capsule *pill,
+			  const struct req_msg_field *field,
+			  enum req_location loc)
+{
+	LASSERT(loc == RCL_SERVER || loc == RCL_CLIENT);
+
+	return field->rmf_offset[pill->rc_fmt->rf_idx][loc];
+}
+EXPORT_SYMBOL(req_capsule_has_field);
+
+/**
+ * Returns a non-zero value if the given \a field is present in the given \a
+ * pill's PTLRPC request or reply (\a loc), else it returns 0.
+ *
+ * Unlike req_capsule_has_field(), which looks at the format only, this checks
+ * the message itself: the field counts as present when the message has more
+ * buffers than the field's offset.  The field must be part of the pill's
+ * format; this is asserted.
+ */
+int req_capsule_field_present(const struct req_capsule *pill,
+			      const struct req_msg_field *field,
+			      enum req_location loc)
+{
+	int offset;
+
+	LASSERT(loc == RCL_SERVER || loc == RCL_CLIENT);
+	LASSERT(req_capsule_has_field(pill, field, loc));
+
+	offset = __req_capsule_offset(pill, field, loc);
+	return lustre_msg_bufcount(__req_msg(pill, loc)) > offset;
+}
+EXPORT_SYMBOL(req_capsule_field_present);
+
+/**
+ * This function shrinks the size of the _buffer_ of the \a pill's PTLRPC
+ * request or reply (\a loc).
+ *
+ * The buffer backing \a field is shrunk to \a newlen with
+ * lustre_shrink_msg(), and the value it returns is stored as the request's
+ * rq_reqlen (RCL_CLIENT) or rq_replen (any other \a loc).  \a newlen must not
+ * exceed the current buffer length, and the field must be both part of the
+ * format and present in the message; all of this is asserted.
+ *
+ * This is not the opposite of req_capsule_extend().
+ */
+void req_capsule_shrink(struct req_capsule *pill,
+			const struct req_msg_field *field,
+			unsigned int newlen,
+			enum req_location loc)
+{
+	const struct req_format *fmt;
+	struct lustre_msg       *msg;
+	int		      len;
+	int		      offset;
+
+	fmt = pill->rc_fmt;
+	LASSERT(fmt != NULL);
+	LASSERT(__req_format_is_sane(fmt));
+	LASSERT(req_capsule_has_field(pill, field, loc));
+	LASSERT(req_capsule_field_present(pill, field, loc));
+
+	offset = __req_capsule_offset(pill, field, loc);
+
+	msg = __req_msg(pill, loc);
+	/* current length of the field's buffer, used only for the check */
+	len = lustre_msg_buflen(msg, offset);
+	LASSERTF(newlen <= len, "%s:%s, oldlen=%d, newlen=%d\n",
+				fmt->rf_name, field->rmf_name, len, newlen);
+
+	if (loc == RCL_CLIENT)
+		pill->rc_req->rq_reqlen = lustre_shrink_msg(msg, offset, newlen,
+							    1);
+	else
+		pill->rc_req->rq_replen = lustre_shrink_msg(msg, offset, newlen,
+							    1);
+}
+EXPORT_SYMBOL(req_capsule_shrink);
+
+/**
+ * Re-pack the reply of \a pill's request so that the buffer backing \a field
+ * has size \a newlen.
+ *
+ * The request's current reply state is detached, the new size is recorded
+ * with req_capsule_set_size() and a fresh reply is packed with
+ * req_capsule_server_pack().  The buffers in front of \a field and the
+ * buffers behind it are then copied from the old reply message to the new
+ * one; the old contents of \a field itself are not copied.  For a
+ * "difficult" old reply state the rs_no_ack flag and the saved locks are
+ * moved over to the new one.  Finally the reference on the old reply state
+ * is dropped.
+ *
+ * \retval 0 on success
+ * \retval the error from req_capsule_server_pack() otherwise; the old reply
+ *	   state is put back on the request in that case
+ */
+int req_capsule_server_grow(struct req_capsule *pill,
+			    const struct req_msg_field *field,
+			    unsigned int newlen)
+{
+	struct ptlrpc_reply_state *rs = pill->rc_req->rq_reply_state, *nrs;
+	char *from, *to;
+	int offset, len, rc;
+
+	LASSERT(pill->rc_fmt != NULL);
+	LASSERT(__req_format_is_sane(pill->rc_fmt));
+	LASSERT(req_capsule_has_field(pill, field, RCL_SERVER));
+	LASSERT(req_capsule_field_present(pill, field, RCL_SERVER));
+
+	len = req_capsule_get_size(pill, field, RCL_SERVER);
+	offset = __req_capsule_offset(pill, field, RCL_SERVER);
+	/* only reported: the reply is re-packed below in either case */
+	if (pill->rc_req->rq_repbuf_len >=
+	    lustre_packed_msg_size(pill->rc_req->rq_repmsg) - len + newlen)
+		CERROR("Inplace repack might be done\n");
+
+	/* detach the old reply state; rs still points to it */
+	pill->rc_req->rq_reply_state = NULL;
+	req_capsule_set_size(pill, field, RCL_SERVER, newlen);
+	rc = req_capsule_server_pack(pill);
+	if (rc) {
+		/* put old rs back, the caller will decide what to do */
+		pill->rc_req->rq_reply_state = rs;
+		return rc;
+	}
+	nrs = pill->rc_req->rq_reply_state;
+	/* Now we need only buffers, copy first chunk */
+	to = lustre_msg_buf(nrs->rs_msg, 0, 0);
+	from = lustre_msg_buf(rs->rs_msg, 0, 0);
+	/* bytes from the start of buffer 0 up to the start of the field */
+	len = (char *)lustre_msg_buf(rs->rs_msg, offset, 0) - from;
+	memcpy(to, from, len);
+	/* check if we have tail and copy it too */
+	if (rs->rs_msg->lm_bufcount > offset + 1) {
+		to = lustre_msg_buf(nrs->rs_msg, offset + 1, 0);
+		from = lustre_msg_buf(rs->rs_msg, offset + 1, 0);
+		/* offset is reused as the index of the last old buffer */
+		offset = rs->rs_msg->lm_bufcount - 1;
+		len = (char *)lustre_msg_buf(rs->rs_msg, offset, 0) +
+		      cfs_size_round(rs->rs_msg->lm_buflens[offset]) - from;
+		memcpy(to, from, len);
+	}
+	/* drop old reply if everything is fine */
+	if (rs->rs_difficult) {
+		/* copy rs data */
+		int i;
+
+		nrs->rs_difficult = 1;
+		nrs->rs_no_ack = rs->rs_no_ack;
+		for (i = 0; i < rs->rs_nlocks; i++) {
+			nrs->rs_locks[i] = rs->rs_locks[i];
+			nrs->rs_modes[i] = rs->rs_modes[i];
+			nrs->rs_nlocks++;
+		}
+		/* the old state no longer owns any of the locks */
+		rs->rs_nlocks = 0;
+		rs->rs_difficult = 0;
+		rs->rs_no_ack = 0;
+	}
+	ptlrpc_rs_decref(rs);
+	return 0;
+}
+EXPORT_SYMBOL(req_capsule_server_grow);
+/* __REQ_LAYOUT_USER__ */
+#endif

diff --git a/drivers/staging/lustre/lustre/ptlrpc/llog_client.c b/drivers/staging/lustre/lustre/ptlrpc/llog_client.c
new file mode 100644
index 0000000..367ca8e
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/llog_client.c

@@ -0,0 +1,354 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/llog_client.c
+ *
+ * remote api for llog - client side
+ *
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOG
+
+#include <linux/libcfs/libcfs.h>
+
+#include <obd_class.h>
+#include <lustre_log.h>
+#include <lustre_net.h>
+#include <linux/list.h>
+
+/*
+ * Take a reference on the import of the llog context \a ctxt and store it in
+ * \a imp, under the context's loc_mutex.
+ *
+ * NB: if the context has no import, an error is logged, \a imp is set to
+ * NULL and the macro makes the CALLING function return -EINVAL.
+ * Every successful use must be paired with LLOG_CLIENT_EXIT().
+ */
+#define LLOG_CLIENT_ENTRY(ctxt, imp) do {			     \
+	mutex_lock(&(ctxt)->loc_mutex);				     \
+	if ((ctxt)->loc_imp) {					     \
+		(imp) = class_import_get((ctxt)->loc_imp);	     \
+	} else {						     \
+		CERROR("ctxt->loc_imp == NULL for context idx %d. "  \
+		       "Unable to complete MDS/OSS recovery, "	     \
+		       "but I'll try again next time.  Not fatal.\n", \
+		       (ctxt)->loc_idx);			     \
+		(imp) = NULL;					     \
+		mutex_unlock(&(ctxt)->loc_mutex);		     \
+		return (-EINVAL);				     \
+	}							     \
+	mutex_unlock(&(ctxt)->loc_mutex);			     \
+} while(0)
+
+/*
+ * Counterpart of LLOG_CLIENT_ENTRY(): drop the reference on \a imp under the
+ * loc_mutex of \a ctxt.  A warning is logged when the context's import is no
+ * longer \a imp, i.e. it changed from \a imp to the current ctxt->loc_imp
+ * since the reference was taken.
+ */
+#define LLOG_CLIENT_EXIT(ctxt, imp) do {			      \
+	mutex_lock(&(ctxt)->loc_mutex);				      \
+	if ((ctxt)->loc_imp != (imp))				      \
+		CWARN("loc_imp has changed from %p to %p\n",	      \
+		       (imp), (ctxt)->loc_imp);			      \
+	class_import_put(imp);					      \
+	mutex_unlock(&(ctxt)->loc_mutex);			      \
+} while(0)
+
+/* This is a callback from the llog_* functions.
+ * Assumes caller has already pushed us into the kernel context.
+ *
+ * Sends an LLOG_ORIGIN_HANDLE_CREATE request, carrying \a logid and/or
+ * \a name when they are given, and on success stores the log id from the
+ * reply and the context in \a lgh.  LLOG_OPEN_NEW is not allowed here.
+ *
+ * Returns 0 on success, -EINVAL (from LLOG_CLIENT_ENTRY) if the context has
+ * no import, -ENOMEM if the request cannot be allocated, the error from
+ * ptlrpc_request_pack() or ptlrpc_queue_wait(), or -EFAULT if the reply
+ * carries no RMF_LLOGD_BODY. */
+static int llog_client_open(const struct lu_env *env,
+			    struct llog_handle *lgh, struct llog_logid *logid,
+			    char *name, enum llog_open_param open_param)
+{
+	struct obd_import     *imp;
+	struct llogd_body     *body;
+	struct llog_ctxt      *ctxt;
+	struct ptlrpc_request *req = NULL;
+	int		    rc;
+	ENTRY;
+
+	/* check the handle before it is dereferenced, not after */
+	LASSERT(lgh);
+	ctxt = lgh->lgh_ctxt;
+
+	LLOG_CLIENT_ENTRY(ctxt, imp);
+
+	/* client cannot create llog */
+	LASSERTF(open_param != LLOG_OPEN_NEW, "%#x\n", open_param);
+
+	req = ptlrpc_request_alloc(imp, &RQF_LLOG_ORIGIN_HANDLE_CREATE);
+	if (req == NULL)
+		GOTO(out, rc = -ENOMEM);
+
+	/* the name buffer is variable sized: size it before packing */
+	if (name)
+		req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT,
+				     strlen(name) + 1);
+
+	rc = ptlrpc_request_pack(req, LUSTRE_LOG_VERSION,
+				 LLOG_ORIGIN_HANDLE_CREATE);
+	if (rc) {
+		/* freed here, so "out" must not see the request again */
+		ptlrpc_request_free(req);
+		req = NULL;
+		GOTO(out, rc);
+	}
+	ptlrpc_request_set_replen(req);
+
+	body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY);
+	if (logid)
+		body->lgd_logid = *logid;
+	body->lgd_ctxt_idx = ctxt->loc_idx - 1;
+
+	if (name) {
+		char *tmp;
+		tmp = req_capsule_client_sized_get(&req->rq_pill, &RMF_NAME,
+						   strlen(name) + 1);
+		LASSERT(tmp);
+		/* tmp was sized to strlen(name) + 1 just above */
+		strcpy(tmp, name);
+	}
+
+	rc = ptlrpc_queue_wait(req);
+	if (rc)
+		GOTO(out, rc);
+
+	body = req_capsule_server_get(&req->rq_pill, &RMF_LLOGD_BODY);
+	if (body == NULL)
+		GOTO(out, rc = -EFAULT);
+
+	lgh->lgh_id = body->lgd_logid;
+	lgh->lgh_ctxt = ctxt;
+	EXIT;
+out:
+	LLOG_CLIENT_EXIT(ctxt, imp);
+	ptlrpc_req_finished(req);
+	return rc;
+}
+
+/*
+ * Send an LLOG_ORIGIN_HANDLE_DESTROY request for the log identified by
+ * loghandle->lgh_id, passing along the flags of the log header.
+ *
+ * Returns -EINVAL (from LLOG_CLIENT_ENTRY) if the context has no import,
+ * -ENOMEM if the request cannot be allocated and packed, otherwise the
+ * result of ptlrpc_queue_wait().
+ */
+static int llog_client_destroy(const struct lu_env *env,
+			       struct llog_handle *loghandle)
+{
+	struct obd_import     *imp;
+	struct ptlrpc_request *req = NULL;
+	struct llogd_body     *body;
+	int		    rc;
+	ENTRY;
+
+	LLOG_CLIENT_ENTRY(loghandle->lgh_ctxt, imp);
+	req = ptlrpc_request_alloc_pack(imp, &RQF_LLOG_ORIGIN_HANDLE_DESTROY,
+					LUSTRE_LOG_VERSION,
+					LLOG_ORIGIN_HANDLE_DESTROY);
+	if (req == NULL)
+		GOTO(err_exit, rc =-ENOMEM);
+
+	body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY);
+	body->lgd_logid = loghandle->lgh_id;
+	body->lgd_llh_flags = loghandle->lgh_hdr->llh_flags;
+
+	/* only reported: the request is sent regardless of the flags */
+	if (!(body->lgd_llh_flags & LLOG_F_IS_PLAIN))
+		CERROR("%s: wrong llog flags %x\n", imp->imp_obd->obd_name,
+		       body->lgd_llh_flags);
+
+	ptlrpc_request_set_replen(req);
+	rc = ptlrpc_queue_wait(req);
+
+	ptlrpc_req_finished(req);
+err_exit:
+	LLOG_CLIENT_EXIT(loghandle->lgh_ctxt, imp);
+	RETURN(rc);
+}
+
+
+/*
+ * Fetch a block of log records from the server with an
+ * LLOG_ORIGIN_HANDLE_NEXT_BLOCK request.
+ *
+ * The request carries the log id, \a next_idx, the current index and offset
+ * (\a *cur_idx, \a *cur_offset) and \a len.  On success \a *cur_idx and
+ * \a *cur_offset are updated from the reply body and \a len bytes of the
+ * reply's RMF_EADATA buffer are copied to \a buf.
+ *
+ * Returns 0 on success, -EINVAL (from LLOG_CLIENT_ENTRY) if the context has
+ * no import, -ENOMEM if the request cannot be allocated and packed, the
+ * error from ptlrpc_queue_wait(), or -EFAULT if the reply lacks the body or
+ * the data buffer.
+ */
+static int llog_client_next_block(const struct lu_env *env,
+				  struct llog_handle *loghandle,
+				  int *cur_idx, int next_idx,
+				  __u64 *cur_offset, void *buf, int len)
+{
+	struct obd_import     *imp;
+	struct ptlrpc_request *req = NULL;
+	struct llogd_body     *body;
+	void		  *ptr;
+	int		    rc;
+	ENTRY;
+
+	LLOG_CLIENT_ENTRY(loghandle->lgh_ctxt, imp);
+	req = ptlrpc_request_alloc_pack(imp, &RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK,
+					LUSTRE_LOG_VERSION,
+					LLOG_ORIGIN_HANDLE_NEXT_BLOCK);
+	if (req == NULL)
+		GOTO(err_exit, rc =-ENOMEM);
+
+	body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY);
+	body->lgd_logid = loghandle->lgh_id;
+	body->lgd_ctxt_idx = loghandle->lgh_ctxt->loc_idx - 1;
+	body->lgd_llh_flags = loghandle->lgh_hdr->llh_flags;
+	body->lgd_index = next_idx;
+	body->lgd_saved_index = *cur_idx;
+	body->lgd_len = len;
+	body->lgd_cur_offset = *cur_offset;
+
+	/* the reply data buffer is sized to the len bytes copied out below */
+	req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_SERVER, len);
+	ptlrpc_request_set_replen(req);
+	rc = ptlrpc_queue_wait(req);
+	if (rc)
+		GOTO(out, rc);
+
+	body = req_capsule_server_get(&req->rq_pill, &RMF_LLOGD_BODY);
+	if (body == NULL)
+		GOTO(out, rc =-EFAULT);
+
+	/* The log records are swabbed as they are processed */
+	ptr = req_capsule_server_get(&req->rq_pill, &RMF_EADATA);
+	if (ptr == NULL)
+		GOTO(out, rc =-EFAULT);
+
+	*cur_idx = body->lgd_saved_index;
+	*cur_offset = body->lgd_cur_offset;
+
+	memcpy(buf, ptr, len);
+	EXIT;
+out:
+	ptlrpc_req_finished(req);
+err_exit:
+	LLOG_CLIENT_EXIT(loghandle->lgh_ctxt, imp);
+	return rc;
+}
+
+/*
+ * Fetch a block of log records from the server with an
+ * LLOG_ORIGIN_HANDLE_PREV_BLOCK request for index \a prev_idx.
+ *
+ * On success \a len bytes of the reply's RMF_EADATA buffer are copied to
+ * \a buf.
+ *
+ * Returns 0 on success, -EINVAL (from LLOG_CLIENT_ENTRY) if the context has
+ * no import, -ENOMEM if the request cannot be allocated and packed, the
+ * error from ptlrpc_queue_wait(), or -EFAULT if the reply lacks the body or
+ * the data buffer.
+ */
+static int llog_client_prev_block(const struct lu_env *env,
+				  struct llog_handle *loghandle,
+				  int prev_idx, void *buf, int len)
+{
+	struct obd_import     *imp;
+	struct ptlrpc_request *req = NULL;
+	struct llogd_body     *body;
+	void		  *ptr;
+	int		    rc;
+	ENTRY;
+
+	LLOG_CLIENT_ENTRY(loghandle->lgh_ctxt, imp);
+	req = ptlrpc_request_alloc_pack(imp, &RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK,
+					LUSTRE_LOG_VERSION,
+					LLOG_ORIGIN_HANDLE_PREV_BLOCK);
+	if (req == NULL)
+		GOTO(err_exit, rc = -ENOMEM);
+
+	body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY);
+	body->lgd_logid = loghandle->lgh_id;
+	body->lgd_ctxt_idx = loghandle->lgh_ctxt->loc_idx - 1;
+	body->lgd_llh_flags = loghandle->lgh_hdr->llh_flags;
+	body->lgd_index = prev_idx;
+	body->lgd_len = len;
+
+	/* the reply data buffer is sized to the len bytes copied out below */
+	req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_SERVER, len);
+	ptlrpc_request_set_replen(req);
+
+	rc = ptlrpc_queue_wait(req);
+	if (rc)
+		GOTO(out, rc);
+
+	/* the body is only checked for presence; nothing is read from it */
+	body = req_capsule_server_get(&req->rq_pill, &RMF_LLOGD_BODY);
+	if (body == NULL)
+		GOTO(out, rc =-EFAULT);
+
+	ptr = req_capsule_server_get(&req->rq_pill, &RMF_EADATA);
+	if (ptr == NULL)
+		GOTO(out, rc =-EFAULT);
+
+	memcpy(buf, ptr, len);
+	EXIT;
+out:
+	ptlrpc_req_finished(req);
+err_exit:
+	LLOG_CLIENT_EXIT(loghandle->lgh_ctxt, imp);
+	return rc;
+}
+
+/*
+ * Read the log header from the server with an
+ * LLOG_ORIGIN_HANDLE_READ_HEADER request.
+ *
+ * On success the reply's RMF_LLOG_LOG_HDR is copied over handle->lgh_hdr and
+ * handle->lgh_last_idx is set from the index in the header's tail.  The
+ * copied header is then checked for the expected magic and length.
+ *
+ * Returns 0 on success, -EINVAL (from LLOG_CLIENT_ENTRY) if the context has
+ * no import, -ENOMEM if the request cannot be allocated and packed, the
+ * error from ptlrpc_queue_wait(), -EFAULT if the reply has no header, or
+ * -EIO if the header fails the sanity checks (handle->lgh_hdr has already
+ * been overwritten in that case).
+ */
+static int llog_client_read_header(const struct lu_env *env,
+				   struct llog_handle *handle)
+{
+	struct obd_import     *imp;
+	struct ptlrpc_request *req = NULL;
+	struct llogd_body     *body;
+	struct llog_log_hdr   *hdr;
+	struct llog_rec_hdr   *llh_hdr;
+	int		    rc;
+	ENTRY;
+
+	LLOG_CLIENT_ENTRY(handle->lgh_ctxt, imp);
+	req = ptlrpc_request_alloc_pack(imp,&RQF_LLOG_ORIGIN_HANDLE_READ_HEADER,
+					LUSTRE_LOG_VERSION,
+					LLOG_ORIGIN_HANDLE_READ_HEADER);
+	if (req == NULL)
+		GOTO(err_exit, rc = -ENOMEM);
+
+	body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY);
+	body->lgd_logid = handle->lgh_id;
+	body->lgd_ctxt_idx = handle->lgh_ctxt->loc_idx - 1;
+	body->lgd_llh_flags = handle->lgh_hdr->llh_flags;
+
+	ptlrpc_request_set_replen(req);
+	rc = ptlrpc_queue_wait(req);
+	if (rc)
+		GOTO(out, rc);
+
+	hdr = req_capsule_server_get(&req->rq_pill, &RMF_LLOG_LOG_HDR);
+	if (hdr == NULL)
+		GOTO(out, rc =-EFAULT);
+
+	memcpy(handle->lgh_hdr, hdr, sizeof (*hdr));
+	handle->lgh_last_idx = handle->lgh_hdr->llh_tail.lrt_index;
+
+	/* sanity checks */
+	llh_hdr = &handle->lgh_hdr->llh_hdr;
+	if (llh_hdr->lrh_type != LLOG_HDR_MAGIC) {
+		CERROR("bad log header magic: %#x (expecting %#x)\n",
+		       llh_hdr->lrh_type, LLOG_HDR_MAGIC);
+		rc = -EIO;
+	} else if (llh_hdr->lrh_len != LLOG_CHUNK_SIZE) {
+		CERROR("incorrectly sized log header: %#x "
+		       "(expecting %#x)\n",
+		       llh_hdr->lrh_len, LLOG_CHUNK_SIZE);
+		CERROR("you may need to re-run lconf --write_conf.\n");
+		rc = -EIO;
+	}
+	EXIT;
+out:
+	ptlrpc_req_finished(req);
+err_exit:
+	LLOG_CLIENT_EXIT(handle->lgh_ctxt, imp);
+	return rc;
+}
+
+/*
+ * Close callback of the client side llog operations: nothing to do, both
+ * arguments are unused and the result is always 0.
+ */
+static int llog_client_close(const struct lu_env *env,
+			     struct llog_handle *handle)
+{
+	/*
+	 * No LLOG_ORIGIN_HANDLE_CLOSE is sent from here: the servers all
+	 * close the file at the end of every other LLOG_ RPC.
+	 */
+	return 0;
+}
+
+/*
+ * Client side llog operations: each callback is served by one of the
+ * llog_client_* functions of this file.  Operations that are not listed
+ * here are left unset.
+ */
+struct llog_operations llog_client_ops = {
+	.lop_open		= llog_client_open,
+	.lop_close		= llog_client_close,
+	.lop_destroy		= llog_client_destroy,
+	.lop_read_header	= llog_client_read_header,
+	.lop_next_block		= llog_client_next_block,
+	.lop_prev_block		= llog_client_prev_block,
+};
+EXPORT_SYMBOL(llog_client_ops);

diff --git a/drivers/staging/lustre/lustre/ptlrpc/llog_net.c b/drivers/staging/lustre/lustre/ptlrpc/llog_net.c
new file mode 100644
index 0000000..a81f557
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/llog_net.c

@@ -0,0 +1,75 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/llog_net.c
+ *
+ * OST<->MDS recovery logging infrastructure.
+ *
+ * Invariants in implementation:
+ * - we do not share logs among different OST<->MDS connections, so that
+ *   if an OST or MDS fails it need only look at log(s) relevant to itself
+ *
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOG
+
+#include <linux/libcfs/libcfs.h>
+
+#include <obd_class.h>
+#include <lustre_log.h>
+#include <linux/list.h>
+#include <lvfs.h>
+#include <lustre_fsfilt.h>
+
+/*
+ * Make the llog context \a ctxt use the client import of its obd
+ * (ctxt->loc_obd->u.cli.cl_import).
+ *
+ * The context must either have no import yet or already use that very
+ * import; this is asserted.  If the import differs, the reference on the old
+ * one (if any) is dropped and a reference on the new one is taken, under the
+ * context's loc_mutex.  Always returns 0.
+ */
+int llog_initiator_connect(struct llog_ctxt *ctxt)
+{
+	struct obd_import *new_imp;
+	ENTRY;
+
+	LASSERT(ctxt);
+	new_imp = ctxt->loc_obd->u.cli.cl_import;
+	/* NOTE(review): loc_imp is read here before loc_mutex is taken */
+	LASSERTF(ctxt->loc_imp == NULL || ctxt->loc_imp == new_imp,
+		 "%p - %p\n", ctxt->loc_imp, new_imp);
+	mutex_lock(&ctxt->loc_mutex);
+	if (ctxt->loc_imp != new_imp) {
+		if (ctxt->loc_imp)
+			class_import_put(ctxt->loc_imp);
+		ctxt->loc_imp = class_import_get(new_imp);
+	}
+	mutex_unlock(&ctxt->loc_mutex);
+	RETURN(0);
+}
+EXPORT_SYMBOL(llog_initiator_connect);

diff --git a/drivers/staging/lustre/lustre/ptlrpc/llog_server.c b/drivers/staging/lustre/lustre/ptlrpc/llog_server.c
new file mode 100644
index 0000000..bc1fcd8
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/llog_server.c

@@ -0,0 +1,466 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/llog_server.c
+ *
+ * remote api for llog - server side
+ *
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOG
+
+
+#include <obd_class.h>
+#include <lustre_log.h>
+#include <lustre_net.h>
+#include <lustre_fsfilt.h>
+
+#if  defined(LUSTRE_LOG_SERVER)
+/*
+ * Close a log handle, using the catalog close routine when the handle's
+ * header is present and flagged LLOG_F_IS_CAT, and the plain close routine
+ * otherwise.
+ */
+static int llog_origin_close(const struct lu_env *env, struct llog_handle *lgh)
+{
+	struct llog_log_hdr *hdr = lgh->lgh_hdr;
+
+	/* no header, or not a catalog: plain close */
+	if (hdr == NULL || !(hdr->llh_flags & LLOG_F_IS_CAT))
+		return llog_close(env, lgh);
+
+	return llog_cat_close(env, lgh);
+}
+
+/*
+ * Handle a remote llog open request.
+ *
+ * Only open is supported, no new llog can be created remotely: the log is
+ * opened with LLOG_OPEN_EXISTS.  The log is identified by the logid carried
+ * in the request body (used only when its object id is > 0) and/or by the
+ * optional RMF_NAME field.  On success the id of the opened log is copied
+ * into the reply body.  The handle is closed again before returning on both
+ * the success path and the reply-pack failure path.
+ *
+ * Returns 0 on success, -EFAULT when the request body or name cannot be
+ * fetched, -ENODEV when no llog context exists for the requested index, the
+ * llog_open() error, or -ENOMEM when the reply cannot be packed.
+ */
+int llog_origin_handle_open(struct ptlrpc_request *req)
+{
+	struct obd_export	*exp = req->rq_export;
+	struct obd_device	*obd = exp->exp_obd;
+	struct obd_device	*disk_obd;
+	struct lvfs_run_ctxt	 saved;
+	struct llog_handle	*loghandle;
+	struct llogd_body	*body;
+	struct llog_logid	*logid = NULL;
+	struct llog_ctxt	*ctxt;
+	char			*name = NULL;
+	int			 rc;
+
+	ENTRY;
+
+	body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY);
+	if (body == NULL)
+		RETURN(-EFAULT);
+
+	/* a logid is passed on only when it carries a non-zero object id */
+	if (ostid_id(&body->lgd_logid.lgl_oi) > 0)
+		logid = &body->lgd_logid;
+
+	/* the name field is optional; name stays NULL when it is absent */
+	if (req_capsule_field_present(&req->rq_pill, &RMF_NAME, RCL_CLIENT)) {
+		name = req_capsule_client_get(&req->rq_pill, &RMF_NAME);
+		if (name == NULL)
+			RETURN(-EFAULT);
+		CDEBUG(D_INFO, "%s: opening log %s\n", obd->obd_name, name);
+	}
+
+	ctxt = llog_get_context(obd, body->lgd_ctxt_idx);
+	if (ctxt == NULL) {
+		CDEBUG(D_WARNING, "%s: no ctxt. group=%p idx=%d name=%s\n",
+		       obd->obd_name, &obd->obd_olg, body->lgd_ctxt_idx, name);
+		RETURN(-ENODEV);
+	}
+	disk_obd = ctxt->loc_exp->exp_obd;
+	push_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL);
+
+	rc = llog_open(req->rq_svc_thread->t_env, ctxt, &loghandle, logid,
+		       name, LLOG_OPEN_EXISTS);
+	if (rc)
+		GOTO(out_pop, rc);
+
+	/* any reply-pack failure is reported to the caller as -ENOMEM */
+	rc = req_capsule_server_pack(&req->rq_pill);
+	if (rc)
+		GOTO(out_close, rc = -ENOMEM);
+
+	/* from here on body refers to the reply buffer, not the request */
+	body = req_capsule_server_get(&req->rq_pill, &RMF_LLOGD_BODY);
+	body->lgd_logid = loghandle->lgh_id;
+
+	EXIT;
+	/* the success path deliberately falls through the cleanup labels */
+out_close:
+	llog_origin_close(req->rq_svc_thread->t_env, loghandle);
+out_pop:
+	pop_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL);
+	llog_ctxt_put(ctxt);
+	return rc;
+}
+EXPORT_SYMBOL(llog_origin_handle_open);
+
+/*
+ * Handle a remote llog destroy request.
+ *
+ * Looks up the llog context named by the request body, packs the reply and,
+ * if packing succeeded, asks llog_erase() to erase the log identified by the
+ * request's logid.
+ *
+ * Returns -EFAULT when the request body cannot be fetched, -ENODEV when no
+ * llog context exists for the requested index, otherwise the result of
+ * req_capsule_server_pack() or llog_erase().
+ */
+int llog_origin_handle_destroy(struct ptlrpc_request *req)
+{
+	struct obd_device	*disk_obd;
+	struct lvfs_run_ctxt	 saved;
+	struct llogd_body	*body;
+	struct llog_logid	*logid = NULL;
+	struct llog_ctxt	*ctxt;
+	int			 rc;
+
+	ENTRY;
+
+	body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY);
+	if (body == NULL)
+		RETURN(-EFAULT);
+
+	/* a logid is passed on only when it carries a non-zero object id */
+	if (ostid_id(&body->lgd_logid.lgl_oi) > 0)
+		logid = &body->lgd_logid;
+
+	/* unexpected flags are only logged; the request is still processed */
+	if (!(body->lgd_llh_flags & LLOG_F_IS_PLAIN))
+		CERROR("%s: wrong llog flags %x\n",
+		       req->rq_export->exp_obd->obd_name, body->lgd_llh_flags);
+
+	ctxt = llog_get_context(req->rq_export->exp_obd, body->lgd_ctxt_idx);
+	if (ctxt == NULL)
+		RETURN(-ENODEV);
+
+	disk_obd = ctxt->loc_exp->exp_obd;
+	push_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL);
+
+	rc = req_capsule_server_pack(&req->rq_pill);
+	/*
+	 * Erase only if the reply was packed without error.  Note that logid
+	 * is still NULL here when the request carried no usable id, so
+	 * llog_erase() is presumably expected to cope with a NULL logid --
+	 * TODO confirm against its implementation.
+	 */
+	if (rc == 0)
+		rc = llog_erase(req->rq_svc_thread->t_env, ctxt, logid, NULL);
+	pop_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL);
+	llog_ctxt_put(ctxt);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_origin_handle_destroy);
+
+/*
+ * Handle a remote request for the next block of an existing llog.
+ *
+ * Opens the log named by the request's logid, initializes the handle with
+ * the flags supplied by the client, reserves LLOG_CHUNK_SIZE bytes of reply
+ * space in RMF_EADATA and lets llog_next_block() fill it.  The reply body
+ * starts as a copy of the request body; llog_next_block() is handed
+ * pointers to its lgd_saved_index and lgd_cur_offset fields.
+ *
+ * Returns 0 on success, -EFAULT when the request body cannot be fetched,
+ * -ENODEV when no llog context exists for the requested index, -ENOMEM when
+ * the reply cannot be packed, or the error of llog_open(),
+ * llog_init_handle() or llog_next_block().
+ */
+int llog_origin_handle_next_block(struct ptlrpc_request *req)
+{
+	struct obd_device   *disk_obd;
+	struct llog_handle  *loghandle;
+	struct llogd_body   *body;
+	struct llogd_body   *repbody;
+	struct lvfs_run_ctxt saved;
+	struct llog_ctxt    *ctxt;
+	__u32		flags;
+	void		*ptr;
+	int		  rc;
+
+	ENTRY;
+
+	body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY);
+	if (body == NULL)
+		RETURN(-EFAULT);
+
+	ctxt = llog_get_context(req->rq_export->exp_obd, body->lgd_ctxt_idx);
+	if (ctxt == NULL)
+		RETURN(-ENODEV);
+
+	disk_obd = ctxt->loc_exp->exp_obd;
+	push_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL);
+
+	rc = llog_open(req->rq_svc_thread->t_env, ctxt, &loghandle,
+		       &body->lgd_logid, NULL, LLOG_OPEN_EXISTS);
+	if (rc)
+		GOTO(out_pop, rc);
+
+	flags = body->lgd_llh_flags;
+	rc = llog_init_handle(req->rq_svc_thread->t_env, loghandle, flags,
+			      NULL);
+	if (rc)
+		GOTO(out_close, rc);
+
+	/* size the data buffer before packing the reply */
+	req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_SERVER,
+			     LLOG_CHUNK_SIZE);
+	/* any reply-pack failure is reported to the caller as -ENOMEM */
+	rc = req_capsule_server_pack(&req->rq_pill);
+	if (rc)
+		GOTO(out_close, rc = -ENOMEM);
+
+	repbody = req_capsule_server_get(&req->rq_pill, &RMF_LLOGD_BODY);
+	*repbody = *body;
+
+	ptr = req_capsule_server_get(&req->rq_pill, &RMF_EADATA);
+	rc = llog_next_block(req->rq_svc_thread->t_env, loghandle,
+			     &repbody->lgd_saved_index, repbody->lgd_index,
+			     &repbody->lgd_cur_offset, ptr, LLOG_CHUNK_SIZE);
+	if (rc)
+		GOTO(out_close, rc);
+	EXIT;
+	/* the success path deliberately falls through the cleanup labels */
+out_close:
+	llog_origin_close(req->rq_svc_thread->t_env, loghandle);
+out_pop:
+	pop_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL);
+	llog_ctxt_put(ctxt);
+	return rc;
+}
+EXPORT_SYMBOL(llog_origin_handle_next_block);
+
+/*
+ * Handle a remote request for the previous block of an existing llog.
+ *
+ * Opens the log named by the request's logid, initializes the handle with
+ * the flags supplied by the client, reserves LLOG_CHUNK_SIZE bytes of reply
+ * space in RMF_EADATA and lets llog_prev_block() fill it for the index given
+ * in the request body.  The reply body is a plain copy of the request body.
+ *
+ * Returns 0 on success, -EFAULT when the request body cannot be fetched,
+ * -ENODEV when no llog context exists for the requested index, -ENOMEM when
+ * the reply cannot be packed, or the error of llog_open(),
+ * llog_init_handle() or llog_prev_block().
+ */
+int llog_origin_handle_prev_block(struct ptlrpc_request *req)
+{
+	struct llog_handle   *loghandle;
+	struct llogd_body    *body;
+	struct llogd_body    *repbody;
+	struct obd_device    *disk_obd;
+	struct lvfs_run_ctxt  saved;
+	struct llog_ctxt     *ctxt;
+	__u32		 flags;
+	void		 *ptr;
+	int		   rc;
+
+	ENTRY;
+
+	body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY);
+	if (body == NULL)
+		RETURN(-EFAULT);
+
+	ctxt = llog_get_context(req->rq_export->exp_obd, body->lgd_ctxt_idx);
+	if (ctxt == NULL)
+		RETURN(-ENODEV);
+
+	disk_obd = ctxt->loc_exp->exp_obd;
+	push_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL);
+
+	rc = llog_open(req->rq_svc_thread->t_env, ctxt, &loghandle,
+			 &body->lgd_logid, NULL, LLOG_OPEN_EXISTS);
+	if (rc)
+		GOTO(out_pop, rc);
+
+	flags = body->lgd_llh_flags;
+	rc = llog_init_handle(req->rq_svc_thread->t_env, loghandle, flags,
+			      NULL);
+	if (rc)
+		GOTO(out_close, rc);
+
+	/* size the data buffer before packing the reply */
+	req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_SERVER,
+			     LLOG_CHUNK_SIZE);
+	/* any reply-pack failure is reported to the caller as -ENOMEM */
+	rc = req_capsule_server_pack(&req->rq_pill);
+	if (rc)
+		GOTO(out_close, rc = -ENOMEM);
+
+	repbody = req_capsule_server_get(&req->rq_pill, &RMF_LLOGD_BODY);
+	*repbody = *body;
+
+	ptr = req_capsule_server_get(&req->rq_pill, &RMF_EADATA);
+	rc = llog_prev_block(req->rq_svc_thread->t_env, loghandle,
+			     body->lgd_index, ptr, LLOG_CHUNK_SIZE);
+	if (rc)
+		GOTO(out_close, rc);
+
+	EXIT;
+	/* the success path deliberately falls through the cleanup labels */
+out_close:
+	llog_origin_close(req->rq_svc_thread->t_env, loghandle);
+out_pop:
+	pop_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL);
+	llog_ctxt_put(ctxt);
+	return rc;
+}
+EXPORT_SYMBOL(llog_origin_handle_prev_block);
+
+/*
+ * Handle a remote request to read the header of an existing llog.
+ *
+ * Opens the log named by the request's logid, initializes the handle with
+ * the flags supplied by the client (which reads the on-disk header) and
+ * copies that header into the RMF_LLOG_LOG_HDR field of the reply.
+ *
+ * Returns 0 on success, -EFAULT when the request body cannot be fetched,
+ * -ENODEV when no llog context exists for the requested index, -ENOMEM when
+ * the reply cannot be packed, or the error of llog_open() or
+ * llog_init_handle().
+ */
+int llog_origin_handle_read_header(struct ptlrpc_request *req)
+{
+	struct obd_device    *disk_obd;
+	struct llog_handle   *loghandle;
+	struct llogd_body    *body;
+	struct llog_log_hdr  *hdr;
+	struct lvfs_run_ctxt  saved;
+	struct llog_ctxt     *ctxt;
+	int		   rc;
+
+	ENTRY;
+
+	body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY);
+	if (body == NULL)
+		RETURN(-EFAULT);
+
+	ctxt = llog_get_context(req->rq_export->exp_obd, body->lgd_ctxt_idx);
+	if (ctxt == NULL)
+		RETURN(-ENODEV);
+
+	disk_obd = ctxt->loc_exp->exp_obd;
+	push_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL);
+
+	rc = llog_open(req->rq_svc_thread->t_env, ctxt, &loghandle,
+		       &body->lgd_logid, NULL, LLOG_OPEN_EXISTS);
+	if (rc)
+		GOTO(out_pop, rc);
+
+	/*
+	 * llog_init_handle() reads the llog header
+	 */
+	rc = llog_init_handle(req->rq_svc_thread->t_env, loghandle,
+			      body->lgd_llh_flags, NULL);
+	if (rc)
+		GOTO(out_close, rc);
+
+	/* any reply-pack failure is reported to the caller as -ENOMEM */
+	rc = req_capsule_server_pack(&req->rq_pill);
+	if (rc)
+		GOTO(out_close, rc = -ENOMEM);
+
+	hdr = req_capsule_server_get(&req->rq_pill, &RMF_LLOG_LOG_HDR);
+	*hdr = *loghandle->lgh_hdr;
+	EXIT;
+	/* the success path deliberately falls through the cleanup labels */
+out_close:
+	llog_origin_close(req->rq_svc_thread->t_env, loghandle);
+out_pop:
+	pop_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL);
+	llog_ctxt_put(ctxt);
+	return rc;
+}
+EXPORT_SYMBOL(llog_origin_handle_read_header);
+
+/*
+ * Handle a remote llog close request.
+ *
+ * The request is not inspected and no work is done; always returns 0.
+ */
+int llog_origin_handle_close(struct ptlrpc_request *req)
+{
+	ENTRY;
+	/* Nothing to do */
+	RETURN(0);
+}
+EXPORT_SYMBOL(llog_origin_handle_close);
+
+/*
+ * Handle a remote request to cancel a set of llog records.
+ *
+ * The request carries an array of llog cookies in RMF_LOGCOOKIES.  The llog
+ * context is selected by the lgc_subsys field of the first cookie.  Each
+ * cookie is then cancelled in its own fsfilt transaction against the
+ * context's catalog handle.
+ *
+ * Returns -EFAULT when no cookies were sent, -ENODEV when no llog context
+ * exists, the transaction start/commit error that aborted the loop, or
+ * otherwise the result of cancelling the last cookie processed.
+ *
+ * NOTE(review): when a cancel fails but its commit succeeds the loop goes
+ * on and rc is overwritten by the next iteration, so an earlier failure is
+ * counted in "failed" but may not be reflected in the return value --
+ * confirm whether that is intended.
+ */
+int llog_origin_handle_cancel(struct ptlrpc_request *req)
+{
+	int num_cookies, rc = 0, err, i, failed = 0;
+	struct obd_device *disk_obd;
+	struct llog_cookie *logcookies;
+	struct llog_ctxt *ctxt = NULL;
+	struct lvfs_run_ctxt saved;
+	struct llog_handle *cathandle;
+	struct inode *inode;
+	void *handle;
+	ENTRY;
+
+	logcookies = req_capsule_client_get(&req->rq_pill, &RMF_LOGCOOKIES);
+	/* the cookie count is derived from the size of the request field */
+	num_cookies = req_capsule_get_size(&req->rq_pill, &RMF_LOGCOOKIES,
+					   RCL_CLIENT) / sizeof(*logcookies);
+	if (logcookies == NULL || num_cookies == 0) {
+		DEBUG_REQ(D_HA, req, "No llog cookies sent");
+		RETURN(-EFAULT);
+	}
+
+	/* the first cookie's subsystem selects the context for all cookies */
+	ctxt = llog_get_context(req->rq_export->exp_obd,
+				logcookies->lgc_subsys);
+	if (ctxt == NULL)
+		RETURN(-ENODEV);
+
+	disk_obd = ctxt->loc_exp->exp_obd;
+	push_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL);
+	for (i = 0; i < num_cookies; i++, logcookies++) {
+		cathandle = ctxt->loc_handle;
+		LASSERT(cathandle != NULL);
+		inode = cathandle->lgh_file->f_dentry->d_inode;
+
+		/* one transaction per cookie */
+		handle = fsfilt_start_log(disk_obd, inode,
+					  FSFILT_OP_CANCEL_UNLINK, NULL, 1);
+		if (IS_ERR(handle)) {
+			CERROR("fsfilt_start_log() failed: %ld\n",
+			       PTR_ERR(handle));
+			GOTO(pop_ctxt, rc = PTR_ERR(handle));
+		}
+
+		rc = llog_cat_cancel_records(req->rq_svc_thread->t_env,
+					     cathandle, 1, logcookies);
+
+		/*
+		 * Do not raise -ENOENT errors for resent rpcs. This rec already
+		 * might be killed.
+		 */
+		if (rc == -ENOENT &&
+		    (lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT)) {
+			/*
+			 * Do not change this message, reply-single.sh test_59b
+			 * expects to find this in log.
+			 */
+			CDEBUG(D_RPCTRACE, "RESENT cancel req %p - ignored\n",
+			       req);
+			rc = 0;
+		} else if (rc == 0) {
+			/*
+			 * Emitted once per cookie, yet it prints the total
+			 * cookie count of the request.
+			 */
+			CDEBUG(D_RPCTRACE, "Canceled %d llog-records\n",
+			       num_cookies);
+		}
+
+		/* a commit failure aborts the loop; a cancel failure does not */
+		err = fsfilt_commit(disk_obd, inode, handle, 0);
+		if (err) {
+			CERROR("Error committing transaction: %d\n", err);
+			if (!rc)
+				rc = err;
+			failed++;
+			GOTO(pop_ctxt, rc);
+		} else if (rc)
+			failed++;
+	}
+	GOTO(pop_ctxt, rc);
+pop_ctxt:
+	pop_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL);
+	if (rc)
+		CERROR("Cancel %d of %d llog-records failed: %d\n",
+		       failed, num_cookies, rc);
+
+	llog_ctxt_put(ctxt);
+	return rc;
+}
+EXPORT_SYMBOL(llog_origin_handle_cancel);
+
+#else /* !LUSTRE_LOG_SERVER */
+/* Stub built when LUSTRE_LOG_SERVER is not defined: calls LBUG(). */
+int llog_origin_handle_open(struct ptlrpc_request *req)
+{
+	LBUG();
+	return 0;
+}
+
+/* Stub built when LUSTRE_LOG_SERVER is not defined: calls LBUG(). */
+int llog_origin_handle_destroy(struct ptlrpc_request *req)
+{
+	LBUG();
+	return 0;
+}
+
+/* Stub built when LUSTRE_LOG_SERVER is not defined: calls LBUG(). */
+int llog_origin_handle_next_block(struct ptlrpc_request *req)
+{
+	LBUG();
+	return 0;
+}
+/* Stub built when LUSTRE_LOG_SERVER is not defined: calls LBUG(). */
+int llog_origin_handle_prev_block(struct ptlrpc_request *req)
+{
+	LBUG();
+	return 0;
+}
+/* Stub built when LUSTRE_LOG_SERVER is not defined: calls LBUG(). */
+int llog_origin_handle_read_header(struct ptlrpc_request *req)
+{
+	LBUG();
+	return 0;
+}
+/* Stub built when LUSTRE_LOG_SERVER is not defined: calls LBUG(). */
+int llog_origin_handle_close(struct ptlrpc_request *req)
+{
+	LBUG();
+	return 0;
+}
+/* Stub built when LUSTRE_LOG_SERVER is not defined: calls LBUG(). */
+int llog_origin_handle_cancel(struct ptlrpc_request *req)
+{
+	LBUG();
+	return 0;
+}
+#endif

diff --git a/drivers/staging/lustre/lustre/ptlrpc/lproc_ptlrpc.c b/drivers/staging/lustre/lustre/ptlrpc/lproc_ptlrpc.c
new file mode 100644
index 0000000..3e73254
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/lproc_ptlrpc.c

@@ -0,0 +1,1345 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+#define DEBUG_SUBSYSTEM S_CLASS
+
+
+#include <obd_support.h>
+#include <obd.h>
+#include <lprocfs_status.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_net.h>
+#include <obd_class.h>
+#include "ptlrpc_internal.h"
+
+
+/*
+ * Table of RPC opcodes and their printable names.
+ *
+ * ll_opcode2str() indexes this table with opcode_offset(opcode) and asserts
+ * that the entry found there carries the same opcode, so every entry must
+ * sit at the position opcode_offset() yields for it.  Retired opcodes keep
+ * their slot with a NULL name.
+ */
+struct ll_rpc_opcode {
+     __u32       opcode;
+     const char *opname;
+} ll_rpc_opcode_table[LUSTRE_MAX_OPCODES] = {
+	{ OST_REPLY,	"ost_reply" },
+	{ OST_GETATTR,      "ost_getattr" },
+	{ OST_SETATTR,      "ost_setattr" },
+	{ OST_READ,	 "ost_read" },
+	{ OST_WRITE,	"ost_write" },
+	{ OST_CREATE ,      "ost_create" },
+	{ OST_DESTROY,      "ost_destroy" },
+	{ OST_GET_INFO,     "ost_get_info" },
+	{ OST_CONNECT,      "ost_connect" },
+	{ OST_DISCONNECT,   "ost_disconnect" },
+	{ OST_PUNCH,	"ost_punch" },
+	{ OST_OPEN,	 "ost_open" },
+	{ OST_CLOSE,	"ost_close" },
+	{ OST_STATFS,       "ost_statfs" },
+	{ 14,		NULL },    /* formerly OST_SAN_READ */
+	{ 15,		NULL },    /* formerly OST_SAN_WRITE */
+	{ OST_SYNC,	 "ost_sync" },
+	{ OST_SET_INFO,     "ost_set_info" },
+	{ OST_QUOTACHECK,   "ost_quotacheck" },
+	{ OST_QUOTACTL,     "ost_quotactl" },
+	{ OST_QUOTA_ADJUST_QUNIT, "ost_quota_adjust_qunit" },
+	{ MDS_GETATTR,      "mds_getattr" },
+	{ MDS_GETATTR_NAME, "mds_getattr_lock" },
+	{ MDS_CLOSE,	"mds_close" },
+	{ MDS_REINT,	"mds_reint" },
+	{ MDS_READPAGE,     "mds_readpage" },
+	{ MDS_CONNECT,      "mds_connect" },
+	{ MDS_DISCONNECT,   "mds_disconnect" },
+	{ MDS_GETSTATUS,    "mds_getstatus" },
+	{ MDS_STATFS,       "mds_statfs" },
+	{ MDS_PIN,	  "mds_pin" },
+	{ MDS_UNPIN,	"mds_unpin" },
+	{ MDS_SYNC,	 "mds_sync" },
+	{ MDS_DONE_WRITING, "mds_done_writing" },
+	{ MDS_SET_INFO,     "mds_set_info" },
+	{ MDS_QUOTACHECK,   "mds_quotacheck" },
+	{ MDS_QUOTACTL,     "mds_quotactl" },
+	{ MDS_GETXATTR,     "mds_getxattr" },
+	{ MDS_SETXATTR,     "mds_setxattr" },
+	{ MDS_WRITEPAGE,    "mds_writepage" },
+	{ MDS_IS_SUBDIR,    "mds_is_subdir" },
+	{ MDS_GET_INFO,     "mds_get_info" },
+	{ MDS_HSM_STATE_GET, "mds_hsm_state_get" },
+	{ MDS_HSM_STATE_SET, "mds_hsm_state_set" },
+	{ MDS_HSM_ACTION,   "mds_hsm_action" },
+	{ MDS_HSM_PROGRESS, "mds_hsm_progress" },
+	{ MDS_HSM_REQUEST,  "mds_hsm_request" },
+	{ MDS_HSM_CT_REGISTER, "mds_hsm_ct_register" },
+	{ MDS_HSM_CT_UNREGISTER, "mds_hsm_ct_unregister" },
+	{ MDS_SWAP_LAYOUTS,	"mds_swap_layouts" },
+	{ LDLM_ENQUEUE,     "ldlm_enqueue" },
+	{ LDLM_CONVERT,     "ldlm_convert" },
+	{ LDLM_CANCEL,      "ldlm_cancel" },
+	{ LDLM_BL_CALLBACK, "ldlm_bl_callback" },
+	{ LDLM_CP_CALLBACK, "ldlm_cp_callback" },
+	{ LDLM_GL_CALLBACK, "ldlm_gl_callback" },
+	{ LDLM_SET_INFO,    "ldlm_set_info" },
+	{ MGS_CONNECT,      "mgs_connect" },
+	{ MGS_DISCONNECT,   "mgs_disconnect" },
+	{ MGS_EXCEPTION,    "mgs_exception" },
+	{ MGS_TARGET_REG,   "mgs_target_reg" },
+	{ MGS_TARGET_DEL,   "mgs_target_del" },
+	{ MGS_SET_INFO,     "mgs_set_info" },
+	{ MGS_CONFIG_READ,  "mgs_config_read" },
+	{ OBD_PING,	 "obd_ping" },
+	{ OBD_LOG_CANCEL,   "llog_origin_handle_cancel" },
+	{ OBD_QC_CALLBACK,  "obd_quota_callback" },
+	{ OBD_IDX_READ,	    "dt_index_read" },
+	{ LLOG_ORIGIN_HANDLE_CREATE,     "llog_origin_handle_create" },
+	{ LLOG_ORIGIN_HANDLE_NEXT_BLOCK, "llog_origin_handle_next_block" },
+	{ LLOG_ORIGIN_HANDLE_READ_HEADER,"llog_origin_handle_read_header" },
+	{ LLOG_ORIGIN_HANDLE_WRITE_REC,  "llog_origin_handle_write_rec" },
+	{ LLOG_ORIGIN_HANDLE_CLOSE,      "llog_origin_handle_close" },
+	{ LLOG_ORIGIN_CONNECT,	   "llog_origin_connect" },
+	{ LLOG_CATINFO,		  "llog_catinfo" },
+	{ LLOG_ORIGIN_HANDLE_PREV_BLOCK, "llog_origin_handle_prev_block" },
+	{ LLOG_ORIGIN_HANDLE_DESTROY,    "llog_origin_handle_destroy" },
+	{ QUOTA_DQACQ,      "quota_acquire" },
+	{ QUOTA_DQREL,      "quota_release" },
+	{ SEQ_QUERY,	"seq_query" },
+	{ SEC_CTX_INIT,     "sec_ctx_init" },
+	{ SEC_CTX_INIT_CONT,"sec_ctx_init_cont" },
+	{ SEC_CTX_FINI,     "sec_ctx_fini" },
+	{ FLD_QUERY,	"fld_query" },
+	{ UPDATE_OBJ,	    "update_obj" },
+};
+
+/*
+ * Table of "extra" opcodes and their printable names.
+ *
+ * ll_eopcode2str() indexes this table directly with the opcode and asserts
+ * that the entry found there carries the same opcode, so the entry for
+ * opcode N must be the N-th element.
+ */
+struct ll_eopcode {
+     __u32       opcode;
+     const char *opname;
+} ll_eopcode_table[EXTRA_LAST_OPC] = {
+	{ LDLM_GLIMPSE_ENQUEUE, "ldlm_glimpse_enqueue" },
+	{ LDLM_PLAIN_ENQUEUE,   "ldlm_plain_enqueue" },
+	{ LDLM_EXTENT_ENQUEUE,  "ldlm_extent_enqueue" },
+	{ LDLM_FLOCK_ENQUEUE,   "ldlm_flock_enqueue" },
+	{ LDLM_IBITS_ENQUEUE,   "ldlm_ibits_enqueue" },
+	{ MDS_REINT_SETATTR,    "mds_reint_setattr" },
+	{ MDS_REINT_CREATE,     "mds_reint_create" },
+	{ MDS_REINT_LINK,       "mds_reint_link" },
+	{ MDS_REINT_UNLINK,     "mds_reint_unlink" },
+	{ MDS_REINT_RENAME,     "mds_reint_rename" },
+	{ MDS_REINT_OPEN,       "mds_reint_open" },
+	{ MDS_REINT_SETXATTR,   "mds_reint_setxattr" },
+	{ BRW_READ_BYTES,       "read_bytes" },
+	{ BRW_WRITE_BYTES,      "write_bytes" },
+};
+
+/*
+ * Return the printable name stored in ll_rpc_opcode_table for \a opcode.
+ *
+ * The table slot is found with opcode_offset(); the function asserts that
+ * the slot is inside the table and that it really describes \a opcode.  The
+ * returned name is whatever the table holds for that slot.
+ */
+const char *ll_opcode2str(__u32 opcode)
+{
+	/* When one of the assertions below fail, chances are that:
+	 *     1) A new opcode was added in include/lustre/lustre_idl.h,
+	 *	but is missing from the table above.
+	 * or  2) The opcode space was renumbered or rearranged,
+	 *	and the opcode_offset() function in
+	 *	ptlrpc_internal.h needs to be modified.
+	 */
+	__u32 offset = opcode_offset(opcode);
+	LASSERTF(offset < LUSTRE_MAX_OPCODES,
+		 "offset %u >= LUSTRE_MAX_OPCODES %u\n",
+		 offset, LUSTRE_MAX_OPCODES);
+	LASSERTF(ll_rpc_opcode_table[offset].opcode == opcode,
+		 "ll_rpc_opcode_table[%u].opcode %u != opcode %u\n",
+		 offset, ll_rpc_opcode_table[offset].opcode, opcode);
+	return ll_rpc_opcode_table[offset].opname;
+}
+
+/*
+ * Return the printable name stored in ll_eopcode_table for the extra
+ * opcode \a opcode.
+ *
+ * The opcode is used directly as the table index, so it is range-checked
+ * before the table is touched (mirroring the check in ll_opcode2str());
+ * the entry is then asserted to describe \a opcode.
+ */
+const char *ll_eopcode2str(__u32 opcode)
+{
+	/* reject indices beyond the end of ll_eopcode_table */
+	LASSERT(opcode < EXTRA_LAST_OPC);
+	LASSERT(ll_eopcode_table[opcode].opcode == opcode);
+	return ll_eopcode_table[opcode].opname;
+}
+#ifdef LPROCFS
+/*
+ * Allocate and register the per-service RPC statistics.
+ *
+ * \param root          parent proc directory
+ * \param dir           optional sub-directory to create under \a root; when
+ *                      NULL the stats file is registered directly in \a root
+ * \param name          name of the stats file
+ * \param procroot_ret  receives the created sub-directory (only written
+ *                      when \a dir is given and registration succeeds);
+ *                      must point to NULL on entry
+ * \param stats_ret     receives the allocated stats on success; must point
+ *                      to NULL on entry
+ *
+ * Nothing is returned: on any failure everything allocated here is released
+ * and both output pointers are left untouched.
+ */
+void ptlrpc_lprocfs_register(struct proc_dir_entry *root, char *dir,
+			     char *name, struct proc_dir_entry **procroot_ret,
+			     struct lprocfs_stats **stats_ret)
+{
+	struct proc_dir_entry *svc_procroot;
+	struct lprocfs_stats *svc_stats;
+	int i, rc;
+	unsigned int svc_counter_config = LPROCFS_CNTR_AVGMINMAX |
+					  LPROCFS_CNTR_STDDEV;
+
+	LASSERT(*procroot_ret == NULL);
+	LASSERT(*stats_ret == NULL);
+
+	/* one counter slot per extra opcode plus one per RPC opcode */
+	svc_stats = lprocfs_alloc_stats(EXTRA_MAX_OPCODES+LUSTRE_MAX_OPCODES,0);
+	if (svc_stats == NULL)
+		return;
+
+	if (dir) {
+		svc_procroot = lprocfs_register(dir, root, NULL, NULL);
+		if (IS_ERR(svc_procroot)) {
+			lprocfs_free_stats(&svc_stats);
+			return;
+		}
+	} else {
+		svc_procroot = root;
+	}
+
+	/* fixed service-level counters */
+	lprocfs_counter_init(svc_stats, PTLRPC_REQWAIT_CNTR,
+			     svc_counter_config, "req_waittime", "usec");
+	lprocfs_counter_init(svc_stats, PTLRPC_REQQDEPTH_CNTR,
+			     svc_counter_config, "req_qdepth", "reqs");
+	lprocfs_counter_init(svc_stats, PTLRPC_REQACTIVE_CNTR,
+			     svc_counter_config, "req_active", "reqs");
+	lprocfs_counter_init(svc_stats, PTLRPC_TIMEOUT,
+			     svc_counter_config, "req_timeout", "sec");
+	lprocfs_counter_init(svc_stats, PTLRPC_REQBUF_AVAIL_CNTR,
+			     svc_counter_config, "reqbuf_avail", "bufs");
+	/* extra-opcode counters follow PTLRPC_LAST_CNTR */
+	for (i = 0; i < EXTRA_LAST_OPC; i++) {
+		char *units;
+
+		switch(i) {
+		case BRW_WRITE_BYTES:
+		case BRW_READ_BYTES:
+			units = "bytes";
+			break;
+		default:
+			units = "reqs";
+			break;
+		}
+		lprocfs_counter_init(svc_stats, PTLRPC_LAST_CNTR + i,
+				     svc_counter_config,
+				     ll_eopcode2str(i), units);
+	}
+	/* per-RPC-opcode counters start at index EXTRA_MAX_OPCODES */
+	for (i = 0; i < LUSTRE_MAX_OPCODES; i++) {
+		__u32 opcode = ll_rpc_opcode_table[i].opcode;
+		lprocfs_counter_init(svc_stats,
+				     EXTRA_MAX_OPCODES + i, svc_counter_config,
+				     ll_opcode2str(opcode), "usec");
+	}
+
+	rc = lprocfs_register_stats(svc_procroot, name, svc_stats);
+	if (rc < 0) {
+		/* only remove the directory if this function created it */
+		if (dir)
+			lprocfs_remove(&svc_procroot);
+		lprocfs_free_stats(&svc_stats);
+	} else {
+		if (dir)
+			*procroot_ret = svc_procroot;
+		*stats_ret = svc_stats;
+	}
+}
+
+/*
+ * Print the sum of scp_hist_nrqbds over all partitions of the service.
+ */
+static int
+ptlrpc_lprocfs_req_history_len_seq_show(struct seq_file *m, void *v)
+{
+	struct ptlrpc_service *svc = m->private;
+	struct ptlrpc_service_part *part;
+	int idx;
+	int sum = 0;
+
+	ptlrpc_service_for_each_part(part, idx, svc) {
+		sum += part->scp_hist_nrqbds;
+	}
+
+	return seq_printf(m, "%d\n", sum);
+}
+LPROC_SEQ_FOPS_RO(ptlrpc_lprocfs_req_history_len);
+
+/*
+ * Print the service-wide request history limit: the per-partition limit
+ * srv_hist_nrqbds_cpt_max added up once for every partition of the service.
+ */
+static int
+ptlrpc_lprocfs_req_history_max_seq_show(struct seq_file *m, void *n)
+{
+	struct ptlrpc_service *svc = m->private;
+	struct ptlrpc_service_part *part;
+	int idx;
+	int sum = 0;
+
+	ptlrpc_service_for_each_part(part, idx, svc) {
+		sum += svc->srv_hist_nrqbds_cpt_max;
+	}
+
+	return seq_printf(m, "%d\n", sum);
+}
+
+/*
+ * Set the service-wide request history limit.
+ *
+ * The written value is parsed by lprocfs_write_helper().  Negative values
+ * and values above num_physpages / (2 * pages per request buffer) are
+ * rejected with -ERANGE.  A value of 0 stores 0; any other value is divided
+ * by the partition count and stored per partition, with a minimum of 1.
+ *
+ * Returns \a count on success or a negative errno.
+ */
+static ssize_t
+ptlrpc_lprocfs_req_history_max_seq_write(struct file *file, const char *buffer,
+					 size_t count, loff_t *off)
+{
+	struct seq_file *m = file->private_data;
+	struct ptlrpc_service *svc = m->private;
+	int bufpages;
+	int val;
+	int rc;
+
+	rc = lprocfs_write_helper(buffer, count, &val);
+	if (rc < 0)
+		return rc;
+	if (val < 0)
+		return -ERANGE;
+
+	/* This sanity check is more of an insanity check; we can still
+	 * hose a kernel by allowing the request history to grow too
+	 * far. */
+	bufpages = (svc->srv_buf_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	if (val > num_physpages/(2 * bufpages))
+		return -ERANGE;
+
+	spin_lock(&svc->srv_lock);
+	svc->srv_hist_nrqbds_cpt_max =
+		(val == 0) ? 0 : max(1, (val / svc->srv_ncpts));
+	spin_unlock(&svc->srv_lock);
+
+	return count;
+}
+LPROC_SEQ_FOPS(ptlrpc_lprocfs_req_history_max);
+
+/*
+ * Print the service-wide initial thread count: the per-partition value
+ * srv_nthrs_cpt_init multiplied by the number of partitions.
+ */
+static int
+ptlrpc_lprocfs_threads_min_seq_show(struct seq_file *m, void *n)
+{
+	struct ptlrpc_service *svc = m->private;
+	int nthrs_min = svc->srv_nthrs_cpt_init * svc->srv_ncpts;
+
+	return seq_printf(m, "%d\n", nthrs_min);
+}
+
+/*
+ * Set the service-wide initial thread count.
+ *
+ * The value is rejected with -ERANGE when its per-partition share is below
+ * PTLRPC_NTHRS_INIT or when it exceeds the current service-wide limit
+ * (srv_nthrs_cpt_limit * srv_ncpts).  Otherwise the per-partition share is
+ * stored in srv_nthrs_cpt_init.
+ *
+ * Returns \a count on success or a negative errno.
+ */
+static ssize_t
+ptlrpc_lprocfs_threads_min_seq_write(struct file *file, const char *buffer,
+				     size_t count, loff_t *off)
+{
+	struct seq_file *m = file->private_data;
+	struct ptlrpc_service *svc = m->private;
+	ssize_t result;
+	int val;
+	int rc;
+
+	rc = lprocfs_write_helper(buffer, count, &val);
+	if (rc < 0)
+		return rc;
+
+	if (val / svc->srv_ncpts < PTLRPC_NTHRS_INIT)
+		return -ERANGE;
+
+	/* the upper bound is checked under the lock that protects it */
+	spin_lock(&svc->srv_lock);
+	if (val > svc->srv_nthrs_cpt_limit * svc->srv_ncpts) {
+		result = -ERANGE;
+	} else {
+		svc->srv_nthrs_cpt_init = val / svc->srv_ncpts;
+		result = count;
+	}
+	spin_unlock(&svc->srv_lock);
+
+	return result;
+}
+LPROC_SEQ_FOPS(ptlrpc_lprocfs_threads_min);
+
+/*
+ * Print the sum of scp_nthrs_running over all partitions of the service.
+ */
+static int
+ptlrpc_lprocfs_threads_started_seq_show(struct seq_file *m, void *n)
+{
+	struct ptlrpc_service *svc = m->private;
+	struct ptlrpc_service_part *part;
+	int idx;
+	int sum = 0;
+
+	ptlrpc_service_for_each_part(part, idx, svc) {
+		sum += part->scp_nthrs_running;
+	}
+
+	return seq_printf(m, "%d\n", sum);
+}
+LPROC_SEQ_FOPS_RO(ptlrpc_lprocfs_threads_started);
+
+/*
+ * Print the service-wide thread limit: the per-partition value
+ * srv_nthrs_cpt_limit multiplied by the number of partitions.
+ */
+static int
+ptlrpc_lprocfs_threads_max_seq_show(struct seq_file *m, void *n)
+{
+	struct ptlrpc_service *svc = m->private;
+	int nthrs_max = svc->srv_nthrs_cpt_limit * svc->srv_ncpts;
+
+	return seq_printf(m, "%d\n", nthrs_max);
+}
+
+/*
+ * Set the service-wide thread limit.
+ *
+ * The value is rejected with -ERANGE when its per-partition share is below
+ * PTLRPC_NTHRS_INIT or when it is lower than the current service-wide
+ * initial count (srv_nthrs_cpt_init * srv_ncpts).  Otherwise the
+ * per-partition share is stored in srv_nthrs_cpt_limit.
+ *
+ * Returns \a count on success or a negative errno.
+ */
+static ssize_t
+ptlrpc_lprocfs_threads_max_seq_write(struct file *file, const char *buffer,
+				     size_t count, loff_t *off)
+{
+	struct seq_file *m = file->private_data;
+	struct ptlrpc_service *svc = m->private;
+	ssize_t result;
+	int val;
+	int rc;
+
+	rc = lprocfs_write_helper(buffer, count, &val);
+	if (rc < 0)
+		return rc;
+
+	if (val / svc->srv_ncpts < PTLRPC_NTHRS_INIT)
+		return -ERANGE;
+
+	/* the lower bound is checked under the lock that protects it */
+	spin_lock(&svc->srv_lock);
+	if (val < svc->srv_nthrs_cpt_init * svc->srv_ncpts) {
+		result = -ERANGE;
+	} else {
+		svc->srv_nthrs_cpt_limit = val / svc->srv_ncpts;
+		result = count;
+	}
+	spin_unlock(&svc->srv_lock);
+
+	return result;
+}
+LPROC_SEQ_FOPS(ptlrpc_lprocfs_threads_max);
+
+/**
+ * \addtogroup nrs
+ * @{
+ */
+extern struct nrs_core nrs_core;
+
+/**
+ * Translates \e ptlrpc_nrs_pol_state values to human-readable strings.
+ *
+ * An unrecognized value triggers LBUG(); the default label is placed
+ * directly above the NRS_POL_STATE_INVALID case with no break in between.
+ *
+ * \param[in] state The policy state
+ *
+ * \retval a constant string naming \a state
+ */
+static const char *nrs_state2str(enum ptlrpc_nrs_pol_state state)
+{
+	switch (state) {
+	default:
+		LBUG();
+		/* no break: shares the return of the "invalid" case below */
+	case NRS_POL_STATE_INVALID:
+		return "invalid";
+	case NRS_POL_STATE_STOPPED:
+		return "stopped";
+	case NRS_POL_STATE_STOPPING:
+		return "stopping";
+	case NRS_POL_STATE_STARTING:
+		return "starting";
+	case NRS_POL_STATE_STARTED:
+		return "started";
+	}
+}
+
+/**
+ * Obtains status information for \a policy.
+ *
+ * Information is copied in \a info: the policy name, whether the policy has
+ * the PTLRPC_NRS_FL_FALLBACK flag set, its state, and its queued and started
+ * request counts.
+ *
+ * The caller must hold the nrs_lock of the NRS head the policy belongs to;
+ * this is asserted.
+ *
+ * \param[in] policy The policy
+ * \param[out] info  Holds returned status information
+ */
+void nrs_policy_get_info_locked(struct ptlrpc_nrs_policy *policy,
+				struct ptlrpc_nrs_pol_info *info)
+{
+	LASSERT(policy != NULL);
+	LASSERT(info != NULL);
+	LASSERT(spin_is_locked(&policy->pol_nrs->nrs_lock));
+
+	/* a fixed NRS_POL_NAME_MAX bytes are copied, not just the string */
+	memcpy(info->pi_name, policy->pol_desc->pd_name, NRS_POL_NAME_MAX);
+
+	/* !! normalizes the flag test to 0 or 1 */
+	info->pi_fallback    = !!(policy->pol_flags & PTLRPC_NRS_FL_FALLBACK);
+	info->pi_state	     = policy->pol_state;
+	/**
+	 * XXX: These are accessed without holding
+	 * ptlrpc_service_part::scp_req_lock.
+	 */
+	info->pi_req_queued  = policy->pol_req_queued;
+	info->pi_req_started = policy->pol_req_started;
+}
+
+/**
+ * Reads and prints policy status information for all policies of a PTLRPC
+ * service.
+ *
+ * One pass is made over the regular NRS heads of all service partitions and,
+ * if the service has high-priority heads, a second pass over those.  Within
+ * a pass the name, state and fallback flag are taken from the first
+ * partition, while the queued and started request counts are summed over all
+ * partitions.
+ *
+ * \retval 0	   success
+ * \retval -ENOMEM the temporary info array could not be allocated
+ */
+static int ptlrpc_lprocfs_nrs_seq_show(struct seq_file *m, void *n)
+{
+	struct ptlrpc_service	       *svc = m->private;
+	struct ptlrpc_service_part     *svcpt;
+	struct ptlrpc_nrs	       *nrs;
+	struct ptlrpc_nrs_policy       *policy;
+	struct ptlrpc_nrs_pol_info     *infos;
+	struct ptlrpc_nrs_pol_info	tmp;
+	unsigned			num_pols;
+	unsigned			pol_idx = 0;
+	bool				hp = false;
+	int				i;
+	int				rc = 0;
+	ENTRY;
+
+	/**
+	 * Serialize NRS core lprocfs operations with policy registration/
+	 * unregistration.
+	 */
+	mutex_lock(&nrs_core.nrs_mutex);
+
+	/**
+	 * Use the first service partition's regular NRS head in order to obtain
+	 * the number of policies registered with NRS heads of this service. All
+	 * service partitions will have the same number of policies.
+	 */
+	nrs = nrs_svcpt2nrs(svc->srv_parts[0], false);
+
+	spin_lock(&nrs->nrs_lock);
+	num_pols = svc->srv_parts[0]->scp_nrs_reg.nrs_num_pols;
+	spin_unlock(&nrs->nrs_lock);
+
+	/**
+	 * The accumulation below uses += on the counters, so it relies on
+	 * the array starting out zeroed -- presumably OBD_ALLOC() returns
+	 * zeroed memory; TODO confirm.
+	 */
+	OBD_ALLOC(infos, num_pols * sizeof(*infos));
+	if (infos == NULL)
+		GOTO(out, rc = -ENOMEM);
+again:
+
+	ptlrpc_service_for_each_part(svcpt, i, svc) {
+		nrs = nrs_svcpt2nrs(svcpt, hp);
+		spin_lock(&nrs->nrs_lock);
+
+		pol_idx = 0;
+
+		list_for_each_entry(policy, &nrs->nrs_policy_list,
+					pol_list) {
+			LASSERT(pol_idx < num_pols);
+
+			nrs_policy_get_info_locked(policy, &tmp);
+			/**
+			 * Copy values when handling the first service
+			 * partition.
+			 */
+			if (i == 0) {
+				memcpy(infos[pol_idx].pi_name, tmp.pi_name,
+				       NRS_POL_NAME_MAX);
+				memcpy(&infos[pol_idx].pi_state, &tmp.pi_state,
+				       sizeof(tmp.pi_state));
+				infos[pol_idx].pi_fallback = tmp.pi_fallback;
+				/**
+				 * For the rest of the service partitions
+				 * sanity-check the values we get.
+				 */
+			} else {
+				LASSERT(strncmp(infos[pol_idx].pi_name,
+						tmp.pi_name,
+						NRS_POL_NAME_MAX) == 0);
+				/**
+				 * Not asserting ptlrpc_nrs_pol_info::pi_state,
+				 * because it may be different between
+				 * instances of the same policy in different
+				 * service partitions.
+				 */
+				LASSERT(infos[pol_idx].pi_fallback ==
+					tmp.pi_fallback);
+			}
+
+			/** Request counts are summed over all partitions. */
+			infos[pol_idx].pi_req_queued += tmp.pi_req_queued;
+			infos[pol_idx].pi_req_started += tmp.pi_req_started;
+
+			pol_idx++;
+		}
+		spin_unlock(&nrs->nrs_lock);
+	}
+
+	/**
+	 * Policy status information output is in YAML format.
+	 * For example:
+	 *
+	 *	regular_requests:
+	 *	  - name: fifo
+	 *	    state: started
+	 *	    fallback: yes
+	 *	    queued: 0
+	 *	    active: 0
+	 *
+	 *	  - name: crrn
+	 *	    state: started
+	 *	    fallback: no
+	 *	    queued: 2015
+	 *	    active: 384
+	 *
+	 *	high_priority_requests:
+	 *	  - name: fifo
+	 *	    state: started
+	 *	    fallback: yes
+	 *	    queued: 0
+	 *	    active: 2
+	 *
+	 *	  - name: crrn
+	 *	    state: stopped
+	 *	    fallback: no
+	 *	    queued: 0
+	 *	    active: 0
+	 */
+	seq_printf(m, "%s\n",
+		      !hp ?  "\nregular_requests:" : "high_priority_requests:");
+
+	for (pol_idx = 0; pol_idx < num_pols; pol_idx++) {
+		seq_printf(m,  "  - name: %s\n"
+			       "    state: %s\n"
+			       "    fallback: %s\n"
+			       "    queued: %-20d\n"
+			       "    active: %-20d\n\n",
+			       infos[pol_idx].pi_name,
+			       nrs_state2str(infos[pol_idx].pi_state),
+			       infos[pol_idx].pi_fallback ? "yes" : "no",
+			       (int)infos[pol_idx].pi_req_queued,
+			       (int)infos[pol_idx].pi_req_started);
+	}
+
+	if (!hp && nrs_svc_has_hp(svc)) {
+		/** Reset the accumulators before the second pass. */
+		memset(infos, 0, num_pols * sizeof(*infos));
+
+		/**
+		 * Redo the processing for the service's HP NRS heads' policies.
+		 */
+		hp = true;
+		goto again;
+	}
+
+out:
+	if (infos)
+		OBD_FREE(infos, num_pols * sizeof(*infos));
+
+	mutex_unlock(&nrs_core.nrs_mutex);
+
+	RETURN(rc);
+}
+
+/**
+ * The longest valid command string is the maximum policy name size, plus the
+ * length of the " reg" substring
+ */
+#define LPROCFS_NRS_WR_MAX_CMD	(NRS_POL_NAME_MAX + sizeof(" reg") - 1)
+
+/**
+ * Issues a PTLRPC_NRS_CTL_START control operation for a given policy on a
+ * PTLRPC service.
+ *
+ * Commands consist of the policy name, followed by an optional [reg|hp] token;
+ * if the optional token is omitted, the operation is performed on both the
+ * regular and high-priority (if the service has one) NRS head.
+ *
+ * The name and the token must be separated by exactly one space, and the
+ * token is compared as written, so any trailing characters in the input
+ * (for example a newline) make it fail to match.
+ *
+ * \retval \a count the command was accepted
+ * \retval -EINVAL  the command is too long, the policy name is too long, or
+ *		    the second token is neither "reg" nor "hp"
+ * \retval -ENOMEM  the command buffer could not be allocated
+ * \retval -EFAULT  the command could not be copied from user space
+ * \retval -ENODEV  "hp" was requested but the service has no HP NRS head
+ * \retval -ve	    any error returned by ptlrpc_nrs_policy_control()
+ */
+static ssize_t ptlrpc_lprocfs_nrs_seq_write(struct file *file, const char *buffer,
+					size_t count, loff_t *off)
+{
+	struct ptlrpc_service *svc = ((struct seq_file *)file->private_data)->private;
+	enum ptlrpc_nrs_queue_type	queue = PTLRPC_NRS_QUEUE_BOTH;
+	char			       *cmd;
+	char			       *cmd_copy = NULL;
+	char			       *token;
+	int				rc = 0;
+	ENTRY;
+
+	/** Leaves room for the terminating NUL written below. */
+	if (count >= LPROCFS_NRS_WR_MAX_CMD)
+		GOTO(out, rc = -EINVAL);
+
+	OBD_ALLOC(cmd, LPROCFS_NRS_WR_MAX_CMD);
+	if (cmd == NULL)
+		GOTO(out, rc = -ENOMEM);
+	/**
+	 * strsep() modifies its argument, so keep a copy
+	 */
+	cmd_copy = cmd;
+
+	if (copy_from_user(cmd, buffer, count))
+		GOTO(out, rc = -EFAULT);
+
+	cmd[count] = '\0';
+
+	/** After this call, cmd points past the first space, or is NULL. */
+	token = strsep(&cmd, " ");
+
+	if (strlen(token) > NRS_POL_NAME_MAX - 1)
+		GOTO(out, rc = -EINVAL);
+
+	/**
+	 * No [reg|hp] token has been specified
+	 */
+	if (cmd == NULL)
+		goto default_queue;
+
+	/**
+	 * The second token is either NULL, or an optional [reg|hp] string
+	 */
+	if (strcmp(cmd, "reg") == 0)
+		queue = PTLRPC_NRS_QUEUE_REG;
+	else if (strcmp(cmd, "hp") == 0)
+		queue = PTLRPC_NRS_QUEUE_HP;
+	else
+		GOTO(out, rc = -EINVAL);
+
+default_queue:
+
+	/** Without an HP head, "both" degrades to the regular head only. */
+	if (queue == PTLRPC_NRS_QUEUE_HP && !nrs_svc_has_hp(svc))
+		GOTO(out, rc = -ENODEV);
+	else if (queue == PTLRPC_NRS_QUEUE_BOTH && !nrs_svc_has_hp(svc))
+		queue = PTLRPC_NRS_QUEUE_REG;
+
+	/**
+	 * Serialize NRS core lprocfs operations with policy registration/
+	 * unregistration.
+	 */
+	mutex_lock(&nrs_core.nrs_mutex);
+
+	rc = ptlrpc_nrs_policy_control(svc, queue, token, PTLRPC_NRS_CTL_START,
+				       false, NULL);
+
+	mutex_unlock(&nrs_core.nrs_mutex);
+out:
+	/** Free through the saved pointer; strsep() has moved cmd. */
+	if (cmd_copy)
+		OBD_FREE(cmd_copy, LPROCFS_NRS_WR_MAX_CMD);
+
+	RETURN(rc < 0 ? rc : count);
+}
+LPROC_SEQ_FOPS(ptlrpc_lprocfs_nrs);
+
+/** @} nrs */
+
+/*
+ * Cursor used by the req_history seq_file callbacks to remember where the
+ * walk over a service's request history currently stands.
+ */
+struct ptlrpc_srh_iterator {
+	/* index of the service partition the cursor is positioned on */
+	int			srhi_idx;
+	/* history sequence of the request the cursor is positioned on */
+	__u64			srhi_seq;
+	/* request found by the last successful seek, NULL if none yet */
+	struct ptlrpc_request	*srhi_req;
+};
+
+/*
+ * Position \a srhi on the first request in \a svcpt's history list whose
+ * history sequence is greater than or equal to \a seq.
+ *
+ * When the iterator already references a request that has not been culled
+ * and whose sequence is not beyond \a seq, the scan resumes from that
+ * request; otherwise it restarts from the head of the list. The callers in
+ * this file invoke it with svcpt->scp_lock held.
+ *
+ * Returns 0 and updates srhi_seq/srhi_req when a request is found, or
+ * -ENOENT when the end of the list is reached.
+ */
+int
+ptlrpc_lprocfs_svc_req_history_seek(struct ptlrpc_service_part *svcpt,
+				    struct ptlrpc_srh_iterator *srhi,
+				    __u64 seq)
+{
+	struct list_head	*head = &svcpt->scp_hist_reqs;
+	struct list_head	*pos;
+	struct ptlrpc_request	*cur;
+
+	if (srhi->srhi_req == NULL ||
+	    srhi->srhi_seq <= svcpt->scp_hist_seq_culled ||
+	    srhi->srhi_seq > seq) {
+		/* no usable cursor: search from start */
+		pos = head->next;
+	} else {
+		/* The cached request is still in the history and is not
+		 * past the wanted seq, so continue from it. The history is
+		 * LRU (culled reqs are near the head), so re-scans should
+		 * stay short. */
+		LASSERTF(srhi->srhi_seq == srhi->srhi_req->rq_history_seq,
+			 "%s:%d: seek seq "LPU64", request seq "LPU64"\n",
+			 svcpt->scp_service->srv_name, svcpt->scp_cpt,
+			 srhi->srhi_seq, srhi->srhi_req->rq_history_seq);
+		LASSERTF(!list_empty(&svcpt->scp_hist_reqs),
+			 "%s:%d: seek offset "LPU64", request seq "LPU64", "
+			 "last culled "LPU64"\n",
+			 svcpt->scp_service->srv_name, svcpt->scp_cpt,
+			 seq, srhi->srhi_seq, svcpt->scp_hist_seq_culled);
+		pos = &srhi->srhi_req->rq_history_list;
+	}
+
+	for (; pos != head; pos = pos->next) {
+		cur = list_entry(pos, struct ptlrpc_request, rq_history_list);
+
+		if (cur->rq_history_seq < seq)
+			continue;
+
+		srhi->srhi_seq = cur->rq_history_seq;
+		srhi->srhi_req = cur;
+		return 0;
+	}
+
+	return -ENOENT;
+}
+
+/*
+ * The ptlrpc history sequence is used as the "position" of the seq_file. In
+ * some cases seq_read() will increase the "position" to indicate reading the
+ * next element; however, the low bits of the history sequence are reserved
+ * for the CPT id (see the comments before ptlrpc_req_add_history), which
+ * means seq_read() might change the CPT id of the history sequence and never
+ * finish reading the requests on a CPT. To make it work, we have to shift
+ * the CPT id to the high bits and the timestamp to the low bits, so that
+ * seq_read() will only increase the timestamp, which correctly indicates
+ * the next position.
+ */
+
+/*
+ * Convert a seq_file position to a CPT index: the CPT id is taken from the
+ * top srv_cpt_bits bits of \a pos. Evaluates to 0 when the service has no
+ * CPT bits.
+ */
+#define PTLRPC_REQ_POS2CPT(svc, pos)			\
+	((svc)->srv_cpt_bits == 0 ? 0 :			\
+	 (__u64)(pos) >> (64 - (svc)->srv_cpt_bits))
+
+/*
+ * Make up the lowest seq_file position that belongs to \a cpt: the CPT id
+ * placed in the top srv_cpt_bits bits with all lower bits zero. Evaluates
+ * to 0 when the service has no CPT bits.
+ *
+ * \a cpt is widened to 64 bits before shifting: the caller passes an int
+ * and the shift count (64 - srv_cpt_bits) can exceed the width of an int,
+ * which would be undefined behaviour.
+ */
+#define PTLRPC_REQ_CPT2POS(svc, cpt)			\
+	((svc)->srv_cpt_bits == 0 ? 0 :			\
+	 (__u64)(cpt) << (64 - (svc)->srv_cpt_bits))
+
+/*
+ * Convert a history sequence to a seq_file position by rotating \a seq
+ * right by srv_cpt_bits, which moves the low bits (CPT id) to the top.
+ * \a seq is used unchanged when the service has no CPT bits.
+ */
+#define PTLRPC_REQ_SEQ2POS(svc, seq)			\
+	((svc)->srv_cpt_bits == 0 ? (seq) :		\
+	 ((seq) >> (svc)->srv_cpt_bits) |		\
+	 ((seq) << (64 - (svc)->srv_cpt_bits)))
+
+/*
+ * Convert a seq_file position back to a history sequence by rotating
+ * \a pos left by srv_cpt_bits (the inverse of PTLRPC_REQ_SEQ2POS).
+ * \a pos is used unchanged when the service has no CPT bits.
+ */
+#define PTLRPC_REQ_POS2SEQ(svc, pos)			\
+	((svc)->srv_cpt_bits == 0 ? (pos) :		\
+	 ((__u64)(pos) << (svc)->srv_cpt_bits) |	\
+	 ((__u64)(pos) >> (64 - (svc)->srv_cpt_bits)))
+
+/*
+ * seq_file ->start() for req_history.
+ *
+ * Allocates an iterator and positions it on the first history request at or
+ * after \a *pos, trying the partition encoded in \a *pos first and then each
+ * following partition from its lowest position. On success \a *pos is
+ * rewritten to the position of the request found.
+ *
+ * Returns the iterator, or NULL when loff_t is not 64 bits wide, when the
+ * allocation fails, or when no request is found.
+ */
+static void *
+ptlrpc_lprocfs_svc_req_history_start(struct seq_file *s, loff_t *pos)
+{
+	struct ptlrpc_service		*svc = s->private;
+	struct ptlrpc_service_part	*part;
+	struct ptlrpc_srh_iterator	*cursor;
+	unsigned int			first_cpt;
+	int				found;
+	int				idx;
+
+	if (sizeof(loff_t) != sizeof(__u64)) { /* can't support */
+		CWARN("Failed to read request history because size of loff_t "
+		      "%d can't match size of u64\n", (int)sizeof(loff_t));
+		return NULL;
+	}
+
+	OBD_ALLOC(cursor, sizeof(*cursor));
+	if (cursor == NULL)
+		return NULL;
+
+	cursor->srhi_seq = 0;
+	cursor->srhi_req = NULL;
+
+	first_cpt = PTLRPC_REQ_POS2CPT(svc, *pos);
+
+	ptlrpc_service_for_each_part(part, idx, svc) {
+		/* partitions before the requested one were already read */
+		if (idx < first_cpt)
+			continue;
+
+		/* later partitions are read from their lowest position */
+		if (idx > first_cpt)
+			*pos = PTLRPC_REQ_CPT2POS(svc, idx);
+
+		spin_lock(&part->scp_lock);
+		found = ptlrpc_lprocfs_svc_req_history_seek(part, cursor,
+				PTLRPC_REQ_POS2SEQ(svc, *pos));
+		spin_unlock(&part->scp_lock);
+		if (found != 0)
+			continue;
+
+		*pos = PTLRPC_REQ_SEQ2POS(svc, cursor->srhi_seq);
+		cursor->srhi_idx = idx;
+		return cursor;
+	}
+
+	OBD_FREE(cursor, sizeof(*cursor));
+	return NULL;
+}
+
+/*
+ * seq_file ->stop() for req_history: release the iterator, if one is passed.
+ */
+static void
+ptlrpc_lprocfs_svc_req_history_stop(struct seq_file *s, void *iter)
+{
+	struct ptlrpc_srh_iterator *cursor = iter;
+
+	if (cursor == NULL)
+		return;
+
+	OBD_FREE(cursor, sizeof(*cursor));
+}
+
+/*
+ * seq_file ->next() for req_history.
+ *
+ * Advances the iterator to the next history request: first the next sequence
+ * on the current partition, then the first request of each following
+ * partition. On success \a *pos is set to the position of the request found
+ * and the iterator is returned. When the history is exhausted the iterator
+ * is freed here and NULL is returned.
+ */
+static void *
+ptlrpc_lprocfs_svc_req_history_next(struct seq_file *s,
+				    void *iter, loff_t *pos)
+{
+	struct ptlrpc_service		*svc = s->private;
+	struct ptlrpc_srh_iterator	*cursor = iter;
+	struct ptlrpc_service_part	*part;
+	__u64				want;
+	int				rc;
+	int				cpt = cursor->srhi_idx;
+
+	while (cpt < svc->srv_ncpts) {
+		part = svc->srv_parts[cpt];
+
+		if (cpt == cursor->srhi_idx) {
+			/* same CPT: sequences advance in steps of one
+			 * above the CPT bits */
+			want = cursor->srhi_seq + (1 << svc->srv_cpt_bits);
+		} else {
+			/* moved on to a new CPT: restart the iterator */
+			cursor->srhi_req = NULL;
+			cursor->srhi_seq = 0;
+			want = 0;
+		}
+
+		spin_lock(&part->scp_lock);
+		rc = ptlrpc_lprocfs_svc_req_history_seek(part, cursor, want);
+		spin_unlock(&part->scp_lock);
+		if (rc == 0) {
+			*pos = PTLRPC_REQ_SEQ2POS(svc, cursor->srhi_seq);
+			cursor->srhi_idx = cpt;
+			return cursor;
+		}
+
+		cpt++;
+	}
+
+	OBD_FREE(cursor, sizeof(*cursor));
+	return NULL;
+}
+
+/*
+ * Common ost/mdt so_req_printer.
+ *
+ * Appends the request-specific tail of a req_history line to \a seq_file
+ * (a struct seq_file passed as void *): "<not swabbed>" for a request still
+ * in RQ_PHASE_NEW, or "opc <n>" for a request in the INTERPRET or COMPLETE
+ * phase. Any other phase is reported through DEBUG_REQ and nothing is
+ * written to the seq_file.
+ */
+void target_print_req(void *seq_file, struct ptlrpc_request *req)
+{
+	/* Called holding srv_lock with irqs disabled.
+	 * Print specific req contents and a newline.
+	 * CAVEAT EMPTOR: check request message length before printing!!!
+	 * You might have received any old crap so you must be just as
+	 * careful here as the service's request parser!!! */
+	struct seq_file *sf = seq_file;
+
+	switch (req->rq_phase) {
+	case RQ_PHASE_NEW:
+		/* still awaiting a service thread's attention, or rejected
+		 * because the generic request message didn't unpack */
+		seq_printf(sf, "<not swabbed>\n");
+		break;
+	case RQ_PHASE_INTERPRET:
+		/* being handled, so basic msg swabbed, and opc is valid
+		 * but racing with mds_handle() */
+		/* deliberate fall through: printed the same as COMPLETE */
+	case RQ_PHASE_COMPLETE:
+		/* been handled by mds_handle() reply state possibly still
+		 * volatile */
+		seq_printf(sf, "opc %d\n", lustre_msg_get_opc(req->rq_reqmsg));
+		break;
+	default:
+		DEBUG_REQ(D_ERROR, req, "bad phase %d", req->rq_phase);
+	}
+}
+EXPORT_SYMBOL(target_print_req);
+
+/*
+ * seq_file ->show() for req_history.
+ *
+ * Re-seeks the iterator under the partition lock and, when the request is
+ * still in the history, prints its common fields followed by either the
+ * service's so_req_printer output or a bare newline.
+ *
+ * Returns the result of the seek: 0 when a line was printed, otherwise the
+ * seek error.
+ */
+static int ptlrpc_lprocfs_svc_req_history_show(struct seq_file *s, void *iter)
+{
+	struct ptlrpc_service		*svc = s->private;
+	struct ptlrpc_srh_iterator	*srhi = iter;
+	struct ptlrpc_service_part	*part;
+	struct ptlrpc_request		*rq;
+	int				rc;
+
+	LASSERT(srhi->srhi_idx < svc->srv_ncpts);
+
+	part = svc->srv_parts[srhi->srhi_idx];
+
+	spin_lock(&part->scp_lock);
+
+	rc = ptlrpc_lprocfs_svc_req_history_seek(part, srhi, srhi->srhi_seq);
+	if (rc != 0)
+		goto unlock;
+
+	rq = srhi->srhi_req;
+
+	/* Print common req fields.
+	 * CAVEAT EMPTOR: this races with the service handler, so the
+	 * request may hold arbitrary data. Only fields that were set up
+	 * in request_in_callback() are printed here. */
+	seq_printf(s, LPD64":%s:%s:x"LPU64":%d:%s:%ld:%lds(%+lds) ",
+		   rq->rq_history_seq, libcfs_nid2str(rq->rq_self),
+		   libcfs_id2str(rq->rq_peer), rq->rq_xid,
+		   rq->rq_reqlen, ptlrpc_rqphase2str(rq),
+		   rq->rq_arrival_time.tv_sec,
+		   rq->rq_sent - rq->rq_arrival_time.tv_sec,
+		   rq->rq_sent - rq->rq_deadline);
+
+	if (svc->srv_ops.so_req_printer != NULL)
+		svc->srv_ops.so_req_printer(s, rq);
+	else
+		seq_printf(s, "\n");
+
+unlock:
+	spin_unlock(&part->scp_lock);
+	return rc;
+}
+
+/*
+ * open() handler of the req_history proc file: set up the seq_file with the
+ * request-history iterator callbacks and hand it the proc entry's data.
+ *
+ * Returns 0 on success or the error returned by seq_open().
+ */
+static int
+ptlrpc_lprocfs_svc_req_history_open(struct inode *inode, struct file *file)
+{
+	static struct seq_operations sops = {
+		.start = ptlrpc_lprocfs_svc_req_history_start,
+		.stop  = ptlrpc_lprocfs_svc_req_history_stop,
+		.next  = ptlrpc_lprocfs_svc_req_history_next,
+		.show  = ptlrpc_lprocfs_svc_req_history_show,
+	};
+	struct seq_file	*m;
+	int		 err;
+
+	err = seq_open(file, &sops);
+	if (err != 0)
+		return err;
+
+	/* seq_open() stored the new seq_file in file->private_data */
+	m = file->private_data;
+	m->private = PDE_DATA(inode);
+	return 0;
+}
+
+/*
+ * Show the adaptive-timeout state of a service: one line per service
+ * partition with the current estimate, the worst estimate ever seen, when
+ * it was seen, and the estimate history. When adaptive timeouts are off a
+ * single line reporting obd_timeout is printed instead.
+ *
+ * See also lprocfs_rd_timeouts.
+ */
+static int ptlrpc_lprocfs_timeouts_seq_show(struct seq_file *m, void *n)
+{
+	struct ptlrpc_service		*svc = m->private;
+	struct ptlrpc_service_part	*part;
+	int				idx;
+
+	if (AT_OFF) {
+		seq_printf(m, "adaptive timeouts off, using obd_timeout %u\n",
+			       obd_timeout);
+		return 0;
+	}
+
+	ptlrpc_service_for_each_part(part, idx, svc) {
+		struct dhms	age;
+		unsigned int	estimate;
+		unsigned int	worst_ever;
+		time_t		worst_when;
+
+		estimate   = at_get(&part->scp_at_estimate);
+		worst_ever = part->scp_at_estimate.at_worst_ever;
+		worst_when = part->scp_at_estimate.at_worst_time;
+
+		/* how long ago the worst estimate was recorded */
+		s2dhms(&age, cfs_time_current_sec() - worst_when);
+
+		seq_printf(m, "%10s : cur %3u  worst %3u (at %ld, "
+			      DHMS_FMT" ago) ", "service",
+			      estimate, worst_ever, worst_when,
+			      DHMS_VARS(&age));
+
+		lprocfs_at_hist_helper(m, &part->scp_at_estimate);
+	}
+
+	return 0;
+}
+LPROC_SEQ_FOPS_RO(ptlrpc_lprocfs_timeouts);
+
+/*
+ * Show the service's high-priority request ratio (srv_hpreq_ratio).
+ * Returns whatever seq_printf() returns.
+ */
+static int ptlrpc_lprocfs_hp_ratio_seq_show(struct seq_file *m, void *v)
+{
+	struct ptlrpc_service	*svc = m->private;
+	int			 rc;
+
+	rc = seq_printf(m, "%d", svc->srv_hpreq_ratio);
+
+	return rc;
+}
+
+/*
+ * Set the service's high-priority request ratio from a user-supplied
+ * integer.
+ *
+ * Returns \a count on success, the error from lprocfs_write_helper() when
+ * the value cannot be parsed, or -ERANGE for a negative value.
+ */
+static ssize_t ptlrpc_lprocfs_hp_ratio_seq_write(struct file *file,
+					     const char *buffer,
+					     size_t count,
+					     loff_t *off)
+{
+	struct seq_file		*m = file->private_data;
+	struct ptlrpc_service	*svc = m->private;
+	int			 ratio;
+	int			 rc;
+
+	rc = lprocfs_write_helper(buffer, count, &ratio);
+	if (rc < 0)
+		return rc;
+
+	if (ratio < 0)
+		return -ERANGE;
+
+	/* the ratio is updated under the service lock */
+	spin_lock(&svc->srv_lock);
+	svc->srv_hpreq_ratio = ratio;
+	spin_unlock(&svc->srv_lock);
+
+	return count;
+}
+LPROC_SEQ_FOPS(ptlrpc_lprocfs_hp_ratio);
+
+/*
+ * Create the proc entries of a PTLRPC service under \a entry: the "stats"
+ * file (through ptlrpc_lprocfs_register()), the tunables listed in
+ * lproc_vars, and the read-only "req_history" seq_file.
+ *
+ * Nothing more is registered when the service proc root could not be set
+ * up; a failure to add "req_history" is only warned about.
+ */
+void ptlrpc_lprocfs_register_service(struct proc_dir_entry *entry,
+				     struct ptlrpc_service *svc)
+{
+	static struct file_operations req_history_fops = {
+		.owner		= THIS_MODULE,
+		.open		= ptlrpc_lprocfs_svc_req_history_open,
+		.read		= seq_read,
+		.llseek		= seq_lseek,
+		.release	= lprocfs_seq_release,
+	};
+	/* every tunable gets the service itself as its private data */
+	struct lprocfs_vars lproc_vars[] = {
+		{ .name	= "high_priority_ratio",
+		  .fops	= &ptlrpc_lprocfs_hp_ratio_fops,
+		  .data	= svc },
+		{ .name	= "req_buffer_history_len",
+		  .fops	= &ptlrpc_lprocfs_req_history_len_fops,
+		  .data	= svc },
+		{ .name	= "req_buffer_history_max",
+		  .fops	= &ptlrpc_lprocfs_req_history_max_fops,
+		  .data	= svc },
+		{ .name	= "threads_min",
+		  .fops	= &ptlrpc_lprocfs_threads_min_fops,
+		  .data	= svc },
+		{ .name	= "threads_max",
+		  .fops	= &ptlrpc_lprocfs_threads_max_fops,
+		  .data	= svc },
+		{ .name	= "threads_started",
+		  .fops	= &ptlrpc_lprocfs_threads_started_fops,
+		  .data	= svc },
+		{ .name	= "timeouts",
+		  .fops	= &ptlrpc_lprocfs_timeouts_fops,
+		  .data	= svc },
+		{ .name	= "nrs_policies",
+		  .fops	= &ptlrpc_lprocfs_nrs_fops,
+		  .data	= svc },
+		{ NULL }
+	};
+
+	ptlrpc_lprocfs_register(entry, svc->srv_name, "stats",
+				&svc->srv_procroot, &svc->srv_stats);
+	if (svc->srv_procroot == NULL)
+		return;
+
+	lprocfs_add_vars(svc->srv_procroot, lproc_vars, NULL);
+
+	if (lprocfs_seq_create(svc->srv_procroot, "req_history",
+			       0400, &req_history_fops, svc) != 0)
+		CWARN("Error adding the req_history file\n");
+}
+
+/*
+ * Register the "stats" entry of an obd device under its proc entry.
+ * No directory name is passed (NULL); the resulting proc root and stats are
+ * stored in obd_svc_procroot and obd_svc_stats.
+ */
+void ptlrpc_lprocfs_register_obd(struct obd_device *obddev)
+{
+	ptlrpc_lprocfs_register(obddev->obd_proc_entry, NULL, "stats",
+				&obddev->obd_svc_procroot,
+				&obddev->obd_svc_stats);
+}
+EXPORT_SYMBOL(ptlrpc_lprocfs_register_obd);
+
+/*
+ * Account \a amount against the per-opcode counter of the request's import
+ * obd. Nothing is recorded when the obd has no service stats, when the
+ * opcode maps to a non-positive offset, or for LDLM_ENQUEUE and MDS_REINT
+ * requests.
+ */
+void ptlrpc_lprocfs_rpc_sent(struct ptlrpc_request *req, long amount)
+{
+	__u32 op = lustre_msg_get_opc(req->rq_reqmsg);
+	int opc = opcode_offset(op);
+	struct lprocfs_stats *stats = req->rq_import->imp_obd->obd_svc_stats;
+
+	if (stats == NULL || opc <= 0)
+		return;
+
+	LASSERT(opc < LUSTRE_MAX_OPCODES);
+
+	if (op == LDLM_ENQUEUE || op == MDS_REINT)
+		return;
+
+	lprocfs_counter_add(stats, opc + EXTRA_MAX_OPCODES, amount);
+}
+
+/*
+ * Add \a bytes to the bulk read or write byte counter of the request's
+ * import obd, chosen by the request opcode (OST_READ or OST_WRITE). Any
+ * other opcode trips an assertion. Requests without an import, or whose obd
+ * has no service stats, are ignored.
+ */
+void ptlrpc_lprocfs_brw(struct ptlrpc_request *req, int bytes)
+{
+	struct lprocfs_stats *stats;
+	int idx;
+
+	if (req->rq_import == NULL)
+		return;
+
+	stats = req->rq_import->imp_obd->obd_svc_stats;
+	if (stats == NULL)
+		return;
+
+	/* idx starts out as the opcode and becomes the counter index */
+	idx = lustre_msg_get_opc(req->rq_reqmsg);
+	if (idx == OST_READ) {
+		idx = BRW_READ_BYTES + PTLRPC_LAST_CNTR;
+	} else if (idx == OST_WRITE) {
+		idx = BRW_WRITE_BYTES + PTLRPC_LAST_CNTR;
+	} else {
+		LASSERTF(0, "unsupported opcode %u\n", idx);
+	}
+
+	lprocfs_counter_add(stats, idx, bytes);
+}
+
+EXPORT_SYMBOL(ptlrpc_lprocfs_brw);
+
+/*
+ * Tear down what ptlrpc_lprocfs_register_service() set up: remove the
+ * service's proc root and free its stats, each only if present.
+ */
+void ptlrpc_lprocfs_unregister_service(struct ptlrpc_service *svc)
+{
+	if (svc->srv_procroot)
+		lprocfs_remove(&svc->srv_procroot);
+
+	if (svc->srv_stats != NULL)
+		lprocfs_free_stats(&svc->srv_stats);
+}
+
+/*
+ * Tear down what ptlrpc_lprocfs_register_obd() set up: remove the obd's
+ * service proc root and free its service stats, each only if present.
+ */
+void ptlrpc_lprocfs_unregister_obd(struct obd_device *obd)
+{
+	if (obd->obd_svc_procroot != NULL)
+		lprocfs_remove(&obd->obd_svc_procroot);
+
+	if (obd->obd_svc_stats != NULL)
+		lprocfs_free_stats(&obd->obd_svc_stats);
+}
+EXPORT_SYMBOL(ptlrpc_lprocfs_unregister_obd);
+
+
+#define BUFLEN (UUID_MAX + 5)
+
+/*
+ * lprocfs write handler that evicts a client export.
+ *
+ * The first string in the user buffer selects the export: "nid:<nid>"
+ * evicts by NID, while "uuid:<uuid>" or a bare "<uuid>" evicts by UUID.
+ *
+ * Returns \a count on success, -ENOMEM when the kernel buffer cannot be
+ * allocated, or -EFAULT when the user buffer cannot be read.
+ */
+int lprocfs_wr_evict_client(struct file *file, const char *buffer,
+			    size_t count, loff_t *off)
+{
+	struct seq_file	  *m = file->private_data;
+	struct obd_device *obd = m->private;
+	unsigned long	   len = min_t(unsigned long, BUFLEN - 1, count);
+	char		  *kbuf;
+	char		  *target;
+
+	OBD_ALLOC(kbuf, BUFLEN);
+	if (kbuf == NULL)
+		return -ENOMEM;
+
+	/*
+	 * OBD_ALLOC() zeroes kbuf and at most BUFLEN - 1 bytes are copied
+	 * in, so the string is always NUL-terminated. UUID_MAX should
+	 * include a trailing NUL already.
+	 */
+	if (copy_from_user(kbuf, buffer, len)) {
+		count = -EFAULT;
+		goto out;
+	}
+	target = cfs_firststr(kbuf, len);
+
+	/* Kludge code (deadlock situation): the lprocfs lock has been held
+	 * since the client is evicted by writing the client's uuid/nid to
+	 * the procfs "evict_client" entry. However,
+	 * obd_export_evict_by_uuid() will call lprocfs_remove() to destroy
+	 * the proc entries under the export being destroyed, so the lock
+	 * has to be dropped first here.
+	 * - jay, jxiong@clusterfs.com */
+	class_incref(obd, __FUNCTION__, current);
+
+	if (strncmp(target, "nid:", 4) == 0) {
+		obd_export_evict_by_nid(obd, target + 4);
+	} else {
+		/* an optional "uuid:" prefix is skipped */
+		if (strncmp(target, "uuid:", 5) == 0)
+			target += 5;
+		obd_export_evict_by_uuid(obd, target);
+	}
+
+	class_decref(obd, __FUNCTION__, current);
+
+out:
+	OBD_FREE(kbuf, BUFLEN);
+	return count;
+}
+EXPORT_SYMBOL(lprocfs_wr_evict_client);
+
+#undef BUFLEN
+
+/*
+ * lprocfs write handler that sends a ping over the obd's client import and
+ * waits for it to complete. The written data itself is not examined.
+ *
+ * Returns \a count when the ping succeeds, -ENOMEM when the request cannot
+ * be prepared, or the negative error from ptlrpc_queue_wait().
+ */
+int lprocfs_wr_ping(struct file *file, const char *buffer,
+		    size_t count, loff_t *off)
+{
+	struct seq_file	      *m = file->private_data;
+	struct obd_device     *obd = m->private;
+	struct ptlrpc_request *ping;
+	int		       rc;
+	ENTRY;
+
+	LPROCFS_CLIMP_CHECK(obd);
+	ping = ptlrpc_prep_ping(obd->u.cli.cl_import);
+	LPROCFS_CLIMP_EXIT(obd);
+	if (ping == NULL)
+		RETURN(-ENOMEM);
+
+	ping->rq_send_state = LUSTRE_IMP_FULL;
+
+	rc = ptlrpc_queue_wait(ping);
+	ptlrpc_req_finished(ping);
+
+	if (rc < 0)
+		RETURN(rc);
+
+	RETURN(count);
+}
+EXPORT_SYMBOL(lprocfs_wr_ping);
+
+/* Write the connection UUID to this file to attempt to connect to that node.
+ * The connection UUID is a node's primary NID. For example,
+ * "echo connection=192.168.0.1@tcp0::instance > .../import".
+ *
+ * Without the "::instance" suffix a reconnect is always attempted. With
+ * it, a reconnect is attempted only when the instance number parses and
+ * differs from the one recorded in the import's connect data.
+ *
+ * Returns \a count on success, -EINVAL when the input is too long, too
+ * short or lacks the "connection=" prefix, -ENOMEM when the kernel buffer
+ * cannot be allocated, or -EFAULT when the user buffer cannot be read.
+ */
+int lprocfs_wr_import(struct file *file, const char *buffer,
+		      size_t count, loff_t *off)
+{
+	struct obd_device *obd = ((struct seq_file *)file->private_data)->private;
+	struct obd_import *imp = obd->u.cli.cl_import;
+	char *kbuf = NULL;
+	char *uuid;
+	char *ptr;
+	int do_reconn = 1;
+	const char prefix[] = "connection=";
+	const int prefix_len = sizeof(prefix) - 1;
+	size_t kbuf_len;
+	int rc;
+
+	if (count > PAGE_CACHE_SIZE - 1 || count <= prefix_len)
+		return -EINVAL;
+
+	/* The allocation size and the return code are kept apart from
+	 * count so that an error path cannot change the size handed to
+	 * OBD_FREE(). count fits in an int: it is below PAGE_CACHE_SIZE. */
+	kbuf_len = count + 1;
+	rc = count;
+
+	OBD_ALLOC(kbuf, kbuf_len);
+	if (kbuf == NULL)
+		return -ENOMEM;
+
+	if (copy_from_user(kbuf, buffer, count))
+		GOTO(out, rc = -EFAULT);
+
+	kbuf[count] = 0;
+
+	/* only support connection=uuid::instance now */
+	if (strncmp(prefix, kbuf, prefix_len) != 0)
+		GOTO(out, rc = -EINVAL);
+
+	uuid = kbuf + prefix_len;
+	ptr = strstr(uuid, "::");
+	if (ptr) {
+		__u32 inst;
+		char *endptr;
+
+		/* cut the uuid at "::" and parse the instance after it */
+		*ptr = 0;
+		do_reconn = 0;
+		ptr += strlen("::");
+		inst = simple_strtol(ptr, &endptr, 10);
+		if (*endptr) {
+			CERROR("config: wrong instance # %s\n", ptr);
+		} else if (inst != imp->imp_connect_data.ocd_instance) {
+			CDEBUG(D_INFO, "IR: %s is connecting to an obsoleted "
+			       "target(%u/%u), reconnecting...\n",
+			       imp->imp_obd->obd_name,
+			       imp->imp_connect_data.ocd_instance, inst);
+			do_reconn = 1;
+		} else {
+			CDEBUG(D_INFO, "IR: %s has already been connecting to "
+			       "new target(%u)\n",
+			       imp->imp_obd->obd_name, inst);
+		}
+	}
+
+	if (do_reconn)
+		ptlrpc_recover_import(imp, uuid, 1);
+
+out:
+	OBD_FREE(kbuf, kbuf_len);
+	return rc;
+}
+EXPORT_SYMBOL(lprocfs_wr_import);
+
+/*
+ * Show whether pinger-driven recovery is enabled for the obd's client
+ * import: prints 1 when imp_no_pinger_recover is clear and 0 when it is set.
+ * Returns whatever seq_printf() returns.
+ */
+int lprocfs_rd_pinger_recov(struct seq_file *m, void *n)
+{
+	struct obd_device *obd = m->private;
+	struct obd_import *imp = obd->u.cli.cl_import;
+	int rc;
+
+	/* NOTE(review): LPROCFS_CLIMP_CHECK/EXIT are defined elsewhere;
+	 * presumably they bracket safe access to the client import and
+	 * CHECK may return early - confirm against their definition. */
+	LPROCFS_CLIMP_CHECK(obd);
+	rc = seq_printf(m, "%d\n", !imp->imp_no_pinger_recover);
+	LPROCFS_CLIMP_EXIT(obd);
+
+	return rc;
+}
+EXPORT_SYMBOL(lprocfs_rd_pinger_recov);
+
+/*
+ * Enable (1) or disable (0) pinger-driven recovery on the obd's client
+ * import by storing the inverse of the written value in
+ * imp_no_pinger_recover.
+ *
+ * Returns \a count on success, the error from lprocfs_write_helper() when
+ * the value cannot be parsed, or -ERANGE when it is neither 0 nor 1.
+ */
+int lprocfs_wr_pinger_recov(struct file *file, const char *buffer,
+		      size_t count, loff_t *off)
+{
+	struct seq_file	  *m = file->private_data;
+	struct obd_device *obd = m->private;
+	struct obd_import *imp = obd->u.cli.cl_import;
+	int		   enable;
+	int		   rc;
+
+	rc = lprocfs_write_helper(buffer, count, &enable);
+	if (rc < 0)
+		return rc;
+
+	if (enable < 0 || enable > 1)
+		return -ERANGE;
+
+	LPROCFS_CLIMP_CHECK(obd);
+	spin_lock(&imp->imp_lock);
+	imp->imp_no_pinger_recover = !enable;
+	spin_unlock(&imp->imp_lock);
+	LPROCFS_CLIMP_EXIT(obd);
+
+	return count;
+}
+EXPORT_SYMBOL(lprocfs_wr_pinger_recov);
+
+#endif /* LPROCFS */

diff --git a/drivers/staging/lustre/lustre/ptlrpc/niobuf.c b/drivers/staging/lustre/lustre/ptlrpc/niobuf.c
new file mode 100644
index 0000000..de3f0db
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/niobuf.c

@@ -0,0 +1,728 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+#include <obd_support.h>
+#include <lustre_net.h>
+#include <lustre_lib.h>
+#include <obd.h>
+#include <obd_class.h>
+#include "ptlrpc_internal.h"
+
+/**
+ * Helper function. Binds a memory descriptor over \a len bytes at \a base
+ * and puts it to portal \a portal of the peer of connection \a conn, using
+ * \a xid as match bits and \a offset as the remote offset.
+ *
+ * A failed LNetPut() is not reported to the caller: the descriptor is
+ * unlinked and the resulting UNLINK event completes the send like any other
+ * failed send.
+ *
+ * Returns 0 on success or -ENOMEM if the descriptor could not be bound.
+ */
+static int ptl_send_buf(lnet_handle_md_t *mdh, void *base, int len,
+			lnet_ack_req_t ack, struct ptlrpc_cb_id *cbid,
+			struct ptlrpc_connection *conn, int portal, __u64 xid,
+			unsigned int offset)
+{
+	lnet_md_t	md;
+	int		rc;
+	ENTRY;
+
+	LASSERT(portal != 0);
+	LASSERT(conn != NULL);
+	CDEBUG(D_INFO, "conn=%p id %s\n", conn, libcfs_id2str(conn->c_peer));
+
+	md.start     = base;
+	md.length    = len;
+	/* two events are expected when an ACK was asked for */
+	if (ack == LNET_ACK_REQ)
+		md.threshold = 2;
+	else
+		md.threshold = 1;
+	md.options   = PTLRPC_MD_OPTIONS;
+	md.user_ptr  = cbid;
+	md.eq_handle = ptlrpc_eq_h;
+
+	/* fault injection: don't ask for the ack to simulate failing client */
+	if (unlikely(ack == LNET_ACK_REQ &&
+		     OBD_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_ACK, OBD_FAIL_ONCE)))
+		ack = LNET_NOACK_REQ;
+
+	rc = LNetMDBind(md, LNET_UNLINK, mdh);
+	if (unlikely(rc != 0)) {
+		CERROR ("LNetMDBind failed: %d\n", rc);
+		LASSERT (rc == -ENOMEM);
+		RETURN(-ENOMEM);
+	}
+
+	CDEBUG(D_NET, "Sending %d bytes to portal %d, xid "LPD64", offset %u\n",
+	       len, portal, xid, offset);
+
+	rc = LNetPut(conn->c_self, *mdh, ack,
+		     conn->c_peer, portal, xid, offset, 0);
+	if (unlikely(rc != 0)) {
+		int rc2;
+
+		/* The unlink below generates an UNLINK event, which will
+		 * complete just like any other failed send, so success is
+		 * still returned from here. */
+		CERROR("LNetPut(%s, %d, "LPD64") failed: %d\n",
+		       libcfs_id2str(conn->c_peer), portal, xid, rc);
+		rc2 = LNetMDUnlink(*mdh);
+		LASSERTF(rc2 == 0, "rc2 = %d\n", rc2);
+	}
+
+	RETURN(0);
+}
+
+/**
+ * Unlink the first \a count memory descriptor handles of \a bd_mds.
+ * The results of the individual LNetMDUnlink() calls are ignored.
+ */
+static void mdunlink_iterate_helper(lnet_handle_md_t *bd_mds, int count)
+{
+	lnet_handle_md_t *mdh = bd_mds;
+	int left;
+
+	for (left = count; left > 0; left--, mdh++)
+		LNetMDUnlink(*mdh);
+}
+
+
+/**
+ * Register bulk at the sender for later transfer.
+ *
+ * Attaches one match entry and memory descriptor per LNET_MAX_IOV pages of
+ * the request's bulk descriptor, using consecutive match bits that start at
+ * rq_xid aligned down to bd_md_max_brw. On success rq_xid is set to the
+ * match bits of the last bulk.
+ *
+ * The descriptor handled here is an import-side one (its peer comes from
+ * bd_import and its callback is client_bulk_callback), so the obd name used
+ * in messages is taken from bd_import rather than bd_export.
+ *
+ * Returns 0 on success or -ENOMEM if a match entry or memory descriptor
+ * could not be attached; in that case rq_status is set to -ENOMEM too.
+ */
+int ptlrpc_register_bulk(struct ptlrpc_request *req)
+{
+	struct ptlrpc_bulk_desc *desc = req->rq_bulk;
+	lnet_process_id_t peer;
+	int rc = 0;
+	int rc2;
+	int posted_md;
+	int total_md;
+	__u64 xid;
+	lnet_handle_me_t  me_h;
+	lnet_md_t	 md;
+	ENTRY;
+
+	if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_BULK_GET_NET))
+		RETURN(0);
+
+	/* NB no locking required until desc is on the network */
+	LASSERT(desc->bd_nob > 0);
+	LASSERT(desc->bd_md_count == 0);
+	LASSERT(desc->bd_md_max_brw <= PTLRPC_BULK_OPS_COUNT);
+	LASSERT(desc->bd_iov_count <= PTLRPC_MAX_BRW_PAGES);
+	LASSERT(desc->bd_req != NULL);
+	LASSERT(desc->bd_type == BULK_PUT_SINK ||
+		desc->bd_type == BULK_GET_SOURCE);
+
+	/* cleanup the state of the bulk for it will be reused */
+	if (req->rq_resend || req->rq_send_state == LUSTRE_IMP_REPLAY)
+		desc->bd_nob_transferred = 0;
+	else
+		LASSERT(desc->bd_nob_transferred == 0);
+
+	desc->bd_failure = 0;
+
+	peer = desc->bd_import->imp_connection->c_peer;
+
+	LASSERT(desc->bd_cbid.cbid_fn == client_bulk_callback);
+	LASSERT(desc->bd_cbid.cbid_arg == desc);
+
+	/* An XID is only used for a single request from the client.
+	 * For retried bulk transfers, a new XID will be allocated in
+	 * ptlrpc_check_set() if it needs to be resent, so it is not
+	 * using the same RDMA match bits after an error.
+	 *
+	 * For multi-bulk RPCs, rq_xid is the last XID needed for bulks. The
+	 * first bulk XID is power-of-two aligned before rq_xid. LU-1431 */
+	xid = req->rq_xid & ~((__u64)desc->bd_md_max_brw - 1);
+	LASSERTF(!(desc->bd_registered &&
+		   req->rq_send_state != LUSTRE_IMP_REPLAY) ||
+		 xid != desc->bd_last_xid,
+		 "registered: %d  rq_xid: "LPU64" bd_last_xid: "LPU64"\n",
+		 desc->bd_registered, xid, desc->bd_last_xid);
+
+	/* one MD per LNET_MAX_IOV pages, rounded up */
+	total_md = (desc->bd_iov_count + LNET_MAX_IOV - 1) / LNET_MAX_IOV;
+	desc->bd_registered = 1;
+	desc->bd_last_xid = xid;
+	desc->bd_md_count = total_md;
+	md.user_ptr = &desc->bd_cbid;
+	md.eq_handle = ptlrpc_eq_h;
+	md.threshold = 1;		       /* PUT or GET */
+
+	for (posted_md = 0; posted_md < total_md; posted_md++, xid++) {
+		md.options = PTLRPC_MD_OPTIONS |
+			     ((desc->bd_type == BULK_GET_SOURCE) ?
+			      LNET_MD_OP_GET : LNET_MD_OP_PUT);
+		ptlrpc_fill_bulk_md(&md, desc, posted_md);
+
+		rc = LNetMEAttach(desc->bd_portal, peer, xid, 0,
+				  LNET_UNLINK, LNET_INS_AFTER, &me_h);
+		if (rc != 0) {
+			CERROR("%s: LNetMEAttach failed x"LPU64"/%d: rc = %d\n",
+			       desc->bd_import->imp_obd->obd_name, xid,
+			       posted_md, rc);
+			break;
+		}
+
+		/* About to let the network at it... */
+		rc = LNetMDAttach(me_h, md, LNET_UNLINK,
+				  &desc->bd_mds[posted_md]);
+		if (rc != 0) {
+			CERROR("%s: LNetMDAttach failed x"LPU64"/%d: rc = %d\n",
+			       desc->bd_import->imp_obd->obd_name, xid,
+			       posted_md, rc);
+			rc2 = LNetMEUnlink(me_h);
+			LASSERT(rc2 == 0);
+			break;
+		}
+	}
+
+	if (rc != 0) {
+		LASSERT(rc == -ENOMEM);
+		/* discount the MDs that were never posted, then unlink the
+		 * ones that were */
+		spin_lock(&desc->bd_lock);
+		desc->bd_md_count -= total_md - posted_md;
+		spin_unlock(&desc->bd_lock);
+		LASSERT(desc->bd_md_count >= 0);
+		mdunlink_iterate_helper(desc->bd_mds, desc->bd_md_max_brw);
+		req->rq_status = -ENOMEM;
+		RETURN(-ENOMEM);
+	}
+
+	/* Set rq_xid to matchbits of the final bulk so that server can
+	 * infer the number of bulks that were prepared */
+	req->rq_xid = --xid;
+	LASSERTF(desc->bd_last_xid == (req->rq_xid & PTLRPC_BULK_OPS_MASK),
+		 "bd_last_xid = x"LPU64", rq_xid = x"LPU64"\n",
+		 desc->bd_last_xid, req->rq_xid);
+
+	spin_lock(&desc->bd_lock);
+	/* Holler if peer manages to touch buffers before he knows the xid */
+	if (desc->bd_md_count != total_md)
+		CWARN("%s: Peer %s touched %d buffers while I registered\n",
+		      desc->bd_import->imp_obd->obd_name, libcfs_id2str(peer),
+		      total_md - desc->bd_md_count);
+	spin_unlock(&desc->bd_lock);
+
+	CDEBUG(D_NET, "Setup %u bulk %s buffers: %u pages %u bytes, "
+	       "xid x"LPX64"-"LPX64", portal %u\n", desc->bd_md_count,
+	       desc->bd_type == BULK_GET_SOURCE ? "get-source" : "put-sink",
+	       desc->bd_iov_count, desc->bd_nob,
+	       desc->bd_last_xid, req->rq_xid, desc->bd_portal);
+
+	RETURN(0);
+}
+EXPORT_SYMBOL(ptlrpc_register_bulk);
+
+/**
+ * Disconnect a bulk desc from the network. Idempotent. Not
+ * thread-safe (i.e. only interlocks with completion callback).
+ *
+ * \param req	request whose bulk descriptor (rq_bulk) is to be unlinked
+ * \param async	if non-zero, do not wait for the unlink to finish
+ *
+ * Returns 1 on success or 0 if network unregistration failed for whatever
+ * reason. In the asynchronous case 0 means the bulk was still active after
+ * the unlink was issued and the request has been moved to the
+ * RQ_PHASE_UNREGISTERING phase.
+ */
+int ptlrpc_unregister_bulk(struct ptlrpc_request *req, int async)
+{
+	struct ptlrpc_bulk_desc *desc = req->rq_bulk;
+	wait_queue_head_t	     *wq;
+	struct l_wait_info       lwi;
+	int		      rc;
+	ENTRY;
+
+	LASSERT(!in_interrupt());     /* might sleep */
+
+	/* Let's setup deadline for reply unlink. */
+	if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK) &&
+	    async && req->rq_bulk_deadline == 0)
+		req->rq_bulk_deadline = cfs_time_current_sec() + LONG_UNLINK;
+
+	if (ptlrpc_client_bulk_active(req) == 0)	/* completed or */
+		RETURN(1);				/* never registered */
+
+	LASSERT(desc->bd_req == req);  /* bd_req NULL until registered */
+
+	/* the unlink ensures the callback happens ASAP and is the last
+	 * one.  If it fails, it must be because completion just happened,
+	 * but we must still l_wait_event() in this case to give liblustre
+	 * a chance to run client_bulk_callback() */
+	mdunlink_iterate_helper(desc->bd_mds, desc->bd_md_max_brw);
+
+	/* re-check: the unlink above may already have completed the bulk */
+	if (ptlrpc_client_bulk_active(req) == 0)	/* completed or */
+		RETURN(1);				/* never registered */
+
+	/* Move to "Unregistering" phase as bulk was not unlinked yet. */
+	ptlrpc_rqphase_move(req, RQ_PHASE_UNREGISTERING);
+
+	/* Do not wait for unlink to finish. */
+	if (async)
+		RETURN(0);
+
+	/* wait on the set's queue when the request belongs to a set,
+	 * otherwise on the request's own reply queue */
+	if (req->rq_set != NULL)
+		wq = &req->rq_set->set_waitq;
+	else
+		wq = &req->rq_reply_waitq;
+
+	for (;;) {
+		/* Network access will complete in finite time but the HUGE
+		 * timeout lets us CWARN for visibility of sluggish NALs */
+		lwi = LWI_TIMEOUT_INTERVAL(cfs_time_seconds(LONG_UNLINK),
+					   cfs_time_seconds(1), NULL, NULL);
+		rc = l_wait_event(*wq, !ptlrpc_client_bulk_active(req), &lwi);
+		if (rc == 0) {
+			ptlrpc_rqphase_move(req, req->rq_next_phase);
+			RETURN(1);
+		}
+
+		/* timed out: warn and keep waiting */
+		LASSERT(rc == -ETIMEDOUT);
+		DEBUG_REQ(D_WARNING, req, "Unexpectedly long timeout: desc %p",
+			  desc);
+	}
+	RETURN(0);
+}
+EXPORT_SYMBOL(ptlrpc_unregister_bulk);
+
+/**
+ * Fill in the adaptive-timeout fields of the reply of \a req.
+ *
+ * The time spent servicing the request (at least 1 second) is fed into the
+ * service partition's estimate unless the reply is an early reply, an
+ * error, or answers a resent/replayed request. The reply then carries the
+ * measured service time and the current estimate, or a timeout of 0 for an
+ * error reply sent without an export or during recovery.
+ *
+ * \a flags are the PTLRPC_REPLY_* flags of the reply being sent.
+ */
+static void ptlrpc_at_set_reply(struct ptlrpc_request *req, int flags)
+{
+	struct ptlrpc_service_part	*svcpt = req->rq_rqbd->rqbd_svcpt;
+	struct ptlrpc_service		*svc = svcpt->scp_service;
+	int service_time = max_t(int, cfs_time_current_sec() -
+				 req->rq_arrival_time.tv_sec, 1);
+
+	/* early replies, errors and recovery requests don't count
+	 * toward our service time estimate */
+	if (!(flags & PTLRPC_REPLY_EARLY) &&
+	    req->rq_type != PTL_RPC_MSG_ERR &&
+	    req->rq_reqmsg != NULL) {
+		if (!(lustre_msg_get_flags(req->rq_reqmsg) &
+		      (MSG_RESENT | MSG_REPLAY |
+		       MSG_REQ_REPLAY_DONE | MSG_LOCK_REPLAY_DONE))) {
+			int prev = at_measured(&svcpt->scp_at_estimate,
+					       service_time);
+
+			if (prev != 0)
+				DEBUG_REQ(D_ADAPTTO, req,
+					  "svc %s changed estimate from %d to %d",
+					  svc->srv_name, prev,
+					  at_get(&svcpt->scp_at_estimate));
+		}
+	}
+
+	/* Report actual service time for client latency calc */
+	lustre_msg_set_service_time(req->rq_repmsg, service_time);
+
+	/* Report service time estimate for future client reqs, but report 0
+	 * (to be ignored by client) if it's a error reply during recovery.
+	 * (bz15815) */
+	if (req->rq_type != PTL_RPC_MSG_ERR ||
+	    (req->rq_export != NULL &&
+	     !req->rq_export->exp_obd->obd_recovering))
+		lustre_msg_set_timeout(req->rq_repmsg,
+				       at_get(&svcpt->scp_at_estimate));
+	else
+		lustre_msg_set_timeout(req->rq_repmsg, 0);
+
+	if (req->rq_reqmsg &&
+	    !(lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT)) {
+		CDEBUG(D_ADAPTTO, "No early reply support: flags=%#x "
+		       "req_flags=%#x magic=%d:%x/%x len=%d\n",
+		       flags, lustre_msg_get_flags(req->rq_reqmsg),
+		       lustre_msg_is_v1(req->rq_reqmsg),
+		       lustre_msg_get_magic(req->rq_reqmsg),
+		       lustre_msg_get_magic(req->rq_repmsg), req->rq_replen);
+	}
+}
+
+/**
+ * Send request reply from request \a req reply buffer.
+ * \a flags defines reply types; PTLRPC_REPLY_MAYBE_DIFFICULT must be set in
+ * it if the reply state is marked difficult (asserted below).
+ * Returns 0 on success or error code: -ENOTCONN when no connection to the
+ * peer could be obtained, otherwise whatever sptlrpc_svc_wrap_reply() or
+ * ptl_send_buf() returned. On a wrap/send failure ptlrpc_req_drop_rs() is
+ * called on \a req before returning.
+ */
+int ptlrpc_send_reply(struct ptlrpc_request *req, int flags)
+{
+	struct ptlrpc_reply_state *rs = req->rq_reply_state;
+	struct ptlrpc_connection  *conn;
+	int			rc;
+
+	/* We must already have a reply buffer (only ptlrpc_error() may be
+	 * called without one). The reply generated by sptlrpc layer (e.g.
+	 * error notify, etc.) might have NULL rq_reqmsg; otherwise we must
+	 * have a request buffer which is either the actual (swabbed) incoming
+	 * request, or a saved copy if this is a req saved in
+	 * target_queue_final_reply().
+	 */
+	LASSERT (req->rq_no_reply == 0);
+	LASSERT (req->rq_reqbuf != NULL);
+	LASSERT (rs != NULL);
+	LASSERT ((flags & PTLRPC_REPLY_MAYBE_DIFFICULT) || !rs->rs_difficult);
+	LASSERT (req->rq_repmsg != NULL);
+	LASSERT (req->rq_repmsg == rs->rs_msg);
+	LASSERT (rs->rs_cb_id.cbid_fn == reply_out_callback);
+	LASSERT (rs->rs_cb_id.cbid_arg == rs);
+
+	/* There may be no rq_export during failover */
+
+	if (unlikely(req->rq_export && req->rq_export->exp_obd &&
+		     req->rq_export->exp_obd->obd_fail)) {
+		/* Failed obd's only send ENODEV */
+		req->rq_type = PTL_RPC_MSG_ERR;
+		req->rq_status = -ENODEV;
+		CDEBUG(D_HA, "sending ENODEV from failed obd %d\n",
+		       req->rq_export->exp_obd->obd_minor);
+	}
+
+	/* In order to keep interoperability with clients (< 2.3) which
+	 * don't have pb_jobid in ptlrpc_body, we have to shrink the
+	 * ptlrpc_body in the reply buffer to ptlrpc_body_v2; otherwise the
+	 * reply buffer on the client would overflow.
+	 *
+	 * XXX Remove this whenever we drop interoperability with such clients.
+	 */
+	req->rq_replen = lustre_shrink_msg(req->rq_repmsg, 0,
+					   sizeof(struct ptlrpc_body_v2), 1);
+
+	if (req->rq_type != PTL_RPC_MSG_ERR)
+		req->rq_type = PTL_RPC_MSG_REPLY;
+
+	lustre_msg_set_type(req->rq_repmsg, req->rq_type);
+	lustre_msg_set_status(req->rq_repmsg, req->rq_status);
+	lustre_msg_set_opc(req->rq_repmsg,
+		req->rq_reqmsg ? lustre_msg_get_opc(req->rq_reqmsg) : 0);
+
+	target_pack_pool_reply(req);
+
+	ptlrpc_at_set_reply(req, flags);
+
+	if (req->rq_export == NULL || req->rq_export->exp_connection == NULL)
+		conn = ptlrpc_connection_get(req->rq_peer, req->rq_self, NULL);
+	else
+		conn = ptlrpc_connection_addref(req->rq_export->exp_connection);
+
+	if (unlikely(conn == NULL)) {
+		CERROR("not replying on NULL connection\n"); /* bug 9635 */
+		return -ENOTCONN;
+	}
+	ptlrpc_rs_addref(rs);		   /* +1 ref for the network */
+
+	rc = sptlrpc_svc_wrap_reply(req);
+	if (unlikely(rc))
+		goto out;
+
+	req->rq_sent = cfs_time_current_sec();
+
+	rc = ptl_send_buf (&rs->rs_md_h, rs->rs_repbuf, rs->rs_repdata_len,
+			   (rs->rs_difficult && !rs->rs_no_ack) ?
+			   LNET_ACK_REQ : LNET_NOACK_REQ,
+			   &rs->rs_cb_id, conn,
+			   ptlrpc_req2svc(req)->srv_rep_portal,
+			   req->rq_xid, req->rq_reply_off);
+out:
+	if (unlikely(rc != 0))
+		ptlrpc_req_drop_rs(req);
+	ptlrpc_connection_put(conn);	/* release the conn obtained above, on success and failure alike */
+	return rc;
+}
+EXPORT_SYMBOL(ptlrpc_send_reply);
+
+int ptlrpc_reply (struct ptlrpc_request *req)
+{
+	if (req->rq_no_reply)	/* replies are suppressed for this request */
+		return 0;
+	else
+		return (ptlrpc_send_reply(req, 0));	/* flags == 0: no PTLRPC_REPLY_MAYBE_DIFFICULT */
+}
+EXPORT_SYMBOL(ptlrpc_reply);
+
+/**
+ * For request \a req send an error reply back. Create empty
+ * reply buffers if necessary.
+ *
+ * \a may_be_difficult is handed to ptlrpc_send_reply() unchanged as its
+ * flags argument.
+ *
+ * Returns 0 without sending anything if rq_no_reply is set; otherwise the
+ * error from lustre_pack_reply() if a reply buffer had to be packed and that
+ * failed, or the result of ptlrpc_send_reply().
+ */
+int ptlrpc_send_error(struct ptlrpc_request *req, int may_be_difficult)
+{
+	int rc;
+	ENTRY;
+
+	if (req->rq_no_reply)
+		RETURN(0);
+
+	if (!req->rq_repmsg) {
+		rc = lustre_pack_reply(req, 1, NULL, NULL);
+		if (rc)
+			RETURN(rc);
+	}
+
+	if (req->rq_status != -ENOSPC && req->rq_status != -EACCES &&
+	    req->rq_status != -EPERM && req->rq_status != -ENOENT &&
+	    req->rq_status != -EINPROGRESS && req->rq_status != -EDQUOT)
+		req->rq_type = PTL_RPC_MSG_ERR;	/* the statuses listed above keep the current rq_type */
+
+	rc = ptlrpc_send_reply(req, may_be_difficult);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(ptlrpc_send_error);
+
+int ptlrpc_error(struct ptlrpc_request *req)
+{
+	return ptlrpc_send_error(req, 0);	/* 0: PTLRPC_REPLY_MAYBE_DIFFICULT is not set */
+}
+EXPORT_SYMBOL(ptlrpc_error);
+
+/**
+ * Send request \a request.
+ * if \a noreply is set, don't expect any reply back and don't set up
+ * reply buffers.
+ * Returns 0 on success or error code. Note that 0 is also returned, without
+ * sending anything, when the OBD_FAIL_PTLRPC_DROP_RPC fail check fires, and
+ * that -ENODEV is returned (with rq_err and rq_status set on \a request)
+ * when the import's obd has obd_fail set.
+ */
+int ptl_send_rpc(struct ptlrpc_request *request, int noreply)
+{
+	int rc;
+	int rc2;
+	int mpflag = 0;
+	struct ptlrpc_connection *connection;
+	lnet_handle_me_t  reply_me_h;
+	lnet_md_t	 reply_md;
+	struct obd_device *obd = request->rq_import->imp_obd;
+	ENTRY;
+
+	if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_DROP_RPC))
+		RETURN(0);
+
+	LASSERT(request->rq_type == PTL_RPC_MSG_REQUEST);
+	LASSERT(request->rq_wait_ctx == 0);
+
+	/* If this is a re-transmit, we're required to have disengaged
+	 * cleanly from the previous attempt */
+	LASSERT(!request->rq_receiving_reply);
+
+	if (request->rq_import->imp_obd &&
+	    request->rq_import->imp_obd->obd_fail) {
+		CDEBUG(D_HA, "muting rpc for failed imp obd %s\n",
+		       request->rq_import->imp_obd->obd_name);
+		/* this prevents us from waiting in ptlrpc_queue_wait */
+		request->rq_err = 1;
+		request->rq_status = -ENODEV;
+		RETURN(-ENODEV);
+	}
+
+	connection = request->rq_import->imp_connection;
+
+	lustre_msg_set_handle(request->rq_reqmsg,
+			      &request->rq_import->imp_remote_handle);
+	lustre_msg_set_type(request->rq_reqmsg, PTL_RPC_MSG_REQUEST);
+	lustre_msg_set_conn_cnt(request->rq_reqmsg,
+				request->rq_import->imp_conn_cnt);
+	lustre_msghdr_set_flags(request->rq_reqmsg,
+				request->rq_import->imp_msghdr_flags);
+
+	if (request->rq_resend)
+		lustre_msg_add_flags(request->rq_reqmsg, MSG_RESENT);
+
+	if (request->rq_memalloc)
+		mpflag = cfs_memory_pressure_get_and_set();	/* restored at "out" below */
+
+	rc = sptlrpc_cli_wrap_request(request);
+	if (rc)
+		GOTO(out, rc);
+
+	/* bulk register should be done after wrap_request() */
+	if (request->rq_bulk != NULL) {
+		rc = ptlrpc_register_bulk (request);
+		if (rc != 0)
+			GOTO(out, rc);
+	}
+
+	if (!noreply) {
+		LASSERT (request->rq_replen != 0);
+		if (request->rq_repbuf == NULL) {
+			LASSERT(request->rq_repdata == NULL);
+			LASSERT(request->rq_repmsg == NULL);
+			rc = sptlrpc_cli_alloc_repbuf(request,
+						      request->rq_replen);
+			if (rc) {
+				/* this prevents us from looping in
+				 * ptlrpc_queue_wait */
+				request->rq_err = 1;
+				request->rq_status = rc;
+				GOTO(cleanup_bulk, rc);
+			}
+		} else {
+			request->rq_repdata = NULL;
+			request->rq_repmsg = NULL;
+		}
+
+		rc = LNetMEAttach(request->rq_reply_portal,/*XXX FIXME bug 249*/
+				  connection->c_peer, request->rq_xid, 0,
+				  LNET_UNLINK, LNET_INS_AFTER, &reply_me_h);
+		if (rc != 0) {
+			CERROR("LNetMEAttach failed: %d\n", rc);
+			LASSERT (rc == -ENOMEM);
+			GOTO(cleanup_bulk, rc = -ENOMEM);
+		}
+	}
+
+	spin_lock(&request->rq_lock);
+	/* If the MD attach succeeds, there _will_ be a reply_in callback */
+	request->rq_receiving_reply = !noreply;
+	/* We are responsible for unlinking the reply buffer */
+	request->rq_must_unlink = !noreply;
+	/* Clear any flags that may be present from previous sends. */
+	request->rq_replied = 0;
+	request->rq_err = 0;
+	request->rq_timedout = 0;
+	request->rq_net_err = 0;
+	request->rq_resend = 0;
+	request->rq_restart = 0;
+	request->rq_reply_truncate = 0;
+	spin_unlock(&request->rq_lock);
+
+	if (!noreply) {
+		reply_md.start     = request->rq_repbuf;
+		reply_md.length    = request->rq_repbuf_len;
+		/* Allow multiple early replies */
+		reply_md.threshold = LNET_MD_THRESH_INF;
+		/* Manage remote for early replies */
+		reply_md.options   = PTLRPC_MD_OPTIONS | LNET_MD_OP_PUT |
+			LNET_MD_MANAGE_REMOTE |
+			LNET_MD_TRUNCATE; /* allow to make EOVERFLOW error */;
+		reply_md.user_ptr  = &request->rq_reply_cbid;
+		reply_md.eq_handle = ptlrpc_eq_h;
+
+		/* We must see the unlink callback to unset rq_must_unlink,
+		   so we can't auto-unlink */
+		rc = LNetMDAttach(reply_me_h, reply_md, LNET_RETAIN,
+				  &request->rq_reply_md_h);
+		if (rc != 0) {
+			CERROR("LNetMDAttach failed: %d\n", rc);
+			LASSERT (rc == -ENOMEM);
+			spin_lock(&request->rq_lock);
+			/* ...but the MD attach didn't succeed... */
+			request->rq_receiving_reply = 0;
+			spin_unlock(&request->rq_lock);
+			GOTO(cleanup_me, rc = -ENOMEM);
+		}
+
+		CDEBUG(D_NET, "Setup reply buffer: %u bytes, xid "LPU64
+		       ", portal %u\n",
+		       request->rq_repbuf_len, request->rq_xid,
+		       request->rq_reply_portal);
+	}
+
+	/* add references on request for request_out_callback */
+	ptlrpc_request_addref(request);
+	if (obd->obd_svc_stats != NULL)
+		lprocfs_counter_add(obd->obd_svc_stats, PTLRPC_REQACTIVE_CNTR,
+			atomic_read(&request->rq_import->imp_inflight));
+
+	OBD_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_DELAY_SEND, request->rq_timeout + 5);
+
+	do_gettimeofday(&request->rq_arrival_time);
+	request->rq_sent = cfs_time_current_sec();
+	/* We give the server rq_timeout secs to process the req, and
+	   add the network latency for our local timeout. */
+	request->rq_deadline = request->rq_sent + request->rq_timeout +
+		ptlrpc_at_get_net_latency(request);
+
+	ptlrpc_pinger_sending_on_import(request->rq_import);
+
+	DEBUG_REQ(D_INFO, request, "send flg=%x",
+		  lustre_msg_get_flags(request->rq_reqmsg));
+	rc = ptl_send_buf(&request->rq_req_md_h,
+			  request->rq_reqbuf, request->rq_reqdata_len,
+			  LNET_NOACK_REQ, &request->rq_req_cbid,
+			  connection,
+			  request->rq_request_portal,
+			  request->rq_xid, 0);
+	if (rc == 0)
+		GOTO(out, rc);
+
+	ptlrpc_req_finished(request);	/* send failed; presumably drops the ref added above for request_out_callback */
+	if (noreply)
+		GOTO(out, rc);	/* no reply ME/MD was set up, so nothing to unlink */
+
+ cleanup_me:
+	/* MEUnlink is safe; the PUT didn't even get off the ground, and
+	 * nobody apart from the PUT's target has the right nid+XID to
+	 * access the reply buffer. */
+	rc2 = LNetMEUnlink(reply_me_h);
+	LASSERT (rc2 == 0);
+	/* UNLINKED callback called synchronously */
+	LASSERT(!request->rq_receiving_reply);
+
+ cleanup_bulk:
+	/* We do sync unlink here as there was no real transfer here so
+	 * the chance to have long unlink to sluggish net is smaller here. */
+	ptlrpc_unregister_bulk(request, 0);
+ out:
+	if (request->rq_memalloc)
+		cfs_memory_pressure_restore(mpflag);
+	return rc;
+}
+EXPORT_SYMBOL(ptl_send_rpc);
+
+/**
+ * Register request buffer descriptor for request receiving.
+ *
+ * Attaches a match entry on the service's request portal that matches any
+ * peer, then attaches \a rqbd's buffer to it as a memory descriptor.
+ *
+ * Returns 0 on success. Every failure path (the OBD_FAIL_PTLRPC_RQBD fail
+ * check, LNetMEAttach() failure, LNetMDAttach() failure) returns -ENOMEM,
+ * whatever the underlying error code was.
+ */
+int ptlrpc_register_rqbd(struct ptlrpc_request_buffer_desc *rqbd)
+{
+	struct ptlrpc_service	  *service = rqbd->rqbd_svcpt->scp_service;
+	static lnet_process_id_t  match_id = {LNET_NID_ANY, LNET_PID_ANY};
+	int			  rc;
+	lnet_md_t		 md;
+	lnet_handle_me_t	  me_h;
+
+	CDEBUG(D_NET, "LNetMEAttach: portal %d\n",
+	       service->srv_req_portal);
+
+	if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_RQBD))
+		return (-ENOMEM);
+
+	/* NB: CPT affinity service should use new LNet flag LNET_INS_LOCAL,
+	 * which means buffer can only be attached on local CPT, and LND
+	 * threads can find it by grabbing a local lock */
+	rc = LNetMEAttach(service->srv_req_portal,
+			  match_id, 0, ~0, LNET_UNLINK,
+			  rqbd->rqbd_svcpt->scp_cpt >= 0 ?
+			  LNET_INS_LOCAL : LNET_INS_AFTER, &me_h);
+	if (rc != 0) {
+		CERROR("LNetMEAttach failed: %d\n", rc);
+		return (-ENOMEM);
+	}
+
+	LASSERT(rqbd->rqbd_refcount == 0);
+	rqbd->rqbd_refcount = 1;	/* reset to 0 below if the MD attach fails */
+
+	md.start     = rqbd->rqbd_buffer;
+	md.length    = service->srv_buf_size;
+	md.max_size  = service->srv_max_req_size;
+	md.threshold = LNET_MD_THRESH_INF;
+	md.options   = PTLRPC_MD_OPTIONS | LNET_MD_OP_PUT | LNET_MD_MAX_SIZE;
+	md.user_ptr  = &rqbd->rqbd_cbid;
+	md.eq_handle = ptlrpc_eq_h;
+
+	rc = LNetMDAttach(me_h, md, LNET_UNLINK, &rqbd->rqbd_md_h);
+	if (rc == 0)
+		return (0);
+
+	CERROR("LNetMDAttach failed: %d; \n", rc);
+	LASSERT (rc == -ENOMEM);
+	rc = LNetMEUnlink (me_h);	/* undo the ME attach done above */
+	LASSERT (rc == 0);
+	rqbd->rqbd_refcount = 0;
+
+	return (-ENOMEM);
+}

diff --git a/drivers/staging/lustre/lustre/ptlrpc/nrs.c b/drivers/staging/lustre/lustre/ptlrpc/nrs.c
new file mode 100644
index 0000000..1996431
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/nrs.c

@@ -0,0 +1,1790 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License version 2 for more details.  A copy is
+ * included in the COPYING file that accompanied this code.
+
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2011 Intel Corporation
+ *
+ * Copyright 2012 Xyratex Technology Limited
+ */
+/*
+ * lustre/ptlrpc/nrs.c
+ *
+ * Network Request Scheduler (NRS)
+ *
+ * Allows to reorder the handling of RPCs at servers.
+ *
+ * Author: Liang Zhen <liang@whamcloud.com>
+ * Author: Nikitas Angelinas <nikitas_angelinas@xyratex.com>
+ */
+/**
+ * \addtogroup nrs
+ * @{
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+#include <obd_support.h>
+#include <obd_class.h>
+#include <lustre_net.h>
+#include <lprocfs_status.h>
+#include <linux/libcfs/libcfs.h>
+#include "ptlrpc_internal.h"
+
+/* XXX: This is just for liblustre. Remove the #if defined directive when the
+ * "cfs_" prefix is dropped from cfs_list_head. */
+extern struct list_head ptlrpc_all_services;
+
+/**
+ * NRS core object.
+ */
+struct nrs_core nrs_core;
+
+static int nrs_policy_init(struct ptlrpc_nrs_policy *policy)
+{
+	return policy->pol_desc->pd_ops->op_policy_init != NULL ?
+	       policy->pol_desc->pd_ops->op_policy_init(policy) : 0;	/* hook is optional; 0 when the policy has none */
+}
+
+static void nrs_policy_fini(struct ptlrpc_nrs_policy *policy)
+{
+	LASSERT(policy->pol_ref == 0);	/* no usage references may remain */
+	LASSERT(policy->pol_req_queued == 0);	/* and no requests may still be queued */
+
+	if (policy->pol_desc->pd_ops->op_policy_fini != NULL)	/* hook is optional */
+		policy->pol_desc->pd_ops->op_policy_fini(policy);
+}
+
+static int nrs_policy_ctl_locked(struct ptlrpc_nrs_policy *policy,
+				 enum ptlrpc_nrs_ctl opc, void *arg)
+{
+	/**
+	 * The policy may be stopped, but the lprocfs files and
+	 * ptlrpc_nrs_policy instances remain present until unregistration time.
+	 * Do not perform the ctl operation if the policy is stopped, as
+	 * policy->pol_private will be NULL in such a case.
+	 *
+	 * Otherwise \a opc and \a arg are forwarded to the policy's
+	 * op_policy_ctl() hook and its result is returned.
+	 */
+	if (policy->pol_state == NRS_POL_STATE_STOPPED)
+		RETURN(-ENODEV);
+
+	RETURN(policy->pol_desc->pd_ops->op_policy_ctl != NULL ?
+	       policy->pol_desc->pd_ops->op_policy_ctl(policy, opc, arg) :
+	       -ENOSYS);	/* -ENOSYS: the policy implements no ctl hook */
+}
+
+static void nrs_policy_stop0(struct ptlrpc_nrs_policy *policy)
+{
+	struct ptlrpc_nrs *nrs = policy->pol_nrs;
+	ENTRY;
+
+	if (policy->pol_desc->pd_ops->op_policy_stop != NULL) {
+		spin_unlock(&nrs->nrs_lock);	/* nrs_lock is dropped around the optional stop hook */
+
+		policy->pol_desc->pd_ops->op_policy_stop(policy);
+
+		spin_lock(&nrs->nrs_lock);	/* ...and re-taken before the policy state is touched */
+	}
+
+	LASSERT(list_empty(&policy->pol_list_queued));
+	LASSERT(policy->pol_req_queued == 0 &&
+		policy->pol_req_started == 0);
+
+	policy->pol_private = NULL;
+
+	policy->pol_state = NRS_POL_STATE_STOPPED;
+
+	if (atomic_dec_and_test(&policy->pol_desc->pd_refs))	/* last descriptor ref also releases the module ref */
+		module_put(policy->pol_desc->pd_owner);
+
+	EXIT;
+}
+
+static int nrs_policy_stop_locked(struct ptlrpc_nrs_policy *policy)
+{
+	struct ptlrpc_nrs *nrs = policy->pol_nrs;
+	ENTRY;
+
+	if (nrs->nrs_policy_fallback == policy && !nrs->nrs_stopping)
+		RETURN(-EPERM);	/* the fallback may only be stopped while the NRS head is stopping */
+
+	if (policy->pol_state == NRS_POL_STATE_STARTING)
+		RETURN(-EAGAIN);
+
+	/* In progress or already stopped */
+	if (policy->pol_state != NRS_POL_STATE_STARTED)
+		RETURN(0);
+
+	policy->pol_state = NRS_POL_STATE_STOPPING;
+
+	/* Immediately make it invisible */
+	if (nrs->nrs_policy_primary == policy) {
+		nrs->nrs_policy_primary = NULL;
+
+	} else {
+		LASSERT(nrs->nrs_policy_fallback == policy);
+		nrs->nrs_policy_fallback = NULL;
+	}
+
+	/* I have the only refcount; otherwise the policy stays in the
+	 * STOPPING state and nrs_policy_put_locked() stops it once the last
+	 * reference is dropped */
+	if (policy->pol_ref == 1)
+		nrs_policy_stop0(policy);
+
+	RETURN(0);
+}
+
+/**
+ * Transitions the \a nrs NRS head's primary policy to
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING and if the policy has no
+ * pending usage references, to ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED.
+ *
+ * Does nothing if the NRS head currently has no primary policy. The primary
+ * policy, if any, is asserted to be in the STARTED state.
+ *
+ * \param[in] nrs the NRS head to carry out this operation on
+ */
+static void nrs_policy_stop_primary(struct ptlrpc_nrs *nrs)
+{
+	struct ptlrpc_nrs_policy *tmp = nrs->nrs_policy_primary;
+	ENTRY;
+
+	if (tmp == NULL) {
+		/**
+		 * XXX: This should really be RETURN_EXIT, but the latter does
+		 * not currently print anything out, and possibly should be
+		 * fixed to do so.
+		 */
+		EXIT;
+		return;
+	}
+
+	nrs->nrs_policy_primary = NULL;
+
+	LASSERT(tmp->pol_state == NRS_POL_STATE_STARTED);
+	tmp->pol_state = NRS_POL_STATE_STOPPING;
+
+	if (tmp->pol_ref == 0)	/* otherwise stopped later, by the final nrs_policy_put_locked() */
+		nrs_policy_stop0(tmp);
+	EXIT;
+}
+
+/**
+ * Transitions a policy across the ptlrpc_nrs_pol_state range of values, in
+ * response to an lprocfs command to start a policy.
+ *
+ * If a primary policy different to the current one is specified, this function
+ * will transition the new policy to the
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTING and then to
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED, and will then transition
+ * the old primary policy (if there is one) to
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING, and if there are no outstanding
+ * references on the policy to ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED.
+ *
+ * If the fallback policy is specified, this is taken to indicate an instruction
+ * to stop the current primary policy, without substituting it with another
+ * primary policy, so the primary policy (if any) is transitioned to
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING, and if there are no outstanding
+ * references on the policy to ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED. In
+ * this case, only the fallback policy is left active in the NRS head.
+ *
+ * The NRS head's nrs_lock is released around the call to the policy's
+ * op_policy_start() hook and re-acquired afterwards.
+ *
+ * \retval -EAGAIN another policy is being started on this NRS head, or
+ *		   \a policy is currently stopping
+ * \retval -EPERM  \a policy is not a fallback policy and the NRS head has no
+ *		   fallback policy yet
+ * \retval -ENODEV a reference on the policy's owner module could not be taken
+ * \retval other   non-zero value returned by the policy's op_policy_start()
+ * \retval 0	   success, or nothing needed to be started
+ */
+static int nrs_policy_start_locked(struct ptlrpc_nrs_policy *policy)
+{
+	struct ptlrpc_nrs      *nrs = policy->pol_nrs;
+	int			rc = 0;
+	ENTRY;
+
+	/**
+	 * Don't allow multiple starting which is too complex, and has no real
+	 * benefit.
+	 */
+	if (nrs->nrs_policy_starting)
+		RETURN(-EAGAIN);
+
+	LASSERT(policy->pol_state != NRS_POL_STATE_STARTING);
+
+	if (policy->pol_state == NRS_POL_STATE_STOPPING)
+		RETURN(-EAGAIN);
+
+	if (policy->pol_flags & PTLRPC_NRS_FL_FALLBACK) {
+		/**
+		 * This is for cases in which the user sets the policy to the
+		 * fallback policy (currently fifo for all services); i.e. the
+		 * user is resetting the policy to the default; so we stop the
+		 * primary policy, if any.
+		 */
+		if (policy == nrs->nrs_policy_fallback) {
+			nrs_policy_stop_primary(nrs);
+			RETURN(0);
+		}
+
+		/**
+		 * If we reach here, we must be setting up the fallback policy
+		 * at service startup time, and only a single policy with the
+		 * nrs_policy_flags::PTLRPC_NRS_FL_FALLBACK flag set can
+		 * register with NRS core.
+		 */
+		LASSERT(nrs->nrs_policy_fallback == NULL);
+	} else {
+		/**
+		 * Shouldn't start primary policy if w/o fallback policy.
+		 */
+		if (nrs->nrs_policy_fallback == NULL)
+			RETURN(-EPERM);
+
+		if (policy->pol_state == NRS_POL_STATE_STARTED)
+			RETURN(0);
+	}
+
+	/**
+	 * Increase the module usage count for policies registering from other
+	 * modules. Only the first reference on the policy descriptor takes a
+	 * module reference.
+	 */
+	if (atomic_inc_return(&policy->pol_desc->pd_refs) == 1 &&
+	    !try_module_get(policy->pol_desc->pd_owner)) {
+		atomic_dec(&policy->pol_desc->pd_refs);
+		CERROR("NRS: cannot get module for policy %s; is it alive?\n",
+		       policy->pol_desc->pd_name);
+		RETURN(-ENODEV);
+	}
+
+	/**
+	 * Serialize policy starting across the NRS head
+	 */
+	nrs->nrs_policy_starting = 1;
+
+	policy->pol_state = NRS_POL_STATE_STARTING;
+
+	if (policy->pol_desc->pd_ops->op_policy_start) {
+		spin_unlock(&nrs->nrs_lock);
+
+		rc = policy->pol_desc->pd_ops->op_policy_start(policy);
+
+		spin_lock(&nrs->nrs_lock);
+		if (rc != 0) {
+			if (atomic_dec_and_test(&policy->pol_desc->pd_refs))	/* undo the descriptor ref taken above */
+				module_put(policy->pol_desc->pd_owner);
+
+			policy->pol_state = NRS_POL_STATE_STOPPED;
+			GOTO(out, rc);
+		}
+	}
+
+	policy->pol_state = NRS_POL_STATE_STARTED;
+
+	if (policy->pol_flags & PTLRPC_NRS_FL_FALLBACK) {
+		/**
+		 * This path is only used at PTLRPC service setup time.
+		 */
+		nrs->nrs_policy_fallback = policy;
+	} else {
+		/*
+		 * Try to stop the current primary policy if there is one.
+		 */
+		nrs_policy_stop_primary(nrs);
+
+		/**
+		 * And set the newly-started policy as the primary one.
+		 */
+		nrs->nrs_policy_primary = policy;
+	}
+
+out:
+	nrs->nrs_policy_starting = 0;
+
+	RETURN(rc);
+}
+
+/**
+ * Increases the policy's usage reference count.
+ *
+ * This is a plain, non-atomic increment of pol_ref; the callers visible in
+ * this file hold the NRS head's nrs_lock when calling it.
+ */
+static inline void nrs_policy_get_locked(struct ptlrpc_nrs_policy *policy)
+{
+	policy->pol_ref++;
+}
+
+/**
+ * Decreases the policy's usage reference count, and stops the policy in case it
+ * was already stopping and has no more outstanding usage references (which
+ * indicates it has no more queued or started requests, and can be safely
+ * stopped).
+ *
+ * The reference count is asserted to be positive on entry.
+ */
+static void nrs_policy_put_locked(struct ptlrpc_nrs_policy *policy)
+{
+	LASSERT(policy->pol_ref > 0);
+
+	policy->pol_ref--;
+	if (unlikely(policy->pol_ref == 0 &&
+	    policy->pol_state == NRS_POL_STATE_STOPPING))
+		nrs_policy_stop0(policy);
+}
+
+static void nrs_policy_put(struct ptlrpc_nrs_policy *policy)
+{
+	spin_lock(&policy->pol_nrs->nrs_lock);	/* wrapper: takes the NRS head lock itself */
+	nrs_policy_put_locked(policy);
+	spin_unlock(&policy->pol_nrs->nrs_lock);
+}
+
+/**
+ * Find and return a policy by name.
+ *
+ * Walks the NRS head's policy list and compares at most NRS_POL_NAME_MAX
+ * characters of each policy descriptor's name against \a name. A usage
+ * reference is taken on the policy that is returned, so the caller has to
+ * release it (e.g. with nrs_policy_put_locked()) when done.
+ *
+ * \retval NULL		  no policy with that name is registered on \a nrs
+ * \retval valid-pointer  the matching policy, with a reference held
+ */
+static struct ptlrpc_nrs_policy * nrs_policy_find_locked(struct ptlrpc_nrs *nrs,
+							 char *name)
+{
+	struct ptlrpc_nrs_policy *tmp;
+
+	list_for_each_entry(tmp, &nrs->nrs_policy_list, pol_list) {
+		if (strncmp(tmp->pol_desc->pd_name, name,
+			    NRS_POL_NAME_MAX) == 0) {
+			nrs_policy_get_locked(tmp);
+			return tmp;
+		}
+	}
+	return NULL;
+}
+
+/**
+ * Release references for the resource hierarchy moving upwards towards the
+ * policy instance resource.
+ *
+ * Starting at \a res, the policy's op_res_put() hook is called on each
+ * resource and then on its res_parent, until a NULL parent is reached.
+ * Nothing is done if the policy does not implement op_res_put().
+ */
+static void nrs_resource_put(struct ptlrpc_nrs_resource *res)
+{
+	struct ptlrpc_nrs_policy *policy = res->res_policy;
+
+	if (policy->pol_desc->pd_ops->op_res_put != NULL) {
+		struct ptlrpc_nrs_resource *parent;
+
+		for (; res != NULL; res = parent) {
+			parent = res->res_parent;	/* read first; presumably op_res_put() may release res -- TODO confirm */
+			policy->pol_desc->pd_ops->op_res_put(policy, res);
+		}
+	}
+}
+
+/**
+ * Obtains references for each resource in the resource hierarchy for request
+ * \a nrq if it is to be handled by \a policy.
+ *
+ * The policy's op_res_get() hook is called repeatedly, each time being passed
+ * the resource obtained by the previous call as the parent. As used here, a
+ * negative return value from the hook is an error (any resources obtained so
+ * far are released), zero means there are further levels to descend into, and
+ * a positive value means the bottom level has been reached.
+ *
+ * \param[in] policy	  the policy
+ * \param[in] nrq	  the request
+ * \param[in] moving_req  denotes whether this is a call to the function by
+ *			  ldlm_lock_reorder_req(), in order to move \a nrq to
+ *			  the high-priority NRS head; we should not sleep when
+ *			  set.
+ *
+ * \retval NULL		  resource hierarchy references not obtained
+ * \retval valid-pointer  the bottom level of the resource hierarchy
+ *
+ * \see ptlrpc_nrs_pol_ops::op_res_get()
+ */
+static
+struct ptlrpc_nrs_resource * nrs_resource_get(struct ptlrpc_nrs_policy *policy,
+					      struct ptlrpc_nrs_request *nrq,
+					      bool moving_req)
+{
+	/**
+	 * Set to NULL to traverse the resource hierarchy from the top.
+	 */
+	struct ptlrpc_nrs_resource *res = NULL;
+	struct ptlrpc_nrs_resource *tmp = NULL;
+	int			    rc;
+
+	while (1) {
+		rc = policy->pol_desc->pd_ops->op_res_get(policy, nrq, res,
+							  &tmp, moving_req);
+		if (rc < 0) {
+			if (res != NULL)
+				nrs_resource_put(res);
+			return NULL;
+		}
+
+		LASSERT(tmp != NULL);
+		tmp->res_parent = res;	/* link the new level to the one above it */
+		tmp->res_policy = policy;
+		res = tmp;
+		tmp = NULL;
+		/**
+		 * Return once we have obtained a reference to the bottom level
+		 * of the resource hierarchy.
+		 */
+		if (rc > 0)
+			return res;
+	}
+}
+
+/**
+ * Obtains resources for the resource hierarchies and policy references for
+ * the fallback and current primary policy (if any), that will later be used
+ * to handle request \a nrq.
+ *
+ * The policy references are taken under the NRS head's nrs_lock; the resource
+ * hierarchies are then obtained with the lock released. The fallback policy
+ * is asserted to always provide a resource hierarchy.
+ *
+ * \param[in]  nrs  the NRS head instance that will be handling request \a nrq.
+ * \param[in]  nrq  the request that is being handled.
+ * \param[out] resp the array where references to the resource hierarchy are
+ *		    stored; all NRS_RES_MAX entries are zeroed first.
+ * \param[in]  moving_req  is set when obtaining resources while moving a
+ *			   request from a policy on the regular NRS head to a
+ *			   policy on the HP NRS head (via
+ *			   ldlm_lock_reorder_req()). It signifies that
+ *			   allocations to get resources should be atomic; for
+ *			   a full explanation, see comment in
+ *			   ptlrpc_nrs_pol_ops::op_res_get().
+ */
+static void nrs_resource_get_safe(struct ptlrpc_nrs *nrs,
+				  struct ptlrpc_nrs_request *nrq,
+				  struct ptlrpc_nrs_resource **resp,
+				  bool moving_req)
+{
+	struct ptlrpc_nrs_policy   *primary = NULL;
+	struct ptlrpc_nrs_policy   *fallback = NULL;
+
+	memset(resp, 0, sizeof(resp[0]) * NRS_RES_MAX);
+
+	/**
+	 * Obtain policy references.
+	 */
+	spin_lock(&nrs->nrs_lock);
+
+	fallback = nrs->nrs_policy_fallback;
+	nrs_policy_get_locked(fallback);
+
+	primary = nrs->nrs_policy_primary;
+	if (primary != NULL)
+		nrs_policy_get_locked(primary);
+
+	spin_unlock(&nrs->nrs_lock);
+
+	/**
+	 * Obtain resource hierarchy references.
+	 */
+	resp[NRS_RES_FALLBACK] = nrs_resource_get(fallback, nrq, moving_req);
+	LASSERT(resp[NRS_RES_FALLBACK] != NULL);
+
+	if (primary != NULL) {
+		resp[NRS_RES_PRIMARY] = nrs_resource_get(primary, nrq,
+							 moving_req);
+		/**
+		 * A primary policy may exist which may not wish to serve a
+		 * particular request for different reasons; release the
+		 * reference on the policy as it will not be used for this
+		 * request.
+		 */
+		if (resp[NRS_RES_PRIMARY] == NULL)
+			nrs_policy_put(primary);
+	}
+}
+
+/**
+ * Releases references to resource hierarchies and policies, because they are no
+ * longer required; used when request handling has been completed, or the
+ * request is moving to the high priority NRS head.
+ *
+ * The resource hierarchies are released first and their \a resp slots are
+ * cleared; the policy references are then dropped in a second pass under a
+ * single acquisition of nrs_lock. The lock used is that of the first policy
+ * found, so all policies in \a resp are assumed to belong to the same NRS
+ * head.
+ *
+ * \param resp	the resource hierarchy that is being released
+ *
+ * \see ptlrpc_nrs_req_hp_move()
+ * \see ptlrpc_nrs_req_finalize()
+ */
+static void nrs_resource_put_safe(struct ptlrpc_nrs_resource **resp)
+{
+	struct ptlrpc_nrs_policy *pols[NRS_RES_MAX];
+	struct ptlrpc_nrs	 *nrs = NULL;
+	int			  i;
+
+	for (i = 0; i < NRS_RES_MAX; i++) {
+		if (resp[i] != NULL) {
+			pols[i] = resp[i]->res_policy;	/* remember the policy before the resource is released */
+			nrs_resource_put(resp[i]);
+			resp[i] = NULL;
+		} else {
+			pols[i] = NULL;
+		}
+	}
+
+	for (i = 0; i < NRS_RES_MAX; i++) {
+		if (pols[i] == NULL)
+			continue;
+
+		if (nrs == NULL) {
+			nrs = pols[i]->pol_nrs;
+			spin_lock(&nrs->nrs_lock);	/* taken once, on the first policy found */
+		}
+		nrs_policy_put_locked(pols[i]);
+	}
+
+	if (nrs != NULL)
+		spin_unlock(&nrs->nrs_lock);
+}
+
+/**
+ * Obtains an NRS request from \a policy for handling or examination; the
+ * request should be removed in the 'handling' case.
+ *
+ * Calling into this function implies we already know the policy has a request
+ * waiting to be handled (this is asserted).
+ *
+ * \param[in] policy the policy from which a request is to be obtained
+ * \param[in] peek   when set, signifies that we just want to examine the
+ *		     request, and not handle it, so the request is not removed
+ *		     from the policy.
+ * \param[in] force  when set, it will force a policy to return a request if it
+ *		     has one pending
+ *
+ * \retval the NRS request to be handled, as returned by the policy's
+ *	   op_req_get(); the code allows for this to be NULL
+ */
+static inline
+struct ptlrpc_nrs_request * nrs_request_get(struct ptlrpc_nrs_policy *policy,
+					    bool peek, bool force)
+{
+	struct ptlrpc_nrs_request *nrq;
+
+	LASSERT(policy->pol_req_queued > 0);
+
+	nrq = policy->pol_desc->pd_ops->op_req_get(policy, peek, force);
+
+	LASSERT(ergo(nrq != NULL, nrs_request_policy(nrq) == policy));
+
+	return nrq;
+}
+
+/**
+ * Enqueues request \a nrq for later handling, via one of the policies for
+ * which resources were earlier obtained via nrs_resource_get_safe(). The
+ * function attempts to enqueue the request first on the primary policy
+ * (if any), since this is the preferred choice.
+ *
+ * On success the queued-request counters of both the policy and its NRS head
+ * are incremented, and nrq->nr_res_idx is left identifying the resource slot
+ * that accepted the request.
+ *
+ * \param nrq the request being enqueued
+ *
+ * \see nrs_resource_get_safe()
+ */
+static inline void nrs_request_enqueue(struct ptlrpc_nrs_request *nrq)
+{
+	struct ptlrpc_nrs_policy *policy;
+	int			  rc;
+	int			  i;
+
+	/**
+	 * Try in descending order, because the primary policy (if any) is
+	 * the preferred choice.
+	 */
+	for (i = NRS_RES_MAX - 1; i >= 0; i--) {
+		if (nrq->nr_res_ptrs[i] == NULL)
+			continue;
+
+		nrq->nr_res_idx = i;
+		policy = nrq->nr_res_ptrs[i]->res_policy;
+
+		rc = policy->pol_desc->pd_ops->op_req_enqueue(policy, nrq);
+		if (rc == 0) {
+			policy->pol_nrs->nrs_req_queued++;
+			policy->pol_req_queued++;
+			return;
+		}
+	}
+	/**
+	 * Should never get here, as at least the fallback policy's
+	 * ptlrpc_nrs_pol_ops::op_req_enqueue() implementation should always
+	 * succeed.
+	 */
+	LBUG();
+}
+
+/**
+ * Called when a request has been handled
+ *
+ * Invokes the optional op_req_stop() hook of the policy that handled the
+ * request, then decrements the started-request counters of both that policy
+ * and its NRS head (each asserted to be positive).
+ *
+ * \param[in] nrq the request that has been handled; can be used for
+ *		  job/resource control.
+ *
+ * \see ptlrpc_nrs_req_stop_nolock()
+ */
+static inline void nrs_request_stop(struct ptlrpc_nrs_request *nrq)
+{
+	struct ptlrpc_nrs_policy *policy = nrs_request_policy(nrq);
+
+	if (policy->pol_desc->pd_ops->op_req_stop)
+		policy->pol_desc->pd_ops->op_req_stop(policy, nrq);
+
+	LASSERT(policy->pol_nrs->nrs_req_started > 0);
+	LASSERT(policy->pol_req_started > 0);
+
+	policy->pol_nrs->nrs_req_started--;
+	policy->pol_req_started--;
+}
+
+/**
+ * Handler for operations that can be carried out on policies.
+ *
+ * Handles opcodes that are common to all policy types within NRS core, and
+ * passes any unknown opcodes to the policy-specific control function.
+ *
+ * The whole operation runs under the NRS head's nrs_lock; the reference taken
+ * on the policy by the name lookup is dropped again before returning.
+ *
+ * \param[in]	  nrs  the NRS head this policy belongs to.
+ * \param[in]	  name the human-readable policy name; should be the same as
+ *		       ptlrpc_nrs_pol_desc::pd_name.
+ * \param[in]	  opc  the opcode of the operation being carried out.
+ * \param[in,out] arg  can be used to pass information in and out between when
+ *		       carrying an operation; usually data that is private to
+ *		       the policy at some level, or generic policy status
+ *		       information.
+ *
+ * \retval -ve error condition (-ENOENT if no policy is named \a name)
+ * \retval   0 operation was carried out successfully
+ */
+static int nrs_policy_ctl(struct ptlrpc_nrs *nrs, char *name,
+			  enum ptlrpc_nrs_ctl opc, void *arg)
+{
+	struct ptlrpc_nrs_policy       *policy;
+	int				rc = 0;
+	ENTRY;
+
+	spin_lock(&nrs->nrs_lock);
+
+	policy = nrs_policy_find_locked(nrs, name);
+	if (policy == NULL)
+		GOTO(out, rc = -ENOENT);
+
+	switch (opc) {
+		/**
+		 * Unknown opcode, pass it down to the policy-specific control
+		 * function for handling.
+		 */
+	default:
+		rc = nrs_policy_ctl_locked(policy, opc, arg);
+		break;
+
+		/**
+		 * Start \e policy
+		 */
+	case PTLRPC_NRS_CTL_START:
+		rc = nrs_policy_start_locked(policy);
+		break;
+	}
+out:
+	if (policy != NULL)
+		nrs_policy_put_locked(policy);
+
+	spin_unlock(&nrs->nrs_lock);
+
+	RETURN(rc);
+}
+
+/**
+ * Unregisters a policy by name.
+ *
+ * The policy is stopped if necessary, removed from the NRS head's policy
+ * list, finalized via nrs_policy_fini() and freed.
+ *
+ * \param[in] nrs  the NRS head this policy belongs to.
+ * \param[in] name the human-readable policy name; should be the same as
+ *		   ptlrpc_nrs_pol_desc::pd_name
+ *
+ * \retval -ve error (-ENOENT if no such policy is registered, -EBUSY if the
+ *	       policy has references other than the one taken by the lookup)
+ * \retval   0 success
+ */
+static int nrs_policy_unregister(struct ptlrpc_nrs *nrs, char *name)
+{
+	struct ptlrpc_nrs_policy *policy = NULL;
+	ENTRY;
+
+	spin_lock(&nrs->nrs_lock);
+
+	policy = nrs_policy_find_locked(nrs, name);
+	if (policy == NULL) {
+		spin_unlock(&nrs->nrs_lock);
+
+		CERROR("Can't find NRS policy %s\n", name);
+		RETURN(-ENOENT);
+	}
+
+	if (policy->pol_ref > 1) {	/* 1 is our own reference, taken by the lookup above */
+		CERROR("Policy %s is busy with %d references\n", name,
+		       (int)policy->pol_ref);
+		nrs_policy_put_locked(policy);
+
+		spin_unlock(&nrs->nrs_lock);
+		RETURN(-EBUSY);
+	}
+
+	LASSERT(policy->pol_req_queued == 0);
+	LASSERT(policy->pol_req_started == 0);
+
+	if (policy->pol_state != NRS_POL_STATE_STOPPED) {
+		nrs_policy_stop_locked(policy);
+		LASSERT(policy->pol_state == NRS_POL_STATE_STOPPED);
+	}
+
+	list_del(&policy->pol_list);
+	nrs->nrs_num_pols--;
+
+	nrs_policy_put_locked(policy);	/* drop the lookup reference; nrs_policy_fini() asserts pol_ref == 0 */
+
+	spin_unlock(&nrs->nrs_lock);
+
+	nrs_policy_fini(policy);
+
+	LASSERT(policy->pol_private == NULL);
+	OBD_FREE_PTR(policy);
+
+	RETURN(0);
+}
+
+/**
+ * Register a policy from policy descriptor \a desc with NRS head \a nrs.
+ *
+ * A new policy instance is allocated and initialized, added to the NRS head's
+ * policy list and, if the descriptor carries PTLRPC_NRS_FL_REG_START, started
+ * straight away. If starting fails, the policy is unregistered again.
+ *
+ * \param[in] nrs   the NRS head on which the policy will be registered.
+ * \param[in] desc  the policy descriptor from which the information will be
+ *		    obtained to register the policy.
+ *
+ * \retval -ve error (-ENOMEM if the policy could not be allocated, -EEXIST if
+ *	       a policy of the same name is already registered on \a nrs, or
+ *	       the error from initializing or starting the policy)
+ * \retval   0 success
+ */
+static int nrs_policy_register(struct ptlrpc_nrs *nrs,
+			       struct ptlrpc_nrs_pol_desc *desc)
+{
+	struct ptlrpc_nrs_policy       *policy;
+	struct ptlrpc_nrs_policy       *tmp;
+	struct ptlrpc_service_part     *svcpt = nrs->nrs_svcpt;
+	int				rc;
+	ENTRY;
+
+	LASSERT(svcpt != NULL);
+	LASSERT(desc->pd_ops != NULL);
+	LASSERT(desc->pd_ops->op_res_get != NULL);
+	LASSERT(desc->pd_ops->op_req_get != NULL);
+	LASSERT(desc->pd_ops->op_req_enqueue != NULL);
+	LASSERT(desc->pd_ops->op_req_dequeue != NULL);
+	LASSERT(desc->pd_compat != NULL);
+
+	OBD_CPT_ALLOC_GFP(policy, svcpt->scp_service->srv_cptable,
+			  svcpt->scp_cpt, sizeof(*policy), __GFP_IO);
+	if (policy == NULL)
+		RETURN(-ENOMEM);
+
+	policy->pol_nrs     = nrs;
+	policy->pol_desc    = desc;
+	policy->pol_state   = NRS_POL_STATE_STOPPED;
+	policy->pol_flags   = desc->pd_flags;
+
+	INIT_LIST_HEAD(&policy->pol_list);
+	INIT_LIST_HEAD(&policy->pol_list_queued);
+
+	rc = nrs_policy_init(policy);
+	if (rc != 0) {
+		OBD_FREE_PTR(policy);
+		RETURN(rc);
+	}
+
+	spin_lock(&nrs->nrs_lock);
+
+	tmp = nrs_policy_find_locked(nrs, policy->pol_desc->pd_name);
+	if (tmp != NULL) {
+		CERROR("NRS policy %s has been registered, can't register it "
+		       "for %s\n", policy->pol_desc->pd_name,
+		       svcpt->scp_service->srv_name);
+		nrs_policy_put_locked(tmp);	/* drop the reference taken by the lookup */
+
+		spin_unlock(&nrs->nrs_lock);
+		nrs_policy_fini(policy);
+		OBD_FREE_PTR(policy);
+
+		RETURN(-EEXIST);
+	}
+
+	list_add_tail(&policy->pol_list, &nrs->nrs_policy_list);
+	nrs->nrs_num_pols++;
+
+	if (policy->pol_flags & PTLRPC_NRS_FL_REG_START)
+		rc = nrs_policy_start_locked(policy);	/* rc is still 0 here if the flag is not set */
+
+	spin_unlock(&nrs->nrs_lock);
+
+	if (rc != 0)
+		(void) nrs_policy_unregister(nrs, policy->pol_desc->pd_name);
+
+	RETURN(rc);
+}
+
+/**
+ * Hands request \a req over to the policy its NRS resources refer to, and
+ * makes sure that policy is linked on its NRS head's list of policies which
+ * have enqueued requests.
+ *
+ * The request must already have been initialized, and must not be enqueued
+ * yet; both conditions are asserted.
+ *
+ * \param[in] req the request to enqueue.
+ */
+static void ptlrpc_nrs_req_add_nolock(struct ptlrpc_request *req)
+{
+	struct ptlrpc_nrs_request      *nrq = &req->rq_nrq;
+	struct ptlrpc_nrs_policy       *pol;
+
+	LASSERT(req->rq_nrq.nr_initialized);
+	LASSERT(!req->rq_nrq.nr_enqueued);
+
+	nrs_request_enqueue(nrq);
+	nrq->nr_enqueued = 1;
+
+	pol = nrs_request_policy(nrq);
+	if (unlikely(list_empty(&pol->pol_list_queued))) {
+		/**
+		 * The policy is not yet on the NRS head's list of policies
+		 * with enqueued requests; add it at the tail.
+		 */
+		list_add_tail(&pol->pol_list_queued,
+				  &pol->pol_nrs->nrs_policy_queued);
+	}
+}
+
+/**
+ * Enqueue a request on the high priority NRS head.
+ *
+ * While holding ptlrpc_request::rq_lock, the request is flagged as
+ * high-priority (rq_hp) and then enqueued via ptlrpc_nrs_req_add_nolock(). A
+ * D_NET debug message is emitted for every request except OBD_PING ones.
+ *
+ * \param[in] req the request to enqueue.
+ */
+static void ptlrpc_nrs_hpreq_add_nolock(struct ptlrpc_request *req)
+{
+	int	opc = lustre_msg_get_opc(req->rq_reqmsg);
+	ENTRY;
+
+	spin_lock(&req->rq_lock);
+	req->rq_hp = 1;
+	ptlrpc_nrs_req_add_nolock(req);
+	if (opc != OBD_PING)
+		DEBUG_REQ(D_NET, req, "high priority req");
+	spin_unlock(&req->rq_lock);
+	EXIT;
+}
+
+/**
+ * Asks the compatibility callback of policy descriptor \a desc
+ * (ptlrpc_nrs_pol_desc::pd_compat) whether the policy may be used with
+ * service \a svc.
+ *
+ * \param[in] svc  the service
+ * \param[in] desc the policy descriptor
+ *
+ * \retval false the policy is not compatible with the service
+ * \retval true	 the policy is compatible with the service
+ */
+static inline bool nrs_policy_compatible(const struct ptlrpc_service *svc,
+					 const struct ptlrpc_nrs_pol_desc *desc)
+{
+	bool	compat = desc->pd_compat(svc, desc);
+
+	return compat;
+}
+
+/**
+ * Walks nrs_core.nrs_policies and registers every policy that is compatible
+ * with the service owning NRS head \a nrs, stopping at the first registration
+ * failure.
+ *
+ * \param[in] nrs the NRS head
+ *
+ * \retval -EINVAL no compatible policy was found in nrs_core.nrs_policies
+ * \retval -ve	   error returned by nrs_policy_register()
+ * \retval   0	   success
+ *
+ * \pre mutex_is_locked(&nrs_core.nrs_mutex)
+ *
+ * \see ptlrpc_service_nrs_setup()
+ */
+static int nrs_register_policies_locked(struct ptlrpc_nrs *nrs)
+{
+	struct ptlrpc_service_part	 *part = nrs->nrs_svcpt;
+	struct ptlrpc_service		 *service = part->scp_service;
+	struct ptlrpc_nrs_pol_desc	 *pd;
+	int				  rc = -EINVAL;
+	ENTRY;
+
+	LASSERT(mutex_is_locked(&nrs_core.nrs_mutex));
+
+	list_for_each_entry(pd, &nrs_core.nrs_policies, pd_list) {
+		if (!nrs_policy_compatible(service, pd))
+			continue;
+
+		rc = nrs_policy_register(nrs, pd);
+		if (rc == 0)
+			continue;
+
+		CERROR("Failed to register NRS policy %s for "
+		       "partition %d of service %s: %d\n",
+		       pd->pd_name, part->scp_cpt,
+		       service->srv_name, rc);
+		/**
+		 * A single failed registration fails the whole operation.
+		 */
+		break;
+	}
+
+	RETURN(rc);
+}
+
+/**
+ * Initializes NRS head \a nrs of service partition \a svcpt, and registers all
+ * compatible policies in NRS core, with the NRS head.
+ *
+ * \a nrs has to be either the regular (scp_nrs_reg) or the high-priority
+ * (scp_nrs_hp) head of \a svcpt; any other value triggers LBUG().
+ *
+ * \param[in] nrs   the NRS head
+ * \param[in] svcpt the PTLRPC service partition to setup
+ *
+ * \retval -ve error
+ * \retval   0 success
+ *
+ * \pre mutex_is_locked(&nrs_core.nrs_mutex)
+ */
+static int nrs_svcpt_setup_locked0(struct ptlrpc_nrs *nrs,
+				   struct ptlrpc_service_part *svcpt)
+{
+	int				rc;
+	enum ptlrpc_nrs_queue_type	queue;
+	ENTRY;
+
+	LASSERT(mutex_is_locked(&nrs_core.nrs_mutex));
+
+	if (nrs == &svcpt->scp_nrs_reg)
+		queue = PTLRPC_NRS_QUEUE_REG;
+	else if (nrs == svcpt->scp_nrs_hp)
+		queue = PTLRPC_NRS_QUEUE_HP;
+	else
+		LBUG();
+
+	nrs->nrs_svcpt = svcpt;
+	nrs->nrs_queue_type = queue;
+	spin_lock_init(&nrs->nrs_lock);
+	INIT_LIST_HEAD(&nrs->nrs_policy_list);
+	INIT_LIST_HEAD(&nrs->nrs_policy_queued);
+
+	rc = nrs_register_policies_locked(nrs);
+
+	RETURN(rc);
+}
+
+/**
+ * Allocates a regular and optionally a high-priority NRS head (if the service
+ * handles high-priority RPCs), and then registers all available compatible
+ * policies on those NRS heads.
+ *
+ * The high-priority head is only allocated when the service provides an
+ * so_hpreq_handler operation.
+ *
+ * \param[in,out] svcpt the PTLRPC service partition to setup
+ *
+ * \retval -ENOMEM the high-priority NRS head could not be allocated
+ * \retval -ve	   error returned by nrs_svcpt_setup_locked0()
+ * \retval   0	   success
+ *
+ * \pre mutex_is_locked(&nrs_core.nrs_mutex)
+ */
+static int nrs_svcpt_setup_locked(struct ptlrpc_service_part *svcpt)
+{
+	struct ptlrpc_nrs	       *nrs;
+	int				rc;
+	ENTRY;
+
+	LASSERT(mutex_is_locked(&nrs_core.nrs_mutex));
+
+	/**
+	 * Initialize the regular NRS head.
+	 */
+	nrs = nrs_svcpt2nrs(svcpt, false);
+	rc = nrs_svcpt_setup_locked0(nrs, svcpt);
+	if (rc < 0)
+		GOTO(out, rc);
+
+	/**
+	 * Optionally allocate a high-priority NRS head.
+	 */
+	if (svcpt->scp_service->srv_ops.so_hpreq_handler == NULL)
+		GOTO(out, rc);
+
+	OBD_CPT_ALLOC_PTR(svcpt->scp_nrs_hp,
+			  svcpt->scp_service->srv_cptable,
+			  svcpt->scp_cpt);
+	if (svcpt->scp_nrs_hp == NULL)
+		GOTO(out, rc = -ENOMEM);
+
+	nrs = nrs_svcpt2nrs(svcpt, true);
+	rc = nrs_svcpt_setup_locked0(nrs, svcpt);
+
+out:
+	RETURN(rc);
+}
+
+/**
+ * Unregisters all policies on all available NRS heads in a service partition;
+ * called at PTLRPC service unregistration time.
+ *
+ * Each NRS head is flagged as stopping (ptlrpc_nrs::nrs_stopping) before its
+ * policies are unregistered; every unregistration is asserted to succeed. The
+ * regular head is handled first, followed by the high-priority head if the
+ * partition has one, and the high-priority head is freed at the end.
+ *
+ * \param[in] svcpt the PTLRPC service partition
+ *
+ * \pre mutex_is_locked(&nrs_core.nrs_mutex)
+ */
+static void nrs_svcpt_cleanup_locked(struct ptlrpc_service_part *svcpt)
+{
+	struct ptlrpc_nrs	       *nrs;
+	struct ptlrpc_nrs_policy       *policy;
+	struct ptlrpc_nrs_policy       *tmp;
+	int				rc;
+	bool				hp = false;
+	ENTRY;
+
+	LASSERT(mutex_is_locked(&nrs_core.nrs_mutex));
+
+again:
+	nrs = nrs_svcpt2nrs(svcpt, hp);
+	nrs->nrs_stopping = 1;
+
+	/**
+	 * nrs_policy_unregister() unlinks and frees the policy, hence the
+	 * _safe list iterator.
+	 */
+	list_for_each_entry_safe(policy, tmp, &nrs->nrs_policy_list,
+				     pol_list) {
+		rc = nrs_policy_unregister(nrs, policy->pol_desc->pd_name);
+		LASSERT(rc == 0);
+	}
+
+	/**
+	 * If the service partition has an HP NRS head, clean that up as well.
+	 */
+	if (!hp && nrs_svcpt_has_hp(svcpt)) {
+		hp = true;
+		goto again;
+	}
+
+	/**
+	 * \e hp is only set once the second pass has run, so \e nrs is the
+	 * high-priority head here.
+	 * NOTE(review): svcpt->scp_nrs_hp is not reset in this function after
+	 * the head is freed; verify that callers do not use it afterwards.
+	 */
+	if (hp)
+		OBD_FREE_PTR(nrs);
+
+	EXIT;
+}
+
+/**
+ * Searches nrs_core.nrs_policies for the policy descriptor whose name matches
+ * \a name; at most NRS_POL_NAME_MAX characters are compared.
+ *
+ * \param[in] name the policy name
+ *
+ * \retval the policy descriptor
+ * \retval NULL no descriptor with that name is on the list
+ */
+static struct ptlrpc_nrs_pol_desc *nrs_policy_find_desc_locked(const char *name)
+{
+	struct ptlrpc_nrs_pol_desc     *pd;
+	ENTRY;
+
+	list_for_each_entry(pd, &nrs_core.nrs_policies, pd_list) {
+		if (strncmp(pd->pd_name, name, NRS_POL_NAME_MAX) != 0)
+			continue;
+
+		RETURN(pd);
+	}
+
+	RETURN(NULL);
+}
+
+/**
+ * Removes the policy from all supported NRS heads of all partitions of all
+ * PTLRPC services.
+ *
+ * Services that are not compatible with the policy, or that are stopping, are
+ * skipped. For every other service the policy is unregistered from the regular
+ * and, if present, the high-priority NRS head of each partition, after which
+ * the policy's op_lprocfs_fini operation (if any) is called for the service.
+ * The walk is abandoned at the first error other than -ENOENT.
+ *
+ * \param[in] desc the policy descriptor to unregister
+ *
+ * \retval -ve error
+ * \retval  0  successfully unregistered policy on all supported NRS heads
+ *
+ * \pre mutex_is_locked(&nrs_core.nrs_mutex)
+ * \pre mutex_is_locked(&ptlrpc_all_services_mutex)
+ */
+static int nrs_policy_unregister_locked(struct ptlrpc_nrs_pol_desc *desc)
+{
+	struct ptlrpc_nrs	       *nrs;
+	struct ptlrpc_service	       *svc;
+	struct ptlrpc_service_part     *svcpt;
+	int				i;
+	int				rc = 0;
+	ENTRY;
+
+	LASSERT(mutex_is_locked(&nrs_core.nrs_mutex));
+	LASSERT(mutex_is_locked(&ptlrpc_all_services_mutex));
+
+	list_for_each_entry(svc, &ptlrpc_all_services, srv_list) {
+
+		if (!nrs_policy_compatible(svc, desc) ||
+		    unlikely(svc->srv_is_stopping))
+			continue;
+
+		ptlrpc_service_for_each_part(svcpt, i, svc) {
+			bool hp = false;
+
+again:
+			nrs = nrs_svcpt2nrs(svcpt, hp);
+			rc = nrs_policy_unregister(nrs, desc->pd_name);
+			/**
+			 * Ignore -ENOENT as the policy may not have registered
+			 * successfully on all service partitions.
+			 */
+			if (rc == -ENOENT) {
+				rc = 0;
+			} else if (rc != 0) {
+				CERROR("Failed to unregister NRS policy %s for "
+				       "partition %d of service %s: %d\n",
+				       desc->pd_name, svcpt->scp_cpt,
+				       svcpt->scp_service->srv_name, rc);
+				RETURN(rc);
+			}
+
+			/**
+			 * Run a second pass on the high-priority NRS head of
+			 * this partition, if the service has one.
+			 */
+			if (!hp && nrs_svc_has_hp(svc)) {
+				hp = true;
+				goto again;
+			}
+		}
+
+		if (desc->pd_ops->op_lprocfs_fini != NULL)
+			desc->pd_ops->op_lprocfs_fini(svc);
+	}
+
+	RETURN(rc);
+}
+
+/**
+ * Registers a new policy with NRS core.
+ *
+ * The function will only succeed if policy registration with all compatible
+ * service partitions (if any) is successful.
+ *
+ * N.B. This function should be called either at ptlrpc module initialization
+ *	time when registering a policy that ships with NRS core, or in a
+ *	module's init() function for policies registering from other modules.
+ *
+ * N.B. The name held in \a conf is NUL-terminated in place at
+ *	NRS_POL_NAME_MAX - 1 before it is used.
+ *
+ * \param[in] conf configuration information for the new policy to register
+ *
+ * \retval -EINVAL an external policy (PTLRPC_NRS_FL_REG_EXTERN) also has
+ *		   PTLRPC_NRS_FL_FALLBACK or PTLRPC_NRS_FL_REG_START set
+ * \retval -EEXIST a policy with the same name is already registered with NRS
+ *		   core
+ * \retval -ENOMEM the policy descriptor could not be allocated
+ * \retval -ve	   error returned by nrs_policy_register() or by the policy's
+ *		   op_lprocfs_init operation
+ * \retval   0	   success
+ */
+int ptlrpc_nrs_policy_register(struct ptlrpc_nrs_pol_conf *conf)
+{
+	struct ptlrpc_service	       *svc;
+	struct ptlrpc_nrs_pol_desc     *desc;
+	int				rc = 0;
+	ENTRY;
+
+	LASSERT(conf != NULL);
+	LASSERT(conf->nc_ops != NULL);
+	LASSERT(conf->nc_compat != NULL);
+	LASSERT(ergo(conf->nc_compat == nrs_policy_compat_one,
+		conf->nc_compat_svc_name != NULL));
+	LASSERT(ergo((conf->nc_flags & PTLRPC_NRS_FL_REG_EXTERN) != 0,
+		     conf->nc_owner != NULL));
+
+	conf->nc_name[NRS_POL_NAME_MAX - 1] = '\0';
+
+	/**
+	 * External policies are not allowed to start immediately upon
+	 * registration, as there is a relatively higher chance that their
+	 * registration might fail. In such a case, some policy instances may
+	 * already have requests queued when unregistration needs to happen as
+	 * part of cleanup; since there is currently no way to drain requests
+	 * from a policy unless the service is unregistering, we just disallow
+	 * this.
+	 */
+	if ((conf->nc_flags & PTLRPC_NRS_FL_REG_EXTERN) &&
+	    (conf->nc_flags & (PTLRPC_NRS_FL_FALLBACK |
+			       PTLRPC_NRS_FL_REG_START))) {
+		CERROR("NRS: failing to register policy %s. Please check "
+		       "policy flags; external policies cannot act as fallback "
+		       "policies, or be started immediately upon registration "
+		       "without interaction with lprocfs\n", conf->nc_name);
+		RETURN(-EINVAL);
+	}
+
+	mutex_lock(&nrs_core.nrs_mutex);
+
+	if (nrs_policy_find_desc_locked(conf->nc_name) != NULL) {
+		CERROR("NRS: failing to register policy %s which has already "
+		       "been registered with NRS core!\n",
+		       conf->nc_name);
+		GOTO(fail, rc = -EEXIST);
+	}
+
+	OBD_ALLOC_PTR(desc);
+	if (desc == NULL)
+		GOTO(fail, rc = -ENOMEM);
+
+	strncpy(desc->pd_name, conf->nc_name, NRS_POL_NAME_MAX);
+	desc->pd_ops		 = conf->nc_ops;
+	desc->pd_compat		 = conf->nc_compat;
+	desc->pd_compat_svc_name = conf->nc_compat_svc_name;
+	if ((conf->nc_flags & PTLRPC_NRS_FL_REG_EXTERN) != 0)
+		desc->pd_owner	 = conf->nc_owner;
+	desc->pd_flags		 = conf->nc_flags;
+	atomic_set(&desc->pd_refs, 0);
+
+	/**
+	 * For policies that are held in the same module as NRS (currently
+	 * ptlrpc), do not register the policy with all compatible services,
+	 * as the services will not have started at this point, since we are
+	 * calling from ptlrpc module initialization code. In such cases each
+	 * service will register all compatible policies later, via
+	 * ptlrpc_service_nrs_setup().
+	 */
+	if ((conf->nc_flags & PTLRPC_NRS_FL_REG_EXTERN) == 0)
+		goto internal;
+
+	/**
+	 * Register the new policy on all compatible services
+	 */
+	mutex_lock(&ptlrpc_all_services_mutex);
+
+	list_for_each_entry(svc, &ptlrpc_all_services, srv_list) {
+		struct ptlrpc_service_part     *svcpt;
+		int				i;
+		int				rc2;
+
+		if (!nrs_policy_compatible(svc, desc) ||
+		    unlikely(svc->srv_is_stopping))
+			continue;
+
+		ptlrpc_service_for_each_part(svcpt, i, svc) {
+			struct ptlrpc_nrs      *nrs;
+			bool			hp = false;
+again:
+			nrs = nrs_svcpt2nrs(svcpt, hp);
+			rc = nrs_policy_register(nrs, desc);
+			if (rc != 0) {
+				CERROR("Failed to register NRS policy %s for "
+				       "partition %d of service %s: %d\n",
+				       desc->pd_name, svcpt->scp_cpt,
+				       svcpt->scp_service->srv_name, rc);
+
+				rc2 = nrs_policy_unregister_locked(desc);
+				/**
+				 * Should not fail at this point
+				 */
+				LASSERT(rc2 == 0);
+				mutex_unlock(&ptlrpc_all_services_mutex);
+				OBD_FREE_PTR(desc);
+				GOTO(fail, rc);
+			}
+
+			if (!hp && nrs_svc_has_hp(svc)) {
+				hp = true;
+				goto again;
+			}
+		}
+
+		/**
+		 * No need to take a reference to other modules here, as we
+		 * will be calling from the module's init() function.
+		 */
+		if (desc->pd_ops->op_lprocfs_init != NULL) {
+			rc = desc->pd_ops->op_lprocfs_init(svc);
+			if (rc != 0) {
+				rc2 = nrs_policy_unregister_locked(desc);
+				/**
+				 * Should not fail at this point
+				 */
+				LASSERT(rc2 == 0);
+				mutex_unlock(&ptlrpc_all_services_mutex);
+				OBD_FREE_PTR(desc);
+				GOTO(fail, rc);
+			}
+		}
+	}
+
+	mutex_unlock(&ptlrpc_all_services_mutex);
+internal:
+	list_add_tail(&desc->pd_list, &nrs_core.nrs_policies);
+fail:
+	mutex_unlock(&nrs_core.nrs_mutex);
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(ptlrpc_nrs_policy_register);
+
+/**
+ * Unregisters a previously registered policy with NRS core. All instances of
+ * the policy on all NRS heads of all supported services are removed.
+ *
+ * N.B. This function should only be called from a module's exit() function.
+ *	Although it can be used for policies that ship alongside NRS core, the
+ *	function is primarily intended for policies that register externally,
+ *	from other modules.
+ *
+ * N.B. The name held in \a conf is NUL-terminated in place at
+ *	NRS_POL_NAME_MAX - 1 before it is used.
+ *
+ * \param[in] conf configuration information for the policy to unregister
+ *
+ * \retval -EPERM  the policy is flagged PTLRPC_NRS_FL_FALLBACK
+ * \retval -ENOENT the policy has not been registered with NRS core
+ * \retval -ve	   error returned by nrs_policy_unregister_locked(); the
+ *		   descriptor stays registered with NRS core in this case
+ * \retval   0	   success
+ */
+int ptlrpc_nrs_policy_unregister(struct ptlrpc_nrs_pol_conf *conf)
+{
+	struct ptlrpc_nrs_pol_desc	*desc;
+	int				 rc;
+	ENTRY;
+
+	LASSERT(conf != NULL);
+
+	if (conf->nc_flags & PTLRPC_NRS_FL_FALLBACK) {
+		CERROR("Unable to unregister a fallback policy, unless the "
+		       "PTLRPC service is stopping.\n");
+		RETURN(-EPERM);
+	}
+
+	conf->nc_name[NRS_POL_NAME_MAX - 1] = '\0';
+
+	mutex_lock(&nrs_core.nrs_mutex);
+
+	desc = nrs_policy_find_desc_locked(conf->nc_name);
+	if (desc == NULL) {
+		CERROR("Failing to unregister NRS policy %s which has "
+		       "not been registered with NRS core!\n",
+		       conf->nc_name);
+		GOTO(not_exist, rc = -ENOENT);
+	}
+
+	mutex_lock(&ptlrpc_all_services_mutex);
+
+	rc = nrs_policy_unregister_locked(desc);
+	if (rc < 0) {
+		if (rc == -EBUSY)
+			CERROR("Please first stop policy %s on all service "
+			       "partitions and then retry to unregister the "
+			       "policy.\n", conf->nc_name);
+		GOTO(fail, rc);
+	}
+
+	CDEBUG(D_INFO, "Unregistering policy %s from NRS core.\n",
+	       conf->nc_name);
+
+	list_del(&desc->pd_list);
+	OBD_FREE_PTR(desc);
+
+fail:
+	mutex_unlock(&ptlrpc_all_services_mutex);
+
+not_exist:
+	mutex_unlock(&nrs_core.nrs_mutex);
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(ptlrpc_nrs_policy_unregister);
+
+/**
+ * Setup NRS heads on all service partitions of service \a svc, and register
+ * all compatible policies on those NRS heads.
+ *
+ * Once the NRS heads are set up, the op_lprocfs_init operation of every
+ * compatible policy in nrs_core.nrs_policies that provides one is called for
+ * \a svc. The whole operation runs under nrs_core.nrs_mutex and stops at the
+ * first failure.
+ *
+ * Intended to be called from within PTLRPC service registration code; see
+ * ptlrpc_register_service().
+ *
+ * \param[in] svc the service to setup
+ *
+ * \retval -ve error, the calling logic should eventually call
+ *		      ptlrpc_service_nrs_cleanup() to undo any work performed
+ *		      by this function.
+ * \retval   0 success
+ *
+ * \see ptlrpc_register_service()
+ * \see ptlrpc_service_nrs_cleanup()
+ */
+int ptlrpc_service_nrs_setup(struct ptlrpc_service *svc)
+{
+	struct ptlrpc_service_part	       *svcpt;
+	const struct ptlrpc_nrs_pol_desc       *desc;
+	int					i;
+	int					rc = 0;
+	ENTRY;
+
+	mutex_lock(&nrs_core.nrs_mutex);
+
+	/**
+	 * Initialize NRS heads on all service CPTs.
+	 */
+	ptlrpc_service_for_each_part(svcpt, i, svc) {
+		rc = nrs_svcpt_setup_locked(svcpt);
+		if (rc != 0)
+			GOTO(failed, rc);
+	}
+
+	/**
+	 * Set up lprocfs interfaces for all supported policies for the
+	 * service.
+	 */
+	list_for_each_entry(desc, &nrs_core.nrs_policies, pd_list) {
+		if (!nrs_policy_compatible(svc, desc))
+			continue;
+
+		if (desc->pd_ops->op_lprocfs_init != NULL) {
+			rc = desc->pd_ops->op_lprocfs_init(svc);
+			if (rc != 0)
+				GOTO(failed, rc);
+		}
+	}
+
+failed:
+
+	mutex_unlock(&nrs_core.nrs_mutex);
+
+	RETURN(rc);
+}
+
+/**
+ * Tears down NRS for service \a svc: all policies are unregistered on every
+ * service partition, and the lprocfs interfaces of all compatible policies are
+ * then cleaned up for the service. Runs under nrs_core.nrs_mutex.
+ *
+ * \param[in] svc the PTLRPC service to unregister
+ */
+void ptlrpc_service_nrs_cleanup(struct ptlrpc_service *svc)
+{
+	struct ptlrpc_service_part	     *part;
+	const struct ptlrpc_nrs_pol_desc     *pd;
+	int				      idx;
+
+	mutex_lock(&nrs_core.nrs_mutex);
+
+	/**
+	 * NRS heads first, one service partition at a time.
+	 */
+	ptlrpc_service_for_each_part(part, idx, svc)
+		nrs_svcpt_cleanup_locked(part);
+
+	/**
+	 * Then the lprocfs interfaces of every compatible policy that
+	 * provides an op_lprocfs_fini operation.
+	 */
+	list_for_each_entry(pd, &nrs_core.nrs_policies, pd_list) {
+		if (nrs_policy_compatible(svc, pd) &&
+		    pd->pd_ops->op_lprocfs_fini != NULL)
+			pd->pd_ops->op_lprocfs_fini(svc);
+	}
+
+	mutex_unlock(&nrs_core.nrs_mutex);
+}
+
+/**
+ * Zeroes the NRS request embedded in \a req, and obtains NRS head resources
+ * for it.
+ *
+ * The resources could be either on the regular or HP NRS head of \a svcpt;
+ * resources taken on the regular head can later be swapped for HP head
+ * resources by ldlm_lock_reorder_req().
+ *
+ * \param[in] svcpt the service partition
+ * \param[in] req   the request
+ * \param[in] hp    which NRS head of \a svcpt to use
+ */
+void ptlrpc_nrs_req_initialize(struct ptlrpc_service_part *svcpt,
+			       struct ptlrpc_request *req, bool hp)
+{
+	struct ptlrpc_nrs_request *nrq = &req->rq_nrq;
+	struct ptlrpc_nrs	  *head = nrs_svcpt2nrs(svcpt, hp);
+
+	memset(nrq, 0, sizeof(*nrq));
+	nrs_resource_get_safe(head, nrq, nrq->nr_res_ptrs, false);
+
+	/**
+	 * No locking is needed for \e nr_initialized; there is no contention
+	 * at this early stage.
+	 */
+	nrq->nr_initialized = 1;
+}
+
+/**
+ * Releases the NRS resources of a request, and marks the request as
+ * finalized; is called after the request has been handled. Requests that were
+ * never initialized are left untouched.
+ *
+ * \param[in] req the request
+ *
+ * \see ptlrpc_server_finish_request()
+ */
+void ptlrpc_nrs_req_finalize(struct ptlrpc_request *req)
+{
+	struct ptlrpc_nrs_request *nrq = &req->rq_nrq;
+
+	if (!nrq->nr_initialized)
+		return;
+
+	nrs_resource_put_safe(nrq->nr_res_ptrs);
+	/* bit nr_finalized is set without protection, because there is no
+	 * contention at this late stage */
+	nrq->nr_finalized = 1;
+}
+
+/**
+ * Calls nrs_request_stop() on the NRS request of \a req, provided the request
+ * has been marked as started.
+ *
+ * \param[in] req the request
+ */
+void ptlrpc_nrs_req_stop_nolock(struct ptlrpc_request *req)
+{
+	if (!req->rq_nrq.nr_started)
+		return;
+
+	nrs_request_stop(&req->rq_nrq);
+}
+
+/**
+ * Takes ptlrpc_service_part::scp_req_lock and enqueues request \a req on
+ * either the regular or the high-priority NRS head of service partition
+ * \a svcpt.
+ *
+ * \param[in] svcpt the service partition
+ * \param[in] req   the request to be enqueued
+ * \param[in] hp    when set the request goes to the high-priority NRS head,
+ *		    otherwise to the regular one.
+ */
+void ptlrpc_nrs_req_add(struct ptlrpc_service_part *svcpt,
+			struct ptlrpc_request *req, bool hp)
+{
+	spin_lock(&svcpt->scp_req_lock);
+
+	if (!hp)
+		ptlrpc_nrs_req_add_nolock(req);
+	else
+		ptlrpc_nrs_hpreq_add_nolock(req);
+
+	spin_unlock(&svcpt->scp_req_lock);
+}
+
+/**
+ * Accounts for one request having left the queue of \a policy.
+ *
+ * The queued-request counters of both the policy and its NRS head are
+ * decremented, and the policy's position on
+ * ptlrpc_nrs::nrs_policy_queued is updated: the policy is taken off the list
+ * when it has no queued requests left, or moved to the tail when other
+ * policies on the same head still have requests queued.
+ *
+ * \param[in] policy the policy the request was queued on
+ */
+static void nrs_request_removed(struct ptlrpc_nrs_policy *policy)
+{
+	LASSERT(policy->pol_nrs->nrs_req_queued > 0);
+	LASSERT(policy->pol_req_queued > 0);
+
+	policy->pol_nrs->nrs_req_queued--;
+	policy->pol_req_queued--;
+
+	/**
+	 * If the policy has no more requests queued, remove it from
+	 * ptlrpc_nrs::nrs_policy_queued.
+	 */
+	if (unlikely(policy->pol_req_queued == 0)) {
+		list_del_init(&policy->pol_list_queued);
+
+		/**
+		 * If there are other policies with queued requests, move the
+		 * current policy to the end so that we can round robin over
+		 * all policies and drain the requests.
+		 */
+	} else if (policy->pol_req_queued != policy->pol_nrs->nrs_req_queued) {
+		LASSERT(policy->pol_req_queued <
+			policy->pol_nrs->nrs_req_queued);
+
+		list_move_tail(&policy->pol_list_queued,
+				   &policy->pol_nrs->nrs_policy_queued);
+	}
+}
+
+/**
+ * Obtains a request for handling from an NRS head of service partition
+ * \a svcpt.
+ *
+ * The policies on ptlrpc_nrs::nrs_policy_queued are tried in list order, and
+ * the first request that any of them yields is returned. Unless \a peek is
+ * set, the request is marked as started and the started and queued counters
+ * of the policy and its NRS head are updated.
+ *
+ * \param[in] svcpt the service partition
+ * \param[in] hp    whether to obtain a request from the regular or
+ *		    high-priority NRS head.
+ * \param[in] peek  when set, signifies that we just want to examine the
+ *		    request, and not handle it, so the request is not removed
+ *		    from the policy.
+ * \param[in] force when set, it will force a policy to return a request if it
+ *		    has one pending
+ *
+ * \retval the	request to be handled
+ * \retval NULL the head has no requests to serve
+ */
+struct ptlrpc_request *
+ptlrpc_nrs_req_get_nolock0(struct ptlrpc_service_part *svcpt, bool hp,
+			   bool peek, bool force)
+{
+	struct ptlrpc_nrs	  *head = nrs_svcpt2nrs(svcpt, hp);
+	struct ptlrpc_nrs_policy  *pol;
+	struct ptlrpc_nrs_request *nrq;
+
+	/**
+	 * Always try to drain requests from all NRS policies even if they are
+	 * inactive, because the user can change policy status at runtime.
+	 */
+	list_for_each_entry(pol, &head->nrs_policy_queued,
+				pol_list_queued) {
+		nrq = nrs_request_get(pol, peek, force);
+		if (nrq == NULL)
+			continue;
+
+		if (likely(!peek)) {
+			nrq->nr_started = 1;
+
+			pol->pol_req_started++;
+			pol->pol_nrs->nrs_req_started++;
+
+			nrs_request_removed(pol);
+		}
+
+		return container_of(nrq, struct ptlrpc_request, rq_nrq);
+	}
+
+	return NULL;
+}
+
+/**
+ * Dequeues request \a req from the policy it has been enqueued on.
+ *
+ * The policy's op_req_dequeue operation is called, the request's
+ * \e nr_enqueued flag is cleared, and the queued-request accounting is updated
+ * via nrs_request_removed().
+ *
+ * \param[in] req the request
+ */
+void ptlrpc_nrs_req_del_nolock(struct ptlrpc_request *req)
+{
+	struct ptlrpc_nrs_policy *policy = nrs_request_policy(&req->rq_nrq);
+
+	policy->pol_desc->pd_ops->op_req_dequeue(policy, &req->rq_nrq);
+
+	req->rq_nrq.nr_enqueued = 0;
+
+	nrs_request_removed(policy);
+}
+
+/**
+ * Returns whether there are any requests currently enqueued on any of the
+ * policies of service partition's \a svcpt NRS head specified by \a hp. Should
+ * be called while holding ptlrpc_service_part::scp_req_lock to get a reliable
+ * result.
+ *
+ * The answer is taken from the head's ptlrpc_nrs::nrs_req_queued counter.
+ *
+ * \param[in] svcpt the service partition to enquire.
+ * \param[in] hp    whether the regular or high-priority NRS head is to be
+ *		    enquired.
+ *
+ * \retval false the indicated NRS head has no enqueued requests.
+ * \retval true	 the indicated NRS head has some enqueued requests.
+ */
+bool ptlrpc_nrs_req_pending_nolock(struct ptlrpc_service_part *svcpt, bool hp)
+{
+	struct ptlrpc_nrs *nrs = nrs_svcpt2nrs(svcpt, hp);
+
+	return nrs->nrs_req_queued > 0;
+}
+
+/**
+ * Moves request \a req from the regular to the high-priority NRS head.
+ *
+ * High-priority head resources are obtained before
+ * ptlrpc_service_part::scp_req_lock is taken. If the request can still be
+ * moved, it is dequeued, its resource pointers are replaced by the
+ * high-priority ones, and it is enqueued again as a high-priority request.
+ * Whichever set of resources ends up unused is released after the lock has
+ * been dropped.
+ *
+ * \param[in] req the request to move
+ */
+void ptlrpc_nrs_req_hp_move(struct ptlrpc_request *req)
+{
+	struct ptlrpc_service_part	*svcpt = req->rq_rqbd->rqbd_svcpt;
+	struct ptlrpc_nrs_request	*nrq = &req->rq_nrq;
+	struct ptlrpc_nrs_resource	*res1[NRS_RES_MAX];
+	struct ptlrpc_nrs_resource	*res2[NRS_RES_MAX];
+	ENTRY;
+
+	/**
+	 * Obtain the high-priority NRS head resources.
+	 */
+	nrs_resource_get_safe(nrs_svcpt2nrs(svcpt, true), nrq, res1, true);
+
+	spin_lock(&svcpt->scp_req_lock);
+
+	if (!ptlrpc_nrs_req_can_move(req))
+		goto out;
+
+	ptlrpc_nrs_req_del_nolock(req);
+
+	/**
+	 * Save the request's current resource pointers in \e res2, and install
+	 * the high-priority ones from \e res1 in their place.
+	 */
+	memcpy(res2, nrq->nr_res_ptrs, NRS_RES_MAX * sizeof(res2[0]));
+	memcpy(nrq->nr_res_ptrs, res1, NRS_RES_MAX * sizeof(res1[0]));
+
+	ptlrpc_nrs_hpreq_add_nolock(req);
+
+	/**
+	 * \e res1 now holds the resources the request used before the move, so
+	 * that the common exit path below releases them.
+	 */
+	memcpy(res1, res2, NRS_RES_MAX * sizeof(res1[0]));
+out:
+	spin_unlock(&svcpt->scp_req_lock);
+
+	/**
+	 * Release either the regular NRS head resources if we moved the
+	 * request, or the high-priority NRS head resources if we took a
+	 * reference earlier in this function and ptlrpc_nrs_req_can_move()
+	 * returned false.
+	 */
+	nrs_resource_put_safe(res1);
+	EXIT;
+}
+
+/**
+ * Carries out a control operation \a opc on the policy identified by the
+ * human-readable \a name, on either all partitions, or only on the first
+ * partition of service \a svc.
+ *
+ * The operation stops at the first NRS head on which it fails, and that error
+ * is returned.
+ *
+ * \param[in]	  svc	 the service the policy belongs to.
+ * \param[in]	  queue  whether to carry out the command on the policy which
+ *			 belongs to the regular, high-priority, or both NRS
+ *			 heads of service partitions of \a svc.
+ * \param[in]	  name   the policy to act upon, by human-readable name
+ * \param[in]	  opc	 the opcode of the operation to carry out
+ * \param[in]	  single when set, the operation will only be carried out on the
+ *			 NRS heads of the first service partition of \a svc.
+ *			 This is useful for some policies which e.g. share
+ *			 identical values on the same parameters of different
+ *			 service partitions; when reading these parameters via
+ *			 lprocfs, these policies may just want to obtain and
+ *			 print out the values from the first service partition.
+ *			 Storing these values centrally elsewhere then could be
+ *			 another solution for this.
+ * \param[in,out] arg	 can be used as a generic in/out buffer between control
+ *			 operations and the user environment.
+ *
+ *\retval -EINVAL \a queue has none of the PTLRPC_NRS_QUEUE_BOTH bits set
+ *\retval -ve	  error condition
+ *\retval   0	  operation was carried out successfully
+ */
+int ptlrpc_nrs_policy_control(const struct ptlrpc_service *svc,
+			      enum ptlrpc_nrs_queue_type queue, char *name,
+			      enum ptlrpc_nrs_ctl opc, bool single, void *arg)
+{
+	struct ptlrpc_service_part     *svcpt;
+	int				i;
+	int				rc = 0;
+	ENTRY;
+
+	LASSERT(opc != PTLRPC_NRS_CTL_INVALID);
+
+	/**
+	 * Leave through RETURN() so that the ENTRY above is balanced on this
+	 * path as well.
+	 */
+	if ((queue & PTLRPC_NRS_QUEUE_BOTH) == 0)
+		RETURN(-EINVAL);
+
+	ptlrpc_service_for_each_part(svcpt, i, svc) {
+		if ((queue & PTLRPC_NRS_QUEUE_REG) != 0) {
+			rc = nrs_policy_ctl(nrs_svcpt2nrs(svcpt, false), name,
+					    opc, arg);
+			if (rc != 0 || (queue == PTLRPC_NRS_QUEUE_REG &&
+					single))
+				GOTO(out, rc);
+		}
+
+		if ((queue & PTLRPC_NRS_QUEUE_HP) != 0) {
+			/**
+			 * XXX: We could optionally check for
+			 * nrs_svc_has_hp(svc) here, and return an error if it
+			 * is false. Right now we rely on the policies' lprocfs
+			 * handlers that call the present function to make this
+			 * check; if they fail to do so, they might hit the
+			 * assertion inside nrs_svcpt2nrs() below.
+			 */
+			rc = nrs_policy_ctl(nrs_svcpt2nrs(svcpt, true), name,
+					    opc, arg);
+			if (rc != 0 || single)
+				GOTO(out, rc);
+		}
+	}
+out:
+	RETURN(rc);
+}
+
+
+/* ptlrpc/nrs_fifo.c */
+extern struct ptlrpc_nrs_pol_conf nrs_conf_fifo;
+
+/**
+ * Adds all policies that ship with the ptlrpc module, to NRS core's list of
+ * policies \e nrs_core.nrs_policies.
+ *
+ * NRS core's mutex and policy list are initialized first; the only policy
+ * registered here is the FIFO policy, described by \e nrs_conf_fifo. On
+ * failure, ptlrpc_nrs_fini() is called before returning.
+ *
+ * \retval 0 all policies have been registered successfully
+ * \retval -ve error
+ */
+int ptlrpc_nrs_init(void)
+{
+	int	rc;
+	ENTRY;
+
+	mutex_init(&nrs_core.nrs_mutex);
+	INIT_LIST_HEAD(&nrs_core.nrs_policies);
+
+	rc = ptlrpc_nrs_policy_register(&nrs_conf_fifo);
+	if (rc != 0)
+		GOTO(fail, rc);
+
+
+	RETURN(rc);
+fail:
+	/**
+	 * Since no PTLRPC services have been started at this point, all we need
+	 * to do for cleanup is to free the descriptors.
+	 */
+	ptlrpc_nrs_fini();
+
+	RETURN(rc);
+}
+
+/**
+ * Empties nrs_core::nrs_policies, freeing every policy descriptor found on
+ * it.
+ *
+ * Since all PTLRPC services are stopped at this point, there are no more
+ * instances of any policies, because each service will have stopped its policy
+ * instances in ptlrpc_service_nrs_cleanup(), so only the descriptors are left
+ * to be freed here.
+ */
+void ptlrpc_nrs_fini(void)
+{
+	struct ptlrpc_nrs_pol_desc *pd;
+	struct ptlrpc_nrs_pol_desc *next;
+
+	list_for_each_entry_safe(pd, next, &nrs_core.nrs_policies,
+				     pd_list) {
+		list_del_init(&pd->pd_list);
+		OBD_FREE_PTR(pd);
+	}
+}
+
+/** @} nrs */

diff --git a/drivers/staging/lustre/lustre/ptlrpc/nrs_crr.c b/drivers/staging/lustre/lustre/ptlrpc/nrs_crr.c
new file mode 100644
index 0000000..ddfb510
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/nrs_crr.c

@@ -0,0 +1,40 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License version 2 for more details.  A copy is
+ * included in the COPYING file that accompanied this code.
+
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2011 Intel Corporation
+ *
+ * Copyright 2012 Xyratex Technology Limited
+ */
+/*
+ * lustre/ptlrpc/nrs_crr.c
+ *
+ * Network Request Scheduler (NRS) CRR-N policy
+ *
+ * Request ordering in a batched Round-Robin manner over client NIDs
+ *
+ * Author: Liang Zhen <liang@whamcloud.com>
+ * Author: Nikitas Angelinas <nikitas_angelinas@xyratex.com>
+ */
+/**
+ * \addtogroup nrs
+ * @{
+ */

diff --git a/drivers/staging/lustre/lustre/ptlrpc/nrs_fifo.c b/drivers/staging/lustre/lustre/ptlrpc/nrs_fifo.c
new file mode 100644
index 0000000..7d3ee97
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/nrs_fifo.c

@@ -0,0 +1,270 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License version 2 for more details.  A copy is
+ * included in the COPYING file that accompanied this code.
+
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2011 Intel Corporation
+ *
+ * Copyright 2012 Xyratex Technology Limited
+ */
+/*
+ * lustre/ptlrpc/nrs_fifo.c
+ *
+ * Network Request Scheduler (NRS) FIFO policy
+ *
+ * Handles RPCs in a FIFO manner, as received from the network. This policy is
+ * a logical wrapper around previous, non-NRS functionality. It is used as the
+ * default and fallback policy for all types of RPCs on all PTLRPC service
+ * partitions, for both regular and high-priority NRS heads. Default here means
+ * the policy is the one enabled at PTLRPC service partition startup time, and
+ * fallback means the policy is used to handle RPCs that are not handled
+ * successfully or are not handled at all by any primary policy that may be
+ * enabled on a given NRS head.
+ *
+ * Author: Liang Zhen <liang@whamcloud.com>
+ * Author: Nikitas Angelinas <nikitas_angelinas@xyratex.com>
+ */
+/**
+ * \addtogroup nrs
+ * @{
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+#include <obd_support.h>
+#include <obd_class.h>
+#include <linux/libcfs/libcfs.h>
+#include "ptlrpc_internal.h"
+
+/**
+ * \name fifo
+ *
+ * The FIFO policy is a logical wrapper around previous, non-NRS functionality.
+ * It schedules RPCs in the same order as they are queued from LNet.
+ *
+ * @{
+ */
+
+#define NRS_POL_NAME_FIFO	"fifo"
+
+/**
+ * Is called before the policy transitions into
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED; allocates and initializes a
+ * policy-specific private data structure.
+ *
+ * \param[in] policy The policy to start
+ *
+ * \retval -ENOMEM OOM error
+ * \retval  0	   success
+ *
+ * \see nrs_policy_register()
+ * \see nrs_policy_ctl()
+ */
+static int nrs_fifo_start(struct ptlrpc_nrs_policy *policy)
+{
+	struct nrs_fifo_head *head;
+
+	/* Allocate the FIFO head, passing the policy's cptab and cptid. */
+	OBD_CPT_ALLOC_PTR(head, nrs_pol2cptab(policy), nrs_pol2cptid(policy));
+	if (head != NULL) {
+		/* Start with an empty request queue and publish the head. */
+		INIT_LIST_HEAD(&head->fh_list);
+		policy->pol_private = head;
+		return 0;
+	}
+
+	return -ENOMEM;
+}
+
+/**
+ * Is called before the policy transitions into
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED; deallocates the policy-specific
+ * private data structure.
+ *
+ * \param[in] policy The policy to stop
+ *
+ * \see nrs_policy_stop0()
+ */
+static void nrs_fifo_stop(struct ptlrpc_nrs_policy *policy)
+{
+	struct nrs_fifo_head *head = policy->pol_private;
+
+	/* The head must exist and must no longer hold any queued request. */
+	LASSERT(head != NULL);
+	LASSERT(list_empty(&head->fh_list));
+
+	/* Free the private data; policy->pol_private is not cleared here. */
+	OBD_FREE_PTR(head);
+}
+
+/**
+ * Is called for obtaining a FIFO policy resource.
+ *
+ * \param[in]  policy	  The policy on which the request is being asked for
+ * \param[in]  nrq	  The request for which resources are being taken
+ * \param[in]  parent	  Parent resource, unused in this policy
+ * \param[out] resp	  Resources references are placed in this array
+ * \param[in]  moving_req Signifies limited caller context; unused in this
+ *			  policy
+ *
+ * \retval 1 The FIFO policy only has a one-level resource hierarchy: since
+ *	     it implements a simple scheduling algorithm in which request
+ *	     priority is determined by the request arrival order, it does not
+ *	     need to maintain a set of resources that would otherwise be used
+ *	     to calculate a request's priority.
+ *
+ * \see nrs_resource_get_safe()
+ */
+static int nrs_fifo_res_get(struct ptlrpc_nrs_policy *policy,
+			    struct ptlrpc_nrs_request *nrq,
+			    const struct ptlrpc_nrs_resource *parent,
+			    struct ptlrpc_nrs_resource **resp, bool moving_req)
+{
+	struct nrs_fifo_head *fifo = policy->pol_private;
+
+	/**
+	 * Hand back the resource embedded inside the nrs_fifo_head; returning
+	 * 1 ends this resource hierarchy reference request.
+	 */
+	*resp = &fifo->fh_res;
+	return 1;
+}
+
+/**
+ * Called when getting a request from the FIFO policy for handling, or just
+ * peeking; removes the request from the policy when it is to be handled.
+ *
+ * \param[in] policy The policy
+ * \param[in] peek   When set, signifies that we just want to examine the
+ *		     request, and not handle it, so the request is not removed
+ *		     from the policy.
+ * \param[in] force  Force the policy to return a request; unused in this
+ *		     policy
+ *
+ * \retval The request to be handled; this is the next request in the FIFO
+ *	   queue
+ *
+ * \see ptlrpc_nrs_req_get_nolock()
+ * \see nrs_request_get()
+ */
+static
+struct ptlrpc_nrs_request * nrs_fifo_req_get(struct ptlrpc_nrs_policy *policy,
+					     bool peek, bool force)
+{
+	struct nrs_fifo_head	  *fifo = policy->pol_private;
+	struct ptlrpc_nrs_request *nrq;
+	struct ptlrpc_request	  *req;
+
+	/* Nothing queued: there is no request to return. */
+	if (unlikely(list_empty(&fifo->fh_list)))
+		return NULL;
+
+	/* The head of the list is the next request in FIFO order. */
+	nrq = list_entry(fifo->fh_list.next, struct ptlrpc_nrs_request,
+			 nr_u.fifo.fr_list);
+
+	/* When only peeking, the request stays on the queue. */
+	if (unlikely(peek))
+		return nrq;
+
+	req = container_of(nrq, struct ptlrpc_request, rq_nrq);
+
+	list_del_init(&nrq->nr_u.fifo.fr_list);
+
+	CDEBUG(D_RPCTRACE, "NRS start %s request from %s, seq: "LPU64
+	       "\n", policy->pol_desc->pd_name,
+	       libcfs_id2str(req->rq_peer), nrq->nr_u.fifo.fr_sequence);
+
+	return nrq;
+}
+
+/**
+ * Adds request \a nrq to \a policy's list of queued requests
+ *
+ * \param[in] policy The policy
+ * \param[in] nrq    The request to add
+ *
+ * \retval 0 success; nrs_request_enqueue() assumes this function will always
+ *		      succeed
+ */
+static int nrs_fifo_req_add(struct ptlrpc_nrs_policy *policy,
+			    struct ptlrpc_nrs_request *nrq)
+{
+	struct nrs_fifo_head *fifo = container_of(nrs_request_resource(nrq),
+						  struct nrs_fifo_head,
+						  fh_res);
+
+	/**
+	 * The sequence number is only used for debugging; the request gets
+	 * the current counter value and the counter then advances by one.
+	 */
+	nrq->nr_u.fifo.fr_sequence = fifo->fh_sequence;
+	fifo->fh_sequence++;
+
+	/* Append at the tail so that requests leave in arrival order. */
+	list_add_tail(&nrq->nr_u.fifo.fr_list, &fifo->fh_list);
+
+	return 0;
+}
+
+/**
+ * Removes request \a nrq from \a policy's list of queued requests.
+ *
+ * \param[in] policy The policy
+ * \param[in] nrq    The request to remove
+ */
+static void nrs_fifo_req_del(struct ptlrpc_nrs_policy *policy,
+			     struct ptlrpc_nrs_request *nrq)
+{
+	/* The request must currently be linked on a list. */
+	LASSERT(!list_empty(&nrq->nr_u.fifo.fr_list));
+	/* Unlink and re-initialise the link so list_empty() holds afterwards */
+	list_del_init(&nrq->nr_u.fifo.fr_list);
+}
+
+/**
+ * Prints a debug statement right before the request \a nrq stops being
+ * handled.
+ *
+ * \param[in] policy The policy handling the request
+ * \param[in] nrq    The request being handled
+ *
+ * \see ptlrpc_server_finish_request()
+ * \see ptlrpc_nrs_req_stop_nolock()
+ */
+static void nrs_fifo_req_stop(struct ptlrpc_nrs_policy *policy,
+			      struct ptlrpc_nrs_request *nrq)
+{
+	/* Recover the enclosing ptlrpc_request to obtain the peer id. */
+	struct ptlrpc_request *req = container_of(nrq, struct ptlrpc_request,
+						  rq_nrq);
+
+	/* Trace only: no state of the policy or the request is modified. */
+	CDEBUG(D_RPCTRACE, "NRS stop %s request from %s, seq: "LPU64"\n",
+	       policy->pol_desc->pd_name, libcfs_id2str(req->rq_peer),
+	       nrq->nr_u.fifo.fr_sequence);
+}
+
+/**
+ * FIFO policy operations
+ */
+static const struct ptlrpc_nrs_pol_ops nrs_fifo_ops = {
+	/* Allocate / free the per-policy nrs_fifo_head */
+	.op_policy_start	= nrs_fifo_start,
+	.op_policy_stop		= nrs_fifo_stop,
+	/* Return the single resource embedded in the head */
+	.op_res_get		= nrs_fifo_res_get,
+	/* Peek at or remove the first queued request */
+	.op_req_get		= nrs_fifo_req_get,
+	/* Append to / unlink from the queue */
+	.op_req_enqueue		= nrs_fifo_req_add,
+	.op_req_dequeue		= nrs_fifo_req_del,
+	/* Debug trace when a request stops being handled */
+	.op_req_stop		= nrs_fifo_req_stop,
+};
+
+/**
+ * FIFO policy configuration
+ */
+struct ptlrpc_nrs_pol_conf nrs_conf_fifo = {
+	/* Policy name: "fifo" */
+	.nc_name		= NRS_POL_NAME_FIFO,
+	/* Operations table defined in this file */
+	.nc_ops			= &nrs_fifo_ops,
+	.nc_compat		= nrs_policy_compat_all,
+	/* Flagged both as the fallback policy and as started at registration */
+	.nc_flags		= PTLRPC_NRS_FL_FALLBACK |
+				  PTLRPC_NRS_FL_REG_START
+};
+
+/** @} fifo */
+
+/** @} nrs */

diff --git a/drivers/staging/lustre/lustre/ptlrpc/pack_generic.c b/drivers/staging/lustre/lustre/ptlrpc/pack_generic.c
new file mode 100644
index 0000000..1437636
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/pack_generic.c

@@ -0,0 +1,2575 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/pack_generic.c
+ *
+ * (Un)packing of OST requests
+ *
+ * Author: Peter J. Braam <braam@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
+ * Author: Eric Barton <eeb@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+
+#include <linux/libcfs/libcfs.h>
+
+#include <obd_support.h>
+#include <obd_class.h>
+#include <lustre_net.h>
+#include <obd_cksum.h>
+#include <lustre/ll_fiemap.h>
+
+/*
+ * Size of a lustre_msg_v2 header that carries \a count buffer lengths: the
+ * offset of lm_buflens[count], rounded with cfs_size_round().
+ */
+static inline int lustre_msg_hdr_size_v2(int count)
+{
+	return cfs_size_round(offsetof(struct lustre_msg_v2,
+				       lm_buflens[count]));
+}
+
+/*
+ * Header size for a message of the given \a magic holding \a count buffers.
+ * Only LUSTRE_MSG_MAGIC_V2 is handled; any other magic asserts and yields
+ * -EINVAL.
+ */
+int lustre_msg_hdr_size(__u32 magic, int count)
+{
+	if (magic == LUSTRE_MSG_MAGIC_V2)
+		return lustre_msg_hdr_size_v2(count);
+
+	LASSERTF(0, "incorrect message magic: %08x\n", magic);
+	return -EINVAL;
+}
+EXPORT_SYMBOL(lustre_msg_hdr_size);
+
+/*
+ * Mark buffer \a index as swabbed: on the request side when \a inout is
+ * non-zero, on the reply side otherwise.
+ */
+void ptlrpc_buf_set_swabbed(struct ptlrpc_request *req, const int inout,
+			    int index)
+{
+	if (!inout) {
+		lustre_set_rep_swabbed(req, index);
+		return;
+	}
+
+	lustre_set_req_swabbed(req, index);
+}
+EXPORT_SYMBOL(ptlrpc_buf_set_swabbed);
+
+/*
+ * Non-zero when buffer \a index still has to be swabbed: the request
+ * (\a inout != 0) or reply (\a inout == 0) needs swabbing and this buffer
+ * has not been marked as swabbed yet.
+ */
+int ptlrpc_buf_need_swab(struct ptlrpc_request *req, const int inout,
+			 int index)
+{
+	if (!inout)
+		return (ptlrpc_rep_need_swab(req) &&
+			!lustre_rep_swabbed(req, index));
+
+	return (ptlrpc_req_need_swab(req) &&
+		!lustre_req_swabbed(req, index));
+}
+EXPORT_SYMBOL(ptlrpc_buf_need_swab);
+
+/*
+ * Returns non-zero when the message version, masked with
+ * LUSTRE_VERSION_MASK, differs from \a version; zero when they match.
+ */
+static inline int lustre_msg_check_version_v2(struct lustre_msg_v2 *msg,
+					      __u32 version)
+{
+	return (lustre_msg_get_version(msg) & LUSTRE_VERSION_MASK) != version;
+}
+
+/*
+ * Check the version carried by \a msg against \a version.
+ *
+ * V2 messages return the result of lustre_msg_check_version_v2() (non-zero
+ * on mismatch). V1 messages are rejected with -EINVAL. Any other magic is
+ * logged and 0 is returned.
+ */
+int lustre_msg_check_version(struct lustre_msg *msg, __u32 version)
+{
+	__u32 magic = msg->lm_magic;
+
+	if (magic == LUSTRE_MSG_MAGIC_V2)
+		return lustre_msg_check_version_v2(msg, version);
+
+	if (magic == LUSTRE_MSG_MAGIC_V1) {
+		CERROR("msg v1 not supported - please upgrade you system\n");
+		return -EINVAL;
+	}
+
+	CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+	return 0;
+}
+EXPORT_SYMBOL(lustre_msg_check_version);
+
+/* early reply size */
+/*
+ * Size of a V2 message holding a single struct ptlrpc_body_v2 buffer.
+ *
+ * The value is computed on the first call and cached in a function-local
+ * static for subsequent calls.
+ */
+int lustre_msg_early_size(void)
+{
+	static int size = 0;
+	if (!size) {
+		/* Always reply with the old ptlrpc_body_v2 to keep
+		 * interoperability with old clients (< 2.3) which don't have
+		 * pb_jobid in the ptlrpc_body.
+		 *
+		 * XXX Remove this whenever we drop interoperability with such
+		 *     clients.
+		 */
+		__u32 pblen = sizeof(struct ptlrpc_body_v2);
+		size = lustre_msg_size(LUSTRE_MSG_MAGIC_V2, 1, &pblen);
+	}
+	return size;
+}
+EXPORT_SYMBOL(lustre_msg_early_size);
+
+/*
+ * Total size of a V2 message: the header for \a count buffers plus each
+ * entry of \a lengths rounded with cfs_size_round().
+ */
+int lustre_msg_size_v2(int count, __u32 *lengths)
+{
+	int total = lustre_msg_hdr_size_v2(count);
+	int idx = 0;
+
+	while (idx < count) {
+		total += cfs_size_round(lengths[idx]);
+		idx++;
+	}
+
+	return total;
+}
+EXPORT_SYMBOL(lustre_msg_size_v2);
+
+/* This returns the size of the buffer that is required to hold a lustre_msg
+ * with the given sub-buffer lengths.
+ * NOTE: this should only be used for NEW requests, and should always be
+ *       in the form of a v2 request.  If this is a connection to a v1
+ *       target then the first buffer will be stripped because the ptlrpc
+ *       data is part of the lustre_msg_v1 header. b=14043 */
+int lustre_msg_size(__u32 magic, int count, __u32 *lens)
+{
+	__u32 size[] = { sizeof(struct ptlrpc_body) };
+
+	/* No lengths supplied: fall back to a lone ptlrpc_body buffer. */
+	if (lens == NULL) {
+		LASSERT(count == 1);
+		lens = size;
+	}
+
+	LASSERT(count > 0);
+	LASSERT(lens[MSG_PTLRPC_BODY_OFF] >= sizeof(struct ptlrpc_body_v2));
+
+	if (magic == LUSTRE_MSG_MAGIC_V2)
+		return lustre_msg_size_v2(count, lens);
+
+	LASSERTF(0, "incorrect message magic: %08x\n", magic);
+	return -EINVAL;
+}
+EXPORT_SYMBOL(lustre_msg_size);
+
+/* This is used to determine the size of a buffer that was already packed
+ * and will correctly handle the different message formats. */
+int lustre_packed_msg_size(struct lustre_msg *msg)
+{
+	/* Anything other than a V2 message is logged and reported as 0. */
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+		return 0;
+	}
+
+	/* Size is derived from the counts/lengths stored in the header. */
+	return lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens);
+}
+EXPORT_SYMBOL(lustre_packed_msg_size);
+
+/*
+ * Initialise the header of a V2 message: buffer count, magic and the
+ * per-buffer lengths from \a lens. When \a bufs is non-NULL, each entry is
+ * additionally passed to LOGL() together with a cursor that starts right
+ * after the header.
+ */
+void lustre_init_msg_v2(struct lustre_msg_v2 *msg, int count, __u32 *lens,
+			char **bufs)
+{
+	int i;
+
+	msg->lm_bufcount = count;
+	/* XXX: lm_secflvr uninitialized here */
+	msg->lm_magic = LUSTRE_MSG_MAGIC_V2;
+
+	for (i = 0; i < count; i++)
+		msg->lm_buflens[i] = lens[i];
+
+	if (bufs != NULL) {
+		char *ptr = (char *)msg + lustre_msg_hdr_size_v2(count);
+
+		for (i = 0; i < count; i++) {
+			char *tmp = bufs[i];
+			LOGL(tmp, lens[i], ptr);
+		}
+	}
+}
+EXPORT_SYMBOL(lustre_init_msg_v2);
+
+/*
+ * Allocate a request buffer large enough for \a count buffers of the given
+ * \a lens, then initialise it as a V2 message and add the ptlrpc message
+ * version. Returns 0, or the error from sptlrpc_cli_alloc_reqbuf().
+ */
+static int lustre_pack_request_v2(struct ptlrpc_request *req,
+				  int count, __u32 *lens, char **bufs)
+{
+	int reqlen = lustre_msg_size_v2(count, lens);
+	int rc = sptlrpc_cli_alloc_reqbuf(req, reqlen);
+
+	if (rc != 0)
+		return rc;
+
+	req->rq_reqlen = reqlen;
+
+	lustre_init_msg_v2(req->rq_reqmsg, count, lens, bufs);
+	lustre_msg_add_version(req->rq_reqmsg, PTLRPC_MSG_VERSION);
+	return 0;
+}
+
+/*
+ * Pack a new request. A NULL \a lens stands for a single ptlrpc_body
+ * buffer. The \a magic argument is overridden: requests are always packed
+ * in the V2 format.
+ */
+int lustre_pack_request(struct ptlrpc_request *req, __u32 magic, int count,
+			__u32 *lens, char **bufs)
+{
+	__u32 size[] = { sizeof(struct ptlrpc_body) };
+
+	if (lens == NULL) {
+		LASSERT(count == 1);
+		lens = size;
+	}
+
+	LASSERT(count > 0);
+	LASSERT(lens[MSG_PTLRPC_BODY_OFF] == sizeof(struct ptlrpc_body));
+
+	/* only use new format, we don't need to be compatible with 1.4 */
+	magic = LUSTRE_MSG_MAGIC_V2;
+
+	if (magic == LUSTRE_MSG_MAGIC_V2)
+		return lustre_pack_request_v2(req, count, lens, bufs);
+
+	LASSERTF(0, "incorrect message magic: %08x\n", magic);
+	return -EINVAL;
+}
+EXPORT_SYMBOL(lustre_pack_request);
+
+/*
+ * Optional debug tracking of reply states. With RS_DEBUG set, every reply
+ * state is linked on ptlrpc_rs_debug_lru (under ptlrpc_rs_debug_lock) by
+ * PTLRPC_RS_DEBUG_LRU_ADD() and unlinked by PTLRPC_RS_DEBUG_LRU_DEL();
+ * otherwise both macros expand to empty statements.
+ *
+ * NOTE(review): ptlrpc_rs_debug_lock has no initializer here; presumably it
+ * is initialised elsewhere before first use - confirm.
+ */
+#if RS_DEBUG
+LIST_HEAD(ptlrpc_rs_debug_lru);
+spinlock_t ptlrpc_rs_debug_lock;
+
+#define PTLRPC_RS_DEBUG_LRU_ADD(rs)					\
+do {									\
+	spin_lock(&ptlrpc_rs_debug_lock);				\
+	list_add_tail(&(rs)->rs_debug_list, &ptlrpc_rs_debug_lru);	\
+	spin_unlock(&ptlrpc_rs_debug_lock);				\
+} while (0)
+
+#define PTLRPC_RS_DEBUG_LRU_DEL(rs)					\
+do {									\
+	spin_lock(&ptlrpc_rs_debug_lock);				\
+	list_del(&(rs)->rs_debug_list);				\
+	spin_unlock(&ptlrpc_rs_debug_lock);				\
+} while (0)
+#else
+# define PTLRPC_RS_DEBUG_LRU_ADD(rs) do {} while(0)
+# define PTLRPC_RS_DEBUG_LRU_DEL(rs) do {} while(0)
+#endif
+
+/*
+ * Take a reply state from the service partition's idle list
+ * (scp_rep_idle), waiting in 10-second timed waits while the list is empty.
+ *
+ * Returns the reply state, zeroed over srv_max_reply_size bytes and marked
+ * rs_prealloc, or NULL if a wait returned non-zero.
+ */
+struct ptlrpc_reply_state *
+lustre_get_emerg_rs(struct ptlrpc_service_part *svcpt)
+{
+	struct ptlrpc_reply_state *rs = NULL;
+
+	spin_lock(&svcpt->scp_rep_lock);
+
+	/* See if we have anything in a pool, and wait if nothing */
+	while (list_empty(&svcpt->scp_rep_idle)) {
+		struct l_wait_info	lwi;
+		int			rc;
+
+		/* The lock is dropped for the duration of the wait. */
+		spin_unlock(&svcpt->scp_rep_lock);
+		/* If we cannot get anything for some long time, we better
+		 * bail out instead of waiting infinitely */
+		lwi = LWI_TIMEOUT(cfs_time_seconds(10), NULL, NULL);
+		rc = l_wait_event(svcpt->scp_rep_waitq,
+				  !list_empty(&svcpt->scp_rep_idle), &lwi);
+		/* Bail out with rs == NULL; the lock is not held here. */
+		if (rc != 0)
+			goto out;
+		/* Re-take the lock and re-check the list in the loop test. */
+		spin_lock(&svcpt->scp_rep_lock);
+	}
+
+	/* Detach the first idle entry while still holding the lock. */
+	rs = list_entry(svcpt->scp_rep_idle.next,
+			    struct ptlrpc_reply_state, rs_list);
+	list_del(&rs->rs_list);
+
+	spin_unlock(&svcpt->scp_rep_lock);
+
+	memset(rs, 0, svcpt->scp_service->srv_max_reply_size);
+	rs->rs_svcpt = svcpt;
+	rs->rs_prealloc = 1;
+out:
+	return rs;
+}
+
+/*
+ * Put \a rs back on its service partition's idle list and wake up waiters
+ * on scp_rep_waitq.
+ */
+void lustre_put_emerg_rs(struct ptlrpc_reply_state *rs)
+{
+	struct ptlrpc_service_part *svcpt = rs->rs_svcpt;
+
+	spin_lock(&svcpt->scp_rep_lock);
+	list_add(&rs->rs_list, &svcpt->scp_rep_idle);
+	spin_unlock(&svcpt->scp_rep_lock);
+	/* Wake-up is issued after the lock has been released. */
+	wake_up(&svcpt->scp_rep_waitq);
+}
+
+/*
+ * Allocate and initialise the reply state and V2 reply message for \a req.
+ *
+ * \a count / \a lens / \a bufs describe the reply buffers as for
+ * lustre_init_msg_v2(). Unless LPRFL_EARLY_REPLY is set in \a flags, the
+ * request is marked rq_packed_final. Returns 0 on success or the error
+ * from sptlrpc_svc_alloc_rs().
+ */
+int lustre_pack_reply_v2(struct ptlrpc_request *req, int count,
+			 __u32 *lens, char **bufs, int flags)
+{
+	struct ptlrpc_reply_state *rs;
+	int			msg_len, rc;
+	ENTRY;
+
+	/* A reply state must not already be attached to the request. */
+	LASSERT(req->rq_reply_state == NULL);
+
+	if ((flags & LPRFL_EARLY_REPLY) == 0) {
+		spin_lock(&req->rq_lock);
+		req->rq_packed_final = 1;
+		spin_unlock(&req->rq_lock);
+	}
+
+	msg_len = lustre_msg_size_v2(count, lens);
+	rc = sptlrpc_svc_alloc_rs(req, msg_len);
+	if (rc)
+		RETURN(rc);
+
+	/* The reply state is read back from the request after allocation. */
+	rs = req->rq_reply_state;
+	atomic_set(&rs->rs_refcount, 1);    /* 1 ref for rq_reply_state */
+	rs->rs_cb_id.cbid_fn = reply_out_callback;
+	rs->rs_cb_id.cbid_arg = rs;
+	rs->rs_svcpt = req->rq_rqbd->rqbd_svcpt;
+	INIT_LIST_HEAD(&rs->rs_exp_list);
+	INIT_LIST_HEAD(&rs->rs_obd_list);
+	INIT_LIST_HEAD(&rs->rs_list);
+	spin_lock_init(&rs->rs_lock);
+
+	req->rq_replen = msg_len;
+	/* Stores back the same pointer that was read from the request above. */
+	req->rq_reply_state = rs;
+	req->rq_repmsg = rs->rs_msg;
+
+	lustre_init_msg_v2(rs->rs_msg, count, lens, bufs);
+	lustre_msg_add_version(rs->rs_msg, PTLRPC_MSG_VERSION);
+
+	PTLRPC_RS_DEBUG_LRU_ADD(rs);
+
+	RETURN(0);
+}
+EXPORT_SYMBOL(lustre_pack_reply_v2);
+
+/*
+ * Pack a reply for \a req, dispatching on the magic of the request message.
+ * A NULL \a lens stands for a single ptlrpc_body buffer. Failures are
+ * logged together with the computed message size.
+ */
+int lustre_pack_reply_flags(struct ptlrpc_request *req, int count, __u32 *lens,
+			    char **bufs, int flags)
+{
+	__u32 size[] = { sizeof(struct ptlrpc_body) };
+	int rc = 0;
+
+	if (lens == NULL) {
+		LASSERT(count == 1);
+		lens = size;
+	}
+
+	LASSERT(count > 0);
+	LASSERT(lens[MSG_PTLRPC_BODY_OFF] == sizeof(struct ptlrpc_body));
+
+	if (req->rq_reqmsg->lm_magic == LUSTRE_MSG_MAGIC_V2) {
+		rc = lustre_pack_reply_v2(req, count, lens, bufs, flags);
+	} else {
+		LASSERTF(0, "incorrect message magic: %08x\n",
+			 req->rq_reqmsg->lm_magic);
+		rc = -EINVAL;
+	}
+
+	if (rc != 0)
+		CERROR("lustre_pack_reply failed: rc=%d size=%d\n", rc,
+		       lustre_msg_size(req->rq_reqmsg->lm_magic, count, lens));
+	return rc;
+}
+EXPORT_SYMBOL(lustre_pack_reply_flags);
+
+/* Convenience wrapper: lustre_pack_reply_flags() with flags == 0. */
+int lustre_pack_reply(struct ptlrpc_request *req, int count, __u32 *lens,
+		      char **bufs)
+{
+	return lustre_pack_reply_flags(req, count, lens, bufs, 0);
+}
+EXPORT_SYMBOL(lustre_pack_reply);
+
+/*
+ * Return a pointer to buffer \a n of V2 message \a m, or NULL when the
+ * buffer does not exist or is shorter than \a min_size.
+ */
+void *lustre_msg_buf_v2(struct lustre_msg_v2 *m, int n, int min_size)
+{
+	int idx, off, buflen, bufcount;
+
+	LASSERT(m != NULL);
+	LASSERT(n >= 0);
+
+	bufcount = m->lm_bufcount;
+	if (unlikely(n >= bufcount)) {
+		CDEBUG(D_INFO, "msg %p buffer[%d] not present (count %d)\n",
+		       m, n, bufcount);
+		return NULL;
+	}
+
+	buflen = m->lm_buflens[n];
+	if (unlikely(buflen < min_size)) {
+		CERROR("msg %p buffer[%d] size %d too small "
+		       "(required %d, opc=%d)\n", m, n, buflen, min_size,
+		       n == MSG_PTLRPC_BODY_OFF ? -1 : lustre_msg_get_opc(m));
+		return NULL;
+	}
+
+	/* Skip the header, then every rounded buffer that precedes n. */
+	off = lustre_msg_hdr_size_v2(bufcount);
+	idx = 0;
+	while (idx < n) {
+		off += cfs_size_round(m->lm_buflens[idx]);
+		idx++;
+	}
+
+	return (char *)m + off;
+}
+
+/*
+ * Magic-dispatching front end for lustre_msg_buf_v2(); any magic other
+ * than V2 asserts and yields NULL.
+ */
+void *lustre_msg_buf(struct lustre_msg *m, int n, int min_size)
+{
+	if (m->lm_magic == LUSTRE_MSG_MAGIC_V2)
+		return lustre_msg_buf_v2(m, n, min_size);
+
+	LASSERTF(0, "incorrect message magic: %08x(msg:%p)\n", m->lm_magic, m);
+	return NULL;
+}
+EXPORT_SYMBOL(lustre_msg_buf);
+
+/*
+ * Shrink buffer \a segment of V2 message \a msg to \a newlen bytes and
+ * return the resulting total message size. When \a move_data is non-zero
+ * and later buffers exist, their contents are moved down to follow the
+ * shortened segment.
+ */
+int lustre_shrink_msg_v2(struct lustre_msg_v2 *msg, int segment,
+			 unsigned int newlen, int move_data)
+{
+	char   *tail = NULL, *newpos;
+	int     tail_len = 0, n;
+
+	LASSERT(msg);
+	LASSERT(msg->lm_bufcount > segment);
+	LASSERT(msg->lm_buflens[segment] >= newlen);
+
+	/* Nothing to do when the length is unchanged. */
+	if (msg->lm_buflens[segment] == newlen)
+		goto out;
+
+	/* Record where the following buffers start and how long they are
+	 * BEFORE the length is changed, since offsets depend on it. */
+	if (move_data && msg->lm_bufcount > segment + 1) {
+		tail = lustre_msg_buf_v2(msg, segment + 1, 0);
+		for (n = segment + 1; n < msg->lm_bufcount; n++)
+			tail_len += cfs_size_round(msg->lm_buflens[n]);
+	}
+
+	msg->lm_buflens[segment] = newlen;
+
+	/* With the new length in place, the same lookup yields the new
+	 * position of the tail; memmove() is used as the ranges may overlap. */
+	if (tail && tail_len) {
+		newpos = lustre_msg_buf_v2(msg, segment + 1, 0);
+		LASSERT(newpos <= tail);
+		if (newpos != tail)
+			memmove(newpos, tail, tail_len);
+	}
+out:
+	return lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens);
+}
+
+/*
+ * For @msg, shrink @segment to size @newlen. If @move_data is non-zero,
+ * we also move data forward from @segment + 1.
+ *
+ * If @newlen == 0, we remove the segment completely, but we still keep the
+ * total bufcount the same to avoid possible data moving. This will leave an
+ * unused segment with size 0 at the tail, but that's ok.
+ *
+ * Returns the new msg size after shrinking.
+ *
+ * CAUTION:
+ * + if any buffers higher than @segment have been filled in, shrink must be
+ *   called with non-zero @move_data.
+ * + the caller should NOT keep pointers to msg buffers higher than @segment
+ *   after calling shrink.
+ */
+int lustre_shrink_msg(struct lustre_msg *msg, int segment,
+		      unsigned int newlen, int move_data)
+{
+	switch (msg->lm_magic) {
+	case LUSTRE_MSG_MAGIC_V2:
+		return lustre_shrink_msg_v2(msg, segment, newlen, move_data);
+	default:
+		LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+		/* Never fall off the end of a non-void function; mirror the
+		 * other magic dispatchers in this file, which return -EINVAL
+		 * after asserting. */
+		return -EINVAL;
+	}
+}
+EXPORT_SYMBOL(lustre_shrink_msg);
+
+/*
+ * Release reply state \a rs through sptlrpc_svc_free_rs() after asserting
+ * that it is no longer referenced, scheduled, on the network, attached to
+ * an export, holding locks, or linked on the export/obd lists.
+ */
+void lustre_free_reply_state(struct ptlrpc_reply_state *rs)
+{
+	PTLRPC_RS_DEBUG_LRU_DEL(rs);
+
+	LASSERT (atomic_read(&rs->rs_refcount) == 0);
+	/* A "difficult" reply must have been handled before it is freed. */
+	LASSERT (!rs->rs_difficult || rs->rs_handled);
+	LASSERT (!rs->rs_on_net);
+	LASSERT (!rs->rs_scheduled);
+	LASSERT (rs->rs_export == NULL);
+	LASSERT (rs->rs_nlocks == 0);
+	LASSERT (list_empty(&rs->rs_exp_list));
+	LASSERT (list_empty(&rs->rs_obd_list));
+
+	sptlrpc_svc_free_rs(rs);
+}
+EXPORT_SYMBOL(lustre_free_reply_state);
+
+/*
+ * Validate the header of a received V2 message of \a len bytes, swabbing
+ * the header fields and buffer lengths in place when the magic shows the
+ * opposite byte order.
+ *
+ * Returns 1 if the header was swabbed, 0 if it was not, or -EINVAL when
+ * \a len is too small for the header, the buffer-length array, or the sum
+ * of the rounded buffer lengths.
+ *
+ * NOTE(review): lm_bufcount and lm_buflens[] are read from the message
+ * itself and accumulated into a plain int (required_len); very large
+ * values could presumably overflow that sum - confirm whether callers
+ * bound them.
+ */
+static int lustre_unpack_msg_v2(struct lustre_msg_v2 *m, int len)
+{
+	int swabbed, required_len, i;
+
+	/* Now we know the sender speaks my language. */
+	required_len = lustre_msg_hdr_size_v2(0);
+	if (len < required_len) {
+		/* can't even look inside the message */
+		CERROR("message length %d too small for lustre_msg\n", len);
+		return -EINVAL;
+	}
+
+	swabbed = (m->lm_magic == LUSTRE_MSG_MAGIC_V2_SWABBED);
+
+	if (swabbed) {
+		__swab32s(&m->lm_magic);
+		__swab32s(&m->lm_bufcount);
+		__swab32s(&m->lm_secflvr);
+		__swab32s(&m->lm_repsize);
+		__swab32s(&m->lm_cksum);
+		__swab32s(&m->lm_flags);
+		/* Compile-time checks that the padding fields exist; they
+		 * are not swabbed. */
+		CLASSERT(offsetof(typeof(*m), lm_padding_2) != 0);
+		CLASSERT(offsetof(typeof(*m), lm_padding_3) != 0);
+	}
+
+	/* lm_bufcount is in host order from here on. */
+	required_len = lustre_msg_hdr_size_v2(m->lm_bufcount);
+	if (len < required_len) {
+		/* didn't receive all the buffer lengths */
+		CERROR ("message length %d too small for %d buflens\n",
+			len, m->lm_bufcount);
+		return -EINVAL;
+	}
+
+	/* Swab each length if needed and add its rounded size. */
+	for (i = 0; i < m->lm_bufcount; i++) {
+		if (swabbed)
+			__swab32s(&m->lm_buflens[i]);
+		required_len += cfs_size_round(m->lm_buflens[i]);
+	}
+
+	if (len < required_len) {
+		CERROR("len: %d, required_len %d\n", len, required_len);
+		CERROR("bufcount: %d\n", m->lm_bufcount);
+		for (i = 0; i < m->lm_bufcount; i++)
+			CERROR("buffer %d length %d\n", i, m->lm_buflens[i]);
+		return -EINVAL;
+	}
+
+	return swabbed;
+}
+
+/*
+ * Check that \a len covers at least the lm_magic field of \a m, then hand
+ * the message to lustre_unpack_msg_v2().
+ *
+ * Returns -EINVAL for a too-short message, otherwise the result of
+ * lustre_unpack_msg_v2(). The magic value itself is not compared against
+ * any constant in this function.
+ */
+int __lustre_unpack_msg(struct lustre_msg *m, int len)
+{
+	int required_len, rc;
+	ENTRY;
+
+	/* We can provide a slightly better error log, if we check the
+	 * message magic and version first.  In the future, struct
+	 * lustre_msg may grow, and we'd like to log a version mismatch,
+	 * rather than a short message.
+	 *
+	 */
+	required_len = offsetof(struct lustre_msg, lm_magic) +
+		       sizeof(m->lm_magic);
+	if (len < required_len) {
+		/* can't even look inside the message */
+		CERROR("message length %d too small for magic/version check\n",
+		       len);
+		RETURN(-EINVAL);
+	}
+
+	rc = lustre_unpack_msg_v2(m, len);
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(__lustre_unpack_msg);
+
+/*
+ * Unpack the request message header. A return of 1 from
+ * __lustre_unpack_msg() (header was swabbed) is recorded on the request
+ * and turned into 0; every other value is passed through.
+ */
+int ptlrpc_unpack_req_msg(struct ptlrpc_request *req, int len)
+{
+	int rc = __lustre_unpack_msg(req->rq_reqmsg, len);
+
+	if (rc != 1)
+		return rc;
+
+	lustre_set_req_swabbed(req, MSG_PTLRPC_HEADER_OFF);
+	return 0;
+}
+EXPORT_SYMBOL(ptlrpc_unpack_req_msg);
+
+/*
+ * Unpack the reply message header. A return of 1 from
+ * __lustre_unpack_msg() (header was swabbed) is recorded on the request
+ * and turned into 0; every other value is passed through.
+ */
+int ptlrpc_unpack_rep_msg(struct ptlrpc_request *req, int len)
+{
+	int rc = __lustre_unpack_msg(req->rq_repmsg, len);
+
+	if (rc != 1)
+		return rc;
+
+	lustre_set_rep_swabbed(req, MSG_PTLRPC_HEADER_OFF);
+	return 0;
+}
+EXPORT_SYMBOL(ptlrpc_unpack_rep_msg);
+
+/*
+ * Locate the ptlrpc_body at buffer \a offset of the request (\a inout != 0)
+ * or reply (\a inout == 0) message, swab it if still required, and verify
+ * its version bits.
+ *
+ * Returns 0 on success, -EFAULT when the buffer is missing or too small,
+ * -EINVAL on a version mismatch.
+ */
+static inline int lustre_unpack_ptlrpc_body_v2(struct ptlrpc_request *req,
+					       const int inout, int offset)
+{
+	struct lustre_msg_v2 *m;
+	struct ptlrpc_body *pb;
+
+	if (inout)
+		m = req->rq_reqmsg;
+	else
+		m = req->rq_repmsg;
+
+	pb = lustre_msg_buf_v2(m, offset, sizeof(struct ptlrpc_body_v2));
+	if (pb == NULL) {
+		CERROR("error unpacking ptlrpc body\n");
+		return -EFAULT;
+	}
+
+	if (ptlrpc_buf_need_swab(req, inout, offset)) {
+		lustre_swab_ptlrpc_body(pb);
+		ptlrpc_buf_set_swabbed(req, inout, offset);
+	}
+
+	if ((pb->pb_version & ~LUSTRE_VERSION_MASK) == PTLRPC_MSG_VERSION)
+		return 0;
+
+	CERROR("wrong lustre_msg version %08x\n", pb->pb_version);
+	return -EINVAL;
+}
+
+/*
+ * Unpack the ptlrpc_body found at buffer \a offset of the request message.
+ * Only V2 messages are accepted; other magics are logged and -EINVAL is
+ * returned.
+ */
+int lustre_unpack_req_ptlrpc_body(struct ptlrpc_request *req, int offset)
+{
+	if (req->rq_reqmsg->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		CERROR("bad lustre msg magic: %08x\n",
+		       req->rq_reqmsg->lm_magic);
+		return -EINVAL;
+	}
+
+	return lustre_unpack_ptlrpc_body_v2(req, 1, offset);
+}
+
+/*
+ * Unpack the ptlrpc_body found at buffer \a offset of the reply message.
+ * Only V2 messages are accepted; other magics are logged and -EINVAL is
+ * returned.
+ */
+int lustre_unpack_rep_ptlrpc_body(struct ptlrpc_request *req, int offset)
+{
+	if (req->rq_repmsg->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		CERROR("bad lustre msg magic: %08x\n",
+		       req->rq_repmsg->lm_magic);
+		return -EINVAL;
+	}
+
+	return lustre_unpack_ptlrpc_body_v2(req, 0, offset);
+}
+
+/* Length of buffer \a n of V2 message \a m; 0 when \a n is out of range. */
+static inline int lustre_msg_buflen_v2(struct lustre_msg_v2 *m, int n)
+{
+	if (n < m->lm_bufcount)
+		return m->lm_buflens[n];
+
+	return 0;
+}
+
+/**
+ * lustre_msg_buflen - return the length of buffer \a n in message \a m
+ * \param m lustre_msg (request or reply) to look at
+ * \param n message index (base 0)
+ *
+ * returns zero for non-existent message indices
+ */
+int lustre_msg_buflen(struct lustre_msg *m, int n)
+{
+	/* Unknown magic: log it and report -EINVAL instead of a length. */
+	if (m->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		CERROR("incorrect message magic: %08x\n", m->lm_magic);
+		return -EINVAL;
+	}
+
+	return lustre_msg_buflen_v2(m, n);
+}
+EXPORT_SYMBOL(lustre_msg_buflen);
+
+/*
+ * Store \a len as the length of buffer \a n in V2 message \a m. An index
+ * at or beyond lm_bufcount triggers LBUG().
+ */
+static inline void
+lustre_msg_set_buflen_v2(struct lustre_msg_v2 *m, int n, int len)
+{
+	if (n >= m->lm_bufcount)
+		LBUG();
+
+	m->lm_buflens[n] = len;
+}
+
+/*
+ * Magic-dispatching front end for lustre_msg_set_buflen_v2(); any magic
+ * other than V2 asserts.
+ */
+void lustre_msg_set_buflen(struct lustre_msg *m, int n, int len)
+{
+	if (m->lm_magic == LUSTRE_MSG_MAGIC_V2) {
+		lustre_msg_set_buflen_v2(m, n, len);
+		return;
+	}
+
+	LASSERTF(0, "incorrect message magic: %08x\n", m->lm_magic);
+}
+
+EXPORT_SYMBOL(lustre_msg_set_buflen);
+
+/* NB return the bufcount for lustre_msg_v2 format, so if message is packed
+ * in V1 format, the result is one bigger. (add struct ptlrpc_body). */
+int lustre_msg_bufcount(struct lustre_msg *m)
+{
+	/* Only V2 messages carry a usable count; others yield -EINVAL. */
+	if (m->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		CERROR("incorrect message magic: %08x\n", m->lm_magic);
+		return -EINVAL;
+	}
+
+	return m->lm_bufcount;
+}
+EXPORT_SYMBOL(lustre_msg_bufcount);
+
+/*
+ * Return buffer \a index of message \a m as a NUL-terminated string.
+ *
+ * \param m       message to read from
+ * \param index   buffer index holding the string
+ * \param max_len maximum string length allowed; 0 means the string
+ *                (including its terminator) must fill the buffer exactly
+ *
+ * \retval pointer to the string inside the message
+ * \retval NULL    buffer missing, not NUL-terminated, shorter than the
+ *                 buffer when \a max_len == 0, longer than \a max_len
+ *                 otherwise, or unknown message magic
+ */
+char *lustre_msg_string(struct lustre_msg *m, int index, int max_len)
+{
+	/* max_len == 0 means the string should fill the buffer */
+	char *str;
+	int slen, blen;
+
+	switch (m->lm_magic) {
+	case LUSTRE_MSG_MAGIC_V2:
+		str = lustre_msg_buf_v2(m, index, 0);
+		blen = lustre_msg_buflen_v2(m, index);
+		break;
+	default:
+		LASSERTF(0, "incorrect message magic: %08x\n", m->lm_magic);
+		/* str and blen are not set on this path; never continue
+		 * with them uninitialized. */
+		return NULL;
+	}
+
+	if (str == NULL) {
+		CERROR ("can't unpack string in msg %p buffer[%d]\n", m, index);
+		return NULL;
+	}
+
+	slen = strnlen(str, blen);
+
+	if (slen == blen) {		     /* not NULL terminated */
+		CERROR("can't unpack non-NULL terminated string in "
+			"msg %p buffer[%d] len %d\n", m, index, blen);
+		return NULL;
+	}
+
+	if (max_len == 0) {
+		if (slen != blen - 1) {
+			CERROR("can't unpack short string in msg %p "
+			       "buffer[%d] len %d: strlen %d\n",
+			       m, index, blen, slen);
+			return NULL;
+		}
+	} else if (slen > max_len) {
+		CERROR("can't unpack oversized string in msg %p "
+		       "buffer[%d] len %d strlen %d: max %d expected\n",
+		       m, index, blen, slen, max_len);
+		return NULL;
+	}
+
+	return str;
+}
+EXPORT_SYMBOL(lustre_msg_string);
+
+/* Wrap up the normal fixed length cases */
+static inline void *__lustre_swab_buf(struct lustre_msg *msg, int index,
+				      int min_size, void *swabber)
+{
+	void *ptr;
+
+	LASSERT(msg != NULL);
+
+	/* Unknown magic: log it; there is no buffer to hand back. */
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+		return NULL;
+	}
+
+	ptr = lustre_msg_buf_v2(msg, index, min_size);
+
+	/* swabber is passed as void *; call it as void (*)(void *). */
+	if (ptr != NULL && swabber != NULL)
+		((void (*)(void *))swabber)(ptr);
+
+	return ptr;
+}
+
+/*
+ * Return the ptlrpc_body stored at MSG_PTLRPC_BODY_OFF in \a msg, or NULL
+ * if that buffer is absent or smaller than struct ptlrpc_body_v2.
+ */
+static inline struct ptlrpc_body *lustre_msg_ptlrpc_body(struct lustre_msg *msg)
+{
+	return lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF,
+				 sizeof(struct ptlrpc_body_v2));
+}
+
+/*
+ * Return the lm_flags field of the message header. V1 messages (native or
+ * swabbed magic) report 0; any other unknown magic asserts and reports 0.
+ */
+__u32 lustre_msghdr_get_flags(struct lustre_msg *msg)
+{
+	__u32 magic = msg->lm_magic;
+
+	/* already in host endian */
+	if (magic == LUSTRE_MSG_MAGIC_V2)
+		return msg->lm_flags;
+
+	if (magic == LUSTRE_MSG_MAGIC_V1 ||
+	    magic == LUSTRE_MSG_MAGIC_V1_SWABBED)
+		return 0;
+
+	LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+	return 0;
+}
+EXPORT_SYMBOL(lustre_msghdr_get_flags);
+
+/*
+ * Store \a flags in the lm_flags field of the message header. V1 messages
+ * are left untouched; any other unknown magic asserts.
+ */
+void lustre_msghdr_set_flags(struct lustre_msg *msg, __u32 flags)
+{
+	__u32 magic = msg->lm_magic;
+
+	if (magic == LUSTRE_MSG_MAGIC_V1)
+		return;
+
+	if (magic == LUSTRE_MSG_MAGIC_V2) {
+		msg->lm_flags = flags;
+		return;
+	}
+
+	LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+}
+
+/*
+ * Return pb_flags from the message's ptlrpc_body, or 0 when the magic is
+ * not V2 or the body cannot be located.
+ */
+__u32 lustre_msg_get_flags(struct lustre_msg *msg)
+{
+	struct ptlrpc_body *pb;
+
+	/* flags might be printed in debug code while message
+	 * uninitialized */
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2)
+		return 0;
+
+	pb = lustre_msg_ptlrpc_body(msg);
+	if (pb == NULL) {
+		CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+		return 0;
+	}
+
+	return pb->pb_flags;
+}
+EXPORT_SYMBOL(lustre_msg_get_flags);
+
+/* OR \a flags into pb_flags of the message's ptlrpc_body (V2 only). */
+void lustre_msg_add_flags(struct lustre_msg *msg, int flags)
+{
+	struct ptlrpc_body *pb;
+
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+		return;
+	}
+
+	pb = lustre_msg_ptlrpc_body(msg);
+	LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+	pb->pb_flags |= flags;
+}
+EXPORT_SYMBOL(lustre_msg_add_flags);
+
+/* Overwrite pb_flags of the message's ptlrpc_body with \a flags (V2 only). */
+void lustre_msg_set_flags(struct lustre_msg *msg, int flags)
+{
+	struct ptlrpc_body *pb;
+
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+		return;
+	}
+
+	pb = lustre_msg_ptlrpc_body(msg);
+	LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+	pb->pb_flags = flags;
+}
+EXPORT_SYMBOL(lustre_msg_set_flags);
+
+/**
+ * Clear bits in pb_flags of a V2 message.
+ *
+ * Only the bits of \a flags that are also in MSG_GEN_FLAG_MASK are cleared.
+ * Asserts on a non-V2 magic and on a missing ptlrpc body.
+ */
+void lustre_msg_clear_flags(struct lustre_msg *msg, int flags)
+{
+	struct ptlrpc_body *pb;
+
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+		return;
+	}
+
+	pb = lustre_msg_ptlrpc_body(msg);
+	LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+	pb->pb_flags &= ~(MSG_GEN_FLAG_MASK & flags);
+}
+EXPORT_SYMBOL(lustre_msg_clear_flags);
+
+/**
+ * Return pb_op_flags from the ptlrpc body of a V2 message.
+ *
+ * A non-V2 magic silently yields 0; a V2 message without a ptlrpc body logs
+ * an error and yields 0.
+ */
+__u32 lustre_msg_get_op_flags(struct lustre_msg *msg)
+{
+	struct ptlrpc_body *body;
+
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2)
+		return 0;
+
+	body = lustre_msg_ptlrpc_body(msg);
+	if (body == NULL) {
+		CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+		return 0;
+	}
+
+	return body->pb_op_flags;
+}
+EXPORT_SYMBOL(lustre_msg_get_op_flags);
+
+/**
+ * OR \a flags into pb_op_flags of a V2 message.
+ *
+ * Asserts on a non-V2 magic and on a missing ptlrpc body.
+ */
+void lustre_msg_add_op_flags(struct lustre_msg *msg, int flags)
+{
+	struct ptlrpc_body *pb;
+
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+		return;
+	}
+
+	pb = lustre_msg_ptlrpc_body(msg);
+	LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+	pb->pb_op_flags |= flags;
+}
+EXPORT_SYMBOL(lustre_msg_add_op_flags);
+
+/**
+ * Overwrite pb_op_flags of a V2 message with \a flags.
+ *
+ * This is the assigning counterpart of lustre_msg_add_op_flags() and mirrors
+ * lustre_msg_set_flags().  It previously OR-ed the value in, which made it a
+ * duplicate of the "add" accessor and made it impossible to clear bits.
+ *
+ * Asserts on a non-V2 magic and on a missing ptlrpc body.
+ */
+void lustre_msg_set_op_flags(struct lustre_msg *msg, int flags)
+{
+	switch (msg->lm_magic) {
+	case LUSTRE_MSG_MAGIC_V2: {
+		struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+		LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+		/* "set" replaces the whole field; use add_op_flags to OR */
+		pb->pb_op_flags = flags;
+		return;
+	}
+	default:
+		LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+	}
+}
+EXPORT_SYMBOL(lustre_msg_set_op_flags);
+
+/**
+ * Return a pointer to pb_handle inside the ptlrpc body of a V2 message.
+ *
+ * Logs an error and returns NULL for a non-V2 magic or a missing ptlrpc body.
+ */
+struct lustre_handle *lustre_msg_get_handle(struct lustre_msg *msg)
+{
+	struct ptlrpc_body *body;
+
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+		return NULL;
+	}
+
+	body = lustre_msg_ptlrpc_body(msg);
+	if (body == NULL) {
+		CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+		return NULL;
+	}
+
+	return &body->pb_handle;
+}
+EXPORT_SYMBOL(lustre_msg_get_handle);
+
+/**
+ * Return pb_type from the ptlrpc body of a V2 message.
+ *
+ * Logs an error and returns PTL_RPC_MSG_ERR for a non-V2 magic or a missing
+ * ptlrpc body.
+ */
+__u32 lustre_msg_get_type(struct lustre_msg *msg)
+{
+	struct ptlrpc_body *body;
+
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+		return PTL_RPC_MSG_ERR;
+	}
+
+	body = lustre_msg_ptlrpc_body(msg);
+	if (body == NULL) {
+		CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+		return PTL_RPC_MSG_ERR;
+	}
+
+	return body->pb_type;
+}
+EXPORT_SYMBOL(lustre_msg_get_type);
+
+/**
+ * Return pb_version from the ptlrpc body of a V2 message.
+ *
+ * Logs an error and returns 0 for a non-V2 magic or a missing ptlrpc body.
+ */
+__u32 lustre_msg_get_version(struct lustre_msg *msg)
+{
+	struct ptlrpc_body *body;
+
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+		return 0;
+	}
+
+	body = lustre_msg_ptlrpc_body(msg);
+	if (body == NULL) {
+		CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+		return 0;
+	}
+
+	return body->pb_version;
+}
+EXPORT_SYMBOL(lustre_msg_get_version);
+
+/**
+ * OR \a version into pb_version of a V2 message.
+ *
+ * Asserts on a non-V2 magic and on a missing ptlrpc body.
+ */
+void lustre_msg_add_version(struct lustre_msg *msg, int version)
+{
+	struct ptlrpc_body *pb;
+
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+		return;
+	}
+
+	pb = lustre_msg_ptlrpc_body(msg);
+	LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+	pb->pb_version |= version;
+}
+EXPORT_SYMBOL(lustre_msg_add_version);
+
+/**
+ * Return pb_opc from the ptlrpc body of a V2 message.
+ *
+ * A V2 message without a ptlrpc body logs an error and yields 0.  A non-V2
+ * magic logs an error and then hits LBUG().
+ */
+__u32 lustre_msg_get_opc(struct lustre_msg *msg)
+{
+	struct ptlrpc_body *body;
+
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		CERROR("incorrect message magic: %08x(msg:%p)\n", msg->lm_magic, msg);
+		LBUG();
+		return 0;
+	}
+
+	body = lustre_msg_ptlrpc_body(msg);
+	if (body == NULL) {
+		CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+		return 0;
+	}
+
+	return body->pb_opc;
+}
+EXPORT_SYMBOL(lustre_msg_get_opc);
+
+/**
+ * Return pb_last_xid from the ptlrpc body of a V2 message.
+ *
+ * Logs an error and returns 0 for a non-V2 magic or a missing ptlrpc body.
+ */
+__u64 lustre_msg_get_last_xid(struct lustre_msg *msg)
+{
+	struct ptlrpc_body *body;
+
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+		return 0;
+	}
+
+	body = lustre_msg_ptlrpc_body(msg);
+	if (body == NULL) {
+		CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+		return 0;
+	}
+
+	return body->pb_last_xid;
+}
+EXPORT_SYMBOL(lustre_msg_get_last_xid);
+
+/**
+ * Return pb_last_committed from the ptlrpc body of a V2 message.
+ *
+ * Logs an error and returns 0 for a non-V2 magic or a missing ptlrpc body.
+ */
+__u64 lustre_msg_get_last_committed(struct lustre_msg *msg)
+{
+	struct ptlrpc_body *body;
+
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+		return 0;
+	}
+
+	body = lustre_msg_ptlrpc_body(msg);
+	if (body == NULL) {
+		CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+		return 0;
+	}
+
+	return body->pb_last_committed;
+}
+EXPORT_SYMBOL(lustre_msg_get_last_committed);
+
+/**
+ * Return the pb_pre_versions array of a V2 message.
+ *
+ * A V1 message silently yields NULL.  A V2 message without a ptlrpc body, or
+ * any other magic, logs an error and yields NULL.
+ */
+__u64 *lustre_msg_get_versions(struct lustre_msg *msg)
+{
+	struct ptlrpc_body *body;
+
+	if (msg->lm_magic == LUSTRE_MSG_MAGIC_V1)
+		return NULL;
+
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+		return NULL;
+	}
+
+	body = lustre_msg_ptlrpc_body(msg);
+	if (body == NULL) {
+		CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+		return NULL;
+	}
+
+	return body->pb_pre_versions;
+}
+EXPORT_SYMBOL(lustre_msg_get_versions);
+
+/**
+ * Return pb_transno from the ptlrpc body of a V2 message.
+ *
+ * Logs an error and returns 0 for a non-V2 magic or a missing ptlrpc body.
+ */
+__u64 lustre_msg_get_transno(struct lustre_msg *msg)
+{
+	struct ptlrpc_body *body;
+
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+		return 0;
+	}
+
+	body = lustre_msg_ptlrpc_body(msg);
+	if (body == NULL) {
+		CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+		return 0;
+	}
+
+	return body->pb_transno;
+}
+EXPORT_SYMBOL(lustre_msg_get_transno);
+
+/**
+ * Return pb_status from the ptlrpc body of a V2 message.
+ *
+ * A non-V2 magic silently yields -EINVAL, because the status might be printed
+ * in debug code while the message is still uninitialized.  A V2 message
+ * without a ptlrpc body logs an error and yields -EINVAL.
+ */
+int lustre_msg_get_status(struct lustre_msg *msg)
+{
+	struct ptlrpc_body *body;
+
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2)
+		return -EINVAL;
+
+	body = lustre_msg_ptlrpc_body(msg);
+	if (body == NULL) {
+		CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+		return -EINVAL;
+	}
+
+	return body->pb_status;
+}
+EXPORT_SYMBOL(lustre_msg_get_status);
+
+/**
+ * Return pb_slv from the ptlrpc body of a V2 message.
+ *
+ * Logs an error and returns -EINVAL for a non-V2 magic or a missing ptlrpc
+ * body.  Note the return type is __u64, so the error comes back as -EINVAL
+ * converted to an unsigned 64-bit value.
+ */
+__u64 lustre_msg_get_slv(struct lustre_msg *msg)
+{
+	struct ptlrpc_body *body;
+
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		CERROR("invalid msg magic %08x\n", msg->lm_magic);
+		return -EINVAL;
+	}
+
+	body = lustre_msg_ptlrpc_body(msg);
+	if (body == NULL) {
+		CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+		return -EINVAL;
+	}
+
+	return body->pb_slv;
+}
+EXPORT_SYMBOL(lustre_msg_get_slv);
+
+
+/**
+ * Store \a slv into pb_slv of a V2 message.
+ *
+ * Unlike most setters here this one does not assert: a non-V2 magic or a
+ * missing ptlrpc body only logs an error and leaves the message unchanged.
+ */
+void lustre_msg_set_slv(struct lustre_msg *msg, __u64 slv)
+{
+	struct ptlrpc_body *body;
+
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		CERROR("invalid msg magic %x\n", msg->lm_magic);
+		return;
+	}
+
+	body = lustre_msg_ptlrpc_body(msg);
+	if (body == NULL) {
+		CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+		return;
+	}
+
+	body->pb_slv = slv;
+}
+EXPORT_SYMBOL(lustre_msg_set_slv);
+
+/**
+ * Return pb_limit from the ptlrpc body of a V2 message.
+ *
+ * Logs an error and returns -EINVAL for a non-V2 magic or a missing ptlrpc
+ * body.  Note the return type is __u32, so the error comes back as -EINVAL
+ * converted to an unsigned 32-bit value.
+ */
+__u32 lustre_msg_get_limit(struct lustre_msg *msg)
+{
+	struct ptlrpc_body *body;
+
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		CERROR("invalid msg magic %x\n", msg->lm_magic);
+		return -EINVAL;
+	}
+
+	body = lustre_msg_ptlrpc_body(msg);
+	if (body == NULL) {
+		CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+		return -EINVAL;
+	}
+
+	return body->pb_limit;
+}
+EXPORT_SYMBOL(lustre_msg_get_limit);
+
+
+/**
+ * Store \a limit into pb_limit of a V2 message.
+ *
+ * A non-V2 magic or a missing ptlrpc body only logs an error and leaves the
+ * message unchanged.
+ *
+ * NOTE(review): \a limit is __u64 while lustre_msg_get_limit() returns __u32;
+ * presumably pb_limit is 32 bits wide and the value is narrowed here --
+ * confirm against struct ptlrpc_body.
+ */
+void lustre_msg_set_limit(struct lustre_msg *msg, __u64 limit)
+{
+	struct ptlrpc_body *body;
+
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		CERROR("invalid msg magic %08x\n", msg->lm_magic);
+		return;
+	}
+
+	body = lustre_msg_ptlrpc_body(msg);
+	if (body == NULL) {
+		CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+		return;
+	}
+
+	body->pb_limit = limit;
+}
+EXPORT_SYMBOL(lustre_msg_set_limit);
+
+/**
+ * Return pb_conn_cnt from the ptlrpc body of a V2 message.
+ *
+ * Logs an error and returns 0 for a non-V2 magic or a missing ptlrpc body.
+ */
+__u32 lustre_msg_get_conn_cnt(struct lustre_msg *msg)
+{
+	struct ptlrpc_body *body;
+
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+		return 0;
+	}
+
+	body = lustre_msg_ptlrpc_body(msg);
+	if (body == NULL) {
+		CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+		return 0;
+	}
+
+	return body->pb_conn_cnt;
+}
+EXPORT_SYMBOL(lustre_msg_get_conn_cnt);
+
+/**
+ * Tell whether \a msg carries a V1 magic, in either byte order.
+ *
+ * \retval 1 lm_magic is LUSTRE_MSG_MAGIC_V1 or LUSTRE_MSG_MAGIC_V1_SWABBED
+ * \retval 0 otherwise
+ */
+int lustre_msg_is_v1(struct lustre_msg *msg)
+{
+	if (msg->lm_magic == LUSTRE_MSG_MAGIC_V1 ||
+	    msg->lm_magic == LUSTRE_MSG_MAGIC_V1_SWABBED)
+		return 1;
+
+	return 0;
+}
+EXPORT_SYMBOL(lustre_msg_is_v1);
+
+/**
+ * Return lm_magic when it is LUSTRE_MSG_MAGIC_V2.
+ *
+ * Any other magic logs an error and yields 0.
+ */
+__u32 lustre_msg_get_magic(struct lustre_msg *msg)
+{
+	if (msg->lm_magic == LUSTRE_MSG_MAGIC_V2)
+		return msg->lm_magic;
+
+	CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+	return 0;
+}
+EXPORT_SYMBOL(lustre_msg_get_magic);
+
+/**
+ * Return pb_timeout from the ptlrpc body of a V2 message.
+ *
+ * V1 messages (either byte order) silently yield 0.  A V2 message without a
+ * ptlrpc body, or any other magic, logs an error and yields 0.
+ */
+__u32 lustre_msg_get_timeout(struct lustre_msg *msg)
+{
+	struct ptlrpc_body *body;
+
+	if (msg->lm_magic == LUSTRE_MSG_MAGIC_V1 ||
+	    msg->lm_magic == LUSTRE_MSG_MAGIC_V1_SWABBED)
+		return 0;
+
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+		return 0;
+	}
+
+	body = lustre_msg_ptlrpc_body(msg);
+	if (body == NULL) {
+		CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+		return 0;
+	}
+
+	return body->pb_timeout;
+}
+
+/**
+ * Return pb_service_time from the ptlrpc body of a V2 message.
+ *
+ * V1 messages (either byte order) silently yield 0.  A V2 message without a
+ * ptlrpc body, or any other magic, logs an error and yields 0.
+ */
+__u32 lustre_msg_get_service_time(struct lustre_msg *msg)
+{
+	struct ptlrpc_body *body;
+
+	if (msg->lm_magic == LUSTRE_MSG_MAGIC_V1 ||
+	    msg->lm_magic == LUSTRE_MSG_MAGIC_V1_SWABBED)
+		return 0;
+
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+		return 0;
+	}
+
+	body = lustre_msg_ptlrpc_body(msg);
+	if (body == NULL) {
+		CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+		return 0;
+	}
+
+	return body->pb_service_time;
+}
+
+/**
+ * Return the pb_jobid field of a V2 message.
+ *
+ * The body is looked up with the full sizeof(struct ptlrpc_body) rather than
+ * through lustre_msg_ptlrpc_body(); when that lookup fails NULL is returned
+ * without logging.  V1 messages (either byte order) silently yield NULL; any
+ * other magic logs an error and yields NULL.
+ */
+char *lustre_msg_get_jobid(struct lustre_msg *msg)
+{
+	struct ptlrpc_body *body;
+
+	if (msg->lm_magic == LUSTRE_MSG_MAGIC_V1 ||
+	    msg->lm_magic == LUSTRE_MSG_MAGIC_V1_SWABBED)
+		return NULL;
+
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+		return NULL;
+	}
+
+	body = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF,
+				 sizeof(struct ptlrpc_body));
+	if (body == NULL)
+		return NULL;
+
+	return body->pb_jobid;
+}
+EXPORT_SYMBOL(lustre_msg_get_jobid);
+
+/**
+ * Return lm_cksum from the header of a V2 message.
+ *
+ * Any other magic logs an error and yields 0.
+ */
+__u32 lustre_msg_get_cksum(struct lustre_msg *msg)
+{
+	if (msg->lm_magic == LUSTRE_MSG_MAGIC_V2)
+		return msg->lm_cksum;
+
+	CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+	return 0;
+}
+
+/*
+ * lustre_msg_calc_cksum(): CRC32 over the ptlrpc body of a V2 message.
+ *
+ * The digest is taken with cfs_crypto_hash_digest(CFS_HASH_ALG_CRC32) over
+ * the buffer at MSG_PTLRPC_BODY_OFF.  With the compatibility signature, a
+ * non-zero compat18 limits the checksummed length to
+ * ptlrpc_body_cksum_size_compat18 bytes; otherwise the full buffer length
+ * reported by lustre_msg_buflen() is used.
+ *
+ * A missing ptlrpc body is asserted on in both build variants.  The result
+ * is initialised to 0 so that a failing digest call cannot hand back
+ * uninitialised stack contents.  Any non-V2 magic logs an error and
+ * returns 0.
+ */
+#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 7, 50, 0)
+/*
+ * In 1.6 and 1.8 the checksum was computed only on struct ptlrpc_body as
+ * it was in 1.6 (88 bytes, smaller than the full size in 1.8).  It makes
+ * more sense to compute the checksum on the full ptlrpc_body, regardless
+ * of what size it is, but in order to keep interoperability with 1.8 we
+ * can optionally also checksum only the first 88 bytes (caller decides). */
+# define ptlrpc_body_cksum_size_compat18	 88
+
+__u32 lustre_msg_calc_cksum(struct lustre_msg *msg, int compat18)
+#else
+# warning "remove checksum compatibility support for b1_8"
+__u32 lustre_msg_calc_cksum(struct lustre_msg *msg)
+#endif
+{
+	switch (msg->lm_magic) {
+	case LUSTRE_MSG_MAGIC_V2: {
+		struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+		/* stays 0 if the digest call fails without filling it in */
+		__u32 crc = 0;
+		unsigned int hsize = sizeof(crc);
+#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 7, 50, 0)
+		__u32 len = compat18 ? ptlrpc_body_cksum_size_compat18 :
+			    lustre_msg_buflen(msg, MSG_PTLRPC_BODY_OFF);
+#else
+		__u32 len = lustre_msg_buflen(msg, MSG_PTLRPC_BODY_OFF);
+#endif
+
+		LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+		cfs_crypto_hash_digest(CFS_HASH_ALG_CRC32, (unsigned char *)pb,
+				       len, NULL, 0, (unsigned char *)&crc,
+				       &hsize);
+		return crc;
+	}
+	default:
+		CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+		return 0;
+	}
+}
+
+/**
+ * Copy \a *handle into pb_handle of a V2 message.
+ *
+ * Asserts on a non-V2 magic and on a missing ptlrpc body.
+ */
+void lustre_msg_set_handle(struct lustre_msg *msg, struct lustre_handle *handle)
+{
+	struct ptlrpc_body *pb;
+
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+		return;
+	}
+
+	pb = lustre_msg_ptlrpc_body(msg);
+	LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+	pb->pb_handle = *handle;
+}
+EXPORT_SYMBOL(lustre_msg_set_handle);
+
+/**
+ * Store \a type into pb_type of a V2 message.
+ *
+ * Asserts on a non-V2 magic and on a missing ptlrpc body.
+ */
+void lustre_msg_set_type(struct lustre_msg *msg, __u32 type)
+{
+	struct ptlrpc_body *pb;
+
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+		return;
+	}
+
+	pb = lustre_msg_ptlrpc_body(msg);
+	LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+	pb->pb_type = type;
+}
+EXPORT_SYMBOL(lustre_msg_set_type);
+
+/**
+ * Store \a opc into pb_opc of a V2 message.
+ *
+ * Asserts on a non-V2 magic and on a missing ptlrpc body.
+ */
+void lustre_msg_set_opc(struct lustre_msg *msg, __u32 opc)
+{
+	struct ptlrpc_body *pb;
+
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+		return;
+	}
+
+	pb = lustre_msg_ptlrpc_body(msg);
+	LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+	pb->pb_opc = opc;
+}
+EXPORT_SYMBOL(lustre_msg_set_opc);
+
+/**
+ * Store \a last_xid into pb_last_xid of a V2 message.
+ *
+ * Asserts on a non-V2 magic and on a missing ptlrpc body.
+ */
+void lustre_msg_set_last_xid(struct lustre_msg *msg, __u64 last_xid)
+{
+	struct ptlrpc_body *pb;
+
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+		return;
+	}
+
+	pb = lustre_msg_ptlrpc_body(msg);
+	LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+	pb->pb_last_xid = last_xid;
+}
+EXPORT_SYMBOL(lustre_msg_set_last_xid);
+
+/**
+ * Store \a last_committed into pb_last_committed of a V2 message.
+ *
+ * Asserts on a non-V2 magic and on a missing ptlrpc body.
+ */
+void lustre_msg_set_last_committed(struct lustre_msg *msg, __u64 last_committed)
+{
+	struct ptlrpc_body *pb;
+
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+		return;
+	}
+
+	pb = lustre_msg_ptlrpc_body(msg);
+	LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+	pb->pb_last_committed = last_committed;
+}
+EXPORT_SYMBOL(lustre_msg_set_last_committed);
+
+/**
+ * Copy the first four entries of \a versions into pb_pre_versions[0..3] of a
+ * V2 message.
+ *
+ * A V1 message is left untouched.  Asserts on any other non-V2 magic and on
+ * a missing ptlrpc body.
+ */
+void lustre_msg_set_versions(struct lustre_msg *msg, __u64 *versions)
+{
+	struct ptlrpc_body *pb;
+	int i;
+
+	if (msg->lm_magic == LUSTRE_MSG_MAGIC_V1)
+		return;
+
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+		return;
+	}
+
+	pb = lustre_msg_ptlrpc_body(msg);
+	LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+
+	/* exactly four slots are transferred, in index order */
+	for (i = 0; i < 4; i++)
+		pb->pb_pre_versions[i] = versions[i];
+}
+EXPORT_SYMBOL(lustre_msg_set_versions);
+
+/**
+ * Store \a transno into pb_transno of a V2 message.
+ *
+ * Asserts on a non-V2 magic and on a missing ptlrpc body.
+ */
+void lustre_msg_set_transno(struct lustre_msg *msg, __u64 transno)
+{
+	struct ptlrpc_body *pb;
+
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+		return;
+	}
+
+	pb = lustre_msg_ptlrpc_body(msg);
+	LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+	pb->pb_transno = transno;
+}
+EXPORT_SYMBOL(lustre_msg_set_transno);
+
+/**
+ * Store \a status into pb_status of a V2 message.
+ *
+ * Asserts on a non-V2 magic and on a missing ptlrpc body.
+ */
+void lustre_msg_set_status(struct lustre_msg *msg, __u32 status)
+{
+	struct ptlrpc_body *pb;
+
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+		return;
+	}
+
+	pb = lustre_msg_ptlrpc_body(msg);
+	LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+	pb->pb_status = status;
+}
+EXPORT_SYMBOL(lustre_msg_set_status);
+
+/**
+ * Store \a conn_cnt into pb_conn_cnt of a V2 message.
+ *
+ * Asserts on a non-V2 magic and on a missing ptlrpc body.
+ */
+void lustre_msg_set_conn_cnt(struct lustre_msg *msg, __u32 conn_cnt)
+{
+	struct ptlrpc_body *pb;
+
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+		return;
+	}
+
+	pb = lustre_msg_ptlrpc_body(msg);
+	LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+	pb->pb_conn_cnt = conn_cnt;
+}
+EXPORT_SYMBOL(lustre_msg_set_conn_cnt);
+
+/**
+ * Store \a timeout into pb_timeout of a V2 message.
+ *
+ * A V1 message is left untouched.  Asserts on any other non-V2 magic and on
+ * a missing ptlrpc body.
+ */
+void lustre_msg_set_timeout(struct lustre_msg *msg, __u32 timeout)
+{
+	struct ptlrpc_body *pb;
+
+	if (msg->lm_magic == LUSTRE_MSG_MAGIC_V1)
+		return;
+
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+		return;
+	}
+
+	pb = lustre_msg_ptlrpc_body(msg);
+	LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+	pb->pb_timeout = timeout;
+}
+
+/**
+ * Store \a service_time into pb_service_time of a V2 message.
+ *
+ * A V1 message is left untouched.  Asserts on any other non-V2 magic and on
+ * a missing ptlrpc body.
+ */
+void lustre_msg_set_service_time(struct lustre_msg *msg, __u32 service_time)
+{
+	struct ptlrpc_body *pb;
+
+	if (msg->lm_magic == LUSTRE_MSG_MAGIC_V1)
+		return;
+
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+		return;
+	}
+
+	pb = lustre_msg_ptlrpc_body(msg);
+	LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+	pb->pb_service_time = service_time;
+}
+
+/**
+ * Fill in pb_jobid of a V2 message.
+ *
+ * With a non-NULL \a jobid, JOBSTATS_JOBID_SIZE bytes are copied from it
+ * (NOTE(review): the source buffer is presumably at least that large --
+ * confirm at the call sites).  With a NULL \a jobid, lustre_get_jobid() is
+ * asked to fill the field, but only when it is still empty.
+ *
+ * Nothing is done for V1 messages, for an opcode of 0, or for the LDLM
+ * blocking/completion/glimpse callback opcodes.  Asserts on any other non-V2
+ * magic and on a missing full-size ptlrpc body.
+ */
+void lustre_msg_set_jobid(struct lustre_msg *msg, char *jobid)
+{
+	struct ptlrpc_body *pb;
+	__u32 opc;
+
+	if (msg->lm_magic == LUSTRE_MSG_MAGIC_V1)
+		return;
+
+	if (msg->lm_magic != LUSTRE_MSG_MAGIC_V2) {
+		LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+		return;
+	}
+
+	opc = lustre_msg_get_opc(msg);
+
+	/* Don't set jobid for ldlm ast RPCs, they've been shrunk.
+	 * See the comment in ptlrpc_request_pack(). */
+	if (!opc || opc == LDLM_BL_CALLBACK ||
+	    opc == LDLM_CP_CALLBACK || opc == LDLM_GL_CALLBACK)
+		return;
+
+	pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF,
+			       sizeof(struct ptlrpc_body));
+	LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+
+	if (jobid == NULL) {
+		/* keep an already populated jobid */
+		if (pb->pb_jobid[0] == '\0')
+			lustre_get_jobid(pb->pb_jobid);
+		return;
+	}
+
+	memcpy(pb->pb_jobid, jobid, JOBSTATS_JOBID_SIZE);
+}
+EXPORT_SYMBOL(lustre_msg_set_jobid);
+
+/**
+ * Store \a cksum into lm_cksum of a V2 message header.
+ *
+ * A V1 message is left untouched; any other magic trips LASSERTF.
+ */
+void lustre_msg_set_cksum(struct lustre_msg *msg, __u32 cksum)
+{
+	if (msg->lm_magic == LUSTRE_MSG_MAGIC_V1)
+		return;
+
+	if (msg->lm_magic == LUSTRE_MSG_MAGIC_V2) {
+		msg->lm_cksum = cksum;
+		return;
+	}
+
+	LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+}
+
+
+/**
+ * Compute the expected reply length of \a req from its capsule.
+ *
+ * The number of server-side buffers comes from req_capsule_filled_sizes()
+ * and their sizes from rq_pill.rc_area[RCL_SERVER].  The result is stored in
+ * rq_replen and, for a V2 request message, also in lm_repsize.
+ */
+void ptlrpc_request_set_replen(struct ptlrpc_request *req)
+{
+	int count;
+
+	count = req_capsule_filled_sizes(&req->rq_pill, RCL_SERVER);
+	req->rq_replen = lustre_msg_size(req->rq_reqmsg->lm_magic, count,
+					 req->rq_pill.rc_area[RCL_SERVER]);
+
+	if (LUSTRE_MSG_MAGIC_V2 == req->rq_reqmsg->lm_magic)
+		req->rq_reqmsg->lm_repsize = req->rq_replen;
+}
+EXPORT_SYMBOL(ptlrpc_request_set_replen);
+
+/**
+ * Set the expected reply length of \a req from an explicit buffer list.
+ *
+ * rq_replen becomes lustre_msg_size() of \a count buffers with sizes
+ * \a lens; for a V2 request message the value is mirrored into lm_repsize.
+ */
+void ptlrpc_req_set_repsize(struct ptlrpc_request *req, int count, __u32 *lens)
+{
+	req->rq_replen = lustre_msg_size(req->rq_reqmsg->lm_magic, count, lens);
+
+	if (LUSTRE_MSG_MAGIC_V2 == req->rq_reqmsg->lm_magic)
+		req->rq_reqmsg->lm_repsize = req->rq_replen;
+}
+EXPORT_SYMBOL(ptlrpc_req_set_repsize);
+
+/**
+ * Send a remote set_info_async.
+ *
+ * This may go from client to server or server to client.
+ *
+ * An RQF_OBD_SET_INFO request is allocated on \a imp, sized for \a keylen
+ * and \a vallen, packed with \a version / \a opcode, and filled with copies
+ * of \a key and \a val.  With a non-NULL \a set the request is added to the
+ * set and ptlrpc_check_set() is called; otherwise the request is sent with
+ * ptlrpc_queue_wait() and released afterwards.
+ *
+ * \retval -ENOMEM	request allocation failed
+ * \retval rc		result of ptlrpc_request_pack() on failure, of
+ *			ptlrpc_queue_wait() in the synchronous case, or the
+ *			(zero) pack result when the request went into \a set
+ */
+int do_set_info_async(struct obd_import *imp,
+		      int opcode, int version,
+		      obd_count keylen, void *key,
+		      obd_count vallen, void *val,
+		      struct ptlrpc_request_set *set)
+{
+	struct ptlrpc_request *req;
+	char		  *buf;
+	int		    rc;
+	ENTRY;
+
+	req = ptlrpc_request_alloc(imp, &RQF_OBD_SET_INFO);
+	if (!req)
+		RETURN(-ENOMEM);
+
+	/* the client-side buffers must be sized before packing */
+	req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_KEY,
+			     RCL_CLIENT, keylen);
+	req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_VAL,
+			     RCL_CLIENT, vallen);
+
+	rc = ptlrpc_request_pack(req, version, opcode);
+	if (rc != 0) {
+		ptlrpc_request_free(req);
+		RETURN(rc);
+	}
+
+	buf = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_KEY);
+	memcpy(buf, key, keylen);
+	buf = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_VAL);
+	memcpy(buf, val, vallen);
+
+	ptlrpc_request_set_replen(req);
+
+	if (set == NULL) {
+		/* synchronous: wait for the reply, then drop the request */
+		rc = ptlrpc_queue_wait(req);
+		ptlrpc_req_finished(req);
+		RETURN(rc);
+	}
+
+	ptlrpc_set_add_req(set, req);
+	ptlrpc_check_set(NULL, set);
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(do_set_info_async);
+
+/* Byte-flipping (endianness swab) routines for all wire types declared in
+ * lustre_idl.h are implemented here.
+ */
+/**
+ * Byte-swap, in place, the integer fields of a ptlrpc body up to and
+ * including pb_pre_versions[0..3].
+ */
+void lustre_swab_ptlrpc_body(struct ptlrpc_body *b)
+{
+	int i;
+
+	__swab32s(&b->pb_type);
+	__swab32s(&b->pb_version);
+	__swab32s(&b->pb_opc);
+	__swab32s(&b->pb_status);
+	__swab64s(&b->pb_last_xid);
+	__swab64s(&b->pb_last_seen);
+	__swab64s(&b->pb_last_committed);
+	__swab64s(&b->pb_transno);
+	__swab32s(&b->pb_flags);
+	__swab32s(&b->pb_op_flags);
+	__swab32s(&b->pb_conn_cnt);
+	__swab32s(&b->pb_timeout);
+	__swab32s(&b->pb_service_time);
+	__swab32s(&b->pb_limit);
+	__swab64s(&b->pb_slv);
+	for (i = 0; i < 4; i++)
+		__swab64s(&b->pb_pre_versions[i]);
+
+	/* compile-time check only: pb_padding exists and is not swabbed */
+	CLASSERT(offsetof(typeof(*b), pb_padding) != 0);
+	/* While we need to maintain compatibility between
+	 * clients and servers without ptlrpc_body_v2 (< 2.3)
+	 * do not swab any fields beyond pb_jobid, as we are
+	 * using this swab function for both ptlrpc_body
+	 * and ptlrpc_body_v2. */
+	CLASSERT(offsetof(typeof(*b), pb_jobid) != 0);
+}
+EXPORT_SYMBOL(lustre_swab_ptlrpc_body);
+
+/**
+ * Byte-swap, in place, the fields of struct obd_connect_data.
+ *
+ * ocd_connect_flags is swabbed first, so the flag tests further down see
+ * the host-order value.
+ */
+void lustre_swab_connect(struct obd_connect_data *ocd)
+{
+	__swab64s(&ocd->ocd_connect_flags);
+	__swab32s(&ocd->ocd_version);
+	__swab32s(&ocd->ocd_grant);
+	__swab64s(&ocd->ocd_ibits_known);
+	__swab32s(&ocd->ocd_index);
+	__swab32s(&ocd->ocd_brw_size);
+	/* ocd_blocksize and ocd_inodespace are deliberately not swabbed.
+	 * NOTE(review): the original note called them "8-byte values";
+	 * presumably 8-bit is meant, since wider fields would need swabbing
+	 * -- confirm against struct obd_connect_data. */
+	__swab16s(&ocd->ocd_grant_extent);
+	__swab32s(&ocd->ocd_unused);
+	__swab64s(&ocd->ocd_transno);
+	__swab32s(&ocd->ocd_group);
+	__swab32s(&ocd->ocd_cksum_types);
+	__swab32s(&ocd->ocd_instance);
+
+	/* Fields after ocd_cksum_types are only accessible by the receiver
+	 * if the corresponding flag in ocd_connect_flags is set. Accessing
+	 * any field after ocd_maxbytes on the receiver without a valid flag
+	 * may result in out-of-bound memory access and kernel oops. */
+	if (ocd->ocd_connect_flags & OBD_CONNECT_MAX_EASIZE)
+		__swab32s(&ocd->ocd_max_easize);
+	if (ocd->ocd_connect_flags & OBD_CONNECT_MAXBYTES)
+		__swab64s(&ocd->ocd_maxbytes);
+
+	/* compile-time checks: the padding fields exist and stay unswabbed */
+	CLASSERT(offsetof(typeof(*ocd), padding1) != 0);
+	CLASSERT(offsetof(typeof(*ocd), padding2) != 0);
+	CLASSERT(offsetof(typeof(*ocd), padding3) != 0);
+	CLASSERT(offsetof(typeof(*ocd), padding4) != 0);
+	CLASSERT(offsetof(typeof(*ocd), padding5) != 0);
+	CLASSERT(offsetof(typeof(*ocd), padding6) != 0);
+	CLASSERT(offsetof(typeof(*ocd), padding7) != 0);
+	CLASSERT(offsetof(typeof(*ocd), padding8) != 0);
+	CLASSERT(offsetof(typeof(*ocd), padding9) != 0);
+	CLASSERT(offsetof(typeof(*ocd), paddingA) != 0);
+	CLASSERT(offsetof(typeof(*ocd), paddingB) != 0);
+	CLASSERT(offsetof(typeof(*ocd), paddingC) != 0);
+	CLASSERT(offsetof(typeof(*ocd), paddingD) != 0);
+	CLASSERT(offsetof(typeof(*ocd), paddingE) != 0);
+	CLASSERT(offsetof(typeof(*ocd), paddingF) != 0);
+}
+
+/**
+ * Byte-swap, in place, the fields of struct obdo.
+ *
+ * o_oi is delegated to lustre_swab_ost_id().  o_handle and o_lcookie are
+ * not touched here.
+ */
+void lustre_swab_obdo(struct obdo *o)
+{
+	__swab64s(&o->o_valid);
+	lustre_swab_ost_id(&o->o_oi);
+	__swab64s(&o->o_parent_seq);
+	__swab64s(&o->o_size);
+	__swab64s(&o->o_mtime);
+	__swab64s(&o->o_atime);
+	__swab64s(&o->o_ctime);
+	__swab64s(&o->o_blocks);
+	__swab64s(&o->o_grant);
+	__swab32s(&o->o_blksize);
+	__swab32s(&o->o_mode);
+	__swab32s(&o->o_uid);
+	__swab32s(&o->o_gid);
+	__swab32s(&o->o_flags);
+	__swab32s(&o->o_nlink);
+	__swab32s(&o->o_parent_oid);
+	__swab32s(&o->o_misc);
+	__swab64s(&o->o_ioepoch);
+	__swab32s(&o->o_stripe_idx);
+	__swab32s(&o->o_parent_ver);
+	/* o_handle is opaque */
+	/* o_lcookie is swabbed elsewhere */
+	__swab32s(&o->o_uid_h);
+	__swab32s(&o->o_gid_h);
+	__swab64s(&o->o_data_version);
+
+	/* compile-time checks: the padding fields exist and stay unswabbed */
+	CLASSERT(offsetof(typeof(*o), o_padding_4) != 0);
+	CLASSERT(offsetof(typeof(*o), o_padding_5) != 0);
+	CLASSERT(offsetof(typeof(*o), o_padding_6) != 0);
+}
+EXPORT_SYMBOL(lustre_swab_obdo);
+
+/**
+ * Byte-swap, in place, the fields of struct obd_statfs.
+ *
+ * os_fsid, os_fprecreated and the os_spare* fields are not swabbed.
+ */
+void lustre_swab_obd_statfs(struct obd_statfs *os)
+{
+	__swab64s(&os->os_type);
+	__swab64s(&os->os_blocks);
+	__swab64s(&os->os_bfree);
+	__swab64s(&os->os_bavail);
+	__swab64s(&os->os_files);
+	__swab64s(&os->os_ffree);
+	/* no need to swab os_fsid */
+	__swab32s(&os->os_bsize);
+	__swab32s(&os->os_namelen);
+	__swab64s(&os->os_maxbytes);
+	__swab32s(&os->os_state);
+
+	/* compile-time checks: these fields exist and stay unswabbed */
+	CLASSERT(offsetof(typeof(*os), os_fprecreated) != 0);
+	CLASSERT(offsetof(typeof(*os), os_spare2) != 0);
+	CLASSERT(offsetof(typeof(*os), os_spare3) != 0);
+	CLASSERT(offsetof(typeof(*os), os_spare4) != 0);
+	CLASSERT(offsetof(typeof(*os), os_spare5) != 0);
+	CLASSERT(offsetof(typeof(*os), os_spare6) != 0);
+	CLASSERT(offsetof(typeof(*os), os_spare7) != 0);
+	CLASSERT(offsetof(typeof(*os), os_spare8) != 0);
+	CLASSERT(offsetof(typeof(*os), os_spare9) != 0);
+}
+EXPORT_SYMBOL(lustre_swab_obd_statfs);
+
+/**
+ * Byte-swap, in place, struct obd_ioobj: the object id goes through
+ * lustre_swab_ost_id(), followed by the two 32-bit counters.
+ */
+void lustre_swab_obd_ioobj(struct obd_ioobj *ioo)
+{
+	lustre_swab_ost_id(&ioo->ioo_oid);
+
+	__swab32s(&ioo->ioo_max_brw);
+	__swab32s(&ioo->ioo_bufcnt);
+}
+EXPORT_SYMBOL(lustre_swab_obd_ioobj);
+
+/**
+ * Byte-swap, in place, the offset, len and flags of struct niobuf_remote.
+ */
+void lustre_swab_niobuf_remote(struct niobuf_remote *nbr)
+{
+	__swab64s(&nbr->offset);
+	__swab32s(&nbr->len);
+	__swab32s(&nbr->flags);
+}
+EXPORT_SYMBOL(lustre_swab_niobuf_remote);
+
+/**
+ * Byte-swap, in place, struct ost_body by swabbing its embedded obdo.
+ */
+void lustre_swab_ost_body(struct ost_body *b)
+{
+	lustre_swab_obdo(&b->oa);
+}
+EXPORT_SYMBOL(lustre_swab_ost_body);
+
+/**
+ * Byte-swap, in place, the 64-bit object id pointed to by \a id.
+ */
+void lustre_swab_ost_last_id(obd_id *id)
+{
+	__swab64s(id);
+}
+EXPORT_SYMBOL(lustre_swab_ost_last_id);
+
+/**
+ * Byte-swap, in place, the single 32-bit value pointed to by \a val.
+ */
+void lustre_swab_generic_32s(__u32 *val)
+{
+	__swab32s(val);
+}
+EXPORT_SYMBOL(lustre_swab_generic_32s);
+
+/**
+ * Byte-swap, in place, the lquota_desc member of a glimpse descriptor union.
+ *
+ * The quota id is swabbed as a lu_fid; gl_pad2 is left alone.
+ */
+void lustre_swab_gl_desc(union ldlm_gl_desc *desc)
+{
+	typeof(desc->lquota_desc) *lq = &desc->lquota_desc;
+
+	lustre_swab_lu_fid(&lq->gl_id.qid_fid);
+	__swab64s(&lq->gl_flags);
+	__swab64s(&lq->gl_ver);
+	__swab64s(&lq->gl_hardlimit);
+	__swab64s(&lq->gl_softlimit);
+	__swab64s(&lq->gl_time);
+
+	/* compile-time check only: gl_pad2 exists and is not swabbed */
+	CLASSERT(offsetof(typeof(desc->lquota_desc), gl_pad2) != 0);
+}
+
+/**
+ * Byte-swap, in place, the five 64-bit fields of struct ost_lvb_v1
+ * (size, the three timestamps, blocks).
+ */
+void lustre_swab_ost_lvb_v1(struct ost_lvb_v1 *lvb)
+{
+	__swab64s(&lvb->lvb_size);
+
+	__swab64s(&lvb->lvb_mtime);
+	__swab64s(&lvb->lvb_atime);
+	__swab64s(&lvb->lvb_ctime);
+
+	__swab64s(&lvb->lvb_blocks);
+}
+EXPORT_SYMBOL(lustre_swab_ost_lvb_v1);
+
+/**
+ * Byte-swap, in place, struct ost_lvb: the same five 64-bit fields as the
+ * v1 layout, followed by the 32-bit nanosecond fields and lvb_padding.
+ */
+void lustre_swab_ost_lvb(struct ost_lvb *lvb)
+{
+	/* 64-bit part */
+	__swab64s(&lvb->lvb_size);
+	__swab64s(&lvb->lvb_mtime);
+	__swab64s(&lvb->lvb_atime);
+	__swab64s(&lvb->lvb_ctime);
+	__swab64s(&lvb->lvb_blocks);
+
+	/* 32-bit part; the padding word is swabbed as well */
+	__swab32s(&lvb->lvb_mtime_ns);
+	__swab32s(&lvb->lvb_atime_ns);
+	__swab32s(&lvb->lvb_ctime_ns);
+	__swab32s(&lvb->lvb_padding);
+}
+EXPORT_SYMBOL(lustre_swab_ost_lvb);
+
+/**
+ * Byte-swap, in place, the five 64-bit fields of struct lquota_lvb,
+ * including lvb_pad1.
+ */
+void lustre_swab_lquota_lvb(struct lquota_lvb *lvb)
+{
+	__swab64s(&lvb->lvb_flags);
+
+	__swab64s(&lvb->lvb_id_may_rel);
+	__swab64s(&lvb->lvb_id_rel);
+	__swab64s(&lvb->lvb_id_qunit);
+
+	__swab64s(&lvb->lvb_pad1);
+}
+EXPORT_SYMBOL(lustre_swab_lquota_lvb);
+
+/**
+ * Byte-swap, in place, the fields of struct mdt_body.
+ *
+ * The two FIDs go through lustre_swab_lu_fid().  The handle, unused1,
+ * unused2 and padding_5 are not swabbed.
+ */
+void lustre_swab_mdt_body(struct mdt_body *b)
+{
+	lustre_swab_lu_fid(&b->fid1);
+	lustre_swab_lu_fid(&b->fid2);
+	/* handle is opaque */
+	__swab64s(&b->valid);
+	__swab64s(&b->size);
+	__swab64s(&b->mtime);
+	__swab64s(&b->atime);
+	__swab64s(&b->ctime);
+	__swab64s(&b->blocks);
+	__swab64s(&b->ioepoch);
+	__swab32s(&b->fsuid);
+	__swab32s(&b->fsgid);
+	__swab32s(&b->capability);
+	__swab32s(&b->mode);
+	__swab32s(&b->uid);
+	__swab32s(&b->gid);
+	__swab32s(&b->flags);
+	__swab32s(&b->rdev);
+	__swab32s(&b->nlink);
+	__swab32s(&b->suppgid);
+	__swab32s(&b->eadatasize);
+	__swab32s(&b->aclsize);
+	__swab32s(&b->max_mdsize);
+	__swab32s(&b->max_cookiesize);
+	__swab32s(&b->uid_h);
+	__swab32s(&b->gid_h);
+
+	/* compile-time checks: these fields exist and stay unswabbed */
+	CLASSERT(offsetof(typeof(*b), unused1) != 0);
+	CLASSERT(offsetof(typeof(*b), unused2) != 0);
+	CLASSERT(offsetof(typeof(*b), padding_5) != 0);
+}
+EXPORT_SYMBOL(lustre_swab_mdt_body);
+
+/**
+ * Byte-swap, in place, the ioepoch and flags of struct mdt_ioepoch.
+ *
+ * The handle and the padding are not swabbed.
+ */
+void lustre_swab_mdt_ioepoch(struct mdt_ioepoch *b)
+{
+	/* handle is opaque */
+	__swab64s(&b->ioepoch);
+	__swab32s(&b->flags);
+
+	/* compile-time check only: padding exists and is not swabbed */
+	CLASSERT(offsetof(typeof(*b), padding) != 0);
+}
+EXPORT_SYMBOL(lustre_swab_mdt_ioepoch);
+
+/**
+ * Byte-swap, in place, the integer fields of struct mgs_target_info and all
+ * MTI_NIDS_MAX entries of mti_nids, regardless of mti_nid_count.
+ */
+void lustre_swab_mgs_target_info(struct mgs_target_info *mti)
+{
+	int idx;
+
+	/* NIDs are swabbed as 64-bit quantities below */
+	CLASSERT(sizeof(lnet_nid_t) == sizeof(__u64));
+
+	__swab32s(&mti->mti_lustre_ver);
+	__swab32s(&mti->mti_stripe_index);
+	__swab32s(&mti->mti_config_ver);
+	__swab32s(&mti->mti_flags);
+	__swab32s(&mti->mti_instance);
+	__swab32s(&mti->mti_nid_count);
+
+	for (idx = 0; idx < MTI_NIDS_MAX; idx++) {
+		__swab64s(&mti->mti_nids[idx]);
+	}
+}
+EXPORT_SYMBOL(lustre_swab_mgs_target_info);
+
+/**
+ * Byte-swap, in place, a struct mgs_nidtbl_entry and its mne_nid_count NIDs.
+ *
+ * mne_nid_count and mne_nid_type are read without swabbing, which the
+ * compile-time size checks below make legitimate.  A non-zero mne_nid_type
+ * trips LASSERT.
+ */
+void lustre_swab_mgs_nidtbl_entry(struct mgs_nidtbl_entry *entry)
+{
+	int idx;
+
+	/* mne_nid_(count|type) must be one byte size because we're gonna
+	 * access it w/o swapping. */
+	CLASSERT(sizeof(entry->mne_nid_count) == sizeof(__u8));
+	CLASSERT(sizeof(entry->mne_nid_type) == sizeof(__u8));
+	/* NIDs are swabbed as 64-bit quantities below */
+	CLASSERT(sizeof(lnet_nid_t) == sizeof(__u64));
+
+	__swab64s(&entry->mne_version);
+	__swab32s(&entry->mne_instance);
+	__swab32s(&entry->mne_index);
+	__swab32s(&entry->mne_length);
+
+	/* remove this assertion if ipv6 is supported. */
+	LASSERT(entry->mne_nid_type == 0);
+
+	for (idx = 0; idx < entry->mne_nid_count; idx++)
+		__swab64s(&entry->u.nids[idx]);
+}
+EXPORT_SYMBOL(lustre_swab_mgs_nidtbl_entry);
+
+/**
+ * Byte-swap, in place, mcb_offset (64-bit), mcb_units (32-bit) and
+ * mcb_type (16-bit) of struct mgs_config_body.
+ */
+void lustre_swab_mgs_config_body(struct mgs_config_body *body)
+{
+	__swab64s(&body->mcb_offset);
+
+	__swab32s(&body->mcb_units);
+
+	__swab16s(&body->mcb_type);
+}
+EXPORT_SYMBOL(lustre_swab_mgs_config_body);
+
+/**
+ * Byte-swap, in place, the two 64-bit fields of struct mgs_config_res.
+ */
+void lustre_swab_mgs_config_res(struct mgs_config_res *body)
+{
+	__swab64s(&body->mcr_offset);
+
+	__swab64s(&body->mcr_size);
+}
+EXPORT_SYMBOL(lustre_swab_mgs_config_res);
+
+/* Byte-swap, in place, the grace times, flags and valid mask of a dqinfo. */
+static void lustre_swab_obd_dqinfo(struct obd_dqinfo *i)
+{
+	__swab64s(&i->dqi_bgrace);
+	__swab64s(&i->dqi_igrace);
+	__swab32s(&i->dqi_flags);
+	__swab32s(&i->dqi_valid);
+}
+
+/* Byte-swap, in place, the limits, usage counters, times and valid mask of
+ * a dqblk; dqb_padding is left alone. */
+static void lustre_swab_obd_dqblk(struct obd_dqblk *b)
+{
+	__swab64s(&b->dqb_ihardlimit);
+	__swab64s(&b->dqb_isoftlimit);
+	__swab64s(&b->dqb_curinodes);
+	__swab64s(&b->dqb_bhardlimit);
+	__swab64s(&b->dqb_bsoftlimit);
+	__swab64s(&b->dqb_curspace);
+	__swab64s(&b->dqb_btime);
+	__swab64s(&b->dqb_itime);
+	__swab32s(&b->dqb_valid);
+
+	/* compile-time check only: dqb_padding exists and is not swabbed */
+	CLASSERT(offsetof(typeof(*b), dqb_padding) != 0);
+}
+
+/**
+ * Byte-swap, in place, struct obd_quotactl: its four 32-bit header fields,
+ * then the embedded dqinfo and dqblk via their own swab helpers.
+ */
+void lustre_swab_obd_quotactl(struct obd_quotactl *q)
+{
+	__swab32s(&q->qc_cmd);
+	__swab32s(&q->qc_type);
+	__swab32s(&q->qc_id);
+	__swab32s(&q->qc_stat);
+
+	lustre_swab_obd_dqinfo(&q->qc_dqinfo);
+	lustre_swab_obd_dqblk(&q->qc_dqblk);
+}
+EXPORT_SYMBOL(lustre_swab_obd_quotactl);
+
+/**
+ * Byte-swap, in place, all eight 32-bit fields of struct mdt_remote_perm,
+ * including rp_padding.
+ */
+void lustre_swab_mdt_remote_perm(struct mdt_remote_perm *p)
+{
+	__swab32s(&p->rp_uid);
+	__swab32s(&p->rp_gid);
+	__swab32s(&p->rp_fsuid);
+	__swab32s(&p->rp_fsuid_h);
+	__swab32s(&p->rp_fsgid);
+	__swab32s(&p->rp_fsgid_h);
+	__swab32s(&p->rp_access_perm);
+	__swab32s(&p->rp_padding);
+}
+EXPORT_SYMBOL(lustre_swab_mdt_remote_perm);
+
+/**
+ * Byte-swap, in place, the FID, record number, link number and path length
+ * of struct getinfo_fid2path.
+ */
+void lustre_swab_fid2path(struct getinfo_fid2path *gf)
+{
+	lustre_swab_lu_fid(&gf->gf_fid);
+
+	__swab64s(&gf->gf_recno);
+
+	__swab32s(&gf->gf_linkno);
+	__swab32s(&gf->gf_pathlen);
+}
+EXPORT_SYMBOL(lustre_swab_fid2path);
+
+/**
+ * Byte-swap, in place, one fiemap extent: the three 64-bit position/length
+ * fields followed by the 32-bit flags and device fields.
+ */
+void lustre_swab_fiemap_extent(struct ll_fiemap_extent *fm_extent)
+{
+	__swab64s(&fm_extent->fe_logical);
+	__swab64s(&fm_extent->fe_physical);
+	__swab64s(&fm_extent->fe_length);
+
+	__swab32s(&fm_extent->fe_flags);
+	__swab32s(&fm_extent->fe_device);
+}
+
+/**
+ * Byte-swap, in place, a fiemap header and its extents.
+ *
+ * The header is swabbed first, so the already swabbed fm_mapped_extents
+ * is what bounds the walk over fm_extents.
+ */
+void lustre_swab_fiemap(struct ll_user_fiemap *fiemap)
+{
+	int idx;
+
+	__swab64s(&fiemap->fm_start);
+	__swab64s(&fiemap->fm_length);
+	__swab32s(&fiemap->fm_flags);
+	__swab32s(&fiemap->fm_mapped_extents);
+	__swab32s(&fiemap->fm_extent_count);
+	__swab32s(&fiemap->fm_reserved);
+
+	idx = 0;
+	while (idx < fiemap->fm_mapped_extents) {
+		lustre_swab_fiemap_extent(&fiemap->fm_extents[idx]);
+		idx++;
+	}
+}
+EXPORT_SYMBOL(lustre_swab_fiemap);
+
+/**
+ * Byte-swap, in place, the fields of struct idx_info; ii_fid goes through
+ * lustre_swab_lu_fid().
+ */
+void lustre_swab_idx_info(struct idx_info *ii)
+{
+	__swab32s(&ii->ii_magic);
+	__swab32s(&ii->ii_flags);
+	__swab16s(&ii->ii_count);
+	__swab32s(&ii->ii_attrs);
+
+	lustre_swab_lu_fid(&ii->ii_fid);
+
+	__swab64s(&ii->ii_version);
+	__swab64s(&ii->ii_hash_start);
+	__swab64s(&ii->ii_hash_end);
+
+	__swab16s(&ii->ii_keysize);
+	__swab16s(&ii->ii_recsize);
+}
+
+/**
+ * Byte-swap, in place, the header fields of an index page (magic, flags,
+ * entry count).  Nothing beyond those three fields is touched.
+ */
+void lustre_swab_lip_header(struct lu_idxpage *lip)
+{
+	__swab32s(&lip->lip_magic);
+
+	__swab16s(&lip->lip_flags);
+	__swab16s(&lip->lip_nr);
+}
+EXPORT_SYMBOL(lustre_swab_lip_header);
+
+/**
+ * Byte-swap, in place, the fields of struct mdt_rec_reint.
+ *
+ * The two FIDs go through lustre_swab_lu_fid().  The *_h companions of
+ * fsuid, fsgid, suppgid1 and suppgid2 are unused and not swabbed, and
+ * rr_padding_4 is left alone.
+ */
+void lustre_swab_mdt_rec_reint(struct mdt_rec_reint *rr)
+{
+	__swab32s(&rr->rr_opcode);
+	__swab32s(&rr->rr_cap);
+	__swab32s(&rr->rr_fsuid);
+	/* rr_fsuid_h is unused */
+	__swab32s(&rr->rr_fsgid);
+	/* rr_fsgid_h is unused */
+	__swab32s(&rr->rr_suppgid1);
+	/* rr_suppgid1_h is unused */
+	__swab32s(&rr->rr_suppgid2);
+	/* rr_suppgid2_h is unused */
+	lustre_swab_lu_fid(&rr->rr_fid1);
+	lustre_swab_lu_fid(&rr->rr_fid2);
+	__swab64s(&rr->rr_mtime);
+	__swab64s(&rr->rr_atime);
+	__swab64s(&rr->rr_ctime);
+	__swab64s(&rr->rr_size);
+	__swab64s(&rr->rr_blocks);
+	__swab32s(&rr->rr_bias);
+	__swab32s(&rr->rr_mode);
+	__swab32s(&rr->rr_flags);
+	__swab32s(&rr->rr_flags_h);
+	__swab32s(&rr->rr_umask);
+
+	/* compile-time check only: rr_padding_4 exists and is not swabbed */
+	CLASSERT(offsetof(typeof(*rr), rr_padding_4) != 0);
+}
+EXPORT_SYMBOL(lustre_swab_mdt_rec_reint);
+
+/*
+ * Byte-swap the numeric fields of a struct lov_desc in place.
+ * The uuid is not touched (endian insensitive).
+ */
+void lustre_swab_lov_desc(struct lov_desc *ld)
+{
+	/* target counts and striping defaults */
+	__swab32s(&ld->ld_tgt_count);
+	__swab32s(&ld->ld_active_tgt_count);
+	__swab32s(&ld->ld_default_stripe_count);
+	__swab32s(&ld->ld_pattern);
+	__swab64s(&ld->ld_default_stripe_size);
+	__swab64s(&ld->ld_default_stripe_offset);
+	__swab32s(&ld->ld_qos_maxage);
+}
+EXPORT_SYMBOL(lustre_swab_lov_desc);
+
+/*
+ * Byte-swap the numeric fields of a struct lmv_desc in place.
+ * The uuid is not touched (endian insensitive).
+ */
+void lustre_swab_lmv_desc(struct lmv_desc *ld)
+{
+	/* target counts and striping defaults */
+	__swab32s(&ld->ld_tgt_count);
+	__swab32s(&ld->ld_active_tgt_count);
+	__swab32s(&ld->ld_default_stripe_count);
+	__swab32s(&ld->ld_pattern);
+	__swab64s(&ld->ld_default_hash_size);
+	__swab32s(&ld->ld_qos_maxage);
+}
+
+/*
+ * Byte-swap the magic, count and master fields of a struct lmv_stripe_md
+ * in place.  The CLASSERT only references mea_padding at compile time.
+ */
+void lustre_swab_lmv_stripe_md(struct lmv_stripe_md *mea)
+{
+	CLASSERT(offsetof(typeof(*mea), mea_padding) != 0);
+
+	__swab32s(&mea->mea_magic);
+	__swab32s(&mea->mea_count);
+	__swab32s(&mea->mea_master);
+}
+
+/*
+ * Byte-swap a struct lmv_user_md and its per-stripe object entries in place.
+ *
+ * lum_stripe_count is swabbed before it is used as the loop bound.  No bound
+ * check is done here; the caller must make sure the buffer really holds that
+ * many lum_objects entries.
+ */
+void lustre_swab_lmv_user_md(struct lmv_user_md *lum)
+{
+	/* unsigned: compared against the 32-bit lum_stripe_count counter */
+	unsigned int i;
+
+	__swab32s(&lum->lum_magic);
+	__swab32s(&lum->lum_stripe_count);
+	__swab32s(&lum->lum_stripe_offset);
+	__swab32s(&lum->lum_hash_type);
+	__swab32s(&lum->lum_type);
+	/* compile-time references only: padding fields are not swabbed */
+	CLASSERT(offsetof(typeof(*lum), lum_padding1) != 0);
+	CLASSERT(offsetof(typeof(*lum), lum_padding2) != 0);
+	CLASSERT(offsetof(typeof(*lum), lum_padding3) != 0);
+
+	for (i = 0; i < lum->lum_stripe_count; i++) {
+		__swab32s(&lum->lum_objects[i].lum_mds);
+		lustre_swab_lu_fid(&lum->lum_objects[i].lum_fid);
+	}
+}
+EXPORT_SYMBOL(lustre_swab_lmv_user_md);
+
+/*
+ * Dump the fields of a lov_user_md to the debug log at D_OTHER level.
+ * The last line prints lmm_stripe_offset under the label
+ * "lmm_stripe_offset/lmm_layout_gen".
+ */
+static void print_lum (struct lov_user_md *lum)
+{
+	CDEBUG(D_OTHER, "lov_user_md %p:\n", lum);
+	CDEBUG(D_OTHER, "\tlmm_magic: %#x\n", lum->lmm_magic);
+	CDEBUG(D_OTHER, "\tlmm_pattern: %#x\n", lum->lmm_pattern);
+	CDEBUG(D_OTHER, "\tlmm_object_id: "LPU64"\n", lmm_oi_id(&lum->lmm_oi));
+	CDEBUG(D_OTHER, "\tlmm_object_gr: "LPU64"\n", lmm_oi_seq(&lum->lmm_oi));
+	CDEBUG(D_OTHER, "\tlmm_stripe_size: %#x\n", lum->lmm_stripe_size);
+	CDEBUG(D_OTHER, "\tlmm_stripe_count: %#x\n", lum->lmm_stripe_count);
+	CDEBUG(D_OTHER, "\tlmm_stripe_offset/lmm_layout_gen: %#x\n",
+			lum->lmm_stripe_offset);
+}
+
+/* Byte-swap the 64-bit id and sequence of an ost_id (oi.oi_id, oi.oi_seq). */
+static void lustre_swab_lmm_oi(struct ost_id *oi)
+{
+	__swab64s(&oi->oi.oi_id);
+	__swab64s(&oi->oi.oi_seq);
+}
+
+/*
+ * Byte-swap the leading lov_user_md_v1 fields in place, then dump the
+ * converted structure with print_lum().  Shared by the v1 and v3 swabbers.
+ */
+static void lustre_swab_lov_user_md_common(struct lov_user_md_v1 *lum)
+{
+	ENTRY;
+	__swab32s(&lum->lmm_magic);
+	__swab32s(&lum->lmm_pattern);
+	lustre_swab_lmm_oi(&lum->lmm_oi);
+	__swab32s(&lum->lmm_stripe_size);
+	__swab16s(&lum->lmm_stripe_count);
+	__swab16s(&lum->lmm_stripe_offset);
+	/* printed after swabbing, so the log shows converted values */
+	print_lum(lum);
+	EXIT;
+}
+
+/*
+ * Byte-swap a v1 lov_user_md in place; logs at D_IOCTL and delegates to
+ * lustre_swab_lov_user_md_common().
+ */
+void lustre_swab_lov_user_md_v1(struct lov_user_md_v1 *lum)
+{
+	ENTRY;
+	CDEBUG(D_IOCTL, "swabbing lov_user_md v1\n");
+	lustre_swab_lov_user_md_common(lum);
+	EXIT;
+}
+EXPORT_SYMBOL(lustre_swab_lov_user_md_v1);
+
+/*
+ * Byte-swap a v3 lov_user_md in place.  The structure is handed to the
+ * common v1 swabber through a pointer cast; only the fields that helper
+ * touches are converted.
+ */
+void lustre_swab_lov_user_md_v3(struct lov_user_md_v3 *lum)
+{
+	ENTRY;
+	CDEBUG(D_IOCTL, "swabbing lov_user_md v3\n");
+	lustre_swab_lov_user_md_common((struct lov_user_md_v1 *)lum);
+	/* lmm_pool_name is character data: nothing to swab */
+	EXIT;
+}
+EXPORT_SYMBOL(lustre_swab_lov_user_md_v3);
+
+/*
+ * Byte-swap the header fields of a struct lov_mds_md in place (magic,
+ * pattern, object id, stripe size, stripe count, layout generation).
+ * Logs at D_IOCTL.
+ */
+void lustre_swab_lov_mds_md(struct lov_mds_md *lmm)
+{
+	ENTRY;
+	CDEBUG(D_IOCTL, "swabbing lov_mds_md\n");
+	__swab32s(&lmm->lmm_magic);
+	__swab32s(&lmm->lmm_pattern);
+	lustre_swab_lmm_oi(&lmm->lmm_oi);
+	__swab32s(&lmm->lmm_stripe_size);
+	__swab16s(&lmm->lmm_stripe_count);
+	__swab16s(&lmm->lmm_layout_gen);
+	EXIT;
+}
+EXPORT_SYMBOL(lustre_swab_lov_mds_md);
+
+/*
+ * Byte-swap the first @stripe_count entries of the @lod array in place:
+ * the object id (through lustre_swab_ost_id()), the generation and the
+ * OST index of each entry.
+ */
+void lustre_swab_lov_user_md_objects(struct lov_user_ost_data *lod,
+				     int stripe_count)
+{
+	int idx;
+	ENTRY;
+	for (idx = 0; idx < stripe_count; idx++) {
+		struct lov_user_ost_data *ent = &lod[idx];
+
+		lustre_swab_ost_id(&ent->l_ost_oi);
+		__swab32s(&ent->l_ost_gen);
+		__swab32s(&ent->l_ost_idx);
+	}
+	EXIT;
+}
+EXPORT_SYMBOL(lustre_swab_lov_user_md_objects);
+
+/* Byte-swap all RES_NAME_SIZE 64-bit words of a resource name in place. */
+void lustre_swab_ldlm_res_id(struct ldlm_res_id *id)
+{
+	int word;
+
+	for (word = 0; word < RES_NAME_SIZE; word++)
+		__swab64s(&id->name[word]);
+}
+EXPORT_SYMBOL(lustre_swab_ldlm_res_id);
+
+/*
+ * Byte-swap wire policy data in place, treating it through both the
+ * l_extent and l_flock views.
+ *
+ * NOTE(review): the comment below says the data is a union.  If
+ * l_flock.lfw_owner shares storage with l_extent.gid, swabbing both
+ * restores that word to its original byte order -- confirm against the
+ * structure layout before relying on either field after this call.
+ */
+void lustre_swab_ldlm_policy_data (ldlm_wire_policy_data_t *d)
+{
+	/* the lock data is a union and the first two fields are always an
+	 * extent so it's ok to process an LDLM_EXTENT and LDLM_FLOCK lock
+	 * data the same way. */
+	__swab64s(&d->l_extent.start);
+	__swab64s(&d->l_extent.end);
+	__swab64s(&d->l_extent.gid);
+	__swab64s(&d->l_flock.lfw_owner);
+	__swab32s(&d->l_flock.lfw_pid);
+}
+EXPORT_SYMBOL(lustre_swab_ldlm_policy_data);
+
+/* Byte-swap the 64-bit opcode of an ldlm intent in place. */
+void lustre_swab_ldlm_intent(struct ldlm_intent *i)
+{
+	__swab64s(&i->opc);
+}
+EXPORT_SYMBOL(lustre_swab_ldlm_intent);
+
+/*
+ * Byte-swap an ldlm resource descriptor in place: the resource type and,
+ * through lustre_swab_ldlm_res_id(), the resource name.  lr_padding is only
+ * referenced by a compile-time check.
+ */
+void lustre_swab_ldlm_resource_desc(struct ldlm_resource_desc *r)
+{
+	__swab32s(&r->lr_type);
+	CLASSERT(offsetof(typeof(*r), lr_padding) != 0);
+	lustre_swab_ldlm_res_id(&r->lr_name);
+}
+EXPORT_SYMBOL(lustre_swab_ldlm_resource_desc);
+
+/*
+ * Byte-swap an ldlm lock descriptor in place: the embedded resource
+ * descriptor, the requested and granted modes, and the policy data.
+ */
+void lustre_swab_ldlm_lock_desc(struct ldlm_lock_desc *l)
+{
+	lustre_swab_ldlm_resource_desc(&l->l_resource);
+	__swab32s(&l->l_req_mode);
+	__swab32s(&l->l_granted_mode);
+	lustre_swab_ldlm_policy_data(&l->l_policy_data);
+}
+EXPORT_SYMBOL(lustre_swab_ldlm_lock_desc);
+
+/*
+ * Byte-swap an ldlm request in place: flags, lock descriptor and lock
+ * count.  lock_handle[] is opaque and left untouched.
+ */
+void lustre_swab_ldlm_request(struct ldlm_request *rq)
+{
+	__swab32s(&rq->lock_flags);
+	lustre_swab_ldlm_lock_desc(&rq->lock_desc);
+	__swab32s(&rq->lock_count);
+}
+EXPORT_SYMBOL(lustre_swab_ldlm_request);
+
+/*
+ * Byte-swap an ldlm reply in place: flags, lock descriptor and the two
+ * 64-bit policy results.  lock_handle is opaque and left untouched;
+ * lock_padding is only referenced by a compile-time check.
+ */
+void lustre_swab_ldlm_reply(struct ldlm_reply *r)
+{
+	__swab32s(&r->lock_flags);
+	CLASSERT(offsetof(typeof(*r), lock_padding) != 0);
+	lustre_swab_ldlm_lock_desc(&r->lock_desc);
+	__swab64s(&r->lock_policy_res1);
+	__swab64s(&r->lock_policy_res2);
+}
+EXPORT_SYMBOL(lustre_swab_ldlm_reply);
+
+/*
+ * Byte-swap a struct quota_body in place.
+ *
+ * qb_id is converted by viewing it as a struct lu_fid (pointer cast) and
+ * reusing lustre_swab_lu_fid().
+ * NOTE(review): this assumes qb_id has the same layout as a lu_fid --
+ * confirm against the structure definition.
+ */
+void lustre_swab_quota_body(struct quota_body *b)
+{
+	lustre_swab_lu_fid(&b->qb_fid);
+	lustre_swab_lu_fid((struct lu_fid *)&b->qb_id);
+	__swab32s(&b->qb_flags);
+	__swab64s(&b->qb_count);
+	__swab64s(&b->qb_usage);
+	__swab64s(&b->qb_slv_ver);
+}
+
+/* Dump functions */
+/*
+ * Log the object id, max_brw and buffer count of an obd_ioobj at
+ * D_RPCTRACE level.
+ */
+void dump_ioo(struct obd_ioobj *ioo)
+{
+	CDEBUG(D_RPCTRACE,
+	       "obd_ioobj: ioo_oid="DOSTID", ioo_max_brw=%#x, "
+	       "ioo_bufct=%d\n", POSTID(&ioo->ioo_oid), ioo->ioo_max_brw,
+	       ioo->ioo_bufcnt);
+}
+EXPORT_SYMBOL(dump_ioo);
+
+/* Log the offset, length and flags of a remote niobuf at D_RPCTRACE level. */
+void dump_rniobuf(struct niobuf_remote *nb)
+{
+	CDEBUG(D_RPCTRACE, "niobuf_remote: offset="LPU64", len=%d, flags=%x\n",
+	       nb->offset, nb->len, nb->flags);
+}
+EXPORT_SYMBOL(dump_rniobuf);
+
+/*
+ * Log, at D_RPCTRACE level, each obdo field whose OBD_MD_* bit is set in
+ * o_valid.
+ *
+ * o_valid is copied into a 32-bit local before testing.
+ * NOTE(review): if o_valid is wider than 32 bits, flags above bit 31 are
+ * never seen here -- confirm the field width.
+ */
+void dump_obdo(struct obdo *oa)
+{
+	__u32 valid = oa->o_valid;
+
+	CDEBUG(D_RPCTRACE, "obdo: o_valid = %08x\n", valid);
+	if (valid & OBD_MD_FLID)
+		CDEBUG(D_RPCTRACE, "obdo: id = "DOSTID"\n", POSTID(&oa->o_oi));
+	if (valid & OBD_MD_FLFID)
+		CDEBUG(D_RPCTRACE, "obdo: o_parent_seq = "LPX64"\n",
+		       oa->o_parent_seq);
+	if (valid & OBD_MD_FLSIZE)
+		CDEBUG(D_RPCTRACE, "obdo: o_size = "LPD64"\n", oa->o_size);
+	if (valid & OBD_MD_FLMTIME)
+		CDEBUG(D_RPCTRACE, "obdo: o_mtime = "LPD64"\n", oa->o_mtime);
+	if (valid & OBD_MD_FLATIME)
+		CDEBUG(D_RPCTRACE, "obdo: o_atime = "LPD64"\n", oa->o_atime);
+	if (valid & OBD_MD_FLCTIME)
+		CDEBUG(D_RPCTRACE, "obdo: o_ctime = "LPD64"\n", oa->o_ctime);
+	if (valid & OBD_MD_FLBLOCKS)   /* allocation of space */
+		CDEBUG(D_RPCTRACE, "obdo: o_blocks = "LPD64"\n", oa->o_blocks);
+	if (valid & OBD_MD_FLGRANT)
+		CDEBUG(D_RPCTRACE, "obdo: o_grant = "LPD64"\n", oa->o_grant);
+	if (valid & OBD_MD_FLBLKSZ)
+		CDEBUG(D_RPCTRACE, "obdo: o_blksize = %d\n", oa->o_blksize);
+	/* print only the type bits and/or the permission bits that are valid */
+	if (valid & (OBD_MD_FLTYPE | OBD_MD_FLMODE))
+		CDEBUG(D_RPCTRACE, "obdo: o_mode = %o\n",
+		       oa->o_mode & ((valid & OBD_MD_FLTYPE ?  S_IFMT : 0) |
+				     (valid & OBD_MD_FLMODE ? ~S_IFMT : 0)));
+	/* the _h halves are gated by the same flag as the low halves */
+	if (valid & OBD_MD_FLUID)
+		CDEBUG(D_RPCTRACE, "obdo: o_uid = %u\n", oa->o_uid);
+	if (valid & OBD_MD_FLUID)
+		CDEBUG(D_RPCTRACE, "obdo: o_uid_h = %u\n", oa->o_uid_h);
+	if (valid & OBD_MD_FLGID)
+		CDEBUG(D_RPCTRACE, "obdo: o_gid = %u\n", oa->o_gid);
+	if (valid & OBD_MD_FLGID)
+		CDEBUG(D_RPCTRACE, "obdo: o_gid_h = %u\n", oa->o_gid_h);
+	if (valid & OBD_MD_FLFLAGS)
+		CDEBUG(D_RPCTRACE, "obdo: o_flags = %x\n", oa->o_flags);
+	/* o_nlink is printed as a checksum when only FLCKSUM is set */
+	if (valid & OBD_MD_FLNLINK)
+		CDEBUG(D_RPCTRACE, "obdo: o_nlink = %u\n", oa->o_nlink);
+	else if (valid & OBD_MD_FLCKSUM)
+		CDEBUG(D_RPCTRACE, "obdo: o_checksum (o_nlink) = %u\n",
+		       oa->o_nlink);
+	if (valid & OBD_MD_FLGENER)
+		CDEBUG(D_RPCTRACE, "obdo: o_parent_oid = %x\n",
+		       oa->o_parent_oid);
+	if (valid & OBD_MD_FLEPOCH)
+		CDEBUG(D_RPCTRACE, "obdo: o_ioepoch = "LPD64"\n",
+		       oa->o_ioepoch);
+	if (valid & OBD_MD_FLFID) {
+		CDEBUG(D_RPCTRACE, "obdo: o_stripe_idx = %u\n",
+		       oa->o_stripe_idx);
+		CDEBUG(D_RPCTRACE, "obdo: o_parent_ver = %x\n",
+		       oa->o_parent_ver);
+	}
+	if (valid & OBD_MD_FLHANDLE)
+		CDEBUG(D_RPCTRACE, "obdo: o_handle = "LPD64"\n",
+		       oa->o_handle.cookie);
+	if (valid & OBD_MD_FLCOOKIE)
+		CDEBUG(D_RPCTRACE, "obdo: o_lcookie = "
+		       "(llog_cookie dumping not yet implemented)\n");
+}
+EXPORT_SYMBOL(dump_obdo);
+
+/* Log an ost_body by dumping its embedded obdo with dump_obdo(). */
+void dump_ost_body(struct ost_body *ob)
+{
+	dump_obdo(&ob->oa);
+}
+EXPORT_SYMBOL(dump_ost_body);
+
+/* Log the value pointed to by @rc (printed with %d) at D_RPCTRACE level. */
+void dump_rcs(__u32 *rc)
+{
+	CDEBUG(D_RPCTRACE, "rmf_rcs: %d\n", *rc);
+}
+EXPORT_SYMBOL(dump_rcs);
+
+/*
+ * Report whether the ptlrpc body of the request message has been swabbed.
+ * Only V2 messages are understood; any other magic is logged as an error
+ * and 0 is returned.
+ */
+static inline int req_ptlrpc_body_swabbed(struct ptlrpc_request *req)
+{
+	LASSERT(req->rq_reqmsg);
+
+	if (req->rq_reqmsg->lm_magic == LUSTRE_MSG_MAGIC_V2)
+		return lustre_req_swabbed(req, MSG_PTLRPC_BODY_OFF);
+
+	CERROR("bad lustre msg magic: %#08X\n",
+	       req->rq_reqmsg->lm_magic);
+	return 0;
+}
+
+/*
+ * Report whether the ptlrpc body of the reply message has been swabbed.
+ * A reply whose magic is not V2 is treated as not yet initialized and
+ * yields 0 without logging.
+ */
+static inline int rep_ptlrpc_body_swabbed(struct ptlrpc_request *req)
+{
+	LASSERT(req->rq_repmsg);
+
+	/* uninitialized yet */
+	if (req->rq_repmsg->lm_magic != LUSTRE_MSG_MAGIC_V2)
+		return 0;
+
+	return lustre_rep_swabbed(req, MSG_PTLRPC_BODY_OFF);
+}
+
+/*
+ * Emit a debug message about @req: the caller's formatted text (@fmt plus
+ * variadic arguments) followed by a fixed summary of the request state.
+ *
+ * Fields read from the request/reply message are only used when the message
+ * exists and, if the request needs swabbing, when its ptlrpc body has
+ * already been swabbed; otherwise 0 or -1 is printed in their place.
+ * The peer NID comes from the import's connection if there is one, else
+ * from the export's connection, else LNET_NID_ANY.
+ */
+void _debug_req(struct ptlrpc_request *req,
+		struct libcfs_debug_msg_data *msgdata,
+		const char *fmt, ...)
+{
+	int req_ok = req->rq_reqmsg != NULL;
+	int rep_ok = req->rq_repmsg != NULL;
+	lnet_nid_t nid = LNET_NID_ANY;
+	va_list args;
+
+	if (ptlrpc_req_need_swab(req)) {
+		req_ok = req_ok && req_ptlrpc_body_swabbed(req);
+		rep_ok = rep_ok && rep_ptlrpc_body_swabbed(req);
+	}
+
+	if (req->rq_import && req->rq_import->imp_connection)
+		nid = req->rq_import->imp_connection->c_peer.nid;
+	else if (req->rq_export && req->rq_export->exp_connection)
+		nid = req->rq_export->exp_connection->c_peer.nid;
+
+	va_start(args, fmt);
+	libcfs_debug_vmsg2(msgdata, fmt, args,
+			   " req@%p x"LPU64"/t"LPD64"("LPD64") o%d->%s@%s:%d/%d"
+			   " lens %d/%d e %d to %d dl "CFS_TIME_T" ref %d "
+			   "fl "REQ_FLAGS_FMT"/%x/%x rc %d/%d\n",
+			   req, req->rq_xid, req->rq_transno,
+			   req_ok ? lustre_msg_get_transno(req->rq_reqmsg) : 0,
+			   req_ok ? lustre_msg_get_opc(req->rq_reqmsg) : -1,
+			   req->rq_import ?
+				req->rq_import->imp_obd->obd_name :
+				req->rq_export ?
+				     req->rq_export->exp_client_uuid.uuid :
+				     "<?>",
+			   libcfs_nid2str(nid),
+			   req->rq_request_portal, req->rq_reply_portal,
+			   req->rq_reqlen, req->rq_replen,
+			   req->rq_early_count, req->rq_timedout,
+			   req->rq_deadline,
+			   atomic_read(&req->rq_refcount),
+			   DEBUG_REQ_FLAGS(req),
+			   req_ok ? lustre_msg_get_flags(req->rq_reqmsg) : -1,
+			   rep_ok ? lustre_msg_get_flags(req->rq_repmsg) : -1,
+			   req->rq_status,
+			   rep_ok ? lustre_msg_get_status(req->rq_repmsg) : -1);
+	/* every va_start() must be paired with va_end() before returning */
+	va_end(args);
+}
+EXPORT_SYMBOL(_debug_req);
+
+/*
+ * Byte-swap a struct lustre_capa in place: the FID, the three 64-bit
+ * fields (opc, uid, gid) and the four 32-bit fields (flags, keyid,
+ * timeout, expiry).
+ */
+void lustre_swab_lustre_capa(struct lustre_capa *c)
+{
+	lustre_swab_lu_fid(&c->lc_fid);
+
+	__swab64s(&c->lc_opc);
+	__swab64s(&c->lc_uid);
+	__swab64s(&c->lc_gid);
+
+	__swab32s(&c->lc_flags);
+	__swab32s(&c->lc_keyid);
+	__swab32s(&c->lc_timeout);
+	__swab32s(&c->lc_expiry);
+}
+EXPORT_SYMBOL(lustre_swab_lustre_capa);
+
+/*
+ * Byte-swap the sequence and key id of a capability key in place.
+ * lk_padding is only referenced by a compile-time check.
+ */
+void lustre_swab_lustre_capa_key(struct lustre_capa_key *k)
+{
+	CLASSERT(offsetof(typeof(*k), lk_padding) != 0);
+
+	__swab64s(&k->lk_seq);
+	__swab32s(&k->lk_keyid);
+}
+EXPORT_SYMBOL(lustre_swab_lustre_capa_key);
+
+/* Byte-swap hus_states and hus_archive_id of an hsm_user_state in place. */
+void lustre_swab_hsm_user_state(struct hsm_user_state *state)
+{
+	__swab32s(&state->hus_states);
+	__swab32s(&state->hus_archive_id);
+}
+EXPORT_SYMBOL(lustre_swab_hsm_user_state);
+
+/*
+ * Byte-swap an hsm_state_set in place: hss_valid and hss_archive_id
+ * (32-bit), hss_setmask and hss_clearmask (64-bit).
+ */
+void lustre_swab_hsm_state_set(struct hsm_state_set *hss)
+{
+	__swab32s(&hss->hss_valid);
+	__swab64s(&hss->hss_setmask);
+	__swab64s(&hss->hss_clearmask);
+	__swab32s(&hss->hss_archive_id);
+}
+EXPORT_SYMBOL(lustre_swab_hsm_state_set);
+
+/* Byte-swap the 64-bit offset and length of an hsm_extent in place. */
+void lustre_swab_hsm_extent(struct hsm_extent *extent)
+{
+	__swab64s(&extent->offset);
+	__swab64s(&extent->length);
+}
+
+/*
+ * Byte-swap an hsm_current_action in place: state, action and, through
+ * lustre_swab_hsm_extent(), the hca_location extent.
+ */
+void lustre_swab_hsm_current_action(struct hsm_current_action *action)
+{
+	__swab32s(&action->hca_state);
+	__swab32s(&action->hca_action);
+	lustre_swab_hsm_extent(&action->hca_location);
+}
+EXPORT_SYMBOL(lustre_swab_hsm_current_action);
+
+/* Byte-swap the FID and the extent of an hsm_user_item in place. */
+void lustre_swab_hsm_user_item(struct hsm_user_item *hui)
+{
+	lustre_swab_lu_fid(&hui->hui_fid);
+	lustre_swab_hsm_extent(&hui->hui_extent);
+}
+EXPORT_SYMBOL(lustre_swab_hsm_user_item);
+
+/*
+ * Byte-swap a layout_intent in place: li_opc and li_flags (32-bit),
+ * li_start and li_end (64-bit).
+ */
+void lustre_swab_layout_intent(struct layout_intent *li)
+{
+	__swab32s(&li->li_opc);
+	__swab32s(&li->li_flags);
+	__swab64s(&li->li_start);
+	__swab64s(&li->li_end);
+}
+EXPORT_SYMBOL(lustre_swab_layout_intent);
+
+/*
+ * Byte-swap an hsm_progress_kernel in place: FID, cookie, the offset and
+ * length of hpk_extent, and the 16-bit flags and errval.
+ */
+void lustre_swab_hsm_progress_kernel(struct hsm_progress_kernel *hpk)
+{
+	lustre_swab_lu_fid(&hpk->hpk_fid);
+	__swab64s(&hpk->hpk_cookie);
+	/* extent fields are swabbed directly rather than through a helper */
+	__swab64s(&hpk->hpk_extent.offset);
+	__swab64s(&hpk->hpk_extent.length);
+	__swab16s(&hpk->hpk_flags);
+	__swab16s(&hpk->hpk_errval);
+}
+EXPORT_SYMBOL(lustre_swab_hsm_progress_kernel);
+
+/*
+ * Byte-swap the header of an hsm_request in place: action, archive id,
+ * 64-bit flags, item count and data length.
+ */
+void lustre_swab_hsm_request(struct hsm_request *hr)
+{
+	__swab32s(&hr->hr_action);
+	__swab32s(&hr->hr_archive_id);
+	__swab64s(&hr->hr_flags);
+	__swab32s(&hr->hr_itemcount);
+	__swab32s(&hr->hr_data_len);
+}
+EXPORT_SYMBOL(lustre_swab_hsm_request);
+
+/* Byte-swap the magic and count of an update_buf header in place. */
+void lustre_swab_update_buf(struct update_buf *ub)
+{
+	__swab32s(&ub->ub_magic);
+	__swab32s(&ub->ub_count);
+}
+EXPORT_SYMBOL(lustre_swab_update_buf);
+
+/*
+ * Byte-swap an update_reply in place: version, count, and then ur_count
+ * entries of ur_lens[].
+ *
+ * ur_count is swabbed before it is used as the loop bound.  No bound check
+ * is done here; the caller must make sure the buffer really holds that many
+ * length entries.
+ */
+void lustre_swab_update_reply_buf(struct update_reply *ur)
+{
+	/* unsigned: compared against the 32-bit ur_count counter */
+	unsigned int i;
+
+	__swab32s(&ur->ur_version);
+	__swab32s(&ur->ur_count);
+	for (i = 0; i < ur->ur_count; i++)
+		__swab32s(&ur->ur_lens[i]);
+}
+EXPORT_SYMBOL(lustre_swab_update_reply_buf);
+
+/* Byte-swap the 64-bit flags of an mdc_swap_layouts in place. */
+void lustre_swab_swap_layouts(struct mdc_swap_layouts *msl)
+{
+	__swab64s(&msl->msl_flags);
+}
+EXPORT_SYMBOL(lustre_swab_swap_layouts);

diff --git a/drivers/staging/lustre/lustre/ptlrpc/pers.c b/drivers/staging/lustre/lustre/ptlrpc/pers.c
new file mode 100644
index 0000000..d926d2b
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/pers.c

@@ -0,0 +1,75 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+
+#include <obd_support.h>
+#include <obd_class.h>
+#include <lustre_lib.h>
+#include <lustre_ha.h>
+#include <lustre_import.h>
+
+#include "ptlrpc_internal.h"
+
+
+/*
+ * Fill @md so that it describes chunk number @mdidx of the bulk
+ * descriptor's page vector.
+ *
+ * Each chunk covers up to LNET_MAX_IOV entries starting at entry
+ * mdidx * LNET_MAX_IOV.  The MD is marked LNET_MD_KIOV and points at
+ * bd_enc_iov when that is set, otherwise at bd_iov.
+ */
+void ptlrpc_fill_bulk_md(lnet_md_t *md, struct ptlrpc_bulk_desc *desc,
+			 int mdidx)
+{
+	CLASSERT(PTLRPC_MAX_BRW_PAGES < LI_POISON);
+
+	LASSERT(mdidx < desc->bd_md_max_brw);
+	LASSERT(desc->bd_iov_count <= PTLRPC_MAX_BRW_PAGES);
+	/* the MD must not already carry a buffer-type option */
+	LASSERT(!(md->options & (LNET_MD_IOVEC | LNET_MD_KIOV |
+				 LNET_MD_PHYS)));
+
+	md->options |= LNET_MD_KIOV;
+	/* entries left from this chunk's start, clamped to [0, LNET_MAX_IOV] */
+	md->length = max(0, desc->bd_iov_count - mdidx * LNET_MAX_IOV);
+	md->length = min_t(unsigned int, LNET_MAX_IOV, md->length);
+	if (desc->bd_enc_iov)
+		md->start = &desc->bd_enc_iov[mdidx * LNET_MAX_IOV];
+	else
+		md->start = &desc->bd_iov[mdidx * LNET_MAX_IOV];
+}
+
+/*
+ * Append one page fragment (@page, @pageoffset, @len) to the descriptor's
+ * page vector and bump bd_iov_count.  No capacity check is done here.
+ */
+void ptlrpc_add_bulk_page(struct ptlrpc_bulk_desc *desc, struct page *page,
+			  int pageoffset, int len)
+{
+	/* next free slot is the one indexed by the current count */
+	lnet_kiov_t *slot = &desc->bd_iov[desc->bd_iov_count];
+
+	slot->kiov_page   = page;
+	slot->kiov_offset = pageoffset;
+	slot->kiov_len    = len;
+
+	desc->bd_iov_count++;
+}

diff --git a/drivers/staging/lustre/lustre/ptlrpc/pinger.c b/drivers/staging/lustre/lustre/ptlrpc/pinger.c
new file mode 100644
index 0000000..ef5269a
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/pinger.c

@@ -0,0 +1,763 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/pinger.c
+ *
+ * Portal-RPC reconnection and replay operations, for use in recovery.
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+
+#include <obd_support.h>
+#include <obd_class.h>
+#include "ptlrpc_internal.h"
+
+static int suppress_pings;
+CFS_MODULE_PARM(suppress_pings, "i", int, 0644, "Suppress pings");
+
+struct mutex pinger_mutex;
+static LIST_HEAD(pinger_imports);
+static struct list_head timeout_list = LIST_HEAD_INIT(timeout_list);
+
+/* Return the current value of the "suppress_pings" module parameter. */
+int ptlrpc_pinger_suppress_pings(void)
+{
+	return suppress_pings;
+}
+EXPORT_SYMBOL(ptlrpc_pinger_suppress_pings);
+
+/*
+ * Allocate and pack an OBD_PING request for @imp.  On success the reply
+ * length is set and the request is flagged no-resend and no-delay.
+ *
+ * Returns the request, or NULL if it could not be allocated.
+ */
+struct ptlrpc_request *
+ptlrpc_prep_ping(struct obd_import *imp)
+{
+	struct ptlrpc_request *ping;
+
+	ping = ptlrpc_request_alloc_pack(imp, &RQF_OBD_PING,
+					 LUSTRE_OBD_VERSION, OBD_PING);
+	if (ping == NULL)
+		return NULL;
+
+	ptlrpc_request_set_replen(ping);
+	ping->rq_no_delay = 1;
+	ping->rq_no_resend = 1;
+	return ping;
+}
+
+/*
+ * Send a ping on the client import of @obd and wait for it to complete.
+ *
+ * Returns -ENOMEM if the request could not be prepared, otherwise the
+ * result of ptlrpc_queue_wait().
+ */
+int ptlrpc_obd_ping(struct obd_device *obd)
+{
+	struct ptlrpc_request *ping;
+	int result;
+	ENTRY;
+
+	ping = ptlrpc_prep_ping(obd->u.cli.cl_import);
+	if (!ping)
+		RETURN(-ENOMEM);
+
+	ping->rq_send_state = LUSTRE_IMP_FULL;
+	result = ptlrpc_queue_wait(ping);
+	ptlrpc_req_finished(ping);
+
+	RETURN(result);
+}
+EXPORT_SYMBOL(ptlrpc_obd_ping);
+
+/*
+ * Prepare a ping for @imp and hand it to ptlrpcd without waiting for it.
+ *
+ * Returns 0 once the request has been queued, or -ENOMEM if it could not
+ * be prepared.
+ */
+int ptlrpc_ping(struct obd_import *imp)
+{
+	struct ptlrpc_request *ping;
+	ENTRY;
+
+	ping = ptlrpc_prep_ping(imp);
+	if (!ping) {
+		CERROR("OOM trying to ping %s->%s\n",
+		       imp->imp_obd->obd_uuid.uuid,
+		       obd2cli_tgt(imp->imp_obd));
+		RETURN(-ENOMEM);
+	}
+
+	DEBUG_REQ(D_INFO, ping, "pinging %s->%s",
+		  imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd));
+	ptlrpcd_add_req(ping, PDL_POLICY_ROUND, -1);
+
+	RETURN(0);
+}
+
+/*
+ * Schedule the next ping of @imp.
+ *
+ * The delay is PING_INTERVAL_SHORT when @soon is set, else PING_INTERVAL.
+ * For a disconnected import the delay is further capped by the larger of
+ * CONNECTION_SWITCH_MIN and the estimated network latency (0 when adaptive
+ * timeouts are off).
+ */
+void ptlrpc_update_next_ping(struct obd_import *imp, int soon)
+{
+	int time = soon ? PING_INTERVAL_SHORT : PING_INTERVAL;
+
+	if (imp->imp_state == LUSTRE_IMP_DISCON) {
+		int latency = AT_OFF ? 0 :
+			      at_get(&imp->imp_at.iat_net_latency);
+		int dtime = max_t(int, CONNECTION_SWITCH_MIN, latency);
+
+		if (dtime < time)
+			time = dtime;
+	}
+	imp->imp_next_ping = cfs_time_shift(time);
+}
+
+/* Make @imp due for a ping now by setting imp_next_ping to the current time. */
+void ptlrpc_ping_import_soon(struct obd_import *imp)
+{
+	imp->imp_next_ping = cfs_time_current();
+}
+
+/*
+ * Return 1 if @imp is deactivated, or if the
+ * OBD_FAIL_PTLRPC_IMP_DEACTIVE fail check fires; otherwise 0.
+ * The fail check is only evaluated when the import is not deactivated.
+ */
+static inline int imp_is_deactive(struct obd_import *imp)
+{
+	if (imp->imp_deactive)
+		return 1;
+
+	return !!OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_IMP_DEACTIVE);
+}
+
+/*
+ * Compute the time of the next reconnect attempt for @imp: obd_timeout / 2
+ * seconds from now when imp_server_timeout is set, obd_timeout seconds from
+ * now otherwise.
+ *
+ * The result is an absolute time that the caller stores in imp_next_ping,
+ * so it is returned as cfs_time_t; returning it through an int would
+ * truncate it where cfs_time_t is wider than int.
+ */
+static inline cfs_time_t ptlrpc_next_reconnect(struct obd_import *imp)
+{
+	if (imp->imp_server_timeout)
+		return cfs_time_shift(obd_timeout / 2);
+	else
+		return cfs_time_shift(obd_timeout);
+}
+
+static atomic_t suspend_timeouts = ATOMIC_INIT(0);
+static cfs_time_t suspend_wakeup_time = 0;
+
+/*
+ * Compute how long the pinger should sleep, counted from @time.
+ *
+ * The interval is PING_INTERVAL seconds, shortened to the timeout of the
+ * first entry of timeout_list when that is smaller (the list is kept
+ * sorted by increasing timeout, so the first entry is the shortest).
+ * The result is (@time + interval) minus the current time.
+ */
+cfs_duration_t pinger_check_timeout(cfs_time_t time)
+{
+	cfs_time_t timeout = PING_INTERVAL;
+
+	mutex_lock(&pinger_mutex);
+	if (!list_empty(&timeout_list)) {
+		struct timeout_item *first;
+		int shortest;
+
+		first = list_first_entry(&timeout_list, struct timeout_item,
+					 ti_chain);
+		shortest = first->ti_timeout;
+		if (timeout > shortest)
+			timeout = shortest;
+	}
+	mutex_unlock(&pinger_mutex);
+
+	return cfs_time_sub(cfs_time_add(time, cfs_time_seconds(timeout)),
+			    cfs_time_current());
+}
+
+static wait_queue_head_t suspend_timeouts_waitq;
+
+/*
+ * Return the recorded suspend wake-up time.  The only assignment to it in
+ * this file sits inside an "#if 0" block, so the value stays 0.
+ */
+cfs_time_t ptlrpc_suspend_wakeup_time(void)
+{
+	return suspend_wakeup_time;
+}
+
+/*
+ * Currently a no-op: the whole body is compiled out with "#if 0".
+ * The disabled code would mark @imp as having no timeout and increment the
+ * suspend_timeouts counter.
+ */
+void ptlrpc_deactivate_timeouts(struct obd_import *imp)
+{
+	/*XXX: disabled for now, will be replaced by adaptive timeouts */
+#if 0
+	if (imp->imp_no_timeout)
+		return;
+	imp->imp_no_timeout = 1;
+	atomic_inc(&suspend_timeouts);
+	CDEBUG(D_HA|D_WARNING, "deactivate timeouts %u\n",
+	       atomic_read(&suspend_timeouts));
+#endif
+}
+
+/*
+ * Currently a no-op: the whole body is compiled out with "#if 0".
+ * The disabled code would clear the import's no-timeout mark, decrement
+ * suspend_timeouts and, when it reaches zero, record the wake-up time and
+ * wake the waiters on suspend_timeouts_waitq.
+ */
+void ptlrpc_activate_timeouts(struct obd_import *imp)
+{
+	/*XXX: disabled for now, will be replaced by adaptive timeouts */
+#if 0
+	if (!imp->imp_no_timeout)
+		return;
+	imp->imp_no_timeout = 0;
+	LASSERT(atomic_read(&suspend_timeouts) > 0);
+	if (atomic_dec_and_test(&suspend_timeouts)) {
+		suspend_wakeup_time = cfs_time_current();
+		wake_up(&suspend_timeouts_waitq);
+	}
+	CDEBUG(D_HA|D_WARNING, "activate timeouts %u\n",
+	       atomic_read(&suspend_timeouts));
+#endif
+}
+
+/* Return 1 while the suspend_timeouts counter is non-zero, else 0. */
+int ptlrpc_check_suspend(void)
+{
+	return atomic_read(&suspend_timeouts) ? 1 : 0;
+}
+
+/*
+ * If timeouts are currently suspended, block until the suspend_timeouts
+ * counter drops to zero.
+ *
+ * Returns 1 if the caller had to wait, 0 if timeouts were not suspended.
+ */
+int ptlrpc_check_and_wait_suspend(struct ptlrpc_request *req)
+{
+	struct l_wait_info lwi;
+
+	if (atomic_read(&suspend_timeouts) == 0)
+		return 0;
+
+	DEBUG_REQ(D_NET, req, "-- suspend %d regular timeout",
+		  atomic_read(&suspend_timeouts));
+	lwi = LWI_INTR(NULL, NULL);
+	l_wait_event(suspend_timeouts_waitq,
+		     atomic_read(&suspend_timeouts) == 0, &lwi);
+	DEBUG_REQ(D_NET, req, "-- recharge regular timeout");
+	return 1;
+}
+
+
+static bool ir_up;
+
+/*
+ * Record that IR is up.  While the flag is set, the pinger may suppress
+ * pings on imports whose connect data has the PINGLESS flag.
+ */
+void ptlrpc_pinger_ir_up(void)
+{
+	CDEBUG(D_HA, "IR up\n");
+	ir_up = true;
+}
+EXPORT_SYMBOL(ptlrpc_pinger_ir_up);
+
+/* Record that IR is down, which disables ping suppression in the pinger. */
+void ptlrpc_pinger_ir_down(void)
+{
+	CDEBUG(D_HA, "IR down\n");
+	ir_up = false;
+}
+EXPORT_SYMBOL(ptlrpc_pinger_ir_down);
+
+/*
+ * Decide what the pinger should do for one import during the pass that
+ * started at @this_ping.
+ *
+ * The import state and the force flags are sampled under imp_lock.
+ * imp_force_verify is always cleared; imp_force_next_verify is cleared only
+ * if the import is actually processed.  Nothing more is done when the
+ * import's next ping time (less 5 ticks of slack) has not been reached and
+ * no verify was forced.
+ *
+ * Otherwise one of three things happens:
+ *  - disconnected and not deactivated: push the next attempt out and start
+ *    recovery unless imp_no_pinger_recover is set;
+ *  - not FULL, recovery disabled, or deactivated: log and do nothing;
+ *  - FULL: send a ping if the import is pingable and pings are not
+ *    suppressed, or if a verify was forced.
+ */
+static void ptlrpc_pinger_process_import(struct obd_import *imp,
+					 unsigned long this_ping)
+{
+	int level;
+	int force;
+	int force_next;
+	int suppress;
+
+	spin_lock(&imp->imp_lock);
+
+	level = imp->imp_state;
+	force = imp->imp_force_verify;
+	force_next = imp->imp_force_next_verify;
+	/*
+	 * This will be used below only if the import is "FULL".
+	 */
+	suppress = ir_up && OCD_HAS_FLAG(&imp->imp_connect_data, PINGLESS);
+
+	imp->imp_force_verify = 0;
+
+	/* not due yet (with 5 ticks of slack) and nobody forced a check */
+	if (cfs_time_aftereq(imp->imp_next_ping - 5 * CFS_TICK, this_ping) &&
+	    !force) {
+		spin_unlock(&imp->imp_lock);
+		return;
+	}
+
+	imp->imp_force_next_verify = 0;
+
+	spin_unlock(&imp->imp_lock);
+
+	CDEBUG(level == LUSTRE_IMP_FULL ? D_INFO : D_HA, "%s->%s: level %s/%u "
+	       "force %u force_next %u deactive %u pingable %u suppress %u\n",
+	       imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd),
+	       ptlrpc_import_state_name(level), level, force, force_next,
+	       imp->imp_deactive, imp->imp_pingable, suppress);
+
+	if (level == LUSTRE_IMP_DISCON && !imp_is_deactive(imp)) {
+		/* wait for a while before trying recovery again */
+		imp->imp_next_ping = ptlrpc_next_reconnect(imp);
+		if (!imp->imp_no_pinger_recover)
+			ptlrpc_initiate_recovery(imp);
+	} else if (level != LUSTRE_IMP_FULL ||
+		   imp->imp_obd->obd_no_recov ||
+		   imp_is_deactive(imp)) {
+		CDEBUG(D_HA, "%s->%s: not pinging (in recovery "
+		       "or recovery disabled: %s)\n",
+		       imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd),
+		       ptlrpc_import_state_name(level));
+	} else if ((imp->imp_pingable && !suppress) || force_next || force) {
+		ptlrpc_ping(imp);
+	}
+}
+
+/*
+ * Body of the pinger thread; @arg is the struct ptlrpc_thread that
+ * controls it.
+ *
+ * After flagging itself SVC_RUNNING, the thread loops: under pinger_mutex
+ * it runs the callback of every registered timeout item and processes every
+ * import on pinger_imports, then sleeps until the next wake-up time or
+ * until it is stopped or poked with SVC_EVENT.  On exit it flags itself
+ * SVC_STOPPED and wakes whoever waits on t_ctl_waitq.
+ *
+ * NOTE(review): the stop flag is only tested after the sleep, inside the
+ * "time_to_next_wake > 0" branch; when the computed wake-up time is not
+ * positive the loop starts another pass immediately without checking it.
+ */
+static int ptlrpc_pinger_main(void *arg)
+{
+	struct ptlrpc_thread *thread = (struct ptlrpc_thread *)arg;
+	ENTRY;
+
+	/* Record that the thread is running */
+	thread_set_flags(thread, SVC_RUNNING);
+	wake_up(&thread->t_ctl_waitq);
+
+	/* And now, loop forever, pinging as needed. */
+	while (1) {
+		cfs_time_t this_ping = cfs_time_current();
+		struct l_wait_info lwi;
+		cfs_duration_t time_to_next_wake;
+		struct timeout_item *item;
+		struct list_head *iter;
+
+		mutex_lock(&pinger_mutex);
+		/* callbacks run with pinger_mutex held */
+		list_for_each_entry(item, &timeout_list, ti_chain) {
+			item->ti_cb(item, item->ti_cb_data);
+		}
+		list_for_each(iter, &pinger_imports) {
+			struct obd_import *imp =
+				list_entry(iter, struct obd_import,
+					       imp_pinger_chain);
+
+			ptlrpc_pinger_process_import(imp, this_ping);
+			/* obd_timeout might have changed */
+			if (imp->imp_pingable && imp->imp_next_ping &&
+			    cfs_time_after(imp->imp_next_ping,
+					   cfs_time_add(this_ping,
+							cfs_time_seconds(PING_INTERVAL))))
+				ptlrpc_update_next_ping(imp, 0);
+		}
+		mutex_unlock(&pinger_mutex);
+		/* update memory usage info */
+		obd_update_maxusage();
+
+		/* Wait until the next ping time, or until we're stopped. */
+		time_to_next_wake = pinger_check_timeout(this_ping);
+		/* The ping sent by ptlrpc_send_rpc may get sent out
+		   say .01 second after this.
+		   ptlrpc_pinger_sending_on_import will then set the
+		   next ping time to next_ping + .01 sec, which means
+		   we will SKIP the next ping at next_ping, and the
+		   ping will get sent 2 timeouts from now!  Beware. */
+		CDEBUG(D_INFO, "next wakeup in "CFS_DURATION_T" ("
+		       CFS_TIME_T")\n", time_to_next_wake,
+		       cfs_time_add(this_ping,cfs_time_seconds(PING_INTERVAL)));
+		if (time_to_next_wake > 0) {
+			/* sleep for at least one second */
+			lwi = LWI_TIMEOUT(max_t(cfs_duration_t,
+						time_to_next_wake,
+						cfs_time_seconds(1)),
+					  NULL, NULL);
+			l_wait_event(thread->t_ctl_waitq,
+				     thread_is_stopping(thread) ||
+				     thread_is_event(thread),
+				     &lwi);
+			if (thread_test_and_clear_flags(thread, SVC_STOPPING)) {
+				EXIT;
+				break;
+			} else {
+				/* woken after adding import to reset timer */
+				thread_test_and_clear_flags(thread, SVC_EVENT);
+			}
+		}
+	}
+
+	thread_set_flags(thread, SVC_STOPPED);
+	wake_up(&thread->t_ctl_waitq);
+
+	CDEBUG(D_NET, "pinger thread exiting, process %d\n", current_pid());
+	return 0;
+}
+
+static struct ptlrpc_thread *pinger_thread = NULL;
+
+/*
+ * Allocate the pinger control structure, start the "ll_ping" thread and
+ * wait until it reports that it is running.
+ *
+ * Returns 0 on success, -EALREADY if a pinger already exists, -ENOMEM if
+ * the control structure cannot be allocated, or the error reported by
+ * kthread_run().
+ */
+int ptlrpc_start_pinger(void)
+{
+	struct l_wait_info lwi = { 0 };
+	struct task_struct *task;
+	int rc;
+	ENTRY;
+
+	if (pinger_thread != NULL)
+		RETURN(-EALREADY);
+
+	OBD_ALLOC_PTR(pinger_thread);
+	if (pinger_thread == NULL)
+		RETURN(-ENOMEM);
+	init_waitqueue_head(&pinger_thread->t_ctl_waitq);
+	init_waitqueue_head(&suspend_timeouts_waitq);
+
+	strcpy(pinger_thread->t_name, "ll_ping");
+
+	/*
+	 * Test the returned pointer itself with IS_ERR(): squeezing a valid
+	 * task pointer through an int first could make it look like an
+	 * error code.
+	 */
+	task = kthread_run(ptlrpc_pinger_main, pinger_thread,
+			   pinger_thread->t_name);
+	if (IS_ERR(task)) {
+		rc = PTR_ERR(task);
+		CERROR("cannot start thread: %d\n", rc);
+		OBD_FREE(pinger_thread, sizeof(*pinger_thread));
+		pinger_thread = NULL;
+		RETURN(rc);
+	}
+	l_wait_event(pinger_thread->t_ctl_waitq,
+		     thread_is_running(pinger_thread), &lwi);
+
+	if (suppress_pings)
+		CWARN("Pings will be suppressed at the request of the "
+		      "administrator.  The configuration shall meet the "
+		      "additional requirements described in the manual.  "
+		      "(Search for the \"suppress_pings\" kernel module "
+		      "parameter.)\n");
+
+	RETURN(0);
+}
+
+int ptlrpc_pinger_remove_timeouts(void);
+
+/*
+ * Stop the pinger thread: drop the registered timeouts, flag the thread as
+ * stopping, wait until it reports that it has stopped, then free its
+ * control structure.
+ *
+ * Returns 0, or -EALREADY if no pinger is running.
+ */
+int ptlrpc_stop_pinger(void)
+{
+	struct l_wait_info lwi = { 0 };
+	ENTRY;
+
+	if (!pinger_thread)
+		RETURN(-EALREADY);
+
+	ptlrpc_pinger_remove_timeouts();
+
+	/* request the stop under pinger_mutex */
+	mutex_lock(&pinger_mutex);
+	thread_set_flags(pinger_thread, SVC_STOPPING);
+	wake_up(&pinger_thread->t_ctl_waitq);
+	mutex_unlock(&pinger_mutex);
+
+	l_wait_event(pinger_thread->t_ctl_waitq,
+		     thread_is_stopped(pinger_thread), &lwi);
+
+	OBD_FREE_PTR(pinger_thread);
+	pinger_thread = NULL;
+	RETURN(0);
+}
+
+/*
+ * Reschedule the next ping of @imp at the regular interval
+ * (ptlrpc_update_next_ping() with soon == 0).
+ */
+void ptlrpc_pinger_sending_on_import(struct obd_import *imp)
+{
+	ptlrpc_update_next_ping(imp, 0);
+}
+EXPORT_SYMBOL(ptlrpc_pinger_sending_on_import);
+
+/*
+ * Schedule an early ping of @imp (short interval) and, if a regular ping
+ * may not be sent, force one on the next pinger pass.
+ *
+ * Must be called with imp->imp_lock held.  The check uses
+ * assert_spin_locked() rather than LASSERT(spin_is_locked()), because
+ * spin_is_locked() is not a reliable assertion on every kernel
+ * configuration.
+ */
+void ptlrpc_pinger_commit_expected(struct obd_import *imp)
+{
+	ptlrpc_update_next_ping(imp, 1);
+	assert_spin_locked(&imp->imp_lock);
+	/*
+	 * Avoid reading stale imp_connect_data.  When not sure if pings are
+	 * expected or not on next connection, we assume they are not and force
+	 * one anyway to guarantee the chance of updating
+	 * imp_peer_committed_transno.
+	 */
+	if (imp->imp_state != LUSTRE_IMP_FULL ||
+	    OCD_HAS_FLAG(&imp->imp_connect_data, PINGLESS))
+		imp->imp_force_next_verify = 1;
+}
+
+/*
+ * Put @imp on the pinger's import list.
+ *
+ * Under pinger_mutex this enables recovery on the import's obd, schedules
+ * its next ping, appends it to pinger_imports, takes an import reference
+ * and wakes the pinger thread.
+ *
+ * Returns 0, or -EALREADY if the import is already on a pinger chain.
+ * NOTE(review): the list_empty() test runs before pinger_mutex is taken.
+ */
+int ptlrpc_pinger_add_import(struct obd_import *imp)
+{
+	ENTRY;
+	if (!list_empty(&imp->imp_pinger_chain))
+		RETURN(-EALREADY);
+
+	mutex_lock(&pinger_mutex);
+	CDEBUG(D_HA, "adding pingable import %s->%s\n",
+	       imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd));
+	/* if we add to pinger we want recovery on this import */
+	imp->imp_obd->obd_no_recov = 0;
+	ptlrpc_update_next_ping(imp, 0);
+	/* XXX sort, blah blah */
+	list_add_tail(&imp->imp_pinger_chain, &pinger_imports);
+	class_import_get(imp);
+
+	ptlrpc_pinger_wake_up();
+	mutex_unlock(&pinger_mutex);
+
+	RETURN(0);
+}
+EXPORT_SYMBOL(ptlrpc_pinger_add_import);
+
+/*
+ * Take @imp off the pinger's import list.
+ *
+ * Under pinger_mutex this unlinks the import, disables recovery on its obd
+ * and drops an import reference with class_import_put().
+ *
+ * Returns 0, or -ENOENT if the import is not on a pinger chain.
+ * NOTE(review): the list_empty() test runs before pinger_mutex is taken.
+ */
+int ptlrpc_pinger_del_import(struct obd_import *imp)
+{
+	ENTRY;
+	if (list_empty(&imp->imp_pinger_chain))
+		RETURN(-ENOENT);
+
+	mutex_lock(&pinger_mutex);
+	list_del_init(&imp->imp_pinger_chain);
+	CDEBUG(D_HA, "removing pingable import %s->%s\n",
+	       imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd));
+	/* if we remove from pinger we don't want recovery on this import */
+	imp->imp_obd->obd_no_recov = 1;
+	class_import_put(imp);
+	mutex_unlock(&pinger_mutex);
+	RETURN(0);
+}
+EXPORT_SYMBOL(ptlrpc_pinger_del_import);
+
+/**
+ * Allocate and initialise a timeout item carrying the callback \a cb and
+ * its argument \a data, for timeout value \a time and event \a event.
+ * The item is not linked into any list here.
+ *
+ * Returns the new item, or NULL if the allocation failed.
+ */
+struct timeout_item *ptlrpc_new_timeout(int time, enum timeout_event event,
+					 timeout_cb_t cb, void *data)
+{
+	struct timeout_item *item;
+
+	OBD_ALLOC_PTR(item);
+	if (item == NULL)
+		return NULL;
+
+	item->ti_timeout = time;
+	item->ti_event = event;
+	item->ti_cb = cb;
+	item->ti_cb_data = data;
+	INIT_LIST_HEAD(&item->ti_chain);
+	INIT_LIST_HEAD(&item->ti_obd_list);
+
+	return item;
+}
+
+/**
+ * Register a timeout event on the pinger's timeout list.
+ *
+ * If an item for \a event is already on the list it is returned unchanged
+ * (\a time, \a cb and \a data are ignored in that case).  Otherwise a new
+ * item is allocated and inserted so that the list stays sorted by
+ * increasing timeout value.
+ *
+ * The caller must hold pinger_mutex.
+ *
+ * \retval the existing or newly created item, or NULL if allocation failed
+ */
+static struct timeout_item*
+ptlrpc_pinger_register_timeout(int time, enum timeout_event event,
+			       timeout_cb_t cb, void *data)
+{
+	struct timeout_item *item, *tmp;
+
+	LASSERT(mutex_is_locked(&pinger_mutex));
+
+	/* Reuse the item already registered for this event, if any. */
+	list_for_each_entry(item, &timeout_list, ti_chain)
+		if (item->ti_event == event)
+			goto out;
+
+	item = ptlrpc_new_timeout(time, event, cb, data);
+	if (item) {
+		/* Walk from the tail and insert after the last entry whose
+		 * timeout is smaller than the new one. */
+		list_for_each_entry_reverse(tmp, &timeout_list, ti_chain) {
+			if (tmp->ti_timeout < time) {
+				list_add(&item->ti_chain, &tmp->ti_chain);
+				goto out;
+			}
+		}
+		/* No smaller timeout found: the new item goes to the head. */
+		list_add(&item->ti_chain, &timeout_list);
+	}
+out:
+	return item;
+}
+
+/**
+ * Attach \a obd_list to the timeout item for \a event, creating the item
+ * (with timeout \a time, callback \a cb and callback data \a data) if no
+ * item for that event is registered yet.
+ *
+ * \retval 0		\a obd_list was attached
+ * \retval -EINVAL	no timeout item could be obtained
+ */
+int ptlrpc_add_timeout_client(int time, enum timeout_event event,
+			      timeout_cb_t cb, void *data,
+			      struct list_head *obd_list)
+{
+	struct timeout_item *item;
+	int rc = 0;
+
+	mutex_lock(&pinger_mutex);
+	item = ptlrpc_pinger_register_timeout(time, event, cb, data);
+	if (item != NULL)
+		list_add(obd_list, &item->ti_obd_list);
+	else
+		rc = -EINVAL;
+	mutex_unlock(&pinger_mutex);
+
+	return rc;
+}
+EXPORT_SYMBOL(ptlrpc_add_timeout_client);
+
+/**
+ * Detach \a obd_list from the timeout item registered for \a event and
+ * free that item once nothing is attached to it any more.
+ *
+ * Does nothing when \a obd_list is already empty.  Always returns 0.
+ */
+int ptlrpc_del_timeout_client(struct list_head *obd_list,
+			      enum timeout_event event)
+{
+	struct timeout_item *ti = NULL;
+	struct timeout_item *cur;
+
+	if (list_empty(obd_list))
+		return 0;
+
+	mutex_lock(&pinger_mutex);
+	list_del_init(obd_list);
+
+	/* Locate the timeout item registered for this event. */
+	list_for_each_entry(cur, &timeout_list, ti_chain) {
+		if (cur->ti_event != event)
+			continue;
+		ti = cur;
+		break;
+	}
+	LASSERTF(ti != NULL, "ti is NULL ! \n");
+
+	/* Once no obd is attached to the event, drop the event itself. */
+	if (list_empty(&ti->ti_obd_list)) {
+		list_del(&ti->ti_chain);
+		OBD_FREE_PTR(ti);
+	}
+	mutex_unlock(&pinger_mutex);
+
+	return 0;
+}
+EXPORT_SYMBOL(ptlrpc_del_timeout_client);
+
+/**
+ * Free every item on the timeout list.  Each item is asserted to have no
+ * obd attached.  Always returns 0.
+ */
+int ptlrpc_pinger_remove_timeouts(void)
+{
+	struct timeout_item *item;
+
+	mutex_lock(&pinger_mutex);
+	while (!list_empty(&timeout_list)) {
+		item = list_entry(timeout_list.next, struct timeout_item,
+				  ti_chain);
+		LASSERT(list_empty(&item->ti_obd_list));
+		list_del(&item->ti_chain);
+		OBD_FREE_PTR(item);
+	}
+	mutex_unlock(&pinger_mutex);
+
+	return 0;
+}
+
+/**
+ * Set SVC_EVENT on the pinger thread and wake it from its control wait
+ * queue.
+ */
+void ptlrpc_pinger_wake_up(void)
+{
+	thread_add_flags(pinger_thread, SVC_EVENT);
+	wake_up(&pinger_thread->t_ctl_waitq);
+}
+
+/* Ping evictor thread */
+#define PET_READY     1
+#define PET_TERMINATE 2
+
+static int	       pet_refcount = 0;
+static int	       pet_state;
+static wait_queue_head_t       pet_waitq;
+LIST_HEAD(pet_list);
+static DEFINE_SPINLOCK(pet_lock);
+
+/**
+ * Queue the obd device behind \a exp for the ping evictor and wake the
+ * evictor thread.
+ *
+ * \retval 0	the device is queued (or already was) and the evictor was woken
+ * \retval 1	the evictor is not in PET_READY state; nothing was queued
+ */
+int ping_evictor_wake(struct obd_export *exp)
+{
+	struct obd_device *dev;
+	int ready;
+
+	spin_lock(&pet_lock);
+	ready = (pet_state == PET_READY);
+	if (ready) {
+		dev = class_exp2obd(exp);
+		/* Take a device reference only when first queueing it. */
+		if (list_empty(&dev->obd_evict_list)) {
+			class_incref(dev, "evictor", dev);
+			list_add(&dev->obd_evict_list, &pet_list);
+		}
+	}
+	spin_unlock(&pet_lock);
+
+	/* eventually the new obd will call here again. */
+	if (!ready)
+		return 1;
+
+	wake_up(&pet_waitq);
+	return 0;
+}
+
+/**
+ * Body of the ping evictor thread.
+ *
+ * Marks the evictor PET_READY, then repeatedly waits until an obd device
+ * is queued on pet_list (or PET_TERMINATE is set).  For the first queued
+ * device it fails every export at the head of obd_exports_timed whose
+ * exp_last_request_time is older than "now - PING_EVICT_TIMEOUT", then
+ * unqueues the device and drops the "evictor" reference on it.
+ *
+ * The loop ends once PET_TERMINATE is set and pet_list is empty.
+ *
+ * \param arg	unused
+ */
+static int ping_evictor_main(void *arg)
+{
+	struct obd_device *obd;
+	struct obd_export *exp;
+	struct l_wait_info lwi = { 0 };
+	time_t expire_time;
+	ENTRY;
+
+	unshare_fs_struct();
+
+	CDEBUG(D_HA, "Starting Ping Evictor\n");
+	pet_state = PET_READY;
+	while (1) {
+		l_wait_event(pet_waitq, (!list_empty(&pet_list)) ||
+			     (pet_state == PET_TERMINATE), &lwi);
+
+		/* loop until all obd's will be removed */
+		if ((pet_state == PET_TERMINATE) && list_empty(&pet_list))
+			break;
+
+		/* We only get here when pet_list is non-empty, and the end
+		 * of this loop is the only place visible here that removes
+		 * entries from it, so the lock is not strictly necessary. */
+		spin_lock(&pet_lock);
+		obd = list_entry(pet_list.next, struct obd_device,
+				     obd_evict_list);
+		spin_unlock(&pet_lock);
+
+		expire_time = cfs_time_current_sec() - PING_EVICT_TIMEOUT;
+
+		CDEBUG(D_HA, "evicting all exports of obd %s older than %ld\n",
+		       obd->obd_name, expire_time);
+
+		/* Exports can't be deleted out of the list while we hold
+		 * the obd lock (class_unlink_export), which means we can't
+		 * lose the last ref on the export.  If they've already been
+		 * removed from the list, we won't find them here. */
+		spin_lock(&obd->obd_dev_lock);
+		while (!list_empty(&obd->obd_exports_timed)) {
+			exp = list_entry(obd->obd_exports_timed.next,
+					     struct obd_export,
+					     exp_obd_chain_timed);
+			if (expire_time > exp->exp_last_request_time) {
+				/* Pin the export, then drop obd_dev_lock
+				 * around the console message and
+				 * class_fail_export(). */
+				class_export_get(exp);
+				spin_unlock(&obd->obd_dev_lock);
+				LCONSOLE_WARN("%s: haven't heard from client %s"
+					      " (at %s) in %ld seconds. I think"
+					      " it's dead, and I am evicting"
+					      " it. exp %p, cur %ld expire %ld"
+					      " last %ld\n",
+					      obd->obd_name,
+					      obd_uuid2str(&exp->exp_client_uuid),
+					      obd_export_nid2str(exp),
+					      (long)(cfs_time_current_sec() -
+						     exp->exp_last_request_time),
+					      exp, (long)cfs_time_current_sec(),
+					      (long)expire_time,
+					      (long)exp->exp_last_request_time);
+				CDEBUG(D_HA, "Last request was at %ld\n",
+				       exp->exp_last_request_time);
+				class_fail_export(exp);
+				class_export_put(exp);
+				spin_lock(&obd->obd_dev_lock);
+			} else {
+				/* List is sorted, so everyone below is ok */
+				break;
+			}
+		}
+		spin_unlock(&obd->obd_dev_lock);
+
+		/* Unqueue the device and release the reference taken in
+		 * ping_evictor_wake(). */
+		spin_lock(&pet_lock);
+		list_del_init(&obd->obd_evict_list);
+		spin_unlock(&pet_lock);
+
+		class_decref(obd, "evictor", obd);
+	}
+	CDEBUG(D_HA, "Exiting Ping Evictor\n");
+
+	RETURN(0);
+}
+
+/**
+ * Take a user reference on the ping evictor and, for the first user,
+ * start the "ll_evictor" kernel thread.
+ *
+ * If the thread cannot be started the reference is dropped again and an
+ * error is logged; no error is returned to the caller.
+ */
+void ping_evictor_start(void)
+{
+	/* kthread_run() returns a struct task_struct pointer; the old
+	 * task_t typedef is not a kernel type. */
+	struct task_struct *task;
+
+	if (++pet_refcount > 1)
+		return;
+
+	init_waitqueue_head(&pet_waitq);
+
+	task = kthread_run(ping_evictor_main, NULL, "ll_evictor");
+	if (IS_ERR(task)) {
+		pet_refcount--;
+		CERROR("Cannot start ping evictor thread: %ld\n",
+			PTR_ERR(task));
+	}
+}
+EXPORT_SYMBOL(ping_evictor_start);
+
+/**
+ * Drop a user reference on the ping evictor; when the last one goes away,
+ * request termination and wake the evictor thread.
+ */
+void ping_evictor_stop(void)
+{
+	pet_refcount--;
+	if (pet_refcount <= 0) {
+		pet_state = PET_TERMINATE;
+		wake_up(&pet_waitq);
+	}
+}
+EXPORT_SYMBOL(ping_evictor_stop);

diff --git a/drivers/staging/lustre/lustre/ptlrpc/ptlrpc_internal.h b/drivers/staging/lustre/lustre/ptlrpc/ptlrpc_internal.h
new file mode 100644
index 0000000..ab36347
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/ptlrpc_internal.h

@@ -0,0 +1,302 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+/* Intramodule declarations for ptlrpc. */
+
+#ifndef PTLRPC_INTERNAL_H
+#define PTLRPC_INTERNAL_H
+
+#include "../ldlm/ldlm_internal.h"
+
+struct ldlm_namespace;
+struct obd_import;
+struct ldlm_res_id;
+struct ptlrpc_request_set;
+extern int test_req_buffer_pressure;
+extern struct mutex ptlrpc_all_services_mutex;
+
+int ptlrpc_start_thread(struct ptlrpc_service_part *svcpt, int wait);
+/* ptlrpcd.c */
+int ptlrpcd_start(int index, int max, const char *name, struct ptlrpcd_ctl *pc);
+
+/* client.c */
+struct ptlrpc_bulk_desc *ptlrpc_new_bulk(unsigned npages, unsigned max_brw,
+					 unsigned type, unsigned portal);
+void ptlrpc_init_xid(void);
+
+/* events.c */
+int ptlrpc_init_portals(void);
+void ptlrpc_exit_portals(void);
+
+void ptlrpc_request_handle_notconn(struct ptlrpc_request *);
+void lustre_assert_wire_constants(void);
+int ptlrpc_import_in_recovery(struct obd_import *imp);
+int ptlrpc_set_import_discon(struct obd_import *imp, __u32 conn_cnt);
+void ptlrpc_handle_failed_import(struct obd_import *imp);
+int ptlrpc_replay_next(struct obd_import *imp, int *inflight);
+void ptlrpc_initiate_recovery(struct obd_import *imp);
+
+int lustre_unpack_req_ptlrpc_body(struct ptlrpc_request *req, int offset);
+int lustre_unpack_rep_ptlrpc_body(struct ptlrpc_request *req, int offset);
+
+#ifdef LPROCFS
+void ptlrpc_lprocfs_register_service(struct proc_dir_entry *proc_entry,
+				     struct ptlrpc_service *svc);
+void ptlrpc_lprocfs_unregister_service(struct ptlrpc_service *svc);
+void ptlrpc_lprocfs_rpc_sent(struct ptlrpc_request *req, long amount);
+void ptlrpc_lprocfs_do_request_stat (struct ptlrpc_request *req,
+				     long q_usec, long work_usec);
+#else
+#define ptlrpc_lprocfs_register_service(params...) do{}while(0)
+#define ptlrpc_lprocfs_unregister_service(params...) do{}while(0)
+#define ptlrpc_lprocfs_rpc_sent(params...) do{}while(0)
+#define ptlrpc_lprocfs_do_request_stat(params...) do{}while(0)
+#endif /* LPROCFS */
+
+/* NRS */
+
+/**
+ * NRS core object.
+ *
+ * Holds NRS core fields.
+ */
+struct nrs_core {
+	/**
+	 * Protects nrs_core::nrs_policies, serializes external policy
+	 * registration/unregistration, and NRS core lprocfs operations.
+	 */
+	struct mutex nrs_mutex;
+	/**
+	 * List of all policy descriptors registered with NRS core; protected
+	 * by nrs_core::nrs_mutex.
+	 */
+	struct list_head nrs_policies;
+
+};
+
+int ptlrpc_service_nrs_setup(struct ptlrpc_service *svc);
+void ptlrpc_service_nrs_cleanup(struct ptlrpc_service *svc);
+
+void ptlrpc_nrs_req_initialize(struct ptlrpc_service_part *svcpt,
+			       struct ptlrpc_request *req, bool hp);
+void ptlrpc_nrs_req_finalize(struct ptlrpc_request *req);
+void ptlrpc_nrs_req_stop_nolock(struct ptlrpc_request *req);
+void ptlrpc_nrs_req_add(struct ptlrpc_service_part *svcpt,
+			struct ptlrpc_request *req, bool hp);
+
+struct ptlrpc_request *
+ptlrpc_nrs_req_get_nolock0(struct ptlrpc_service_part *svcpt, bool hp,
+			   bool peek, bool force);
+
+/**
+ * Wrapper around ptlrpc_nrs_req_get_nolock0() with peek = false; \a hp and
+ * \a force are passed through unchanged.
+ */
+static inline struct ptlrpc_request *
+ptlrpc_nrs_req_get_nolock(struct ptlrpc_service_part *svcpt, bool hp,
+			  bool force)
+{
+	return ptlrpc_nrs_req_get_nolock0(svcpt, hp, false, force);
+}
+
+/**
+ * Wrapper around ptlrpc_nrs_req_get_nolock0() with peek = true and
+ * force = false; \a hp is passed through unchanged.
+ */
+static inline struct ptlrpc_request *
+ptlrpc_nrs_req_peek_nolock(struct ptlrpc_service_part *svcpt, bool hp)
+{
+	return ptlrpc_nrs_req_get_nolock0(svcpt, hp, true, false);
+}
+
+void ptlrpc_nrs_req_del_nolock(struct ptlrpc_request *req);
+bool ptlrpc_nrs_req_pending_nolock(struct ptlrpc_service_part *svcpt, bool hp);
+
+int ptlrpc_nrs_policy_control(const struct ptlrpc_service *svc,
+			      enum ptlrpc_nrs_queue_type queue, char *name,
+			      enum ptlrpc_nrs_ctl opc, bool single, void *arg);
+
+int ptlrpc_nrs_init(void);
+void ptlrpc_nrs_fini(void);
+
+/** True when \a svcpt has a high-priority NRS head (scp_nrs_hp != NULL). */
+static inline bool nrs_svcpt_has_hp(const struct ptlrpc_service_part *svcpt)
+{
+	return svcpt->scp_nrs_hp != NULL;
+}
+
+/** True when the first partition of \a svc has a high-priority NRS head. */
+static inline bool nrs_svc_has_hp(const struct ptlrpc_service *svc)
+{
+	/**
+	 * If the first service partition has an HP NRS head, all service
+	 * partitions will.
+	 */
+	return nrs_svcpt_has_hp(svc->srv_parts[0]);
+}
+
+/**
+ * Return the NRS head of \a svcpt: the high-priority head when \a hp is
+ * set, otherwise the regular head.  Asserts that a high-priority head
+ * exists whenever one is requested.
+ */
+static inline
+struct ptlrpc_nrs *nrs_svcpt2nrs(struct ptlrpc_service_part *svcpt, bool hp)
+{
+	LASSERT(ergo(hp, nrs_svcpt_has_hp(svcpt)));
+	return hp ? svcpt->scp_nrs_hp : &svcpt->scp_nrs_reg;
+}
+
+/** Return scp_cpt of the service partition that \a policy belongs to. */
+static inline int nrs_pol2cptid(const struct ptlrpc_nrs_policy *policy)
+{
+	return policy->pol_nrs->nrs_svcpt->scp_cpt;
+}
+
+/** Return the service that \a policy belongs to. */
+static inline
+struct ptlrpc_service *nrs_pol2svc(struct ptlrpc_nrs_policy *policy)
+{
+	return policy->pol_nrs->nrs_svcpt->scp_service;
+}
+
+/** Return the service partition that \a policy belongs to. */
+static inline
+struct ptlrpc_service_part *nrs_pol2svcpt(struct ptlrpc_nrs_policy *policy)
+{
+	return policy->pol_nrs->nrs_svcpt;
+}
+
+/** Return srv_cptable of the service that \a policy belongs to. */
+static inline
+struct cfs_cpt_table *nrs_pol2cptab(struct ptlrpc_nrs_policy *policy)
+{
+	return nrs_pol2svc(policy)->srv_cptable;
+}
+
+/**
+ * Return the resource at index nr_res_idx of \a nrq's nr_res_ptrs array.
+ * Asserts that \a nrq has been initialized and not yet finalized.
+ */
+static inline struct ptlrpc_nrs_resource *
+nrs_request_resource(struct ptlrpc_nrs_request *nrq)
+{
+	LASSERT(nrq->nr_initialized);
+	LASSERT(!nrq->nr_finalized);
+
+	return nrq->nr_res_ptrs[nrq->nr_res_idx];
+}
+
+/** Return res_policy of the resource nrs_request_resource() gives for \a nrq. */
+static inline
+struct ptlrpc_nrs_policy *nrs_request_policy(struct ptlrpc_nrs_request *nrq)
+{
+	return nrs_request_resource(nrq)->res_policy;
+}
+
+#define NRS_LPROCFS_QUANTUM_NAME_REG	"reg_quantum:"
+#define NRS_LPROCFS_QUANTUM_NAME_HP	"hp_quantum:"
+
+/**
+ * the maximum size of nrs_crrn_client::cc_quantum and nrs_orr_data::od_quantum.
+ */
+#define LPROCFS_NRS_QUANTUM_MAX		65535
+
+/**
+ * Max valid command string is the size of the labels, plus "65535" twice, plus
+ * a separating space character.
+ */
+#define LPROCFS_NRS_WR_QUANTUM_MAX_CMD					       \
+ sizeof(NRS_LPROCFS_QUANTUM_NAME_REG __stringify(LPROCFS_NRS_QUANTUM_MAX) " "  \
+	NRS_LPROCFS_QUANTUM_NAME_HP __stringify(LPROCFS_NRS_QUANTUM_MAX))
+
+/* recovd_thread.c */
+
+int ptlrpc_expire_one_request(struct ptlrpc_request *req, int async_unlink);
+
+/* pers.c */
+void ptlrpc_fill_bulk_md(lnet_md_t *md, struct ptlrpc_bulk_desc *desc,
+			 int mdcnt);
+void ptlrpc_add_bulk_page(struct ptlrpc_bulk_desc *desc, struct page *page,
+			  int pageoffset, int len);
+
+/* pack_generic.c */
+struct ptlrpc_reply_state *
+lustre_get_emerg_rs(struct ptlrpc_service_part *svcpt);
+void lustre_put_emerg_rs(struct ptlrpc_reply_state *rs);
+
+/* pinger.c */
+int ptlrpc_start_pinger(void);
+int ptlrpc_stop_pinger(void);
+void ptlrpc_pinger_sending_on_import(struct obd_import *imp);
+void ptlrpc_pinger_commit_expected(struct obd_import *imp);
+void ptlrpc_pinger_wake_up(void);
+void ptlrpc_ping_import_soon(struct obd_import *imp);
+int ping_evictor_wake(struct obd_export *exp);
+
+/* sec_null.c */
+int  sptlrpc_null_init(void);
+void sptlrpc_null_fini(void);
+
+/* sec_plain.c */
+int  sptlrpc_plain_init(void);
+void sptlrpc_plain_fini(void);
+
+/* sec_bulk.c */
+int  sptlrpc_enc_pool_init(void);
+void sptlrpc_enc_pool_fini(void);
+int sptlrpc_proc_enc_pool_seq_show(struct seq_file *m, void *v);
+
+/* sec_lproc.c */
+int  sptlrpc_lproc_init(void);
+void sptlrpc_lproc_fini(void);
+
+/* sec_gc.c */
+int sptlrpc_gc_init(void);
+void sptlrpc_gc_fini(void);
+
+/* sec_config.c */
+void sptlrpc_conf_choose_flavor(enum lustre_sec_part from,
+				enum lustre_sec_part to,
+				struct obd_uuid *target,
+				lnet_nid_t nid,
+				struct sptlrpc_flavor *sf);
+int  sptlrpc_conf_init(void);
+void sptlrpc_conf_fini(void);
+
+/* sec.c */
+int  sptlrpc_init(void);
+void sptlrpc_fini(void);
+
+/** Return non-zero when \a rc is -ENOTCONN or -ENODEV, zero otherwise. */
+static inline int ll_rpc_recoverable_error(int rc)
+{
+	return (rc == -ENOTCONN || rc == -ENODEV);
+}
+
+/** Stub: does nothing and reports success. */
+static inline int tgt_mod_init(void)
+{
+	return 0;
+}
+
+/** Stub: does nothing. */
+static inline void tgt_mod_exit(void)
+{
+	return;
+}
+
+/**
+ * Drop one reference on \a set (set_refcount) and free the set when the
+ * count reaches zero.
+ */
+static inline void ptlrpc_reqset_put(struct ptlrpc_request_set *set)
+{
+	if (atomic_dec_and_test(&set->set_refcount))
+		OBD_FREE_PTR(set);
+}
+#endif /* PTLRPC_INTERNAL_H */

diff --git a/drivers/staging/lustre/lustre/ptlrpc/ptlrpc_module.c b/drivers/staging/lustre/lustre/ptlrpc/ptlrpc_module.c
new file mode 100644
index 0000000..f6ea80f
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/ptlrpc_module.c

@@ -0,0 +1,154 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+
+
+#include <obd_support.h>
+#include <obd_class.h>
+#include <lustre_net.h>
+#include <lustre_req_layout.h>
+
+#include "ptlrpc_internal.h"
+
+extern spinlock_t ptlrpc_last_xid_lock;
+#if RS_DEBUG
+extern spinlock_t ptlrpc_rs_debug_lock;
+#endif
+extern struct mutex pinger_mutex;
+extern struct mutex ptlrpcd_mutex;
+
+/**
+ * Module init: bring up the ptlrpc subsystems in order.  If a step fails,
+ * the steps that already succeeded are undone in reverse order and the
+ * failing step's error is returned.
+ *
+ * cleanup_phase records how far initialisation got; the cleanup switch
+ * enters at that phase and deliberately falls through the lower ones.
+ *
+ * \retval 0 on success, otherwise the error from the failing step
+ */
+__init int ptlrpc_init(void)
+{
+	int rc, cleanup_phase = 0;
+	ENTRY;
+
+	lustre_assert_wire_constants();
+#if RS_DEBUG
+	spin_lock_init(&ptlrpc_rs_debug_lock);
+#endif
+	mutex_init(&ptlrpc_all_services_mutex);
+	mutex_init(&pinger_mutex);
+	mutex_init(&ptlrpcd_mutex);
+	ptlrpc_init_xid();
+
+	rc = req_layout_init();
+	if (rc)
+		RETURN(rc);
+
+	rc = ptlrpc_hr_init();
+	if (rc) {
+		/* Only req_layout_init() has succeeded so far; undo it
+		 * instead of returning with it still set up. */
+		req_layout_fini();
+		RETURN(rc);
+	}
+
+	cleanup_phase = 1;
+
+	rc = ptlrpc_init_portals();
+	if (rc)
+		GOTO(cleanup, rc);
+	cleanup_phase = 2;
+
+	rc = ptlrpc_connection_init();
+	if (rc)
+		GOTO(cleanup, rc);
+	cleanup_phase = 3;
+
+	ptlrpc_put_connection_superhack = ptlrpc_connection_put;
+
+	rc = ptlrpc_start_pinger();
+	if (rc)
+		GOTO(cleanup, rc);
+	cleanup_phase = 4;
+
+	rc = ldlm_init();
+	if (rc)
+		GOTO(cleanup, rc);
+	cleanup_phase = 5;
+
+	rc = sptlrpc_init();
+	if (rc)
+		GOTO(cleanup, rc);
+
+	cleanup_phase = 7;
+	rc = ptlrpc_nrs_init();
+	if (rc)
+		GOTO(cleanup, rc);
+
+	cleanup_phase = 8;
+	rc = tgt_mod_init();
+	if (rc)
+		GOTO(cleanup, rc);
+	RETURN(0);
+
+cleanup:
+	switch(cleanup_phase) {
+	case 8:
+		ptlrpc_nrs_fini();
+		/* fall through */
+	case 7:
+		sptlrpc_fini();
+		/* fall through */
+	case 5:
+		ldlm_exit();
+		/* fall through */
+	case 4:
+		ptlrpc_stop_pinger();
+		/* fall through */
+	case 3:
+		ptlrpc_connection_fini();
+		/* fall through */
+	case 2:
+		ptlrpc_exit_portals();
+		/* fall through */
+	case 1:
+		ptlrpc_hr_fini();
+		req_layout_fini();
+		/* fall through */
+	default: ;
+	}
+
+	return rc;
+}
+
+/**
+ * Module exit: tear down the subsystems that ptlrpc_init() brought up.
+ */
+static void __exit ptlrpc_exit(void)
+{
+	tgt_mod_exit();
+	ptlrpc_nrs_fini();
+	sptlrpc_fini();
+	ldlm_exit();
+	ptlrpc_stop_pinger();
+	ptlrpc_exit_portals();
+	ptlrpc_hr_fini();
+	ptlrpc_connection_fini();
+	/* req_layout_init() was the first step of ptlrpc_init(), so undo it
+	 * last, as the init error path already does. */
+	req_layout_fini();
+}
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Lustre Request Processor and Lock Management");
+MODULE_LICENSE("GPL");
+
+cfs_module(ptlrpc, "1.0.0", ptlrpc_init, ptlrpc_exit);

diff --git a/drivers/staging/lustre/lustre/ptlrpc/ptlrpcd.c b/drivers/staging/lustre/lustre/ptlrpc/ptlrpcd.c
new file mode 100644
index 0000000..5a66a1b
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/ptlrpcd.c

@@ -0,0 +1,827 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/ptlrpcd.c
+ */
+
+/** \defgroup ptlrpcd PortalRPC daemon
+ *
+ * ptlrpcd is a special thread with its own set where other user might add
+ * requests when they don't want to wait for their completion.
+ * PtlRPCD will take care of sending such requests and then processing their
+ * replies and calling completion callbacks as necessary.
+ * The callbacks are called directly from ptlrpcd context.
+ * It is important to never significantly block (esp. on RPCs!) within such
+ * completion handler or a deadlock might occur where ptlrpcd enters some
+ * callback that attempts to send another RPC and wait for it to return,
+ * during which time ptlrpcd is completely blocked, so e.g. if import
+ * fails, recovery cannot progress because connection requests are also
+ * sent by ptlrpcd.
+ *
+ * @{
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+
+# include <linux/libcfs/libcfs.h>
+
+#include <lustre_net.h>
+# include <lustre_lib.h>
+
+#include <lustre_ha.h>
+#include <obd_class.h>   /* for obd_zombie */
+#include <obd_support.h> /* for OBD_FAIL_CHECK */
+#include <cl_object.h> /* cl_env_{get,put}() */
+#include <lprocfs_status.h>
+
+#include "ptlrpc_internal.h"
+
+/** Container for the ptlrpcd thread control blocks. */
+struct ptlrpcd {
+	/* presumably the allocated size of this structure - TODO confirm */
+	int		pd_size;
+	/* index last picked by the round-robin case in ptlrpcd_select_pc() */
+	int		pd_index;
+	/* modulus ptlrpcd_select_pc() applies when indexing pd_threads */
+	int		pd_nthreads;
+	/* control block ptlrpcd_select_pc() returns for requests whose
+	 * rq_send_state is not LUSTRE_IMP_FULL */
+	struct ptlrpcd_ctl pd_thread_rcv;
+	/* trailing array of per-thread control blocks */
+	struct ptlrpcd_ctl pd_threads[0];
+};
+
+static int max_ptlrpcds;
+CFS_MODULE_PARM(max_ptlrpcds, "i", int, 0644,
+		"Max ptlrpcd thread count to be started.");
+
+static int ptlrpcd_bind_policy = PDB_POLICY_PAIR;
+CFS_MODULE_PARM(ptlrpcd_bind_policy, "i", int, 0644,
+		"Ptlrpcd threads binding mode.");
+static struct ptlrpcd *ptlrpcds;
+
+struct mutex ptlrpcd_mutex;
+static int ptlrpcd_users = 0;
+
+/**
+ * Wake up waiters on the request set that \a req belongs to.  The request
+ * must be attached to a set.
+ */
+void ptlrpcd_wake(struct ptlrpc_request *req)
+{
+	struct ptlrpc_request_set *rq_set;
+
+	rq_set = req->rq_set;
+	LASSERT(rq_set != NULL);
+	wake_up(&rq_set->set_waitq);
+}
+EXPORT_SYMBOL(ptlrpcd_wake);
+
+/**
+ * Pick the ptlrpcd control block that should handle \a req.
+ *
+ * A request whose rq_send_state is not LUSTRE_IMP_FULL always goes to
+ * pd_thread_rcv.  Otherwise \a policy selects an entry of pd_threads:
+ * PDL_POLICY_SAME uses the current CPU id, PDL_POLICY_PREFERRED uses
+ * \a index when it is in range, and every other case (including
+ * PDL_POLICY_LOCAL, an out-of-range \a index and unknown policies) falls
+ * through to the round-robin selection, which also updates pd_index.
+ *
+ * \param req	request being queued; may be NULL
+ */
+static struct ptlrpcd_ctl *
+ptlrpcd_select_pc(struct ptlrpc_request *req, pdl_policy_t policy, int index)
+{
+	int idx = 0;
+
+	if (req != NULL && req->rq_send_state != LUSTRE_IMP_FULL)
+		return &ptlrpcds->pd_thread_rcv;
+
+	switch (policy) {
+	case PDL_POLICY_SAME:
+		idx = smp_processor_id() % ptlrpcds->pd_nthreads;
+		break;
+	case PDL_POLICY_LOCAL:
+		/* Before CPU partition patches available, process it the same
+		 * as "PDL_POLICY_ROUND". */
+# ifdef CFS_CPU_MODE_NUMA
+# warning "fix this code to use new CPU partition APIs"
+# endif
+		/* Fall through to PDL_POLICY_ROUND until the CPU
+		 * partition patches are available. */
+		index = -1;
+	case PDL_POLICY_PREFERRED:
+		if (index >= 0 && index < num_online_cpus()) {
+			idx = index % ptlrpcds->pd_nthreads;
+			break;
+		}
+		/* Fall through to PDL_POLICY_ROUND for bad index. */
+	default:
+		/* Fall through to PDL_POLICY_ROUND for unknown policy. */
+	case PDL_POLICY_ROUND:
+		/* We do not care whether it is strict load balance. */
+		idx = ptlrpcds->pd_index + 1;
+		/* Step past the slot whose index equals the current CPU id. */
+		if (idx == smp_processor_id())
+			idx++;
+		idx %= ptlrpcds->pd_nthreads;
+		ptlrpcds->pd_index = idx;
+		break;
+	}
+
+	return &ptlrpcds->pd_threads[idx];
+}
+
+/**
+ * Move all requests from an existing request set to the ptlrpcd queue.
+ * All requests from the set must be in phase RQ_PHASE_NEW.
+ *
+ * The target ptlrpcd is chosen with PDL_POLICY_LOCAL.  On return \a set
+ * has an empty request list and a set_remaining count of zero.
+ */
+void ptlrpcd_add_rqset(struct ptlrpc_request_set *set)
+{
+	struct list_head *tmp, *pos;
+	struct ptlrpcd_ctl *pc;
+	struct ptlrpc_request_set *new;
+	int count, i;
+
+	pc = ptlrpcd_select_pc(NULL, PDL_POLICY_LOCAL, -1);
+	new = pc->pc_set;
+
+	/* Re-home every request onto the ptlrpcd set and stamp its queue
+	 * time before the list itself is spliced over. */
+	list_for_each_safe(pos, tmp, &set->set_requests) {
+		struct ptlrpc_request *req =
+			list_entry(pos, struct ptlrpc_request,
+				       rq_set_chain);
+
+		LASSERT(req->rq_phase == RQ_PHASE_NEW);
+		req->rq_set = new;
+		req->rq_queued_time = cfs_time_current();
+	}
+
+	spin_lock(&new->set_new_req_lock);
+	list_splice_init(&set->set_requests, &new->set_new_requests);
+	i = atomic_read(&set->set_remaining);
+	count = atomic_add_return(i, &new->set_new_count);
+	atomic_set(&set->set_remaining, 0);
+	spin_unlock(&new->set_new_req_lock);
+	/* count == i means set_new_count was zero before this add, i.e. the
+	 * target had no new requests pending; only then issue wakeups. */
+	if (count == i) {
+		wake_up(&new->set_waitq);
+
+		/* XXX: It maybe unnecessary to wakeup all the partners. But to
+		 *      guarantee the async RPC can be processed ASAP, we have
+		 *      no other better choice. It maybe fixed in future. */
+		for (i = 0; i < pc->pc_npartners; i++)
+			wake_up(&pc->pc_partners[i]->pc_set->set_waitq);
+	}
+}
+EXPORT_SYMBOL(ptlrpcd_add_rqset);
+
+/**
+ * Move every new request queued on \a src onto the request list of
+ * \a des, transferring the matching count from src's set_new_count to
+ * des's set_remaining.
+ *
+ * \retval number of requests transferred (0 when \a src had none queued)
+ */
+static int ptlrpcd_steal_rqset(struct ptlrpc_request_set *des,
+			       struct ptlrpc_request_set *src)
+{
+	struct ptlrpc_request *req;
+	int moved = 0;
+
+	spin_lock(&src->set_new_req_lock);
+	if (unlikely(list_empty(&src->set_new_requests)))
+		goto unlock;
+
+	/* Point each request at its new owner before splicing the list. */
+	list_for_each_entry(req, &src->set_new_requests, rq_set_chain)
+		req->rq_set = des;
+
+	list_splice_init(&src->set_new_requests, &des->set_requests);
+	moved = atomic_read(&src->set_new_count);
+	atomic_add(moved, &des->set_remaining);
+	atomic_set(&src->set_new_count, 0);
+unlock:
+	spin_unlock(&src->set_new_req_lock);
+	return moved;
+}
+
+/**
+ * Requests that are added to the ptlrpcd queue are sent via
+ * ptlrpcd_check->ptlrpc_check_set().
+ *
+ * \param req		request to hand over to a ptlrpcd thread
+ * \param policy	thread selection policy, see ptlrpcd_select_pc()
+ * \param idx		index hint passed on to ptlrpcd_select_pc()
+ */
+void ptlrpcd_add_req(struct ptlrpc_request *req, pdl_policy_t policy, int idx)
+{
+	struct ptlrpcd_ctl *pc;
+
+	if (req->rq_reqmsg)
+		lustre_msg_set_jobid(req->rq_reqmsg, NULL);
+
+	spin_lock(&req->rq_lock);
+	if (req->rq_invalid_rqset) {
+		struct l_wait_info lwi = LWI_TIMEOUT(cfs_time_seconds(5),
+						     back_to_sleep, NULL);
+
+		/* Clear the flag, then wait for rq_set to become NULL before
+		 * the request is queued again below. */
+		req->rq_invalid_rqset = 0;
+		spin_unlock(&req->rq_lock);
+		l_wait_event(req->rq_set_waitq, (req->rq_set == NULL), &lwi);
+	} else if (req->rq_set) {
+		/* If we have a valid "rq_set", just reuse it to avoid double
+		 * linked. */
+		LASSERT(req->rq_phase == RQ_PHASE_NEW);
+		LASSERT(req->rq_send_state == LUSTRE_IMP_REPLAY);
+
+		/* ptlrpc_check_set will decrease the count */
+		atomic_inc(&req->rq_set->set_remaining);
+		spin_unlock(&req->rq_lock);
+		wake_up(&req->rq_set->set_waitq);
+		return;
+	} else {
+		spin_unlock(&req->rq_lock);
+	}
+
+	pc = ptlrpcd_select_pc(req, policy, idx);
+
+	DEBUG_REQ(D_INFO, req, "add req [%p] to pc [%s:%d]",
+		  req, pc->pc_name, pc->pc_index);
+
+	ptlrpc_set_add_new_req(pc, req);
+}
+EXPORT_SYMBOL(ptlrpcd_add_req);
+
+/** Take one reference on \a set by incrementing set_refcount. */
+static inline void ptlrpc_reqset_get(struct ptlrpc_request_set *set)
+{
+	atomic_inc(&set->set_refcount);
+}
+
+/**
+ * Check if there is more work to do on ptlrpcd set.
+ * Returns 1 if yes.
+ *
+ * In order: moves newly queued requests into the set, refills \a env,
+ * runs ptlrpc_check_set() if requests remain, prunes completed requests,
+ * and, if nothing was found to do, tries to take new requests from one of
+ * the partner threads.  The return value can also be a positive request
+ * count (new requests pending, or requests taken from a partner).
+ */
+static int ptlrpcd_check(struct lu_env *env, struct ptlrpcd_ctl *pc)
+{
+	struct list_head *tmp, *pos;
+	struct ptlrpc_request *req;
+	struct ptlrpc_request_set *set = pc->pc_set;
+	int rc = 0;
+	int rc2;
+	ENTRY;
+
+	if (atomic_read(&set->set_new_count)) {
+		spin_lock(&set->set_new_req_lock);
+		if (likely(!list_empty(&set->set_new_requests))) {
+			list_splice_init(&set->set_new_requests,
+					     &set->set_requests);
+			atomic_add(atomic_read(&set->set_new_count),
+				       &set->set_remaining);
+			atomic_set(&set->set_new_count, 0);
+			/*
+			 * Need to calculate its timeout.
+			 */
+			rc = 1;
+		}
+		spin_unlock(&set->set_new_req_lock);
+	}
+
+	/* We should call lu_env_refill() before handling new requests to make
+	 * sure that env key the requests depending on really exists.
+	 */
+	rc2 = lu_env_refill(env);
+	if (rc2 != 0) {
+		/*
+		 * XXX This is very awkward situation, because
+		 * execution can neither continue (request
+		 * interpreters assume that env is set up), nor repeat
+		 * the loop (as this potentially results in a tight
+		 * loop of -ENOMEM's).
+		 *
+		 * Fortunately, refill only ever does something when
+		 * new modules are loaded, i.e., early during boot up.
+		 */
+		CERROR("Failure to refill session: %d\n", rc2);
+		RETURN(rc);
+	}
+
+	if (atomic_read(&set->set_remaining))
+		rc |= ptlrpc_check_set(env, set);
+
+	if (!list_empty(&set->set_requests)) {
+		/*
+		 * XXX: our set never completes, so we prune the completed
+		 * reqs after each iteration. boy could this be smarter.
+		 */
+		list_for_each_safe(pos, tmp, &set->set_requests) {
+			req = list_entry(pos, struct ptlrpc_request,
+					     rq_set_chain);
+			if (req->rq_phase != RQ_PHASE_COMPLETE)
+				continue;
+
+			list_del_init(&req->rq_set_chain);
+			req->rq_set = NULL;
+			ptlrpc_req_finished(req);
+		}
+	}
+
+	if (rc == 0) {
+		/*
+		 * If new requests have been added, make sure to wake up.
+		 */
+		rc = atomic_read(&set->set_new_count);
+
+		/* If we have nothing to do, check whether we can take some
+		 * work from our partner threads. */
+		if (rc == 0 && pc->pc_npartners > 0) {
+			struct ptlrpcd_ctl *partner;
+			struct ptlrpc_request_set *ps;
+			int first = pc->pc_cursor;
+
+			/* Visit partners starting at pc_cursor, wrapping
+			 * around, until work is found or we are back at the
+			 * starting position. */
+			do {
+				partner = pc->pc_partners[pc->pc_cursor++];
+				if (pc->pc_cursor >= pc->pc_npartners)
+					pc->pc_cursor = 0;
+				if (partner == NULL)
+					continue;
+
+				/* Hold a reference on the partner's set while
+				 * it is used outside pc_lock. */
+				spin_lock(&partner->pc_lock);
+				ps = partner->pc_set;
+				if (ps == NULL) {
+					spin_unlock(&partner->pc_lock);
+					continue;
+				}
+
+				ptlrpc_reqset_get(ps);
+				spin_unlock(&partner->pc_lock);
+
+				if (atomic_read(&ps->set_new_count)) {
+					rc = ptlrpcd_steal_rqset(set, ps);
+					if (rc > 0)
+						CDEBUG(D_RPCTRACE, "transfer %d"
+						       " async RPCs [%d->%d]\n",
+							rc, partner->pc_index,
+							pc->pc_index);
+				}
+				ptlrpc_reqset_put(ps);
+			} while (rc == 0 && pc->pc_cursor != first);
+		}
+	}
+
+	RETURN(rc);
+}
+
+/**
+ * Main ptlrpcd thread.
+ * ptlrpc's code paths like to execute in process context, so we have this
+ * thread which spins on a set which contains the rpcs and sends them.
+ *
+ * \param arg	the struct ptlrpcd_ctl describing this thread
+ *
+ * Completes pc_starting once the lu context has been set up (or failed to
+ * be) and pc_finishing just before a normal exit.
+ */
+static int ptlrpcd(void *arg)
+{
+	struct ptlrpcd_ctl *pc = arg;
+	struct ptlrpc_request_set *set = pc->pc_set;
+	struct lu_env env = { .le_ses = NULL };
+	int rc, exit = 0;
+	ENTRY;
+
+	unshare_fs_struct();
+#if defined(CONFIG_SMP)
+	if (test_bit(LIOD_BIND, &pc->pc_flags)) {
+		int index = pc->pc_index;
+
+		if (index >= 0 && index < num_possible_cpus()) {
+			/* Advance (with wrap-around) to an online CPU, then
+			 * allow this thread on that CPU's whole node. */
+			while (!cpu_online(index)) {
+				if (++index >= num_possible_cpus())
+					index = 0;
+			}
+			set_cpus_allowed_ptr(current,
+					cpumask_of_node(cpu_to_node(index)));
+		}
+	}
+#endif
+	/*
+	 * XXX So far only "client" ptlrpcd uses an environment. In
+	 * the future, ptlrpcd thread (or a thread-set) has to given
+	 * an argument, describing its "scope".
+	 */
+	rc = lu_context_init(&env.le_ctx,
+			     LCT_CL_THREAD|LCT_REMEMBER|LCT_NOREF);
+	complete(&pc->pc_starting);
+
+	/* NOTE(review): this early return completes pc_starting but never
+	 * pc_finishing - confirm that whoever stops the thread copes. */
+	if (rc != 0)
+		RETURN(rc);
+
+	/*
+	 * This mainloop strongly resembles ptlrpc_set_wait() except that our
+	 * set never completes.  ptlrpcd_check() calls ptlrpc_check_set() when
+	 * there are requests in the set. New requests come in on the set's
+	 * new_req_list and ptlrpcd_check() moves them into the set.
+	 */
+	do {
+		struct l_wait_info lwi;
+		int timeout;
+
+		/* A zero timeout from the set is replaced by one second. */
+		timeout = ptlrpc_set_next_timeout(set);
+		lwi = LWI_TIMEOUT(cfs_time_seconds(timeout ? timeout : 1),
+				  ptlrpc_expired_set, set);
+
+		lu_context_enter(&env.le_ctx);
+		l_wait_event(set->set_waitq,
+			     ptlrpcd_check(&env, pc), &lwi);
+		lu_context_exit(&env.le_ctx);
+
+		/*
+		 * Abort inflight rpcs for forced stop case.
+		 */
+		if (test_bit(LIOD_STOP, &pc->pc_flags)) {
+			if (test_bit(LIOD_FORCE, &pc->pc_flags))
+				ptlrpc_abort_set(set);
+			exit++;
+		}
+
+		/*
+		 * Let's make one more loop to make sure that ptlrpcd_check()
+		 * copied all raced new rpcs into the set so we can kill them.
+		 */
+	} while (exit < 2);
+
+	/*
+	 * Wait for inflight requests to drain.
+	 */
+	if (!list_empty(&set->set_requests))
+		ptlrpc_set_wait(set);
+	lu_context_fini(&env.le_ctx);
+
+	complete(&pc->pc_finishing);
+
+	return 0;
+}
+
+/* XXX: We want multiple CPU cores to share the async RPC load. So we start many
+ *      ptlrpcd threads. We also want to reduce the ptlrpcd overhead caused by
+ *      data transfer cross-CPU cores. So we bind ptlrpcd thread to specified
+ *      CPU core. But binding all ptlrpcd threads may cause response delay
+ *      because of some CPU core(s) busy with other loads.
+ *
+ *      For example: "ls -l", some async RPCs for statahead are assigned to
+ *      ptlrpcd_0, and ptlrpcd_0 is bound to CPU_0, but CPU_0 may be quite busy
+ *      with other non-ptlrpcd work, like "ls -l" itself (we want the "ls -l"
+ *      thread, statahead thread, and ptlrpcd thread to run in parallel). In
+ *      that case the statahead async RPCs cannot be processed in time, which is
+ *      unexpected. If ptlrpcd_0 can be re-scheduled on other CPU core, it may
+ *      be better. But it breaks former data transfer policy.
+ *
+ *      So we shouldn't be blind for avoiding the data transfer. We make some
+ *      compromise: divide the ptlrpcd threads pool into two parts. One part is
+ *      for bound mode, each ptlrpcd thread in this part is bound to some CPU
+ *      core. The other part is for free mode, all the ptlrpcd threads in the
+ *      part can be scheduled on any CPU core. We specify some partnership
+ *      between bound mode ptlrpcd thread(s) and free mode ptlrpcd thread(s),
+ *      and the async RPC load within the partners are shared.
+ *
+ *      It can partly avoid data transfer cross-CPU (if the bound mode ptlrpcd
+ *      thread can be scheduled in time), and try to guarantee the async RPC
+ *      processed ASAP (as long as the free mode ptlrpcd thread can be scheduled
+ *      on any CPU core).
+ *
+ *      As for how to specify the partnership between bound mode ptlrpcd
+ *      thread(s) and free mode ptlrpcd thread(s), the simplest way is to use
+ *      <free bound> pair. In future, we can specify some more complex
+ *      partnership based on the patches for CPU partition. But before such
+ *      patches are available, we prefer to use the simplest one.
+ */
+# ifdef CFS_CPU_MODE_NUMA
+# warning "fix ptlrpcd_bind() to use new CPU partition APIs"
+# endif
+/**
+ * Set up the binding flag and the partner table of
+ * ptlrpcds->pd_threads[\a index] according to ptlrpcd_bind_policy.
+ *
+ * \param index	slot of the thread in ptlrpcds->pd_threads; must be < \a max
+ * \param max	number of regular ptlrpcd threads
+ *
+ * \retval 0		policy applied
+ * \retval -EINVAL	ptlrpcd_bind_policy holds an unknown value
+ * \retval -ENOMEM	the partner array could not be allocated
+ */
+static int ptlrpcd_bind(int index, int max)
+{
+	struct ptlrpcd_ctl *pc;
+	int rc = 0;
+#if defined(CONFIG_NUMA)
+	cpumask_t mask;
+#endif
+	ENTRY;
+
+	LASSERT(index <= max - 1);
+	pc = &ptlrpcds->pd_threads[index];
+	/* First pass: decide how many partner slots this thread needs. */
+	switch (ptlrpcd_bind_policy) {
+	case PDB_POLICY_NONE:
+		/* no partners and LIOD_BIND is left untouched */
+		pc->pc_npartners = -1;
+		break;
+	case PDB_POLICY_FULL:
+		/* every thread is bound and has no partner */
+		pc->pc_npartners = 0;
+		set_bit(LIOD_BIND, &pc->pc_flags);
+		break;
+	case PDB_POLICY_PAIR:
+		/* one partner each; LIOD_BIND is decided in the second pass */
+		LASSERT(max % 2 == 0);
+		pc->pc_npartners = 1;
+		break;
+	case PDB_POLICY_NEIGHBOR:
+#if defined(CONFIG_NUMA)
+	{
+		int i;
+		/* CPUs of the NUMA node of CPU "index", with CPU numbers
+		 * from max up to num_online_cpus() cleared */
+		mask = *cpumask_of_node(cpu_to_node(index));
+		for (i = max; i < num_online_cpus(); i++)
+			cpu_clear(i, mask);
+		pc->pc_npartners = cpus_weight(mask) - 1;
+		set_bit(LIOD_BIND, &pc->pc_flags);
+	}
+#else
+		/* two partners: the previous and the next thread slot */
+		LASSERT(max >= 3);
+		pc->pc_npartners = 2;
+#endif
+		break;
+	default:
+		CERROR("unknown ptlrpcd bind policy %d\n", ptlrpcd_bind_policy);
+		rc = -EINVAL;
+	}
+
+	/* Second pass: allocate the partner array and link this thread with
+	 * the threads that were set up before it. */
+	if (rc == 0 && pc->pc_npartners > 0) {
+		OBD_ALLOC(pc->pc_partners,
+			  sizeof(struct ptlrpcd_ctl *) * pc->pc_npartners);
+		if (pc->pc_partners == NULL) {
+			pc->pc_npartners = 0;
+			rc = -ENOMEM;
+		} else {
+			switch (ptlrpcd_bind_policy) {
+			case PDB_POLICY_PAIR:
+				/* odd slots are bound and pair up with the
+				 * even slot right before them */
+				if (index & 0x1) {
+					set_bit(LIOD_BIND, &pc->pc_flags);
+					pc->pc_partners[0] = &ptlrpcds->
+						pd_threads[index - 1];
+					ptlrpcds->pd_threads[index - 1].
+						pc_partners[0] = pc;
+				}
+				break;
+			case PDB_POLICY_NEIGHBOR:
+#if defined(CONFIG_NUMA)
+			{
+				struct ptlrpcd_ctl *ppc;
+				int i, pidx;
+				/* partners are cores in the same NUMA node.
+				 * setup partnership only with ptlrpcd threads
+				 * that are already initialized
+				 */
+				for (pidx = 0, i = 0; i < index; i++) {
+					if (cpu_isset(i, mask)) {
+						ppc = &ptlrpcds->pd_threads[i];
+						pc->pc_partners[pidx++] = ppc;
+						/* NOTE(review): appends past
+						 * ppc's current pc_npartners;
+						 * confirm ppc->pc_partners is
+						 * allocated and large enough */
+						ppc->pc_partners[ppc->
+							  pc_npartners++] = pc;
+					}
+				}
+				/* adjust number of partners to the number
+				 * of partnership really setup */
+				pc->pc_npartners = pidx;
+			}
+#else
+				/* odd slots are bound; slot 0 is linked only
+				 * when the last slot (max - 1) is set up */
+				if (index & 0x1)
+					set_bit(LIOD_BIND, &pc->pc_flags);
+				if (index > 0) {
+					pc->pc_partners[0] = &ptlrpcds->
+						pd_threads[index - 1];
+					ptlrpcds->pd_threads[index - 1].
+						pc_partners[1] = pc;
+					if (index == max - 1) {
+						pc->pc_partners[1] =
+						&ptlrpcds->pd_threads[0];
+						ptlrpcds->pd_threads[0].
+						pc_partners[0] = pc;
+					}
+				}
+#endif
+				break;
+			}
+		}
+	}
+
+	RETURN(rc);
+}
+
+
+/**
+ * Initialize \a pc and start the ptlrpcd thread that serves it.
+ *
+ * \param index	slot of \a pc in ptlrpcds->pd_threads; a negative value
+ *		skips ptlrpcd_bind()
+ * \param max	number of regular ptlrpcd threads, handed to ptlrpcd_bind()
+ * \param name	thread name; also copied (truncated if needed) to pc_name
+ * \param pc	control structure of the thread to start
+ *
+ * \retval 0	thread started, or LIOD_START was already set for \a pc
+ * \retval -ev	failure; the request set and the context created here have
+ *		been released and LIOD_START/LIOD_BIND cleared
+ */
+int ptlrpcd_start(int index, int max, const char *name, struct ptlrpcd_ctl *pc)
+{
+	int rc;
+	int env = 0;
+	ENTRY;
+
+	/*
+	 * Do not allow start second thread for one pc.
+	 */
+	if (test_and_set_bit(LIOD_START, &pc->pc_flags)) {
+		CWARN("Starting second thread (%s) for same pc %p\n",
+		      name, pc);
+		RETURN(0);
+	}
+
+	pc->pc_index = index;
+	init_completion(&pc->pc_starting);
+	init_completion(&pc->pc_finishing);
+	spin_lock_init(&pc->pc_lock);
+	/* strlcpy() always NUL-terminates, so the stored name is valid even
+	 * if pc_name was not zeroed beforehand. */
+	strlcpy(pc->pc_name, name, sizeof(pc->pc_name));
+	pc->pc_set = ptlrpc_prep_set();
+	if (pc->pc_set == NULL)
+		GOTO(out, rc = -ENOMEM);
+	/*
+	 * So far only "client" ptlrpcd uses an environment. In the future,
+	 * ptlrpcd thread (or a thread-set) has to be given an argument,
+	 * describing its "scope".
+	 */
+	rc = lu_context_init(&pc->pc_env.le_ctx, LCT_CL_THREAD|LCT_REMEMBER);
+	if (rc != 0)
+		GOTO(out, rc);
+
+	/* from here on the error path must finalize pc_env */
+	env = 1;
+	{
+		task_t *task;
+		if (index >= 0) {
+			rc = ptlrpcd_bind(index, max);
+			if (rc < 0)
+				GOTO(out, rc);
+		}
+
+		task = kthread_run(ptlrpcd, pc, pc->pc_name);
+		if (IS_ERR(task))
+			GOTO(out, rc = PTR_ERR(task));
+
+		rc = 0;
+		/* the new thread completes pc_starting once its own
+		 * lu_context_init() call has returned */
+		wait_for_completion(&pc->pc_starting);
+	}
+out:
+	if (rc) {
+		if (pc->pc_set != NULL) {
+			struct ptlrpc_request_set *set = pc->pc_set;
+
+			/* detach the set under pc_lock before destroying it */
+			spin_lock(&pc->pc_lock);
+			pc->pc_set = NULL;
+			spin_unlock(&pc->pc_lock);
+			ptlrpc_set_destroy(set);
+		}
+		if (env != 0)
+			lu_context_fini(&pc->pc_env.le_ctx);
+		clear_bit(LIOD_BIND, &pc->pc_flags);
+		clear_bit(LIOD_START, &pc->pc_flags);
+	}
+	RETURN(rc);
+}
+
+/**
+ * Request the ptlrpcd thread behind \a pc to stop and wake it up.
+ * Only a warning is printed when no thread was started for \a pc.
+ *
+ * \param pc	control structure of the thread
+ * \param force	non-zero to also set LIOD_FORCE
+ */
+void ptlrpcd_stop(struct ptlrpcd_ctl *pc, int force)
+{
+	ENTRY;
+
+	if (test_bit(LIOD_START, &pc->pc_flags)) {
+		set_bit(LIOD_STOP, &pc->pc_flags);
+		if (force)
+			set_bit(LIOD_FORCE, &pc->pc_flags);
+		/* kick the thread out of its wait on the request set */
+		wake_up(&pc->pc_set->set_waitq);
+	} else {
+		CWARN("Thread for pc %p was not started\n", pc);
+	}
+
+	EXIT;
+}
+
+/**
+ * Wait for the ptlrpcd thread behind \a pc to finish and release what
+ * ptlrpcd_start() set up for it. The partner array is released even when
+ * the thread was never started.
+ *
+ * \param pc	control structure to tear down
+ */
+void ptlrpcd_free(struct ptlrpcd_ctl *pc)
+{
+	struct ptlrpc_request_set *rqset = pc->pc_set;
+	ENTRY;
+
+	if (test_bit(LIOD_START, &pc->pc_flags)) {
+		wait_for_completion(&pc->pc_finishing);
+		lu_context_fini(&pc->pc_env.le_ctx);
+
+		/* detach the set under pc_lock, then destroy it unlocked */
+		spin_lock(&pc->pc_lock);
+		pc->pc_set = NULL;
+		spin_unlock(&pc->pc_lock);
+		ptlrpc_set_destroy(rqset);
+
+		clear_bit(LIOD_START, &pc->pc_flags);
+		clear_bit(LIOD_STOP, &pc->pc_flags);
+		clear_bit(LIOD_FORCE, &pc->pc_flags);
+		clear_bit(LIOD_BIND, &pc->pc_flags);
+	} else {
+		CWARN("Thread for pc %p was not started\n", pc);
+	}
+
+	if (pc->pc_npartners > 0) {
+		LASSERT(pc->pc_partners != NULL);
+
+		OBD_FREE(pc->pc_partners,
+			 sizeof(struct ptlrpcd_ctl *) * pc->pc_npartners);
+		pc->pc_partners = NULL;
+	}
+	pc->pc_npartners = 0;
+	EXIT;
+}
+
+/**
+ * Stop and free every regular ptlrpcd thread and the recovery thread, then
+ * release the global ptlrpcds descriptor. Does nothing when ptlrpcds is
+ * NULL.
+ */
+static void ptlrpcd_fini(void)
+{
+	int idx;
+	ENTRY;
+
+	if (ptlrpcds == NULL) {
+		EXIT;
+		return;
+	}
+
+	/* ask all regular threads to stop before waiting for any of them */
+	for (idx = 0; idx < ptlrpcds->pd_nthreads; idx++)
+		ptlrpcd_stop(&ptlrpcds->pd_threads[idx], 0);
+	for (idx = 0; idx < ptlrpcds->pd_nthreads; idx++)
+		ptlrpcd_free(&ptlrpcds->pd_threads[idx]);
+
+	ptlrpcd_stop(&ptlrpcds->pd_thread_rcv, 0);
+	ptlrpcd_free(&ptlrpcds->pd_thread_rcv);
+
+	OBD_FREE(ptlrpcds, ptlrpcds->pd_size);
+	ptlrpcds = NULL;
+
+	EXIT;
+}
+
+/**
+ * Allocate the global ptlrpcds descriptor and start the recovery thread
+ * plus the regular ptlrpcd threads.
+ *
+ * The thread count starts at num_online_cpus(), is capped by max_ptlrpcds
+ * when that is positive, and is never below 2. ptlrpcd_bind_policy is
+ * downgraded from NEIGHBOR to PAIR when fewer than 3 threads are used, and
+ * the count is rounded down to an even number for the PAIR policy.
+ *
+ * \retval 0		all threads started
+ * \retval -ENOMEM	the descriptor could not be allocated
+ * \retval -ev		error from ptlrpcd_start(); everything started so far
+ *			has been stopped and freed and ptlrpcds is NULL again
+ */
+static int ptlrpcd_init(void)
+{
+	int nthreads = num_online_cpus();
+	char name[16];
+	int size, i = -1, j, rc = 0;
+	ENTRY;
+
+	if (max_ptlrpcds > 0 && max_ptlrpcds < nthreads)
+		nthreads = max_ptlrpcds;
+	if (nthreads < 2)
+		nthreads = 2;
+	if (nthreads < 3 && ptlrpcd_bind_policy == PDB_POLICY_NEIGHBOR)
+		ptlrpcd_bind_policy = PDB_POLICY_PAIR;
+	else if (nthreads % 2 != 0 && ptlrpcd_bind_policy == PDB_POLICY_PAIR)
+		nthreads &= ~1; /* make sure it is even */
+
+	size = offsetof(struct ptlrpcd, pd_threads[nthreads]);
+	OBD_ALLOC(ptlrpcds, size);
+	if (ptlrpcds == NULL)
+		GOTO(out, rc = -ENOMEM);
+
+	snprintf(name, sizeof(name), "ptlrpcd_rcv");
+	set_bit(LIOD_RECOVERY, &ptlrpcds->pd_thread_rcv.pc_flags);
+	rc = ptlrpcd_start(-1, nthreads, name, &ptlrpcds->pd_thread_rcv);
+	if (rc < 0)
+		GOTO(out, rc);
+
+	/* XXX: We start nthreads ptlrpc daemons. Each of them can process any
+	 *      non-recovery async RPC to improve overall async RPC efficiency.
+	 *
+	 *      But there are some issues with async I/O RPCs and async non-I/O
+	 *      RPCs processed in the same set under some cases. The ptlrpcd may
+	 *      be blocked by some async I/O RPC(s), then will cause other async
+	 *      non-I/O RPC(s) can not be processed in time.
+	 *
+	 *      Maybe we should distinguish blocked async RPCs from non-blocked
+	 *      async RPCs, and process them in different ptlrpcd sets to avoid
+	 *      unnecessary dependency. But how to distribute async RPCs load
+	 *      among all the ptlrpc daemons becomes another trouble. */
+	for (i = 0; i < nthreads; i++) {
+		snprintf(name, sizeof(name), "ptlrpcd_%d", i);
+		rc = ptlrpcd_start(i, nthreads, name, &ptlrpcds->pd_threads[i]);
+		if (rc < 0)
+			GOTO(out, rc);
+	}
+
+	ptlrpcds->pd_size = size;
+	ptlrpcds->pd_index = 0;
+	ptlrpcds->pd_nthreads = nthreads;
+
+out:
+	if (rc != 0 && ptlrpcds != NULL) {
+		/* i is the slot whose start failed (-1 when the recovery
+		 * thread failed); slot i is included so that a partner array
+		 * allocated by ptlrpcd_bind() for it gets released too. */
+		for (j = 0; j <= i; j++)
+			ptlrpcd_stop(&ptlrpcds->pd_threads[j], 0);
+		for (j = 0; j <= i; j++)
+			ptlrpcd_free(&ptlrpcds->pd_threads[j]);
+		ptlrpcd_stop(&ptlrpcds->pd_thread_rcv, 0);
+		ptlrpcd_free(&ptlrpcds->pd_thread_rcv);
+		OBD_FREE(ptlrpcds, size);
+		ptlrpcds = NULL;
+	}
+
+	/* report the failure instead of unconditionally claiming success */
+	RETURN(rc);
+}
+
+/**
+ * Take a reference on the ptlrpcd service; the first reference starts the
+ * threads through ptlrpcd_init().
+ *
+ * \retval 0	reference taken
+ * \retval -ev	ptlrpcd_init() failed; no reference is held, so the caller
+ *		must not call ptlrpcd_decref() for this attempt
+ */
+int ptlrpcd_addref(void)
+{
+	int rc = 0;
+	ENTRY;
+
+	mutex_lock(&ptlrpcd_mutex);
+	if (++ptlrpcd_users == 1) {
+		rc = ptlrpcd_init();
+		/* Drop the count again on failure; otherwise the next caller
+		 * would see a non-zero count and skip initialization. */
+		if (rc < 0)
+			ptlrpcd_users--;
+	}
+	mutex_unlock(&ptlrpcd_mutex);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(ptlrpcd_addref);
+
+/**
+ * Drop a reference on the ptlrpcd service; dropping the last one shuts the
+ * threads down through ptlrpcd_fini().
+ */
+void ptlrpcd_decref(void)
+{
+	mutex_lock(&ptlrpcd_mutex);
+	ptlrpcd_users--;
+	if (ptlrpcd_users == 0)
+		ptlrpcd_fini();
+	mutex_unlock(&ptlrpcd_mutex);
+}
+EXPORT_SYMBOL(ptlrpcd_decref);
+/** @} ptlrpcd */

diff --git a/drivers/staging/lustre/lustre/ptlrpc/recover.c b/drivers/staging/lustre/lustre/ptlrpc/recover.c
new file mode 100644
index 0000000..2960889
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/recover.c

@@ -0,0 +1,357 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/recover.c
+ *
+ * Author: Mike Shaver <shaver@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+# include <linux/libcfs/libcfs.h>
+
+#include <obd_support.h>
+#include <lustre_ha.h>
+#include <lustre_net.h>
+#include <lustre_import.h>
+#include <lustre_export.h>
+#include <obd.h>
+#include <obd_ost.h>
+#include <obd_class.h>
+#include <obd_lov.h> /* for IOC_LOV_SET_OSC_ACTIVE */
+#include <linux/list.h>
+
+#include "ptlrpc_internal.h"
+
+/**
+ * Start recovery on disconnected import.
+ * This is done by just attempting a connect; the value returned by
+ * ptlrpc_connect_import() is not checked here.
+ *
+ * \param imp	import to recover
+ */
+void ptlrpc_initiate_recovery(struct obd_import *imp)
+{
+	ENTRY;
+
+	CDEBUG(D_HA, "%s: starting recovery\n", obd2cli_tgt(imp->imp_obd));
+	ptlrpc_connect_import(imp);
+
+	EXIT;
+}
+
+/**
+ * Pick the next request on the import's replay list that still has to be
+ * replayed (the first one whose transno is above the last replayed transno)
+ * and hand it to ptlrpc_replay_req().
+ *
+ * \param imp		import being recovered
+ * \param inflight	set to 1 when a replay was sent, to 0 otherwise
+ *
+ * \retval 0	nothing was left to replay, or the replay was sent
+ * \retval rc	non-zero value returned by ptlrpc_replay_req()
+ */
+int ptlrpc_replay_next(struct obd_import *imp, int *inflight)
+{
+	struct ptlrpc_request *cur, *nxt;
+	struct ptlrpc_request *chosen = NULL;
+	__u64 replayed;
+	int rc;
+	ENTRY;
+
+	*inflight = 0;
+
+	/* Some requests may have been committed since we last spoke to the
+	 * server, so get rid of them first.
+	 */
+	spin_lock(&imp->imp_lock);
+	imp->imp_last_transno_checked = 0;
+	ptlrpc_free_committed(imp);
+	replayed = imp->imp_last_replay_transno;
+	spin_unlock(&imp->imp_lock);
+
+	CDEBUG(D_HA, "import %p from %s committed "LPU64" last "LPU64"\n",
+	       imp, obd2cli_tgt(imp->imp_obd),
+	       imp->imp_peer_committed_transno, replayed);
+
+	/* The list is walked without imp_lock. Nothing should be added to it
+	 * while we are in recovery, since no additional requests are being
+	 * processed. ptlrpc_free_committed() may commit requests, but nothing
+	 * "newer" than the one being replayed (it cannot be committed until
+	 * it has been replayed, which happens here). The _safe iterator
+	 * covers the unlikely race of the current request being committed.
+	 *
+	 * However, the {mdc,osc}_replay_open callbacks both iterate request
+	 * lists and carry comments saying they assume imp_lock is held by
+	 * ptlrpc_replay, which it is not. It's just a little race...
+	 */
+	list_for_each_entry_safe(cur, nxt, &imp->imp_replay_list,
+				 rq_replay_list) {
+		/* skip everything up to and including the last replayed
+		 * transno */
+		if (cur->rq_transno <= replayed)
+			continue;
+
+		/* If a reconnect has occurred, the request is flagged as a
+		 * resend before it goes out again. */
+		if (imp->imp_resend_replay)
+			lustre_msg_add_flags(cur->rq_reqmsg, MSG_RESENT);
+		chosen = cur;
+		break;
+	}
+
+	spin_lock(&imp->imp_lock);
+	imp->imp_resend_replay = 0;
+	spin_unlock(&imp->imp_lock);
+
+	if (chosen == NULL)
+		RETURN(0);
+
+	rc = ptlrpc_replay_req(chosen);
+	if (rc) {
+		CERROR("recovery replay error %d for req "
+		       LPU64"\n", rc, chosen->rq_xid);
+		RETURN(rc);
+	}
+	*inflight = 1;
+	RETURN(rc);
+}
+
+/**
+ * Schedule resending of the requests on the import's sending list. This is
+ * done after replaying of requests and locks has completed.
+ *
+ * \param imp	import whose sending list is processed
+ *
+ * \retval 0	the list was processed
+ * \retval -1	the import is not in LUSTRE_IMP_RECOVER state
+ */
+int ptlrpc_resend(struct obd_import *imp)
+{
+	struct ptlrpc_request *cur, *nxt;
+	int rc = -1;
+
+	ENTRY;
+
+	/* While in recovery nothing should be added to the sending list, so
+	 * in principle the lock would not be needed for this iteration and
+	 * the resend process.
+	 */
+	/* Well... what if lctl recover is called twice at the same time?
+	 */
+	spin_lock(&imp->imp_lock);
+	if (imp->imp_state == LUSTRE_IMP_RECOVER) {
+		list_for_each_entry_safe(cur, nxt, &imp->imp_sending_list,
+					 rq_list) {
+			/* catch obviously bad or already freed requests */
+			LASSERTF((long)cur > PAGE_CACHE_SIZE &&
+				 cur != LP_POISON,
+				 "req %p bad\n", cur);
+			LASSERTF(cur->rq_type != LI_POISON,
+				 "req %p freed\n", cur);
+			if (ptlrpc_no_resend(cur))
+				continue;
+			ptlrpc_resend_req(cur);
+		}
+		rc = 0;
+	}
+	spin_unlock(&imp->imp_lock);
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(ptlrpc_resend);
+
+/**
+ * Walk the import's delayed list under imp_lock and wake each request on
+ * it through ptlrpc_client_wake_req().
+ *
+ * \param imp	import whose delayed requests are woken
+ */
+void ptlrpc_wake_delayed(struct obd_import *imp)
+{
+	struct ptlrpc_request *cur, *nxt;
+
+	spin_lock(&imp->imp_lock);
+	list_for_each_entry_safe(cur, nxt, &imp->imp_delayed_list, rq_list) {
+		DEBUG_REQ(D_HA, cur, "waking (set %p):", cur->rq_set);
+		ptlrpc_client_wake_req(cur);
+	}
+	spin_unlock(&imp->imp_lock);
+}
+EXPORT_SYMBOL(ptlrpc_wake_delayed);
+
+/**
+ * React to \a failed_req having failed because its import got disconnected:
+ * mark the import disconnected, start a reconnect unless the import has
+ * been deactivated, and flag the request for resend.
+ *
+ * \param failed_req	request that hit the disconnected import
+ */
+void ptlrpc_request_handle_notconn(struct ptlrpc_request *failed_req)
+{
+	struct obd_import *imp = failed_req->rq_import;
+	int discon;
+	ENTRY;
+
+	CDEBUG(D_HA, "import %s of %s@%s abruptly disconnected: reconnecting\n",
+	       imp->imp_obd->obd_name, obd2cli_tgt(imp->imp_obd),
+	       imp->imp_connection->c_remote_uuid.uuid);
+
+	discon = ptlrpc_set_import_discon(imp,
+			lustre_msg_get_conn_cnt(failed_req->rq_reqmsg));
+	if (discon) {
+		if (!imp->imp_replayable) {
+			CDEBUG(D_HA, "import %s@%s for %s not replayable, "
+			       "auto-deactivating\n",
+			       obd2cli_tgt(imp->imp_obd),
+			       imp->imp_connection->c_remote_uuid.uuid,
+			       imp->imp_obd->obd_name);
+			ptlrpc_deactivate_import(imp);
+		}
+		/* recovery can be switched off and on through
+		 * lctl {disable|enable}_recovery */
+		if (!imp->imp_deactive)
+			ptlrpc_connect_import(imp);
+	}
+
+	/* The request waits for recovery to complete and is then resent.
+	 * If we were evicted it will be errored out later. */
+	spin_lock(&failed_req->rq_lock);
+	if (!failed_req->rq_no_resend)
+		failed_req->rq_resend = 1;
+	spin_unlock(&failed_req->rq_lock);
+
+	EXIT;
+}
+
+/**
+ * Administratively activate or deactivate a client import.
+ * This should only be called by the ioctl interface, currently
+ *  - the lctl deactivate and activate commands
+ *  - echo 0/1 >> /proc/osc/XXX/active
+ *  - client umount -f (ll_umount_begin)
+ *
+ * \param imp		import to change
+ * \param active	non-zero to activate, zero to deactivate
+ *
+ * \retval 0	when deactivating
+ * \retval rc	result of ptlrpc_recover_import() when activating
+ */
+int ptlrpc_set_import_active(struct obd_import *imp, int active)
+{
+	struct obd_device *obd = imp->imp_obd;
+	int rc = 0;
+
+	ENTRY;
+	LASSERT(obd);
+
+	if (!active) {
+		/* Deactivation: mark the import invalid and abort the
+		 * in-flight requests. */
+		LCONSOLE_WARN("setting import %s INACTIVE by administrator "
+			      "request\n", obd2cli_tgt(imp->imp_obd));
+
+		/* imp_deactive goes first so that ptlrpc_import_delay_req
+		 * does not complain about imp_inval set without
+		 * imp_deactive */
+		spin_lock(&imp->imp_lock);
+		imp->imp_deactive = 1;
+		spin_unlock(&imp->imp_lock);
+
+		obd_import_event(imp->imp_obd, imp, IMP_EVENT_DEACTIVATE);
+
+		ptlrpc_invalidate_import(imp);
+	} else {
+		/* Activation: mark the import valid and attempt recovery. */
+		CDEBUG(D_HA, "setting import %s VALID\n",
+		       obd2cli_tgt(imp->imp_obd));
+
+		spin_lock(&imp->imp_lock);
+		imp->imp_deactive = 0;
+		spin_unlock(&imp->imp_lock);
+		obd_import_event(imp->imp_obd, imp, IMP_EVENT_ACTIVATE);
+
+		rc = ptlrpc_recover_import(imp, NULL, 0);
+	}
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(ptlrpc_set_import_active);
+
+/**
+ * Attempt to reconnect an import.
+ *
+ * \param imp		import to reconnect
+ * \param new_uuid	when not NULL, connection uuid to prioritize before
+ *			reconnecting
+ * \param async		zero to wait (up to obd_timeout seconds) until the
+ *			import is no longer in recovery; non-zero to return
+ *			right after the connect was issued
+ *
+ * \retval 0		connect issued (and, for !async, the wait succeeded)
+ * \retval -EINVAL	import is new, deactivated, or being invalidated
+ * \retval -EALREADY	import is not in DISCON state; imp_force_verify was
+ *			set instead
+ * \retval rc		error from import_set_conn_priority(),
+ *			ptlrpc_connect_import() or l_wait_event()
+ */
+int ptlrpc_recover_import(struct obd_import *imp, char *new_uuid, int async)
+{
+	int rc = 0;
+	ENTRY;
+
+	spin_lock(&imp->imp_lock);
+	if (imp->imp_state == LUSTRE_IMP_NEW || imp->imp_deactive ||
+	    atomic_read(&imp->imp_inval_count))
+		rc = -EINVAL;
+	spin_unlock(&imp->imp_lock);
+	if (rc)
+		GOTO(out, rc);
+
+	/* force import to be disconnected. */
+	ptlrpc_set_import_discon(imp, 0);
+
+	if (new_uuid) {
+		struct obd_uuid uuid;
+
+		/* instruct import to use new uuid */
+		obd_str2uuid(&uuid, new_uuid);
+		rc = import_set_conn_priority(imp, &uuid);
+		if (rc)
+			GOTO(out, rc);
+	}
+
+	/* Check if reconnect is already in progress */
+	spin_lock(&imp->imp_lock);
+	if (imp->imp_state != LUSTRE_IMP_DISCON) {
+		imp->imp_force_verify = 1;
+		rc = -EALREADY;
+	}
+	spin_unlock(&imp->imp_lock);
+	if (rc)
+		GOTO(out, rc);
+
+	rc = ptlrpc_connect_import(imp);
+	if (rc)
+		GOTO(out, rc);
+
+	if (!async) {
+		struct l_wait_info lwi;
+		/* wait interval handed to LWI_TIMEOUT(); this is the
+		 * cfs_time_seconds() conversion of obd_timeout, not a count
+		 * of seconds */
+		int secs = cfs_time_seconds(obd_timeout);
+
+		/* log the timeout in seconds, as the message states */
+		CDEBUG(D_HA, "%s: recovery started, waiting %u seconds\n",
+		       obd2cli_tgt(imp->imp_obd), obd_timeout);
+
+		lwi = LWI_TIMEOUT(secs, NULL, NULL);
+		rc = l_wait_event(imp->imp_recovery_waitq,
+				  !ptlrpc_import_in_recovery(imp), &lwi);
+		CDEBUG(D_HA, "%s: recovery finished\n",
+		       obd2cli_tgt(imp->imp_obd));
+	}
+	EXIT;
+
+out:
+	return rc;
+}
+EXPORT_SYMBOL(ptlrpc_recover_import);
+
+/**
+ * Tell whether \a imp is currently in recovery, i.e. in any state other
+ * than FULL, CLOSED or DISCON. The state is sampled under imp_lock.
+ *
+ * \retval 1	import is in recovery
+ * \retval 0	import is in FULL, CLOSED or DISCON state
+ */
+int ptlrpc_import_in_recovery(struct obd_import *imp)
+{
+	int recovering;
+
+	spin_lock(&imp->imp_lock);
+	recovering = (imp->imp_state != LUSTRE_IMP_FULL &&
+		      imp->imp_state != LUSTRE_IMP_CLOSED &&
+		      imp->imp_state != LUSTRE_IMP_DISCON) ? 1 : 0;
+	spin_unlock(&imp->imp_lock);
+
+	return recovering;
+}

diff --git a/drivers/staging/lustre/lustre/ptlrpc/sec.c b/drivers/staging/lustre/lustre/ptlrpc/sec.c
new file mode 100644
index 0000000..36e8bed5
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/sec.c

@@ -0,0 +1,2465 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/sec.c
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/crypto.h>
+#include <linux/key.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre_net.h>
+#include <lustre_import.h>
+#include <lustre_dlm.h>
+#include <lustre_sec.h>
+
+#include "ptlrpc_internal.h"
+
+/***********************************************
+ * policy registers			    *
+ ***********************************************/
+
+/* Taken around the policies[] accesses made by the functions below. */
+static rwlock_t policy_lock;
+/* Registered security policies, indexed by policy number; a NULL slot means
+ * no policy is registered under that number. */
+static struct ptlrpc_sec_policy *policies[SPTLRPC_POLICY_MAX];
+
+/**
+ * Register \a policy in the policies[] table under its sp_policy number.
+ *
+ * \retval 0		registered
+ * \retval -EINVAL	policy number is out of range
+ * \retval -EALREADY	another policy already occupies that slot
+ */
+int sptlrpc_register_policy(struct ptlrpc_sec_policy *policy)
+{
+	__u16 slot = policy->sp_policy;
+	int rc = 0;
+
+	LASSERT(policy->sp_name);
+	LASSERT(policy->sp_cops);
+	LASSERT(policy->sp_sops);
+
+	if (slot >= SPTLRPC_POLICY_MAX)
+		return -EINVAL;
+
+	write_lock(&policy_lock);
+	if (unlikely(policies[slot]))
+		rc = -EALREADY;
+	else
+		policies[slot] = policy;
+	write_unlock(&policy_lock);
+
+	if (rc != 0)
+		return rc;
+
+	CDEBUG(D_SEC, "%s: registered\n", policy->sp_name);
+	return 0;
+}
+EXPORT_SYMBOL(sptlrpc_register_policy);
+
+/**
+ * Remove \a policy from the policies[] table.
+ *
+ * \retval 0		unregistered
+ * \retval -EINVAL	no policy was registered under that number
+ */
+int sptlrpc_unregister_policy(struct ptlrpc_sec_policy *policy)
+{
+	__u16 slot = policy->sp_policy;
+
+	LASSERT(slot < SPTLRPC_POLICY_MAX);
+
+	write_lock(&policy_lock);
+	if (likely(policies[slot] != NULL)) {
+		/* the slot must hold exactly the policy being removed */
+		LASSERT(policies[slot] == policy);
+		policies[slot] = NULL;
+		write_unlock(&policy_lock);
+
+		CDEBUG(D_SEC, "%s: unregistered\n", policy->sp_name);
+		return 0;
+	}
+	write_unlock(&policy_lock);
+
+	CERROR("%s: already unregistered\n", policy->sp_name);
+	return -EINVAL;
+}
+EXPORT_SYMBOL(sptlrpc_unregister_policy);
+
+/**
+ * Look up the registered policy for wire flavor \a flavor and take a module
+ * reference on its owner. For the GSS policy number, one attempt is made to
+ * load the "ptlrpc_gss" module when nothing is registered yet.
+ *
+ * \retval policy	registered policy, with a module reference held
+ * \retval NULL		policy number out of range, nothing registered, or
+ *			the module reference could not be taken
+ */
+static
+struct ptlrpc_sec_policy * sptlrpc_wireflavor2policy(__u32 flavor)
+{
+	static DEFINE_MUTEX(load_mutex);
+	static atomic_t       loaded = ATOMIC_INIT(0);
+	struct ptlrpc_sec_policy *found;
+	__u16		     slot = SPTLRPC_FLVR_POLICY(flavor);
+	__u16		     tried = 0;
+
+	if (slot >= SPTLRPC_POLICY_MAX)
+		return NULL;
+
+	for (;;) {
+		read_lock(&policy_lock);
+		found = policies[slot];
+		if (found && !try_module_get(found->sp_owner))
+			found = NULL;
+		if (found == NULL)
+			tried = atomic_read(&loaded);
+		read_unlock(&policy_lock);
+
+		/* done unless this is GSS and no load was attempted yet */
+		if (found != NULL || tried != 0 ||
+		    slot != SPTLRPC_POLICY_GSS)
+			return found;
+
+		/* try to load gss module, once */
+		mutex_lock(&load_mutex);
+		if (atomic_read(&loaded) == 0) {
+			if (request_module("ptlrpc_gss") == 0)
+				CDEBUG(D_SEC,
+				       "module ptlrpc_gss loaded on demand\n");
+			else
+				CERROR("Unable to load module ptlrpc_gss\n");
+
+			atomic_set(&loaded, 1);
+		}
+		mutex_unlock(&load_mutex);
+	}
+}
+
+/**
+ * Translate a base flavor name ("null", "plain", "krb5n", "krb5a", "krb5i",
+ * "krb5p") into its SPTLRPC_FLVR_* value.
+ *
+ * \retval SPTLRPC_FLVR_INVALID	\a name matches none of the known names
+ */
+__u32 sptlrpc_name2flavor_base(const char *name)
+{
+	__u32 flvr;
+
+	if (strcmp(name, "null") == 0)
+		flvr = SPTLRPC_FLVR_NULL;
+	else if (strcmp(name, "plain") == 0)
+		flvr = SPTLRPC_FLVR_PLAIN;
+	else if (strcmp(name, "krb5n") == 0)
+		flvr = SPTLRPC_FLVR_KRB5N;
+	else if (strcmp(name, "krb5a") == 0)
+		flvr = SPTLRPC_FLVR_KRB5A;
+	else if (strcmp(name, "krb5i") == 0)
+		flvr = SPTLRPC_FLVR_KRB5I;
+	else if (strcmp(name, "krb5p") == 0)
+		flvr = SPTLRPC_FLVR_KRB5P;
+	else
+		flvr = SPTLRPC_FLVR_INVALID;
+
+	return flvr;
+}
+EXPORT_SYMBOL(sptlrpc_name2flavor_base);
+
+/**
+ * Translate the base part of wire flavor \a flvr into its name.
+ *
+ * \retval name		"null", "plain", "krb5n", "krb5a", "krb5i" or "krb5p"
+ * \retval "invalid"	the base matches no known flavor (also logged)
+ */
+const char *sptlrpc_flavor2name_base(__u32 flvr)
+{
+	__u32   base = SPTLRPC_FLVR_BASE(flvr);
+
+	if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_NULL))
+		return "null";
+	if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_PLAIN))
+		return "plain";
+	if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_KRB5N))
+		return "krb5n";
+	if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_KRB5A))
+		return "krb5a";
+	if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_KRB5I))
+		return "krb5i";
+	if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_KRB5P))
+		return "krb5p";
+
+	CERROR("invalid wire flavor 0x%x\n", flvr);
+	return "invalid";
+}
+EXPORT_SYMBOL(sptlrpc_flavor2name_base);
+
+/**
+ * Format the bulk part of flavor \a sf into \a buf: "hash:<alg>" for the
+ * plain policy, the base flavor name for every other policy.
+ *
+ * \param sf		flavor to describe
+ * \param buf		output buffer
+ * \param bufsize	size of \a buf; the result is always NUL-terminated
+ *			at buf[bufsize - 1]
+ *
+ * \retval buf
+ */
+char *sptlrpc_flavor2name_bulk(struct sptlrpc_flavor *sf,
+			       char *buf, int bufsize)
+{
+	if (SPTLRPC_FLVR_POLICY(sf->sf_rpc) != SPTLRPC_POLICY_PLAIN) {
+		snprintf(buf, bufsize, "%s",
+			 sptlrpc_flavor2name_base(sf->sf_rpc));
+	} else {
+		snprintf(buf, bufsize, "hash:%s",
+			 sptlrpc_get_hash_name(sf->u_bulk.hash.hash_alg));
+	}
+
+	buf[bufsize - 1] = '\0';
+	return buf;
+}
+EXPORT_SYMBOL(sptlrpc_flavor2name_bulk);
+
+/**
+ * Format the full name of flavor \a sf into \a buf: the base flavor name,
+ * followed by "-<bulk spec>" for the plain policy.
+ *
+ * \param sf		flavor to describe
+ * \param buf		output buffer
+ * \param bufsize	total size of \a buf; output is truncated to fit and
+ *			always NUL-terminated
+ *
+ * \retval buf
+ */
+char *sptlrpc_flavor2name(struct sptlrpc_flavor *sf, char *buf, int bufsize)
+{
+	snprintf(buf, bufsize, "%s", sptlrpc_flavor2name_base(sf->sf_rpc));
+
+	/*
+	 * currently we don't support customized bulk specification for
+	 * flavors other than plain
+	 */
+	if (SPTLRPC_FLVR_POLICY(sf->sf_rpc) == SPTLRPC_POLICY_PLAIN) {
+		char bspec[16];
+
+		bspec[0] = '-';
+		sptlrpc_flavor2name_bulk(sf, &bspec[1], sizeof(bspec) - 1);
+		/* strlcat() is bounded by the total size of buf; strncat()
+		 * would only bound the source and could run past buf. */
+		strlcat(buf, bspec, bufsize);
+	}
+
+	buf[bufsize - 1] = '\0';
+	return buf;
+}
+EXPORT_SYMBOL(sptlrpc_flavor2name);
+
+/**
+ * Render the PTLRPC_SEC_FL_* bits set in \a flags as a list of names, each
+ * followed by a comma, into \a buf; "-," is written when none is set.
+ *
+ * \param flags		security flags to describe
+ * \param buf		output buffer
+ * \param bufsize	size of \a buf
+ *
+ * \retval buf
+ */
+char *sptlrpc_secflags2str(__u32 flags, char *buf, int bufsize)
+{
+	const struct {
+		__u32		 bit;
+		const char	*label;
+	} known[] = {
+		{ PTLRPC_SEC_FL_REVERSE,	"reverse,"	},
+		{ PTLRPC_SEC_FL_ROOTONLY,	"rootonly,"	},
+		{ PTLRPC_SEC_FL_UDESC,		"udesc,"	},
+		{ PTLRPC_SEC_FL_BULK,		"bulk,"		},
+	};
+	unsigned int i;
+
+	buf[0] = '\0';
+
+	for (i = 0; i < sizeof(known) / sizeof(known[0]); i++) {
+		if (flags & known[i].bit)
+			strlcat(buf, known[i].label, bufsize);
+	}
+	if (buf[0] == '\0')
+		strlcat(buf, "-,", bufsize);
+
+	return buf;
+}
+EXPORT_SYMBOL(sptlrpc_secflags2str);
+
+/**************************************************
+ * client context APIs			    *
+ **************************************************/
+
+/**
+ * Look up the client context of \a sec to use for the current caller.
+ *
+ * Reverse and root-only secs always use uid/gid 0; a reverse sec
+ * additionally asks the policy neither to create a context nor to remove
+ * dead ones. Every other sec uses the uid/gid of the current task.
+ *
+ * \retval ctx	whatever the policy's lookup_ctx() method returns
+ */
+static
+struct ptlrpc_cli_ctx *get_my_ctx(struct ptlrpc_sec *sec)
+{
+	struct vfs_cred cred;
+	int may_create = 1, purge_dead = 1;
+
+	LASSERT(sec);
+	LASSERT(sec->ps_policy->sp_cops->lookup_ctx);
+
+	if (!(sec->ps_flvr.sf_flags & (PTLRPC_SEC_FL_REVERSE |
+				       PTLRPC_SEC_FL_ROOTONLY))) {
+		cred.vc_uid = current_uid();
+		cred.vc_gid = current_gid();
+	} else {
+		cred.vc_uid = 0;
+		cred.vc_gid = 0;
+		if (sec->ps_flvr.sf_flags & PTLRPC_SEC_FL_REVERSE) {
+			may_create = 0;
+			purge_dead = 0;
+		}
+	}
+
+	return sec->ps_policy->sp_cops->lookup_ctx(sec, &cred,
+						   may_create, purge_dead);
+}
+
+/**
+ * Take one more reference on \a ctx by incrementing cc_refcount.
+ *
+ * \retval ctx	the same context that was passed in
+ */
+struct ptlrpc_cli_ctx *sptlrpc_cli_ctx_get(struct ptlrpc_cli_ctx *ctx)
+{
+	atomic_inc(&ctx->cc_refcount);
+	return ctx;
+}
+EXPORT_SYMBOL(sptlrpc_cli_ctx_get);
+
+/**
+ * Drop one reference on \a ctx; when it was the last one, hand the context
+ * to the policy's release_ctx() method.
+ *
+ * \param ctx	context to release; its refcount must be positive
+ * \param sync	passed through unchanged to release_ctx()
+ */
+void sptlrpc_cli_ctx_put(struct ptlrpc_cli_ctx *ctx, int sync)
+{
+	struct ptlrpc_sec *owner = ctx->cc_sec;
+
+	LASSERT(owner);
+	LASSERT_ATOMIC_POS(&ctx->cc_refcount);
+
+	if (atomic_dec_and_test(&ctx->cc_refcount))
+		owner->ps_policy->sp_cops->release_ctx(owner, ctx, sync);
+}
+EXPORT_SYMBOL(sptlrpc_cli_ctx_put);
+
+/**
+ * Expire the client context immediately by calling its die() method with a
+ * second argument of 0.
+ *
+ * \pre Caller must hold at least 1 reference on the \a ctx.
+ */
+void sptlrpc_cli_ctx_expire(struct ptlrpc_cli_ctx *ctx)
+{
+	LASSERT(ctx->cc_ops->die);
+	ctx->cc_ops->die(ctx, 0);
+}
+EXPORT_SYMBOL(sptlrpc_cli_ctx_expire);
+
+/**
+ * Wake up every request waiting on this client context. Called after some
+ * status change happened on \a ctx. Each request is taken off the context's
+ * cc_req_list before it is woken.
+ */
+void sptlrpc_cli_ctx_wakeup(struct ptlrpc_cli_ctx *ctx)
+{
+	struct ptlrpc_request *waiter;
+
+	spin_lock(&ctx->cc_lock);
+	/* every entry is unlinked, so simply drain the list from its head */
+	while (!list_empty(&ctx->cc_req_list)) {
+		waiter = list_entry(ctx->cc_req_list.next,
+				    struct ptlrpc_request, rq_ctx_chain);
+		list_del_init(&waiter->rq_ctx_chain);
+		ptlrpc_client_wake_req(waiter);
+	}
+	spin_unlock(&ctx->cc_lock);
+}
+EXPORT_SYMBOL(sptlrpc_cli_ctx_wakeup);
+
+/**
+ * Let the context's display() method describe \a ctx into \a buf.
+ *
+ * \retval 0	the context has no display() method
+ * \retval n	value returned by display()
+ */
+int sptlrpc_cli_ctx_display(struct ptlrpc_cli_ctx *ctx, char *buf, int bufsize)
+{
+	LASSERT(ctx->cc_ops);
+
+	if (ctx->cc_ops->display != NULL)
+		return ctx->cc_ops->display(ctx, buf, bufsize);
+
+	return 0;
+}
+
+/**
+ * If the import has a delayed sec adaptation whose expiry time has passed,
+ * clear the expiry and perform the adaptation now.
+ *
+ * \retval 0	nothing had expired
+ * \retval rc	result of sptlrpc_import_sec_adapt()
+ */
+static int import_sec_check_expire(struct obd_import *imp)
+{
+	int     expired = 0;
+
+	spin_lock(&imp->imp_lock);
+	if (imp->imp_sec_expire &&
+	    cfs_time_current_sec() > imp->imp_sec_expire) {
+		imp->imp_sec_expire = 0;
+		expired = 1;
+	}
+	spin_unlock(&imp->imp_lock);
+
+	if (expired) {
+		CDEBUG(D_SEC, "found delayed sec adapt expired, do it now\n");
+		return sptlrpc_import_sec_adapt(imp, NULL, 0);
+	}
+
+	return 0;
+}
+
+/**
+ * Get a referenced, usable sec for \a imp, first running any expired
+ * delayed sec adaptation.
+ *
+ * \param imp	import to get the sec from
+ * \param sec	output; on success holds a reference the caller must drop
+ *		with sptlrpc_sec_put()
+ *
+ * \retval 0		\a *sec is valid and referenced
+ * \retval -EACCES	the import has no sec, or its sec is dying
+ * \retval rc		error from import_sec_check_expire()
+ */
+static int import_sec_validate_get(struct obd_import *imp,
+				   struct ptlrpc_sec **sec)
+{
+	int     rc;
+
+	if (unlikely(imp->imp_sec_expire)) {
+		rc = import_sec_check_expire(imp);
+		if (rc)
+			return rc;
+	}
+
+	*sec = sptlrpc_import_sec_ref(imp);
+	if (*sec == NULL) {
+		CERROR("import %p (%s) with no sec\n",
+		       imp, ptlrpc_import_state_name(imp->imp_state));
+		return -EACCES;
+	}
+
+	if (unlikely((*sec)->ps_dying)) {
+		/* log the sec itself, not the address of the caller's
+		 * pointer variable */
+		CERROR("attempt to use dying sec %p\n", *sec);
+		sptlrpc_sec_put(*sec);
+		return -EACCES;
+	}
+
+	return 0;
+}
+
+/**
+ * Look up (or allocate) the client context that \a req should use and
+ * attach it to the request.
+ * \pre req->rq_cli_ctx == NULL.
+ *
+ * \retval 0 succeed, and req->rq_cli_ctx is set.
+ * \retval -ev error number, and req->rq_cli_ctx == NULL.
+ */
+int sptlrpc_req_get_ctx(struct ptlrpc_request *req)
+{
+	struct obd_import *imp = req->rq_import;
+	struct ptlrpc_sec *sec;
+	int		rc;
+	ENTRY;
+
+	LASSERT(!req->rq_cli_ctx);
+	LASSERT(imp);
+
+	rc = import_sec_validate_get(imp, &sec);
+	if (rc != 0)
+		RETURN(rc);
+
+	/* the sec reference is only needed for the duration of the lookup */
+	req->rq_cli_ctx = get_my_ctx(sec);
+	sptlrpc_sec_put(sec);
+
+	if (req->rq_cli_ctx != NULL)
+		RETURN(0);
+
+	CERROR("req %p: fail to get context\n", req);
+	RETURN(-ENOMEM);
+}
+
+/**
+ * Detach \a req from its client context and drop the request's reference.
+ * \pre req->rq_cli_ctx != NULL.
+ * \post req->rq_cli_ctx == NULL.
+ *
+ * With \a sync == 0 this function should return quickly without sleeping;
+ * otherwise it might trigger, and wait for, the whole process of sending
+ * a context-destroying rpc to the server.
+ */
+void sptlrpc_req_put_ctx(struct ptlrpc_request *req, int sync)
+{
+	struct ptlrpc_cli_ctx *ctx;
+	ENTRY;
+
+	LASSERT(req);
+	LASSERT(req->rq_cli_ctx);
+
+	ctx = req->rq_cli_ctx;
+
+	/* the request may be released early, while it is still linked on
+	 * the context's waiting list; unlink it first.
+	 */
+	if (!list_empty(&req->rq_ctx_chain)) {
+		spin_lock(&ctx->cc_lock);
+		list_del_init(&req->rq_ctx_chain);
+		spin_unlock(&ctx->cc_lock);
+	}
+
+	sptlrpc_cli_ctx_put(ctx, sync);
+	req->rq_cli_ctx = NULL;
+	EXIT;
+}
+
+/*
+ * Re-home \a req from \a oldctx to \a newctx.
+ *
+ * The clear-text request message is copied aside, the request and reply
+ * buffers are released while rq_cli_ctx temporarily points at \a oldctx,
+ * the flavor is recomputed with rq_cli_ctx pointing at \a newctx, and a new
+ * request buffer is allocated and filled with the saved message.
+ *
+ * \retval 0        success.
+ * \retval -ENOMEM  the temporary copy of the message could not be
+ *                  allocated; nothing has been changed yet.
+ * \retval non-zero error from sptlrpc_cli_alloc_reqbuf(); rq_flvr is
+ *                  restored, but rq_cli_ctx is left pointing at \a newctx
+ *                  and the old buffers have already been released.
+ */
+static
+int sptlrpc_req_ctx_switch(struct ptlrpc_request *req,
+			   struct ptlrpc_cli_ctx *oldctx,
+			   struct ptlrpc_cli_ctx *newctx)
+{
+	struct sptlrpc_flavor   old_flvr;
+	char		   *reqmsg = NULL; /* to workaround old gcc */
+	int		     reqmsg_size;
+	int		     rc = 0;
+
+	LASSERT(req->rq_reqmsg);
+	LASSERT(req->rq_reqlen);
+	LASSERT(req->rq_replen);
+
+	CDEBUG(D_SEC, "req %p: switch ctx %p(%u->%s) -> %p(%u->%s), "
+	       "switch sec %p(%s) -> %p(%s)\n", req,
+	       oldctx, oldctx->cc_vcred.vc_uid, sec2target_str(oldctx->cc_sec),
+	       newctx, newctx->cc_vcred.vc_uid, sec2target_str(newctx->cc_sec),
+	       oldctx->cc_sec, oldctx->cc_sec->ps_policy->sp_name,
+	       newctx->cc_sec, newctx->cc_sec->ps_policy->sp_name);
+
+	/* save flavor, so it can be put back if the re-allocation fails */
+	old_flvr = req->rq_flvr;
+
+	/* save request message */
+	reqmsg_size = req->rq_reqlen;
+	if (reqmsg_size != 0) {
+		OBD_ALLOC_LARGE(reqmsg, reqmsg_size);
+		if (reqmsg == NULL)
+			return -ENOMEM;
+		memcpy(reqmsg, req->rq_reqmsg, reqmsg_size);
+	}
+
+	/* release old req/rep buf; the free helpers look at rq_cli_ctx, so
+	 * point it at the old context while they run */
+	req->rq_cli_ctx = oldctx;
+	sptlrpc_cli_free_reqbuf(req);
+	sptlrpc_cli_free_repbuf(req);
+	req->rq_cli_ctx = newctx;
+
+	/* recalculate the flavor (opcode 0: existing request, new ctx) */
+	sptlrpc_req_set_flavor(req, 0);
+
+	/* alloc new request buffer
+	 * we don't need to alloc reply buffer here, leave it to the
+	 * rest procedure of ptlrpc */
+	if (reqmsg_size != 0) {
+		rc = sptlrpc_cli_alloc_reqbuf(req, reqmsg_size);
+		if (!rc) {
+			LASSERT(req->rq_reqmsg);
+			memcpy(req->rq_reqmsg, reqmsg, reqmsg_size);
+		} else {
+			CWARN("failed to alloc reqbuf: %d\n", rc);
+			req->rq_flvr = old_flvr;
+		}
+
+		/* the temporary copy is released on both outcomes */
+		OBD_FREE_LARGE(reqmsg, reqmsg_size);
+	}
+	return rc;
+}
+
+/**
+ * If the current context of \a req is dead somehow, e.g. we just switched
+ * flavor and thus marked the original contexts dead, find a new context for
+ * it. If no switch is needed, \a req ends up with the same context.
+ *
+ * \note a request must have a context, to keep other parts of code happy.
+ * In any case of failure during the switching, we must restore the old one.
+ *
+ * \retval 0   \a req now carries the context returned by
+ *             sptlrpc_req_get_ctx() (possibly the same one as before).
+ * \retval -ev error from sptlrpc_req_get_ctx() or sptlrpc_req_ctx_switch();
+ *             req->rq_cli_ctx is the original context again.
+ */
+int sptlrpc_req_replace_dead_ctx(struct ptlrpc_request *req)
+{
+	struct ptlrpc_cli_ctx *oldctx = req->rq_cli_ctx;
+	struct ptlrpc_cli_ctx *newctx;
+	int		    rc;
+	ENTRY;
+
+	LASSERT(oldctx);
+
+	/* grab oldctx before the request lets go of it, so that it can be
+	 * handed back to the request on failure; the matching put is at the
+	 * end of this function */
+	sptlrpc_cli_ctx_get(oldctx);
+	sptlrpc_req_put_ctx(req, 0);
+
+	rc = sptlrpc_req_get_ctx(req);
+	if (unlikely(rc)) {
+		LASSERT(!req->rq_cli_ctx);
+
+		/* restore old ctx */
+		req->rq_cli_ctx = oldctx;
+		RETURN(rc);
+	}
+
+	newctx = req->rq_cli_ctx;
+	LASSERT(newctx);
+
+	if (unlikely(newctx == oldctx &&
+		     test_bit(PTLRPC_CTX_DEAD_BIT, &oldctx->cc_flags))) {
+		/*
+		 * still get the old dead ctx, usually means system too busy
+		 */
+		CDEBUG(D_SEC,
+		       "ctx (%p, fl %lx) doesn't switch, relax a little bit\n",
+		       newctx, newctx->cc_flags);
+
+		/* back off for up to HZ jiffies before the caller retries */
+		schedule_timeout_and_set_state(TASK_INTERRUPTIBLE,
+						   HZ);
+	} else {
+		/*
+		 * it's possible newctx == oldctx if we're switching
+		 * subflavor with the same sec.
+		 */
+		rc = sptlrpc_req_ctx_switch(req, oldctx, newctx);
+		if (rc) {
+			/* restore old ctx: drop the new one, reattach the
+			 * old one using the reference taken above */
+			sptlrpc_req_put_ctx(req, 0);
+			req->rq_cli_ctx = oldctx;
+			RETURN(rc);
+		}
+
+		LASSERT(req->rq_cli_ctx == newctx);
+	}
+
+	/* success: release the reference taken at the top */
+	sptlrpc_cli_ctx_put(oldctx, 1);
+	RETURN(0);
+}
+EXPORT_SYMBOL(sptlrpc_req_replace_dead_ctx);
+
+/*
+ * Wait condition used while a context refresh is pending: evaluates to 1
+ * once cli_ctx_is_refreshed() reports true for \a ctx, 0 otherwise.
+ */
+static
+int ctx_check_refresh(struct ptlrpc_cli_ctx *ctx)
+{
+	return cli_ctx_is_refreshed(ctx) ? 1 : 0;
+}
+
+/*
+ * Timeout callback for the context refresh wait; \a data is the waiting
+ * ptlrpc_request. Returns whatever ptlrpc_expire_one_request() returns.
+ */
+static
+int ctx_refresh_timeout(void *data)
+{
+	struct ptlrpc_request *req = data;
+	int rc;
+
+	/* conn_cnt is needed in expire_one_request */
+	lustre_msg_set_conn_cnt(req->rq_reqmsg, req->rq_import->imp_conn_cnt);
+
+	rc = ptlrpc_expire_one_request(req, 1);
+	if (rc != 0)
+		return rc;
+
+	/* if we started recovery, we should mark this ctx dead; otherwise
+	 * in case of lgssd died nobody would retire this ctx, following
+	 * connecting will still find the same ctx thus cause deadlock.
+	 * there's an assumption that expire time of the request should be
+	 * later than the context refresh expire time.
+	 */
+	req->rq_cli_ctx->cc_ops->die(req->rq_cli_ctx, 0);
+	return 0;
+}
+
+/*
+ * Interrupt callback for the context refresh wait (see the
+ * LWI_TIMEOUT_INTR() setup in sptlrpc_req_refresh_ctx()): flags the request
+ * as interrupted. \a data is the waiting ptlrpc_request.
+ */
+static
+void ctx_refresh_interrupt(void *data)
+{
+	struct ptlrpc_request *req = data;
+
+	/* rq_intr is updated under rq_lock */
+	spin_lock(&req->rq_lock);
+	req->rq_intr = 1;
+	spin_unlock(&req->rq_lock);
+}
+
+/*
+ * Unlink \a req from the waiting list of \a ctx if it is currently linked.
+ * Both the emptiness test and the removal happen under ctx->cc_lock, so the
+ * call is harmless when the request is not on any list.
+ */
+static
+void req_off_ctx_list(struct ptlrpc_request *req, struct ptlrpc_cli_ctx *ctx)
+{
+	spin_lock(&ctx->cc_lock);
+	if (!list_empty(&req->rq_ctx_chain))
+		list_del_init(&req->rq_ctx_chain);
+	spin_unlock(&ctx->cc_lock);
+}
+
+/**
+ * To refresh the context of \a req, if it's not up-to-date.
+ * \param req     the request whose rq_cli_ctx is to be refreshed
+ * \param timeout
+ * - < 0: don't wait
+ * - = 0: wait until success or fatal error occur
+ * - > 0: timeout value (in seconds)
+ *
+ * The status of the context could be subject to be changed by other threads
+ * at any time. We allow this race, but once we return with 0, the caller will
+ * suppose it's up to date and keep using it until the owning rpc is done.
+ *
+ * Requests flagged rq_ctx_init or rq_ctx_fini are never refreshed and
+ * return 0 immediately.
+ *
+ * \retval 0 only if the context is up to date.
+ * \retval -EWOULDBLOCK \a timeout < 0 and the context is still being
+ *         refreshed; \a req has been left on the context's waiting list.
+ * \retval -ev error number.
+ */
+int sptlrpc_req_refresh_ctx(struct ptlrpc_request *req, long timeout)
+{
+	struct ptlrpc_cli_ctx  *ctx = req->rq_cli_ctx;
+	struct ptlrpc_sec      *sec;
+	struct l_wait_info      lwi;
+	int		     rc;
+	ENTRY;
+
+	LASSERT(ctx);
+
+	if (req->rq_ctx_init || req->rq_ctx_fini)
+		RETURN(0);
+
+	/*
+	 * during the process a request's context might change type even
+	 * (e.g. from gss ctx to null ctx), so each loop we need to re-check
+	 * everything
+	 */
+again:
+	rc = import_sec_validate_get(req->rq_import, &sec);
+	if (rc)
+		RETURN(rc);
+
+	if (sec->ps_flvr.sf_rpc != req->rq_flvr.sf_rpc) {
+		CDEBUG(D_SEC, "req %p: flavor has changed %x -> %x\n",
+		      req, req->rq_flvr.sf_rpc, sec->ps_flvr.sf_rpc);
+		req_off_ctx_list(req, ctx);
+		/* NOTE(review): the return value is ignored here; on failure
+		 * the request keeps its previous context, which is then
+		 * re-examined by the checks below */
+		sptlrpc_req_replace_dead_ctx(req);
+		ctx = req->rq_cli_ctx;
+	}
+	sptlrpc_sec_put(sec);
+
+	if (cli_ctx_is_eternal(ctx))
+		RETURN(0);
+
+	/* a brand new context gets its refresh() kicked off exactly once */
+	if (unlikely(test_bit(PTLRPC_CTX_NEW_BIT, &ctx->cc_flags))) {
+		LASSERT(ctx->cc_ops->refresh);
+		ctx->cc_ops->refresh(ctx);
+	}
+	LASSERT(test_bit(PTLRPC_CTX_NEW_BIT, &ctx->cc_flags) == 0);
+
+	LASSERT(ctx->cc_ops->validate);
+	if (ctx->cc_ops->validate(ctx) == 0) {
+		req_off_ctx_list(req, ctx);
+		RETURN(0);
+	}
+
+	if (unlikely(test_bit(PTLRPC_CTX_ERROR_BIT, &ctx->cc_flags))) {
+		spin_lock(&req->rq_lock);
+		req->rq_err = 1;
+		spin_unlock(&req->rq_lock);
+		req_off_ctx_list(req, ctx);
+		RETURN(-EPERM);
+	}
+
+	/*
+	 * There's a subtle issue for resending RPCs, suppose following
+	 * situation:
+	 *  1. the request was sent to server.
+	 *  2. recovery was kicked start, after finished the request was
+	 *     marked as resent.
+	 *  3. resend the request.
+	 *  4. old reply from server received, we accept and verify the reply.
+	 *     this has to be success, otherwise the error will be aware
+	 *     by application.
+	 *  5. new reply from server received, dropped by LNet.
+	 *
+	 * Note the xid of old & new request is the same. We can't simply
+	 * change xid for the resent request because the server relies on
+	 * it for reply reconstruction.
+	 *
+	 * Commonly the original context should be uptodate because we
+	 * have a expiry nice time; server will keep its context because
+	 * we at least hold a ref of old context which prevent context
+	 * destroying RPC being sent. So server still can accept the request
+	 * and finish the RPC. But if that's not the case:
+	 *  1. If server side context has been trimmed, a NO_CONTEXT will
+	 *     be returned, gss_cli_ctx_verify/unseal will switch to new
+	 *     context by force.
+	 *  2. Current context never be refreshed, then we are fine: we
+	 *     never really send request with old context before.
+	 */
+	if (test_bit(PTLRPC_CTX_UPTODATE_BIT, &ctx->cc_flags) &&
+	    unlikely(req->rq_reqmsg) &&
+	    lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT) {
+		req_off_ctx_list(req, ctx);
+		RETURN(0);
+	}
+
+	if (unlikely(test_bit(PTLRPC_CTX_DEAD_BIT, &ctx->cc_flags))) {
+		req_off_ctx_list(req, ctx);
+		/*
+		 * don't switch ctx if import was deactivated
+		 */
+		if (req->rq_import->imp_deactive) {
+			spin_lock(&req->rq_lock);
+			req->rq_err = 1;
+			spin_unlock(&req->rq_lock);
+			RETURN(-EINTR);
+		}
+
+		rc = sptlrpc_req_replace_dead_ctx(req);
+		if (rc) {
+			LASSERT(ctx == req->rq_cli_ctx);
+			CERROR("req %p: failed to replace dead ctx %p: %d\n",
+			       req, ctx, rc);
+			spin_lock(&req->rq_lock);
+			req->rq_err = 1;
+			spin_unlock(&req->rq_lock);
+			RETURN(rc);
+		}
+
+		/* re-run every check against the replacement context */
+		ctx = req->rq_cli_ctx;
+		goto again;
+	}
+
+	/*
+	 * Now we're sure this context is during upcall, add myself into
+	 * waiting list
+	 */
+	spin_lock(&ctx->cc_lock);
+	if (list_empty(&req->rq_ctx_chain))
+		list_add(&req->rq_ctx_chain, &ctx->cc_req_list);
+	spin_unlock(&ctx->cc_lock);
+
+	/* non-blocking caller: stay queued on the list and report back */
+	if (timeout < 0)
+		RETURN(-EWOULDBLOCK);
+
+	/* Clear any flags that may be present from previous sends */
+	LASSERT(req->rq_receiving_reply == 0);
+	spin_lock(&req->rq_lock);
+	req->rq_err = 0;
+	req->rq_timedout = 0;
+	req->rq_resend = 0;
+	req->rq_restart = 0;
+	spin_unlock(&req->rq_lock);
+
+	/* timeout is in seconds; the wait is expressed in jiffies */
+	lwi = LWI_TIMEOUT_INTR(timeout * HZ, ctx_refresh_timeout,
+			       ctx_refresh_interrupt, req);
+	rc = l_wait_event(req->rq_reply_waitq, ctx_check_refresh(ctx), &lwi);
+
+	/*
+	 * following cases could lead us here:
+	 * - successfully refreshed;
+	 * - interrupted;
+	 * - timedout, and we don't want recover from the failure;
+	 * - timedout, and waked up upon recovery finished;
+	 * - someone else mark this ctx dead by force;
+	 * - someone invalidate the req and call ptlrpc_client_wake_req(),
+	 *   e.g. ptlrpc_abort_inflight();
+	 */
+	if (!cli_ctx_is_refreshed(ctx)) {
+		/* timed out or interrupted */
+		req_off_ctx_list(req, ctx);
+
+		LASSERT(rc != 0);
+		RETURN(rc);
+	}
+
+	goto again;
+}
+
+/**
+ * Initialize flavor settings for \a req, according to \a opcode.
+ *
+ * The flavor is copied from the sec of the request's current context
+ * (under ps_lock); the per-request flags rq_bulk_read, rq_bulk_write,
+ * rq_ctx_init, rq_ctx_fini, rq_pack_udesc and rq_pack_bulk are derived from
+ * \a opcode and that flavor.
+ *
+ * \note this could be called in two situations:
+ * - new request from ptlrpc_pre_req(), with proper @opcode
+ * - old request which changed ctx in the middle, with @opcode == 0
+ */
+void sptlrpc_req_set_flavor(struct ptlrpc_request *req, int opcode)
+{
+	struct ptlrpc_sec *sec;
+
+	LASSERT(req->rq_import);
+	LASSERT(req->rq_cli_ctx);
+	LASSERT(req->rq_cli_ctx->cc_sec);
+	LASSERT(req->rq_bulk_read == 0 || req->rq_bulk_write == 0);
+
+	/* special security flags according to opcode; opcodes not listed
+	 * here leave all flags untouched (there is no default case) */
+	switch (opcode) {
+	case OST_READ:
+	case MDS_READPAGE:
+	case MGS_CONFIG_READ:
+	case OBD_IDX_READ:
+		req->rq_bulk_read = 1;
+		break;
+	case OST_WRITE:
+	case MDS_WRITEPAGE:
+		req->rq_bulk_write = 1;
+		break;
+	case SEC_CTX_INIT:
+		req->rq_ctx_init = 1;
+		break;
+	case SEC_CTX_FINI:
+		req->rq_ctx_fini = 1;
+		break;
+	case 0:
+		/* init/fini rpc won't be resend, so can't be here */
+		LASSERT(req->rq_ctx_init == 0);
+		LASSERT(req->rq_ctx_fini == 0);
+
+		/* cleanup flags, which should be recalculated */
+		req->rq_pack_udesc = 0;
+		req->rq_pack_bulk = 0;
+		break;
+	}
+
+	sec = req->rq_cli_ctx->cc_sec;
+
+	/* snapshot the sec's flavor under its lock */
+	spin_lock(&sec->ps_lock);
+	req->rq_flvr = sec->ps_flvr;
+	spin_unlock(&sec->ps_lock);
+
+	/* force SVC_NULL for context initiation rpc, SVC_INTG for context
+	 * destruction rpc */
+	if (unlikely(req->rq_ctx_init))
+		flvr_set_svc(&req->rq_flvr.sf_rpc, SPTLRPC_SVC_NULL);
+	else if (unlikely(req->rq_ctx_fini))
+		flvr_set_svc(&req->rq_flvr.sf_rpc, SPTLRPC_SVC_INTG);
+
+	/* user descriptor flag, null security can't do it anyway */
+	if ((sec->ps_flvr.sf_flags & PTLRPC_SEC_FL_UDESC) &&
+	    (req->rq_flvr.sf_rpc != SPTLRPC_FLVR_NULL))
+		req->rq_pack_udesc = 1;
+
+	/* bulk security flag */
+	if ((req->rq_bulk_read || req->rq_bulk_write) &&
+	    sptlrpc_flavor_has_bulk(&req->rq_flvr))
+		req->rq_pack_bulk = 1;
+}
+
+/*
+ * For requests using the PRIV service only: free rq_reqbuf and reset its
+ * length, unless the request comes from a pool or has no rq_reqbuf at all.
+ * Requests of any other service type are left untouched.
+ */
+void sptlrpc_request_out_callback(struct ptlrpc_request *req)
+{
+	if (SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc) == SPTLRPC_SVC_PRIV) {
+		LASSERT(req->rq_clrbuf);
+
+		if (!req->rq_pool && req->rq_reqbuf) {
+			OBD_FREE(req->rq_reqbuf, req->rq_reqbuf_len);
+			req->rq_reqbuf = NULL;
+			req->rq_reqbuf_len = 0;
+		}
+	}
+}
+
+/**
+ * Given an import \a imp, check whether the current user has a valid context
+ * or not. We may create a new context and try to refresh it, retrying
+ * repeatedly in case of non-fatal errors.
+ *
+ * A throw-away ptlrpc_request is built on the heap just to drive
+ * sptlrpc_req_refresh_ctx(); it is never sent.
+ *
+ * \retval 0       the context is eternal, already valid, or was refreshed.
+ * \retval -ENOMEM no context could be obtained, or the temporary request
+ *                 could not be allocated.
+ * \retval -EACCES the context is in error state.
+ * \retval -ev     error returned by sptlrpc_req_refresh_ctx().
+ */
+int sptlrpc_import_check_ctx(struct obd_import *imp)
+{
+	struct ptlrpc_sec     *sec;
+	struct ptlrpc_cli_ctx *ctx;
+	struct ptlrpc_request *req = NULL;
+	int rc;
+	ENTRY;
+
+	might_sleep();
+
+	sec = sptlrpc_import_sec_ref(imp);
+	ctx = get_my_ctx(sec);
+	if (!ctx) {
+		sptlrpc_sec_put(sec);
+		RETURN(-ENOMEM);
+	}
+
+	if (cli_ctx_is_eternal(ctx) ||
+	    ctx->cc_ops->validate(ctx) == 0)
+		GOTO(out_put, rc = 0);
+
+	if (cli_ctx_is_error(ctx))
+		GOTO(out_put, rc = -EACCES);
+
+	OBD_ALLOC_PTR(req);
+	if (!req)
+		/* do not leak the context reference taken above */
+		GOTO(out_put, rc = -ENOMEM);
+
+	spin_lock_init(&req->rq_lock);
+	/* NOTE(review): large arbitrary refcount, presumably so that nothing
+	 * tries to free this stack-managed request -- confirm */
+	atomic_set(&req->rq_refcount, 10000);
+	INIT_LIST_HEAD(&req->rq_ctx_chain);
+	init_waitqueue_head(&req->rq_reply_waitq);
+	init_waitqueue_head(&req->rq_set_waitq);
+	req->rq_import = imp;
+	req->rq_flvr = sec->ps_flvr;
+	req->rq_cli_ctx = ctx;
+
+	/* the flavor has been copied; our sec reference is no longer needed */
+	sptlrpc_sec_put(sec);
+
+	rc = sptlrpc_req_refresh_ctx(req, 0);
+	LASSERT(list_empty(&req->rq_ctx_chain));
+	/* the refresh may have swapped the context; put whatever is attached */
+	sptlrpc_cli_ctx_put(req->rq_cli_ctx, 1);
+	OBD_FREE_PTR(req);
+
+	RETURN(rc);
+
+out_put:
+	sptlrpc_sec_put(sec);
+	sptlrpc_cli_ctx_put(ctx, 1);
+	RETURN(rc);
+}
+
+/**
+ * Used by ptlrpc client, to perform the pre-defined security transformation
+ * upon the request message of \a req. After this function called,
+ * req->rq_reqmsg is still accessible as clear text.
+ *
+ * Bulk data, if any, is wrapped first; the message itself is then signed
+ * (NULL/AUTH/INTG service) or sealed (PRIV service) by the context.
+ *
+ * \retval 0   success; rq_reqdata_len is non-zero, a multiple of 8 and no
+ *             larger than rq_reqbuf_len (all asserted).
+ * \retval -ev error from sptlrpc_cli_wrap_bulk() or from the sign()/seal()
+ *             operation.
+ */
+int sptlrpc_cli_wrap_request(struct ptlrpc_request *req)
+{
+	struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx;
+	int rc = 0;
+	ENTRY;
+
+	LASSERT(ctx);
+	LASSERT(ctx->cc_sec);
+	LASSERT(req->rq_reqbuf || req->rq_clrbuf);
+
+	/* we wrap bulk request here because now we can be sure
+	 * the context is uptodate.
+	 */
+	if (req->rq_bulk) {
+		rc = sptlrpc_cli_wrap_bulk(req, req->rq_bulk);
+		if (rc)
+			RETURN(rc);
+	}
+
+	/* dispatch on the service part of the request's flavor */
+	switch (SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc)) {
+	case SPTLRPC_SVC_NULL:
+	case SPTLRPC_SVC_AUTH:
+	case SPTLRPC_SVC_INTG:
+		LASSERT(ctx->cc_ops->sign);
+		rc = ctx->cc_ops->sign(ctx, req);
+		break;
+	case SPTLRPC_SVC_PRIV:
+		LASSERT(ctx->cc_ops->seal);
+		rc = ctx->cc_ops->seal(ctx, req);
+		break;
+	default:
+		/* unknown service type: treated as a bug */
+		LBUG();
+	}
+
+	if (rc == 0) {
+		LASSERT(req->rq_reqdata_len);
+		LASSERT(req->rq_reqdata_len % 8 == 0);
+		LASSERT(req->rq_reqdata_len <= req->rq_reqbuf_len);
+	}
+
+	RETURN(rc);
+}
+
+/*
+ * Common reply-unwrapping path shared by the normal and early-reply entry
+ * points. Expects rq_repdata/rq_repdata_len to describe the received data
+ * and rq_repmsg to be NULL on entry.
+ *
+ * The message is unpacked, checked for minimum size and for a policy that
+ * matches the request's flavor, then verified (NULL/AUTH/INTG service) or
+ * unsealed (PRIV service) by the context.
+ *
+ * \retval 0       success.
+ * \retval -EPROTO unpack failure, reply too short, or policy mismatch.
+ * \retval -ev     error from the verify()/unseal() operation.
+ */
+static int do_cli_unwrap_reply(struct ptlrpc_request *req)
+{
+	struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx;
+	int		    rc;
+	ENTRY;
+
+	LASSERT(ctx);
+	LASSERT(ctx->cc_sec);
+	LASSERT(req->rq_repbuf);
+	LASSERT(req->rq_repdata);
+	LASSERT(req->rq_repmsg == NULL);
+
+	req->rq_rep_swab_mask = 0;
+
+	rc = __lustre_unpack_msg(req->rq_repdata, req->rq_repdata_len);
+	switch (rc) {
+	case 1:
+		/* a return of 1 is recorded as "header swabbed" */
+		lustre_set_rep_swabbed(req, MSG_PTLRPC_HEADER_OFF);
+		/* fallthrough */
+	case 0:
+		break;
+	default:
+		CERROR("failed unpack reply: x"LPU64"\n", req->rq_xid);
+		RETURN(-EPROTO);
+	}
+
+	if (req->rq_repdata_len < sizeof(struct lustre_msg)) {
+		CERROR("replied data length %d too small\n",
+		       req->rq_repdata_len);
+		RETURN(-EPROTO);
+	}
+
+	/* the reply must use the same security policy as the request */
+	if (SPTLRPC_FLVR_POLICY(req->rq_repdata->lm_secflvr) !=
+	    SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc)) {
+		CERROR("reply policy %u doesn't match request policy %u\n",
+		       SPTLRPC_FLVR_POLICY(req->rq_repdata->lm_secflvr),
+		       SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc));
+		RETURN(-EPROTO);
+	}
+
+	switch (SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc)) {
+	case SPTLRPC_SVC_NULL:
+	case SPTLRPC_SVC_AUTH:
+	case SPTLRPC_SVC_INTG:
+		LASSERT(ctx->cc_ops->verify);
+		rc = ctx->cc_ops->verify(ctx, req);
+		break;
+	case SPTLRPC_SVC_PRIV:
+		LASSERT(ctx->cc_ops->unseal);
+		rc = ctx->cc_ops->unseal(ctx, req);
+		break;
+	default:
+		LBUG();
+	}
+	/* on success either rq_repmsg is set or a resend was requested */
+	LASSERT(rc || req->rq_repmsg || req->rq_resend);
+
+	/* NOTE(review): the swab mask is reset for every non-null policy
+	 * except context-init requests; the reason is not visible here,
+	 * presumably the inner message gets unpacked again later -- confirm */
+	if (SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) != SPTLRPC_POLICY_NULL &&
+	    !req->rq_ctx_init)
+		req->rq_rep_swab_mask = 0;
+	RETURN(rc);
+}
+
+/**
+ * Used by ptlrpc client, to perform security transformation upon the reply
+ * message of \a req. After return successfully, req->rq_repmsg points to
+ * the reply message in clear text.
+ *
+ * \pre the reply buffer should have been un-posted from LNet, so nothing is
+ * going to change.
+ *
+ * \retval 0       success.
+ * \retval -EPROTO the reply sits at offset 0 although the request header
+ *                 carries MSGHDR_AT_SUPPORT, or the reply offset is not a
+ *                 multiple of 8.
+ * \retval -ev     error from do_cli_unwrap_reply().
+ */
+int sptlrpc_cli_unwrap_reply(struct ptlrpc_request *req)
+{
+	LASSERT(req->rq_repbuf);
+	LASSERT(req->rq_repdata == NULL);
+	LASSERT(req->rq_repmsg == NULL);
+	LASSERT(req->rq_reply_off + req->rq_nob_received <= req->rq_repbuf_len);
+
+	if (req->rq_reply_off == 0 &&
+	    (lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT)) {
+		CERROR("real reply with offset 0\n");
+		return -EPROTO;
+	}
+
+	if (req->rq_reply_off % 8 != 0) {
+		CERROR("reply at odd offset %u\n", req->rq_reply_off);
+		return -EPROTO;
+	}
+
+	/* the reply data starts rq_reply_off bytes into the reply buffer */
+	req->rq_repdata = (struct lustre_msg *)
+				(req->rq_repbuf + req->rq_reply_off);
+	req->rq_repdata_len = req->rq_nob_received;
+
+	return do_cli_unwrap_reply(req);
+}
+
+/**
+ * Used by ptlrpc client, to perform security transformation upon the early
+ * reply message of \a req. We expect the rq_reply_off is 0, and
+ * rq_nob_received is the early reply size.
+ *
+ * Because the receive buffer might be still posted, the reply data might be
+ * changed at any time, no matter we're holding rq_lock or not. For this reason
+ * we allocate a separate ptlrpc_request and reply buffer for early reply
+ * processing.
+ *
+ * \retval 0 success, \a req_ret is filled with a duplicated ptlrpc_request.
+ * Later the caller must call sptlrpc_cli_finish_early_reply() on the returned
+ * \a *req_ret to release it.
+ * \retval -ev error number, and \a req_ret will not be set.
+ */
+int sptlrpc_cli_unwrap_early_reply(struct ptlrpc_request *req,
+				   struct ptlrpc_request **req_ret)
+{
+	struct ptlrpc_request  *early_req;
+	char		   *early_buf;
+	int		     early_bufsz, early_size;
+	int		     rc;
+	ENTRY;
+
+	OBD_ALLOC_PTR(early_req);
+	if (early_req == NULL)
+		RETURN(-ENOMEM);
+
+	/* size is sampled before taking rq_lock and re-checked under it */
+	early_size = req->rq_nob_received;
+	early_bufsz = size_roundup_power2(early_size);
+	OBD_ALLOC_LARGE(early_buf, early_bufsz);
+	if (early_buf == NULL)
+		GOTO(err_req, rc = -ENOMEM);
+
+	/* sanity checkings and copy data out, do it inside spinlock */
+	spin_lock(&req->rq_lock);
+
+	/* the real reply already arrived; nothing to do for an early one */
+	if (req->rq_replied) {
+		spin_unlock(&req->rq_lock);
+		GOTO(err_buf, rc = -EALREADY);
+	}
+
+	LASSERT(req->rq_repbuf);
+	LASSERT(req->rq_repdata == NULL);
+	LASSERT(req->rq_repmsg == NULL);
+
+	if (req->rq_reply_off != 0) {
+		CERROR("early reply with offset %u\n", req->rq_reply_off);
+		spin_unlock(&req->rq_lock);
+		GOTO(err_buf, rc = -EPROTO);
+	}
+
+	if (req->rq_nob_received != early_size) {
+		/* even another early arrived the size should be the same */
+		CERROR("data size has changed from %u to %u\n",
+		       early_size, req->rq_nob_received);
+		spin_unlock(&req->rq_lock);
+		GOTO(err_buf, rc = -EINVAL);
+	}
+
+	/* NOTE(review): a too-short early reply is reported as -EALREADY
+	 * rather than -EPROTO -- confirm this is intentional */
+	if (req->rq_nob_received < sizeof(struct lustre_msg)) {
+		CERROR("early reply length %d too small\n",
+		       req->rq_nob_received);
+		spin_unlock(&req->rq_lock);
+		GOTO(err_buf, rc = -EALREADY);
+	}
+
+	memcpy(early_buf, req->rq_repbuf, early_size);
+	spin_unlock(&req->rq_lock);
+
+	/* build the shadow request around the private copy of the data */
+	spin_lock_init(&early_req->rq_lock);
+	early_req->rq_cli_ctx = sptlrpc_cli_ctx_get(req->rq_cli_ctx);
+	early_req->rq_flvr = req->rq_flvr;
+	early_req->rq_repbuf = early_buf;
+	early_req->rq_repbuf_len = early_bufsz;
+	early_req->rq_repdata = (struct lustre_msg *) early_buf;
+	early_req->rq_repdata_len = early_size;
+	early_req->rq_early = 1;
+	/* the request message is shared with \a req, not copied */
+	early_req->rq_reqmsg = req->rq_reqmsg;
+
+	rc = do_cli_unwrap_reply(early_req);
+	if (rc) {
+		DEBUG_REQ(D_ADAPTTO, early_req,
+			  "error %d unwrap early reply", rc);
+		GOTO(err_ctx, rc);
+	}
+
+	LASSERT(early_req->rq_repmsg);
+	*req_ret = early_req;
+	RETURN(0);
+
+	/* error unwinding, in reverse order of acquisition */
+err_ctx:
+	sptlrpc_cli_ctx_put(early_req->rq_cli_ctx, 1);
+err_buf:
+	OBD_FREE_LARGE(early_buf, early_bufsz);
+err_req:
+	OBD_FREE_PTR(early_req);
+	RETURN(rc);
+}
+
+/**
+ * Used by ptlrpc client, to release a processed early reply \a early_req:
+ * drops its context reference, then frees its reply buffer and the request
+ * structure itself.
+ *
+ * \pre \a early_req was obtained from calling sptlrpc_cli_unwrap_early_reply().
+ */
+void sptlrpc_cli_finish_early_reply(struct ptlrpc_request *early_req)
+{
+	LASSERT(early_req->rq_repbuf);
+	LASSERT(early_req->rq_repdata);
+	LASSERT(early_req->rq_repmsg);
+
+	sptlrpc_cli_ctx_put(early_req->rq_cli_ctx, 1);
+	OBD_FREE_LARGE(early_req->rq_repbuf, early_req->rq_repbuf_len);
+	OBD_FREE_PTR(early_req);
+}
+
+/**************************************************
+ * sec ID					 *
+ **************************************************/
+
+/*
+ * "fixed" sec (e.g. null) use sec_id < 0
+ *
+ * Counter backing sptlrpc_get_next_secid(); it starts at 1.
+ */
+static atomic_t sptlrpc_sec_id = ATOMIC_INIT(1);
+
+/*
+ * Hand out the next sec id: atomically increments the counter and returns
+ * the incremented value, so the first id handed out is 2.
+ */
+int sptlrpc_get_next_secid(void)
+{
+	return atomic_inc_return(&sptlrpc_sec_id);
+}
+EXPORT_SYMBOL(sptlrpc_get_next_secid);
+
+/**************************************************
+ * client side high-level security APIs	   *
+ **************************************************/
+
+/*
+ * Forward a context-cache flush to the policy of \a sec. The policy must
+ * provide client operations including flush_ctx_cache (asserted). All of
+ * \a uid, \a grace and \a force are passed through unchanged, and the
+ * hook's return value is returned as is.
+ */
+static int sec_cop_flush_ctx_cache(struct ptlrpc_sec *sec, uid_t uid,
+				   int grace, int force)
+{
+	struct ptlrpc_sec_policy *policy = sec->ps_policy;
+
+	LASSERT(policy->sp_cops);
+	LASSERT(policy->sp_cops->flush_ctx_cache);
+
+	return policy->sp_cops->flush_ctx_cache(sec, uid, grace, force);
+}
+
+/*
+ * Destroy \a sec through its policy's destroy_sec hook and then drop the
+ * policy reference. Both the sec's refcount and its context count must
+ * already be zero (asserted).
+ */
+static void sec_cop_destroy_sec(struct ptlrpc_sec *sec)
+{
+	/* cache the policy: sec must not be touched after destroy_sec() */
+	struct ptlrpc_sec_policy *policy = sec->ps_policy;
+
+	LASSERT_ATOMIC_ZERO(&sec->ps_refcount);
+	LASSERT_ATOMIC_ZERO(&sec->ps_nctx);
+	LASSERT(policy->sp_cops->destroy_sec);
+
+	CDEBUG(D_SEC, "%s@%p: being destroied\n", sec->ps_policy->sp_name, sec);
+
+	policy->sp_cops->destroy_sec(sec);
+	sptlrpc_policy_put(policy);
+}
+
+/*
+ * Exported wrapper around sec_cop_destroy_sec(); the same preconditions
+ * apply (refcount and context count of \a sec are zero).
+ */
+void sptlrpc_sec_destroy(struct ptlrpc_sec *sec)
+{
+	sec_cop_destroy_sec(sec);
+}
+EXPORT_SYMBOL(sptlrpc_sec_destroy);
+
+/*
+ * Ask the policy of \a sec to kill it, then flush its context cache.
+ * The caller must hold a reference on \a sec (asserted). Does nothing for
+ * policies that have no kill_sec hook.
+ */
+static void sptlrpc_sec_kill(struct ptlrpc_sec *sec)
+{
+	LASSERT_ATOMIC_POS(&sec->ps_refcount);
+
+	if (sec->ps_policy->sp_cops->kill_sec == NULL)
+		return;
+
+	sec->ps_policy->sp_cops->kill_sec(sec);
+
+	/* same arguments that sptlrpc_import_flush_all_ctx() ends up using */
+	sec_cop_flush_ctx_cache(sec, -1, 1, 1);
+}
+
+/*
+ * Take a reference on \a sec and return it. A NULL \a sec is accepted and
+ * returned unchanged.
+ */
+struct ptlrpc_sec *sptlrpc_sec_get(struct ptlrpc_sec *sec)
+{
+	if (sec == NULL)
+		return NULL;
+
+	atomic_inc(&sec->ps_refcount);
+	return sec;
+}
+EXPORT_SYMBOL(sptlrpc_sec_get);
+
+/*
+ * Drop a reference on \a sec; a NULL \a sec is a no-op. When the last
+ * reference goes away the sec is removed from the gc list and destroyed.
+ */
+void sptlrpc_sec_put(struct ptlrpc_sec *sec)
+{
+	if (sec == NULL)
+		return;
+
+	LASSERT_ATOMIC_POS(&sec->ps_refcount);
+
+	if (!atomic_dec_and_test(&sec->ps_refcount))
+		return;
+
+	/* last reference gone */
+	sptlrpc_gc_del_sec(sec);
+	sec_cop_destroy_sec(sec);
+}
+EXPORT_SYMBOL(sptlrpc_sec_put);
+
+/*
+ * Create a new sec for \a imp with flavor \a sf and security part \a sp.
+ *
+ * With \a svc_ctx set this builds a reverse sec: the policy is taken from
+ * the service context and \a sf->sf_flags is modified in place to add the
+ * REVERSE and ROOTONLY flags. Otherwise the policy is looked up from the
+ * wire flavor in \a sf.
+ *
+ * On success the returned sec has had its refcount incremented here and,
+ * if it has a gc interval and the policy provides gc_ctx, has been added to
+ * the gc list. On failure the policy reference is dropped again.
+ *
+ * policy module is responsible for taking reference of import
+ *
+ * \retval sec  the new security instance.
+ * \retval NULL the flavor maps to no policy, or create_sec() failed.
+ */
+static
+struct ptlrpc_sec * sptlrpc_sec_create(struct obd_import *imp,
+				       struct ptlrpc_svc_ctx *svc_ctx,
+				       struct sptlrpc_flavor *sf,
+				       enum lustre_sec_part sp)
+{
+	struct ptlrpc_sec_policy *policy;
+	struct ptlrpc_sec	*sec;
+	char		      str[32];
+	ENTRY;
+
+	if (svc_ctx) {
+		/* reverse sec is only expected on a fake (dlm) import */
+		LASSERT(imp->imp_dlm_fake == 1);
+
+		CDEBUG(D_SEC, "%s %s: reverse sec using flavor %s\n",
+		       imp->imp_obd->obd_type->typ_name,
+		       imp->imp_obd->obd_name,
+		       sptlrpc_flavor2name(sf, str, sizeof(str)));
+
+		policy = sptlrpc_policy_get(svc_ctx->sc_policy);
+		sf->sf_flags |= PTLRPC_SEC_FL_REVERSE | PTLRPC_SEC_FL_ROOTONLY;
+	} else {
+		LASSERT(imp->imp_dlm_fake == 0);
+
+		CDEBUG(D_SEC, "%s %s: select security flavor %s\n",
+		       imp->imp_obd->obd_type->typ_name,
+		       imp->imp_obd->obd_name,
+		       sptlrpc_flavor2name(sf, str, sizeof(str)));
+
+		policy = sptlrpc_wireflavor2policy(sf->sf_rpc);
+		if (!policy) {
+			CERROR("invalid flavor 0x%x\n", sf->sf_rpc);
+			RETURN(NULL);
+		}
+	}
+
+	sec = policy->sp_cops->create_sec(imp, svc_ctx, sf);
+	if (sec) {
+		/* reference handed to the caller */
+		atomic_inc(&sec->ps_refcount);
+
+		sec->ps_part = sp;
+
+		if (sec->ps_gc_interval && policy->sp_cops->gc_ctx)
+			sptlrpc_gc_add_sec(sec);
+	} else {
+		sptlrpc_policy_put(policy);
+	}
+
+	RETURN(sec);
+}
+
+/*
+ * Return the sec currently installed on \a imp with an extra reference
+ * taken under imp_lock, or NULL if the import has no sec. The caller drops
+ * the reference with sptlrpc_sec_put().
+ */
+struct ptlrpc_sec *sptlrpc_import_sec_ref(struct obd_import *imp)
+{
+	struct ptlrpc_sec *sec;
+
+	spin_lock(&imp->imp_lock);
+	sec = sptlrpc_sec_get(imp->imp_sec);
+	spin_unlock(&imp->imp_lock);
+
+	return sec;
+}
+EXPORT_SYMBOL(sptlrpc_import_sec_ref);
+
+/*
+ * Make \a sec the current sec of \a imp. The pointer swap is done under
+ * imp_lock; a previously installed sec, if any, is killed and the
+ * reference the import held on it is released, both outside the lock.
+ */
+static void sptlrpc_import_sec_install(struct obd_import *imp,
+				       struct ptlrpc_sec *sec)
+{
+	struct ptlrpc_sec *old_sec;
+
+	LASSERT_ATOMIC_POS(&sec->ps_refcount);
+
+	spin_lock(&imp->imp_lock);
+	old_sec = imp->imp_sec;
+	imp->imp_sec = sec;
+	spin_unlock(&imp->imp_lock);
+
+	if (old_sec == NULL)
+		return;
+
+	sptlrpc_sec_kill(old_sec);
+
+	/* balance the ref taken by this import */
+	sptlrpc_sec_put(old_sec);
+}
+
+/*
+ * Return non-zero when the two flavors are byte-for-byte identical.
+ * NOTE(review): this is a raw memcmp() over the whole structure, so any
+ * padding bytes take part in the comparison -- confirm the struct has none
+ * or is always fully initialized.
+ */
+static inline
+int flavor_equal(struct sptlrpc_flavor *sf1, struct sptlrpc_flavor *sf2)
+{
+	return (memcmp(sf1, sf2, sizeof(*sf1)) == 0);
+}
+
+/* Copy the flavor \a src into \a dst by plain structure assignment. */
+static inline
+void flavor_copy(struct sptlrpc_flavor *dst, struct sptlrpc_flavor *src)
+{
+	*dst = *src;
+}
+
+/*
+ * Overwrite the flavor of an existing \a sec with \a sf without replacing
+ * the sec itself. A change of the flag bits is logged first; the copy is
+ * done under ps_lock. \a imp is not used by the body.
+ */
+static void sptlrpc_import_sec_adapt_inplace(struct obd_import *imp,
+					     struct ptlrpc_sec *sec,
+					     struct sptlrpc_flavor *sf)
+{
+	char    str1[32], str2[32];
+
+	/* debug output only; the flags are read here without ps_lock */
+	if (sec->ps_flvr.sf_flags != sf->sf_flags)
+		CDEBUG(D_SEC, "changing sec flags: %s -> %s\n",
+		       sptlrpc_secflags2str(sec->ps_flvr.sf_flags,
+					    str1, sizeof(str1)),
+		       sptlrpc_secflags2str(sf->sf_flags,
+					    str2, sizeof(str2)));
+
+	spin_lock(&sec->ps_lock);
+	flavor_copy(&sec->ps_flvr, sf);
+	spin_unlock(&sec->ps_lock);
+}
+
+/**
+ * To get an appropriate ptlrpc_sec for the \a imp, according to the current
+ * configuration. Upon called, imp->imp_sec may or may not be NULL.
+ *
+ *  - regular import: \a svc_ctx should be NULL and \a flvr is ignored;
+ *  - reverse import: \a svc_ctx and \a flvr are obtained from incoming request.
+ *
+ * If the import already has a sec with exactly the wanted flavor nothing is
+ * done; if only parts other than policy and mechanism differ, the existing
+ * sec is updated in place; otherwise a new sec is created and installed
+ * under imp_sec_mutex.
+ *
+ * \retval 0      success, including the case \a imp == NULL.
+ * \retval -EPERM a new sec was needed but could not be created.
+ */
+int sptlrpc_import_sec_adapt(struct obd_import *imp,
+			     struct ptlrpc_svc_ctx *svc_ctx,
+			     struct sptlrpc_flavor *flvr)
+{
+	struct ptlrpc_connection   *conn;
+	struct sptlrpc_flavor       sf;
+	struct ptlrpc_sec	  *sec, *newsec;
+	enum lustre_sec_part	sp;
+	char			str[24];
+	int			 rc = 0;
+	ENTRY;
+
+	might_sleep();
+
+	if (imp == NULL)
+		RETURN(0);
+
+	conn = imp->imp_connection;
+
+	if (svc_ctx == NULL) {
+		struct client_obd *cliobd = &imp->imp_obd->u.cli;
+		/*
+		 * normal import, determine flavor from rule set, except
+		 * for mgc the flavor is predetermined.
+		 */
+		if (cliobd->cl_sp_me == LUSTRE_SP_MGC)
+			sf = cliobd->cl_flvr_mgc;
+		else
+			sptlrpc_conf_choose_flavor(cliobd->cl_sp_me,
+						   cliobd->cl_sp_to,
+						   &cliobd->cl_target_uuid,
+						   conn->c_self, &sf);
+
+		sp = imp->imp_obd->u.cli.cl_sp_me;
+	} else {
+		/* reverse import, determine flavor from incoming request */
+		sf = *flvr;
+
+		if (sf.sf_rpc != SPTLRPC_FLVR_NULL)
+			sf.sf_flags = PTLRPC_SEC_FL_REVERSE |
+				      PTLRPC_SEC_FL_ROOTONLY;
+
+		sp = sptlrpc_target_sec_part(imp->imp_obd);
+	}
+
+	/* sec may be NULL here; the final sptlrpc_sec_put() tolerates that */
+	sec = sptlrpc_import_sec_ref(imp);
+	if (sec) {
+		char    str2[24];
+
+		/* identical flavor: nothing to do */
+		if (flavor_equal(&sf, &sec->ps_flvr))
+			GOTO(out, rc);
+
+		CDEBUG(D_SEC, "import %s->%s: changing flavor %s -> %s\n",
+		       imp->imp_obd->obd_name,
+		       obd_uuid2str(&conn->c_remote_uuid),
+		       sptlrpc_flavor2name(&sec->ps_flvr, str, sizeof(str)),
+		       sptlrpc_flavor2name(&sf, str2, sizeof(str2)));
+
+		/* same policy and mechanism: adjust the existing sec
+		 * instead of creating a new one */
+		if (SPTLRPC_FLVR_POLICY(sf.sf_rpc) ==
+		    SPTLRPC_FLVR_POLICY(sec->ps_flvr.sf_rpc) &&
+		    SPTLRPC_FLVR_MECH(sf.sf_rpc) ==
+		    SPTLRPC_FLVR_MECH(sec->ps_flvr.sf_rpc)) {
+			sptlrpc_import_sec_adapt_inplace(imp, sec, &sf);
+			GOTO(out, rc);
+		}
+	} else if (SPTLRPC_FLVR_BASE(sf.sf_rpc) !=
+		   SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_NULL)) {
+		/* first sec for this import; only non-null bases are logged */
+		CDEBUG(D_SEC, "import %s->%s netid %x: select flavor %s\n",
+		       imp->imp_obd->obd_name,
+		       obd_uuid2str(&conn->c_remote_uuid),
+		       LNET_NIDNET(conn->c_self),
+		       sptlrpc_flavor2name(&sf, str, sizeof(str)));
+	}
+
+	/* creation and installation of the new sec are serialized */
+	mutex_lock(&imp->imp_sec_mutex);
+
+	newsec = sptlrpc_sec_create(imp, svc_ctx, &sf, sp);
+	if (newsec) {
+		sptlrpc_import_sec_install(imp, newsec);
+	} else {
+		CERROR("import %s->%s: failed to create new sec\n",
+		       imp->imp_obd->obd_name,
+		       obd_uuid2str(&conn->c_remote_uuid));
+		rc = -EPERM;
+	}
+
+	mutex_unlock(&imp->imp_sec_mutex);
+out:
+	/* drop the reference taken by sptlrpc_import_sec_ref() above */
+	sptlrpc_sec_put(sec);
+	RETURN(rc);
+}
+
+/*
+ * Detach the sec from \a imp: kill it, drop the import's reference on it
+ * and clear imp->imp_sec. Nothing happens if the import has no sec.
+ * imp_lock is not taken here.
+ */
+void sptlrpc_import_sec_put(struct obd_import *imp)
+{
+	if (imp->imp_sec == NULL)
+		return;
+
+	sptlrpc_sec_kill(imp->imp_sec);
+
+	sptlrpc_sec_put(imp->imp_sec);
+	imp->imp_sec = NULL;
+}
+
+/*
+ * Shared helper for the sptlrpc_import_flush_*_ctx() entry points: flush
+ * the context cache of the sec installed on \a imp, passing \a uid,
+ * \a grace and \a force through. A NULL import, or an import without a
+ * sec, is silently ignored.
+ */
+static void import_flush_ctx_common(struct obd_import *imp,
+				    uid_t uid, int grace, int force)
+{
+	struct ptlrpc_sec *sec;
+
+	if (imp == NULL)
+		return;
+
+	sec = sptlrpc_import_sec_ref(imp);
+	if (sec != NULL) {
+		sec_cop_flush_ctx_cache(sec, uid, grace, force);
+		sptlrpc_sec_put(sec);
+	}
+}
+
+/*
+ * Flush the cached context for uid 0 on \a imp, with both grace and force
+ * set.
+ */
+void sptlrpc_import_flush_root_ctx(struct obd_import *imp)
+{
+	/* it's important to use grace mode, see the explanation in
+	 * sptlrpc_req_refresh_ctx() */
+	import_flush_ctx_common(imp, 0, 1, 1);
+}
+
+/*
+ * Flush the cached context belonging to the calling task's uid
+ * (current_uid()) on \a imp, with both grace and force set.
+ */
+void sptlrpc_import_flush_my_ctx(struct obd_import *imp)
+{
+	import_flush_ctx_common(imp, current_uid(), 1, 1);
+}
+EXPORT_SYMBOL(sptlrpc_import_flush_my_ctx);
+
+/*
+ * Flush cached contexts on \a imp using uid -1, with both grace and force
+ * set. Presumably -1 is the policy hooks' wildcard for "every uid", as the
+ * function name suggests -- confirm against the flush_ctx_cache
+ * implementations.
+ */
+void sptlrpc_import_flush_all_ctx(struct obd_import *imp)
+{
+	import_flush_ctx_common(imp, -1, 1, 1);
+}
+EXPORT_SYMBOL(sptlrpc_import_flush_all_ctx);
+
+/**
+ * Used by ptlrpc client to allocate the request buffer of \a req through
+ * the policy of the request's context. On success req->rq_reqmsg points to
+ * a buffer with size \a msgsize; a buffer taken from a pool is zeroed here.
+ *
+ * \retval 0   success.
+ * \retval -ev error returned by the policy's alloc_reqbuf() hook.
+ */
+int sptlrpc_cli_alloc_reqbuf(struct ptlrpc_request *req, int msgsize)
+{
+	struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx;
+	struct ptlrpc_sec_policy *policy;
+	int rc;
+
+	LASSERT(ctx);
+	LASSERT(ctx->cc_sec);
+	LASSERT(ctx->cc_sec->ps_policy);
+	LASSERT(req->rq_reqmsg == NULL);
+	LASSERT_ATOMIC_POS(&ctx->cc_refcount);
+
+	policy = ctx->cc_sec->ps_policy;
+	rc = policy->sp_cops->alloc_reqbuf(ctx->cc_sec, req, msgsize);
+	if (rc != 0)
+		return rc;
+
+	LASSERT(req->rq_reqmsg);
+	LASSERT(req->rq_reqbuf || req->rq_clrbuf);
+
+	/* zeroing preallocated buffer */
+	if (req->rq_pool)
+		memset(req->rq_reqmsg, 0, msgsize);
+
+	return 0;
+}
+
+/**
+ * Used by ptlrpc client to free request buffer of \a req. After this
+ * req->rq_reqmsg is set to NULL and should not be accessed anymore.
+ *
+ * The buffer is released through the policy of the request's current
+ * context. If the request has neither rq_reqbuf nor rq_clrbuf the call
+ * returns early and rq_reqmsg is left untouched.
+ */
+void sptlrpc_cli_free_reqbuf(struct ptlrpc_request *req)
+{
+	struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx;
+	struct ptlrpc_sec_policy *policy;
+
+	LASSERT(ctx);
+	LASSERT(ctx->cc_sec);
+	LASSERT(ctx->cc_sec->ps_policy);
+	LASSERT_ATOMIC_POS(&ctx->cc_refcount);
+
+	/* nothing was ever allocated */
+	if (req->rq_reqbuf == NULL && req->rq_clrbuf == NULL)
+		return;
+
+	policy = ctx->cc_sec->ps_policy;
+	policy->sp_cops->free_reqbuf(ctx->cc_sec, req);
+	req->rq_reqmsg = NULL;
+}
+
+/*
+ * Enlarge segment \a segment of \a msg to \a newsize bytes in place: the
+ * data of all segments behind it is moved to the offset it has once the
+ * segment length is \a newsize, then the new length is recorded.
+ * Does nothing if the segment already has that size.
+ *
+ * NOTE caller must guarantee the buffer size is enough for the enlargement
+ */
+void _sptlrpc_enlarge_msg_inplace(struct lustre_msg *msg,
+				  int segment, int newsize)
+{
+	void   *src, *dst;
+	int     oldsize, oldmsg_size, movesize;
+
+	LASSERT(segment < msg->lm_bufcount);
+	LASSERT(msg->lm_buflens[segment] <= newsize);
+
+	if (msg->lm_buflens[segment] == newsize)
+		return;
+
+	/* nothing to do if we are enlarging the last segment */
+	if (segment == msg->lm_bufcount - 1) {
+		msg->lm_buflens[segment] = newsize;
+		return;
+	}
+
+	oldsize = msg->lm_buflens[segment];
+
+	/* look up segment + 1 once with the old length (src) and once with
+	 * the new length temporarily installed (dst), then restore the old
+	 * length so that the size computation below still describes the
+	 * message as it is currently laid out */
+	src = lustre_msg_buf(msg, segment + 1, 0);
+	msg->lm_buflens[segment] = newsize;
+	dst = lustre_msg_buf(msg, segment + 1, 0);
+	msg->lm_buflens[segment] = oldsize;
+
+	/* move from segment + 1 to end segment */
+	LASSERT(msg->lm_magic == LUSTRE_MSG_MAGIC_V2);
+	oldmsg_size = lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens);
+	/* bytes between the old start of segment + 1 and the old message end */
+	movesize = oldmsg_size - ((unsigned long) src - (unsigned long) msg);
+	LASSERT(movesize >= 0);
+
+	if (movesize)
+		memmove(dst, src, movesize);
+
+	/* note we don't clear the areas where the old data lived, it is
+	 * not secret */
+
+	/* finally set new segment size */
+	msg->lm_buflens[segment] = newsize;
+}
+EXPORT_SYMBOL(_sptlrpc_enlarge_msg_inplace);
+
+/**
+ * Used by ptlrpc client to enlarge the \a segment of the request message
+ * pointed to by req->rq_reqmsg to size \a newsize; all previously filled-in
+ * data is preserved by the enlargement. This must be called after the
+ * original request buffer has been allocated.
+ *
+ * Returns 0 right away if the segment already has size \a newsize,
+ * otherwise the result of the policy's enlarge_reqbuf handler.
+ *
+ * \note after this has been called, rq_reqmsg and rq_reqlen might have
+ * changed, so the caller should refresh its local pointers if needed.
+ */
+int sptlrpc_cli_enlarge_reqbuf(struct ptlrpc_request *req,
+			       int segment, int newsize)
+{
+	struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx;
+	struct lustre_msg *msg = req->rq_reqmsg;
+	struct ptlrpc_sec_cops *cops;
+	int rc = 0;
+
+	LASSERT(ctx);
+	LASSERT(msg);
+	LASSERT(msg->lm_bufcount > segment);
+	LASSERT(msg->lm_buflens[segment] <= newsize);
+
+	if (msg->lm_buflens[segment] != newsize) {
+		cops = ctx->cc_sec->ps_policy->sp_cops;
+		LASSERT(cops->enlarge_reqbuf);
+		rc = cops->enlarge_reqbuf(ctx->cc_sec, req, segment, newsize);
+	}
+
+	return rc;
+}
+EXPORT_SYMBOL(sptlrpc_cli_enlarge_reqbuf);
+
+/**
+ * Used by ptlrpc client to allocate the reply buffer of \a req. If the
+ * request already owns a reply buffer nothing is allocated and 0 is
+ * returned; otherwise the result of the policy's alloc_repbuf handler is
+ * returned.
+ *
+ * \note After this, req->rq_repmsg is still not accessible.
+ */
+int sptlrpc_cli_alloc_repbuf(struct ptlrpc_request *req, int msgsize)
+{
+	struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx;
+	struct ptlrpc_sec_policy *policy;
+	int rc = 0;
+	ENTRY;
+
+	LASSERT(ctx);
+	LASSERT(ctx->cc_sec);
+	LASSERT(ctx->cc_sec->ps_policy);
+
+	if (req->rq_repbuf == NULL) {
+		policy = ctx->cc_sec->ps_policy;
+		rc = policy->sp_cops->alloc_repbuf(ctx->cc_sec, req, msgsize);
+	}
+
+	RETURN(rc);
+}
+
+/**
+ * Used by ptlrpc client to free the reply buffer of \a req. When a buffer
+ * was actually released, req->rq_repmsg is reset to NULL and must not be
+ * accessed anymore. If the request owns no reply buffer this is a no-op.
+ */
+void sptlrpc_cli_free_repbuf(struct ptlrpc_request *req)
+{
+	struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx;
+	struct ptlrpc_sec_policy *policy;
+	ENTRY;
+
+	LASSERT(ctx);
+	LASSERT(ctx->cc_sec);
+	LASSERT(ctx->cc_sec->ps_policy);
+	LASSERT_ATOMIC_POS(&ctx->cc_refcount);
+
+	if (req->rq_repbuf == NULL) {
+		/* pair the ENTRY above on this early-out path as well */
+		EXIT;
+		return;
+	}
+	LASSERT(req->rq_repbuf_len);
+
+	policy = ctx->cc_sec->ps_policy;
+	policy->sp_cops->free_repbuf(ctx->cc_sec, req);
+	req->rq_repmsg = NULL;
+	EXIT;
+}
+
+/*
+ * Call the client-side install_rctx handler of the policy behind \a ctx for
+ * import \a imp. Policies without such a handler are accepted: 0 is
+ * returned in that case, otherwise the handler's return value.
+ */
+int sptlrpc_cli_install_rvs_ctx(struct obd_import *imp,
+				struct ptlrpc_cli_ctx *ctx)
+{
+	struct ptlrpc_sec_cops *cops = ctx->cc_sec->ps_policy->sp_cops;
+
+	return cops->install_rctx ?
+	       cops->install_rctx(imp, ctx->cc_sec, ctx) : 0;
+}
+
+/*
+ * Call the server-side install_rctx handler of the policy behind \a ctx for
+ * import \a imp. Policies without such a handler are accepted: 0 is
+ * returned in that case, otherwise the handler's return value.
+ */
+int sptlrpc_svc_install_rvs_ctx(struct obd_import *imp,
+				struct ptlrpc_svc_ctx *ctx)
+{
+	struct ptlrpc_sec_policy *policy = ctx->sc_policy;
+
+	return policy->sp_sops->install_rctx ?
+	       policy->sp_sops->install_rctx(imp, ctx) : 0;
+}
+
+/****************************************
+ * server side security		 *
+ ****************************************/
+
+/*
+ * Return 1 if the rpc flavor of \a req is acceptable for the expected
+ * flavor \a exp, 0 otherwise. A request is acceptable when \a exp is the
+ * "any" flavor or the two rpc flavors are identical; context init/fini
+ * requests are additionally accepted when only policy and mechanism match.
+ */
+static int flavor_allowed(struct sptlrpc_flavor *exp,
+			  struct ptlrpc_request *req)
+{
+	struct sptlrpc_flavor *flvr = &req->rq_flvr;
+
+	if (exp->sf_rpc == SPTLRPC_FLVR_ANY)
+		return 1;
+
+	if (exp->sf_rpc == flvr->sf_rpc)
+		return 1;
+
+	/* the relaxed comparison applies to ctx init/fini rpcs only */
+	if (!req->rq_ctx_init && !req->rq_ctx_fini)
+		return 0;
+
+	return SPTLRPC_FLVR_POLICY(exp->sf_rpc) ==
+	       SPTLRPC_FLVR_POLICY(flvr->sf_rpc) &&
+	       SPTLRPC_FLVR_MECH(exp->sf_rpc) == SPTLRPC_FLVR_MECH(flvr->sf_rpc);
+}
+
+/* number of seconds the previous flavor of an export stays acceptable
+ * after a newly configured flavor has been promoted to "current" */
+#define EXP_FLVR_UPDATE_EXPIRE      (OBD_TIMEOUT_DEFAULT + 10)
+
+/**
+ * Given an export \a exp, check whether the flavor of incoming \a req
+ * is allowed by the export \a exp. Main logic is about taking care of
+ * changing configurations.
+ *
+ * The export tracks up to three flavors: the current one (exp_flvr), the
+ * "middle" one (exp_flvr_old[0]) and the "oldest" one (exp_flvr_old[1]).
+ * While exp_flvr_changed is set, exp_flvr_old[1] holds the pending new
+ * flavor instead. Old flavors are accepted until their exp_flvr_expire[]
+ * time has passed.
+ *
+ * \retval 0        the request is accepted (also when \a exp is NULL, the
+ *                  export has no reverse import, or it is a ctx fini rpc)
+ * \retval -EACCES  the flavor of \a req matches no acceptable flavor
+ * \retval other    result of sptlrpc_import_sec_adapt() or
+ *                  sptlrpc_svc_install_rvs_ctx() on the reverse import
+ */
+int sptlrpc_target_export_check(struct obd_export *exp,
+				struct ptlrpc_request *req)
+{
+	struct sptlrpc_flavor   flavor;
+
+	if (exp == NULL)
+		return 0;
+
+	/* client side export has no imp_reverse, skip
+	 * FIXME maybe we should check flavor this as well??? */
+	if (exp->exp_imp_reverse == NULL)
+		return 0;
+
+	/* don't care about ctx fini rpc */
+	if (req->rq_ctx_fini)
+		return 0;
+
+	spin_lock(&exp->exp_lock);
+
+	/* if flavor just changed (exp->exp_flvr_changed != 0), we wait for
+	 * the first req with the new flavor, then treat it as current flavor,
+	 * adapt reverse sec according to it.
+	 * note the first rpc with new flavor might not be with root ctx, in
+	 * which case delay the sec_adapt by leaving exp_flvr_adapt == 1. */
+	if (unlikely(exp->exp_flvr_changed) &&
+	    flavor_allowed(&exp->exp_flvr_old[1], req)) {
+		/* make the new flavor as "current", and old ones as
+		 * about-to-expire */
+		CDEBUG(D_SEC, "exp %p: just changed: %x->%x\n", exp,
+		       exp->exp_flvr.sf_rpc, exp->exp_flvr_old[1].sf_rpc);
+		flavor = exp->exp_flvr_old[1];
+		exp->exp_flvr_old[1] = exp->exp_flvr_old[0];
+		exp->exp_flvr_expire[1] = exp->exp_flvr_expire[0];
+		exp->exp_flvr_old[0] = exp->exp_flvr;
+		exp->exp_flvr_expire[0] = cfs_time_current_sec() +
+					  EXP_FLVR_UPDATE_EXPIRE;
+		exp->exp_flvr = flavor;
+
+		/* flavor change finished */
+		exp->exp_flvr_changed = 0;
+		LASSERT(exp->exp_flvr_adapt == 1);
+
+		/* if it's gss, we only interested in root ctx init */
+		if (req->rq_auth_gss &&
+		    !(req->rq_ctx_init &&
+		      (req->rq_auth_usr_root || req->rq_auth_usr_mdt ||
+		       req->rq_auth_usr_ost))) {
+			spin_unlock(&exp->exp_lock);
+			CDEBUG(D_SEC, "is good but not root(%d:%d:%d:%d:%d)\n",
+			       req->rq_auth_gss, req->rq_ctx_init,
+			       req->rq_auth_usr_root, req->rq_auth_usr_mdt,
+			       req->rq_auth_usr_ost);
+			return 0;
+		}
+
+		exp->exp_flvr_adapt = 0;
+		spin_unlock(&exp->exp_lock);
+
+		/* "flavor" is a local copy, so it can be used unlocked */
+		return sptlrpc_import_sec_adapt(exp->exp_imp_reverse,
+						req->rq_svc_ctx, &flavor);
+	}
+
+	/* if it equals to the current flavor, we accept it, but need to
+	 * dealing with reverse sec/ctx */
+	if (likely(flavor_allowed(&exp->exp_flvr, req))) {
+		/* most cases should return here, we only interested in
+		 * gss root ctx init */
+		if (!req->rq_auth_gss || !req->rq_ctx_init ||
+		    (!req->rq_auth_usr_root && !req->rq_auth_usr_mdt &&
+		     !req->rq_auth_usr_ost)) {
+			spin_unlock(&exp->exp_lock);
+			return 0;
+		}
+
+		/* if flavor just changed, we should not proceed, just leave
+		 * it and current flavor will be discovered and replaced
+		 * shortly, and let _this_ rpc pass through */
+		if (exp->exp_flvr_changed) {
+			LASSERT(exp->exp_flvr_adapt);
+			spin_unlock(&exp->exp_lock);
+			return 0;
+		}
+
+		if (exp->exp_flvr_adapt) {
+			exp->exp_flvr_adapt = 0;
+			CDEBUG(D_SEC, "exp %p (%x|%x|%x): do delayed adapt\n",
+			       exp, exp->exp_flvr.sf_rpc,
+			       exp->exp_flvr_old[0].sf_rpc,
+			       exp->exp_flvr_old[1].sf_rpc);
+			flavor = exp->exp_flvr;
+			spin_unlock(&exp->exp_lock);
+
+			return sptlrpc_import_sec_adapt(exp->exp_imp_reverse,
+							req->rq_svc_ctx,
+							&flavor);
+		} else {
+			CDEBUG(D_SEC, "exp %p (%x|%x|%x): is current flavor, "
+			       "install rvs ctx\n", exp, exp->exp_flvr.sf_rpc,
+			       exp->exp_flvr_old[0].sf_rpc,
+			       exp->exp_flvr_old[1].sf_rpc);
+			spin_unlock(&exp->exp_lock);
+
+			return sptlrpc_svc_install_rvs_ctx(exp->exp_imp_reverse,
+							   req->rq_svc_ctx);
+		}
+	}
+
+	/* the middle flavor is acceptable until its expiry time */
+	if (exp->exp_flvr_expire[0]) {
+		if (exp->exp_flvr_expire[0] >= cfs_time_current_sec()) {
+			if (flavor_allowed(&exp->exp_flvr_old[0], req)) {
+				CDEBUG(D_SEC, "exp %p (%x|%x|%x): match the "
+				       "middle one ("CFS_DURATION_T")\n", exp,
+				       exp->exp_flvr.sf_rpc,
+				       exp->exp_flvr_old[0].sf_rpc,
+				       exp->exp_flvr_old[1].sf_rpc,
+				       exp->exp_flvr_expire[0] -
+						cfs_time_current_sec());
+				spin_unlock(&exp->exp_lock);
+				return 0;
+			}
+		} else {
+			CDEBUG(D_SEC, "mark middle expired\n");
+			exp->exp_flvr_expire[0] = 0;
+		}
+		CDEBUG(D_SEC, "exp %p (%x|%x|%x): %x not match middle\n", exp,
+		       exp->exp_flvr.sf_rpc,
+		       exp->exp_flvr_old[0].sf_rpc, exp->exp_flvr_old[1].sf_rpc,
+		       req->rq_flvr.sf_rpc);
+	}
+
+	/* now it doesn't match the current flavor, the only chance we can
+	 * accept it is match the old flavors which is not expired. */
+	if (exp->exp_flvr_changed == 0 && exp->exp_flvr_expire[1]) {
+		if (exp->exp_flvr_expire[1] >= cfs_time_current_sec()) {
+			if (flavor_allowed(&exp->exp_flvr_old[1], req)) {
+				CDEBUG(D_SEC, "exp %p (%x|%x|%x): match the "
+				       "oldest one ("CFS_DURATION_T")\n", exp,
+				       exp->exp_flvr.sf_rpc,
+				       exp->exp_flvr_old[0].sf_rpc,
+				       exp->exp_flvr_old[1].sf_rpc,
+				       exp->exp_flvr_expire[1] -
+						cfs_time_current_sec());
+				spin_unlock(&exp->exp_lock);
+				return 0;
+			}
+		} else {
+			CDEBUG(D_SEC, "mark oldest expired\n");
+			exp->exp_flvr_expire[1] = 0;
+		}
+		CDEBUG(D_SEC, "exp %p (%x|%x|%x): %x not match found\n",
+		       exp, exp->exp_flvr.sf_rpc,
+		       exp->exp_flvr_old[0].sf_rpc, exp->exp_flvr_old[1].sf_rpc,
+		       req->rq_flvr.sf_rpc);
+	} else {
+		CDEBUG(D_SEC, "exp %p (%x|%x|%x): skip the last one\n",
+		       exp, exp->exp_flvr.sf_rpc, exp->exp_flvr_old[0].sf_rpc,
+		       exp->exp_flvr_old[1].sf_rpc);
+	}
+
+	spin_unlock(&exp->exp_lock);
+
+	/* the remaining lifetimes are printed with %+ld, i.e. as signed
+	 * values (negative once expired), so they are cast to long */
+	CWARN("exp %p(%s): req %p (%u|%u|%u|%u|%u|%u) with "
+	      "unauthorized flavor %x, expect %x|%x(%+ld)|%x(%+ld)\n",
+	      exp, exp->exp_obd->obd_name,
+	      req, req->rq_auth_gss, req->rq_ctx_init, req->rq_ctx_fini,
+	      req->rq_auth_usr_root, req->rq_auth_usr_mdt, req->rq_auth_usr_ost,
+	      req->rq_flvr.sf_rpc,
+	      exp->exp_flvr.sf_rpc,
+	      exp->exp_flvr_old[0].sf_rpc,
+	      exp->exp_flvr_expire[0] ?
+	      (long) (exp->exp_flvr_expire[0] -
+		      cfs_time_current_sec()) : 0,
+	      exp->exp_flvr_old[1].sf_rpc,
+	      exp->exp_flvr_expire[1] ?
+	      (long) (exp->exp_flvr_expire[1] -
+		      cfs_time_current_sec()) : 0);
+	return -EACCES;
+}
+EXPORT_SYMBOL(sptlrpc_target_export_check);
+
+/*
+ * Re-evaluate the flavor of every connected export of \a obd against the
+ * rule set \a rset. For each export whose chosen flavor differs from its
+ * current one (or which already has a change pending), the new flavor is
+ * stored in exp_flvr_old[1] and exp_flvr_changed / exp_flvr_adapt are set,
+ * so that sptlrpc_target_export_check() completes the switch on the first
+ * request that uses the new flavor.
+ */
+void sptlrpc_target_update_exp_flavor(struct obd_device *obd,
+				      struct sptlrpc_rule_set *rset)
+{
+	struct obd_export       *exp;
+	struct sptlrpc_flavor    new_flvr;
+
+	LASSERT(obd);
+
+	/* obd_dev_lock is held across the whole walk of the export list */
+	spin_lock(&obd->obd_dev_lock);
+
+	list_for_each_entry(exp, &obd->obd_exports, exp_obd_chain) {
+		/* exports without a connection are skipped */
+		if (exp->exp_connection == NULL)
+			continue;
+
+		/* note if this export had just been updated flavor
+		 * (exp_flvr_changed == 1), this will override the
+		 * previous one. */
+		spin_lock(&exp->exp_lock);
+		sptlrpc_target_choose_flavor(rset, exp->exp_sp_peer,
+					     exp->exp_connection->c_peer.nid,
+					     &new_flvr);
+		if (exp->exp_flvr_changed ||
+		    !flavor_equal(&new_flvr, &exp->exp_flvr)) {
+			/* park the pending flavor in slot [1]; its expiry is
+			 * cleared because the slot no longer holds an old
+			 * flavor */
+			exp->exp_flvr_old[1] = new_flvr;
+			exp->exp_flvr_expire[1] = 0;
+			exp->exp_flvr_changed = 1;
+			exp->exp_flvr_adapt = 1;
+
+			CDEBUG(D_SEC, "exp %p (%s): updated flavor %x->%x\n",
+			       exp, sptlrpc_part2name(exp->exp_sp_peer),
+			       exp->exp_flvr.sf_rpc,
+			       exp->exp_flvr_old[1].sf_rpc);
+		}
+		spin_unlock(&exp->exp_lock);
+	}
+
+	spin_unlock(&obd->obd_dev_lock);
+}
+EXPORT_SYMBOL(sptlrpc_target_update_exp_flavor);
+
+/*
+ * Sanity check the source part claimed by \a req (rq_sp_from) against the
+ * identity flags set on the request (rq_auth_usr_root/mdt/ost).
+ *
+ * \a svc_rc is the verdict reached so far; it is returned unchanged when
+ * gss is not in use, when it already is SECSVC_DROP, or when the claimed
+ * source is consistent. Otherwise SECSVC_DROP is returned.
+ */
+static int sptlrpc_svc_check_from(struct ptlrpc_request *req, int svc_rc)
+{
+	/* peer's claim is unreliable unless gss is being used */
+	if (!req->rq_auth_gss || svc_rc == SECSVC_DROP)
+		return svc_rc;
+
+	/* every consistent case returns early; whatever leaves the switch
+	 * has been logged and gets dropped */
+	switch (req->rq_sp_from) {
+	case LUSTRE_SP_CLI:
+		if (!req->rq_auth_usr_mdt && !req->rq_auth_usr_ost)
+			return svc_rc;
+		DEBUG_REQ(D_ERROR, req, "faked source CLI");
+		break;
+	case LUSTRE_SP_MDT:
+		if (req->rq_auth_usr_mdt)
+			return svc_rc;
+		DEBUG_REQ(D_ERROR, req, "faked source MDT");
+		break;
+	case LUSTRE_SP_OST:
+		if (req->rq_auth_usr_ost)
+			return svc_rc;
+		DEBUG_REQ(D_ERROR, req, "faked source OST");
+		break;
+	case LUSTRE_SP_MGS:
+	case LUSTRE_SP_MGC:
+		if (req->rq_auth_usr_root || req->rq_auth_usr_mdt ||
+		    req->rq_auth_usr_ost)
+			return svc_rc;
+		DEBUG_REQ(D_ERROR, req, "faked source MGC/MGS");
+		break;
+	case LUSTRE_SP_ANY:
+	default:
+		DEBUG_REQ(D_ERROR, req, "invalid source %u", req->rq_sp_from);
+		break;
+	}
+
+	return SECSVC_DROP;
+}
+
+/**
+ * Used by ptlrpc server, to perform transformation upon request message of
+ * incoming \a req. This must be the first thing to do with an incoming
+ * request in ptlrpc layer.
+ *
+ * \retval SECSVC_OK success, and req->rq_reqmsg point to request message in
+ * clear text, size is req->rq_reqlen; also req->rq_svc_ctx is set.
+ * \retval SECSVC_COMPLETE success, the request has been fully processed, and
+ * reply message has been prepared.
+ * \retval SECSVC_DROP failed, this request should be dropped.
+ */
+int sptlrpc_svc_unwrap_request(struct ptlrpc_request *req)
+{
+	struct ptlrpc_sec_policy *policy;
+	struct lustre_msg	*msg = req->rq_reqbuf;
+	int		       rc;
+	ENTRY;
+
+	LASSERT(msg);
+	LASSERT(req->rq_reqmsg == NULL);
+	LASSERT(req->rq_repmsg == NULL);
+	LASSERT(req->rq_svc_ctx == NULL);
+
+	req->rq_req_swab_mask = 0;
+
+	/* 1: unpacked, header recorded as swabbed; 0: unpacked;
+	 * anything else: the message could not be unpacked */
+	rc = __lustre_unpack_msg(msg, req->rq_reqdata_len);
+	switch (rc) {
+	case 1:
+		lustre_set_req_swabbed(req, MSG_PTLRPC_HEADER_OFF);
+		/* fallthrough */
+	case 0:
+		break;
+	default:
+		CERROR("error unpacking request from %s x"LPU64"\n",
+		       libcfs_id2str(req->rq_peer), req->rq_xid);
+		RETURN(SECSVC_DROP);
+	}
+
+	/* start from "unknown" values, the policy's accept handler is
+	 * expected to fill in what it can establish */
+	req->rq_flvr.sf_rpc = WIRE_FLVR(msg->lm_secflvr);
+	req->rq_sp_from = LUSTRE_SP_ANY;
+	req->rq_auth_uid = INVALID_UID;
+	req->rq_auth_mapped_uid = INVALID_UID;
+
+	policy = sptlrpc_wireflavor2policy(req->rq_flvr.sf_rpc);
+	if (!policy) {
+		CERROR("unsupported rpc flavor %x\n", req->rq_flvr.sf_rpc);
+		RETURN(SECSVC_DROP);
+	}
+
+	LASSERT(policy->sp_sops->accept);
+	rc = policy->sp_sops->accept(req);
+	/* drop the policy reference obtained by the lookup above */
+	sptlrpc_policy_put(policy);
+	LASSERT(req->rq_reqmsg || rc != SECSVC_OK);
+	LASSERT(req->rq_svc_ctx || rc == SECSVC_DROP);
+
+	/*
+	 * if it's not null flavor (which means embedded packing msg),
+	 * reset the swab mask for the coming inner msg unpacking.
+	 */
+	if (SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) != SPTLRPC_POLICY_NULL)
+		req->rq_req_swab_mask = 0;
+
+	/* sanity check for the request source */
+	rc = sptlrpc_svc_check_from(req, rc);
+	RETURN(rc);
+}
+
+/**
+ * Used by ptlrpc server, to allocate reply buffer for \a req. If succeed,
+ * req->rq_reply_state is set, and req->rq_reply_state->rs_msg point to
+ * a buffer of \a msglen size.
+ *
+ * If the policy's alloc_rs handler fails with -ENOMEM, a reply state is
+ * taken from the emergency pool of the request's service partition and the
+ * handler is called a second time.
+ *
+ * Returns 0 on success, -ENOMEM if the emergency pool yields nothing, or
+ * the error returned by the policy's alloc_rs handler.
+ */
+int sptlrpc_svc_alloc_rs(struct ptlrpc_request *req, int msglen)
+{
+	struct ptlrpc_sec_policy *policy;
+	struct ptlrpc_reply_state *rs;
+	int rc;
+	ENTRY;
+
+	LASSERT(req->rq_svc_ctx);
+	LASSERT(req->rq_svc_ctx->sc_policy);
+
+	policy = req->rq_svc_ctx->sc_policy;
+	LASSERT(policy->sp_sops->alloc_rs);
+
+	rc = policy->sp_sops->alloc_rs(req, msglen);
+	if (unlikely(rc == -ENOMEM)) {
+		/* failed alloc, try emergency pool */
+		rs = lustre_get_emerg_rs(req->rq_rqbd->rqbd_svcpt);
+		if (rs == NULL)
+			RETURN(-ENOMEM);
+
+		/* NOTE(review): the handler presumably reuses a reply state
+		 * that is already attached to the request -- confirm against
+		 * the alloc_rs implementations */
+		req->rq_reply_state = rs;
+		rc = policy->sp_sops->alloc_rs(req, msglen);
+		if (rc) {
+			/* still failing: return the emergency reply state */
+			lustre_put_emerg_rs(rs);
+			req->rq_reply_state = NULL;
+		}
+	}
+
+	LASSERT(rc != 0 ||
+		(req->rq_reply_state && req->rq_reply_state->rs_msg));
+
+	RETURN(rc);
+}
+
+/**
+ * Used by ptlrpc server, to perform transformation upon reply message, by
+ * calling the authorize handler of the policy behind req->rq_svc_ctx.
+ * Returns the handler's result; on success (0) the reply state must carry
+ * a non-zero rs_repdata_len.
+ *
+ * \post req->rq_reply_off is set to appropriate server-controlled reply
+ * offset.
+ * \post req->rq_repmsg and req->rq_reply_state->rs_msg becomes inaccessible.
+ */
+int sptlrpc_svc_wrap_reply(struct ptlrpc_request *req)
+{
+	struct ptlrpc_sec_policy *policy;
+	int rc;
+	ENTRY;
+
+	LASSERT(req->rq_svc_ctx);
+	LASSERT(req->rq_svc_ctx->sc_policy);
+
+	policy = req->rq_svc_ctx->sc_policy;
+	LASSERT(policy->sp_sops->authorize);
+
+	rc = policy->sp_sops->authorize(req);
+	LASSERT(rc || req->rq_reply_state->rs_repdata_len);
+
+	RETURN(rc);
+}
+
+/**
+ * Used by ptlrpc server, to free reply_state \a rs through the free_rs
+ * handler of the policy behind rs->rs_svc_ctx. A reply state flagged
+ * rs_prealloc is afterwards handed back with lustre_put_emerg_rs().
+ */
+void sptlrpc_svc_free_rs(struct ptlrpc_reply_state *rs)
+{
+	struct ptlrpc_sec_policy *policy;
+	unsigned int prealloc;
+	ENTRY;
+
+	LASSERT(rs->rs_svc_ctx);
+	LASSERT(rs->rs_svc_ctx->sc_policy);
+
+	policy = rs->rs_svc_ctx->sc_policy;
+	LASSERT(policy->sp_sops->free_rs);
+
+	/* sample the flag before calling free_rs();
+	 * NOTE(review): presumably because the handler may modify *rs --
+	 * confirm against the free_rs implementations */
+	prealloc = rs->rs_prealloc;
+	policy->sp_sops->free_rs(rs);
+
+	if (prealloc)
+		lustre_put_emerg_rs(rs);
+	EXIT;
+}
+
+/*
+ * Take one more reference on the server context attached to \a req.
+ * A request without a server context is left alone.
+ */
+void sptlrpc_svc_ctx_addref(struct ptlrpc_request *req)
+{
+	struct ptlrpc_svc_ctx *ctx = req->rq_svc_ctx;
+
+	if (ctx == NULL)
+		return;
+
+	atomic_inc(&ctx->sc_refcount);
+}
+
+/*
+ * Drop the reference \a req holds on its server context and detach the
+ * context from the request. When the last reference goes away the policy's
+ * free_ctx handler, if there is one, is called. A request without a server
+ * context is left alone.
+ */
+void sptlrpc_svc_ctx_decref(struct ptlrpc_request *req)
+{
+	struct ptlrpc_svc_ctx *ctx = req->rq_svc_ctx;
+
+	if (ctx == NULL)
+		return;
+
+	LASSERT_ATOMIC_POS(&ctx->sc_refcount);
+	if (atomic_dec_and_test(&ctx->sc_refcount) &&
+	    ctx->sc_policy->sp_sops->free_ctx)
+		ctx->sc_policy->sp_sops->free_ctx(ctx);
+
+	req->rq_svc_ctx = NULL;
+}
+
+/*
+ * Call the invalidate_ctx handler of the policy behind the server context
+ * of \a req, if the policy provides one. The context stays attached to the
+ * request and its reference count is not changed. A request without a
+ * server context is left alone.
+ */
+void sptlrpc_svc_ctx_invalidate(struct ptlrpc_request *req)
+{
+	struct ptlrpc_svc_ctx *ctx = req->rq_svc_ctx;
+
+	if (ctx != NULL) {
+		LASSERT_ATOMIC_POS(&ctx->sc_refcount);
+		if (ctx->sc_policy->sp_sops->invalidate_ctx)
+			ctx->sc_policy->sp_sops->invalidate_ctx(ctx);
+	}
+}
+EXPORT_SYMBOL(sptlrpc_svc_ctx_invalidate);
+
+/****************************************
+ * bulk security			*
+ ****************************************/
+
+/**
+ * Perform transformation upon bulk data pointed by \a desc. This is called
+ * before transforming the request message.
+ *
+ * Returns 0 if the request does not pack bulk data or the context has no
+ * wrap_bulk handler, otherwise the handler's return value.
+ */
+int sptlrpc_cli_wrap_bulk(struct ptlrpc_request *req,
+			  struct ptlrpc_bulk_desc *desc)
+{
+	struct ptlrpc_cli_ctx *ctx;
+
+	LASSERT(req->rq_bulk_read || req->rq_bulk_write);
+
+	if (req->rq_pack_bulk) {
+		ctx = req->rq_cli_ctx;
+		if (ctx->cc_ops->wrap_bulk)
+			return ctx->cc_ops->wrap_bulk(ctx, req, desc);
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(sptlrpc_cli_wrap_bulk);
+
+/**
+ * This is called after unwrapping the reply message of a bulk read.
+ * Returns desc->bd_nob_transferred, i.e. the nob of actual plain text
+ * received, or the negative error code of the context's unwrap_bulk
+ * handler. \a nob is not used by this function.
+ */
+int sptlrpc_cli_unwrap_bulk_read(struct ptlrpc_request *req,
+				 struct ptlrpc_bulk_desc *desc,
+				 int nob)
+{
+	struct ptlrpc_cli_ctx *ctx;
+	int rc;
+
+	LASSERT(req->rq_bulk_read && !req->rq_bulk_write);
+
+	if (req->rq_pack_bulk) {
+		ctx = req->rq_cli_ctx;
+		if (ctx->cc_ops->unwrap_bulk) {
+			rc = ctx->cc_ops->unwrap_bulk(ctx, req, desc);
+			if (rc < 0)
+				return rc;
+		}
+	}
+
+	return desc->bd_nob_transferred;
+}
+EXPORT_SYMBOL(sptlrpc_cli_unwrap_bulk_read);
+
+/**
+ * This is called after unwrapping the reply message of a bulk write.
+ *
+ * \retval 0        success, or the request does not pack bulk data
+ * \retval -EPROTO  desc->bd_nob differs from desc->bd_nob_transferred
+ * \retval other    negative error code of the context's unwrap_bulk handler
+ */
+int sptlrpc_cli_unwrap_bulk_write(struct ptlrpc_request *req,
+				  struct ptlrpc_bulk_desc *desc)
+{
+	struct ptlrpc_cli_ctx  *ctx;
+	int		     rc;
+
+	LASSERT(!req->rq_bulk_read && req->rq_bulk_write);
+
+	if (!req->rq_pack_bulk)
+		return 0;
+
+	ctx = req->rq_cli_ctx;
+	if (ctx->cc_ops->unwrap_bulk) {
+		rc = ctx->cc_ops->unwrap_bulk(ctx, req, desc);
+		if (rc < 0)
+			return rc;
+	}
+
+	/*
+	 * if everything is going right, nob should equals to nob_transferred.
+	 * in case of privacy mode, nob_transferred needs to be adjusted.
+	 */
+	if (desc->bd_nob != desc->bd_nob_transferred) {
+		/* terminate the message, otherwise the next log record is
+		 * appended to this line */
+		CERROR("nob %d doesn't match transferred nob %d\n",
+		       desc->bd_nob, desc->bd_nob_transferred);
+		return -EPROTO;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(sptlrpc_cli_unwrap_bulk_write);
+
+
+/****************************************
+ * user descriptor helpers	      *
+ ****************************************/
+
+/*
+ * Return sptlrpc_user_desc_size() for the number of groups of the current
+ * task, with the group count capped at LUSTRE_MAX_GROUPS.
+ */
+int sptlrpc_current_user_desc_size(void)
+{
+	int ngroups = current_ngroups;
+
+	return sptlrpc_user_desc_size(ngroups > LUSTRE_MAX_GROUPS ?
+				      LUSTRE_MAX_GROUPS : ngroups);
+}
+EXPORT_SYMBOL(sptlrpc_current_user_desc_size);
+
+/*
+ * Fill segment \a offset of \a msg with a user descriptor describing the
+ * current task: uid/gid, fsuid/fsgid, packed capabilities and as many
+ * supplementary groups as fit into the segment. Always returns 0.
+ *
+ * NOTE(review): the result of lustre_msg_buf() is used without a NULL
+ * check, so the caller presumably guarantees that the segment exists and
+ * holds at least sizeof(struct ptlrpc_user_desc) bytes -- confirm.
+ */
+int sptlrpc_pack_user_desc(struct lustre_msg *msg, int offset)
+{
+	struct ptlrpc_user_desc *pud;
+
+	pud = lustre_msg_buf(msg, offset, 0);
+
+	pud->pud_uid = current_uid();
+	pud->pud_gid = current_gid();
+	pud->pud_fsuid = current_fsuid();
+	pud->pud_fsgid = current_fsgid();
+	pud->pud_cap = cfs_curproc_cap_pack();
+	/* number of 4-byte group slots that fit behind the fixed header */
+	pud->pud_ngroups = (msg->lm_buflens[offset] - sizeof(*pud)) / 4;
+
+	task_lock(current);
+	/* never report more groups than the task actually has */
+	if (pud->pud_ngroups > current_ngroups)
+		pud->pud_ngroups = current_ngroups;
+	/* NOTE(review): only group_info->blocks[0] is read, so this assumes
+	 * pud_ngroups never exceeds what the first block holds -- confirm */
+	memcpy(pud->pud_groups, current_cred()->group_info->blocks[0],
+	       pud->pud_ngroups * sizeof(__u32));
+	task_unlock(current);
+
+	return 0;
+}
+EXPORT_SYMBOL(sptlrpc_pack_user_desc);
+
+/*
+ * Validate the user descriptor stored in segment \a offset of \a msg and,
+ * if \a swabbed is non-zero, byte-swap it in place.
+ *
+ * Returns 0 on success, or -EINVAL when the segment is too small for the
+ * fixed header, when more than LUSTRE_MAX_GROUPS groups are claimed, or
+ * when the claimed groups do not fit into the segment.
+ */
+int sptlrpc_unpack_user_desc(struct lustre_msg *msg, int offset, int swabbed)
+{
+	struct ptlrpc_user_desc *pud;
+	int		      i;
+
+	pud = lustre_msg_buf(msg, offset, sizeof(*pud));
+	if (!pud)
+		return -EINVAL;
+
+	/* swab the fixed header first: pud_ngroups is needed in host order
+	 * for the checks below */
+	if (swabbed) {
+		__swab32s(&pud->pud_uid);
+		__swab32s(&pud->pud_gid);
+		__swab32s(&pud->pud_fsuid);
+		__swab32s(&pud->pud_fsgid);
+		__swab32s(&pud->pud_cap);
+		__swab32s(&pud->pud_ngroups);
+	}
+
+	if (pud->pud_ngroups > LUSTRE_MAX_GROUPS) {
+		CERROR("%u groups is too large\n", pud->pud_ngroups);
+		return -EINVAL;
+	}
+
+	/* header plus claimed group array must fit into the segment */
+	if (sizeof(*pud) + pud->pud_ngroups * sizeof(__u32) >
+	    msg->lm_buflens[offset]) {
+		CERROR("%u groups are claimed but bufsize only %u\n",
+		       pud->pud_ngroups, msg->lm_buflens[offset]);
+		return -EINVAL;
+	}
+
+	/* the group array is swabbed only after its length was validated */
+	if (swabbed) {
+		for (i = 0; i < pud->pud_ngroups; i++)
+			__swab32s(&pud->pud_groups[i]);
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(sptlrpc_unpack_user_desc);
+
+/****************************************
+ * misc helpers			 *
+ ****************************************/
+
+/*
+ * Return a printable name for the target of \a sec: "*" when the sec, its
+ * import or the import's obd is missing, "c" for a reverse sec, otherwise
+ * the string form of the client obd's target uuid.
+ */
+const char *sec2target_str(struct ptlrpc_sec *sec)
+{
+	if (sec == NULL || sec->ps_import == NULL ||
+	    sec->ps_import->imp_obd == NULL)
+		return "*";
+
+	return sec_is_reverse(sec) ? "c" :
+	       obd_uuid2str(&sec->ps_import->imp_obd->u.cli.cl_target_uuid);
+}
+EXPORT_SYMBOL(sec2target_str);
+
+/*
+ * return true (1) if the bulk data is protected, i.e. the bulk service of
+ * \a flvr is either integrity or privacy; 0 otherwise
+ */
+int sptlrpc_flavor_has_bulk(struct sptlrpc_flavor *flvr)
+{
+	return SPTLRPC_FLVR_BULK_SVC(flvr->sf_rpc) == SPTLRPC_BULK_SVC_INTG ||
+	       SPTLRPC_FLVR_BULK_SVC(flvr->sf_rpc) == SPTLRPC_BULK_SVC_PRIV;
+}
+EXPORT_SYMBOL(sptlrpc_flavor_has_bulk);
+
+/****************************************
+ * crypto API helper/alloc blkcipher    *
+ ****************************************/
+
+/****************************************
+ * initialize/finalize		  *
+ ****************************************/
+
+/*
+ * Initialize the sptlrpc layer: the policy lock, then in order the gc,
+ * conf, encryption page pool, null policy, plain policy and lproc parts.
+ * If one step fails, the steps already completed are undone in reverse
+ * order and the error of the failing step is returned. Returns 0 on
+ * success.
+ */
+int sptlrpc_init(void)
+{
+	int rc;
+
+	rwlock_init(&policy_lock);
+
+	rc = sptlrpc_gc_init();
+	if (rc)
+		goto out;
+
+	rc = sptlrpc_conf_init();
+	if (rc)
+		goto out_gc;
+
+	rc = sptlrpc_enc_pool_init();
+	if (rc)
+		goto out_conf;
+
+	rc = sptlrpc_null_init();
+	if (rc)
+		goto out_pool;
+
+	rc = sptlrpc_plain_init();
+	if (rc)
+		goto out_null;
+
+	rc = sptlrpc_lproc_init();
+	if (rc)
+		goto out_plain;
+
+	return 0;
+
+	/* error unwind: each label undoes the step before the failing one
+	 * and falls through to the labels of all earlier steps */
+out_plain:
+	sptlrpc_plain_fini();
+out_null:
+	sptlrpc_null_fini();
+out_pool:
+	sptlrpc_enc_pool_fini();
+out_conf:
+	sptlrpc_conf_fini();
+out_gc:
+	sptlrpc_gc_fini();
+out:
+	return rc;
+}
+
+/*
+ * Tear down the sptlrpc layer: finalize every part set up by
+ * sptlrpc_init(), in the reverse order of their initialization.
+ */
+void sptlrpc_fini(void)
+{
+	sptlrpc_lproc_fini();
+	sptlrpc_plain_fini();
+	sptlrpc_null_fini();
+	sptlrpc_enc_pool_fini();
+	sptlrpc_conf_fini();
+	sptlrpc_gc_fini();
+}

diff --git a/drivers/staging/lustre/lustre/ptlrpc/sec_bulk.c b/drivers/staging/lustre/lustre/ptlrpc/sec_bulk.c
new file mode 100644
index 0000000..bf53f1b
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/sec_bulk.c

@@ -0,0 +1,880 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/sec_bulk.c
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/crypto.h>
+
+#include <obd.h>
+#include <obd_cksum.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre_net.h>
+#include <lustre_import.h>
+#include <lustre_dlm.h>
+#include <lustre_sec.h>
+
+#include "ptlrpc_internal.h"
+
+/****************************************
+ * bulk encryption page pools	   *
+ ****************************************/
+
+
+/* number of pointers that fit into one page; every pool is a single page
+ * holding page pointers, so this is also the page capacity of one pool */
+#define PTRS_PER_PAGE   (PAGE_CACHE_SIZE / sizeof(void *))
+#define PAGES_PER_POOL  (PTRS_PER_PAGE)
+
+/* upper bound of epp_idle_idx, which is reported as "idle index: n/100" */
+#define IDLE_IDX_MAX	    (100)
+/* NOTE(review): presumably the weight given to the previous idle index
+ * when it is updated -- the user is outside this chunk, confirm there */
+#define IDLE_IDX_WEIGHT	 (3)
+
+/* seconds without any pool access after which the shrinker considers the
+ * pools fully idle */
+#define CACHE_QUIESCENT_PERIOD  (20)
+
+/*
+ * The single, file-local set of bulk encryption page pools. Pages are kept
+ * in a two-level table: epp_pools[] points to pool arrays, each of which
+ * holds up to PAGES_PER_POOL page pointers.
+ */
+static struct ptlrpc_enc_page_pool {
+	/*
+	 * constants
+	 */
+	unsigned long    epp_max_pages;   /* maximum pages can hold, const */
+	unsigned int     epp_max_pools;   /* number of pools, const */
+
+	/*
+	 * wait queue in case of not enough free pages.
+	 */
+	wait_queue_head_t      epp_waitq;       /* waiting threads */
+	unsigned int     epp_waitqlen;    /* wait queue length */
+	unsigned long    epp_pages_short; /* # of pages wanted by in-q users */
+	unsigned int     epp_growing:1;   /* during adding pages */
+
+	/*
+	 * indicating how idle the pools are, from 0 to IDLE_IDX_MAX
+	 * this is counted based on each time when getting pages from
+	 * the pools, not based on time. which means in case that system
+	 * is idled for a while but the idle_idx might still be low if no
+	 * activities happened in the pools.
+	 */
+	unsigned long    epp_idle_idx;
+
+	/* last shrink time due to mem tight, in seconds */
+	long	     epp_last_shrink;
+	/* time of the last pool access, in seconds */
+	long	     epp_last_access;
+
+	/*
+	 * in-pool pages bookkeeping
+	 */
+	spinlock_t	 epp_lock;	   /* protect following fields */
+	unsigned long    epp_total_pages; /* total pages in pools */
+	unsigned long    epp_free_pages;  /* current pages available */
+
+	/*
+	 * statistics
+	 */
+	unsigned long    epp_st_max_pages;      /* # of pages ever reached */
+	unsigned int     epp_st_grows;	  /* # of grows */
+	unsigned int     epp_st_grow_fails;     /* # of add pages failures */
+	unsigned int     epp_st_shrinks;	/* # of shrinks */
+	unsigned long    epp_st_access;	 /* # of access */
+	unsigned long    epp_st_missings;       /* # of cache missing */
+	unsigned long    epp_st_lowfree;	/* lowest free pages reached */
+	unsigned int     epp_st_max_wqlen;      /* highest waitqueue length */
+	cfs_time_t       epp_st_max_wait;       /* in jiffies */
+	/*
+	 * pointers to pools
+	 */
+	struct page    ***epp_pools;
+} page_pools;
+
+/*
+ * memory shrinker: the "seeks" cost value and the shrinker handle.
+ * NOTE(review): both are presumably used when the shrinker is registered
+ * in the pool init code, which is outside this chunk -- confirm there.
+ */
+const int pools_shrinker_seeks = DEFAULT_SEEKS;
+static struct shrinker *pools_shrinker = NULL;
+
+
+/*
+ * /proc/fs/lustre/sptlrpc/encrypt_page_pools
+ *
+ * seq_file show handler: prints the configuration, the current state and
+ * the statistics of the encryption page pools to \a m. All fields are
+ * read under epp_lock so that the report is consistent. \a v is not used.
+ * Returns the result of seq_printf().
+ */
+int sptlrpc_proc_enc_pool_seq_show(struct seq_file *m, void *v)
+{
+	int     rc;
+
+	spin_lock(&page_pools.epp_lock);
+
+	/* "last shrink" and "last access" are printed as seconds elapsed
+	 * since the event; "max wait time" is printed as jiffies/HZ */
+	rc = seq_printf(m,
+		      "physical pages:	  %lu\n"
+		      "pages per pool:	  %lu\n"
+		      "max pages:	       %lu\n"
+		      "max pools:	       %u\n"
+		      "total pages:	     %lu\n"
+		      "total free:	      %lu\n"
+		      "idle index:	      %lu/100\n"
+		      "last shrink:	     %lds\n"
+		      "last access:	     %lds\n"
+		      "max pages reached:       %lu\n"
+		      "grows:		   %u\n"
+		      "grows failure:	   %u\n"
+		      "shrinks:		 %u\n"
+		      "cache access:	    %lu\n"
+		      "cache missing:	   %lu\n"
+		      "low free mark:	   %lu\n"
+		      "max waitqueue depth:     %u\n"
+		      "max wait time:	   "CFS_TIME_T"/%u\n"
+		      ,
+		      num_physpages,
+		      PAGES_PER_POOL,
+		      page_pools.epp_max_pages,
+		      page_pools.epp_max_pools,
+		      page_pools.epp_total_pages,
+		      page_pools.epp_free_pages,
+		      page_pools.epp_idle_idx,
+		      cfs_time_current_sec() - page_pools.epp_last_shrink,
+		      cfs_time_current_sec() - page_pools.epp_last_access,
+		      page_pools.epp_st_max_pages,
+		      page_pools.epp_st_grows,
+		      page_pools.epp_st_grow_fails,
+		      page_pools.epp_st_shrinks,
+		      page_pools.epp_st_access,
+		      page_pools.epp_st_missings,
+		      page_pools.epp_st_lowfree,
+		      page_pools.epp_st_max_wqlen,
+		      page_pools.epp_st_max_wait, HZ
+		     );
+
+	spin_unlock(&page_pools.epp_lock);
+	return rc;
+}
+
+/*
+ * Give \a npages free pages back to the system and shrink the bookkeeping
+ * accordingly. \a npages must be positive and must not exceed the number
+ * of free pages. Pool arrays that are no longer needed for the reduced
+ * total page count are freed as well.
+ *
+ * The only caller in view, enc_pools_shrink(), calls this with epp_lock
+ * held.
+ */
+static void enc_pools_release_free_pages(long npages)
+{
+	int     p_idx, g_idx;
+	int     p_idx_max1, p_idx_max2;
+
+	LASSERT(npages > 0);
+	LASSERT(npages <= page_pools.epp_free_pages);
+	LASSERT(page_pools.epp_free_pages <= page_pools.epp_total_pages);
+
+	/* max pool index before the release */
+	p_idx_max2 = (page_pools.epp_total_pages - 1) / PAGES_PER_POOL;
+
+	page_pools.epp_free_pages -= npages;
+	page_pools.epp_total_pages -= npages;
+
+	/* max pool index after the release */
+	p_idx_max1 = page_pools.epp_total_pages == 0 ? -1 :
+		     ((page_pools.epp_total_pages - 1) / PAGES_PER_POOL);
+
+	/* the pages to release start at the slot given by the reduced free
+	 * page count: p_idx selects the pool, g_idx the slot inside it */
+	p_idx = page_pools.epp_free_pages / PAGES_PER_POOL;
+	g_idx = page_pools.epp_free_pages % PAGES_PER_POOL;
+	LASSERT(page_pools.epp_pools[p_idx]);
+
+	while (npages--) {
+		LASSERT(page_pools.epp_pools[p_idx]);
+		LASSERT(page_pools.epp_pools[p_idx][g_idx] != NULL);
+
+		__free_page(page_pools.epp_pools[p_idx][g_idx]);
+		page_pools.epp_pools[p_idx][g_idx] = NULL;
+
+		/* advance to the first slot of the next pool when the
+		 * current pool is exhausted */
+		if (++g_idx == PAGES_PER_POOL) {
+			p_idx++;
+			g_idx = 0;
+		}
+	};
+
+	/* free unused pools */
+	while (p_idx_max1 < p_idx_max2) {
+		LASSERT(page_pools.epp_pools[p_idx_max2]);
+		OBD_FREE(page_pools.epp_pools[p_idx_max2], PAGE_CACHE_SIZE);
+		page_pools.epp_pools[p_idx_max2] = NULL;
+		p_idx_max2--;
+	}
+}
+
+/*
+ * Shrinker callback of the encryption page pools.
+ *
+ * could be called frequently for query (@nr_to_scan == 0).
+ * we try to keep at least PTLRPC_MAX_BRW_PAGES pages in the pool.
+ *
+ * With a non-zero scan count, up to that many free pages are released, but
+ * never so many that fewer than PTLRPC_MAX_BRW_PAGES free pages remain.
+ * The return value is the number of free pages above that reserve, scaled
+ * by how idle the pools currently are.
+ */
+static int enc_pools_shrink(SHRINKER_ARGS(sc, nr_to_scan, gfp_mask))
+{
+	if (unlikely(shrink_param(sc, nr_to_scan) != 0)) {
+		spin_lock(&page_pools.epp_lock);
+		/*
+		 * epp_free_pages is unsigned: when it is not above the
+		 * reserve, "free - reserve" would wrap around to a huge
+		 * value and the clamp below would have no effect, so that
+		 * case is handled explicitly and releases nothing.
+		 */
+		if (page_pools.epp_free_pages <= PTLRPC_MAX_BRW_PAGES)
+			shrink_param(sc, nr_to_scan) = 0;
+		else
+			shrink_param(sc, nr_to_scan) = min_t(unsigned long,
+						   shrink_param(sc, nr_to_scan),
+						   page_pools.epp_free_pages -
+						   PTLRPC_MAX_BRW_PAGES);
+		if (shrink_param(sc, nr_to_scan) > 0) {
+			enc_pools_release_free_pages(shrink_param(sc,
+								  nr_to_scan));
+			CDEBUG(D_SEC, "released %ld pages, %ld left\n",
+			       (long)shrink_param(sc, nr_to_scan),
+			       page_pools.epp_free_pages);
+
+			page_pools.epp_st_shrinks++;
+			page_pools.epp_last_shrink = cfs_time_current_sec();
+		}
+		spin_unlock(&page_pools.epp_lock);
+	}
+
+	/*
+	 * if no pool access for a long time, we consider it's fully idle.
+	 * a little race here is fine.
+	 */
+	if (unlikely(cfs_time_current_sec() - page_pools.epp_last_access >
+		     CACHE_QUIESCENT_PERIOD)) {
+		spin_lock(&page_pools.epp_lock);
+		page_pools.epp_idle_idx = IDLE_IDX_MAX;
+		spin_unlock(&page_pools.epp_lock);
+	}
+
+	LASSERT(page_pools.epp_idle_idx <= IDLE_IDX_MAX);
+	return max((int)page_pools.epp_free_pages - PTLRPC_MAX_BRW_PAGES, 0) *
+		(IDLE_IDX_MAX - page_pools.epp_idle_idx) / IDLE_IDX_MAX;
+}
+
+/* number of pools needed to hold @npages pages, rounded up */
+static inline int npages_to_npools(unsigned long npages)
+{
+	int npools;
+
+	npools = (int) ((npages + PAGES_PER_POOL - 1) / PAGES_PER_POOL);
+	return npools;
+}
+
+/*
+ * Free every page still referenced from the first @npools entries of
+ * @pools, then free each pool's pointer array and clear its entry.
+ * Entries that are already NULL are skipped.
+ *
+ * Returns the number of pages that were freed.
+ */
+static unsigned long enc_pools_cleanup(struct page ***pools, int npools)
+{
+	unsigned long	cleaned = 0;
+	int		i, j;
+
+	for (i = 0; i < npools; i++) {
+		if (pools[i] == NULL)
+			continue;
+
+		for (j = 0; j < PAGES_PER_POOL; j++) {
+			if (pools[i][j] == NULL)
+				continue;
+
+			__free_page(pools[i][j]);
+			cleaned++;
+		}
+
+		OBD_FREE(pools[i], PAGE_CACHE_SIZE);
+		pools[i] = NULL;
+	}
+
+	return cleaned;
+}
+
+/*
+ * merge @npools pointed by @pools which contains @npages new pages
+ * into current pools.
+ *
+ * we have options to avoid most memory copy with some tricks. but we choose
+ * the simplest way to avoid complexity. It's not frequently called.
+ *
+ * Preconditions (asserted below): @npages is positive, adding it does not
+ * exceed epp_max_pages, @npools is exactly the number of pools needed for
+ * @npages, and epp_growing is set.  epp_lock is taken and released here.
+ *
+ * Every page pointer and pool pointer that is moved into page_pools is set
+ * to NULL in @pools, so only what was not taken over remains in @pools.
+ */
+static void enc_pools_insert(struct page ***pools, int npools, int npages)
+{
+	int     freeslot;
+	int     op_idx, np_idx, og_idx, ng_idx;
+	int     cur_npools, end_npools;
+
+	LASSERT(npages > 0);
+	LASSERT(page_pools.epp_total_pages+npages <= page_pools.epp_max_pages);
+	LASSERT(npages_to_npools(npages) == npools);
+	LASSERT(page_pools.epp_growing);
+
+	spin_lock(&page_pools.epp_lock);
+
+	/*
+	 * (1) fill all the free slots of current pools.
+	 */
+	/* free slots are those left by rent pages, and the extra ones with
+	 * index >= total_pages, locate at the tail of last pool. */
+	freeslot = page_pools.epp_total_pages % PAGES_PER_POOL;
+	if (freeslot != 0)
+		freeslot = PAGES_PER_POOL - freeslot;
+	freeslot += page_pools.epp_total_pages - page_pools.epp_free_pages;
+
+	/* destination: first empty slot of the current pools, walking up */
+	op_idx = page_pools.epp_free_pages / PAGES_PER_POOL;
+	og_idx = page_pools.epp_free_pages % PAGES_PER_POOL;
+	/* source: last page of the new pools, walking down */
+	np_idx = npools - 1;
+	ng_idx = (npages - 1) % PAGES_PER_POOL;
+
+	while (freeslot) {
+		LASSERT(page_pools.epp_pools[op_idx][og_idx] == NULL);
+		LASSERT(pools[np_idx][ng_idx] != NULL);
+
+		page_pools.epp_pools[op_idx][og_idx] = pools[np_idx][ng_idx];
+		pools[np_idx][ng_idx] = NULL;
+
+		freeslot--;
+
+		if (++og_idx == PAGES_PER_POOL) {
+			op_idx++;
+			og_idx = 0;
+		}
+		if (--ng_idx < 0) {
+			/* all new pages have been consumed */
+			if (np_idx == 0)
+				break;
+			np_idx--;
+			ng_idx = PAGES_PER_POOL - 1;
+		}
+	}
+
+	/*
+	 * (2) add pools if needed.
+	 */
+	/* pool counts before and after adding @npages pages */
+	cur_npools = (page_pools.epp_total_pages + PAGES_PER_POOL - 1) /
+		     PAGES_PER_POOL;
+	end_npools = (page_pools.epp_total_pages + npages + PAGES_PER_POOL -1) /
+		     PAGES_PER_POOL;
+	LASSERT(end_npools <= page_pools.epp_max_pools);
+
+	/* hand over whole pools, starting with the first new one */
+	np_idx = 0;
+	while (cur_npools < end_npools) {
+		LASSERT(page_pools.epp_pools[cur_npools] == NULL);
+		LASSERT(np_idx < npools);
+		LASSERT(pools[np_idx] != NULL);
+
+		page_pools.epp_pools[cur_npools++] = pools[np_idx];
+		pools[np_idx++] = NULL;
+	}
+
+	page_pools.epp_total_pages += npages;
+	page_pools.epp_free_pages += npages;
+	/* the low-free statistic restarts from the new free page count */
+	page_pools.epp_st_lowfree = page_pools.epp_free_pages;
+
+	if (page_pools.epp_total_pages > page_pools.epp_st_max_pages)
+		page_pools.epp_st_max_pages = page_pools.epp_total_pages;
+
+	CDEBUG(D_SEC, "add %d pages to total %lu\n", npages,
+	       page_pools.epp_total_pages);
+
+	spin_unlock(&page_pools.epp_lock);
+}
+
+/*
+ * Allocate new pages and merge them into the pools.
+ *
+ * @npages is raised to at least PTLRPC_MAX_BRW_PAGES and then capped so
+ * that the total does not exceed epp_max_pages.  Concurrent callers are
+ * serialized by a function-local mutex.
+ *
+ * Returns 0 on success or -ENOMEM when any allocation fails; in that case
+ * everything allocated so far is released again.
+ */
+static int enc_pools_add_pages(int npages)
+{
+	static DEFINE_MUTEX(add_pages_mutex);
+	struct page	***pools;
+	int		   npools;
+	int		   alloced = 0;
+	int		   rc = -ENOMEM;
+	int		   i, slot;
+
+	if (npages < PTLRPC_MAX_BRW_PAGES)
+		npages = PTLRPC_MAX_BRW_PAGES;
+
+	mutex_lock(&add_pages_mutex);
+
+	if (npages + page_pools.epp_total_pages > page_pools.epp_max_pages)
+		npages = page_pools.epp_max_pages - page_pools.epp_total_pages;
+	LASSERT(npages > 0);
+
+	page_pools.epp_st_grows++;
+
+	npools = npages_to_npools(npages);
+	OBD_ALLOC(pools, npools * sizeof(*pools));
+	if (pools == NULL)
+		goto out;
+
+	for (i = 0; i < npools; i++) {
+		OBD_ALLOC(pools[i], PAGE_CACHE_SIZE);
+		if (pools[i] == NULL)
+			goto out_pools;
+
+		for (slot = 0; slot < PAGES_PER_POOL; slot++) {
+			/* the last pool may be only partially filled */
+			if (alloced >= npages)
+				break;
+
+			pools[i][slot] = alloc_page(__GFP_IO |
+						    __GFP_HIGHMEM);
+			if (pools[i][slot] == NULL)
+				goto out_pools;
+
+			alloced++;
+		}
+	}
+	LASSERT(alloced == npages);
+
+	enc_pools_insert(pools, npools, npages);
+	CDEBUG(D_SEC, "added %d pages into pools\n", npages);
+	rc = 0;
+
+	/*
+	 * reached on success too: enc_pools_insert() cleared every entry it
+	 * took over, so only leftovers are freed here.
+	 */
+out_pools:
+	enc_pools_cleanup(pools, npools);
+	OBD_FREE(pools, npools * sizeof(*pools));
+out:
+	if (rc) {
+		page_pools.epp_st_grow_fails++;
+		CERROR("Failed to allocate %d enc pages\n", npages);
+	}
+
+	mutex_unlock(&add_pages_mutex);
+	return rc;
+}
+
+/*
+ * Wake up everybody sleeping on the pool wait queue, if anybody is
+ * counted as waiting.  Must be called with epp_lock held.
+ */
+static inline void enc_pools_wakeup(void)
+{
+	LASSERT(spin_is_locked(&page_pools.epp_lock));
+	LASSERT(page_pools.epp_waitqlen >= 0);
+
+	/* common case: nobody is waiting */
+	if (likely(!page_pools.epp_waitqlen))
+		return;
+
+	LASSERT(waitqueue_active(&page_pools.epp_waitq));
+	wake_up_all(&page_pools.epp_waitq);
+}
+
+/*
+ * Decide whether the caller should grow the pools now.
+ * Returns 1 to grow, 0 to wait instead.  @now is only referenced by the
+ * disabled check below.
+ */
+static int enc_pools_should_grow(int page_needed, long now)
+{
+	/* someone else is growing the pools right now */
+	if (page_pools.epp_growing)
+		return 0;
+
+	/* the pools have reached their full capacity */
+	if (page_pools.epp_total_pages == page_pools.epp_max_pages)
+		return 0;
+
+	/* the pools cannot satisfy the request even when all pages are free */
+	if (page_pools.epp_total_pages < page_needed)
+		return 1;
+
+	/*
+	 * we wanted to return 0 here if there was a shrink just happened
+	 * moment ago, but this may cause deadlock if both client and ost
+	 * live on single node.
+	 */
+#if 0
+	if (now - page_pools.epp_last_shrink < 2)
+		return 0;
+#endif
+
+	/*
+	 * other factors like wait queue length or idle index are not
+	 * considered (yet): grow the pools in any other case.
+	 */
+	return 1;
+}
+
+/*
+ * we allocate the requested pages atomically.
+ *
+ * Takes desc->bd_iov_count pages out of the pools and stores them in a
+ * newly allocated desc->bd_enc_iov array.  While the pools do not hold
+ * enough free pages the caller either grows the pools itself or sleeps
+ * (uninterruptibly) on epp_waitq until woken, then checks again.
+ *
+ * Returns 0 on success (also when bd_enc_iov was already set), or -ENOMEM
+ * if the bd_enc_iov array cannot be allocated.
+ */
+int sptlrpc_enc_pool_get_pages(struct ptlrpc_bulk_desc *desc)
+{
+	wait_queue_t  waitlink;
+	unsigned long   this_idle = -1;	/* -1: not determined yet */
+	cfs_time_t      tick = 0;	/* time of the first shortage, 0 if none */
+	long	    now;
+	int	     p_idx, g_idx;
+	int	     i;
+
+	LASSERT(desc->bd_iov_count > 0);
+	LASSERT(desc->bd_iov_count <= page_pools.epp_max_pages);
+
+	/* resent bulk, enc iov might have been allocated previously */
+	if (desc->bd_enc_iov != NULL)
+		return 0;
+
+	OBD_ALLOC(desc->bd_enc_iov,
+		  desc->bd_iov_count * sizeof(*desc->bd_enc_iov));
+	if (desc->bd_enc_iov == NULL)
+		return -ENOMEM;
+
+	spin_lock(&page_pools.epp_lock);
+
+	page_pools.epp_st_access++;
+again:
+	/* epp_lock is held every time this label is reached */
+	if (unlikely(page_pools.epp_free_pages < desc->bd_iov_count)) {
+		if (tick == 0)
+			tick = cfs_time_current();
+
+		now = cfs_time_current_sec();
+
+		page_pools.epp_st_missings++;
+		page_pools.epp_pages_short += desc->bd_iov_count;
+
+		if (enc_pools_should_grow(desc->bd_iov_count, now)) {
+			page_pools.epp_growing = 1;
+
+			/*
+			 * grow without the spinlock held; the result is not
+			 * checked, the free page count is simply re-tested
+			 * after jumping back to "again".
+			 */
+			spin_unlock(&page_pools.epp_lock);
+			enc_pools_add_pages(page_pools.epp_pages_short / 2);
+			spin_lock(&page_pools.epp_lock);
+
+			page_pools.epp_growing = 0;
+
+			enc_pools_wakeup();
+		} else {
+			if (++page_pools.epp_waitqlen >
+			    page_pools.epp_st_max_wqlen)
+				page_pools.epp_st_max_wqlen =
+						page_pools.epp_waitqlen;
+
+			/* queue ourselves before dropping the lock */
+			set_current_state(TASK_UNINTERRUPTIBLE);
+			init_waitqueue_entry_current(&waitlink);
+			add_wait_queue(&page_pools.epp_waitq, &waitlink);
+
+			spin_unlock(&page_pools.epp_lock);
+			waitq_wait(&waitlink, TASK_UNINTERRUPTIBLE);
+			remove_wait_queue(&page_pools.epp_waitq, &waitlink);
+			LASSERT(page_pools.epp_waitqlen > 0);
+			spin_lock(&page_pools.epp_lock);
+			page_pools.epp_waitqlen--;
+		}
+
+		/* undo our contribution to the shortage before retrying */
+		LASSERT(page_pools.epp_pages_short >= desc->bd_iov_count);
+		page_pools.epp_pages_short -= desc->bd_iov_count;
+
+		/* a request that had to wait or grow counts as "not idle" */
+		this_idle = 0;
+		goto again;
+	}
+
+	/* record max wait time */
+	if (unlikely(tick != 0)) {
+		tick = cfs_time_current() - tick;
+		if (tick > page_pools.epp_st_max_wait)
+			page_pools.epp_st_max_wait = tick;
+	}
+
+	/* proceed with rest of allocation */
+	page_pools.epp_free_pages -= desc->bd_iov_count;
+
+	/* the pages handed out start at slot number "new free page count" */
+	p_idx = page_pools.epp_free_pages / PAGES_PER_POOL;
+	g_idx = page_pools.epp_free_pages % PAGES_PER_POOL;
+
+	for (i = 0; i < desc->bd_iov_count; i++) {
+		LASSERT(page_pools.epp_pools[p_idx][g_idx] != NULL);
+		desc->bd_enc_iov[i].kiov_page =
+					page_pools.epp_pools[p_idx][g_idx];
+		page_pools.epp_pools[p_idx][g_idx] = NULL;
+
+		if (++g_idx == PAGES_PER_POOL) {
+			p_idx++;
+			g_idx = 0;
+		}
+	}
+
+	if (page_pools.epp_free_pages < page_pools.epp_st_lowfree)
+		page_pools.epp_st_lowfree = page_pools.epp_free_pages;
+
+	/*
+	 * new idle index = (old * weight + new) / (weight + 1)
+	 */
+	if (this_idle == -1) {
+		/* no shortage was hit: use the share of pages still free */
+		this_idle = page_pools.epp_free_pages * IDLE_IDX_MAX /
+			    page_pools.epp_total_pages;
+	}
+	page_pools.epp_idle_idx = (page_pools.epp_idle_idx * IDLE_IDX_WEIGHT +
+				   this_idle) /
+				  (IDLE_IDX_WEIGHT + 1);
+
+	page_pools.epp_last_access = cfs_time_current_sec();
+
+	spin_unlock(&page_pools.epp_lock);
+	return 0;
+}
+EXPORT_SYMBOL(sptlrpc_enc_pool_get_pages);
+
+/*
+ * Return the pages referenced by desc->bd_enc_iov to the pools, wake up
+ * anybody waiting for pages, then free the bd_enc_iov array and reset the
+ * pointer.  Does nothing if bd_enc_iov is not set.
+ */
+void sptlrpc_enc_pool_put_pages(struct ptlrpc_bulk_desc *desc)
+{
+	int	p_idx, g_idx;
+	int	i;
+
+	/* nothing was taken from the pools for this descriptor */
+	if (desc->bd_enc_iov == NULL)
+		return;
+
+	LASSERT(desc->bd_iov_count > 0);
+
+	spin_lock(&page_pools.epp_lock);
+
+	LASSERT(page_pools.epp_free_pages + desc->bd_iov_count <=
+		page_pools.epp_total_pages);
+
+	/* returned pages go into the slots following the free ones */
+	p_idx = page_pools.epp_free_pages / PAGES_PER_POOL;
+	g_idx = page_pools.epp_free_pages % PAGES_PER_POOL;
+	LASSERT(page_pools.epp_pools[p_idx]);
+
+	i = 0;
+	while (i < desc->bd_iov_count) {
+		LASSERT(desc->bd_enc_iov[i].kiov_page != NULL);
+		LASSERT(g_idx != 0 || page_pools.epp_pools[p_idx]);
+		LASSERT(page_pools.epp_pools[p_idx][g_idx] == NULL);
+
+		page_pools.epp_pools[p_idx][g_idx] =
+					desc->bd_enc_iov[i].kiov_page;
+		i++;
+
+		g_idx++;
+		if (g_idx == PAGES_PER_POOL) {
+			p_idx++;
+			g_idx = 0;
+		}
+	}
+
+	page_pools.epp_free_pages += desc->bd_iov_count;
+
+	enc_pools_wakeup();
+
+	spin_unlock(&page_pools.epp_lock);
+
+	OBD_FREE(desc->bd_enc_iov,
+		 desc->bd_iov_count * sizeof(*desc->bd_enc_iov));
+	desc->bd_enc_iov = NULL;
+}
+EXPORT_SYMBOL(sptlrpc_enc_pool_put_pages);
+
+/*
+ * Register a user of the pools.  The only work left here is to add some
+ * initial pages when the pools are still empty and nobody else is growing
+ * them; everything else is handled by the pools' self-adaption.
+ *
+ * Always returns 0; a failure to add the initial pages is not reported.
+ */
+int sptlrpc_enc_pool_add_user(void)
+{
+	int	need_grow;
+
+	spin_lock(&page_pools.epp_lock);
+	need_grow = (page_pools.epp_growing == 0 &&
+		     page_pools.epp_total_pages == 0);
+	if (need_grow)
+		page_pools.epp_growing = 1;
+	spin_unlock(&page_pools.epp_lock);
+
+	if (!need_grow)
+		return 0;
+
+	enc_pools_add_pages(PTLRPC_MAX_BRW_PAGES +
+			    PTLRPC_MAX_BRW_PAGES);
+
+	spin_lock(&page_pools.epp_lock);
+	page_pools.epp_growing = 0;
+	enc_pools_wakeup();
+	spin_unlock(&page_pools.epp_lock);
+
+	return 0;
+}
+EXPORT_SYMBOL(sptlrpc_enc_pool_add_user);
+
+/* counterpart of sptlrpc_enc_pool_add_user(); nothing to do, returns 0 */
+int sptlrpc_enc_pool_del_user(void)
+{
+	return 0;
+}
+EXPORT_SYMBOL(sptlrpc_enc_pool_del_user);
+
+/*
+ * Allocate the array of epp_max_pools pool pointers.  The caller detects
+ * failure by checking page_pools.epp_pools for NULL afterwards.
+ */
+static inline void enc_pools_alloc(void)
+{
+	LASSERT(page_pools.epp_max_pools);
+
+	OBD_ALLOC_LARGE(page_pools.epp_pools,
+			page_pools.epp_max_pools *
+			sizeof(*page_pools.epp_pools));
+}
+
+/* free the pool pointer array allocated by enc_pools_alloc() */
+static inline void enc_pools_free(void)
+{
+	LASSERT(page_pools.epp_max_pools);
+	LASSERT(page_pools.epp_pools);
+
+	OBD_FREE_LARGE(page_pools.epp_pools,
+		       page_pools.epp_max_pools *
+		       sizeof(*page_pools.epp_pools));
+}
+
+/*
+ * Set up the (initially empty) encryption page pools and register the
+ * shrinker.  Returns 0 on success, -ENOMEM if the pool pointer array or
+ * the shrinker cannot be set up.
+ */
+int sptlrpc_enc_pool_init(void)
+{
+	/*
+	 * maximum capacity is 1/8 of total physical memory.
+	 * is the 1/8 a good number?
+	 */
+	page_pools.epp_max_pages = num_physpages / 8;
+	page_pools.epp_max_pools = npages_to_npools(page_pools.epp_max_pages);
+
+	/* lock and wait queue */
+	spin_lock_init(&page_pools.epp_lock);
+	init_waitqueue_head(&page_pools.epp_waitq);
+
+	/* runtime state: nothing allocated, nobody waiting or growing */
+	page_pools.epp_total_pages = 0;
+	page_pools.epp_free_pages = 0;
+	page_pools.epp_waitqlen = 0;
+	page_pools.epp_pages_short = 0;
+	page_pools.epp_growing = 0;
+	page_pools.epp_idle_idx = 0;
+	page_pools.epp_last_shrink = cfs_time_current_sec();
+	page_pools.epp_last_access = cfs_time_current_sec();
+
+	/* statistics */
+	page_pools.epp_st_max_pages = 0;
+	page_pools.epp_st_grows = 0;
+	page_pools.epp_st_grow_fails = 0;
+	page_pools.epp_st_shrinks = 0;
+	page_pools.epp_st_access = 0;
+	page_pools.epp_st_missings = 0;
+	page_pools.epp_st_lowfree = 0;
+	page_pools.epp_st_max_wqlen = 0;
+	page_pools.epp_st_max_wait = 0;
+
+	enc_pools_alloc();
+	if (page_pools.epp_pools == NULL)
+		return -ENOMEM;
+
+	pools_shrinker = set_shrinker(pools_shrinker_seeks,
+				      enc_pools_shrink);
+	if (pools_shrinker != NULL)
+		return 0;
+
+	/* no shrinker: give the pointer array back */
+	enc_pools_free();
+	return -ENOMEM;
+}
+
+/*
+ * Tear down the pools: unregister the shrinker, free all pages and pools,
+ * and log the collected statistics if the pools were ever accessed.
+ * All pages must have been returned to the pools by now.
+ */
+void sptlrpc_enc_pool_fini(void)
+{
+	unsigned long	cleaned, npools;
+
+	LASSERT(pools_shrinker);
+	LASSERT(page_pools.epp_pools);
+	LASSERT(page_pools.epp_total_pages == page_pools.epp_free_pages);
+
+	remove_shrinker(pools_shrinker);
+
+	npools = npages_to_npools(page_pools.epp_total_pages);
+	cleaned = enc_pools_cleanup(page_pools.epp_pools, npools);
+	LASSERT(cleaned == page_pools.epp_total_pages);
+
+	enc_pools_free();
+
+	/* nothing worth reporting if the pools were never used */
+	if (page_pools.epp_st_access == 0)
+		return;
+
+	CDEBUG(D_SEC,
+	       "max pages %lu, grows %u, grow fails %u, shrinks %u, "
+	       "access %lu, missing %lu, max qlen %u, max wait "
+	       CFS_TIME_T"/%d\n",
+	       page_pools.epp_st_max_pages, page_pools.epp_st_grows,
+	       page_pools.epp_st_grow_fails,
+	       page_pools.epp_st_shrinks, page_pools.epp_st_access,
+	       page_pools.epp_st_missings, page_pools.epp_st_max_wqlen,
+	       page_pools.epp_st_max_wait, HZ);
+}
+
+
+/* translation table, indexed by BULK_HASH_ALG_*, giving the CFS_HASH_ALG_* id */
+static int cfs_hash_alg_id[] = {
+	[BULK_HASH_ALG_NULL]	= CFS_HASH_ALG_NULL,
+	[BULK_HASH_ALG_ADLER32]	= CFS_HASH_ALG_ADLER32,
+	[BULK_HASH_ALG_CRC32]	= CFS_HASH_ALG_CRC32,
+	[BULK_HASH_ALG_MD5]	= CFS_HASH_ALG_MD5,
+	[BULK_HASH_ALG_SHA1]	= CFS_HASH_ALG_SHA1,
+	[BULK_HASH_ALG_SHA256]	= CFS_HASH_ALG_SHA256,
+	[BULK_HASH_ALG_SHA384]	= CFS_HASH_ALG_SHA384,
+	[BULK_HASH_ALG_SHA512]	= CFS_HASH_ALG_SHA512,
+};
+
+/*
+ * Return the name of bulk hash algorithm @hash_alg (a BULK_HASH_ALG_* id).
+ *
+ * @hash_alg is a raw byte, so it is range-checked before being used as an
+ * index into the translation table; NULL is returned for an id beyond the
+ * table instead of reading past its end.
+ * NOTE(review): NULL is assumed to be an acceptable "unknown" result for
+ * callers -- confirm against the users of this function.
+ */
+const char * sptlrpc_get_hash_name(__u8 hash_alg)
+{
+	if (hash_alg >= sizeof(cfs_hash_alg_id) / sizeof(cfs_hash_alg_id[0]))
+		return NULL;
+
+	return cfs_crypto_hash_name(cfs_hash_alg_id[hash_alg]);
+}
+EXPORT_SYMBOL(sptlrpc_get_hash_name);
+
+/* look up the id of the hash algorithm called @algname */
+__u8 sptlrpc_get_hash_alg(const char *algname)
+{
+	__u8 alg;
+
+	alg = cfs_crypto_hash_alg(algname);
+	return alg;
+}
+EXPORT_SYMBOL(sptlrpc_get_hash_alg);
+
+/*
+ * Locate the bulk security descriptor in buffer @offset of @msg, byte-swap
+ * bsd_nob if the message is @swabbed, and sanity check its version, type
+ * and service fields.
+ *
+ * Returns 0 if the descriptor looks valid, -EINVAL if the buffer cannot
+ * hold a descriptor, -EPROTO if a field has an unexpected value.
+ */
+int bulk_sec_desc_unpack(struct lustre_msg *msg, int offset, int swabbed)
+{
+	struct ptlrpc_bulk_sec_desc	*bsd;
+	int				 size = msg->lm_buflens[offset];
+
+	bsd = lustre_msg_buf(msg, offset, sizeof(*bsd));
+	if (bsd == NULL) {
+		CERROR("Invalid bulk sec desc: size %d\n", size);
+		return -EINVAL;
+	}
+
+	if (swabbed)
+		__swab32s(&bsd->bsd_nob);
+
+	if (unlikely(bsd->bsd_version != 0)) {
+		CERROR("Unexpected version %u\n", bsd->bsd_version);
+		return -EPROTO;
+	}
+
+	if (unlikely(bsd->bsd_type >= SPTLRPC_BULK_MAX)) {
+		CERROR("Invalid type %u\n", bsd->bsd_type);
+		return -EPROTO;
+	}
+
+	/* FIXME more sanity check here */
+
+	/* only the three known service types are accepted */
+	if (unlikely(!(bsd->bsd_svc == SPTLRPC_BULK_SVC_NULL ||
+		       bsd->bsd_svc == SPTLRPC_BULK_SVC_INTG ||
+		       bsd->bsd_svc == SPTLRPC_BULK_SVC_PRIV))) {
+		CERROR("Invalid svc %u\n", bsd->bsd_svc);
+		return -EPROTO;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(bulk_sec_desc_unpack);
+
+/*
+ * Compute the checksum of all pages of @desc with bulk hash algorithm @alg
+ * and store it in @buf (@buflen bytes, at least 4).  If the digest is
+ * larger than @buflen, only its first @buflen bytes are stored.
+ *
+ * Returns 0 on success, or the error reported by the hash routines.
+ */
+int sptlrpc_get_bulk_checksum(struct ptlrpc_bulk_desc *desc, __u8 alg,
+			      void *buf, int buflen)
+{
+	struct cfs_crypto_hash_desc	*hdesc;
+	char				 digest[64];
+	unsigned int			 outlen;
+	int				 hash_id;
+	int				 digest_len;
+	int				 n, rc;
+
+	LASSERT(alg > BULK_HASH_ALG_NULL && alg < BULK_HASH_ALG_MAX);
+	LASSERT(buflen >= 4);
+
+	hash_id = cfs_hash_alg_id[alg];
+
+	hdesc = cfs_crypto_hash_init(hash_id, NULL, 0);
+	if (IS_ERR(hdesc)) {
+		CERROR("Unable to initialize checksum hash %s\n",
+		       cfs_crypto_hash_name(hash_id));
+		return PTR_ERR(hdesc);
+	}
+
+	digest_len = cfs_crypto_hash_digestsize(hash_id);
+
+	for (n = 0; n < desc->bd_iov_count; n++)
+		cfs_crypto_hash_update_page(hdesc, desc->bd_iov[n].kiov_page,
+				  desc->bd_iov[n].kiov_offset & ~CFS_PAGE_MASK,
+				  desc->bd_iov[n].kiov_len);
+
+	if (digest_len <= buflen) {
+		/* the whole digest fits: write it straight into @buf */
+		outlen = buflen;
+		rc = cfs_crypto_hash_final(hdesc, (unsigned char *)buf,
+					   &outlen);
+	} else {
+		/* digest too large: go through a bounce buffer and truncate */
+		outlen = sizeof(digest);
+		rc = cfs_crypto_hash_final(hdesc, (unsigned char *)digest,
+					   &outlen);
+		memcpy(buf, digest, buflen);
+	}
+
+	/* on error, finalize once more without an output buffer */
+	if (rc)
+		cfs_crypto_hash_final(hdesc, NULL, NULL);
+	return rc;
+}
+EXPORT_SYMBOL(sptlrpc_get_bulk_checksum);

diff --git a/drivers/staging/lustre/lustre/ptlrpc/sec_config.c b/drivers/staging/lustre/lustre/ptlrpc/sec_config.c
new file mode 100644
index 0000000..a45a392
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/sec_config.c

@@ -0,0 +1,1233 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/crypto.h>
+#include <linux/key.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre_net.h>
+#include <lustre_import.h>
+#include <lustre_log.h>
+#include <lustre_disk.h>
+#include <lustre_dlm.h>
+#include <lustre_param.h>
+#include <lustre_sec.h>
+
+#include "ptlrpc_internal.h"
+
+/* short printable name of security part @part, "err" if it is unknown */
+const char *sptlrpc_part2name(enum lustre_sec_part part)
+{
+	const char *name;
+
+	switch (part) {
+	case LUSTRE_SP_CLI:
+		name = "cli";
+		break;
+	case LUSTRE_SP_MDT:
+		name = "mdt";
+		break;
+	case LUSTRE_SP_OST:
+		name = "ost";
+		break;
+	case LUSTRE_SP_MGC:
+		name = "mgc";
+		break;
+	case LUSTRE_SP_MGS:
+		name = "mgs";
+		break;
+	case LUSTRE_SP_ANY:
+		name = "any";
+		break;
+	default:
+		name = "err";
+		break;
+	}
+
+	return name;
+}
+EXPORT_SYMBOL(sptlrpc_part2name);
+
+/*
+ * Map the obd type name of target @obd to its security part.
+ * An unknown type is logged and reported as LUSTRE_SP_ANY.
+ */
+enum lustre_sec_part sptlrpc_target_sec_part(struct obd_device *obd)
+{
+	const char *type = obd->obd_type->typ_name;
+
+	if (strcmp(type, LUSTRE_MDT_NAME) == 0)
+		return LUSTRE_SP_MDT;
+	else if (strcmp(type, LUSTRE_OST_NAME) == 0)
+		return LUSTRE_SP_OST;
+	else if (strcmp(type, LUSTRE_MGS_NAME) == 0)
+		return LUSTRE_SP_MGS;
+
+	CERROR("unknown target %p(%s)\n", obd, type);
+	return LUSTRE_SP_ANY;
+}
+EXPORT_SYMBOL(sptlrpc_target_sec_part);
+
+/****************************************
+ * user supplied flavor string parsing  *
+ ****************************************/
+
+/*
+ * Parse a user supplied flavor string into @flvr.
+ *
+ * format: <base_flavor>[-<bulk_type:alg_spec>]
+ *
+ * A NULL or empty string yields SPTLRPC_FLVR_INVALID in sf_rpc and is not
+ * an error.  Only the first 31 characters of @str are looked at.
+ *
+ * Returns 0 on success, -EINVAL if the string cannot be parsed.
+ */
+int sptlrpc_parse_flavor(const char *str, struct sptlrpc_flavor *flvr)
+{
+	char	 spec[32];
+	char	*bulk_spec;
+	char	*hash_name;
+
+	memset(flvr, 0, sizeof(*flvr));
+
+	if (str == NULL || str[0] == '\0') {
+		flvr->sf_rpc = SPTLRPC_FLVR_INVALID;
+		return 0;
+	}
+
+	/* work on a private, always terminated copy that can be cut up */
+	strncpy(spec, str, sizeof(spec));
+	spec[sizeof(spec) - 1] = '\0';
+
+	bulk_spec = strchr(spec, '-');
+	if (bulk_spec != NULL) {
+		*bulk_spec = '\0';
+		bulk_spec++;
+	}
+
+	flvr->sf_rpc = sptlrpc_name2flavor_base(spec);
+	if (flvr->sf_rpc == SPTLRPC_FLVR_INVALID)
+		goto err_out;
+
+	/*
+	 * currently only base flavor "plain" can have bulk specification.
+	 */
+	if (flvr->sf_rpc != SPTLRPC_FLVR_PLAIN) {
+		if (bulk_spec != NULL)
+			goto err_out;
+
+		flvr->sf_flags = 0;
+		return 0;
+	}
+
+	flvr->u_bulk.hash.hash_alg = BULK_HASH_ALG_ADLER32;
+	if (bulk_spec != NULL) {
+		/*
+		 * format: plain-hash:<hash_alg>
+		 */
+		hash_name = strchr(bulk_spec, ':');
+		if (hash_name == NULL)
+			goto err_out;
+		*hash_name = '\0';
+		hash_name++;
+
+		if (strcmp(bulk_spec, "hash") != 0)
+			goto err_out;
+
+		flvr->u_bulk.hash.hash_alg = sptlrpc_get_hash_alg(hash_name);
+		if (flvr->u_bulk.hash.hash_alg >= BULK_HASH_ALG_MAX)
+			goto err_out;
+	}
+
+	if (flvr->u_bulk.hash.hash_alg == BULK_HASH_ALG_NULL)
+		flvr_set_bulk_svc(&flvr->sf_rpc, SPTLRPC_BULK_SVC_NULL);
+	else
+		flvr_set_bulk_svc(&flvr->sf_rpc, SPTLRPC_BULK_SVC_INTG);
+
+	flvr->sf_flags = 0;
+	return 0;
+
+err_out:
+	CERROR("invalid flavor string: %s\n", str);
+	return -EINVAL;
+}
+EXPORT_SYMBOL(sptlrpc_parse_flavor);
+
+/****************************************
+ * configure rules		      *
+ ****************************************/
+
+/* reset @sf to the default flavor: SPTLRPC_FLVR_NULL with no flags */
+static void get_default_flavor(struct sptlrpc_flavor *sf)
+{
+	memset(sf, 0, sizeof(*sf));
+
+	sf->sf_flags = 0;
+	sf->sf_rpc = SPTLRPC_FLVR_NULL;
+}
+
+/* make @rule a wildcard rule (any net, any direction) with default flavor */
+static void sptlrpc_rule_init(struct sptlrpc_rule *rule)
+{
+	get_default_flavor(&rule->sr_flvr);
+
+	rule->sr_netid = LNET_NIDNET(LNET_NID_ANY);
+	rule->sr_from = LUSTRE_SP_ANY;
+	rule->sr_to = LUSTRE_SP_ANY;
+	rule->sr_padding = 0;
+}
+
+/*
+ * Parse one rule string into @rule.
+ *
+ * format: network[.direction]=flavor
+ *
+ * @param is modified in place: the '=' and '.' separators are overwritten
+ * with NUL.  Returns 0 on success, -EINVAL on any parse error.
+ */
+int sptlrpc_parse_rule(char *param, struct sptlrpc_rule *rule)
+{
+	/* the direction names understood, with the parts they stand for */
+	static const struct {
+		const char		*dir_name;
+		enum lustre_sec_part	 dir_from;
+		enum lustre_sec_part	 dir_to;
+	} dir_tbl[] = {
+		{ "mdt2ost", LUSTRE_SP_MDT, LUSTRE_SP_OST },
+		{ "mdt2mdt", LUSTRE_SP_MDT, LUSTRE_SP_MDT },
+		{ "cli2ost", LUSTRE_SP_CLI, LUSTRE_SP_OST },
+		{ "cli2mdt", LUSTRE_SP_CLI, LUSTRE_SP_MDT },
+	};
+	char		*flavor, *dir;
+	unsigned int	 k;
+	int		 rc;
+
+	sptlrpc_rule_init(rule);
+
+	flavor = strchr(param, '=');
+	if (flavor == NULL) {
+		CERROR("invalid param, no '='\n");
+		RETURN(-EINVAL);
+	}
+	*flavor = '\0';
+	flavor++;
+
+	dir = strchr(param, '.');
+	if (dir != NULL) {
+		*dir = '\0';
+		dir++;
+	}
+
+	/* 1.1 network; "default" keeps the wildcard net id */
+	if (strcmp(param, "default") != 0) {
+		rule->sr_netid = libcfs_str2net(param);
+		if (rule->sr_netid == LNET_NIDNET(LNET_NID_ANY)) {
+			CERROR("invalid network name: %s\n", param);
+			RETURN(-EINVAL);
+		}
+	}
+
+	/* 1.2 direction */
+	if (dir != NULL) {
+		for (k = 0; k < sizeof(dir_tbl) / sizeof(dir_tbl[0]); k++) {
+			if (strcmp(dir, dir_tbl[k].dir_name) == 0)
+				break;
+		}
+
+		if (k == sizeof(dir_tbl) / sizeof(dir_tbl[0])) {
+			CERROR("invalid rule dir segment: %s\n", dir);
+			RETURN(-EINVAL);
+		}
+
+		rule->sr_from = dir_tbl[k].dir_from;
+		rule->sr_to = dir_tbl[k].dir_to;
+	}
+
+	/* 2.1 flavor */
+	rc = sptlrpc_parse_flavor(flavor, &rule->sr_flvr);
+	if (rc)
+		RETURN(-EINVAL);
+
+	RETURN(0);
+}
+EXPORT_SYMBOL(sptlrpc_parse_rule);
+
+/*
+ * Free the rule array of @rset, if it has one, and re-initialize the set.
+ */
+void sptlrpc_rule_set_free(struct sptlrpc_rule_set *rset)
+{
+	LASSERT(rset->srs_nslot ||
+		(rset->srs_nrule == 0 && rset->srs_rules == NULL));
+
+	/* no slots were ever allocated */
+	if (rset->srs_nslot == 0)
+		return;
+
+	OBD_FREE(rset->srs_rules,
+		 rset->srs_nslot * sizeof(*rset->srs_rules));
+	sptlrpc_rule_set_init(rset);
+}
+EXPORT_SYMBOL(sptlrpc_rule_set_free);
+
+/*
+ * Make sure @rset can accommodate one more rule, growing the slot array
+ * by 8 entries if it is full.
+ *
+ * Returns 0 if there is room (possibly after growing), -ENOMEM otherwise.
+ */
+int sptlrpc_rule_set_expand(struct sptlrpc_rule_set *rset)
+{
+	struct sptlrpc_rule	*rules;
+	int			 nslot;
+
+	might_sleep();
+
+	/* a free slot is still available */
+	if (rset->srs_nrule < rset->srs_nslot)
+		return 0;
+
+	nslot = rset->srs_nslot + 8;
+
+	/* better use realloc() if available */
+	OBD_ALLOC(rules, nslot * sizeof(*rset->srs_rules));
+	if (rules == NULL)
+		return -ENOMEM;
+
+	/* carry the existing rules over and drop the old array */
+	if (rset->srs_nrule != 0) {
+		LASSERT(rset->srs_nslot && rset->srs_rules);
+		memcpy(rules, rset->srs_rules,
+		       rset->srs_nrule * sizeof(*rset->srs_rules));
+
+		OBD_FREE(rset->srs_rules,
+			 rset->srs_nslot * sizeof(*rset->srs_rules));
+	}
+
+	rset->srs_nslot = nslot;
+	rset->srs_rules = rules;
+	return 0;
+}
+EXPORT_SYMBOL(sptlrpc_rule_set_expand);
+
+/* non-zero if @rule names a specific direction (from or to is not "any") */
+static inline int rule_spec_dir(struct sptlrpc_rule *rule)
+{
+	return !(rule->sr_from == LUSTRE_SP_ANY &&
+		 rule->sr_to == LUSTRE_SP_ANY);
+}
+/* non-zero if @rule names a specific network rather than the wildcard */
+static inline int rule_spec_net(struct sptlrpc_rule *rule)
+{
+	return !(rule->sr_netid == LNET_NIDNET(LNET_NID_ANY));
+}
+/* non-zero if @r1 and @r2 have the same from and to parts */
+static inline int rule_match_dir(struct sptlrpc_rule *r1,
+				 struct sptlrpc_rule *r2)
+{
+	if (r1->sr_from != r2->sr_from)
+		return 0;
+
+	return r1->sr_to == r2->sr_to;
+}
+/* non-zero if @r1 and @r2 have the same network id */
+static inline int rule_match_net(struct sptlrpc_rule *r1,
+				 struct sptlrpc_rule *r2)
+{
+	return !(r1->sr_netid != r2->sr_netid);
+}
+
+/*
+ * merge @rule into @rset; the @rset slots might be expanded.
+ *
+ * If a rule with the same net and direction already exists it is
+ * overridden, or removed when @rule carries SPTLRPC_FLVR_INVALID.
+ * Otherwise @rule is inserted at the position where the scan stopped,
+ * unless it is a deletion, which is then ignored.
+ *
+ * Returns 0 on success or the error from sptlrpc_rule_set_expand().
+ */
+int sptlrpc_rule_set_merge(struct sptlrpc_rule_set *rset,
+			   struct sptlrpc_rule *rule)
+{
+	struct sptlrpc_rule	*cur;
+	int			 spec_dir, spec_net;
+	int			 deletion;
+	int			 rc, n, match = 0;
+
+	might_sleep();
+
+	spec_net = rule_spec_net(rule);
+	spec_dir = rule_spec_dir(rule);
+	deletion = (rule->sr_flvr.sf_rpc == SPTLRPC_FLVR_INVALID);
+
+	for (n = 0; n < rset->srs_nrule; n++) {
+		cur = &rset->srs_rules[n];
+
+		/*
+		 * network mismatch: a specific rule skips other specific
+		 * rules and stops at the first wild one (no more chance);
+		 * a wild rule just keeps looking for its match.
+		 */
+		if (!rule_match_net(cur, rule)) {
+			if (spec_net && !rule_spec_net(cur))
+				break;
+			continue;
+		}
+
+		/* direction mismatch: same logic as for the network */
+		if (!rule_match_dir(cur, rule)) {
+			if (spec_dir && !rule_spec_dir(cur))
+				break;
+			continue;
+		}
+
+		/* both net and direction are equal */
+		match = 1;
+		break;
+	}
+
+	if (!match) {
+		LASSERT(n >= 0 && n <= rset->srs_nrule);
+
+		if (deletion) {
+			CDEBUG(D_CONFIG, "ignore the unmatched deletion\n");
+			return 0;
+		}
+
+		rc = sptlrpc_rule_set_expand(rset);
+		if (rc)
+			return rc;
+
+		/* open a gap at position n and put the new rule there */
+		if (n < rset->srs_nrule)
+			memmove(&rset->srs_rules[n + 1],
+				&rset->srs_rules[n],
+				(rset->srs_nrule - n) * sizeof(*rule));
+		memcpy(&rset->srs_rules[n], rule, sizeof(*rule));
+		rset->srs_nrule++;
+		return 0;
+	}
+
+	LASSERT(n >= 0 && n < rset->srs_nrule);
+
+	if (!deletion) {
+		/* override the rule */
+		memcpy(&rset->srs_rules[n], rule, sizeof(*rule));
+		return 0;
+	}
+
+	/* remove this rule, closing the gap it leaves */
+	if (n < rset->srs_nrule - 1)
+		memmove(&rset->srs_rules[n],
+			&rset->srs_rules[n + 1],
+			(rset->srs_nrule - n - 1) *
+			sizeof(*rule));
+	rset->srs_nrule--;
+
+	return 0;
+}
+EXPORT_SYMBOL(sptlrpc_rule_set_merge);
+
+/**
+ * given from/to/nid, determine a matching flavor in ruleset.
+ * The first rule that matches wins and its flavor is copied to \a sf;
+ * a wildcard on either side of a comparison matches anything.
+ * return 1 if a match found, otherwise return 0.
+ */
+int sptlrpc_rule_set_choose(struct sptlrpc_rule_set *rset,
+			    enum lustre_sec_part from,
+			    enum lustre_sec_part to,
+			    lnet_nid_t nid,
+			    struct sptlrpc_flavor *sf)
+{
+	struct sptlrpc_rule	*cur;
+	int			 idx = 0;
+
+	while (idx < rset->srs_nrule) {
+		cur = &rset->srs_rules[idx++];
+
+		/* network */
+		if (LNET_NIDNET(nid) != LNET_NIDNET(LNET_NID_ANY) &&
+		    cur->sr_netid != LNET_NIDNET(LNET_NID_ANY) &&
+		    cur->sr_netid != LNET_NIDNET(nid))
+			continue;
+
+		/* source part */
+		if (from != LUSTRE_SP_ANY && cur->sr_from != LUSTRE_SP_ANY &&
+		    cur->sr_from != from)
+			continue;
+
+		/* destination part */
+		if (to != LUSTRE_SP_ANY && cur->sr_to != LUSTRE_SP_ANY &&
+		    cur->sr_to != to)
+			continue;
+
+		*sf = cur->sr_flvr;
+		return 1;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(sptlrpc_rule_set_choose);
+
+/* log every rule of @rset, one line per rule, at D_SEC debug level */
+void sptlrpc_rule_set_dump(struct sptlrpc_rule_set *rset)
+{
+	struct sptlrpc_rule	*cur;
+	int			 idx;
+
+	for (idx = 0; idx < rset->srs_nrule; idx++) {
+		cur = &rset->srs_rules[idx];
+		CDEBUG(D_SEC, "<%02d> from %x to %x, net %x, rpc %x\n", idx,
+		       cur->sr_from, cur->sr_to, cur->sr_netid,
+		       cur->sr_flvr.sf_rpc);
+	}
+}
+EXPORT_SYMBOL(sptlrpc_rule_set_dump);
+
+/*
+ * Build @rset out of the rules in @gen (general) and @tgt (target
+ * specific) that apply to direction @from -> @to.  Either source set may
+ * be NULL.  General rules are merged first, so target-specific ones are
+ * merged on top of them.
+ *
+ * Returns 0 on success or the error from sptlrpc_rule_set_merge().
+ */
+static int sptlrpc_rule_set_extract(struct sptlrpc_rule_set *gen,
+				    struct sptlrpc_rule_set *tgt,
+				    enum lustre_sec_part from,
+				    enum lustre_sec_part to,
+				    struct sptlrpc_rule_set *rset)
+{
+	struct sptlrpc_rule_set	*src[2] = { gen, tgt };
+	struct sptlrpc_rule	*rule;
+	int			 i, n, rc;
+
+	might_sleep();
+
+	for (i = 0; i < 2; i++) {
+		if (src[i] == NULL)
+			continue;
+
+		for (n = 0; n < src[i]->srs_nrule; n++) {
+			rule = &src[i]->srs_rules[n];
+
+			/* skip rules meant for another source part */
+			if (from != LUSTRE_SP_ANY &&
+			    rule->sr_from != LUSTRE_SP_ANY &&
+			    rule->sr_from != from)
+				continue;
+
+			/* skip rules meant for another destination part */
+			if (to != LUSTRE_SP_ANY &&
+			    rule->sr_to != LUSTRE_SP_ANY &&
+			    rule->sr_to != to)
+				continue;
+
+			rc = sptlrpc_rule_set_merge(rset, rule);
+			if (rc == 0)
+				continue;
+
+			CERROR("can't merge: %d\n", rc);
+			return rc;
+		}
+	}
+
+	return 0;
+}
+
+/**********************************
+ * sptlrpc configuration support  *
+ **********************************/
+
+/* rules that apply to one named target of a filesystem */
+struct sptlrpc_conf_tgt {
+	struct list_head	      sct_list;	/* link in sptlrpc_conf::sc_tgts */
+	char		    sct_name[MAX_OBD_NAME];	/* target name, lookup key */
+	struct sptlrpc_rule_set sct_rset;	/* rules of this target */
+};
+
+/* sptlrpc configuration of one filesystem, looked up by sc_fsname */
+struct sptlrpc_conf {
+	struct list_head	      sc_list;	/* link in the sptlrpc_confs list */
+	char		    sc_fsname[MTI_NAME_MAXLEN];	/* filesystem name */
+	unsigned int	    sc_modified;  /* modified during updating */
+	unsigned int	    sc_updated:1, /* updated copy from MGS */
+				sc_local:1;   /* local copy from target */
+	struct sptlrpc_rule_set sc_rset;      /* fs general rules */
+	struct list_head	      sc_tgts;      /* target-specific rules */
+};
+
+/*
+ * The "conf_lock" that comments in this file refer to.
+ * NOTE(review): it is not statically initialized here, so it is presumably
+ * set up with mutex_init() elsewhere in the file -- confirm.
+ */
+static struct mutex sptlrpc_conf_lock;
+/* all known struct sptlrpc_conf, linked through sc_list */
+static LIST_HEAD(sptlrpc_confs);
+
+/* 1 if @c is a decimal digit or a lower-case hex digit, 0 otherwise */
+static inline int is_hex(char c)
+{
+	if (c >= '0' && c <= '9')
+		return 1;
+
+	return c >= 'a' && c <= 'f';
+}
+
+/*
+ * Extract the filesystem name from target name @tgt into @fsname.
+ *
+ * If @tgt ends in "-MDTxxxx" or "-OSTxxxx" (xxxx being four digits or
+ * lower-case hex digits, possibly followed by more characters), the part
+ * in front of that suffix is the filesystem name; otherwise the whole
+ * string is.  At most @buflen - 1 characters are copied and @fsname is
+ * always NUL terminated.  Nothing is written if @buflen is not positive.
+ */
+static void target2fsname(const char *tgt, char *fsname, int buflen)
+{
+	const char     *ptr;
+	int	     len;
+
+	/*
+	 * without room for even the terminating NUL the length below would
+	 * become negative, turning into a huge memcpy() size and a write
+	 * in front of the buffer.
+	 */
+	if (buflen <= 0)
+		return;
+
+	ptr = strrchr(tgt, '-');
+	if (ptr) {
+		if ((strncmp(ptr, "-MDT", 4) != 0 &&
+		     strncmp(ptr, "-OST", 4) != 0) ||
+		    !is_hex(ptr[4]) || !is_hex(ptr[5]) ||
+		    !is_hex(ptr[6]) || !is_hex(ptr[7]))
+			ptr = NULL;
+	}
+
+	/* if we didn't find the pattern, treat the whole string as fsname */
+	if (ptr == NULL)
+		len = strlen(tgt);
+	else
+		len = ptr - tgt;
+
+	len = min(len, buflen - 1);
+	memcpy(fsname, tgt, len);
+	fsname[len] = '\0';
+}
+
+/*
+ * Drop all rules of @conf: the general rule set and every target entry
+ * together with its rule set.  The updated/local flags are cleared too.
+ */
+static void sptlrpc_conf_free_rsets(struct sptlrpc_conf *conf)
+{
+	struct sptlrpc_conf_tgt	*conf_tgt, *conf_tgt_next;
+
+	sptlrpc_rule_set_free(&conf->sc_rset);
+
+	list_for_each_entry_safe(conf_tgt, conf_tgt_next,
+				 &conf->sc_tgts, sct_list) {
+		list_del(&conf_tgt->sct_list);
+		sptlrpc_rule_set_free(&conf_tgt->sct_rset);
+		OBD_FREE_PTR(conf_tgt);
+	}
+	LASSERT(list_empty(&conf->sc_tgts));
+
+	conf->sc_local = 0;
+	conf->sc_updated = 0;
+}
+
+/* release all rules of @conf, unlink it from its list and free it */
+static void sptlrpc_conf_free(struct sptlrpc_conf *conf)
+{
+	CDEBUG(D_SEC, "free sptlrpc conf %s\n", conf->sc_fsname);
+
+	sptlrpc_conf_free_rsets(conf);
+
+	list_del(&conf->sc_list);
+	OBD_FREE_PTR(conf);
+}
+
+/*
+ * Find the entry for target @name in @conf.  If there is none and @create
+ * is set, a new empty entry is allocated and added to @conf.
+ *
+ * Returns the entry, or NULL if it does not exist (and @create is 0) or
+ * cannot be allocated.
+ */
+static
+struct sptlrpc_conf_tgt *sptlrpc_conf_get_tgt(struct sptlrpc_conf *conf,
+					      const char *name,
+					      int create)
+{
+	struct sptlrpc_conf_tgt *conf_tgt;
+
+	list_for_each_entry(conf_tgt, &conf->sc_tgts, sct_list) {
+		if (!strcmp(conf_tgt->sct_name, name))
+			return conf_tgt;
+	}
+
+	if (!create)
+		return NULL;
+
+	OBD_ALLOC_PTR(conf_tgt);
+	if (conf_tgt == NULL)
+		return NULL;
+
+	strlcpy(conf_tgt->sct_name, name, sizeof(conf_tgt->sct_name));
+	sptlrpc_rule_set_init(&conf_tgt->sct_rset);
+	list_add(&conf_tgt->sct_list, &conf->sc_tgts);
+
+	return conf_tgt;
+}
+
+/*
+ * Find the sptlrpc conf of filesystem \a fsname on the sptlrpc_confs list;
+ * when it does not exist and \a create is non-zero, allocate an empty one
+ * and link it in.  The callers in this file hold sptlrpc_conf_lock around
+ * the call.
+ *
+ * Returns the conf, or NULL if it is absent (and not created), if the
+ * allocation failed, or if \a fsname does not fit into sc_fsname.
+ */
+static
+struct sptlrpc_conf *sptlrpc_conf_get(const char *fsname,
+				      int create)
+{
+	struct sptlrpc_conf *conf;
+
+	list_for_each_entry(conf, &sptlrpc_confs, sc_list) {
+		if (strcmp(conf->sc_fsname, fsname) == 0)
+			return conf;
+	}
+
+	if (!create)
+		return NULL;
+
+	OBD_ALLOC_PTR(conf);
+	if (conf == NULL)
+		return NULL;
+
+	/*
+	 * bounded copy: an over-long name must not overflow sc_fsname, and a
+	 * silently truncated name could alias another filesystem's conf, so
+	 * refuse it instead.
+	 */
+	if (strlcpy(conf->sc_fsname, fsname, sizeof(conf->sc_fsname)) >=
+	    sizeof(conf->sc_fsname)) {
+		OBD_FREE_PTR(conf);
+		return NULL;
+	}
+
+	sptlrpc_rule_set_init(&conf->sc_rset);
+	INIT_LIST_HEAD(&conf->sc_tgts);
+	list_add(&conf->sc_list, &sptlrpc_confs);
+
+	CDEBUG(D_SEC, "create sptlrpc conf %s\n", conf->sc_fsname);
+	return conf;
+}
+
+/**
+ * Merge \a rule into the rule set selected by \a target: the filesystem-wide
+ * set when \a target equals the fsname, otherwise the per-target set, which
+ * is created on demand.  Caller must hold conf_lock already.
+ *
+ * Returns -ENOMEM if the per-target entry can't be allocated, otherwise the
+ * result of sptlrpc_rule_set_merge().
+ */
+static int sptlrpc_conf_merge_rule(struct sptlrpc_conf *conf,
+				   const char *target,
+				   struct sptlrpc_rule *rule)
+{
+	struct sptlrpc_conf_tgt  *conf_tgt;
+
+	/* fsname == target means general rules for the whole fs */
+	if (strcmp(conf->sc_fsname, target) == 0)
+		return sptlrpc_rule_set_merge(&conf->sc_rset, rule);
+
+	conf_tgt = sptlrpc_conf_get_tgt(conf, target, 1);
+	if (conf_tgt == NULL) {
+		CERROR("out of memory, can't merge rule!\n");
+		return -ENOMEM;
+	}
+
+	return sptlrpc_rule_set_merge(&conf_tgt->sct_rset, rule);
+}
+
+/**
+ * process one LCFG_SPTLRPC_CONF record. if \a conf is NULL, we
+ * find one through the target name in the record inside conf_lock;
+ * otherwise means caller already hold conf_lock.
+ *
+ * The record carries the target name in string buffer 1 and the parameter
+ * in string buffer 2.  The parameter must start with PARAM_SRPC_FLVR; the
+ * remainder is parsed as a rule and merged into the conf.  On success
+ * conf->sc_modified is incremented.
+ *
+ * \retval 0		rule merged
+ * \retval -EINVAL	missing target/parameter, wrong prefix, or the rule
+ *			could not be parsed
+ * \retval -ENOMEM	\a conf was NULL and no conf exists for the fsname
+ * \retval other	whatever sptlrpc_conf_merge_rule() returned
+ */
+static int __sptlrpc_process_config(struct lustre_cfg *lcfg,
+				    struct sptlrpc_conf *conf)
+{
+	char		   *target, *param;
+	char		    fsname[MTI_NAME_MAXLEN];
+	struct sptlrpc_rule     rule;
+	int		     rc;
+	ENTRY;
+
+	target = lustre_cfg_string(lcfg, 1);
+	if (target == NULL) {
+		CERROR("missing target name\n");
+		RETURN(-EINVAL);
+	}
+
+	param = lustre_cfg_string(lcfg, 2);
+	if (param == NULL) {
+		CERROR("missing parameter\n");
+		RETURN(-EINVAL);
+	}
+
+	CDEBUG(D_SEC, "processing rule: %s.%s\n", target, param);
+
+	/* parse rule to make sure the format is correct */
+	if (strncmp(param, PARAM_SRPC_FLVR, sizeof(PARAM_SRPC_FLVR) - 1) != 0) {
+		CERROR("Invalid sptlrpc parameter: %s\n", param);
+		RETURN(-EINVAL);
+	}
+	/* skip the prefix; what follows is the rule text itself */
+	param += sizeof(PARAM_SRPC_FLVR) - 1;
+
+	rc = sptlrpc_parse_rule(param, &rule);
+	if (rc)
+		RETURN(-EINVAL);
+
+	if (conf == NULL) {
+		target2fsname(target, fsname, sizeof(fsname));
+
+		mutex_lock(&sptlrpc_conf_lock);
+		/* lookup only: a conf is never created from this path */
+		conf = sptlrpc_conf_get(fsname, 0);
+		if (conf == NULL) {
+			CERROR("can't find conf\n");
+			rc = -ENOMEM;
+		} else {
+			rc = sptlrpc_conf_merge_rule(conf, target, &rule);
+		}
+		mutex_unlock(&sptlrpc_conf_lock);
+	} else {
+		LASSERT(mutex_is_locked(&sptlrpc_conf_lock));
+		rc = sptlrpc_conf_merge_rule(conf, target, &rule);
+	}
+
+	/*
+	 * rc == 0 implies conf != NULL (the lookup-failure path sets rc).
+	 * NOTE(review): when conf was looked up above, this increment runs
+	 * after conf_lock has been dropped -- confirm that is intended.
+	 */
+	if (rc == 0)
+		conf->sc_modified++;
+
+	RETURN(rc);
+}
+
+/**
+ * process one LCFG_SPTLRPC_CONF record for callers that do not hold
+ * conf_lock: the conf is found from the target name in the record.
+ * Returns what __sptlrpc_process_config() returns.
+ */
+int sptlrpc_process_config(struct lustre_cfg *lcfg)
+{
+	return __sptlrpc_process_config(lcfg, NULL);
+}
+EXPORT_SYMBOL(sptlrpc_process_config);
+
+/*
+ * Extract the filesystem name from a config log name of the form
+ * "<fsname>-sptlrpc" into \a buf (truncated to fit \a buflen and always
+ * NUL-terminated).
+ *
+ * Returns 0 on success, or -EINVAL if \a logname lacks that suffix.
+ */
+static int logname2fsname(const char *logname, char *buf, int buflen)
+{
+	char   *suffix = strrchr(logname, '-');
+	int     n;
+
+	if (suffix == NULL || strcmp(suffix, "-sptlrpc") != 0) {
+		CERROR("%s is not a sptlrpc config log\n", logname);
+		return -EINVAL;
+	}
+
+	n = min((int) (suffix - logname), buflen - 1);
+
+	memcpy(buf, logname, n);
+	buf[n] = '\0';
+	return 0;
+}
+
+/**
+ * mark the beginning of an update of config log \a logname.
+ *
+ * If a conf exists for the filesystem, rules that came from the local copy
+ * are discarded (they are about to be replaced) and the modification counter
+ * is reset.  Nothing is done when \a logname is not a sptlrpc config log
+ * name or when no conf is registered for it.
+ */
+void sptlrpc_conf_log_update_begin(const char *logname)
+{
+	struct sptlrpc_conf *conf;
+	char		 fsname[16];
+
+	if (logname2fsname(logname, fsname, sizeof(fsname)))
+		return;
+
+	mutex_lock(&sptlrpc_conf_lock);
+
+	conf = sptlrpc_conf_get(fsname, 0);
+	/* the lookup may fail: touch the conf only when one was found */
+	if (conf) {
+		if (conf->sc_local) {
+			LASSERT(conf->sc_updated == 0);
+			sptlrpc_conf_free_rsets(conf);
+		}
+		conf->sc_modified = 0;
+	}
+
+	mutex_unlock(&sptlrpc_conf_lock);
+}
+EXPORT_SYMBOL(sptlrpc_conf_log_update_begin);
+
+/**
+ * mark that config log \a logname has been updated.
+ *
+ * Sets sc_updated on the matching conf, if there is one.  Nothing is done
+ * when \a logname is not a sptlrpc config log name.
+ */
+void sptlrpc_conf_log_update_end(const char *logname)
+{
+	struct sptlrpc_conf *conf;
+	char		 fsname[16];
+
+	if (logname2fsname(logname, fsname, sizeof(fsname)))
+		return;
+
+	mutex_lock(&sptlrpc_conf_lock);
+
+	conf = sptlrpc_conf_get(fsname, 0);
+	if (conf == NULL)
+		goto unlock;
+
+	/*
+	 * first update for this conf: bump the modified counter so that it
+	 * is > 0 and the local copy gets rewritten.
+	 */
+	if (!conf->sc_updated)
+		conf->sc_modified++;
+
+	conf->sc_updated = 1;
+unlock:
+	mutex_unlock(&sptlrpc_conf_lock);
+}
+EXPORT_SYMBOL(sptlrpc_conf_log_update_end);
+
+/**
+ * make sure a conf exists for the filesystem named by config log
+ * \a logname, creating an empty one if needed.  Nothing is done when
+ * \a logname is not a sptlrpc config log name.  The result of the
+ * lookup/creation is not reported to the caller.
+ */
+void sptlrpc_conf_log_start(const char *logname)
+{
+	char		 fsname[16];
+
+	if (logname2fsname(logname, fsname, sizeof(fsname)))
+		return;
+
+	mutex_lock(&sptlrpc_conf_lock);
+	sptlrpc_conf_get(fsname, 1);
+	mutex_unlock(&sptlrpc_conf_lock);
+}
+EXPORT_SYMBOL(sptlrpc_conf_log_start);
+
+/**
+ * release the conf, with all its rules, of the filesystem named by config
+ * log \a logname.  Nothing is done when \a logname is not a sptlrpc config
+ * log name or when no such conf exists.
+ */
+void sptlrpc_conf_log_stop(const char *logname)
+{
+	struct sptlrpc_conf *conf;
+	char		 fsname[16];
+
+	if (logname2fsname(logname, fsname, sizeof(fsname)))
+		return;
+
+	mutex_lock(&sptlrpc_conf_lock);
+	conf = sptlrpc_conf_get(fsname, 0);
+	if (conf)
+		sptlrpc_conf_free(conf);
+	mutex_unlock(&sptlrpc_conf_lock);
+}
+EXPORT_SYMBOL(sptlrpc_conf_log_stop);
+
+/*
+ * Add to sf->sf_flags the bits implied by the direction \a from -> \a to:
+ *   MDT -> anything : ROOTONLY
+ *   CLI -> OST      : ROOTONLY | BULK
+ *   CLI -> MDT      : UDESC, only when \a fl_udesc is non-zero
+ * Any other direction leaves the flags untouched.
+ */
+static void inline flavor_set_flags(struct sptlrpc_flavor *sf,
+				    enum lustre_sec_part from,
+				    enum lustre_sec_part to,
+				    unsigned int fl_udesc)
+{
+	/*
+	 * the null flavor gets no flags at all: everybody shares a single
+	 * null sec, so it must not be customized.
+	 */
+	if (sf->sf_rpc == SPTLRPC_FLVR_NULL)
+		return;
+
+	if (from == LUSTRE_SP_MDT) {
+		/* MDT->MDT; MDT->OST */
+		sf->sf_flags |= PTLRPC_SEC_FL_ROOTONLY;
+		return;
+	}
+
+	if (from != LUSTRE_SP_CLI)
+		return;
+
+	if (to == LUSTRE_SP_OST) {
+		/* CLI->OST */
+		sf->sf_flags |= PTLRPC_SEC_FL_ROOTONLY | PTLRPC_SEC_FL_BULK;
+	} else if (to == LUSTRE_SP_MDT) {
+		/* CLI->MDT; sf_rpc is known to be non-null at this point */
+		if (fl_udesc)
+			sf->sf_flags |= PTLRPC_SEC_FL_UDESC;
+	}
+}
+
+/**
+ * choose the flavor for a connection \a from -> \a to towards \a target
+ * with peer \a nid, and store it in \a sf.
+ *
+ * The rules specific to the target are consulted first; a non-zero return
+ * from sptlrpc_rule_set_choose() is taken as "flavor chosen".  Otherwise the
+ * filesystem-wide rules are tried, and if nothing was chosen (or no conf
+ * exists for the filesystem) the default flavor is used.  Finally the
+ * direction-dependent flags are added by flavor_set_flags().
+ */
+void sptlrpc_conf_choose_flavor(enum lustre_sec_part from,
+				enum lustre_sec_part to,
+				struct obd_uuid *target,
+				lnet_nid_t nid,
+				struct sptlrpc_flavor *sf)
+{
+	struct sptlrpc_conf     *conf;
+	struct sptlrpc_conf_tgt *conf_tgt;
+	char		     name[MTI_NAME_MAXLEN];
+	int		      len, rc = 0;
+
+	/* name first holds the fsname, later it is reused for the target */
+	target2fsname(target->uuid, name, sizeof(name));
+
+	mutex_lock(&sptlrpc_conf_lock);
+
+	conf = sptlrpc_conf_get(name, 0);
+	if (conf == NULL)
+		goto out;
+
+	/* convert uuid name (supposed end with _UUID) to target name */
+	/* NOTE(review): len - 5 is not checked against sizeof(name) here */
+	len = strlen(target->uuid);
+	LASSERT(len > 5);
+	memcpy(name, target->uuid, len - 5);
+	name[len - 5] = '\0';
+
+	conf_tgt = sptlrpc_conf_get_tgt(conf, name, 0);
+	if (conf_tgt) {
+		rc = sptlrpc_rule_set_choose(&conf_tgt->sct_rset,
+					     from, to, nid, sf);
+		if (rc)
+			goto out;
+	}
+
+	rc = sptlrpc_rule_set_choose(&conf->sc_rset, from, to, nid, sf);
+out:
+	mutex_unlock(&sptlrpc_conf_lock);
+
+	/* rc is still 0 when no conf was found or no rule was chosen */
+	if (rc == 0)
+		get_default_flavor(sf);
+
+	flavor_set_flags(sf, from, to, 1);
+}
+
+/**
+ * called by target devices, determine the expected flavor from
+ * certain peer (from, nid).
+ *
+ * \a rset is searched with the destination part left as LUSTRE_SP_ANY; if
+ * sptlrpc_rule_set_choose() returns 0 the default flavor is stored in
+ * \a sf instead.
+ */
+void sptlrpc_target_choose_flavor(struct sptlrpc_rule_set *rset,
+				  enum lustre_sec_part from,
+				  lnet_nid_t nid,
+				  struct sptlrpc_flavor *sf)
+{
+	if (sptlrpc_rule_set_choose(rset, from, LUSTRE_SP_ANY, nid, sf) == 0)
+		get_default_flavor(sf);
+}
+EXPORT_SYMBOL(sptlrpc_target_choose_flavor);
+
+/* added to cfs_time_current_sec() below, so presumably in seconds */
+#define SEC_ADAPT_DELAY	 (10)
+
+/**
+ * called by client devices, notify the sptlrpc config has changed and
+ * do import_sec_adapt later.
+ *
+ * Only valid for MDC and OSC devices (asserted).  If the device has an
+ * import with a sec attached, the sec's expiry time is set to now +
+ * SEC_ADAPT_DELAY; nothing else is done here.
+ */
+void sptlrpc_conf_client_adapt(struct obd_device *obd)
+{
+	struct obd_import  *imp;
+	ENTRY;
+
+	LASSERT(strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) == 0 ||
+		strcmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME) ==0);
+	CDEBUG(D_SEC, "obd %s\n", obd->u.cli.cl_target_uuid.uuid);
+
+	/* serialize with connect/disconnect import */
+	down_read(&obd->u.cli.cl_sem);
+
+	imp = obd->u.cli.cl_import;
+	if (imp) {
+		/* imp_sec and imp_sec_expire are accessed under imp_lock */
+		spin_lock(&imp->imp_lock);
+		if (imp->imp_sec)
+			imp->imp_sec_expire = cfs_time_current_sec() +
+				SEC_ADAPT_DELAY;
+		spin_unlock(&imp->imp_lock);
+	}
+
+	up_read(&obd->u.cli.cl_sem);
+	EXIT;
+}
+EXPORT_SYMBOL(sptlrpc_conf_client_adapt);
+
+
+/*
+ * Format rule \a r as a parameter string in \a buf:
+ *   "srpc.flavor.<net>[.<from>2<to>]=<flavor>"
+ * where <net> is "default" for the wildcard net and the direction part is
+ * omitted when both ends are LUSTRE_SP_ANY.  \a buf is always
+ * NUL-terminated at buf[buflen - 1].
+ */
+static void rule2string(struct sptlrpc_rule *r, char *buf, int buflen)
+{
+	char    dirbuf[8];
+	char   *net;
+	char   *ptr = buf;
+
+	if (r->sr_netid == LNET_NIDNET(LNET_NID_ANY))
+		net = "default";
+	else
+		net = libcfs_net2str(r->sr_netid);
+
+	if (r->sr_from == LUSTRE_SP_ANY && r->sr_to == LUSTRE_SP_ANY)
+		dirbuf[0] = '\0';
+	else
+		snprintf(dirbuf, sizeof(dirbuf), ".%s2%s",
+			 sptlrpc_part2name(r->sr_from),
+			 sptlrpc_part2name(r->sr_to));
+
+	/*
+	 * NOTE(review): snprintf() returns the length that would have been
+	 * written, so if the prefix were truncated ptr would end up past the
+	 * buffer and the size passed below would be negative -- confirm that
+	 * callers' buffers are always large enough.
+	 */
+	ptr += snprintf(buf, buflen, "srpc.flavor.%s%s=", net, dirbuf);
+
+	/* append the flavor name right after the '=' */
+	sptlrpc_flavor2name(&r->sr_flvr, ptr, buflen - (ptr - buf));
+	buf[buflen - 1] = '\0';
+}
+
+/*
+ * Write every rule of \a rset to the llog \a llh, one LCFG_SPTLRPC_CONF
+ * config record per rule, with \a target in string buffer 1 and the rule
+ * text (see rule2string()) in string buffer 2.
+ *
+ * A failed write is logged but not propagated: the function always
+ * returns 0.
+ */
+static int sptlrpc_record_rule_set(struct llog_handle *llh,
+				   char *target,
+				   struct sptlrpc_rule_set *rset)
+{
+	struct lustre_cfg_bufs  bufs;
+	struct lustre_cfg      *lcfg;
+	struct llog_rec_hdr     rec;
+	int		     buflen;
+	char		    param[48];
+	int		     i, rc;
+
+	for (i = 0; i < rset->srs_nrule; i++) {
+		rule2string(&rset->srs_rules[i], param, sizeof(param));
+
+		lustre_cfg_bufs_reset(&bufs, NULL);
+		lustre_cfg_bufs_set_string(&bufs, 1, target);
+		lustre_cfg_bufs_set_string(&bufs, 2, param);
+		lcfg = lustre_cfg_new(LCFG_SPTLRPC_CONF, &bufs);
+		/* a failed lustre_cfg_new() is treated as fatal */
+		LASSERT(lcfg);
+
+		buflen = lustre_cfg_len(lcfg->lcfg_bufcount,
+					lcfg->lcfg_buflens);
+		rec.lrh_len = llog_data_len(buflen);
+		rec.lrh_type = OBD_CFG_REC;
+		rc = llog_write(NULL, llh, &rec, NULL, 0, (void *)lcfg, -1);
+		if (rc)
+			CERROR("failed to write a rec: rc = %d\n", rc);
+		lustre_cfg_free(lcfg);
+	}
+	return 0;
+}
+
+/*
+ * Write all rules of \a conf to the llog \a llh: first the filesystem-wide
+ * set (recorded under the fsname), then each per-target set (recorded under
+ * the target name).  The results of the individual writes are ignored;
+ * always returns 0.
+ */
+static int sptlrpc_record_rules(struct llog_handle *llh,
+				struct sptlrpc_conf *conf)
+{
+	struct sptlrpc_conf_tgt *conf_tgt;
+
+	sptlrpc_record_rule_set(llh, conf->sc_fsname, &conf->sc_rset);
+
+	list_for_each_entry(conf_tgt, &conf->sc_tgts, sct_list) {
+		sptlrpc_record_rule_set(llh, conf_tgt->sct_name,
+					&conf_tgt->sct_rset);
+	}
+	return 0;
+}
+
+/* name of the scratch log written first, and of the final local copy */
+#define LOG_SPTLRPC_TMP "sptlrpc.tmp"
+#define LOG_SPTLRPC     "sptlrpc"
+
+/*
+ * Write the rules of \a conf to a local llog on target \a obd.
+ *
+ * The rules are written to the temporary log LOG_SPTLRPC_TMP (any old one
+ * is erased first), which is then renamed to LOG_SPTLRPC, so the final log
+ * is only replaced once the temporary one has been written.
+ *
+ * Returns 0 on success, -EINVAL if the device has no config llog context,
+ * or the error of the failing lookup/erase/create/init/rename step.
+ */
+static
+int sptlrpc_target_local_copy_conf(struct obd_device *obd,
+				   struct sptlrpc_conf *conf)
+{
+	struct llog_handle   *llh = NULL;
+	struct llog_ctxt     *ctxt;
+	struct lvfs_run_ctxt  saved;
+	struct dentry	*dentry;
+	int		   rc;
+	ENTRY;
+
+	ctxt = llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT);
+	if (ctxt == NULL)
+		RETURN(-EINVAL);
+
+	push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+
+	dentry = ll_lookup_one_len(MOUNT_CONFIGS_DIR, cfs_fs_pwd(current->fs),
+				   strlen(MOUNT_CONFIGS_DIR));
+	if (IS_ERR(dentry)) {
+		rc = PTR_ERR(dentry);
+		CERROR("cannot lookup %s directory: rc = %d\n",
+		       MOUNT_CONFIGS_DIR, rc);
+		GOTO(out_ctx, rc);
+	}
+
+	/* erase the old tmp log */
+	rc = llog_erase(NULL, ctxt, NULL, LOG_SPTLRPC_TMP);
+	if (rc < 0 && rc != -ENOENT) {
+		CERROR("%s: cannot erase temporary sptlrpc log: rc = %d\n",
+		       obd->obd_name, rc);
+		GOTO(out_dput, rc);
+	}
+
+	/* write temporary log */
+	rc = llog_open_create(NULL, ctxt, &llh, NULL, LOG_SPTLRPC_TMP);
+	if (rc)
+		GOTO(out_dput, rc);
+	rc = llog_init_handle(NULL, llh, LLOG_F_IS_PLAIN, NULL);
+	if (rc)
+		GOTO(out_close, rc);
+
+	rc = sptlrpc_record_rules(llh, conf);
+
+out_close:
+	llog_close(NULL, llh);
+	/* only a completely written tmp log replaces the real one */
+	if (rc == 0)
+		rc = lustre_rename(dentry, obd->obd_lvfs_ctxt.pwdmnt,
+				   LOG_SPTLRPC_TMP, LOG_SPTLRPC);
+out_dput:
+	l_dput(dentry);
+out_ctx:
+	pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+	llog_ctxt_put(ctxt);
+	CDEBUG(D_SEC, "target %s: write local sptlrpc conf: rc = %d\n",
+	       obd->obd_name, rc);
+	RETURN(rc);
+}
+
+/*
+ * llog_process() callback used when reading the local sptlrpc copy: check
+ * that \a rec is a sane config record carrying LCFG_SPTLRPC_CONF and feed
+ * it to __sptlrpc_process_config() for the conf passed in \a data.
+ *
+ * Returns -EINVAL for a wrong record type or command, the sanity-check
+ * error for a malformed record, otherwise the processing result.
+ */
+static int local_read_handler(const struct lu_env *env,
+			      struct llog_handle *llh,
+			      struct llog_rec_hdr *rec, void *data)
+{
+	struct sptlrpc_conf  *conf = data;
+	struct lustre_cfg    *lcfg;
+	int		   cfg_len, rc;
+	ENTRY;
+
+	if (rec->lrh_type != OBD_CFG_REC) {
+		CERROR("unhandled lrh_type: %#x\n", rec->lrh_type);
+		RETURN(-EINVAL);
+	}
+
+	/* the config payload follows the header and precedes the tail */
+	lcfg = (struct lustre_cfg *)(rec + 1);
+	cfg_len = rec->lrh_len - sizeof(struct llog_rec_hdr) -
+		  sizeof(struct llog_rec_tail);
+
+	rc = lustre_cfg_sanity_check(lcfg, cfg_len);
+	if (rc) {
+		CERROR("Insane cfg\n");
+		RETURN(rc);
+	}
+
+	if (lcfg->lcfg_command != LCFG_SPTLRPC_CONF) {
+		CERROR("invalid command (%x)\n", lcfg->lcfg_command);
+		RETURN(-EINVAL);
+	}
+
+	rc = __sptlrpc_process_config(lcfg, conf);
+	RETURN(rc);
+}
+
+/*
+ * Load the rules of \a conf from the local llog LOG_SPTLRPC on target
+ * \a obd.  Must be called with neither sc_updated nor sc_local set
+ * (asserted).
+ *
+ * A missing or empty log is not an error and leaves \a conf untouched.
+ * When all records were processed sc_local is set; if processing fails the
+ * rules read so far are dropped again.
+ *
+ * Returns 0 on success (including "nothing to read"), -EINVAL if the
+ * device has no config llog context, or the error of the failing llog step.
+ */
+static
+int sptlrpc_target_local_read_conf(struct obd_device *obd,
+				   struct sptlrpc_conf *conf)
+{
+	struct llog_handle    *llh = NULL;
+	struct llog_ctxt      *ctxt;
+	struct lvfs_run_ctxt   saved;
+	int		    rc;
+	ENTRY;
+
+	LASSERT(conf->sc_updated == 0 && conf->sc_local == 0);
+
+	ctxt = llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT);
+	if (ctxt == NULL) {
+		CERROR("missing llog context\n");
+		RETURN(-EINVAL);
+	}
+
+	push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+
+	rc = llog_open(NULL, ctxt, &llh, NULL, LOG_SPTLRPC, LLOG_OPEN_EXISTS);
+	if (rc < 0) {
+		/* no local copy yet: nothing to read, not a failure */
+		if (rc == -ENOENT)
+			rc = 0;
+		GOTO(out_pop, rc);
+	}
+
+	rc = llog_init_handle(NULL, llh, LLOG_F_IS_PLAIN, NULL);
+	if (rc)
+		GOTO(out_close, rc);
+
+	if (llog_get_size(llh) <= 1) {
+		CDEBUG(D_SEC, "no local sptlrpc copy found\n");
+		GOTO(out_close, rc = 0);
+	}
+
+	/* local_read_handler() runs with conf passed in, see its LASSERT
+	 * path in __sptlrpc_process_config() */
+	rc = llog_process(NULL, llh, local_read_handler, (void *)conf, NULL);
+
+	if (rc == 0) {
+		conf->sc_local = 1;
+	} else {
+		sptlrpc_conf_free_rsets(conf);
+	}
+
+out_close:
+	llog_close(NULL, llh);
+out_pop:
+	pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+	llog_ctxt_put(ctxt);
+	CDEBUG(D_SEC, "target %s: read local sptlrpc conf: rc = %d\n",
+	       obd->obd_name, rc);
+	RETURN(rc);
+}
+
+
+/**
+ * called by target devices, extract sptlrpc rules which applies to
+ * this target, to be used for future rpc flavor checking.
+ *
+ * \a obd must be an MDT or OST device.  If the conf has not been updated
+ * from the config log, its rules are (re)read from the local copy;
+ * otherwise a local copy is written when \a initial is set or the conf was
+ * modified.  The results of reading/writing the local copy are not checked.
+ * The rules for this target are then extracted into \a rset.
+ *
+ * \retval -EINVAL	\a obd is neither an MDT nor an OST
+ * \retval other	result of sptlrpc_rule_set_extract(); 0 is also
+ *			returned when no conf exists for the filesystem
+ */
+int sptlrpc_conf_target_get_rules(struct obd_device *obd,
+				  struct sptlrpc_rule_set *rset,
+				  int initial)
+{
+	struct sptlrpc_conf      *conf;
+	struct sptlrpc_conf_tgt  *conf_tgt;
+	enum lustre_sec_part      sp_dst;
+	char		      fsname[MTI_NAME_MAXLEN];
+	int		       rc = 0;
+	ENTRY;
+
+	if (strcmp(obd->obd_type->typ_name, LUSTRE_MDT_NAME) == 0) {
+		sp_dst = LUSTRE_SP_MDT;
+	} else if (strcmp(obd->obd_type->typ_name, LUSTRE_OST_NAME) == 0) {
+		sp_dst = LUSTRE_SP_OST;
+	} else {
+		CERROR("unexpected obd type %s\n", obd->obd_type->typ_name);
+		RETURN(-EINVAL);
+	}
+	CDEBUG(D_SEC, "get rules for target %s\n", obd->obd_uuid.uuid);
+
+	target2fsname(obd->obd_uuid.uuid, fsname, sizeof(fsname));
+
+	mutex_lock(&sptlrpc_conf_lock);
+
+	conf = sptlrpc_conf_get(fsname, 0);
+	if (conf == NULL) {
+		/* NOTE(review): logged as an error, yet rc is still 0 here */
+		CERROR("missing sptlrpc config log\n");
+		GOTO(out, rc);
+	}
+
+	if (conf->sc_updated  == 0) {
+		/*
+		 * always read from local copy. here another option is
+		 * if we already have a local copy (read from another
+		 * target device hosted on the same node) we simply use that.
+		 */
+		if (conf->sc_local)
+			sptlrpc_conf_free_rsets(conf);
+
+		sptlrpc_target_local_read_conf(obd, conf);
+	} else {
+		LASSERT(conf->sc_local == 0);
+
+		/* write a local copy */
+		if (initial || conf->sc_modified)
+			sptlrpc_target_local_copy_conf(obd, conf);
+		else
+			CDEBUG(D_SEC, "unchanged, skip updating local copy\n");
+	}
+
+	/* extract rule set for this target */
+	conf_tgt = sptlrpc_conf_get_tgt(conf, obd->obd_name, 0);
+
+	rc = sptlrpc_rule_set_extract(&conf->sc_rset,
+				      conf_tgt ? &conf_tgt->sct_rset: NULL,
+				      LUSTRE_SP_ANY, sp_dst, rset);
+out:
+	mutex_unlock(&sptlrpc_conf_lock);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(sptlrpc_conf_target_get_rules);
+
+/**
+ * initialize the conf lock; always returns 0.
+ */
+int  sptlrpc_conf_init(void)
+{
+	mutex_init(&sptlrpc_conf_lock);
+	return 0;
+}
+
+/**
+ * free every conf still on the global list, with all of its rules.
+ */
+void sptlrpc_conf_fini(void)
+{
+	struct sptlrpc_conf  *conf, *conf_next;
+
+	mutex_lock(&sptlrpc_conf_lock);
+	/* _safe iteration: sptlrpc_conf_free() unlinks the current entry */
+	list_for_each_entry_safe(conf, conf_next, &sptlrpc_confs, sc_list) {
+		sptlrpc_conf_free(conf);
+	}
+	LASSERT(list_empty(&sptlrpc_confs));
+	mutex_unlock(&sptlrpc_conf_lock);
+}

diff --git a/drivers/staging/lustre/lustre/ptlrpc/sec_gc.c b/drivers/staging/lustre/lustre/ptlrpc/sec_gc.c
new file mode 100644
index 0000000..4c96a14a
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/sec_gc.c

@@ -0,0 +1,250 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/sec_gc.c
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+
+#include <linux/libcfs/libcfs.h>
+
+#include <obd_support.h>
+#include <obd_class.h>
+#include <lustre_net.h>
+#include <lustre_sec.h>
+
+#define SEC_GC_INTERVAL (30 * 60)	/* gc thread sleep time; scaled by HZ when used */
+
+
+static struct mutex sec_gc_mutex;	/* held by the gc thread while walking sec_gc_list */
+static LIST_HEAD(sec_gc_list);		/* secs registered for gc, via ps_gc_list */
+static spinlock_t sec_gc_list_lock;	/* taken when adding to/removing from sec_gc_list */
+
+static LIST_HEAD(sec_gc_ctx_list);	/* ctxs handed to the gc thread, via cc_gc_chain */
+static spinlock_t sec_gc_ctx_list_lock;	/* protects sec_gc_ctx_list */
+
+static struct ptlrpc_thread sec_gc_thread;
+static atomic_t sec_gc_wait_del = ATOMIC_INIT(0);	/* pending sptlrpc_gc_del_sec() callers */
+
+
+/**
+ * register \a sec for periodic context garbage collection.
+ *
+ * The sec must provide a gc_ctx operation, have a positive gc interval and
+ * must not be on the gc list yet (all asserted).  Its first gc is scheduled
+ * one interval from now and the sec is appended to sec_gc_list.
+ */
+void sptlrpc_gc_add_sec(struct ptlrpc_sec *sec)
+{
+	LASSERT(sec->ps_policy->sp_cops->gc_ctx);
+	LASSERT(sec->ps_gc_interval > 0);
+	LASSERT(list_empty(&sec->ps_gc_list));
+
+	sec->ps_gc_next = cfs_time_current_sec() + sec->ps_gc_interval;
+
+	spin_lock(&sec_gc_list_lock);
+	/* list_add_tail(new, head): the sec's node goes onto the global list */
+	list_add_tail(&sec->ps_gc_list, &sec_gc_list);
+	spin_unlock(&sec_gc_list_lock);
+
+	CDEBUG(D_SEC, "added sec %p(%s)\n", sec, sec->ps_policy->sp_name);
+}
+EXPORT_SYMBOL(sptlrpc_gc_add_sec);
+
+/**
+ * remove \a sec from the gc list; a no-op if it is not on the list.
+ *
+ * May sleep: after unlinking the sec the function takes and releases
+ * sec_gc_mutex, so that it only returns once the gc thread is no longer
+ * inside a list walk that could still reference \a sec.
+ */
+void sptlrpc_gc_del_sec(struct ptlrpc_sec *sec)
+{
+	if (list_empty(&sec->ps_gc_list))
+		return;
+
+	might_sleep();
+
+	/* signal before list_del to make iteration in gc thread safe */
+	atomic_inc(&sec_gc_wait_del);
+
+	spin_lock(&sec_gc_list_lock);
+	list_del_init(&sec->ps_gc_list);
+	spin_unlock(&sec_gc_list_lock);
+
+	/* barrier */
+	mutex_lock(&sec_gc_mutex);
+	mutex_unlock(&sec_gc_mutex);
+
+	atomic_dec(&sec_gc_wait_del);
+
+	CDEBUG(D_SEC, "del sec %p(%s)\n", sec, sec->ps_policy->sp_name);
+}
+EXPORT_SYMBOL(sptlrpc_gc_del_sec);
+
+/**
+ * hand \a ctx over to the gc thread: the context is queued on
+ * sec_gc_ctx_list and the thread is signalled and woken up so that it
+ * processes the list.  \a ctx must not be queued already (asserted).
+ */
+void sptlrpc_gc_add_ctx(struct ptlrpc_cli_ctx *ctx)
+{
+	LASSERT(list_empty(&ctx->cc_gc_chain));
+
+	CDEBUG(D_SEC, "hand over ctx %p(%u->%s)\n",
+	       ctx, ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec));
+	spin_lock(&sec_gc_ctx_list_lock);
+	list_add(&ctx->cc_gc_chain, &sec_gc_ctx_list);
+	spin_unlock(&sec_gc_ctx_list_lock);
+
+	thread_add_flags(&sec_gc_thread, SVC_SIGNAL);
+	wake_up(&sec_gc_thread.t_ctl_waitq);
+}
+EXPORT_SYMBOL(sptlrpc_gc_add_ctx);
+
+/*
+ * Drain sec_gc_ctx_list: each queued context is unlinked and released with
+ * sptlrpc_cli_ctx_put().  The list lock is dropped around the release call
+ * and re-taken before looking at the list again.
+ */
+static void sec_process_ctx_list(void)
+{
+	struct ptlrpc_cli_ctx *ctx;
+
+	spin_lock(&sec_gc_ctx_list_lock);
+
+	while (!list_empty(&sec_gc_ctx_list)) {
+		ctx = list_entry(sec_gc_ctx_list.next,
+				     struct ptlrpc_cli_ctx, cc_gc_chain);
+		list_del_init(&ctx->cc_gc_chain);
+		spin_unlock(&sec_gc_ctx_list_lock);
+
+		LASSERT(ctx->cc_sec);
+		/* the queue is expected to hold the only remaining reference */
+		LASSERT(atomic_read(&ctx->cc_refcount) == 1);
+		CDEBUG(D_SEC, "gc pick up ctx %p(%u->%s)\n",
+		       ctx, ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec));
+		sptlrpc_cli_ctx_put(ctx, 1);
+
+		spin_lock(&sec_gc_ctx_list_lock);
+	}
+
+	spin_unlock(&sec_gc_ctx_list_lock);
+}
+
+/*
+ * Run the policy's gc_ctx operation on \a sec if its next gc time has been
+ * reached, then schedule the following gc one interval from now.  A sec
+ * whose ps_gc_next is 0 is skipped.
+ */
+static void sec_do_gc(struct ptlrpc_sec *sec)
+{
+	LASSERT(sec->ps_policy->sp_cops->gc_ctx);
+
+	if (unlikely(sec->ps_gc_next == 0)) {
+		CDEBUG(D_SEC, "sec %p(%s) has 0 gc time\n",
+		      sec, sec->ps_policy->sp_name);
+		return;
+	}
+
+	CDEBUG(D_SEC, "check on sec %p(%s)\n", sec, sec->ps_policy->sp_name);
+
+	/* not due yet: the next gc time is still in the future */
+	if (cfs_time_after(sec->ps_gc_next, cfs_time_current_sec()))
+		return;
+
+	sec->ps_policy->sp_cops->gc_ctx(sec);
+	sec->ps_gc_next = cfs_time_current_sec() + sec->ps_gc_interval;
+}
+
+/*
+ * Body of the gc thread.  Each pass releases the contexts queued on
+ * sec_gc_ctx_list, runs sec_do_gc() on every registered sec, then sleeps
+ * until SEC_GC_INTERVAL expires or the thread is signalled or asked to
+ * stop.  \a arg is the struct ptlrpc_thread controlling this thread.
+ * Returns 0 after flagging the thread as stopped.
+ */
+static int sec_gc_main(void *arg)
+{
+	struct ptlrpc_thread *thread = (struct ptlrpc_thread *) arg;
+	struct l_wait_info    lwi;
+
+	unshare_fs_struct();
+
+	/* Record that the thread is running */
+	thread_set_flags(thread, SVC_RUNNING);
+	wake_up(&thread->t_ctl_waitq);
+
+	while (1) {
+		struct ptlrpc_sec *sec;
+
+		thread_clear_flags(thread, SVC_SIGNAL);
+		sec_process_ctx_list();
+again:
+		/* go through the sec list and do gc.
+		 * FIXME here we iterate through the whole list each time which
+		 * is not optimal. we perhaps want to use balanced binary tree
+		 * to trace each sec as order of expiry time.
+		 * another issue here is we wake up at a fixed interval instead
+		 * of according to each sec's expiry time */
+		mutex_lock(&sec_gc_mutex);
+		list_for_each_entry(sec, &sec_gc_list, ps_gc_list) {
+			/* if someone is waiting to be deleted, let it
+			 * proceed as soon as possible. */
+			if (atomic_read(&sec_gc_wait_del)) {
+				CDEBUG(D_SEC, "deletion pending, start over\n");
+				mutex_unlock(&sec_gc_mutex);
+				goto again;
+			}
+
+			sec_do_gc(sec);
+		}
+		mutex_unlock(&sec_gc_mutex);
+
+		/* check ctx list again before sleep */
+		sec_process_ctx_list();
+
+		lwi = LWI_TIMEOUT(SEC_GC_INTERVAL * HZ, NULL, NULL);
+		l_wait_event(thread->t_ctl_waitq,
+			     thread_is_stopping(thread) ||
+			     thread_is_signal(thread),
+			     &lwi);
+
+		if (thread_test_and_clear_flags(thread, SVC_STOPPING))
+			break;
+	}
+
+	/* let sptlrpc_gc_fini() know that the thread is done */
+	thread_set_flags(thread, SVC_STOPPED);
+	wake_up(&thread->t_ctl_waitq);
+	return 0;
+}
+
+/**
+ * initialize the gc locks and thread control block, start the gc thread
+ * and wait until it reports itself running.
+ *
+ * Returns 0 on success or the error from kthread_run().
+ */
+int sptlrpc_gc_init(void)
+{
+	struct l_wait_info lwi = { 0 };
+	task_t *task;
+
+	mutex_init(&sec_gc_mutex);
+	spin_lock_init(&sec_gc_list_lock);
+	spin_lock_init(&sec_gc_ctx_list_lock);
+
+	/* initialize thread control */
+	memset(&sec_gc_thread, 0, sizeof(sec_gc_thread));
+	init_waitqueue_head(&sec_gc_thread.t_ctl_waitq);
+
+	task = kthread_run(sec_gc_main, &sec_gc_thread, "sptlrpc_gc");
+	if (IS_ERR(task)) {
+		CERROR("can't start gc thread: %ld\n", PTR_ERR(task));
+		return PTR_ERR(task);
+	}
+
+	/* sec_gc_main() sets SVC_RUNNING and wakes this waitq on start */
+	l_wait_event(sec_gc_thread.t_ctl_waitq,
+		     thread_is_running(&sec_gc_thread), &lwi);
+	return 0;
+}
+
+/**
+ * ask the gc thread to stop and wait until it has flagged itself stopped.
+ */
+void sptlrpc_gc_fini(void)
+{
+	struct l_wait_info lwi = { 0 };
+
+	thread_set_flags(&sec_gc_thread, SVC_STOPPING);
+	wake_up(&sec_gc_thread.t_ctl_waitq);
+
+	l_wait_event(sec_gc_thread.t_ctl_waitq,
+		     thread_is_stopped(&sec_gc_thread), &lwi);
+}

diff --git a/drivers/staging/lustre/lustre/ptlrpc/sec_lproc.c b/drivers/staging/lustre/lustre/ptlrpc/sec_lproc.c
new file mode 100644
index 0000000..1213621
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/sec_lproc.c

@@ -0,0 +1,199 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/sec_lproc.c
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/crypto.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre_net.h>
+#include <lustre_import.h>
+#include <lustre_dlm.h>
+#include <lustre_sec.h>
+
+#include "ptlrpc_internal.h"
+
+
+struct proc_dir_entry *sptlrpc_proc_root = NULL;	/* "sptlrpc" proc dir, NULL until registered */
+EXPORT_SYMBOL(sptlrpc_proc_root);
+
+/*
+ * Render the PTLRPC_SEC_FL_* bits set in \a flags as comma-terminated names
+ * ("reverse,", "rootonly,", "udesc,", "bulk,") in \a buf, in that order.
+ * If nothing was written, "-," is stored instead.  Output is bounded by
+ * \a bufsize.  Returns \a buf.
+ */
+char *sec_flags2str(unsigned long flags, char *buf, int bufsize)
+{
+	static const struct {
+		unsigned long	 bit;
+		const char	*tag;
+	} names[] = {
+		{ PTLRPC_SEC_FL_REVERSE,  "reverse,"  },
+		{ PTLRPC_SEC_FL_ROOTONLY, "rootonly," },
+		{ PTLRPC_SEC_FL_UDESC,    "udesc,"    },
+		{ PTLRPC_SEC_FL_BULK,     "bulk,"     },
+	};
+	unsigned int i;
+
+	buf[0] = '\0';
+
+	for (i = 0; i < sizeof(names) / sizeof(names[0]); i++) {
+		if (flags & names[i].bit)
+			strlcat(buf, names[i].tag, bufsize);
+	}
+
+	/* nothing written: show a placeholder */
+	if (buf[0] == '\0')
+		strlcat(buf, "-,", bufsize);
+
+	return buf;
+}
+
+/*
+ * seq_file show routine behind LPROC_SEQ_FOPS_RO(sptlrpc_info_lprocfs):
+ * print the flavor, flags, id, reference/context counts and gc timing of
+ * the sec attached to the device's import.  The device (seq->private) must
+ * be an OSC, MDC or MGC (asserted).  Prints nothing when there is no import
+ * or no sec.  Always returns 0.
+ */
+static int sptlrpc_info_lprocfs_seq_show(struct seq_file *seq, void *v)
+{
+	struct obd_device *dev = seq->private;
+	struct client_obd *cli = &dev->u.cli;
+	struct ptlrpc_sec *sec = NULL;
+	char	       str[32];
+
+	LASSERT(strcmp(dev->obd_type->typ_name, LUSTRE_OSC_NAME) == 0 ||
+		strcmp(dev->obd_type->typ_name, LUSTRE_MDC_NAME) == 0 ||
+		strcmp(dev->obd_type->typ_name, LUSTRE_MGC_NAME) == 0);
+
+	if (cli->cl_import)
+		sec = sptlrpc_import_sec_ref(cli->cl_import);
+	if (sec == NULL)
+		goto out;
+
+	/* NOTE(review): str is handed to sptlrpc_flavor2name_bulk() and
+	 * refilled by sec_flags2str() below before being printed, so this
+	 * first call looks redundant -- confirm */
+	sec_flags2str(sec->ps_flvr.sf_flags, str, sizeof(str));
+
+	seq_printf(seq, "rpc flavor:    %s\n",
+		   sptlrpc_flavor2name_base(sec->ps_flvr.sf_rpc));
+	seq_printf(seq, "bulk flavor:   %s\n",
+		   sptlrpc_flavor2name_bulk(&sec->ps_flvr, str, sizeof(str)));
+	seq_printf(seq, "flags:	 %s\n",
+		   sec_flags2str(sec->ps_flvr.sf_flags, str, sizeof(str)));
+	seq_printf(seq, "id:	    %d\n", sec->ps_id);
+	seq_printf(seq, "refcount:      %d\n",
+		   atomic_read(&sec->ps_refcount));
+	seq_printf(seq, "nctx:	  %d\n", atomic_read(&sec->ps_nctx));
+	seq_printf(seq, "gc internal    %ld\n", sec->ps_gc_interval);
+	/* time left until the next gc; 0 when the sec has no gc interval */
+	seq_printf(seq, "gc next	%ld\n",
+		   sec->ps_gc_interval ?
+		   sec->ps_gc_next - cfs_time_current_sec() : 0);
+
+	/* drop the reference taken by sptlrpc_import_sec_ref() */
+	sptlrpc_sec_put(sec);
+out:
+	return 0;
+}
+LPROC_SEQ_FOPS_RO(sptlrpc_info_lprocfs);
+
+/*
+ * seq_file show routine behind LPROC_SEQ_FOPS_RO(sptlrpc_ctxs_lprocfs):
+ * let the policy of the sec attached to the device's import display its
+ * contexts, if the policy provides a display operation.  The device
+ * (seq->private) must be an OSC, MDC or MGC (asserted).  Always returns 0.
+ */
+static int sptlrpc_ctxs_lprocfs_seq_show(struct seq_file *seq, void *v)
+{
+	struct obd_device *dev = seq->private;
+	struct client_obd *cli = &dev->u.cli;
+	struct ptlrpc_sec *sec = NULL;
+
+	LASSERT(strcmp(dev->obd_type->typ_name, LUSTRE_OSC_NAME) == 0 ||
+		strcmp(dev->obd_type->typ_name, LUSTRE_MDC_NAME) == 0 ||
+		strcmp(dev->obd_type->typ_name, LUSTRE_MGC_NAME) == 0);
+
+	if (cli->cl_import != NULL)
+		sec = sptlrpc_import_sec_ref(cli->cl_import);
+
+	/* no import or no sec: nothing to show */
+	if (sec == NULL)
+		return 0;
+
+	if (sec->ps_policy->sp_cops->display != NULL)
+		sec->ps_policy->sp_cops->display(sec, seq);
+
+	/* drop the reference taken by sptlrpc_import_sec_ref() */
+	sptlrpc_sec_put(sec);
+	return 0;
+}
+LPROC_SEQ_FOPS_RO(sptlrpc_ctxs_lprocfs);
+
+/**
+ * create the "srpc_info" and "srpc_contexts" proc entries for client
+ * device \a dev, which must be an OSC, MDC or MGC.
+ *
+ * Returns 0 on success, -EINVAL for any other device type, or the error
+ * from lprocfs_obd_seq_create().
+ */
+int sptlrpc_lprocfs_cliobd_attach(struct obd_device *dev)
+{
+	const char *type = dev->obd_type->typ_name;
+	int	    rc;
+
+	if (strcmp(type, LUSTRE_OSC_NAME) != 0 &&
+	    strcmp(type, LUSTRE_MDC_NAME) != 0 &&
+	    strcmp(type, LUSTRE_MGC_NAME) != 0) {
+		CERROR("can't register lproc for obd type %s\n", type);
+		return -EINVAL;
+	}
+
+	rc = lprocfs_obd_seq_create(dev, "srpc_info", 0444,
+				    &sptlrpc_info_lprocfs_fops, dev);
+	if (rc) {
+		CERROR("create proc entry srpc_info for %s: %d\n",
+		       dev->obd_name, rc);
+		return rc;
+	}
+
+	rc = lprocfs_obd_seq_create(dev, "srpc_contexts", 0444,
+				    &sptlrpc_ctxs_lprocfs_fops, dev);
+	if (rc)
+		CERROR("create proc entry srpc_contexts for %s: %d\n",
+		       dev->obd_name, rc);
+
+	return rc;
+}
+EXPORT_SYMBOL(sptlrpc_lprocfs_cliobd_attach);
+
+LPROC_SEQ_FOPS_RO(sptlrpc_proc_enc_pool);
+static struct lprocfs_vars sptlrpc_lprocfs_vars[] = {	/* entries registered under "sptlrpc" */
+	{ "encrypt_page_pools", &sptlrpc_proc_enc_pool_fops },
+	{ NULL }	/* terminator */
+};
+
+/**
+ * register the "sptlrpc" proc directory under proc_lustre_root.  Must not
+ * be called while the directory is already registered (asserted).
+ *
+ * Returns 0 on success, or the error from lprocfs_register(), in which case
+ * sptlrpc_proc_root is reset to NULL.
+ */
+int sptlrpc_lproc_init(void)
+{
+	int     rc;
+
+	LASSERT(sptlrpc_proc_root == NULL);
+
+	sptlrpc_proc_root = lprocfs_register("sptlrpc", proc_lustre_root,
+					     sptlrpc_lprocfs_vars, NULL);
+	if (IS_ERR(sptlrpc_proc_root)) {
+		rc = PTR_ERR(sptlrpc_proc_root);
+		sptlrpc_proc_root = NULL;
+		return rc;
+	}
+	return 0;
+}
+
+/**
+ * remove the "sptlrpc" proc directory if it was registered.
+ */
+void sptlrpc_lproc_fini(void)
+{
+	if (sptlrpc_proc_root) {
+		lprocfs_remove(&sptlrpc_proc_root);
+		sptlrpc_proc_root = NULL;
+	}
+}

diff --git a/drivers/staging/lustre/lustre/ptlrpc/sec_null.c b/drivers/staging/lustre/lustre/ptlrpc/sec_null.c
new file mode 100644
index 0000000..ff1137f
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/sec_null.c

@@ -0,0 +1,464 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/sec_null.c
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+
+
+#include <obd_support.h>
+#include <obd_cksum.h>
+#include <obd_class.h>
+#include <lustre_net.h>
+#include <lustre_sec.h>
+
+/*
+ * Singleton objects of the null policy.  null_policy and null_svc_ctx are
+ * defined with initializers further down; null_sec and null_cli_ctx are
+ * filled in at run time by null_init_internal().
+ */
+static struct ptlrpc_sec_policy null_policy;
+static struct ptlrpc_sec	null_sec;
+static struct ptlrpc_cli_ctx    null_cli_ctx;
+static struct ptlrpc_svc_ctx    null_svc_ctx;
+
+/*
+ * The topmost 8 bits of lm_secflvr are temporarily used to carry the
+ * source sec part.  This helper ORs the low byte of @sp into those bits;
+ * it does not clear what is already stored there.
+ */
+static inline
+void null_encode_sec_part(struct lustre_msg *msg, enum lustre_sec_part sp)
+{
+	__u32 part_bits = ((__u32) sp) & 0xFF;
+
+	msg->lm_secflvr |= part_bits << 24;
+}
+
+/*
+ * Extract the source sec part stored in the topmost 8 bits of lm_secflvr.
+ */
+static inline
+enum lustre_sec_part null_decode_sec_part(struct lustre_msg *msg)
+{
+	__u32 top_bits = msg->lm_secflvr >> 24;
+
+	return top_bits & 0xFF;
+}
+
+/*
+ * Refresh hook of the null context.  It is not expected to be invoked at
+ * all, so reaching it is treated as a bug.
+ */
+static int
+null_ctx_refresh(struct ptlrpc_cli_ctx *ctx)
+{
+	LBUG();		/* should never reach here */
+	return 0;
+}
+
+/*
+ * "Sign" an outgoing request: stamp the null flavor into the request
+ * buffer and, unless the import is a fake DLM import, encode this client's
+ * sec part into the flavor word.  The wire length equals the message
+ * length.  Always returns 0.
+ */
+static
+int null_ctx_sign(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req)
+{
+	req->rq_reqbuf->lm_secflvr = SPTLRPC_FLVR_NULL;
+
+	if (req->rq_import->imp_dlm_fake == 0)
+		null_encode_sec_part(req->rq_reqbuf,
+				     req->rq_import->imp_obd->u.cli.cl_sp_me);
+
+	req->rq_reqdata_len = req->rq_reqlen;
+	return 0;
+}
+
+/*
+ * Verify a reply received under the null policy.
+ *
+ * The reply data is used directly as the reply message.  Only early
+ * replies carry a checksum that is checked here; a mismatch yields -EINVAL.
+ * Returns 0 otherwise.
+ */
+static
+int null_ctx_verify(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req)
+{
+	__u32   received, computed;
+
+	LASSERT(req->rq_repdata);
+
+	req->rq_repmsg = req->rq_repdata;
+	req->rq_replen = req->rq_repdata_len;
+
+	/* nothing more to check for a normal reply */
+	if (!req->rq_early)
+		return 0;
+
+	received = lustre_msg_get_cksum(req->rq_repdata);
+#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 7, 50, 0)
+	if (lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_CKSUM_INCOMPAT18)
+		computed = lustre_msg_calc_cksum(req->rq_repmsg, 0);
+	else
+		computed = lustre_msg_calc_cksum(req->rq_repmsg, 1);
+#else
+# warning "remove checksum compatibility support for b1_8"
+	computed = lustre_msg_calc_cksum(req->rq_repmsg);
+#endif
+	if (computed == received)
+		return 0;
+
+	CDEBUG(D_SEC,
+	       "early reply checksum mismatch: %08x != %08x\n",
+	       computed, received);
+	return -EINVAL;
+}
+
+/*
+ * Hand out the single static null sec.  @imp and @svc_ctx are not used;
+ * @sf must describe a null-policy flavor.
+ */
+static
+struct ptlrpc_sec *null_create_sec(struct obd_import *imp,
+				   struct ptlrpc_svc_ctx *svc_ctx,
+				   struct sptlrpc_flavor *sf)
+{
+	LASSERT(SPTLRPC_FLVR_POLICY(sf->sf_rpc) == SPTLRPC_POLICY_NULL);
+
+	/*
+	 * The general layer has taken a module reference on our behalf.
+	 * Since the null sec is never really destroyed, drop that reference
+	 * right away.
+	 */
+	sptlrpc_policy_put(&null_policy);
+
+	return &null_sec;
+}
+
+/*
+ * Destroy hook: the null sec is static, so nothing is released.  Only
+ * checks that the caller passed the singleton.
+ */
+static void
+null_destroy_sec(struct ptlrpc_sec *sec)
+{
+	LASSERT(sec == &null_sec);
+}
+
+/*
+ * Context lookup: every caller gets the single static client context with
+ * one extra reference.  All arguments are ignored.
+ */
+static
+struct ptlrpc_cli_ctx *null_lookup_ctx(struct ptlrpc_sec *sec,
+				       struct vfs_cred *vcred,
+				       int create, int remove_dead)
+{
+	struct ptlrpc_cli_ctx *the_ctx = &null_cli_ctx;
+
+	atomic_inc(&the_ctx->cc_refcount);
+	return the_ctx;
+}
+
+/*
+ * Flush hook: does nothing and reports success.
+ */
+static int
+null_flush_ctx_cache(struct ptlrpc_sec *sec, uid_t uid, int grace, int force)
+{
+	return 0;
+}
+
+/*
+ * Provide a request buffer able to hold @msgsize bytes.
+ *
+ * If the request already has a buffer it must be a pooled request with a
+ * large enough buffer; the first @msgsize bytes are zeroed.  Otherwise a
+ * buffer rounded up to a power of two is allocated.  In both cases the
+ * request message aliases the buffer.  Returns 0 or -ENOMEM.
+ */
+static
+int null_alloc_reqbuf(struct ptlrpc_sec *sec,
+		      struct ptlrpc_request *req,
+		      int msgsize)
+{
+	if (req->rq_reqbuf) {
+		/* buffer was supplied up front: request comes from a pool */
+		LASSERT(req->rq_pool);
+		LASSERT(req->rq_reqbuf_len >= msgsize);
+		memset(req->rq_reqbuf, 0, msgsize);
+	} else {
+		int rounded = size_roundup_power2(msgsize);
+
+		LASSERT(!req->rq_pool);
+		OBD_ALLOC_LARGE(req->rq_reqbuf, rounded);
+		if (!req->rq_reqbuf)
+			return -ENOMEM;
+
+		req->rq_reqbuf_len = rounded;
+	}
+
+	req->rq_reqmsg = req->rq_reqbuf;
+	return 0;
+}
+
+/*
+ * Release the request buffer of a non-pooled request.  Pooled requests are
+ * left untouched.
+ */
+static
+void null_free_reqbuf(struct ptlrpc_sec *sec,
+		      struct ptlrpc_request *req)
+{
+	if (req->rq_pool)
+		return;
+
+	LASSERTF(req->rq_reqmsg == req->rq_reqbuf,
+		 "req %p: reqmsg %p is not reqbuf %p in null sec\n",
+		 req, req->rq_reqmsg, req->rq_reqbuf);
+	LASSERTF(req->rq_reqbuf_len >= req->rq_reqlen,
+		 "req %p: reqlen %d should smaller than buflen %d\n",
+		 req, req->rq_reqlen, req->rq_reqbuf_len);
+
+	OBD_FREE_LARGE(req->rq_reqbuf, req->rq_reqbuf_len);
+	req->rq_reqbuf_len = 0;
+	req->rq_reqbuf = NULL;
+}
+
+/*
+ * Allocate the reply buffer: @msgsize plus room for an early reply,
+ * rounded up to a power of two.  Returns 0 or -ENOMEM.
+ */
+static
+int null_alloc_repbuf(struct ptlrpc_sec *sec,
+		      struct ptlrpc_request *req,
+		      int msgsize)
+{
+	/* add space for early reply */
+	msgsize += lustre_msg_early_size();
+	msgsize = size_roundup_power2(msgsize);
+
+	OBD_ALLOC_LARGE(req->rq_repbuf, msgsize);
+	if (req->rq_repbuf == NULL)
+		return -ENOMEM;
+
+	req->rq_repbuf_len = msgsize;
+
+	return 0;
+}
+
+/*
+ * Release the reply buffer, which must exist, and clear the bookkeeping.
+ */
+static void
+null_free_repbuf(struct ptlrpc_sec *sec, struct ptlrpc_request *req)
+{
+	LASSERT(req->rq_repbuf);
+
+	OBD_FREE_LARGE(req->rq_repbuf, req->rq_repbuf_len);
+	req->rq_repbuf_len = 0;
+	req->rq_repbuf = NULL;
+}
+
+/*
+ * Grow buffer @segment of the request message to @newsize bytes.
+ *
+ * The new packed message size is computed first; if the current request
+ * buffer is too small a larger one (rounded up to a power of two) is
+ * allocated, the old message copied over and the old buffer freed.  The
+ * message is then enlarged in place.  Returns 0 or -ENOMEM; on -ENOMEM the
+ * request is left as it was.
+ */
+static
+int null_enlarge_reqbuf(struct ptlrpc_sec *sec,
+			struct ptlrpc_request *req,
+			int segment, int newsize)
+{
+	struct lustre_msg      *newbuf;
+	struct lustre_msg      *oldbuf = req->rq_reqmsg;
+	int		     oldsize, newmsg_size, alloc_size;
+
+	LASSERT(req->rq_reqbuf);
+	LASSERT(req->rq_reqbuf == req->rq_reqmsg);
+	LASSERT(req->rq_reqbuf_len >= req->rq_reqlen);
+	LASSERT(req->rq_reqlen == lustre_packed_msg_size(oldbuf));
+
+	/* compute new message size */
+	/* the segment length is patched only for the size computation and
+	 * restored right after it */
+	oldsize = req->rq_reqbuf->lm_buflens[segment];
+	req->rq_reqbuf->lm_buflens[segment] = newsize;
+	newmsg_size = lustre_packed_msg_size(oldbuf);
+	req->rq_reqbuf->lm_buflens[segment] = oldsize;
+
+	/* request from pool should always have enough buffer */
+	LASSERT(!req->rq_pool || req->rq_reqbuf_len >= newmsg_size);
+
+	if (req->rq_reqbuf_len < newmsg_size) {
+		alloc_size = size_roundup_power2(newmsg_size);
+
+		OBD_ALLOC_LARGE(newbuf, alloc_size);
+		if (newbuf == NULL)
+			return -ENOMEM;
+
+		/* only the bytes of the current message need to move */
+		memcpy(newbuf, req->rq_reqbuf, req->rq_reqlen);
+
+		OBD_FREE_LARGE(req->rq_reqbuf, req->rq_reqbuf_len);
+		req->rq_reqbuf = req->rq_reqmsg = newbuf;
+		req->rq_reqbuf_len = alloc_size;
+	}
+
+	_sptlrpc_enlarge_msg_inplace(req->rq_reqmsg, segment, newsize);
+	req->rq_reqlen = newmsg_size;
+
+	return 0;
+}
+
+/*
+ * The single server-side context shared by all null-policy requests.  It
+ * starts with one reference; null_accept() and null_alloc_rs() add one per
+ * user.
+ */
+static struct ptlrpc_svc_ctx null_svc_ctx = {
+	.sc_refcount    = ATOMIC_INIT(1),
+	.sc_policy      = &null_policy,
+};
+
+/*
+ * Server side: accept an incoming null-policy request.
+ *
+ * A flavor other than SPTLRPC_FLVR_NULL is rejected with SECSVC_DROP.
+ * Otherwise the sender's sec part is decoded, the request buffer is used
+ * directly as the request message, the shared service context is attached
+ * with an extra reference, and SECSVC_OK is returned.
+ */
+static
+int null_accept(struct ptlrpc_request *req)
+{
+	LASSERT(SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) ==
+		SPTLRPC_POLICY_NULL);
+
+	if (req->rq_flvr.sf_rpc != SPTLRPC_FLVR_NULL) {
+		CERROR("Invalid rpc flavor 0x%x\n", req->rq_flvr.sf_rpc);
+		return SECSVC_DROP;
+	}
+
+	req->rq_sp_from = null_decode_sec_part(req->rq_reqbuf);
+
+	/* no wrapping: the wire buffer is the message itself */
+	req->rq_reqlen = req->rq_reqdata_len;
+	req->rq_reqmsg = req->rq_reqbuf;
+
+	atomic_inc(&null_svc_ctx.sc_refcount);
+	req->rq_svc_ctx = &null_svc_ctx;
+
+	return SECSVC_OK;
+}
+
+/*
+ * Set up the reply state for @req with room for a @msgsize byte reply
+ * placed right behind the reply state structure.
+ *
+ * A reply state already attached to the request is reused (it must be big
+ * enough); otherwise one is allocated.  The request's service context gets
+ * an extra reference on behalf of the reply state.  @msgsize must be a
+ * multiple of 8.  Returns 0 or -ENOMEM.
+ */
+static
+int null_alloc_rs(struct ptlrpc_request *req, int msgsize)
+{
+	struct ptlrpc_reply_state *rs;
+	int rs_size = sizeof(*rs) + msgsize;
+
+	LASSERT(msgsize % 8 == 0);
+
+	rs = req->rq_reply_state;
+	if (rs == NULL) {
+		OBD_ALLOC_LARGE(rs, rs_size);
+		if (rs == NULL)
+			return -ENOMEM;
+
+		rs->rs_size = rs_size;
+	} else {
+		/* pre-allocated */
+		LASSERT(rs->rs_size >= rs_size);
+	}
+
+	rs->rs_svc_ctx = req->rq_svc_ctx;
+	atomic_inc(&req->rq_svc_ctx->sc_refcount);
+
+	/* the reply buffer starts right after the reply state */
+	rs->rs_repbuf = (struct lustre_msg *) (rs + 1);
+	rs->rs_repbuf_len = rs_size - sizeof(*rs);
+	rs->rs_msg = rs->rs_repbuf;
+
+	req->rq_reply_state = rs;
+	return 0;
+}
+
+/*
+ * Drop the reply state's reference on its service context and free the
+ * reply state unless it is flagged as pre-allocated.
+ */
+static void
+null_free_rs(struct ptlrpc_reply_state *rs)
+{
+	LASSERT_ATOMIC_GT(&rs->rs_svc_ctx->sc_refcount, 1);
+	atomic_dec(&rs->rs_svc_ctx->sc_refcount);
+
+	if (rs->rs_prealloc)
+		return;
+
+	OBD_FREE_LARGE(rs, rs->rs_size);
+}
+
+/*
+ * Server side: finish a reply before it is sent.
+ *
+ * Stamps the null flavor and the reply length.  For a final reply the
+ * reply offset is the early-reply size when the request advertised
+ * MSGHDR_AT_SUPPORT and 0 otherwise.  For any other reply a checksum is
+ * computed and stored in the reply and the offset is 0.  Always returns 0.
+ */
+static
+int null_authorize(struct ptlrpc_request *req)
+{
+	struct ptlrpc_reply_state *rs = req->rq_reply_state;
+	__u32 cksum;
+
+	LASSERT(rs);
+
+	rs->rs_repbuf->lm_secflvr = SPTLRPC_FLVR_NULL;
+	rs->rs_repdata_len = req->rq_replen;
+
+	if (likely(req->rq_packed_final)) {
+		if (lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT)
+			req->rq_reply_off = lustre_msg_early_size();
+		else
+			req->rq_reply_off = 0;
+		return 0;
+	}
+
+#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 7, 50, 0)
+	if (lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_CKSUM_INCOMPAT18)
+		cksum = lustre_msg_calc_cksum(rs->rs_repbuf, 0);
+	else
+		cksum = lustre_msg_calc_cksum(rs->rs_repbuf, 1);
+#else
+# warning "remove checksum compatibility support for b1_8"
+	cksum = lustre_msg_calc_cksum(rs->rs_repbuf);
+#endif
+	lustre_msg_set_cksum(rs->rs_repbuf, cksum);
+	req->rq_reply_off = 0;
+
+	return 0;
+}
+
+/* Client context operations; installed on null_cli_ctx by
+ * null_init_internal(). */
+static struct ptlrpc_ctx_ops null_ctx_ops = {
+	.refresh		= null_ctx_refresh,
+	.sign		   = null_ctx_sign,
+	.verify		 = null_ctx_verify,
+};
+
+/* Client-side sec operations of the null policy (referenced from
+ * null_policy.sp_cops). */
+static struct ptlrpc_sec_cops null_sec_cops = {
+	.create_sec	     = null_create_sec,
+	.destroy_sec	    = null_destroy_sec,
+	.lookup_ctx	     = null_lookup_ctx,
+	.flush_ctx_cache	= null_flush_ctx_cache,
+	.alloc_reqbuf	   = null_alloc_reqbuf,
+	.alloc_repbuf	   = null_alloc_repbuf,
+	.free_reqbuf	    = null_free_reqbuf,
+	.free_repbuf	    = null_free_repbuf,
+	.enlarge_reqbuf	 = null_enlarge_reqbuf,
+};
+
+/* Server-side sec operations of the null policy (referenced from
+ * null_policy.sp_sops). */
+static struct ptlrpc_sec_sops null_sec_sops = {
+	.accept		 = null_accept,
+	.alloc_rs	       = null_alloc_rs,
+	.authorize	      = null_authorize,
+	.free_rs		= null_free_rs,
+};
+
+/* Policy descriptor registered by sptlrpc_null_init() and unregistered by
+ * sptlrpc_null_fini(). */
+static struct ptlrpc_sec_policy null_policy = {
+	.sp_owner	       = THIS_MODULE,
+	.sp_name		= "sec.null",
+	.sp_policy	      = SPTLRPC_POLICY_NULL,
+	.sp_cops		= &null_sec_cops,
+	.sp_sops		= &null_sec_sops,
+};
+
+/*
+ * Initialise the static null sec and its single client context.  Called
+ * from sptlrpc_null_init() before the policy is registered.
+ */
+static void null_init_internal(void)
+{
+	/* private list head the context is hooked onto; nothing else in
+	 * this function reads it */
+	static HLIST_HEAD(__list);
+
+	null_sec.ps_policy = &null_policy;
+	atomic_set(&null_sec.ps_refcount, 1);     /* always busy */
+	null_sec.ps_id = -1;
+	null_sec.ps_import = NULL;
+	null_sec.ps_flvr.sf_rpc = SPTLRPC_FLVR_NULL;
+	null_sec.ps_flvr.sf_flags = 0;
+	null_sec.ps_part = LUSTRE_SP_ANY;
+	null_sec.ps_dying = 0;
+	spin_lock_init(&null_sec.ps_lock);
+	atomic_set(&null_sec.ps_nctx, 1);	 /* for "null_cli_ctx" */
+	INIT_LIST_HEAD(&null_sec.ps_gc_list);
+	null_sec.ps_gc_interval = 0;
+	null_sec.ps_gc_next = 0;
+
+	hlist_add_head(&null_cli_ctx.cc_cache, &__list);
+	atomic_set(&null_cli_ctx.cc_refcount, 1);    /* for hash */
+	null_cli_ctx.cc_sec = &null_sec;
+	null_cli_ctx.cc_ops = &null_ctx_ops;
+	null_cli_ctx.cc_expire = 0;
+	null_cli_ctx.cc_flags = PTLRPC_CTX_CACHED | PTLRPC_CTX_ETERNAL |
+				PTLRPC_CTX_UPTODATE;
+	null_cli_ctx.cc_vcred.vc_uid = 0;
+	spin_lock_init(&null_cli_ctx.cc_lock);
+	INIT_LIST_HEAD(&null_cli_ctx.cc_req_list);
+	INIT_LIST_HEAD(&null_cli_ctx.cc_gc_chain);
+}
+
+/*
+ * Initialise the null policy's static objects and register the policy.
+ * Returns the result of sptlrpc_register_policy(); a failure is logged.
+ */
+int sptlrpc_null_init(void)
+{
+	int err;
+
+	null_init_internal();
+
+	err = sptlrpc_register_policy(&null_policy);
+	if (err == 0)
+		return 0;
+
+	CERROR("failed to register %s: %d\n", null_policy.sp_name, err);
+	return err;
+}
+
+/*
+ * Unregister the null policy; a failure is only logged.
+ */
+void sptlrpc_null_fini(void)
+{
+	int err = sptlrpc_unregister_policy(&null_policy);
+
+	if (err != 0)
+		CERROR("failed to unregister %s: %d\n",
+		       null_policy.sp_name, err);
+}

diff --git a/drivers/staging/lustre/lustre/ptlrpc/sec_plain.c b/drivers/staging/lustre/lustre/ptlrpc/sec_plain.c
new file mode 100644
index 0000000..f552d2f
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/sec_plain.c

@@ -0,0 +1,1021 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/sec_plain.c
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+
+
+#include <obd_support.h>
+#include <obd_cksum.h>
+#include <obd_class.h>
+#include <lustre_net.h>
+#include <lustre_sec.h>
+
+/*
+ * Plain-policy sec: the generic sec plus at most one cached client
+ * context.  pls_lock guards pls_ctx (see plain_lookup_ctx(),
+ * plain_sec_install_ctx() and plain_flush_ctx_cache()).
+ */
+struct plain_sec {
+	struct ptlrpc_sec       pls_base;
+	rwlock_t	    pls_lock;
+	struct ptlrpc_cli_ctx  *pls_ctx;
+};
+
+/* Map a generic sec embedded as pls_base back to its plain_sec. */
+static inline struct plain_sec *sec2plsec(struct ptlrpc_sec *sec)
+{
+	struct plain_sec *plsec;
+
+	plsec = container_of(sec, struct plain_sec, pls_base);
+	return plsec;
+}
+
+/* Forward declarations of objects defined later in this file. */
+static struct ptlrpc_sec_policy plain_policy;
+static struct ptlrpc_ctx_ops    plain_ctx_ops;
+static struct ptlrpc_svc_ctx    plain_svc_ctx;
+
+/* Extra bytes plain_alloc_repbuf() reserves for an early reply; where it
+ * is assigned is not visible in this part of the file. */
+static unsigned int plain_at_offset;
+
+/*
+ * for simplicity, plain policy rpc use fixed layout.
+ *
+ * A wrapped message always has PLAIN_PACK_SEGMENTS buffers; the *_OFF
+ * values are the buffer indices of the plain header, the embedded lustre
+ * message, the optional user descriptor and the optional bulk security
+ * descriptor.
+ */
+#define PLAIN_PACK_SEGMENTS	     (4)
+
+#define PLAIN_PACK_HDR_OFF	      (0)
+#define PLAIN_PACK_MSG_OFF	      (1)
+#define PLAIN_PACK_USER_OFF	     (2)
+#define PLAIN_PACK_BULK_OFF	     (3)
+
+/* bits of plain_header.ph_flags: which optional buffers are present */
+#define PLAIN_FL_USER		   (0x01)
+#define PLAIN_FL_BULK		   (0x02)
+
+/*
+ * Header stored in buffer PLAIN_PACK_HDR_OFF of every plain-policy
+ * message.  ph_flags is a mask of PLAIN_FL_*; ph_sp and ph_bulk_hash_alg
+ * are filled by plain_ctx_sign() and read back by plain_accept().
+ */
+struct plain_header {
+	__u8	    ph_ver;	    /* 0 */
+	__u8	    ph_flags;
+	__u8	    ph_sp;	     /* source */
+	__u8	    ph_bulk_hash_alg;  /* complete flavor desc */
+	__u8	    ph_pad[4];
+};
+
+/* Bulk checksum token; placed in the bsd_data area of the bulk security
+ * descriptor (see plain_cli_wrap_bulk()). */
+struct plain_bulk_token {
+	__u8	    pbt_hash[8];
+};
+
+/* size of the bulk security descriptor buffer: descriptor plus token */
+#define PLAIN_BSD_SIZE \
+	(sizeof(struct ptlrpc_bulk_sec_desc) + sizeof(struct plain_bulk_token))
+
+/****************************************
+ * bulk checksum helpers		*
+ ****************************************/
+
+/*
+ * Unpack and sanity-check the bulk security descriptor of @msg.
+ *
+ * Returns 0 when the descriptor unpacks, is at least PLAIN_BSD_SIZE bytes
+ * long and names the NULL or INTG bulk service; -EPROTO otherwise.
+ */
+static int plain_unpack_bsd(struct lustre_msg *msg, int swabbed)
+{
+	struct ptlrpc_bulk_sec_desc *desc;
+
+	if (bulk_sec_desc_unpack(msg, PLAIN_PACK_BULK_OFF, swabbed) != 0)
+		return -EPROTO;
+
+	desc = lustre_msg_buf(msg, PLAIN_PACK_BULK_OFF, PLAIN_BSD_SIZE);
+	if (!desc) {
+		CERROR("bulk sec desc has short size %d\n",
+		       lustre_msg_buflen(msg, PLAIN_PACK_BULK_OFF));
+		return -EPROTO;
+	}
+
+	if (desc->bsd_svc == SPTLRPC_BULK_SVC_NULL ||
+	    desc->bsd_svc == SPTLRPC_BULK_SVC_INTG)
+		return 0;
+
+	CERROR("invalid bulk svc %u\n", desc->bsd_svc);
+	return -EPROTO;
+}
+
+/*
+ * Compute the bulk checksum of @desc into @token using @hash_alg.
+ * With BULK_HASH_ALG_NULL nothing is computed, @token is left untouched
+ * and 0 is returned; otherwise the result of sptlrpc_get_bulk_checksum().
+ */
+static int plain_generate_bulk_csum(struct ptlrpc_bulk_desc *desc,
+				    __u8 hash_alg,
+				    struct plain_bulk_token *token)
+{
+	if (hash_alg != BULK_HASH_ALG_NULL) {
+		memset(token->pbt_hash, 0, sizeof(token->pbt_hash));
+		return sptlrpc_get_bulk_checksum(desc, hash_alg,
+						 token->pbt_hash,
+						 sizeof(token->pbt_hash));
+	}
+
+	return 0;
+}
+
+/*
+ * Recompute the bulk checksum of @desc and compare it with the received
+ * token @tokenr.
+ *
+ * Returns 0 when @hash_alg is BULK_HASH_ALG_NULL or the checksums match,
+ * -EACCES on mismatch, or the error from sptlrpc_get_bulk_checksum().
+ */
+static int plain_verify_bulk_csum(struct ptlrpc_bulk_desc *desc,
+				  __u8 hash_alg,
+				  struct plain_bulk_token *tokenr)
+{
+	struct plain_bulk_token local;
+	int err;
+
+	if (hash_alg == BULK_HASH_ALG_NULL)
+		return 0;
+
+	memset(local.pbt_hash, 0, sizeof(local.pbt_hash));
+	err = sptlrpc_get_bulk_checksum(desc, hash_alg, local.pbt_hash,
+					sizeof(local.pbt_hash));
+	if (err != 0)
+		return err;
+
+	if (memcmp(tokenr->pbt_hash, local.pbt_hash,
+		   sizeof(tokenr->pbt_hash)) != 0)
+		return -EACCES;
+
+	return 0;
+}
+
+/*
+ * Flip the lowest bit of the first byte of the first non-empty fragment of
+ * @desc.  Does nothing when every fragment is empty.
+ */
+static void corrupt_bulk_data(struct ptlrpc_bulk_desc *desc)
+{
+	char	   *mapped;
+	unsigned int    byte_off, idx;
+
+	/* skip leading empty fragments */
+	for (idx = 0; idx < desc->bd_iov_count; idx++) {
+		if (desc->bd_iov[idx].kiov_len != 0)
+			break;
+	}
+	if (!(idx < desc->bd_iov_count))
+		return;
+
+	mapped = kmap(desc->bd_iov[idx].kiov_page);
+	byte_off = desc->bd_iov[idx].kiov_offset & ~CFS_PAGE_MASK;
+	mapped[byte_off] ^= 0x1;
+	kunmap(desc->bd_iov[idx].kiov_page);
+}
+
+/****************************************
+ * cli_ctx apis			 *
+ ****************************************/
+
+/*
+ * Refresh hook of the plain context.  It is not expected to be invoked at
+ * all, so reaching it is treated as a bug.
+ */
+static int
+plain_ctx_refresh(struct ptlrpc_cli_ctx *ctx)
+{
+	LBUG();		/* should never reach here */
+	return 0;
+}
+
+/* Validate hook: unconditionally reports 0. */
+static int
+plain_ctx_validate(struct ptlrpc_cli_ctx *ctx)
+{
+	return 0;
+}
+
+/*
+ * Fill in the plain header of an outgoing request and compute the wire
+ * length of the wrapped message.
+ *
+ * The header records version 0, the sec part of the context's sec, the
+ * bulk hash algorithm of the request flavor, and flags telling whether a
+ * user descriptor and/or bulk descriptor is packed.  Always returns 0.
+ */
+static
+int plain_ctx_sign(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req)
+{
+	struct lustre_msg   *wrapper = req->rq_reqbuf;
+	struct plain_header *hdr;
+	__u8		 flags = 0;
+	ENTRY;
+
+	wrapper->lm_secflvr = req->rq_flvr.sf_rpc;
+
+	if (req->rq_pack_udesc)
+		flags |= PLAIN_FL_USER;
+	if (req->rq_pack_bulk)
+		flags |= PLAIN_FL_BULK;
+
+	hdr = lustre_msg_buf(wrapper, PLAIN_PACK_HDR_OFF, 0);
+	hdr->ph_ver = 0;
+	hdr->ph_flags = flags;
+	hdr->ph_sp = ctx->cc_sec->ps_part;
+	hdr->ph_bulk_hash_alg = req->rq_flvr.u_bulk.hash.hash_alg;
+
+	req->rq_reqdata_len = lustre_msg_size_v2(wrapper->lm_bufcount,
+						 wrapper->lm_buflens);
+	RETURN(0);
+}
+
+/*
+ * Client side: validate a received plain-policy reply and locate the
+ * embedded reply message.
+ *
+ * Checks the buffer count, the plain header (presence, version, no user
+ * descriptor flag, matching bulk hash algorithm).  For an early reply the
+ * CRC32 of the embedded message is compared with lm_cksum; for a normal
+ * reply the bulk flag must agree with what was sent and, if present, the
+ * bulk descriptor is unpacked.
+ *
+ * Returns 0 on success, -EPROTO for a malformed reply, -EINVAL for an
+ * early-reply checksum mismatch.
+ */
+static
+int plain_ctx_verify(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req)
+{
+	struct lustre_msg   *msg = req->rq_repdata;
+	struct plain_header *phdr;
+	__u32		cksum;
+	int		  swabbed;
+	ENTRY;
+
+	if (msg->lm_bufcount != PLAIN_PACK_SEGMENTS) {
+		CERROR("unexpected reply buf count %u\n", msg->lm_bufcount);
+		RETURN(-EPROTO);
+	}
+
+	swabbed = ptlrpc_rep_need_swab(req);
+
+	phdr = lustre_msg_buf(msg, PLAIN_PACK_HDR_OFF, sizeof(*phdr));
+	if (phdr == NULL) {
+		CERROR("missing plain header\n");
+		RETURN(-EPROTO);
+	}
+
+	if (phdr->ph_ver != 0) {
+		CERROR("Invalid header version\n");
+		RETURN(-EPROTO);
+	}
+
+	/* expect no user desc in reply */
+	if (phdr->ph_flags & PLAIN_FL_USER) {
+		CERROR("Unexpected udesc flag in reply\n");
+		RETURN(-EPROTO);
+	}
+
+	if (phdr->ph_bulk_hash_alg != req->rq_flvr.u_bulk.hash.hash_alg) {
+		CERROR("reply bulk flavor %u != %u\n", phdr->ph_bulk_hash_alg,
+		       req->rq_flvr.u_bulk.hash.hash_alg);
+		RETURN(-EPROTO);
+	}
+
+	if (unlikely(req->rq_early)) {
+		/* digest size in bytes passed to the CRC32 helper */
+		unsigned int hsize = 4;
+
+		cfs_crypto_hash_digest(CFS_HASH_ALG_CRC32,
+				lustre_msg_buf(msg, PLAIN_PACK_MSG_OFF, 0),
+				lustre_msg_buflen(msg, PLAIN_PACK_MSG_OFF),
+				NULL, 0, (unsigned char *)&cksum, &hsize);
+		if (cksum != msg->lm_cksum) {
+			CDEBUG(D_SEC,
+			       "early reply checksum mismatch: %08x != %08x\n",
+			       cpu_to_le32(cksum), msg->lm_cksum);
+			RETURN(-EINVAL);
+		}
+	} else {
+		/* whether we sent with bulk or not, we expect the same
+		 * in reply, except for early reply */
+		/* NOTE(review): rq_early was already found false to get into
+		 * this branch, so the !req->rq_early test below looks
+		 * redundant -- confirm before removing */
+		if (!req->rq_early &&
+		    !equi(req->rq_pack_bulk == 1,
+			  phdr->ph_flags & PLAIN_FL_BULK)) {
+			CERROR("%s bulk checksum in reply\n",
+			       req->rq_pack_bulk ? "Missing" : "Unexpected");
+			RETURN(-EPROTO);
+		}
+
+		if (phdr->ph_flags & PLAIN_FL_BULK) {
+			if (plain_unpack_bsd(msg, swabbed))
+				RETURN(-EPROTO);
+		}
+	}
+
+	/* the reply proper is the embedded message buffer */
+	req->rq_repmsg = lustre_msg_buf(msg, PLAIN_PACK_MSG_OFF, 0);
+	req->rq_replen = lustre_msg_buflen(msg, PLAIN_PACK_MSG_OFF);
+	RETURN(0);
+}
+
+/*
+ * Client side: fill in the bulk security descriptor of an outgoing
+ * request and, for bulk writes with an integrity service, compute the
+ * bulk checksum into the token that follows the descriptor.
+ *
+ * Nothing is computed when the bulk service is SPTLRPC_BULK_SVC_NULL or
+ * when the request is a bulk read.
+ *
+ * Returns 0 on success or the error from plain_generate_bulk_csum().
+ *
+ * The function has no ENTRY marker, so it leaves with plain "return"
+ * statements rather than RETURN(), as plain_cli_unwrap_bulk() does.
+ */
+static
+int plain_cli_wrap_bulk(struct ptlrpc_cli_ctx *ctx,
+			struct ptlrpc_request *req,
+			struct ptlrpc_bulk_desc *desc)
+{
+	struct ptlrpc_bulk_sec_desc *bsd;
+	struct plain_bulk_token     *token;
+	int			  rc;
+
+	LASSERT(req->rq_pack_bulk);
+	LASSERT(req->rq_reqbuf->lm_bufcount == PLAIN_PACK_SEGMENTS);
+
+	bsd = lustre_msg_buf(req->rq_reqbuf, PLAIN_PACK_BULK_OFF, 0);
+	token = (struct plain_bulk_token *) bsd->bsd_data;
+
+	bsd->bsd_version = 0;
+	bsd->bsd_flags = 0;
+	bsd->bsd_type = SPTLRPC_BULK_DEFAULT;
+	bsd->bsd_svc = SPTLRPC_FLVR_BULK_SVC(req->rq_flvr.sf_rpc);
+
+	if (bsd->bsd_svc == SPTLRPC_BULK_SVC_NULL)
+		return 0;
+
+	if (req->rq_bulk_read)
+		return 0;
+
+	rc = plain_generate_bulk_csum(desc, req->rq_flvr.u_bulk.hash.hash_alg,
+				      token);
+	if (rc) {
+		CERROR("bulk write: failed to compute checksum: %d\n", rc);
+	} else {
+		/*
+		 * for sending we only compute the wrong checksum instead
+		 * of corrupting the data so it is still correct on a redo
+		 */
+		if (OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_SEND) &&
+		    req->rq_flvr.u_bulk.hash.hash_alg != BULK_HASH_ALG_NULL)
+			token->pbt_hash[0] ^= 0x1;
+	}
+
+	return rc;
+}
+
+/*
+ * Client side: process the bulk security descriptor of a reply.
+ *
+ * For a bulk write only the server's error flag is inspected (-EIO when
+ * BSD_FL_ERR is set, 0 otherwise).  For a bulk read the fragment lengths
+ * are trimmed so that they add up to the number of bytes actually
+ * transferred, then the checksum is verified against the token in the
+ * reply.  Returns 0 or the verification error.
+ */
+static
+int plain_cli_unwrap_bulk(struct ptlrpc_cli_ctx *ctx,
+			  struct ptlrpc_request *req,
+			  struct ptlrpc_bulk_desc *desc)
+{
+	struct ptlrpc_bulk_sec_desc *reply_bsd;
+	int			  rc;
+	int			  idx, seen = 0;
+
+	LASSERT(req->rq_pack_bulk);
+	LASSERT(req->rq_reqbuf->lm_bufcount == PLAIN_PACK_SEGMENTS);
+	LASSERT(req->rq_repdata->lm_bufcount == PLAIN_PACK_SEGMENTS);
+
+	reply_bsd = lustre_msg_buf(req->rq_repdata, PLAIN_PACK_BULK_OFF, 0);
+
+	if (req->rq_bulk_write)
+		return (reply_bsd->bsd_flags & BSD_FL_ERR) ? -EIO : 0;
+
+	/* fix the actual data size */
+	for (idx = 0; idx < desc->bd_iov_count; idx++) {
+		if (desc->bd_iov[idx].kiov_len + seen >
+		    desc->bd_nob_transferred)
+			desc->bd_iov[idx].kiov_len =
+				desc->bd_nob_transferred - seen;
+
+		seen += desc->bd_iov[idx].kiov_len;
+	}
+
+	rc = plain_verify_bulk_csum(desc, req->rq_flvr.u_bulk.hash.hash_alg,
+			(struct plain_bulk_token *) reply_bsd->bsd_data);
+	if (rc != 0)
+		CERROR("bulk read: client verify failed: %d\n", rc);
+
+	return rc;
+}
+
+/****************************************
+ * sec apis			     *
+ ****************************************/
+
+/*
+ * Return the sec's cached client context with a reference for the caller,
+ * creating and caching a new one if none exists yet.
+ *
+ * The candidate context is allocated before pls_lock is taken; if another
+ * context is found already installed the candidate is freed.  Returns NULL
+ * only when no context exists and the allocation failed.
+ */
+static
+struct ptlrpc_cli_ctx *plain_sec_install_ctx(struct plain_sec *plsec)
+{
+	struct ptlrpc_cli_ctx  *ctx, *ctx_new;
+
+	/* may leave ctx_new NULL; handled under the lock below */
+	OBD_ALLOC_PTR(ctx_new);
+
+	write_lock(&plsec->pls_lock);
+
+	ctx = plsec->pls_ctx;
+	if (ctx) {
+		/* a context is already installed: use it */
+		atomic_inc(&ctx->cc_refcount);
+
+		if (ctx_new)
+			OBD_FREE_PTR(ctx_new);
+	} else if (ctx_new) {
+		ctx = ctx_new;
+
+		atomic_set(&ctx->cc_refcount, 1); /* for cache */
+		ctx->cc_sec = &plsec->pls_base;
+		ctx->cc_ops = &plain_ctx_ops;
+		ctx->cc_expire = 0;
+		ctx->cc_flags = PTLRPC_CTX_CACHED | PTLRPC_CTX_UPTODATE;
+		ctx->cc_vcred.vc_uid = 0;
+		spin_lock_init(&ctx->cc_lock);
+		INIT_LIST_HEAD(&ctx->cc_req_list);
+		INIT_LIST_HEAD(&ctx->cc_gc_chain);
+
+		/* the cached context accounts for one ctx and one sec
+		 * reference; plain_release_ctx() drops both */
+		plsec->pls_ctx = ctx;
+		atomic_inc(&plsec->pls_base.ps_nctx);
+		atomic_inc(&plsec->pls_base.ps_refcount);
+
+		atomic_inc(&ctx->cc_refcount); /* for caller */
+	}
+
+	write_unlock(&plsec->pls_lock);
+
+	return ctx;
+}
+
+/*
+ * Free a plain sec.  The sec must be unreferenced, own no contexts and
+ * have no cached context; its import reference is dropped first.
+ */
+static
+void plain_destroy_sec(struct ptlrpc_sec *sec)
+{
+	struct plain_sec       *plsec = sec2plsec(sec);
+	ENTRY;
+
+	LASSERT(sec->ps_policy == &plain_policy);
+	LASSERT(sec->ps_import);
+	LASSERT(atomic_read(&sec->ps_refcount) == 0);
+	LASSERT(atomic_read(&sec->ps_nctx) == 0);
+	LASSERT(plsec->pls_ctx == NULL);
+
+	/* pairs with class_import_get() in plain_create_sec() */
+	class_import_put(sec->ps_import);
+
+	OBD_FREE_PTR(plsec);
+	EXIT;
+}
+
+/* Mark the sec as dying; nothing else is done here. */
+static void
+plain_kill_sec(struct ptlrpc_sec *sec)
+{
+	sec->ps_dying = 1;
+}
+
+/*
+ * Allocate and initialise a plain sec for import @imp with flavor @sf.
+ *
+ * When @svc_ctx is given (reverse sec) a client context is installed right
+ * away; if that fails the sec is destroyed again.  Returns the new sec or
+ * NULL on allocation failure.
+ */
+static
+struct ptlrpc_sec *plain_create_sec(struct obd_import *imp,
+				    struct ptlrpc_svc_ctx *svc_ctx,
+				    struct sptlrpc_flavor *sf)
+{
+	struct plain_sec       *plsec;
+	struct ptlrpc_sec      *base;
+	struct ptlrpc_cli_ctx  *first_ctx;
+	ENTRY;
+
+	LASSERT(SPTLRPC_FLVR_POLICY(sf->sf_rpc) == SPTLRPC_POLICY_PLAIN);
+
+	OBD_ALLOC_PTR(plsec);
+	if (!plsec)
+		RETURN(NULL);
+
+	/* plain-specific part */
+	rwlock_init(&plsec->pls_lock);
+	plsec->pls_ctx = NULL;
+
+	/* generic part */
+	base = &plsec->pls_base;
+	base->ps_policy = &plain_policy;
+	atomic_set(&base->ps_refcount, 0);
+	atomic_set(&base->ps_nctx, 0);
+	base->ps_id = sptlrpc_get_next_secid();
+	base->ps_import = class_import_get(imp);
+	base->ps_flvr = *sf;
+	spin_lock_init(&base->ps_lock);
+	INIT_LIST_HEAD(&base->ps_gc_list);
+	base->ps_gc_interval = 0;
+	base->ps_gc_next = 0;
+
+	/* only a reverse sec gets its ctx installed immediately */
+	if (svc_ctx == NULL)
+		RETURN(base);
+
+	first_ctx = plain_sec_install_ctx(plsec);
+	if (first_ctx == NULL) {
+		plain_destroy_sec(base);
+		RETURN(NULL);
+	}
+	sptlrpc_cli_ctx_put(first_ctx, 1);
+
+	RETURN(base);
+}
+
+/*
+ * Return the sec's client context with a reference for the caller.  The
+ * cached context is used when present; otherwise one is installed via
+ * plain_sec_install_ctx().  @vcred, @create and @remove_dead are ignored.
+ */
+static
+struct ptlrpc_cli_ctx *plain_lookup_ctx(struct ptlrpc_sec *sec,
+					struct vfs_cred *vcred,
+					int create, int remove_dead)
+{
+	struct plain_sec       *plsec = sec2plsec(sec);
+	struct ptlrpc_cli_ctx  *found;
+	ENTRY;
+
+	read_lock(&plsec->pls_lock);
+	found = plsec->pls_ctx;
+	if (found != NULL)
+		atomic_inc(&found->cc_refcount);
+	read_unlock(&plsec->pls_lock);
+
+	if (unlikely(found == NULL))
+		found = plain_sec_install_ctx(plsec);
+
+	RETURN(found);
+}
+
+/*
+ * Free a context whose refcount has reached zero and give back the
+ * context count and sec reference that were taken when it was installed.
+ * @sync is unused.
+ */
+static void
+plain_release_ctx(struct ptlrpc_sec *sec, struct ptlrpc_cli_ctx *ctx,
+		  int sync)
+{
+	LASSERT(atomic_read(&sec->ps_refcount) > 0);
+	LASSERT(atomic_read(&sec->ps_nctx) > 0);
+	LASSERT(atomic_read(&ctx->cc_refcount) == 0);
+	LASSERT(ctx->cc_sec == sec);
+
+	OBD_FREE_PTR(ctx);
+
+	atomic_dec(&sec->ps_nctx);
+	sptlrpc_sec_put(sec);
+}
+
+/*
+ * Drop the cached context, but only for a flush-all request (@uid == -1);
+ * any other @uid is a no-op.  @grace and @force are ignored.  Always
+ * returns 0.
+ */
+static
+int plain_flush_ctx_cache(struct ptlrpc_sec *sec,
+			  uid_t uid, int grace, int force)
+{
+	struct plain_sec       *plsec = sec2plsec(sec);
+	struct ptlrpc_cli_ctx  *cached;
+	ENTRY;
+
+	if (uid == -1) {
+		/* detach under the lock, release outside of it */
+		write_lock(&plsec->pls_lock);
+		cached = plsec->pls_ctx;
+		plsec->pls_ctx = NULL;
+		write_unlock(&plsec->pls_lock);
+
+		if (cached != NULL)
+			sptlrpc_cli_ctx_put(cached, 1);
+	}
+
+	RETURN(0);
+}
+
+/*
+ * Prepare the wrapped request buffer for a @msgsize byte message.
+ *
+ * The wrapper has PLAIN_PACK_SEGMENTS buffers: header, message, and the
+ * optional user and bulk descriptors.  A pooled request reuses (and
+ * zeroes) its existing buffer; otherwise a power-of-two sized buffer is
+ * allocated.  Returns 0 or -ENOMEM.
+ */
+static
+int plain_alloc_reqbuf(struct ptlrpc_sec *sec,
+		       struct ptlrpc_request *req,
+		       int msgsize)
+{
+	__u32 seglens[PLAIN_PACK_SEGMENTS] = { 0, };
+	int   need;
+	ENTRY;
+
+	seglens[PLAIN_PACK_HDR_OFF] = sizeof(struct plain_header);
+	seglens[PLAIN_PACK_MSG_OFF] = msgsize;
+
+	if (req->rq_pack_udesc)
+		seglens[PLAIN_PACK_USER_OFF] = sptlrpc_current_user_desc_size();
+
+	if (req->rq_pack_bulk) {
+		LASSERT(req->rq_bulk_read || req->rq_bulk_write);
+		seglens[PLAIN_PACK_BULK_OFF] = PLAIN_BSD_SIZE;
+	}
+
+	need = lustre_msg_size_v2(PLAIN_PACK_SEGMENTS, seglens);
+
+	if (req->rq_reqbuf) {
+		/* buffer was supplied up front: request comes from a pool */
+		LASSERT(req->rq_pool);
+		LASSERT(req->rq_reqbuf_len >= need);
+		memset(req->rq_reqbuf, 0, need);
+	} else {
+		LASSERT(!req->rq_pool);
+
+		need = size_roundup_power2(need);
+		OBD_ALLOC_LARGE(req->rq_reqbuf, need);
+		if (!req->rq_reqbuf)
+			RETURN(-ENOMEM);
+
+		req->rq_reqbuf_len = need;
+	}
+
+	lustre_init_msg_v2(req->rq_reqbuf, PLAIN_PACK_SEGMENTS, seglens, NULL);
+	req->rq_reqmsg = lustre_msg_buf(req->rq_reqbuf, PLAIN_PACK_MSG_OFF, 0);
+
+	if (req->rq_pack_udesc)
+		sptlrpc_pack_user_desc(req->rq_reqbuf, PLAIN_PACK_USER_OFF);
+
+	RETURN(0);
+}
+
+/*
+ * Release the request buffer of a non-pooled request; pooled requests are
+ * left untouched.
+ */
+static
+void plain_free_reqbuf(struct ptlrpc_sec *sec,
+		       struct ptlrpc_request *req)
+{
+	ENTRY;
+	if (req->rq_pool) {
+		EXIT;
+		return;
+	}
+
+	OBD_FREE_LARGE(req->rq_reqbuf, req->rq_reqbuf_len);
+	req->rq_reqbuf = NULL;
+	req->rq_reqbuf_len = 0;
+	EXIT;
+}
+
+/*
+ * Allocate the reply buffer for a @msgsize byte reply message: wrapper
+ * with header, message and optional bulk descriptor, plus
+ * plain_at_offset bytes for an early reply, rounded up to a power of two.
+ * Returns 0 or -ENOMEM.
+ */
+static
+int plain_alloc_repbuf(struct ptlrpc_sec *sec,
+		       struct ptlrpc_request *req,
+		       int msgsize)
+{
+	__u32 seglens[PLAIN_PACK_SEGMENTS] = { 0, };
+	int total;
+	ENTRY;
+
+	seglens[PLAIN_PACK_HDR_OFF] = sizeof(struct plain_header);
+	seglens[PLAIN_PACK_MSG_OFF] = msgsize;
+
+	if (req->rq_pack_bulk) {
+		LASSERT(req->rq_bulk_read || req->rq_bulk_write);
+		seglens[PLAIN_PACK_BULK_OFF] = PLAIN_BSD_SIZE;
+	}
+
+	total = lustre_msg_size_v2(PLAIN_PACK_SEGMENTS, seglens);
+	total += plain_at_offset;	/* add space for early reply */
+	total = size_roundup_power2(total);
+
+	OBD_ALLOC_LARGE(req->rq_repbuf, total);
+	if (req->rq_repbuf == NULL)
+		RETURN(-ENOMEM);
+
+	req->rq_repbuf_len = total;
+	RETURN(0);
+}
+
+/* Release the reply buffer and clear the bookkeeping. */
+static void
+plain_free_repbuf(struct ptlrpc_sec *sec, struct ptlrpc_request *req)
+{
+	ENTRY;
+	OBD_FREE_LARGE(req->rq_repbuf, req->rq_repbuf_len);
+	req->rq_repbuf_len = 0;
+	req->rq_repbuf = NULL;
+	EXIT;
+}
+
+/*
+ * Grow buffer @segment of the embedded request message to @newsize bytes.
+ *
+ * Two sizes are derived: the new size of the embedded message and the new
+ * size of the wrapper that carries it.  If the current request buffer is
+ * too small for the wrapper, a larger one (rounded up to a power of two)
+ * is allocated, the old contents copied and the old buffer freed.  Then
+ * the wrapper's message buffer and the embedded message are enlarged in
+ * place.  Returns 0 or -ENOMEM; on -ENOMEM the request is left as it was.
+ */
+static
+int plain_enlarge_reqbuf(struct ptlrpc_sec *sec,
+			 struct ptlrpc_request *req,
+			 int segment, int newsize)
+{
+	struct lustre_msg      *newbuf;
+	int		     oldsize;
+	int		     newmsg_size, newbuf_size;
+	ENTRY;
+
+	LASSERT(req->rq_reqbuf);
+	LASSERT(req->rq_reqbuf_len >= req->rq_reqlen);
+	LASSERT(lustre_msg_buf(req->rq_reqbuf, PLAIN_PACK_MSG_OFF, 0) ==
+		req->rq_reqmsg);
+
+	/* compute new embedded msg size.  */
+	/* the length is patched only for the computation, then restored */
+	oldsize = req->rq_reqmsg->lm_buflens[segment];
+	req->rq_reqmsg->lm_buflens[segment] = newsize;
+	newmsg_size = lustre_msg_size_v2(req->rq_reqmsg->lm_bufcount,
+					 req->rq_reqmsg->lm_buflens);
+	req->rq_reqmsg->lm_buflens[segment] = oldsize;
+
+	/* compute new wrapper msg size.  */
+	/* same patch-and-restore trick on the wrapper's message buffer */
+	oldsize = req->rq_reqbuf->lm_buflens[PLAIN_PACK_MSG_OFF];
+	req->rq_reqbuf->lm_buflens[PLAIN_PACK_MSG_OFF] = newmsg_size;
+	newbuf_size = lustre_msg_size_v2(req->rq_reqbuf->lm_bufcount,
+					 req->rq_reqbuf->lm_buflens);
+	req->rq_reqbuf->lm_buflens[PLAIN_PACK_MSG_OFF] = oldsize;
+
+	/* request from pool should always have enough buffer */
+	LASSERT(!req->rq_pool || req->rq_reqbuf_len >= newbuf_size);
+
+	if (req->rq_reqbuf_len < newbuf_size) {
+		newbuf_size = size_roundup_power2(newbuf_size);
+
+		OBD_ALLOC_LARGE(newbuf, newbuf_size);
+		if (newbuf == NULL)
+			RETURN(-ENOMEM);
+
+		/* the whole old buffer is copied, not just the message */
+		memcpy(newbuf, req->rq_reqbuf, req->rq_reqbuf_len);
+
+		OBD_FREE_LARGE(req->rq_reqbuf, req->rq_reqbuf_len);
+		req->rq_reqbuf = newbuf;
+		req->rq_reqbuf_len = newbuf_size;
+		/* the embedded message moved together with the wrapper */
+		req->rq_reqmsg = lustre_msg_buf(req->rq_reqbuf,
+						PLAIN_PACK_MSG_OFF, 0);
+	}
+
+	_sptlrpc_enlarge_msg_inplace(req->rq_reqbuf, PLAIN_PACK_MSG_OFF,
+				     newmsg_size);
+	_sptlrpc_enlarge_msg_inplace(req->rq_reqmsg, segment, newsize);
+
+	req->rq_reqlen = newmsg_size;
+	RETURN(0);
+}
+
+/****************************************
+ * service apis			 *
+ ****************************************/
+
+/*
+ * The single server-side context shared by all plain-policy requests.  It
+ * starts with one reference; plain_accept() and plain_alloc_rs() add one
+ * per user.
+ */
+static struct ptlrpc_svc_ctx plain_svc_ctx = {
+	.sc_refcount    = ATOMIC_INIT(1),
+	.sc_policy      = &plain_policy,
+};
+
+/*
+ * Server side: validate an incoming plain-policy request and unpack it.
+ *
+ * Checks the rpc flavor, the buffer count and the plain header (presence,
+ * version, bulk hash algorithm range).  The sender's sec part and bulk
+ * hash algorithm are copied from the header into the request.  A user
+ * descriptor and a bulk security descriptor are unpacked when the header
+ * flags announce them.  On success the embedded message becomes the
+ * request message and the shared service context is attached with an
+ * extra reference.
+ *
+ * Returns SECSVC_OK on success and SECSVC_DROP for every malformed
+ * request.  The header checks used to return -EPROTO, which is not one of
+ * the SECSVC_* codes used by every other path here and by null_accept();
+ * they now return SECSVC_DROP as well.
+ */
+static
+int plain_accept(struct ptlrpc_request *req)
+{
+	struct lustre_msg   *msg = req->rq_reqbuf;
+	struct plain_header *phdr;
+	int		  swabbed;
+	ENTRY;
+
+	LASSERT(SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) ==
+		SPTLRPC_POLICY_PLAIN);
+
+	if (SPTLRPC_FLVR_BASE(req->rq_flvr.sf_rpc) !=
+	    SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_PLAIN) ||
+	    SPTLRPC_FLVR_BULK_TYPE(req->rq_flvr.sf_rpc) !=
+	    SPTLRPC_FLVR_BULK_TYPE(SPTLRPC_FLVR_PLAIN)) {
+		CERROR("Invalid rpc flavor %x\n", req->rq_flvr.sf_rpc);
+		RETURN(SECSVC_DROP);
+	}
+
+	if (msg->lm_bufcount < PLAIN_PACK_SEGMENTS) {
+		CERROR("unexpected request buf count %u\n", msg->lm_bufcount);
+		RETURN(SECSVC_DROP);
+	}
+
+	swabbed = ptlrpc_req_need_swab(req);
+
+	phdr = lustre_msg_buf(msg, PLAIN_PACK_HDR_OFF, sizeof(*phdr));
+	if (phdr == NULL) {
+		CERROR("missing plain header\n");
+		RETURN(SECSVC_DROP);
+	}
+
+	if (phdr->ph_ver != 0) {
+		CERROR("Invalid header version\n");
+		RETURN(SECSVC_DROP);
+	}
+
+	if (phdr->ph_bulk_hash_alg >= BULK_HASH_ALG_MAX) {
+		CERROR("invalid hash algorithm: %u\n", phdr->ph_bulk_hash_alg);
+		RETURN(SECSVC_DROP);
+	}
+
+	req->rq_sp_from = phdr->ph_sp;
+	req->rq_flvr.u_bulk.hash.hash_alg = phdr->ph_bulk_hash_alg;
+
+	if (phdr->ph_flags & PLAIN_FL_USER) {
+		if (sptlrpc_unpack_user_desc(msg, PLAIN_PACK_USER_OFF,
+					     swabbed)) {
+			CERROR("Mal-formed user descriptor\n");
+			RETURN(SECSVC_DROP);
+		}
+
+		req->rq_pack_udesc = 1;
+		req->rq_user_desc = lustre_msg_buf(msg, PLAIN_PACK_USER_OFF, 0);
+	}
+
+	if (phdr->ph_flags & PLAIN_FL_BULK) {
+		if (plain_unpack_bsd(msg, swabbed))
+			RETURN(SECSVC_DROP);
+
+		req->rq_pack_bulk = 1;
+	}
+
+	/* the request proper is the embedded message buffer */
+	req->rq_reqmsg = lustre_msg_buf(msg, PLAIN_PACK_MSG_OFF, 0);
+	req->rq_reqlen = msg->lm_buflens[PLAIN_PACK_MSG_OFF];
+
+	req->rq_svc_ctx = &plain_svc_ctx;
+	atomic_inc(&req->rq_svc_ctx->sc_refcount);
+
+	RETURN(SECSVC_OK);
+}
+
+/*
+ * Server-side alloc_rs handler of the plain policy.
+ *
+ * Sizes a reply state able to carry a \a msgsize byte reply message wrapped
+ * in the plain segments, re-uses req->rq_reply_state when one is already
+ * attached (it must be large enough) or allocates a new one, initialises the
+ * reply buffer that directly follows the reply state and takes a reference
+ * on the request's service context.
+ *
+ * \retval 0		success, req->rq_reply_state is set
+ * \retval -ENOMEM	the reply state could not be allocated
+ */
+static
+int plain_alloc_rs(struct ptlrpc_request *req, int msgsize)
+{
+	struct ptlrpc_reply_state *state;
+	__u32			   seglens[PLAIN_PACK_SEGMENTS] = { 0, };
+	int			   total = sizeof(*state);
+	ENTRY;
+
+	LASSERT(msgsize % 8 == 0);
+
+	/* header and message segments are always present */
+	seglens[PLAIN_PACK_HDR_OFF] = sizeof(struct plain_header);
+	seglens[PLAIN_PACK_MSG_OFF] = msgsize;
+
+	/* a bulk descriptor segment is only reserved for bulk i/o */
+	if (req->rq_pack_bulk && (req->rq_bulk_read || req->rq_bulk_write))
+		seglens[PLAIN_PACK_BULK_OFF] = PLAIN_BSD_SIZE;
+
+	total += lustre_msg_size_v2(PLAIN_PACK_SEGMENTS, seglens);
+
+	state = req->rq_reply_state;
+	if (state == NULL) {
+		OBD_ALLOC_LARGE(state, total);
+		if (state == NULL)
+			RETURN(-ENOMEM);
+
+		state->rs_size = total;
+	} else {
+		/* pre-allocated */
+		LASSERT(state->rs_size >= total);
+	}
+
+	state->rs_svc_ctx = req->rq_svc_ctx;
+	atomic_inc(&req->rq_svc_ctx->sc_refcount);
+
+	/* the reply buffer lives right behind the reply state itself */
+	state->rs_repbuf = (struct lustre_msg *) (state + 1);
+	state->rs_repbuf_len = total - sizeof(*state);
+
+	lustre_init_msg_v2(state->rs_repbuf, PLAIN_PACK_SEGMENTS, seglens,
+			   NULL);
+	state->rs_msg = lustre_msg_buf_v2(state->rs_repbuf,
+					  PLAIN_PACK_MSG_OFF, 0);
+
+	req->rq_reply_state = state;
+	RETURN(0);
+}
+
+/*
+ * Server-side free_rs handler of the plain policy: drop the reference the
+ * reply state holds on its service context and free the reply state unless
+ * it is flagged as pre-allocated (rs_prealloc).
+ */
+static
+void plain_free_rs(struct ptlrpc_reply_state *rs)
+{
+	ENTRY;
+
+	/* somebody else must still hold a reference besides this one */
+	LASSERT(atomic_read(&rs->rs_svc_ctx->sc_refcount) > 1);
+	atomic_dec(&rs->rs_svc_ctx->sc_refcount);
+
+	if (rs->rs_prealloc) {
+		EXIT;
+		return;
+	}
+
+	OBD_FREE_LARGE(rs, rs->rs_size);
+	EXIT;
+}
+
+/*
+ * Server-side authorize handler of the plain policy: finalise the reply
+ * buffer of \a req before it is sent.
+ *
+ * Shrinks the message segment to req->rq_replen when the two differ, stamps
+ * the security flavor, fills in the plain header and records the resulting
+ * length in rs_repdata_len. For a final reply (rq_packed_final) the reply
+ * offset is plain_at_offset when the request message carries
+ * MSGHDR_AT_SUPPORT and 0 otherwise; for any other reply a CRC32 of the
+ * message segment is stored in lm_cksum and the offset is 0.
+ *
+ * \retval 0 always
+ */
+static
+int plain_authorize(struct ptlrpc_request *req)
+{
+	struct ptlrpc_reply_state *rs = req->rq_reply_state;
+	struct lustre_msg_v2      *msg;
+	struct plain_header       *phdr;
+	int			len;
+	ENTRY;
+
+	/* assert on rs before it is dereferenced to fetch the reply buffer */
+	LASSERT(rs);
+	msg = rs->rs_repbuf;
+	LASSERT(msg);
+
+	if (req->rq_replen != msg->lm_buflens[PLAIN_PACK_MSG_OFF])
+		len = lustre_shrink_msg(msg, PLAIN_PACK_MSG_OFF,
+					req->rq_replen, 1);
+	else
+		len = lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens);
+
+	msg->lm_secflvr = req->rq_flvr.sf_rpc;
+
+	phdr = lustre_msg_buf(msg, PLAIN_PACK_HDR_OFF, 0);
+	phdr->ph_ver = 0;
+	phdr->ph_flags = 0;
+	phdr->ph_bulk_hash_alg = req->rq_flvr.u_bulk.hash.hash_alg;
+
+	if (req->rq_pack_bulk)
+		phdr->ph_flags |= PLAIN_FL_BULK;
+
+	rs->rs_repdata_len = len;
+
+	if (likely(req->rq_packed_final)) {
+		if (lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT)
+			req->rq_reply_off = plain_at_offset;
+		else
+			req->rq_reply_off = 0;
+	} else {
+		/* size of the digest buffer handed to the hash routine */
+		unsigned int hsize = 4;
+
+		cfs_crypto_hash_digest(CFS_HASH_ALG_CRC32,
+			lustre_msg_buf(msg, PLAIN_PACK_MSG_OFF, 0),
+			lustre_msg_buflen(msg, PLAIN_PACK_MSG_OFF),
+			NULL, 0, (unsigned char *)&msg->lm_cksum, &hsize);
+		req->rq_reply_off = 0;
+	}
+
+	RETURN(0);
+}
+
+/*
+ * Server-side unwrap_bulk handler (bulk write): prepare the bulk security
+ * descriptor of the reply and, unless the client asked for the NULL bulk
+ * service, verify the checksum token sent with the request against \a desc.
+ *
+ * \retval 0		nothing to verify, or verification succeeded
+ * \retval other	result of plain_verify_bulk_csum(); BSD_FL_ERR is
+ *			then set in the reply descriptor
+ */
+static
+int plain_svc_unwrap_bulk(struct ptlrpc_request *req,
+			  struct ptlrpc_bulk_desc *desc)
+{
+	struct ptlrpc_reply_state   *rs = req->rq_reply_state;
+	struct ptlrpc_bulk_sec_desc *req_bsd;
+	struct ptlrpc_bulk_sec_desc *rep_bsd;
+	struct plain_bulk_token     *req_token;
+	int			  rc;
+
+	LASSERT(req->rq_bulk_write);
+	LASSERT(req->rq_pack_bulk);
+
+	req_bsd = lustre_msg_buf(req->rq_reqbuf, PLAIN_PACK_BULK_OFF, 0);
+	req_token = (struct plain_bulk_token *) req_bsd->bsd_data;
+	rep_bsd = lustre_msg_buf(rs->rs_repbuf, PLAIN_PACK_BULK_OFF, 0);
+
+	/* the reply descriptor mirrors the service chosen by the client */
+	rep_bsd->bsd_version = 0;
+	rep_bsd->bsd_type = SPTLRPC_BULK_DEFAULT;
+	rep_bsd->bsd_svc = req_bsd->bsd_svc;
+	rep_bsd->bsd_flags = 0;
+
+	if (req_bsd->bsd_svc == SPTLRPC_BULK_SVC_NULL)
+		return 0;
+
+	rc = plain_verify_bulk_csum(desc, req->rq_flvr.u_bulk.hash.hash_alg,
+				    req_token);
+	if (rc == 0)
+		return 0;
+
+	rep_bsd->bsd_flags |= BSD_FL_ERR;
+	CERROR("bulk write: server verify failed: %d\n", rc);
+	return rc;
+}
+
+/*
+ * Server-side wrap_bulk handler (bulk read): prepare the bulk security
+ * descriptor of the reply and, unless the client asked for the NULL bulk
+ * service, store a checksum token computed over \a desc in it.
+ *
+ * \retval 0		nothing to do, or the checksum was generated
+ * \retval other	result of plain_generate_bulk_csum()
+ */
+static
+int plain_svc_wrap_bulk(struct ptlrpc_request *req,
+			struct ptlrpc_bulk_desc *desc)
+{
+	struct ptlrpc_reply_state   *rs = req->rq_reply_state;
+	struct ptlrpc_bulk_sec_desc *req_bsd;
+	struct ptlrpc_bulk_sec_desc *rep_bsd;
+	struct plain_bulk_token     *rep_token;
+	int			  rc;
+
+	LASSERT(req->rq_bulk_read);
+	LASSERT(req->rq_pack_bulk);
+
+	req_bsd = lustre_msg_buf(req->rq_reqbuf, PLAIN_PACK_BULK_OFF, 0);
+	rep_bsd = lustre_msg_buf(rs->rs_repbuf, PLAIN_PACK_BULK_OFF, 0);
+	rep_token = (struct plain_bulk_token *) rep_bsd->bsd_data;
+
+	/* the reply descriptor mirrors the service chosen by the client */
+	rep_bsd->bsd_version = 0;
+	rep_bsd->bsd_type = SPTLRPC_BULK_DEFAULT;
+	rep_bsd->bsd_svc = req_bsd->bsd_svc;
+	rep_bsd->bsd_flags = 0;
+
+	if (req_bsd->bsd_svc == SPTLRPC_BULK_SVC_NULL)
+		return 0;
+
+	rc = plain_generate_bulk_csum(desc, req->rq_flvr.u_bulk.hash.hash_alg,
+				      rep_token);
+	if (rc) {
+		CERROR("bulk read: server failed to compute "
+		       "checksum: %d\n", rc);
+		return rc;
+	}
+
+	/* fault injection: corrupt the data after it has been checksummed */
+	if (OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_RECEIVE))
+		corrupt_bulk_data(desc);
+
+	return rc;
+}
+
+/* Client context operations implemented by the plain policy. */
+static struct ptlrpc_ctx_ops plain_ctx_ops = {
+	.refresh		= plain_ctx_refresh,
+	.validate	       = plain_ctx_validate,
+	.sign		   = plain_ctx_sign,
+	.verify		 = plain_ctx_verify,
+	.wrap_bulk	      = plain_cli_wrap_bulk,
+	.unwrap_bulk	    = plain_cli_unwrap_bulk,
+};
+
+/*
+ * Client-side security operations of the plain policy: sec and context
+ * life cycle plus request/reply buffer management.
+ */
+static struct ptlrpc_sec_cops plain_sec_cops = {
+	.create_sec	     = plain_create_sec,
+	.destroy_sec	    = plain_destroy_sec,
+	.kill_sec	       = plain_kill_sec,
+	.lookup_ctx	     = plain_lookup_ctx,
+	.release_ctx	    = plain_release_ctx,
+	.flush_ctx_cache	= plain_flush_ctx_cache,
+	.alloc_reqbuf	   = plain_alloc_reqbuf,
+	.free_reqbuf	    = plain_free_reqbuf,
+	.alloc_repbuf	   = plain_alloc_repbuf,
+	.free_repbuf	    = plain_free_repbuf,
+	.enlarge_reqbuf	 = plain_enlarge_reqbuf,
+};
+
+/*
+ * Server-side security operations of the plain policy: request acceptance,
+ * reply state management, reply authorization and bulk wrap/unwrap.
+ */
+static struct ptlrpc_sec_sops plain_sec_sops = {
+	.accept		 = plain_accept,
+	.alloc_rs	       = plain_alloc_rs,
+	.authorize	      = plain_authorize,
+	.free_rs		= plain_free_rs,
+	.unwrap_bulk	    = plain_svc_unwrap_bulk,
+	.wrap_bulk	      = plain_svc_wrap_bulk,
+};
+
+/*
+ * Policy descriptor of the "plain" policy; registered by
+ * sptlrpc_plain_init() and unregistered by sptlrpc_plain_fini().
+ */
+static struct ptlrpc_sec_policy plain_policy = {
+	.sp_owner	       = THIS_MODULE,
+	.sp_name		= "plain",
+	.sp_policy	      = SPTLRPC_POLICY_PLAIN,
+	.sp_cops		= &plain_sec_cops,
+	.sp_sops		= &plain_sec_sops,
+};
+
+/*
+ * Set up the plain policy: compute plain_at_offset as the size of a plain
+ * packed message whose message segment has the early-reply size, then
+ * register the policy.
+ *
+ * \retval 0		success
+ * \retval other	error returned by sptlrpc_register_policy()
+ */
+int sptlrpc_plain_init(void)
+{
+	__u32 seglens[PLAIN_PACK_SEGMENTS] = { 0, };
+	int rc;
+
+	seglens[PLAIN_PACK_MSG_OFF] = lustre_msg_early_size();
+	plain_at_offset = lustre_msg_size_v2(PLAIN_PACK_SEGMENTS, seglens);
+
+	rc = sptlrpc_register_policy(&plain_policy);
+	if (rc == 0)
+		return 0;
+
+	CERROR("failed to register: %d\n", rc);
+	return rc;
+}
+
+/* Unregister the plain policy; a failure is only logged. */
+void sptlrpc_plain_fini(void)
+{
+	int rc = sptlrpc_unregister_policy(&plain_policy);
+
+	if (rc != 0)
+		CERROR("cannot unregister: %d\n", rc);
+}

diff --git a/drivers/staging/lustre/lustre/ptlrpc/service.c b/drivers/staging/lustre/lustre/ptlrpc/service.c
new file mode 100644
index 0000000..1667b8e
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/service.c

@@ -0,0 +1,3129 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2010, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+#include <obd_support.h>
+#include <obd_class.h>
+#include <lustre_net.h>
+#include <lu_object.h>
+#include <linux/lnet/types.h>
+#include "ptlrpc_internal.h"
+
+/*
+ * Module parameters of ptlrpc. test_req_buffer_pressure is registered with
+ * mode 0444, the adaptive timeout (at_*) parameters with mode 0644.
+ * NOTE(review): presumably they show up under /sys/module/ptlrpc and only
+ * the 0644 ones can be changed at runtime - confirm against the definition
+ * of CFS_MODULE_PARM.
+ */
+int test_req_buffer_pressure = 0;
+CFS_MODULE_PARM(test_req_buffer_pressure, "i", int, 0444,
+		"set non-zero to put pressure on request buffer pools");
+CFS_MODULE_PARM(at_min, "i", int, 0644,
+		"Adaptive timeout minimum (sec)");
+CFS_MODULE_PARM(at_max, "i", int, 0644,
+		"Adaptive timeout maximum (sec)");
+CFS_MODULE_PARM(at_history, "i", int, 0644,
+		"Adaptive timeouts remember the slowest event that took place "
+		"within this period (sec)");
+CFS_MODULE_PARM(at_early_margin, "i", int, 0644,
+		"How soon before an RPC deadline to send an early reply");
+CFS_MODULE_PARM(at_extra, "i", int, 0644,
+		"How much extra time to give with each early reply");
+
+
+/* forward ref */
+static int ptlrpc_server_post_idle_rqbds(struct ptlrpc_service_part *svcpt);
+static void ptlrpc_server_hpreq_fini(struct ptlrpc_request *req);
+static void ptlrpc_at_remove_timed(struct ptlrpc_request *req);
+
+/** Holds a list of all PTLRPC services */
+LIST_HEAD(ptlrpc_all_services);
+/** Used to protect the \e ptlrpc_all_services list */
+struct mutex ptlrpc_all_services_mutex;
+
+/*
+ * Allocate a request buffer descriptor together with its srv_buf_size byte
+ * buffer for \a svcpt, and link it on the partition's idle list.
+ *
+ * \retval the new descriptor, or NULL when either allocation fails
+ */
+struct ptlrpc_request_buffer_desc *
+ptlrpc_alloc_rqbd(struct ptlrpc_service_part *svcpt)
+{
+	struct ptlrpc_service		  *service = svcpt->scp_service;
+	struct ptlrpc_request_buffer_desc *desc;
+
+	OBD_CPT_ALLOC_PTR(desc, service->srv_cptable, svcpt->scp_cpt);
+	if (desc == NULL)
+		return NULL;
+
+	OBD_CPT_ALLOC_LARGE(desc->rqbd_buffer, service->srv_cptable,
+			    svcpt->scp_cpt, service->srv_buf_size);
+	if (desc->rqbd_buffer == NULL) {
+		OBD_FREE_PTR(desc);
+		return NULL;
+	}
+
+	desc->rqbd_svcpt = svcpt;
+	desc->rqbd_refcount = 0;
+	desc->rqbd_cbid.cbid_fn = request_in_callback;
+	desc->rqbd_cbid.cbid_arg = desc;
+	INIT_LIST_HEAD(&desc->rqbd_reqs);
+
+	/* publish the descriptor as idle and account for it */
+	spin_lock(&svcpt->scp_lock);
+	list_add(&desc->rqbd_list, &svcpt->scp_rqbd_idle);
+	svcpt->scp_nrqbds_total++;
+	spin_unlock(&svcpt->scp_lock);
+
+	return desc;
+}
+
+/*
+ * Unlink \a rqbd from whatever partition list it is on and free it together
+ * with its buffer. The descriptor must be unreferenced and must not hold
+ * any request.
+ */
+void
+ptlrpc_free_rqbd(struct ptlrpc_request_buffer_desc *rqbd)
+{
+	struct ptlrpc_service_part *part = rqbd->rqbd_svcpt;
+
+	LASSERT(rqbd->rqbd_refcount == 0);
+	LASSERT(list_empty(&rqbd->rqbd_reqs));
+
+	spin_lock(&part->scp_lock);
+	part->scp_nrqbds_total--;
+	list_del(&rqbd->rqbd_list);
+	spin_unlock(&part->scp_lock);
+
+	OBD_FREE_LARGE(rqbd->rqbd_buffer, part->scp_service->srv_buf_size);
+	OBD_FREE_PTR(rqbd);
+}
+
+/*
+ * Allocate up to srv_nbuf_per_group new request buffers for \a svcpt and,
+ * when \a post is non-zero, post the idle ones.
+ *
+ * Only one thread allocates at a time: scp_rqbd_allocating is tested
+ * without the lock first and again under scp_lock; a thread that finds it
+ * set skips the allocation and goes straight to posting.
+ *
+ * \retval -ENOMEM	a buffer could not be allocated (nothing is posted)
+ * \retval 0		buffers allocated and \a post is zero
+ * \retval other	result of ptlrpc_server_post_idle_rqbds() when
+ *			\a post is non-zero
+ */
+int
+ptlrpc_grow_req_bufs(struct ptlrpc_service_part *svcpt, int post)
+{
+	struct ptlrpc_service		  *svc = svcpt->scp_service;
+	struct ptlrpc_request_buffer_desc *rqbd;
+	int				rc = 0;
+	int				i;
+
+	/* cheap unlocked check, repeated under the lock below */
+	if (svcpt->scp_rqbd_allocating)
+		goto try_post;
+
+	spin_lock(&svcpt->scp_lock);
+	/* check again with lock */
+	if (svcpt->scp_rqbd_allocating) {
+		/* NB: we might allow more than one thread in the future */
+		LASSERT(svcpt->scp_rqbd_allocating == 1);
+		spin_unlock(&svcpt->scp_lock);
+		goto try_post;
+	}
+
+	svcpt->scp_rqbd_allocating++;
+	spin_unlock(&svcpt->scp_lock);
+
+
+	for (i = 0; i < svc->srv_nbuf_per_group; i++) {
+		/* NB: another thread might have recycled enough rqbds, we
+		 * need to make sure it wouldn't over-allocate, see LU-1212. */
+		if (svcpt->scp_nrqbds_posted >= svc->srv_nbuf_per_group)
+			break;
+
+		rqbd = ptlrpc_alloc_rqbd(svcpt);
+
+		if (rqbd == NULL) {
+			CERROR("%s: Can't allocate request buffer\n",
+			       svc->srv_name);
+			rc = -ENOMEM;
+			break;
+		}
+	}
+
+	/* allocation finished: let other threads allocate again */
+	spin_lock(&svcpt->scp_lock);
+
+	LASSERT(svcpt->scp_rqbd_allocating == 1);
+	svcpt->scp_rqbd_allocating--;
+
+	spin_unlock(&svcpt->scp_lock);
+
+	CDEBUG(D_RPCTRACE,
+	       "%s: allocate %d new %d-byte reqbufs (%d/%d left), rc = %d\n",
+	       svc->srv_name, i, svc->srv_buf_size, svcpt->scp_nrqbds_posted,
+	       svcpt->scp_nrqbds_total, rc);
+
+ try_post:
+	/* rc is still 0 here when the allocation was skipped */
+	if (post && rc == 0)
+		rc = ptlrpc_server_post_idle_rqbds(svcpt);
+
+	return rc;
+}
+
+/**
+ * Part of Rep-Ack logic.
+ * Puts a lock and its mode into the reply state associated with the request
+ * reply and marks that reply state as difficult. If the export is already
+ * disconnected the lock reference is dropped right away instead.
+ */
+void
+ptlrpc_save_lock(struct ptlrpc_request *req,
+		 struct lustre_handle *lock, int mode, int no_ack)
+{
+	struct ptlrpc_reply_state *rs = req->rq_reply_state;
+	int			slot;
+
+	LASSERT(rs != NULL);
+	LASSERT(rs->rs_nlocks < RS_MAX_LOCKS);
+
+	if (req->rq_export->exp_disconnected) {
+		ldlm_lock_decref(lock, mode);
+		return;
+	}
+
+	/* take the next free slot of the reply state */
+	slot = rs->rs_nlocks++;
+	rs->rs_locks[slot] = *lock;
+	rs->rs_modes[slot] = mode;
+	rs->rs_difficult = 1;
+	rs->rs_no_ack = !!no_ack;
+}
+EXPORT_SYMBOL(ptlrpc_save_lock);
+
+
+struct ptlrpc_hr_partition;
+
+/*
+ * One reply handling thread. Reply states are queued on hrt_queue under
+ * hrt_lock and the thread is then woken through hrt_waitq.
+ */
+struct ptlrpc_hr_thread {
+	int				hrt_id;		/* thread ID */
+	spinlock_t			hrt_lock;
+	wait_queue_head_t			hrt_waitq;
+	struct list_head			hrt_queue;	/* RS queue */
+	struct ptlrpc_hr_partition	*hrt_partition;
+};
+
+/*
+ * Reply handling threads of one CPU partition; ptlrpc_hr_select() picks a
+ * thread from hrp_thrs in round-robin order using hrp_rotor.
+ */
+struct ptlrpc_hr_partition {
+	/* # of started threads */
+	atomic_t			hrp_nstarted;
+	/* # of stopped threads */
+	atomic_t			hrp_nstopped;
+	/* cpu partition id */
+	int				hrp_cpt;
+	/* round-robin rotor for choosing thread */
+	int				hrp_rotor;
+	/* total number of threads on this partition */
+	int				hrp_nthrs;
+	/* threads table */
+	struct ptlrpc_hr_thread		*hrp_thrs;
+};
+
+#define HRT_RUNNING 0
+#define HRT_STOPPING 1
+
+/*
+ * Global state of the reply handling service: one ptlrpc_hr_partition per
+ * entry of hr_partitions, indexed by CPU partition id in ptlrpc_hr_select().
+ */
+struct ptlrpc_hr_service {
+	/* CPU partition table, it's just cfs_cpt_table for now */
+	struct cfs_cpt_table		*hr_cpt_table;
+	/** controller sleep waitq */
+	wait_queue_head_t			hr_waitq;
+	/* NOTE(review): presumably set while the service shuts down - confirm */
+	unsigned int			hr_stopping;
+	/** roundrobin rotor for non-affinity service */
+	unsigned int			hr_rotor;
+	/* partition data */
+	struct ptlrpc_hr_partition	**hr_partitions;
+};
+
+/*
+ * Batch of reply states collected for a single service partition before
+ * they are handed to a reply handling thread in one go.
+ */
+struct rs_batch {
+	/* reply states gathered so far */
+	struct list_head			rsb_replies;
+	/* number of entries on rsb_replies */
+	unsigned int			rsb_n_replies;
+	/* partition the batch belongs to; its scp_rep_lock is held while set */
+	struct ptlrpc_service_part	*rsb_svcpt;
+};
+
+/** reply handling service. */
+static struct ptlrpc_hr_service		ptlrpc_hr;
+
+/**
+ * maximum number of replies scheduled in one batch
+ */
+#define MAX_SCHEDULED 256
+
+/**
+ * Initialize a reply batch.
+ *
+ * \param b batch
+ */
+static void rs_batch_init(struct rs_batch *b)
+{
+	/* start from an all-zero batch: no partition, no replies */
+	memset(b, 0, sizeof(*b));
+	INIT_LIST_HEAD(&b->rsb_replies);
+}
+
+/**
+ * Choose an hr thread to dispatch requests to.
+ */
+static struct ptlrpc_hr_thread *
+ptlrpc_hr_select(struct ptlrpc_service_part *svcpt)
+{
+	struct ptlrpc_hr_partition	*part;
+	unsigned int			idx;
+
+	if (svcpt->scp_cpt < 0 ||
+	    svcpt->scp_service->srv_cptable != ptlrpc_hr.hr_cpt_table) {
+		/* no matching partition: rotate over all of them */
+		idx = ptlrpc_hr.hr_rotor++;
+		idx %= cfs_cpt_number(ptlrpc_hr.hr_cpt_table);
+
+		part = ptlrpc_hr.hr_partitions[idx];
+	} else {
+		/* directly match partition */
+		part = ptlrpc_hr.hr_partitions[svcpt->scp_cpt];
+	}
+
+	/* round-robin over the threads of the chosen partition */
+	idx = part->hrp_rotor++;
+	return &part->hrp_thrs[idx % part->hrp_nthrs];
+}
+
+/**
+ * Dispatch all replies accumulated in the batch to one from
+ * dedicated reply handling threads.
+ *
+ * \param b batch
+ */
+static void rs_batch_dispatch(struct rs_batch *b)
+{
+	struct ptlrpc_hr_thread *thread;
+
+	/* nothing collected, nothing to hand over */
+	if (b->rsb_n_replies == 0)
+		return;
+
+	thread = ptlrpc_hr_select(b->rsb_svcpt);
+
+	/* move the whole batch onto the thread's queue in one step */
+	spin_lock(&thread->hrt_lock);
+	list_splice_init(&b->rsb_replies, &thread->hrt_queue);
+	spin_unlock(&thread->hrt_lock);
+
+	wake_up(&thread->hrt_waitq);
+	b->rsb_n_replies = 0;
+}
+
+/**
+ * Add a reply to a batch.
+ * Add one reply object to a batch, schedule batched replies if overload.
+ *
+ * \param b batch
+ * \param rs reply
+ */
+static void rs_batch_add(struct rs_batch *b, struct ptlrpc_reply_state *rs)
+{
+	struct ptlrpc_service_part *part = rs->rs_svcpt;
+	int restart = part != b->rsb_svcpt ||
+		      b->rsb_n_replies >= MAX_SCHEDULED;
+
+	if (restart) {
+		/* flush what was gathered and release the old partition */
+		if (b->rsb_svcpt != NULL) {
+			rs_batch_dispatch(b);
+			spin_unlock(&b->rsb_svcpt->scp_rep_lock);
+		}
+		/* this lock stays held until the next restart or
+		 * rs_batch_fini() */
+		spin_lock(&part->scp_rep_lock);
+		b->rsb_svcpt = part;
+	}
+
+	spin_lock(&rs->rs_lock);
+	rs->rs_scheduled_ever = 1;
+	if (!rs->rs_scheduled) {
+		list_move(&rs->rs_list, &b->rsb_replies);
+		rs->rs_scheduled = 1;
+		b->rsb_n_replies++;
+	}
+	rs->rs_committed = 1;
+	spin_unlock(&rs->rs_lock);
+}
+
+/**
+ * Reply batch finalization.
+ * Dispatch remaining replies from the batch
+ * and release remaining spinlock.
+ *
+ * \param b batch
+ */
+static void rs_batch_fini(struct rs_batch *b)
+{
+	/* a batch that never saw a reply holds no lock */
+	if (b->rsb_svcpt == NULL)
+		return;
+
+	rs_batch_dispatch(b);
+	spin_unlock(&b->rsb_svcpt->scp_rep_lock);
+}
+
+#define DECLARE_RS_BATCH(b)     struct rs_batch b
+
+
+/**
+ * Put reply state into a queue for processing because we received
+ * ACK from the client
+ */
+void ptlrpc_dispatch_difficult_reply(struct ptlrpc_reply_state *rs)
+{
+	struct ptlrpc_hr_thread *thread;
+	ENTRY;
+
+	/* the reply state must not be linked on any list at this point */
+	LASSERT(list_empty(&rs->rs_list));
+
+	thread = ptlrpc_hr_select(rs->rs_svcpt);
+
+	/* append to the chosen thread's queue, then wake it up */
+	spin_lock(&thread->hrt_lock);
+	list_add_tail(&rs->rs_list, &thread->hrt_queue);
+	spin_unlock(&thread->hrt_lock);
+
+	wake_up(&thread->hrt_waitq);
+	EXIT;
+}
+
+/*
+ * Hand a difficult reply state over to a reply handling thread unless that
+ * has already been done.
+ *
+ * The caller must hold both rs->rs_svcpt->scp_rep_lock and rs->rs_lock.
+ * rs_scheduled_ever is set on every call; the reply state is unlinked from
+ * its current list and dispatched only if rs_scheduled was not yet set.
+ */
+void
+ptlrpc_schedule_difficult_reply(struct ptlrpc_reply_state *rs)
+{
+	ENTRY;
+
+	/* assert_spin_locked() rather than LASSERT(spin_is_locked()):
+	 * spin_is_locked() may evaluate to 0 on uniprocessor builds even
+	 * though the lock is held, which would trip the assertion */
+	assert_spin_locked(&rs->rs_svcpt->scp_rep_lock);
+	assert_spin_locked(&rs->rs_lock);
+	LASSERT(rs->rs_difficult);
+	rs->rs_scheduled_ever = 1;  /* flag any notification attempt */
+
+	if (rs->rs_scheduled) {     /* being set up or already notified */
+		EXIT;
+		return;
+	}
+
+	rs->rs_scheduled = 1;
+	/* ptlrpc_dispatch_difficult_reply() expects an unlinked rs_list */
+	list_del_init(&rs->rs_list);
+	ptlrpc_dispatch_difficult_reply(rs);
+	EXIT;
+}
+EXPORT_SYMBOL(ptlrpc_schedule_difficult_reply);
+
+/*
+ * Walk the uncommitted replies of \a exp and batch every reply state whose
+ * transno is not greater than exp->exp_last_committed for dispatch to the
+ * reply handling threads.
+ */
+void ptlrpc_commit_replies(struct obd_export *exp)
+{
+	struct ptlrpc_reply_state *cur, *next;
+	DECLARE_RS_BATCH(pending);
+	ENTRY;
+
+	rs_batch_init(&pending);
+
+	/* CAVEAT EMPTOR: spinlock ordering!!! */
+	spin_lock(&exp->exp_uncommitted_replies_lock);
+	list_for_each_entry_safe(cur, next, &exp->exp_uncommitted_replies,
+				 rs_obd_list) {
+		LASSERT(cur->rs_difficult);
+		/* VBR: per-export last_committed */
+		LASSERT(cur->rs_export);
+
+		/* not committed yet, leave it on the export's list */
+		if (cur->rs_transno > exp->exp_last_committed)
+			continue;
+
+		list_del_init(&cur->rs_obd_list);
+		rs_batch_add(&pending, cur);
+	}
+	spin_unlock(&exp->exp_uncommitted_replies_lock);
+
+	/* dispatch what is left and drop the partition lock, if any */
+	rs_batch_fini(&pending);
+	EXIT;
+}
+EXPORT_SYMBOL(ptlrpc_commit_replies);
+
+/*
+ * Post every idle request buffer of \a svcpt.
+ *
+ * Each idle rqbd is moved to the posted list (and counted as posted) before
+ * ptlrpc_register_rqbd() is called without the lock held; when registration
+ * fails that accounting is undone and the rqbd goes back to the tail of the
+ * idle list.
+ *
+ * \retval 1	the idle list was drained and at least one rqbd was posted
+ * \retval 0	the idle list was empty from the start
+ * \retval -1	registering an rqbd failed
+ */
+static int
+ptlrpc_server_post_idle_rqbds(struct ptlrpc_service_part *svcpt)
+{
+	struct ptlrpc_request_buffer_desc *rqbd;
+	int				  rc;
+	int				  posted = 0;
+
+	for (;;) {
+		spin_lock(&svcpt->scp_lock);
+
+		if (list_empty(&svcpt->scp_rqbd_idle)) {
+			spin_unlock(&svcpt->scp_lock);
+			return posted;
+		}
+
+		rqbd = list_entry(svcpt->scp_rqbd_idle.next,
+				      struct ptlrpc_request_buffer_desc,
+				      rqbd_list);
+		list_del(&rqbd->rqbd_list);
+
+		/* assume we will post successfully */
+		svcpt->scp_nrqbds_posted++;
+		list_add(&rqbd->rqbd_list, &svcpt->scp_rqbd_posted);
+
+		spin_unlock(&svcpt->scp_lock);
+
+		rc = ptlrpc_register_rqbd(rqbd);
+		if (rc != 0)
+			break;
+
+		posted = 1;
+	}
+
+	/* registration failed: roll back the optimistic accounting above */
+	spin_lock(&svcpt->scp_lock);
+
+	svcpt->scp_nrqbds_posted--;
+	list_del(&rqbd->rqbd_list);
+	list_add_tail(&rqbd->rqbd_list, &svcpt->scp_rqbd_idle);
+
+	/* Don't complain if no request buffers are posted right now; LNET
+	 * won't drop requests because we set the portal lazy! */
+
+	spin_unlock(&svcpt->scp_lock);
+
+	return -1;
+}
+
+/*
+ * Adaptive timeout timer callback. \a castmeharder is the service partition
+ * pointer passed as the timer argument: flag that an AT check is due, record
+ * the current time and wake up the partition's wait queue.
+ */
+static void ptlrpc_at_timer(unsigned long castmeharder)
+{
+	struct ptlrpc_service_part *part =
+		(struct ptlrpc_service_part *)castmeharder;
+
+	part->scp_at_check = 1;
+	part->scp_at_checktime = cfs_time_current();
+	wake_up(&part->scp_waitq);
+}
+
+/*
+ * Work out how many threads each partition of \a svc starts with
+ * (srv_nthrs_cpt_init) and may grow to (srv_nthrs_cpt_limit) from the
+ * thread configuration in \a conf.
+ *
+ * A user supplied thread count (tc_nthrs_user) takes precedence and is
+ * capped at 8 * tc_nthrs_max; otherwise the limit is derived from
+ * tc_nthrs_max, tc_nthrs_base and tc_thr_factor. A debug message is
+ * emitted when the resulting total exceeds tc_nthrs_max.
+ */
+static void
+ptlrpc_server_nthreads_check(struct ptlrpc_service *svc,
+			     struct ptlrpc_service_conf *conf)
+{
+	struct ptlrpc_service_thr_conf	*tc = &conf->psc_thr;
+	unsigned			init;
+	unsigned			total;
+	unsigned			nthrs;
+	int				weight;
+
+	/*
+	 * Common code for estimating & validating threads number.
+	 * CPT affinity service could have percpt thread-pool instead
+	 * of a global thread-pool, which means user might not always
+	 * get the threads number they give it in conf::tc_nthrs_user
+	 * even they did set. It's because we need to validate threads
+	 * number for each CPT to guarantee each pool will have enough
+	 * threads to keep the service healthy.
+	 */
+	/* one extra initial thread when a high-priority handler is set */
+	init = PTLRPC_NTHRS_INIT + (svc->srv_ops.so_hpreq_handler != NULL);
+	init = max_t(int, init, tc->tc_nthrs_init);
+
+	/* NB: please see comments in lustre_lnet.h for definition
+	 * details of these members */
+	LASSERT(tc->tc_nthrs_max != 0);
+
+	if (tc->tc_nthrs_user != 0) {
+		/* In case there is a reason to test a service with many
+		 * threads, we give a less strict check here, it can
+		 * be up to 8 * nthrs_max */
+		total = min(tc->tc_nthrs_max * 8, tc->tc_nthrs_user);
+		nthrs = total / svc->srv_ncpts;
+		init  = max(init, nthrs);
+		goto out;
+	}
+
+	total = tc->tc_nthrs_max;
+	if (tc->tc_nthrs_base == 0) {
+		/* don't care about base threads number per partition,
+		 * this is most for non-affinity service */
+		nthrs = total / svc->srv_ncpts;
+		goto out;
+	}
+
+	nthrs = tc->tc_nthrs_base;
+	if (svc->srv_ncpts == 1) {
+		int	i;
+
+		/* NB: Increase the base number if it's single partition
+		 * and total number of cores/HTs is larger or equal to 4.
+		 * result will always < 2 * nthrs_base */
+		weight = cfs_cpt_weight(svc->srv_cptable, CFS_CPT_ANY);
+		for (i = 1; (weight >> (i + 1)) != 0 && /* >= 4 cores/HTs */
+			    (tc->tc_nthrs_base >> i) != 0; i++)
+			nthrs += tc->tc_nthrs_base >> i;
+	}
+
+	if (tc->tc_thr_factor != 0) {
+		int	  factor = tc->tc_thr_factor;
+		const int fade = 4;
+
+		/*
+		 * User wants to increase number of threads with for
+		 * each CPU core/HT, most likely the factor is larger then
+		 * one thread/core because service threads are supposed to
+		 * be blocked by lock or wait for IO.
+		 */
+		/*
+		 * Amdahl's law says that adding processors wouldn't give
+		 * a linear increasing of parallelism, so it's nonsense to
+		 * have too many threads no matter how many cores/HTs
+		 * there are.
+		 */
+		if (cfs_cpu_ht_nsiblings(0) > 1) { /* weight is # of HTs */
+			/* depress thread factor for hyper-thread */
+			factor = factor - (factor >> 1) + (factor >> 3);
+		}
+
+		weight = cfs_cpt_weight(svc->srv_cptable, 0);
+		LASSERT(weight > 0);
+
+		/* each further group of "fade" cores adds threads with a
+		 * factor that is one lower than for the previous group */
+		for (; factor > 0 && weight > 0; factor--, weight -= fade)
+			nthrs += min(weight, fade) * factor;
+	}
+
+	/* over the soft limit: fall back to the larger of the base number
+	 * and an even share of tc_nthrs_max */
+	if (nthrs * svc->srv_ncpts > tc->tc_nthrs_max) {
+		nthrs = max(tc->tc_nthrs_base,
+			    tc->tc_nthrs_max / svc->srv_ncpts);
+	}
+ out:
+	/* the limit is never below the configured initial number */
+	nthrs = max(nthrs, tc->tc_nthrs_init);
+	svc->srv_nthrs_cpt_limit = nthrs;
+	svc->srv_nthrs_cpt_init = init;
+
+	if (nthrs * svc->srv_ncpts > tc->tc_nthrs_max) {
+		CDEBUG(D_OTHER, "%s: This service may have more threads (%d) "
+		       "than the given soft limit (%d)\n",
+		       svc->srv_name, nthrs * svc->srv_ncpts,
+		       tc->tc_nthrs_max);
+	}
+}
+
+/**
+ * Initialize percpt data for a service
+ *
+ * Sets up the locks, lists and wait queues of \a svcpt, allocates the
+ * adaptive timeout arrays (sized from at_max), initialises the AT timer and
+ * estimate, and allocates - without posting - the request buffers.
+ *
+ * \param svc	service the partition belongs to
+ * \param svcpt	partition to initialise
+ * \param cpt	CPU partition id recorded in the partition and used for the
+ *		CPT-aware allocations
+ *
+ * \retval 0		success
+ * \retval -ENOMEM	an allocation or ptlrpc_grow_req_bufs() failed; the
+ *			AT arrays allocated here have been freed again
+ */
+static int
+ptlrpc_service_part_init(struct ptlrpc_service *svc,
+			 struct ptlrpc_service_part *svcpt, int cpt)
+{
+	struct ptlrpc_at_array	*array;
+	int			size;
+	int			index;
+	int			rc;
+
+	svcpt->scp_cpt = cpt;
+	INIT_LIST_HEAD(&svcpt->scp_threads);
+
+	/* rqbd and incoming request queue */
+	spin_lock_init(&svcpt->scp_lock);
+	INIT_LIST_HEAD(&svcpt->scp_rqbd_idle);
+	INIT_LIST_HEAD(&svcpt->scp_rqbd_posted);
+	INIT_LIST_HEAD(&svcpt->scp_req_incoming);
+	init_waitqueue_head(&svcpt->scp_waitq);
+	/* history request & rqbd list */
+	INIT_LIST_HEAD(&svcpt->scp_hist_reqs);
+	INIT_LIST_HEAD(&svcpt->scp_hist_rqbds);
+
+	/* active requests and hp requests */
+	spin_lock_init(&svcpt->scp_req_lock);
+
+	/* reply states */
+	spin_lock_init(&svcpt->scp_rep_lock);
+	INIT_LIST_HEAD(&svcpt->scp_rep_active);
+	INIT_LIST_HEAD(&svcpt->scp_rep_idle);
+	init_waitqueue_head(&svcpt->scp_rep_waitq);
+	atomic_set(&svcpt->scp_nreps_difficult, 0);
+
+	/* adaptive timeout */
+	spin_lock_init(&svcpt->scp_at_lock);
+	array = &svcpt->scp_at_array;
+
+	size = at_est2timeout(at_max);
+	array->paa_size     = size;
+	array->paa_count    = 0;
+	array->paa_deadline = -1;
+
+	/* allocate memory for scp_at_array (ptlrpc_at_array) */
+	OBD_CPT_ALLOC(array->paa_reqs_array,
+		      svc->srv_cptable, cpt, sizeof(struct list_head) * size);
+	if (array->paa_reqs_array == NULL)
+		return -ENOMEM;
+
+	for (index = 0; index < size; index++)
+		INIT_LIST_HEAD(&array->paa_reqs_array[index]);
+
+	OBD_CPT_ALLOC(array->paa_reqs_count,
+		      svc->srv_cptable, cpt, sizeof(__u32) * size);
+	if (array->paa_reqs_count == NULL)
+		goto failed;
+
+	cfs_timer_init(&svcpt->scp_at_timer, ptlrpc_at_timer, svcpt);
+	/* At SOW, service time should be quick; 10s seems generous. If client
+	 * timeout is less than this, we'll be sending an early reply. */
+	at_init(&svcpt->scp_at_estimate, 10, 0);
+
+	/* assign this before call ptlrpc_grow_req_bufs */
+	svcpt->scp_service = svc;
+	/* Now allocate the request buffers, but don't post them now */
+	rc = ptlrpc_grow_req_bufs(svcpt, 0);
+	/* We shouldn't be under memory pressure at startup, so
+	 * fail if we can't allocate all our buffers at this time. */
+	if (rc != 0)
+		goto failed;
+
+	return 0;
+
+ failed:
+	/* only the AT arrays are released here; request buffers that were
+	 * already allocated stay on the partition's idle list */
+	if (array->paa_reqs_count != NULL) {
+		OBD_FREE(array->paa_reqs_count, sizeof(__u32) * size);
+		array->paa_reqs_count = NULL;
+	}
+
+	if (array->paa_reqs_array != NULL) {
+		OBD_FREE(array->paa_reqs_array,
+			 sizeof(struct list_head) * array->paa_size);
+		array->paa_reqs_array = NULL;
+	}
+
+	return -ENOMEM;
+}
+
+/**
+ * Initialize service on a given portal.
+ * This includes starting serving threads , allocating and posting rqbds and
+ * so on.
+ *
+ * \param conf		service configuration (name, buffers, threads, CPT
+ *			pattern, operations)
+ * \param proc_entry	parent procfs entry the service is registered
+ *			under, may be NULL
+ *
+ * \retval the new service on success
+ * \retval ERR_PTR(-EINVAL)	the CPT pattern could not be parsed
+ * \retval ERR_PTR(-ENOMEM)	an allocation failed
+ * \retval ERR_PTR(rc)		any other setup step failed with \a rc
+ */
+struct ptlrpc_service *
+ptlrpc_register_service(struct ptlrpc_service_conf *conf,
+			proc_dir_entry_t *proc_entry)
+{
+	struct ptlrpc_service_cpt_conf	*cconf = &conf->psc_cpt;
+	struct ptlrpc_service		*service;
+	struct ptlrpc_service_part	*svcpt;
+	struct cfs_cpt_table		*cptable;
+	__u32				*cpts = NULL;
+	int				ncpts;
+	int				cpt;
+	int				rc;
+	int				i;
+	ENTRY;
+
+	LASSERT(conf->psc_buf.bc_nbufs > 0);
+	LASSERT(conf->psc_buf.bc_buf_size >=
+		conf->psc_buf.bc_req_max_size + SPTLRPC_MAX_PAYLOAD);
+	LASSERT(conf->psc_thr.tc_ctx_tags != 0);
+
+	cptable = cconf->cc_cptable;
+	if (cptable == NULL)
+		cptable = cfs_cpt_table;
+
+	if (!conf->psc_thr.tc_cpu_affinity) {
+		/* no affinity: a single partition serves everything */
+		ncpts = 1;
+	} else {
+		ncpts = cfs_cpt_number(cptable);
+		if (cconf->cc_pattern != NULL) {
+			struct cfs_expr_list	*el;
+
+			rc = cfs_expr_list_parse(cconf->cc_pattern,
+						 strlen(cconf->cc_pattern),
+						 0, ncpts - 1, &el);
+			if (rc != 0) {
+				CERROR("%s: invalid CPT pattern string: %s\n",
+				       conf->psc_name, cconf->cc_pattern);
+				RETURN(ERR_PTR(-EINVAL));
+			}
+
+			rc = cfs_expr_list_values(el, ncpts, &cpts);
+			cfs_expr_list_free(el);
+			if (rc <= 0) {
+				CERROR("%s: failed to parse CPT array %s: %d\n",
+				       conf->psc_name, cconf->cc_pattern, rc);
+				if (cpts != NULL)
+					OBD_FREE(cpts, sizeof(*cpts) * ncpts);
+				RETURN(ERR_PTR(rc < 0 ? rc : -EINVAL));
+			}
+			/* number of partitions selected by the pattern */
+			ncpts = rc;
+		}
+	}
+
+	OBD_ALLOC(service, offsetof(struct ptlrpc_service, srv_parts[ncpts]));
+	if (service == NULL) {
+		if (cpts != NULL)
+			OBD_FREE(cpts, sizeof(*cpts) * ncpts);
+		RETURN(ERR_PTR(-ENOMEM));
+	}
+
+	service->srv_cptable		= cptable;
+	service->srv_cpts		= cpts;
+	service->srv_ncpts		= ncpts;
+
+	service->srv_cpt_bits = 0; /* it's zero already, easy to read... */
+	while ((1 << service->srv_cpt_bits) < cfs_cpt_number(cptable))
+		service->srv_cpt_bits++;
+
+	/* public members */
+	spin_lock_init(&service->srv_lock);
+	service->srv_name		= conf->psc_name;
+	service->srv_watchdog_factor	= conf->psc_watchdog_factor;
+	INIT_LIST_HEAD(&service->srv_list); /* for safety of cleanup */
+
+	/* buffer configuration */
+	service->srv_nbuf_per_group	= test_req_buffer_pressure ?
+					  1 : conf->psc_buf.bc_nbufs;
+	service->srv_max_req_size	= conf->psc_buf.bc_req_max_size +
+					  SPTLRPC_MAX_PAYLOAD;
+	service->srv_buf_size		= conf->psc_buf.bc_buf_size;
+	service->srv_rep_portal		= conf->psc_buf.bc_rep_portal;
+	service->srv_req_portal		= conf->psc_buf.bc_req_portal;
+
+	/* Increase max reply size to next power of two */
+	service->srv_max_reply_size = 1;
+	while (service->srv_max_reply_size <
+	       conf->psc_buf.bc_rep_max_size + SPTLRPC_MAX_PAYLOAD)
+		service->srv_max_reply_size <<= 1;
+
+	service->srv_thread_name	= conf->psc_thr.tc_thr_name;
+	service->srv_ctx_tags		= conf->psc_thr.tc_ctx_tags;
+	service->srv_hpreq_ratio	= PTLRPC_SVC_HP_RATIO;
+	service->srv_ops		= conf->psc_ops;
+
+	/* allocate and initialise one service partition per CPT */
+	for (i = 0; i < ncpts; i++) {
+		if (!conf->psc_thr.tc_cpu_affinity)
+			cpt = CFS_CPT_ANY;
+		else
+			cpt = cpts != NULL ? cpts[i] : i;
+
+		OBD_CPT_ALLOC(svcpt, cptable, cpt, sizeof(*svcpt));
+		if (svcpt == NULL)
+			GOTO(failed, rc = -ENOMEM);
+
+		service->srv_parts[i] = svcpt;
+		rc = ptlrpc_service_part_init(service, svcpt, cpt);
+		if (rc != 0)
+			GOTO(failed, rc);
+	}
+
+	ptlrpc_server_nthreads_check(service, conf);
+
+	rc = LNetSetLazyPortal(service->srv_req_portal);
+	LASSERT(rc == 0);
+
+	mutex_lock(&ptlrpc_all_services_mutex);
+	list_add (&service->srv_list, &ptlrpc_all_services);
+	mutex_unlock(&ptlrpc_all_services_mutex);
+
+	if (proc_entry != NULL)
+		ptlrpc_lprocfs_register_service(proc_entry, service);
+
+	rc = ptlrpc_service_nrs_setup(service);
+	if (rc != 0)
+		GOTO(failed, rc);
+
+	CDEBUG(D_NET, "%s: Started, listening on portal %d\n",
+	       service->srv_name, service->srv_req_portal);
+
+	rc = ptlrpc_start_threads(service);
+	if (rc != 0) {
+		CERROR("Failed to start threads for service %s: %d\n",
+		       service->srv_name, rc);
+		GOTO(failed, rc);
+	}
+
+	RETURN(service);
+failed:
+	/* tear down whatever part of the service has been set up so far */
+	ptlrpc_unregister_service(service);
+	RETURN(ERR_PTR(rc));
+}
+EXPORT_SYMBOL(ptlrpc_register_service);
+
+/**
+ * Actually free the request; must be called without holding svc_lock.
+ * Note it is the caller's responsibility to unlink req->rq_list.
+ * The request must be unreferenced and off the timed list.
+ */
+static void ptlrpc_server_free_request(struct ptlrpc_request *req)
+{
+	LASSERT(atomic_read(&req->rq_refcount) == 0);
+	LASSERT(list_empty(&req->rq_timed_list));
+
+	/* DEBUG_REQ() assumes the reply state of a request with a valid
+	 * ref will not be destroyed until that reference is dropped. */
+	ptlrpc_req_drop_rs(req);
+
+	sptlrpc_svc_ctx_decref(req);
+
+	/* NB request buffers use an embedded req if the incoming req
+	 * unlinked the MD; such a request is part of its rqbd and must
+	 * not be freed on its own */
+	if (req == &req->rq_rqbd->rqbd_req)
+		return;
+
+	OBD_FREE(req, sizeof(*req));
+}
+
+/**
+ * drop a reference count of the request. if it reaches 0, we either
+ * put it into history list, or free it immediately.
+ *
+ * When the last reference is dropped the request is unlinked from the
+ * adaptive timeout list, its export reference is released and it is added
+ * to its rqbd's request list. If that also drops the last reference on the
+ * rqbd, the rqbd moves to the history list and the oldest history rqbds
+ * beyond srv_hist_nrqbds_cpt_max are recycled onto the idle list after
+ * their requests have been freed. Otherwise the request is freed at once
+ * only when its reply state is a pre-allocated one.
+ */
+void ptlrpc_server_drop_request(struct ptlrpc_request *req)
+{
+	struct ptlrpc_request_buffer_desc *rqbd = req->rq_rqbd;
+	struct ptlrpc_service_part	  *svcpt = rqbd->rqbd_svcpt;
+	struct ptlrpc_service		  *svc = svcpt->scp_service;
+	int				refcount;
+	struct list_head			*tmp;
+	struct list_head			*nxt;
+
+	/* nothing more to do unless this was the last reference */
+	if (!atomic_dec_and_test(&req->rq_refcount))
+		return;
+
+	if (req->rq_at_linked) {
+		spin_lock(&svcpt->scp_at_lock);
+		/* recheck with lock, in case it's unlinked by
+		 * ptlrpc_at_check_timed() */
+		if (likely(req->rq_at_linked))
+			ptlrpc_at_remove_timed(req);
+		spin_unlock(&svcpt->scp_at_lock);
+	}
+
+	LASSERT(list_empty(&req->rq_timed_list));
+
+	/* finalize request */
+	if (req->rq_export) {
+		class_export_put(req->rq_export);
+		req->rq_export = NULL;
+	}
+
+	spin_lock(&svcpt->scp_lock);
+
+	list_add(&req->rq_list, &rqbd->rqbd_reqs);
+
+	refcount = --(rqbd->rqbd_refcount);
+	if (refcount == 0) {
+		/* request buffer is now idle: add to history */
+		list_del(&rqbd->rqbd_list);
+
+		list_add_tail(&rqbd->rqbd_list, &svcpt->scp_hist_rqbds);
+		svcpt->scp_hist_nrqbds++;
+
+		/* cull some history?
+		 * I expect only about 1 or 2 rqbds need to be recycled here */
+		while (svcpt->scp_hist_nrqbds > svc->srv_hist_nrqbds_cpt_max) {
+			/* from here on rqbd and req refer to the history
+			 * entries being culled, not to the arguments */
+			rqbd = list_entry(svcpt->scp_hist_rqbds.next,
+					      struct ptlrpc_request_buffer_desc,
+					      rqbd_list);
+
+			list_del(&rqbd->rqbd_list);
+			svcpt->scp_hist_nrqbds--;
+
+			/* remove rqbd's reqs from svc's req history while
+			 * I've got the service lock */
+			list_for_each(tmp, &rqbd->rqbd_reqs) {
+				req = list_entry(tmp, struct ptlrpc_request,
+						     rq_list);
+				/* Track the highest culled req seq */
+				if (req->rq_history_seq >
+				    svcpt->scp_hist_seq_culled) {
+					svcpt->scp_hist_seq_culled =
+						req->rq_history_seq;
+				}
+				list_del(&req->rq_history_list);
+			}
+
+			/* the requests are freed without the lock held */
+			spin_unlock(&svcpt->scp_lock);
+
+			list_for_each_safe(tmp, nxt, &rqbd->rqbd_reqs) {
+				req = list_entry(rqbd->rqbd_reqs.next,
+						     struct ptlrpc_request,
+						     rq_list);
+				list_del(&req->rq_list);
+				ptlrpc_server_free_request(req);
+			}
+
+			spin_lock(&svcpt->scp_lock);
+			/*
+			 * now all reqs including the embedded req has been
+			 * disposed, schedule request buffer for re-use.
+			 */
+			LASSERT(atomic_read(&rqbd->rqbd_req.rq_refcount) ==
+				0);
+			list_add_tail(&rqbd->rqbd_list,
+					  &svcpt->scp_rqbd_idle);
+		}
+
+		spin_unlock(&svcpt->scp_lock);
+	} else if (req->rq_reply_state && req->rq_reply_state->rs_prealloc) {
+		/* If we are low on memory, we are not interested in history */
+		list_del(&req->rq_list);
+		list_del_init(&req->rq_history_list);
+
+		/* Track the highest culled req seq */
+		if (req->rq_history_seq > svcpt->scp_hist_seq_culled)
+			svcpt->scp_hist_seq_culled = req->rq_history_seq;
+
+		spin_unlock(&svcpt->scp_lock);
+
+		ptlrpc_server_free_request(req);
+	} else {
+		/* the request stays on its rqbd until the rqbd is culled */
+		spin_unlock(&svcpt->scp_lock);
+	}
+}
+
+/**
+ * Re-home request \a req onto \a export.
+ *
+ * If the request already has an export, its rq_exp_list entry (when linked)
+ * is taken off the old export and queued on \a export's exp_hp_rpcs list,
+ * after which the old export's RPC count and reference are released.  The
+ * request then takes its own reference and RPC count on \a export.
+ */
+void ptlrpc_request_change_export(struct ptlrpc_request *req,
+				  struct obd_export *export)
+{
+	struct obd_export *old_exp = req->rq_export;
+
+	if (old_exp != NULL) {
+		if (!list_empty(&req->rq_exp_list)) {
+			/* unhook rq_exp_list from the previous export */
+			spin_lock_bh(&old_exp->exp_rpc_lock);
+			list_del_init(&req->rq_exp_list);
+			spin_unlock_bh(&old_exp->exp_rpc_lock);
+
+			/* the new export has one reference already, so it
+			 * is safe to queue the request on it now and take
+			 * the request's own reference further down */
+			spin_lock_bh(&export->exp_rpc_lock);
+			list_add(&req->rq_exp_list, &export->exp_hp_rpcs);
+			spin_unlock_bh(&export->exp_rpc_lock);
+		}
+		class_export_rpc_dec(old_exp);
+		class_export_put(old_exp);
+	}
+
+	/* the request holds one export refcount and one RPC count */
+	req->rq_export = class_export_get(export);
+	class_export_rpc_inc(export);
+}
+
+/**
+ * Finish a request: take it off its export's list via
+ * ptlrpc_server_hpreq_fini(), then drop one reference on it.
+ *
+ * \param svcpt  service partition; not used by this function
+ * \param req    request to finish
+ */
+static void ptlrpc_server_finish_request(struct ptlrpc_service_part *svcpt,
+					 struct ptlrpc_request *req)
+{
+	ptlrpc_server_hpreq_fini(req);
+
+	ptlrpc_server_drop_request(req);
+}
+
+/**
+ * Finish an active request: stop it in NRS, take it out of the partition's
+ * active counters, release the export's RPC count and then release the
+ * request itself.  Should be called after we finished handling the request.
+ *
+ * \param svcpt  service partition the request was counted as active on
+ * \param req    request that has been handled
+ */
+static void ptlrpc_server_finish_active_request(
+					struct ptlrpc_service_part *svcpt,
+					struct ptlrpc_request *req)
+{
+	/* the active counters and the NRS stop are done under scp_req_lock */
+	spin_lock(&svcpt->scp_req_lock);
+	ptlrpc_nrs_req_stop_nolock(req);
+	svcpt->scp_nreqs_active--;
+	if (req->rq_hp)
+		svcpt->scp_nhreqs_active--;
+	spin_unlock(&svcpt->scp_req_lock);
+
+	ptlrpc_nrs_req_finalize(req);
+
+	/* a request may have no export; only then is there nothing to
+	 * decrement */
+	if (req->rq_export != NULL)
+		class_export_rpc_dec(req->rq_export);
+
+	ptlrpc_server_finish_request(svcpt, req);
+}
+
+/**
+ * This function makes sure dead exports are evicted in a timely manner.
+ * This function is only called when some export receives a message (i.e.,
+ * the network is up.)
+ *
+ * It records a new last-request time for \a exp, moves the export to the
+ * tail of its device's timed-exports list, and then looks at the export at
+ * the head of that list to decide whether the eviction timer should be
+ * started or the ping evictor woken.
+ *
+ * \param exp          export that just received a message; must not be NULL
+ * \param extra_delay  seconds added to the recorded request time and to the
+ *                     expiry comparisons below
+ */
+static void ptlrpc_update_export_timer(struct obd_export *exp, long extra_delay)
+{
+	struct obd_export *oldest_exp;
+	time_t oldest_time, new_time;
+
+	ENTRY;
+
+	LASSERT(exp);
+
+	/* Compensate for slow machines, etc, by faking our request time
+	   into the future.  Although this can break the strict time-ordering
+	   of the list, we can be really lazy here - we don't have to evict
+	   at the exact right moment.  Eventually, all silent exports
+	   will make it to the top of the list. */
+
+	/* Do not pay attention on 1sec or smaller renewals. */
+	new_time = cfs_time_current_sec() + extra_delay;
+	if (exp->exp_last_request_time + 1 /*second */ >= new_time)
+		RETURN_EXIT;
+
+	exp->exp_last_request_time = new_time;
+	CDEBUG(D_HA, "updating export %s at "CFS_TIME_T" exp %p\n",
+	       exp->exp_client_uuid.uuid,
+	       exp->exp_last_request_time, exp);
+
+	/* exports may get disconnected from the chain even though the
+	   export has references, so we must keep the spin lock while
+	   manipulating the lists */
+	spin_lock(&exp->exp_obd->obd_dev_lock);
+
+	if (list_empty(&exp->exp_obd_chain_timed)) {
+		/* this one is not timed */
+		spin_unlock(&exp->exp_obd->obd_dev_lock);
+		RETURN_EXIT;
+	}
+
+	/* most recently heard-from export goes to the tail ... */
+	list_move_tail(&exp->exp_obd_chain_timed,
+			   &exp->exp_obd->obd_exports_timed);
+
+	/* ... and the head entry is the candidate for eviction; its time is
+	 * sampled while the lock is still held */
+	oldest_exp = list_entry(exp->exp_obd->obd_exports_timed.next,
+				    struct obd_export, exp_obd_chain_timed);
+	oldest_time = oldest_exp->exp_last_request_time;
+	spin_unlock(&exp->exp_obd->obd_dev_lock);
+
+	if (exp->exp_obd->obd_recovering) {
+		/* be nice to everyone during recovery */
+		EXIT;
+		return;
+	}
+
+	/* Note - racing to start/reset the obd_eviction timer is safe */
+	if (exp->exp_obd->obd_eviction_timer == 0) {
+		/* Check if the oldest entry is expired. */
+		if (cfs_time_current_sec() > (oldest_time + PING_EVICT_TIMEOUT +
+					      extra_delay)) {
+			/* We need a second timer, in case the net was down and
+			 * it just came back. Since the pinger may skip every
+			 * other PING_INTERVAL (see note in ptlrpc_pinger_main),
+			 * we better wait for 3. */
+			exp->exp_obd->obd_eviction_timer =
+				cfs_time_current_sec() + 3 * PING_INTERVAL;
+			CDEBUG(D_HA, "%s: Think about evicting %s from "CFS_TIME_T"\n",
+			       exp->exp_obd->obd_name,
+			       obd_export_nid2str(oldest_exp), oldest_time);
+		}
+	} else {
+		/* eviction timer already armed: act once it has expired */
+		if (cfs_time_current_sec() >
+		    (exp->exp_obd->obd_eviction_timer + extra_delay)) {
+			/* The evictor won't evict anyone who we've heard from
+			 * recently, so we don't have to check before we start
+			 * it. */
+			if (!ping_evictor_wake(exp))
+				exp->exp_obd->obd_eviction_timer = 0;
+		}
+	}
+
+	EXIT;
+}
+
+/**
+ * Sanity check request \a req against the state of its export.
+ *
+ * \a req->rq_export must be set by the caller; it is dereferenced
+ * unconditionally.
+ *
+ * \retval 0        all is ok, the request may be processed
+ * \retval -EEXIST  the request carries a connection count older than the
+ *                  export's; returned without sending an error reply here
+ * \retval -ENODEV  the export has no device, the device is failing over, or
+ *                  the request is a replay / carries a transno while the
+ *                  device is not recovering; rq_status is set and an error
+ *                  reply is sent via ptlrpc_error()
+ */
+static int ptlrpc_check_req(struct ptlrpc_request *req)
+{
+	struct obd_device *obd = req->rq_export->exp_obd;
+	int rc = 0;
+
+	if (unlikely(lustre_msg_get_conn_cnt(req->rq_reqmsg) <
+		     req->rq_export->exp_conn_cnt)) {
+		DEBUG_REQ(D_RPCTRACE, req,
+			  "DROPPING req from old connection %d < %d",
+			  lustre_msg_get_conn_cnt(req->rq_reqmsg),
+			  req->rq_export->exp_conn_cnt);
+		return -EEXIST;
+	}
+
+	/* An export without a device is handled like a failing device.
+	 * Checking it here also guarantees that the branches below never
+	 * dereference a NULL obd when they read obd_recovering. */
+	if (unlikely(obd == NULL || obd->obd_fail)) {
+		/* Failing over, don't handle any more reqs, send
+		 * error response instead. */
+		CDEBUG(D_RPCTRACE, "Dropping req %p for failed obd %s\n",
+		       req, (obd != NULL) ? obd->obd_name : "unknown");
+		rc = -ENODEV;
+	} else if (lustre_msg_get_flags(req->rq_reqmsg) &
+		   (MSG_REPLAY | MSG_REQ_REPLAY_DONE) &&
+		   !(obd->obd_recovering)) {
+		DEBUG_REQ(D_ERROR, req,
+			  "Invalid replay without recovery");
+		class_fail_export(req->rq_export);
+		rc = -ENODEV;
+	} else if (lustre_msg_get_transno(req->rq_reqmsg) != 0 &&
+		   !(obd->obd_recovering)) {
+		DEBUG_REQ(D_ERROR, req, "Invalid req with transno "
+			  LPU64" without recovery",
+			  lustre_msg_get_transno(req->rq_reqmsg));
+		class_fail_export(req->rq_export);
+		rc = -ENODEV;
+	}
+
+	if (unlikely(rc < 0)) {
+		req->rq_status = rc;
+		ptlrpc_error(req);
+	}
+	return rc;
+}
+
+/**
+ * (Re)program the early-reply timer of service partition \a svcpt for the
+ * closest deadline recorded in its AT array.
+ *
+ * With no tracked requests the timer is disarmed.  If the closest deadline,
+ * less at_early_margin, is already due, the timer callback is invoked
+ * directly instead of arming the timer.
+ */
+static void ptlrpc_at_set_timer(struct ptlrpc_service_part *svcpt)
+{
+	struct ptlrpc_at_array *at_arr = &svcpt->scp_at_array;
+	__s32 secs_left;
+
+	if (at_arr->paa_count == 0) {
+		cfs_timer_disarm(&svcpt->scp_at_timer);
+		return;
+	}
+
+	/* seconds until we must start sending early replies */
+	secs_left = (__s32)(at_arr->paa_deadline - cfs_time_current_sec() -
+			    at_early_margin);
+	if (secs_left > 0) {
+		cfs_timer_arm(&svcpt->scp_at_timer, cfs_time_shift(secs_left));
+		CDEBUG(D_INFO, "armed %s at %+ds\n",
+		       svcpt->scp_service->srv_name, secs_left);
+		return;
+	}
+
+	/* already due: run the timer handler right away */
+	ptlrpc_at_timer((unsigned long)svcpt);
+}
+
+/**
+ * Add request \a req to the early reply check list of its service
+ * partition.
+ *
+ * The request goes into the AT array slot selected by its deadline and is
+ * kept sorted by deadline within that slot.  If it becomes the earliest
+ * deadline tracked, the partition's early-reply timer is reprogrammed.
+ *
+ * \retval 0        request linked, or nothing to do (adaptive timeouts off,
+ *                  or the request expects no reply)
+ * \retval -ENOSYS  the request message does not advertise AT support
+ */
+static int ptlrpc_at_add_timed(struct ptlrpc_request *req)
+{
+	struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt;
+	struct ptlrpc_at_array *at_arr = &svcpt->scp_at_array;
+	struct ptlrpc_request *pos;
+	__u32 slot;
+
+	if (AT_OFF || req->rq_no_reply)
+		return 0;
+
+	if (!(lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT))
+		return -ENOSYS;
+
+	spin_lock(&svcpt->scp_at_lock);
+	LASSERT(list_empty(&req->rq_timed_list));
+
+	slot = (unsigned long)req->rq_deadline % at_arr->paa_size;
+	if (at_arr->paa_reqs_count[slot] > 0) {
+		/* latest rpcs will have the latest deadlines in the list,
+		 * so search backward and insert behind the first entry
+		 * whose deadline is not later than ours. */
+		list_for_each_entry_reverse(pos,
+					    &at_arr->paa_reqs_array[slot],
+					    rq_timed_list) {
+			if (req->rq_deadline < pos->rq_deadline)
+				continue;
+
+			list_add(&req->rq_timed_list, &pos->rq_timed_list);
+			break;
+		}
+	}
+
+	/* Not placed yet (empty slot, or earliest deadline in the slot):
+	 * add the request at the head of the list */
+	if (list_empty(&req->rq_timed_list))
+		list_add(&req->rq_timed_list, &at_arr->paa_reqs_array[slot]);
+
+	spin_lock(&req->rq_lock);
+	req->rq_at_linked = 1;
+	spin_unlock(&req->rq_lock);
+
+	req->rq_at_index = slot;
+	at_arr->paa_reqs_count[slot]++;
+	at_arr->paa_count++;
+
+	/* first tracked request, or a new earliest deadline */
+	if (at_arr->paa_count == 1 ||
+	    at_arr->paa_deadline > req->rq_deadline) {
+		at_arr->paa_deadline = req->rq_deadline;
+		ptlrpc_at_set_timer(svcpt);
+	}
+	spin_unlock(&svcpt->scp_at_lock);
+
+	return 0;
+}
+
+/**
+ * Take request \a req off the early reply check list and fix up the AT
+ * array counters of its service partition.
+ *
+ * NB: the caller must hold svcpt::scp_at_lock, and the request must
+ * currently be linked (asserted).
+ */
+static void
+ptlrpc_at_remove_timed(struct ptlrpc_request *req)
+{
+	struct ptlrpc_at_array *at_arr =
+		&req->rq_rqbd->rqbd_svcpt->scp_at_array;
+
+	LASSERT(!list_empty(&req->rq_timed_list));
+	list_del_init(&req->rq_timed_list);
+
+	spin_lock(&req->rq_lock);
+	req->rq_at_linked = 0;
+	spin_unlock(&req->rq_lock);
+
+	/* rq_at_index remembers which slot the request was counted in */
+	at_arr->paa_reqs_count[req->rq_at_index]--;
+	at_arr->paa_count--;
+}
+
+/**
+ * Send an early reply for \a req, asking the client for more time.
+ *
+ * The service estimate is bumped first, and the new deadline derived from
+ * it.  The reply itself is packed and sent from a temporary copy of the
+ * request, so the original request's reply state is left alone.  On success
+ * the request's own deadline is moved to the value reported to the client.
+ *
+ * \retval 0           early reply sent, or adaptive timeouts are off
+ * \retval -ETIMEDOUT  already past the deadline, or no time could be added
+ * \retval -ENOSYS     the client does not support adaptive timeouts
+ * \retval -ENOMEM     allocation of the request copy or its message failed
+ * \retval -EINVAL     only one reference is left on the request
+ * \retval -ENODEV     export lookup failed or its device is failing
+ * \retval other       error from lustre_pack_reply_flags() or
+ *                     ptlrpc_send_reply()
+ */
+static int ptlrpc_at_send_early_reply(struct ptlrpc_request *req)
+{
+	struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt;
+	struct ptlrpc_request *reqcopy;
+	struct lustre_msg *reqmsg;
+	/* seconds left until the current deadline; negative once passed */
+	cfs_duration_t olddl = req->rq_deadline - cfs_time_current_sec();
+	time_t newdl;
+	int rc;
+	ENTRY;
+
+	/* deadline is when the client expects us to reply, margin is the
+	   difference between clients' and servers' expectations */
+	DEBUG_REQ(D_ADAPTTO, req,
+		  "%ssending early reply (deadline %+lds, margin %+lds) for "
+		  "%d+%d", AT_OFF ? "AT off - not " : "",
+		  olddl, olddl - at_get(&svcpt->scp_at_estimate),
+		  at_get(&svcpt->scp_at_estimate), at_extra);
+
+	if (AT_OFF)
+		RETURN(0);
+
+	if (olddl < 0) {
+		DEBUG_REQ(D_WARNING, req, "Already past deadline (%+lds), "
+			  "not sending early reply. Consider increasing "
+			  "at_early_margin (%d)?", olddl, at_early_margin);
+
+		/* Return an error so we're not re-added to the timed list. */
+		RETURN(-ETIMEDOUT);
+	}
+
+	if ((lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT) == 0){
+		DEBUG_REQ(D_INFO, req, "Wanted to ask client for more time, "
+			  "but no AT support");
+		RETURN(-ENOSYS);
+	}
+
+	if (req->rq_export &&
+	    lustre_msg_get_flags(req->rq_reqmsg) &
+	    (MSG_REPLAY | MSG_REQ_REPLAY_DONE | MSG_LOCK_REPLAY_DONE)) {
+		/* During recovery, we don't want to send too many early
+		 * replies, but on the other hand we want to make sure the
+		 * client has enough time to resend if the rpc is lost. So
+		 * during the recovery period send at least 4 early replies,
+		 * spacing them every at_extra if we can. at_estimate should
+		 * always equal this fixed value during recovery. */
+		at_measured(&svcpt->scp_at_estimate, min(at_extra,
+			    req->rq_export->exp_obd->obd_recovery_timeout / 4));
+	} else {
+		/* Fake our processing time into the future to ask the clients
+		 * for some extra amount of time */
+		at_measured(&svcpt->scp_at_estimate, at_extra +
+			    cfs_time_current_sec() -
+			    req->rq_arrival_time.tv_sec);
+
+		/* Check to see if we've actually increased the deadline -
+		 * we may be past adaptive_max */
+		if (req->rq_deadline >= req->rq_arrival_time.tv_sec +
+		    at_get(&svcpt->scp_at_estimate)) {
+			DEBUG_REQ(D_WARNING, req, "Couldn't add any time "
+				  "(%ld/%ld), not sending early reply\n",
+				  olddl, req->rq_arrival_time.tv_sec +
+				  at_get(&svcpt->scp_at_estimate) -
+				  cfs_time_current_sec());
+			RETURN(-ETIMEDOUT);
+		}
+	}
+	/* the deadline we will adopt if the early reply goes out */
+	newdl = cfs_time_current_sec() + at_get(&svcpt->scp_at_estimate);
+
+	OBD_ALLOC(reqcopy, sizeof *reqcopy);
+	if (reqcopy == NULL)
+		RETURN(-ENOMEM);
+	OBD_ALLOC_LARGE(reqmsg, req->rq_reqlen);
+	if (!reqmsg) {
+		OBD_FREE(reqcopy, sizeof *reqcopy);
+		RETURN(-ENOMEM);
+	}
+
+	/* structure copy of the request, then reset the reply-related
+	 * fields so the copy starts without a reply state */
+	*reqcopy = *req;
+	reqcopy->rq_reply_state = NULL;
+	reqcopy->rq_rep_swab_mask = 0;
+	reqcopy->rq_pack_bulk = 0;
+	reqcopy->rq_pack_udesc = 0;
+	reqcopy->rq_packed_final = 0;
+	/* balanced by sptlrpc_svc_ctx_decref() at the "out" label */
+	sptlrpc_svc_ctx_addref(reqcopy);
+	/* We only need the reqmsg for the magic */
+	reqcopy->rq_reqmsg = reqmsg;
+	memcpy(reqmsg, req->rq_reqmsg, req->rq_reqlen);
+
+	LASSERT(atomic_read(&req->rq_refcount));
+	/** if it is last refcount then early reply isn't needed */
+	if (atomic_read(&req->rq_refcount) == 1) {
+		DEBUG_REQ(D_ADAPTTO, reqcopy, "Normal reply already sent out, "
+			  "abort sending early reply\n");
+		GOTO(out, rc = -EINVAL);
+	}
+
+	/* Connection ref */
+	reqcopy->rq_export = class_conn2export(
+				     lustre_msg_get_handle(reqcopy->rq_reqmsg));
+	if (reqcopy->rq_export == NULL)
+		GOTO(out, rc = -ENODEV);
+
+	/* RPC ref */
+	class_export_rpc_inc(reqcopy->rq_export);
+	if (reqcopy->rq_export->exp_obd &&
+	    reqcopy->rq_export->exp_obd->obd_fail)
+		GOTO(out_put, rc = -ENODEV);
+
+	rc = lustre_pack_reply_flags(reqcopy, 1, NULL, NULL, LPRFL_EARLY_REPLY);
+	if (rc)
+		GOTO(out_put, rc);
+
+	rc = ptlrpc_send_reply(reqcopy, PTLRPC_REPLY_EARLY);
+
+	if (!rc) {
+		/* Adjust our own deadline to what we told the client */
+		req->rq_deadline = newdl;
+		req->rq_early_count++; /* number sent, server side */
+	} else {
+		DEBUG_REQ(D_ERROR, req, "Early reply send failed %d", rc);
+	}
+
+	/* Free the (early) reply state from lustre_pack_reply.
+	   (ptlrpc_send_reply takes its own rs ref, so this is safe here) */
+	ptlrpc_req_drop_rs(reqcopy);
+
+out_put:
+	/* undo the RPC ref and the connection ref taken on the copy */
+	class_export_rpc_dec(reqcopy->rq_export);
+	class_export_put(reqcopy->rq_export);
+out:
+	sptlrpc_svc_ctx_decref(reqcopy);
+	OBD_FREE_LARGE(reqmsg, req->rq_reqlen);
+	OBD_FREE(reqcopy, sizeof *reqcopy);
+	RETURN(rc);
+}
+
+/**
+ * Send early replies to everybody expiring within at_early_margin
+ * asking for at_extra time.
+ *
+ * Under scp_at_lock the AT array is scanned starting at the slot of the
+ * earliest recorded deadline.  Requests that are about to expire are
+ * unlinked and collected on a private work list, the earliest remaining
+ * deadline is recorded, and the timer is reprogrammed.  The early replies
+ * are then sent without the lock held.
+ *
+ * \retval 0  nothing to do: no check was requested, no requests are
+ *            tracked, or the earliest deadline is still far enough away
+ * \retval 1  the array was scanned
+ */
+static int ptlrpc_at_check_timed(struct ptlrpc_service_part *svcpt)
+{
+	struct ptlrpc_at_array *array = &svcpt->scp_at_array;
+	struct ptlrpc_request *rq, *n;
+	struct list_head work_list;
+	__u32  index, count;
+	time_t deadline;
+	time_t now = cfs_time_current_sec();
+	cfs_duration_t delay;
+	int first, counter = 0;
+	ENTRY;
+
+	spin_lock(&svcpt->scp_at_lock);
+	if (svcpt->scp_at_check == 0) {
+		spin_unlock(&svcpt->scp_at_lock);
+		RETURN(0);
+	}
+	/* time elapsed since scp_at_checktime; only used in the warning
+	 * printed further down */
+	delay = cfs_time_sub(cfs_time_current(), svcpt->scp_at_checktime);
+	svcpt->scp_at_check = 0;
+
+	if (array->paa_count == 0) {
+		spin_unlock(&svcpt->scp_at_lock);
+		RETURN(0);
+	}
+
+	/* The timer went off, but maybe the nearest rpc already completed. */
+	first = array->paa_deadline - now;
+	if (first > at_early_margin) {
+		/* We've still got plenty of time.  Reset the timer. */
+		ptlrpc_at_set_timer(svcpt);
+		spin_unlock(&svcpt->scp_at_lock);
+		RETURN(0);
+	}
+
+	/* We're close to a timeout, and we don't know how much longer the
+	   server will take. Send early replies to everyone expiring soon. */
+	INIT_LIST_HEAD(&work_list);
+	/* -1 means "no remaining deadline found yet" */
+	deadline = -1;
+	index = (unsigned long)array->paa_deadline % array->paa_size;
+	/* walk the slots, wrapping around, until every tracked request has
+	 * been accounted for */
+	count = array->paa_count;
+	while (count > 0) {
+		count -= array->paa_reqs_count[index];
+		list_for_each_entry_safe(rq, n,
+					     &array->paa_reqs_array[index],
+					     rq_timed_list) {
+			/* the scan of a slot stops at the first request
+			 * that is not yet within the early margin */
+			if (rq->rq_deadline > now + at_early_margin) {
+				/* update the earliest deadline */
+				if (deadline == -1 ||
+				    rq->rq_deadline < deadline)
+					deadline = rq->rq_deadline;
+				break;
+			}
+
+			ptlrpc_at_remove_timed(rq);
+			/**
+			 * ptlrpc_server_drop_request() may drop
+			 * refcount to 0 already. Let's check this and
+			 * don't add entry to work_list
+			 */
+			if (likely(atomic_inc_not_zero(&rq->rq_refcount)))
+				list_add(&rq->rq_timed_list, &work_list);
+			counter++;
+		}
+
+		if (++index >= array->paa_size)
+			index = 0;
+	}
+	array->paa_deadline = deadline;
+	/* we have a new earliest deadline, restart the timer */
+	ptlrpc_at_set_timer(svcpt);
+
+	spin_unlock(&svcpt->scp_at_lock);
+
+	CDEBUG(D_ADAPTTO, "timeout in %+ds, asking for %d secs on %d early "
+	       "replies\n", first, at_extra, counter);
+	if (first < 0) {
+		/* We're already past request deadlines before we even get a
+		   chance to send early replies */
+		LCONSOLE_WARN("%s: This server is not able to keep up with "
+			      "request traffic (cpu-bound).\n",
+			      svcpt->scp_service->srv_name);
+		CWARN("earlyQ=%d reqQ=%d recA=%d, svcEst=%d, "
+		      "delay="CFS_DURATION_T"(jiff)\n",
+		      counter, svcpt->scp_nreqs_incoming,
+		      svcpt->scp_nreqs_active,
+		      at_get(&svcpt->scp_at_estimate), delay);
+	}
+
+	/* we took additional refcount so entries can't be deleted from list, no
+	 * locking is needed */
+	while (!list_empty(&work_list)) {
+		rq = list_entry(work_list.next, struct ptlrpc_request,
+				    rq_timed_list);
+		list_del_init(&rq->rq_timed_list);
+
+		/* only a request whose early reply went out is tracked
+		 * again, now with its extended deadline */
+		if (ptlrpc_at_send_early_reply(rq) == 0)
+			ptlrpc_at_add_timed(rq);
+
+		/* release the reference taken during the scan above */
+		ptlrpc_server_drop_request(rq);
+	}
+
+	RETURN(1); /* return "did_something" for liblustre */
+}
+
+/**
+ * Put the request on its export's list if the request may become a high
+ * priority one, and initialize it for NRS.
+ *
+ * \retval <0  error from the service's hpreq handler or from the request's
+ *             own hpreq_check(); the request has not been queued
+ * \retval 0   normal request
+ * \retval 1   hpreq_check() classified the request as high priority
+ */
+static int ptlrpc_server_hpreq_init(struct ptlrpc_service_part *svcpt,
+				    struct ptlrpc_request *req)
+{
+	struct ptlrpc_service *svc = svcpt->scp_service;
+	int rc = 0;
+	ENTRY;
+
+	/* service-wide hook first; on success it must return exactly 0 */
+	if (svc->srv_ops.so_hpreq_handler) {
+		rc = svc->srv_ops.so_hpreq_handler(req);
+		if (rc < 0)
+			RETURN(rc);
+		LASSERT(rc == 0);
+	}
+
+	if (req->rq_export && req->rq_ops) {
+		/* Perform request specific check. We should do this check
+		 * before the request is added into exp_hp_rpcs list otherwise
+		 * it may hit swab race at LU-1044. */
+		if (req->rq_ops->hpreq_check) {
+			/**
+			 * XXX: Out of all current
+			 * ptlrpc_hpreq_ops::hpreq_check(), only
+			 * ldlm_cancel_hpreq_check() can return an error code;
+			 * other functions assert in similar places, which seems
+			 * odd. What also does not seem right is that handlers
+			 * for those RPCs do not assert on the same checks, but
+			 * rather handle the error cases. e.g. see
+			 * ost_rw_hpreq_check(), and ost_brw_read(),
+			 * ost_brw_write().
+			 */
+			rc = req->rq_ops->hpreq_check(req);
+			if (rc < 0)
+				RETURN(rc);
+			LASSERT(rc == 0 || rc == 1);
+		}
+
+		spin_lock_bh(&req->rq_export->exp_rpc_lock);
+		list_add(&req->rq_exp_list, &req->rq_export->exp_hp_rpcs);
+		spin_unlock_bh(&req->rq_export->exp_rpc_lock);
+	}
+
+	/* rc (0 or 1) is handed to NRS along with the request */
+	ptlrpc_nrs_req_initialize(svcpt, req, rc);
+
+	RETURN(rc);
+}
+
+/**
+ * Remove the request from its export's list.
+ *
+ * Does nothing unless the request has both an export and hpreq ops.  The
+ * optional hpreq_fini() hook runs before the request is unlinked.
+ */
+static void ptlrpc_server_hpreq_fini(struct ptlrpc_request *req)
+{
+	ENTRY;
+	if (req->rq_export == NULL || req->rq_ops == NULL) {
+		EXIT;
+		return;
+	}
+
+	/* refresh lock timeout again so that client has more
+	 * room to send lock cancel RPC. */
+	if (req->rq_ops->hpreq_fini)
+		req->rq_ops->hpreq_fini(req);
+
+	spin_lock_bh(&req->rq_export->exp_rpc_lock);
+	list_del_init(&req->rq_exp_list);
+	spin_unlock_bh(&req->rq_export->exp_rpc_lock);
+
+	EXIT;
+}
+
+/**
+ * hpreq_check callback used by ptlrpc_hpreq_common: unconditionally
+ * returns 1, whatever \a req is.
+ */
+static int ptlrpc_hpreq_check(struct ptlrpc_request *req)
+{
+	return 1;
+}
+
+/* hpreq ops that ptlrpc_hpreq_handler() attaches to a request; only the
+ * hpreq_check hook is provided, the remaining hooks are left unset. */
+static struct ptlrpc_hpreq_ops ptlrpc_hpreq_common = {
+	.hpreq_check       = ptlrpc_hpreq_check,
+};
+
+/**
+ * Hi-Priority RPC check by RPC operation code.
+ *
+ * Pings and MDS/OST connect requests that arrive with an export get the
+ * common hpreq ops attached.  Requests without an export are left alone,
+ * so that only reconnects for a not yet evicted export can become HP rpcs.
+ *
+ * \retval 0  always
+ */
+int ptlrpc_hpreq_handler(struct ptlrpc_request *req)
+{
+	int opc = lustre_msg_get_opc(req->rq_reqmsg);
+
+	if (req->rq_export == NULL)
+		return 0;
+
+	if (opc == OBD_PING || opc == MDS_CONNECT || opc == OST_CONNECT)
+		req->rq_ops = &ptlrpc_hpreq_common;
+
+	return 0;
+}
+EXPORT_SYMBOL(ptlrpc_hpreq_handler);
+
+/**
+ * Classify request \a req and hand it to NRS for queueing on \a svcpt.
+ *
+ * \retval 0   request queued
+ * \retval <0  error from ptlrpc_server_hpreq_init(); request not queued
+ */
+static int ptlrpc_server_request_add(struct ptlrpc_service_part *svcpt,
+				     struct ptlrpc_request *req)
+{
+	int	hp_rc;
+	ENTRY;
+
+	hp_rc = ptlrpc_server_hpreq_init(svcpt, req);
+	if (hp_rc >= 0) {
+		/* a non-zero result marks the request as high priority */
+		ptlrpc_nrs_req_add(svcpt, req, !!hp_rc);
+		hp_rc = 0;
+	}
+
+	RETURN(hp_rc);
+}
+
+/**
+ * Decide whether a high priority request may be handled now.
+ *
+ * User can call it w/o any lock but need to hold
+ * ptlrpc_service_part::scp_req_lock to get reliable result
+ *
+ * \param svcpt  service partition to check
+ * \param force  skip the thread and ratio checks (the partition must
+ *               still have a high priority queue)
+ */
+static bool ptlrpc_server_allow_high(struct ptlrpc_service_part *svcpt,
+				     bool force)
+{
+	int nthrs = svcpt->scp_nthrs_running;
+
+	if (!nrs_svcpt_has_hp(svcpt))
+		return false;
+
+	if (force)
+		return true;
+
+	if (unlikely(svcpt->scp_service->srv_req_portal == MDS_REQUEST_PORTAL &&
+		     CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CANCEL_RESEND))) {
+		/* leave just 1 thread for normal RPCs */
+		nthrs = PTLRPC_NTHRS_INIT +
+			(svcpt->scp_service->srv_ops.so_hpreq_handler != NULL ?
+			 1 : 0);
+	}
+
+	/* keep one thread in reserve */
+	if (svcpt->scp_nreqs_active >= nthrs - 1)
+		return false;
+
+	if (svcpt->scp_nhreqs_active == 0)
+		return true;
+
+	/* HP requests are already running: continue only if no normal
+	 * request is waiting, or the HP ratio has not been used up */
+	if (!ptlrpc_nrs_req_pending_nolock(svcpt, false))
+		return true;
+
+	return svcpt->scp_hreq_count < svcpt->scp_service->srv_hpreq_ratio;
+}
+
+/**
+ * True when high priority requests are allowed right now (see
+ * ptlrpc_server_allow_high()) and NRS has one pending on \a svcpt.
+ */
+static bool ptlrpc_server_high_pending(struct ptlrpc_service_part *svcpt,
+				       bool force)
+{
+	if (!ptlrpc_server_allow_high(svcpt, force))
+		return false;
+
+	return ptlrpc_nrs_req_pending_nolock(svcpt, true);
+}
+
+/**
+ * Decide whether a normal priority request may be handled now.
+ *
+ * Only allow normal priority requests on a service that has a high-priority
+ * queue if forced (i.e. cleanup), if there are other high priority requests
+ * already being processed (i.e. those threads can service more high-priority
+ * requests), or if there are enough idle threads that a later thread can do
+ * a high priority request.
+ * User can call it w/o any lock but need to hold
+ * ptlrpc_service_part::scp_req_lock to get reliable result
+ */
+static bool ptlrpc_server_allow_normal(struct ptlrpc_service_part *svcpt,
+				       bool force)
+{
+	int nthrs = svcpt->scp_nthrs_running;
+
+	if (unlikely(svcpt->scp_service->srv_req_portal == MDS_REQUEST_PORTAL &&
+		     CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CANCEL_RESEND))) {
+		/* leave just 1 thread for normal RPCs */
+		nthrs = PTLRPC_NTHRS_INIT +
+			(svcpt->scp_service->srv_ops.so_hpreq_handler != NULL ?
+			 1 : 0);
+	}
+
+	if (force)
+		return true;
+
+	/* at least two threads would still be free afterwards */
+	if (svcpt->scp_nreqs_active < nthrs - 2)
+		return true;
+
+	/* no thread to spare at all */
+	if (svcpt->scp_nreqs_active >= nthrs - 1)
+		return false;
+
+	/* borderline: fine when HP work is already being served, or when
+	 * the partition has no high priority queue */
+	if (svcpt->scp_nhreqs_active > 0)
+		return true;
+
+	return !nrs_svcpt_has_hp(svcpt);
+}
+
+/**
+ * True when normal priority requests are allowed right now (see
+ * ptlrpc_server_allow_normal()) and NRS has one pending on \a svcpt.
+ */
+static bool ptlrpc_server_normal_pending(struct ptlrpc_service_part *svcpt,
+					 bool force)
+{
+	if (!ptlrpc_server_allow_normal(svcpt, force))
+		return false;
+
+	return ptlrpc_nrs_req_pending_nolock(svcpt, false);
+}
+
+/**
+ * Returns true if there are requests available in the request queues for
+ * processing and it is allowed to fetch them.  High priority requests are
+ * considered before normal ones.
+ * User can call it w/o any lock but need to hold
+ * ptlrpc_service_part::scp_req_lock to get reliable result
+ * \see ptlrpc_server_allow_normal
+ * \see ptlrpc_server_allow_high
+ */
+static inline bool
+ptlrpc_server_request_pending(struct ptlrpc_service_part *svcpt, bool force)
+{
+	if (ptlrpc_server_high_pending(svcpt, force))
+		return true;
+
+	return ptlrpc_server_normal_pending(svcpt, force);
+}
+
+/**
+ * Fetch a request for processing from queue of unprocessed requests.
+ * Favors high-priority requests.
+ *
+ * On success the partition's active counters are updated and, when the
+ * request has an export, the export's RPC count is raised.
+ *
+ * \retval request  the fetched request
+ * \retval NULL     nothing may be fetched at the moment
+ */
+static struct ptlrpc_request *
+ptlrpc_server_request_get(struct ptlrpc_service_part *svcpt, bool force)
+{
+	struct ptlrpc_request *req = NULL;
+	ENTRY;
+
+	spin_lock(&svcpt->scp_req_lock);
+
+	/* high priority queue first ... */
+	if (ptlrpc_server_high_pending(svcpt, force)) {
+		req = ptlrpc_nrs_req_get_nolock(svcpt, true, force);
+		if (req != NULL)
+			svcpt->scp_hreq_count++;
+	}
+
+	/* ... then the normal queue, which also resets the HP run counter */
+	if (req == NULL && ptlrpc_server_normal_pending(svcpt, force)) {
+		req = ptlrpc_nrs_req_get_nolock(svcpt, false, force);
+		if (req != NULL)
+			svcpt->scp_hreq_count = 0;
+	}
+
+	if (req == NULL) {
+		spin_unlock(&svcpt->scp_req_lock);
+		RETURN(NULL);
+	}
+
+	svcpt->scp_nreqs_active++;
+	if (req->rq_hp)
+		svcpt->scp_nhreqs_active++;
+
+	spin_unlock(&svcpt->scp_req_lock);
+
+	if (likely(req->rq_export))
+		class_export_rpc_inc(req->rq_export);
+
+	RETURN(req);
+}
+
+/**
+ * Handle freshly incoming reqs, add to timed early reply list,
+ * pass on to regular request queue.
+ * All incoming requests pass through here before getting into
+ * ptlrpc_server_handle_req later on.
+ *
+ * One request is taken off the partition's incoming list, run through the
+ * security layer, unpacked and validated, bound to its export (if any),
+ * given a deadline and queued for processing.  A request that fails any of
+ * these steps is finished on the spot through the err_req path.
+ *
+ * \param svcpt   service partition whose incoming list is served
+ * \param thread  service thread, recorded in the request
+ *
+ * \retval 0  the incoming list was empty
+ * \retval 1  one request was taken off the list (queued or dropped)
+ */
+static int
+ptlrpc_server_handle_req_in(struct ptlrpc_service_part *svcpt,
+			    struct ptlrpc_thread *thread)
+{
+	struct ptlrpc_service	*svc = svcpt->scp_service;
+	struct ptlrpc_request	*req;
+	__u32			deadline;
+	int			rc;
+	ENTRY;
+
+	spin_lock(&svcpt->scp_lock);
+	if (list_empty(&svcpt->scp_req_incoming)) {
+		spin_unlock(&svcpt->scp_lock);
+		RETURN(0);
+	}
+
+	/* take the request at the head of the incoming list */
+	req = list_entry(svcpt->scp_req_incoming.next,
+			     struct ptlrpc_request, rq_list);
+	list_del_init(&req->rq_list);
+	svcpt->scp_nreqs_incoming--;
+	/* Consider this still a "queued" request as far as stats are
+	 * concerned */
+	spin_unlock(&svcpt->scp_lock);
+
+	/* go through security check/transform */
+	rc = sptlrpc_svc_unwrap_request(req);
+	switch (rc) {
+	case SECSVC_OK:
+		break;
+	case SECSVC_COMPLETE:
+		/* a reply is sent right here; the request goes no further */
+		target_send_reply(req, 0, OBD_FAIL_MDS_ALL_REPLY_NET);
+		goto err_req;
+	case SECSVC_DROP:
+		goto err_req;
+	default:
+		LBUG();
+	}
+
+	/*
+	 * for null-flavored rpc, msg has been unpacked by sptlrpc, although
+	 * redo it wouldn't be harmful.
+	 */
+	if (SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) != SPTLRPC_POLICY_NULL) {
+		rc = ptlrpc_unpack_req_msg(req, req->rq_reqlen);
+		if (rc != 0) {
+			CERROR("error unpacking request: ptl %d from %s "
+			       "x"LPU64"\n", svc->srv_req_portal,
+			       libcfs_id2str(req->rq_peer), req->rq_xid);
+			goto err_req;
+		}
+	}
+
+	rc = lustre_unpack_req_ptlrpc_body(req, MSG_PTLRPC_BODY_OFF);
+	if (rc) {
+		CERROR ("error unpacking ptlrpc body: ptl %d from %s x"
+			LPU64"\n", svc->srv_req_portal,
+			libcfs_id2str(req->rq_peer), req->rq_xid);
+		goto err_req;
+	}
+
+	/* fault injection: drop requests whose opcode equals cfs_fail_val */
+	if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_DROP_REQ_OPC) &&
+	    lustre_msg_get_opc(req->rq_reqmsg) == cfs_fail_val) {
+		CERROR("drop incoming rpc opc %u, x"LPU64"\n",
+		       cfs_fail_val, req->rq_xid);
+		goto err_req;
+	}
+
+	rc = -EINVAL;
+	if (lustre_msg_get_type(req->rq_reqmsg) != PTL_RPC_MSG_REQUEST) {
+		CERROR("wrong packet type received (type=%u) from %s\n",
+		       lustre_msg_get_type(req->rq_reqmsg),
+		       libcfs_id2str(req->rq_peer));
+		goto err_req;
+	}
+
+	/* flag bulk transfers by opcode; other opcodes set neither flag */
+	switch(lustre_msg_get_opc(req->rq_reqmsg)) {
+	case MDS_WRITEPAGE:
+	case OST_WRITE:
+		req->rq_bulk_write = 1;
+		break;
+	case MDS_READPAGE:
+	case OST_READ:
+	case MGS_CONFIG_READ:
+		req->rq_bulk_read = 1;
+		break;
+	}
+
+	CDEBUG(D_RPCTRACE, "got req x"LPU64"\n", req->rq_xid);
+
+	/* look up the export from the handle carried in the message; a
+	 * request without one skips the export checks below */
+	req->rq_export = class_conn2export(
+		lustre_msg_get_handle(req->rq_reqmsg));
+	if (req->rq_export) {
+		rc = ptlrpc_check_req(req);
+		if (rc == 0) {
+			rc = sptlrpc_target_export_check(req->rq_export, req);
+			if (rc)
+				DEBUG_REQ(D_ERROR, req, "DROPPING req with "
+					  "illegal security flavor,");
+		}
+
+		if (rc)
+			goto err_req;
+		ptlrpc_update_export_timer(req->rq_export, 0);
+	}
+
+	/* req_in handling should/must be fast */
+	if (cfs_time_current_sec() - req->rq_arrival_time.tv_sec > 5)
+		DEBUG_REQ(D_WARNING, req, "Slow req_in handling "CFS_DURATION_T"s",
+			  cfs_time_sub(cfs_time_current_sec(),
+				       req->rq_arrival_time.tv_sec));
+
+	/* Set rpc server deadline and add it to the timed list */
+	deadline = (lustre_msghdr_get_flags(req->rq_reqmsg) &
+		    MSGHDR_AT_SUPPORT) ?
+		   /* The max time the client expects us to take */
+		   lustre_msg_get_timeout(req->rq_reqmsg) : obd_timeout;
+	req->rq_deadline = req->rq_arrival_time.tv_sec + deadline;
+	if (unlikely(deadline == 0)) {
+		DEBUG_REQ(D_ERROR, req, "Dropping request with 0 timeout");
+		goto err_req;
+	}
+
+	req->rq_svc_thread = thread;
+
+	/* return value deliberately ignored: a request that cannot be timed
+	 * is still queued for processing */
+	ptlrpc_at_add_timed(req);
+
+	/* Move it over to the request processing queue */
+	rc = ptlrpc_server_request_add(svcpt, req);
+	if (rc)
+		GOTO(err_req, rc);
+
+	wake_up(&svcpt->scp_waitq);
+	RETURN(1);
+
+err_req:
+	/* the request is finished here; 1 is still returned because a
+	 * request was taken off the incoming list */
+	ptlrpc_server_finish_request(svcpt, req);
+
+	RETURN(1);
+}
+
+/**
+ * Main incoming request handling logic.
+ * Calls handler function from service to do actual processing.
+ */
+static int
+ptlrpc_server_handle_request(struct ptlrpc_service_part *svcpt,
+			     struct ptlrpc_thread *thread)
+{
+	struct ptlrpc_service *svc = svcpt->scp_service;
+	struct ptlrpc_request *request;
+	struct timeval	 work_start;
+	struct timeval	 work_end;
+	long		   timediff;
+	int		    rc;
+	int		    fail_opc = 0;
+	ENTRY;
+
+	request = ptlrpc_server_request_get(svcpt, false);
+	if (request == NULL)
+		RETURN(0);
+
+	if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT))
+		fail_opc = OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT;
+	else if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_TIMEOUT))
+		fail_opc = OBD_FAIL_PTLRPC_HPREQ_TIMEOUT;
+
+	if (unlikely(fail_opc)) {
+		if (request->rq_export && request->rq_ops)
+			OBD_FAIL_TIMEOUT(fail_opc, 4);
+	}
+
+	ptlrpc_rqphase_move(request, RQ_PHASE_INTERPRET);
+
+	if(OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_DUMP_LOG))
+		libcfs_debug_dumplog();
+
+	do_gettimeofday(&work_start);
+	timediff = cfs_timeval_sub(&work_start, &request->rq_arrival_time,NULL);
+	if (likely(svc->srv_stats != NULL)) {
+		lprocfs_counter_add(svc->srv_stats, PTLRPC_REQWAIT_CNTR,
+				    timediff);
+		lprocfs_counter_add(svc->srv_stats, PTLRPC_REQQDEPTH_CNTR,
+				    svcpt->scp_nreqs_incoming);
+		lprocfs_counter_add(svc->srv_stats, PTLRPC_REQACTIVE_CNTR,
+				    svcpt->scp_nreqs_active);
+		lprocfs_counter_add(svc->srv_stats, PTLRPC_TIMEOUT,
+				    at_get(&svcpt->scp_at_estimate));
+	}
+
+	rc = lu_context_init(&request->rq_session, LCT_SESSION | LCT_NOREF);
+	if (rc) {
+		CERROR("Failure to initialize session: %d\n", rc);
+		goto out_req;
+	}
+	request->rq_session.lc_thread = thread;
+	request->rq_session.lc_cookie = 0x5;
+	lu_context_enter(&request->rq_session);
+
+	CDEBUG(D_NET, "got req "LPU64"\n", request->rq_xid);
+
+	request->rq_svc_thread = thread;
+	if (thread)
+		request->rq_svc_thread->t_env->le_ses = &request->rq_session;
+
+	if (likely(request->rq_export)) {
+		if (unlikely(ptlrpc_check_req(request)))
+			goto put_conn;
+		ptlrpc_update_export_timer(request->rq_export, timediff >> 19);
+	}
+
+	/* Discard requests queued for longer than the deadline.
+	   The deadline is increased if we send an early reply. */
+	if (cfs_time_current_sec() > request->rq_deadline) {
+		DEBUG_REQ(D_ERROR, request, "Dropping timed-out request from %s"
+			  ": deadline "CFS_DURATION_T":"CFS_DURATION_T"s ago\n",
+			  libcfs_id2str(request->rq_peer),
+			  cfs_time_sub(request->rq_deadline,
+			  request->rq_arrival_time.tv_sec),
+			  cfs_time_sub(cfs_time_current_sec(),
+			  request->rq_deadline));
+		goto put_conn;
+	}
+
+	CDEBUG(D_RPCTRACE, "Handling RPC pname:cluuid+ref:pid:xid:nid:opc "
+	       "%s:%s+%d:%d:x"LPU64":%s:%d\n", current_comm(),
+	       (request->rq_export ?
+		(char *)request->rq_export->exp_client_uuid.uuid : "0"),
+	       (request->rq_export ?
+		atomic_read(&request->rq_export->exp_refcount) : -99),
+	       lustre_msg_get_status(request->rq_reqmsg), request->rq_xid,
+	       libcfs_id2str(request->rq_peer),
+	       lustre_msg_get_opc(request->rq_reqmsg));
+
+	if (lustre_msg_get_opc(request->rq_reqmsg) != OBD_PING)
+		CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_PAUSE_REQ, cfs_fail_val);
+
+	rc = svc->srv_ops.so_req_handler(request);
+
+	ptlrpc_rqphase_move(request, RQ_PHASE_COMPLETE);
+
+put_conn:
+	lu_context_exit(&request->rq_session);
+	lu_context_fini(&request->rq_session);
+
+	if (unlikely(cfs_time_current_sec() > request->rq_deadline)) {
+		     DEBUG_REQ(D_WARNING, request, "Request took longer "
+			       "than estimated ("CFS_DURATION_T":"CFS_DURATION_T"s);"
+			       " client may timeout.",
+			       cfs_time_sub(request->rq_deadline,
+					    request->rq_arrival_time.tv_sec),
+			       cfs_time_sub(cfs_time_current_sec(),
+					    request->rq_deadline));
+	}
+
+	do_gettimeofday(&work_end);
+	timediff = cfs_timeval_sub(&work_end, &work_start, NULL);
+	CDEBUG(D_RPCTRACE, "Handled RPC pname:cluuid+ref:pid:xid:nid:opc "
+	       "%s:%s+%d:%d:x"LPU64":%s:%d Request procesed in "
+	       "%ldus (%ldus total) trans "LPU64" rc %d/%d\n",
+		current_comm(),
+		(request->rq_export ?
+		 (char *)request->rq_export->exp_client_uuid.uuid : "0"),
+		(request->rq_export ?
+		 atomic_read(&request->rq_export->exp_refcount) : -99),
+		lustre_msg_get_status(request->rq_reqmsg),
+		request->rq_xid,
+		libcfs_id2str(request->rq_peer),
+		lustre_msg_get_opc(request->rq_reqmsg),
+		timediff,
+		cfs_timeval_sub(&work_end, &request->rq_arrival_time, NULL),
+		(request->rq_repmsg ?
+		 lustre_msg_get_transno(request->rq_repmsg) :
+		 request->rq_transno),
+		request->rq_status,
+		(request->rq_repmsg ?
+		 lustre_msg_get_status(request->rq_repmsg) : -999));
+	if (likely(svc->srv_stats != NULL && request->rq_reqmsg != NULL)) {
+		__u32 op = lustre_msg_get_opc(request->rq_reqmsg);
+		int opc = opcode_offset(op);
+		if (opc > 0 && !(op == LDLM_ENQUEUE || op == MDS_REINT)) {
+			LASSERT(opc < LUSTRE_MAX_OPCODES);
+			lprocfs_counter_add(svc->srv_stats,
+					    opc + EXTRA_MAX_OPCODES,
+					    timediff);
+		}
+	}
+	if (unlikely(request->rq_early_count)) {
+		DEBUG_REQ(D_ADAPTTO, request,
+			  "sent %d early replies before finishing in "
+			  CFS_DURATION_T"s",
+			  request->rq_early_count,
+			  cfs_time_sub(work_end.tv_sec,
+			  request->rq_arrival_time.tv_sec));
+	}
+
+out_req:
+	ptlrpc_server_finish_active_request(svcpt, request);
+
+	RETURN(1);
+}
+
+/**
+ * An internal function to process a single reply state object.
+ *
+ * Detaches \a rs from its export's lists, releases the locks recorded in
+ * the reply state and, once the reply is no longer on the network, drops
+ * the export reference and the reply state reference.
+ *
+ * \param rs	reply state to handle; must be difficult, scheduled and
+ *		already removed from any rs_list (asserted below)
+ *
+ * \retval 1	always
+ */
+static int
+ptlrpc_handle_rs(struct ptlrpc_reply_state *rs)
+{
+	struct ptlrpc_service_part *svcpt = rs->rs_svcpt;
+	struct ptlrpc_service     *svc = svcpt->scp_service;
+	struct obd_export	 *exp;
+	int			nlocks;
+	int			been_handled;
+	ENTRY;
+
+	exp = rs->rs_export;
+
+	LASSERT (rs->rs_difficult);
+	LASSERT (rs->rs_scheduled);
+	LASSERT (list_empty(&rs->rs_list));
+
+	spin_lock(&exp->exp_lock);
+	/* Noop if removed already */
+	list_del_init (&rs->rs_exp_list);
+	spin_unlock(&exp->exp_lock);
+
+	/* The disk commit callback holds exp_uncommitted_replies_lock while it
+	 * iterates over newly committed replies, removing them from
+	 * exp_uncommitted_replies.  It then drops this lock and schedules the
+	 * replies it found for handling here.
+	 *
+	 * We can avoid contention for exp_uncommitted_replies_lock between the
+	 * HRT threads and further commit callbacks by checking rs_committed
+	 * which is set in the commit callback while it holds both
+	 * rs_lock and exp_uncommitted_replies_lock.
+	 *
+	 * If we see rs_committed clear, the commit callback _may_ not have
+	 * handled this reply yet and we race with it to grab
+	 * exp_uncommitted_replies_lock before removing the reply from
+	 * exp_uncommitted_replies.  Note that if we lose the race and the
+	 * reply has already been removed, list_del_init() is a noop.
+	 *
+	 * If we see rs_committed set, we know the commit callback is handling,
+	 * or has handled this reply since store reordering might allow us to
+	 * see rs_committed set out of sequence.  But since this is done
+	 * holding rs_lock, we can be sure it has all completed once we hold
+	 * rs_lock, which we do right next.
+	 */
+	if (!rs->rs_committed) {
+		spin_lock(&exp->exp_uncommitted_replies_lock);
+		list_del_init(&rs->rs_obd_list);
+		spin_unlock(&exp->exp_uncommitted_replies_lock);
+	}
+
+	spin_lock(&rs->rs_lock);
+
+	/* remember whether an earlier pass already handled this reply, then
+	 * mark it handled */
+	been_handled = rs->rs_handled;
+	rs->rs_handled = 1;
+
+	nlocks = rs->rs_nlocks;		 /* atomic "steal", but */
+	rs->rs_nlocks = 0;		      /* locks still on rs_locks! */
+
+	if (nlocks == 0 && !been_handled) {
+		/* If we see this, we should already have seen the warning
+		 * in mds_steal_ack_locks()  */
+		CDEBUG(D_HA, "All locks stolen from rs %p x"LPD64".t"LPD64
+		       " o%d NID %s\n",
+		       rs,
+		       rs->rs_xid, rs->rs_transno, rs->rs_opc,
+		       libcfs_nid2str(exp->exp_connection->c_peer.nid));
+	}
+
+	if ((!been_handled && rs->rs_on_net) || nlocks > 0) {
+		/* rs_lock is dropped around the MD unlink and the lock
+		 * releases, and retaken afterwards */
+		spin_unlock(&rs->rs_lock);
+
+		if (!been_handled && rs->rs_on_net) {
+			LNetMDUnlink(rs->rs_md_h);
+			/* Ignore return code; we're racing with completion */
+		}
+
+		/* release the stolen locks, last one first */
+		while (nlocks-- > 0)
+			ldlm_lock_decref(&rs->rs_locks[nlocks],
+					 rs->rs_modes[nlocks]);
+
+		spin_lock(&rs->rs_lock);
+	}
+
+	rs->rs_scheduled = 0;
+
+	if (!rs->rs_on_net) {
+		/* Off the net */
+		spin_unlock(&rs->rs_lock);
+
+		class_export_put (exp);
+		rs->rs_export = NULL;
+		ptlrpc_rs_decref (rs);
+		/* last difficult reply gone while the service is stopping:
+		 * wake whoever sleeps on scp_waitq */
+		if (atomic_dec_and_test(&svcpt->scp_nreps_difficult) &&
+		    svc->srv_is_stopping)
+			wake_up_all(&svcpt->scp_waitq);
+		RETURN(1);
+	}
+
+	/* still on the net; callback will schedule */
+	spin_unlock(&rs->rs_lock);
+	RETURN(1);
+}
+
+
+/**
+ * Grow the request buffer pool of \a svcpt when the number of posted
+ * buffers is at or below the low-water mark, and record the number of
+ * posted buffers in the service statistics when they exist.
+ */
+static void
+ptlrpc_check_rqbd_pool(struct ptlrpc_service_part *svcpt)
+{
+	/* NB I'm not locking; just looking. */
+	int posted = svcpt->scp_nrqbds_posted;
+	int threshold = 0;
+
+	/* the mark stays at zero while buffer pressure is being tested */
+	if (!test_req_buffer_pressure)
+		threshold = svcpt->scp_service->srv_nbuf_per_group / 2;
+
+	/* CAVEAT EMPTOR: We might be allocating buffers here because we've
+	 * allowed the request history to grow out of control.  We could put a
+	 * sanity check on that here and cull some history if we need the
+	 * space. */
+	if (posted <= threshold)
+		ptlrpc_grow_req_bufs(svcpt, 1);
+
+	if (!svcpt->scp_service->srv_stats)
+		return;
+
+	lprocfs_counter_add(svcpt->scp_service->srv_stats,
+			    PTLRPC_REQBUF_AVAIL_CNTR, posted);
+}
+
+/**
+ * Timeout callback handed to LWI_TIMEOUT() by ptlrpc_wait_event().
+ * Clears the partition's request-buffer retry timeout.
+ *
+ * \param arg	the struct ptlrpc_service_part being waited on
+ *
+ * \retval -ETIMEDOUT	always
+ */
+static int
+ptlrpc_retry_rqbds(void *arg)
+{
+	struct ptlrpc_service_part *part = arg;
+
+	part->scp_rqbd_timeout = 0;
+
+	return -ETIMEDOUT;
+}
+
+/**
+ * Non-zero when the number of active requests is below the number of
+ * running threads less one, and less one more when the service has a
+ * high-priority request handler.
+ */
+static inline int
+ptlrpc_threads_enough(struct ptlrpc_service_part *svcpt)
+{
+	int spare = svcpt->scp_nthrs_running - 1;
+
+	if (svcpt->scp_service->srv_ops.so_hpreq_handler != NULL)
+		spare = spare - 1;
+
+	return svcpt->scp_nreqs_active < spare;
+}
+
+/**
+ * Non-zero while running plus starting threads are still below the
+ * per-partition thread limit, i.e. more threads may be created.
+ * The caller may use it without any lock, but must hold
+ * ptlrpc_service_part::scp_lock to get a reliable result.
+ */
+static inline int
+ptlrpc_threads_increasable(struct ptlrpc_service_part *svcpt)
+{
+	int nthrs = svcpt->scp_nthrs_running + svcpt->scp_nthrs_starting;
+
+	return nthrs < svcpt->scp_service->srv_nthrs_cpt_limit;
+}
+
+/**
+ * Non-zero when there are not enough threads for the current load and
+ * the partition is still allowed to create more.
+ */
+static inline int
+ptlrpc_threads_need_create(struct ptlrpc_service_part *svcpt)
+{
+	if (ptlrpc_threads_enough(svcpt))
+		return 0;
+
+	return ptlrpc_threads_increasable(svcpt) != 0;
+}
+
+/**
+ * Non-zero when \a thread itself is flagged as stopping or when the
+ * service it belongs to is stopping.
+ */
+static inline int
+ptlrpc_thread_stopping(struct ptlrpc_thread *thread)
+{
+	if (thread_is_stopping(thread))
+		return 1;
+
+	return thread->t_svcpt->scp_service->srv_is_stopping != 0;
+}
+
+/**
+ * Non-zero when the partition has idle request buffers and no retry
+ * timeout is currently set for reposting them.
+ */
+static inline int
+ptlrpc_rqbd_pending(struct ptlrpc_service_part *svcpt)
+{
+	if (list_empty(&svcpt->scp_rqbd_idle))
+		return 0;
+
+	return svcpt->scp_rqbd_timeout == 0;
+}
+
+/**
+ * Return the partition's scp_at_check flag.
+ */
+static inline int
+ptlrpc_at_check(struct ptlrpc_service_part *svcpt)
+{
+	int at_check = svcpt->scp_at_check;
+
+	return at_check;
+}
+
+/**
+ * Non-zero when requests are waiting on the incoming (preprocessing) list.
+ * The caller may use it without any lock, but must hold
+ * ptlrpc_service_part::scp_lock to get a reliable result.
+ */
+static inline int
+ptlrpc_server_request_incoming(struct ptlrpc_service_part *svcpt)
+{
+	return list_empty(&svcpt->scp_req_incoming) ? 0 : 1;
+}
+
+/**
+ * Put a service thread to sleep on the partition wait queue until there
+ * is something for it to do or it is asked to stop.
+ *
+ * \param svcpt		service partition the thread serves
+ * \param thread	the calling service thread
+ *
+ * \retval 0		woken with work to look at
+ * \retval -EINTR	the thread or its service is stopping
+ */
+static __attribute__((__noinline__)) int
+ptlrpc_wait_event(struct ptlrpc_service_part *svcpt,
+		  struct ptlrpc_thread *thread)
+{
+	/* Don't exit while there are replies to be handled */
+	/* the wait uses scp_rqbd_timeout as its timeout and
+	 * ptlrpc_retry_rqbds() as the timeout callback */
+	struct l_wait_info lwi = LWI_TIMEOUT(svcpt->scp_rqbd_timeout,
+					     ptlrpc_retry_rqbds, svcpt);
+
+	/* the watchdog is disabled for the duration of the sleep */
+	lc_watchdog_disable(thread->t_watchdog);
+
+	cond_resched();
+
+	/* wake on: stop request, incoming requests, pending requests,
+	 * idle request buffers ready to repost, or an adaptive-timeout
+	 * check being due */
+	l_wait_event_exclusive_head(svcpt->scp_waitq,
+				ptlrpc_thread_stopping(thread) ||
+				ptlrpc_server_request_incoming(svcpt) ||
+				ptlrpc_server_request_pending(svcpt, false) ||
+				ptlrpc_rqbd_pending(svcpt) ||
+				ptlrpc_at_check(svcpt), &lwi);
+
+	if (ptlrpc_thread_stopping(thread))
+		return -EINTR;
+
+	/* re-arm the watchdog with the current service timeout */
+	lc_watchdog_touch(thread->t_watchdog,
+			  ptlrpc_server_get_timeout(svcpt));
+	return 0;
+}
+
+/**
+ * Main thread body for service threads.
+ * Waits in a loop waiting for new requests to process to appear.
+ * Every time an incoming requests is added to its queue, a waitq
+ * is woken up and one of the threads will handle it.
+ *
+ * \param arg	the struct ptlrpc_thread describing this thread
+ *
+ * \retval 0		the thread ran and was asked to stop
+ * \retval negative	start-up failed; the same value is stored in
+ *			thread->t_id for ptlrpc_start_thread() to read
+ */
+static int ptlrpc_main(void *arg)
+{
+	struct ptlrpc_thread		*thread = (struct ptlrpc_thread *)arg;
+	struct ptlrpc_service_part	*svcpt = thread->t_svcpt;
+	struct ptlrpc_service		*svc = svcpt->scp_service;
+	struct ptlrpc_reply_state	*rs;
+#ifdef WITH_GROUP_INFO
+	group_info_t *ginfo = NULL;
+#endif
+	struct lu_env *env;
+	int counter = 0, rc = 0;
+	ENTRY;
+
+	thread->t_pid = current_pid();
+	unshare_fs_struct();
+
+	/* NB: we will call cfs_cpt_bind() for all threads, because we
+	 * might want to run lustre server only on a subset of system CPUs,
+	 * in that case ->scp_cpt is CFS_CPT_ANY */
+	rc = cfs_cpt_bind(svc->srv_cptable, svcpt->scp_cpt);
+	if (rc != 0) {
+		/* a binding failure is only warned about; start-up goes on */
+		CWARN("%s: failed to bind %s on CPT %d\n",
+		      svc->srv_name, thread->t_name, svcpt->scp_cpt);
+	}
+
+#ifdef WITH_GROUP_INFO
+	ginfo = groups_alloc(0);
+	if (!ginfo) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	set_current_groups(ginfo);
+	put_group_info(ginfo);
+#endif
+
+	/* optional service-specific per-thread initialisation */
+	if (svc->srv_ops.so_thr_init != NULL) {
+		rc = svc->srv_ops.so_thr_init(thread);
+		if (rc)
+			goto out;
+	}
+
+	OBD_ALLOC_PTR(env);
+	if (env == NULL) {
+		rc = -ENOMEM;
+		goto out_srv_fini;
+	}
+
+	/* NOTE(review): on failure here the cleanup at out_srv_fini still
+	 * calls lu_context_fini() on this context - confirm that is safe
+	 * after a failed lu_context_init() */
+	rc = lu_context_init(&env->le_ctx,
+			     svc->srv_ctx_tags|LCT_REMEMBER|LCT_NOREF);
+	if (rc)
+		goto out_srv_fini;
+
+	thread->t_env = env;
+	env->le_ctx.lc_thread = thread;
+	env->le_ctx.lc_cookie = 0x6;
+
+	/* post every idle request buffer before going live; a posting
+	 * failure aborts the thread start */
+	while (!list_empty(&svcpt->scp_rqbd_idle)) {
+		rc = ptlrpc_server_post_idle_rqbds(svcpt);
+		if (rc >= 0)
+			continue;
+
+		CERROR("Failed to post rqbd for %s on CPT %d: %d\n",
+			svc->srv_name, svcpt->scp_cpt, rc);
+		goto out_srv_fini;
+	}
+
+	/* Alloc reply state structure for this one */
+	OBD_ALLOC_LARGE(rs, svc->srv_max_reply_size);
+	if (!rs) {
+		rc = -ENOMEM;
+		goto out_srv_fini;
+	}
+
+	spin_lock(&svcpt->scp_lock);
+
+	LASSERT(thread_is_starting(thread));
+	thread_clear_flags(thread, SVC_STARTING);
+
+	LASSERT(svcpt->scp_nthrs_starting == 1);
+	svcpt->scp_nthrs_starting--;
+
+	/* SVC_STOPPING may already be set here if someone else is trying
+	 * to stop the service while this new thread has been dynamically
+	 * forked. We still set SVC_RUNNING to let our creator know that
+	 * we are now running, however we will exit as soon as possible */
+	thread_add_flags(thread, SVC_RUNNING);
+	svcpt->scp_nthrs_running++;
+	spin_unlock(&svcpt->scp_lock);
+
+	/* wake up our creator in case he's still waiting. */
+	wake_up(&thread->t_ctl_waitq);
+
+	thread->t_watchdog = lc_watchdog_add(ptlrpc_server_get_timeout(svcpt),
+					     NULL, NULL);
+
+	/* hand the preallocated reply state to the partition's idle list
+	 * and wake anyone waiting for one */
+	spin_lock(&svcpt->scp_rep_lock);
+	list_add(&rs->rs_list, &svcpt->scp_rep_idle);
+	wake_up(&svcpt->scp_rep_waitq);
+	spin_unlock(&svcpt->scp_rep_lock);
+
+	CDEBUG(D_NET, "service thread %d (#%d) started\n", thread->t_id,
+	       svcpt->scp_nthrs_running);
+
+	/* XXX maintain a list of all managed devices: insert here */
+	while (!ptlrpc_thread_stopping(thread)) {
+		/* non-zero means we were asked to stop while waiting */
+		if (ptlrpc_wait_event(svcpt, thread))
+			break;
+
+		ptlrpc_check_rqbd_pool(svcpt);
+
+		if (ptlrpc_threads_need_create(svcpt)) {
+			/* Ignore return code - we tried... */
+			ptlrpc_start_thread(svcpt, 0);
+		}
+
+		/* Process all incoming reqs before handling any */
+		if (ptlrpc_server_request_incoming(svcpt)) {
+			lu_context_enter(&env->le_ctx);
+			env->le_ses = NULL;
+			ptlrpc_server_handle_req_in(svcpt, thread);
+			lu_context_exit(&env->le_ctx);
+
+			/* but limit ourselves in case of flood */
+			if (counter++ < 100)
+				continue;
+			counter = 0;
+		}
+
+		if (ptlrpc_at_check(svcpt))
+			ptlrpc_at_check_timed(svcpt);
+
+		if (ptlrpc_server_request_pending(svcpt, false)) {
+			lu_context_enter(&env->le_ctx);
+			ptlrpc_server_handle_request(svcpt, thread);
+			lu_context_exit(&env->le_ctx);
+		}
+
+		if (ptlrpc_rqbd_pending(svcpt) &&
+		    ptlrpc_server_post_idle_rqbds(svcpt) < 0) {
+			/* I just failed to repost request buffers.
+			 * Wait for a timeout (unless something else
+			 * happens) before I try again */
+			svcpt->scp_rqbd_timeout = cfs_time_seconds(1) / 10;
+			CDEBUG(D_RPCTRACE, "Posted buffers: %d\n",
+			       svcpt->scp_nrqbds_posted);
+		}
+	}
+
+	lc_watchdog_delete(thread->t_watchdog);
+	thread->t_watchdog = NULL;
+
+out_srv_fini:
+	/*
+	 * deconstruct service specific state created by ptlrpc_start_thread()
+	 */
+	if (svc->srv_ops.so_thr_done != NULL)
+		svc->srv_ops.so_thr_done(thread);
+
+	if (env != NULL) {
+		lu_context_fini(&env->le_ctx);
+		OBD_FREE_PTR(env);
+	}
+out:
+	CDEBUG(D_RPCTRACE, "service thread [ %p : %u ] %d exiting: rc %d\n",
+	       thread, thread->t_pid, thread->t_id, rc);
+
+	spin_lock(&svcpt->scp_lock);
+	/* undo whichever of the starting/running counts still includes us */
+	if (thread_test_and_clear_flags(thread, SVC_STARTING))
+		svcpt->scp_nthrs_starting--;
+
+	if (thread_test_and_clear_flags(thread, SVC_RUNNING)) {
+		/* must know immediately */
+		svcpt->scp_nthrs_running--;
+	}
+
+	/* t_id is reused to carry the exit code back to whoever waits on
+	 * t_ctl_waitq (see ptlrpc_start_thread()) */
+	thread->t_id = rc;
+	thread_add_flags(thread, SVC_STOPPED);
+
+	wake_up(&thread->t_ctl_waitq);
+	spin_unlock(&svcpt->scp_lock);
+
+	return rc;
+}
+
+/**
+ * Wait condition for a reply-handling thread.
+ *
+ * Moves everything queued on \a hrt onto \a replies under hrt_lock.
+ *
+ * \retval non-zero	the thread must not sleep: either reply handling
+ *			is stopping or \a replies is not empty
+ * \retval 0		nothing to do yet
+ */
+static int hrt_dont_sleep(struct ptlrpc_hr_thread *hrt,
+			  struct list_head *replies)
+{
+	int stay_awake = 1;
+
+	spin_lock(&hrt->hrt_lock);
+
+	list_splice_init(&hrt->hrt_queue, replies);
+	if (!ptlrpc_hr.hr_stopping && list_empty(replies))
+		stay_awake = 0;
+
+	spin_unlock(&hrt->hrt_lock);
+
+	return stay_awake;
+}
+
+/**
+ * Main body of the "handle reply" threads.
+ *
+ * Binds the thread to its CPT, announces that it has started, then keeps
+ * draining the reply states queued on it until reply handling is stopped.
+ *
+ * \param arg	the struct ptlrpc_hr_thread describing this thread
+ *
+ * \retval 0	always
+ */
+static int ptlrpc_hr_main(void *arg)
+{
+	struct ptlrpc_hr_thread		*hrt = arg;
+	struct ptlrpc_hr_partition	*hrp = hrt->hrt_partition;
+	struct ptlrpc_reply_state	*rs;
+	LIST_HEAD			(replies);
+	char				threadname[20];
+	int				rc;
+
+	/* the name is only used by the warning below */
+	snprintf(threadname, sizeof(threadname), "ptlrpc_hr%02d_%03d",
+		 hrp->hrp_cpt, hrt->hrt_id);
+	unshare_fs_struct();
+
+	rc = cfs_cpt_bind(ptlrpc_hr.hr_cpt_table, hrp->hrp_cpt);
+	if (rc)
+		CWARN("Failed to bind %s on CPT %d of CPT table %p: rc = %d\n",
+		      threadname, hrp->hrp_cpt, ptlrpc_hr.hr_cpt_table, rc);
+
+	/* let ptlrpc_start_hr_threads() know this thread is up */
+	atomic_inc(&hrp->hrp_nstarted);
+	wake_up(&ptlrpc_hr.hr_waitq);
+
+	for (;;) {
+		if (ptlrpc_hr.hr_stopping)
+			break;
+
+		l_wait_condition(hrt->hrt_waitq, hrt_dont_sleep(hrt, &replies));
+
+		/* take reply states from the tail of the private list */
+		while (!list_empty(&replies)) {
+			rs = list_entry(replies.prev,
+					struct ptlrpc_reply_state, rs_list);
+			list_del_init(&rs->rs_list);
+			ptlrpc_handle_rs(rs);
+		}
+	}
+
+	/* let ptlrpc_stop_hr_threads() know this thread is gone */
+	atomic_inc(&hrp->hrp_nstopped);
+	wake_up(&ptlrpc_hr.hr_waitq);
+
+	return 0;
+}
+
+/**
+ * Ask every reply-handling thread to stop and wait, partition by
+ * partition, until as many threads have stopped as were started.
+ */
+static void ptlrpc_stop_hr_threads(void)
+{
+	struct ptlrpc_hr_partition	*part;
+	int				cpt;
+	int				idx;
+
+	ptlrpc_hr.hr_stopping = 1;
+
+	/* first pass: kick every thread out of its wait */
+	cfs_percpt_for_each(part, cpt, ptlrpc_hr.hr_partitions) {
+		if (part->hrp_thrs == NULL)
+			continue; /* uninitialized */
+
+		idx = 0;
+		while (idx < part->hrp_nthrs) {
+			wake_up_all(&part->hrp_thrs[idx].hrt_waitq);
+			idx++;
+		}
+	}
+
+	/* second pass: wait for the threads of each partition to exit */
+	cfs_percpt_for_each(part, cpt, ptlrpc_hr.hr_partitions) {
+		if (part->hrp_thrs == NULL)
+			continue; /* uninitialized */
+
+		wait_event(ptlrpc_hr.hr_waitq,
+			   atomic_read(&part->hrp_nstopped) ==
+			   atomic_read(&part->hrp_nstarted));
+	}
+}
+
+/**
+ * Start the reply-handling threads of every partition.
+ *
+ * For each partition the threads are created one by one; the function
+ * then waits until every successfully created thread has announced
+ * itself through hrp_nstarted.  If a thread cannot be created, all
+ * reply-handling threads are stopped again.
+ *
+ * \retval 0		all threads started
+ * \retval negative	error returned by kthread_run()
+ */
+static int ptlrpc_start_hr_threads(void)
+{
+	struct ptlrpc_hr_partition	*hrp;
+	int				i;
+	int				j;
+	ENTRY;
+
+	cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) {
+		int	rc = 0;
+
+		for (j = 0; j < hrp->hrp_nthrs; j++) {
+			struct ptlrpc_hr_thread	*hrt = &hrp->hrp_thrs[j];
+			struct task_struct	*task;
+
+			/* Test the returned pointer itself: squeezing it
+			 * through an int first can make a valid task
+			 * pointer look like an errno on 64-bit. */
+			task = kthread_run(ptlrpc_hr_main, hrt,
+					   "ptlrpc_hr%02d_%03d",
+					   hrp->hrp_cpt,
+					   hrt->hrt_id);
+			if (IS_ERR(task)) {
+				rc = PTR_ERR(task);
+				break;
+			}
+		}
+		/* j is the number of threads actually created */
+		wait_event(ptlrpc_hr.hr_waitq,
+			       atomic_read(&hrp->hrp_nstarted) == j);
+		if (rc == 0)
+			continue;
+
+		CERROR("Reply handling thread %d:%d Failed on starting: "
+		       "rc = %d\n", i, j, rc);
+		ptlrpc_stop_hr_threads();
+		RETURN(rc);
+	}
+	RETURN(0);
+}
+
+/**
+ * Stop and free every service thread of partition \a svcpt.
+ *
+ * All threads are flagged SVC_STOPPING and woken; the function then
+ * waits for each one to reach the stopped state, collects them on a
+ * private list and frees their descriptors.
+ */
+static void ptlrpc_svcpt_stop_threads(struct ptlrpc_service_part *svcpt)
+{
+	struct l_wait_info	lwi = { 0 };
+	struct ptlrpc_thread	*thread;
+	LIST_HEAD		(zombie);
+
+	ENTRY;
+
+	CDEBUG(D_INFO, "Stopping threads for service %s\n",
+	       svcpt->scp_service->srv_name);
+
+	spin_lock(&svcpt->scp_lock);
+	/* let the thread know that we would like it to stop asap */
+	list_for_each_entry(thread, &svcpt->scp_threads, t_link) {
+		CDEBUG(D_INFO, "Stopping thread %s #%u\n",
+		       svcpt->scp_service->srv_thread_name, thread->t_id);
+		thread_add_flags(thread, SVC_STOPPING);
+	}
+
+	wake_up_all(&svcpt->scp_waitq);
+
+	/* always look at the list head: stopped threads are moved to the
+	 * zombie list, others are waited for with scp_lock dropped */
+	while (!list_empty(&svcpt->scp_threads)) {
+		thread = list_entry(svcpt->scp_threads.next,
+					struct ptlrpc_thread, t_link);
+		if (thread_is_stopped(thread)) {
+			list_del(&thread->t_link);
+			list_add(&thread->t_link, &zombie);
+			continue;
+		}
+		spin_unlock(&svcpt->scp_lock);
+
+		CDEBUG(D_INFO, "waiting for stopping-thread %s #%u\n",
+		       svcpt->scp_service->srv_thread_name, thread->t_id);
+		l_wait_event(thread->t_ctl_waitq,
+			     thread_is_stopped(thread), &lwi);
+
+		spin_lock(&svcpt->scp_lock);
+	}
+
+	spin_unlock(&svcpt->scp_lock);
+
+	/* the descriptors are freed outside scp_lock */
+	while (!list_empty(&zombie)) {
+		thread = list_entry(zombie.next,
+					struct ptlrpc_thread, t_link);
+		list_del(&thread->t_link);
+		OBD_FREE_PTR(thread);
+	}
+	EXIT;
+}
+
+/**
+ * Stop the threads of every partition of service \a svc.
+ * Partitions whose scp_service pointer is not set are skipped.
+ */
+void ptlrpc_stop_all_threads(struct ptlrpc_service *svc)
+{
+	struct ptlrpc_service_part *part;
+	int			   idx;
+	ENTRY;
+
+	ptlrpc_service_for_each_part(part, idx, svc) {
+		if (part->scp_service == NULL)
+			continue;
+
+		ptlrpc_svcpt_stop_threads(part);
+	}
+
+	EXIT;
+}
+EXPORT_SYMBOL(ptlrpc_stop_all_threads);
+
+/**
+ * Start the initial set of threads (srv_nthrs_cpt_init per partition)
+ * for service \a svc.
+ *
+ * \retval 0		threads started; -EMFILE from a partition only
+ *			ends the start-up for that partition
+ * \retval negative	any other error from ptlrpc_start_thread(); all
+ *			threads of the service are stopped again
+ */
+int ptlrpc_start_threads(struct ptlrpc_service *svc)
+{
+	int	cpt;
+	int	nthr = 0;
+	int	rc = 0;
+	ENTRY;
+
+	/* We require 2 threads min, see note in ptlrpc_server_handle_request */
+	LASSERT(svc->srv_nthrs_cpt_init >= PTLRPC_NTHRS_INIT);
+
+	for (cpt = 0; cpt < svc->srv_ncpts; cpt++) {
+		nthr = 0;
+		while (nthr < svc->srv_nthrs_cpt_init) {
+			rc = ptlrpc_start_thread(svc->srv_parts[cpt], 1);
+			/* We have enough threads, don't start more. b=15759 */
+			if (rc == -EMFILE)
+				break;
+
+			if (rc != 0)
+				goto failed;
+
+			nthr++;
+		}
+	}
+
+	RETURN(0);
+
+failed:
+	CERROR("cannot start %s thread #%d_%d: rc %d\n",
+	       svc->srv_thread_name, cpt, nthr, rc);
+	ptlrpc_stop_all_threads(svc);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(ptlrpc_start_threads);
+
+/**
+ * Start one more service thread on partition \a svcpt.
+ *
+ * Thread creation is serialized per partition: only one thread may be in
+ * the starting state at a time.
+ *
+ * \param svcpt	partition to start the thread on
+ * \param wait	non-zero: retry while another thread is starting and
+ *		wait until the new thread is running or has stopped;
+ *		zero: return straight after creating the thread
+ *
+ * \retval 0		thread created (and running, when \a wait is set)
+ * \retval -ESRCH	the service is stopping
+ * \retval -EMFILE	the partition already has as many threads as allowed
+ * \retval -ENOMEM	the thread descriptor could not be allocated
+ * \retval -EAGAIN	another thread is starting and \a wait is zero
+ * \retval other	error from kthread_run(), or the exit code the new
+ *			thread left in t_id when it stopped during start-up
+ */
+int ptlrpc_start_thread(struct ptlrpc_service_part *svcpt, int wait)
+{
+	struct l_wait_info	lwi = { 0 };
+	struct ptlrpc_thread	*thread;
+	struct ptlrpc_service	*svc;
+	struct task_struct	*task;
+	int			rc;
+	ENTRY;
+
+	LASSERT(svcpt != NULL);
+
+	svc = svcpt->scp_service;
+
+	CDEBUG(D_RPCTRACE, "%s[%d] started %d min %d max %d\n",
+	       svc->srv_name, svcpt->scp_cpt, svcpt->scp_nthrs_running,
+	       svc->srv_nthrs_cpt_init, svc->srv_nthrs_cpt_limit);
+
+ again:
+	if (unlikely(svc->srv_is_stopping))
+		RETURN(-ESRCH);
+
+	/* unlocked pre-check; repeated under scp_lock below */
+	if (!ptlrpc_threads_increasable(svcpt) ||
+	    (OBD_FAIL_CHECK(OBD_FAIL_TGT_TOOMANY_THREADS) &&
+	     svcpt->scp_nthrs_running == svc->srv_nthrs_cpt_init - 1))
+		RETURN(-EMFILE);
+
+	OBD_CPT_ALLOC_PTR(thread, svc->srv_cptable, svcpt->scp_cpt);
+	if (thread == NULL)
+		RETURN(-ENOMEM);
+	init_waitqueue_head(&thread->t_ctl_waitq);
+
+	spin_lock(&svcpt->scp_lock);
+	if (!ptlrpc_threads_increasable(svcpt)) {
+		spin_unlock(&svcpt->scp_lock);
+		OBD_FREE_PTR(thread);
+		RETURN(-EMFILE);
+	}
+
+	if (svcpt->scp_nthrs_starting != 0) {
+		/* serialize starting because some modules (obdfilter)
+		 * might require unique and contiguous t_id */
+		LASSERT(svcpt->scp_nthrs_starting == 1);
+		spin_unlock(&svcpt->scp_lock);
+		OBD_FREE_PTR(thread);
+		if (wait) {
+			CDEBUG(D_INFO, "Waiting for creating thread %s #%d\n",
+			       svc->srv_thread_name, svcpt->scp_thr_nextid);
+			schedule();
+			goto again;
+		}
+
+		CDEBUG(D_INFO, "Creating thread %s #%d race, retry later\n",
+		       svc->srv_thread_name, svcpt->scp_thr_nextid);
+		RETURN(-EAGAIN);
+	}
+
+	svcpt->scp_nthrs_starting++;
+	thread->t_id = svcpt->scp_thr_nextid++;
+	thread_add_flags(thread, SVC_STARTING);
+	thread->t_svcpt = svcpt;
+
+	list_add(&thread->t_link, &svcpt->scp_threads);
+	spin_unlock(&svcpt->scp_lock);
+
+	if (svcpt->scp_cpt >= 0) {
+		snprintf(thread->t_name, PTLRPC_THR_NAME_LEN, "%s%02d_%03d",
+			 svc->srv_thread_name, svcpt->scp_cpt, thread->t_id);
+	} else {
+		snprintf(thread->t_name, PTLRPC_THR_NAME_LEN, "%s_%04d",
+			 svc->srv_thread_name, thread->t_id);
+	}
+
+	CDEBUG(D_RPCTRACE, "starting thread '%s'\n", thread->t_name);
+	/* The name goes through "%s" so it is never parsed as a format
+	 * string.  The result is checked as a pointer: squeezing it
+	 * through an int first can make a valid task pointer look like
+	 * an errno on 64-bit. */
+	task = kthread_run(ptlrpc_main, thread, "%s", thread->t_name);
+	if (IS_ERR(task)) {
+		rc = PTR_ERR(task);
+		CERROR("cannot start thread '%s': rc %d\n",
+		       thread->t_name, rc);
+		/* undo the bookkeeping done above */
+		spin_lock(&svcpt->scp_lock);
+		list_del(&thread->t_link);
+		--svcpt->scp_nthrs_starting;
+		spin_unlock(&svcpt->scp_lock);
+
+		OBD_FREE(thread, sizeof(*thread));
+		RETURN(rc);
+	}
+
+	if (!wait)
+		RETURN(0);
+
+	l_wait_event(thread->t_ctl_waitq,
+		     thread_is_running(thread) || thread_is_stopped(thread),
+		     &lwi);
+
+	/* a thread that stopped during start-up left its exit code in t_id */
+	rc = thread_is_stopped(thread) ? thread->t_id : 0;
+	RETURN(rc);
+}
+
+/**
+ * Set up the global reply-handling state: allocate the per-CPT
+ * partitions, size and initialise each partition's thread array, then
+ * start the threads.  On any failure after the partitions have been
+ * allocated, everything is torn down again through ptlrpc_hr_fini().
+ *
+ * \retval 0		success
+ * \retval -ENOMEM	an allocation failed
+ * \retval other	error from ptlrpc_start_hr_threads()
+ */
+int ptlrpc_hr_init(void)
+{
+	struct ptlrpc_hr_partition	*part;
+	struct ptlrpc_hr_thread		*thr;
+	int				rc;
+	int				cpt;
+	int				n;
+	ENTRY;
+
+	memset(&ptlrpc_hr, 0, sizeof(ptlrpc_hr));
+	ptlrpc_hr.hr_cpt_table = cfs_cpt_table;
+
+	ptlrpc_hr.hr_partitions = cfs_percpt_alloc(ptlrpc_hr.hr_cpt_table,
+						   sizeof(*part));
+	if (ptlrpc_hr.hr_partitions == NULL)
+		RETURN(-ENOMEM);
+
+	init_waitqueue_head(&ptlrpc_hr.hr_waitq);
+
+	cfs_percpt_for_each(part, cpt, ptlrpc_hr.hr_partitions) {
+		part->hrp_cpt = cpt;
+
+		atomic_set(&part->hrp_nstarted, 0);
+		atomic_set(&part->hrp_nstopped, 0);
+
+		/* thread count: CPT weight divided by the HT sibling count */
+		part->hrp_nthrs = cfs_cpt_weight(ptlrpc_hr.hr_cpt_table, cpt);
+		part->hrp_nthrs /= cfs_cpu_ht_nsiblings(0);
+
+		LASSERT(part->hrp_nthrs > 0);
+		OBD_CPT_ALLOC(part->hrp_thrs, ptlrpc_hr.hr_cpt_table, cpt,
+			      part->hrp_nthrs * sizeof(*thr));
+		if (part->hrp_thrs == NULL)
+			GOTO(out, rc = -ENOMEM);
+
+		n = 0;
+		while (n < part->hrp_nthrs) {
+			thr = &part->hrp_thrs[n];
+
+			thr->hrt_id = n;
+			thr->hrt_partition = part;
+			init_waitqueue_head(&thr->hrt_waitq);
+			spin_lock_init(&thr->hrt_lock);
+			INIT_LIST_HEAD(&thr->hrt_queue);
+			n++;
+		}
+	}
+
+	rc = ptlrpc_start_hr_threads();
+out:
+	if (rc != 0)
+		ptlrpc_hr_fini();
+	RETURN(rc);
+}
+
+/**
+ * Tear down the global reply-handling state: stop the threads, free
+ * each partition's thread array and free the partitions themselves.
+ * Does nothing when the partitions were never allocated.
+ */
+void ptlrpc_hr_fini(void)
+{
+	struct ptlrpc_hr_partition	*part;
+	int				cpt;
+
+	if (ptlrpc_hr.hr_partitions == NULL)
+		return;
+
+	ptlrpc_stop_hr_threads();
+
+	cfs_percpt_for_each(part, cpt, ptlrpc_hr.hr_partitions) {
+		if (part->hrp_thrs == NULL)
+			continue;
+
+		OBD_FREE(part->hrp_thrs,
+			 part->hrp_nthrs * sizeof(part->hrp_thrs[0]));
+	}
+
+	cfs_percpt_free(ptlrpc_hr.hr_partitions);
+	ptlrpc_hr.hr_partitions = NULL;
+}
+
+
+/**
+ * Wait until all already scheduled replies are processed, i.e. until the
+ * partition's count of difficult replies drops to zero.  A warning is
+ * printed each time a 10 second wait ends unsuccessfully.
+ */
+static void ptlrpc_wait_replies(struct ptlrpc_service_part *svcpt)
+{
+	int rc;
+
+	do {
+		struct l_wait_info lwi = LWI_TIMEOUT(cfs_time_seconds(10),
+						     NULL, NULL);
+
+		rc = l_wait_event(svcpt->scp_waitq,
+		     atomic_read(&svcpt->scp_nreps_difficult) == 0, &lwi);
+		if (rc != 0)
+			CWARN("Unexpectedly long timeout %s %p\n",
+			      svcpt->scp_service->srv_name,
+			      svcpt->scp_service);
+	} while (rc != 0);
+}
+
+/**
+ * Disarm the adaptive-timeout timer of every partition of \a svc whose
+ * scp_service pointer is set.
+ */
+static void
+ptlrpc_service_del_atimer(struct ptlrpc_service *svc)
+{
+	struct ptlrpc_service_part	*part;
+	int				idx;
+
+	/* early disarm AT timer... */
+	ptlrpc_service_for_each_part(part, idx, svc) {
+		if (part->scp_service == NULL)
+			continue;
+
+		cfs_timer_disarm(&part->scp_at_timer);
+	}
+}
+
+/**
+ * Unlink every posted request buffer of service \a svc from the network
+ * and wait until no buffer of any partition is posted any more.
+ */
+static void
+ptlrpc_service_unlink_rqbd(struct ptlrpc_service *svc)
+{
+	struct ptlrpc_service_part	  *svcpt;
+	struct ptlrpc_request_buffer_desc *rqbd;
+	struct l_wait_info		  lwi;
+	int				  rc;
+	int				  i;
+
+	/* All history will be culled when the next request buffer is
+	 * freed in ptlrpc_service_purge_all() */
+	svc->srv_hist_nrqbds_cpt_max = 0;
+
+	rc = LNetClearLazyPortal(svc->srv_req_portal);
+	LASSERT(rc == 0);
+
+	ptlrpc_service_for_each_part(svcpt, i, svc) {
+		/* a partition without scp_service ends the walk */
+		if (svcpt->scp_service == NULL)
+			break;
+
+		/* Unlink all the request buffers.  This forces a 'final'
+		 * event with its 'unlink' flag set for each posted rqbd */
+		list_for_each_entry(rqbd, &svcpt->scp_rqbd_posted,
+					rqbd_list) {
+			rc = LNetMDUnlink(rqbd->rqbd_md_h);
+			LASSERT(rc == 0 || rc == -ENOENT);
+		}
+	}
+
+	ptlrpc_service_for_each_part(svcpt, i, svc) {
+		if (svcpt->scp_service == NULL)
+			break;
+
+		/* Wait for the network to release any buffers
+		 * it's currently filling */
+		spin_lock(&svcpt->scp_lock);
+		/* scp_nrqbds_posted is re-checked under scp_lock after
+		 * every wait; the lock is dropped while sleeping */
+		while (svcpt->scp_nrqbds_posted != 0) {
+			spin_unlock(&svcpt->scp_lock);
+			/* Network access will complete in finite time but
+			 * the HUGE timeout lets us CWARN for visibility
+			 * of sluggish NALs */
+			lwi = LWI_TIMEOUT_INTERVAL(
+					cfs_time_seconds(LONG_UNLINK),
+					cfs_time_seconds(1), NULL, NULL);
+			rc = l_wait_event(svcpt->scp_waitq,
+					  svcpt->scp_nrqbds_posted == 0, &lwi);
+			if (rc == -ETIMEDOUT) {
+				CWARN("Service %s waiting for "
+				      "request buffers\n",
+				      svcpt->scp_service->srv_name);
+			}
+			spin_lock(&svcpt->scp_lock);
+		}
+		spin_unlock(&svcpt->scp_lock);
+	}
+}
+
+/**
+ * Release everything still queued on the partitions of service \a svc:
+ * schedule the remaining active reply states, finish all incoming and
+ * pending requests, free the idle request buffers, wait for scheduled
+ * replies to be processed and free the idle reply states.
+ */
+static void
+ptlrpc_service_purge_all(struct ptlrpc_service *svc)
+{
+	struct ptlrpc_service_part		*svcpt;
+	struct ptlrpc_request_buffer_desc	*rqbd;
+	struct ptlrpc_request			*req;
+	struct ptlrpc_reply_state		*rs;
+	int					i;
+
+	ptlrpc_service_for_each_part(svcpt, i, svc) {
+		/* a partition without scp_service ends the walk */
+		if (svcpt->scp_service == NULL)
+			break;
+
+		/* NOTE(review): this loop only ends if scheduling a reply
+		 * takes it off scp_rep_active - confirm in
+		 * ptlrpc_schedule_difficult_reply() */
+		spin_lock(&svcpt->scp_rep_lock);
+		while (!list_empty(&svcpt->scp_rep_active)) {
+			rs = list_entry(svcpt->scp_rep_active.next,
+					    struct ptlrpc_reply_state, rs_list);
+			spin_lock(&rs->rs_lock);
+			ptlrpc_schedule_difficult_reply(rs);
+			spin_unlock(&rs->rs_lock);
+		}
+		spin_unlock(&svcpt->scp_rep_lock);
+
+		/* purge the request queue.  NB No new replies (rqbds
+		 * all unlinked) and no service threads, so I'm the only
+		 * thread noodling the request queue now */
+		while (!list_empty(&svcpt->scp_req_incoming)) {
+			req = list_entry(svcpt->scp_req_incoming.next,
+					     struct ptlrpc_request, rq_list);
+
+			list_del(&req->rq_list);
+			svcpt->scp_nreqs_incoming--;
+			ptlrpc_server_finish_request(svcpt, req);
+		}
+
+		/* then drain the requests that were already queued for
+		 * handling */
+		while (ptlrpc_server_request_pending(svcpt, true)) {
+			req = ptlrpc_server_request_get(svcpt, true);
+			ptlrpc_server_finish_active_request(svcpt, req);
+		}
+
+		LASSERT(list_empty(&svcpt->scp_rqbd_posted));
+		LASSERT(svcpt->scp_nreqs_incoming == 0);
+		LASSERT(svcpt->scp_nreqs_active == 0);
+		/* history should have been culled by
+		 * ptlrpc_server_finish_request */
+		LASSERT(svcpt->scp_hist_nrqbds == 0);
+
+		/* Now free all the request buffers since nothing
+		 * references them any more... */
+
+		/* NOTE(review): relies on ptlrpc_free_rqbd() unlinking the
+		 * buffer from scp_rqbd_idle - confirm */
+		while (!list_empty(&svcpt->scp_rqbd_idle)) {
+			rqbd = list_entry(svcpt->scp_rqbd_idle.next,
+					      struct ptlrpc_request_buffer_desc,
+					      rqbd_list);
+			ptlrpc_free_rqbd(rqbd);
+		}
+		ptlrpc_wait_replies(svcpt);
+
+		/* finally free the idle reply states */
+		while (!list_empty(&svcpt->scp_rep_idle)) {
+			rs = list_entry(svcpt->scp_rep_idle.next,
+					    struct ptlrpc_reply_state,
+					    rs_list);
+			list_del(&rs->rs_list);
+			OBD_FREE_LARGE(rs, svc->srv_max_reply_size);
+		}
+	}
+}
+
+/**
+ * Free the memory owned by service \a svc: the adaptive-timeout arrays
+ * of each partition, the partitions, the CPT list and finally the
+ * service structure itself.
+ */
+static void
+ptlrpc_service_free(struct ptlrpc_service *svc)
+{
+	struct ptlrpc_service_part	*part;
+	struct ptlrpc_at_array		*at;
+	int				idx;
+
+	ptlrpc_service_for_each_part(part, idx, svc) {
+		if (part->scp_service == NULL)
+			break;
+
+		/* In case somebody rearmed this in the meantime */
+		cfs_timer_disarm(&part->scp_at_timer);
+
+		at = &part->scp_at_array;
+		if (at->paa_reqs_array != NULL) {
+			OBD_FREE(at->paa_reqs_array,
+				 sizeof(struct list_head) * at->paa_size);
+			at->paa_reqs_array = NULL;
+		}
+
+		if (at->paa_reqs_count != NULL) {
+			OBD_FREE(at->paa_reqs_count,
+				 sizeof(__u32) * at->paa_size);
+			at->paa_reqs_count = NULL;
+		}
+	}
+
+	/* the partitions themselves are released in a separate pass */
+	ptlrpc_service_for_each_part(part, idx, svc) {
+		OBD_FREE_PTR(part);
+	}
+
+	if (svc->srv_cpts != NULL)
+		cfs_expr_list_values_free(svc->srv_cpts, svc->srv_ncpts);
+
+	OBD_FREE(svc, offsetof(struct ptlrpc_service,
+			       srv_parts[svc->srv_ncpts]));
+}
+
+/**
+ * Tear down service \a service and free it.
+ *
+ * \param service	service to unregister; must not be used by the
+ *			caller afterwards, it is freed here
+ *
+ * \retval 0		always
+ */
+int ptlrpc_unregister_service(struct ptlrpc_service *service)
+{
+	ENTRY;
+
+	CDEBUG(D_NET, "%s: tearing down\n", service->srv_name);
+
+	/* set first: the service threads and ptlrpc_start_thread() test
+	 * this flag */
+	service->srv_is_stopping = 1;
+
+	/* take the service off the global list of services */
+	mutex_lock(&ptlrpc_all_services_mutex);
+	list_del_init(&service->srv_list);
+	mutex_unlock(&ptlrpc_all_services_mutex);
+
+	ptlrpc_service_del_atimer(service);
+	ptlrpc_stop_all_threads(service);
+
+	/* with the threads gone: unlink the request buffers, then purge
+	 * whatever is still queued */
+	ptlrpc_service_unlink_rqbd(service);
+	ptlrpc_service_purge_all(service);
+	ptlrpc_service_nrs_cleanup(service);
+
+	ptlrpc_lprocfs_unregister_service(service);
+
+	ptlrpc_service_free(service);
+
+	RETURN(0);
+}
+EXPORT_SYMBOL(ptlrpc_unregister_service);
+
+/**
+ * Health check for one service partition.
+ *
+ * Right now, it just checks to make sure that requests aren't languishing
+ * in the queue.  We'll use this health check to govern whether a node needs
+ * to be shot, so it's intentionally non-aggressive.
+ *
+ * \retval 0	healthy: nothing is queued, or the next request has not
+ *		waited longer than the allowed limit
+ * \retval -1	the next request has been waiting too long
+ */
+int ptlrpc_svcpt_health_check(struct ptlrpc_service_part *svcpt)
+{
+	struct ptlrpc_request		*next;
+	struct timeval			now;
+	long				waited = 0;
+
+	do_gettimeofday(&now);
+
+	spin_lock(&svcpt->scp_req_lock);
+	/* How long has the next entry been waiting? */
+	if (ptlrpc_server_high_pending(svcpt, true))
+		next = ptlrpc_nrs_req_peek_nolock(svcpt, true);
+	else if (ptlrpc_server_normal_pending(svcpt, true))
+		next = ptlrpc_nrs_req_peek_nolock(svcpt, false);
+	else
+		next = NULL;
+
+	/* the arrival time is read while scp_req_lock is still held */
+	if (next != NULL)
+		waited = cfs_timeval_sub(&now, &next->rq_arrival_time, NULL);
+	spin_unlock(&svcpt->scp_req_lock);
+
+	if (next == NULL)
+		return 0;
+
+	if ((waited / ONE_MILLION) <=
+	    (AT_OFF ? obd_timeout * 3 / 2 : at_max))
+		return 0;
+
+	CERROR("%s: unhealthy - request has been waiting %lds\n",
+	       svcpt->scp_service->srv_name, waited / ONE_MILLION);
+	return -1;
+}
+
+/**
+ * Health check for a whole service: checks each partition in turn and
+ * stops at the first unhealthy one.
+ *
+ * \retval 0		\a svc is NULL or every partition is healthy
+ * \retval non-zero	result of the first failing
+ *			ptlrpc_svcpt_health_check()
+ */
+int
+ptlrpc_service_health_check(struct ptlrpc_service *svc)
+{
+	struct ptlrpc_service_part	*part;
+	int				idx;
+	int				rc = 0;
+
+	if (svc == NULL)
+		return 0;
+
+	ptlrpc_service_for_each_part(part, idx, svc) {
+		rc = ptlrpc_svcpt_health_check(part);
+		if (rc != 0)
+			break;
+	}
+
+	return rc;
+}
+EXPORT_SYMBOL(ptlrpc_service_health_check);

diff --git a/drivers/staging/lustre/lustre/ptlrpc/wirehdr.c b/drivers/staging/lustre/lustre/ptlrpc/wirehdr.c
new file mode 100644
index 0000000..93bc40b
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/wirehdr.c

@@ -0,0 +1,47 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+
+# ifdef CONFIG_FS_POSIX_ACL
+#  include <linux/fs.h>
+#  include <linux/posix_acl_xattr.h>
+# endif
+
+#include <obd_support.h>
+#include <obd_class.h>
+#include <lustre_net.h>
+#include <lustre_disk.h>

diff --git a/drivers/staging/lustre/lustre/ptlrpc/wiretest.c b/drivers/staging/lustre/lustre/ptlrpc/wiretest.c
new file mode 100644
index 0000000..9890bd9
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/wiretest.c

@@ -0,0 +1,4474 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+
+# ifdef CONFIG_FS_POSIX_ACL
+#  include <linux/fs.h>
+#  include <linux/posix_acl_xattr.h>
+# endif
+
+#include <obd_support.h>
+#include <obd_class.h>
+#include <lustre_net.h>
+#include <lustre_disk.h>
+void lustre_assert_wire_constants(void)
+{
+	 /* Wire protocol assertions generated by 'wirecheck'
+	  * (make -C lustre/utils newwiretest)
+	  * running on Linux deva 2.6.32.279.lustre #5 SMP Tue Apr 9 22:52:17 CST 2013 x86_64 x86_64 x
+	  * with gcc version 4.4.4 20100726 (Red Hat 4.4.4-13) (GCC)  */
+
+
+	/* Constants... */
+	LASSERTF(PTL_RPC_MSG_REQUEST == 4711, "found %lld\n",
+		 (long long)PTL_RPC_MSG_REQUEST);
+	LASSERTF(PTL_RPC_MSG_ERR == 4712, "found %lld\n",
+		 (long long)PTL_RPC_MSG_ERR);
+	LASSERTF(PTL_RPC_MSG_REPLY == 4713, "found %lld\n",
+		 (long long)PTL_RPC_MSG_REPLY);
+	LASSERTF(MDS_DIR_END_OFF == 0xfffffffffffffffeULL, "found 0x%.16llxULL\n",
+		 MDS_DIR_END_OFF);
+	LASSERTF(DEAD_HANDLE_MAGIC == 0xdeadbeefcafebabeULL, "found 0x%.16llxULL\n",
+		 DEAD_HANDLE_MAGIC);
+	CLASSERT(MTI_NAME_MAXLEN == 64);
+	LASSERTF(OST_REPLY == 0, "found %lld\n",
+		 (long long)OST_REPLY);
+	LASSERTF(OST_GETATTR == 1, "found %lld\n",
+		 (long long)OST_GETATTR);
+	LASSERTF(OST_SETATTR == 2, "found %lld\n",
+		 (long long)OST_SETATTR);
+	LASSERTF(OST_READ == 3, "found %lld\n",
+		 (long long)OST_READ);
+	LASSERTF(OST_WRITE == 4, "found %lld\n",
+		 (long long)OST_WRITE);
+	LASSERTF(OST_CREATE == 5, "found %lld\n",
+		 (long long)OST_CREATE);
+	LASSERTF(OST_DESTROY == 6, "found %lld\n",
+		 (long long)OST_DESTROY);
+	LASSERTF(OST_GET_INFO == 7, "found %lld\n",
+		 (long long)OST_GET_INFO);
+	LASSERTF(OST_CONNECT == 8, "found %lld\n",
+		 (long long)OST_CONNECT);
+	LASSERTF(OST_DISCONNECT == 9, "found %lld\n",
+		 (long long)OST_DISCONNECT);
+	LASSERTF(OST_PUNCH == 10, "found %lld\n",
+		 (long long)OST_PUNCH);
+	LASSERTF(OST_OPEN == 11, "found %lld\n",
+		 (long long)OST_OPEN);
+	LASSERTF(OST_CLOSE == 12, "found %lld\n",
+		 (long long)OST_CLOSE);
+	LASSERTF(OST_STATFS == 13, "found %lld\n",
+		 (long long)OST_STATFS);
+	LASSERTF(OST_SYNC == 16, "found %lld\n",
+		 (long long)OST_SYNC);
+	LASSERTF(OST_SET_INFO == 17, "found %lld\n",
+		 (long long)OST_SET_INFO);
+	LASSERTF(OST_QUOTACHECK == 18, "found %lld\n",
+		 (long long)OST_QUOTACHECK);
+	LASSERTF(OST_QUOTACTL == 19, "found %lld\n",
+		 (long long)OST_QUOTACTL);
+	LASSERTF(OST_QUOTA_ADJUST_QUNIT == 20, "found %lld\n",
+		 (long long)OST_QUOTA_ADJUST_QUNIT);
+	LASSERTF(OST_LAST_OPC == 21, "found %lld\n",
+		 (long long)OST_LAST_OPC);
+	LASSERTF(OBD_OBJECT_EOF == 0xffffffffffffffffULL, "found 0x%.16llxULL\n",
+		 OBD_OBJECT_EOF);
+	LASSERTF(OST_MIN_PRECREATE == 32, "found %lld\n",
+		 (long long)OST_MIN_PRECREATE);
+	LASSERTF(OST_MAX_PRECREATE == 20000, "found %lld\n",
+		 (long long)OST_MAX_PRECREATE);
+	LASSERTF(OST_LVB_ERR_INIT == 0xffbadbad80000000ULL, "found 0x%.16llxULL\n",
+		 OST_LVB_ERR_INIT);
+	LASSERTF(OST_LVB_ERR_MASK == 0xffbadbad00000000ULL, "found 0x%.16llxULL\n",
+		 OST_LVB_ERR_MASK);
+	LASSERTF(MDS_FIRST_OPC == 33, "found %lld\n",
+		 (long long)MDS_FIRST_OPC);
+	LASSERTF(MDS_GETATTR == 33, "found %lld\n",
+		 (long long)MDS_GETATTR);
+	LASSERTF(MDS_GETATTR_NAME == 34, "found %lld\n",
+		 (long long)MDS_GETATTR_NAME);
+	LASSERTF(MDS_CLOSE == 35, "found %lld\n",
+		 (long long)MDS_CLOSE);
+	LASSERTF(MDS_REINT == 36, "found %lld\n",
+		 (long long)MDS_REINT);
+	LASSERTF(MDS_READPAGE == 37, "found %lld\n",
+		 (long long)MDS_READPAGE);
+	LASSERTF(MDS_CONNECT == 38, "found %lld\n",
+		 (long long)MDS_CONNECT);
+	LASSERTF(MDS_DISCONNECT == 39, "found %lld\n",
+		 (long long)MDS_DISCONNECT);
+	LASSERTF(MDS_GETSTATUS == 40, "found %lld\n",
+		 (long long)MDS_GETSTATUS);
+	LASSERTF(MDS_STATFS == 41, "found %lld\n",
+		 (long long)MDS_STATFS);
+	LASSERTF(MDS_PIN == 42, "found %lld\n",
+		 (long long)MDS_PIN);
+	LASSERTF(MDS_UNPIN == 43, "found %lld\n",
+		 (long long)MDS_UNPIN);
+	LASSERTF(MDS_SYNC == 44, "found %lld\n",
+		 (long long)MDS_SYNC);
+	LASSERTF(MDS_DONE_WRITING == 45, "found %lld\n",
+		 (long long)MDS_DONE_WRITING);
+	LASSERTF(MDS_SET_INFO == 46, "found %lld\n",
+		 (long long)MDS_SET_INFO);
+	LASSERTF(MDS_QUOTACHECK == 47, "found %lld\n",
+		 (long long)MDS_QUOTACHECK);
+	LASSERTF(MDS_QUOTACTL == 48, "found %lld\n",
+		 (long long)MDS_QUOTACTL);
+	LASSERTF(MDS_GETXATTR == 49, "found %lld\n",
+		 (long long)MDS_GETXATTR);
+	LASSERTF(MDS_SETXATTR == 50, "found %lld\n",
+		 (long long)MDS_SETXATTR);
+	LASSERTF(MDS_WRITEPAGE == 51, "found %lld\n",
+		 (long long)MDS_WRITEPAGE);
+	LASSERTF(MDS_IS_SUBDIR == 52, "found %lld\n",
+		 (long long)MDS_IS_SUBDIR);
+	LASSERTF(MDS_GET_INFO == 53, "found %lld\n",
+		 (long long)MDS_GET_INFO);
+	LASSERTF(MDS_HSM_STATE_GET == 54, "found %lld\n",
+		 (long long)MDS_HSM_STATE_GET);
+	LASSERTF(MDS_HSM_STATE_SET == 55, "found %lld\n",
+		 (long long)MDS_HSM_STATE_SET);
+	LASSERTF(MDS_HSM_ACTION == 56, "found %lld\n",
+		 (long long)MDS_HSM_ACTION);
+	LASSERTF(MDS_HSM_PROGRESS == 57, "found %lld\n",
+		 (long long)MDS_HSM_PROGRESS);
+	LASSERTF(MDS_HSM_REQUEST == 58, "found %lld\n",
+		 (long long)MDS_HSM_REQUEST);
+	LASSERTF(MDS_HSM_CT_REGISTER == 59, "found %lld\n",
+		 (long long)MDS_HSM_CT_REGISTER);
+	LASSERTF(MDS_HSM_CT_UNREGISTER == 60, "found %lld\n",
+		 (long long)MDS_HSM_CT_UNREGISTER);
+	LASSERTF(MDS_SWAP_LAYOUTS == 61, "found %lld\n",
+		 (long long)MDS_SWAP_LAYOUTS);
+	LASSERTF(MDS_LAST_OPC == 62, "found %lld\n",
+		 (long long)MDS_LAST_OPC);
+	LASSERTF(REINT_SETATTR == 1, "found %lld\n",
+		 (long long)REINT_SETATTR);
+	LASSERTF(REINT_CREATE == 2, "found %lld\n",
+		 (long long)REINT_CREATE);
+	LASSERTF(REINT_LINK == 3, "found %lld\n",
+		 (long long)REINT_LINK);
+	LASSERTF(REINT_UNLINK == 4, "found %lld\n",
+		 (long long)REINT_UNLINK);
+	LASSERTF(REINT_RENAME == 5, "found %lld\n",
+		 (long long)REINT_RENAME);
+	LASSERTF(REINT_OPEN == 6, "found %lld\n",
+		 (long long)REINT_OPEN);
+	LASSERTF(REINT_SETXATTR == 7, "found %lld\n",
+		 (long long)REINT_SETXATTR);
+	LASSERTF(REINT_RMENTRY == 8, "found %lld\n",
+		 (long long)REINT_RMENTRY);
+	LASSERTF(REINT_MAX == 9, "found %lld\n",
+		 (long long)REINT_MAX);
+	LASSERTF(DISP_IT_EXECD == 0x00000001UL, "found 0x%.8xUL\n",
+		(unsigned)DISP_IT_EXECD);
+	LASSERTF(DISP_LOOKUP_EXECD == 0x00000002UL, "found 0x%.8xUL\n",
+		(unsigned)DISP_LOOKUP_EXECD);
+	LASSERTF(DISP_LOOKUP_NEG == 0x00000004UL, "found 0x%.8xUL\n",
+		(unsigned)DISP_LOOKUP_NEG);
+	LASSERTF(DISP_LOOKUP_POS == 0x00000008UL, "found 0x%.8xUL\n",
+		(unsigned)DISP_LOOKUP_POS);
+	LASSERTF(DISP_OPEN_CREATE == 0x00000010UL, "found 0x%.8xUL\n",
+		(unsigned)DISP_OPEN_CREATE);
+	LASSERTF(DISP_OPEN_OPEN == 0x00000020UL, "found 0x%.8xUL\n",
+		(unsigned)DISP_OPEN_OPEN);
+	LASSERTF(DISP_ENQ_COMPLETE == 0x00400000UL, "found 0x%.8xUL\n",
+		(unsigned)DISP_ENQ_COMPLETE);
+	LASSERTF(DISP_ENQ_OPEN_REF == 0x00800000UL, "found 0x%.8xUL\n",
+		(unsigned)DISP_ENQ_OPEN_REF);
+	LASSERTF(DISP_ENQ_CREATE_REF == 0x01000000UL, "found 0x%.8xUL\n",
+		(unsigned)DISP_ENQ_CREATE_REF);
+	LASSERTF(DISP_OPEN_LOCK == 0x02000000UL, "found 0x%.8xUL\n",
+		(unsigned)DISP_OPEN_LOCK);
+	LASSERTF(MDS_STATUS_CONN == 1, "found %lld\n",
+		 (long long)MDS_STATUS_CONN);
+	LASSERTF(MDS_STATUS_LOV == 2, "found %lld\n",
+		 (long long)MDS_STATUS_LOV);
+	LASSERTF(LUSTRE_BFLAG_UNCOMMITTED_WRITES == 1, "found %lld\n",
+		 (long long)LUSTRE_BFLAG_UNCOMMITTED_WRITES);
+	LASSERTF(MF_SOM_CHANGE == 0x00000001UL, "found 0x%.8xUL\n",
+		(unsigned)MF_SOM_CHANGE);
+	LASSERTF(MF_EPOCH_OPEN == 0x00000002UL, "found 0x%.8xUL\n",
+		(unsigned)MF_EPOCH_OPEN);
+	LASSERTF(MF_EPOCH_CLOSE == 0x00000004UL, "found 0x%.8xUL\n",
+		(unsigned)MF_EPOCH_CLOSE);
+	LASSERTF(MF_MDC_CANCEL_FID1 == 0x00000008UL, "found 0x%.8xUL\n",
+		(unsigned)MF_MDC_CANCEL_FID1);
+	LASSERTF(MF_MDC_CANCEL_FID2 == 0x00000010UL, "found 0x%.8xUL\n",
+		(unsigned)MF_MDC_CANCEL_FID2);
+	LASSERTF(MF_MDC_CANCEL_FID3 == 0x00000020UL, "found 0x%.8xUL\n",
+		(unsigned)MF_MDC_CANCEL_FID3);
+	LASSERTF(MF_MDC_CANCEL_FID4 == 0x00000040UL, "found 0x%.8xUL\n",
+		(unsigned)MF_MDC_CANCEL_FID4);
+	LASSERTF(MF_SOM_AU == 0x00000080UL, "found 0x%.8xUL\n",
+		(unsigned)MF_SOM_AU);
+	LASSERTF(MF_GETATTR_LOCK == 0x00000100UL, "found 0x%.8xUL\n",
+		(unsigned)MF_GETATTR_LOCK);
+	LASSERTF(MDS_ATTR_MODE == 0x0000000000000001ULL, "found 0x%.16llxULL\n",
+			(long long)MDS_ATTR_MODE);
+	LASSERTF(MDS_ATTR_UID == 0x0000000000000002ULL, "found 0x%.16llxULL\n",
+			(long long)MDS_ATTR_UID);
+	LASSERTF(MDS_ATTR_GID == 0x0000000000000004ULL, "found 0x%.16llxULL\n",
+			(long long)MDS_ATTR_GID);
+	LASSERTF(MDS_ATTR_SIZE == 0x0000000000000008ULL, "found 0x%.16llxULL\n",
+			(long long)MDS_ATTR_SIZE);
+	LASSERTF(MDS_ATTR_ATIME == 0x0000000000000010ULL, "found 0x%.16llxULL\n",
+			(long long)MDS_ATTR_ATIME);
+	LASSERTF(MDS_ATTR_MTIME == 0x0000000000000020ULL, "found 0x%.16llxULL\n",
+			(long long)MDS_ATTR_MTIME);
+	LASSERTF(MDS_ATTR_CTIME == 0x0000000000000040ULL, "found 0x%.16llxULL\n",
+			(long long)MDS_ATTR_CTIME);
+	LASSERTF(MDS_ATTR_ATIME_SET == 0x0000000000000080ULL, "found 0x%.16llxULL\n",
+			(long long)MDS_ATTR_ATIME_SET);
+	LASSERTF(MDS_ATTR_MTIME_SET == 0x0000000000000100ULL, "found 0x%.16llxULL\n",
+			(long long)MDS_ATTR_MTIME_SET);
+	LASSERTF(MDS_ATTR_FORCE == 0x0000000000000200ULL, "found 0x%.16llxULL\n",
+			(long long)MDS_ATTR_FORCE);
+	LASSERTF(MDS_ATTR_ATTR_FLAG == 0x0000000000000400ULL, "found 0x%.16llxULL\n",
+			(long long)MDS_ATTR_ATTR_FLAG);
+	LASSERTF(MDS_ATTR_KILL_SUID == 0x0000000000000800ULL, "found 0x%.16llxULL\n",
+			(long long)MDS_ATTR_KILL_SUID);
+	LASSERTF(MDS_ATTR_KILL_SGID == 0x0000000000001000ULL, "found 0x%.16llxULL\n",
+			(long long)MDS_ATTR_KILL_SGID);
+	LASSERTF(MDS_ATTR_CTIME_SET == 0x0000000000002000ULL, "found 0x%.16llxULL\n",
+			(long long)MDS_ATTR_CTIME_SET);
+	LASSERTF(MDS_ATTR_FROM_OPEN == 0x0000000000004000ULL, "found 0x%.16llxULL\n",
+			(long long)MDS_ATTR_FROM_OPEN);
+	LASSERTF(MDS_ATTR_BLOCKS == 0x0000000000008000ULL, "found 0x%.16llxULL\n",
+			(long long)MDS_ATTR_BLOCKS);
+	LASSERTF(FLD_QUERY == 900, "found %lld\n",
+		 (long long)FLD_QUERY);
+	LASSERTF(FLD_FIRST_OPC == 900, "found %lld\n",
+		 (long long)FLD_FIRST_OPC);
+	LASSERTF(FLD_LAST_OPC == 901, "found %lld\n",
+		 (long long)FLD_LAST_OPC);
+	LASSERTF(SEQ_QUERY == 700, "found %lld\n",
+		 (long long)SEQ_QUERY);
+	LASSERTF(SEQ_FIRST_OPC == 700, "found %lld\n",
+		 (long long)SEQ_FIRST_OPC);
+	LASSERTF(SEQ_LAST_OPC == 701, "found %lld\n",
+		 (long long)SEQ_LAST_OPC);
+	LASSERTF(SEQ_ALLOC_SUPER == 0, "found %lld\n",
+		 (long long)SEQ_ALLOC_SUPER);
+	LASSERTF(SEQ_ALLOC_META == 1, "found %lld\n",
+		 (long long)SEQ_ALLOC_META);
+	LASSERTF(LDLM_ENQUEUE == 101, "found %lld\n",
+		 (long long)LDLM_ENQUEUE);
+	LASSERTF(LDLM_CONVERT == 102, "found %lld\n",
+		 (long long)LDLM_CONVERT);
+	LASSERTF(LDLM_CANCEL == 103, "found %lld\n",
+		 (long long)LDLM_CANCEL);
+	LASSERTF(LDLM_BL_CALLBACK == 104, "found %lld\n",
+		 (long long)LDLM_BL_CALLBACK);
+	LASSERTF(LDLM_CP_CALLBACK == 105, "found %lld\n",
+		 (long long)LDLM_CP_CALLBACK);
+	LASSERTF(LDLM_GL_CALLBACK == 106, "found %lld\n",
+		 (long long)LDLM_GL_CALLBACK);
+	LASSERTF(LDLM_SET_INFO == 107, "found %lld\n",
+		 (long long)LDLM_SET_INFO);
+	LASSERTF(LDLM_LAST_OPC == 108, "found %lld\n",
+		 (long long)LDLM_LAST_OPC);
+	LASSERTF(LCK_MINMODE == 0, "found %lld\n",
+		 (long long)LCK_MINMODE);
+	LASSERTF(LCK_EX == 1, "found %lld\n",
+		 (long long)LCK_EX);
+	LASSERTF(LCK_PW == 2, "found %lld\n",
+		 (long long)LCK_PW);
+	LASSERTF(LCK_PR == 4, "found %lld\n",
+		 (long long)LCK_PR);
+	LASSERTF(LCK_CW == 8, "found %lld\n",
+		 (long long)LCK_CW);
+	LASSERTF(LCK_CR == 16, "found %lld\n",
+		 (long long)LCK_CR);
+	LASSERTF(LCK_NL == 32, "found %lld\n",
+		 (long long)LCK_NL);
+	LASSERTF(LCK_GROUP == 64, "found %lld\n",
+		 (long long)LCK_GROUP);
+	LASSERTF(LCK_COS == 128, "found %lld\n",
+		 (long long)LCK_COS);
+	LASSERTF(LCK_MAXMODE == 129, "found %lld\n",
+		 (long long)LCK_MAXMODE);
+	LASSERTF(LCK_MODE_NUM == 8, "found %lld\n",
+		 (long long)LCK_MODE_NUM);
+	CLASSERT(LDLM_PLAIN == 10);
+	CLASSERT(LDLM_EXTENT == 11);
+	CLASSERT(LDLM_FLOCK == 12);
+	CLASSERT(LDLM_IBITS == 13);
+	CLASSERT(LDLM_MAX_TYPE == 14);
+	CLASSERT(LUSTRE_RES_ID_SEQ_OFF == 0);
+	CLASSERT(LUSTRE_RES_ID_VER_OID_OFF == 1);
+	LASSERTF(UPDATE_OBJ == 1000, "found %lld\n",
+		 (long long)UPDATE_OBJ);
+	LASSERTF(UPDATE_LAST_OPC == 1001, "found %lld\n",
+		 (long long)UPDATE_LAST_OPC);
+	CLASSERT(LUSTRE_RES_ID_QUOTA_SEQ_OFF == 2);
+	CLASSERT(LUSTRE_RES_ID_QUOTA_VER_OID_OFF == 3);
+	CLASSERT(LUSTRE_RES_ID_HSH_OFF == 3);
+	CLASSERT(LQUOTA_TYPE_USR == 0);
+	CLASSERT(LQUOTA_TYPE_GRP == 1);
+	CLASSERT(LQUOTA_RES_MD == 1);
+	CLASSERT(LQUOTA_RES_DT == 2);
+	LASSERTF(OBD_PING == 400, "found %lld\n",
+		 (long long)OBD_PING);
+	LASSERTF(OBD_LOG_CANCEL == 401, "found %lld\n",
+		 (long long)OBD_LOG_CANCEL);
+	LASSERTF(OBD_QC_CALLBACK == 402, "found %lld\n",
+		 (long long)OBD_QC_CALLBACK);
+	LASSERTF(OBD_IDX_READ == 403, "found %lld\n",
+		 (long long)OBD_IDX_READ);
+	LASSERTF(OBD_LAST_OPC == 404, "found %lld\n",
+		 (long long)OBD_LAST_OPC);
+	LASSERTF(QUOTA_DQACQ == 601, "found %lld\n",
+		 (long long)QUOTA_DQACQ);
+	LASSERTF(QUOTA_DQREL == 602, "found %lld\n",
+		 (long long)QUOTA_DQREL);
+	LASSERTF(QUOTA_LAST_OPC == 603, "found %lld\n",
+		 (long long)QUOTA_LAST_OPC);
+	LASSERTF(MGS_CONNECT == 250, "found %lld\n",
+		 (long long)MGS_CONNECT);
+	LASSERTF(MGS_DISCONNECT == 251, "found %lld\n",
+		 (long long)MGS_DISCONNECT);
+	LASSERTF(MGS_EXCEPTION == 252, "found %lld\n",
+		 (long long)MGS_EXCEPTION);
+	LASSERTF(MGS_TARGET_REG == 253, "found %lld\n",
+		 (long long)MGS_TARGET_REG);
+	LASSERTF(MGS_TARGET_DEL == 254, "found %lld\n",
+		 (long long)MGS_TARGET_DEL);
+	LASSERTF(MGS_SET_INFO == 255, "found %lld\n",
+		 (long long)MGS_SET_INFO);
+	LASSERTF(MGS_LAST_OPC == 257, "found %lld\n",
+		 (long long)MGS_LAST_OPC);
+	LASSERTF(SEC_CTX_INIT == 801, "found %lld\n",
+		 (long long)SEC_CTX_INIT);
+	LASSERTF(SEC_CTX_INIT_CONT == 802, "found %lld\n",
+		 (long long)SEC_CTX_INIT_CONT);
+	LASSERTF(SEC_CTX_FINI == 803, "found %lld\n",
+		 (long long)SEC_CTX_FINI);
+	LASSERTF(SEC_LAST_OPC == 804, "found %lld\n",
+		 (long long)SEC_LAST_OPC);
+	/* Sizes and Offsets */
+
+	/* Checks for struct obd_uuid */
+	LASSERTF((int)sizeof(struct obd_uuid) == 40, "found %lld\n",
+		 (long long)(int)sizeof(struct obd_uuid));
+
+	/* Checks for struct lu_seq_range */
+	LASSERTF((int)sizeof(struct lu_seq_range) == 24, "found %lld\n",
+		 (long long)(int)sizeof(struct lu_seq_range));
+	LASSERTF((int)offsetof(struct lu_seq_range, lsr_start) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct lu_seq_range, lsr_start));
+	LASSERTF((int)sizeof(((struct lu_seq_range *)0)->lsr_start) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lu_seq_range *)0)->lsr_start));
+	LASSERTF((int)offsetof(struct lu_seq_range, lsr_end) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct lu_seq_range, lsr_end));
+	LASSERTF((int)sizeof(((struct lu_seq_range *)0)->lsr_end) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lu_seq_range *)0)->lsr_end));
+	LASSERTF((int)offsetof(struct lu_seq_range, lsr_index) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct lu_seq_range, lsr_index));
+	LASSERTF((int)sizeof(((struct lu_seq_range *)0)->lsr_index) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lu_seq_range *)0)->lsr_index));
+	LASSERTF((int)offsetof(struct lu_seq_range, lsr_flags) == 20, "found %lld\n",
+		 (long long)(int)offsetof(struct lu_seq_range, lsr_flags));
+	LASSERTF((int)sizeof(((struct lu_seq_range *)0)->lsr_flags) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lu_seq_range *)0)->lsr_flags));
+	LASSERTF(LU_SEQ_RANGE_MDT == 0, "found %lld\n",
+		 (long long)LU_SEQ_RANGE_MDT);
+	LASSERTF(LU_SEQ_RANGE_OST == 1, "found %lld\n",
+		 (long long)LU_SEQ_RANGE_OST);
+
+	/* Checks for struct lustre_mdt_attrs */
+	LASSERTF((int)sizeof(struct lustre_mdt_attrs) == 24, "found %lld\n",
+		 (long long)(int)sizeof(struct lustre_mdt_attrs));
+	LASSERTF((int)offsetof(struct lustre_mdt_attrs, lma_compat) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct lustre_mdt_attrs, lma_compat));
+	LASSERTF((int)sizeof(((struct lustre_mdt_attrs *)0)->lma_compat) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lustre_mdt_attrs *)0)->lma_compat));
+	LASSERTF((int)offsetof(struct lustre_mdt_attrs, lma_incompat) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct lustre_mdt_attrs, lma_incompat));
+	LASSERTF((int)sizeof(((struct lustre_mdt_attrs *)0)->lma_incompat) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lustre_mdt_attrs *)0)->lma_incompat));
+	LASSERTF((int)offsetof(struct lustre_mdt_attrs, lma_self_fid) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct lustre_mdt_attrs, lma_self_fid));
+	LASSERTF((int)sizeof(((struct lustre_mdt_attrs *)0)->lma_self_fid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct lustre_mdt_attrs *)0)->lma_self_fid));
+	LASSERTF(LMAI_RELEASED == 0x00000001UL, "found 0x%.8xUL\n",
+		(unsigned)LMAI_RELEASED);
+	LASSERTF(LMAC_HSM == 0x00000001UL, "found 0x%.8xUL\n",
+		(unsigned)LMAC_HSM);
+	LASSERTF(LMAC_SOM == 0x00000002UL, "found 0x%.8xUL\n",
+		(unsigned)LMAC_SOM);
+	LASSERTF(OBJ_CREATE == 1, "found %lld\n",
+		 (long long)OBJ_CREATE);
+	LASSERTF(OBJ_DESTROY == 2, "found %lld\n",
+		 (long long)OBJ_DESTROY);
+	LASSERTF(OBJ_REF_ADD == 3, "found %lld\n",
+		 (long long)OBJ_REF_ADD);
+	LASSERTF(OBJ_REF_DEL == 4, "found %lld\n",
+		 (long long)OBJ_REF_DEL);
+	LASSERTF(OBJ_ATTR_SET == 5, "found %lld\n",
+		 (long long)OBJ_ATTR_SET);
+	LASSERTF(OBJ_ATTR_GET == 6, "found %lld\n",
+		 (long long)OBJ_ATTR_GET);
+	LASSERTF(OBJ_XATTR_SET == 7, "found %lld\n",
+		 (long long)OBJ_XATTR_SET);
+	LASSERTF(OBJ_XATTR_GET == 8, "found %lld\n",
+		 (long long)OBJ_XATTR_GET);
+	LASSERTF(OBJ_INDEX_LOOKUP == 9, "found %lld\n",
+		 (long long)OBJ_INDEX_LOOKUP);
+	LASSERTF(OBJ_INDEX_LOOKUP == 9, "found %lld\n",
+		 (long long)OBJ_INDEX_LOOKUP);
+	LASSERTF(OBJ_INDEX_INSERT == 10, "found %lld\n",
+		 (long long)OBJ_INDEX_INSERT);
+	LASSERTF(OBJ_INDEX_DELETE == 11, "found %lld\n",
+		 (long long)OBJ_INDEX_DELETE);
+
+	/* Checks for struct som_attrs */
+	LASSERTF((int)sizeof(struct som_attrs) == 40, "found %lld\n",
+		 (long long)(int)sizeof(struct som_attrs));
+	LASSERTF((int)offsetof(struct som_attrs, som_compat) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct som_attrs, som_compat));
+	LASSERTF((int)sizeof(((struct som_attrs *)0)->som_compat) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct som_attrs *)0)->som_compat));
+	LASSERTF((int)offsetof(struct som_attrs, som_incompat) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct som_attrs, som_incompat));
+	LASSERTF((int)sizeof(((struct som_attrs *)0)->som_incompat) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct som_attrs *)0)->som_incompat));
+	LASSERTF((int)offsetof(struct som_attrs, som_ioepoch) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct som_attrs, som_ioepoch));
+	LASSERTF((int)sizeof(((struct som_attrs *)0)->som_ioepoch) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct som_attrs *)0)->som_ioepoch));
+	LASSERTF((int)offsetof(struct som_attrs, som_size) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct som_attrs, som_size));
+	LASSERTF((int)sizeof(((struct som_attrs *)0)->som_size) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct som_attrs *)0)->som_size));
+	LASSERTF((int)offsetof(struct som_attrs, som_blocks) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct som_attrs, som_blocks));
+	LASSERTF((int)sizeof(((struct som_attrs *)0)->som_blocks) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct som_attrs *)0)->som_blocks));
+	LASSERTF((int)offsetof(struct som_attrs, som_mountid) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct som_attrs, som_mountid));
+	LASSERTF((int)sizeof(((struct som_attrs *)0)->som_mountid) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct som_attrs *)0)->som_mountid));
+
+	/* Checks for struct hsm_attrs */
+	LASSERTF((int)sizeof(struct hsm_attrs) == 24, "found %lld\n",
+		 (long long)(int)sizeof(struct hsm_attrs));
+	LASSERTF((int)offsetof(struct hsm_attrs, hsm_compat) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_attrs, hsm_compat));
+	LASSERTF((int)sizeof(((struct hsm_attrs *)0)->hsm_compat) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_attrs *)0)->hsm_compat));
+	LASSERTF((int)offsetof(struct hsm_attrs, hsm_flags) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_attrs, hsm_flags));
+	LASSERTF((int)sizeof(((struct hsm_attrs *)0)->hsm_flags) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_attrs *)0)->hsm_flags));
+	LASSERTF((int)offsetof(struct hsm_attrs, hsm_arch_id) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_attrs, hsm_arch_id));
+	LASSERTF((int)sizeof(((struct hsm_attrs *)0)->hsm_arch_id) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_attrs *)0)->hsm_arch_id));
+	LASSERTF((int)offsetof(struct hsm_attrs, hsm_arch_ver) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_attrs, hsm_arch_ver));
+	LASSERTF((int)sizeof(((struct hsm_attrs *)0)->hsm_arch_ver) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_attrs *)0)->hsm_arch_ver));
+
+	/* Checks for struct ost_id */
+	LASSERTF((int)sizeof(struct ost_id) == 16, "found %lld\n",
+		 (long long)(int)sizeof(struct ost_id));
+	LASSERTF((int)offsetof(struct ost_id, oi) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct ost_id, oi));
+	LASSERTF((int)sizeof(((struct ost_id *)0)->oi) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct ost_id *)0)->oi));
+	LASSERTF(LUSTRE_FID_INIT_OID == 1, "found %lld\n",
+		 (long long)LUSTRE_FID_INIT_OID);
+	LASSERTF(FID_SEQ_OST_MDT0 == 0, "found %lld\n",
+		 (long long)FID_SEQ_OST_MDT0);
+	LASSERTF(FID_SEQ_LLOG == 1, "found %lld\n",
+		 (long long)FID_SEQ_LLOG);
+	LASSERTF(FID_SEQ_ECHO == 2, "found %lld\n",
+		 (long long)FID_SEQ_ECHO);
+	LASSERTF(FID_SEQ_OST_MDT1 == 3, "found %lld\n",
+		 (long long)FID_SEQ_OST_MDT1);
+	LASSERTF(FID_SEQ_OST_MAX == 9, "found %lld\n",
+		 (long long)FID_SEQ_OST_MAX);
+	LASSERTF(FID_SEQ_RSVD == 11, "found %lld\n",
+		 (long long)FID_SEQ_RSVD);
+	LASSERTF(FID_SEQ_IGIF == 12, "found %lld\n",
+		 (long long)FID_SEQ_IGIF);
+	LASSERTF(FID_SEQ_IGIF_MAX == 0x00000000ffffffffULL, "found 0x%.16llxULL\n",
+			(long long)FID_SEQ_IGIF_MAX);
+	LASSERTF(FID_SEQ_IDIF == 0x0000000100000000ULL, "found 0x%.16llxULL\n",
+			(long long)FID_SEQ_IDIF);
+	LASSERTF(FID_SEQ_IDIF_MAX == 0x00000001ffffffffULL, "found 0x%.16llxULL\n",
+			(long long)FID_SEQ_IDIF_MAX);
+	LASSERTF(FID_SEQ_START == 0x0000000200000000ULL, "found 0x%.16llxULL\n",
+			(long long)FID_SEQ_START);
+	LASSERTF(FID_SEQ_LOCAL_FILE == 0x0000000200000001ULL, "found 0x%.16llxULL\n",
+			(long long)FID_SEQ_LOCAL_FILE);
+	LASSERTF(FID_SEQ_DOT_LUSTRE == 0x0000000200000002ULL, "found 0x%.16llxULL\n",
+			(long long)FID_SEQ_DOT_LUSTRE);
+	LASSERTF(FID_SEQ_SPECIAL == 0x0000000200000004ULL, "found 0x%.16llxULL\n",
+			(long long)FID_SEQ_SPECIAL);
+	LASSERTF(FID_SEQ_QUOTA == 0x0000000200000005ULL, "found 0x%.16llxULL\n",
+			(long long)FID_SEQ_QUOTA);
+	LASSERTF(FID_SEQ_QUOTA_GLB == 0x0000000200000006ULL, "found 0x%.16llxULL\n",
+			(long long)FID_SEQ_QUOTA_GLB);
+	LASSERTF(FID_SEQ_ROOT == 0x0000000200000007ULL, "found 0x%.16llxULL\n",
+			(long long)FID_SEQ_ROOT);
+	LASSERTF(FID_SEQ_NORMAL == 0x0000000200000400ULL, "found 0x%.16llxULL\n",
+			(long long)FID_SEQ_NORMAL);
+	LASSERTF(FID_SEQ_LOV_DEFAULT == 0xffffffffffffffffULL, "found 0x%.16llxULL\n",
+			(long long)FID_SEQ_LOV_DEFAULT);
+	LASSERTF(FID_OID_SPECIAL_BFL == 0x00000001UL, "found 0x%.8xUL\n",
+		(unsigned)FID_OID_SPECIAL_BFL);
+	LASSERTF(FID_OID_DOT_LUSTRE == 0x00000001UL, "found 0x%.8xUL\n",
+		(unsigned)FID_OID_DOT_LUSTRE);
+	LASSERTF(FID_OID_DOT_LUSTRE_OBF == 0x00000002UL, "found 0x%.8xUL\n",
+		(unsigned)FID_OID_DOT_LUSTRE_OBF);
+
+	/* Checks for struct lu_dirent */
+	LASSERTF((int)sizeof(struct lu_dirent) == 32, "found %lld\n",
+		 (long long)(int)sizeof(struct lu_dirent));
+	LASSERTF((int)offsetof(struct lu_dirent, lde_fid) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct lu_dirent, lde_fid));
+	LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_fid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct lu_dirent *)0)->lde_fid));
+	LASSERTF((int)offsetof(struct lu_dirent, lde_hash) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct lu_dirent, lde_hash));
+	LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_hash) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lu_dirent *)0)->lde_hash));
+	LASSERTF((int)offsetof(struct lu_dirent, lde_reclen) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct lu_dirent, lde_reclen));
+	LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_reclen) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct lu_dirent *)0)->lde_reclen));
+	LASSERTF((int)offsetof(struct lu_dirent, lde_namelen) == 26, "found %lld\n",
+		 (long long)(int)offsetof(struct lu_dirent, lde_namelen));
+	LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_namelen) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct lu_dirent *)0)->lde_namelen));
+	LASSERTF((int)offsetof(struct lu_dirent, lde_attrs) == 28, "found %lld\n",
+		 (long long)(int)offsetof(struct lu_dirent, lde_attrs));
+	LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_attrs) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lu_dirent *)0)->lde_attrs));
+	LASSERTF((int)offsetof(struct lu_dirent, lde_name[0]) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct lu_dirent, lde_name[0]));
+	LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_name[0]) == 1, "found %lld\n",
+		 (long long)(int)sizeof(((struct lu_dirent *)0)->lde_name[0]));
+	LASSERTF(LUDA_FID == 0x00000001UL, "found 0x%.8xUL\n",
+		(unsigned)LUDA_FID);
+	LASSERTF(LUDA_TYPE == 0x00000002UL, "found 0x%.8xUL\n",
+		(unsigned)LUDA_TYPE);
+	LASSERTF(LUDA_64BITHASH == 0x00000004UL, "found 0x%.8xUL\n",
+		(unsigned)LUDA_64BITHASH);
+
+	/* Checks for struct luda_type */
+	LASSERTF((int)sizeof(struct luda_type) == 2, "found %lld\n",
+		 (long long)(int)sizeof(struct luda_type));
+	LASSERTF((int)offsetof(struct luda_type, lt_type) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct luda_type, lt_type));
+	LASSERTF((int)sizeof(((struct luda_type *)0)->lt_type) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct luda_type *)0)->lt_type));
+
+	/* Checks for struct lu_dirpage */
+	LASSERTF((int)sizeof(struct lu_dirpage) == 24, "found %lld\n",
+		 (long long)(int)sizeof(struct lu_dirpage));
+	LASSERTF((int)offsetof(struct lu_dirpage, ldp_hash_start) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct lu_dirpage, ldp_hash_start));
+	LASSERTF((int)sizeof(((struct lu_dirpage *)0)->ldp_hash_start) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lu_dirpage *)0)->ldp_hash_start));
+	LASSERTF((int)offsetof(struct lu_dirpage, ldp_hash_end) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct lu_dirpage, ldp_hash_end));
+	LASSERTF((int)sizeof(((struct lu_dirpage *)0)->ldp_hash_end) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lu_dirpage *)0)->ldp_hash_end));
+	LASSERTF((int)offsetof(struct lu_dirpage, ldp_flags) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct lu_dirpage, ldp_flags));
+	LASSERTF((int)sizeof(((struct lu_dirpage *)0)->ldp_flags) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lu_dirpage *)0)->ldp_flags));
+	LASSERTF((int)offsetof(struct lu_dirpage, ldp_pad0) == 20, "found %lld\n",
+		 (long long)(int)offsetof(struct lu_dirpage, ldp_pad0));
+	LASSERTF((int)sizeof(((struct lu_dirpage *)0)->ldp_pad0) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lu_dirpage *)0)->ldp_pad0));
+	LASSERTF((int)offsetof(struct lu_dirpage, ldp_entries[0]) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct lu_dirpage, ldp_entries[0]));
+	LASSERTF((int)sizeof(((struct lu_dirpage *)0)->ldp_entries[0]) == 32, "found %lld\n",
+		 (long long)(int)sizeof(((struct lu_dirpage *)0)->ldp_entries[0]));
+	LASSERTF(LDF_EMPTY == 1, "found %lld\n",
+		 (long long)LDF_EMPTY);
+	LASSERTF(LDF_COLLIDE == 2, "found %lld\n",
+		 (long long)LDF_COLLIDE);
+	LASSERTF(LU_PAGE_SIZE == 4096, "found %lld\n",
+		 (long long)LU_PAGE_SIZE);
+	/* Checks for union lu_page */
+	LASSERTF((int)sizeof(union lu_page) == 4096, "found %lld\n",
+		 (long long)(int)sizeof(union lu_page));
+
+	/* Checks for struct lustre_handle */
+	LASSERTF((int)sizeof(struct lustre_handle) == 8, "found %lld\n",
+		 (long long)(int)sizeof(struct lustre_handle));
+	LASSERTF((int)offsetof(struct lustre_handle, cookie) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct lustre_handle, cookie));
+	LASSERTF((int)sizeof(((struct lustre_handle *)0)->cookie) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lustre_handle *)0)->cookie));
+
+	/* Checks for struct lustre_msg_v2 */
+	LASSERTF((int)sizeof(struct lustre_msg_v2) == 32, "found %lld\n",
+		 (long long)(int)sizeof(struct lustre_msg_v2));
+	LASSERTF((int)offsetof(struct lustre_msg_v2, lm_bufcount) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct lustre_msg_v2, lm_bufcount));
+	LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_bufcount) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_bufcount));
+	LASSERTF((int)offsetof(struct lustre_msg_v2, lm_secflvr) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct lustre_msg_v2, lm_secflvr));
+	LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_secflvr) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_secflvr));
+	LASSERTF((int)offsetof(struct lustre_msg_v2, lm_magic) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct lustre_msg_v2, lm_magic));
+	LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_magic) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_magic));
+	LASSERTF((int)offsetof(struct lustre_msg_v2, lm_repsize) == 12, "found %lld\n",
+		 (long long)(int)offsetof(struct lustre_msg_v2, lm_repsize));
+	LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_repsize) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_repsize));
+	LASSERTF((int)offsetof(struct lustre_msg_v2, lm_cksum) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct lustre_msg_v2, lm_cksum));
+	LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_cksum) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_cksum));
+	LASSERTF((int)offsetof(struct lustre_msg_v2, lm_flags) == 20, "found %lld\n",
+		 (long long)(int)offsetof(struct lustre_msg_v2, lm_flags));
+	LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_flags) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_flags));
+	LASSERTF((int)offsetof(struct lustre_msg_v2, lm_padding_2) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct lustre_msg_v2, lm_padding_2));
+	LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_padding_2) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_padding_2));
+	LASSERTF((int)offsetof(struct lustre_msg_v2, lm_padding_3) == 28, "found %lld\n",
+		 (long long)(int)offsetof(struct lustre_msg_v2, lm_padding_3));
+	LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_padding_3) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_padding_3));
+	LASSERTF((int)offsetof(struct lustre_msg_v2, lm_buflens[0]) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct lustre_msg_v2, lm_buflens[0]));
+	LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_buflens[0]) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_buflens[0]));
+	LASSERTF(LUSTRE_MSG_MAGIC_V1 == 0x0BD00BD0, "found 0x%.8x\n",
+		LUSTRE_MSG_MAGIC_V1);
+	LASSERTF(LUSTRE_MSG_MAGIC_V2 == 0x0BD00BD3, "found 0x%.8x\n",
+		LUSTRE_MSG_MAGIC_V2);
+	LASSERTF(LUSTRE_MSG_MAGIC_V1_SWABBED == 0xD00BD00B, "found 0x%.8x\n",
+		LUSTRE_MSG_MAGIC_V1_SWABBED);
+	LASSERTF(LUSTRE_MSG_MAGIC_V2_SWABBED == 0xD30BD00B, "found 0x%.8x\n",
+		LUSTRE_MSG_MAGIC_V2_SWABBED);
+
+	/* Checks for struct ptlrpc_body */
+	LASSERTF((int)sizeof(struct ptlrpc_body_v3) == 184, "found %lld\n",
+		 (long long)(int)sizeof(struct ptlrpc_body_v3));
+	LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_handle) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct ptlrpc_body_v3, pb_handle));
+	LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_handle) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_handle));
+	LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_type) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct ptlrpc_body_v3, pb_type));
+	LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_type) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_type));
+	LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_version) == 12, "found %lld\n",
+		 (long long)(int)offsetof(struct ptlrpc_body_v3, pb_version));
+	LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_version) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_version));
+	LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_opc) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct ptlrpc_body_v3, pb_opc));
+	LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_opc) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_opc));
+	LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_status) == 20, "found %lld\n",
+		 (long long)(int)offsetof(struct ptlrpc_body_v3, pb_status));
+	LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_status) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_status));
+	LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_last_xid) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct ptlrpc_body_v3, pb_last_xid));
+	LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_xid) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_xid));
+	LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_last_seen) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct ptlrpc_body_v3, pb_last_seen));
+	LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_seen) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_seen));
+	LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_last_committed) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct ptlrpc_body_v3, pb_last_committed));
+	LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_committed) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_committed));
+	LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_transno) == 48, "found %lld\n",
+		 (long long)(int)offsetof(struct ptlrpc_body_v3, pb_transno));
+	LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_transno) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_transno));
+	LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_flags) == 56, "found %lld\n",
+		 (long long)(int)offsetof(struct ptlrpc_body_v3, pb_flags));
+	LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_flags) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_flags));
+	LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_op_flags) == 60, "found %lld\n",
+		 (long long)(int)offsetof(struct ptlrpc_body_v3, pb_op_flags));
+	LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_op_flags) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_op_flags));
+	LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_conn_cnt) == 64, "found %lld\n",
+		 (long long)(int)offsetof(struct ptlrpc_body_v3, pb_conn_cnt));
+	LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_conn_cnt) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_conn_cnt));
+	LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_timeout) == 68, "found %lld\n",
+		 (long long)(int)offsetof(struct ptlrpc_body_v3, pb_timeout));
+	LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_timeout) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_timeout));
+	LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_service_time) == 72, "found %lld\n",
+		 (long long)(int)offsetof(struct ptlrpc_body_v3, pb_service_time));
+	LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_service_time) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_service_time));
+	LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_limit) == 76, "found %lld\n",
+		 (long long)(int)offsetof(struct ptlrpc_body_v3, pb_limit));
+	LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_limit) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_limit));
+	LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_slv) == 80, "found %lld\n",
+		 (long long)(int)offsetof(struct ptlrpc_body_v3, pb_slv));
+	LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_slv) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_slv));
+	CLASSERT(PTLRPC_NUM_VERSIONS == 4);
+	LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_pre_versions) == 88, "found %lld\n",
+		 (long long)(int)offsetof(struct ptlrpc_body_v3, pb_pre_versions));
+	LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_pre_versions) == 32, "found %lld\n",
+		 (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_pre_versions));
+	LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_padding) == 120, "found %lld\n",
+		 (long long)(int)offsetof(struct ptlrpc_body_v3, pb_padding));
+	LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding) == 32, "found %lld\n",
+		 (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding));
+	CLASSERT(JOBSTATS_JOBID_SIZE == 32);
+	LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_jobid) == 152, "found %lld\n",
+		 (long long)(int)offsetof(struct ptlrpc_body_v3, pb_jobid));
+	LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_jobid) == 32, "found %lld\n",
+		 (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_jobid));
+	LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_handle) == (int)offsetof(struct ptlrpc_body_v2, pb_handle), "%d != %d\n",
+		 (int)offsetof(struct ptlrpc_body_v3, pb_handle), (int)offsetof(struct ptlrpc_body_v2, pb_handle));
+	LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_handle) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_handle), "%d != %d\n",
+		 (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_handle), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_handle));
+	LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_type) == (int)offsetof(struct ptlrpc_body_v2, pb_type), "%d != %d\n",
+		 (int)offsetof(struct ptlrpc_body_v3, pb_type), (int)offsetof(struct ptlrpc_body_v2, pb_type));
+	LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_type) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_type), "%d != %d\n",
+		 (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_type), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_type));
+	LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_version) == (int)offsetof(struct ptlrpc_body_v2, pb_version), "%d != %d\n",
+		 (int)offsetof(struct ptlrpc_body_v3, pb_version), (int)offsetof(struct ptlrpc_body_v2, pb_version));
+	LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_version) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_version), "%d != %d\n",
+		 (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_version), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_version));
+	LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_opc) == (int)offsetof(struct ptlrpc_body_v2, pb_opc), "%d != %d\n",
+		 (int)offsetof(struct ptlrpc_body_v3, pb_opc), (int)offsetof(struct ptlrpc_body_v2, pb_opc));
+	LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_opc) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_opc), "%d != %d\n",
+		 (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_opc), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_opc));
+	LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_status) == (int)offsetof(struct ptlrpc_body_v2, pb_status), "%d != %d\n",
+		 (int)offsetof(struct ptlrpc_body_v3, pb_status), (int)offsetof(struct ptlrpc_body_v2, pb_status));
+	LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_status) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_status), "%d != %d\n",
+		 (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_status), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_status));
+	LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_last_xid) == (int)offsetof(struct ptlrpc_body_v2, pb_last_xid), "%d != %d\n",
+		 (int)offsetof(struct ptlrpc_body_v3, pb_last_xid), (int)offsetof(struct ptlrpc_body_v2, pb_last_xid));
+	LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_xid) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_last_xid), "%d != %d\n",
+		 (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_xid), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_last_xid));
+	LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_last_seen) == (int)offsetof(struct ptlrpc_body_v2, pb_last_seen), "%d != %d\n",
+		 (int)offsetof(struct ptlrpc_body_v3, pb_last_seen), (int)offsetof(struct ptlrpc_body_v2, pb_last_seen));
+	LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_seen) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_last_seen), "%d != %d\n",
+		 (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_seen), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_last_seen));
+	LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_last_committed) == (int)offsetof(struct ptlrpc_body_v2, pb_last_committed), "%d != %d\n",
+		 (int)offsetof(struct ptlrpc_body_v3, pb_last_committed), (int)offsetof(struct ptlrpc_body_v2, pb_last_committed));
+	LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_committed) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_last_committed), "%d != %d\n",
+		 (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_committed), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_last_committed));
+	LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_transno) == (int)offsetof(struct ptlrpc_body_v2, pb_transno), "%d != %d\n",
+		 (int)offsetof(struct ptlrpc_body_v3, pb_transno), (int)offsetof(struct ptlrpc_body_v2, pb_transno));
+	LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_transno) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_transno), "%d != %d\n",
+		 (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_transno), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_transno));
+	LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_flags) == (int)offsetof(struct ptlrpc_body_v2, pb_flags), "%d != %d\n",
+		 (int)offsetof(struct ptlrpc_body_v3, pb_flags), (int)offsetof(struct ptlrpc_body_v2, pb_flags));
+	LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_flags) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_flags), "%d != %d\n",
+		 (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_flags), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_flags));
+	LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_op_flags) == (int)offsetof(struct ptlrpc_body_v2, pb_op_flags), "%d != %d\n",
+		 (int)offsetof(struct ptlrpc_body_v3, pb_op_flags), (int)offsetof(struct ptlrpc_body_v2, pb_op_flags));
+	LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_op_flags) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_op_flags), "%d != %d\n",
+		 (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_op_flags), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_op_flags));
+	LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_conn_cnt) == (int)offsetof(struct ptlrpc_body_v2, pb_conn_cnt), "%d != %d\n",
+		 (int)offsetof(struct ptlrpc_body_v3, pb_conn_cnt), (int)offsetof(struct ptlrpc_body_v2, pb_conn_cnt));
+	LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_conn_cnt) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_conn_cnt), "%d != %d\n",
+		 (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_conn_cnt), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_conn_cnt));
+	LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_timeout) == (int)offsetof(struct ptlrpc_body_v2, pb_timeout), "%d != %d\n",
+		 (int)offsetof(struct ptlrpc_body_v3, pb_timeout), (int)offsetof(struct ptlrpc_body_v2, pb_timeout));
+	LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_timeout) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_timeout), "%d != %d\n",
+		 (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_timeout), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_timeout));
+	LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_service_time) == (int)offsetof(struct ptlrpc_body_v2, pb_service_time), "%d != %d\n",
+		 (int)offsetof(struct ptlrpc_body_v3, pb_service_time), (int)offsetof(struct ptlrpc_body_v2, pb_service_time));
+	LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_service_time) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_service_time), "%d != %d\n",
+		 (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_service_time), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_service_time));
+	LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_limit) == (int)offsetof(struct ptlrpc_body_v2, pb_limit), "%d != %d\n",
+		 (int)offsetof(struct ptlrpc_body_v3, pb_limit), (int)offsetof(struct ptlrpc_body_v2, pb_limit));
+	LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_limit) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_limit), "%d != %d\n",
+		 (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_limit), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_limit));
+	LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_slv) == (int)offsetof(struct ptlrpc_body_v2, pb_slv), "%d != %d\n",
+		 (int)offsetof(struct ptlrpc_body_v3, pb_slv), (int)offsetof(struct ptlrpc_body_v2, pb_slv));
+	LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_slv) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_slv), "%d != %d\n",
+		 (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_slv), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_slv));
+	LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_pre_versions) == (int)offsetof(struct ptlrpc_body_v2, pb_pre_versions), "%d != %d\n",
+		 (int)offsetof(struct ptlrpc_body_v3, pb_pre_versions), (int)offsetof(struct ptlrpc_body_v2, pb_pre_versions));
+	LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_pre_versions) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_pre_versions), "%d != %d\n",
+		 (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_pre_versions), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_pre_versions));
+	LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_padding) == (int)offsetof(struct ptlrpc_body_v2, pb_padding), "%d != %d\n",
+		 (int)offsetof(struct ptlrpc_body_v3, pb_padding), (int)offsetof(struct ptlrpc_body_v2, pb_padding));
+	LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_padding), "%d != %d\n",
+		 (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_padding));
+	LASSERTF(MSG_PTLRPC_BODY_OFF == 0, "found %lld\n",
+		 (long long)MSG_PTLRPC_BODY_OFF);
+	LASSERTF(REQ_REC_OFF == 1, "found %lld\n",
+		 (long long)REQ_REC_OFF);
+	LASSERTF(REPLY_REC_OFF == 1, "found %lld\n",
+		 (long long)REPLY_REC_OFF);
+	LASSERTF(DLM_LOCKREQ_OFF == 1, "found %lld\n",
+		 (long long)DLM_LOCKREQ_OFF);
+	LASSERTF(DLM_REQ_REC_OFF == 2, "found %lld\n",
+		 (long long)DLM_REQ_REC_OFF);
+	LASSERTF(DLM_INTENT_IT_OFF == 2, "found %lld\n",
+		 (long long)DLM_INTENT_IT_OFF);
+	LASSERTF(DLM_INTENT_REC_OFF == 3, "found %lld\n",
+		 (long long)DLM_INTENT_REC_OFF);
+	LASSERTF(DLM_LOCKREPLY_OFF == 1, "found %lld\n",
+		 (long long)DLM_LOCKREPLY_OFF);
+	LASSERTF(DLM_REPLY_REC_OFF == 2, "found %lld\n",
+		 (long long)DLM_REPLY_REC_OFF);
+	LASSERTF(MSG_PTLRPC_HEADER_OFF == 31, "found %lld\n",
+		 (long long)MSG_PTLRPC_HEADER_OFF);
+	LASSERTF(PTLRPC_MSG_VERSION == 0x00000003, "found 0x%.8x\n",
+		PTLRPC_MSG_VERSION);
+	LASSERTF(LUSTRE_VERSION_MASK == 0xffff0000, "found 0x%.8x\n",
+		LUSTRE_VERSION_MASK);
+	LASSERTF(LUSTRE_OBD_VERSION == 0x00010000, "found 0x%.8x\n",
+		LUSTRE_OBD_VERSION);
+	LASSERTF(LUSTRE_MDS_VERSION == 0x00020000, "found 0x%.8x\n",
+		LUSTRE_MDS_VERSION);
+	LASSERTF(LUSTRE_OST_VERSION == 0x00030000, "found 0x%.8x\n",
+		LUSTRE_OST_VERSION);
+	LASSERTF(LUSTRE_DLM_VERSION == 0x00040000, "found 0x%.8x\n",
+		LUSTRE_DLM_VERSION);
+	LASSERTF(LUSTRE_LOG_VERSION == 0x00050000, "found 0x%.8x\n",
+		LUSTRE_LOG_VERSION);
+	LASSERTF(LUSTRE_MGS_VERSION == 0x00060000, "found 0x%.8x\n",
+		LUSTRE_MGS_VERSION);
+	LASSERTF(MSGHDR_AT_SUPPORT == 1, "found %lld\n",
+		 (long long)MSGHDR_AT_SUPPORT);
+	LASSERTF(MSGHDR_CKSUM_INCOMPAT18 == 2, "found %lld\n",
+		 (long long)MSGHDR_CKSUM_INCOMPAT18);
+	LASSERTF(MSG_OP_FLAG_MASK == 0xffff0000UL, "found 0x%.8xUL\n",
+		(unsigned)MSG_OP_FLAG_MASK);
+	LASSERTF(MSG_OP_FLAG_SHIFT == 16, "found %lld\n",
+		 (long long)MSG_OP_FLAG_SHIFT);
+	LASSERTF(MSG_GEN_FLAG_MASK == 0x0000ffffUL, "found 0x%.8xUL\n",
+		(unsigned)MSG_GEN_FLAG_MASK);
+	LASSERTF(MSG_LAST_REPLAY == 0x00000001UL, "found 0x%.8xUL\n",
+		(unsigned)MSG_LAST_REPLAY);
+	LASSERTF(MSG_RESENT == 0x00000002UL, "found 0x%.8xUL\n",
+		(unsigned)MSG_RESENT);
+	LASSERTF(MSG_REPLAY == 0x00000004UL, "found 0x%.8xUL\n",
+		(unsigned)MSG_REPLAY);
+	LASSERTF(MSG_DELAY_REPLAY == 0x00000010UL, "found 0x%.8xUL\n",
+		(unsigned)MSG_DELAY_REPLAY);
+	LASSERTF(MSG_VERSION_REPLAY == 0x00000020UL, "found 0x%.8xUL\n",
+		(unsigned)MSG_VERSION_REPLAY);
+	LASSERTF(MSG_REQ_REPLAY_DONE == 0x00000040UL, "found 0x%.8xUL\n",
+		(unsigned)MSG_REQ_REPLAY_DONE);
+	LASSERTF(MSG_LOCK_REPLAY_DONE == 0x00000080UL, "found 0x%.8xUL\n",
+		(unsigned)MSG_LOCK_REPLAY_DONE);
+	LASSERTF(MSG_CONNECT_RECOVERING == 0x00000001UL, "found 0x%.8xUL\n",
+		(unsigned)MSG_CONNECT_RECOVERING);
+	LASSERTF(MSG_CONNECT_RECONNECT == 0x00000002UL, "found 0x%.8xUL\n",
+		(unsigned)MSG_CONNECT_RECONNECT);
+	LASSERTF(MSG_CONNECT_REPLAYABLE == 0x00000004UL, "found 0x%.8xUL\n",
+		(unsigned)MSG_CONNECT_REPLAYABLE);
+	LASSERTF(MSG_CONNECT_LIBCLIENT == 0x00000010UL, "found 0x%.8xUL\n",
+		(unsigned)MSG_CONNECT_LIBCLIENT);
+	LASSERTF(MSG_CONNECT_INITIAL == 0x00000020UL, "found 0x%.8xUL\n",
+		(unsigned)MSG_CONNECT_INITIAL);
+	LASSERTF(MSG_CONNECT_ASYNC == 0x00000040UL, "found 0x%.8xUL\n",
+		(unsigned)MSG_CONNECT_ASYNC);
+	LASSERTF(MSG_CONNECT_NEXT_VER == 0x00000080UL, "found 0x%.8xUL\n",
+		(unsigned)MSG_CONNECT_NEXT_VER);
+	LASSERTF(MSG_CONNECT_TRANSNO == 0x00000100UL, "found 0x%.8xUL\n",
+		(unsigned)MSG_CONNECT_TRANSNO);
+
+	/* Checks for struct obd_connect_data */
+	LASSERTF((int)sizeof(struct obd_connect_data) == 192, "found %lld\n",
+		 (long long)(int)sizeof(struct obd_connect_data));
+	LASSERTF((int)offsetof(struct obd_connect_data, ocd_connect_flags) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_connect_data, ocd_connect_flags));
+	LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_connect_flags) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_connect_flags));
+	LASSERTF((int)offsetof(struct obd_connect_data, ocd_version) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_connect_data, ocd_version));
+	LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_version) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_version));
+	LASSERTF((int)offsetof(struct obd_connect_data, ocd_grant) == 12, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_connect_data, ocd_grant));
+	LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_grant) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_grant));
+	LASSERTF((int)offsetof(struct obd_connect_data, ocd_index) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_connect_data, ocd_index));
+	LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_index) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_index));
+	LASSERTF((int)offsetof(struct obd_connect_data, ocd_brw_size) == 20, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_connect_data, ocd_brw_size));
+	LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_brw_size) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_brw_size));
+	LASSERTF((int)offsetof(struct obd_connect_data, ocd_ibits_known) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_connect_data, ocd_ibits_known));
+	LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_ibits_known) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_ibits_known));
+	LASSERTF((int)offsetof(struct obd_connect_data, ocd_blocksize) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_connect_data, ocd_blocksize));
+	LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_blocksize) == 1, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_blocksize));
+	LASSERTF((int)offsetof(struct obd_connect_data, ocd_inodespace) == 33, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_connect_data, ocd_inodespace));
+	LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_inodespace) == 1, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_inodespace));
+	LASSERTF((int)offsetof(struct obd_connect_data, ocd_grant_extent) == 34, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_connect_data, ocd_grant_extent));
+	LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_grant_extent) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_grant_extent));
+	LASSERTF((int)offsetof(struct obd_connect_data, ocd_unused) == 36, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_connect_data, ocd_unused));
+	LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_unused) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_unused));
+	LASSERTF((int)offsetof(struct obd_connect_data, ocd_transno) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_connect_data, ocd_transno));
+	LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_transno) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_transno));
+	LASSERTF((int)offsetof(struct obd_connect_data, ocd_group) == 48, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_connect_data, ocd_group));
+	LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_group) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_group));
+	LASSERTF((int)offsetof(struct obd_connect_data, ocd_cksum_types) == 52, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_connect_data, ocd_cksum_types));
+	LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_cksum_types) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_cksum_types));
+	LASSERTF((int)offsetof(struct obd_connect_data, ocd_max_easize) == 56, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_connect_data, ocd_max_easize));
+	LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_max_easize) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_max_easize));
+	LASSERTF((int)offsetof(struct obd_connect_data, ocd_instance) == 60, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_connect_data, ocd_instance));
+	LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_instance) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_instance));
+	LASSERTF((int)offsetof(struct obd_connect_data, ocd_maxbytes) == 64, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_connect_data, ocd_maxbytes));
+	LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_maxbytes) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_maxbytes));
+	LASSERTF((int)offsetof(struct obd_connect_data, padding1) == 72, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_connect_data, padding1));
+	LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding1) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_connect_data *)0)->padding1));
+	LASSERTF((int)offsetof(struct obd_connect_data, padding2) == 80, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_connect_data, padding2));
+	LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding2) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_connect_data *)0)->padding2));
+	LASSERTF((int)offsetof(struct obd_connect_data, padding3) == 88, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_connect_data, padding3));
+	LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding3) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_connect_data *)0)->padding3));
+	LASSERTF((int)offsetof(struct obd_connect_data, padding4) == 96, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_connect_data, padding4));
+	LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding4) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_connect_data *)0)->padding4));
+	LASSERTF((int)offsetof(struct obd_connect_data, padding5) == 104, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_connect_data, padding5));
+	LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding5) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_connect_data *)0)->padding5));
+	LASSERTF((int)offsetof(struct obd_connect_data, padding6) == 112, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_connect_data, padding6));
+	LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding6) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_connect_data *)0)->padding6));
+	LASSERTF((int)offsetof(struct obd_connect_data, padding7) == 120, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_connect_data, padding7));
+	LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding7) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_connect_data *)0)->padding7));
+	LASSERTF((int)offsetof(struct obd_connect_data, padding8) == 128, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_connect_data, padding8));
+	LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding8) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_connect_data *)0)->padding8));
+	LASSERTF((int)offsetof(struct obd_connect_data, padding9) == 136, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_connect_data, padding9));
+	LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding9) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_connect_data *)0)->padding9));
+	LASSERTF((int)offsetof(struct obd_connect_data, paddingA) == 144, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_connect_data, paddingA));
+	LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingA) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingA));
+	LASSERTF((int)offsetof(struct obd_connect_data, paddingB) == 152, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_connect_data, paddingB));
+	LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingB) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingB));
+	LASSERTF((int)offsetof(struct obd_connect_data, paddingC) == 160, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_connect_data, paddingC));
+	LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingC) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingC));
+	LASSERTF((int)offsetof(struct obd_connect_data, paddingD) == 168, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_connect_data, paddingD));
+	LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingD) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingD));
+	LASSERTF((int)offsetof(struct obd_connect_data, paddingE) == 176, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_connect_data, paddingE));
+	LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingE) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingE));
+	LASSERTF((int)offsetof(struct obd_connect_data, paddingF) == 184, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_connect_data, paddingF));
+	LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingF) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingF));
+	LASSERTF(OBD_CONNECT_RDONLY == 0x1ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_RDONLY);
+	LASSERTF(OBD_CONNECT_INDEX == 0x2ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_INDEX);
+	LASSERTF(OBD_CONNECT_MDS == 0x4ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_MDS);
+	LASSERTF(OBD_CONNECT_GRANT == 0x8ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_GRANT);
+	LASSERTF(OBD_CONNECT_SRVLOCK == 0x10ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_SRVLOCK);
+	LASSERTF(OBD_CONNECT_VERSION == 0x20ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_VERSION);
+	LASSERTF(OBD_CONNECT_REQPORTAL == 0x40ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_REQPORTAL);
+	LASSERTF(OBD_CONNECT_ACL == 0x80ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_ACL);
+	LASSERTF(OBD_CONNECT_XATTR == 0x100ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_XATTR);
+	LASSERTF(OBD_CONNECT_CROW == 0x200ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_CROW);
+	LASSERTF(OBD_CONNECT_TRUNCLOCK == 0x400ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_TRUNCLOCK);
+	LASSERTF(OBD_CONNECT_TRANSNO == 0x800ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_TRANSNO);
+	LASSERTF(OBD_CONNECT_IBITS == 0x1000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_IBITS);
+	LASSERTF(OBD_CONNECT_JOIN == 0x2000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_JOIN);
+	LASSERTF(OBD_CONNECT_ATTRFID == 0x4000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_ATTRFID);
+	LASSERTF(OBD_CONNECT_NODEVOH == 0x8000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_NODEVOH);
+	LASSERTF(OBD_CONNECT_RMT_CLIENT == 0x10000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_RMT_CLIENT);
+	LASSERTF(OBD_CONNECT_RMT_CLIENT_FORCE == 0x20000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_RMT_CLIENT_FORCE);
+	LASSERTF(OBD_CONNECT_BRW_SIZE == 0x40000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_BRW_SIZE);
+	LASSERTF(OBD_CONNECT_QUOTA64 == 0x80000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_QUOTA64);
+	LASSERTF(OBD_CONNECT_MDS_CAPA == 0x100000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_MDS_CAPA);
+	LASSERTF(OBD_CONNECT_OSS_CAPA == 0x200000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_OSS_CAPA);
+	LASSERTF(OBD_CONNECT_CANCELSET == 0x400000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_CANCELSET);
+	LASSERTF(OBD_CONNECT_SOM == 0x800000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_SOM);
+	LASSERTF(OBD_CONNECT_AT == 0x1000000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_AT);
+	LASSERTF(OBD_CONNECT_LRU_RESIZE == 0x2000000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_LRU_RESIZE);
+	LASSERTF(OBD_CONNECT_MDS_MDS == 0x4000000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_MDS_MDS);
+	LASSERTF(OBD_CONNECT_REAL == 0x8000000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_REAL);
+	LASSERTF(OBD_CONNECT_CHANGE_QS == 0x10000000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_CHANGE_QS);
+	LASSERTF(OBD_CONNECT_CKSUM == 0x20000000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_CKSUM);
+	LASSERTF(OBD_CONNECT_FID == 0x40000000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_FID);
+	LASSERTF(OBD_CONNECT_VBR == 0x80000000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_VBR);
+	LASSERTF(OBD_CONNECT_LOV_V3 == 0x100000000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_LOV_V3);
+	LASSERTF(OBD_CONNECT_GRANT_SHRINK == 0x200000000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_GRANT_SHRINK);
+	LASSERTF(OBD_CONNECT_SKIP_ORPHAN == 0x400000000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_SKIP_ORPHAN);
+	LASSERTF(OBD_CONNECT_MAX_EASIZE == 0x800000000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_MAX_EASIZE);
+	LASSERTF(OBD_CONNECT_FULL20 == 0x1000000000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_FULL20);
+	LASSERTF(OBD_CONNECT_LAYOUTLOCK == 0x2000000000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_LAYOUTLOCK);
+	LASSERTF(OBD_CONNECT_64BITHASH == 0x4000000000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_64BITHASH);
+	LASSERTF(OBD_CONNECT_MAXBYTES == 0x8000000000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_MAXBYTES);
+	LASSERTF(OBD_CONNECT_IMP_RECOV == 0x10000000000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_IMP_RECOV);
+	LASSERTF(OBD_CONNECT_JOBSTATS == 0x20000000000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_JOBSTATS);
+	LASSERTF(OBD_CONNECT_UMASK == 0x40000000000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_UMASK);
+	LASSERTF(OBD_CONNECT_EINPROGRESS == 0x80000000000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_EINPROGRESS);
+	LASSERTF(OBD_CONNECT_GRANT_PARAM == 0x100000000000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_GRANT_PARAM);
+	LASSERTF(OBD_CONNECT_FLOCK_OWNER == 0x200000000000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_FLOCK_OWNER);
+	LASSERTF(OBD_CONNECT_LVB_TYPE == 0x400000000000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_LVB_TYPE);
+	LASSERTF(OBD_CONNECT_NANOSEC_TIME == 0x800000000000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_NANOSEC_TIME);
+	LASSERTF(OBD_CONNECT_LIGHTWEIGHT == 0x1000000000000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_LIGHTWEIGHT);
+	LASSERTF(OBD_CONNECT_SHORTIO == 0x2000000000000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_SHORTIO);
+	LASSERTF(OBD_CONNECT_PINGLESS == 0x4000000000000ULL, "found 0x%.16llxULL\n",
+		 OBD_CONNECT_PINGLESS);
+	LASSERTF(OBD_CKSUM_CRC32 == 0x00000001UL, "found 0x%.8xUL\n",
+		(unsigned)OBD_CKSUM_CRC32);
+	LASSERTF(OBD_CKSUM_ADLER == 0x00000002UL, "found 0x%.8xUL\n",
+		(unsigned)OBD_CKSUM_ADLER);
+	LASSERTF(OBD_CKSUM_CRC32C == 0x00000004UL, "found 0x%.8xUL\n",
+		(unsigned)OBD_CKSUM_CRC32C);
+
+	/* Checks for struct obdo */
+	LASSERTF((int)sizeof(struct obdo) == 208, "found %lld\n",
+		 (long long)(int)sizeof(struct obdo));
+	LASSERTF((int)offsetof(struct obdo, o_valid) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_valid));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_valid) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_valid));
+	LASSERTF((int)offsetof(struct obdo, o_oi) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_oi));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_oi) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_oi));
+	LASSERTF((int)offsetof(struct obdo, o_parent_seq) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_parent_seq));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_parent_seq) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_parent_seq));
+	LASSERTF((int)offsetof(struct obdo, o_size) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_size));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_size) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_size));
+	LASSERTF((int)offsetof(struct obdo, o_mtime) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_mtime));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_mtime) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_mtime));
+	LASSERTF((int)offsetof(struct obdo, o_atime) == 48, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_atime));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_atime) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_atime));
+	LASSERTF((int)offsetof(struct obdo, o_ctime) == 56, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_ctime));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_ctime) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_ctime));
+	LASSERTF((int)offsetof(struct obdo, o_blocks) == 64, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_blocks));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_blocks) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_blocks));
+	LASSERTF((int)offsetof(struct obdo, o_grant) == 72, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_grant));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_grant) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_grant));
+	LASSERTF((int)offsetof(struct obdo, o_blksize) == 80, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_blksize));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_blksize) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_blksize));
+	LASSERTF((int)offsetof(struct obdo, o_mode) == 84, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_mode));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_mode) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_mode));
+	LASSERTF((int)offsetof(struct obdo, o_uid) == 88, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_uid));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_uid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_uid));
+	LASSERTF((int)offsetof(struct obdo, o_gid) == 92, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_gid));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_gid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_gid));
+	LASSERTF((int)offsetof(struct obdo, o_flags) == 96, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_flags));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_flags) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_flags));
+	LASSERTF((int)offsetof(struct obdo, o_nlink) == 100, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_nlink));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_nlink) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_nlink));
+	LASSERTF((int)offsetof(struct obdo, o_parent_oid) == 104, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_parent_oid));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_parent_oid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_parent_oid));
+	LASSERTF((int)offsetof(struct obdo, o_misc) == 108, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_misc));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_misc) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_misc));
+	LASSERTF((int)offsetof(struct obdo, o_ioepoch) == 112, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_ioepoch));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_ioepoch) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_ioepoch));
+	LASSERTF((int)offsetof(struct obdo, o_stripe_idx) == 120, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_stripe_idx));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_stripe_idx) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_stripe_idx));
+	LASSERTF((int)offsetof(struct obdo, o_parent_ver) == 124, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_parent_ver));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_parent_ver) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_parent_ver));
+	LASSERTF((int)offsetof(struct obdo, o_handle) == 128, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_handle));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_handle) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_handle));
+	LASSERTF((int)offsetof(struct obdo, o_lcookie) == 136, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_lcookie));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_lcookie) == 32, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_lcookie));
+	LASSERTF((int)offsetof(struct obdo, o_uid_h) == 168, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_uid_h));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_uid_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_uid_h));
+	LASSERTF((int)offsetof(struct obdo, o_gid_h) == 172, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_gid_h));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_gid_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_gid_h));
+	LASSERTF((int)offsetof(struct obdo, o_data_version) == 176, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_data_version));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_data_version) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_data_version));
+	LASSERTF((int)offsetof(struct obdo, o_padding_4) == 184, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_padding_4));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_padding_4) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_padding_4));
+	LASSERTF((int)offsetof(struct obdo, o_padding_5) == 192, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_padding_5));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_padding_5) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_padding_5));
+	LASSERTF((int)offsetof(struct obdo, o_padding_6) == 200, "found %lld\n",
+		 (long long)(int)offsetof(struct obdo, o_padding_6));
+	LASSERTF((int)sizeof(((struct obdo *)0)->o_padding_6) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obdo *)0)->o_padding_6));
+	LASSERTF(OBD_MD_FLID == (0x00000001ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLID);
+	LASSERTF(OBD_MD_FLATIME == (0x00000002ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLATIME);
+	LASSERTF(OBD_MD_FLMTIME == (0x00000004ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLMTIME);
+	LASSERTF(OBD_MD_FLCTIME == (0x00000008ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLCTIME);
+	LASSERTF(OBD_MD_FLSIZE == (0x00000010ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLSIZE);
+	LASSERTF(OBD_MD_FLBLOCKS == (0x00000020ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLBLOCKS);
+	LASSERTF(OBD_MD_FLBLKSZ == (0x00000040ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLBLKSZ);
+	LASSERTF(OBD_MD_FLMODE == (0x00000080ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLMODE);
+	LASSERTF(OBD_MD_FLTYPE == (0x00000100ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLTYPE);
+	LASSERTF(OBD_MD_FLUID == (0x00000200ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLUID);
+	LASSERTF(OBD_MD_FLGID == (0x00000400ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLGID);
+	LASSERTF(OBD_MD_FLFLAGS == (0x00000800ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLFLAGS);
+	LASSERTF(OBD_MD_FLNLINK == (0x00002000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLNLINK);
+	LASSERTF(OBD_MD_FLGENER == (0x00004000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLGENER);
+	LASSERTF(OBD_MD_FLRDEV == (0x00010000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLRDEV);
+	LASSERTF(OBD_MD_FLEASIZE == (0x00020000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLEASIZE);
+	LASSERTF(OBD_MD_LINKNAME == (0x00040000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_LINKNAME);
+	LASSERTF(OBD_MD_FLHANDLE == (0x00080000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLHANDLE);
+	LASSERTF(OBD_MD_FLCKSUM == (0x00100000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLCKSUM);
+	LASSERTF(OBD_MD_FLQOS == (0x00200000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLQOS);
+	LASSERTF(OBD_MD_FLCOOKIE == (0x00800000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLCOOKIE);
+	LASSERTF(OBD_MD_FLGROUP == (0x01000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLGROUP);
+	LASSERTF(OBD_MD_FLFID == (0x02000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLFID);
+	LASSERTF(OBD_MD_FLEPOCH == (0x04000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLEPOCH);
+	LASSERTF(OBD_MD_FLGRANT == (0x08000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLGRANT);
+	LASSERTF(OBD_MD_FLDIREA == (0x10000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLDIREA);
+	LASSERTF(OBD_MD_FLUSRQUOTA == (0x20000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLUSRQUOTA);
+	LASSERTF(OBD_MD_FLGRPQUOTA == (0x40000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLGRPQUOTA);
+	LASSERTF(OBD_MD_FLMODEASIZE == (0x80000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLMODEASIZE);
+	LASSERTF(OBD_MD_MDS == (0x0000000100000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_MDS);
+	LASSERTF(OBD_MD_REINT == (0x0000000200000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_REINT);
+	LASSERTF(OBD_MD_MEA == (0x0000000400000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_MEA);
+	LASSERTF(OBD_MD_FLXATTR == (0x0000001000000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLXATTR);
+	LASSERTF(OBD_MD_FLXATTRLS == (0x0000002000000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLXATTRLS);
+	LASSERTF(OBD_MD_FLXATTRRM == (0x0000004000000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLXATTRRM);
+	LASSERTF(OBD_MD_FLACL == (0x0000008000000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLACL);
+	LASSERTF(OBD_MD_FLRMTPERM == (0x0000010000000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLRMTPERM);
+	LASSERTF(OBD_MD_FLMDSCAPA == (0x0000020000000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLMDSCAPA);
+	LASSERTF(OBD_MD_FLOSSCAPA == (0x0000040000000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLOSSCAPA);
+	LASSERTF(OBD_MD_FLCKSPLIT == (0x0000080000000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLCKSPLIT);
+	LASSERTF(OBD_MD_FLCROSSREF == (0x0000100000000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLCROSSREF);
+	LASSERTF(OBD_MD_FLGETATTRLOCK == (0x0000200000000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLGETATTRLOCK);
+	LASSERTF(OBD_MD_FLRMTLSETFACL == (0x0001000000000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLRMTLSETFACL);
+	LASSERTF(OBD_MD_FLRMTLGETFACL == (0x0002000000000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLRMTLGETFACL);
+	LASSERTF(OBD_MD_FLRMTRSETFACL == (0x0004000000000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLRMTRSETFACL);
+	LASSERTF(OBD_MD_FLRMTRGETFACL == (0x0008000000000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLRMTRGETFACL);
+	LASSERTF(OBD_MD_FLDATAVERSION == (0x0010000000000000ULL), "found 0x%.16llxULL\n",
+		 OBD_MD_FLDATAVERSION);
+	CLASSERT(OBD_FL_INLINEDATA == 0x00000001);
+	CLASSERT(OBD_FL_OBDMDEXISTS == 0x00000002);
+	CLASSERT(OBD_FL_DELORPHAN == 0x00000004);
+	CLASSERT(OBD_FL_NORPC == 0x00000008);
+	CLASSERT(OBD_FL_IDONLY == 0x00000010);
+	CLASSERT(OBD_FL_RECREATE_OBJS == 0x00000020);
+	CLASSERT(OBD_FL_DEBUG_CHECK == 0x00000040);
+	CLASSERT(OBD_FL_NO_USRQUOTA == 0x00000100);
+	CLASSERT(OBD_FL_NO_GRPQUOTA == 0x00000200);
+	CLASSERT(OBD_FL_CREATE_CROW == 0x00000400);
+	CLASSERT(OBD_FL_SRVLOCK == 0x00000800);
+	CLASSERT(OBD_FL_CKSUM_CRC32 == 0x00001000);
+	CLASSERT(OBD_FL_CKSUM_ADLER == 0x00002000);
+	CLASSERT(OBD_FL_CKSUM_CRC32C == 0x00004000);
+	CLASSERT(OBD_FL_CKSUM_RSVD2 == 0x00008000);
+	CLASSERT(OBD_FL_CKSUM_RSVD3 == 0x00010000);
+	CLASSERT(OBD_FL_SHRINK_GRANT == 0x00020000);
+	CLASSERT(OBD_FL_MMAP == 0x00040000);
+	CLASSERT(OBD_FL_RECOV_RESEND == 0x00080000);
+	CLASSERT(OBD_FL_NOSPC_BLK == 0x00100000);
+	CLASSERT(OBD_FL_LOCAL_MASK == 0xf0000000);
+
+	/* Checks for struct lov_ost_data_v1 */
+	LASSERTF((int)sizeof(struct lov_ost_data_v1) == 24, "found %lld\n",
+		 (long long)(int)sizeof(struct lov_ost_data_v1));
+	LASSERTF((int)offsetof(struct lov_ost_data_v1, l_ost_oi) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_ost_data_v1, l_ost_oi));
+	LASSERTF((int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_oi) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_oi));
+	LASSERTF((int)offsetof(struct lov_ost_data_v1, l_ost_gen) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_ost_data_v1, l_ost_gen));
+	LASSERTF((int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_gen) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_gen));
+	LASSERTF((int)offsetof(struct lov_ost_data_v1, l_ost_idx) == 20, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_ost_data_v1, l_ost_idx));
+	LASSERTF((int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_idx) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_idx));
+
+	/* Checks for struct lov_mds_md_v1 */
+	LASSERTF((int)sizeof(struct lov_mds_md_v1) == 32, "found %lld\n",
+		 (long long)(int)sizeof(struct lov_mds_md_v1));
+	LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_magic) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_mds_md_v1, lmm_magic));
+	LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_magic) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_magic));
+	LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_pattern) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_mds_md_v1, lmm_pattern));
+	LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_pattern) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_pattern));
+	LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_oi) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_mds_md_v1, lmm_oi));
+	LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_oi) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_oi));
+	LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_stripe_size) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_mds_md_v1, lmm_stripe_size));
+	LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_stripe_size) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_stripe_size));
+	LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_stripe_count) == 28, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_mds_md_v1, lmm_stripe_count));
+	LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_stripe_count) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_stripe_count));
+	LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_layout_gen) == 30, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_mds_md_v1, lmm_layout_gen));
+	LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_layout_gen) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_layout_gen));
+	LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_objects[0]) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_mds_md_v1, lmm_objects[0]));
+	LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_objects[0]) == 24, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_objects[0]));
+	CLASSERT(LOV_MAGIC_V1 == 0x0BD10BD0);
+
+	/* Checks for struct lov_mds_md_v3 */
+	LASSERTF((int)sizeof(struct lov_mds_md_v3) == 48, "found %lld\n",
+		 (long long)(int)sizeof(struct lov_mds_md_v3));
+	LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_magic) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_mds_md_v3, lmm_magic));
+	LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_magic) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_magic));
+	LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_pattern) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_mds_md_v3, lmm_pattern));
+	LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_pattern) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_pattern));
+	LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_oi) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_mds_md_v3, lmm_oi));
+	LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_oi) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_oi));
+	LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_stripe_size) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_mds_md_v3, lmm_stripe_size));
+	LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_stripe_size) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_stripe_size));
+	LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_stripe_count) == 28, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_mds_md_v3, lmm_stripe_count));
+	LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_stripe_count) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_stripe_count));
+	LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_layout_gen) == 30, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_mds_md_v3, lmm_layout_gen));
+	LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_layout_gen) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_layout_gen));
+	CLASSERT(LOV_MAXPOOLNAME == 16);
+	LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_pool_name[16]) == 48, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_mds_md_v3, lmm_pool_name[16]));
+	LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_pool_name[16]) == 1, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_pool_name[16]));
+	LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_objects[0]) == 48, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_mds_md_v3, lmm_objects[0]));
+	LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_objects[0]) == 24, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_objects[0]));
+	CLASSERT(LOV_MAGIC_V3 == 0x0BD30BD0);
+	LASSERTF(LOV_PATTERN_RAID0 == 0x00000001UL, "found 0x%.8xUL\n",
+		(unsigned)LOV_PATTERN_RAID0);
+	LASSERTF(LOV_PATTERN_RAID1 == 0x00000002UL, "found 0x%.8xUL\n",
+		(unsigned)LOV_PATTERN_RAID1);
+	LASSERTF(LOV_PATTERN_FIRST == 0x00000100UL, "found 0x%.8xUL\n",
+		(unsigned)LOV_PATTERN_FIRST);
+	LASSERTF(LOV_PATTERN_CMOBD == 0x00000200UL, "found 0x%.8xUL\n",
+		(unsigned)LOV_PATTERN_CMOBD);
+
+	/* Checks for struct obd_statfs */
+	LASSERTF((int)sizeof(struct obd_statfs) == 144, "found %lld\n",
+		 (long long)(int)sizeof(struct obd_statfs));
+	LASSERTF((int)offsetof(struct obd_statfs, os_type) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_statfs, os_type));
+	LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_type) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_statfs *)0)->os_type));
+	LASSERTF((int)offsetof(struct obd_statfs, os_blocks) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_statfs, os_blocks));
+	LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_blocks) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_statfs *)0)->os_blocks));
+	LASSERTF((int)offsetof(struct obd_statfs, os_bfree) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_statfs, os_bfree));
+	LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_bfree) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_statfs *)0)->os_bfree));
+	LASSERTF((int)offsetof(struct obd_statfs, os_bavail) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_statfs, os_bavail));
+	LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_bavail) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_statfs *)0)->os_bavail));
+	LASSERTF((int)offsetof(struct obd_statfs, os_ffree) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_statfs, os_ffree));
+	LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_ffree) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_statfs *)0)->os_ffree));
+	LASSERTF((int)offsetof(struct obd_statfs, os_fsid) == 48, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_statfs, os_fsid));
+	LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_fsid) == 40, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_statfs *)0)->os_fsid));
+	LASSERTF((int)offsetof(struct obd_statfs, os_bsize) == 88, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_statfs, os_bsize));
+	LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_bsize) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_statfs *)0)->os_bsize));
+	LASSERTF((int)offsetof(struct obd_statfs, os_namelen) == 92, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_statfs, os_namelen));
+	LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_namelen) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_statfs *)0)->os_namelen));
+	LASSERTF((int)offsetof(struct obd_statfs, os_state) == 104, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_statfs, os_state));
+	LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_state) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_statfs *)0)->os_state));
+	LASSERTF((int)offsetof(struct obd_statfs, os_fprecreated) == 108, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_statfs, os_fprecreated));
+	LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_fprecreated) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_statfs *)0)->os_fprecreated));
+	LASSERTF((int)offsetof(struct obd_statfs, os_spare2) == 112, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_statfs, os_spare2));
+	LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare2) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare2));
+	LASSERTF((int)offsetof(struct obd_statfs, os_spare3) == 116, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_statfs, os_spare3));
+	LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare3) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare3));
+	LASSERTF((int)offsetof(struct obd_statfs, os_spare4) == 120, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_statfs, os_spare4));
+	LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare4) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare4));
+	LASSERTF((int)offsetof(struct obd_statfs, os_spare5) == 124, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_statfs, os_spare5));
+	LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare5) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare5));
+	LASSERTF((int)offsetof(struct obd_statfs, os_spare6) == 128, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_statfs, os_spare6));
+	LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare6) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare6));
+	LASSERTF((int)offsetof(struct obd_statfs, os_spare7) == 132, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_statfs, os_spare7));
+	LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare7) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare7));
+	LASSERTF((int)offsetof(struct obd_statfs, os_spare8) == 136, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_statfs, os_spare8));
+	LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare8) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare8));
+	LASSERTF((int)offsetof(struct obd_statfs, os_spare9) == 140, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_statfs, os_spare9));
+	LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare9) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare9));
+
+	/* Checks for struct obd_ioobj */
+	LASSERTF((int)sizeof(struct obd_ioobj) == 24, "found %lld\n",
+		 (long long)(int)sizeof(struct obd_ioobj));
+	LASSERTF((int)offsetof(struct obd_ioobj, ioo_oid) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_ioobj, ioo_oid));
+	LASSERTF((int)sizeof(((struct obd_ioobj *)0)->ioo_oid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_ioobj *)0)->ioo_oid));
+	LASSERTF((int)offsetof(struct obd_ioobj, ioo_max_brw) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_ioobj, ioo_max_brw));
+	LASSERTF((int)sizeof(((struct obd_ioobj *)0)->ioo_max_brw) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_ioobj *)0)->ioo_max_brw));
+	LASSERTF((int)offsetof(struct obd_ioobj, ioo_bufcnt) == 20, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_ioobj, ioo_bufcnt));
+	LASSERTF((int)sizeof(((struct obd_ioobj *)0)->ioo_bufcnt) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_ioobj *)0)->ioo_bufcnt));
+
+	/* Checks for union lquota_id */
+	LASSERTF((int)sizeof(union lquota_id) == 16, "found %lld\n",
+		 (long long)(int)sizeof(union lquota_id));
+
+	LASSERTF(QUOTABLOCK_BITS == 10, "found %lld\n",
+		 (long long)QUOTABLOCK_BITS);
+	LASSERTF(QUOTABLOCK_SIZE == 1024, "found %lld\n",
+		 (long long)QUOTABLOCK_SIZE);
+
+	/* Checks for struct obd_quotactl */
+	LASSERTF((int)sizeof(struct obd_quotactl) == 112, "found %lld\n",
+		 (long long)(int)sizeof(struct obd_quotactl));
+	LASSERTF((int)offsetof(struct obd_quotactl, qc_cmd) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_quotactl, qc_cmd));
+	LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_cmd) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_quotactl *)0)->qc_cmd));
+	LASSERTF((int)offsetof(struct obd_quotactl, qc_type) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_quotactl, qc_type));
+	LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_type) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_quotactl *)0)->qc_type));
+	LASSERTF((int)offsetof(struct obd_quotactl, qc_id) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_quotactl, qc_id));
+	LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_id) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_quotactl *)0)->qc_id));
+	LASSERTF((int)offsetof(struct obd_quotactl, qc_stat) == 12, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_quotactl, qc_stat));
+	LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_stat) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_quotactl *)0)->qc_stat));
+	LASSERTF((int)offsetof(struct obd_quotactl, qc_dqinfo) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_quotactl, qc_dqinfo));
+	LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_dqinfo) == 24, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_quotactl *)0)->qc_dqinfo));
+	LASSERTF((int)offsetof(struct obd_quotactl, qc_dqblk) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_quotactl, qc_dqblk));
+	LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_dqblk) == 72, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_quotactl *)0)->qc_dqblk));
+
+	/* Checks for struct obd_dqinfo */
+	LASSERTF((int)sizeof(struct obd_dqinfo) == 24, "found %lld\n",
+		 (long long)(int)sizeof(struct obd_dqinfo));
+	LASSERTF((int)offsetof(struct obd_dqinfo, dqi_bgrace) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_dqinfo, dqi_bgrace));
+	LASSERTF((int)sizeof(((struct obd_dqinfo *)0)->dqi_bgrace) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_dqinfo *)0)->dqi_bgrace));
+	LASSERTF((int)offsetof(struct obd_dqinfo, dqi_igrace) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_dqinfo, dqi_igrace));
+	LASSERTF((int)sizeof(((struct obd_dqinfo *)0)->dqi_igrace) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_dqinfo *)0)->dqi_igrace));
+	LASSERTF((int)offsetof(struct obd_dqinfo, dqi_flags) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_dqinfo, dqi_flags));
+	LASSERTF((int)sizeof(((struct obd_dqinfo *)0)->dqi_flags) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_dqinfo *)0)->dqi_flags));
+	LASSERTF((int)offsetof(struct obd_dqinfo, dqi_valid) == 20, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_dqinfo, dqi_valid));
+	LASSERTF((int)sizeof(((struct obd_dqinfo *)0)->dqi_valid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_dqinfo *)0)->dqi_valid));
+
+	/* Checks for struct obd_dqblk */
+	LASSERTF((int)sizeof(struct obd_dqblk) == 72, "found %lld\n",
+		 (long long)(int)sizeof(struct obd_dqblk));
+	LASSERTF((int)offsetof(struct obd_dqblk, dqb_bhardlimit) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_dqblk, dqb_bhardlimit));
+	LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_bhardlimit) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_bhardlimit));
+	LASSERTF((int)offsetof(struct obd_dqblk, dqb_bsoftlimit) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_dqblk, dqb_bsoftlimit));
+	LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_bsoftlimit) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_bsoftlimit));
+	LASSERTF((int)offsetof(struct obd_dqblk, dqb_curspace) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_dqblk, dqb_curspace));
+	LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_curspace) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_curspace));
+	LASSERTF((int)offsetof(struct obd_dqblk, dqb_ihardlimit) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_dqblk, dqb_ihardlimit));
+	LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_ihardlimit) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_ihardlimit));
+	LASSERTF((int)offsetof(struct obd_dqblk, dqb_isoftlimit) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_dqblk, dqb_isoftlimit));
+	LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_isoftlimit) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_isoftlimit));
+	LASSERTF((int)offsetof(struct obd_dqblk, dqb_curinodes) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_dqblk, dqb_curinodes));
+	LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_curinodes) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_curinodes));
+	LASSERTF((int)offsetof(struct obd_dqblk, dqb_btime) == 48, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_dqblk, dqb_btime));
+	LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_btime) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_btime));
+	LASSERTF((int)offsetof(struct obd_dqblk, dqb_itime) == 56, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_dqblk, dqb_itime));
+	LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_itime) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_itime));
+	LASSERTF((int)offsetof(struct obd_dqblk, dqb_valid) == 64, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_dqblk, dqb_valid));
+	LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_valid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_valid));
+	LASSERTF((int)offsetof(struct obd_dqblk, dqb_padding) == 68, "found %lld\n",
+		 (long long)(int)offsetof(struct obd_dqblk, dqb_padding));
+	LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_padding) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_padding));
+	LASSERTF(Q_QUOTACHECK == 0x800100, "found 0x%.8x\n",
+		Q_QUOTACHECK);
+	LASSERTF(Q_INITQUOTA == 0x800101, "found 0x%.8x\n",
+		Q_INITQUOTA);
+	LASSERTF(Q_GETOINFO == 0x800102, "found 0x%.8x\n",
+		Q_GETOINFO);
+	LASSERTF(Q_GETOQUOTA == 0x800103, "found 0x%.8x\n",
+		Q_GETOQUOTA);
+	LASSERTF(Q_FINVALIDATE == 0x800104, "found 0x%.8x\n",
+		Q_FINVALIDATE);
+
+	/* Checks for struct lquota_acct_rec */
+	LASSERTF((int)sizeof(struct lquota_acct_rec) == 16, "found %lld\n",
+		 (long long)(int)sizeof(struct lquota_acct_rec));
+	LASSERTF((int)offsetof(struct lquota_acct_rec, bspace) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct lquota_acct_rec, bspace));
+	LASSERTF((int)sizeof(((struct lquota_acct_rec *)0)->bspace) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lquota_acct_rec *)0)->bspace));
+	LASSERTF((int)offsetof(struct lquota_acct_rec, ispace) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct lquota_acct_rec, ispace));
+	LASSERTF((int)sizeof(((struct lquota_acct_rec *)0)->ispace) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lquota_acct_rec *)0)->ispace));
+
+	/* Checks for struct lquota_glb_rec */
+	LASSERTF((int)sizeof(struct lquota_glb_rec) == 32, "found %lld\n",
+		 (long long)(int)sizeof(struct lquota_glb_rec));
+	LASSERTF((int)offsetof(struct lquota_glb_rec, qbr_hardlimit) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct lquota_glb_rec, qbr_hardlimit));
+	LASSERTF((int)sizeof(((struct lquota_glb_rec *)0)->qbr_hardlimit) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lquota_glb_rec *)0)->qbr_hardlimit));
+	LASSERTF((int)offsetof(struct lquota_glb_rec, qbr_softlimit) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct lquota_glb_rec, qbr_softlimit));
+	LASSERTF((int)sizeof(((struct lquota_glb_rec *)0)->qbr_softlimit) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lquota_glb_rec *)0)->qbr_softlimit));
+	LASSERTF((int)offsetof(struct lquota_glb_rec, qbr_time) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct lquota_glb_rec, qbr_time));
+	LASSERTF((int)sizeof(((struct lquota_glb_rec *)0)->qbr_time) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lquota_glb_rec *)0)->qbr_time));
+	LASSERTF((int)offsetof(struct lquota_glb_rec, qbr_granted) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct lquota_glb_rec, qbr_granted));
+	LASSERTF((int)sizeof(((struct lquota_glb_rec *)0)->qbr_granted) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lquota_glb_rec *)0)->qbr_granted));
+
+	/* Checks for struct lquota_slv_rec */
+	LASSERTF((int)sizeof(struct lquota_slv_rec) == 8, "found %lld\n",
+		 (long long)(int)sizeof(struct lquota_slv_rec));
+	LASSERTF((int)offsetof(struct lquota_slv_rec, qsr_granted) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct lquota_slv_rec, qsr_granted));
+	LASSERTF((int)sizeof(((struct lquota_slv_rec *)0)->qsr_granted) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lquota_slv_rec *)0)->qsr_granted));
+
+	/* Checks for struct idx_info */
+	LASSERTF((int)sizeof(struct idx_info) == 80, "found %lld\n",
+		 (long long)(int)sizeof(struct idx_info));
+	LASSERTF((int)offsetof(struct idx_info, ii_magic) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct idx_info, ii_magic));
+	LASSERTF((int)sizeof(((struct idx_info *)0)->ii_magic) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct idx_info *)0)->ii_magic));
+	LASSERTF((int)offsetof(struct idx_info, ii_flags) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct idx_info, ii_flags));
+	LASSERTF((int)sizeof(((struct idx_info *)0)->ii_flags) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct idx_info *)0)->ii_flags));
+	LASSERTF((int)offsetof(struct idx_info, ii_count) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct idx_info, ii_count));
+	LASSERTF((int)sizeof(((struct idx_info *)0)->ii_count) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct idx_info *)0)->ii_count));
+	LASSERTF((int)offsetof(struct idx_info, ii_pad0) == 10, "found %lld\n",
+		 (long long)(int)offsetof(struct idx_info, ii_pad0));
+	LASSERTF((int)sizeof(((struct idx_info *)0)->ii_pad0) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct idx_info *)0)->ii_pad0));
+	LASSERTF((int)offsetof(struct idx_info, ii_attrs) == 12, "found %lld\n",
+		 (long long)(int)offsetof(struct idx_info, ii_attrs));
+	LASSERTF((int)sizeof(((struct idx_info *)0)->ii_attrs) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct idx_info *)0)->ii_attrs));
+	LASSERTF((int)offsetof(struct idx_info, ii_fid) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct idx_info, ii_fid));
+	LASSERTF((int)sizeof(((struct idx_info *)0)->ii_fid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct idx_info *)0)->ii_fid));
+	LASSERTF((int)offsetof(struct idx_info, ii_version) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct idx_info, ii_version));
+	LASSERTF((int)sizeof(((struct idx_info *)0)->ii_version) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct idx_info *)0)->ii_version));
+	LASSERTF((int)offsetof(struct idx_info, ii_hash_start) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct idx_info, ii_hash_start));
+	LASSERTF((int)sizeof(((struct idx_info *)0)->ii_hash_start) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct idx_info *)0)->ii_hash_start));
+	LASSERTF((int)offsetof(struct idx_info, ii_hash_end) == 48, "found %lld\n",
+		 (long long)(int)offsetof(struct idx_info, ii_hash_end));
+	LASSERTF((int)sizeof(((struct idx_info *)0)->ii_hash_end) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct idx_info *)0)->ii_hash_end));
+	LASSERTF((int)offsetof(struct idx_info, ii_keysize) == 56, "found %lld\n",
+		 (long long)(int)offsetof(struct idx_info, ii_keysize));
+	LASSERTF((int)sizeof(((struct idx_info *)0)->ii_keysize) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct idx_info *)0)->ii_keysize));
+	LASSERTF((int)offsetof(struct idx_info, ii_recsize) == 58, "found %lld\n",
+		 (long long)(int)offsetof(struct idx_info, ii_recsize));
+	LASSERTF((int)sizeof(((struct idx_info *)0)->ii_recsize) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct idx_info *)0)->ii_recsize));
+	LASSERTF((int)offsetof(struct idx_info, ii_pad1) == 60, "found %lld\n",
+		 (long long)(int)offsetof(struct idx_info, ii_pad1));
+	LASSERTF((int)sizeof(((struct idx_info *)0)->ii_pad1) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct idx_info *)0)->ii_pad1));
+	LASSERTF((int)offsetof(struct idx_info, ii_pad2) == 64, "found %lld\n",
+		 (long long)(int)offsetof(struct idx_info, ii_pad2));
+	LASSERTF((int)sizeof(((struct idx_info *)0)->ii_pad2) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct idx_info *)0)->ii_pad2));
+	LASSERTF((int)offsetof(struct idx_info, ii_pad3) == 72, "found %lld\n",
+		 (long long)(int)offsetof(struct idx_info, ii_pad3));
+	LASSERTF((int)sizeof(((struct idx_info *)0)->ii_pad3) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct idx_info *)0)->ii_pad3));
+	CLASSERT(IDX_INFO_MAGIC == 0x3D37CC37);
+
+	/* Checks for struct lu_idxpage */
+	LASSERTF((int)sizeof(struct lu_idxpage) == 16, "found %lld\n",
+		 (long long)(int)sizeof(struct lu_idxpage));
+	LASSERTF((int)offsetof(struct lu_idxpage, lip_magic) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct lu_idxpage, lip_magic));
+	LASSERTF((int)sizeof(((struct lu_idxpage *)0)->lip_magic) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lu_idxpage *)0)->lip_magic));
+	LASSERTF((int)offsetof(struct lu_idxpage, lip_flags) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct lu_idxpage, lip_flags));
+	LASSERTF((int)sizeof(((struct lu_idxpage *)0)->lip_flags) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct lu_idxpage *)0)->lip_flags));
+	LASSERTF((int)offsetof(struct lu_idxpage, lip_nr) == 6, "found %lld\n",
+		 (long long)(int)offsetof(struct lu_idxpage, lip_nr));
+	LASSERTF((int)sizeof(((struct lu_idxpage *)0)->lip_nr) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct lu_idxpage *)0)->lip_nr));
+	LASSERTF((int)offsetof(struct lu_idxpage, lip_pad0) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct lu_idxpage, lip_pad0));
+	LASSERTF((int)sizeof(((struct lu_idxpage *)0)->lip_pad0) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lu_idxpage *)0)->lip_pad0));
+	CLASSERT(LIP_MAGIC == 0x8A6D6B6C);
+	LASSERTF(LIP_HDR_SIZE == 16, "found %lld\n",
+		 (long long)LIP_HDR_SIZE);
+	LASSERTF(II_FL_NOHASH == 1, "found %lld\n",
+		 (long long)II_FL_NOHASH);
+	LASSERTF(II_FL_VARKEY == 2, "found %lld\n",
+		 (long long)II_FL_VARKEY);
+	LASSERTF(II_FL_VARREC == 4, "found %lld\n",
+		 (long long)II_FL_VARREC);
+	LASSERTF(II_FL_NONUNQ == 8, "found %lld\n",
+		 (long long)II_FL_NONUNQ);
+
+	/* Checks for struct niobuf_remote */
+	LASSERTF((int)sizeof(struct niobuf_remote) == 16, "found %lld\n",
+		 (long long)(int)sizeof(struct niobuf_remote));
+	LASSERTF((int)offsetof(struct niobuf_remote, offset) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct niobuf_remote, offset));
+	LASSERTF((int)sizeof(((struct niobuf_remote *)0)->offset) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct niobuf_remote *)0)->offset));
+	LASSERTF((int)offsetof(struct niobuf_remote, len) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct niobuf_remote, len));
+	LASSERTF((int)sizeof(((struct niobuf_remote *)0)->len) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct niobuf_remote *)0)->len));
+	LASSERTF((int)offsetof(struct niobuf_remote, flags) == 12, "found %lld\n",
+		 (long long)(int)offsetof(struct niobuf_remote, flags));
+	LASSERTF((int)sizeof(((struct niobuf_remote *)0)->flags) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct niobuf_remote *)0)->flags));
+	LASSERTF(OBD_BRW_READ == 0x01, "found 0x%.8x\n",
+		OBD_BRW_READ);
+	LASSERTF(OBD_BRW_WRITE == 0x02, "found 0x%.8x\n",
+		OBD_BRW_WRITE);
+	LASSERTF(OBD_BRW_SYNC == 0x08, "found 0x%.8x\n",
+		OBD_BRW_SYNC);
+	LASSERTF(OBD_BRW_CHECK == 0x10, "found 0x%.8x\n",
+		OBD_BRW_CHECK);
+	LASSERTF(OBD_BRW_FROM_GRANT == 0x20, "found 0x%.8x\n",
+		OBD_BRW_FROM_GRANT);
+	LASSERTF(OBD_BRW_GRANTED == 0x40, "found 0x%.8x\n",
+		OBD_BRW_GRANTED);
+	LASSERTF(OBD_BRW_NOCACHE == 0x80, "found 0x%.8x\n",
+		OBD_BRW_NOCACHE);
+	LASSERTF(OBD_BRW_NOQUOTA == 0x100, "found 0x%.8x\n",
+		OBD_BRW_NOQUOTA);
+	LASSERTF(OBD_BRW_SRVLOCK == 0x200, "found 0x%.8x\n",
+		OBD_BRW_SRVLOCK);
+	LASSERTF(OBD_BRW_ASYNC == 0x400, "found 0x%.8x\n",
+		OBD_BRW_ASYNC);
+	LASSERTF(OBD_BRW_MEMALLOC == 0x800, "found 0x%.8x\n",
+		OBD_BRW_MEMALLOC);
+
+	/* Checks for struct ost_body */
+	LASSERTF((int)sizeof(struct ost_body) == 208, "found %lld\n",
+		 (long long)(int)sizeof(struct ost_body));
+	LASSERTF((int)offsetof(struct ost_body, oa) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct ost_body, oa));
+	LASSERTF((int)sizeof(((struct ost_body *)0)->oa) == 208, "found %lld\n",
+		 (long long)(int)sizeof(((struct ost_body *)0)->oa));
+
+	/* Checks for struct ll_fid */
+	LASSERTF((int)sizeof(struct ll_fid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(struct ll_fid));
+	LASSERTF((int)offsetof(struct ll_fid, id) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct ll_fid, id));
+	LASSERTF((int)sizeof(((struct ll_fid *)0)->id) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ll_fid *)0)->id));
+	LASSERTF((int)offsetof(struct ll_fid, generation) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct ll_fid, generation));
+	LASSERTF((int)sizeof(((struct ll_fid *)0)->generation) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ll_fid *)0)->generation));
+	LASSERTF((int)offsetof(struct ll_fid, f_type) == 12, "found %lld\n",
+		 (long long)(int)offsetof(struct ll_fid, f_type));
+	LASSERTF((int)sizeof(((struct ll_fid *)0)->f_type) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ll_fid *)0)->f_type));
+
+	/* Checks for struct mdt_body */
+	LASSERTF((int)sizeof(struct mdt_body) == 216, "found %lld\n",
+		 (long long)(int)sizeof(struct mdt_body));
+	LASSERTF((int)offsetof(struct mdt_body, fid1) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, fid1));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->fid1) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->fid1));
+	LASSERTF((int)offsetof(struct mdt_body, fid2) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, fid2));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->fid2) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->fid2));
+	LASSERTF((int)offsetof(struct mdt_body, handle) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, handle));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->handle) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->handle));
+	LASSERTF((int)offsetof(struct mdt_body, valid) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, valid));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->valid) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->valid));
+	LASSERTF((int)offsetof(struct mdt_body, size) == 48, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, size));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->size) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->size));
+	LASSERTF((int)offsetof(struct mdt_body, mtime) == 56, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, mtime));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->mtime) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->mtime));
+	LASSERTF((int)offsetof(struct mdt_body, atime) == 64, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, atime));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->atime) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->atime));
+	LASSERTF((int)offsetof(struct mdt_body, ctime) == 72, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, ctime));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->ctime) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->ctime));
+	LASSERTF((int)offsetof(struct mdt_body, blocks) == 80, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, blocks));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->blocks) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->blocks));
+	LASSERTF((int)offsetof(struct mdt_body, unused1) == 96, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, unused1));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->unused1) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->unused1));
+	LASSERTF((int)offsetof(struct mdt_body, fsuid) == 104, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, fsuid));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->fsuid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->fsuid));
+	LASSERTF((int)offsetof(struct mdt_body, fsgid) == 108, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, fsgid));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->fsgid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->fsgid));
+	LASSERTF((int)offsetof(struct mdt_body, capability) == 112, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, capability));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->capability) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->capability));
+	LASSERTF((int)offsetof(struct mdt_body, mode) == 116, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, mode));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->mode) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->mode));
+	LASSERTF((int)offsetof(struct mdt_body, uid) == 120, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, uid));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->uid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->uid));
+	LASSERTF((int)offsetof(struct mdt_body, gid) == 124, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, gid));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->gid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->gid));
+	LASSERTF((int)offsetof(struct mdt_body, flags) == 128, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, flags));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->flags) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->flags));
+	LASSERTF((int)offsetof(struct mdt_body, rdev) == 132, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, rdev));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->rdev) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->rdev));
+	LASSERTF((int)offsetof(struct mdt_body, nlink) == 136, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, nlink));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->nlink) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->nlink));
+	LASSERTF((int)offsetof(struct mdt_body, unused2) == 140, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, unused2));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->unused2) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->unused2));
+	LASSERTF((int)offsetof(struct mdt_body, suppgid) == 144, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, suppgid));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->suppgid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->suppgid));
+	LASSERTF((int)offsetof(struct mdt_body, eadatasize) == 148, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, eadatasize));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->eadatasize) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->eadatasize));
+	LASSERTF((int)offsetof(struct mdt_body, aclsize) == 152, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, aclsize));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->aclsize) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->aclsize));
+	LASSERTF((int)offsetof(struct mdt_body, max_mdsize) == 156, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, max_mdsize));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->max_mdsize) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->max_mdsize));
+	LASSERTF((int)offsetof(struct mdt_body, max_cookiesize) == 160, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, max_cookiesize));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->max_cookiesize) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->max_cookiesize));
+	LASSERTF((int)offsetof(struct mdt_body, uid_h) == 164, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, uid_h));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->uid_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->uid_h));
+	LASSERTF((int)offsetof(struct mdt_body, gid_h) == 168, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, gid_h));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->gid_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->gid_h));
+	LASSERTF((int)offsetof(struct mdt_body, padding_5) == 172, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, padding_5));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->padding_5) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->padding_5));
+	LASSERTF((int)offsetof(struct mdt_body, padding_6) == 176, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, padding_6));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->padding_6) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->padding_6));
+	LASSERTF((int)offsetof(struct mdt_body, padding_7) == 184, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, padding_7));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->padding_7) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->padding_7));
+	LASSERTF((int)offsetof(struct mdt_body, padding_8) == 192, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, padding_8));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->padding_8) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->padding_8));
+	LASSERTF((int)offsetof(struct mdt_body, padding_9) == 200, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, padding_9));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->padding_9) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->padding_9));
+	LASSERTF((int)offsetof(struct mdt_body, padding_10) == 208, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_body, padding_10));
+	LASSERTF((int)sizeof(((struct mdt_body *)0)->padding_10) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_body *)0)->padding_10));
+	LASSERTF(MDS_FMODE_CLOSED == 000000000000UL, "found 0%.11oUL\n",
+		MDS_FMODE_CLOSED);
+	LASSERTF(MDS_FMODE_EXEC == 000000000004UL, "found 0%.11oUL\n",
+		MDS_FMODE_EXEC);
+	LASSERTF(MDS_FMODE_EPOCH == 000001000000UL, "found 0%.11oUL\n",
+		MDS_FMODE_EPOCH);
+	LASSERTF(MDS_FMODE_TRUNC == 000002000000UL, "found 0%.11oUL\n",
+		MDS_FMODE_TRUNC);
+	LASSERTF(MDS_FMODE_SOM == 000004000000UL, "found 0%.11oUL\n",
+		MDS_FMODE_SOM);
+	LASSERTF(MDS_OPEN_CREATED == 000000000010UL, "found 0%.11oUL\n",
+		MDS_OPEN_CREATED);
+	LASSERTF(MDS_OPEN_CROSS == 000000000020UL, "found 0%.11oUL\n",
+		MDS_OPEN_CROSS);
+	LASSERTF(MDS_OPEN_CREAT == 000000000100UL, "found 0%.11oUL\n",
+		MDS_OPEN_CREAT);
+	LASSERTF(MDS_OPEN_EXCL == 000000000200UL, "found 0%.11oUL\n",
+		MDS_OPEN_EXCL);
+	LASSERTF(MDS_OPEN_TRUNC == 000000001000UL, "found 0%.11oUL\n",
+		MDS_OPEN_TRUNC);
+	LASSERTF(MDS_OPEN_APPEND == 000000002000UL, "found 0%.11oUL\n",
+		MDS_OPEN_APPEND);
+	LASSERTF(MDS_OPEN_SYNC == 000000010000UL, "found 0%.11oUL\n",
+		MDS_OPEN_SYNC);
+	LASSERTF(MDS_OPEN_DIRECTORY == 000000200000UL, "found 0%.11oUL\n",
+		MDS_OPEN_DIRECTORY);
+	LASSERTF(MDS_OPEN_BY_FID == 000040000000UL, "found 0%.11oUL\n",
+		MDS_OPEN_BY_FID);
+	LASSERTF(MDS_OPEN_DELAY_CREATE == 000100000000UL, "found 0%.11oUL\n",
+		MDS_OPEN_DELAY_CREATE);
+	LASSERTF(MDS_OPEN_OWNEROVERRIDE == 000200000000UL, "found 0%.11oUL\n",
+		MDS_OPEN_OWNEROVERRIDE);
+	LASSERTF(MDS_OPEN_JOIN_FILE == 000400000000UL, "found 0%.11oUL\n",
+		MDS_OPEN_JOIN_FILE);
+	LASSERTF(MDS_OPEN_LOCK == 004000000000UL, "found 0%.11oUL\n",
+		MDS_OPEN_LOCK);
+	LASSERTF(MDS_OPEN_HAS_EA == 010000000000UL, "found 0%.11oUL\n",
+		MDS_OPEN_HAS_EA);
+	LASSERTF(MDS_OPEN_HAS_OBJS == 020000000000UL, "found 0%.11oUL\n",
+		MDS_OPEN_HAS_OBJS);
+	LASSERTF(MDS_OPEN_NORESTORE == 00000000000100000000000ULL, "found 0%.22lloULL\n",
+			(long long)MDS_OPEN_NORESTORE);
+	LASSERTF(MDS_OPEN_NEWSTRIPE == 00000000000200000000000ULL, "found 0%.22lloULL\n",
+			(long long)MDS_OPEN_NEWSTRIPE);
+	LASSERTF(MDS_OPEN_VOLATILE == 00000000000400000000000ULL, "found 0%.22lloULL\n",
+			(long long)MDS_OPEN_VOLATILE);
+	LASSERTF(LUSTRE_SYNC_FL == 0x00000008, "found 0x%.8x\n",
+		LUSTRE_SYNC_FL);
+	LASSERTF(LUSTRE_IMMUTABLE_FL == 0x00000010, "found 0x%.8x\n",
+		LUSTRE_IMMUTABLE_FL);
+	LASSERTF(LUSTRE_APPEND_FL == 0x00000020, "found 0x%.8x\n",
+		LUSTRE_APPEND_FL);
+	LASSERTF(LUSTRE_NOATIME_FL == 0x00000080, "found 0x%.8x\n",
+		LUSTRE_NOATIME_FL);
+	LASSERTF(LUSTRE_DIRSYNC_FL == 0x00010000, "found 0x%.8x\n",
+		LUSTRE_DIRSYNC_FL);
+	LASSERTF(MDS_INODELOCK_LOOKUP == 0x000001, "found 0x%.8x\n",
+		MDS_INODELOCK_LOOKUP);
+	LASSERTF(MDS_INODELOCK_UPDATE == 0x000002, "found 0x%.8x\n",
+		MDS_INODELOCK_UPDATE);
+	LASSERTF(MDS_INODELOCK_OPEN == 0x000004, "found 0x%.8x\n",
+		MDS_INODELOCK_OPEN);
+	LASSERTF(MDS_INODELOCK_LAYOUT == 0x000008, "found 0x%.8x\n",
+		MDS_INODELOCK_LAYOUT);
+
+	/* Checks for struct mdt_ioepoch */
+	LASSERTF((int)sizeof(struct mdt_ioepoch) == 24, "found %lld\n",
+		 (long long)(int)sizeof(struct mdt_ioepoch));
+	LASSERTF((int)offsetof(struct mdt_ioepoch, handle) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_ioepoch, handle));
+	LASSERTF((int)sizeof(((struct mdt_ioepoch *)0)->handle) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_ioepoch *)0)->handle));
+	LASSERTF((int)offsetof(struct mdt_ioepoch, ioepoch) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_ioepoch, ioepoch));
+	LASSERTF((int)sizeof(((struct mdt_ioepoch *)0)->ioepoch) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_ioepoch *)0)->ioepoch));
+	LASSERTF((int)offsetof(struct mdt_ioepoch, flags) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_ioepoch, flags));
+	LASSERTF((int)sizeof(((struct mdt_ioepoch *)0)->flags) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_ioepoch *)0)->flags));
+	LASSERTF((int)offsetof(struct mdt_ioepoch, padding) == 20, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_ioepoch, padding));
+	LASSERTF((int)sizeof(((struct mdt_ioepoch *)0)->padding) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_ioepoch *)0)->padding));
+
+	/* Checks for struct mdt_remote_perm */
+	LASSERTF((int)sizeof(struct mdt_remote_perm) == 32, "found %lld\n",
+		 (long long)(int)sizeof(struct mdt_remote_perm));
+	LASSERTF((int)offsetof(struct mdt_remote_perm, rp_uid) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_remote_perm, rp_uid));
+	LASSERTF((int)sizeof(((struct mdt_remote_perm *)0)->rp_uid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_remote_perm *)0)->rp_uid));
+	LASSERTF((int)offsetof(struct mdt_remote_perm, rp_gid) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_remote_perm, rp_gid));
+	LASSERTF((int)sizeof(((struct mdt_remote_perm *)0)->rp_gid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_remote_perm *)0)->rp_gid));
+	LASSERTF((int)offsetof(struct mdt_remote_perm, rp_fsuid) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_remote_perm, rp_fsuid));
+	LASSERTF((int)sizeof(((struct mdt_remote_perm *)0)->rp_fsuid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_remote_perm *)0)->rp_fsuid));
+	LASSERTF((int)offsetof(struct mdt_remote_perm, rp_fsgid) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_remote_perm, rp_fsgid));
+	LASSERTF((int)sizeof(((struct mdt_remote_perm *)0)->rp_fsgid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_remote_perm *)0)->rp_fsgid));
+	LASSERTF((int)offsetof(struct mdt_remote_perm, rp_access_perm) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_remote_perm, rp_access_perm));
+	LASSERTF((int)sizeof(((struct mdt_remote_perm *)0)->rp_access_perm) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_remote_perm *)0)->rp_access_perm));
+	LASSERTF((int)offsetof(struct mdt_remote_perm, rp_padding) == 28, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_remote_perm, rp_padding));
+	LASSERTF((int)sizeof(((struct mdt_remote_perm *)0)->rp_padding) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_remote_perm *)0)->rp_padding));
+	LASSERTF(CFS_SETUID_PERM == 0x00000001UL, "found 0x%.8xUL\n",
+		(unsigned)CFS_SETUID_PERM);
+	LASSERTF(CFS_SETGID_PERM == 0x00000002UL, "found 0x%.8xUL\n",
+		(unsigned)CFS_SETGID_PERM);
+	LASSERTF(CFS_SETGRP_PERM == 0x00000004UL, "found 0x%.8xUL\n",
+		(unsigned)CFS_SETGRP_PERM);
+	LASSERTF(CFS_RMTACL_PERM == 0x00000008UL, "found 0x%.8xUL\n",
+		(unsigned)CFS_RMTACL_PERM);
+	LASSERTF(CFS_RMTOWN_PERM == 0x00000010UL, "found 0x%.8xUL\n",
+		(unsigned)CFS_RMTOWN_PERM);
+
+	/* Checks for struct mdt_rec_setattr */
+	LASSERTF((int)sizeof(struct mdt_rec_setattr) == 136, "found %lld\n",
+		 (long long)(int)sizeof(struct mdt_rec_setattr));
+	LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_opcode) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setattr, sa_opcode));
+	LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_opcode) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_opcode));
+	LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_cap) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setattr, sa_cap));
+	LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_cap) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_cap));
+	LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_fsuid) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setattr, sa_fsuid));
+	LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsuid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsuid));
+	LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_fsuid_h) == 12, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setattr, sa_fsuid_h));
+	LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsuid_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsuid_h));
+	LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_fsgid) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setattr, sa_fsgid));
+	LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsgid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsgid));
+	LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_fsgid_h) == 20, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setattr, sa_fsgid_h));
+	LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsgid_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsgid_h));
+	LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_suppgid) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setattr, sa_suppgid));
+	LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_suppgid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_suppgid));
+	LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_suppgid_h) == 28, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setattr, sa_suppgid_h));
+	LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_suppgid_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_suppgid_h));
+	LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_padding_1) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setattr, sa_padding_1));
+	LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_1) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_1));
+	LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_padding_1_h) == 36, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setattr, sa_padding_1_h));
+	LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_1_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_1_h));
+	LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_fid) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setattr, sa_fid));
+	LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_fid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_fid));
+	LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_valid) == 56, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setattr, sa_valid));
+	LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_valid) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_valid));
+	LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_uid) == 64, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setattr, sa_uid));
+	LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_uid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_uid));
+	LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_gid) == 68, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setattr, sa_gid));
+	LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_gid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_gid));
+	LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_size) == 72, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setattr, sa_size));
+	LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_size) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_size));
+	LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_blocks) == 80, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setattr, sa_blocks));
+	LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_blocks) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_blocks));
+	LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_mtime) == 88, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setattr, sa_mtime));
+	LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_mtime) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_mtime));
+	LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_atime) == 96, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setattr, sa_atime));
+	LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_atime) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_atime));
+	LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_ctime) == 104, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setattr, sa_ctime));
+	LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_ctime) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_ctime));
+	LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_attr_flags) == 112, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setattr, sa_attr_flags));
+	LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_attr_flags) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_attr_flags));
+	LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_mode) == 116, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setattr, sa_mode));
+	LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_mode) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_mode));
+	LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_bias) == 120, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setattr, sa_bias));
+	LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_bias) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_bias));
+	LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_padding_3) == 124, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setattr, sa_padding_3));
+	LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_3) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_3));
+	LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_padding_4) == 128, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setattr, sa_padding_4));
+	LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_4) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_4));
+	LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_padding_5) == 132, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setattr, sa_padding_5));
+	LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_5) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_5));
+
+	/* Checks for struct mdt_rec_create */
+	LASSERTF((int)sizeof(struct mdt_rec_create) == 136, "found %lld\n",
+		 (long long)(int)sizeof(struct mdt_rec_create));
+	LASSERTF((int)offsetof(struct mdt_rec_create, cr_opcode) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_create, cr_opcode));
+	LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_opcode) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_opcode));
+	LASSERTF((int)offsetof(struct mdt_rec_create, cr_cap) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_create, cr_cap));
+	LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_cap) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_cap));
+	LASSERTF((int)offsetof(struct mdt_rec_create, cr_fsuid) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_create, cr_fsuid));
+	LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fsuid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fsuid));
+	LASSERTF((int)offsetof(struct mdt_rec_create, cr_fsuid_h) == 12, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_create, cr_fsuid_h));
+	LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fsuid_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fsuid_h));
+	LASSERTF((int)offsetof(struct mdt_rec_create, cr_fsgid) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_create, cr_fsgid));
+	LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fsgid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fsgid));
+	LASSERTF((int)offsetof(struct mdt_rec_create, cr_fsgid_h) == 20, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_create, cr_fsgid_h));
+	LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fsgid_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fsgid_h));
+	LASSERTF((int)offsetof(struct mdt_rec_create, cr_suppgid1) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_create, cr_suppgid1));
+	LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid1) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid1));
+	LASSERTF((int)offsetof(struct mdt_rec_create, cr_suppgid1_h) == 28, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_create, cr_suppgid1_h));
+	LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid1_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid1_h));
+	LASSERTF((int)offsetof(struct mdt_rec_create, cr_suppgid2) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_create, cr_suppgid2));
+	LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid2) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid2));
+	LASSERTF((int)offsetof(struct mdt_rec_create, cr_suppgid2_h) == 36, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_create, cr_suppgid2_h));
+	LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid2_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid2_h));
+	LASSERTF((int)offsetof(struct mdt_rec_create, cr_fid1) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_create, cr_fid1));
+	LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fid1) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fid1));
+	LASSERTF((int)offsetof(struct mdt_rec_create, cr_fid2) == 56, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_create, cr_fid2));
+	LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fid2) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fid2));
+	LASSERTF((int)offsetof(struct mdt_rec_create, cr_old_handle) == 72, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_create, cr_old_handle));
+	LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_old_handle) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_old_handle));
+	LASSERTF((int)offsetof(struct mdt_rec_create, cr_time) == 80, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_create, cr_time));
+	LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_time) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_time));
+	LASSERTF((int)offsetof(struct mdt_rec_create, cr_rdev) == 88, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_create, cr_rdev));
+	LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_rdev) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_rdev));
+	LASSERTF((int)offsetof(struct mdt_rec_create, cr_ioepoch) == 96, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_create, cr_ioepoch));
+	LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_ioepoch) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_ioepoch));
+	LASSERTF((int)offsetof(struct mdt_rec_create, cr_padding_1) == 104, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_create, cr_padding_1));
+	LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_padding_1) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_padding_1));
+	LASSERTF((int)offsetof(struct mdt_rec_create, cr_mode) == 112, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_create, cr_mode));
+	LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_mode) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_mode));
+	LASSERTF((int)offsetof(struct mdt_rec_create, cr_bias) == 116, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_create, cr_bias));
+	LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_bias) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_bias));
+	LASSERTF((int)offsetof(struct mdt_rec_create, cr_flags_l) == 120, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_create, cr_flags_l));
+	LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_flags_l) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_flags_l));
+	LASSERTF((int)offsetof(struct mdt_rec_create, cr_flags_h) == 124, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_create, cr_flags_h));
+	LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_flags_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_flags_h));
+	LASSERTF((int)offsetof(struct mdt_rec_create, cr_umask) == 128, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_create, cr_umask));
+	LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_umask) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_umask));
+	LASSERTF((int)offsetof(struct mdt_rec_create, cr_padding_4) == 132, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_create, cr_padding_4));
+	LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_padding_4) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_padding_4));
+
+	/* Checks for struct mdt_rec_link */
+	LASSERTF((int)sizeof(struct mdt_rec_link) == 136, "found %lld\n",
+		 (long long)(int)sizeof(struct mdt_rec_link));
+	LASSERTF((int)offsetof(struct mdt_rec_link, lk_opcode) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_link, lk_opcode));
+	LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_opcode) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_opcode));
+	LASSERTF((int)offsetof(struct mdt_rec_link, lk_cap) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_link, lk_cap));
+	LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_cap) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_cap));
+	LASSERTF((int)offsetof(struct mdt_rec_link, lk_fsuid) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_link, lk_fsuid));
+	LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fsuid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fsuid));
+	LASSERTF((int)offsetof(struct mdt_rec_link, lk_fsuid_h) == 12, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_link, lk_fsuid_h));
+	LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fsuid_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fsuid_h));
+	LASSERTF((int)offsetof(struct mdt_rec_link, lk_fsgid) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_link, lk_fsgid));
+	LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fsgid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fsgid));
+	LASSERTF((int)offsetof(struct mdt_rec_link, lk_fsgid_h) == 20, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_link, lk_fsgid_h));
+	LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fsgid_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fsgid_h));
+	LASSERTF((int)offsetof(struct mdt_rec_link, lk_suppgid1) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_link, lk_suppgid1));
+	LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid1) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid1));
+	LASSERTF((int)offsetof(struct mdt_rec_link, lk_suppgid1_h) == 28, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_link, lk_suppgid1_h));
+	LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid1_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid1_h));
+	LASSERTF((int)offsetof(struct mdt_rec_link, lk_suppgid2) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_link, lk_suppgid2));
+	LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid2) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid2));
+	LASSERTF((int)offsetof(struct mdt_rec_link, lk_suppgid2_h) == 36, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_link, lk_suppgid2_h));
+	LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid2_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid2_h));
+	LASSERTF((int)offsetof(struct mdt_rec_link, lk_fid1) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_link, lk_fid1));
+	LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fid1) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fid1));
+	LASSERTF((int)offsetof(struct mdt_rec_link, lk_fid2) == 56, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_link, lk_fid2));
+	LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fid2) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fid2));
+	LASSERTF((int)offsetof(struct mdt_rec_link, lk_time) == 72, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_link, lk_time));
+	LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_time) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_time));
+	LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_1) == 80, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_link, lk_padding_1));
+	LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_1) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_1));
+	LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_2) == 88, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_link, lk_padding_2));
+	LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_2) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_2));
+	LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_3) == 96, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_link, lk_padding_3));
+	LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_3) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_3));
+	LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_4) == 104, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_link, lk_padding_4));
+	LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_4) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_4));
+	LASSERTF((int)offsetof(struct mdt_rec_link, lk_bias) == 112, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_link, lk_bias));
+	LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_bias) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_bias));
+	LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_5) == 116, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_link, lk_padding_5));
+	LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_5) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_5));
+	LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_6) == 120, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_link, lk_padding_6));
+	LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_6) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_6));
+	LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_7) == 124, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_link, lk_padding_7));
+	LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_7) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_7));
+	LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_8) == 128, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_link, lk_padding_8));
+	LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_8) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_8));
+	LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_9) == 132, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_link, lk_padding_9));
+	LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_9) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_9));
+
+	/* Checks for struct mdt_rec_unlink */
+	LASSERTF((int)sizeof(struct mdt_rec_unlink) == 136, "found %lld\n",
+		 (long long)(int)sizeof(struct mdt_rec_unlink));
+	LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_opcode) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_unlink, ul_opcode));
+	LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_opcode) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_opcode));
+	LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_cap) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_unlink, ul_cap));
+	LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_cap) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_cap));
+	LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fsuid) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_unlink, ul_fsuid));
+	LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsuid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsuid));
+	LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fsuid_h) == 12, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_unlink, ul_fsuid_h));
+	LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsuid_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsuid_h));
+	LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fsgid) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_unlink, ul_fsgid));
+	LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsgid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsgid));
+	LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fsgid_h) == 20, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_unlink, ul_fsgid_h));
+	LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsgid_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsgid_h));
+	LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_suppgid1) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_unlink, ul_suppgid1));
+	LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid1) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid1));
+	LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_suppgid1_h) == 28, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_unlink, ul_suppgid1_h));
+	LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid1_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid1_h));
+	LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_suppgid2) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_unlink, ul_suppgid2));
+	LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid2) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid2));
+	LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_suppgid2_h) == 36, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_unlink, ul_suppgid2_h));
+	LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid2_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid2_h));
+	LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fid1) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_unlink, ul_fid1));
+	LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fid1) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fid1));
+	LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fid2) == 56, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_unlink, ul_fid2));
+	LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fid2) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fid2));
+	LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_time) == 72, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_unlink, ul_time));
+	LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_time) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_time));
+	LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_2) == 80, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_2));
+	LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_2) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_2));
+	LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_3) == 88, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_3));
+	LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_3) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_3));
+	LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_4) == 96, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_4));
+	LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_4) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_4));
+	LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_5) == 104, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_5));
+	LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_5) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_5));
+	LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_bias) == 112, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_unlink, ul_bias));
+	LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_bias) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_bias));
+	LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_mode) == 116, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_unlink, ul_mode));
+	LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_mode) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_mode));
+	LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_6) == 120, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_6));
+	LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_6) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_6));
+	LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_7) == 124, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_7));
+	LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_7) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_7));
+	LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_8) == 128, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_8));
+	LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_8) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_8));
+	LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_9) == 132, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_9));
+	LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_9) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_9));
+
+	/* Checks for struct mdt_rec_rename */
+	LASSERTF((int)sizeof(struct mdt_rec_rename) == 136, "found %lld\n",
+		 (long long)(int)sizeof(struct mdt_rec_rename));
+	LASSERTF((int)offsetof(struct mdt_rec_rename, rn_opcode) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_rename, rn_opcode));
+	LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_opcode) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_opcode));
+	LASSERTF((int)offsetof(struct mdt_rec_rename, rn_cap) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_rename, rn_cap));
+	LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_cap) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_cap));
+	LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fsuid) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_rename, rn_fsuid));
+	LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fsuid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fsuid));
+	LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fsuid_h) == 12, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_rename, rn_fsuid_h));
+	LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fsuid_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fsuid_h));
+	LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fsgid) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_rename, rn_fsgid));
+	LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fsgid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fsgid));
+	LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fsgid_h) == 20, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_rename, rn_fsgid_h));
+	LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fsgid_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fsgid_h));
+	LASSERTF((int)offsetof(struct mdt_rec_rename, rn_suppgid1) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_rename, rn_suppgid1));
+	LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid1) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid1));
+	LASSERTF((int)offsetof(struct mdt_rec_rename, rn_suppgid1_h) == 28, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_rename, rn_suppgid1_h));
+	LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid1_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid1_h));
+	LASSERTF((int)offsetof(struct mdt_rec_rename, rn_suppgid2) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_rename, rn_suppgid2));
+	LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid2) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid2));
+	LASSERTF((int)offsetof(struct mdt_rec_rename, rn_suppgid2_h) == 36, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_rename, rn_suppgid2_h));
+	LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid2_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid2_h));
+	LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fid1) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_rename, rn_fid1));
+	LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fid1) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fid1));
+	LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fid2) == 56, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_rename, rn_fid2));
+	LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fid2) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fid2));
+	LASSERTF((int)offsetof(struct mdt_rec_rename, rn_time) == 72, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_rename, rn_time));
+	LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_time) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_time));
+	LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_1) == 80, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_1));
+	LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_1) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_1));
+	LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_2) == 88, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_2));
+	LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_2) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_2));
+	LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_3) == 96, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_3));
+	LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_3) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_3));
+	LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_4) == 104, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_4));
+	LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_4) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_4));
+	LASSERTF((int)offsetof(struct mdt_rec_rename, rn_bias) == 112, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_rename, rn_bias));
+	LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_bias) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_bias));
+	LASSERTF((int)offsetof(struct mdt_rec_rename, rn_mode) == 116, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_rename, rn_mode));
+	LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_mode) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_mode));
+	LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_5) == 120, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_5));
+	LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_5) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_5));
+	LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_6) == 124, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_6));
+	LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_6) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_6));
+	LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_7) == 128, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_7));
+	LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_7) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_7));
+	LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_8) == 132, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_8));
+	LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_8) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_8));
+
+	/* Checks for struct mdt_rec_setxattr */
+	LASSERTF((int)sizeof(struct mdt_rec_setxattr) == 136, "found %lld\n",
+		 (long long)(int)sizeof(struct mdt_rec_setxattr));
+	LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_opcode) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setxattr, sx_opcode));
+	LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_opcode) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_opcode));
+	LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_cap) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setxattr, sx_cap));
+	LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_cap) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_cap));
+	LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_fsuid) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setxattr, sx_fsuid));
+	LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsuid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsuid));
+	LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_fsuid_h) == 12, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setxattr, sx_fsuid_h));
+	LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsuid_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsuid_h));
+	LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_fsgid) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setxattr, sx_fsgid));
+	LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsgid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsgid));
+	LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_fsgid_h) == 20, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setxattr, sx_fsgid_h));
+	LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsgid_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsgid_h));
+	LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_suppgid1) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setxattr, sx_suppgid1));
+	LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid1) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid1));
+	LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_suppgid1_h) == 28, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setxattr, sx_suppgid1_h));
+	LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid1_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid1_h));
+	LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_suppgid2) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setxattr, sx_suppgid2));
+	LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid2) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid2));
+	LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_suppgid2_h) == 36, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setxattr, sx_suppgid2_h));
+	LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid2_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid2_h));
+	LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_fid) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setxattr, sx_fid));
+	LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fid));
+	LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_1) == 56, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_1));
+	LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_1) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_1));
+	LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_2) == 64, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_2));
+	LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_2) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_2));
+	LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_3) == 68, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_3));
+	LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_3) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_3));
+	LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_valid) == 72, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setxattr, sx_valid));
+	LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_valid) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_valid));
+	LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_time) == 80, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setxattr, sx_time));
+	LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_time) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_time));
+	LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_5) == 88, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_5));
+	LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_5) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_5));
+	LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_6) == 96, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_6));
+	LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_6) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_6));
+	LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_7) == 104, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_7));
+	LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_7) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_7));
+	LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_size) == 112, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setxattr, sx_size));
+	LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_size) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_size));
+	LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_flags) == 116, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setxattr, sx_flags));
+	LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_flags) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_flags));
+	LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_8) == 120, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_8));
+	LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_8) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_8));
+	LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_9) == 124, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_9));
+	LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_9) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_9));
+	LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_10) == 128, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_10));
+	LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_10) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_10));
+	LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_11) == 132, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_11));
+	LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_11) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_11));
+
+	/* Checks for struct mdt_rec_reint */
+	LASSERTF((int)sizeof(struct mdt_rec_reint) == 136, "found %lld\n",
+		 (long long)(int)sizeof(struct mdt_rec_reint));
+	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_opcode) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_reint, rr_opcode));
+	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_opcode) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_opcode));
+	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_cap) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_reint, rr_cap));
+	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_cap) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_cap));
+	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fsuid) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_reint, rr_fsuid));
+	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fsuid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fsuid));
+	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fsuid_h) == 12, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_reint, rr_fsuid_h));
+	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fsuid_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fsuid_h));
+	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fsgid) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_reint, rr_fsgid));
+	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fsgid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fsgid));
+	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fsgid_h) == 20, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_reint, rr_fsgid_h));
+	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fsgid_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fsgid_h));
+	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_suppgid1) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_reint, rr_suppgid1));
+	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid1) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid1));
+	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_suppgid1_h) == 28, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_reint, rr_suppgid1_h));
+	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid1_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid1_h));
+	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_suppgid2) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_reint, rr_suppgid2));
+	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid2) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid2));
+	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_suppgid2_h) == 36, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_reint, rr_suppgid2_h));
+	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid2_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid2_h));
+	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fid1) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_reint, rr_fid1));
+	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fid1) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fid1));
+	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fid2) == 56, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_reint, rr_fid2));
+	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fid2) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fid2));
+	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_mtime) == 72, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_reint, rr_mtime));
+	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_mtime) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_mtime));
+	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_atime) == 80, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_reint, rr_atime));
+	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_atime) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_atime));
+	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_ctime) == 88, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_reint, rr_ctime));
+	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_ctime) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_ctime));
+	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_size) == 96, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_reint, rr_size));
+	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_size) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_size));
+	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_blocks) == 104, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_reint, rr_blocks));
+	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_blocks) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_blocks));
+	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_bias) == 112, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_reint, rr_bias));
+	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_bias) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_bias));
+	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_mode) == 116, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_reint, rr_mode));
+	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_mode) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_mode));
+	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_flags) == 120, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_reint, rr_flags));
+	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_flags) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_flags));
+	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_flags_h) == 124, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_reint, rr_flags_h));
+	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_flags_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_flags_h));
+	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_umask) == 128, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_reint, rr_umask));
+	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_umask) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_umask));
+	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_padding_4) == 132, "found %lld\n",
+		 (long long)(int)offsetof(struct mdt_rec_reint, rr_padding_4));
+	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_padding_4) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_padding_4));
+
+	/* Checks for struct lmv_desc */
+	LASSERTF((int)sizeof(struct lmv_desc) == 88, "found %lld\n",
+		 (long long)(int)sizeof(struct lmv_desc));
+	LASSERTF((int)offsetof(struct lmv_desc, ld_tgt_count) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct lmv_desc, ld_tgt_count));
+	LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_tgt_count) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lmv_desc *)0)->ld_tgt_count));
+	LASSERTF((int)offsetof(struct lmv_desc, ld_active_tgt_count) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct lmv_desc, ld_active_tgt_count));
+	LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_active_tgt_count) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lmv_desc *)0)->ld_active_tgt_count));
+	LASSERTF((int)offsetof(struct lmv_desc, ld_default_stripe_count) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct lmv_desc, ld_default_stripe_count));
+	LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_default_stripe_count) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lmv_desc *)0)->ld_default_stripe_count));
+	LASSERTF((int)offsetof(struct lmv_desc, ld_pattern) == 12, "found %lld\n",
+		 (long long)(int)offsetof(struct lmv_desc, ld_pattern));
+	LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_pattern) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lmv_desc *)0)->ld_pattern));
+	LASSERTF((int)offsetof(struct lmv_desc, ld_default_hash_size) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct lmv_desc, ld_default_hash_size));
+	LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_default_hash_size) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lmv_desc *)0)->ld_default_hash_size));
+	LASSERTF((int)offsetof(struct lmv_desc, ld_padding_1) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct lmv_desc, ld_padding_1));
+	LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_padding_1) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lmv_desc *)0)->ld_padding_1));
+	LASSERTF((int)offsetof(struct lmv_desc, ld_padding_2) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct lmv_desc, ld_padding_2));
+	LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_padding_2) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lmv_desc *)0)->ld_padding_2));
+	LASSERTF((int)offsetof(struct lmv_desc, ld_qos_maxage) == 36, "found %lld\n",
+		 (long long)(int)offsetof(struct lmv_desc, ld_qos_maxage));
+	LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_qos_maxage) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lmv_desc *)0)->ld_qos_maxage));
+	LASSERTF((int)offsetof(struct lmv_desc, ld_padding_3) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct lmv_desc, ld_padding_3));
+	LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_padding_3) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lmv_desc *)0)->ld_padding_3));
+	LASSERTF((int)offsetof(struct lmv_desc, ld_padding_4) == 44, "found %lld\n",
+		 (long long)(int)offsetof(struct lmv_desc, ld_padding_4));
+	LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_padding_4) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lmv_desc *)0)->ld_padding_4));
+	LASSERTF((int)offsetof(struct lmv_desc, ld_uuid) == 48, "found %lld\n",
+		 (long long)(int)offsetof(struct lmv_desc, ld_uuid));
+	LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_uuid) == 40, "found %lld\n",
+		 (long long)(int)sizeof(((struct lmv_desc *)0)->ld_uuid));
+
+	/* Checks for struct lmv_stripe_md */
+	LASSERTF((int)sizeof(struct lmv_stripe_md) == 32, "found %lld\n",
+		 (long long)(int)sizeof(struct lmv_stripe_md));
+	LASSERTF((int)offsetof(struct lmv_stripe_md, mea_magic) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct lmv_stripe_md, mea_magic));
+	LASSERTF((int)sizeof(((struct lmv_stripe_md *)0)->mea_magic) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lmv_stripe_md *)0)->mea_magic));
+	LASSERTF((int)offsetof(struct lmv_stripe_md, mea_count) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct lmv_stripe_md, mea_count));
+	LASSERTF((int)sizeof(((struct lmv_stripe_md *)0)->mea_count) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lmv_stripe_md *)0)->mea_count));
+	LASSERTF((int)offsetof(struct lmv_stripe_md, mea_master) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct lmv_stripe_md, mea_master));
+	LASSERTF((int)sizeof(((struct lmv_stripe_md *)0)->mea_master) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lmv_stripe_md *)0)->mea_master));
+	LASSERTF((int)offsetof(struct lmv_stripe_md, mea_padding) == 12, "found %lld\n",
+		 (long long)(int)offsetof(struct lmv_stripe_md, mea_padding));
+	LASSERTF((int)sizeof(((struct lmv_stripe_md *)0)->mea_padding) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lmv_stripe_md *)0)->mea_padding));
+	CLASSERT(LOV_MAXPOOLNAME == 16);
+	LASSERTF((int)offsetof(struct lmv_stripe_md, mea_pool_name[16]) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct lmv_stripe_md, mea_pool_name[16]));
+	LASSERTF((int)sizeof(((struct lmv_stripe_md *)0)->mea_pool_name[16]) == 1, "found %lld\n",
+		 (long long)(int)sizeof(((struct lmv_stripe_md *)0)->mea_pool_name[16]));
+	LASSERTF((int)offsetof(struct lmv_stripe_md, mea_ids[0]) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct lmv_stripe_md, mea_ids[0]));
+	LASSERTF((int)sizeof(((struct lmv_stripe_md *)0)->mea_ids[0]) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct lmv_stripe_md *)0)->mea_ids[0]));
+
+	/* Checks for struct lov_desc */
+	LASSERTF((int)sizeof(struct lov_desc) == 88, "found %lld\n",
+		 (long long)(int)sizeof(struct lov_desc));
+	LASSERTF((int)offsetof(struct lov_desc, ld_tgt_count) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_desc, ld_tgt_count));
+	LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_tgt_count) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_desc *)0)->ld_tgt_count));
+	LASSERTF((int)offsetof(struct lov_desc, ld_active_tgt_count) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_desc, ld_active_tgt_count));
+	LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_active_tgt_count) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_desc *)0)->ld_active_tgt_count));
+	LASSERTF((int)offsetof(struct lov_desc, ld_default_stripe_count) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_desc, ld_default_stripe_count));
+	LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_default_stripe_count) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_desc *)0)->ld_default_stripe_count));
+	LASSERTF((int)offsetof(struct lov_desc, ld_pattern) == 12, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_desc, ld_pattern));
+	LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_pattern) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_desc *)0)->ld_pattern));
+	LASSERTF((int)offsetof(struct lov_desc, ld_default_stripe_size) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_desc, ld_default_stripe_size));
+	LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_default_stripe_size) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_desc *)0)->ld_default_stripe_size));
+	LASSERTF((int)offsetof(struct lov_desc, ld_default_stripe_offset) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_desc, ld_default_stripe_offset));
+	LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_default_stripe_offset) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_desc *)0)->ld_default_stripe_offset));
+	LASSERTF((int)offsetof(struct lov_desc, ld_padding_0) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_desc, ld_padding_0));
+	LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_padding_0) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_desc *)0)->ld_padding_0));
+	LASSERTF((int)offsetof(struct lov_desc, ld_qos_maxage) == 36, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_desc, ld_qos_maxage));
+	LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_qos_maxage) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_desc *)0)->ld_qos_maxage));
+	LASSERTF((int)offsetof(struct lov_desc, ld_padding_1) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_desc, ld_padding_1));
+	LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_padding_1) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_desc *)0)->ld_padding_1));
+	LASSERTF((int)offsetof(struct lov_desc, ld_padding_2) == 44, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_desc, ld_padding_2));
+	LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_padding_2) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_desc *)0)->ld_padding_2));
+	LASSERTF((int)offsetof(struct lov_desc, ld_uuid) == 48, "found %lld\n",
+		 (long long)(int)offsetof(struct lov_desc, ld_uuid));
+	LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_uuid) == 40, "found %lld\n",
+		 (long long)(int)sizeof(((struct lov_desc *)0)->ld_uuid));
+	CLASSERT(LOV_DESC_MAGIC == 0xB0CCDE5C);
+
+	/* Checks for struct ldlm_res_id */
+	LASSERTF((int)sizeof(struct ldlm_res_id) == 32, "found %lld\n",
+		 (long long)(int)sizeof(struct ldlm_res_id));
+	CLASSERT(RES_NAME_SIZE == 4);
+	LASSERTF((int)offsetof(struct ldlm_res_id, name[4]) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_res_id, name[4]));
+	LASSERTF((int)sizeof(((struct ldlm_res_id *)0)->name[4]) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_res_id *)0)->name[4]));
+
+	/* Checks for struct ldlm_extent */
+	LASSERTF((int)sizeof(struct ldlm_extent) == 24, "found %lld\n",
+		 (long long)(int)sizeof(struct ldlm_extent));
+	LASSERTF((int)offsetof(struct ldlm_extent, start) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_extent, start));
+	LASSERTF((int)sizeof(((struct ldlm_extent *)0)->start) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_extent *)0)->start));
+	LASSERTF((int)offsetof(struct ldlm_extent, end) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_extent, end));
+	LASSERTF((int)sizeof(((struct ldlm_extent *)0)->end) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_extent *)0)->end));
+	LASSERTF((int)offsetof(struct ldlm_extent, gid) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_extent, gid));
+	LASSERTF((int)sizeof(((struct ldlm_extent *)0)->gid) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_extent *)0)->gid));
+
+	/* Checks for struct ldlm_inodebits */
+	LASSERTF((int)sizeof(struct ldlm_inodebits) == 8, "found %lld\n",
+		 (long long)(int)sizeof(struct ldlm_inodebits));
+	LASSERTF((int)offsetof(struct ldlm_inodebits, bits) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_inodebits, bits));
+	LASSERTF((int)sizeof(((struct ldlm_inodebits *)0)->bits) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_inodebits *)0)->bits));
+
+	/* Checks for struct ldlm_flock_wire */
+	LASSERTF((int)sizeof(struct ldlm_flock_wire) == 32, "found %lld\n",
+		 (long long)(int)sizeof(struct ldlm_flock_wire));
+	LASSERTF((int)offsetof(struct ldlm_flock_wire, lfw_start) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_flock_wire, lfw_start));
+	LASSERTF((int)sizeof(((struct ldlm_flock_wire *)0)->lfw_start) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_flock_wire *)0)->lfw_start));
+	LASSERTF((int)offsetof(struct ldlm_flock_wire, lfw_end) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_flock_wire, lfw_end));
+	LASSERTF((int)sizeof(((struct ldlm_flock_wire *)0)->lfw_end) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_flock_wire *)0)->lfw_end));
+	LASSERTF((int)offsetof(struct ldlm_flock_wire, lfw_owner) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_flock_wire, lfw_owner));
+	LASSERTF((int)sizeof(((struct ldlm_flock_wire *)0)->lfw_owner) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_flock_wire *)0)->lfw_owner));
+	LASSERTF((int)offsetof(struct ldlm_flock_wire, lfw_padding) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_flock_wire, lfw_padding));
+	LASSERTF((int)sizeof(((struct ldlm_flock_wire *)0)->lfw_padding) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_flock_wire *)0)->lfw_padding));
+	LASSERTF((int)offsetof(struct ldlm_flock_wire, lfw_pid) == 28, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_flock_wire, lfw_pid));
+	LASSERTF((int)sizeof(((struct ldlm_flock_wire *)0)->lfw_pid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_flock_wire *)0)->lfw_pid));
+
+	/* Checks for struct ldlm_intent */
+	LASSERTF((int)sizeof(struct ldlm_intent) == 8, "found %lld\n",
+		 (long long)(int)sizeof(struct ldlm_intent));
+	LASSERTF((int)offsetof(struct ldlm_intent, opc) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_intent, opc));
+	LASSERTF((int)sizeof(((struct ldlm_intent *)0)->opc) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_intent *)0)->opc));
+
+	/* Checks for struct ldlm_resource_desc */
+	LASSERTF((int)sizeof(struct ldlm_resource_desc) == 40, "found %lld\n",
+		 (long long)(int)sizeof(struct ldlm_resource_desc));
+	LASSERTF((int)offsetof(struct ldlm_resource_desc, lr_type) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_resource_desc, lr_type));
+	LASSERTF((int)sizeof(((struct ldlm_resource_desc *)0)->lr_type) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_resource_desc *)0)->lr_type));
+	LASSERTF((int)offsetof(struct ldlm_resource_desc, lr_padding) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_resource_desc, lr_padding));
+	LASSERTF((int)sizeof(((struct ldlm_resource_desc *)0)->lr_padding) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_resource_desc *)0)->lr_padding));
+	LASSERTF((int)offsetof(struct ldlm_resource_desc, lr_name) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_resource_desc, lr_name));
+	LASSERTF((int)sizeof(((struct ldlm_resource_desc *)0)->lr_name) == 32, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_resource_desc *)0)->lr_name));
+
+	/* Checks for struct ldlm_lock_desc */
+	LASSERTF((int)sizeof(struct ldlm_lock_desc) == 80, "found %lld\n",
+		 (long long)(int)sizeof(struct ldlm_lock_desc));
+	LASSERTF((int)offsetof(struct ldlm_lock_desc, l_resource) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_lock_desc, l_resource));
+	LASSERTF((int)sizeof(((struct ldlm_lock_desc *)0)->l_resource) == 40, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_lock_desc *)0)->l_resource));
+	LASSERTF((int)offsetof(struct ldlm_lock_desc, l_req_mode) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_lock_desc, l_req_mode));
+	LASSERTF((int)sizeof(((struct ldlm_lock_desc *)0)->l_req_mode) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_lock_desc *)0)->l_req_mode));
+	LASSERTF((int)offsetof(struct ldlm_lock_desc, l_granted_mode) == 44, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_lock_desc, l_granted_mode));
+	LASSERTF((int)sizeof(((struct ldlm_lock_desc *)0)->l_granted_mode) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_lock_desc *)0)->l_granted_mode));
+	LASSERTF((int)offsetof(struct ldlm_lock_desc, l_policy_data) == 48, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_lock_desc, l_policy_data));
+	LASSERTF((int)sizeof(((struct ldlm_lock_desc *)0)->l_policy_data) == 32, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_lock_desc *)0)->l_policy_data));
+
+	/* Checks for struct ldlm_request */
+	LASSERTF((int)sizeof(struct ldlm_request) == 104, "found %lld\n",
+		 (long long)(int)sizeof(struct ldlm_request));
+	LASSERTF((int)offsetof(struct ldlm_request, lock_flags) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_request, lock_flags));
+	LASSERTF((int)sizeof(((struct ldlm_request *)0)->lock_flags) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_request *)0)->lock_flags));
+	LASSERTF((int)offsetof(struct ldlm_request, lock_count) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_request, lock_count));
+	LASSERTF((int)sizeof(((struct ldlm_request *)0)->lock_count) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_request *)0)->lock_count));
+	LASSERTF((int)offsetof(struct ldlm_request, lock_desc) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_request, lock_desc));
+	LASSERTF((int)sizeof(((struct ldlm_request *)0)->lock_desc) == 80, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_request *)0)->lock_desc));
+	LASSERTF((int)offsetof(struct ldlm_request, lock_handle) == 88, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_request, lock_handle));
+	LASSERTF((int)sizeof(((struct ldlm_request *)0)->lock_handle) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_request *)0)->lock_handle));
+
+	/* Checks for struct ldlm_reply */
+	LASSERTF((int)sizeof(struct ldlm_reply) == 112, "found %lld\n",
+		 (long long)(int)sizeof(struct ldlm_reply));
+	LASSERTF((int)offsetof(struct ldlm_reply, lock_flags) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_reply, lock_flags));
+	LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_flags) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_flags));
+	LASSERTF((int)offsetof(struct ldlm_reply, lock_padding) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_reply, lock_padding));
+	LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_padding) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_padding));
+	LASSERTF((int)offsetof(struct ldlm_reply, lock_desc) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_reply, lock_desc));
+	LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_desc) == 80, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_desc));
+	LASSERTF((int)offsetof(struct ldlm_reply, lock_handle) == 88, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_reply, lock_handle));
+	LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_handle) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_handle));
+	LASSERTF((int)offsetof(struct ldlm_reply, lock_policy_res1) == 96, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_reply, lock_policy_res1));
+	LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_policy_res1) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_policy_res1));
+	LASSERTF((int)offsetof(struct ldlm_reply, lock_policy_res2) == 104, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_reply, lock_policy_res2));
+	LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_policy_res2) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_policy_res2));
+
+	/* Checks for struct ost_lvb_v1 */
+	LASSERTF((int)sizeof(struct ost_lvb_v1) == 40, "found %lld\n",
+		 (long long)(int)sizeof(struct ost_lvb_v1));
+	LASSERTF((int)offsetof(struct ost_lvb_v1, lvb_size) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct ost_lvb_v1, lvb_size));
+	LASSERTF((int)sizeof(((struct ost_lvb_v1 *)0)->lvb_size) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ost_lvb_v1 *)0)->lvb_size));
+	LASSERTF((int)offsetof(struct ost_lvb_v1, lvb_mtime) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct ost_lvb_v1, lvb_mtime));
+	LASSERTF((int)sizeof(((struct ost_lvb_v1 *)0)->lvb_mtime) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ost_lvb_v1 *)0)->lvb_mtime));
+	LASSERTF((int)offsetof(struct ost_lvb_v1, lvb_atime) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct ost_lvb_v1, lvb_atime));
+	LASSERTF((int)sizeof(((struct ost_lvb_v1 *)0)->lvb_atime) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ost_lvb_v1 *)0)->lvb_atime));
+	LASSERTF((int)offsetof(struct ost_lvb_v1, lvb_ctime) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct ost_lvb_v1, lvb_ctime));
+	LASSERTF((int)sizeof(((struct ost_lvb_v1 *)0)->lvb_ctime) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ost_lvb_v1 *)0)->lvb_ctime));
+	LASSERTF((int)offsetof(struct ost_lvb_v1, lvb_blocks) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct ost_lvb_v1, lvb_blocks));
+	LASSERTF((int)sizeof(((struct ost_lvb_v1 *)0)->lvb_blocks) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ost_lvb_v1 *)0)->lvb_blocks));
+
+	/* Checks for struct ost_lvb */
+	LASSERTF((int)sizeof(struct ost_lvb) == 56, "found %lld\n",
+		 (long long)(int)sizeof(struct ost_lvb));
+	LASSERTF((int)offsetof(struct ost_lvb, lvb_size) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct ost_lvb, lvb_size));
+	LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_size) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_size));
+	LASSERTF((int)offsetof(struct ost_lvb, lvb_mtime) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct ost_lvb, lvb_mtime));
+	LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_mtime) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_mtime));
+	LASSERTF((int)offsetof(struct ost_lvb, lvb_atime) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct ost_lvb, lvb_atime));
+	LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_atime) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_atime));
+	LASSERTF((int)offsetof(struct ost_lvb, lvb_ctime) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct ost_lvb, lvb_ctime));
+	LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_ctime) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_ctime));
+	LASSERTF((int)offsetof(struct ost_lvb, lvb_blocks) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct ost_lvb, lvb_blocks));
+	LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_blocks) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_blocks));
+	LASSERTF((int)offsetof(struct ost_lvb, lvb_mtime_ns) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct ost_lvb, lvb_mtime_ns));
+	LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_mtime_ns) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_mtime_ns));
+	LASSERTF((int)offsetof(struct ost_lvb, lvb_atime_ns) == 44, "found %lld\n",
+		 (long long)(int)offsetof(struct ost_lvb, lvb_atime_ns));
+	LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_atime_ns) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_atime_ns));
+	LASSERTF((int)offsetof(struct ost_lvb, lvb_ctime_ns) == 48, "found %lld\n",
+		 (long long)(int)offsetof(struct ost_lvb, lvb_ctime_ns));
+	LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_ctime_ns) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_ctime_ns));
+	LASSERTF((int)offsetof(struct ost_lvb, lvb_padding) == 52, "found %lld\n",
+		 (long long)(int)offsetof(struct ost_lvb, lvb_padding));
+	LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_padding) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_padding));
+
+	/* Checks for struct lquota_lvb */
+	LASSERTF((int)sizeof(struct lquota_lvb) == 40, "found %lld\n",
+		 (long long)(int)sizeof(struct lquota_lvb));
+	LASSERTF((int)offsetof(struct lquota_lvb, lvb_flags) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct lquota_lvb, lvb_flags));
+	LASSERTF((int)sizeof(((struct lquota_lvb *)0)->lvb_flags) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lquota_lvb *)0)->lvb_flags));
+	LASSERTF((int)offsetof(struct lquota_lvb, lvb_id_may_rel) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct lquota_lvb, lvb_id_may_rel));
+	LASSERTF((int)sizeof(((struct lquota_lvb *)0)->lvb_id_may_rel) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lquota_lvb *)0)->lvb_id_may_rel));
+	LASSERTF((int)offsetof(struct lquota_lvb, lvb_id_rel) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct lquota_lvb, lvb_id_rel));
+	LASSERTF((int)sizeof(((struct lquota_lvb *)0)->lvb_id_rel) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lquota_lvb *)0)->lvb_id_rel));
+	LASSERTF((int)offsetof(struct lquota_lvb, lvb_id_qunit) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct lquota_lvb, lvb_id_qunit));
+	LASSERTF((int)sizeof(((struct lquota_lvb *)0)->lvb_id_qunit) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lquota_lvb *)0)->lvb_id_qunit));
+	LASSERTF((int)offsetof(struct lquota_lvb, lvb_pad1) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct lquota_lvb, lvb_pad1));
+	LASSERTF((int)sizeof(((struct lquota_lvb *)0)->lvb_pad1) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lquota_lvb *)0)->lvb_pad1));
+	LASSERTF(LQUOTA_FL_EDQUOT == 1, "found %lld\n",
+		 (long long)LQUOTA_FL_EDQUOT);
+
+	/* Checks for struct ldlm_gl_lquota_desc */
+	LASSERTF((int)sizeof(struct ldlm_gl_lquota_desc) == 64, "found %lld\n",
+		 (long long)(int)sizeof(struct ldlm_gl_lquota_desc));
+	LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_id) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_id));
+	LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_id) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_id));
+	LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_flags) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_flags));
+	LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_flags) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_flags));
+	LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_ver) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_ver));
+	LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_ver) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_ver));
+	LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_hardlimit) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_hardlimit));
+	LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_hardlimit) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_hardlimit));
+	LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_softlimit) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_softlimit));
+	LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_softlimit) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_softlimit));
+	LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_time) == 48, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_time));
+	LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_time) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_time));
+	LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_pad2) == 56, "found %lld\n",
+		 (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_pad2));
+	LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_pad2) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_pad2));
+
+	/* Checks for struct mgs_send_param */
+	LASSERTF((int)sizeof(struct mgs_send_param) == 1024, "found %lld\n",
+		 (long long)(int)sizeof(struct mgs_send_param));
+	CLASSERT(MGS_PARAM_MAXLEN == 1024);
+	LASSERTF((int)offsetof(struct mgs_send_param, mgs_param[1024]) == 1024, "found %lld\n",
+		 (long long)(int)offsetof(struct mgs_send_param, mgs_param[1024]));
+	LASSERTF((int)sizeof(((struct mgs_send_param *)0)->mgs_param[1024]) == 1, "found %lld\n",
+		 (long long)(int)sizeof(((struct mgs_send_param *)0)->mgs_param[1024]));
+
+	/* Checks for struct cfg_marker */
+	LASSERTF((int)sizeof(struct cfg_marker) == 160, "found %lld\n",
+		 (long long)(int)sizeof(struct cfg_marker));
+	LASSERTF((int)offsetof(struct cfg_marker, cm_step) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct cfg_marker, cm_step));
+	LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_step) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct cfg_marker *)0)->cm_step));
+	LASSERTF((int)offsetof(struct cfg_marker, cm_flags) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct cfg_marker, cm_flags));
+	LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_flags) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct cfg_marker *)0)->cm_flags));
+	LASSERTF((int)offsetof(struct cfg_marker, cm_vers) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct cfg_marker, cm_vers));
+	LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_vers) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct cfg_marker *)0)->cm_vers));
+	LASSERTF((int)offsetof(struct cfg_marker, cm_padding) == 12, "found %lld\n",
+		 (long long)(int)offsetof(struct cfg_marker, cm_padding));
+	LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_padding) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct cfg_marker *)0)->cm_padding));
+	LASSERTF((int)offsetof(struct cfg_marker, cm_createtime) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct cfg_marker, cm_createtime));
+	LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_createtime) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct cfg_marker *)0)->cm_createtime));
+	LASSERTF((int)offsetof(struct cfg_marker, cm_canceltime) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct cfg_marker, cm_canceltime));
+	LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_canceltime) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct cfg_marker *)0)->cm_canceltime));
+	LASSERTF((int)offsetof(struct cfg_marker, cm_tgtname) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct cfg_marker, cm_tgtname));
+	LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_tgtname) == 64, "found %lld\n",
+		 (long long)(int)sizeof(((struct cfg_marker *)0)->cm_tgtname));
+	LASSERTF((int)offsetof(struct cfg_marker, cm_comment) == 96, "found %lld\n",
+		 (long long)(int)offsetof(struct cfg_marker, cm_comment));
+	LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_comment) == 64, "found %lld\n",
+		 (long long)(int)sizeof(((struct cfg_marker *)0)->cm_comment));
+
+	/* Checks for struct llog_logid */
+	LASSERTF((int)sizeof(struct llog_logid) == 20, "found %lld\n",
+		 (long long)(int)sizeof(struct llog_logid));
+	LASSERTF((int)offsetof(struct llog_logid, lgl_oi) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_logid, lgl_oi));
+	LASSERTF((int)sizeof(((struct llog_logid *)0)->lgl_oi) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_logid *)0)->lgl_oi));
+	LASSERTF((int)offsetof(struct llog_logid, lgl_ogen) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_logid, lgl_ogen));
+	LASSERTF((int)sizeof(((struct llog_logid *)0)->lgl_ogen) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_logid *)0)->lgl_ogen));
+	CLASSERT(OST_SZ_REC == 274730752);
+	CLASSERT(MDS_UNLINK_REC == 274801668);
+	CLASSERT(MDS_UNLINK64_REC == 275325956);
+	CLASSERT(MDS_SETATTR64_REC == 275325953);
+	CLASSERT(OBD_CFG_REC == 274857984);
+	CLASSERT(LLOG_GEN_REC == 274989056);
+	CLASSERT(CHANGELOG_REC == 275120128);
+	CLASSERT(CHANGELOG_USER_REC == 275185664);
+	CLASSERT(LLOG_HDR_MAGIC == 275010873);
+	CLASSERT(LLOG_LOGID_MAGIC == 275010875);
+
+	/* Checks for struct llog_catid */
+	LASSERTF((int)sizeof(struct llog_catid) == 32, "found %lld\n",
+		 (long long)(int)sizeof(struct llog_catid));
+	LASSERTF((int)offsetof(struct llog_catid, lci_logid) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_catid, lci_logid));
+	LASSERTF((int)sizeof(((struct llog_catid *)0)->lci_logid) == 20, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_catid *)0)->lci_logid));
+	LASSERTF((int)offsetof(struct llog_catid, lci_padding1) == 20, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_catid, lci_padding1));
+	LASSERTF((int)sizeof(((struct llog_catid *)0)->lci_padding1) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_catid *)0)->lci_padding1));
+	LASSERTF((int)offsetof(struct llog_catid, lci_padding2) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_catid, lci_padding2));
+	LASSERTF((int)sizeof(((struct llog_catid *)0)->lci_padding2) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_catid *)0)->lci_padding2));
+	LASSERTF((int)offsetof(struct llog_catid, lci_padding3) == 28, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_catid, lci_padding3));
+	LASSERTF((int)sizeof(((struct llog_catid *)0)->lci_padding3) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_catid *)0)->lci_padding3));
+
+	/* Checks for struct llog_rec_hdr */
+	LASSERTF((int)sizeof(struct llog_rec_hdr) == 16, "found %lld\n",
+		 (long long)(int)sizeof(struct llog_rec_hdr));
+	LASSERTF((int)offsetof(struct llog_rec_hdr, lrh_len) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_rec_hdr, lrh_len));
+	LASSERTF((int)sizeof(((struct llog_rec_hdr *)0)->lrh_len) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_rec_hdr *)0)->lrh_len));
+	LASSERTF((int)offsetof(struct llog_rec_hdr, lrh_index) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_rec_hdr, lrh_index));
+	LASSERTF((int)sizeof(((struct llog_rec_hdr *)0)->lrh_index) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_rec_hdr *)0)->lrh_index));
+	LASSERTF((int)offsetof(struct llog_rec_hdr, lrh_type) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_rec_hdr, lrh_type));
+	LASSERTF((int)sizeof(((struct llog_rec_hdr *)0)->lrh_type) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_rec_hdr *)0)->lrh_type));
+	LASSERTF((int)offsetof(struct llog_rec_hdr, lrh_id) == 12, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_rec_hdr, lrh_id));
+	LASSERTF((int)sizeof(((struct llog_rec_hdr *)0)->lrh_id) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_rec_hdr *)0)->lrh_id));
+
+	/* Checks for struct llog_rec_tail */
+	LASSERTF((int)sizeof(struct llog_rec_tail) == 8, "found %lld\n",
+		 (long long)(int)sizeof(struct llog_rec_tail));
+	LASSERTF((int)offsetof(struct llog_rec_tail, lrt_len) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_rec_tail, lrt_len));
+	LASSERTF((int)sizeof(((struct llog_rec_tail *)0)->lrt_len) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_rec_tail *)0)->lrt_len));
+	LASSERTF((int)offsetof(struct llog_rec_tail, lrt_index) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_rec_tail, lrt_index));
+	LASSERTF((int)sizeof(((struct llog_rec_tail *)0)->lrt_index) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_rec_tail *)0)->lrt_index));
+
+	/* Checks for struct llog_logid_rec */
+	LASSERTF((int)sizeof(struct llog_logid_rec) == 64, "found %lld\n",
+		 (long long)(int)sizeof(struct llog_logid_rec));
+	LASSERTF((int)offsetof(struct llog_logid_rec, lid_hdr) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_logid_rec, lid_hdr));
+	LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_hdr) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_hdr));
+	LASSERTF((int)offsetof(struct llog_logid_rec, lid_id) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_logid_rec, lid_id));
+	LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_id) == 20, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_id));
+	LASSERTF((int)offsetof(struct llog_logid_rec, lid_padding1) == 36, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_logid_rec, lid_padding1));
+	LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_padding1) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_padding1));
+	LASSERTF((int)offsetof(struct llog_logid_rec, lid_padding2) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_logid_rec, lid_padding2));
+	LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_padding2) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_padding2));
+	LASSERTF((int)offsetof(struct llog_logid_rec, lid_padding3) == 48, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_logid_rec, lid_padding3));
+	LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_padding3) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_padding3));
+	LASSERTF((int)offsetof(struct llog_logid_rec, lid_tail) == 56, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_logid_rec, lid_tail));
+	LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_tail) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_tail));
+
+	/* Checks for struct llog_unlink_rec */
+	LASSERTF((int)sizeof(struct llog_unlink_rec) == 40, "found %lld\n",
+		 (long long)(int)sizeof(struct llog_unlink_rec));
+	LASSERTF((int)offsetof(struct llog_unlink_rec, lur_hdr) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_unlink_rec, lur_hdr));
+	LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_hdr) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_hdr));
+	LASSERTF((int)offsetof(struct llog_unlink_rec, lur_oid) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_unlink_rec, lur_oid));
+	LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_oid) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_oid));
+	LASSERTF((int)offsetof(struct llog_unlink_rec, lur_oseq) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_unlink_rec, lur_oseq));
+	LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_oseq) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_oseq));
+	LASSERTF((int)offsetof(struct llog_unlink_rec, lur_count) == 28, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_unlink_rec, lur_count));
+	LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_count) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_count));
+	LASSERTF((int)offsetof(struct llog_unlink_rec, lur_tail) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_unlink_rec, lur_tail));
+	LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_tail) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_tail));
+	/* Checks for struct llog_unlink64_rec */
+	LASSERTF((int)sizeof(struct llog_unlink64_rec) == 64, "found %lld\n",
+		 (long long)(int)sizeof(struct llog_unlink64_rec));
+	LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_hdr) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_unlink64_rec, lur_hdr));
+	LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_hdr) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_hdr));
+	LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_fid) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_unlink64_rec, lur_fid));
+	LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_fid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_fid));
+	LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_count) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_unlink64_rec, lur_count));
+	LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_count) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_count));
+	LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_tail) == 56, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_unlink64_rec, lur_tail));
+	LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_tail) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_tail));
+	LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_padding1) == 36, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_unlink64_rec, lur_padding1));
+	LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding1) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding1));
+	LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_padding2) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_unlink64_rec, lur_padding2));
+	LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding2) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding2));
+	LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_padding3) == 48, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_unlink64_rec, lur_padding3));
+	LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding3) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding3));
+
+	/* Checks for struct llog_setattr64_rec */
+	LASSERTF((int)sizeof(struct llog_setattr64_rec) == 64, "found %lld\n",
+		 (long long)(int)sizeof(struct llog_setattr64_rec));
+	LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_hdr) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_setattr64_rec, lsr_hdr));
+	LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_hdr) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_hdr));
+	LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_oi) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_setattr64_rec, lsr_oi));
+	LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_oi) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_oi));
+	LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_uid) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_setattr64_rec, lsr_uid));
+	LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid));
+	LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_uid_h) == 36, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_setattr64_rec, lsr_uid_h));
+	LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid_h));
+	LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_gid) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_setattr64_rec, lsr_gid));
+	LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid));
+	LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_gid_h) == 44, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_setattr64_rec, lsr_gid_h));
+	LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid_h) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid_h));
+	LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_padding) == 48, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_setattr64_rec, lsr_padding));
+	LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_padding) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_padding));
+	LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_tail) == 56, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_setattr64_rec, lsr_tail));
+	LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_tail) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_tail));
+
+	/* Checks for struct llog_size_change_rec */
+	LASSERTF((int)sizeof(struct llog_size_change_rec) == 64, "found %lld\n",
+		 (long long)(int)sizeof(struct llog_size_change_rec));
+	LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_hdr) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_size_change_rec, lsc_hdr));
+	LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_hdr) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_hdr));
+	LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_fid) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_size_change_rec, lsc_fid));
+	LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_fid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_fid));
+	LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_ioepoch) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_size_change_rec, lsc_ioepoch));
+	LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_ioepoch) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_ioepoch));
+	LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_padding1) == 36, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_size_change_rec, lsc_padding1));
+	LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding1) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding1));
+	LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_padding2) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_size_change_rec, lsc_padding2));
+	LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding2) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding2));
+	LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_padding3) == 48, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_size_change_rec, lsc_padding3));
+	LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding3) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding3));
+	LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_tail) == 56, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_size_change_rec, lsc_tail));
+	LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_tail) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_tail));
+
+	/* Checks for struct changelog_rec */
+	LASSERTF((int)sizeof(struct changelog_rec) == 64, "found %lld\n",
+		 (long long)(int)sizeof(struct changelog_rec));
+	LASSERTF((int)offsetof(struct changelog_rec, cr_namelen) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct changelog_rec, cr_namelen));
+	LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_namelen) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct changelog_rec *)0)->cr_namelen));
+	LASSERTF((int)offsetof(struct changelog_rec, cr_flags) == 2, "found %lld\n",
+		 (long long)(int)offsetof(struct changelog_rec, cr_flags));
+	LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_flags) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct changelog_rec *)0)->cr_flags));
+	LASSERTF((int)offsetof(struct changelog_rec, cr_type) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct changelog_rec, cr_type));
+	LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_type) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct changelog_rec *)0)->cr_type));
+	LASSERTF((int)offsetof(struct changelog_rec, cr_index) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct changelog_rec, cr_index));
+	LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_index) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct changelog_rec *)0)->cr_index));
+	LASSERTF((int)offsetof(struct changelog_rec, cr_prev) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct changelog_rec, cr_prev));
+	LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_prev) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct changelog_rec *)0)->cr_prev));
+	LASSERTF((int)offsetof(struct changelog_rec, cr_time) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct changelog_rec, cr_time));
+	LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_time) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct changelog_rec *)0)->cr_time));
+	LASSERTF((int)offsetof(struct changelog_rec, cr_tfid) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct changelog_rec, cr_tfid));
+	LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_tfid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct changelog_rec *)0)->cr_tfid));
+	LASSERTF((int)offsetof(struct changelog_rec, cr_pfid) == 48, "found %lld\n",
+		 (long long)(int)offsetof(struct changelog_rec, cr_pfid));
+	LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_pfid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct changelog_rec *)0)->cr_pfid));
+
+	/* Checks for struct changelog_ext_rec */
+	LASSERTF((int)sizeof(struct changelog_ext_rec) == 96, "found %lld\n",
+		 (long long)(int)sizeof(struct changelog_ext_rec));
+	LASSERTF((int)offsetof(struct changelog_ext_rec, cr_namelen) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct changelog_ext_rec, cr_namelen));
+	LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_namelen) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_namelen));
+	LASSERTF((int)offsetof(struct changelog_ext_rec, cr_flags) == 2, "found %lld\n",
+		 (long long)(int)offsetof(struct changelog_ext_rec, cr_flags));
+	LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_flags) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_flags));
+	LASSERTF((int)offsetof(struct changelog_ext_rec, cr_type) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct changelog_ext_rec, cr_type));
+	LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_type) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_type));
+	LASSERTF((int)offsetof(struct changelog_ext_rec, cr_index) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct changelog_ext_rec, cr_index));
+	LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_index) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_index));
+	LASSERTF((int)offsetof(struct changelog_ext_rec, cr_prev) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct changelog_ext_rec, cr_prev));
+	LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_prev) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_prev));
+	LASSERTF((int)offsetof(struct changelog_ext_rec, cr_time) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct changelog_ext_rec, cr_time));
+	LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_time) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_time));
+	LASSERTF((int)offsetof(struct changelog_ext_rec, cr_tfid) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct changelog_ext_rec, cr_tfid));
+	LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_tfid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_tfid));
+	LASSERTF((int)offsetof(struct changelog_ext_rec, cr_pfid) == 48, "found %lld\n",
+		 (long long)(int)offsetof(struct changelog_ext_rec, cr_pfid));
+	LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_pfid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_pfid));
+	LASSERTF((int)offsetof(struct changelog_ext_rec, cr_sfid) == 64, "found %lld\n",
+		 (long long)(int)offsetof(struct changelog_ext_rec, cr_sfid));
+	LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_sfid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_sfid));
+	LASSERTF((int)offsetof(struct changelog_ext_rec, cr_spfid) == 80, "found %lld\n",
+		 (long long)(int)offsetof(struct changelog_ext_rec, cr_spfid));
+	LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_spfid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_spfid));
+
+	/* Checks for struct changelog_setinfo */
+	LASSERTF((int)sizeof(struct changelog_setinfo) == 12, "found %lld\n",
+		 (long long)(int)sizeof(struct changelog_setinfo));
+	LASSERTF((int)offsetof(struct changelog_setinfo, cs_recno) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct changelog_setinfo, cs_recno));
+	LASSERTF((int)sizeof(((struct changelog_setinfo *)0)->cs_recno) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct changelog_setinfo *)0)->cs_recno));
+	LASSERTF((int)offsetof(struct changelog_setinfo, cs_id) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct changelog_setinfo, cs_id));
+	LASSERTF((int)sizeof(((struct changelog_setinfo *)0)->cs_id) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct changelog_setinfo *)0)->cs_id));
+
+	/* Checks for struct llog_changelog_rec */
+	LASSERTF((int)sizeof(struct llog_changelog_rec) == 88, "found %lld\n",
+		 (long long)(int)sizeof(struct llog_changelog_rec));
+	LASSERTF((int)offsetof(struct llog_changelog_rec, cr_hdr) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_changelog_rec, cr_hdr));
+	LASSERTF((int)sizeof(((struct llog_changelog_rec *)0)->cr_hdr) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_changelog_rec *)0)->cr_hdr));
+	LASSERTF((int)offsetof(struct llog_changelog_rec, cr) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_changelog_rec, cr));
+	LASSERTF((int)sizeof(((struct llog_changelog_rec *)0)->cr) == 64, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_changelog_rec *)0)->cr));
+	LASSERTF((int)offsetof(struct llog_changelog_rec, cr_tail) == 80, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_changelog_rec, cr_tail));
+	LASSERTF((int)sizeof(((struct llog_changelog_rec *)0)->cr_tail) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_changelog_rec *)0)->cr_tail));
+
+	/* Checks for struct llog_changelog_user_rec */
+	LASSERTF((int)sizeof(struct llog_changelog_user_rec) == 40, "found %lld\n",
+		 (long long)(int)sizeof(struct llog_changelog_user_rec));
+	LASSERTF((int)offsetof(struct llog_changelog_user_rec, cur_hdr) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_changelog_user_rec, cur_hdr));
+	LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_hdr) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_changelog_user_rec *)0)->cur_hdr));
+	LASSERTF((int)offsetof(struct llog_changelog_user_rec, cur_id) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_changelog_user_rec, cur_id));
+	LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_id) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_changelog_user_rec *)0)->cur_id));
+	LASSERTF((int)offsetof(struct llog_changelog_user_rec, cur_padding) == 20, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_changelog_user_rec, cur_padding));
+	LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_padding) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_changelog_user_rec *)0)->cur_padding));
+	LASSERTF((int)offsetof(struct llog_changelog_user_rec, cur_endrec) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_changelog_user_rec, cur_endrec));
+	LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_endrec) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_changelog_user_rec *)0)->cur_endrec));
+	LASSERTF((int)offsetof(struct llog_changelog_user_rec, cur_tail) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_changelog_user_rec, cur_tail));
+	LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_tail) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_changelog_user_rec *)0)->cur_tail));
+
+	/* Checks for struct llog_gen */
+	LASSERTF((int)sizeof(struct llog_gen) == 16, "found %lld\n",
+		 (long long)(int)sizeof(struct llog_gen));
+	LASSERTF((int)offsetof(struct llog_gen, mnt_cnt) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_gen, mnt_cnt));
+	LASSERTF((int)sizeof(((struct llog_gen *)0)->mnt_cnt) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_gen *)0)->mnt_cnt));
+	LASSERTF((int)offsetof(struct llog_gen, conn_cnt) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_gen, conn_cnt));
+	LASSERTF((int)sizeof(((struct llog_gen *)0)->conn_cnt) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_gen *)0)->conn_cnt));
+
+	/* Checks for struct llog_gen_rec */
+	LASSERTF((int)sizeof(struct llog_gen_rec) == 64, "found %lld\n",
+		 (long long)(int)sizeof(struct llog_gen_rec));
+	LASSERTF((int)offsetof(struct llog_gen_rec, lgr_hdr) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_gen_rec, lgr_hdr));
+	LASSERTF((int)sizeof(((struct llog_gen_rec *)0)->lgr_hdr) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_gen_rec *)0)->lgr_hdr));
+	LASSERTF((int)offsetof(struct llog_gen_rec, lgr_gen) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_gen_rec, lgr_gen));
+	LASSERTF((int)sizeof(((struct llog_gen_rec *)0)->lgr_gen) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_gen_rec *)0)->lgr_gen));
+	LASSERTF((int)offsetof(struct llog_gen_rec, lgr_tail) == 56, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_gen_rec, lgr_tail));
+	LASSERTF((int)sizeof(((struct llog_gen_rec *)0)->lgr_tail) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_gen_rec *)0)->lgr_tail));
+
+	/* Checks for struct llog_log_hdr */
+	LASSERTF((int)sizeof(struct llog_log_hdr) == 8192, "found %lld\n",
+		 (long long)(int)sizeof(struct llog_log_hdr));
+	LASSERTF((int)offsetof(struct llog_log_hdr, llh_hdr) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_log_hdr, llh_hdr));
+	LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_hdr) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_hdr));
+	LASSERTF((int)offsetof(struct llog_log_hdr, llh_timestamp) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_log_hdr, llh_timestamp));
+	LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_timestamp) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_timestamp));
+	LASSERTF((int)offsetof(struct llog_log_hdr, llh_count) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_log_hdr, llh_count));
+	LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_count) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_count));
+	LASSERTF((int)offsetof(struct llog_log_hdr, llh_bitmap_offset) == 28, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_log_hdr, llh_bitmap_offset));
+	LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_bitmap_offset) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_bitmap_offset));
+	LASSERTF((int)offsetof(struct llog_log_hdr, llh_size) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_log_hdr, llh_size));
+	LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_size) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_size));
+	LASSERTF((int)offsetof(struct llog_log_hdr, llh_flags) == 36, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_log_hdr, llh_flags));
+	LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_flags) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_flags));
+	LASSERTF((int)offsetof(struct llog_log_hdr, llh_cat_idx) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_log_hdr, llh_cat_idx));
+	LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_cat_idx) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_cat_idx));
+	LASSERTF((int)offsetof(struct llog_log_hdr, llh_tgtuuid) == 44, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_log_hdr, llh_tgtuuid));
+	LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_tgtuuid) == 40, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_tgtuuid));
+	LASSERTF((int)offsetof(struct llog_log_hdr, llh_reserved) == 84, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_log_hdr, llh_reserved));
+	LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_reserved) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_reserved));
+	LASSERTF((int)offsetof(struct llog_log_hdr, llh_bitmap) == 88, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_log_hdr, llh_bitmap));
+	LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_bitmap) == 8096, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_bitmap));
+	LASSERTF((int)offsetof(struct llog_log_hdr, llh_tail) == 8184, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_log_hdr, llh_tail));
+	LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_tail) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_tail));
+
+	/* Checks for struct llog_cookie */
+	LASSERTF((int)sizeof(struct llog_cookie) == 32, "found %lld\n",
+		 (long long)(int)sizeof(struct llog_cookie));
+	LASSERTF((int)offsetof(struct llog_cookie, lgc_lgl) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_cookie, lgc_lgl));
+	LASSERTF((int)sizeof(((struct llog_cookie *)0)->lgc_lgl) == 20, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_cookie *)0)->lgc_lgl));
+	LASSERTF((int)offsetof(struct llog_cookie, lgc_subsys) == 20, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_cookie, lgc_subsys));
+	LASSERTF((int)sizeof(((struct llog_cookie *)0)->lgc_subsys) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_cookie *)0)->lgc_subsys));
+	LASSERTF((int)offsetof(struct llog_cookie, lgc_index) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_cookie, lgc_index));
+	LASSERTF((int)sizeof(((struct llog_cookie *)0)->lgc_index) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_cookie *)0)->lgc_index));
+	LASSERTF((int)offsetof(struct llog_cookie, lgc_padding) == 28, "found %lld\n",
+		 (long long)(int)offsetof(struct llog_cookie, lgc_padding));
+	LASSERTF((int)sizeof(((struct llog_cookie *)0)->lgc_padding) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llog_cookie *)0)->lgc_padding));
+
+	/* Checks for struct llogd_body */
+	LASSERTF((int)sizeof(struct llogd_body) == 48, "found %lld\n",
+		 (long long)(int)sizeof(struct llogd_body));
+	LASSERTF((int)offsetof(struct llogd_body, lgd_logid) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct llogd_body, lgd_logid));
+	LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_logid) == 20, "found %lld\n",
+		 (long long)(int)sizeof(((struct llogd_body *)0)->lgd_logid));
+	LASSERTF((int)offsetof(struct llogd_body, lgd_ctxt_idx) == 20, "found %lld\n",
+		 (long long)(int)offsetof(struct llogd_body, lgd_ctxt_idx));
+	LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_ctxt_idx) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llogd_body *)0)->lgd_ctxt_idx));
+	LASSERTF((int)offsetof(struct llogd_body, lgd_llh_flags) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct llogd_body, lgd_llh_flags));
+	LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_llh_flags) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llogd_body *)0)->lgd_llh_flags));
+	LASSERTF((int)offsetof(struct llogd_body, lgd_index) == 28, "found %lld\n",
+		 (long long)(int)offsetof(struct llogd_body, lgd_index));
+	LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_index) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llogd_body *)0)->lgd_index));
+	LASSERTF((int)offsetof(struct llogd_body, lgd_saved_index) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct llogd_body, lgd_saved_index));
+	LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_saved_index) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llogd_body *)0)->lgd_saved_index));
+	LASSERTF((int)offsetof(struct llogd_body, lgd_len) == 36, "found %lld\n",
+		 (long long)(int)offsetof(struct llogd_body, lgd_len));
+	LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_len) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llogd_body *)0)->lgd_len));
+	LASSERTF((int)offsetof(struct llogd_body, lgd_cur_offset) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct llogd_body, lgd_cur_offset));
+	LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_cur_offset) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct llogd_body *)0)->lgd_cur_offset));
+	CLASSERT(LLOG_ORIGIN_HANDLE_CREATE == 501);
+	CLASSERT(LLOG_ORIGIN_HANDLE_NEXT_BLOCK == 502);
+	CLASSERT(LLOG_ORIGIN_HANDLE_READ_HEADER == 503);
+	CLASSERT(LLOG_ORIGIN_HANDLE_WRITE_REC == 504);
+	CLASSERT(LLOG_ORIGIN_HANDLE_CLOSE == 505);
+	CLASSERT(LLOG_ORIGIN_CONNECT == 506);
+	CLASSERT(LLOG_CATINFO == 507);
+	CLASSERT(LLOG_ORIGIN_HANDLE_PREV_BLOCK == 508);
+	CLASSERT(LLOG_ORIGIN_HANDLE_DESTROY == 509);
+	CLASSERT(LLOG_FIRST_OPC == 501);
+	CLASSERT(LLOG_LAST_OPC == 510);
+
+	/* Checks for struct llogd_conn_body */
+	LASSERTF((int)sizeof(struct llogd_conn_body) == 40, "found %lld\n",
+		 (long long)(int)sizeof(struct llogd_conn_body));
+	LASSERTF((int)offsetof(struct llogd_conn_body, lgdc_gen) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct llogd_conn_body, lgdc_gen));
+	LASSERTF((int)sizeof(((struct llogd_conn_body *)0)->lgdc_gen) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct llogd_conn_body *)0)->lgdc_gen));
+	LASSERTF((int)offsetof(struct llogd_conn_body, lgdc_logid) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct llogd_conn_body, lgdc_logid));
+	LASSERTF((int)sizeof(((struct llogd_conn_body *)0)->lgdc_logid) == 20, "found %lld\n",
+		 (long long)(int)sizeof(((struct llogd_conn_body *)0)->lgdc_logid));
+	LASSERTF((int)offsetof(struct llogd_conn_body, lgdc_ctxt_idx) == 36, "found %lld\n",
+		 (long long)(int)offsetof(struct llogd_conn_body, lgdc_ctxt_idx));
+	LASSERTF((int)sizeof(((struct llogd_conn_body *)0)->lgdc_ctxt_idx) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct llogd_conn_body *)0)->lgdc_ctxt_idx));
+
+	/* Checks for struct ll_fiemap_info_key */
+	LASSERTF((int)sizeof(struct ll_fiemap_info_key) == 248, "found %lld\n",
+		 (long long)(int)sizeof(struct ll_fiemap_info_key));
+	LASSERTF((int)offsetof(struct ll_fiemap_info_key, name[8]) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct ll_fiemap_info_key, name[8]));
+	LASSERTF((int)sizeof(((struct ll_fiemap_info_key *)0)->name[8]) == 1, "found %lld\n",
+		 (long long)(int)sizeof(((struct ll_fiemap_info_key *)0)->name[8]));
+	LASSERTF((int)offsetof(struct ll_fiemap_info_key, oa) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct ll_fiemap_info_key, oa));
+	LASSERTF((int)sizeof(((struct ll_fiemap_info_key *)0)->oa) == 208, "found %lld\n",
+		 (long long)(int)sizeof(((struct ll_fiemap_info_key *)0)->oa));
+	LASSERTF((int)offsetof(struct ll_fiemap_info_key, fiemap) == 216, "found %lld\n",
+		 (long long)(int)offsetof(struct ll_fiemap_info_key, fiemap));
+	LASSERTF((int)sizeof(((struct ll_fiemap_info_key *)0)->fiemap) == 32, "found %lld\n",
+		 (long long)(int)sizeof(((struct ll_fiemap_info_key *)0)->fiemap));
+
+	/* Checks for struct quota_body */
+	LASSERTF((int)sizeof(struct quota_body) == 112, "found %lld\n",
+		 (long long)(int)sizeof(struct quota_body));
+	LASSERTF((int)offsetof(struct quota_body, qb_fid) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct quota_body, qb_fid));
+	LASSERTF((int)sizeof(((struct quota_body *)0)->qb_fid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct quota_body *)0)->qb_fid));
+	LASSERTF((int)offsetof(struct quota_body, qb_id) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct quota_body, qb_id));
+	LASSERTF((int)sizeof(((struct quota_body *)0)->qb_id) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct quota_body *)0)->qb_id));
+	LASSERTF((int)offsetof(struct quota_body, qb_flags) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct quota_body, qb_flags));
+	LASSERTF((int)sizeof(((struct quota_body *)0)->qb_flags) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct quota_body *)0)->qb_flags));
+	LASSERTF((int)offsetof(struct quota_body, qb_padding) == 36, "found %lld\n",
+		 (long long)(int)offsetof(struct quota_body, qb_padding));
+	LASSERTF((int)sizeof(((struct quota_body *)0)->qb_padding) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct quota_body *)0)->qb_padding));
+	LASSERTF((int)offsetof(struct quota_body, qb_count) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct quota_body, qb_count));
+	LASSERTF((int)sizeof(((struct quota_body *)0)->qb_count) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct quota_body *)0)->qb_count));
+	LASSERTF((int)offsetof(struct quota_body, qb_usage) == 48, "found %lld\n",
+		 (long long)(int)offsetof(struct quota_body, qb_usage));
+	LASSERTF((int)sizeof(((struct quota_body *)0)->qb_usage) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct quota_body *)0)->qb_usage));
+	LASSERTF((int)offsetof(struct quota_body, qb_slv_ver) == 56, "found %lld\n",
+		 (long long)(int)offsetof(struct quota_body, qb_slv_ver));
+	LASSERTF((int)sizeof(((struct quota_body *)0)->qb_slv_ver) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct quota_body *)0)->qb_slv_ver));
+	LASSERTF((int)offsetof(struct quota_body, qb_lockh) == 64, "found %lld\n",
+		 (long long)(int)offsetof(struct quota_body, qb_lockh));
+	LASSERTF((int)sizeof(((struct quota_body *)0)->qb_lockh) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct quota_body *)0)->qb_lockh));
+	LASSERTF((int)offsetof(struct quota_body, qb_glb_lockh) == 72, "found %lld\n",
+		 (long long)(int)offsetof(struct quota_body, qb_glb_lockh));
+	LASSERTF((int)sizeof(((struct quota_body *)0)->qb_glb_lockh) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct quota_body *)0)->qb_glb_lockh));
+	LASSERTF((int)offsetof(struct quota_body, qb_padding1[4]) == 112, "found %lld\n",
+		 (long long)(int)offsetof(struct quota_body, qb_padding1[4]));
+	LASSERTF((int)sizeof(((struct quota_body *)0)->qb_padding1[4]) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct quota_body *)0)->qb_padding1[4]));
+
+	/* Checks for struct mgs_target_info */
+	LASSERTF((int)sizeof(struct mgs_target_info) == 4544, "found %lld\n",
+		 (long long)(int)sizeof(struct mgs_target_info));
+	LASSERTF((int)offsetof(struct mgs_target_info, mti_lustre_ver) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct mgs_target_info, mti_lustre_ver));
+	LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_lustre_ver) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_lustre_ver));
+	LASSERTF((int)offsetof(struct mgs_target_info, mti_stripe_index) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct mgs_target_info, mti_stripe_index));
+	LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_stripe_index) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_stripe_index));
+	LASSERTF((int)offsetof(struct mgs_target_info, mti_config_ver) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct mgs_target_info, mti_config_ver));
+	LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_config_ver) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_config_ver));
+	LASSERTF((int)offsetof(struct mgs_target_info, mti_flags) == 12, "found %lld\n",
+		 (long long)(int)offsetof(struct mgs_target_info, mti_flags));
+	LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_flags) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_flags));
+	LASSERTF((int)offsetof(struct mgs_target_info, mti_nid_count) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct mgs_target_info, mti_nid_count));
+	LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_nid_count) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_nid_count));
+	LASSERTF((int)offsetof(struct mgs_target_info, mti_instance) == 20, "found %lld\n",
+		 (long long)(int)offsetof(struct mgs_target_info, mti_instance));
+	LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_instance) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_instance));
+	LASSERTF((int)offsetof(struct mgs_target_info, mti_fsname) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct mgs_target_info, mti_fsname));
+	LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_fsname) == 64, "found %lld\n",
+		 (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_fsname));
+	LASSERTF((int)offsetof(struct mgs_target_info, mti_svname) == 88, "found %lld\n",
+		 (long long)(int)offsetof(struct mgs_target_info, mti_svname));
+	LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_svname) == 64, "found %lld\n",
+		 (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_svname));
+	LASSERTF((int)offsetof(struct mgs_target_info, mti_uuid) == 152, "found %lld\n",
+		 (long long)(int)offsetof(struct mgs_target_info, mti_uuid));
+	LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_uuid) == 40, "found %lld\n",
+		 (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_uuid));
+	LASSERTF((int)offsetof(struct mgs_target_info, mti_nids) == 192, "found %lld\n",
+		 (long long)(int)offsetof(struct mgs_target_info, mti_nids));
+	LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_nids) == 256, "found %lld\n",
+		 (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_nids));
+	LASSERTF((int)offsetof(struct mgs_target_info, mti_params) == 448, "found %lld\n",
+		 (long long)(int)offsetof(struct mgs_target_info, mti_params));
+	LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_params) == 4096, "found %lld\n",
+		 (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_params));
+
+	/* Checks for struct lustre_capa */
+	LASSERTF((int)sizeof(struct lustre_capa) == 120, "found %lld\n",
+		 (long long)(int)sizeof(struct lustre_capa));
+	LASSERTF((int)offsetof(struct lustre_capa, lc_fid) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct lustre_capa, lc_fid));
+	LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_fid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct lustre_capa *)0)->lc_fid));
+	LASSERTF((int)offsetof(struct lustre_capa, lc_opc) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct lustre_capa, lc_opc));
+	LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_opc) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lustre_capa *)0)->lc_opc));
+	LASSERTF((int)offsetof(struct lustre_capa, lc_uid) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct lustre_capa, lc_uid));
+	LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_uid) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lustre_capa *)0)->lc_uid));
+	LASSERTF((int)offsetof(struct lustre_capa, lc_gid) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct lustre_capa, lc_gid));
+	LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_gid) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lustre_capa *)0)->lc_gid));
+	LASSERTF((int)offsetof(struct lustre_capa, lc_flags) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct lustre_capa, lc_flags));
+	LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_flags) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lustre_capa *)0)->lc_flags));
+	LASSERTF((int)offsetof(struct lustre_capa, lc_keyid) == 44, "found %lld\n",
+		 (long long)(int)offsetof(struct lustre_capa, lc_keyid));
+	LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_keyid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lustre_capa *)0)->lc_keyid));
+	LASSERTF((int)offsetof(struct lustre_capa, lc_timeout) == 48, "found %lld\n",
+		 (long long)(int)offsetof(struct lustre_capa, lc_timeout));
+	LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_timeout) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lustre_capa *)0)->lc_timeout));
+	LASSERTF((int)offsetof(struct lustre_capa, lc_expiry) == 52, "found %lld\n",
+		 (long long)(int)offsetof(struct lustre_capa, lc_expiry));
+	LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_expiry) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lustre_capa *)0)->lc_expiry));
+	CLASSERT(CAPA_HMAC_MAX_LEN == 64);
+	LASSERTF((int)offsetof(struct lustre_capa, lc_hmac[64]) == 120, "found %lld\n",
+		 (long long)(int)offsetof(struct lustre_capa, lc_hmac[64]));
+	LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_hmac[64]) == 1, "found %lld\n",
+		 (long long)(int)sizeof(((struct lustre_capa *)0)->lc_hmac[64]));
+
+	/* Checks for struct lustre_capa_key */
+	LASSERTF((int)sizeof(struct lustre_capa_key) == 72, "found %lld\n",
+		 (long long)(int)sizeof(struct lustre_capa_key));
+	LASSERTF((int)offsetof(struct lustre_capa_key, lk_seq) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct lustre_capa_key, lk_seq));
+	LASSERTF((int)sizeof(((struct lustre_capa_key *)0)->lk_seq) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct lustre_capa_key *)0)->lk_seq));
+	LASSERTF((int)offsetof(struct lustre_capa_key, lk_keyid) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct lustre_capa_key, lk_keyid));
+	LASSERTF((int)sizeof(((struct lustre_capa_key *)0)->lk_keyid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lustre_capa_key *)0)->lk_keyid));
+	LASSERTF((int)offsetof(struct lustre_capa_key, lk_padding) == 12, "found %lld\n",
+		 (long long)(int)offsetof(struct lustre_capa_key, lk_padding));
+	LASSERTF((int)sizeof(((struct lustre_capa_key *)0)->lk_padding) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct lustre_capa_key *)0)->lk_padding));
+	CLASSERT(CAPA_HMAC_KEY_MAX_LEN == 56);
+	LASSERTF((int)offsetof(struct lustre_capa_key, lk_key[56]) == 72, "found %lld\n",
+		 (long long)(int)offsetof(struct lustre_capa_key, lk_key[56]));
+	LASSERTF((int)sizeof(((struct lustre_capa_key *)0)->lk_key[56]) == 1, "found %lld\n",
+		 (long long)(int)sizeof(((struct lustre_capa_key *)0)->lk_key[56]));
+
+	/* Checks for struct getinfo_fid2path */
+	LASSERTF((int)sizeof(struct getinfo_fid2path) == 32, "found %lld\n",
+		 (long long)(int)sizeof(struct getinfo_fid2path));
+	LASSERTF((int)offsetof(struct getinfo_fid2path, gf_fid) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct getinfo_fid2path, gf_fid));
+	LASSERTF((int)sizeof(((struct getinfo_fid2path *)0)->gf_fid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct getinfo_fid2path *)0)->gf_fid));
+	LASSERTF((int)offsetof(struct getinfo_fid2path, gf_recno) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct getinfo_fid2path, gf_recno));
+	LASSERTF((int)sizeof(((struct getinfo_fid2path *)0)->gf_recno) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct getinfo_fid2path *)0)->gf_recno));
+	LASSERTF((int)offsetof(struct getinfo_fid2path, gf_linkno) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct getinfo_fid2path, gf_linkno));
+	LASSERTF((int)sizeof(((struct getinfo_fid2path *)0)->gf_linkno) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct getinfo_fid2path *)0)->gf_linkno));
+	LASSERTF((int)offsetof(struct getinfo_fid2path, gf_pathlen) == 28, "found %lld\n",
+		 (long long)(int)offsetof(struct getinfo_fid2path, gf_pathlen));
+	LASSERTF((int)sizeof(((struct getinfo_fid2path *)0)->gf_pathlen) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct getinfo_fid2path *)0)->gf_pathlen));
+	LASSERTF((int)offsetof(struct getinfo_fid2path, gf_path[0]) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct getinfo_fid2path, gf_path[0]));
+	LASSERTF((int)sizeof(((struct getinfo_fid2path *)0)->gf_path[0]) == 1, "found %lld\n",
+		 (long long)(int)sizeof(((struct getinfo_fid2path *)0)->gf_path[0]));
+
+	/* Checks for struct ll_user_fiemap */
+	LASSERTF((int)sizeof(struct ll_user_fiemap) == 32, "found %lld\n",
+		 (long long)(int)sizeof(struct ll_user_fiemap));
+	LASSERTF((int)offsetof(struct ll_user_fiemap, fm_start) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct ll_user_fiemap, fm_start));
+	LASSERTF((int)sizeof(((struct ll_user_fiemap *)0)->fm_start) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ll_user_fiemap *)0)->fm_start));
+	LASSERTF((int)offsetof(struct ll_user_fiemap, fm_length) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct ll_user_fiemap, fm_length));
+	LASSERTF((int)sizeof(((struct ll_user_fiemap *)0)->fm_length) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ll_user_fiemap *)0)->fm_length));
+	LASSERTF((int)offsetof(struct ll_user_fiemap, fm_flags) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct ll_user_fiemap, fm_flags));
+	LASSERTF((int)sizeof(((struct ll_user_fiemap *)0)->fm_flags) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ll_user_fiemap *)0)->fm_flags));
+	LASSERTF((int)offsetof(struct ll_user_fiemap, fm_mapped_extents) == 20, "found %lld\n",
+		 (long long)(int)offsetof(struct ll_user_fiemap, fm_mapped_extents));
+	LASSERTF((int)sizeof(((struct ll_user_fiemap *)0)->fm_mapped_extents) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ll_user_fiemap *)0)->fm_mapped_extents));
+	LASSERTF((int)offsetof(struct ll_user_fiemap, fm_extent_count) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct ll_user_fiemap, fm_extent_count));
+	LASSERTF((int)sizeof(((struct ll_user_fiemap *)0)->fm_extent_count) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ll_user_fiemap *)0)->fm_extent_count));
+	LASSERTF((int)offsetof(struct ll_user_fiemap, fm_reserved) == 28, "found %lld\n",
+		 (long long)(int)offsetof(struct ll_user_fiemap, fm_reserved));
+	LASSERTF((int)sizeof(((struct ll_user_fiemap *)0)->fm_reserved) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ll_user_fiemap *)0)->fm_reserved));
+	LASSERTF((int)offsetof(struct ll_user_fiemap, fm_extents) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct ll_user_fiemap, fm_extents));
+	LASSERTF((int)sizeof(((struct ll_user_fiemap *)0)->fm_extents) == 0, "found %lld\n",
+		 (long long)(int)sizeof(((struct ll_user_fiemap *)0)->fm_extents));
+	CLASSERT(FIEMAP_FLAG_SYNC == 0x00000001);
+	CLASSERT(FIEMAP_FLAG_XATTR == 0x00000002);
+	CLASSERT(FIEMAP_FLAG_DEVICE_ORDER == 0x40000000);
+
+	/* Checks for struct ll_fiemap_extent */
+	LASSERTF((int)sizeof(struct ll_fiemap_extent) == 56, "found %lld\n",
+		 (long long)(int)sizeof(struct ll_fiemap_extent));
+	LASSERTF((int)offsetof(struct ll_fiemap_extent, fe_logical) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct ll_fiemap_extent, fe_logical));
+	LASSERTF((int)sizeof(((struct ll_fiemap_extent *)0)->fe_logical) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ll_fiemap_extent *)0)->fe_logical));
+	LASSERTF((int)offsetof(struct ll_fiemap_extent, fe_physical) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct ll_fiemap_extent, fe_physical));
+	LASSERTF((int)sizeof(((struct ll_fiemap_extent *)0)->fe_physical) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ll_fiemap_extent *)0)->fe_physical));
+	LASSERTF((int)offsetof(struct ll_fiemap_extent, fe_length) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct ll_fiemap_extent, fe_length));
+	LASSERTF((int)sizeof(((struct ll_fiemap_extent *)0)->fe_length) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct ll_fiemap_extent *)0)->fe_length));
+	LASSERTF((int)offsetof(struct ll_fiemap_extent, fe_flags) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct ll_fiemap_extent, fe_flags));
+	LASSERTF((int)sizeof(((struct ll_fiemap_extent *)0)->fe_flags) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ll_fiemap_extent *)0)->fe_flags));
+	LASSERTF((int)offsetof(struct ll_fiemap_extent, fe_device) == 44, "found %lld\n",
+		 (long long)(int)offsetof(struct ll_fiemap_extent, fe_device));
+	LASSERTF((int)sizeof(((struct ll_fiemap_extent *)0)->fe_device) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct ll_fiemap_extent *)0)->fe_device));
+	CLASSERT(FIEMAP_EXTENT_LAST == 0x00000001);
+	CLASSERT(FIEMAP_EXTENT_UNKNOWN == 0x00000002);
+	CLASSERT(FIEMAP_EXTENT_DELALLOC == 0x00000004);
+	CLASSERT(FIEMAP_EXTENT_ENCODED == 0x00000008);
+	CLASSERT(FIEMAP_EXTENT_DATA_ENCRYPTED == 0x00000080);
+	CLASSERT(FIEMAP_EXTENT_NOT_ALIGNED == 0x00000100);
+	CLASSERT(FIEMAP_EXTENT_DATA_INLINE == 0x00000200);
+	CLASSERT(FIEMAP_EXTENT_DATA_TAIL == 0x00000400);
+	CLASSERT(FIEMAP_EXTENT_UNWRITTEN == 0x00000800);
+	CLASSERT(FIEMAP_EXTENT_MERGED == 0x00001000);
+	CLASSERT(FIEMAP_EXTENT_NO_DIRECT == 0x40000000);
+	CLASSERT(FIEMAP_EXTENT_NET == 0x80000000);
+
+	/* Checks for type posix_acl_xattr_entry */
+	LASSERTF((int)sizeof(posix_acl_xattr_entry) == 8, "found %lld\n",
+		 (long long)(int)sizeof(posix_acl_xattr_entry));
+	LASSERTF((int)offsetof(posix_acl_xattr_entry, e_tag) == 0, "found %lld\n",
+		 (long long)(int)offsetof(posix_acl_xattr_entry, e_tag));
+	LASSERTF((int)sizeof(((posix_acl_xattr_entry *)0)->e_tag) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((posix_acl_xattr_entry *)0)->e_tag));
+	LASSERTF((int)offsetof(posix_acl_xattr_entry, e_perm) == 2, "found %lld\n",
+		 (long long)(int)offsetof(posix_acl_xattr_entry, e_perm));
+	LASSERTF((int)sizeof(((posix_acl_xattr_entry *)0)->e_perm) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((posix_acl_xattr_entry *)0)->e_perm));
+	LASSERTF((int)offsetof(posix_acl_xattr_entry, e_id) == 4, "found %lld\n",
+		 (long long)(int)offsetof(posix_acl_xattr_entry, e_id));
+	LASSERTF((int)sizeof(((posix_acl_xattr_entry *)0)->e_id) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((posix_acl_xattr_entry *)0)->e_id));
+
+	/* Checks for type posix_acl_xattr_header */
+	LASSERTF((int)sizeof(posix_acl_xattr_header) == 4, "found %lld\n",
+		 (long long)(int)sizeof(posix_acl_xattr_header));
+	LASSERTF((int)offsetof(posix_acl_xattr_header, a_version) == 0, "found %lld\n",
+		 (long long)(int)offsetof(posix_acl_xattr_header, a_version));
+	LASSERTF((int)sizeof(((posix_acl_xattr_header *)0)->a_version) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((posix_acl_xattr_header *)0)->a_version));
+	LASSERTF((int)offsetof(posix_acl_xattr_header, a_entries) == 4, "found %lld\n",
+		 (long long)(int)offsetof(posix_acl_xattr_header, a_entries));
+	LASSERTF((int)sizeof(((posix_acl_xattr_header *)0)->a_entries) == 0, "found %lld\n",
+		 (long long)(int)sizeof(((posix_acl_xattr_header *)0)->a_entries));
+
+	/* Checks for struct link_ea_header */
+	LASSERTF((int)sizeof(struct link_ea_header) == 24, "found %lld\n",
+		 (long long)(int)sizeof(struct link_ea_header));
+	LASSERTF((int)offsetof(struct link_ea_header, leh_magic) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct link_ea_header, leh_magic));
+	LASSERTF((int)sizeof(((struct link_ea_header *)0)->leh_magic) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct link_ea_header *)0)->leh_magic));
+	LASSERTF((int)offsetof(struct link_ea_header, leh_reccount) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct link_ea_header, leh_reccount));
+	LASSERTF((int)sizeof(((struct link_ea_header *)0)->leh_reccount) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct link_ea_header *)0)->leh_reccount));
+	LASSERTF((int)offsetof(struct link_ea_header, leh_len) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct link_ea_header, leh_len));
+	LASSERTF((int)sizeof(((struct link_ea_header *)0)->leh_len) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct link_ea_header *)0)->leh_len));
+	LASSERTF((int)offsetof(struct link_ea_header, padding1) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct link_ea_header, padding1));
+	LASSERTF((int)sizeof(((struct link_ea_header *)0)->padding1) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct link_ea_header *)0)->padding1));
+	LASSERTF((int)offsetof(struct link_ea_header, padding2) == 20, "found %lld\n",
+		 (long long)(int)offsetof(struct link_ea_header, padding2));
+	LASSERTF((int)sizeof(((struct link_ea_header *)0)->padding2) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct link_ea_header *)0)->padding2));
+	CLASSERT(LINK_EA_MAGIC == 0x11EAF1DFUL);
+
+	/* Checks for struct link_ea_entry */
+	LASSERTF((int)sizeof(struct link_ea_entry) == 18, "found %lld\n",
+		 (long long)(int)sizeof(struct link_ea_entry));
+	LASSERTF((int)offsetof(struct link_ea_entry, lee_reclen) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct link_ea_entry, lee_reclen));
+	LASSERTF((int)sizeof(((struct link_ea_entry *)0)->lee_reclen) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct link_ea_entry *)0)->lee_reclen));
+	LASSERTF((int)offsetof(struct link_ea_entry, lee_parent_fid) == 2, "found %lld\n",
+		 (long long)(int)offsetof(struct link_ea_entry, lee_parent_fid));
+	LASSERTF((int)sizeof(((struct link_ea_entry *)0)->lee_parent_fid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct link_ea_entry *)0)->lee_parent_fid));
+	LASSERTF((int)offsetof(struct link_ea_entry, lee_name) == 18, "found %lld\n",
+		 (long long)(int)offsetof(struct link_ea_entry, lee_name));
+	LASSERTF((int)sizeof(((struct link_ea_entry *)0)->lee_name) == 0, "found %lld\n",
+		 (long long)(int)sizeof(((struct link_ea_entry *)0)->lee_name));
+
+	/* Checks for struct layout_intent */
+	LASSERTF((int)sizeof(struct layout_intent) == 24, "found %lld\n",
+		 (long long)(int)sizeof(struct layout_intent));
+	LASSERTF((int)offsetof(struct layout_intent, li_opc) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct layout_intent, li_opc));
+	LASSERTF((int)sizeof(((struct layout_intent *)0)->li_opc) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct layout_intent *)0)->li_opc));
+	LASSERTF((int)offsetof(struct layout_intent, li_flags) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct layout_intent, li_flags));
+	LASSERTF((int)sizeof(((struct layout_intent *)0)->li_flags) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct layout_intent *)0)->li_flags));
+	LASSERTF((int)offsetof(struct layout_intent, li_start) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct layout_intent, li_start));
+	LASSERTF((int)sizeof(((struct layout_intent *)0)->li_start) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct layout_intent *)0)->li_start));
+	LASSERTF((int)offsetof(struct layout_intent, li_end) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct layout_intent, li_end));
+	LASSERTF((int)sizeof(((struct layout_intent *)0)->li_end) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct layout_intent *)0)->li_end));
+	LASSERTF(LAYOUT_INTENT_ACCESS == 0, "found %lld\n",
+		 (long long)LAYOUT_INTENT_ACCESS);
+	LASSERTF(LAYOUT_INTENT_READ == 1, "found %lld\n",
+		 (long long)LAYOUT_INTENT_READ);
+	LASSERTF(LAYOUT_INTENT_WRITE == 2, "found %lld\n",
+		 (long long)LAYOUT_INTENT_WRITE);
+	LASSERTF(LAYOUT_INTENT_GLIMPSE == 3, "found %lld\n",
+		 (long long)LAYOUT_INTENT_GLIMPSE);
+	LASSERTF(LAYOUT_INTENT_TRUNC == 4, "found %lld\n",
+		 (long long)LAYOUT_INTENT_TRUNC);
+	LASSERTF(LAYOUT_INTENT_RELEASE == 5, "found %lld\n",
+		 (long long)LAYOUT_INTENT_RELEASE);
+	LASSERTF(LAYOUT_INTENT_RESTORE == 6, "found %lld\n",
+		 (long long)LAYOUT_INTENT_RESTORE);
+
+	/* Checks for struct hsm_action_item */
+	LASSERTF((int)sizeof(struct hsm_action_item) == 72, "found %lld\n",
+		 (long long)(int)sizeof(struct hsm_action_item));
+	LASSERTF((int)offsetof(struct hsm_action_item, hai_len) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_action_item, hai_len));
+	LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_len) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_len));
+	LASSERTF((int)offsetof(struct hsm_action_item, hai_action) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_action_item, hai_action));
+	LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_action) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_action));
+	LASSERTF((int)offsetof(struct hsm_action_item, hai_fid) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_action_item, hai_fid));
+	LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_fid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_fid));
+	LASSERTF((int)offsetof(struct hsm_action_item, hai_dfid) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_action_item, hai_dfid));
+	LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_dfid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_dfid));
+	LASSERTF((int)offsetof(struct hsm_action_item, hai_extent) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_action_item, hai_extent));
+	LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_extent) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_extent));
+	LASSERTF((int)offsetof(struct hsm_action_item, hai_cookie) == 56, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_action_item, hai_cookie));
+	LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_cookie) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_cookie));
+	LASSERTF((int)offsetof(struct hsm_action_item, hai_gid) == 64, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_action_item, hai_gid));
+	LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_gid) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_gid));
+	LASSERTF((int)offsetof(struct hsm_action_item, hai_data) == 72, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_action_item, hai_data));
+	LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_data) == 0, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_data));
+
+	/* Checks for struct hsm_action_list */
+	LASSERTF((int)sizeof(struct hsm_action_list) == 32, "found %lld\n",
+		 (long long)(int)sizeof(struct hsm_action_list));
+	LASSERTF((int)offsetof(struct hsm_action_list, hal_version) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_action_list, hal_version));
+	LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_version) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_version));
+	LASSERTF((int)offsetof(struct hsm_action_list, hal_count) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_action_list, hal_count));
+	LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_count) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_count));
+	LASSERTF((int)offsetof(struct hsm_action_list, hal_compound_id) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_action_list, hal_compound_id));
+	LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_compound_id) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_compound_id));
+	LASSERTF((int)offsetof(struct hsm_action_list, hal_flags) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_action_list, hal_flags));
+	LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_flags) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_flags));
+	LASSERTF((int)offsetof(struct hsm_action_list, hal_archive_id) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_action_list, hal_archive_id));
+	LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_archive_id) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_archive_id));
+	LASSERTF((int)offsetof(struct hsm_action_list, padding1) == 28, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_action_list, padding1));
+	LASSERTF((int)sizeof(((struct hsm_action_list *)0)->padding1) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_action_list *)0)->padding1));
+	LASSERTF((int)offsetof(struct hsm_action_list, hal_fsname) == 32, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_action_list, hal_fsname));
+	LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_fsname) == 0, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_fsname));
+
+	/* Checks for struct hsm_progress */
+	LASSERTF((int)sizeof(struct hsm_progress) == 48, "found %lld\n",
+		 (long long)(int)sizeof(struct hsm_progress));
+	LASSERTF((int)offsetof(struct hsm_progress, hp_fid) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_progress, hp_fid));
+	LASSERTF((int)sizeof(((struct hsm_progress *)0)->hp_fid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_progress *)0)->hp_fid));
+	LASSERTF((int)offsetof(struct hsm_progress, hp_cookie) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_progress, hp_cookie));
+	LASSERTF((int)sizeof(((struct hsm_progress *)0)->hp_cookie) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_progress *)0)->hp_cookie));
+	LASSERTF((int)offsetof(struct hsm_progress, hp_extent) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_progress, hp_extent));
+	LASSERTF((int)sizeof(((struct hsm_progress *)0)->hp_extent) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_progress *)0)->hp_extent));
+	LASSERTF((int)offsetof(struct hsm_progress, hp_flags) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_progress, hp_flags));
+	LASSERTF((int)sizeof(((struct hsm_progress *)0)->hp_flags) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_progress *)0)->hp_flags));
+	LASSERTF((int)offsetof(struct hsm_progress, hp_errval) == 42, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_progress, hp_errval));
+	LASSERTF((int)sizeof(((struct hsm_progress *)0)->hp_errval) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_progress *)0)->hp_errval));
+	LASSERTF((int)offsetof(struct hsm_progress, padding) == 44, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_progress, padding));
+	LASSERTF((int)sizeof(((struct hsm_progress *)0)->padding) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_progress *)0)->padding));
+	LASSERTF(HP_FLAG_COMPLETED == 0x01, "found 0x%.8x\n",
+		HP_FLAG_COMPLETED);
+	LASSERTF(HP_FLAG_RETRY == 0x02, "found 0x%.8x\n",
+		HP_FLAG_RETRY);
+
+	LASSERTF((int)offsetof(struct hsm_copy, hc_data_version) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_copy, hc_data_version));
+	LASSERTF((int)sizeof(((struct hsm_copy *)0)->hc_data_version) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_copy *)0)->hc_data_version));
+	LASSERTF((int)offsetof(struct hsm_copy, hc_flags) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_copy, hc_flags));
+	LASSERTF((int)sizeof(((struct hsm_copy *)0)->hc_flags) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_copy *)0)->hc_flags));
+	LASSERTF((int)offsetof(struct hsm_copy, hc_errval) == 10, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_copy, hc_errval));
+	LASSERTF((int)sizeof(((struct hsm_copy *)0)->hc_errval) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_copy *)0)->hc_errval));
+	LASSERTF((int)offsetof(struct hsm_copy, padding) == 12, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_copy, padding));
+	LASSERTF((int)sizeof(((struct hsm_copy *)0)->padding) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_copy *)0)->padding));
+	LASSERTF((int)offsetof(struct hsm_copy, hc_hai) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_copy, hc_hai));
+	LASSERTF((int)sizeof(((struct hsm_copy *)0)->hc_hai) == 72, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_copy *)0)->hc_hai));
+
+	/* Checks for struct hsm_progress_kernel */
+	LASSERTF((int)sizeof(struct hsm_progress_kernel) == 64, "found %lld\n",
+		 (long long)(int)sizeof(struct hsm_progress_kernel));
+	LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_fid) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_progress_kernel, hpk_fid));
+	LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_fid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_fid));
+	LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_cookie) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_progress_kernel, hpk_cookie));
+	LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_cookie) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_cookie));
+	LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_extent) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_progress_kernel, hpk_extent));
+	LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_extent) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_extent));
+	LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_flags) == 40, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_progress_kernel, hpk_flags));
+	LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_flags) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_flags));
+	LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_errval) == 42, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_progress_kernel, hpk_errval));
+	LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_errval) == 2, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_errval));
+	LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_padding1) == 44, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_progress_kernel, hpk_padding1));
+	LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_padding1) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_padding1));
+	LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_data_version) == 48, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_progress_kernel, hpk_data_version));
+	LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_data_version) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_data_version));
+	LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_padding2) == 56, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_progress_kernel, hpk_padding2));
+	LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_padding2) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_padding2));
+
+	/* Checks for struct hsm_user_item */
+	LASSERTF((int)sizeof(struct hsm_user_item) == 32, "found %lld\n",
+		 (long long)(int)sizeof(struct hsm_user_item));
+	LASSERTF((int)offsetof(struct hsm_user_item, hui_fid) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_user_item, hui_fid));
+	LASSERTF((int)sizeof(((struct hsm_user_item *)0)->hui_fid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_user_item *)0)->hui_fid));
+	LASSERTF((int)offsetof(struct hsm_user_item, hui_extent) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_user_item, hui_extent));
+	LASSERTF((int)sizeof(((struct hsm_user_item *)0)->hui_extent) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_user_item *)0)->hui_extent));
+
+	/* Checks for struct hsm_user_state */
+	LASSERTF((int)sizeof(struct hsm_user_state) == 32, "found %lld\n",
+		 (long long)(int)sizeof(struct hsm_user_state));
+	LASSERTF((int)offsetof(struct hsm_user_state, hus_states) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_user_state, hus_states));
+	LASSERTF((int)sizeof(((struct hsm_user_state *)0)->hus_states) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_user_state *)0)->hus_states));
+	LASSERTF((int)offsetof(struct hsm_user_state, hus_archive_id) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_user_state, hus_archive_id));
+	LASSERTF((int)sizeof(((struct hsm_user_state *)0)->hus_archive_id) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_user_state *)0)->hus_archive_id));
+	LASSERTF((int)offsetof(struct hsm_user_state, hus_in_progress_state) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_user_state, hus_in_progress_state));
+	LASSERTF((int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_state) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_state));
+	LASSERTF((int)offsetof(struct hsm_user_state, hus_in_progress_action) == 12, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_user_state, hus_in_progress_action));
+	LASSERTF((int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_action) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_action));
+	LASSERTF((int)offsetof(struct hsm_user_state, hus_in_progress_location) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_user_state, hus_in_progress_location));
+	LASSERTF((int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_location) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_location));
+
+	/* Checks for struct hsm_state_set */
+	LASSERTF((int)sizeof(struct hsm_state_set) == 24, "found %lld\n",
+		 (long long)(int)sizeof(struct hsm_state_set));
+	LASSERTF((int)offsetof(struct hsm_state_set, hss_valid) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_state_set, hss_valid));
+	LASSERTF((int)sizeof(((struct hsm_state_set *)0)->hss_valid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_state_set *)0)->hss_valid));
+	LASSERTF((int)offsetof(struct hsm_state_set, hss_archive_id) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_state_set, hss_archive_id));
+	LASSERTF((int)sizeof(((struct hsm_state_set *)0)->hss_archive_id) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_state_set *)0)->hss_archive_id));
+	LASSERTF((int)offsetof(struct hsm_state_set, hss_setmask) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_state_set, hss_setmask));
+	LASSERTF((int)sizeof(((struct hsm_state_set *)0)->hss_setmask) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_state_set *)0)->hss_setmask));
+	LASSERTF((int)offsetof(struct hsm_state_set, hss_clearmask) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_state_set, hss_clearmask));
+	LASSERTF((int)sizeof(((struct hsm_state_set *)0)->hss_clearmask) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_state_set *)0)->hss_clearmask));
+
+	/* Checks for struct hsm_current_action */
+	LASSERTF((int)sizeof(struct hsm_current_action) == 24, "found %lld\n",
+		 (long long)(int)sizeof(struct hsm_current_action));
+	LASSERTF((int)offsetof(struct hsm_current_action, hca_state) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_current_action, hca_state));
+	LASSERTF((int)sizeof(((struct hsm_current_action *)0)->hca_state) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_current_action *)0)->hca_state));
+	LASSERTF((int)offsetof(struct hsm_current_action, hca_action) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_current_action, hca_action));
+	LASSERTF((int)sizeof(((struct hsm_current_action *)0)->hca_action) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_current_action *)0)->hca_action));
+	LASSERTF((int)offsetof(struct hsm_current_action, hca_location) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_current_action, hca_location));
+	LASSERTF((int)sizeof(((struct hsm_current_action *)0)->hca_location) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_current_action *)0)->hca_location));
+
+	/* Checks for struct hsm_request */
+	LASSERTF((int)sizeof(struct hsm_request) == 24, "found %lld\n",
+		 (long long)(int)sizeof(struct hsm_request));
+	LASSERTF((int)offsetof(struct hsm_request, hr_action) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_request, hr_action));
+	LASSERTF((int)sizeof(((struct hsm_request *)0)->hr_action) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_request *)0)->hr_action));
+	LASSERTF((int)offsetof(struct hsm_request, hr_archive_id) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_request, hr_archive_id));
+	LASSERTF((int)sizeof(((struct hsm_request *)0)->hr_archive_id) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_request *)0)->hr_archive_id));
+	LASSERTF((int)offsetof(struct hsm_request, hr_flags) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_request, hr_flags));
+	LASSERTF((int)sizeof(((struct hsm_request *)0)->hr_flags) == 8, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_request *)0)->hr_flags));
+	LASSERTF((int)offsetof(struct hsm_request, hr_itemcount) == 16, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_request, hr_itemcount));
+	LASSERTF((int)sizeof(((struct hsm_request *)0)->hr_itemcount) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_request *)0)->hr_itemcount));
+	LASSERTF((int)offsetof(struct hsm_request, hr_data_len) == 20, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_request, hr_data_len));
+	LASSERTF((int)sizeof(((struct hsm_request *)0)->hr_data_len) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_request *)0)->hr_data_len));
+	LASSERTF(HSM_FORCE_ACTION == 0x00000001UL, "found 0x%.8xUL\n",
+		(unsigned)HSM_FORCE_ACTION);
+	LASSERTF(HSM_GHOST_COPY == 0x00000002UL, "found 0x%.8xUL\n",
+		(unsigned)HSM_GHOST_COPY);
+
+	/* Checks for struct hsm_user_request */
+	LASSERTF((int)sizeof(struct hsm_user_request) == 24, "found %lld\n",
+		 (long long)(int)sizeof(struct hsm_user_request));
+	LASSERTF((int)offsetof(struct hsm_user_request, hur_request) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_user_request, hur_request));
+	LASSERTF((int)sizeof(((struct hsm_user_request *)0)->hur_request) == 24, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_user_request *)0)->hur_request));
+	LASSERTF((int)offsetof(struct hsm_user_request, hur_user_item) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct hsm_user_request, hur_user_item));
+	LASSERTF((int)sizeof(((struct hsm_user_request *)0)->hur_user_item) == 0, "found %lld\n",
+		 (long long)(int)sizeof(((struct hsm_user_request *)0)->hur_user_item));
+
+	/* Checks for struct update_buf */
+	LASSERTF((int)sizeof(struct update_buf) == 8, "found %lld\n",
+		 (long long)(int)sizeof(struct update_buf));
+	LASSERTF((int)offsetof(struct update_buf, ub_magic) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct update_buf, ub_magic));
+	LASSERTF((int)sizeof(((struct update_buf *)0)->ub_magic) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct update_buf *)0)->ub_magic));
+	LASSERTF((int)offsetof(struct update_buf, ub_count) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct update_buf, ub_count));
+	LASSERTF((int)sizeof(((struct update_buf *)0)->ub_count) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct update_buf *)0)->ub_count));
+	LASSERTF((int)offsetof(struct update_buf, ub_bufs) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct update_buf, ub_bufs));
+	LASSERTF((int)sizeof(((struct update_buf *)0)->ub_bufs) == 0, "found %lld\n",
+		 (long long)(int)sizeof(((struct update_buf *)0)->ub_bufs));
+
+	/* Checks for struct update_reply */
+	LASSERTF((int)sizeof(struct update_reply) == 8, "found %lld\n",
+		 (long long)(int)sizeof(struct update_reply));
+	LASSERTF((int)offsetof(struct update_reply, ur_version) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct update_reply, ur_version));
+	LASSERTF((int)sizeof(((struct update_reply *)0)->ur_version) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct update_reply *)0)->ur_version));
+	LASSERTF((int)offsetof(struct update_reply, ur_count) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct update_reply, ur_count));
+	LASSERTF((int)sizeof(((struct update_reply *)0)->ur_count) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct update_reply *)0)->ur_count));
+	LASSERTF((int)offsetof(struct update_reply, ur_lens) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct update_reply, ur_lens));
+	LASSERTF((int)sizeof(((struct update_reply *)0)->ur_lens) == 0, "found %lld\n",
+		 (long long)(int)sizeof(((struct update_reply *)0)->ur_lens));
+
+	/* Checks for struct update */
+	LASSERTF((int)sizeof(struct update) == 56, "found %lld\n",
+		 (long long)(int)sizeof(struct update));
+	LASSERTF((int)offsetof(struct update, u_type) == 0, "found %lld\n",
+		 (long long)(int)offsetof(struct update, u_type));
+	LASSERTF((int)sizeof(((struct update *)0)->u_type) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct update *)0)->u_type));
+	LASSERTF((int)offsetof(struct update, u_batchid) == 4, "found %lld\n",
+		 (long long)(int)offsetof(struct update, u_batchid));
+	LASSERTF((int)sizeof(((struct update *)0)->u_batchid) == 4, "found %lld\n",
+		 (long long)(int)sizeof(((struct update *)0)->u_batchid));
+	LASSERTF((int)offsetof(struct update, u_fid) == 8, "found %lld\n",
+		 (long long)(int)offsetof(struct update, u_fid));
+	LASSERTF((int)sizeof(((struct update *)0)->u_fid) == 16, "found %lld\n",
+		 (long long)(int)sizeof(((struct update *)0)->u_fid));
+	LASSERTF((int)offsetof(struct update, u_lens) == 24, "found %lld\n",
+		 (long long)(int)offsetof(struct update, u_lens));
+	LASSERTF((int)sizeof(((struct update *)0)->u_lens) == 32, "found %lld\n",
+		 (long long)(int)sizeof(((struct update *)0)->u_lens));
+	LASSERTF((int)offsetof(struct update, u_bufs) == 56, "found %lld\n",
+		 (long long)(int)offsetof(struct update, u_bufs));
+	LASSERTF((int)sizeof(((struct update *)0)->u_bufs) == 0, "found %lld\n",
+		 (long long)(int)sizeof(((struct update *)0)->u_bufs));
+}

diff --git a/drivers/staging/netlogic/xlr_net.c b/drivers/staging/netlogic/xlr_net.c
index dd98cb1..46eabd0 100644
--- a/drivers/staging/netlogic/xlr_net.c
+++ b/drivers/staging/netlogic/xlr_net.c

@@ -896,7 +896,7 @@
 		return err;
 	}
 
-	pr_info("Registerd mdio bus id : %s\n", priv->mii_bus->id);
+	pr_info("Registered mdio bus id : %s\n", priv->mii_bus->id);
 	err = xlr_mii_probe(priv);
 	if (err) {
 		mdiobus_free(priv->mii_bus);
@@ -1020,12 +1020,11 @@
 		goto err_gmac;
 	}
 
-	ndev->base_addr = (unsigned long) devm_request_and_ioremap
+	ndev->base_addr = (unsigned long) devm_ioremap_resource
 		(&pdev->dev, res);
-	if (!ndev->base_addr) {
-		dev_err(&pdev->dev,
-				"devm_request_and_ioremap failed\n");
-		return -EBUSY;
+	if (IS_ERR_VALUE(ndev->base_addr)) {
+		err = ndev->base_addr;
+		goto err_gmac;
 	}
 
 	res = platform_get_resource(pdev, IORESOURCE_IRQ, 0);

diff --git a/drivers/staging/nvec/nvec.c b/drivers/staging/nvec/nvec.c
index 197c393..10393da 100644
--- a/drivers/staging/nvec/nvec.c
+++ b/drivers/staging/nvec/nvec.c

@@ -33,7 +33,6 @@
 #include <linux/mfd/core.h>
 #include <linux/mutex.h>
 #include <linux/notifier.h>
-#include <linux/platform_device.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/workqueue.h>
@@ -772,11 +771,31 @@
 	nvec_write_async(nvec_power_handle, ap_pwr_down, 2);
 }
 
+/*
+ * Fill nvec->gpio and nvec->i2c_addr from the "request-gpios" and
+ * "slave-addr" device tree properties. Returns 0, or -ENODEV if either fails.
+ */
+static int nvec_i2c_parse_dt_pdata(struct nvec_chip *nvec)
+{
+	nvec->gpio = of_get_named_gpio(nvec->dev->of_node, "request-gpios", 0);
+	if (nvec->gpio < 0) {
+		dev_err(nvec->dev, "no gpio specified\n");
+		return -ENODEV;
+	}
+
+	if (of_property_read_u32(nvec->dev->of_node, "slave-addr",
+				&nvec->i2c_addr)) {
+		dev_err(nvec->dev, "no i2c address specified\n");
+		return -ENODEV;
+	}
+
+	return 0;
+}
+
 static int tegra_nvec_probe(struct platform_device *pdev)
 {
 	int err, ret;
 	struct clk *i2c_clk;
-	struct nvec_platform_data *pdata = pdev->dev.platform_data;
 	struct nvec_chip *nvec;
 	struct nvec_msg *msg;
 	struct resource *res;
@@ -785,6 +804,11 @@
 		unmute_speakers[] = { NVEC_OEM0, 0x10, 0x59, 0x95 },
 		enable_event[7] = { NVEC_SYS, CNF_EVENT_REPORTING, true };
 
+	if(!pdev->dev.of_node) {
+		dev_err(&pdev->dev, "must be instantiated using device tree\n");
+		return -ENODEV;
+	}
+
 	nvec = devm_kzalloc(&pdev->dev, sizeof(struct nvec_chip), GFP_KERNEL);
 	if (nvec == NULL) {
 		dev_err(&pdev->dev, "failed to reserve memory\n");
@@ -793,25 +817,9 @@
 	platform_set_drvdata(pdev, nvec);
 	nvec->dev = &pdev->dev;
 
-	if (pdata) {
-		nvec->gpio = pdata->gpio;
-		nvec->i2c_addr = pdata->i2c_addr;
-	} else if (nvec->dev->of_node) {
-		nvec->gpio = of_get_named_gpio(nvec->dev->of_node,
-					"request-gpios", 0);
-		if (nvec->gpio < 0) {
-			dev_err(&pdev->dev, "no gpio specified");
-			return -ENODEV;
-		}
-		if (of_property_read_u32(nvec->dev->of_node,
-					"slave-addr", &nvec->i2c_addr)) {
-			dev_err(&pdev->dev, "no i2c address specified");
-			return -ENODEV;
-		}
-	} else {
-		dev_err(&pdev->dev, "no platform data\n");
-		return -ENODEV;
-	}
+	err = nvec_i2c_parse_dt_pdata(nvec);
+	if (err < 0)
+		return err;
 
 	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
 	base = devm_ioremap_resource(&pdev->dev, res);

diff --git a/drivers/staging/nvec/nvec.h b/drivers/staging/nvec/nvec.h
index 2b1316d..e880518 100644
--- a/drivers/staging/nvec/nvec.h
+++ b/drivers/staging/nvec/nvec.h

@@ -103,31 +103,6 @@
 };
 
 /**
- * struct nvec_subdev - A subdevice of nvec, such as nvec_kbd
- * @name: The name of the sub device
- * @platform_data: Platform data
- * @id: Identifier of the sub device
- */
-struct nvec_subdev {
-	const char *name;
-	void *platform_data;
-	int id;
-};
-
-/**
- * struct nvec_platform_data - platform data for a tegra slave controller
- * @i2c_addr: number of i2c slave adapter the ec is connected to
- * @gpio: gpio number for the ec request line
- *
- * Platform data, to be used in board definitions. For an example, take a
- * look at the paz00 board in arch/arm/mach-tegra/board-paz00.c
- */
-struct nvec_platform_data {
-	int i2c_addr;
-	int gpio;
-};
-
-/**
  * struct nvec_chip - A single connection to an NVIDIA Embedded controller
  * @dev: The device
  * @gpio: The same as for &struct nvec_platform_data

diff --git a/drivers/staging/nvec/nvec_kbd.c b/drivers/staging/nvec/nvec_kbd.c
index a0ec52a..c17a1c3 100644
--- a/drivers/staging/nvec/nvec_kbd.c
+++ b/drivers/staging/nvec/nvec_kbd.c

@@ -126,7 +126,7 @@
 	for (i = 0; i < ARRAY_SIZE(extcode_tab_us102); ++i)
 		keycodes[j++] = extcode_tab_us102[i];
 
-	idev = input_allocate_device();
+	idev = devm_input_allocate_device(&pdev->dev);
 	idev->name = "nvec keyboard";
 	idev->phys = "nvec";
 	idev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_REP) | BIT_MASK(EV_LED);
@@ -142,7 +142,7 @@
 	clear_bit(0, idev->keybit);
 	err = input_register_device(idev);
 	if (err)
-		goto fail;
+		return err;
 
 	keys_dev.input = idev;
 	keys_dev.notifier.notifier_call = nvec_keys_notifier;
@@ -161,10 +161,6 @@
 	nvec_write_async(nvec, clear_leds, sizeof(clear_leds));
 
 	return 0;
-
-fail:
-	input_free_device(idev);
-	return err;
 }
 
 static int nvec_kbd_remove(struct platform_device *pdev)
@@ -177,8 +173,6 @@
 	nvec_write_async(nvec, disable_kbd, 2);
 	nvec_unregister_notifier(nvec, &keys_dev.notifier);
 
-	input_unregister_device(keys_dev.input);
-
 	return 0;
 }
 

diff --git a/drivers/staging/octeon-usb/Kconfig b/drivers/staging/octeon-usb/Kconfig
new file mode 100644
index 0000000..018af6d
--- /dev/null
+++ b/drivers/staging/octeon-usb/Kconfig

@@ -0,0 +1,10 @@
+config OCTEON_USB
+	tristate "Cavium Networks Octeon USB support"
+	depends on CPU_CAVIUM_OCTEON && USB
+	help
+	  This driver supports USB host controller on some Cavium
+	  Networks' products in the Octeon family.
+
+	  To compile this driver as a module, choose M here. The module
+	  will be called octeon-usb.
+

diff --git a/drivers/staging/octeon-usb/Makefile b/drivers/staging/octeon-usb/Makefile
new file mode 100644
index 0000000..89df1ad
--- /dev/null
+++ b/drivers/staging/octeon-usb/Makefile

@@ -0,0 +1,3 @@
+obj-${CONFIG_OCTEON_USB} := octeon-usb.o
+octeon-usb-y := octeon-hcd.o
+octeon-usb-y += cvmx-usb.o

diff --git a/drivers/staging/octeon-usb/TODO b/drivers/staging/octeon-usb/TODO
new file mode 100644
index 0000000..cc58a7e
--- /dev/null
+++ b/drivers/staging/octeon-usb/TODO

@@ -0,0 +1,11 @@
+This driver is functional and has been tested on EdgeRouter Lite with
+USB mass storage.
+
+TODO:
+	- kernel coding style
+	- checkpatch warnings
+	- dead code elimination
+	- device tree bindings
+	- possibly eliminate the extra "hardware abstraction layer"
+
+Contact: Aaro Koskinen <aaro.koskinen@iki.fi>

diff --git a/drivers/staging/octeon-usb/cvmx-usb.c b/drivers/staging/octeon-usb/cvmx-usb.c
new file mode 100644
index 0000000..bf36649
--- /dev/null
+++ b/drivers/staging/octeon-usb/cvmx-usb.c

@@ -0,0 +1,3229 @@
+/***********************license start***************
+ * Copyright (c) 2003-2010  Cavium Networks (support@cavium.com). All rights
+ * reserved.
+ *
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *   * Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above
+ *     copyright notice, this list of conditions and the following
+ *     disclaimer in the documentation and/or other materials provided
+ *     with the distribution.
+
+ *   * Neither the name of Cavium Networks nor the names of
+ *     its contributors may be used to endorse or promote products
+ *     derived from this software without specific prior written
+ *     permission.
+
+ * This Software, including technical data, may be subject to U.S. export  control
+ * laws, including the U.S. Export Administration Act and its  associated
+ * regulations, and may be subject to export or import  regulations in other
+ * countries.
+
+ * TO THE MAXIMUM EXTENT PERMITTED BY LAW, THE SOFTWARE IS PROVIDED "AS IS"
+ * AND WITH ALL FAULTS AND CAVIUM  NETWORKS MAKES NO PROMISES, REPRESENTATIONS OR
+ * WARRANTIES, EITHER EXPRESS, IMPLIED, STATUTORY, OR OTHERWISE, WITH RESPECT TO
+ * THE SOFTWARE, INCLUDING ITS CONDITION, ITS CONFORMITY TO ANY REPRESENTATION OR
+ * DESCRIPTION, OR THE EXISTENCE OF ANY LATENT OR PATENT DEFECTS, AND CAVIUM
+ * SPECIFICALLY DISCLAIMS ALL IMPLIED (IF ANY) WARRANTIES OF TITLE,
+ * MERCHANTABILITY, NONINFRINGEMENT, FITNESS FOR A PARTICULAR PURPOSE, LACK OF
+ * VIRUSES, ACCURACY OR COMPLETENESS, QUIET ENJOYMENT, QUIET POSSESSION OR
+ * CORRESPONDENCE TO DESCRIPTION. THE ENTIRE  RISK ARISING OUT OF USE OR
+ * PERFORMANCE OF THE SOFTWARE LIES WITH YOU.
+ ***********************license end**************************************/
+
+
+/**
+ * @file
+ *
+ * "cvmx-usb.c" defines a set of low level USB functions to help
+ * developers create Octeon USB drivers for various operating
+ * systems. These functions provide a generic API to the Octeon
+ * USB blocks, hiding the internal hardware specific
+ * operations.
+ *
+ * <hr>$Revision: 32636 $<hr>
+ */
+#include <linux/delay.h>
+#include <asm/octeon/cvmx.h>
+#include <asm/octeon/octeon.h>
+#include <asm/octeon/cvmx-sysinfo.h>
+#include "cvmx-usbnx-defs.h"
+#include "cvmx-usbcx-defs.h"
+#include "cvmx-usb.h"
+#include <asm/octeon/cvmx-helper.h>
+#include <asm/octeon/cvmx-helper-board.h>
+
+#define CVMX_PREFETCH0(address) CVMX_PREFETCH(address, 0)
+#define CVMX_PREFETCH128(address) CVMX_PREFETCH(address, 128)
+/* Default prefetch: expands to the "pref" instruction with hint 0 */
+#define CVMX_PREFETCH(address, offset) CVMX_PREFETCH_PREF0(address, offset)
+/* Prefetches built on the "pref" instruction; X (the hint) and offset are passed as immediate operands */
+#define CVMX_PREFETCH_PREFX(X, address, offset) asm volatile ("pref %[type], %[off](%[rbase])" : : [rbase] "d" (address), [off] "I" (offset), [type] "n" (X))
+#define CVMX_PREFETCH_PREF0(address, offset) CVMX_PREFETCH_PREFX(0, address, offset)
+#define CVMX_CLZ(result, input) asm ("clz %[rd],%[rs]" : [rd] "=d" (result) : [rs] "d" (input))
+
+#define cvmx_likely likely
+#define cvmx_wait_usec udelay
+#define cvmx_unlikely unlikely
+#define cvmx_le16_to_cpu le16_to_cpu
+
+#define MAX_RETRIES         3   /* Maximum number of times to retry failed transactions */
+#define MAX_PIPES           32  /* Maximum number of pipes that can be open at once */
+#define MAX_TRANSACTIONS    256 /* Maximum number of outstanding transactions across all pipes */
+#define MAX_CHANNELS        8   /* Maximum number of hardware channels supported by the USB block */
+#define MAX_USB_ADDRESS     127 /* The highest valid USB device address */
+#define MAX_USB_ENDPOINT    15  /* The highest valid USB endpoint number */
+#define MAX_USB_HUB_PORT    15  /* The highest valid port number on a hub */
+#define MAX_TRANSFER_BYTES  ((1<<19)-1) /* The low level hardware can transfer a maximum of this number of bytes in each transfer. The field is 19 bits wide */
+#define MAX_TRANSFER_PACKETS ((1<<10)-1) /* The low level hardware can transfer a maximum of this number of packets in each transfer. The field is 10 bits wide */
+
+/* These defines replace the generic CSR accessors with placeholder names so
+    this file cannot call them by mistake. Use __cvmx_usb_read_csr64() and
+    __cvmx_usb_write_csr64(), where USB specific debug code can be added */
+#define cvmx_read_csr use_cvmx_usb_read_csr64_instead_of_cvmx_read_csr
+#define cvmx_write_csr use_cvmx_usb_write_csr64_instead_of_cvmx_write_csr
+
+typedef enum {
+    __CVMX_USB_TRANSACTION_FLAGS_IN_USE = 1<<16, /**< Set on every transaction handed out by __cvmx_usb_alloc_transaction() */
+} cvmx_usb_transaction_flags_t;
+
+enum { /* Clock source kinds; octeon_usb_get_clock_type() returns one of these */
+	USB_CLOCK_TYPE_REF_12,
+	USB_CLOCK_TYPE_REF_24,
+	USB_CLOCK_TYPE_REF_48,
+	USB_CLOCK_TYPE_CRYSTAL_12,
+};
+
+/**
+ * Logical transactions may take numerous low level
+ * transactions, especially when splits are concerned. This
+ * enum represents all of the possible stages a transaction can
+ * be in. Note that split completes are always odd. This is so
+ * the NAK handler can back up to the previous low level
+ * transaction with a simple clearing of bit 0.
+ */
+typedef enum {
+    CVMX_USB_STAGE_NON_CONTROL,
+    CVMX_USB_STAGE_NON_CONTROL_SPLIT_COMPLETE,
+    CVMX_USB_STAGE_SETUP,
+    CVMX_USB_STAGE_SETUP_SPLIT_COMPLETE,
+    CVMX_USB_STAGE_DATA,
+    CVMX_USB_STAGE_DATA_SPLIT_COMPLETE,
+    CVMX_USB_STAGE_STATUS,
+    CVMX_USB_STAGE_STATUS_SPLIT_COMPLETE,
+} cvmx_usb_stage_t;
+
+/**
+ * This structure describes each pending USB transaction
+ * regardless of type. These are linked together to form a list
+ * of pending requests for a pipe.
+ */
+typedef struct cvmx_usb_transaction {
+    struct cvmx_usb_transaction *prev;  /**< Transaction before this one in the pipe */
+    struct cvmx_usb_transaction *next;  /**< Transaction after this one in the pipe */
+    cvmx_usb_transfer_t type;           /**< Type of transaction, duplicated from the pipe */
+    cvmx_usb_transaction_flags_t flags; /**< State flags for this transaction */
+    uint64_t buffer;                    /**< User's physical buffer address to read/write */
+    int buffer_length;                  /**< Size of the user's buffer in bytes */
+    uint64_t control_header;            /**< For control transactions, physical address of the 8 byte standard header */
+    int iso_start_frame;                /**< For ISO transactions, the starting frame number */
+    int iso_number_packets;             /**< For ISO transactions, the number of packets in the request */
+    cvmx_usb_iso_packet_t *iso_packets; /**< For ISO transactions, the sub packets in the request */
+    int xfersize;
+    int pktcnt;
+    int retries;                        /**< NOTE(review): undocumented; presumably a retry counter bounded by MAX_RETRIES - confirm */
+    int actual_bytes;                   /**< Actual bytes transferred for this transaction */
+    cvmx_usb_stage_t stage;             /**< For control transactions, the current stage */
+    cvmx_usb_callback_func_t callback;  /**< User's callback function when complete */
+    void *callback_data;                /**< User's data */
+} cvmx_usb_transaction_t;
+
+/**
+ * A pipe represents a virtual connection between Octeon and some
+ * USB device. It contains a list of pending requests for the device.
+ */
+typedef struct cvmx_usb_pipe {
+    struct cvmx_usb_pipe *prev;         /**< Pipe before this one in the list */
+    struct cvmx_usb_pipe *next;         /**< Pipe after this one in the list */
+    cvmx_usb_transaction_t *head;       /**< The first pending transaction */
+    cvmx_usb_transaction_t *tail;       /**< The last pending transaction */
+    uint64_t interval;                  /**< For periodic pipes, the interval between packets in frames */
+    uint64_t next_tx_frame;             /**< The next frame this pipe is allowed to transmit on */
+    cvmx_usb_pipe_flags_t flags;        /**< State flags for this pipe */
+    cvmx_usb_speed_t device_speed;      /**< Speed of device connected to this pipe */
+    cvmx_usb_transfer_t transfer_type;  /**< Type of transaction supported by this pipe */
+    cvmx_usb_direction_t transfer_dir;  /**< IN or OUT. Ignored for Control */
+    int multi_count;                    /**< Max packets in a row for the device */
+    uint16_t max_packet;                /**< The device's maximum packet size in bytes */
+    uint8_t device_addr;                /**< USB device address at other end of pipe */
+    uint8_t endpoint_num;               /**< USB endpoint number at other end of pipe */
+    uint8_t hub_device_addr;            /**< Hub address this device is connected to */
+    uint8_t hub_port;                   /**< Hub port this device is connected to */
+    uint8_t pid_toggle;                 /**< This toggles between 0/1 on every packet sent to track the data pid needed */
+    uint8_t channel;                    /**< Hardware DMA channel for this pipe */
+    int8_t  split_sc_frame;             /**< The low order bits of the frame number the split complete should be sent on */
+} cvmx_usb_pipe_t;
+
+typedef struct {
+    cvmx_usb_pipe_t *head;              /**< Head of the list, or NULL if empty */
+    cvmx_usb_pipe_t *tail;              /**< Tail of the list, or NULL if empty */
+} cvmx_usb_pipe_list_t;
+
+typedef struct { /* TX FIFO bookkeeping; cvmx_usb_internal_state_t holds one as "periodic" and one as "nonperiodic" */
+    struct {
+        int channel;
+        int size;
+        uint64_t address;
+    } entry[MAX_CHANNELS+1];
+    int head; /* NOTE(review): head/tail presumably index entry[] as a ring - confirm against the users */
+    int tail;
+} cvmx_usb_tx_fifo_t;
+
+/**
+ * The state of the USB block is stored in this structure
+ */
+typedef struct {
+    int init_flags;                     /**< Flags passed to initialize; the CVMX_USB_INITIALIZE_FLAGS_DEBUG_* bits gate debug output */
+    int index;                          /**< Which USB block this is for */
+    int idle_hardware_channels;         /**< Bit set for every idle hardware channel */
+    cvmx_usbcx_hprt_t usbcx_hprt;       /**< Stored port status so we don't need to read a CSR to determine splits */
+    cvmx_usb_pipe_t *pipe_for_channel[MAX_CHANNELS];    /**< Map channels to pipes */
+    cvmx_usb_transaction_t *free_transaction_head;      /**< List of free transactions head */
+    cvmx_usb_transaction_t *free_transaction_tail;      /**< List of free transactions tail */
+    cvmx_usb_pipe_t pipe[MAX_PIPES];                    /**< Storage for pipes */
+    cvmx_usb_transaction_t transaction[MAX_TRANSACTIONS];       /**< Storage for transactions */
+    cvmx_usb_callback_func_t callback[__CVMX_USB_CALLBACK_END]; /**< User global callbacks */
+    void *callback_data[__CVMX_USB_CALLBACK_END];               /**< User data for each callback */
+    int indent;                         /**< Debug output nesting depth: bumped by CVMX_USB_LOG_CALLED, dropped by CVMX_USB_RETURN* */
+    cvmx_usb_port_status_t port_status; /**< Last port status used for change notification */
+    cvmx_usb_pipe_list_t free_pipes;    /**< List of all pipes that are currently closed */
+    cvmx_usb_pipe_list_t idle_pipes;    /**< List of open pipes that have no transactions */
+    cvmx_usb_pipe_list_t active_pipes[4]; /**< Active pipes indexed by transfer type */
+    uint64_t frame_number;              /**< Increments every SOF interrupt for time keeping */
+    cvmx_usb_transaction_t *active_split; /**< Points to the current active split, or NULL */
+    cvmx_usb_tx_fifo_t periodic;        /**< NOTE(review): presumably TX FIFO state for periodic transfers - confirm */
+    cvmx_usb_tx_fifo_t nonperiodic;     /**< NOTE(review): presumably TX FIFO state for nonperiodic transfers - confirm */
+} cvmx_usb_internal_state_t;
+
+/* Logs "<function>: called" and then bumps usb->indent, but only when DEBUG_CALLS is set in usb->init_flags */
+#define CVMX_USB_LOG_CALLED() \
+    if (cvmx_unlikely(usb->init_flags & CVMX_USB_INITIALIZE_FLAGS_DEBUG_CALLS)) \
+        cvmx_dprintf("%*s%s: called\n", 2*usb->indent++, "", __FUNCTION__);
+
+/* Logs one parameter as "param <name> = <value>" using the given printf format; active only with DEBUG_CALLS set */
+#define CVMX_USB_LOG_PARAM(format, param) \
+    if (cvmx_unlikely(usb->init_flags & CVMX_USB_INITIALIZE_FLAGS_DEBUG_CALLS)) \
+        cvmx_dprintf("%*s%s: param %s = " format "\n", 2*usb->indent, "", __FUNCTION__, #param, param);
+
+/* Returns v from the caller; with DEBUG_CALLS set it first drops usb->indent and logs v via %d. Expands with a trailing ';' */
+#define CVMX_USB_RETURN(v)                                              \
+    do {                                                                \
+        typeof(v) r = v;                                                \
+        if (cvmx_unlikely(usb->init_flags & CVMX_USB_INITIALIZE_FLAGS_DEBUG_CALLS))    \
+            cvmx_dprintf("%*s%s: returned %s(%d)\n", 2*--usb->indent, "", __FUNCTION__, #v, r); \
+        return r;                                                       \
+    } while (0);
+
+/* Returns from a void caller; with DEBUG_CALLS set it first drops usb->indent and logs it. Expands with a trailing ';' */
+#define CVMX_USB_RETURN_NOTHING()                                       \
+    do {                                                                \
+        if (cvmx_unlikely(usb->init_flags & CVMX_USB_INITIALIZE_FLAGS_DEBUG_CALLS))    \
+            cvmx_dprintf("%*s%s: returned\n", 2*--usb->indent, "", __FUNCTION__); \
+        return;                                                         \
+    } while (0);
+
+/* Spins until (c.s.field op value) holds for the 32bit CSR at "address": 0 on
+    success, -1 after timeout_usec (parenthesized, so it may be an expression) */
+#define CVMX_WAIT_FOR_FIELD32(address, type, field, op, value, timeout_usec)\
+    ({int result;                                                       \
+    do {                                                                \
+        uint64_t done = cvmx_get_cycle() + (uint64_t)(timeout_usec) *   \
+			octeon_get_clock_rate() / 1000000;		\
+        type c;                                                         \
+        while (1) {                                                     \
+            c.u32 = __cvmx_usb_read_csr32(usb, address);                \
+            if (c.s.field op (value)) {                                 \
+                result = 0;                                             \
+                break;                                                  \
+            } else if (cvmx_get_cycle() > done) {                       \
+                result = -1;                                            \
+                break;                                                  \
+            } else                                                      \
+                cvmx_wait(100);                                         \
+        }                                                               \
+    } while (0);                                                        \
+    result;})
+
+/* Read-modify-write of one field of a 32bit CSR: reads "address", assigns
+    c.s.field and writes the word back. Needs "usb" in the caller's scope */
+#define USB_SET_FIELD32(address, type, field, value)\
+    do {                                            \
+        type c;                                     \
+        c.u32 = __cvmx_usb_read_csr32(usb, address);\
+        c.s.field = value;                          \
+        __cvmx_usb_write_csr32(usb, address, c.u32);\
+    } while (0)
+
+/* IO address used to push/pop FIFO data for a channel: CVMX_USBCX_GOTGCTL(usb_index) + (channel+1)*0x1000 */
+#define USB_FIFO_ADDRESS(channel, usb_index) (CVMX_USBCX_GOTGCTL(usb_index) + ((channel)+1)*0x1000)
+
+/* Boards matched below get USB_CLOCK_TYPE_CRYSTAL_12; all others USB_CLOCK_TYPE_REF_48 */
+static int octeon_usb_get_clock_type(void)
+{
+	switch (cvmx_sysinfo_get()->board_type) {
+	case CVMX_BOARD_TYPE_BBGW_REF:
+	case CVMX_BOARD_TYPE_LANAI2_A:
+	case CVMX_BOARD_TYPE_LANAI2_U:
+	case CVMX_BOARD_TYPE_LANAI2_G:
+		return USB_CLOCK_TYPE_CRYSTAL_12;
+	}
+
+	if (!OCTEON_IS_MODEL(OCTEON_CN50XX))
+		return USB_CLOCK_TYPE_REF_48;
+	/* FIXME: This should use CVMX_BOARD_TYPE_UBNT_E100 */
+	return cvmx_sysinfo_get()->board_type == 20002 ?
+		USB_CLOCK_TYPE_CRYSTAL_12 : USB_CLOCK_TYPE_REF_48;
+}
+
+/**
+ * @INTERNAL
+ * Read a USB 32bit CSR. The address is XORed with 4 before
+ * the access, which is the address swizzle used for 32bit
+ * CSRs. The value is returned as read; no logging is done
+ * here.
+ *
+ * @param usb     USB block this access is for (unused)
+ * @param address 64bit address to read
+ *
+ * @return Result of the read
+ */
+static inline uint32_t __cvmx_usb_read_csr32(cvmx_usb_internal_state_t *usb,
+                                             uint64_t address)
+{
+    return cvmx_read64_uint32(address ^ 4);
+}
+
+
+/**
+ * @INTERNAL
+ * Write a USB 32bit CSR at the swizzled address (address ^ 4),
+ * then read CVMX_USBNX_DMA0_INB_CHN0 of this USB block.
+ * NOTE(review): the read presumably flushes the write - confirm.
+ * @param usb     USB block this access is for
+ * @param address 64bit address to write
+ * @param value   Value to write
+ */
+static inline void __cvmx_usb_write_csr32(cvmx_usb_internal_state_t *usb,
+                                          uint64_t address, uint32_t value)
+{
+    const uint64_t swizzled = address ^ 4;
+    cvmx_write64_uint32(swizzled, value);
+    cvmx_read64_uint64(CVMX_USBNX_DMA0_INB_CHN0(usb->index));
+}
+
+
+/**
+ * @INTERNAL
+ * Read a USB 64bit CSR. This is a direct
+ * cvmx_read64_uint64() of the address; no logging is done
+ * here.
+ *
+ * @param usb     USB block this access is for (unused)
+ * @param address 64bit address to read
+ *
+ * @return Result of the read
+ */
+static inline uint64_t __cvmx_usb_read_csr64(cvmx_usb_internal_state_t *usb,
+                                             uint64_t address)
+{
+    return cvmx_read64_uint64(address);
+}
+
+
+/**
+ * @INTERNAL
+ * Write a USB 64bit CSR by calling cvmx_write64_uint64().
+ * No logging is done here.
+ *
+ * @param usb     USB block this access is for (unused)
+ * @param address 64bit address to write
+ * @param value   Value to write
+ */
+static inline void __cvmx_usb_write_csr64(cvmx_usb_internal_state_t *usb,
+                                          uint64_t address, uint64_t value)
+{
+    cvmx_write64_uint64(address, value);
+}
+
+
+/**
+ * @INTERNAL
+ * Translate a completion code into a short printable name
+ *
+ * @param complete_code
+ *               Code to convert
+ *
+ * @return Name of the code, or a reminder text for unknown codes
+ */
+static const char *__cvmx_usb_complete_to_string(cvmx_usb_complete_t complete_code)
+{
+    const char *name = "Update __cvmx_usb_complete_to_string";
+    switch (complete_code) {
+        case CVMX_USB_COMPLETE_SUCCESS:    name = "SUCCESS";    break;
+        case CVMX_USB_COMPLETE_SHORT:      name = "SHORT";      break;
+        case CVMX_USB_COMPLETE_CANCEL:     name = "CANCEL";     break;
+        case CVMX_USB_COMPLETE_ERROR:      name = "ERROR";      break;
+        case CVMX_USB_COMPLETE_STALL:      name = "STALL";      break;
+        case CVMX_USB_COMPLETE_XACTERR:    name = "XACTERR";    break;
+        case CVMX_USB_COMPLETE_DATATGLERR: name = "DATATGLERR"; break;
+        case CVMX_USB_COMPLETE_BABBLEERR:  name = "BABBLEERR";  break;
+        case CVMX_USB_COMPLETE_FRAMEERR:   name = "FRAMEERR";   break;
+    }
+    return name;
+}
+
+
+/**
+ * @INTERNAL
+ * Decide whether transfers on a pipe must use split transactions:
+ * the device is not high speed but the stored port status is.
+ * @param usb    USB block this access is for
+ * @param pipe   Pipe to check
+ * @return 1 if split transactions are needed, otherwise 0
+ */
+static inline int __cvmx_usb_pipe_needs_split(cvmx_usb_internal_state_t *usb, cvmx_usb_pipe_t *pipe)
+{
+    if (pipe->device_speed == CVMX_USB_SPEED_HIGH)
+        return 0;
+    return usb->usbcx_hprt.s.prtspd == CVMX_USB_SPEED_HIGH;
+}
+
+
+/**
+ * @INTERNAL
+ * Trivial utility function to translate the pipe's data toggle
+ * into the PID value to use.
+ *
+ * @param pipe   pipe to check
+ *
+ * @return PID for pipe:
+ *         2 (Data1) if pid_toggle is non zero,
+ *         0 (Data0) otherwise
+ */
+static inline int __cvmx_usb_get_data_pid(cvmx_usb_pipe_t *pipe)
+{
+    return pipe->pid_toggle ? 2 : 0;
+}
+
+
+/**
+ * Report how many USB ports this Octeon chip provides. Models
+ * not listed below report zero. Of the listed models only
+ * CN52XX reports two ports; the others report one.
+ * cvmx_usb_initialize() must be called on independent
+ * cvmx_usb_state_t structures.
+ *
+ * @return Number of ports, zero if usb isn't supported
+ */
+int cvmx_usb_get_num_ports(void)
+{
+    /* Models are tested in the same order as before */
+    if (OCTEON_IS_MODEL(OCTEON_CN56XX))
+        return 1;
+    if (OCTEON_IS_MODEL(OCTEON_CN52XX))
+        return 2;
+    if (OCTEON_IS_MODEL(OCTEON_CN50XX))
+        return 1;
+    if (OCTEON_IS_MODEL(OCTEON_CN31XX))
+        return 1;
+    if (OCTEON_IS_MODEL(OCTEON_CN30XX))
+        return 1;
+
+    /* Any other model is treated as having no usable USB port */
+    return 0;
+}
+
+
+/**
+ * @INTERNAL
+ * Take a transaction off the head of the free list, zero it and
+ * mark it as in use.
+ *
+ * @param usb    USB device state populated by
+ *               cvmx_usb_initialize().
+ *
+ * @return Transaction, or NULL when the free list is empty
+ */
+static inline cvmx_usb_transaction_t *__cvmx_usb_alloc_transaction(cvmx_usb_internal_state_t *usb)
+{
+    cvmx_usb_transaction_t *entry = usb->free_transaction_head;
+
+    /* Nothing left to hand out; optionally report it and give up */
+    if (!entry) {
+        if (cvmx_unlikely(usb->init_flags & CVMX_USB_INITIALIZE_FLAGS_DEBUG_INFO))
+            cvmx_dprintf("%s: Failed to allocate a transaction\n", __FUNCTION__);
+        return NULL;
+    }
+
+    /* Unlink from the head; an emptied list also loses its tail */
+    usb->free_transaction_head = entry->next;
+    if (!usb->free_transaction_head)
+        usb->free_transaction_tail = NULL;
+
+    /* Hand back a clean entry that is flagged as in use */
+    memset(entry, 0, sizeof(*entry));
+    entry->flags = __CVMX_USB_TRANSACTION_FLAGS_IN_USE;
+    return entry;
+}
+
+
+/**
+ * @INTERNAL
+ * Return a transaction to the tail of the free list, clearing
+ * its flags and list links first.
+ *
+ * @param usb    USB device state populated by
+ *               cvmx_usb_initialize().
+ * @param transaction
+ *               Transaction to free
+ */
+static inline void __cvmx_usb_free_transaction(cvmx_usb_internal_state_t *usb,
+                                        cvmx_usb_transaction_t *transaction)
+{
+    cvmx_usb_transaction_t *old_tail = usb->free_transaction_tail;
+
+    /* Scrub the entry before it goes back on the list */
+    transaction->flags = 0;
+    transaction->prev = NULL;
+    transaction->next = NULL;
+
+    /* Chain behind the current tail, or start the list if it is empty */
+    if (old_tail)
+        old_tail->next = transaction;
+    else
+        usb->free_transaction_head = transaction;
+
+    usb->free_transaction_tail = transaction;
+}
+
+
+/**
+ * @INTERNAL
+ * Link a pipe in as the new last element of a pipe list
+ * @param list   List to add pipe to
+ * @param pipe   Pipe to add
+ */
+static inline void __cvmx_usb_append_pipe(cvmx_usb_pipe_list_t *list, cvmx_usb_pipe_t *pipe)
+{
+    cvmx_usb_pipe_t *last = list->tail;
+
+    /* The new element has no successor and follows the old tail */
+    pipe->next = NULL;
+    pipe->prev = last;
+
+    /* An empty list gets the pipe as its head as well */
+    if (last)
+        last->next = pipe;
+    else
+        list->head = pipe;
+
+    list->tail = pipe;
+}
+
+
+/**
+ * @INTERNAL
+ * Unlink a pipe from a pipe list. The pipe is handled as the
+ * list head, the list tail, or an interior element, in that
+ * order of checking.
+ * @param list   List to remove pipe from
+ * @param pipe   Pipe to remove
+ */
+static inline void __cvmx_usb_remove_pipe(cvmx_usb_pipe_list_t *list, cvmx_usb_pipe_t *pipe)
+{
+    /* Case 1: the pipe is the first element */
+    if (pipe == list->head) {
+        list->head = pipe->next;
+        pipe->next = NULL;
+        if (list->head)
+            list->head->prev = NULL;
+        else
+            list->tail = NULL; /* it was the only element */
+        return;
+    }
+
+    /* Case 2: the pipe is the last element (and not the first) */
+    if (pipe == list->tail) {
+        list->tail = pipe->prev;
+        list->tail->next = NULL;
+        pipe->prev = NULL;
+        return;
+    }
+
+    /* Case 3: interior element, both neighbours are dereferenced */
+    pipe->prev->next = pipe->next;
+    pipe->next->prev = pipe->prev;
+    pipe->prev = NULL;
+    pipe->next = NULL;
+}
+
+
+/**
+ * Initialize a USB port for use. This must be called before any
+ * other access to the Octeon USB port is made. The port starts
+ * off in the disabled state.
+ *
+ * The whole internal state structure is cleared before the free
+ * transaction and free pipe lists are built, so the caller does
+ * not need to zero it beforehand.
+ *
+ * @param state  Pointer to an empty cvmx_usb_state_t structure
+ *               that will be populated by the initialize call.
+ *               This structure is then passed to all other USB
+ *               functions.
+ * @param usb_port_number
+ *               Which Octeon USB port to initialize.
+ * @param flags  Flags to control hardware initialization. See
+ *               cvmx_usb_initialize_flags_t for the flag
+ *               definitions. Some flags are mandatory.
+ *
+ * @return CVMX_USB_SUCCESS or a negative error code defined in
+ *         cvmx_usb_status_t. CVMX_USB_INVALID_PARAM is returned
+ *         for an out of range port number or when the reference
+ *         clock frequency cannot be determined.
+ */
+cvmx_usb_status_t cvmx_usb_initialize(cvmx_usb_state_t *state,
+                                      int usb_port_number,
+                                      cvmx_usb_initialize_flags_t flags)
+{
+    cvmx_usbnx_clk_ctl_t usbn_clk_ctl;
+    cvmx_usbnx_usbp_ctl_status_t usbn_usbp_ctl_status;
+    cvmx_usb_internal_state_t *usb = (cvmx_usb_internal_state_t*)state;
+
+    /* Stored up front; presumably the logging macros below consult the
+        debug bits in init_flags - confirm against their definitions */
+    usb->init_flags = flags;
+    CVMX_USB_LOG_CALLED();
+    CVMX_USB_LOG_PARAM("%p", state);
+    CVMX_USB_LOG_PARAM("%d", usb_port_number);
+    CVMX_USB_LOG_PARAM("0x%x", flags);
+
+    /* Make sure that state is large enough to store the internal state */
+    if (sizeof(*state) < sizeof(*usb))
+        CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+    /* At first allow 0-1 for the usb port number */
+    if ((usb_port_number < 0) || (usb_port_number > 1))
+        CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+    /* For all chips except 52XX there is only one port */
+    if (!OCTEON_IS_MODEL(OCTEON_CN52XX) && (usb_port_number > 0))
+        CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+    /* Try to determine clock type automatically */
+    if ((flags & (CVMX_USB_INITIALIZE_FLAGS_CLOCK_XO_XI |
+                  CVMX_USB_INITIALIZE_FLAGS_CLOCK_XO_GND)) == 0) {
+        if (octeon_usb_get_clock_type() == USB_CLOCK_TYPE_CRYSTAL_12)
+            flags |= CVMX_USB_INITIALIZE_FLAGS_CLOCK_XO_XI;  /* Only 12 MHZ crystals are supported */
+        else
+            flags |= CVMX_USB_INITIALIZE_FLAGS_CLOCK_XO_GND;
+    }
+
+    if (flags & CVMX_USB_INITIALIZE_FLAGS_CLOCK_XO_GND) {
+        /* Check for auto ref clock frequency */
+        if (!(flags & CVMX_USB_INITIALIZE_FLAGS_CLOCK_MHZ_MASK))
+            switch (octeon_usb_get_clock_type()) {
+                case USB_CLOCK_TYPE_REF_12:
+                    flags |= CVMX_USB_INITIALIZE_FLAGS_CLOCK_12MHZ;
+                    break;
+                case USB_CLOCK_TYPE_REF_24:
+                    flags |= CVMX_USB_INITIALIZE_FLAGS_CLOCK_24MHZ;
+                    break;
+                case USB_CLOCK_TYPE_REF_48:
+                    flags |= CVMX_USB_INITIALIZE_FLAGS_CLOCK_48MHZ;
+                    break;
+                default:
+                    CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+                    break;
+            }
+    }
+
+    /* Clear the entire internal state. The size must be that of the
+        structure (sizeof(*usb)); sizeof(usb) is only the size of the
+        pointer and would leave most of the state, including the list
+        heads used below, holding whatever the caller's memory contained.
+        NOTE(review): this also clears any bookkeeping the CVMX_USB_LOG_*
+        macros keep in the structure - confirm they tolerate that. */
+    memset(usb, 0, sizeof(*usb));
+    usb->init_flags = flags;
+
+    /* Initialize the USB state structure */
+    {
+        int i;
+        usb->index = usb_port_number;
+
+        /* Initialize the transaction double linked list */
+        usb->free_transaction_head = NULL;
+        usb->free_transaction_tail = NULL;
+        for (i=0; i<MAX_TRANSACTIONS; i++)
+            __cvmx_usb_free_transaction(usb, usb->transaction + i);
+        /* Every pipe starts out on the free pipe list */
+        for (i=0; i<MAX_PIPES; i++)
+            __cvmx_usb_append_pipe(&usb->free_pipes, usb->pipe + i);
+    }
+
+    /* Power On Reset and PHY Initialization */
+
+    /* 1. Wait for DCOK to assert (nothing to do) */
+    /* 2a. Write USBN0/1_CLK_CTL[POR] = 1 and
+        USBN0/1_CLK_CTL[HRST,PRST,HCLK_RST] = 0 */
+    usbn_clk_ctl.u64 = __cvmx_usb_read_csr64(usb, CVMX_USBNX_CLK_CTL(usb->index));
+    usbn_clk_ctl.s.por = 1;
+    usbn_clk_ctl.s.hrst = 0;
+    usbn_clk_ctl.s.prst = 0;
+    usbn_clk_ctl.s.hclk_rst = 0;
+    usbn_clk_ctl.s.enable = 0;
+    /* 2b. Select the USB reference clock/crystal parameters by writing
+        appropriate values to USBN0/1_CLK_CTL[P_C_SEL, P_RTYPE, P_COM_ON] */
+    if (usb->init_flags & CVMX_USB_INITIALIZE_FLAGS_CLOCK_XO_GND) {
+        /* The USB port uses 12/24/48MHz 2.5V board clock
+            source at USB_XO. USB_XI should be tied to GND.
+            Most Octeon evaluation boards require this setting */
+        if (OCTEON_IS_MODEL(OCTEON_CN3XXX)) {
+            usbn_clk_ctl.cn31xx.p_rclk  = 1; /* From CN31XX,CN30XX manual */
+            usbn_clk_ctl.cn31xx.p_xenbn = 0;
+        }
+        else if (OCTEON_IS_MODEL(OCTEON_CN56XX) || OCTEON_IS_MODEL(OCTEON_CN50XX))
+            usbn_clk_ctl.cn56xx.p_rtype = 2; /* From CN56XX,CN50XX manual */
+        else
+            usbn_clk_ctl.cn52xx.p_rtype = 1; /* From CN52XX manual */
+
+        /* No default case: any other value under the mask leaves
+            P_C_SEL as it was read from the register */
+        switch (flags & CVMX_USB_INITIALIZE_FLAGS_CLOCK_MHZ_MASK) {
+            case CVMX_USB_INITIALIZE_FLAGS_CLOCK_12MHZ:
+                usbn_clk_ctl.s.p_c_sel = 0;
+                break;
+            case CVMX_USB_INITIALIZE_FLAGS_CLOCK_24MHZ:
+                usbn_clk_ctl.s.p_c_sel = 1;
+                break;
+            case CVMX_USB_INITIALIZE_FLAGS_CLOCK_48MHZ:
+                usbn_clk_ctl.s.p_c_sel = 2;
+                break;
+        }
+    }
+    else {
+        /* The USB port uses a 12MHz crystal as clock source
+            at USB_XO and USB_XI */
+        if (OCTEON_IS_MODEL(OCTEON_CN3XXX)) {
+            usbn_clk_ctl.cn31xx.p_rclk  = 1; /* From CN31XX,CN30XX manual */
+            usbn_clk_ctl.cn31xx.p_xenbn = 1;
+        }
+        else if (OCTEON_IS_MODEL(OCTEON_CN56XX) || OCTEON_IS_MODEL(OCTEON_CN50XX))
+            usbn_clk_ctl.cn56xx.p_rtype = 0; /* From CN56XX,CN50XX manual */
+        else
+            usbn_clk_ctl.cn52xx.p_rtype = 0; /* From CN52XX manual */
+
+        usbn_clk_ctl.s.p_c_sel = 0;
+    }
+    /* 2c. Select the HCLK via writing USBN0/1_CLK_CTL[DIVIDE, DIVIDE2] and
+        setting USBN0/1_CLK_CTL[ENABLE] = 1.  Divide the core clock down such
+        that USB is as close as possible to 125Mhz */
+    {
+        /* Round the divisor up so the result does not exceed 125MHz */
+        int divisor = (octeon_get_clock_rate()+125000000-1)/125000000;
+        if (divisor < 4)  /* Lower than 4 doesn't seem to work properly */
+            divisor = 4;
+        usbn_clk_ctl.s.divide = divisor;
+        usbn_clk_ctl.s.divide2 = 0;
+    }
+    __cvmx_usb_write_csr64(usb, CVMX_USBNX_CLK_CTL(usb->index),
+                           usbn_clk_ctl.u64);
+    /* 2d. Write USBN0/1_CLK_CTL[HCLK_RST] = 1 */
+    usbn_clk_ctl.s.hclk_rst = 1;
+    __cvmx_usb_write_csr64(usb, CVMX_USBNX_CLK_CTL(usb->index),
+                           usbn_clk_ctl.u64);
+    /* 2e.  Wait 64 core-clock cycles for HCLK to stabilize */
+    cvmx_wait(64);
+    /* 3. Program the power-on reset field in the USBN clock-control register:
+        USBN_CLK_CTL[POR] = 0 */
+    usbn_clk_ctl.s.por = 0;
+    __cvmx_usb_write_csr64(usb, CVMX_USBNX_CLK_CTL(usb->index),
+                           usbn_clk_ctl.u64);
+    /* 4. Wait 1 ms for PHY clock to start */
+    cvmx_wait_usec(1000);
+    /* 5. Program the Reset input from automatic test equipment field in the
+        USBP control and status register: USBN_USBP_CTL_STATUS[ATE_RESET] = 1 */
+    usbn_usbp_ctl_status.u64 = __cvmx_usb_read_csr64(usb, CVMX_USBNX_USBP_CTL_STATUS(usb->index));
+    usbn_usbp_ctl_status.s.ate_reset = 1;
+    __cvmx_usb_write_csr64(usb, CVMX_USBNX_USBP_CTL_STATUS(usb->index),
+                           usbn_usbp_ctl_status.u64);
+    /* 6. Wait 10 cycles */
+    cvmx_wait(10);
+    /* 7. Clear ATE_RESET field in the USBP control and status register:
+        USBN_USBP_CTL_STATUS[ATE_RESET] = 0 */
+    usbn_usbp_ctl_status.s.ate_reset = 0;
+    __cvmx_usb_write_csr64(usb, CVMX_USBNX_USBP_CTL_STATUS(usb->index),
+                           usbn_usbp_ctl_status.u64);
+    /* 8. Program the PHY reset field in the USBN clock-control register:
+        USBN_CLK_CTL[PRST] = 1 */
+    usbn_clk_ctl.s.prst = 1;
+    __cvmx_usb_write_csr64(usb, CVMX_USBNX_CLK_CTL(usb->index),
+                           usbn_clk_ctl.u64);
+    /* 9. Program the USBP control and status register to select host or
+        device mode. USBN_USBP_CTL_STATUS[HST_MODE] = 0 for host, = 1 for
+        device */
+    usbn_usbp_ctl_status.s.hst_mode = 0;
+    __cvmx_usb_write_csr64(usb, CVMX_USBNX_USBP_CTL_STATUS(usb->index),
+                           usbn_usbp_ctl_status.u64);
+    /* 10. Wait 1 us */
+    cvmx_wait_usec(1);
+    /* 11. Program the hreset_n field in the USBN clock-control register:
+        USBN_CLK_CTL[HRST] = 1 */
+    usbn_clk_ctl.s.hrst = 1;
+    __cvmx_usb_write_csr64(usb, CVMX_USBNX_CLK_CTL(usb->index),
+                           usbn_clk_ctl.u64);
+    /* 12. Proceed to USB core initialization. ENABLE is set here, followed
+        by a 1 us wait */
+    usbn_clk_ctl.s.enable = 1;
+    __cvmx_usb_write_csr64(usb, CVMX_USBNX_CLK_CTL(usb->index),
+                           usbn_clk_ctl.u64);
+    cvmx_wait_usec(1);
+
+    /* USB Core Initialization */
+
+    /* 1. Read USBC_GHWCFG1, USBC_GHWCFG2, USBC_GHWCFG3, USBC_GHWCFG4 to
+        determine USB core configuration parameters. */
+    /* Nothing needed */
+    /* 2. Program the following fields in the global AHB configuration
+        register (USBC_GAHBCFG)
+        DMA mode, USBC_GAHBCFG[DMAEn]: 1 = DMA mode, 0 = slave mode
+        Burst length, USBC_GAHBCFG[HBSTLEN] = 0
+        Nonperiodic TxFIFO empty level (slave mode only),
+        USBC_GAHBCFG[NPTXFEMPLVL]
+        Periodic TxFIFO empty level (slave mode only),
+        USBC_GAHBCFG[PTXFEMPLVL]
+        Global interrupt mask, USBC_GAHBCFG[GLBLINTRMSK] = 1 */
+    {
+        cvmx_usbcx_gahbcfg_t usbcx_gahbcfg;
+        /* Due to an errata, CN31XX doesn't support DMA */
+        if (OCTEON_IS_MODEL(OCTEON_CN31XX))
+            usb->init_flags |= CVMX_USB_INITIALIZE_FLAGS_NO_DMA;
+        usbcx_gahbcfg.u32 = 0;
+        usbcx_gahbcfg.s.dmaen = !(usb->init_flags & CVMX_USB_INITIALIZE_FLAGS_NO_DMA);
+        /* idle_hardware_channels is a bit mask of usable channels */
+        if (usb->init_flags & CVMX_USB_INITIALIZE_FLAGS_NO_DMA)
+            usb->idle_hardware_channels = 0x1;  /* Only use one channel with non DMA */
+        else if (OCTEON_IS_MODEL(OCTEON_CN5XXX))
+            usb->idle_hardware_channels = 0xf7; /* CN5XXX have an errata with channel 3 */
+        else
+            usb->idle_hardware_channels = 0xff;
+        usbcx_gahbcfg.s.hbstlen = 0;
+        usbcx_gahbcfg.s.nptxfemplvl = 1;
+        usbcx_gahbcfg.s.ptxfemplvl = 1;
+        usbcx_gahbcfg.s.glblintrmsk = 1;
+        __cvmx_usb_write_csr32(usb, CVMX_USBCX_GAHBCFG(usb->index),
+                               usbcx_gahbcfg.u32);
+    }
+    /* 3. Program the following fields in USBC_GUSBCFG register.
+        HS/FS timeout calibration, USBC_GUSBCFG[TOUTCAL] = 0
+        ULPI DDR select, USBC_GUSBCFG[DDRSEL] = 0
+        USB turnaround time, USBC_GUSBCFG[USBTRDTIM] = 0x5
+        PHY low-power clock select, USBC_GUSBCFG[PHYLPWRCLKSEL] = 0 */
+    {
+        cvmx_usbcx_gusbcfg_t usbcx_gusbcfg;
+        usbcx_gusbcfg.u32 = __cvmx_usb_read_csr32(usb, CVMX_USBCX_GUSBCFG(usb->index));
+        usbcx_gusbcfg.s.toutcal = 0;
+        usbcx_gusbcfg.s.ddrsel = 0;
+        usbcx_gusbcfg.s.usbtrdtim = 0x5;
+        usbcx_gusbcfg.s.phylpwrclksel = 0;
+        __cvmx_usb_write_csr32(usb, CVMX_USBCX_GUSBCFG(usb->index),
+                               usbcx_gusbcfg.u32);
+    }
+    /* 4. The software must unmask the following bits in the USBC_GINTMSK
+        register.
+        OTG interrupt mask, USBC_GINTMSK[OTGINTMSK] = 1
+        Mode mismatch interrupt mask, USBC_GINTMSK[MODEMISMSK] = 1 */
+    {
+        cvmx_usbcx_gintmsk_t usbcx_gintmsk;
+        int channel;
+
+        usbcx_gintmsk.u32 = __cvmx_usb_read_csr32(usb, CVMX_USBCX_GINTMSK(usb->index));
+        usbcx_gintmsk.s.otgintmsk = 1;
+        usbcx_gintmsk.s.modemismsk = 1;
+        usbcx_gintmsk.s.hchintmsk = 1;
+        usbcx_gintmsk.s.sofmsk = 0;
+        /* We need RX FIFO interrupts if we don't have DMA */
+        if (usb->init_flags & CVMX_USB_INITIALIZE_FLAGS_NO_DMA)
+            usbcx_gintmsk.s.rxflvlmsk = 1;
+        __cvmx_usb_write_csr32(usb, CVMX_USBCX_GINTMSK(usb->index),
+                               usbcx_gintmsk.u32);
+
+        /* Disable all channel interrupts. We'll enable them per channel later */
+        for (channel=0; channel<8; channel++)
+            __cvmx_usb_write_csr32(usb, CVMX_USBCX_HCINTMSKX(channel, usb->index), 0);
+    }
+
+    {
+        /* Host Port Initialization */
+        if (cvmx_unlikely(usb->init_flags & CVMX_USB_INITIALIZE_FLAGS_DEBUG_INFO))
+            cvmx_dprintf("%s: USB%d is in host mode\n", __FUNCTION__, usb->index);
+
+        /* 1. Program the host-port interrupt-mask field to unmask,
+            USBC_GINTMSK[PRTINT] = 1 */
+        USB_SET_FIELD32(CVMX_USBCX_GINTMSK(usb->index), cvmx_usbcx_gintmsk_t,
+                        prtintmsk, 1);
+        USB_SET_FIELD32(CVMX_USBCX_GINTMSK(usb->index), cvmx_usbcx_gintmsk_t,
+                        disconnintmsk, 1);
+        /* 2. Program the USBC_HCFG register to select full-speed host or
+            high-speed host. */
+        {
+            cvmx_usbcx_hcfg_t usbcx_hcfg;
+            usbcx_hcfg.u32 = __cvmx_usb_read_csr32(usb, CVMX_USBCX_HCFG(usb->index));
+            usbcx_hcfg.s.fslssupp = 0;
+            usbcx_hcfg.s.fslspclksel = 0;
+            __cvmx_usb_write_csr32(usb, CVMX_USBCX_HCFG(usb->index), usbcx_hcfg.u32);
+        }
+        /* 3. Program the port power bit to drive VBUS on the USB,
+            USBC_HPRT[PRTPWR] = 1 */
+        USB_SET_FIELD32(CVMX_USBCX_HPRT(usb->index), cvmx_usbcx_hprt_t, prtpwr, 1);
+
+        /* Steps 4-15 from the manual are done later in the port enable */
+    }
+
+    CVMX_USB_RETURN(CVMX_USB_SUCCESS);
+}
+
+
+/**
+ * Shut a USB port down again after cvmx_usb_initialize(). The
+ * call is refused with CVMX_USB_BUSY while any pipe is still on
+ * the idle list or on one of the active lists, so the port
+ * should be disabled with all pipes closed beforehand.
+ *
+ * @param state  USB device state populated by
+ *               cvmx_usb_initialize().
+ *
+ * @return CVMX_USB_SUCCESS or a negative error code defined in
+ *         cvmx_usb_status_t.
+ */
+cvmx_usb_status_t cvmx_usb_shutdown(cvmx_usb_state_t *state)
+{
+    cvmx_usb_internal_state_t *usb = (cvmx_usb_internal_state_t*)state;
+    cvmx_usbnx_clk_ctl_t clk_ctl;
+    int pipes_open;
+
+    CVMX_USB_LOG_CALLED();
+    CVMX_USB_LOG_PARAM("%p", state);
+
+    /* A pipe on any of these lists means it has not been closed yet */
+    pipes_open = usb->idle_pipes.head ||
+                 usb->active_pipes[CVMX_USB_TRANSFER_ISOCHRONOUS].head ||
+                 usb->active_pipes[CVMX_USB_TRANSFER_INTERRUPT].head ||
+                 usb->active_pipes[CVMX_USB_TRANSFER_CONTROL].head ||
+                 usb->active_pipes[CVMX_USB_TRANSFER_BULK].head;
+    if (pipes_open)
+        CVMX_USB_RETURN(CVMX_USB_BUSY);
+
+    /* Put the clock block back into power on reset: POR and HCLK_RST are
+        set, PRST and HRST are cleared, and ENABLE is written as 1 */
+    clk_ctl.u64 = __cvmx_usb_read_csr64(usb, CVMX_USBNX_CLK_CTL(usb->index));
+    clk_ctl.s.enable = 1;
+    clk_ctl.s.por = 1;
+    clk_ctl.s.hclk_rst = 1;
+    clk_ctl.s.prst = 0;
+    clk_ctl.s.hrst = 0;
+    __cvmx_usb_write_csr64(usb, CVMX_USBNX_CLK_CTL(usb->index), clk_ctl.u64);
+
+    CVMX_USB_RETURN(CVMX_USB_SUCCESS);
+}
+
+
+/**
+ * Enable a USB port. After this call succeeds, the USB port is
+ * online and servicing requests.
+ *
+ * The port is reset, the function waits for the hardware to
+ * report the port as enabled, the enumerated speed is cached in
+ * the internal state, and the data FIFO is split between the
+ * receive, non-periodic transmit and periodic transmit FIFOs
+ * before all FIFOs are flushed.
+ *
+ * @param state  USB device state populated by
+ *               cvmx_usb_initialize().
+ *
+ * @return CVMX_USB_SUCCESS or a negative error code defined in
+ *         cvmx_usb_status_t. CVMX_USB_TIMEOUT is returned when
+ *         nothing is connected to the port or when the port does
+ *         not report enabled after the reset.
+ */
+cvmx_usb_status_t cvmx_usb_enable(cvmx_usb_state_t *state)
+{
+    cvmx_usbcx_ghwcfg3_t usbcx_ghwcfg3;
+    cvmx_usb_internal_state_t *usb = (cvmx_usb_internal_state_t*)state;
+
+    CVMX_USB_LOG_CALLED();
+    CVMX_USB_LOG_PARAM("%p", state);
+
+    /* Snapshot the host port register into the cached copy */
+    usb->usbcx_hprt.u32 = __cvmx_usb_read_csr32(usb, CVMX_USBCX_HPRT(usb->index));
+
+    /* If the port is already enabled then just return. We don't need to do
+        anything */
+    if (usb->usbcx_hprt.s.prtena)
+        CVMX_USB_RETURN(CVMX_USB_SUCCESS);
+
+    /* If there is nothing plugged into the port then fail immediately */
+    if (!usb->usbcx_hprt.s.prtconnsts) {
+        if (cvmx_unlikely(usb->init_flags & CVMX_USB_INITIALIZE_FLAGS_DEBUG_INFO))
+            cvmx_dprintf("%s: USB%d Nothing plugged into the port\n", __FUNCTION__, usb->index);
+        CVMX_USB_RETURN(CVMX_USB_TIMEOUT);
+    }
+
+    /* Program the port reset bit to start the reset process */
+    USB_SET_FIELD32(CVMX_USBCX_HPRT(usb->index), cvmx_usbcx_hprt_t, prtrst, 1);
+
+    /* Wait at least 50ms (high speed), or 10ms (full speed) for the reset
+        process to complete. The longer 50ms is always used here. */
+    cvmx_wait_usec(50000);
+
+    /* Program the port reset bit to 0, USBC_HPRT[PRTRST] = 0 */
+    USB_SET_FIELD32(CVMX_USBCX_HPRT(usb->index), cvmx_usbcx_hprt_t, prtrst, 0);
+
+    /* Wait for the USBC_HPRT[PRTENA]. A non zero result from the wait macro
+        is treated as a timeout.
+        NOTE(review): the last argument is presumably a timeout in
+        microseconds - confirm against the macro definition */
+    if (CVMX_WAIT_FOR_FIELD32(CVMX_USBCX_HPRT(usb->index), cvmx_usbcx_hprt_t,
+                              prtena, ==, 1, 100000)) {
+        if (cvmx_unlikely(usb->init_flags & CVMX_USB_INITIALIZE_FLAGS_DEBUG_INFO))
+            cvmx_dprintf("%s: Timeout waiting for the port to finish reset\n",
+                         __FUNCTION__);
+        CVMX_USB_RETURN(CVMX_USB_TIMEOUT);
+    }
+
+    /* Read the port speed field to get the enumerated speed, USBC_HPRT[PRTSPD]. */
+    usb->usbcx_hprt.u32 = __cvmx_usb_read_csr32(usb, CVMX_USBCX_HPRT(usb->index));
+    if (cvmx_unlikely(usb->init_flags & CVMX_USB_INITIALIZE_FLAGS_DEBUG_INFO))
+        cvmx_dprintf("%s: USB%d is in %s speed mode\n", __FUNCTION__, usb->index,
+                     (usb->usbcx_hprt.s.prtspd == CVMX_USB_SPEED_HIGH) ? "high" :
+                     (usb->usbcx_hprt.s.prtspd == CVMX_USB_SPEED_FULL) ? "full" :
+                     "low");
+
+    /* The FIFO sizes below are all derived from the dfifodepth field */
+    usbcx_ghwcfg3.u32 = __cvmx_usb_read_csr32(usb, CVMX_USBCX_GHWCFG3(usb->index));
+
+    /* 13. Program the USBC_GRXFSIZ register to select the size of the receive
+        FIFO (25%). */
+    USB_SET_FIELD32(CVMX_USBCX_GRXFSIZ(usb->index), cvmx_usbcx_grxfsiz_t,
+                    rxfdep, usbcx_ghwcfg3.s.dfifodepth / 4);
+    /* 14. Program the USBC_GNPTXFSIZ register to select the size and the
+        start address of the non-periodic transmit FIFO for nonperiodic
+        transactions (50%). It starts at the 25% mark. */
+    {
+        cvmx_usbcx_gnptxfsiz_t siz;
+        siz.u32 = __cvmx_usb_read_csr32(usb, CVMX_USBCX_GNPTXFSIZ(usb->index));
+        siz.s.nptxfdep = usbcx_ghwcfg3.s.dfifodepth / 2;
+        siz.s.nptxfstaddr = usbcx_ghwcfg3.s.dfifodepth / 4;
+        __cvmx_usb_write_csr32(usb, CVMX_USBCX_GNPTXFSIZ(usb->index), siz.u32);
+    }
+    /* 15. Program the USBC_HPTXFSIZ register to select the size and start
+        address of the periodic transmit FIFO for periodic transactions (25%).
+        It starts at the 75% mark. */
+    {
+        cvmx_usbcx_hptxfsiz_t siz;
+        siz.u32 = __cvmx_usb_read_csr32(usb, CVMX_USBCX_HPTXFSIZ(usb->index));
+        siz.s.ptxfsize = usbcx_ghwcfg3.s.dfifodepth / 4;
+        siz.s.ptxfstaddr = 3 * usbcx_ghwcfg3.s.dfifodepth / 4;
+        __cvmx_usb_write_csr32(usb, CVMX_USBCX_HPTXFSIZ(usb->index), siz.u32);
+    }
+    /* Flush all FIFOs. The results of the two waits are not checked, so a
+        flush that does not complete is not reported to the caller.
+        NOTE(review): txfnum 0x10 presumably selects every transmit FIFO -
+        confirm against the hardware manual */
+    USB_SET_FIELD32(CVMX_USBCX_GRSTCTL(usb->index), cvmx_usbcx_grstctl_t, txfnum, 0x10);
+    USB_SET_FIELD32(CVMX_USBCX_GRSTCTL(usb->index), cvmx_usbcx_grstctl_t, txfflsh, 1);
+    CVMX_WAIT_FOR_FIELD32(CVMX_USBCX_GRSTCTL(usb->index), cvmx_usbcx_grstctl_t,
+                          txfflsh, ==, 0, 100);
+    USB_SET_FIELD32(CVMX_USBCX_GRSTCTL(usb->index), cvmx_usbcx_grstctl_t, rxfflsh, 1);
+    CVMX_WAIT_FOR_FIELD32(CVMX_USBCX_GRSTCTL(usb->index), cvmx_usbcx_grstctl_t,
+                          rxfflsh, ==, 0, 100);
+
+    CVMX_USB_RETURN(CVMX_USB_SUCCESS);
+}
+
+
+/**
+ * Disable a USB port. After this call the USB port will not
+ * generate data transfers and will not generate events.
+ * Transactions in process will fail and call their
+ * associated callbacks.
+ *
+ * The only action taken here is a single write to the PRTENA
+ * field of the host port register.
+ *
+ * @param state  USB device state populated by
+ *               cvmx_usb_initialize().
+ *
+ * @return CVMX_USB_SUCCESS; no failure path exists in this
+ *         function.
+ */
+cvmx_usb_status_t cvmx_usb_disable(cvmx_usb_state_t *state)
+{
+    cvmx_usb_internal_state_t *usb = (cvmx_usb_internal_state_t*)state;
+
+    CVMX_USB_LOG_CALLED();
+    CVMX_USB_LOG_PARAM("%p", state);
+
+    /* Disable the port.
+        NOTE(review): a 1 is written to PRTENA; presumably the field is
+        write-one-to-clear in hardware - confirm against the manual */
+    USB_SET_FIELD32(CVMX_USBCX_HPRT(usb->index), cvmx_usbcx_hprt_t, prtena, 1);
+    CVMX_USB_RETURN(CVMX_USB_SUCCESS);
+}
+
+
+/**
+ * Read the current state of the USB port from the hardware. Use
+ * this call to determine if the usb port has anything connected,
+ * is enabled, or has some sort of error condition. The returned
+ * connect_change bit is set when the connected state differs
+ * from the status last stored with cvmx_usb_set_status(); to
+ * clear it, store the new status through cvmx_usb_set_status().
+ *
+ * @param state  USB device state populated by
+ *               cvmx_usb_initialize().
+ *
+ * @return Port status information
+ */
+cvmx_usb_port_status_t cvmx_usb_get_status(cvmx_usb_state_t *state)
+{
+    cvmx_usb_internal_state_t *usb = (cvmx_usb_internal_state_t*)state;
+    cvmx_usb_port_status_t status;
+    cvmx_usbcx_hprt_t hprt;
+
+    CVMX_USB_LOG_CALLED();
+    CVMX_USB_LOG_PARAM("%p", state);
+
+    /* Start from an all zero status so unset fields read as zero */
+    memset(&status, 0, sizeof(status));
+
+    /* Copy the relevant host port register fields across */
+    hprt.u32 = __cvmx_usb_read_csr32(usb, CVMX_USBCX_HPRT(usb->index));
+    status.connected = hprt.s.prtconnsts;
+    status.port_enabled = hprt.s.prtena;
+    status.port_powered = hprt.s.prtpwr;
+    status.port_speed = hprt.s.prtspd;
+    status.port_over_current = hprt.s.prtovrcurract;
+
+    /* Changed relative to the reference status stored in the state */
+    status.connect_change = (status.connected != usb->port_status.connected);
+
+    /* The return is logged by hand here, including the indent decrement */
+    if (cvmx_unlikely(usb->init_flags & CVMX_USB_INITIALIZE_FLAGS_DEBUG_CALLS))
+        cvmx_dprintf("%*s%s: returned port enabled=%d, over_current=%d, powered=%d, speed=%d, connected=%d, connect_change=%d\n",
+                     2*(--usb->indent), "", __FUNCTION__,
+                     status.port_enabled,
+                     status.port_over_current,
+                     status.port_powered,
+                     status.port_speed,
+                     status.connected,
+                     status.connect_change);
+
+    return status;
+}
+
+
+/**
+ * Set the current state of the USB port. The status is used as
+ * a reference for the "changed" bits returned by
+ * cvmx_usb_get_status(). Other than serving as a reference, the
+ * status passed to this function is not used. No fields can be
+ * changed through this call; nothing is written to the hardware.
+ *
+ * @param state  USB device state populated by
+ *               cvmx_usb_initialize().
+ * @param port_status
+ *               Port status to set, most likely one returned by
+ *               cvmx_usb_get_status()
+ */
+void cvmx_usb_set_status(cvmx_usb_state_t *state, cvmx_usb_port_status_t port_status)
+{
+    cvmx_usb_internal_state_t *usb = (cvmx_usb_internal_state_t*)state;
+    CVMX_USB_LOG_CALLED();
+    CVMX_USB_LOG_PARAM("%p", state);
+    /* Keep a copy as the reference for later change detection */
+    usb->port_status = port_status;
+    CVMX_USB_RETURN_NOTHING();
+}
+
+
+/**
+ * @INTERNAL
+ * Turn a transaction pointer into its handle, which is the
+ * transaction's position within the state's transaction storage.
+ *
+ * @param usb    USB device state populated by
+ *               cvmx_usb_initialize().
+ * @param transaction
+ *               Transaction to get handle for
+ *
+ * @return Handle
+ */
+static inline int __cvmx_usb_get_submit_handle(cvmx_usb_internal_state_t *usb,
+                                        cvmx_usb_transaction_t *transaction)
+{
+    unsigned long first = (unsigned long)usb->transaction;
+    unsigned long entry = (unsigned long)transaction;
+
+    /* Byte offset from the first transaction divided by the entry size */
+    return (entry - first) / sizeof(*transaction);
+}
+
+
+/**
+ * @INTERNAL
+ * Turn a pipe pointer into its handle, which is the pipe's
+ * position within the state's pipe storage.
+ *
+ * @param usb    USB device state populated by
+ *               cvmx_usb_initialize().
+ * @param pipe   Pipe to get handle for
+ *
+ * @return Handle
+ */
+static inline int __cvmx_usb_get_pipe_handle(cvmx_usb_internal_state_t *usb,
+                                        cvmx_usb_pipe_t *pipe)
+{
+    unsigned long first = (unsigned long)usb->pipe;
+    unsigned long entry = (unsigned long)pipe;
+
+    /* Byte offset from the first pipe divided by the entry size */
+    return (entry - first) / sizeof(*pipe);
+}
+
+
+/**
+ * Open a virtual pipe between the host and a USB device. A pipe
+ * must be opened before data can be transferred between a device
+ * and Octeon.
+ *
+ * @param state      USB device state populated by
+ *                   cvmx_usb_initialize().
+ * @param flags      Optional pipe flags defined in
+ *                   cvmx_usb_pipe_flags_t.
+ * @param device_addr
+ *                   USB device address to open the pipe to
+ *                   (0-127).
+ * @param endpoint_num
+ *                   USB endpoint number to open the pipe to
+ *                   (0-15).
+ * @param device_speed
+ *                   The speed of the device the pipe is going
+ *                   to. This must match the device's speed,
+ *                   which may be different than the port speed.
+ * @param max_packet The maximum packet length the device can
+ *                   transmit/receive (low speed=0-8, full
+ *                   speed=0-1023, high speed=0-1024). This value
+ *                   comes from the standard endpoint descriptor
+ *                   field wMaxPacketSize bits <10:0>.
+ * @param transfer_type
+ *                   The type of transfer this pipe is for.
+ * @param transfer_dir
+ *                   The direction the pipe is in. This is not
+ *                   used for control pipes.
+ * @param interval   For ISOCHRONOUS and INTERRUPT transfers,
+ *                   this is how often the transfer is scheduled
+ *                   for. All other transfers should specify
+ *                   zero. The units are in frames (8000/sec at
+ *                   high speed, 1000/sec for full speed).
+ * @param multi_count
+ *                   For high speed devices, this is the maximum
+ *                   allowed number of packet per microframe.
+ *                   Specify zero for non high speed devices. This
+ *                   value comes from the standard endpoint descriptor
+ *                   field wMaxPacketSize bits <12:11>.
+ * @param hub_device_addr
+ *                   Hub device address this device is connected
+ *                   to. Devices connected directly to Octeon
+ *                   use zero. This is only used when the device
+ *                   is full/low speed behind a high speed hub.
+ *                   The address will be of the high speed hub,
+ *                   not and full speed hubs after it.
+ * @param hub_port   Which port on the hub the device is
+ *                   connected. Use zero for devices connected
+ *                   directly to Octeon. Like hub_device_addr,
+ *                   this is only used for full/low speed
+ *                   devices behind a high speed hub.
+ *
+ * @return A non negative value is a pipe handle. Negative
+ *         values are failure codes from cvmx_usb_status_t.
+ */
+int cvmx_usb_open_pipe(cvmx_usb_state_t *state, cvmx_usb_pipe_flags_t flags,
+                       int device_addr, int endpoint_num,
+                       cvmx_usb_speed_t device_speed, int max_packet,
+                       cvmx_usb_transfer_t transfer_type,
+                       cvmx_usb_direction_t transfer_dir, int interval,
+                       int multi_count, int hub_device_addr, int hub_port)
+{
+    cvmx_usb_pipe_t *pipe;
+    /* The public state handle is used as the internal state via a cast */
+    cvmx_usb_internal_state_t *usb = (cvmx_usb_internal_state_t*)state;
+
+    CVMX_USB_LOG_CALLED();
+    CVMX_USB_LOG_PARAM("%p", state);
+    CVMX_USB_LOG_PARAM("0x%x", flags);
+    CVMX_USB_LOG_PARAM("%d", device_addr);
+    CVMX_USB_LOG_PARAM("%d", endpoint_num);
+    CVMX_USB_LOG_PARAM("%d", device_speed);
+    CVMX_USB_LOG_PARAM("%d", max_packet);
+    CVMX_USB_LOG_PARAM("%d", transfer_type);
+    CVMX_USB_LOG_PARAM("%d", transfer_dir);
+    CVMX_USB_LOG_PARAM("%d", interval);
+    CVMX_USB_LOG_PARAM("%d", multi_count);
+    CVMX_USB_LOG_PARAM("%d", hub_device_addr);
+    CVMX_USB_LOG_PARAM("%d", hub_port);
+
+    /* Validate every argument before any pipe is taken off the free list.
+        Each failed check returns CVMX_USB_INVALID_PARAM */
+    if (cvmx_unlikely((device_addr < 0) || (device_addr > MAX_USB_ADDRESS)))
+        CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+    if (cvmx_unlikely((endpoint_num < 0) || (endpoint_num > MAX_USB_ENDPOINT)))
+        CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+    if (cvmx_unlikely(device_speed > CVMX_USB_SPEED_LOW))
+        CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+    /* max_packet must be in the range 1..1024 */
+    if (cvmx_unlikely((max_packet <= 0) || (max_packet > 1024)))
+        CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+    if (cvmx_unlikely(transfer_type > CVMX_USB_TRANSFER_INTERRUPT))
+        CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+    if (cvmx_unlikely((transfer_dir != CVMX_USB_DIRECTION_OUT) &&
+        (transfer_dir != CVMX_USB_DIRECTION_IN)))
+        CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+    if (cvmx_unlikely(interval < 0))
+        CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+    /* Control pipes must be opened with an interval of zero */
+    if (cvmx_unlikely((transfer_type == CVMX_USB_TRANSFER_CONTROL) && interval))
+        CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+    if (cvmx_unlikely(multi_count < 0))
+        CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+    /* A non zero multi_count is only accepted for high speed devices */
+    if (cvmx_unlikely((device_speed != CVMX_USB_SPEED_HIGH) &&
+        (multi_count != 0)))
+        CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+    if (cvmx_unlikely((hub_device_addr < 0) || (hub_device_addr > MAX_USB_ADDRESS)))
+        CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+    if (cvmx_unlikely((hub_port < 0) || (hub_port > MAX_USB_HUB_PORT)))
+        CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+
+    /* Find a free pipe. An empty free list is reported as
+        CVMX_USB_NO_MEMORY */
+    pipe = usb->free_pipes.head;
+    if (!pipe)
+        CVMX_USB_RETURN(CVMX_USB_NO_MEMORY);
+    __cvmx_usb_remove_pipe(&usb->free_pipes, pipe);
+    pipe->flags = flags | __CVMX_USB_PIPE_FLAGS_OPEN;
+    /* High speed bulk OUT pipes are flagged as needing a ping */
+    if ((device_speed == CVMX_USB_SPEED_HIGH) &&
+        (transfer_dir == CVMX_USB_DIRECTION_OUT) &&
+        (transfer_type == CVMX_USB_TRANSFER_BULK))
+        pipe->flags |= __CVMX_USB_PIPE_FLAGS_NEED_PING;
+    pipe->device_addr = device_addr;
+    pipe->endpoint_num = endpoint_num;
+    pipe->device_speed = device_speed;
+    pipe->max_packet = max_packet;
+    pipe->transfer_type = transfer_type;
+    pipe->transfer_dir = transfer_dir;
+    /* All pipes use interval to rate limit NAK processing. Force an interval
+        if one wasn't supplied */
+    if (!interval)
+        interval = 1;
+    /* NOTE(review): the factor of 8 for split pipes presumably converts
+        frames into micro-frames - confirm */
+    if (__cvmx_usb_pipe_needs_split(usb, pipe)) {
+        pipe->interval = interval*8;
+        /* Force start splits to be scheduled on uFrame 0: round the current
+            frame number up to a multiple of 8 before adding the interval */
+        pipe->next_tx_frame = ((usb->frame_number+7)&~7) + pipe->interval;
+    }
+    else {
+        pipe->interval = interval;
+        pipe->next_tx_frame = usb->frame_number + pipe->interval;
+    }
+    pipe->multi_count = multi_count;
+    pipe->hub_device_addr = hub_device_addr;
+    pipe->hub_port = hub_port;
+    pipe->pid_toggle = 0;
+    /* -1 is the value __cvmx_usb_find_ready_pipe() accepts without checking
+        the split complete frame */
+    pipe->split_sc_frame = -1;
+    /* A newly opened pipe starts on the idle list */
+    __cvmx_usb_append_pipe(&usb->idle_pipes, pipe);
+
+    /* We don't need to tell the hardware about this pipe yet since
+        it doesn't have any submitted requests */
+
+    CVMX_USB_RETURN(__cvmx_usb_get_pipe_handle(usb, pipe));
+}
+
+
+/**
+ * @INTERNAL
+ * Poll the RX FIFOs and remove data as needed. This function is only used
+ * in non DMA mode. It is very important that this function be called quickly
+ * enough to prevent FIFO overflow.
+ *
+ * One status word is read from GRXSTSPH. If it reports IN data with a non
+ * zero byte count, the packet is copied from the channel's FIFO to the
+ * address held in the channel's inbound DMA address register, and that
+ * register is advanced by the packet's byte count.
+ *
+ * @param usb     USB device state populated by
+ *                cvmx_usb_initialize().
+ */
+static void __cvmx_usb_poll_rx_fifo(cvmx_usb_internal_state_t *usb)
+{
+    cvmx_usbcx_grxstsph_t rx_status;
+    int channel;
+    int bytes;
+    uint64_t address;
+    uint32_t *ptr;
+
+    CVMX_USB_LOG_CALLED();
+    CVMX_USB_LOG_PARAM("%p", usb);
+
+    rx_status.u32 = __cvmx_usb_read_csr32(usb, CVMX_USBCX_GRXSTSPH(usb->index));
+    /* Only read data if IN data is there */
+    if (rx_status.s.pktsts != 2)
+        CVMX_USB_RETURN_NOTHING();
+    /* Check if no data is available */
+    if (!rx_status.s.bcnt)
+        CVMX_USB_RETURN_NOTHING();
+
+    /* bcnt is known to be non zero here, so bytes needs no second check */
+    channel = rx_status.s.chnum;
+    bytes = rx_status.s.bcnt;
+
+    /* Get where the DMA engine would have written this data */
+    address = __cvmx_usb_read_csr64(usb, CVMX_USBNX_DMA0_INB_CHN0(usb->index) + channel*8);
+    ptr = cvmx_phys_to_ptr(address);
+    /* Advance the stored address so the next packet lands after this one */
+    __cvmx_usb_write_csr64(usb, CVMX_USBNX_DMA0_INB_CHN0(usb->index) + channel*8, address + bytes);
+
+    /* Loop writing the FIFO data for this packet into memory. The FIFO is
+        read one 32bit word at a time, so the last word is stored in full even
+        when the byte count is not a multiple of four */
+    while (bytes > 0) {
+        *ptr++ = __cvmx_usb_read_csr32(usb, USB_FIFO_ADDRESS(channel, usb->index));
+        bytes -= 4;
+    }
+    CVMX_SYNCW;
+
+    CVMX_USB_RETURN_NOTHING();
+}
+
+
+/**
+ * Fill the TX hardware fifo with data out of the software
+ * fifos
+ *
+ * Entries are consumed from fifo->tail. An entry that does not fit
+ * completely has its address and size advanced in place, so a later
+ * call resumes where this one stopped.
+ *
+ * @param usb       USB device state populated by
+ *                  cvmx_usb_initialize().
+ * @param fifo      Software fifo to use
+ * @param available Amount of space in the hardware fifo. It is compared
+ *                  against the entry sizes, which are stored as counts
+ *                  of 32bit words.
+ *
+ * @return Non zero if the hardware fifo was too small and needs
+ *         to be serviced again.
+ */
+static int __cvmx_usb_fill_tx_hw(cvmx_usb_internal_state_t *usb, cvmx_usb_tx_fifo_t *fifo, int available)
+{
+    CVMX_USB_LOG_CALLED();
+    CVMX_USB_LOG_PARAM("%p", usb);
+    CVMX_USB_LOG_PARAM("%p", fifo);
+    CVMX_USB_LOG_PARAM("%d", available);
+
+    /* We're done either when there isn't anymore space or the software FIFO
+        is empty */
+    while (available && (fifo->head != fifo->tail)) {
+        int i = fifo->tail;
+        const uint32_t *ptr = cvmx_phys_to_ptr(fifo->entry[i].address);
+        /* NOTE(review): the XOR with 4 presumably selects the other 32bit
+            half of a 64bit aligned location - confirm the reason */
+        uint64_t csr_address = USB_FIFO_ADDRESS(fifo->entry[i].channel, usb->index) ^ 4;
+        int words = available;
+
+        /* Limit the amount of data to what the SW fifo has. When the whole
+            entry fits it is retired by advancing the tail, which wraps to
+            zero after index MAX_CHANNELS.
+            NOTE(review): this assumes entry[] has MAX_CHANNELS+1 slots -
+            confirm against the structure definition */
+        if (fifo->entry[i].size <= available) {
+            words = fifo->entry[i].size;
+            fifo->tail++;
+            if (fifo->tail > MAX_CHANNELS)
+                fifo->tail = 0;
+        }
+
+        /* Update the next locations and counts */
+        available -= words;
+        fifo->entry[i].address += words * 4;
+        fifo->entry[i].size -= words;
+
+        /* Write the HW fifo data. The read every three writes is due
+            to an errata on CN3XXX chips */
+        while (words > 3) {
+            cvmx_write64_uint32(csr_address, *ptr++);
+            cvmx_write64_uint32(csr_address, *ptr++);
+            cvmx_write64_uint32(csr_address, *ptr++);
+            cvmx_read64_uint64(CVMX_USBNX_DMA0_INB_CHN0(usb->index));
+            words -= 3;
+        }
+        /* Between one and three words remain here, provided at least one
+            word was requested; the first is written unconditionally */
+        cvmx_write64_uint32(csr_address, *ptr++);
+        if (--words) {
+            cvmx_write64_uint32(csr_address, *ptr++);
+            if (--words)
+                cvmx_write64_uint32(csr_address, *ptr++);
+        }
+        cvmx_read64_uint64(CVMX_USBNX_DMA0_INB_CHN0(usb->index));
+    }
+    /* Entries still queued mean the hardware fifo ran out of space */
+    CVMX_USB_RETURN(fifo->head != fifo->tail);
+}
+
+
+/**
+ * Service the periodic and non periodic software TX fifos.
+ *
+ * For each software fifo that holds entries, the matching hardware
+ * status register is read to learn the free space, as much data as
+ * fits is pushed, and the corresponding "fifo empty" interrupt mask
+ * bit in GINTMSK is set when entries remain or cleared when the
+ * software fifo was drained.
+ *
+ * @param usb    USB device state populated by
+ *               cvmx_usb_initialize().
+ */
+static void __cvmx_usb_poll_tx_fifo(cvmx_usb_internal_state_t *usb)
+{
+    int more_pending;
+
+    CVMX_USB_LOG_CALLED();
+    CVMX_USB_LOG_PARAM("%p", usb);
+
+    /* Periodic software fifo */
+    if (usb->periodic.head != usb->periodic.tail) {
+        cvmx_usbcx_hptxsts_t periodic_status;
+        periodic_status.u32 = __cvmx_usb_read_csr32(usb, CVMX_USBCX_HPTXSTS(usb->index));
+        more_pending = __cvmx_usb_fill_tx_hw(usb, &usb->periodic, periodic_status.s.ptxfspcavail);
+        /* Keep the empty interrupt enabled only while data is left over */
+        USB_SET_FIELD32(CVMX_USBCX_GINTMSK(usb->index), cvmx_usbcx_gintmsk_t, ptxfempmsk,
+                        (more_pending ? 1 : 0));
+    }
+
+    /* Non periodic software fifo */
+    if (usb->nonperiodic.head != usb->nonperiodic.tail) {
+        cvmx_usbcx_gnptxsts_t nonperiodic_status;
+        nonperiodic_status.u32 = __cvmx_usb_read_csr32(usb, CVMX_USBCX_GNPTXSTS(usb->index));
+        more_pending = __cvmx_usb_fill_tx_hw(usb, &usb->nonperiodic, nonperiodic_status.s.nptxfspcavail);
+        /* Keep the empty interrupt enabled only while data is left over */
+        USB_SET_FIELD32(CVMX_USBCX_GINTMSK(usb->index), cvmx_usbcx_gintmsk_t, nptxfempmsk,
+                        (more_pending ? 1 : 0));
+    }
+
+    CVMX_USB_RETURN_NOTHING();
+}
+
+
+/**
+ * @INTERNAL
+ * Fill the TX FIFO with an outgoing packet
+ *
+ * Queues one entry for the channel on the periodic or non periodic
+ * software fifo and then polls the TX fifos. Nothing is queued for
+ * IN channels, for split completes, or when the transfer size is zero.
+ *
+ * @param usb     USB device state populated by
+ *                cvmx_usb_initialize().
+ * @param channel Channel number to get packet from
+ */
+static void __cvmx_usb_fill_tx_fifo(cvmx_usb_internal_state_t *usb, int channel)
+{
+    cvmx_usbcx_hccharx_t hcchar;
+    cvmx_usbcx_hcspltx_t usbc_hcsplt;
+    cvmx_usbcx_hctsizx_t usbc_hctsiz;
+    cvmx_usb_tx_fifo_t *fifo;
+
+    CVMX_USB_LOG_CALLED();
+    CVMX_USB_LOG_PARAM("%p", usb);
+    CVMX_USB_LOG_PARAM("%d", channel);
+
+    /* We only need to fill data on outbound channels */
+    hcchar.u32 = __cvmx_usb_read_csr32(usb, CVMX_USBCX_HCCHARX(channel, usb->index));
+    if (hcchar.s.epdir != CVMX_USB_DIRECTION_OUT)
+        CVMX_USB_RETURN_NOTHING();
+
+    /* OUT Splits only have data on the start and not the complete */
+    usbc_hcsplt.u32 = __cvmx_usb_read_csr32(usb, CVMX_USBCX_HCSPLTX(channel, usb->index));
+    if (usbc_hcsplt.s.spltena && usbc_hcsplt.s.compsplt)
+        CVMX_USB_RETURN_NOTHING();
+
+    /* Find out how many bytes we need to fill and convert it into 32bit words */
+    usbc_hctsiz.u32 = __cvmx_usb_read_csr32(usb, CVMX_USBCX_HCTSIZX(channel, usb->index));
+    if (!usbc_hctsiz.s.xfersize)
+        CVMX_USB_RETURN_NOTHING();
+
+    /* Interrupt and isochronous endpoints go through the periodic fifo,
+        everything else through the non periodic one */
+    if ((hcchar.s.eptype == CVMX_USB_TRANSFER_INTERRUPT) ||
+        (hcchar.s.eptype == CVMX_USB_TRANSFER_ISOCHRONOUS))
+        fifo = &usb->periodic;
+    else
+        fifo = &usb->nonperiodic;
+
+    /* Queue the entry at the head. The source address is taken from the
+        channel's outbound DMA address register and the size is the byte
+        count rounded up to whole 32bit words */
+    fifo->entry[fifo->head].channel = channel;
+    fifo->entry[fifo->head].address = __cvmx_usb_read_csr64(usb, CVMX_USBNX_DMA0_OUTB_CHN0(usb->index) + channel*8);
+    fifo->entry[fifo->head].size = (usbc_hctsiz.s.xfersize+3)>>2;
+    /* The head wraps to zero after index MAX_CHANNELS.
+        NOTE(review): this assumes entry[] has MAX_CHANNELS+1 slots -
+        confirm against the structure definition */
+    fifo->head++;
+    if (fifo->head > MAX_CHANNELS)
+        fifo->head = 0;
+
+    /* Try to push the queued data into the hardware right away */
+    __cvmx_usb_poll_tx_fifo(usb);
+
+    CVMX_USB_RETURN_NOTHING();
+}
+
+/**
+ * @INTERNAL
+ * Perform channel specific setup for Control transactions. All
+ * the generic stuff will already have been done in
+ * __cvmx_usb_start_channel()
+ *
+ * Depending on the stage of the pipe's head transaction this picks the
+ * PID, the endpoint direction, the split complete bit and the number of
+ * bytes to move, then rewrites the channel's HCTSIZ register with the
+ * resulting transfer size and packet count.
+ *
+ * @param usb     USB device state populated by
+ *                cvmx_usb_initialize().
+ * @param channel Channel to setup
+ * @param pipe    Pipe for control transaction
+ */
+static void __cvmx_usb_start_channel_control(cvmx_usb_internal_state_t *usb,
+                                             int channel,
+                                             cvmx_usb_pipe_t *pipe)
+{
+    cvmx_usb_transaction_t *transaction = pipe->head;
+    cvmx_usb_control_header_t *header = cvmx_phys_to_ptr(transaction->control_header);
+    /* Default to everything not yet transferred; several stages override it */
+    int bytes_to_transfer = transaction->buffer_length - transaction->actual_bytes;
+    int packets_to_transfer;
+    cvmx_usbcx_hctsizx_t usbc_hctsiz;
+
+    CVMX_USB_LOG_CALLED();
+    CVMX_USB_LOG_PARAM("%p", usb);
+    CVMX_USB_LOG_PARAM("%d", channel);
+    CVMX_USB_LOG_PARAM("%p", pipe);
+
+    /* Start from the current register value so fields not set below are kept */
+    usbc_hctsiz.u32 = __cvmx_usb_read_csr32(usb, CVMX_USBCX_HCTSIZX(channel, usb->index));
+
+    /* Bit 0x80 of the header's request_type selects an IN data stage; the
+        status stage always runs in the opposite direction of the data stage */
+    switch (transaction->stage) {
+        case CVMX_USB_STAGE_NON_CONTROL:
+        case CVMX_USB_STAGE_NON_CONTROL_SPLIT_COMPLETE:
+            /* Only logged; the size programming below still runs */
+            cvmx_dprintf("%s: ERROR - Non control stage\n", __FUNCTION__);
+            break;
+        case CVMX_USB_STAGE_SETUP:
+            usbc_hctsiz.s.pid = 3; /* Setup */
+            bytes_to_transfer = sizeof(*header);
+            /* All Control operations start with a setup going OUT */
+            USB_SET_FIELD32(CVMX_USBCX_HCCHARX(channel, usb->index), cvmx_usbcx_hccharx_t, epdir, CVMX_USB_DIRECTION_OUT);
+            /* Setup send the control header instead of the buffer data. The
+                buffer data will be used in the next stage */
+            __cvmx_usb_write_csr64(usb, CVMX_USBNX_DMA0_OUTB_CHN0(usb->index) + channel*8, transaction->control_header);
+            break;
+        case CVMX_USB_STAGE_SETUP_SPLIT_COMPLETE:
+            usbc_hctsiz.s.pid = 3; /* Setup */
+            bytes_to_transfer = 0;
+            /* All Control operations start with a setup going OUT */
+            USB_SET_FIELD32(CVMX_USBCX_HCCHARX(channel, usb->index), cvmx_usbcx_hccharx_t, epdir, CVMX_USB_DIRECTION_OUT);
+            USB_SET_FIELD32(CVMX_USBCX_HCSPLTX(channel, usb->index), cvmx_usbcx_hcspltx_t, compsplt, 1);
+            break;
+        case CVMX_USB_STAGE_DATA:
+            usbc_hctsiz.s.pid = __cvmx_usb_get_data_pid(pipe);
+            /* On a split pipe an IN start split carries no data and an OUT
+                start split carries at most one packet */
+            if (__cvmx_usb_pipe_needs_split(usb, pipe)) {
+                if (header->s.request_type & 0x80)
+                    bytes_to_transfer = 0;
+                else if (bytes_to_transfer > pipe->max_packet)
+                    bytes_to_transfer = pipe->max_packet;
+            }
+            USB_SET_FIELD32(CVMX_USBCX_HCCHARX(channel, usb->index),
+                            cvmx_usbcx_hccharx_t, epdir,
+                            ((header->s.request_type & 0x80) ?
+                             CVMX_USB_DIRECTION_IN :
+                             CVMX_USB_DIRECTION_OUT));
+            break;
+        case CVMX_USB_STAGE_DATA_SPLIT_COMPLETE:
+            usbc_hctsiz.s.pid = __cvmx_usb_get_data_pid(pipe);
+            /* The complete of an OUT data split moves no data */
+            if (!(header->s.request_type & 0x80))
+                bytes_to_transfer = 0;
+            USB_SET_FIELD32(CVMX_USBCX_HCCHARX(channel, usb->index),
+                            cvmx_usbcx_hccharx_t, epdir,
+                            ((header->s.request_type & 0x80) ?
+                             CVMX_USB_DIRECTION_IN :
+                             CVMX_USB_DIRECTION_OUT));
+            USB_SET_FIELD32(CVMX_USBCX_HCSPLTX(channel, usb->index), cvmx_usbcx_hcspltx_t, compsplt, 1);
+            break;
+        case CVMX_USB_STAGE_STATUS:
+            usbc_hctsiz.s.pid = __cvmx_usb_get_data_pid(pipe);
+            bytes_to_transfer = 0;
+            USB_SET_FIELD32(CVMX_USBCX_HCCHARX(channel, usb->index), cvmx_usbcx_hccharx_t, epdir,
+                            ((header->s.request_type & 0x80) ?
+                             CVMX_USB_DIRECTION_OUT :
+                             CVMX_USB_DIRECTION_IN));
+            break;
+        case CVMX_USB_STAGE_STATUS_SPLIT_COMPLETE:
+            usbc_hctsiz.s.pid = __cvmx_usb_get_data_pid(pipe);
+            bytes_to_transfer = 0;
+            USB_SET_FIELD32(CVMX_USBCX_HCCHARX(channel, usb->index), cvmx_usbcx_hccharx_t, epdir,
+                            ((header->s.request_type & 0x80) ?
+                             CVMX_USB_DIRECTION_OUT :
+                             CVMX_USB_DIRECTION_IN));
+            USB_SET_FIELD32(CVMX_USBCX_HCSPLTX(channel, usb->index), cvmx_usbcx_hcspltx_t, compsplt, 1);
+            break;
+    }
+
+    /* Make sure the transfer never exceeds the byte limit of the hardware.
+        Further bytes will be sent as continued transactions */
+    if (bytes_to_transfer > MAX_TRANSFER_BYTES) {
+        /* Round MAX_TRANSFER_BYTES down to a multiple of our packet size */
+        bytes_to_transfer = MAX_TRANSFER_BYTES / pipe->max_packet;
+        bytes_to_transfer *= pipe->max_packet;
+    }
+
+    /* Calculate the number of packets to transfer. If the length is zero
+        we still need to transfer one packet */
+    packets_to_transfer = (bytes_to_transfer + pipe->max_packet - 1) / pipe->max_packet;
+    if (packets_to_transfer == 0)
+        packets_to_transfer = 1;
+    else if ((packets_to_transfer>1) && (usb->init_flags & CVMX_USB_INITIALIZE_FLAGS_NO_DMA)) {
+        /* Limit to one packet when not using DMA. Channels must be restarted
+            between every packet for IN transactions, so there is no reason to
+            do multiple packets in a row */
+        packets_to_transfer = 1;
+        bytes_to_transfer = packets_to_transfer * pipe->max_packet;
+    }
+    else if (packets_to_transfer > MAX_TRANSFER_PACKETS) {
+        /* Limit the number of packet and data transferred to what the
+            hardware can handle */
+        packets_to_transfer = MAX_TRANSFER_PACKETS;
+        bytes_to_transfer = packets_to_transfer * pipe->max_packet;
+    }
+
+    usbc_hctsiz.s.xfersize = bytes_to_transfer;
+    usbc_hctsiz.s.pktcnt = packets_to_transfer;
+
+    __cvmx_usb_write_csr32(usb, CVMX_USBCX_HCTSIZX(channel, usb->index), usbc_hctsiz.u32);
+    CVMX_USB_RETURN_NOTHING();
+}
+
+
+/**
+ * @INTERNAL
+ * Start a channel to perform the pipe's head transaction
+ *
+ * The channel is bound to the pipe and removed from the idle channel
+ * mask, its interrupt masks and DMA addresses are set up, the split,
+ * transfer size and characteristics registers are programmed, transfer
+ * type specific fixups are applied and finally the channel is enabled.
+ * In non DMA mode the TX fifo is then filled by software.
+ *
+ * @param usb     USB device state populated by
+ *                cvmx_usb_initialize().
+ * @param channel Channel to setup
+ * @param pipe    Pipe to start
+ */
+static void __cvmx_usb_start_channel(cvmx_usb_internal_state_t *usb,
+                                     int channel,
+                                     cvmx_usb_pipe_t *pipe)
+{
+    cvmx_usb_transaction_t *transaction = pipe->head;
+
+    CVMX_USB_LOG_CALLED();
+    CVMX_USB_LOG_PARAM("%p", usb);
+    CVMX_USB_LOG_PARAM("%d", channel);
+    CVMX_USB_LOG_PARAM("%p", pipe);
+
+    if (cvmx_unlikely((usb->init_flags & CVMX_USB_INITIALIZE_FLAGS_DEBUG_TRANSFERS) ||
+        (pipe->flags & CVMX_USB_PIPE_FLAGS_DEBUG_TRANSFERS)))
+        cvmx_dprintf("%s: Channel %d started. Pipe %d transaction %d stage %d\n",
+                     __FUNCTION__, channel, __cvmx_usb_get_pipe_handle(usb, pipe),
+                     __cvmx_usb_get_submit_handle(usb, transaction),
+                     transaction->stage);
+
+    /* Make sure all writes to the DMA region get flushed */
+    CVMX_SYNCW;
+
+    /* Attach the channel to the pipe */
+    usb->pipe_for_channel[channel] = pipe;
+    pipe->channel = channel;
+    pipe->flags |= __CVMX_USB_PIPE_FLAGS_SCHEDULED;
+
+    /* Mark this channel as in use */
+    usb->idle_hardware_channels &= ~(1<<channel);
+
+    /* Enable the channel interrupt bits */
+    {
+        cvmx_usbcx_hcintx_t usbc_hcint;
+        cvmx_usbcx_hcintmskx_t usbc_hcintmsk;
+        cvmx_usbcx_haintmsk_t usbc_haintmsk;
+
+        /* Clear all channel status bits by writing back what was read */
+        usbc_hcint.u32 = __cvmx_usb_read_csr32(usb, CVMX_USBCX_HCINTX(channel, usb->index));
+        __cvmx_usb_write_csr32(usb, CVMX_USBCX_HCINTX(channel, usb->index), usbc_hcint.u32);
+
+        /* Channel halted is always enabled */
+        usbc_hcintmsk.u32 = 0;
+        usbc_hcintmsk.s.chhltdmsk = 1;
+        if (usb->init_flags & CVMX_USB_INITIALIZE_FLAGS_NO_DMA) {
+            /* Channels need these extra interrupts when we aren't in DMA mode */
+            usbc_hcintmsk.s.datatglerrmsk = 1;
+            usbc_hcintmsk.s.frmovrunmsk = 1;
+            usbc_hcintmsk.s.bblerrmsk = 1;
+            usbc_hcintmsk.s.xacterrmsk = 1;
+            if (__cvmx_usb_pipe_needs_split(usb, pipe)) {
+                /* Splits don't generate xfercompl, so we need ACK and NYET */
+                usbc_hcintmsk.s.nyetmsk = 1;
+                usbc_hcintmsk.s.ackmsk = 1;
+            }
+            usbc_hcintmsk.s.nakmsk = 1;
+            usbc_hcintmsk.s.stallmsk = 1;
+            usbc_hcintmsk.s.xfercomplmsk = 1;
+        }
+        __cvmx_usb_write_csr32(usb, CVMX_USBCX_HCINTMSKX(channel, usb->index), usbc_hcintmsk.u32);
+
+        /* Enable the channel interrupt to propagate */
+        usbc_haintmsk.u32 = __cvmx_usb_read_csr32(usb, CVMX_USBCX_HAINTMSK(usb->index));
+        usbc_haintmsk.s.haintmsk |= 1<<channel;
+        __cvmx_usb_write_csr32(usb, CVMX_USBCX_HAINTMSK(usb->index), usbc_haintmsk.u32);
+    }
+
+    /* Setup the locations the DMA engines use. Both directions start at the
+        first byte not yet transferred; isochronous transactions additionally
+        add the offset of their first packet */
+    {
+        uint64_t dma_address = transaction->buffer + transaction->actual_bytes;
+        if (transaction->type == CVMX_USB_TRANSFER_ISOCHRONOUS)
+            dma_address = transaction->buffer + transaction->iso_packets[0].offset + transaction->actual_bytes;
+        __cvmx_usb_write_csr64(usb, CVMX_USBNX_DMA0_OUTB_CHN0(usb->index) + channel*8, dma_address);
+        __cvmx_usb_write_csr64(usb, CVMX_USBNX_DMA0_INB_CHN0(usb->index) + channel*8, dma_address);
+    }
+
+    /* Setup both the size of the transfer and the SPLIT characteristics */
+    {
+        cvmx_usbcx_hcspltx_t usbc_hcsplt = {.u32 = 0};
+        cvmx_usbcx_hctsizx_t usbc_hctsiz = {.u32 = 0};
+        int packets_to_transfer;
+        int bytes_to_transfer = transaction->buffer_length - transaction->actual_bytes;
+
+        /* ISOCHRONOUS transactions store each individual transfer size in the
+            packet structure, not the global buffer_length */
+        if (transaction->type == CVMX_USB_TRANSFER_ISOCHRONOUS)
+            bytes_to_transfer = transaction->iso_packets[0].length - transaction->actual_bytes;
+
+        /* We need to do split transactions when we are talking to non high
+            speed devices that are behind a high speed hub */
+        if (__cvmx_usb_pipe_needs_split(usb, pipe)) {
+            /* On the start split phase (stage is even) record the frame number we
+                will need to send the split complete. Only the lower seven bits
+                are stored. Bulk transfers look one frame ahead, all other types
+                two. Any other phase clears the recorded frame */
+            if ((transaction->stage&1) == 0) {
+                if (transaction->type == CVMX_USB_TRANSFER_BULK)
+                    pipe->split_sc_frame = (usb->frame_number + 1) & 0x7f;
+                else
+                    pipe->split_sc_frame = (usb->frame_number + 2) & 0x7f;
+            }
+            else
+                pipe->split_sc_frame = -1;
+
+            usbc_hcsplt.s.spltena = 1;
+            usbc_hcsplt.s.hubaddr = pipe->hub_device_addr;
+            usbc_hcsplt.s.prtaddr = pipe->hub_port;
+            /* Only the non control split complete stage sets this here; control
+                stages set it in __cvmx_usb_start_channel_control() */
+            usbc_hcsplt.s.compsplt = (transaction->stage == CVMX_USB_STAGE_NON_CONTROL_SPLIT_COMPLETE);
+
+            /* SPLIT transactions can only ever transmit one data packet so
+                limit the transfer size to the max packet size */
+            if (bytes_to_transfer > pipe->max_packet)
+                bytes_to_transfer = pipe->max_packet;
+
+            /* ISOCHRONOUS OUT splits are unique in that they limit
+                data transfers to 188 byte chunks representing the
+                begin/middle/end of the data or all */
+            if (!usbc_hcsplt.s.compsplt &&
+                (pipe->transfer_dir == CVMX_USB_DIRECTION_OUT) &&
+                (pipe->transfer_type == CVMX_USB_TRANSFER_ISOCHRONOUS)) {
+                /* Clear the split complete frame number as there isn't going
+                    to be a split complete */
+                pipe->split_sc_frame = -1;
+                /* See if we've started this transfer and sent data */
+                if (transaction->actual_bytes == 0) {
+                    /* Nothing sent yet, this is either a begin or the
+                        entire payload */
+                    if (bytes_to_transfer <= 188)
+                        usbc_hcsplt.s.xactpos = 3; /* Entire payload in one go */
+                    else
+                        usbc_hcsplt.s.xactpos = 2; /* First part of payload */
+                }
+                else {
+                    /* Continuing the previous data, we must either be
+                        in the middle or at the end */
+                    if (bytes_to_transfer <= 188)
+                        usbc_hcsplt.s.xactpos = 1; /* End of payload */
+                    else
+                        usbc_hcsplt.s.xactpos = 0; /* Middle of payload */
+                }
+                /* Again, the transfer size is limited to 188 bytes */
+                if (bytes_to_transfer > 188)
+                    bytes_to_transfer = 188;
+            }
+        }
+
+        /* Make sure the transfer never exceeds the byte limit of the hardware.
+            Further bytes will be sent as continued transactions */
+        if (bytes_to_transfer > MAX_TRANSFER_BYTES) {
+            /* Round MAX_TRANSFER_BYTES down to a multiple of our packet size */
+            bytes_to_transfer = MAX_TRANSFER_BYTES / pipe->max_packet;
+            bytes_to_transfer *= pipe->max_packet;
+        }
+
+        /* Calculate the number of packets to transfer. If the length is zero
+            we still need to transfer one packet */
+        packets_to_transfer = (bytes_to_transfer + pipe->max_packet - 1) / pipe->max_packet;
+        if (packets_to_transfer == 0)
+            packets_to_transfer = 1;
+        else if ((packets_to_transfer>1) && (usb->init_flags & CVMX_USB_INITIALIZE_FLAGS_NO_DMA)) {
+            /* Limit to one packet when not using DMA. Channels must be restarted
+                between every packet for IN transactions, so there is no reason to
+                do multiple packets in a row */
+            packets_to_transfer = 1;
+            bytes_to_transfer = packets_to_transfer * pipe->max_packet;
+        }
+        else if (packets_to_transfer > MAX_TRANSFER_PACKETS) {
+            /* Limit the number of packet and data transferred to what the
+                hardware can handle */
+            packets_to_transfer = MAX_TRANSFER_PACKETS;
+            bytes_to_transfer = packets_to_transfer * pipe->max_packet;
+        }
+
+        usbc_hctsiz.s.xfersize = bytes_to_transfer;
+        usbc_hctsiz.s.pktcnt = packets_to_transfer;
+
+        /* Update the DATA0/DATA1 toggle */
+        usbc_hctsiz.s.pid = __cvmx_usb_get_data_pid(pipe);
+        /* High speed pipes may need a hardware ping before they start */
+        if (pipe->flags & __CVMX_USB_PIPE_FLAGS_NEED_PING)
+            usbc_hctsiz.s.dopng = 1;
+
+        __cvmx_usb_write_csr32(usb, CVMX_USBCX_HCSPLTX(channel, usb->index), usbc_hcsplt.u32);
+        __cvmx_usb_write_csr32(usb, CVMX_USBCX_HCTSIZX(channel, usb->index), usbc_hctsiz.u32);
+    }
+
+    /* Setup the Host Channel Characteristics Register */
+    {
+        cvmx_usbcx_hccharx_t usbc_hcchar = {.u32 = 0};
+
+        /* Set the startframe odd/even properly. This is only used for periodic */
+        usbc_hcchar.s.oddfrm = usb->frame_number&1;
+
+        /* Set the number of back to back packets allowed by this endpoint.
+            Split transactions interpret "ec" as the number of immediate
+            retries of failure. These retries happen too quickly, so splits
+            always use the minimum value of 1. For other pipes multi_count
+            is clamped to the range 1..3 */
+        if (__cvmx_usb_pipe_needs_split(usb, pipe))
+            usbc_hcchar.s.ec = 1;
+        else if (pipe->multi_count < 1)
+            usbc_hcchar.s.ec = 1;
+        else if (pipe->multi_count > 3)
+            usbc_hcchar.s.ec = 3;
+        else
+            usbc_hcchar.s.ec = pipe->multi_count;
+
+        /* Set the rest of the endpoint specific settings */
+        usbc_hcchar.s.devaddr = pipe->device_addr;
+        usbc_hcchar.s.eptype = transaction->type;
+        usbc_hcchar.s.lspddev = (pipe->device_speed == CVMX_USB_SPEED_LOW);
+        usbc_hcchar.s.epdir = pipe->transfer_dir;
+        usbc_hcchar.s.epnum = pipe->endpoint_num;
+        usbc_hcchar.s.mps = pipe->max_packet;
+        __cvmx_usb_write_csr32(usb, CVMX_USBCX_HCCHARX(channel, usb->index), usbc_hcchar.u32);
+    }
+
+    /* Do transaction type specific fixups as needed */
+    switch (transaction->type) {
+        case CVMX_USB_TRANSFER_CONTROL:
+            __cvmx_usb_start_channel_control(usb, channel, pipe);
+            break;
+        case CVMX_USB_TRANSFER_BULK:
+        case CVMX_USB_TRANSFER_INTERRUPT:
+            break;
+        case CVMX_USB_TRANSFER_ISOCHRONOUS:
+            if (!__cvmx_usb_pipe_needs_split(usb, pipe)) {
+                /* ISO transactions require different PIDs depending on direction
+                    and how many packets are needed */
+                if (pipe->transfer_dir == CVMX_USB_DIRECTION_OUT) {
+                    if (pipe->multi_count < 2) /* Need DATA0 */
+                        USB_SET_FIELD32(CVMX_USBCX_HCTSIZX(channel, usb->index), cvmx_usbcx_hctsizx_t, pid, 0);
+                    else /* Need MDATA */
+                        USB_SET_FIELD32(CVMX_USBCX_HCTSIZX(channel, usb->index), cvmx_usbcx_hctsizx_t, pid, 3);
+                }
+            }
+            break;
+    }
+    /* Read back the size and packet count that were finally programmed (the
+        fixups above may have rewritten them) and record them in the
+        transaction */
+    {
+        cvmx_usbcx_hctsizx_t usbc_hctsiz = {.u32 = __cvmx_usb_read_csr32(usb, CVMX_USBCX_HCTSIZX(channel, usb->index))};
+        transaction->xfersize = usbc_hctsiz.s.xfersize;
+        transaction->pktcnt = usbc_hctsiz.s.pktcnt;
+    }
+    /* Remember when we start a split transaction */
+    if (__cvmx_usb_pipe_needs_split(usb, pipe))
+        usb->active_split = transaction;
+    /* Everything is programmed, enable the channel */
+    USB_SET_FIELD32(CVMX_USBCX_HCCHARX(channel, usb->index), cvmx_usbcx_hccharx_t, chena, 1);
+    /* Without DMA the outgoing data has to be pushed by software */
+    if (usb->init_flags & CVMX_USB_INITIALIZE_FLAGS_NO_DMA)
+        __cvmx_usb_fill_tx_fifo(usb, channel);
+    CVMX_USB_RETURN_NOTHING();
+}
+
+
+/**
+ * @INTERNAL
+ * Walk a pipe list and return the first pipe that can be handed to a
+ * hardware channel right now.
+ *
+ * A pipe qualifies when it is not already scheduled, has a transaction
+ * queued, its next transmit frame has been reached, its recorded split
+ * complete frame (if any) has been reached, and no other transaction
+ * currently owns the active split.
+ *
+ * @param usb    USB device state populated by
+ *               cvmx_usb_initialize().
+ * @param list   Pipe list to search
+ * @param current_frame
+ *               Frame counter to use as a time reference.
+ *
+ * @return Pipe or NULL if none are ready
+ */
+static cvmx_usb_pipe_t *__cvmx_usb_find_ready_pipe(cvmx_usb_internal_state_t *usb, cvmx_usb_pipe_list_t *list, uint64_t current_frame)
+{
+    cvmx_usb_pipe_t *candidate;
+
+    for (candidate = list->head; candidate; candidate = candidate->next) {
+        /* Already bound to a channel */
+        if (candidate->flags & __CVMX_USB_PIPE_FLAGS_SCHEDULED)
+            continue;
+        /* Nothing queued on this pipe */
+        if (!candidate->head)
+            continue;
+        /* Not due yet */
+        if (candidate->next_tx_frame > current_frame)
+            continue;
+        /* A split complete frame is recorded (not -1) and, comparing modulo
+            128, the current frame has not reached it yet */
+        if ((candidate->split_sc_frame != -1) &&
+            ((((int)current_frame - (int)candidate->split_sc_frame) & 0x7f) >= 0x40))
+            continue;
+        /* A split is in flight and it belongs to some other transaction */
+        if (usb->active_split && (usb->active_split != candidate->head))
+            continue;
+
+        CVMX_PREFETCH(candidate, 128);
+        CVMX_PREFETCH(candidate->head, 0);
+        return candidate;
+    }
+    return NULL;
+}
+
+
+/**
+ * @INTERNAL
+ * Called whenever a pipe might need to be scheduled to the
+ * hardware.
+ *
+ * @param usb    USB device state populated by
+ *               cvmx_usb_initialize().
+ * @param is_sof True if this schedule was called on a SOF interrupt.
+ */
+static void __cvmx_usb_schedule(cvmx_usb_internal_state_t *usb, int is_sof)
+{
+    int channel;
+    cvmx_usb_pipe_t *pipe;
+    int need_sof;
+    cvmx_usb_transfer_t ttype;
+
+    CVMX_USB_LOG_CALLED();
+    CVMX_USB_LOG_PARAM("%p", usb);
+
+    if (usb->init_flags & CVMX_USB_INITIALIZE_FLAGS_NO_DMA) {
+        /* Without DMA we need to be careful to not schedule something at the end of a frame and cause an overrun */
+        cvmx_usbcx_hfnum_t hfnum = {.u32 = __cvmx_usb_read_csr32(usb, CVMX_USBCX_HFNUM(usb->index))};
+        cvmx_usbcx_hfir_t hfir = {.u32 = __cvmx_usb_read_csr32(usb, CVMX_USBCX_HFIR(usb->index))};
+        if (hfnum.s.frrem < hfir.s.frint/4)
+            goto done;
+    }
+
+    while (usb->idle_hardware_channels) {
+        /* Find an idle channel */
+        CVMX_CLZ(channel, usb->idle_hardware_channels);
+        channel = 31 - channel;
+        if (cvmx_unlikely(channel > 7)) {
+            if (cvmx_unlikely(usb->init_flags & CVMX_USB_INITIALIZE_FLAGS_DEBUG_INFO))
+                cvmx_dprintf("%s: Idle hardware channels has a channel higher than 7. This is wrong\n", __FUNCTION__);
+            break;
+        }
+
+        /* Find a pipe needing service */
+        pipe = NULL;
+        if (is_sof) {
+            /* Only process periodic pipes on SOF interrupts. This way we are
+                sure that the periodic data is sent in the beginning of the
+                frame */
+            pipe = __cvmx_usb_find_ready_pipe(usb, usb->active_pipes + CVMX_USB_TRANSFER_ISOCHRONOUS, usb->frame_number);
+            if (cvmx_likely(!pipe))
+                pipe = __cvmx_usb_find_ready_pipe(usb, usb->active_pipes + CVMX_USB_TRANSFER_INTERRUPT, usb->frame_number);
+        }
+        if (cvmx_likely(!pipe)) {
+            pipe = __cvmx_usb_find_ready_pipe(usb, usb->active_pipes + CVMX_USB_TRANSFER_CONTROL, usb->frame_number);
+            if (cvmx_likely(!pipe))
+                pipe = __cvmx_usb_find_ready_pipe(usb, usb->active_pipes + CVMX_USB_TRANSFER_BULK, usb->frame_number);
+        }
+        if (!pipe)
+            break;
+
+        CVMX_USB_LOG_PARAM("%d", channel);
+        CVMX_USB_LOG_PARAM("%p", pipe);
+
+        if (cvmx_unlikely((usb->init_flags & CVMX_USB_INITIALIZE_FLAGS_DEBUG_TRANSFERS) ||
+            (pipe->flags & CVMX_USB_PIPE_FLAGS_DEBUG_TRANSFERS))) {
+            cvmx_usb_transaction_t *transaction = pipe->head;
+            const cvmx_usb_control_header_t *header = (transaction->control_header) ? cvmx_phys_to_ptr(transaction->control_header) : NULL;
+            const char *dir = (pipe->transfer_dir == CVMX_USB_DIRECTION_IN) ? "IN" : "OUT";
+            const char *type;
+            switch (pipe->transfer_type) {
+                case CVMX_USB_TRANSFER_CONTROL:
+                    type = "SETUP";
+                    dir = (header->s.request_type & 0x80) ? "IN" : "OUT";
+                    break;
+                case CVMX_USB_TRANSFER_ISOCHRONOUS:
+                    type = "ISOCHRONOUS";
+                    break;
+                case CVMX_USB_TRANSFER_BULK:
+                    type = "BULK";
+                    break;
+                default: /* CVMX_USB_TRANSFER_INTERRUPT */
+                    type = "INTERRUPT";
+                    break;
+            }
+            cvmx_dprintf("%s: Starting pipe %d, transaction %d on channel %d. %s %s len=%d header=0x%llx\n",
+                         __FUNCTION__, __cvmx_usb_get_pipe_handle(usb, pipe),
+                         __cvmx_usb_get_submit_handle(usb, transaction),
+                         channel, type, dir,
+                         transaction->buffer_length,
+                         (header) ? (unsigned long long)header->u64 : 0ull);
+        }
+        __cvmx_usb_start_channel(usb, channel, pipe);
+    }
+
+done:
+    /* Only enable SOF interrupts when we have transactions pending in the
+        future that might need to be scheduled */
+    need_sof = 0;
+    for (ttype=CVMX_USB_TRANSFER_CONTROL; ttype<=CVMX_USB_TRANSFER_INTERRUPT; ttype++) {
+        pipe = usb->active_pipes[ttype].head;
+        while (pipe) {
+            if (pipe->next_tx_frame > usb->frame_number) {
+                need_sof = 1;
+                break;
+            }
+            pipe=pipe->next;
+        }
+    }
+    USB_SET_FIELD32(CVMX_USBCX_GINTMSK(usb->index), cvmx_usbcx_gintmsk_t, sofmsk, need_sof);
+    CVMX_USB_RETURN_NOTHING();
+}
+
+
+/**
+ * @INTERNAL
+ * Invoke the callback associated with an event, if there is one.
+ *
+ * The callback and user data default to the entries stored in the USB
+ * state for the given reason. For CVMX_USB_CALLBACK_TRANSFER_COMPLETE a
+ * callback attached to the transaction itself takes precedence. The pipe
+ * and submit handles handed to the callback are -1 when the matching
+ * pointer is NULL, and the byte count is zero without a transaction.
+ *
+ * @param usb    USB device state populated by
+ *               cvmx_usb_initialize().
+ * @param pipe   Pipe the callback is for or NULL
+ * @param transaction
+ *               Transaction the callback is for or NULL
+ * @param reason Reason this callback is being called
+ * @param complete_code
+ *               Completion code for the transaction, if any
+ */
+static void __cvmx_usb_perform_callback(cvmx_usb_internal_state_t *usb,
+                                        cvmx_usb_pipe_t *pipe,
+                                        cvmx_usb_transaction_t *transaction,
+                                        cvmx_usb_callback_t reason,
+                                        cvmx_usb_complete_t complete_code)
+{
+    /* Start from the handler registered for this reason */
+    cvmx_usb_callback_func_t callback = usb->callback[reason];
+    void *user_data = usb->callback_data[reason];
+    int pipe_handle = (pipe) ? __cvmx_usb_get_pipe_handle(usb, pipe) : -1;
+    int submit_handle = -1;
+    int bytes_transferred = 0;
+
+    if (transaction) {
+        submit_handle = __cvmx_usb_get_submit_handle(usb, transaction);
+        bytes_transferred = transaction->actual_bytes;
+
+        /* A per-transaction callback replaces the registered one, but only
+            for transfer completion events */
+        if (transaction->callback && (reason == CVMX_USB_CALLBACK_TRANSFER_COMPLETE)) {
+            callback = transaction->callback;
+            user_data = transaction->callback_data;
+        }
+    }
+
+    /* Nobody is listening for this event */
+    if (callback == NULL)
+        return;
+
+    if (cvmx_unlikely(usb->init_flags & CVMX_USB_INITIALIZE_FLAGS_DEBUG_CALLBACKS)) {
+        cvmx_dprintf("%*s%s: calling callback %p(usb=%p, complete_code=%s, "
+                     "pipe_handle=%d, submit_handle=%d, bytes_transferred=%d, user_data=%p);\n",
+                     2*usb->indent, "", __FUNCTION__, callback, usb,
+                     __cvmx_usb_complete_to_string(complete_code),
+                     pipe_handle, submit_handle, bytes_transferred, user_data);
+    }
+
+    callback((cvmx_usb_state_t *)usb, reason, complete_code,
+             pipe_handle, submit_handle, bytes_transferred, user_data);
+
+    /* The flags are read again here rather than cached from before the call */
+    if (cvmx_unlikely(usb->init_flags & CVMX_USB_INITIALIZE_FLAGS_DEBUG_CALLBACKS)) {
+        cvmx_dprintf("%*s%s: callback %p complete\n", 2*usb->indent, "",
+                      __FUNCTION__, callback);
+    }
+}
+
+
+/**
+ * @INTERNAL
+ * Finish a transaction: unlink it from its pipe, report it through the
+ * transfer-complete callback and free it.
+ *
+ * Isochronous transactions are the exception. The status and length of the
+ * current ISO packet are always recorded, but while further packets remain
+ * and the current one succeeded, the transaction is only advanced to the
+ * next packet and stays queued; no callback is made and nothing is freed.
+ *
+ * @param usb    USB device state populated by
+ *               cvmx_usb_initialize().
+ * @param pipe   Pipe the transaction is on
+ * @param transaction
+ *               Transaction that completed
+ * @param complete_code
+ *               Completion code
+ */
+static void __cvmx_usb_perform_complete(cvmx_usb_internal_state_t * usb,
+                                        cvmx_usb_pipe_t *pipe,
+                                        cvmx_usb_transaction_t *transaction,
+                                        cvmx_usb_complete_t complete_code)
+{
+    int iso_continues = 0;
+
+    CVMX_USB_LOG_CALLED();
+    CVMX_USB_LOG_PARAM("%p", usb);
+    CVMX_USB_LOG_PARAM("%p", pipe);
+    CVMX_USB_LOG_PARAM("%p", transaction);
+    CVMX_USB_LOG_PARAM("%d", complete_code);
+
+    /* A completing transaction can no longer be the split in progress */
+    if (transaction == usb->active_split)
+        usb->active_split = NULL;
+
+    /* An isochronous transaction may consist of several packets, so one data
+        transfer does not necessarily finish it */
+    if (cvmx_unlikely(transaction->type == CVMX_USB_TRANSFER_ISOCHRONOUS)) {
+        cvmx_usb_iso_packet_t *packet = transaction->iso_packets;
+
+        /* Record the outcome of the packet that was just transferred */
+        packet->length = transaction->actual_bytes;
+        packet->status = complete_code;
+
+        /* On success with packets left over, step to the next packet and
+            leave the transaction at the head of the pipe */
+        if ((complete_code == CVMX_USB_COMPLETE_SUCCESS) && (transaction->iso_number_packets > 1)) {
+            transaction->iso_packets = packet + 1;
+            transaction->iso_number_packets--;
+            transaction->actual_bytes = 0;
+            transaction->stage = CVMX_USB_STAGE_NON_CONTROL;
+            iso_continues = 1;
+        }
+    }
+
+    if (!iso_continues) {
+        /* Unlink the transaction from the pipe's doubly linked list */
+        if (transaction->prev)
+            transaction->prev->next = transaction->next;
+        else
+            pipe->head = transaction->next;
+        if (transaction->next)
+            transaction->next->prev = transaction->prev;
+        else
+            pipe->tail = transaction->prev;
+
+        /* A pipe with nothing queued moves from its active list to the idle
+            list */
+        if (pipe->head == NULL) {
+            __cvmx_usb_remove_pipe(usb->active_pipes + pipe->transfer_type, pipe);
+            __cvmx_usb_append_pipe(&usb->idle_pipes, pipe);
+        }
+
+        /* The transaction is handed to the callback before it is freed */
+        __cvmx_usb_perform_callback(usb, pipe, transaction,
+                                    CVMX_USB_CALLBACK_TRANSFER_COMPLETE,
+                                    complete_code);
+        __cvmx_usb_free_transaction(usb, transaction);
+    }
+
+    CVMX_USB_RETURN_NOTHING();
+}
+
+
+/**
+ * @INTERNAL
+ * Submit a usb transaction to a pipe. Called for all types
+ * of transactions.
+ *
+ * The pipe handle is validated here: it must be in range, refer to an
+ * open pipe, and that pipe's transfer type must match "type". A
+ * transaction is then allocated, filled in and appended to the pipe's
+ * queue. If the pipe had nothing queued it is moved from the idle list to
+ * the active list for its transfer type and the scheduler is invoked.
+ *
+ * @param usb       USB device state
+ * @param pipe_handle
+ *                  Which pipe to submit to. Will be validated in this function.
+ * @param type      Transaction type
+ * @param flags     Flags for the transaction
+ * @param buffer    User buffer for the transaction
+ * @param buffer_length
+ *                  User buffer's length in bytes
+ * @param control_header
+ *                  For control transactions, the 8 byte standard header
+ * @param iso_start_frame
+ *                  For ISO transactions, the start frame. Stored in the
+ *                  transaction but not otherwise used by this function.
+ * @param iso_number_packets
+ *                  For ISO, the number of packet in the transaction.
+ * @param iso_packets
+ *                  A description of each ISO packet
+ * @param callback  User callback to call when the transaction completes
+ * @param user_data User's data for the callback
+ *
+ * @return Submit handle or negative on failure. Matches the result
+ *         in the external API.
+ */
+static int __cvmx_usb_submit_transaction(cvmx_usb_internal_state_t *usb,
+                                         int pipe_handle,
+                                         cvmx_usb_transfer_t type,
+                                         int flags,
+                                         uint64_t buffer,
+                                         int buffer_length,
+                                         uint64_t control_header,
+                                         int iso_start_frame,
+                                         int iso_number_packets,
+                                         cvmx_usb_iso_packet_t *iso_packets,
+                                         cvmx_usb_callback_func_t callback,
+                                         void *user_data)
+{
+    int submit_handle;
+    cvmx_usb_transaction_t *transaction;
+    cvmx_usb_pipe_t *pipe;
+
+    CVMX_USB_LOG_CALLED();
+    if (cvmx_unlikely((pipe_handle < 0) || (pipe_handle >= MAX_PIPES)))
+        CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+
+    /* Form the pipe pointer only after the handle is known to be in range;
+        pointer arithmetic with an out-of-range index is undefined */
+    pipe = usb->pipe + pipe_handle;
+
+    /* Fail if the pipe isn't open */
+    if (cvmx_unlikely((pipe->flags & __CVMX_USB_PIPE_FLAGS_OPEN) == 0))
+        CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+    /* Fail if the pipe was opened for a different transfer type */
+    if (cvmx_unlikely(pipe->transfer_type != type))
+        CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+
+    transaction = __cvmx_usb_alloc_transaction(usb);
+    if (cvmx_unlikely(!transaction))
+        CVMX_USB_RETURN(CVMX_USB_NO_MEMORY);
+
+    transaction->type = type;
+    /* OR the caller's flags in so flags set by the allocator are kept */
+    transaction->flags |= flags;
+    transaction->buffer = buffer;
+    transaction->buffer_length = buffer_length;
+    transaction->control_header = control_header;
+    transaction->iso_start_frame = iso_start_frame; /* FIXME: This is not used, implement it */
+    transaction->iso_number_packets = iso_number_packets;
+    transaction->iso_packets = iso_packets;
+    transaction->callback = callback;
+    transaction->callback_data = user_data;
+    /* Control transfers begin with the SETUP stage; everything else has a
+        single non-control stage */
+    if (transaction->type == CVMX_USB_TRANSFER_CONTROL)
+        transaction->stage = CVMX_USB_STAGE_SETUP;
+    else
+        transaction->stage = CVMX_USB_STAGE_NON_CONTROL;
+
+    /* Append the transaction to the tail of the pipe's queue */
+    transaction->next = NULL;
+    if (pipe->tail) {
+        transaction->prev = pipe->tail;
+        transaction->prev->next = transaction;
+    }
+    else {
+        /* The pipe had nothing queued. If its next transmit frame lies in
+            the past, move it forward to a later frame that keeps the same
+            phase relative to the pipe's interval */
+        if (pipe->next_tx_frame < usb->frame_number)
+            pipe->next_tx_frame = usb->frame_number + pipe->interval -
+                (usb->frame_number - pipe->next_tx_frame) % pipe->interval;
+        transaction->prev = NULL;
+        pipe->head = transaction;
+        __cvmx_usb_remove_pipe(&usb->idle_pipes, pipe);
+        __cvmx_usb_append_pipe(usb->active_pipes + pipe->transfer_type, pipe);
+    }
+    pipe->tail = transaction;
+
+    submit_handle = __cvmx_usb_get_submit_handle(usb, transaction);
+
+    /* We may need to schedule the pipe if this was the head of the pipe */
+    if (!transaction->prev)
+        __cvmx_usb_schedule(usb, 0);
+
+    CVMX_USB_RETURN(submit_handle);
+}
+
+
+/**
+ * Queue a USB Bulk transfer on a pipe.
+ *
+ * @param state     USB device state populated by
+ *                  cvmx_usb_initialize().
+ * @param pipe_handle
+ *                  Handle to the pipe for the transfer. It is
+ *                  validated by the common submit code.
+ * @param buffer    Physical address of the data buffer in
+ *                  memory. Note that this is NOT A POINTER, but
+ *                  the full 64bit physical address of the
+ *                  buffer. A zero address is rejected with
+ *                  CVMX_USB_INVALID_PARAM, whatever the length.
+ * @param buffer_length
+ *                  Length of buffer in bytes. Must not be
+ *                  negative.
+ * @param callback  Function to call when this transaction
+ *                  completes. If this parameter is NULL, the
+ *                  generic callback registered through
+ *                  cvmx_usb_register_callback is called
+ *                  instead. If both are NULL, then there is no
+ *                  way to know when a transaction completes.
+ * @param user_data User supplied data returned when the
+ *                  callback is called. This is only used if
+ *                  callback is not NULL.
+ *
+ * @return A submitted transaction handle or negative on
+ *         failure. Negative values are failure codes from
+ *         cvmx_usb_status_t.
+ */
+int cvmx_usb_submit_bulk(cvmx_usb_state_t *state, int pipe_handle,
+                                uint64_t buffer, int buffer_length,
+                                cvmx_usb_callback_func_t callback,
+                                void *user_data)
+{
+    cvmx_usb_internal_state_t *usb = (cvmx_usb_internal_state_t*)state;
+    int submit_handle;
+
+    CVMX_USB_LOG_CALLED();
+    CVMX_USB_LOG_PARAM("%p", state);
+    CVMX_USB_LOG_PARAM("%d", pipe_handle);
+    CVMX_USB_LOG_PARAM("0x%llx", (unsigned long long)buffer);
+    CVMX_USB_LOG_PARAM("%d", buffer_length);
+
+    /* Only the buffer is checked here; the pipe handle is checked by the
+        common submit routine */
+    if (cvmx_unlikely((buffer == 0) || (buffer_length < 0)))
+        CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+
+    /* Bulk transfers carry no flags, control header or ISO description */
+    submit_handle = __cvmx_usb_submit_transaction(usb, pipe_handle,
+                                                  CVMX_USB_TRANSFER_BULK,
+                                                  0,      /* flags */
+                                                  buffer, buffer_length,
+                                                  0,      /* control_header */
+                                                  0,      /* iso_start_frame */
+                                                  0,      /* iso_number_packets */
+                                                  NULL,   /* iso_packets */
+                                                  callback, user_data);
+    CVMX_USB_RETURN(submit_handle);
+}
+
+
+/**
+ * Queue a USB Interrupt transfer on a pipe.
+ *
+ * @param state     USB device state populated by
+ *                  cvmx_usb_initialize().
+ * @param pipe_handle
+ *                  Handle to the pipe for the transfer. It is
+ *                  validated by the common submit code.
+ * @param buffer    Physical address of the data buffer in
+ *                  memory. Note that this is NOT A POINTER, but
+ *                  the full 64bit physical address of the
+ *                  buffer. A zero address is rejected with
+ *                  CVMX_USB_INVALID_PARAM, whatever the length.
+ * @param buffer_length
+ *                  Length of buffer in bytes. Must not be
+ *                  negative.
+ * @param callback  Function to call when this transaction
+ *                  completes. If this parameter is NULL, the
+ *                  generic callback registered through
+ *                  cvmx_usb_register_callback is called
+ *                  instead. If both are NULL, then there is no
+ *                  way to know when a transaction completes.
+ * @param user_data User supplied data returned when the
+ *                  callback is called. This is only used if
+ *                  callback is not NULL.
+ *
+ * @return A submitted transaction handle or negative on
+ *         failure. Negative values are failure codes from
+ *         cvmx_usb_status_t.
+ */
+int cvmx_usb_submit_interrupt(cvmx_usb_state_t *state, int pipe_handle,
+                              uint64_t buffer, int buffer_length,
+                              cvmx_usb_callback_func_t callback,
+                              void *user_data)
+{
+    cvmx_usb_internal_state_t *usb = (cvmx_usb_internal_state_t*)state;
+    int submit_handle;
+
+    CVMX_USB_LOG_CALLED();
+    CVMX_USB_LOG_PARAM("%p", state);
+    CVMX_USB_LOG_PARAM("%d", pipe_handle);
+    CVMX_USB_LOG_PARAM("0x%llx", (unsigned long long)buffer);
+    CVMX_USB_LOG_PARAM("%d", buffer_length);
+
+    /* Only the buffer is checked here; the pipe handle is checked by the
+        common submit routine */
+    if (cvmx_unlikely((buffer == 0) || (buffer_length < 0)))
+        CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+
+    /* Interrupt transfers carry no flags, control header or ISO description */
+    submit_handle = __cvmx_usb_submit_transaction(usb, pipe_handle,
+                                                  CVMX_USB_TRANSFER_INTERRUPT,
+                                                  0,      /* flags */
+                                                  buffer, buffer_length,
+                                                  0,      /* control_header */
+                                                  0,      /* iso_start_frame */
+                                                  0,      /* iso_number_packets */
+                                                  NULL,   /* iso_packets */
+                                                  callback, user_data);
+    CVMX_USB_RETURN(submit_handle);
+}
+
+
+/**
+ * Queue a USB Control transfer on a pipe.
+ *
+ * @param state     USB device state populated by
+ *                  cvmx_usb_initialize().
+ * @param pipe_handle
+ *                  Handle to the pipe for the transfer. It is
+ *                  validated by the common submit code.
+ * @param control_header
+ *                  USB 8 byte control header physical address.
+ *                  Note that this is NOT A POINTER, but the
+ *                  full 64bit physical address of the buffer.
+ *                  Must not be zero.
+ * @param buffer    Physical address of the data buffer in
+ *                  memory. Note that this is NOT A POINTER, but
+ *                  the full 64bit physical address of the
+ *                  buffer. This may be zero, in which case
+ *                  buffer_length must be zero too.
+ * @param buffer_length
+ *                  Length of buffer in bytes. When bit 0x80 of
+ *                  the header's request_type is clear, this
+ *                  value is replaced by the length field of the
+ *                  control header.
+ * @param callback  Function to call when this transaction
+ *                  completes. If this parameter is NULL, the
+ *                  generic callback registered through
+ *                  cvmx_usb_register_callback is called
+ *                  instead. If both are NULL, then there is no
+ *                  way to know when a transaction completes.
+ * @param user_data User supplied data returned when the
+ *                  callback is called. This is only used if
+ *                  callback is not NULL.
+ *
+ * @return A submitted transaction handle or negative on
+ *         failure. Negative values are failure codes from
+ *         cvmx_usb_status_t.
+ */
+int cvmx_usb_submit_control(cvmx_usb_state_t *state, int pipe_handle,
+                            uint64_t control_header,
+                            uint64_t buffer, int buffer_length,
+                            cvmx_usb_callback_func_t callback,
+                            void *user_data)
+{
+    cvmx_usb_internal_state_t *usb = (cvmx_usb_internal_state_t*)state;
+    cvmx_usb_control_header_t *header = cvmx_phys_to_ptr(control_header);
+    int submit_handle;
+
+    CVMX_USB_LOG_CALLED();
+    CVMX_USB_LOG_PARAM("%p", state);
+    CVMX_USB_LOG_PARAM("%d", pipe_handle);
+    CVMX_USB_LOG_PARAM("0x%llx", (unsigned long long)control_header);
+    CVMX_USB_LOG_PARAM("0x%llx", (unsigned long long)buffer);
+    CVMX_USB_LOG_PARAM("%d", buffer_length);
+
+    /* The pipe handle is checked by the common submit routine. The header
+        is not dereferenced until its address is known to be non-zero */
+    if (cvmx_unlikely(control_header == 0))
+        CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+
+    if (buffer) {
+        /* A buffer paired with a zero length is tolerated because some
+            drivers submit exactly that; only a negative length is an error */
+        if (cvmx_unlikely(buffer_length < 0))
+            CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+    }
+    else {
+        /* Without a buffer there can be no data */
+        if (cvmx_unlikely(buffer_length != 0))
+            CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+    }
+
+    /* With the 0x80 bit of request_type clear, take the length from the
+        header (stored little endian) instead of from the caller */
+    if (!(header->s.request_type & 0x80))
+        buffer_length = cvmx_le16_to_cpu(header->s.length);
+
+    submit_handle = __cvmx_usb_submit_transaction(usb, pipe_handle,
+                                                  CVMX_USB_TRANSFER_CONTROL,
+                                                  0,      /* flags */
+                                                  buffer, buffer_length,
+                                                  control_header,
+                                                  0,      /* iso_start_frame */
+                                                  0,      /* iso_number_packets */
+                                                  NULL,   /* iso_packets */
+                                                  callback, user_data);
+    CVMX_USB_RETURN(submit_handle);
+}
+
+
+/**
+ * Queue a USB Isochronous transfer on a pipe.
+ *
+ * @param state     USB device state populated by
+ *                  cvmx_usb_initialize().
+ * @param pipe_handle
+ *                  Handle to the pipe for the transfer. It is
+ *                  validated by the common submit code.
+ * @param start_frame
+ *                  Number of frames into the future to schedule
+ *                  this transaction. Must not be negative.
+ * @param flags     Flags to control the transfer. Only
+ *                  CVMX_USB_ISOCHRONOUS_FLAGS_ALLOW_SHORT and
+ *                  CVMX_USB_ISOCHRONOUS_FLAGS_ASAP are accepted.
+ *                  See cvmx_usb_isochronous_flags_t for the
+ *                  flag definitions.
+ * @param number_packets
+ *                  Number of sequential packets to transfer,
+ *                  at least one. "packets" is a pointer to an
+ *                  array of this many packet structures.
+ * @param packets   Description of each transfer packet as
+ *                  defined by cvmx_usb_iso_packet_t. Must not
+ *                  be NULL. The array is referenced, not
+ *                  copied, so it must stay valid until the
+ *                  complete callback is called.
+ * @param buffer    Physical address of the data buffer in
+ *                  memory. Note that this is NOT A POINTER, but
+ *                  the full 64bit physical address of the
+ *                  buffer. A zero address is rejected with
+ *                  CVMX_USB_INVALID_PARAM, whatever the length.
+ * @param buffer_length
+ *                  Length of buffer in bytes. Must not be
+ *                  negative.
+ * @param callback  Function to call when this transaction
+ *                  completes. If this parameter is NULL, the
+ *                  generic callback registered through
+ *                  cvmx_usb_register_callback is called
+ *                  instead. If both are NULL, then there is no
+ *                  way to know when a transaction completes.
+ * @param user_data User supplied data returned when the
+ *                  callback is called. This is only used if
+ *                  callback is not NULL.
+ *
+ * @return A submitted transaction handle or negative on
+ *         failure. Negative values are failure codes from
+ *         cvmx_usb_status_t.
+ */
+int cvmx_usb_submit_isochronous(cvmx_usb_state_t *state, int pipe_handle,
+                                int start_frame, int flags,
+                                int number_packets,
+                                cvmx_usb_iso_packet_t packets[],
+                                uint64_t buffer, int buffer_length,
+                                cvmx_usb_callback_func_t callback,
+                                void *user_data)
+{
+    cvmx_usb_internal_state_t *usb = (cvmx_usb_internal_state_t*)state;
+    int submit_handle;
+
+    CVMX_USB_LOG_CALLED();
+    CVMX_USB_LOG_PARAM("%p", state);
+    CVMX_USB_LOG_PARAM("%d", pipe_handle);
+    CVMX_USB_LOG_PARAM("%d", start_frame);
+    CVMX_USB_LOG_PARAM("0x%x", flags);
+    CVMX_USB_LOG_PARAM("%d", number_packets);
+    CVMX_USB_LOG_PARAM("%p", packets);
+    CVMX_USB_LOG_PARAM("0x%llx", (unsigned long long)buffer);
+    CVMX_USB_LOG_PARAM("%d", buffer_length);
+
+    /* Every argument except the pipe handle is checked here; the handle is
+        checked by the common submit routine. Any single failure rejects the
+        request */
+    if (cvmx_unlikely((start_frame < 0) ||
+                      (flags & ~(CVMX_USB_ISOCHRONOUS_FLAGS_ALLOW_SHORT | CVMX_USB_ISOCHRONOUS_FLAGS_ASAP)) ||
+                      (number_packets < 1) ||
+                      (packets == NULL) ||
+                      (buffer == 0) ||
+                      (buffer_length < 0)))
+        CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+
+    /* Isochronous transfers have no control header */
+    submit_handle = __cvmx_usb_submit_transaction(usb, pipe_handle,
+                                                  CVMX_USB_TRANSFER_ISOCHRONOUS,
+                                                  flags,
+                                                  buffer, buffer_length,
+                                                  0,      /* control_header */
+                                                  start_frame,
+                                                  number_packets,
+                                                  packets,
+                                                  callback, user_data);
+    CVMX_USB_RETURN(submit_handle);
+}
+
+
+/**
+ * Cancel one outstanding request in a pipe. Canceling a request
+ * fails if the transaction has already completed before cancel
+ * is called, or if the transaction is not queued on the given
+ * pipe. On success the transaction is completed with
+ * CVMX_USB_COMPLETE_CANCEL from within this call, which invokes
+ * the transfer-complete callback and frees the transaction.
+ *
+ * @param state  USB device state populated by
+ *               cvmx_usb_initialize().
+ * @param pipe_handle
+ *               Pipe handle to cancel requests in.
+ * @param submit_handle
+ *               Handle to transaction to cancel, returned by the submit function.
+ *
+ * @return CVMX_USB_SUCCESS or a negative error code defined in
+ *         cvmx_usb_status_t. CVMX_USB_INVALID_PARAM is returned
+ *         for an out-of-range handle, a pipe that is not open, a
+ *         transaction that is not in use, or a transaction that
+ *         does not belong to the pipe.
+ */
+cvmx_usb_status_t cvmx_usb_cancel(cvmx_usb_state_t *state, int pipe_handle,
+                                  int submit_handle)
+{
+    cvmx_usb_transaction_t *transaction;
+    cvmx_usb_transaction_t *queued;
+    cvmx_usb_internal_state_t *usb = (cvmx_usb_internal_state_t*)state;
+    cvmx_usb_pipe_t *pipe;
+
+    CVMX_USB_LOG_CALLED();
+    CVMX_USB_LOG_PARAM("%p", state);
+    CVMX_USB_LOG_PARAM("%d", pipe_handle);
+    CVMX_USB_LOG_PARAM("%d", submit_handle);
+
+    if (cvmx_unlikely((pipe_handle < 0) || (pipe_handle >= MAX_PIPES)))
+        CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+    if (cvmx_unlikely((submit_handle < 0) || (submit_handle >= MAX_TRANSACTIONS)))
+        CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+
+    /* Form the pointers only after both handles are known to be in range;
+        pointer arithmetic with an out-of-range index is undefined */
+    pipe = usb->pipe + pipe_handle;
+    transaction = usb->transaction + submit_handle;
+
+    /* Fail if the pipe isn't open */
+    if (cvmx_unlikely((pipe->flags & __CVMX_USB_PIPE_FLAGS_OPEN) == 0))
+        CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+
+    /* Fail if this transaction already completed */
+    if (cvmx_unlikely((transaction->flags & __CVMX_USB_TRANSACTION_FLAGS_IN_USE) == 0))
+        CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+
+    /* Fail if the transaction is not queued on this pipe. Completing it
+        against the wrong pipe would rewrite that pipe's head/tail and
+        corrupt both transaction lists */
+    for (queued = pipe->head; queued && (queued != transaction); queued = queued->next)
+        ;
+    if (cvmx_unlikely(!queued))
+        CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+
+    /* If the transaction is the HEAD of the queue and scheduled. We need to
+        treat it special */
+    if ((pipe->head == transaction) &&
+        (pipe->flags & __CVMX_USB_PIPE_FLAGS_SCHEDULED)) {
+        cvmx_usbcx_hccharx_t usbc_hcchar;
+
+        /* Detach the pipe from its hardware channel before touching the
+            channel registers */
+        usb->pipe_for_channel[pipe->channel] = NULL;
+        pipe->flags &= ~__CVMX_USB_PIPE_FLAGS_SCHEDULED;
+
+        CVMX_SYNCW;
+
+        usbc_hcchar.u32 = __cvmx_usb_read_csr32(usb, CVMX_USBCX_HCCHARX(pipe->channel, usb->index));
+        /* If the channel isn't enabled then the transaction already completed */
+        if (usbc_hcchar.s.chena) {
+            /* Request that the still-enabled channel be disabled */
+            usbc_hcchar.s.chdis = 1;
+            __cvmx_usb_write_csr32(usb, CVMX_USBCX_HCCHARX(pipe->channel, usb->index), usbc_hcchar.u32);
+        }
+    }
+    __cvmx_usb_perform_complete(usb, pipe, transaction, CVMX_USB_COMPLETE_CANCEL);
+    CVMX_USB_RETURN(CVMX_USB_SUCCESS);
+}
+
+
+/**
+ * Cancel all outstanding requests in a pipe. This calls
+ * cvmx_usb_cancel() on the transaction at the head of the pipe
+ * until the pipe has nothing queued, stopping early if a cancel
+ * fails.
+ *
+ * @param state  USB device state populated by
+ *               cvmx_usb_initialize().
+ * @param pipe_handle
+ *               Pipe handle to cancel requests in.
+ *
+ * @return CVMX_USB_SUCCESS or a negative error code defined in
+ *         cvmx_usb_status_t. The first failing result from
+ *         cvmx_usb_cancel() is returned as is.
+ */
+cvmx_usb_status_t cvmx_usb_cancel_all(cvmx_usb_state_t *state, int pipe_handle)
+{
+    cvmx_usb_internal_state_t *usb = (cvmx_usb_internal_state_t*)state;
+    cvmx_usb_pipe_t *pipe;
+
+    CVMX_USB_LOG_CALLED();
+    CVMX_USB_LOG_PARAM("%p", state);
+    CVMX_USB_LOG_PARAM("%d", pipe_handle);
+    if (cvmx_unlikely((pipe_handle < 0) || (pipe_handle >= MAX_PIPES)))
+        CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+
+    /* Form the pipe pointer only after the handle is known to be in range;
+        pointer arithmetic with an out-of-range index is undefined */
+    pipe = usb->pipe + pipe_handle;
+
+    /* Fail if the pipe isn't open */
+    if (cvmx_unlikely((pipe->flags & __CVMX_USB_PIPE_FLAGS_OPEN) == 0))
+        CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+
+    /* Each successful cancel removes the head transaction, so the loop ends
+        when the queue is empty */
+    while (pipe->head) {
+        cvmx_usb_status_t result = cvmx_usb_cancel(state, pipe_handle,
+            __cvmx_usb_get_submit_handle(usb, pipe->head));
+        if (cvmx_unlikely(result != CVMX_USB_SUCCESS))
+            CVMX_USB_RETURN(result);
+    }
+    CVMX_USB_RETURN(CVMX_USB_SUCCESS);
+}
+
+
+/**
+ * Close a pipe created with cvmx_usb_open_pipe(). The pipe's
+ * flags are cleared and it is moved from the idle list to the
+ * free list.
+ *
+ * @param state  USB device state populated by
+ *               cvmx_usb_initialize().
+ * @param pipe_handle
+ *               Pipe handle to close.
+ *
+ * @return CVMX_USB_SUCCESS or a negative error code defined in
+ *         cvmx_usb_status_t. CVMX_USB_INVALID_PARAM is returned
+ *         for an out-of-range handle or a pipe that is not open.
+ *         CVMX_USB_BUSY is returned if the pipe has outstanding
+ *         transfers.
+ */
+cvmx_usb_status_t cvmx_usb_close_pipe(cvmx_usb_state_t *state, int pipe_handle)
+{
+    cvmx_usb_internal_state_t *usb = (cvmx_usb_internal_state_t*)state;
+    cvmx_usb_pipe_t *pipe;
+
+    CVMX_USB_LOG_CALLED();
+    CVMX_USB_LOG_PARAM("%p", state);
+    CVMX_USB_LOG_PARAM("%d", pipe_handle);
+    if (cvmx_unlikely((pipe_handle < 0) || (pipe_handle >= MAX_PIPES)))
+        CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+
+    /* Form the pipe pointer only after the handle is known to be in range;
+        pointer arithmetic with an out-of-range index is undefined */
+    pipe = usb->pipe + pipe_handle;
+
+    /* Fail if the pipe isn't open */
+    if (cvmx_unlikely((pipe->flags & __CVMX_USB_PIPE_FLAGS_OPEN) == 0))
+        CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+
+    /* Fail if the pipe has pending transactions */
+    if (cvmx_unlikely(pipe->head))
+        CVMX_USB_RETURN(CVMX_USB_BUSY);
+
+    /* Clearing the flags also drops the OPEN bit */
+    pipe->flags = 0;
+    __cvmx_usb_remove_pipe(&usb->idle_pipes, pipe);
+    __cvmx_usb_append_pipe(&usb->free_pipes, pipe);
+
+    CVMX_USB_RETURN(CVMX_USB_SUCCESS);
+}
+
+
+/**
+ * Install the function to be called for one kind of USB event.
+ * The function and its user data replace whatever was stored for
+ * that event before.
+ *
+ * @param state     USB device state populated by
+ *                  cvmx_usb_initialize().
+ * @param reason    Which event to register for. Must be below
+ *                  __CVMX_USB_CALLBACK_END.
+ * @param callback  Function to call when the event occurs. Must
+ *                  not be NULL.
+ * @param user_data User data parameter to the function.
+ *
+ * @return CVMX_USB_SUCCESS or a negative error code defined in
+ *         cvmx_usb_status_t.
+ */
+cvmx_usb_status_t cvmx_usb_register_callback(cvmx_usb_state_t *state,
+                                             cvmx_usb_callback_t reason,
+                                             cvmx_usb_callback_func_t callback,
+                                             void *user_data)
+{
+    cvmx_usb_internal_state_t *usb = (cvmx_usb_internal_state_t*)state;
+
+    CVMX_USB_LOG_CALLED();
+    CVMX_USB_LOG_PARAM("%p", state);
+    CVMX_USB_LOG_PARAM("%d", reason);
+    CVMX_USB_LOG_PARAM("%p", callback);
+    CVMX_USB_LOG_PARAM("%p", user_data);
+
+    /* Reject a reason outside the callback tables and a missing function */
+    if (cvmx_unlikely((reason >= __CVMX_USB_CALLBACK_END) || (callback == NULL)))
+        CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+
+    /* The function and its data live in parallel tables indexed by reason */
+    usb->callback_data[reason] = user_data;
+    usb->callback[reason] = callback;
+
+    CVMX_USB_RETURN(CVMX_USB_SUCCESS);
+}
+
+
+/**
+ * Get the current USB protocol level frame number. The frame
+ * number is always in the range of 0-0x7ff.
+ *
+ * The value is read directly from the HFNUM register on every
+ * call; the software frame counter kept in the internal state is
+ * not consulted or updated here.
+ *
+ * NOTE(review): cvmx_usb_poll() masks the same frnum field with
+ * 0x3fff, which suggests the field can exceed 0x7ff - confirm the
+ * range documented above.
+ *
+ * @param state  USB device state populated by
+ *               cvmx_usb_initialize().
+ *
+ * @return USB frame number
+ */
+int cvmx_usb_get_frame_number(cvmx_usb_state_t *state)
+{
+    int frame_number;
+    cvmx_usb_internal_state_t *usb = (cvmx_usb_internal_state_t*)state;
+    cvmx_usbcx_hfnum_t usbc_hfnum;
+
+    CVMX_USB_LOG_CALLED();
+    CVMX_USB_LOG_PARAM("%p", state);
+
+    /* Read the raw register and pull out just the frame number field */
+    usbc_hfnum.u32 = __cvmx_usb_read_csr32(usb, CVMX_USBCX_HFNUM(usb->index));
+    frame_number = usbc_hfnum.s.frnum;
+
+    CVMX_USB_RETURN(frame_number);
+}
+
+
+/**
+ * @INTERNAL
+ * Poll a channel for status
+ *
+ * Reads the interrupt status of one host channel. If the channel is
+ * still running there is nothing to process (in non DMA mode a running
+ * channel is first asked to halt). Once the channel has stopped it is
+ * marked idle, detached from its pipe, the number of bytes moved is
+ * added to the head transaction of that pipe, and the transaction is
+ * then completed, advanced to its next stage, or left queued for a
+ * retry depending on which response bits are set.
+ *
+ * @param usb     USB device
+ * @param channel Channel to poll
+ *
+ * @return Zero on success. Every return path in this function
+ *         returns zero.
+ */
+static int __cvmx_usb_poll_channel(cvmx_usb_internal_state_t *usb, int channel)
+{
+    cvmx_usbcx_hcintx_t usbc_hcint;
+    cvmx_usbcx_hctsizx_t usbc_hctsiz;
+    cvmx_usbcx_hccharx_t usbc_hcchar;
+    cvmx_usb_pipe_t *pipe;
+    cvmx_usb_transaction_t *transaction;
+    int bytes_this_transfer;
+    int bytes_in_last_packet;
+    int packets_processed;
+    int buffer_space_left;
+    CVMX_USB_LOG_CALLED();
+    CVMX_USB_LOG_PARAM("%p", usb);
+    CVMX_USB_LOG_PARAM("%d", channel);
+
+    /* Read the interrupt status bits for the channel */
+    usbc_hcint.u32 = __cvmx_usb_read_csr32(usb, CVMX_USBCX_HCINTX(channel, usb->index));
+
+    if (usb->init_flags & CVMX_USB_INITIALIZE_FLAGS_NO_DMA) {
+        usbc_hcchar.u32 = __cvmx_usb_read_csr32(usb, CVMX_USBCX_HCCHARX(channel, usb->index));
+
+        if (usbc_hcchar.s.chena && usbc_hcchar.s.chdis) {
+            /* There seems to be a bug in CN31XX which can cause interrupt
+                IN transfers to get stuck until we do a write of HCCHARX
+                without changing things */
+            __cvmx_usb_write_csr32(usb, CVMX_USBCX_HCCHARX(channel, usb->index), usbc_hcchar.u32);
+            CVMX_USB_RETURN(0);
+        }
+
+        /* In non DMA mode the channels don't halt themselves. We need to
+            manually disable channels that are left running */
+        if (!usbc_hcint.s.chhltd) {
+            if (usbc_hcchar.s.chena) {
+                cvmx_usbcx_hcintmskx_t hcintmsk;
+                /* Disable all interrupts except CHHLTD */
+                hcintmsk.u32 = 0;
+                hcintmsk.s.chhltdmsk = 1;
+                __cvmx_usb_write_csr32(usb, CVMX_USBCX_HCINTMSKX(channel, usb->index), hcintmsk.u32);
+                /* Request the halt and return. The channel is processed on
+                    a later poll once the halt has been reported */
+                usbc_hcchar.s.chdis = 1;
+                __cvmx_usb_write_csr32(usb, CVMX_USBCX_HCCHARX(channel, usb->index), usbc_hcchar.u32);
+                CVMX_USB_RETURN(0);
+            }
+            else if (usbc_hcint.s.xfercompl) {
+                /* Successful IN/OUT with transfer complete. Channel halt isn't needed */
+            }
+            else {
+                cvmx_dprintf("USB%d: Channel %d interrupt without halt\n", usb->index, channel);
+                CVMX_USB_RETURN(0);
+            }
+        }
+    }
+    else {
+        /* There are no interrupts that we need to process when the channel is
+            still running */
+        if (!usbc_hcint.s.chhltd)
+            CVMX_USB_RETURN(0);
+    }
+
+    /* Disable the channel interrupts now that it is done */
+    __cvmx_usb_write_csr32(usb, CVMX_USBCX_HCINTMSKX(channel, usb->index), 0);
+    usb->idle_hardware_channels |= (1<<channel);
+
+    /* Make sure this channel is tied to a valid pipe */
+    pipe = usb->pipe_for_channel[channel];
+    CVMX_PREFETCH(pipe, 0);
+    CVMX_PREFETCH(pipe, 128);
+    if (!pipe)
+        CVMX_USB_RETURN(0);
+    /* NOTE(review): the head transaction is dereferenced below without a
+        NULL check. This presumably relies on a pipe that is attached to a
+        channel always having a queued transaction - confirm */
+    transaction = pipe->head;
+    CVMX_PREFETCH0(transaction);
+
+    /* Disconnect this pipe from the HW channel. Later the schedule function will
+        figure out which pipe needs to go */
+    usb->pipe_for_channel[channel] = NULL;
+    pipe->flags &= ~__CVMX_USB_PIPE_FLAGS_SCHEDULED;
+
+    /* Read the channel config info so we can figure out how much data
+        transferred */
+    usbc_hcchar.u32 = __cvmx_usb_read_csr32(usb, CVMX_USBCX_HCCHARX(channel, usb->index));
+    usbc_hctsiz.u32 = __cvmx_usb_read_csr32(usb, CVMX_USBCX_HCTSIZX(channel, usb->index));
+
+    /* Calculating the number of bytes successfully transferred is dependent on
+        the transfer direction */
+    packets_processed = transaction->pktcnt - usbc_hctsiz.s.pktcnt;
+    if (usbc_hcchar.s.epdir) {
+        /* IN transactions are easy. For every byte received the hardware
+            decrements xfersize. All we need to do is subtract the current
+            value of xfersize from its starting value and we know how many
+            bytes were written to the buffer */
+        bytes_this_transfer = transaction->xfersize - usbc_hctsiz.s.xfersize;
+    }
+    else {
+        /* OUT transactions don't decrement xfersize. Instead pktcnt is
+            decremented on every successful packet send. The hardware does
+            this when it receives an ACK, or NYET. If it doesn't
+            receive one of these responses pktcnt doesn't change */
+        bytes_this_transfer = packets_processed * usbc_hcchar.s.mps;
+        /* The last packet may not be a full transfer if we didn't have
+            enough data */
+        if (bytes_this_transfer > transaction->xfersize)
+            bytes_this_transfer = transaction->xfersize;
+    }
+    /* Figure out how many bytes were in the last packet of the transfer */
+    if (packets_processed)
+        bytes_in_last_packet = bytes_this_transfer - (packets_processed-1) * usbc_hcchar.s.mps;
+    else
+        bytes_in_last_packet = bytes_this_transfer;
+
+    /* As a special case, setup transactions output the setup header, not
+        the user's data. For this reason we don't count setup data as bytes
+        transferred */
+    if ((transaction->stage == CVMX_USB_STAGE_SETUP) ||
+        (transaction->stage == CVMX_USB_STAGE_SETUP_SPLIT_COMPLETE))
+        bytes_this_transfer = 0;
+
+    /* Optional debug output */
+    if (cvmx_unlikely((usb->init_flags & CVMX_USB_INITIALIZE_FLAGS_DEBUG_TRANSFERS) ||
+        (pipe->flags & CVMX_USB_PIPE_FLAGS_DEBUG_TRANSFERS)))
+        cvmx_dprintf("%s: Channel %d halted. Pipe %d transaction %d stage %d bytes=%d\n",
+                     __FUNCTION__, channel,
+                     __cvmx_usb_get_pipe_handle(usb, pipe),
+                     __cvmx_usb_get_submit_handle(usb, transaction),
+                     transaction->stage, bytes_this_transfer);
+
+    /* Add the bytes transferred to the running total. It is important that
+        bytes_this_transfer doesn't count any data that needs to be
+        retransmitted */
+    transaction->actual_bytes += bytes_this_transfer;
+    /* Isochronous transfers measure the remaining space against the length
+        of the first ISO packet, everything else against the whole buffer */
+    if (transaction->type == CVMX_USB_TRANSFER_ISOCHRONOUS)
+        buffer_space_left = transaction->iso_packets[0].length - transaction->actual_bytes;
+    else
+        buffer_space_left = transaction->buffer_length - transaction->actual_bytes;
+
+    /* We need to remember the PID toggle state for the next transaction. The
+        hardware already updated it for the next transaction */
+    pipe->pid_toggle = !(usbc_hctsiz.s.pid == 0);
+
+    /* For high speed bulk out, assume the next transaction will need to do a
+        ping before proceeding. If this isn't true the ACK processing below
+        will clear this flag */
+    if ((pipe->device_speed == CVMX_USB_SPEED_HIGH) &&
+        (pipe->transfer_type == CVMX_USB_TRANSFER_BULK) &&
+        (pipe->transfer_dir == CVMX_USB_DIRECTION_OUT))
+        pipe->flags |= __CVMX_USB_PIPE_FLAGS_NEED_PING;
+
+    /* The response bits are examined in priority order. Only the first
+        matching branch below is taken */
+    if (usbc_hcint.s.stall) {
+        /* STALL as a response means this transaction cannot be completed
+            because the device can't process transactions. Tell the user. Any
+            data that was transferred will be counted on the actual bytes
+            transferred */
+        pipe->pid_toggle = 0;
+        __cvmx_usb_perform_complete(usb, pipe, transaction, CVMX_USB_COMPLETE_STALL);
+    }
+    else if (usbc_hcint.s.xacterr) {
+        /* We know at least one packet worked if we get a ACK or NAK. Reset the retry counter */
+        if (usbc_hcint.s.nak || usbc_hcint.s.ack)
+            transaction->retries = 0;
+        transaction->retries++;
+        if (transaction->retries > MAX_RETRIES) {
+            /* XactErr as a response means the device signaled something wrong with
+                the transfer. For example, PID toggle errors cause these */
+            __cvmx_usb_perform_complete(usb, pipe, transaction, CVMX_USB_COMPLETE_XACTERR);
+        }
+        else {
+            /* If this was a split then clear our split in progress marker */
+            if (usb->active_split == transaction)
+                usb->active_split = NULL;
+            /* Rewind to the beginning of the transaction by anding off the
+                split complete bit */
+            transaction->stage &= ~1;
+            pipe->split_sc_frame = -1;
+            /* Retry in the next interval. If that frame is already in the
+                past, move forward to a later frame that keeps the pipe on
+                its original interval spacing */
+            pipe->next_tx_frame += pipe->interval;
+            if (pipe->next_tx_frame < usb->frame_number)
+                pipe->next_tx_frame = usb->frame_number + pipe->interval -
+                    (usb->frame_number - pipe->next_tx_frame) % pipe->interval;
+        }
+    }
+    else if (usbc_hcint.s.bblerr)
+    {
+        /* Babble Error (BblErr) */
+        __cvmx_usb_perform_complete(usb, pipe, transaction, CVMX_USB_COMPLETE_BABBLEERR);
+    }
+    else if (usbc_hcint.s.datatglerr)
+    {
+        /* We'll retry the exact same transaction again */
+        transaction->retries++;
+    }
+    else if (usbc_hcint.s.nyet) {
+        /* NYET as a response is only allowed in three cases: as a response to
+            a ping, as a response to a split transaction, and as a response to
+            a bulk out. The ping case is handled by hardware, so we only have
+            splits and bulk out */
+        if (!__cvmx_usb_pipe_needs_split(usb, pipe)) {
+            transaction->retries = 0;
+            /* If there is more data to go then we need to try again. Otherwise
+                this transaction is complete */
+            if ((buffer_space_left == 0) || (bytes_in_last_packet < pipe->max_packet))
+                __cvmx_usb_perform_complete(usb, pipe, transaction, CVMX_USB_COMPLETE_SUCCESS);
+        }
+        else {
+            /* Split transactions retry the split complete 4 times then rewind
+                to the start split and do the entire transactions again */
+            transaction->retries++;
+            if ((transaction->retries & 0x3) == 0) {
+                /* Rewind to the beginning of the transaction by anding off the
+                    split complete bit */
+                transaction->stage &= ~1;
+                pipe->split_sc_frame = -1;
+            }
+        }
+    }
+    else if (usbc_hcint.s.ack) {
+        transaction->retries = 0;
+        /* The ACK bit can only be checked after the other error bits. This is
+            because a multi packet transfer may succeed in a number of packets
+            and then get a different response on the last packet. In this case
+            both ACK and the last response bit will be set. If none of the
+            other response bits is set, then the last packet must have been an
+            ACK */
+
+        /* Since we got an ACK, we know we don't need to do a ping on this
+            pipe */
+        pipe->flags &= ~__CVMX_USB_PIPE_FLAGS_NEED_PING;
+
+        switch (transaction->type)
+        {
+            case CVMX_USB_TRANSFER_CONTROL:
+                /* Control transfers walk through the setup, data and status
+                    stages, each with an extra split complete stage when the
+                    pipe needs split transactions */
+                switch (transaction->stage)
+                {
+                    case CVMX_USB_STAGE_NON_CONTROL:
+                    case CVMX_USB_STAGE_NON_CONTROL_SPLIT_COMPLETE:
+                        /* This should be impossible */
+                        __cvmx_usb_perform_complete(usb, pipe, transaction, CVMX_USB_COMPLETE_ERROR);
+                        break;
+                    case CVMX_USB_STAGE_SETUP:
+                        pipe->pid_toggle = 1;
+                        if (__cvmx_usb_pipe_needs_split(usb, pipe))
+                            transaction->stage = CVMX_USB_STAGE_SETUP_SPLIT_COMPLETE;
+                        else {
+                            /* Skip the data stage when the control header
+                                asks for zero bytes */
+                            cvmx_usb_control_header_t *header = cvmx_phys_to_ptr(transaction->control_header);
+                            if (header->s.length)
+                                transaction->stage = CVMX_USB_STAGE_DATA;
+                            else
+                                transaction->stage = CVMX_USB_STAGE_STATUS;
+                        }
+                        break;
+                    case CVMX_USB_STAGE_SETUP_SPLIT_COMPLETE:
+                        {
+                            cvmx_usb_control_header_t *header = cvmx_phys_to_ptr(transaction->control_header);
+                            if (header->s.length)
+                                transaction->stage = CVMX_USB_STAGE_DATA;
+                            else
+                                transaction->stage = CVMX_USB_STAGE_STATUS;
+                        }
+                        break;
+                    case CVMX_USB_STAGE_DATA:
+                        if (__cvmx_usb_pipe_needs_split(usb, pipe)) {
+                            transaction->stage = CVMX_USB_STAGE_DATA_SPLIT_COMPLETE;
+                            /* For setup OUT data that are splits, the hardware
+                                doesn't appear to count transferred data. Here
+                                we manually update the data transferred */
+                            if (!usbc_hcchar.s.epdir) {
+                                if (buffer_space_left < pipe->max_packet)
+                                    transaction->actual_bytes += buffer_space_left;
+                                else
+                                    transaction->actual_bytes += pipe->max_packet;
+                            }
+                        }
+                        else if ((buffer_space_left == 0) || (bytes_in_last_packet < pipe->max_packet)) {
+                            pipe->pid_toggle = 1;
+                            transaction->stage = CVMX_USB_STAGE_STATUS;
+                        }
+                        break;
+                    case CVMX_USB_STAGE_DATA_SPLIT_COMPLETE:
+                        if ((buffer_space_left == 0) || (bytes_in_last_packet < pipe->max_packet)) {
+                            pipe->pid_toggle = 1;
+                            transaction->stage = CVMX_USB_STAGE_STATUS;
+                        }
+                        else {
+                            transaction->stage = CVMX_USB_STAGE_DATA;
+                        }
+                        break;
+                    case CVMX_USB_STAGE_STATUS:
+                        if (__cvmx_usb_pipe_needs_split(usb, pipe))
+                            transaction->stage = CVMX_USB_STAGE_STATUS_SPLIT_COMPLETE;
+                        else
+                            __cvmx_usb_perform_complete(usb, pipe, transaction, CVMX_USB_COMPLETE_SUCCESS);
+                        break;
+                    case CVMX_USB_STAGE_STATUS_SPLIT_COMPLETE:
+                        __cvmx_usb_perform_complete(usb, pipe, transaction, CVMX_USB_COMPLETE_SUCCESS);
+                        break;
+                }
+                break;
+            case CVMX_USB_TRANSFER_BULK:
+            case CVMX_USB_TRANSFER_INTERRUPT:
+                /* The only time a bulk transfer isn't complete when
+                    it finishes with an ACK is during a split transaction. For
+                    splits we need to continue the transfer if more data is
+                    needed */
+                if (__cvmx_usb_pipe_needs_split(usb, pipe)) {
+                    if (transaction->stage == CVMX_USB_STAGE_NON_CONTROL)
+                        transaction->stage = CVMX_USB_STAGE_NON_CONTROL_SPLIT_COMPLETE;
+                    else {
+                        if (buffer_space_left && (bytes_in_last_packet == pipe->max_packet))
+                            transaction->stage = CVMX_USB_STAGE_NON_CONTROL;
+                        else {
+                            if (transaction->type == CVMX_USB_TRANSFER_INTERRUPT)
+                                pipe->next_tx_frame += pipe->interval;
+                            __cvmx_usb_perform_complete(usb, pipe, transaction, CVMX_USB_COMPLETE_SUCCESS);
+                        }
+                    }
+                }
+                else {
+                    /* A NAK seen alongside the ACK on a high speed bulk OUT
+                        pipe puts the need ping flag back */
+                    if ((pipe->device_speed == CVMX_USB_SPEED_HIGH) &&
+                        (pipe->transfer_type == CVMX_USB_TRANSFER_BULK) &&
+                        (pipe->transfer_dir == CVMX_USB_DIRECTION_OUT) &&
+                        (usbc_hcint.s.nak))
+                        pipe->flags |= __CVMX_USB_PIPE_FLAGS_NEED_PING;
+                    if (!buffer_space_left || (bytes_in_last_packet < pipe->max_packet)) {
+                        if (transaction->type == CVMX_USB_TRANSFER_INTERRUPT)
+                            pipe->next_tx_frame += pipe->interval;
+                        __cvmx_usb_perform_complete(usb, pipe, transaction, CVMX_USB_COMPLETE_SUCCESS);
+                    }
+                }
+                break;
+            case CVMX_USB_TRANSFER_ISOCHRONOUS:
+                if (__cvmx_usb_pipe_needs_split(usb, pipe)) {
+                    /* ISOCHRONOUS OUT splits don't require a complete split stage.
+                        Instead they use a sequence of begin OUT splits to transfer
+                        the data 188 bytes at a time. Once the transfer is complete,
+                        the pipe sleeps until the next schedule interval */
+                    if (pipe->transfer_dir == CVMX_USB_DIRECTION_OUT) {
+                        /* If no space left or this wasn't a max size packet then
+                            this transfer is complete. Otherwise start it again
+                            to send the next 188 bytes */
+                        if (!buffer_space_left || (bytes_this_transfer < 188)) {
+                            pipe->next_tx_frame += pipe->interval;
+                            __cvmx_usb_perform_complete(usb, pipe, transaction, CVMX_USB_COMPLETE_SUCCESS);
+                        }
+                    }
+                    else {
+                        if (transaction->stage == CVMX_USB_STAGE_NON_CONTROL_SPLIT_COMPLETE) {
+                            /* We are in the incoming data phase. Keep getting
+                                data until we run out of space or get a small
+                                packet */
+                            if ((buffer_space_left == 0) || (bytes_in_last_packet < pipe->max_packet)) {
+                                pipe->next_tx_frame += pipe->interval;
+                                __cvmx_usb_perform_complete(usb, pipe, transaction, CVMX_USB_COMPLETE_SUCCESS);
+                            }
+                        }
+                        else
+                            transaction->stage = CVMX_USB_STAGE_NON_CONTROL_SPLIT_COMPLETE;
+                    }
+                }
+                else {
+                    pipe->next_tx_frame += pipe->interval;
+                    __cvmx_usb_perform_complete(usb, pipe, transaction, CVMX_USB_COMPLETE_SUCCESS);
+                }
+                break;
+        }
+    }
+    else if (usbc_hcint.s.nak) {
+        /* If this was a split then clear our split in progress marker */
+        if (usb->active_split == transaction)
+            usb->active_split = NULL;
+        /* NAK as a response means the device couldn't accept the transaction,
+            but it should be retried in the future. Rewind to the beginning of
+            the transaction by anding off the split complete bit. Retry in the
+            next interval */
+        transaction->retries = 0;
+        transaction->stage &= ~1;
+        pipe->next_tx_frame += pipe->interval;
+        if (pipe->next_tx_frame < usb->frame_number)
+            pipe->next_tx_frame = usb->frame_number + pipe->interval -
+                (usb->frame_number - pipe->next_tx_frame) % pipe->interval;
+    }
+    else {
+        /* No response bit was set at all. Use the port status to tell a
+            retryable glitch apart from a disabled port */
+        cvmx_usb_port_status_t port;
+        port = cvmx_usb_get_status((cvmx_usb_state_t *)usb);
+        if (port.port_enabled)
+        {
+            /* We'll retry the exact same transaction again */
+            transaction->retries++;
+        }
+        else
+        {
+            /* We get channel halted interrupts with no result bits set when the
+                cable is unplugged */
+            __cvmx_usb_perform_complete(usb, pipe, transaction, CVMX_USB_COMPLETE_ERROR);
+        }
+    }
+    CVMX_USB_RETURN(0);
+}
+
+
+/**
+ * Service the USB block and call all needed callback handlers.
+ * Each call refreshes the software frame counter, acknowledges
+ * the pending global interrupts, handles them and then runs the
+ * transaction scheduler. It is intended to be called from the
+ * interrupt handler of the USB controller, but may also be called
+ * periodically in a loop for non-interrupt based operation.
+ *
+ * @param state  USB device state populated by
+ *               cvmx_usb_initialize().
+ *
+ * @return CVMX_USB_SUCCESS or a negative error code defined in
+ *         cvmx_usb_status_t.
+ */
+cvmx_usb_status_t cvmx_usb_poll(cvmx_usb_state_t *state)
+{
+    cvmx_usb_internal_state_t *usb = (cvmx_usb_internal_state_t*)state;
+    cvmx_usbcx_gintsts_t pending;
+    cvmx_usbcx_hfnum_t hfnum;
+
+    /* Warm the cache with the first five 128 byte blocks of the state */
+    CVMX_PREFETCH(usb, 0);
+    CVMX_PREFETCH(usb, 1*128);
+    CVMX_PREFETCH(usb, 2*128);
+    CVMX_PREFETCH(usb, 3*128);
+    CVMX_PREFETCH(usb, 4*128);
+
+    CVMX_USB_LOG_CALLED();
+    CVMX_USB_LOG_PARAM("%p", state);
+
+    /* Extend the hardware frame number into the wider software counter. The
+        low 14 bits always mirror the hardware value; when that value has
+        gone backwards the hardware wrapped, so carry into the upper bits */
+    hfnum.u32 = __cvmx_usb_read_csr32(usb, CVMX_USBCX_HFNUM(usb->index));
+    if (hfnum.s.frnum < (usb->frame_number&0x3fff))
+        usb->frame_number += 0x4000;
+    usb->frame_number &= ~0x3fffull;
+    usb->frame_number |= hfnum.s.frnum;
+
+    /* Latch the pending interrupts, then write the same value back to
+        clear them now that we know about them */
+    pending.u32 = __cvmx_usb_read_csr32(usb, CVMX_USBCX_GINTSTS(usb->index));
+    __cvmx_usb_write_csr32(usb, CVMX_USBCX_GINTSTS(usb->index), pending.u32);
+
+    /* RxFIFO Non-Empty (RxFLvl): at least one packet is waiting to be read
+        from the RxFIFO. In DMA mode this is handled by hardware */
+    if (pending.s.rxflvl && (usb->init_flags & CVMX_USB_INITIALIZE_FLAGS_NO_DMA))
+        __cvmx_usb_poll_rx_fifo(usb);
+
+    /* Fill the Tx FIFOs when not in DMA mode */
+    if ((pending.s.ptxfemp || pending.s.nptxfemp) &&
+        (usb->init_flags & CVMX_USB_INITIALIZE_FLAGS_NO_DMA))
+        __cvmx_usb_poll_tx_fifo(usb);
+
+    /* Disconnect Detected Interrupt (DisconnInt): a device disconnect was
+        detected.
+       Host Port Interrupt (PrtInt): the port status changed. The Host Port
+        Control and Status (HPRT) register holds the exact event, and its
+        status bits must be cleared to clear this interrupt */
+    if (pending.s.disconnint || pending.s.prtint) {
+        cvmx_usbcx_hprt_t hprt;
+
+        /* Call the user's port callback */
+        __cvmx_usb_perform_callback(usb, NULL, NULL,
+                                    CVMX_USB_CALLBACK_PORT_CHANGED,
+                                    CVMX_USB_COMPLETE_SUCCESS);
+        /* Clear the port change bits by writing HPRT back with prtena
+            forced to zero */
+        hprt.u32 = __cvmx_usb_read_csr32(usb, CVMX_USBCX_HPRT(usb->index));
+        hprt.s.prtena = 0;
+        __cvmx_usb_write_csr32(usb, CVMX_USBCX_HPRT(usb->index), hprt.u32);
+    }
+
+    /* Host Channels Interrupt (HChInt): an interrupt is pending on one or
+        more channels. The Host All Channels Interrupt (HAINT) register has
+        one bit per channel with a pending interrupt; the cause is found in
+        the matching Host Channel-n Interrupt (HCINTn) register */
+    if (pending.s.hchint) {
+        cvmx_usbcx_haint_t haint;
+
+        haint.u32 = __cvmx_usb_read_csr32(usb, CVMX_USBCX_HAINT(usb->index));
+        /* Poll one channel per pass, dropping its bit afterwards, until no
+            bits remain */
+        while (haint.u32) {
+            int leading_zeros;
+            int channel;
+
+            CVMX_CLZ(leading_zeros, haint.u32);
+            channel = 31 - leading_zeros;
+            __cvmx_usb_poll_channel(usb, channel);
+            haint.u32 ^= 1<<channel;
+        }
+    }
+
+    /* Let the scheduler run, telling it whether a start of frame was seen */
+    __cvmx_usb_schedule(usb, pending.s.sof);
+
+    CVMX_USB_RETURN(CVMX_USB_SUCCESS);
+}

diff --git a/drivers/staging/octeon-usb/cvmx-usb.h b/drivers/staging/octeon-usb/cvmx-usb.h
new file mode 100644
index 0000000..db9cc05
--- /dev/null
+++ b/drivers/staging/octeon-usb/cvmx-usb.h

@@ -0,0 +1,1085 @@
+/***********************license start***************
+ * Copyright (c) 2003-2010  Cavium Networks (support@cavium.com). All rights
+ * reserved.
+ *
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *   * Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above
+ *     copyright notice, this list of conditions and the following
+ *     disclaimer in the documentation and/or other materials provided
+ *     with the distribution.
+
+ *   * Neither the name of Cavium Networks nor the names of
+ *     its contributors may be used to endorse or promote products
+ *     derived from this software without specific prior written
+ *     permission.
+
+ * This Software, including technical data, may be subject to U.S. export  control
+ * laws, including the U.S. Export Administration Act and its  associated
+ * regulations, and may be subject to export or import  regulations in other
+ * countries.
+
+ * TO THE MAXIMUM EXTENT PERMITTED BY LAW, THE SOFTWARE IS PROVIDED "AS IS"
+ * AND WITH ALL FAULTS AND CAVIUM  NETWORKS MAKES NO PROMISES, REPRESENTATIONS OR
+ * WARRANTIES, EITHER EXPRESS, IMPLIED, STATUTORY, OR OTHERWISE, WITH RESPECT TO
+ * THE SOFTWARE, INCLUDING ITS CONDITION, ITS CONFORMITY TO ANY REPRESENTATION OR
+ * DESCRIPTION, OR THE EXISTENCE OF ANY LATENT OR PATENT DEFECTS, AND CAVIUM
+ * SPECIFICALLY DISCLAIMS ALL IMPLIED (IF ANY) WARRANTIES OF TITLE,
+ * MERCHANTABILITY, NONINFRINGEMENT, FITNESS FOR A PARTICULAR PURPOSE, LACK OF
+ * VIRUSES, ACCURACY OR COMPLETENESS, QUIET ENJOYMENT, QUIET POSSESSION OR
+ * CORRESPONDENCE TO DESCRIPTION. THE ENTIRE  RISK ARISING OUT OF USE OR
+ * PERFORMANCE OF THE SOFTWARE LIES WITH YOU.
+ ***********************license end**************************************/
+
+
+/**
+ * @file
+ *
+ * "cvmx-usb.h" defines a set of low level USB functions to help
+ * developers create Octeon USB drivers for various operating
+ * systems. These functions provide a generic API to the Octeon
+ * USB blocks, hiding the internal hardware specific
+ * operations.
+ *
+ * At a high level the device driver needs to:
+ *
+ * -# Call cvmx_usb_get_num_ports() to get the number of
+ *  supported ports.
+ * -# Call cvmx_usb_initialize() for each Octeon USB port.
+ * -# Enable the port using cvmx_usb_enable().
+ * -# Either periodically, or in an interrupt handler, call
+ *  cvmx_usb_poll() to service USB events.
+ * -# Manage pipes using cvmx_usb_open_pipe() and
+ *  cvmx_usb_close_pipe().
+ * -# Manage transfers using cvmx_usb_submit_*() and
+ *  cvmx_usb_cancel*().
+ * -# Shutdown USB on unload using cvmx_usb_shutdown().
+ *
+ * To monitor USB status changes, the device driver must use
+ * cvmx_usb_register_callback() to register for events that it
+ * is interested in. Below are a few hints on successfully
+ * implementing a driver on top of this API.
+ *
+ * <h2>Initialization</h2>
+ *
+ * When a driver is first loaded, it is normally not necessary
+ * to bring up the USB port completely. Most operating systems
+ * expect to initialize and enable the port in two independent
+ * steps. Normally an operating system will probe hardware,
+ * initialize anything found, and then enable the hardware.
+ *
+ * In the probe phase you should:
+ * -# Use cvmx_usb_get_num_ports() to determine the number of
+ *  USB ports to be supported.
+ * -# Allocate space for a cvmx_usb_state_t structure for each
+ *  port.
+ * -# Tell the operating system about each port
+ *
+ * In the initialization phase you should:
+ * -# Use cvmx_usb_initialize() on each port.
+ * -# Do not call cvmx_usb_enable(). This leaves the USB port in
+ *  the disabled state until the operating system is ready.
+ *
+ * Finally, in the enable phase you should:
+ * -# Call cvmx_usb_enable() on the appropriate port.
+ * -# Note that some operating systems use a RESET instead of an
+ *  enable call. To implement RESET, you should call
+ *  cvmx_usb_disable() followed by cvmx_usb_enable().
+ *
+ * <h2>Locking</h2>
+ *
+ * All of the functions in the cvmx-usb API assume exclusive
+ * access to the USB hardware and internal data structures. This
+ * means that the driver must provide locking as necessary.
+ *
+ * In the single CPU state it is normally enough to disable
+ * interrupts before every call to cvmx_usb*() and enable them
+ * again after the call is complete. Keep in mind that it is
+ * very common for the callback handlers to make additional
+ * calls into cvmx-usb, so the disable/enable must be protected
+ * against recursion. As an example, the Linux kernel
+ * local_irq_save() and local_irq_restore() are perfect for this
+ * in the non SMP case.
+ *
+ * In the SMP case, locking is more complicated. For SMP you not
+ * only need to disable interrupts on the local core, but also
+ * take a lock to make sure that another core cannot call
+ * cvmx-usb.
+ *
+ * <h2>Port callback</h2>
+ *
+ * The port callback prototype needs to look as follows:
+ *
+ * void port_callback(cvmx_usb_state_t *usb,
+ *                    cvmx_usb_callback_t reason,
+ *                    cvmx_usb_complete_t status,
+ *                    int pipe_handle,
+ *                    int submit_handle,
+ *                    int bytes_transferred,
+ *                    void *user_data);
+ * - @b usb is the cvmx_usb_state_t for the port.
+ * - @b reason will always be
+ *   CVMX_USB_CALLBACK_PORT_CHANGED.
+ * - @b status will always be CVMX_USB_COMPLETE_SUCCESS.
+ * - @b pipe_handle will always be -1.
+ * - @b submit_handle will always be -1.
+ * - @b bytes_transferred will always be 0.
+ * - @b user_data is the void pointer originally passed along
+ *   with the callback. Use this for any state information you
+ *   need.
+ *
+ * The port callback will be called whenever the user plugs /
+ * unplugs a device from the port. It will not be called when a
+ * device is plugged / unplugged from a hub connected to the
+ * root port. Normally all the callback needs to do is tell the
+ * operating system to poll the root hub for status. Under
+ * Linux, this is performed by calling usb_hcd_poll_rh_status().
+ * In the Linux driver we use @b user_data to pass around the
+ * Linux "hcd" structure. Once the port callback completes,
+ * Linux automatically calls octeon_usb_hub_status_data() which
+ * uses cvmx_usb_get_status() to determine the root port status.
+ *
+ * <h2>Complete callback</h2>
+ *
+ * The completion callback prototype needs to look as follows:
+ *
+ * void complete_callback(cvmx_usb_state_t *usb,
+ *                        cvmx_usb_callback_t reason,
+ *                        cvmx_usb_complete_t status,
+ *                        int pipe_handle,
+ *                        int submit_handle,
+ *                        int bytes_transferred,
+ *                        void *user_data);
+ * - @b usb is the cvmx_usb_state_t for the port.
+ * - @b reason will always be
+ *   CVMX_USB_CALLBACK_TRANSFER_COMPLETE.
+ * - @b status will be one of the cvmx_usb_complete_t
+ *   enumerations.
+ * - @b pipe_handle is the handle to the pipe the transaction
+ *   was originally submitted on.
+ * - @b submit_handle is the handle returned by the original
+ *   cvmx_usb_submit_* call.
+ * - @b bytes_transferred is the number of bytes successfully
+ *   transferred in the transaction. This will be zero on most
+ *   error conditions.
+ * - @b user_data is the void pointer originally passed along
+ *   with the callback. Use this for any state information you
+ *   need. For example, the Linux "urb" is stored in here in the
+ *   Linux driver.
+ *
+ * In general your callback handler should use @b status and @b
+ * bytes_transferred to tell the operating system how the
+ * transaction completed. Normally the pipe is not changed in
+ * this callback.
+ *
+ * <h2>Canceling transactions</h2>
+ *
+ * When a transaction is cancelled using cvmx_usb_cancel*(), the
+ * actual length of time until the complete callback is called
+ * can vary greatly. It may be called before cvmx_usb_cancel*()
+ * returns, or it may be called a number of usb frames in the
+ * future once the hardware frees the transaction. In either of
+ * these cases, the complete handler will receive
+ * CVMX_USB_COMPLETE_CANCEL.
+ *
+ * <h2>Handling pipes</h2>
+ *
+ * USB "pipes" are a software construct created by this API to
+ * enable the ordering of usb transactions to a device endpoint.
+ * Octeon's underlying hardware doesn't have any concept
+ * equivalent to "pipes". The hardware instead has eight
+ * channels that can be used simultaneously to have up to eight
+ * transactions in process at the same time. In order to maintain
+ * ordering in a pipe, the transactions for a pipe will only be
+ * active in one hardware channel at a time. From an API user's
+ * perspective, this doesn't matter but it can be helpful to
+ * keep this in mind when you are probing hardware while
+ * debugging.
+ *
+ * Also keep in mind that usb transactions contain state
+ * information about the previous transaction to the same
+ * endpoint. Each transaction has a PID toggle that changes 0/1
+ * between each sub packet. This is maintained in the pipe data
+ * structures. For this reason, you generally cannot create and
+ * destroy a pipe for every transaction. A sequence of
+ * transactions to the same endpoint must use the same pipe.
+ *
+ * <h2>Root Hub</h2>
+ *
+ * Some operating systems view the usb root port as a normal usb
+ * hub. These systems attempt to control the root hub with
+ * messages similar to the usb 2.0 spec for hub control and
+ * status. For these systems it may be necessary to write
+ * a function to decode standard usb control messages into
+ * equivalent cvmx-usb API calls. As an example, the following
+ * code is used under Linux for some of the basic hub control
+ * messages.
+ *
+ * @code
+ * static int octeon_usb_hub_control(struct usb_hcd *hcd, u16 typeReq, u16 wValue, u16 wIndex, char *buf, u16 wLength)
+ * {
+ *     cvmx_usb_state_t *usb = (cvmx_usb_state_t *)hcd->hcd_priv;
+ *     cvmx_usb_port_status_t usb_port_status;
+ *     int port_status;
+ *     struct usb_hub_descriptor *desc;
+ *     unsigned long flags;
+ *
+ *     switch (typeReq)
+ *     {
+ *         case ClearHubFeature:
+ *             DEBUG_ROOT_HUB("OcteonUSB: ClearHubFeature\n");
+ *             switch (wValue)
+ *             {
+ *                 case C_HUB_LOCAL_POWER:
+ *                 case C_HUB_OVER_CURRENT:
+ *                     // Nothing required here
+ *                     break;
+ *                 default:
+ *                     return -EINVAL;
+ *             }
+ *             break;
+ *         case ClearPortFeature:
+ *             DEBUG_ROOT_HUB("OcteonUSB: ClearPortFeature");
+ *             if (wIndex != 1)
+ *             {
+ *                 DEBUG_ROOT_HUB(" INVALID\n");
+ *                 return -EINVAL;
+ *             }
+ *
+ *             switch (wValue)
+ *             {
+ *                 case USB_PORT_FEAT_ENABLE:
+ *                     DEBUG_ROOT_HUB(" ENABLE");
+ *                     local_irq_save(flags);
+ *                     cvmx_usb_disable(usb);
+ *                     local_irq_restore(flags);
+ *                     break;
+ *                 case USB_PORT_FEAT_SUSPEND:
+ *                     DEBUG_ROOT_HUB(" SUSPEND");
+ *                     // Not supported on Octeon
+ *                     break;
+ *                 case USB_PORT_FEAT_POWER:
+ *                     DEBUG_ROOT_HUB(" POWER");
+ *                     // Not supported on Octeon
+ *                     break;
+ *                 case USB_PORT_FEAT_INDICATOR:
+ *                     DEBUG_ROOT_HUB(" INDICATOR");
+ *                     // Port indicator not supported
+ *                     break;
+ *                 case USB_PORT_FEAT_C_CONNECTION:
+ *                     DEBUG_ROOT_HUB(" C_CONNECTION");
+ *                     // Clears the driver's internal connect status change flag
+ *                     cvmx_usb_set_status(usb, cvmx_usb_get_status(usb));
+ *                     break;
+ *                 case USB_PORT_FEAT_C_RESET:
+ *                     DEBUG_ROOT_HUB(" C_RESET");
+ *                     // Clears the driver's internal Port Reset Change flag
+ *                     cvmx_usb_set_status(usb, cvmx_usb_get_status(usb));
+ *                     break;
+ *                 case USB_PORT_FEAT_C_ENABLE:
+ *                     DEBUG_ROOT_HUB(" C_ENABLE");
+ *                     // Clears the driver's internal Port Enable/Disable Change flag
+ *                     cvmx_usb_set_status(usb, cvmx_usb_get_status(usb));
+ *                     break;
+ *                 case USB_PORT_FEAT_C_SUSPEND:
+ *                     DEBUG_ROOT_HUB(" C_SUSPEND");
+ *                     // Clears the driver's internal Port Suspend Change flag,
+ *                     // which is set when resume signaling on the host port is
+ *                     // complete
+ *                     break;
+ *                 case USB_PORT_FEAT_C_OVER_CURRENT:
+ *                     DEBUG_ROOT_HUB(" C_OVER_CURRENT");
+ *                     // Clears the driver's overcurrent Change flag
+ *                     cvmx_usb_set_status(usb, cvmx_usb_get_status(usb));
+ *                     break;
+ *                 default:
+ *                     DEBUG_ROOT_HUB(" UNKNOWN\n");
+ *                     return -EINVAL;
+ *             }
+ *             DEBUG_ROOT_HUB("\n");
+ *             break;
+ *         case GetHubDescriptor:
+ *             DEBUG_ROOT_HUB("OcteonUSB: GetHubDescriptor\n");
+ *             desc = (struct usb_hub_descriptor *)buf;
+ *             desc->bDescLength = 9;
+ *             desc->bDescriptorType = 0x29;
+ *             desc->bNbrPorts = 1;
+ *             desc->wHubCharacteristics = 0x08;
+ *             desc->bPwrOn2PwrGood = 1;
+ *             desc->bHubContrCurrent = 0;
+ *             desc->bitmap[0] = 0;
+ *             desc->bitmap[1] = 0xff;
+ *             break;
+ *         case GetHubStatus:
+ *             DEBUG_ROOT_HUB("OcteonUSB: GetHubStatus\n");
+ *             *(__le32 *)buf = 0;
+ *             break;
+ *         case GetPortStatus:
+ *             DEBUG_ROOT_HUB("OcteonUSB: GetPortStatus");
+ *             if (wIndex != 1)
+ *             {
+ *                 DEBUG_ROOT_HUB(" INVALID\n");
+ *                 return -EINVAL;
+ *             }
+ *
+ *             usb_port_status = cvmx_usb_get_status(usb);
+ *             port_status = 0;
+ *
+ *             if (usb_port_status.connect_change)
+ *             {
+ *                 port_status |= (1 << USB_PORT_FEAT_C_CONNECTION);
+ *                 DEBUG_ROOT_HUB(" C_CONNECTION");
+ *             }
+ *
+ *             if (usb_port_status.port_enabled)
+ *             {
+ *                 port_status |= (1 << USB_PORT_FEAT_C_ENABLE);
+ *                 DEBUG_ROOT_HUB(" C_ENABLE");
+ *             }
+ *
+ *             if (usb_port_status.connected)
+ *             {
+ *                 port_status |= (1 << USB_PORT_FEAT_CONNECTION);
+ *                 DEBUG_ROOT_HUB(" CONNECTION");
+ *             }
+ *
+ *             if (usb_port_status.port_enabled)
+ *             {
+ *                 port_status |= (1 << USB_PORT_FEAT_ENABLE);
+ *                 DEBUG_ROOT_HUB(" ENABLE");
+ *             }
+ *
+ *             if (usb_port_status.port_over_current)
+ *             {
+ *                 port_status |= (1 << USB_PORT_FEAT_OVER_CURRENT);
+ *                 DEBUG_ROOT_HUB(" OVER_CURRENT");
+ *             }
+ *
+ *             if (usb_port_status.port_powered)
+ *             {
+ *                 port_status |= (1 << USB_PORT_FEAT_POWER);
+ *                 DEBUG_ROOT_HUB(" POWER");
+ *             }
+ *
+ *             if (usb_port_status.port_speed == CVMX_USB_SPEED_HIGH)
+ *             {
+ *                 port_status |= (1 << USB_PORT_FEAT_HIGHSPEED);
+ *                 DEBUG_ROOT_HUB(" HIGHSPEED");
+ *             }
+ *             else if (usb_port_status.port_speed == CVMX_USB_SPEED_LOW)
+ *             {
+ *                 port_status |= (1 << USB_PORT_FEAT_LOWSPEED);
+ *                 DEBUG_ROOT_HUB(" LOWSPEED");
+ *             }
+ *
+ *             *((__le32 *)buf) = cpu_to_le32(port_status);
+ *             DEBUG_ROOT_HUB("\n");
+ *             break;
+ *         case SetHubFeature:
+ *             DEBUG_ROOT_HUB("OcteonUSB: SetHubFeature\n");
+ *             // No HUB features supported
+ *             break;
+ *         case SetPortFeature:
+ *             DEBUG_ROOT_HUB("OcteonUSB: SetPortFeature");
+ *             if (wIndex != 1)
+ *             {
+ *                 DEBUG_ROOT_HUB(" INVALID\n");
+ *                 return -EINVAL;
+ *             }
+ *
+ *             switch (wValue)
+ *             {
+ *                 case USB_PORT_FEAT_SUSPEND:
+ *                     DEBUG_ROOT_HUB(" SUSPEND\n");
+ *                     return -EINVAL;
+ *                 case USB_PORT_FEAT_POWER:
+ *                     DEBUG_ROOT_HUB(" POWER\n");
+ *                     return -EINVAL;
+ *                 case USB_PORT_FEAT_RESET:
+ *                     DEBUG_ROOT_HUB(" RESET\n");
+ *                     local_irq_save(flags);
+ *                     cvmx_usb_disable(usb);
+ *                     if (cvmx_usb_enable(usb))
+ *                         DEBUG_ERROR("Failed to enable the port\n");
+ *                     local_irq_restore(flags);
+ *                     return 0;
+ *                 case USB_PORT_FEAT_INDICATOR:
+ *                     DEBUG_ROOT_HUB(" INDICATOR\n");
+ *                     // Not supported
+ *                     break;
+ *                 default:
+ *                     DEBUG_ROOT_HUB(" UNKNOWN\n");
+ *                     return -EINVAL;
+ *             }
+ *             break;
+ *         default:
+ *             DEBUG_ROOT_HUB("OcteonUSB: Unknown root hub request\n");
+ *             return -EINVAL;
+ *     }
+ *     return 0;
+ * }
+ * @endcode
+ *
+ * <h2>Interrupts</h2>
+ *
+ * If you plan on using usb interrupts, cvmx_usb_poll() must be
+ * called on every usb interrupt. It will read the usb state,
+ * call any needed callbacks, and schedule transactions as
+ * needed. Your device driver needs only to hookup an interrupt
+ * handler and call cvmx_usb_poll(). Octeon's usb port 0 causes
+ * CIU bit CIU_INT*_SUM0[USB] to be set (bit 56). For port 1,
+ * CIU bit CIU_INT_SUM1[USB1] is set (bit 17). How these bits
+ * are turned into interrupt numbers is operating system
+ * specific. For Linux, there are the convenient defines
+ * OCTEON_IRQ_USB0 and OCTEON_IRQ_USB1 for the IRQ numbers.
+ *
+ * If you aren't using interrupts, simply call cvmx_usb_poll()
+ * in your main processing loop.
+ *
+ * <hr>$Revision: 32636 $<hr>
+ */
+
+#ifndef __CVMX_USB_H__
+#define __CVMX_USB_H__
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/**
+ * Status codes returned by cvmx-usb API functions. Zero is success; all errors are negative.
+ */
+typedef enum
+{
+    CVMX_USB_SUCCESS = 0,           /**< There were no errors */
+    CVMX_USB_INVALID_PARAM = -1,    /**< A parameter to the function was invalid */
+    CVMX_USB_NO_MEMORY = -2,        /**< Insufficient resources were available for the request */
+    CVMX_USB_BUSY = -3,             /**< The resource is busy and cannot service the request */
+    CVMX_USB_TIMEOUT = -4,          /**< Waiting for an action timed out */
+    CVMX_USB_INCORRECT_MODE = -5,   /**< The function call doesn't work in the current USB
+                                         mode. This happens when host only functions are
+                                         called in device mode or vice versa */
+} cvmx_usb_status_t;
+
+/**
+ * Enumerations representing the possible USB device speeds
+ */
+typedef enum
+{
+    CVMX_USB_SPEED_HIGH = 0,        /**< Device is operating at 480Mbps */
+    CVMX_USB_SPEED_FULL = 1,        /**< Device is operating at 12Mbps */
+    CVMX_USB_SPEED_LOW = 2,         /**< Device is operating at 1.5Mbps */
+} cvmx_usb_speed_t;
+
+/**
+ * Enumeration representing the possible USB transfer types.
+ */
+typedef enum
+{
+    CVMX_USB_TRANSFER_CONTROL = 0,      /**< Control transfers, used for hub and status transfers */
+    CVMX_USB_TRANSFER_ISOCHRONOUS = 1,  /**< Isochronous transfers, used for low priority periodic transfers */
+    CVMX_USB_TRANSFER_BULK = 2,         /**< Bulk transfers, used for large low priority transfers */
+    CVMX_USB_TRANSFER_INTERRUPT = 3,    /**< Interrupt transfers, used for high priority periodic transfers */
+} cvmx_usb_transfer_t;
+
+/**
+ * Enumeration of the transfer directions, named from Octeon's point of view
+ */
+typedef enum
+{
+    CVMX_USB_DIRECTION_OUT,         /**< Data is transferring from Octeon to the device/host */
+    CVMX_USB_DIRECTION_IN,          /**< Data is transferring from the device/host to Octeon */
+} cvmx_usb_direction_t;
+
+/**
+ * Enumeration of all possible completion status codes passed to
+ * callback functions as their cvmx_usb_complete_t status argument.
+ */
+typedef enum
+{
+    CVMX_USB_COMPLETE_SUCCESS,      /**< The transaction / operation finished without any errors */
+    CVMX_USB_COMPLETE_SHORT,        /**< FIXME: This is currently not implemented */
+    CVMX_USB_COMPLETE_CANCEL,       /**< The transaction was canceled while in flight by a user call to cvmx_usb_cancel* */
+    CVMX_USB_COMPLETE_ERROR,        /**< The transaction aborted with an unexpected error status */
+    CVMX_USB_COMPLETE_STALL,        /**< The transaction received a USB STALL response from the device */
+    CVMX_USB_COMPLETE_XACTERR,      /**< The transaction failed with an error from the device even after a number of retries */
+    CVMX_USB_COMPLETE_DATATGLERR,   /**< The transaction failed with a data toggle error even after a number of retries */
+    CVMX_USB_COMPLETE_BABBLEERR,    /**< The transaction failed with a babble error */
+    CVMX_USB_COMPLETE_FRAMEERR,     /**< The transaction failed with a frame error even after a number of retries */
+} cvmx_usb_complete_t;
+
+/**
+ * Structure returned containing the USB port status information.
+ */
+typedef struct
+{
+    uint32_t reserved           : 25;
+    uint32_t port_enabled       : 1; /**< 1 = USB port is enabled, 0 = disabled */
+    uint32_t port_over_current  : 1; /**< 1 = Over current detected, 0 = Over current not detected. Octeon doesn't support over current detection */
+    uint32_t port_powered       : 1; /**< 1 = Port power is being supplied to the device, 0 = power is off. Octeon doesn't support turning port power off */
+    cvmx_usb_speed_t port_speed : 2; /**< Current port speed */
+    uint32_t connected          : 1; /**< 1 = A device is connected to the port, 0 = No device is connected */
+    uint32_t connect_change     : 1; /**< 1 = Device connected state changed since the last cvmx_usb_set_status() call */
+} cvmx_usb_port_status_t;
+
+/**
+ * This is the structure of a Control packet header. It is 64 bits wide and can be accessed whole (u64) or by field (s).
+ */
+typedef union
+{
+    uint64_t u64;
+    struct
+    {
+        uint64_t request_type   : 8;  /**< Bit 7 tells the direction: 1=IN, 0=OUT */
+        uint64_t request        : 8;  /**< The standard usb request to make */
+        uint64_t value          : 16; /**< Value parameter for the request in little endian format */
+        uint64_t index          : 16; /**< Index for the request in little endian format */
+        uint64_t length         : 16; /**< Length of the data associated with this request in little endian format */
+    } s;
+} cvmx_usb_control_header_t;
+
+/**
+ * Descriptor for Isochronous packets: locates one packet in the main buffer and holds its status
+ */
+typedef struct
+{
+    int offset;                     /**< This is the offset in bytes into the main buffer where this data is stored */
+    int length;                     /**< This is the length in bytes of the data */
+    cvmx_usb_complete_t status;     /**< This is the status of this individual packet transfer */
+} cvmx_usb_iso_packet_t;
+
+/**
+ * Possible callback reasons for the USB API.
+ */
+typedef enum
+{
+    CVMX_USB_CALLBACK_TRANSFER_COMPLETE,
+                                    /**< A callback of this type is called when a submitted transfer
+                                        completes. The completion callback will be called even if the
+                                        transfer fails or is canceled. The status parameter will
+                                        contain details of why the callback was called. */
+    CVMX_USB_CALLBACK_PORT_CHANGED, /**< The status of the port changed. For example, someone may have
+                                        plugged a device in. The status parameter contains
+                                        CVMX_USB_COMPLETE_SUCCESS. Use cvmx_usb_get_status() to get
+                                        the new port status. */
+    __CVMX_USB_CALLBACK_END         /**< Do not use. Used internally for array bounds */
+} cvmx_usb_callback_t;
+
+/**
+ * USB state internal data. The contents of this structure
+ * may change in future SDKs. No data in it should be referenced
+ * by users of this API.
+ */
+typedef struct
+{
+    char data[65536];
+} cvmx_usb_state_t;
+
+/**
+ * USB callback functions are always of the following type.
+ * The parameters are as follows:
+ *      - state = USB device state populated by
+ *        cvmx_usb_initialize().
+ *      - reason = The cvmx_usb_callback_t used to register
+ *        the callback.
+ *      - status = The cvmx_usb_complete_t representing the
+ *        status code of a transaction.
+ *      - pipe_handle = The Pipe that caused this callback, or
+ *        -1 if this callback wasn't associated with a pipe.
+ *      - submit_handle = Transfer submit handle causing this
+ *        callback, or -1 if this callback wasn't associated
+ *        with a transfer.
+ *      - bytes_transferred = Actual number of bytes transferred.
+ *      - user_data = The user pointer supplied to the
+ *        function cvmx_usb_submit() or
+ *        cvmx_usb_register_callback() */
+typedef void (*cvmx_usb_callback_func_t)(cvmx_usb_state_t *state,
+                                         cvmx_usb_callback_t reason,
+                                         cvmx_usb_complete_t status,
+                                         int pipe_handle, int submit_handle,
+                                         int bytes_transferred, void *user_data);
+
+/**
+ * Flags to pass to the initialization function, cvmx_usb_initialize().
+ */
+typedef enum
+{
+    CVMX_USB_INITIALIZE_FLAGS_CLOCK_XO_XI = 1<<0,       /**< The USB port uses a 12MHz crystal as clock source
+                                                            at USB_XO and USB_XI. */
+    CVMX_USB_INITIALIZE_FLAGS_CLOCK_XO_GND = 1<<1,      /**< The USB port uses 12/24/48MHz 2.5V board clock
+                                                            source at USB_XO. USB_XI should be tied to GND.*/
+    CVMX_USB_INITIALIZE_FLAGS_CLOCK_AUTO = 0,           /**< Automatically determine clock type based on function
+                                                             in cvmx-helper-board.c. */
+    CVMX_USB_INITIALIZE_FLAGS_CLOCK_MHZ_MASK  = 3<<3,       /**< Mask for clock speed field */
+    CVMX_USB_INITIALIZE_FLAGS_CLOCK_12MHZ = 1<<3,       /**< Speed of reference clock or crystal */
+    CVMX_USB_INITIALIZE_FLAGS_CLOCK_24MHZ = 2<<3,       /**< Speed of reference clock */
+    CVMX_USB_INITIALIZE_FLAGS_CLOCK_48MHZ = 3<<3,       /**< Speed of reference clock */
+    /* Bits 3-4 are used to encode the clock frequency (see the three CLOCK_*MHZ values above) */
+    CVMX_USB_INITIALIZE_FLAGS_NO_DMA = 1<<5,            /**< Disable DMA and use polled IO for USB data transfers */
+    CVMX_USB_INITIALIZE_FLAGS_DEBUG_TRANSFERS = 1<<16,  /**< Enable extra console output for debugging USB transfers */
+    CVMX_USB_INITIALIZE_FLAGS_DEBUG_CALLBACKS = 1<<17,  /**< Enable extra console output for debugging USB callbacks */
+    CVMX_USB_INITIALIZE_FLAGS_DEBUG_INFO = 1<<18,       /**< Enable extra console output for USB informational data */
+    CVMX_USB_INITIALIZE_FLAGS_DEBUG_CALLS = 1<<19,      /**< Enable extra console output for every function call */
+    CVMX_USB_INITIALIZE_FLAGS_DEBUG_CSRS = 1<<20,       /**< Enable extra console output for every CSR access */
+    CVMX_USB_INITIALIZE_FLAGS_DEBUG_ALL = ((CVMX_USB_INITIALIZE_FLAGS_DEBUG_CSRS<<1)-1) - (CVMX_USB_INITIALIZE_FLAGS_DEBUG_TRANSFERS-1), /**< Mask of every DEBUG_* flag above: all bits from DEBUG_TRANSFERS (16) through DEBUG_CSRS (20) */
+} cvmx_usb_initialize_flags_t;
+
+/**
+ * Flags to pass when a pipe is created. Currently no flags
+ * need to be passed. Values with a leading "__" are internal.
+ */
+typedef enum
+{
+    CVMX_USB_PIPE_FLAGS_DEBUG_TRANSFERS = 1<<15,/**< Used to display CVMX_USB_INITIALIZE_FLAGS_DEBUG_TRANSFERS for a specific pipe only */
+    __CVMX_USB_PIPE_FLAGS_OPEN = 1<<16,         /**< Used internally to determine if a pipe is open. Do not use */
+    __CVMX_USB_PIPE_FLAGS_SCHEDULED = 1<<17,    /**< Used internally to determine if a pipe is actively using hardware. Do not use */
+    __CVMX_USB_PIPE_FLAGS_NEED_PING = 1<<18,    /**< Used internally to determine if a high speed pipe is in the ping state. Do not use */
+} cvmx_usb_pipe_flags_t;
+
+/**
+ * Return the number of USB ports supported by this Octeon
+ * chip. If the chip doesn't support USB, or is not supported
+ * by this API, a zero will be returned. Most Octeon chips
+ * support one usb port, but some support two ports.
+ * cvmx_usb_initialize() must be called on independent
+ * cvmx_usb_state_t structures.
+ *
+ * @return Number of ports, zero if usb isn't supported
+ */
+extern int cvmx_usb_get_num_ports(void);
+
+/**
+ * Initialize a USB port for use. This must be called before any
+ * other access to the Octeon USB port is made. The port starts
+ * off in the disabled state.
+ *
+ * @param state  Pointer to an empty cvmx_usb_state_t structure
+ *               that will be populated by the initialize call.
+ *               This structure is then passed to all other USB
+ *               functions.
+ * @param usb_port_number
+ *               Which Octeon USB port to initialize.
+ * @param flags  Flags to control hardware initialization. See
+ *               cvmx_usb_initialize_flags_t for the flag
+ *               definitions. Some flags are mandatory.
+ *
+ * @return CVMX_USB_SUCCESS on success, or a negative error code
+ *         defined in cvmx_usb_status_t.
+ */
+extern cvmx_usb_status_t cvmx_usb_initialize(cvmx_usb_state_t *state,
+                                             int usb_port_number,
+                                             cvmx_usb_initialize_flags_t flags);
+
+/**
+ * Shut down a USB port after a call to cvmx_usb_initialize().
+ * The port should be disabled with all pipes closed when this
+ * function is called.
+ *
+ * @param state  USB device state populated by
+ *               cvmx_usb_initialize().
+ *
+ * @return CVMX_USB_SUCCESS on success, or a negative error code
+ *         defined in cvmx_usb_status_t.
+ */
+extern cvmx_usb_status_t cvmx_usb_shutdown(cvmx_usb_state_t *state);
+
+/**
+ * Enable a USB port. After this call succeeds, the USB port is
+ * online and servicing requests.
+ *
+ * @param state  USB device state populated by
+ *               cvmx_usb_initialize().
+ *
+ * @return CVMX_USB_SUCCESS on success, or a negative error code
+ *         defined in cvmx_usb_status_t.
+ */
+extern cvmx_usb_status_t cvmx_usb_enable(cvmx_usb_state_t *state);
+
+/**
+ * Disable a USB port. After this call the USB port will not
+ * generate data transfers and will not generate events.
+ * Transactions in progress will fail and call their
+ * associated callbacks.
+ *
+ * @param state  USB device state populated by
+ *               cvmx_usb_initialize().
+ *
+ * @return CVMX_USB_SUCCESS on success, or a negative error code
+ *         defined in cvmx_usb_status_t.
+ */
+extern cvmx_usb_status_t cvmx_usb_disable(cvmx_usb_state_t *state);
+
+/**
+ * Get the current state of the USB port. Use this call to
+ * determine if the usb port has anything connected, is enabled,
+ * or has some sort of error condition. The return value of this
+ * call has "changed" bits to signal if the values of some fields
+ * have changed between calls. These "changed" fields are based
+ * on the last call to cvmx_usb_set_status(). In order to clear
+ * them, you must update the status through cvmx_usb_set_status().
+ *
+ * @param state  USB device state populated by
+ *               cvmx_usb_initialize().
+ *
+ * @return Port status information
+ */
+extern cvmx_usb_port_status_t cvmx_usb_get_status(cvmx_usb_state_t *state);
+
+/**
+ * Set the current state of the USB port. The status is used as
+ * a reference for the "changed" bits returned by
+ * cvmx_usb_get_status(). Other than serving as a reference, the
+ * status passed to this function is not used. No fields can be
+ * changed through this call.
+ *
+ * @param state  USB device state populated by
+ *               cvmx_usb_initialize().
+ * @param port_status
+ *               Port status to set, most likely returned by cvmx_usb_get_status()
+ */
+extern void cvmx_usb_set_status(cvmx_usb_state_t *state, cvmx_usb_port_status_t port_status);
+
+/**
+ * Open a virtual pipe between the host and a USB device. A pipe
+ * must be opened before data can be transferred between a device
+ * and Octeon.
+ *
+ * @param state      USB device state populated by
+ *                   cvmx_usb_initialize().
+ * @param flags      Optional pipe flags defined in
+ *                   cvmx_usb_pipe_flags_t.
+ * @param device_addr
+ *                   USB device address to open the pipe to
+ *                   (0-127).
+ * @param endpoint_num
+ *                   USB endpoint number to open the pipe to
+ *                   (0-15).
+ * @param device_speed
+ *                   The speed of the device the pipe is going
+ *                   to. This must match the device's speed,
+ *                   which may be different than the port speed.
+ * @param max_packet The maximum packet length the device can
+ *                   transmit/receive (low speed=0-8, full
+ *                   speed=0-1023, high speed=0-1024). This value
+ *                   comes from the standard endpoint descriptor
+ *                   field wMaxPacketSize bits <10:0>.
+ * @param transfer_type
+ *                   The type of transfer this pipe is for.
+ * @param transfer_dir
+ *                   The direction the pipe is in. This is not
+ *                   used for control pipes.
+ * @param interval   For ISOCHRONOUS and INTERRUPT transfers,
+ *                   this is how often the transfer is scheduled
+ *                   for. All other transfers should specify
+ *                   zero. The units are in frames (8000/sec at
+ *                   high speed, 1000/sec for full speed).
+ * @param multi_count
+ *                   For high speed devices, this is the maximum
+ *                   allowed number of packets per microframe.
+ *                   Specify zero for non high speed devices. This
+ *                   value comes from the standard endpoint descriptor
+ *                   field wMaxPacketSize bits <12:11>.
+ * @param hub_device_addr
+ *                   Hub device address this device is connected
+ *                   to. Devices connected directly to Octeon
+ *                   use zero. This is only used when the device
+ *                   is full/low speed behind a high speed hub.
+ *                   The address will be of the high speed hub,
+ *                   not any full speed hubs after it.
+ * @param hub_port   Which port on the hub the device is
+ *                   connected. Use zero for devices connected
+ *                   directly to Octeon. Like hub_device_addr,
+ *                   this is only used for full/low speed
+ *                   devices behind a high speed hub.
+ *
+ * @return A non negative value is a pipe handle. Negative
+ *         values are failure codes from cvmx_usb_status_t.
+ */
+extern int cvmx_usb_open_pipe(cvmx_usb_state_t *state,
+                              cvmx_usb_pipe_flags_t flags,
+                              int device_addr, int endpoint_num,
+                              cvmx_usb_speed_t device_speed, int max_packet,
+                              cvmx_usb_transfer_t transfer_type,
+                              cvmx_usb_direction_t transfer_dir, int interval,
+                              int multi_count, int hub_device_addr,
+                              int hub_port);
+
+/**
+ * Call to submit a USB Bulk transfer to a pipe.
+ *
+ * @param state     USB device state populated by
+ *                  cvmx_usb_initialize().
+ * @param pipe_handle
+ *                  Handle to the pipe for the transfer.
+ * @param buffer    Physical address of the data buffer in
+ *                  memory. Note that this is NOT A POINTER, but
+ *                  the full 64bit physical address of the
+ *                  buffer. This may be zero if buffer_length is
+ *                  zero.
+ * @param buffer_length
+ *                  Length of buffer in bytes.
+ * @param callback  Function to call when this transaction
+ *                  completes. If the return value of this
+ *                  function isn't an error, then this function
+ *                  is guaranteed to be called when the
+ *                  transaction completes. If this parameter is
+ *                  NULL, then the generic callback registered
+ *                  through cvmx_usb_register_callback is
+ *                  called. If both are NULL, then there is no
+ *                  way to know when a transaction completes.
+ * @param user_data User supplied data returned when the
+ *                  callback is called. This is only used if
+ *                  callback is not NULL.
+ *
+ * @return A submitted transaction handle or negative on
+ *         failure. Negative values are failure codes from
+ *         cvmx_usb_status_t.
+ */
+extern int cvmx_usb_submit_bulk(cvmx_usb_state_t *state, int pipe_handle,
+                                uint64_t buffer, int buffer_length,
+                                cvmx_usb_callback_func_t callback,
+                                void *user_data);
+
+/**
+ * Call to submit a USB Interrupt transfer to a pipe.
+ *
+ * @param state     USB device state populated by
+ *                  cvmx_usb_initialize().
+ * @param pipe_handle
+ *                  Handle to the pipe for the transfer.
+ * @param buffer    Physical address of the data buffer in
+ *                  memory. Note that this is NOT A POINTER, but
+ *                  the full 64bit physical address of the
+ *                  buffer. This may be zero if buffer_length is
+ *                  zero.
+ * @param buffer_length
+ *                  Length of buffer in bytes.
+ * @param callback  Function to call when this transaction
+ *                  completes. If the return value of this
+ *                  function isn't an error, then this function
+ *                  is guaranteed to be called when the
+ *                  transaction completes. If this parameter is
+ *                  NULL, then the generic callback registered
+ *                  through cvmx_usb_register_callback is
+ *                  called. If both are NULL, then there is no
+ *                  way to know when a transaction completes.
+ * @param user_data User supplied data returned when the
+ *                  callback is called. This is only used if
+ *                  callback is not NULL.
+ *
+ * @return A submitted transaction handle or negative on
+ *         failure. Negative values are failure codes from
+ *         cvmx_usb_status_t.
+ */
+extern int cvmx_usb_submit_interrupt(cvmx_usb_state_t *state, int pipe_handle,
+                                     uint64_t buffer, int buffer_length,
+                                     cvmx_usb_callback_func_t callback,
+                                     void *user_data);
+
+/**
+ * Call to submit a USB Control transfer to a pipe.
+ *
+ * @param state     USB device state populated by
+ *                  cvmx_usb_initialize().
+ * @param pipe_handle
+ *                  Handle to the pipe for the transfer.
+ * @param control_header
+ *                  USB 8 byte control header physical address.
+ *                  Note that this is NOT A POINTER, but the
+ *                  full 64bit physical address of the buffer.
+ * @param buffer    Physical address of the data buffer in
+ *                  memory. Note that this is NOT A POINTER, but
+ *                  the full 64bit physical address of the
+ *                  buffer. This may be zero if buffer_length is
+ *                  zero.
+ * @param buffer_length
+ *                  Length of buffer in bytes.
+ * @param callback  Function to call when this transaction
+ *                  completes. If the return value of this
+ *                  function isn't an error, then this function
+ *                  is guaranteed to be called when the
+ *                  transaction completes. If this parameter is
+ *                  NULL, then the generic callback registered
+ *                  through cvmx_usb_register_callback is
+ *                  called. If both are NULL, then there is no
+ *                  way to know when a transaction completes.
+ * @param user_data User supplied data returned when the
+ *                  callback is called. This is only used if
+ *                  callback is not NULL.
+ *
+ * @return A submitted transaction handle or negative on
+ *         failure. Negative values are failure codes from
+ *         cvmx_usb_status_t.
+ */
+extern int cvmx_usb_submit_control(cvmx_usb_state_t *state, int pipe_handle,
+                                   uint64_t control_header,
+                                   uint64_t buffer, int buffer_length,
+                                   cvmx_usb_callback_func_t callback,
+                                   void *user_data);
+
+/**
+ * Bit flags to pass to the cvmx_usb_submit_isochronous() function.
+ */
+typedef enum
+{
+    CVMX_USB_ISOCHRONOUS_FLAGS_ALLOW_SHORT = 1<<0,  /**< Do not return an error if a transfer is less than the maximum packet size of the device */
+    CVMX_USB_ISOCHRONOUS_FLAGS_ASAP = 1<<1,         /**< Schedule the transaction as soon as possible */
+} cvmx_usb_isochronous_flags_t;
+
+/**
+ * Call to submit a USB Isochronous transfer to a pipe.
+ *
+ * @param state     USB device state populated by
+ *                  cvmx_usb_initialize().
+ * @param pipe_handle
+ *                  Handle to the pipe for the transfer.
+ * @param start_frame
+ *                  Number of frames into the future to schedule
+ *                  this transaction.
+ * @param flags     Flags to control the transfer. See
+ *                  cvmx_usb_isochronous_flags_t for the flag
+ *                  definitions.
+ * @param number_packets
+ *                  Number of sequential packets to transfer.
+ *                  "packets" is a pointer to an array of this
+ *                  many packet structures.
+ * @param packets   Description of each transfer packet as
+ *                  defined by cvmx_usb_iso_packet_t. The array
+ *                  pointed to here must stay valid until the
+ *                  complete callback is called.
+ * @param buffer    Physical address of the data buffer in
+ *                  memory. Note that this is NOT A POINTER, but
+ *                  the full 64bit physical address of the
+ *                  buffer. This may be zero if buffer_length is
+ *                  zero.
+ * @param buffer_length
+ *                  Length of buffer in bytes.
+ * @param callback  Function to call when this transaction
+ *                  completes. If the return value of this
+ *                  function isn't an error, then this function
+ *                  is guaranteed to be called when the
+ *                  transaction completes. If this parameter is
+ *                  NULL, then the generic callback registered
+ *                  through cvmx_usb_register_callback is
+ *                  called. If both are NULL, then there is no
+ *                  way to know when a transaction completes.
+ * @param user_data User supplied data returned when the
+ *                  callback is called. This is only used if
+ *                  callback is not NULL.
+ *
+ * @return A submitted transaction handle or negative on
+ *         failure. Negative values are failure codes from
+ *         cvmx_usb_status_t.
+ */
+extern int cvmx_usb_submit_isochronous(cvmx_usb_state_t *state, int pipe_handle,
+                                       int start_frame, int flags,
+                                       int number_packets,
+                                       cvmx_usb_iso_packet_t packets[],
+                                       uint64_t buffer, int buffer_length,
+                                       cvmx_usb_callback_func_t callback,
+                                       void *user_data);
+
+/**
+ * Cancel one outstanding request in a pipe. Canceling a request
+ * can fail if the transaction has already completed before cancel
+ * is called. Even after a successful cancel call, it may take
+ * a frame or two for the cvmx_usb_poll() function to call the
+ * associated callback.
+ *
+ * @param state  USB device state populated by
+ *               cvmx_usb_initialize().
+ * @param pipe_handle
+ *               Pipe handle to cancel requests in.
+ * @param submit_handle
+ *               Handle to transaction to cancel, returned by the submit function.
+ *
+ * @return CVMX_USB_SUCCESS or a negative error code defined in
+ *         cvmx_usb_status_t.
+ */
+extern cvmx_usb_status_t cvmx_usb_cancel(cvmx_usb_state_t *state,
+                                         int pipe_handle, int submit_handle);
+
+
+/**
+ * Cancel all outstanding requests in a pipe. Logically all this
+ * does is call cvmx_usb_cancel() in a loop.
+ *
+ * @param state  USB device state populated by
+ *               cvmx_usb_initialize().
+ * @param pipe_handle
+ *               Pipe handle to cancel requests in.
+ *
+ * @return CVMX_USB_SUCCESS or a negative error code defined in
+ *         cvmx_usb_status_t.
+ */
+extern cvmx_usb_status_t cvmx_usb_cancel_all(cvmx_usb_state_t *state,
+                                             int pipe_handle);
+
+/**
+ * Close a pipe created with cvmx_usb_open_pipe().
+ *
+ * @param state  USB device state populated by
+ *               cvmx_usb_initialize().
+ * @param pipe_handle
+ *               Pipe handle to close.
+ *
+ * @return CVMX_USB_SUCCESS or a negative error code defined in
+ *         cvmx_usb_status_t. CVMX_USB_BUSY is returned if the
+ *         pipe has outstanding transfers.
+ */
+extern cvmx_usb_status_t cvmx_usb_close_pipe(cvmx_usb_state_t *state,
+                                             int pipe_handle);
+
+/**
+ * Register a function to be called when various USB events occur.
+ *
+ * @param state     USB device state populated by
+ *                  cvmx_usb_initialize().
+ * @param reason    Which event to register for.
+ * @param callback  Function to call when the event occurs.
+ * @param user_data User data parameter to the function.
+ *
+ * @return CVMX_USB_SUCCESS or a negative error code defined in
+ *         cvmx_usb_status_t.
+ */
+extern cvmx_usb_status_t cvmx_usb_register_callback(cvmx_usb_state_t *state,
+                                                    cvmx_usb_callback_t reason,
+                                                    cvmx_usb_callback_func_t callback,
+                                                    void *user_data);
+
+/**
+ * Get the current USB protocol level frame number. The frame
+ * number is always in the range of 0-0x7ff.
+ *
+ * @param state  USB device state populated by
+ *               cvmx_usb_initialize().
+ *
+ * @return USB frame number
+ */
+extern int cvmx_usb_get_frame_number(cvmx_usb_state_t *state);
+
+/**
+ * Poll the USB block for status and call all needed callback
+ * handlers. This function is meant to be called in the interrupt
+ * handler for the USB controller. It can also be called
+ * periodically in a loop for non-interrupt based operation.
+ *
+ * @param state  USB device state populated by
+ *               cvmx_usb_initialize().
+ *
+ * @return CVMX_USB_SUCCESS or a negative error code defined in
+ *         cvmx_usb_status_t.
+ */
+extern cvmx_usb_status_t cvmx_usb_poll(cvmx_usb_state_t *state);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif  /* __CVMX_USB_H__ */

diff --git a/drivers/staging/octeon-usb/cvmx-usbcx-defs.h b/drivers/staging/octeon-usb/cvmx-usbcx-defs.h
new file mode 100644
index 0000000..394e846
--- /dev/null
+++ b/drivers/staging/octeon-usb/cvmx-usbcx-defs.h

@@ -0,0 +1,1551 @@
+/***********************license start***************
+ * Copyright (c) 2003-2010  Cavium Networks (support@cavium.com). All rights
+ * reserved.
+ *
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *   * Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above
+ *     copyright notice, this list of conditions and the following
+ *     disclaimer in the documentation and/or other materials provided
+ *     with the distribution.
+
+ *   * Neither the name of Cavium Networks nor the names of
+ *     its contributors may be used to endorse or promote products
+ *     derived from this software without specific prior written
+ *     permission.
+
+ * This Software, including technical data, may be subject to U.S. export
+ * control laws, including the U.S. Export Administration Act and its associated
+ * regulations, and may be subject to export or import  regulations in other
+ * countries.
+
+ * TO THE MAXIMUM EXTENT PERMITTED BY LAW, THE SOFTWARE IS PROVIDED "AS IS"
+ * AND WITH ALL FAULTS AND CAVIUM NETWORKS MAKES NO PROMISES, REPRESENTATIONS OR
+ * WARRANTIES, EITHER EXPRESS, IMPLIED, STATUTORY, OR OTHERWISE, WITH RESPECT TO
+ * THE SOFTWARE, INCLUDING ITS CONDITION, ITS CONFORMITY TO ANY REPRESENTATION
+ * OR DESCRIPTION, OR THE EXISTENCE OF ANY LATENT OR PATENT DEFECTS, AND CAVIUM
+ * SPECIFICALLY DISCLAIMS ALL IMPLIED (IF ANY) WARRANTIES OF TITLE,
+ * MERCHANTABILITY, NONINFRINGEMENT, FITNESS FOR A PARTICULAR PURPOSE, LACK OF
+ * VIRUSES, ACCURACY OR COMPLETENESS, QUIET ENJOYMENT, QUIET POSSESSION OR
+ * CORRESPONDENCE TO DESCRIPTION. THE ENTIRE  RISK ARISING OUT OF USE OR
+ * PERFORMANCE OF THE SOFTWARE LIES WITH YOU.
+ ***********************license end**************************************/
+
+
+/**
+ * cvmx-usbcx-defs.h
+ *
+ * Configuration and status register (CSR) type definitions for
+ * Octeon usbcx.
+ *
+ */
+#ifndef __CVMX_USBCX_TYPEDEFS_H__
+#define __CVMX_USBCX_TYPEDEFS_H__
+
+#define CVMX_USBCXBASE 0x00016F0010000000ull /* base for bid 0 */
+#define CVMX_USBCXREG1(reg, bid) \
+	(CVMX_ADD_IO_SEG(CVMX_USBCXBASE | (reg)) + \
+	 ((bid) & 1) * 0x100000000000ull)
+#define CVMX_USBCXREG2(reg, bid, off) \
+	(CVMX_ADD_IO_SEG(CVMX_USBCXBASE | (reg)) + \
+	 (((off) & 7) + ((bid) & 1) * 0x8000000000ull) * 32)
+/* The ...X(off, bid) forms pick one of 8 register copies, 32 bytes apart. */
+#define CVMX_USBCX_GAHBCFG(bid)		CVMX_USBCXREG1(0x008, bid)
+#define CVMX_USBCX_GHWCFG3(bid)		CVMX_USBCXREG1(0x04c, bid)
+#define CVMX_USBCX_GINTMSK(bid)		CVMX_USBCXREG1(0x018, bid)
+#define CVMX_USBCX_GINTSTS(bid)		CVMX_USBCXREG1(0x014, bid)
+#define CVMX_USBCX_GNPTXFSIZ(bid)	CVMX_USBCXREG1(0x028, bid)
+#define CVMX_USBCX_GNPTXSTS(bid)	CVMX_USBCXREG1(0x02c, bid)
+#define CVMX_USBCX_GOTGCTL(bid)		CVMX_USBCXREG1(0x000, bid)
+#define CVMX_USBCX_GRSTCTL(bid)		CVMX_USBCXREG1(0x010, bid)
+#define CVMX_USBCX_GRXFSIZ(bid)		CVMX_USBCXREG1(0x024, bid)
+#define CVMX_USBCX_GRXSTSPH(bid)	CVMX_USBCXREG1(0x020, bid)
+#define CVMX_USBCX_GUSBCFG(bid)		CVMX_USBCXREG1(0x00c, bid)
+#define CVMX_USBCX_HAINT(bid)		CVMX_USBCXREG1(0x414, bid)
+#define CVMX_USBCX_HAINTMSK(bid)	CVMX_USBCXREG1(0x418, bid)
+#define CVMX_USBCX_HCCHARX(off, bid)	CVMX_USBCXREG2(0x500, bid, off)
+#define CVMX_USBCX_HCFG(bid)		CVMX_USBCXREG1(0x400, bid)
+#define CVMX_USBCX_HCINTMSKX(off, bid)	CVMX_USBCXREG2(0x50c, bid, off)
+#define CVMX_USBCX_HCINTX(off, bid)	CVMX_USBCXREG2(0x508, bid, off)
+#define CVMX_USBCX_HCSPLTX(off, bid)	CVMX_USBCXREG2(0x504, bid, off)
+#define CVMX_USBCX_HCTSIZX(off, bid)	CVMX_USBCXREG2(0x510, bid, off)
+#define CVMX_USBCX_HFIR(bid)		CVMX_USBCXREG1(0x404, bid)
+#define CVMX_USBCX_HFNUM(bid)		CVMX_USBCXREG1(0x408, bid)
+#define CVMX_USBCX_HPRT(bid)		CVMX_USBCXREG1(0x440, bid)
+#define CVMX_USBCX_HPTXFSIZ(bid)	CVMX_USBCXREG1(0x100, bid)
+#define CVMX_USBCX_HPTXSTS(bid)		CVMX_USBCXREG1(0x410, bid)
+
+/**
+ * cvmx_usbc#_gahbcfg
+ *
+ * Core AHB Configuration Register (GAHBCFG)
+ *
+ * This register can be used to configure the core after power-on or a change in
+ * mode of operation. This register mainly contains AHB system-related
+ * configuration parameters. The AHB is the processor interface to the O2P USB
+ * core. In general, software need not know about this interface except to
+ * program the values as specified.
+ *
+ * The application must program this register as part of the O2P USB core
+ * initialization. Do not change this register after the initial programming.
+ */
+union cvmx_usbcx_gahbcfg {
+	uint32_t u32;
+	/**
+	 * struct cvmx_usbcx_gahbcfg_s - bit-field view of the 32-bit u32 value
+	 * @ptxfemplvl: Periodic TxFIFO Empty Level (PTxFEmpLvl)
+	 *	Software should set this bit to 0x1.
+	 *	Indicates when the Periodic TxFIFO Empty Interrupt bit in the
+	 *	Core Interrupt register (GINTSTS.PTxFEmp) is triggered. This
+	 *	bit is used only in Slave mode.
+	 *	* 1'b0: GINTSTS.PTxFEmp interrupt indicates that the Periodic
+	 *	TxFIFO is half empty
+	 *	* 1'b1: GINTSTS.PTxFEmp interrupt indicates that the Periodic
+	 *	TxFIFO is completely empty
+	 * @nptxfemplvl: Non-Periodic TxFIFO Empty Level (NPTxFEmpLvl)
+	 *	Software should set this bit to 0x1.
+	 *	Indicates when the Non-Periodic TxFIFO Empty Interrupt bit in
+	 *	the Core Interrupt register (GINTSTS.NPTxFEmp) is triggered.
+	 *	This bit is used only in Slave mode.
+	 *	* 1'b0: GINTSTS.NPTxFEmp interrupt indicates that the Non-
+	 *	Periodic TxFIFO is half empty
+	 *	* 1'b1: GINTSTS.NPTxFEmp interrupt indicates that the Non-
+	 *	Periodic TxFIFO is completely empty
+	 * @dmaen: DMA Enable (DMAEn)
+	 *	* 1'b0: Core operates in Slave mode
+	 *	* 1'b1: Core operates in a DMA mode
+	 * @hbstlen: Burst Length/Type (HBstLen)
+	 *	This field has no effect and should be left as 0x0.
+	 * @glblintrmsk: Global Interrupt Mask (GlblIntrMsk)
+	 *	Software should set this field to 0x1.
+	 *	The application uses this bit to mask or unmask the interrupt
+	 *	line assertion to itself. Irrespective of this bit's setting,
+	 *	the interrupt status registers are updated by the core.
+	 *	* 1'b0: Mask the interrupt assertion to the application.
+	 *	* 1'b1: Unmask the interrupt assertion to the application.
+	 */
+	struct cvmx_usbcx_gahbcfg_s {
+		uint32_t reserved_9_31	: 23;
+		uint32_t ptxfemplvl	: 1;
+		uint32_t nptxfemplvl	: 1;
+		uint32_t reserved_6_6	: 1;
+		uint32_t dmaen		: 1;
+		uint32_t hbstlen	: 4;
+		uint32_t glblintrmsk	: 1;
+	} s;
+};
+typedef union cvmx_usbcx_gahbcfg cvmx_usbcx_gahbcfg_t;
+
+/**
+ * cvmx_usbc#_ghwcfg3
+ *
+ * User HW Config3 Register (GHWCFG3)
+ *
+ * This register contains the configuration options of the O2P USB core.
+ */
+union cvmx_usbcx_ghwcfg3 {
+	uint32_t u32;
+	/**
+	 * struct cvmx_usbcx_ghwcfg3_s - bit-field view of the 32-bit u32 value
+	 * @dfifodepth: DFIFO Depth (DfifoDepth)
+	 *	This value is in terms of 32-bit words.
+	 *	* Minimum value is 32
+	 *	* Maximum value is 32768
+	 * @ahbphysync: AHB and PHY Synchronous (AhbPhySync)
+	 *	Indicates whether AHB and PHY clocks are synchronous to
+	 *	each other.
+	 *	* 1'b0: No
+	 *	* 1'b1: Yes
+	 *	This bit is tied to 1.
+	 * @rsttype: Reset Style for Clocked always Blocks in RTL (RstType)
+	 *	* 1'b0: Asynchronous reset is used in the core
+	 *	* 1'b1: Synchronous reset is used in the core
+	 * @optfeature: Optional Features Removed (OptFeature)
+	 *	Indicates whether the User ID register, GPIO interface ports,
+	 *	and SOF toggle and counter ports were removed for gate count
+	 *	optimization.
+	 * @vendor_control_interface_support: Vendor Control Interface Support
+	 *	* 1'b0: Vendor Control Interface is not available on the core.
+	 *	* 1'b1: Vendor Control Interface is available.
+	 * @i2c_selection: I2C Selection
+	 *	* 1'b0: I2C Interface is not available on the core.
+	 *	* 1'b1: I2C Interface is available on the core.
+	 * @otgen: OTG Function Enabled (OtgEn)
+	 *	The application uses this bit to indicate the O2P USB core's
+	 *	OTG capabilities.
+	 *	* 1'b0: Not OTG capable
+	 *	* 1'b1: OTG Capable
+	 * @pktsizewidth: Width of Packet Size Counters (PktSizeWidth)
+	 *	* 3'b000: 4 bits
+	 *	* 3'b001: 5 bits
+	 *	* 3'b010: 6 bits
+	 *	* 3'b011: 7 bits
+	 *	* 3'b100: 8 bits
+	 *	* 3'b101: 9 bits
+	 *	* 3'b110: 10 bits
+	 *	* Others: Reserved
+	 * @xfersizewidth: Width of Transfer Size Counters (XferSizeWidth)
+	 *	* 4'b0000: 11 bits
+	 *	* 4'b0001: 12 bits
+	 *	* ...
+	 *	* 4'b1000: 19 bits
+	 *	* Others: Reserved
+	 */
+	struct cvmx_usbcx_ghwcfg3_s {
+		uint32_t dfifodepth				: 16;
+		uint32_t reserved_13_15				: 3;
+		uint32_t ahbphysync				: 1;
+		uint32_t rsttype				: 1;
+		uint32_t optfeature				: 1;
+		uint32_t vendor_control_interface_support	: 1;
+		uint32_t i2c_selection				: 1;
+		uint32_t otgen					: 1;
+		uint32_t pktsizewidth				: 3;
+		uint32_t xfersizewidth				: 4;
+	} s;
+};
+typedef union cvmx_usbcx_ghwcfg3 cvmx_usbcx_ghwcfg3_t;
+
+/**
+ * cvmx_usbc#_gintmsk
+ *
+ * Core Interrupt Mask Register (GINTMSK)
+ *
+ * This register works with the Core Interrupt register to interrupt the
+ * application. When an interrupt bit is masked, the interrupt associated with
+ * that bit will not be generated. However, the Core Interrupt (GINTSTS)
+ * register bit corresponding to that interrupt will still be set.
+ * Mask interrupt: 1'b0, Unmask interrupt: 1'b1
+ */
+union cvmx_usbcx_gintmsk {
+	uint32_t u32;
+	/**
+	 * struct cvmx_usbcx_gintmsk_s - bit-field view of the 32-bit u32 value
+	 * @wkupintmsk: Resume/Remote Wakeup Detected Interrupt Mask
+	 *	(WkUpIntMsk)
+	 * @sessreqintmsk: Session Request/New Session Detected Interrupt Mask
+	 *	(SessReqIntMsk)
+	 * @disconnintmsk: Disconnect Detected Interrupt Mask (DisconnIntMsk)
+	 * @conidstschngmsk: Connector ID Status Change Mask (ConIDStsChngMsk)
+	 * @ptxfempmsk: Periodic TxFIFO Empty Mask (PTxFEmpMsk)
+	 * @hchintmsk: Host Channels Interrupt Mask (HChIntMsk)
+	 * @prtintmsk: Host Port Interrupt Mask (PrtIntMsk)
+	 * @fetsuspmsk: Data Fetch Suspended Mask (FetSuspMsk)
+	 * @incomplpmsk: Incomplete Periodic Transfer Mask (incomplPMsk)
+	 *	Incomplete Isochronous OUT Transfer Mask
+	 *	(incompISOOUTMsk)
+	 * @incompisoinmsk: Incomplete Isochronous IN Transfer Mask
+	 *	(incompISOINMsk)
+	 * @oepintmsk: OUT Endpoints Interrupt Mask (OEPIntMsk)
+	 * @inepintmsk: IN Endpoints Interrupt Mask (INEPIntMsk)
+	 * @epmismsk: Endpoint Mismatch Interrupt Mask (EPMisMsk)
+	 * @eopfmsk: End of Periodic Frame Interrupt Mask (EOPFMsk)
+	 * @isooutdropmsk: Isochronous OUT Packet Dropped Interrupt Mask
+	 *	(ISOOutDropMsk)
+	 * @enumdonemsk: Enumeration Done Mask (EnumDoneMsk)
+	 * @usbrstmsk: USB Reset Mask (USBRstMsk)
+	 * @usbsuspmsk: USB Suspend Mask (USBSuspMsk)
+	 * @erlysuspmsk: Early Suspend Mask (ErlySuspMsk)
+	 * @i2cint: I2C Interrupt Mask (I2CINT)
+	 * @ulpickintmsk: ULPI Carkit Interrupt Mask (ULPICKINTMsk)
+	 *	I2C Carkit Interrupt Mask (I2CCKINTMsk)
+	 * @goutnakeffmsk: Global OUT NAK Effective Mask (GOUTNakEffMsk)
+	 * @ginnakeffmsk: Global Non-Periodic IN NAK Effective Mask
+	 *	(GINNakEffMsk)
+	 * @nptxfempmsk: Non-Periodic TxFIFO Empty Mask (NPTxFEmpMsk)
+	 * @rxflvlmsk: Receive FIFO Non-Empty Mask (RxFLvlMsk)
+	 * @sofmsk: Start of (micro)Frame Mask (SofMsk)
+	 * @otgintmsk: OTG Interrupt Mask (OTGIntMsk)
+	 * @modemismsk: Mode Mismatch Interrupt Mask (ModeMisMsk)
+	 */
+	struct cvmx_usbcx_gintmsk_s {
+		uint32_t wkupintmsk		: 1;
+		uint32_t sessreqintmsk		: 1;
+		uint32_t disconnintmsk		: 1;
+		uint32_t conidstschngmsk	: 1;
+		uint32_t reserved_27_27		: 1;
+		uint32_t ptxfempmsk		: 1;
+		uint32_t hchintmsk		: 1;
+		uint32_t prtintmsk		: 1;
+		uint32_t reserved_23_23		: 1;
+		uint32_t fetsuspmsk		: 1;
+		uint32_t incomplpmsk		: 1;
+		uint32_t incompisoinmsk		: 1;
+		uint32_t oepintmsk		: 1;
+		uint32_t inepintmsk		: 1;
+		uint32_t epmismsk		: 1;
+		uint32_t reserved_16_16		: 1;
+		uint32_t eopfmsk		: 1;
+		uint32_t isooutdropmsk		: 1;
+		uint32_t enumdonemsk		: 1;
+		uint32_t usbrstmsk		: 1;
+		uint32_t usbsuspmsk		: 1;
+		uint32_t erlysuspmsk		: 1;
+		uint32_t i2cint			: 1;
+		uint32_t ulpickintmsk		: 1;
+		uint32_t goutnakeffmsk		: 1;
+		uint32_t ginnakeffmsk		: 1;
+		uint32_t nptxfempmsk		: 1;
+		uint32_t rxflvlmsk		: 1;
+		uint32_t sofmsk			: 1;
+		uint32_t otgintmsk		: 1;
+		uint32_t modemismsk		: 1;
+		uint32_t reserved_0_0		: 1;
+	} s;
+};
+typedef union cvmx_usbcx_gintmsk cvmx_usbcx_gintmsk_t;
+
+/**
+ * cvmx_usbc#_gintsts
+ *
+ * Core Interrupt Register (GINTSTS)
+ *
+ * This register interrupts the application for system-level events in the
+ * current mode of operation (Device mode or Host mode). It is shown in
+ * Interrupt. Some of the bits in this register are valid only in Host mode,
+ * while others are valid in Device mode only. This register also indicates the
+ * current mode of operation. In order to clear the interrupt status bits of
+ * type R_SS_WC, the application must write 1'b1 into the bit. The FIFO status
+ * interrupts are read only; once software reads from or writes to the FIFO
+ * while servicing these interrupts, FIFO interrupt conditions are cleared
+ * automatically.
+ */
+union cvmx_usbcx_gintsts {
+	uint32_t u32;
+	/**
+	 * struct cvmx_usbcx_gintsts_s
+	 * @wkupint: Resume/Remote Wakeup Detected Interrupt (WkUpInt)
+	 *	In Device mode, this interrupt is asserted when a resume is
+	 *	detected on the USB. In Host mode, this interrupt is asserted
+	 *	when a remote wakeup is detected on the USB.
+	 *	For more information on how to use this interrupt, see "Partial
+	 *	Power-Down and Clock Gating Programming Model" on
+	 *	page 353.
+	 * @sessreqint: Session Request/New Session Detected Interrupt
+	 *		(SessReqInt)
+	 *	In Host mode, this interrupt is asserted when a session request
+	 *	is detected from the device. In Device mode, this interrupt is
+	 *	asserted when the utmiotg_bvalid signal goes high.
+	 *	For more information on how to use this interrupt, see "Partial
+	 *	Power-Down and Clock Gating Programming Model" on
+	 *	page 353.
+	 * @disconnint: Disconnect Detected Interrupt (DisconnInt)
+	 *	Asserted when a device disconnect is detected.
+	 * @conidstschng: Connector ID Status Change (ConIDStsChng)
+	 *	The core sets this bit when there is a change in connector ID
+	 *	status.
+	 * @ptxfemp: Periodic TxFIFO Empty (PTxFEmp)
+	 *	Asserted when the Periodic Transmit FIFO is either half or
+	 *	completely empty and there is space for at least one entry to be
+	 *	written in the Periodic Request Queue. The half or completely
+	 *	empty status is determined by the Periodic TxFIFO Empty Level
+	 *	bit in the Core AHB Configuration register
+	 *	(GAHBCFG.PTxFEmpLvl).
+	 * @hchint: Host Channels Interrupt (HChInt)
+	 *	The core sets this bit to indicate that an interrupt is pending
+	 *	on one of the channels of the core (in Host mode). The
+	 *	application must read the Host All Channels Interrupt (HAINT)
+	 *	register to determine the exact number of the channel on which
+	 *	the interrupt occurred, and then read the corresponding Host
+	 *	Channel-n Interrupt (HCINTn) register to determine the exact
+	 *	cause of the interrupt. The application must clear the
+	 *	appropriate status bit in the HCINTn register to clear this bit.
+	 * @prtint: Host Port Interrupt (PrtInt)
+	 *	The core sets this bit to indicate a change in port status of
+	 *	one of the O2P USB core ports in Host mode. The application must
+	 *	read the Host Port Control and Status (HPRT) register to
+	 *	determine the exact event that caused this interrupt. The
+	 *	application must clear the appropriate status bit in the Host
+	 *	Port Control and Status register to clear this bit.
+	 * @fetsusp: Data Fetch Suspended (FetSusp)
+	 *	This interrupt is valid only in DMA mode. This interrupt
+	 *	indicates that the core has stopped fetching data for IN
+	 *	endpoints due to the unavailability of TxFIFO space or Request
+	 *	Queue space. This interrupt is used by the application for an
+	 *	endpoint mismatch algorithm.
+	 * @incomplp: Incomplete Periodic Transfer (incomplP)
+	 *	In Host mode, the core sets this interrupt bit when there are
+	 *	incomplete periodic transactions still pending which are
+	 *	scheduled for the current microframe.
+	 *	Incomplete Isochronous OUT Transfer (incompISOOUT)
+	 *	In Device mode, the core sets this interrupt to indicate that
+	 *	there is at least one isochronous OUT endpoint on which the
+	 *	transfer is not completed in the current microframe. This
+	 *	interrupt is asserted along with the End of Periodic Frame
+	 *	Interrupt (EOPF) bit in this register.
+	 * @incompisoin: Incomplete Isochronous IN Transfer (incompISOIN)
+	 *	The core sets this interrupt to indicate that there is at least
+	 *	one isochronous IN endpoint on which the transfer is not
+	 *	completed in the current microframe. This interrupt is asserted
+	 *	along with the End of Periodic Frame Interrupt (EOPF) bit in
+	 *	this register.
+	 * @oepint: OUT Endpoints Interrupt (OEPInt)
+	 *	The core sets this bit to indicate that an interrupt is pending
+	 *	on one of the OUT endpoints of the core (in Device mode). The
+	 *	application must read the Device All Endpoints Interrupt
+	 *	(DAINT) register to determine the exact number of the OUT
+	 *	endpoint on which the interrupt occurred, and then read the
+	 *	corresponding Device OUT Endpoint-n Interrupt (DOEPINTn)
+	 *	register to determine the exact cause of the interrupt. The
+	 *	application must clear the appropriate status bit in the
+	 *	corresponding DOEPINTn register to clear this bit.
+	 * @iepint: IN Endpoints Interrupt (IEPInt)
+	 *	The core sets this bit to indicate that an interrupt is pending
+	 *	on one of the IN endpoints of the core (in Device mode). The
+	 *	application must read the Device All Endpoints Interrupt
+	 *	(DAINT) register to determine the exact number of the IN
+	 *	endpoint on which the interrupt occurred, and then read the
+	 *	corresponding Device IN Endpoint-n Interrupt (DIEPINTn)
+	 *	register to determine the exact cause of the interrupt. The
+	 *	application must clear the appropriate status bit in the
+	 *	corresponding DIEPINTn register to clear this bit.
+	 * @epmis: Endpoint Mismatch Interrupt (EPMis)
+	 *	Indicates that an IN token has been received for a non-periodic
+	 *	endpoint, but the data for another endpoint is present in the
+	 *	top of the Non-Periodic Transmit FIFO and the IN endpoint
+	 *	mismatch count programmed by the application has expired.
+	 * @eopf: End of Periodic Frame Interrupt (EOPF)
+	 *	Indicates that the period specified in the Periodic Frame
+	 *	Interval field of the Device Configuration register
+	 *	(DCFG.PerFrInt) has been reached in the current microframe.
+	 * @isooutdrop: Isochronous OUT Packet Dropped Interrupt (ISOOutDrop)
+	 *	The core sets this bit when it fails to write an isochronous OUT
+	 *	packet into the RxFIFO because the RxFIFO doesn't have
+	 *	enough space to accommodate a maximum packet size packet
+	 *	for the isochronous OUT endpoint.
+	 * @enumdone: Enumeration Done (EnumDone)
+	 *	The core sets this bit to indicate that speed enumeration is
+	 *	complete. The application must read the Device Status (DSTS)
+	 *	register to obtain the enumerated speed.
+	 * @usbrst: USB Reset (USBRst)
+	 *	The core sets this bit to indicate that a reset is detected on
+	 *	the USB.
+	 * @usbsusp: USB Suspend (USBSusp)
+	 *	The core sets this bit to indicate that a suspend was detected
+	 *	on the USB. The core enters the Suspended state when there
+	 *	is no activity on the phy_line_state_i signal for an extended
+	 *	period of time.
+	 * @erlysusp: Early Suspend (ErlySusp)
+	 *	The core sets this bit to indicate that an Idle state has been
+	 *	detected on the USB for 3 ms.
+	 * @i2cint: I2C Interrupt (I2CINT)
+	 *	This bit is always 0x0.
+	 * @ulpickint: ULPI Carkit Interrupt (ULPICKINT)
+	 *	This bit is always 0x0.
+	 * @goutnakeff: Global OUT NAK Effective (GOUTNakEff)
+	 *	Indicates that the Set Global OUT NAK bit in the Device Control
+	 *	register (DCTL.SGOUTNak), set by the application, has taken
+	 *	effect in the core. This bit can be cleared by writing the Clear
+	 *	Global OUT NAK bit in the Device Control register
+	 *	(DCTL.CGOUTNak).
+	 * @ginnakeff: Global IN Non-Periodic NAK Effective (GINNakEff)
+	 *	Indicates that the Set Global Non-Periodic IN NAK bit in the
+	 *	Device Control register (DCTL.SGNPInNak), set by the
+	 *	application, has taken effect in the core. That is, the core has
+	 *	sampled the Global IN NAK bit set by the application. This bit
+	 *	can be cleared by clearing the Clear Global Non-Periodic IN
+	 *	NAK bit in the Device Control register (DCTL.CGNPInNak).
+	 *	This interrupt does not necessarily mean that a NAK handshake
+	 *	is sent out on the USB. The STALL bit takes precedence over
+	 *	the NAK bit.
+	 * @nptxfemp: Non-Periodic TxFIFO Empty (NPTxFEmp)
+	 *	This interrupt is asserted when the Non-Periodic TxFIFO is
+	 *	either half or completely empty, and there is space for at least
+	 *	one entry to be written to the Non-Periodic Transmit Request
+	 *	Queue. The half or completely empty status is determined by
+	 *	the Non-Periodic TxFIFO Empty Level bit in the Core AHB
+	 *	Configuration register (GAHBCFG.NPTxFEmpLvl).
+	 * @rxflvl: RxFIFO Non-Empty (RxFLvl)
+	 *	Indicates that there is at least one packet pending to be read
+	 *	from the RxFIFO.
+	 * @sof: Start of (micro)Frame (Sof)
+	 *	In Host mode, the core sets this bit to indicate that an SOF
+	 *	(FS), micro-SOF (HS), or Keep-Alive (LS) is transmitted on the
+	 *	USB. The application must write a 1 to this bit to clear the
+	 *	interrupt.
+	 *	In Device mode, the core sets this bit to indicate that an
+	 *	SOF token has been received on the USB. The application can read
+	 *	the Device Status register to get the current (micro)frame
+	 *	number. This interrupt is seen only when the core is operating
+	 *	at either HS or FS.
+	 * @otgint: OTG Interrupt (OTGInt)
+	 *	The core sets this bit to indicate an OTG protocol event. The
+	 *	application must read the OTG Interrupt Status (GOTGINT)
+	 *	register to determine the exact event that caused this
+	 *	interrupt. The application must clear the appropriate status bit
+	 *	in the GOTGINT register to clear this bit.
+	 * @modemis: Mode Mismatch Interrupt (ModeMis)
+	 *	The core sets this bit when the application is trying to access:
+	 *	* A Host mode register, when the core is operating in Device
+	 *	mode
+	 *	* A Device mode register, when the core is operating in Host
+	 *	mode
+	 *	The register access is completed on the AHB with an OKAY
+	 *	response, but is ignored by the core internally and doesn't
+	 *	affect the operation of the core.
+	 * @curmod: Current Mode of Operation (CurMod)
+	 *	Indicates the current mode of operation.
+	 *	* 1'b0: Device mode
+	 *	* 1'b1: Host mode
+	 */
+	struct cvmx_usbcx_gintsts_s {
+		uint32_t wkupint	: 1;
+		uint32_t sessreqint	: 1;
+		uint32_t disconnint	: 1;
+		uint32_t conidstschng	: 1;
+		uint32_t reserved_27_27	: 1;
+		uint32_t ptxfemp	: 1;
+		uint32_t hchint		: 1;
+		uint32_t prtint		: 1;
+		uint32_t reserved_23_23	: 1;
+		uint32_t fetsusp	: 1;
+		uint32_t incomplp	: 1;
+		uint32_t incompisoin	: 1;
+		uint32_t oepint		: 1;
+		uint32_t iepint		: 1;
+		uint32_t epmis		: 1;
+		uint32_t reserved_16_16	: 1;
+		uint32_t eopf		: 1;
+		uint32_t isooutdrop	: 1;
+		uint32_t enumdone	: 1;
+		uint32_t usbrst		: 1;
+		uint32_t usbsusp	: 1;
+		uint32_t erlysusp	: 1;
+		uint32_t i2cint		: 1;
+		uint32_t ulpickint	: 1;
+		uint32_t goutnakeff	: 1;
+		uint32_t ginnakeff	: 1;
+		uint32_t nptxfemp	: 1;
+		uint32_t rxflvl		: 1;
+		uint32_t sof		: 1;
+		uint32_t otgint		: 1;
+		uint32_t modemis	: 1;
+		uint32_t curmod		: 1;
+	} s;
+};
+typedef union cvmx_usbcx_gintsts cvmx_usbcx_gintsts_t;
+
+/**
+ * cvmx_usbc#_gnptxfsiz
+ *
+ * Non-Periodic Transmit FIFO Size Register (GNPTXFSIZ)
+ *
+ * The application can program the RAM size and the memory start address for the
+ * Non-Periodic TxFIFO.
+ *
+ * The union gives two views of the same 32 bits: u32 is the raw register word,
+ * s splits it into the named fields documented below.
+ */
+union cvmx_usbcx_gnptxfsiz {
+	uint32_t u32;
+	/**
+	 * struct cvmx_usbcx_gnptxfsiz_s
+	 * @nptxfdep: Non-Periodic TxFIFO Depth (NPTxFDep)
+	 *	This value is in terms of 32-bit words.
+	 *	* Minimum value is 16
+	 *	* Maximum value is 32768
+	 * @nptxfstaddr: Non-Periodic Transmit RAM Start Address (NPTxFStAddr)
+	 *	This field contains the memory start address for Non-Periodic
+	 *	Transmit FIFO RAM.
+	 *
+	 * The two 16-bit members fill the whole 32-bit word.
+	 * NOTE(review): which half of the word each member occupies depends on
+	 * the compiler's bitfield ordering - confirm against the target ABI.
+	 */
+	struct cvmx_usbcx_gnptxfsiz_s {
+		uint32_t nptxfdep	: 16;
+		uint32_t nptxfstaddr	: 16;
+	} s;
+};
+typedef union cvmx_usbcx_gnptxfsiz cvmx_usbcx_gnptxfsiz_t;
+
+/**
+ * cvmx_usbc#_gnptxsts
+ *
+ * Non-Periodic Transmit FIFO/Queue Status Register (GNPTXSTS)
+ *
+ * This read-only register contains the free space information for the
+ * Non-Periodic TxFIFO and the Non-Periodic Transmit Request Queue.
+ *
+ * The union gives two views of the same 32 bits: u32 is the raw register word,
+ * s splits it into the named fields documented below.
+ */
+union cvmx_usbcx_gnptxsts {
+	uint32_t u32;
+	/**
+	 * struct cvmx_usbcx_gnptxsts_s
+	 * @nptxqtop: Top of the Non-Periodic Transmit Request Queue (NPTxQTop)
+	 *	Entry in the Non-Periodic Tx Request Queue that is currently
+	 *	being processed by the MAC.
+	 *	* Bits [30:27]: Channel/endpoint number
+	 *	* Bits [26:25]:
+	 *	- 2'b00: IN/OUT token
+	 *	- 2'b01: Zero-length transmit packet (device IN/host OUT)
+	 *	- 2'b10: PING/CSPLIT token
+	 *	- 2'b11: Channel halt command
+	 *	* Bit [24]: Terminate (last entry for selected channel/endpoint)
+	 * @nptxqspcavail: Non-Periodic Transmit Request Queue Space Available
+	 *	(NPTxQSpcAvail)
+	 *	Indicates the amount of free space available in the Non-
+	 *	Periodic Transmit Request Queue. This queue holds both IN
+	 *	and OUT requests in Host mode. Device mode has only IN
+	 *	requests.
+	 *	* 8'h0: Non-Periodic Transmit Request Queue is full
+	 *	* 8'h1: 1 location available
+	 *	* 8'h2: 2 locations available
+	 *	* n: n locations available (0..8)
+	 *	* Others: Reserved
+	 * @nptxfspcavail: Non-Periodic TxFIFO Space Avail (NPTxFSpcAvail)
+	 *	Indicates the amount of free space available in the Non-
+	 *	Periodic TxFIFO.
+	 *	Values are in terms of 32-bit words.
+	 *	* 16'h0: Non-Periodic TxFIFO is full
+	 *	* 16'h1: 1 word available
+	 *	* 16'h2: 2 words available
+	 *	* 16'hn: n words available (where n is 0..32768)
+	 *	* 16'h8000: 32768 words available
+	 *	* Others: Reserved
+	 *
+	 * The bit numbers quoted for @nptxqtop are register bit positions, not
+	 * offsets inside the 7-bit member. Together with the reserved_31_31
+	 * name they imply that members are declared from bit 31 downward:
+	 * nptxqtop is [30:24], nptxqspcavail [23:16], nptxfspcavail [15:0].
+	 * NOTE(review): this matches memory only where the compiler allocates
+	 * bitfields most-significant-bit first - confirm for the target ABI.
+	 */
+	struct cvmx_usbcx_gnptxsts_s {
+		uint32_t reserved_31_31	: 1;
+		uint32_t nptxqtop	: 7;
+		uint32_t nptxqspcavail	: 8;
+		uint32_t nptxfspcavail	: 16;
+	} s;
+};
+typedef union cvmx_usbcx_gnptxsts cvmx_usbcx_gnptxsts_t;
+
+/**
+ * cvmx_usbc#_grstctl
+ *
+ * Core Reset Register (GRSTCTL)
+ *
+ * The application uses this register to reset various hardware features inside
+ * the core.
+ *
+ * The union gives two views of the same 32 bits: u32 is the raw register word,
+ * s splits it into the named fields documented below.
+ */
+union cvmx_usbcx_grstctl {
+	uint32_t u32;
+	/**
+	 * struct cvmx_usbcx_grstctl_s
+	 * @ahbidle: AHB Master Idle (AHBIdle)
+	 *	Indicates that the AHB Master State Machine is in the IDLE
+	 *	condition.
+	 * @dmareq: DMA Request Signal (DMAReq)
+	 *	Indicates that the DMA request is in progress. Used for debug.
+	 * @txfnum: TxFIFO Number (TxFNum)
+	 *	This is the FIFO number that must be flushed using the TxFIFO
+	 *	Flush bit. This field must not be changed until the core clears
+	 *	the TxFIFO Flush bit.
+	 *	* 5'h0: Non-Periodic TxFIFO flush
+	 *	* 5'h1: Periodic TxFIFO 1 flush in Device mode or Periodic
+	 *	TxFIFO flush in Host mode
+	 *	* 5'h2: Periodic TxFIFO 2 flush in Device mode
+	 *	- ...
+	 *	* 5'hF: Periodic TxFIFO 15 flush in Device mode
+	 *	* 5'h10: Flush all the Periodic and Non-Periodic TxFIFOs in the
+	 *	core
+	 * @txfflsh: TxFIFO Flush (TxFFlsh)
+	 *	This bit selectively flushes a single or all transmit FIFOs, but
+	 *	cannot do so if the core is in the midst of a transaction.
+	 *	The application must only write this bit after checking that the
+	 *	core is neither writing to the TxFIFO nor reading from the
+	 *	TxFIFO.
+	 *	The application must wait until the core clears this bit before
+	 *	performing any operations. This bit takes 8 clocks (of phy_clk
+	 *	or hclk, whichever is slower) to clear.
+	 * @rxfflsh: RxFIFO Flush (RxFFlsh)
+	 *	The application can flush the entire RxFIFO using this bit, but
+	 *	must first ensure that the core is not in the middle of a
+	 *	transaction.
+	 *	The application must only write to this bit after checking that
+	 *	the core is neither reading from the RxFIFO nor writing to the
+	 *	RxFIFO.
+	 *	The application must wait until the bit is cleared before
+	 *	performing any other operations. This bit will take 8 clocks
+	 *	(slowest of PHY or AHB clock) to clear.
+	 * @intknqflsh: IN Token Sequence Learning Queue Flush (INTknQFlsh)
+	 *	The application writes this bit to flush the IN Token Sequence
+	 *	Learning Queue.
+	 * @frmcntrrst: Host Frame Counter Reset (FrmCntrRst)
+	 *	The application writes this bit to reset the (micro)frame number
+	 *	counter inside the core. When the (micro)frame counter is reset,
+	 *	the subsequent SOF sent out by the core will have a
+	 *	(micro)frame number of 0.
+	 * @hsftrst: HClk Soft Reset (HSftRst)
+	 *	The application uses this bit to flush the control logic in the
+	 *	AHB Clock domain. Only AHB Clock Domain pipelines are reset.
+	 *	* FIFOs are not flushed with this bit.
+	 *	* All state machines in the AHB clock domain are reset to the
+	 *	Idle state after terminating the transactions on the AHB,
+	 *	following the protocol.
+	 *	* CSR control bits used by the AHB clock domain state
+	 *	machines are cleared.
+	 *	* To clear this interrupt, status mask bits that control the
+	 *	interrupt status and are generated by the AHB clock domain
+	 *	state machine are cleared.
+	 *	* Because interrupt status bits are not cleared, the application
+	 *	can get the status of any core events that occurred after it set
+	 *	this bit.
+	 *	This is a self-clearing bit that the core clears after all
+	 *	necessary logic is reset in the core. This may take several
+	 *	clocks, depending on the core's current state.
+	 * @csftrst: Core Soft Reset (CSftRst)
+	 *	Resets the hclk and phy_clock domains as follows:
+	 *	* Clears the interrupts and all the CSR registers except the
+	 *	following register bits:
+	 *	- PCGCCTL.RstPdwnModule
+	 *	- PCGCCTL.GateHclk
+	 *	- PCGCCTL.PwrClmp
+	 *	- PCGCCTL.StopPPhyLPwrClkSelclk
+	 *	- GUSBCFG.PhyLPwrClkSel
+	 *	- GUSBCFG.DDRSel
+	 *	- GUSBCFG.PHYSel
+	 *	- GUSBCFG.FSIntf
+	 *	- GUSBCFG.ULPI_UTMI_Sel
+	 *	- GUSBCFG.PHYIf
+	 *	- HCFG.FSLSPclkSel
+	 *	- DCFG.DevSpd
+	 *	* All module state machines (except the AHB Slave Unit) are
+	 *	reset to the IDLE state, and all the transmit FIFOs and the
+	 *	receive FIFO are flushed.
+	 *	* Any transactions on the AHB Master are terminated as soon
+	 *	as possible, after gracefully completing the last data phase of
+	 *	an AHB transfer. Any transactions on the USB are terminated
+	 *	immediately.
+	 *	The application can write to this bit any time it wants to reset
+	 *	the core. This is a self-clearing bit and the core clears this
+	 *	bit after all the necessary logic is reset in the core, which
+	 *	may take several clocks, depending on the current state of the
+	 *	core. Once this bit is cleared software should wait at least 3
+	 *	PHY clocks before doing any access to the PHY domain
+	 *	(synchronization delay). Software should also check that
+	 *	bit 31 of this register is 1 (AHB Master is IDLE) before
+	 *	starting any operation.
+	 *	Typically software reset is used during software development
+	 *	and also when you dynamically change the PHY selection bits
+	 *	in the USB configuration registers listed above. When you
+	 *	change the PHY, the corresponding clock for the PHY is
+	 *	selected and used in the PHY domain. Once a new clock is
+	 *	selected, the PHY domain has to be reset for proper operation.
+	 *
+	 * Members are declared from bit 31 downward, as the reserved_11_29
+	 * name and the "bit 31 ... AHB Master is IDLE" remark under @csftrst
+	 * indicate: ahbidle 31, dmareq 30, reserved [29:11], txfnum [10:6],
+	 * txfflsh 5, rxfflsh 4, intknqflsh 3, frmcntrrst 2, hsftrst 1,
+	 * csftrst 0.
+	 * NOTE(review): this matches memory only where the compiler allocates
+	 * bitfields most-significant-bit first - confirm for the target ABI.
+	 */
+	struct cvmx_usbcx_grstctl_s {
+		uint32_t ahbidle	: 1;
+		uint32_t dmareq		: 1;
+		uint32_t reserved_11_29	: 19;
+		uint32_t txfnum		: 5;
+		uint32_t txfflsh	: 1;
+		uint32_t rxfflsh	: 1;
+		uint32_t intknqflsh	: 1;
+		uint32_t frmcntrrst	: 1;
+		uint32_t hsftrst	: 1;
+		uint32_t csftrst	: 1;
+	} s;
+};
+typedef union cvmx_usbcx_grstctl cvmx_usbcx_grstctl_t;
+
+/**
+ * cvmx_usbc#_grxfsiz
+ *
+ * Receive FIFO Size Register (GRXFSIZ)
+ *
+ * The application can program the RAM size that must be allocated to the
+ * RxFIFO.
+ *
+ * The union gives two views of the same 32 bits: u32 is the raw register word,
+ * s splits it into the named fields documented below.
+ */
+union cvmx_usbcx_grxfsiz {
+	uint32_t u32;
+	/**
+	 * struct cvmx_usbcx_grxfsiz_s
+	 * @rxfdep: RxFIFO Depth (RxFDep)
+	 *	This value is in terms of 32-bit words.
+	 *	* Minimum value is 16
+	 *	* Maximum value is 32768
+	 *
+	 * The reserved_16_31 name marks the upper 16 bits as unused, which
+	 * leaves rxfdep in bits [15:0].
+	 * NOTE(review): this matches memory only where the compiler allocates
+	 * bitfields most-significant-bit first - confirm for the target ABI.
+	 */
+	struct cvmx_usbcx_grxfsiz_s {
+		uint32_t reserved_16_31	: 16;
+		uint32_t rxfdep		: 16;
+	} s;
+};
+typedef union cvmx_usbcx_grxfsiz cvmx_usbcx_grxfsiz_t;
+
+/**
+ * cvmx_usbc#_grxstsph
+ *
+ * Receive Status Read and Pop Register, Host Mode (GRXSTSPH)
+ *
+ * A read to the Receive Status Read and Pop register returns and additionally
+ * pops the top data entry out of the RxFIFO.
+ * This description is only valid when the core is in Host Mode. For Device Mode
+ * use USBC_GRXSTSPD instead.
+ * NOTE: GRXSTSPH and GRXSTSPD are physically the same register and share the
+ *	 same offset in the O2P USB core. The offset difference shown in this
+ *	 document is for software clarity and is actually ignored by the
+ *	 hardware.
+ *
+ * The union gives two views of the same 32 bits: u32 is the raw register word,
+ * s splits it into the named fields documented below.
+ */
+union cvmx_usbcx_grxstsph {
+	uint32_t u32;
+	/**
+	 * struct cvmx_usbcx_grxstsph_s
+	 * @pktsts: Packet Status (PktSts)
+	 *	Indicates the status of the received packet
+	 *	* 4'b0010: IN data packet received
+	 *	* 4'b0011: IN transfer completed (triggers an interrupt)
+	 *	* 4'b0101: Data toggle error (triggers an interrupt)
+	 *	* 4'b0111: Channel halted (triggers an interrupt)
+	 *	* Others: Reserved
+	 * @dpid: Data PID (DPID)
+	 *	* 2'b00: DATA0
+	 *	* 2'b10: DATA1
+	 *	* 2'b01: DATA2
+	 *	* 2'b11: MDATA
+	 * @bcnt: Byte Count (BCnt)
+	 *	Indicates the byte count of the received IN data packet
+	 * @chnum: Channel Number (ChNum)
+	 *	Indicates the channel number to which the current received
+	 *	packet belongs.
+	 *
+	 * Going by the reserved_21_31 name, members are declared from bit 31
+	 * downward: reserved [31:21], pktsts [20:17], dpid [16:15],
+	 * bcnt [14:4], chnum [3:0].
+	 * NOTE(review): this matches memory only where the compiler allocates
+	 * bitfields most-significant-bit first - confirm for the target ABI.
+	 */
+	struct cvmx_usbcx_grxstsph_s {
+		uint32_t reserved_21_31	: 11;
+		uint32_t pktsts		: 4;
+		uint32_t dpid		: 2;
+		uint32_t bcnt		: 11;
+		uint32_t chnum		: 4;
+	} s;
+};
+typedef union cvmx_usbcx_grxstsph cvmx_usbcx_grxstsph_t;
+
+/**
+ * cvmx_usbc#_gusbcfg
+ *
+ * Core USB Configuration Register (GUSBCFG)
+ *
+ * This register can be used to configure the core after power-on or a change
+ * to Host mode or Device mode. It contains USB and USB-PHY related
+ * configuration parameters. The application must program this register before
+ * starting any transactions on either the AHB or the USB. Do not make changes
+ * to this register after the initial programming.
+ *
+ * The union gives two views of the same 32 bits: u32 is the raw register word,
+ * s splits it into the named fields documented below.
+ */
+union cvmx_usbcx_gusbcfg {
+	uint32_t u32;
+	/**
+	 * struct cvmx_usbcx_gusbcfg_s
+	 * @otgi2csel: UTMIFS or I2C Interface Select (OtgI2CSel)
+	 *	This bit is always 0x0.
+	 * @phylpwrclksel: PHY Low-Power Clock Select (PhyLPwrClkSel)
+	 *	Software should set this bit to 0x0.
+	 *	Selects either 480-MHz or 48-MHz (low-power) PHY mode. In
+	 *	FS and LS modes, the PHY can usually operate on a 48-MHz
+	 *	clock to save power.
+	 *	* 1'b0: 480-MHz Internal PLL clock
+	 *	* 1'b1: 48-MHz External Clock
+	 *	In 480 MHz mode, the UTMI interface operates at either 60 or
+	 *	30-MHz, depending upon whether 8- or 16-bit data width is
+	 *	selected. In 48-MHz mode, the UTMI interface operates at 48
+	 *	MHz in FS mode and at either 48 or 6 MHz in LS mode
+	 *	(depending on the PHY vendor).
+	 *	This bit drives the utmi_fsls_low_power core output signal, and
+	 *	is valid only for UTMI+ PHYs.
+	 * @usbtrdtim: USB Turnaround Time (USBTrdTim)
+	 *	Sets the turnaround time in PHY clocks.
+	 *	Specifies the response time for a MAC request to the Packet
+	 *	FIFO Controller (PFC) to fetch data from the DFIFO (SPRAM).
+	 *	This must be programmed to 0x5.
+	 * @hnpcap: HNP-Capable (HNPCap)
+	 *	This bit is always 0x0.
+	 * @srpcap: SRP-Capable (SRPCap)
+	 *	This bit is always 0x0.
+	 * @ddrsel: ULPI DDR Select (DDRSel)
+	 *	Software should set this bit to 0x0.
+	 * @physel: USB 2.0 High-Speed PHY or USB 1.1 Full-Speed Serial
+	 *	Software should set this bit to 0x0.
+	 * @fsintf: Full-Speed Serial Interface Select (FSIntf)
+	 *	Software should set this bit to 0x0.
+	 * @ulpi_utmi_sel: ULPI or UTMI+ Select (ULPI_UTMI_Sel)
+	 *	This bit is always 0x0.
+	 * @phyif: PHY Interface (PHYIf)
+	 *	This bit is always 0x1.
+	 * @toutcal: HS/FS Timeout Calibration (TOutCal)
+	 *	The number of PHY clocks that the application programs in this
+	 *	field is added to the high-speed/full-speed interpacket timeout
+	 *	duration in the core to account for any additional delays
+	 *	introduced by the PHY. This may be required, since the delay
+	 *	introduced by the PHY in generating the linestate condition may
+	 *	vary from one PHY to another.
+	 *	The USB standard timeout value for high-speed operation is
+	 *	736 to 816 (inclusive) bit times. The USB standard timeout
+	 *	value for full-speed operation is 16 to 18 (inclusive) bit
+	 *	times. The application must program this field based on the
+	 *	speed of enumeration. The number of bit times added per PHY
+	 *	clock are:
+	 *	High-speed operation:
+	 *	* One 30-MHz PHY clock = 16 bit times
+	 *	* One 60-MHz PHY clock = 8 bit times
+	 *	Full-speed operation:
+	 *	* One 30-MHz PHY clock = 0.4 bit times
+	 *	* One 60-MHz PHY clock = 0.2 bit times
+	 *	* One 48-MHz PHY clock = 0.25 bit times
+	 *
+	 * Going by the reserved_17_31 and reserved_14_14 names, members are
+	 * declared from bit 31 downward: reserved [31:17], otgi2csel 16,
+	 * phylpwrclksel 15, reserved 14, usbtrdtim [13:10], hnpcap 9,
+	 * srpcap 8, ddrsel 7, physel 6, fsintf 5, ulpi_utmi_sel 4, phyif 3,
+	 * toutcal [2:0].
+	 * NOTE(review): this matches memory only where the compiler allocates
+	 * bitfields most-significant-bit first - confirm for the target ABI.
+	 */
+	struct cvmx_usbcx_gusbcfg_s {
+		uint32_t reserved_17_31	: 15;
+		uint32_t otgi2csel	: 1;
+		uint32_t phylpwrclksel	: 1;
+		uint32_t reserved_14_14	: 1;
+		uint32_t usbtrdtim	: 4;
+		uint32_t hnpcap		: 1;
+		uint32_t srpcap		: 1;
+		uint32_t ddrsel		: 1;
+		uint32_t physel		: 1;
+		uint32_t fsintf		: 1;
+		uint32_t ulpi_utmi_sel	: 1;
+		uint32_t phyif		: 1;
+		uint32_t toutcal	: 3;
+	} s;
+};
+typedef union cvmx_usbcx_gusbcfg cvmx_usbcx_gusbcfg_t;
+
+/**
+ * cvmx_usbc#_haint
+ *
+ * Host All Channels Interrupt Register (HAINT)
+ *
+ * When a significant event occurs on a channel, the Host All Channels Interrupt
+ * register interrupts the application using the Host Channels Interrupt bit of
+ * the Core Interrupt register (GINTSTS.HChInt). This is shown in Interrupt.
+ * There is one interrupt bit per channel, up to a maximum of 16 bits. Bits in
+ * this register are set and cleared when the application sets and clears bits
+ * in the corresponding Host Channel-n Interrupt register.
+ *
+ * The union gives two views of the same 32 bits: u32 is the raw register word,
+ * s splits it into the named fields documented below.
+ */
+union cvmx_usbcx_haint {
+	uint32_t u32;
+	/**
+	 * struct cvmx_usbcx_haint_s
+	 * @haint: Channel Interrupts (HAINT)
+	 *	One bit per channel: Bit 0 for Channel 0, bit 15 for Channel 15
+	 *
+	 * The reserved_16_31 name marks the upper 16 bits as unused, which
+	 * leaves haint in bits [15:0].
+	 * NOTE(review): this matches memory only where the compiler allocates
+	 * bitfields most-significant-bit first - confirm for the target ABI.
+	 */
+	struct cvmx_usbcx_haint_s {
+		uint32_t reserved_16_31	: 16;
+		uint32_t haint		: 16;
+	} s;
+};
+typedef union cvmx_usbcx_haint cvmx_usbcx_haint_t;
+
+/**
+ * cvmx_usbc#_haintmsk
+ *
+ * Host All Channels Interrupt Mask Register (HAINTMSK)
+ *
+ * The Host All Channel Interrupt Mask register works with the Host All Channel
+ * Interrupt register to interrupt the application when an event occurs on a
+ * channel. There is one interrupt mask bit per channel, up to a maximum of 16
+ * bits.
+ * * Mask interrupt: 1'b0
+ * * Unmask interrupt: 1'b1
+ *
+ * The union gives two views of the same 32 bits: u32 is the raw register word,
+ * s splits it into the named fields documented below.
+ */
+union cvmx_usbcx_haintmsk {
+	uint32_t u32;
+	/**
+	 * struct cvmx_usbcx_haintmsk_s
+	 * @haintmsk: Channel Interrupt Mask (HAINTMsk)
+	 *	One bit per channel: Bit 0 for channel 0, bit 15 for channel 15
+	 *
+	 * The reserved_16_31 name marks the upper 16 bits as unused, which
+	 * leaves haintmsk in bits [15:0].
+	 * NOTE(review): this matches memory only where the compiler allocates
+	 * bitfields most-significant-bit first - confirm for the target ABI.
+	 */
+	struct cvmx_usbcx_haintmsk_s {
+		uint32_t reserved_16_31	: 16;
+		uint32_t haintmsk	: 16;
+	} s;
+};
+typedef union cvmx_usbcx_haintmsk cvmx_usbcx_haintmsk_t;
+
+/**
+ * cvmx_usbc#_hcchar#
+ *
+ * Host Channel-n Characteristics Register (HCCHAR)
+ *
+ * The union gives two views of the same 32 bits: u32 is the raw register word,
+ * s splits it into the named fields documented below.
+ */
+union cvmx_usbcx_hccharx {
+	uint32_t u32;
+	/**
+	 * struct cvmx_usbcx_hccharx_s
+	 * @chena: Channel Enable (ChEna)
+	 *	This field is set by the application and cleared by the OTG
+	 *	host.
+	 *	* 1'b0: Channel disabled
+	 *	* 1'b1: Channel enabled
+	 * @chdis: Channel Disable (ChDis)
+	 *	The application sets this bit to stop transmitting/receiving
+	 *	data on a channel, even before the transfer for that channel is
+	 *	complete. The application must wait for the Channel Disabled
+	 *	interrupt before treating the channel as disabled.
+	 * @oddfrm: Odd Frame (OddFrm)
+	 *	This field is set (reset) by the application to indicate that
+	 *	the OTG host must perform a transfer in an odd (micro)frame.
+	 *	This field is applicable for only periodic (isochronous and
+	 *	interrupt) transactions.
+	 *	* 1'b0: Even (micro)frame
+	 *	* 1'b1: Odd (micro)frame
+	 * @devaddr: Device Address (DevAddr)
+	 *	This field selects the specific device serving as the data
+	 *	source or sink.
+	 * @ec: Multi Count (MC) / Error Count (EC)
+	 *	When the Split Enable bit of the Host Channel-n Split Control
+	 *	register (HCSPLTn.SpltEna) is reset (1'b0), this field indicates
+	 *	to the host the number of transactions that should be executed
+	 *	per microframe for this endpoint.
+	 *	* 2'b00: Reserved. This field yields undefined results.
+	 *	* 2'b01: 1 transaction
+	 *	* 2'b10: 2 transactions to be issued for this endpoint per
+	 *	microframe
+	 *	* 2'b11: 3 transactions to be issued for this endpoint per
+	 *	microframe
+	 *	When HCSPLTn.SpltEna is set (1'b1), this field indicates the
+	 *	number of immediate retries to be performed for periodic split
+	 *	transactions on transaction errors. This field must be set to at
+	 *	least 2'b01.
+	 * @eptype: Endpoint Type (EPType)
+	 *	Indicates the transfer type selected.
+	 *	* 2'b00: Control
+	 *	* 2'b01: Isochronous
+	 *	* 2'b10: Bulk
+	 *	* 2'b11: Interrupt
+	 * @lspddev: Low-Speed Device (LSpdDev)
+	 *	This field is set by the application to indicate that this
+	 *	channel is communicating to a low-speed device.
+	 * @epdir: Endpoint Direction (EPDir)
+	 *	Indicates whether the transaction is IN or OUT.
+	 *	* 1'b0: OUT
+	 *	* 1'b1: IN
+	 * @epnum: Endpoint Number (EPNum)
+	 *	Indicates the endpoint number on the device serving as the
+	 *	data source or sink.
+	 * @mps: Maximum Packet Size (MPS)
+	 *	Indicates the maximum packet size of the associated endpoint.
+	 *
+	 * Going by the reserved_16_16 name, members are declared from bit 31
+	 * downward: chena 31, chdis 30, oddfrm 29, devaddr [28:22],
+	 * ec [21:20], eptype [19:18], lspddev 17, reserved 16, epdir 15,
+	 * epnum [14:11], mps [10:0].
+	 * NOTE(review): this matches memory only where the compiler allocates
+	 * bitfields most-significant-bit first - confirm for the target ABI.
+	 */
+	struct cvmx_usbcx_hccharx_s {
+		uint32_t chena		: 1;
+		uint32_t chdis		: 1;
+		uint32_t oddfrm		: 1;
+		uint32_t devaddr	: 7;
+		uint32_t ec		: 2;
+		uint32_t eptype		: 2;
+		uint32_t lspddev	: 1;
+		uint32_t reserved_16_16	: 1;
+		uint32_t epdir		: 1;
+		uint32_t epnum		: 4;
+		uint32_t mps		: 11;
+	} s;
+};
+typedef union cvmx_usbcx_hccharx cvmx_usbcx_hccharx_t;
+
+/**
+ * cvmx_usbc#_hcfg
+ *
+ * Host Configuration Register (HCFG)
+ *
+ * This register configures the core after power-on. Do not make changes to this
+ * register after initializing the host.
+ *
+ * The union gives two views of the same 32 bits: u32 is the raw register word,
+ * s splits it into the named fields documented below.
+ */
+union cvmx_usbcx_hcfg {
+	uint32_t u32;
+	/**
+	 * struct cvmx_usbcx_hcfg_s
+	 * @fslssupp: FS- and LS-Only Support (FSLSSupp)
+	 *	The application uses this bit to control the core's enumeration
+	 *	speed. Using this bit, the application can make the core
+	 *	enumerate as a FS host, even if the connected device supports
+	 *	HS traffic. Do not make changes to this field after initial
+	 *	programming.
+	 *	* 1'b0: HS/FS/LS, based on the maximum speed supported by
+	 *	the connected device
+	 *	* 1'b1: FS/LS-only, even if the connected device can support HS
+	 * @fslspclksel: FS/LS PHY Clock Select (FSLSPclkSel)
+	 *	When the core is in FS Host mode
+	 *	* 2'b00: PHY clock is running at 30/60 MHz
+	 *	* 2'b01: PHY clock is running at 48 MHz
+	 *	* Others: Reserved
+	 *	When the core is in LS Host mode
+	 *	* 2'b00: PHY clock is running at 30/60 MHz. When the
+	 *	UTMI+/ULPI PHY Low Power mode is not selected, use
+	 *	30/60 MHz.
+	 *	* 2'b01: PHY clock is running at 48 MHz. When the UTMI+
+	 *	PHY Low Power mode is selected, use 48MHz if the PHY
+	 *	supplies a 48 MHz clock during LS mode.
+	 *	* 2'b10: PHY clock is running at 6 MHz. In USB 1.1 FS mode,
+	 *	use 6 MHz when the UTMI+ PHY Low Power mode is
+	 *	selected and the PHY supplies a 6 MHz clock during LS
+	 *	mode. If you select a 6 MHz clock during LS mode, you must
+	 *	do a soft reset.
+	 *	* 2'b11: Reserved
+	 *
+	 * Going by the reserved_3_31 name, members are declared from bit 31
+	 * downward: reserved [31:3], fslssupp 2, fslspclksel [1:0].
+	 * NOTE(review): this matches memory only where the compiler allocates
+	 * bitfields most-significant-bit first - confirm for the target ABI.
+	 */
+	struct cvmx_usbcx_hcfg_s {
+		uint32_t reserved_3_31	: 29;
+		uint32_t fslssupp	: 1;
+		uint32_t fslspclksel	: 2;
+	} s;
+};
+typedef union cvmx_usbcx_hcfg cvmx_usbcx_hcfg_t;
+
+/**
+ * cvmx_usbc#_hcint#
+ *
+ * Host Channel-n Interrupt Register (HCINT)
+ *
+ * This register indicates the status of a channel with respect to USB- and
+ * AHB-related events. The application must read this register when the Host
+ * Channels Interrupt bit of the Core Interrupt register (GINTSTS.HChInt) is
+ * set. Before the application can read this register, it must first read
+ * the Host All Channels Interrupt (HAINT) register to get the exact channel
+ * number for the Host Channel-n Interrupt register. The application must clear
+ * the appropriate bit in this register to clear the corresponding bits in the
+ * HAINT and GINTSTS registers.
+ *
+ * The union gives two views of the same 32 bits: u32 is the raw register word,
+ * s splits it into the named fields documented below.
+ */
+union cvmx_usbcx_hcintx {
+	uint32_t u32;
+	/**
+	 * struct cvmx_usbcx_hcintx_s
+	 * @datatglerr: Data Toggle Error (DataTglErr)
+	 * @frmovrun: Frame Overrun (FrmOvrun)
+	 * @bblerr: Babble Error (BblErr)
+	 * @xacterr: Transaction Error (XactErr)
+	 * @nyet: NYET Response Received Interrupt (NYET)
+	 * @ack: ACK Response Received Interrupt (ACK)
+	 * @nak: NAK Response Received Interrupt (NAK)
+	 * @stall: STALL Response Received Interrupt (STALL)
+	 * @ahberr: This bit is always 0x0.
+	 * @chhltd: Channel Halted (ChHltd)
+	 *	Indicates the transfer completed abnormally either because of
+	 *	any USB transaction error or in response to disable request by
+	 *	the application.
+	 * @xfercompl: Transfer Completed (XferCompl)
+	 *	Transfer completed normally without any errors.
+	 *
+	 * Going by the reserved_11_31 name, members are declared from bit 31
+	 * downward: reserved [31:11], datatglerr 10, frmovrun 9, bblerr 8,
+	 * xacterr 7, nyet 6, ack 5, nak 4, stall 3, ahberr 2, chhltd 1,
+	 * xfercompl 0.
+	 * NOTE(review): this matches memory only where the compiler allocates
+	 * bitfields most-significant-bit first - confirm for the target ABI.
+	 */
+	struct cvmx_usbcx_hcintx_s {
+		uint32_t reserved_11_31	: 21;
+		uint32_t datatglerr	: 1;
+		uint32_t frmovrun	: 1;
+		uint32_t bblerr		: 1;
+		uint32_t xacterr	: 1;
+		uint32_t nyet		: 1;
+		uint32_t ack		: 1;
+		uint32_t nak		: 1;
+		uint32_t stall		: 1;
+		uint32_t ahberr		: 1;
+		uint32_t chhltd		: 1;
+		uint32_t xfercompl	: 1;
+	} s;
+};
+typedef union cvmx_usbcx_hcintx cvmx_usbcx_hcintx_t;
+
+/**
+ * cvmx_usbc#_hcintmsk#
+ *
+ * Host Channel-n Interrupt Mask Register (HCINTMSKn)
+ *
+ * This register holds one mask bit for each channel status bit of the
+ * Host Channel-n Interrupt register (HCINTn).
+ * * Mask interrupt: 1'b0
+ * * Unmask interrupt: 1'b1
+ *
+ * The union gives two views of the same 32 bits: u32 is the raw register word,
+ * s splits it into the named fields documented below.
+ */
+union cvmx_usbcx_hcintmskx {
+	uint32_t u32;
+	/**
+	 * struct cvmx_usbcx_hcintmskx_s
+	 * @datatglerrmsk: Data Toggle Error Mask (DataTglErrMsk)
+	 * @frmovrunmsk: Frame Overrun Mask (FrmOvrunMsk)
+	 * @bblerrmsk: Babble Error Mask (BblErrMsk)
+	 * @xacterrmsk: Transaction Error Mask (XactErrMsk)
+	 * @nyetmsk: NYET Response Received Interrupt Mask (NyetMsk)
+	 * @ackmsk: ACK Response Received Interrupt Mask (AckMsk)
+	 * @nakmsk: NAK Response Received Interrupt Mask (NakMsk)
+	 * @stallmsk: STALL Response Received Interrupt Mask (StallMsk)
+	 * @ahberrmsk: AHB Error Mask (AHBErrMsk)
+	 * @chhltdmsk: Channel Halted Mask (ChHltdMsk)
+	 * @xfercomplmsk: Transfer Completed Mask (XferComplMsk)
+	 *
+	 * Going by the reserved_11_31 name, members are declared from bit 31
+	 * downward: reserved [31:11], datatglerrmsk 10, frmovrunmsk 9,
+	 * bblerrmsk 8, xacterrmsk 7, nyetmsk 6, ackmsk 5, nakmsk 4,
+	 * stallmsk 3, ahberrmsk 2, chhltdmsk 1, xfercomplmsk 0.
+	 * NOTE(review): this matches memory only where the compiler allocates
+	 * bitfields most-significant-bit first - confirm for the target ABI.
+	 */
+	struct cvmx_usbcx_hcintmskx_s {
+		uint32_t reserved_11_31	: 21;
+		uint32_t datatglerrmsk	: 1;
+		uint32_t frmovrunmsk	: 1;
+		uint32_t bblerrmsk	: 1;
+		uint32_t xacterrmsk	: 1;
+		uint32_t nyetmsk	: 1;
+		uint32_t ackmsk		: 1;
+		uint32_t nakmsk		: 1;
+		uint32_t stallmsk	: 1;
+		uint32_t ahberrmsk	: 1;
+		uint32_t chhltdmsk	: 1;
+		uint32_t xfercomplmsk	: 1;
+	} s;
+};
+typedef union cvmx_usbcx_hcintmskx cvmx_usbcx_hcintmskx_t;
+
+/**
+ * cvmx_usbc#_hcsplt#
+ *
+ * Host Channel-n Split Control Register (HCSPLT)
+ *
+ */
+union cvmx_usbcx_hcspltx {
+	uint32_t u32;
+	/**
+	 * struct cvmx_usbcx_hcspltx_s
+	 * @spltena: Split Enable (SpltEna)
+	 *	The application sets this field to indicate that this channel is
+	 *	enabled to perform split transactions.
+	 * @compsplt: Do Complete Split (CompSplt)
+	 *	The application sets this field to request the OTG host to
+	 *	perform a complete split transaction.
+	 * @xactpos: Transaction Position (XactPos)
+	 *	This field is used to determine whether to send all, first,
+	 *	middle, or last payloads with each OUT transaction.
+	 *	* 2'b11: All. This is the entire data payload of this
+	 *	transaction (which is less than or equal to 188 bytes).
+	 *	* 2'b10: Begin. This is the first data payload of this
+	 *	transaction (which is larger than 188 bytes).
+	 *	* 2'b00: Mid. This is the middle payload of this transaction
+	 *	(which is larger than 188 bytes).
+	 *	* 2'b01: End. This is the last payload of this transaction
+	 *	(which is larger than 188 bytes).
+	 * @hubaddr: Hub Address (HubAddr)
+	 *	This field holds the device address of the transaction
+	 *	translator's hub.
+	 * @prtaddr: Port Address (PrtAddr)
+	 *	This field is the port number of the recipient transaction
+	 *	translator.
+	 */
+	struct cvmx_usbcx_hcspltx_s {
+		uint32_t spltena	: 1;
+		uint32_t reserved_17_30	: 14;
+		uint32_t compsplt	: 1;
+		uint32_t xactpos	: 2;
+		uint32_t hubaddr	: 7;
+		uint32_t prtaddr	: 7;
+	} s;
+};
+typedef union cvmx_usbcx_hcspltx cvmx_usbcx_hcspltx_t;
+
+/**
+ * cvmx_usbc#_hctsiz#
+ *
+ * Host Channel-n Transfer Size Register (HCTSIZ)
+ *
+ */
+union cvmx_usbcx_hctsizx {
+	uint32_t u32;
+	/**
+	 * struct cvmx_usbcx_hctsizx_s
+	 * @dopng: Do Ping (DoPng)
+	 *	Setting this field to 1 directs the host to do the PING protocol.
+	 * @pid: PID (Pid)
+	 *	The application programs this field with the type of PID to use
+	 *	for the initial transaction. The host will maintain this field
+	 *	for the rest of the transfer.
+	 *	* 2'b00: DATA0
+	 *	* 2'b01: DATA2
+	 *	* 2'b10: DATA1
+	 *	* 2'b11: MDATA (non-control)/SETUP (control)
+	 * @pktcnt: Packet Count (PktCnt)
+	 *	This field is programmed by the application with the expected
+	 *	number of packets to be transmitted (OUT) or received (IN).
+	 *	The host decrements this count on every successful
+	 *	transmission or reception of an OUT/IN packet. Once this count
+	 *	reaches zero, the application is interrupted to indicate normal
+	 *	completion.
+	 * @xfersize: Transfer Size (XferSize)
+	 *	For an OUT, this field is the number of data bytes the host will
+	 *	send during the transfer.
+	 *	For an IN, this field is the buffer size that the application
+	 *	has reserved for the transfer. The application is expected to
+	 *	program this field as an integer multiple of the maximum packet
+	 *	size for IN transactions (periodic and non-periodic).
+	 */
+	struct cvmx_usbcx_hctsizx_s {
+		uint32_t dopng		: 1;
+		uint32_t pid		: 2;
+		uint32_t pktcnt		: 10;
+		uint32_t xfersize	: 19;
+	} s;
+};
+typedef union cvmx_usbcx_hctsizx cvmx_usbcx_hctsizx_t;
+
+/**
+ * cvmx_usbc#_hfir
+ *
+ * Host Frame Interval Register (HFIR)
+ *
+ * This register stores the frame interval information for the current speed to
+ * which the O2P USB core has enumerated.
+ */
+union cvmx_usbcx_hfir {
+	uint32_t u32;
+	/**
+	 * struct cvmx_usbcx_hfir_s
+	 * @frint: Frame Interval (FrInt)
+	 *	The value that the application programs to this field specifies
+	 *	the interval between two consecutive SOFs (FS) or micro-
+	 *	SOFs (HS) or Keep-Alive tokens (HS). This field contains the
+	 *	number of PHY clocks that constitute the required frame
+	 *	interval. The default value set in this field is for FS operation
+	 *	when the PHY clock frequency is 60 MHz. The application can
+	 *	write a value to this register only after the Port Enable bit of
+	 *	the Host Port Control and Status register (HPRT.PrtEnaPort)
+	 *	has been set. If no value is programmed, the core calculates
+	 *	the value based on the PHY clock specified in the FS/LS PHY
+	 *	Clock Select field of the Host Configuration register
+	 *	(HCFG.FSLSPclkSel). Do not change the value of this field
+	 *	after the initial configuration.
+	 *	* 125 us (PHY clock frequency for HS)
+	 *	* 1 ms (PHY clock frequency for FS/LS)
+	 */
+	struct cvmx_usbcx_hfir_s {
+		uint32_t reserved_16_31	: 16;
+		uint32_t frint		: 16;
+	} s;
+};
+typedef union cvmx_usbcx_hfir cvmx_usbcx_hfir_t;
+
+/**
+ * cvmx_usbc#_hfnum
+ *
+ * Host Frame Number/Frame Time Remaining Register (HFNUM)
+ *
+ * This register indicates the current frame number.
+ * It also indicates the time remaining (in terms of the number of PHY clocks)
+ * in the current (micro)frame.
+ */
+union cvmx_usbcx_hfnum {
+	uint32_t u32;
+	/**
+	 * struct cvmx_usbcx_hfnum_s
+	 * @frrem: Frame Time Remaining (FrRem)
+	 *	Indicates the amount of time remaining in the current
+	 *	microframe (HS) or frame (FS/LS), in terms of PHY clocks.
+	 *	This field decrements on each PHY clock. When it reaches
+	 *	zero, this field is reloaded with the value in the Frame
+	 *	Interval register and a new SOF is transmitted on the USB.
+	 * @frnum: Frame Number (FrNum)
+	 *	This field increments when a new SOF is transmitted on the
+	 *	USB, and is reset to 0 when it reaches 16'h3FFF.
+	 *
+	 * NOTE(review): no reserved_N_M member pins the bit positions here;
+	 * presumably frrem is bits [31:16] and frnum is bits [15:0] - confirm.
+	 */
+	struct cvmx_usbcx_hfnum_s {
+		uint32_t frrem	: 16;
+		uint32_t frnum	: 16;
+	} s;
+};
+typedef union cvmx_usbcx_hfnum cvmx_usbcx_hfnum_t;
+
+/**
+ * cvmx_usbc#_hprt
+ *
+ * Host Port Control and Status Register (HPRT)
+ *
+ * This register is available in both Host and Device modes.
+ * Currently, the OTG Host supports only one port.
+ * A single register holds USB port-related information such as USB reset,
+ * enable, suspend, resume, connect status, and test mode for each port. The
+ * R_SS_WC bits in this register can trigger an interrupt to the application
+ * through the Host Port Interrupt bit of the Core Interrupt register
+ * (GINTSTS.PrtInt). On a Port Interrupt, the application must read this
+ * register and clear the bit that caused the interrupt. For the R_SS_WC bits,
+ * the application must write a 1 to the bit to clear the interrupt.
+ */
+union cvmx_usbcx_hprt {
+	uint32_t u32;
+	/**
+	 * struct cvmx_usbcx_hprt_s
+	 * @prtspd: Port Speed (PrtSpd)
+	 *	Indicates the speed of the device attached to this port.
+	 *	* 2'b00: High speed
+	 *	* 2'b01: Full speed
+	 *	* 2'b10: Low speed
+	 *	* 2'b11: Reserved
+	 * @prttstctl: Port Test Control (PrtTstCtl)
+	 *	The application writes a nonzero value to this field to put
+	 *	the port into a Test mode, and the corresponding pattern is
+	 *	signaled on the port.
+	 *	* 4'b0000: Test mode disabled
+	 *	* 4'b0001: Test_J mode
+	 *	* 4'b0010: Test_K mode
+	 *	* 4'b0011: Test_SE0_NAK mode
+	 *	* 4'b0100: Test_Packet mode
+	 *	* 4'b0101: Test_Force_Enable
+	 *	* Others: Reserved
+	 *	PrtSpd must be zero (i.e. the interface must be in high-speed
+	 *	mode) to use the PrtTstCtl test modes.
+	 * @prtpwr: Port Power (PrtPwr)
+	 *	The application uses this field to control power to this port,
+	 *	and the core clears this bit on an overcurrent condition.
+	 *	* 1'b0: Power off
+	 *	* 1'b1: Power on
+	 * @prtlnsts: Port Line Status (PrtLnSts)
+	 *	Indicates the current logic level of the USB data lines
+	 *	* Bit [10]: Logic level of D-
+	 *	* Bit [11]: Logic level of D+
+	 * @prtrst: Port Reset (PrtRst)
+	 *	When the application sets this bit, a reset sequence is
+	 *	started on this port. The application must time the reset
+	 *	period and clear this bit after the reset sequence is
+	 *	complete.
+	 *	* 1'b0: Port not in reset
+	 *	* 1'b1: Port in reset
+	 *	The application must leave this bit set for at least a
+	 *	minimum duration mentioned below to start a reset on the
+	 *	port. The application can leave it set for another 10 ms in
+	 *	addition to the required minimum duration, before clearing
+	 *	the bit, even though there is no maximum limit set by the
+	 *	USB standard.
+	 *	* High speed: 50 ms
+	 *	* Full speed/Low speed: 10 ms
+	 * @prtsusp: Port Suspend (PrtSusp)
+	 *	The application sets this bit to put this port in Suspend
+	 *	mode. The core only stops sending SOFs when this is set.
+	 *	To stop the PHY clock, the application must set the Port
+	 *	Clock Stop bit, which will assert the suspend input pin of
+	 *	the PHY.
+	 *	The read value of this bit reflects the current suspend
+	 *	status of the port. This bit is cleared by the core after a
+	 *	remote wakeup signal is detected or the application sets
+	 *	the Port Reset bit or Port Resume bit in this register or the
+	 *	Resume/Remote Wakeup Detected Interrupt bit or
+	 *	Disconnect Detected Interrupt bit in the Core Interrupt
+	 *	register (GINTSTS.WkUpInt or GINTSTS.DisconnInt,
+	 *	respectively).
+	 *	* 1'b0: Port not in Suspend mode
+	 *	* 1'b1: Port in Suspend mode
+	 * @prtres: Port Resume (PrtRes)
+	 *	The application sets this bit to drive resume signaling on
+	 *	the port. The core continues to drive the resume signal
+	 *	until the application clears this bit.
+	 *	If the core detects a USB remote wakeup sequence, as
+	 *	indicated by the Port Resume/Remote Wakeup Detected
+	 *	Interrupt bit of the Core Interrupt register
+	 *	(GINTSTS.WkUpInt), the core starts driving resume
+	 *	signaling without application intervention and clears this bit
+	 *	when it detects a disconnect condition. The read value of
+	 *	this bit indicates whether the core is currently driving
+	 *	resume signaling.
+	 *	* 1'b0: No resume driven
+	 *	* 1'b1: Resume driven
+	 * @prtovrcurrchng: Port Overcurrent Change (PrtOvrCurrChng)
+	 *	The core sets this bit when the status of the Port
+	 *	Overcurrent Active bit (bit 4) in this register changes.
+	 * @prtovrcurract: Port Overcurrent Active (PrtOvrCurrAct)
+	 *	Indicates the overcurrent condition of the port.
+	 *	* 1'b0: No overcurrent condition
+	 *	* 1'b1: Overcurrent condition
+	 * @prtenchng: Port Enable/Disable Change (PrtEnChng)
+	 *	The core sets this bit when the status of the Port Enable bit
+	 *	[2] of this register changes.
+	 * @prtena: Port Enable (PrtEna)
+	 *	A port is enabled only by the core after a reset sequence,
+	 *	and is disabled by an overcurrent condition, a disconnect
+	 *	condition, or by the application clearing this bit. The
+	 *	application cannot set this bit by a register write. It can only
+	 *	clear it to disable the port. This bit does not trigger any
+	 *	interrupt to the application.
+	 *	* 1'b0: Port disabled
+	 *	* 1'b1: Port enabled
+	 * @prtconndet: Port Connect Detected (PrtConnDet)
+	 *	The core sets this bit when a device connection is detected
+	 *	to trigger an interrupt to the application using the Host Port
+	 *	Interrupt bit of the Core Interrupt register (GINTSTS.PrtInt).
+	 *	The application must write a 1 to this bit to clear the
+	 *	interrupt.
+	 * @prtconnsts: Port Connect Status (PrtConnSts)
+	 *	* 0: No device is attached to the port.
+	 *	* 1: A device is attached to the port.
+	 */
+	struct cvmx_usbcx_hprt_s {
+		uint32_t reserved_19_31	: 13;
+		uint32_t prtspd		: 2;
+		uint32_t prttstctl	: 4;
+		uint32_t prtpwr		: 1;
+		uint32_t prtlnsts	: 2;
+		uint32_t reserved_9_9	: 1;
+		uint32_t prtrst		: 1;
+		uint32_t prtsusp	: 1;
+		uint32_t prtres		: 1;
+		uint32_t prtovrcurrchng	: 1;
+		uint32_t prtovrcurract	: 1;
+		uint32_t prtenchng	: 1;
+		uint32_t prtena		: 1;
+		uint32_t prtconndet	: 1;
+		uint32_t prtconnsts	: 1;
+	} s;
+};
+typedef union cvmx_usbcx_hprt cvmx_usbcx_hprt_t;
+
+/**
+ * cvmx_usbc#_hptxfsiz
+ *
+ * Host Periodic Transmit FIFO Size Register (HPTXFSIZ)
+ *
+ * This register holds the size and the memory start address of the Periodic
+ * TxFIFO, as shown in Figures 310 and 311 (figures are not part of this header).
+ */
+union cvmx_usbcx_hptxfsiz {
+	uint32_t u32;
+	/**
+	 * struct cvmx_usbcx_hptxfsiz_s
+	 * @ptxfsize: Host Periodic TxFIFO Depth (PTxFSize)
+	 *	This value is in terms of 32-bit words.
+	 *	* Minimum value is 16
+	 *	* Maximum value is 32768
+	 * @ptxfstaddr: Host Periodic TxFIFO Start Address (PTxFStAddr)
+	 */
+	struct cvmx_usbcx_hptxfsiz_s {
+		uint32_t ptxfsize	: 16;
+		uint32_t ptxfstaddr	: 16;
+	} s;
+};
+typedef union cvmx_usbcx_hptxfsiz cvmx_usbcx_hptxfsiz_t;
+
+/**
+ * cvmx_usbc#_hptxsts
+ *
+ * Host Periodic Transmit FIFO/Queue Status Register (HPTXSTS)
+ *
+ * This read-only register contains the free space information for the Periodic
+ * TxFIFO and the Periodic Transmit Request Queue.
+ */
+union cvmx_usbcx_hptxsts {
+	uint32_t u32;
+	/**
+	 * struct cvmx_usbcx_hptxsts_s
+	 * @ptxqtop: Top of the Periodic Transmit Request Queue (PTxQTop)
+	 *	This indicates the entry in the Periodic Tx Request Queue that
+	 *	is currently being processed by the MAC.
+	 *	This register is used for debugging.
+	 *	* Bit [31]: Odd/Even (micro)frame
+	 *	- 1'b0: send in even (micro)frame
+	 *	- 1'b1: send in odd (micro)frame
+	 *	* Bits [30:27]: Channel/endpoint number
+	 *	* Bits [26:25]: Type
+	 *	- 2'b00: IN/OUT
+	 *	- 2'b01: Zero-length packet
+	 *	- 2'b10: CSPLIT
+	 *	- 2'b11: Disable channel command
+	 *	* Bit [24]: Terminate (last entry for the selected
+	 *	channel/endpoint)
+	 * @ptxqspcavail: Periodic Transmit Request Queue Space Available
+	 *	(PTxQSpcAvail)
+	 *	Indicates the number of free locations available to be written
+	 *	in the Periodic Transmit Request Queue. This queue holds both
+	 *	IN and OUT requests.
+	 *	* 8'h0: Periodic Transmit Request Queue is full
+	 *	* 8'h1: 1 location available
+	 *	* 8'h2: 2 locations available
+	 *	* n: n locations available (0..8)
+	 *	* Others: Reserved
+	 * @ptxfspcavail: Periodic Transmit Data FIFO Space Available
+	 *		  (PTxFSpcAvail)
+	 *	Indicates the number of free locations available to be written
+	 *	to in the Periodic TxFIFO.
+	 *	Values are in terms of 32-bit words
+	 *	* 16'h0: Periodic TxFIFO is full
+	 *	* 16'h1: 1 word available
+	 *	* 16'h2: 2 words available
+	 *	* 16'hn: n words available (where n is 0..32768)
+	 *	* 16'h8000: 32768 words available
+	 *	* Others: Reserved
+	 */
+	struct cvmx_usbcx_hptxsts_s {
+		uint32_t ptxqtop	: 8;
+		uint32_t ptxqspcavail	: 8;
+		uint32_t ptxfspcavail	: 16;
+	} s;
+};
+typedef union cvmx_usbcx_hptxsts cvmx_usbcx_hptxsts_t;
+
+#endif

diff --git a/drivers/staging/octeon-usb/cvmx-usbnx-defs.h b/drivers/staging/octeon-usb/cvmx-usbnx-defs.h
new file mode 100644
index 0000000..96d7067
--- /dev/null
+++ b/drivers/staging/octeon-usb/cvmx-usbnx-defs.h

@@ -0,0 +1,887 @@
+/***********************license start***************
+ * Copyright (c) 2003-2010  Cavium Networks (support@cavium.com). All rights
+ * reserved.
+ *
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *   * Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above
+ *     copyright notice, this list of conditions and the following
+ *     disclaimer in the documentation and/or other materials provided
+ *     with the distribution.
+
+ *   * Neither the name of Cavium Networks nor the names of
+ *     its contributors may be used to endorse or promote products
+ *     derived from this software without specific prior written
+ *     permission.
+
+ * This Software, including technical data, may be subject to U.S. export
+ * control laws, including the U.S. Export Administration Act and its associated
+ * regulations, and may be subject to export or import  regulations in other
+ * countries.
+
+ * TO THE MAXIMUM EXTENT PERMITTED BY LAW, THE SOFTWARE IS PROVIDED "AS IS"
+ * AND WITH ALL FAULTS AND CAVIUM NETWORKS MAKES NO PROMISES, REPRESENTATIONS OR
+ * WARRANTIES, EITHER EXPRESS, IMPLIED, STATUTORY, OR OTHERWISE, WITH RESPECT TO
+ * THE SOFTWARE, INCLUDING ITS CONDITION, ITS CONFORMITY TO ANY REPRESENTATION
+ * OR DESCRIPTION, OR THE EXISTENCE OF ANY LATENT OR PATENT DEFECTS, AND CAVIUM
+ * SPECIFICALLY DISCLAIMS ALL IMPLIED (IF ANY) WARRANTIES OF TITLE,
+ * MERCHANTABILITY, NONINFRINGEMENT, FITNESS FOR A PARTICULAR PURPOSE, LACK OF
+ * VIRUSES, ACCURACY OR COMPLETENESS, QUIET ENJOYMENT, QUIET POSSESSION OR
+ * CORRESPONDENCE TO DESCRIPTION. THE ENTIRE  RISK ARISING OUT OF USE OR
+ * PERFORMANCE OF THE SOFTWARE LIES WITH YOU.
+ ***********************license end**************************************/
+
+
+/**
+ * cvmx-usbnx-defs.h
+ *
+ * Configuration and status register (CSR) type definitions for
+ * Octeon usbnx.
+ *
+ */
+#ifndef __CVMX_USBNX_TYPEDEFS_H__
+#define __CVMX_USBNX_TYPEDEFS_H__
+
+/*
+ * Per-block address offsets: only bit 0 of @bid is used, and it scales the
+ * stride that the matching CVMX_USBNXREGn() macro adds to the base address.
+ */
+#define CVMX_USBNXBID1(bid) (((bid) & 1) * 0x10000000ull)
+#define CVMX_USBNXBID2(bid) (((bid) & 1) * 0x100000000000ull)
+
+/*
+ * Build a register address from register offset @reg and block id @bid.
+ * @reg is parenthesized so that an expression argument (e.g. a conditional
+ * expression) is evaluated as a unit before being OR-ed into the base.
+ */
+#define CVMX_USBNXREG1(reg, bid) \
+	(CVMX_ADD_IO_SEG(0x0001180068000000ull | (reg)) + CVMX_USBNXBID1(bid))
+#define CVMX_USBNXREG2(reg, bid) \
+	(CVMX_ADD_IO_SEG(0x00016F0000000000ull | (reg)) + CVMX_USBNXBID2(bid))
+
+/* Register addresses for USBN block @bid. */
+#define CVMX_USBNX_CLK_CTL(bid)		CVMX_USBNXREG1(0x10, bid)
+#define CVMX_USBNX_DMA0_INB_CHN0(bid)	CVMX_USBNXREG2(0x818, bid)
+#define CVMX_USBNX_DMA0_OUTB_CHN0(bid)	CVMX_USBNXREG2(0x858, bid)
+#define CVMX_USBNX_USBP_CTL_STATUS(bid)	CVMX_USBNXREG1(0x18, bid)
+
+/**
+ * cvmx_usbn#_clk_ctl
+ *
+ * USBN_CLK_CTL = USBN's Clock Control
+ *
+ * This register is used to control the frequency of the hclk and the
+ * hreset and phy_rst signals.
+ */
+union cvmx_usbnx_clk_ctl {
+	uint64_t u64;
+	/**
+	 * struct cvmx_usbnx_clk_ctl_s
+	 * @divide2: The 'hclk' used by the USB subsystem is derived
+	 *	from the eclk.
+	 *	Also see the field DIVIDE. DIVIDE2<1> must currently
+	 *	be zero because it is not implemented, so the maximum
+	 *	ratio of eclk/hclk is currently 16.
+	 *	The actual divide number for hclk is:
+	 *	(DIVIDE2 + 1) * (DIVIDE + 1)
+	 * @hclk_rst: When this field is '0' the HCLK-DIVIDER used to
+	 *	generate the hclk in the USB Subsystem is held
+	 *	in reset. This bit must be set to '0' before
+	 *	changing the value of DIVIDE in this register.
+	 *	The reset to the HCLK_DIVIDER is also asserted
+	 *	when core reset is asserted.
+	 * @p_x_on: Force USB-PHY on during suspend.
+	 *	'1' USB-PHY XO block is powered-down during
+	 *	suspend.
+	 *	'0' USB-PHY XO block is powered-up during
+	 *	suspend.
+	 *	The value of this field must be set while POR is
+	 *	active.
+	 * @p_com_on: '0' Force USB-PHY XO Bias, Bandgap and PLL to
+	 *	remain powered in Suspend Mode.
+	 *	'1' The USB-PHY XO Bias, Bandgap and PLL are
+	 *	powered down in suspend mode.
+	 *	The value of this field must be set while POR is
+	 *	active.
+	 * @p_c_sel: Phy clock speed select.
+	 *	Selects the reference clock / crystal frequency.
+	 *	'11': Reserved
+	 *	'10': 48 MHz (reserved when a crystal is used)
+	 *	'01': 24 MHz (reserved when a crystal is used)
+	 *	'00': 12 MHz
+	 *	The value of this field must be set while POR is
+	 *	active.
+	 *	NOTE: if a crystal is used as a reference clock,
+	 *	this field must be set to 12 MHz.
+	 * @cdiv_byp: Used to enable the bypass input to the USB_CLK_DIV.
+	 * @sd_mode: Scaledown mode for the USBC. Control timing events
+	 *	in the USBC, for normal operation this must be '0'.
+	 * @s_bist: Starts bist on the hclk memories, during the '0'
+	 *	to '1' transition.
+	 * @por: Power On Reset for the PHY.
+	 *	Resets all the PHY registers and state machines.
+	 * @enable: When '1' allows the generation of the hclk. When
+	 *	'0' the hclk will not be generated. SEE DIVIDE
+	 *	field of this register.
+	 * @prst: When this field is '0' the reset associated with
+	 *	the phy_clk functionality in the USB Subsystem is
+	 *	held in reset. This bit should not be set to '1'
+	 *	until the time it takes 6 clocks (hclk or phy_clk,
+	 *	whichever is slower) has passed. Under normal
+	 *	operation once this bit is set to '1' it should not
+	 *	be set to '0'.
+	 * @hrst: When this field is '0' the reset associated with
+	 *	the hclk functionality in the USB Subsystem is
+	 *	held in reset. This bit should not be set to '1'
+	 *	until 12ms after phy_clk is stable. Under normal
+	 *	operation, once this bit is set to '1' it should
+	 *	not be set to '0'.
+	 * @divide: The frequency of 'hclk' used by the USB subsystem
+	 *	is the eclk frequency divided by the value of
+	 *	(DIVIDE2 + 1) * (DIVIDE + 1), also see the field
+	 *	DIVIDE2 of this register.
+	 *	The hclk frequency should be less than 125 MHz.
+	 *	After writing a value to this field the SW should
+	 *	read the field for the value written.
+	 *	The ENABLE field of this register should not be set
+	 *	until AFTER this field is set and then read.
+	 */
+	struct cvmx_usbnx_clk_ctl_s {
+		uint64_t reserved_20_63	: 44;
+		uint64_t divide2	: 2;
+		uint64_t hclk_rst	: 1;
+		uint64_t p_x_on		: 1;
+		uint64_t reserved_14_15	: 2;
+		uint64_t p_com_on	: 1;
+		uint64_t p_c_sel	: 2;
+		uint64_t cdiv_byp	: 1;
+		uint64_t sd_mode	: 2;
+		uint64_t s_bist		: 1;
+		uint64_t por		: 1;
+		uint64_t enable		: 1;
+		uint64_t prst		: 1;
+		uint64_t hrst		: 1;
+		uint64_t divide		: 3;
+	} s;
+	/**
+	 * struct cvmx_usbnx_clk_ctl_cn30xx
+	 * @hclk_rst: When this field is '0' the HCLK-DIVIDER used to
+	 *	generate the hclk in the USB Subsystem is held
+	 *	in reset. This bit must be set to '0' before
+	 *	changing the value of DIVIDE in this register.
+	 *	The reset to the HCLK_DIVIDER is also asserted
+	 *	when core reset is asserted.
+	 * @p_x_on: Force USB-PHY on during suspend.
+	 *	'1' USB-PHY XO block is powered-down during
+	 *	suspend.
+	 *	'0' USB-PHY XO block is powered-up during
+	 *	suspend.
+	 *	The value of this field must be set while POR is
+	 *	active.
+	 * @p_rclk: Phy reference clock enable.
+	 *	'1' The PHY PLL uses the XO block output as a
+	 *	reference.
+	 *	'0' Reserved.
+	 * @p_xenbn: Phy external clock enable.
+	 *	'1' The XO block uses the clock from a crystal.
+	 *	'0' The XO block uses an external clock supplied
+	 *	on the XO pin. USB_XI should be tied to
+	 *	ground for this usage.
+	 * @p_com_on: '0' Force USB-PHY XO Bias, Bandgap and PLL to
+	 *	remain powered in Suspend Mode.
+	 *	'1' The USB-PHY XO Bias, Bandgap and PLL are
+	 *	powered down in suspend mode.
+	 *	The value of this field must be set while POR is
+	 *	active.
+	 * @p_c_sel: Phy clock speed select.
+	 *	Selects the reference clock / crystal frequency.
+	 *	'11': Reserved
+	 *	'10': 48 MHz
+	 *	'01': 24 MHz
+	 *	'00': 12 MHz
+	 *	The value of this field must be set while POR is
+	 *	active.
+	 * @cdiv_byp: Used to enable the bypass input to the USB_CLK_DIV.
+	 * @sd_mode: Scaledown mode for the USBC. Control timing events
+	 *	in the USBC, for normal operation this must be '0'.
+	 * @s_bist: Starts bist on the hclk memories, during the '0'
+	 *	to '1' transition.
+	 * @por: Power On Reset for the PHY.
+	 *	Resets all the PHY registers and state machines.
+	 * @enable: When '1' allows the generation of the hclk. When
+	 *	'0' the hclk will not be generated.
+	 * @prst: When this field is '0' the reset associated with
+	 *	the phy_clk functionality in the USB Subsystem is
+	 *	held in reset. This bit should not be set to '1'
+	 *	until the time it takes 6 clocks (hclk or phy_clk,
+	 *	whichever is slower) has passed. Under normal
+	 *	operation once this bit is set to '1' it should not
+	 *	be set to '0'.
+	 * @hrst: When this field is '0' the reset associated with
+	 *	the hclk functionality in the USB Subsystem is
+	 *	held in reset. This bit should not be set to '1'
+	 *	until 12ms after phy_clk is stable. Under normal
+	 *	operation, once this bit is set to '1' it should
+	 *	not be set to '0'.
+	 * @divide: The 'hclk' used by the USB subsystem is derived
+	 *	from the eclk. The eclk will be divided by the
+	 *	value of this field +1 to determine the hclk
+	 *	frequency. (Also see HRST of this register).
+	 *	The hclk frequency must be less than 125 MHz.
+	 */
+	struct cvmx_usbnx_clk_ctl_cn30xx {
+		uint64_t reserved_18_63	: 46;
+		uint64_t hclk_rst	: 1;
+		uint64_t p_x_on		: 1;
+		uint64_t p_rclk		: 1;
+		uint64_t p_xenbn	: 1;
+		uint64_t p_com_on	: 1;
+		uint64_t p_c_sel	: 2;
+		uint64_t cdiv_byp	: 1;
+		uint64_t sd_mode	: 2;
+		uint64_t s_bist		: 1;
+		uint64_t por		: 1;
+		uint64_t enable		: 1;
+		uint64_t prst		: 1;
+		uint64_t hrst		: 1;
+		uint64_t divide		: 3;
+	} cn30xx;
+	struct cvmx_usbnx_clk_ctl_cn30xx cn31xx;
+	/**
+	 * struct cvmx_usbnx_clk_ctl_cn50xx
+	 * @divide2: The 'hclk' used by the USB subsystem is derived
+	 *	from the eclk.
+	 *	Also see the field DIVIDE. DIVIDE2<1> must currently
+	 *	be zero because it is not implemented, so the maximum
+	 *	ratio of eclk/hclk is currently 16.
+	 *	The actual divide number for hclk is:
+	 *	(DIVIDE2 + 1) * (DIVIDE + 1)
+	 * @hclk_rst: When this field is '0' the HCLK-DIVIDER used to
+	 *	generate the hclk in the USB Subsystem is held
+	 *	in reset. This bit must be set to '0' before
+	 *	changing the value of DIVIDE in this register.
+	 *	The reset to the HCLK_DIVIDER is also asserted
+	 *	when core reset is asserted.
+	 * @p_rtype: PHY reference clock type
+	 *	'0' The USB-PHY uses a 12MHz crystal as a clock
+	 *	source at the USB_XO and USB_XI pins
+	 *	'1' Reserved
+	 *	'2' The USB_PHY uses 12/24/48MHz 2.5V board clock
+	 *	at the USB_XO pin. USB_XI should be tied to
+	 *	ground in this case.
+	 *	'3' Reserved
+	 *	(bit 14 was P_XENBN on 3xxx)
+	 *	(bit 15 was P_RCLK on 3xxx)
+	 * @p_com_on: '0' Force USB-PHY XO Bias, Bandgap and PLL to
+	 *	remain powered in Suspend Mode.
+	 *	'1' The USB-PHY XO Bias, Bandgap and PLL are
+	 *	powered down in suspend mode.
+	 *	The value of this field must be set while POR is
+	 *	active.
+	 * @p_c_sel: Phy clock speed select.
+	 *	Selects the reference clock / crystal frequency.
+	 *	'11': Reserved
+	 *	'10': 48 MHz (reserved when a crystal is used)
+	 *	'01': 24 MHz (reserved when a crystal is used)
+	 *	'00': 12 MHz
+	 *	The value of this field must be set while POR is
+	 *	active.
+	 *	NOTE: if a crystal is used as a reference clock,
+	 *	this field must be set to 12 MHz.
+	 * @cdiv_byp: Used to enable the bypass input to the USB_CLK_DIV.
+	 * @sd_mode: Scaledown mode for the USBC. Control timing events
+	 *	in the USBC, for normal operation this must be '0'.
+	 * @s_bist: Starts bist on the hclk memories, during the '0'
+	 *	to '1' transition.
+	 * @por: Power On Reset for the PHY.
+	 *	Resets all the PHY registers and state machines.
+	 * @enable: When '1' allows the generation of the hclk. When
+	 *	'0' the hclk will not be generated. SEE DIVIDE
+	 *	field of this register.
+	 * @prst: When this field is '0' the reset associated with
+	 *	the phy_clk functionality in the USB Subsystem is
+	 *	held in reset. This bit should not be set to '1'
+	 *	until the time it takes 6 clocks (hclk or phy_clk,
+	 *	whichever is slower) has passed. Under normal
+	 *	operation once this bit is set to '1' it should not
+	 *	be set to '0'.
+	 * @hrst: When this field is '0' the reset associated with
+	 *	the hclk functionality in the USB Subsystem is
+	 *	held in reset. This bit should not be set to '1'
+	 *	until 12ms after phy_clk is stable. Under normal
+	 *	operation, once this bit is set to '1' it should
+	 *	not be set to '0'.
+	 * @divide: The frequency of 'hclk' used by the USB subsystem
+	 *	is the eclk frequency divided by the value of
+	 *	(DIVIDE2 + 1) * (DIVIDE + 1), also see the field
+	 *	DIVIDE2 of this register.
+	 *	The hclk frequency should be less than 125 MHz.
+	 *	After writing a value to this field the SW should
+	 *	read the field for the value written.
+	 *	The ENABLE field of this register should not be set
+	 *	until AFTER this field is set and then read.
+	 */
+	struct cvmx_usbnx_clk_ctl_cn50xx {
+		uint64_t reserved_20_63	: 44;
+		uint64_t divide2	: 2;
+		uint64_t hclk_rst	: 1;
+		uint64_t reserved_16_16 : 1;
+		uint64_t p_rtype	: 2;
+		uint64_t p_com_on	: 1;
+		uint64_t p_c_sel	: 2;
+		uint64_t cdiv_byp	: 1;
+		uint64_t sd_mode	: 2;
+		uint64_t s_bist		: 1;
+		uint64_t por		: 1;
+		uint64_t enable		: 1;
+		uint64_t prst		: 1;
+		uint64_t hrst		: 1;
+		uint64_t divide		: 3;
+	} cn50xx;
+	struct cvmx_usbnx_clk_ctl_cn50xx cn52xx;
+	struct cvmx_usbnx_clk_ctl_cn50xx cn56xx;
+};
+typedef union cvmx_usbnx_clk_ctl cvmx_usbnx_clk_ctl_t;
+
+/**
+ * cvmx_usbn#_usbp_ctl_status
+ *
+ * USBN_USBP_CTL_STATUS = USBP Control And Status Register
+ *
+ * Contains general control and status information for the USBN block.
+ */
+union cvmx_usbnx_usbp_ctl_status {
+	uint64_t u64;
+	/**
+	 * struct cvmx_usbnx_usbp_ctl_status_s
+	 * @txrisetune: HS Transmitter Rise/Fall Time Adjustment
+	 * @txvreftune: HS DC Voltage Level Adjustment
+	 * @txfslstune: FS/LS Source Impedance Adjustment
+	 * @txhsxvtune: Transmitter High-Speed Crossover Adjustment
+	 * @sqrxtune: Squelch Threshold Adjustment
+	 * @compdistune: Disconnect Threshold Adjustment
+	 * @otgtune: VBUS Valid Threshold Adjustment
+	 * @otgdisable: OTG Block Disable
+	 * @portreset: Per_Port Reset
+	 * @drvvbus: Drive VBUS
+	 * @lsbist: Low-Speed BIST Enable.
+	 * @fsbist: Full-Speed BIST Enable.
+	 * @hsbist: High-Speed BIST Enable.
+	 * @bist_done: PHY Bist Done.
+	 *	Asserted at the end of the PHY BIST sequence.
+	 * @bist_err: PHY Bist Error.
+	 *	Indicates an internal error was detected during
+	 *	the BIST sequence.
+	 * @tdata_out: PHY Test Data Out.
+	 *	Presents either internally generated signals or
+	 *	test register contents, based upon the value of
+	 *	test_data_out_sel.
+	 * @siddq: Drives the USBP (USB-PHY) SIDDQ input.
+	 *	Normally should be set to zero.
+	 *	When customers have no intent to use USB PHY
+	 *	interface, they should:
+	 *	- still provide 3.3V to USB_VDD33, and
+	 *	- tie USB_REXT to 3.3V supply, and
+	 *	- set USBN*_USBP_CTL_STATUS[SIDDQ]=1
+	 * @txpreemphasistune: HS Transmitter Pre-Emphasis Enable
+	 * @dma_bmode: When set to 1 the L2C DMA address will be updated
+	 *	with byte-counts between packets. When set to 0
+	 *	the L2C DMA address is incremented to the next
+	 *	4-byte aligned address after adding byte-count.
+	 * @usbc_end: Bigendian input to the USB Core. This should be
+	 *	set to '0' for operation.
+	 * @usbp_bist: PHY, This is cleared '0' to run BIST on the USBP.
+	 * @tclk: PHY Test Clock, used to load TDATA_IN to the USBP.
+	 * @dp_pulld: PHY DP_PULLDOWN input to the USB-PHY.
+	 *	This signal enables the pull-down resistance on
+	 *	the D+ line. '1' pull down-resistance is connected
+	 *	to D+. '0' pull down resistance is not connected
+	 *	to D+. When an A/B device is acting as a host
+	 *	(downstream-facing port), dp_pulldown and
+	 *	dm_pulldown are enabled. This must not toggle
+	 *	during normal operation.
+	 * @dm_pulld: PHY DM_PULLDOWN input to the USB-PHY.
+	 *	This signal enables the pull-down resistance on
+	 *	the D- line. '1' pull down-resistance is connected
+	 *	to D-. '0' pull down resistance is not connected
+	 *	to D-. When an A/B device is acting as a host
+	 *	(downstream-facing port), dp_pulldown and
+	 *	dm_pulldown are enabled. This must not toggle
+	 *	during normal operation.
+	 * @hst_mode: When '0' the USB is acting as HOST, when '1'
+	 *	USB is acting as device. This field needs to be
+	 *	set while the USB is in reset.
+	 * @tuning: Transmitter Tuning for High-Speed Operation.
+	 *	Tunes the current supply and rise/fall output
+	 *	times for high-speed operation.
+	 *	[20:19] == 11: Current supply increased
+	 *	approximately 9%
+	 *	[20:19] == 10: Current supply increased
+	 *	approximately 4.5%
+	 *	[20:19] == 01: Design default.
+	 *	[20:19] == 00: Current supply decreased
+	 *	approximately 4.5%
+	 *	[22:21] == 11: Rise and fall times are increased.
+	 *	[22:21] == 10: Design default.
+	 *	[22:21] == 01: Rise and fall times are decreased.
+	 *	[22:21] == 00: Rise and fall times are decreased
+	 *	further as compared to the 01 setting.
+	 * @tx_bs_enh: Transmit Bit Stuffing on [15:8].
+	 *	Enables or disables bit stuffing on data[15:8]
+	 *	when bit-stuffing is enabled.
+	 * @tx_bs_en: Transmit Bit Stuffing on [7:0].
+	 *	Enables or disables bit stuffing on data[7:0]
+	 *	when bit-stuffing is enabled.
+	 * @loop_enb: PHY Loopback Test Enable.
+	 *	'1': During data transmission the receive is
+	 *	enabled.
+	 *	'0': During data transmission the receive is
+	 *	disabled.
+	 *	Must be '0' for normal operation.
+	 * @vtest_enb: Analog Test Pin Enable.
+	 *	'1' The PHY's analog_test pin is enabled for the
+	 *	input and output of applicable analog test signals.
+	 *	'0' The analog_test pin is disabled.
+	 * @bist_enb: Built-In Self Test Enable.
+	 *	Used to activate BIST in the PHY.
+	 * @tdata_sel: Test Data Out Select.
+	 *	'1' test_data_out[3:0] (PHY) register contents
+	 *	are output. '0' internally generated signals are
+	 *	output.
+	 * @taddr_in: Mode Address for Test Interface.
+	 *	Specifies the register address for writing to or
+	 *	reading from the PHY test interface register.
+	 * @tdata_in: Internal Testing Register Input Data and Select
+	 *	This is a test bus. Data is present on [3:0],
+	 *	and its corresponding select (enable) is present
+	 *	on bits [7:4].
+	 * @ate_reset: Reset input from automatic test equipment.
+	 *	This is a test signal. When the USB Core is
+	 *	powered up (not in Suspend Mode), an automatic
+	 *	tester can use this to disable phy_clock and
+	 *	free_clk, then re-enable them with an aligned
+	 *	phase.
+	 *	'1': The phy_clk and free_clk outputs are
+	 *	disabled. "0": The phy_clock and free_clk outputs
+	 *	are available within a specific period after the
+	 *	de-assertion.
+	 */
+	struct cvmx_usbnx_usbp_ctl_status_s {
+		uint64_t txrisetune		: 1;
+		uint64_t txvreftune		: 4;
+		uint64_t txfslstune		: 4;
+		uint64_t txhsxvtune		: 2;
+		uint64_t sqrxtune		: 3;
+		uint64_t compdistune		: 3;
+		uint64_t otgtune		: 3;
+		uint64_t otgdisable		: 1;
+		uint64_t portreset		: 1;
+		uint64_t drvvbus		: 1;
+		uint64_t lsbist			: 1;
+		uint64_t fsbist			: 1;
+		uint64_t hsbist			: 1;
+		uint64_t bist_done		: 1;
+		uint64_t bist_err		: 1;
+		uint64_t tdata_out		: 4;
+		uint64_t siddq			: 1;
+		uint64_t txpreemphasistune	: 1;
+		uint64_t dma_bmode		: 1;
+		uint64_t usbc_end		: 1;
+		uint64_t usbp_bist		: 1;
+		uint64_t tclk			: 1;
+		uint64_t dp_pulld		: 1;
+		uint64_t dm_pulld		: 1;
+		uint64_t hst_mode		: 1;
+		uint64_t tuning			: 4;
+		uint64_t tx_bs_enh		: 1;
+		uint64_t tx_bs_en		: 1;
+		uint64_t loop_enb		: 1;
+		uint64_t vtest_enb		: 1;
+		uint64_t bist_enb		: 1;
+		uint64_t tdata_sel		: 1;
+		uint64_t taddr_in		: 4;
+		uint64_t tdata_in		: 8;
+		uint64_t ate_reset		: 1;
+	} s;
+	/**
+	 * struct cvmx_usbnx_usbp_ctl_status_cn30xx
+	 * @reserved_38_63: Reserved. Occupies the 26 bits that hold the
+	 *	txrisetune..hsbist fields in the generic 's' layout.
+	 * @bist_done: PHY Bist Done.
+	 *	Asserted at the end of the PHY BIST sequence.
+	 * @bist_err: PHY Bist Error.
+	 *	Indicates an internal error was detected during
+	 *	the BIST sequence.
+	 * @tdata_out: PHY Test Data Out.
+	 *	Presents either internally generated signals or
+	 *	test register contents, based upon the value of
+	 *	test_data_out_sel.
+	 * @reserved_30_31: Reserved. Occupies the 2 bits that hold siddq
+	 *	and txpreemphasistune in the generic 's' layout.
+	 * @dma_bmode: When set to 1 the L2C DMA address will be updated
+	 *	with byte-counts between packets. When set to 0
+	 *	the L2C DMA address is incremented to the next
+	 *	4-byte aligned address after adding byte-count.
+	 * @usbc_end: Bigendian input to the USB Core. This should be
+	 *	set to '0' for operation.
+	 * @usbp_bist: PHY, This is cleared '0' to run BIST on the USBP.
+	 * @tclk: PHY Test Clock, used to load TDATA_IN to the USBP.
+	 * @dp_pulld: PHY DP_PULLDOWN input to the USB-PHY.
+	 *	This signal enables the pull-down resistance on
+	 *	the D+ line. '1' pull down-resistance is connected
+	 *	to D+. '0' pull down resistance is not connected
+	 *	to D+. When an A/B device is acting as a host
+	 *	(downstream-facing port), dp_pulldown and
+	 *	dm_pulldown are enabled. This must not toggle
+	 *	during normal operation.
+	 * @dm_pulld: PHY DM_PULLDOWN input to the USB-PHY.
+	 *	This signal enables the pull-down resistance on
+	 *	the D- line. '1' pull down-resistance is connected
+	 *	to D-. '0' pull down resistance is not connected
+	 *	to D-. When an A/B device is acting as a host
+	 *	(downstream-facing port), dp_pulldown and
+	 *	dm_pulldown are enabled. This must not toggle
+	 *	during normal operation.
+	 * @hst_mode: When '0' the USB is acting as HOST, when '1'
+	 *	USB is acting as device. This field needs to be
+	 *	set while the USB is in reset.
+	 * @tuning: Transmitter Tuning for High-Speed Operation.
+	 *	Tunes the current supply and rise/fall output
+	 *	times for high-speed operation.
+	 *	[20:19] == 11: Current supply increased
+	 *	approximately 9%
+	 *	[20:19] == 10: Current supply increased
+	 *	approximately 4.5%
+	 *	[20:19] == 01: Design default.
+	 *	[20:19] == 00: Current supply decreased
+	 *	approximately 4.5%
+	 *	[22:21] == 11: Rise and fall times are increased.
+	 *	[22:21] == 10: Design default.
+	 *	[22:21] == 01: Rise and fall times are decreased.
+	 *	[22:21] == 00: Rise and fall times are decreased
+	 *	further as compared to the 01 setting.
+	 * @tx_bs_enh: Transmit Bit Stuffing on [15:8].
+	 *	Enables or disables bit stuffing on data[15:8]
+	 *	when bit-stuffing is enabled.
+	 * @tx_bs_en: Transmit Bit Stuffing on [7:0].
+	 *	Enables or disables bit stuffing on data[7:0]
+	 *	when bit-stuffing is enabled.
+	 * @loop_enb: PHY Loopback Test Enable.
+	 *	'1': During data transmission the receive is
+	 *	enabled.
+	 *	'0': During data transmission the receive is
+	 *	disabled.
+	 *	Must be '0' for normal operation.
+	 * @vtest_enb: Analog Test Pin Enable.
+	 *	'1' The PHY's analog_test pin is enabled for the
+	 *	input and output of applicable analog test signals.
+	 *	'0' The analog_test pin is disabled.
+	 * @bist_enb: Built-In Self Test Enable.
+	 *	Used to activate BIST in the PHY.
+	 * @tdata_sel: Test Data Out Select.
+	 *	'1' test_data_out[3:0] (PHY) register contents
+	 *	are output. '0' internally generated signals are
+	 *	output.
+	 * @taddr_in: Mode Address for Test Interface.
+	 *	Specifies the register address for writing to or
+	 *	reading from the PHY test interface register.
+	 * @tdata_in: Internal Testing Register Input Data and Select
+	 *	This is a test bus. Data is present on [3:0],
+	 *	and its corresponding select (enable) is present
+	 *	on bits [7:4].
+	 * @ate_reset: Reset input from automatic test equipment.
+	 *	This is a test signal. When the USB Core is
+	 *	powered up (not in Suspend Mode), an automatic
+	 *	tester can use this to disable phy_clock and
+	 *	free_clk, then re-enable them with an aligned
+	 *	phase.
+	 *	'1': The phy_clk and free_clk outputs are
+	 *	disabled. "0": The phy_clock and free_clk outputs
+	 *	are available within a specific period after the
+	 *	de-assertion.
+	 */
+	struct cvmx_usbnx_usbp_ctl_status_cn30xx {
+		uint64_t reserved_38_63	: 26;
+		uint64_t bist_done	: 1;
+		uint64_t bist_err	: 1;
+		uint64_t tdata_out	: 4;
+		uint64_t reserved_30_31	: 2;
+		uint64_t dma_bmode	: 1;
+		uint64_t usbc_end	: 1;
+		uint64_t usbp_bist	: 1;
+		uint64_t tclk		: 1;
+		uint64_t dp_pulld	: 1;
+		uint64_t dm_pulld	: 1;
+		uint64_t hst_mode	: 1;
+		uint64_t tuning		: 4;
+		uint64_t tx_bs_enh	: 1;
+		uint64_t tx_bs_en	: 1;
+		uint64_t loop_enb	: 1;
+		uint64_t vtest_enb	: 1;
+		uint64_t bist_enb	: 1;
+		uint64_t tdata_sel	: 1;
+		uint64_t taddr_in	: 4;
+		uint64_t tdata_in	: 8;
+		uint64_t ate_reset	: 1;
+	} cn30xx;
+	/**
+	 * struct cvmx_usbnx_usbp_ctl_status_cn50xx
+	 * @txrisetune: HS Transmitter Rise/Fall Time Adjustment
+	 * @txvreftune: HS DC Voltage Level Adjustment
+	 * @txfslstune: FS/LS Source Impedance Adjustment
+	 * @txhsxvtune: Transmitter High-Speed Crossover Adjustment
+	 * @sqrxtune: Squelch Threshold Adjustment
+	 * @compdistune: Disconnect Threshold Adjustment
+	 * @otgtune: VBUS Valid Threshold Adjustment
+	 * @otgdisable: OTG Block Disable
+	 * @portreset: Per_Port Reset
+	 * @drvvbus: Drive VBUS
+	 * @lsbist: Low-Speed BIST Enable.
+	 * @fsbist: Full-Speed BIST Enable.
+	 * @hsbist: High-Speed BIST Enable.
+	 * @bist_done: PHY Bist Done.
+	 *	Asserted at the end of the PHY BIST sequence.
+	 * @bist_err: PHY Bist Error.
+	 *	Indicates an internal error was detected during
+	 *	the BIST sequence.
+	 * @tdata_out: PHY Test Data Out.
+	 *	Presents either internally generated signals or
+	 *	test register contents, based upon the value of
+	 *	test_data_out_sel.
+	 * @reserved_31_31: Reserved. Occupies the bit that holds siddq in
+	 *	the generic 's' layout.
+	 * @txpreemphasistune: HS Transmitter Pre-Emphasis Enable
+	 * @dma_bmode: When set to 1 the L2C DMA address will be updated
+	 *	with byte-counts between packets. When set to 0
+	 *	the L2C DMA address is incremented to the next
+	 *	4-byte aligned address after adding byte-count.
+	 * @usbc_end: Bigendian input to the USB Core. This should be
+	 *	set to '0' for operation.
+	 * @usbp_bist: PHY, This is cleared '0' to run BIST on the USBP.
+	 * @tclk: PHY Test Clock, used to load TDATA_IN to the USBP.
+	 * @dp_pulld: PHY DP_PULLDOWN input to the USB-PHY.
+	 *	This signal enables the pull-down resistance on
+	 *	the D+ line. '1' pull down-resistance is connected
+	 *	to D+. '0' pull down resistance is not connected
+	 *	to D+. When an A/B device is acting as a host
+	 *	(downstream-facing port), dp_pulldown and
+	 *	dm_pulldown are enabled. This must not toggle
+	 *	during normal operation.
+	 * @dm_pulld: PHY DM_PULLDOWN input to the USB-PHY.
+	 *	This signal enables the pull-down resistance on
+	 *	the D- line. '1' pull down-resistance is connected
+	 *	to D-. '0' pull down resistance is not connected
+	 *	to D-. When an A/B device is acting as a host
+	 *	(downstream-facing port), dp_pulldown and
+	 *	dm_pulldown are enabled. This must not toggle
+	 *	during normal operation.
+	 * @hst_mode: When '0' the USB is acting as HOST, when '1'
+	 *	USB is acting as device. This field needs to be
+	 *	set while the USB is in reset.
+	 * @reserved_19_22: Reserved. Occupies the 4 bits that hold tuning
+	 *	in the generic 's' layout.
+	 * @tx_bs_enh: Transmit Bit Stuffing on [15:8].
+	 *	Enables or disables bit stuffing on data[15:8]
+	 *	when bit-stuffing is enabled.
+	 * @tx_bs_en: Transmit Bit Stuffing on [7:0].
+	 *	Enables or disables bit stuffing on data[7:0]
+	 *	when bit-stuffing is enabled.
+	 * @loop_enb: PHY Loopback Test Enable.
+	 *	'1': During data transmission the receive is
+	 *	enabled.
+	 *	'0': During data transmission the receive is
+	 *	disabled.
+	 *	Must be '0' for normal operation.
+	 * @vtest_enb: Analog Test Pin Enable.
+	 *	'1' The PHY's analog_test pin is enabled for the
+	 *	input and output of applicable analog test signals.
+	 *	'0' The analog_test pin is disabled.
+	 * @bist_enb: Built-In Self Test Enable.
+	 *	Used to activate BIST in the PHY.
+	 * @tdata_sel: Test Data Out Select.
+	 *	'1' test_data_out[3:0] (PHY) register contents
+	 *	are output. '0' internally generated signals are
+	 *	output.
+	 * @taddr_in: Mode Address for Test Interface.
+	 *	Specifies the register address for writing to or
+	 *	reading from the PHY test interface register.
+	 * @tdata_in: Internal Testing Register Input Data and Select
+	 *	This is a test bus. Data is present on [3:0],
+	 *	and its corresponding select (enable) is present
+	 *	on bits [7:4].
+	 * @ate_reset: Reset input from automatic test equipment.
+	 *	This is a test signal. When the USB Core is
+	 *	powered up (not in Suspend Mode), an automatic
+	 *	tester can use this to disable phy_clock and
+	 *	free_clk, then re-enable them with an aligned
+	 *	phase.
+	 *	'1': The phy_clk and free_clk outputs are
+	 *	disabled. "0": The phy_clock and free_clk outputs
+	 *	are available within a specific period after the
+	 *	de-assertion.
+	 */
+	struct cvmx_usbnx_usbp_ctl_status_cn50xx {
+		uint64_t txrisetune		: 1;
+		uint64_t txvreftune		: 4;
+		uint64_t txfslstune		: 4;
+		uint64_t txhsxvtune		: 2;
+		uint64_t sqrxtune		: 3;
+		uint64_t compdistune		: 3;
+		uint64_t otgtune		: 3;
+		uint64_t otgdisable		: 1;
+		uint64_t portreset		: 1;
+		uint64_t drvvbus		: 1;
+		uint64_t lsbist			: 1;
+		uint64_t fsbist			: 1;
+		uint64_t hsbist			: 1;
+		uint64_t bist_done		: 1;
+		uint64_t bist_err		: 1;
+		uint64_t tdata_out		: 4;
+		uint64_t reserved_31_31		: 1;
+		uint64_t txpreemphasistune	: 1;
+		uint64_t dma_bmode		: 1;
+		uint64_t usbc_end		: 1;
+		uint64_t usbp_bist		: 1;
+		uint64_t tclk			: 1;
+		uint64_t dp_pulld		: 1;
+		uint64_t dm_pulld		: 1;
+		uint64_t hst_mode		: 1;
+		uint64_t reserved_19_22		: 4;
+		uint64_t tx_bs_enh		: 1;
+		uint64_t tx_bs_en		: 1;
+		uint64_t loop_enb		: 1;
+		uint64_t vtest_enb		: 1;
+		uint64_t bist_enb		: 1;
+		uint64_t tdata_sel		: 1;
+		uint64_t taddr_in		: 4;
+		uint64_t tdata_in		: 8;
+		uint64_t ate_reset		: 1;
+	} cn50xx;
+	/**
+	 * struct cvmx_usbnx_usbp_ctl_status_cn52xx
+	 * @txrisetune: HS Transmitter Rise/Fall Time Adjustment
+	 * @txvreftune: HS DC Voltage Level Adjustment
+	 * @txfslstune: FS/LS Source Impedance Adjustment
+	 * @txhsxvtune: Transmitter High-Speed Crossover Adjustment
+	 * @sqrxtune: Squelch Threshold Adjustment
+	 * @compdistune: Disconnect Threshold Adjustment
+	 * @otgtune: VBUS Valid Threshold Adjustment
+	 * @otgdisable: OTG Block Disable
+	 * @portreset: Per_Port Reset
+	 * @drvvbus: Drive VBUS
+	 * @lsbist: Low-Speed BIST Enable.
+	 * @fsbist: Full-Speed BIST Enable.
+	 * @hsbist: High-Speed BIST Enable.
+	 * @bist_done: PHY Bist Done.
+	 *	Asserted at the end of the PHY BIST sequence.
+	 * @bist_err: PHY Bist Error.
+	 *	Indicates an internal error was detected during
+	 *	the BIST sequence.
+	 * @tdata_out: PHY Test Data Out.
+	 *	Presents either internally generated signals or
+	 *	test register contents, based upon the value of
+	 *	test_data_out_sel.
+	 * @siddq: Drives the USBP (USB-PHY) SIDDQ input.
+	 *	Normally should be set to zero.
+	 *	When customers have no intent to use USB PHY
+	 *	interface, they should:
+	 *	- still provide 3.3V to USB_VDD33, and
+	 *	- tie USB_REXT to 3.3V supply, and
+	 *	- set USBN*_USBP_CTL_STATUS[SIDDQ]=1
+	 * @txpreemphasistune: HS Transmitter Pre-Emphasis Enable
+	 * @dma_bmode: When set to 1 the L2C DMA address will be updated
+	 *	with byte-counts between packets. When set to 0
+	 *	the L2C DMA address is incremented to the next
+	 *	4-byte aligned address after adding byte-count.
+	 * @usbc_end: Bigendian input to the USB Core. This should be
+	 *	set to '0' for operation.
+	 * @usbp_bist: PHY, This is cleared '0' to run BIST on the USBP.
+	 * @tclk: PHY Test Clock, used to load TDATA_IN to the USBP.
+	 * @dp_pulld: PHY DP_PULLDOWN input to the USB-PHY.
+	 *	This signal enables the pull-down resistance on
+	 *	the D+ line. '1' pull down-resistance is connected
+	 *	to D+. '0' pull down resistance is not connected
+	 *	to D+. When an A/B device is acting as a host
+	 *	(downstream-facing port), dp_pulldown and
+	 *	dm_pulldown are enabled. This must not toggle
+	 *	during normal operation.
+	 * @dm_pulld: PHY DM_PULLDOWN input to the USB-PHY.
+	 *	This signal enables the pull-down resistance on
+	 *	the D- line. '1' pull down-resistance is connected
+	 *	to D-. '0' pull down resistance is not connected
+	 *	to D-. When an A/B device is acting as a host
+	 *	(downstream-facing port), dp_pulldown and
+	 *	dm_pulldown are enabled. This must not toggle
+	 *	during normal operation.
+	 * @hst_mode: When '0' the USB is acting as HOST, when '1'
+	 *	USB is acting as device. This field needs to be
+	 *	set while the USB is in reset.
+	 * @reserved_19_22: Reserved. Occupies the 4 bits that hold tuning
+	 *	in the generic 's' layout.
+	 * @tx_bs_enh: Transmit Bit Stuffing on [15:8].
+	 *	Enables or disables bit stuffing on data[15:8]
+	 *	when bit-stuffing is enabled.
+	 * @tx_bs_en: Transmit Bit Stuffing on [7:0].
+	 *	Enables or disables bit stuffing on data[7:0]
+	 *	when bit-stuffing is enabled.
+	 * @loop_enb: PHY Loopback Test Enable.
+	 *	'1': During data transmission the receive is
+	 *	enabled.
+	 *	'0': During data transmission the receive is
+	 *	disabled.
+	 *	Must be '0' for normal operation.
+	 * @vtest_enb: Analog Test Pin Enable.
+	 *	'1' The PHY's analog_test pin is enabled for the
+	 *	input and output of applicable analog test signals.
+	 *	'0' The analog_test pin is disabled.
+	 * @bist_enb: Built-In Self Test Enable.
+	 *	Used to activate BIST in the PHY.
+	 * @tdata_sel: Test Data Out Select.
+	 *	'1' test_data_out[3:0] (PHY) register contents
+	 *	are output. '0' internally generated signals are
+	 *	output.
+	 * @taddr_in: Mode Address for Test Interface.
+	 *	Specifies the register address for writing to or
+	 *	reading from the PHY test interface register.
+	 * @tdata_in: Internal Testing Register Input Data and Select
+	 *	This is a test bus. Data is present on [3:0],
+	 *	and its corresponding select (enable) is present
+	 *	on bits [7:4].
+	 * @ate_reset: Reset input from automatic test equipment.
+	 *	This is a test signal. When the USB Core is
+	 *	powered up (not in Suspend Mode), an automatic
+	 *	tester can use this to disable phy_clock and
+	 *	free_clk, then re-enable them with an aligned
+	 *	phase.
+	 *	'1': The phy_clk and free_clk outputs are
+	 *	disabled. "0": The phy_clock and free_clk outputs
+	 *	are available within a specific period after the
+	 *	de-assertion.
+	 */
+	struct cvmx_usbnx_usbp_ctl_status_cn52xx {
+		uint64_t txrisetune		: 1;
+		uint64_t txvreftune		: 4;
+		uint64_t txfslstune		: 4;
+		uint64_t txhsxvtune		: 2;
+		uint64_t sqrxtune		: 3;
+		uint64_t compdistune		: 3;
+		uint64_t otgtune		: 3;
+		uint64_t otgdisable		: 1;
+		uint64_t portreset		: 1;
+		uint64_t drvvbus		: 1;
+		uint64_t lsbist			: 1;
+		uint64_t fsbist			: 1;
+		uint64_t hsbist			: 1;
+		uint64_t bist_done		: 1;
+		uint64_t bist_err		: 1;
+		uint64_t tdata_out		: 4;
+		uint64_t siddq			: 1;
+		uint64_t txpreemphasistune	: 1;
+		uint64_t dma_bmode		: 1;
+		uint64_t usbc_end		: 1;
+		uint64_t usbp_bist		: 1;
+		uint64_t tclk			: 1;
+		uint64_t dp_pulld		: 1;
+		uint64_t dm_pulld		: 1;
+		uint64_t hst_mode		: 1;
+		uint64_t reserved_19_22		: 4;
+		uint64_t tx_bs_enh		: 1;
+		uint64_t tx_bs_en		: 1;
+		uint64_t loop_enb		: 1;
+		uint64_t vtest_enb		: 1;
+		uint64_t bist_enb		: 1;
+		uint64_t tdata_sel		: 1;
+		uint64_t taddr_in		: 4;
+		uint64_t tdata_in		: 8;
+		uint64_t ate_reset		: 1;
+	} cn52xx;
+};
+typedef union cvmx_usbnx_usbp_ctl_status cvmx_usbnx_usbp_ctl_status_t;
+
+#endif

diff --git a/drivers/staging/octeon-usb/octeon-hcd.c b/drivers/staging/octeon-usb/octeon-hcd.c
new file mode 100644
index 0000000..d156b60
--- /dev/null
+++ b/drivers/staging/octeon-usb/octeon-hcd.c

@@ -0,0 +1,832 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (C) 2008 Cavium Networks
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/pci.h>
+#include <linux/interrupt.h>
+#include <linux/platform_device.h>
+#include <linux/usb.h>
+
+#include <linux/time.h>
+#include <linux/delay.h>
+
+#include <asm/octeon/cvmx.h>
+#include "cvmx-usb.h"
+#include <asm/octeon/cvmx-iob-defs.h>
+
+#include <linux/usb/hcd.h>
+
+#include <linux/err.h>
+
+/**
+ * struct octeon_hcd - driver-private state reached through usb_hcd->hcd_priv
+ * @lock: spinlock held around most cvmx_usb_*() calls in this file and
+ *	around every manipulation of @dequeue_list.
+ * @usb: low-level controller state passed to the cvmx_usb_*() API.
+ * @dequeue_tasklet: tasklet scheduled by octeon_usb_urb_dequeue() after it
+ *	has queued an URB on @dequeue_list.
+ * @dequeue_list: URBs (linked through urb->urb_list) waiting to be
+ *	cancelled by octeon_usb_urb_dequeue_work().
+ */
+struct octeon_hcd {
+	spinlock_t lock;
+	cvmx_usb_state_t usb;
+	struct tasklet_struct dequeue_tasklet;
+	struct list_head dequeue_list;
+};
+
+/* Return the struct octeon_hcd that lives in the hcd_priv area of @hcd. */
+static inline struct octeon_hcd *hcd_to_octeon(struct usb_hcd *hcd)
+{
+	struct octeon_hcd *priv = (struct octeon_hcd *)(hcd->hcd_priv);
+
+	return priv;
+}
+
+/* Inverse of hcd_to_octeon(): find the usb_hcd whose hcd_priv area is @p. */
+static inline struct usb_hcd *octeon_to_hcd(struct octeon_hcd *p)
+{
+	void *priv_area = p;
+
+	return container_of(priv_area, struct usb_hcd, hcd_priv);
+}
+
+/* Map the embedded cvmx_usb_state_t back to the octeon_hcd containing it. */
+static inline struct octeon_hcd *cvmx_usb_to_octeon(cvmx_usb_state_t *p)
+{
+	struct octeon_hcd *priv = container_of(p, struct octeon_hcd, usb);
+
+	return priv;
+}
+
+/*
+ * IRQ handler: run cvmx_usb_poll() on the controller state while holding
+ * the driver lock with interrupts disabled.  Always returns IRQ_HANDLED.
+ */
+static irqreturn_t octeon_usb_irq(struct usb_hcd *hcd)
+{
+	struct octeon_hcd *ohcd = hcd_to_octeon(hcd);
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&ohcd->lock, irq_flags);
+	cvmx_usb_poll(&ohcd->usb);
+	spin_unlock_irqrestore(&ohcd->lock, irq_flags);
+
+	return IRQ_HANDLED;
+}
+
+/*
+ * Callback registered for CVMX_USB_CALLBACK_PORT_CHANGED by
+ * octeon_usb_start().  Asks the USB core to poll the root-hub status.
+ *
+ * The driver lock is released around usb_hcd_poll_rh_status() and taken
+ * again afterwards, so the caller is expected to hold it on entry.  Only
+ * @usb is used; the remaining arguments are ignored.
+ */
+static void octeon_usb_port_callback(cvmx_usb_state_t *usb,
+				     cvmx_usb_callback_t reason,
+				     cvmx_usb_complete_t status,
+				     int pipe_handle,
+				     int submit_handle,
+				     int bytes_transferred,
+				     void *user_data)
+{
+	struct octeon_hcd *ohcd = cvmx_usb_to_octeon(usb);
+	struct usb_hcd *hcd = octeon_to_hcd(ohcd);
+
+	spin_unlock(&ohcd->lock);
+	usb_hcd_poll_rh_status(hcd);
+	spin_lock(&ohcd->lock);
+}
+
+/*
+ * Mark the HCD as running and register octeon_usb_port_callback() for
+ * port-change notifications.  Always returns 0; the return value of
+ * cvmx_usb_register_callback() is not checked.
+ */
+static int octeon_usb_start(struct usb_hcd *hcd)
+{
+	struct octeon_hcd *ohcd = hcd_to_octeon(hcd);
+	unsigned long irq_flags;
+
+	hcd->state = HC_STATE_RUNNING;
+
+	spin_lock_irqsave(&ohcd->lock, irq_flags);
+	cvmx_usb_register_callback(&ohcd->usb,
+				   CVMX_USB_CALLBACK_PORT_CHANGED,
+				   octeon_usb_port_callback,
+				   NULL);
+	spin_unlock_irqrestore(&ohcd->lock, irq_flags);
+
+	return 0;
+}
+
+/*
+ * Unregister the port-change callback (by registering NULL in its place)
+ * and then mark the HCD as halted.
+ */
+static void octeon_usb_stop(struct usb_hcd *hcd)
+{
+	struct octeon_hcd *ohcd = hcd_to_octeon(hcd);
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&ohcd->lock, irq_flags);
+	cvmx_usb_register_callback(&ohcd->usb,
+				   CVMX_USB_CALLBACK_PORT_CHANGED,
+				   NULL,
+				   NULL);
+	spin_unlock_irqrestore(&ohcd->lock, irq_flags);
+
+	hcd->state = HC_STATE_HALT;
+}
+
+/*
+ * Return whatever cvmx_usb_get_frame_number() reports for this controller.
+ * The driver lock is not taken here.
+ */
+static int octeon_usb_get_frame_number(struct usb_hcd *hcd)
+{
+	cvmx_usb_state_t *usb = &hcd_to_octeon(hcd)->usb;
+
+	return cvmx_usb_get_frame_number(usb);
+}
+
+/*
+ * Completion callback handed to the cvmx_usb_submit_*() calls made by
+ * octeon_usb_urb_enqueue(); @user_data is the URB that was submitted.
+ *
+ * Records the transferred length, removes the URB from the dequeue_list
+ * if a dequeue was pending, copies per-packet results back for
+ * isochronous URBs, translates @status into urb->status and gives the URB
+ * back to the USB core.  The driver lock is dropped around
+ * usb_hcd_giveback_urb() and re-taken before returning, so the caller is
+ * expected to hold it on entry.  @reason is not used.
+ */
+static void octeon_usb_urb_complete_callback(cvmx_usb_state_t *usb,
+					     cvmx_usb_callback_t reason,
+					     cvmx_usb_complete_t status,
+					     int pipe_handle,
+					     int submit_handle,
+					     int bytes_transferred,
+					     void *user_data)
+{
+	struct octeon_hcd *ohcd = cvmx_usb_to_octeon(usb);
+	struct usb_hcd *hcd = octeon_to_hcd(ohcd);
+	struct device *dev = hcd->self.controller;
+	struct urb *urb = user_data;
+
+	urb->hcpriv = NULL;
+	urb->actual_length = bytes_transferred;
+
+	/*
+	 * A non-empty urb_list means the URB is sitting on the dequeue_list
+	 * and we got to it before the octeon_usb_urb_dequeue_work() tasklet
+	 * did.  We are about to give it back, so unlink it and leave
+	 * urb_list re-initialised ("not on the dequeue_list").
+	 */
+	if (!list_empty(&urb->urb_list))
+		list_del_init(&urb->urb_list);
+
+	/*
+	 * For isochronous transfers the per-packet status in the URB has to
+	 * be updated from our private copy.
+	 */
+	if (usb_pipetype(urb->pipe) == PIPE_ISOCHRONOUS) {
+		/* The private list pointer was stored in setup_packet. */
+		cvmx_usb_iso_packet_t *pkts =
+			(cvmx_usb_iso_packet_t *)urb->setup_packet;
+		int idx;
+
+		/* Recompute the total length from the successful packets. */
+		urb->actual_length = 0;
+		for (idx = 0; idx < urb->number_of_packets; idx++) {
+			struct usb_iso_packet_descriptor *desc =
+				&urb->iso_frame_desc[idx];
+
+			if (pkts[idx].status != CVMX_USB_COMPLETE_SUCCESS) {
+				dev_dbg(dev, "ISOCHRONOUS packet=%d of %d status=%d pipe=%d submit=%d size=%d\n",
+					idx, urb->number_of_packets,
+					pkts[idx].status, pipe_handle,
+					submit_handle, pkts[idx].length);
+				desc->status = -EREMOTEIO;
+				continue;
+			}
+
+			desc->status = 0;
+			desc->actual_length = pkts[idx].length;
+			urb->actual_length += desc->actual_length;
+		}
+
+		/* The private list is no longer needed. */
+		kfree(pkts);
+		urb->setup_packet = NULL;
+	}
+
+	/* Translate the cvmx completion code; other codes leave urb->status. */
+	switch (status) {
+	case CVMX_USB_COMPLETE_SUCCESS:
+		urb->status = 0;
+		break;
+	case CVMX_USB_COMPLETE_CANCEL:
+		/* Keep a status already stored by octeon_usb_urb_dequeue(). */
+		if (urb->status == 0)
+			urb->status = -ENOENT;
+		break;
+	case CVMX_USB_COMPLETE_STALL:
+		dev_dbg(dev, "status=stall pipe=%d submit=%d size=%d\n",
+			pipe_handle, submit_handle, bytes_transferred);
+		urb->status = -EPIPE;
+		break;
+	case CVMX_USB_COMPLETE_BABBLEERR:
+		dev_dbg(dev, "status=babble pipe=%d submit=%d size=%d\n",
+			pipe_handle, submit_handle, bytes_transferred);
+		urb->status = -EPIPE;
+		break;
+	case CVMX_USB_COMPLETE_SHORT:
+		dev_dbg(dev, "status=short pipe=%d submit=%d size=%d\n",
+			pipe_handle, submit_handle, bytes_transferred);
+		urb->status = -EREMOTEIO;
+		break;
+	case CVMX_USB_COMPLETE_ERROR:
+	case CVMX_USB_COMPLETE_XACTERR:
+	case CVMX_USB_COMPLETE_DATATGLERR:
+	case CVMX_USB_COMPLETE_FRAMEERR:
+		dev_dbg(dev, "status=%d pipe=%d submit=%d size=%d\n",
+			status, pipe_handle, submit_handle, bytes_transferred);
+		urb->status = -EPROTO;
+		break;
+	}
+
+	spin_unlock(&ohcd->lock);
+	usb_hcd_giveback_urb(hcd, urb, urb->status);
+	spin_lock(&ohcd->lock);
+}
+
+/*
+ * Submit @urb to the controller.
+ *
+ * The first URB seen for an endpoint opens a cvmx pipe and caches the
+ * handle in ep->hcpriv as (0x10000 + handle), presumably so that a pipe
+ * handle of 0 still yields a non-NULL hcpriv; later URBs reuse the pipe.
+ * The URB is then submitted according to its pipe type, and urb->hcpriv
+ * is set to the submit handle (upper 16 bits) combined with the pipe
+ * handle (lower 16 bits) so the dequeue path can find both.
+ *
+ * Everything runs with priv->lock held and interrupts disabled, which is
+ * why the isochronous packet list is allocated with GFP_ATOMIC; @mem_flags
+ * is not used.
+ *
+ * Returns 0 on success, or -ENOMEM if the pipe cannot be opened or the
+ * submission fails (including failure to allocate the isochronous packet
+ * list).
+ */
+static int octeon_usb_urb_enqueue(struct usb_hcd *hcd,
+				  struct urb *urb,
+				  gfp_t mem_flags)
+{
+	struct octeon_hcd *priv = hcd_to_octeon(hcd);
+	struct device *dev = hcd->self.controller;
+	int submit_handle = -1;
+	int pipe_handle;
+	unsigned long flags;
+	cvmx_usb_iso_packet_t *iso_packet;
+	struct usb_host_endpoint *ep = urb->ep;
+
+	urb->status = 0;
+	INIT_LIST_HEAD(&urb->urb_list);	/* not enqueued on dequeue_list */
+	spin_lock_irqsave(&priv->lock, flags);
+
+	if (!ep->hcpriv) {
+		cvmx_usb_transfer_t transfer_type;
+		cvmx_usb_speed_t speed;
+		int split_device = 0;
+		int split_port = 0;
+		switch (usb_pipetype(urb->pipe)) {
+		case PIPE_ISOCHRONOUS:
+			transfer_type = CVMX_USB_TRANSFER_ISOCHRONOUS;
+			break;
+		case PIPE_INTERRUPT:
+			transfer_type = CVMX_USB_TRANSFER_INTERRUPT;
+			break;
+		case PIPE_CONTROL:
+			transfer_type = CVMX_USB_TRANSFER_CONTROL;
+			break;
+		default:
+			transfer_type = CVMX_USB_TRANSFER_BULK;
+			break;
+		}
+		switch (urb->dev->speed) {
+		case USB_SPEED_LOW:
+			speed = CVMX_USB_SPEED_LOW;
+			break;
+		case USB_SPEED_FULL:
+			speed = CVMX_USB_SPEED_FULL;
+			break;
+		default:
+			speed = CVMX_USB_SPEED_HIGH;
+			break;
+		}
+		/*
+		 * For slow devices on high speed ports we need to find the hub
+		 * that does the speed translation so we know where to send the
+		 * split transactions.
+		 */
+		if (speed != CVMX_USB_SPEED_HIGH) {
+			/*
+			 * Start at this device and work our way up the usb
+			 * tree.  (Named udev so it does not shadow the
+			 * struct device *dev used for dev_dbg() below.)
+			 */
+			struct usb_device *udev = urb->dev;
+			while (udev->parent) {
+				/*
+				 * If our parent is high speed then he'll
+				 * receive the splits.
+				 */
+				if (udev->parent->speed == USB_SPEED_HIGH) {
+					split_device = udev->parent->devnum;
+					split_port = udev->portnum;
+					break;
+				}
+				/*
+				 * Move up the tree one level. If we make it all
+				 * the way up the tree, then the port must not
+				 * be in high speed mode and we don't need a
+				 * split.
+				 */
+				udev = udev->parent;
+			}
+		}
+		pipe_handle = cvmx_usb_open_pipe(&priv->usb,
+						 0,
+						 usb_pipedevice(urb->pipe),
+						 usb_pipeendpoint(urb->pipe),
+						 speed,
+						 le16_to_cpu(ep->desc.wMaxPacketSize) & 0x7ff,
+						 transfer_type,
+						 usb_pipein(urb->pipe) ? CVMX_USB_DIRECTION_IN : CVMX_USB_DIRECTION_OUT,
+						 urb->interval,
+						 (le16_to_cpu(ep->desc.wMaxPacketSize) >> 11) & 0x3,
+						 split_device,
+						 split_port);
+		if (pipe_handle < 0) {
+			spin_unlock_irqrestore(&priv->lock, flags);
+			dev_dbg(dev, "Failed to create pipe\n");
+			return -ENOMEM;
+		}
+		ep->hcpriv = (void *)(0x10000L + pipe_handle);
+	} else {
+		pipe_handle = 0xffff & (long)ep->hcpriv;
+	}
+
+	switch (usb_pipetype(urb->pipe)) {
+	case PIPE_ISOCHRONOUS:
+		dev_dbg(dev, "Submit isochronous to %d.%d\n",
+			usb_pipedevice(urb->pipe), usb_pipeendpoint(urb->pipe));
+		/*
+		 * Allocate a structure to use for our private list of
+		 * isochronous packets.  kmalloc_array() fails cleanly if
+		 * count * size would overflow instead of under-allocating.
+		 */
+		iso_packet = kmalloc_array(urb->number_of_packets,
+					   sizeof(*iso_packet), GFP_ATOMIC);
+		if (iso_packet) {
+			int i;
+			/* Fill the list with the data from the URB */
+			for (i = 0; i < urb->number_of_packets; i++) {
+				iso_packet[i].offset = urb->iso_frame_desc[i].offset;
+				iso_packet[i].length = urb->iso_frame_desc[i].length;
+				iso_packet[i].status = CVMX_USB_COMPLETE_ERROR;
+			}
+			/*
+			 * Store a pointer to the list in the URB setup_packet
+			 * field. We know this currently isn't being used and
+			 * this saves us a bunch of logic.
+			 */
+			urb->setup_packet = (char *)iso_packet;
+			submit_handle = cvmx_usb_submit_isochronous(&priv->usb, pipe_handle,
+							urb->start_frame,
+							0 /* flags */ ,
+							urb->number_of_packets,
+							iso_packet,
+							urb->transfer_dma,
+							urb->transfer_buffer_length,
+							octeon_usb_urb_complete_callback,
+							urb);
+			/*
+			 * If submit failed we need to free our private packet
+			 * list.
+			 */
+			if (submit_handle < 0) {
+				urb->setup_packet = NULL;
+				kfree(iso_packet);
+			}
+		}
+		break;
+	case PIPE_INTERRUPT:
+		dev_dbg(dev, "Submit interrupt to %d.%d\n",
+			usb_pipedevice(urb->pipe), usb_pipeendpoint(urb->pipe));
+		submit_handle = cvmx_usb_submit_interrupt(&priv->usb, pipe_handle,
+					      urb->transfer_dma,
+					      urb->transfer_buffer_length,
+					      octeon_usb_urb_complete_callback,
+					      urb);
+		break;
+	case PIPE_CONTROL:
+		dev_dbg(dev, "Submit control to %d.%d\n",
+			usb_pipedevice(urb->pipe), usb_pipeendpoint(urb->pipe));
+		submit_handle = cvmx_usb_submit_control(&priv->usb, pipe_handle,
+					    urb->setup_dma,
+					    urb->transfer_dma,
+					    urb->transfer_buffer_length,
+					    octeon_usb_urb_complete_callback,
+					    urb);
+		break;
+	case PIPE_BULK:
+		dev_dbg(dev, "Submit bulk to %d.%d\n",
+			usb_pipedevice(urb->pipe), usb_pipeendpoint(urb->pipe));
+		submit_handle = cvmx_usb_submit_bulk(&priv->usb, pipe_handle,
+					 urb->transfer_dma,
+					 urb->transfer_buffer_length,
+					 octeon_usb_urb_complete_callback,
+					 urb);
+		break;
+	}
+	/* submit_handle is still -1 here if the iso list allocation failed. */
+	if (submit_handle < 0) {
+		spin_unlock_irqrestore(&priv->lock, flags);
+		dev_dbg(dev, "Failed to submit\n");
+		return -ENOMEM;
+	}
+	urb->hcpriv = (void *)(long)(((submit_handle & 0xffff) << 16) | pipe_handle);
+	spin_unlock_irqrestore(&priv->lock, flags);
+	return 0;
+}
+
+/*
+ * Drain the dequeue_list: with the driver lock held, take each queued URB
+ * off the list and ask cvmx_usb_cancel() to cancel it.  @arg is the
+ * struct octeon_hcd pointer cast to unsigned long.
+ */
+static void octeon_usb_urb_dequeue_work(unsigned long arg)
+{
+	struct octeon_hcd *ohcd = (struct octeon_hcd *)arg;
+	struct list_head *pending = &ohcd->dequeue_list;
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&ohcd->lock, irq_flags);
+
+	for (;;) {
+		struct urb *urb;
+		int pipe_handle;
+		int submit_handle;
+
+		if (list_empty(pending))
+			break;
+
+		urb = container_of(pending->next, struct urb, urb_list);
+		/* Unlink and leave urb_list marked "not on dequeue_list". */
+		list_del_init(&urb->urb_list);
+
+		/* urb->hcpriv packs submit handle (high) and pipe handle (low). */
+		pipe_handle = 0xffff & (long)urb->hcpriv;
+		submit_handle = ((long)urb->hcpriv) >> 16;
+		cvmx_usb_cancel(&ohcd->usb, pipe_handle, submit_handle);
+	}
+
+	spin_unlock_irqrestore(&ohcd->lock, irq_flags);
+}
+
+/*
+ * Request cancellation of @urb: store @status in urb->status, queue the
+ * URB on the dequeue_list under the driver lock and schedule the dequeue
+ * tasklet.  Returns -EINVAL if the URB has no device, otherwise 0.
+ */
+static int octeon_usb_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status)
+{
+	struct octeon_hcd *ohcd;
+	unsigned long irq_flags;
+
+	if (!urb->dev)
+		return -EINVAL;
+
+	ohcd = hcd_to_octeon(hcd);
+
+	spin_lock_irqsave(&ohcd->lock, irq_flags);
+	urb->status = status;
+	list_add_tail(&urb->urb_list, &ohcd->dequeue_list);
+	spin_unlock_irqrestore(&ohcd->lock, irq_flags);
+
+	tasklet_schedule(&ohcd->dequeue_tasklet);
+
+	return 0;
+}
+
+/*
+ * Tear down the pipe cached in ep->hcpriv, if any: cancel everything on
+ * it, close it (a failure is only logged) and clear ep->hcpriv.
+ */
+static void octeon_usb_endpoint_disable(struct usb_hcd *hcd, struct usb_host_endpoint *ep)
+{
+	struct device *dev = hcd->self.controller;
+	struct octeon_hcd *ohcd;
+	unsigned long irq_flags;
+	int pipe_handle;
+
+	/* No pipe was ever opened for this endpoint. */
+	if (!ep->hcpriv)
+		return;
+
+	ohcd = hcd_to_octeon(hcd);
+	pipe_handle = 0xffff & (long)ep->hcpriv;
+
+	spin_lock_irqsave(&ohcd->lock, irq_flags);
+	cvmx_usb_cancel_all(&ohcd->usb, pipe_handle);
+	if (cvmx_usb_close_pipe(&ohcd->usb, pipe_handle))
+		dev_dbg(dev, "Closing pipe %d failed\n", pipe_handle);
+	spin_unlock_irqrestore(&ohcd->lock, irq_flags);
+
+	ep->hcpriv = NULL;
+}
+
+/*
+ * Report root-hub status changes.
+ *
+ * Reads the port status under the driver lock and stores the
+ * connect_change flag, shifted left by one, in buf[0].
+ * NOTE(review): bit 1 presumably stands for port 1 in the hub
+ * status-change bitmap - confirm against the hub_status_data contract.
+ *
+ * Returns 1 if buf[0] ended up non-zero, otherwise 0.
+ */
+static int octeon_usb_hub_status_data(struct usb_hcd *hcd, char *buf)
+{
+	struct octeon_hcd *priv = hcd_to_octeon(hcd);
+	cvmx_usb_port_status_t port_status;
+	unsigned long flags;
+
+	spin_lock_irqsave(&priv->lock, flags);
+	port_status = cvmx_usb_get_status(&priv->usb);
+	spin_unlock_irqrestore(&priv->lock, flags);
+	buf[0] = port_status.connect_change << 1;
+
+	return (buf[0] != 0);
+}
+
+/*
+ * Handle control requests addressed to the (single-port) root hub.
+ *
+ * @hcd:     host controller the request is for
+ * @typeReq: combined request type / request code (ClearHubFeature, ...)
+ * @wValue:  feature selector for the feature requests
+ * @wIndex:  port number; only port 1 is accepted for port requests
+ * @buf:     output buffer for GetHubDescriptor/GetHubStatus/GetPortStatus
+ * @wLength: unused
+ *
+ * Returns 0 on success and -EINVAL for an unknown request, an unsupported
+ * feature selector, or a port index other than 1.  Every access to the
+ * cvmx_usb state is made with priv->lock held.
+ */
+static int octeon_usb_hub_control(struct usb_hcd *hcd, u16 typeReq, u16 wValue, u16 wIndex, char *buf, u16 wLength)
+{
+	struct octeon_hcd *priv = hcd_to_octeon(hcd);
+	struct device *dev = hcd->self.controller;
+	cvmx_usb_port_status_t usb_port_status;
+	int port_status;
+	struct usb_hub_descriptor *desc;
+	unsigned long flags;
+
+	switch (typeReq) {
+	case ClearHubFeature:
+		dev_dbg(dev, "ClearHubFeature\n");
+		switch (wValue) {
+		case C_HUB_LOCAL_POWER:
+		case C_HUB_OVER_CURRENT:
+			/* Nothing required here */
+			break;
+		default:
+			return -EINVAL;
+		}
+		break;
+	case ClearPortFeature:
+		dev_dbg(dev, "ClearPortFeature\n");
+		if (wIndex != 1) {
+			dev_dbg(dev, " INVALID\n");
+			return -EINVAL;
+		}
+
+		switch (wValue) {
+		case USB_PORT_FEAT_ENABLE:
+			dev_dbg(dev, " ENABLE\n");
+			spin_lock_irqsave(&priv->lock, flags);
+			cvmx_usb_disable(&priv->usb);
+			spin_unlock_irqrestore(&priv->lock, flags);
+			break;
+		case USB_PORT_FEAT_SUSPEND:
+			dev_dbg(dev, " SUSPEND\n");
+			/* Not supported on Octeon */
+			break;
+		case USB_PORT_FEAT_POWER:
+			dev_dbg(dev, " POWER\n");
+			/* Not supported on Octeon */
+			break;
+		case USB_PORT_FEAT_INDICATOR:
+			dev_dbg(dev, " INDICATOR\n");
+			/* Port indicator not supported */
+			break;
+		case USB_PORT_FEAT_C_CONNECTION:
+			dev_dbg(dev, " C_CONNECTION\n");
+			/* Clears drivers internal connect status change flag */
+			spin_lock_irqsave(&priv->lock, flags);
+			cvmx_usb_set_status(&priv->usb, cvmx_usb_get_status(&priv->usb));
+			spin_unlock_irqrestore(&priv->lock, flags);
+			break;
+		case USB_PORT_FEAT_C_RESET:
+			dev_dbg(dev, " C_RESET\n");
+			/*
+			 * Clears the driver's internal Port Reset Change flag.
+			 */
+			spin_lock_irqsave(&priv->lock, flags);
+			cvmx_usb_set_status(&priv->usb, cvmx_usb_get_status(&priv->usb));
+			spin_unlock_irqrestore(&priv->lock, flags);
+			break;
+		case USB_PORT_FEAT_C_ENABLE:
+			dev_dbg(dev, " C_ENABLE\n");
+			/*
+			 * Clears the driver's internal Port Enable/Disable
+			 * Change flag.
+			 */
+			spin_lock_irqsave(&priv->lock, flags);
+			cvmx_usb_set_status(&priv->usb, cvmx_usb_get_status(&priv->usb));
+			spin_unlock_irqrestore(&priv->lock, flags);
+			break;
+		case USB_PORT_FEAT_C_SUSPEND:
+			dev_dbg(dev, " C_SUSPEND\n");
+			/*
+			 * Clears the driver's internal Port Suspend Change
+			 * flag, which is set when resume signaling on the host
+			 * port is complete.
+			 */
+			break;
+		case USB_PORT_FEAT_C_OVER_CURRENT:
+			dev_dbg(dev, " C_OVER_CURRENT\n");
+			/* Clears the driver's overcurrent Change flag */
+			spin_lock_irqsave(&priv->lock, flags);
+			cvmx_usb_set_status(&priv->usb, cvmx_usb_get_status(&priv->usb));
+			spin_unlock_irqrestore(&priv->lock, flags);
+			break;
+		default:
+			dev_dbg(dev, " UNKNOWN\n");
+			return -EINVAL;
+		}
+		break;
+	case GetHubDescriptor:
+		dev_dbg(dev, "GetHubDescriptor\n");
+		desc = (struct usb_hub_descriptor *)buf;
+		desc->bDescLength = 9;
+		desc->bDescriptorType = 0x29;
+		desc->bNbrPorts = 1;
+		/*
+		 * Descriptor fields are little-endian on the wire, so the
+		 * value must be converted from CPU byte order.
+		 */
+		desc->wHubCharacteristics = cpu_to_le16(0x08);
+		desc->bPwrOn2PwrGood = 1;
+		desc->bHubContrCurrent = 0;
+		desc->u.hs.DeviceRemovable[0] = 0;
+		desc->u.hs.DeviceRemovable[1] = 0xff;
+		break;
+	case GetHubStatus:
+		dev_dbg(dev, "GetHubStatus\n");
+		*(__le32 *) buf = 0;
+		break;
+	case GetPortStatus:
+		dev_dbg(dev, "GetPortStatus\n");
+		if (wIndex != 1) {
+			dev_dbg(dev, " INVALID\n");
+			return -EINVAL;
+		}
+
+		/* Take a consistent snapshot, then translate it lock-free. */
+		spin_lock_irqsave(&priv->lock, flags);
+		usb_port_status = cvmx_usb_get_status(&priv->usb);
+		spin_unlock_irqrestore(&priv->lock, flags);
+		port_status = 0;
+
+		if (usb_port_status.connect_change) {
+			port_status |= (1 << USB_PORT_FEAT_C_CONNECTION);
+			dev_dbg(dev, " C_CONNECTION\n");
+		}
+
+		/*
+		 * NOTE(review): C_ENABLE is reported whenever the port is
+		 * enabled, not only when the enable state changed - confirm
+		 * this is intended.
+		 */
+		if (usb_port_status.port_enabled) {
+			port_status |= (1 << USB_PORT_FEAT_C_ENABLE);
+			dev_dbg(dev, " C_ENABLE\n");
+		}
+
+		if (usb_port_status.connected) {
+			port_status |= (1 << USB_PORT_FEAT_CONNECTION);
+			dev_dbg(dev, " CONNECTION\n");
+		}
+
+		if (usb_port_status.port_enabled) {
+			port_status |= (1 << USB_PORT_FEAT_ENABLE);
+			dev_dbg(dev, " ENABLE\n");
+		}
+
+		if (usb_port_status.port_over_current) {
+			port_status |= (1 << USB_PORT_FEAT_OVER_CURRENT);
+			dev_dbg(dev, " OVER_CURRENT\n");
+		}
+
+		if (usb_port_status.port_powered) {
+			port_status |= (1 << USB_PORT_FEAT_POWER);
+			dev_dbg(dev, " POWER\n");
+		}
+
+		if (usb_port_status.port_speed == CVMX_USB_SPEED_HIGH) {
+			port_status |= USB_PORT_STAT_HIGH_SPEED;
+			dev_dbg(dev, " HIGHSPEED\n");
+		} else if (usb_port_status.port_speed == CVMX_USB_SPEED_LOW) {
+			port_status |= (1 << USB_PORT_FEAT_LOWSPEED);
+			dev_dbg(dev, " LOWSPEED\n");
+		}
+
+		*((__le32 *) buf) = cpu_to_le32(port_status);
+		break;
+	case SetHubFeature:
+		dev_dbg(dev, "SetHubFeature\n");
+		/* No HUB features supported */
+		break;
+	case SetPortFeature:
+		dev_dbg(dev, "SetPortFeature\n");
+		if (wIndex != 1) {
+			dev_dbg(dev, " INVALID\n");
+			return -EINVAL;
+		}
+
+		switch (wValue) {
+		case USB_PORT_FEAT_SUSPEND:
+			dev_dbg(dev, " SUSPEND\n");
+			return -EINVAL;
+		case USB_PORT_FEAT_POWER:
+			dev_dbg(dev, " POWER\n");
+			return -EINVAL;
+		case USB_PORT_FEAT_RESET:
+			dev_dbg(dev, " RESET\n");
+			/* A reset is a disable followed by an enable. */
+			spin_lock_irqsave(&priv->lock, flags);
+			cvmx_usb_disable(&priv->usb);
+			if (cvmx_usb_enable(&priv->usb))
+				dev_dbg(dev, "Failed to enable the port\n");
+			spin_unlock_irqrestore(&priv->lock, flags);
+			return 0;
+		case USB_PORT_FEAT_INDICATOR:
+			dev_dbg(dev, " INDICATOR\n");
+			/* Not supported */
+			break;
+		default:
+			dev_dbg(dev, " UNKNOWN\n");
+			return -EINVAL;
+		}
+		break;
+	default:
+		dev_dbg(dev, "Unknown root hub request\n");
+		return -EINVAL;
+	}
+	return 0;
+}
+
+
+static const struct hc_driver octeon_hc_driver = {	/* HCD operations table handed to usb_create_hcd() in probe */
+	.description		= "Octeon USB",
+	.product_desc		= "Octeon Host Controller",
+	.hcd_priv_size		= sizeof(struct octeon_hcd),	/* per-HCD private area holds struct octeon_hcd */
+	.irq			= octeon_usb_irq,	/* interrupt handler */
+	.flags			= HCD_MEMORY | HCD_USB2,
+	.start			= octeon_usb_start,
+	.stop			= octeon_usb_stop,
+	.urb_enqueue		= octeon_usb_urb_enqueue,
+	.urb_dequeue		= octeon_usb_urb_dequeue,	/* queues the URB; the cancel runs in the dequeue tasklet */
+	.endpoint_disable	= octeon_usb_endpoint_disable,	/* cancels and closes the endpoint's pipe */
+	.get_frame_number	= octeon_usb_get_frame_number,
+	.hub_status_data	= octeon_usb_hub_status_data,	/* root-hub connect-change poll */
+	.hub_control		= octeon_usb_hub_control,	/* root-hub control requests */
+};
+
+
+/*
+ * Bind the driver to one Octeon USB platform device.
+ *
+ * Creates the HCD, initialises the driver lock, dequeue tasklet and dequeue
+ * list, brings up the cvmx_usb hardware layer for port @dev->id, polls it
+ * once and finally registers the HCD on the device's first IRQ resource.
+ *
+ * Returns 0 on success, -ENOMEM when the HCD cannot be allocated, -EIO when
+ * the hardware layer fails to initialise, or the error from usb_add_hcd().
+ * On every failure path the HCD reference is dropped with usb_put_hcd():
+ * the HCD is reference-counted and must not be released with kfree().
+ */
+static int octeon_usb_driver_probe(struct device *dev)
+{
+	int status;
+	int usb_num = to_platform_device(dev)->id;
+	int irq = platform_get_irq(to_platform_device(dev), 0);
+	struct octeon_hcd *priv;
+	struct usb_hcd *hcd;
+	unsigned long flags;
+
+	/*
+	 * Set the DMA mask to 64bits so we get buffers already translated for
+	 * DMA.
+	 */
+	dev->coherent_dma_mask = ~0;
+	dev->dma_mask = &dev->coherent_dma_mask;
+
+	hcd = usb_create_hcd(&octeon_hc_driver, dev, dev_name(dev));
+	if (!hcd) {
+		dev_dbg(dev, "Failed to allocate memory for HCD\n");
+		return -ENOMEM;
+	}
+	hcd->uses_new_polling = 1;
+	priv = (struct octeon_hcd *)hcd->hcd_priv;
+
+	spin_lock_init(&priv->lock);
+
+	tasklet_init(&priv->dequeue_tasklet, octeon_usb_urb_dequeue_work, (unsigned long)priv);
+	INIT_LIST_HEAD(&priv->dequeue_list);
+
+	status = cvmx_usb_initialize(&priv->usb, usb_num, CVMX_USB_INITIALIZE_FLAGS_CLOCK_AUTO);
+	if (status) {
+		dev_dbg(dev, "USB initialization failed with %d\n", status);
+		usb_put_hcd(hcd);
+		return -EIO;
+	}
+
+	/* This delay is needed for CN3010, but I don't know why... */
+	mdelay(10);
+
+	spin_lock_irqsave(&priv->lock, flags);
+	cvmx_usb_poll(&priv->usb);
+	spin_unlock_irqrestore(&priv->lock, flags);
+
+	status = usb_add_hcd(hcd, irq, IRQF_SHARED);
+	if (status) {
+		dev_dbg(dev, "USB add HCD failed with %d\n", status);
+		usb_put_hcd(hcd);
+		return status;
+	}
+
+	dev_dbg(dev, "Registered HCD for port %d on irq %d\n", usb_num, irq);
+
+	return 0;
+}
+
+/*
+ * Unbind the driver from an Octeon USB platform device.
+ *
+ * Unregisters the HCD, waits for the dequeue tasklet to finish, shuts the
+ * cvmx_usb hardware layer down under the driver lock, and drops the HCD
+ * reference.  The HCD is reference-counted, so it is released with
+ * usb_put_hcd() rather than kfree().
+ *
+ * Always returns 0; a shutdown failure is only logged.
+ */
+static int octeon_usb_driver_remove(struct device *dev)
+{
+	int status;
+	struct usb_hcd *hcd = dev_get_drvdata(dev);
+	struct octeon_hcd *priv = hcd_to_octeon(hcd);
+	unsigned long flags;
+
+	usb_remove_hcd(hcd);
+	tasklet_kill(&priv->dequeue_tasklet);
+	spin_lock_irqsave(&priv->lock, flags);
+	status = cvmx_usb_shutdown(&priv->usb);
+	spin_unlock_irqrestore(&priv->lock, flags);
+	if (status)
+		dev_dbg(dev, "USB shutdown failed with %d\n", status);
+
+	/* priv lives inside hcd; it must not be touched after this point. */
+	usb_put_hcd(hcd);
+
+	return 0;
+}
+
+static struct device_driver octeon_usb_driver = {	/* driver registered by octeon_usb_module_init() */
+	.name	= "OcteonUSB",	/* also used as the name of the platform devices created at module init */
+	.bus	= &platform_bus_type,
+	.probe	= octeon_usb_driver_probe,
+	.remove	= octeon_usb_driver_remove,
+};
+
+
+#define MAX_USB_PORTS   10
+/* Platform devices created at init, indexed by port, for later cleanup. */
+static struct platform_device *pdev_glob[MAX_USB_PORTS];
+/* Nonzero while octeon_usb_driver is registered with the driver core. */
+static int octeon_usb_registered;
+
+/*
+ * Module entry point: register the driver and one platform device per port.
+ *
+ * Returns 0 on success, -ENODEV when USB is disabled or no ports exist, the
+ * error from driver_register(), or the error from the failing
+ * platform_device_register_simple() call.  When a device fails to register,
+ * every device registered earlier in the loop is unregistered again before
+ * the driver itself is unregistered, so a failed init leaves nothing behind.
+ */
+static int __init octeon_usb_module_init(void)
+{
+	int num_devices = cvmx_usb_get_num_ports();
+	int device;
+	int ret;
+
+	if (usb_disabled() || num_devices == 0)
+		return -ENODEV;
+
+	ret = driver_register(&octeon_usb_driver);
+	if (ret)
+		return ret;
+
+	octeon_usb_registered = 1;
+
+	/*
+	 * Only cn52XX and cn56XX have DWC_OTG USB hardware and the
+	 * IOB priority registers.  Under heavy network load USB
+	 * hardware can be starved by the IOB causing a crash.  Give
+	 * it a priority boost if it has been waiting more than 400
+	 * cycles to avoid this situation.
+	 *
+	 * Testing indicates that a cnt_val of 8192 is not sufficient,
+	 * but no failures are seen with 4096.  We choose a value of
+	 * 400 to give a safety factor of 10.
+	 */
+	if (OCTEON_IS_MODEL(OCTEON_CN52XX) || OCTEON_IS_MODEL(OCTEON_CN56XX)) {
+		union cvmx_iob_n2c_l2c_pri_cnt pri_cnt;
+
+		pri_cnt.u64 = 0;
+		pri_cnt.s.cnt_enb = 1;
+		pri_cnt.s.cnt_val = 400;
+		cvmx_write_csr(CVMX_IOB_N2C_L2C_PRI_CNT, pri_cnt.u64);
+	}
+
+	for (device = 0; device < num_devices; device++) {
+		struct resource irq_resource;
+		struct platform_device *pdev;
+
+		memset(&irq_resource, 0, sizeof(irq_resource));
+		irq_resource.start = (device == 0) ? OCTEON_IRQ_USB0 : OCTEON_IRQ_USB1;
+		irq_resource.end = irq_resource.start;
+		irq_resource.flags = IORESOURCE_IRQ;
+		pdev = platform_device_register_simple(octeon_usb_driver.name, device, &irq_resource, 1);
+		if (IS_ERR(pdev)) {
+			ret = PTR_ERR(pdev);
+			/* Undo the devices registered by earlier iterations. */
+			while (--device >= 0) {
+				if (device < MAX_USB_PORTS && pdev_glob[device]) {
+					platform_device_unregister(pdev_glob[device]);
+					pdev_glob[device] = NULL;
+				}
+			}
+			driver_unregister(&octeon_usb_driver);
+			octeon_usb_registered = 0;
+			return ret;
+		}
+		if (device < MAX_USB_PORTS)
+			pdev_glob[device] = pdev;
+
+	}
+	return 0;
+}
+
+/*
+ * Module exit: unregister every platform device recorded in pdev_glob and
+ * clear its slot, then unregister the driver if it is still registered.
+ */
+static void __exit octeon_usb_module_cleanup(void)
+{
+	struct platform_device **slot;
+
+	for (slot = pdev_glob; slot < pdev_glob + MAX_USB_PORTS; slot++) {
+		if (!*slot)
+			continue;
+		platform_device_unregister(*slot);
+		*slot = NULL;
+	}
+
+	if (!octeon_usb_registered)
+		return;
+	driver_unregister(&octeon_usb_driver);
+}
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Cavium Networks <support@caviumnetworks.com>");
+MODULE_DESCRIPTION("Cavium Networks Octeon USB Host driver.");
+module_init(octeon_usb_module_init);	/* registers the driver and the per-port platform devices */
+module_exit(octeon_usb_module_cleanup);	/* unregisters the platform devices, then the driver */

diff --git a/drivers/staging/ozwpan/Kbuild b/drivers/staging/ozwpan/Kbuild
index 6cc84cb..1766a26 100644
--- a/drivers/staging/ozwpan/Kbuild
+++ b/drivers/staging/ozwpan/Kbuild

@@ -13,7 +13,6 @@
 	ozproto.o \
 	ozcdev.o \
 	ozurbparanoia.o \
-	oztrace.o \
-	ozevent.o
+	oztrace.o
 
 

diff --git a/drivers/staging/ozwpan/ozappif.h b/drivers/staging/ozwpan/ozappif.h
index 449a6ba..ea1b271 100644
--- a/drivers/staging/ozwpan/ozappif.h
+++ b/drivers/staging/ozwpan/ozappif.h

@@ -6,8 +6,6 @@
 #ifndef _OZAPPIF_H
 #define _OZAPPIF_H
 
-#include "ozeventdef.h"
-
 #define OZ_IOCTL_MAGIC	0xf4
 
 struct oz_mac_addr {

diff --git a/drivers/staging/ozwpan/ozcdev.c b/drivers/staging/ozwpan/ozcdev.c
index 27d0666..374fdc3 100644
--- a/drivers/staging/ozwpan/ozcdev.c
+++ b/drivers/staging/ozwpan/ozcdev.c

@@ -18,7 +18,6 @@
 #include "ozeltbuf.h"
 #include "ozpd.h"
 #include "ozproto.h"
-#include "ozevent.h"
 #include "ozcdev.h"
 /*------------------------------------------------------------------------------
  */
@@ -355,11 +354,13 @@
 	g_oz_class = class_create(THIS_MODULE, "ozmo_wpan");
 	if (IS_ERR(g_oz_class)) {
 		oz_trace("Failed to register ozmo_wpan class\n");
+		err = PTR_ERR(g_oz_class);
 		goto out1;
 	}
 	dev = device_create(g_oz_class, NULL, g_cdev.devnum, NULL, "ozwpan");
 	if (IS_ERR(dev)) {
 		oz_trace("Failed to create sysfs entry for cdev\n");
+		err = PTR_ERR(dev);
 		goto out1;
 	}
 	return 0;
@@ -388,7 +389,6 @@
  */
 int oz_cdev_init(void)
 {
-	oz_event_log(OZ_EVT_SERVICE, 1, OZ_APPID_SERIAL, NULL, 0);
 	oz_app_enable(OZ_APPID_SERIAL, 1);
 	return 0;
 }
@@ -397,7 +397,6 @@
  */
 void oz_cdev_term(void)
 {
-	oz_event_log(OZ_EVT_SERVICE, 2, OZ_APPID_SERIAL, NULL, 0);
 	oz_app_enable(OZ_APPID_SERIAL, 0);
 }
 /*------------------------------------------------------------------------------
@@ -407,7 +406,6 @@
 {
 	struct oz_serial_ctx *ctx;
 	struct oz_serial_ctx *old_ctx;
-	oz_event_log(OZ_EVT_SERVICE, 3, OZ_APPID_SERIAL, NULL, resume);
 	if (resume) {
 		oz_trace("Serial service resumed.\n");
 		return 0;
@@ -443,7 +441,6 @@
 void oz_cdev_stop(struct oz_pd *pd, int pause)
 {
 	struct oz_serial_ctx *ctx;
-	oz_event_log(OZ_EVT_SERVICE, 4, OZ_APPID_SERIAL, NULL, pause);
 	if (pause) {
 		oz_trace("Serial service paused.\n");
 		return;

diff --git a/drivers/staging/ozwpan/ozconfig.h b/drivers/staging/ozwpan/ozconfig.h
index 43e6373..087c322 100644
--- a/drivers/staging/ozwpan/ozconfig.h
+++ b/drivers/staging/ozwpan/ozconfig.h

@@ -12,7 +12,6 @@
 /* #define WANT_URB_PARANOIA */
 
 /* #define WANT_PRE_2_6_39 */
-#define WANT_EVENT_TRACE
 
 /* These defines determine what verbose trace is displayed. */
 #ifdef WANT_VERBOSE_TRACE

diff --git a/drivers/staging/ozwpan/ozevent.c b/drivers/staging/ozwpan/ozevent.c
deleted file mode 100644
index 77e8675..0000000
--- a/drivers/staging/ozwpan/ozevent.c
+++ /dev/null

@@ -1,195 +0,0 @@
-/* -----------------------------------------------------------------------------
- * Copyright (c) 2011 Ozmo Inc
- * Released under the GNU General Public License Version 2 (GPLv2).
- * -----------------------------------------------------------------------------
- */
-#include "ozconfig.h"
-#ifdef WANT_EVENT_TRACE
-#include <linux/module.h>
-#include <linux/debugfs.h>
-#include <linux/jiffies.h>
-#include <linux/uaccess.h>
-#include "oztrace.h"
-#include "ozevent.h"
-#include "ozappif.h"
-/*------------------------------------------------------------------------------
- * Although the event mask is logically part of the oz_evtdev structure, it is
- * needed outside of this file so define it separately to avoid the need to
- * export definition of struct oz_evtdev.
- */
-u32 g_evt_mask;
-/*------------------------------------------------------------------------------
- */
-#define OZ_MAX_EVTS	2048	/* Must be power of 2 */
-struct oz_evtdev {
-	struct dentry *root_dir;
-	int evt_in;
-	int evt_out;
-	int missed_events;
-	int present;
-	atomic_t users;
-	spinlock_t lock;
-	struct oz_event evts[OZ_MAX_EVTS];
-};
-
-static struct oz_evtdev g_evtdev;
-
-/*------------------------------------------------------------------------------
- * Context: process
- */
-void oz_event_init(void)
-{
-	/* Because g_evtdev is static external all fields initially zero so no
-	 * need to reinitialized those.
-	 */
-	oz_trace("Event tracing initialized\n");
-	spin_lock_init(&g_evtdev.lock);
-	atomic_set(&g_evtdev.users, 0);
-}
-/*------------------------------------------------------------------------------
- * Context: process
- */
-void oz_event_term(void)
-{
-	oz_trace("Event tracing terminated\n");
-}
-/*------------------------------------------------------------------------------
- * Context: any
- */
-void oz_event_log2(u8 evt, u8 ctx1, u16 ctx2, void *ctx3, unsigned ctx4)
-{
-	unsigned long irqstate;
-	int ix;
-	spin_lock_irqsave(&g_evtdev.lock, irqstate);
-	ix = (g_evtdev.evt_in + 1) & (OZ_MAX_EVTS - 1);
-	if (ix != g_evtdev.evt_out) {
-		struct oz_event *e = &g_evtdev.evts[g_evtdev.evt_in];
-		e->jiffies = jiffies;
-		e->evt = evt;
-		e->ctx1 = ctx1;
-		e->ctx2 = ctx2;
-		e->ctx3 = (__u32)(unsigned long)ctx3;
-		e->ctx4 = ctx4;
-		g_evtdev.evt_in = ix;
-	} else {
-		g_evtdev.missed_events++;
-	}
-	spin_unlock_irqrestore(&g_evtdev.lock, irqstate);
-}
-/*------------------------------------------------------------------------------
- * Context: process
- */
-#ifdef CONFIG_DEBUG_FS
-static void oz_events_clear(struct oz_evtdev *dev)
-{
-	unsigned long irqstate;
-	oz_trace("Clearing events\n");
-	spin_lock_irqsave(&dev->lock, irqstate);
-	dev->evt_in = dev->evt_out = 0;
-	dev->missed_events = 0;
-	spin_unlock_irqrestore(&dev->lock, irqstate);
-}
-/*------------------------------------------------------------------------------
- * Context: process
- */
-static int oz_events_open(struct inode *inode, struct file *filp)
-{
-	oz_trace("oz_evt_open()\n");
-	oz_trace("Open flags: 0x%x\n", filp->f_flags);
-	if (atomic_add_return(1, &g_evtdev.users) == 1) {
-		oz_events_clear(&g_evtdev);
-		return nonseekable_open(inode, filp);
-	} else {
-		atomic_dec(&g_evtdev.users);
-		return -EBUSY;
-	}
-}
-/*------------------------------------------------------------------------------
- * Context: process
- */
-static int oz_events_release(struct inode *inode, struct file *filp)
-{
-	oz_events_clear(&g_evtdev);
-	atomic_dec(&g_evtdev.users);
-	g_evt_mask = 0;
-	oz_trace("oz_evt_release()\n");
-	return 0;
-}
-/*------------------------------------------------------------------------------
- * Context: process
- */
-static ssize_t oz_events_read(struct file *filp, char __user *buf, size_t count,
-		loff_t *fpos)
-{
-	struct oz_evtdev *dev = &g_evtdev;
-	int rc = 0;
-	int nb_evts = count / sizeof(struct oz_event);
-	int n;
-	int sz;
-
-	n = dev->evt_in - dev->evt_out;
-	if (n < 0)
-		n += OZ_MAX_EVTS;
-	if (nb_evts > n)
-		nb_evts = n;
-	if (nb_evts == 0)
-		goto out;
-	n = OZ_MAX_EVTS - dev->evt_out;
-	if (n > nb_evts)
-		n = nb_evts;
-	sz = n * sizeof(struct oz_event);
-	if (copy_to_user(buf, &dev->evts[dev->evt_out], sz)) {
-		rc = -EFAULT;
-		goto out;
-	}
-	if (n == nb_evts)
-		goto out2;
-	n = nb_evts - n;
-	if (copy_to_user(buf + sz, dev->evts, n * sizeof(struct oz_event))) {
-		rc = -EFAULT;
-		goto out;
-	}
-out2:
-	dev->evt_out = (dev->evt_out + nb_evts) & (OZ_MAX_EVTS - 1);
-	rc = nb_evts * sizeof(struct oz_event);
-out:
-	return rc;
-}
-/*------------------------------------------------------------------------------
- */
-static const struct file_operations oz_events_fops = {
-	.owner =	THIS_MODULE,
-	.open =		oz_events_open,
-	.release =	oz_events_release,
-	.read =		oz_events_read,
-};
-/*------------------------------------------------------------------------------
- * Context: process
- */
-void oz_debugfs_init(void)
-{
-	struct dentry *parent;
-
-	parent = debugfs_create_dir("ozwpan", NULL);
-	if (parent  == NULL) {
-		oz_trace("Failed to create debugfs directory ozmo\n");
-		return;
-	} else {
-		g_evtdev.root_dir = parent;
-		if (debugfs_create_file("events", S_IRUSR, parent, NULL,
-						&oz_events_fops) == NULL)
-			oz_trace("Failed to create file ozmo/events\n");
-		if (debugfs_create_x32("event_mask", S_IRUSR | S_IWUSR, parent,
-							&g_evt_mask) == NULL)
-			oz_trace("Failed to create file ozmo/event_mask\n");
-	}
-}
-/*------------------------------------------------------------------------------
- * Context: process
- */
-void oz_debugfs_remove(void)
-{
-	debugfs_remove_recursive(g_evtdev.root_dir);
-}
-#endif /* CONFIG_DEBUG_FS */
-#endif /* WANT_EVENT_TRACE */

diff --git a/drivers/staging/ozwpan/ozevent.h b/drivers/staging/ozwpan/ozevent.h
deleted file mode 100644
index 32f6f98..0000000
--- a/drivers/staging/ozwpan/ozevent.h
+++ /dev/null

@@ -1,32 +0,0 @@
-/* -----------------------------------------------------------------------------
- * Copyright (c) 2011 Ozmo Inc
- * Released under the GNU General Public License Version 2 (GPLv2).
- * -----------------------------------------------------------------------------
- */
-#ifndef _OZEVENT_H
-#define _OZEVENT_H
-#include "ozconfig.h"
-#include "ozeventdef.h"
-
-#ifdef WANT_EVENT_TRACE
-extern u32 g_evt_mask;
-void oz_event_init(void);
-void oz_event_term(void);
-void oz_event_log2(u8 evt, u8 ctx1, u16 ctx2, void *ctx3, unsigned ctx4);
-void oz_debugfs_init(void);
-void oz_debugfs_remove(void);
-#define oz_event_log(__evt, __ctx1, __ctx2, __ctx3, __ctx4) \
-	do { \
-		if ((1<<(__evt)) & g_evt_mask) \
-			oz_event_log2(__evt, __ctx1, __ctx2, __ctx3, __ctx4); \
-	} while (0)
-
-#else
-#define oz_event_init()
-#define oz_event_term()
-#define oz_event_log(__evt, __ctx1, __ctx2, __ctx3, __ctx4)
-#define oz_debugfs_init()
-#define oz_debugfs_remove()
-#endif /* WANT_EVENT_TRACE */
-
-#endif /* _OZEVENT_H */

diff --git a/drivers/staging/ozwpan/ozeventdef.h b/drivers/staging/ozwpan/ozeventdef.h
deleted file mode 100644
index 4b93898..0000000
--- a/drivers/staging/ozwpan/ozeventdef.h
+++ /dev/null

@@ -1,40 +0,0 @@
-/* -----------------------------------------------------------------------------
- * Copyright (c) 2011 Ozmo Inc
- * Released under the GNU General Public License Version 2 (GPLv2).
- * -----------------------------------------------------------------------------
- */
-#ifndef _OZEVENTDEF_H
-#define _OZEVENTDEF_H
-
-#define OZ_EVT_RX_FRAME		0
-#define OZ_EVT_RX_PROCESS	1
-#define OZ_EVT_TX_FRAME		2
-#define OZ_EVT_TX_ISOC		3
-#define OZ_EVT_URB_SUBMIT	4
-#define OZ_EVT_URB_DONE		5
-#define OZ_EVT_URB_CANCEL	6
-#define OZ_EVT_CTRL_REQ		7
-#define OZ_EVT_CTRL_CNF		8
-#define OZ_EVT_CTRL_LOCAL	9
-#define OZ_EVT_CONNECT_REQ	10
-#define OZ_EVT_CONNECT_RSP	11
-#define OZ_EVT_EP_CREDIT	12
-#define OZ_EVT_EP_BUFFERING	13
-#define OZ_EVT_TX_ISOC_DONE	14
-#define OZ_EVT_TX_ISOC_DROP	15
-#define OZ_EVT_TIMER_CTRL	16
-#define OZ_EVT_TIMER		17
-#define OZ_EVT_PD_STATE		18
-#define OZ_EVT_SERVICE		19
-#define OZ_EVT_DEBUG		20
-
-struct oz_event {
-	__u32 jiffies;
-	__u8 evt;
-	__u8 ctx1;
-	__u16 ctx2;
-	__u32 ctx3;
-	__u32 ctx4;
-};
-
-#endif /* _OZEVENTDEF_H */

diff --git a/drivers/staging/ozwpan/ozhcd.c b/drivers/staging/ozwpan/ozhcd.c
index 8ac26f5..d68d63a 100644
--- a/drivers/staging/ozwpan/ozhcd.c
+++ b/drivers/staging/ozwpan/ozhcd.c

@@ -35,7 +35,6 @@
 #include "ozusbif.h"
 #include "oztrace.h"
 #include "ozurbparanoia.h"
-#include "ozevent.h"
 #include "ozhcd.h"
 /*------------------------------------------------------------------------------
  * Number of units of buffering to capture for an isochronous IN endpoint before
@@ -381,7 +380,6 @@
 			jiffies, urb, status, jiffies-submit_jiffies,
 			jiffies-last_time, atomic_read(&g_pending_urbs));
 		last_time = jiffies;
-		oz_event_log(OZ_EVT_URB_DONE, 0, 0, urb, status);
 		usb_hcd_giveback_urb(hcd, urb, status);
 	}
 	spin_lock(&g_tasklet_lock);
@@ -508,8 +506,6 @@
 		if (!in_dir && ep_addr && (ep->credit < 0)) {
 			ep->last_jiffies = jiffies;
 			ep->credit = 0;
-			oz_event_log(OZ_EVT_EP_CREDIT, ep->ep_num,
-					0, NULL, ep->credit);
 		}
 	} else {
 		err = -EPIPE;
@@ -766,7 +762,6 @@
 	struct urb *urb;
 	int err = 0;
 
-	oz_event_log(OZ_EVT_CTRL_CNF, 0, req_id, NULL, status);
 	oz_trace("oz_hcd_get_desc_cnf length = %d offs = %d tot_size = %d\n",
 			length, offset, total_size);
 	urb = oz_find_urb_by_id(port, 0, req_id);
@@ -905,7 +900,6 @@
 	unsigned windex;
 	unsigned wvalue;
 
-	oz_event_log(OZ_EVT_CTRL_CNF, 0, req_id, NULL, rcode);
 	oz_trace("oz_hcd_control_cnf rcode=%u len=%d\n", rcode, data_len);
 	urb = oz_find_urb_by_id(port, 0, req_id);
 	if (!urb) {
@@ -1059,8 +1053,6 @@
 		ep->credit += jiffies_to_msecs(now - ep->last_jiffies);
 		if (ep->credit > ep->credit_ceiling)
 			ep->credit = ep->credit_ceiling;
-		oz_event_log(OZ_EVT_EP_CREDIT, ep->ep_num, 0, NULL,
-			     ep->credit);
 		ep->last_jiffies = now;
 		while (ep->credit && !list_empty(&ep->urb_list)) {
 			urbl = list_first_entry(&ep->urb_list,
@@ -1069,8 +1061,6 @@
 			if ((ep->credit + 1) < urb->number_of_packets)
 				break;
 			ep->credit -= urb->number_of_packets;
-			oz_event_log(OZ_EVT_EP_CREDIT, ep->ep_num, 0, NULL,
-				     ep->credit);
 			list_move_tail(&urbl->link, &xfr_list);
 		}
 	}
@@ -1098,19 +1088,12 @@
 			if (ep->buffered_units >= OZ_IN_BUFFERING_UNITS) {
 				ep->flags &= ~OZ_F_EP_BUFFERING;
 				ep->credit = 0;
-				oz_event_log(OZ_EVT_EP_CREDIT,
-					ep->ep_num | USB_DIR_IN,
-					0, NULL, ep->credit);
 				ep->last_jiffies = now;
 				ep->start_frame = 0;
-				oz_event_log(OZ_EVT_EP_BUFFERING,
-					ep->ep_num | USB_DIR_IN, 0, NULL, 0);
 			}
 			continue;
 		}
 		ep->credit += jiffies_to_msecs(now - ep->last_jiffies);
-		oz_event_log(OZ_EVT_EP_CREDIT, ep->ep_num | USB_DIR_IN,
-			0, NULL, ep->credit);
 		ep->last_jiffies = now;
 		while (!list_empty(&ep->urb_list)) {
 			struct oz_urb_link *urbl =
@@ -1154,8 +1137,6 @@
 			ep->start_frame += urb->number_of_packets;
 			list_move_tail(&urbl->link, &xfr_list);
 			ep->credit -= urb->number_of_packets;
-			oz_event_log(OZ_EVT_EP_CREDIT, ep->ep_num | USB_DIR_IN,
-				0, NULL, ep->credit);
 		}
 	}
 	if (!list_empty(&port->isoc_out_ep) || !list_empty(&port->isoc_in_ep))
@@ -1243,12 +1224,10 @@
 		if ((ep->attrib & USB_ENDPOINT_XFERTYPE_MASK)
 			== USB_ENDPOINT_XFER_ISOC) {
 			oz_trace("wMaxPacketSize = %d\n",
-				hep->desc.wMaxPacketSize);
+				usb_endpoint_maxp(&hep->desc));
 			ep->credit_ceiling = 200;
 			if (ep_addr & USB_ENDPOINT_DIR_MASK) {
 				ep->flags |= OZ_F_EP_BUFFERING;
-				oz_event_log(OZ_EVT_EP_BUFFERING,
-					ep->ep_num | USB_DIR_IN, 1, NULL, 0);
 			} else {
 				ep->flags |= OZ_F_EP_HAVE_STREAM;
 				if (oz_usb_stream_create(port->hpd, ep_num))
@@ -1455,8 +1434,6 @@
 			oz_trace("USB_REQ_GET_DESCRIPTOR - req\n");
 			break;
 		case USB_REQ_SET_ADDRESS:
-			oz_event_log(OZ_EVT_CTRL_LOCAL, setup->bRequest,
-				0, NULL, setup->bRequestType);
 			oz_trace("USB_REQ_SET_ADDRESS - req\n");
 			oz_trace("Port %d address is 0x%x\n", ozhcd->conn_port,
 				(u8)le16_to_cpu(setup->wValue));
@@ -1477,8 +1454,6 @@
 			/* We short circuit this case and reply directly since
 			 * we have the selected configuration number cached.
 			 */
-			oz_event_log(OZ_EVT_CTRL_LOCAL, setup->bRequest, 0,
-				     NULL, setup->bRequestType);
 			oz_trace("USB_REQ_GET_CONFIGURATION - reply now\n");
 			if (urb->transfer_buffer_length >= 1) {
 				urb->actual_length = 1;
@@ -1493,8 +1468,6 @@
 			/* We short circuit this case and reply directly since
 			 * we have the selected interface alternative cached.
 			 */
-			oz_event_log(OZ_EVT_CTRL_LOCAL, setup->bRequest, 0,
-				     NULL, setup->bRequestType);
 			oz_trace("USB_REQ_GET_INTERFACE - reply now\n");
 			if (urb->transfer_buffer_length >= 1) {
 				urb->actual_length = 1;
@@ -1744,20 +1717,6 @@
 	oz_trace("oz_hcd_shutdown()\n");
 }
 /*------------------------------------------------------------------------------
- * Context: any
- */
-#ifdef WANT_EVENT_TRACE
-static u8 oz_get_irq_ctx(void)
-{
-	u8 irq_info = 0;
-	if (in_interrupt())
-		irq_info |= 1;
-	if (in_irq())
-		irq_info |= 2;
-	return irq_info;
-}
-#endif /* WANT_EVENT_TRACE */
-/*------------------------------------------------------------------------------
  * Called to queue an urb for the device.
  * This function should return a non-zero error code if it fails the urb but
  * should not call usb_hcd_giveback_urb().
@@ -1774,8 +1733,6 @@
 	struct oz_urb_link *urbl;
 	oz_trace2(OZ_TRACE_URB, "%lu: oz_hcd_urb_enqueue(%p)\n",
 		jiffies, urb);
-	oz_event_log(OZ_EVT_URB_SUBMIT, oz_get_irq_ctx(),
-		(u16)urb->number_of_packets, urb, urb->pipe);
 	if (unlikely(ozhcd == NULL)) {
 		oz_trace2(OZ_TRACE_URB, "%lu: Refused urb(%p) not ozhcd.\n",
 			jiffies, urb);
@@ -1835,10 +1792,6 @@
 				ep->credit -= urb->number_of_packets;
 				if (ep->credit < 0)
 					ep->credit = 0;
-				oz_event_log(OZ_EVT_EP_CREDIT,
-					usb_pipein(urb->pipe) ?
-					(ep->ep_num | USB_DIR_IN) : ep->ep_num,
-					0, NULL, ep->credit);
 			}
 			return urbl;
 		}

diff --git a/drivers/staging/ozwpan/ozmain.c b/drivers/staging/ozwpan/ozmain.c
index 57a0cbd..51fe9e9 100644
--- a/drivers/staging/ozwpan/ozmain.c
+++ b/drivers/staging/ozwpan/ozmain.c

@@ -15,7 +15,6 @@
 #include "ozproto.h"
 #include "ozcdev.h"
 #include "oztrace.h"
-#include "ozevent.h"
 /*------------------------------------------------------------------------------
  * The name of the 802.11 mac device. Empty string is the default value but a
  * value can be supplied as a parameter to the module. An empty string means
@@ -28,14 +27,10 @@
  */
 static int __init ozwpan_init(void)
 {
-	oz_event_init();
 	oz_cdev_register();
 	oz_protocol_init(g_net_dev);
 	oz_app_enable(OZ_APPID_USB, 1);
 	oz_apps_init();
-#ifdef CONFIG_DEBUG_FS
-	oz_debugfs_init();
-#endif
 	return 0;
 }
 /*------------------------------------------------------------------------------
@@ -46,10 +41,6 @@
 	oz_protocol_term();
 	oz_apps_term();
 	oz_cdev_deregister();
-	oz_event_term();
-#ifdef CONFIG_DEBUG_FS
-	oz_debugfs_remove();
-#endif
 }
 /*------------------------------------------------------------------------------
  */

diff --git a/drivers/staging/ozwpan/ozpd.c b/drivers/staging/ozwpan/ozpd.c
index f8b9da0..d67dff2 100644
--- a/drivers/staging/ozwpan/ozpd.c
+++ b/drivers/staging/ozwpan/ozpd.c

@@ -15,7 +15,6 @@
 #include "ozpd.h"
 #include "ozproto.h"
 #include "oztrace.h"
-#include "ozevent.h"
 #include "ozcdev.h"
 #include "ozusbsvc.h"
 #include <asm/unaligned.h>
@@ -121,7 +120,6 @@
 void oz_pd_set_state(struct oz_pd *pd, unsigned state)
 {
 	pd->state = state;
-	oz_event_log(OZ_EVT_PD_STATE, 0, 0, NULL, state);
 #ifdef WANT_TRACE
 	switch (state) {
 	case OZ_PD_S_IDLE:
@@ -544,7 +542,6 @@
 			if (dev_queue_xmit(skb) < 0) {
 				oz_trace2(OZ_TRACE_TX_FRAMES,
 						"Dropping ISOC Frame\n");
-				oz_event_log(OZ_EVT_TX_ISOC_DROP, 0, 0, NULL, 0);
 				return -1;
 			}
 			atomic_inc(&g_submitted_isoc);
@@ -555,7 +552,6 @@
 		} else {
 			kfree_skb(skb);
 			oz_trace2(OZ_TRACE_TX_FRAMES, "Dropping ISOC Frame>\n");
-			oz_event_log(OZ_EVT_TX_ISOC_DROP, 0, 0, NULL, 0);
 			return -1;
 		}
 	}
@@ -567,10 +563,6 @@
 		oz_set_more_bit(skb);
 	oz_trace2(OZ_TRACE_TX_FRAMES, "TX frame PN=0x%x\n", f->hdr.pkt_num);
 	if (skb) {
-		oz_event_log(OZ_EVT_TX_FRAME,
-			0,
-			(((u16)f->hdr.control)<<8)|f->hdr.last_pkt_num,
-			NULL, f->hdr.pkt_num);
 		if (dev_queue_xmit(skb) < 0)
 			return -1;
 
@@ -659,7 +651,6 @@
 		memcpy(elt, ei->data, ei->length);
 		elt = oz_next_elt(elt);
 	}
-	oz_event_log(OZ_EVT_TX_ISOC, 0, 0, NULL, 0);
 	dev_queue_xmit(skb);
 	oz_elt_info_free_chain(&pd->elt_buff, &list);
 	return 0;
@@ -768,8 +759,6 @@
 static void oz_isoc_destructor(struct sk_buff *skb)
 {
 	atomic_dec(&g_submitted_isoc);
-	oz_event_log(OZ_EVT_TX_ISOC_DONE, atomic_read(&g_submitted_isoc),
-		0, skb, 0);
 }
 /*------------------------------------------------------------------------------
  * Context: softirq
@@ -863,25 +852,19 @@
 			oz_trace2(OZ_TRACE_TX_FRAMES,
 			"Added ISOC Frame to Tx Queue isoc_nb= %d, nb= %d\n",
 			pd->nb_queued_isoc_frames, pd->nb_queued_frames);
-			oz_event_log(OZ_EVT_TX_ISOC, nb_units, iso.frame_number,
-					skb, atomic_read(&g_submitted_isoc));
 			return 0;
 		}
 
 		/*In ANYTIME mode Xmit unit immediately*/
 		if (atomic_read(&g_submitted_isoc) < OZ_MAX_SUBMITTED_ISOC) {
 			atomic_inc(&g_submitted_isoc);
-			oz_event_log(OZ_EVT_TX_ISOC, nb_units, iso.frame_number,
-					skb, atomic_read(&g_submitted_isoc));
-			if (dev_queue_xmit(skb) < 0) {
-				oz_event_log(OZ_EVT_TX_ISOC_DROP, 0, 0, NULL, 0);
+			if (dev_queue_xmit(skb) < 0)
 				return -1;
-			} else
+			else
 				return 0;
 		}
 
-out:	oz_event_log(OZ_EVT_TX_ISOC_DROP, 0, 0, NULL, 0);
-	kfree_skb(skb);
+out:	kfree_skb(skb);
 	return -1;
 
 	}

diff --git a/drivers/staging/ozwpan/ozproto.c b/drivers/staging/ozwpan/ozproto.c
index 3badf15..79ac7b5 100644
--- a/drivers/staging/ozwpan/ozproto.c
+++ b/drivers/staging/ozwpan/ozproto.c

@@ -18,7 +18,6 @@
 #include "ozusbsvc.h"
 #include "oztrace.h"
 #include "ozappif.h"
-#include "ozevent.h"
 #include <asm/unaligned.h>
 #include <linux/uaccess.h>
 #include <net/psnap.h>
@@ -116,7 +115,6 @@
 	oz_hdr->control = (OZ_PROTOCOL_VERSION<<OZ_VERSION_SHIFT);
 	oz_hdr->last_pkt_num = 0;
 	put_unaligned(0, &oz_hdr->pkt_num);
-	oz_event_log(OZ_EVT_CONNECT_RSP, 0, 0, NULL, 0);
 	elt->type = OZ_ELT_CONNECT_RSP;
 	elt->length = sizeof(struct oz_elt_connect_rsp);
 	memset(body, 0, sizeof(struct oz_elt_connect_rsp));
@@ -345,9 +343,6 @@
 	int dup = 0;
 	u32 pkt_num;
 
-	oz_event_log(OZ_EVT_RX_PROCESS, 0,
-		(((u16)oz_hdr->control)<<8)|oz_hdr->last_pkt_num,
-		NULL, oz_hdr->pkt_num);
 	oz_trace2(OZ_TRACE_RX_FRAMES,
 		"RX frame PN=0x%x LPN=0x%x control=0x%x\n",
 		oz_hdr->pkt_num, oz_hdr->last_pkt_num, oz_hdr->control);
@@ -402,7 +397,6 @@
 			break;
 		switch (elt->type) {
 		case OZ_ELT_CONNECT_REQ:
-			oz_event_log(OZ_EVT_CONNECT_REQ, 0, 0, NULL, 0);
 			oz_trace("RX: OZ_ELT_CONNECT_REQ\n");
 			pd = oz_connect_req(pd, elt, src_addr, skb->dev);
 			break;
@@ -534,7 +528,6 @@
 		/* This happens if we remove the current timer but can't stop
 		 * the timer from firing. In this case just get out.
 		 */
-		oz_event_log(OZ_EVT_TIMER, 0, 0, NULL, 0);
 		spin_unlock_bh(&g_polling_lock);
 		return;
 	}
@@ -545,7 +538,6 @@
 	spin_unlock_bh(&g_polling_lock);
 	do {
 		pd = t->pd;
-		oz_event_log(OZ_EVT_TIMER, 0, t->type, NULL, 0);
 		oz_pd_handle_timer(pd, t->type);
 		spin_lock_bh(&g_polling_lock);
 		if (g_timer_pool_count < OZ_MAX_TIMER_POOL_SIZE) {
@@ -582,14 +574,8 @@
 		g_cur_timer =
 			container_of(g_timer_list.next, struct oz_timer, link);
 		if (g_timer_state == OZ_TIMER_SET) {
-			oz_event_log(OZ_EVT_TIMER_CTRL, 3,
-				(u16)g_cur_timer->type, NULL,
-				(unsigned)g_cur_timer->due_time);
 			mod_timer(&g_timer, g_cur_timer->due_time);
 		} else {
-			oz_event_log(OZ_EVT_TIMER_CTRL, 4,
-				(u16)g_cur_timer->type, NULL,
-				(unsigned)g_cur_timer->due_time);
 			g_timer.expires = g_cur_timer->due_time;
 			g_timer.function = oz_protocol_timer;
 			g_timer.data = 0;
@@ -610,7 +596,6 @@
 	struct list_head *e;
 	struct oz_timer *t = NULL;
 	int restart_needed = 0;
-	oz_event_log(OZ_EVT_TIMER_CTRL, 1, (u16)type, NULL, (unsigned)due_time);
 	spin_lock(&g_polling_lock);
 	if (remove) {
 		list_for_each(e, &g_timer_list) {
@@ -673,7 +658,6 @@
 	struct oz_timer *n;
 	int restart_needed = 0;
 	int release = 0;
-	oz_event_log(OZ_EVT_TIMER_CTRL, 2, (u16)type, NULL, 0);
 	spin_lock(&g_polling_lock);
 	list_for_each_entry_safe(t, n, &g_timer_list, link) {
 		if ((t->pd == pd) && ((type == 0) || (t->type == type))) {
@@ -770,7 +754,6 @@
 static int oz_pkt_recv(struct sk_buff *skb, struct net_device *dev,
 		struct packet_type *pt, struct net_device *orig_dev)
 {
-	oz_event_log(OZ_EVT_RX_FRAME, 0, 0, NULL, 0);
 	skb = skb_share_check(skb, GFP_ATOMIC);
 	if (skb == NULL)
 		return 0;

diff --git a/drivers/staging/ozwpan/ozusbsvc.c b/drivers/staging/ozwpan/ozusbsvc.c
index 543a941..1676328 100644
--- a/drivers/staging/ozwpan/ozusbsvc.c
+++ b/drivers/staging/ozwpan/ozusbsvc.c

@@ -27,14 +27,12 @@
 #include "ozhcd.h"
 #include "oztrace.h"
 #include "ozusbsvc.h"
-#include "ozevent.h"
 /*------------------------------------------------------------------------------
  * This is called once when the driver is loaded to initialise the USB service.
  * Context: process
  */
 int oz_usb_init(void)
 {
-	oz_event_log(OZ_EVT_SERVICE, 1, OZ_APPID_USB, NULL, 0);
 	return oz_hcd_init();
 }
 /*------------------------------------------------------------------------------
@@ -43,7 +41,6 @@
  */
 void oz_usb_term(void)
 {
-	oz_event_log(OZ_EVT_SERVICE, 2, OZ_APPID_USB, NULL, 0);
 	oz_hcd_term();
 }
 /*------------------------------------------------------------------------------
@@ -55,7 +52,6 @@
 	int rc = 0;
 	struct oz_usb_ctx *usb_ctx;
 	struct oz_usb_ctx *old_ctx;
-	oz_event_log(OZ_EVT_SERVICE, 3, OZ_APPID_USB, NULL, resume);
 	if (resume) {
 		oz_trace("USB service resumed.\n");
 		return 0;
@@ -117,7 +113,6 @@
 void oz_usb_stop(struct oz_pd *pd, int pause)
 {
 	struct oz_usb_ctx *usb_ctx;
-	oz_event_log(OZ_EVT_SERVICE, 4, OZ_APPID_USB, NULL, pause);
 	if (pause) {
 		oz_trace("USB service paused.\n");
 		return;

diff --git a/drivers/staging/ozwpan/ozusbsvc1.c b/drivers/staging/ozwpan/ozusbsvc1.c
index 4e4b650..16e6078 100644
--- a/drivers/staging/ozwpan/ozusbsvc1.c
+++ b/drivers/staging/ozwpan/ozusbsvc1.c

@@ -22,7 +22,6 @@
 #include "ozhcd.h"
 #include "oztrace.h"
 #include "ozusbsvc.h"
-#include "ozevent.h"
 /*------------------------------------------------------------------------------
  */
 #define MAX_ISOC_FIXED_DATA	(253-sizeof(struct oz_isoc_fixed))
@@ -190,10 +189,6 @@
 	unsigned windex = le16_to_cpu(setup->wIndex);
 	unsigned wlength = le16_to_cpu(setup->wLength);
 	int rc = 0;
-	oz_event_log(OZ_EVT_CTRL_REQ, setup->bRequest, req_id,
-		(void *)(((unsigned long)(setup->wValue))<<16 |
-			((unsigned long)setup->wIndex)),
-		setup->bRequestType);
 	if ((setup->bRequestType & USB_TYPE_MASK) == USB_TYPE_STANDARD) {
 		switch (setup->bRequest) {
 		case USB_REQ_GET_DESCRIPTOR:

diff --git a/drivers/staging/panel/panel.c b/drivers/staging/panel/panel.c
index c54df39..cbc15c1 100644
--- a/drivers/staging/panel/panel.c
+++ b/drivers/staging/panel/panel.c

@@ -1756,17 +1756,18 @@
 
 			if (input->high_timer == 0) {
 				char *press_str = input->u.kbd.press_str;
-				if (press_str[0])
-					keypad_send_key(press_str,
-							sizeof(input->u.kbd.press_str));
+				if (press_str[0]) {
+					int s = sizeof(input->u.kbd.press_str);
+					keypad_send_key(press_str, s);
+				}
 			}
 
 			if (input->u.kbd.repeat_str[0]) {
 				char *repeat_str = input->u.kbd.repeat_str;
 				if (input->high_timer >= KEYPAD_REP_START) {
+					int s = sizeof(input->u.kbd.repeat_str);
 					input->high_timer -= KEYPAD_REP_DELAY;
-					keypad_send_key(repeat_str,
-							sizeof(input->u.kbd.repeat_str));
+					keypad_send_key(repeat_str, s);
 				}
 				/* we will need to come back here soon */
 				inputs_stable = 0;
@@ -1802,10 +1803,11 @@
 
 			if (input->u.kbd.repeat_str[0]) {
 				char *repeat_str = input->u.kbd.repeat_str;
-				if (input->high_timer >= KEYPAD_REP_START)
+				if (input->high_timer >= KEYPAD_REP_START) {
+					int s = sizeof(input->u.kbd.repeat_str);
 					input->high_timer -= KEYPAD_REP_DELAY;
-					keypad_send_key(repeat_str,
-							sizeof(input->u.kbd.repeat_str));
+					keypad_send_key(repeat_str, s);
+				}
 				/* we will need to come back here soon */
 				inputs_stable = 0;
 			}
@@ -1822,9 +1824,10 @@
 				release_fct(input->u.std.release_data);
 		} else if (input->type == INPUT_TYPE_KBD) {
 			char *release_str = input->u.kbd.release_str;
-			if (release_str[0])
-				keypad_send_key(release_str,
-						sizeof(input->u.kbd.release_str));
+			if (release_str[0]) {
+				int s = sizeof(input->u.kbd.release_str);
+				keypad_send_key(release_str, s);
+			}
 		}
 
 		input->state = INPUT_ST_LOW;

diff --git a/drivers/staging/rtl8187se/ieee80211/ieee80211_rx.c b/drivers/staging/rtl8187se/ieee80211/ieee80211_rx.c
index e303159..d5df0d6 100644
--- a/drivers/staging/rtl8187se/ieee80211/ieee80211_rx.c
+++ b/drivers/staging/rtl8187se/ieee80211/ieee80211_rx.c

@@ -399,8 +399,8 @@
 		struct ieee_ibss_seq *entry = NULL;
 		u8 *mac = header->addr2;
 		int index = mac[5] % IEEE_IBSS_MAC_HASH_SIZE;
-		//for (pos = (head)->next; pos != (head); pos = pos->next)
-		__list_for_each(p, &ieee->ibss_mac_hash[index]) {
+
+		list_for_each(p, &ieee->ibss_mac_hash[index]) {
 			entry = list_entry(p, struct ieee_ibss_seq, list);
 			if (!memcmp(entry->mac, mac, ETH_ALEN))
 				break;

diff --git a/drivers/staging/rtl8192e/rtl8192e/r8192E_cmdpkt.c b/drivers/staging/rtl8192e/rtl8192e/r8192E_cmdpkt.c
index ea91744..5f10e40 100644
--- a/drivers/staging/rtl8192e/rtl8192e/r8192E_cmdpkt.c
+++ b/drivers/staging/rtl8192e/rtl8192e/r8192E_cmdpkt.c

@@ -20,20 +20,7 @@
 #include "rtl_core.h"
 #include "r8192E_hw.h"
 #include "r8192E_cmdpkt.h"
-/*---------------------------Define Local Constant---------------------------*/
-/* Debug constant*/
-#define		CMPK_DEBOUNCE_CNT			1
-#define		CMPK_PRINT(Address)\
-{\
-	unsigned char	i;\
-	u32	temp[10];\
-	\
-	memcpy(temp, Address, 40);\
-	for (i = 0; i < 40; i += 4)\
-		printk(KERN_INFO "\r\n %08x", temp[i]);\
-}
 
-/*---------------------------Define functions---------------------------------*/
 bool cmpk_message_handle_tx(
 	struct net_device *dev,
 	u8	*code_virtual_address,
@@ -100,7 +87,7 @@
 	write_nic_byte(dev, TPPoll, TPPoll_CQ);
 Failed:
 	return rt_status;
-}	/* CMPK_Message_Handle_Tx */
+}
 
 static	void
 cmpk_count_txstatistic(
@@ -149,23 +136,19 @@
 
 	priv->stats.txretrycount += pstx_fb->retry_cnt;
 	priv->stats.txfeedbackretry += pstx_fb->retry_cnt;
-
-}	/* cmpk_CountTxStatistic */
-
-
+}
 
 static void cmpk_handle_tx_feedback(struct net_device *dev, u8 *pmsg)
 {
 	struct r8192_priv *priv = rtllib_priv(dev);
-	struct cmpk_txfb rx_tx_fb;	/* */
+	struct cmpk_txfb rx_tx_fb;
 
 	priv->stats.txfeedback++;
 
 
 	memcpy((u8 *)&rx_tx_fb, pmsg, sizeof(struct cmpk_txfb));
 	cmpk_count_txstatistic(dev, &rx_tx_fb);
-
-}	/* cmpk_Handle_Tx_Feedback */
+}
 
 static void cmdpkt_beacontimerinterrupt_819xusb(struct net_device *dev)
 {
@@ -182,7 +165,6 @@
 		tx_rate = 10;
 		DMESG("send beacon frame  tx rate is 1Mbpm\n");
 	}
-
 }
 
 static void cmpk_handle_interrupt_status(struct net_device *dev, u8 *pmsg)
@@ -192,14 +174,12 @@
 
 	DMESG("---> cmpk_Handle_Interrupt_Status()\n");
 
-
 	rx_intr_status.length = pmsg[1];
 	if (rx_intr_status.length != (sizeof(struct cmpk_intr_sta) - 2)) {
 		DMESG("cmpk_Handle_Interrupt_Status: wrong length!\n");
 		return;
 	}
 
-
 	if (priv->rtllib->iw_mode == IW_MODE_ADHOC) {
 		rx_intr_status.interrupt_status = *((u32 *)(pmsg + 4));
 
@@ -220,12 +200,11 @@
 
 	DMESG("<---- cmpk_handle_interrupt_status()\n");
 
-}	/* cmpk_handle_interrupt_status */
-
+}
 
 static	void cmpk_handle_query_config_rx(struct net_device *dev, u8 *pmsg)
 {
-	cmpk_query_cfg_t	rx_query_cfg;	/* */
+	cmpk_query_cfg_t	rx_query_cfg;
 
 
 	rx_query_cfg.cfg_action = (pmsg[4] & 0x80000000)>>31;
@@ -238,8 +217,7 @@
 	rx_query_cfg.mask = (pmsg[12] << 24) | (pmsg[13] << 16) |
 			    (pmsg[14] << 8) | (pmsg[15] << 0);
 
-}	/* cmpk_Handle_Query_Config_Rx */
-
+}
 
 static void cmpk_count_tx_status(struct net_device *dev,
 				 struct cmpk_tx_status *pstx_status)
@@ -280,13 +258,11 @@
 	priv->stats.txbytesunicast		+= pstx_status->txuclength;
 
 	priv->stats.last_packet_rate		= pstx_status->rate;
-}	/* cmpk_CountTxStatus */
-
-
+}
 
 static	void cmpk_handle_tx_status(struct net_device *dev, u8 *pmsg)
 {
-	struct cmpk_tx_status rx_tx_sts;	/* */
+	struct cmpk_tx_status rx_tx_sts;
 
 	memcpy((void *)&rx_tx_sts, (void *)pmsg, sizeof(struct cmpk_tx_status));
 	cmpk_count_tx_status(dev, &rx_tx_sts);
@@ -300,7 +276,6 @@
 	u32 *ptemp;
 	struct r8192_priv *priv = rtllib_priv(dev);
 
-
 #ifdef ENABLE_PS
 	pAdapter->HalFunc.GetHwRegHandler(pAdapter, HW_VAR_RF_STATE,
 					 (pu1Byte)(&rtState));
@@ -335,10 +310,8 @@
 			priv->stats.txrate.ht_mcs[j][i] +=
 							 ptxrate->ht_mcs[j][i];
 	}
-
 }
 
-
 u32 cmpk_message_handle_rx(struct net_device *dev,
 			   struct rtllib_rx_stats *pstats)
 {
@@ -349,12 +322,8 @@
 
 	RT_TRACE(COMP_CMDPKT, "---->cmpk_message_handle_rx()\n");
 
-	if (pstats == NULL) {
-		/* Print error message. */
-		/*RT_TRACE(COMP_SEND, DebugLevel,
-				("\n\r[CMPK]-->Err queue id or pointer"));*/
+	if (pstats == NULL)
 		return 0;
-	}
 
 	total_length = pstats->Length;
 

diff --git a/drivers/staging/rtl8192e/rtl8192e/r8192E_dev.c b/drivers/staging/rtl8192e/rtl8192e/r8192E_dev.c
index a9d78e9..50c7bb7 100644
--- a/drivers/staging/rtl8192e/rtl8192e/r8192E_dev.c
+++ b/drivers/staging/rtl8192e/rtl8192e/r8192E_dev.c

@@ -2128,10 +2128,11 @@
 	struct rtllib_device *ieee = priv->rtllib;
 	u8 *pMcsRate = ieee->dot11HTOperationalRateSet;
 	u32 ratr_value = 0;
+	u16 rate_config = 0;
 	u8 rate_index = 0;
 
-	rtl8192_config_rate(dev, (u16 *)(&ratr_value));
-	ratr_value |= (*(u16 *)(pMcsRate)) << 12;
+	rtl8192_config_rate(dev, &rate_config);
+	ratr_value = rate_config | *pMcsRate << 12;
 	switch (ieee->mode) {
 	case IEEE_A:
 		ratr_value &= 0x00000FF0;

diff --git a/drivers/staging/rtl8192e/rtllib_rx.c b/drivers/staging/rtl8192e/rtllib_rx.c
index 8b8a5c6..e75364e 100644
--- a/drivers/staging/rtl8192e/rtllib_rx.c
+++ b/drivers/staging/rtl8192e/rtllib_rx.c

@@ -1822,7 +1822,7 @@
 				network->rates_ex[i] = info_element->data[i];
 				p += snprintf(p, sizeof(rates_str) -
 					      (p - rates_str), "%02X ",
-					      network->rates[i]);
+					      network->rates_ex[i]);
 				if (rtllib_is_ofdm_rate
 				    (info_element->data[i])) {
 					network->flags |= NETWORK_HAS_OFDM;

diff --git a/drivers/staging/rtl8192e/rtllib_softmac.c b/drivers/staging/rtl8192e/rtllib_softmac.c
index 4feecec..aefffac 100644
--- a/drivers/staging/rtl8192e/rtllib_softmac.c
+++ b/drivers/staging/rtl8192e/rtllib_softmac.c

@@ -1801,8 +1801,9 @@
 
 		if (*(t++) == MFIE_TYPE_CHALLENGE) {
 			*chlen = *(t++);
-			*challenge = kmalloc(*chlen, GFP_ATOMIC);
-			memcpy(*challenge, t, *chlen);	/*TODO - check here*/
+			*challenge = kmemdup(t, *chlen, GFP_ATOMIC);
+			if (!*challenge)
+				return -ENOMEM;
 		}
 	}
 	return cpu_to_le16(a->status);

diff --git a/drivers/staging/rtl8192u/ieee80211/dot11d.c b/drivers/staging/rtl8192u/ieee80211/dot11d.c
index f10fd5a..34edcfa 100644
--- a/drivers/staging/rtl8192u/ieee80211/dot11d.c
+++ b/drivers/staging/rtl8192u/ieee80211/dot11d.c

@@ -67,9 +67,9 @@
 void
 Dot11d_UpdateCountryIe(
 	struct ieee80211_device *dev,
-	u8 *		pTaddr,
+	u8 *pTaddr,
 	u16	CoutryIeLen,
-	u8 * pCoutryIe
+	u8 *pCoutryIe
 	)
 {
 	PRT_DOT11D_INFO pDot11dInfo = GET_DOT11D_INFO(dev);
@@ -101,7 +101,7 @@
 			MaxChnlNum = pTriple->FirstChnl + j;
 		}
 
-		pTriple = (PCHNL_TXPOWER_TRIPLE)((u8*)pTriple + 3);
+		pTriple = (PCHNL_TXPOWER_TRIPLE)((u8 *)pTriple + 3);
 	}
 	//printk("Dot11d_UpdateCountryIe(): Channel List:\n");
 	printk("Channel List:");
@@ -143,12 +143,12 @@
 
 void
 DOT11D_ScanComplete(
-	struct ieee80211_device * dev
+	struct ieee80211_device *dev
 	)
 {
 	PRT_DOT11D_INFO pDot11dInfo = GET_DOT11D_INFO(dev);
 
-	switch(pDot11dInfo->State)
+	switch (pDot11dInfo->State)
 	{
 	case DOT11D_STATE_LEARNED:
 		pDot11dInfo->State = DOT11D_STATE_DONE;
@@ -166,7 +166,7 @@
 }
 
 int IsLegalChannel(
-	struct ieee80211_device * dev,
+	struct ieee80211_device *dev,
 	u8 channel
 )
 {
@@ -183,7 +183,7 @@
 }
 
 int ToLegalChannel(
-	struct ieee80211_device * dev,
+	struct ieee80211_device *dev,
 	u8 channel
 )
 {

diff --git a/drivers/staging/rtl8192u/ieee80211/dot11d.h b/drivers/staging/rtl8192u/ieee80211/dot11d.h
index 54f2b4c..6aa8c15 100644
--- a/drivers/staging/rtl8192u/ieee80211/dot11d.h
+++ b/drivers/staging/rtl8192u/ieee80211/dot11d.h

@@ -71,9 +71,9 @@
 void
 Dot11d_UpdateCountryIe(
 	struct ieee80211_device *dev,
-	u8 *		pTaddr,
+	u8 *pTaddr,
 	u16	CoutryIeLen,
-	u8 * pCoutryIe
+	u8 *pCoutryIe
 	);
 
 u8
@@ -84,16 +84,16 @@
 
 void
 DOT11D_ScanComplete(
-	struct ieee80211_device * dev
+	struct ieee80211_device *dev
 	);
 
 int IsLegalChannel(
-	struct ieee80211_device * dev,
+	struct ieee80211_device *dev,
 	u8 channel
 );
 
 int ToLegalChannel(
-	struct ieee80211_device * dev,
+	struct ieee80211_device *dev,
 	u8 channel
 );
 #endif // #ifndef __INC_DOT11D_H

diff --git a/drivers/staging/rtl8192u/ieee80211/ieee80211.h b/drivers/staging/rtl8192u/ieee80211/ieee80211.h
index 210898c..c9f3bb3 100644
--- a/drivers/staging/rtl8192u/ieee80211/ieee80211.h
+++ b/drivers/staging/rtl8192u/ieee80211/ieee80211.h

@@ -493,8 +493,8 @@
 #define IsDataFrame(pdu)			( ((pdu[0] & 0x0C)==0x08) ? true : false )
 #define	IsLegacyDataFrame(pdu)	(IsDataFrame(pdu) && (!(pdu[0]&FC_QOS_BIT)) )
 //added by wb. Is this right?
-#define IsQoSDataFrame(pframe)  ((*(u16*)pframe&(IEEE80211_STYPE_QOS_DATA|IEEE80211_FTYPE_DATA)) == (IEEE80211_STYPE_QOS_DATA|IEEE80211_FTYPE_DATA))
-#define Frame_Order(pframe)     (*(u16*)pframe&IEEE80211_FCTL_ORDER)
+#define IsQoSDataFrame(pframe)  ((*(u16 *)pframe&(IEEE80211_STYPE_QOS_DATA|IEEE80211_FTYPE_DATA)) == (IEEE80211_STYPE_QOS_DATA|IEEE80211_FTYPE_DATA))
+#define Frame_Order(pframe)     (*(u16 *)pframe&IEEE80211_FCTL_ORDER)
 #define SN_LESS(a, b)		(((a-b)&0x800)!=0)
 #define SN_EQUAL(a, b)	(a == b)
 #define MAX_DEV_ADDR_SIZE 8
@@ -538,7 +538,7 @@
 	do{ if ((ieee80211_debug_level & (level)) == (level))	\
 		{	\
 			int i;					\
-			u8* pdata = (u8*) data;			\
+			u8 *pdata = (u8 *) data;			\
 			printk(KERN_DEBUG "ieee80211: %s()\n", __FUNCTION__);	\
 			for(i=0; i<(int)(datalen); i++)			\
 			{						\
@@ -914,7 +914,7 @@
 	bool      bIsCCK;
 	bool      bPacketToSelf;
 	//added by amy
-	u8*       virtual_address;
+	u8        *virtual_address;
 	u16          packetlength;              // Total packet length: Must equal to sum of all FragLength
 	u16          fraglength;                        // FragLength should equal to PacketLength in non-fragment case
 	u16          fragoffset;                        // Data offset for this fragment
@@ -1366,13 +1366,13 @@
 	return ((u32)type >= ARRAY_SIZE(eap_types)) ? "Unknown" : eap_types[type];
 }
 //added by amy for reorder
-static inline u8 Frame_QoSTID(u8* buf)
+static inline u8 Frame_QoSTID(u8 *buf)
 {
 	struct ieee80211_hdr_3addr *hdr;
 	u16 fc;
 	hdr = (struct ieee80211_hdr_3addr *)buf;
 	fc = le16_to_cpu(hdr->frame_ctl);
-	return (u8)((frameqos*)(buf + (((fc & IEEE80211_FCTL_TODS)&&(fc & IEEE80211_FCTL_FROMDS))? 30 : 24)))->field.tid;
+	return (u8)((frameqos *)(buf + (((fc & IEEE80211_FCTL_TODS)&&(fc & IEEE80211_FCTL_FROMDS))? 30 : 24)))->field.tid;
 }
 
 //added by amy for reorder
@@ -1670,7 +1670,7 @@
 typedef struct _RX_REORDER_ENTRY {
 	struct list_head	List;
 	u16			SeqNum;
-	struct ieee80211_rxb* prxb;
+	struct ieee80211_rxb *prxb;
 } RX_REORDER_ENTRY, *PRX_REORDER_ENTRY;
 //added by amy for order
 typedef enum _Fsync_State{
@@ -1965,7 +1965,7 @@
 
 	/* map of allowed channels. 0 is dummy */
 	// FIXME: remember to default to a basic channel plan depending of the PHY type
-	void* pDot11dInfo;
+	void *pDot11dInfo;
 	bool bGlobalDomain;
 	int rate;       /* current rate */
 	int basic_rate;
@@ -2107,10 +2107,10 @@
 			       struct net_device *dev);
 
 	int (*reset_port)(struct net_device *dev);
-	int (*is_queue_full) (struct net_device * dev, int pri);
+	int (*is_queue_full) (struct net_device *dev, int pri);
 
-	int (*handle_management) (struct net_device * dev,
-				  struct ieee80211_network * network, u16 type);
+	int (*handle_management) (struct net_device *dev,
+				  struct ieee80211_network *network, u16 type);
 	int (*is_qos_active) (struct net_device *dev, struct sk_buff *skb);
 
 	/* Softmac-generated frames (management) are TXed via this
@@ -2187,8 +2187,8 @@
 	void (*ps_request_tx_ack) (struct net_device *dev);
 	void (*enter_sleep_state) (struct net_device *dev, u32 th, u32 tl);
 	short (*ps_is_queue_empty) (struct net_device *dev);
-	int (*handle_beacon) (struct net_device * dev, struct ieee80211_beacon * beacon, struct ieee80211_network * network);
-	int (*handle_assoc_response) (struct net_device * dev, struct ieee80211_assoc_response_frame * resp, struct ieee80211_network * network);
+	int (*handle_beacon) (struct net_device *dev, struct ieee80211_beacon *beacon, struct ieee80211_network *network);
+	int (*handle_assoc_response) (struct net_device *dev, struct ieee80211_assoc_response_frame *resp, struct ieee80211_network *network);
 
 
 	/* check whether Tx hw resource available */
@@ -2197,9 +2197,9 @@
 //	void (*SwChnlByTimerHandler)(struct net_device *dev, int channel);
 	void (*SetBWModeHandler)(struct net_device *dev, HT_CHANNEL_WIDTH Bandwidth, HT_EXTCHNL_OFFSET Offset);
 //	void (*UpdateHalRATRTableHandler)(struct net_device* dev, u8* pMcsRate);
-	bool (*GetNmodeSupportBySecCfg)(struct net_device* dev);
-	void (*SetWirelessMode)(struct net_device* dev, u8 wireless_mode);
-	bool (*GetHalfNmodeSupportByAPsHandler)(struct net_device* dev);
+	bool (*GetNmodeSupportBySecCfg)(struct net_device *dev);
+	void (*SetWirelessMode)(struct net_device *dev, u8 wireless_mode);
+	bool (*GetHalfNmodeSupportByAPsHandler)(struct net_device *dev);
 	void (*InitialGainHandler)(struct net_device *dev, u8 Operation);
 
 	/* This must be the last item so that it points to the data
@@ -2401,10 +2401,10 @@
 #if WIRELESS_EXT >= 18
 extern int ieee80211_wx_get_encode_ext(struct ieee80211_device *ieee,
 			    struct iw_request_info *info,
-			    union iwreq_data* wrqu, char *extra);
+			    union iwreq_data *wrqu, char *extra);
 extern int ieee80211_wx_set_encode_ext(struct ieee80211_device *ieee,
 			    struct iw_request_info *info,
-			    union iwreq_data* wrqu, char *extra);
+			    union iwreq_data *wrqu, char *extra);
 extern int ieee80211_wx_set_auth(struct ieee80211_device *ieee,
 			       struct iw_request_info *info,
 			       struct iw_param *data, char *extra);
@@ -2422,7 +2422,7 @@
 			u16 stype);
 extern void ieee80211_softmac_new_net(struct ieee80211_device *ieee, struct ieee80211_network *net);
 
-void SendDisassociation(struct ieee80211_device *ieee, u8* asSta, u8 asRsn);
+void SendDisassociation(struct ieee80211_device *ieee, u8 *asSta, u8 asRsn);
 extern void ieee80211_softmac_xmit(struct ieee80211_txb *txb, struct ieee80211_device *ieee);
 
 extern void ieee80211_stop_send_beacons(struct ieee80211_device *ieee);
@@ -2528,52 +2528,52 @@
 			     union iwreq_data *wrqu, char *extra);
 //HT
 #define MAX_RECEIVE_BUFFER_SIZE 9100  //
-extern void HTDebugHTCapability(u8* CapIE, u8* TitleString );
-extern void HTDebugHTInfo(u8*  InfoIE, u8* TitleString);
+extern void HTDebugHTCapability(u8 *CapIE, u8 *TitleString );
+extern void HTDebugHTInfo(u8 *InfoIE, u8 *TitleString);
 
-void HTSetConnectBwMode(struct ieee80211_device* ieee, HT_CHANNEL_WIDTH Bandwidth, HT_EXTCHNL_OFFSET    Offset);
-extern void HTUpdateDefaultSetting(struct ieee80211_device* ieee);
-extern void HTConstructCapabilityElement(struct ieee80211_device* ieee, u8* posHTCap, u8* len, u8 isEncrypt);
-extern void HTConstructInfoElement(struct ieee80211_device* ieee, u8* posHTInfo, u8* len, u8 isEncrypt);
-extern void HTConstructRT2RTAggElement(struct ieee80211_device* ieee, u8* posRT2RTAgg, u8* len);
+void HTSetConnectBwMode(struct ieee80211_device *ieee, HT_CHANNEL_WIDTH Bandwidth, HT_EXTCHNL_OFFSET    Offset);
+extern void HTUpdateDefaultSetting(struct ieee80211_device *ieee);
+extern void HTConstructCapabilityElement(struct ieee80211_device *ieee, u8 *posHTCap, u8 *len, u8 isEncrypt);
+extern void HTConstructInfoElement(struct ieee80211_device *ieee, u8 *posHTInfo, u8 *len, u8 isEncrypt);
+extern void HTConstructRT2RTAggElement(struct ieee80211_device *ieee, u8 *posRT2RTAgg, u8 *len);
 extern void HTOnAssocRsp(struct ieee80211_device *ieee);
-extern void HTInitializeHTInfo(struct ieee80211_device* ieee);
+extern void HTInitializeHTInfo(struct ieee80211_device *ieee);
 extern void HTInitializeBssDesc(PBSS_HT pBssHT);
-extern void HTResetSelfAndSavePeerSetting(struct ieee80211_device* ieee, struct ieee80211_network * pNetwork);
-extern void HTUpdateSelfAndPeerSetting(struct ieee80211_device* ieee,   struct ieee80211_network * pNetwork);
-extern u8 HTGetHighestMCSRate(struct ieee80211_device* ieee, u8* pMCSRateSet, u8* pMCSFilter);
+extern void HTResetSelfAndSavePeerSetting(struct ieee80211_device *ieee, struct ieee80211_network *pNetwork);
+extern void HTUpdateSelfAndPeerSetting(struct ieee80211_device *ieee,   struct ieee80211_network *pNetwork);
+extern u8 HTGetHighestMCSRate(struct ieee80211_device *ieee, u8 *pMCSRateSet, u8 *pMCSFilter);
 extern u8 MCS_FILTER_ALL[];
 extern u16 MCS_DATA_RATE[2][2][77] ;
-extern u8 HTCCheck(struct ieee80211_device* ieee, u8*   pFrame);
+extern u8 HTCCheck(struct ieee80211_device *ieee, u8 *pFrame);
 //extern void HTSetConnectBwModeCallback(unsigned long data);
 extern void HTResetIOTSetting(PRT_HIGH_THROUGHPUT  pHTInfo);
-extern bool IsHTHalfNmodeAPs(struct ieee80211_device* ieee);
-extern u16 HTHalfMcsToDataRate(struct ieee80211_device* ieee,  u8      nMcsRate);
-extern u16 HTMcsToDataRate( struct ieee80211_device* ieee, u8 nMcsRate);
-extern u16  TxCountToDataRate( struct ieee80211_device* ieee, u8 nDataRate);
+extern bool IsHTHalfNmodeAPs(struct ieee80211_device *ieee);
+extern u16 HTHalfMcsToDataRate(struct ieee80211_device *ieee,  u8      nMcsRate);
+extern u16 HTMcsToDataRate( struct ieee80211_device *ieee, u8 nMcsRate);
+extern u16  TxCountToDataRate( struct ieee80211_device *ieee, u8 nDataRate);
 //function in BAPROC.c
-extern int ieee80211_rx_ADDBAReq( struct ieee80211_device* ieee, struct sk_buff *skb);
-extern int ieee80211_rx_ADDBARsp( struct ieee80211_device* ieee, struct sk_buff *skb);
-extern int ieee80211_rx_DELBA(struct ieee80211_device* ieee,struct sk_buff *skb);
-extern void TsInitAddBA( struct ieee80211_device* ieee, PTX_TS_RECORD   pTS, u8 Policy, u8 bOverwritePending);
-extern void TsInitDelBA( struct ieee80211_device* ieee, PTS_COMMON_INFO pTsCommonInfo, TR_SELECT TxRxSelect);
+extern int ieee80211_rx_ADDBAReq( struct ieee80211_device *ieee, struct sk_buff *skb);
+extern int ieee80211_rx_ADDBARsp( struct ieee80211_device *ieee, struct sk_buff *skb);
+extern int ieee80211_rx_DELBA(struct ieee80211_device *ieee,struct sk_buff *skb);
+extern void TsInitAddBA( struct ieee80211_device *ieee, PTX_TS_RECORD   pTS, u8 Policy, u8 bOverwritePending);
+extern void TsInitDelBA( struct ieee80211_device *ieee, PTS_COMMON_INFO pTsCommonInfo, TR_SELECT TxRxSelect);
 extern void BaSetupTimeOut(unsigned long data);
 extern void TxBaInactTimeout(unsigned long data);
 extern void RxBaInactTimeout(unsigned long data);
 extern void ResetBaEntry( PBA_RECORD pBA);
 //function in TS.c
 extern bool GetTs(
-	struct ieee80211_device*        ieee,
+	struct ieee80211_device		*ieee,
 	PTS_COMMON_INFO                 *ppTS,
-	u8*                             Addr,
+	u8                              *Addr,
 	u8                              TID,
 	TR_SELECT                       TxRxSelect,  //Rx:1, Tx:0
 	bool                            bAddNewTs
 	);
 extern void TSInitialize(struct ieee80211_device *ieee);
-extern  void TsStartAddBaProcess(struct ieee80211_device* ieee, PTX_TS_RECORD   pTxTS);
-extern void RemovePeerTS(struct ieee80211_device* ieee, u8* Addr);
-extern void RemoveAllTS(struct ieee80211_device* ieee);
+extern  void TsStartAddBaProcess(struct ieee80211_device *ieee, PTX_TS_RECORD   pTxTS);
+extern void RemovePeerTS(struct ieee80211_device *ieee, u8 *Addr);
+extern void RemoveAllTS(struct ieee80211_device *ieee);
 void ieee80211_softmac_scan_syncro(struct ieee80211_device *ieee);
 
 extern const long ieee80211_wlan_frequencies[];
@@ -2623,6 +2623,6 @@
 		struct ieee80211_network *network,
 		struct ieee80211_rx_stats *stats);
 
-void ieee80211_indicate_packets(struct ieee80211_device *ieee, struct ieee80211_rxb** prxbIndicateArray,u8  index);
+void ieee80211_indicate_packets(struct ieee80211_device *ieee, struct ieee80211_rxb **prxbIndicateArray,u8  index);
 #define RT_ASOC_RETRY_LIMIT	5
 #endif /* IEEE80211_H */

diff --git a/drivers/staging/rtl8192u/ieee80211/ieee80211_crypt.c b/drivers/staging/rtl8192u/ieee80211/ieee80211_crypt.c
index a464d11..5533221 100644
--- a/drivers/staging/rtl8192u/ieee80211/ieee80211_crypt.c
+++ b/drivers/staging/rtl8192u/ieee80211/ieee80211_crypt.c

@@ -155,7 +155,7 @@
 }
 
 
-struct ieee80211_crypto_ops * ieee80211_get_crypto_ops(const char *name)
+struct ieee80211_crypto_ops *ieee80211_get_crypto_ops(const char *name)
 {
 	unsigned long flags;
 	struct list_head *ptr;
@@ -182,7 +182,7 @@
 }
 
 
-static void * ieee80211_crypt_null_init(int keyidx) { return (void *) 1; }
+static void *ieee80211_crypt_null_init(int keyidx) { return (void *) 1; }
 static void ieee80211_crypt_null_deinit(void *priv) {}
 
 static struct ieee80211_crypto_ops ieee80211_crypt_null = {

diff --git a/drivers/staging/rtl8192u/ieee80211/ieee80211_crypt.h b/drivers/staging/rtl8192u/ieee80211/ieee80211_crypt.h
index b58a3bc..0b4ea43 100644
--- a/drivers/staging/rtl8192u/ieee80211/ieee80211_crypt.h
+++ b/drivers/staging/rtl8192u/ieee80211/ieee80211_crypt.h

@@ -77,7 +77,7 @@
 
 int ieee80211_register_crypto_ops(struct ieee80211_crypto_ops *ops);
 int ieee80211_unregister_crypto_ops(struct ieee80211_crypto_ops *ops);
-struct ieee80211_crypto_ops * ieee80211_get_crypto_ops(const char *name);
+struct ieee80211_crypto_ops *ieee80211_get_crypto_ops(const char *name);
 void ieee80211_crypt_deinit_entries(struct ieee80211_device *, int);
 void ieee80211_crypt_deinit_handler(unsigned long);
 void ieee80211_crypt_delayed_deinit(struct ieee80211_device *ieee,

diff --git a/drivers/staging/rtl8192u/ieee80211/ieee80211_crypt_ccmp.c b/drivers/staging/rtl8192u/ieee80211/ieee80211_crypt_ccmp.c
index fec0176..f2b1677 100644
--- a/drivers/staging/rtl8192u/ieee80211/ieee80211_crypt_ccmp.c
+++ b/drivers/staging/rtl8192u/ieee80211/ieee80211_crypt_ccmp.c

@@ -60,10 +60,10 @@
 void ieee80211_ccmp_aes_encrypt(struct crypto_tfm *tfm,
 			     const u8 pt[16], u8 ct[16])
 {
-	crypto_cipher_encrypt_one((void*)tfm, ct, pt);
+	crypto_cipher_encrypt_one((void *)tfm, ct, pt);
 }
 
-static void * ieee80211_ccmp_init(int key_idx)
+static void *ieee80211_ccmp_init(int key_idx)
 {
 	struct ieee80211_ccmp_data *priv;
 
@@ -72,7 +72,7 @@
 		goto fail;
 	priv->key_idx = key_idx;
 
-       priv->tfm = (void*)crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC);
+       priv->tfm = (void *)crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC);
 	if (IS_ERR(priv->tfm)) {
 		printk(KERN_DEBUG "ieee80211_crypt_ccmp: could not allocate "
 		       "crypto API aes\n");
@@ -85,7 +85,7 @@
 fail:
 	if (priv) {
 		if (priv->tfm)
-			crypto_free_cipher((void*)priv->tfm);
+			crypto_free_cipher((void *)priv->tfm);
 		kfree(priv);
 	}
 
@@ -98,7 +98,7 @@
 	struct ieee80211_ccmp_data *_priv = priv;
 
 	if (_priv && _priv->tfm)
-		crypto_free_cipher((void*)_priv->tfm);
+		crypto_free_cipher((void *)_priv->tfm);
 	kfree(priv);
 }
 
@@ -393,7 +393,7 @@
 			data->rx_pn[4] = seq[1];
 			data->rx_pn[5] = seq[0];
 		}
-		crypto_cipher_setkey((void*)data->tfm, data->key, CCMP_TK_LEN);
+		crypto_cipher_setkey((void *)data->tfm, data->key, CCMP_TK_LEN);
 	} else if (len == 0)
 		data->key_set = 0;
 	else
@@ -427,7 +427,7 @@
 }
 
 
-static char * ieee80211_ccmp_print_stats(char *p, void *priv)
+static char *ieee80211_ccmp_print_stats(char *p, void *priv)
 {
 	struct ieee80211_ccmp_data *ccmp = priv;
 	p += sprintf(p, "key[%d] alg=CCMP key_set=%d "

diff --git a/drivers/staging/rtl8192u/ieee80211/ieee80211_crypt_tkip.c b/drivers/staging/rtl8192u/ieee80211/ieee80211_crypt_tkip.c
index 555eb80..93121b4 100644
--- a/drivers/staging/rtl8192u/ieee80211/ieee80211_crypt_tkip.c
+++ b/drivers/staging/rtl8192u/ieee80211/ieee80211_crypt_tkip.c

@@ -62,7 +62,7 @@
 	u8 rx_hdr[16], tx_hdr[16];
 };
 
-static void * ieee80211_tkip_init(int key_idx)
+static void *ieee80211_tkip_init(int key_idx)
 {
 	struct ieee80211_tkip_data *priv;
 
@@ -499,8 +499,8 @@
 	return keyidx;
 }
 
-static int michael_mic(struct crypto_hash *tfm_michael, u8 * key, u8 * hdr,
-		       u8 * data, size_t data_len, u8 * mic)
+static int michael_mic(struct crypto_hash *tfm_michael, u8 *key, u8 *hdr,
+		       u8 *data, size_t data_len, u8 *mic)
 {
 	struct hash_desc desc;
 	struct scatterlist sg[2];
@@ -718,7 +718,7 @@
 }
 
 
-static char * ieee80211_tkip_print_stats(char *p, void *priv)
+static char *ieee80211_tkip_print_stats(char *p, void *priv)
 {
 	struct ieee80211_tkip_data *tkip = priv;
 	p += sprintf(p, "key[%d] alg=TKIP key_set=%d "

diff --git a/drivers/staging/rtl8192u/ieee80211/ieee80211_crypt_wep.c b/drivers/staging/rtl8192u/ieee80211/ieee80211_crypt_wep.c
index 3801f12..f202236 100644
--- a/drivers/staging/rtl8192u/ieee80211/ieee80211_crypt_wep.c
+++ b/drivers/staging/rtl8192u/ieee80211/ieee80211_crypt_wep.c

@@ -38,7 +38,7 @@
 };
 
 
-static void * prism2_wep_init(int keyidx)
+static void *prism2_wep_init(int keyidx)
 {
 	struct prism2_wep_data *priv;
 
@@ -253,7 +253,7 @@
 }
 
 
-static char * prism2_wep_print_stats(char *p, void *priv)
+static char *prism2_wep_print_stats(char *p, void *priv)
 {
 	struct prism2_wep_data *wep = priv;
 	p += sprintf(p, "key[%d] alg=WEP len=%d\n",

diff --git a/drivers/staging/rtl8192u/ieee80211/ieee80211_rx.c b/drivers/staging/rtl8192u/ieee80211/ieee80211_rx.c
index ee7ce5f..a6b18409 100644
--- a/drivers/staging/rtl8192u/ieee80211/ieee80211_rx.c
+++ b/drivers/staging/rtl8192u/ieee80211/ieee80211_rx.c

@@ -218,7 +218,7 @@
 	 * this is not mandatory.... but seems that the probe
 	 * response parser uses it
 	 */
-	struct ieee80211_hdr_3addr * hdr = (struct ieee80211_hdr_3addr *)skb->data;
+	struct ieee80211_hdr_3addr *hdr = (struct ieee80211_hdr_3addr *)skb->data;
 
 	rx_stats->len = skb->len;
 	ieee80211_rx_mgt(ieee,(struct ieee80211_hdr_4addr *)skb->data,rx_stats);
@@ -336,7 +336,7 @@
 
 /* Called only as a tasklet (software IRQ), by ieee80211_rx */
 static inline int
-ieee80211_rx_frame_decrypt(struct ieee80211_device* ieee, struct sk_buff *skb,
+ieee80211_rx_frame_decrypt(struct ieee80211_device *ieee, struct sk_buff *skb,
 			   struct ieee80211_crypt_data *crypt)
 {
 	struct ieee80211_hdr_4addr *hdr;
@@ -385,7 +385,7 @@
 
 /* Called only as a tasklet (software IRQ), by ieee80211_rx */
 static inline int
-ieee80211_rx_frame_decrypt_msdu(struct ieee80211_device* ieee, struct sk_buff *skb,
+ieee80211_rx_frame_decrypt_msdu(struct ieee80211_device *ieee, struct sk_buff *skb,
 			     int keyidx, struct ieee80211_crypt_data *crypt)
 {
 	struct ieee80211_hdr_4addr *hdr;
@@ -439,7 +439,7 @@
 	  tid = UP2AC(tid);
 	  tid ++;
 	} else if(IEEE80211_QOS_HAS_SEQ(fc)) { //QoS
-	  hdr_3addrqos = (struct ieee80211_hdr_3addrqos*)header;
+	  hdr_3addrqos = (struct ieee80211_hdr_3addrqos *)header;
 	  tid = le16_to_cpu(hdr_3addrqos->qos_ctl) & IEEE80211_QCTL_TID;
 	  tid = UP2AC(tid);
 	  tid ++;
@@ -454,8 +454,7 @@
 		struct ieee_ibss_seq *entry = NULL;
 		u8 *mac = header->addr2;
 		int index = mac[5] % IEEE_IBSS_MAC_HASH_SIZE;
-		//for (pos = (head)->next; pos != (head); pos = pos->next)
-		//__list_for_each(p, &ieee->ibss_mac_hash[index]) {
+
 		list_for_each(p, &ieee->ibss_mac_hash[index]) {
 			entry = list_entry(p, struct ieee_ibss_seq, list);
 			if (!memcmp(entry->mac, mac, ETH_ALEN))
@@ -548,7 +547,7 @@
 	return true;
 }
 
-void ieee80211_indicate_packets(struct ieee80211_device *ieee, struct ieee80211_rxb** prxbIndicateArray,u8  index)
+void ieee80211_indicate_packets(struct ieee80211_device *ieee, struct ieee80211_rxb **prxbIndicateArray,u8  index)
 {
 	u8 i = 0 , j=0;
 	u16 ethertype;
@@ -557,7 +556,7 @@
 	for(j = 0; j<index; j++)
 	{
 //added by amy for reorder
-		struct ieee80211_rxb* prxb = prxbIndicateArray[j];
+		struct ieee80211_rxb *prxb = prxbIndicateArray[j];
 		for(i = 0; i<prxb->nr_subframes; i++) {
 			struct sk_buff *sub_skb = prxb->subframes[i];
 
@@ -603,13 +602,13 @@
 
 
 void RxReorderIndicatePacket( struct ieee80211_device *ieee,
-		struct ieee80211_rxb* prxb,
+		struct ieee80211_rxb *prxb,
 		PRX_TS_RECORD		pTS,
 		u16			SeqNum)
 {
 	PRT_HIGH_THROUGHPUT	pHTInfo = ieee->pHTInfo;
 	PRX_REORDER_ENTRY	pReorderEntry = NULL;
-	struct ieee80211_rxb* prxbIndicateArray[REORDER_WIN_SIZE];
+	struct ieee80211_rxb *prxbIndicateArray[REORDER_WIN_SIZE];
 	u8			WinSize = pHTInfo->RxReorderWinSize;
 	u16			WinEnd = (pTS->RxIndicateSeq + WinSize -1)%4096;
 	u8			index = 0;
@@ -774,9 +773,9 @@
 
 u8 parse_subframe(struct sk_buff *skb,
 		  struct ieee80211_rx_stats *rx_stats,
-		  struct ieee80211_rxb *rxb,u8* src,u8* dst)
+		  struct ieee80211_rxb *rxb,u8 *src,u8 *dst)
 {
-	struct ieee80211_hdr_3addr  *hdr = (struct ieee80211_hdr_3addr* )skb->data;
+	struct ieee80211_hdr_3addr  *hdr = (struct ieee80211_hdr_3addr *)skb->data;
 	u16		fc = le16_to_cpu(hdr->frame_ctl);
 
 	u16		LLCOffset= sizeof(struct ieee80211_hdr_3addr);
@@ -831,7 +830,7 @@
 		memcpy(rxb->dst,dst,ETH_ALEN);
 		while(skb->len > ETHERNET_HEADER_SIZE) {
 			/* Offset 12 denote 2 mac address */
-			nSubframe_Length = *((u16*)(skb->data + 12));
+			nSubframe_Length = *((u16 *)(skb->data + 12));
 			//==m==>change the length order
 			nSubframe_Length = (nSubframe_Length>>8) + (nSubframe_Length<<8);
 
@@ -926,7 +925,7 @@
 	int keyidx = 0;
 
 	int i;
-	struct ieee80211_rxb* rxb = NULL;
+	struct ieee80211_rxb *rxb = NULL;
 	// cheat the the hdr type
 	hdr = (struct ieee80211_hdr_4addr *)skb->data;
 	stats = &ieee->stats;
@@ -1035,9 +1034,9 @@
 			//IEEE80211_DEBUG(IEEE80211_DL_REORDER,"%s(): QOS ENABLE AND RECEIVE QOS DATA , we will get Ts, tid:%d\n",__FUNCTION__, tid);
 		if(GetTs(
 				ieee,
-				(PTS_COMMON_INFO*) &pRxTS,
+				(PTS_COMMON_INFO *) &pRxTS,
 				hdr->addr2,
-				(u8)Frame_QoSTID((u8*)(skb->data)),
+				(u8)Frame_QoSTID((u8 *)(skb->data)),
 				RX_DIR,
 				true))
 		{
@@ -1289,7 +1288,7 @@
 	{
 		TID = Frame_QoSTID(skb->data);
 		SeqNum = WLAN_GET_SEQ_SEQ(sc);
-		GetTs(ieee,(PTS_COMMON_INFO*) &pTS,hdr->addr2,TID,RX_DIR,true);
+		GetTs(ieee,(PTS_COMMON_INFO *) &pTS,hdr->addr2,TID,RX_DIR,true);
 		if(TID !=0 && TID !=3)
 		{
 			ieee->bis_any_nonbepkts = true;
@@ -1597,7 +1596,7 @@
 	struct ieee80211_device *ieee,
 	struct ieee80211_info_element *info_element,
 	struct ieee80211_network *network,
-	u8 * addr2
+	u8 *addr2
 )
 {
 	if(IS_DOT11D_ENABLE(ieee))
@@ -2275,7 +2274,7 @@
 }
 
 static inline int is_same_network(struct ieee80211_network *src,
-				  struct ieee80211_network *dst, struct ieee80211_device* ieee)
+				  struct ieee80211_network *dst, struct ieee80211_device *ieee)
 {
 	/* A network is only a duplicate if the channel, BSSID, ESSID
 	 * and the capability field (in particular IBSS and BSS) all match.

diff --git a/drivers/staging/rtl8192u/ieee80211/ieee80211_softmac.c b/drivers/staging/rtl8192u/ieee80211/ieee80211_softmac.c
index 454f889..8a0075d 100644
--- a/drivers/staging/rtl8192u/ieee80211/ieee80211_softmac.c
+++ b/drivers/staging/rtl8192u/ieee80211/ieee80211_softmac.c

@@ -688,7 +688,7 @@
 }
 
 
-static struct sk_buff* ieee80211_probe_resp(struct ieee80211_device *ieee, u8 *dest)
+static struct sk_buff *ieee80211_probe_resp(struct ieee80211_device *ieee, u8 *dest)
 {
 	u8 *tag;
 	int beacon_size;
@@ -696,7 +696,7 @@
 	struct sk_buff *skb = NULL;
 	int encrypt;
 	int atim_len,erp_len;
-	struct ieee80211_crypt_data* crypt;
+	struct ieee80211_crypt_data *crypt;
 
 	char *ssid = ieee->current_network.ssid;
 	int ssid_len = ieee->current_network.ssid_len;
@@ -705,12 +705,12 @@
 	int wpa_ie_len = ieee->wpa_ie_len;
 	u8 erpinfo_content = 0;
 
-	u8* tmp_ht_cap_buf;
+	u8 *tmp_ht_cap_buf;
 	u8 tmp_ht_cap_len=0;
-	u8* tmp_ht_info_buf;
+	u8 *tmp_ht_info_buf;
 	u8 tmp_ht_info_len=0;
 	PRT_HIGH_THROUGHPUT	pHTInfo = ieee->pHTInfo;
-	u8* tmp_generic_ie_buf=NULL;
+	u8 *tmp_generic_ie_buf=NULL;
 	u8 tmp_generic_ie_len=0;
 
 	if(rate_ex_len > 0) rate_ex_len+=2;
@@ -732,9 +732,9 @@
 	encrypt = ieee->host_encrypt && crypt && crypt->ops &&
 		((0 == strcmp(crypt->ops->name, "WEP") || wpa_ie_len));
 	//HT ralated element
-	tmp_ht_cap_buf =(u8*) &(ieee->pHTInfo->SelfHTCap);
+	tmp_ht_cap_buf =(u8 *) &(ieee->pHTInfo->SelfHTCap);
 	tmp_ht_cap_len = sizeof(ieee->pHTInfo->SelfHTCap);
-	tmp_ht_info_buf =(u8*) &(ieee->pHTInfo->SelfHTInfo);
+	tmp_ht_info_buf =(u8 *) &(ieee->pHTInfo->SelfHTInfo);
 	tmp_ht_info_len = sizeof(ieee->pHTInfo->SelfHTInfo);
 	HTConstructCapabilityElement(ieee, tmp_ht_cap_buf, &tmp_ht_cap_len,encrypt);
 	HTConstructInfoElement(ieee,tmp_ht_info_buf,&tmp_ht_info_len, encrypt);
@@ -764,7 +764,7 @@
 	if (!skb)
 		return NULL;
 	skb_reserve(skb, ieee->tx_headroom);
-	beacon_buf = (struct ieee80211_probe_response*) skb_put(skb, (beacon_size - ieee->tx_headroom));
+	beacon_buf = (struct ieee80211_probe_response *) skb_put(skb, (beacon_size - ieee->tx_headroom));
 	memcpy (beacon_buf->header.addr1, dest,ETH_ALEN);
 	memcpy (beacon_buf->header.addr2, ieee->dev->dev_addr, ETH_ALEN);
 	memcpy (beacon_buf->header.addr3, ieee->current_network.bssid, ETH_ALEN);
@@ -789,7 +789,7 @@
 	beacon_buf->info_element[0].id = MFIE_TYPE_SSID;
 	beacon_buf->info_element[0].len = ssid_len;
 
-	tag = (u8*) beacon_buf->info_element[0].data;
+	tag = (u8 *) beacon_buf->info_element[0].data;
 
 	memcpy(tag, ssid, ssid_len);
 
@@ -841,12 +841,12 @@
 }
 
 
-struct sk_buff* ieee80211_assoc_resp(struct ieee80211_device *ieee, u8 *dest)
+struct sk_buff *ieee80211_assoc_resp(struct ieee80211_device *ieee, u8 *dest)
 {
 	struct sk_buff *skb;
-	u8* tag;
+	u8 *tag;
 
-	struct ieee80211_crypt_data* crypt;
+	struct ieee80211_crypt_data *crypt;
 	struct ieee80211_assoc_response_frame *assoc;
 	short encrypt;
 
@@ -888,7 +888,7 @@
 	if (ieee->assoc_id == 0x2007) ieee->assoc_id=0;
 	else ieee->assoc_id++;
 
-	tag = (u8*) skb_put(skb, rate_len);
+	tag = (u8 *) skb_put(skb, rate_len);
 
 	ieee80211_MFIE_Brate(ieee, &tag);
 	ieee80211_MFIE_Grate(ieee, &tag);
@@ -896,7 +896,7 @@
 	return skb;
 }
 
-struct sk_buff* ieee80211_auth_resp(struct ieee80211_device *ieee,int status, u8 *dest)
+struct sk_buff *ieee80211_auth_resp(struct ieee80211_device *ieee,int status, u8 *dest)
 {
 	struct sk_buff *skb;
 	struct ieee80211_authentication *auth;
@@ -924,17 +924,17 @@
 
 }
 
-struct sk_buff* ieee80211_null_func(struct ieee80211_device *ieee,short pwr)
+struct sk_buff *ieee80211_null_func(struct ieee80211_device *ieee,short pwr)
 {
 	struct sk_buff *skb;
-	struct ieee80211_hdr_3addr* hdr;
+	struct ieee80211_hdr_3addr *hdr;
 
 	skb = dev_alloc_skb(sizeof(struct ieee80211_hdr_3addr));
 
 	if (!skb)
 		return NULL;
 
-	hdr = (struct ieee80211_hdr_3addr*)skb_put(skb,sizeof(struct ieee80211_hdr_3addr));
+	hdr = (struct ieee80211_hdr_3addr *)skb_put(skb,sizeof(struct ieee80211_hdr_3addr));
 
 	memcpy(hdr->addr1, ieee->current_network.bssid, ETH_ALEN);
 	memcpy(hdr->addr2, ieee->dev->dev_addr, ETH_ALEN);
@@ -950,7 +950,7 @@
 }
 
 
-void ieee80211_resp_to_assoc_rq(struct ieee80211_device *ieee, u8* dest)
+void ieee80211_resp_to_assoc_rq(struct ieee80211_device *ieee, u8 *dest)
 {
 	struct sk_buff *buf = ieee80211_assoc_resp(ieee, dest);
 
@@ -959,7 +959,7 @@
 }
 
 
-void ieee80211_resp_to_auth(struct ieee80211_device *ieee, int s, u8* dest)
+void ieee80211_resp_to_auth(struct ieee80211_device *ieee, int s, u8 *dest)
 {
 	struct sk_buff *buf = ieee80211_auth_resp(ieee, s, dest);
 
@@ -991,15 +991,15 @@
 	//u8 suit_select = 0;
 	//unsigned int wpa_len = beacon->wpa_ie_len;
 	//for HT
-	u8* ht_cap_buf = NULL;
+	u8 *ht_cap_buf = NULL;
 	u8 ht_cap_len=0;
-	u8* realtek_ie_buf=NULL;
+	u8 *realtek_ie_buf=NULL;
 	u8 realtek_ie_len=0;
 	int wpa_ie_len= ieee->wpa_ie_len;
 	unsigned int ckip_ie_len=0;
 	unsigned int ccxrm_ie_len=0;
 	unsigned int cxvernum_ie_len=0;
-	struct ieee80211_crypt_data* crypt;
+	struct ieee80211_crypt_data *crypt;
 	int encrypt;
 
 	unsigned int rate_len = ieee80211_MFIE_rate_len(ieee);
@@ -1016,7 +1016,7 @@
 	//Include High Throuput capability && Realtek proprietary
 	if(ieee->pHTInfo->bCurrentHTSupport&&ieee->pHTInfo->bEnableHT)
 	{
-		ht_cap_buf = (u8*)&(ieee->pHTInfo->SelfHTCap);
+		ht_cap_buf = (u8 *)&(ieee->pHTInfo->SelfHTCap);
 		ht_cap_len = sizeof(ieee->pHTInfo->SelfHTCap);
 		HTConstructCapabilityElement(ieee, ht_cap_buf, &ht_cap_len, encrypt);
 		if(ieee->pHTInfo->bCurrentRT2RTAggregation)
@@ -1314,7 +1314,7 @@
 
 void ieee80211_associate_step2(struct ieee80211_device *ieee)
 {
-	struct sk_buff* skb;
+	struct sk_buff *skb;
 	struct ieee80211_network *beacon = &ieee->current_network;
 
 	del_timer_sync(&ieee->associate_timer);
@@ -1536,7 +1536,7 @@
 }
 
 
-static inline u16 auth_parse(struct sk_buff *skb, u8** challenge, int *chlen)
+static inline u16 auth_parse(struct sk_buff *skb, u8 **challenge, int *chlen)
 {
 	struct ieee80211_authentication *a;
 	u8 *t;
@@ -1545,7 +1545,7 @@
 		return 0xcafe;
 	}
 	*challenge = NULL;
-	a = (struct ieee80211_authentication*) skb->data;
+	a = (struct ieee80211_authentication *) skb->data;
 	if(skb->len > (sizeof(struct ieee80211_authentication) +3)){
 		t = skb->data + sizeof(struct ieee80211_authentication);
 
@@ -1562,7 +1562,7 @@
 }
 
 
-int auth_rq_parse(struct sk_buff *skb,u8* dest)
+int auth_rq_parse(struct sk_buff *skb,u8 *dest)
 {
 	struct ieee80211_authentication *a;
 
@@ -1570,7 +1570,7 @@
 		IEEE80211_DEBUG_MGMT("invalid len in auth request: %d\n",skb->len);
 		return -1;
 	}
-	a = (struct ieee80211_authentication*) skb->data;
+	a = (struct ieee80211_authentication *) skb->data;
 
 	memcpy(dest,a->header.addr2, ETH_ALEN);
 
@@ -1595,7 +1595,7 @@
 
 	memcpy(src,header->addr2, ETH_ALEN);
 
-	skbend = (u8*)skb->data + skb->len;
+	skbend = (u8 *)skb->data + skb->len;
 
 	tag = skb->data + sizeof (struct ieee80211_hdr_3addr  );
 
@@ -1618,7 +1618,7 @@
 
 }
 
-int assoc_rq_parse(struct sk_buff *skb,u8* dest)
+int assoc_rq_parse(struct sk_buff *skb,u8 *dest)
 {
 	struct ieee80211_assoc_request_frame *a;
 
@@ -1629,7 +1629,7 @@
 		return -1;
 	}
 
-	a = (struct ieee80211_assoc_request_frame*) skb->data;
+	a = (struct ieee80211_assoc_request_frame *) skb->data;
 
 	memcpy(dest,a->header.addr2,ETH_ALEN);
 
@@ -1646,7 +1646,7 @@
 		return 0xcafe;
 	}
 
-	response_head = (struct ieee80211_assoc_response_frame*) skb->data;
+	response_head = (struct ieee80211_assoc_response_frame *) skb->data;
 	*aid = le16_to_cpu(response_head->aid) & 0x3fff;
 
 	status_code = le16_to_cpu(response_head->status);
@@ -1888,10 +1888,10 @@
 	}
 	spin_unlock_irqrestore(&ieee->lock, flags);
 }
-void ieee80211_process_action(struct ieee80211_device* ieee, struct sk_buff* skb)
+void ieee80211_process_action(struct ieee80211_device *ieee, struct sk_buff *skb)
 {
-	struct ieee80211_hdr* header = (struct ieee80211_hdr*)skb->data;
-	u8* act = ieee80211_get_payload(header);
+	struct ieee80211_hdr *header = (struct ieee80211_hdr *)skb->data;
+	u8 *act = ieee80211_get_payload(header);
 	u8 tmp = 0;
 //	IEEE80211_DEBUG_DATA(IEEE80211_DL_DATA|IEEE80211_DL_BA, skb->data, skb->len);
 	if (act == NULL)
@@ -1926,7 +1926,7 @@
 {
 	struct ieee80211_hdr_3addr *header = (struct ieee80211_hdr_3addr *) skb->data;
 	u16 errcode;
-	u8* challenge;
+	u8 *challenge;
 	int chlen=0;
 	int aid;
 	struct ieee80211_assoc_response_frame *assoc_resp;
@@ -1966,7 +1966,7 @@
 				/* station support qos */
 				/* Let the register setting defaultly with Legacy station */
 				if(ieee->qos_support) {
-					assoc_resp = (struct ieee80211_assoc_response_frame*)skb->data;
+					assoc_resp = (struct ieee80211_assoc_response_frame *)skb->data;
 					memset(network, 0, sizeof(*network));
 					if (ieee80211_parse_info_param(ieee,assoc_resp->info_element,\
 								rx_stats->len - sizeof(*assoc_resp),\
@@ -1979,7 +1979,7 @@
 						memcpy(ieee->pHTInfo->PeerHTInfoBuf, network->bssht.bdHTInfoBuf, network->bssht.bdHTInfoLen);
 					}
 					if (ieee->handle_assoc_response != NULL)
-						ieee->handle_assoc_response(ieee->dev, (struct ieee80211_assoc_response_frame*)header, network);
+						ieee->handle_assoc_response(ieee->dev, (struct ieee80211_assoc_response_frame *)header, network);
 				}
 				ieee80211_associate_complete(ieee);
 			} else {
@@ -3124,7 +3124,7 @@
 void
 SendDisassociation(
 		struct ieee80211_device *ieee,
-		u8*					asSta,
+		u8					*asSta,
 		u8						asRsn
 )
 {

diff --git a/drivers/staging/rtl8192u/ieee80211/ieee80211_tx.c b/drivers/staging/rtl8192u/ieee80211/ieee80211_tx.c
index c39e680..9955042 100644
--- a/drivers/staging/rtl8192u/ieee80211/ieee80211_tx.c
+++ b/drivers/staging/rtl8192u/ieee80211/ieee80211_tx.c

@@ -183,7 +183,7 @@
 	struct sk_buff *frag,
 	int hdr_len)
 {
-	struct ieee80211_crypt_data* crypt = ieee->crypt[ieee->tx_keyidx];
+	struct ieee80211_crypt_data *crypt = ieee->crypt[ieee->tx_keyidx];
 	int res;
 
 	if (!(crypt && crypt->ops))
@@ -243,7 +243,7 @@
 	struct ieee80211_txb *txb;
 	int i;
 	txb = kmalloc(
-		sizeof(struct ieee80211_txb) + (sizeof(u8*) * nr_frags),
+		sizeof(struct ieee80211_txb) + (sizeof(u8 *) * nr_frags),
 		gfp_mask);
 	if (!txb)
 		return NULL;
@@ -303,11 +303,11 @@
 }
 
 #define SN_LESS(a, b)		(((a-b)&0x800)!=0)
-void ieee80211_tx_query_agg_cap(struct ieee80211_device* ieee, struct sk_buff* skb, cb_desc* tcb_desc)
+void ieee80211_tx_query_agg_cap(struct ieee80211_device *ieee, struct sk_buff *skb, cb_desc *tcb_desc)
 {
 	PRT_HIGH_THROUGHPUT	pHTInfo = ieee->pHTInfo;
 	PTX_TS_RECORD			pTxTs = NULL;
-	struct ieee80211_hdr_1addr* hdr = (struct ieee80211_hdr_1addr*)skb->data;
+	struct ieee80211_hdr_1addr *hdr = (struct ieee80211_hdr_1addr *)skb->data;
 
 	if (!pHTInfo->bCurrentHTSupport||!pHTInfo->bEnableHT)
 		return;
@@ -330,7 +330,7 @@
 	}
 	if(pHTInfo->bCurrentAMPDUEnable)
 	{
-		if (!GetTs(ieee, (PTS_COMMON_INFO*)(&pTxTs), hdr->addr1, skb->priority, TX_DIR, true))
+		if (!GetTs(ieee, (PTS_COMMON_INFO *)(&pTxTs), hdr->addr1, skb->priority, TX_DIR, true))
 		{
 			printk("===>can't get TS\n");
 			return;
@@ -356,7 +356,7 @@
 		}
 	}
 FORCED_AGG_SETTING:
-	switch(pHTInfo->ForcedAMPDUMode )
+	switch (pHTInfo->ForcedAMPDUMode )
 	{
 		case HT_AGG_AUTO:
 			break;
@@ -377,7 +377,7 @@
 		return;
 }
 
-extern void ieee80211_qurey_ShortPreambleMode(struct ieee80211_device* ieee, cb_desc* tcb_desc)
+extern void ieee80211_qurey_ShortPreambleMode(struct ieee80211_device *ieee, cb_desc *tcb_desc)
 {
 	tcb_desc->bUseShortPreamble = false;
 	if (tcb_desc->data_rate == 2)
@@ -412,7 +412,7 @@
 		tcb_desc->bUseShortGI = true;
 }
 
-void ieee80211_query_BandwidthMode(struct ieee80211_device* ieee, cb_desc *tcb_desc)
+void ieee80211_query_BandwidthMode(struct ieee80211_device *ieee, cb_desc *tcb_desc)
 {
 	PRT_HIGH_THROUGHPUT	pHTInfo = ieee->pHTInfo;
 
@@ -432,7 +432,7 @@
 	return;
 }
 
-void ieee80211_query_protectionmode(struct ieee80211_device* ieee, cb_desc* tcb_desc, struct sk_buff* skb)
+void ieee80211_query_protectionmode(struct ieee80211_device *ieee, cb_desc *tcb_desc, struct sk_buff *skb)
 {
 	// Common Settings
 	tcb_desc->bRTSSTBC			= false;
@@ -543,7 +543,7 @@
 }
 
 
-void ieee80211_txrate_selectmode(struct ieee80211_device* ieee, cb_desc* tcb_desc)
+void ieee80211_txrate_selectmode(struct ieee80211_device *ieee, cb_desc *tcb_desc)
 {
 #ifdef TO_DO_LIST
 	if(!IsDataFrame(pFrame))
@@ -573,14 +573,14 @@
 	}
 }
 
-void ieee80211_query_seqnum(struct ieee80211_device*ieee, struct sk_buff* skb, u8* dst)
+void ieee80211_query_seqnum(struct ieee80211_device *ieee, struct sk_buff *skb, u8 *dst)
 {
 	if (is_multicast_ether_addr(dst))
 		return;
 	if (IsQoSDataFrame(skb->data)) //we deal qos data only
 	{
 		PTX_TS_RECORD pTS = NULL;
-		if (!GetTs(ieee, (PTS_COMMON_INFO*)(&pTS), dst, skb->priority, TX_DIR, true))
+		if (!GetTs(ieee, (PTS_COMMON_INFO *)(&pTS), dst, skb->priority, TX_DIR, true))
 		{
 			return;
 		}
@@ -607,7 +607,7 @@
 	u8 dest[ETH_ALEN], src[ETH_ALEN];
 	int qos_actived = ieee->current_network.qos_data.active;
 
-	struct ieee80211_crypt_data* crypt;
+	struct ieee80211_crypt_data *crypt;
 
 	cb_desc *tcb_desc;
 

diff --git a/drivers/staging/rtl8192u/ieee80211/rtl819x_BAProc.c b/drivers/staging/rtl8192u/ieee80211/rtl819x_BAProc.c
index 69735d3..db0db93 100644
--- a/drivers/staging/rtl8192u/ieee80211/rtl819x_BAProc.c
+++ b/drivers/staging/rtl8192u/ieee80211/rtl819x_BAProc.c

@@ -13,7 +13,7 @@
  *	     u16			Time //indicate time delay.
  *  output:  none
 ********************************************************************************************************************/
-void ActivateBAEntry(struct ieee80211_device* ieee, PBA_RECORD pBA, u16 Time)
+void ActivateBAEntry(struct ieee80211_device *ieee, PBA_RECORD pBA, u16 Time)
 {
 	pBA->bValid = true;
 	if(Time != 0)
@@ -25,7 +25,7 @@
  *   input:  PBA_RECORD			pBA  //BA entry to be disabled
  *  output:  none
 ********************************************************************************************************************/
-void DeActivateBAEntry( struct ieee80211_device* ieee, PBA_RECORD pBA)
+void DeActivateBAEntry( struct ieee80211_device *ieee, PBA_RECORD pBA)
 {
 	pBA->bValid = false;
 	del_timer_sync(&pBA->Timer);
@@ -37,7 +37,7 @@
  *  output:  none
  *  notice:  As PTX_TS_RECORD structure will be defined in QOS, so wait to be merged. //FIXME
 ********************************************************************************************************************/
-u8 TxTsDeleteBA( struct ieee80211_device* ieee, PTX_TS_RECORD	pTxTs)
+u8 TxTsDeleteBA( struct ieee80211_device *ieee, PTX_TS_RECORD	pTxTs)
 {
 	PBA_RECORD		pAdmittedBa = &pTxTs->TxAdmittedBARecord;  //These two BA entries must exist in TS structure
 	PBA_RECORD		pPendingBa = &pTxTs->TxPendingBARecord;
@@ -67,7 +67,7 @@
  *  output:  none
  *  notice:  As PRX_TS_RECORD structure will be defined in QOS, so wait to be merged. //FIXME, same with above
 ********************************************************************************************************************/
-u8 RxTsDeleteBA( struct ieee80211_device* ieee, PRX_TS_RECORD	pRxTs)
+u8 RxTsDeleteBA( struct ieee80211_device *ieee, PRX_TS_RECORD	pRxTs)
 {
 	PBA_RECORD		pBa = &pRxTs->RxAdmittedBARecord;
 	u8			bSendDELBA = false;
@@ -105,11 +105,11 @@
  *  output:  none
  *  return:  sk_buff*		skb     //return constructed skb to xmit
 *******************************************************************************************************************************/
-static struct sk_buff* ieee80211_ADDBA(struct ieee80211_device* ieee, u8* Dst, PBA_RECORD pBA, u16 StatusCode, u8 type)
+static struct sk_buff *ieee80211_ADDBA(struct ieee80211_device *ieee, u8 *Dst, PBA_RECORD pBA, u16 StatusCode, u8 type)
 {
 	struct sk_buff *skb = NULL;
-	 struct ieee80211_hdr_3addr* BAReq = NULL;
-	u8* tag = NULL;
+	 struct ieee80211_hdr_3addr *BAReq = NULL;
+	u8 *tag = NULL;
 	u16 tmp = 0;
 	u16 len = ieee->tx_headroom + 9;
 	//category(1) + action field(1) + Dialog Token(1) + BA Parameter Set(2) +  BA Timeout Value(2) +  BA Start SeqCtrl(2)(or StatusCode(2))
@@ -139,7 +139,7 @@
 	BAReq->frame_ctl = cpu_to_le16(IEEE80211_STYPE_MANAGE_ACT); //action frame
 
 	//tag += sizeof( struct ieee80211_hdr_3addr); //move to action field
-	tag = (u8*)skb_put(skb, 9);
+	tag = (u8 *)skb_put(skb, 9);
 	*tag ++= ACT_CAT_BA;
 	*tag ++= type;
 	// Dialog Token
@@ -150,22 +150,22 @@
 		// Status Code
 		printk("=====>to send ADDBARSP\n");
 		tmp = cpu_to_le16(StatusCode);
-		memcpy(tag, (u8*)&tmp, 2);
+		memcpy(tag, (u8 *)&tmp, 2);
 		tag += 2;
 	}
 	// BA Parameter Set
 	tmp = cpu_to_le16(pBA->BaParamSet.shortData);
-	memcpy(tag, (u8*)&tmp, 2);
+	memcpy(tag, (u8 *)&tmp, 2);
 	tag += 2;
 	// BA Timeout Value
 	tmp = cpu_to_le16(pBA->BaTimeoutValue);
-	memcpy(tag, (u8*)&tmp, 2);
+	memcpy(tag, (u8 *)&tmp, 2);
 	tag += 2;
 
 	if (ACT_ADDBAREQ == type)
 	{
 	// BA Start SeqCtrl
-		memcpy(tag,(u8*)&(pBA->BaStartSeqCtrl), 2);
+		memcpy(tag,(u8 *)&(pBA->BaStartSeqCtrl), 2);
 		tag += 2;
 	}
 
@@ -184,9 +184,9 @@
  *  output:  none
  *  return:  sk_buff*		skb     //return constructed skb to xmit
 ********************************************************************************************************************/
-static struct sk_buff* ieee80211_DELBA(
-	struct ieee80211_device* ieee,
-	u8*		         dst,
+static struct sk_buff *ieee80211_DELBA(
+	struct ieee80211_device  *ieee,
+	u8		         *dst,
 	PBA_RECORD		 pBA,
 	TR_SELECT		 TxRxSelect,
 	u16			 ReasonCode
@@ -194,8 +194,8 @@
 {
 	DELBA_PARAM_SET	DelbaParamSet;
 	struct sk_buff *skb = NULL;
-	 struct ieee80211_hdr_3addr* Delba = NULL;
-	u8* tag = NULL;
+	 struct ieee80211_hdr_3addr *Delba = NULL;
+	u8 *tag = NULL;
 	u16 tmp = 0;
 	//len = head len + DELBA Parameter Set(2) + Reason Code(2)
 	u16 len = 6 + ieee->tx_headroom;
@@ -224,18 +224,18 @@
 	memcpy(Delba->addr3, ieee->current_network.bssid, ETH_ALEN);
 	Delba->frame_ctl = cpu_to_le16(IEEE80211_STYPE_MANAGE_ACT); //action frame
 
-	tag = (u8*)skb_put(skb, 6);
+	tag = (u8 *)skb_put(skb, 6);
 
 	*tag ++= ACT_CAT_BA;
 	*tag ++= ACT_DELBA;
 
 	// DELBA Parameter Set
 	tmp = cpu_to_le16(DelbaParamSet.shortData);
-	memcpy(tag, (u8*)&tmp, 2);
+	memcpy(tag, (u8 *)&tmp, 2);
 	tag += 2;
 	// Reason Code
 	tmp = cpu_to_le16(ReasonCode);
-	memcpy(tag, (u8*)&tmp, 2);
+	memcpy(tag, (u8 *)&tmp, 2);
 	tag += 2;
 
 	IEEE80211_DEBUG_DATA(IEEE80211_DL_DATA|IEEE80211_DL_BA, skb->data, skb->len);
@@ -251,7 +251,7 @@
  *  output:  none
  *  notice: If any possible, please hide pBA in ieee. And temporarily use Manage Queue as softmac_mgmt_xmit() usually does
 ********************************************************************************************************************/
-void ieee80211_send_ADDBAReq(struct ieee80211_device* ieee, u8*	dst, PBA_RECORD	pBA)
+void ieee80211_send_ADDBAReq(struct ieee80211_device *ieee, u8 *dst, PBA_RECORD	pBA)
 {
 	struct sk_buff *skb = NULL;
 	skb = ieee80211_ADDBA(ieee, dst, pBA, 0, ACT_ADDBAREQ); //construct ACT_ADDBAREQ frames so set statuscode zero.
@@ -278,7 +278,7 @@
  *  output:  none
  *  notice: If any possible, please hide pBA in ieee. And temporarily use Manage Queue as softmac_mgmt_xmit() usually does
 ********************************************************************************************************************/
-void ieee80211_send_ADDBARsp(struct ieee80211_device* ieee, u8* dst, PBA_RECORD pBA, u16 StatusCode)
+void ieee80211_send_ADDBARsp(struct ieee80211_device *ieee, u8 *dst, PBA_RECORD pBA, u16 StatusCode)
 {
 	struct sk_buff *skb = NULL;
 	skb = ieee80211_ADDBA(ieee, dst, pBA, StatusCode, ACT_ADDBARSP); //construct ACT_ADDBARSP frames
@@ -305,7 +305,7 @@
  *  notice: If any possible, please hide pBA in ieee. And temporarily use Manage Queue as softmac_mgmt_xmit() usually does
 ********************************************************************************************************************/
 
-void ieee80211_send_DELBA(struct ieee80211_device* ieee, u8* dst, PBA_RECORD pBA, TR_SELECT TxRxSelect, u16 ReasonCode)
+void ieee80211_send_DELBA(struct ieee80211_device *ieee, u8 *dst, PBA_RECORD pBA, TR_SELECT TxRxSelect, u16 ReasonCode)
 {
 	struct sk_buff *skb = NULL;
 	skb = ieee80211_DELBA(ieee, dst, pBA, TxRxSelect, ReasonCode); //construct ACT_ADDBARSP frames
@@ -327,14 +327,14 @@
  *  return:  0(pass), other(fail)
  *  notice:  As this function need support of QOS, I comment some code out. And when qos is ready, this code need to be support.
 ********************************************************************************************************************/
-int ieee80211_rx_ADDBAReq( struct ieee80211_device* ieee, struct sk_buff *skb)
+int ieee80211_rx_ADDBAReq( struct ieee80211_device *ieee, struct sk_buff *skb)
 {
-	 struct ieee80211_hdr_3addr* req = NULL;
+	 struct ieee80211_hdr_3addr *req = NULL;
 	u16 rc = 0;
-	u8 * dst = NULL, *pDialogToken = NULL, *tag = NULL;
+	u8 *dst = NULL, *pDialogToken = NULL, *tag = NULL;
 	PBA_RECORD pBA = NULL;
 	PBA_PARAM_SET	pBaParamSet = NULL;
-	u16* pBaTimeoutVal = NULL;
+	u16 *pBaTimeoutVal = NULL;
 	PSEQUENCE_CONTROL pBaStartSeqCtrl = NULL;
 	PRX_TS_RECORD	pTS = NULL;
 
@@ -346,13 +346,13 @@
 
 	IEEE80211_DEBUG_DATA(IEEE80211_DL_DATA|IEEE80211_DL_BA, skb->data, skb->len);
 
-	req = ( struct ieee80211_hdr_3addr*) skb->data;
-	tag = (u8*)req;
-	dst = (u8*)(&req->addr2[0]);
+	req = ( struct ieee80211_hdr_3addr *) skb->data;
+	tag = (u8 *)req;
+	dst = (u8 *)(&req->addr2[0]);
 	tag += sizeof( struct ieee80211_hdr_3addr);
 	pDialogToken = tag + 2;  //category+action
 	pBaParamSet = (PBA_PARAM_SET)(tag + 3);   //+DialogToken
-	pBaTimeoutVal = (u16*)(tag + 5);
+	pBaTimeoutVal = (u16 *)(tag + 5);
 	pBaStartSeqCtrl = (PSEQUENCE_CONTROL)(req + 7);
 
 	printk("====================>rx ADDBAREQ from :%pM\n", dst);
@@ -369,7 +369,7 @@
 	// If there is no matched TS, reject the ADDBA request.
 	if(	!GetTs(
 			ieee,
-			(PTS_COMMON_INFO*)(&pTS),
+			(PTS_COMMON_INFO *)(&pTS),
 			dst,
 			(u8)(pBaParamSet->field.TID),
 			RX_DIR,
@@ -427,13 +427,13 @@
  *  return:  0(pass), other(fail)
  *  notice:  As this function need support of QOS, I comment some code out. And when qos is ready, this code need to be support.
 ********************************************************************************************************************/
-int ieee80211_rx_ADDBARsp( struct ieee80211_device* ieee, struct sk_buff *skb)
+int ieee80211_rx_ADDBARsp( struct ieee80211_device *ieee, struct sk_buff *skb)
 {
-	 struct ieee80211_hdr_3addr* rsp = NULL;
+	 struct ieee80211_hdr_3addr *rsp = NULL;
 	PBA_RECORD		pPendingBA, pAdmittedBA;
 	PTX_TS_RECORD		pTS = NULL;
-	u8* dst = NULL, *pDialogToken = NULL, *tag = NULL;
-	u16* pStatusCode = NULL, *pBaTimeoutVal = NULL;
+	u8 *dst = NULL, *pDialogToken = NULL, *tag = NULL;
+	u16 *pStatusCode = NULL, *pBaTimeoutVal = NULL;
 	PBA_PARAM_SET		pBaParamSet = NULL;
 	u16			ReasonCode;
 
@@ -442,14 +442,14 @@
 		IEEE80211_DEBUG(IEEE80211_DL_ERR, " Invalid skb len in BARSP(%d / %zu)\n", skb->len,	(sizeof( struct ieee80211_hdr_3addr) + 9));
 		return -1;
 	}
-	rsp = ( struct ieee80211_hdr_3addr*)skb->data;
-	tag = (u8*)rsp;
-	dst = (u8*)(&rsp->addr2[0]);
+	rsp = ( struct ieee80211_hdr_3addr *)skb->data;
+	tag = (u8 *)rsp;
+	dst = (u8 *)(&rsp->addr2[0]);
 	tag += sizeof( struct ieee80211_hdr_3addr);
 	pDialogToken = tag + 2;
-	pStatusCode = (u16*)(tag + 3);
+	pStatusCode = (u16 *)(tag + 3);
 	pBaParamSet = (PBA_PARAM_SET)(tag + 5);
-	pBaTimeoutVal = (u16*)(tag + 7);
+	pBaTimeoutVal = (u16 *)(tag + 7);
 
 	// Check the capability
 	// Since we can always receive A-MPDU, we just check if it is under HT mode.
@@ -469,7 +469,7 @@
 	//
 	if (!GetTs(
 			ieee,
-			(PTS_COMMON_INFO*)(&pTS),
+			(PTS_COMMON_INFO *)(&pTS),
 			dst,
 			(u8)(pBaParamSet->field.TID),
 			TX_DIR,
@@ -560,12 +560,12 @@
  *  return:  0(pass), other(fail)
  *  notice:  As this function need support of QOS, I comment some code out. And when qos is ready, this code need to be support.
 ********************************************************************************************************************/
-int ieee80211_rx_DELBA(struct ieee80211_device* ieee,struct sk_buff *skb)
+int ieee80211_rx_DELBA(struct ieee80211_device *ieee,struct sk_buff *skb)
 {
-	 struct ieee80211_hdr_3addr* delba = NULL;
+	 struct ieee80211_hdr_3addr *delba = NULL;
 	PDELBA_PARAM_SET	pDelBaParamSet = NULL;
-	u16*			pReasonCode = NULL;
-	u8*			dst = NULL;
+	u16			*pReasonCode = NULL;
+	u8			*dst = NULL;
 
 	if (skb->len < sizeof( struct ieee80211_hdr_3addr) + 6)
 	{
@@ -581,11 +581,11 @@
 	}
 
 	IEEE80211_DEBUG_DATA(IEEE80211_DL_DATA|IEEE80211_DL_BA, skb->data, skb->len);
-	delba = ( struct ieee80211_hdr_3addr*)skb->data;
-	dst = (u8*)(&delba->addr2[0]);
+	delba = ( struct ieee80211_hdr_3addr *)skb->data;
+	dst = (u8 *)(&delba->addr2[0]);
 	delba += sizeof( struct ieee80211_hdr_3addr);
 	pDelBaParamSet = (PDELBA_PARAM_SET)(delba+2);
-	pReasonCode = (u16*)(delba+4);
+	pReasonCode = (u16 *)(delba+4);
 
 	if(pDelBaParamSet->field.Initiator == 1)
 	{
@@ -593,7 +593,7 @@
 
 		if( !GetTs(
 				ieee,
-				(PTS_COMMON_INFO*)&pRxTs,
+				(PTS_COMMON_INFO *)&pRxTs,
 				dst,
 				(u8)pDelBaParamSet->field.TID,
 				RX_DIR,
@@ -611,7 +611,7 @@
 
 		if(!GetTs(
 			ieee,
-			(PTS_COMMON_INFO*)&pTxTs,
+			(PTS_COMMON_INFO *)&pTxTs,
 			dst,
 			(u8)pDelBaParamSet->field.TID,
 			TX_DIR,
@@ -636,7 +636,7 @@
 //
 void
 TsInitAddBA(
-	struct ieee80211_device* ieee,
+	struct ieee80211_device *ieee,
 	PTX_TS_RECORD	pTS,
 	u8		Policy,
 	u8		bOverwritePending
@@ -665,7 +665,7 @@
 }
 
 void
-TsInitDelBA( struct ieee80211_device* ieee, PTS_COMMON_INFO pTsCommonInfo, TR_SELECT TxRxSelect)
+TsInitDelBA( struct ieee80211_device *ieee, PTS_COMMON_INFO pTsCommonInfo, TR_SELECT TxRxSelect)
 {
 
 	if(TxRxSelect == TX_DIR)

diff --git a/drivers/staging/rtl8192u/ieee80211/rtl819x_HTProc.c b/drivers/staging/rtl8192u/ieee80211/rtl819x_HTProc.c
index 268b270..e956da5 100644
--- a/drivers/staging/rtl8192u/ieee80211/rtl819x_HTProc.c
+++ b/drivers/staging/rtl8192u/ieee80211/rtl819x_HTProc.c

@@ -51,7 +51,7 @@
  *  return:  none
  *  notice:  These value need be modified if any changes.
  * *****************************************************************************************************************/
-void HTUpdateDefaultSetting(struct ieee80211_device* ieee)
+void HTUpdateDefaultSetting(struct ieee80211_device *ieee)
 {
 	PRT_HIGH_THROUGHPUT	pHTInfo = ieee->pHTInfo;
 	//const typeof( ((struct ieee80211_device *)0)->pHTInfo ) *__mptr = &pHTInfo;
@@ -121,7 +121,7 @@
  *  return:  none
  *  notice:  Driver should not print out this message by default.
  * *****************************************************************************************************************/
-void HTDebugHTCapability(u8* CapIE, u8* TitleString )
+void HTDebugHTCapability(u8 *CapIE, u8 *TitleString )
 {
 
 	static u8	EWC11NHTCap[] = {0x00, 0x90, 0x4c, 0x33};	// For 11n EWC definition, 2007.07.17, by Emily
@@ -158,7 +158,7 @@
  *  return:  none
  *  notice:  Driver should not print out this message by default.
  * *****************************************************************************************************************/
-void HTDebugHTInfo(u8*	InfoIE, u8* TitleString)
+void HTDebugHTInfo(u8 *InfoIE, u8 *TitleString)
 {
 
 	static u8	EWC11NHTInfo[] = {0x00, 0x90, 0x4c, 0x34};	// For 11n EWC definition, 2007.07.17, by Emily
@@ -177,7 +177,7 @@
 
 	IEEE80211_DEBUG(IEEE80211_DL_HT, "\tPrimary channel = %d\n", pHTInfoEle->ControlChl);
 	IEEE80211_DEBUG(IEEE80211_DL_HT, "\tSenondary channel =");
-	switch(pHTInfoEle->ExtChlOffset)
+	switch (pHTInfoEle->ExtChlOffset)
 	{
 		case 0:
 			IEEE80211_DEBUG(IEEE80211_DL_HT, "Not Present\n");
@@ -195,7 +195,7 @@
 	IEEE80211_DEBUG(IEEE80211_DL_HT, "\tRecommended channel width = %s\n", (pHTInfoEle->RecommemdedTxWidth)?"20Mhz": "40Mhz");
 
 	IEEE80211_DEBUG(IEEE80211_DL_HT, "\tOperation mode for protection = ");
-	switch(pHTInfoEle->OptMode)
+	switch (pHTInfoEle->OptMode)
 	{
 		case 0:
 			IEEE80211_DEBUG(IEEE80211_DL_HT, "No Protection\n");
@@ -219,7 +219,7 @@
 /*
 *	Return:		true if station in half n mode and AP supports 40 bw
 */
-bool IsHTHalfNmode40Bandwidth(struct ieee80211_device* ieee)
+bool IsHTHalfNmode40Bandwidth(struct ieee80211_device *ieee)
 {
 	bool			retValue = false;
 	PRT_HIGH_THROUGHPUT	 pHTInfo = ieee->pHTInfo;
@@ -238,7 +238,7 @@
 	return retValue;
 }
 
-bool IsHTHalfNmodeSGI(struct ieee80211_device* ieee, bool is40MHz)
+bool IsHTHalfNmodeSGI(struct ieee80211_device *ieee, bool is40MHz)
 {
 	bool			retValue = false;
 	PRT_HIGH_THROUGHPUT	 pHTInfo = ieee->pHTInfo;
@@ -265,7 +265,7 @@
 	return retValue;
 }
 
-u16 HTHalfMcsToDataRate(struct ieee80211_device* ieee,	u8	nMcsRate)
+u16 HTHalfMcsToDataRate(struct ieee80211_device *ieee,	u8	nMcsRate)
 {
 
 	u8	is40MHz;
@@ -278,7 +278,7 @@
 }
 
 
-u16 HTMcsToDataRate( struct ieee80211_device* ieee, u8 nMcsRate)
+u16 HTMcsToDataRate( struct ieee80211_device *ieee, u8 nMcsRate)
 {
 	PRT_HIGH_THROUGHPUT	pHTInfo = ieee->pHTInfo;
 
@@ -297,7 +297,7 @@
  *  return:  tx rate
  *  notice:  quite unsure about how to use this function //wb
  * *****************************************************************************************************************/
-u16  TxCountToDataRate( struct ieee80211_device* ieee, u8 nDataRate)
+u16  TxCountToDataRate( struct ieee80211_device *ieee, u8 nDataRate)
 {
 	//PRT_HIGH_THROUGHPUT	pHTInfo = ieee->pHTInfo;
 	u16		CCKOFDMRate[12] = {0x02 , 0x04 , 0x0b , 0x16 , 0x0c , 0x12 , 0x18 , 0x24 , 0x30 , 0x48 , 0x60 , 0x6c};
@@ -344,10 +344,10 @@
 
 
 
-bool IsHTHalfNmodeAPs(struct ieee80211_device* ieee)
+bool IsHTHalfNmodeAPs(struct ieee80211_device *ieee)
 {
 	bool			retValue = false;
-	struct ieee80211_network* net = &ieee->current_network;
+	struct ieee80211_network *net = &ieee->current_network;
 	if((memcmp(net->bssid, BELKINF5D8233V1_RALINK, 3)==0) ||
 		     (memcmp(net->bssid, BELKINF5D82334V3_RALINK, 3)==0) ||
 		     (memcmp(net->bssid, PCI_RALINK, 3)==0) ||
@@ -376,10 +376,10 @@
  *  return:
  *  notice:
  * *****************************************************************************************************************/
-void HTIOTPeerDetermine(struct ieee80211_device* ieee)
+void HTIOTPeerDetermine(struct ieee80211_device *ieee)
 {
 	PRT_HIGH_THROUGHPUT	pHTInfo = ieee->pHTInfo;
-	struct ieee80211_network* net = &ieee->current_network;
+	struct ieee80211_network *net = &ieee->current_network;
 	if(net->bssht.bdRT2RTAggregation)
 		pHTInfo->IOTPeer = HT_IOT_PEER_REALTEK;
 	else if(net->broadcom_cap_exist)
@@ -413,7 +413,7 @@
  *  output:  none
  *  return:  return 1 if driver should declare MCS13 only(otherwise return 0)
   * *****************************************************************************************************************/
-u8 HTIOTActIsDisableMCS14(struct ieee80211_device* ieee, u8* PeerMacAddr)
+u8 HTIOTActIsDisableMCS14(struct ieee80211_device *ieee, u8 *PeerMacAddr)
 {
 	u8 ret = 0;
 	return ret;
@@ -432,7 +432,7 @@
 * Return:	true if driver should disable MCS15
 * 2008.04.15	Emily
 */
-bool HTIOTActIsDisableMCS15(struct ieee80211_device* ieee)
+bool HTIOTActIsDisableMCS15(struct ieee80211_device *ieee)
 {
 	bool retValue = false;
 
@@ -469,7 +469,7 @@
 * Return:	true if driver should disable all two spatial stream packet
 * 2008.04.21	Emily
 */
-bool HTIOTActIsDisableMCSTwoSpatialStream(struct ieee80211_device* ieee, u8 *PeerMacAddr)
+bool HTIOTActIsDisableMCSTwoSpatialStream(struct ieee80211_device *ieee, u8 *PeerMacAddr)
 {
 	bool retValue = false;
 
@@ -486,7 +486,7 @@
  *  output:  none
  *  return:  return 1 if driver should disable EDCA turbo mode(otherwise return 0)
   * *****************************************************************************************************************/
-u8 HTIOTActIsDisableEDCATurbo(struct ieee80211_device*	ieee, u8* PeerMacAddr)
+u8 HTIOTActIsDisableEDCATurbo(struct ieee80211_device *ieee, u8 *PeerMacAddr)
 {
 	u8	retValue = false;	// default enable EDCA Turbo mode.
 	// Set specific EDCA parameter for different AP in DM handler.
@@ -515,7 +515,7 @@
 	return retValue;
 }
 
-u8 HTIOTActIsCCDFsync(u8* PeerMacAddr)
+u8 HTIOTActIsCCDFsync(u8 *PeerMacAddr)
 {
 	u8	retValue = 0;
 	if(	(memcmp(PeerMacAddr, UNKNOWN_BORADCOM, 3)==0) ||
@@ -547,7 +547,7 @@
  *  return:  none
  *  notice:  posHTCap can't be null and should be initialized before.
   * *****************************************************************************************************************/
-void HTConstructCapabilityElement(struct ieee80211_device* ieee, u8* posHTCap, u8* len, u8 IsEncrypt)
+void HTConstructCapabilityElement(struct ieee80211_device *ieee, u8 *posHTCap, u8 *len, u8 IsEncrypt)
 {
 	PRT_HIGH_THROUGHPUT	pHT = ieee->pHTInfo;
 	PHT_CAPABILITY_ELE	pCapELE = NULL;
@@ -666,7 +666,7 @@
  *  return:  none
  *  notice:  posHTCap can't be null and be initialized before. only AP and IBSS sta should do this
   * *****************************************************************************************************************/
-void HTConstructInfoElement(struct ieee80211_device* ieee, u8* posHTInfo, u8* len, u8 IsEncrypt)
+void HTConstructInfoElement(struct ieee80211_device *ieee, u8 *posHTInfo, u8 *len, u8 IsEncrypt)
 {
 	PRT_HIGH_THROUGHPUT	pHT = ieee->pHTInfo;
 	PHT_INFORMATION_ELE		pHTInfoEle = (PHT_INFORMATION_ELE)posHTInfo;
@@ -738,7 +738,7 @@
  *  return:  none
  *  notice:
   * *****************************************************************************************************************/
-void HTConstructRT2RTAggElement(struct ieee80211_device* ieee, u8* posRT2RTAgg, u8* len)
+void HTConstructRT2RTAggElement(struct ieee80211_device *ieee, u8 *posRT2RTAgg, u8 *len)
 {
 	if (posRT2RTAgg == NULL) {
 		IEEE80211_DEBUG(IEEE80211_DL_ERR, "posRT2RTAgg can't be null in HTConstructRT2RTAggElement()\n");
@@ -792,7 +792,7 @@
  *  return:  always we return true
  *  notice:
   * *****************************************************************************************************************/
-u8 HT_PickMCSRate(struct ieee80211_device* ieee, u8* pOperateMCS)
+u8 HT_PickMCSRate(struct ieee80211_device *ieee, u8 *pOperateMCS)
 {
 	u8					i;
 	if (pOperateMCS == NULL)
@@ -801,7 +801,7 @@
 		return false;
 	}
 
-	switch(ieee->mode)
+	switch (ieee->mode)
 	{
 	case IEEE_A:
 	case IEEE_B:
@@ -855,7 +855,7 @@
  *  return:  Highest MCS rate included in pMCSRateSet and filtered by pMCSFilter
  *  notice:
   * *****************************************************************************************************************/
-u8 HTGetHighestMCSRate(struct ieee80211_device* ieee, u8* pMCSRateSet, u8* pMCSFilter)
+u8 HTGetHighestMCSRate(struct ieee80211_device *ieee, u8 *pMCSRateSet, u8 *pMCSFilter)
 {
 	u8		i, j;
 	u8		bitMap;
@@ -907,7 +907,7 @@
 **
 ** \pHTSupportedCap: the connected STA's supported rate Capability element
 */
-u8 HTFilterMCSRate( struct ieee80211_device* ieee, u8* pSupportMCS, u8* pOperateMCS)
+u8 HTFilterMCSRate( struct ieee80211_device *ieee, u8 *pSupportMCS, u8 *pOperateMCS)
 {
 
 	u8 i=0;
@@ -937,14 +937,14 @@
 
 	return true;
 }
-void HTSetConnectBwMode(struct ieee80211_device* ieee, HT_CHANNEL_WIDTH	Bandwidth, HT_EXTCHNL_OFFSET	Offset);
+void HTSetConnectBwMode(struct ieee80211_device *ieee, HT_CHANNEL_WIDTH	Bandwidth, HT_EXTCHNL_OFFSET	Offset);
 void HTOnAssocRsp(struct ieee80211_device *ieee)
 {
 	PRT_HIGH_THROUGHPUT	pHTInfo = ieee->pHTInfo;
 	PHT_CAPABILITY_ELE		pPeerHTCap = NULL;
 	PHT_INFORMATION_ELE		pPeerHTInfo = NULL;
 	u16	nMaxAMSDUSize = 0;
-	u8*	pMcsFilter = NULL;
+	u8	*pMcsFilter = NULL;
 
 	static u8				EWC11NHTCap[] = {0x00, 0x90, 0x4c, 0x33};		// For 11n EWC definition, 2007.07.17, by Emily
 	static u8				EWC11NHTInfo[] = {0x00, 0x90, 0x4c, 0x34};	// For 11n EWC definition, 2007.07.17, by Emily
@@ -1115,7 +1115,7 @@
 
 }
 
-void HTSetConnectBwModeCallback(struct ieee80211_device* ieee);
+void HTSetConnectBwModeCallback(struct ieee80211_device *ieee);
 /********************************************************************************************************************
  *function:  initialize HT info(struct PRT_HIGH_THROUGHPUT)
  *   input:  struct ieee80211_device*	ieee
@@ -1124,7 +1124,7 @@
  *  notice: This function is called when *  (1) MPInitialization Phase *  (2) Receiving of Deauthentication from AP
 ********************************************************************************************************************/
 // TODO: Should this funciton be called when receiving of Disassociation?
-void HTInitializeHTInfo(struct ieee80211_device* ieee)
+void HTInitializeHTInfo(struct ieee80211_device *ieee)
 {
 	PRT_HIGH_THROUGHPUT pHTInfo = ieee->pHTInfo;
 
@@ -1160,10 +1160,10 @@
 
 
 	// Initialize all of the parameters related to 11n
-	memset((void*)(&(pHTInfo->SelfHTCap)), 0, sizeof(pHTInfo->SelfHTCap));
-	memset((void*)(&(pHTInfo->SelfHTInfo)), 0, sizeof(pHTInfo->SelfHTInfo));
-	memset((void*)(&(pHTInfo->PeerHTCapBuf)), 0, sizeof(pHTInfo->PeerHTCapBuf));
-	memset((void*)(&(pHTInfo->PeerHTInfoBuf)), 0, sizeof(pHTInfo->PeerHTInfoBuf));
+	memset((void *)(&(pHTInfo->SelfHTCap)), 0, sizeof(pHTInfo->SelfHTCap));
+	memset((void *)(&(pHTInfo->SelfHTInfo)), 0, sizeof(pHTInfo->SelfHTInfo));
+	memset((void *)(&(pHTInfo->PeerHTCapBuf)), 0, sizeof(pHTInfo->PeerHTCapBuf));
+	memset((void *)(&(pHTInfo->PeerHTInfoBuf)), 0, sizeof(pHTInfo->PeerHTInfoBuf));
 
 	pHTInfo->bSwBwInProgress = false;
 	pHTInfo->ChnlOp = CHNLOP_NONE;
@@ -1179,7 +1179,7 @@
 
 	//MCS rate initialized here
 	{
-		u8* RegHTSuppRateSets = &(ieee->RegHTSuppRateSet[0]);
+		u8 *RegHTSuppRateSets = &(ieee->RegHTSuppRateSet[0]);
 		RegHTSuppRateSets[0] = 0xFF;	//support MCS 0~7
 		RegHTSuppRateSets[1] = 0xFF;	//support MCS 8~15
 		RegHTSuppRateSets[4] = 0x01;	//support MCS 32
@@ -1214,7 +1214,7 @@
  *  return:  none
  *  notice: This function should ONLY be called before association
 ********************************************************************************************************************/
-void HTResetSelfAndSavePeerSetting(struct ieee80211_device* ieee,	struct ieee80211_network * pNetwork)
+void HTResetSelfAndSavePeerSetting(struct ieee80211_device *ieee,	struct ieee80211_network *pNetwork)
 {
 	PRT_HIGH_THROUGHPUT		pHTInfo = ieee->pHTInfo;
 //	u16						nMaxAMSDUSize;
@@ -1297,7 +1297,7 @@
 
 }
 
-void HTUpdateSelfAndPeerSetting(struct ieee80211_device* ieee,	struct ieee80211_network * pNetwork)
+void HTUpdateSelfAndPeerSetting(struct ieee80211_device *ieee,	struct ieee80211_network *pNetwork)
 {
 	PRT_HIGH_THROUGHPUT	pHTInfo = ieee->pHTInfo;
 //	PHT_CAPABILITY_ELE		pPeerHTCap = (PHT_CAPABILITY_ELE)pNetwork->bssht.bdHTCapBuf;
@@ -1317,7 +1317,7 @@
 	}
 }
 
-void HTUseDefaultSetting(struct ieee80211_device* ieee)
+void HTUseDefaultSetting(struct ieee80211_device *ieee)
 {
 	PRT_HIGH_THROUGHPUT pHTInfo = ieee->pHTInfo;
 //	u8	regBwOpMode;
@@ -1370,7 +1370,7 @@
  *  return:  return true if HT control field exists(false otherwise)
  *  notice:
 ********************************************************************************************************************/
-u8 HTCCheck(struct ieee80211_device* ieee, u8*	pFrame)
+u8 HTCCheck(struct ieee80211_device *ieee, u8 *pFrame)
 {
 	if(ieee->pHTInfo->bCurrentHTSupport)
 	{
@@ -1386,7 +1386,7 @@
 //
 // This function set bandwidth mode in protocol layer.
 //
-void HTSetConnectBwMode(struct ieee80211_device* ieee, HT_CHANNEL_WIDTH	Bandwidth, HT_EXTCHNL_OFFSET	Offset)
+void HTSetConnectBwMode(struct ieee80211_device *ieee, HT_CHANNEL_WIDTH	Bandwidth, HT_EXTCHNL_OFFSET	Offset)
 {
 	PRT_HIGH_THROUGHPUT pHTInfo = ieee->pHTInfo;
 //	u32 flags = 0;
@@ -1435,7 +1435,7 @@
 //	spin_unlock_irqrestore(&(ieee->bw_spinlock), flags);
 }
 
-void HTSetConnectBwModeCallback(struct ieee80211_device* ieee)
+void HTSetConnectBwModeCallback(struct ieee80211_device *ieee)
 {
 	PRT_HIGH_THROUGHPUT pHTInfo = ieee->pHTInfo;
 

diff --git a/drivers/staging/rtl8192u/ieee80211/rtl819x_Qos.h b/drivers/staging/rtl8192u/ieee80211/rtl819x_Qos.h
index 2348ccd..f2d52ca 100644
--- a/drivers/staging/rtl8192u/ieee80211/rtl819x_Qos.h
+++ b/drivers/staging/rtl8192u/ieee80211/rtl819x_Qos.h

@@ -483,7 +483,7 @@
 typedef struct _STA_QOS{
 	//DECLARE_RT_OBJECT(STA_QOS);
 	u8				WMMIEBuf[MAX_WMMELE_LENGTH];
-	u8*				WMMIE;
+	u8				*WMMIE;
 
 	// Part 1. Self QoS Mode.
 	QOS_MODE			QosCapability; //QoS Capability, 2006-06-14 Isaiah
@@ -498,7 +498,7 @@
 	int				NumBcnBeforeTrigger;
 
 	// Part 2. EDCA Parameter (perAC)
-	u8 *				pWMMInfoEle;
+	u8				*pWMMInfoEle;
 	u8				WMMParamEle[WMM_PARAM_ELEMENT_SIZE];
 	u8				WMMPELength;
 
@@ -537,12 +537,12 @@
 	QOS_MODE		bdQoSMode;
 
 	u8			bdWMMIEBuf[MAX_WMMELE_LENGTH];
-	u8*		bdWMMIE;
+	u8		*bdWMMIE;
 
 	QOS_ELE_SUBTYPE		EleSubType;
 
-	u8 *			pWMMInfoEle;
-	u8 *			pWMMParamEle;
+	u8			*pWMMInfoEle;
+	u8			*pWMMParamEle;
 
 	QOS_INFO_FIELD		QosInfoField;
 	AC_PARAM		AcParameter[4];

diff --git a/drivers/staging/rtl8192u/ieee80211/rtl819x_TSProc.c b/drivers/staging/rtl8192u/ieee80211/rtl819x_TSProc.c
index 0310d07..3058120 100644
--- a/drivers/staging/rtl8192u/ieee80211/rtl819x_TSProc.c
+++ b/drivers/staging/rtl8192u/ieee80211/rtl819x_TSProc.c

@@ -234,12 +234,12 @@
 }
 
 
-PTS_COMMON_INFO SearchAdmitTRStream(struct ieee80211_device *ieee, u8*	Addr, u8 TID, TR_SELECT	TxRxSelect)
+PTS_COMMON_INFO SearchAdmitTRStream(struct ieee80211_device *ieee, u8 *Addr, u8 TID, TR_SELECT	TxRxSelect)
 {
 	//DIRECTION_VALUE	dir;
 	u8	dir;
 	bool				search_dir[4] = {0, 0, 0, 0};
-	struct list_head*		psearch_list; //FIXME
+	struct list_head		*psearch_list; //FIXME
 	PTS_COMMON_INFO	pRet = NULL;
 	if(ieee->iw_mode == IW_MODE_MASTER) //ap mode
 	{
@@ -311,7 +311,7 @@
 
 void MakeTSEntry(
 		PTS_COMMON_INFO	pTsCommonInfo,
-		u8*		Addr,
+		u8		*Addr,
 		PTSPEC_BODY	pTSPEC,
 		PQOS_TCLAS	pTCLAS,
 		u8		TCLAS_Num,
@@ -326,10 +326,10 @@
 	memcpy(pTsCommonInfo->Addr, Addr, 6);
 
 	if(pTSPEC != NULL)
-		memcpy((u8*)(&(pTsCommonInfo->TSpec)), (u8*)pTSPEC, sizeof(TSPEC_BODY));
+		memcpy((u8 *)(&(pTsCommonInfo->TSpec)), (u8 *)pTSPEC, sizeof(TSPEC_BODY));
 
 	for(count = 0; count < TCLAS_Num; count++)
-		memcpy((u8*)(&(pTsCommonInfo->TClass[count])), (u8*)pTCLAS, sizeof(QOS_TCLAS));
+		memcpy((u8 *)(&(pTsCommonInfo->TClass[count])), (u8 *)pTCLAS, sizeof(QOS_TCLAS));
 
 	pTsCommonInfo->TClasProc = TCLAS_Proc;
 	pTsCommonInfo->TClasNum = TCLAS_Num;
@@ -337,9 +337,9 @@
 
 
 bool GetTs(
-	struct ieee80211_device*	ieee,
+	struct ieee80211_device		*ieee,
 	PTS_COMMON_INFO			*ppTS,
-	u8*				Addr,
+	u8				*Addr,
 	u8				TID,
 	TR_SELECT			TxRxSelect,  //Rx:1, Tx:0
 	bool				bAddNewTs
@@ -367,7 +367,7 @@
 			return false;
 		}
 
-		switch(TID)
+		switch (TID)
 		{
 		case 0:
 		case 3:
@@ -416,12 +416,12 @@
 			//
 			TSPEC_BODY	TSpec;
 			PQOS_TSINFO		pTSInfo = &TSpec.f.TSInfo;
-			struct list_head*	pUnusedList =
+			struct list_head	*pUnusedList =
 								(TxRxSelect == TX_DIR)?
 								(&ieee->Tx_TS_Unused_List):
 								(&ieee->Rx_TS_Unused_List);
 
-			struct list_head*	pAddmitList =
+			struct list_head	*pAddmitList =
 								(TxRxSelect == TX_DIR)?
 								(&ieee->Tx_TS_Admit_List):
 								(&ieee->Rx_TS_Admit_List);
@@ -473,7 +473,7 @@
 }
 
 void RemoveTsEntry(
-	struct ieee80211_device*	ieee,
+	struct ieee80211_device		*ieee,
 	PTS_COMMON_INFO			pTs,
 	TR_SELECT			TxRxSelect
 	)
@@ -501,7 +501,7 @@
 			list_del_init(&pRxReorderEntry->List);
 			{
 				int i = 0;
-				struct ieee80211_rxb * prxb = pRxReorderEntry->prxb;
+				struct ieee80211_rxb *prxb = pRxReorderEntry->prxb;
 				if (unlikely(!prxb))
 				{
 					spin_unlock_irqrestore(&(ieee->reorder_spinlock), flags);
@@ -527,7 +527,7 @@
 	}
 }
 
-void RemovePeerTS(struct ieee80211_device* ieee, u8* Addr)
+void RemovePeerTS(struct ieee80211_device *ieee, u8 *Addr)
 {
 	PTS_COMMON_INFO	pTS, pTmpTS;
 
@@ -574,7 +574,7 @@
 	}
 }
 
-void RemoveAllTS(struct ieee80211_device* ieee)
+void RemoveAllTS(struct ieee80211_device *ieee)
 {
 	PTS_COMMON_INFO pTS, pTmpTS;
 
@@ -607,7 +607,7 @@
 	}
 }
 
-void TsStartAddBaProcess(struct ieee80211_device* ieee, PTX_TS_RECORD	pTxTS)
+void TsStartAddBaProcess(struct ieee80211_device *ieee, PTX_TS_RECORD	pTxTS)
 {
 	if(pTxTS->bAddBaReqInProgress == false)
 	{

diff --git a/drivers/staging/rtl8192u/r8180_93cx6.c b/drivers/staging/rtl8192u/r8180_93cx6.c
index 7e49ad8..d219998 100644
--- a/drivers/staging/rtl8192u/r8180_93cx6.c
+++ b/drivers/staging/rtl8192u/r8180_93cx6.c

@@ -22,13 +22,15 @@
 
 void eprom_cs(struct net_device *dev, short bit)
 {
-	if(bit)
-		write_nic_byte_E(dev, EPROM_CMD,
-			       (1<<EPROM_CS_SHIFT) | \
-			       read_nic_byte_E(dev, EPROM_CMD)); //enable EPROM
+	u8 cmdreg;
+
+	read_nic_byte_E(dev, EPROM_CMD, &cmdreg);
+	if (bit)
+		/* enable EPROM */
+		write_nic_byte_E(dev, EPROM_CMD, cmdreg | EPROM_CS_BIT);
 	else
-		write_nic_byte_E(dev, EPROM_CMD, read_nic_byte_E(dev, EPROM_CMD)\
-			       &~(1<<EPROM_CS_SHIFT)); //disable EPROM
+		/* disable EPROM */
+		write_nic_byte_E(dev, EPROM_CMD, cmdreg & ~EPROM_CS_BIT);
 
 	force_pci_posting(dev);
 	udelay(EPROM_DELAY);
@@ -37,12 +39,15 @@
 
 void eprom_ck_cycle(struct net_device *dev)
 {
-	write_nic_byte_E(dev, EPROM_CMD,
-		       (1<<EPROM_CK_SHIFT) | read_nic_byte_E(dev,EPROM_CMD));
+	u8 cmdreg;
+
+	read_nic_byte_E(dev, EPROM_CMD, &cmdreg);
+	write_nic_byte_E(dev, EPROM_CMD, cmdreg | EPROM_CK_BIT);
 	force_pci_posting(dev);
 	udelay(EPROM_DELAY);
-	write_nic_byte_E(dev, EPROM_CMD,
-		       read_nic_byte_E(dev, EPROM_CMD) &~ (1<<EPROM_CK_SHIFT));
+
+	read_nic_byte_E(dev, EPROM_CMD, &cmdreg);
+	write_nic_byte_E(dev, EPROM_CMD, cmdreg & ~EPROM_CK_BIT);
 	force_pci_posting(dev);
 	udelay(EPROM_DELAY);
 }
@@ -50,12 +55,13 @@
 
 void eprom_w(struct net_device *dev,short bit)
 {
-	if(bit)
-		write_nic_byte_E(dev, EPROM_CMD, (1<<EPROM_W_SHIFT) | \
-			       read_nic_byte_E(dev,EPROM_CMD));
+	u8 cmdreg;
+
+	read_nic_byte_E(dev, EPROM_CMD, &cmdreg);
+	if (bit)
+		write_nic_byte_E(dev, EPROM_CMD, cmdreg | EPROM_W_BIT);
 	else
-		write_nic_byte_E(dev, EPROM_CMD, read_nic_byte_E(dev,EPROM_CMD)\
-			       &~(1<<EPROM_W_SHIFT));
+		write_nic_byte_E(dev, EPROM_CMD, cmdreg & ~EPROM_W_BIT);
 
 	force_pci_posting(dev);
 	udelay(EPROM_DELAY);
@@ -64,12 +70,14 @@
 
 short eprom_r(struct net_device *dev)
 {
-	short bit;
+	u8 bit;
 
-	bit=(read_nic_byte_E(dev, EPROM_CMD) & (1<<EPROM_R_SHIFT) );
+	read_nic_byte_E(dev, EPROM_CMD, &bit);
 	udelay(EPROM_DELAY);
 
-	if(bit) return 1;
+	if (bit & EPROM_R_BIT)
+		return 1;
+
 	return 0;
 }
 

diff --git a/drivers/staging/rtl8192u/r8190_rtl8256.c b/drivers/staging/rtl8192u/r8190_rtl8256.c
index cf9713f..40b14a2 100644
--- a/drivers/staging/rtl8192u/r8190_rtl8256.c
+++ b/drivers/staging/rtl8192u/r8190_rtl8256.c

@@ -23,7 +23,7 @@
  * Return:      NONE
  * Note:	8226 support both 20M  and 40 MHz
  *---------------------------------------------------------------------------*/
-void PHY_SetRF8256Bandwidth(struct net_device* dev , HT_CHANNEL_WIDTH Bandwidth)	//20M or 40M
+void PHY_SetRF8256Bandwidth(struct net_device *dev , HT_CHANNEL_WIDTH Bandwidth)	//20M or 40M
 {
 	u8	eRFPath;
 	struct r8192_priv *priv = ieee80211_priv(dev);
@@ -34,7 +34,7 @@
 		if (!rtl8192_phy_CheckIsLegalRFPath(dev, eRFPath))
 				continue;
 
-		switch(Bandwidth)
+		switch (Bandwidth)
 		{
 			case HT_CHANNEL_WIDTH_20:
 				if(priv->card_8192_version == VERSION_819xU_A || priv->card_8192_version == VERSION_819xU_B)// 8256 D-cut, E-cut, xiong: consider it later!
@@ -73,7 +73,7 @@
 
 				break;
 			default:
-				RT_TRACE(COMP_ERR, "PHY_SetRF8256Bandwidth(): unknown Bandwidth: %#X\n",Bandwidth );
+				RT_TRACE(COMP_ERR, "PHY_SetRF8256Bandwidth(): unknown Bandwidth: %#X\n",Bandwidth);
 				break;
 
 		}
@@ -86,7 +86,7 @@
  * Output:      NONE
  * Return:      NONE
  *---------------------------------------------------------------------------*/
-void PHY_RF8256_Config(struct net_device* dev)
+void PHY_RF8256_Config(struct net_device *dev)
 {
 	struct r8192_priv *priv = ieee80211_priv(dev);
 	// Initialize general global value
@@ -104,7 +104,7 @@
  * Output:      NONE
  * Return:      NONE
  *---------------------------------------------------------------------------*/
-void phy_RF8256_Config_ParaFile(struct net_device* dev)
+void phy_RF8256_Config_ParaFile(struct net_device *dev)
 {
 	u32	u4RegValue = 0;
 	//static s1Byte				szRadioAFile[] = RTL819X_PHY_RADIO_A;
@@ -133,7 +133,7 @@
 	//	pHalData->RfReg0Value[eRFPath] =  rtl8192_phy_QueryRFReg(dev, (RF90_RADIO_PATH_E)eRFPath, rGlobalCtrl, bMaskDWord);
 
 		/*----Store original RFENV control type----*/
-		switch(eRFPath)
+		switch (eRFPath)
 		{
 		case RF90_PATH_A:
 		case RF90_PATH_C:
@@ -168,7 +168,7 @@
 		RetryTimes = ConstRetryTimes;
 		RF3_Final_Value = 0;
 		/*----Initialize RF fom connfiguration file----*/
-		switch(eRFPath)
+		switch (eRFPath)
 		{
 		case RF90_PATH_A:
 			while(RF3_Final_Value!=RegValueToBeCheck && RetryTimes!=0)
@@ -209,7 +209,7 @@
 		}
 
 		/*----Restore RFENV control type----*/;
-		switch(eRFPath)
+		switch (eRFPath)
 		{
 		case RF90_PATH_A:
 		case RF90_PATH_C:
@@ -237,14 +237,14 @@
 }
 
 
-void PHY_SetRF8256CCKTxPower(struct net_device*	dev, u8	powerlevel)
+void PHY_SetRF8256CCKTxPower(struct net_device *dev, u8 powerlevel)
 {
 	u32	TxAGC=0;
 	struct r8192_priv *priv = ieee80211_priv(dev);
 	//modified by vivi, 20080109
 	TxAGC = powerlevel;
 
-	if(priv->bDynamicTxLowPower == TRUE ) //cosa 05/22/2008 for scan
+	if(priv->bDynamicTxLowPower == TRUE) //cosa 05/22/2008 for scan
 	{
 		if(priv->CustomerID == RT_CID_819x_Netcore)
 			TxAGC = 0x22;
@@ -258,7 +258,7 @@
 }
 
 
-void PHY_SetRF8256OFDMTxPower(struct net_device* dev, u8 powerlevel)
+void PHY_SetRF8256OFDMTxPower(struct net_device *dev, u8 powerlevel)
 {
 	struct r8192_priv *priv = ieee80211_priv(dev);
 	//Joseph TxPower for 8192 testing

diff --git a/drivers/staging/rtl8192u/r8190_rtl8256.h b/drivers/staging/rtl8192u/r8190_rtl8256.h
index 5c1f650..b64dd66 100644
--- a/drivers/staging/rtl8192u/r8190_rtl8256.h
+++ b/drivers/staging/rtl8192u/r8190_rtl8256.h

@@ -18,10 +18,10 @@
 #else
 #define RTL819X_TOTAL_RF_PATH 2 //for 8192U
 #endif
-extern void PHY_SetRF8256Bandwidth(struct net_device* dev , HT_CHANNEL_WIDTH Bandwidth);
-extern void PHY_RF8256_Config(struct net_device* dev);
-extern void phy_RF8256_Config_ParaFile(struct net_device* dev);
-extern void PHY_SetRF8256CCKTxPower(struct net_device*	dev, u8	powerlevel);
-extern void PHY_SetRF8256OFDMTxPower(struct net_device* dev, u8 powerlevel);
+extern void PHY_SetRF8256Bandwidth(struct net_device *dev , HT_CHANNEL_WIDTH Bandwidth);
+extern void PHY_RF8256_Config(struct net_device *dev);
+extern void phy_RF8256_Config_ParaFile(struct net_device *dev);
+extern void PHY_SetRF8256CCKTxPower(struct net_device *dev, u8	powerlevel);
+extern void PHY_SetRF8256OFDMTxPower(struct net_device *dev, u8 powerlevel);
 
 #endif

diff --git a/drivers/staging/rtl8192u/r8192U.h b/drivers/staging/rtl8192u/r8192U.h
index bedeb33..338e7bc 100644
--- a/drivers/staging/rtl8192u/r8192U.h
+++ b/drivers/staging/rtl8192u/r8192U.h

@@ -1,40 +1,38 @@
 /*
-   This is part of rtl8187 OpenSource driver.
-   Copyright (C) Andrea Merello 2004-2005  <andreamrl@tiscali.it>
-   Released under the terms of GPL (General Public Licence)
-
-   Parts of this driver are based on the GPL part of the
-   official realtek driver
-
-   Parts of this driver are based on the rtl8192 driver skeleton
-   from Patric Schenke & Andres Salomon
-
-   Parts of this driver are based on the Intel Pro Wireless 2100 GPL driver
-
-   We want to thank the Authors of those projects and the Ndiswrapper
-   project Authors.
-*/
+ * This is part of rtl8187 OpenSource driver.
+ * Copyright (C) Andrea Merello 2004-2005  <andreamrl@tiscali.it>
+ * Released under the terms of GPL (General Public Licence)
+ *
+ * Parts of this driver are based on the GPL part of the
+ * official realtek driver
+ *
+ * Parts of this driver are based on the rtl8192 driver skeleton
+ * from Patric Schenke & Andres Salomon
+ *
+ * Parts of this driver are based on the Intel Pro Wireless 2100 GPL driver
+ *
+ * We want to thank the Authors of those projects and the Ndiswrapper
+ * project Authors.
+ */
 
 #ifndef R819xU_H
 #define R819xU_H
 
 #include <linux/module.h>
 #include <linux/kernel.h>
-//#include <linux/config.h>
 #include <linux/init.h>
 #include <linux/ioport.h>
 #include <linux/sched.h>
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/netdevice.h>
-//#include <linux/pci.h>
 #include <linux/usb.h>
 #include <linux/etherdevice.h>
 #include <linux/delay.h>
-#include <linux/rtnetlink.h>	//for rtnl_lock()
+#include <linux/rtnetlink.h>
 #include <linux/wireless.h>
 #include <linux/timer.h>
-#include <linux/proc_fs.h>	// Necessary because we use the proc fs
+#include <linux/proc_fs.h>
 #include <linux/if_arp.h>
 #include <linux/random.h>
 #include <asm/io.h>
@@ -42,7 +40,7 @@
 
 #define RTL8192U
 #define RTL819xU_MODULE_NAME "rtl819xU"
-//added for HW security, john.0629
+/* HW security */
 #define FALSE 0
 #define TRUE 1
 #define MAX_KEY_LEN     61
@@ -81,90 +79,91 @@
 #define BIT30           0x40000000
 #define BIT31           0x80000000
 
-// Rx smooth factor
 #define	Rx_Smooth_Factor		20
-#define DMESG(x,a...)
-#define DMESGW(x,a...)
-#define DMESGE(x,a...)
+#define DMESG(x, a...)
+#define DMESGW(x, a...)
+#define DMESGE(x, a...)
 extern u32 rt_global_debug_component;
 #define RT_TRACE(component, x, args...) \
-do { if(rt_global_debug_component & component) \
-	printk(KERN_DEBUG RTL819xU_MODULE_NAME ":" x "\n" , \
-	       ##args);\
-}while(0);
+	do {							\
+		if (rt_global_debug_component & component)	\
+			pr_debug("RTL8192U: " x "\n", ##args);	\
+	} while (0)
 
-#define COMP_TRACE				BIT0		// For function call tracing.
-#define COMP_DBG				BIT1		// Only for temporary debug message.
-#define COMP_INIT				BIT2		// during driver initialization / halt / reset.
+#define COMP_TRACE              BIT0  /* Function call tracing. */
+#define COMP_DBG                BIT1
+#define COMP_INIT               BIT2  /* Driver initialization/halt/reset. */
 
 
-#define COMP_RECV				BIT3		// Receive data path.
-#define COMP_SEND				BIT4		// Send part path.
-#define COMP_IO					BIT5		// I/O Related. Added by Annie, 2006-03-02.
-#define COMP_POWER				BIT6		// 802.11 Power Save mode or System/Device Power state related.
-#define COMP_EPROM				BIT7		// 802.11 link related: join/start BSS, leave BSS.
-#define COMP_SWBW				BIT8	// For bandwidth switch.
-#define COMP_POWER_TRACKING			BIT9	//FOR 8190 TX POWER TRACKING
-#define COMP_TURBO				BIT10	// For Turbo Mode related. By Annie, 2005-10-21.
-#define COMP_QOS				BIT11	// For QoS.
-#define COMP_RATE				BIT12	// For Rate Adaptive mechanism, 2006.07.02, by rcnjko.
-#define COMP_RM					BIT13	// For Radio Measurement.
-#define COMP_DIG				BIT14	// For DIG, 2006.09.25, by rcnjko.
-#define COMP_PHY				BIT15
-#define COMP_CH					BIT16	//channel setting debug
-#define COMP_TXAGC				BIT17	// For Tx power, 060928, by rcnjko.
-#define COMP_HIPWR				BIT18	// For High Power Mechanism, 060928, by rcnjko.
-#define COMP_HALDM				BIT19	// For HW Dynamic Mechanism, 061010, by rcnjko.
-#define COMP_SEC			        BIT20	// Event handling
-#define COMP_LED				BIT21	// For LED.
-#define COMP_RF					BIT22	// For RF.
-//1!!!!!!!!!!!!!!!!!!!!!!!!!!!
-#define COMP_RXDESC				BIT23	// Show Rx desc information for SD3 debug. Added by Annie, 2006-07-15.
-//1//1Attention Please!!!<11n or 8190 specific code should be put below this line>
-//1!!!!!!!!!!!!!!!!!!!!!!!!!!!
+#define COMP_RECV               BIT3  /* Receive data path. */
+#define COMP_SEND               BIT4  /* Send data path. */
+#define COMP_IO                 BIT5
+/* 802.11 Power Save mode or System/Device Power state. */
+#define COMP_POWER              BIT6
+/* 802.11 link related: join/start BSS, leave BSS. */
+#define COMP_EPROM              BIT7
+#define COMP_SWBW               BIT8  /* Bandwidth switch. */
+#define COMP_POWER_TRACKING     BIT9  /* 8190 TX Power Tracking */
+#define COMP_TURBO              BIT10 /* Turbo Mode */
+#define COMP_QOS                BIT11
+#define COMP_RATE               BIT12 /* Rate Adaptive mechanism */
+#define COMP_RM                 BIT13 /* Radio Measurement */
+#define COMP_DIG                BIT14
+#define COMP_PHY                BIT15
+#define COMP_CH                 BIT16 /* Channel setting debug */
+#define COMP_TXAGC              BIT17 /* Tx power */
+#define COMP_HIPWR              BIT18 /* High Power Mechanism */
+#define COMP_HALDM              BIT19 /* HW Dynamic Mechanism */
+#define COMP_SEC                BIT20 /* Event handling */
+#define COMP_LED                BIT21
+#define COMP_RF                 BIT22
+#define COMP_RXDESC             BIT23 /* Rx desc information for SD3 debug */
 
-#define COMP_FIRMWARE				BIT24	//for firmware downloading
-#define COMP_HT					BIT25	// For 802.11n HT related information. by Emily 2006-8-11
-#define COMP_AMSDU				BIT26	// For A-MSDU Debugging
+/* 11n or 8190 specific code */
 
-#define COMP_SCAN				BIT27
-//#define COMP_RESET				BIT28
-#define COMP_DOWN				BIT29  //for rm driver module
-#define COMP_RESET				BIT30  //for silent reset
-#define COMP_ERR				BIT31 //for error out, always on
+#define COMP_FIRMWARE           BIT24 /* Firmware downloading */
+#define COMP_HT                 BIT25 /* 802.11n HT related information */
+#define COMP_AMSDU              BIT26 /* A-MSDU Debugging */
+#define COMP_SCAN               BIT27
+#define COMP_DOWN               BIT29 /* rm driver module */
+#define COMP_RESET              BIT30 /* Silent reset */
+#define COMP_ERR                BIT31 /* Error out, always on */
 
 #define RTL819x_DEBUG
 #ifdef RTL819x_DEBUG
-#define assert(expr) \
-	if (!(expr)) {                                  \
-		printk( "Assertion failed! %s,%s,%s,line=%d\n", \
-		#expr,__FILE__,__FUNCTION__,__LINE__);          \
-	}
-//wb added to debug out data buf
-//if you want print DATA buffer related BA, please set ieee80211_debug_level to DATA|BA
-#define RT_DEBUG_DATA(level, data, datalen)      \
-	do{ if ((rt_global_debug_component & (level)) == (level))   \
-		{       \
-			int i;                                  \
-			u8* pdata = (u8*) data;                 \
-			printk(KERN_DEBUG RTL819xU_MODULE_NAME ": %s()\n", __FUNCTION__);   \
-			for(i=0; i<(int)(datalen); i++)                 \
-			{                                               \
+#define RTL8192U_ASSERT(expr) \
+	do {								\
+		if (!(expr)) {						\
+			pr_debug("Assertion failed! %s, %s, %s, line = %d\n", \
+				 #expr, __FILE__, __func__, __LINE__);	\
+		}							\
+	} while (0)
+/*
+ * Debug out data buf.
+ * If you want to print DATA buffer related BA,
+ * please set ieee80211_debug_level to DATA|BA
+ */
+#define RT_DEBUG_DATA(level, data, datalen) \
+	do {								\
+		if ((rt_global_debug_component & (level)) == (level)) {	\
+			int i;						\
+			u8 *pdata = (u8 *) data;			\
+			pr_debug("RTL8192U: %s()\n", __func__);		\
+			for (i = 0; i < (int)(datalen); i++) {		\
 				printk("%2x ", pdata[i]);               \
-				if ((i+1)%16 == 0) printk("\n");        \
-			}                               \
-			printk("\n");                   \
-		}                                       \
+				if ((i+1)%16 == 0)			\
+					printk("\n");			\
+			}						\
+			printk("\n");					\
+		}							\
 	} while (0)
 #else
-#define assert(expr) do {} while (0)
-#define RT_DEBUG_DATA(level, data, datalen) do {} while(0)
+#define RTL8192U_ASSERT(expr) do {} while (0)
+#define RT_DEBUG_DATA(level, data, datalen) do {} while (0)
 #endif /* RTL8169_DEBUG */
 
 
-//
-// Queue Select Value in TxDesc
-//
+/* Queue Select Value in TxDesc */
 #define QSLT_BK                                 0x1
 #define QSLT_BE                                 0x0
 #define QSLT_VI                                 0x4
@@ -208,13 +207,13 @@
 
 #define IEEE80211_WATCH_DOG_TIME    2000
 #define		PHY_Beacon_RSSI_SLID_WIN_MAX		10
-//for txpowertracking by amy
+/* For Tx Power Tracking */
 #define		OFDM_Table_Length	19
 #define	CCK_Table_length	12
 
-/* for rtl819x */
+/* For rtl819x */
 typedef struct _tx_desc_819x_usb {
-	//DWORD 0
+	/* DWORD 0 */
 	u16	PktSize;
 	u8	Offset;
 	u8	Reserved0:3;
@@ -224,7 +223,7 @@
 	u8	LINIP:1;
 	u8	OWN:1;
 
-	//DWORD 1
+	/* DWORD 1 */
 	u8	TxFWInfoSize;
 	u8	RATid:3;
 	u8	DISFB:1;
@@ -239,27 +238,26 @@
 	u8	SecDescAssign:1;
 	u8	SecType:2;
 
-	//DWORD 2
+	/* DWORD 2 */
 	u16	TxBufferSize;
-	//u16 Reserved2;
 	u8	ResvForPaddingLen:7;
 	u8	Reserved3:1;
 	u8	Reserved4;
 
-	//DWORD 3, 4, 5
+	/* DWORD 3, 4, 5 */
 	u32	Reserved5;
 	u32	Reserved6;
 	u32	Reserved7;
-}tx_desc_819x_usb, *ptx_desc_819x_usb;
+} tx_desc_819x_usb, *ptx_desc_819x_usb;
 
 #ifdef USB_TX_DRIVER_AGGREGATION_ENABLE
 typedef struct _tx_desc_819x_usb_aggr_subframe {
-	//DWORD 0
+	/* DWORD 0 */
 	u16	PktSize;
 	u8	Offset;
 	u8	TxFWInfoSize;
 
-	//DWORD 1
+	/* DWORD 1 */
 	u8	RATid:3;
 	u8	DISFB:1;
 	u8	USERATE:1;
@@ -274,13 +272,13 @@
 	u8	SecType:2;
 	u8	PacketID:7;
 	u8	OWN:1;
-}tx_desc_819x_usb_aggr_subframe, *ptx_desc_819x_usb_aggr_subframe;
+} tx_desc_819x_usb_aggr_subframe, *ptx_desc_819x_usb_aggr_subframe;
 #endif
 
 
 
 typedef struct _tx_desc_cmd_819x_usb {
-	//DWORD 0
+	/* DWORD 0 */
 	u16	Reserved0;
 	u8	Reserved1;
 	u8	Reserved2:3;
@@ -290,65 +288,64 @@
 	u8	LINIP:1;
 	u8	OWN:1;
 
-	//DOWRD 1
-	//u32	Reserved3;
+	/* DWORD 1 */
 	u8	TxFWInfoSize;
 	u8	Reserved3;
 	u8	QueueSelect;
 	u8	Reserved4;
 
-	//DOWRD 2
+	/* DWORD 2 */
 	u16	TxBufferSize;
 	u16	Reserved5;
 
-       //DWORD 3,4,5
-	//u32	TxBufferAddr;
-	//u32	NextDescAddress;
+	/* DWORD 3, 4, 5 */
 	u32	Reserved6;
 	u32	Reserved7;
 	u32	Reserved8;
-}tx_desc_cmd_819x_usb, *ptx_desc_cmd_819x_usb;
+} tx_desc_cmd_819x_usb, *ptx_desc_cmd_819x_usb;
 
 
 typedef struct _tx_fwinfo_819x_usb {
-	//DOWRD 0
-	u8		TxRate:7;
-	u8		CtsEnable:1;
-	u8		RtsRate:7;
-	u8		RtsEnable:1;
-	u8		TxHT:1;
-	u8		Short:1;                //Short PLCP for CCK, or short GI for 11n MCS
-	u8		TxBandwidth:1;          // This is used for HT MCS rate only.
-	u8		TxSubCarrier:2;         // This is used for legacy OFDM rate only.
-	u8		STBC:2;
-	u8		AllowAggregation:1;
-	u8		RtsHT:1;                //Interpret RtsRate field as high throughput data rate
-	u8		RtsShort:1;             //Short PLCP for CCK, or short GI for 11n MCS
-	u8		RtsBandwidth:1;         // This is used for HT MCS rate only.
-	u8		RtsSubcarrier:2;        // This is used for legacy OFDM rate only.
-	u8		RtsSTBC:2;
-	u8		EnableCPUDur:1;         //Enable firmware to recalculate and assign packet duration
+	/* DWORD 0 */
+	u8	TxRate:7;
+	u8	CtsEnable:1;
+	u8	RtsRate:7;
+	u8	RtsEnable:1;
+	u8	TxHT:1;
+	u8	Short:1;        /* Short PLCP for CCK or short GI for 11n MCS */
+	u8	TxBandwidth:1;	/* Used for HT MCS rate only */
+	u8	TxSubCarrier:2; /* Used for legacy OFDM rate only */
+	u8	STBC:2;
+	u8	AllowAggregation:1;
+	/* Interpret RtsRate field as high throughput data rate */
+	u8	RtsHT:1;
+	u8	RtsShort:1;     /* Short PLCP for CCK or short GI for 11n MCS */
+	u8	RtsBandwidth:1;	/* Used for HT MCS rate only */
+	u8	RtsSubcarrier:2;/* Used for legacy OFDM rate only */
+	u8	RtsSTBC:2;
+	/* Enable firmware to recalculate and assign packet duration */
+	u8	EnableCPUDur:1;
 
-	//DWORD 1
-	u32		RxMF:2;
-	u32		RxAMD:3;
-	u32		TxPerPktInfoFeedback:1;//1 indicate Tx info gathtered by firmware and returned by Rx Cmd
-	u32		Reserved1:2;
-	u32		TxAGCOffSet:4;
-	u32		TxAGCSign:1;
-	u32		Tx_INFO_RSVD:6;
-	u32		PacketID:13;
-	//u32                Reserved;
-}tx_fwinfo_819x_usb, *ptx_fwinfo_819x_usb;
+	/* DWORD 1 */
+	u32	RxMF:2;
+	u32	RxAMD:3;
+	/* 1 indicate Tx info gathered by firmware and returned by Rx Cmd */
+	u32	TxPerPktInfoFeedback:1;
+	u32	Reserved1:2;
+	u32	TxAGCOffSet:4;
+	u32	TxAGCSign:1;
+	u32	Tx_INFO_RSVD:6;
+	u32	PacketID:13;
+} tx_fwinfo_819x_usb, *ptx_fwinfo_819x_usb;
 
 typedef struct rtl8192_rx_info {
 	struct urb *urb;
 	struct net_device *dev;
 	u8 out_pipe;
-}rtl8192_rx_info ;
+} rtl8192_rx_info ;
 
-typedef struct rx_desc_819x_usb{
-	//DOWRD 0
+typedef struct rx_desc_819x_usb {
+	/* DWORD 0 */
 	u16                 Length:14;
 	u16                 CRC32:1;
 	u16                 ICV:1;
@@ -356,47 +353,32 @@
 	u8                  Shift:2;
 	u8                  PHYStatus:1;
 	u8                  SWDec:1;
-	//u8                LastSeg:1;
-	//u8                FirstSeg:1;
-	//u8                EOR:1;
-	//u8                OWN:1;
 	u8                  Reserved1:4;
 
-	//DWORD 1
+	/* DWORD 1 */
 	u32                 Reserved2;
-
-	//DWORD 2
-	//u32               Reserved3;
-
-	//DWORD 3
-	//u32                BufferAddress;
-
-}rx_desc_819x_usb, *prx_desc_819x_usb;
+} rx_desc_819x_usb, *prx_desc_819x_usb;
 
 #ifdef USB_RX_AGGREGATION_SUPPORT
-typedef struct _rx_desc_819x_usb_aggr_subframe{
-	//DOWRD 0
+typedef struct _rx_desc_819x_usb_aggr_subframe {
+	/* DWORD 0 */
 	u16			Length:14;
 	u16			CRC32:1;
 	u16			ICV:1;
 	u8			Offset;
 	u8			RxDrvInfoSize;
-	//DOWRD 1
+	/* DWORD 1 */
 	u8			Shift:2;
 	u8			PHYStatus:1;
 	u8			SWDec:1;
 	u8			Reserved1:4;
 	u8			Reserved2;
 	u16			Reserved3;
-	//DWORD 2
-	//u4Byte		Reserved3;
-	//DWORD 3
-	//u4Byte		BufferAddress;
-}rx_desc_819x_usb_aggr_subframe, *prx_desc_819x_usb_aggr_subframe;
+} rx_desc_819x_usb_aggr_subframe, *prx_desc_819x_usb_aggr_subframe;
 #endif
 
-typedef struct rx_drvinfo_819x_usb{
-	//DWORD 0
+typedef struct rx_drvinfo_819x_usb {
+	/* DWORD 0 */
 	u16                 Reserved1:12;
 	u16                 PartAggr:1;
 	u16                 FirstAGGR:1;
@@ -413,14 +395,15 @@
 	u8                  Bcast:1;
 	u8                  Reserved4:1;
 
-	//DWORD 1
+	/* DWORD 1 */
 	u32                  TSFL;
 
-}rx_drvinfo_819x_usb, *prx_drvinfo_819x_usb;
+} rx_drvinfo_819x_usb, *prx_drvinfo_819x_usb;
 
-
-#define MAX_DEV_ADDR_SIZE		8  /* support till 64 bit bus width OS */
-#define MAX_FIRMWARE_INFORMATION_SIZE   32 /*2006/04/30 by Emily forRTL8190*/
+/* Support till 64 bit bus width OS */
+#define MAX_DEV_ADDR_SIZE		8
+/* For RTL8190 */
+#define MAX_FIRMWARE_INFORMATION_SIZE   32
 #define MAX_802_11_HEADER_LENGTH        (40 + MAX_FIRMWARE_INFORMATION_SIZE)
 #define ENCRYPTION_MAX_OVERHEAD		128
 #define	USB_HWDESC_HEADER_LEN		sizeof(tx_desc_819x_usb)
@@ -438,55 +421,55 @@
 #ifdef USB_TX_DRIVER_AGGREGATION_ENABLE
 #define TX_PACKET_DRVAGGR_SUBFRAME_SHIFT_BYTES (sizeof(tx_desc_819x_usb_aggr_subframe) + sizeof(tx_fwinfo_819x_usb))
 #endif
-#define scrclng					4		// octets for crc32 (FCS, ICV)
+/* Octets for crc32 (FCS, ICV) */
+#define scrclng					4
 
-typedef enum rf_optype
-{
+typedef enum rf_optype {
 	RF_OP_By_SW_3wire = 0,
 	RF_OP_By_FW,
 	RF_OP_MAX
-}rf_op_type;
+} rf_op_type;
 /* 8190 Loopback Mode definition */
-typedef enum _rtl819xUsb_loopback{
+typedef enum _rtl819xUsb_loopback {
 	RTL819xU_NO_LOOPBACK = 0,
 	RTL819xU_MAC_LOOPBACK = 1,
 	RTL819xU_DMA_LOOPBACK = 2,
 	RTL819xU_CCK_LOOPBACK = 3,
-}rtl819xUsb_loopback_e;
+} rtl819xUsb_loopback_e;
 
 /* due to rtl8192 firmware */
-typedef enum _desc_packet_type_e{
+typedef enum _desc_packet_type_e {
 	DESC_PACKET_TYPE_INIT = 0,
 	DESC_PACKET_TYPE_NORMAL = 1,
-}desc_packet_type_e;
+} desc_packet_type_e;
 
-typedef enum _firmware_status{
+typedef enum _firmware_status {
 	FW_STATUS_0_INIT = 0,
 	FW_STATUS_1_MOVE_BOOT_CODE = 1,
 	FW_STATUS_2_MOVE_MAIN_CODE = 2,
 	FW_STATUS_3_TURNON_CPU = 3,
 	FW_STATUS_4_MOVE_DATA_CODE = 4,
 	FW_STATUS_5_READY = 5,
-}firmware_status_e;
+} firmware_status_e;
 
 typedef struct _rt_firmare_seg_container {
 	u16	seg_size;
 	u8	*seg_ptr;
-}fw_seg_container, *pfw_seg_container;
-typedef struct _rt_firmware{
+} fw_seg_container, *pfw_seg_container;
+typedef struct _rt_firmware {
 	firmware_status_e firmware_status;
 	u16               cmdpacket_frag_thresold;
-#define RTL8190_MAX_FIRMWARE_CODE_SIZE  64000   //64k
+#define RTL8190_MAX_FIRMWARE_CODE_SIZE  64000
 	u8                firmware_buf[RTL8190_MAX_FIRMWARE_CODE_SIZE];
 	u16               firmware_buf_size;
-}rt_firmware, *prt_firmware;
+} rt_firmware, *prt_firmware;
 
-//+by amy 080507
-#define MAX_RECEIVE_BUFFER_SIZE	9100	// Add this to 9100 bytes to receive A-MSDU from RT-AP
+/* Add this to 9100 bytes to receive A-MSDU from RT-AP */
+#define MAX_RECEIVE_BUFFER_SIZE	9100
 
-typedef struct _rt_firmware_info_819xUsb{
+typedef struct _rt_firmware_info_819xUsb {
 	u8		sz_info[16];
-}rt_firmware_info_819xUsb, *prt_firmware_info_819xUsb;
+} rt_firmware_info_819xUsb, *prt_firmware_info_819xUsb;
 
 /* Firmware Queue Layout */
 #define NUM_OF_FIRMWARE_QUEUE		10
@@ -527,8 +510,11 @@
 #define RSVD_FW_QUEUE_PAGE_CMD_SHIFT	0x08
 #define RSVD_FW_QUEUE_PAGE_BCN_SHIFT	0x00
 #define RSVD_FW_QUEUE_PAGE_PUB_SHIFT	0x08
-//=================================================================
-//=================================================================
+
+/*
+ * =================================================================
+ * =================================================================
+ */
 
 #define EPROM_93c46 0
 #define EPROM_93c56 1
@@ -557,7 +543,7 @@
 } WIRELESS_MODE;
 
 
-#define RTL_IOCTL_WPA_SUPPLICANT		SIOCIWFIRSTPRIV+30
+#define RTL_IOCTL_WPA_SUPPLICANT		(SIOCIWFIRSTPRIV + 30)
 
 typedef struct buffer {
 	struct buffer *next;
@@ -565,7 +551,7 @@
 
 } buffer;
 
-typedef struct rtl_reg_debug{
+typedef struct rtl_reg_debug {
 	unsigned int  cmd;
 	struct {
 		unsigned char type;
@@ -574,7 +560,7 @@
 		unsigned char length;
 	} head;
 	unsigned char buf[0xff];
-}rtl_reg_debug;
+} rtl_reg_debug;
 
 
 
@@ -584,58 +570,45 @@
 typedef struct _rt_9x_tx_rate_history {
 	u32             cck[4];
 	u32             ofdm[8];
-	// HT_MCS[0][]: BW=0 SG=0
-	// HT_MCS[1][]: BW=1 SG=0
-	// HT_MCS[2][]: BW=0 SG=1
-	// HT_MCS[3][]: BW=1 SG=1
 	u32             ht_mcs[4][16];
-}rt_tx_rahis_t, *prt_tx_rahis_t;
+} rt_tx_rahis_t, *prt_tx_rahis_t;
 typedef struct _RT_SMOOTH_DATA_4RF {
-	char    elements[4][100];//array to store values
-	u32     index;                  //index to current array to store
-	u32     TotalNum;               //num of valid elements
-	u32     TotalVal[4];            //sum of valid elements
-}RT_SMOOTH_DATA_4RF, *PRT_SMOOTH_DATA_4RF;
+	char    elements[4][100]; /* array to store values */
+	u32     index;            /* index to current array to store */
+	u32     TotalNum;         /* num of valid elements */
+	u32     TotalVal[4];      /* sum of valid elements */
+} RT_SMOOTH_DATA_4RF, *PRT_SMOOTH_DATA_4RF;
 
-#define MAX_8192U_RX_SIZE			8192    // This maybe changed for D-cut larger aggregation size
-//stats seems messed up, clean it ASAP
+/* This may be changed for D-cut larger aggregation size */
+#define MAX_8192U_RX_SIZE			8192
+/* Stats seems messed up, clean it ASAP */
 typedef struct Stats {
 	unsigned long txrdu;
-//	unsigned long rxrdu;
-	//unsigned long rxnolast;
-	//unsigned long rxnodata;
-//	unsigned long rxreset;
-//	unsigned long rxnopointer;
 	unsigned long rxok;
 	unsigned long rxframgment;
 	unsigned long rxurberr;
 	unsigned long rxstaterr;
-	unsigned long received_rate_histogram[4][32];	//0: Total, 1:OK, 2:CRC, 3:ICV, 2007 07 03 cosa
-	unsigned long received_preamble_GI[2][32];		//0: Long preamble/GI, 1:Short preamble/GI
-	unsigned long rx_AMPDUsize_histogram[5]; // level: (<4K), (4K~8K), (8K~16K), (16K~32K), (32K~64K)
-	unsigned long rx_AMPDUnum_histogram[5]; // level: (<5), (5~10), (10~20), (20~40), (>40)
-	unsigned long numpacket_matchbssid;	// debug use only.
-	unsigned long numpacket_toself;		// debug use only.
-	unsigned long num_process_phyinfo;		// debug use only.
+	/* 0: Total, 1: OK, 2: CRC, 3: ICV */
+	unsigned long received_rate_histogram[4][32];
+	/* 0: Long preamble/GI, 1: Short preamble/GI */
+	unsigned long received_preamble_GI[2][32];
+	/* level: (<4K), (4K~8K), (8K~16K), (16K~32K), (32K~64K) */
+	unsigned long rx_AMPDUsize_histogram[5];
+	/* level: (<5), (5~10), (10~20), (20~40), (>40) */
+	unsigned long rx_AMPDUnum_histogram[5];
+	unsigned long numpacket_matchbssid;
+	unsigned long numpacket_toself;
+	unsigned long num_process_phyinfo;
 	unsigned long numqry_phystatus;
 	unsigned long numqry_phystatusCCK;
 	unsigned long numqry_phystatusHT;
-	unsigned long received_bwtype[5];              //0: 20M, 1: funn40M, 2: upper20M, 3: lower20M, 4: duplicate
+	/* 0: 20M, 1: funn40M, 2: upper20M, 3: lower20M, 4: duplicate */
+	unsigned long received_bwtype[5];
 	unsigned long txnperr;
 	unsigned long txnpdrop;
 	unsigned long txresumed;
-//	unsigned long rxerr;
-//	unsigned long rxoverflow;
-//	unsigned long rxint;
 	unsigned long txnpokint;
-//	unsigned long txhpokint;
-//	unsigned long txhperr;
-//	unsigned long ints;
-//	unsigned long shints;
 	unsigned long txoverflow;
-//	unsigned long rxdmafail;
-//	unsigned long txbeacon;
-//	unsigned long txbeaconerr;
 	unsigned long txlpokint;
 	unsigned long txlpdrop;
 	unsigned long txlperr;
@@ -684,30 +657,35 @@
 	u8	      last_packet_rate;
 	unsigned long slide_signal_strength[100];
 	unsigned long slide_evm[100];
-	unsigned long slide_rssi_total;	// For recording sliding window's RSSI value
-	unsigned long slide_evm_total;	// For recording sliding window's EVM value
-	long signal_strength; // Transformed, in dbm. Beautified signal strength for UI, not correct.
+	/* For recording sliding window's RSSI value */
+	unsigned long slide_rssi_total;
+	/* For recording sliding window's EVM value */
+	unsigned long slide_evm_total;
+	/* Transformed in dbm. Beautified signal strength for UI, not correct */
+	long signal_strength;
 	long signal_quality;
 	long last_signal_strength_inpercent;
-	long recv_signal_power;	// Correct smoothed ss in Dbm, only used in driver to report real power now.
+	/* Correct smoothed ss in dbm, only used in driver
+	 * to report real power now */
+	long recv_signal_power;
 	u8 rx_rssi_percentage[4];
 	u8 rx_evm_percentage[2];
 	long rxSNRdB[4];
 	rt_tx_rahis_t txrate;
-	u32 Slide_Beacon_pwdb[100];     //cosa add for beacon rssi
-	u32 Slide_Beacon_Total;         //cosa add for beacon rssi
+	/* For beacon RSSI */
+	u32 Slide_Beacon_pwdb[100];
+	u32 Slide_Beacon_Total;
 	RT_SMOOTH_DATA_4RF              cck_adc_pwdb;
 
 	u32	CurrentShowTxate;
 } Stats;
 
 
-// Bandwidth Offset
+/* Bandwidth Offset */
 #define HAL_PRIME_CHNL_OFFSET_DONT_CARE		0
 #define HAL_PRIME_CHNL_OFFSET_LOWER			1
 #define HAL_PRIME_CHNL_OFFSET_UPPER			2
 
-//+by amy 080507
 
 typedef struct	ChnlAccessSetting {
 	u16 SIFS_Timer;
@@ -716,35 +694,62 @@
 	u16 EIFS_Timer;
 	u16 CWminIndex;
 	u16 CWmaxIndex;
-}*PCHANNEL_ACCESS_SETTING,CHANNEL_ACCESS_SETTING;
+} *PCHANNEL_ACCESS_SETTING, CHANNEL_ACCESS_SETTING;
 
-typedef struct _BB_REGISTER_DEFINITION{
-	u32 rfintfs;			// set software control: //		0x870~0x877[8 bytes]
-	u32 rfintfi;			// readback data: //		0x8e0~0x8e7[8 bytes]
-	u32 rfintfo;			// output data: //		0x860~0x86f [16 bytes]
-	u32 rfintfe;			// output enable: //		0x860~0x86f [16 bytes]
-	u32 rf3wireOffset;		// LSSI data: //		0x840~0x84f [16 bytes]
-	u32 rfLSSI_Select;		// BB Band Select: //		0x878~0x87f [8 bytes]
-	u32 rfTxGainStage;		// Tx gain stage: //		0x80c~0x80f [4 bytes]
-	u32 rfHSSIPara1;		// wire parameter control1 : //		0x820~0x823,0x828~0x82b, 0x830~0x833, 0x838~0x83b [16 bytes]
-	u32 rfHSSIPara2;		// wire parameter control2 : //		0x824~0x827,0x82c~0x82f, 0x834~0x837, 0x83c~0x83f [16 bytes]
-	u32 rfSwitchControl;	//Tx Rx antenna control : //		0x858~0x85f [16 bytes]
-	u32 rfAGCControl1;	//AGC parameter control1 : //		0xc50~0xc53,0xc58~0xc5b, 0xc60~0xc63, 0xc68~0xc6b [16 bytes]
-	u32 rfAGCControl2;	//AGC parameter control2 : //		0xc54~0xc57,0xc5c~0xc5f, 0xc64~0xc67, 0xc6c~0xc6f [16 bytes]
-	u32 rfRxIQImbalance;	//OFDM Rx IQ imbalance matrix : //		0xc14~0xc17,0xc1c~0xc1f, 0xc24~0xc27, 0xc2c~0xc2f [16 bytes]
-	u32 rfRxAFE;			//Rx IQ DC ofset and Rx digital filter, Rx DC notch filter : //		0xc10~0xc13,0xc18~0xc1b, 0xc20~0xc23, 0xc28~0xc2b [16 bytes]
-	u32 rfTxIQImbalance;	//OFDM Tx IQ imbalance matrix //		0xc80~0xc83,0xc88~0xc8b, 0xc90~0xc93, 0xc98~0xc9b [16 bytes]
-	u32 rfTxAFE;			//Tx IQ DC Offset and Tx DFIR type //		0xc84~0xc87,0xc8c~0xc8f, 0xc94~0xc97, 0xc9c~0xc9f [16 bytes]
-	u32 rfLSSIReadBack;	//LSSI RF readback data //		0x8a0~0x8af [16 bytes]
-}BB_REGISTER_DEFINITION_T, *PBB_REGISTER_DEFINITION_T;
+typedef struct _BB_REGISTER_DEFINITION {
+	/* set software control:        0x870~0x877 [8 bytes]  */
+	u32 rfintfs;
+	/* readback data:               0x8e0~0x8e7 [8 bytes]  */
+	u32 rfintfi;
+	/* output data:                 0x860~0x86f [16 bytes] */
+	u32 rfintfo;
+	/* output enable:               0x860~0x86f [16 bytes] */
+	u32 rfintfe;
+	/* LSSI data:                   0x840~0x84f [16 bytes] */
+	u32 rf3wireOffset;
+	/* BB Band Select:              0x878~0x87f [8 bytes]  */
+	u32 rfLSSI_Select;
+	/* Tx gain stage:               0x80c~0x80f [4 bytes]  */
+	u32 rfTxGainStage;
+	/* wire parameter control1:     0x820~0x823, 0x828~0x82b,
+	 *                              0x830~0x833, 0x838~0x83b [16 bytes] */
+	u32 rfHSSIPara1;
+	/* wire parameter control2:     0x824~0x827, 0x82c~0x82f,
+	 *                              0x834~0x837, 0x83c~0x83f [16 bytes] */
+	u32 rfHSSIPara2;
+	/* Tx Rx antenna control:       0x858~0x85f [16 bytes] */
+	u32 rfSwitchControl;
+	/* AGC parameter control1:	0xc50~0xc53, 0xc58~0xc5b,
+	 *                              0xc60~0xc63, 0xc68~0xc6b [16 bytes] */
+	u32 rfAGCControl1;
+	/* AGC parameter control2:      0xc54~0xc57, 0xc5c~0xc5f,
+	 *                              0xc64~0xc67, 0xc6c~0xc6f [16 bytes] */
+	u32 rfAGCControl2;
+	/* OFDM Rx IQ imbalance matrix:	0xc14~0xc17, 0xc1c~0xc1f,
+	 *                              0xc24~0xc27, 0xc2c~0xc2f [16 bytes] */
+	u32 rfRxIQImbalance;
+	/* Rx IQ DC offset and Rx digital filter, Rx DC notch filter:
+	 *                              0xc10~0xc13, 0xc18~0xc1b,
+	 *                              0xc20~0xc23, 0xc28~0xc2b [16 bytes] */
+	u32 rfRxAFE;
+	/* OFDM Tx IQ imbalance matrix:	0xc80~0xc83, 0xc88~0xc8b,
+	 *                              0xc90~0xc93, 0xc98~0xc9b [16 bytes] */
+	u32 rfTxIQImbalance;
+	/* Tx IQ DC Offset and Tx DFIR type:
+	 *                              0xc84~0xc87, 0xc8c~0xc8f,
+	 *                              0xc94~0xc97, 0xc9c~0xc9f [16 bytes] */
+	u32 rfTxAFE;
+	/* LSSI RF readback data:       0x8a0~0x8af [16 bytes] */
+	u32 rfLSSIReadBack;
+} BB_REGISTER_DEFINITION_T, *PBB_REGISTER_DEFINITION_T;
 
-typedef enum _RT_RF_TYPE_819xU{
+typedef enum _RT_RF_TYPE_819xU {
 	RF_TYPE_MIN = 0,
 	RF_8225,
 	RF_8256,
 	RF_8258,
 	RF_PSEUDO_11N = 4,
-}RT_RF_TYPE_819xU, *PRT_RF_TYPE_819xU;
+} RT_RF_TYPE_819xU, *PRT_RF_TYPE_819xU;
 
 typedef struct _rate_adaptive {
 	u8				rate_adaptive_disabled;
@@ -762,9 +767,9 @@
 	u32				low_rssi_threshold_ratr;
 	u32				low_rssi_threshold_ratr_40M;
 	u32				low_rssi_threshold_ratr_20M;
-	u8				ping_rssi_enable;	//cosa add for test
-	u32				ping_rssi_ratr;	//cosa add for test
-	u32				ping_rssi_thresh_for_ra;//cosa add for test
+	u8				ping_rssi_enable;
+	u32				ping_rssi_ratr;
+	u32				ping_rssi_thresh_for_ra;
 	u32				last_ratr;
 
 } rate_adaptive, *prate_adaptive;
@@ -778,9 +783,9 @@
 } txbbgain_struct, *ptxbbgain_struct;
 
 typedef struct _ccktxbbgain_struct {
-	//The Value is from a22 to a29 one Byte one time is much Safer
+	/* The value is from a22 to a29, one byte one time is much safer */
 	u8	ccktxbb_valuearray[8];
-} ccktxbbgain_struct,*pccktxbbgain_struct;
+} ccktxbbgain_struct, *pccktxbbgain_struct;
 
 
 typedef struct _init_gain {
@@ -791,7 +796,6 @@
 	u8				cca;
 
 } init_gain, *pinit_gain;
-//by amy 0606
 
 typedef struct _phy_ofdm_rx_status_report_819xusb {
 	u8	trsw_gain_X[4];
@@ -807,26 +811,26 @@
 	u8	max_ex_pwr;
 	u8	sgi_en;
 	u8  rxsc_sgien_exflg;
-}phy_sts_ofdm_819xusb_t;
+} phy_sts_ofdm_819xusb_t;
 
 typedef struct _phy_cck_rx_status_report_819xusb {
-	/* For CCK rate descriptor. This is a unsigned 8:1 variable. LSB bit presend
-	   0.5. And MSB 7 bts presend a signed value. Range from -64~+63.5. */
+	/* For CCK rate descriptor. This is an unsigned 8:1 variable.
+	 * The LSB represents 0.5 and the upper 7 bits represent a signed
+	 * value. Range is -64~+63.5. */
 	u8	adc_pwdb_X[4];
 	u8	sq_rpt;
 	u8	cck_agc_rpt;
-}phy_sts_cck_819xusb_t;
+} phy_sts_cck_819xusb_t;
 
 
-typedef struct _phy_ofdm_rx_status_rxsc_sgien_exintfflag{
+typedef struct _phy_ofdm_rx_status_rxsc_sgien_exintfflag {
 	u8			reserved:4;
 	u8			rxsc:2;
 	u8			sgi_en:1;
 	u8			ex_intf_flag:1;
-}phy_ofdm_rx_status_rxsc_sgien_exintfflag;
+} phy_ofdm_rx_status_rxsc_sgien_exintfflag;
 
-typedef enum _RT_CUSTOMER_ID
-{
+typedef enum _RT_CUSTOMER_ID {
 	RT_CID_DEFAULT = 0,
 	RT_CID_8187_ALPHA0 = 1,
 	RT_CID_8187_SERCOMM_PS = 2,
@@ -836,25 +840,28 @@
 	RT_CID_819x_CAMEO  = 6,
 	RT_CID_819x_RUNTOP = 7,
 	RT_CID_819x_Senao = 8,
-	RT_CID_TOSHIBA = 9,	// Merge by Jacken, 2008/01/31.
+	RT_CID_TOSHIBA = 9,
 	RT_CID_819x_Netcore = 10,
 	RT_CID_Nettronix = 11,
 	RT_CID_DLINK = 12,
 	RT_CID_PRONET = 13,
-}RT_CUSTOMER_ID, *PRT_CUSTOMER_ID;
+} RT_CUSTOMER_ID, *PRT_CUSTOMER_ID;
 
-//================================================================================
-// LED customization.
-//================================================================================
+/*
+ * ==========================================================================
+ * LED customization.
+ * ==========================================================================
+ */
 
-typedef	enum _LED_STRATEGY_8190{
-	SW_LED_MODE0, // SW control 1 LED via GPIO0. It is default option.
-	SW_LED_MODE1, // SW control for PCI Express
-	SW_LED_MODE2, // SW control for Cameo.
-	SW_LED_MODE3, // SW contorl for RunTop.
-	SW_LED_MODE4, // SW control for Netcore
-	HW_LED, // HW control 2 LEDs, LED0 and LED1 (there are 4 different control modes)
-}LED_STRATEGY_8190, *PLED_STRATEGY_8190;
+typedef	enum _LED_STRATEGY_8190 {
+	SW_LED_MODE0, /* SW control 1 LED via GPIO0. It is default option. */
+	SW_LED_MODE1, /* SW control for PCI Express */
+	SW_LED_MODE2, /* SW control for Cameo. */
+	SW_LED_MODE3, /* SW control for RunTop. */
+	SW_LED_MODE4, /* SW control for Netcore. */
+	/* HW control 2 LEDs, LED0 and LED1 (4 different control modes) */
+	HW_LED,
+} LED_STRATEGY_8190, *PLED_STRATEGY_8190;
 
 typedef enum _RESET_TYPE {
 	RESET_TYPE_NORESET = 0x00,
@@ -863,7 +870,7 @@
 } RESET_TYPE;
 
 /* The simple tx command OP code. */
-typedef enum _tag_TxCmd_Config_Index{
+typedef enum _tag_TxCmd_Config_Index {
 	TXCMD_TXRA_HISTORY_CTRL				= 0xFF900000,
 	TXCMD_RESET_TX_PKT_BUFF				= 0xFF900001,
 	TXCMD_RESET_RX_PKT_BUFF				= 0xFF900002,
@@ -871,11 +878,11 @@
 	TXCMD_SET_RX_RSSI						= 0xFF900004,
 	TXCMD_SET_TX_PWR_TRACKING			= 0xFF900005,
 	TXCMD_XXXX_CTRL,
-}DCMD_TXCMD_OP;
+} DCMD_TXCMD_OP;
 
 typedef struct r8192_priv {
 	struct usb_device *udev;
-	//added for maintain info from eeprom
+	/* For maintaining info from the EEPROM */
 	short epromtype;
 	u16 eeprom_vid;
 	u16 eeprom_pid;
@@ -887,105 +894,81 @@
 	int irq;
 	struct ieee80211_device *ieee80211;
 
-	short card_8192; /* O: rtl8192, 1:rtl8185 V B/C, 2:rtl8185 V D */
-	u8 card_8192_version; /* if TCR reports card V B/C this discriminates */
-//	short phy_ver; /* meaningful for rtl8225 1:A 2:B 3:C */
+	/* 0: rtl8192, 1: rtl8185 V B/C, 2: rtl8185 V D */
+	short card_8192;
+	/* If TCR reports card V B/C, this discriminates */
+	u8 card_8192_version;
 	short enable_gpio0;
-	enum card_type {PCI,MINIPCI,CARDBUS,USB}card_type;
+	enum card_type {
+		PCI, MINIPCI, CARDBUS, USB
+	} card_type;
 	short hw_plcp_len;
 	short plcp_preamble_mode;
 
 	spinlock_t irq_lock;
-//	spinlock_t irq_th_lock;
 	spinlock_t tx_lock;
 	struct mutex mutex;
-	//spinlock_t rf_lock; //used to lock rf write operation added by wb
 
 	u16 irq_mask;
-//	short irq_enabled;
-//	struct net_device *dev; //comment this out.
 	short chan;
 	short sens;
 	short max_sens;
 
 
-	//	u8 chtxpwr[15]; //channels from 1 to 14, 0 not used
-//	u8 chtxpwr_ofdm[15]; //channels from 1 to 14, 0 not used
-//	u8 cck_txpwr_base;
-//	u8 ofdm_txpwr_base;
-//	u8 challow[15]; //channels from 1 to 14, 0 not used
 	short up;
-	short crcmon; //if 1 allow bad crc frame reception in monitor mode
-//	short prism_hdr;
+	/* If 1, allow bad CRC frame reception in monitor mode */
+	short crcmon;
 
-//	struct timer_list scan_timer;
-	/*short scanpending;
-	short stopscan;*/
-//	spinlock_t scan_lock;
-//	u8 active_probe;
-	//u8 active_scan_num;
 	struct semaphore wx_sem;
-	struct semaphore rf_sem; //used to lock rf write operation added by wb, modified by david
-//	short hw_wep;
+	struct semaphore rf_sem;	/* Used to lock rf write operation */
 
-//	short digphy;
-//	short antb;
-//	short diversity;
-//	u8 cs_treshold;
-//	short rcr_csense;
-	u8 rf_type; //0 means 1T2R, 1 means 2T4R
+	u8 rf_type;			/* 0: 1T2R, 1: 2T4R */
 	RT_RF_TYPE_819xU rf_chip;
 
-//	u32 key0[4];
-	short (*rf_set_sens)(struct net_device *dev,short sens);
-	u8 (*rf_set_chan)(struct net_device *dev,u8 ch);
+	short (*rf_set_sens)(struct net_device *dev, short sens);
+	u8 (*rf_set_chan)(struct net_device *dev, u8 ch);
 	void (*rf_close)(struct net_device *dev);
 	void (*rf_init)(struct net_device *dev);
-	//short rate;
 	short promisc;
-	/*stats*/
+	/* Stats */
 	struct Stats stats;
 	struct iw_statistics wstats;
 
-	/*RX stuff*/
-//	u32 *rxring;
-//	u32 *rxringtail;
-//	dma_addr_t rxringdma;
+	/* RX stuff */
 	struct urb **rx_urb;
 	struct urb **rx_cmd_urb;
 #ifdef THOMAS_BEACON
 	u32 *oldaddr;
 #endif
 #ifdef THOMAS_TASKLET
-	atomic_t irt_counter;//count for irq_rx_tasklet
+	atomic_t irt_counter; /* count for irq_rx_tasklet */
 #endif
 #ifdef JACKSON_NEW_RX
 	struct sk_buff **pp_rxskb;
 	int     rx_inx;
 #endif
 
-/* modified by davad for Rx process */
        struct sk_buff_head rx_queue;
        struct sk_buff_head skb_queue;
        struct work_struct qos_activate;
 	short  tx_urb_index;
-	atomic_t tx_pending[0x10];//UART_PRIORITY+1
+	atomic_t tx_pending[0x10]; /* UART_PRIORITY + 1 */
 
 
 	struct tasklet_struct irq_rx_tasklet;
 	struct urb *rxurb_task;
 
-	//2 Tx Related variables
+	/* Tx Related variables */
 	u16	ShortRetryLimit;
 	u16	LongRetryLimit;
 	u32	TransmitConfig;
-	u8	RegCWinMin;		// For turbo mode CW adaptive. Added by Annie, 2005-10-27.
+	u8	RegCWinMin;	/* For turbo mode CW adaptive */
 
 	u32     LastRxDescTSFHigh;
 	u32     LastRxDescTSFLow;
 
 
-	//2 Rx Related variables
+	/* Rx Related variables */
 	u16	EarlyRxThreshold;
 	u32	ReceiveConfig;
 	u8	AcmControl;
@@ -1000,13 +983,13 @@
 	struct work_struct reset_wq;
 
 /**********************************************************/
-	//for rtl819xUsb
+	/* For rtl819xUsb */
 	u16     basic_rate;
 	u8      short_preamble;
 	u8      slot_time;
 	bool	bDcut;
 	bool bCurrentRxAggrEnable;
-	u8 Rf_Mode; //add for Firmware RF -R/W switch
+	u8 Rf_Mode;	/* For Firmware RF -R/W switch */
 	prt_firmware		pFirmware;
 	rtl819xUsb_loopback_e	LoopbackMode;
 	u16 EEPROMTxPowerDiff;
@@ -1014,71 +997,70 @@
 	u8 EEPROMPwDiff;
 	u8 EEPROMCrystalCap;
 	u8 EEPROM_Def_Ver;
-	u8 EEPROMTxPowerLevelCCK;// CCK channel 1~14
+	u8 EEPROMTxPowerLevelCCK;		/* CCK channel 1~14 */
 	u8 EEPROMTxPowerLevelCCK_V1[3];
-	u8 EEPROMTxPowerLevelOFDM24G[3]; // OFDM 2.4G channel 1~14
-	u8 EEPROMTxPowerLevelOFDM5G[24];	// OFDM 5G
+	u8 EEPROMTxPowerLevelOFDM24G[3];	/* OFDM 2.4G channel 1~14 */
+	u8 EEPROMTxPowerLevelOFDM5G[24];	/* OFDM 5G */
 
-/*PHY related*/
-	BB_REGISTER_DEFINITION_T	PHYRegDef[4];	//Radio A/B/C/D
-	// Read/write are allow for following hardware information variables
+	/* PHY related */
+	BB_REGISTER_DEFINITION_T PHYRegDef[4];	/* Radio A/B/C/D */
+	/* Read/write is allowed for the following hardware info variables */
 	u32	MCSTxPowerLevelOriginalOffset[6];
 	u32	CCKTxPowerLevelOriginalOffset;
-	u8	TxPowerLevelCCK[14];			// CCK channel 1~14
-	u8	TxPowerLevelOFDM24G[14];		// OFDM 2.4G channel 1~14
-	u8	TxPowerLevelOFDM5G[14];			// OFDM 5G
+	u8	TxPowerLevelCCK[14];		/* CCK channel 1~14 */
+	u8	TxPowerLevelOFDM24G[14];	/* OFDM 2.4G channel 1~14 */
+	u8	TxPowerLevelOFDM5G[14];		/* OFDM 5G */
 	u32	Pwr_Track;
 	u8	TxPowerDiff;
-	u8	AntennaTxPwDiff[2];				// Antenna gain offset, index 0 for B, 1 for C, and 2 for D
-	u8	CrystalCap;						// CrystalCap.
-	u8	ThermalMeter[2];				// ThermalMeter, index 0 for RFIC0, and 1 for RFIC1
+	u8	AntennaTxPwDiff[2]; /* Antenna gain offset, 0: B, 1: C, 2: D */
+	u8	CrystalCap;
+	u8	ThermalMeter[2];    /* index 0: RFIC0, index 1: RFIC1 */
 
 	u8	CckPwEnl;
-	// Use to calculate PWBD.
+	/* Used to calculate PWDB */
 	u8	bCckHighPower;
 	long	undecorated_smoothed_pwdb;
 
-	//for set channel
+	/* For set channel */
 	u8	SwChnlInProgress;
 	u8	SwChnlStage;
 	u8	SwChnlStep;
 	u8	SetBWModeInProgress;
 	HT_CHANNEL_WIDTH		CurrentChannelBW;
 	u8      ChannelPlan;
-	// 8190 40MHz mode
-	//
-	u8	nCur40MhzPrimeSC;	// Control channel sub-carrier
-	// Joseph test for shorten RF configuration time.
-	// We save RF reg0 in this variable to reduce RF reading.
-	//
+	/* 8190 40MHz mode */
+	/* Control channel sub-carrier */
+	u8	nCur40MhzPrimeSC;
+	/* Test for shortening RF configuration time.
+	 * We save RF reg0 in this variable to reduce RF reading. */
 	u32					RfReg0Value[4];
 	u8					NumTotalRFPath;
 	bool				brfpath_rxenable[4];
-	//RF set related
+	/* RF set related */
 	bool				SetRFPowerStateInProgress;
-//+by amy 080507
 	struct timer_list watch_dog_timer;
 
-//+by amy 080515 for dynamic mechenism
-	//Add by amy Tx Power Control for Near/Far Range 2008/05/15
-	bool	bdynamic_txpower;  //bDynamicTxPower
-	bool	bDynamicTxHighPower;  // Tx high power state
-	bool	bDynamicTxLowPower;  // Tx low power state
+	/* For dynamic mechanism */
+	/* Tx Power Control for Near/Far Range */
+	bool	bdynamic_txpower;
+	bool	bDynamicTxHighPower;
+	bool	bDynamicTxLowPower;
 	bool	bLastDTPFlag_High;
 	bool	bLastDTPFlag_Low;
 
 	bool	bstore_last_dtpflag;
-	bool	bstart_txctrl_bydtp;   //Define to discriminate on High power State or on sitesuvey to change Tx gain index
-	//Add by amy for Rate Adaptive
+	/* Define to discriminate on High power State or
+	 * on sitesurvey to change Tx gain index */
+	bool	bstart_txctrl_bydtp;
 	rate_adaptive rate_adaptive;
-	//Add by amy for TX power tracking
-	//2008/05/15  Mars OPEN/CLOSE TX POWER TRACKING
-       txbbgain_struct txbbgain_table[TxBBGainTableLength];
-	u8			   txpower_count;//For 6 sec do tracking again
-	bool			   btxpower_trackingInit;
-	u8			   OFDM_index;
-	u8			   CCK_index;
-	//2007/09/10 Mars Add CCK TX Power Tracking
+	/* TX power tracking
+	 * OPEN/CLOSE TX POWER TRACKING */
+	txbbgain_struct txbbgain_table[TxBBGainTableLength];
+	u8		txpower_count; /* For 6 sec do tracking again */
+	bool		btxpower_trackingInit;
+	u8		OFDM_index;
+	u8		CCK_index;
+	/* CCK TX Power Tracking */
 	ccktxbbgain_struct	cck_txbbgain_table[CCKTxBBGainTableLength];
 	ccktxbbgain_struct	cck_txbbgain_ch14_table[CCKTxBBGainTableLength];
 	u8 rfa_txpowertrackingindex;
@@ -1095,15 +1077,14 @@
 	bool bcck_in_ch14;
 	bool btxpowerdata_readfromEEPORM;
 	u16	TSSI_13dBm;
-	//For Backup Initial Gain
 	init_gain initgain_backup;
 	u8 DefaultInitialGain[4];
-	// For EDCA Turbo mode, Added by amy 080515.
+	/* For EDCA Turbo mode */
 	bool		bis_any_nonbepkts;
 	bool		bcurrent_turbo_EDCA;
 	bool		bis_cur_rdlstate;
 	struct timer_list fsync_timer;
-	bool bfsync_processing;	// 500ms Fsync timer is active or not
+	bool bfsync_processing;	/* 500ms Fsync timer is active or not */
 	u32	rate_record;
 	u32	rateCountDiffRecord;
 	u32	ContinueDiffCount;
@@ -1112,17 +1093,14 @@
 	u8	framesync;
 	u32	framesyncC34;
 	u8	framesyncMonitor;
-		//Added by amy 080516  for RX related
 	u16	nrxAMPDU_size;
 	u8	nrxAMPDU_aggr_num;
 
-	//by amy for gpio
+	/* For gpio */
 	 bool bHwRadioOff;
 
-	//by amy for reset_count
 	u32 reset_count;
 	bool bpbc_pressed;
-	//by amy for debug
 	u32 txpower_checkcnt;
 	u32 txpower_tracking_callback_cnt;
 	u8 thermal_read_val[40];
@@ -1131,7 +1109,7 @@
 	u32 ccktxpower_adjustcnt_ch14;
 	u8 tx_fwinfo_force_subcarriermode;
 	u8 tx_fwinfo_force_subcarrierval;
-	//by amy for silent reset
+	/* For silent reset */
 	RESET_TYPE	ResetProgress;
 	bool		bForcedSilentReset;
 	bool		bDisableNormalResetCheck;
@@ -1144,7 +1122,7 @@
 
 	u16		SifsTime;
 
-	//define work item by amy 080526
+	/* Define work item */
 
 	struct delayed_work update_beacon_wq;
 	struct delayed_work watch_dog_wq;
@@ -1153,42 +1131,32 @@
 	struct delayed_work gpio_change_rf_wq;
 	struct delayed_work initialgain_operate_wq;
 	struct workqueue_struct *priv_wq;
-}r8192_priv;
+} r8192_priv;
 
-// for rtl8187
-// now mirging to rtl8187B
-/*
-typedef enum{
-	LOW_PRIORITY = 0x02,
-	NORM_PRIORITY
-	} priority_t;
-*/
-//for rtl8187B
+/* For rtl8187B */
 typedef enum{
 	BULK_PRIORITY = 0x01,
-	//RSVD0,
-	//RSVD1,
 	LOW_PRIORITY,
 	NORM_PRIORITY,
 	VO_PRIORITY,
-	VI_PRIORITY, //0x05
+	VI_PRIORITY,
 	BE_PRIORITY,
 	BK_PRIORITY,
 	RSVD2,
 	RSVD3,
-	BEACON_PRIORITY, //0x0A
+	BEACON_PRIORITY,
 	HIGH_PRIORITY,
 	MANAGE_PRIORITY,
 	RSVD4,
 	RSVD5,
-	UART_PRIORITY //0x0F
+	UART_PRIORITY
 } priority_t;
 
-typedef enum{
+typedef enum {
 	NIC_8192U = 1,
 	NIC_8190P = 2,
 	NIC_8192E = 3,
-	} nic_t;
+} nic_t;
 
 
 #ifdef JOHN_HWSEC
@@ -1200,19 +1168,19 @@
 
 bool init_firmware(struct net_device *dev);
 short rtl819xU_tx_cmd(struct net_device *dev, struct sk_buff *skb);
-short rtl8192_tx(struct net_device *dev, struct sk_buff* skb);
+short rtl8192_tx(struct net_device *dev, struct sk_buff *skb);
 
 u32 read_cam(struct net_device *dev, u8 addr);
 void write_cam(struct net_device *dev, u8 addr, u32 data);
 
-u8 read_nic_byte(struct net_device *dev, int x);
-u8 read_nic_byte_E(struct net_device *dev, int x);
-u32 read_nic_dword(struct net_device *dev, int x);
-u16 read_nic_word(struct net_device *dev, int x) ;
-void write_nic_byte(struct net_device *dev, int x,u8 y);
-void write_nic_byte_E(struct net_device *dev, int x,u8 y);
-void write_nic_word(struct net_device *dev, int x,u16 y);
-void write_nic_dword(struct net_device *dev, int x,u32 y);
+int read_nic_byte(struct net_device *dev, int x, u8 *data);
+int read_nic_byte_E(struct net_device *dev, int x, u8 *data);
+int read_nic_dword(struct net_device *dev, int x, u32 *data);
+int read_nic_word(struct net_device *dev, int x, u16 *data);
+void write_nic_byte(struct net_device *dev, int x, u8 y);
+void write_nic_byte_E(struct net_device *dev, int x, u8 y);
+void write_nic_word(struct net_device *dev, int x, u16 y);
+void write_nic_dword(struct net_device *dev, int x, u32 y);
 void force_pci_posting(struct net_device *dev);
 
 void rtl8192_rtx_disable(struct net_device *);
@@ -1220,26 +1188,24 @@
 void rtl8192_tx_enable(struct net_device *);
 
 void rtl8192_disassociate(struct net_device *dev);
-//void fix_rx_fifo(struct net_device *dev);
-void rtl8185_set_rf_pins_enable(struct net_device *dev,u32 a);
+void rtl8185_set_rf_pins_enable(struct net_device *dev, u32 a);
 
-void rtl8192_set_anaparam(struct net_device *dev,u32 a);
-void rtl8185_set_anaparam2(struct net_device *dev,u32 a);
+void rtl8192_set_anaparam(struct net_device *dev, u32 a);
+void rtl8185_set_anaparam2(struct net_device *dev, u32 a);
 void rtl8192_update_msr(struct net_device *dev);
 int rtl8192_down(struct net_device *dev);
 int rtl8192_up(struct net_device *dev);
 void rtl8192_commit(struct net_device *dev);
-void rtl8192_set_chan(struct net_device *dev,short ch);
+void rtl8192_set_chan(struct net_device *dev, short ch);
 void write_phy(struct net_device *dev, u8 adr, u8 data);
 void write_phy_cck(struct net_device *dev, u8 adr, u32 data);
 void write_phy_ofdm(struct net_device *dev, u8 adr, u32 data);
 void rtl8185_tx_antenna(struct net_device *dev, u8 ant);
 void rtl8192_set_rxconf(struct net_device *dev);
-//short check_nic_enough_desc(struct net_device *dev, priority_t priority);
-extern void rtl819xusb_beacon_tx(struct net_device *dev,u16  tx_rate);
+extern void rtl819xusb_beacon_tx(struct net_device *dev, u16 tx_rate);
 
 void EnableHWSecurityConfig8192(struct net_device *dev);
-void setKey(struct net_device *dev, u8 EntryNo, u8 KeyIndex, u16 KeyType, u8 *MacAddr, u8 DefaultKey, u32 *KeyContent );
+void setKey(struct net_device *dev, u8 EntryNo, u8 KeyIndex, u16 KeyType, u8 *MacAddr, u8 DefaultKey, u32 *KeyContent);
 
 
 #endif

diff --git a/drivers/staging/rtl8192u/r8192U_core.c b/drivers/staging/rtl8192u/r8192U_core.c
index 71f5cde..c880adc 100644
--- a/drivers/staging/rtl8192u/r8192U_core.c
+++ b/drivers/staging/rtl8192u/r8192U_core.c

@@ -25,12 +25,35 @@
  */
 
 #ifndef CONFIG_FORCE_HARD_FLOAT
-double __floatsidf (int i) { return i; }
-unsigned int __fixunsdfsi (double d) { return d; }
-double __adddf3(double a, double b) { return a+b; }
-double __addsf3(float a, float b) { return a+b; }
-double __subdf3(double a, double b) { return a-b; }
-double __extendsfdf2(float a) {return a;}
+double __floatsidf(int i)
+{
+	return i;
+}
+
+unsigned int __fixunsdfsi(double d)
+{
+	return d;
+}
+
+double __adddf3(double a, double b)
+{
+	return a+b;
+}
+
+double __addsf3(float a, float b)
+{
+	return a+b;
+}
+
+double __subdf3(double a, double b)
+{
+	return a-b;
+}
+
+double __extendsfdf2(float a)
+{
+	return a;
+}
 #endif
 
 #undef LOOP_TEST
@@ -68,7 +91,6 @@
 #include "r819xU_phyreg.h"
 #include "r819xU_cmdpkt.h"
 #include "r8192U_dm.h"
-//#include "r8192xU_phyreg.h"
 #include <linux/usb.h>
 #include <linux/slab.h>
 #include <linux/proc_fs.h>
@@ -81,26 +103,9 @@
 
 #include "dot11d.h"
 //set here to open your trace code. //WB
-u32 rt_global_debug_component = \
-			//	COMP_INIT	|
-//				COMP_DBG	|
-			//	COMP_EPROM	|
-//				COMP_PHY	|
-			//	COMP_RF		|
-//				COMP_FIRMWARE	|
-//				COMP_CH		|
-			//	COMP_POWER_TRACKING |
-//				COMP_RATE	|
-			//	COMP_TXAGC	|
-		//		COMP_TRACE	|
-				COMP_DOWN	|
-		//		COMP_RECV	|
-		//              COMP_SWBW	|
+u32 rt_global_debug_component = COMP_DOWN	|
 				COMP_SEC	|
-	//			COMP_RESET	|
-		//		COMP_SEND	|
-			//	COMP_EVENTS	|
-				COMP_ERR ; //always open err flags on
+				COMP_ERR; //always open err flags on
 
 #define TOTAL_CAM_ENTRY 32
 #define CAM_CONTENT_COUNT 8
@@ -130,24 +135,22 @@
 MODULE_DEVICE_TABLE(usb, rtl8192_usb_id_tbl);
 MODULE_DESCRIPTION("Linux driver for Realtek RTL8192 USB WiFi cards");
 
-static char* ifname = "wlan%d";
+static char *ifname = "wlan%d";
 static int hwwep = 1;  //default use hw. set 0 to use software security
 static int channels = 0x3fff;
 
 
 
-module_param(ifname, charp, S_IRUGO|S_IWUSR );
-//module_param(hwseqnum,int, S_IRUGO|S_IWUSR);
-module_param(hwwep,int, S_IRUGO|S_IWUSR);
-module_param(channels,int, S_IRUGO|S_IWUSR);
+module_param(ifname, charp, S_IRUGO|S_IWUSR);
+module_param(hwwep, int, S_IRUGO|S_IWUSR);
+module_param(channels, int, S_IRUGO|S_IWUSR);
 
-MODULE_PARM_DESC(ifname," Net interface name, wlan%d=default");
-//MODULE_PARM_DESC(hwseqnum," Try to use hardware 802.11 header sequence numbers. Zero=default");
-MODULE_PARM_DESC(hwwep," Try to use hardware security support. ");
-MODULE_PARM_DESC(channels," Channel bitmask for specific locales. NYI");
+MODULE_PARM_DESC(ifname, " Net interface name, wlan%d=default");
+MODULE_PARM_DESC(hwwep, " Try to use hardware security support. ");
+MODULE_PARM_DESC(channels, " Channel bitmask for specific locales. NYI");
 
 static int rtl8192_usb_probe(struct usb_interface *intf,
-			 const struct usb_device_id *id);
+			     const struct usb_device_id *id);
 static void rtl8192_usb_disconnect(struct usb_interface *intf);
 
 
@@ -169,7 +172,7 @@
 typedef struct _CHANNEL_LIST {
 	u8	Channel[32];
 	u8	Len;
-}CHANNEL_LIST, *PCHANNEL_LIST;
+} CHANNEL_LIST, *PCHANNEL_LIST;
 
 static CHANNEL_LIST ChannelPlan[] = {
 	{{1,2,3,4,5,6,7,8,9,10,11,36,40,44,48,52,56,60,64,149,153,157,161,165},24},		//FCC
@@ -185,12 +188,11 @@
 	{{1,2,3,4,5,6,7,8,9,10,11,12,13,14},14}					//For Global Domain. 1-11:active scan, 12-14 passive scan. //+YJ, 080626
 };
 
-static void rtl819x_set_channel_map(u8 channel_plan, struct r8192_priv* priv)
+static void rtl819x_set_channel_map(u8 channel_plan, struct r8192_priv *priv)
 {
-	int i, max_chan=-1, min_chan=-1;
-	struct ieee80211_device* ieee = priv->ieee80211;
-	switch (channel_plan)
-	{
+	int i, max_chan = -1, min_chan = -1;
+	struct ieee80211_device *ieee = priv->ieee80211;
+	switch (channel_plan) {
 	case COUNTRY_CODE_FCC:
 	case COUNTRY_CODE_IC:
 	case COUNTRY_CODE_ETSI:
@@ -200,22 +202,21 @@
 	case COUNTRY_CODE_MKK1:
 	case COUNTRY_CODE_ISRAEL:
 	case COUNTRY_CODE_TELEC:
-	case COUNTRY_CODE_MIC:	
+	case COUNTRY_CODE_MIC:
 		Dot11d_Init(ieee);
 		ieee->bGlobalDomain = false;
 		//actually 8225 & 8256 rf chips only support B,G,24N mode
 		if ((priv->rf_chip == RF_8225) || (priv->rf_chip == RF_8256)) {
 			min_chan = 1;
 			max_chan = 14;
-		}
-		else {
-			RT_TRACE(COMP_ERR, "unknown rf chip, can't set channel map in function:%s()\n", __FUNCTION__);
+		} else {
+			RT_TRACE(COMP_ERR, "unknown rf chip, can't set channel map in function:%s()\n", __func__);
 		}
 		if (ChannelPlan[channel_plan].Len != 0) {
 			// Clear old channel map
 			memset(GET_DOT11D_INFO(ieee)->channel_map, 0, sizeof(GET_DOT11D_INFO(ieee)->channel_map));
 			// Set new channel map
-			for (i=0;i<ChannelPlan[channel_plan].Len;i++) {
+			for (i = 0; i < ChannelPlan[channel_plan].Len; i++) {
 				if (ChannelPlan[channel_plan].Channel[i] < min_chan || ChannelPlan[channel_plan].Channel[i] > max_chan)
 					break;
 				GET_DOT11D_INFO(ieee)->channel_map[ChannelPlan[channel_plan].Channel[i]] = 1;
@@ -228,19 +229,13 @@
 		Dot11d_Reset(ieee);
 		ieee->bGlobalDomain = true;
 		break;
-	
+
 	default:
 		break;
 	}
 }
 
 
-#define		rx_hal_is_cck_rate(_pdrvinfo)\
-			(_pdrvinfo->RxRate == DESC90_RATE1M ||\
-			_pdrvinfo->RxRate == DESC90_RATE2M ||\
-			_pdrvinfo->RxRate == DESC90_RATE5_5M ||\
-			_pdrvinfo->RxRate == DESC90_RATE11M) &&\
-			!_pdrvinfo->RxHT\
 
 
 void CamResetAllEntry(struct net_device *dev)
@@ -249,12 +244,6 @@
 	//2004/02/11  In static WEP, OID_ADD_KEY or OID_ADD_WEP are set before STA associate to AP.
 	// However, ResetKey is called on OID_802_11_INFRASTRUCTURE_MODE and MlmeAssociateRequest
 	// In this condition, Cam can not be reset because upper layer will not set this static key again.
-	//if(Adapter->EncAlgorithm == WEP_Encryption)
-	//      return;
-//debug
-	//DbgPrint("========================================\n");
-	//DbgPrint("                            Call ResetAllEntry                                              \n");
-	//DbgPrint("========================================\n\n");
 	ulcommand |= BIT31|BIT30;
 	write_nic_dword(dev, RWCAM, ulcommand);
 
@@ -264,13 +253,16 @@
 void write_cam(struct net_device *dev, u8 addr, u32 data)
 {
 	write_nic_dword(dev, WCAMI, data);
-	write_nic_dword(dev, RWCAM, BIT31|BIT16|(addr&0xff) );
+	write_nic_dword(dev, RWCAM, BIT31|BIT16|(addr&0xff));
 }
 
 u32 read_cam(struct net_device *dev, u8 addr)
 {
-	write_nic_dword(dev, RWCAM, 0x80000000|(addr&0xff) );
-	return read_nic_dword(dev, 0xa8);
+	u32 data;
+
+	write_nic_dword(dev, RWCAM, 0x80000000|(addr&0xff));
+	read_nic_dword(dev, 0xa8, &data);
+	return data;
 }
 
 void write_nic_byte_E(struct net_device *dev, int indx, u8 data)
@@ -280,32 +272,29 @@
 	struct usb_device *udev = priv->udev;
 
 	status = usb_control_msg(udev, usb_sndctrlpipe(udev, 0),
-			       RTL8187_REQ_SET_REGS, RTL8187_REQT_WRITE,
-			       indx|0xfe00, 0, &data, 1, HZ / 2);
+				 RTL8187_REQ_SET_REGS, RTL8187_REQT_WRITE,
+				 indx|0xfe00, 0, &data, 1, HZ / 2);
 
 	if (status < 0)
-	{
-		printk("write_nic_byte_E TimeOut! status:%d\n", status);
-	}
+		netdev_err(dev, "write_nic_byte_E TimeOut! status: %d\n", status);
 }
 
-u8 read_nic_byte_E(struct net_device *dev, int indx)
+int read_nic_byte_E(struct net_device *dev, int indx, u8 *data)
 {
 	int status;
-	u8 data;
 	struct r8192_priv *priv = (struct r8192_priv *)ieee80211_priv(dev);
 	struct usb_device *udev = priv->udev;
 
 	status = usb_control_msg(udev, usb_rcvctrlpipe(udev, 0),
-			       RTL8187_REQ_GET_REGS, RTL8187_REQT_READ,
-			       indx|0xfe00, 0, &data, 1, HZ / 2);
+				 RTL8187_REQ_GET_REGS, RTL8187_REQT_READ,
+				 indx|0xfe00, 0, data, 1, HZ / 2);
 
-	if (status < 0)
-	{
-		printk("read_nic_byte_E TimeOut! status:%d\n", status);
+	if (status < 0) {
+		netdev_err(dev, "%s failure status: %d\n", __func__, status);
+		return status;
 	}
 
-	return data;
+	return 0;
 }
 //as 92U has extend page from 4 to 16, so modify functions below.
 void write_nic_byte(struct net_device *dev, int indx, u8 data)
@@ -316,13 +305,11 @@
 	struct usb_device *udev = priv->udev;
 
 	status = usb_control_msg(udev, usb_sndctrlpipe(udev, 0),
-			       RTL8187_REQ_SET_REGS, RTL8187_REQT_WRITE,
-			       (indx&0xff)|0xff00, (indx>>8)&0x0f, &data, 1, HZ / 2);
+				 RTL8187_REQ_SET_REGS, RTL8187_REQT_WRITE,
+				 (indx&0xff)|0xff00, (indx>>8)&0x0f, &data, 1, HZ / 2);
 
 	if (status < 0)
-	{
-		printk("write_nic_byte TimeOut! status:%d\n", status);
-	}
+		netdev_err(dev, "write_nic_byte TimeOut! status: %d\n", status);
 
 
 }
@@ -337,13 +324,11 @@
 	struct usb_device *udev = priv->udev;
 
 	status = usb_control_msg(udev, usb_sndctrlpipe(udev, 0),
-			       RTL8187_REQ_SET_REGS, RTL8187_REQT_WRITE,
-			       (indx&0xff)|0xff00, (indx>>8)&0x0f, &data, 2, HZ / 2);
+				 RTL8187_REQ_SET_REGS, RTL8187_REQT_WRITE,
+				 (indx&0xff)|0xff00, (indx>>8)&0x0f, &data, 2, HZ / 2);
 
 	if (status < 0)
-	{
-		printk("write_nic_word TimeOut! status:%d\n", status);
-	}
+		netdev_err(dev, "write_nic_word TimeOut! status: %d\n", status);
 
 }
 
@@ -357,98 +342,92 @@
 	struct usb_device *udev = priv->udev;
 
 	status = usb_control_msg(udev, usb_sndctrlpipe(udev, 0),
-			       RTL8187_REQ_SET_REGS, RTL8187_REQT_WRITE,
-			       (indx&0xff)|0xff00, (indx>>8)&0x0f, &data, 4, HZ / 2);
+				 RTL8187_REQ_SET_REGS, RTL8187_REQT_WRITE,
+				 (indx&0xff)|0xff00, (indx>>8)&0x0f, &data, 4, HZ / 2);
 
 
 	if (status < 0)
-	{
-		printk("write_nic_dword TimeOut! status:%d\n", status);
+		netdev_err(dev, "write_nic_dword TimeOut! status: %d\n", status);
+
+}
+
+
+
+int read_nic_byte(struct net_device *dev, int indx, u8 *data)
+{
+	int status;
+	struct r8192_priv *priv = (struct r8192_priv *)ieee80211_priv(dev);
+	struct usb_device *udev = priv->udev;
+
+	status = usb_control_msg(udev, usb_rcvctrlpipe(udev, 0),
+				 RTL8187_REQ_GET_REGS, RTL8187_REQT_READ,
+				 (indx&0xff)|0xff00, (indx>>8)&0x0f, data, 1, HZ / 2);
+
+	if (status < 0) {
+		netdev_err(dev, "%s failure status: %d\n", __func__, status);
+		return status;
 	}
 
+	return 0;
 }
 
 
 
-u8 read_nic_byte(struct net_device *dev, int indx)
+int read_nic_word(struct net_device *dev, int indx, u16 *data)
 {
-	u8 data;
 	int status;
 	struct r8192_priv *priv = (struct r8192_priv *)ieee80211_priv(dev);
 	struct usb_device *udev = priv->udev;
 
 	status = usb_control_msg(udev, usb_rcvctrlpipe(udev, 0),
-			       RTL8187_REQ_GET_REGS, RTL8187_REQT_READ,
-			       (indx&0xff)|0xff00, (indx>>8)&0x0f, &data, 1, HZ / 2);
+				 RTL8187_REQ_GET_REGS, RTL8187_REQT_READ,
+				 (indx&0xff)|0xff00, (indx>>8)&0x0f,
+				 data, 2, HZ / 2);
 
-	if (status < 0)
-	{
-		printk("read_nic_byte TimeOut! status:%d\n", status);
+	if (status < 0) {
+		netdev_err(dev, "%s failure status: %d\n", __func__, status);
+		return status;
 	}
 
-	return data;
+	return 0;
 }
 
-
-
-u16 read_nic_word(struct net_device *dev, int indx)
+int read_nic_word_E(struct net_device *dev, int indx, u16 *data)
 {
-	u16 data;
 	int status;
 	struct r8192_priv *priv = (struct r8192_priv *)ieee80211_priv(dev);
 	struct usb_device *udev = priv->udev;
 
 	status = usb_control_msg(udev, usb_rcvctrlpipe(udev, 0),
-				       RTL8187_REQ_GET_REGS, RTL8187_REQT_READ,
-				       (indx&0xff)|0xff00, (indx>>8)&0x0f,
-							&data, 2, HZ / 2);
+				 RTL8187_REQ_GET_REGS, RTL8187_REQT_READ,
+				 indx|0xfe00, 0, data, 2, HZ / 2);
 
-	if (status < 0)
-		printk("read_nic_word TimeOut! status:%d\n", status);
+	if (status < 0) {
+		netdev_err(dev, "%s failure status: %d\n", __func__, status);
+		return status;
+	}
 
-	return data;
+	return 0;
 }
 
-u16 read_nic_word_E(struct net_device *dev, int indx)
+int read_nic_dword(struct net_device *dev, int indx, u32 *data)
 {
-	u16 data;
 	int status;
-	struct r8192_priv *priv = (struct r8192_priv *)ieee80211_priv(dev);
-	struct usb_device *udev = priv->udev;
-
-	status = usb_control_msg(udev, usb_rcvctrlpipe(udev, 0),
-			       RTL8187_REQ_GET_REGS, RTL8187_REQT_READ,
-				       indx|0xfe00, 0, &data, 2, HZ / 2);
-
-	if (status < 0)
-		printk("read_nic_word TimeOut! status:%d\n", status);
-
-	return data;
-}
-
-u32 read_nic_dword(struct net_device *dev, int indx)
-{
-	u32 data;
-	int status;
-	/* int result; */
 
 	struct r8192_priv *priv = (struct r8192_priv *)ieee80211_priv(dev);
 	struct usb_device *udev = priv->udev;
 
 	status = usb_control_msg(udev, usb_rcvctrlpipe(udev, 0),
-				       RTL8187_REQ_GET_REGS, RTL8187_REQT_READ,
-					(indx&0xff)|0xff00, (indx>>8)&0x0f,
-							&data, 4, HZ / 2);
-	/* if(0 != result) {
-	 *	printk(KERN_WARNING "read size of data = %d\, date = %d\n",
-	 *							 result, data);
-	 * }
-	 */
+				 RTL8187_REQ_GET_REGS, RTL8187_REQT_READ,
+				 (indx&0xff)|0xff00, (indx>>8)&0x0f,
+				 data, 4, HZ / 2);
 
-	if (status < 0)
-		printk("read_nic_dword TimeOut! status:%d\n", status);
+	if (status < 0) {
+		netdev_err(dev, "%s failure status: %d\n", __func__, status);
+		return status;
+	}
 
-	return data;
+	return 0;
 }
 
 /* u8 read_phy_cck(struct net_device *dev, u8 adr); */
@@ -462,9 +441,7 @@
 
 static struct net_device_stats *rtl8192_stats(struct net_device *dev);
 void rtl8192_commit(struct net_device *dev);
-/* void rtl8192_restart(struct net_device *dev); */
 void rtl8192_restart(struct work_struct *work);
-/* void rtl8192_rq_tx_ack(struct work_struct *work); */
 void watch_dog_timer_callback(unsigned long data);
 
 /****************************************************************************
@@ -495,40 +472,38 @@
 static int proc_get_registers(struct seq_file *m, void *v)
 {
 	struct net_device *dev = m->private;
-	int i,n, max = 0xff;
+	int i, n, max = 0xff;
+	u8 byte_rd;
 
 	seq_puts(m, "\n####################page 0##################\n ");
 
-	for (n=0;n<=max;) {
-		//printk( "\nD: %2x> ", n);
-		seq_printf(m, "\nD:  %2x > ",n);
+	for (n = 0; n <= max;) {
+		seq_printf(m, "\nD:  %2x > ", n);
 
-		for (i=0;i<16 && n<=max;i++,n++)
-			seq_printf(m, "%2x ",read_nic_byte(dev,0x000|n));
-
-		//	printk("%2x ",read_nic_byte(dev,n));
+		for (i = 0; i < 16 && n <= max; i++, n++) {
+			read_nic_byte(dev, 0x000|n, &byte_rd);
+			seq_printf(m, "%2x ", byte_rd);
+		}
 	}
 
 	seq_puts(m, "\n####################page 1##################\n ");
-	for (n=0;n<=max;) {
-		//printk( "\nD: %2x> ", n);
-		seq_printf(m, "\nD:  %2x > ",n);
+	for (n = 0; n <= max;) {
+		seq_printf(m, "\nD:  %2x > ", n);
 
-		for (i=0;i<16 && n<=max;i++,n++)
-			seq_printf(m, "%2x ",read_nic_byte(dev,0x100|n));
-
-		//      printk("%2x ",read_nic_byte(dev,n));
+		for (i = 0; i < 16 && n <= max; i++, n++) {
+			read_nic_byte(dev, 0x100|n, &byte_rd);
+			seq_printf(m, "%2x ", byte_rd);
+		}
 	}
 
 	seq_puts(m, "\n####################page 3##################\n ");
-	for (n=0;n<=max;) {
-		//printk( "\nD: %2x> ", n);
-		seq_printf(m, "\nD:  %2x > ",n);
+	for (n = 0; n <= max;) {
+		seq_printf(m, "\nD:  %2x > ", n);
 
-		for(i=0;i<16 && n<=max;i++,n++)
-			seq_printf(m, "%2x ",read_nic_byte(dev,0x300|n));
-
-		//      printk("%2x ",read_nic_byte(dev,n));
+		for (i = 0; i < 16 && n <= max; i++, n++) {
+			read_nic_byte(dev, 0x300|n, &byte_rd);
+			seq_printf(m, "%2x ", byte_rd);
+		}
 	}
 
 	seq_putc(m, '\n');
@@ -541,64 +516,54 @@
 	struct r8192_priv *priv = (struct r8192_priv *)ieee80211_priv(dev);
 
 	seq_printf(m,
-		"TX VI priority ok int: %lu\n"
-		"TX VI priority error int: %lu\n"
-		"TX VO priority ok int: %lu\n"
-		"TX VO priority error int: %lu\n"
-		"TX BE priority ok int: %lu\n"
-		"TX BE priority error int: %lu\n"
-		"TX BK priority ok int: %lu\n"
-		"TX BK priority error int: %lu\n"
-		"TX MANAGE priority ok int: %lu\n"
-		"TX MANAGE priority error int: %lu\n"
-		"TX BEACON priority ok int: %lu\n"
-		"TX BEACON priority error int: %lu\n"
-//		"TX high priority ok int: %lu\n"
-//		"TX high priority failed error int: %lu\n"
-		"TX queue resume: %lu\n"
-		"TX queue stopped?: %d\n"
-		"TX fifo overflow: %lu\n"
-//		"TX beacon: %lu\n"
-		"TX VI queue: %d\n"
-		"TX VO queue: %d\n"
-		"TX BE queue: %d\n"
-		"TX BK queue: %d\n"
-//		"TX HW queue: %d\n"
-		"TX VI dropped: %lu\n"
-		"TX VO dropped: %lu\n"
-		"TX BE dropped: %lu\n"
-		"TX BK dropped: %lu\n"
-		"TX total data packets %lu\n",
-//		"TX beacon aborted: %lu\n",
-		priv->stats.txviokint,
-		priv->stats.txvierr,
-		priv->stats.txvookint,
-		priv->stats.txvoerr,
-		priv->stats.txbeokint,
-		priv->stats.txbeerr,
-		priv->stats.txbkokint,
-		priv->stats.txbkerr,
-		priv->stats.txmanageokint,
-		priv->stats.txmanageerr,
-		priv->stats.txbeaconokint,
-		priv->stats.txbeaconerr,
-//		priv->stats.txhpokint,
-//		priv->stats.txhperr,
-		priv->stats.txresumed,
-		netif_queue_stopped(dev),
-		priv->stats.txoverflow,
-//		priv->stats.txbeacon,
-		atomic_read(&(priv->tx_pending[VI_PRIORITY])),
-		atomic_read(&(priv->tx_pending[VO_PRIORITY])),
-		atomic_read(&(priv->tx_pending[BE_PRIORITY])),
-		atomic_read(&(priv->tx_pending[BK_PRIORITY])),
-//		read_nic_byte(dev, TXFIFOCOUNT),
-		priv->stats.txvidrop,
-		priv->stats.txvodrop,
-		priv->stats.txbedrop,
-		priv->stats.txbkdrop,
-		priv->stats.txdatapkt
-//		priv->stats.txbeaconerr
+		   "TX VI priority ok int: %lu\n"
+		   "TX VI priority error int: %lu\n"
+		   "TX VO priority ok int: %lu\n"
+		   "TX VO priority error int: %lu\n"
+		   "TX BE priority ok int: %lu\n"
+		   "TX BE priority error int: %lu\n"
+		   "TX BK priority ok int: %lu\n"
+		   "TX BK priority error int: %lu\n"
+		   "TX MANAGE priority ok int: %lu\n"
+		   "TX MANAGE priority error int: %lu\n"
+		   "TX BEACON priority ok int: %lu\n"
+		   "TX BEACON priority error int: %lu\n"
+		   "TX queue resume: %lu\n"
+		   "TX queue stopped?: %d\n"
+		   "TX fifo overflow: %lu\n"
+		   "TX VI queue: %d\n"
+		   "TX VO queue: %d\n"
+		   "TX BE queue: %d\n"
+		   "TX BK queue: %d\n"
+		   "TX VI dropped: %lu\n"
+		   "TX VO dropped: %lu\n"
+		   "TX BE dropped: %lu\n"
+		   "TX BK dropped: %lu\n"
+		   "TX total data packets %lu\n",
+		   priv->stats.txviokint,
+		   priv->stats.txvierr,
+		   priv->stats.txvookint,
+		   priv->stats.txvoerr,
+		   priv->stats.txbeokint,
+		   priv->stats.txbeerr,
+		   priv->stats.txbkokint,
+		   priv->stats.txbkerr,
+		   priv->stats.txmanageokint,
+		   priv->stats.txmanageerr,
+		   priv->stats.txbeaconokint,
+		   priv->stats.txbeaconerr,
+		   priv->stats.txresumed,
+		   netif_queue_stopped(dev),
+		   priv->stats.txoverflow,
+		   atomic_read(&(priv->tx_pending[VI_PRIORITY])),
+		   atomic_read(&(priv->tx_pending[VO_PRIORITY])),
+		   atomic_read(&(priv->tx_pending[BE_PRIORITY])),
+		   atomic_read(&(priv->tx_pending[BK_PRIORITY])),
+		   priv->stats.txvidrop,
+		   priv->stats.txvodrop,
+		   priv->stats.txbedrop,
+		   priv->stats.txbkdrop,
+		   priv->stats.txdatapkt
 		);
 
 	return 0;
@@ -610,12 +575,12 @@
 	struct r8192_priv *priv = (struct r8192_priv *)ieee80211_priv(dev);
 
 	seq_printf(m,
-		"RX packets: %lu\n"
-		"RX urb status error: %lu\n"
-		"RX invalid urb error: %lu\n",
-		priv->stats.rxoktotal,
-		priv->stats.rxstaterr,
-		priv->stats.rxurberr);
+		   "RX packets: %lu\n"
+		   "RX urb status error: %lu\n"
+		   "RX invalid urb error: %lu\n",
+		   priv->stats.rxoktotal,
+		   priv->stats.rxstaterr,
+		   priv->stats.rxurberr);
 
 	return 0;
 }
@@ -700,27 +665,7 @@
    -----------------------------MISC STUFF-------------------------
 *****************************************************************************/
 
-/* this is only for debugging */
-void print_buffer(u32 *buffer, int len)
-{
-	int i;
-	u8 *buf =(u8*)buffer;
-
-	printk("ASCII BUFFER DUMP (len: %x):\n",len);
-
-	for(i=0;i<len;i++)
-		printk("%c",buf[i]);
-
-	printk("\nBINARY BUFFER DUMP (len: %x):\n",len);
-
-	for(i=0;i<len;i++)
-		printk("%x",buf[i]);
-
-	printk("\n");
-}
-
-//short check_nic_enough_desc(struct net_device *dev, priority_t priority)
-short check_nic_enough_desc(struct net_device *dev,int queue_index)
+short check_nic_enough_desc(struct net_device *dev, int queue_index)
 {
 	struct r8192_priv *priv = ieee80211_priv(dev);
 	int used = atomic_read(&priv->tx_pending[queue_index]);
@@ -731,10 +676,8 @@
 void tx_timeout(struct net_device *dev)
 {
 	struct r8192_priv *priv = ieee80211_priv(dev);
-	//rtl8192_commit(dev);
 
 	schedule_work(&priv->reset_wq);
-	//DMESG("TXTIMEOUT");
 }
 
 
@@ -742,41 +685,24 @@
 void dump_eprom(struct net_device *dev)
 {
 	int i;
-	for(i=0; i<63; i++)
-		RT_TRACE(COMP_EPROM, "EEPROM addr %x : %x", i, eprom_read(dev,i));
+	for (i = 0; i < 63; i++)
+		RT_TRACE(COMP_EPROM, "EEPROM addr %x : %x", i, eprom_read(dev, i));
 }
 
-/* this is only for debug */
-void rtl8192_dump_reg(struct net_device *dev)
-{
-	int i;
-	int n;
-	int max=0x1ff;
-
-	RT_TRACE(COMP_PHY, "Dumping NIC register map");
-
-	for(n=0;n<=max;)
-	{
-		printk( "\nD: %2x> ", n);
-		for(i=0;i<16 && n<=max;i++,n++)
-			printk("%2x ",read_nic_byte(dev,n));
-	}
-	printk("\n");
-}
 
 /****************************************************************************
       ------------------------------HW STUFF---------------------------
 *****************************************************************************/
 
 
-void rtl8192_set_mode(struct net_device *dev,int mode)
+void rtl8192_set_mode(struct net_device *dev, int mode)
 {
 	u8 ecmd;
-	ecmd=read_nic_byte(dev, EPROM_CMD);
-	ecmd=ecmd &~ EPROM_CMD_OPERATING_MODE_MASK;
-	ecmd=ecmd | (mode<<EPROM_CMD_OPERATING_MODE_SHIFT);
-	ecmd=ecmd &~ (1<<EPROM_CS_SHIFT);
-	ecmd=ecmd &~ (1<<EPROM_CK_SHIFT);
+	read_nic_byte(dev, EPROM_CMD, &ecmd);
+	ecmd = ecmd & ~EPROM_CMD_OPERATING_MODE_MASK;
+	ecmd = ecmd | (mode<<EPROM_CMD_OPERATING_MODE_SHIFT);
+	ecmd = ecmd & ~EPROM_CS_BIT;
+	ecmd = ecmd & ~EPROM_CK_BIT;
 	write_nic_byte(dev, EPROM_CMD, ecmd);
 }
 
@@ -786,15 +712,15 @@
 	struct r8192_priv *priv = ieee80211_priv(dev);
 	u8 msr;
 
-	msr  = read_nic_byte(dev, MSR);
-	msr &= ~ MSR_LINK_MASK;
+	read_nic_byte(dev, MSR, &msr);
+	msr &= ~MSR_LINK_MASK;
 
 	/* do not change in link_state != WLAN_LINK_ASSOCIATED.
 	 * msr must be updated if the state is ASSOCIATING.
 	 * this is intentional and make sense for ad-hoc and
 	 * master (see the create BSS/IBSS func)
 	 */
-	if (priv->ieee80211->state == IEEE80211_LINKED){
+	if (priv->ieee80211->state == IEEE80211_LINKED) {
 
 		if (priv->ieee80211->iw_mode == IW_MODE_INFRA)
 			msr |= (MSR_LINK_MANAGED<<MSR_LINK_SHIFT);
@@ -803,39 +729,31 @@
 		else if (priv->ieee80211->iw_mode == IW_MODE_MASTER)
 			msr |= (MSR_LINK_MASTER<<MSR_LINK_SHIFT);
 
-	}else
+	} else {
 		msr |= (MSR_LINK_NONE<<MSR_LINK_SHIFT);
+	}
 
 	write_nic_byte(dev, MSR, msr);
 }
 
-void rtl8192_set_chan(struct net_device *dev,short ch)
+void rtl8192_set_chan(struct net_device *dev, short ch)
 {
 	struct r8192_priv *priv = (struct r8192_priv *)ieee80211_priv(dev);
-//	u32 tx;
-	RT_TRACE(COMP_CH, "=====>%s()====ch:%d\n", __FUNCTION__, ch);
-	priv->chan=ch;
+	RT_TRACE(COMP_CH, "=====>%s()====ch:%d\n", __func__, ch);
+	priv->chan = ch;
 
 	/* this hack should avoid frame TX during channel setting*/
 
-
-//	tx = read_nic_dword(dev,TX_CONF);
-//	tx &= ~TX_LOOPBACK_MASK;
-
 #ifndef LOOP_TEST
-//	write_nic_dword(dev,TX_CONF, tx |( TX_LOOPBACK_MAC<<TX_LOOPBACK_SHIFT));
-
 	//need to implement rf set channel here WB
 
 	if (priv->rf_set_chan)
-	priv->rf_set_chan(dev,priv->chan);
+		priv->rf_set_chan(dev, priv->chan);
 	mdelay(10);
-//	write_nic_dword(dev,TX_CONF,tx | (TX_LOOPBACK_NONE<<TX_LOOPBACK_SHIFT));
 #endif
 }
 
 static void rtl8192_rx_isr(struct urb *urb);
-//static void rtl8192_rx_isr(struct urb *rx_urb);
 
 u32 get_rxpacket_shiftbytes_819xusb(struct ieee80211_rx_stats *pstats)
 {
@@ -847,10 +765,10 @@
 	else
 #endif
 		return (sizeof(rx_desc_819x_usb) + pstats->RxDrvInfoSize
-				+ pstats->RxBufShift);
+			+ pstats->RxBufShift);
 
 }
-static int rtl8192_rx_initiate(struct net_device*dev)
+static int rtl8192_rx_initiate(struct net_device *dev)
 {
 	struct r8192_priv *priv = (struct r8192_priv *)ieee80211_priv(dev);
 	struct urb *entry;
@@ -867,7 +785,6 @@
 			kfree_skb(skb);
 			break;
 		}
-//		printk("nomal packet IN request!\n");
 		usb_fill_bulk_urb(entry, priv->udev,
 				  usb_rcvbulkpipe(priv->udev, 3), skb_tail_pointer(skb),
 				  RX_URB_SIZE, rtl8192_rx_isr, skb);
@@ -881,8 +798,7 @@
 
 	/* command packet rx procedure */
 	while (skb_queue_len(&priv->rx_queue) < MAX_RX_URB + 3) {
-//		printk("command packet IN request!\n");
-		skb = __dev_alloc_skb(RX_URB_SIZE ,GFP_KERNEL);
+		skb = __dev_alloc_skb(RX_URB_SIZE, GFP_KERNEL);
 		if (!skb)
 			break;
 		entry = usb_alloc_urb(0, GFP_KERNEL);
@@ -896,7 +812,7 @@
 		info = (struct rtl8192_rx_info *) skb->cb;
 		info->urb = entry;
 		info->dev = dev;
-		   info->out_pipe = 9; //denote rx cmd packet queue
+		info->out_pipe = 9; //denote rx cmd packet queue
 		skb_queue_tail(&priv->rx_queue, skb);
 		usb_submit_urb(entry, GFP_KERNEL);
 	}
@@ -909,64 +825,47 @@
 	struct r8192_priv *priv = (struct r8192_priv *)ieee80211_priv(dev);
 	u32 rxconf;
 
-	rxconf=read_nic_dword(dev,RCR);
-	rxconf = rxconf &~ MAC_FILTER_MASK;
+	read_nic_dword(dev, RCR, &rxconf);
+	rxconf = rxconf & ~MAC_FILTER_MASK;
 	rxconf = rxconf | RCR_AMF;
 	rxconf = rxconf | RCR_ADF;
 	rxconf = rxconf | RCR_AB;
 	rxconf = rxconf | RCR_AM;
-	//rxconf = rxconf | RCR_ACF;
 
-	if (dev->flags & IFF_PROMISC) {DMESG ("NIC in promisc mode");}
+	if (dev->flags & IFF_PROMISC)
+		DMESG("NIC in promisc mode");
 
-	if(priv->ieee80211->iw_mode == IW_MODE_MONITOR || \
-	   dev->flags & IFF_PROMISC){
+	if (priv->ieee80211->iw_mode == IW_MODE_MONITOR ||
+	    dev->flags & IFF_PROMISC) {
 		rxconf = rxconf | RCR_AAP;
-	} /*else if(priv->ieee80211->iw_mode == IW_MODE_MASTER){
-		rxconf = rxconf | (1<<ACCEPT_ALLMAC_FRAME_SHIFT);
-		rxconf = rxconf | (1<<RX_CHECK_BSSID_SHIFT);
-	}*/else{
+	} else {
 		rxconf = rxconf | RCR_APM;
 		rxconf = rxconf | RCR_CBSSID;
 	}
 
 
-	if(priv->ieee80211->iw_mode == IW_MODE_MONITOR){
+	if (priv->ieee80211->iw_mode == IW_MODE_MONITOR) {
 		rxconf = rxconf | RCR_AICV;
 		rxconf = rxconf | RCR_APWRMGT;
 	}
 
-	if( priv->crcmon == 1 && priv->ieee80211->iw_mode == IW_MODE_MONITOR)
+	if (priv->crcmon == 1 && priv->ieee80211->iw_mode == IW_MODE_MONITOR)
 		rxconf = rxconf | RCR_ACRC32;
 
 
-	rxconf = rxconf &~ RX_FIFO_THRESHOLD_MASK;
+	rxconf = rxconf & ~RX_FIFO_THRESHOLD_MASK;
 	rxconf = rxconf | (RX_FIFO_THRESHOLD_NONE<<RX_FIFO_THRESHOLD_SHIFT);
-	rxconf = rxconf &~ MAX_RX_DMA_MASK;
+	rxconf = rxconf & ~MAX_RX_DMA_MASK;
 	rxconf = rxconf | ((u32)7<<RCR_MXDMA_OFFSET);
 
-//	rxconf = rxconf | (1<<RX_AUTORESETPHY_SHIFT);
 	rxconf = rxconf | RCR_ONLYERLPKT;
 
-//	rxconf = rxconf &~ RCR_CS_MASK;
-//	rxconf = rxconf | (1<<RCR_CS_SHIFT);
-
 	write_nic_dword(dev, RCR, rxconf);
-
-	#ifdef DEBUG_RX
-	DMESG("rxconf: %x %x",rxconf ,read_nic_dword(dev,RCR));
-	#endif
 }
 //wait to be removed
 void rtl8192_rx_enable(struct net_device *dev)
 {
-	//u8 cmd;
-
-	//struct r8192_priv *priv = (struct r8192_priv *)ieee80211_priv(dev);
-
 	rtl8192_rx_initiate(dev);
-
-//	rtl8192_set_rxconf(dev);
 }
 
 
@@ -983,9 +882,8 @@
 	struct sk_buff *skb;
 	struct rtl8192_rx_info *info;
 
-	cmd=read_nic_byte(dev,CMDR);
-	write_nic_byte(dev, CMDR, cmd &~ \
-		(CR_TE|CR_RE));
+	read_nic_byte(dev, CMDR, &cmd);
+	write_nic_byte(dev, CMDR, cmd & ~(CR_TE|CR_RE));
 	force_pci_posting(dev);
 	mdelay(10);
 
@@ -998,9 +896,8 @@
 		kfree_skb(skb);
 	}
 
-	if (skb_queue_len(&priv->skb_queue)) {
-		printk(KERN_WARNING "skb_queue not empty\n");
-	}
+	if (skb_queue_len(&priv->skb_queue))
+		netdev_warn(dev, "skb_queue not empty\n");
 
 	skb_queue_purge(&priv->skb_queue);
 	return;
@@ -1014,40 +911,40 @@
 
 inline u16 ieeerate2rtlrate(int rate)
 {
-	switch(rate){
+	switch (rate) {
 	case 10:
-	return 0;
+		return 0;
 	case 20:
-	return 1;
+		return 1;
 	case 55:
-	return 2;
+		return 2;
 	case 110:
-	return 3;
+		return 3;
 	case 60:
-	return 4;
+		return 4;
 	case 90:
-	return 5;
+		return 5;
 	case 120:
-	return 6;
+		return 6;
 	case 180:
-	return 7;
+		return 7;
 	case 240:
-	return 8;
+		return 8;
 	case 360:
-	return 9;
+		return 9;
 	case 480:
-	return 10;
+		return 10;
 	case 540:
-	return 11;
+		return 11;
 	default:
-	return 3;
+		return 3;
 
 	}
 }
-static u16 rtl_rate[] = {10,20,55,110,60,90,120,180,240,360,480,540};
+static u16 rtl_rate[] = {10, 20, 55, 110, 60, 90, 120, 180, 240, 360, 480, 540};
 inline u16 rtl8192_rate2rate(short rate)
 {
-	if (rate >11) return 0;
+	if (rate > 11) return 0;
 	return rtl_rate[rate];
 }
 
@@ -1061,14 +958,13 @@
 	struct r8192_priv *priv = ieee80211_priv(dev);
 	int out_pipe = info->out_pipe;
 	int err;
-	if(!priv->up)
+	if (!priv->up)
 		return;
 	if (unlikely(urb->status)) {
 		info->urb = NULL;
 		priv->stats.rxstaterr++;
 		priv->ieee80211->stats.rx_errors++;
 		usb_free_urb(urb);
-	//	printk("%s():rx status err\n",__FUNCTION__);
 		return;
 	}
 	skb_unlink(skb, &priv->rx_queue);
@@ -1080,14 +976,14 @@
 	skb = dev_alloc_skb(RX_URB_SIZE);
 	if (unlikely(!skb)) {
 		usb_free_urb(urb);
-		printk("%s():can,t alloc skb\n",__FUNCTION__);
+		netdev_err(dev, "%s(): can't alloc skb\n", __func__);
 		/* TODO check rx queue length and refill *somewhere* */
 		return;
 	}
 
 	usb_fill_bulk_urb(urb, priv->udev,
-			usb_rcvbulkpipe(priv->udev, out_pipe), skb_tail_pointer(skb),
-			RX_URB_SIZE, rtl8192_rx_isr, skb);
+			  usb_rcvbulkpipe(priv->udev, out_pipe), skb_tail_pointer(skb),
+			  RX_URB_SIZE, rtl8192_rx_isr, skb);
 
 	info = (struct rtl8192_rx_info *) skb->cb;
 	info->urb = urb;
@@ -1098,31 +994,19 @@
 	urb->context = skb;
 	skb_queue_tail(&priv->rx_queue, skb);
 	err = usb_submit_urb(urb, GFP_ATOMIC);
-	if(err && err != EPERM)
-		printk("can not submit rxurb, err is %x,URB status is %x\n",err,urb->status);
+	if (err && err != EPERM)
+		netdev_err(dev, "can not submit rxurb, err is %x, URB status is %x\n", err, urb->status);
 }
 
-u32
-rtl819xusb_rx_command_packet(
-	struct net_device *dev,
-	struct ieee80211_rx_stats *pstats
-	)
+u32 rtl819xusb_rx_command_packet(struct net_device *dev,
+				 struct ieee80211_rx_stats *pstats)
 {
 	u32	status;
 
-	//RT_TRACE(COMP_RECV, DBG_TRACE, ("---> RxCommandPacketHandle819xUsb()\n"));
-
 	status = cmpk_message_handle_rx(dev, pstats);
 	if (status)
-	{
 		DMESG("rxcommandpackethandle819xusb: It is a command packet\n");
-	}
-	else
-	{
-		//RT_TRACE(COMP_RECV, DBG_TRACE, ("RxCommandPacketHandle819xUsb: It is not a command packet\n"));
-	}
 
-	//RT_TRACE(COMP_RECV, DBG_TRACE, ("<--- RxCommandPacketHandle819xUsb()\n"));
 	return status;
 }
 
@@ -1150,24 +1034,17 @@
 	u8 queue_index = tcb_desc->queue_index;
 
 	/* shall not be referred by command packet */
-	assert(queue_index != TXCMD_QUEUE);
+	RTL8192U_ASSERT(queue_index != TXCMD_QUEUE);
 
-	spin_lock_irqsave(&priv->tx_lock,flags);
+	spin_lock_irqsave(&priv->tx_lock, flags);
 
-	memcpy((unsigned char *)(skb->cb),&dev,sizeof(dev));
-//	tcb_desc->RATRIndex = 7;
-//	tcb_desc->bTxDisableRateFallBack = 1;
-//	tcb_desc->bTxUseDriverAssingedRate = 1;
+	memcpy((unsigned char *)(skb->cb), &dev, sizeof(dev));
 	tcb_desc->bTxEnableFwCalcDur = 1;
 	skb_push(skb, priv->ieee80211->tx_headroom);
 	ret = rtl8192_tx(dev, skb);
 
-	//priv->ieee80211->stats.tx_bytes+=(skb->len - priv->ieee80211->tx_headroom);
-	//priv->ieee80211->stats.tx_packets++;
+	spin_unlock_irqrestore(&priv->tx_lock, flags);
 
-	spin_unlock_irqrestore(&priv->tx_lock,flags);
-
-//	return ret;
 	return;
 }
 
@@ -1176,7 +1053,7 @@
  * If the ring is full packet are dropped (for data frame the queue
  * is stopped before this can happen).
  */
-int rtl8192_hard_start_xmit(struct sk_buff *skb,struct net_device *dev)
+int rtl8192_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct r8192_priv *priv = (struct r8192_priv *)ieee80211_priv(dev);
 	int ret;
@@ -1185,21 +1062,21 @@
 	u8 queue_index = tcb_desc->queue_index;
 
 
-	spin_lock_irqsave(&priv->tx_lock,flags);
+	spin_lock_irqsave(&priv->tx_lock, flags);
 
-	memcpy((unsigned char *)(skb->cb),&dev,sizeof(dev));
-	if(queue_index == TXCMD_QUEUE) {
+	memcpy((unsigned char *)(skb->cb), &dev, sizeof(dev));
+	if (queue_index == TXCMD_QUEUE) {
 		skb_push(skb, USB_HWDESC_HEADER_LEN);
 		rtl819xU_tx_cmd(dev, skb);
 		ret = 1;
-		spin_unlock_irqrestore(&priv->tx_lock,flags);
+		spin_unlock_irqrestore(&priv->tx_lock, flags);
 		return ret;
 	} else {
 		skb_push(skb, priv->ieee80211->tx_headroom);
 		ret = rtl8192_tx(dev, skb);
 	}
 
-	spin_unlock_irqrestore(&priv->tx_lock,flags);
+	spin_unlock_irqrestore(&priv->tx_lock, flags);
 
 	return ret;
 }
@@ -1211,7 +1088,7 @@
 u16 DrvAggr_PaddingAdd(struct net_device *dev, struct sk_buff *skb)
 {
 	u16     PaddingNum =  256 - ((skb->len + TX_PACKET_DRVAGGR_SUBFRAME_SHIFT_BYTES) % 256);
-	return  (PaddingNum&0xff);
+	return  PaddingNum & 0xff;
 }
 
 u8 MRateToHwRate8190Pci(u8 rate);
@@ -1239,7 +1116,7 @@
 	/* Get the total aggregation length including the padding space and
 	 * sub frame header.
 	 */
-	for(i = 1; i < pSendList->nr_drv_agg_frames; i++) {
+	for (i = 1; i < pSendList->nr_drv_agg_frames; i++) {
 		TotalLength += DrvAggr_PaddingAdd(dev, skb);
 		skb = pSendList->tx_agg_frames[i];
 		TotalLength += (skb->len + TX_PACKET_DRVAGGR_SUBFRAME_SHIFT_BYTES);
@@ -1250,23 +1127,19 @@
 	memset(agg_skb->data, 0, agg_skb->len);
 	skb_reserve(agg_skb, ieee->tx_headroom);
 
-//	RT_DEBUG_DATA(COMP_SEND, skb->cb, sizeof(skb->cb));
 	/* reserve info for first subframe Tx descriptor to be set in the tx function */
 	skb = pSendList->tx_agg_frames[0];
 	tcb_desc = (cb_desc *)(skb->cb + MAX_DEV_ADDR_SIZE);
 	tcb_desc->drv_agg_enable = 1;
 	tcb_desc->pkt_size = skb->len;
 	tcb_desc->DrvAggrNum = pSendList->nr_drv_agg_frames;
-	printk("DrvAggNum = %d\n", tcb_desc->DrvAggrNum);
-//	RT_DEBUG_DATA(COMP_SEND, skb->cb, sizeof(skb->cb));
-//	printk("========>skb->data ======> \n");
-//	RT_DEBUG_DATA(COMP_SEND, skb->data, skb->len);
+	netdev_dbg(dev, "DrvAggNum = %d\n", tcb_desc->DrvAggrNum);
 	memcpy(agg_skb->cb, skb->cb, sizeof(skb->cb));
-	memcpy(skb_put(agg_skb,skb->len),skb->data,skb->len);
+	memcpy(skb_put(agg_skb, skb->len), skb->data, skb->len);
 
-	for(i = 1; i < pSendList->nr_drv_agg_frames; i++) {
+	for (i = 1; i < pSendList->nr_drv_agg_frames; i++) {
 		/* push the next sub frame to be 256 byte aline */
-		skb_put(agg_skb,DrvAggr_PaddingAdd(dev,skb));
+		skb_put(agg_skb, DrvAggr_PaddingAdd(dev, skb));
 
 		/* Subframe drv Tx descriptor and firmware info setting */
 		skb = pSendList->tx_agg_frames[i];
@@ -1274,13 +1147,13 @@
 		tx_agg_desc = (tx_desc_819x_usb_aggr_subframe *)agg_skb->tail;
 		tx_fwinfo = (tx_fwinfo_819x_usb *)(agg_skb->tail + sizeof(tx_desc_819x_usb_aggr_subframe));
 
-		memset(tx_fwinfo,0,sizeof(tx_fwinfo_819x_usb));
+		memset(tx_fwinfo, 0, sizeof(tx_fwinfo_819x_usb));
 		/* DWORD 0 */
-		tx_fwinfo->TxHT = (tcb_desc->data_rate&0x80)?1:0;
+		tx_fwinfo->TxHT = (tcb_desc->data_rate&0x80) ? 1 : 0;
 		tx_fwinfo->TxRate = MRateToHwRate8190Pci(tcb_desc->data_rate);
 		tx_fwinfo->EnableCPUDur = tcb_desc->bTxEnableFwCalcDur;
 		tx_fwinfo->Short = QueryIsShort(tx_fwinfo->TxHT, tx_fwinfo->TxRate, tcb_desc);
-		if(tcb_desc->bAMPDUEnable) {//AMPDU enabled
+		if (tcb_desc->bAMPDUEnable) {//AMPDU enabled
 			tx_fwinfo->AllowAggregation = 1;
 			/* DWORD 1 */
 			tx_fwinfo->RxMF = tcb_desc->ampdu_factor;
@@ -1293,20 +1166,19 @@
 		}
 
 		/* Protection mode related */
-		tx_fwinfo->RtsEnable = (tcb_desc->bRTSEnable)?1:0;
-		tx_fwinfo->CtsEnable = (tcb_desc->bCTSEnable)?1:0;
-		tx_fwinfo->RtsSTBC = (tcb_desc->bRTSSTBC)?1:0;
-		tx_fwinfo->RtsHT = (tcb_desc->rts_rate&0x80)?1:0;
+		tx_fwinfo->RtsEnable = (tcb_desc->bRTSEnable) ? 1 : 0;
+		tx_fwinfo->CtsEnable = (tcb_desc->bCTSEnable) ? 1 : 0;
+		tx_fwinfo->RtsSTBC = (tcb_desc->bRTSSTBC) ? 1 : 0;
+		tx_fwinfo->RtsHT = (tcb_desc->rts_rate&0x80) ? 1 : 0;
 		tx_fwinfo->RtsRate =  MRateToHwRate8190Pci((u8)tcb_desc->rts_rate);
-		tx_fwinfo->RtsSubcarrier = (tx_fwinfo->RtsHT==0)?(tcb_desc->RTSSC):0;
-		tx_fwinfo->RtsBandwidth = (tx_fwinfo->RtsHT==1)?((tcb_desc->bRTSBW)?1:0):0;
-		tx_fwinfo->RtsShort = (tx_fwinfo->RtsHT==0)?(tcb_desc->bRTSUseShortPreamble?1:0):\
-				      (tcb_desc->bRTSUseShortGI?1:0);
+		tx_fwinfo->RtsSubcarrier = (tx_fwinfo->RtsHT == 0) ? (tcb_desc->RTSSC) : 0;
+		tx_fwinfo->RtsBandwidth = (tx_fwinfo->RtsHT == 1) ? ((tcb_desc->bRTSBW) ? 1 : 0) : 0;
+		tx_fwinfo->RtsShort = (tx_fwinfo->RtsHT == 0) ? (tcb_desc->bRTSUseShortPreamble ? 1 : 0) :
+				      (tcb_desc->bRTSUseShortGI ? 1 : 0);
 
 		/* Set Bandwidth and sub-channel settings. */
-		if(priv->CurrentChannelBW == HT_CHANNEL_WIDTH_20_40)
-		{
-			if(tcb_desc->bPacketBW) {
+		if (priv->CurrentChannelBW == HT_CHANNEL_WIDTH_20_40) {
+			if (tcb_desc->bPacketBW) {
 				tx_fwinfo->TxBandwidth = 1;
 				tx_fwinfo->TxSubCarrier = 0;    //By SD3's Jerry suggestion, use duplicated mode
 			} else {
@@ -1321,41 +1193,35 @@
 		/* Fill Tx descriptor */
 		memset(tx_agg_desc, 0, sizeof(tx_desc_819x_usb_aggr_subframe));
 		/* DWORD 0 */
-		//tx_agg_desc->LINIP = 0;
-		//tx_agg_desc->CmdInit = 1;
 		tx_agg_desc->Offset =  sizeof(tx_fwinfo_819x_usb) + 8;
 		/* already raw data, need not to subtract header length */
 		tx_agg_desc->PktSize = skb->len & 0xffff;
 
 		/*DWORD 1*/
-		tx_agg_desc->SecCAMID= 0;
+		tx_agg_desc->SecCAMID = 0;
 		tx_agg_desc->RATid = tcb_desc->RATRIndex;
-		{
-			//MPDUOverhead = 0;
-			tx_agg_desc->NoEnc = 1;
-		}
+		tx_agg_desc->NoEnc = 1;
 		tx_agg_desc->SecType = 0x0;
 
 		if (tcb_desc->bHwSec) {
-			switch (priv->ieee80211->pairwise_key_type)
-			{
-				case KEY_TYPE_WEP40:
-				case KEY_TYPE_WEP104:
-					tx_agg_desc->SecType = 0x1;
-					tx_agg_desc->NoEnc = 0;
-					break;
-				case KEY_TYPE_TKIP:
-					tx_agg_desc->SecType = 0x2;
-					tx_agg_desc->NoEnc = 0;
-					break;
-				case KEY_TYPE_CCMP:
-					tx_agg_desc->SecType = 0x3;
-					tx_agg_desc->NoEnc = 0;
-					break;
-				case KEY_TYPE_NA:
-					tx_agg_desc->SecType = 0x0;
-					tx_agg_desc->NoEnc = 1;
-					break;
+			switch (priv->ieee80211->pairwise_key_type) {
+			case KEY_TYPE_WEP40:
+			case KEY_TYPE_WEP104:
+				tx_agg_desc->SecType = 0x1;
+				tx_agg_desc->NoEnc = 0;
+				break;
+			case KEY_TYPE_TKIP:
+				tx_agg_desc->SecType = 0x2;
+				tx_agg_desc->NoEnc = 0;
+				break;
+			case KEY_TYPE_CCMP:
+				tx_agg_desc->SecType = 0x3;
+				tx_agg_desc->NoEnc = 0;
+				break;
+			case KEY_TYPE_NA:
+				tx_agg_desc->SecType = 0x0;
+				tx_agg_desc->NoEnc = 1;
+				break;
 			}
 		}
 
@@ -1369,16 +1235,14 @@
 
 		//DWORD 2
 		/* According windows driver, it seems that there no need to fill this field */
-		//tx_agg_desc->TxBufferSize= (u32)(skb->len - USB_HWDESC_HEADER_LEN);
 
 		/* to fill next packet */
-		skb_put(agg_skb,TX_PACKET_DRVAGGR_SUBFRAME_SHIFT_BYTES);
-		memcpy(skb_put(agg_skb,skb->len),skb->data,skb->len);
+		skb_put(agg_skb, TX_PACKET_DRVAGGR_SUBFRAME_SHIFT_BYTES);
+		memcpy(skb_put(agg_skb, skb->len), skb->data, skb->len);
 	}
 
-	for(i = 0; i < pSendList->nr_drv_agg_frames; i++) {
+	for (i = 0; i < pSendList->nr_drv_agg_frames; i++)
 		dev_kfree_skb_any(pSendList->tx_agg_frames[i]);
-	}
 
 	return agg_skb;
 }
@@ -1388,7 +1252,7 @@
 	If no proper TCB is found to do aggregation, SendList will only contain the input TCB.
 */
 u8 DrvAggr_GetAggregatibleList(struct net_device *dev, struct sk_buff *skb,
-		struct ieee80211_drv_agg_txb *pSendList)
+			       struct ieee80211_drv_agg_txb *pSendList)
 {
 	struct ieee80211_device *ieee = netdev_priv(dev);
 	PRT_HIGH_THROUGHPUT	pHTInfo = ieee->pHTInfo;
@@ -1398,11 +1262,10 @@
 
 	do {
 		pSendList->tx_agg_frames[pSendList->nr_drv_agg_frames++] = skb;
-		if(pSendList->nr_drv_agg_frames >= nMaxAggrNum) {
+		if (pSendList->nr_drv_agg_frames >= nMaxAggrNum)
 			break;
-		}
 
-	} while((skb = skb_dequeue(&ieee->skb_drv_aggQ[QueueID])));
+	} while ((skb = skb_dequeue(&ieee->skb_drv_aggQ[QueueID])));
 
 	RT_TRACE(COMP_AMSDU, "DrvAggr_GetAggregatibleList, nAggrTcbNum = %d \n", pSendList->nr_drv_agg_frames);
 	return pSendList->nr_drv_agg_frames;
@@ -1411,105 +1274,86 @@
 
 static void rtl8192_tx_isr(struct urb *tx_urb)
 {
-	struct sk_buff *skb = (struct sk_buff*)tx_urb->context;
+	struct sk_buff *skb = (struct sk_buff *)tx_urb->context;
 	struct net_device *dev = NULL;
 	struct r8192_priv *priv = NULL;
 	cb_desc *tcb_desc = (cb_desc *)(skb->cb + MAX_DEV_ADDR_SIZE);
 	u8  queue_index = tcb_desc->queue_index;
-//	bool bToSend0Byte;
-//	u16 BufLen = skb->len;
 
-	memcpy(&dev,(struct net_device*)(skb->cb),sizeof(struct net_device*));
+	memcpy(&dev, (struct net_device *)(skb->cb), sizeof(struct net_device *));
 	priv = ieee80211_priv(dev);
 
-	if(tcb_desc->queue_index != TXCMD_QUEUE) {
-		if(tx_urb->status == 0) {
+	if (tcb_desc->queue_index != TXCMD_QUEUE) {
+		if (tx_urb->status == 0) {
 			dev->trans_start = jiffies;
-			// Act as station mode, destination shall be unicast address.
-			//priv->ieee80211->stats.tx_bytes+=(skb->len - priv->ieee80211->tx_headroom);
-			//priv->ieee80211->stats.tx_packets++;
 			priv->stats.txoktotal++;
 			priv->ieee80211->LinkDetectInfo.NumTxOkInPeriod++;
 			priv->stats.txbytesunicast += (skb->len - priv->ieee80211->tx_headroom);
 		} else {
 			priv->ieee80211->stats.tx_errors++;
-			//priv->stats.txmanageerr++;
 			/* TODO */
 		}
 	}
 
 	/* free skb and tx_urb */
-	if(skb != NULL) {
+	if (skb != NULL) {
 		dev_kfree_skb_any(skb);
 		usb_free_urb(tx_urb);
 		atomic_dec(&priv->tx_pending[queue_index]);
 	}
 
-	{
-		//
-		// Handle HW Beacon:
-		// We had transfer our beacon frame to host controller at this moment.
-		//
-		//
-		// Caution:
-		// Handling the wait queue of command packets.
-		// For Tx command packets, we must not do TCB fragment because it is not handled right now.
-		// We must cut the packets to match the size of TX_CMD_PKT before we send it.
-		//
+	//
+	// Handle HW Beacon:
+	// We had transfer our beacon frame to host controller at this moment.
+	//
+	//
+	// Caution:
+	// Handling the wait queue of command packets.
+	// For Tx command packets, we must not do TCB fragment because it is not handled right now.
+	// We must cut the packets to match the size of TX_CMD_PKT before we send it.
+	//
 
-		/* Handle MPDU in wait queue. */
-		if(queue_index != BEACON_QUEUE) {
-			/* Don't send data frame during scanning.*/
-			if((skb_queue_len(&priv->ieee80211->skb_waitQ[queue_index]) != 0)&&\
-					(!(priv->ieee80211->queue_stop))) {
-				if(NULL != (skb = skb_dequeue(&(priv->ieee80211->skb_waitQ[queue_index]))))
-					priv->ieee80211->softmac_hard_start_xmit(skb, dev);
+	/* Handle MPDU in wait queue. */
+	if (queue_index != BEACON_QUEUE) {
+		/* Don't send data frame during scanning.*/
+		if ((skb_queue_len(&priv->ieee80211->skb_waitQ[queue_index]) != 0) &&
+		    (!(priv->ieee80211->queue_stop))) {
+			if (NULL != (skb = skb_dequeue(&(priv->ieee80211->skb_waitQ[queue_index]))))
+				priv->ieee80211->softmac_hard_start_xmit(skb, dev);
 
-				return; //modified by david to avoid further processing AMSDU
-			}
-#ifdef USB_TX_DRIVER_AGGREGATION_ENABLE
-			else if ((skb_queue_len(&priv->ieee80211->skb_drv_aggQ[queue_index])!= 0)&&\
-				(!(priv->ieee80211->queue_stop))) {
-				// Tx Driver Aggregation process
-				/* The driver will aggregation the packets according to the following stats
-				 * 1. check whether there's tx irq available, for it's a completion return
-				 *    function, it should contain enough tx irq;
-				 * 2. check packet type;
-				 * 3. initialize sendlist, check whether the to-be send packet no greater than 1
-				 * 4. aggregates the packets, and fill firmware info and tx desc into it, etc.
-				 * 5. check whether the packet could be sent, otherwise just insert into wait head
-				 * */
-				skb = skb_dequeue(&priv->ieee80211->skb_drv_aggQ[queue_index]);
-				if(!check_nic_enough_desc(dev, queue_index)) {
-					skb_queue_head(&(priv->ieee80211->skb_drv_aggQ[queue_index]), skb);
-					return;
-				}
-
-				{
-					/*TODO*/
-					/*
-					u8* pHeader = skb->data;
-
-					if(IsMgntQosData(pHeader) ||
-					    IsMgntQData_Ack(pHeader) ||
-					    IsMgntQData_Poll(pHeader) ||
-					    IsMgntQData_Poll_Ack(pHeader)
-					  )
-					*/
-					{
-						struct ieee80211_drv_agg_txb SendList;
-
-						memset(&SendList, 0, sizeof(struct ieee80211_drv_agg_txb));
-						if(DrvAggr_GetAggregatibleList(dev, skb, &SendList) > 1) {
-							skb = DrvAggr_Aggregation(dev, &SendList);
-
-						}
-					}
-					priv->ieee80211->softmac_hard_start_xmit(skb, dev);
-				}
-			}
-#endif
+			return; //modified by david to avoid further processing AMSDU
 		}
+#ifdef USB_TX_DRIVER_AGGREGATION_ENABLE
+		else if ((skb_queue_len(&priv->ieee80211->skb_drv_aggQ[queue_index]) != 0) &&
+			 (!(priv->ieee80211->queue_stop))) {
+			// Tx Driver Aggregation process
+			/* The driver will aggregation the packets according to the following stats
+			 * 1. check whether there's tx irq available, for it's a completion return
+			 *    function, it should contain enough tx irq;
+			 * 2. check packet type;
+			 * 3. initialize sendlist, check whether the to-be send packet no greater than 1
+			 * 4. aggregates the packets, and fill firmware info and tx desc into it, etc.
+			 * 5. check whether the packet could be sent, otherwise just insert into wait head
+			 * */
+			skb = skb_dequeue(&priv->ieee80211->skb_drv_aggQ[queue_index]);
+			if (!check_nic_enough_desc(dev, queue_index)) {
+				skb_queue_head(&(priv->ieee80211->skb_drv_aggQ[queue_index]), skb);
+				return;
+			}
+
+			/*TODO*/
+			{
+				struct ieee80211_drv_agg_txb SendList;
+
+				memset(&SendList, 0, sizeof(struct ieee80211_drv_agg_txb));
+				if (DrvAggr_GetAggregatibleList(dev, skb, &SendList) > 1) {
+					skb = DrvAggr_Aggregation(dev, &SendList);
+
+				}
+			}
+			priv->ieee80211->softmac_hard_start_xmit(skb, dev);
+		}
+#endif
 	}
 
 }
@@ -1519,72 +1363,67 @@
 	u8 msr, msrm, msr2;
 	struct r8192_priv *priv = ieee80211_priv(dev);
 
-	msr  = read_nic_byte(dev, MSR);
+	read_nic_byte(dev, MSR, &msr);
 	msrm = msr & MSR_LINK_MASK;
 	msr2 = msr & ~MSR_LINK_MASK;
 
-	if(NIC_8192U == priv->card_8192) {
+	if (NIC_8192U == priv->card_8192)
 		usb_kill_urb(priv->rx_urb[MAX_RX_URB]);
-	}
 	if ((msrm == (MSR_LINK_ADHOC<<MSR_LINK_SHIFT) ||
-		(msrm == (MSR_LINK_MASTER<<MSR_LINK_SHIFT)))){
+	    (msrm == (MSR_LINK_MASTER<<MSR_LINK_SHIFT)))) {
 		write_nic_byte(dev, MSR, msr2 | MSR_LINK_NONE);
 		write_nic_byte(dev, MSR, msr);
 	}
 }
 
-void rtl8192_config_rate(struct net_device* dev, u16* rate_config)
+void rtl8192_config_rate(struct net_device *dev, u16 *rate_config)
 {
-	 struct r8192_priv *priv = ieee80211_priv(dev);
-	 struct ieee80211_network *net;
-	 u8 i=0, basic_rate = 0;
-	 net = & priv->ieee80211->current_network;
+	struct r8192_priv *priv = ieee80211_priv(dev);
+	struct ieee80211_network *net;
+	u8 i = 0, basic_rate = 0;
+	net = &priv->ieee80211->current_network;
 
-	 for (i=0; i<net->rates_len; i++)
-	 {
-		 basic_rate = net->rates[i]&0x7f;
-		 switch(basic_rate)
-		 {
-			 case MGN_1M:	*rate_config |= RRSR_1M;	break;
-			 case MGN_2M:	*rate_config |= RRSR_2M;	break;
-			 case MGN_5_5M:	*rate_config |= RRSR_5_5M;	break;
-			 case MGN_11M:	*rate_config |= RRSR_11M;	break;
-			 case MGN_6M:	*rate_config |= RRSR_6M;	break;
-			 case MGN_9M:	*rate_config |= RRSR_9M;	break;
-			 case MGN_12M:	*rate_config |= RRSR_12M;	break;
-			 case MGN_18M:	*rate_config |= RRSR_18M;	break;
-			 case MGN_24M:	*rate_config |= RRSR_24M;	break;
-			 case MGN_36M:	*rate_config |= RRSR_36M;	break;
-			 case MGN_48M:	*rate_config |= RRSR_48M;	break;
-			 case MGN_54M:	*rate_config |= RRSR_54M;	break;
-		 }
-	 }
-	 for (i=0; i<net->rates_ex_len; i++)
-	 {
-		 basic_rate = net->rates_ex[i]&0x7f;
-		 switch(basic_rate)
-		 {
-			 case MGN_1M:	*rate_config |= RRSR_1M;	break;
-			 case MGN_2M:	*rate_config |= RRSR_2M;	break;
-			 case MGN_5_5M:	*rate_config |= RRSR_5_5M;	break;
-			 case MGN_11M:	*rate_config |= RRSR_11M;	break;
-			 case MGN_6M:	*rate_config |= RRSR_6M;	break;
-			 case MGN_9M:	*rate_config |= RRSR_9M;	break;
-			 case MGN_12M:	*rate_config |= RRSR_12M;	break;
-			 case MGN_18M:	*rate_config |= RRSR_18M;	break;
-			 case MGN_24M:	*rate_config |= RRSR_24M;	break;
-			 case MGN_36M:	*rate_config |= RRSR_36M;	break;
-			 case MGN_48M:	*rate_config |= RRSR_48M;	break;
-			 case MGN_54M:	*rate_config |= RRSR_54M;	break;
-		 }
-	 }
+	for (i = 0; i < net->rates_len; i++) {
+		basic_rate = net->rates[i]&0x7f;
+		switch (basic_rate) {
+		case MGN_1M:	*rate_config |= RRSR_1M;	break;
+		case MGN_2M:	*rate_config |= RRSR_2M;	break;
+		case MGN_5_5M:	*rate_config |= RRSR_5_5M;	break;
+		case MGN_11M:	*rate_config |= RRSR_11M;	break;
+		case MGN_6M:	*rate_config |= RRSR_6M;	break;
+		case MGN_9M:	*rate_config |= RRSR_9M;	break;
+		case MGN_12M:	*rate_config |= RRSR_12M;	break;
+		case MGN_18M:	*rate_config |= RRSR_18M;	break;
+		case MGN_24M:	*rate_config |= RRSR_24M;	break;
+		case MGN_36M:	*rate_config |= RRSR_36M;	break;
+		case MGN_48M:	*rate_config |= RRSR_48M;	break;
+		case MGN_54M:	*rate_config |= RRSR_54M;	break;
+		}
+	}
+	for (i = 0; i < net->rates_ex_len; i++) {
+		basic_rate = net->rates_ex[i]&0x7f;
+		switch (basic_rate) {
+		case MGN_1M:	*rate_config |= RRSR_1M;	break;
+		case MGN_2M:	*rate_config |= RRSR_2M;	break;
+		case MGN_5_5M:	*rate_config |= RRSR_5_5M;	break;
+		case MGN_11M:	*rate_config |= RRSR_11M;	break;
+		case MGN_6M:	*rate_config |= RRSR_6M;	break;
+		case MGN_9M:	*rate_config |= RRSR_9M;	break;
+		case MGN_12M:	*rate_config |= RRSR_12M;	break;
+		case MGN_18M:	*rate_config |= RRSR_18M;	break;
+		case MGN_24M:	*rate_config |= RRSR_24M;	break;
+		case MGN_36M:	*rate_config |= RRSR_36M;	break;
+		case MGN_48M:	*rate_config |= RRSR_48M;	break;
+		case MGN_54M:	*rate_config |= RRSR_54M;	break;
+		}
+	}
 }
 
 
 #define SHORT_SLOT_TIME 9
 #define NON_SHORT_SLOT_TIME 20
 
-void rtl8192_update_cap(struct net_device* dev, u16 cap)
+void rtl8192_update_cap(struct net_device *dev, u16 cap)
 {
 	u32 tmp = 0;
 	struct r8192_priv *priv = ieee80211_priv(dev);
@@ -1595,13 +1434,10 @@
 		tmp |= BRSR_AckShortPmb;
 	write_nic_dword(dev, RRSR, tmp);
 
-	if (net->mode & (IEEE_G|IEEE_N_24G))
-	{
+	if (net->mode & (IEEE_G|IEEE_N_24G)) {
 		u8 slot_time = 0;
-		if ((cap & WLAN_CAPABILITY_SHORT_SLOT)&&(!priv->ieee80211->pHTInfo->bCurrentRT2RTLongSlotTime))
-		{//short slot time
+		if ((cap & WLAN_CAPABILITY_SHORT_SLOT) && (!priv->ieee80211->pHTInfo->bCurrentRT2RTLongSlotTime)) //short slot time
 			slot_time = SHORT_SLOT_TIME;
-		}
 		else //long slot time
 			slot_time = NON_SHORT_SLOT_TIME;
 		priv->slot_time = slot_time;
@@ -1616,31 +1452,26 @@
 	struct ieee80211_network *net;
 	u16 BcnTimeCfg = 0, BcnCW = 6, BcnIFS = 0xf;
 	u16 rate_config = 0;
-	net = & priv->ieee80211->current_network;
+	net = &priv->ieee80211->current_network;
 
 	rtl8192_config_rate(dev, &rate_config);
 	priv->basic_rate = rate_config &= 0x15f;
 
-	write_nic_dword(dev,BSSIDR,((u32*)net->bssid)[0]);
-	write_nic_word(dev,BSSIDR+4,((u16*)net->bssid)[2]);
-	//for(i=0;i<ETH_ALEN;i++)
-	//	write_nic_byte(dev,BSSID+i,net->bssid[i]);
+	write_nic_dword(dev, BSSIDR, ((u32 *)net->bssid)[0]);
+	write_nic_word(dev, BSSIDR+4, ((u16 *)net->bssid)[2]);
 
 	rtl8192_update_msr(dev);
-//	rtl8192_update_cap(dev, net->capability);
-	if (priv->ieee80211->iw_mode == IW_MODE_ADHOC)
-	{
-	write_nic_word(dev, ATIMWND, 2);
-	write_nic_word(dev, BCN_DMATIME, 1023);
-	write_nic_word(dev, BCN_INTERVAL, net->beacon_interval);
-//	write_nic_word(dev, BcnIntTime, 100);
-	write_nic_word(dev, BCN_DRV_EARLY_INT, 1);
-	write_nic_byte(dev, BCN_ERR_THRESH, 100);
+	if (priv->ieee80211->iw_mode == IW_MODE_ADHOC) {
+		write_nic_word(dev, ATIMWND, 2);
+		write_nic_word(dev, BCN_DMATIME, 1023);
+		write_nic_word(dev, BCN_INTERVAL, net->beacon_interval);
+		write_nic_word(dev, BCN_DRV_EARLY_INT, 1);
+		write_nic_byte(dev, BCN_ERR_THRESH, 100);
 		BcnTimeCfg |= (BcnCW<<BCN_TCFG_CW_SHIFT);
-	// TODO: BcnIFS may required to be changed on ASIC
+		// TODO: BcnIFS may required to be changed on ASIC
 		BcnTimeCfg |= BcnIFS<<BCN_TCFG_IFS;
 
-	write_nic_word(dev, BCN_TCFG, BcnTimeCfg);
+		write_nic_word(dev, BCN_TCFG, BcnTimeCfg);
 	}
 
 
@@ -1649,46 +1480,37 @@
 
 //temporary hw beacon is not used any more.
 //open it when necessary
-void rtl819xusb_beacon_tx(struct net_device *dev,u16  tx_rate)
+void rtl819xusb_beacon_tx(struct net_device *dev, u16  tx_rate)
 {
 
 }
 inline u8 rtl8192_IsWirelessBMode(u16 rate)
 {
-	if( ((rate <= 110) && (rate != 60) && (rate != 90)) || (rate == 220) )
+	if (((rate <= 110) && (rate != 60) && (rate != 90)) || (rate == 220))
 		return 1;
 	else return 0;
 }
 
 u16 N_DBPSOfRate(u16 DataRate);
 
-u16 ComputeTxTime(
-	u16		FrameLength,
-	u16		DataRate,
-	u8		bManagementFrame,
-	u8		bShortPreamble
-)
+u16 ComputeTxTime(u16 FrameLength, u16 DataRate, u8 bManagementFrame,
+		  u8 bShortPreamble)
 {
 	u16	FrameTime;
 	u16	N_DBPS;
 	u16	Ceiling;
 
-	if( rtl8192_IsWirelessBMode(DataRate) )
-	{
-		if( bManagementFrame || !bShortPreamble || DataRate == 10 )
-		{	// long preamble
+	if (rtl8192_IsWirelessBMode(DataRate)) {
+		if (bManagementFrame || !bShortPreamble || DataRate == 10) // long preamble
 			FrameTime = (u16)(144+48+(FrameLength*8/(DataRate/10)));
-		}
-		else
-		{	// Short preamble
+		else // Short preamble
 			FrameTime = (u16)(72+24+(FrameLength*8/(DataRate/10)));
-		}
-		if( ( FrameLength*8 % (DataRate/10) ) != 0 ) //Get the Ceilling
-				FrameTime ++;
+		if ((FrameLength*8 % (DataRate/10)) != 0) //Get the Ceilling
+			FrameTime++;
 	} else {	//802.11g DSSS-OFDM PLCP length field calculation.
 		N_DBPS = N_DBPSOfRate(DataRate);
 		Ceiling = (16 + 8*FrameLength + 6) / N_DBPS
-				+ (((16 + 8*FrameLength + 6) % N_DBPS) ? 1 : 0);
+			+ (((16 + 8*FrameLength + 6) % N_DBPS) ? 1 : 0);
 		FrameTime = (u16)(16 + 4 + 4*Ceiling + 6);
 	}
 	return FrameTime;
@@ -1696,47 +1518,46 @@
 
 u16 N_DBPSOfRate(u16 DataRate)
 {
-	 u16 N_DBPS = 24;
+	u16 N_DBPS = 24;
 
-	 switch(DataRate)
-	 {
-	 case 60:
-	  N_DBPS = 24;
-	  break;
+	switch (DataRate) {
+	case 60:
+		N_DBPS = 24;
+		break;
 
-	 case 90:
-	  N_DBPS = 36;
-	  break;
+	case 90:
+		N_DBPS = 36;
+		break;
 
-	 case 120:
-	  N_DBPS = 48;
-	  break;
+	case 120:
+		N_DBPS = 48;
+		break;
 
-	 case 180:
-	  N_DBPS = 72;
-	  break;
+	case 180:
+		N_DBPS = 72;
+		break;
 
-	 case 240:
-	  N_DBPS = 96;
-	  break;
+	case 240:
+		N_DBPS = 96;
+		break;
 
-	 case 360:
-	  N_DBPS = 144;
-	  break;
+	case 360:
+		N_DBPS = 144;
+		break;
 
-	 case 480:
-	  N_DBPS = 192;
-	  break;
+	case 480:
+		N_DBPS = 192;
+		break;
 
-	 case 540:
-	  N_DBPS = 216;
-	  break;
+	case 540:
+		N_DBPS = 216;
+		break;
 
-	 default:
-	  break;
-	 }
+	default:
+		break;
+	}
 
-	 return N_DBPS;
+	return N_DBPS;
 }
 
 void rtl819xU_cmd_isr(struct urb *tx_cmd_urb, struct pt_regs *regs)
@@ -1744,11 +1565,10 @@
 	usb_free_urb(tx_cmd_urb);
 }
 
-unsigned int txqueue2outpipe(struct r8192_priv* priv,unsigned int tx_queue) {
-
-	if(tx_queue >= 9)
-	{
-		RT_TRACE(COMP_ERR,"%s():Unknown queue ID!!!\n",__FUNCTION__);
+unsigned int txqueue2outpipe(struct r8192_priv *priv, unsigned int tx_queue)
+{
+	if (tx_queue >= 9) {
+		RT_TRACE(COMP_ERR, "%s():Unknown queue ID!!!\n", __func__);
 		return 0x04;
 	}
 	return priv->txqueue_to_outpipemap[tx_queue];
@@ -1757,19 +1577,16 @@
 short rtl819xU_tx_cmd(struct net_device *dev, struct sk_buff *skb)
 {
 	struct r8192_priv *priv = ieee80211_priv(dev);
-	//u8			*tx;
 	int			status;
 	struct urb		*tx_urb;
-	//int			urb_buf_len;
 	unsigned int		idx_pipe;
 	tx_desc_cmd_819x_usb *pdesc = (tx_desc_cmd_819x_usb *)skb->data;
 	cb_desc *tcb_desc = (cb_desc *)(skb->cb + MAX_DEV_ADDR_SIZE);
 	u8 queue_index = tcb_desc->queue_index;
 
-	//printk("\n %s::queue_index = %d\n",__FUNCTION__, queue_index);
 	atomic_inc(&priv->tx_pending[queue_index]);
-	tx_urb = usb_alloc_urb(0,GFP_ATOMIC);
-	if(!tx_urb){
+	tx_urb = usb_alloc_urb(0, GFP_ATOMIC);
+	if (!tx_urb) {
 		dev_kfree_skb(skb);
 		return -ENOMEM;
 	}
@@ -1788,27 +1605,26 @@
 	//----------------------------------------------------------------------------
 	// Get index to out pipe from specified QueueID.
 #ifndef USE_ONE_PIPE
-	idx_pipe = txqueue2outpipe(priv,queue_index);
+	idx_pipe = txqueue2outpipe(priv, queue_index);
 #else
 	idx_pipe = 0x04;
 #endif
 #ifdef JOHN_DUMP_TXDESC
 	int i;
-	printk("<Tx descriptor>--rate %x---",rate);
+	printk("<Tx descriptor>--rate %x---", rate);
 	for (i = 0; i < 8; i++)
 		printk("%8x ", tx[i]);
 	printk("\n");
 #endif
-	usb_fill_bulk_urb(tx_urb,priv->udev, usb_sndbulkpipe(priv->udev,idx_pipe), \
-			skb->data, skb->len, rtl8192_tx_isr, skb);
+	usb_fill_bulk_urb(tx_urb, priv->udev, usb_sndbulkpipe(priv->udev, idx_pipe),
+			  skb->data, skb->len, rtl8192_tx_isr, skb);
 
 	status = usb_submit_urb(tx_urb, GFP_ATOMIC);
 
-	if (!status){
+	if (!status) {
 		return 0;
-	}else{
-		DMESGE("Error TX CMD URB, error %d",
-				status);
+	} else {
+		DMESGE("Error TX CMD URB, error %d", status);
 		return -1;
 	}
 }
@@ -1824,21 +1640,21 @@
 {
 	u8 QueueSelect = 0x0;       //defualt set to
 
-	switch(QueueID) {
+	switch (QueueID) {
 	case BE_QUEUE:
-		QueueSelect = QSLT_BE;  //or QSelect = pTcb->priority;
+		QueueSelect = QSLT_BE;
 		break;
 
 	case BK_QUEUE:
-		QueueSelect = QSLT_BK;  //or QSelect = pTcb->priority;
+		QueueSelect = QSLT_BK;
 		break;
 
 	case VO_QUEUE:
-		QueueSelect = QSLT_VO;  //or QSelect = pTcb->priority;
+		QueueSelect = QSLT_VO;
 		break;
 
 	case VI_QUEUE:
-		QueueSelect = QSLT_VI;  //or QSelect = pTcb->priority;
+		QueueSelect = QSLT_VI;
 		break;
 	case MGNT_QUEUE:
 		QueueSelect = QSLT_MGNT;
@@ -1850,11 +1666,9 @@
 
 		// TODO: 2006.10.30 mark other queue selection until we verify it is OK
 		// TODO: Remove Assertions
-//#if (RTL819X_FPGA_VER & RTL819X_FPGA_GUANGAN_070502)
 	case TXCMD_QUEUE:
 		QueueSelect = QSLT_CMD;
 		break;
-//#endif
 	case HIGH_QUEUE:
 		QueueSelect = QSLT_HIGH;
 		break;
@@ -1870,7 +1684,7 @@
 {
 	u8  ret = DESC90_RATE1M;
 
-	switch(rate) {
+	switch (rate) {
 	case MGN_1M:    ret = DESC90_RATE1M;    break;
 	case MGN_2M:    ret = DESC90_RATE2M;    break;
 	case MGN_5_5M:  ret = DESC90_RATE5_5M;  break;
@@ -1913,9 +1727,9 @@
 {
 	u8   tmp_Short;
 
-	tmp_Short = (TxHT==1)?((tcb_desc->bUseShortGI)?1:0):((tcb_desc->bUseShortPreamble)?1:0);
+	tmp_Short = (TxHT == 1) ? ((tcb_desc->bUseShortGI) ? 1 : 0) : ((tcb_desc->bUseShortPreamble) ? 1 : 0);
 
-	if(TxHT==1 && TxRate != DESC90_RATEMCS15)
+	if (TxHT == 1 && TxRate != DESC90_RATEMCS15)
 		tmp_Short = 0;
 
 	return tmp_Short;
@@ -1931,7 +1745,7 @@
  * skb->cb will contain all the following information,
  * priority, morefrag, rate, &dev.
  * */
-short rtl8192_tx(struct net_device *dev, struct sk_buff* skb)
+short rtl8192_tx(struct net_device *dev, struct sk_buff *skb)
 {
 	struct r8192_priv *priv = ieee80211_priv(dev);
 	cb_desc *tcb_desc = (cb_desc *)(skb->cb + MAX_DEV_ADDR_SIZE);
@@ -1941,35 +1755,32 @@
 	int pend;
 	int status;
 	struct urb *tx_urb = NULL, *tx_urb_zero = NULL;
-	//int urb_len;
 	unsigned int idx_pipe;
-//	RT_DEBUG_DATA(COMP_SEND, tcb_desc, sizeof(cb_desc));
-//	printk("=============> %s\n", __FUNCTION__);
 	pend = atomic_read(&priv->tx_pending[tcb_desc->queue_index]);
 	/* we are locked here so the two atomic_read and inc are executed
 	 * without interleaves
 	 * !!! For debug purpose
 	 */
-	if( pend > MAX_TX_URB){
-		printk("To discard skb packet!\n");
+	if (pend > MAX_TX_URB) {
+		netdev_dbg(dev, "To discard skb packet!\n");
 		dev_kfree_skb_any(skb);
 		return -1;
 	}
 
-	tx_urb = usb_alloc_urb(0,GFP_ATOMIC);
-	if(!tx_urb){
+	tx_urb = usb_alloc_urb(0, GFP_ATOMIC);
+	if (!tx_urb) {
 		dev_kfree_skb_any(skb);
 		return -ENOMEM;
 	}
 
 	/* Fill Tx firmware info */
-	memset(tx_fwinfo,0,sizeof(tx_fwinfo_819x_usb));
+	memset(tx_fwinfo, 0, sizeof(tx_fwinfo_819x_usb));
 	/* DWORD 0 */
-	tx_fwinfo->TxHT = (tcb_desc->data_rate&0x80)?1:0;
+	tx_fwinfo->TxHT = (tcb_desc->data_rate&0x80) ? 1 : 0;
 	tx_fwinfo->TxRate = MRateToHwRate8190Pci(tcb_desc->data_rate);
 	tx_fwinfo->EnableCPUDur = tcb_desc->bTxEnableFwCalcDur;
 	tx_fwinfo->Short = QueryIsShort(tx_fwinfo->TxHT, tx_fwinfo->TxRate, tcb_desc);
-	if(tcb_desc->bAMPDUEnable) {//AMPDU enabled
+	if (tcb_desc->bAMPDUEnable) {//AMPDU enabled
 		tx_fwinfo->AllowAggregation = 1;
 		/* DWORD 1 */
 		tx_fwinfo->RxMF = tcb_desc->ampdu_factor;
@@ -1982,20 +1793,19 @@
 	}
 
 	/* Protection mode related */
-	tx_fwinfo->RtsEnable = (tcb_desc->bRTSEnable)?1:0;
-	tx_fwinfo->CtsEnable = (tcb_desc->bCTSEnable)?1:0;
-	tx_fwinfo->RtsSTBC = (tcb_desc->bRTSSTBC)?1:0;
-	tx_fwinfo->RtsHT = (tcb_desc->rts_rate&0x80)?1:0;
+	tx_fwinfo->RtsEnable = (tcb_desc->bRTSEnable) ? 1 : 0;
+	tx_fwinfo->CtsEnable = (tcb_desc->bCTSEnable) ? 1 : 0;
+	tx_fwinfo->RtsSTBC = (tcb_desc->bRTSSTBC) ? 1 : 0;
+	tx_fwinfo->RtsHT = (tcb_desc->rts_rate&0x80) ? 1 : 0;
 	tx_fwinfo->RtsRate =  MRateToHwRate8190Pci((u8)tcb_desc->rts_rate);
-	tx_fwinfo->RtsSubcarrier = (tx_fwinfo->RtsHT==0)?(tcb_desc->RTSSC):0;
-	tx_fwinfo->RtsBandwidth = (tx_fwinfo->RtsHT==1)?((tcb_desc->bRTSBW)?1:0):0;
-	tx_fwinfo->RtsShort = (tx_fwinfo->RtsHT==0)?(tcb_desc->bRTSUseShortPreamble?1:0):\
-				(tcb_desc->bRTSUseShortGI?1:0);
+	tx_fwinfo->RtsSubcarrier = (tx_fwinfo->RtsHT == 0) ? (tcb_desc->RTSSC) : 0;
+	tx_fwinfo->RtsBandwidth = (tx_fwinfo->RtsHT == 1) ? ((tcb_desc->bRTSBW) ? 1 : 0) : 0;
+	tx_fwinfo->RtsShort = (tx_fwinfo->RtsHT == 0) ? (tcb_desc->bRTSUseShortPreamble ? 1 : 0) :
+		              (tcb_desc->bRTSUseShortGI ? 1 : 0);
 
 	/* Set Bandwidth and sub-channel settings. */
-	if(priv->CurrentChannelBW == HT_CHANNEL_WIDTH_20_40)
-	{
-		if(tcb_desc->bPacketBW) {
+	if (priv->CurrentChannelBW == HT_CHANNEL_WIDTH_20_40) {
+		if (tcb_desc->bPacketBW) {
 			tx_fwinfo->TxBandwidth = 1;
 			tx_fwinfo->TxSubCarrier = 0;    //By SD3's Jerry suggestion, use duplicated mode
 		} else {
@@ -2009,9 +1819,7 @@
 
 #ifdef USB_TX_DRIVER_AGGREGATION_ENABLE
 	if (tcb_desc->drv_agg_enable)
-	{
 		tx_fwinfo->Tx_INFO_RSVD = (tcb_desc->DrvAggrNum & 0x1f) << 1;
-	}
 #endif
 	/* Fill Tx descriptor */
 	memset(tx_desc, 0, sizeof(tx_desc_819x_usb));
@@ -2021,45 +1829,40 @@
 	tx_desc->Offset =  sizeof(tx_fwinfo_819x_usb) + 8;
 
 #ifdef USB_TX_DRIVER_AGGREGATION_ENABLE
-	if (tcb_desc->drv_agg_enable) {
+	if (tcb_desc->drv_agg_enable)
 		tx_desc->PktSize = tcb_desc->pkt_size;
-	} else
+	else
 #endif
 	{
 		tx_desc->PktSize = (skb->len - TX_PACKET_SHIFT_BYTES) & 0xffff;
 	}
 
 	/*DWORD 1*/
-	tx_desc->SecCAMID= 0;
+	tx_desc->SecCAMID = 0;
 	tx_desc->RATid = tcb_desc->RATRIndex;
-	{
-		//MPDUOverhead = 0;
-		tx_desc->NoEnc = 1;
-	}
+	tx_desc->NoEnc = 1;
 	tx_desc->SecType = 0x0;
-		if (tcb_desc->bHwSec)
-			{
-				switch (priv->ieee80211->pairwise_key_type)
-				{
-					case KEY_TYPE_WEP40:
-					case KEY_TYPE_WEP104:
-						 tx_desc->SecType = 0x1;
-						 tx_desc->NoEnc = 0;
-						 break;
-					case KEY_TYPE_TKIP:
-						 tx_desc->SecType = 0x2;
-						 tx_desc->NoEnc = 0;
-						 break;
-					case KEY_TYPE_CCMP:
-						 tx_desc->SecType = 0x3;
-						 tx_desc->NoEnc = 0;
-						 break;
-					case KEY_TYPE_NA:
-						 tx_desc->SecType = 0x0;
-						 tx_desc->NoEnc = 1;
-						 break;
-				}
-			}
+	if (tcb_desc->bHwSec) {
+		switch (priv->ieee80211->pairwise_key_type) {
+		case KEY_TYPE_WEP40:
+		case KEY_TYPE_WEP104:
+			tx_desc->SecType = 0x1;
+			tx_desc->NoEnc = 0;
+			break;
+		case KEY_TYPE_TKIP:
+			tx_desc->SecType = 0x2;
+			tx_desc->NoEnc = 0;
+			break;
+		case KEY_TYPE_CCMP:
+			tx_desc->SecType = 0x3;
+			tx_desc->NoEnc = 0;
+			break;
+		case KEY_TYPE_NA:
+			tx_desc->SecType = 0x0;
+			tx_desc->NoEnc = 1;
+			break;
+		}
+	}
 
 	tx_desc->QueueSelect = MapHwQueueToFirmwareQueue(tcb_desc->queue_index);
 	tx_desc->TxFWInfoSize =  sizeof(tx_fwinfo_819x_usb);
@@ -2084,48 +1887,41 @@
 	}
 	/* Get index to out pipe from specified QueueID */
 #ifndef USE_ONE_PIPE
-	idx_pipe = txqueue2outpipe(priv,tcb_desc->queue_index);
+	idx_pipe = txqueue2outpipe(priv, tcb_desc->queue_index);
 #else
 	idx_pipe = 0x5;
 #endif
 
-	//RT_DEBUG_DATA(COMP_SEND,tx_fwinfo,sizeof(tx_fwinfo_819x_usb));
-	//RT_DEBUG_DATA(COMP_SEND,tx_desc,sizeof(tx_desc_819x_usb));
-
 	/* To submit bulk urb */
-	usb_fill_bulk_urb(tx_urb,udev,
-			usb_sndbulkpipe(udev,idx_pipe), skb->data,
-			skb->len, rtl8192_tx_isr, skb);
+	usb_fill_bulk_urb(tx_urb, udev,
+			  usb_sndbulkpipe(udev, idx_pipe), skb->data,
+			  skb->len, rtl8192_tx_isr, skb);
 
 	status = usb_submit_urb(tx_urb, GFP_ATOMIC);
-	if (!status){
-//we need to send 0 byte packet whenever 512N bytes/64N(HIGN SPEED/NORMAL SPEED) bytes packet has been transmitted. Otherwise, it will be halt to wait for another packet. WB. 2008.08.27
+	if (!status) {
+		//we need to send 0 byte packet whenever 512N bytes/64N(HIGN SPEED/NORMAL SPEED) bytes packet has been transmitted. Otherwise, it will be halt to wait for another packet. WB. 2008.08.27
 		bool bSend0Byte = false;
 		u8 zero = 0;
-		if(udev->speed == USB_SPEED_HIGH)
-		{
+		if (udev->speed == USB_SPEED_HIGH) {
 			if (skb->len > 0 && skb->len % 512 == 0)
 				bSend0Byte = true;
-		}
-		else
-		{
+		} else {
 			if (skb->len > 0 && skb->len % 64 == 0)
 				bSend0Byte = true;
 		}
-		if (bSend0Byte)
-		{
-			tx_urb_zero = usb_alloc_urb(0,GFP_ATOMIC);
-			if(!tx_urb_zero){
+		if (bSend0Byte) {
+			tx_urb_zero = usb_alloc_urb(0, GFP_ATOMIC);
+			if (!tx_urb_zero) {
 				RT_TRACE(COMP_ERR, "can't alloc urb for zero byte\n");
 				return -ENOMEM;
 			}
-			usb_fill_bulk_urb(tx_urb_zero,udev,
-					usb_sndbulkpipe(udev,idx_pipe), &zero,
-					0, tx_zero_isr, dev);
+			usb_fill_bulk_urb(tx_urb_zero, udev,
+					  usb_sndbulkpipe(udev, idx_pipe), &zero,
+					  0, tx_zero_isr, dev);
 			status = usb_submit_urb(tx_urb_zero, GFP_ATOMIC);
-			if (status){
-			RT_TRACE(COMP_ERR, "Error TX URB for zero byte %d, error %d", atomic_read(&priv->tx_pending[tcb_desc->queue_index]), status);
-			return -1;
+			if (status) {
+				RT_TRACE(COMP_ERR, "Error TX URB for zero byte %d, error %d", atomic_read(&priv->tx_pending[tcb_desc->queue_index]), status);
+				return -1;
 			}
 		}
 		dev->trans_start = jiffies;
@@ -2133,7 +1929,7 @@
 		return 0;
 	} else {
 		RT_TRACE(COMP_ERR, "Error TX URB %d, error %d", atomic_read(&priv->tx_pending[tcb_desc->queue_index]),
-				status);
+			 status);
 		return -1;
 	}
 }
@@ -2143,14 +1939,14 @@
 	struct r8192_priv *priv = ieee80211_priv(dev);
 
 	priv->rx_urb = kmalloc(sizeof(struct urb *) * (MAX_RX_URB+1),
-				GFP_KERNEL);
+			       GFP_KERNEL);
 	if (priv->rx_urb == NULL)
 		return -ENOMEM;
 
 #ifndef JACKSON_NEW_RX
-	for(i=0;i<(MAX_RX_URB+1);i++){
+	for (i = 0; i < (MAX_RX_URB+1); i++) {
 
-		priv->rx_urb[i] = usb_alloc_urb(0,GFP_KERNEL);
+		priv->rx_urb[i] = usb_alloc_urb(0, GFP_KERNEL);
 
 		priv->rx_urb[i]->transfer_buffer = kmalloc(RX_URB_SIZE, GFP_KERNEL);
 
@@ -2159,26 +1955,26 @@
 #endif
 
 #ifdef THOMAS_BEACON
-{
-	long align = 0;
-	void *oldaddr, *newaddr;
+	{
+		long align = 0;
+		void *oldaddr, *newaddr;
 
-	priv->rx_urb[16] = usb_alloc_urb(0, GFP_KERNEL);
-	priv->oldaddr = kmalloc(16, GFP_KERNEL);
-	oldaddr = priv->oldaddr;
-	align = ((long)oldaddr) & 3;
-	if (align) {
-		newaddr = oldaddr + 4 - align;
-		priv->rx_urb[16]->transfer_buffer_length = 16 - 4 + align;
-	} else {
-		newaddr = oldaddr;
-		priv->rx_urb[16]->transfer_buffer_length = 16;
+		priv->rx_urb[16] = usb_alloc_urb(0, GFP_KERNEL);
+		priv->oldaddr = kmalloc(16, GFP_KERNEL);
+		oldaddr = priv->oldaddr;
+		align = ((long)oldaddr) & 3;
+		if (align) {
+			newaddr = oldaddr + 4 - align;
+			priv->rx_urb[16]->transfer_buffer_length = 16 - 4 + align;
+		} else {
+			newaddr = oldaddr;
+			priv->rx_urb[16]->transfer_buffer_length = 16;
+		}
+		priv->rx_urb[16]->transfer_buffer = newaddr;
 	}
-	priv->rx_urb[16]->transfer_buffer = newaddr;
-}
 #endif
 
-	memset(priv->rx_urb, 0, sizeof(struct urb*) * MAX_RX_URB);
+	memset(priv->rx_urb, 0, sizeof(struct urb *) * MAX_RX_URB);
 	priv->pp_rxskb = kcalloc(MAX_RX_URB, sizeof(struct sk_buff *),
 				 GFP_KERNEL);
 	if (!priv->pp_rxskb) {
@@ -2191,7 +1987,7 @@
 		return -ENOMEM;
 	}
 
-	printk("End of initendpoints\n");
+	netdev_dbg(dev, "End of initendpoints\n");
 	return 0;
 
 }
@@ -2201,8 +1997,8 @@
 	int i;
 	struct r8192_priv *priv = ieee80211_priv(dev);
 
-	if(priv->rx_urb){
-		for(i=0;i<(MAX_RX_URB+1);i++){
+	if (priv->rx_urb) {
+		for (i = 0; i < (MAX_RX_URB+1); i++) {
 			usb_kill_urb(priv->rx_urb[i]);
 			usb_free_urb(priv->rx_urb[i]);
 		}
@@ -2224,8 +2020,8 @@
 
 #ifndef JACKSON_NEW_RX
 
-	if(priv->rx_urb){
-		for(i=0;i<(MAX_RX_URB+1);i++){
+	if (priv->rx_urb) {
+		for (i = 0; i < (MAX_RX_URB+1); i++) {
 			usb_kill_urb(priv->rx_urb[i]);
 			kfree(priv->rx_urb[i]->transfer_buffer);
 			usb_free_urb(priv->rx_urb[i]);
@@ -2249,54 +2045,45 @@
 }
 #endif
 
-extern void rtl8192_update_ratr_table(struct net_device* dev);
+extern void rtl8192_update_ratr_table(struct net_device *dev);
 void rtl8192_link_change(struct net_device *dev)
 {
-//	int i;
-
 	struct r8192_priv *priv = ieee80211_priv(dev);
-	struct ieee80211_device* ieee = priv->ieee80211;
-	//write_nic_word(dev, BCN_INTR_ITV, net->beacon_interval);
-	if (ieee->state == IEEE80211_LINKED)
-	{
+	struct ieee80211_device *ieee = priv->ieee80211;
+	if (ieee->state == IEEE80211_LINKED) {
 		rtl8192_net_update(dev);
 		rtl8192_update_ratr_table(dev);
 		//add this as in pure N mode, wep encryption will use software way, but there is no chance to set this as wep will not set group key in wext. WB.2008.07.08
 		if ((KEY_TYPE_WEP40 == ieee->pairwise_key_type) || (KEY_TYPE_WEP104 == ieee->pairwise_key_type))
-		EnableHWSecurityConfig8192(dev);
+			EnableHWSecurityConfig8192(dev);
 	}
 	/*update timing params*/
-//	RT_TRACE(COMP_CH, "========>%s(), chan:%d\n", __FUNCTION__, priv->chan);
-//	rtl8192_set_chan(dev, priv->chan);
-	 if (ieee->iw_mode == IW_MODE_INFRA || ieee->iw_mode == IW_MODE_ADHOC)
-	{
+	if (ieee->iw_mode == IW_MODE_INFRA || ieee->iw_mode == IW_MODE_ADHOC) {
 		u32 reg = 0;
-		reg = read_nic_dword(dev, RCR);
+		read_nic_dword(dev, RCR, &reg);
 		if (priv->ieee80211->state == IEEE80211_LINKED)
 			priv->ReceiveConfig = reg |= RCR_CBSSID;
 		else
 			priv->ReceiveConfig = reg &= ~RCR_CBSSID;
 		write_nic_dword(dev, RCR, reg);
 	}
-
-//	rtl8192_set_rxconf(dev);
 }
 
 static struct ieee80211_qos_parameters def_qos_parameters = {
-	{3,3,3,3},/* cw_min */
-	{7,7,7,7},/* cw_max */
-	{2,2,2,2},/* aifs */
-	{0,0,0,0},/* flags */
-	{0,0,0,0} /* tx_op_limit */
+	{3, 3, 3, 3},/* cw_min */
+	{7, 7, 7, 7},/* cw_max */
+	{2, 2, 2, 2},/* aifs */
+	{0, 0, 0, 0},/* flags */
+	{0, 0, 0, 0} /* tx_op_limit */
 };
 
 
-void rtl8192_update_beacon(struct work_struct * work)
+void rtl8192_update_beacon(struct work_struct *work)
 {
 	struct r8192_priv *priv = container_of(work, struct r8192_priv, update_beacon_wq.work);
 	struct net_device *dev = priv->ieee80211->dev;
-	struct ieee80211_device* ieee = priv->ieee80211;
-	struct ieee80211_network* net = &ieee->current_network;
+	struct ieee80211_device *ieee = priv->ieee80211;
+	struct ieee80211_network *net = &ieee->current_network;
 
 	if (ieee->pHTInfo->bCurrentHTSupport)
 		HTUpdateSelfAndPeerSetting(ieee, net);
@@ -2306,14 +2093,13 @@
 /*
 * background support to run QoS activate functionality
 */
-int WDCAPARA_ADD[] = {EDCAPARA_BE,EDCAPARA_BK,EDCAPARA_VI,EDCAPARA_VO};
-void rtl8192_qos_activate(struct work_struct * work)
+int WDCAPARA_ADD[] = {EDCAPARA_BE, EDCAPARA_BK, EDCAPARA_VI, EDCAPARA_VO};
+void rtl8192_qos_activate(struct work_struct *work)
 {
 	struct r8192_priv *priv = container_of(work, struct r8192_priv, qos_activate);
 	struct net_device *dev = priv->ieee80211->dev;
 	struct ieee80211_qos_parameters *qos_parameters = &priv->ieee80211->current_network.qos_data.parameters;
 	u8 mode = priv->ieee80211->current_network.mode;
-	//u32 size = sizeof(struct ieee80211_qos_parameters);
 	u8  u1bAIFS;
 	u32 u4bAcParam;
 	int i;
@@ -2321,37 +2107,36 @@
 	if (priv == NULL)
 		return;
 
-       mutex_lock(&priv->mutex);
-	if(priv->ieee80211->state != IEEE80211_LINKED)
+	mutex_lock(&priv->mutex);
+	if (priv->ieee80211->state != IEEE80211_LINKED)
 		goto success;
-	RT_TRACE(COMP_QOS,"qos active process with associate response received\n");
+	RT_TRACE(COMP_QOS, "qos active process with associate response received\n");
 	/* It better set slot time at first */
 	/* For we just support b/g mode at present, let the slot time at 9/20 selection */
 	/* update the ac parameter to related registers */
-	for(i = 0; i <  QOS_QUEUE_NUM; i++) {
+	for (i = 0; i <  QOS_QUEUE_NUM; i++) {
 		//Mode G/A: slotTimeTimer = 9; Mode B: 20
-		u1bAIFS = qos_parameters->aifs[i] * ((mode&(IEEE_G|IEEE_N_24G)) ?9:20) + aSifsTime;
+		u1bAIFS = qos_parameters->aifs[i] * ((mode&(IEEE_G|IEEE_N_24G)) ? 9 : 20) + aSifsTime;
 		u4bAcParam = ((((u32)(qos_parameters->tx_op_limit[i]))<< AC_PARAM_TXOP_LIMIT_OFFSET)|
-				(((u32)(qos_parameters->cw_max[i]))<< AC_PARAM_ECW_MAX_OFFSET)|
-				(((u32)(qos_parameters->cw_min[i]))<< AC_PARAM_ECW_MIN_OFFSET)|
-				((u32)u1bAIFS << AC_PARAM_AIFS_OFFSET));
+			      (((u32)(qos_parameters->cw_max[i]))<< AC_PARAM_ECW_MAX_OFFSET)|
+			      (((u32)(qos_parameters->cw_min[i]))<< AC_PARAM_ECW_MIN_OFFSET)|
+			      ((u32)u1bAIFS << AC_PARAM_AIFS_OFFSET));
 
 		write_nic_dword(dev, WDCAPARA_ADD[i], u4bAcParam);
-		//write_nic_dword(dev, WDCAPARA_ADD[i], 0x005e4332);
 	}
 
 success:
-       mutex_unlock(&priv->mutex);
+	mutex_unlock(&priv->mutex);
 }
 
 static int rtl8192_qos_handle_probe_response(struct r8192_priv *priv,
-		int active_network,
-		struct ieee80211_network *network)
+					     int active_network,
+					     struct ieee80211_network *network)
 {
 	int ret = 0;
 	u32 size = sizeof(struct ieee80211_qos_parameters);
 
-	if(priv->ieee80211->state !=IEEE80211_LINKED)
+	if (priv->ieee80211->state != IEEE80211_LINKED)
 		return ret;
 
 	if ((priv->ieee80211->iw_mode != IW_MODE_INFRA))
@@ -2359,21 +2144,21 @@
 
 	if (network->flags & NETWORK_HAS_QOS_MASK) {
 		if (active_network &&
-				(network->flags & NETWORK_HAS_QOS_PARAMETERS))
+		    (network->flags & NETWORK_HAS_QOS_PARAMETERS))
 			network->qos_data.active = network->qos_data.supported;
 
 		if ((network->qos_data.active == 1) && (active_network == 1) &&
-				(network->flags & NETWORK_HAS_QOS_PARAMETERS) &&
-				(network->qos_data.old_param_count !=
-				 network->qos_data.param_count)) {
+		    (network->flags & NETWORK_HAS_QOS_PARAMETERS) &&
+		    (network->qos_data.old_param_count !=
+		     network->qos_data.param_count)) {
 			network->qos_data.old_param_count =
 				network->qos_data.param_count;
 			queue_work(priv->priv_wq, &priv->qos_activate);
-			RT_TRACE (COMP_QOS, "QoS parameters change call "
-					"qos_activate\n");
+			RT_TRACE(COMP_QOS, "QoS parameters change call "
+				 "qos_activate\n");
 		}
 	} else {
-		memcpy(&priv->ieee80211->current_network.qos_data.parameters,\
+		memcpy(&priv->ieee80211->current_network.qos_data.parameters,
 		       &def_qos_parameters, size);
 
 		if ((network->qos_data.active == 1) && (active_network == 1)) {
@@ -2388,13 +2173,13 @@
 }
 
 /* handle and manage frame from beacon and probe response */
-static int rtl8192_handle_beacon(struct net_device * dev,
-			      struct ieee80211_beacon * beacon,
-			      struct ieee80211_network * network)
+static int rtl8192_handle_beacon(struct net_device *dev,
+				 struct ieee80211_beacon *beacon,
+				 struct ieee80211_network *network)
 {
 	struct r8192_priv *priv = ieee80211_priv(dev);
 
-	rtl8192_qos_handle_probe_response(priv,1,network);
+	rtl8192_qos_handle_probe_response(priv, 1, network);
 	queue_delayed_work(priv->priv_wq, &priv->update_beacon_wq, 0);
 	return 0;
 
@@ -2406,7 +2191,7 @@
 * setting
 */
 static int rtl8192_qos_association_resp(struct r8192_priv *priv,
-				    struct ieee80211_network *network)
+					struct ieee80211_network *network)
 {
 	int ret = 0;
 	unsigned long flags;
@@ -2416,28 +2201,26 @@
 	if ((priv == NULL) || (network == NULL))
 		return ret;
 
-	if(priv->ieee80211->state !=IEEE80211_LINKED)
+	if (priv->ieee80211->state != IEEE80211_LINKED)
 		return ret;
 
 	if ((priv->ieee80211->iw_mode != IW_MODE_INFRA))
 		return ret;
 
 	spin_lock_irqsave(&priv->ieee80211->lock, flags);
-	if(network->flags & NETWORK_HAS_QOS_PARAMETERS) {
-		memcpy(&priv->ieee80211->current_network.qos_data.parameters,\
-			 &network->qos_data.parameters,\
-			sizeof(struct ieee80211_qos_parameters));
+	if (network->flags & NETWORK_HAS_QOS_PARAMETERS) {
+		memcpy(&priv->ieee80211->current_network.qos_data.parameters,
+		       &network->qos_data.parameters,
+		       sizeof(struct ieee80211_qos_parameters));
 		priv->ieee80211->current_network.qos_data.active = 1;
-		 {
-			set_qos_param = 1;
-			/* update qos parameter for current network */
-			priv->ieee80211->current_network.qos_data.old_param_count = \
-				 priv->ieee80211->current_network.qos_data.param_count;
-			priv->ieee80211->current_network.qos_data.param_count = \
-				 network->qos_data.param_count;
-		}
+		set_qos_param = 1;
+		/* update qos parameter for current network */
+		priv->ieee80211->current_network.qos_data.old_param_count =
+			priv->ieee80211->current_network.qos_data.param_count;
+		priv->ieee80211->current_network.qos_data.param_count =
+			network->qos_data.param_count;
 	} else {
-		memcpy(&priv->ieee80211->current_network.qos_data.parameters,\
+		memcpy(&priv->ieee80211->current_network.qos_data.parameters,
 		       &def_qos_parameters, size);
 		priv->ieee80211->current_network.qos_data.active = 0;
 		priv->ieee80211->current_network.qos_data.supported = 0;
@@ -2446,7 +2229,7 @@
 
 	spin_unlock_irqrestore(&priv->ieee80211->lock, flags);
 
-	RT_TRACE(COMP_QOS, "%s: network->flags = %d,%d\n",__FUNCTION__,network->flags ,priv->ieee80211->current_network.qos_data.active);
+	RT_TRACE(COMP_QOS, "%s: network->flags = %d,%d\n", __func__, network->flags, priv->ieee80211->current_network.qos_data.active);
 	if (set_qos_param == 1)
 		queue_work(priv->priv_wq, &priv->qos_activate);
 
@@ -2456,8 +2239,8 @@
 
 
 static int rtl8192_handle_assoc_response(struct net_device *dev,
-				     struct ieee80211_assoc_response_frame *resp,
-				     struct ieee80211_network *network)
+					 struct ieee80211_assoc_response_frame *resp,
+					 struct ieee80211_network *network)
 {
 	struct r8192_priv *priv = ieee80211_priv(dev);
 	rtl8192_qos_association_resp(priv, network);
@@ -2465,79 +2248,70 @@
 }
 
 
-void rtl8192_update_ratr_table(struct net_device* dev)
-	//	POCTET_STRING	posLegacyRate,
-	//	u8*			pMcsRate)
-	//	PRT_WLAN_STA	pEntry)
+void rtl8192_update_ratr_table(struct net_device *dev)
 {
-	struct r8192_priv* priv = ieee80211_priv(dev);
-	struct ieee80211_device* ieee = priv->ieee80211;
-	u8* pMcsRate = ieee->dot11HTOperationalRateSet;
-	//struct ieee80211_network *net = &ieee->current_network;
+	struct r8192_priv *priv = ieee80211_priv(dev);
+	struct ieee80211_device *ieee = priv->ieee80211;
+	u8 *pMcsRate = ieee->dot11HTOperationalRateSet;
 	u32 ratr_value = 0;
 	u8 rate_index = 0;
-	rtl8192_config_rate(dev, (u16*)(&ratr_value));
-	ratr_value |= (*(u16*)(pMcsRate)) << 12;
-//	switch (net->mode)
-	switch (ieee->mode)
-	{
-		case IEEE_A:
-			ratr_value &= 0x00000FF0;
-			break;
-		case IEEE_B:
-			ratr_value &= 0x0000000F;
-			break;
-		case IEEE_G:
-			ratr_value &= 0x00000FF7;
-			break;
-		case IEEE_N_24G:
-		case IEEE_N_5G:
-			if (ieee->pHTInfo->PeerMimoPs == 0) //MIMO_PS_STATIC
-				ratr_value &= 0x0007F007;
-			else{
-				if (priv->rf_type == RF_1T2R)
-					ratr_value &= 0x000FF007;
-				else
-					ratr_value &= 0x0F81F007;
-			}
-			break;
-		default:
-			break;
+	rtl8192_config_rate(dev, (u16 *)(&ratr_value));
+	ratr_value |= (*(u16 *)(pMcsRate)) << 12;
+	switch (ieee->mode) {
+	case IEEE_A:
+		ratr_value &= 0x00000FF0;
+		break;
+	case IEEE_B:
+		ratr_value &= 0x0000000F;
+		break;
+	case IEEE_G:
+		ratr_value &= 0x00000FF7;
+		break;
+	case IEEE_N_24G:
+	case IEEE_N_5G:
+		if (ieee->pHTInfo->PeerMimoPs == 0) {//MIMO_PS_STATIC
+			ratr_value &= 0x0007F007;
+		} else {
+			if (priv->rf_type == RF_1T2R)
+				ratr_value &= 0x000FF007;
+			else
+				ratr_value &= 0x0F81F007;
+		}
+		break;
+	default:
+		break;
 	}
 	ratr_value &= 0x0FFFFFFF;
-	if(ieee->pHTInfo->bCurTxBW40MHz && ieee->pHTInfo->bCurShortGI40MHz){
+	if (ieee->pHTInfo->bCurTxBW40MHz && ieee->pHTInfo->bCurShortGI40MHz)
 		ratr_value |= 0x80000000;
-	}else if(!ieee->pHTInfo->bCurTxBW40MHz && ieee->pHTInfo->bCurShortGI20MHz){
+	else if (!ieee->pHTInfo->bCurTxBW40MHz && ieee->pHTInfo->bCurShortGI20MHz)
 		ratr_value |= 0x80000000;
-	}
 	write_nic_dword(dev, RATR0+rate_index*4, ratr_value);
 	write_nic_byte(dev, UFWP, 1);
 }
 
-static u8 ccmp_ie[4] = {0x00,0x50,0xf2,0x04};
+static u8 ccmp_ie[4] = {0x00, 0x50, 0xf2, 0x04};
 static u8 ccmp_rsn_ie[4] = {0x00, 0x0f, 0xac, 0x04};
-bool GetNmodeSupportBySecCfg8192(struct net_device*dev)
+bool GetNmodeSupportBySecCfg8192(struct net_device *dev)
 {
-	struct r8192_priv* priv = ieee80211_priv(dev);
-	struct ieee80211_device* ieee = priv->ieee80211;
-	struct ieee80211_network * network = &ieee->current_network;
-	int wpa_ie_len= ieee->wpa_ie_len;
-	struct ieee80211_crypt_data* crypt;
+	struct r8192_priv *priv = ieee80211_priv(dev);
+	struct ieee80211_device *ieee = priv->ieee80211;
+	struct ieee80211_network *network = &ieee->current_network;
+	int wpa_ie_len = ieee->wpa_ie_len;
+	struct ieee80211_crypt_data *crypt;
 	int encrypt;
 
 	crypt = ieee->crypt[ieee->tx_keyidx];
 	//we use connecting AP's capability instead of only security config on our driver to distinguish whether it should use N mode or G mode
-	encrypt = (network->capability & WLAN_CAPABILITY_PRIVACY) || (ieee->host_encrypt && crypt && crypt->ops && (0 == strcmp(crypt->ops->name,"WEP")));
+	encrypt = (network->capability & WLAN_CAPABILITY_PRIVACY) || (ieee->host_encrypt && crypt && crypt->ops && (0 == strcmp(crypt->ops->name, "WEP")));
 
 	/* simply judge  */
-	if(encrypt && (wpa_ie_len == 0)) {
+	if (encrypt && (wpa_ie_len == 0)) {
 		/* wep encryption, no N mode setting */
 		return false;
-//	} else if((wpa_ie_len != 0)&&(memcmp(&(ieee->wpa_ie[14]),ccmp_ie,4))) {
-	} else if((wpa_ie_len != 0)) {
+	} else if ((wpa_ie_len != 0)) {
 		/* parse pairwise key type */
-		//if((pairwisekey = WEP40)||(pairwisekey = WEP104)||(pairwisekey = TKIP))
-		if (((ieee->wpa_ie[0] == 0xdd) && (!memcmp(&(ieee->wpa_ie[14]),ccmp_ie,4))) || ((ieee->wpa_ie[0] == 0x30) && (!memcmp(&ieee->wpa_ie[10],ccmp_rsn_ie, 4))))
+		if (((ieee->wpa_ie[0] == 0xdd) && (!memcmp(&(ieee->wpa_ie[14]), ccmp_ie, 4))) || ((ieee->wpa_ie[0] == 0x30) && (!memcmp(&ieee->wpa_ie[10], ccmp_rsn_ie, 4))))
 			return true;
 		else
 			return false;
@@ -2548,13 +2322,13 @@
 	return true;
 }
 
-bool GetHalfNmodeSupportByAPs819xUsb(struct net_device* dev)
+bool GetHalfNmodeSupportByAPs819xUsb(struct net_device *dev)
 {
 	bool			Reval;
-	struct r8192_priv* priv = ieee80211_priv(dev);
-	struct ieee80211_device* ieee = priv->ieee80211;
+	struct r8192_priv *priv = ieee80211_priv(dev);
+	struct ieee80211_device *ieee = priv->ieee80211;
 
-	if(ieee->bHalfWirelessN24GMode == true)
+	if (ieee->bHalfWirelessN24GMode == true)
 		Reval = true;
 	else
 		Reval =  false;
@@ -2562,75 +2336,59 @@
 	return Reval;
 }
 
-void rtl8192_refresh_supportrate(struct r8192_priv* priv)
+void rtl8192_refresh_supportrate(struct r8192_priv *priv)
 {
-	struct ieee80211_device* ieee = priv->ieee80211;
+	struct ieee80211_device *ieee = priv->ieee80211;
 	//we do not consider set support rate for ABG mode, only HT MCS rate is set here.
 	if (ieee->mode == WIRELESS_MODE_N_24G || ieee->mode == WIRELESS_MODE_N_5G)
-	{
 		memcpy(ieee->Regdot11HTOperationalRateSet, ieee->RegHTSuppRateSet, 16);
-		//RT_DEBUG_DATA(COMP_INIT, ieee->RegHTSuppRateSet, 16);
-		//RT_DEBUG_DATA(COMP_INIT, ieee->Regdot11HTOperationalRateSet, 16);
-	}
 	else
 		memset(ieee->Regdot11HTOperationalRateSet, 0, 16);
 	return;
 }
 
-u8 rtl8192_getSupportedWireleeMode(struct net_device*dev)
+u8 rtl8192_getSupportedWireleeMode(struct net_device *dev)
 {
 	struct r8192_priv *priv = ieee80211_priv(dev);
 	u8 ret = 0;
-	switch(priv->rf_chip)
-	{
-		case RF_8225:
-		case RF_8256:
-		case RF_PSEUDO_11N:
-			ret = (WIRELESS_MODE_N_24G|WIRELESS_MODE_G|WIRELESS_MODE_B);
-			break;
-		case RF_8258:
-			ret = (WIRELESS_MODE_A|WIRELESS_MODE_N_5G);
-			break;
-		default:
-			ret = WIRELESS_MODE_B;
-			break;
+	switch (priv->rf_chip) {
+	case RF_8225:
+	case RF_8256:
+	case RF_PSEUDO_11N:
+		ret = (WIRELESS_MODE_N_24G|WIRELESS_MODE_G|WIRELESS_MODE_B);
+		break;
+	case RF_8258:
+		ret = (WIRELESS_MODE_A|WIRELESS_MODE_N_5G);
+		break;
+	default:
+		ret = WIRELESS_MODE_B;
+		break;
 	}
 	return ret;
 }
-void rtl8192_SetWirelessMode(struct net_device* dev, u8 wireless_mode)
+void rtl8192_SetWirelessMode(struct net_device *dev, u8 wireless_mode)
 {
 	struct r8192_priv *priv = ieee80211_priv(dev);
 	u8 bSupportMode = rtl8192_getSupportedWireleeMode(dev);
 
-	if ((wireless_mode == WIRELESS_MODE_AUTO) || ((wireless_mode&bSupportMode)==0))
-	{
-		if(bSupportMode & WIRELESS_MODE_N_24G)
-		{
+	if ((wireless_mode == WIRELESS_MODE_AUTO) || ((wireless_mode&bSupportMode) == 0)) {
+		if (bSupportMode & WIRELESS_MODE_N_24G) {
 			wireless_mode = WIRELESS_MODE_N_24G;
-		}
-		else if(bSupportMode & WIRELESS_MODE_N_5G)
-		{
+		} else if (bSupportMode & WIRELESS_MODE_N_5G) {
 			wireless_mode = WIRELESS_MODE_N_5G;
-		}
-		else if((bSupportMode & WIRELESS_MODE_A))
-		{
+		} else if ((bSupportMode & WIRELESS_MODE_A)) {
 			wireless_mode = WIRELESS_MODE_A;
-		}
-		else if((bSupportMode & WIRELESS_MODE_G))
-		{
+		} else if ((bSupportMode & WIRELESS_MODE_G)) {
 			wireless_mode = WIRELESS_MODE_G;
-		}
-		else if((bSupportMode & WIRELESS_MODE_B))
-		{
+		} else if ((bSupportMode & WIRELESS_MODE_B)) {
 			wireless_mode = WIRELESS_MODE_B;
-		}
-		else{
-			RT_TRACE(COMP_ERR, "%s(), No valid wireless mode supported, SupportedWirelessMode(%x)!!!\n", __FUNCTION__,bSupportMode);
+		} else {
+			RT_TRACE(COMP_ERR, "%s(), No valid wireless mode supported, SupportedWirelessMode(%x)!!!\n", __func__, bSupportMode);
 			wireless_mode = WIRELESS_MODE_B;
 		}
 	}
 #ifdef TO_DO_LIST //// TODO: this function doesn't work well at this time, we should wait for FPGA
-	ActUpdateChannelAccessSetting( pAdapter, pHalData->CurrentWirelessMode, &pAdapter->MgntInfo.Info8185.ChannelAccessSetting );
+	ActUpdateChannelAccessSetting(pAdapter, pHalData->CurrentWirelessMode, &pAdapter->MgntInfo.Info8185.ChannelAccessSetting);
 #endif
 	priv->ieee80211->mode = wireless_mode;
 
@@ -2643,7 +2401,7 @@
 
 }
 //init priv variables here. only non_zero value should be initialized here.
-static void rtl8192_init_priv_variable(struct net_device* dev)
+static void rtl8192_init_priv_variable(struct net_device *dev)
 {
 	struct r8192_priv *priv = ieee80211_priv(dev);
 	u8 i;
@@ -2651,13 +2409,13 @@
 	priv->chan = 1; //set to channel 1
 	priv->ieee80211->mode = WIRELESS_MODE_AUTO; //SET AUTO
 	priv->ieee80211->iw_mode = IW_MODE_INFRA;
-	priv->ieee80211->ieee_up=0;
+	priv->ieee80211->ieee_up = 0;
 	priv->retry_rts = DEFAULT_RETRY_RTS;
 	priv->retry_data = DEFAULT_RETRY_DATA;
 	priv->ieee80211->rts = DEFAULT_RTS_THRESHOLD;
 	priv->ieee80211->rate = 110; //11 mbps
 	priv->ieee80211->short_slot = 1;
-	priv->promisc = (dev->flags & IFF_PROMISC) ? 1:0;
+	priv->promisc = (dev->flags & IFF_PROMISC) ? 1 : 0;
 	priv->CckPwEnl = 6;
 	//for silent reset
 	priv->IrpPendingCount = 1;
@@ -2672,14 +2430,14 @@
 	priv->ieee80211->softmac_features  = IEEE_SOFTMAC_SCAN |
 		IEEE_SOFTMAC_ASSOCIATE | IEEE_SOFTMAC_PROBERQ |
 		IEEE_SOFTMAC_PROBERS | IEEE_SOFTMAC_TX_QUEUE |
-		IEEE_SOFTMAC_BEACONS;//added by amy 080604 //|  //IEEE_SOFTMAC_SINGLE_QUEUE;
+		IEEE_SOFTMAC_BEACONS;//added by amy 080604
 
 	priv->ieee80211->active_scan = 1;
 	priv->ieee80211->modulation = IEEE80211_CCK_MODULATION | IEEE80211_OFDM_MODULATION;
 	priv->ieee80211->host_encrypt = 1;
 	priv->ieee80211->host_decrypt = 1;
-	priv->ieee80211->start_send_beacons = NULL;//rtl819xusb_beacon_tx;//-by amy 080604
-	priv->ieee80211->stop_send_beacons = NULL;//rtl8192_beacon_stop;//-by amy 080604
+	priv->ieee80211->start_send_beacons = NULL; //-by amy 080604
+	priv->ieee80211->stop_send_beacons = NULL;  //-by amy 080604
 	priv->ieee80211->softmac_hard_start_xmit = rtl8192_hard_start_xmit;
 	priv->ieee80211->set_chan = rtl8192_set_chan;
 	priv->ieee80211->link_change = rtl8192_link_change;
@@ -2693,7 +2451,6 @@
 	priv->ieee80211->qos_support = 1;
 
 	//added by WB
-//	priv->ieee80211->SwChnlByTimerHandler = rtl8192_phy_SwChnl;
 	priv->ieee80211->SetBWModeHandler = rtl8192_SetBWMode;
 	priv->ieee80211->handle_assoc_response = rtl8192_handle_assoc_response;
 	priv->ieee80211->handle_beacon = rtl8192_handle_beacon;
@@ -2705,36 +2462,31 @@
 	priv->ieee80211->InitialGainHandler = InitialGain819xUsb;
 	priv->card_type = USB;
 #ifdef TO_DO_LIST
-	if(Adapter->bInHctTest)
-	{
+	if (Adapter->bInHctTest) {
 		pHalData->ShortRetryLimit = 7;
 		pHalData->LongRetryLimit = 7;
 	}
 #endif
-	{
-		priv->ShortRetryLimit = 0x30;
-		priv->LongRetryLimit = 0x30;
-	}
+	priv->ShortRetryLimit = 0x30;
+	priv->LongRetryLimit = 0x30;
 	priv->EarlyRxThreshold = 7;
 	priv->enable_gpio0 = 0;
 	priv->TransmitConfig =
-	//	TCR_DurProcMode |	//for RTL8185B, duration setting by HW
-	//?	TCR_DISReqQsize |
 		(TCR_MXDMA_2048<<TCR_MXDMA_OFFSET)|  // Max DMA Burst Size per Tx DMA Burst, 7: reserved.
 		(priv->ShortRetryLimit<<TCR_SRL_OFFSET)|	// Short retry limit
 		(priv->LongRetryLimit<<TCR_LRL_OFFSET) |	// Long retry limit
-		(false ? TCR_SAT: 0);	// FALSE: HW provides PLCP length and LENGEXT, TRUE: SW provides them
+		(false ? TCR_SAT : 0);	// FALSE: HW provides PLCP length and LENGEXT, TRUE: SW provides them
 #ifdef TO_DO_LIST
-	if(Adapter->bInHctTest)
+	if (Adapter->bInHctTest)
 		pHalData->ReceiveConfig	=	pHalData->CSMethod |
-						RCR_AMF | RCR_ADF |	//RCR_AAP |	//accept management/data
+						RCR_AMF | RCR_ADF |	//accept management/data
 						//guangan200710
 						RCR_ACF |	//accept control frame for SW AP needs PS-poll, 2005.07.07, by rcnjko.
 						RCR_AB | RCR_AM | RCR_APM |		//accept BC/MC/UC
 						RCR_AICV | RCR_ACRC32 |			//accept ICV/CRC error packet
 						((u32)7<<RCR_MXDMA_OFFSET) | // Max DMA Burst Size per Rx DMA Burst, 7: unlimited.
 						(pHalData->EarlyRxThreshold<<RCR_FIFO_OFFSET) | // Rx FIFO Threshold, 7: No Rx threshold.
-						(pHalData->EarlyRxThreshold == 7 ? RCR_OnlyErlPkt:0);
+						(pHalData->EarlyRxThreshold == 7 ? RCR_OnlyErlPkt : 0);
 	else
 
 #endif
@@ -2742,10 +2494,9 @@
 		RCR_AMF | RCR_ADF |		//accept management/data
 		RCR_ACF |			//accept control frame for SW AP needs PS-poll, 2005.07.07, by rcnjko.
 		RCR_AB | RCR_AM | RCR_APM |	//accept BC/MC/UC
-		//RCR_AICV | RCR_ACRC32 |	//accept ICV/CRC error packet
 		((u32)7<<RCR_MXDMA_OFFSET)| // Max DMA Burst Size per Rx DMA Burst, 7: unlimited.
 		(priv->EarlyRxThreshold<<RX_FIFO_THRESHOLD_SHIFT) | // Rx FIFO Threshold, 7: No Rx threshold.
-		(priv->EarlyRxThreshold == 7 ? RCR_ONLYERLPKT:0);
+		(priv->EarlyRxThreshold == 7 ? RCR_ONLYERLPKT : 0);
 
 	priv->AcmControl = 0;
 	priv->pFirmware = kzalloc(sizeof(rt_firmware), GFP_KERNEL);
@@ -2755,26 +2506,22 @@
 	skb_queue_head_init(&priv->skb_queue);
 
 	/* Tx related queue */
-	for(i = 0; i < MAX_QUEUE_SIZE; i++) {
-		skb_queue_head_init(&priv->ieee80211->skb_waitQ [i]);
-	}
-	for(i = 0; i < MAX_QUEUE_SIZE; i++) {
-		skb_queue_head_init(&priv->ieee80211->skb_aggQ [i]);
-	}
-	for(i = 0; i < MAX_QUEUE_SIZE; i++) {
-		skb_queue_head_init(&priv->ieee80211->skb_drv_aggQ [i]);
-	}
+	for (i = 0; i < MAX_QUEUE_SIZE; i++)
+		skb_queue_head_init(&priv->ieee80211->skb_waitQ[i]);
+	for (i = 0; i < MAX_QUEUE_SIZE; i++)
+		skb_queue_head_init(&priv->ieee80211->skb_aggQ[i]);
+	for (i = 0; i < MAX_QUEUE_SIZE; i++)
+		skb_queue_head_init(&priv->ieee80211->skb_drv_aggQ[i]);
 	priv->rf_set_chan = rtl8192_phy_SwChnl;
 }
 
 //init lock here
-static void rtl8192_init_priv_lock(struct r8192_priv* priv)
+static void rtl8192_init_priv_lock(struct r8192_priv *priv)
 {
 	spin_lock_init(&priv->tx_lock);
 	spin_lock_init(&priv->irq_lock);//added by thomas
-	//spin_lock_init(&priv->rf_lock);
-	sema_init(&priv->wx_sem,1);
-	sema_init(&priv->rf_sem,1);
+	sema_init(&priv->wx_sem, 1);
+	sema_init(&priv->rf_sem, 1);
 	mutex_init(&priv->mutex);
 }
 
@@ -2783,7 +2530,7 @@
 void rtl8192_irq_rx_tasklet(struct r8192_priv *priv);
 //init tasklet and wait_queue here. only 2.6 above kernel is considered
 #define DRV_NAME "wlan0"
-static void rtl8192_init_priv_task(struct net_device* dev)
+static void rtl8192_init_priv_task(struct net_device *dev)
 {
 	struct r8192_priv *priv = ieee80211_priv(dev);
 
@@ -2791,71 +2538,64 @@
 
 	INIT_WORK(&priv->reset_wq, rtl8192_restart);
 
-	//INIT_DELAYED_WORK(&priv->watch_dog_wq, hal_dm_watchdog);
 	INIT_DELAYED_WORK(&priv->watch_dog_wq, rtl819x_watchdog_wqcallback);
 	INIT_DELAYED_WORK(&priv->txpower_tracking_wq,  dm_txpower_trackingcallback);
-//	INIT_DELAYED_WORK(&priv->gpio_change_rf_wq,  dm_gpio_change_rf_callback);
 	INIT_DELAYED_WORK(&priv->rfpath_check_wq,  dm_rf_pathcheck_workitemcallback);
 	INIT_DELAYED_WORK(&priv->update_beacon_wq, rtl8192_update_beacon);
 	INIT_DELAYED_WORK(&priv->initialgain_operate_wq, InitialGainOperateWorkItemCallBack);
-	//INIT_WORK(&priv->SwChnlWorkItem,  rtl8192_SwChnl_WorkItem);
-	//INIT_WORK(&priv->SetBWModeWorkItem,  rtl8192_SetBWModeWorkItem);
 	INIT_WORK(&priv->qos_activate, rtl8192_qos_activate);
 
 	tasklet_init(&priv->irq_rx_tasklet,
-	     (void(*)(unsigned long))rtl8192_irq_rx_tasklet,
-	     (unsigned long)priv);
+		     (void(*)(unsigned long))rtl8192_irq_rx_tasklet,
+		     (unsigned long)priv);
 }
 
-static void rtl8192_get_eeprom_size(struct net_device* dev)
+static void rtl8192_get_eeprom_size(struct net_device *dev)
 {
 	u16 curCR = 0;
 	struct r8192_priv *priv = ieee80211_priv(dev);
-	RT_TRACE(COMP_EPROM, "===========>%s()\n", __FUNCTION__);
-	curCR = read_nic_word_E(dev,EPROM_CMD);
+	RT_TRACE(COMP_EPROM, "===========>%s()\n", __func__);
+	read_nic_word_E(dev, EPROM_CMD, &curCR);
 	RT_TRACE(COMP_EPROM, "read from Reg EPROM_CMD(%x):%x\n", EPROM_CMD, curCR);
 	//whether need I consider BIT5?
 	priv->epromtype = (curCR & Cmd9346CR_9356SEL) ? EPROM_93c56 : EPROM_93c46;
-	RT_TRACE(COMP_EPROM, "<===========%s(), epromtype:%d\n", __FUNCTION__, priv->epromtype);
+	RT_TRACE(COMP_EPROM, "<===========%s(), epromtype:%d\n", __func__, priv->epromtype);
 }
 
 //used to swap endian. as ntohl & htonl are not necessary to swap endian, so use this instead.
-static inline u16 endian_swap(u16* data)
+static inline u16 endian_swap(u16 *data)
 {
 	u16 tmp = *data;
 	*data = (tmp >> 8) | (tmp << 8);
 	return *data;
 }
-static void rtl8192_read_eeprom_info(struct net_device* dev)
+static void rtl8192_read_eeprom_info(struct net_device *dev)
 {
 	u16 wEPROM_ID = 0;
 	u8 bMac_Tmp_Addr[6] = {0x00, 0xe0, 0x4c, 0x00, 0x00, 0x02};
 	u8 bLoad_From_EEPOM = false;
 	struct r8192_priv *priv = ieee80211_priv(dev);
 	u16 tmpValue = 0;
-	RT_TRACE(COMP_EPROM, "===========>%s()\n", __FUNCTION__);
+	int i;
+	RT_TRACE(COMP_EPROM, "===========>%s()\n", __func__);
 	wEPROM_ID = eprom_read(dev, 0); //first read EEPROM ID out;
 	RT_TRACE(COMP_EPROM, "EEPROM ID is 0x%x\n", wEPROM_ID);
 
-	if (wEPROM_ID != RTL8190_EEPROM_ID)
-	{
+	if (wEPROM_ID != RTL8190_EEPROM_ID) {
 		RT_TRACE(COMP_ERR, "EEPROM ID is invalid(is 0x%x(should be 0x%x)\n", wEPROM_ID, RTL8190_EEPROM_ID);
-	}
-	else
+	} else {
 		bLoad_From_EEPOM = true;
+	}
 
-	if (bLoad_From_EEPOM)
-	{
+	if (bLoad_From_EEPOM) {
 		tmpValue = eprom_read(dev, (EEPROM_VID>>1));
 		priv->eeprom_vid = endian_swap(&tmpValue);
 		priv->eeprom_pid = eprom_read(dev, (EEPROM_PID>>1));
 		tmpValue = eprom_read(dev, (EEPROM_ChannelPlan>>1));
-		priv->eeprom_ChannelPlan =((tmpValue&0xff00)>>8);
+		priv->eeprom_ChannelPlan = ((tmpValue&0xff00)>>8);
 		priv->btxpowerdata_readfromEEPORM = true;
 		priv->eeprom_CustomerID = eprom_read(dev, (EEPROM_Customer_ID>>1)) >>8;
-	}
-	else
-	{
+	} else {
 		priv->eeprom_vid = 0;
 		priv->eeprom_pid = 0;
 		priv->card_8192_version = VERSION_819xU_B;
@@ -2865,18 +2605,14 @@
 	RT_TRACE(COMP_EPROM, "vid:0x%4x, pid:0x%4x, CustomID:0x%2x, ChanPlan:0x%x\n", priv->eeprom_vid, priv->eeprom_pid, priv->eeprom_CustomerID, priv->eeprom_ChannelPlan);
 	//set channelplan from eeprom
 	priv->ChannelPlan = priv->eeprom_ChannelPlan;
-	if (bLoad_From_EEPOM)
-	{
+	if (bLoad_From_EEPOM) {
 		int i;
-		for (i=0; i<6; i+=2)
-		{
+		for (i = 0; i < 6; i += 2) {
 			u16 tmp = 0;
 			tmp = eprom_read(dev, (u16)((EEPROM_NODE_ADDRESS_BYTE_0 + i)>>1));
-			*(u16*)(&dev->dev_addr[i]) = tmp;
+			*(u16 *)(&dev->dev_addr[i]) = tmp;
 		}
-	}
-	else
-	{
+	} else {
 		memcpy(dev->dev_addr, bMac_Tmp_Addr, 6);
 		//should I set IDR0 here?
 	}
@@ -2884,8 +2620,7 @@
 	priv->rf_type = RTL819X_DEFAULT_RF_TYPE; //default 1T2R
 	priv->rf_chip = RF_8256;
 
-	if (priv->card_8192_version == (u8)VERSION_819xU_A)
-	{
+	if (priv->card_8192_version == (u8)VERSION_819xU_A) {
 		//read Tx power gain offset of legacy OFDM to HT rate
 		if (bLoad_From_EEPOM)
 			priv->EEPROMTxPowerDiff = (eprom_read(dev, (EEPROM_TxPowerDiff>>1))&0xff00) >> 8;
@@ -2918,51 +2653,45 @@
 		else
 			priv->EEPROM_Def_Ver = 1;
 		RT_TRACE(COMP_EPROM, "EEPROM_DEF_VER:%d\n", priv->EEPROM_Def_Ver);
-		if (priv->EEPROM_Def_Ver == 0) //old eeprom definition
-		{
+		if (priv->EEPROM_Def_Ver == 0) { //old eeprom definition
 			int i;
 			if (bLoad_From_EEPOM)
 				priv->EEPROMTxPowerLevelCCK = (eprom_read(dev, (EEPROM_TxPwIndex_CCK>>1))&0xff) >> 8;
 			else
 				priv->EEPROMTxPowerLevelCCK = 0x10;
 			RT_TRACE(COMP_EPROM, "CCK Tx Power Levl: 0x%02x\n", priv->EEPROMTxPowerLevelCCK);
-			for (i=0; i<3; i++)
-			{
-				if (bLoad_From_EEPOM)
-				{
+			for (i = 0; i < 3; i++) {
+				if (bLoad_From_EEPOM) {
 					tmpValue = eprom_read(dev, (EEPROM_TxPwIndex_OFDM_24G+i)>>1);
 					if (((EEPROM_TxPwIndex_OFDM_24G+i) % 2) == 0)
 						tmpValue = tmpValue & 0x00ff;
 					else
 						tmpValue = (tmpValue & 0xff00) >> 8;
-				}
-				else
+				} else {
 					tmpValue = 0x10;
+				}
 				priv->EEPROMTxPowerLevelOFDM24G[i] = (u8) tmpValue;
 				RT_TRACE(COMP_EPROM, "OFDM 2.4G Tx Power Level, Index %d = 0x%02x\n", i, priv->EEPROMTxPowerLevelCCK);
 			}
-		}//end if EEPROM_DEF_VER == 0
-		else if (priv->EEPROM_Def_Ver == 1)
-		{
-			if (bLoad_From_EEPOM)
-			{
+		} else if (priv->EEPROM_Def_Ver == 1) {
+			if (bLoad_From_EEPOM) {
 				tmpValue = eprom_read(dev, (EEPROM_TxPwIndex_CCK_V1>>1));
 				tmpValue = (tmpValue & 0xff00) >> 8;
-			}
-			else
+			} else {
 				tmpValue = 0x10;
+			}
 			priv->EEPROMTxPowerLevelCCK_V1[0] = (u8)tmpValue;
 
 			if (bLoad_From_EEPOM)
 				tmpValue = eprom_read(dev, (EEPROM_TxPwIndex_CCK_V1 + 2)>>1);
 			else
 				tmpValue = 0x1010;
-			*((u16*)(&priv->EEPROMTxPowerLevelCCK_V1[1])) = tmpValue;
+			*((u16 *)(&priv->EEPROMTxPowerLevelCCK_V1[1])) = tmpValue;
 			if (bLoad_From_EEPOM)
 				tmpValue = eprom_read(dev, (EEPROM_TxPwIndex_OFDM_24G_V1>>1));
 			else
 				tmpValue = 0x1010;
-			*((u16*)(&priv->EEPROMTxPowerLevelOFDM24G[0])) = tmpValue;
+			*((u16 *)(&priv->EEPROMTxPowerLevelOFDM24G[0])) = tmpValue;
 			if (bLoad_From_EEPOM)
 				tmpValue = eprom_read(dev, (EEPROM_TxPwIndex_OFDM_24G_V1+2)>>1);
 			else
@@ -2972,42 +2701,34 @@
 
 		//update HAL variables
 		//
-		{
-			int i;
-			for (i=0; i<14; i++)
-			{
-				if (i<=3)
-					priv->TxPowerLevelOFDM24G[i] = priv->EEPROMTxPowerLevelOFDM24G[0];
-				else if (i>=4 && i<=9)
-					priv->TxPowerLevelOFDM24G[i] = priv->EEPROMTxPowerLevelOFDM24G[1];
-				else
-					priv->TxPowerLevelOFDM24G[i] = priv->EEPROMTxPowerLevelOFDM24G[2];
-			}
+		for (i = 0; i < 14; i++) {
+			if (i <= 3)
+				priv->TxPowerLevelOFDM24G[i] = priv->EEPROMTxPowerLevelOFDM24G[0];
+			else if (i >= 4 && i <= 9)
+				priv->TxPowerLevelOFDM24G[i] = priv->EEPROMTxPowerLevelOFDM24G[1];
+			else
+				priv->TxPowerLevelOFDM24G[i] = priv->EEPROMTxPowerLevelOFDM24G[2];
+		}
 
-			for (i=0; i<14; i++)
-			{
-				if (priv->EEPROM_Def_Ver == 0)
-				{
-					if (i<=3)
-						priv->TxPowerLevelCCK[i] = priv->EEPROMTxPowerLevelOFDM24G[0] + (priv->EEPROMTxPowerLevelCCK - priv->EEPROMTxPowerLevelOFDM24G[1]);
-					else if (i>=4 && i<=9)
-						priv->TxPowerLevelCCK[i] = priv->EEPROMTxPowerLevelCCK;
-					else
-						priv->TxPowerLevelCCK[i] = priv->EEPROMTxPowerLevelOFDM24G[2] + (priv->EEPROMTxPowerLevelCCK - priv->EEPROMTxPowerLevelOFDM24G[1]);
-				}
-				else if (priv->EEPROM_Def_Ver == 1)
-				{
-					if (i<=3)
-						priv->TxPowerLevelCCK[i] = priv->EEPROMTxPowerLevelCCK_V1[0];
-					else if (i>=4 && i<=9)
-						priv->TxPowerLevelCCK[i] = priv->EEPROMTxPowerLevelCCK_V1[1];
-					else
-						priv->TxPowerLevelCCK[i] = priv->EEPROMTxPowerLevelCCK_V1[2];
-				}
+		for (i = 0; i < 14; i++) {
+			if (priv->EEPROM_Def_Ver == 0) {
+				if (i <= 3)
+					priv->TxPowerLevelCCK[i] = priv->EEPROMTxPowerLevelOFDM24G[0] + (priv->EEPROMTxPowerLevelCCK - priv->EEPROMTxPowerLevelOFDM24G[1]);
+				else if (i >= 4 && i <= 9)
+					priv->TxPowerLevelCCK[i] = priv->EEPROMTxPowerLevelCCK;
+				else
+					priv->TxPowerLevelCCK[i] = priv->EEPROMTxPowerLevelOFDM24G[2] + (priv->EEPROMTxPowerLevelCCK - priv->EEPROMTxPowerLevelOFDM24G[1]);
+			} else if (priv->EEPROM_Def_Ver == 1) {
+				if (i <= 3)
+					priv->TxPowerLevelCCK[i] = priv->EEPROMTxPowerLevelCCK_V1[0];
+				else if (i >= 4 && i <= 9)
+					priv->TxPowerLevelCCK[i] = priv->EEPROMTxPowerLevelCCK_V1[1];
+				else
+					priv->TxPowerLevelCCK[i] = priv->EEPROMTxPowerLevelCCK_V1[2];
 			}
-		}//end update HAL variables
+		}
 		priv->TxPowerDiff = priv->EEPROMPwDiff;
-// Antenna B gain offset to antenna A, bit0~3
+		// Antenna B gain offset to antenna A, bit0~3
 		priv->AntennaTxPwDiff[0] = (priv->EEPROMTxPowerDiff & 0xf);
 		// Antenna C gain offset to antenna A, bit4~7
 		priv->AntennaTxPwDiff[1] = ((priv->EEPROMTxPowerDiff & 0xf0)>>4);
@@ -3018,46 +2739,41 @@
 		priv->ThermalMeter[0] = priv->EEPROMThermalMeter;
 	}//end if VersionID == VERSION_819xU_A
 
-//added by vivi, for dlink led, 20080416
-	switch(priv->eeprom_CustomerID)
-	{
-		case EEPROM_CID_RUNTOP:
-			priv->CustomerID = RT_CID_819x_RUNTOP;
-			break;
+	//added by vivi, for dlink led, 20080416
+	switch (priv->eeprom_CustomerID) {
+	case EEPROM_CID_RUNTOP:
+		priv->CustomerID = RT_CID_819x_RUNTOP;
+		break;
 
-		case EEPROM_CID_DLINK:
-			priv->CustomerID = RT_CID_DLINK;
-			break;
+	case EEPROM_CID_DLINK:
+		priv->CustomerID = RT_CID_DLINK;
+		break;
 
-		default:
-			priv->CustomerID = RT_CID_DEFAULT;
-			break;
+	default:
+		priv->CustomerID = RT_CID_DEFAULT;
+		break;
 
 	}
 
-	switch(priv->CustomerID)
-	{
-		case RT_CID_819x_RUNTOP:
-			priv->LedStrategy = SW_LED_MODE2;
-			break;
+	switch (priv->CustomerID) {
+	case RT_CID_819x_RUNTOP:
+		priv->LedStrategy = SW_LED_MODE2;
+		break;
 
-		case RT_CID_DLINK:
-			priv->LedStrategy = SW_LED_MODE4;
-			break;
+	case RT_CID_DLINK:
+		priv->LedStrategy = SW_LED_MODE4;
+		break;
 
-		default:
-			priv->LedStrategy = SW_LED_MODE0;
-			break;
+	default:
+		priv->LedStrategy = SW_LED_MODE0;
+		break;
 
 	}
 
 
-	if(priv->rf_type == RF_1T2R)
-	{
+	if (priv->rf_type == RF_1T2R) {
 		RT_TRACE(COMP_EPROM, "\n1T2R config\n");
-	}
-	else
-	{
+	} else {
 		RT_TRACE(COMP_EPROM, "\n2T4R config\n");
 	}
 
@@ -3066,18 +2782,18 @@
 	init_rate_adaptive(dev);
 	//we need init DIG RATR table here again.
 
-	RT_TRACE(COMP_EPROM, "<===========%s()\n", __FUNCTION__);
+	RT_TRACE(COMP_EPROM, "<===========%s()\n", __func__);
 	return;
 }
 
-short rtl8192_get_channel_map(struct net_device * dev)
+short rtl8192_get_channel_map(struct net_device *dev)
 {
 	struct r8192_priv *priv = ieee80211_priv(dev);
-	if(priv->ChannelPlan > COUNTRY_CODE_GLOBAL_DOMAIN){
-		printk("rtl8180_init:Error channel plan! Set to default.\n");
-		priv->ChannelPlan= 0;
+	if (priv->ChannelPlan > COUNTRY_CODE_GLOBAL_DOMAIN) {
+		netdev_err(dev, "rtl8180_init: Error channel plan! Set to default.\n");
+		priv->ChannelPlan = 0;
 	}
-	RT_TRACE(COMP_INIT, "Channel plan is %d\n",priv->ChannelPlan);
+	RT_TRACE(COMP_INIT, "Channel plan is %d\n", priv->ChannelPlan);
 
 	rtl819x_set_channel_map(priv->ChannelPlan, priv);
 	return 0;
@@ -3088,24 +2804,18 @@
 
 	struct r8192_priv *priv = ieee80211_priv(dev);
 
-	memset(&(priv->stats),0,sizeof(struct Stats));
-	memset(priv->txqueue_to_outpipemap,0,9);
+	memset(&(priv->stats), 0, sizeof(struct Stats));
+	memset(priv->txqueue_to_outpipemap, 0, 9);
 #ifdef PIPE12
 	{
-		int i=0;
-		u8 queuetopipe[]={3,2,1,0,4,8,7,6,5};
-		memcpy(priv->txqueue_to_outpipemap,queuetopipe,9);
-/*		for(i=0;i<9;i++)
-			printk("%d ",priv->txqueue_to_outpipemap[i]);
-		printk("\n");*/
+		int i = 0;
+		u8 queuetopipe[] = {3, 2, 1, 0, 4, 8, 7, 6, 5};
+		memcpy(priv->txqueue_to_outpipemap, queuetopipe, 9);
 	}
 #else
 	{
-		u8 queuetopipe[]={3,2,1,0,4,4,0,4,4};
-		memcpy(priv->txqueue_to_outpipemap,queuetopipe,9);
-/*		for(i=0;i<9;i++)
-			printk("%d ",priv->txqueue_to_outpipemap[i]);
-		printk("\n");*/
+		u8 queuetopipe[] = {3, 2, 1, 0, 4, 4, 0, 4, 4};
+		memcpy(priv->txqueue_to_outpipemap, queuetopipe, 9);
 	}
 #endif
 	rtl8192_init_priv_variable(dev);
@@ -3118,12 +2828,11 @@
 	init_timer(&priv->watch_dog_timer);
 	priv->watch_dog_timer.data = (unsigned long)dev;
 	priv->watch_dog_timer.function = watch_dog_timer_callback;
-	if(rtl8192_usb_initendpoints(dev)!=0){
+	if (rtl8192_usb_initendpoints(dev) != 0) {
 		DMESG("Endopoints initialization failed");
 		return -ENOMEM;
 	}
 
-	//rtl8192_adapter_start(dev);
 #ifdef DEBUG_EPROM
 	dump_eprom(dev);
 #endif
@@ -3138,16 +2847,16 @@
  *  return:  none
  *  notice:  This part need to modified according to the rate set we filtered
  * ****************************************************************************/
-void rtl8192_hwconfig(struct net_device* dev)
+void rtl8192_hwconfig(struct net_device *dev)
 {
 	u32 regRATR = 0, regRRSR = 0;
 	u8 regBwOpMode = 0, regTmp = 0;
 	struct r8192_priv *priv = ieee80211_priv(dev);
+	u32 ratr_value = 0;
 
-// Set RRSR, RATR, and BW_OPMODE registers
+	// Set RRSR, RATR, and BW_OPMODE registers
 	//
-	switch(priv->ieee80211->mode)
-	{
+	switch (priv->ieee80211->mode) {
 	case WIRELESS_MODE_B:
 		regBwOpMode = BW_OPMODE_20MHZ;
 		regRATR = RATE_ALL_CCK;
@@ -3165,26 +2874,25 @@
 		break;
 	case WIRELESS_MODE_AUTO:
 #ifdef TO_DO_LIST
-		if (Adapter->bInHctTest)
-		{
-		    regBwOpMode = BW_OPMODE_20MHZ;
-		    regRATR = RATE_ALL_CCK | RATE_ALL_OFDM_AG;
-		    regRRSR = RATE_ALL_CCK | RATE_ALL_OFDM_AG;
+		if (Adapter->bInHctTest) {
+			regBwOpMode = BW_OPMODE_20MHZ;
+			regRATR = RATE_ALL_CCK | RATE_ALL_OFDM_AG;
+			regRRSR = RATE_ALL_CCK | RATE_ALL_OFDM_AG;
 		}
 		else
 #endif
 		{
-		    regBwOpMode = BW_OPMODE_20MHZ;
-		    regRATR = RATE_ALL_CCK | RATE_ALL_OFDM_AG | RATE_ALL_OFDM_1SS | RATE_ALL_OFDM_2SS;
-		    regRRSR = RATE_ALL_CCK | RATE_ALL_OFDM_AG;
+			regBwOpMode = BW_OPMODE_20MHZ;
+			regRATR = RATE_ALL_CCK | RATE_ALL_OFDM_AG | RATE_ALL_OFDM_1SS | RATE_ALL_OFDM_2SS;
+			regRRSR = RATE_ALL_CCK | RATE_ALL_OFDM_AG;
 		}
 		break;
 	case WIRELESS_MODE_N_24G:
 		// It support CCK rate by default.
 		// CCK rate will be filtered out only when associated AP does not support it.
 		regBwOpMode = BW_OPMODE_20MHZ;
-			regRATR = RATE_ALL_CCK | RATE_ALL_OFDM_AG | RATE_ALL_OFDM_1SS | RATE_ALL_OFDM_2SS;
-			regRRSR = RATE_ALL_CCK | RATE_ALL_OFDM_AG;
+		regRATR = RATE_ALL_CCK | RATE_ALL_OFDM_AG | RATE_ALL_OFDM_1SS | RATE_ALL_OFDM_2SS;
+		regRRSR = RATE_ALL_CCK | RATE_ALL_OFDM_AG;
 		break;
 	case WIRELESS_MODE_N_5G:
 		regBwOpMode = BW_OPMODE_5G;
@@ -3194,17 +2902,12 @@
 	}
 
 	write_nic_byte(dev, BW_OPMODE, regBwOpMode);
-	{
-		u32 ratr_value = 0;
-		ratr_value = regRATR;
-		if (priv->rf_type == RF_1T2R)
-		{
-			ratr_value &= ~(RATE_ALL_OFDM_2SS);
-		}
-		write_nic_dword(dev, RATR0, ratr_value);
-		write_nic_byte(dev, UFWP, 1);
-	}
-	regTmp = read_nic_byte(dev, 0x313);
+	ratr_value = regRATR;
+	if (priv->rf_type == RF_1T2R)
+		ratr_value &= ~(RATE_ALL_OFDM_2SS);
+	write_nic_dword(dev, RATR0, ratr_value);
+	write_nic_byte(dev, UFWP, 1);
+	read_nic_byte(dev, 0x313, &regTmp);
 	regRRSR = ((regTmp) << 24) | (regRRSR & 0x00ffffff);
 	write_nic_dword(dev, RRSR, regRRSR);
 
@@ -3212,8 +2915,8 @@
 	// Set Retry Limit here
 	//
 	write_nic_word(dev, RETRY_LIMIT,
-			priv->ShortRetryLimit << RETRY_LIMIT_SHORT_SHIFT | \
-			priv->LongRetryLimit << RETRY_LIMIT_LONG_SHIFT);
+		       priv->ShortRetryLimit << RETRY_LIMIT_SHORT_SHIFT |
+		       priv->LongRetryLimit << RETRY_LIMIT_LONG_SHIFT);
 	// Set Contention Window here
 
 	// Set Tx AGC
@@ -3232,7 +2935,9 @@
 	struct r8192_priv *priv = ieee80211_priv(dev);
 	u32 dwRegRead = 0;
 	bool init_status = true;
-	RT_TRACE(COMP_INIT, "====>%s()\n", __FUNCTION__);
+	u8 SECR_value = 0x0;
+	u8 tmp;
+	RT_TRACE(COMP_INIT, "====>%s()\n", __func__);
 	priv->Rf_Mode = RF_OP_By_SW_3wire;
 	//for ASIC power on sequence
 	write_nic_byte_E(dev, 0x5f, 0x80);
@@ -3242,34 +2947,31 @@
 	write_nic_byte_E(dev, 0x5e, 0x80);
 	write_nic_byte(dev, 0x17, 0x37);
 	mdelay(10);
-//#ifdef TO_DO_LIST
 	priv->pFirmware->firmware_status = FW_STATUS_0_INIT;
 	//config CPUReset Register
 	//Firmware Reset or not?
-	dwRegRead = read_nic_dword(dev, CPU_GEN);
+	read_nic_dword(dev, CPU_GEN, &dwRegRead);
 	if (priv->pFirmware->firmware_status == FW_STATUS_0_INIT)
 		dwRegRead |= CPU_GEN_SYSTEM_RESET; //do nothing here?
 	else if (priv->pFirmware->firmware_status == FW_STATUS_5_READY)
 		dwRegRead |= CPU_GEN_FIRMWARE_RESET;
 	else
-		RT_TRACE(COMP_ERR, "ERROR in %s(): undefined firmware state(%d)\n", __FUNCTION__,   priv->pFirmware->firmware_status);
+		RT_TRACE(COMP_ERR, "ERROR in %s(): undefined firmware state(%d)\n", __func__,   priv->pFirmware->firmware_status);
 
 	write_nic_dword(dev, CPU_GEN, dwRegRead);
-	//mdelay(30);
 	//config BB.
 	rtl8192_BBConfig(dev);
 
 	//Loopback mode or not
 	priv->LoopbackMode = RTL819xU_NO_LOOPBACK;
-//	priv->LoopbackMode = RTL819xU_MAC_LOOPBACK;
 
-	dwRegRead = read_nic_dword(dev, CPU_GEN);
+	read_nic_dword(dev, CPU_GEN, &dwRegRead);
 	if (priv->LoopbackMode == RTL819xU_NO_LOOPBACK)
 		dwRegRead = ((dwRegRead & CPU_GEN_NO_LOOPBACK_MSK) | CPU_GEN_NO_LOOPBACK_SET);
 	else if (priv->LoopbackMode == RTL819xU_MAC_LOOPBACK)
 		dwRegRead |= CPU_CCK_LOOPBACK;
 	else
-		RT_TRACE(COMP_ERR, "Serious error in %s(): wrong loopback mode setting(%d)\n", __FUNCTION__,  priv->LoopbackMode);
+		RT_TRACE(COMP_ERR, "Serious error in %s(): wrong loopback mode setting(%d)\n", __func__,  priv->LoopbackMode);
 
 	write_nic_dword(dev, CPU_GEN, dwRegRead);
 
@@ -3277,7 +2979,8 @@
 	udelay(500);
 
 	//xiong add for new bitfile:usb suspend reset pin set to 1. //do we need?
-	write_nic_byte_E(dev, 0x5f, (read_nic_byte_E(dev, 0x5f)|0x20));
+	read_nic_byte_E(dev, 0x5f, &tmp);
+	write_nic_byte_E(dev, 0x5f, tmp|0x20);
 
 	//Set Hardware
 	rtl8192_hwconfig(dev);
@@ -3286,61 +2989,54 @@
 	write_nic_byte(dev, CMDR, CR_RE|CR_TE);
 
 	//set IDR0 here
-	write_nic_dword(dev, MAC0, ((u32*)dev->dev_addr)[0]);
-	write_nic_word(dev, MAC4, ((u16*)(dev->dev_addr + 4))[0]);
+	write_nic_dword(dev, MAC0, ((u32 *)dev->dev_addr)[0]);
+	write_nic_word(dev, MAC4, ((u16 *)(dev->dev_addr + 4))[0]);
 
 	//set RCR
 	write_nic_dword(dev, RCR, priv->ReceiveConfig);
 
 	//Initialize Number of Reserved Pages in Firmware Queue
-	write_nic_dword(dev, RQPN1,  NUM_OF_PAGE_IN_FW_QUEUE_BK << RSVD_FW_QUEUE_PAGE_BK_SHIFT |\
-						NUM_OF_PAGE_IN_FW_QUEUE_BE << RSVD_FW_QUEUE_PAGE_BE_SHIFT | \
-						NUM_OF_PAGE_IN_FW_QUEUE_VI << RSVD_FW_QUEUE_PAGE_VI_SHIFT | \
-						NUM_OF_PAGE_IN_FW_QUEUE_VO <<RSVD_FW_QUEUE_PAGE_VO_SHIFT);
-	write_nic_dword(dev, RQPN2, NUM_OF_PAGE_IN_FW_QUEUE_MGNT << RSVD_FW_QUEUE_PAGE_MGNT_SHIFT |\
-						NUM_OF_PAGE_IN_FW_QUEUE_CMD << RSVD_FW_QUEUE_PAGE_CMD_SHIFT);
-	write_nic_dword(dev, RQPN3, APPLIED_RESERVED_QUEUE_IN_FW| \
-						NUM_OF_PAGE_IN_FW_QUEUE_BCN<<RSVD_FW_QUEUE_PAGE_BCN_SHIFT
-//						| NUM_OF_PAGE_IN_FW_QUEUE_PUB<<RSVD_FW_QUEUE_PAGE_PUB_SHIFT
-						);
+	write_nic_dword(dev, RQPN1,  NUM_OF_PAGE_IN_FW_QUEUE_BK << RSVD_FW_QUEUE_PAGE_BK_SHIFT |
+			NUM_OF_PAGE_IN_FW_QUEUE_BE << RSVD_FW_QUEUE_PAGE_BE_SHIFT |
+			NUM_OF_PAGE_IN_FW_QUEUE_VI << RSVD_FW_QUEUE_PAGE_VI_SHIFT |
+			NUM_OF_PAGE_IN_FW_QUEUE_VO <<RSVD_FW_QUEUE_PAGE_VO_SHIFT);
+	write_nic_dword(dev, RQPN2, NUM_OF_PAGE_IN_FW_QUEUE_MGNT << RSVD_FW_QUEUE_PAGE_MGNT_SHIFT |
+			NUM_OF_PAGE_IN_FW_QUEUE_CMD << RSVD_FW_QUEUE_PAGE_CMD_SHIFT);
+	write_nic_dword(dev, RQPN3, APPLIED_RESERVED_QUEUE_IN_FW|
+			NUM_OF_PAGE_IN_FW_QUEUE_BCN<<RSVD_FW_QUEUE_PAGE_BCN_SHIFT);
 	write_nic_dword(dev, RATR0+4*7, (RATE_ALL_OFDM_AG | RATE_ALL_CCK));
 
 	//Set AckTimeout
 	// TODO: (it value is only for FPGA version). need to be changed!!2006.12.18, by Emily
 	write_nic_byte(dev, ACK_TIMEOUT, 0x30);
 
-//	RT_TRACE(COMP_INIT, "%s():priv->ResetProgress is %d\n", __FUNCTION__,priv->ResetProgress);
-	if(priv->ResetProgress == RESET_TYPE_NORESET)
-	rtl8192_SetWirelessMode(dev, priv->ieee80211->mode);
-	if(priv->ResetProgress == RESET_TYPE_NORESET){
-	CamResetAllEntry(dev);
-	{
-		u8 SECR_value = 0x0;
+	if (priv->ResetProgress == RESET_TYPE_NORESET)
+		rtl8192_SetWirelessMode(dev, priv->ieee80211->mode);
+	if (priv->ResetProgress == RESET_TYPE_NORESET) {
+		CamResetAllEntry(dev);
 		SECR_value |= SCR_TxEncEnable;
 		SECR_value |= SCR_RxDecEnable;
 		SECR_value |= SCR_NoSKMC;
 		write_nic_byte(dev, SECR, SECR_value);
 	}
-	}
 
 	//Beacon related
 	write_nic_word(dev, ATIMWND, 2);
 	write_nic_word(dev, BCN_INTERVAL, 100);
 
-	{
 #define DEFAULT_EDCA 0x005e4332
+	{
 		int i;
-		for (i=0; i<QOS_QUEUE_NUM; i++)
-		write_nic_dword(dev, WDCAPARA_ADD[i], DEFAULT_EDCA);
+		for (i = 0; i < QOS_QUEUE_NUM; i++)
+			write_nic_dword(dev, WDCAPARA_ADD[i], DEFAULT_EDCA);
 	}
 #ifdef USB_RX_AGGREGATION_SUPPORT
 	//3 For usb rx firmware aggregation control
-	if(priv->ResetProgress == RESET_TYPE_NORESET)
-	{
+	if (priv->ResetProgress == RESET_TYPE_NORESET) {
 		u32 ulValue;
 		PRT_HIGH_THROUGHPUT	pHTInfo = priv->ieee80211->pHTInfo;
 		ulValue = (pHTInfo->UsbRxFwAggrEn<<24) | (pHTInfo->UsbRxFwAggrPageNum<<16) |
-					(pHTInfo->UsbRxFwAggrPacketNum<<8) | (pHTInfo->UsbRxFwAggrTimeout);
+			  (pHTInfo->UsbRxFwAggrPacketNum<<8) | (pHTInfo->UsbRxFwAggrTimeout);
 		/*
 		 * If usb rx firmware aggregation is enabled,
 		 * when anyone of three threshold conditions above is reached,
@@ -3353,63 +3049,52 @@
 
 	rtl8192_phy_configmac(dev);
 
-	if (priv->card_8192_version == (u8) VERSION_819xU_A)
-	{
+	if (priv->card_8192_version == (u8) VERSION_819xU_A) {
 		rtl8192_phy_getTxPower(dev);
 		rtl8192_phy_setTxPower(dev, priv->chan);
 	}
 
 	//Firmware download
 	init_status = init_firmware(dev);
-	if(!init_status)
-	{
-		RT_TRACE(COMP_ERR,"ERR!!! %s(): Firmware download is failed\n", __FUNCTION__);
+	if (!init_status) {
+		RT_TRACE(COMP_ERR, "ERR!!! %s(): Firmware download is failed\n", __func__);
 		return init_status;
 	}
-	RT_TRACE(COMP_INIT, "%s():after firmware download\n", __FUNCTION__);
+	RT_TRACE(COMP_INIT, "%s():after firmware download\n", __func__);
 	//
 #ifdef TO_DO_LIST
-if(Adapter->ResetProgress == RESET_TYPE_NORESET)
-	{
-		if(pMgntInfo->RegRfOff == TRUE)
-		{ // User disable RF via registry.
+	if (Adapter->ResetProgress == RESET_TYPE_NORESET) {
+		if (pMgntInfo->RegRfOff == TRUE) { // User disable RF via registry.
 			RT_TRACE((COMP_INIT|COMP_RF), DBG_LOUD, ("InitializeAdapter819xUsb(): Turn off RF for RegRfOff ----------\n"));
 			MgntActSet_RF_State(Adapter, eRfOff, RF_CHANGE_BY_SW);
 			// Those actions will be discard in MgntActSet_RF_State because of the same state
-			for(eRFPath = 0; eRFPath <pHalData->NumTotalRFPath; eRFPath++)
+			for (eRFPath = 0; eRFPath < pHalData->NumTotalRFPath; eRFPath++)
 				PHY_SetRFReg(Adapter, (RF90_RADIO_PATH_E)eRFPath, 0x4, 0xC00, 0x0);
-		}
-		else if(pMgntInfo->RfOffReason > RF_CHANGE_BY_PS)
-		{ // H/W or S/W RF OFF before sleep.
+		} else if (pMgntInfo->RfOffReason > RF_CHANGE_BY_PS) { // H/W or S/W RF OFF before sleep.
 			RT_TRACE((COMP_INIT|COMP_RF), DBG_LOUD, ("InitializeAdapter819xUsb(): Turn off RF for RfOffReason(%d) ----------\n", pMgntInfo->RfOffReason));
 			MgntActSet_RF_State(Adapter, eRfOff, pMgntInfo->RfOffReason);
-		}
-		else
-		{
+		} else {
 			pHalData->eRFPowerState = eRfOn;
 			pMgntInfo->RfOffReason = 0;
 			RT_TRACE((COMP_INIT|COMP_RF), DBG_LOUD, ("InitializeAdapter819xUsb(): RF is on ----------\n"));
 		}
-	}
-	else
-	{
-		if(pHalData->eRFPowerState == eRfOff)
-		{
+	} else {
+		if (pHalData->eRFPowerState == eRfOff) {
 			MgntActSet_RF_State(Adapter, eRfOff, pMgntInfo->RfOffReason);
 			// Those actions will be discard in MgntActSet_RF_State because of the same state
-			for(eRFPath = 0; eRFPath <pHalData->NumTotalRFPath; eRFPath++)
+			for (eRFPath = 0; eRFPath < pHalData->NumTotalRFPath; eRFPath++)
 				PHY_SetRFReg(Adapter, (RF90_RADIO_PATH_E)eRFPath, 0x4, 0xC00, 0x0);
 		}
 	}
 #endif
 	//config RF.
-	if(priv->ResetProgress == RESET_TYPE_NORESET){
-	rtl8192_phy_RFConfig(dev);
-	RT_TRACE(COMP_INIT, "%s():after phy RF config\n", __FUNCTION__);
+	if (priv->ResetProgress == RESET_TYPE_NORESET) {
+		rtl8192_phy_RFConfig(dev);
+		RT_TRACE(COMP_INIT, "%s():after phy RF config\n", __func__);
 	}
 
 
-	if(priv->ieee80211->FwRWRF)
+	if (priv->ieee80211->FwRWRF)
 		// We can force firmware to do RF-R/W
 		priv->Rf_Mode = RF_OP_By_FW;
 	else
@@ -3421,54 +3106,44 @@
 	rtl8192_setBBreg(dev, rFPGA0_RFMOD, bCCKEn, 0x1);
 	rtl8192_setBBreg(dev, rFPGA0_RFMOD, bOFDMEn, 0x1);
 
-	if(priv->ResetProgress == RESET_TYPE_NORESET)
-	{
+	if (priv->ResetProgress == RESET_TYPE_NORESET) {
 		//if D or C cut
-		u8 tmpvalue = read_nic_byte(dev, 0x301);
-		if(tmpvalue ==0x03)
-		{
+		u8 tmpvalue;
+		read_nic_byte(dev, 0x301, &tmpvalue);
+		if (tmpvalue == 0x03) {
 			priv->bDcut = TRUE;
 			RT_TRACE(COMP_POWER_TRACKING, "D-cut\n");
-		}
-		else
-		{
+		} else {
 			priv->bDcut = FALSE;
 			RT_TRACE(COMP_POWER_TRACKING, "C-cut\n");
 		}
 		dm_initialize_txpower_tracking(dev);
 
-		if(priv->bDcut == TRUE)
-		{
+		if (priv->bDcut == TRUE) {
 			u32 i, TempCCk;
-			u32 tmpRegA= rtl8192_QueryBBReg(dev,rOFDM0_XATxIQImbalance,bMaskDWord);
-		//	u32 tmpRegC= rtl8192_QueryBBReg(dev,rOFDM0_XCTxIQImbalance,bMaskDWord);
-			for(i = 0; i<TxBBGainTableLength; i++)
-			{
-				if(tmpRegA == priv->txbbgain_table[i].txbbgain_value)
-				{
-					priv->rfa_txpowertrackingindex= (u8)i;
-					priv->rfa_txpowertrackingindex_real= (u8)i;
-					priv->rfa_txpowertracking_default= priv->rfa_txpowertrackingindex;
+			u32 tmpRegA = rtl8192_QueryBBReg(dev, rOFDM0_XATxIQImbalance, bMaskDWord);
+			for (i = 0; i < TxBBGainTableLength; i++) {
+				if (tmpRegA == priv->txbbgain_table[i].txbbgain_value) {
+					priv->rfa_txpowertrackingindex = (u8)i;
+					priv->rfa_txpowertrackingindex_real = (u8)i;
+					priv->rfa_txpowertracking_default = priv->rfa_txpowertrackingindex;
 					break;
 				}
 			}
 
 			TempCCk = rtl8192_QueryBBReg(dev, rCCK0_TxFilter1, bMaskByte2);
 
-			for(i=0 ; i<CCKTxBBGainTableLength ; i++)
-			{
+			for (i = 0; i < CCKTxBBGainTableLength; i++) {
 
-				if(TempCCk == priv->cck_txbbgain_table[i].ccktxbb_valuearray[0])
-				{
-					priv->cck_present_attentuation_20Mdefault=(u8) i;
+				if (TempCCk == priv->cck_txbbgain_table[i].ccktxbb_valuearray[0]) {
+					priv->cck_present_attentuation_20Mdefault = (u8) i;
 					break;
 				}
 			}
-			priv->cck_present_attentuation_40Mdefault= 0;
-			priv->cck_present_attentuation_difference= 0;
+			priv->cck_present_attentuation_40Mdefault = 0;
+			priv->cck_present_attentuation_difference = 0;
 			priv->cck_present_attentuation = priv->cck_present_attentuation_20Mdefault;
 
-	//		pMgntInfo->bTXPowerTracking = FALSE;//TEMPLY DISABLE
 		}
 	}
 	write_nic_byte(dev, 0x87, 0x0);
@@ -3492,16 +3167,14 @@
 	return &priv->ieee80211->stats;
 }
 
-bool
-HalTxCheckStuck819xUsb(
-	struct net_device *dev
-	)
+bool HalTxCheckStuck819xUsb(struct net_device *dev)
 {
 	struct r8192_priv *priv = ieee80211_priv(dev);
-	u16		RegTxCounter = read_nic_word(dev, 0x128);
+	u16		RegTxCounter;
 	bool		bStuck = FALSE;
-	RT_TRACE(COMP_RESET,"%s():RegTxCounter is %d,TxCounter is %d\n",__FUNCTION__,RegTxCounter,priv->TxCounter);
-	if(priv->TxCounter==RegTxCounter)
+	read_nic_word(dev, 0x128, &RegTxCounter);
+	RT_TRACE(COMP_RESET, "%s():RegTxCounter is %d,TxCounter is %d\n", __func__, RegTxCounter, priv->TxCounter);
+	if (priv->TxCounter == RegTxCounter)
 		bStuck = TRUE;
 
 	priv->TxCounter = RegTxCounter;
@@ -3513,43 +3186,30 @@
 *	<Assumption: RT_TX_SPINLOCK is acquired.>
 *	First added: 2006.11.19 by emily
 */
-RESET_TYPE
-TxCheckStuck(struct net_device *dev)
+RESET_TYPE TxCheckStuck(struct net_device *dev)
 {
 	struct r8192_priv *priv = ieee80211_priv(dev);
 	u8			QueueID;
-//	PRT_TCB			pTcb;
-//	u8			ResetThreshold;
 	bool			bCheckFwTxCnt = false;
-	//unsigned long flags;
 
 	//
 	// Decide such threshold according to current power save mode
 	//
 
-//     RT_TRACE(COMP_RESET, " ==> TxCheckStuck()\n");
-//	     PlatformAcquireSpinLock(Adapter, RT_TX_SPINLOCK);
-//	     spin_lock_irqsave(&priv->ieee80211->lock,flags);
-	     for (QueueID = 0; QueueID<=BEACON_QUEUE;QueueID ++)
-	     {
-			if(QueueID == TXCMD_QUEUE)
-			 continue;
+	for (QueueID = 0; QueueID <= BEACON_QUEUE; QueueID++) {
+		if (QueueID == TXCMD_QUEUE)
+			continue;
 #ifdef USB_TX_DRIVER_AGGREGATION_ENABLE
-			if((skb_queue_len(&priv->ieee80211->skb_waitQ[QueueID]) == 0) && (skb_queue_len(&priv->ieee80211->skb_aggQ[QueueID]) == 0) && (skb_queue_len(&priv->ieee80211->skb_drv_aggQ[QueueID]) == 0))
+		if ((skb_queue_len(&priv->ieee80211->skb_waitQ[QueueID]) == 0) && (skb_queue_len(&priv->ieee80211->skb_aggQ[QueueID]) == 0) && (skb_queue_len(&priv->ieee80211->skb_drv_aggQ[QueueID]) == 0))
 #else
-			if((skb_queue_len(&priv->ieee80211->skb_waitQ[QueueID]) == 0)  && (skb_queue_len(&priv->ieee80211->skb_aggQ[QueueID]) == 0))
+		if ((skb_queue_len(&priv->ieee80211->skb_waitQ[QueueID]) == 0)  && (skb_queue_len(&priv->ieee80211->skb_aggQ[QueueID]) == 0))
 #endif
 				continue;
 
-		     bCheckFwTxCnt = true;
-	     }
-//	     PlatformReleaseSpinLock(Adapter, RT_TX_SPINLOCK);
-//	spin_unlock_irqrestore(&priv->ieee80211->lock,flags);
-//	RT_TRACE(COMP_RESET,"bCheckFwTxCnt is %d\n",bCheckFwTxCnt);
-	if(bCheckFwTxCnt)
-	{
-		if(HalTxCheckStuck819xUsb(dev))
-		{
+		bCheckFwTxCnt = true;
+	}
+	if (bCheckFwTxCnt) {
+		if (HalTxCheckStuck819xUsb(dev)) {
 			RT_TRACE(COMP_RESET, "TxCheckStuck(): Fw indicates no Tx condition! \n");
 			return RESET_TYPE_SILENT;
 		}
@@ -3557,64 +3217,41 @@
 	return RESET_TYPE_NORESET;
 }
 
-bool
-HalRxCheckStuck819xUsb(struct net_device *dev)
+bool HalRxCheckStuck819xUsb(struct net_device *dev)
 {
-	u16	RegRxCounter = read_nic_word(dev, 0x130);
+	u16	RegRxCounter;
 	struct r8192_priv *priv = ieee80211_priv(dev);
 	bool bStuck = FALSE;
 	static u8	rx_chk_cnt;
-	RT_TRACE(COMP_RESET,"%s(): RegRxCounter is %d,RxCounter is %d\n",__FUNCTION__,RegRxCounter,priv->RxCounter);
+	read_nic_word(dev, 0x130, &RegRxCounter);
+	RT_TRACE(COMP_RESET, "%s(): RegRxCounter is %d,RxCounter is %d\n", __func__, RegRxCounter, priv->RxCounter);
 	// If rssi is small, we should check rx for long time because of bad rx.
 	// or maybe it will continuous silent reset every 2 seconds.
 	rx_chk_cnt++;
-	if(priv->undecorated_smoothed_pwdb >= (RateAdaptiveTH_High+5))
-	{
+	if (priv->undecorated_smoothed_pwdb >= (RateAdaptiveTH_High+5)) {
 		rx_chk_cnt = 0;	//high rssi, check rx stuck right now.
-	}
-	else if(priv->undecorated_smoothed_pwdb < (RateAdaptiveTH_High+5) &&
-		((priv->CurrentChannelBW!=HT_CHANNEL_WIDTH_20&&priv->undecorated_smoothed_pwdb>=RateAdaptiveTH_Low_40M) ||
-		(priv->CurrentChannelBW==HT_CHANNEL_WIDTH_20&&priv->undecorated_smoothed_pwdb>=RateAdaptiveTH_Low_20M)) )
-	{
-		if(rx_chk_cnt < 2)
-		{
+	} else if (priv->undecorated_smoothed_pwdb < (RateAdaptiveTH_High+5) &&
+		   ((priv->CurrentChannelBW != HT_CHANNEL_WIDTH_20 && priv->undecorated_smoothed_pwdb >= RateAdaptiveTH_Low_40M) ||
+		    (priv->CurrentChannelBW == HT_CHANNEL_WIDTH_20 && priv->undecorated_smoothed_pwdb >= RateAdaptiveTH_Low_20M))) {
+		if (rx_chk_cnt < 2)
 			return bStuck;
-		}
 		else
-		{
 			rx_chk_cnt = 0;
-		}
-	}
-	else if(((priv->CurrentChannelBW!=HT_CHANNEL_WIDTH_20&&priv->undecorated_smoothed_pwdb<RateAdaptiveTH_Low_40M) ||
-		(priv->CurrentChannelBW==HT_CHANNEL_WIDTH_20&&priv->undecorated_smoothed_pwdb<RateAdaptiveTH_Low_20M)) &&
-		priv->undecorated_smoothed_pwdb >= VeryLowRSSI)
-	{
-		if(rx_chk_cnt < 4)
-		{
-			//DbgPrint("RSSI < %d && RSSI >= %d, no check this time \n", RateAdaptiveTH_Low, VeryLowRSSI);
+	} else if (((priv->CurrentChannelBW != HT_CHANNEL_WIDTH_20 && priv->undecorated_smoothed_pwdb < RateAdaptiveTH_Low_40M) ||
+		    (priv->CurrentChannelBW == HT_CHANNEL_WIDTH_20 && priv->undecorated_smoothed_pwdb < RateAdaptiveTH_Low_20M)) &&
+		     priv->undecorated_smoothed_pwdb >= VeryLowRSSI) {
+		if (rx_chk_cnt < 4)
 			return bStuck;
-		}
 		else
-		{
 			rx_chk_cnt = 0;
-			//DbgPrint("RSSI < %d && RSSI >= %d, check this time \n", RateAdaptiveTH_Low, VeryLowRSSI);
-		}
-	}
-	else
-	{
-		if(rx_chk_cnt < 8)
-		{
-			//DbgPrint("RSSI <= %d, no check this time \n", VeryLowRSSI);
+	} else {
+		if (rx_chk_cnt < 8)
 			return bStuck;
-		}
 		else
-		{
 			rx_chk_cnt = 0;
-			//DbgPrint("RSSI <= %d, check this time \n", VeryLowRSSI);
-		}
 	}
 
-	if(priv->RxCounter==RegRxCounter)
+	if (priv->RxCounter == RegRxCounter)
 		bStuck = TRUE;
 
 	priv->RxCounter = RegRxCounter;
@@ -3622,25 +3259,16 @@
 	return bStuck;
 }
 
-RESET_TYPE
-RxCheckStuck(struct net_device *dev)
+RESET_TYPE RxCheckStuck(struct net_device *dev)
 {
 	struct r8192_priv *priv = ieee80211_priv(dev);
-	//int                     i;
 	bool        bRxCheck = FALSE;
 
-//       RT_TRACE(COMP_RESET," ==> RxCheckStuck()\n");
-	//PlatformAcquireSpinLock(Adapter, RT_RX_SPINLOCK);
-
-	 if(priv->IrpPendingCount > 1)
+	if (priv->IrpPendingCount > 1)
 		bRxCheck = TRUE;
-       //PlatformReleaseSpinLock(Adapter, RT_RX_SPINLOCK);
 
-//       RT_TRACE(COMP_RESET,"bRxCheck is %d \n",bRxCheck);
-	if(bRxCheck)
-	{
-		if(HalRxCheckStuck819xUsb(dev))
-		{
+	if (bRxCheck) {
+		if (HalRxCheckStuck819xUsb(dev)) {
 			RT_TRACE(COMP_RESET, "RxStuck Condition\n");
 			return RESET_TYPE_SILENT;
 		}
@@ -3661,8 +3289,7 @@
 *
 *	8185 and 8185b does not implement this function. This is added by Emily at 2006.11.24
 */
-RESET_TYPE
-rtl819x_ifcheck_resetornot(struct net_device *dev)
+RESET_TYPE rtl819x_ifcheck_resetornot(struct net_device *dev)
 {
 	struct r8192_priv *priv = ieee80211_priv(dev);
 	RESET_TYPE	TxResetType = RESET_TYPE_NORESET;
@@ -3672,10 +3299,8 @@
 	rfState = priv->ieee80211->eRFPowerState;
 
 	TxResetType = TxCheckStuck(dev);
-	if( rfState != eRfOff ||
-		/*ADAPTER_TEST_STATUS_FLAG(Adapter, ADAPTER_STATUS_FW_DOWNLOAD_FAILURE)) &&*/
-		(priv->ieee80211->iw_mode != IW_MODE_ADHOC))
-	{
+	if (rfState != eRfOff ||
+	    (priv->ieee80211->iw_mode != IW_MODE_ADHOC)) {
 		// If driver is in the status of firmware download failure , driver skips RF initialization and RF is
 		// in turned off state. Driver should check whether Rx stuck and do silent reset. And
 		// if driver is in firmware download failure status, driver should initialize RF in the following
@@ -3686,155 +3311,91 @@
 		// set, STA cannot hear any packet at all. Emily, 2008.04.12
 		RxResetType = RxCheckStuck(dev);
 	}
-	if(TxResetType==RESET_TYPE_NORMAL || RxResetType==RESET_TYPE_NORMAL)
+	if (TxResetType == RESET_TYPE_NORMAL || RxResetType == RESET_TYPE_NORMAL) {
 		return RESET_TYPE_NORMAL;
-	else if(TxResetType==RESET_TYPE_SILENT || RxResetType==RESET_TYPE_SILENT){
-		RT_TRACE(COMP_RESET,"%s():silent reset\n",__FUNCTION__);
+	} else if (TxResetType == RESET_TYPE_SILENT || RxResetType == RESET_TYPE_SILENT) {
+		RT_TRACE(COMP_RESET, "%s():silent reset\n", __func__);
 		return RESET_TYPE_SILENT;
-	}
-	else
+	} else {
 		return RESET_TYPE_NORESET;
+	}
 
 }
 
-void rtl8192_cancel_deferred_work(struct r8192_priv* priv);
+void rtl8192_cancel_deferred_work(struct r8192_priv *priv);
 int _rtl8192_up(struct net_device *dev);
 int rtl8192_close(struct net_device *dev);
 
 
 
-void
-CamRestoreAllEntry(	struct net_device *dev)
+void CamRestoreAllEntry(struct net_device *dev)
 {
 	u8 EntryId = 0;
 	struct r8192_priv *priv = ieee80211_priv(dev);
-	u8*	MacAddr = priv->ieee80211->current_network.bssid;
+	u8	*MacAddr = priv->ieee80211->current_network.bssid;
 
 	static u8	CAM_CONST_ADDR[4][6] = {
 		{0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
 		{0x00, 0x00, 0x00, 0x00, 0x00, 0x01},
 		{0x00, 0x00, 0x00, 0x00, 0x00, 0x02},
-		{0x00, 0x00, 0x00, 0x00, 0x00, 0x03}};
-	static u8	CAM_CONST_BROAD[] =
-		{0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
+		{0x00, 0x00, 0x00, 0x00, 0x00, 0x03} };
+	static u8	CAM_CONST_BROAD[] = {
+		0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
 
 	RT_TRACE(COMP_SEC, "CamRestoreAllEntry: \n");
 
 
-	if ((priv->ieee80211->pairwise_key_type == KEY_TYPE_WEP40)||
-	    (priv->ieee80211->pairwise_key_type == KEY_TYPE_WEP104))
-	{
+	if ((priv->ieee80211->pairwise_key_type == KEY_TYPE_WEP40) ||
+	    (priv->ieee80211->pairwise_key_type == KEY_TYPE_WEP104)) {
 
-		for(EntryId=0; EntryId<4; EntryId++)
-		{
-			{
-				MacAddr = CAM_CONST_ADDR[EntryId];
-				setKey(dev,
-						EntryId ,
-						EntryId,
-						priv->ieee80211->pairwise_key_type,
-						MacAddr,
-						0,
-						NULL);
-			}
+		for (EntryId = 0; EntryId < 4; EntryId++) {
+			MacAddr = CAM_CONST_ADDR[EntryId];
+			setKey(dev, EntryId, EntryId,
+			       priv->ieee80211->pairwise_key_type,
+			       MacAddr, 0, NULL);
 		}
 
-	}
-	else if(priv->ieee80211->pairwise_key_type == KEY_TYPE_TKIP)
-	{
+	} else if (priv->ieee80211->pairwise_key_type == KEY_TYPE_TKIP) {
 
-		{
-			if(priv->ieee80211->iw_mode == IW_MODE_ADHOC)
-				setKey(dev,
-						4,
-						0,
-						priv->ieee80211->pairwise_key_type,
-						(u8*)dev->dev_addr,
-						0,
-						NULL);
-			else
-				setKey(dev,
-						4,
-						0,
-						priv->ieee80211->pairwise_key_type,
-						MacAddr,
-						0,
-						NULL);
-		}
-	}
-	else if(priv->ieee80211->pairwise_key_type == KEY_TYPE_CCMP)
-	{
+		if (priv->ieee80211->iw_mode == IW_MODE_ADHOC)
+			setKey(dev, 4, 0, priv->ieee80211->pairwise_key_type,
+			       (u8 *)dev->dev_addr, 0, NULL);
+		else
+			setKey(dev, 4, 0, priv->ieee80211->pairwise_key_type,
+			       MacAddr, 0, NULL);
+	} else if (priv->ieee80211->pairwise_key_type == KEY_TYPE_CCMP) {
 
-		{
-			if(priv->ieee80211->iw_mode == IW_MODE_ADHOC)
-				setKey(dev,
-						4,
-						0,
-						priv->ieee80211->pairwise_key_type,
-						(u8*)dev->dev_addr,
-						0,
-						NULL);
-			else
-				setKey(dev,
-						4,
-						0,
-						priv->ieee80211->pairwise_key_type,
-						MacAddr,
-						0,
-						NULL);
-		}
+		if (priv->ieee80211->iw_mode == IW_MODE_ADHOC)
+			setKey(dev, 4, 0, priv->ieee80211->pairwise_key_type,
+			       (u8 *)dev->dev_addr, 0, NULL);
+		else
+			setKey(dev, 4, 0, priv->ieee80211->pairwise_key_type,
+			       MacAddr, 0, NULL);
 	}
 
 
 
-	if(priv->ieee80211->group_key_type == KEY_TYPE_TKIP)
-	{
+	if (priv->ieee80211->group_key_type == KEY_TYPE_TKIP) {
 		MacAddr = CAM_CONST_BROAD;
-		for(EntryId=1 ; EntryId<4 ; EntryId++)
-		{
-			{
-				setKey(dev,
-						EntryId,
-						EntryId,
-						priv->ieee80211->group_key_type,
-						MacAddr,
-						0,
-						NULL);
-			}
+		for (EntryId = 1; EntryId < 4; EntryId++) {
+			setKey(dev, EntryId, EntryId,
+			       priv->ieee80211->group_key_type,
+			       MacAddr, 0, NULL);
 		}
-		if(priv->ieee80211->iw_mode == IW_MODE_ADHOC)
-				setKey(dev,
-						0,
-						0,
-						priv->ieee80211->group_key_type,
-						CAM_CONST_ADDR[0],
-						0,
-						NULL);
-	}
-	else if(priv->ieee80211->group_key_type == KEY_TYPE_CCMP)
-	{
+		if (priv->ieee80211->iw_mode == IW_MODE_ADHOC)
+			setKey(dev, 0, 0, priv->ieee80211->group_key_type,
+			       CAM_CONST_ADDR[0], 0, NULL);
+	} else if (priv->ieee80211->group_key_type == KEY_TYPE_CCMP) {
 		MacAddr = CAM_CONST_BROAD;
-		for(EntryId=1; EntryId<4 ; EntryId++)
-		{
-			{
-				setKey(dev,
-						EntryId ,
-						EntryId,
-						priv->ieee80211->group_key_type,
-						MacAddr,
-						0,
-						NULL);
-			}
+		for (EntryId = 1; EntryId < 4; EntryId++) {
+			setKey(dev, EntryId, EntryId,
+			       priv->ieee80211->group_key_type,
+			       MacAddr, 0, NULL);
 		}
 
-		if(priv->ieee80211->iw_mode == IW_MODE_ADHOC)
-				setKey(dev,
-						0 ,
-						0,
-						priv->ieee80211->group_key_type,
-						CAM_CONST_ADDR[0],
-						0,
-						NULL);
+		if (priv->ieee80211->iw_mode == IW_MODE_ADHOC)
+			setKey(dev, 0, 0, priv->ieee80211->group_key_type,
+			       CAM_CONST_ADDR[0], 0, NULL);
 	}
 }
 //////////////////////////////////////////////////////////////
@@ -3843,10 +3404,8 @@
 // The method checking Tx/Rx stuck of this function is supported by FW,
 // which reports Tx and Rx counter to register 0x128 and 0x130.
 //////////////////////////////////////////////////////////////
-void
-rtl819x_ifsilentreset(struct net_device *dev)
+void rtl819x_ifsilentreset(struct net_device *dev)
 {
-	//OCTET_STRING asocpdu;
 	struct r8192_priv *priv = ieee80211_priv(dev);
 	u8	reset_times = 0;
 	int reset_status = 0;
@@ -3856,26 +3415,21 @@
 	// 2007.07.20. If we need to check CCK stop, please uncomment this line.
 	//bStuck = Adapter->HalFunc.CheckHWStopHandler(Adapter);
 
-	if(priv->ResetProgress==RESET_TYPE_NORESET)
-	{
+	if (priv->ResetProgress == RESET_TYPE_NORESET) {
 RESET_START:
 
-		RT_TRACE(COMP_RESET,"=========>Reset progress!! \n");
+		RT_TRACE(COMP_RESET, "=========>Reset progress!! \n");
 
 		// Set the variable for reset.
 		priv->ResetProgress = RESET_TYPE_SILENT;
-//		rtl8192_close(dev);
 		down(&priv->wx_sem);
-		if(priv->up == 0)
-		{
-			RT_TRACE(COMP_ERR,"%s():the driver is not up! return\n",__FUNCTION__);
+		if (priv->up == 0) {
+			RT_TRACE(COMP_ERR, "%s():the driver is not up! return\n", __func__);
 			up(&priv->wx_sem);
-			return ;
+			return;
 		}
 		priv->up = 0;
-		RT_TRACE(COMP_RESET,"%s():======>start to down the driver\n",__FUNCTION__);
-//		if(!netif_queue_stopped(dev))
-//			netif_stop_queue(dev);
+		RT_TRACE(COMP_RESET, "%s():======>start to down the driver\n", __func__);
 
 		rtl8192_rtx_disable(dev);
 		rtl8192_cancel_deferred_work(priv);
@@ -3883,55 +3437,44 @@
 		del_timer_sync(&priv->watch_dog_timer);
 
 		ieee->sync_scan_hurryup = 1;
-		if(ieee->state == IEEE80211_LINKED)
-		{
+		if (ieee->state == IEEE80211_LINKED) {
 			down(&ieee->wx_sem);
-			printk("ieee->state is IEEE80211_LINKED\n");
+			netdev_dbg(dev, "ieee->state is IEEE80211_LINKED\n");
 			ieee80211_stop_send_beacons(priv->ieee80211);
 			del_timer_sync(&ieee->associate_timer);
 			cancel_delayed_work(&ieee->associate_retry_wq);
 			ieee80211_stop_scan(ieee);
 			netif_carrier_off(dev);
 			up(&ieee->wx_sem);
+		} else {
+			netdev_dbg(dev, "ieee->state is NOT LINKED\n");
+			ieee80211_softmac_stop_protocol(priv->ieee80211);
 		}
-		else{
-			printk("ieee->state is NOT LINKED\n");
-			ieee80211_softmac_stop_protocol(priv->ieee80211);			}
 		up(&priv->wx_sem);
-		RT_TRACE(COMP_RESET,"%s():<==========down process is finished\n",__FUNCTION__);
-	//rtl8192_irq_disable(dev);
-		RT_TRACE(COMP_RESET,"%s():===========>start up the driver\n",__FUNCTION__);
+		RT_TRACE(COMP_RESET, "%s():<==========down process is finished\n", __func__);
+		RT_TRACE(COMP_RESET, "%s():===========>start up the driver\n", __func__);
 		reset_status = _rtl8192_up(dev);
 
-		RT_TRACE(COMP_RESET,"%s():<===========up process is finished\n",__FUNCTION__);
-		if(reset_status == -EAGAIN)
-		{
-			if(reset_times < 3)
-			{
+		RT_TRACE(COMP_RESET, "%s():<===========up process is finished\n", __func__);
+		if (reset_status == -EAGAIN) {
+			if (reset_times < 3) {
 				reset_times++;
 				goto RESET_START;
-			}
-			else
-			{
-				RT_TRACE(COMP_ERR," ERR!!! %s():  Reset Failed!!\n", __FUNCTION__);
+			} else {
+				RT_TRACE(COMP_ERR, " ERR!!! %s():  Reset Failed!!\n", __func__);
 			}
 		}
 		ieee->is_silent_reset = 1;
 		EnableHWSecurityConfig8192(dev);
-		if(ieee->state == IEEE80211_LINKED && ieee->iw_mode == IW_MODE_INFRA)
-		{
+		if (ieee->state == IEEE80211_LINKED && ieee->iw_mode == IW_MODE_INFRA) {
 			ieee->set_chan(ieee->dev, ieee->current_network.channel);
 
 			queue_work(ieee->wq, &ieee->associate_complete_wq);
 
-		}
-		else if(ieee->state == IEEE80211_LINKED && ieee->iw_mode == IW_MODE_ADHOC)
-		{
+		} else if (ieee->state == IEEE80211_LINKED && ieee->iw_mode == IW_MODE_ADHOC) {
 			ieee->set_chan(ieee->dev, ieee->current_network.channel);
 			ieee->link_change(ieee->dev);
 
-		//	notify_wx_assoc_event(ieee);
-
 			ieee80211_start_send_beacons(ieee);
 
 			if (ieee->data_hard_resume)
@@ -3944,7 +3487,7 @@
 		priv->ResetProgress = RESET_TYPE_NORESET;
 		priv->reset_count++;
 
-		priv->bForcedSilentReset =false;
+		priv->bForcedSilentReset = false;
 		priv->bResetInProgress = false;
 
 		// For test --> force write UFWP.
@@ -3953,50 +3496,36 @@
 	}
 }
 
-void CAM_read_entry(
-	struct net_device *dev,
-	u32			iIndex
-)
+void CAM_read_entry(struct net_device *dev, u32 iIndex)
 {
-	u32 target_command=0;
-	 u32 target_content=0;
-	 u8 entry_i=0;
-	 u32 ulStatus;
-	s32 i=100;
-//	printk("=======>start read CAM\n");
-	for(entry_i=0;entry_i<CAM_CONTENT_COUNT;entry_i++)
-	{
-	// polling bit, and No Write enable, and address
-		target_command= entry_i+CAM_CONTENT_COUNT*iIndex;
-		target_command= target_command | BIT31;
+	u32 target_command = 0;
+	u32 target_content = 0;
+	u8 entry_i = 0;
+	u32 ulStatus;
+	s32 i = 100;
+	for (entry_i = 0; entry_i < CAM_CONTENT_COUNT; entry_i++) {
+		// polling bit, and No Write enable, and address
+		target_command = entry_i+CAM_CONTENT_COUNT*iIndex;
+		target_command = target_command | BIT31;
 
-	//Check polling bit is clear
-//	mdelay(1);
-		while((i--)>=0)
-		{
-			ulStatus = read_nic_dword(dev, RWCAM);
-			if(ulStatus & BIT31){
+		//Check polling bit is clear
+		while ((i--) >= 0) {
+			read_nic_dword(dev, RWCAM, &ulStatus);
+			if (ulStatus & BIT31)
 				continue;
-			}
-			else{
+			else
 				break;
-			}
 		}
 		write_nic_dword(dev, RWCAM, target_command);
-		RT_TRACE(COMP_SEC,"CAM_read_entry(): WRITE A0: %x \n",target_command);
-	 //	printk("CAM_read_entry(): WRITE A0: %lx \n",target_command);
-		target_content = read_nic_dword(dev, RCAMO);
-		RT_TRACE(COMP_SEC, "CAM_read_entry(): WRITE A8: %x \n",target_content);
-	 //	printk("CAM_read_entry(): WRITE A8: %lx \n",target_content);
+		RT_TRACE(COMP_SEC, "CAM_read_entry(): WRITE A0: %x \n", target_command);
+		read_nic_dword(dev, RCAMO, &target_content);
+		RT_TRACE(COMP_SEC, "CAM_read_entry(): WRITE A8: %x \n", target_content);
 	}
 	printk("\n");
 }
 
-void rtl819x_update_rxcounts(
-	struct r8192_priv *priv,
-	u32* TotalRxBcnNum,
-	u32* TotalRxDataNum
-)
+void rtl819x_update_rxcounts(struct r8192_priv *priv, u32 *TotalRxBcnNum,
+			     u32 *TotalRxDataNum)
 {
 	u16			SlotIndex;
 	u8			i;
@@ -4007,80 +3536,68 @@
 	SlotIndex = (priv->ieee80211->LinkDetectInfo.SlotIndex++)%(priv->ieee80211->LinkDetectInfo.SlotNum);
 	priv->ieee80211->LinkDetectInfo.RxBcnNum[SlotIndex] = priv->ieee80211->LinkDetectInfo.NumRecvBcnInPeriod;
 	priv->ieee80211->LinkDetectInfo.RxDataNum[SlotIndex] = priv->ieee80211->LinkDetectInfo.NumRecvDataInPeriod;
-	for( i=0; i<priv->ieee80211->LinkDetectInfo.SlotNum; i++ ){
+	for (i = 0; i < priv->ieee80211->LinkDetectInfo.SlotNum; i++) {
 		*TotalRxBcnNum += priv->ieee80211->LinkDetectInfo.RxBcnNum[i];
 		*TotalRxDataNum += priv->ieee80211->LinkDetectInfo.RxDataNum[i];
 	}
 }
 
 
-extern	void	rtl819x_watchdog_wqcallback(struct work_struct *work)
+extern void rtl819x_watchdog_wqcallback(struct work_struct *work)
 {
-	struct delayed_work *dwork = container_of(work,struct delayed_work,work);
-       struct r8192_priv *priv = container_of(dwork,struct r8192_priv,watch_dog_wq);
-       struct net_device *dev = priv->ieee80211->dev;
-	struct ieee80211_device* ieee = priv->ieee80211;
+	struct delayed_work *dwork = container_of(work, struct delayed_work, work);
+	struct r8192_priv *priv = container_of(dwork, struct r8192_priv, watch_dog_wq);
+	struct net_device *dev = priv->ieee80211->dev;
+	struct ieee80211_device *ieee = priv->ieee80211;
 	RESET_TYPE	ResetType = RESET_TYPE_NORESET;
 	static u8	check_reset_cnt;
 	bool bBusyTraffic = false;
+	u32	TotalRxBcnNum = 0;
+	u32	TotalRxDataNum = 0;
 
-	if(!priv->up)
+	if (!priv->up)
 		return;
 	hal_dm_watchdog(dev);
 
-	{//to get busy traffic condition
-		if(ieee->state == IEEE80211_LINKED)
-		{
-			if(	ieee->LinkDetectInfo.NumRxOkInPeriod> 666 ||
-				ieee->LinkDetectInfo.NumTxOkInPeriod> 666 ) {
-				bBusyTraffic = true;
-			}
-			ieee->LinkDetectInfo.NumRxOkInPeriod = 0;
-			ieee->LinkDetectInfo.NumTxOkInPeriod = 0;
-			ieee->LinkDetectInfo.bBusyTraffic = bBusyTraffic;
+	//to get busy traffic condition
+	if (ieee->state == IEEE80211_LINKED) {
+		if (ieee->LinkDetectInfo.NumRxOkInPeriod > 666 ||
+		    ieee->LinkDetectInfo.NumTxOkInPeriod > 666 ) {
+			bBusyTraffic = true;
 		}
+		ieee->LinkDetectInfo.NumRxOkInPeriod = 0;
+		ieee->LinkDetectInfo.NumTxOkInPeriod = 0;
+		ieee->LinkDetectInfo.bBusyTraffic = bBusyTraffic;
 	}
 	//added by amy for AP roaming
-	{
-		if(priv->ieee80211->state == IEEE80211_LINKED && priv->ieee80211->iw_mode == IW_MODE_INFRA)
-		{
-			u32	TotalRxBcnNum = 0;
-			u32	TotalRxDataNum = 0;
+	if (priv->ieee80211->state == IEEE80211_LINKED && priv->ieee80211->iw_mode == IW_MODE_INFRA) {
 
-			rtl819x_update_rxcounts(priv, &TotalRxBcnNum, &TotalRxDataNum);
-			if((TotalRxBcnNum+TotalRxDataNum) == 0)
-			{
-				#ifdef TODO
-				if(rfState == eRfOff)
-					RT_TRACE(COMP_ERR,"========>%s()\n",__FUNCTION__);
-				#endif
-				printk("===>%s(): AP is power off,connect another one\n",__FUNCTION__);
-			//	Dot11d_Reset(dev);
-				priv->ieee80211->state = IEEE80211_ASSOCIATING;
-				notify_wx_assoc_event(priv->ieee80211);
-				RemovePeerTS(priv->ieee80211,priv->ieee80211->current_network.bssid);
-				priv->ieee80211->link_change(dev);
-				queue_work(priv->ieee80211->wq, &priv->ieee80211->associate_procedure_wq);
+		rtl819x_update_rxcounts(priv, &TotalRxBcnNum, &TotalRxDataNum);
+		if ((TotalRxBcnNum+TotalRxDataNum) == 0) {
+#ifdef TODO
+			if (rfState == eRfOff)
+				RT_TRACE(COMP_ERR, "========>%s()\n", __func__);
+#endif
+			netdev_dbg(dev, "===>%s(): AP is power off, connect another one\n", __func__);
+			priv->ieee80211->state = IEEE80211_ASSOCIATING;
+			notify_wx_assoc_event(priv->ieee80211);
+			RemovePeerTS(priv->ieee80211, priv->ieee80211->current_network.bssid);
+			priv->ieee80211->link_change(dev);
+			queue_work(priv->ieee80211->wq, &priv->ieee80211->associate_procedure_wq);
 
-			}
 		}
-		priv->ieee80211->LinkDetectInfo.NumRecvBcnInPeriod=0;
-		priv->ieee80211->LinkDetectInfo.NumRecvDataInPeriod=0;
 	}
-//	CAM_read_entry(dev,4);
+	priv->ieee80211->LinkDetectInfo.NumRecvBcnInPeriod = 0;
+	priv->ieee80211->LinkDetectInfo.NumRecvDataInPeriod = 0;
 	//check if reset the driver
-	if(check_reset_cnt++ >= 3)
-	{
+	if (check_reset_cnt++ >= 3) {
 		ResetType = rtl819x_ifcheck_resetornot(dev);
 		check_reset_cnt = 3;
-		//DbgPrint("Start to check silent reset\n");
 	}
-	//	RT_TRACE(COMP_RESET,"%s():priv->force_reset is %d,priv->ResetProgress is %d, priv->bForcedSilentReset is %d,priv->bDisableNormalResetCheck is %d,ResetType is %d\n",__FUNCTION__,priv->force_reset,priv->ResetProgress,priv->bForcedSilentReset,priv->bDisableNormalResetCheck,ResetType);
-	if( (priv->force_reset) || (priv->ResetProgress==RESET_TYPE_NORESET &&
-		(priv->bForcedSilentReset ||
-		(!priv->bDisableNormalResetCheck && ResetType==RESET_TYPE_SILENT)))) // This is control by OID set in Pomelo
-	{
-		RT_TRACE(COMP_RESET,"%s():priv->force_reset is %d,priv->ResetProgress is %d, priv->bForcedSilentReset is %d,priv->bDisableNormalResetCheck is %d,ResetType is %d\n",__FUNCTION__,priv->force_reset,priv->ResetProgress,priv->bForcedSilentReset,priv->bDisableNormalResetCheck,ResetType);
+	if ((priv->force_reset) || (priv->ResetProgress == RESET_TYPE_NORESET &&
+	    (priv->bForcedSilentReset ||
+	    (!priv->bDisableNormalResetCheck && ResetType == RESET_TYPE_SILENT)))) { // This is control by OID set in Pomelo
+		RT_TRACE(COMP_RESET, "%s():priv->force_reset is %d,priv->ResetProgress is %d, priv->bForcedSilentReset is %d,priv->bDisableNormalResetCheck is %d,ResetType is %d\n", __func__, priv->force_reset, priv->ResetProgress, priv->bForcedSilentReset, priv->bDisableNormalResetCheck, ResetType);
 		rtl819x_ifsilentreset(dev);
 	}
 	priv->force_reset = false;
@@ -4093,33 +3610,29 @@
 void watch_dog_timer_callback(unsigned long data)
 {
 	struct r8192_priv *priv = ieee80211_priv((struct net_device *) data);
-	//printk("===============>watch_dog timer\n");
-	queue_delayed_work(priv->priv_wq,&priv->watch_dog_wq, 0);
+	queue_delayed_work(priv->priv_wq, &priv->watch_dog_wq, 0);
 	mod_timer(&priv->watch_dog_timer, jiffies + MSECS(IEEE80211_WATCH_DOG_TIME));
 }
 int _rtl8192_up(struct net_device *dev)
 {
 	struct r8192_priv *priv = ieee80211_priv(dev);
-	//int i;
 	int init_status = 0;
-	priv->up=1;
-	priv->ieee80211->ieee_up=1;
+	priv->up = 1;
+	priv->ieee80211->ieee_up = 1;
 	RT_TRACE(COMP_INIT, "Bringing up iface");
 	init_status = rtl8192_adapter_start(dev);
-	if(!init_status)
-	{
-		RT_TRACE(COMP_ERR,"ERR!!! %s(): initialization failed!\n", __FUNCTION__);
-		priv->up=priv->ieee80211->ieee_up = 0;
+	if (!init_status) {
+		RT_TRACE(COMP_ERR, "ERR!!! %s(): initialization failed!\n", __func__);
+		priv->up = priv->ieee80211->ieee_up = 0;
 		return -EAGAIN;
 	}
 	RT_TRACE(COMP_INIT, "start adapter finished\n");
 	rtl8192_rx_enable(dev);
-//	rtl8192_tx_enable(dev);
-	if(priv->ieee80211->state != IEEE80211_LINKED)
-	ieee80211_softmac_start_protocol(priv->ieee80211);
+	if (priv->ieee80211->state != IEEE80211_LINKED)
+		ieee80211_softmac_start_protocol(priv->ieee80211);
 	ieee80211_reset_queue(priv->ieee80211);
 	watch_dog_timer_callback((unsigned long) dev);
-	if(!netif_queue_stopped(dev))
+	if (!netif_queue_stopped(dev))
 		netif_start_queue(dev);
 	else
 		netif_wake_queue(dev);
@@ -4172,40 +3685,35 @@
 
 	if (priv->up == 0) return -1;
 
-	priv->up=0;
+	priv->up = 0;
 	priv->ieee80211->ieee_up = 0;
-	RT_TRACE(COMP_DOWN, "==========>%s()\n", __FUNCTION__);
-/* FIXME */
+	RT_TRACE(COMP_DOWN, "==========>%s()\n", __func__);
+	/* FIXME */
 	if (!netif_queue_stopped(dev))
 		netif_stop_queue(dev);
 
 	rtl8192_rtx_disable(dev);
-	//rtl8192_irq_disable(dev);
 
- /* Tx related queue release */
-	for(i = 0; i < MAX_QUEUE_SIZE; i++) {
-		skb_queue_purge(&priv->ieee80211->skb_waitQ [i]);
-	}
-	for(i = 0; i < MAX_QUEUE_SIZE; i++) {
-		skb_queue_purge(&priv->ieee80211->skb_aggQ [i]);
-	}
+	/* Tx related queue release */
+	for (i = 0; i < MAX_QUEUE_SIZE; i++)
+		skb_queue_purge(&priv->ieee80211->skb_waitQ[i]);
+	for (i = 0; i < MAX_QUEUE_SIZE; i++)
+		skb_queue_purge(&priv->ieee80211->skb_aggQ[i]);
 
-	for(i = 0; i < MAX_QUEUE_SIZE; i++) {
-		skb_queue_purge(&priv->ieee80211->skb_drv_aggQ [i]);
-	}
+	for (i = 0; i < MAX_QUEUE_SIZE; i++)
+		skb_queue_purge(&priv->ieee80211->skb_drv_aggQ[i]);
 
 	//as cancel_delayed_work will del work->timer, so if work is not defined as struct delayed_work, it will corrupt
-//	flush_scheduled_work();
 	rtl8192_cancel_deferred_work(priv);
 	deinit_hal_dm(dev);
 	del_timer_sync(&priv->watch_dog_timer);
 
 
 	ieee80211_softmac_stop_protocol(priv->ieee80211);
-	memset(&priv->ieee80211->current_network, 0 , offsetof(struct ieee80211_network, list));
-	RT_TRACE(COMP_DOWN, "<==========%s()\n", __FUNCTION__);
+	memset(&priv->ieee80211->current_network, 0, offsetof(struct ieee80211_network, list));
+	RT_TRACE(COMP_DOWN, "<==========%s()\n", __func__);
 
-		return 0;
+	return 0;
 }
 
 
@@ -4213,27 +3721,19 @@
 {
 	struct r8192_priv *priv = ieee80211_priv(dev);
 	int reset_status = 0;
-	//u8 reset_times = 0;
-	if (priv->up == 0) return ;
+	if (priv->up == 0) return;
 	priv->up = 0;
 
 	rtl8192_cancel_deferred_work(priv);
 	del_timer_sync(&priv->watch_dog_timer);
-	//cancel_delayed_work(&priv->SwChnlWorkItem);
 
 	ieee80211_softmac_stop_protocol(priv->ieee80211);
 
-	//rtl8192_irq_disable(dev);
 	rtl8192_rtx_disable(dev);
 	reset_status = _rtl8192_up(dev);
 
 }
 
-/*
-void rtl8192_restart(struct net_device *dev)
-{
-	struct r8192_priv *priv = ieee80211_priv(dev);
-*/
 void rtl8192_restart(struct work_struct *work)
 {
 	struct r8192_priv *priv = container_of(work, struct r8192_priv, reset_wq);
@@ -4251,19 +3751,13 @@
 	struct r8192_priv *priv = ieee80211_priv(dev);
 	short promisc;
 
-	//down(&priv->wx_sem);
-
 	/* FIXME FIXME */
 
-	promisc = (dev->flags & IFF_PROMISC) ? 1:0;
+	promisc = (dev->flags & IFF_PROMISC) ? 1 : 0;
 
 	if (promisc != priv->promisc)
-	//	rtl8192_commit(dev);
 
-	priv->promisc = promisc;
-
-	//schedule_work(&priv->reset_wq);
-	//up(&priv->wx_sem);
+		priv->promisc = promisc;
 }
 
 
@@ -4287,99 +3781,90 @@
 {
 	struct r8192_priv *priv = (struct r8192_priv *)ieee80211_priv(dev);
 	struct iwreq *wrq = (struct iwreq *)rq;
-	int ret=-1;
+	int ret = -1;
 	struct ieee80211_device *ieee = priv->ieee80211;
 	u32 key[4];
-	u8 broadcast_addr[6] = {0xff,0xff,0xff,0xff,0xff,0xff};
+	u8 broadcast_addr[6] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
 	struct iw_point *p = &wrq->u.data;
-	struct ieee_param *ipw = NULL;//(struct ieee_param *)wrq->u.data.pointer;
+	struct ieee_param *ipw = NULL;
 
 	down(&priv->wx_sem);
 
 
-     if (p->length < sizeof(struct ieee_param) || !p->pointer){
-	     ret = -EINVAL;
-	     goto out;
+	if (p->length < sizeof(struct ieee_param) || !p->pointer) {
+		ret = -EINVAL;
+		goto out;
 	}
 
-     ipw = kmalloc(p->length, GFP_KERNEL);
-     if (ipw == NULL){
-	     ret = -ENOMEM;
-	     goto out;
-     }
-     if (copy_from_user(ipw, p->pointer, p->length)) {
+	ipw = kmalloc(p->length, GFP_KERNEL);
+	if (ipw == NULL) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	if (copy_from_user(ipw, p->pointer, p->length)) {
 		kfree(ipw);
-	    ret = -EFAULT;
-	    goto out;
+		ret = -EFAULT;
+		goto out;
 	}
 
 	switch (cmd) {
 	case RTL_IOCTL_WPA_SUPPLICANT:
-	//parse here for HW security
-		if (ipw->cmd == IEEE_CMD_SET_ENCRYPTION)
-		{
-			if (ipw->u.crypt.set_tx)
-			{
-				if (strcmp(ipw->u.crypt.alg, "CCMP") == 0)
+		//parse here for HW security
+		if (ipw->cmd == IEEE_CMD_SET_ENCRYPTION) {
+			if (ipw->u.crypt.set_tx) {
+				if (strcmp(ipw->u.crypt.alg, "CCMP") == 0) {
 					ieee->pairwise_key_type = KEY_TYPE_CCMP;
-				else if (strcmp(ipw->u.crypt.alg, "TKIP") == 0)
+				} else if (strcmp(ipw->u.crypt.alg, "TKIP") == 0) {
 					ieee->pairwise_key_type = KEY_TYPE_TKIP;
-				else if (strcmp(ipw->u.crypt.alg, "WEP") == 0)
-				{
+				} else if (strcmp(ipw->u.crypt.alg, "WEP") == 0) {
 					if (ipw->u.crypt.key_len == 13)
 						ieee->pairwise_key_type = KEY_TYPE_WEP104;
 					else if (ipw->u.crypt.key_len == 5)
 						ieee->pairwise_key_type = KEY_TYPE_WEP40;
-				}
-				else
+				} else {
 					ieee->pairwise_key_type = KEY_TYPE_NA;
-
-				if (ieee->pairwise_key_type)
-				{
-					memcpy((u8*)key, ipw->u.crypt.key, 16);
-					EnableHWSecurityConfig8192(dev);
-				//we fill both index entry and 4th entry for pairwise key as in IPW interface, adhoc will only get here, so we need index entry for its default key serching!
-				//added by WB.
-					setKey(dev, 4, ipw->u.crypt.idx, ieee->pairwise_key_type, (u8*)ieee->ap_mac_addr, 0, key);
-					if (ieee->auth_mode != 2)
-					setKey(dev, ipw->u.crypt.idx, ipw->u.crypt.idx, ieee->pairwise_key_type, (u8*)ieee->ap_mac_addr, 0, key);
 				}
-			}
-			else //if (ipw->u.crypt.idx) //group key use idx > 0
-			{
-				memcpy((u8*)key, ipw->u.crypt.key, 16);
-				if (strcmp(ipw->u.crypt.alg, "CCMP") == 0)
-					ieee->group_key_type= KEY_TYPE_CCMP;
-				else if (strcmp(ipw->u.crypt.alg, "TKIP") == 0)
+
+				if (ieee->pairwise_key_type) {
+					memcpy((u8 *)key, ipw->u.crypt.key, 16);
+					EnableHWSecurityConfig8192(dev);
+					//we fill both index entry and 4th entry for pairwise key as in IPW interface, adhoc will only get here, so we need index entry for its default key serching!
+					//added by WB.
+					setKey(dev, 4, ipw->u.crypt.idx, ieee->pairwise_key_type, (u8 *)ieee->ap_mac_addr, 0, key);
+					if (ieee->auth_mode != 2)
+						setKey(dev, ipw->u.crypt.idx, ipw->u.crypt.idx, ieee->pairwise_key_type, (u8 *)ieee->ap_mac_addr, 0, key);
+				}
+			} else {
+				memcpy((u8 *)key, ipw->u.crypt.key, 16);
+				if (strcmp(ipw->u.crypt.alg, "CCMP") == 0) {
+					ieee->group_key_type = KEY_TYPE_CCMP;
+				} else if (strcmp(ipw->u.crypt.alg, "TKIP") == 0) {
 					ieee->group_key_type = KEY_TYPE_TKIP;
-				else if (strcmp(ipw->u.crypt.alg, "WEP") == 0)
-				{
+				} else if (strcmp(ipw->u.crypt.alg, "WEP") == 0) {
 					if (ipw->u.crypt.key_len == 13)
 						ieee->group_key_type = KEY_TYPE_WEP104;
 					else if (ipw->u.crypt.key_len == 5)
 						ieee->group_key_type = KEY_TYPE_WEP40;
-				}
-				else
+				} else {
 					ieee->group_key_type = KEY_TYPE_NA;
+				}
 
-				if (ieee->group_key_type)
-				{
-						setKey(	dev,
-							ipw->u.crypt.idx,
-							ipw->u.crypt.idx,		//KeyIndex
-							ieee->group_key_type,	//KeyType
-							broadcast_addr,	//MacAddr
-							0,		//DefaultKey
-							key);		//KeyContent
+				if (ieee->group_key_type) {
+					setKey(dev, ipw->u.crypt.idx,
+					       ipw->u.crypt.idx,		//KeyIndex
+					       ieee->group_key_type,	//KeyType
+					       broadcast_addr,	//MacAddr
+					       0,		//DefaultKey
+					       key);		//KeyContent
 				}
 			}
 		}
 #ifdef JOHN_HWSEC_DEBUG
 		//john's test 0711
 		printk("@@ wrq->u pointer = ");
-		for(i=0;i<wrq->u.data.length;i++){
-			if(i%10==0) printk("\n");
-			printk( "%8x|", ((u32*)wrq->u.data.pointer)[i] );
+		for (i = 0; i < wrq->u.data.length; i++) {
+			if (i%10 == 0) printk("\n");
+			printk("%8x|", ((u32 *)wrq->u.data.pointer)[i]);
 		}
 		printk("\n");
 #endif /*JOHN_HWSEC_DEBUG*/
@@ -4401,8 +3886,8 @@
 {
 	u8  ret_rate = 0xff;
 
-	if(!bIsHT) {
-		switch(rate) {
+	if (!bIsHT) {
+		switch (rate) {
 		case DESC90_RATE1M:   ret_rate = MGN_1M;         break;
 		case DESC90_RATE2M:   ret_rate = MGN_2M;         break;
 		case DESC90_RATE5_5M: ret_rate = MGN_5_5M;       break;
@@ -4423,7 +3908,7 @@
 		}
 
 	} else {
-		switch(rate) {
+		switch (rate) {
 		case DESC90_RATEMCS0:   ret_rate = MGN_MCS0;    break;
 		case DESC90_RATEMCS1:   ret_rate = MGN_MCS1;    break;
 		case DESC90_RATEMCS2:   ret_rate = MGN_MCS2;    break;
@@ -4444,7 +3929,7 @@
 
 		default:
 			ret_rate = 0xff;
-			RT_TRACE(COMP_RECV, "HwRateToMRate90(): Non supported Rate [%x], bIsHT = %d!!!\n",rate, bIsHT);
+			RT_TRACE(COMP_RECV, "HwRateToMRate90(): Non supported Rate [%x], bIsHT = %d!!!\n", rate, bIsHT);
 			break;
 		}
 	}
@@ -4467,11 +3952,11 @@
  * Return:
  *               None
  */
-void UpdateRxPktTimeStamp8190 (struct net_device *dev, struct ieee80211_rx_stats *stats)
+void UpdateRxPktTimeStamp8190(struct net_device *dev, struct ieee80211_rx_stats *stats)
 {
 	struct r8192_priv *priv = (struct r8192_priv *)ieee80211_priv(dev);
 
-	if(stats->bIsAMPDU && !stats->bFirstMPDU) {
+	if (stats->bIsAMPDU && !stats->bFirstMPDU) {
 		stats->mac_time[0] = priv->LastRxDescTSFLow;
 		stats->mac_time[1] = priv->LastRxDescTSFHigh;
 	} else {
@@ -4482,7 +3967,7 @@
 
 //by amy 080606
 
-long rtl819x_translate_todbm(u8 signal_strength_index	)// 0-100 index.
+long rtl819x_translate_todbm(u8 signal_strength_index)// 0-100 index.
 {
 	long	signal_power; // in dBm.
 
@@ -4498,12 +3983,11 @@
     be a local static. Otherwise, it may increase when we return from S3/S4. The
     value will be kept in memory or disk. Declare the value in the adaptor
     and it will be reinitialized when returned from S3/S4. */
-void rtl8192_process_phyinfo(struct r8192_priv * priv,u8* buffer, struct ieee80211_rx_stats * pprevious_stats, struct ieee80211_rx_stats * pcurrent_stats)
+void rtl8192_process_phyinfo(struct r8192_priv *priv, u8 *buffer, struct ieee80211_rx_stats *pprevious_stats, struct ieee80211_rx_stats *pcurrent_stats)
 {
 	bool bcheck = false;
 	u8	rfpath;
 	u32	nspatial_stream, tmp_val;
-	//u8	i;
 	static u32 slide_rssi_index, slide_rssi_statistics;
 	static u32 slide_evm_index, slide_evm_statistics;
 	static u32 last_rssi, last_evm;
@@ -4512,8 +3996,8 @@
 	static u32 last_beacon_adc_pwdb;
 
 	struct ieee80211_hdr_3addr *hdr;
-	u16 sc ;
-	unsigned int frag,seq;
+	u16 sc;
+	unsigned int frag, seq;
 	hdr = (struct ieee80211_hdr_3addr *)buffer;
 	sc = le16_to_cpu(hdr->seq_ctl);
 	frag = WLAN_GET_SEQ_FRAG(sc);
@@ -4523,14 +4007,12 @@
 	//
 	// Check whether we should take the previous packet into accounting
 	//
-	if(!pprevious_stats->bIsAMPDU)
-	{
+	if (!pprevious_stats->bIsAMPDU) {
 		// if previous packet is not aggregated packet
 		bcheck = true;
 	}
 
-	if(slide_rssi_statistics++ >= PHY_RSSI_SLID_WIN_MAX)
-	{
+	if (slide_rssi_statistics++ >= PHY_RSSI_SLID_WIN_MAX) {
 		slide_rssi_statistics = PHY_RSSI_SLID_WIN_MAX;
 		last_rssi = priv->stats.slide_signal_strength[slide_rssi_index];
 		priv->stats.slide_rssi_total -= last_rssi;
@@ -4538,7 +4020,7 @@
 	priv->stats.slide_rssi_total += pprevious_stats->SignalStrength;
 
 	priv->stats.slide_signal_strength[slide_rssi_index++] = pprevious_stats->SignalStrength;
-	if(slide_rssi_index >= PHY_RSSI_SLID_WIN_MAX)
+	if (slide_rssi_index >= PHY_RSSI_SLID_WIN_MAX)
 		slide_rssi_index = 0;
 
 	// <1> Showed on UI for user, in dbm
@@ -4548,13 +4030,12 @@
 	//
 	// If the previous packet does not match the criteria, neglect it
 	//
-	if(!pprevious_stats->bPacketMatchBSSID)
-	{
-		if(!pprevious_stats->bToSelfBA)
+	if (!pprevious_stats->bPacketMatchBSSID) {
+		if (!pprevious_stats->bToSelfBA)
 			return;
 	}
 
-	if(!bcheck)
+	if (!bcheck)
 		return;
 
 
@@ -4570,33 +4051,25 @@
 
 	// <2> Showed on UI for engineering
 	// hardware does not provide rssi information for each rf path in CCK
-	if(!pprevious_stats->bIsCCK && (pprevious_stats->bPacketToSelf || pprevious_stats->bToSelfBA))
-	{
-		for (rfpath = RF90_PATH_A; rfpath < priv->NumTotalRFPath; rfpath++)
-		{
-		     if (!rtl8192_phy_CheckIsLegalRFPath(priv->ieee80211->dev, rfpath))
-				 continue;
+	if (!pprevious_stats->bIsCCK && (pprevious_stats->bPacketToSelf || pprevious_stats->bToSelfBA)) {
+		for (rfpath = RF90_PATH_A; rfpath < priv->NumTotalRFPath; rfpath++) {
+			if (!rtl8192_phy_CheckIsLegalRFPath(priv->ieee80211->dev, rfpath))
+				continue;
 
 			//Fixed by Jacken 2008-03-20
-			if(priv->stats.rx_rssi_percentage[rfpath] == 0)
-			{
+			if (priv->stats.rx_rssi_percentage[rfpath] == 0)
 				priv->stats.rx_rssi_percentage[rfpath] = pprevious_stats->RxMIMOSignalStrength[rfpath];
-				//DbgPrint("MIMO RSSI initialize \n");
-			}
-			if(pprevious_stats->RxMIMOSignalStrength[rfpath]  > priv->stats.rx_rssi_percentage[rfpath])
-			{
+			if (pprevious_stats->RxMIMOSignalStrength[rfpath]  > priv->stats.rx_rssi_percentage[rfpath]) {
 				priv->stats.rx_rssi_percentage[rfpath] =
-					( (priv->stats.rx_rssi_percentage[rfpath]*(Rx_Smooth_Factor-1)) +
-					(pprevious_stats->RxMIMOSignalStrength[rfpath])) /(Rx_Smooth_Factor);
+					((priv->stats.rx_rssi_percentage[rfpath]*(Rx_Smooth_Factor-1)) +
+					 (pprevious_stats->RxMIMOSignalStrength[rfpath])) /(Rx_Smooth_Factor);
 				priv->stats.rx_rssi_percentage[rfpath] = priv->stats.rx_rssi_percentage[rfpath]  + 1;
-			}
-			else
-			{
+			} else {
 				priv->stats.rx_rssi_percentage[rfpath] =
-					( (priv->stats.rx_rssi_percentage[rfpath]*(Rx_Smooth_Factor-1)) +
-					(pprevious_stats->RxMIMOSignalStrength[rfpath])) /(Rx_Smooth_Factor);
+					((priv->stats.rx_rssi_percentage[rfpath]*(Rx_Smooth_Factor-1)) +
+					 (pprevious_stats->RxMIMOSignalStrength[rfpath])) /(Rx_Smooth_Factor);
 			}
-			RT_TRACE(COMP_DBG,"priv->stats.rx_rssi_percentage[rfPath]  = %d \n" ,priv->stats.rx_rssi_percentage[rfpath] );
+			RT_TRACE(COMP_DBG, "priv->stats.rx_rssi_percentage[rfPath]  = %d \n", priv->stats.rx_rssi_percentage[rfpath]);
 		}
 	}
 
@@ -4605,55 +4078,43 @@
 	// Check PWDB.
 	//
 	RT_TRACE(COMP_RXDESC, "Smooth %s PWDB = %d\n",
-				pprevious_stats->bIsCCK? "CCK": "OFDM",
-				pprevious_stats->RxPWDBAll);
+		 pprevious_stats->bIsCCK ? "CCK" : "OFDM",
+		 pprevious_stats->RxPWDBAll);
 
-	if(pprevious_stats->bPacketBeacon)
-	{
-/* record the beacon pwdb to the sliding window. */
-		if(slide_beacon_adc_pwdb_statistics++ >= PHY_Beacon_RSSI_SLID_WIN_MAX)
-		{
+	if (pprevious_stats->bPacketBeacon) {
+		/* record the beacon pwdb to the sliding window. */
+		if (slide_beacon_adc_pwdb_statistics++ >= PHY_Beacon_RSSI_SLID_WIN_MAX) {
 			slide_beacon_adc_pwdb_statistics = PHY_Beacon_RSSI_SLID_WIN_MAX;
 			last_beacon_adc_pwdb = priv->stats.Slide_Beacon_pwdb[slide_beacon_adc_pwdb_index];
 			priv->stats.Slide_Beacon_Total -= last_beacon_adc_pwdb;
-			//DbgPrint("slide_beacon_adc_pwdb_index = %d, last_beacon_adc_pwdb = %d, Adapter->RxStats.Slide_Beacon_Total = %d\n",
-			//	slide_beacon_adc_pwdb_index, last_beacon_adc_pwdb, Adapter->RxStats.Slide_Beacon_Total);
 		}
 		priv->stats.Slide_Beacon_Total += pprevious_stats->RxPWDBAll;
 		priv->stats.Slide_Beacon_pwdb[slide_beacon_adc_pwdb_index] = pprevious_stats->RxPWDBAll;
-		//DbgPrint("slide_beacon_adc_pwdb_index = %d, pPreviousRfd->Status.RxPWDBAll = %d\n", slide_beacon_adc_pwdb_index, pPreviousRfd->Status.RxPWDBAll);
 		slide_beacon_adc_pwdb_index++;
-		if(slide_beacon_adc_pwdb_index >= PHY_Beacon_RSSI_SLID_WIN_MAX)
+		if (slide_beacon_adc_pwdb_index >= PHY_Beacon_RSSI_SLID_WIN_MAX)
 			slide_beacon_adc_pwdb_index = 0;
 		pprevious_stats->RxPWDBAll = priv->stats.Slide_Beacon_Total/slide_beacon_adc_pwdb_statistics;
-		if(pprevious_stats->RxPWDBAll >= 3)
+		if (pprevious_stats->RxPWDBAll >= 3)
 			pprevious_stats->RxPWDBAll -= 3;
 	}
 
 	RT_TRACE(COMP_RXDESC, "Smooth %s PWDB = %d\n",
-				pprevious_stats->bIsCCK? "CCK": "OFDM",
-				pprevious_stats->RxPWDBAll);
+		 pprevious_stats->bIsCCK ? "CCK" : "OFDM",
+		 pprevious_stats->RxPWDBAll);
 
 
-	if(pprevious_stats->bPacketToSelf || pprevious_stats->bPacketBeacon || pprevious_stats->bToSelfBA)
-	{
-		if(priv->undecorated_smoothed_pwdb < 0)	// initialize
-		{
+	if (pprevious_stats->bPacketToSelf || pprevious_stats->bPacketBeacon || pprevious_stats->bToSelfBA) {
+		if (priv->undecorated_smoothed_pwdb < 0)	// initialize
 			priv->undecorated_smoothed_pwdb = pprevious_stats->RxPWDBAll;
-			//DbgPrint("First pwdb initialize \n");
-		}
-		if(pprevious_stats->RxPWDBAll > (u32)priv->undecorated_smoothed_pwdb)
-		{
+		if (pprevious_stats->RxPWDBAll > (u32)priv->undecorated_smoothed_pwdb) {
 			priv->undecorated_smoothed_pwdb =
-					( ((priv->undecorated_smoothed_pwdb)*(Rx_Smooth_Factor-1)) +
-					(pprevious_stats->RxPWDBAll)) /(Rx_Smooth_Factor);
+				(((priv->undecorated_smoothed_pwdb)*(Rx_Smooth_Factor-1)) +
+				 (pprevious_stats->RxPWDBAll)) /(Rx_Smooth_Factor);
 			priv->undecorated_smoothed_pwdb = priv->undecorated_smoothed_pwdb + 1;
-		}
-		else
-		{
+		} else {
 			priv->undecorated_smoothed_pwdb =
-					( ((priv->undecorated_smoothed_pwdb)*(Rx_Smooth_Factor-1)) +
-					(pprevious_stats->RxPWDBAll)) /(Rx_Smooth_Factor);
+				(((priv->undecorated_smoothed_pwdb)*(Rx_Smooth_Factor-1)) +
+				 (pprevious_stats->RxPWDBAll)) /(Rx_Smooth_Factor);
 		}
 
 	}
@@ -4662,13 +4123,9 @@
 	// Check EVM
 	//
 	/* record the general EVM to the sliding window. */
-	if(pprevious_stats->SignalQuality == 0)
-	{
-	}
-	else
-	{
-		if(pprevious_stats->bPacketToSelf || pprevious_stats->bPacketBeacon || pprevious_stats->bToSelfBA){
-			if(slide_evm_statistics++ >= PHY_RSSI_SLID_WIN_MAX){
+	if (pprevious_stats->SignalQuality) {
+		if (pprevious_stats->bPacketToSelf || pprevious_stats->bPacketBeacon || pprevious_stats->bToSelfBA) {
+			if (slide_evm_statistics++ >= PHY_RSSI_SLID_WIN_MAX) {
 				slide_evm_statistics = PHY_RSSI_SLID_WIN_MAX;
 				last_evm = priv->stats.slide_evm[slide_evm_index];
 				priv->stats.slide_evm_total -= last_evm;
@@ -4677,7 +4134,7 @@
 			priv->stats.slide_evm_total += pprevious_stats->SignalQuality;
 
 			priv->stats.slide_evm[slide_evm_index++] = pprevious_stats->SignalQuality;
-			if(slide_evm_index >= PHY_RSSI_SLID_WIN_MAX)
+			if (slide_evm_index >= PHY_RSSI_SLID_WIN_MAX)
 				slide_evm_index = 0;
 
 			// <1> Showed on UI for user, in percentage.
@@ -4688,19 +4145,14 @@
 		}
 
 		// <2> Showed on UI for engineering
-		if(pprevious_stats->bPacketToSelf || pprevious_stats->bPacketBeacon || pprevious_stats->bToSelfBA)
-		{
-			for(nspatial_stream = 0; nspatial_stream<2 ; nspatial_stream++) // 2 spatial stream
-			{
-				if(pprevious_stats->RxMIMOSignalQuality[nspatial_stream] != -1)
-				{
-					if(priv->stats.rx_evm_percentage[nspatial_stream] == 0)	// initialize
-					{
+		if (pprevious_stats->bPacketToSelf || pprevious_stats->bPacketBeacon || pprevious_stats->bToSelfBA) {
+			for (nspatial_stream = 0; nspatial_stream < 2; nspatial_stream++) { // 2 spatial stream
+				if (pprevious_stats->RxMIMOSignalQuality[nspatial_stream] != -1) {
+					if (priv->stats.rx_evm_percentage[nspatial_stream] == 0) // initialize
 						priv->stats.rx_evm_percentage[nspatial_stream] = pprevious_stats->RxMIMOSignalQuality[nspatial_stream];
-					}
 					priv->stats.rx_evm_percentage[nspatial_stream] =
-						( (priv->stats.rx_evm_percentage[nspatial_stream]* (Rx_Smooth_Factor-1)) +
-						(pprevious_stats->RxMIMOSignalQuality[nspatial_stream]* 1)) / (Rx_Smooth_Factor);
+						((priv->stats.rx_evm_percentage[nspatial_stream]* (Rx_Smooth_Factor-1)) +
+						 (pprevious_stats->RxMIMOSignalQuality[nspatial_stream]* 1)) / (Rx_Smooth_Factor);
 				}
 			}
 		}
@@ -4725,126 +4177,104 @@
  *	05/26/2008	amy		Create Version 0 porting from windows code.
  *
  *---------------------------------------------------------------------------*/
-static u8 rtl819x_query_rxpwrpercentage(
-	char		antpower
-	)
+static u8 rtl819x_query_rxpwrpercentage(char antpower)
 {
 	if ((antpower <= -100) || (antpower >= 20))
-	{
 		return	0;
-	}
 	else if (antpower >= 0)
-	{
 		return	100;
-	}
 	else
-	{
-		return	(100+antpower);
-	}
+		return	100 + antpower;
 
 }	/* QueryRxPwrPercentage */
 
-static u8
-rtl819x_evm_dbtopercentage(
-    char value
-    )
+static u8 rtl819x_evm_dbtopercentage(char value)
 {
-    char ret_val;
+	char ret_val;
 
-    ret_val = value;
+	ret_val = value;
 
-    if(ret_val >= 0)
-	ret_val = 0;
-    if(ret_val <= -33)
-	ret_val = -33;
-    ret_val = 0 - ret_val;
-    ret_val*=3;
-	if(ret_val == 99)
+	if (ret_val >= 0)
+		ret_val = 0;
+	if (ret_val <= -33)
+		ret_val = -33;
+	ret_val = 0 - ret_val;
+	ret_val *= 3;
+	if (ret_val == 99)
 		ret_val = 100;
-    return(ret_val);
+	return ret_val;
 }
 //
 //	Description:
 //	We want good-looking for signal strength/quality
 //	2007/7/19 01:09, by cosa.
 //
-long
-rtl819x_signal_scale_mapping(
-	long currsig
-	)
+long rtl819x_signal_scale_mapping(long currsig)
 {
 	long retsig;
 
 	// Step 1. Scale mapping.
-	if(currsig >= 61 && currsig <= 100)
-	{
+	if (currsig >= 61 && currsig <= 100)
 		retsig = 90 + ((currsig - 60) / 4);
-	}
-	else if(currsig >= 41 && currsig <= 60)
-	{
+	else if (currsig >= 41 && currsig <= 60)
 		retsig = 78 + ((currsig - 40) / 2);
-	}
-	else if(currsig >= 31 && currsig <= 40)
-	{
+	else if (currsig >= 31 && currsig <= 40)
 		retsig = 66 + (currsig - 30);
-	}
-	else if(currsig >= 21 && currsig <= 30)
-	{
+	else if (currsig >= 21 && currsig <= 30)
 		retsig = 54 + (currsig - 20);
-	}
-	else if(currsig >= 5 && currsig <= 20)
-	{
+	else if (currsig >= 5 && currsig <= 20)
 		retsig = 42 + (((currsig - 5) * 2) / 3);
-	}
-	else if(currsig == 4)
-	{
+	else if (currsig == 4)
 		retsig = 36;
-	}
-	else if(currsig == 3)
-	{
+	else if (currsig == 3)
 		retsig = 27;
-	}
-	else if(currsig == 2)
-	{
+	else if (currsig == 2)
 		retsig = 18;
-	}
-	else if(currsig == 1)
-	{
+	else if (currsig == 1)
 		retsig = 9;
-	}
 	else
-	{
 		retsig = currsig;
-	}
 
 	return retsig;
 }
 
-static void rtl8192_query_rxphystatus(
-	struct r8192_priv * priv,
-	struct ieee80211_rx_stats * pstats,
-	rx_drvinfo_819x_usb  * pdrvinfo,
-	struct ieee80211_rx_stats * precord_stats,
-	bool bpacket_match_bssid,
-	bool bpacket_toself,
-	bool bPacketBeacon,
-	bool bToSelfBA
-	)
+static inline bool rx_hal_is_cck_rate(struct rx_drvinfo_819x_usb *pdrvinfo)
 {
-	//PRT_RFD_STATUS		pRtRfdStatus = &(pRfd->Status);
-	phy_sts_ofdm_819xusb_t*	pofdm_buf;
-	phy_sts_cck_819xusb_t	*	pcck_buf;
-	phy_ofdm_rx_status_rxsc_sgien_exintfflag* prxsc;
+	if (pdrvinfo->RxHT)
+		return false;
+
+	switch (pdrvinfo->RxRate) {
+	case DESC90_RATE1M:
+	case DESC90_RATE2M:
+	case DESC90_RATE5_5M:
+	case DESC90_RATE11M:
+		return true;
+	default:
+		return false;
+	}
+}
+
+static void rtl8192_query_rxphystatus(struct r8192_priv *priv,
+				      struct ieee80211_rx_stats *pstats,
+				      rx_drvinfo_819x_usb  *pdrvinfo,
+				      struct ieee80211_rx_stats *precord_stats,
+				      bool bpacket_match_bssid,
+				      bool bpacket_toself,
+				      bool bPacketBeacon,
+				      bool bToSelfBA)
+{
+	phy_sts_ofdm_819xusb_t *pofdm_buf;
+	phy_sts_cck_819xusb_t	*pcck_buf;
+	phy_ofdm_rx_status_rxsc_sgien_exintfflag *prxsc;
 	u8				*prxpkt;
 	u8				i, max_spatial_stream, tmp_rxsnr, tmp_rxevm, rxsc_sgien_exflg;
-	char				rx_pwr[4], rx_pwr_all=0;
-	//long				rx_avg_pwr = 0;
+	char				rx_pwr[4], rx_pwr_all = 0;
 	char				rx_snrX, rx_evmX;
 	u8				evm, pwdb_all;
-	u32				RSSI, total_rssi=0;//, total_evm=0;
-//	long				signal_strength_index = 0;
-	u8				is_cck_rate=0;
+	u32				RSSI, total_rssi = 0;
+	u8				is_cck_rate = 0;
 	u8				rf_rx_num = 0;
+	u8				sq;
 
 
 	priv->stats.numqry_phystatus++;
@@ -4855,11 +4285,11 @@
 	memset(precord_stats, 0, sizeof(struct ieee80211_rx_stats));
 	pstats->bPacketMatchBSSID = precord_stats->bPacketMatchBSSID = bpacket_match_bssid;
 	pstats->bPacketToSelf = precord_stats->bPacketToSelf = bpacket_toself;
-	pstats->bIsCCK = precord_stats->bIsCCK = is_cck_rate;//RX_HAL_IS_CCK_RATE(pDrvInfo);
+	pstats->bIsCCK = precord_stats->bIsCCK = is_cck_rate;
 	pstats->bPacketBeacon = precord_stats->bPacketBeacon = bPacketBeacon;
 	pstats->bToSelfBA = precord_stats->bToSelfBA = bToSelfBA;
 
-	prxpkt = (u8*)pdrvinfo;
+	prxpkt = (u8 *)pdrvinfo;
 
 	/* Move pointer to the 16th bytes. Phy status start address. */
 	prxpkt += sizeof(rx_drvinfo_819x_usb);
@@ -4873,8 +4303,7 @@
 	precord_stats->RxMIMOSignalQuality[0] = -1;
 	precord_stats->RxMIMOSignalQuality[1] = -1;
 
-	if(is_cck_rate)
-	{
+	if (is_cck_rate) {
 		//
 		// (1)Hardware does not provide RSSI for CCK
 		//
@@ -4882,51 +4311,46 @@
 		//
 		// (2)PWDB, Average PWDB cacluated by hardware (for rate adaptive)
 		//
-		u8 report;//, cck_agc_rpt;
+		u8 report;
 
 		priv->stats.numqry_phystatusCCK++;
 
-		if(!priv->bCckHighPower)
-		{
+		if (!priv->bCckHighPower) {
 			report = pcck_buf->cck_agc_rpt & 0xc0;
 			report = report>>6;
-			switch(report)
-			{
+			switch (report) {
 				//Fixed by Jacken from Bryant 2008-03-20
 				//Original value is -38 , -26 , -14 , -2
 				//Fixed value is -35 , -23 , -11 , 6
-				case 0x3:
-					rx_pwr_all = -35 - (pcck_buf->cck_agc_rpt & 0x3e);
-					break;
-				case 0x2:
-					rx_pwr_all = -23 - (pcck_buf->cck_agc_rpt & 0x3e);
-					break;
-				case 0x1:
-					rx_pwr_all = -11 - (pcck_buf->cck_agc_rpt & 0x3e);
-					break;
-				case 0x0:
-					rx_pwr_all = 6 - (pcck_buf->cck_agc_rpt & 0x3e);
-					break;
+			case 0x3:
+				rx_pwr_all = -35 - (pcck_buf->cck_agc_rpt & 0x3e);
+				break;
+			case 0x2:
+				rx_pwr_all = -23 - (pcck_buf->cck_agc_rpt & 0x3e);
+				break;
+			case 0x1:
+				rx_pwr_all = -11 - (pcck_buf->cck_agc_rpt & 0x3e);
+				break;
+			case 0x0:
+				rx_pwr_all = 6 - (pcck_buf->cck_agc_rpt & 0x3e);
+				break;
 			}
-		}
-		else
-		{
+		} else {
 			report = pcck_buf->cck_agc_rpt & 0x60;
 			report = report>>5;
-			switch(report)
-			{
-				case 0x3:
-					rx_pwr_all = -35 - ((pcck_buf->cck_agc_rpt & 0x1f)<<1) ;
-					break;
-				case 0x2:
-					rx_pwr_all = -23 - ((pcck_buf->cck_agc_rpt & 0x1f)<<1);
-					break;
-				case 0x1:
-					rx_pwr_all = -11 - ((pcck_buf->cck_agc_rpt & 0x1f)<<1) ;
-					break;
-				case 0x0:
-					rx_pwr_all = 6 - ((pcck_buf->cck_agc_rpt & 0x1f)<<1) ;
-					break;
+			switch (report) {
+			case 0x3:
+				rx_pwr_all = -35 - ((pcck_buf->cck_agc_rpt & 0x1f)<<1);
+				break;
+			case 0x2:
+				rx_pwr_all = -23 - ((pcck_buf->cck_agc_rpt & 0x1f)<<1);
+				break;
+			case 0x1:
+				rx_pwr_all = -11 - ((pcck_buf->cck_agc_rpt & 0x1f)<<1);
+				break;
+			case 0x0:
+				rx_pwr_all = 6 - ((pcck_buf->cck_agc_rpt & 0x1f)<<1);
+				break;
 			}
 		}
 
@@ -4937,44 +4361,36 @@
 		//
 		// (3) Get Signal Quality (EVM)
 		//
-		//if(bpacket_match_bssid)
-		{
-			u8	sq;
 
-			if(pstats->RxPWDBAll > 40)
-			{
+		if (pstats->RxPWDBAll > 40) {
+			sq = 100;
+		} else {
+			sq = pcck_buf->sq_rpt;
+
+			if (pcck_buf->sq_rpt > 64)
+				sq = 0;
+			else if (pcck_buf->sq_rpt < 20)
 				sq = 100;
-			}else
-			{
-				sq = pcck_buf->sq_rpt;
-
-				if(pcck_buf->sq_rpt > 64)
-					sq = 0;
-				else if (pcck_buf->sq_rpt < 20)
-					sq = 100;
-				else
-					sq = ((64-sq) * 100) / 44;
-			}
-			pstats->SignalQuality = precord_stats->SignalQuality = sq;
-			pstats->RxMIMOSignalQuality[0] = precord_stats->RxMIMOSignalQuality[0] = sq;
-			pstats->RxMIMOSignalQuality[1] = precord_stats->RxMIMOSignalQuality[1] = -1;
+			else
+				sq = ((64-sq) * 100) / 44;
 		}
-	}
-	else
-	{
+		pstats->SignalQuality = precord_stats->SignalQuality = sq;
+		pstats->RxMIMOSignalQuality[0] = precord_stats->RxMIMOSignalQuality[0] = sq;
+		pstats->RxMIMOSignalQuality[1] = precord_stats->RxMIMOSignalQuality[1] = -1;
+
+	} else {
 		priv->stats.numqry_phystatusHT++;
 		//
 		// (1)Get RSSI for HT rate
 		//
-		for(i=RF90_PATH_A; i<priv->NumTotalRFPath; i++)
-		{
+		for (i = RF90_PATH_A; i < priv->NumTotalRFPath; i++) {
 			// 2008/01/30 MH we will judge RF RX path now.
 			if (priv->brfpath_rxenable[i])
 				rf_rx_num++;
 			else
 				continue;
 
-		if (!rtl8192_phy_CheckIsLegalRFPath(priv->ieee80211->dev, i))
+			if (!rtl8192_phy_CheckIsLegalRFPath(priv->ieee80211->dev, i))
 				continue;
 
 			//Fixed by Jacken from Bryant 2008-03-20
@@ -4984,7 +4400,6 @@
 			//Get Rx snr value in DB
 			tmp_rxsnr =	pofdm_buf->rxsnr_X[i];
 			rx_snrX = (char)(tmp_rxsnr);
-			//rx_snrX >>= 1;
 			rx_snrX /= 2;
 			priv->stats.rxSNRdB[i] = (long)rx_snrX;
 
@@ -4993,11 +4408,8 @@
 			total_rssi += RSSI;
 
 			/* Record Signal Strength for next packet */
-			//if(bpacket_match_bssid)
-			{
-				pstats->RxMIMOSignalStrength[i] =(u8) RSSI;
-				precord_stats->RxMIMOSignalStrength[i] =(u8) RSSI;
-			}
+			pstats->RxMIMOSignalStrength[i] = (u8) RSSI;
+			precord_stats->RxMIMOSignalStrength[i] = (u8) RSSI;
 		}
 
 
@@ -5006,7 +4418,7 @@
 		//
 		//Fixed by Jacken from Bryant 2008-03-20
 		//Original value is 106
-		rx_pwr_all = (((pofdm_buf->pwdb_all ) >> 1 )& 0x7f) -106;
+		rx_pwr_all = (((pofdm_buf->pwdb_all) >> 1)& 0x7f) -106;
 		pwdb_all = rtl819x_query_rxpwrpercentage(rx_pwr_all);
 
 		pstats->RxPWDBAll = precord_stats->RxPWDBAll = pwdb_all;
@@ -5015,14 +4427,13 @@
 		//
 		// (3)EVM of HT rate
 		//
-		if(pdrvinfo->RxHT && pdrvinfo->RxRate>=DESC90_RATEMCS8 &&
-			pdrvinfo->RxRate<=DESC90_RATEMCS15)
+		if (pdrvinfo->RxHT && pdrvinfo->RxRate >= DESC90_RATEMCS8 &&
+		    pdrvinfo->RxRate <= DESC90_RATEMCS15)
 			max_spatial_stream = 2; //both spatial stream make sense
 		else
 			max_spatial_stream = 1; //only spatial stream 1 makes sense
 
-		for(i=0; i<max_spatial_stream; i++)
-		{
+		for (i = 0; i < max_spatial_stream; i++) {
 			tmp_rxevm =	pofdm_buf->rxevm_X[i];
 			rx_evmX = (char)(tmp_rxevm);
 
@@ -5032,19 +4443,16 @@
 			rx_evmX /= 2;	//dbm
 
 			evm = rtl819x_evm_dbtopercentage(rx_evmX);
-			//if(bpacket_match_bssid)
-			{
-				if(i==0) // Fill value in RFD, Get the first spatial stream only
-					pstats->SignalQuality = precord_stats->SignalQuality = (u8)(evm & 0xff);
-				pstats->RxMIMOSignalQuality[i] = precord_stats->RxMIMOSignalQuality[i] = (u8)(evm & 0xff);
-			}
+			if (i == 0) // Fill value in RFD, Get the first spatial stream only
+				pstats->SignalQuality = precord_stats->SignalQuality = (u8)(evm & 0xff);
+			pstats->RxMIMOSignalQuality[i] = precord_stats->RxMIMOSignalQuality[i] = (u8)(evm & 0xff);
 		}
 
 
 		/* record rx statistics for debug */
 		rxsc_sgien_exflg = pofdm_buf->rxsc_sgien_exflg;
 		prxsc =	(phy_ofdm_rx_status_rxsc_sgien_exintfflag *)&rxsc_sgien_exflg;
-		if(pdrvinfo->BW)	//40M channel
+		if (pdrvinfo->BW)	//40M channel
 			priv->stats.received_bwtype[1+prxsc->rxsc]++;
 		else				//20M channel
 			priv->stats.received_bwtype[0]++;
@@ -5052,25 +4460,17 @@
 
 	//UI BSS List signal strength(in percentage), make it good looking, from 0~100.
 	//It is assigned to the BSS List in GetValueFromBeaconOrProbeRsp().
-	if(is_cck_rate)
-	{
-		pstats->SignalStrength = precord_stats->SignalStrength = (u8)(rtl819x_signal_scale_mapping((long)pwdb_all));//PWDB_ALL;
-
-	}
-	else
-	{
-		//pRfd->Status.SignalStrength = pRecordRfd->Status.SignalStrength = (u8)(SignalScaleMapping(total_rssi/=RF90_PATH_MAX));//(u8)(total_rssi/=RF90_PATH_MAX);
+	if (is_cck_rate) {
+		pstats->SignalStrength = precord_stats->SignalStrength = (u8)(rtl819x_signal_scale_mapping((long)pwdb_all));
+	} else {
 		// We can judge RX path number now.
 		if (rf_rx_num != 0)
-			pstats->SignalStrength = precord_stats->SignalStrength = (u8)(rtl819x_signal_scale_mapping((long)(total_rssi/=rf_rx_num)));
+			pstats->SignalStrength = precord_stats->SignalStrength = (u8)(rtl819x_signal_scale_mapping((long)(total_rssi /= rf_rx_num)));
 	}
 }	/* QueryRxPhyStatus8190Pci */
 
-void
-rtl8192_record_rxdesc_forlateruse(
-	struct ieee80211_rx_stats *	psrc_stats,
-	struct ieee80211_rx_stats *	ptarget_stats
-)
+void rtl8192_record_rxdesc_forlateruse(struct ieee80211_rx_stats *psrc_stats,
+				       struct ieee80211_rx_stats *ptarget_stats)
 {
 	ptarget_stats->bIsAMPDU = psrc_stats->bIsAMPDU;
 	ptarget_stats->bFirstMPDU = psrc_stats->bFirstMPDU;
@@ -5079,27 +4479,26 @@
 
 
 void TranslateRxSignalStuff819xUsb(struct sk_buff *skb,
-				   struct ieee80211_rx_stats * pstats,
+				   struct ieee80211_rx_stats *pstats,
 				   rx_drvinfo_819x_usb  *pdrvinfo)
 {
 	// TODO: We must only check packet for current MAC address. Not finish
 	rtl8192_rx_info *info = (struct rtl8192_rx_info *)skb->cb;
-	struct net_device *dev=info->dev;
+	struct net_device *dev = info->dev;
 	struct r8192_priv *priv = (struct r8192_priv *)ieee80211_priv(dev);
 	bool bpacket_match_bssid, bpacket_toself;
-	bool bPacketBeacon=FALSE, bToSelfBA=FALSE;
+	bool bPacketBeacon = FALSE, bToSelfBA = FALSE;
 	static struct ieee80211_rx_stats  previous_stats;
 	struct ieee80211_hdr_3addr *hdr;//by amy
-       u16 fc,type;
+	u16 fc, type;
 
 	// Get Signal Quality for only RX data queue (but not command queue)
 
-	u8* tmp_buf;
-	//u16 tmp_buf_len = 0;
+	u8 *tmp_buf;
 	u8  *praddr;
 
 	/* Get MAC frame start address. */
-	tmp_buf = (u8*)skb->data;// + get_rxpacket_shiftbytes_819xusb(pstats);
+	tmp_buf = (u8 *)skb->data;
 
 	hdr = (struct ieee80211_hdr_3addr *)tmp_buf;
 	fc = le16_to_cpu(hdr->frame_ctl);
@@ -5108,38 +4507,30 @@
 
 	/* Check if the received packet is acceptable. */
 	bpacket_match_bssid = ((IEEE80211_FTYPE_CTL != type) &&
-							(eqMacAddr(priv->ieee80211->current_network.bssid,  (fc & IEEE80211_FCTL_TODS)? hdr->addr1 : (fc & IEEE80211_FCTL_FROMDS )? hdr->addr2 : hdr->addr3))
-								 && (!pstats->bHwError) && (!pstats->bCRC)&& (!pstats->bICV));
+			       (eqMacAddr(priv->ieee80211->current_network.bssid,  (fc & IEEE80211_FCTL_TODS) ? hdr->addr1 : (fc & IEEE80211_FCTL_FROMDS) ? hdr->addr2 : hdr->addr3))
+			       && (!pstats->bHwError) && (!pstats->bCRC) && (!pstats->bICV));
 	bpacket_toself =  bpacket_match_bssid & (eqMacAddr(praddr, priv->ieee80211->dev->dev_addr));
 
-		if(WLAN_FC_GET_FRAMETYPE(fc)== IEEE80211_STYPE_BEACON)
-		{
-			bPacketBeacon = true;
-			//DbgPrint("Beacon 2, MatchBSSID = %d, ToSelf = %d \n", bPacketMatchBSSID, bPacketToSelf);
-		}
-		if(WLAN_FC_GET_FRAMETYPE(fc) == IEEE80211_STYPE_BLOCKACK)
-		{
-			if((eqMacAddr(praddr,dev->dev_addr)))
-				bToSelfBA = true;
-				//DbgPrint("BlockAck, MatchBSSID = %d, ToSelf = %d \n", bPacketMatchBSSID, bPacketToSelf);
-		}
+	if (WLAN_FC_GET_FRAMETYPE(fc) == IEEE80211_STYPE_BEACON)
+		bPacketBeacon = true;
+	if (WLAN_FC_GET_FRAMETYPE(fc) == IEEE80211_STYPE_BLOCKACK) {
+		if ((eqMacAddr(praddr, dev->dev_addr)))
+			bToSelfBA = true;
+	}
 
 
 
-	if(bpacket_match_bssid)
-	{
+	if (bpacket_match_bssid)
 		priv->stats.numpacket_matchbssid++;
-	}
-	if(bpacket_toself){
+	if (bpacket_toself)
 		priv->stats.numpacket_toself++;
-	}
 	//
 	// Process PHY information for previous packet (RSSI/PWDB/EVM)
 	//
 	// Because phy information is contained in the last packet of AMPDU only, so driver
 	// should process phy information of previous packet
 	rtl8192_process_phyinfo(priv, tmp_buf, &previous_stats, pstats);
-	rtl8192_query_rxphystatus(priv, pstats, pdrvinfo, &previous_stats, bpacket_match_bssid,bpacket_toself,bPacketBeacon,bToSelfBA);
+	rtl8192_query_rxphystatus(priv, pstats, pdrvinfo, &previous_stats, bpacket_match_bssid, bpacket_toself, bPacketBeacon, bToSelfBA);
 	rtl8192_record_rxdesc_forlateruse(pstats, &previous_stats);
 
 }
@@ -5158,91 +4549,85 @@
 * Return:
 *		None
 */
-void
-UpdateReceivedRateHistogramStatistics8190(
-	struct net_device *dev,
-	struct ieee80211_rx_stats *stats
-	)
+void UpdateReceivedRateHistogramStatistics8190(struct net_device *dev,
+					       struct ieee80211_rx_stats *stats)
 {
 	struct r8192_priv *priv = (struct r8192_priv *)ieee80211_priv(dev);
-	u32 rcvType=1;   //0: Total, 1:OK, 2:CRC, 3:ICV
+	u32 rcvType = 1;   //0: Total, 1:OK, 2:CRC, 3:ICV
 	u32 rateIndex;
 	u32 preamble_guardinterval;  //1: short preamble/GI, 0: long preamble/GI
 
 
-	if(stats->bCRC)
-	rcvType = 2;
-	else if(stats->bICV)
-	rcvType = 3;
+	if (stats->bCRC)
+		rcvType = 2;
+	else if (stats->bICV)
+		rcvType = 3;
 
-	if(stats->bShortPreamble)
-	preamble_guardinterval = 1;// short
+	if (stats->bShortPreamble)
+		preamble_guardinterval = 1;// short
 	else
-	preamble_guardinterval = 0;// long
+		preamble_guardinterval = 0;// long
 
-	switch(stats->rate)
-	{
+	switch (stats->rate) {
 		//
 		// CCK rate
 		//
-		case MGN_1M:    rateIndex = 0;  break;
-		case MGN_2M:    rateIndex = 1;  break;
-		case MGN_5_5M:  rateIndex = 2;  break;
-		case MGN_11M:   rateIndex = 3;  break;
+	case MGN_1M:    rateIndex = 0;  break;
+	case MGN_2M:    rateIndex = 1;  break;
+	case MGN_5_5M:  rateIndex = 2;  break;
+	case MGN_11M:   rateIndex = 3;  break;
 		//
 		// Legacy OFDM rate
 		//
-		case MGN_6M:    rateIndex = 4;  break;
-		case MGN_9M:    rateIndex = 5;  break;
-		case MGN_12M:   rateIndex = 6;  break;
-		case MGN_18M:   rateIndex = 7;  break;
-		case MGN_24M:   rateIndex = 8;  break;
-		case MGN_36M:   rateIndex = 9;  break;
-		case MGN_48M:   rateIndex = 10; break;
-		case MGN_54M:   rateIndex = 11; break;
+	case MGN_6M:    rateIndex = 4;  break;
+	case MGN_9M:    rateIndex = 5;  break;
+	case MGN_12M:   rateIndex = 6;  break;
+	case MGN_18M:   rateIndex = 7;  break;
+	case MGN_24M:   rateIndex = 8;  break;
+	case MGN_36M:   rateIndex = 9;  break;
+	case MGN_48M:   rateIndex = 10; break;
+	case MGN_54M:   rateIndex = 11; break;
 		//
 		// 11n High throughput rate
 		//
-		case MGN_MCS0:  rateIndex = 12; break;
-		case MGN_MCS1:  rateIndex = 13; break;
-		case MGN_MCS2:  rateIndex = 14; break;
-		case MGN_MCS3:  rateIndex = 15; break;
-		case MGN_MCS4:  rateIndex = 16; break;
-		case MGN_MCS5:  rateIndex = 17; break;
-		case MGN_MCS6:  rateIndex = 18; break;
-		case MGN_MCS7:  rateIndex = 19; break;
-		case MGN_MCS8:  rateIndex = 20; break;
-		case MGN_MCS9:  rateIndex = 21; break;
-		case MGN_MCS10: rateIndex = 22; break;
-		case MGN_MCS11: rateIndex = 23; break;
-		case MGN_MCS12: rateIndex = 24; break;
-		case MGN_MCS13: rateIndex = 25; break;
-		case MGN_MCS14: rateIndex = 26; break;
-		case MGN_MCS15: rateIndex = 27; break;
-		default:        rateIndex = 28; break;
+	case MGN_MCS0:  rateIndex = 12; break;
+	case MGN_MCS1:  rateIndex = 13; break;
+	case MGN_MCS2:  rateIndex = 14; break;
+	case MGN_MCS3:  rateIndex = 15; break;
+	case MGN_MCS4:  rateIndex = 16; break;
+	case MGN_MCS5:  rateIndex = 17; break;
+	case MGN_MCS6:  rateIndex = 18; break;
+	case MGN_MCS7:  rateIndex = 19; break;
+	case MGN_MCS8:  rateIndex = 20; break;
+	case MGN_MCS9:  rateIndex = 21; break;
+	case MGN_MCS10: rateIndex = 22; break;
+	case MGN_MCS11: rateIndex = 23; break;
+	case MGN_MCS12: rateIndex = 24; break;
+	case MGN_MCS13: rateIndex = 25; break;
+	case MGN_MCS14: rateIndex = 26; break;
+	case MGN_MCS15: rateIndex = 27; break;
+	default:        rateIndex = 28; break;
 	}
-    priv->stats.received_preamble_GI[preamble_guardinterval][rateIndex]++;
-    priv->stats.received_rate_histogram[0][rateIndex]++; //total
-    priv->stats.received_rate_histogram[rcvType][rateIndex]++;
+	priv->stats.received_preamble_GI[preamble_guardinterval][rateIndex]++;
+	priv->stats.received_rate_histogram[0][rateIndex]++; //total
+	priv->stats.received_rate_histogram[rcvType][rateIndex]++;
 }
 
 
 void query_rxdesc_status(struct sk_buff *skb, struct ieee80211_rx_stats *stats, bool bIsRxAggrSubframe)
 {
 	rtl8192_rx_info *info = (struct rtl8192_rx_info *)skb->cb;
-	struct net_device *dev=info->dev;
+	struct net_device *dev = info->dev;
 	struct r8192_priv *priv = (struct r8192_priv *)ieee80211_priv(dev);
-	//rx_desc_819x_usb *desc = (rx_desc_819x_usb *)skb->data;
 	rx_drvinfo_819x_usb  *driver_info = NULL;
 
 	//
 	//Get Rx Descriptor Information
 	//
 #ifdef USB_RX_AGGREGATION_SUPPORT
-	if (bIsRxAggrSubframe)
-	{
+	if (bIsRxAggrSubframe) {
 		rx_desc_819x_usb_aggr_subframe *desc = (rx_desc_819x_usb_aggr_subframe *)skb->data;
-		stats->Length = desc->Length ;
+		stats->Length = desc->Length;
 		stats->RxDrvInfoSize = desc->RxDrvInfoSize;
 		stats->RxBufShift = 0; //RxBufShift = 2 in RxDesc, but usb didn't shift bytes in fact.
 		stats->bICV = desc->ICV;
@@ -5256,7 +4641,7 @@
 
 		stats->Length = desc->Length;
 		stats->RxDrvInfoSize = desc->RxDrvInfoSize;
-		stats->RxBufShift = 0;//desc->Shift&0x03;
+		stats->RxBufShift = 0;
 		stats->bICV = desc->ICV;
 		stats->bCRC = desc->CRC32;
 		stats->bHwError = stats->bCRC|stats->bICV;
@@ -5264,16 +4649,12 @@
 		stats->Decrypted = !desc->SWDec;
 	}
 
-	if((priv->ieee80211->pHTInfo->bCurrentHTSupport == true) && (priv->ieee80211->pairwise_key_type == KEY_TYPE_CCMP))
-	{
+	if ((priv->ieee80211->pHTInfo->bCurrentHTSupport == true) && (priv->ieee80211->pairwise_key_type == KEY_TYPE_CCMP))
 		stats->bHwError = false;
-	}
 	else
-	{
 		stats->bHwError = stats->bCRC|stats->bICV;
-	}
 
-	if(stats->Length < 24 || stats->Length > MAX_8192U_RX_SIZE)
+	if (stats->Length < 24 || stats->Length > MAX_8192U_RX_SIZE)
 		stats->bHwError |= 1;
 	//
 	//Get Driver Info
@@ -5281,71 +4662,66 @@
 	// TODO: Need to verify it on FGPA platform
 	//Driver info are written to the RxBuffer following rx desc
 	if (stats->RxDrvInfoSize != 0) {
-		driver_info = (rx_drvinfo_819x_usb *)(skb->data + sizeof(rx_desc_819x_usb) + \
-				stats->RxBufShift);
+		driver_info = (rx_drvinfo_819x_usb *)(skb->data + sizeof(rx_desc_819x_usb) +
+						      stats->RxBufShift);
 		/* unit: 0.5M */
 		/* TODO */
-		if(!stats->bHwError){
+		if (!stats->bHwError) {
 			u8	ret_rate;
 			ret_rate = HwRateToMRate90(driver_info->RxHT, driver_info->RxRate);
-			if(ret_rate == 0xff)
-			{
+			if (ret_rate == 0xff) {
 				// Abnormal Case: Receive CRC OK packet with Rx descriptor indicating non supported rate.
 				// Special Error Handling here, 2008.05.16, by Emily
 
 				stats->bHwError = 1;
 				stats->rate = MGN_1M;	//Set 1M rate by default
-			}else
-			{
+			} else {
 				stats->rate = ret_rate;
 			}
-		}
-		else
+		} else {
 			stats->rate = 0x02;
+		}
 
 		stats->bShortPreamble = driver_info->SPLCP;
 
 
 		UpdateReceivedRateHistogramStatistics8190(dev, stats);
 
-		stats->bIsAMPDU = (driver_info->PartAggr==1);
-		stats->bFirstMPDU = (driver_info->PartAggr==1) && (driver_info->FirstAGGR==1);
+		stats->bIsAMPDU = (driver_info->PartAggr == 1);
+		stats->bFirstMPDU = (driver_info->PartAggr == 1) && (driver_info->FirstAGGR == 1);
 		stats->TimeStampLow = driver_info->TSFL;
 		// xiong mask it, 070514
-		//pRfd->Status.TimeStampHigh = PlatformEFIORead4Byte(Adapter, TSFR+4);
-		// stats->TimeStampHigh = read_nic_dword(dev,  TSFR+4);
 
 		UpdateRxPktTimeStamp8190(dev, stats);
 
 		//
 		// Rx A-MPDU
 		//
-		if(driver_info->FirstAGGR==1 || driver_info->PartAggr == 1)
+		if (driver_info->FirstAGGR == 1 || driver_info->PartAggr == 1)
 			RT_TRACE(COMP_RXDESC, "driver_info->FirstAGGR = %d, driver_info->PartAggr = %d\n",
-					driver_info->FirstAGGR, driver_info->PartAggr);
+				 driver_info->FirstAGGR, driver_info->PartAggr);
 
 	}
 
-	skb_pull(skb,sizeof(rx_desc_819x_usb));
+	skb_pull(skb, sizeof(rx_desc_819x_usb));
 	//
 	// Get Total offset of MPDU Frame Body
 	//
-	if((stats->RxBufShift + stats->RxDrvInfoSize) > 0) {
+	if ((stats->RxBufShift + stats->RxDrvInfoSize) > 0) {
 		stats->bShift = 1;
-		skb_pull(skb,stats->RxBufShift + stats->RxDrvInfoSize);
+		skb_pull(skb, stats->RxBufShift + stats->RxDrvInfoSize);
 	}
 
 #ifdef USB_RX_AGGREGATION_SUPPORT
 	/* for the rx aggregated sub frame, the redundant space truly contained in the packet */
-	if(bIsRxAggrSubframe) {
+	if (bIsRxAggrSubframe)
 		skb_pull(skb, 8);
-	}
 #endif
 	/* for debug 2008.5.29 */
 
 	//added by vivi, for MP, 20080108
 	stats->RxIs40MHzPacket = driver_info->BW;
-	if(stats->RxDrvInfoSize != 0)
+	if (stats->RxDrvInfoSize != 0)
 		TranslateRxSignalStuff819xUsb(skb, stats, driver_info);
 
 }
@@ -5359,19 +4735,18 @@
 	else
 #endif
 		return (sizeof(rx_desc_819x_usb) + Status->RxDrvInfoSize
-				+ Status->RxBufShift);
+			+ Status->RxBufShift);
 }
 
-void rtl8192_rx_nomal(struct sk_buff* skb)
+void rtl8192_rx_nomal(struct sk_buff *skb)
 {
 	rtl8192_rx_info *info = (struct rtl8192_rx_info *)skb->cb;
-	struct net_device *dev=info->dev;
+	struct net_device *dev = info->dev;
 	struct r8192_priv *priv = (struct r8192_priv *)ieee80211_priv(dev);
 	struct ieee80211_rx_stats stats = {
 		.signal = 0,
 		.noise = -98,
 		.rate = 0,
-		//      .mac_time = jiffies,
 		.freq = IEEE80211_24GHZ_BAND,
 	};
 	u32 rx_pkt_len = 0;
@@ -5393,7 +4768,7 @@
 #endif
 
 	/* 20 is for ps-poll */
-	if((skb->len >=(20 + sizeof(rx_desc_819x_usb))) && (skb->len < RX_URB_SIZE)) {
+	if ((skb->len >= (20 + sizeof(rx_desc_819x_usb))) && (skb->len < RX_URB_SIZE)) {
 #ifdef USB_RX_AGGREGATION_SUPPORT
 		TempByte = *(skb->data + sizeof(rx_desc_819x_usb));
 #endif
@@ -5404,14 +4779,12 @@
 #ifdef USB_RX_AGGREGATION_SUPPORT
 		if (TempByte & BIT0) {
 			agg_skb = skb;
-			//TotalLength = agg_skb->len - 4; /*sCrcLng*/
 			TotalLength = stats.Length - 4; /*sCrcLng*/
-			//RT_TRACE(COMP_RECV, "%s:first aggregated packet!Length=%d\n",__FUNCTION__,TotalLength);
 			/* though the head pointer has passed this position  */
 			TempDWord = *(u32 *)(agg_skb->data - 4);
 			PacketLength = (u16)(TempDWord & 0x3FFF); /*sCrcLng*/
 			skb = dev_alloc_skb(PacketLength);
-			memcpy(skb_put(skb,PacketLength),agg_skb->data,PacketLength);
+			memcpy(skb_put(skb, PacketLength), agg_skb->data, PacketLength);
 			PacketShiftBytes = GetRxPacketShiftBytes819xUsb(&stats, false);
 		}
 #endif
@@ -5421,26 +4794,24 @@
 		rx_pkt_len = skb->len;
 		ieee80211_hdr = (struct ieee80211_hdr_1addr *)skb->data;
 		unicast_packet = false;
-		if(is_broadcast_ether_addr(ieee80211_hdr->addr1)) {
+		if (is_broadcast_ether_addr(ieee80211_hdr->addr1)) {
 			//TODO
-		}else if(is_multicast_ether_addr(ieee80211_hdr->addr1)){
+		} else if (is_multicast_ether_addr(ieee80211_hdr->addr1)) {
 			//TODO
-		}else {
+		} else {
 			/* unicast packet */
 			unicast_packet = true;
 		}
 
-		if(!ieee80211_rx(priv->ieee80211,skb, &stats)) {
+		if (!ieee80211_rx(priv->ieee80211, skb, &stats)) {
 			dev_kfree_skb_any(skb);
 		} else {
 			priv->stats.rxoktotal++;
-			if(unicast_packet) {
+			if (unicast_packet)
 				priv->stats.rxbytesunicast += rx_pkt_len;
-			}
 		}
 #ifdef USB_RX_AGGREGATION_SUPPORT
 		testing = 1;
-		// (PipeIndex == 0) && (TempByte & BIT0) => TotalLength > 0.
 		if (TotalLength > 0) {
 			PacketOccupiedLendth = PacketLength + (PacketShiftBytes + 8);
 			if ((PacketOccupiedLendth & 0xFF) != 0)
@@ -5452,9 +4823,8 @@
 			else
 				agg_skb->len = 0;
 
-			while (agg_skb->len>=GetRxPacketShiftBytes819xUsb(&stats, true)) {
+			while (agg_skb->len >= GetRxPacketShiftBytes819xUsb(&stats, true)) {
 				u8 tmpCRC = 0, tmpICV = 0;
-				//RT_TRACE(COMP_RECV,"%s:aggred pkt,total_len = %d\n",__FUNCTION__,agg_skb->len);
 				RxDescr = (rx_desc_819x_usb_aggr_subframe *)(agg_skb->data);
 				tmpCRC = RxDescr->CRC32;
 				tmpICV = RxDescr->ICV;
@@ -5470,32 +4840,30 @@
 				query_rxdesc_status(agg_skb, &stats, true);
 				PacketLength = stats.Length;
 
-				if(PacketLength > agg_skb->len) {
+				if (PacketLength > agg_skb->len)
 					break;
-				}
 				/* Process the MPDU received */
 				skb = dev_alloc_skb(PacketLength);
-				memcpy(skb_put(skb,PacketLength),agg_skb->data, PacketLength);
+				memcpy(skb_put(skb, PacketLength), agg_skb->data, PacketLength);
 				skb_trim(skb, skb->len - 4/*sCrcLng*/);
 
 				rx_pkt_len = skb->len;
 				ieee80211_hdr = (struct ieee80211_hdr_1addr *)skb->data;
 				unicast_packet = false;
-				if(is_broadcast_ether_addr(ieee80211_hdr->addr1)) {
+				if (is_broadcast_ether_addr(ieee80211_hdr->addr1)) {
 					//TODO
-				}else if(is_multicast_ether_addr(ieee80211_hdr->addr1)){
+				} else if (is_multicast_ether_addr(ieee80211_hdr->addr1)) {
 					//TODO
-				}else {
+				} else {
 					/* unicast packet */
 					unicast_packet = true;
 				}
-				if(!ieee80211_rx(priv->ieee80211,skb, &stats)) {
+				if (!ieee80211_rx(priv->ieee80211, skb, &stats)) {
 					dev_kfree_skb_any(skb);
 				} else {
 					priv->stats.rxoktotal++;
-					if(unicast_packet) {
+					if (unicast_packet)
 						priv->stats.rxbytesunicast += rx_pkt_len;
-					}
 				}
 				/* should trim the packet which has been copied to target skb */
 				skb_pull(agg_skb, PacketLength);
@@ -5514,26 +4882,18 @@
 #endif
 	} else {
 		priv->stats.rxurberr++;
-		printk("actual_length:%d\n", skb->len);
+		netdev_dbg(dev, "actual_length: %d\n", skb->len);
 		dev_kfree_skb_any(skb);
 	}
 
 }
 
-void
-rtl819xusb_process_received_packet(
-	struct net_device *dev,
-	struct ieee80211_rx_stats *pstats
-	)
+void rtl819xusb_process_received_packet(struct net_device *dev,
+					struct ieee80211_rx_stats *pstats)
 {
-//	bool bfreerfd=false, bqueued=false;
-	u8*	frame;
-	u16     frame_len=0;
+	u8	*frame;
+	u16     frame_len = 0;
 	struct r8192_priv *priv = ieee80211_priv(dev);
-//	u8			index = 0;
-//	u8			TID = 0;
-	//u16			seqnum = 0;
-	//PRX_TS_RECORD	pts = NULL;
 
 	// Get shifted bytes of Starting address of 802.11 header. 2006.09.28, by Emily
 	//porting by amy 080508
@@ -5541,33 +4901,27 @@
 	frame = pstats->virtual_address;
 	frame_len = pstats->packetlength;
 #ifdef TODO	// by amy about HCT
-	if(!Adapter->bInHctTest)
+	if (!Adapter->bInHctTest)
 		CountRxErrStatistics(Adapter, pRfd);
 #endif
-	{
-	#ifdef ENABLE_PS  //by amy for adding ps function in future
-		RT_RF_POWER_STATE rtState;
-		// When RF is off, we should not count the packet for hw/sw synchronize
-		// reason, ie. there may be a duration while sw switch is changed and hw
-		// switch is being changed. 2006.12.04, by shien chang.
-		Adapter->HalFunc.GetHwRegHandler(Adapter, HW_VAR_RF_STATE, (u8* )(&rtState));
-		if (rtState == eRfOff)
-		{
-			return;
-		}
-	#endif
+#ifdef ENABLE_PS  //by amy for adding ps function in future
+	RT_RF_POWER_STATE rtState;
+	// When RF is off, we should not count the packet for hw/sw synchronize
+	// reason, ie. there may be a duration while sw switch is changed and hw
+	// switch is being changed. 2006.12.04, by shien chang.
+	Adapter->HalFunc.GetHwRegHandler(Adapter, HW_VAR_RF_STATE, (u8 *)(&rtState));
+	if (rtState == eRfOff)
+		return;
+#endif
 	priv->stats.rxframgment++;
 
-	}
 #ifdef TODO
 	RmMonitorSignalStrength(Adapter, pRfd);
 #endif
 	/* 2007/01/16 MH Add RX command packet handle here. */
 	/* 2007/03/01 MH We have to release RFD and return if rx pkt is cmd pkt. */
 	if (rtl819xusb_rx_command_packet(dev, pstats))
-	{
 		return;
-	}
 
 #ifdef SW_CRC_CHECK
 	SwCrcCheck();
@@ -5578,16 +4932,12 @@
 
 void query_rx_cmdpkt_desc_status(struct sk_buff *skb, struct ieee80211_rx_stats *stats)
 {
-//	rtl8192_rx_info *info = (struct rtl8192_rx_info *)skb->cb;
-//	struct net_device *dev=info->dev;
-//	struct r8192_priv *priv = (struct r8192_priv *)ieee80211_priv(dev);
 	rx_desc_819x_usb *desc = (rx_desc_819x_usb *)skb->data;
-//	rx_drvinfo_819x_usb  *driver_info;
 
 	//
 	//Get Rx Descriptor Information
 	//
-	stats->virtual_address = (u8*)skb->data;
+	stats->virtual_address = (u8 *)skb->data;
 	stats->Length = desc->Length;
 	stats->RxDrvInfoSize = 0;
 	stats->RxBufShift = 0;
@@ -5602,21 +4952,17 @@
 {
 	struct rtl8192_rx_info *info = (struct rtl8192_rx_info *)skb->cb;
 	struct net_device *dev = info->dev;
-	//int ret;
-//	struct urb *rx_urb = info->urb;
 	/* TODO */
 	struct ieee80211_rx_stats stats = {
 		.signal = 0,
 		.noise = -98,
 		.rate = 0,
-		//      .mac_time = jiffies,
 		.freq = IEEE80211_24GHZ_BAND,
 	};
 
-	if((skb->len >=(20 + sizeof(rx_desc_819x_usb))) && (skb->len < RX_URB_SIZE))
-	{
+	if ((skb->len >= (20 + sizeof(rx_desc_819x_usb))) && (skb->len < RX_URB_SIZE)) {
 
-		query_rx_cmdpkt_desc_status(skb,&stats);
+		query_rx_cmdpkt_desc_status(skb, &stats);
 		// this is to be done by amy 080508     prfd->queue_id = 1;
 
 
@@ -5624,7 +4970,7 @@
 		//  Process the command packet received.
 		//
 
-		rtl819xusb_process_received_packet(dev,&stats);
+		rtl819xusb_process_received_packet(dev, &stats);
 
 		dev_kfree_skb_any(skb);
 	}
@@ -5640,22 +4986,21 @@
 		switch (info->out_pipe) {
 		/* Nomal packet pipe */
 		case 3:
-			//RT_TRACE(COMP_RECV, "normal in-pipe index(%d)\n",info->out_pipe);
 			priv->IrpPendingCount--;
 			rtl8192_rx_nomal(skb);
 			break;
 
-			/* Command packet pipe */
+		/* Command packet pipe */
 		case 9:
-			RT_TRACE(COMP_RECV, "command in-pipe index(%d)\n",\
-					info->out_pipe);
+			RT_TRACE(COMP_RECV, "command in-pipe index(%d)\n",
+				 info->out_pipe);
 
 			rtl8192_rx_cmd(skb);
 			break;
 
 		default: /* should never get here! */
-			RT_TRACE(COMP_ERR, "Unknown in-pipe index(%d)\n",\
-					info->out_pipe);
+			RT_TRACE(COMP_ERR, "Unknown in-pipe index(%d)\n",
+				 info->out_pipe);
 			dev_kfree_skb(skb);
 			break;
 
@@ -5682,11 +5027,10 @@
 *****************************************************************************/
 
 static int rtl8192_usb_probe(struct usb_interface *intf,
-			 const struct usb_device_id *id)
+			     const struct usb_device_id *id)
 {
-//	unsigned long ioaddr = 0;
 	struct net_device *dev = NULL;
-	struct r8192_priv *priv= NULL;
+	struct r8192_priv *priv = NULL;
 	struct usb_device *udev = interface_to_usbdev(intf);
 	int ret;
 	RT_TRACE(COMP_INIT, "Oops: i'm coming\n");
@@ -5699,29 +5043,28 @@
 	SET_NETDEV_DEV(dev, &intf->dev);
 	priv = ieee80211_priv(dev);
 	priv->ieee80211 = netdev_priv(dev);
-	priv->udev=udev;
+	priv->udev = udev;
 
 	dev->netdev_ops = &rtl8192_netdev_ops;
 
-	 //DMESG("Oops: i'm coming\n");
 #if WIRELESS_EXT >= 12
 #if WIRELESS_EXT < 17
 	dev->get_wireless_stats = r8192_get_wireless_stats;
 #endif
 	dev->wireless_handlers = (struct iw_handler_def *) &r8192_wx_handlers_def;
 #endif
-	dev->type=ARPHRD_ETHER;
+	dev->type = ARPHRD_ETHER;
 
 	dev->watchdog_timeo = HZ*3;	//modified by john, 0805
 
-	if (dev_alloc_name(dev, ifname) < 0){
+	if (dev_alloc_name(dev, ifname) < 0) {
 		RT_TRACE(COMP_INIT, "Oops: devname already taken! Trying wlan%%d...\n");
 		ifname = "wlan%d";
 		dev_alloc_name(dev, ifname);
 	}
 
 	RT_TRACE(COMP_INIT, "Driver probe completed1\n");
-	if(rtl8192_init(dev)!=0){
+	if (rtl8192_init(dev) != 0) {
 		RT_TRACE(COMP_ERR, "Initialization failed");
 		ret = -ENODEV;
 		goto fail;
@@ -5733,7 +5076,7 @@
 	if (ret)
 		goto fail2;
 
-	RT_TRACE(COMP_INIT, "dev name=======> %s\n",dev->name);
+	RT_TRACE(COMP_INIT, "dev name=======> %s\n", dev->name);
 	rtl8192_proc_init_one(dev);
 
 
@@ -5755,16 +5098,13 @@
 }
 
 //detach all the work and timer structure declared or inititialize in r8192U_init function.
-void rtl8192_cancel_deferred_work(struct r8192_priv* priv)
+void rtl8192_cancel_deferred_work(struct r8192_priv *priv)
 {
 
 	cancel_work_sync(&priv->reset_wq);
 	cancel_delayed_work(&priv->watch_dog_wq);
 	cancel_delayed_work(&priv->update_beacon_wq);
 	cancel_work_sync(&priv->qos_activate);
-	//cancel_work_sync(&priv->SetBWModeWorkItem);
-	//cancel_work_sync(&priv->SwChnlWorkItem);
-
 }
 
 
@@ -5773,22 +5113,18 @@
 	struct net_device *dev = usb_get_intfdata(intf);
 
 	struct r8192_priv *priv = ieee80211_priv(dev);
-	if(dev){
+	if (dev) {
 
 		unregister_netdev(dev);
 
 		RT_TRACE(COMP_DOWN, "=============>wlan driver to be removed\n");
 		rtl8192_proc_remove_one(dev);
 
-			rtl8192_down(dev);
+		rtl8192_down(dev);
 		kfree(priv->pFirmware);
 		priv->pFirmware = NULL;
-	//	priv->rf_close(dev);
-//		rtl8192_SetRFPowerState(dev, eRfOff);
 		rtl8192_usb_deleteendpoints(dev);
 		destroy_workqueue(priv->priv_wq);
-		//rtl8192_irq_disable(dev);
-		//rtl8192_reset(dev);
 		mdelay(10);
 
 	}
@@ -5815,38 +5151,36 @@
 #ifdef CONFIG_IEEE80211_DEBUG
 	ret = ieee80211_debug_init();
 	if (ret) {
-		printk(KERN_ERR "ieee80211_debug_init() failed %d\n", ret);
+		pr_err("ieee80211_debug_init() failed %d\n", ret);
 		return ret;
 	}
 #endif
 	ret = ieee80211_crypto_init();
 	if (ret) {
-		printk(KERN_ERR "ieee80211_crypto_init() failed %d\n", ret);
+		pr_err("ieee80211_crypto_init() failed %d\n", ret);
 		return ret;
 	}
 
 	ret = ieee80211_crypto_tkip_init();
 	if (ret) {
-		printk(KERN_ERR "ieee80211_crypto_tkip_init() failed %d\n",
-			ret);
+		pr_err("ieee80211_crypto_tkip_init() failed %d\n", ret);
 		return ret;
 	}
 
 	ret = ieee80211_crypto_ccmp_init();
 	if (ret) {
-		printk(KERN_ERR "ieee80211_crypto_ccmp_init() failed %d\n",
-			ret);
+		pr_err("ieee80211_crypto_ccmp_init() failed %d\n", ret);
 		return ret;
 	}
 
 	ret = ieee80211_crypto_wep_init();
 	if (ret) {
-		printk(KERN_ERR "ieee80211_crypto_wep_init() failed %d\n", ret);
+		pr_err("ieee80211_crypto_wep_init() failed %d\n", ret);
 		return ret;
 	}
 
-	printk(KERN_INFO "\nLinux kernel driver for RTL8192 based WLAN cards\n");
-	printk(KERN_INFO "Copyright (c) 2007-2008, Realsil Wlan\n");
+	pr_info("\nLinux kernel driver for RTL8192 based WLAN cards\n");
+	pr_info("Copyright (c) 2007-2008, Realsil Wlan\n");
 	RT_TRACE(COMP_INIT, "Initializing module");
 	RT_TRACE(COMP_INIT, "Wireless extensions version %d", WIRELESS_EXT);
 	rtl8192_proc_module_init();
@@ -5859,7 +5193,6 @@
 	usb_deregister(&rtl8192_usb_driver);
 
 	RT_TRACE(COMP_DOWN, "Exiting");
-//	rtl8192_proc_module_remove();
 }
 
 
@@ -5869,11 +5202,11 @@
 	short enough_desc;
 	struct r8192_priv *priv = (struct r8192_priv *)ieee80211_priv(dev);
 
-	spin_lock_irqsave(&priv->tx_lock,flags);
-	enough_desc = check_nic_enough_desc(dev,pri);
-	spin_unlock_irqrestore(&priv->tx_lock,flags);
+	spin_lock_irqsave(&priv->tx_lock, flags);
+	enough_desc = check_nic_enough_desc(dev, pri);
+	spin_unlock_irqrestore(&priv->tx_lock, flags);
 
-	if(enough_desc)
+	if (enough_desc)
 		ieee80211_wake_queue(priv->ieee80211);
 }
 
@@ -5881,43 +5214,32 @@
 {
 	u8 SECR_value = 0x0;
 	struct r8192_priv *priv = (struct r8192_priv *)ieee80211_priv(dev);
-	 struct ieee80211_device* ieee = priv->ieee80211;
+	struct ieee80211_device *ieee = priv->ieee80211;
 	SECR_value = SCR_TxEncEnable | SCR_RxDecEnable;
-	if (((KEY_TYPE_WEP40 == ieee->pairwise_key_type) || (KEY_TYPE_WEP104 == ieee->pairwise_key_type)) && (priv->ieee80211->auth_mode != 2))
-	{
+	if (((KEY_TYPE_WEP40 == ieee->pairwise_key_type) || (KEY_TYPE_WEP104 == ieee->pairwise_key_type)) && (priv->ieee80211->auth_mode != 2)) {
 		SECR_value |= SCR_RxUseDK;
 		SECR_value |= SCR_TxUseDK;
-	}
-	else if ((ieee->iw_mode == IW_MODE_ADHOC) && (ieee->pairwise_key_type & (KEY_TYPE_CCMP | KEY_TYPE_TKIP)))
-	{
+	} else if ((ieee->iw_mode == IW_MODE_ADHOC) && (ieee->pairwise_key_type & (KEY_TYPE_CCMP | KEY_TYPE_TKIP))) {
 		SECR_value |= SCR_RxUseDK;
 		SECR_value |= SCR_TxUseDK;
 	}
 	//add HWSec active enable here.
-//default using hwsec. when peer AP is in N mode only and pairwise_key_type is none_aes(which HT_IOT_ACT_PURE_N_MODE indicates it), use software security. when peer AP is in b,g,n mode mixed and pairwise_key_type is none_aes, use g mode hw security. WB on 2008.7.4
+	//default using hwsec. when peer AP is in N mode only and pairwise_key_type is none_aes(which HT_IOT_ACT_PURE_N_MODE indicates it), use software security. when peer AP is in b,g,n mode mixed and pairwise_key_type is none_aes, use g mode hw security. WB on 2008.7.4
 
 	ieee->hwsec_active = 1;
 
-	if ((ieee->pHTInfo->IOTAction&HT_IOT_ACT_PURE_N_MODE) || !hwwep)//!ieee->hwsec_support) //add hwsec_support flag to totol control hw_sec on/off
-	{
+	if ((ieee->pHTInfo->IOTAction&HT_IOT_ACT_PURE_N_MODE) || !hwwep) { //add hwsec_support flag to totol control hw_sec on/off
 		ieee->hwsec_active = 0;
 		SECR_value &= ~SCR_RxDecEnable;
 	}
-	RT_TRACE(COMP_SEC,"%s:, hwsec:%d, pairwise_key:%d, SECR_value:%x\n", __FUNCTION__, \
-			ieee->hwsec_active, ieee->pairwise_key_type, SECR_value);
-	{
-		write_nic_byte(dev, SECR,  SECR_value);//SECR_value |  SCR_UseDK );
-	}
+	RT_TRACE(COMP_SEC, "%s:, hwsec:%d, pairwise_key:%d, SECR_value:%x\n", __func__,
+		 ieee->hwsec_active, ieee->pairwise_key_type, SECR_value);
+	write_nic_byte(dev, SECR,  SECR_value);
 }
 
 
-void setKey(	struct net_device *dev,
-		u8 EntryNo,
-		u8 KeyIndex,
-		u16 KeyType,
-		u8 *MacAddr,
-		u8 DefaultKey,
-		u32 *KeyContent )
+void setKey(struct net_device *dev, u8 EntryNo, u8 KeyIndex, u16 KeyType,
+	    u8 *MacAddr, u8 DefaultKey, u32 *KeyContent)
 {
 	u32 TargetCommand = 0;
 	u32 TargetContent = 0;
@@ -5926,44 +5248,40 @@
 	if (EntryNo >= TOTAL_CAM_ENTRY)
 		RT_TRACE(COMP_ERR, "cam entry exceeds in setKey()\n");
 
-	RT_TRACE(COMP_SEC, "====>to setKey(), dev:%p, EntryNo:%d, KeyIndex:%d, KeyType:%d, MacAddr%pM\n", dev,EntryNo, KeyIndex, KeyType, MacAddr);
+	RT_TRACE(COMP_SEC, "====>to setKey(), dev:%p, EntryNo:%d, KeyIndex:%d, KeyType:%d, MacAddr%pM\n", dev, EntryNo, KeyIndex, KeyType, MacAddr);
 
 	if (DefaultKey)
 		usConfig |= BIT15 | (KeyType<<2);
 	else
 		usConfig |= BIT15 | (KeyType<<2) | KeyIndex;
-//	usConfig |= BIT15 | (KeyType<<2) | (DefaultKey<<5) | KeyIndex;
 
 
-	for(i=0 ; i<CAM_CONTENT_COUNT; i++){
+	for (i = 0; i < CAM_CONTENT_COUNT; i++) {
 		TargetCommand  = i+CAM_CONTENT_COUNT*EntryNo;
 		TargetCommand |= BIT31|BIT16;
 
-		if(i==0){//MAC|Config
+		if (i == 0) { //MAC|Config
 			TargetContent = (u32)(*(MacAddr+0)) << 16|
 					(u32)(*(MacAddr+1)) << 24|
 					(u32)usConfig;
 
 			write_nic_dword(dev, WCAMI, TargetContent);
 			write_nic_dword(dev, RWCAM, TargetCommand);
-	//		printk("setkey cam =%8x\n", read_cam(dev, i+6*EntryNo));
-		}
-		else if(i==1){//MAC
+		} else if (i == 1) { //MAC
 			TargetContent = (u32)(*(MacAddr+2))	 |
 					(u32)(*(MacAddr+3)) <<  8|
 					(u32)(*(MacAddr+4)) << 16|
 					(u32)(*(MacAddr+5)) << 24;
 			write_nic_dword(dev, WCAMI, TargetContent);
 			write_nic_dword(dev, RWCAM, TargetCommand);
-		}
-		else {
+		} else {
 			//Key Material
-			if(KeyContent !=NULL){
-			write_nic_dword(dev, WCAMI, (u32)(*(KeyContent+i-2)) );
-			write_nic_dword(dev, RWCAM, TargetCommand);
+			if (KeyContent != NULL) {
+				write_nic_dword(dev, WCAMI, (u32)(*(KeyContent+i-2)));
+				write_nic_dword(dev, RWCAM, TargetCommand);
+			}
 		}
 	}
-	}
 
 }
 

diff --git a/drivers/staging/rtl8192u/r8192U_dm.c b/drivers/staging/rtl8192u/r8192U_dm.c
index ea46717f..a6e4c37 100644
--- a/drivers/staging/rtl8192u/r8192U_dm.c
+++ b/drivers/staging/rtl8192u/r8192U_dm.c

@@ -88,7 +88,7 @@
 
 // DM --> Bandwidth switch
 static	void	dm_init_bandwidth_autoswitch(struct net_device *dev);
-static	void	dm_bandwidth_autoswitch(	struct net_device *dev);
+static	void	dm_bandwidth_autoswitch(struct net_device *dev);
 
 // DM --> TX power control
 //static	void	dm_initialize_txpower_tracking(struct net_device *dev);
@@ -112,7 +112,7 @@
 static	void	dm_dig_init(struct net_device *dev);
 static	void	dm_ctrl_initgain_byrssi(struct net_device *dev);
 static	void	dm_ctrl_initgain_byrssi_highpwr(struct net_device *dev);
-static	void	dm_ctrl_initgain_byrssi_by_driverrssi(	struct net_device *dev);
+static	void	dm_ctrl_initgain_byrssi_by_driverrssi(struct net_device *dev);
 static	void	dm_ctrl_initgain_byrssi_by_fwfalse_alarm(struct net_device *dev);
 static	void	dm_initial_gain(struct net_device *dev);
 static	void	dm_pd_th(struct net_device *dev);
@@ -289,7 +289,7 @@
   *	01/16/2008	MHC		RF_Type is assigned in ReadAdapterInfo(). We must call
   *						the function after making sure RF_Type.
   */
-extern void init_rate_adaptive(struct net_device * dev)
+extern void init_rate_adaptive(struct net_device *dev)
 {
 
 	struct r8192_priv *priv = ieee80211_priv(dev);
@@ -351,7 +351,7 @@
  *	05/26/08	amy	Create version 0 porting from windows code.
  *
  *---------------------------------------------------------------------------*/
-static void dm_check_rate_adaptive(struct net_device * dev)
+static void dm_check_rate_adaptive(struct net_device *dev)
 {
 	struct r8192_priv *priv = ieee80211_priv(dev);
 	PRT_HIGH_THROUGHPUT	pHTInfo = priv->ieee80211->pHTInfo;
@@ -372,11 +372,11 @@
 		return;
 
 	// TODO: Only 11n mode is implemented currently,
-	if( !(priv->ieee80211->mode == WIRELESS_MODE_N_24G ||
+	if(!(priv->ieee80211->mode == WIRELESS_MODE_N_24G ||
 		 priv->ieee80211->mode == WIRELESS_MODE_N_5G))
 		 return;
 
-	if( priv->ieee80211->state == IEEE80211_LINKED )
+	if(priv->ieee80211->state == IEEE80211_LINKED)
 	{
 	//	RT_TRACE(COMP_RATE, "dm_CheckRateAdaptive(): \t");
 
@@ -454,8 +454,8 @@
 			//pHalData->UndecoratedSmoothedPWDB = 19;
 			if(priv->undecorated_smoothed_pwdb < (long)(pra->ping_rssi_thresh_for_ra+5))
 			{
-				if( (priv->undecorated_smoothed_pwdb < (long)pra->ping_rssi_thresh_for_ra) ||
-					ping_rssi_state )
+				if((priv->undecorated_smoothed_pwdb < (long)pra->ping_rssi_thresh_for_ra) ||
+					ping_rssi_state)
 				{
 					//DbgPrint("TestRSSI = %d, set RATR to 0x%x \n", pHalData->UndecoratedSmoothedPWDB, pRA->TestRSSIRATR);
 					pra->ratr_state = DM_RATR_STA_LOW;
@@ -480,8 +480,8 @@
 		//
 		// Check whether updating of RATR0 is required
 		//
-		currentRATR = read_nic_dword(dev, RATR0);
-		if( targetRATR !=  currentRATR )
+		read_nic_dword(dev, RATR0, &currentRATR);
+		if(targetRATR !=  currentRATR)
 		{
 			u32 ratr_value;
 			ratr_value = targetRATR;
@@ -505,7 +505,7 @@
 }	// dm_CheckRateAdaptive
 
 
-static void dm_init_bandwidth_autoswitch(struct net_device * dev)
+static void dm_init_bandwidth_autoswitch(struct net_device *dev)
 {
 	struct r8192_priv *priv = ieee80211_priv(dev);
 
@@ -517,7 +517,7 @@
 }	// dm_init_bandwidth_autoswitch
 
 
-static void dm_bandwidth_autoswitch(struct net_device * dev)
+static void dm_bandwidth_autoswitch(struct net_device *dev)
 {
 	struct r8192_priv *priv = ieee80211_priv(dev);
 
@@ -588,7 +588,7 @@
 	{0x0f, 0x0f, 0x0d, 0x08, 0x00, 0x00, 0x00, 0x00}	// 11, -11db
 };
 
-static void dm_TXPowerTrackingCallback_TSSI(struct net_device * dev)
+static void dm_TXPowerTrackingCallback_TSSI(struct net_device *dev)
 {
 	struct r8192_priv *priv = ieee80211_priv(dev);
 	bool						bHighpowerstate, viviflag = FALSE;
@@ -627,14 +627,14 @@
 		RT_TRACE(COMP_POWER_TRACKING, "Set configuration with tx cmd queue fail!\n");
 	}
 #else
-	cmpk_message_handle_tx(dev, (u8*)&tx_cmd,
+	cmpk_message_handle_tx(dev, (u8 *)&tx_cmd,
 								DESC_PACKET_TYPE_INIT, sizeof(DCMD_TXCMD_T));
 #endif
 	mdelay(1);
 	//DbgPrint("hi, vivi, strange\n");
 	for(i = 0;i <= 30; i++)
 	{
-		Pwr_Flag = read_nic_byte(dev, 0x1ba);
+		read_nic_byte(dev, 0x1ba, &Pwr_Flag);
 
 		if (Pwr_Flag == 0)
 		{
@@ -642,9 +642,9 @@
 			continue;
 		}
 #ifdef RTL8190P
-		Avg_TSSI_Meas = read_nic_word(dev, 0x1bc);
+		read_nic_word(dev, 0x1bc, &Avg_TSSI_Meas);
 #else
-		Avg_TSSI_Meas = read_nic_word(dev, 0x13c);
+		read_nic_word(dev, 0x13c, &Avg_TSSI_Meas);
 #endif
 		if(Avg_TSSI_Meas == 0)
 		{
@@ -655,12 +655,12 @@
 		for(k = 0;k < 5; k++)
 		{
 #ifdef RTL8190P
-			tmp_report[k] = read_nic_byte(dev, 0x1d8+k);
+			read_nic_byte(dev, 0x1d8+k, &tmp_report[k]);
 #else
 			if(k !=4)
-				tmp_report[k] = read_nic_byte(dev, 0x134+k);
+				read_nic_byte(dev, 0x134+k, &tmp_report[k]);
 			else
-				tmp_report[k] = read_nic_byte(dev, 0x13e);
+				read_nic_byte(dev, 0x13e, &tmp_report[k]);
 #endif
 			RT_TRACE(COMP_POWER_TRACKING, "TSSI_report_value = %d\n", tmp_report[k]);
 		}
@@ -816,7 +816,7 @@
 		write_nic_byte(dev, 0x1ba, 0);
 }
 
-static void dm_TXPowerTrackingCallback_ThermalMeter(struct net_device * dev)
+static void dm_TXPowerTrackingCallback_ThermalMeter(struct net_device *dev)
 {
 #define ThermalMeterVal	9
 	struct r8192_priv *priv = ieee80211_priv(dev);
@@ -1572,7 +1572,7 @@
 		TempVal = 0;
 		TempVal =	priv->cck_txbbgain_table[priv->cck_present_attentuation].ccktxbb_valuearray[2] +
 					(priv->cck_txbbgain_table[priv->cck_present_attentuation].ccktxbb_valuearray[3]<<8) +
-					(priv->cck_txbbgain_table[priv->cck_present_attentuation].ccktxbb_valuearray[4]<<16 )+
+					(priv->cck_txbbgain_table[priv->cck_present_attentuation].ccktxbb_valuearray[4]<<16)+
 					(priv->cck_txbbgain_table[priv->cck_present_attentuation].ccktxbb_valuearray[5]<<24);
 		rtl8192_setBBreg(dev, rCCK0_TxFilter2,bMaskDWord, TempVal);
 		//Write 0xa28  0xa29
@@ -1592,7 +1592,7 @@
 		TempVal = 0;
 		TempVal =	priv->cck_txbbgain_ch14_table[priv->cck_present_attentuation].ccktxbb_valuearray[2] +
 					(priv->cck_txbbgain_ch14_table[priv->cck_present_attentuation].ccktxbb_valuearray[3]<<8) +
-					(priv->cck_txbbgain_ch14_table[priv->cck_present_attentuation].ccktxbb_valuearray[4]<<16 )+
+					(priv->cck_txbbgain_ch14_table[priv->cck_present_attentuation].ccktxbb_valuearray[4]<<16)+
 					(priv->cck_txbbgain_ch14_table[priv->cck_present_attentuation].ccktxbb_valuearray[5]<<24);
 		rtl8192_setBBreg(dev, rCCK0_TxFilter2,bMaskDWord, TempVal);
 		//Write 0xa28  0xa29
@@ -1624,7 +1624,7 @@
 		TempVal = 0;
 		TempVal =	CCKSwingTable_Ch1_Ch13[priv->CCK_index][2] +
 					(CCKSwingTable_Ch1_Ch13[priv->CCK_index][3]<<8) +
-					(CCKSwingTable_Ch1_Ch13[priv->CCK_index][4]<<16 )+
+					(CCKSwingTable_Ch1_Ch13[priv->CCK_index][4]<<16)+
 					(CCKSwingTable_Ch1_Ch13[priv->CCK_index][5]<<24);
 		rtl8192_setBBreg(dev, rCCK0_TxFilter2, bMaskDWord, TempVal);
 		RT_TRACE(COMP_POWER_TRACKING, "CCK not chnl 14, reg 0x%x = 0x%x\n",
@@ -1652,7 +1652,7 @@
 		TempVal = 0;
 		TempVal =	CCKSwingTable_Ch14[priv->CCK_index][2] +
 					(CCKSwingTable_Ch14[priv->CCK_index][3]<<8) +
-					(CCKSwingTable_Ch14[priv->CCK_index][4]<<16 )+
+					(CCKSwingTable_Ch14[priv->CCK_index][4]<<16)+
 					(CCKSwingTable_Ch14[priv->CCK_index][5]<<24);
 		rtl8192_setBBreg(dev, rCCK0_TxFilter2, bMaskDWord, TempVal);
 		RT_TRACE(COMP_POWER_TRACKING, "CCK chnl 14, reg 0x%x = 0x%x\n",
@@ -1727,7 +1727,7 @@
 	if(priv->rate_adaptive.rate_adaptive_disabled)
 		return;
 	// TODO: Only 11n mode is implemented currently,
-	if( !(priv->ieee80211->mode==WIRELESS_MODE_N_24G ||
+	if(!(priv->ieee80211->mode==WIRELESS_MODE_N_24G ||
 		 priv->ieee80211->mode==WIRELESS_MODE_N_5G))
 		 return;
 	{
@@ -1736,7 +1736,7 @@
 			ratr_value = reg_ratr;
 			if(priv->rf_type == RF_1T2R)	// 1T2R, Spatial Stream 2 should be disabled
 			{
-				ratr_value &=~ (RATE_ALL_OFDM_2SS);
+				ratr_value &= ~(RATE_ALL_OFDM_2SS);
 				//DbgPrint("HW_VAR_TATR_0 from 0x%x ==> 0x%x\n", ((pu4Byte)(val))[0], ratr_value);
 			}
 			//DbgPrint("set HW_VAR_TATR_0 = 0x%x\n", ratr_value);
@@ -2222,7 +2222,7 @@
 
 	/* 2. When RSSI increase, We have to judge if it is larger than a threshold
 		  and then execute the step below.  */
-	if ((priv->undecorated_smoothed_pwdb >= dm_digtable.rssi_high_thresh) )
+	if ((priv->undecorated_smoothed_pwdb >= dm_digtable.rssi_high_thresh))
 	{
 		u8 reset_flag = 0;
 
@@ -2316,7 +2316,7 @@
  *
  *---------------------------------------------------------------------------*/
 static void dm_ctrl_initgain_byrssi_highpwr(
-	struct net_device * dev)
+	struct net_device *dev)
 {
 	struct r8192_priv *priv = ieee80211_priv(dev);
 	static u32 reset_cnt_highpwr;
@@ -2391,12 +2391,13 @@
 
 
 static void dm_initial_gain(
-	struct net_device * dev)
+	struct net_device *dev)
 {
 	struct r8192_priv *priv = ieee80211_priv(dev);
 	u8					initial_gain=0;
 	static u8				initialized, force_write;
 	static u32			reset_cnt;
+	u8				tmp;
 
 	if(dm_digtable.dig_algorithm_switch)
 	{
@@ -2437,7 +2438,8 @@
 		reset_cnt = priv->reset_count;
 	}
 
-	if(dm_digtable.pre_ig_value != read_nic_byte(dev, rOFDM0_XAAGCCore1))
+	read_nic_byte(dev, rOFDM0_XAAGCCore1, &tmp);
+	if (dm_digtable.pre_ig_value != tmp)
 		force_write = 1;
 
 	{
@@ -2459,7 +2461,7 @@
 }
 
 static void dm_pd_th(
-	struct net_device * dev)
+	struct net_device *dev)
 {
 	struct r8192_priv *priv = ieee80211_priv(dev);
 	static u8				initialized, force_write;
@@ -2571,7 +2573,7 @@
 }
 
 static	void dm_cs_ratio(
-	struct net_device * dev)
+	struct net_device *dev)
 {
 	struct r8192_priv *priv = ieee80211_priv(dev);
 	static u8				initialized,force_write;
@@ -2589,7 +2591,7 @@
 		{
 			if ((dm_digtable.rssi_val <= dm_digtable.rssi_low_thresh))
 				dm_digtable.curcs_ratio_state = DIG_CS_RATIO_LOWER;
-			else if ((dm_digtable.rssi_val >= dm_digtable.rssi_high_thresh) )
+			else if ((dm_digtable.rssi_val >= dm_digtable.rssi_high_thresh))
 				dm_digtable.curcs_ratio_state = DIG_CS_RATIO_HIGHER;
 			else
 				dm_digtable.curcs_ratio_state = dm_digtable.precs_ratio_state;
@@ -2634,7 +2636,7 @@
 	}
 }
 
-extern void dm_init_edca_turbo(struct net_device * dev)
+extern void dm_init_edca_turbo(struct net_device *dev)
 {
 	struct r8192_priv *priv = ieee80211_priv(dev);
 
@@ -2644,7 +2646,7 @@
 }	// dm_init_edca_turbo
 
 static void dm_check_edca_turbo(
-	struct net_device * dev)
+	struct net_device *dev)
 {
 	struct r8192_priv *priv = ieee80211_priv(dev);
 	PRT_HIGH_THROUGHPUT	pHTInfo = priv->ieee80211->pHTInfo;
@@ -2727,8 +2729,9 @@
 			// TODO:  Modified this part and try to set acm control in only 1 IO processing!!
 
 					PACI_AIFSN	pAciAifsn = (PACI_AIFSN)&(qos_parameters->aifs[0]);
-					u8		AcmCtrl = read_nic_byte( dev, AcmHwCtrl );
-					if( pAciAifsn->f.ACM )
+					u8		AcmCtrl;
+					read_nic_byte(dev, AcmHwCtrl, &AcmCtrl);
+					if(pAciAifsn->f.ACM)
 					{ // ACM bit is 1.
 						AcmCtrl |= AcmHw_BeqEn;
 					}
@@ -2737,8 +2740,8 @@
 						AcmCtrl &= (~AcmHw_BeqEn);
 					}
 
-					RT_TRACE( COMP_QOS,"SetHwReg8190pci(): [HW_VAR_ACM_CTRL] Write 0x%X\n", AcmCtrl ) ;
-					write_nic_byte(dev, AcmHwCtrl, AcmCtrl );
+					RT_TRACE(COMP_QOS,"SetHwReg8190pci(): [HW_VAR_ACM_CTRL] Write 0x%X\n", AcmCtrl) ;
+					write_nic_byte(dev, AcmHwCtrl, AcmCtrl);
 				}
 			}
 			priv->bcurrent_turbo_EDCA = false;
@@ -2753,7 +2756,7 @@
 	lastRxOkCnt = priv->stats.rxbytesunicast;
 }	// dm_CheckEdcaTurbo
 
-extern void DM_CTSToSelfSetting(struct net_device * dev,u32 DM_Type, u32 DM_Value)
+extern void DM_CTSToSelfSetting(struct net_device *dev,u32 DM_Type, u32 DM_Value)
 {
 	struct r8192_priv *priv = ieee80211_priv((struct net_device *)dev);
 
@@ -2773,7 +2776,7 @@
 	}
 }
 
-static void dm_init_ctstoself(struct net_device * dev)
+static void dm_init_ctstoself(struct net_device *dev)
 {
 	struct r8192_priv *priv = ieee80211_priv((struct net_device *)dev);
 
@@ -2837,7 +2840,7 @@
  *	05/28/2008	amy		Create Version 0 porting from windows code.
  *
  *---------------------------------------------------------------------------*/
-static void dm_check_rfctrl_gpio(struct net_device * dev)
+static void dm_check_rfctrl_gpio(struct net_device *dev)
 {
 	//struct r8192_priv *priv = ieee80211_priv(dev);
 
@@ -2881,7 +2884,7 @@
 	u8 tmp1byte;
 
 
-	tmp1byte = read_nic_byte(dev,GPI);
+	read_nic_byte(dev, GPI, &tmp1byte);
 	if(tmp1byte == 0xff)
 		return;
 
@@ -2933,18 +2936,18 @@
 		{
 			// 0x108 GPIO input register is read only
 			//set 0x108 B1= 1: RF-ON; 0: RF-OFF.
-			tmp1byte = read_nic_byte(dev,GPI);
+			read_nic_byte(dev, GPI, &tmp1byte);
 
 			eRfPowerStateToSet = (tmp1byte&BIT1) ?  eRfOn : eRfOff;
 
-			if( (priv->bHwRadioOff == true) && (eRfPowerStateToSet == eRfOn))
+			if((priv->bHwRadioOff == true) && (eRfPowerStateToSet == eRfOn))
 			{
 				RT_TRACE(COMP_RF, "gpiochangeRF  - HW Radio ON\n");
 
 				priv->bHwRadioOff = false;
 				bActuallySet = true;
 			}
-			else if ( (priv->bHwRadioOff == false) && (eRfPowerStateToSet == eRfOff))
+			else if ((priv->bHwRadioOff == false) && (eRfPowerStateToSet == eRfOff))
 			{
 				RT_TRACE(COMP_RF, "gpiochangeRF  - HW Radio OFF\n");
 				priv->bHwRadioOff = true;
@@ -2996,7 +2999,7 @@
 
 	/* 2008/01/30 MH After discussing with SD3 Jerry, 0xc04/0xd04 register will
 	   always be the same. We only read 0xc04 now. */
-	rfpath = read_nic_byte(dev, 0xc04);
+	read_nic_byte(dev, 0xc04, &rfpath);
 
 	// Check Bit 0-3, it means if RF A-D is enabled.
 	for (i = 0; i < RF90_PATH_MAX; i++)
@@ -3012,7 +3015,7 @@
 	dm_rxpath_sel_byrssi(dev);
 }	/* DM_RFPathCheckWorkItemCallBack */
 
-static void dm_init_rxpath_selection(struct net_device * dev)
+static void dm_init_rxpath_selection(struct net_device *dev)
 {
 	u8 i;
 	struct r8192_priv *priv = ieee80211_priv(dev);
@@ -3033,7 +3036,7 @@
 	}
 }
 
-static void dm_rxpath_sel_byrssi(struct net_device * dev)
+static void dm_rxpath_sel_byrssi(struct net_device *dev)
 {
 	struct r8192_priv *priv = ieee80211_priv(dev);
 	u8				i, max_rssi_index=0, min_rssi_index=0, sec_rssi_index=0, rf_num=0;
@@ -3052,12 +3055,13 @@
 
 	if(!cck_Rx_Path_initialized)
 	{
-		DM_RxPathSelTable.cck_Rx_path = (read_nic_byte(dev, 0xa07)&0xf);
+		read_nic_byte(dev, 0xa07, &DM_RxPathSelTable.cck_Rx_path);
+		DM_RxPathSelTable.cck_Rx_path &= 0xf;
 		cck_Rx_Path_initialized = 1;
 	}
 
-	DM_RxPathSelTable.disabledRF = 0xf;
-	DM_RxPathSelTable.disabledRF &=~ (read_nic_byte(dev, 0xc04));
+	read_nic_byte(dev, 0xc04, &DM_RxPathSelTable.disabledRF);
+	DM_RxPathSelTable.disabledRF = ~DM_RxPathSelTable.disabledRF & 0xf;
 
 	if(priv->ieee80211->mode == WIRELESS_MODE_B)
 	{
@@ -3356,7 +3360,7 @@
 	bool		bSwitchFromCountDiff = false;
 	bool		bDoubleTimeInterval = false;
 
-	if(	priv->ieee80211->state == IEEE80211_LINKED &&
+	if(priv->ieee80211->state == IEEE80211_LINKED &&
 		priv->ieee80211->bfsync_enable &&
 		(priv->ieee80211->pHTInfo->IOTAction & HT_IOT_ACT_CDD_FSYNC))
 	{
@@ -3576,12 +3580,12 @@
 	RT_TRACE(COMP_HALDM, "RSSI %d TimeInterval %d MultipleTimeInterval %d\n", priv->ieee80211->fsync_rssi_threshold, priv->ieee80211->fsync_time_interval, priv->ieee80211->fsync_multiple_timeinterval);
 	RT_TRACE(COMP_HALDM, "RateBitmap 0x%x FirstDiffRateThreshold %d SecondDiffRateThreshold %d\n", priv->ieee80211->fsync_rate_bitmap, priv->ieee80211->fsync_firstdiff_ratethreshold, priv->ieee80211->fsync_seconddiff_ratethreshold);
 
-	if(	priv->ieee80211->state == IEEE80211_LINKED &&
+	if(priv->ieee80211->state == IEEE80211_LINKED &&
 		(priv->ieee80211->pHTInfo->IOTAction & HT_IOT_ACT_CDD_FSYNC))
 	{
 		if(priv->ieee80211->bfsync_enable == 0)
 		{
-			switch(priv->ieee80211->fsync_state)
+			switch (priv->ieee80211->fsync_state)
 			{
 				case Default_Fsync:
 					dm_StartHWFsync(dev);
@@ -3599,7 +3603,7 @@
 		}
 		else
 		{
-			switch(priv->ieee80211->fsync_state)
+			switch (priv->ieee80211->fsync_state)
 			{
 				case Default_Fsync:
 					dm_StartSWFsync(dev);
@@ -3632,7 +3636,7 @@
 	}
 	else
 	{
-		switch(priv->ieee80211->fsync_state)
+		switch (priv->ieee80211->fsync_state)
 		{
 			case HW_Fsync:
 				dm_EndHWFsync(dev);
@@ -3731,17 +3735,17 @@
 	for (page = 0; page < 5; page++)
 		for (offset = 0; offset < 256; offset++)
 		{
-			dm_shadow[page][offset] = read_nic_byte(dev, offset+page*256);
+			read_nic_byte(dev, offset+page*256, &dm_shadow[page][offset]);
 			//DbgPrint("P-%d/O-%02x=%02x\r\n", page, offset, DM_Shadow[page][offset]);
 		}
 
 	for (page = 8; page < 11; page++)
 		for (offset = 0; offset < 256; offset++)
-			dm_shadow[page][offset] = read_nic_byte(dev, offset+page*256);
+			read_nic_byte(dev, offset+page*256, &dm_shadow[page][offset]);
 
 	for (page = 12; page < 15; page++)
 		for (offset = 0; offset < 256; offset++)
-			dm_shadow[page][offset] = read_nic_byte(dev, offset+page*256);
+			read_nic_byte(dev, offset+page*256, &dm_shadow[page][offset]);
 
 }   /* dm_shadow_init */
 
@@ -3787,7 +3791,7 @@
 		return;
 	}
 	//printk("priv->ieee80211->current_network.unknown_cap_exist is %d ,priv->ieee80211->current_network.broadcom_cap_exist is %d\n",priv->ieee80211->current_network.unknown_cap_exist,priv->ieee80211->current_network.broadcom_cap_exist);
-	if((priv->ieee80211->current_network.atheros_cap_exist ) && (priv->ieee80211->mode == IEEE_G)){
+	if((priv->ieee80211->current_network.atheros_cap_exist) && (priv->ieee80211->mode == IEEE_G)){
 		txhipower_threshhold = TX_POWER_ATHEROAP_THRESH_HIGH;
 		txlowpower_threshold = TX_POWER_ATHEROAP_THRESH_LOW;
 	}
@@ -3832,8 +3836,8 @@
 		priv->bDynamicTxLowPower = false;
 	}
 
-	if( (priv->bDynamicTxHighPower != priv->bLastDTPFlag_High ) ||
-		(priv->bDynamicTxLowPower != priv->bLastDTPFlag_Low ) )
+	if((priv->bDynamicTxHighPower != priv->bLastDTPFlag_High) ||
+		(priv->bDynamicTxLowPower != priv->bLastDTPFlag_Low))
 	{
 		RT_TRACE(COMP_TXAGC,"SetTxPowerLevel8190()  channel = %d \n" , priv->ieee80211->current_network.channel);
 
@@ -3852,20 +3856,20 @@
 }	/* dm_dynamic_txpower */
 
 //added by vivi, for read tx rate and retrycount
-static void dm_check_txrateandretrycount(struct net_device * dev)
+static void dm_check_txrateandretrycount(struct net_device *dev)
 {
 	struct r8192_priv *priv = ieee80211_priv(dev);
-	struct ieee80211_device* ieee = priv->ieee80211;
+	struct ieee80211_device *ieee = priv->ieee80211;
 	//for 11n tx rate
 //	priv->stats.CurrentShowTxate = read_nic_byte(dev, Current_Tx_Rate_Reg);
-	ieee->softmac_stats.CurrentShowTxate = read_nic_byte(dev, Current_Tx_Rate_Reg);
+	read_nic_byte(dev, Current_Tx_Rate_Reg, &ieee->softmac_stats.CurrentShowTxate);
 	//printk("=============>tx_rate_reg:%x\n", ieee->softmac_stats.CurrentShowTxate);
 	//for initial tx rate
 //	priv->stats.last_packet_rate = read_nic_byte(dev, Initial_Tx_Rate_Reg);
-	ieee->softmac_stats.last_packet_rate = read_nic_byte(dev ,Initial_Tx_Rate_Reg);
+	read_nic_byte(dev, Initial_Tx_Rate_Reg, &ieee->softmac_stats.last_packet_rate);
 	//for tx tx retry count
 //	priv->stats.txretrycount = read_nic_dword(dev, Tx_Retry_Count_Reg);
-	ieee->softmac_stats.txretrycount = read_nic_dword(dev, Tx_Retry_Count_Reg);
+	read_nic_dword(dev, Tx_Retry_Count_Reg, &ieee->softmac_stats.txretrycount);
 }
 
 static void dm_send_rssi_tofw(struct net_device *dev)
@@ -3882,7 +3886,7 @@
 	tx_cmd.Length	= 4;
 	tx_cmd.Value		= priv->undecorated_smoothed_pwdb;
 
-	cmpk_message_handle_tx(dev, (u8*)&tx_cmd,
+	cmpk_message_handle_tx(dev, (u8 *)&tx_cmd,
 								DESC_PACKET_TYPE_INIT, sizeof(DCMD_TXCMD_T));
 }
 

diff --git a/drivers/staging/rtl8192u/r8192U_hw.h b/drivers/staging/rtl8192u/r8192U_hw.h
index 15b0423..7e612aa 100644
--- a/drivers/staging/rtl8192u/r8192U_hw.h
+++ b/drivers/staging/rtl8192u/r8192U_hw.h

@@ -388,10 +388,11 @@
 #define EPROM_CMD_NORMAL 0
 #define EPROM_CMD_LOAD 1
 #define EPROM_CMD_PROGRAM 2
-#define EPROM_CS_SHIFT 3
-#define EPROM_CK_SHIFT 2
-#define EPROM_W_SHIFT 1
-#define EPROM_R_SHIFT 0
+#define EPROM_CS_BIT BIT(3)
+#define EPROM_CK_BIT BIT(2)
+#define EPROM_W_BIT  BIT(1)
+#define EPROM_R_BIT  BIT(0)
+
 	MAC0			= 0x000,
 	MAC1			= 0x001,
 	MAC2			= 0x002,

diff --git a/drivers/staging/rtl8192u/r8192U_wx.c b/drivers/staging/rtl8192u/r8192U_wx.c
index c904aa8..3e25763 100644
--- a/drivers/staging/rtl8192u/r8192U_wx.c
+++ b/drivers/staging/rtl8192u/r8192U_wx.c

@@ -144,7 +144,7 @@
 	down(&priv->wx_sem);
 
 
-	get_user(addr,(u8*)wrqu->data.pointer);
+	get_user(addr,(u8 *)wrqu->data.pointer);
 	data1 = read_rtl8225(dev, addr);
 	wrqu->data.length = data1;
 
@@ -162,7 +162,7 @@
 
 	down(&priv->wx_sem);
 
-	get_user(addr, (u8*)wrqu->data.pointer);
+	get_user(addr, (u8 *)wrqu->data.pointer);
 	write_rtl8225(dev, addr, wrqu->data.length);
 
 	up(&priv->wx_sem);
@@ -199,7 +199,7 @@
 
 	down(&priv->wx_sem);
 
-	get_user(databb, (u8*)wrqu->data.pointer);
+	get_user(databb, (u8 *)wrqu->data.pointer);
 	rtl8187_write_phy(dev, wrqu->data.length, databb);
 
 	up(&priv->wx_sem);
@@ -217,7 +217,7 @@
 
 	down(&priv->wx_sem);
 
-	get_user(addr, (u32*)wrqu->data.pointer);
+	get_user(addr, (u32 *)wrqu->data.pointer);
 	write_nic_byte(dev, addr, wrqu->data.length);
 
 	up(&priv->wx_sem);
@@ -234,8 +234,8 @@
 
 	down(&priv->wx_sem);
 
-	get_user(addr,(u32*)wrqu->data.pointer);
-	data1 = read_nic_byte(dev, addr);
+	get_user(addr,(u32 *)wrqu->data.pointer);
+	read_nic_byte(dev, addr, &data1);
 	wrqu->data.length = data1;
 
 	up(&priv->wx_sem);
@@ -254,12 +254,12 @@
 	down(&priv->wx_sem);
 
 	//count the length of input ssid
-	for(name_len=0 ; ((char*)wrqu->data.pointer)[name_len]!='\0' ; name_len++);
+	for(name_len=0 ; ((char *)wrqu->data.pointer)[name_len]!='\0' ; name_len++);
 
 	//search for the corresponding info which is received
 	list_for_each_entry(target, &ieee->network_list, list) {
 		if ( (target->ssid_len == name_len) &&
-		     (strncmp(target->ssid, (char*)wrqu->data.pointer, name_len)==0)){
+		     (strncmp(target->ssid, (char *)wrqu->data.pointer, name_len)==0)){
 			if(target->wpa_ie_len>0 || target->rsn_ie_len>0 )
 				//set flags=1 to indicate this ap is WPA
 				wrqu->data.flags = 1;
@@ -380,7 +380,7 @@
 				union iwreq_data *wrqu, char *extra)
 {
 	struct iw_range *range = (struct iw_range *)extra;
-	struct iw_range_with_scan_capa* tmp = (struct iw_range_with_scan_capa*)range;
+	struct iw_range_with_scan_capa *tmp = (struct iw_range_with_scan_capa *)range;
 	struct r8192_priv *priv = ieee80211_priv(dev);
 	u16 val;
 	int i;
@@ -483,7 +483,7 @@
 			     union iwreq_data *wrqu, char *b)
 {
 	struct r8192_priv *priv = ieee80211_priv(dev);
-	struct ieee80211_device* ieee = priv->ieee80211;
+	struct ieee80211_device *ieee = priv->ieee80211;
 	int ret = 0;
 
 	if(!priv->up) return -ENETDOWN;
@@ -492,7 +492,7 @@
 		return -EAGAIN;
 	if (wrqu->data.flags & IW_SCAN_THIS_ESSID)
 	{
-		struct iw_scan_req* req = (struct iw_scan_req*)b;
+		struct iw_scan_req *req = (struct iw_scan_req *)b;
 		if (req->essid_len)
 		{
 			//printk("==**&*&*&**===>scan set ssid:%s\n", req->essid);
@@ -709,7 +709,7 @@
 		#define CONF_WEP40  0x4
 		#define CONF_WEP104 0x14
 
-		switch(wrqu->encoding.flags & IW_ENCODE_INDEX){
+		switch (wrqu->encoding.flags & IW_ENCODE_INDEX){
 		case 0: key_idx = ieee->tx_keyidx; break;
 		case 1:	key_idx = 0; break;
 		case 2:	key_idx = 1; break;
@@ -757,7 +757,7 @@
  iwreq_data *wrqu, char *p){
 
 	struct r8192_priv *priv = ieee80211_priv(dev);
-	int *parms=(int*)p;
+	int *parms=(int *)p;
 	int mode=parms[0];
 
 	priv->ieee80211->active_scan = mode;
@@ -891,7 +891,7 @@
 {
 	int ret=0;
 	struct r8192_priv *priv = ieee80211_priv(dev);
-	struct ieee80211_device* ieee = priv->ieee80211;
+	struct ieee80211_device *ieee = priv->ieee80211;
 	//printk("===>%s()\n", __FUNCTION__);
 
 
@@ -922,7 +922,7 @@
 			ieee->pairwise_key_type = alg;
 			EnableHWSecurityConfig8192(dev);
 		}
-		memcpy((u8*)key, ext->key, 16); //we only get 16 bytes key.why? WB 2008.7.1
+		memcpy((u8 *)key, ext->key, 16); //we only get 16 bytes key.why? WB 2008.7.1
 
 		if ((alg & KEY_TYPE_WEP40) && (ieee->auth_mode !=2) )
 		{
@@ -952,7 +952,7 @@
 					4,//EntryNo
 					idx, //KeyIndex
 					alg,  //KeyType
-					(u8*)ieee->ap_mac_addr, //MacAddr
+					(u8 *)ieee->ap_mac_addr, //MacAddr
 					0,              //DefaultKey
 					key);           //KeyContent
 		}
@@ -1180,8 +1180,8 @@
 struct iw_statistics *r8192_get_wireless_stats(struct net_device *dev)
 {
        struct r8192_priv *priv = ieee80211_priv(dev);
-	struct ieee80211_device* ieee = priv->ieee80211;
-	struct iw_statistics* wstats = &priv->wstats;
+	struct ieee80211_device *ieee = priv->ieee80211;
+	struct iw_statistics *wstats = &priv->wstats;
 	int tmp_level = 0;
 	int tmp_qual = 0;
 	int tmp_noise = 0;

diff --git a/drivers/staging/rtl8192u/r819xU_cmdpkt.c b/drivers/staging/rtl8192u/r819xU_cmdpkt.c
index b755eb4..6810766 100644
--- a/drivers/staging/rtl8192u/r819xU_cmdpkt.c
+++ b/drivers/staging/rtl8192u/r819xU_cmdpkt.c

@@ -41,7 +41,7 @@
 rt_status
 SendTxCommandPacket(
 	struct net_device *dev,
-	void*			pData,
+	void			*pData,
 	u32				DataLen
 	)
 {
@@ -57,7 +57,7 @@
 	//Get TCB and local buffer from common pool. (It is shared by CmdQ, MgntQ, and USB coalesce DataQ)
 	skb  = dev_alloc_skb(USB_HWDESC_HEADER_LEN + DataLen + 4);
 	memcpy((unsigned char *)(skb->cb),&dev,sizeof(dev));
-	tcb_desc = (cb_desc*)(skb->cb + MAX_DEV_ADDR_SIZE);
+	tcb_desc = (cb_desc *)(skb->cb + MAX_DEV_ADDR_SIZE);
 	tcb_desc->queue_index = TXCMD_QUEUE;
 	tcb_desc->bCmdOrInit = DESC_PACKET_TYPE_NORMAL;
 	tcb_desc->bLastIniPkt = 0;
@@ -66,7 +66,7 @@
 	memcpy(ptr_buf,pData,DataLen);
 	tcb_desc->txbuf_size= (u16)DataLen;
 
-	if(!priv->ieee80211->check_nic_enough_desc(dev,tcb_desc->queue_index)||
+	if (!priv->ieee80211->check_nic_enough_desc(dev,tcb_desc->queue_index)||
 			(!skb_queue_empty(&priv->ieee80211->skb_waitQ[tcb_desc->queue_index]))||\
 			(priv->ieee80211->queue_stop) ) {
 			RT_TRACE(COMP_FIRMWARE,"===================NULL packet==================================> tx full!\n");
@@ -101,7 +101,7 @@
  *---------------------------------------------------------------------------*/
  extern	rt_status	cmpk_message_handle_tx(
 	struct net_device *dev,
-	u8*	codevirtualaddress,
+	u8	*codevirtualaddress,
 	u32	packettype,
 	u32	buffer_len)
 {
@@ -126,7 +126,7 @@
 	//Fragmentation might be required
 	frag_threshold = pfirmware->cmdpacket_frag_thresold;
 	do {
-		if((buffer_len - frag_offset) > frag_threshold) {
+		if ((buffer_len - frag_offset) > frag_threshold) {
 			frag_length = frag_threshold ;
 			bLastIniPkt = 0;
 
@@ -145,7 +145,7 @@
 		skb  = dev_alloc_skb(frag_length + 4);
 		#endif
 		memcpy((unsigned char *)(skb->cb),&dev,sizeof(dev));
-		tcb_desc = (cb_desc*)(skb->cb + MAX_DEV_ADDR_SIZE);
+		tcb_desc = (cb_desc *)(skb->cb + MAX_DEV_ADDR_SIZE);
 		tcb_desc->queue_index = TXCMD_QUEUE;
 		tcb_desc->bCmdOrInit = packettype;
 		tcb_desc->bLastIniPkt = bLastIniPkt;
@@ -163,7 +163,7 @@
 		tcb_desc->txbuf_size= (u16)buffer_len;
 
 
-		if(!priv->ieee80211->check_nic_enough_desc(dev,tcb_desc->queue_index)||
+		if (!priv->ieee80211->check_nic_enough_desc(dev,tcb_desc->queue_index)||
 			(!skb_queue_empty(&priv->ieee80211->skb_waitQ[tcb_desc->queue_index]))||\
 			(priv->ieee80211->queue_stop) ) {
 			RT_TRACE(COMP_FIRMWARE,"=====================================================> tx full!\n");
@@ -221,7 +221,7 @@
 #endif
 
 #ifdef TODO
-	if(pAdapter->bInHctTest)
+	if (pAdapter->bInHctTest)
 		return;
 #endif
 	/* We can not know the packet length and transmit type: broadcast or uni
@@ -303,7 +303,7 @@
 static	void
 cmpk_handle_tx_feedback(
 	struct net_device *dev,
-	u8	*	pmsg)
+	u8	*pmsg)
 {
 	struct r8192_priv *priv = ieee80211_priv(dev);
 	cmpk_txfb_t		rx_tx_fb;	/* */
@@ -319,7 +319,7 @@
 	   endian type before copy the message copy. */
 	/* 2007/07/05 MH Use pointer to transfer structure memory. */
 	//memcpy((UINT8 *)&rx_tx_fb, pMsg, sizeof(CMPK_TXFB_T));
-	memcpy((u8*)&rx_tx_fb, pmsg, sizeof(cmpk_txfb_t));
+	memcpy((u8 *)&rx_tx_fb, pmsg, sizeof(cmpk_txfb_t));
 	/* 2. Use tx feedback info to count TX statistics. */
 	cmpk_count_txstatistic(dev, &rx_tx_fb);
 	/* 2007/01/17 MH Comment previous method for TX statistic function. */
@@ -341,7 +341,7 @@
 		//
 		// 070117, rcnjko: 87B have to S/W beacon for DTM encryption_cmn.
 		//
-		if(priv->ieee80211->current_network.mode == IEEE_A  ||
+		if (priv->ieee80211->current_network.mode == IEEE_A  ||
 			priv->ieee80211->current_network.mode == IEEE_N_5G ||
 			(priv->ieee80211->current_network.mode == IEEE_N_24G  && (!priv->ieee80211->pHTInfo->bCurSuppCCK)))
 		{
@@ -386,7 +386,7 @@
 static	void
 cmpk_handle_interrupt_status(
 	struct net_device *dev,
-	u8*	pmsg)
+	u8	*pmsg)
 {
 	cmpk_intr_sta_t		rx_intr_status;	/* */
 	struct r8192_priv *priv = ieee80211_priv(dev);
@@ -411,7 +411,7 @@
 
 
 	// Statistics of beacon for ad-hoc mode.
-	if(	priv->ieee80211->iw_mode == IW_MODE_ADHOC)
+	if (	priv->ieee80211->iw_mode == IW_MODE_ADHOC)
 	{
 		//2 maybe need endian transform?
 		rx_intr_status.interrupt_status = *((u32 *)(pmsg + 4));
@@ -467,7 +467,7 @@
 static	void
 cmpk_handle_query_config_rx(
 	struct net_device *dev,
-	u8*	   pmsg)
+	u8	   *pmsg)
 {
 	cmpk_query_cfg_t	rx_query_cfg;	/* */
 
@@ -580,11 +580,11 @@
 static	void
 cmpk_handle_tx_status(
 	struct net_device *dev,
-	u8*	   pmsg)
+	u8	   *pmsg)
 {
 	cmpk_tx_status_t	rx_tx_sts;	/* */
 
-	memcpy((void*)&rx_tx_sts, (void*)pmsg, sizeof(cmpk_tx_status_t));
+	memcpy((void *)&rx_tx_sts, (void *)pmsg, sizeof(cmpk_tx_status_t));
 	/* 2. Use tx feedback info to count TX statistics. */
 	cmpk_count_tx_status(dev, &rx_tx_sts);
 
@@ -610,7 +610,7 @@
 static	void
 cmpk_handle_tx_rate_history(
 	struct net_device *dev,
-	u8*	   pmsg)
+	u8	   *pmsg)
 {
 	cmpk_tx_rahis_t	*ptxrate;
 //	RT_RF_POWER_STATE	rtState;
@@ -727,12 +727,12 @@
 	      element type. Because FW may aggregate RX command packet to minimize
 	      transmit time between DRV and FW.*/
 	// Add a counter to prevent the lock in the loop from being held too long
-	while (total_length > 0 || exe_cnt++ >100)
+	while (total_length > 0 && exe_cnt++ < 100)
 	{
 		/* 2007/01/17 MH We support aggregation of different cmd in the same packet. */
 		element_id = pcmd_buff[0];
 
-		switch(element_id)
+		switch (element_id)
 		{
 			case RX_TX_FEEDBACK:
 				cmpk_handle_tx_feedback (dev, pcmd_buff);

diff --git a/drivers/staging/rtl8192u/r819xU_cmdpkt.h b/drivers/staging/rtl8192u/r819xU_cmdpkt.h
index 59caa4e..ebe4032 100644
--- a/drivers/staging/rtl8192u/r819xU_cmdpkt.h
+++ b/drivers/staging/rtl8192u/r819xU_cmdpkt.h

@@ -192,10 +192,10 @@
 	RT_STATUS_RESOURCE
 }rt_status,*prt_status;
 
-extern rt_status cmpk_message_handle_tx(struct net_device *dev, u8* codevirtualaddress, u32 packettype, u32 buffer_len);
+extern rt_status cmpk_message_handle_tx(struct net_device *dev, u8 *codevirtualaddress, u32 packettype, u32 buffer_len);
 
-extern  u32 cmpk_message_handle_rx(struct net_device *dev, struct ieee80211_rx_stats * pstats);
-extern rt_status SendTxCommandPacket( struct net_device *dev, void* pData, u32 DataLen);
+extern  u32 cmpk_message_handle_rx(struct net_device *dev, struct ieee80211_rx_stats *pstats);
+extern rt_status SendTxCommandPacket( struct net_device *dev, void *pData, u32 DataLen);
 
 
 #endif

diff --git a/drivers/staging/rtl8192u/r819xU_firmware.c b/drivers/staging/rtl8192u/r819xU_firmware.c
index 573e9cd..bb924ac 100644
--- a/drivers/staging/rtl8192u/r819xU_firmware.c
+++ b/drivers/staging/rtl8192u/r819xU_firmware.c

@@ -48,7 +48,7 @@
 	//Fragmentation might be required
 	frag_threshold = pfirmware->cmdpacket_frag_thresold;
 	do {
-		if((buffer_len - frag_offset) > frag_threshold) {
+		if ((buffer_len - frag_offset) > frag_threshold) {
 			frag_length = frag_threshold ;
 			bLastIniPkt = 0;
 
@@ -67,7 +67,7 @@
 		skb  = dev_alloc_skb(frag_length + 4);
 		#endif
 		memcpy((unsigned char *)(skb->cb),&dev,sizeof(dev));
-		tcb_desc = (cb_desc*)(skb->cb + MAX_DEV_ADDR_SIZE);
+		tcb_desc = (cb_desc *)(skb->cb + MAX_DEV_ADDR_SIZE);
 		tcb_desc->queue_index = TXCMD_QUEUE;
 		tcb_desc->bCmdOrInit = DESC_PACKET_TYPE_INIT;
 		tcb_desc->bLastIniPkt = bLastIniPkt;
@@ -89,7 +89,7 @@
 		tcb_desc->txbuf_size= (u16)i;
 		skb_put(skb, i);
 
-		if(!priv->ieee80211->check_nic_enough_desc(dev,tcb_desc->queue_index)||
+		if (!priv->ieee80211->check_nic_enough_desc(dev,tcb_desc->queue_index)||
 			(!skb_queue_empty(&priv->ieee80211->skb_waitQ[tcb_desc->queue_index]))||\
 			(priv->ieee80211->queue_stop) ) {
 			RT_TRACE(COMP_FIRMWARE,"=====================================================> tx full!\n");
@@ -125,7 +125,7 @@
 	//Get TCB and local buffer from common pool. (It is shared by CmdQ, MgntQ, and USB coalesce DataQ)
 	skb  = dev_alloc_skb(Length+ 4);
 	memcpy((unsigned char *)(skb->cb),&dev,sizeof(dev));
-	tcb_desc = (cb_desc*)(skb->cb + MAX_DEV_ADDR_SIZE);
+	tcb_desc = (cb_desc *)(skb->cb + MAX_DEV_ADDR_SIZE);
 	tcb_desc->queue_index = TXCMD_QUEUE;
 	tcb_desc->bCmdOrInit = DESC_PACKET_TYPE_INIT;
 	tcb_desc->bLastIniPkt = bLastInitPacket;
@@ -133,7 +133,7 @@
 	memset(ptr_buf,0,Length);
 	tcb_desc->txbuf_size= (u16)Length;
 
-	if(!priv->ieee80211->check_nic_enough_desc(dev,tcb_desc->queue_index)||
+	if (!priv->ieee80211->check_nic_enough_desc(dev,tcb_desc->queue_index)||
 			(!skb_queue_empty(&priv->ieee80211->skb_waitQ[tcb_desc->queue_index]))||\
 			(priv->ieee80211->queue_stop) ) {
 			RT_TRACE(COMP_FIRMWARE,"===================NULL packet==================================> tx full!\n");
@@ -168,14 +168,14 @@
 
 	/* Check whether put code OK */
 	do {
-		CPU_status = read_nic_dword(dev, CPU_GEN);
+		read_nic_dword(dev, CPU_GEN, &CPU_status);
 
-		if(CPU_status&CPU_GEN_PUT_CODE_OK)
+		if (CPU_status&CPU_GEN_PUT_CODE_OK)
 			break;
 
 	}while(check_putcodeOK_time--);
 
-	if(!(CPU_status&CPU_GEN_PUT_CODE_OK)) {
+	if (!(CPU_status&CPU_GEN_PUT_CODE_OK)) {
 		RT_TRACE(COMP_ERR, "Download Firmware: Put code fail!\n");
 		goto CPUCheckMainCodeOKAndTurnOnCPU_Fail;
 	} else {
@@ -183,19 +183,19 @@
 	}
 
 	/* Turn On CPU */
-	CPU_status = read_nic_dword(dev, CPU_GEN);
+	read_nic_dword(dev, CPU_GEN, &CPU_status);
 	write_nic_byte(dev, CPU_GEN, (u8)((CPU_status|CPU_GEN_PWR_STB_CPU)&0xff));
 	mdelay(1000);
 
 	/* Check whether CPU boot OK */
 	do {
-		CPU_status = read_nic_dword(dev, CPU_GEN);
+		read_nic_dword(dev, CPU_GEN, &CPU_status);
 
-		if(CPU_status&CPU_GEN_BOOT_RDY)
+		if (CPU_status&CPU_GEN_BOOT_RDY)
 			break;
 	}while(check_bootOk_time--);
 
-	if(!(CPU_status&CPU_GEN_BOOT_RDY)) {
+	if (!(CPU_status&CPU_GEN_BOOT_RDY)) {
 		goto CPUCheckMainCodeOKAndTurnOnCPU_Fail;
 	} else {
 		RT_TRACE(COMP_FIRMWARE, "Download Firmware: Boot ready!\n");
@@ -218,14 +218,14 @@
 
 	/* Check Firmware Ready */
 	do {
-		CPU_status = read_nic_dword(dev, CPU_GEN);
+		read_nic_dword(dev, CPU_GEN, &CPU_status);
 
-		if(CPU_status&CPU_GEN_FIRM_RDY)
+		if (CPU_status&CPU_GEN_FIRM_RDY)
 			break;
 
 	}while(check_time--);
 
-	if(!(CPU_status&CPU_GEN_FIRM_RDY))
+	if (!(CPU_status&CPU_GEN_FIRM_RDY))
 		goto CPUCheckFirmwareReady_Fail;
 	else
 		RT_TRACE(COMP_FIRMWARE, "Download Firmware: Firmware ready!\n");
@@ -265,7 +265,7 @@
 		starting_state = FW_INIT_STEP0_BOOT;
 		// TODO: system reset
 
-	}else if(pfirmware->firmware_status == FW_STATUS_5_READY) {
+	}else if (pfirmware->firmware_status == FW_STATUS_5_READY) {
 		/* it is called by Initialize */
 		rst_opt = OPT_FIRMWARE_RESET;
 		starting_state = FW_INIT_STEP2_DATA;
@@ -282,19 +282,19 @@
 		 * Open image file, and map file to continuous memory if open file success.
 		 * or read image file from array. Default load from IMG file
 		 */
-		if(rst_opt == OPT_SYSTEM_RESET) {
+		if (rst_opt == OPT_SYSTEM_RESET) {
 			rc = request_firmware(&fw_entry, fw_name[init_step],&priv->udev->dev);
-			if(rc < 0 ) {
+			if (rc < 0 ) {
 				RT_TRACE(COMP_ERR, "request firmware fail!\n");
 				goto download_firmware_fail;
 			}
 
-			if(fw_entry->size > sizeof(pfirmware->firmware_buf)) {
+			if (fw_entry->size > sizeof(pfirmware->firmware_buf)) {
 				RT_TRACE(COMP_ERR, "img file size exceed the container buffer fail!\n");
 				goto download_firmware_fail;
 			}
 
-			if(init_step != FW_INIT_STEP1_MAIN) {
+			if (init_step != FW_INIT_STEP1_MAIN) {
 				memcpy(pfirmware->firmware_buf,fw_entry->data,fw_entry->size);
 				mapped_file = pfirmware->firmware_buf;
 				file_length = fw_entry->size;
@@ -311,7 +311,7 @@
 #endif
 			}
 			pfirmware->firmware_buf_size = file_length;
-		}else if(rst_opt == OPT_FIRMWARE_RESET ) {
+		}else if (rst_opt == OPT_FIRMWARE_RESET ) {
 			/* we only need to download data.img here */
 			mapped_file = pfirmware->firmware_buf;
 			file_length = pfirmware->firmware_buf_size;
@@ -325,15 +325,15 @@
 		 *   and Tx descriptor info
 		 * */
 		rt_status = fw_download_code(dev,mapped_file,file_length);
-		if(rst_opt == OPT_SYSTEM_RESET) {
+		if (rst_opt == OPT_SYSTEM_RESET) {
 			release_firmware(fw_entry);
 		}
 
-		if(rt_status != TRUE) {
+		if (rt_status != TRUE) {
 			goto download_firmware_fail;
 		}
 
-		switch(init_step) {
+		switch (init_step) {
 		case FW_INIT_STEP0_BOOT:
 			/* Download boot
 			 * initialize command descriptor.
@@ -343,7 +343,7 @@
 #ifdef RTL8190P
 			// To initialize IMEM, CPU move code  from 0x80000080, hence, we send 0x80 byte packet
 			rt_status = fwSendNullPacket(dev, RTL8190_CPU_START_OFFSET);
-			if(rt_status != true)
+			if (rt_status != true)
 			{
 				RT_TRACE(COMP_INIT, "fwSendNullPacket() fail ! \n");
 				goto  download_firmware_fail;
@@ -362,7 +362,7 @@
 
 			/* Check Put Code OK and Turn On CPU */
 			rt_status = CPUcheck_maincodeok_turnonCPU(dev);
-			if(rt_status != TRUE) {
+			if (rt_status != TRUE) {
 				RT_TRACE(COMP_ERR, "CPUcheck_maincodeok_turnonCPU fail!\n");
 				goto download_firmware_fail;
 			}
@@ -376,7 +376,7 @@
 			mdelay(1);
 
 			rt_status = CPUcheck_firmware_ready(dev);
-			if(rt_status != TRUE) {
+			if (rt_status != TRUE) {
 				RT_TRACE(COMP_ERR, "CPUcheck_firmware_ready fail(%d)!\n",rt_status);
 				goto download_firmware_fail;
 			}

diff --git a/drivers/staging/rtl8192u/r819xU_phy.c b/drivers/staging/rtl8192u/r819xU_phy.c
index 17fac41..a6fac08 100644
--- a/drivers/staging/rtl8192u/r819xU_phy.c
+++ b/drivers/staging/rtl8192u/r819xU_phy.c

@@ -7,22 +7,24 @@
 #include "r819xU_firmware_img.h"
 
 #include "dot11d.h"
+#include <linux/bitops.h>
+
 static u32 RF_CHANNEL_TABLE_ZEBRA[] = {
 	0,
-	0x085c, //2412 1
-	0x08dc, //2417 2
-	0x095c, //2422 3
-	0x09dc, //2427 4
-	0x0a5c, //2432 5
-	0x0adc, //2437 6
-	0x0b5c, //2442 7
-	0x0bdc, //2447 8
-	0x0c5c, //2452 9
-	0x0cdc, //2457 10
-	0x0d5c, //2462 11
-	0x0ddc, //2467 12
-	0x0e5c, //2472 13
-	0x0f72, //2484
+	0x085c, /* 2412 1  */
+	0x08dc, /* 2417 2  */
+	0x095c, /* 2422 3  */
+	0x09dc, /* 2427 4  */
+	0x0a5c, /* 2432 5  */
+	0x0adc, /* 2437 6  */
+	0x0b5c, /* 2442 7  */
+	0x0bdc, /* 2447 8  */
+	0x0c5c, /* 2452 9  */
+	0x0cdc, /* 2457 10 */
+	0x0d5c, /* 2462 11 */
+	0x0ddc, /* 2467 12 */
+	0x0e5c, /* 2472 13 */
+	0x0f72, /* 2484    */
 };
 
 
@@ -36,36 +38,36 @@
 #define rtl819XAGCTAB_Array Rtl8192UsbAGCTAB_Array
 
 /******************************************************************************
- *function:  This function read BB parameters from Header file we gen,
- *	     and do register read/write
- *   input:  u32	dwBitMask  //taget bit pos in the addr to be modified
- *  output:  none
- *  return:  u32	return the shift bit position of the mask
- * ****************************************************************************/
-u32 rtl8192_CalculateBitShift(u32 dwBitMask)
+ * function: This function calculates the bit shift of a mask, i.e. the
+ *           position of the lowest set bit in bitmask
+ * input:    u32	bitmask  //target bit pos in the addr to be modified
+ * output:   none
+ * return:   u32	return the shift bit position of the mask
+ ******************************************************************************/
+u32 rtl8192_CalculateBitShift(u32 bitmask)
 {
 	u32 i;
-	for (i=0; i<=31; i++)
-	{
-		if (((dwBitMask>>i)&0x1) == 1)
-			break;
-	}
+
+	i = ffs(bitmask) - 1;
 	return i;
 }
+
 /******************************************************************************
- *function:  This function check different RF type to execute legal judgement. If RF Path is illegal, we will return false.
- *   input:  none
- *  output:  none
- *  return:  0(illegal, false), 1(legal,true)
- * ***************************************************************************/
-u8 rtl8192_phy_CheckIsLegalRFPath(struct net_device* dev, u32 eRFPath)
+ * function:  This function checks different RF type to execute legal judgement.
+ *            If RF Path is illegal, we will return false.
+ * input:     net_device	 *dev
+ *            u32		 eRFPath
+ * output:    none
+ * return:    0(illegal, false), 1(legal, true)
+ *****************************************************************************/
+u8 rtl8192_phy_CheckIsLegalRFPath(struct net_device *dev, u32 eRFPath)
 {
 	u8 ret = 1;
 	struct r8192_priv *priv = ieee80211_priv(dev);
-	if (priv->rf_type == RF_2T4R)
+
+	if (priv->rf_type == RF_2T4R) {
 		ret = 0;
-	else if (priv->rf_type == RF_1T2R)
-	{
+	} else if (priv->rf_type == RF_1T2R) {
 		if (eRFPath == RF90_PATH_A || eRFPath == RF90_PATH_B)
 			ret = 1;
 		else if (eRFPath == RF90_PATH_C || eRFPath == RF90_PATH_D)
@@ -73,662 +75,682 @@
 	}
 	return ret;
 }
+
 /******************************************************************************
- *function:  This function set specific bits to BB register
- *   input:  net_device dev
- *           u32	dwRegAddr  //target addr to be modified
- *           u32	dwBitMask  //taget bit pos in the addr to be modified
- *           u32	dwData     //value to be write
- *  output:  none
- *  return:  none
- *  notice:
- * ****************************************************************************/
-void rtl8192_setBBreg(struct net_device* dev, u32 dwRegAddr, u32 dwBitMask, u32 dwData)
+ * function:  This function sets specific bits to BB register
+ * input:     net_device *dev
+ *            u32        reg_addr   //target addr to be modified
+ *            u32        bitmask    //target bit pos to be modified
+ *            u32        data       //value to be written
+ * output:    none
+ * return:    none
+ * notice:
+ ******************************************************************************/
+void rtl8192_setBBreg(struct net_device *dev, u32 reg_addr, u32 bitmask,
+		      u32 data)
 {
 
-	u32 OriginalValue, BitShift, NewValue;
+	u32 reg, bitshift;
 
-	if(dwBitMask!= bMaskDWord)
-	{//if not "double word" write
-		OriginalValue = read_nic_dword(dev, dwRegAddr);
-		BitShift = rtl8192_CalculateBitShift(dwBitMask);
-		NewValue = (((OriginalValue) & (~dwBitMask)) | (dwData << BitShift));
-		write_nic_dword(dev, dwRegAddr, NewValue);
-	}else
-		write_nic_dword(dev, dwRegAddr, dwData);
+	if (bitmask != bMaskDWord) {
+		read_nic_dword(dev, reg_addr, &reg);
+		bitshift = rtl8192_CalculateBitShift(bitmask);
+		reg &= ~bitmask;
+		reg |= data << bitshift;
+		write_nic_dword(dev, reg_addr, reg);
+	} else {
+		write_nic_dword(dev, reg_addr, data);
+	}
 	return;
 }
+
 /******************************************************************************
- *function:  This function reads specific bits from BB register
- *   input:  net_device dev
- *           u32	dwRegAddr  //target addr to be readback
- *           u32	dwBitMask  //taget bit pos in the addr to be readback
- *  output:  none
- *  return:  u32	Data	//the readback register value
- *  notice:
- * ****************************************************************************/
-u32 rtl8192_QueryBBReg(struct net_device* dev, u32 dwRegAddr, u32 dwBitMask)
+ * function:  This function reads specific bits from BB register
+ * input:     net_device	*dev
+ *            u32	 	reg_addr   //target addr to be readback
+ *            u32	 	bitmask    //target bit pos to be readback
+ * output:    none
+ * return:    u32	 	data       //the readback register value
+ * notice:
+ ******************************************************************************/
+u32 rtl8192_QueryBBReg(struct net_device *dev, u32 reg_addr, u32 bitmask)
 {
-	u32 Ret = 0, OriginalValue, BitShift;
+	u32 reg, bitshift;
 
-	OriginalValue = read_nic_dword(dev, dwRegAddr);
-	BitShift = rtl8192_CalculateBitShift(dwBitMask);
-	Ret =(OriginalValue & dwBitMask) >> BitShift;
+	read_nic_dword(dev, reg_addr, &reg);
+	bitshift = rtl8192_CalculateBitShift(bitmask);
 
-	return (Ret);
+	return (reg & bitmask) >> bitshift;
 }
-static  u32 phy_FwRFSerialRead( struct net_device* dev, RF90_RADIO_PATH_E       eRFPath, u32 Offset  );
 
-static void phy_FwRFSerialWrite( struct net_device* dev, RF90_RADIO_PATH_E       eRFPath, u32  Offset, u32  Data);
+static u32 phy_FwRFSerialRead(struct net_device *dev, RF90_RADIO_PATH_E eRFPath,
+			      u32 offset);
+
+static void phy_FwRFSerialWrite(struct net_device *dev,
+				RF90_RADIO_PATH_E eRFPath, u32  offset,
+				u32  data);
 
 /******************************************************************************
- *function:  This function read register from RF chip
- *   input:  net_device dev
- *	     RF90_RADIO_PATH_E eRFPath //radio path of A/B/C/D
- *           u32	Offset     //target address to be read
- *  output:  none
- *  return:  u32	readback value
- *  notice:  There are three types of serial operations:(1) Software serial write.(2)Hardware LSSI-Low Speed Serial Interface.(3)Hardware HSSI-High speed serial write. Driver here need to implement (1) and (2)---need more spec for this information.
- * ****************************************************************************/
-u32 rtl8192_phy_RFSerialRead(struct net_device* dev, RF90_RADIO_PATH_E eRFPath, u32 Offset)
+ * function:  This function reads register from RF chip
+ * input:     net_device        *dev
+ *            RF90_RADIO_PATH_E eRFPath    //radio path of A/B/C/D
+ *            u32               offset     //target address to be read
+ * output:    none
+ * return:    u32               readback value
+ * notice:    There are three types of serial operations:
+ *            (1) Software serial write.
+ *            (2) Hardware LSSI-Low Speed Serial Interface.
+ *            (3) Hardware HSSI-High speed serial write.
+ *            Driver here needs to implement (1) and (2)
+ *            ---need more spec for this information.
+ ******************************************************************************/
+u32 rtl8192_phy_RFSerialRead(struct net_device *dev, RF90_RADIO_PATH_E eRFPath,
+			     u32 offset)
 {
 	struct r8192_priv *priv = ieee80211_priv(dev);
 	u32 ret = 0;
-	u32 NewOffset = 0;
-	BB_REGISTER_DEFINITION_T* pPhyReg = &priv->PHYRegDef[eRFPath];
-	rtl8192_setBBreg(dev, pPhyReg->rfLSSIReadBack, bLSSIReadBackData, 0);
-	//make sure RF register offset is correct
-	Offset &= 0x3f;
+	u32 new_offset = 0;
+	BB_REGISTER_DEFINITION_T *pPhyReg = &priv->PHYRegDef[eRFPath];
 
-	//switch page for 8256 RF IC
-	if (priv->rf_chip == RF_8256)
-	{
-		if (Offset >= 31)
-		{
+	rtl8192_setBBreg(dev, pPhyReg->rfLSSIReadBack, bLSSIReadBackData, 0);
+	/* Make sure RF register offset is correct */
+	offset &= 0x3f;
+
+	/* Switch page for 8256 RF IC */
+	if (priv->rf_chip == RF_8256) {
+		if (offset >= 31) {
 			priv->RfReg0Value[eRFPath] |= 0x140;
-			//Switch to Reg_Mode2 for Reg 31-45
-			rtl8192_setBBreg(dev, pPhyReg->rf3wireOffset, bMaskDWord, (priv->RfReg0Value[eRFPath]<<16) );
-			//modify offset
-			NewOffset = Offset -30;
-		}
-		else if (Offset >= 16)
-		{
+			/* Switch to Reg_Mode2 for Reg 31-45 */
+			rtl8192_setBBreg(dev, pPhyReg->rf3wireOffset,
+					 bMaskDWord,
+					 priv->RfReg0Value[eRFPath]<<16);
+			/* Modify offset */
+			new_offset = offset - 30;
+		} else if (offset >= 16) {
 			priv->RfReg0Value[eRFPath] |= 0x100;
 			priv->RfReg0Value[eRFPath] &= (~0x40);
-			//Switch to Reg_Mode 1 for Reg16-30
-			rtl8192_setBBreg(dev, pPhyReg->rf3wireOffset, bMaskDWord, (priv->RfReg0Value[eRFPath]<<16) );
+			/* Switch to Reg_Mode1 for Reg16-30 */
+			rtl8192_setBBreg(dev, pPhyReg->rf3wireOffset,
+					 bMaskDWord,
+					 priv->RfReg0Value[eRFPath]<<16);
 
-			NewOffset = Offset - 15;
+			new_offset = offset - 15;
+		} else {
+			new_offset = offset;
 		}
-		else
-			NewOffset = Offset;
+	} else {
+		RT_TRACE((COMP_PHY|COMP_ERR),
+			 "check RF type here, need to be 8256\n");
+		new_offset = offset;
 	}
-	else
-	{
-		RT_TRACE((COMP_PHY|COMP_ERR), "check RF type here, need to be 8256\n");
-		NewOffset = Offset;
-	}
-	//put desired read addr to LSSI control Register
-	rtl8192_setBBreg(dev, pPhyReg->rfHSSIPara2, bLSSIReadAddress, NewOffset);
-	//Issue a posedge trigger
-	//
+	/* Put desired read addr to LSSI control Register */
+	rtl8192_setBBreg(dev, pPhyReg->rfHSSIPara2, bLSSIReadAddress,
+			 new_offset);
+	/* Issue a posedge trigger */
 	rtl8192_setBBreg(dev, pPhyReg->rfHSSIPara2,  bLSSIReadEdge, 0x0);
 	rtl8192_setBBreg(dev, pPhyReg->rfHSSIPara2,  bLSSIReadEdge, 0x1);
 
 
-	// TODO: we should not delay such a long time. Ask for help from SD3
-	msleep(1);
+	/* TODO: we should not delay such a long time. Ask for help from SD3 */
+	usleep_range(1000, 1000);
 
-	ret = rtl8192_QueryBBReg(dev, pPhyReg->rfLSSIReadBack, bLSSIReadBackData);
+	ret = rtl8192_QueryBBReg(dev, pPhyReg->rfLSSIReadBack,
+				 bLSSIReadBackData);
 
 
-	// Switch back to Reg_Mode0;
-	if(priv->rf_chip == RF_8256)
-	{
+	/* Switch back to Reg_Mode0 */
+	if (priv->rf_chip == RF_8256) {
 		priv->RfReg0Value[eRFPath] &= 0xebf;
 
-		rtl8192_setBBreg(
-			dev,
-			pPhyReg->rf3wireOffset,
-			bMaskDWord,
-			(priv->RfReg0Value[eRFPath] << 16));
+		rtl8192_setBBreg(dev, pPhyReg->rf3wireOffset, bMaskDWord,
+				 priv->RfReg0Value[eRFPath] << 16);
 	}
 
 	return ret;
-
 }
 
 /******************************************************************************
- *function:  This function write data to RF register
- *   input:  net_device dev
- *	     RF90_RADIO_PATH_E eRFPath //radio path of A/B/C/D
- *           u32	Offset     //target address to be written
- *           u32	Data	//The new register data to be written
- *  output:  none
- *  return:  none
- *  notice:  For RF8256 only.
-  ===========================================================
- *Reg Mode	RegCTL[1]	RegCTL[0]		Note
+ * function:  This function writes data to RF register
+ * input:     net_device        *dev
+ *            RF90_RADIO_PATH_E eRFPath  //radio path of A/B/C/D
+ *            u32               offset   //target address to be written
+ *            u32               data	 //the new register data to be written
+ * output:    none
+ * return:    none
+ * notice:    For RF8256 only.
+ * ===========================================================================
+ * Reg Mode	RegCTL[1]	RegCTL[0]		Note
  *		(Reg00[12])	(Reg00[10])
- *===========================================================
- *Reg_Mode0	0		x			Reg 0 ~15(0x0 ~ 0xf)
- *------------------------------------------------------------------
- *Reg_Mode1	1		0			Reg 16 ~30(0x1 ~ 0xf)
- *------------------------------------------------------------------
+ * ===========================================================================
+ * Reg_Mode0	0		x			Reg 0 ~ 15(0x0 ~ 0xf)
+ * ---------------------------------------------------------------------------
+ * Reg_Mode1	1		0			Reg 16 ~ 30(0x1 ~ 0xf)
+ * ---------------------------------------------------------------------------
  * Reg_Mode2	1		1			Reg 31 ~ 45(0x1 ~ 0xf)
- *------------------------------------------------------------------
- * ****************************************************************************/
-void rtl8192_phy_RFSerialWrite(struct net_device* dev, RF90_RADIO_PATH_E eRFPath, u32 Offset, u32 Data)
+ * ---------------------------------------------------------------------------
+ *****************************************************************************/
+void rtl8192_phy_RFSerialWrite(struct net_device *dev,
+			       RF90_RADIO_PATH_E eRFPath, u32 offset, u32 data)
 {
 	struct r8192_priv *priv = ieee80211_priv(dev);
-	u32 DataAndAddr = 0, NewOffset = 0;
+	u32 DataAndAddr = 0, new_offset = 0;
 	BB_REGISTER_DEFINITION_T	*pPhyReg = &priv->PHYRegDef[eRFPath];
 
-	Offset &= 0x3f;
-	//spin_lock_irqsave(&priv->rf_lock, flags);
-//	down(&priv->rf_sem);
-	if (priv->rf_chip == RF_8256)
-	{
+	offset &= 0x3f;
+	if (priv->rf_chip == RF_8256) {
 
-		if (Offset >= 31)
-		{
+		if (offset >= 31) {
 			priv->RfReg0Value[eRFPath] |= 0x140;
-			rtl8192_setBBreg(dev, pPhyReg->rf3wireOffset, bMaskDWord, (priv->RfReg0Value[eRFPath] << 16));
-			NewOffset = Offset - 30;
-		}
-		else if (Offset >= 16)
-		{
+			rtl8192_setBBreg(dev, pPhyReg->rf3wireOffset,
+					 bMaskDWord,
+					 priv->RfReg0Value[eRFPath] << 16);
+			new_offset = offset - 30;
+		} else if (offset >= 16) {
 			priv->RfReg0Value[eRFPath] |= 0x100;
 			priv->RfReg0Value[eRFPath] &= (~0x40);
-			rtl8192_setBBreg(dev, pPhyReg->rf3wireOffset, bMaskDWord, (priv->RfReg0Value[eRFPath]<<16));
-			NewOffset = Offset - 15;
+			rtl8192_setBBreg(dev, pPhyReg->rf3wireOffset,
+					 bMaskDWord,
+					 priv->RfReg0Value[eRFPath]<<16);
+			new_offset = offset - 15;
+		} else {
+			new_offset = offset;
 		}
-		else
-			NewOffset = Offset;
-	}
-	else
-	{
-		RT_TRACE((COMP_PHY|COMP_ERR), "check RF type here, need to be 8256\n");
-		NewOffset = Offset;
+	} else {
+		RT_TRACE((COMP_PHY|COMP_ERR),
+			 "check RF type here, need to be 8256\n");
+		new_offset = offset;
 	}
 
-	// Put write addr in [5:0] and write data in [31:16]
-	DataAndAddr = (Data<<16) | (NewOffset&0x3f);
+	/* Put write addr in [5:0] and write data in [31:16] */
+	DataAndAddr = (data<<16) | (new_offset&0x3f);
 
-	// Write Operation
+	/* Write operation */
 	rtl8192_setBBreg(dev, pPhyReg->rf3wireOffset, bMaskDWord, DataAndAddr);
 
 
-	if(Offset==0x0)
-		priv->RfReg0Value[eRFPath] = Data;
+	if (offset == 0x0)
+		priv->RfReg0Value[eRFPath] = data;
 
-	// Switch back to Reg_Mode0;
-	if(priv->rf_chip == RF_8256)
-	{
-		if(Offset != 0)
-		{
+	/* Switch back to Reg_Mode0 */
+	if (priv->rf_chip == RF_8256) {
+		if (offset != 0) {
 			priv->RfReg0Value[eRFPath] &= 0xebf;
-			rtl8192_setBBreg(
-				dev,
-				pPhyReg->rf3wireOffset,
-				bMaskDWord,
-				(priv->RfReg0Value[eRFPath] << 16));
+			rtl8192_setBBreg(dev, pPhyReg->rf3wireOffset,
+					 bMaskDWord,
+					 priv->RfReg0Value[eRFPath] << 16);
 		}
 	}
-	//spin_unlock_irqrestore(&priv->rf_lock, flags);
-//	up(&priv->rf_sem);
 	return;
 }
 
 /******************************************************************************
- *function:  This function set specific bits to RF register
- *   input:  net_device dev
- *	     RF90_RADIO_PATH_E eRFPath //radio path of A/B/C/D
- *           u32	RegAddr  //target addr to be modified
- *           u32	BitMask  //taget bit pos in the addr to be modified
- *           u32	Data     //value to be write
- *  output:  none
- *  return:  none
- *  notice:
- * ****************************************************************************/
-void rtl8192_phy_SetRFReg(struct net_device* dev, RF90_RADIO_PATH_E eRFPath, u32 RegAddr, u32 BitMask, u32 Data)
+ * function:  This function sets specific bits to RF register
+ * input:     net_device        *dev
+ *            RF90_RADIO_PATH_E eRFPath  //radio path of A/B/C/D
+ *            u32               reg_addr //target addr to be modified
+ *            u32               bitmask  //target bit pos to be modified
+ *            u32               data     //value to be written
+ * output:    none
+ * return:    none
+ * notice:
+ *****************************************************************************/
+void rtl8192_phy_SetRFReg(struct net_device *dev, RF90_RADIO_PATH_E eRFPath,
+			  u32 reg_addr, u32 bitmask, u32 data)
 {
 	struct r8192_priv *priv = ieee80211_priv(dev);
-	u32 Original_Value, BitShift, New_Value;
-//	u8	time = 0;
+	u32 reg, bitshift;
 
 	if (!rtl8192_phy_CheckIsLegalRFPath(dev, eRFPath))
 		return;
 
-	if (priv->Rf_Mode == RF_OP_By_FW)
-	{
-		if (BitMask != bMask12Bits) // RF data is 12 bits only
-		{
-			Original_Value = phy_FwRFSerialRead(dev, eRFPath, RegAddr);
-			BitShift =  rtl8192_CalculateBitShift(BitMask);
-			New_Value = ((Original_Value) & (~BitMask)) | (Data<< BitShift);
+	if (priv->Rf_Mode == RF_OP_By_FW) {
+		if (bitmask != bMask12Bits) {
+			/* RF data is 12 bits only */
+			reg = phy_FwRFSerialRead(dev, eRFPath, reg_addr);
+			bitshift =  rtl8192_CalculateBitShift(bitmask);
+			reg &= ~bitmask;
+			reg |= data << bitshift;
 
-			phy_FwRFSerialWrite(dev, eRFPath, RegAddr, New_Value);
-		}else
-			phy_FwRFSerialWrite(dev, eRFPath, RegAddr, Data);
+			phy_FwRFSerialWrite(dev, eRFPath, reg_addr, reg);
+		} else {
+			phy_FwRFSerialWrite(dev, eRFPath, reg_addr, data);
+		}
 
 		udelay(200);
 
-	}
-	else
-	{
-		if (BitMask != bMask12Bits) // RF data is 12 bits only
-		{
-			Original_Value = rtl8192_phy_RFSerialRead(dev, eRFPath, RegAddr);
-			BitShift =  rtl8192_CalculateBitShift(BitMask);
-			New_Value = (((Original_Value) & (~BitMask)) | (Data<< BitShift));
+	} else {
+		if (bitmask != bMask12Bits) {
+			/* RF data is 12 bits only */
+			reg = rtl8192_phy_RFSerialRead(dev, eRFPath, reg_addr);
+			bitshift =  rtl8192_CalculateBitShift(bitmask);
+			reg &= ~bitmask;
+			reg |= data << bitshift;
 
-			rtl8192_phy_RFSerialWrite(dev, eRFPath, RegAddr, New_Value);
-		}else
-			rtl8192_phy_RFSerialWrite(dev, eRFPath, RegAddr, Data);
+			rtl8192_phy_RFSerialWrite(dev, eRFPath, reg_addr, reg);
+		} else {
+			rtl8192_phy_RFSerialWrite(dev, eRFPath, reg_addr, data);
+		}
 	}
 	return;
 }
 
 /******************************************************************************
- *function:  This function reads specific bits from RF register
- *   input:  net_device dev
- *           u32	RegAddr  //target addr to be readback
- *           u32	BitMask  //taget bit pos in the addr to be readback
- *  output:  none
- *  return:  u32	Data	//the readback register value
- *  notice:
- * ****************************************************************************/
-u32 rtl8192_phy_QueryRFReg(struct net_device* dev, RF90_RADIO_PATH_E eRFPath, u32 RegAddr, u32 BitMask)
+ * function:  This function reads specific bits from RF register
+ * input:     net_device        *dev
+ *            u32               reg_addr //target addr to be readback
+ *            u32               bitmask  //target bit pos to be readback
+ * output:    none
+ * return:    u32               data     //the readback register value
+ * notice:
+ *****************************************************************************/
+u32 rtl8192_phy_QueryRFReg(struct net_device *dev, RF90_RADIO_PATH_E eRFPath,
+			   u32 reg_addr, u32 bitmask)
 {
-	u32 Original_Value, Readback_Value, BitShift;
+	u32 reg, bitshift;
 	struct r8192_priv *priv = ieee80211_priv(dev);
 
 
 	if (!rtl8192_phy_CheckIsLegalRFPath(dev, eRFPath))
 		return 0;
-	if (priv->Rf_Mode == RF_OP_By_FW)
-	{
-		Original_Value = phy_FwRFSerialRead(dev, eRFPath, RegAddr);
-		BitShift =  rtl8192_CalculateBitShift(BitMask);
-		Readback_Value = (Original_Value & BitMask) >> BitShift;
+	if (priv->Rf_Mode == RF_OP_By_FW) {
+		reg = phy_FwRFSerialRead(dev, eRFPath, reg_addr);
+		bitshift =  rtl8192_CalculateBitShift(bitmask);
+		reg = (reg & bitmask) >> bitshift;
 		udelay(200);
-		return (Readback_Value);
-	}
-	else
-	{
-		Original_Value = rtl8192_phy_RFSerialRead(dev, eRFPath, RegAddr);
-		BitShift =  rtl8192_CalculateBitShift(BitMask);
-		Readback_Value = (Original_Value & BitMask) >> BitShift;
-		return (Readback_Value);
+		return reg;
+	} else {
+		reg = rtl8192_phy_RFSerialRead(dev, eRFPath, reg_addr);
+		bitshift =  rtl8192_CalculateBitShift(bitmask);
+		reg = (reg & bitmask) >> bitshift;
+		return reg;
 	}
 }
+
 /******************************************************************************
- *function:  We support firmware to execute RF-R/W.
- *   input:  dev
- *  output:  none
- *  return:  none
- *  notice:
- * ***************************************************************************/
-static	u32
-phy_FwRFSerialRead(
-	struct net_device* dev,
-	RF90_RADIO_PATH_E	eRFPath,
-	u32				Offset	)
+ * function:  We support firmware to execute RF-R/W.
+ * input:     net_device        *dev
+ *            RF90_RADIO_PATH_E eRFPath
+ *            u32               offset
+ * output:    none
+ * return:    u32
+ * notice:
+ ****************************************************************************/
+static u32 phy_FwRFSerialRead(struct net_device *dev, RF90_RADIO_PATH_E eRFPath,
+			      u32 offset)
 {
-	u32		retValue = 0;
-	u32		Data = 0;
+	u32		reg = 0;
+	u32		data = 0;
 	u8		time = 0;
-	//DbgPrint("FW RF CTRL\n\r");
-	/* 2007/11/02 MH Firmware RF Write control. By Francis' suggestion, we can
-	   not execute the scheme in the initial step. Otherwise, RF-R/W will waste
-	   much time. This is only for site survey. */
-	// 1. Read operation need not insert data. bit 0-11
-	//Data &= bMask12Bits;
-	// 2. Write RF register address. Bit 12-19
-	Data |= ((Offset&0xFF)<<12);
-	// 3. Write RF path.  bit 20-21
-	Data |= ((eRFPath&0x3)<<20);
-	// 4. Set RF read indicator. bit 22=0
-	//Data |= 0x00000;
-	// 5. Trigger Fw to operate the command. bit 31
-	Data |= 0x80000000;
-	// 6. We can not execute read operation if bit 31 is 1.
-	while (read_nic_dword(dev, QPNR)&0x80000000)
-	{
-		// If FW can not finish RF-R/W for more than ?? times. We must reset FW.
-		if (time++ < 100)
-		{
-			//DbgPrint("FW not finish RF-R Time=%d\n\r", time);
+	u32		tmp;
+
+	/* Firmware RF Write control.
+	 * We can not execute the scheme in the initial step.
+	 * Otherwise, RF-R/W will waste much time.
+	 * This is only for site survey. */
+	/* 1. Read operation need not insert data. bit 0-11 */
+	/* 2. Write RF register address. bit 12-19 */
+	data |= ((offset&0xFF)<<12);
+	/* 3. Write RF path.  bit 20-21 */
+	data |= ((eRFPath&0x3)<<20);
+	/* 4. Set RF read indicator. bit 22=0 */
+	/* 5. Trigger Fw to operate the command. bit 31 */
+	data |= 0x80000000;
+	/* 6. We can not execute read operation if bit 31 is 1. */
+	read_nic_dword(dev, QPNR, &tmp);
+	while (tmp & 0x80000000) {
+		/* If FW can not finish RF-R/W for more than ?? times.
+		   We must reset FW. */
+		if (time++ < 100) {
 			udelay(10);
-		}
-		else
+			read_nic_dword(dev, QPNR, &tmp);
+		} else {
 			break;
-	}
-	// 7. Execute read operation.
-	write_nic_dword(dev, QPNR, Data);
-	// 8. Check if firmawre send back RF content.
-	while (read_nic_dword(dev, QPNR)&0x80000000)
-	{
-		// If FW can not finish RF-R/W for more than ?? times. We must reset FW.
-		if (time++ < 100)
-		{
-			//DbgPrint("FW not finish RF-W Time=%d\n\r", time);
-			udelay(10);
 		}
-		else
-			return	(0);
 	}
-	retValue = read_nic_dword(dev, RF_DATA);
+	/* 7. Execute read operation. */
+	write_nic_dword(dev, QPNR, data);
+	/* 8. Check if firmware sends back RF content. */
+	read_nic_dword(dev, QPNR, &tmp);
+	while (tmp & 0x80000000) {
+		/* If FW can not finish RF-R/W for more than ?? times.
+		   We must reset FW. */
+		if (time++ < 100) {
+			udelay(10);
+			read_nic_dword(dev, QPNR, &tmp);
+		} else {
+			return 0;
+		}
+	}
+	read_nic_dword(dev, RF_DATA, &reg);
 
-	return	(retValue);
-
-}	/* phy_FwRFSerialRead */
+	return reg;
+}
 
 /******************************************************************************
- *function:  We support firmware to execute RF-R/W.
- *   input:  dev
- *  output:  none
- *  return:  none
- *  notice:
- * ***************************************************************************/
-static void
-phy_FwRFSerialWrite(
-		struct net_device* dev,
-		RF90_RADIO_PATH_E	eRFPath,
-		u32				Offset,
-		u32				Data	)
+ * function:  We support firmware to execute RF-R/W.
+ * input:     net_device        *dev
+ *            RF90_RADIO_PATH_E eRFPath
+ *            u32               offset
+ *            u32               data
+ * output:    none
+ * return:    none
+ * notice:
+ ****************************************************************************/
+static void phy_FwRFSerialWrite(struct net_device *dev,
+				RF90_RADIO_PATH_E eRFPath, u32 offset, u32 data)
 {
 	u8	time = 0;
+	u32	tmp;
 
-	//DbgPrint("N FW RF CTRL RF-%d OF%02x DATA=%03x\n\r", eRFPath, Offset, Data);
-	/* 2007/11/02 MH Firmware RF Write control. By Francis' suggestion, we can
-	   not execute the scheme in the initial step. Otherwise, RF-R/W will waste
-	   much time. This is only for site survey. */
+	/* Firmware RF Write control.
+	 * We can not execute the scheme in the initial step.
+	 * Otherwise, RF-R/W will waste much time.
+	 * This is only for site survey. */
 
-	// 1. Set driver write bit and 12 bit data. bit 0-11
-	//Data &= bMask12Bits;	// Done by uper layer.
-	// 2. Write RF register address. bit 12-19
-	Data |= ((Offset&0xFF)<<12);
-	// 3. Write RF path.  bit 20-21
-	Data |= ((eRFPath&0x3)<<20);
-	// 4. Set RF write indicator. bit 22=1
-	Data |= 0x400000;
-	// 5. Trigger Fw to operate the command. bit 31=1
-	Data |= 0x80000000;
+	/* 1. Set driver write bit and 12 bit data. bit 0-11 */
+	/* 2. Write RF register address. bit 12-19 */
+	data |= ((offset&0xFF)<<12);
+	/* 3. Write RF path.  bit 20-21 */
+	data |= ((eRFPath&0x3)<<20);
+	/* 4. Set RF write indicator. bit 22=1 */
+	data |= 0x400000;
+	/* 5. Trigger Fw to operate the command. bit 31=1 */
+	data |= 0x80000000;
 
-	// 6. Write operation. We can not write if bit 31 is 1.
-	while (read_nic_dword(dev, QPNR)&0x80000000)
-	{
-		// If FW can not finish RF-R/W for more than ?? times. We must reset FW.
-		if (time++ < 100)
-		{
-			//DbgPrint("FW not finish RF-W Time=%d\n\r", time);
+	/* 6. Write operation. We can not write if bit 31 is 1. */
+	read_nic_dword(dev, QPNR, &tmp);
+	while (tmp & 0x80000000) {
+		/* If FW can not finish RF-R/W for more than ?? times.
+		   We must reset FW. */
+		if (time++ < 100) {
 			udelay(10);
-		}
-		else
+			read_nic_dword(dev, QPNR, &tmp);
+		} else {
 			break;
+		}
 	}
-	// 7. No matter check bit. We always force the write. Because FW will
-	//    not accept the command.
-	write_nic_dword(dev, QPNR, Data);
-	/* 2007/11/02 MH Acoording to test, we must delay 20us to wait firmware
+	/* 7. No matter check bit. We always force the write.
+	   Because FW will not accept the command. */
+	write_nic_dword(dev, QPNR, data);
+	/* According to test, we must delay 20us to wait firmware
 	   to finish RF write operation. */
-	/* 2008/01/17 MH We support delay in firmware side now. */
-	//delay_us(20);
-
-}	/* phy_FwRFSerialWrite */
-
+	/* We support delay in firmware side now. */
+}
 
 /******************************************************************************
- *function:  This function read BB parameters from Header file we gen,
- *	     and do register read/write
- *   input:  dev
- *  output:  none
- *  return:  none
- *  notice:  BB parameters may change all the time, so please make
- *           sure it has been synced with the newest.
- * ***************************************************************************/
-void rtl8192_phy_configmac(struct net_device* dev)
+ * function:  This function reads BB parameters from header file we generate,
+ *            and does register read/write
+ * input:     net_device 	*dev
+ * output:    none
+ * return:    none
+ * notice:    BB parameters may change all the time, so please make
+ *            sure it has been synced with the newest.
+ *****************************************************************************/
+void rtl8192_phy_configmac(struct net_device *dev)
 {
 	u32 dwArrayLen = 0, i;
-	u32* pdwArray = NULL;
+	u32 *pdwArray = NULL;
 	struct r8192_priv *priv = ieee80211_priv(dev);
 
-	if(priv->btxpowerdata_readfromEEPORM)
-	{
+	if (priv->btxpowerdata_readfromEEPORM) {
 		RT_TRACE(COMP_PHY, "Rtl819XMACPHY_Array_PG\n");
 		dwArrayLen = MACPHY_Array_PGLength;
 		pdwArray = rtl819XMACPHY_Array_PG;
 
-	}
-	else
-	{
+	} else {
 		RT_TRACE(COMP_PHY, "Rtl819XMACPHY_Array\n");
 		dwArrayLen = MACPHY_ArrayLength;
 		pdwArray = rtl819XMACPHY_Array;
 	}
-	for(i = 0; i<dwArrayLen; i=i+3){
-		if(pdwArray[i] == 0x318)
-		{
+	for (i = 0; i < dwArrayLen; i = i+3) {
+		if (pdwArray[i] == 0x318) {
 			pdwArray[i+2] = 0x00000800;
-			//DbgPrint("ptrArray[i], ptrArray[i+1], ptrArray[i+2] = %x, %x, %x\n",
-			//	ptrArray[i], ptrArray[i+1], ptrArray[i+2]);
 		}
 
-		RT_TRACE(COMP_DBG, "The Rtl8190MACPHY_Array[0] is %x Rtl8190MACPHY_Array[1] is %x Rtl8190MACPHY_Array[2] is %x\n",
-				pdwArray[i], pdwArray[i+1], pdwArray[i+2]);
-		rtl8192_setBBreg(dev, pdwArray[i], pdwArray[i+1], pdwArray[i+2]);
+		RT_TRACE(COMP_DBG,
+			 "Rtl8190MACPHY_Array[0]=%x Rtl8190MACPHY_Array[1]=%x Rtl8190MACPHY_Array[2]=%x\n",
+			 pdwArray[i], pdwArray[i+1], pdwArray[i+2]);
+		rtl8192_setBBreg(dev, pdwArray[i], pdwArray[i+1],
+				 pdwArray[i+2]);
 	}
 	return;
-
 }
 
 /******************************************************************************
- *function:  This function does dirty work
- *   input:  dev
- *  output:  none
- *  return:  none
- *  notice:  BB parameters may change all the time, so please make
- *           sure it has been synced with the newest.
- * ***************************************************************************/
-
-void rtl8192_phyConfigBB(struct net_device* dev, u8 ConfigType)
+ * function:  This function does dirty work
+ * input:     net_device	*dev
+ *            u8                ConfigType
+ * output:    none
+ * return:    none
+ * notice:    BB parameters may change all the time, so please make
+ *            sure it has been synced with the newest.
+ *****************************************************************************/
+void rtl8192_phyConfigBB(struct net_device *dev, u8 ConfigType)
 {
 	u32 i;
 
 #ifdef TO_DO_LIST
 	u32 *rtl8192PhyRegArrayTable = NULL, *rtl8192AgcTabArrayTable = NULL;
-	if(Adapter->bInHctTest)
-	{
+
+	if (Adapter->bInHctTest) {
 		PHY_REGArrayLen = PHY_REGArrayLengthDTM;
 		AGCTAB_ArrayLen = AGCTAB_ArrayLengthDTM;
 		Rtl8190PHY_REGArray_Table = Rtl819XPHY_REGArrayDTM;
 		Rtl8190AGCTAB_Array_Table = Rtl819XAGCTAB_ArrayDTM;
 	}
 #endif
-	if (ConfigType == BaseBand_Config_PHY_REG)
-	{
-		for (i=0; i<PHY_REG_1T2RArrayLength; i+=2)
-		{
-			rtl8192_setBBreg(dev, rtl819XPHY_REG_1T2RArray[i], bMaskDWord, rtl819XPHY_REG_1T2RArray[i+1]);
-			RT_TRACE(COMP_DBG, "i: %x, The Rtl819xUsbPHY_REGArray[0] is %x Rtl819xUsbPHY_REGArray[1] is %x \n",i, rtl819XPHY_REG_1T2RArray[i], rtl819XPHY_REG_1T2RArray[i+1]);
+	if (ConfigType == BaseBand_Config_PHY_REG) {
+		for (i = 0; i < PHY_REG_1T2RArrayLength; i += 2) {
+			rtl8192_setBBreg(dev, rtl819XPHY_REG_1T2RArray[i],
+					 bMaskDWord,
+					 rtl819XPHY_REG_1T2RArray[i+1]);
+			RT_TRACE(COMP_DBG,
+				 "i: %x, Rtl819xUsbPHY_REGArray[0]=%x Rtl819xUsbPHY_REGArray[1]=%x\n",
+				 i, rtl819XPHY_REG_1T2RArray[i],
+				 rtl819XPHY_REG_1T2RArray[i+1]);
 		}
-	}
-	else if (ConfigType == BaseBand_Config_AGC_TAB)
-	{
-		for (i=0; i<AGCTAB_ArrayLength; i+=2)
-		{
-			rtl8192_setBBreg(dev, rtl819XAGCTAB_Array[i], bMaskDWord, rtl819XAGCTAB_Array[i+1]);
-			RT_TRACE(COMP_DBG, "i:%x, The rtl819XAGCTAB_Array[0] is %x rtl819XAGCTAB_Array[1] is %x \n",i, rtl819XAGCTAB_Array[i], rtl819XAGCTAB_Array[i+1]);
+	} else if (ConfigType == BaseBand_Config_AGC_TAB) {
+		for (i = 0; i < AGCTAB_ArrayLength; i += 2) {
+			rtl8192_setBBreg(dev, rtl819XAGCTAB_Array[i],
+					 bMaskDWord, rtl819XAGCTAB_Array[i+1]);
+			RT_TRACE(COMP_DBG,
+				 "i: %x, rtl819XAGCTAB_Array[0]=%x rtl819XAGCTAB_Array[1]=%x\n",
+				 i, rtl819XAGCTAB_Array[i],
+				 rtl819XAGCTAB_Array[i+1]);
 		}
 	}
 	return;
-
-
 }
+
 /******************************************************************************
- *function:  This function initialize Register definition offset for Radio Path
- *	     A/B/C/D
- *   input:  net_device dev
- *  output:  none
- *  return:  none
- *  notice:  Initialization value here is constant and it should never be changed
- * ***************************************************************************/
-void rtl8192_InitBBRFRegDef(struct net_device* dev)
+ * function:  This function initializes Register definition offset for
+ *            Radio Path A/B/C/D
+ * input:     net_device	*dev
+ * output:    none
+ * return:    none
+ * notice:    Initialization value here is constant and it should never
+ *            be changed
+ *****************************************************************************/
+void rtl8192_InitBBRFRegDef(struct net_device *dev)
 {
 	struct r8192_priv *priv = ieee80211_priv(dev);
-// RF Interface Software Control
-	priv->PHYRegDef[RF90_PATH_A].rfintfs = rFPGA0_XAB_RFInterfaceSW; // 16 LSBs if read 32-bit from 0x870
-	priv->PHYRegDef[RF90_PATH_B].rfintfs = rFPGA0_XAB_RFInterfaceSW; // 16 MSBs if read 32-bit from 0x870 (16-bit for 0x872)
-	priv->PHYRegDef[RF90_PATH_C].rfintfs = rFPGA0_XCD_RFInterfaceSW;// 16 LSBs if read 32-bit from 0x874
-	priv->PHYRegDef[RF90_PATH_D].rfintfs = rFPGA0_XCD_RFInterfaceSW;// 16 MSBs if read 32-bit from 0x874 (16-bit for 0x876)
 
-	// RF Interface Readback Value
-	priv->PHYRegDef[RF90_PATH_A].rfintfi = rFPGA0_XAB_RFInterfaceRB; // 16 LSBs if read 32-bit from 0x8E0
-	priv->PHYRegDef[RF90_PATH_B].rfintfi = rFPGA0_XAB_RFInterfaceRB;// 16 MSBs if read 32-bit from 0x8E0 (16-bit for 0x8E2)
-	priv->PHYRegDef[RF90_PATH_C].rfintfi = rFPGA0_XCD_RFInterfaceRB;// 16 LSBs if read 32-bit from 0x8E4
-	priv->PHYRegDef[RF90_PATH_D].rfintfi = rFPGA0_XCD_RFInterfaceRB;// 16 MSBs if read 32-bit from 0x8E4 (16-bit for 0x8E6)
+	/* RF Interface Software Control */
+	/* 16 LSBs if read 32-bit from 0x870 */
+	priv->PHYRegDef[RF90_PATH_A].rfintfs = rFPGA0_XAB_RFInterfaceSW;
+	/* 16 MSBs if read 32-bit from 0x870 (16-bit for 0x872) */
+	priv->PHYRegDef[RF90_PATH_B].rfintfs = rFPGA0_XAB_RFInterfaceSW;
+	/* 16 LSBs if read 32-bit from 0x874 */
+	priv->PHYRegDef[RF90_PATH_C].rfintfs = rFPGA0_XCD_RFInterfaceSW;
+	/* 16 MSBs if read 32-bit from 0x874 (16-bit for 0x876) */
+	priv->PHYRegDef[RF90_PATH_D].rfintfs = rFPGA0_XCD_RFInterfaceSW;
 
-	// RF Interface Output (and Enable)
-	priv->PHYRegDef[RF90_PATH_A].rfintfo = rFPGA0_XA_RFInterfaceOE; // 16 LSBs if read 32-bit from 0x860
-	priv->PHYRegDef[RF90_PATH_B].rfintfo = rFPGA0_XB_RFInterfaceOE; // 16 LSBs if read 32-bit from 0x864
-	priv->PHYRegDef[RF90_PATH_C].rfintfo = rFPGA0_XC_RFInterfaceOE;// 16 LSBs if read 32-bit from 0x868
-	priv->PHYRegDef[RF90_PATH_D].rfintfo = rFPGA0_XD_RFInterfaceOE;// 16 LSBs if read 32-bit from 0x86C
+	/* RF Interface Readback Value */
+	/* 16 LSBs if read 32-bit from 0x8E0 */
+	priv->PHYRegDef[RF90_PATH_A].rfintfi = rFPGA0_XAB_RFInterfaceRB;
+	/* 16 MSBs if read 32-bit from 0x8E0 (16-bit for 0x8E2) */
+	priv->PHYRegDef[RF90_PATH_B].rfintfi = rFPGA0_XAB_RFInterfaceRB;
+	/* 16 LSBs if read 32-bit from 0x8E4 */
+	priv->PHYRegDef[RF90_PATH_C].rfintfi = rFPGA0_XCD_RFInterfaceRB;
+	/* 16 MSBs if read 32-bit from 0x8E4 (16-bit for 0x8E6) */
+	priv->PHYRegDef[RF90_PATH_D].rfintfi = rFPGA0_XCD_RFInterfaceRB;
 
-	// RF Interface (Output and)  Enable
-	priv->PHYRegDef[RF90_PATH_A].rfintfe = rFPGA0_XA_RFInterfaceOE; // 16 MSBs if read 32-bit from 0x860 (16-bit for 0x862)
-	priv->PHYRegDef[RF90_PATH_B].rfintfe = rFPGA0_XB_RFInterfaceOE; // 16 MSBs if read 32-bit from 0x864 (16-bit for 0x866)
-	priv->PHYRegDef[RF90_PATH_C].rfintfe = rFPGA0_XC_RFInterfaceOE;// 16 MSBs if read 32-bit from 0x86A (16-bit for 0x86A)
-	priv->PHYRegDef[RF90_PATH_D].rfintfe = rFPGA0_XD_RFInterfaceOE;// 16 MSBs if read 32-bit from 0x86C (16-bit for 0x86E)
+	/* RF Interface Output (and Enable) */
+	/* 16 LSBs if read 32-bit from 0x860 */
+	priv->PHYRegDef[RF90_PATH_A].rfintfo = rFPGA0_XA_RFInterfaceOE;
+	/* 16 LSBs if read 32-bit from 0x864 */
+	priv->PHYRegDef[RF90_PATH_B].rfintfo = rFPGA0_XB_RFInterfaceOE;
+	/* 16 LSBs if read 32-bit from 0x868 */
+	priv->PHYRegDef[RF90_PATH_C].rfintfo = rFPGA0_XC_RFInterfaceOE;
+	/* 16 LSBs if read 32-bit from 0x86C */
+	priv->PHYRegDef[RF90_PATH_D].rfintfo = rFPGA0_XD_RFInterfaceOE;
 
-	//Addr of LSSI. Write RF register by driver
-	priv->PHYRegDef[RF90_PATH_A].rf3wireOffset = rFPGA0_XA_LSSIParameter; //LSSI Parameter
+	/* RF Interface (Output and) Enable */
+	/* 16 MSBs if read 32-bit from 0x860 (16-bit for 0x862) */
+	priv->PHYRegDef[RF90_PATH_A].rfintfe = rFPGA0_XA_RFInterfaceOE;
+	/* 16 MSBs if read 32-bit from 0x864 (16-bit for 0x866) */
+	priv->PHYRegDef[RF90_PATH_B].rfintfe = rFPGA0_XB_RFInterfaceOE;
+	/* 16 MSBs if read 32-bit from 0x868 (16-bit for 0x86A) */
+	priv->PHYRegDef[RF90_PATH_C].rfintfe = rFPGA0_XC_RFInterfaceOE;
+	/* 16 MSBs if read 32-bit from 0x86C (16-bit for 0x86E) */
+	priv->PHYRegDef[RF90_PATH_D].rfintfe = rFPGA0_XD_RFInterfaceOE;
+
+	/* Addr of LSSI. Write RF register by driver */
+	priv->PHYRegDef[RF90_PATH_A].rf3wireOffset = rFPGA0_XA_LSSIParameter;
 	priv->PHYRegDef[RF90_PATH_B].rf3wireOffset = rFPGA0_XB_LSSIParameter;
 	priv->PHYRegDef[RF90_PATH_C].rf3wireOffset = rFPGA0_XC_LSSIParameter;
 	priv->PHYRegDef[RF90_PATH_D].rf3wireOffset = rFPGA0_XD_LSSIParameter;
 
-	// RF parameter
-	priv->PHYRegDef[RF90_PATH_A].rfLSSI_Select = rFPGA0_XAB_RFParameter;  //BB Band Select
+	/* RF parameter */
+	/* BB Band Select */
+	priv->PHYRegDef[RF90_PATH_A].rfLSSI_Select = rFPGA0_XAB_RFParameter;
 	priv->PHYRegDef[RF90_PATH_B].rfLSSI_Select = rFPGA0_XAB_RFParameter;
 	priv->PHYRegDef[RF90_PATH_C].rfLSSI_Select = rFPGA0_XCD_RFParameter;
 	priv->PHYRegDef[RF90_PATH_D].rfLSSI_Select = rFPGA0_XCD_RFParameter;
 
-	// Tx AGC Gain Stage (same for all path. Should we remove this?)
-	priv->PHYRegDef[RF90_PATH_A].rfTxGainStage = rFPGA0_TxGainStage; //Tx gain stage
-	priv->PHYRegDef[RF90_PATH_B].rfTxGainStage = rFPGA0_TxGainStage; //Tx gain stage
-	priv->PHYRegDef[RF90_PATH_C].rfTxGainStage = rFPGA0_TxGainStage; //Tx gain stage
-	priv->PHYRegDef[RF90_PATH_D].rfTxGainStage = rFPGA0_TxGainStage; //Tx gain stage
+	/* Tx AGC Gain Stage (same for all path. Should we remove this?) */
+	priv->PHYRegDef[RF90_PATH_A].rfTxGainStage = rFPGA0_TxGainStage;
+	priv->PHYRegDef[RF90_PATH_B].rfTxGainStage = rFPGA0_TxGainStage;
+	priv->PHYRegDef[RF90_PATH_C].rfTxGainStage = rFPGA0_TxGainStage;
+	priv->PHYRegDef[RF90_PATH_D].rfTxGainStage = rFPGA0_TxGainStage;
 
-	// Tranceiver A~D HSSI Parameter-1
-	priv->PHYRegDef[RF90_PATH_A].rfHSSIPara1 = rFPGA0_XA_HSSIParameter1;  //wire control parameter1
-	priv->PHYRegDef[RF90_PATH_B].rfHSSIPara1 = rFPGA0_XB_HSSIParameter1;  //wire control parameter1
-	priv->PHYRegDef[RF90_PATH_C].rfHSSIPara1 = rFPGA0_XC_HSSIParameter1;  //wire control parameter1
-	priv->PHYRegDef[RF90_PATH_D].rfHSSIPara1 = rFPGA0_XD_HSSIParameter1;  //wire control parameter1
+	/* Transceiver A~D HSSI Parameter-1 */
+	/* wire control parameter1 */
+	priv->PHYRegDef[RF90_PATH_A].rfHSSIPara1 = rFPGA0_XA_HSSIParameter1;
+	priv->PHYRegDef[RF90_PATH_B].rfHSSIPara1 = rFPGA0_XB_HSSIParameter1;
+	priv->PHYRegDef[RF90_PATH_C].rfHSSIPara1 = rFPGA0_XC_HSSIParameter1;
+	priv->PHYRegDef[RF90_PATH_D].rfHSSIPara1 = rFPGA0_XD_HSSIParameter1;
 
-	// Tranceiver A~D HSSI Parameter-2
-	priv->PHYRegDef[RF90_PATH_A].rfHSSIPara2 = rFPGA0_XA_HSSIParameter2;  //wire control parameter2
-	priv->PHYRegDef[RF90_PATH_B].rfHSSIPara2 = rFPGA0_XB_HSSIParameter2;  //wire control parameter2
-	priv->PHYRegDef[RF90_PATH_C].rfHSSIPara2 = rFPGA0_XC_HSSIParameter2;  //wire control parameter2
-	priv->PHYRegDef[RF90_PATH_D].rfHSSIPara2 = rFPGA0_XD_HSSIParameter2;  //wire control parameter1
+	/* Transceiver A~D HSSI Parameter-2 */
+	/* wire control parameter2 */
+	priv->PHYRegDef[RF90_PATH_A].rfHSSIPara2 = rFPGA0_XA_HSSIParameter2;
+	priv->PHYRegDef[RF90_PATH_B].rfHSSIPara2 = rFPGA0_XB_HSSIParameter2;
+	priv->PHYRegDef[RF90_PATH_C].rfHSSIPara2 = rFPGA0_XC_HSSIParameter2;
+	priv->PHYRegDef[RF90_PATH_D].rfHSSIPara2 = rFPGA0_XD_HSSIParameter2;
 
-	// RF switch Control
-	priv->PHYRegDef[RF90_PATH_A].rfSwitchControl = rFPGA0_XAB_SwitchControl; //TR/Ant switch control
+	/* RF Switch Control */
+	/* TR/Ant switch control */
+	priv->PHYRegDef[RF90_PATH_A].rfSwitchControl = rFPGA0_XAB_SwitchControl;
 	priv->PHYRegDef[RF90_PATH_B].rfSwitchControl = rFPGA0_XAB_SwitchControl;
 	priv->PHYRegDef[RF90_PATH_C].rfSwitchControl = rFPGA0_XCD_SwitchControl;
 	priv->PHYRegDef[RF90_PATH_D].rfSwitchControl = rFPGA0_XCD_SwitchControl;
 
-	// AGC control 1
+	/* AGC control 1 */
 	priv->PHYRegDef[RF90_PATH_A].rfAGCControl1 = rOFDM0_XAAGCCore1;
 	priv->PHYRegDef[RF90_PATH_B].rfAGCControl1 = rOFDM0_XBAGCCore1;
 	priv->PHYRegDef[RF90_PATH_C].rfAGCControl1 = rOFDM0_XCAGCCore1;
 	priv->PHYRegDef[RF90_PATH_D].rfAGCControl1 = rOFDM0_XDAGCCore1;
 
-	// AGC control 2
+	/* AGC control 2 */
 	priv->PHYRegDef[RF90_PATH_A].rfAGCControl2 = rOFDM0_XAAGCCore2;
 	priv->PHYRegDef[RF90_PATH_B].rfAGCControl2 = rOFDM0_XBAGCCore2;
 	priv->PHYRegDef[RF90_PATH_C].rfAGCControl2 = rOFDM0_XCAGCCore2;
 	priv->PHYRegDef[RF90_PATH_D].rfAGCControl2 = rOFDM0_XDAGCCore2;
 
-	// RX AFE control 1
+	/* RX AFE control 1 */
 	priv->PHYRegDef[RF90_PATH_A].rfRxIQImbalance = rOFDM0_XARxIQImbalance;
 	priv->PHYRegDef[RF90_PATH_B].rfRxIQImbalance = rOFDM0_XBRxIQImbalance;
 	priv->PHYRegDef[RF90_PATH_C].rfRxIQImbalance = rOFDM0_XCRxIQImbalance;
 	priv->PHYRegDef[RF90_PATH_D].rfRxIQImbalance = rOFDM0_XDRxIQImbalance;
 
-	// RX AFE control 1
+	/* RX AFE control 2 */
 	priv->PHYRegDef[RF90_PATH_A].rfRxAFE = rOFDM0_XARxAFE;
 	priv->PHYRegDef[RF90_PATH_B].rfRxAFE = rOFDM0_XBRxAFE;
 	priv->PHYRegDef[RF90_PATH_C].rfRxAFE = rOFDM0_XCRxAFE;
 	priv->PHYRegDef[RF90_PATH_D].rfRxAFE = rOFDM0_XDRxAFE;
 
-	// Tx AFE control 1
+	/* Tx AFE control 1 */
 	priv->PHYRegDef[RF90_PATH_A].rfTxIQImbalance = rOFDM0_XATxIQImbalance;
 	priv->PHYRegDef[RF90_PATH_B].rfTxIQImbalance = rOFDM0_XBTxIQImbalance;
 	priv->PHYRegDef[RF90_PATH_C].rfTxIQImbalance = rOFDM0_XCTxIQImbalance;
 	priv->PHYRegDef[RF90_PATH_D].rfTxIQImbalance = rOFDM0_XDTxIQImbalance;
 
-	// Tx AFE control 2
+	/* Tx AFE control 2 */
 	priv->PHYRegDef[RF90_PATH_A].rfTxAFE = rOFDM0_XATxAFE;
 	priv->PHYRegDef[RF90_PATH_B].rfTxAFE = rOFDM0_XBTxAFE;
 	priv->PHYRegDef[RF90_PATH_C].rfTxAFE = rOFDM0_XCTxAFE;
 	priv->PHYRegDef[RF90_PATH_D].rfTxAFE = rOFDM0_XDTxAFE;
 
-	// Tranceiver LSSI Readback
+	/* Transceiver LSSI Readback */
 	priv->PHYRegDef[RF90_PATH_A].rfLSSIReadBack = rFPGA0_XA_LSSIReadBack;
 	priv->PHYRegDef[RF90_PATH_B].rfLSSIReadBack = rFPGA0_XB_LSSIReadBack;
 	priv->PHYRegDef[RF90_PATH_C].rfLSSIReadBack = rFPGA0_XC_LSSIReadBack;
 	priv->PHYRegDef[RF90_PATH_D].rfLSSIReadBack = rFPGA0_XD_LSSIReadBack;
-
 }
+
 /******************************************************************************
- *function:  This function is to write register and then readback to make sure whether BB and RF is OK
- *   input:  net_device dev
- *	     HW90_BLOCK_E CheckBlock
- *	     RF90_RADIO_PATH_E eRFPath  //only used when checkblock is HW90_BLOCK_RF
- *  output:  none
- *  return:  return whether BB and RF is ok(0:OK; 1:Fail)
- *  notice:  This function may be removed in the ASIC
- * ***************************************************************************/
-u8 rtl8192_phy_checkBBAndRF(struct net_device* dev, HW90_BLOCK_E CheckBlock, RF90_RADIO_PATH_E eRFPath)
+ * function:  This function is to write register and then readback to make
+ *            sure whether BB and RF is OK
+ * input:     net_device        *dev
+ *            HW90_BLOCK_E      CheckBlock
+ *            RF90_RADIO_PATH_E eRFPath  //only used when checkblock is
+ *                                       //HW90_BLOCK_RF
+ * output:    none
+ * return:    return whether BB and RF is ok (0:OK, 1:Fail)
+ * notice:    This function may be removed in the ASIC
+ ******************************************************************************/
+u8 rtl8192_phy_checkBBAndRF(struct net_device *dev, HW90_BLOCK_E CheckBlock,
+			    RF90_RADIO_PATH_E eRFPath)
 {
-//	struct r8192_priv *priv = ieee80211_priv(dev);
-//	BB_REGISTER_DEFINITION_T *pPhyReg = &priv->PHYRegDef[eRFPath];
 	u8 ret = 0;
-	u32 i, CheckTimes = 4, dwRegRead = 0;
+	u32 i, CheckTimes = 4, reg = 0;
 	u32 WriteAddr[4];
 	u32 WriteData[] = {0xfffff027, 0xaa55a02f, 0x00000027, 0x55aa502f};
-	// Initialize register address offset to be checked
+
+	/* Initialize register address offset to be checked */
 	WriteAddr[HW90_BLOCK_MAC] = 0x100;
 	WriteAddr[HW90_BLOCK_PHY0] = 0x900;
 	WriteAddr[HW90_BLOCK_PHY1] = 0x800;
 	WriteAddr[HW90_BLOCK_RF] = 0x3;
-	RT_TRACE(COMP_PHY, "=======>%s(), CheckBlock:%d\n", __FUNCTION__, CheckBlock);
-	for(i=0 ; i < CheckTimes ; i++)
-	{
+	RT_TRACE(COMP_PHY, "%s(), CheckBlock: %d\n", __func__, CheckBlock);
+	for (i = 0; i < CheckTimes; i++) {
 
-		//
-		// Write Data to register and readback
-		//
-		switch(CheckBlock)
-		{
+		/* Write data to register and readback */
+		switch (CheckBlock) {
 		case HW90_BLOCK_MAC:
-			RT_TRACE(COMP_ERR, "PHY_CheckBBRFOK(): Never Write 0x100 here!");
+			RT_TRACE(COMP_ERR,
+				 "PHY_CheckBBRFOK(): Never Write 0x100 here!\n");
 			break;
 
 		case HW90_BLOCK_PHY0:
 		case HW90_BLOCK_PHY1:
-			write_nic_dword(dev, WriteAddr[CheckBlock], WriteData[i]);
-			dwRegRead = read_nic_dword(dev, WriteAddr[CheckBlock]);
+			write_nic_dword(dev, WriteAddr[CheckBlock],
+					WriteData[i]);
+			read_nic_dword(dev, WriteAddr[CheckBlock], &reg);
 			break;
 
 		case HW90_BLOCK_RF:
 			WriteData[i] &= 0xfff;
-			rtl8192_phy_SetRFReg(dev, eRFPath, WriteAddr[HW90_BLOCK_RF], bMask12Bits, WriteData[i]);
-			// TODO: we should not delay for such a long time. Ask SD3
-			msleep(1);
-			dwRegRead = rtl8192_phy_QueryRFReg(dev, eRFPath, WriteAddr[HW90_BLOCK_RF], bMask12Bits);
-			msleep(1);
+			rtl8192_phy_SetRFReg(dev, eRFPath,
+					     WriteAddr[HW90_BLOCK_RF],
+					     bMask12Bits, WriteData[i]);
+			/* TODO: we should not delay for such a long time.
+			   Ask SD3 */
+			usleep_range(1000, 1000);
+			reg = rtl8192_phy_QueryRFReg(dev, eRFPath,
+						     WriteAddr[HW90_BLOCK_RF],
+						     bMask12Bits);
+			usleep_range(1000, 1000);
 			break;
 
 		default:
@@ -737,12 +759,11 @@
 		}
 
 
-		//
-		// Check whether readback data is correct
-		//
-		if(dwRegRead != WriteData[i])
-		{
-			RT_TRACE((COMP_PHY|COMP_ERR), "====>error=====dwRegRead: %x, WriteData: %x \n", dwRegRead, WriteData[i]);
+		/* Check whether readback data is correct */
+		if (reg != WriteData[i]) {
+			RT_TRACE((COMP_PHY|COMP_ERR),
+				 "error reg: %x, WriteData: %x\n",
+				 reg, WriteData[i]);
 			ret = 1;
 			break;
 		}
@@ -751,179 +772,193 @@
 	return ret;
 }
 
-
 /******************************************************************************
- *function:  This function initialize BB&RF
- *   input:  net_device dev
- *  output:  none
- *  return:  none
- *  notice:  Initialization value may change all the time, so please make
- *           sure it has been synced with the newest.
- * ***************************************************************************/
-void rtl8192_BB_Config_ParaFile(struct net_device* dev)
+ * function:  This function initializes BB&RF
+ * input:     net_device	*dev
+ * output:    none
+ * return:    none
+ * notice:    Initialization value may change all the time, so please make
+ *            sure it has been synced with the newest.
+ ******************************************************************************/
+void rtl8192_BB_Config_ParaFile(struct net_device *dev)
 {
 	struct r8192_priv *priv = ieee80211_priv(dev);
-	u8 bRegValue = 0, eCheckItem = 0, rtStatus = 0;
-	u32 dwRegValue = 0;
+	u8 reg_u8 = 0, eCheckItem = 0, status = 0;
+	u32 reg_u32 = 0;
+
 	/**************************************
-	//<1>Initialize BaseBand
-	**************************************/
+	 * <1> Initialize BaseBand
+	 *************************************/
 
-	/*--set BB Global Reset--*/
-	bRegValue = read_nic_byte(dev, BB_GLOBAL_RESET);
-	write_nic_byte(dev, BB_GLOBAL_RESET,(bRegValue|BB_GLOBAL_RESET_BIT));
+	/* --set BB Global Reset-- */
+	read_nic_byte(dev, BB_GLOBAL_RESET, &reg_u8);
+	write_nic_byte(dev, BB_GLOBAL_RESET, (reg_u8|BB_GLOBAL_RESET_BIT));
 	mdelay(50);
-	/*---set BB reset Active---*/
-	dwRegValue = read_nic_dword(dev, CPU_GEN);
-	write_nic_dword(dev, CPU_GEN, (dwRegValue&(~CPU_GEN_BB_RST)));
+	/* ---set BB reset Active--- */
+	read_nic_dword(dev, CPU_GEN, &reg_u32);
+	write_nic_dword(dev, CPU_GEN, (reg_u32&(~CPU_GEN_BB_RST)));
 
-	/*----Ckeck FPGAPHY0 and PHY1 board is OK----*/
-	// TODO: this function should be removed on ASIC , Emily 2007.2.2
-	for(eCheckItem=(HW90_BLOCK_E)HW90_BLOCK_PHY0; eCheckItem<=HW90_BLOCK_PHY1; eCheckItem++)
-	{
-		rtStatus  = rtl8192_phy_checkBBAndRF(dev, (HW90_BLOCK_E)eCheckItem, (RF90_RADIO_PATH_E)0); //don't care RF path
-		if(rtStatus != 0)
-		{
-			RT_TRACE((COMP_ERR | COMP_PHY), "PHY_RF8256_Config():Check PHY%d Fail!!\n", eCheckItem-1);
-			return ;
+	/* ----Check FPGAPHY0 and PHY1 board is OK---- */
+	/* TODO: this function should be removed on ASIC */
+	for (eCheckItem = (HW90_BLOCK_E)HW90_BLOCK_PHY0;
+	     eCheckItem <= HW90_BLOCK_PHY1; eCheckItem++) {
+		/* don't care RF path */
+		status = rtl8192_phy_checkBBAndRF(dev, (HW90_BLOCK_E)eCheckItem,
+						  (RF90_RADIO_PATH_E)0);
+		if (status != 0) {
+			RT_TRACE((COMP_ERR | COMP_PHY),
+				 "PHY_RF8256_Config(): Check PHY%d Fail!!\n",
+				 eCheckItem-1);
+			return;
 		}
 	}
-	/*---- Set CCK and OFDM Block "OFF"----*/
+	/* ---- Set CCK and OFDM Block "OFF"---- */
 	rtl8192_setBBreg(dev, rFPGA0_RFMOD, bCCKEn|bOFDMEn, 0x0);
-	/*----BB Register Initilazation----*/
-	//==m==>Set PHY REG From Header<==m==
+	/* ----BB Register Initialization---- */
+	/* ==m==>Set PHY REG From Header<==m== */
 	rtl8192_phyConfigBB(dev, BaseBand_Config_PHY_REG);
 
-	/*----Set BB reset de-Active----*/
-	dwRegValue = read_nic_dword(dev, CPU_GEN);
-	write_nic_dword(dev, CPU_GEN, (dwRegValue|CPU_GEN_BB_RST));
+	/* ----Set BB reset de-Active---- */
+	read_nic_dword(dev, CPU_GEN, &reg_u32);
+	write_nic_dword(dev, CPU_GEN, (reg_u32|CPU_GEN_BB_RST));
 
-	/*----BB AGC table Initialization----*/
-	//==m==>Set PHY REG From Header<==m==
+	/* ----BB AGC table Initialization---- */
+	/* ==m==>Set PHY REG From Header<==m== */
 	rtl8192_phyConfigBB(dev, BaseBand_Config_AGC_TAB);
 
-	/*----Enable XSTAL ----*/
+	/* ----Enable XSTAL ---- */
 	write_nic_byte_E(dev, 0x5e, 0x00);
-	if (priv->card_8192_version == (u8)VERSION_819xU_A)
-	{
-		//Antenna gain offset from B/C/D to A
-		dwRegValue = (priv->AntennaTxPwDiff[1]<<4 | priv->AntennaTxPwDiff[0]);
-		rtl8192_setBBreg(dev, rFPGA0_TxGainStage, (bXBTxAGC|bXCTxAGC), dwRegValue);
+	if (priv->card_8192_version == (u8)VERSION_819xU_A) {
+		/* Antenna gain offset from B/C/D to A */
+		reg_u32 = (priv->AntennaTxPwDiff[1]<<4 |
+			   priv->AntennaTxPwDiff[0]);
+		rtl8192_setBBreg(dev, rFPGA0_TxGainStage, (bXBTxAGC|bXCTxAGC),
+				 reg_u32);
 
-		//XSTALLCap
-		dwRegValue = priv->CrystalCap & 0xf;
-		rtl8192_setBBreg(dev, rFPGA0_AnalogParameter1, bXtalCap, dwRegValue);
+		/* XSTALLCap */
+		reg_u32 = priv->CrystalCap & 0xf;
+		rtl8192_setBBreg(dev, rFPGA0_AnalogParameter1, bXtalCap,
+				 reg_u32);
 	}
 
-	// Check if the CCK HighPower is turned ON.
-	// This is used to calculate PWDB.
-	priv->bCckHighPower = (u8)(rtl8192_QueryBBReg(dev, rFPGA0_XA_HSSIParameter2, 0x200));
+	/* Check if the CCK HighPower is turned ON.
+	   This is used to calculate PWDB. */
+	priv->bCckHighPower = (u8)rtl8192_QueryBBReg(dev,
+						     rFPGA0_XA_HSSIParameter2,
+						     0x200);
 	return;
 }
+
 /******************************************************************************
- *function:  This function initialize BB&RF
- *   input:  net_device dev
- *  output:  none
- *  return:  none
- *  notice:  Initialization value may change all the time, so please make
- *           sure it has been synced with the newest.
- * ***************************************************************************/
-void rtl8192_BBConfig(struct net_device* dev)
+ * function:  This function initializes BB&RF
+ * input:     net_device	*dev
+ * output:    none
+ * return:    none
+ * notice:    Initialization value may change all the time, so please make
+ *            sure it has been synced with the newest.
+ *****************************************************************************/
+void rtl8192_BBConfig(struct net_device *dev)
 {
 	rtl8192_InitBBRFRegDef(dev);
-	//config BB&RF. As hardCode based initialization has not been well
-	//implemented, so use file first.FIXME:should implement it for hardcode?
+	/* config BB&RF. As hardCode based initialization has not been well
+	 * implemented, so use file first.
+	 * FIXME: should implement it for hardcode? */
 	rtl8192_BB_Config_ParaFile(dev);
 	return;
 }
 
+
 /******************************************************************************
- *function:  This function obtains the initialization value of Tx power Level offset
- *   input:  net_device dev
- *  output:  none
- *  return:  none
- * ***************************************************************************/
-void rtl8192_phy_getTxPower(struct net_device* dev)
+ * function:  This function obtains the initialization value of Tx power Level
+ *            offset
+ * input:     net_device	*dev
+ * output:    none
+ * return:    none
+ *****************************************************************************/
+void rtl8192_phy_getTxPower(struct net_device *dev)
 {
 	struct r8192_priv *priv = ieee80211_priv(dev);
-	priv->MCSTxPowerLevelOriginalOffset[0] =
-		read_nic_dword(dev, rTxAGC_Rate18_06);
-	priv->MCSTxPowerLevelOriginalOffset[1] =
-		read_nic_dword(dev, rTxAGC_Rate54_24);
-	priv->MCSTxPowerLevelOriginalOffset[2] =
-		read_nic_dword(dev, rTxAGC_Mcs03_Mcs00);
-	priv->MCSTxPowerLevelOriginalOffset[3] =
-		read_nic_dword(dev, rTxAGC_Mcs07_Mcs04);
-	priv->MCSTxPowerLevelOriginalOffset[4] =
-		read_nic_dword(dev, rTxAGC_Mcs11_Mcs08);
-	priv->MCSTxPowerLevelOriginalOffset[5] =
-		read_nic_dword(dev, rTxAGC_Mcs15_Mcs12);
+	u8 tmp;
 
-	// read rx initial gain
-	priv->DefaultInitialGain[0] = read_nic_byte(dev, rOFDM0_XAAGCCore1);
-	priv->DefaultInitialGain[1] = read_nic_byte(dev, rOFDM0_XBAGCCore1);
-	priv->DefaultInitialGain[2] = read_nic_byte(dev, rOFDM0_XCAGCCore1);
-	priv->DefaultInitialGain[3] = read_nic_byte(dev, rOFDM0_XDAGCCore1);
-	RT_TRACE(COMP_INIT, "Default initial gain (c50=0x%x, c58=0x%x, c60=0x%x, c68=0x%x) \n",
-		priv->DefaultInitialGain[0], priv->DefaultInitialGain[1],
-		priv->DefaultInitialGain[2], priv->DefaultInitialGain[3]);
+	read_nic_dword(dev, rTxAGC_Rate18_06,
+		       &priv->MCSTxPowerLevelOriginalOffset[0]);
+	read_nic_dword(dev, rTxAGC_Rate54_24,
+		       &priv->MCSTxPowerLevelOriginalOffset[1]);
+	read_nic_dword(dev, rTxAGC_Mcs03_Mcs00,
+		       &priv->MCSTxPowerLevelOriginalOffset[2]);
+	read_nic_dword(dev, rTxAGC_Mcs07_Mcs04,
+		       &priv->MCSTxPowerLevelOriginalOffset[3]);
+	read_nic_dword(dev, rTxAGC_Mcs11_Mcs08,
+		       &priv->MCSTxPowerLevelOriginalOffset[4]);
+	read_nic_dword(dev, rTxAGC_Mcs15_Mcs12,
+		       &priv->MCSTxPowerLevelOriginalOffset[5]);
 
-	// read framesync
-	priv->framesync = read_nic_byte(dev, rOFDM0_RxDetector3);
-	priv->framesyncC34 = read_nic_byte(dev, rOFDM0_RxDetector2);
+	/* Read rx initial gain */
+	read_nic_byte(dev, rOFDM0_XAAGCCore1, &priv->DefaultInitialGain[0]);
+	read_nic_byte(dev, rOFDM0_XBAGCCore1, &priv->DefaultInitialGain[1]);
+	read_nic_byte(dev, rOFDM0_XCAGCCore1, &priv->DefaultInitialGain[2]);
+	read_nic_byte(dev, rOFDM0_XDAGCCore1, &priv->DefaultInitialGain[3]);
+	RT_TRACE(COMP_INIT,
+		 "Default initial gain (c50=0x%x, c58=0x%x, c60=0x%x, c68=0x%x)\n",
+		 priv->DefaultInitialGain[0], priv->DefaultInitialGain[1],
+		 priv->DefaultInitialGain[2], priv->DefaultInitialGain[3]);
+
+	/* Read framesync */
+	read_nic_byte(dev, rOFDM0_RxDetector3, &priv->framesync);
+	read_nic_byte(dev, rOFDM0_RxDetector2, &tmp);
+	priv->framesyncC34 = tmp;
 	RT_TRACE(COMP_INIT, "Default framesync (0x%x) = 0x%x \n",
 		rOFDM0_RxDetector3, priv->framesync);
 
-	// read SIFS (save the value read fome MACPHY_REG.txt)
-	priv->SifsTime = read_nic_word(dev, SIFS);
+	/* Read SIFS (save the value read from MACPHY_REG.txt) */
+	read_nic_word(dev, SIFS, &priv->SifsTime);
 
 	return;
 }
 
 /******************************************************************************
- *function:  This function obtains the initialization value of Tx power Level offset
- *   input:  net_device dev
- *  output:  none
- *  return:  none
- * ***************************************************************************/
-void rtl8192_phy_setTxPower(struct net_device* dev, u8 channel)
+ * function:  This function sets the initialization value of Tx power Level
+ *            offset
+ * input:     net_device        *dev
+ *            u8                channel
+ * output:    none
+ * return:    none
+ ******************************************************************************/
+void rtl8192_phy_setTxPower(struct net_device *dev, u8 channel)
 {
 	struct r8192_priv *priv = ieee80211_priv(dev);
 	u8	powerlevel = priv->TxPowerLevelCCK[channel-1];
 	u8	powerlevelOFDM24G = priv->TxPowerLevelOFDM24G[channel-1];
 
-	switch(priv->rf_chip)
-	{
+	switch (priv->rf_chip) {
 	case RF_8256:
-		PHY_SetRF8256CCKTxPower(dev, powerlevel); //need further implement
+		/* need further implement */
+		PHY_SetRF8256CCKTxPower(dev, powerlevel);
 		PHY_SetRF8256OFDMTxPower(dev, powerlevelOFDM24G);
 		break;
 	default:
-//	case RF_8225:
-//	case RF_8258:
-		RT_TRACE((COMP_PHY|COMP_ERR), "error RF chipID(8225 or 8258) in function %s()\n", __FUNCTION__);
+		RT_TRACE((COMP_PHY|COMP_ERR),
+			 "error RF chipID(8225 or 8258) in function %s()\n",
+			 __func__);
 		break;
 	}
 	return;
 }
 
 /******************************************************************************
- *function:  This function check Rf chip to do RF config
- *   input:  net_device dev
- *  output:  none
- *  return:  only 8256 is supported
- * ***************************************************************************/
-void rtl8192_phy_RFConfig(struct net_device* dev)
+ * function:  This function checks Rf chip to do RF config
+ * input:     net_device	*dev
+ * output:    none
+ * return:    only 8256 is supported
+ ******************************************************************************/
+void rtl8192_phy_RFConfig(struct net_device *dev)
 {
 	struct r8192_priv *priv = ieee80211_priv(dev);
 
-	switch(priv->rf_chip)
-	{
+	switch (priv->rf_chip) {
 		case RF_8256:
 			PHY_RF8256_Config(dev);
 			break;
-	//	case RF_8225:
-	//	case RF_8258:
 		default:
 			RT_TRACE(COMP_ERR, "error chip id\n");
 			break;
@@ -932,75 +967,89 @@
 }
 
 /******************************************************************************
- *function:  This function update Initial gain
- *   input:  net_device dev
- *  output:  none
- *  return:  As Windows has not implemented this, wait for complement
- * ***************************************************************************/
-void rtl8192_phy_updateInitGain(struct net_device* dev)
+ * function:  This function updates Initial gain
+ * input:     net_device	*dev
+ * output:    none
+ * return:    As Windows has not implemented this, wait for complement
+ ******************************************************************************/
+void rtl8192_phy_updateInitGain(struct net_device *dev)
 {
 	return;
 }
 
 /******************************************************************************
- *function:  This function read RF parameters from general head file, and do RF 3-wire
- *   input:  net_device dev
- *  output:  none
- *  return:  return code show if RF configuration is successful(0:pass, 1:fail)
- *    Note:  Delay may be required for RF configuration
- * ***************************************************************************/
-u8 rtl8192_phy_ConfigRFWithHeaderFile(struct net_device* dev, RF90_RADIO_PATH_E	eRFPath)
+ * function:  This function read RF parameters from general head file,
+ *            and do RF 3-wire
+ * input:     net_device	*dev
+ *            RF90_RADIO_PATH_E eRFPath
+ * output:    none
+ * return:    return code show if RF configuration is successful(0:pass, 1:fail)
+ * notice:    Delay may be required for RF configuration
+ *****************************************************************************/
+u8 rtl8192_phy_ConfigRFWithHeaderFile(struct net_device *dev,
+				      RF90_RADIO_PATH_E	eRFPath)
 {
 
 	int i;
-	//u32* pRFArray;
 	u8 ret = 0;
 
-	switch(eRFPath){
+	switch (eRFPath) {
 	case RF90_PATH_A:
-		for(i = 0;i<RadioA_ArrayLength; i=i+2){
+		for (i = 0; i < RadioA_ArrayLength; i = i+2) {
 
-			if(rtl819XRadioA_Array[i] == 0xfe){
-					mdelay(100);
-					continue;
+			if (rtl819XRadioA_Array[i] == 0xfe) {
+				mdelay(100);
+				continue;
 			}
-			rtl8192_phy_SetRFReg(dev, eRFPath, rtl819XRadioA_Array[i], bMask12Bits, rtl819XRadioA_Array[i+1]);
+			rtl8192_phy_SetRFReg(dev, eRFPath,
+					     rtl819XRadioA_Array[i],
+					     bMask12Bits,
+					     rtl819XRadioA_Array[i+1]);
 			mdelay(1);
 
 		}
 		break;
 	case RF90_PATH_B:
-		for(i = 0;i<RadioB_ArrayLength; i=i+2){
+		for (i = 0; i < RadioB_ArrayLength; i = i+2) {
 
-			if(rtl819XRadioB_Array[i] == 0xfe){
-					mdelay(100);
-					continue;
+			if (rtl819XRadioB_Array[i] == 0xfe) {
+				mdelay(100);
+				continue;
 			}
-			rtl8192_phy_SetRFReg(dev, eRFPath, rtl819XRadioB_Array[i], bMask12Bits, rtl819XRadioB_Array[i+1]);
+			rtl8192_phy_SetRFReg(dev, eRFPath,
+					     rtl819XRadioB_Array[i],
+					     bMask12Bits,
+					     rtl819XRadioB_Array[i+1]);
 			mdelay(1);
 
 		}
 		break;
 	case RF90_PATH_C:
-		for(i = 0;i<RadioC_ArrayLength; i=i+2){
+		for (i = 0; i < RadioC_ArrayLength; i = i+2) {
 
-			if(rtl819XRadioC_Array[i] == 0xfe){
-					mdelay(100);
-					continue;
+			if (rtl819XRadioC_Array[i] == 0xfe) {
+				mdelay(100);
+				continue;
 			}
-			rtl8192_phy_SetRFReg(dev, eRFPath, rtl819XRadioC_Array[i], bMask12Bits, rtl819XRadioC_Array[i+1]);
+			rtl8192_phy_SetRFReg(dev, eRFPath,
+					     rtl819XRadioC_Array[i],
+					     bMask12Bits,
+					     rtl819XRadioC_Array[i+1]);
 			mdelay(1);
 
 		}
 		break;
 	case RF90_PATH_D:
-		for(i = 0;i<RadioD_ArrayLength; i=i+2){
+		for (i = 0; i < RadioD_ArrayLength; i = i+2) {
 
-			if(rtl819XRadioD_Array[i] == 0xfe){
-					mdelay(100);
-					continue;
+			if (rtl819XRadioD_Array[i] == 0xfe) {
+				mdelay(100);
+				continue;
 			}
-			rtl8192_phy_SetRFReg(dev, eRFPath, rtl819XRadioD_Array[i], bMask12Bits, rtl819XRadioD_Array[i+1]);
+			rtl8192_phy_SetRFReg(dev, eRFPath,
+					     rtl819XRadioD_Array[i],
+					     bMask12Bits,
+					     rtl819XRadioD_Array[i+1]);
 			mdelay(1);
 
 		}
@@ -1012,22 +1061,22 @@
 	return ret;
 
 }
+
 /******************************************************************************
- *function:  This function set Tx Power of the channel
- *   input:  struct net_device *dev
- *	     u8			channel
- *  output:  none
- *  return:  none
- *    Note:
- * ***************************************************************************/
+ * function:  This function sets Tx Power of the channel
+ * input:     net_device        *dev
+ *            u8                channel
+ * output:    none
+ * return:    none
+ * notice:
+ ******************************************************************************/
 void rtl8192_SetTxPowerLevel(struct net_device *dev, u8 channel)
 {
 	struct r8192_priv *priv = ieee80211_priv(dev);
 	u8	powerlevel = priv->TxPowerLevelCCK[channel-1];
 	u8	powerlevelOFDM24G = priv->TxPowerLevelOFDM24G[channel-1];
 
-	switch(priv->rf_chip)
-	{
+	switch (priv->rf_chip) {
 	case RF_8225:
 #ifdef TO_DO_LIST
 		PHY_SetRF8225CckTxPower(Adapter, powerlevel);
@@ -1043,136 +1092,132 @@
 	case RF_8258:
 		break;
 	default:
-		RT_TRACE(COMP_ERR, "unknown rf chip ID in rtl8192_SetTxPowerLevel()\n");
+		RT_TRACE(COMP_ERR, "unknown rf chip ID in %s()\n", __func__);
 		break;
 	}
 	return;
 }
 
 /******************************************************************************
- *function:  This function set RF state on or off
- *   input:  struct net_device *dev
- *	     RT_RF_POWER_STATE eRFPowerState  //Power State to set
- *  output:  none
- *  return:  none
- *    Note:
- * ***************************************************************************/
-bool rtl8192_SetRFPowerState(struct net_device *dev, RT_RF_POWER_STATE eRFPowerState)
+ * function:  This function sets RF state on or off
+ * input:     net_device         *dev
+ *            RT_RF_POWER_STATE  eRFPowerState  //Power State to set
+ * output:    none
+ * return:    false if state is unchanged, in progress or unknown, else true
+ * notice:
+ *****************************************************************************/
+bool rtl8192_SetRFPowerState(struct net_device *dev,
+			     RT_RF_POWER_STATE eRFPowerState)
 {
 	bool				bResult = true;
-//	u8					eRFPath;
 	struct r8192_priv *priv = ieee80211_priv(dev);
 
-	if(eRFPowerState == priv->ieee80211->eRFPowerState)
+	if (eRFPowerState == priv->ieee80211->eRFPowerState)
 		return false;
 
-	if(priv->SetRFPowerStateInProgress == true)
+	if (priv->SetRFPowerStateInProgress == true)
 		return false;
 
 	priv->SetRFPowerStateInProgress = true;
 
-	switch(priv->rf_chip)
-	{
+	switch (priv->rf_chip) {
+	case RF_8256:
+		switch (eRFPowerState) {
+		case eRfOn:
+			/* RF-A, RF-B */
+			/* enable RF-Chip A/B - 0x860[4] */
+			rtl8192_setBBreg(dev, rFPGA0_XA_RFInterfaceOE, BIT4,
+					 0x1);
+			/* analog to digital on - 0x88c[9:8] */
+			rtl8192_setBBreg(dev, rFPGA0_AnalogParameter4, 0x300,
+					 0x3);
+			/* digital to analog on - 0x880[4:3] */
+			rtl8192_setBBreg(dev, rFPGA0_AnalogParameter1, 0x18,
+					 0x3);
+			/* rx antenna on - 0xc04[1:0] */
+			rtl8192_setBBreg(dev, rOFDM0_TRxPathEnable, 0x3, 0x3);
+			/* rx antenna on - 0xd04[1:0] */
+			rtl8192_setBBreg(dev, rOFDM1_TRxPathEnable, 0x3, 0x3);
+			/* analog to digital part2 on - 0x880[6:5] */
+			rtl8192_setBBreg(dev, rFPGA0_AnalogParameter1, 0x60,
+					 0x3);
+
+			break;
+
+		case eRfSleep:
+
+			break;
+
+		case eRfOff:
+			/* RF-A, RF-B */
+			/* disable RF-Chip A/B - 0x860[4] */
+			rtl8192_setBBreg(dev, rFPGA0_XA_RFInterfaceOE, BIT4,
+					 0x0);
+			/* analog to digital off, for power save */
+			rtl8192_setBBreg(dev, rFPGA0_AnalogParameter4, 0xf00,
+					 0x0); /* 0x88c[11:8] */
+			/* digital to analog off, for power save - 0x880[4:3] */
+			rtl8192_setBBreg(dev, rFPGA0_AnalogParameter1, 0x18,
+					 0x0);
+			/* rx antenna off - 0xc04[3:0] */
+			rtl8192_setBBreg(dev, rOFDM0_TRxPathEnable, 0xf, 0x0);
+			/* rx antenna off - 0xd04[3:0] */
+			rtl8192_setBBreg(dev, rOFDM1_TRxPathEnable, 0xf, 0x0);
+			/* analog to digital part2 off, for power save */
+			rtl8192_setBBreg(dev, rFPGA0_AnalogParameter1, 0x60,
+					 0x0); /* 0x880[6:5] */
+
+			break;
+
+		default:
+			bResult = false;
+			RT_TRACE(COMP_ERR, "%s(): unknown state to set: 0x%X\n",
+				 __func__, eRFPowerState);
+			break;
+		}
+		break;
+	default:
+		RT_TRACE(COMP_ERR, "Not support rf_chip(%x)\n", priv->rf_chip);
+		break;
+	}
+#ifdef TO_DO_LIST
+	if (bResult) {
+		/* Update current RF state variable. */
+		pHalData->eRFPowerState = eRFPowerState;
+		switch (pHalData->RFChipID) {
 		case RF_8256:
-		switch( eRFPowerState )
-		{
-			case eRfOn:
-	//RF-A, RF-B
-					//enable RF-Chip A/B
-					rtl8192_setBBreg(dev, rFPGA0_XA_RFInterfaceOE, BIT4, 0x1);	// 0x860[4]
-					//analog to digital on
-					rtl8192_setBBreg(dev, rFPGA0_AnalogParameter4, 0x300, 0x3);// 0x88c[9:8]
-					//digital to analog on
-					rtl8192_setBBreg(dev, rFPGA0_AnalogParameter1, 0x18, 0x3); // 0x880[4:3]
-					//rx antenna on
-					rtl8192_setBBreg(dev, rOFDM0_TRxPathEnable, 0x3, 0x3);// 0xc04[1:0]
-					//rx antenna on
-					rtl8192_setBBreg(dev, rOFDM1_TRxPathEnable, 0x3, 0x3);// 0xd04[1:0]
-					//analog to digital part2 on
-					rtl8192_setBBreg(dev, rFPGA0_AnalogParameter1, 0x60, 0x3); // 0x880[6:5]
-
-				break;
-
-			case eRfSleep:
-
-				break;
-
+			switch (pHalData->eRFPowerState) {
 			case eRfOff:
-					//RF-A, RF-B
-					//disable RF-Chip A/B
-					rtl8192_setBBreg(dev, rFPGA0_XA_RFInterfaceOE, BIT4, 0x0);	// 0x860[4]
-					//analog to digital off, for power save
-					rtl8192_setBBreg(dev, rFPGA0_AnalogParameter4, 0xf00, 0x0);// 0x88c[11:8]
-					//digital to analog off, for power save
-					rtl8192_setBBreg(dev, rFPGA0_AnalogParameter1, 0x18, 0x0); // 0x880[4:3]
-					//rx antenna off
-					rtl8192_setBBreg(dev, rOFDM0_TRxPathEnable, 0xf, 0x0);// 0xc04[3:0]
-					//rx antenna off
-					rtl8192_setBBreg(dev, rOFDM1_TRxPathEnable, 0xf, 0x0);// 0xd04[3:0]
-					//analog to digital part2 off, for power save
-					rtl8192_setBBreg(dev, rFPGA0_AnalogParameter1, 0x60, 0x0); // 0x880[6:5]
+				/* If Rf off reason is from IPS,
+				   LED should blink with no link */
+				if (pMgntInfo->RfOffReason == RF_CHANGE_BY_IPS)
+					Adapter->HalFunc.LedControlHandler(Adapter, LED_CTL_NO_LINK);
+				else
+					/* Turn off LED if RF is not ON. */
+					Adapter->HalFunc.LedControlHandler(Adapter, LED_CTL_POWER_OFF);
+				break;
 
+			case eRfOn:
+				/* Turn on RF we are still linked, which might
+				   happen when we quickly turn off and on HW RF.
+				 */
+				if (pMgntInfo->bMediaConnect == TRUE)
+					Adapter->HalFunc.LedControlHandler(Adapter, LED_CTL_LINK);
+				else
+					/* Turn off LED if RF is not ON. */
+					Adapter->HalFunc.LedControlHandler(Adapter, LED_CTL_NO_LINK);
 				break;
 
 			default:
-				bResult = false;
-				RT_TRACE(COMP_ERR, "SetRFPowerState819xUsb(): unknow state to set: 0x%X!!!\n", eRFPowerState);
 				break;
-		}
-			break;
-		default:
-			RT_TRACE(COMP_ERR, "Not support rf_chip(%x)\n", priv->rf_chip);
-			break;
-	}
-#ifdef TO_DO_LIST
-	if(bResult)
-	{
-		// Update current RF state variable.
-		pHalData->eRFPowerState = eRFPowerState;
-		switch(pHalData->RFChipID )
-		{
-			case RF_8256:
-		switch(pHalData->eRFPowerState)
-				{
-				case eRfOff:
-					//
-					//If Rf off reason is from IPS, Led should blink with no link, by Maddest 071015
-					//
-					if(pMgntInfo->RfOffReason==RF_CHANGE_BY_IPS )
-					{
-						Adapter->HalFunc.LedControlHandler(Adapter,LED_CTL_NO_LINK);
-					}
-					else
-					{
-						// Turn off LED if RF is not ON.
-						Adapter->HalFunc.LedControlHandler(Adapter, LED_CTL_POWER_OFF);
-					}
-					break;
-
-				case eRfOn:
-					// Turn on RF we are still linked, which might happen when
-					// we quickly turn off and on HW RF. 2006.05.12, by rcnjko.
-					if( pMgntInfo->bMediaConnect == TRUE )
-					{
-						Adapter->HalFunc.LedControlHandler(Adapter, LED_CTL_LINK);
-					}
-					else
-					{
-						// Turn off LED if RF is not ON.
-						Adapter->HalFunc.LedControlHandler(Adapter, LED_CTL_NO_LINK);
-					}
-					break;
-
-				default:
-					// do nothing.
-					break;
-				}// Switch RF state
-				break;
-
-				default:
-					RT_TRACE(COMP_RF, DBG_LOUD, ("SetRFPowerState8190(): Unknown RF type\n"));
-					break;
 			}
+			break;
+
+		default:
+			RT_TRACE(COMP_RF, DBG_LOUD, "%s(): Unknown RF type\n",
+				 __func__);
+			break;
+		}
 
 	}
 #endif
@@ -1181,40 +1226,32 @@
 	return bResult;
 }
 
-/****************************************************************************************
- *function:  This function set command table variable(struct SwChnlCmd).
- *   input:  SwChnlCmd*		CmdTable	//table to be set.
- *	     u32		CmdTableIdx	//variable index in table to be set
- *	     u32		CmdTableSz	//table size.
- *	     SwChnlCmdID	CmdID		//command ID to set.
- *	     u32		Para1
- *	     u32		Para2
- *	     u32		msDelay
- *  output:
- *  return:  true if finished, false otherwise
- *    Note:
- * ************************************************************************************/
-u8 rtl8192_phy_SetSwChnlCmdArray(
-	SwChnlCmd*		CmdTable,
-	u32			CmdTableIdx,
-	u32			CmdTableSz,
-	SwChnlCmdID		CmdID,
-	u32			Para1,
-	u32			Para2,
-	u32			msDelay
-	)
+/******************************************************************************
+ * function:  This function sets command table variable (struct SwChnlCmd).
+ * input:     SwChnlCmd      *CmdTable    //table to be set
+ *            u32            CmdTableIdx  //variable index in table to be set
+ *            u32            CmdTableSz   //table size
+ *            SwChnlCmdID    CmdID        //command ID to set
+ *            u32            Para1
+ *            u32            Para2
+ *            u32            msDelay
+ * output:
+ * return:    true if finished, false otherwise
+ * notice:
+ ******************************************************************************/
+u8 rtl8192_phy_SetSwChnlCmdArray(SwChnlCmd *CmdTable, u32 CmdTableIdx,
+				 u32 CmdTableSz, SwChnlCmdID CmdID, u32 Para1,
+				 u32 Para2, u32 msDelay)
 {
-	SwChnlCmd* pCmd;
+	SwChnlCmd *pCmd;
 
-	if(CmdTable == NULL)
-	{
-		RT_TRACE(COMP_ERR, "phy_SetSwChnlCmdArray(): CmdTable cannot be NULL.\n");
+	if (CmdTable == NULL) {
+		RT_TRACE(COMP_ERR, "%s(): CmdTable cannot be NULL\n", __func__);
 		return false;
 	}
-	if(CmdTableIdx >= CmdTableSz)
-	{
-		RT_TRACE(COMP_ERR, "phy_SetSwChnlCmdArray(): Access invalid index, please check size of the table, CmdTableIdx:%d, CmdTableSz:%d\n",
-				CmdTableIdx, CmdTableSz);
+	if (CmdTableIdx >= CmdTableSz) {
+		RT_TRACE(COMP_ERR, "%s(): Access invalid index, please check size of the table, CmdTableIdx:%d, CmdTableSz:%d\n",
+			 __func__, CmdTableIdx, CmdTableSz);
 		return false;
 	}
 
@@ -1226,455 +1263,442 @@
 
 	return true;
 }
+
 /******************************************************************************
- *function:  This function set channel step by step
- *   input:  struct net_device *dev
- *	     u8			channel
- *	     u8*		stage //3 stages
- *	     u8*		step  //
- *	     u32*		delay //whether need to delay
- *  output:  store new stage, step and delay for next step(combine with function above)
- *  return:  true if finished, false otherwise
- *    Note:  Wait for simpler function to replace it //wb
- * ***************************************************************************/
-u8 rtl8192_phy_SwChnlStepByStep(struct net_device *dev, u8 channel, u8* stage, u8* step, u32* delay)
+ * function:  This function sets channel step by step
+ * input:     net_device        *dev
+ *            u8                channel
+ *            u8                *stage   //3 stages
+ *            u8                *step
+ *            u32               *delay   //whether need to delay
+ * output:    store new stage, step and delay for next step
+ *            (combine with function above)
+ * return:    true if finished, false otherwise
+ * notice:    Wait for simpler function to replace it
+ *****************************************************************************/
+u8 rtl8192_phy_SwChnlStepByStep(struct net_device *dev, u8 channel, u8 *stage,
+				u8 *step, u32 *delay)
 {
 	struct r8192_priv *priv = ieee80211_priv(dev);
-//	PCHANNEL_ACCESS_SETTING	pChnlAccessSetting;
-	SwChnlCmd				PreCommonCmd[MAX_PRECMD_CNT];
-	u32					PreCommonCmdCnt;
-	SwChnlCmd				PostCommonCmd[MAX_POSTCMD_CNT];
-	u32					PostCommonCmdCnt;
-	SwChnlCmd				RfDependCmd[MAX_RFDEPENDCMD_CNT];
-	u32					RfDependCmdCnt;
-	SwChnlCmd				*CurrentCmd = NULL;
-	//RF90_RADIO_PATH_E		eRFPath;
+	SwChnlCmd	PreCommonCmd[MAX_PRECMD_CNT];
+	u32		PreCommonCmdCnt;
+	SwChnlCmd	PostCommonCmd[MAX_POSTCMD_CNT];
+	u32		PostCommonCmdCnt;
+	SwChnlCmd	RfDependCmd[MAX_RFDEPENDCMD_CNT];
+	u32		RfDependCmdCnt;
+	SwChnlCmd	*CurrentCmd = NULL;
 	u8		eRFPath;
-//	u32		RfRetVal;
-//	u8		RetryCnt;
 
-	RT_TRACE(COMP_CH, "====>%s()====stage:%d, step:%d, channel:%d\n", __FUNCTION__, *stage, *step, channel);
-//	RT_ASSERT(IsLegalChannel(Adapter, channel), ("illegal channel: %d\n", channel));
-	if (!IsLegalChannel(priv->ieee80211, channel))
-	{
-		RT_TRACE(COMP_ERR, "=============>set to illegal channel:%d\n", channel);
-		return true; //return true to tell upper caller function this channel setting is finished! Or it will in while loop.
+	RT_TRACE(COMP_CH, "%s() stage: %d, step: %d, channel: %d\n",
+		 __func__, *stage, *step, channel);
+	if (!IsLegalChannel(priv->ieee80211, channel)) {
+		RT_TRACE(COMP_ERR, "set to illegal channel: %d\n", channel);
+		/* return true to tell the caller that this channel setting
+		   is finished; otherwise it will stay in its while loop. */
+		return true;
 	}
-//FIXME:need to check whether channel is legal or not here.WB
+	/* FIXME: need to check whether channel is legal or not here */
 
 
-	//for(eRFPath = RF90_PATH_A; eRFPath <pHalData->NumTotalRFPath; eRFPath++)
-//	for(eRFPath = 0; eRFPath <RF90_PATH_MAX; eRFPath++)
-//	{
-//		if (!rtl8192_phy_CheckIsLegalRFPath(dev, eRFPath))
-//			continue;
-		// <1> Fill up pre common command.
-		PreCommonCmdCnt = 0;
-		rtl8192_phy_SetSwChnlCmdArray(PreCommonCmd, PreCommonCmdCnt++, MAX_PRECMD_CNT,
-					CmdID_SetTxPowerLevel, 0, 0, 0);
-		rtl8192_phy_SetSwChnlCmdArray(PreCommonCmd, PreCommonCmdCnt++, MAX_PRECMD_CNT,
-					CmdID_End, 0, 0, 0);
+	/* <1> Fill up pre common command. */
+	PreCommonCmdCnt = 0;
+	rtl8192_phy_SetSwChnlCmdArray(PreCommonCmd, PreCommonCmdCnt++,
+				      MAX_PRECMD_CNT, CmdID_SetTxPowerLevel,
+				      0, 0, 0);
+	rtl8192_phy_SetSwChnlCmdArray(PreCommonCmd, PreCommonCmdCnt++,
+				      MAX_PRECMD_CNT, CmdID_End, 0, 0, 0);
 
-		// <2> Fill up post common command.
-		PostCommonCmdCnt = 0;
+	/* <2> Fill up post common command. */
+	PostCommonCmdCnt = 0;
 
-		rtl8192_phy_SetSwChnlCmdArray(PostCommonCmd, PostCommonCmdCnt++, MAX_POSTCMD_CNT,
-					CmdID_End, 0, 0, 0);
+	rtl8192_phy_SetSwChnlCmdArray(PostCommonCmd, PostCommonCmdCnt++,
+				      MAX_POSTCMD_CNT, CmdID_End, 0, 0, 0);
 
-		// <3> Fill up RF dependent command.
-		RfDependCmdCnt = 0;
-		switch( priv->rf_chip )
-		{
-		case RF_8225:
-			if (!(channel >= 1 && channel <= 14))
-			{
-				RT_TRACE(COMP_ERR, "illegal channel for Zebra 8225: %d\n", channel);
-				return true;
-			}
-			rtl8192_phy_SetSwChnlCmdArray(RfDependCmd, RfDependCmdCnt++, MAX_RFDEPENDCMD_CNT,
-				CmdID_RF_WriteReg, rZebra1_Channel, RF_CHANNEL_TABLE_ZEBRA[channel], 10);
-			rtl8192_phy_SetSwChnlCmdArray(RfDependCmd, RfDependCmdCnt++, MAX_RFDEPENDCMD_CNT,
-				CmdID_End, 0, 0, 0);
-			break;
-
-		case RF_8256:
-			// TEST!! This is not the table for 8256!!
-			if (!(channel >= 1 && channel <= 14))
-			{
-				RT_TRACE(COMP_ERR, "illegal channel for Zebra 8256: %d\n", channel);
-				return true;
-			}
-			rtl8192_phy_SetSwChnlCmdArray(RfDependCmd, RfDependCmdCnt++, MAX_RFDEPENDCMD_CNT,
-				CmdID_RF_WriteReg, rZebra1_Channel, channel, 10);
-			rtl8192_phy_SetSwChnlCmdArray(RfDependCmd, RfDependCmdCnt++, MAX_RFDEPENDCMD_CNT,
-			CmdID_End, 0, 0, 0);
-			break;
-
-		case RF_8258:
-			break;
-
-		default:
-			RT_TRACE(COMP_ERR, "Unknown RFChipID: %d\n", priv->rf_chip);
+	/* <3> Fill up RF dependent command. */
+	RfDependCmdCnt = 0;
+	switch (priv->rf_chip) {
+	case RF_8225:
+		if (!(channel >= 1 && channel <= 14)) {
+			RT_TRACE(COMP_ERR,
+				 "illegal channel for Zebra 8225: %d\n",
+				 channel);
 			return true;
+		}
+		rtl8192_phy_SetSwChnlCmdArray(RfDependCmd, RfDependCmdCnt++,
+					      MAX_RFDEPENDCMD_CNT,
+					      CmdID_RF_WriteReg,
+					      rZebra1_Channel,
+					      RF_CHANNEL_TABLE_ZEBRA[channel],
+					      10);
+		rtl8192_phy_SetSwChnlCmdArray(RfDependCmd, RfDependCmdCnt++,
+					      MAX_RFDEPENDCMD_CNT,
+					      CmdID_End, 0, 0, 0);
+		break;
+
+	case RF_8256:
+		/* TEST!! This is not the table for 8256!! */
+		if (!(channel >= 1 && channel <= 14)) {
+			RT_TRACE(COMP_ERR,
+				 "illegal channel for Zebra 8256: %d\n",
+				 channel);
+			return true;
+		}
+		rtl8192_phy_SetSwChnlCmdArray(RfDependCmd, RfDependCmdCnt++,
+					      MAX_RFDEPENDCMD_CNT,
+					      CmdID_RF_WriteReg,
+					      rZebra1_Channel, channel, 10);
+		rtl8192_phy_SetSwChnlCmdArray(RfDependCmd, RfDependCmdCnt++,
+					      MAX_RFDEPENDCMD_CNT,
+					      CmdID_End, 0, 0, 0);
+		break;
+
+	case RF_8258:
+		break;
+
+	default:
+		RT_TRACE(COMP_ERR, "Unknown RFChipID: %d\n", priv->rf_chip);
+		return true;
+		break;
+	}
+
+
+	do {
+		switch (*stage) {
+		case 0:
+			CurrentCmd = &PreCommonCmd[*step];
+			break;
+		case 1:
+			CurrentCmd = &RfDependCmd[*step];
+			break;
+		case 2:
+			CurrentCmd = &PostCommonCmd[*step];
 			break;
 		}
 
-
-		do{
-			switch(*stage)
-			{
-			case 0:
-				CurrentCmd=&PreCommonCmd[*step];
-				break;
-			case 1:
-				CurrentCmd=&RfDependCmd[*step];
-				break;
-			case 2:
-				CurrentCmd=&PostCommonCmd[*step];
-				break;
+		if (CurrentCmd->CmdID == CmdID_End) {
+			if ((*stage) == 2) {
+				(*delay) = CurrentCmd->msDelay;
+				return true;
+			} else {
+				(*stage)++;
+				(*step) = 0;
+				continue;
 			}
+		}
 
-			if(CurrentCmd->CmdID==CmdID_End)
-			{
-				if((*stage)==2)
-				{
-					(*delay)=CurrentCmd->msDelay;
-					return true;
-				}
-				else
-				{
-					(*stage)++;
-					(*step)=0;
-					continue;
-				}
-			}
-
-			switch(CurrentCmd->CmdID)
-			{
-			case CmdID_SetTxPowerLevel:
-				if(priv->card_8192_version == (u8)VERSION_819xU_A) //xiong: consider it later!
-					rtl8192_SetTxPowerLevel(dev,channel);
-				break;
-			case CmdID_WritePortUlong:
-				write_nic_dword(dev, CurrentCmd->Para1, CurrentCmd->Para2);
-				break;
-			case CmdID_WritePortUshort:
-				write_nic_word(dev, CurrentCmd->Para1, (u16)CurrentCmd->Para2);
-				break;
-			case CmdID_WritePortUchar:
-				write_nic_byte(dev, CurrentCmd->Para1, (u8)CurrentCmd->Para2);
-				break;
-			case CmdID_RF_WriteReg:
-				for(eRFPath = 0; eRFPath < RF90_PATH_MAX; eRFPath++)
-				{
-				rtl8192_phy_SetRFReg(dev, (RF90_RADIO_PATH_E)eRFPath, CurrentCmd->Para1, bZebra1_ChannelNum, CurrentCmd->Para2);
-				}
-				break;
-			default:
-				break;
-			}
-
+		switch (CurrentCmd->CmdID) {
+		case CmdID_SetTxPowerLevel:
+			if (priv->card_8192_version == (u8)VERSION_819xU_A)
+				/* consider it later! */
+				rtl8192_SetTxPowerLevel(dev, channel);
 			break;
-		}while(true);
-//	}/*for(Number of RF paths)*/
+		case CmdID_WritePortUlong:
+			write_nic_dword(dev, CurrentCmd->Para1,
+					CurrentCmd->Para2);
+			break;
+		case CmdID_WritePortUshort:
+			write_nic_word(dev, CurrentCmd->Para1,
+				       (u16)CurrentCmd->Para2);
+			break;
+		case CmdID_WritePortUchar:
+			write_nic_byte(dev, CurrentCmd->Para1,
+				       (u8)CurrentCmd->Para2);
+			break;
+		case CmdID_RF_WriteReg:
+			for (eRFPath = 0; eRFPath < RF90_PATH_MAX; eRFPath++) {
+				rtl8192_phy_SetRFReg(dev,
+						     (RF90_RADIO_PATH_E)eRFPath,
+						     CurrentCmd->Para1,
+						     bZebra1_ChannelNum,
+						     CurrentCmd->Para2);
+			}
+			break;
+		default:
+			break;
+		}
 
-	(*delay)=CurrentCmd->msDelay;
+		break;
+	} while (true);
+
+	(*delay) = CurrentCmd->msDelay;
 	(*step)++;
 	return false;
 }
 
 /******************************************************************************
- *function:  This function does actually set channel work
- *   input:  struct net_device *dev
- *	     u8			channel
- *  output:  none
- *  return:  noin
- *    Note:  We should not call this function directly
- * ***************************************************************************/
+ * function:  This function does the actual channel switching work
+ * input:     net_device        *dev
+ *            u8                channel
+ * output:    none
+ * return:    none
+ * notice:    We should not call this function directly
+ *****************************************************************************/
 void rtl8192_phy_FinishSwChnlNow(struct net_device *dev, u8 channel)
 {
 	struct r8192_priv *priv = ieee80211_priv(dev);
 	u32	delay = 0;
 
-	while(!rtl8192_phy_SwChnlStepByStep(dev,channel,&priv->SwChnlStage,&priv->SwChnlStep,&delay))
-	{
-	//	if(delay>0)
-	//		msleep(delay);//or mdelay? need further consideration
-		if(!priv->up)
+	while (!rtl8192_phy_SwChnlStepByStep(dev, channel, &priv->SwChnlStage,
+					     &priv->SwChnlStep, &delay)) {
+		if (!priv->up)
 			break;
 	}
 }
+
 /******************************************************************************
- *function:  Callback routine of the work item for switch channel.
- *   input:
+ * function:  Callback routine of the work item for switch channel.
+ * input:     net_device	*dev
  *
- *  output:  none
- *  return:  noin
- * ***************************************************************************/
+ * output:    none
+ * return:    none
+ *****************************************************************************/
 void rtl8192_SwChnl_WorkItem(struct net_device *dev)
 {
 
 	struct r8192_priv *priv = ieee80211_priv(dev);
 
-	RT_TRACE(COMP_CH, "==> SwChnlCallback819xUsbWorkItem(), chan:%d\n", priv->chan);
+	RT_TRACE(COMP_CH, "==> SwChnlCallback819xUsbWorkItem(), chan:%d\n",
+		 priv->chan);
 
 
-	rtl8192_phy_FinishSwChnlNow(dev , priv->chan);
+	rtl8192_phy_FinishSwChnlNow(dev, priv->chan);
 
 	RT_TRACE(COMP_CH, "<== SwChnlCallback819xUsbWorkItem()\n");
 }
 
 /******************************************************************************
- *function:  This function scheduled actual work item to set channel
- *   input:  net_device dev
- *	     u8		channel //channel to set
- *  output:  none
- *  return:  return code show if workitem is scheduled(1:pass, 0:fail)
- *    Note:  Delay may be required for RF configuration
- * ***************************************************************************/
-u8 rtl8192_phy_SwChnl(struct net_device* dev, u8 channel)
+ * function:  This function schedules the actual work item to set channel
+ * input:     net_device        *dev
+ *            u8                channel   //channel to set
+ * output:    none
+ * return:    return code shows if workitem is scheduled (1:pass, 0:fail)
+ * notice:    Delay may be required for RF configuration
+ ******************************************************************************/
+u8 rtl8192_phy_SwChnl(struct net_device *dev, u8 channel)
 {
 	struct r8192_priv *priv = ieee80211_priv(dev);
-	RT_TRACE(COMP_CH, "=====>%s(), SwChnlInProgress:%d\n", __FUNCTION__, priv->SwChnlInProgress);
-	if(!priv->up)
+	RT_TRACE(COMP_CH, "%s(), SwChnlInProgress: %d\n", __func__,
+		 priv->SwChnlInProgress);
+	if (!priv->up)
 		return false;
-	if(priv->SwChnlInProgress)
+	if (priv->SwChnlInProgress)
 		return false;
 
-//	if(pHalData->SetBWModeInProgress)
-//		return;
-if (0) //to test current channel from RF reg 0x7.
-{
-	u8		eRFPath;
-	for(eRFPath = 0; eRFPath < 2; eRFPath++){
-	printk("====>set channel:%x\n",rtl8192_phy_QueryRFReg(dev, (RF90_RADIO_PATH_E)eRFPath, 0x7, bZebra1_ChannelNum));
-	udelay(10);
-	}
-}
-	//--------------------------------------------
-	switch(priv->ieee80211->mode)
-	{
+	/* -------------------------------------------- */
+	switch (priv->ieee80211->mode) {
 	case WIRELESS_MODE_A:
 	case WIRELESS_MODE_N_5G:
-		if (channel<=14){
-			RT_TRACE(COMP_ERR, "WIRELESS_MODE_A but channel<=14");
+		if (channel <= 14) {
+			RT_TRACE(COMP_ERR, "WIRELESS_MODE_A but channel<=14\n");
 			return false;
 		}
 		break;
 	case WIRELESS_MODE_B:
-		if (channel>14){
-			RT_TRACE(COMP_ERR, "WIRELESS_MODE_B but channel>14");
+		if (channel > 14) {
+			RT_TRACE(COMP_ERR, "WIRELESS_MODE_B but channel>14\n");
 			return false;
 		}
 		break;
 	case WIRELESS_MODE_G:
 	case WIRELESS_MODE_N_24G:
-		if (channel>14){
-			RT_TRACE(COMP_ERR, "WIRELESS_MODE_G but channel>14");
+		if (channel > 14) {
+			RT_TRACE(COMP_ERR, "WIRELESS_MODE_G but channel>14\n");
 			return false;
 		}
 		break;
 	}
-	//--------------------------------------------
+	/* -------------------------------------------- */
 
 	priv->SwChnlInProgress = true;
-	if(channel == 0)
+	if (channel == 0)
 		channel = 1;
 
-	priv->chan=channel;
+	priv->chan = channel;
 
-	priv->SwChnlStage=0;
-	priv->SwChnlStep=0;
-//	schedule_work(&(priv->SwChnlWorkItem));
-//	rtl8192_SwChnl_WorkItem(dev);
-	if(priv->up) {
-//		queue_work(priv->priv_wq,&(priv->SwChnlWorkItem));
-	rtl8192_SwChnl_WorkItem(dev);
-	}
+	priv->SwChnlStage = 0;
+	priv->SwChnlStep = 0;
+	if (priv->up)
+		rtl8192_SwChnl_WorkItem(dev);
 
 	priv->SwChnlInProgress = false;
 	return true;
 }
 
-
-//
 /******************************************************************************
- *function:  Callback routine of the work item for set bandwidth mode.
- *   input:  struct net_device *dev
- *	     HT_CHANNEL_WIDTH	Bandwidth  //20M or 40M
- *	     HT_EXTCHNL_OFFSET Offset	   //Upper, Lower, or Don't care
- *  output:  none
- *  return:  none
- *    Note:  I doubt whether SetBWModeInProgress flag is necessary as we can
- *	     test whether current work in the queue or not.//do I?
- * ***************************************************************************/
+ * function:  Callback routine of the work item for set bandwidth mode.
+ * input:     net_device	 *dev
+ * output:    none
+ * return:    none
+ * notice:    I doubt whether SetBWModeInProgress flag is necessary as we can
+ *            test whether current work in the queue or not.//do I?
+ *****************************************************************************/
 void rtl8192_SetBWModeWorkItem(struct net_device *dev)
 {
 
 	struct r8192_priv *priv = ieee80211_priv(dev);
 	u8 regBwOpMode;
 
-	RT_TRACE(COMP_SWBW, "==>rtl8192_SetBWModeWorkItem()  Switch to %s bandwidth\n", \
-					priv->CurrentChannelBW == HT_CHANNEL_WIDTH_20?"20MHz":"40MHz")
+	RT_TRACE(COMP_SWBW, "%s()  Switch to %s bandwidth\n", __func__,
+		 priv->CurrentChannelBW == HT_CHANNEL_WIDTH_20?"20MHz":"40MHz");
 
 
-	if(priv->rf_chip == RF_PSEUDO_11N)
-	{
-		priv->SetBWModeInProgress= false;
+	if (priv->rf_chip == RF_PSEUDO_11N) {
+		priv->SetBWModeInProgress = false;
 		return;
 	}
 
-	//<1>Set MAC register
-	regBwOpMode = read_nic_byte(dev, BW_OPMODE);
+	/* <1> Set MAC register */
+	read_nic_byte(dev, BW_OPMODE, &regBwOpMode);
 
-	switch(priv->CurrentChannelBW)
-	{
-		case HT_CHANNEL_WIDTH_20:
-			regBwOpMode |= BW_OPMODE_20MHZ;
-		       // 2007/02/07 Mark by Emily because we have not verify whether this register works
-			write_nic_byte(dev, BW_OPMODE, regBwOpMode);
-			break;
+	switch (priv->CurrentChannelBW) {
+	case HT_CHANNEL_WIDTH_20:
+		regBwOpMode |= BW_OPMODE_20MHZ;
+		/* We have not verified whether this register works */
+		write_nic_byte(dev, BW_OPMODE, regBwOpMode);
+		break;
 
-		case HT_CHANNEL_WIDTH_20_40:
-			regBwOpMode &= ~BW_OPMODE_20MHZ;
-			// 2007/02/07 Mark by Emily because we have not verify whether this register works
-			write_nic_byte(dev, BW_OPMODE, regBwOpMode);
-			break;
+	case HT_CHANNEL_WIDTH_20_40:
+		regBwOpMode &= ~BW_OPMODE_20MHZ;
+		/* We have not verified whether this register works */
+		write_nic_byte(dev, BW_OPMODE, regBwOpMode);
+		break;
 
-		default:
-			RT_TRACE(COMP_ERR, "SetChannelBandwidth819xUsb(): unknown Bandwidth: %#X\n",priv->CurrentChannelBW);
-			break;
+	default:
+		RT_TRACE(COMP_ERR,
+			 "SetChannelBandwidth819xUsb(): unknown Bandwidth: %#X\n",
+			 priv->CurrentChannelBW);
+		break;
 	}
 
-	//<2>Set PHY related register
-	switch(priv->CurrentChannelBW)
-	{
-		case HT_CHANNEL_WIDTH_20:
-			// Add by Vivi 20071119
-			rtl8192_setBBreg(dev, rFPGA0_RFMOD, bRFMOD, 0x0);
-			rtl8192_setBBreg(dev, rFPGA1_RFMOD, bRFMOD, 0x0);
-			rtl8192_setBBreg(dev, rFPGA0_AnalogParameter1, 0x00100000, 1);
+	/* <2> Set PHY related register */
+	switch (priv->CurrentChannelBW) {
+	case HT_CHANNEL_WIDTH_20:
+		rtl8192_setBBreg(dev, rFPGA0_RFMOD, bRFMOD, 0x0);
+		rtl8192_setBBreg(dev, rFPGA1_RFMOD, bRFMOD, 0x0);
+		rtl8192_setBBreg(dev, rFPGA0_AnalogParameter1,
+				 0x00100000, 1);
 
-			// Correct the tx power for CCK rate in 20M. Suggest by YN, 20071207
-			priv->cck_present_attentuation =
-				priv->cck_present_attentuation_20Mdefault + priv->cck_present_attentuation_difference;
+		/* Correct the tx power for CCK rate in 20M. */
+		priv->cck_present_attentuation =
+			priv->cck_present_attentuation_20Mdefault +
+			priv->cck_present_attentuation_difference;
 
-			if(priv->cck_present_attentuation > 22)
-				priv->cck_present_attentuation= 22;
-			if(priv->cck_present_attentuation< 0)
-				priv->cck_present_attentuation = 0;
-			RT_TRACE(COMP_INIT, "20M, pHalData->CCKPresentAttentuation = %d\n", priv->cck_present_attentuation);
+		if (priv->cck_present_attentuation > 22)
+			priv->cck_present_attentuation = 22;
+		if (priv->cck_present_attentuation < 0)
+			priv->cck_present_attentuation = 0;
+		RT_TRACE(COMP_INIT,
+			 "20M, pHalData->CCKPresentAttentuation = %d\n",
+			 priv->cck_present_attentuation);
 
-			if(priv->chan == 14 && !priv->bcck_in_ch14)
-			{
-				priv->bcck_in_ch14 = TRUE;
-				dm_cck_txpower_adjust(dev,priv->bcck_in_ch14);
-			}
-			else if(priv->chan != 14 && priv->bcck_in_ch14)
-			{
-				priv->bcck_in_ch14 = FALSE;
-				dm_cck_txpower_adjust(dev,priv->bcck_in_ch14);
-			}
-			else
-				dm_cck_txpower_adjust(dev,priv->bcck_in_ch14);
+		if (priv->chan == 14 && !priv->bcck_in_ch14) {
+			priv->bcck_in_ch14 = TRUE;
+			dm_cck_txpower_adjust(dev, priv->bcck_in_ch14);
+		} else if (priv->chan != 14 && priv->bcck_in_ch14) {
+			priv->bcck_in_ch14 = FALSE;
+			dm_cck_txpower_adjust(dev, priv->bcck_in_ch14);
+		} else {
+			dm_cck_txpower_adjust(dev, priv->bcck_in_ch14);
+		}
 
-			break;
-		case HT_CHANNEL_WIDTH_20_40:
-			// Add by Vivi 20071119
-			rtl8192_setBBreg(dev, rFPGA0_RFMOD, bRFMOD, 0x1);
-			rtl8192_setBBreg(dev, rFPGA1_RFMOD, bRFMOD, 0x1);
-			rtl8192_setBBreg(dev, rCCK0_System, bCCKSideBand, (priv->nCur40MhzPrimeSC>>1));
-			rtl8192_setBBreg(dev, rFPGA0_AnalogParameter1, 0x00100000, 0);
-			rtl8192_setBBreg(dev, rOFDM1_LSTF, 0xC00, priv->nCur40MhzPrimeSC);
-			priv->cck_present_attentuation =
-				priv->cck_present_attentuation_40Mdefault + priv->cck_present_attentuation_difference;
+		break;
+	case HT_CHANNEL_WIDTH_20_40:
+		rtl8192_setBBreg(dev, rFPGA0_RFMOD, bRFMOD, 0x1);
+		rtl8192_setBBreg(dev, rFPGA1_RFMOD, bRFMOD, 0x1);
+		rtl8192_setBBreg(dev, rCCK0_System, bCCKSideBand,
+				 priv->nCur40MhzPrimeSC>>1);
+		rtl8192_setBBreg(dev, rFPGA0_AnalogParameter1, 0x00100000, 0);
+		rtl8192_setBBreg(dev, rOFDM1_LSTF, 0xC00,
+				 priv->nCur40MhzPrimeSC);
+		priv->cck_present_attentuation =
+			priv->cck_present_attentuation_40Mdefault +
+			priv->cck_present_attentuation_difference;
 
-			if(priv->cck_present_attentuation > 22)
-				priv->cck_present_attentuation = 22;
-			if(priv->cck_present_attentuation < 0)
-				priv->cck_present_attentuation = 0;
+		if (priv->cck_present_attentuation > 22)
+			priv->cck_present_attentuation = 22;
+		if (priv->cck_present_attentuation < 0)
+			priv->cck_present_attentuation = 0;
 
-			RT_TRACE(COMP_INIT, "40M, pHalData->CCKPresentAttentuation = %d\n", priv->cck_present_attentuation);
-			if(priv->chan == 14 && !priv->bcck_in_ch14)
-			{
-				priv->bcck_in_ch14 = true;
-				dm_cck_txpower_adjust(dev,priv->bcck_in_ch14);
-			}
-			else if(priv->chan!= 14 && priv->bcck_in_ch14)
-			{
-				priv->bcck_in_ch14 = false;
-				dm_cck_txpower_adjust(dev,priv->bcck_in_ch14);
-			}
-			else
-				dm_cck_txpower_adjust(dev,priv->bcck_in_ch14);
+		RT_TRACE(COMP_INIT,
+			 "40M, pHalData->CCKPresentAttentuation = %d\n",
+			 priv->cck_present_attentuation);
+		if (priv->chan == 14 && !priv->bcck_in_ch14) {
+			priv->bcck_in_ch14 = true;
+			dm_cck_txpower_adjust(dev, priv->bcck_in_ch14);
+		} else if (priv->chan != 14 && priv->bcck_in_ch14) {
+			priv->bcck_in_ch14 = false;
+			dm_cck_txpower_adjust(dev, priv->bcck_in_ch14);
+		} else {
+			dm_cck_txpower_adjust(dev, priv->bcck_in_ch14);
+		}
 
-			break;
-		default:
-			RT_TRACE(COMP_ERR, "SetChannelBandwidth819xUsb(): unknown Bandwidth: %#X\n" ,priv->CurrentChannelBW);
-			break;
+		break;
+	default:
+		RT_TRACE(COMP_ERR,
+			 "SetChannelBandwidth819xUsb(): unknown Bandwidth: %#X\n",
+			 priv->CurrentChannelBW);
+		break;
 
 	}
-	//Skip over setting of J-mode in BB register here. Default value is "None J mode". Emily 20070315
+	/* Skip over setting of J-mode in BB register here.
+	   Default value is "None J mode". */
 
-	//<3>Set RF related register
-	switch( priv->rf_chip )
-	{
-		case RF_8225:
+	/* <3> Set RF related register */
+	switch (priv->rf_chip) {
+	case RF_8225:
 #ifdef TO_DO_LIST
-			PHY_SetRF8225Bandwidth(Adapter, pHalData->CurrentChannelBW);
+		PHY_SetRF8225Bandwidth(Adapter, pHalData->CurrentChannelBW);
 #endif
-			break;
+		break;
 
-		case RF_8256:
-			PHY_SetRF8256Bandwidth(dev, priv->CurrentChannelBW);
-			break;
+	case RF_8256:
+		PHY_SetRF8256Bandwidth(dev, priv->CurrentChannelBW);
+		break;
 
-		case RF_8258:
-			// PHY_SetRF8258Bandwidth();
-			break;
+	case RF_8258:
+		break;
 
-		case RF_PSEUDO_11N:
-			// Do Nothing
-			break;
+	case RF_PSEUDO_11N:
+		break;
 
-		default:
-			RT_TRACE(COMP_ERR, "Unknown RFChipID: %d\n", priv->rf_chip);
-			break;
+	default:
+		RT_TRACE(COMP_ERR, "Unknown RFChipID: %d\n", priv->rf_chip);
+		break;
 	}
-	priv->SetBWModeInProgress= false;
+	priv->SetBWModeInProgress = false;
 
-	RT_TRACE(COMP_SWBW, "<==SetBWMode819xUsb(), %d", atomic_read(&(priv->ieee80211->atm_swbw)) );
+	RT_TRACE(COMP_SWBW, "<==SetBWMode819xUsb(), %d\n",
+		 atomic_read(&priv->ieee80211->atm_swbw));
 }
 
 /******************************************************************************
- *function:  This function schedules bandwidth switch work.
- *   input:  struct net_device *dev
- *	     HT_CHANNEL_WIDTH	Bandwidth  //20M or 40M
- *	     HT_EXTCHNL_OFFSET Offset	   //Upper, Lower, or Don't care
- *  output:  none
- *  return:  none
- *    Note:  I doubt whether SetBWModeInProgress flag is necessary as we can
- *	     test whether current work in the queue or not.//do I?
- * ***************************************************************************/
-void rtl8192_SetBWMode(struct net_device *dev, HT_CHANNEL_WIDTH	Bandwidth, HT_EXTCHNL_OFFSET Offset)
+ * function:  This function schedules bandwidth switch work.
+ * input:     struct net_device    *dev
+ *            HT_CHANNEL_WIDTH     bandwidth  //20M or 40M
+ *            HT_EXTCHNL_OFFSET    offset     //Upper, Lower, or Don't care
+ * output:    none
+ * return:    none
+ * notice:    I doubt whether SetBWModeInProgress flag is necessary as we can
+ *	      test whether current work in the queue or not.//do I?
+ *****************************************************************************/
+void rtl8192_SetBWMode(struct net_device *dev, HT_CHANNEL_WIDTH bandwidth,
+		       HT_EXTCHNL_OFFSET offset)
 {
 	struct r8192_priv *priv = ieee80211_priv(dev);
 
-	if(priv->SetBWModeInProgress)
+	if (priv->SetBWModeInProgress)
 		return;
-	priv->SetBWModeInProgress= true;
+	priv->SetBWModeInProgress = true;
 
-	priv->CurrentChannelBW = Bandwidth;
+	priv->CurrentChannelBW = bandwidth;
 
-	if(Offset==HT_EXTCHNL_OFFSET_LOWER)
+	if (offset == HT_EXTCHNL_OFFSET_LOWER)
 		priv->nCur40MhzPrimeSC = HAL_PRIME_CHNL_OFFSET_UPPER;
-	else if(Offset==HT_EXTCHNL_OFFSET_UPPER)
+	else if (offset == HT_EXTCHNL_OFFSET_UPPER)
 		priv->nCur40MhzPrimeSC = HAL_PRIME_CHNL_OFFSET_LOWER;
 	else
 		priv->nCur40MhzPrimeSC = HAL_PRIME_CHNL_OFFSET_DONT_CARE;
 
-	//queue_work(priv->priv_wq, &(priv->SetBWModeWorkItem));
-	//	schedule_work(&(priv->SetBWModeWorkItem));
 	rtl8192_SetBWModeWorkItem(dev);
 
 }
@@ -1685,88 +1709,110 @@
 
 	priv->InitialGainOperateType = Operation;
 
-	if(priv->up)
-	{
-		queue_delayed_work(priv->priv_wq,&priv->initialgain_operate_wq,0);
-	}
+	if (priv->up)
+		queue_delayed_work(priv->priv_wq, &priv->initialgain_operate_wq, 0);
 }
 
 extern void InitialGainOperateWorkItemCallBack(struct work_struct *work)
 {
-	struct delayed_work *dwork = container_of(work,struct delayed_work,work);
-       struct r8192_priv *priv = container_of(dwork,struct r8192_priv,initialgain_operate_wq);
-       struct net_device *dev = priv->ieee80211->dev;
+	struct delayed_work *dwork = container_of(work, struct delayed_work,
+						  work);
+	struct r8192_priv *priv = container_of(dwork, struct r8192_priv,
+					       initialgain_operate_wq);
+	struct net_device *dev = priv->ieee80211->dev;
 #define SCAN_RX_INITIAL_GAIN	0x17
 #define POWER_DETECTION_TH	0x08
-	u32	BitMask;
+	u32	bitmask;
 	u8	initial_gain;
 	u8	Operation;
 
 	Operation = priv->InitialGainOperateType;
 
-	switch(Operation)
-	{
-		case IG_Backup:
-			RT_TRACE(COMP_SCAN, "IG_Backup, backup the initial gain.\n");
-			initial_gain = SCAN_RX_INITIAL_GAIN;//priv->DefaultInitialGain[0];//
-			BitMask = bMaskByte0;
-			if(dm_digtable.dig_algorithm == DIG_ALGO_BY_FALSE_ALARM)
-				rtl8192_setBBreg(dev, UFWP, bMaskByte1, 0x8);	// FW DIG OFF
-			priv->initgain_backup.xaagccore1 = (u8)rtl8192_QueryBBReg(dev, rOFDM0_XAAGCCore1, BitMask);
-			priv->initgain_backup.xbagccore1 = (u8)rtl8192_QueryBBReg(dev, rOFDM0_XBAGCCore1, BitMask);
-			priv->initgain_backup.xcagccore1 = (u8)rtl8192_QueryBBReg(dev, rOFDM0_XCAGCCore1, BitMask);
-			priv->initgain_backup.xdagccore1 = (u8)rtl8192_QueryBBReg(dev, rOFDM0_XDAGCCore1, BitMask);
-			BitMask  = bMaskByte2;
-			priv->initgain_backup.cca		= (u8)rtl8192_QueryBBReg(dev, rCCK0_CCA, BitMask);
+	switch (Operation) {
+	case IG_Backup:
+		RT_TRACE(COMP_SCAN, "IG_Backup, backup the initial gain.\n");
+		initial_gain = SCAN_RX_INITIAL_GAIN;
+		bitmask = bMaskByte0;
+		if (dm_digtable.dig_algorithm == DIG_ALGO_BY_FALSE_ALARM)
+			/* FW DIG OFF */
+			rtl8192_setBBreg(dev, UFWP, bMaskByte1, 0x8);
+		priv->initgain_backup.xaagccore1 =
+			(u8)rtl8192_QueryBBReg(dev, rOFDM0_XAAGCCore1, bitmask);
+		priv->initgain_backup.xbagccore1 =
+			(u8)rtl8192_QueryBBReg(dev, rOFDM0_XBAGCCore1, bitmask);
+		priv->initgain_backup.xcagccore1 =
+			(u8)rtl8192_QueryBBReg(dev, rOFDM0_XCAGCCore1, bitmask);
+		priv->initgain_backup.xdagccore1 =
+			(u8)rtl8192_QueryBBReg(dev, rOFDM0_XDAGCCore1, bitmask);
+		bitmask = bMaskByte2;
+		priv->initgain_backup.cca =
+			(u8)rtl8192_QueryBBReg(dev, rCCK0_CCA, bitmask);
 
-			RT_TRACE(COMP_SCAN, "Scan InitialGainBackup 0xc50 is %x\n",priv->initgain_backup.xaagccore1);
-			RT_TRACE(COMP_SCAN, "Scan InitialGainBackup 0xc58 is %x\n",priv->initgain_backup.xbagccore1);
-			RT_TRACE(COMP_SCAN, "Scan InitialGainBackup 0xc60 is %x\n",priv->initgain_backup.xcagccore1);
-			RT_TRACE(COMP_SCAN, "Scan InitialGainBackup 0xc68 is %x\n",priv->initgain_backup.xdagccore1);
-			RT_TRACE(COMP_SCAN, "Scan InitialGainBackup 0xa0a is %x\n",priv->initgain_backup.cca);
+		RT_TRACE(COMP_SCAN, "Scan InitialGainBackup 0xc50 is %x\n",
+			 priv->initgain_backup.xaagccore1);
+		RT_TRACE(COMP_SCAN, "Scan InitialGainBackup 0xc58 is %x\n",
+			 priv->initgain_backup.xbagccore1);
+		RT_TRACE(COMP_SCAN, "Scan InitialGainBackup 0xc60 is %x\n",
+			 priv->initgain_backup.xcagccore1);
+		RT_TRACE(COMP_SCAN, "Scan InitialGainBackup 0xc68 is %x\n",
+			 priv->initgain_backup.xdagccore1);
+		RT_TRACE(COMP_SCAN, "Scan InitialGainBackup 0xa0a is %x\n",
+			 priv->initgain_backup.cca);
 
-			RT_TRACE(COMP_SCAN, "Write scan initial gain = 0x%x \n", initial_gain);
-			write_nic_byte(dev, rOFDM0_XAAGCCore1, initial_gain);
-			write_nic_byte(dev, rOFDM0_XBAGCCore1, initial_gain);
-			write_nic_byte(dev, rOFDM0_XCAGCCore1, initial_gain);
-			write_nic_byte(dev, rOFDM0_XDAGCCore1, initial_gain);
-			RT_TRACE(COMP_SCAN, "Write scan 0xa0a = 0x%x \n", POWER_DETECTION_TH);
-			write_nic_byte(dev, 0xa0a, POWER_DETECTION_TH);
-			break;
-		case IG_Restore:
-			RT_TRACE(COMP_SCAN, "IG_Restore, restore the initial gain.\n");
-			BitMask = 0x7f; //Bit0~ Bit6
-			if(dm_digtable.dig_algorithm == DIG_ALGO_BY_FALSE_ALARM)
-				rtl8192_setBBreg(dev, UFWP, bMaskByte1, 0x8);	// FW DIG OFF
+		RT_TRACE(COMP_SCAN, "Write scan initial gain = 0x%x \n",
+			 initial_gain);
+		write_nic_byte(dev, rOFDM0_XAAGCCore1, initial_gain);
+		write_nic_byte(dev, rOFDM0_XBAGCCore1, initial_gain);
+		write_nic_byte(dev, rOFDM0_XCAGCCore1, initial_gain);
+		write_nic_byte(dev, rOFDM0_XDAGCCore1, initial_gain);
+		RT_TRACE(COMP_SCAN, "Write scan 0xa0a = 0x%x \n",
+			 POWER_DETECTION_TH);
+		write_nic_byte(dev, 0xa0a, POWER_DETECTION_TH);
+		break;
+	case IG_Restore:
+		RT_TRACE(COMP_SCAN, "IG_Restore, restore the initial gain.\n");
+		bitmask = 0x7f; /* Bit0 ~ Bit6 */
+		if (dm_digtable.dig_algorithm == DIG_ALGO_BY_FALSE_ALARM)
+			/* FW DIG OFF */
+			rtl8192_setBBreg(dev, UFWP, bMaskByte1, 0x8);
 
-			rtl8192_setBBreg(dev, rOFDM0_XAAGCCore1, BitMask, (u32)priv->initgain_backup.xaagccore1);
-			rtl8192_setBBreg(dev, rOFDM0_XBAGCCore1, BitMask, (u32)priv->initgain_backup.xbagccore1);
-			rtl8192_setBBreg(dev, rOFDM0_XCAGCCore1, BitMask, (u32)priv->initgain_backup.xcagccore1);
-			rtl8192_setBBreg(dev, rOFDM0_XDAGCCore1, BitMask, (u32)priv->initgain_backup.xdagccore1);
-			BitMask  = bMaskByte2;
-			rtl8192_setBBreg(dev, rCCK0_CCA, BitMask, (u32)priv->initgain_backup.cca);
+		rtl8192_setBBreg(dev, rOFDM0_XAAGCCore1, bitmask,
+				 (u32)priv->initgain_backup.xaagccore1);
+		rtl8192_setBBreg(dev, rOFDM0_XBAGCCore1, bitmask,
+				 (u32)priv->initgain_backup.xbagccore1);
+		rtl8192_setBBreg(dev, rOFDM0_XCAGCCore1, bitmask,
+				 (u32)priv->initgain_backup.xcagccore1);
+		rtl8192_setBBreg(dev, rOFDM0_XDAGCCore1, bitmask,
+				 (u32)priv->initgain_backup.xdagccore1);
+		bitmask  = bMaskByte2;
+		rtl8192_setBBreg(dev, rCCK0_CCA, bitmask,
+				 (u32)priv->initgain_backup.cca);
 
-			RT_TRACE(COMP_SCAN, "Scan BBInitialGainRestore 0xc50 is %x\n",priv->initgain_backup.xaagccore1);
-			RT_TRACE(COMP_SCAN, "Scan BBInitialGainRestore 0xc58 is %x\n",priv->initgain_backup.xbagccore1);
-			RT_TRACE(COMP_SCAN, "Scan BBInitialGainRestore 0xc60 is %x\n",priv->initgain_backup.xcagccore1);
-			RT_TRACE(COMP_SCAN, "Scan BBInitialGainRestore 0xc68 is %x\n",priv->initgain_backup.xdagccore1);
-			RT_TRACE(COMP_SCAN, "Scan BBInitialGainRestore 0xa0a is %x\n",priv->initgain_backup.cca);
+		RT_TRACE(COMP_SCAN, "Scan BBInitialGainRestore 0xc50 is %x\n",
+			 priv->initgain_backup.xaagccore1);
+		RT_TRACE(COMP_SCAN, "Scan BBInitialGainRestore 0xc58 is %x\n",
+			 priv->initgain_backup.xbagccore1);
+		RT_TRACE(COMP_SCAN, "Scan BBInitialGainRestore 0xc60 is %x\n",
+			 priv->initgain_backup.xcagccore1);
+		RT_TRACE(COMP_SCAN, "Scan BBInitialGainRestore 0xc68 is %x\n",
+			 priv->initgain_backup.xdagccore1);
+		RT_TRACE(COMP_SCAN, "Scan BBInitialGainRestore 0xa0a is %x\n",
+			 priv->initgain_backup.cca);
 
 #ifdef RTL8190P
-			SetTxPowerLevel8190(Adapter,priv->CurrentChannel);
+		SetTxPowerLevel8190(Adapter, priv->CurrentChannel);
 #endif
 #ifdef RTL8192E
-			SetTxPowerLevel8190(Adapter,priv->CurrentChannel);
+		SetTxPowerLevel8190(Adapter, priv->CurrentChannel);
 #endif
-//#ifdef RTL8192U
-			rtl8192_phy_setTxPower(dev,priv->ieee80211->current_network.channel);
-//#endif
+		rtl8192_phy_setTxPower(dev, priv->ieee80211->current_network.channel);
 
-			if(dm_digtable.dig_algorithm == DIG_ALGO_BY_FALSE_ALARM)
-				rtl8192_setBBreg(dev, UFWP, bMaskByte1, 0x1);	// FW DIG ON
-			break;
-		default:
-			RT_TRACE(COMP_SCAN, "Unknown IG Operation. \n");
-			break;
+		if (dm_digtable.dig_algorithm == DIG_ALGO_BY_FALSE_ALARM)
+			/* FW DIG ON */
+			rtl8192_setBBreg(dev, UFWP, bMaskByte1, 0x1);
+		break;
+	default:
+		RT_TRACE(COMP_SCAN, "Unknown IG Operation. \n");
+		break;
 	}
 }

diff --git a/drivers/staging/rtl8192u/r819xU_phy.h b/drivers/staging/rtl8192u/r819xU_phy.h
index 3e3bc57..f3c352a 100644
--- a/drivers/staging/rtl8192u/r819xU_phy.h
+++ b/drivers/staging/rtl8192u/r819xU_phy.h

@@ -1,12 +1,12 @@
 #ifndef _R819XU_PHY_H
 #define _R819XU_PHY_H
 
-/* Channel switch:The size of command tables for switch channel*/
+/* Channel switch: The size of command tables for switch channel */
 #define MAX_PRECMD_CNT 16
 #define MAX_RFDEPENDCMD_CNT 16
 #define MAX_POSTCMD_CNT 16
 
-typedef enum _SwChnlCmdID{
+typedef enum _SwChnlCmdID {
 	CmdID_End,
 	CmdID_SetTxPowerLevel,
 	CmdID_BBRegWrite10,
@@ -14,16 +14,16 @@
 	CmdID_WritePortUshort,
 	CmdID_WritePortUchar,
 	CmdID_RF_WriteReg,
-}SwChnlCmdID;
+} SwChnlCmdID;
 
-/*--------------------------------Define structure--------------------------------*/
+/* -----------------------Define structure---------------------- */
 /* 1. Switch channel related */
-typedef struct _SwChnlCmd{
+typedef struct _SwChnlCmd {
 	SwChnlCmdID	CmdID;
-	u32			Para1;
-	u32			Para2;
-	u32			msDelay;
-}__attribute__ ((packed)) SwChnlCmd;
+	u32		Para1;
+	u32		Para2;
+	u32		msDelay;
+} __attribute__ ((packed)) SwChnlCmd;
 
 extern u32 rtl819XMACPHY_Array_PG[];
 extern u32 rtl819XPHY_REG_1T2RArray[];
@@ -33,21 +33,21 @@
 extern u32 rtl819XRadioC_Array[];
 extern u32 rtl819XRadioD_Array[];
 
-typedef enum _HW90_BLOCK{
+typedef enum _HW90_BLOCK {
 	HW90_BLOCK_MAC = 0,
 	HW90_BLOCK_PHY0 = 1,
 	HW90_BLOCK_PHY1 = 2,
 	HW90_BLOCK_RF = 3,
-	HW90_BLOCK_MAXIMUM = 4, // Never use this
-}HW90_BLOCK_E, *PHW90_BLOCK_E;
+	HW90_BLOCK_MAXIMUM = 4, /* Never use this */
+} HW90_BLOCK_E, *PHW90_BLOCK_E;
 
-typedef enum _RF90_RADIO_PATH{
-	RF90_PATH_A = 0,			//Radio Path A
-	RF90_PATH_B = 1,			//Radio Path B
-	RF90_PATH_C = 2,			//Radio Path C
-	RF90_PATH_D = 3,			//Radio Path D
-	RF90_PATH_MAX				//Max RF number 92 support
-}RF90_RADIO_PATH_E, *PRF90_RADIO_PATH_E;
+typedef enum _RF90_RADIO_PATH {
+	RF90_PATH_A = 0,			/* Radio Path A */
+	RF90_PATH_B = 1,			/* Radio Path B */
+	RF90_PATH_C = 2,			/* Radio Path C */
+	RF90_PATH_D = 3,			/* Radio Path D */
+	RF90_PATH_MAX				/* Max RF number 92 support */
+} RF90_RADIO_PATH_E, *PRF90_RADIO_PATH_E;
 
 #define bMaskByte0                0xff
 #define bMaskByte1                0xff00
@@ -57,33 +57,35 @@
 #define bMaskLWord                0x0000ffff
 #define bMaskDWord                0xffffffff
 
-//extern u32 rtl8192_CalculateBitShift(u32 dwBitMask);
-extern u8 rtl8192_phy_CheckIsLegalRFPath(struct net_device* dev, u32 eRFPath);
-extern void rtl8192_setBBreg(struct net_device* dev, u32 dwRegAddr, u32 dwBitMask, u32 dwData);
-extern u32 rtl8192_QueryBBReg(struct net_device* dev, u32 dwRegAddr, u32 dwBitMask);
-//extern u32 rtl8192_phy_RFSerialRead(struct net_device* dev, RF90_RADIO_PATH_E eRFPath, u32 Offset);
-//extern void rtl8192_phy_RFSerialWrite(struct net_device* dev, RF90_RADIO_PATH_E eRFPath, u32 Offset, u32 Data);
-extern void rtl8192_phy_SetRFReg(struct net_device* dev, RF90_RADIO_PATH_E eRFPath, u32 RegAddr, u32 BitMask, u32 Data);
-extern u32 rtl8192_phy_QueryRFReg(struct net_device* dev, RF90_RADIO_PATH_E eRFPath, u32 RegAddr, u32 BitMask);
-extern void rtl8192_phy_configmac(struct net_device* dev);
-extern void rtl8192_phyConfigBB(struct net_device* dev, u8 ConfigType);
-//extern void rtl8192_InitBBRFRegDef(struct net_device* dev);
-extern u8 rtl8192_phy_checkBBAndRF(struct net_device* dev, HW90_BLOCK_E CheckBlock, RF90_RADIO_PATH_E eRFPath);
-//extern void rtl8192_BB_Config_ParaFile(struct net_device* dev);
-extern void rtl8192_BBConfig(struct net_device* dev);
-extern void rtl8192_phy_getTxPower(struct net_device* dev);
-extern void rtl8192_phy_setTxPower(struct net_device* dev, u8 channel);
-extern void rtl8192_phy_RFConfig(struct net_device* dev);
-extern void rtl8192_phy_updateInitGain(struct net_device* dev);
-extern u8 rtl8192_phy_ConfigRFWithHeaderFile(struct net_device* dev, RF90_RADIO_PATH_E	eRFPath);
+extern u8 rtl8192_phy_CheckIsLegalRFPath(struct net_device *dev, u32 eRFPath);
+extern void rtl8192_setBBreg(struct net_device *dev, u32 reg_addr,
+	u32 bitmask, u32 data);
+extern u32 rtl8192_QueryBBReg(struct net_device *dev, u32 reg_addr,
+	u32 bitmask);
+extern void rtl8192_phy_SetRFReg(struct net_device *dev,
+	RF90_RADIO_PATH_E eRFPath, u32 reg_addr, u32 bitmask, u32 data);
+extern u32 rtl8192_phy_QueryRFReg(struct net_device *dev,
+	RF90_RADIO_PATH_E eRFPath, u32 reg_addr, u32 bitmask);
+extern void rtl8192_phy_configmac(struct net_device *dev);
+extern void rtl8192_phyConfigBB(struct net_device *dev, u8 ConfigType);
+extern u8 rtl8192_phy_checkBBAndRF(struct net_device *dev,
+	HW90_BLOCK_E CheckBlock, RF90_RADIO_PATH_E eRFPath);
+extern void rtl8192_BBConfig(struct net_device *dev);
+extern void rtl8192_phy_getTxPower(struct net_device *dev);
+extern void rtl8192_phy_setTxPower(struct net_device *dev, u8 channel);
+extern void rtl8192_phy_RFConfig(struct net_device *dev);
+extern void rtl8192_phy_updateInitGain(struct net_device *dev);
+extern u8 rtl8192_phy_ConfigRFWithHeaderFile(struct net_device *dev,
+	RF90_RADIO_PATH_E eRFPath);
 
-extern u8 rtl8192_phy_SwChnl(struct net_device* dev, u8 channel);
-extern void rtl8192_SetBWMode(struct net_device *dev, HT_CHANNEL_WIDTH	Bandwidth, HT_EXTCHNL_OFFSET Offset);
+extern u8 rtl8192_phy_SwChnl(struct net_device *dev, u8 channel);
+extern void rtl8192_SetBWMode(struct net_device *dev,
+	HT_CHANNEL_WIDTH bandwidth, HT_EXTCHNL_OFFSET offset);
 extern void rtl8192_SwChnl_WorkItem(struct net_device *dev);
 void rtl8192_SetBWModeWorkItem(struct net_device *dev);
-extern bool rtl8192_SetRFPowerState(struct net_device *dev, RT_RF_POWER_STATE eRFPowerState);
-//added by amy
-extern void InitialGain819xUsb(struct net_device *dev,	u8 Operation);
+extern bool rtl8192_SetRFPowerState(struct net_device *dev,
+	RT_RF_POWER_STATE eRFPowerState);
+extern void InitialGain819xUsb(struct net_device *dev, u8 Operation);
 
 extern void InitialGainOperateWorkItemCallBack(struct work_struct *work);
 

diff --git a/drivers/staging/rts5139/rts51x_transport.c b/drivers/staging/rts5139/rts51x_transport.c
index 89e4d80..c172f4ae 100644
--- a/drivers/staging/rts5139/rts51x_transport.c
+++ b/drivers/staging/rts5139/rts51x_transport.c

@@ -635,12 +635,12 @@
 	ep = chip->usb->pusb_dev->ep_in[usb_pipeendpoint(pipe)];
 
 	/* fill and submit the URB */
-	/* We set interval to 1 here, so the polling interval is controlled
-	 * by our polling thread */
+	/* Set interval to 10 here to match the endpoint descriptor,
+	 * the polling interval is controlled by the polling thread */
 	usb_fill_int_urb(chip->usb->intr_urb, chip->usb->pusb_dev, pipe,
-			 status, 2, urb_done_completion, &urb_done, 1);
+			 status, 2, urb_done_completion, &urb_done, 10);
 
-	result = rts51x_msg_common(chip, chip->usb->intr_urb, 50);
+	result = rts51x_msg_common(chip, chip->usb->intr_urb, 100);
 
 	return interpret_urb_result(chip, pipe, 2, result,
 				    chip->usb->intr_urb->actual_length);

diff --git a/drivers/staging/sb105x/sb_pci_mp.c b/drivers/staging/sb105x/sb_pci_mp.c
index cd94f6c..23db32f 100644
--- a/drivers/staging/sb105x/sb_pci_mp.c
+++ b/drivers/staging/sb105x/sb_pci_mp.c

@@ -18,11 +18,11 @@
 static _INLINE_ unsigned int serial_in(struct mp_port *mtpt, int offset);
 static _INLINE_ void serial_out(struct mp_port *mtpt, int offset, int value);
 static _INLINE_ unsigned int read_option_register(struct mp_port *mtpt, int offset);
-static int sb1054_get_register(struct sb_uart_port * port, int page, int reg);
-static int sb1054_set_register(struct sb_uart_port * port, int page, int reg, int value);
-static void SendATCommand(struct mp_port * mtpt);
-static int set_deep_fifo(struct sb_uart_port * port, int status);
-static int get_deep_fifo(struct sb_uart_port * port);
+static int sb1054_get_register(struct sb_uart_port *port, int page, int reg);
+static int sb1054_set_register(struct sb_uart_port *port, int page, int reg, int value);
+static void SendATCommand(struct mp_port *mtpt);
+static int set_deep_fifo(struct sb_uart_port *port, int status);
+static int get_deep_fifo(struct sb_uart_port *port);
 static int get_device_type(int arg);
 static int set_auto_rts(struct sb_uart_port *port, int status);
 static void mp_stop(struct tty_struct *tty);
@@ -38,7 +38,7 @@
 static int mp_put_char(struct tty_struct *tty, unsigned char ch);
 
 static void mp_put_chars(struct tty_struct *tty);
-static int mp_write(struct tty_struct *tty, const unsigned char * buf, int count);
+static int mp_write(struct tty_struct *tty, const unsigned char *buf, int count);
 static int mp_write_room(struct tty_struct *tty);
 static int mp_chars_in_buffer(struct tty_struct *tty);
 static void mp_flush_buffer(struct tty_struct *tty);
@@ -102,7 +102,7 @@
 static int multi_request_port(struct sb_uart_port *port);
 static void multi_config_port(struct sb_uart_port *port, int flags);
 static int multi_verify_port(struct sb_uart_port *port, struct serial_struct *ser);
-static const char * multi_type(struct sb_uart_port *port);
+static const char *multi_type(struct sb_uart_port *port);
 static void __init multi_init_ports(void);
 static void __init multi_register_ports(struct uart_driver *drv);
 static int init_mp_dev(struct pci_dev *pcidev, mppcibrd_t brd);
@@ -173,7 +173,7 @@
 	return (interface);
 }
 		
-static int sb1054_get_register(struct sb_uart_port * port, int page, int reg)
+static int sb1054_get_register(struct sb_uart_port *port, int page, int reg)
 {
 	int ret = 0;
 	unsigned int lcr = 0;
@@ -235,7 +235,7 @@
 	return ret;
 }
 
-static int sb1054_set_register(struct sb_uart_port * port, int page, int reg, int value)
+static int sb1054_set_register(struct sb_uart_port *port, int page, int reg, int value)
 {  
 	int lcr = 0;
 	int mcr = 0;
@@ -332,7 +332,7 @@
 	return 0;
 }
 
-static void SendATCommand(struct mp_port * mtpt)
+static void SendATCommand(struct mp_port *mtpt)
 {
 	//		      a    t	cr   lf
 	unsigned char ch[] = {0x61,0x74,0x0d,0x0a,0x0};
@@ -360,7 +360,7 @@
 
 }// end of SendATCommand()
 
-static int set_deep_fifo(struct sb_uart_port * port, int status)
+static int set_deep_fifo(struct sb_uart_port *port, int status)
 {
 	int afr_status = 0;
 	afr_status = sb1054_get_register(port, PAGE_4, SB105X_AFR);
@@ -416,7 +416,7 @@
         }
 
 }
-static int get_deep_fifo(struct sb_uart_port * port)
+static int get_deep_fifo(struct sb_uart_port *port)
 {
 	int afr_status = 0;
 	afr_status = sb1054_get_register(port, PAGE_4, SB105X_AFR);
@@ -638,7 +638,7 @@
 	mp_start(tty);
 }
 
-static int mp_write(struct tty_struct *tty, const unsigned char * buf, int count)
+static int mp_write(struct tty_struct *tty, const unsigned char *buf, int count)
 {
 	struct sb_uart_state *state = tty->driver_data;
 	struct sb_uart_port *port;
@@ -2754,7 +2754,7 @@
 	return 0;
 }
 
-static const char * multi_type(struct sb_uart_port *port)
+static const char *multi_type(struct sb_uart_port *port)
 {
 	int type = port->type;
 
@@ -2800,7 +2800,7 @@
 	int i,j,k;
 	unsigned char osc;
 	unsigned char b_ret = 0;
-	static struct mp_device_t * sbdev; 
+	static struct mp_device_t *sbdev; 
 
 	if (!first)
 		return;
@@ -2918,10 +2918,10 @@
 
 static int init_mp_dev(struct pci_dev *pcidev, mppcibrd_t brd)
 {
-	static struct mp_device_t * sbdev = mp_devs;
+	static struct mp_device_t *sbdev = mp_devs;
 	unsigned long addr = 0;
 	int j;
-	struct resource * ret = NULL;
+	struct resource *ret = NULL;
 
 	sbdev->device_id = brd.device_id;
 	pci_read_config_byte(pcidev, PCI_CLASS_REVISION, &(sbdev->revision));

diff --git a/drivers/staging/sb105x/sb_pci_mp.h b/drivers/staging/sb105x/sb_pci_mp.h
index a15f470a..11d9299 100644
--- a/drivers/staging/sb105x/sb_pci_mp.h
+++ b/drivers/staging/sb105x/sb_pci_mp.h

@@ -19,7 +19,6 @@
 #include <asm/uaccess.h>
 #include <asm/io.h>
 #include <asm/irq.h>
-#include <asm/segment.h>
 #include <asm/serial.h>
 #include <linux/interrupt.h>
 

diff --git a/drivers/staging/serqt_usb2/serqt_usb2.c b/drivers/staging/serqt_usb2/serqt_usb2.c
index 8a6e5ea..73fc3cc 100644
--- a/drivers/staging/serqt_usb2/serqt_usb2.c
+++ b/drivers/staging/serqt_usb2/serqt_usb2.c

@@ -155,10 +155,10 @@
 	struct urb *read_urb;	/* read URB for this port */
 	struct urb *int_urb;
 
-	__u8 shadowLCR;		/* last LCR value received */
-	__u8 shadowMCR;		/* last MCR value received */
-	__u8 shadowMSR;		/* last MSR value received */
-	__u8 shadowLSR;		/* last LSR value received */
+	__u8 shadow_lcr;		/* last LCR value received */
+	__u8 shadow_mcr;		/* last MCR value received */
+	__u8 shadow_msr;		/* last MSR value received */
+	__u8 shadow_lsr;		/* last LSR value received */
 	char open_ports;
 
 	/* Used for TIOCMIWAIT */
@@ -170,12 +170,12 @@
 	struct async_icount icount;
 
 	struct usb_serial_port *port;	/* owner of this object */
-	struct qt_get_device_data DeviceData;
+	struct qt_get_device_data device_data;
 	struct mutex lock;
 	bool read_urb_busy;
-	int RxHolding;
-	int ReadBulkStopped;
-	char closePending;
+	int rx_holding;
+	int read_bulk_stopped;
+	char close_pending;
 };
 
 static int port_paranoia_check(struct usb_serial_port *port,
@@ -238,24 +238,24 @@
 	return port->serial;
 }
 
-static void ProcessLineStatus(struct quatech_port *qt_port,
+static void process_line_status(struct quatech_port *qt_port,
 			      unsigned char line_status)
 {
 
-	qt_port->shadowLSR =
+	qt_port->shadow_lsr =
 	    line_status & (SERIAL_LSR_OE | SERIAL_LSR_PE | SERIAL_LSR_FE |
 			   SERIAL_LSR_BI);
 }
 
-static void ProcessModemStatus(struct quatech_port *qt_port,
+static void process_modem_status(struct quatech_port *qt_port,
 			       unsigned char modem_status)
 {
 
-	qt_port->shadowMSR = modem_status;
+	qt_port->shadow_msr = modem_status;
 	wake_up_interruptible(&qt_port->wait);
 }
 
-static void ProcessRxChar(struct usb_serial_port *port, unsigned char data)
+static void process_rx_char(struct usb_serial_port *port, unsigned char data)
 {
 	struct urb *urb = port->read_urb;
 	if (urb->actual_length)
@@ -291,35 +291,35 @@
 {
 	int flag, i;
 	unsigned char *data = urb->transfer_buffer;
-	unsigned int RxCount = urb->actual_length;
+	unsigned int rx_count = urb->actual_length;
 
-	for (i = 0; i < RxCount; ++i) {
+	for (i = 0; i < rx_count; ++i) {
 		/* Look ahead code here */
-		if ((i <= (RxCount - 3)) && (data[i] == 0x1b)
+		if ((i <= (rx_count - 3)) && (data[i] == 0x1b)
 		    && (data[i + 1] == 0x1b)) {
 			flag = 0;
 			switch (data[i + 2]) {
 			case 0x00:
-				if (i > (RxCount - 4)) {
+				if (i > (rx_count - 4)) {
 					dev_dbg(&port->dev,
 						"Illegal escape seuences in received data\n");
 					break;
 				}
 
-				ProcessLineStatus(qt_port, data[i + 3]);
+				process_line_status(qt_port, data[i + 3]);
 
 				i += 3;
 				flag = 1;
 				break;
 
 			case 0x01:
-				if (i > (RxCount - 4)) {
+				if (i > (rx_count - 4)) {
 					dev_dbg(&port->dev,
 						"Illegal escape seuences in received data\n");
 					break;
 				}
 
-				ProcessModemStatus(qt_port, data[i + 3]);
+				process_modem_status(qt_port, data[i + 3]);
 
 				i += 3;
 				flag = 1;
@@ -328,8 +328,8 @@
 			case 0xff:
 				dev_dbg(&port->dev, "No status sequence.\n");
 
-				ProcessRxChar(port, data[i]);
-				ProcessRxChar(port, data[i + 1]);
+				process_rx_char(port, data[i]);
+				process_rx_char(port, data[i + 1]);
 
 				i += 2;
 				break;
@@ -354,7 +354,7 @@
 	int result;
 
 	if (urb->status) {
-		qt_port->ReadBulkStopped = 1;
+		qt_port->read_bulk_stopped = 1;
 		dev_dbg(&urb->dev->dev,
 			"%s - nonzero write bulk status received: %d\n",
 			__func__, urb->status);
@@ -362,36 +362,36 @@
 	}
 
 	dev_dbg(&port->dev,
-		"%s - port->RxHolding = %d\n", __func__, qt_port->RxHolding);
+		"%s - port->rx_holding = %d\n", __func__, qt_port->rx_holding);
 
 	if (port_paranoia_check(port, __func__) != 0) {
-		qt_port->ReadBulkStopped = 1;
+		qt_port->read_bulk_stopped = 1;
 		return;
 	}
 
 	if (!serial)
 		return;
 
-	if (qt_port->closePending == 1) {
+	if (qt_port->close_pending == 1) {
 		/* Were closing , stop reading */
 		dev_dbg(&port->dev,
-			"%s - (qt_port->closepending == 1\n", __func__);
-		qt_port->ReadBulkStopped = 1;
+			"%s - (qt_port->close_pending == 1\n", __func__);
+		qt_port->read_bulk_stopped = 1;
 		return;
 	}
 
 	/*
-	 * RxHolding is asserted by throttle, if we assert it, we're not
+	 * rx_holding is asserted by throttle, if we assert it, we're not
 	 * receiving any more characters and let the box handle the flow
 	 * control
 	 */
-	if (qt_port->RxHolding == 1) {
-		qt_port->ReadBulkStopped = 1;
+	if (qt_port->rx_holding == 1) {
+		qt_port->read_bulk_stopped = 1;
 		return;
 	}
 
 	if (urb->status) {
-		qt_port->ReadBulkStopped = 1;
+		qt_port->read_bulk_stopped = 1;
 
 		dev_dbg(&port->dev,
 			"%s - nonzero read bulk status received: %d\n",
@@ -455,10 +455,10 @@
 }
 
 /****************************************************************************
- *  BoxSetPrebufferLevel
+ *  box_set_prebuffer_level
    TELLS BOX WHEN TO ASSERT FLOW CONTROL
  ****************************************************************************/
-static int BoxSetPrebufferLevel(struct usb_serial *serial)
+static int box_set_prebuffer_level(struct usb_serial *serial)
 {
 	int result;
 	__u16 buffer_length;
@@ -471,10 +471,10 @@
 }
 
 /****************************************************************************
- *  BoxSetATC
+ *  box_set_atc
    TELLS BOX WHEN TO ASSERT automatic transmitter control
    ****************************************************************************/
-static int BoxSetATC(struct usb_serial *serial, __u16 n_Mode)
+static int box_set_atc(struct usb_serial *serial, __u16 n_mode)
 {
 	int result;
 	__u16 buffer_length;
@@ -483,7 +483,7 @@
 
 	result =
 	    usb_control_msg(serial->dev, usb_sndctrlpipe(serial->dev, 0),
-			    QT_SET_ATF, 0x40, n_Mode, 0, NULL, 0, 300);
+			    QT_SET_ATF, 0x40, n_mode, 0, NULL, 0, 300);
 
 	return result;
 }
@@ -499,42 +499,42 @@
 {
 	int result;
 	__u16 length;
-	__u16 PortSettings;
+	__u16 port_settings;
 
-	PortSettings = ((__u16) (device_data->portb));
-	PortSettings = (PortSettings << 8);
-	PortSettings += ((__u16) (device_data->porta));
+	port_settings = ((__u16) (device_data->portb));
+	port_settings = (port_settings << 8);
+	port_settings += ((__u16) (device_data->porta));
 
 	length = sizeof(struct qt_get_device_data);
 
 	result = usb_control_msg(serial->dev, usb_sndctrlpipe(serial->dev, 0),
-				 QT_SET_GET_DEVICE, 0x40, PortSettings,
+				 QT_SET_GET_DEVICE, 0x40, port_settings,
 				 0, NULL, 0, 300);
 	return result;
 }
 
-static int qt_open_channel(struct usb_serial *serial, __u16 Uart_Number,
-			   struct qt_open_channel_data *pDeviceData)
+static int qt_open_channel(struct usb_serial *serial, __u16 uart_num,
+			   struct qt_open_channel_data *pdevice_data)
 {
 	int result;
 
 	result = usb_control_msg(serial->dev, usb_rcvctrlpipe(serial->dev, 0),
 				 QT_OPEN_CLOSE_CHANNEL,
-				 USBD_TRANSFER_DIRECTION_IN, 1, Uart_Number,
-				 pDeviceData,
+				 USBD_TRANSFER_DIRECTION_IN, 1, uart_num,
+				 pdevice_data,
 				 sizeof(struct qt_open_channel_data), 300);
 
 	return result;
 
 }
 
-static int qt_close_channel(struct usb_serial *serial, __u16 Uart_Number)
+static int qt_close_channel(struct usb_serial *serial, __u16 uart_num)
 {
 	int result;
 
 	result = usb_control_msg(serial->dev, usb_rcvctrlpipe(serial->dev, 0),
 				 QT_OPEN_CLOSE_CHANNEL,
-				 USBD_TRANSFER_DIRECTION_OUT, 0, Uart_Number,
+				 USBD_TRANSFER_DIRECTION_OUT, 0, uart_num,
 				 NULL, 0, 300);
 
 	return result;
@@ -542,12 +542,12 @@
 }
 
 /****************************************************************************
-* BoxGetRegister
+* box_get_register
 *	issuse a GET_REGISTER vendor-spcific request on the default control pipe
-*	If successful, fills in the  pValue with the register value asked for
+*	If successful, fills in the  p_value with the register value asked for
 ****************************************************************************/
-static int BoxGetRegister(struct usb_serial *serial, unsigned short Uart_Number,
-			  unsigned short Register_Num, __u8 *pValue)
+static int box_get_register(struct usb_serial *serial, unsigned short uart_num,
+			  unsigned short register_num, __u8 *p_value)
 {
 	int result;
 	__u16 current_length;
@@ -556,36 +556,36 @@
 
 	result =
 	    usb_control_msg(serial->dev, usb_rcvctrlpipe(serial->dev, 0),
-			    QT_GET_SET_REGISTER, 0xC0, Register_Num,
-			    Uart_Number, (void *)pValue, sizeof(*pValue), 300);
+			    QT_GET_SET_REGISTER, 0xC0, register_num,
+			    uart_num, (void *)p_value, sizeof(*p_value), 300);
 
 	return result;
 }
 
 /****************************************************************************
-* BoxSetRegister
+* box_set_register
 *	issuse a GET_REGISTER vendor-spcific request on the default control pipe
-*	If successful, fills in the  pValue with the register value asked for
+*	If successful, fills in the  p_value with the register value asked for
 ****************************************************************************/
-static int BoxSetRegister(struct usb_serial *serial, unsigned short Uart_Number,
-			  unsigned short Register_Num, unsigned short Value)
+static int box_set_register(struct usb_serial *serial, unsigned short uart_num,
+			  unsigned short register_num, unsigned short value)
 {
 	int result;
-	unsigned short RegAndByte;
+	unsigned short reg_and_byte;
 
-	RegAndByte = Value;
-	RegAndByte = RegAndByte << 8;
-	RegAndByte = RegAndByte + Register_Num;
+	reg_and_byte = value;
+	reg_and_byte = reg_and_byte << 8;
+	reg_and_byte = reg_and_byte + register_num;
 
 /*
 	result = usb_control_msg(serial->dev, usb_sndctrlpipe(serial->dev, 0),
-				 QT_GET_SET_REGISTER, 0xC0, Register_Num,
-				 Uart_Number, NULL, 0, 300);
+				 QT_GET_SET_REGISTER, 0xC0, register_num,
+				 uart_num, NULL, 0, 300);
 */
 
 	result =
 	    usb_control_msg(serial->dev, usb_sndctrlpipe(serial->dev, 0),
-			    QT_GET_SET_REGISTER, 0x40, RegAndByte, Uart_Number,
+			    QT_GET_SET_REGISTER, 0x40, reg_and_byte, uart_num,
 			    NULL, 0, 300);
 
 	return result;
@@ -596,30 +596,30 @@
  * issues a SET_UART vendor-specific request on the default control pipe
  * If successful sets baud rate divisor and LCR value
  */
-static int qt_setuart(struct usb_serial *serial, unsigned short Uart_Number,
-		      unsigned short default_divisor, unsigned char default_LCR)
+static int qt_setuart(struct usb_serial *serial, unsigned short uart_num,
+		      unsigned short default_divisor, unsigned char default_lcr)
 {
 	int result;
-	unsigned short UartNumandLCR;
+	unsigned short uart_num_and_lcr;
 
-	UartNumandLCR = (default_LCR << 8) + Uart_Number;
+	uart_num_and_lcr = (default_lcr << 8) + uart_num;
 
 	result =
 	    usb_control_msg(serial->dev, usb_sndctrlpipe(serial->dev, 0),
 			    QT_GET_SET_UART, 0x40, default_divisor,
-			    UartNumandLCR, NULL, 0, 300);
+			    uart_num_and_lcr, NULL, 0, 300);
 
 	return result;
 }
 
-static int BoxSetHW_FlowCtrl(struct usb_serial *serial, unsigned int index,
-			     int bSet)
+static int box_set_hw_flow_ctrl(struct usb_serial *serial, unsigned int index,
+			     int b_set)
 {
 	__u8 mcr = 0;
-	__u8 msr = 0, MOUT_Value = 0;
+	__u8 msr = 0, mout_value = 0;
 	unsigned int status;
 
-	if (bSet == 1) {
+	if (b_set == 1) {
 		/* flow control, box will clear RTS line to prevent remote */
 		mcr = SERIAL_MCR_RTS;
 	} /* device from xmitting more chars */
@@ -628,9 +628,9 @@
 		mcr = 0;
 
 	}
-	MOUT_Value = mcr << 8;
+	mout_value = mcr << 8;
 
-	if (bSet == 1) {
+	if (b_set == 1) {
 		/* flow control, box will inhibit xmit data if CTS line is
 		 * asserted */
 		msr = SERIAL_MSR_CTS;
@@ -638,34 +638,34 @@
 		/* Box will not inhimbe xmit data due to CTS line */
 		msr = 0;
 	}
-	MOUT_Value |= msr;
+	mout_value |= msr;
 
 	status =
 	    usb_control_msg(serial->dev, usb_sndctrlpipe(serial->dev, 0),
-			    QT_HW_FLOW_CONTROL_MASK, 0x40, MOUT_Value,
+			    QT_HW_FLOW_CONTROL_MASK, 0x40, mout_value,
 			    index, NULL, 0, 300);
 	return status;
 
 }
 
-static int BoxSetSW_FlowCtrl(struct usb_serial *serial, __u16 index,
+static int box_set_sw_flow_ctrl(struct usb_serial *serial, __u16 index,
 			     unsigned char stop_char, unsigned char start_char)
 {
-	__u16 nSWflowout;
+	__u16 n_sw_flow_out;
 	int result;
 
-	nSWflowout = start_char << 8;
-	nSWflowout = (unsigned short)stop_char;
+	n_sw_flow_out = start_char << 8;
+	n_sw_flow_out = (unsigned short)stop_char;
 
 	result =
 	    usb_control_msg(serial->dev, usb_sndctrlpipe(serial->dev, 0),
-			    QT_SW_FLOW_CONTROL_MASK, 0x40, nSWflowout,
+			    QT_SW_FLOW_CONTROL_MASK, 0x40, n_sw_flow_out,
 			    index, NULL, 0, 300);
 	return result;
 
 }
 
-static int BoxDisable_SW_FlowCtrl(struct usb_serial *serial, __u16 index)
+static int box_disable_sw_flow_ctrl(struct usb_serial *serial, __u16 index)
 {
 	int result;
 
@@ -682,7 +682,7 @@
 	struct device *dev = &serial->dev->dev;
 	struct usb_serial_port *port;
 	struct quatech_port *qt_port;
-	struct qt_get_device_data DeviceData;
+	struct qt_get_device_data device_data;
 	int i;
 	int status;
 
@@ -704,22 +704,22 @@
 
 	}
 
-	status = qt_get_device(serial, &DeviceData);
+	status = qt_get_device(serial, &device_data);
 	if (status < 0)
 		goto startup_error;
 
-	dev_dbg(dev, "DeviceData.portb = 0x%x\n", DeviceData.portb);
+	dev_dbg(dev, "device_data.portb = 0x%x\n", device_data.portb);
 
-	DeviceData.portb &= ~FULLPWRBIT;
-	dev_dbg(dev, "Changing DeviceData.portb to 0x%x\n", DeviceData.portb);
+	device_data.portb &= ~FULLPWRBIT;
+	dev_dbg(dev, "Changing device_data.portb to 0x%x\n", device_data.portb);
 
-	status = qt_set_device(serial, &DeviceData);
+	status = qt_set_device(serial, &device_data);
 	if (status < 0) {
 		dev_dbg(dev, "qt_set_device failed\n");
 		goto startup_error;
 	}
 
-	status = qt_get_device(serial, &DeviceData);
+	status = qt_get_device(serial, &device_data);
 	if (status < 0) {
 		dev_dbg(dev, "qt_get_device failed\n");
 		goto startup_error;
@@ -734,10 +734,10 @@
 	case QUATECH_HSU100B:
 	case QUATECH_HSU100C:
 	case QUATECH_HSU100D:
-		DeviceData.porta &= ~(RR_BITS | DUPMODE_BITS);
-		DeviceData.porta |= CLKS_X4;
-		DeviceData.portb &= ~(LOOPMODE_BITS);
-		DeviceData.portb |= RS232_MODE;
+		device_data.porta &= ~(RR_BITS | DUPMODE_BITS);
+		device_data.porta |= CLKS_X4;
+		device_data.portb &= ~(LOOPMODE_BITS);
+		device_data.portb |= RS232_MODE;
 		break;
 
 	case QUATECH_SSU200:
@@ -749,38 +749,38 @@
 	case QUATECH_HSU200B:
 	case QUATECH_HSU200C:
 	case QUATECH_HSU200D:
-		DeviceData.porta &= ~(RR_BITS | DUPMODE_BITS);
-		DeviceData.porta |= CLKS_X4;
-		DeviceData.portb &= ~(LOOPMODE_BITS);
-		DeviceData.portb |= ALL_LOOPBACK;
+		device_data.porta &= ~(RR_BITS | DUPMODE_BITS);
+		device_data.porta |= CLKS_X4;
+		device_data.portb &= ~(LOOPMODE_BITS);
+		device_data.portb |= ALL_LOOPBACK;
 		break;
 	default:
-		DeviceData.porta &= ~(RR_BITS | DUPMODE_BITS);
-		DeviceData.porta |= CLKS_X4;
-		DeviceData.portb &= ~(LOOPMODE_BITS);
-		DeviceData.portb |= RS232_MODE;
+		device_data.porta &= ~(RR_BITS | DUPMODE_BITS);
+		device_data.porta |= CLKS_X4;
+		device_data.portb &= ~(LOOPMODE_BITS);
+		device_data.portb |= RS232_MODE;
 		break;
 
 	}
 
-	status = BoxSetPrebufferLevel(serial);	/* sets to default value */
+	status = box_set_prebuffer_level(serial);	/* sets to default value */
 	if (status < 0) {
-		dev_dbg(dev, "BoxSetPrebufferLevel failed\n");
+		dev_dbg(dev, "box_set_prebuffer_level failed\n");
 		goto startup_error;
 	}
 
-	status = BoxSetATC(serial, ATC_DISABLED);
+	status = box_set_atc(serial, ATC_DISABLED);
 	if (status < 0) {
-		dev_dbg(dev, "BoxSetATC failed\n");
+		dev_dbg(dev, "box_set_atc failed\n");
 		goto startup_error;
 	}
 
-	dev_dbg(dev, "DeviceData.portb = 0x%x\n", DeviceData.portb);
+	dev_dbg(dev, "device_data.portb = 0x%x\n", device_data.portb);
 
-	DeviceData.portb |= NEXT_BOARD_POWER_BIT;
-	dev_dbg(dev, "Changing DeviceData.portb to 0x%x\n", DeviceData.portb);
+	device_data.portb |= NEXT_BOARD_POWER_BIT;
+	dev_dbg(dev, "Changing device_data.portb to 0x%x\n", device_data.portb);
 
-	status = qt_set_device(serial, &DeviceData);
+	status = qt_set_device(serial, &device_data);
 	if (status < 0) {
 		dev_dbg(dev, "qt_set_device failed\n");
 		goto startup_error;
@@ -848,7 +848,7 @@
 	struct usb_serial *serial;
 	struct quatech_port *quatech_port;
 	struct quatech_port *port0;
-	struct qt_open_channel_data ChannelData;
+	struct qt_open_channel_data channel_data;
 
 	int result;
 
@@ -870,10 +870,10 @@
 	usb_clear_halt(serial->dev, port->read_urb->pipe);
 	port0->open_ports++;
 
-	result = qt_get_device(serial, &port0->DeviceData);
+	result = qt_get_device(serial, &port0->device_data);
 
 	/* Port specific setups */
-	result = qt_open_channel(serial, port->number, &ChannelData);
+	result = qt_open_channel(serial, port->port_number, &channel_data);
 	if (result < 0) {
 		dev_dbg(&port->dev, "qt_open_channel failed\n");
 		return result;
@@ -881,14 +881,14 @@
 	dev_dbg(&port->dev, "qt_open_channel completed.\n");
 
 /* FIXME: are these needed?  Does it even do anything useful? */
-	quatech_port->shadowLSR = ChannelData.line_status &
+	quatech_port->shadow_lsr = channel_data.line_status &
 	    (SERIAL_LSR_OE | SERIAL_LSR_PE | SERIAL_LSR_FE | SERIAL_LSR_BI);
 
-	quatech_port->shadowMSR = ChannelData.modem_status &
+	quatech_port->shadow_msr = channel_data.modem_status &
 	    (SERIAL_MSR_CTS | SERIAL_MSR_DSR | SERIAL_MSR_RI | SERIAL_MSR_CD);
 
 	/* Set Baud rate to default and turn off (default)flow control here */
-	result = qt_setuart(serial, port->number, DEFAULT_DIVISOR, DEFAULT_LCR);
+	result = qt_setuart(serial, port->port_number, DEFAULT_DIVISOR, DEFAULT_LCR);
 	if (result < 0) {
 		dev_dbg(&port->dev, "qt_setuart failed\n");
 		return result;
@@ -906,8 +906,7 @@
 			qt_submit_urb_from_open(serial, port);
 	}
 
-	dev_dbg(&port->dev, "port number is %d\n", port->number);
-	dev_dbg(&port->dev, "serial number is %d\n", port->serial->minor);
+	dev_dbg(&port->dev, "minor number is %d\n", port->minor);
 	dev_dbg(&port->dev,
 		"Bulkin endpoint is %d\n", port->bulk_in_endpointAddress);
 	dev_dbg(&port->dev,
@@ -1003,7 +1002,7 @@
 	status = 0;
 
 	tty = tty_port_tty_get(&port->port);
-	index = tty->index - serial->minor;
+	index = port->port_number;
 
 	qt_port = qt_get_port_private(port);
 	port0 = qt_get_port_private(serial->port[0]);
@@ -1022,14 +1021,11 @@
 	/* Close uart channel */
 	status = qt_close_channel(serial, index);
 	if (status < 0)
-		dev_dbg(&port->dev,
-			"%s - port %d qt_close_channel failed.\n",
-			__func__, port->number);
+		dev_dbg(&port->dev, "%s - qt_close_channel failed.\n", __func__);
 
 	port0->open_ports--;
 
-	dev_dbg(&port->dev, "qt_num_open_ports in close%d:in port%d\n",
-		port0->open_ports, port->number);
+	dev_dbg(&port->dev, "qt_num_open_ports in close%d\n", port0->open_ports);
 
 	if (port0->open_ports == 0) {
 		if (serial->port[0]->interrupt_in_urb) {
@@ -1133,12 +1129,11 @@
 {
 	struct usb_serial_port *port = tty->driver_data;
 	struct quatech_port *qt_port = qt_get_port_private(port);
-	struct usb_serial *serial = get_usb_serial(port, __func__);
 	unsigned int index;
 
 	dev_dbg(&port->dev, "%s cmd 0x%04x\n", __func__, cmd);
 
-	index = tty->index - serial->minor;
+	index = port->port_number;
 
 	if (cmd == TIOCMIWAIT) {
 		while (qt_port != NULL) {
@@ -1169,8 +1164,7 @@
 		return 0;
 	}
 
-	dev_dbg(&port->dev, "%s -No ioctl for that one.  port = %d\n",
-		__func__, port->number);
+	dev_dbg(&port->dev, "%s -No ioctl for that one.\n", __func__);
 	return -ENOIOCTLCMD;
 }
 
@@ -1179,43 +1173,43 @@
 			   struct ktermios *old_termios)
 {
 	struct ktermios *termios = &tty->termios;
-	unsigned char new_LCR = 0;
+	unsigned char new_lcr = 0;
 	unsigned int cflag = termios->c_cflag;
 	unsigned int index;
 	int baud, divisor, remainder;
 	int status;
 
-	index = tty->index - port->serial->minor;
+	index = port->port_number;
 
 	switch (cflag & CSIZE) {
 	case CS5:
-		new_LCR |= SERIAL_5_DATA;
+		new_lcr |= SERIAL_5_DATA;
 		break;
 	case CS6:
-		new_LCR |= SERIAL_6_DATA;
+		new_lcr |= SERIAL_6_DATA;
 		break;
 	case CS7:
-		new_LCR |= SERIAL_7_DATA;
+		new_lcr |= SERIAL_7_DATA;
 		break;
 	default:
 		termios->c_cflag &= ~CSIZE;
 		termios->c_cflag |= CS8;
 	case CS8:
-		new_LCR |= SERIAL_8_DATA;
+		new_lcr |= SERIAL_8_DATA;
 		break;
 	}
 
 	/* Parity stuff */
 	if (cflag & PARENB) {
 		if (cflag & PARODD)
-			new_LCR |= SERIAL_ODD_PARITY;
+			new_lcr |= SERIAL_ODD_PARITY;
 		else
-			new_LCR |= SERIAL_EVEN_PARITY;
+			new_lcr |= SERIAL_EVEN_PARITY;
 	}
 	if (cflag & CSTOPB)
-		new_LCR |= SERIAL_TWO_STOPB;
+		new_lcr |= SERIAL_TWO_STOPB;
 	else
-		new_LCR |= SERIAL_ONE_STOPB;
+		new_lcr |= SERIAL_ONE_STOPB;
 
 	dev_dbg(&port->dev, "%s - 4\n", __func__);
 
@@ -1237,7 +1231,7 @@
 	 * Set Baud rate to default and turn off (default)flow control here
 	 */
 	status =
-	    qt_setuart(port->serial, index, (unsigned short)divisor, new_LCR);
+	    qt_setuart(port->serial, index, (unsigned short)divisor, new_lcr);
 	if (status < 0) {
 		dev_dbg(&port->dev, "qt_setuart failed\n");
 		return;
@@ -1245,25 +1239,23 @@
 
 	/* Now determine flow control */
 	if (cflag & CRTSCTS) {
-		dev_dbg(&port->dev, "%s - Enabling HW flow control port %d\n",
-			__func__, port->number);
+		dev_dbg(&port->dev, "%s - Enabling HW flow control\n", __func__);
 
 		/* Enable RTS/CTS flow control */
-		status = BoxSetHW_FlowCtrl(port->serial, index, 1);
+		status = box_set_hw_flow_ctrl(port->serial, index, 1);
 
 		if (status < 0) {
-			dev_dbg(&port->dev, "BoxSetHW_FlowCtrl failed\n");
+			dev_dbg(&port->dev, "box_set_hw_flow_ctrl failed\n");
 			return;
 		}
 	} else {
 		/* Disable RTS/CTS flow control */
 		dev_dbg(&port->dev,
-			"%s - disabling HW flow control port %d\n",
-			__func__, port->number);
+			"%s - disabling HW flow control\n", __func__);
 
-		status = BoxSetHW_FlowCtrl(port->serial, index, 0);
+		status = box_set_hw_flow_ctrl(port->serial, index, 0);
 		if (status < 0) {
-			dev_dbg(&port->dev, "BoxSetHW_FlowCtrl failed\n");
+			dev_dbg(&port->dev, "box_set_hw_flow_ctrl failed\n");
 			return;
 		}
 
@@ -1275,18 +1267,18 @@
 		unsigned char stop_char = STOP_CHAR(tty);
 		unsigned char start_char = START_CHAR(tty);
 		status =
-		    BoxSetSW_FlowCtrl(port->serial, index, stop_char,
+		    box_set_sw_flow_ctrl(port->serial, index, stop_char,
 				      start_char);
 		if (status < 0)
 			dev_dbg(&port->dev,
-				"BoxSetSW_FlowCtrl (enabled) failed\n");
+				"box_set_sw_flow_ctrl (enabled) failed\n");
 
 	} else {
 		/* disable SW flow control */
-		status = BoxDisable_SW_FlowCtrl(port->serial, index);
+		status = box_disable_sw_flow_ctrl(port->serial, index);
 		if (status < 0)
 			dev_dbg(&port->dev,
-				"BoxSetSW_FlowCtrl (diabling) failed\n");
+				"box_set_sw_flow_ctrl (diabling) failed\n");
 
 	}
 	termios->c_cflag &= ~CMSPAR;
@@ -1303,7 +1295,7 @@
 	u16 index, onoff;
 	unsigned int result;
 
-	index = tty->index - serial->minor;
+	index = port->port_number;
 
 	qt_port = qt_get_port_private(port);
 
@@ -1332,12 +1324,12 @@
 	int status;
 	unsigned int index;
 
-	index = tty->index - serial->minor;
+	index = port->port_number;
 	status =
-	    BoxGetRegister(port->serial, index, MODEM_CONTROL_REGISTER, &mcr);
+	    box_get_register(port->serial, index, MODEM_CONTROL_REGISTER, &mcr);
 	if (status >= 0) {
 		status =
-		    BoxGetRegister(port->serial, index,
+		    box_get_register(port->serial, index,
 				   MODEM_STATUS_REGISTER, &msr);
 
 	}
@@ -1371,9 +1363,9 @@
 	int status;
 	unsigned int index;
 
-	index = tty->index - serial->minor;
+	index = port->port_number;
 	status =
-	    BoxGetRegister(port->serial, index, MODEM_CONTROL_REGISTER, &mcr);
+	    box_get_register(port->serial, index, MODEM_CONTROL_REGISTER, &mcr);
 	if (status < 0)
 		return -ESPIPE;
 
@@ -1390,7 +1382,7 @@
 		mcr |= SERIAL_MCR_LOOP;
 
 	status =
-	    BoxSetRegister(port->serial, index, MODEM_CONTROL_REGISTER, mcr);
+	    box_set_register(port->serial, index, MODEM_CONTROL_REGISTER, mcr);
 	if (status < 0)
 		return -ESPIPE;
 	else
@@ -1445,7 +1437,7 @@
 	mutex_lock(&qt_port->lock);
 
 	/* pass on to the driver specific version of this function */
-	qt_port->RxHolding = 1;
+	qt_port->rx_holding = 1;
 
 	mutex_unlock(&qt_port->lock);
 }
@@ -1484,14 +1476,14 @@
 
 	mutex_lock(&qt_port->lock);
 
-	if (qt_port->RxHolding == 1) {
-		dev_dbg(&port->dev, "%s -qt_port->RxHolding == 1\n", __func__);
+	if (qt_port->rx_holding == 1) {
+		dev_dbg(&port->dev, "%s -qt_port->rx_holding == 1\n", __func__);
 
-		qt_port->RxHolding = 0;
-		dev_dbg(&port->dev, "%s - qt_port->RxHolding = 0\n", __func__);
+		qt_port->rx_holding = 0;
+		dev_dbg(&port->dev, "%s - qt_port->rx_holding = 0\n", __func__);
 
 		/* if we have a bulk endpoint, start it up */
-		if ((serial->num_bulk_in) && (qt_port->ReadBulkStopped == 1))
+		if ((serial->num_bulk_in) && (qt_port->read_bulk_stopped == 1))
 			qt_submit_urb_from_unthrottle(port, serial);
 	}
 	mutex_unlock(&qt_port->lock);

diff --git a/drivers/staging/silicom/bp_mod.h b/drivers/staging/silicom/bp_mod.h
index b8275f5..cfa1f43 100644
--- a/drivers/staging/silicom/bp_mod.h
+++ b/drivers/staging/silicom/bp_mod.h

@@ -15,8 +15,6 @@
 #define BP_MOD_H
 #include "bits.h"
 
-#define EXPORT_SYMBOL_NOVERS EXPORT_SYMBOL
-
 #define usec_delay(x) udelay(x)
 #ifndef msec_delay_bp
 #define msec_delay_bp(x)			\

diff --git a/drivers/staging/silicom/bpctl_mod.c b/drivers/staging/silicom/bpctl_mod.c
index b7e570c..4b3a1ae 100644
--- a/drivers/staging/silicom/bpctl_mod.c
+++ b/drivers/staging/silicom/bpctl_mod.c

@@ -35,7 +35,7 @@
 #define BP_MOD_DESCR "Silicom Bypass-SD Control driver"
 #define BP_SYNC_FLAG 1
 
-static int major_num = 0;
+static int major_num;
 
 MODULE_AUTHOR("Anna Lukin, annal@silicom.co.il");
 MODULE_LICENSE("GPL");
@@ -43,21 +43,16 @@
 MODULE_VERSION(BP_MOD_VER);
 spinlock_t bpvm_lock;
 
-#define lock_bpctl() 					\
-if (down_interruptible(&bpctl_sema)) {			\
-	return -ERESTARTSYS;				\
-}							\
-
-#define unlock_bpctl() 					\
+#define unlock_bpctl()					\
 	up(&bpctl_sema);
 
 /* Media Types */
-typedef enum {
-	bp_copper = 0,
-	bp_fiber,
-	bp_cx4,
-	bp_none,
-} bp_media_type;
+enum bp_media_type {
+	BP_COPPER = 0,
+	BP_FIBER,
+	BP_CX4,
+	BP_NONE,
+};
 
 struct bypass_pfs_sd {
 	char dir_name[32];
@@ -89,7 +84,7 @@
 	uint32_t reset_time;
 	uint8_t bp_status_un;
 	atomic_t wdt_busy;
-	bp_media_type media_type;
+	enum bp_media_type media_type;
 	int bp_tpl_flag;
 	struct timer_list bp_tpl_timer;
 	spinlock_t bypass_wr_lock;
@@ -112,7 +107,7 @@
 static bpctl_dev_t *bpctl_dev_arr;
 
 static struct semaphore bpctl_sema;
-static int device_num = 0;
+static int device_num;
 
 static int get_dev_idx(int ifindex);
 static bpctl_dev_t *get_master_port_fn(bpctl_dev_t *pbpctl_dev);
@@ -134,7 +129,7 @@
 			   unsigned long event, void *ptr)
 {
 	struct net_device *dev = ptr;
-	static bpctl_dev_t *pbpctl_dev = NULL, *pbpctl_dev_m = NULL;
+	static bpctl_dev_t *pbpctl_dev, *pbpctl_dev_m;
 	int dev_num = 0, ret = 0, ret_d = 0, time_left = 0;
 	/* printk("BP_PROC_SUPPORT event =%d %s %d\n", event,dev->name, dev->ifindex ); */
 	/* return NOTIFY_DONE; */
@@ -165,7 +160,8 @@
 			memcpy(&cbuf, drvinfo.bus_info, 32);
 			buf = &cbuf[0];
 
-			while (*buf++ != ':') ;
+			while (*buf++ != ':')
+				;
 			for (i = 0; i < 10; i++, buf++) {
 				if (*buf == ':')
 					break;
@@ -306,7 +302,8 @@
 		ctrl = BP10G_READ_REG(pbpctl_dev, ESDP);
 
 	if (pbpctl_dev->bp_10g9) {
-		if (!(pbpctl_dev_c = get_status_port_fn(pbpctl_dev)))
+		pbpctl_dev_c = get_status_port_fn(pbpctl_dev);
+		if (!pbpctl_dev_c)
 			return;
 		ctrl = BP10G_READ_REG(pbpctl_dev_c, ESDP);
 	}
@@ -606,7 +603,8 @@
 	if (pbpctl_dev->bp_540)
 		ctrl = BP10G_READ_REG(pbpctl_dev, ESDP);
 	if (pbpctl_dev->bp_10g9) {
-		if (!(pbpctl_dev_c = get_status_port_fn(pbpctl_dev)))
+		pbpctl_dev_c = get_status_port_fn(pbpctl_dev);
+		if (!pbpctl_dev_c)
 			return -1;
 		ctrl = BP10G_READ_REG(pbpctl_dev_c, ESDP);
 	}
@@ -720,16 +718,15 @@
 					 BP10G_MDIO_DATA_OUT));
 
 		}
-		if (pbpctl_dev->bp_10g9) {
+
+		if (pbpctl_dev->bp_10g9)
 			ctrl_ext = BP10G_READ_REG(pbpctl_dev, I2CCTL);
-
-		} else if ((pbpctl_dev->bp_fiber5) || (pbpctl_dev->bp_i80)) {
+		else if ((pbpctl_dev->bp_fiber5) || (pbpctl_dev->bp_i80))
 			ctrl_ext = BPCTL_READ_REG(pbpctl_dev, CTRL);
-		} else if (pbpctl_dev->bp_540) {
+		else if (pbpctl_dev->bp_540)
 			ctrl_ext = BP10G_READ_REG(pbpctl_dev, ESDP);
-		} else if (pbpctl_dev->bp_10gb)
+		else if (pbpctl_dev->bp_10gb)
 			ctrl_ext = BP10GB_READ_REG(pbpctl_dev, MISC_REG_SPIO);
-
 		else if (!pbpctl_dev->bp_10g)
 			ctrl_ext = BPCTL_READ_REG(pbpctl_dev, CTRL_EXT);
 		else
@@ -775,7 +772,8 @@
 	bpctl_dev_t *pbpctl_dev_c = NULL;
 	unsigned long flags;
 	if (pbpctl_dev->bp_10g9) {
-		if (!(pbpctl_dev_c = get_status_port_fn(pbpctl_dev)))
+		pbpctl_dev_c = get_status_port_fn(pbpctl_dev);
+		if (!pbpctl_dev_c)
 			return;
 	}
 	if ((pbpctl_dev->wdt_status == WDT_STATUS_EN) &&
@@ -953,7 +951,8 @@
 	atomic_set(&pbpctl_dev->wdt_busy, 1);
 #endif
 	if (pbpctl_dev->bp_10g9) {
-		if (!(pbpctl_dev_c = get_status_port_fn(pbpctl_dev)))
+		pbpctl_dev_c = get_status_port_fn(pbpctl_dev);
+		if (!pbpctl_dev_c)
 			return -1;
 	}
 
@@ -1224,7 +1223,8 @@
 		return -1;
 #endif
 	if (pbpctl_dev->bp_10g9) {
-		if (!(pbpctl_dev_c = get_status_port_fn(pbpctl_dev)))
+		pbpctl_dev_c = get_status_port_fn(pbpctl_dev);
+		if (!pbpctl_dev_c)
 			return -1;
 	}
 
@@ -1414,8 +1414,8 @@
 				(ctrl_ext &
 				 ~(BP10G_MCLK_DATA_OUT | BP10G_MDIO_DATA_OUT)));
 	}
-	if ((pbpctl_dev->wdt_status == WDT_STATUS_EN)	/*&&
-							   (pbpctl_dev->bp_ext_ver<PXG4BPFI_VER) */ )
+	if ((pbpctl_dev->wdt_status == WDT_STATUS_EN))
+		/*&& (pbpctl_dev->bp_ext_ver<PXG4BPFI_VER) */
 		pbpctl_dev->bypass_wdt_on_time = jiffies;
 #ifdef BP_SYNC_FLAG
 	spin_unlock_irqrestore(&pbpctl_dev->bypass_wr_lock, flags);
@@ -1744,7 +1744,8 @@
 {
 	bpctl_dev_t *pbpctl_dev_b = NULL;
 
-	if (!(pbpctl_dev_b = get_status_port_fn(pbpctl_dev)))
+	pbpctl_dev_b = get_status_port_fn(pbpctl_dev);
+	if (!pbpctl_dev_b)
 		return -1;
 	atomic_set(&pbpctl_dev->wdt_busy, 1);
 	write_data_port_int(pbpctl_dev, value & 0x3);
@@ -1920,13 +1921,10 @@
 		return BP_NOT_CAP;
 
 	if (pbpctl_dev_m->bp_caps_ex & DISC_PORT_CAP_EX) {
-		if (is_bypass_fn(pbpctl_dev) == 1) {
-
+		if (is_bypass_fn(pbpctl_dev) == 1)
 			write_data(pbpctl_dev_m, TX_DISA);
-		} else {
-
+		else
 			write_data(pbpctl_dev_m, TX_DISB);
-		}
 
 		msec_delay_bp(LATCH_DELAY);
 
@@ -1965,7 +1963,8 @@
 	int ret = 0, ctrl = 0;
 	bpctl_dev_t *pbpctl_dev_b = NULL;
 
-	if (!(pbpctl_dev_b = get_status_port_fn(pbpctl_dev)))
+	pbpctl_dev_b = get_status_port_fn(pbpctl_dev);
+	if (!pbpctl_dev_b)
 		return BP_NOT_CAP;
 
 	if (pbpctl_dev->bp_caps_ex & TPL2_CAP_EX) {
@@ -1992,7 +1991,8 @@
 	int ret = 0, ctrl = 0;
 	bpctl_dev_t *pbpctl_dev_b = NULL;
 
-	if (!(pbpctl_dev_b = get_status_port_fn(pbpctl_dev)))
+	pbpctl_dev_b = get_status_port_fn(pbpctl_dev);
+	if (!pbpctl_dev_b)
 		return BP_NOT_CAP;
 	if (pbpctl_dev->bp_caps_ex & TPL2_CAP_EX) {
 		cmnd_on(pbpctl_dev);
@@ -2017,9 +2017,9 @@
 	int ret = BP_NOT_CAP;
 
 	if (pbpctl_dev->bp_caps & WD_CTL_CAP) {
-		if (INTEL_IF_SERIES(pbpctl_dev->subdevice)) {
+		if (INTEL_IF_SERIES(pbpctl_dev->subdevice))
 			bypass_off(pbpctl_dev);
-		} else if (pbpctl_dev->bp_ext_ver >= PXG2BPI_VER)
+		else if (pbpctl_dev->bp_ext_ver >= PXG2BPI_VER)
 			write_data(pbpctl_dev, WDT_OFF);
 		else
 			data_pulse(pbpctl_dev, WDT_OFF);
@@ -2150,12 +2150,14 @@
 {
 	u16 mask = BPCTLI_SWFW_PHY0_SM;
 	u32 swfw_sync;
+	s32 ret_val;
 
 	if ((pbpctl_dev->func == 1) || (pbpctl_dev->func == 3))
 		mask = BPCTLI_SWFW_PHY1_SM;
 
-	while (bp75_get_hw_semaphore_generic(pbpctl_dev) != 0) ;
-	/* Empty */
+	do
+		ret_val = bp75_get_hw_semaphore_generic(pbpctl_dev);
+	while (ret_val != 0);
 
 	swfw_sync = BPCTL_READ_REG(pbpctl_dev, SW_FW_SYNC);
 	swfw_sync &= ~mask;
@@ -2404,12 +2406,10 @@
 			}
 
 		}
-		if (pbpctl_dev->bp_fiber5) {
+		if (pbpctl_dev->bp_fiber5)
 			ctrl = BPCTL_READ_REG(pbpctl_dev, CTRL_EXT);
-
-		} else if (pbpctl_dev->bp_10gb)
+		else if (pbpctl_dev->bp_10gb)
 			ctrl = BP10GB_READ_REG(pbpctl_dev, MISC_REG_GPIO);
-
 		else if (!pbpctl_dev->bp_10g)
 			ctrl = BPCTL_READ_REG(pbpctl_dev, CTRL);
 		else
@@ -3237,8 +3237,10 @@
 	uint32_t ctrl_ext = 0;
 	bpctl_dev_t *pbpctl_dev_b = NULL;
 
-	if ((pbpctl_dev->bp_caps & SW_CTL_CAP)
-	    && (pbpctl_dev_b = get_status_port_fn(pbpctl_dev))) {
+	if (pbpctl_dev->bp_caps & SW_CTL_CAP) {
+		pbpctl_dev_b = get_status_port_fn(pbpctl_dev);
+		if (!pbpctl_dev_b)
+			return BP_NOT_CAP;
 		ctrl_ext = BPCTL_READ_REG(pbpctl_dev_b, CTRL_EXT);
 		BPCTL_BP_WRITE_REG(pbpctl_dev_b, CTRL_EXT,
 				   (ctrl_ext & ~BPCTLI_CTRL_EXT_SDP7_DIR));
@@ -3254,9 +3256,10 @@
 {
 	bpctl_dev_t *pbpctl_dev_b = NULL;
 
-	if ((pbpctl_dev->bp_caps & SW_CTL_CAP)
-	    && (pbpctl_dev_b = get_status_port_fn(pbpctl_dev))) {
-
+	if (pbpctl_dev->bp_caps & SW_CTL_CAP) {
+		pbpctl_dev_b = get_status_port_fn(pbpctl_dev);
+		if (!pbpctl_dev_b)
+			return BP_NOT_CAP;
 		send_bypass_clear_pulse(pbpctl_dev_b, 1);
 		return 0;
 	} else
@@ -3329,7 +3332,8 @@
 
 		bpctl_dev_t *pbpctl_dev_b = NULL;
 
-		if (!(pbpctl_dev_b = get_status_port_fn(pbpctl_dev)))
+		pbpctl_dev_b = get_status_port_fn(pbpctl_dev);
+		if (!pbpctl_dev_b)
 			return BP_NOT_CAP;
 
 		if (INTEL_IF_SERIES(pbpctl_dev->subdevice)) {
@@ -3391,7 +3395,7 @@
 					 BP10G_SDP7_DATA_IN) != 0 ? 0 : 1);
 			}
 
-		} else if (pbpctl_dev->media_type == bp_copper) {
+		} else if (pbpctl_dev->media_type == BP_COPPER) {
 
 			return (((BPCTL_READ_REG(pbpctl_dev_b, CTRL)) &
 				 BPCTLI_CTRL_SWDPIN1) != 0 ? 1 : 0);
@@ -3617,7 +3621,8 @@
 	if (pbpctl_dev->bp_caps & TAP_CAP) {
 		bpctl_dev_t *pbpctl_dev_b = NULL;
 
-		if (!(pbpctl_dev_b = get_status_port_fn(pbpctl_dev)))
+		pbpctl_dev_b = get_status_port_fn(pbpctl_dev);
+		if (!pbpctl_dev_b)
 			return BP_NOT_CAP;
 
 		if (pbpctl_dev->bp_ext_ver >= 0x8) {
@@ -3636,7 +3641,7 @@
 					 BP10G_SDP6_DATA_IN) != 0 ? 0 : 1);
 			}
 
-		} else if (pbpctl_dev->media_type == bp_copper)
+		} else if (pbpctl_dev->media_type == BP_COPPER)
 			return (((BPCTL_READ_REG(pbpctl_dev, CTRL)) &
 				 BPCTLI_CTRL_SWDPIN0) != 0 ? 1 : 0);
 		else {
@@ -3713,7 +3718,8 @@
 	u32 ctrl_ext = 0;
 
 	if (pbpctl_dev->bp_caps & DISC_CAP) {
-		if (!(pbpctl_dev_b = get_status_port_fn(pbpctl_dev)))
+		pbpctl_dev_b = get_status_port_fn(pbpctl_dev);
+		if (!pbpctl_dev_b)
 			return BP_NOT_CAP;
 		if (DISCF_IF_SERIES(pbpctl_dev->subdevice))
 			return ((((read_reg(pbpctl_dev, STATUS_DISC_REG_ADDR)) &
@@ -3730,7 +3736,7 @@
 				 BP10G_SDP2_DATA) != 0 ? 1 : 0);
 
 		}
-		if (pbpctl_dev->media_type == bp_copper) {
+		if (pbpctl_dev->media_type == BP_COPPER) {
 
 #if 0
 			return ((((read_reg(pbpctl_dev, STATUS_DISC_REG_ADDR)) &
@@ -3794,11 +3800,10 @@
 {
 	int ctrl = 0;
 	if (pbpctl_dev->bp_caps & DISC_CAP) {
-
-		if ((ctrl = disc_off_status(pbpctl_dev)) < 0)
+		ctrl = disc_off_status(pbpctl_dev);
+		if (ctrl < 0)
 			return ctrl;
 		return ((ctrl == 0) ? 1 : 0);
-
 	}
 	return BP_NOT_CAP;
 }
@@ -3911,7 +3916,8 @@
 {
 	bpctl_dev_t *pbpctl_dev_b = NULL;
 
-	if (!(pbpctl_dev_b = get_status_port_fn(pbpctl_dev)))
+	pbpctl_dev_b = get_status_port_fn(pbpctl_dev);
+	if (!pbpctl_dev_b)
 		return BP_NOT_CAP;
 
 	if (TPL_IF_SERIES(pbpctl_dev->subdevice))
@@ -4021,42 +4027,41 @@
 	}
 #endif
 	if ((pbpctl_dev->bp_fiber5) || (pbpctl_dev->bp_10g9)) {
-		pbpctl_dev->media_type = bp_fiber;
+		pbpctl_dev->media_type = BP_FIBER;
 	} else if (pbpctl_dev->bp_10gb) {
 		if (BP10GB_CX4_SERIES(pbpctl_dev->subdevice))
-			pbpctl_dev->media_type = bp_cx4;
+			pbpctl_dev->media_type = BP_CX4;
 		else
-			pbpctl_dev->media_type = bp_fiber;
+			pbpctl_dev->media_type = BP_FIBER;
 
 	}
 
 	else if (pbpctl_dev->bp_540)
-		pbpctl_dev->media_type = bp_none;
+		pbpctl_dev->media_type = BP_NONE;
 	else if (!pbpctl_dev->bp_10g) {
 
 		ctrl_ext = BPCTL_READ_REG(pbpctl_dev, CTRL_EXT);
 		if ((ctrl_ext & BPCTLI_CTRL_EXT_LINK_MODE_MASK) == 0x0)
-			pbpctl_dev->media_type = bp_copper;
+			pbpctl_dev->media_type = BP_COPPER;
 		else
-			pbpctl_dev->media_type = bp_fiber;
+			pbpctl_dev->media_type = BP_FIBER;
 
 	} else {
 		if (BP10G_CX4_SERIES(pbpctl_dev->subdevice))
-			pbpctl_dev->media_type = bp_cx4;
+			pbpctl_dev->media_type = BP_CX4;
 		else
-			pbpctl_dev->media_type = bp_fiber;
+			pbpctl_dev->media_type = BP_FIBER;
 	}
 
 	if (is_bypass_fn(pbpctl_dev)) {
 
 		pbpctl_dev->bp_caps |= BP_PWOFF_ON_CAP;
-		if (pbpctl_dev->media_type == bp_fiber)
+		if (pbpctl_dev->media_type == BP_FIBER)
 			pbpctl_dev->bp_caps |=
 			    (TX_CTL_CAP | TX_STATUS_CAP | TPL_CAP);
 
-		if (TPL_IF_SERIES(pbpctl_dev->subdevice)) {
+		if (TPL_IF_SERIES(pbpctl_dev->subdevice))
 			pbpctl_dev->bp_caps |= TPL_CAP;
-		}
 
 		if (INTEL_IF_SERIES(pbpctl_dev->subdevice)) {
 			pbpctl_dev->bp_caps |=
@@ -4196,9 +4201,9 @@
 	if (PEG5_IF_SERIES(pbpctl_dev->subdevice))
 		pbpctl_dev->bp_caps |= (TX_CTL_CAP | TX_STATUS_CAP);
 
-	if (BP10GB_IF_SERIES(pbpctl_dev->subdevice)) {
+	if (BP10GB_IF_SERIES(pbpctl_dev->subdevice))
 		pbpctl_dev->bp_caps &= ~(TX_CTL_CAP | TX_STATUS_CAP);
-	}
+
 	pbpctl_dev_m = get_master_port_fn(pbpctl_dev);
 	if (pbpctl_dev_m != NULL) {
 		int cap_reg = 0;
@@ -4215,9 +4220,8 @@
 
 int bypass_off_init(bpctl_dev_t *pbpctl_dev)
 {
-	int ret = 0;
-
-	if ((ret = cmnd_on(pbpctl_dev)) < 0)
+	int ret = cmnd_on(pbpctl_dev);
+	if (ret < 0)
 		return ret;
 	if (INTEL_IF_SERIES(pbpctl_dev->subdevice))
 		return dis_bypass_cap(pbpctl_dev);
@@ -4327,14 +4331,13 @@
 
 int get_bypass_wd_auto(bpctl_dev_t *pbpctl_dev)
 {
-
-	if (pbpctl_dev->bp_caps & WD_CTL_CAP) {
+	if (pbpctl_dev->bp_caps & WD_CTL_CAP)
 		return pbpctl_dev->reset_time;
-	}
+
 	return BP_NOT_CAP;
 }
 
-#ifdef  BP_SELF_TEST
+#ifdef BP_SELF_TEST
 
 int set_bp_self_test(bpctl_dev_t *pbpctl_dev, unsigned int param)
 {
@@ -4403,7 +4406,8 @@
 
 	if (!(pbpctl_dev->bp_caps & BP_CAP))
 		return BP_NOT_CAP;
-	if ((ret = cmnd_on(pbpctl_dev)) < 0)
+	ret = cmnd_on(pbpctl_dev);
+	if (ret < 0)
 		return ret;
 	if (!bypass_mode)
 		ret = bypass_off(pbpctl_dev);
@@ -4435,7 +4439,8 @@
 
 	if (!(pbpctl_dev->bp_caps & BP_DIS_CAP))
 		return BP_NOT_CAP;
-	if ((ret = cmnd_on(pbpctl_dev)) < 0)
+	ret = cmnd_on(pbpctl_dev);
+	if (ret < 0)
 		return ret;
 	if (dis_param)
 		ret = dis_bypass_cap(pbpctl_dev);
@@ -4461,7 +4466,8 @@
 
 	if (!(pbpctl_dev->bp_caps & BP_PWOFF_CTL_CAP))
 		return BP_NOT_CAP;
-	if ((ret = cmnd_on(pbpctl_dev)) < 0)
+	ret = cmnd_on(pbpctl_dev);
+	if (ret < 0)
 		return ret;
 	if (bypass_mode)
 		ret = bypass_state_pwroff(pbpctl_dev);
@@ -4487,7 +4493,8 @@
 
 	if (!(pbpctl_dev->bp_caps & BP_PWUP_CTL_CAP))
 		return BP_NOT_CAP;
-	if ((ret = cmnd_on(pbpctl_dev)) < 0)
+	ret = cmnd_on(pbpctl_dev);
+	if (ret < 0)
 		return ret;
 	if (bypass_mode)
 		ret = bypass_state_pwron(pbpctl_dev);
@@ -4514,7 +4521,8 @@
 	if (!(pbpctl_dev->bp_caps & WD_CTL_CAP))
 		return BP_NOT_CAP;
 
-	if ((ret = cmnd_on(pbpctl_dev)) < 0)
+	ret = cmnd_on(pbpctl_dev);
+	if (ret < 0)
 		return ret;
 	if (!timeout)
 		ret = wdt_off(pbpctl_dev);
@@ -4583,7 +4591,8 @@
 	if (!(pbpctl_dev->bp_caps & STD_NIC_CAP))
 		return BP_NOT_CAP;
 
-	if ((ret = cmnd_on(pbpctl_dev)) < 0)
+	ret = cmnd_on(pbpctl_dev);
+	if (ret < 0)
 		return ret;
 	if (nic_mode)
 		ret = std_nic_on(pbpctl_dev);
@@ -4649,7 +4658,8 @@
 	if (!pbpctl_dev)
 		return -1;
 
-	if ((ret = default_pwron_tap_status(pbpctl_dev)) < 0)
+	ret = default_pwron_tap_status(pbpctl_dev);
+	if (ret < 0)
 		return ret;
 	return ((ret == 0) ? 1 : 0);
 }
@@ -4824,7 +4834,8 @@
 	if (!pbpctl_dev)
 		return -1;
 
-	if ((ret = default_pwron_disc_port_status(pbpctl_dev)) < 0)
+	ret = default_pwron_disc_port_status(pbpctl_dev);
+	if (ret < 0)
 		return ret;
 	return ((ret == 0) ? 1 : 0);
 }
@@ -4851,7 +4862,8 @@
 	if (!pbpctl_dev)
 		return -1;
 
-	if ((ret = cmnd_on(pbpctl_dev)) < 0)
+	ret = cmnd_on(pbpctl_dev);
+	if (ret < 0)
 		return ret;
 	return reset_cont(pbpctl_dev);
 }
@@ -4867,8 +4879,10 @@
 	    (pbpctl_dev->bp_caps & SW_CTL_CAP)) {
 		if ((pbpctl_dev->bp_tpl_flag))
 			return BP_NOT_CAP;
-	} else if ((pbpctl_dev_b = get_master_port_fn(pbpctl_dev))) {
-		if ((pbpctl_dev_b->bp_caps & TPL_CAP) &&
+	} else {
+		pbpctl_dev_b = get_master_port_fn(pbpctl_dev);
+		if (pbpctl_dev_b &&
+		    (pbpctl_dev_b->bp_caps & TPL_CAP) &&
 		    (pbpctl_dev_b->bp_tpl_flag))
 			return BP_NOT_CAP;
 	}
@@ -4984,8 +4998,10 @@
 	    (pbpctl_dev->bp_caps & SW_CTL_CAP)) {
 		if ((pbpctl_dev->bp_tpl_flag))
 			return BP_NOT_CAP;
-	} else if ((pbpctl_dev_b = get_master_port_fn(pbpctl_dev))) {
-		if ((pbpctl_dev_b->bp_caps & TPL_CAP) &&
+	} else {
+		pbpctl_dev_b = get_master_port_fn(pbpctl_dev);
+		if (pbpctl_dev_b &&
+		    (pbpctl_dev_b->bp_caps & TPL_CAP) &&
 		    (pbpctl_dev_b->bp_tpl_flag))
 			return BP_NOT_CAP;
 	}
@@ -5009,7 +5025,7 @@
 	if (!pbpctl_dev)
 		return -1;
 
-	if (pbpctl_dev->media_type == bp_fiber)
+	if (pbpctl_dev->media_type == BP_FIBER)
 		return ((BPCTL_READ_REG(pbpctl_dev, CTRL) &
 			 BPCTLI_CTRL_SWDPIN1));
 	else
@@ -5024,7 +5040,8 @@
 	uint32_t link1, link2;
 	bpctl_dev_t *pbpctl_dev_b = NULL;
 
-	if (!(pbpctl_dev_b = get_status_port_fn(pbpctl_dev)))
+	pbpctl_dev_b = get_status_port_fn(pbpctl_dev);
+	if (!pbpctl_dev_b)
 		return;
 
 	if (!pbpctl_dev->bp_tpl_flag) {
@@ -5036,23 +5053,19 @@
 
 	link2 = get_bypass_link_status(pbpctl_dev_b);
 	if ((link1) && (tx_status(pbpctl_dev))) {
-		if ((!link2) && (tx_status(pbpctl_dev_b))) {
+		if ((!link2) && (tx_status(pbpctl_dev_b)))
 			set_tx(pbpctl_dev, 0);
-		} else if (!tx_status(pbpctl_dev_b)) {
+		else if (!tx_status(pbpctl_dev_b))
 			set_tx(pbpctl_dev_b, 1);
-		}
 	} else if ((!link1) && (tx_status(pbpctl_dev))) {
-		if ((link2) && (tx_status(pbpctl_dev_b))) {
+		if ((link2) && (tx_status(pbpctl_dev_b)))
 			set_tx(pbpctl_dev_b, 0);
-		}
 	} else if ((link1) && (!tx_status(pbpctl_dev))) {
-		if ((link2) && (tx_status(pbpctl_dev_b))) {
+		if ((link2) && (tx_status(pbpctl_dev_b)))
 			set_tx(pbpctl_dev, 1);
-		}
 	} else if ((!link1) && (!tx_status(pbpctl_dev))) {
-		if ((link2) && (tx_status(pbpctl_dev_b))) {
+		if ((link2) && (tx_status(pbpctl_dev_b)))
 			set_tx(pbpctl_dev, 1);
-		}
 	}
 
 	mod_timer(&pbpctl_dev->bp_tpl_timer, jiffies + BP_LINK_MON_DELAY * HZ);
@@ -5111,9 +5124,9 @@
 {
 	if (!pbpctl_dev)
 		return -1;
-	if (pbpctl_dev->bp_caps & TPL_CAP) {
+	if (pbpctl_dev->bp_caps & TPL_CAP)
 		return pbpctl_dev->bp_tpl_flag;
-	}
+
 	return BP_NOT_CAP;
 }
 
@@ -5128,7 +5141,8 @@
 
 	if (pbpctl_dev->bp_caps & TPL_CAP) {
 		if (tpl_mode) {
-			if ((pbpctl_dev_b = get_status_port_fn(pbpctl_dev)))
+			pbpctl_dev_b = get_status_port_fn(pbpctl_dev);
+			if (pbpctl_dev_b)
 				set_tx(pbpctl_dev_b, 1);
 			set_tx(pbpctl_dev, 1);
 		}
@@ -5345,7 +5359,8 @@
 		memcpy(&cbuf, drvinfo.bus_info, 32);
 		buf = &cbuf[0];
 
-		while (*buf++ != ':') ;
+		while (*buf++ != ':')
+			;
 		for (i = 0; i < 10; i++, buf++) {
 			if (*buf == ':')
 				break;
@@ -5394,7 +5409,8 @@
 	static bpctl_dev_t *pbpctl_dev;
 
 	/* lock_kernel(); */
-	lock_bpctl();
+	if (down_interruptible(&bpctl_sema))
+		return -ERESTARTSYS;
 	/* local_irq_save(flags); */
 	/* if(!spin_trylock_irqsave(&bpvm_lock)){
 	   local_irq_restore(flags);
@@ -5438,9 +5454,9 @@
 		return -1;
 	}
 
-/*    	preempt_disable();
+/*	preempt_disable();
 	rcu_read_lock();
-      	spin_lock_irqsave(&bpvm_lock, flags);
+	spin_lock_irqsave(&bpvm_lock, flags);
 */
 	if ((bpctl_cmd.in_param[5]) ||
 	    (bpctl_cmd.in_param[6]) || (bpctl_cmd.in_param[7]))
@@ -5787,7 +5803,7 @@
 };
 
 #ifndef PCI_DEVICE
-#define PCI_DEVICE(vend,dev) \
+#define PCI_DEVICE(vend, dev) \
 	.vendor = (vend), .device = (dev), \
 	.subvendor = PCI_ANY_ID, .subdevice = PCI_ANY_ID
 #endif
@@ -5795,7 +5811,7 @@
 #define SILICOM_E1000BP_ETHERNET_DEVICE(device_id) {\
 	PCI_DEVICE(SILICOM_VID, device_id)}
 
-typedef enum {
+enum board_type {
 	PXG2BPFI,
 	PXG2BPFIL,
 	PXG2BPFILX,
@@ -5953,7 +5969,7 @@
 	PE310G4BPi9SR,
 	PE310G4BPi9LR,
 	PE210G2BPi40,
-} board_t;
+};
 
 typedef struct _bpmod_info_t {
 	unsigned int vendor;
@@ -6629,7 +6645,7 @@
 			    ioremap(mmio_start, mmio_len);
 
 			dev->bp_fw_ver = bypass_fw_ver(dev);
-			if (dev-> bp_fw_ver == 0xa8)
+			if (dev->bp_fw_ver == 0xa8)
 				break;
 		}
 	}
@@ -6708,7 +6724,8 @@
 			reset_cont(dev);
 	}
 #ifdef BP_SELF_TEST
-	if ((dev->bp_tx_data = kzalloc(BPTEST_DATA_LEN, GFP_KERNEL))) {
+	dev->bp_tx_data = kzalloc(BPTEST_DATA_LEN, GFP_KERNEL);
+	if (dev->bp_tx_data) {
 		memset(dev->bp_tx_data, 0xff, 6);
 		memset(dev->bp_tx_data + 6, 0x0, 1);
 		memset(dev->bp_tx_data + 7, 0xaa, 5);
@@ -6878,59 +6895,69 @@
 {
 	return is_bypass(get_dev_idx_p(ifindex));
 }
+EXPORT_SYMBOL(is_bypass_sd);
 
 int set_bypass_sd(int ifindex, int bypass_mode)
 {
 
 	return set_bypass_fn(get_dev_idx_p(ifindex), bypass_mode);
 }
+EXPORT_SYMBOL(set_bypass_sd);
 
 int get_bypass_sd(int ifindex)
 {
 
 	return get_bypass_fn(get_dev_idx_p(ifindex));
 }
+EXPORT_SYMBOL(get_bypass_sd);
 
 int get_bypass_change_sd(int ifindex)
 {
 
 	return get_bypass_change_fn(get_dev_idx_p(ifindex));
 }
+EXPORT_SYMBOL(get_bypass_change_sd);
 
 int set_dis_bypass_sd(int ifindex, int dis_param)
 {
 	return set_dis_bypass_fn(get_dev_idx_p(ifindex), dis_param);
 }
+EXPORT_SYMBOL(set_dis_bypass_sd);
 
 int get_dis_bypass_sd(int ifindex)
 {
 
 	return get_dis_bypass_fn(get_dev_idx_p(ifindex));
 }
+EXPORT_SYMBOL(get_dis_bypass_sd);
 
 int set_bypass_pwoff_sd(int ifindex, int bypass_mode)
 {
 	return set_bypass_pwoff_fn(get_dev_idx_p(ifindex), bypass_mode);
 
 }
+EXPORT_SYMBOL(set_bypass_pwoff_sd);
 
 int get_bypass_pwoff_sd(int ifindex)
 {
 	return get_bypass_pwoff_fn(get_dev_idx_p(ifindex));
 
 }
+EXPORT_SYMBOL(get_bypass_pwoff_sd);
 
 int set_bypass_pwup_sd(int ifindex, int bypass_mode)
 {
 	return set_bypass_pwup_fn(get_dev_idx_p(ifindex), bypass_mode);
 
 }
+EXPORT_SYMBOL(set_bypass_pwup_sd);
 
 int get_bypass_pwup_sd(int ifindex)
 {
 	return get_bypass_pwup_fn(get_dev_idx_p(ifindex));
 
 }
+EXPORT_SYMBOL(get_bypass_pwup_sd);
 
 int set_bypass_wd_sd(int if_index, int ms_timeout, int *ms_timeout_set)
 {
@@ -6939,136 +6966,159 @@
 	*ms_timeout_set = set_bypass_wd_fn(get_dev_idx_p(if_index), ms_timeout);
 	return 0;
 }
+EXPORT_SYMBOL(set_bypass_wd_sd);
 
 int get_bypass_wd_sd(int ifindex, int *timeout)
 {
 	return get_bypass_wd_fn(get_dev_idx_p(ifindex), timeout);
 
 }
+EXPORT_SYMBOL(get_bypass_wd_sd);
 
 int get_wd_expire_time_sd(int ifindex, int *time_left)
 {
 	return get_wd_expire_time_fn(get_dev_idx_p(ifindex), time_left);
 }
+EXPORT_SYMBOL(get_wd_expire_time_sd);
 
 int reset_bypass_wd_timer_sd(int ifindex)
 {
 	return reset_bypass_wd_timer_fn(get_dev_idx_p(ifindex));
 
 }
+EXPORT_SYMBOL(reset_bypass_wd_timer_sd);
 
 int get_wd_set_caps_sd(int ifindex)
 {
 	return get_wd_set_caps_fn(get_dev_idx_p(ifindex));
 
 }
+EXPORT_SYMBOL(get_wd_set_caps_sd);
 
 int set_std_nic_sd(int ifindex, int nic_mode)
 {
 	return set_std_nic_fn(get_dev_idx_p(ifindex), nic_mode);
 
 }
+EXPORT_SYMBOL(set_std_nic_sd);
 
 int get_std_nic_sd(int ifindex)
 {
 	return get_std_nic_fn(get_dev_idx_p(ifindex));
 
 }
+EXPORT_SYMBOL(get_std_nic_sd);
 
 int set_tap_sd(int ifindex, int tap_mode)
 {
 	return set_tap_fn(get_dev_idx_p(ifindex), tap_mode);
 
 }
+EXPORT_SYMBOL(set_tap_sd);
 
 int get_tap_sd(int ifindex)
 {
 	return get_tap_fn(get_dev_idx_p(ifindex));
 
 }
+EXPORT_SYMBOL(get_tap_sd);
 
 int set_tap_pwup_sd(int ifindex, int tap_mode)
 {
 	return set_tap_pwup_fn(get_dev_idx_p(ifindex), tap_mode);
 
 }
+EXPORT_SYMBOL(set_tap_pwup_sd);
 
 int get_tap_pwup_sd(int ifindex)
 {
 	return get_tap_pwup_fn(get_dev_idx_p(ifindex));
 
 }
+EXPORT_SYMBOL(get_tap_pwup_sd);
 
 int get_tap_change_sd(int ifindex)
 {
 	return get_tap_change_fn(get_dev_idx_p(ifindex));
 
 }
+EXPORT_SYMBOL(get_tap_change_sd);
 
 int set_dis_tap_sd(int ifindex, int dis_param)
 {
 	return set_dis_tap_fn(get_dev_idx_p(ifindex), dis_param);
 
 }
+EXPORT_SYMBOL(set_dis_tap_sd);
 
 int get_dis_tap_sd(int ifindex)
 {
 	return get_dis_tap_fn(get_dev_idx_p(ifindex));
 
 }
+EXPORT_SYMBOL(get_dis_tap_sd);
 
 int set_bp_disc_sd(int ifindex, int disc_mode)
 {
 	return set_disc_fn(get_dev_idx_p(ifindex), disc_mode);
 
 }
+EXPORT_SYMBOL(set_bp_disc_sd);
 
 int get_bp_disc_sd(int ifindex)
 {
 	return get_disc_fn(get_dev_idx_p(ifindex));
 
 }
+EXPORT_SYMBOL(get_bp_disc_sd);
 
 int set_bp_disc_pwup_sd(int ifindex, int disc_mode)
 {
 	return set_disc_pwup_fn(get_dev_idx_p(ifindex), disc_mode);
 
 }
+EXPORT_SYMBOL(set_bp_disc_pwup_sd);
 
 int get_bp_disc_pwup_sd(int ifindex)
 {
 	return get_disc_pwup_fn(get_dev_idx_p(ifindex));
 
 }
+EXPORT_SYMBOL(get_bp_disc_pwup_sd);
 
 int get_bp_disc_change_sd(int ifindex)
 {
 	return get_disc_change_fn(get_dev_idx_p(ifindex));
 
 }
+EXPORT_SYMBOL(get_bp_disc_change_sd);
 
 int set_bp_dis_disc_sd(int ifindex, int dis_param)
 {
 	return set_dis_disc_fn(get_dev_idx_p(ifindex), dis_param);
 
 }
+EXPORT_SYMBOL(set_bp_dis_disc_sd);
 
 int get_bp_dis_disc_sd(int ifindex)
 {
 	return get_dis_disc_fn(get_dev_idx_p(ifindex));
 
 }
+EXPORT_SYMBOL(get_bp_dis_disc_sd);
 
 int get_wd_exp_mode_sd(int ifindex)
 {
 	return get_wd_exp_mode_fn(get_dev_idx_p(ifindex));
 }
+EXPORT_SYMBOL(get_wd_exp_mode_sd);
 
 int set_wd_exp_mode_sd(int ifindex, int param)
 {
 	return set_wd_exp_mode_fn(get_dev_idx_p(ifindex), param);
 
 }
+EXPORT_SYMBOL(set_wd_exp_mode_sd);
 
 int reset_cont_sd(int ifindex)
 {
@@ -7081,35 +7131,41 @@
 	return set_tx_fn(get_dev_idx_p(ifindex), tx_state);
 
 }
+EXPORT_SYMBOL(set_tx_sd);
 
 int set_tpl_sd(int ifindex, int tpl_state)
 {
 	return set_tpl_fn(get_dev_idx_p(ifindex), tpl_state);
 
 }
+EXPORT_SYMBOL(set_tpl_sd);
 
 int set_bp_hw_reset_sd(int ifindex, int status)
 {
 	return set_bp_hw_reset_fn(get_dev_idx_p(ifindex), status);
 
 }
+EXPORT_SYMBOL(set_bp_hw_reset_sd);
 
 int set_wd_autoreset_sd(int ifindex, int param)
 {
 	return set_wd_autoreset_fn(get_dev_idx_p(ifindex), param);
 
 }
+EXPORT_SYMBOL(set_wd_autoreset_sd);
 
 int get_wd_autoreset_sd(int ifindex)
 {
 	return get_wd_autoreset_fn(get_dev_idx_p(ifindex));
 
 }
+EXPORT_SYMBOL(get_wd_autoreset_sd);
 
 int get_bypass_caps_sd(int ifindex)
 {
 	return get_bypass_caps_fn(get_dev_idx_p(ifindex));
 }
+EXPORT_SYMBOL(get_bypass_caps_sd);
 
 int get_bypass_slave_sd(int ifindex)
 {
@@ -7120,81 +7176,41 @@
 	return -1;
 
 }
+EXPORT_SYMBOL(get_bypass_slave_sd);
 
 int get_tx_sd(int ifindex)
 {
 	return get_tx_fn(get_dev_idx_p(ifindex));
 
 }
+EXPORT_SYMBOL(get_tx_sd);
 
 int get_tpl_sd(int ifindex)
 {
 	return get_tpl_fn(get_dev_idx_p(ifindex));
 
 }
+EXPORT_SYMBOL(get_tpl_sd);
 
 int get_bp_hw_reset_sd(int ifindex)
 {
 	return get_bp_hw_reset_fn(get_dev_idx_p(ifindex));
 
 }
+EXPORT_SYMBOL(get_bp_hw_reset_sd);
 
 int get_bypass_info_sd(int ifindex, struct bp_info *bp_info)
 {
 	return get_bypass_info_fn(get_dev_idx_p(ifindex), bp_info->prod_name, &bp_info->fw_ver);
 }
+EXPORT_SYMBOL(get_bypass_info_sd);
 
 int bp_if_scan_sd(void)
 {
 	if_scan_init();
 	return 0;
 }
-
-EXPORT_SYMBOL_NOVERS(is_bypass_sd);
-EXPORT_SYMBOL_NOVERS(get_bypass_slave_sd);
-EXPORT_SYMBOL_NOVERS(get_bypass_caps_sd);
-EXPORT_SYMBOL_NOVERS(get_wd_set_caps_sd);
-EXPORT_SYMBOL_NOVERS(set_bypass_sd);
-EXPORT_SYMBOL_NOVERS(get_bypass_sd);
-EXPORT_SYMBOL_NOVERS(get_bypass_change_sd);
-EXPORT_SYMBOL_NOVERS(set_dis_bypass_sd);
-EXPORT_SYMBOL_NOVERS(get_dis_bypass_sd);
-EXPORT_SYMBOL_NOVERS(set_bypass_pwoff_sd);
-EXPORT_SYMBOL_NOVERS(get_bypass_pwoff_sd);
-EXPORT_SYMBOL_NOVERS(set_bypass_pwup_sd);
-EXPORT_SYMBOL_NOVERS(get_bypass_pwup_sd);
-EXPORT_SYMBOL_NOVERS(set_bypass_wd_sd);
-EXPORT_SYMBOL_NOVERS(get_bypass_wd_sd);
-EXPORT_SYMBOL_NOVERS(get_wd_expire_time_sd);
-EXPORT_SYMBOL_NOVERS(reset_bypass_wd_timer_sd);
-EXPORT_SYMBOL_NOVERS(set_std_nic_sd);
-EXPORT_SYMBOL_NOVERS(get_std_nic_sd);
-EXPORT_SYMBOL_NOVERS(set_tx_sd);
-EXPORT_SYMBOL_NOVERS(get_tx_sd);
-EXPORT_SYMBOL_NOVERS(set_tpl_sd);
-EXPORT_SYMBOL_NOVERS(get_tpl_sd);
-EXPORT_SYMBOL_NOVERS(set_bp_hw_reset_sd);
-EXPORT_SYMBOL_NOVERS(get_bp_hw_reset_sd);
-EXPORT_SYMBOL_NOVERS(set_tap_sd);
-EXPORT_SYMBOL_NOVERS(get_tap_sd);
-EXPORT_SYMBOL_NOVERS(get_tap_change_sd);
-EXPORT_SYMBOL_NOVERS(set_dis_tap_sd);
-EXPORT_SYMBOL_NOVERS(get_dis_tap_sd);
-EXPORT_SYMBOL_NOVERS(set_tap_pwup_sd);
-EXPORT_SYMBOL_NOVERS(get_tap_pwup_sd);
-EXPORT_SYMBOL_NOVERS(set_wd_exp_mode_sd);
-EXPORT_SYMBOL_NOVERS(get_wd_exp_mode_sd);
-EXPORT_SYMBOL_NOVERS(set_wd_autoreset_sd);
-EXPORT_SYMBOL_NOVERS(get_wd_autoreset_sd);
-EXPORT_SYMBOL_NOVERS(set_bp_disc_sd);
-EXPORT_SYMBOL_NOVERS(get_bp_disc_sd);
-EXPORT_SYMBOL_NOVERS(get_bp_disc_change_sd);
-EXPORT_SYMBOL_NOVERS(set_bp_dis_disc_sd);
-EXPORT_SYMBOL_NOVERS(get_bp_dis_disc_sd);
-EXPORT_SYMBOL_NOVERS(set_bp_disc_pwup_sd);
-EXPORT_SYMBOL_NOVERS(get_bp_disc_pwup_sd);
-EXPORT_SYMBOL_NOVERS(get_bypass_info_sd);
-EXPORT_SYMBOL_NOVERS(bp_if_scan_sd);
+EXPORT_SYMBOL(bp_if_scan_sd);
 
 #define BP_PROC_DIR "bypass"
 
@@ -7263,7 +7279,7 @@
 	if (!slave)
 		slave = dev;
 	if (!slave)
-		seq_printf(m, "fail\n");
+		seq_puts(m, "fail\n");
 	else if (slave->ndev)
 		seq_printf(m, "%s\n", slave->ndev->name);
 	return 0;
@@ -7275,7 +7291,7 @@
 	bpctl_dev_t *dev = m->private;
 	int ret = get_bypass_caps_fn(dev);
 	if (ret == BP_NOT_CAP)
-		seq_printf(m, "-1\n");
+		seq_puts(m, "-1\n");
 	else
 		seq_printf(m, "0x%x\n", ret);
 	return 0;
@@ -7287,7 +7303,7 @@
 	bpctl_dev_t *dev = m->private;
 	int ret = get_wd_set_caps_fn(dev);
 	if (ret == BP_NOT_CAP)
-		seq_printf(m, "-1\n");
+		seq_puts(m, "-1\n");
 	else
 		seq_printf(m, "0x%x\n", ret);
 	return 0;
@@ -7333,11 +7349,11 @@
 	bpctl_dev_t *dev = m->private;
 	int ret = get_bypass_fn(dev);
 	if (ret == BP_NOT_CAP)
-		seq_printf(m, "fail\n");
+		seq_puts(m, "fail\n");
 	else if (ret == 1)
-		seq_printf(m, "on\n");
+		seq_puts(m, "on\n");
 	else if (ret == 0)
-		seq_printf(m, "off\n");
+		seq_puts(m, "off\n");
 	return 0;
 }
 RW_FOPS(bypass)
@@ -7357,11 +7373,11 @@
 	bpctl_dev_t *dev = m->private;
 	int ret = get_tap_fn(dev);
 	if (ret == BP_NOT_CAP)
-		seq_printf(m, "fail\n");
+		seq_puts(m, "fail\n");
 	else if (ret == 1)
-		seq_printf(m, "on\n");
+		seq_puts(m, "on\n");
 	else if (ret == 0)
-		seq_printf(m, "off\n");
+		seq_puts(m, "off\n");
 	return 0;
 }
 RW_FOPS(tap)
@@ -7381,11 +7397,11 @@
 	bpctl_dev_t *dev = m->private;
 	int ret = get_disc_fn(dev);
 	if (ret == BP_NOT_CAP)
-		seq_printf(m, "fail\n");
+		seq_puts(m, "fail\n");
 	else if (ret == 1)
-		seq_printf(m, "on\n");
+		seq_puts(m, "on\n");
 	else if (ret == 0)
-		seq_printf(m, "off\n");
+		seq_puts(m, "off\n");
 	return 0;
 }
 RW_FOPS(disc)
@@ -7395,11 +7411,11 @@
 	bpctl_dev_t *dev = m->private;
 	int ret = get_bypass_change_fn(dev);
 	if (ret == 1)
-		seq_printf(m, "on\n");
+		seq_puts(m, "on\n");
 	else if (ret == 0)
-		seq_printf(m, "off\n");
+		seq_puts(m, "off\n");
 	else
-		seq_printf(m, "fail\n");
+		seq_puts(m, "fail\n");
 	return 0;
 }
 RO_FOPS(bypass_change)
@@ -7409,11 +7425,11 @@
 	bpctl_dev_t *dev = m->private;
 	int ret = get_tap_change_fn(dev);
 	if (ret == 1)
-		seq_printf(m, "on\n");
+		seq_puts(m, "on\n");
 	else if (ret == 0)
-		seq_printf(m, "off\n");
+		seq_puts(m, "off\n");
 	else
-		seq_printf(m, "fail\n");
+		seq_puts(m, "fail\n");
 	return 0;
 }
 RO_FOPS(tap_change)
@@ -7423,11 +7439,11 @@
 	bpctl_dev_t *dev = m->private;
 	int ret = get_disc_change_fn(dev);
 	if (ret == 1)
-		seq_printf(m, "on\n");
+		seq_puts(m, "on\n");
 	else if (ret == 0)
-		seq_printf(m, "off\n");
+		seq_puts(m, "off\n");
 	else
-		seq_printf(m, "fail\n");
+		seq_puts(m, "fail\n");
 	return 0;
 }
 RO_FOPS(disc_change)
@@ -7450,11 +7466,11 @@
 
 	ret = get_bypass_wd_fn(dev, &timeout);
 	if (ret == BP_NOT_CAP)
-		seq_printf(m,  "fail\n");
+		seq_puts(m,  "fail\n");
 	else if (timeout == -1)
-		seq_printf(m,  "unknown\n");
+		seq_puts(m,  "unknown\n");
 	else if (timeout == 0)
-		seq_printf(m,  "disable\n");
+		seq_puts(m,  "disable\n");
 	else
 		seq_printf(m, "%d\n", timeout);
 	return 0;
@@ -7467,11 +7483,11 @@
 	int ret = 0, timeout = 0;
 	ret = get_wd_expire_time_fn(dev, &timeout);
 	if (ret == BP_NOT_CAP)
-		seq_printf(m, "fail\n");
+		seq_puts(m, "fail\n");
 	else if (timeout == -1)
-		seq_printf(m, "expire\n");
+		seq_puts(m, "expire\n");
 	else if (timeout == 0)
-		seq_printf(m, "disable\n");
+		seq_puts(m, "disable\n");
 	else
 		seq_printf(m, "%d\n", timeout);
 	return 0;
@@ -7494,11 +7510,11 @@
 	bpctl_dev_t *dev = m->private;
 	int ret = get_tpl_fn(dev);
 	if (ret == BP_NOT_CAP)
-		seq_printf(m, "fail\n");
+		seq_puts(m, "fail\n");
 	else if (ret == 1)
-		seq_printf(m, "on\n");
+		seq_puts(m, "on\n");
 	else if (ret == 0)
-		seq_printf(m, "off\n");
+		seq_puts(m, "off\n");
 	return 0;
 }
 RW_FOPS(tpl)
@@ -7520,11 +7536,11 @@
 	bpctl_dev_t *dev = m->private;
 	int ret = get_bp_wait_at_pwup_fn(dev);
 	if (ret == BP_NOT_CAP)
-		seq_printf(m, "fail\n");
+		seq_puts(m, "fail\n");
 	else if (ret == 1)
-		seq_printf(m, "on\n");
+		seq_puts(m, "on\n");
 	else if (ret == 0)
-		seq_printf(m, "off\n");
+		seq_puts(m, "off\n");
 	return 0;
 }
 RW_FOPS(wait_at_pwup)
@@ -7545,11 +7561,11 @@
 	bpctl_dev_t *dev = m->private;
 	int ret = get_bp_hw_reset_fn(dev);
 	if (ret == BP_NOT_CAP)
-		seq_printf(m, "fail\n");
+		seq_puts(m, "fail\n");
 	else if (ret == 1)
-		seq_printf(m, "on\n");
+		seq_puts(m, "on\n");
 	else if (ret == 0)
-		seq_printf(m, "off\n");
+		seq_puts(m, "off\n");
 	return 0;
 }
 RW_FOPS(hw_reset)
@@ -7561,11 +7577,11 @@
 	bpctl_dev_t *dev = m->private;
 	int ret = reset_bypass_wd_timer_fn(dev);
 	if (ret == BP_NOT_CAP)
-		seq_printf(m, "fail\n");
+		seq_puts(m, "fail\n");
 	else if (ret == 0)
-		seq_printf(m, "disable\n");
+		seq_puts(m, "disable\n");
 	else if (ret == 1)
-		seq_printf(m, "success\n");
+		seq_puts(m, "success\n");
 	return 0;
 }
 RO_FOPS(reset_bypass_wd)
@@ -7585,11 +7601,11 @@
 	bpctl_dev_t *dev = m->private;
 	int ret = get_dis_bypass_fn(dev);
 	if (ret == BP_NOT_CAP)
-		seq_printf(m, "fail\n");
+		seq_puts(m, "fail\n");
 	else if (ret == 0)
-		seq_printf(m, "off\n");
+		seq_puts(m, "off\n");
 	else
-		seq_printf(m, "on\n");
+		seq_puts(m, "on\n");
 	return 0;
 }
 RW_FOPS(dis_bypass)
@@ -7609,11 +7625,11 @@
 	bpctl_dev_t *dev = m->private;
 	int ret = get_dis_tap_fn(dev);
 	if (ret == BP_NOT_CAP)
-		seq_printf(m, "fail\n");
+		seq_puts(m, "fail\n");
 	else if (ret == 0)
-		seq_printf(m, "off\n");
+		seq_puts(m, "off\n");
 	else
-		seq_printf(m, "on\n");
+		seq_puts(m, "on\n");
 	return 0;
 }
 RW_FOPS(dis_tap)
@@ -7633,11 +7649,11 @@
 	bpctl_dev_t *dev = m->private;
 	int ret = get_dis_disc_fn(dev);
 	if (ret == BP_NOT_CAP)
-		seq_printf(m, "fail\n");
+		seq_puts(m, "fail\n");
 	else if (ret == 0)
-		seq_printf(m, "off\n");
+		seq_puts(m, "off\n");
 	else
-		seq_printf(m, "on\n");
+		seq_puts(m, "on\n");
 	return 0;
 }
 RW_FOPS(dis_disc)
@@ -7657,11 +7673,11 @@
 	bpctl_dev_t *dev = m->private;
 	int ret = get_bypass_pwup_fn(dev);
 	if (ret == BP_NOT_CAP)
-		seq_printf(m, "fail\n");
+		seq_puts(m, "fail\n");
 	else if (ret == 0)
-		seq_printf(m, "off\n");
+		seq_puts(m, "off\n");
 	else
-		seq_printf(m, "on\n");
+		seq_puts(m, "on\n");
 	return 0;
 }
 RW_FOPS(bypass_pwup)
@@ -7681,11 +7697,11 @@
 	bpctl_dev_t *dev = m->private;
 	int ret = get_bypass_pwoff_fn(dev);
 	if (ret == BP_NOT_CAP)
-		seq_printf(m, "fail\n");
+		seq_puts(m, "fail\n");
 	else if (ret == 0)
-		seq_printf(m, "off\n");
+		seq_puts(m, "off\n");
 	else
-		seq_printf(m, "on\n");
+		seq_puts(m, "on\n");
 	return 0;
 }
 RW_FOPS(bypass_pwoff)
@@ -7705,11 +7721,11 @@
 	bpctl_dev_t *dev = m->private;
 	int ret = get_tap_pwup_fn(dev);
 	if (ret == BP_NOT_CAP)
-		seq_printf(m, "fail\n");
+		seq_puts(m, "fail\n");
 	else if (ret == 0)
-		seq_printf(m, "off\n");
+		seq_puts(m, "off\n");
 	else
-		seq_printf(m, "on\n");
+		seq_puts(m, "on\n");
 	return 0;
 }
 RW_FOPS(tap_pwup)
@@ -7729,11 +7745,11 @@
 	bpctl_dev_t *dev = m->private;
 	int ret = get_disc_pwup_fn(dev);
 	if (ret == BP_NOT_CAP)
-		seq_printf(m, "fail\n");
+		seq_puts(m, "fail\n");
 	else if (ret == 0)
-		seq_printf(m, "off\n");
+		seq_puts(m, "off\n");
 	else
-		seq_printf(m, "on\n");
+		seq_puts(m, "on\n");
 	return 0;
 }
 RW_FOPS(disc_pwup)
@@ -7753,11 +7769,11 @@
 	bpctl_dev_t *dev = m->private;
 	int ret = get_std_nic_fn(dev);
 	if (ret == BP_NOT_CAP)
-		seq_printf(m, "fail\n");
+		seq_puts(m, "fail\n");
 	else if (ret == 0)
-		seq_printf(m, "off\n");
+		seq_puts(m, "off\n");
 	else
-		seq_printf(m, "on\n");
+		seq_puts(m, "on\n");
 	return 0;
 }
 RW_FOPS(std_nic)
@@ -7795,13 +7811,13 @@
 	bpctl_dev_t *dev = m->private;
 	int ret = get_wd_exp_mode_fn(dev);
 	if (ret == 1)
-		seq_printf(m, "tap\n");
+		seq_puts(m, "tap\n");
 	else if (ret == 0)
-		seq_printf(m, "bypass\n");
+		seq_puts(m, "bypass\n");
 	else if (ret == 2)
-		seq_printf(m, "disc\n");
+		seq_puts(m, "disc\n");
 	else
-		seq_printf(m, "fail\n");
+		seq_puts(m, "fail\n");
 	return 0;
 }
 RW_FOPS(wd_exp_mode)
@@ -7823,7 +7839,7 @@
 	if (ret >= 0)
 		seq_printf(m, "%d\n", ret);
 	else
-		seq_printf(m, "fail\n");
+		seq_puts(m, "fail\n");
 	return 0;
 }
 RW_FOPS(wd_autoreset)
@@ -7831,7 +7847,7 @@
 int bypass_proc_create_dev_sd(bpctl_dev_t *pbp_device_block)
 {
 	struct bypass_pfs_sd *current_pfs = &(pbp_device_block->bypass_pfs_set);
-	static struct proc_dir_entry *procfs_dir = NULL;
+	static struct proc_dir_entry *procfs_dir;
 	int ret = 0;
 
 	if (!pbp_device_block->ndev)
@@ -7851,7 +7867,8 @@
 	}
 	current_pfs->bypass_entry = procfs_dir;
 
-#define ENTRY(x) ret |= procfs_add(#x, &x##_ops, pbp_device_block)
+#define ENTRY(x) (ret |= procfs_add(#x, &x##_ops, pbp_device_block))
+
 	ENTRY(bypass_info);
 	if (pbp_device_block->bp_caps & SW_CTL_CAP) {
 		/* Create set param proc's */

diff --git a/drivers/staging/silicom/bypasslib/bp_ioctl.h b/drivers/staging/silicom/bypasslib/bp_ioctl.h
index 040c6fa..2d1ef53 100644
--- a/drivers/staging/silicom/bypasslib/bp_ioctl.h
+++ b/drivers/staging/silicom/bypasslib/bp_ioctl.h

@@ -14,41 +14,41 @@
 #ifndef BP_IOCTL_H
 #define BP_IOCTL_H
 
-#define BP_CAP                   0x01	//BIT_0
-#define BP_STATUS_CAP            0x02	//BIT_1
-#define BP_STATUS_CHANGE_CAP     0x04	//BIT_2
-#define SW_CTL_CAP               0x08	//BIT_3
-#define BP_DIS_CAP               0x10	//BIT_4
-#define BP_DIS_STATUS_CAP        0x20	//BIT_5
-#define STD_NIC_CAP              0x40	//BIT_6
-#define BP_PWOFF_ON_CAP          0x80	//BIT_7
-#define BP_PWOFF_OFF_CAP         0x0100	//BIT_8
-#define BP_PWOFF_CTL_CAP         0x0200	//BIT_9
-#define BP_PWUP_ON_CAP           0x0400	//BIT_10
-#define BP_PWUP_OFF_CAP          0x0800	//BIT_11
-#define BP_PWUP_CTL_CAP          0x1000	//BIT_12
-#define WD_CTL_CAP               0x2000	//BIT_13
-#define WD_STATUS_CAP            0x4000	//BIT_14
-#define WD_TIMEOUT_CAP           0x8000	//BIT_15
-#define TX_CTL_CAP               0x10000	//BIT_16
-#define TX_STATUS_CAP            0x20000	//BIT_17
-#define TAP_CAP                  0x40000	//BIT_18
-#define TAP_STATUS_CAP           0x80000	//BIT_19
-#define TAP_STATUS_CHANGE_CAP    0x100000	//BIT_20
-#define TAP_DIS_CAP              0x200000	//BIT_21
-#define TAP_DIS_STATUS_CAP       0x400000	//BIT_22
-#define TAP_PWUP_ON_CAP          0x800000	//BIT_23
-#define TAP_PWUP_OFF_CAP         0x1000000	//BIT 24
-#define TAP_PWUP_CTL_CAP         0x2000000	//BIT 25
-#define NIC_CAP_NEG              0x4000000	//BIT 26
-#define TPL_CAP                  0x8000000	//BIT 27
-#define DISC_CAP                 0x10000000	//BIT 28
-#define DISC_DIS_CAP             0x20000000	//BIT 29
-#define DISC_PWUP_CTL_CAP        0x40000000	//BIT 30
+#define BP_CAP                   0x01	/* BIT_0 */
+#define BP_STATUS_CAP            0x02	/* BIT_1 */
+#define BP_STATUS_CHANGE_CAP     0x04	/* BIT_2 */
+#define SW_CTL_CAP               0x08	/* BIT_3 */
+#define BP_DIS_CAP               0x10	/* BIT_4 */
+#define BP_DIS_STATUS_CAP        0x20	/* BIT_5 */
+#define STD_NIC_CAP              0x40	/* BIT_6 */
+#define BP_PWOFF_ON_CAP          0x80	/* BIT_7 */
+#define BP_PWOFF_OFF_CAP         0x0100	/* BIT_8 */
+#define BP_PWOFF_CTL_CAP         0x0200	/* BIT_9 */
+#define BP_PWUP_ON_CAP           0x0400	/* BIT_10 */
+#define BP_PWUP_OFF_CAP          0x0800	/* BIT_11 */
+#define BP_PWUP_CTL_CAP          0x1000	/* BIT_12 */
+#define WD_CTL_CAP               0x2000	/* BIT_13 */
+#define WD_STATUS_CAP            0x4000	/* BIT_14 */
+#define WD_TIMEOUT_CAP           0x8000	/* BIT_15 */
+#define TX_CTL_CAP               0x10000	/* BIT_16 */
+#define TX_STATUS_CAP            0x20000	/* BIT_17 */
+#define TAP_CAP                  0x40000	/* BIT_18 */
+#define TAP_STATUS_CAP           0x80000	/* BIT_19 */
+#define TAP_STATUS_CHANGE_CAP    0x100000	/* BIT_20 */
+#define TAP_DIS_CAP              0x200000	/* BIT_21 */
+#define TAP_DIS_STATUS_CAP       0x400000	/* BIT_22 */
+#define TAP_PWUP_ON_CAP          0x800000	/* BIT_23 */
+#define TAP_PWUP_OFF_CAP         0x1000000	/* BIT 24 */
+#define TAP_PWUP_CTL_CAP         0x2000000	/* BIT 25 */
+#define NIC_CAP_NEG              0x4000000	/* BIT 26 */
+#define TPL_CAP                  0x8000000	/* BIT 27 */
+#define DISC_CAP                 0x10000000	/* BIT 28 */
+#define DISC_DIS_CAP             0x20000000	/* BIT 29 */
+#define DISC_PWUP_CTL_CAP        0x40000000	/* BIT 30 */
 
 #define WD_MIN_TIME_MASK(val)      (val & 0xf)
 #define WD_STEP_COUNT_MASK(val)    ((val & 0xf) << 5)
-#define WDT_STEP_TIME              0x10	//BIT_4
+#define WDT_STEP_TIME              0x10	/* BIT_4 */
 
 #define WD_MIN_TIME_GET(desc)   (desc & 0xf)
 #define WD_STEP_COUNT_GET(desc) (desc>>5) & 0xf

diff --git a/drivers/staging/silicom/bypasslib/bplibk.h b/drivers/staging/silicom/bypasslib/bplibk.h
index d8c1d27..c5c75c4 100644
--- a/drivers/staging/silicom/bypasslib/bplibk.h
+++ b/drivers/staging/silicom/bypasslib/bplibk.h

@@ -24,15 +24,13 @@
 #define INTEL_PEG4BPFII_PID 0x10a1
 
 #define PEGII_IF_SERIES(vid, pid) \
-        ((vid==0x8086)&& \
-        ((pid==INTEL_PEG4BPII_PID)||   \
-          (pid==INTEL_PEG4BPFII_PID)))
-
-#define EXPORT_SYMBOL_NOVERS EXPORT_SYMBOL
+	((vid == 0x8086) && \
+	 ((pid == INTEL_PEG4BPII_PID) ||   \
+	  (pid == INTEL_PEG4BPFII_PID)))
 
 #ifdef BP_VENDOR_SUPPORT
-char *bp_desc_array[] =
-    { "e1000bp", "e1000bpe", "slcm5700", "bnx2xbp", "ixgbp", "ixgbpe", NULL };
+char *bp_desc_array[] = { "e1000bp", "e1000bpe", "slcm5700",
+			"bnx2xbp", "ixgbp", "ixgbpe", NULL };
 #endif
 
 #endif

diff --git a/drivers/staging/silicom/bypasslib/bypass.c b/drivers/staging/silicom/bypasslib/bypass.c
index 9ed2508..ba0d23a 100644
--- a/drivers/staging/silicom/bypasslib/bypass.c
+++ b/drivers/staging/silicom/bypasslib/bypass.c

@@ -188,69 +188,82 @@
 		return is_bypass_dev(if_index);
 	return ret;
 }
+EXPORT_SYMBOL(is_bypass);
 
 static int get_bypass_slave(int if_index)
 {
 	DO_BPLIB_GET_ARG_FN(get_bypass_slave, GET_BYPASS_SLAVE, if_index);
 }
+EXPORT_SYMBOL(get_bypass_slave);
 
 static int get_bypass_caps(int if_index)
 {
 	DO_BPLIB_GET_ARG_FN(get_bypass_caps, GET_BYPASS_CAPS, if_index);
 }
+EXPORT_SYMBOL(get_bypass_caps);
 
 static int get_wd_set_caps(int if_index)
 {
 	DO_BPLIB_GET_ARG_FN(get_wd_set_caps, GET_WD_SET_CAPS, if_index);
 }
+EXPORT_SYMBOL(get_wd_set_caps);
 
 static int set_bypass(int if_index, int bypass_mode)
 {
 	DO_BPLIB_SET_ARG_FN(set_bypass, SET_BYPASS, if_index, bypass_mode);
 }
+EXPORT_SYMBOL(set_bypass);
 
 static int get_bypass(int if_index)
 {
 	DO_BPLIB_GET_ARG_FN(get_bypass, GET_BYPASS, if_index);
 }
+EXPORT_SYMBOL(get_bypass);
 
 static int get_bypass_change(int if_index)
 {
 	DO_BPLIB_GET_ARG_FN(get_bypass_change, GET_BYPASS_CHANGE, if_index);
 }
+EXPORT_SYMBOL(get_bypass_change);
 
 static int set_dis_bypass(int if_index, int dis_bypass)
 {
 	DO_BPLIB_SET_ARG_FN(set_dis_bypass, SET_DIS_BYPASS, if_index,
 			    dis_bypass);
 }
+EXPORT_SYMBOL(set_dis_bypass);
 
 static int get_dis_bypass(int if_index)
 {
 	DO_BPLIB_GET_ARG_FN(get_dis_bypass, GET_DIS_BYPASS, if_index);
 }
+EXPORT_SYMBOL(get_dis_bypass);
 
 static int set_bypass_pwoff(int if_index, int bypass_mode)
 {
 	DO_BPLIB_SET_ARG_FN(set_bypass_pwoff, SET_BYPASS_PWOFF, if_index,
 			    bypass_mode);
 }
+EXPORT_SYMBOL(set_bypass_pwoff);
 
 static int get_bypass_pwoff(int if_index)
 {
 	DO_BPLIB_GET_ARG_FN(get_bypass_pwoff, GET_BYPASS_PWOFF, if_index);
 }
+EXPORT_SYMBOL(get_bypass_pwoff);
 
 static int set_bypass_pwup(int if_index, int bypass_mode)
 {
 	DO_BPLIB_SET_ARG_FN(set_bypass_pwup, SET_BYPASS_PWUP, if_index,
 			    bypass_mode);
 }
+EXPORT_SYMBOL(set_bypass_pwup);
 
 static int get_bypass_pwup(int if_index)
 {
 	DO_BPLIB_GET_ARG_FN(get_bypass_pwup, GET_BYPASS_PWUP, if_index);
 }
+EXPORT_SYMBOL(get_bypass_pwup);
 
 static int set_bypass_wd(int if_index, int ms_timeout, int *ms_timeout_set)
 {
@@ -267,6 +280,7 @@
 	}
 	return ret;
 }
+EXPORT_SYMBOL(set_bypass_wd);
 
 static int get_bypass_wd(int if_index, int *ms_timeout_set)
 {
@@ -278,6 +292,7 @@
 		ret = doit(GET_BYPASS_WD, if_index, data);
 	return ret;
 }
+EXPORT_SYMBOL(get_bypass_wd);
 
 static int get_wd_expire_time(int if_index, int *ms_time_left)
 {
@@ -292,143 +307,171 @@
 	}
 	return ret;
 }
+EXPORT_SYMBOL(get_wd_expire_time);
 
 static int reset_bypass_wd_timer(int if_index)
 {
 	DO_BPLIB_GET_ARG_FN(reset_bypass_wd_timer, RESET_BYPASS_WD_TIMER,
 			    if_index);
 }
+EXPORT_SYMBOL(reset_bypass_wd_timer);
 
 static int set_std_nic(int if_index, int bypass_mode)
 {
 	DO_BPLIB_SET_ARG_FN(set_std_nic, SET_STD_NIC, if_index, bypass_mode);
 }
+EXPORT_SYMBOL(set_std_nic);
 
 static int get_std_nic(int if_index)
 {
 	DO_BPLIB_GET_ARG_FN(get_std_nic, GET_STD_NIC, if_index);
 }
+EXPORT_SYMBOL(get_std_nic);
 
 static int set_tx(int if_index, int tx_state)
 {
 	DO_BPLIB_SET_ARG_FN(set_tx, SET_TX, if_index, tx_state);
 }
+EXPORT_SYMBOL(set_tx);
 
 static int get_tx(int if_index)
 {
 	DO_BPLIB_GET_ARG_FN(get_tx, GET_TX, if_index);
 }
+EXPORT_SYMBOL(get_tx);
 
 static int set_tap(int if_index, int tap_mode)
 {
 	DO_BPLIB_SET_ARG_FN(set_tap, SET_TAP, if_index, tap_mode);
 }
+EXPORT_SYMBOL(set_tap);
 
 static int get_tap(int if_index)
 {
 	DO_BPLIB_GET_ARG_FN(get_tap, GET_TAP, if_index);
 }
+EXPORT_SYMBOL(get_tap);
 
 static int get_tap_change(int if_index)
 {
 	DO_BPLIB_GET_ARG_FN(get_tap_change, GET_TAP_CHANGE, if_index);
 }
+EXPORT_SYMBOL(get_tap_change);
 
 static int set_dis_tap(int if_index, int dis_tap)
 {
 	DO_BPLIB_SET_ARG_FN(set_dis_tap, SET_DIS_TAP, if_index, dis_tap);
 }
+EXPORT_SYMBOL(set_dis_tap);
 
 static int get_dis_tap(int if_index)
 {
 	DO_BPLIB_GET_ARG_FN(get_dis_tap, GET_DIS_TAP, if_index);
 }
+EXPORT_SYMBOL(get_dis_tap);
 
 static int set_tap_pwup(int if_index, int tap_mode)
 {
 	DO_BPLIB_SET_ARG_FN(set_tap_pwup, SET_TAP_PWUP, if_index, tap_mode);
 }
+EXPORT_SYMBOL(set_tap_pwup);
 
 static int get_tap_pwup(int if_index)
 {
 	DO_BPLIB_GET_ARG_FN(get_tap_pwup, GET_TAP_PWUP, if_index);
 }
+EXPORT_SYMBOL(get_tap_pwup);
 
 static int set_bp_disc(int if_index, int disc_mode)
 {
 	DO_BPLIB_SET_ARG_FN(set_bp_disc, SET_DISC, if_index, disc_mode);
 }
+EXPORT_SYMBOL(set_bp_disc);
 
 static int get_bp_disc(int if_index)
 {
 	DO_BPLIB_GET_ARG_FN(get_bp_disc, GET_DISC, if_index);
 }
+EXPORT_SYMBOL(get_bp_disc);
 
 static int get_bp_disc_change(int if_index)
 {
 	DO_BPLIB_GET_ARG_FN(get_bp_disc_change, GET_DISC_CHANGE, if_index);
 }
+EXPORT_SYMBOL(get_bp_disc_change);
 
 static int set_bp_dis_disc(int if_index, int dis_disc)
 {
 	DO_BPLIB_SET_ARG_FN(set_bp_dis_disc, SET_DIS_DISC, if_index, dis_disc);
 }
+EXPORT_SYMBOL(set_bp_dis_disc);
 
 static int get_bp_dis_disc(int if_index)
 {
 	DO_BPLIB_GET_ARG_FN(get_bp_dis_disc, GET_DIS_DISC, if_index);
 }
+EXPORT_SYMBOL(get_bp_dis_disc);
 
 static int set_bp_disc_pwup(int if_index, int disc_mode)
 {
 	DO_BPLIB_SET_ARG_FN(set_bp_disc_pwup, SET_DISC_PWUP, if_index,
 			    disc_mode);
 }
+EXPORT_SYMBOL(set_bp_disc_pwup);
 
 static int get_bp_disc_pwup(int if_index)
 {
 	DO_BPLIB_GET_ARG_FN(get_bp_disc_pwup, GET_DISC_PWUP, if_index);
 }
+EXPORT_SYMBOL(get_bp_disc_pwup);
 
 static int set_wd_exp_mode(int if_index, int mode)
 {
 	DO_BPLIB_SET_ARG_FN(set_wd_exp_mode, SET_WD_EXP_MODE, if_index, mode);
 }
+EXPORT_SYMBOL(set_wd_exp_mode);
 
 static int get_wd_exp_mode(int if_index)
 {
 	DO_BPLIB_GET_ARG_FN(get_wd_exp_mode, GET_WD_EXP_MODE, if_index);
 }
+EXPORT_SYMBOL(get_wd_exp_mode);
 
 static int set_wd_autoreset(int if_index, int time)
 {
 	DO_BPLIB_SET_ARG_FN(set_wd_autoreset, SET_WD_AUTORESET, if_index, time);
 }
+EXPORT_SYMBOL(set_wd_autoreset);
 
 static int get_wd_autoreset(int if_index)
 {
 	DO_BPLIB_GET_ARG_FN(get_wd_autoreset, GET_WD_AUTORESET, if_index);
 }
+EXPORT_SYMBOL(get_wd_autoreset);
 
 static int set_tpl(int if_index, int tpl_mode)
 {
 	DO_BPLIB_SET_ARG_FN(set_tpl, SET_TPL, if_index, tpl_mode);
 }
+EXPORT_SYMBOL(set_tpl);
 
 static int get_tpl(int if_index)
 {
 	DO_BPLIB_GET_ARG_FN(get_tpl, GET_TPL, if_index);
 }
+EXPORT_SYMBOL(get_tpl);
 
 static int set_bp_hw_reset(int if_index, int mode)
 {
 	DO_BPLIB_SET_ARG_FN(set_tpl, SET_BP_HW_RESET, if_index, mode);
 }
+EXPORT_SYMBOL(set_bp_hw_reset);
 
 static int get_bp_hw_reset(int if_index)
 {
 	DO_BPLIB_GET_ARG_FN(get_tpl, GET_BP_HW_RESET, if_index);
 }
+EXPORT_SYMBOL(get_bp_hw_reset);
 
 static int get_bypass_info(int if_index, struct bp_info *bp_info)
 {
@@ -467,6 +510,7 @@
 	}
 	return ret;
 }
+EXPORT_SYMBOL(get_bypass_info);
 
 int init_lib_module(void)
 {
@@ -479,50 +523,5 @@
 {
 }
 
-EXPORT_SYMBOL_NOVERS(is_bypass);
-EXPORT_SYMBOL_NOVERS(get_bypass_slave);
-EXPORT_SYMBOL_NOVERS(get_bypass_caps);
-EXPORT_SYMBOL_NOVERS(get_wd_set_caps);
-EXPORT_SYMBOL_NOVERS(set_bypass);
-EXPORT_SYMBOL_NOVERS(get_bypass);
-EXPORT_SYMBOL_NOVERS(get_bypass_change);
-EXPORT_SYMBOL_NOVERS(set_dis_bypass);
-EXPORT_SYMBOL_NOVERS(get_dis_bypass);
-EXPORT_SYMBOL_NOVERS(set_bypass_pwoff);
-EXPORT_SYMBOL_NOVERS(get_bypass_pwoff);
-EXPORT_SYMBOL_NOVERS(set_bypass_pwup);
-EXPORT_SYMBOL_NOVERS(get_bypass_pwup);
-EXPORT_SYMBOL_NOVERS(set_bypass_wd);
-EXPORT_SYMBOL_NOVERS(get_bypass_wd);
-EXPORT_SYMBOL_NOVERS(get_wd_expire_time);
-EXPORT_SYMBOL_NOVERS(reset_bypass_wd_timer);
-EXPORT_SYMBOL_NOVERS(set_std_nic);
-EXPORT_SYMBOL_NOVERS(get_std_nic);
-EXPORT_SYMBOL_NOVERS(set_tx);
-EXPORT_SYMBOL_NOVERS(get_tx);
-EXPORT_SYMBOL_NOVERS(set_tap);
-EXPORT_SYMBOL_NOVERS(get_tap);
-EXPORT_SYMBOL_NOVERS(get_tap_change);
-EXPORT_SYMBOL_NOVERS(set_dis_tap);
-EXPORT_SYMBOL_NOVERS(get_dis_tap);
-EXPORT_SYMBOL_NOVERS(set_tap_pwup);
-EXPORT_SYMBOL_NOVERS(get_tap_pwup);
-EXPORT_SYMBOL_NOVERS(set_bp_disc);
-EXPORT_SYMBOL_NOVERS(get_bp_disc);
-EXPORT_SYMBOL_NOVERS(get_bp_disc_change);
-EXPORT_SYMBOL_NOVERS(set_bp_dis_disc);
-EXPORT_SYMBOL_NOVERS(get_bp_dis_disc);
-EXPORT_SYMBOL_NOVERS(set_bp_disc_pwup);
-EXPORT_SYMBOL_NOVERS(get_bp_disc_pwup);
-EXPORT_SYMBOL_NOVERS(set_wd_exp_mode);
-EXPORT_SYMBOL_NOVERS(get_wd_exp_mode);
-EXPORT_SYMBOL_NOVERS(set_wd_autoreset);
-EXPORT_SYMBOL_NOVERS(get_wd_autoreset);
-EXPORT_SYMBOL_NOVERS(set_tpl);
-EXPORT_SYMBOL_NOVERS(get_tpl);
-EXPORT_SYMBOL_NOVERS(set_bp_hw_reset);
-EXPORT_SYMBOL_NOVERS(get_bp_hw_reset);
-EXPORT_SYMBOL_NOVERS(get_bypass_info);
-
 module_init(init_lib_module);
 module_exit(cleanup_lib_module);

diff --git a/drivers/staging/slicoss/slicoss.c b/drivers/staging/slicoss/slicoss.c
index e4b8277..869dcd3 100644
--- a/drivers/staging/slicoss/slicoss.c
+++ b/drivers/staging/slicoss/slicoss.c

@@ -3651,17 +3651,20 @@
 
 	if (!pci_set_dma_mask(pcidev, DMA_BIT_MASK(64))) {
 		pci_using_dac = 1;
-		if (pci_set_consistent_dma_mask(pcidev, DMA_BIT_MASK(64))) {
+		err = pci_set_consistent_dma_mask(pcidev, DMA_BIT_MASK(64));
+		if (err) {
 			dev_err(&pcidev->dev, "unable to obtain 64-bit DMA for "
 					"consistent allocations\n");
 			goto err_out_disable_pci;
 		}
-	} else if (pci_set_dma_mask(pcidev, DMA_BIT_MASK(32))) {
+	} else {
+		err = pci_set_dma_mask(pcidev, DMA_BIT_MASK(32));
+		if (err) {
+			dev_err(&pcidev->dev, "no usable DMA configuration\n");
+			goto err_out_disable_pci;
+		}
 		pci_using_dac = 0;
 		pci_set_consistent_dma_mask(pcidev, DMA_BIT_MASK(32));
-	} else {
-		dev_err(&pcidev->dev, "no usable DMA configuration\n");
-		goto err_out_disable_pci;
 	}
 
 	err = pci_request_regions(pcidev, DRV_NAME);
@@ -3696,6 +3699,7 @@
 	if (!memmapped_ioaddr) {
 		dev_err(&pcidev->dev, "cannot remap MMIO region %lx @ %lx\n",
 			mmio_len, mmio_start);
+		err = -ENOMEM;
 		goto err_out_free_netdev;
 	}
 
@@ -3706,8 +3710,8 @@
 	slic_init_adapter(netdev,
 			  pcidev, pci_tbl_entry, memmapped_ioaddr, cards_found);
 
-	status = slic_card_locate(adapter);
-	if (status) {
+	err = slic_card_locate(adapter);
+	if (err) {
 		dev_err(&pcidev->dev, "cannot locate card\n");
 		goto err_out_free_mmio_region;
 	}

diff --git a/drivers/staging/speakup/Kconfig b/drivers/staging/speakup/Kconfig
index b416ace..8c3e7a6 100644
--- a/drivers/staging/speakup/Kconfig
+++ b/drivers/staging/speakup/Kconfig

@@ -11,7 +11,7 @@
 		point your browser at <http://www.linux-speakup.org/>.
 		There is also a mailing list at the above url that you
 		can subscribe to.
-		
+
 		Supported synthesizers are accent sa, accent pc,
 		appollo II., Auddapter, Braille 'n Speak, Dectalk
 		external (old), Dectalk PC (full length isa board),
@@ -19,24 +19,24 @@
 		Litetalk, Keynote Gold internal PC, software
 		synthesizers, Speakout, transport, and a dummy module
 		that can be used with a plain text terminal.
-		
+
 		Speakup can either be built in or compiled as a module
 		by answering y or m.  If you answer y here, then you
 		must answer either y or m to at least one of the
 		synthesizer drivers below.  If you answer m here, then
 		the synthesizer drivers below can only be built as
 		modules.
-		
+
 		These drivers are not standalone drivers, but must be
 		used in conjunction with Speakup.  Think of them as
 		video cards for blind people.
-		
-		
+
+
 		The Dectalk pc driver can only be built as a module, and
 		requires software to be pre-loaded on to the card before
 		the module can be loaded.  See the decpc choice below
 		for more details.
-		
+
 		If you are not a blind person, or don't have access to
 		one of the listed synthesizers, you should say n.
 
@@ -84,7 +84,7 @@
 config SPEAKUP_SYNTH_DECTLK
 	tristate "DECtalk Express synthesizer support"
 	---help---
-		
+
 		This is the Speakup driver for the DecTalk Express
 		synthesizer.  You can say y to build it into the kernel,
 		or m to build it as a module.  See the configuration
@@ -93,7 +93,7 @@
 config SPEAKUP_SYNTH_DECEXT
 	tristate "DECtalk External (old) synthesizer support"
 	---help---
-		
+
 		This is the Speakup driver for the DecTalk External
 		(old) synthesizer.  You can say y to build it into the
 		kernel, or m to build it as a module.  See the
@@ -104,12 +104,12 @@
 	depends on m
 	tristate "DECtalk PC (big ISA card) synthesizer support"
 	---help---
-		
+
 		This is the Speakup driver for the DecTalk PC (full
 		length ISA) synthesizer.  You can say m to build it as
 		a module.  See the configuration help on the Speakup
 		choice above for more info.
-		
+
 		In order to use the DecTalk PC driver, you must download
 		the dec_pc.tgz file from linux-speakup.org.  It is in
 		the pub/linux/goodies directory.  The dec_pc.tgz file
@@ -118,14 +118,14 @@
 		This driver must be built as a module, and can not be
 		loaded until the file system is mounted and the DecTalk
 		PC software has been pre-loaded on to the board.
-		
+
 		See the README file in the dec_pc.tgz file for more
 		details.
 
 config SPEAKUP_SYNTH_DTLK
 	tristate "DoubleTalk PC synthesizer support"
 	---help---
-		
+
 		This is the Speakup driver for the internal DoubleTalk
 		PC synthesizer.  You can say y to build it into the
 		kernel, or m to build it as a module.  See the
@@ -135,7 +135,7 @@
 config SPEAKUP_SYNTH_KEYPC
 	tristate "Keynote Gold PC synthesizer support"
 	---help---
-		
+
 		This is the Speakup driver for the Keynote Gold
 		PC synthesizer.  You can say y to build it into the
 		kernel, or m to build it as a module.  See the
@@ -166,7 +166,7 @@
 config SPEAKUP_SYNTH_SPKOUT
 	tristate "Speak Out synthesizer support"
 	---help---
-		
+
 		This is the Speakup driver for the Speakout synthesizer.
 		 You can say y to build it into the kernel, or m to
 		build it as a module.  See the configuration help on the
@@ -175,7 +175,7 @@
 config SPEAKUP_SYNTH_TXPRT
 	tristate "Transport synthesizer support"
 	---help---
-		
+
 		This is the Speakup driver for the Transport
 		synthesizer.  You can say y to build it into the kernel,
 		or m to build it as a module.  See the configuration
@@ -184,7 +184,7 @@
 config SPEAKUP_SYNTH_DUMMY
 	tristate "Dummy synthesizer driver (for testing)"
 	---help---
-		
+
 		This is a dummy Speakup driver for plugging a mere serial
 		terminal.  This is handy if you want to test speakup but
 		don't have the hardware.  You can say y to build it into

diff --git a/drivers/staging/speakup/devsynth.c b/drivers/staging/speakup/devsynth.c
index 940769e..71c728a 100644
--- a/drivers/staging/speakup/devsynth.c
+++ b/drivers/staging/speakup/devsynth.c

@@ -13,11 +13,11 @@
 static int misc_registered;
 static int dev_opened;
 
-static ssize_t speakup_file_write(struct file *fp, const char *buffer,
-		   size_t nbytes, loff_t *ppos)
+static ssize_t speakup_file_write(struct file *fp, const char __user *buffer,
+				  size_t nbytes, loff_t *ppos)
 {
 	size_t count = nbytes;
-	const char *ptr = buffer;
+	const char __user *ptr = buffer;
 	size_t bytes;
 	unsigned long flags;
 	u_char buf[256];
@@ -30,15 +30,15 @@
 			return -EFAULT;
 		count -= bytes;
 		ptr += bytes;
-		spk_lock(flags);
+		spin_lock_irqsave(&speakup_info.spinlock, flags);
 		synth_write(buf, bytes);
-		spk_unlock(flags);
+		spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 	}
 	return (ssize_t) nbytes;
 }
 
-static ssize_t speakup_file_read(struct file *fp, char *buf, size_t nbytes,
-	loff_t *ppos)
+static ssize_t speakup_file_read(struct file *fp, char __user *buf,
+				 size_t nbytes, loff_t *ppos)
 {
 	return 0;
 }

diff --git a/drivers/staging/speakup/i18n.c b/drivers/staging/speakup/i18n.c
index 2add1fc..9ea16c5 100644
--- a/drivers/staging/speakup/i18n.c
+++ b/drivers/staging/speakup/i18n.c

@@ -558,11 +558,11 @@
 				kfree(newstr);
 				return -EINVAL;
 			}
-			spk_lock(flags);
+			spin_lock_irqsave(&speakup_info.spinlock, flags);
 			if (speakup_msgs[index] != speakup_default_msgs[index])
 				kfree(speakup_msgs[index]);
 			speakup_msgs[index] = newstr;
-			spk_unlock(flags);
+			spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 		} else {
 			rc = -ENOMEM;
 		}
@@ -595,14 +595,14 @@
 	unsigned long flags;
 	enum msg_index_t i;
 
-	spk_lock(flags);
+	spin_lock_irqsave(&speakup_info.spinlock, flags);
 
 	for (i = group->start; i <= group->end; i++) {
 		if (speakup_msgs[i] != speakup_default_msgs[i])
 			kfree(speakup_msgs[i]);
 		speakup_msgs[i] = speakup_default_msgs[i];
 	}
-	spk_unlock(flags);
+	spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 }
 
 /* Called at initialization time, to establish default messages. */
@@ -618,12 +618,12 @@
 	enum msg_index_t index;
 	unsigned long flags;
 
-	spk_lock(flags);
+	spin_lock_irqsave(&speakup_info.spinlock, flags);
 	for (index = MSG_FIRST_INDEX; index < MSG_LAST_INDEX; index++) {
 		if (speakup_msgs[index] != speakup_default_msgs[index]) {
 			kfree(speakup_msgs[index]);
 			speakup_msgs[index] = speakup_default_msgs[index];
 		}
 	}
-	spk_unlock(flags);
+	spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 }

diff --git a/drivers/staging/speakup/kobjects.c b/drivers/staging/speakup/kobjects.c
index 943b6c1..51bdea3 100644
--- a/drivers/staging/speakup/kobjects.c
+++ b/drivers/staging/speakup/kobjects.c

@@ -35,7 +35,7 @@
 	size_t bufsize = PAGE_SIZE;
 	unsigned long flags;
 
-	spk_lock(flags);
+	spin_lock_irqsave(&speakup_info.spinlock, flags);
 	*buf_pointer = '\0';
 	for (i = 0; i < 256; i++) {
 		if (bufsize <= 1)
@@ -70,7 +70,7 @@
 		bufsize -= len;
 		buf_pointer += len;
 	}
-	spk_unlock(flags);
+	spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 	return buf_pointer - buf;
 }
 
@@ -127,7 +127,7 @@
 	size_t desc_length = 0;
 	int i;
 
-	spk_lock(flags);
+	spin_lock_irqsave(&speakup_info.spinlock, flags);
 	while (cp < end) {
 
 		while ((cp < end) && (*cp == ' ' || *cp == '\t'))
@@ -212,7 +212,7 @@
 			spk_reset_default_chartab();
 	}
 
-	spk_unlock(flags);
+	spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 	report_char_chartab_status(reset, received, used, rejected,
 		do_characters);
 	return retval;
@@ -232,7 +232,7 @@
 	u_char *cp1;
 	u_char ch;
 	unsigned long flags;
-	spk_lock(flags);
+	spin_lock_irqsave(&speakup_info.spinlock, flags);
 	cp1 = spk_key_buf + SHIFT_TBL_SIZE;
 	num_keys = (int)(*cp1);
 	nstates = (int)cp1[1];
@@ -248,7 +248,7 @@
 		}
 	}
 	cp += sprintf(cp, "0, %d\n", KEY_MAP_VER);
-	spk_unlock(flags);
+	spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 	return (int)(cp-buf);
 }
 
@@ -265,17 +265,17 @@
 	u_char *cp1;
 	unsigned long flags;
 
-	spk_lock(flags);
+	spin_lock_irqsave(&speakup_info.spinlock, flags);
 	in_buff = kmemdup(buf, count + 1, GFP_ATOMIC);
 	if (!in_buff) {
-		spk_unlock(flags);
+		spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 		return -ENOMEM;
 	}
 	if (strchr("dDrR", *in_buff)) {
 		spk_set_key_info(spk_key_defaults, spk_key_buf);
 		pr_info("keymap set to default values\n");
 		kfree(in_buff);
-		spk_unlock(flags);
+		spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 		return count;
 	}
 	if (in_buff[count - 1] == '\n')
@@ -294,7 +294,7 @@
 		pr_warn("i %d %d %d %d\n", i,
 				(int)cp1[-3], (int)cp1[-2], (int)cp1[-1]);
 		kfree(in_buff);
-		spk_unlock(flags);
+		spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 		return -EINVAL;
 	}
 	while (--i >= 0) {
@@ -315,7 +315,7 @@
 		}
 	}
 	kfree(in_buff);
-	spk_unlock(flags);
+	spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 	return ret;
 }
 
@@ -341,7 +341,7 @@
 		pr_warn("silent value '%c' not in range (0,7)\n", ch);
 		return -EINVAL;
 	}
-	spk_lock(flags);
+	spin_lock_irqsave(&speakup_info.spinlock, flags);
 	if (ch&2) {
 		shut = 1;
 		spk_do_flush();
@@ -354,7 +354,7 @@
 		spk_shut_up |= shut;
 	else
 		spk_shut_up &= ~shut;
-	spk_unlock(flags);
+	spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 	return count;
 }
 
@@ -470,7 +470,7 @@
 		return -EINVAL;
 	}
 
-	spk_lock(flags);
+	spin_lock_irqsave(&speakup_info.spinlock, flags);
 	pb = (struct st_bits_data *) &spk_punc_info[var->value];
 	mask = pb->mask;
 	for (i = 33; i < 128; i++) {
@@ -478,7 +478,7 @@
 			continue;
 		*cp++ = (char)i;
 	}
-	spk_unlock(flags);
+	spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 	return cp-buf;
 }
 
@@ -518,14 +518,14 @@
 		x--;
 	punc_buf[x] = '\0';
 
-	spk_lock(flags);
+	spin_lock_irqsave(&speakup_info.spinlock, flags);
 
 	if (*punc_buf == 'd' || *punc_buf == 'r')
-		x = spk_set_mask_bits(0, var->value, 3);
+		x = spk_set_mask_bits(NULL, var->value, 3);
 	else
 		x = spk_set_mask_bits(punc_buf, var->value, 3);
 
-	spk_unlock(flags);
+	spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 	return count;
 }
 
@@ -547,7 +547,7 @@
 	if (param == NULL)
 		return -EINVAL;
 
-	spk_lock(flags);
+	spin_lock_irqsave(&speakup_info.spinlock, flags);
 	var = (struct var_t *) param->data;
 	switch (param->var_type) {
 	case VAR_NUM:
@@ -580,7 +580,7 @@
 			param->name, param->var_type);
 		break;
 	}
-	spk_unlock(flags);
+	spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 	return rv;
 }
 EXPORT_SYMBOL_GPL(spk_var_show);
@@ -609,7 +609,7 @@
 	cp = (char *)buf;
 	string_unescape_any_inplace(cp);
 
-	spk_lock(flags);
+	spin_lock_irqsave(&speakup_info.spinlock, flags);
 	switch (param->var_type) {
 	case VAR_NUM:
 	case VAR_TIME:
@@ -670,7 +670,7 @@
 			}
 		}
 	}
-	spk_unlock(flags);
+	spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 
 	if (ret == -ERESTART)
 		pr_info("%s reset to default value\n", attr->attr.name);
@@ -818,9 +818,9 @@
 	unsigned long flags;
 
 	BUG_ON(!group);
-	spk_lock(flags);
+	spin_lock_irqsave(&speakup_info.spinlock, flags);
 	retval = message_show_helper(buf, group->start, group->end);
-	spk_unlock(flags);
+	spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 	return retval;
 }
 

diff --git a/drivers/staging/speakup/main.c b/drivers/staging/speakup/main.c
index 6c7b55c..14079c4 100644
--- a/drivers/staging/speakup/main.c
+++ b/drivers/staging/speakup/main.c

@@ -95,7 +95,8 @@
 
 static char mark_cut_flag;
 #define MAX_KEY 160
-u_char *spk_our_keys[MAX_KEY], *spk_shift_table;
+static u_char *spk_shift_table;
+u_char *spk_our_keys[MAX_KEY];
 u_char spk_key_buf[600];
 const u_char spk_key_defaults[] = {
 #include "speakupmap.h"
@@ -457,7 +458,7 @@
 	synth_buffer_add(SPACE);
 }
 
-static u16 get_char(struct vc_data *vc, u16 * pos, u_char * attribs)
+static u16 get_char(struct vc_data *vc, u16 *pos, u_char *attribs)
 {
 	u16 ch = ' ';
 	if (vc && pos) {
@@ -1129,7 +1130,7 @@
 	unsigned long flags;
 	if (synth == NULL || up_flag || spk_killed)
 		return;
-	spk_lock(flags);
+	spin_lock_irqsave(&speakup_info.spinlock, flags);
 	if (cursor_track == read_all_mode) {
 		switch (value) {
 		case KVAL(K_SHIFT):
@@ -1151,20 +1152,20 @@
 	}
 	if (spk_say_ctrl && value < NUM_CTL_LABELS)
 		synth_printf("%s", spk_msg_get(MSG_CTL_START + value));
-	spk_unlock(flags);
+	spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 }
 
 static void do_handle_latin(struct vc_data *vc, u_char value, char up_flag)
 {
 	unsigned long flags;
-	spk_lock(flags);
+	spin_lock_irqsave(&speakup_info.spinlock, flags);
 	if (up_flag) {
 		spk_lastkey = spk_keydown = 0;
-		spk_unlock(flags);
+		spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 		return;
 	}
 	if (synth == NULL || spk_killed) {
-		spk_unlock(flags);
+		spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 		return;
 	}
 	spk_shut_up &= 0xfe;
@@ -1173,7 +1174,7 @@
 	spk_parked &= 0xfe;
 	if (spk_key_echo == 2 && value >= MINECHOCHAR)
 		speak_char(value);
-	spk_unlock(flags);
+	spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 }
 
 int spk_set_key_info(const u_char *key_info, u_char *k_buffer)
@@ -1282,7 +1283,7 @@
 }
 
 /* Allocation concurrency is protected by the console semaphore */
-int speakup_allocate(struct vc_data *vc)
+static int speakup_allocate(struct vc_data *vc)
 {
 	int vc_num;
 
@@ -1299,7 +1300,7 @@
 	return 0;
 }
 
-void speakup_deallocate(struct vc_data *vc)
+static void speakup_deallocate(struct vc_data *vc)
 {
 	int vc_num;
 
@@ -1449,21 +1450,21 @@
 static int pre_handle_cursor(struct vc_data *vc, u_char value, char up_flag)
 {
 	unsigned long flags;
-	spk_lock(flags);
+	spin_lock_irqsave(&speakup_info.spinlock, flags);
 	if (cursor_track == read_all_mode) {
 		spk_parked &= 0xfe;
 		if (synth == NULL || up_flag || spk_shut_up) {
-			spk_unlock(flags);
+			spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 			return NOTIFY_STOP;
 		}
 		del_timer(&cursor_timer);
 		spk_shut_up &= 0xfe;
 		spk_do_flush();
 		start_read_all_timer(vc, value + 1);
-		spk_unlock(flags);
+		spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 		return NOTIFY_STOP;
 	}
-	spk_unlock(flags);
+	spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 	return NOTIFY_OK;
 }
 
@@ -1472,10 +1473,10 @@
 	unsigned long flags;
 	struct var_t *cursor_timeout;
 
-	spk_lock(flags);
+	spin_lock_irqsave(&speakup_info.spinlock, flags);
 	spk_parked &= 0xfe;
 	if (synth == NULL || up_flag || spk_shut_up || cursor_track == CT_Off) {
-		spk_unlock(flags);
+		spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 		return;
 	}
 	spk_shut_up &= 0xfe;
@@ -1494,7 +1495,7 @@
 	cursor_timeout = spk_get_var(CURSOR_TIME);
 	mod_timer(&cursor_timer,
 		  jiffies + msecs_to_jiffies(cursor_timeout->u.n.value));
-	spk_unlock(flags);
+	spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 }
 
 static void update_color_buffer(struct vc_data *vc, const char *ic, int len)
@@ -1619,7 +1620,7 @@
 	struct vc_data *vc = vc_cons[cursor_con].d;
 	unsigned long flags;
 	del_timer(&cursor_timer);
-	spk_lock(flags);
+	spin_lock_irqsave(&speakup_info.spinlock, flags);
 	if (cursor_con != fg_console) {
 		is_cursor = 0;
 		goto out;
@@ -1650,7 +1651,7 @@
 		say_char(vc);
 	spk_keydown = is_cursor = 0;
 out:
-	spk_unlock(flags);
+	spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 }
 
 /* called by: vt_notifier_call() */
@@ -1659,13 +1660,13 @@
 	unsigned long flags;
 	if (!speakup_console[vc->vc_num])
 		return;
-	if (!spk_trylock(flags))
+	if (!spin_trylock_irqsave(&speakup_info.spinlock, flags))
 		/* Speakup output, discard */
 		return;
 	if (!spk_parked)
 		speakup_date(vc);
 	if (spk_shut_up || synth == NULL) {
-		spk_unlock(flags);
+		spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 		return;
 	}
 	if (vc->vc_num == fg_console && spk_keydown) {
@@ -1673,7 +1674,7 @@
 		if (!is_cursor)
 			say_char(vc);
 	}
-	spk_unlock(flags);
+	spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 }
 
 /* called by: vt_notifier_call() */
@@ -1682,7 +1683,7 @@
 	unsigned long flags;
 	if ((vc->vc_num != fg_console) || spk_shut_up || synth == NULL)
 		return;
-	if (!spk_trylock(flags))
+	if (!spin_trylock_irqsave(&speakup_info.spinlock, flags))
 		/* Speakup output, discard */
 		return;
 	if (spk_bell_pos && spk_keydown && (vc->vc_x == spk_bell_pos - 1))
@@ -1690,31 +1691,31 @@
 	if ((is_cursor) || (cursor_track == read_all_mode)) {
 		if (cursor_track == CT_Highlight)
 			update_color_buffer(vc, str, len);
-		spk_unlock(flags);
+		spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 		return;
 	}
 	if (win_enabled) {
 		if (vc->vc_x >= win_left && vc->vc_x <= win_right &&
 		    vc->vc_y >= win_top && vc->vc_y <= win_bottom) {
-			spk_unlock(flags);
+			spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 			return;
 		}
 	}
 
 	spkup_write(str, len);
-	spk_unlock(flags);
+	spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 }
 
-void speakup_con_update(struct vc_data *vc)
+static void speakup_con_update(struct vc_data *vc)
 {
 	unsigned long flags;
 	if (speakup_console[vc->vc_num] == NULL || spk_parked)
 		return;
-	if (!spk_trylock(flags))
+	if (!spin_trylock_irqsave(&speakup_info.spinlock, flags))
 		/* Speakup output, discard */
 		return;
 	speakup_date(vc);
-	spk_unlock(flags);
+	spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 }
 
 static void do_handle_spec(struct vc_data *vc, u_char value, char up_flag)
@@ -1724,7 +1725,7 @@
 	char *label;
 	if (synth == NULL || up_flag || spk_killed)
 		return;
-	spk_lock(flags);
+	spin_lock_irqsave(&speakup_info.spinlock, flags);
 	spk_shut_up &= 0xfe;
 	if (spk_no_intr)
 		spk_do_flush();
@@ -1745,13 +1746,13 @@
 		break;
 	default:
 		spk_parked &= 0xfe;
-		spk_unlock(flags);
+		spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 		return;
 	}
 	if (on_off < 2)
 		synth_printf("%s %s\n",
 			     label, spk_msg_get(MSG_STATUS_START + on_off));
-	spk_unlock(flags);
+	spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 }
 
 static int inc_dec_var(u_char value)
@@ -1892,7 +1893,7 @@
 		spk_special_handler = NULL;
 		return 1;
 	}
-	go_pos = simple_strtol(goto_buf, &cp, 10);
+	go_pos = kstrtol(goto_buf, 10, (long *)&cp);
 	goto_pos = (u_long) go_pos;
 	if (*cp == 'x') {
 		if (*goto_buf < '0')
@@ -1964,7 +1965,7 @@
 }
 
 typedef void (*spkup_hand) (struct vc_data *);
-spkup_hand spkup_handler[] = {
+static spkup_hand spkup_handler[] = {
 	/* must be ordered same as defines in speakup.h */
 	do_nothing, speakup_goto, speech_kill, speakup_shut_up,
 	speakup_cut, speakup_paste, say_first_char, say_last_char,
@@ -2002,7 +2003,7 @@
 
 static const char *pad_chars = "0123456789+-*/\015,.?()";
 
-int
+static int
 speakup_key(struct vc_data *vc, int shift_state, int keycode, u_short keysym,
 	    int up_flag)
 {
@@ -2015,7 +2016,7 @@
 	if (synth == NULL)
 		return 0;
 
-	spk_lock(flags);
+	spin_lock_irqsave(&speakup_info.spinlock, flags);
 	tty = vc->port.tty;
 	if (type >= 0xf0)
 		type -= 0xf0;
@@ -2033,7 +2034,7 @@
 	if (keycode >= MAX_KEY)
 		goto no_map;
 	key_info = spk_our_keys[keycode];
-	if (key_info == 0)
+	if (!key_info)
 		goto no_map;
 	/* Check valid read all mode keys */
 	if ((cursor_track == read_all_mode) && (!up_flag)) {
@@ -2114,7 +2115,7 @@
 	}
 	last_keycode = 0;
 out:
-	spk_unlock(flags);
+	spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 	return ret;
 }
 
@@ -2265,7 +2266,7 @@
 	     (var->var_id >= 0) && (var->var_id < MAXVARS); var++)
 		speakup_register_var(var);
 	for (i = 1; spk_punc_info[i].mask != 0; i++)
-		spk_set_mask_bits(0, i, 2);
+		spk_set_mask_bits(NULL, i, 2);
 
 	spk_set_key_info(spk_key_defaults, spk_key_buf);
 

diff --git a/drivers/staging/speakup/serialio.c b/drivers/staging/speakup/serialio.c
index e4d27aa..1354288 100644
--- a/drivers/staging/speakup/serialio.c
+++ b/drivers/staging/speakup/serialio.c

@@ -79,7 +79,7 @@
 /*printk(KERN_ERR "in irq\n"); */
 /*pr_warn("in IRQ\n"); */
 	int c;
-	spk_lock(flags);
+	spin_lock_irqsave(&speakup_info.spinlock, flags);
 	while (inb_p(speakup_info.port_tts + UART_LSR) & UART_LSR_DR) {
 
 		c = inb_p(speakup_info.port_tts+UART_RX);
@@ -87,7 +87,7 @@
 /*printk(KERN_ERR "c = %d\n", c); */
 /*pr_warn("C = %d\n", c); */
 	}
-	spk_unlock(flags);
+	spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 	return IRQ_HANDLED;
 }
 

diff --git a/drivers/staging/speakup/speakup_acntpc.c b/drivers/staging/speakup/speakup_acntpc.c
index 1c1f0d5..80141ac 100644
--- a/drivers/staging/speakup/speakup_acntpc.c
+++ b/drivers/staging/speakup/speakup_acntpc.c

@@ -166,7 +166,7 @@
 		outb_p(ch, speakup_info.port_tts);
 		buf++;
 	}
-	return 0;
+	return NULL;
 }
 
 static void do_catch_up(struct spk_synth *synth)
@@ -186,26 +186,26 @@
 	delay_time = spk_get_var(DELAY);
 	full_time = spk_get_var(FULL);
 
-	spk_lock(flags);
+	spin_lock_irqsave(&speakup_info.spinlock, flags);
 	jiffy_delta_val = jiffy_delta->u.n.value;
-	spk_unlock(flags);
+	spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 
 	jiff_max = jiffies + jiffy_delta_val;
 	while (!kthread_should_stop()) {
-		spk_lock(flags);
+		spin_lock_irqsave(&speakup_info.spinlock, flags);
 		if (speakup_info.flushing) {
 			speakup_info.flushing = 0;
-			spk_unlock(flags);
+			spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 			synth->flush(synth);
 			continue;
 		}
 		if (synth_buffer_empty()) {
-			spk_unlock(flags);
+			spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 			break;
 		}
 		set_current_state(TASK_INTERRUPTIBLE);
 		full_time_val = full_time->u.n.value;
-		spk_unlock(flags);
+		spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 		if (synth_full()) {
 			schedule_timeout(msecs_to_jiffies(full_time_val));
 			continue;
@@ -217,9 +217,9 @@
 				break;
 			udelay(1);
 		}
-		spk_lock(flags);
+		spin_lock_irqsave(&speakup_info.spinlock, flags);
 		ch = synth_buffer_getc();
-		spk_unlock(flags);
+		spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 		if (ch == '\n')
 			ch = PROCSPEECH;
 		outb_p(ch, speakup_info.port_tts);
@@ -231,10 +231,10 @@
 				udelay(1);
 			}
 			outb_p(PROCSPEECH, speakup_info.port_tts);
-			spk_lock(flags);
+			spin_lock_irqsave(&speakup_info.spinlock, flags);
 			jiffy_delta_val = jiffy_delta->u.n.value;
 			delay_time_val = delay_time->u.n.value;
-			spk_unlock(flags);
+			spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 			schedule_timeout(msecs_to_jiffies(delay_time_val));
 			jiff_max = jiffies+jiffy_delta_val;
 		}

diff --git a/drivers/staging/speakup/speakup_apollo.c b/drivers/staging/speakup/speakup_apollo.c
index 3e450cc..95d3132 100644
--- a/drivers/staging/speakup/speakup_apollo.c
+++ b/drivers/staging/speakup/speakup_apollo.c

@@ -148,30 +148,30 @@
 	jiffy_delta = spk_get_var(JIFFY);
 	delay_time = spk_get_var(DELAY);
 	full_time = spk_get_var(FULL);
-	spk_lock(flags);
+	spin_lock_irqsave(&speakup_info.spinlock, flags);
 	jiffy_delta_val = jiffy_delta->u.n.value;
-	spk_unlock(flags);
+	spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 	jiff_max = jiffies + jiffy_delta_val;
 
 	while (!kthread_should_stop()) {
-		spk_lock(flags);
+		spin_lock_irqsave(&speakup_info.spinlock, flags);
 		jiffy_delta_val = jiffy_delta->u.n.value;
 		full_time_val = full_time->u.n.value;
 		delay_time_val = delay_time->u.n.value;
 		if (speakup_info.flushing) {
 			speakup_info.flushing = 0;
-			spk_unlock(flags);
+			spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 			synth->flush(synth);
 			continue;
 		}
 		if (synth_buffer_empty()) {
-			spk_unlock(flags);
+			spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 			break;
 		}
 		ch = synth_buffer_peek();
 		set_current_state(TASK_INTERRUPTIBLE);
 		full_time_val = full_time->u.n.value;
-		spk_unlock(flags);
+		spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 		if (!spk_serial_out(ch)) {
 			outb(UART_MCR_DTR, speakup_info.port_tts + UART_MCR);
 			outb(UART_MCR_DTR | UART_MCR_RTS,
@@ -180,11 +180,11 @@
 			continue;
 		}
 		if ((jiffies >= jiff_max) && (ch == SPACE)) {
-			spk_lock(flags);
+			spin_lock_irqsave(&speakup_info.spinlock, flags);
 			jiffy_delta_val = jiffy_delta->u.n.value;
 			full_time_val = full_time->u.n.value;
 			delay_time_val = delay_time->u.n.value;
-			spk_unlock(flags);
+			spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 			if (spk_serial_out(synth->procspeech))
 				schedule_timeout(msecs_to_jiffies
 						 (delay_time_val));
@@ -194,9 +194,9 @@
 			jiff_max = jiffies + jiffy_delta_val;
 		}
 		set_current_state(TASK_RUNNING);
-		spk_lock(flags);
+		spin_lock_irqsave(&speakup_info.spinlock, flags);
 		synth_buffer_getc();
-		spk_unlock(flags);
+		spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 	}
 	spk_serial_out(PROCSPEECH);
 }

diff --git a/drivers/staging/speakup/speakup_decext.c b/drivers/staging/speakup/speakup_decext.c
index d39a0de..d306e01 100644
--- a/drivers/staging/speakup/speakup_decext.c
+++ b/drivers/staging/speakup/speakup_decext.c

@@ -165,27 +165,27 @@
 	jiffy_delta = spk_get_var(JIFFY);
 	delay_time = spk_get_var(DELAY);
 
-	spk_lock(flags);
+	spin_lock_irqsave(&speakup_info.spinlock, flags);
 	jiffy_delta_val = jiffy_delta->u.n.value;
-	spk_unlock(flags);
+	spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 	jiff_max = jiffies + jiffy_delta_val;
 
 	while (!kthread_should_stop()) {
-		spk_lock(flags);
+		spin_lock_irqsave(&speakup_info.spinlock, flags);
 		if (speakup_info.flushing) {
 			speakup_info.flushing = 0;
-			spk_unlock(flags);
+			spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 			synth->flush(synth);
 			continue;
 		}
 		if (synth_buffer_empty()) {
-			spk_unlock(flags);
+			spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 			break;
 		}
 		ch = synth_buffer_peek();
 		set_current_state(TASK_INTERRUPTIBLE);
 		delay_time_val = delay_time->u.n.value;
-		spk_unlock(flags);
+		spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 		if (ch == '\n')
 			ch = 0x0D;
 		if (synth_full() || !spk_serial_out(ch)) {
@@ -193,9 +193,9 @@
 			continue;
 		}
 		set_current_state(TASK_RUNNING);
-		spk_lock(flags);
+		spin_lock_irqsave(&speakup_info.spinlock, flags);
 		synth_buffer_getc();
-		spk_unlock(flags);
+		spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 		if (ch == '[')
 			in_escape = 1;
 		else if (ch == ']')
@@ -206,10 +206,10 @@
 			if (jiffies >= jiff_max) {
 				if (!in_escape)
 					spk_serial_out(PROCSPEECH);
-				spk_lock(flags);
+				spin_lock_irqsave(&speakup_info.spinlock, flags);
 				jiffy_delta_val = jiffy_delta->u.n.value;
 				delay_time_val = delay_time->u.n.value;
-				spk_unlock(flags);
+				spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 				schedule_timeout(msecs_to_jiffies
 						 (delay_time_val));
 				jiff_max = jiffies + jiffy_delta_val;

diff --git a/drivers/staging/speakup/speakup_decpc.c b/drivers/staging/speakup/speakup_decpc.c
index 6c88b55..ea6b72d 100644
--- a/drivers/staging/speakup/speakup_decpc.c
+++ b/drivers/staging/speakup/speakup_decpc.c

@@ -377,27 +377,27 @@
 
 	jiffy_delta = spk_get_var(JIFFY);
 	delay_time = spk_get_var(DELAY);
-	spk_lock(flags);
+	spin_lock_irqsave(&speakup_info.spinlock, flags);
 	jiffy_delta_val = jiffy_delta->u.n.value;
-	spk_unlock(flags);
+	spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 	jiff_max = jiffies + jiffy_delta_val;
 
 	while (!kthread_should_stop()) {
-		spk_lock(flags);
+		spin_lock_irqsave(&speakup_info.spinlock, flags);
 		if (speakup_info.flushing) {
 			speakup_info.flushing = 0;
-			spk_unlock(flags);
+			spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 			synth->flush(synth);
 			continue;
 		}
 		if (synth_buffer_empty()) {
-			spk_unlock(flags);
+			spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 			break;
 		}
 		ch = synth_buffer_peek();
 		set_current_state(TASK_INTERRUPTIBLE);
 		delay_time_val = delay_time->u.n.value;
-		spk_unlock(flags);
+		spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 		if (ch == '\n')
 			ch = 0x0D;
 		if (dt_sendchar(ch)) {
@@ -405,9 +405,9 @@
 			continue;
 		}
 		set_current_state(TASK_RUNNING);
-		spk_lock(flags);
+		spin_lock_irqsave(&speakup_info.spinlock, flags);
 		synth_buffer_getc();
-		spk_unlock(flags);
+		spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 		if (ch == '[')
 			in_escape = 1;
 		else if (ch == ']')
@@ -418,10 +418,10 @@
 			if (jiffies >= jiff_max) {
 				if (!in_escape)
 					dt_sendchar(PROCSPEECH);
-				spk_lock(flags);
+				spin_lock_irqsave(&speakup_info.spinlock, flags);
 				jiffy_delta_val = jiffy_delta->u.n.value;
 				delay_time_val = delay_time->u.n.value;
-				spk_unlock(flags);
+				spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 				schedule_timeout(msecs_to_jiffies
 						 (delay_time_val));
 				jiff_max = jiffies + jiffy_delta_val;
@@ -444,7 +444,7 @@
 			return buf;
 		buf++;
 	}
-	return 0;
+	return NULL;
 }
 
 static int synth_probe(struct spk_synth *synth)

diff --git a/drivers/staging/speakup/speakup_dectlk.c b/drivers/staging/speakup/speakup_dectlk.c
index 0dd2eb9..15fdec3 100644
--- a/drivers/staging/speakup/speakup_dectlk.c
+++ b/drivers/staging/speakup/speakup_dectlk.c

@@ -216,9 +216,9 @@
 
 	jiffy_delta = spk_get_var(JIFFY);
 	delay_time = spk_get_var(DELAY);
-	spk_lock(flags);
+	spin_lock_irqsave(&speakup_info.spinlock, flags);
 	jiffy_delta_val = jiffy_delta->u.n.value;
-	spk_unlock(flags);
+	spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 	jiff_max = jiffies + jiffy_delta_val;
 
 	while (!kthread_should_stop()) {
@@ -234,22 +234,22 @@
 		is_flushing = 0;
 		spin_unlock_irqrestore(&flush_lock, flags);
 
-		spk_lock(flags);
+		spin_lock_irqsave(&speakup_info.spinlock, flags);
 		if (speakup_info.flushing) {
 			speakup_info.flushing = 0;
-			spk_unlock(flags);
+			spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 			synth->flush(synth);
 			continue;
 		}
 		if (synth_buffer_empty()) {
-			spk_unlock(flags);
+			spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 			break;
 		}
 		ch = synth_buffer_peek();
 		set_current_state(TASK_INTERRUPTIBLE);
 		delay_time_val = delay_time->u.n.value;
 		synth_full_val = synth_full();
-		spk_unlock(flags);
+		spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 		if (ch == '\n')
 			ch = 0x0D;
 		if (synth_full_val || !spk_serial_out(ch)) {
@@ -257,9 +257,9 @@
 			continue;
 		}
 		set_current_state(TASK_RUNNING);
-		spk_lock(flags);
+		spin_lock_irqsave(&speakup_info.spinlock, flags);
 		synth_buffer_getc();
-		spk_unlock(flags);
+		spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 		if (ch == '[')
 			in_escape = 1;
 		else if (ch == ']')
@@ -270,10 +270,10 @@
 			if (jiffies >= jiff_max) {
 				if (!in_escape)
 					spk_serial_out(PROCSPEECH);
-				spk_lock(flags);
+				spin_lock_irqsave(&speakup_info.spinlock, flags);
 				jiffy_delta_val = jiffy_delta->u.n.value;
 				delay_time_val = delay_time->u.n.value;
-				spk_unlock(flags);
+				spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 				schedule_timeout(msecs_to_jiffies
 						 (delay_time_val));
 				jiff_max = jiffies + jiffy_delta_val;

diff --git a/drivers/staging/speakup/speakup_dtlk.c b/drivers/staging/speakup/speakup_dtlk.c
index a9cefbd..1feb0fb 100644
--- a/drivers/staging/speakup/speakup_dtlk.c
+++ b/drivers/staging/speakup/speakup_dtlk.c

@@ -200,42 +200,42 @@
 
 	jiffy_delta = spk_get_var(JIFFY);
 	delay_time = spk_get_var(DELAY);
-	spk_lock(flags);
+	spin_lock_irqsave(&speakup_info.spinlock, flags);
 	jiffy_delta_val = jiffy_delta->u.n.value;
-	spk_unlock(flags);
+	spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 	jiff_max = jiffies + jiffy_delta_val;
 	while (!kthread_should_stop()) {
-		spk_lock(flags);
+		spin_lock_irqsave(&speakup_info.spinlock, flags);
 		if (speakup_info.flushing) {
 			speakup_info.flushing = 0;
-			spk_unlock(flags);
+			spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 			synth->flush(synth);
 			continue;
 		}
 		if (synth_buffer_empty()) {
-			spk_unlock(flags);
+			spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 			break;
 		}
 		set_current_state(TASK_INTERRUPTIBLE);
 		delay_time_val = delay_time->u.n.value;
-		spk_unlock(flags);
+		spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 		if (synth_full()) {
 			schedule_timeout(msecs_to_jiffies(delay_time_val));
 			continue;
 		}
 		set_current_state(TASK_RUNNING);
-		spk_lock(flags);
+		spin_lock_irqsave(&speakup_info.spinlock, flags);
 		ch = synth_buffer_getc();
-		spk_unlock(flags);
+		spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 		if (ch == '\n')
 			ch = PROCSPEECH;
 		spk_out(ch);
 		if ((jiffies >= jiff_max) && (ch == SPACE)) {
 			spk_out(PROCSPEECH);
-			spk_lock(flags);
+			spin_lock_irqsave(&speakup_info.spinlock, flags);
 			delay_time_val = delay_time->u.n.value;
 			jiffy_delta_val = jiffy_delta->u.n.value;
-			spk_unlock(flags);
+			spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 			schedule_timeout(msecs_to_jiffies(delay_time_val));
 			jiff_max = jiffies + jiffy_delta_val;
 		}
@@ -254,7 +254,7 @@
 		spk_out(ch);
 		buf++;
 	}
-	return 0;
+	return NULL;
 }
 
 static void synth_flush(struct spk_synth *synth)

diff --git a/drivers/staging/speakup/speakup_keypc.c b/drivers/staging/speakup/speakup_keypc.c
index feb5f22..2f2fe5e 100644
--- a/drivers/staging/speakup/speakup_keypc.c
+++ b/drivers/staging/speakup/speakup_keypc.c

@@ -168,7 +168,7 @@
 		udelay(70);
 		buf++;
 	}
-	return 0;
+	return NULL;
 }
 
 static void do_catch_up(struct spk_synth *synth)
@@ -187,26 +187,26 @@
 	jiffy_delta = spk_get_var(JIFFY);
 	delay_time = spk_get_var(DELAY);
 	full_time = spk_get_var(FULL);
-spk_lock(flags);
+spin_lock_irqsave(&speakup_info.spinlock, flags);
 	jiffy_delta_val = jiffy_delta->u.n.value;
-	spk_unlock(flags);
+	spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 
 	jiff_max = jiffies + jiffy_delta_val;
 	while (!kthread_should_stop()) {
-		spk_lock(flags);
+		spin_lock_irqsave(&speakup_info.spinlock, flags);
 		if (speakup_info.flushing) {
 			speakup_info.flushing = 0;
-			spk_unlock(flags);
+			spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 			synth->flush(synth);
 			continue;
 		}
 		if (synth_buffer_empty()) {
-			spk_unlock(flags);
+			spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 			break;
 		}
 		set_current_state(TASK_INTERRUPTIBLE);
 		full_time_val = full_time->u.n.value;
-		spk_unlock(flags);
+		spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 		if (synth_full()) {
 			schedule_timeout(msecs_to_jiffies(full_time_val));
 			continue;
@@ -220,9 +220,9 @@
 			oops();
 			break;
 		}
-		spk_lock(flags);
+		spin_lock_irqsave(&speakup_info.spinlock, flags);
 		ch = synth_buffer_getc();
-		spk_unlock(flags);
+		spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 		if (ch == '\n')
 			ch = PROCSPEECH;
 		outb_p(ch, synth_port);
@@ -237,10 +237,10 @@
 				break;
 			}
 			outb_p(PROCSPEECH, synth_port);
-			spk_lock(flags);
+			spin_lock_irqsave(&speakup_info.spinlock, flags);
 			jiffy_delta_val = jiffy_delta->u.n.value;
 			delay_time_val = delay_time->u.n.value;
-			spk_unlock(flags);
+			spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 			schedule_timeout(msecs_to_jiffies(delay_time_val));
 			jiff_max = jiffies+jiffy_delta_val;
 		}

diff --git a/drivers/staging/speakup/speakup_soft.c b/drivers/staging/speakup/speakup_soft.c
index e2f5c81..243c3d5 100644
--- a/drivers/staging/speakup/speakup_soft.c
+++ b/drivers/staging/speakup/speakup_soft.c

@@ -179,45 +179,45 @@
 	unsigned long flags;
 	/*if ((fp->f_flags & O_ACCMODE) != O_RDONLY) */
 	/*	return -EPERM; */
-	spk_lock(flags);
+	spin_lock_irqsave(&speakup_info.spinlock, flags);
 	if (synth_soft.alive) {
-		spk_unlock(flags);
+		spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 		return -EBUSY;
 	}
 	synth_soft.alive = 1;
-	spk_unlock(flags);
+	spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 	return 0;
 }
 
 static int softsynth_close(struct inode *inode, struct file *fp)
 {
 	unsigned long flags;
-	spk_lock(flags);
+	spin_lock_irqsave(&speakup_info.spinlock, flags);
 	synth_soft.alive = 0;
 	init_pos = 0;
-	spk_unlock(flags);
+	spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 	/* Make sure we let applications go before leaving */
 	speakup_start_ttys();
 	return 0;
 }
 
-static ssize_t softsynth_read(struct file *fp, char *buf, size_t count,
+static ssize_t softsynth_read(struct file *fp, char __user *buf, size_t count,
 			      loff_t *pos)
 {
 	int chars_sent = 0;
-	char *cp;
+	char __user *cp;
 	char *init;
 	char ch;
 	int empty;
 	unsigned long flags;
 	DEFINE_WAIT(wait);
 
-	spk_lock(flags);
+	spin_lock_irqsave(&speakup_info.spinlock, flags);
 	while (1) {
 		prepare_to_wait(&speakup_event, &wait, TASK_INTERRUPTIBLE);
 		if (!synth_buffer_empty() || speakup_info.flushing)
 			break;
-		spk_unlock(flags);
+		spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 		if (fp->f_flags & O_NONBLOCK) {
 			finish_wait(&speakup_event, &wait);
 			return -EAGAIN;
@@ -227,7 +227,7 @@
 			return -ERESTARTSYS;
 		}
 		schedule();
-		spk_lock(flags);
+		spin_lock_irqsave(&speakup_info.spinlock, flags);
 	}
 	finish_wait(&speakup_event, &wait);
 
@@ -244,16 +244,16 @@
 		} else {
 			ch = synth_buffer_getc();
 		}
-		spk_unlock(flags);
+		spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 		if (copy_to_user(cp, &ch, 1))
 			return -EFAULT;
-		spk_lock(flags);
+		spin_lock_irqsave(&speakup_info.spinlock, flags);
 		chars_sent++;
 		cp++;
 	}
 	*pos += chars_sent;
 	empty = synth_buffer_empty();
-	spk_unlock(flags);
+	spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 	if (empty) {
 		speakup_start_ttys();
 		*pos = 0;
@@ -263,8 +263,8 @@
 
 static int last_index;
 
-static ssize_t softsynth_write(struct file *fp, const char *buf, size_t count,
-			       loff_t *pos)
+static ssize_t softsynth_write(struct file *fp, const char __user *buf,
+			       size_t count, loff_t *pos)
 {
 	unsigned long supplied_index = 0;
 	int converted;
@@ -285,10 +285,10 @@
 	int ret = 0;
 	poll_wait(fp, &speakup_event, wait);
 
-	spk_lock(flags);
+	spin_lock_irqsave(&speakup_info.spinlock, flags);
 	if (!synth_buffer_empty() || speakup_info.flushing)
 		ret = POLLIN | POLLRDNORM;
-	spk_unlock(flags);
+	spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 	return ret;
 }
 

diff --git a/drivers/staging/speakup/spk_priv.h b/drivers/staging/speakup/spk_priv.h
index 303105b..637ba67 100644
--- a/drivers/staging/speakup/spk_priv.h
+++ b/drivers/staging/speakup/spk_priv.h

@@ -77,17 +77,4 @@
 
 extern struct var_t synth_time_vars[];
 
-/* Protect the whole speakup machinery, must be taken at each kernel->speakup
- * transition and released at all corresponding speakup->kernel transitions
- * (flags must be the same variable between lock/trylock and unlock).
- *
- * The progression thread only interferes with the speakup machinery through
- * the synth buffer, and so only needs to take the lock while tinkering with
- * it.
- */
-/* Speakup needs to disable the keyboard IRQ, hence _irqsave/restore */
-#define spk_lock(flags) spin_lock_irqsave(&speakup_info.spinlock, flags)
-#define spk_trylock(flags) spin_trylock_irqsave(&speakup_info.spinlock, flags)
-#define spk_unlock(flags) spin_unlock_irqrestore(&speakup_info.spinlock, flags)
-
 #endif

diff --git a/drivers/staging/speakup/synth.c b/drivers/staging/speakup/synth.c
index d867dd9..0b3549b 100644
--- a/drivers/staging/speakup/synth.c
+++ b/drivers/staging/speakup/synth.c

@@ -25,6 +25,18 @@
 bool spk_quiet_boot;
 
 struct speakup_info_t speakup_info = {
+	/*
+	 * This spinlock is used to protect the entire speakup machinery, and
+	 * must be taken at each kernel->speakup transition and released at
+	 * each corresponding speakup->kernel transition.
+	 *
+	 * The progression thread only interferes with the speakup machinery through
+	 * the synth buffer, so only needs to take the lock while tinkering with
+	 * the buffer.
+	 *
+	 * We use spin_lock/trylock_irqsave and spin_unlock_irqrestore with this
+	 * spinlock because speakup needs to disable the keyboard IRQ.
+	 */
 	.spinlock = __SPIN_LOCK_UNLOCKED(speakup_info.spinlock),
 	.flushing = 0,
 };
@@ -83,27 +95,27 @@
 	full_time = spk_get_var(FULL);
 	delay_time = spk_get_var(DELAY);
 
-	spk_lock(flags);
+	spin_lock_irqsave(&speakup_info.spinlock, flags);
 	jiffy_delta_val = jiffy_delta->u.n.value;
-	spk_unlock(flags);
+	spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 
 	jiff_max = jiffies + jiffy_delta_val;
 	while (!kthread_should_stop()) {
-		spk_lock(flags);
+		spin_lock_irqsave(&speakup_info.spinlock, flags);
 		if (speakup_info.flushing) {
 			speakup_info.flushing = 0;
-			spk_unlock(flags);
+			spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 			synth->flush(synth);
 			continue;
 		}
 		if (synth_buffer_empty()) {
-			spk_unlock(flags);
+			spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 			break;
 		}
 		ch = synth_buffer_peek();
 		set_current_state(TASK_INTERRUPTIBLE);
 		full_time_val = full_time->u.n.value;
-		spk_unlock(flags);
+		spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 		if (ch == '\n')
 			ch = synth->procspeech;
 		if (!spk_serial_out(ch)) {
@@ -111,11 +123,11 @@
 			continue;
 		}
 		if ((jiffies >= jiff_max) && (ch == SPACE)) {
-			spk_lock(flags);
+			spin_lock_irqsave(&speakup_info.spinlock, flags);
 			jiffy_delta_val = jiffy_delta->u.n.value;
 			delay_time_val = delay_time->u.n.value;
 			full_time_val = full_time->u.n.value;
-			spk_unlock(flags);
+			spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 			if (spk_serial_out(synth->procspeech))
 				schedule_timeout(
 					msecs_to_jiffies(delay_time_val));
@@ -125,9 +137,9 @@
 			jiff_max = jiffies + jiffy_delta_val;
 		}
 		set_current_state(TASK_RUNNING);
-		spk_lock(flags);
+		spin_lock_irqsave(&speakup_info.spinlock, flags);
 		synth_buffer_getc();
-		spk_unlock(flags);
+		spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 	}
 	spk_serial_out(synth->procspeech);
 }
@@ -145,7 +157,7 @@
 			return buff;
 		buff++;
 	}
-	return 0;
+	return NULL;
 }
 EXPORT_SYMBOL_GPL(spk_synth_immediate);
 
@@ -403,11 +415,11 @@
 
 	if (synth == NULL)
 		return;
-	spk_lock(flags);
+	spin_lock_irqsave(&speakup_info.spinlock, flags);
 	pr_info("releasing synth %s\n", synth->name);
 	synth->alive = 0;
 	del_timer(&thread_timer);
-	spk_unlock(flags);
+	spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 	if (synth->attributes.name)
 		sysfs_remove_group(speakup_kobj, &(synth->attributes));
 	for (var = synth->vars; var->var_id != MAXVARS; var++)

diff --git a/drivers/staging/speakup/thread.c b/drivers/staging/speakup/thread.c
index 42fa660..4397c8e 100644
--- a/drivers/staging/speakup/thread.c
+++ b/drivers/staging/speakup/thread.c

@@ -22,7 +22,7 @@
 	while (1) {
 		DEFINE_WAIT(wait);
 		while (1) {
-			spk_lock(flags);
+			spin_lock_irqsave(&speakup_info.spinlock, flags);
 			our_sound = spk_unprocessed_sound;
 			spk_unprocessed_sound.active = 0;
 			prepare_to_wait(&speakup_event, &wait,
@@ -32,7 +32,7 @@
 				(synth && synth->catch_up && synth->alive &&
 					(speakup_info.flushing ||
 					!synth_buffer_empty()));
-			spk_unlock(flags);
+			spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 			if (should_break)
 				break;
 			mutex_unlock(&spk_mutex);

diff --git a/drivers/staging/speakup/varhandlers.c b/drivers/staging/speakup/varhandlers.c
index 7f6288f..9aa2a78 100644
--- a/drivers/staging/speakup/varhandlers.c
+++ b/drivers/staging/speakup/varhandlers.c

@@ -137,18 +137,15 @@
 struct st_var_header *spk_var_header_by_name(const char *name)
 {
 	int i;
-	struct st_var_header *where = NULL;
 
-	if (name != NULL) {
-		i = 0;
-		while ((i < MAXVARS) && (where == NULL)) {
-			if (strcmp(name, var_ptrs[i]->name) == 0)
-				where = var_ptrs[i];
-			else
-				i++;
-		}
+	if (!name)
+		return NULL;
+
+	for (i = 0; i < MAXVARS; i++) {
+		if (strcmp(name, var_ptrs[i]->name) == 0)
+			return var_ptrs[i];
 	}
-	return where;
+	return NULL;
 }
 
 struct var_t *spk_get_var(enum var_id_t var_id)
@@ -280,7 +277,7 @@
 			spk_chartab[*cp] &= ~mask;
 	}
 	cp = (u_char *)input;
-	if (cp == 0)
+	if (!cp)
 		cp = spk_punc_info[which].value;
 	else {
 		for ( ; *cp; cp++) {

diff --git a/drivers/staging/ti-soc-thermal/ti-thermal-common.c b/drivers/staging/ti-soc-thermal/ti-thermal-common.c
index e3c5e67..8e67ebf 100644
--- a/drivers/staging/ti-soc-thermal/ti-thermal-common.c
+++ b/drivers/staging/ti-soc-thermal/ti-thermal-common.c

@@ -38,6 +38,7 @@
 /* common data structures */
 struct ti_thermal_data {
 	struct thermal_zone_device *ti_thermal;
+	struct thermal_zone_device *pcb_tz;
 	struct thermal_cooling_device *cool_dev;
 	struct ti_bandgap *bgp;
 	enum thermal_device_mode mode;
@@ -77,10 +78,12 @@
 static inline int ti_thermal_get_temp(struct thermal_zone_device *thermal,
 				      unsigned long *temp)
 {
+	struct thermal_zone_device *pcb_tz = NULL;
 	struct ti_thermal_data *data = thermal->devdata;
 	struct ti_bandgap *bgp;
 	const struct ti_temp_sensor *s;
-	int ret, tmp, pcb_temp, slope, constant;
+	int ret, tmp, slope, constant;
+	unsigned long pcb_temp;
 
 	if (!data)
 		return 0;
@@ -92,16 +95,22 @@
 	if (ret)
 		return ret;
 
-	pcb_temp = 0;
-	/* TODO: Introduce pcb temperature lookup */
+	/* Default constants */
+	slope = s->slope;
+	constant = s->constant;
+
+	pcb_tz = data->pcb_tz;
 	/* In case pcb zone is available, use the extrapolation rule with it */
-	if (pcb_temp) {
-		tmp -= pcb_temp;
-		slope = s->slope_pcb;
-		constant = s->constant_pcb;
-	} else {
-		slope = s->slope;
-		constant = s->constant;
+	if (!IS_ERR_OR_NULL(pcb_tz)) {
+		ret = thermal_zone_get_temp(pcb_tz, &pcb_temp);
+		if (!ret) {
+			tmp -= pcb_temp; /* got a valid PCB temp */
+			slope = s->slope_pcb;
+			constant = s->constant_pcb;
+		} else {
+			dev_err(bgp->dev,
+				"Failed to read PCB state. Using defaults\n");
+		}
 	}
 	*temp = ti_thermal_hotspot_temperature(tmp, slope, constant);
 
@@ -273,6 +282,7 @@
 	data->sensor_id = id;
 	data->bgp = bgp;
 	data->mode = THERMAL_DEVICE_ENABLED;
+	data->pcb_tz = thermal_zone_get_zone_by_name("pcb");
 	INIT_WORK(&data->thermal_wq, ti_thermal_work);
 
 	return data;

diff --git a/drivers/staging/ti-soc-thermal/ti_soc_thermal.txt b/drivers/staging/ti-soc-thermal/ti_soc_thermal.txt
index a4a33d1..1629652 100644
--- a/drivers/staging/ti-soc-thermal/ti_soc_thermal.txt
+++ b/drivers/staging/ti-soc-thermal/ti_soc_thermal.txt

@@ -57,4 +57,5 @@
 		0x4a002380 0x2c
 		0x4a0023C0 0x3c>;
 	compatible = "ti,omap5430-bandgap";
+	interrupts = <0 126 4>; /* talert */
 };

diff --git a/drivers/staging/tidspbridge/core/_tiomap.h b/drivers/staging/tidspbridge/core/_tiomap.h
index b783bfa..65971b7 100644
--- a/drivers/staging/tidspbridge/core/_tiomap.h
+++ b/drivers/staging/tidspbridge/core/_tiomap.h

@@ -145,8 +145,8 @@
 #define L4_PERIPHERAL_MBOX        0x48094000
 #define DSPVA_PERIPHERAL_MBOX     0x11808000
 
-#define PM_GRPSEL_BASE 			0x48307000
-#define DSPVA_GRPSEL_BASE 		0x11821000
+#define PM_GRPSEL_BASE	0x48307000
+#define DSPVA_GRPSEL_BASE	0x11821000
 
 #define L4_PERIPHERAL_SIDETONE_MCBSP2        0x49028000
 #define DSPVA_PERIPHERAL_SIDETONE_MCBSP2 0x11824000
@@ -311,7 +311,7 @@
 
 #define SET_GROUP_BITS16(reg, position, width, value) \
 	do {\
-		reg &= ~((0xFFFF >> (16 - (width))) << (position)) ; \
+		reg &= ~((0xFFFF >> (16 - (width))) << (position)); \
 		reg |= ((value & (0xFFFF >> (16 - (width)))) << (position)); \
 	} while (0);
 

diff --git a/drivers/staging/tidspbridge/core/_tiomap_pwr.h b/drivers/staging/tidspbridge/core/_tiomap_pwr.h
index bd0354d..7bbd380 100644
--- a/drivers/staging/tidspbridge/core/_tiomap_pwr.h
+++ b/drivers/staging/tidspbridge/core/_tiomap_pwr.h

@@ -40,7 +40,7 @@
 			    u32 dw_cmd, void *pargs);
 /*
  *  ========interrupt_dsp========
- *  	  Sends an interrupt to DSP unconditionally.
+ *	Sends an interrupt to DSP unconditionally.
  */
 extern void interrupt_dsp(struct bridge_dev_context *dev_context,
 							u16 mb_val);
@@ -53,24 +53,24 @@
 					*dev_context, void *pargs);
 /*
  *  ======== handle_hibernation_from_dsp ========
- *  	Handle Hibernation requested from DSP
+ *	Handle Hibernation requested from DSP
  */
 int handle_hibernation_from_dsp(struct bridge_dev_context *dev_context);
 /*
  *  ======== post_scale_dsp ========
- *  	Handle Post Scale notification to DSP
+ *	Handle Post Scale notification to DSP
  */
 int post_scale_dsp(struct bridge_dev_context *dev_context,
 							void *pargs);
 /*
  *  ======== pre_scale_dsp ========
- *  	Handle Pre Scale notification to DSP
+ *	Handle Pre Scale notification to DSP
  */
 int pre_scale_dsp(struct bridge_dev_context *dev_context,
 							void *pargs);
 /*
  *  ======== handle_constraints_set ========
- *  	Handle constraints request from DSP
+ *	Handle constraints request from DSP
  */
 int handle_constraints_set(struct bridge_dev_context *dev_context,
 				  void *pargs);

diff --git a/drivers/staging/tidspbridge/core/tiomap3430_pwr.c b/drivers/staging/tidspbridge/core/tiomap3430_pwr.c
index dafa6d9..1862afd 100644
--- a/drivers/staging/tidspbridge/core/tiomap3430_pwr.c
+++ b/drivers/staging/tidspbridge/core/tiomap3430_pwr.c

@@ -51,7 +51,7 @@
 
 /*
  *  ======== handle_constraints_set ========
- *  	Sets new DSP constraint
+ *	Sets new DSP constraint
  */
 int handle_constraints_set(struct bridge_dev_context *dev_context,
 				  void *pargs)
@@ -75,7 +75,7 @@
 
 /*
  *  ======== handle_hibernation_from_dsp ========
- *  	Handle Hibernation requested from DSP
+ *	Handle Hibernation requested from DSP
  */
 int handle_hibernation_from_dsp(struct bridge_dev_context *dev_context)
 {
@@ -144,7 +144,7 @@
 
 /*
  *  ======== sleep_dsp ========
- *  	Put DSP in low power consuming state.
+ *	Put DSP in low power consuming state.
  */
 int sleep_dsp(struct bridge_dev_context *dev_context, u32 dw_cmd,
 		     void *pargs)
@@ -250,7 +250,7 @@
 
 /*
  *  ======== wake_dsp ========
- *  	Wake up DSP from sleep.
+ *	Wake up DSP from sleep.
  */
 int wake_dsp(struct bridge_dev_context *dev_context, void *pargs)
 {
@@ -276,7 +276,7 @@
 
 /*
  *  ======== dsp_peripheral_clk_ctrl ========
- *  	Enable/Disable the DSP peripheral clocks as needed..
+ *	Enable/Disable the DSP peripheral clocks as needed..
  */
 int dsp_peripheral_clk_ctrl(struct bridge_dev_context *dev_context,
 				   void *pargs)

diff --git a/drivers/staging/tidspbridge/core/ue_deh.c b/drivers/staging/tidspbridge/core/ue_deh.c
index 6aea6f1..e68f0ba 100644
--- a/drivers/staging/tidspbridge/core/ue_deh.c
+++ b/drivers/staging/tidspbridge/core/ue_deh.c

@@ -177,7 +177,7 @@
 	void *dummy_va_addr;
 
 	resources = dev_context->resources;
-	dummy_va_addr = (void*)__get_free_page(GFP_ATOMIC);
+	dummy_va_addr = (void *)__get_free_page(GFP_ATOMIC);
 
 	/*
 	 * Before acking the MMU fault, let's make sure MMU can only

diff --git a/drivers/staging/tidspbridge/core/wdt.c b/drivers/staging/tidspbridge/core/wdt.c
index 7ff0e6c..c7ee467 100644
--- a/drivers/staging/tidspbridge/core/wdt.c
+++ b/drivers/staging/tidspbridge/core/wdt.c

@@ -25,8 +25,8 @@
 #include <dspbridge/host_os.h>
 
 
-#define OMAP34XX_WDT3_BASE 		(0x49000000 + 0x30000)
-#define INT_34XX_WDT3_IRQ 		(36 + NR_IRQS)
+#define OMAP34XX_WDT3_BASE	(0x49000000 + 0x30000)
+#define INT_34XX_WDT3_IRQ	(36 + NR_IRQS)
 
 static struct dsp_wdt_setting dsp_wdt;
 

diff --git a/drivers/staging/tidspbridge/rmgr/drv_interface.c b/drivers/staging/tidspbridge/rmgr/drv_interface.c
index df0f37e..9c02056 100644
--- a/drivers/staging/tidspbridge/rmgr/drv_interface.c
+++ b/drivers/staging/tidspbridge/rmgr/drv_interface.c

@@ -508,6 +508,7 @@
 	bridge_class = class_create(THIS_MODULE, "ti_bridge");
 	if (IS_ERR(bridge_class)) {
 		pr_err("%s: Error creating bridge class\n", __func__);
+		err = PTR_ERR(bridge_class);
 		goto err3;
 	}
 

diff --git a/drivers/staging/usbip/usbip_event.c b/drivers/staging/usbip/usbip_event.c
index 82123be..64933b9 100644
--- a/drivers/staging/usbip/usbip_event.c
+++ b/drivers/staging/usbip/usbip_event.c

@@ -85,7 +85,7 @@
 
 	ud->eh = kthread_run(event_handler_loop, ud, "usbip_eh");
 	if (IS_ERR(ud->eh)) {
-		pr_warning("Unable to start control thread\n");
+		pr_warn("Unable to start control thread\n");
 		return PTR_ERR(ud->eh);
 	}
 

diff --git a/drivers/staging/vme/devices/vme_user.c b/drivers/staging/vme/devices/vme_user.c
index da7f759..daec155 100644
--- a/drivers/staging/vme/devices/vme_user.c
+++ b/drivers/staging/vme/devices/vme_user.c

@@ -109,7 +109,7 @@
 	unsigned long ioctls;
 	unsigned long irqs;
 	unsigned long berrs;
-	unsigned long dmaErrors;
+	unsigned long dmaerrors;
 	unsigned long timeouts;
 	unsigned long external;
 };
@@ -160,7 +160,7 @@
 	statistics.ioctls = 0;
 	statistics.irqs = 0;
 	statistics.berrs = 0;
-	statistics.dmaErrors = 0;
+	statistics.dmaerrors = 0;
 	statistics.timeouts = 0;
 }
 
@@ -734,6 +734,7 @@
 		if (image[i].resource == NULL) {
 			dev_warn(&vdev->dev,
 				 "Unable to allocate slave resource\n");
+			err = -ENOMEM;
 			goto err_slave;
 		}
 		image[i].size_buf = PCI_BUF_SIZE;
@@ -760,6 +761,7 @@
 		if (image[i].resource == NULL) {
 			dev_warn(&vdev->dev,
 				 "Unable to allocate master resource\n");
+			err = -ENOMEM;
 			goto err_master;
 		}
 		image[i].size_buf = PCI_BUF_SIZE;

diff --git a/drivers/staging/vme/devices/vme_user.h b/drivers/staging/vme/devices/vme_user.h
index 7d24cd6..280ccc7 100644
--- a/drivers/staging/vme/devices/vme_user.h
+++ b/drivers/staging/vme/devices/vme_user.h

@@ -14,9 +14,9 @@
 	u32 cycle;		/* Cycle properties */
 	u32 dwidth;		/* Maximum Data Width */
 #if 0
-	char prefetchEnable;		/* Prefetch Read Enable State */
-	int prefetchSize;		/* Prefetch Read Size (Cache Lines) */
-	char wrPostEnable;		/* Write Post State */
+	char prefetchenable;		/* Prefetch Read Enable State */
+	int prefetchsize;		/* Prefetch Read Size (Cache Lines) */
+	char wrpostenable;		/* Write Post State */
 #endif
 };
 
@@ -37,9 +37,9 @@
 	u32 aspace;			/* Address Space */
 	u32 cycle;		/* Cycle properties */
 #if 0
-	char wrPostEnable;		/* Write Post State */
-	char rmwLock;			/* Lock PCI during RMW Cycles */
-	char data64BitCapable;		/* non-VMEbus capable of 64-bit Data */
+	char wrpostenable;		/* Write Post State */
+	char rmwlock;			/* Lock PCI during RMW Cycles */
+	char data64bitcapable;		/* non-VMEbus capable of 64-bit Data */
 #endif
 };
 

diff --git a/drivers/staging/vt6655/80211hdr.h b/drivers/staging/vt6655/80211hdr.h
index 28078a1..ba53340 100644
--- a/drivers/staging/vt6655/80211hdr.h
+++ b/drivers/staging/vt6655/80211hdr.h

@@ -68,7 +68,7 @@
 #define BIT30	0x40000000
 #define BIT31	0x80000000
 
-// 802.11 frame related, defined as 802.11 spec
+/* 802.11 frame related, defined as 802.11 spec */
 #define WLAN_ADDR_LEN               6
 #define WLAN_CRC_LEN                4
 #define WLAN_CRC32_LEN              4

diff --git a/drivers/staging/vt6655/80211mgr.c b/drivers/staging/vt6655/80211mgr.c
index 4cb26f3..76c8490 100644
--- a/drivers/staging/vt6655/80211mgr.c
+++ b/drivers/staging/vt6655/80211mgr.c

@@ -66,7 +66,7 @@
 /*---------------------  Static Variables  --------------------------*/
 
 static int msglevel = MSG_LEVEL_INFO;
-//static int          msglevel                =MSG_LEVEL_DEBUG;
+/* static int          msglevel                =MSG_LEVEL_DEBUG; */
 /*---------------------  Static Functions  --------------------------*/
 
 /*---------------------  Export Variables  --------------------------*/
@@ -90,7 +90,7 @@
 {
 	pFrame->pHdr = (PUWLAN_80211HDR)pFrame->pBuf;
 
-	// Fixed Fields
+	/* Fixed Fields */
 	pFrame->pqwTimestamp = (PQWORD)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
 					+ WLAN_BEACON_OFF_TS);
 	pFrame->pwBeaconInterval = (unsigned short *)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
@@ -123,7 +123,7 @@
 
 	pFrame->pHdr = (PUWLAN_80211HDR)pFrame->pBuf;
 
-	// Fixed Fields
+	/* Fixed Fields */
 	pFrame->pqwTimestamp = (PQWORD)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
 					+ WLAN_BEACON_OFF_TS);
 	pFrame->pwBeaconInterval = (unsigned short *)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
@@ -131,7 +131,7 @@
 	pFrame->pwCapInfo = (unsigned short *)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
 					       + WLAN_BEACON_OFF_CAPINFO);
 
-	// Information elements
+	/* Information elements */
 	pItem = (PWLAN_IE)((unsigned char *)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3)))
 			   + WLAN_BEACON_OFF_SSID);
 	while (((unsigned char *)pItem) < (pFrame->pBuf + pFrame->len)) {
@@ -145,7 +145,7 @@
 				pFrame->pSuppRates = (PWLAN_IE_SUPP_RATES)pItem;
 			break;
 		case WLAN_EID_FH_PARMS:
-			//pFrame->pFHParms = (PWLAN_IE_FH_PARMS)pItem;
+			/* pFrame->pFHParms = (PWLAN_IE_FH_PARMS)pItem; */
 			break;
 		case WLAN_EID_DS_PARMS:
 			if (pFrame->pDSParms == NULL)
@@ -185,22 +185,22 @@
 				pFrame->pExtSuppRates = (PWLAN_IE_SUPP_RATES)pItem;
 			break;
 
-		case WLAN_EID_COUNTRY:      //7
+		case WLAN_EID_COUNTRY:      /* 7 */
 			if (pFrame->pIE_Country == NULL)
 				pFrame->pIE_Country = (PWLAN_IE_COUNTRY)pItem;
 			break;
 
-		case WLAN_EID_PWR_CONSTRAINT:   //32
+		case WLAN_EID_PWR_CONSTRAINT:   /* 32 */
 			if (pFrame->pIE_PowerConstraint == NULL)
 				pFrame->pIE_PowerConstraint = (PWLAN_IE_PW_CONST)pItem;
 			break;
 
-		case WLAN_EID_CH_SWITCH:    //37
+		case WLAN_EID_CH_SWITCH:    /* 37 */
 			if (pFrame->pIE_CHSW == NULL)
 				pFrame->pIE_CHSW = (PWLAN_IE_CH_SW)pItem;
 			break;
 
-		case WLAN_EID_QUIET:        //40
+		case WLAN_EID_QUIET:        /* 40 */
 			if (pFrame->pIE_Quiet == NULL)
 				pFrame->pIE_Quiet = (PWLAN_IE_QUIET)pItem;
 			break;
@@ -282,7 +282,7 @@
 {
 	pFrame->pHdr = (PUWLAN_80211HDR)pFrame->pBuf;
 
-	// Fixed Fields
+	/* Fixed Fields */
 	pFrame->pwReason = (unsigned short *)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
 					      + WLAN_DISASSOC_OFF_REASON);
 	pFrame->len = WLAN_HDR_ADDR3_LEN + WLAN_DISASSOC_OFF_REASON + sizeof(*(pFrame->pwReason));
@@ -308,7 +308,7 @@
 {
 	pFrame->pHdr = (PUWLAN_80211HDR)pFrame->pBuf;
 
-	// Fixed Fields
+	/* Fixed Fields */
 	pFrame->pwReason = (unsigned short *)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
 					      + WLAN_DISASSOC_OFF_REASON);
 
@@ -332,7 +332,7 @@
 )
 {
 	pFrame->pHdr = (PUWLAN_80211HDR)pFrame->pBuf;
-	// Fixed Fields
+	/* Fixed Fields */
 	pFrame->pwCapInfo = (unsigned short *)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
 					       + WLAN_ASSOCREQ_OFF_CAP_INFO);
 	pFrame->pwListenInterval = (unsigned short *)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
@@ -360,13 +360,13 @@
 	PWLAN_IE   pItem;
 
 	pFrame->pHdr = (PUWLAN_80211HDR)pFrame->pBuf;
-	// Fixed Fields
+	/* Fixed Fields */
 	pFrame->pwCapInfo = (unsigned short *)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
 					       + WLAN_ASSOCREQ_OFF_CAP_INFO);
 	pFrame->pwListenInterval = (unsigned short *)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
 						      + WLAN_ASSOCREQ_OFF_LISTEN_INT);
 
-	// Information elements
+	/* Information elements */
 	pItem = (PWLAN_IE)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
 			   + WLAN_ASSOCREQ_OFF_SSID);
 
@@ -425,7 +425,7 @@
 {
 	pFrame->pHdr = (PUWLAN_80211HDR)pFrame->pBuf;
 
-	// Fixed Fields
+	/* Fixed Fields */
 	pFrame->pwCapInfo = (unsigned short *)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
 					       + WLAN_ASSOCRESP_OFF_CAP_INFO);
 	pFrame->pwStatus = (unsigned short *)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
@@ -458,7 +458,7 @@
 
 	pFrame->pHdr = (PUWLAN_80211HDR)pFrame->pBuf;
 
-	// Fixed Fields
+	/* Fixed Fields */
 	pFrame->pwCapInfo = (unsigned short *)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
 					       + WLAN_ASSOCRESP_OFF_CAP_INFO);
 	pFrame->pwStatus = (unsigned short *)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
@@ -466,7 +466,7 @@
 	pFrame->pwAid = (unsigned short *)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
 					   + WLAN_ASSOCRESP_OFF_AID);
 
-	// Information elements
+	/* Information elements */
 	pFrame->pSuppRates  = (PWLAN_IE_SUPP_RATES)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
 						    + WLAN_ASSOCRESP_OFF_SUPP_RATES);
 
@@ -501,7 +501,7 @@
 {
 	pFrame->pHdr = (PUWLAN_80211HDR)pFrame->pBuf;
 
-	// Fixed Fields
+	/* Fixed Fields */
 	pFrame->pwCapInfo = (unsigned short *)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
 					       + WLAN_REASSOCREQ_OFF_CAP_INFO);
 	pFrame->pwListenInterval = (unsigned short *)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
@@ -532,7 +532,7 @@
 	PWLAN_IE   pItem;
 	pFrame->pHdr = (PUWLAN_80211HDR)pFrame->pBuf;
 
-	// Fixed Fields
+	/* Fixed Fields */
 	pFrame->pwCapInfo = (unsigned short *)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
 					       + WLAN_REASSOCREQ_OFF_CAP_INFO);
 	pFrame->pwListenInterval = (unsigned short *)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
@@ -540,7 +540,7 @@
 	pFrame->pAddrCurrAP = (PIEEE_ADDR)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
 					   + WLAN_REASSOCREQ_OFF_CURR_AP);
 
-	// Information elements
+	/* Information elements */
 	pItem = (PWLAN_IE)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
 			   + WLAN_REASSOCREQ_OFF_SSID);
 
@@ -622,7 +622,7 @@
 
 	pFrame->pHdr = (PUWLAN_80211HDR)pFrame->pBuf;
 
-	// Information elements
+	/* Information elements */
 	pItem = (PWLAN_IE)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3)));
 
 	while (((unsigned char *)pItem) < (pFrame->pBuf + pFrame->len)) {
@@ -670,7 +670,7 @@
 {
 	pFrame->pHdr = (PUWLAN_80211HDR)pFrame->pBuf;
 
-	// Fixed Fields
+	/* Fixed Fields */
 	pFrame->pqwTimestamp = (PQWORD)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
 					+ WLAN_PROBERESP_OFF_TS);
 	pFrame->pwBeaconInterval = (unsigned short *)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
@@ -704,7 +704,7 @@
 
 	pFrame->pHdr = (PUWLAN_80211HDR)pFrame->pBuf;
 
-	// Fixed Fields
+	/* Fixed Fields */
 	pFrame->pqwTimestamp = (PQWORD)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
 					+ WLAN_PROBERESP_OFF_TS);
 	pFrame->pwBeaconInterval = (unsigned short *)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
@@ -712,7 +712,7 @@
 	pFrame->pwCapInfo = (unsigned short *)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
 					       + WLAN_PROBERESP_OFF_CAP_INFO);
 
-	// Information elements
+	/* Information elements */
 	pItem = (PWLAN_IE)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
 			   + WLAN_PROBERESP_OFF_SSID);
 
@@ -761,22 +761,22 @@
 				pFrame->pExtSuppRates = (PWLAN_IE_SUPP_RATES)pItem;
 			break;
 
-		case WLAN_EID_COUNTRY:      //7
+		case WLAN_EID_COUNTRY:      /* 7 */
 			if (pFrame->pIE_Country == NULL)
 				pFrame->pIE_Country = (PWLAN_IE_COUNTRY)pItem;
 			break;
 
-		case WLAN_EID_PWR_CONSTRAINT:   //32
+		case WLAN_EID_PWR_CONSTRAINT:   /* 32 */
 			if (pFrame->pIE_PowerConstraint == NULL)
 				pFrame->pIE_PowerConstraint = (PWLAN_IE_PW_CONST)pItem;
 			break;
 
-		case WLAN_EID_CH_SWITCH:    //37
+		case WLAN_EID_CH_SWITCH:    /* 37 */
 			if (pFrame->pIE_CHSW == NULL)
 				pFrame->pIE_CHSW = (PWLAN_IE_CH_SW)pItem;
 			break;
 
-		case WLAN_EID_QUIET:        //40
+		case WLAN_EID_QUIET:        /* 40 */
 			if (pFrame->pIE_Quiet == NULL)
 				pFrame->pIE_Quiet = (PWLAN_IE_QUIET)pItem;
 			break;
@@ -814,7 +814,7 @@
 {
 	pFrame->pHdr = (PUWLAN_80211HDR)pFrame->pBuf;
 
-	// Fixed Fields
+	/* Fixed Fields */
 	pFrame->pwAuthAlgorithm = (unsigned short *)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
 						     + WLAN_AUTHEN_OFF_AUTH_ALG);
 	pFrame->pwAuthSequence = (unsigned short *)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
@@ -846,7 +846,7 @@
 
 	pFrame->pHdr = (PUWLAN_80211HDR)pFrame->pBuf;
 
-	// Fixed Fields
+	/* Fixed Fields */
 	pFrame->pwAuthAlgorithm = (unsigned short *)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
 						     + WLAN_AUTHEN_OFF_AUTH_ALG);
 	pFrame->pwAuthSequence = (unsigned short *)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
@@ -854,7 +854,7 @@
 	pFrame->pwStatus = (unsigned short *)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
 					      + WLAN_AUTHEN_OFF_STATUS);
 
-	// Information elements
+	/* Information elements */
 	pItem = (PWLAN_IE)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
 			   + WLAN_AUTHEN_OFF_CHALLENGE);
 
@@ -883,7 +883,7 @@
 {
 	pFrame->pHdr = (PUWLAN_80211HDR)pFrame->pBuf;
 
-	// Fixed Fields
+	/* Fixed Fields */
 	pFrame->pwReason = (unsigned short *)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
 					      + WLAN_DEAUTHEN_OFF_REASON);
 	pFrame->len = WLAN_HDR_ADDR3_LEN + WLAN_DEAUTHEN_OFF_REASON + sizeof(*(pFrame->pwReason));
@@ -909,7 +909,7 @@
 {
 	pFrame->pHdr = (PUWLAN_80211HDR)pFrame->pBuf;
 
-	// Fixed Fields
+	/* Fixed Fields */
 	pFrame->pwReason = (unsigned short *)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
 					      + WLAN_DEAUTHEN_OFF_REASON);
 
@@ -934,7 +934,7 @@
 {
 	pFrame->pHdr = (PUWLAN_80211HDR)pFrame->pBuf;
 
-	// Fixed Fields
+	/* Fixed Fields */
 	pFrame->pwCapInfo = (unsigned short *)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
 					       + WLAN_REASSOCRESP_OFF_CAP_INFO);
 	pFrame->pwStatus = (unsigned short *)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
@@ -967,7 +967,7 @@
 
 	pFrame->pHdr = (PUWLAN_80211HDR)pFrame->pBuf;
 
-	// Fixed Fields
+	/* Fixed Fields */
 	pFrame->pwCapInfo = (unsigned short *)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
 					       + WLAN_REASSOCRESP_OFF_CAP_INFO);
 	pFrame->pwStatus = (unsigned short *)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
@@ -975,7 +975,7 @@
 	pFrame->pwAid = (unsigned short *)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
 					   + WLAN_REASSOCRESP_OFF_AID);
 
-	//Information elements
+	/* Information elements */
 	pFrame->pSuppRates = (PWLAN_IE_SUPP_RATES)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
 						   + WLAN_REASSOCRESP_OFF_SUPP_RATES);
 

diff --git a/drivers/staging/vt6655/80211mgr.h b/drivers/staging/vt6655/80211mgr.h
index 16402cf..065238b 100644
--- a/drivers/staging/vt6655/80211mgr.h
+++ b/drivers/staging/vt6655/80211mgr.h

@@ -38,7 +38,7 @@
 
 #define WLAN_MIN_ARRAY          1
 
-// Information Element ID value
+/* Information Element ID value */
 #define WLAN_EID_SSID           0
 #define WLAN_EID_SUPP_RATES     1
 #define WLAN_EID_FH_PARMS       2
@@ -59,17 +59,17 @@
 #define WLAN_EID_QUIET          40
 #define WLAN_EID_IBSS_DFS       41
 #define WLAN_EID_ERP            42
-// reference 802.11i 7.3.2 table 20
+/* reference 802.11i 7.3.2 table 20 */
 #define WLAN_EID_RSN            48
 #define WLAN_EID_EXTSUPP_RATES  50
-// reference WiFi WPA spec.
+/* reference WiFi WPA spec. */
 #define WLAN_EID_RSN_WPA        221
 
 #define WLAN_EID_ERP_NONERP_PRESENT             0x01
 #define WLAN_EID_ERP_USE_PROTECTION             0x02
 #define WLAN_EID_ERP_BARKER_MODE                0x04
 
-// Reason Codes
+/* Reason Codes */
 #define WLAN_MGMT_REASON_RSVD                       0
 #define WLAN_MGMT_REASON_UNSPEC                     1
 #define WLAN_MGMT_REASON_PRIOR_AUTH_INVALID         2
@@ -94,7 +94,7 @@
 #define WLAN_MGMT_REASON_RSNE_CAP_INVALID           22
 #define WLAN_MGMT_REASON_80211X_AUTH_FAILED         23
 
-// Status Codes
+/* Status Codes */
 #define WLAN_MGMT_STATUS_SUCCESS                        0
 #define WLAN_MGMT_STATUS_UNSPEC_FAILURE                 1
 #define WLAN_MGMT_STATUS_CAPS_UNSUPPORTED               10
@@ -110,19 +110,14 @@
 #define WLAN_MGMT_STATUS_ASSOC_DENIED_PBCC              20
 #define WLAN_MGMT_STATUS_ASSOC_DENIED_AGILITY           21
 
-// reference 802.11h 7.3.1.9
-//
+/* reference 802.11h 7.3.1.9 */
 #define WLAN_MGMT_STATUS_ASSOC_REJECT_BCS_SPECTRUM_MNG  22
 #define WLAN_MGMT_STATUS_ASSOC_REJECT_BCS_PWR_CAP       23
 #define WLAN_MGMT_STATUS_ASSOC_REJECT_BCS_SUPP_CH       24
-//
-// reference 802.11g 7.3.1.9
-//
+/* reference 802.11g 7.3.1.9 */
 #define WLAN_MGMT_STATUS_SHORTSLOTTIME_UNSUPPORTED      25
 #define WLAN_MGMT_STATUS_DSSSOFDM_UNSUPPORTED           26
-//
-// reference 802.11i 7.3.1.9 table 19
-//
+/* reference 802.11i 7.3.1.9 table 19 */
 #define WLAN_MGMT_STATUS_INVALID_IE                     40
 #define WLAN_MGMT_STATUS_GROUP_CIPHER_INVALID           41
 #define WLAN_MGMT_STATUS_PAIRWISE_CIPHER_INVALID        42
@@ -131,13 +126,13 @@
 #define WLAN_MGMT_STATUS_INVALID_RSN_IE_CAP             45
 #define WLAN_MGMT_STATUS_CIPHER_REJECT                  46
 
-// Auth Algorithm
+/* Auth Algorithm */
 #define WLAN_AUTH_ALG_OPENSYSTEM                0
 #define WLAN_AUTH_ALG_SHAREDKEY                 1
 
-// Management Frame Field Offsets
-// Note: Not all fields are listed because of variable lengths.
-// Note: These offsets are from the start of the frame data
+/* Management Frame Field Offsets */
+/* Note: Not all fields are listed because of variable lengths. */
+/* Note: These offsets are from the start of the frame data */
 
 #define WLAN_BEACON_OFF_TS                  0
 #define WLAN_BEACON_OFF_BCN_INT             8
@@ -179,9 +174,7 @@
 
 #define WLAN_DEAUTHEN_OFF_REASON            0
 
-//
-// Cipher Suite Selectors defined in 802.11i
-//
+/* Cipher Suite Selectors defined in 802.11i */
 #define WLAN_11i_CSS_USE_GROUP              0
 #define WLAN_11i_CSS_WEP40                  1
 #define WLAN_11i_CSS_TKIP                   2
@@ -189,24 +182,22 @@
 #define WLAN_11i_CSS_WEP104                 5
 #define WLAN_11i_CSS_UNKNOWN                255
 
-//
-// Authentication and Key Management Suite Selectors defined in 802.11i
-//
+/* Authentication and Key Management Suite Selectors defined in 802.11i */
 #define WLAN_11i_AKMSS_802_1X               1
 #define WLAN_11i_AKMSS_PSK                  2
 #define WLAN_11i_AKMSS_UNKNOWN              255
 
-// Measurement type definitions reference ieee 802.11h Table 20b
+/* Measurement type definitions reference ieee 802.11h Table 20b */
 #define MEASURE_TYPE_BASIC      0
 #define MEASURE_TYPE_CCA        1
 #define MEASURE_TYPE_RPI        2
 
-// Measurement request mode definitions reference ieee 802.11h Figure 46h
+/* Measurement request mode definitions reference ieee 802.11h Figure 46h */
 #define MEASURE_MODE_ENABLE     0x02
 #define MEASURE_MODE_REQ        0x04
 #define MEASURE_MODE_REP        0x08
 
-// Measurement report mode definitions reference ieee 802.11h Figure 46m
+/* Measurement report mode definitions reference ieee 802.11h Figure 46m */
 #define MEASURE_MODE_LATE       0x01
 #define MEASURE_MODE_INCAPABLE  0x02
 #define MEASURE_MODE_REFUSED    0x04
@@ -217,7 +208,7 @@
 
 /*---------------------  Export Types  ------------------------------*/
 
-// Information Element Types
+/* Information Element Types */
 
 #pragma pack(1)
 typedef struct tagWLAN_IE {
@@ -226,7 +217,7 @@
 } __attribute__ ((__packed__))
 WLAN_IE, *PWLAN_IE;
 
-// Service Set Identity (SSID)
+/* Service Set Identity (SSID) */
 #pragma pack(1)
 typedef struct tagWLAN_IE_SSID {
 	unsigned char byElementID;
@@ -235,7 +226,7 @@
 } __attribute__ ((__packed__))
 WLAN_IE_SSID, *PWLAN_IE_SSID;
 
-// Supported Rates
+/* Supported Rates */
 #pragma pack(1)
 typedef struct tagWLAN_IE_SUPP_RATES {
 	unsigned char byElementID;
@@ -244,7 +235,7 @@
 } __attribute__ ((__packed__))
 WLAN_IE_SUPP_RATES,  *PWLAN_IE_SUPP_RATES;
 
-// FH Parameter Set
+/* FH Parameter Set */
 #pragma pack(1)
 typedef struct _WLAN_IE_FH_PARMS {
 	unsigned char byElementID;
@@ -255,7 +246,7 @@
 	unsigned char byHopIndex;
 } WLAN_IE_FH_PARMS,  *PWLAN_IE_FH_PARMS;
 
-// DS Parameter Set
+/* DS Parameter Set */
 #pragma pack(1)
 typedef struct tagWLAN_IE_DS_PARMS {
 	unsigned char byElementID;
@@ -264,7 +255,7 @@
 } __attribute__ ((__packed__))
 WLAN_IE_DS_PARMS,  *PWLAN_IE_DS_PARMS;
 
-// CF Parameter Set
+/* CF Parameter Set */
 #pragma pack(1)
 typedef struct tagWLAN_IE_CF_PARMS {
 	unsigned char byElementID;
@@ -276,7 +267,7 @@
 } __attribute__ ((__packed__))
 WLAN_IE_CF_PARMS,  *PWLAN_IE_CF_PARMS;
 
-// TIM
+/* TIM */
 #pragma pack(1)
 typedef struct tagWLAN_IE_TIM {
 	unsigned char byElementID;
@@ -288,7 +279,7 @@
 } __attribute__ ((__packed__))
 WLAN_IE_TIM,  *PWLAN_IE_TIM;
 
-// IBSS Parameter Set
+/* IBSS Parameter Set */
 #pragma pack(1)
 typedef struct tagWLAN_IE_IBSS_PARMS {
 	unsigned char byElementID;
@@ -297,7 +288,7 @@
 } __attribute__ ((__packed__))
 WLAN_IE_IBSS_PARMS, *PWLAN_IE_IBSS_PARMS;
 
-// Challenge Text
+/* Challenge Text */
 #pragma pack(1)
 typedef struct tagWLAN_IE_CHALLENGE {
 	unsigned char byElementID;
@@ -316,8 +307,8 @@
 	unsigned short wPKCount;
 	struct {
 		unsigned char abyOUI[4];
-	} PKSList[1]; // the rest is variable so need to
-	// overlay ieauth structure
+	} PKSList[1]; /* the rest is variable so need to */
+	/* overlay ieauth structure */
 } WLAN_IE_RSN_EXT, *PWLAN_IE_RSN_EXT;
 
 #pragma pack(1)
@@ -328,7 +319,7 @@
 	} AuthKSList[1];
 } WLAN_IE_RSN_AUTH, *PWLAN_IE_RSN_AUTH;
 
-// RSN Identity
+/* RSN Identity */
 #pragma pack(1)
 typedef struct tagWLAN_IE_RSN {
 	unsigned char byElementID;
@@ -337,7 +328,7 @@
 	unsigned char abyRSN[WLAN_MIN_ARRAY];
 } WLAN_IE_RSN, *PWLAN_IE_RSN;
 
-// ERP
+/* ERP */
 #pragma pack(1)
 typedef struct tagWLAN_IE_ERP {
 	unsigned char byElementID;
@@ -466,8 +457,8 @@
 
 #pragma pack()
 
-// Frame Types
-// prototype structure, all mgmt frame types will start with these members
+/* Frame Types */
+/* prototype structure, all mgmt frame types will start with these members */
 typedef struct tagWLAN_FR_MGMT {
 	unsigned int	uType;
 	unsigned int	len;
@@ -475,20 +466,20 @@
 	PUWLAN_80211HDR       pHdr;
 } WLAN_FR_MGMT,  *PWLAN_FR_MGMT;
 
-// Beacon frame
+/* Beacon frame */
 typedef struct tagWLAN_FR_BEACON {
 	unsigned int	uType;
 	unsigned int	len;
 	unsigned char *pBuf;
 	PUWLAN_80211HDR         pHdr;
-	// fixed fields
+	/* fixed fields */
 	PQWORD                  pqwTimestamp;
 	unsigned short *pwBeaconInterval;
 	unsigned short *pwCapInfo;
 	/*-- info elements ----------*/
 	PWLAN_IE_SSID           pSSID;
 	PWLAN_IE_SUPP_RATES     pSuppRates;
-//  PWLAN_IE_FH_PARMS       pFHParms;
+/*  PWLAN_IE_FH_PARMS       pFHParms; */
 	PWLAN_IE_DS_PARMS       pDSParms;
 	PWLAN_IE_CF_PARMS       pCFParms;
 	PWLAN_IE_TIM            pTIM;
@@ -504,19 +495,19 @@
 	PWLAN_IE_QUIET          pIE_Quiet;
 } WLAN_FR_BEACON, *PWLAN_FR_BEACON;
 
-// IBSS ATIM frame
+/* IBSS ATIM frame */
 typedef struct tagWLAN_FR_IBSSATIM {
 	unsigned int	uType;
 	unsigned int	len;
 	unsigned char *pBuf;
 	PUWLAN_80211HDR         pHdr;
 
-	// fixed fields
-	// info elements
-	// this frame type has a null body
+	/* fixed fields */
+	/* info elements */
+	/* this frame type has a null body */
 } WLAN_FR_IBSSATIM, *PWLAN_FR_IBSSATIM;
 
-// Disassociation
+/* Disassociation */
 typedef struct tagWLAN_FR_DISASSOC {
 	unsigned int	uType;
 	unsigned int	len;
@@ -527,7 +518,7 @@
 	/*-- info elements ----------*/
 } WLAN_FR_DISASSOC, *PWLAN_FR_DISASSOC;
 
-// Association Request
+/* Association Request */
 typedef struct tagWLAN_FR_ASSOCREQ {
 	unsigned int	uType;
 	unsigned int	len;
@@ -546,7 +537,7 @@
 	PWLAN_IE_SUPP_CH        pCurrSuppCh;
 } WLAN_FR_ASSOCREQ, *PWLAN_FR_ASSOCREQ;
 
-// Association Response
+/* Association Response */
 typedef struct tagWLAN_FR_ASSOCRESP {
 	unsigned int	uType;
 	unsigned int	len;
@@ -561,7 +552,7 @@
 	PWLAN_IE_SUPP_RATES     pExtSuppRates;
 } WLAN_FR_ASSOCRESP, *PWLAN_FR_ASSOCRESP;
 
-// Reassociation Request
+/* Reassociation Request */
 typedef struct tagWLAN_FR_REASSOCREQ {
 	unsigned int	uType;
 	unsigned int	len;
@@ -581,7 +572,7 @@
 	PWLAN_IE_SUPP_RATES     pExtSuppRates;
 } WLAN_FR_REASSOCREQ, *PWLAN_FR_REASSOCREQ;
 
-// Reassociation Response
+/* Reassociation Response */
 typedef struct tagWLAN_FR_REASSOCRESP {
 	unsigned int	uType;
 	unsigned int	len;
@@ -596,7 +587,7 @@
 	PWLAN_IE_SUPP_RATES     pExtSuppRates;
 } WLAN_FR_REASSOCRESP, *PWLAN_FR_REASSOCRESP;
 
-// Probe Request
+/* Probe Request */
 typedef struct tagWLAN_FR_PROBEREQ {
 	unsigned int	uType;
 	unsigned int	len;
@@ -609,7 +600,7 @@
 	PWLAN_IE_SUPP_RATES     pExtSuppRates;
 } WLAN_FR_PROBEREQ, *PWLAN_FR_PROBEREQ;
 
-// Probe Response
+/* Probe Response */
 typedef struct tagWLAN_FR_PROBERESP {
 	unsigned int	uType;
 	unsigned int	len;
@@ -636,7 +627,7 @@
 	PWLAN_IE_QUIET          pIE_Quiet;
 } WLAN_FR_PROBERESP, *PWLAN_FR_PROBERESP;
 
-// Authentication
+/* Authentication */
 typedef struct tagWLAN_FR_AUTHEN {
 	unsigned int	uType;
 	unsigned int	len;
@@ -650,7 +641,7 @@
 	PWLAN_IE_CHALLENGE      pChallenge;
 } WLAN_FR_AUTHEN, *PWLAN_FR_AUTHEN;
 
-// Deauthenication
+/* Deauthentication */
 typedef struct tagWLAN_FR_DEAUTHEN {
 	unsigned int	uType;
 	unsigned int	len;
@@ -774,4 +765,4 @@
 	PWLAN_FR_REASSOCRESP  pFrame
 );
 
-#endif// __80211MGR_H__
+#endif/* __80211MGR_H__ */

diff --git a/drivers/staging/vt6655/aes_ccmp.c b/drivers/staging/vt6655/aes_ccmp.c
index 3608148..fc056fc 100644
--- a/drivers/staging/vt6655/aes_ccmp.c
+++ b/drivers/staging/vt6655/aes_ccmp.c

@@ -205,7 +205,7 @@
 			SubBytes(ciphertext, TmpdataA);
 			ShiftRows(TmpdataA, TmpdataB);
 			xor_128(TmpdataB, abyRoundKey, ciphertext);
-		} else // round 1 ~ 9
+		} else /* round 1 ~ 9 */
 		{
 			SubBytes(ciphertext, TmpdataA);
 			ShiftRows(TmpdataA, TmpdataB);
@@ -249,7 +249,7 @@
 	unsigned char *pbyIV;
 	unsigned char *pbyPayload;
 	unsigned short wHLen = 22;
-	unsigned short wPayloadSize = wFrameSize - 8 - 8 - 4 - WLAN_HDR_ADDR3_LEN;//8 is IV, 8 is MIC, 4 is CRC
+	unsigned short wPayloadSize = wFrameSize - 8 - 8 - 4 - WLAN_HDR_ADDR3_LEN;/* 8 is IV, 8 is MIC, 4 is CRC */
 	bool bA4 = false;
 	unsigned char byTmp;
 	unsigned short wCnt;
@@ -259,13 +259,13 @@
 	if (WLAN_GET_FC_TODS(*(unsigned short *)pbyFrame) &&
 	    WLAN_GET_FC_FROMDS(*(unsigned short *)pbyFrame)) {
 		bA4 = true;
-		pbyIV += 6;             // 6 is 802.11 address4
+		pbyIV += 6;             /* 6 is 802.11 address4 */
 		wHLen += 6;
 		wPayloadSize -= 6;
 	}
-	pbyPayload = pbyIV + 8; //IV-length
+	pbyPayload = pbyIV + 8; /* IV-length */
 
-	abyNonce[0]  = 0x00; //now is 0, if Qos here will be priority
+	abyNonce[0]  = 0x00; /* now is 0, if Qos here will be priority */
 	memcpy(&(abyNonce[1]), pMACHeader->abyAddr2, ETH_ALEN);
 	abyNonce[7]  = pbyIV[7];
 	abyNonce[8]  = pbyIV[6];
@@ -274,13 +274,13 @@
 	abyNonce[11] = pbyIV[1];
 	abyNonce[12] = pbyIV[0];
 
-	//MIC_IV
+	/* MIC_IV */
 	MIC_IV[0] = 0x59;
 	memcpy(&(MIC_IV[1]), &(abyNonce[0]), 13);
 	MIC_IV[14] = (unsigned char)(wPayloadSize >> 8);
 	MIC_IV[15] = (unsigned char)(wPayloadSize & 0xff);
 
-	//MIC_HDR1
+	/* MIC_HDR1 */
 	MIC_HDR1[0] = (unsigned char)(wHLen >> 8);
 	MIC_HDR1[1] = (unsigned char)(wHLen & 0xff);
 	byTmp = (unsigned char)(pMACHeader->wFrameCtl & 0xff);
@@ -291,7 +291,7 @@
 	memcpy(&(MIC_HDR1[4]), pMACHeader->abyAddr1, ETH_ALEN);
 	memcpy(&(MIC_HDR1[10]), pMACHeader->abyAddr2, ETH_ALEN);
 
-	//MIC_HDR2
+	/* MIC_HDR2 */
 	memcpy(&(MIC_HDR2[0]), pMACHeader->abyAddr3, ETH_ALEN);
 	byTmp = (unsigned char)(pMACHeader->wSeqCtl & 0xff);
 	MIC_HDR2[6] = byTmp & 0x0f;
@@ -309,7 +309,7 @@
 	MIC_HDR2[14] = 0x00;
 	MIC_HDR2[15] = 0x00;
 
-	//CCMP
+	/* CCMP */
 	AESv128(pbyRxKey, MIC_IV, abyMIC);
 	for (kk = 0; kk < 16; kk++) {
 		abyTmp[kk] = MIC_HDR1[kk] ^ abyMIC[kk];
@@ -341,9 +341,9 @@
 		memcpy(pbyPayload, abyPlainText, 16);
 		wCnt++;
 		pbyPayload += 16;
-	} //for wPayloadSize
+	} /* for wPayloadSize */
 
-	//last payload
+	/* last payload */
 	memcpy(&(abyLastCipher[0]), pbyPayload, jj);
 	for (ii = jj; ii < 16; ii++) {
 		abyLastCipher[ii] = 0x00;
@@ -359,7 +359,7 @@
 	memcpy(pbyPayload, abyPlainText, jj);
 	pbyPayload += jj;
 
-	//for MIC calculation
+	/* for MIC calculation */
 	for (ii = jj; ii < 16; ii++) {
 		abyPlainText[ii] = 0x00;
 	}
@@ -368,8 +368,8 @@
 	}
 	AESv128(pbyRxKey, abyTmp, abyMIC);
 
-	//=>above is the calculate MIC
-	//--------------------------------------------
+	/* =>above is the calculate MIC */
+	/* -------------------------------------------- */
 
 	wCnt = 0;
 	abyCTRPLD[14] = (unsigned char)(wCnt >> 8);
@@ -378,12 +378,11 @@
 	for (kk = 0; kk < 8; kk++) {
 		abyTmp[kk] = abyTmp[kk] ^ pbyPayload[kk];
 	}
-	//=>above is the dec-MIC from packet
-	//--------------------------------------------
+	/* =>above is the dec-MIC from packet */
+	/* -------------------------------------------- */
 
-	if (!memcmp(abyMIC, abyTmp, 8)) {
+	if (!memcmp(abyMIC, abyTmp, 8))
 		return true;
-	} else {
+	else
 		return false;
-	}
 }

diff --git a/drivers/staging/vt6655/aes_ccmp.h b/drivers/staging/vt6655/aes_ccmp.h
index c8b28b0..cc02e64 100644
--- a/drivers/staging/vt6655/aes_ccmp.h
+++ b/drivers/staging/vt6655/aes_ccmp.h

@@ -43,4 +43,4 @@
 /*---------------------  Export Functions  --------------------------*/
 bool AESbGenCCMP(unsigned char *pbyRxKey, unsigned char *pbyFrame, unsigned short wFrameSize);
 
-#endif //__AES_H__
+#endif /* __AES_H__ */

diff --git a/drivers/staging/vt6655/hostap.c b/drivers/staging/vt6655/hostap.c
index 8417c2f..57a08c5 100644
--- a/drivers/staging/vt6655/hostap.c
+++ b/drivers/staging/vt6655/hostap.c

@@ -80,7 +80,7 @@
 
 	DBG_PRT(MSG_LEVEL_DEBUG, KERN_INFO "%s: Enabling hostapd mode\n", dev->name);
 
-	pDevice->apdev = kzalloc(sizeof(struct net_device), GFP_KERNEL);
+	pDevice->apdev = alloc_etherdev(sizeof(*apdev_priv));
 	if (pDevice->apdev == NULL)
 		return -ENOMEM;
 
@@ -104,6 +104,8 @@
 	if (ret) {
 		DBG_PRT(MSG_LEVEL_DEBUG, KERN_INFO "%s: register_netdevice(AP) failed!\n",
 			dev->name);
+		free_netdev(pDevice->apdev);
+		pDevice->apdev = NULL;
 		return -1;
 	}
 
@@ -141,7 +143,7 @@
 		DBG_PRT(MSG_LEVEL_DEBUG, KERN_INFO "%s: Netdevice %s unregistered\n",
 			pDevice->dev->name, pDevice->apdev->name);
 	}
-	kfree(pDevice->apdev);
+	free_netdev(pDevice->apdev);
 	pDevice->apdev = NULL;
 	pDevice->bEnable8021x = false;
 	pDevice->bEnableHostWEP = false;

diff --git a/drivers/staging/vt6655/ioctl.c b/drivers/staging/vt6655/ioctl.c
index 2ae8116..46e0e41 100644
--- a/drivers/staging/vt6655/ioctl.c
+++ b/drivers/staging/vt6655/ioctl.c

@@ -64,7 +64,6 @@
 	PKnownBSS	pBSS;
 	PKnownNodeDB	pNode;
 	unsigned int	ii, jj;
-	SCmdLinkStatus	sLinkStatus;
 	unsigned char	abySuppRates[] = {WLAN_EID_SUPP_RATES, 4, 0x02, 0x04, 0x0B, 0x16};
 	unsigned char	abyNullAddr[] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
 	unsigned long	dwKeyIndex = 0;
@@ -245,10 +244,12 @@
 		pDevice->eEncryptionStatus = Ndis802_11Encryption1Enabled;
 		break;
 
-	case WLAN_CMD_GET_LINK:
+	case WLAN_CMD_GET_LINK: {
+		SCmdLinkStatus sLinkStatus;
+
 		DBG_PRT(MSG_LEVEL_DEBUG, KERN_INFO "WLAN_CMD_GET_LINK status.\n");
 
-		memset(sLinkStatus.abySSID, 0 , WLAN_SSID_MAXLEN + 1);
+		memset(&sLinkStatus, 0, sizeof(sLinkStatus));
 
 		if (pMgmt->eCurrMode == WMAC_MODE_IBSS_STA)
 			sLinkStatus.wBSSType = ADHOC;
@@ -277,7 +278,7 @@
 			break;
 		}
 		break;
-
+	}
 	case WLAN_CMD_GET_LISTLEN:
 		cbListCount = 0;
 		pBSS = &(pMgmt->sBSSList[0]);

diff --git a/drivers/staging/vt6656/mac.c b/drivers/staging/vt6656/mac.c
index 343db19..54414ed27 100644
--- a/drivers/staging/vt6656/mac.c
+++ b/drivers/staging/vt6656/mac.c

@@ -101,7 +101,7 @@
                         MESSAGE_TYPE_WRITE_MASK,
                         MAC_REG_ENCFG0,
                         MESSAGE_REQUEST_MACREG,
-                        2,
+			ARRAY_SIZE(pbyData),
                         pbyData
                         );
 }
@@ -122,18 +122,10 @@
  */
 void MACvDisableKeyEntry(struct vnt_private *pDevice, u32 uEntryIdx)
 {
-	u16 wOffset;
 	u8 byData;
 
     byData = (u8) uEntryIdx;
 
-    wOffset = MISCFIFO_KEYETRY0;
-    wOffset += (uEntryIdx * MISCFIFO_KEYENTRYSIZE);
-
-    //VNSvOutPortW(dwIoBase + MAC_REG_MISCFFNDEX, wOffset);
-    //VNSvOutPortD(dwIoBase + MAC_REG_MISCFFDATA, 0);
-    //VNSvOutPortW(dwIoBase + MAC_REG_MISCFFCTL, MISCFFCTL_WRITE);
-
     //issue write misc fifo command to device
     CONTROLnsRequestOut(pDevice,
                         MESSAGE_TYPE_CLRKEYENTRY,
@@ -182,12 +174,6 @@
 	DBG_PRT(MSG_LEVEL_DEBUG, KERN_INFO"1. wOffset: %d, Data: %X,"\
 		" KeyCtl:%X\n", wOffset, dwData1, wKeyCtl);
 
-    //VNSvOutPortW(dwIoBase + MAC_REG_MISCFFNDEX, wOffset);
-    //VNSvOutPortD(dwIoBase + MAC_REG_MISCFFDATA, dwData);
-    //VNSvOutPortW(dwIoBase + MAC_REG_MISCFFCTL, MISCFFCTL_WRITE);
-
-    //wOffset++;
-
     dwData2 = 0;
     dwData2 |= *(pbyAddr+3);
     dwData2 <<= 8;
@@ -200,21 +186,6 @@
 	DBG_PRT(MSG_LEVEL_DEBUG, KERN_INFO"2. wOffset: %d, Data: %X\n",
 		wOffset, dwData2);
 
-    //VNSvOutPortW(dwIoBase + MAC_REG_MISCFFNDEX, wOffset);
-    //VNSvOutPortD(dwIoBase + MAC_REG_MISCFFDATA, dwData);
-    //VNSvOutPortW(dwIoBase + MAC_REG_MISCFFCTL, MISCFFCTL_WRITE);
-
-    //wOffset++;
-
-    //wOffset += (uKeyIdx * 4);
-/*    for (ii=0;ii<4;ii++) {
-        // alway push 128 bits
-        DBG_PRT(MSG_LEVEL_DEBUG, KERN_INFO"3.(%d) wOffset: %d, Data: %lX\n", ii, wOffset+ii, *pdwKey);
-        VNSvOutPortW(dwIoBase + MAC_REG_MISCFFNDEX, wOffset+ii);
-        VNSvOutPortD(dwIoBase + MAC_REG_MISCFFDATA, *pdwKey++);
-        VNSvOutPortW(dwIoBase + MAC_REG_MISCFFCTL, MISCFFCTL_WRITE);
-    }
-*/
     pbyKey = (u8 *)pdwKey;
 
     pbyData[0] = (u8)dwData1;
@@ -232,7 +203,7 @@
                         MESSAGE_TYPE_SETKEY,
                         wOffset,
                         (u16)uKeyIdx,
-                        24,
+			ARRAY_SIZE(pbyData),
                         pbyData
                         );
 
@@ -249,7 +220,7 @@
                         MESSAGE_TYPE_WRITE_MASK,
                         byRegOfs,
                         MESSAGE_REQUEST_MACREG,
-                        2,
+			ARRAY_SIZE(pbyData),
                         pbyData
                         );
 }
@@ -265,7 +236,7 @@
                         MESSAGE_TYPE_WRITE_MASK,
                         byRegOfs,
                         MESSAGE_REQUEST_MACREG,
-                        2,
+			ARRAY_SIZE(pbyData),
                         pbyData
                         );
 }
@@ -281,7 +252,7 @@
                         MESSAGE_TYPE_WRITE,
                         byRegOfs,
                         MESSAGE_REQUEST_MACREG,
-                        2,
+			ARRAY_SIZE(pbyData),
                         pbyData
                         );
 
@@ -302,7 +273,7 @@
                         MESSAGE_TYPE_WRITE,
                         MAC_REG_BSSID0,
                         MESSAGE_REQUEST_MACREG,
-                        6,
+			ARRAY_SIZE(pbyData),
                         pbyData
                         );
 }
@@ -318,7 +289,7 @@
                         MESSAGE_TYPE_WRITE_MASK,
                         MAC_REG_ENCFG0,
                         MESSAGE_REQUEST_MACREG,
-                        2,
+			ARRAY_SIZE(pbyData),
                         pbyData
                         );
 }
@@ -334,7 +305,7 @@
                         MESSAGE_TYPE_WRITE_MASK,
                         MAC_REG_ENCFG0,
                         MESSAGE_REQUEST_MACREG,
-                        2,
+			ARRAY_SIZE(pbyData),
                         pbyData
                         );
 }
@@ -350,7 +321,7 @@
                         MESSAGE_TYPE_WRITE_MASK,
                         MAC_REG_ENCFG2,
                         MESSAGE_REQUEST_MACREG,
-                        2,
+			ARRAY_SIZE(pbyData),
                         pbyData
                         );
 }
@@ -366,7 +337,7 @@
                         MESSAGE_TYPE_WRITE_MASK,
                         MAC_REG_ENCFG2,
                         MESSAGE_REQUEST_MACREG,
-                        2,
+			ARRAY_SIZE(pbyData),
                         pbyData
                         );
 }
@@ -382,7 +353,7 @@
 			MESSAGE_TYPE_WRITE,
 			MAC_REG_BI,
 			MESSAGE_REQUEST_MACREG,
-			2,
+			ARRAY_SIZE(pbyData),
 			pbyData
 			);
 }

diff --git a/drivers/staging/vt6656/rf.c b/drivers/staging/vt6656/rf.c
index 44cfe0b..d27fa43 100644
--- a/drivers/staging/vt6656/rf.c
+++ b/drivers/staging/vt6656/rf.c

@@ -29,6 +29,9 @@
  *      IFRFbWriteEmbedded      - Embedded write RF register via MAC
  *
  * Revision History:
+ *	RF_VT3226: RobertYu:20051111, VT3226C0 and before
+ *	RF_VT3226D0: RobertYu:20051228
+ *	RF_VT3342A0: RobertYu:20060609
  *
  */
 
@@ -61,7 +64,7 @@
 #define VT3342_PWR_IDX_LEN    64
 //}}
 
-u8 abyAL2230InitTable[CB_AL2230_INIT_SEQ][3] = {
+static u8 al2230_init_table[CB_AL2230_INIT_SEQ][3] = {
     {0x03, 0xF7, 0x90},
     {0x03, 0x33, 0x31},
     {0x01, 0xB8, 0x02},
@@ -79,7 +82,7 @@
     {0x00, 0x58, 0x0F}
     };
 
-u8 abyAL2230ChannelTable0[CB_MAX_CHANNEL_24G][3] = {
+static u8 al2230_channel_table0[CB_MAX_CHANNEL_24G][3] = {
     {0x03, 0xF7, 0x90}, // channel = 1, Tf = 2412MHz
     {0x03, 0xF7, 0x90}, // channel = 2, Tf = 2417MHz
     {0x03, 0xE7, 0x90}, // channel = 3, Tf = 2422MHz
@@ -96,7 +99,7 @@
     {0x03, 0xE7, 0xC0}  // channel = 14, Tf = 2412M
     };
 
-u8 abyAL2230ChannelTable1[CB_MAX_CHANNEL_24G][3] = {
+static u8 al2230_channel_table1[CB_MAX_CHANNEL_24G][3] = {
     {0x03, 0x33, 0x31}, // channel = 1, Tf = 2412MHz
     {0x0B, 0x33, 0x31}, // channel = 2, Tf = 2417MHz
     {0x03, 0x33, 0x31}, // channel = 3, Tf = 2422MHz
@@ -115,7 +118,7 @@
 
 // 40MHz reference frequency
 // Need to Pull PLLON(PE3) low when writing channel registers through 3-wire.
-u8 abyAL7230InitTable[CB_AL7230_INIT_SEQ][3] = {
+static u8 al7230_init_table[CB_AL7230_INIT_SEQ][3] = {
     {0x20, 0x37, 0x90}, // Channel1 // Need modify for 11a
     {0x13, 0x33, 0x31}, // Channel1 // Need modify for 11a
     {0x84, 0x1F, 0xF2}, // Need modify for 11a: 451FE2
@@ -138,7 +141,7 @@
     {0x1A, 0xBA, 0x8F} // Need modify for 11a: 12BACF
     };
 
-u8 abyAL7230InitTableAMode[CB_AL7230_INIT_SEQ][3] = {
+static u8 al7230_init_table_amode[CB_AL7230_INIT_SEQ][3] = {
     {0x2F, 0xF5, 0x20}, // Channel184 // Need modify for 11b/g
     {0x00, 0x00, 0x01}, // Channel184 // Need modify for 11b/g
     {0x45, 0x1F, 0xE2}, // Need modify for 11b/g
@@ -157,7 +160,7 @@
     {0x12, 0xBA, 0xCF} // Need modify for 11b/g
     };
 
-u8 abyAL7230ChannelTable0[CB_MAX_CHANNEL][3] = {
+static u8 al7230_channel_table0[CB_MAX_CHANNEL][3] = {
     {0x20, 0x37, 0x90}, // channel =  1, Tf = 2412MHz
     {0x20, 0x37, 0x90}, // channel =  2, Tf = 2417MHz
     {0x20, 0x37, 0x90}, // channel =  3, Tf = 2422MHz
@@ -223,7 +226,7 @@
     {0x2F, 0xF6, 0x10} // channel = 165, Tf = 5825MHz (56)
     };
 
-u8 abyAL7230ChannelTable1[CB_MAX_CHANNEL][3] = {
+static u8 al7230_channel_table1[CB_MAX_CHANNEL][3] = {
     {0x13, 0x33, 0x31}, // channel =  1, Tf = 2412MHz
     {0x1B, 0x33, 0x31}, // channel =  2, Tf = 2417MHz
     {0x03, 0x33, 0x31}, // channel =  3, Tf = 2422MHz
@@ -287,7 +290,7 @@
     {0x02, 0xAA, 0xB1}  // channel = 165, Tf = 5825MHz (56)
     };
 
-u8 abyAL7230ChannelTable2[CB_MAX_CHANNEL][3] = {
+static u8 al7230_channel_table2[CB_MAX_CHANNEL][3] = {
     {0x7F, 0xD7, 0x84}, // channel =  1, Tf = 2412MHz
     {0x7F, 0xD7, 0x84}, // channel =  2, Tf = 2417MHz
     {0x7F, 0xD7, 0x84}, // channel =  3, Tf = 2422MHz
@@ -352,7 +355,7 @@
     };
 
 ///{{RobertYu:20051111
-u8 abyVT3226_InitTable[CB_VT3226_INIT_SEQ][3] = {
+static u8 at3226_init_table[CB_VT3226_INIT_SEQ][3] = {
     {0x03, 0xFF, 0x80},
     {0x02, 0x82, 0xA1},
     {0x03, 0xC6, 0xA2},
@@ -366,7 +369,7 @@
     {0x02, 0x00, 0x2A}
     };
 
-u8 abyVT3226D0_InitTable[CB_VT3226_INIT_SEQ][3] = {
+static u8 at3226d0_init_table[CB_VT3226_INIT_SEQ][3] = {
     {0x03, 0xFF, 0x80},
     {0x03, 0x02, 0x21}, //RobertYu:20060327
     {0x03, 0xC6, 0xA2},
@@ -380,7 +383,7 @@
     {0x02, 0x01, 0xAA}  //RobertYu:20060523
     };
 
-u8 abyVT3226_ChannelTable0[CB_MAX_CHANNEL_24G][3] = {
+static u8 vt3226_channel_table0[CB_MAX_CHANNEL_24G][3] = {
     {0x01, 0x97, 0x83}, // channel = 1, Tf = 2412MHz
     {0x01, 0x97, 0x83}, // channel = 2, Tf = 2417MHz
     {0x01, 0x97, 0x93}, // channel = 3, Tf = 2422MHz
@@ -397,7 +400,7 @@
     {0x03, 0x37, 0xC3}  // channel = 14, Tf = 2484MHz
     };
 
-u8 abyVT3226_ChannelTable1[CB_MAX_CHANNEL_24G][3] = {
+static u8 vt3226_channel_table1[CB_MAX_CHANNEL_24G][3] = {
     {0x02, 0x66, 0x64}, // channel = 1, Tf = 2412MHz
     {0x03, 0x66, 0x64}, // channel = 2, Tf = 2417MHz
     {0x00, 0x66, 0x64}, // channel = 3, Tf = 2422MHz
@@ -416,7 +419,7 @@
 ///}}RobertYu
 
 //{{RobertYu:20060502, TWIF 1.14, LO Current for 11b mode
-u32 dwVT3226D0LoCurrentTable[CB_MAX_CHANNEL_24G] = {
+const u32 vt3226d0_lo_current_table[CB_MAX_CHANNEL_24G] = {
     0x0135C600+(BY_VT3226_REG_LEN<<3)+IFREGCTL_REGW, // channel = 1, Tf = 2412MHz
     0x0135C600+(BY_VT3226_REG_LEN<<3)+IFREGCTL_REGW, // channel = 2, Tf = 2417MHz
     0x0235C600+(BY_VT3226_REG_LEN<<3)+IFREGCTL_REGW, // channel = 3, Tf = 2422MHz
@@ -435,7 +438,7 @@
 //}}
 
 //{{RobertYu:20060609
-u8 abyVT3342A0_InitTable[CB_VT3342_INIT_SEQ][3] = { /* 11b/g mode */
+static u8 vt3342a0_init_table[CB_VT3342_INIT_SEQ][3] = { /* 11b/g mode */
     {0x03, 0xFF, 0x80}, //update for mode//
     {0x02, 0x08, 0x81},
     {0x00, 0xC6, 0x02},
@@ -458,7 +461,7 @@
  // channel56, 5280MHz  0x00C402 for disable Frac
  // other channels 0x00C602
 
-u8 abyVT3342_ChannelTable0[CB_MAX_CHANNEL][3] = {
+static u8 vt3342_channel_table0[CB_MAX_CHANNEL][3] = {
     {0x02, 0x05, 0x03}, // channel = 1, Tf = 2412MHz
     {0x01, 0x15, 0x03}, // channel = 2, Tf = 2417MHz
     {0x03, 0xC5, 0x03}, // channel = 3, Tf = 2422MHz
@@ -524,7 +527,7 @@
     {0x00, 0x06, 0x03}  // channel = 165, Tf = 5825MHz (56), TBD
     };
 
-u8 abyVT3342_ChannelTable1[CB_MAX_CHANNEL][3] = {
+static u8 vt3342_channel_table1[CB_MAX_CHANNEL][3] = {
     {0x01, 0x99, 0x94}, // channel = 1, Tf = 2412MHz
     {0x02, 0x44, 0x44}, // channel = 2, Tf = 2417MHz
     {0x02, 0xEE, 0xE4}, // channel = 3, Tf = 2422MHz
@@ -594,7 +597,7 @@
  *
 -*/
 
-const u32 dwAL2230PowerTable[AL2230_PWR_IDX_LEN] = {
+const u32 al2230_power_table[AL2230_PWR_IDX_LEN] = {
     0x04040900+(BY_AL2230_REG_LEN<<3)+IFREGCTL_REGW,
     0x04041900+(BY_AL2230_REG_LEN<<3)+IFREGCTL_REGW,
     0x04042900+(BY_AL2230_REG_LEN<<3)+IFREGCTL_REGW,
@@ -732,42 +735,41 @@
  * Return Value: true if succeeded; false if failed.
  *
  */
-int RFbSetPower(struct vnt_private *pDevice, u32 uRATE, u32 uCH)
+int RFbSetPower(struct vnt_private *priv, u32 rate, u32 channel)
 {
-	int bResult = true;
-	u8 byPwr = pDevice->byCCKPwr;
+	int ret = true;
+	u8 power = priv->byCCKPwr;
 
-	if (pDevice->dwDiagRefCount)
+	if (priv->dwDiagRefCount)
 		return true;
 
-	if (uCH == 0)
+	if (channel == 0)
 		return -EINVAL;
 
-    switch (uRATE) {
-    case RATE_1M:
-    case RATE_2M:
-    case RATE_5M:
-    case RATE_11M:
-        byPwr = pDevice->abyCCKPwrTbl[uCH-1];
-        break;
-    case RATE_6M:
-    case RATE_9M:
-    case RATE_18M:
-    case RATE_24M:
-    case RATE_36M:
-    case RATE_48M:
-    case RATE_54M:
-        if (uCH > CB_MAX_CHANNEL_24G) {
-            byPwr = pDevice->abyOFDMAPwrTbl[uCH-15];
-        } else {
-            byPwr = pDevice->abyOFDMPwrTbl[uCH-1];
-        }
-        break;
-    }
+	switch (rate) {
+	case RATE_1M:
+	case RATE_2M:
+	case RATE_5M:
+	case RATE_11M:
+		power = priv->abyCCKPwrTbl[channel-1];
+		break;
+	case RATE_6M:
+	case RATE_9M:
+	case RATE_18M:
+	case RATE_24M:
+	case RATE_36M:
+	case RATE_48M:
+	case RATE_54M:
+		if (channel > CB_MAX_CHANNEL_24G)
+			power = priv->abyOFDMAPwrTbl[channel-15];
+		else
+			power = priv->abyOFDMPwrTbl[channel-1];
+		break;
+	}
 
-    bResult = RFbRawSetPower(pDevice, byPwr, uRATE);
+	ret = RFbRawSetPower(priv, power, rate);
 
-    return bResult;
+	return ret;
 }
 
 /*
@@ -784,136 +786,146 @@
  *
  */
 
-int RFbRawSetPower(struct vnt_private *pDevice, u8 byPwr, u32 uRATE)
+int RFbRawSetPower(struct vnt_private *priv, u8 power, u32 rate)
 {
-	int bResult = true;
+	u32 power_setting = 0;
+	int ret = true;
 
-    if (pDevice->byCurPwr == byPwr)
-        return true;
+	if (priv->byCurPwr == power)
+		return true;
 
-    pDevice->byCurPwr = byPwr;
+	priv->byCurPwr = power;
 
-    switch (pDevice->byRFType) {
+	switch (priv->byRFType) {
+	case RF_AL2230:
+		if (priv->byCurPwr >= AL2230_PWR_IDX_LEN)
+			return false;
 
-        case RF_AL2230 :
-            if (pDevice->byCurPwr >= AL2230_PWR_IDX_LEN)
-                return false;
-            bResult &= IFRFbWriteEmbedded(pDevice, dwAL2230PowerTable[pDevice->byCurPwr]);
-            if (uRATE <= RATE_11M)
-                bResult &= IFRFbWriteEmbedded(pDevice, 0x0001B400+(BY_AL2230_REG_LEN<<3)+IFREGCTL_REGW);
-            else
-                bResult &= IFRFbWriteEmbedded(pDevice, 0x0005A400+(BY_AL2230_REG_LEN<<3)+IFREGCTL_REGW);
-            break;
+		ret &= IFRFbWriteEmbedded(priv,
+			al2230_power_table[priv->byCurPwr]);
 
-        case RF_AL2230S :
-            if (pDevice->byCurPwr >= AL2230_PWR_IDX_LEN)
-                return false;
-            bResult &= IFRFbWriteEmbedded(pDevice, dwAL2230PowerTable[pDevice->byCurPwr]);
-            if (uRATE <= RATE_11M) {
-                bResult &= IFRFbWriteEmbedded(pDevice, 0x040C1400+(BY_AL2230_REG_LEN<<3)+IFREGCTL_REGW);
-                bResult &= IFRFbWriteEmbedded(pDevice, 0x00299B00+(BY_AL2230_REG_LEN<<3)+IFREGCTL_REGW);
-            }else {
-                bResult &= IFRFbWriteEmbedded(pDevice, 0x0005A400+(BY_AL2230_REG_LEN<<3)+IFREGCTL_REGW);
-                bResult &= IFRFbWriteEmbedded(pDevice, 0x00099B00+(BY_AL2230_REG_LEN<<3)+IFREGCTL_REGW);
-            }
-            break;
+		if (rate <= RATE_11M)
+			ret &= IFRFbWriteEmbedded(priv, 0x0001b400 +
+				(BY_AL2230_REG_LEN << 3) + IFREGCTL_REGW);
+		else
+			ret &= IFRFbWriteEmbedded(priv, 0x0005a400 +
+				(BY_AL2230_REG_LEN << 3) + IFREGCTL_REGW);
+		break;
+	case RF_AL2230S:
+		if (priv->byCurPwr >= AL2230_PWR_IDX_LEN)
+			return false;
 
-        case RF_AIROHA7230:
-            {
-                u32       dwMax7230Pwr;
+		ret &= IFRFbWriteEmbedded(priv,
+			al2230_power_table[priv->byCurPwr]);
 
-                if (uRATE <= RATE_11M) { //RobertYu:20060426, for better 11b mask
-                    bResult &= IFRFbWriteEmbedded(pDevice, 0x111BB900+(BY_AL7230_REG_LEN<<3)+IFREGCTL_REGW);
-                }
-                else {
-                    bResult &= IFRFbWriteEmbedded(pDevice, 0x221BB900+(BY_AL7230_REG_LEN<<3)+IFREGCTL_REGW);
-                }
+		if (rate <= RATE_11M) {
+			ret &= IFRFbWriteEmbedded(priv, 0x040c1400 +
+				(BY_AL2230_REG_LEN << 3) + IFREGCTL_REGW);
+			ret &= IFRFbWriteEmbedded(priv, 0x00299b00 +
+				(BY_AL2230_REG_LEN << 3) + IFREGCTL_REGW);
+		} else {
+			ret &= IFRFbWriteEmbedded(priv, 0x0005a400 +
+				(BY_AL2230_REG_LEN << 3) + IFREGCTL_REGW);
+			ret &= IFRFbWriteEmbedded(priv, 0x00099b00 +
+				(BY_AL2230_REG_LEN << 3) + IFREGCTL_REGW);
+		}
+		break;
 
-                if (pDevice->byCurPwr > AL7230_PWR_IDX_LEN) return false;
+	case RF_AIROHA7230:
+		if (rate <= RATE_11M)
+			ret &= IFRFbWriteEmbedded(priv, 0x111bb900 +
+				(BY_AL7230_REG_LEN << 3)+IFREGCTL_REGW);
+		else
+			ret &= IFRFbWriteEmbedded(priv, 0x221bb900 +
+				(BY_AL7230_REG_LEN << 3)+IFREGCTL_REGW);
 
-                //  0x080F1B00 for 3 wire control TxGain(D10) and 0x31 as TX Gain value
-                dwMax7230Pwr = 0x080C0B00 | ( (pDevice->byCurPwr) << 12 ) |
-                                 (BY_AL7230_REG_LEN << 3 )  | IFREGCTL_REGW;
+		if (priv->byCurPwr > AL7230_PWR_IDX_LEN)
+			return false;
 
-                bResult &= IFRFbWriteEmbedded(pDevice, dwMax7230Pwr);
-                break;
-            }
-            break;
+		/*
+		* 0x080F1B00 for 3 wire control TxGain(D10)
+		* and 0x31 as TX Gain value
+		*/
+		power_setting = 0x080c0b00 | ((priv->byCurPwr) << 12) |
+				(BY_AL7230_REG_LEN << 3) | IFREGCTL_REGW;
 
-        case RF_VT3226: //RobertYu:20051111, VT3226C0 and before
-        {
-            u32       dwVT3226Pwr;
+		ret &= IFRFbWriteEmbedded(priv, power_setting);
 
-            if (pDevice->byCurPwr >= VT3226_PWR_IDX_LEN)
-                return false;
-            dwVT3226Pwr = ((0x3F-pDevice->byCurPwr) << 20 ) | ( 0x17 << 8 ) /* Reg7 */ |
-                           (BY_VT3226_REG_LEN << 3 )  | IFREGCTL_REGW;
-            bResult &= IFRFbWriteEmbedded(pDevice, dwVT3226Pwr);
-            break;
-        }
+		break;
 
-        case RF_VT3226D0: //RobertYu:20051228
-        {
-            u32       dwVT3226Pwr;
+	case RF_VT3226:
+		if (priv->byCurPwr >= VT3226_PWR_IDX_LEN)
+			return false;
+		power_setting = ((0x3f - priv->byCurPwr) << 20) | (0x17 << 8) |
+				(BY_VT3226_REG_LEN << 3) | IFREGCTL_REGW;
 
-            if (pDevice->byCurPwr >= VT3226_PWR_IDX_LEN)
-                return false;
+		ret &= IFRFbWriteEmbedded(priv, power_setting);
 
-            if (uRATE <= RATE_11M) {
+		break;
+	case RF_VT3226D0:
+		if (priv->byCurPwr >= VT3226_PWR_IDX_LEN)
+			return false;
 
-                dwVT3226Pwr = ((0x3F-pDevice->byCurPwr) << 20 ) | ( 0xE07 << 8 ) /* Reg7 */ |   //RobertYu:20060420, TWIF 1.10
-                               (BY_VT3226_REG_LEN << 3 )  | IFREGCTL_REGW;
-                bResult &= IFRFbWriteEmbedded(pDevice, dwVT3226Pwr);
+		if (rate <= RATE_11M) {
+			power_setting = ((0x3f-priv->byCurPwr) << 20) |
+				(0xe07 << 8) | (BY_VT3226_REG_LEN << 3) |
+						IFREGCTL_REGW;
 
-                bResult &= IFRFbWriteEmbedded(pDevice, 0x03C6A200+(BY_VT3226_REG_LEN<<3)+IFREGCTL_REGW);
-		if (pDevice->vnt_mgmt.eScanState != WMAC_NO_SCANNING) {
-			/* scanning, channel number is pDevice->uScanChannel */
-			DBG_PRT(MSG_LEVEL_DEBUG, KERN_INFO
+			ret &= IFRFbWriteEmbedded(priv, power_setting);
+			ret &= IFRFbWriteEmbedded(priv, 0x03c6a200 +
+					(BY_VT3226_REG_LEN<<3)+IFREGCTL_REGW);
+
+			if (priv->vnt_mgmt.eScanState != WMAC_NO_SCANNING) {
+				DBG_PRT(MSG_LEVEL_DEBUG, KERN_INFO
 				"RFbRawSetPower> 11B mode uCurrChannel[%d]\n",
-				pDevice->vnt_mgmt.uScanChannel);
-			bResult &= IFRFbWriteEmbedded(pDevice,
-				dwVT3226D0LoCurrentTable[pDevice->
-					vnt_mgmt.uScanChannel - 1]);
+						priv->vnt_mgmt.uScanChannel);
+				ret &= IFRFbWriteEmbedded(priv,
+					vt3226d0_lo_current_table[priv->
+						vnt_mgmt.uScanChannel - 1]);
+			} else {
+				DBG_PRT(MSG_LEVEL_DEBUG, KERN_INFO
+				"RFbRawSetPower> 11B mode uCurrChannel[%d]\n",
+						priv->vnt_mgmt.uCurrChannel);
+				ret &= IFRFbWriteEmbedded(priv,
+					vt3226d0_lo_current_table[priv->
+						vnt_mgmt.uCurrChannel - 1]);
+			}
+
+			ret &= IFRFbWriteEmbedded(priv, 0x015C0800 +
+				(BY_VT3226_REG_LEN<<3)+IFREGCTL_REGW);
 		} else {
 			DBG_PRT(MSG_LEVEL_DEBUG, KERN_INFO
-				"RFbRawSetPower> 11B mode uCurrChannel[%d]\n",
-				pDevice->vnt_mgmt.uCurrChannel);
-			bResult &= IFRFbWriteEmbedded(pDevice,
-				dwVT3226D0LoCurrentTable[pDevice->
-					vnt_mgmt.uCurrChannel - 1]);
+					"@@@@ RFbRawSetPower> 11G mode\n");
+
+			power_setting = ((0x3f-priv->byCurPwr) << 20) |
+				(0x7 << 8) | (BY_VT3226_REG_LEN << 3) |
+					IFREGCTL_REGW;
+
+			ret &= IFRFbWriteEmbedded(priv, power_setting);
+			ret &= IFRFbWriteEmbedded(priv, 0x00C6A200 +
+				(BY_VT3226_REG_LEN << 3) + IFREGCTL_REGW);
+			ret &= IFRFbWriteEmbedded(priv, 0x016BC600 +
+					(BY_VT3226_REG_LEN<<3)+IFREGCTL_REGW);
+			ret &= IFRFbWriteEmbedded(priv, 0x00900800 +
+					(BY_VT3226_REG_LEN<<3)+IFREGCTL_REGW);
 		}
+		break;
 
-                bResult &= IFRFbWriteEmbedded(pDevice, 0x015C0800+(BY_VT3226_REG_LEN<<3)+IFREGCTL_REGW); //RobertYu:20060420, ok now, new switching power (mini-pci can have bigger power consumption)
-            } else {
-                DBG_PRT(MSG_LEVEL_DEBUG, KERN_INFO"@@@@ RFbRawSetPower> 11G mode\n");
-                dwVT3226Pwr = ((0x3F-pDevice->byCurPwr) << 20 ) | ( 0x7 << 8 ) /* Reg7 */ |   //RobertYu:20060420, TWIF 1.10
-                               (BY_VT3226_REG_LEN << 3 )  | IFREGCTL_REGW;
-                bResult &= IFRFbWriteEmbedded(pDevice, dwVT3226Pwr);
-                bResult &= IFRFbWriteEmbedded(pDevice, 0x00C6A200+(BY_VT3226_REG_LEN<<3)+IFREGCTL_REGW); //RobertYu:20060327
-                bResult &= IFRFbWriteEmbedded(pDevice, 0x016BC600+(BY_VT3226_REG_LEN<<3)+IFREGCTL_REGW); //RobertYu:20060111
-                bResult &= IFRFbWriteEmbedded(pDevice, 0x00900800+(BY_VT3226_REG_LEN<<3)+IFREGCTL_REGW); //RobertYu:20060111
-            }
-            break;
-        }
+	case RF_VT3342A0:
+		if (priv->byCurPwr >= VT3342_PWR_IDX_LEN)
+			return false;
 
-        //{{RobertYu:20060609
-        case RF_VT3342A0:
-        {
-            u32       dwVT3342Pwr;
+		power_setting =  ((0x3F-priv->byCurPwr) << 20) |
+			(0x27 << 8) | (BY_VT3342_REG_LEN << 3) |
+					IFREGCTL_REGW;
 
-            if (pDevice->byCurPwr >= VT3342_PWR_IDX_LEN)
-                return false;
+		ret &= IFRFbWriteEmbedded(priv, power_setting);
 
-            dwVT3342Pwr =  ((0x3F-pDevice->byCurPwr) << 20 ) | ( 0x27 << 8 ) /* Reg7 */ |
-                            (BY_VT3342_REG_LEN << 3 )  | IFREGCTL_REGW;
-            bResult &= IFRFbWriteEmbedded(pDevice, dwVT3342Pwr);
-            break;
-        }
-
-        default :
-            break;
-    }
-    return bResult;
+		break;
+	default:
+		break;
+	}
+	return ret;
 }
 
 /*+
@@ -931,169 +943,150 @@
  * Return Value: none
  *
 -*/
-void RFvRSSITodBm(struct vnt_private *pDevice, u8 byCurrRSSI, long *pldBm)
+void RFvRSSITodBm(struct vnt_private *priv, u8 rssi, long *dbm)
 {
-	u8 byIdx = (((byCurrRSSI & 0xC0) >> 6) & 0x03);
-	signed long b = (byCurrRSSI & 0x3F);
-	signed long a = 0;
-	u8 abyAIROHARF[4] = {0, 18, 0, 40};
+	u8 idx = (((rssi & 0xc0) >> 6) & 0x03);
+	long b = (rssi & 0x3f);
+	long a = 0;
+	u8 airoharf[4] = {0, 18, 0, 40};
 
-    switch (pDevice->byRFType) {
-        case RF_AL2230:
-        case RF_AL2230S:
-        case RF_AIROHA7230:
-        case RF_VT3226: //RobertYu:20051111
-        case RF_VT3226D0:
-        case RF_VT3342A0:   //RobertYu:20060609
-            a = abyAIROHARF[byIdx];
-            break;
-        default:
-            break;
-    }
+	switch (priv->byRFType) {
+	case RF_AL2230:
+	case RF_AL2230S:
+	case RF_AIROHA7230:
+	case RF_VT3226:
+	case RF_VT3226D0:
+	case RF_VT3342A0:
+		a = airoharf[idx];
+		break;
+	default:
+		break;
+	}
 
-    *pldBm = -1 * (a + b * 2);
+	*dbm = -1 * (a + b * 2);
 }
 
-void RFbRFTableDownload(struct vnt_private *pDevice)
+void RFbRFTableDownload(struct vnt_private *priv)
 {
-	u16 wLength1 = 0, wLength2 = 0, wLength3 = 0;
-	u8 *pbyAddr1 = NULL, *pbyAddr2 = NULL, *pbyAddr3 = NULL;
-	u16 wLength, wValue;
-	u8 abyArray[256];
+	u16 length1 = 0, length2 = 0, length3 = 0;
+	u8 *addr1 = NULL, *addr2 = NULL, *addr3 = NULL;
+	u16 length, value;
+	u8 array[256];
 
-    switch ( pDevice->byRFType ) {
-        case RF_AL2230:
-        case RF_AL2230S:
-            wLength1 = CB_AL2230_INIT_SEQ * 3;
-            wLength2 = CB_MAX_CHANNEL_24G * 3;
-            wLength3 = CB_MAX_CHANNEL_24G * 3;
-            pbyAddr1 = &(abyAL2230InitTable[0][0]);
-            pbyAddr2 = &(abyAL2230ChannelTable0[0][0]);
-            pbyAddr3 = &(abyAL2230ChannelTable1[0][0]);
-            break;
-        case RF_AIROHA7230:
-            wLength1 = CB_AL7230_INIT_SEQ * 3;
-            wLength2 = CB_MAX_CHANNEL * 3;
-            wLength3 = CB_MAX_CHANNEL * 3;
-            pbyAddr1 = &(abyAL7230InitTable[0][0]);
-            pbyAddr2 = &(abyAL7230ChannelTable0[0][0]);
-            pbyAddr3 = &(abyAL7230ChannelTable1[0][0]);
-            break;
-        case RF_VT3226: //RobertYu:20051111
-            wLength1 = CB_VT3226_INIT_SEQ * 3;
-            wLength2 = CB_MAX_CHANNEL_24G * 3;
-            wLength3 = CB_MAX_CHANNEL_24G * 3;
-            pbyAddr1 = &(abyVT3226_InitTable[0][0]);
-            pbyAddr2 = &(abyVT3226_ChannelTable0[0][0]);
-            pbyAddr3 = &(abyVT3226_ChannelTable1[0][0]);
-            break;
-        case RF_VT3226D0: //RobertYu:20051114
-            wLength1 = CB_VT3226_INIT_SEQ * 3;
-            wLength2 = CB_MAX_CHANNEL_24G * 3;
-            wLength3 = CB_MAX_CHANNEL_24G * 3;
-            pbyAddr1 = &(abyVT3226D0_InitTable[0][0]);
-            pbyAddr2 = &(abyVT3226_ChannelTable0[0][0]);
-            pbyAddr3 = &(abyVT3226_ChannelTable1[0][0]);
-            break;
-        case RF_VT3342A0: //RobertYu:20060609
-            wLength1 = CB_VT3342_INIT_SEQ * 3;
-            wLength2 = CB_MAX_CHANNEL * 3;
-            wLength3 = CB_MAX_CHANNEL * 3;
-            pbyAddr1 = &(abyVT3342A0_InitTable[0][0]);
-            pbyAddr2 = &(abyVT3342_ChannelTable0[0][0]);
-            pbyAddr3 = &(abyVT3342_ChannelTable1[0][0]);
-            break;
+	switch (priv->byRFType) {
+	case RF_AL2230:
+	case RF_AL2230S:
+		length1 = CB_AL2230_INIT_SEQ * 3;
+		length2 = CB_MAX_CHANNEL_24G * 3;
+		length3 = CB_MAX_CHANNEL_24G * 3;
+		addr1 = &al2230_init_table[0][0];
+		addr2 = &al2230_channel_table0[0][0];
+		addr3 = &al2230_channel_table1[0][0];
+		break;
+	case RF_AIROHA7230:
+		length1 = CB_AL7230_INIT_SEQ * 3;
+		length2 = CB_MAX_CHANNEL * 3;
+		length3 = CB_MAX_CHANNEL * 3;
+		addr1 = &al7230_init_table[0][0];
+		addr2 = &al7230_channel_table0[0][0];
+		addr3 = &al7230_channel_table1[0][0];
+		break;
+	case RF_VT3226:
+		length1 = CB_VT3226_INIT_SEQ * 3;
+		length2 = CB_MAX_CHANNEL_24G * 3;
+		length3 = CB_MAX_CHANNEL_24G * 3;
+		addr1 = &at3226_init_table[0][0];
+		addr2 = &vt3226_channel_table0[0][0];
+		addr3 = &vt3226_channel_table1[0][0];
+		break;
+	case RF_VT3226D0:
+		length1 = CB_VT3226_INIT_SEQ * 3;
+		length2 = CB_MAX_CHANNEL_24G * 3;
+		length3 = CB_MAX_CHANNEL_24G * 3;
+		addr1 = &at3226d0_init_table[0][0];
+		addr2 = &vt3226_channel_table0[0][0];
+		addr3 = &vt3226_channel_table1[0][0];
+		break;
+	case RF_VT3342A0:
+		length1 = CB_VT3342_INIT_SEQ * 3;
+		length2 = CB_MAX_CHANNEL * 3;
+		length3 = CB_MAX_CHANNEL * 3;
+		addr1 = &vt3342a0_init_table[0][0];
+		addr2 = &vt3342_channel_table0[0][0];
+		addr3 = &vt3342_channel_table1[0][0];
+		break;
+	}
 
-    }
-    //Init Table
+	/* Init Table */
+	memcpy(array, addr1, length1);
 
-    memcpy(abyArray, pbyAddr1, wLength1);
-    CONTROLnsRequestOut(pDevice,
-                    MESSAGE_TYPE_WRITE,
-                    0,
-                    MESSAGE_REQUEST_RF_INIT,
-                    wLength1,
-                    abyArray
-                    );
-    //Channel Table 0
-    wValue = 0;
-    while ( wLength2 > 0 ) {
+	CONTROLnsRequestOut(priv, MESSAGE_TYPE_WRITE, 0,
+		MESSAGE_REQUEST_RF_INIT, length1, array);
 
-        if ( wLength2 >= 64 ) {
-            wLength = 64;
-        } else {
-            wLength = wLength2;
-        }
-        memcpy(abyArray, pbyAddr2, wLength);
-        CONTROLnsRequestOut(pDevice,
-                        MESSAGE_TYPE_WRITE,
-                        wValue,
-                        MESSAGE_REQUEST_RF_CH0,
-                        wLength,
-                        abyArray);
+	/* Channel Table 0 */
+	value = 0;
+	while (length2 > 0) {
+		if (length2 >= 64)
+			length = 64;
+		else
+			length = length2;
 
-        wLength2 -= wLength;
-        wValue += wLength;
-        pbyAddr2 += wLength;
-    }
-    //Channel table 1
-    wValue = 0;
-    while ( wLength3 > 0 ) {
+		memcpy(array, addr2, length);
 
-        if ( wLength3 >= 64 ) {
-            wLength = 64;
-        } else {
-            wLength = wLength3;
-        }
-        memcpy(abyArray, pbyAddr3, wLength);
-        CONTROLnsRequestOut(pDevice,
-                        MESSAGE_TYPE_WRITE,
-                        wValue,
-                        MESSAGE_REQUEST_RF_CH1,
-                        wLength,
-                        abyArray);
+		CONTROLnsRequestOut(priv, MESSAGE_TYPE_WRITE,
+			value, MESSAGE_REQUEST_RF_CH0, length, array);
 
-        wLength3 -= wLength;
-        wValue += wLength;
-        pbyAddr3 += wLength;
-    }
+		length2 -= length;
+		value += length;
+		addr2 += length;
+	}
 
-    //7230 needs 2 InitTable and 3 Channel Table
-    if ( pDevice->byRFType == RF_AIROHA7230 ) {
-        wLength1 = CB_AL7230_INIT_SEQ * 3;
-        wLength2 = CB_MAX_CHANNEL * 3;
-        pbyAddr1 = &(abyAL7230InitTableAMode[0][0]);
-        pbyAddr2 = &(abyAL7230ChannelTable2[0][0]);
-        memcpy(abyArray, pbyAddr1, wLength1);
-        //Init Table 2
-        CONTROLnsRequestOut(pDevice,
-                    MESSAGE_TYPE_WRITE,
-                    0,
-                    MESSAGE_REQUEST_RF_INIT2,
-                    wLength1,
-                    abyArray);
+	/* Channel table 1 */
+	value = 0;
+	while (length3 > 0) {
+		if (length3 >= 64)
+			length = 64;
+		else
+			length = length3;
 
-        //Channel Table 0
-        wValue = 0;
-        while ( wLength2 > 0 ) {
+		memcpy(array, addr3, length);
 
-            if ( wLength2 >= 64 ) {
-                wLength = 64;
-            } else {
-                wLength = wLength2;
-            }
-            memcpy(abyArray, pbyAddr2, wLength);
-            CONTROLnsRequestOut(pDevice,
-                            MESSAGE_TYPE_WRITE,
-                            wValue,
-                            MESSAGE_REQUEST_RF_CH2,
-                            wLength,
-                            abyArray);
+		CONTROLnsRequestOut(priv, MESSAGE_TYPE_WRITE,
+			value, MESSAGE_REQUEST_RF_CH1, length, array);
 
-            wLength2 -= wLength;
-            wValue += wLength;
-            pbyAddr2 += wLength;
-        }
-    }
+		length3 -= length;
+		value += length;
+		addr3 += length;
+	}
 
+	if (priv->byRFType == RF_AIROHA7230) {
+		length1 = CB_AL7230_INIT_SEQ * 3;
+		length2 = CB_MAX_CHANNEL * 3;
+		addr1 = &(al7230_init_table_amode[0][0]);
+		addr2 = &(al7230_channel_table2[0][0]);
+
+		memcpy(array, addr1, length1);
+
+		/* Init Table 2 */
+		CONTROLnsRequestOut(priv, MESSAGE_TYPE_WRITE,
+			0, MESSAGE_REQUEST_RF_INIT2, length1, array);
+
+		/* Channel Table 2 */
+		value = 0;
+		while (length2 > 0) {
+			if (length2 >= 64)
+				length = 64;
+			else
+				length = length2;
+
+			memcpy(array, addr2, length);
+
+			CONTROLnsRequestOut(priv, MESSAGE_TYPE_WRITE,
+				value, MESSAGE_REQUEST_RF_CH2, length, array);
+
+			length2 -= length;
+			value += length;
+			addr2 += length;
+		}
+	}
 }

diff --git a/drivers/staging/vt6656/tether.h b/drivers/staging/vt6656/tether.h
index 24465cf..aec6b56 100644
--- a/drivers/staging/vt6656/tether.h
+++ b/drivers/staging/vt6656/tether.h

@@ -99,16 +99,6 @@
 
 #define WEP_IV_MASK         0x00FFFFFF
 
-//
-// 802_3 packet
-//
-typedef struct tagS802_3Header {
-    u8    abyDstAddr[ETH_ALEN];
-    u8    abySrcAddr[ETH_ALEN];
-    u16    wLen;
-} __attribute__ ((__packed__))
-S802_3Header, *PS802_3Header;
-
 //u8 ETHbyGetHashIndexByCrc(u8 * pbyMultiAddr);
 bool ETHbIsBufferCrc32Ok(u8 * pbyBuffer, unsigned int cbFrameLength);
 

diff --git a/drivers/staging/vt6656/tmacro.h b/drivers/staging/vt6656/tmacro.h
index 15cd5ab..15e724e 100644
--- a/drivers/staging/vt6656/tmacro.h
+++ b/drivers/staging/vt6656/tmacro.h

@@ -45,14 +45,8 @@
 #define HIWORD(d)           ((u16)((((u32)(d)) >> 16) & 0xFFFF))
 #endif
 
-#define LODWORD(q)          ((q).u.dwLowDword)
-#define HIDWORD(q)          ((q).u.dwHighDword)
-
 #if !defined(MAKEWORD)
 #define MAKEWORD(lb, hb)    ((u16)(((u8)(lb)) | (((u16)((u8)(hb))) << 8)))
 #endif
-#if !defined(MAKEDWORD)
-#define MAKEDWORD(lw, hw)   ((u32)(((u16)(lw)) | (((u32)((u16)(hw))) << 16)))
-#endif
 
 #endif /* __TMACRO_H__ */

diff --git a/drivers/staging/winbond/phy_calibration.c b/drivers/staging/winbond/phy_calibration.c
index cabae34..cfbfbbb 100644
--- a/drivers/staging/winbond/phy_calibration.c
+++ b/drivers/staging/winbond/phy_calibration.c

@@ -296,7 +296,7 @@
 	}
 }
 
-static unsigned char hal_get_dxx_reg(struct hw_data *pHwData, u16 number, u32 * pValue)
+static unsigned char hal_get_dxx_reg(struct hw_data *pHwData, u16 number, u32 *pValue)
 {
 	if (number < 0x1000)
 		number += 0x1000;

diff --git a/drivers/staging/winbond/reg.c b/drivers/staging/winbond/reg.c
index 5ecf9a1..75b7752 100644
--- a/drivers/staging/winbond/reg.c
+++ b/drivers/staging/winbond/reg.c

@@ -920,20 +920,20 @@
 	Wb35Reg_WriteSync(pHwData, 0x03f8, 0x7ff);
 }
 
-void Set_ChanIndep_RfData_al7230_24(struct hw_data *pHwData, u32 *pltmp , char number)
+static void Set_ChanIndep_RfData_al7230_24(struct hw_data *pHwData, u32 *pltmp, 
+					char number)
 {
 	u8	i;
-
 	for (i = 0; i < number; i++) {
 		pHwData->phy_para[i] = al7230_rf_data_24[i];
 		pltmp[i] = (1 << 31) | (0 << 30) | (24 << 24) | (al7230_rf_data_24[i] & 0xffffff);
 	}
 }
 
-void Set_ChanIndep_RfData_al7230_50(struct hw_data *pHwData, u32 *pltmp, char number)
+static void Set_ChanIndep_RfData_al7230_50(struct hw_data *pHwData, u32 *pltmp, 
+					char number)
 {
 	u8	i;
-
 	for (i = 0; i < number; i++) {
 		pHwData->phy_para[i] = al7230_rf_data_50[i];
 		pltmp[i] = (1 << 31) | (0 << 30) | (24 << 24) | (al7230_rf_data_50[i] & 0xffffff);
@@ -1263,7 +1263,7 @@
 	}
 }
 
-void BBProcessor_AL7230_2400(struct hw_data *pHwData)
+static void BBProcessor_AL7230_2400(struct hw_data *pHwData)
 {
 	struct wb35_reg *reg = &pHwData->reg;
 	u32	pltmp[12];
@@ -1304,7 +1304,7 @@
 	Wb35Reg_BurstWrite(pHwData, 0x1030, pltmp, 12, AUTO_INCREMENT);
 }
 
-void BBProcessor_AL7230_5000(struct hw_data *pHwData)
+static void BBProcessor_AL7230_5000(struct hw_data *pHwData)
 {
 	struct wb35_reg *reg = &pHwData->reg;
 	u32	pltmp[12];
@@ -1620,22 +1620,24 @@
 		reg->SQ3_filter[i] = 0x2f; /* half of Bit 0 ~ 6 */
 }
 
-void set_tx_power_per_channel_max2829(struct hw_data *pHwData,  struct chan_info Channel)
+static inline void set_tx_power_per_channel_max2829(struct hw_data *pHwData,  
+						struct chan_info Channel)
 {
 	RFSynthesizer_SetPowerIndex(pHwData, 100);
 }
 
-void set_tx_power_per_channel_al2230(struct hw_data *pHwData,  struct chan_info Channel)
+static void set_tx_power_per_channel_al2230(struct hw_data *pHwData,  
+					struct chan_info Channel)
 {
 	u8	index = 100;
-
 	if (pHwData->TxVgaFor24[Channel.ChanNo - 1] != 0xff)
 		index = pHwData->TxVgaFor24[Channel.ChanNo - 1];
 
 	RFSynthesizer_SetPowerIndex(pHwData, index);
 }
 
-void set_tx_power_per_channel_al7230(struct hw_data *pHwData,  struct chan_info Channel)
+static void set_tx_power_per_channel_al7230(struct hw_data *pHwData,  
+					struct chan_info Channel)
 {
 	u8	i, index = 100;
 
@@ -1658,7 +1660,8 @@
 	RFSynthesizer_SetPowerIndex(pHwData, index);
 }
 
-void set_tx_power_per_channel_wb242(struct hw_data *pHwData,  struct chan_info Channel)
+static void set_tx_power_per_channel_wb242(struct hw_data *pHwData,  
+					struct chan_info Channel)
 {
 	u8	index = 100;
 

diff --git a/drivers/staging/winbond/wb35reg.c b/drivers/staging/winbond/wb35reg.c
index 1bff7d1..9be1b3b 100644
--- a/drivers/staging/winbond/wb35reg.c
+++ b/drivers/staging/winbond/wb35reg.c

@@ -30,46 +30,46 @@
 	/* Trying to use burst write function if use new hardware */
 	UrbSize = sizeof(struct wb35_reg_queue) + DataSize + sizeof(struct usb_ctrlrequest);
 	reg_queue = kzalloc(UrbSize, GFP_ATOMIC);
+	if (reg_queue == NULL)
+		return false;
+
 	urb = usb_alloc_urb(0, GFP_ATOMIC);
-	if (urb && reg_queue) {
-		reg_queue->DIRECT = 2; /* burst write register */
-		reg_queue->INDEX = RegisterNo;
-		reg_queue->pBuffer = (u32 *)((u8 *)reg_queue + sizeof(struct wb35_reg_queue));
-		memcpy(reg_queue->pBuffer, pRegisterData, DataSize);
-		/* the function for reversing register data from little endian to big endian */
-		for (i = 0; i < NumberOfData ; i++)
-			reg_queue->pBuffer[i] = cpu_to_le32(reg_queue->pBuffer[i]);
-
-		dr = (struct usb_ctrlrequest *)((u8 *)reg_queue + sizeof(struct wb35_reg_queue) + DataSize);
-		dr->bRequestType = USB_TYPE_VENDOR | USB_DIR_OUT | USB_RECIP_DEVICE;
-		dr->bRequest = 0x04; /* USB or vendor-defined request code, burst mode */
-		dr->wValue = cpu_to_le16(Flag); /* 0: Register number auto-increment, 1: No auto increment */
-		dr->wIndex = cpu_to_le16(RegisterNo);
-		dr->wLength = cpu_to_le16(DataSize);
-		reg_queue->Next = NULL;
-		reg_queue->pUsbReq = dr;
-		reg_queue->urb = urb;
-
-		spin_lock_irq(&reg->EP0VM_spin_lock);
-		if (reg->reg_first == NULL)
-			reg->reg_first = reg_queue;
-		else
-			reg->reg_last->Next = reg_queue;
-		reg->reg_last = reg_queue;
-
-		spin_unlock_irq(&reg->EP0VM_spin_lock);
-
-		/* Start EP0VM */
-		Wb35Reg_EP0VM_start(pHwData);
-
-		return true;
-	} else {
-		if (urb)
-			usb_free_urb(urb);
+	if (urb == NULL) {
 		kfree(reg_queue);
 		return false;
 	}
-   return false;
+
+	reg_queue->DIRECT = 2; /* burst write register */
+	reg_queue->INDEX = RegisterNo;
+	reg_queue->pBuffer = (u32 *)((u8 *)reg_queue + sizeof(struct wb35_reg_queue));
+	memcpy(reg_queue->pBuffer, pRegisterData, DataSize);
+	/* convert each register value from CPU byte order to little endian */
+	for (i = 0; i < NumberOfData ; i++)
+		reg_queue->pBuffer[i] = cpu_to_le32(reg_queue->pBuffer[i]);
+
+	dr = (struct usb_ctrlrequest *)((u8 *)reg_queue + sizeof(struct wb35_reg_queue) + DataSize);
+	dr->bRequestType = USB_TYPE_VENDOR | USB_DIR_OUT | USB_RECIP_DEVICE;
+	dr->bRequest = 0x04; /* USB or vendor-defined request code, burst mode */
+	dr->wValue = cpu_to_le16(Flag); /* 0: Register number auto-increment, 1: No auto increment */
+	dr->wIndex = cpu_to_le16(RegisterNo);
+	dr->wLength = cpu_to_le16(DataSize);
+	reg_queue->Next = NULL;
+	reg_queue->pUsbReq = dr;
+	reg_queue->urb = urb;
+
+	spin_lock_irq(&reg->EP0VM_spin_lock);
+	if (reg->reg_first == NULL)
+		reg->reg_first = reg_queue;
+	else
+		reg->reg_last->Next = reg_queue;
+	reg->reg_last = reg_queue;
+
+	spin_unlock_irq(&reg->EP0VM_spin_lock);
+
+	/* Start EP0VM */
+	Wb35Reg_EP0VM_start(pHwData);
+
+	return true;
 }
 
 void Wb35Reg_Update(struct hw_data *pHwData,  u16 RegisterNo,  u32 RegisterValue)
@@ -174,43 +174,44 @@
 	/* update the register by send urb request */
 	UrbSize = sizeof(struct wb35_reg_queue) + sizeof(struct usb_ctrlrequest);
 	reg_queue = kzalloc(UrbSize, GFP_ATOMIC);
+	if (reg_queue == NULL)
+		return false;
+
 	urb = usb_alloc_urb(0, GFP_ATOMIC);
-	if (urb && reg_queue) {
-		reg_queue->DIRECT = 1; /* burst write register */
-		reg_queue->INDEX = RegisterNo;
-		reg_queue->VALUE = cpu_to_le32(RegisterValue);
-		reg_queue->RESERVED_VALID = false;
-		dr = (struct usb_ctrlrequest *)((u8 *)reg_queue + sizeof(struct wb35_reg_queue));
-		dr->bRequestType = USB_TYPE_VENDOR | USB_DIR_OUT | USB_RECIP_DEVICE;
-		dr->bRequest = 0x03; /* USB or vendor-defined request code, burst mode */
-		dr->wValue = cpu_to_le16(0x0);
-		dr->wIndex = cpu_to_le16(RegisterNo);
-		dr->wLength = cpu_to_le16(4);
-
-		/* Enter the sending queue */
-		reg_queue->Next = NULL;
-		reg_queue->pUsbReq = dr;
-		reg_queue->urb = urb;
-
-		spin_lock_irq(&reg->EP0VM_spin_lock);
-		if (reg->reg_first == NULL)
-			reg->reg_first = reg_queue;
-		else
-			reg->reg_last->Next = reg_queue;
-		reg->reg_last = reg_queue;
-
-		spin_unlock_irq(&reg->EP0VM_spin_lock);
-
-		/* Start EP0VM */
-		Wb35Reg_EP0VM_start(pHwData);
-
-		return true;
-	} else {
-		if (urb)
-			usb_free_urb(urb);
+	if (urb == NULL) {
 		kfree(reg_queue);
 		return false;
 	}
+
+	reg_queue->DIRECT = 1; /* burst write register */
+	reg_queue->INDEX = RegisterNo;
+	reg_queue->VALUE = cpu_to_le32(RegisterValue);
+	reg_queue->RESERVED_VALID = false;
+	dr = (struct usb_ctrlrequest *)((u8 *)reg_queue + sizeof(struct wb35_reg_queue));
+	dr->bRequestType = USB_TYPE_VENDOR | USB_DIR_OUT | USB_RECIP_DEVICE;
+	dr->bRequest = 0x03; /* USB or vendor-defined request code, burst mode */
+	dr->wValue = cpu_to_le16(0x0);
+	dr->wIndex = cpu_to_le16(RegisterNo);
+	dr->wLength = cpu_to_le16(4);
+
+	/* Enter the sending queue */
+	reg_queue->Next = NULL;
+	reg_queue->pUsbReq = dr;
+	reg_queue->urb = urb;
+
+	spin_lock_irq(&reg->EP0VM_spin_lock);
+	if (reg->reg_first == NULL)
+		reg->reg_first = reg_queue;
+	else
+		reg->reg_last->Next = reg_queue;
+	reg->reg_last = reg_queue;
+
+	spin_unlock_irq(&reg->EP0VM_spin_lock);
+
+	/* Start EP0VM */
+	Wb35Reg_EP0VM_start(pHwData);
+
+	return true;
 }
 
 /*
@@ -238,43 +239,45 @@
 	/* update the register by send urb request */
 	UrbSize = sizeof(struct wb35_reg_queue) + sizeof(struct usb_ctrlrequest);
 	reg_queue = kzalloc(UrbSize, GFP_ATOMIC);
+	if (reg_queue == NULL)
+		return false;
+
 	urb = usb_alloc_urb(0, GFP_ATOMIC);
-	if (urb && reg_queue) {
-		reg_queue->DIRECT = 1; /* burst write register */
-		reg_queue->INDEX = RegisterNo;
-		reg_queue->VALUE = cpu_to_le32(RegisterValue);
-		/* NOTE : Users must guarantee the size of value will not exceed the buffer size. */
-		memcpy(reg_queue->RESERVED, pValue, Len);
-		reg_queue->RESERVED_VALID = true;
-		dr = (struct usb_ctrlrequest *)((u8 *)reg_queue + sizeof(struct wb35_reg_queue));
-		dr->bRequestType = USB_TYPE_VENDOR | USB_DIR_OUT | USB_RECIP_DEVICE;
-		dr->bRequest = 0x03; /* USB or vendor-defined request code, burst mode */
-		dr->wValue = cpu_to_le16(0x0);
-		dr->wIndex = cpu_to_le16(RegisterNo);
-		dr->wLength = cpu_to_le16(4);
-
-		/* Enter the sending queue */
-		reg_queue->Next = NULL;
-		reg_queue->pUsbReq = dr;
-		reg_queue->urb = urb;
-		spin_lock_irq(&reg->EP0VM_spin_lock);
-		if (reg->reg_first == NULL)
-			reg->reg_first = reg_queue;
-		else
-			reg->reg_last->Next = reg_queue;
-		reg->reg_last = reg_queue;
-
-		spin_unlock_irq(&reg->EP0VM_spin_lock);
-
-		/* Start EP0VM */
-		Wb35Reg_EP0VM_start(pHwData);
-		return true;
-	} else {
-		if (urb)
-			usb_free_urb(urb);
+	if (urb == NULL) {
 		kfree(reg_queue);
 		return false;
 	}
+
+	reg_queue->DIRECT = 1; /* burst write register */
+	reg_queue->INDEX = RegisterNo;
+	reg_queue->VALUE = cpu_to_le32(RegisterValue);
+	/* NOTE : Users must guarantee the size of value will not exceed the buffer size. */
+	memcpy(reg_queue->RESERVED, pValue, Len);
+	reg_queue->RESERVED_VALID = true;
+	dr = (struct usb_ctrlrequest *)((u8 *)reg_queue + sizeof(struct wb35_reg_queue));
+	dr->bRequestType = USB_TYPE_VENDOR | USB_DIR_OUT | USB_RECIP_DEVICE;
+	dr->bRequest = 0x03; /* USB or vendor-defined request code, burst mode */
+	dr->wValue = cpu_to_le16(0x0);
+	dr->wIndex = cpu_to_le16(RegisterNo);
+	dr->wLength = cpu_to_le16(4);
+
+	/* Enter the sending queue */
+	reg_queue->Next = NULL;
+	reg_queue->pUsbReq = dr;
+	reg_queue->urb = urb;
+	spin_lock_irq(&reg->EP0VM_spin_lock);
+	if (reg->reg_first == NULL)
+		reg->reg_first = reg_queue;
+	else
+		reg->reg_last->Next = reg_queue;
+	reg->reg_last = reg_queue;
+
+	spin_unlock_irq(&reg->EP0VM_spin_lock);
+
+	/* Start EP0VM */
+	Wb35Reg_EP0VM_start(pHwData);
+
+	return true;
 }
 
 /*
@@ -344,41 +347,41 @@
 	/* update the variable by send Urb to read register */
 	UrbSize = sizeof(struct wb35_reg_queue) + sizeof(struct usb_ctrlrequest);
 	reg_queue = kzalloc(UrbSize, GFP_ATOMIC);
+	if (reg_queue == NULL)
+		return false;
+
 	urb = usb_alloc_urb(0, GFP_ATOMIC);
-	if (urb && reg_queue) {
-		reg_queue->DIRECT = 0; /* read register */
-		reg_queue->INDEX = RegisterNo;
-		reg_queue->pBuffer = pRegisterValue;
-		dr = (struct usb_ctrlrequest *)((u8 *)reg_queue + sizeof(struct wb35_reg_queue));
-		dr->bRequestType = USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_IN;
-		dr->bRequest = 0x01; /* USB or vendor-defined request code, burst mode */
-		dr->wValue = cpu_to_le16(0x0);
-		dr->wIndex = cpu_to_le16(RegisterNo);
-		dr->wLength = cpu_to_le16(4);
-
-		/* Enter the sending queue */
-		reg_queue->Next = NULL;
-		reg_queue->pUsbReq = dr;
-		reg_queue->urb = urb;
-		spin_lock_irq(&reg->EP0VM_spin_lock);
-		if (reg->reg_first == NULL)
-			reg->reg_first = reg_queue;
-		else
-			reg->reg_last->Next = reg_queue;
-		reg->reg_last = reg_queue;
-
-		spin_unlock_irq(&reg->EP0VM_spin_lock);
-
-		/* Start EP0VM */
-		Wb35Reg_EP0VM_start(pHwData);
-
-		return true;
-	} else {
-		if (urb)
-			usb_free_urb(urb);
+	if (urb == NULL) {
 		kfree(reg_queue);
 		return false;
 	}
+	reg_queue->DIRECT = 0; /* read register */
+	reg_queue->INDEX = RegisterNo;
+	reg_queue->pBuffer = pRegisterValue;
+	dr = (struct usb_ctrlrequest *)((u8 *)reg_queue + sizeof(struct wb35_reg_queue));
+	dr->bRequestType = USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_IN;
+	dr->bRequest = 0x01; /* USB or vendor-defined request code, burst mode */
+	dr->wValue = cpu_to_le16(0x0);
+	dr->wIndex = cpu_to_le16(RegisterNo);
+	dr->wLength = cpu_to_le16(4);
+
+	/* Enter the sending queue */
+	reg_queue->Next = NULL;
+	reg_queue->pUsbReq = dr;
+	reg_queue->urb = urb;
+	spin_lock_irq(&reg->EP0VM_spin_lock);
+	if (reg->reg_first == NULL)
+		reg->reg_first = reg_queue;
+	else
+		reg->reg_last->Next = reg_queue;
+	reg->reg_last = reg_queue;
+
+	spin_unlock_irq(&reg->EP0VM_spin_lock);
+
+	/* Start EP0VM */
+	Wb35Reg_EP0VM_start(pHwData);
+
+	return true;
 }
 
 

diff --git a/drivers/staging/winbond/wb35rx.c b/drivers/staging/winbond/wb35rx.c
index f118eeb..8d71bc2 100644
--- a/drivers/staging/winbond/wb35rx.c
+++ b/drivers/staging/winbond/wb35rx.c

@@ -343,8 +343,7 @@
 	} while (pWb35Rx->EP3vm_state != VM_STOP);
 	msleep(10); /* Delay for waiting function exit */
 
-	if (pWb35Rx->RxUrb)
-		usb_free_urb(pWb35Rx->RxUrb);
+	usb_free_urb(pWb35Rx->RxUrb);
 	pr_debug("Wb35Rx_destroy OK\n");
 }
 

diff --git a/drivers/staging/wlags49_h2/wl_cs.c b/drivers/staging/wlags49_h2/wl_cs.c
index 7c7c77f..b55dc43 100644
--- a/drivers/staging/wlags49_h2/wl_cs.c
+++ b/drivers/staging/wlags49_h2/wl_cs.c

@@ -133,6 +133,7 @@
 {
 	struct net_device   *dev;
 	struct wl_private   *lp;
+	int ret;
 	/*--------------------------------------------------------------------*/
 
 	DBG_FUNC("wl_adapter_attach");
@@ -154,10 +155,12 @@
 	lp = wl_priv(dev);
 	lp->link = link;
 
-	wl_adapter_insert(link);
+	ret = wl_adapter_insert(link);
+	if (ret != 0)
+		wl_device_dealloc(dev);
 
 	DBG_LEAVE(DbgInfo);
-	return 0;
+	return ret;
 } /* wl_adapter_attach */
 /*============================================================================*/
 
@@ -224,7 +227,7 @@
 	return 0;
 } /* wl_adapter_resume */
 
-void wl_adapter_insert(struct pcmcia_device *link)
+int wl_adapter_insert(struct pcmcia_device *link)
 {
 	struct net_device *dev;
 	int ret;
@@ -256,7 +259,8 @@
 	dev->base_addr  = link->resource[0]->start;
 
 	SET_NETDEV_DEV(dev, &link->dev);
-	if (register_netdev(dev) != 0) {
+	ret = register_netdev(dev);
+	if (ret != 0) {
 		printk("%s: register_netdev() failed\n", MODULE_NAME);
 		goto failed;
 	}
@@ -267,13 +271,13 @@
 		" %pM\n", dev->name, dev->base_addr, dev->irq, dev->dev_addr);
 
 	DBG_LEAVE(DbgInfo);
-	return;
+	return 0;
 
 failed:
 	wl_adapter_release(link);
 
 	DBG_LEAVE(DbgInfo);
-	return;
+	return ret;
 } /* wl_adapter_insert */
 /*============================================================================*/
 

diff --git a/drivers/staging/wlags49_h2/wl_cs.h b/drivers/staging/wlags49_h2/wl_cs.h
index a7ab579..081cc6f 100644
--- a/drivers/staging/wlags49_h2/wl_cs.h
+++ b/drivers/staging/wlags49_h2/wl_cs.h

@@ -65,10 +65,10 @@
 
 
 /*******************************************************************************
- *  function protoypes
+ *  function prototypes
  ******************************************************************************/
 
-void wl_adapter_insert(struct pcmcia_device *link);
+int wl_adapter_insert(struct pcmcia_device *link);
 
 void wl_adapter_release(struct pcmcia_device *link);
 

diff --git a/drivers/staging/wlags49_h2/wl_main.c b/drivers/staging/wlags49_h2/wl_main.c
index f28f15b..4353561 100644
--- a/drivers/staging/wlags49_h2/wl_main.c
+++ b/drivers/staging/wlags49_h2/wl_main.c

@@ -3171,7 +3171,9 @@
 
 					memset( ssid, 0, sizeof( ssid ));
 					strncpy( ssid, &probe_rsp->rawData[2],
-							 probe_rsp->rawData[1] );
+						 min_t(u8,
+							probe_rsp->rawData[1],
+							HCF_MAX_NAME_LEN - 1));
 
 					DBG_TRACE( DbgInfo, "(%s) SSID        : %s\n",
 							   lp->dev->name, ssid );

diff --git a/drivers/staging/wlan-ng/prism2sta.c b/drivers/staging/wlan-ng/prism2sta.c
index 428a9be..76374b2 100644
--- a/drivers/staging/wlan-ng/prism2sta.c
+++ b/drivers/staging/wlan-ng/prism2sta.c

@@ -1122,8 +1122,7 @@
 
 	kfree(hw->scanresults);
 
-	hw->scanresults = kmalloc(sizeof(hfa384x_InfFrame_t), GFP_ATOMIC);
-	memcpy(hw->scanresults, inf, sizeof(hfa384x_InfFrame_t));
+	hw->scanresults = kmemdup(inf, sizeof(hfa384x_InfFrame_t), GFP_ATOMIC);
 
 	if (nbss == 0)
 		nbss = -1;

diff --git a/drivers/staging/xgifb/vb_def.h b/drivers/staging/xgifb/vb_def.h
index 80c9723..5c739be 100644
--- a/drivers/staging/xgifb/vb_def.h
+++ b/drivers/staging/xgifb/vb_def.h

@@ -30,11 +30,6 @@
 #define SetCRT2ToDualEdge   0x8000
 
 #define ReserveTVOption     0x0008
-#define GatingCRT           0x0800
-#define DisableChB          0x1000
-#define EnableChB           0x2000
-#define DisableChA          0x4000
-#define EnableChA           0x8000
 
 #define SetTVLowResolution   0x0400
 #define TVSimuMode           0x0800

diff --git a/drivers/staging/xgifb/vb_init.c b/drivers/staging/xgifb/vb_init.c
index 19ce5a9..5f1c41e 100644
--- a/drivers/staging/xgifb/vb_init.c
+++ b/drivers/staging/xgifb/vb_init.c

@@ -54,14 +54,12 @@
 		udelay(800);
 		xgifb_reg_or(pVBInfo->P3d4, 0x4A, 0x80); /* Enable GPIOH read */
 		/* GPIOF 0:DVI 1:DVO */
-		temp = xgifb_reg_get(pVBInfo->P3d4, 0x48);
+		data = xgifb_reg_get(pVBInfo->P3d4, 0x48);
 		/* HOTPLUG_SUPPORT */
 		/* for current XG20 & XG21, GPIOH is floating, driver will
 		 * fix DDR temporarily */
-		if (temp & 0x01) /* DVI read GPIOH */
-			data = 1; /* DDRII */
-		else
-			data = 0; /* DDR */
+		/* DVI read GPIOH */
+		data &= 0x01; /* 1=DDRII, 0=DDR */
 		/* ~HOTPLUG_SUPPORT */
 		xgifb_reg_or(pVBInfo->P3d4, 0xB4, 0x02);
 		return data;
@@ -1079,44 +1077,23 @@
 							*HwDeviceExtension,
 				      struct vb_device_info *pVBInfo)
 {
-	unsigned short temp;
+	unsigned short temp = HwDeviceExtension->ulCRT2LCDType;
 
-	/* add lcd sense */
-	if (HwDeviceExtension->ulCRT2LCDType == LCD_UNKNOWN) {
+	switch (HwDeviceExtension->ulCRT2LCDType) {
+	case LCD_640x480:
+	case LCD_1024x600:
+	case LCD_1152x864:
+	case LCD_1280x960:
+	case LCD_1152x768:
+	case LCD_1920x1440:
+	case LCD_2048x1536:
+		temp = 0; /* overwrite used ulCRT2LCDType */
+		break;
+	case LCD_UNKNOWN: /* unknown lcd, do nothing */
 		return 0;
-	} else {
-		temp = (unsigned short) HwDeviceExtension->ulCRT2LCDType;
-		switch (HwDeviceExtension->ulCRT2LCDType) {
-		case LCD_INVALID:
-		case LCD_800x600:
-		case LCD_1024x768:
-		case LCD_1280x1024:
-			break;
-
-		case LCD_640x480:
-		case LCD_1024x600:
-		case LCD_1152x864:
-		case LCD_1280x960:
-		case LCD_1152x768:
-			temp = 0;
-			break;
-
-		case LCD_1400x1050:
-		case LCD_1280x768:
-		case LCD_1600x1200:
-			break;
-
-		case LCD_1920x1440:
-		case LCD_2048x1536:
-			temp = 0;
-			break;
-
-		default:
-			break;
-		}
-		xgifb_reg_and_or(pVBInfo->P3d4, 0x36, 0xF0, temp);
-		return 1;
 	}
+	xgifb_reg_and_or(pVBInfo->P3d4, 0x36, 0xF0, temp);
+	return 1;
 }
 
 static void XGINew_GetXG21Sense(struct pci_dev *pdev,
@@ -1138,17 +1115,11 @@
 			xgifb_reg_or(pVBInfo->P3d4, 0x32, LCDSense);
 			/* Enable read GPIOF */
 			xgifb_reg_and_or(pVBInfo->P3d4, 0x4A, ~0x20, 0x20);
-			Temp = xgifb_reg_get(pVBInfo->P3d4, 0x48) & 0x04;
-			if (!Temp)
-				xgifb_reg_and_or(pVBInfo->P3d4,
-						 0x38,
-						 ~0xE0,
-						 0x80); /* TMDS on chip */
+			if (xgifb_reg_get(pVBInfo->P3d4, 0x48) & 0x04)
+				Temp = 0xA0; /* Only DVO on chip */
 			else
-				xgifb_reg_and_or(pVBInfo->P3d4,
-						 0x38,
-						 ~0xE0,
-						 0xA0); /* Only DVO on chip */
+				Temp = 0x80; /* TMDS on chip */
+			xgifb_reg_and_or(pVBInfo->P3d4, 0x38, ~0xE0, Temp);
 			/* Disable read GPIOF */
 			xgifb_reg_and(pVBInfo->P3d4, 0x4A, ~0x20);
 		}
@@ -1206,9 +1177,7 @@
 	/* enable GPIOA/B/C read */
 	xgifb_reg_and_or(pVBInfo->P3d4, 0x4A, ~0x03, 0x03);
 	temp = xgifb_reg_get(pVBInfo->P3d4, 0x48);
-	if (temp <= 2)
-		temp &= 0x03;
-	else
+	if (temp > 2)
 		temp = ((temp & 0x04) >> 1) | ((~temp) & 0x01);
 
 	xgifb_reg_set(pVBInfo->P3d4, 0x4A, CR4A);
@@ -1216,6 +1185,14 @@
 	return temp;
 }
 
+static bool xgifb_bridge_is_on(struct vb_device_info *vb_info)
+{
+	u8 flag;
+
+	flag = xgifb_reg_get(vb_info->Part4Port, 0x00);
+	return flag == 1 || flag == 2;
+}
+
 unsigned char XGIInitNew(struct pci_dev *pdev)
 {
 	struct xgifb_video_info *xgifb_info = pci_get_drvdata(pdev);
@@ -1235,10 +1212,6 @@
 
 	outb(0x67, pVBInfo->P3c2);
 
-	if (HwDeviceExtension->jChipType < XG20)
-		/* Run XGI_GetVBType before InitTo330Pointer */
-		XGI_GetVBType(pVBInfo);
-
 	InitTo330Pointer(HwDeviceExtension->jChipType, pVBInfo);
 
 	/* Openkey */
@@ -1327,7 +1300,6 @@
 		xgifb_reg_set(pVBInfo->Part1Port, 0x00, 0x00);
 		/* chk if BCLK>=100MHz */
 		temp1 = xgifb_reg_get(pVBInfo->P3d4, 0x7B);
-		temp = (unsigned char) ((temp1 >> 4) & 0x0F);
 
 		xgifb_reg_set(pVBInfo->Part1Port,
 			      0x02, XGI330_CRT2Data_1_2);
@@ -1353,7 +1325,7 @@
 	xgifb_reg_set(pVBInfo->P3c4, 0x33, XGI330_SR33);
 
 	if (HwDeviceExtension->jChipType < XG20) {
-		if (XGI_BridgeIsOn(pVBInfo) == 1) {
+		if (xgifb_bridge_is_on(pVBInfo)) {
 			xgifb_reg_set(pVBInfo->Part2Port, 0x00, 0x1C);
 			xgifb_reg_set(pVBInfo->Part4Port,
 				      0x0D, XGI330_CRT2Data_4_D);

diff --git a/drivers/staging/xgifb/vb_setmode.c b/drivers/staging/xgifb/vb_setmode.c
index 3adec3f..fcefe5b 100644
--- a/drivers/staging/xgifb/vb_setmode.c
+++ b/drivers/staging/xgifb/vb_setmode.c

@@ -35,6 +35,9 @@
 	pVBInfo->SR18 = XGI340_SR18;
 	pVBInfo->CR40 = XGI340_cr41;
 
+	if (ChipType < XG20)
+		XGI_GetVBType(pVBInfo);
+
 	/* 310 customization related */
 	if ((pVBInfo->VBType & VB_SIS301LV) || (pVBInfo->VBType & VB_SIS302LV))
 		pVBInfo->LCDCapList = XGI_LCDDLCapList;
@@ -180,66 +183,45 @@
 	tempbx = XGI330_RefIndex[RefreshRateTableIndex + (*i)].ModeID;
 	tempax = 0;
 
-	if (pVBInfo->IF_DEF_LVDS == 0) {
-		if (pVBInfo->VBInfo & SetCRT2ToRAMDAC) {
-			tempax |= SupportRAMDAC2;
+	if (pVBInfo->VBInfo & SetCRT2ToRAMDAC) {
+		tempax |= SupportRAMDAC2;
 
-			if (pVBInfo->VBType & VB_XGI301C)
-				tempax |= SupportCRT2in301C;
-		}
+		if (pVBInfo->VBType & VB_XGI301C)
+			tempax |= SupportCRT2in301C;
+	}
 
-		/* 301b */
-		if (pVBInfo->VBInfo & (SetCRT2ToLCD | XGI_SetCRT2ToLCDA)) {
-			tempax |= SupportLCD;
-
-			if (pVBInfo->LCDResInfo != Panel_1280x1024 &&
-			    pVBInfo->LCDResInfo != Panel_1280x960 &&
-			    (pVBInfo->LCDInfo & LCDNonExpanding) &&
-			    resinfo >= 9)
-				return 0;
-		}
-
-		if (pVBInfo->VBInfo & SetCRT2ToHiVision) { /* for HiTV */
-			tempax |= SupportHiVision;
-			if ((pVBInfo->VBInfo & SetInSlaveMode) &&
-			    ((resinfo == 4) ||
-			     (resinfo == 3 &&
-			      (pVBInfo->SetFlag & TVSimuMode)) ||
-			     (resinfo > 7)))
-					return 0;
-		} else if (pVBInfo->VBInfo & (SetCRT2ToAVIDEO |
-					       SetCRT2ToSVIDEO |
-					       SetCRT2ToSCART |
-					       SetCRT2ToYPbPr525750 |
-					       SetCRT2ToHiVision)) {
-			tempax |= SupportTV;
-
-			if (pVBInfo->VBType & (VB_SIS301B |
-					       VB_SIS302B |
-					       VB_SIS301LV |
-					       VB_SIS302LV |
-					       VB_XGI301C))
-				tempax |= SupportTV1024;
-
-			if (!(pVBInfo->VBInfo & TVSetPAL) &&
-			    (modeflag & NoSupportSimuTV) &&
-			    (pVBInfo->VBInfo & SetInSlaveMode) &&
-			    (!(pVBInfo->VBInfo & SetNotSimuMode)))
-				return 0;
-		}
-	} else if (pVBInfo->VBInfo & SetCRT2ToLCD) { /* for LVDS */
+	/* 301b */
+	if (pVBInfo->VBInfo & (SetCRT2ToLCD | XGI_SetCRT2ToLCDA)) {
 		tempax |= SupportLCD;
 
-		if (resinfo > 0x08)
-			return 0; /* 1024x768 */
+		if (pVBInfo->LCDResInfo != Panel_1280x1024 &&
+		    pVBInfo->LCDResInfo != Panel_1280x960 &&
+		    (pVBInfo->LCDInfo & LCDNonExpanding) &&
+		    resinfo >= 9)
+			return 0;
+	}
 
-		if (pVBInfo->LCDResInfo < Panel_1024x768) {
-			if (resinfo > 0x07)
-				return 0; /* 800x600 */
+	if (pVBInfo->VBInfo & SetCRT2ToHiVision) { /* for HiTV */
+		tempax |= SupportHiVision;
+		if ((pVBInfo->VBInfo & SetInSlaveMode) &&
+		    ((resinfo == 4) ||
+		     (resinfo == 3 && (pVBInfo->SetFlag & TVSimuMode)) ||
+		     (resinfo > 7)))
+			return 0;
+	} else if (pVBInfo->VBInfo & (SetCRT2ToAVIDEO | SetCRT2ToSVIDEO |
+				      SetCRT2ToSCART | SetCRT2ToYPbPr525750 |
+				      SetCRT2ToHiVision)) {
+		tempax |= SupportTV;
 
-			if (resinfo == 0x04)
-				return 0; /* 512x384 */
-		}
+		if (pVBInfo->VBType & (VB_SIS301B | VB_SIS302B | VB_SIS301LV |
+				       VB_SIS302LV | VB_XGI301C))
+			tempax |= SupportTV1024;
+
+		if (!(pVBInfo->VBInfo & TVSetPAL) &&
+		    (modeflag & NoSupportSimuTV) &&
+		    (pVBInfo->VBInfo & SetInSlaveMode) &&
+		    (!(pVBInfo->VBInfo & SetNotSimuMode)))
+			return 0;
 	}
 
 	for (; XGI330_RefIndex[RefreshRateTableIndex + (*i)].ModeID ==
@@ -759,7 +741,6 @@
 
 	xgifb_reg_and_or(pVBInfo->P3d4, 0x07, ~0x42, tempax);
 	data = xgifb_reg_get(pVBInfo->P3d4, 0x07);
-	data &= 0xFF;
 	tempax = 0;
 
 	if (tempbx & 0x04)
@@ -914,16 +895,10 @@
 	unsigned char index, data;
 	unsigned short vclkindex;
 
-	if (pVBInfo->IF_DEF_LVDS == 1) {
-		index = XGI330_RefIndex[RefreshRateTableIndex].Ext_CRTVCLK;
-		data = xgifb_reg_get(pVBInfo->P3c4, 0x31) & 0xCF;
-		xgifb_reg_set(pVBInfo->P3c4, 0x31, data);
-		xgifb_reg_set(pVBInfo->P3c4, 0x2B, XGI_VCLKData[index].SR2B);
-		xgifb_reg_set(pVBInfo->P3c4, 0x2C, XGI_VCLKData[index].SR2C);
-		xgifb_reg_set(pVBInfo->P3c4, 0x2D, 0x01);
-	} else if ((pVBInfo->VBType & (VB_SIS301B | VB_SIS302B | VB_SIS301LV
-			| VB_SIS302LV | VB_XGI301C)) && (pVBInfo->VBInfo
-			& XGI_SetCRT2ToLCDA)) {
+	if ((pVBInfo->IF_DEF_LVDS == 0) &&
+	    (pVBInfo->VBType & (VB_SIS301B | VB_SIS302B | VB_SIS301LV |
+				VB_SIS302LV | VB_XGI301C)) &&
+	    (pVBInfo->VBInfo & XGI_SetCRT2ToLCDA)) {
 		vclkindex = XGI_GetVCLK2Ptr(ModeNo, ModeIdIndex,
 				RefreshRateTableIndex, HwDeviceExtension,
 				pVBInfo);
@@ -1448,8 +1423,6 @@
 	Index = XGI_GetLCDCapPtr(pVBInfo);
 	*HSyncWidth = pVBInfo->LCDCapList[Index].LCD_HSyncWidth;
 	*VSyncWidth = pVBInfo->LCDCapList[Index].LCD_VSyncWidth;
-
-	return;
 }
 
 static void XGI_SetLVDSRegs(unsigned short ModeNo, unsigned short ModeIdIndex,
@@ -1589,10 +1562,8 @@
 	xgifb_reg_and_or(pVBInfo->Part1Port, 0x1a, 0x07,
 				tempax);
 
-	tempcx = pVBInfo->VGAVT;
 	tempbx = pVBInfo->VDE;
 	tempax = pVBInfo->VGAVDE;
-	tempcx -= tempax;
 
 	temp = tempax; /* 0430 ylshieh */
 	temp1 = (temp << 18) / tempbx;
@@ -1712,7 +1683,6 @@
 			*di_1 = pVBInfo->LCDCapList[index].LCDA_VCLKData2;
 		}
 	}
-	return;
 }
 
 static unsigned char XGI_GetVCLKPtr(unsigned short RefreshRateTableIndex,
@@ -1907,8 +1877,6 @@
 
 		if (!(pVBInfo->SetFlag & ReserveTVOption))
 			xgifb_reg_set(pVBInfo->P3d4, 0x3e, tempch);
-	} else {
-		return;
 	}
 }
 
@@ -1916,9 +1884,6 @@
 {
 	unsigned short flag, tempbx, tempah;
 
-	if (pVBInfo->IF_DEF_LVDS != 0)
-		return;
-
 	tempbx = VB_SIS302B;
 	flag = xgifb_reg_get(pVBInfo->Part4Port, 0x00);
 	if (flag == 0x02)
@@ -1995,37 +1960,23 @@
 		}
 	}
 
-	if (pVBInfo->IF_DEF_YPbPr == 1) {
-		if (pVBInfo->VBType & (VB_SIS301LV|VB_SIS302LV|VB_XGI301C)) {
-			if (temp & SetYPbPr) {
-				if (pVBInfo->IF_DEF_HiVision == 1) {
-					/* shampoo add for new scratch */
-					temp = xgifb_reg_get(pVBInfo->P3d4,
-							     0x35);
-					temp &= YPbPrMode;
-					tempbx |= SetCRT2ToHiVision;
+	if (pVBInfo->VBType & (VB_SIS301LV|VB_SIS302LV|VB_XGI301C)) {
+		if (temp & SetYPbPr) {
+			/* shampoo add for new scratch */
+			temp = xgifb_reg_get(pVBInfo->P3d4, 0x35);
+			temp &= YPbPrMode;
+			tempbx |= SetCRT2ToHiVision;
 
-					if (temp != YPbPrMode1080i) {
-						tempbx &= (~SetCRT2ToHiVision);
-						tempbx |= SetCRT2ToYPbPr525750;
-					}
-				}
+			if (temp != YPbPrMode1080i) {
+				tempbx &= (~SetCRT2ToHiVision);
+				tempbx |= SetCRT2ToYPbPr525750;
 			}
 		}
 	}
 
 	tempax = push; /* restore CR31 */
 
-	if (pVBInfo->IF_DEF_YPbPr == 1) {
-		if (pVBInfo->IF_DEF_HiVision == 1)
-			temp = 0x09FC;
-		else
-			temp = 0x097C;
-	} else if (pVBInfo->IF_DEF_HiVision == 1) {
-		temp = 0x01FC;
-	} else {
-		temp = 0x017C;
-	}
+	temp = 0x09FC;
 
 	if (!(tempbx & temp)) {
 		tempax |= DisableCRT2Display;
@@ -2046,15 +1997,10 @@
 	/* shampoo add */
 	/* for driver abnormal */
 	if (!(tempbx & (SwitchCRT2 | SetSimuScanMode))) {
-		if (pVBInfo->IF_DEF_CRT2Monitor == 1) {
-			if (tempbx & SetCRT2ToRAMDAC) {
-				tempbx &= (0xFF00 | SetCRT2ToRAMDAC |
-					   SwitchCRT2 | SetSimuScanMode);
-				tempbx &= (0x00FF | (~SetCRT2ToYPbPr525750));
-			}
-		} else {
-			tempbx &= (~(SetCRT2ToRAMDAC | SetCRT2ToLCD |
-				     SetCRT2ToTV));
+		if (tempbx & SetCRT2ToRAMDAC) {
+			tempbx &= (0xFF00 | SetCRT2ToRAMDAC |
+				   SwitchCRT2 | SetSimuScanMode);
+			tempbx &= (0x00FF | (~SetCRT2ToYPbPr525750));
 		}
 	}
 
@@ -2072,16 +2018,12 @@
 		tempbx &= (0x00FF | (~SetCRT2ToYPbPr525750));
 	}
 
-	if (pVBInfo->IF_DEF_YPbPr == 1) {
-		if (tempbx & SetCRT2ToYPbPr525750)
-			tempbx &= (0xFF00 | SwitchCRT2 | SetSimuScanMode);
-	}
+	if (tempbx & SetCRT2ToYPbPr525750)
+		tempbx &= (0xFF00 | SwitchCRT2 | SetSimuScanMode);
 
-	if (pVBInfo->IF_DEF_HiVision == 1) {
-		if (tempbx & SetCRT2ToHiVision)
-			tempbx &= (0xFF00 | SetCRT2ToHiVision | SwitchCRT2 |
-				   SetSimuScanMode);
-	}
+	if (tempbx & SetCRT2ToHiVision)
+		tempbx &= (0xFF00 | SetCRT2ToHiVision | SwitchCRT2 |
+			   SetSimuScanMode);
 
 	if (tempax & DisableCRT2Display) { /* Set Display Device Info */
 		if (!(tempbx & (SwitchCRT2 | SetSimuScanMode)))
@@ -2132,25 +2074,21 @@
 		if (pVBInfo->VBInfo & SetCRT2ToSCART)
 			tempbx |= TVSetPAL;
 
-		if (pVBInfo->IF_DEF_YPbPr == 1) {
-			if (pVBInfo->VBInfo & SetCRT2ToYPbPr525750) {
-				index1 = xgifb_reg_get(pVBInfo->P3d4, 0x35);
-				index1 &= YPbPrMode;
+		if (pVBInfo->VBInfo & SetCRT2ToYPbPr525750) {
+			index1 = xgifb_reg_get(pVBInfo->P3d4, 0x35);
+			index1 &= YPbPrMode;
 
-				if (index1 == YPbPrMode525i)
-					tempbx |= TVSetYPbPr525i;
+			if (index1 == YPbPrMode525i)
+				tempbx |= TVSetYPbPr525i;
 
-				if (index1 == YPbPrMode525p)
-					tempbx = tempbx | TVSetYPbPr525p;
-				if (index1 == YPbPrMode750p)
-					tempbx = tempbx | TVSetYPbPr750p;
-			}
+			if (index1 == YPbPrMode525p)
+				tempbx = tempbx | TVSetYPbPr525p;
+			if (index1 == YPbPrMode750p)
+				tempbx = tempbx | TVSetYPbPr750p;
 		}
 
-		if (pVBInfo->IF_DEF_HiVision == 1) {
-			if (pVBInfo->VBInfo & SetCRT2ToHiVision)
-				tempbx = tempbx | TVSetHiVision | TVSetPAL;
-		}
+		if (pVBInfo->VBInfo & SetCRT2ToHiVision)
+			tempbx = tempbx | TVSetHiVision | TVSetPAL;
 
 		if ((pVBInfo->VBInfo & SetInSlaveMode) &&
 		    (!(pVBInfo->VBInfo & SetNotSimuMode)))
@@ -2657,10 +2595,7 @@
 					tempbx = 775;
 				else if (pVBInfo->VGAVDE == 600)
 					tempbx = 775;
-				else
-					tempbx = 768;
-			} else
-				tempbx = 768;
+			}
 		} else if (pVBInfo->LCDResInfo == Panel_1024x768x75) {
 			tempax = 1024;
 			tempbx = 768;
@@ -2784,7 +2719,6 @@
 
 		pVBInfo->HT = tempax;
 		pVBInfo->VT = tempbx;
-		return;
 	}
 }
 
@@ -3015,9 +2949,6 @@
 	temp |= ((tempcx & 0xFF00) >> 8);
 	xgifb_reg_set(pVBInfo->Part1Port, 0x12, temp);
 
-	tempax = pVBInfo->VGAVDE;
-	tempbx = pVBInfo->VGAVDE;
-	tempcx = pVBInfo->VGAVT;
 	/* BTVGA2VRS 0x10,0x11 */
 	tempbx = (pVBInfo->VGAVT + pVBInfo->VGAVDE) >> 1;
 	/* BTVGA2VRE 0x11 */
@@ -3178,7 +3109,7 @@
 	if (pVBInfo->VBInfo & SetCRT2ToTV) {
 		if (pVBInfo->TVInfo & TVSimuMode) {
 			if (ModeNo == 0x50) {
-				if (pVBInfo->TVInfo & SetNTSCTV) {
+				if (pVBInfo->TVInfo == SetNTSCTV) {
 					xgifb_reg_set(pVBInfo->Part1Port,
 							0x07, 0x30);
 					xgifb_reg_set(pVBInfo->Part1Port,
@@ -3226,7 +3157,6 @@
 		}
 	}
 	tempbx--;
-	temp = tempbx & 0x00FF;
 	tempbx--;
 	temp = tempbx & 0x00FF;
 	/* 0x10 vertical Blank Start */
@@ -3361,8 +3291,6 @@
 		temp = 0x00;
 
 	xgifb_reg_set(pVBInfo->Part1Port, 0x1A, temp); /* 0x1A SR0E */
-
-	return;
 }
 
 static void XGI_SetGroup2(unsigned short ModeNo, unsigned short ModeIdIndex,
@@ -3445,9 +3373,6 @@
 	temp &= 0x80;
 	xgifb_reg_and_or(pVBInfo->Part2Port, 0x0A, 0xFF, temp);
 
-	if (pVBInfo->VBInfo & SetCRT2ToHiVision)
-		tempax = 950;
-
 	if (pVBInfo->TVInfo & TVSetPAL)
 		tempax = 520;
 	else
@@ -3797,9 +3722,6 @@
 		if (!(pVBInfo->VBInfo & SetInSlaveMode))
 			xgifb_reg_set(pVBInfo->Part2Port, 0x0B, 0x00);
 	}
-
-	if (pVBInfo->VBInfo & SetCRT2ToTV)
-		return;
 }
 
 static void XGI_SetLCDRegs(unsigned short ModeNo, unsigned short ModeIdIndex,
@@ -4135,8 +4057,7 @@
 				xgifb_reg_set(pVBInfo->Part3Port, 0x28, 0x3f);
 		}
 	}
-	return;
-} /* {end of XGI_SetGroup3} */
+}
 
 static void XGI_SetGroup4(unsigned short ModeNo, unsigned short ModeIdIndex,
 		unsigned short RefreshRateTableIndex,
@@ -4211,11 +4132,6 @@
 
 	tempebx = pVBInfo->VDE;
 
-	if (tempcx & SetCRT2ToHiVision) {
-		if (!(temp & 0xE000))
-			tempbx = tempbx >> 1;
-	}
-
 	tempcx = pVBInfo->RVBHRS;
 	temp = tempcx & 0x00FF;
 	xgifb_reg_set(pVBInfo->Part4Port, 0x18, temp);
@@ -4325,13 +4241,6 @@
 			XGINew_EnableCRT2(pVBInfo);
 		}
 	}
-	return;
-}
-
-static void XGI_EnableGatingCRT(struct xgi_hw_device_info *HwDeviceExtension,
-		struct vb_device_info *pVBInfo)
-{
-	xgifb_reg_and_or(pVBInfo->P3d4, 0x63, 0xBF, 0x40);
 }
 
 static void XGI_DisableGatingCRT(struct xgi_hw_device_info *HwDeviceExtension,
@@ -4592,38 +4501,6 @@
 	return 0;
 }
 
-/* --------------------------------------------------------------------- */
-/* Function : XGI_EnableChISLCD */
-/* Input : */
-/* Output : 0 -> Not LCD mode */
-/* Description : if bool enable = true -> enable, else disable  */
-/* --------------------------------------------------------------------- */
-static unsigned char XGI_EnableChISLCD(struct vb_device_info *pVBInfo,
-	bool enable)
-{
-	unsigned short tempbx, tempah;
-
-	if (enable)
-		tempbx = pVBInfo->SetFlag & (EnableChA | EnableChB);
-	else
-		tempbx = pVBInfo->SetFlag & (DisableChA | DisableChB);
-
-	tempah = ~((unsigned short) xgifb_reg_get(pVBInfo->Part1Port, 0x2E));
-
-	if (tempbx & (EnableChA | DisableChA)) {
-		if (!(tempah & 0x08)) /* Chk LCDA Mode */
-			return 0;
-	}
-
-	if (!(tempbx & (EnableChB | DisableChB)))
-		return 0;
-
-	if (tempah & 0x01) /* Chk LCDB Mode */
-		return 1;
-
-	return 0;
-}
-
 static void XGI_DisableBridge(struct xgifb_video_info *xgifb_info,
 		struct xgi_hw_device_info *HwDeviceExtension,
 		struct vb_device_info *pVBInfo)
@@ -4636,21 +4513,8 @@
 		if (!(pVBInfo->VBInfo &
 		    (DisableCRT2Display | SetSimuScanMode))) {
 			if (pVBInfo->VBInfo & XGI_SetCRT2ToLCDA) {
-				if (pVBInfo->VBInfo & SetCRT2ToDualEdge) {
+				if (pVBInfo->VBInfo & SetCRT2ToDualEdge)
 					tempah = 0x7F; /* Disable Channel A */
-					if (!(pVBInfo->VBInfo &
-					      XGI_SetCRT2ToLCDA))
-						/* Disable Channel B */
-						tempah = 0xBF;
-
-					if (pVBInfo->SetFlag & DisableChB)
-						/* force to disable Cahnnel */
-						tempah &= 0xBF;
-
-					if (pVBInfo->SetFlag & DisableChA)
-						/* Force to disable Channel B */
-						tempah &= 0x7F;
-				}
 			}
 		}
 
@@ -4660,26 +4524,18 @@
 		if (pVBInfo->VBType & (VB_SIS302LV | VB_XGI301C)) {
 			if (((pVBInfo->VBInfo &
 			      (SetCRT2ToLCD | XGI_SetCRT2ToLCDA))) ||
-				(XGI_EnableChISLCD(pVBInfo, false)) ||
 				(XGI_IsLCDON(pVBInfo)))
 				/* LVDS Driver power down */
 				xgifb_reg_or(pVBInfo->Part4Port, 0x30, 0x80);
 		}
 
-		if ((pVBInfo->SetFlag & DisableChA) || (pVBInfo->VBInfo
-				& (DisableCRT2Display | XGI_SetCRT2ToLCDA
-						| SetSimuScanMode))) {
-			if (pVBInfo->SetFlag & GatingCRT)
-				XGI_EnableGatingCRT(HwDeviceExtension, pVBInfo);
+		if (pVBInfo->VBInfo & (DisableCRT2Display | XGI_SetCRT2ToLCDA |
+				       SetSimuScanMode))
 			XGI_DisplayOff(xgifb_info, HwDeviceExtension, pVBInfo);
-		}
 
-		if (pVBInfo->VBInfo & XGI_SetCRT2ToLCDA) {
-			if ((pVBInfo->SetFlag & DisableChA) || (pVBInfo->VBInfo
-					& XGI_SetCRT2ToLCDA))
-				/* Power down */
-				xgifb_reg_and(pVBInfo->Part1Port, 0x1e, 0xdf);
-		}
+		if (pVBInfo->VBInfo & XGI_SetCRT2ToLCDA)
+			/* Power down */
+			xgifb_reg_and(pVBInfo->Part1Port, 0x1e, 0xdf);
 
 		/* disable TV as primary VGA swap */
 		xgifb_reg_and(pVBInfo->P3c4, 0x32, 0xdf);
@@ -4687,16 +4543,14 @@
 		if ((pVBInfo->VBInfo & (SetSimuScanMode | SetCRT2ToDualEdge)))
 			xgifb_reg_and(pVBInfo->Part2Port, 0x00, 0xdf);
 
-		if ((pVBInfo->SetFlag & DisableChB) ||
-		    (pVBInfo->VBInfo &
+		if ((pVBInfo->VBInfo &
 			(DisableCRT2Display | SetSimuScanMode)) ||
 		    ((!(pVBInfo->VBInfo & XGI_SetCRT2ToLCDA)) &&
 		    (pVBInfo->VBInfo &
 			(SetCRT2ToRAMDAC | SetCRT2ToLCD | SetCRT2ToTV))))
 			xgifb_reg_or(pVBInfo->Part1Port, 0x00, 0x80);
 
-		if ((pVBInfo->SetFlag & DisableChB) ||
-		    (pVBInfo->VBInfo &
+		if ((pVBInfo->VBInfo &
 			(DisableCRT2Display | SetSimuScanMode)) ||
 		    (!(pVBInfo->VBInfo & XGI_SetCRT2ToLCDA)) ||
 		    (pVBInfo->VBInfo &
@@ -5308,21 +5162,6 @@
 
 }
 
-unsigned char XGI_BridgeIsOn(struct vb_device_info *pVBInfo)
-{
-	unsigned short flag;
-
-	if (pVBInfo->IF_DEF_LVDS == 1) {
-		return 1;
-	} else {
-		flag = xgifb_reg_get(pVBInfo->Part4Port, 0x00);
-		if ((flag == 1) || (flag == 2))
-			return 1; /* 301b */
-		else
-			return 0;
-	}
-}
-
 unsigned short XGI_GetRatePtrCRT2(struct xgi_hw_device_info *pXGIHWDE,
 		unsigned short ModeNo, unsigned short ModeIdIndex,
 		struct vb_device_info *pVBInfo)
@@ -5344,15 +5183,10 @@
 
 	if (pVBInfo->SetFlag & ProgrammingCRT2) {
 		if (pVBInfo->VBInfo & (SetCRT2ToLCD | XGI_SetCRT2ToLCDA)) {
-			if (pVBInfo->IF_DEF_LVDS == 0) {
-				temp = LCDARefreshIndex[
-					pVBInfo->LCDResInfo & 0x07];
+			temp = LCDARefreshIndex[pVBInfo->LCDResInfo & 0x07];
 
-				if (index > temp)
-					index = temp;
-			} else {
-				index = 0;
-			}
+			if (index > temp)
+				index = temp;
 		}
 	}
 
@@ -5555,53 +5389,37 @@
 
 	if (pVBInfo->VBType & (VB_SIS301B | VB_SIS302B | VB_SIS301LV
 			| VB_SIS302LV | VB_XGI301C)) {
-		if (!(pVBInfo->SetFlag & DisableChA)) {
-			if ((pVBInfo->SetFlag & EnableChA) ||
-			    (pVBInfo->VBInfo & SetCRT2ToDualEdge)) {
-				/* Power on */
-				xgifb_reg_set(pVBInfo->Part1Port, 0x1E, 0x20);
+		if (pVBInfo->VBInfo & SetCRT2ToDualEdge)
+			/* Power on */
+			xgifb_reg_set(pVBInfo->Part1Port, 0x1E, 0x20);
+
+		if (pVBInfo->VBInfo & (SetCRT2ToLCD | SetCRT2ToTV |
+				       SetCRT2ToRAMDAC)) {
+			tempah = xgifb_reg_get(pVBInfo->P3c4, 0x32);
+			tempah &= 0xDF;
+			if (pVBInfo->VBInfo & SetInSlaveMode) {
+				if (!(pVBInfo->VBInfo & SetCRT2ToRAMDAC))
+					tempah |= 0x20;
 			}
+			xgifb_reg_set(pVBInfo->P3c4, 0x32, tempah);
+			xgifb_reg_or(pVBInfo->P3c4, 0x1E, 0x20);
+
+			tempah = xgifb_reg_get(pVBInfo->Part1Port, 0x2E);
+
+			if (!(tempah & 0x80))
+				xgifb_reg_or(pVBInfo->Part1Port, 0x2E, 0x80);
+			xgifb_reg_and(pVBInfo->Part1Port, 0x00, 0x7F);
 		}
 
-		if (!(pVBInfo->SetFlag & DisableChB)) {
-			if ((pVBInfo->SetFlag & EnableChB) || (pVBInfo->VBInfo
-					& (SetCRT2ToLCD | SetCRT2ToTV
-							| SetCRT2ToRAMDAC))) {
-				tempah = xgifb_reg_get(pVBInfo->P3c4, 0x32);
-				tempah &= 0xDF;
-				if (pVBInfo->VBInfo & SetInSlaveMode) {
-					if (!(pVBInfo->VBInfo &
-					      SetCRT2ToRAMDAC))
-						tempah |= 0x20;
-				}
-				xgifb_reg_set(pVBInfo->P3c4, 0x32, tempah);
-				xgifb_reg_or(pVBInfo->P3c4, 0x1E, 0x20);
-
-				tempah = xgifb_reg_get(pVBInfo->Part1Port,
-						       0x2E);
-
-				if (!(tempah & 0x80))
-					xgifb_reg_or(pVBInfo->Part1Port,
-							0x2E, 0x80);
-				xgifb_reg_and(pVBInfo->Part1Port, 0x00, 0x7F);
-			}
-		}
-
-		if ((pVBInfo->SetFlag & (EnableChA | EnableChB))
-				|| (!(pVBInfo->VBInfo & DisableCRT2Display))) {
+		if (!(pVBInfo->VBInfo & DisableCRT2Display)) {
 			xgifb_reg_and_or(pVBInfo->Part2Port, 0x00, ~0xE0,
 					0x20); /* shampoo 0129 */
 			if (pVBInfo->VBType & (VB_SIS302LV | VB_XGI301C)) {
-				if (!XGI_EnableChISLCD(pVBInfo, false)) {
-					if (XGI_EnableChISLCD(pVBInfo, true) ||
-					    (pVBInfo->VBInfo &
-					    (SetCRT2ToLCD | XGI_SetCRT2ToLCDA)))
-						/* LVDS PLL power on */
-						xgifb_reg_and(
-							pVBInfo->Part4Port,
-							0x2A,
-							0x7F);
-				}
+				if (pVBInfo->VBInfo &
+					(SetCRT2ToLCD | XGI_SetCRT2ToLCDA))
+					/* LVDS PLL power on */
+					xgifb_reg_and(pVBInfo->Part4Port, 0x2A,
+						      0x7F);
 				/* LVDS Driver power on */
 				xgifb_reg_and(pVBInfo->Part4Port, 0x30, 0x7F);
 			}
@@ -5618,32 +5436,14 @@
 				tempah = tempah & 0x40;
 				if (pVBInfo->VBInfo & XGI_SetCRT2ToLCDA)
 					tempah = tempah ^ 0xC0;
-
-				if (pVBInfo->SetFlag & DisableChB)
-					tempah &= 0xBF;
-
-				if (pVBInfo->SetFlag &  DisableChA)
-					tempah &= 0x7F;
-
-				if (pVBInfo->SetFlag &  EnableChB)
-					tempah |= 0x40;
-
-				if (pVBInfo->SetFlag &  EnableChA)
-					tempah |= 0x80;
 			}
 		}
 
 		/* EnablePart4_1F */
 		xgifb_reg_or(pVBInfo->Part4Port, 0x1F, tempah);
 
-		if (!(pVBInfo->SetFlag & DisableChA)) {
-			if (!(pVBInfo->SetFlag & GatingCRT)) {
-				XGI_DisableGatingCRT(HwDeviceExtension,
-						     pVBInfo);
-				XGI_DisplayOn(xgifb_info, HwDeviceExtension,
-						pVBInfo);
-			}
-		}
+		XGI_DisableGatingCRT(HwDeviceExtension, pVBInfo);
+		XGI_DisplayOn(xgifb_info, HwDeviceExtension, pVBInfo);
 	} /* 301 */
 	else { /* LVDS */
 		if (pVBInfo->VBInfo & (SetCRT2ToTV | SetCRT2ToLCD
@@ -5745,16 +5545,8 @@
 	struct vb_device_info *pVBInfo = &VBINF;
 	pVBInfo->IF_DEF_LVDS = 0;
 
-	if (HwDeviceExtension->jChipType >= XG20) {
-		pVBInfo->IF_DEF_YPbPr = 0;
-		pVBInfo->IF_DEF_HiVision = 0;
-		pVBInfo->IF_DEF_CRT2Monitor = 0;
+	if (HwDeviceExtension->jChipType >= XG20)
 		pVBInfo->VBType = 0; /*set VBType default 0*/
-	} else {
-		pVBInfo->IF_DEF_YPbPr = 1;
-		pVBInfo->IF_DEF_HiVision = 1;
-		pVBInfo->IF_DEF_CRT2Monitor = 1;
-	}
 
 	XGIRegInit(pVBInfo, xgifb_info->vga_base);
 
@@ -5770,9 +5562,6 @@
 		}
 	}
 
-	if (HwDeviceExtension->jChipType < XG20)
-		XGI_GetVBType(pVBInfo);
-
 	InitTo330Pointer(HwDeviceExtension->jChipType, pVBInfo);
 	if (ModeNo & 0x80)
 		ModeNo = ModeNo & 0x7F;

diff --git a/drivers/staging/xgifb/vb_setmode.h b/drivers/staging/xgifb/vb_setmode.h
index 5524828..2c0a31c 100644
--- a/drivers/staging/xgifb/vb_setmode.h
+++ b/drivers/staging/xgifb/vb_setmode.h

@@ -18,7 +18,6 @@
 extern unsigned char XGI_SearchModeID(unsigned short ModeNo,
 				      unsigned short *ModeIdIndex,
 				      struct vb_device_info *);
-extern unsigned char XGI_BridgeIsOn(struct vb_device_info *);
 extern unsigned short XGI_GetRatePtrCRT2(struct xgi_hw_device_info *pXGIHWDE,
 					 unsigned short ModeNo,
 					 unsigned short ModeIdIndex,

diff --git a/drivers/staging/zram/Makefile b/drivers/staging/zram/Makefile
index 7f4a301..cb0f9ce 100644
--- a/drivers/staging/zram/Makefile
+++ b/drivers/staging/zram/Makefile

@@ -1,3 +1,3 @@
-zram-y	:=	zram_drv.o zram_sysfs.o
+zram-y	:=	zram_drv.o
 
 obj-$(CONFIG_ZRAM)	+=	zram.o

diff --git a/drivers/staging/zram/zram_drv.c b/drivers/staging/zram/zram_drv.c
index e34e3fe0..82c7202 100644
--- a/drivers/staging/zram/zram_drv.c
+++ b/drivers/staging/zram/zram_drv.c

@@ -37,28 +37,107 @@
 
 /* Globals */
 static int zram_major;
-struct zram *zram_devices;
+static struct zram *zram_devices;
 
 /* Module params (documentation at end) */
 static unsigned int num_devices = 1;
 
-static void zram_stat64_add(struct zram *zram, u64 *v, u64 inc)
+static inline struct zram *dev_to_zram(struct device *dev)
 {
-	spin_lock(&zram->stat64_lock);
-	*v = *v + inc;
-	spin_unlock(&zram->stat64_lock);
+	return (struct zram *)dev_to_disk(dev)->private_data;
 }
 
-static void zram_stat64_sub(struct zram *zram, u64 *v, u64 dec)
+static ssize_t disksize_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
 {
-	spin_lock(&zram->stat64_lock);
-	*v = *v - dec;
-	spin_unlock(&zram->stat64_lock);
+	struct zram *zram = dev_to_zram(dev);
+
+	return sprintf(buf, "%llu\n", zram->disksize);
 }
 
-static void zram_stat64_inc(struct zram *zram, u64 *v)
+static ssize_t initstate_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
 {
-	zram_stat64_add(zram, v, 1);
+	struct zram *zram = dev_to_zram(dev);
+
+	return sprintf(buf, "%u\n", zram->init_done);
+}
+
+static ssize_t num_reads_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct zram *zram = dev_to_zram(dev);
+
+	return sprintf(buf, "%llu\n",
+			(u64)atomic64_read(&zram->stats.num_reads));
+}
+
+static ssize_t num_writes_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct zram *zram = dev_to_zram(dev);
+
+	return sprintf(buf, "%llu\n",
+			(u64)atomic64_read(&zram->stats.num_writes));
+}
+
+static ssize_t invalid_io_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct zram *zram = dev_to_zram(dev);
+
+	return sprintf(buf, "%llu\n",
+			(u64)atomic64_read(&zram->stats.invalid_io));
+}
+
+static ssize_t notify_free_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct zram *zram = dev_to_zram(dev);
+
+	return sprintf(buf, "%llu\n",
+			(u64)atomic64_read(&zram->stats.notify_free));
+}
+
+static ssize_t zero_pages_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct zram *zram = dev_to_zram(dev);
+
+	return sprintf(buf, "%u\n", zram->stats.pages_zero);
+}
+
+static ssize_t orig_data_size_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct zram *zram = dev_to_zram(dev);
+
+	return sprintf(buf, "%llu\n",
+		(u64)(zram->stats.pages_stored) << PAGE_SHIFT);
+}
+
+static ssize_t compr_data_size_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct zram *zram = dev_to_zram(dev);
+
+	return sprintf(buf, "%llu\n",
+			(u64)atomic64_read(&zram->stats.compr_size));
+}
+
+static ssize_t mem_used_total_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	u64 val = 0;
+	struct zram *zram = dev_to_zram(dev);
+	struct zram_meta *meta = zram->meta;
+
+	down_read(&zram->init_lock);
+	if (zram->init_done)
+		val = zs_get_total_size_bytes(meta->mem_pool);
+	up_read(&zram->init_lock);
+
+	return sprintf(buf, "%llu\n", val);
 }
 
 static int zram_test_flag(struct zram_meta *meta, u32 index,
@@ -79,6 +158,97 @@
 	meta->table[index].flags &= ~BIT(flag);
 }
 
+/* Return nonzero when @bvec's length is anything other than PAGE_SIZE. */
+static inline int is_partial_io(struct bio_vec *bvec)
+{
+	return !(bvec->bv_len == PAGE_SIZE);
+}
+
+/*
+ * Validate @bio against the device: the start sector and the byte count
+ * must both be multiples of the zram logical block, and the sector range
+ * must lie inside the disk.  Returns 1 for a valid request, 0 otherwise.
+ */
+static inline int valid_io_request(struct zram *zram, struct bio *bio)
+{
+	u64 first, last, limit;
+
+	/* unaligned start sector */
+	if (unlikely(bio->bi_sector & (ZRAM_SECTOR_PER_LOGICAL_BLOCK - 1)))
+		return 0;
+	/* unaligned length */
+	if (unlikely(bio->bi_size & (ZRAM_LOGICAL_BLOCK_SIZE - 1)))
+		return 0;
+
+	first = bio->bi_sector;
+	last = first + (bio->bi_size >> SECTOR_SHIFT);
+	limit = zram->disksize >> SECTOR_SHIFT;
+
+	/* out of range, or the end wrapped around below the start */
+	if (unlikely(first >= limit || last > limit || first > last))
+		return 0;
+
+	/* I/O request is valid */
+	return 1;
+}
+
+/*
+ * Release everything zram_meta_alloc() set up: the zsmalloc pool, the
+ * address table, the compression buffer and work memory, and finally
+ * @meta itself.  @meta must not be NULL.
+ */
+static void zram_meta_free(struct zram_meta *meta)
+{
+	zs_destroy_pool(meta->mem_pool);
+	vfree(meta->table);
+	free_pages((unsigned long)meta->compress_buffer, 1);
+	kfree(meta->compress_workmem);
+	kfree(meta);
+}
+
+/*
+ * Allocate the metadata for a device of @disksize bytes: the LZO work
+ * memory, an order-1 compression buffer, a zeroed table with one entry per
+ * page of disk, and a zsmalloc pool.
+ *
+ * Returns the new zram_meta, or NULL if any step fails; whatever had been
+ * allocated up to that point is released first.
+ */
+static struct zram_meta *zram_meta_alloc(u64 disksize)
+{
+	struct zram_meta *meta;
+	size_t nr_entries;
+
+	meta = kmalloc(sizeof(*meta), GFP_KERNEL);
+	if (!meta)
+		return NULL;
+
+	meta->compress_workmem = kzalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL);
+	if (!meta->compress_workmem)
+		goto err_meta;
+
+	meta->compress_buffer =
+		(void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, 1);
+	if (!meta->compress_buffer) {
+		pr_err("Error allocating compressor buffer space\n");
+		goto err_workmem;
+	}
+
+	nr_entries = disksize >> PAGE_SHIFT;
+	meta->table = vzalloc(nr_entries * sizeof(*meta->table));
+	if (!meta->table) {
+		pr_err("Error allocating zram address table\n");
+		goto err_buffer;
+	}
+
+	meta->mem_pool = zs_create_pool(GFP_NOIO | __GFP_HIGHMEM);
+	if (meta->mem_pool)
+		return meta;
+
+	/* pool creation failed: unwind in reverse order of allocation */
+	pr_err("Error creating memory pool\n");
+	vfree(meta->table);
+err_buffer:
+	free_pages((unsigned long)meta->compress_buffer, 1);
+err_workmem:
+	kfree(meta->compress_workmem);
+err_meta:
+	kfree(meta);
+	return NULL;
+}
+
+/*
+ * Advance the (*index, *offset) cursor by the length of @bvec: *index is
+ * bumped once when the segment reaches or crosses the end of the current
+ * page, and *offset becomes the position within the page.
+ */
+static void update_position(u32 *index, int *offset, struct bio_vec *bvec)
+{
+	unsigned int end = *offset + bvec->bv_len;
+
+	if (end >= PAGE_SIZE)
+		(*index)++;
+	*offset = end % PAGE_SIZE;
+}
+
 static int page_zero_filled(void *ptr)
 {
 	unsigned int pos;
@@ -94,6 +264,21 @@
 	return 1;
 }
 
+/*
+ * Zero the data described by @bvec: the whole page when the segment is a
+ * full page, otherwise only bv_len bytes starting at bv_offset.  The page
+ * is mapped with kmap_atomic() for the write and flushed afterwards.
+ */
+static void handle_zero_page(struct bio_vec *bvec)
+{
+	struct page *page = bvec->bv_page;
+	void *kaddr = kmap_atomic(page);
+
+	if (!is_partial_io(bvec))
+		clear_page(kaddr);
+	else
+		memset(kaddr + bvec->bv_offset, 0, bvec->bv_len);
+
+	kunmap_atomic(kaddr);
+	flush_dcache_page(page);
+}
+
 static void zram_free_page(struct zram *zram, size_t index)
 {
 	struct zram_meta *meta = zram->meta;
@@ -120,31 +305,13 @@
 	if (size <= PAGE_SIZE / 2)
 		zram->stats.good_compress--;
 
-	zram_stat64_sub(zram, &zram->stats.compr_size,
-			meta->table[index].size);
+	atomic64_sub(meta->table[index].size, &zram->stats.compr_size);
 	zram->stats.pages_stored--;
 
 	meta->table[index].handle = 0;
 	meta->table[index].size = 0;
 }
 
-static void handle_zero_page(struct bio_vec *bvec)
-{
-	struct page *page = bvec->bv_page;
-	void *user_mem;
-
-	user_mem = kmap_atomic(page);
-	memset(user_mem + bvec->bv_offset, 0, bvec->bv_len);
-	kunmap_atomic(user_mem);
-
-	flush_dcache_page(page);
-}
-
-static inline int is_partial_io(struct bio_vec *bvec)
-{
-	return bvec->bv_len != PAGE_SIZE;
-}
-
 static int zram_decompress_page(struct zram *zram, char *mem, u32 index)
 {
 	int ret = LZO_E_OK;
@@ -154,13 +321,13 @@
 	unsigned long handle = meta->table[index].handle;
 
 	if (!handle || zram_test_flag(meta, index, ZRAM_ZERO)) {
-		memset(mem, 0, PAGE_SIZE);
+		clear_page(mem);
 		return 0;
 	}
 
 	cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_RO);
 	if (meta->table[index].size == PAGE_SIZE)
-		memcpy(mem, cmem, PAGE_SIZE);
+		copy_page(mem, cmem);
 	else
 		ret = lzo1x_decompress_safe(cmem, meta->table[index].size,
 						mem, &clen);
@@ -169,7 +336,7 @@
 	/* Should NEVER happen. Return bio error if it does. */
 	if (unlikely(ret != LZO_E_OK)) {
 		pr_err("Decompression failed! err=%d, page=%u\n", ret, index);
-		zram_stat64_inc(zram, &zram->stats.failed_reads);
+		atomic64_inc(&zram->stats.failed_reads);
 		return ret;
 	}
 
@@ -272,8 +439,6 @@
 
 	if (page_zero_filled(uncmem)) {
 		kunmap_atomic(user_mem);
-		if (is_partial_io(bvec))
-			kfree(uncmem);
 		zram->stats.pages_zero++;
 		zram_set_flag(meta, index, ZRAM_ZERO);
 		ret = 0;
@@ -304,18 +469,20 @@
 
 	handle = zs_malloc(meta->mem_pool, clen);
 	if (!handle) {
-		pr_info("Error allocating memory for compressed "
-			"page: %u, size=%zu\n", index, clen);
+		pr_info("Error allocating memory for compressed page: %u, size=%zu\n",
+			index, clen);
 		ret = -ENOMEM;
 		goto out;
 	}
 	cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_WO);
 
-	if ((clen == PAGE_SIZE) && !is_partial_io(bvec))
+	if ((clen == PAGE_SIZE) && !is_partial_io(bvec)) {
 		src = kmap_atomic(page);
-	memcpy(cmem, src, clen);
-	if ((clen == PAGE_SIZE) && !is_partial_io(bvec))
+		copy_page(cmem, src);
 		kunmap_atomic(src);
+	} else {
+		memcpy(cmem, src, clen);
+	}
 
 	zs_unmap_object(meta->mem_pool, handle);
 
@@ -323,7 +490,7 @@
 	meta->table[index].size = clen;
 
 	/* Update stats */
-	zram_stat64_add(zram, &zram->stats.compr_size, clen);
+	atomic64_add(clen, &zram->stats.compr_size);
 	zram->stats.pages_stored++;
 	if (clen <= PAGE_SIZE / 2)
 		zram->stats.good_compress++;
@@ -333,7 +500,7 @@
 		kfree(uncmem);
 
 	if (ret)
-		zram_stat64_inc(zram, &zram->stats.failed_writes);
+		atomic64_inc(&zram->stats.failed_writes);
 	return ret;
 }
 
@@ -355,11 +522,117 @@
 	return ret;
 }
 
-static void update_position(u32 *index, int *offset, struct bio_vec *bvec)
+/*
+ * Tear down an initialized device: free every object still referenced by
+ * the table, release the metadata, zero the statistics and set both
+ * disksize and the disk capacity to 0.  Does nothing when the device is
+ * not initialized.  This function takes no lock itself.
+ */
+static void zram_reset_device(struct zram *zram)
 {
-	if (*offset + bvec->bv_len >= PAGE_SIZE)
-		(*index)++;
-	*offset = (*offset + bvec->bv_len) % PAGE_SIZE;
+	size_t index;
+	struct zram_meta *meta;
+
+	if (!zram->init_done)
+		return;
+
+	meta = zram->meta;
+	/* mark the device uninitialized before its metadata goes away */
+	zram->init_done = 0;
+
+	/* Free all pages that are still in this zram device */
+	for (index = 0; index < zram->disksize >> PAGE_SHIFT; index++) {
+		unsigned long handle = meta->table[index].handle;
+		/* a zero handle means nothing is allocated for this slot */
+		if (!handle)
+			continue;
+
+		zs_free(meta->mem_pool, handle);
+	}
+
+	zram_meta_free(zram->meta);
+	zram->meta = NULL;
+	/* Reset stats */
+	memset(&zram->stats, 0, sizeof(zram->stats));
+
+	zram->disksize = 0;
+	set_capacity(zram->disk, 0);
+}
+
+/*
+ * Attach @meta to @zram and mark the device initialized.  An informational
+ * message is logged first when disksize exceeds twice the total RAM size.
+ */
+static void zram_init_device(struct zram *zram, struct zram_meta *meta)
+{
+	unsigned long ram_bytes = totalram_pages << PAGE_SHIFT;
+
+	if (zram->disksize > 2 * ram_bytes) {
+		pr_info("There is little point creating a zram of greater than "
+			"twice the size of memory since we expect a 2:1 compression "
+			"ratio. Note that zram uses about 0.1%% of the size of "
+			"the disk when not in use so a huge zram is "
+			"wasteful.\n"
+			"\tMemory Size: %lu kB\n"
+			"\tSize you selected: %llu kB\n"
+			"Continuing anyway ...\n",
+			ram_bytes >> 10, zram->disksize >> 10);
+	}
+
+	/* zram devices sort of resembles non-rotational disks */
+	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, zram->disk->queue);
+
+	/* install the metadata, then flag the device as ready */
+	zram->meta = meta;
+	zram->init_done = 1;
+
+	pr_debug("Initialization done!\n");
+}
+
+/*
+ * sysfs store handler for "disksize": parse the size with memparse(),
+ * round it up to a page multiple, allocate the metadata and initialize
+ * the device.
+ *
+ * Returns @len on success, -EINVAL when the parsed size is 0, -ENOMEM when
+ * the metadata cannot be allocated, and -EBUSY when the device is already
+ * initialized.
+ */
+static ssize_t disksize_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t len)
+{
+	u64 disksize;
+	struct zram_meta *meta;
+	struct zram *zram = dev_to_zram(dev);
+
+	disksize = memparse(buf, NULL);
+	if (!disksize)
+		return -EINVAL;
+
+	disksize = PAGE_ALIGN(disksize);
+	/* allocate outside init_lock; zram_meta_alloc() returns NULL on failure */
+	meta = zram_meta_alloc(disksize);
+	if (!meta)
+		return -ENOMEM;
+
+	down_write(&zram->init_lock);
+	if (zram->init_done) {
+		up_write(&zram->init_lock);
+		/* the device already has metadata; drop the unused copy */
+		zram_meta_free(meta);
+		pr_info("Cannot change disksize for initialized device\n");
+		return -EBUSY;
+	}
+
+	zram->disksize = disksize;
+	set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT);
+	zram_init_device(zram, meta);
+	up_write(&zram->init_lock);
+
+	return len;
+}
+
+/*
+ * sysfs store handler for "reset": writing a nonzero number flushes the
+ * block device and resets the zram device.
+ *
+ * Returns @len on success, -ENOMEM when no block_device can be obtained,
+ * -EBUSY when the block device has holders, -EINVAL when the value written
+ * is 0, or the kstrtou16() error when the input is not a valid number.
+ */
+static ssize_t reset_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t len)
+{
+	int ret;
+	unsigned short do_reset;
+	struct zram *zram;
+	struct block_device *bdev;
+
+	zram = dev_to_zram(dev);
+	bdev = bdget_disk(zram->disk, 0);
+	/* bdget_disk() can return NULL; check before touching bd_holders */
+	if (!bdev)
+		return -ENOMEM;
+
+	/* Do not reset an active device! */
+	if (bdev->bd_holders) {
+		ret = -EBUSY;
+		goto out;
+	}
+
+	ret = kstrtou16(buf, 10, &do_reset);
+	if (ret)
+		goto out;
+
+	if (!do_reset) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* Make sure all pending I/O is finished */
+	fsync_bdev(bdev);
+	/* drop the reference taken by bdget_disk() */
+	bdput(bdev);
+
+	zram_reset_device(zram);
+	return len;
+
+out:
+	bdput(bdev);
+	return ret;
 }
 
 static void __zram_make_request(struct zram *zram, struct bio *bio, int rw)
@@ -370,10 +643,10 @@
 
 	switch (rw) {
 	case READ:
-		zram_stat64_inc(zram, &zram->stats.num_reads);
+		atomic64_inc(&zram->stats.num_reads);
 		break;
 	case WRITE:
-		zram_stat64_inc(zram, &zram->stats.num_writes);
+		atomic64_inc(&zram->stats.num_writes);
 		break;
 	}
 
@@ -418,23 +691,6 @@
 }
 
 /*
- * Check if request is within bounds and aligned on zram logical blocks.
- */
-static inline int valid_io_request(struct zram *zram, struct bio *bio)
-{
-	if (unlikely(
-		(bio->bi_sector >= (zram->disksize >> SECTOR_SHIFT)) ||
-		(bio->bi_sector & (ZRAM_SECTOR_PER_LOGICAL_BLOCK - 1)) ||
-		(bio->bi_size & (ZRAM_LOGICAL_BLOCK_SIZE - 1)))) {
-
-		return 0;
-	}
-
-	/* I/O request is valid */
-	return 1;
-}
-
-/*
  * Handler function for all zram I/O requests.
  */
 static void zram_make_request(struct request_queue *queue, struct bio *bio)
@@ -446,7 +702,7 @@
 		goto error;
 
 	if (!valid_io_request(zram, bio)) {
-		zram_stat64_inc(zram, &zram->stats.invalid_io);
+		atomic64_inc(&zram->stats.invalid_io);
 		goto error;
 	}
 
@@ -460,130 +716,16 @@
 	bio_io_error(bio);
 }
 
-static void __zram_reset_device(struct zram *zram)
-{
-	size_t index;
-	struct zram_meta *meta;
-
-	if (!zram->init_done)
-		return;
-
-	meta = zram->meta;
-	zram->init_done = 0;
-
-	/* Free all pages that are still in this zram device */
-	for (index = 0; index < zram->disksize >> PAGE_SHIFT; index++) {
-		unsigned long handle = meta->table[index].handle;
-		if (!handle)
-			continue;
-
-		zs_free(meta->mem_pool, handle);
-	}
-
-	zram_meta_free(zram->meta);
-	zram->meta = NULL;
-	/* Reset stats */
-	memset(&zram->stats, 0, sizeof(zram->stats));
-
-	zram->disksize = 0;
-	set_capacity(zram->disk, 0);
-}
-
-void zram_reset_device(struct zram *zram)
-{
-	down_write(&zram->init_lock);
-	__zram_reset_device(zram);
-	up_write(&zram->init_lock);
-}
-
-void zram_meta_free(struct zram_meta *meta)
-{
-	zs_destroy_pool(meta->mem_pool);
-	kfree(meta->compress_workmem);
-	free_pages((unsigned long)meta->compress_buffer, 1);
-	vfree(meta->table);
-	kfree(meta);
-}
-
-struct zram_meta *zram_meta_alloc(u64 disksize)
-{
-	size_t num_pages;
-	struct zram_meta *meta = kmalloc(sizeof(*meta), GFP_KERNEL);
-	if (!meta)
-		goto out;
-
-	meta->compress_workmem = kzalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL);
-	if (!meta->compress_workmem)
-		goto free_meta;
-
-	meta->compress_buffer =
-		(void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, 1);
-	if (!meta->compress_buffer) {
-		pr_err("Error allocating compressor buffer space\n");
-		goto free_workmem;
-	}
-
-	num_pages = disksize >> PAGE_SHIFT;
-	meta->table = vzalloc(num_pages * sizeof(*meta->table));
-	if (!meta->table) {
-		pr_err("Error allocating zram address table\n");
-		goto free_buffer;
-	}
-
-	meta->mem_pool = zs_create_pool(GFP_NOIO | __GFP_HIGHMEM);
-	if (!meta->mem_pool) {
-		pr_err("Error creating memory pool\n");
-		goto free_table;
-	}
-
-	return meta;
-
-free_table:
-	vfree(meta->table);
-free_buffer:
-	free_pages((unsigned long)meta->compress_buffer, 1);
-free_workmem:
-	kfree(meta->compress_workmem);
-free_meta:
-	kfree(meta);
-	meta = NULL;
-out:
-	return meta;
-}
-
-void zram_init_device(struct zram *zram, struct zram_meta *meta)
-{
-	if (zram->disksize > 2 * (totalram_pages << PAGE_SHIFT)) {
-		pr_info(
-		"There is little point creating a zram of greater than "
-		"twice the size of memory since we expect a 2:1 compression "
-		"ratio. Note that zram uses about 0.1%% of the size of "
-		"the disk when not in use so a huge zram is "
-		"wasteful.\n"
-		"\tMemory Size: %lu kB\n"
-		"\tSize you selected: %llu kB\n"
-		"Continuing anyway ...\n",
-		(totalram_pages << PAGE_SHIFT) >> 10, zram->disksize >> 10
-		);
-	}
-
-	/* zram devices sort of resembles non-rotational disks */
-	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, zram->disk->queue);
-
-	zram->meta = meta;
-	zram->init_done = 1;
-
-	pr_debug("Initialization done!\n");
-}
-
 static void zram_slot_free_notify(struct block_device *bdev,
 				unsigned long index)
 {
 	struct zram *zram;
 
 	zram = bdev->bd_disk->private_data;
+	/*
+	 * zram_free_page() modifies the table and the 32-bit stat counters,
+	 * so it runs under the write side of zram->lock; the 64-bit
+	 * notify_free counter is atomic and is bumped outside the lock.
+	 */
+	down_write(&zram->lock);
 	zram_free_page(zram, index);
-	zram_stat64_inc(zram, &zram->stats.notify_free);
+	up_write(&zram->lock);
+	atomic64_inc(&zram->stats.notify_free);
 }
 
 static const struct block_device_operations zram_devops = {
@@ -591,19 +733,49 @@
 	.owner = THIS_MODULE
 };
 
+/*
+ * Per-device sysfs attributes.  disksize is readable by everyone and
+ * writable by the owner, reset is write-only (no show handler), and the
+ * remaining entries are read-only.
+ */
+static DEVICE_ATTR(disksize, S_IRUGO | S_IWUSR,
+		disksize_show, disksize_store);
+static DEVICE_ATTR(initstate, S_IRUGO, initstate_show, NULL);
+static DEVICE_ATTR(reset, S_IWUSR, NULL, reset_store);
+static DEVICE_ATTR(num_reads, S_IRUGO, num_reads_show, NULL);
+static DEVICE_ATTR(num_writes, S_IRUGO, num_writes_show, NULL);
+static DEVICE_ATTR(invalid_io, S_IRUGO, invalid_io_show, NULL);
+static DEVICE_ATTR(notify_free, S_IRUGO, notify_free_show, NULL);
+static DEVICE_ATTR(zero_pages, S_IRUGO, zero_pages_show, NULL);
+static DEVICE_ATTR(orig_data_size, S_IRUGO, orig_data_size_show, NULL);
+static DEVICE_ATTR(compr_data_size, S_IRUGO, compr_data_size_show, NULL);
+static DEVICE_ATTR(mem_used_total, S_IRUGO, mem_used_total_show, NULL);
+
+/* NULL-terminated list of the attributes declared above */
+static struct attribute *zram_disk_attrs[] = {
+	&dev_attr_disksize.attr,
+	&dev_attr_initstate.attr,
+	&dev_attr_reset.attr,
+	&dev_attr_num_reads.attr,
+	&dev_attr_num_writes.attr,
+	&dev_attr_invalid_io.attr,
+	&dev_attr_notify_free.attr,
+	&dev_attr_zero_pages.attr,
+	&dev_attr_orig_data_size.attr,
+	&dev_attr_compr_data_size.attr,
+	&dev_attr_mem_used_total.attr,
+	NULL,
+};
+
+/* Attribute group wrapping zram_disk_attrs */
+static struct attribute_group zram_disk_attr_group = {
+	.attrs = zram_disk_attrs,
+};
+
 static int create_device(struct zram *zram, int device_id)
 {
-	int ret = 0;
+	int ret = -ENOMEM;
 
 	init_rwsem(&zram->lock);
 	init_rwsem(&zram->init_lock);
-	spin_lock_init(&zram->stat64_lock);
 
 	zram->queue = blk_alloc_queue(GFP_KERNEL);
 	if (!zram->queue) {
 		pr_err("Error allocating disk queue for device %d\n",
 			device_id);
-		ret = -ENOMEM;
 		goto out;
 	}
 
@@ -613,11 +785,9 @@
 	 /* gendisk structure */
 	zram->disk = alloc_disk(1);
 	if (!zram->disk) {
-		blk_cleanup_queue(zram->queue);
 		pr_warn("Error allocating disk structure for device %d\n",
 			device_id);
-		ret = -ENOMEM;
-		goto out;
+		goto out_free_queue;
 	}
 
 	zram->disk->major = zram_major;
@@ -646,11 +816,17 @@
 				&zram_disk_attr_group);
 	if (ret < 0) {
 		pr_warn("Error creating sysfs group");
-		goto out;
+		goto out_free_disk;
 	}
 
 	zram->init_done = 0;
+	return 0;
 
+out_free_disk:
+	del_gendisk(zram->disk);
+	put_disk(zram->disk);
+out_free_queue:
+	blk_cleanup_queue(zram->queue);
 out:
 	return ret;
 }
@@ -669,11 +845,6 @@
 		blk_cleanup_queue(zram->queue);
 }
 
-unsigned int zram_get_num_devices(void)
-{
-	return num_devices;
-}
-
 static int __init zram_init(void)
 {
 	int ret, dev_id;
@@ -727,8 +898,10 @@
 	for (i = 0; i < num_devices; i++) {
 		zram = &zram_devices[i];
 
+		get_disk(zram->disk);
 		destroy_device(zram);
 		zram_reset_device(zram);
+		put_disk(zram->disk);
 	}
 
 	unregister_blkdev(zram_major, "zram");
@@ -737,12 +910,12 @@
 	pr_debug("Cleanup done!\n");
 }
 
-module_param(num_devices, uint, 0);
-MODULE_PARM_DESC(num_devices, "Number of zram devices");
-
 module_init(zram_init);
 module_exit(zram_exit);
 
+module_param(num_devices, uint, 0);
+MODULE_PARM_DESC(num_devices, "Number of zram devices");
+
 MODULE_LICENSE("Dual BSD/GPL");
 MODULE_AUTHOR("Nitin Gupta <ngupta@vflare.org>");
 MODULE_DESCRIPTION("Compressed RAM Block Device");

diff --git a/drivers/staging/zram/zram_drv.h b/drivers/staging/zram/zram_drv.h
index 2d1a3f1..9e57bfb 100644
--- a/drivers/staging/zram/zram_drv.h
+++ b/drivers/staging/zram/zram_drv.h

@@ -69,14 +69,18 @@
 	u8 flags;
 } __aligned(4);
 
+/*
+ * All 64bit fields should only be manipulated by 64bit atomic accessors.
+ * All modifications to 32bit counter should be protected by zram->lock.
+ */
 struct zram_stats {
-	u64 compr_size;		/* compressed size of pages stored */
-	u64 num_reads;		/* failed + successful */
-	u64 num_writes;		/* --do-- */
-	u64 failed_reads;	/* should NEVER! happen */
-	u64 failed_writes;	/* can happen when memory is too low */
-	u64 invalid_io;		/* non-page-aligned I/O requests */
-	u64 notify_free;	/* no. of swap slot free notifications */
+	atomic64_t compr_size;	/* compressed size of pages stored */
+	atomic64_t num_reads;	/* failed + successful */
+	atomic64_t num_writes;	/* --do-- */
+	atomic64_t failed_reads;	/* should NEVER! happen */
+	atomic64_t failed_writes;	/* can happen when memory is too low */
+	atomic64_t invalid_io;	/* non-page-aligned I/O requests */
+	atomic64_t notify_free;	/* no. of swap slot free notifications */
 	u32 pages_zero;		/* no. of zero filled pages */
 	u32 pages_stored;	/* no. of pages currently stored */
 	u32 good_compress;	/* % of pages with compression ratio<=50% */
@@ -92,9 +96,9 @@
 
 struct zram {
 	struct zram_meta *meta;
-	spinlock_t stat64_lock;	/* protect 64-bit stats */
-	struct rw_semaphore lock; /* protect compression buffers and table
-				   * against concurrent read and writes */
+	struct rw_semaphore lock; /* protect compression buffers, table,
+				   * 32bit stat counters against concurrent
+				   * notifications, reads and writes */
 	struct request_queue *queue;
 	struct gendisk *disk;
 	int init_done;
@@ -108,16 +112,4 @@
 
 	struct zram_stats stats;
 };
-
-extern struct zram *zram_devices;
-unsigned int zram_get_num_devices(void);
-#ifdef CONFIG_SYSFS
-extern struct attribute_group zram_disk_attr_group;
-#endif
-
-extern void zram_reset_device(struct zram *zram);
-extern struct zram_meta *zram_meta_alloc(u64 disksize);
-extern void zram_meta_free(struct zram_meta *meta);
-extern void zram_init_device(struct zram *zram, struct zram_meta *meta);
-
 #endif

diff --git a/drivers/staging/zram/zram_sysfs.c b/drivers/staging/zram/zram_sysfs.c
deleted file mode 100644
index e6a929d..0000000
--- a/drivers/staging/zram/zram_sysfs.c
+++ /dev/null

@@ -1,227 +0,0 @@
-/*
- * Compressed RAM block device
- *
- * Copyright (C) 2008, 2009, 2010  Nitin Gupta
- *
- * This code is released using a dual license strategy: BSD/GPL
- * You can choose the licence that better fits your requirements.
- *
- * Released under the terms of 3-clause BSD License
- * Released under the terms of GNU General Public License Version 2.0
- *
- * Project home: http://compcache.googlecode.com/
- */
-
-#include <linux/device.h>
-#include <linux/genhd.h>
-#include <linux/mm.h>
-#include <linux/kernel.h>
-
-#include "zram_drv.h"
-
-static u64 zram_stat64_read(struct zram *zram, u64 *v)
-{
-	u64 val;
-
-	spin_lock(&zram->stat64_lock);
-	val = *v;
-	spin_unlock(&zram->stat64_lock);
-
-	return val;
-}
-
-static struct zram *dev_to_zram(struct device *dev)
-{
-	int i;
-	struct zram *zram = NULL;
-
-	for (i = 0; i < zram_get_num_devices(); i++) {
-		zram = &zram_devices[i];
-		if (disk_to_dev(zram->disk) == dev)
-			break;
-	}
-
-	return zram;
-}
-
-static ssize_t disksize_show(struct device *dev,
-		struct device_attribute *attr, char *buf)
-{
-	struct zram *zram = dev_to_zram(dev);
-
-	return sprintf(buf, "%llu\n", zram->disksize);
-}
-
-static ssize_t disksize_store(struct device *dev,
-		struct device_attribute *attr, const char *buf, size_t len)
-{
-	u64 disksize;
-	struct zram_meta *meta;
-	struct zram *zram = dev_to_zram(dev);
-
-	disksize = memparse(buf, NULL);
-	if (!disksize)
-		return -EINVAL;
-
-	disksize = PAGE_ALIGN(disksize);
-	meta = zram_meta_alloc(disksize);
-	down_write(&zram->init_lock);
-	if (zram->init_done) {
-		up_write(&zram->init_lock);
-		zram_meta_free(meta);
-		pr_info("Cannot change disksize for initialized device\n");
-		return -EBUSY;
-	}
-
-	zram->disksize = disksize;
-	set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT);
-	zram_init_device(zram, meta);
-	up_write(&zram->init_lock);
-
-	return len;
-}
-
-static ssize_t initstate_show(struct device *dev,
-		struct device_attribute *attr, char *buf)
-{
-	struct zram *zram = dev_to_zram(dev);
-
-	return sprintf(buf, "%u\n", zram->init_done);
-}
-
-static ssize_t reset_store(struct device *dev,
-		struct device_attribute *attr, const char *buf, size_t len)
-{
-	int ret;
-	unsigned short do_reset;
-	struct zram *zram;
-	struct block_device *bdev;
-
-	zram = dev_to_zram(dev);
-	bdev = bdget_disk(zram->disk, 0);
-
-	/* Do not reset an active device! */
-	if (bdev->bd_holders)
-		return -EBUSY;
-
-	ret = kstrtou16(buf, 10, &do_reset);
-	if (ret)
-		return ret;
-
-	if (!do_reset)
-		return -EINVAL;
-
-	/* Make sure all pending I/O is finished */
-	if (bdev)
-		fsync_bdev(bdev);
-
-	zram_reset_device(zram);
-	return len;
-}
-
-static ssize_t num_reads_show(struct device *dev,
-		struct device_attribute *attr, char *buf)
-{
-	struct zram *zram = dev_to_zram(dev);
-
-	return sprintf(buf, "%llu\n",
-		zram_stat64_read(zram, &zram->stats.num_reads));
-}
-
-static ssize_t num_writes_show(struct device *dev,
-		struct device_attribute *attr, char *buf)
-{
-	struct zram *zram = dev_to_zram(dev);
-
-	return sprintf(buf, "%llu\n",
-		zram_stat64_read(zram, &zram->stats.num_writes));
-}
-
-static ssize_t invalid_io_show(struct device *dev,
-		struct device_attribute *attr, char *buf)
-{
-	struct zram *zram = dev_to_zram(dev);
-
-	return sprintf(buf, "%llu\n",
-		zram_stat64_read(zram, &zram->stats.invalid_io));
-}
-
-static ssize_t notify_free_show(struct device *dev,
-		struct device_attribute *attr, char *buf)
-{
-	struct zram *zram = dev_to_zram(dev);
-
-	return sprintf(buf, "%llu\n",
-		zram_stat64_read(zram, &zram->stats.notify_free));
-}
-
-static ssize_t zero_pages_show(struct device *dev,
-		struct device_attribute *attr, char *buf)
-{
-	struct zram *zram = dev_to_zram(dev);
-
-	return sprintf(buf, "%u\n", zram->stats.pages_zero);
-}
-
-static ssize_t orig_data_size_show(struct device *dev,
-		struct device_attribute *attr, char *buf)
-{
-	struct zram *zram = dev_to_zram(dev);
-
-	return sprintf(buf, "%llu\n",
-		(u64)(zram->stats.pages_stored) << PAGE_SHIFT);
-}
-
-static ssize_t compr_data_size_show(struct device *dev,
-		struct device_attribute *attr, char *buf)
-{
-	struct zram *zram = dev_to_zram(dev);
-
-	return sprintf(buf, "%llu\n",
-		zram_stat64_read(zram, &zram->stats.compr_size));
-}
-
-static ssize_t mem_used_total_show(struct device *dev,
-		struct device_attribute *attr, char *buf)
-{
-	u64 val = 0;
-	struct zram *zram = dev_to_zram(dev);
-	struct zram_meta *meta = zram->meta;
-
-	if (zram->init_done)
-		val = zs_get_total_size_bytes(meta->mem_pool);
-
-	return sprintf(buf, "%llu\n", val);
-}
-
-static DEVICE_ATTR(disksize, S_IRUGO | S_IWUSR,
-		disksize_show, disksize_store);
-static DEVICE_ATTR(initstate, S_IRUGO, initstate_show, NULL);
-static DEVICE_ATTR(reset, S_IWUSR, NULL, reset_store);
-static DEVICE_ATTR(num_reads, S_IRUGO, num_reads_show, NULL);
-static DEVICE_ATTR(num_writes, S_IRUGO, num_writes_show, NULL);
-static DEVICE_ATTR(invalid_io, S_IRUGO, invalid_io_show, NULL);
-static DEVICE_ATTR(notify_free, S_IRUGO, notify_free_show, NULL);
-static DEVICE_ATTR(zero_pages, S_IRUGO, zero_pages_show, NULL);
-static DEVICE_ATTR(orig_data_size, S_IRUGO, orig_data_size_show, NULL);
-static DEVICE_ATTR(compr_data_size, S_IRUGO, compr_data_size_show, NULL);
-static DEVICE_ATTR(mem_used_total, S_IRUGO, mem_used_total_show, NULL);
-
-static struct attribute *zram_disk_attrs[] = {
-	&dev_attr_disksize.attr,
-	&dev_attr_initstate.attr,
-	&dev_attr_reset.attr,
-	&dev_attr_num_reads.attr,
-	&dev_attr_num_writes.attr,
-	&dev_attr_invalid_io.attr,
-	&dev_attr_notify_free.attr,
-	&dev_attr_zero_pages.attr,
-	&dev_attr_orig_data_size.attr,
-	&dev_attr_compr_data_size.attr,
-	&dev_attr_mem_used_total.attr,
-	NULL,
-};
-
-struct attribute_group zram_disk_attr_group = {
-	.attrs = zram_disk_attrs,
-};

diff --git a/drivers/staging/zsmalloc/zsmalloc-main.c b/drivers/staging/zsmalloc/zsmalloc-main.c
index f82f7e6..4bb275b 100644
--- a/drivers/staging/zsmalloc/zsmalloc-main.c
+++ b/drivers/staging/zsmalloc/zsmalloc-main.c

@@ -224,7 +224,7 @@
  * performs VM mapping faster than copying, then it should be added here
  * so that USE_PGTABLE_MAPPING is defined. This causes zsmalloc to use
  * page table mapping rather than copying for object mapping.
-*/
+ */
 #if defined(CONFIG_ARM) && !defined(MODULE)
 #define USE_PGTABLE_MAPPING
 #endif
@@ -844,8 +844,7 @@
 
 		for (fg = 0; fg < _ZS_NR_FULLNESS_GROUPS; fg++) {
 			if (class->fullness_list[fg]) {
-				pr_info("Freeing non-empty class with size "
-					"%db, fullness group %d\n",
+				pr_info("Freeing non-empty class with size %db, fullness group %d\n",
 					class->size, fg);
 			}
 		}
@@ -968,7 +967,7 @@
  * against nested mappings.
  *
  * This function returns with preemption and page faults disabled.
-*/
+ */
 void *zs_map_object(struct zs_pool *pool, unsigned long handle,
 			enum zs_mapmode mm)
 {

diff --git a/drivers/staging/zsmalloc/zsmalloc.h b/drivers/staging/zsmalloc/zsmalloc.h
index 46dbd05..fbe6bec 100644
--- a/drivers/staging/zsmalloc/zsmalloc.h
+++ b/drivers/staging/zsmalloc/zsmalloc.h

@@ -19,7 +19,7 @@
  * zsmalloc mapping modes
  *
  * NOTE: These only make a difference when a mapped object spans pages
-*/
+ */
 enum zs_mapmode {
 	ZS_MM_RW, /* normal read-write mapping */
 	ZS_MM_RO, /* read-only (no copy-out at unmap time) */

diff --git a/drivers/tty/Makefile b/drivers/tty/Makefile
index 6b78399..58ad1c0 100644
--- a/drivers/tty/Makefile
+++ b/drivers/tty/Makefile

@@ -1,5 +1,5 @@
 obj-$(CONFIG_TTY)		+= tty_io.o n_tty.o tty_ioctl.o tty_ldisc.o \
-				   tty_buffer.o tty_port.o tty_mutex.o
+				   tty_buffer.o tty_port.o tty_mutex.o tty_ldsem.o
 obj-$(CONFIG_LEGACY_PTYS)	+= pty.o
 obj-$(CONFIG_UNIX98_PTYS)	+= pty.o
 obj-$(CONFIG_AUDIT)		+= tty_audit.o

diff --git a/drivers/tty/hvc/hvc_iucv.c b/drivers/tty/hvc/hvc_iucv.c
index b6f7d52..9d47f50 100644
--- a/drivers/tty/hvc/hvc_iucv.c
+++ b/drivers/tty/hvc/hvc_iucv.c

@@ -1328,7 +1328,7 @@
  */
 static	int __init hvc_iucv_config(char *val)
 {
-	 return strict_strtoul(val, 10, &hvc_iucv_devices);
+	 return kstrtoul(val, 10, &hvc_iucv_devices);
 }
 
 

diff --git a/drivers/tty/n_tty.c b/drivers/tty/n_tty.c
index 6c7fe90..4bf0fc0 100644
--- a/drivers/tty/n_tty.c
+++ b/drivers/tty/n_tty.c

@@ -89,6 +89,7 @@
 	int read_head;
 	int read_tail;
 	int read_cnt;
+	int minimum_to_wake;
 
 	unsigned char *echo_buf;
 	unsigned int echo_pos;
@@ -114,22 +115,25 @@
 }
 
 /**
- *	n_tty_set__room	-	receive space
+ *	n_tty_set_room	-	receive space
  *	@tty: terminal
  *
- *	Called by the driver to find out how much data it is
- *	permitted to feed to the line discipline without any being lost
- *	and thus to manage flow control. Not serialized. Answers for the
- *	"instant".
+ *	Updates tty->receive_room to reflect the currently available space
+ *	in the input buffer, and re-schedules the flip buffer work if space
+ *	just became available.
+ *
+ *	Locks: Concurrent update is protected with read_lock
  */
 
-static void n_tty_set_room(struct tty_struct *tty)
+static int set_room(struct tty_struct *tty)
 {
 	struct n_tty_data *ldata = tty->disc_data;
 	int left;
 	int old_left;
+	unsigned long flags;
 
-	/* ldata->read_cnt is not read locked ? */
+	raw_spin_lock_irqsave(&ldata->read_lock, flags);
+
 	if (I_PARMRK(tty)) {
 		/* Multiply read_cnt by 3, since each byte might take up to
 		 * three times as many spaces when PARMRK is set (depending on
@@ -149,8 +153,15 @@
 	old_left = tty->receive_room;
 	tty->receive_room = left;
 
+	raw_spin_unlock_irqrestore(&ldata->read_lock, flags);
+
+	return left && !old_left;
+}
+
+static void n_tty_set_room(struct tty_struct *tty)
+{
 	/* Did this open up the receive buffer? We may need to flip */
-	if (left && !old_left) {
+	if (set_room(tty)) {
 		WARN_RATELIMIT(tty->port->itty == NULL,
 				"scheduling with invalid itty\n");
 		/* see if ldisc has been killed - if so, this means that
@@ -647,8 +658,7 @@
 			if (no_space_left)
 				break;
 		} else {
-			if (O_OPOST(tty) &&
-			    !(test_bit(TTY_HW_COOK_OUT, &tty->flags))) {
+			if (O_OPOST(tty)) {
 				int retval = do_output_char(c, tty, space);
 				if (retval < 0)
 					break;
@@ -1454,9 +1464,9 @@
 			tty->ops->flush_chars(tty);
 	}
 
-	n_tty_set_room(tty);
+	set_room(tty);
 
-	if ((!ldata->icanon && (ldata->read_cnt >= tty->minimum_to_wake)) ||
+	if ((!ldata->icanon && (ldata->read_cnt >= ldata->minimum_to_wake)) ||
 		L_EXTPROC(tty)) {
 		kill_fasync(&tty->fasync, SIGIO, POLL_IN);
 		if (waitqueue_active(&tty->read_wait))
@@ -1516,12 +1526,7 @@
 		wake_up_interruptible(&tty->read_wait);
 
 	ldata->icanon = (L_ICANON(tty) != 0);
-	if (test_bit(TTY_HW_COOK_IN, &tty->flags)) {
-		ldata->raw = 1;
-		ldata->real_raw = 1;
-		n_tty_set_room(tty);
-		return;
-	}
+
 	if (I_ISTRIP(tty) || I_IUCLC(tty) || I_IGNCR(tty) ||
 	    I_ICRNL(tty) || I_INLCR(tty) || L_ICANON(tty) ||
 	    I_IXON(tty) || L_ISIG(tty) || L_ECHO(tty) ||
@@ -1642,7 +1647,7 @@
 	tty->disc_data = ldata;
 	reset_buffer_flags(tty->disc_data);
 	ldata->column = 0;
-	tty->minimum_to_wake = 1;
+	ldata->minimum_to_wake = 1;
 	tty->closing = 0;
 	/* indicate buffer work may resume */
 	clear_bit(TTY_LDISC_HALTED, &tty->flags);
@@ -1806,21 +1811,17 @@
 	minimum = time = 0;
 	timeout = MAX_SCHEDULE_TIMEOUT;
 	if (!ldata->icanon) {
-		time = (HZ / 10) * TIME_CHAR(tty);
 		minimum = MIN_CHAR(tty);
 		if (minimum) {
+			time = (HZ / 10) * TIME_CHAR(tty);
 			if (time)
-				tty->minimum_to_wake = 1;
+				ldata->minimum_to_wake = 1;
 			else if (!waitqueue_active(&tty->read_wait) ||
-				 (tty->minimum_to_wake > minimum))
-				tty->minimum_to_wake = minimum;
+				 (ldata->minimum_to_wake > minimum))
+				ldata->minimum_to_wake = minimum;
 		} else {
-			timeout = 0;
-			if (time) {
-				timeout = time;
-				time = 0;
-			}
-			tty->minimum_to_wake = minimum = 1;
+			timeout = (HZ / 10) * TIME_CHAR(tty);
+			ldata->minimum_to_wake = minimum = 1;
 		}
 	}
 
@@ -1860,9 +1861,9 @@
 		   TASK_RUNNING. */
 		set_current_state(TASK_INTERRUPTIBLE);
 
-		if (((minimum - (b - buf)) < tty->minimum_to_wake) &&
+		if (((minimum - (b - buf)) < ldata->minimum_to_wake) &&
 		    ((minimum - (b - buf)) >= 1))
-			tty->minimum_to_wake = (minimum - (b - buf));
+			ldata->minimum_to_wake = (minimum - (b - buf));
 
 		if (!input_available_p(tty, 0)) {
 			if (test_bit(TTY_OTHER_CLOSED, &tty->flags)) {
@@ -1881,7 +1882,6 @@
 				retval = -ERESTARTSYS;
 				break;
 			}
-			/* FIXME: does n_tty_set_room need locking ? */
 			n_tty_set_room(tty);
 			timeout = schedule_timeout(timeout);
 			continue;
@@ -1979,7 +1979,7 @@
 	remove_wait_queue(&tty->read_wait, &wait);
 
 	if (!waitqueue_active(&tty->read_wait))
-		tty->minimum_to_wake = minimum;
+		ldata->minimum_to_wake = minimum;
 
 	__set_current_state(TASK_RUNNING);
 	size = b - buf;
@@ -2045,7 +2045,7 @@
 			retval = -EIO;
 			break;
 		}
-		if (O_OPOST(tty) && !(test_bit(TTY_HW_COOK_OUT, &tty->flags))) {
+		if (O_OPOST(tty)) {
 			while (nr > 0) {
 				ssize_t num = process_output_block(tty, b, nr);
 				if (num < 0) {
@@ -2111,6 +2111,7 @@
 static unsigned int n_tty_poll(struct tty_struct *tty, struct file *file,
 							poll_table *wait)
 {
+	struct n_tty_data *ldata = tty->disc_data;
 	unsigned int mask = 0;
 
 	poll_wait(file, &tty->read_wait, wait);
@@ -2125,9 +2126,9 @@
 		mask |= POLLHUP;
 	if (!(mask & (POLLHUP | POLLIN | POLLRDNORM))) {
 		if (MIN_CHAR(tty) && !TIME_CHAR(tty))
-			tty->minimum_to_wake = MIN_CHAR(tty);
+			ldata->minimum_to_wake = MIN_CHAR(tty);
 		else
-			tty->minimum_to_wake = 1;
+			ldata->minimum_to_wake = 1;
 	}
 	if (tty->ops->write && !tty_is_writelocked(tty) &&
 			tty_chars_in_buffer(tty) < WAKEUP_CHARS &&
@@ -2175,6 +2176,18 @@
 	}
 }
 
+/*
+ * Line-discipline fasync hook: adjust minimum_to_wake when async
+ * notification is switched on or off.  Nothing is changed while
+ * somebody is waiting on the tty's read_wait queue.
+ */
+static void n_tty_fasync(struct tty_struct *tty, int on)
+{
+	struct n_tty_data *ldata = tty->disc_data;
+
+	/* a waiter on read_wait is present: leave the threshold alone */
+	if (waitqueue_active(&tty->read_wait))
+		return;
+
+	if (on) {
+		ldata->minimum_to_wake = 1;
+		return;
+	}
+
+	/* switched off and no fasync entries remain on the tty */
+	if (!tty->fasync)
+		ldata->minimum_to_wake = N_TTY_BUF_SIZE;
+}
+
 struct tty_ldisc_ops tty_ldisc_N_TTY = {
 	.magic           = TTY_LDISC_MAGIC,
 	.name            = "n_tty",
@@ -2188,7 +2201,8 @@
 	.set_termios     = n_tty_set_termios,
 	.poll            = n_tty_poll,
 	.receive_buf     = n_tty_receive_buf,
-	.write_wakeup    = n_tty_write_wakeup
+	.write_wakeup    = n_tty_write_wakeup,
+	.fasync		 = n_tty_fasync,
 };
 
 /**

diff --git a/drivers/tty/serial/8250/8250_pci.c b/drivers/tty/serial/8250/8250_pci.c
index 26e3a97..c52948b 100644
--- a/drivers/tty/serial/8250/8250_pci.c
+++ b/drivers/tty/serial/8250/8250_pci.c

@@ -4797,10 +4797,6 @@
 		PCI_VENDOR_ID_IBM, 0x0299,
 		0, 0, pbn_b0_bt_2_115200 },
 
-	{	PCI_VENDOR_ID_NETMOS, PCI_DEVICE_ID_NETMOS_9835,
-		0x1000, 0x0012,
-		0, 0, pbn_b0_bt_2_115200 },
-
 	{	PCI_VENDOR_ID_NETMOS, PCI_DEVICE_ID_NETMOS_9901,
 		0xA000, 0x1000,
 		0, 0, pbn_b0_1_115200 },

diff --git a/drivers/tty/serial/8250/Kconfig b/drivers/tty/serial/8250/Kconfig
index 80fe91e..a1ba94d 100644
--- a/drivers/tty/serial/8250/Kconfig
+++ b/drivers/tty/serial/8250/Kconfig

@@ -12,9 +12,8 @@
 	  here are those that are setting up dedicated Ethernet WWW/FTP
 	  servers, or users that have one of the various bus mice instead of a
 	  serial mouse and don't intend to use their machine's standard serial
-	  port for anything.  (Note that the Cyclades and Stallion multi
-	  serial port drivers do not need this driver built in for them to
-	  work.)
+	  port for anything.  (Note that the Cyclades multi serial port driver
+	  does not need this driver built in for it to work.)
 
 	  To compile this driver as a module, choose M here: the
 	  module will be called 8250.

diff --git a/drivers/tty/serial/Kconfig b/drivers/tty/serial/Kconfig
index 7e7006f..46dd1c7 100644
--- a/drivers/tty/serial/Kconfig
+++ b/drivers/tty/serial/Kconfig

@@ -551,7 +551,7 @@
 	  Enable hardware flow control in the driver.
 
 config SERIAL_IMX
-	bool "IMX serial port support"
+	tristate "IMX serial port support"
 	depends on ARCH_MXC
 	select SERIAL_CORE
 	select RATIONAL
@@ -561,22 +561,21 @@
 
 config SERIAL_IMX_CONSOLE
 	bool "Console on IMX serial port"
-	depends on SERIAL_IMX
+	depends on SERIAL_IMX=y
 	select SERIAL_CORE_CONSOLE
 	help
-	  If you have enabled the serial port on the Motorola IMX
+	  If you have enabled the serial port on the Freescale IMX
 	  CPU you can make it the console by answering Y to this option.
 
 	  Even if you say Y here, the currently visible virtual console
 	  (/dev/tty0) will still be used as the system console by default, but
 	  you can alter that using a kernel command line option such as
-	  "console=ttySA0". (Try "man bootparam" or see the documentation of
-	  your boot loader (lilo or loadlin) about how to pass options to the
-	  kernel at boot time.)
+	  "console=ttymxc0". (Try "man bootparam" or see the documentation of
+	  your bootloader about how to pass options to the kernel at boot time.)
 
 config SERIAL_UARTLITE
 	tristate "Xilinx uartlite serial port support"
-	depends on PPC32 || MICROBLAZE || MFD_TIMBERDALE
+	depends on PPC32 || MICROBLAZE || MFD_TIMBERDALE || ARCH_ZYNQ
 	select SERIAL_CORE
 	help
 	  Say Y here if you want to use the Xilinx uartlite serial controller.
@@ -1484,6 +1483,20 @@
 	  If multiple cards are present, the default limit of 32 ports may
 	  need to be increased.
 
+config SERIAL_FSL_LPUART
+	tristate "Freescale lpuart serial port support"
+	select SERIAL_CORE
+	help
+	  Support for the on-chip lpuart on some Freescale SOCs.
+
+config SERIAL_FSL_LPUART_CONSOLE
+	bool "Console on Freescale lpuart serial port"
+	depends on SERIAL_FSL_LPUART=y
+	select SERIAL_CORE_CONSOLE
+	help
+	  If you have enabled the lpuart serial port on the Freescale SoCs,
+	  you can make it the console by answering Y to this option.
+
 endmenu
 
 endif # TTY

diff --git a/drivers/tty/serial/Makefile b/drivers/tty/serial/Makefile
index eedfec4..cf650f0 100644
--- a/drivers/tty/serial/Makefile
+++ b/drivers/tty/serial/Makefile

@@ -85,3 +85,4 @@
 obj-$(CONFIG_SERIAL_EFM32_UART) += efm32-uart.o
 obj-$(CONFIG_SERIAL_ARC)	+= arc_uart.o
 obj-$(CONFIG_SERIAL_RP2)	+= rp2.o
+obj-$(CONFIG_SERIAL_FSL_LPUART)	+= fsl_lpuart.o

diff --git a/drivers/tty/serial/altera_uart.c b/drivers/tty/serial/altera_uart.c
index 13471dd..1d46966 100644
--- a/drivers/tty/serial/altera_uart.c
+++ b/drivers/tty/serial/altera_uart.c

@@ -604,7 +604,6 @@
 
 	if (port) {
 		uart_remove_one_port(&altera_uart_driver, port);
-		platform_set_drvdata(pdev, NULL);
 		port->mapbase = 0;
 	}
 

diff --git a/drivers/tty/serial/amba-pl011.c b/drivers/tty/serial/amba-pl011.c
index e2774f9..ad41319 100644
--- a/drivers/tty/serial/amba-pl011.c
+++ b/drivers/tty/serial/amba-pl011.c

@@ -79,13 +79,12 @@
 	bool			dma_threshold;
 	bool			cts_event_workaround;
 
-	unsigned int (*get_fifosize)(unsigned int periphid);
+	unsigned int (*get_fifosize)(struct amba_device *dev);
 };
 
-static unsigned int get_fifosize_arm(unsigned int periphid)
+static unsigned int get_fifosize_arm(struct amba_device *dev)
 {
-	unsigned int rev = (periphid >> 20) & 0xf;
-	return rev < 3 ? 16 : 32;
+	return amba_rev(dev) < 3 ? 16 : 32;
 }
 
 static struct vendor_data vendor_arm = {
@@ -98,7 +97,7 @@
 	.get_fifosize		= get_fifosize_arm,
 };
 
-static unsigned int get_fifosize_st(unsigned int periphid)
+static unsigned int get_fifosize_st(struct amba_device *dev)
 {
 	return 64;
 }
@@ -2157,7 +2156,7 @@
 	uap->lcrh_rx = vendor->lcrh_rx;
 	uap->lcrh_tx = vendor->lcrh_tx;
 	uap->old_cr = 0;
-	uap->fifosize = vendor->get_fifosize(dev->periphid);
+	uap->fifosize = vendor->get_fifosize(dev);
 	uap->port.dev = &dev->dev;
 	uap->port.mapbase = dev->res.start;
 	uap->port.membase = base;

diff --git a/drivers/tty/serial/atmel_serial.c b/drivers/tty/serial/atmel_serial.c
index 3467462..691265f 100644
--- a/drivers/tty/serial/atmel_serial.c
+++ b/drivers/tty/serial/atmel_serial.c

@@ -1100,7 +1100,7 @@
 		 * Enable the peripheral clock for this serial port.
 		 * This is called on uart_open() or a resume event.
 		 */
-		clk_enable(atmel_port->clk);
+		clk_prepare_enable(atmel_port->clk);
 
 		/* re-enable interrupts if we disabled some on suspend */
 		UART_PUT_IER(port, atmel_port->backup_imr);
@@ -1114,7 +1114,7 @@
 		 * Disable the peripheral clock for this serial port.
 		 * This is called on uart_close() or a suspend event.
 		 */
-		clk_disable(atmel_port->clk);
+		clk_disable_unprepare(atmel_port->clk);
 		break;
 	default:
 		printk(KERN_ERR "atmel_serial: unknown pm %d\n", state);
@@ -1458,9 +1458,10 @@
 /*
  * Configure the port from the platform device resource info.
  */
-static void atmel_init_port(struct atmel_uart_port *atmel_port,
+static int atmel_init_port(struct atmel_uart_port *atmel_port,
 				      struct platform_device *pdev)
 {
+	int ret;
 	struct uart_port *port = &atmel_port->uart;
 	struct atmel_uart_data *pdata = pdev->dev.platform_data;
 
@@ -1496,9 +1497,19 @@
 	/* for console, the clock could already be configured */
 	if (!atmel_port->clk) {
 		atmel_port->clk = clk_get(&pdev->dev, "usart");
-		clk_enable(atmel_port->clk);
+		if (IS_ERR(atmel_port->clk)) {
+			ret = PTR_ERR(atmel_port->clk);
+			atmel_port->clk = NULL;
+			return ret;
+		}
+		ret = clk_prepare_enable(atmel_port->clk);
+		if (ret) {
+			clk_put(atmel_port->clk);
+			atmel_port->clk = NULL;
+			return ret;
+		}
 		port->uartclk = clk_get_rate(atmel_port->clk);
-		clk_disable(atmel_port->clk);
+		clk_disable_unprepare(atmel_port->clk);
 		/* only enable clock when USART is in use */
 	}
 
@@ -1511,6 +1522,8 @@
 	} else {
 		atmel_port->tx_done_mask = ATMEL_US_TXRDY;
 	}
+
+	return 0;
 }
 
 struct platform_device *atmel_default_console_device;	/* the serial console device */
@@ -1601,6 +1614,7 @@
 
 static int __init atmel_console_setup(struct console *co, char *options)
 {
+	int ret;
 	struct uart_port *port = &atmel_ports[co->index].uart;
 	int baud = 115200;
 	int bits = 8;
@@ -1612,7 +1626,9 @@
 		return -ENODEV;
 	}
 
-	clk_enable(atmel_ports[co->index].clk);
+	ret = clk_prepare_enable(atmel_ports[co->index].clk);
+	if (ret)
+		return ret;
 
 	UART_PUT_IDR(port, -1);
 	UART_PUT_CR(port, ATMEL_US_RSTSTA | ATMEL_US_RSTRX);
@@ -1645,6 +1661,7 @@
  */
 static int __init atmel_console_init(void)
 {
+	int ret;
 	if (atmel_default_console_device) {
 		struct atmel_uart_data *pdata =
 			atmel_default_console_device->dev.platform_data;
@@ -1655,7 +1672,9 @@
 		port->uart.line = id;
 
 		add_preferred_console(ATMEL_DEVICENAME, id, NULL);
-		atmel_init_port(port, atmel_default_console_device);
+		ret = atmel_init_port(port, atmel_default_console_device);
+		if (ret)
+			return ret;
 		register_console(&atmel_console);
 	}
 
@@ -1786,7 +1805,9 @@
 	port->backup_imr = 0;
 	port->uart.line = ret;
 
-	atmel_init_port(port, pdev);
+	ret = atmel_init_port(port, pdev);
+	if (ret)
+		goto err;
 
 	pinctrl = devm_pinctrl_get_select_default(&pdev->dev);
 	if (IS_ERR(pinctrl)) {
@@ -1812,9 +1833,9 @@
 			&& ATMEL_CONSOLE_DEVICE->flags & CON_ENABLED) {
 		/*
 		 * The serial core enabled the clock for us, so undo
-		 * the clk_enable() in atmel_console_setup()
+		 * the clk_prepare_enable() in atmel_console_setup()
 		 */
-		clk_disable(port->clk);
+		clk_disable_unprepare(port->clk);
 	}
 #endif
 

diff --git a/drivers/tty/serial/cpm_uart/cpm_uart_core.c b/drivers/tty/serial/cpm_uart/cpm_uart_core.c
index 97f4e18..f7672ca 100644
--- a/drivers/tty/serial/cpm_uart/cpm_uart_core.c
+++ b/drivers/tty/serial/cpm_uart/cpm_uart_core.c

@@ -1384,7 +1384,7 @@
 	if (index >= UART_NR)
 		return -ENODEV;
 
-	dev_set_drvdata(&ofdev->dev, pinfo);
+	platform_set_drvdata(ofdev, pinfo);
 
 	/* initialize the device pointer for the port */
 	pinfo->port.dev = &ofdev->dev;
@@ -1398,7 +1398,7 @@
 
 static int cpm_uart_remove(struct platform_device *ofdev)
 {
-	struct uart_cpm_port *pinfo = dev_get_drvdata(&ofdev->dev);
+	struct uart_cpm_port *pinfo = platform_get_drvdata(ofdev);
 	return uart_remove_one_port(&cpm_reg, &pinfo->port);
 }
 

diff --git a/drivers/tty/serial/fsl_lpuart.c b/drivers/tty/serial/fsl_lpuart.c
new file mode 100644
index 0000000..263cfaa
--- /dev/null
+++ b/drivers/tty/serial/fsl_lpuart.c

@@ -0,0 +1,874 @@
+/*
+ *  Freescale lpuart serial port driver
+ *
+ *  Copyright 2012-2013 Freescale Semiconductor, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#if defined(CONFIG_SERIAL_FSL_LPUART_CONSOLE) && defined(CONFIG_MAGIC_SYSRQ)
+#define SUPPORT_SYSRQ
+#endif
+
+#include <linux/module.h>
+#include <linux/io.h>
+#include <linux/irq.h>
+#include <linux/clk.h>
+#include <linux/of.h>
+#include <linux/of_device.h>
+#include <linux/console.h>
+#include <linux/serial_core.h>
+#include <linux/tty_flip.h>
+
+/* All registers are 8-bit width */
+#define UARTBDH			0x00
+#define UARTBDL			0x01
+#define UARTCR1			0x02
+#define UARTCR2			0x03
+#define UARTSR1			0x04
+#define UARTCR3			0x06
+#define UARTDR			0x07
+#define UARTCR4			0x0a
+#define UARTCR5			0x0b
+#define UARTMODEM		0x0d
+#define UARTPFIFO		0x10
+#define UARTCFIFO		0x11
+#define UARTSFIFO		0x12
+#define UARTTWFIFO		0x13
+#define UARTTCFIFO		0x14
+#define UARTRWFIFO		0x15
+
+#define UARTBDH_LBKDIE		0x80
+#define UARTBDH_RXEDGIE		0x40
+#define UARTBDH_SBR_MASK	0x1f
+
+#define UARTCR1_LOOPS		0x80
+#define UARTCR1_RSRC		0x20
+#define UARTCR1_M		0x10
+#define UARTCR1_WAKE		0x08
+#define UARTCR1_ILT		0x04
+#define UARTCR1_PE		0x02
+#define UARTCR1_PT		0x01
+
+#define UARTCR2_TIE		0x80
+#define UARTCR2_TCIE		0x40
+#define UARTCR2_RIE		0x20
+#define UARTCR2_ILIE		0x10
+#define UARTCR2_TE		0x08
+#define UARTCR2_RE		0x04
+#define UARTCR2_RWU		0x02
+#define UARTCR2_SBK		0x01
+
+#define UARTSR1_TDRE		0x80
+#define UARTSR1_TC		0x40
+#define UARTSR1_RDRF		0x20
+#define UARTSR1_IDLE		0x10
+#define UARTSR1_OR		0x08
+#define UARTSR1_NF		0x04
+#define UARTSR1_FE		0x02
+#define UARTSR1_PE		0x01
+
+#define UARTCR3_R8		0x80
+#define UARTCR3_T8		0x40
+#define UARTCR3_TXDIR		0x20
+#define UARTCR3_TXINV		0x10
+#define UARTCR3_ORIE		0x08
+#define UARTCR3_NEIE		0x04
+#define UARTCR3_FEIE		0x02
+#define UARTCR3_PEIE		0x01
+
+#define UARTCR4_MAEN1		0x80
+#define UARTCR4_MAEN2		0x40
+#define UARTCR4_M10		0x20
+#define UARTCR4_BRFA_MASK	0x1f
+#define UARTCR4_BRFA_OFF	0
+
+#define UARTCR5_TDMAS		0x80
+#define UARTCR5_RDMAS		0x20
+
+#define UARTMODEM_RXRTSE	0x08
+#define UARTMODEM_TXRTSPOL	0x04
+#define UARTMODEM_TXRTSE	0x02
+#define UARTMODEM_TXCTSE	0x01
+
+#define UARTPFIFO_TXFE		0x80
+#define UARTPFIFO_FIFOSIZE_MASK	0x7
+#define UARTPFIFO_TXSIZE_OFF	4
+#define UARTPFIFO_RXFE		0x08
+#define UARTPFIFO_RXSIZE_OFF	0
+
+#define UARTCFIFO_TXFLUSH	0x80
+#define UARTCFIFO_RXFLUSH	0x40
+#define UARTCFIFO_RXOFE		0x04
+#define UARTCFIFO_TXOFE		0x02
+#define UARTCFIFO_RXUFE		0x01
+
+#define UARTSFIFO_TXEMPT	0x80
+#define UARTSFIFO_RXEMPT	0x40
+#define UARTSFIFO_RXOF		0x04
+#define UARTSFIFO_TXOF		0x02
+#define UARTSFIFO_RXUF		0x01
+
+#define DRIVER_NAME	"fsl-lpuart"
+#define DEV_NAME	"ttyLP"
+#define UART_NR		6
+
+struct lpuart_port {
+	struct uart_port	port;
+	struct clk		*clk;
+	unsigned int		txfifo_size;
+	unsigned int		rxfifo_size;
+};
+
+static struct of_device_id lpuart_dt_ids[] = {
+	{
+		.compatible = "fsl,vf610-lpuart",
+	},
+	{ /* sentinel */ }
+};
+MODULE_DEVICE_TABLE(of, lpuart_dt_ids);
+
+static void lpuart_stop_tx(struct uart_port *port)
+{
+	unsigned char temp;
+
+	temp = readb(port->membase + UARTCR2);
+	temp &= ~(UARTCR2_TIE | UARTCR2_TCIE);
+	writeb(temp, port->membase + UARTCR2);
+}
+
+static void lpuart_stop_rx(struct uart_port *port)
+{
+	unsigned char temp;
+
+	temp = readb(port->membase + UARTCR2);
+	writeb(temp & ~UARTCR2_RE, port->membase + UARTCR2);
+}
+
+static void lpuart_enable_ms(struct uart_port *port)
+{
+}
+
+static inline void lpuart_transmit_buffer(struct lpuart_port *sport)
+{
+	struct circ_buf *xmit = &sport->port.state->xmit;
+
+	while (!uart_circ_empty(xmit) &&
+		(readb(sport->port.membase + UARTTCFIFO) < sport->txfifo_size)) {
+		writeb(xmit->buf[xmit->tail], sport->port.membase + UARTDR);
+		xmit->tail = (xmit->tail + 1) & (UART_XMIT_SIZE - 1);
+		sport->port.icount.tx++;
+	}
+
+	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+		uart_write_wakeup(&sport->port);
+
+	if (uart_circ_empty(xmit))
+		lpuart_stop_tx(&sport->port);
+}
+
+static void lpuart_start_tx(struct uart_port *port)
+{
+	struct lpuart_port *sport = container_of(port, struct lpuart_port, port);
+	unsigned char temp;
+
+	temp = readb(port->membase + UARTCR2);
+	writeb(temp | UARTCR2_TIE, port->membase + UARTCR2);
+
+	if (readb(port->membase + UARTSR1) & UARTSR1_TDRE)
+		lpuart_transmit_buffer(sport);
+}
+
+static irqreturn_t lpuart_txint(int irq, void *dev_id)
+{
+	struct lpuart_port *sport = dev_id;
+	struct circ_buf *xmit = &sport->port.state->xmit;
+	unsigned long flags;
+
+	spin_lock_irqsave(&sport->port.lock, flags);
+	if (sport->port.x_char) {
+		writeb(sport->port.x_char, sport->port.membase + UARTDR);
+		goto out;
+	}
+
+	if (uart_circ_empty(xmit) || uart_tx_stopped(&sport->port)) {
+		lpuart_stop_tx(&sport->port);
+		goto out;
+	}
+
+	lpuart_transmit_buffer(sport);
+
+	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+		uart_write_wakeup(&sport->port);
+
+out:
+	spin_unlock_irqrestore(&sport->port.lock, flags);
+	return IRQ_HANDLED;
+}
+
+static irqreturn_t lpuart_rxint(int irq, void *dev_id)
+{
+	struct lpuart_port *sport = dev_id;
+	unsigned int flg, ignored = 0;
+	struct tty_port *port = &sport->port.state->port;
+	unsigned long flags;
+	unsigned char rx, sr;
+
+	spin_lock_irqsave(&sport->port.lock, flags);
+
+	while (!(readb(sport->port.membase + UARTSFIFO) & UARTSFIFO_RXEMPT)) {
+		flg = TTY_NORMAL;
+		sport->port.icount.rx++;
+		/*
+		 * to clear the FE, OR, NF, FE, PE flags,
+		 * read SR1 then read DR
+		 */
+		sr = readb(sport->port.membase + UARTSR1);
+		rx = readb(sport->port.membase + UARTDR);
+
+		if (uart_handle_sysrq_char(&sport->port, (unsigned char)rx))
+			continue;
+
+		if (sr & (UARTSR1_PE | UARTSR1_OR | UARTSR1_FE)) {
+			if (sr & UARTSR1_PE)
+				sport->port.icount.parity++;
+			else if (sr & UARTSR1_FE)
+				sport->port.icount.frame++;
+
+			if (sr & UARTSR1_OR)
+				sport->port.icount.overrun++;
+
+			if (sr & sport->port.ignore_status_mask) {
+				if (++ignored > 100)
+					goto out;
+				continue;
+			}
+
+			sr &= sport->port.read_status_mask;
+
+			if (sr & UARTSR1_PE)
+				flg = TTY_PARITY;
+			else if (sr & UARTSR1_FE)
+				flg = TTY_FRAME;
+
+			if (sr & UARTSR1_OR)
+				flg = TTY_OVERRUN;
+
+#ifdef SUPPORT_SYSRQ
+			sport->port.sysrq = 0;
+#endif
+		}
+
+		tty_insert_flip_char(port, rx, flg);
+	}
+
+out:
+	spin_unlock_irqrestore(&sport->port.lock, flags);
+
+	tty_flip_buffer_push(port);
+	return IRQ_HANDLED;
+}
+
+static irqreturn_t lpuart_int(int irq, void *dev_id)
+{
+	struct lpuart_port *sport = dev_id;
+	unsigned char sts;
+
+	sts = readb(sport->port.membase + UARTSR1);
+
+	if (sts & UARTSR1_RDRF)
+		lpuart_rxint(irq, dev_id);
+
+	if (sts & UARTSR1_TDRE &&
+		!(readb(sport->port.membase + UARTCR5) & UARTCR5_TDMAS))
+		lpuart_txint(irq, dev_id);
+
+	return IRQ_HANDLED;
+}
+
+/* return TIOCSER_TEMT when transmitter is not busy */
+static unsigned int lpuart_tx_empty(struct uart_port *port)
+{
+	return (readb(port->membase + UARTSR1) & UARTSR1_TC) ?
+		TIOCSER_TEMT : 0;
+}
+
+static unsigned int lpuart_get_mctrl(struct uart_port *port)
+{
+	unsigned int temp = 0;
+	unsigned char reg;
+
+	reg = readb(port->membase + UARTMODEM);
+	if (reg & UARTMODEM_TXCTSE)
+		temp |= TIOCM_CTS;
+
+	if (reg & UARTMODEM_RXRTSE)
+		temp |= TIOCM_RTS;
+
+	return temp;
+}
+
+static void lpuart_set_mctrl(struct uart_port *port, unsigned int mctrl)
+{
+	unsigned char temp;
+
+	temp = readb(port->membase + UARTMODEM) &
+			~(UARTMODEM_RXRTSE | UARTMODEM_TXCTSE);
+
+	if (mctrl & TIOCM_RTS)
+		temp |= UARTMODEM_RXRTSE;
+
+	if (mctrl & TIOCM_CTS)
+		temp |= UARTMODEM_TXCTSE;
+
+	writeb(temp, port->membase + UARTMODEM);
+}
+
+static void lpuart_break_ctl(struct uart_port *port, int break_state)
+{
+	unsigned char temp;
+
+	temp = readb(port->membase + UARTCR2) & ~UARTCR2_SBK;
+
+	if (break_state != 0)
+		temp |= UARTCR2_SBK;
+
+	writeb(temp, port->membase + UARTCR2);
+}
+
+static void lpuart_setup_watermark(struct lpuart_port *sport)
+{
+	unsigned char val, cr2;
+
+	cr2 = readb(sport->port.membase + UARTCR2);
+	cr2 &= ~(UARTCR2_TIE | UARTCR2_TCIE | UARTCR2_TE |
+			UARTCR2_RIE | UARTCR2_RE);
+	writeb(cr2, sport->port.membase + UARTCR2);
+
+	/* determine FIFO size and enable FIFO mode */
+	val = readb(sport->port.membase + UARTPFIFO);
+
+	sport->txfifo_size = 0x1 << (((val >> UARTPFIFO_TXSIZE_OFF) &
+		UARTPFIFO_FIFOSIZE_MASK) + 1);
+
+	sport->rxfifo_size = 0x1 << (((val >> UARTPFIFO_RXSIZE_OFF) &
+		UARTPFIFO_FIFOSIZE_MASK) + 1);
+
+	writeb(val | UARTPFIFO_TXFE | UARTPFIFO_RXFE,
+			sport->port.membase + UARTPFIFO);
+
+	/* flush Tx and Rx FIFO */
+	writeb(UARTCFIFO_TXFLUSH | UARTCFIFO_RXFLUSH,
+			sport->port.membase + UARTCFIFO);
+
+	writeb(2, sport->port.membase + UARTTWFIFO);
+	writeb(1, sport->port.membase + UARTRWFIFO);
+}
+
+static int lpuart_startup(struct uart_port *port)
+{
+	struct lpuart_port *sport = container_of(port, struct lpuart_port, port);
+	int ret;
+	unsigned long flags;
+	unsigned char temp;
+
+	ret = devm_request_irq(port->dev, port->irq, lpuart_int, 0,
+				DRIVER_NAME, sport);
+	if (ret)
+		return ret;
+
+	spin_lock_irqsave(&sport->port.lock, flags);
+
+	lpuart_setup_watermark(sport);
+
+	temp = readb(sport->port.membase + UARTCR2);
+	temp |= (UARTCR2_RIE | UARTCR2_TIE | UARTCR2_RE | UARTCR2_TE);
+	writeb(temp, sport->port.membase + UARTCR2);
+
+	spin_unlock_irqrestore(&sport->port.lock, flags);
+	return 0;
+}
+
+static void lpuart_shutdown(struct uart_port *port)
+{
+	struct lpuart_port *sport = container_of(port, struct lpuart_port, port);
+	unsigned char temp;
+	unsigned long flags;
+
+	spin_lock_irqsave(&port->lock, flags);
+
+	/* disable Rx/Tx and interrupts */
+	temp = readb(port->membase + UARTCR2);
+	temp &= ~(UARTCR2_TE | UARTCR2_RE |
+			UARTCR2_TIE | UARTCR2_TCIE | UARTCR2_RIE);
+	writeb(temp, port->membase + UARTCR2);
+
+	spin_unlock_irqrestore(&port->lock, flags);
+
+	devm_free_irq(port->dev, port->irq, sport);
+}
+
+static void
+lpuart_set_termios(struct uart_port *port, struct ktermios *termios,
+		   struct ktermios *old)
+{
+	struct lpuart_port *sport = container_of(port, struct lpuart_port, port);
+	unsigned long flags;
+	unsigned char cr1, old_cr1, old_cr2, cr4, bdh, modem;
+	unsigned int  baud;
+	unsigned int old_csize = old ? old->c_cflag & CSIZE : CS8;
+	unsigned int sbr, brfa;
+
+	cr1 = old_cr1 = readb(sport->port.membase + UARTCR1);
+	old_cr2 = readb(sport->port.membase + UARTCR2);
+	cr4 = readb(sport->port.membase + UARTCR4);
+	bdh = readb(sport->port.membase + UARTBDH);
+	modem = readb(sport->port.membase + UARTMODEM);
+	/*
+	 * only support CS8 and CS7, and for CS7 must enable PE.
+	 * supported mode:
+	 *  - (7,e/o,1)
+	 *  - (8,n,1)
+	 *  - (8,m/s,1)
+	 *  - (8,e/o,1)
+	 */
+	while ((termios->c_cflag & CSIZE) != CS8 &&
+		(termios->c_cflag & CSIZE) != CS7) {
+		termios->c_cflag &= ~CSIZE;
+		termios->c_cflag |= old_csize;
+		old_csize = CS8;
+	}
+
+	if ((termios->c_cflag & CSIZE) == CS8 ||
+		(termios->c_cflag & CSIZE) == CS7)
+		cr1 = old_cr1 & ~UARTCR1_M;
+
+	if (termios->c_cflag & CMSPAR) {
+		if ((termios->c_cflag & CSIZE) != CS8) {
+			termios->c_cflag &= ~CSIZE;
+			termios->c_cflag |= CS8;
+		}
+		cr1 |= UARTCR1_M;
+	}
+
+	if (termios->c_cflag & CRTSCTS) {
+		modem |= (UARTMODEM_RXRTSE | UARTMODEM_TXCTSE);
+	} else {
+		termios->c_cflag &= ~CRTSCTS;
+		modem &= ~(UARTMODEM_RXRTSE | UARTMODEM_TXCTSE);
+	}
+
+	if (termios->c_cflag & CSTOPB)
+		termios->c_cflag &= ~CSTOPB;
+
+	/* parity must be enabled when CS7 to match 8-bits format */
+	if ((termios->c_cflag & CSIZE) == CS7)
+		termios->c_cflag |= PARENB;
+
+	if ((termios->c_cflag & PARENB)) {
+		if (termios->c_cflag & CMSPAR) {
+			cr1 &= ~UARTCR1_PE;
+			cr1 |= UARTCR1_M;
+		} else {
+			cr1 |= UARTCR1_PE;
+			if ((termios->c_cflag & CSIZE) == CS8)
+				cr1 |= UARTCR1_M;
+			if (termios->c_cflag & PARODD)
+				cr1 |= UARTCR1_PT;
+			else
+				cr1 &= ~UARTCR1_PT;
+		}
+	}
+
+	/* ask the core to calculate the divisor */
+	baud = uart_get_baud_rate(port, termios, old, 50, port->uartclk / 16);
+
+	spin_lock_irqsave(&sport->port.lock, flags);
+
+	sport->port.read_status_mask = 0;
+	if (termios->c_iflag & INPCK)
+		sport->port.read_status_mask |=	(UARTSR1_FE | UARTSR1_PE);
+	if (termios->c_iflag & (BRKINT | PARMRK))
+		sport->port.read_status_mask |= UARTSR1_FE;
+
+	/* characters to ignore */
+	sport->port.ignore_status_mask = 0;
+	if (termios->c_iflag & IGNPAR)
+		sport->port.ignore_status_mask |= UARTSR1_PE;
+	if (termios->c_iflag & IGNBRK) {
+		sport->port.ignore_status_mask |= UARTSR1_FE;
+		/*
+		 * if we're ignoring parity and break indicators,
+		 * ignore overruns too (for real raw support).
+		 */
+		if (termios->c_iflag & IGNPAR)
+			sport->port.ignore_status_mask |= UARTSR1_OR;
+	}
+
+	/* update the per-port timeout */
+	uart_update_timeout(port, termios->c_cflag, baud);
+
+	/* wait transmit engin complete */
+	while (!(readb(sport->port.membase + UARTSR1) & UARTSR1_TC))
+		barrier();
+
+	/* disable transmit and receive */
+	writeb(old_cr2 & ~(UARTCR2_TE | UARTCR2_RE),
+			sport->port.membase + UARTCR2);
+
+	sbr = sport->port.uartclk / (16 * baud);
+	brfa = ((sport->port.uartclk - (16 * sbr * baud)) * 2) / baud;
+	bdh &= ~UARTBDH_SBR_MASK;
+	bdh |= (sbr >> 8) & 0x1F;
+	cr4 &= ~UARTCR4_BRFA_MASK;
+	brfa &= UARTCR4_BRFA_MASK;
+	writeb(cr4 | brfa, sport->port.membase + UARTCR4);
+	writeb(bdh, sport->port.membase + UARTBDH);
+	writeb(sbr & 0xFF, sport->port.membase + UARTBDL);
+	writeb(cr1, sport->port.membase + UARTCR1);
+	writeb(modem, sport->port.membase + UARTMODEM);
+
+	/* restore control register */
+	writeb(old_cr2, sport->port.membase + UARTCR2);
+
+	spin_unlock_irqrestore(&sport->port.lock, flags);
+}
+
+static const char *lpuart_type(struct uart_port *port)
+{
+	return "FSL_LPUART";
+}
+
+static void lpuart_release_port(struct uart_port *port)
+{
+	/* nothing to do */
+}
+
+static int lpuart_request_port(struct uart_port *port)
+{
+	return  0;
+}
+
+/* configure/autoconfigure the port */
+static void lpuart_config_port(struct uart_port *port, int flags)
+{
+	if (flags & UART_CONFIG_TYPE)
+		port->type = PORT_LPUART;
+}
+
+/*
+ * Validate a user-supplied serial_struct against this port.
+ *
+ * Returns 0 when every checked field agrees with the port, -EINVAL as
+ * soon as one of them does not: the type must be PORT_UNKNOWN or
+ * PORT_LPUART, the io_type must be UPIO_MEM, hub6 must be 0, and the
+ * irq, baud_base and port fields must match the port's own values.
+ */
+static int lpuart_verify_port(struct uart_port *port, struct serial_struct *ser)
+{
+	if ((ser->type != PORT_UNKNOWN && ser->type != PORT_LPUART) ||
+	    port->irq != ser->irq ||
+	    ser->io_type != UPIO_MEM ||
+	    port->uartclk / 16 != ser->baud_base ||
+	    port->iobase != ser->port ||
+	    ser->hub6 != 0)
+		return -EINVAL;
+
+	return 0;
+}
+
+static struct uart_ops lpuart_pops = {
+	.tx_empty	= lpuart_tx_empty,
+	.set_mctrl	= lpuart_set_mctrl,
+	.get_mctrl	= lpuart_get_mctrl,
+	.stop_tx	= lpuart_stop_tx,
+	.start_tx	= lpuart_start_tx,
+	.stop_rx	= lpuart_stop_rx,
+	.enable_ms	= lpuart_enable_ms,
+	.break_ctl	= lpuart_break_ctl,
+	.startup	= lpuart_startup,
+	.shutdown	= lpuart_shutdown,
+	.set_termios	= lpuart_set_termios,
+	.type		= lpuart_type,
+	.request_port	= lpuart_request_port,
+	.release_port	= lpuart_release_port,
+	.config_port	= lpuart_config_port,
+	.verify_port	= lpuart_verify_port,
+};
+
+static struct lpuart_port *lpuart_ports[UART_NR];
+
+#ifdef CONFIG_SERIAL_FSL_LPUART_CONSOLE
+static void lpuart_console_putchar(struct uart_port *port, int ch)
+{
+	while (!(readb(port->membase + UARTSR1) & UARTSR1_TDRE))
+		barrier();
+
+	writeb(ch, port->membase + UARTDR);
+}
+
+static void
+lpuart_console_write(struct console *co, const char *s, unsigned int count)
+{
+	struct lpuart_port *sport = lpuart_ports[co->index];
+	unsigned char  old_cr2, cr2;
+
+	/* first save CR2 and then disable interrupts */
+	cr2 = old_cr2 = readb(sport->port.membase + UARTCR2);
+	cr2 |= (UARTCR2_TE |  UARTCR2_RE);
+	cr2 &= ~(UARTCR2_TIE | UARTCR2_TCIE | UARTCR2_RIE);
+	writeb(cr2, sport->port.membase + UARTCR2);
+
+	uart_console_write(&sport->port, s, count, lpuart_console_putchar);
+
+	/* wait for transmitter finish complete and restore CR2 */
+	while (!(readb(sport->port.membase + UARTSR1) & UARTSR1_TC))
+		barrier();
+
+	writeb(old_cr2, sport->port.membase + UARTCR2);
+}
+
+/*
+ * If the port was already initialised (e.g. by a boot loader),
+ * try to determine the current setup.
+ *
+ * Nothing is touched unless the transmitter or receiver is enabled in
+ * UARTCR2.  @parity and @bits are then overwritten from UARTCR1.  @baud
+ * is only compared with the rate derived from the divisor registers
+ * (a message is logged when they differ); it is never modified.
+ */
+static void __init
+lpuart_console_get_options(struct lpuart_port *sport, int *baud,
+			   int *parity, int *bits)
+{
+	unsigned char cr, bdh, bdl, brfa;
+	unsigned int sbr, uartclk, baud_raw, divisor;
+
+	cr = readb(sport->port.membase + UARTCR2);
+	cr &= UARTCR2_TE | UARTCR2_RE;
+	if (!cr)
+		return;
+
+	/* ok, the port was enabled */
+
+	cr = readb(sport->port.membase + UARTCR1);
+
+	*parity = 'n';
+	if (cr & UARTCR1_PE) {
+		if (cr & UARTCR1_PT)
+			*parity = 'o';
+		else
+			*parity = 'e';
+	}
+
+	if (cr & UARTCR1_M)
+		*bits = 9;
+	else
+		*bits = 8;
+
+	bdh = readb(sport->port.membase + UARTBDH);
+	bdh &= UARTBDH_SBR_MASK;
+	bdl = readb(sport->port.membase + UARTBDL);
+	sbr = bdh;
+	sbr <<= 8;
+	sbr |= bdl;
+	brfa = readb(sport->port.membase + UARTCR4);
+	brfa &= UARTCR4_BRFA_MASK;
+
+	uartclk = clk_get_rate(sport->clk);
+	/*
+	 * baud = mod_clk / (16 * (sbr[13] + brfa / 32))
+	 *      = 2 * mod_clk / (32 * sbr + brfa)
+	 *
+	 * brfa is at most 31, so "brfa / 32" in integer arithmetic is
+	 * always 0; work in 1/32 units to keep the fractional part.
+	 */
+	divisor = 32 * sbr + brfa;
+	if (!divisor)
+		return;		/* divisor not programmed, nothing to compare */
+
+	baud_raw = 2 * uartclk / divisor;
+
+	if (*baud != baud_raw)
+		printk(KERN_INFO "Serial: Console lpuart rounded baud rate "
+				"from %u to %d\n", baud_raw, *baud);
+}
+
+static int __init lpuart_console_setup(struct console *co, char *options)
+{
+	struct lpuart_port *sport;
+	int baud = 115200;
+	int bits = 8;
+	int parity = 'n';
+	int flow = 'n';
+
+	/*
+	 * check whether an invalid uart number has been specified, and
+	 * if so, search for the first available port that does have
+	 * console support.
+	 */
+	if (co->index == -1 || co->index >= ARRAY_SIZE(lpuart_ports))
+		co->index = 0;
+
+	sport = lpuart_ports[co->index];
+	if (sport == NULL)
+		return -ENODEV;
+
+	if (options)
+		uart_parse_options(options, &baud, &parity, &bits, &flow);
+	else
+		lpuart_console_get_options(sport, &baud, &parity, &bits);
+
+	lpuart_setup_watermark(sport);
+
+	return uart_set_options(&sport->port, co, baud, parity, bits, flow);
+}
+
+static struct uart_driver lpuart_reg;
+static struct console lpuart_console = {
+	.name		= DEV_NAME,
+	.write		= lpuart_console_write,
+	.device		= uart_console_device,
+	.setup		= lpuart_console_setup,
+	.flags		= CON_PRINTBUFFER,
+	.index		= -1,
+	.data		= &lpuart_reg,
+};
+
+#define LPUART_CONSOLE	(&lpuart_console)
+#else
+#define LPUART_CONSOLE	NULL
+#endif
+
+static struct uart_driver lpuart_reg = {
+	.owner		= THIS_MODULE,
+	.driver_name	= DRIVER_NAME,
+	.dev_name	= DEV_NAME,
+	.nr		= ARRAY_SIZE(lpuart_ports),
+	.cons		= LPUART_CONSOLE,
+};
+
+/*
+ * Bind one lpuart platform device.
+ *
+ * Takes the port number from the device tree "serial" alias, maps the
+ * first memory resource, looks up IRQ 0 and the "ipg" clock, enables
+ * the clock and registers the port with the serial core.
+ *
+ * Returns 0 on success or a negative errno.  On failure after the clock
+ * was enabled, the clock is disabled again and the lpuart_ports[] slot
+ * is cleared.
+ */
+static int lpuart_probe(struct platform_device *pdev)
+{
+	struct device_node *np = pdev->dev.of_node;
+	struct lpuart_port *sport;
+	struct resource *res;
+	int ret;
+
+	sport = devm_kzalloc(&pdev->dev, sizeof(*sport), GFP_KERNEL);
+	if (!sport)
+		return -ENOMEM;
+
+	pdev->dev.coherent_dma_mask = 0;
+
+	ret = of_alias_get_id(np, "serial");
+	if (ret < 0) {
+		dev_err(&pdev->dev, "failed to get alias id, errno %d\n", ret);
+		return ret;
+	}
+	/* the alias id indexes lpuart_ports[] below, so it must be in range */
+	if (ret >= ARRAY_SIZE(lpuart_ports)) {
+		dev_err(&pdev->dev, "serial%d out of range\n", ret);
+		return -EINVAL;
+	}
+	sport->port.line = ret;
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (!res)
+		return -ENODEV;
+
+	sport->port.mapbase = res->start;
+	sport->port.membase = devm_ioremap_resource(&pdev->dev, res);
+	if (IS_ERR(sport->port.membase))
+		return PTR_ERR(sport->port.membase);
+
+	sport->port.dev = &pdev->dev;
+	sport->port.type = PORT_LPUART;
+	sport->port.iotype = UPIO_MEM;
+	sport->port.irq = platform_get_irq(pdev, 0);
+	sport->port.ops = &lpuart_pops;
+	sport->port.flags = UPF_BOOT_AUTOCONF;
+
+	sport->clk = devm_clk_get(&pdev->dev, "ipg");
+	if (IS_ERR(sport->clk)) {
+		ret = PTR_ERR(sport->clk);
+		dev_err(&pdev->dev, "failed to get uart clk: %d\n", ret);
+		return ret;
+	}
+
+	ret = clk_prepare_enable(sport->clk);
+	if (ret) {
+		dev_err(&pdev->dev, "failed to enable uart clk: %d\n", ret);
+		return ret;
+	}
+
+	sport->port.uartclk = clk_get_rate(sport->clk);
+
+	lpuart_ports[sport->port.line] = sport;
+
+	platform_set_drvdata(pdev, &sport->port);
+
+	ret = uart_add_one_port(&lpuart_reg, &sport->port);
+	if (ret) {
+		/*
+		 * sport is devm-allocated and goes away when probe fails;
+		 * do not leave a stale pointer in the port table.
+		 */
+		lpuart_ports[sport->port.line] = NULL;
+		clk_disable_unprepare(sport->clk);
+		return ret;
+	}
+
+	return 0;
+}
+
+static int lpuart_remove(struct platform_device *pdev)
+{
+	struct lpuart_port *sport = platform_get_drvdata(pdev);
+
+	uart_remove_one_port(&lpuart_reg, &sport->port);
+
+	clk_disable_unprepare(sport->clk);
+
+	return 0;
+}
+
+#ifdef CONFIG_PM_SLEEP
+static int lpuart_suspend(struct device *dev)
+{
+	struct lpuart_port *sport = dev_get_drvdata(dev);
+
+	uart_suspend_port(&lpuart_reg, &sport->port);
+
+	return 0;
+}
+
+static int lpuart_resume(struct device *dev)
+{
+	struct lpuart_port *sport = dev_get_drvdata(dev);
+
+	uart_resume_port(&lpuart_reg, &sport->port);
+
+	return 0;
+}
+#endif
+
+static SIMPLE_DEV_PM_OPS(lpuart_pm_ops, lpuart_suspend, lpuart_resume);
+
+static struct platform_driver lpuart_driver = {
+	.probe		= lpuart_probe,
+	.remove		= lpuart_remove,
+	.driver		= {
+		.name	= "fsl-lpuart",
+		.owner	= THIS_MODULE,
+		.of_match_table = lpuart_dt_ids,
+		.pm	= &lpuart_pm_ops,
+	},
+};
+
+/*
+ * Module init: register the uart driver first, then the platform
+ * driver.  If the platform driver cannot be registered the uart driver
+ * is unregistered again and the error is returned to the caller.
+ */
+static int __init lpuart_serial_init(void)
+{
+	int ret;
+
+	pr_info("serial: Freescale lpuart driver\n");
+
+	ret = uart_register_driver(&lpuart_reg);
+	if (ret)
+		return ret;
+
+	ret = platform_driver_register(&lpuart_driver);
+	if (ret)
+		uart_unregister_driver(&lpuart_reg);
+
+	/* report the failure instead of claiming success */
+	return ret;
+}
+
+static void __exit lpuart_serial_exit(void)
+{
+	platform_driver_unregister(&lpuart_driver);
+	uart_unregister_driver(&lpuart_reg);
+}
+
+module_init(lpuart_serial_init);
+module_exit(lpuart_serial_exit);
+
+MODULE_DESCRIPTION("Freescale lpuart serial port driver");
+MODULE_LICENSE("GPL v2");

diff --git a/drivers/tty/serial/imx.c b/drivers/tty/serial/imx.c
index 8cdfbd3..415cec6 100644
--- a/drivers/tty/serial/imx.c
+++ b/drivers/tty/serial/imx.c

@@ -201,6 +201,7 @@
 	unsigned int		old_status;
 	int			txirq, rxirq, rtsirq;
 	unsigned int		have_rtscts:1;
+	unsigned int		dte_mode:1;
 	unsigned int		use_irda:1;
 	unsigned int		irda_inv_rx:1;
 	unsigned int		irda_inv_tx:1;
@@ -271,6 +272,7 @@
 /*
  * Save and restore functions for UCR1, UCR2 and UCR3 registers
  */
+#if defined(CONFIG_CONSOLE_POLL) || defined(CONFIG_SERIAL_IMX_CONSOLE)
 static void imx_port_ucrs_save(struct uart_port *port,
 			       struct imx_port_ucrs *ucr)
 {
@@ -288,6 +290,7 @@
 	writel(ucr->ucr2, port->membase + UCR2);
 	writel(ucr->ucr3, port->membase + UCR3);
 }
+#endif
 
 /*
  * Handle any change of modem status signal since we were last called.
@@ -449,6 +452,13 @@
 		temp &= ~(UCR1_RRDYEN);
 		writel(temp, sport->port.membase + UCR1);
 	}
+	/* Clear any pending ORE flag before enabling interrupt */
+	temp = readl(sport->port.membase + USR2);
+	writel(temp | USR2_ORE, sport->port.membase + USR2);
+
+	temp = readl(sport->port.membase + UCR4);
+	temp |= UCR4_OREN;
+	writel(temp, sport->port.membase + UCR4);
 
 	temp = readl(sport->port.membase + UCR1);
 	writel(temp | UCR1_TXMPTYEN, sport->port.membase + UCR1);
@@ -582,6 +592,7 @@
 {
 	struct imx_port *sport = dev_id;
 	unsigned int sts;
+	unsigned int sts2;
 
 	sts = readl(sport->port.membase + USR1);
 
@@ -598,6 +609,13 @@
 	if (sts & USR1_AWAKE)
 		writel(USR1_AWAKE, sport->port.membase + USR1);
 
+	sts2 = readl(sport->port.membase + USR2);
+	if (sts2 & USR2_ORE) {
+		dev_err(sport->port.dev, "Rx FIFO overrun\n");
+		sport->port.icount.overrun++;
+		writel(sts2 | USR2_ORE, sport->port.membase + USR2);
+	}
+
 	return IRQ_HANDLED;
 }
 
@@ -684,6 +702,17 @@
 	int retval;
 	unsigned long flags, temp;
 
+	if (!uart_console(port)) {
+		retval = clk_prepare_enable(sport->clk_per);
+		if (retval)
+			goto error_out1;
+		retval = clk_prepare_enable(sport->clk_ipg);
+		if (retval) {
+			clk_disable_unprepare(sport->clk_per);
+			goto error_out1;
+		}
+	}
+
 	imx_setup_ufcr(sport, 0);
 
 	/* disable the DREN bit (Data Ready interrupt enable) before
@@ -871,6 +900,11 @@
 
 	writel(temp, sport->port.membase + UCR1);
 	spin_unlock_irqrestore(&sport->port.lock, flags);
+
+	if (!uart_console(&sport->port)) {
+		clk_disable_unprepare(sport->clk_per);
+		clk_disable_unprepare(sport->clk_ipg);
+	}
 }
 
 static void
@@ -1007,6 +1041,8 @@
 
 	ufcr = readl(sport->port.membase + UFCR);
 	ufcr = (ufcr & (~UFCR_RFDIV)) | UFCR_RFDIV_REG(div);
+	if (sport->dte_mode)
+		ufcr |= UFCR_DCEDTE;
 	writel(ufcr, sport->port.membase + UFCR);
 
 	writel(num, sport->port.membase + UBIR);
@@ -1431,6 +1467,9 @@
 	if (of_get_property(np, "fsl,irda-mode", NULL))
 		sport->use_irda = 1;
 
+	if (of_get_property(np, "fsl,dte-mode", NULL))
+		sport->dte_mode = 1;
+
 	sport->devdata = of_id->data;
 
 	return 0;
@@ -1544,6 +1583,11 @@
 		goto deinit;
 	platform_set_drvdata(pdev, sport);
 
+	if (!uart_console(&sport->port)) {
+		clk_disable_unprepare(sport->clk_per);
+		clk_disable_unprepare(sport->clk_ipg);
+	}
+
 	return 0;
 deinit:
 	if (pdata && pdata->exit)
@@ -1565,9 +1609,6 @@
 
 	uart_remove_one_port(&imx_reg, &sport->port);
 
-	clk_disable_unprepare(sport->clk_per);
-	clk_disable_unprepare(sport->clk_ipg);
-
 	if (pdata && pdata->exit)
 		pdata->exit(pdev);
 

diff --git a/drivers/tty/serial/mfd.c b/drivers/tty/serial/mfd.c
index 5f4765a..e266eca 100644
--- a/drivers/tty/serial/mfd.c
+++ b/drivers/tty/serial/mfd.c

@@ -21,6 +21,10 @@
  *    be triggered
  */
 
+#if defined(CONFIG_SERIAL_MFD_HSU_CONSOLE) && defined(CONFIG_MAGIC_SYSRQ)
+#define SUPPORT_SYSRQ
+#endif
+
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/console.h>

diff --git a/drivers/tty/serial/mpc52xx_uart.c b/drivers/tty/serial/mpc52xx_uart.c
index f51b280..e1280a2 100644
--- a/drivers/tty/serial/mpc52xx_uart.c
+++ b/drivers/tty/serial/mpc52xx_uart.c

@@ -84,16 +84,6 @@
 static irqreturn_t mpc52xx_uart_int(int irq, void *dev_id);
 static irqreturn_t mpc5xxx_uart_process_int(struct uart_port *port);
 
-
-/* Simple macro to test if a port is console or not. This one is taken
- * for serial_core.c and maybe should be moved to serial_core.h ? */
-#ifdef CONFIG_SERIAL_CORE_CONSOLE
-#define uart_console(port) \
-	((port)->cons && (port)->cons->index == (port)->line)
-#else
-#define uart_console(port)	(0)
-#endif
-
 /* ======================================================================== */
 /* PSC fifo operations for isolating differences between 52xx and 512x      */
 /* ======================================================================== */
@@ -122,6 +112,15 @@
 	void		(*fifoc_uninit)(void);
 	void		(*get_irq)(struct uart_port *, struct device_node *);
 	irqreturn_t	(*handle_irq)(struct uart_port *port);
+	u16		(*get_status)(struct uart_port *port);
+	u8		(*get_ipcr)(struct uart_port *port);
+	void		(*command)(struct uart_port *port, u8 cmd);
+	void		(*set_mode)(struct uart_port *port, u8 mr1, u8 mr2);
+	void		(*set_rts)(struct uart_port *port, int state);
+	void		(*enable_ms)(struct uart_port *port);
+	void		(*set_sicr)(struct uart_port *port, u32 val);
+	void		(*set_imr)(struct uart_port *port, u16 val);
+	u8		(*get_mr1)(struct uart_port *port);
 };
 
 /* setting the prescaler and divisor reg is common for all chips */
@@ -134,6 +133,65 @@
 	out_8(&psc->ctlr, divisor & 0xff);
 }
 
+static u16 mpc52xx_psc_get_status(struct uart_port *port)
+{
+	return in_be16(&PSC(port)->mpc52xx_psc_status);
+}
+
+static u8 mpc52xx_psc_get_ipcr(struct uart_port *port)
+{
+	return in_8(&PSC(port)->mpc52xx_psc_ipcr);
+}
+
+static void mpc52xx_psc_command(struct uart_port *port, u8 cmd)
+{
+	out_8(&PSC(port)->command, cmd);
+}
+
+static void mpc52xx_psc_set_mode(struct uart_port *port, u8 mr1, u8 mr2)
+{
+	out_8(&PSC(port)->command, MPC52xx_PSC_SEL_MODE_REG_1);
+	out_8(&PSC(port)->mode, mr1);
+	out_8(&PSC(port)->mode, mr2);
+}
+
+static void mpc52xx_psc_set_rts(struct uart_port *port, int state)
+{
+	if (state)
+		out_8(&PSC(port)->op1, MPC52xx_PSC_OP_RTS);
+	else
+		out_8(&PSC(port)->op0, MPC52xx_PSC_OP_RTS);
+}
+
+static void mpc52xx_psc_enable_ms(struct uart_port *port)
+{
+	struct mpc52xx_psc __iomem *psc = PSC(port);
+
+	/* clear D_*-bits by reading them */
+	in_8(&psc->mpc52xx_psc_ipcr);
+	/* enable CTS and DCD as IPC interrupts */
+	out_8(&psc->mpc52xx_psc_acr, MPC52xx_PSC_IEC_CTS | MPC52xx_PSC_IEC_DCD);
+
+	port->read_status_mask |= MPC52xx_PSC_IMR_IPC;
+	out_be16(&psc->mpc52xx_psc_imr, port->read_status_mask);
+}
+
+static void mpc52xx_psc_set_sicr(struct uart_port *port, u32 val)
+{
+	out_be32(&PSC(port)->sicr, val);
+}
+
+static void mpc52xx_psc_set_imr(struct uart_port *port, u16 val)
+{
+	out_be16(&PSC(port)->mpc52xx_psc_imr, val);
+}
+
+static u8 mpc52xx_psc_get_mr1(struct uart_port *port)
+{
+	out_8(&PSC(port)->command, MPC52xx_PSC_SEL_MODE_REG_1);
+	return in_8(&PSC(port)->mode);
+}
+
 #ifdef CONFIG_PPC_MPC52xx
 #define FIFO_52xx(port) ((struct mpc52xx_psc_fifo __iomem *)(PSC(port)+1))
 static void mpc52xx_psc_fifo_init(struct uart_port *port)
@@ -304,6 +362,15 @@
 	.set_baudrate = mpc5200_psc_set_baudrate,
 	.get_irq = mpc52xx_psc_get_irq,
 	.handle_irq = mpc52xx_psc_handle_irq,
+	.get_status = mpc52xx_psc_get_status,
+	.get_ipcr = mpc52xx_psc_get_ipcr,
+	.command = mpc52xx_psc_command,
+	.set_mode = mpc52xx_psc_set_mode,
+	.set_rts = mpc52xx_psc_set_rts,
+	.enable_ms = mpc52xx_psc_enable_ms,
+	.set_sicr = mpc52xx_psc_set_sicr,
+	.set_imr = mpc52xx_psc_set_imr,
+	.get_mr1 = mpc52xx_psc_get_mr1,
 };
 
 static struct psc_ops mpc5200b_psc_ops = {
@@ -325,6 +392,15 @@
 	.set_baudrate = mpc5200b_psc_set_baudrate,
 	.get_irq = mpc52xx_psc_get_irq,
 	.handle_irq = mpc52xx_psc_handle_irq,
+	.get_status = mpc52xx_psc_get_status,
+	.get_ipcr = mpc52xx_psc_get_ipcr,
+	.command = mpc52xx_psc_command,
+	.set_mode = mpc52xx_psc_set_mode,
+	.set_rts = mpc52xx_psc_set_rts,
+	.enable_ms = mpc52xx_psc_enable_ms,
+	.set_sicr = mpc52xx_psc_set_sicr,
+	.set_imr = mpc52xx_psc_set_imr,
+	.get_mr1 = mpc52xx_psc_get_mr1,
 };
 
 #endif /* CONFIG_MPC52xx */
@@ -572,6 +648,246 @@
 	port->irqflags = IRQF_SHARED;
 	port->irq = psc_fifoc_irq;
 }
+#endif
+
+#ifdef CONFIG_PPC_MPC512x
+
+#define PSC_5125(port) ((struct mpc5125_psc __iomem *)((port)->membase))
+#define FIFO_5125(port) ((struct mpc512x_psc_fifo __iomem *)(PSC_5125(port)+1))
+
+static void mpc5125_psc_fifo_init(struct uart_port *port)
+{
+	/* /32 prescaler */
+	out_8(&PSC_5125(port)->mpc52xx_psc_clock_select, 0xdd);
+
+	out_be32(&FIFO_5125(port)->txcmd, MPC512x_PSC_FIFO_RESET_SLICE);
+	out_be32(&FIFO_5125(port)->txcmd, MPC512x_PSC_FIFO_ENABLE_SLICE);
+	out_be32(&FIFO_5125(port)->txalarm, 1);
+	out_be32(&FIFO_5125(port)->tximr, 0);
+
+	out_be32(&FIFO_5125(port)->rxcmd, MPC512x_PSC_FIFO_RESET_SLICE);
+	out_be32(&FIFO_5125(port)->rxcmd, MPC512x_PSC_FIFO_ENABLE_SLICE);
+	out_be32(&FIFO_5125(port)->rxalarm, 1);
+	out_be32(&FIFO_5125(port)->rximr, 0);
+
+	out_be32(&FIFO_5125(port)->tximr, MPC512x_PSC_FIFO_ALARM);
+	out_be32(&FIFO_5125(port)->rximr, MPC512x_PSC_FIFO_ALARM);
+}
+
+static int mpc5125_psc_raw_rx_rdy(struct uart_port *port)
+{
+	return !(in_be32(&FIFO_5125(port)->rxsr) & MPC512x_PSC_FIFO_EMPTY);
+}
+
+static int mpc5125_psc_raw_tx_rdy(struct uart_port *port)
+{
+	return !(in_be32(&FIFO_5125(port)->txsr) & MPC512x_PSC_FIFO_FULL);
+}
+
+static int mpc5125_psc_rx_rdy(struct uart_port *port)
+{
+	return in_be32(&FIFO_5125(port)->rxsr) &
+	       in_be32(&FIFO_5125(port)->rximr) & MPC512x_PSC_FIFO_ALARM;
+}
+
+static int mpc5125_psc_tx_rdy(struct uart_port *port)
+{
+	return in_be32(&FIFO_5125(port)->txsr) &
+	       in_be32(&FIFO_5125(port)->tximr) & MPC512x_PSC_FIFO_ALARM;
+}
+
+static int mpc5125_psc_tx_empty(struct uart_port *port)
+{
+	return in_be32(&FIFO_5125(port)->txsr) & MPC512x_PSC_FIFO_EMPTY;
+}
+
+static void mpc5125_psc_stop_rx(struct uart_port *port)
+{
+	unsigned long rx_fifo_imr;
+
+	rx_fifo_imr = in_be32(&FIFO_5125(port)->rximr);
+	rx_fifo_imr &= ~MPC512x_PSC_FIFO_ALARM;
+	out_be32(&FIFO_5125(port)->rximr, rx_fifo_imr);
+}
+
+static void mpc5125_psc_start_tx(struct uart_port *port)
+{
+	unsigned long tx_fifo_imr;
+
+	tx_fifo_imr = in_be32(&FIFO_5125(port)->tximr);
+	tx_fifo_imr |= MPC512x_PSC_FIFO_ALARM;
+	out_be32(&FIFO_5125(port)->tximr, tx_fifo_imr);
+}
+
+static void mpc5125_psc_stop_tx(struct uart_port *port)
+{
+	unsigned long tx_fifo_imr;
+
+	tx_fifo_imr = in_be32(&FIFO_5125(port)->tximr);
+	tx_fifo_imr &= ~MPC512x_PSC_FIFO_ALARM;
+	out_be32(&FIFO_5125(port)->tximr, tx_fifo_imr);
+}
+
+static void mpc5125_psc_rx_clr_irq(struct uart_port *port)
+{
+	out_be32(&FIFO_5125(port)->rxisr, in_be32(&FIFO_5125(port)->rxisr));
+}
+
+static void mpc5125_psc_tx_clr_irq(struct uart_port *port)
+{
+	out_be32(&FIFO_5125(port)->txisr, in_be32(&FIFO_5125(port)->txisr));
+}
+
+static void mpc5125_psc_write_char(struct uart_port *port, unsigned char c)
+{
+	out_8(&FIFO_5125(port)->txdata_8, c);
+}
+
+static unsigned char mpc5125_psc_read_char(struct uart_port *port)
+{
+	return in_8(&FIFO_5125(port)->rxdata_8);
+}
+
+static void mpc5125_psc_cw_disable_ints(struct uart_port *port)
+{
+	port->read_status_mask =
+		in_be32(&FIFO_5125(port)->tximr) << 16 |
+		in_be32(&FIFO_5125(port)->rximr);
+	out_be32(&FIFO_5125(port)->tximr, 0);
+	out_be32(&FIFO_5125(port)->rximr, 0);
+}
+
+static void mpc5125_psc_cw_restore_ints(struct uart_port *port)
+{
+	out_be32(&FIFO_5125(port)->tximr,
+		(port->read_status_mask >> 16) & 0x7f);
+	out_be32(&FIFO_5125(port)->rximr, port->read_status_mask & 0x7f);
+}
+
+static inline void mpc5125_set_divisor(struct mpc5125_psc __iomem *psc,
+		u8 prescaler, unsigned int divisor)
+{
+	/* select prescaler */
+	out_8(&psc->mpc52xx_psc_clock_select, prescaler);
+	out_8(&psc->ctur, divisor >> 8);
+	out_8(&psc->ctlr, divisor & 0xff);
+}
+
+static unsigned int mpc5125_psc_set_baudrate(struct uart_port *port,
+					     struct ktermios *new,
+					     struct ktermios *old)
+{
+	unsigned int baud;
+	unsigned int divisor;
+
+	/*
+	 * Calculate with a /16 prescaler here.
+	 */
+
+	/* uartclk contains the ips freq */
+	baud = uart_get_baud_rate(port, new, old,
+				  port->uartclk / (16 * 0xffff) + 1,
+				  port->uartclk / 16);
+	divisor = (port->uartclk + 8 * baud) / (16 * baud);
+
+	/* enable the /16 prescaler and set the divisor */
+	mpc5125_set_divisor(PSC_5125(port), 0xdd, divisor);
+	return baud;
+}
+
+/*
+ * The MPC5125 has a compatible PSC FIFO controller.
+ * No special init is needed.
+ */
+static u16 mpc5125_psc_get_status(struct uart_port *port)
+{
+	return in_be16(&PSC_5125(port)->mpc52xx_psc_status);
+}
+
+static u8 mpc5125_psc_get_ipcr(struct uart_port *port)
+{
+	return in_8(&PSC_5125(port)->mpc52xx_psc_ipcr);
+}
+
+static void mpc5125_psc_command(struct uart_port *port, u8 cmd)
+{
+	out_8(&PSC_5125(port)->command, cmd);
+}
+
+static void mpc5125_psc_set_mode(struct uart_port *port, u8 mr1, u8 mr2)
+{
+	out_8(&PSC_5125(port)->mr1, mr1);
+	out_8(&PSC_5125(port)->mr2, mr2);
+}
+
+static void mpc5125_psc_set_rts(struct uart_port *port, int state)
+{
+	if (state & TIOCM_RTS)
+		out_8(&PSC_5125(port)->op1, MPC52xx_PSC_OP_RTS);
+	else
+		out_8(&PSC_5125(port)->op0, MPC52xx_PSC_OP_RTS);
+}
+
+static void mpc5125_psc_enable_ms(struct uart_port *port)
+{
+	struct mpc5125_psc __iomem *psc = PSC_5125(port);
+
+	/* clear D_*-bits by reading them */
+	in_8(&psc->mpc52xx_psc_ipcr);
+	/* enable CTS and DCD as IPC interrupts */
+	out_8(&psc->mpc52xx_psc_acr, MPC52xx_PSC_IEC_CTS | MPC52xx_PSC_IEC_DCD);
+
+	port->read_status_mask |= MPC52xx_PSC_IMR_IPC;
+	out_be16(&psc->mpc52xx_psc_imr, port->read_status_mask);
+}
+
+static void mpc5125_psc_set_sicr(struct uart_port *port, u32 val)
+{
+	out_be32(&PSC_5125(port)->sicr, val);
+}
+
+static void mpc5125_psc_set_imr(struct uart_port *port, u16 val)
+{
+	out_be16(&PSC_5125(port)->mpc52xx_psc_imr, val);
+}
+
+static u8 mpc5125_psc_get_mr1(struct uart_port *port)
+{
+	return in_8(&PSC_5125(port)->mr1);
+}
+
+static struct psc_ops mpc5125_psc_ops = {
+	.fifo_init = mpc5125_psc_fifo_init,
+	.raw_rx_rdy = mpc5125_psc_raw_rx_rdy,
+	.raw_tx_rdy = mpc5125_psc_raw_tx_rdy,
+	.rx_rdy = mpc5125_psc_rx_rdy,
+	.tx_rdy = mpc5125_psc_tx_rdy,
+	.tx_empty = mpc5125_psc_tx_empty,
+	.stop_rx = mpc5125_psc_stop_rx,
+	.start_tx = mpc5125_psc_start_tx,
+	.stop_tx = mpc5125_psc_stop_tx,
+	.rx_clr_irq = mpc5125_psc_rx_clr_irq,
+	.tx_clr_irq = mpc5125_psc_tx_clr_irq,
+	.write_char = mpc5125_psc_write_char,
+	.read_char = mpc5125_psc_read_char,
+	.cw_disable_ints = mpc5125_psc_cw_disable_ints,
+	.cw_restore_ints = mpc5125_psc_cw_restore_ints,
+	.set_baudrate = mpc5125_psc_set_baudrate,
+	.clock = mpc512x_psc_clock,
+	.fifoc_init = mpc512x_psc_fifoc_init,
+	.fifoc_uninit = mpc512x_psc_fifoc_uninit,
+	.get_irq = mpc512x_psc_get_irq,
+	.handle_irq = mpc512x_psc_handle_irq,
+	.get_status = mpc5125_psc_get_status,
+	.get_ipcr = mpc5125_psc_get_ipcr,
+	.command = mpc5125_psc_command,
+	.set_mode = mpc5125_psc_set_mode,
+	.set_rts = mpc5125_psc_set_rts,
+	.enable_ms = mpc5125_psc_enable_ms,
+	.set_sicr = mpc5125_psc_set_sicr,
+	.set_imr = mpc5125_psc_set_imr,
+	.get_mr1 = mpc5125_psc_get_mr1,
+};
 
 static struct psc_ops mpc512x_psc_ops = {
 	.fifo_init = mpc512x_psc_fifo_init,
@@ -595,8 +911,18 @@
 	.fifoc_uninit = mpc512x_psc_fifoc_uninit,
 	.get_irq = mpc512x_psc_get_irq,
 	.handle_irq = mpc512x_psc_handle_irq,
+	.get_status = mpc52xx_psc_get_status,
+	.get_ipcr = mpc52xx_psc_get_ipcr,
+	.command = mpc52xx_psc_command,
+	.set_mode = mpc52xx_psc_set_mode,
+	.set_rts = mpc52xx_psc_set_rts,
+	.enable_ms = mpc52xx_psc_enable_ms,
+	.set_sicr = mpc52xx_psc_set_sicr,
+	.set_imr = mpc52xx_psc_set_imr,
+	.get_mr1 = mpc52xx_psc_get_mr1,
 };
-#endif
+#endif /* CONFIG_PPC_MPC512x */
+
 
 static const struct psc_ops *psc_ops;
 
@@ -613,17 +939,14 @@
 static void
 mpc52xx_uart_set_mctrl(struct uart_port *port, unsigned int mctrl)
 {
-	if (mctrl & TIOCM_RTS)
-		out_8(&PSC(port)->op1, MPC52xx_PSC_OP_RTS);
-	else
-		out_8(&PSC(port)->op0, MPC52xx_PSC_OP_RTS);
+	psc_ops->set_rts(port, mctrl & TIOCM_RTS);
 }
 
 static unsigned int
 mpc52xx_uart_get_mctrl(struct uart_port *port)
 {
 	unsigned int ret = TIOCM_DSR;
-	u8 status = in_8(&PSC(port)->mpc52xx_psc_ipcr);
+	u8 status = psc_ops->get_ipcr(port);
 
 	if (!(status & MPC52xx_PSC_CTS))
 		ret |= TIOCM_CTS;
@@ -673,15 +996,7 @@
 static void
 mpc52xx_uart_enable_ms(struct uart_port *port)
 {
-	struct mpc52xx_psc __iomem *psc = PSC(port);
-
-	/* clear D_*-bits by reading them */
-	in_8(&psc->mpc52xx_psc_ipcr);
-	/* enable CTS and DCD as IPC interrupts */
-	out_8(&psc->mpc52xx_psc_acr, MPC52xx_PSC_IEC_CTS | MPC52xx_PSC_IEC_DCD);
-
-	port->read_status_mask |= MPC52xx_PSC_IMR_IPC;
-	out_be16(&psc->mpc52xx_psc_imr, port->read_status_mask);
+	psc_ops->enable_ms(port);
 }
 
 static void
@@ -691,9 +1006,9 @@
 	spin_lock_irqsave(&port->lock, flags);
 
 	if (ctl == -1)
-		out_8(&PSC(port)->command, MPC52xx_PSC_START_BRK);
+		psc_ops->command(port, MPC52xx_PSC_START_BRK);
 	else
-		out_8(&PSC(port)->command, MPC52xx_PSC_STOP_BRK);
+		psc_ops->command(port, MPC52xx_PSC_STOP_BRK);
 
 	spin_unlock_irqrestore(&port->lock, flags);
 }
@@ -701,7 +1016,6 @@
 static int
 mpc52xx_uart_startup(struct uart_port *port)
 {
-	struct mpc52xx_psc __iomem *psc = PSC(port);
 	int ret;
 
 	if (psc_ops->clock) {
@@ -717,15 +1031,15 @@
 		return ret;
 
 	/* Reset/activate the port, clear and enable interrupts */
-	out_8(&psc->command, MPC52xx_PSC_RST_RX);
-	out_8(&psc->command, MPC52xx_PSC_RST_TX);
+	psc_ops->command(port, MPC52xx_PSC_RST_RX);
+	psc_ops->command(port, MPC52xx_PSC_RST_TX);
 
-	out_be32(&psc->sicr, 0);	/* UART mode DCD ignored */
+	psc_ops->set_sicr(port, 0);	/* UART mode DCD ignored */
 
 	psc_ops->fifo_init(port);
 
-	out_8(&psc->command, MPC52xx_PSC_TX_ENABLE);
-	out_8(&psc->command, MPC52xx_PSC_RX_ENABLE);
+	psc_ops->command(port, MPC52xx_PSC_TX_ENABLE);
+	psc_ops->command(port, MPC52xx_PSC_RX_ENABLE);
 
 	return 0;
 }
@@ -733,19 +1047,20 @@
 static void
 mpc52xx_uart_shutdown(struct uart_port *port)
 {
-	struct mpc52xx_psc __iomem *psc = PSC(port);
-
 	/* Shut down the port.  Leave TX active if on a console port */
-	out_8(&psc->command, MPC52xx_PSC_RST_RX);
+	psc_ops->command(port, MPC52xx_PSC_RST_RX);
 	if (!uart_console(port))
-		out_8(&psc->command, MPC52xx_PSC_RST_TX);
+		psc_ops->command(port, MPC52xx_PSC_RST_TX);
 
 	port->read_status_mask = 0;
-	out_be16(&psc->mpc52xx_psc_imr, port->read_status_mask);
+	psc_ops->set_imr(port, port->read_status_mask);
 
 	if (psc_ops->clock)
 		psc_ops->clock(port, 0);
 
+	/* Disable interrupt */
+	psc_ops->cw_disable_ints(port);
+
 	/* Release interrupt */
 	free_irq(port->irq, port);
 }
@@ -754,7 +1069,6 @@
 mpc52xx_uart_set_termios(struct uart_port *port, struct ktermios *new,
 			 struct ktermios *old)
 {
-	struct mpc52xx_psc __iomem *psc = PSC(port);
 	unsigned long flags;
 	unsigned char mr1, mr2;
 	unsigned int j;
@@ -818,13 +1132,11 @@
 			"Some chars may have been lost.\n");
 
 	/* Reset the TX & RX */
-	out_8(&psc->command, MPC52xx_PSC_RST_RX);
-	out_8(&psc->command, MPC52xx_PSC_RST_TX);
+	psc_ops->command(port, MPC52xx_PSC_RST_RX);
+	psc_ops->command(port, MPC52xx_PSC_RST_TX);
 
 	/* Send new mode settings */
-	out_8(&psc->command, MPC52xx_PSC_SEL_MODE_REG_1);
-	out_8(&psc->mode, mr1);
-	out_8(&psc->mode, mr2);
+	psc_ops->set_mode(port, mr1, mr2);
 	baud = psc_ops->set_baudrate(port, new, old);
 
 	/* Update the per-port timeout */
@@ -834,8 +1146,8 @@
 		mpc52xx_uart_enable_ms(port);
 
 	/* Reenable TX & RX */
-	out_8(&psc->command, MPC52xx_PSC_TX_ENABLE);
-	out_8(&psc->command, MPC52xx_PSC_RX_ENABLE);
+	psc_ops->command(port, MPC52xx_PSC_TX_ENABLE);
+	psc_ops->command(port, MPC52xx_PSC_RX_ENABLE);
 
 	/* We're all set, release the lock */
 	spin_unlock_irqrestore(&port->lock, flags);
@@ -963,7 +1275,7 @@
 		flag = TTY_NORMAL;
 		port->icount.rx++;
 
-		status = in_be16(&PSC(port)->mpc52xx_psc_status);
+		status = psc_ops->get_status(port);
 
 		if (status & (MPC52xx_PSC_SR_PE |
 			      MPC52xx_PSC_SR_FE |
@@ -983,7 +1295,7 @@
 			}
 
 			/* Clear error condition */
-			out_8(&PSC(port)->command, MPC52xx_PSC_RST_ERR_STAT);
+			psc_ops->command(port, MPC52xx_PSC_RST_ERR_STAT);
 
 		}
 		tty_insert_flip_char(tport, ch, flag);
@@ -1066,7 +1378,7 @@
 		if (psc_ops->tx_rdy(port))
 			keepgoing |= mpc52xx_uart_int_tx_chars(port);
 
-		status = in_8(&PSC(port)->mpc52xx_psc_ipcr);
+		status = psc_ops->get_ipcr(port);
 		if (status & MPC52xx_PSC_D_DCD)
 			uart_handle_dcd_change(port, !(status & MPC52xx_PSC_DCD));
 
@@ -1107,14 +1419,12 @@
 mpc52xx_console_get_options(struct uart_port *port,
 			    int *baud, int *parity, int *bits, int *flow)
 {
-	struct mpc52xx_psc __iomem *psc = PSC(port);
 	unsigned char mr1;
 
 	pr_debug("mpc52xx_console_get_options(port=%p)\n", port);
 
 	/* Read the mode registers */
-	out_8(&psc->command, MPC52xx_PSC_SEL_MODE_REG_1);
-	mr1 = in_8(&psc->mode);
+	mr1 = psc_ops->get_mr1(port);
 
 	/* CT{U,L}R are write-only ! */
 	*baud = CONFIG_SERIAL_MPC52xx_CONSOLE_BAUD;
@@ -1304,6 +1614,7 @@
 #endif
 #ifdef CONFIG_PPC_MPC512x
 	{ .compatible = "fsl,mpc5121-psc-uart", .data = &mpc512x_psc_ops, },
+	{ .compatible = "fsl,mpc5125-psc-uart", .data = &mpc5125_psc_ops, },
 #endif
 	{},
 };
@@ -1372,15 +1683,14 @@
 	if (ret)
 		return ret;
 
-	dev_set_drvdata(&op->dev, (void *)port);
+	platform_set_drvdata(op, (void *)port);
 	return 0;
 }
 
 static int
 mpc52xx_uart_of_remove(struct platform_device *op)
 {
-	struct uart_port *port = dev_get_drvdata(&op->dev);
-	dev_set_drvdata(&op->dev, NULL);
+	struct uart_port *port = platform_get_drvdata(op);
 
 	if (port)
 		uart_remove_one_port(&mpc52xx_uart_driver, port);
@@ -1392,7 +1702,7 @@
 static int
 mpc52xx_uart_of_suspend(struct platform_device *op, pm_message_t state)
 {
-	struct uart_port *port = (struct uart_port *) dev_get_drvdata(&op->dev);
+	struct uart_port *port = (struct uart_port *) platform_get_drvdata(op);
 
 	if (port)
 		uart_suspend_port(&mpc52xx_uart_driver, port);
@@ -1403,7 +1713,7 @@
 static int
 mpc52xx_uart_of_resume(struct platform_device *op)
 {
-	struct uart_port *port = (struct uart_port *) dev_get_drvdata(&op->dev);
+	struct uart_port *port = (struct uart_port *) platform_get_drvdata(op);
 
 	if (port)
 		uart_resume_port(&mpc52xx_uart_driver, port);

diff --git a/drivers/tty/serial/of_serial.c b/drivers/tty/serial/of_serial.c
index 39c7ea4..2caf9c6 100644
--- a/drivers/tty/serial/of_serial.c
+++ b/drivers/tty/serial/of_serial.c

@@ -204,7 +204,7 @@
 
 	info->type = port_type;
 	info->line = ret;
-	dev_set_drvdata(&ofdev->dev, info);
+	platform_set_drvdata(ofdev, info);
 	return 0;
 out:
 	kfree(info);
@@ -217,7 +217,7 @@
  */
 static int of_platform_serial_remove(struct platform_device *ofdev)
 {
-	struct of_serial_info *info = dev_get_drvdata(&ofdev->dev);
+	struct of_serial_info *info = platform_get_drvdata(ofdev);
 	switch (info->type) {
 #ifdef CONFIG_SERIAL_8250
 	case PORT_8250 ... PORT_MAX_8250:

diff --git a/drivers/tty/serial/omap-serial.c b/drivers/tty/serial/omap-serial.c
index f0b9f6b..b6d1728 100644
--- a/drivers/tty/serial/omap-serial.c
+++ b/drivers/tty/serial/omap-serial.c

@@ -161,6 +161,7 @@
 	u32			calc_latency;
 	struct work_struct	qos_work;
 	struct pinctrl		*pins;
+	bool			is_suspending;
 };
 
 #define to_uart_omap_port(p)	((container_of((p), struct uart_omap_port, port)))
@@ -197,7 +198,7 @@
 	struct omap_uart_port_info *pdata = up->dev->platform_data;
 
 	if (!pdata || !pdata->get_context_loss_count)
-		return 0;
+		return -EINVAL;
 
 	return pdata->get_context_loss_count(up->dev);
 }
@@ -1289,6 +1290,22 @@
 };
 
 #ifdef CONFIG_PM_SLEEP
+static int serial_omap_prepare(struct device *dev)
+{
+	struct uart_omap_port *up = dev_get_drvdata(dev);
+
+	up->is_suspending = true;
+
+	return 0;
+}
+
+static void serial_omap_complete(struct device *dev)
+{
+	struct uart_omap_port *up = dev_get_drvdata(dev);
+
+	up->is_suspending = false;
+}
+
 static int serial_omap_suspend(struct device *dev)
 {
 	struct uart_omap_port *up = dev_get_drvdata(dev);
@@ -1307,7 +1324,10 @@
 
 	return 0;
 }
-#endif
+#else
+#define serial_omap_prepare NULL
+#define serial_omap_complete NULL
+#endif /* CONFIG_PM_SLEEP */
 
 static void omap_serial_fill_features_erratas(struct uart_omap_port *up)
 {
@@ -1482,6 +1502,9 @@
 
 	platform_set_drvdata(pdev, up);
 	pm_runtime_enable(&pdev->dev);
+	if (omap_up_info->autosuspend_timeout == 0)
+		omap_up_info->autosuspend_timeout = -1;
+	device_init_wakeup(up->dev, true);
 	pm_runtime_use_autosuspend(&pdev->dev);
 	pm_runtime_set_autosuspend_delay(&pdev->dev,
 			omap_up_info->autosuspend_timeout);
@@ -1591,13 +1614,19 @@
 static int serial_omap_runtime_suspend(struct device *dev)
 {
 	struct uart_omap_port *up = dev_get_drvdata(dev);
-	struct omap_uart_port_info *pdata = dev->platform_data;
 
 	if (!up)
 		return -EINVAL;
 
-	if (!pdata)
-		return 0;
+	/*
+	* When using 'no_console_suspend', the console UART must not be
+	* suspended. Since driver suspend is managed by runtime suspend,
+	* preventing runtime suspend (by returning error) will keep device
+	* active during suspend.
+	*/
+	if (up->is_suspending && !console_suspend_enabled &&
+	    uart_console(&up->port))
+		return -EBUSY;
 
 	up->context_loss_cnt = serial_omap_get_context_loss_count(up);
 
@@ -1626,7 +1655,7 @@
 	int loss_cnt = serial_omap_get_context_loss_count(up);
 
 	if (loss_cnt < 0) {
-		dev_err(dev, "serial_omap_get_context_loss_count failed : %d\n",
+		dev_dbg(dev, "serial_omap_get_context_loss_count failed : %d\n",
 			loss_cnt);
 		serial_omap_restore_context(up);
 	} else if (up->context_loss_cnt != loss_cnt) {
@@ -1643,6 +1672,8 @@
 	SET_SYSTEM_SLEEP_PM_OPS(serial_omap_suspend, serial_omap_resume)
 	SET_RUNTIME_PM_OPS(serial_omap_runtime_suspend,
 				serial_omap_runtime_resume, NULL)
+	.prepare        = serial_omap_prepare,
+	.complete       = serial_omap_complete,
 };
 
 #if defined(CONFIG_OF)

diff --git a/drivers/tty/serial/pch_uart.c b/drivers/tty/serial/pch_uart.c
index 21a7e17..572d481 100644
--- a/drivers/tty/serial/pch_uart.c
+++ b/drivers/tty/serial/pch_uart.c

@@ -217,6 +217,7 @@
 #define FRI2_64_UARTCLK  64000000 /*  64.0000 MHz */
 #define FRI2_48_UARTCLK  48000000 /*  48.0000 MHz */
 #define NTC1_UARTCLK     64000000 /*  64.0000 MHz */
+#define MINNOW_UARTCLK   50000000 /*  50.0000 MHz */
 
 struct pch_uart_buffer {
 	unsigned char *buf;
@@ -398,6 +399,10 @@
 		    strstr(cmp, "nanoETXexpress-TT")))
 		return NTC1_UARTCLK;
 
+	cmp = dmi_get_system_info(DMI_BOARD_NAME);
+	if (cmp && strstr(cmp, "MinnowBoard"))
+		return MINNOW_UARTCLK;
+
 	return DEFAULT_UARTCLK;
 }
 

diff --git a/drivers/tty/serial/samsung.c b/drivers/tty/serial/samsung.c
index 0c8a9fa..81ebc86 100644
--- a/drivers/tty/serial/samsung.c
+++ b/drivers/tty/serial/samsung.c

@@ -1811,7 +1811,13 @@
 		return ret;
 	}
 
-	return platform_driver_register(&samsung_serial_driver);
+	ret = platform_driver_register(&samsung_serial_driver);
+	if (ret < 0) {
+		pr_err("Failed to register platform driver\n");
+		uart_unregister_driver(&s3c24xx_uart_drv);
+	}
+
+	return ret;
 }
 
 static void __exit s3c24xx_serial_modexit(void)

diff --git a/drivers/tty/serial/sc26xx.c b/drivers/tty/serial/sc26xx.c
index c973568..4b1434d 100644
--- a/drivers/tty/serial/sc26xx.c
+++ b/drivers/tty/serial/sc26xx.c

@@ -696,7 +696,7 @@
 	if (err)
 		goto out_remove_ports;
 
-	dev_set_drvdata(&dev->dev, up);
+	platform_set_drvdata(dev, up);
 	return 0;
 
 out_remove_ports:
@@ -716,7 +716,7 @@
 
 static int __exit sc26xx_driver_remove(struct platform_device *dev)
 {
-	struct uart_sc26xx_port *up = dev_get_drvdata(&dev->dev);
+	struct uart_sc26xx_port *up = platform_get_drvdata(dev);
 
 	free_irq(up->port[0].irq, up);
 
@@ -728,7 +728,6 @@
 	kfree(up);
 	sc26xx_port = NULL;
 
-	dev_set_drvdata(&dev->dev, NULL);
 	return 0;
 }
 

diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c
index f87dbfd..28cdd28 100644
--- a/drivers/tty/serial/serial_core.c
+++ b/drivers/tty/serial/serial_core.c

@@ -50,12 +50,6 @@
 
 #define HIGH_BITS_OFFSET	((sizeof(long)-sizeof(int))*8)
 
-#ifdef CONFIG_SERIAL_CORE_CONSOLE
-#define uart_console(port)	((port)->cons && (port)->cons->index == (port)->line)
-#else
-#define uart_console(port)	(0)
-#endif
-
 static void uart_change_speed(struct tty_struct *tty, struct uart_state *state,
 					struct ktermios *old_termios);
 static void uart_wait_until_sent(struct tty_struct *tty, int timeout);

diff --git a/drivers/tty/serial/sirfsoc_uart.c b/drivers/tty/serial/sirfsoc_uart.c
index 03465b6..1fd564b 100644
--- a/drivers/tty/serial/sirfsoc_uart.c
+++ b/drivers/tty/serial/sirfsoc_uart.c

@@ -687,9 +687,10 @@
 
 	if (sirfport->hw_flow_ctrl) {
 		sirfport->p = pinctrl_get_select_default(&pdev->dev);
-		ret = IS_ERR(sirfport->p);
-		if (ret)
+		if (IS_ERR(sirfport->p)) {
+			ret = PTR_ERR(sirfport->p);
 			goto err;
+		}
 	}
 
 	sirfport->clk = clk_get(&pdev->dev, NULL);

diff --git a/drivers/tty/serial/sunhv.c b/drivers/tty/serial/sunhv.c
index ba60708..cf86e72 100644
--- a/drivers/tty/serial/sunhv.c
+++ b/drivers/tty/serial/sunhv.c

@@ -577,7 +577,7 @@
 	if (err)
 		goto out_remove_port;
 
-	dev_set_drvdata(&op->dev, port);
+	platform_set_drvdata(op, port);
 
 	return 0;
 
@@ -601,7 +601,7 @@
 
 static int hv_remove(struct platform_device *dev)
 {
-	struct uart_port *port = dev_get_drvdata(&dev->dev);
+	struct uart_port *port = platform_get_drvdata(dev);
 
 	free_irq(port->irq, port);
 
@@ -612,8 +612,6 @@
 	kfree(port);
 	sunhv_port = NULL;
 
-	dev_set_drvdata(&dev->dev, NULL);
-
 	return 0;
 }
 

diff --git a/drivers/tty/serial/sunsab.c b/drivers/tty/serial/sunsab.c
index a422c8b..5d6136b2 100644
--- a/drivers/tty/serial/sunsab.c
+++ b/drivers/tty/serial/sunsab.c

@@ -1037,7 +1037,7 @@
 	if (err)
 		goto out3;
 
-	dev_set_drvdata(&op->dev, &up[0]);
+	platform_set_drvdata(op, &up[0]);
 
 	inst++;
 
@@ -1059,7 +1059,7 @@
 
 static int sab_remove(struct platform_device *op)
 {
-	struct uart_sunsab_port *up = dev_get_drvdata(&op->dev);
+	struct uart_sunsab_port *up = platform_get_drvdata(op);
 
 	uart_remove_one_port(&sunsab_reg, &up[1].port);
 	uart_remove_one_port(&sunsab_reg, &up[0].port);
@@ -1070,8 +1070,6 @@
 		   up[0].port.membase,
 		   sizeof(union sab82532_async_regs));
 
-	dev_set_drvdata(&op->dev, NULL);
-
 	return 0;
 }
 

diff --git a/drivers/tty/serial/sunsu.c b/drivers/tty/serial/sunsu.c
index 0d84657..699cc1b 100644
--- a/drivers/tty/serial/sunsu.c
+++ b/drivers/tty/serial/sunsu.c

@@ -1454,7 +1454,7 @@
 			kfree(up);
 			return err;
 		}
-		dev_set_drvdata(&op->dev, up);
+		platform_set_drvdata(op, up);
 
 		nr_inst++;
 
@@ -1483,7 +1483,7 @@
 	if (err)
 		goto out_unmap;
 
-	dev_set_drvdata(&op->dev, up);
+	platform_set_drvdata(op, up);
 
 	nr_inst++;
 
@@ -1496,7 +1496,7 @@
 
 static int su_remove(struct platform_device *op)
 {
-	struct uart_sunsu_port *up = dev_get_drvdata(&op->dev);
+	struct uart_sunsu_port *up = platform_get_drvdata(op);
 	bool kbdms = false;
 
 	if (up->su_type == SU_PORT_MS ||
@@ -1516,8 +1516,6 @@
 	if (kbdms)
 		kfree(up);
 
-	dev_set_drvdata(&op->dev, NULL);
-
 	return 0;
 }
 

diff --git a/drivers/tty/serial/sunzilog.c b/drivers/tty/serial/sunzilog.c
index 813ef8e..135a152 100644
--- a/drivers/tty/serial/sunzilog.c
+++ b/drivers/tty/serial/sunzilog.c

@@ -1495,7 +1495,7 @@
 		kbm_inst++;
 	}
 
-	dev_set_drvdata(&op->dev, &up[0]);
+	platform_set_drvdata(op, &up[0]);
 
 	return 0;
 }
@@ -1512,7 +1512,7 @@
 
 static int zs_remove(struct platform_device *op)
 {
-	struct uart_sunzilog_port *up = dev_get_drvdata(&op->dev);
+	struct uart_sunzilog_port *up = platform_get_drvdata(op);
 	struct zilog_layout __iomem *regs;
 
 	zs_remove_one(&up[0]);
@@ -1521,8 +1521,6 @@
 	regs = sunzilog_chip_regs[up[0].port.line / 2];
 	of_iounmap(&op->resource[0], regs, sizeof(struct zilog_layout));
 
-	dev_set_drvdata(&op->dev, NULL);
-
 	return 0;
 }
 

diff --git a/drivers/tty/serial/ucc_uart.c b/drivers/tty/serial/ucc_uart.c
index 7355303..8831748 100644
--- a/drivers/tty/serial/ucc_uart.c
+++ b/drivers/tty/serial/ucc_uart.c

@@ -1451,7 +1451,7 @@
 		goto out_np;
 	}
 
-	dev_set_drvdata(&ofdev->dev, qe_port);
+	platform_set_drvdata(ofdev, qe_port);
 
 	dev_info(&ofdev->dev, "UCC%u assigned to /dev/ttyQE%u\n",
 		qe_port->ucc_num + 1, qe_port->port.line);
@@ -1471,13 +1471,12 @@
 
 static int ucc_uart_remove(struct platform_device *ofdev)
 {
-	struct uart_qe_port *qe_port = dev_get_drvdata(&ofdev->dev);
+	struct uart_qe_port *qe_port = platform_get_drvdata(ofdev);
 
 	dev_info(&ofdev->dev, "removing /dev/ttyQE%u\n", qe_port->port.line);
 
 	uart_remove_one_port(&ucc_uart_driver, &qe_port->port);
 
-	dev_set_drvdata(&ofdev->dev, NULL);
 	kfree(qe_port);
 
 	return 0;
@@ -1518,9 +1517,11 @@
 	}
 
 	ret = platform_driver_register(&ucc_uart_of_driver);
-	if (ret)
+	if (ret) {
 		printk(KERN_ERR
 		       "ucc-uart: could not register platform driver\n");
+		uart_unregister_driver(&ucc_uart_driver);
+	}
 
 	return ret;
 }

diff --git a/drivers/tty/serial/vt8500_serial.c b/drivers/tty/serial/vt8500_serial.c
index 1a8bc227..48af43d 100644
--- a/drivers/tty/serial/vt8500_serial.c
+++ b/drivers/tty/serial/vt8500_serial.c

@@ -648,7 +648,7 @@
 	.driver = {
 		.name = "vt8500_serial",
 		.owner = THIS_MODULE,
-		.of_match_table = of_match_ptr(wmt_dt_ids),
+		.of_match_table = wmt_dt_ids,
 	},
 };
 

diff --git a/drivers/tty/serial/xilinx_uartps.c b/drivers/tty/serial/xilinx_uartps.c
index 4e5c778..6c91745 100644
--- a/drivers/tty/serial/xilinx_uartps.c
+++ b/drivers/tty/serial/xilinx_uartps.c

@@ -974,12 +974,11 @@
 		port->dev = &pdev->dev;
 		port->uartclk = clk_get_rate(clk);
 		port->private_data = clk;
-		dev_set_drvdata(&pdev->dev, port);
+		platform_set_drvdata(pdev, port);
 		rc = uart_add_one_port(&xuartps_uart_driver, port);
 		if (rc) {
 			dev_err(&pdev->dev,
 				"uart_add_one_port() failed; err=%i\n", rc);
-			dev_set_drvdata(&pdev->dev, NULL);
 			return rc;
 		}
 		return 0;
@@ -994,46 +993,17 @@
  **/
 static int xuartps_remove(struct platform_device *pdev)
 {
-	struct uart_port *port = dev_get_drvdata(&pdev->dev);
+	struct uart_port *port = platform_get_drvdata(pdev);
 	struct clk *clk = port->private_data;
 	int rc;
 
 	/* Remove the xuartps port from the serial core */
 	rc = uart_remove_one_port(&xuartps_uart_driver, port);
-	dev_set_drvdata(&pdev->dev, NULL);
 	port->mapbase = 0;
 	clk_disable_unprepare(clk);
 	return rc;
 }
 
-/**
- * xuartps_suspend - suspend event
- * @pdev: Pointer to the platform device structure
- * @state: State of the device
- *
- * Returns 0
- **/
-static int xuartps_suspend(struct platform_device *pdev, pm_message_t state)
-{
-	/* Call the API provided in serial_core.c file which handles
-	 * the suspend.
-	 */
-	uart_suspend_port(&xuartps_uart_driver, &xuartps_port[pdev->id]);
-	return 0;
-}
-
-/**
- * xuartps_resume - Resume after a previous suspend
- * @pdev: Pointer to the platform device structure
- *
- * Returns 0
- **/
-static int xuartps_resume(struct platform_device *pdev)
-{
-	uart_resume_port(&xuartps_uart_driver, &xuartps_port[pdev->id]);
-	return 0;
-}
-
 /* Match table for of_platform binding */
 static struct of_device_id xuartps_of_match[] = {
 	{ .compatible = "xlnx,xuartps", },
@@ -1044,8 +1014,6 @@
 static struct platform_driver xuartps_platform_driver = {
 	.probe   = xuartps_probe,		/* Probe method */
 	.remove  = xuartps_remove,		/* Detach method */
-	.suspend = xuartps_suspend,		/* Suspend */
-	.resume  = xuartps_resume,		/* Resume after a suspend */
 	.driver  = {
 		.owner = THIS_MODULE,
 		.name = XUARTPS_NAME,		/* Driver name */

diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c
index b51c154..5f68f2c 100644
--- a/drivers/tty/sysrq.c
+++ b/drivers/tty/sysrq.c

@@ -932,7 +932,7 @@
 	unsigned long val;
 	int error;
 
-	error = strict_strtoul(buffer, 0, &val);
+	error = kstrtoul(buffer, 0, &val);
 	if (error < 0)
 		return error;
 

diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c
index 6464029..366af83 100644
--- a/drivers/tty/tty_io.c
+++ b/drivers/tty/tty_io.c

@@ -1618,6 +1618,8 @@
 	tty_free_termios(tty);
 	tty_driver_remove_tty(tty->driver, tty);
 	tty->port->itty = NULL;
+	if (tty->link)
+		tty->link->port->itty = NULL;
 	cancel_work_sync(&tty->port->buf.work);
 
 	if (tty->link)
@@ -2138,6 +2140,7 @@
 static int __tty_fasync(int fd, struct file *filp, int on)
 {
 	struct tty_struct *tty = file_tty(filp);
+	struct tty_ldisc *ldisc;
 	unsigned long flags;
 	int retval = 0;
 
@@ -2148,11 +2151,17 @@
 	if (retval <= 0)
 		goto out;
 
+	ldisc = tty_ldisc_ref(tty);
+	if (ldisc) {
+		if (ldisc->ops->fasync)
+			ldisc->ops->fasync(tty, on);
+		tty_ldisc_deref(ldisc);
+	}
+
 	if (on) {
 		enum pid_type type;
 		struct pid *pid;
-		if (!waitqueue_active(&tty->read_wait))
-			tty->minimum_to_wake = 1;
+
 		spin_lock_irqsave(&tty->ctrl_lock, flags);
 		if (tty->pgrp) {
 			pid = tty->pgrp;
@@ -2165,13 +2174,7 @@
 		spin_unlock_irqrestore(&tty->ctrl_lock, flags);
 		retval = __f_setown(filp, pid, type, 0);
 		put_pid(pid);
-		if (retval)
-			goto out;
-	} else {
-		if (!tty->fasync && !waitqueue_active(&tty->read_wait))
-			tty->minimum_to_wake = N_TTY_BUF_SIZE;
 	}
-	retval = 0;
 out:
 	return retval;
 }

diff --git a/drivers/tty/tty_ldsem.c b/drivers/tty/tty_ldsem.c
new file mode 100644
index 0000000..22fad8a
--- /dev/null
+++ b/drivers/tty/tty_ldsem.c

@@ -0,0 +1,453 @@
+/*
+ * Ldisc rw semaphore
+ *
+ * The ldisc semaphore is semantically a rw_semaphore, but it enforces
+ * an alternate policy, namely:
+ *   1) Supports lock wait timeouts
+ *   2) Write waiter has priority
+ *   3) Downgrading is not supported
+ *
+ * Implementation notes:
+ *   1) Upper half of semaphore count is a wait count (differs from rwsem
+ *	in that rwsem normalizes the upper half to the wait bias)
+ *   2) Lacks overflow checking
+ *
+ * The generic counting was copied and modified from include/asm-generic/rwsem.h
+ * by Paul Mackerras <paulus@samba.org>.
+ *
+ * The scheduling policy was copied and modified from lib/rwsem.c
+ * Written by David Howells (dhowells@redhat.com).
+ *
+ * This implementation incorporates the write lock stealing work of
+ * Michel Lespinasse <walken@google.com>.
+ *
+ * Copyright (C) 2013 Peter Hurley <peter@hurleysoftware.com>
+ *
+ * This file may be redistributed under the terms of the GNU General Public
+ * License v2.
+ */
+
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/atomic.h>
+#include <linux/tty.h>
+#include <linux/sched.h>
+
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+# define __acq(l, s, t, r, c, n, i)		\
+				lock_acquire(&(l)->dep_map, s, t, r, c, n, i)
+# define __rel(l, n, i)				\
+				lock_release(&(l)->dep_map, n, i)
+# ifdef CONFIG_PROVE_LOCKING
+#  define lockdep_acquire(l, s, t, i)		__acq(l, s, t, 0, 2, NULL, i)
+#  define lockdep_acquire_nest(l, s, t, n, i)	__acq(l, s, t, 0, 2, n, i)
+#  define lockdep_acquire_read(l, s, t, i)	__acq(l, s, t, 1, 2, NULL, i)
+#  define lockdep_release(l, n, i)		__rel(l, n, i)
+# else
+#  define lockdep_acquire(l, s, t, i)		__acq(l, s, t, 0, 1, NULL, i)
+#  define lockdep_acquire_nest(l, s, t, n, i)	__acq(l, s, t, 0, 1, n, i)
+#  define lockdep_acquire_read(l, s, t, i)	__acq(l, s, t, 1, 1, NULL, i)
+#  define lockdep_release(l, n, i)		__rel(l, n, i)
+# endif
+#else
+# define lockdep_acquire(l, s, t, i)		do { } while (0)
+# define lockdep_acquire_nest(l, s, t, n, i)	do { } while (0)
+# define lockdep_acquire_read(l, s, t, i)	do { } while (0)
+# define lockdep_release(l, n, i)		do { } while (0)
+#endif
+
+#ifdef CONFIG_LOCK_STAT
+# define lock_stat(_lock, stat)		lock_##stat(&(_lock)->dep_map, _RET_IP_)
+#else
+# define lock_stat(_lock, stat)		do { } while (0)
+#endif
+
+
+#if BITS_PER_LONG == 64
+# define LDSEM_ACTIVE_MASK	0xffffffffL
+#else
+# define LDSEM_ACTIVE_MASK	0x0000ffffL
+#endif
+
+#define LDSEM_UNLOCKED		0L
+#define LDSEM_ACTIVE_BIAS	1L
+#define LDSEM_WAIT_BIAS		(-LDSEM_ACTIVE_MASK-1)
+#define LDSEM_READ_BIAS		LDSEM_ACTIVE_BIAS
+#define LDSEM_WRITE_BIAS	(LDSEM_WAIT_BIAS + LDSEM_ACTIVE_BIAS)
+
+struct ldsem_waiter {
+	struct list_head list;
+	struct task_struct *task;
+};
+
+static inline long ldsem_atomic_update(long delta, struct ld_semaphore *sem)
+{
+	return atomic_long_add_return(delta, (atomic_long_t *)&sem->count);
+}
+
+static inline int ldsem_cmpxchg(long *old, long new, struct ld_semaphore *sem)
+{
+	long tmp = *old;
+	*old = atomic_long_cmpxchg(&sem->count, *old, new);
+	return *old == tmp;
+}
+
+/*
+ * Initialize an ldsem:
+ */
+void __init_ldsem(struct ld_semaphore *sem, const char *name,
+		  struct lock_class_key *key)
+{
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	/*
+	 * Make sure we are not reinitializing a held semaphore:
+	 */
+	debug_check_no_locks_freed((void *)sem, sizeof(*sem));
+	lockdep_init_map(&sem->dep_map, name, key, 0);
+#endif
+	sem->count = LDSEM_UNLOCKED;
+	sem->wait_readers = 0;
+	raw_spin_lock_init(&sem->wait_lock);
+	INIT_LIST_HEAD(&sem->read_wait);
+	INIT_LIST_HEAD(&sem->write_wait);
+}
+
+static void __ldsem_wake_readers(struct ld_semaphore *sem)
+{
+	struct ldsem_waiter *waiter, *next;
+	struct task_struct *tsk;
+	long adjust, count;
+
+	/* Try to grant read locks to all readers on the read wait list.
+	 * Note the 'active part' of the count is incremented by
+	 * the number of readers before waking any processes up.
+	 */
+	adjust = sem->wait_readers * (LDSEM_ACTIVE_BIAS - LDSEM_WAIT_BIAS);
+	count = ldsem_atomic_update(adjust, sem);
+	do {
+		if (count > 0)
+			break;
+		if (ldsem_cmpxchg(&count, count - adjust, sem))
+			return;
+	} while (1);
+
+	list_for_each_entry_safe(waiter, next, &sem->read_wait, list) {
+		tsk = waiter->task;
+		smp_mb();
+		waiter->task = NULL;
+		wake_up_process(tsk);
+		put_task_struct(tsk);
+	}
+	INIT_LIST_HEAD(&sem->read_wait);
+	sem->wait_readers = 0;
+}
+
+static inline int writer_trylock(struct ld_semaphore *sem)
+{
+	/* only wake this writer if the active part of the count can be
+	 * transitioned from 0 -> 1
+	 */
+	long count = ldsem_atomic_update(LDSEM_ACTIVE_BIAS, sem);
+	do {
+		if ((count & LDSEM_ACTIVE_MASK) == LDSEM_ACTIVE_BIAS)
+			return 1;
+		if (ldsem_cmpxchg(&count, count - LDSEM_ACTIVE_BIAS, sem))
+			return 0;
+	} while (1);
+}
+
+static void __ldsem_wake_writer(struct ld_semaphore *sem)
+{
+	struct ldsem_waiter *waiter;
+
+	waiter = list_entry(sem->write_wait.next, struct ldsem_waiter, list);
+	wake_up_process(waiter->task);
+}
+
+/*
+ * handle the lock release when processes blocked on it can now run
+ * - if we come here from up_xxxx(), then:
+ *   - the 'active part' of count (&0x0000ffff) reached 0 (but may have changed)
+ *   - the 'waiting part' of count (&0xffff0000) is -ve (and will still be so)
+ * - the spinlock must be held by the caller
+ * - woken process blocks are discarded from the list after having task zeroed
+ */
+static void __ldsem_wake(struct ld_semaphore *sem)
+{
+	if (!list_empty(&sem->write_wait))
+		__ldsem_wake_writer(sem);
+	else if (!list_empty(&sem->read_wait))
+		__ldsem_wake_readers(sem);
+}
+
+static void ldsem_wake(struct ld_semaphore *sem)
+{
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&sem->wait_lock, flags);
+	__ldsem_wake(sem);
+	raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
+}
+
+/*
+ * wait for the read lock to be granted
+ */
+static struct ld_semaphore __sched *
+down_read_failed(struct ld_semaphore *sem, long count, long timeout)
+{
+	struct ldsem_waiter waiter;
+	struct task_struct *tsk = current;
+	long adjust = -LDSEM_ACTIVE_BIAS + LDSEM_WAIT_BIAS;
+
+	/* set up my own style of waitqueue */
+	raw_spin_lock_irq(&sem->wait_lock);
+
+	/* Try to reverse the lock attempt but if the count has changed
+	 * so that reversing fails, check if there are no waiters,
+	 * and early-out if so */
+	do {
+		if (ldsem_cmpxchg(&count, count + adjust, sem))
+			break;
+		if (count > 0) {
+			raw_spin_unlock_irq(&sem->wait_lock);
+			return sem;
+		}
+	} while (1);
+
+	list_add_tail(&waiter.list, &sem->read_wait);
+	sem->wait_readers++;
+
+	waiter.task = tsk;
+	get_task_struct(tsk);
+
+	/* if there are no active locks, wake the new lock owner(s) */
+	if ((count & LDSEM_ACTIVE_MASK) == 0)
+		__ldsem_wake(sem);
+
+	raw_spin_unlock_irq(&sem->wait_lock);
+
+	/* wait to be given the lock */
+	for (;;) {
+		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
+
+		if (!waiter.task)
+			break;
+		if (!timeout)
+			break;
+		timeout = schedule_timeout(timeout);
+	}
+
+	__set_task_state(tsk, TASK_RUNNING);
+
+	if (!timeout) {
+		/* lock timed out but check if this task was just
+		 * granted lock ownership - if so, pretend there
+		 * was no timeout; otherwise, cleanup lock wait */
+		raw_spin_lock_irq(&sem->wait_lock);
+		if (waiter.task) {
+			ldsem_atomic_update(-LDSEM_WAIT_BIAS, sem);
+			list_del(&waiter.list);
+			raw_spin_unlock_irq(&sem->wait_lock);
+			put_task_struct(waiter.task);
+			return NULL;
+		}
+		raw_spin_unlock_irq(&sem->wait_lock);
+	}
+
+	return sem;
+}
+
+/*
+ * wait for the write lock to be granted
+ */
+static struct ld_semaphore __sched *
+down_write_failed(struct ld_semaphore *sem, long count, long timeout)
+{
+	struct ldsem_waiter waiter;
+	struct task_struct *tsk = current;
+	long adjust = -LDSEM_ACTIVE_BIAS;
+	int locked = 0;
+
+	/* set up my own style of waitqueue */
+	raw_spin_lock_irq(&sem->wait_lock);
+
+	/* Try to reverse the lock attempt but if the count has changed
+	 * so that reversing fails, check if the lock is now owned,
+	 * and early-out if so */
+	do {
+		if (ldsem_cmpxchg(&count, count + adjust, sem))
+			break;
+		if ((count & LDSEM_ACTIVE_MASK) == LDSEM_ACTIVE_BIAS) {
+			raw_spin_unlock_irq(&sem->wait_lock);
+			return sem;
+		}
+	} while (1);
+
+	list_add_tail(&waiter.list, &sem->write_wait);
+
+	waiter.task = tsk;
+
+	set_task_state(tsk, TASK_UNINTERRUPTIBLE);
+	for (;;) {
+		if (!timeout)
+			break;
+		raw_spin_unlock_irq(&sem->wait_lock);
+		timeout = schedule_timeout(timeout);
+		raw_spin_lock_irq(&sem->wait_lock);
+		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
+		if ((locked = writer_trylock(sem)))
+			break;
+	}
+
+	if (!locked)
+		ldsem_atomic_update(-LDSEM_WAIT_BIAS, sem);
+	list_del(&waiter.list);
+	raw_spin_unlock_irq(&sem->wait_lock);
+
+	__set_task_state(tsk, TASK_RUNNING);
+
+	/* lock wait may have timed out */
+	if (!locked)
+		return NULL;
+	return sem;
+}
+
+
+
+static inline int __ldsem_down_read_nested(struct ld_semaphore *sem,
+					   int subclass, long timeout)
+{
+	long count;
+
+	lockdep_acquire_read(sem, subclass, 0, _RET_IP_);
+
+	count = ldsem_atomic_update(LDSEM_READ_BIAS, sem);
+	if (count <= 0) {
+		lock_stat(sem, contended);
+		if (!down_read_failed(sem, count, timeout)) {
+			lockdep_release(sem, 1, _RET_IP_);
+			return 0;
+		}
+	}
+	lock_stat(sem, acquired);
+	return 1;
+}
+
+static inline int __ldsem_down_write_nested(struct ld_semaphore *sem,
+					    int subclass, long timeout)
+{
+	long count;
+
+	lockdep_acquire(sem, subclass, 0, _RET_IP_);
+
+	count = ldsem_atomic_update(LDSEM_WRITE_BIAS, sem);
+	if ((count & LDSEM_ACTIVE_MASK) != LDSEM_ACTIVE_BIAS) {
+		lock_stat(sem, contended);
+		if (!down_write_failed(sem, count, timeout)) {
+			lockdep_release(sem, 1, _RET_IP_);
+			return 0;
+		}
+	}
+	lock_stat(sem, acquired);
+	return 1;
+}
+
+
+/*
+ * lock for reading -- returns 1 if successful, 0 if timed out
+ */
+int __sched ldsem_down_read(struct ld_semaphore *sem, long timeout)
+{
+	might_sleep();
+	return __ldsem_down_read_nested(sem, 0, timeout);
+}
+
+/*
+ * trylock for reading -- returns 1 if successful, 0 if contention
+ */
+int ldsem_down_read_trylock(struct ld_semaphore *sem)
+{
+	long count = sem->count;
+
+	while (count >= 0) {
+		if (ldsem_cmpxchg(&count, count + LDSEM_READ_BIAS, sem)) {
+			lockdep_acquire_read(sem, 0, 1, _RET_IP_);
+			lock_stat(sem, acquired);
+			return 1;
+		}
+	}
+	return 0;
+}
+
+/*
+ * lock for writing -- returns 1 if successful, 0 if timed out
+ */
+int __sched ldsem_down_write(struct ld_semaphore *sem, long timeout)
+{
+	might_sleep();
+	return __ldsem_down_write_nested(sem, 0, timeout);
+}
+
+/*
+ * trylock for writing -- returns 1 if successful, 0 if contention
+ */
+int ldsem_down_write_trylock(struct ld_semaphore *sem)
+{
+	long count = sem->count;
+
+	while ((count & LDSEM_ACTIVE_MASK) == 0) {
+		if (ldsem_cmpxchg(&count, count + LDSEM_WRITE_BIAS, sem)) {
+			lockdep_acquire(sem, 0, 1, _RET_IP_);
+			lock_stat(sem, acquired);
+			return 1;
+		}
+	}
+	return 0;
+}
+
+/*
+ * release a read lock
+ */
+void ldsem_up_read(struct ld_semaphore *sem)
+{
+	long count;
+
+	lockdep_release(sem, 1, _RET_IP_);
+
+	count = ldsem_atomic_update(-LDSEM_READ_BIAS, sem);
+	if (count < 0 && (count & LDSEM_ACTIVE_MASK) == 0)
+		ldsem_wake(sem);
+}
+
+/*
+ * release a write lock
+ */
+void ldsem_up_write(struct ld_semaphore *sem)
+{
+	long count;
+
+	lockdep_release(sem, 1, _RET_IP_);
+
+	count = ldsem_atomic_update(-LDSEM_WRITE_BIAS, sem);
+	if (count < 0)
+		ldsem_wake(sem);
+}
+
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+
+int ldsem_down_read_nested(struct ld_semaphore *sem, int subclass, long timeout)
+{
+	might_sleep();
+	return __ldsem_down_read_nested(sem, subclass, timeout);
+}
+
+int ldsem_down_write_nested(struct ld_semaphore *sem, int subclass,
+			    long timeout)
+{
+	might_sleep();
+	return __ldsem_down_write_nested(sem, subclass, timeout);
+}
+
+#endif

diff --git a/drivers/tty/vt/vt.c b/drivers/tty/vt/vt.c
index 740202d..c677829 100644
--- a/drivers/tty/vt/vt.c
+++ b/drivers/tty/vt/vt.c

@@ -3086,17 +3086,6 @@
 };
 
 
-static int bind_con_driver(const struct consw *csw, int first, int last,
-			   int deflt)
-{
-	int ret;
-
-	console_lock();
-	ret = do_bind_con_driver(csw, first, last, deflt);
-	console_unlock();
-	return ret;
-}
-
 #ifdef CONFIG_VT_HW_CONSOLE_BINDING
 static int con_is_graphics(const struct consw *csw, int first, int last)
 {
@@ -3114,34 +3103,6 @@
 	return retval;
 }
 
-/**
- * unbind_con_driver - unbind a console driver
- * @csw: pointer to console driver to unregister
- * @first: first in range of consoles that @csw should be unbound from
- * @last: last in range of consoles that @csw should be unbound from
- * @deflt: should next bound console driver be default after @csw is unbound?
- *
- * To unbind a driver from all possible consoles, pass 0 as @first and
- * %MAX_NR_CONSOLES as @last.
- *
- * @deflt controls whether the console that ends up replacing @csw should be
- * the default console.
- *
- * RETURNS:
- * -ENODEV if @csw isn't a registered console driver or can't be unregistered
- * or 0 on success.
- */
-int unbind_con_driver(const struct consw *csw, int first, int last, int deflt)
-{
-	int retval;
-
-	console_lock();
-	retval = do_unbind_con_driver(csw, first, last, deflt);
-	console_unlock();
-	return retval;
-}
-EXPORT_SYMBOL(unbind_con_driver);
-
 /* unlocked version of unbind_con_driver() */
 int do_unbind_con_driver(const struct consw *csw, int first, int last, int deflt)
 {
@@ -3262,8 +3223,11 @@
 		if (first == 0 && last == MAX_NR_CONSOLES -1)
 			deflt = 1;
 
-		if (first != -1)
-			bind_con_driver(csw, first, last, deflt);
+		if (first != -1) {
+			console_lock();
+			do_bind_con_driver(csw, first, last, deflt);
+			console_unlock();
+		}
 
 		first = -1;
 		last = -1;
@@ -3301,8 +3265,11 @@
 		if (first == 0 && last == MAX_NR_CONSOLES -1)
 			deflt = 1;
 
-		if (first != -1)
-			unbind_con_driver(csw, first, last, deflt);
+		if (first != -1) {
+			console_lock();
+			do_unbind_con_driver(csw, first, last, deflt);
+			console_unlock();
+		}
 
 		first = -1;
 		last = -1;
@@ -3574,29 +3541,9 @@
 	return retval;
 }
 
-/**
- * register_con_driver - register console driver to console layer
- * @csw: console driver
- * @first: the first console to take over, minimum value is 0
- * @last: the last console to take over, maximum value is MAX_NR_CONSOLES -1
- *
- * DESCRIPTION: This function registers a console driver which can later
- * bind to a range of consoles specified by @first and @last. It will
- * also initialize the console driver by calling con_startup().
- */
-int register_con_driver(const struct consw *csw, int first, int last)
-{
-	int retval;
-
-	console_lock();
-	retval = do_register_con_driver(csw, first, last);
-	console_unlock();
-	return retval;
-}
-EXPORT_SYMBOL(register_con_driver);
 
 /**
- * unregister_con_driver - unregister console driver from console layer
+ * do_unregister_con_driver - unregister console driver from console layer
  * @csw: console driver
  *
  * DESCRIPTION: All drivers that registers to the console layer must
@@ -3606,17 +3553,6 @@
  *
  * The driver must unbind first prior to unregistration.
  */
-int unregister_con_driver(const struct consw *csw)
-{
-	int retval;
-
-	console_lock();
-	retval = do_unregister_con_driver(csw);
-	console_unlock();
-	return retval;
-}
-EXPORT_SYMBOL(unregister_con_driver);
-
 int do_unregister_con_driver(const struct consw *csw)
 {
 	int i, retval = -ENODEV;
@@ -3654,7 +3590,7 @@
  *	when a driver wants to take over some existing consoles
  *	and become default driver for newly opened ones.
  *
- *	take_over_console is basically a register followed by unbind
+ *	do_take_over_console is basically a register followed by bind
  */
 int do_take_over_console(const struct consw *csw, int first, int last, int deflt)
 {
@@ -3675,30 +3611,6 @@
 }
 EXPORT_SYMBOL_GPL(do_take_over_console);
 
-/*
- *	If we support more console drivers, this function is used
- *	when a driver wants to take over some existing consoles
- *	and become default driver for newly opened ones.
- *
- *	take_over_console is basically a register followed by unbind
- */
-int take_over_console(const struct consw *csw, int first, int last, int deflt)
-{
-	int err;
-
-	err = register_con_driver(csw, first, last);
-	/*
-	 * If we get an busy error we still want to bind the console driver
-	 * and return success, as we may have unbound the console driver
-	 * but not unregistered it.
-	 */
-	if (err == -EBUSY)
-		err = 0;
-	if (!err)
-		bind_con_driver(csw, first, last, deflt);
-
-	return err;
-}
 
 /*
  * give_up_console is a wrapper to unregister_con_driver. It will only
@@ -3706,7 +3618,9 @@
  */
 void give_up_console(const struct consw *csw)
 {
-	unregister_con_driver(csw);
+	console_lock();
+	do_unregister_con_driver(csw);
+	console_unlock();
 }
 
 static int __init vtconsole_class_init(void)
@@ -4262,6 +4176,5 @@
 EXPORT_SYMBOL(vc_cons);
 EXPORT_SYMBOL(global_cursor_default);
 #ifndef VT_SINGLE_DRIVER
-EXPORT_SYMBOL(take_over_console);
 EXPORT_SYMBOL(give_up_console);
 #endif

diff --git a/drivers/uio/uio_aec.c b/drivers/uio/uio_aec.c
index 1548982..f3611c2 100644
--- a/drivers/uio/uio_aec.c
+++ b/drivers/uio/uio_aec.c

@@ -160,17 +160,5 @@
 	.remove = remove,
 };
 
-static int __init aectc_init(void)
-{
-	return pci_register_driver(&pci_driver);
-}
-
-static void __exit aectc_exit(void)
-{
-	pci_unregister_driver(&pci_driver);
-}
-
+module_pci_driver(pci_driver);
 MODULE_LICENSE("GPL");
-
-module_init(aectc_init);
-module_exit(aectc_exit);

diff --git a/drivers/uio/uio_cif.c b/drivers/uio/uio_cif.c
index 7dd6fc6..22cdf38 100644
--- a/drivers/uio/uio_cif.c
+++ b/drivers/uio/uio_cif.c

@@ -135,19 +135,7 @@
 	.remove = hilscher_pci_remove,
 };
 
-static int __init hilscher_init_module(void)
-{
-	return pci_register_driver(&hilscher_pci_driver);
-}
-
-static void __exit hilscher_exit_module(void)
-{
-	pci_unregister_driver(&hilscher_pci_driver);
-}
-
-module_init(hilscher_init_module);
-module_exit(hilscher_exit_module);
-
+module_pci_driver(hilscher_pci_driver);
 MODULE_DEVICE_TABLE(pci, hilscher_pci_ids);
 MODULE_LICENSE("GPL v2");
 MODULE_AUTHOR("Hans J. Koch, Benedikt Spranger");

diff --git a/drivers/uio/uio_dmem_genirq.c b/drivers/uio/uio_dmem_genirq.c
index 252434c..125d0e5 100644
--- a/drivers/uio/uio_dmem_genirq.c
+++ b/drivers/uio/uio_dmem_genirq.c

@@ -336,8 +336,6 @@
 	{ /* empty for now */ },
 };
 MODULE_DEVICE_TABLE(of, uio_of_genirq_match);
-#else
-# define uio_of_genirq_match NULL
 #endif
 
 static struct platform_driver uio_dmem_genirq = {
@@ -347,7 +345,7 @@
 		.name = DRIVER_NAME,
 		.owner = THIS_MODULE,
 		.pm = &uio_dmem_genirq_dev_pm_ops,
-		.of_match_table = uio_of_genirq_match,
+		.of_match_table = of_match_ptr(uio_of_genirq_match),
 	},
 };
 

diff --git a/drivers/uio/uio_netx.c b/drivers/uio/uio_netx.c
index 6a4ba5e..28a766b 100644
--- a/drivers/uio/uio_netx.c
+++ b/drivers/uio/uio_netx.c

@@ -174,19 +174,7 @@
 	.remove = netx_pci_remove,
 };
 
-static int __init netx_init_module(void)
-{
-	return pci_register_driver(&netx_pci_driver);
-}
-
-static void __exit netx_exit_module(void)
-{
-	pci_unregister_driver(&netx_pci_driver);
-}
-
-module_init(netx_init_module);
-module_exit(netx_exit_module);
-
+module_pci_driver(netx_pci_driver);
 MODULE_DEVICE_TABLE(pci, netx_pci_ids);
 MODULE_LICENSE("GPL v2");
 MODULE_AUTHOR("Hans J. Koch, Manuel Traut");

diff --git a/drivers/uio/uio_pci_generic.c b/drivers/uio/uio_pci_generic.c
index 14aa10c..077ae12 100644
--- a/drivers/uio/uio_pci_generic.c
+++ b/drivers/uio/uio_pci_generic.c

@@ -113,27 +113,14 @@
 	kfree(gdev);
 }
 
-static struct pci_driver driver = {
+static struct pci_driver uio_pci_driver = {
 	.name = "uio_pci_generic",
 	.id_table = NULL, /* only dynamic id's */
 	.probe = probe,
 	.remove = remove,
 };
 
-static int __init init(void)
-{
-	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
-	return pci_register_driver(&driver);
-}
-
-static void __exit cleanup(void)
-{
-	pci_unregister_driver(&driver);
-}
-
-module_init(init);
-module_exit(cleanup);
-
+module_pci_driver(uio_pci_driver);
 MODULE_VERSION(DRIVER_VERSION);
 MODULE_LICENSE("GPL v2");
 MODULE_AUTHOR(DRIVER_AUTHOR);

diff --git a/drivers/uio/uio_pdrv_genirq.c b/drivers/uio/uio_pdrv_genirq.c
index c122bca..4eb8eaf 100644
--- a/drivers/uio/uio_pdrv_genirq.c
+++ b/drivers/uio/uio_pdrv_genirq.c

@@ -37,6 +37,11 @@
 	struct platform_device *pdev;
 };
 
+/* Bits in uio_pdrv_genirq_platdata.flags */
+enum {
+	UIO_IRQ_DISABLED = 0,
+};
+
 static int uio_pdrv_genirq_open(struct uio_info *info, struct inode *inode)
 {
 	struct uio_pdrv_genirq_platdata *priv = info->priv;
@@ -63,8 +68,10 @@
 	 * remember the state so we can allow user space to enable it later.
 	 */
 
-	if (!test_and_set_bit(0, &priv->flags))
+	spin_lock(&priv->lock);
+	if (!__test_and_set_bit(UIO_IRQ_DISABLED, &priv->flags))
 		disable_irq_nosync(irq);
+	spin_unlock(&priv->lock);
 
 	return IRQ_HANDLED;
 }
@@ -78,16 +85,17 @@
 	 * in the interrupt controller, but keep track of the
 	 * state to prevent per-irq depth damage.
 	 *
-	 * Serialize this operation to support multiple tasks.
+	 * Serialize this operation to support multiple tasks and concurrency
+	 * with irq handler on SMP systems.
 	 */
 
 	spin_lock_irqsave(&priv->lock, flags);
 	if (irq_on) {
-		if (test_and_clear_bit(0, &priv->flags))
+		if (__test_and_clear_bit(UIO_IRQ_DISABLED, &priv->flags))
 			enable_irq(dev_info->irq);
 	} else {
-		if (!test_and_set_bit(0, &priv->flags))
-			disable_irq(dev_info->irq);
+		if (!__test_and_set_bit(UIO_IRQ_DISABLED, &priv->flags))
+			disable_irq_nosync(dev_info->irq);
 	}
 	spin_unlock_irqrestore(&priv->lock, flags);
 
@@ -103,24 +111,16 @@
 	int i;
 
 	if (pdev->dev.of_node) {
-		int irq;
-
 		/* alloc uioinfo for one device */
 		uioinfo = kzalloc(sizeof(*uioinfo), GFP_KERNEL);
 		if (!uioinfo) {
 			ret = -ENOMEM;
 			dev_err(&pdev->dev, "unable to kmalloc\n");
-			goto bad2;
+			return ret;
 		}
 		uioinfo->name = pdev->dev.of_node->name;
 		uioinfo->version = "devicetree";
-
 		/* Multiple IRQs are not supported */
-		irq = platform_get_irq(pdev, 0);
-		if (irq == -ENXIO)
-			uioinfo->irq = UIO_IRQ_NONE;
-		else
-			uioinfo->irq = irq;
 	}
 
 	if (!uioinfo || !uioinfo->name || !uioinfo->version) {
@@ -148,12 +148,15 @@
 
 	if (!uioinfo->irq) {
 		ret = platform_get_irq(pdev, 0);
-		if (ret < 0) {
-			dev_err(&pdev->dev, "failed to get IRQ\n");
-			goto bad0;
-		}
 		uioinfo->irq = ret;
+		if (ret == -ENXIO && pdev->dev.of_node)
+			uioinfo->irq = UIO_IRQ_NONE;
+		else if (ret < 0) {
+			dev_err(&pdev->dev, "failed to get IRQ\n");
+			goto bad1;
+		}
 	}
+
 	uiomem = &uioinfo->mem[0];
 
 	for (i = 0; i < pdev->num_resources; ++i) {
@@ -206,19 +209,19 @@
 	ret = uio_register_device(&pdev->dev, priv->uioinfo);
 	if (ret) {
 		dev_err(&pdev->dev, "unable to register uio device\n");
-		goto bad1;
+		goto bad2;
 	}
 
 	platform_set_drvdata(pdev, priv);
 	return 0;
+ bad2:
+	pm_runtime_disable(&pdev->dev);
  bad1:
 	kfree(priv);
-	pm_runtime_disable(&pdev->dev);
  bad0:
 	/* kfree uioinfo for OF */
 	if (pdev->dev.of_node)
 		kfree(uioinfo);
- bad2:
 	return ret;
 }
 
@@ -263,12 +266,13 @@
 };
 
 #ifdef CONFIG_OF
-static const struct of_device_id uio_of_genirq_match[] = {
-	{ /* empty for now */ },
+static struct of_device_id uio_of_genirq_match[] = {
+	{ /* Filled in from the of_id module parameter */ },
+	{ /* Sentinel */ },
 };
 MODULE_DEVICE_TABLE(of, uio_of_genirq_match);
-#else
-# define uio_of_genirq_match NULL
+module_param_string(of_id, uio_of_genirq_match[0].compatible, 128, 0);
+MODULE_PARM_DESC(of_id, "Openfirmware id of the device to be handled by uio");
 #endif
 
 static struct platform_driver uio_pdrv_genirq = {
@@ -278,7 +282,7 @@
 		.name = DRIVER_NAME,
 		.owner = THIS_MODULE,
 		.pm = &uio_pdrv_genirq_dev_pm_ops,
-		.of_match_table = uio_of_genirq_match,
+		.of_match_table = of_match_ptr(uio_of_genirq_match),
 	},
 };
 

diff --git a/drivers/uio/uio_pruss.c b/drivers/uio/uio_pruss.c
index 6e2ab00..21f7a72 100644
--- a/drivers/uio/uio_pruss.c
+++ b/drivers/uio/uio_pruss.c

@@ -136,9 +136,9 @@
 	gdev->pruss_clk = clk_get(&dev->dev, "pruss");
 	if (IS_ERR(gdev->pruss_clk)) {
 		dev_err(&dev->dev, "Failed to get clock\n");
+		ret = PTR_ERR(gdev->pruss_clk);
 		kfree(gdev->info);
 		kfree(gdev);
-		ret = PTR_ERR(gdev->pruss_clk);
 		return ret;
 	} else {
 		clk_enable(gdev->pruss_clk);

diff --git a/drivers/uio/uio_sercos3.c b/drivers/uio/uio_sercos3.c
index 81a10a5..5419832 100644
--- a/drivers/uio/uio_sercos3.c
+++ b/drivers/uio/uio_sercos3.c

@@ -226,19 +226,7 @@
 	.remove = sercos3_pci_remove,
 };
 
-static int __init sercos3_init_module(void)
-{
-	return pci_register_driver(&sercos3_pci_driver);
-}
-
-static void __exit sercos3_exit_module(void)
-{
-	pci_unregister_driver(&sercos3_pci_driver);
-}
-
-module_init(sercos3_init_module);
-module_exit(sercos3_exit_module);
-
+module_pci_driver(sercos3_pci_driver);
 MODULE_DESCRIPTION("UIO driver for the Automata Sercos III PCI card");
 MODULE_AUTHOR("John Ogness <john.ogness@linutronix.de>");
 MODULE_LICENSE("GPL v2");

diff --git a/drivers/usb/Kconfig b/drivers/usb/Kconfig
index 92e1dc9..73f62ca 100644
--- a/drivers/usb/Kconfig
+++ b/drivers/usb/Kconfig

@@ -2,59 +2,15 @@
 # USB device configuration
 #
 
-# many non-PCI SOC chips embed OHCI
+# These are unused now, remove them once they are no longer selected
 config USB_ARCH_HAS_OHCI
-	boolean
-	# ARM:
-	default y if SA1111
-	default y if ARCH_OMAP
-	default y if ARCH_S3C24XX
-	default y if PXA27x
-	default y if PXA3xx
-	default y if ARCH_EP93XX
-	default y if ARCH_AT91
-	default y if MFD_TC6393XB
-	default y if ARCH_W90X900
-	default y if ARCH_DAVINCI_DA8XX
-	default y if ARCH_CNS3XXX
-	default y if PLAT_SPEAR
-	default y if ARCH_EXYNOS
-	# PPC:
-	default y if STB03xxx
-	default y if PPC_MPC52xx
-	# MIPS:
-	default y if MIPS_ALCHEMY
-	default y if MACH_JZ4740
-	# more:
-	default PCI
+	bool
 
-# some non-PCI hcds implement EHCI
 config USB_ARCH_HAS_EHCI
-	boolean
-	default y if FSL_SOC
-	default y if PPC_MPC512x
-	default y if ARCH_IXP4XX
-	default y if ARCH_W90X900
-	default y if ARCH_AT91
-	default y if ARCH_MXC
-	default y if ARCH_MXS
-	default y if ARCH_OMAP3
-	default y if ARCH_CNS3XXX
-	default y if ARCH_VT8500
-	default y if PLAT_SPEAR
-	default y if PLAT_S5P
-	default y if ARCH_MSM
-	default y if MICROBLAZE
-	default y if SPARC_LEON
-	default y if ARCH_MMP
-	default y if MACH_LOONGSON1
-	default y if PLAT_ORION
-	default PCI
+	bool
 
-# some non-PCI HCDs implement xHCI
 config USB_ARCH_HAS_XHCI
-	boolean
-	default PCI
+	bool
 
 menuconfig USB_SUPPORT
 	bool "USB support"
@@ -71,19 +27,8 @@
 	default y
 	depends on USB || USB_GADGET
 
-# Host-side USB depends on having a host controller
-# NOTE:  dummy_hcd is always an option, but it's ignored here ...
-# NOTE:  SL-811 option should be board-specific ...
 config USB_ARCH_HAS_HCD
-	boolean
-	default y if USB_ARCH_HAS_OHCI
-	default y if USB_ARCH_HAS_EHCI
-	default y if USB_ARCH_HAS_XHCI
-	default y if PCMCIA && !M32R			# sl811_cs
-	default y if ARM				# SL-811
-	default y if BLACKFIN				# SL-811
-	default y if SUPERH				# r8a66597-hcd
-	default PCI
+	def_bool y
 
 # ARM SA1111 chips have a non-PCI based "OHCI-compatible" USB host interface.
 config USB

diff --git a/drivers/usb/Makefile b/drivers/usb/Makefile
index c41feba..238c5d4 100644
--- a/drivers/usb/Makefile
+++ b/drivers/usb/Makefile

@@ -25,6 +25,7 @@
 obj-$(CONFIG_USB_ISP1760_HCD)	+= host/
 obj-$(CONFIG_USB_IMX21_HCD)	+= host/
 obj-$(CONFIG_USB_FSL_MPH_DR_OF)	+= host/
+obj-$(CONFIG_USB_FUSBH200_HCD)	+= host/
 
 obj-$(CONFIG_USB_C67X00_HCD)	+= c67x00/
 

diff --git a/drivers/usb/chipidea/Kconfig b/drivers/usb/chipidea/Kconfig
index b2df442..eb2aa2e 100644
--- a/drivers/usb/chipidea/Kconfig
+++ b/drivers/usb/chipidea/Kconfig

@@ -12,15 +12,15 @@
 
 config USB_CHIPIDEA_UDC
 	bool "ChipIdea device controller"
-	depends on USB_GADGET=y || USB_GADGET=USB_CHIPIDEA
+	depends on USB_GADGET=y || USB_CHIPIDEA=m
 	help
 	  Say Y here to enable device controller functionality of the
 	  ChipIdea driver.
 
 config USB_CHIPIDEA_HOST
 	bool "ChipIdea host controller"
-	depends on USB=y || USB=USB_CHIPIDEA
-	depends on USB_EHCI_HCD=y
+	depends on USB=y
+	depends on USB_EHCI_HCD=y || USB_CHIPIDEA=m
 	select USB_EHCI_ROOT_HUB_TT
 	help
 	  Say Y here to enable host controller functionality of the

diff --git a/drivers/usb/chipidea/Makefile b/drivers/usb/chipidea/Makefile
index 4ab83e9..3bbbcba 100644
--- a/drivers/usb/chipidea/Makefile
+++ b/drivers/usb/chipidea/Makefile

@@ -9,13 +9,13 @@
 
 # Glue/Bridge layers go here
 
-obj-$(CONFIG_USB_CHIPIDEA)	+= ci13xxx_msm.o
+obj-$(CONFIG_USB_CHIPIDEA)	+= ci_hdrc_msm.o
 
 # PCI doesn't provide stubs, need to check
 ifneq ($(CONFIG_PCI),)
-	obj-$(CONFIG_USB_CHIPIDEA)	+= ci13xxx_pci.o
+	obj-$(CONFIG_USB_CHIPIDEA)	+= ci_hdrc_pci.o
 endif
 
 ifneq ($(CONFIG_OF_DEVICE),)
-	obj-$(CONFIG_USB_CHIPIDEA)	+= ci13xxx_imx.o usbmisc_imx.o
+	obj-$(CONFIG_USB_CHIPIDEA)	+= ci_hdrc_imx.o usbmisc_imx.o
 endif

diff --git a/drivers/usb/chipidea/bits.h b/drivers/usb/chipidea/bits.h
index 050de85..aefa026 100644
--- a/drivers/usb/chipidea/bits.h
+++ b/drivers/usb/chipidea/bits.h

@@ -48,10 +48,24 @@
 #define PORTSC_SUSP           BIT(7)
 #define PORTSC_HSP            BIT(9)
 #define PORTSC_PTC            (0x0FUL << 16)
+/* PTS and PTW for non lpm version only */
+/*
+ * Encode a PTS_* value into PORTSC: the two low bits go to bits 31:30 and
+ * bit 2 goes to bit 25.  The arithmetic is done in u32 because shifting a
+ * signed int value of 2 or 3 left by 30 overflows int, and mixing that
+ * negative int with the unsigned long BIT(25) would sign-extend on 64-bit.
+ */
+#define PORTSC_PTS(d)						\
+	(u32)((((u32)(d) & 0x3) << 30) | (((d) & 0x4) ? BIT(25) : 0))
+#define PORTSC_PTW            BIT(28)
+#define PORTSC_STS            BIT(29)
 
 /* DEVLC */
 #define DEVLC_PSPD            (0x03UL << 25)
-#define    DEVLC_PSPD_HS      (0x02UL << 25)
+#define DEVLC_PSPD_HS         (0x02UL << 25)
+#define DEVLC_PTW             BIT(27)
+#define DEVLC_STS             BIT(28)
+/* Shift in u32: a signed int value >= 4 (e.g. PTS_HSIC) << 29 overflows int */
+#define DEVLC_PTS(d)          (((u32)(d) & 0x7) << 29)
+
+/* Encoding for DEVLC_PTS and PORTSC_PTS */
+#define PTS_UTMI              0
+#define PTS_ULPI              2
+#define PTS_SERIAL            3
+#define PTS_HSIC              4
 
 /* OTGSC */
 #define OTGSC_IDPU	      BIT(5)

diff --git a/drivers/usb/chipidea/ci.h b/drivers/usb/chipidea/ci.h
index b0a6bce..33cb29f 100644
--- a/drivers/usb/chipidea/ci.h
+++ b/drivers/usb/chipidea/ci.h

@@ -22,14 +22,14 @@
  * DEFINE
  *****************************************************************************/
 #define TD_PAGE_COUNT      5
-#define CI13XXX_PAGE_SIZE  4096ul /* page size for TD's */
+#define CI_HDRC_PAGE_SIZE  4096ul /* page size for TD's */
 #define ENDPT_MAX          32
 
 /******************************************************************************
  * STRUCTURES
  *****************************************************************************/
 /**
- * struct ci13xxx_ep - endpoint representation
+ * struct ci_hw_ep - endpoint representation
  * @ep: endpoint structure for gadget drivers
  * @dir: endpoint direction (TX/RX)
  * @num: endpoint number
@@ -41,7 +41,7 @@
  * @lock: pointer to controller's spinlock
  * @td_pool: pointer to controller's TD pool
  */
-struct ci13xxx_ep {
+struct ci_hw_ep {
 	struct usb_ep				ep;
 	u8					dir;
 	u8					num;
@@ -49,15 +49,16 @@
 	char					name[16];
 	struct {
 		struct list_head	queue;
-		struct ci13xxx_qh	*ptr;
+		struct ci_hw_qh		*ptr;
 		dma_addr_t		dma;
 	}					qh;
 	int					wedge;
 
 	/* global resources */
-	struct ci13xxx				*ci;
+	struct ci_hdrc				*ci;
 	spinlock_t				*lock;
 	struct dma_pool				*td_pool;
+	struct td_node				*pending_td;
 };
 
 enum ci_role {
@@ -74,9 +75,9 @@
  * name: role name string (host/gadget)
  */
 struct ci_role_driver {
-	int		(*start)(struct ci13xxx *);
-	void		(*stop)(struct ci13xxx *);
-	irqreturn_t	(*irq)(struct ci13xxx *);
+	int		(*start)(struct ci_hdrc *);
+	void		(*stop)(struct ci_hdrc *);
+	irqreturn_t	(*irq)(struct ci_hdrc *);
 	const char	*name;
 };
 
@@ -101,7 +102,7 @@
 };
 
 /**
- * struct ci13xxx - chipidea device representation
+ * struct ci_hdrc - chipidea device representation
  * @dev: pointer to parent device
  * @lock: access synchronization
  * @hw_bank: hardware register mapping
@@ -116,7 +117,7 @@
  * @gadget: device side representation for peripheral controller
  * @driver: gadget driver
  * @hw_ep_max: total number of endpoints supported by hardware
- * @ci13xxx_ep: array of endpoints
+ * @ci_hw_ep: array of endpoints
  * @ep0_dir: ep0 direction
  * @ep0out: pointer to ep0 OUT endpoint
  * @ep0in: pointer to ep0 IN endpoint
@@ -132,7 +133,7 @@
  * @hcd: pointer to usb_hcd for ehci host driver
  * @debugfs: root dentry for this controller in debugfs
  */
-struct ci13xxx {
+struct ci_hdrc {
 	struct device			*dev;
 	spinlock_t			lock;
 	struct hw_bank			hw_bank;
@@ -149,9 +150,9 @@
 	struct usb_gadget		gadget;
 	struct usb_gadget_driver	*driver;
 	unsigned			hw_ep_max;
-	struct ci13xxx_ep		ci13xxx_ep[ENDPT_MAX];
+	struct ci_hw_ep			ci_hw_ep[ENDPT_MAX];
 	u32				ep0_dir;
-	struct ci13xxx_ep		*ep0out, *ep0in;
+	struct ci_hw_ep			*ep0out, *ep0in;
 
 	struct usb_request		*status;
 	bool				setaddr;
@@ -160,7 +161,7 @@
 	u8				suspended;
 	u8				test_mode;
 
-	struct ci13xxx_platform_data	*platdata;
+	struct ci_hdrc_platform_data	*platdata;
 	int				vbus_active;
 	/* FIXME: some day, we'll not use global phy */
 	bool				global_phy;
@@ -169,13 +170,13 @@
 	struct dentry			*debugfs;
 };
 
-static inline struct ci_role_driver *ci_role(struct ci13xxx *ci)
+static inline struct ci_role_driver *ci_role(struct ci_hdrc *ci)
 {
 	BUG_ON(ci->role >= CI_ROLE_END || !ci->roles[ci->role]);
 	return ci->roles[ci->role];
 }
 
-static inline int ci_role_start(struct ci13xxx *ci, enum ci_role role)
+static inline int ci_role_start(struct ci_hdrc *ci, enum ci_role role)
 {
 	int ret;
 
@@ -191,7 +192,7 @@
 	return ret;
 }
 
-static inline void ci_role_stop(struct ci13xxx *ci)
+static inline void ci_role_stop(struct ci_hdrc *ci)
 {
 	enum ci_role role = ci->role;
 
@@ -210,7 +211,7 @@
 #define REG_BITS   (32)
 
 /* register indices */
-enum ci13xxx_regs {
+enum ci_hw_regs {
 	CAP_CAPLENGTH,
 	CAP_HCCPARAMS,
 	CAP_DCCPARAMS,
@@ -242,7 +243,7 @@
  *
  * This function returns register contents
  */
-static inline u32 hw_read(struct ci13xxx *ci, enum ci13xxx_regs reg, u32 mask)
+static inline u32 hw_read(struct ci_hdrc *ci, enum ci_hw_regs reg, u32 mask)
 {
 	return ioread32(ci->hw_bank.regmap[reg]) & mask;
 }
@@ -253,7 +254,7 @@
  * @mask: bitfield mask
  * @data: new value
  */
-static inline void hw_write(struct ci13xxx *ci, enum ci13xxx_regs reg,
+static inline void hw_write(struct ci_hdrc *ci, enum ci_hw_regs reg,
 			    u32 mask, u32 data)
 {
 	if (~mask)
@@ -270,7 +271,7 @@
  *
  * This function returns register contents
  */
-static inline u32 hw_test_and_clear(struct ci13xxx *ci, enum ci13xxx_regs reg,
+static inline u32 hw_test_and_clear(struct ci_hdrc *ci, enum ci_hw_regs reg,
 				    u32 mask)
 {
 	u32 val = ioread32(ci->hw_bank.regmap[reg]) & mask;
@@ -287,7 +288,7 @@
  *
  * This function returns register contents
  */
-static inline u32 hw_test_and_write(struct ci13xxx *ci, enum ci13xxx_regs reg,
+static inline u32 hw_test_and_write(struct ci_hdrc *ci, enum ci_hw_regs reg,
 				    u32 mask, u32 data)
 {
 	u32 val = hw_read(ci, reg, ~0);
@@ -296,10 +297,10 @@
 	return (val & mask) >> __ffs(mask);
 }
 
-int hw_device_reset(struct ci13xxx *ci, u32 mode);
+int hw_device_reset(struct ci_hdrc *ci, u32 mode);
 
-int hw_port_test_set(struct ci13xxx *ci, u8 mode);
+int hw_port_test_set(struct ci_hdrc *ci, u8 mode);
 
-u8 hw_port_test_get(struct ci13xxx *ci);
+u8 hw_port_test_get(struct ci_hdrc *ci);
 
 #endif	/* __DRIVERS_USB_CHIPIDEA_CI_H */

diff --git a/drivers/usb/chipidea/ci13xxx_imx.c b/drivers/usb/chipidea/ci13xxx_imx.c
deleted file mode 100644
index 73f9d5f..0000000
--- a/drivers/usb/chipidea/ci13xxx_imx.c
+++ /dev/null

@@ -1,274 +0,0 @@
-/*
- * Copyright 2012 Freescale Semiconductor, Inc.
- * Copyright (C) 2012 Marek Vasut <marex@denx.de>
- * on behalf of DENX Software Engineering GmbH
- *
- * The code contained herein is licensed under the GNU General Public
- * License. You may obtain a copy of the GNU General Public License
- * Version 2 or later at the following locations:
- *
- * http://www.opensource.org/licenses/gpl-license.html
- * http://www.gnu.org/copyleft/gpl.html
- */
-
-#include <linux/module.h>
-#include <linux/of_platform.h>
-#include <linux/of_gpio.h>
-#include <linux/platform_device.h>
-#include <linux/pm_runtime.h>
-#include <linux/dma-mapping.h>
-#include <linux/usb/chipidea.h>
-#include <linux/clk.h>
-#include <linux/regulator/consumer.h>
-#include <linux/pinctrl/consumer.h>
-
-#include "ci.h"
-#include "ci13xxx_imx.h"
-
-#define pdev_to_phy(pdev) \
-	((struct usb_phy *)platform_get_drvdata(pdev))
-
-struct ci13xxx_imx_data {
-	struct device_node *phy_np;
-	struct usb_phy *phy;
-	struct platform_device *ci_pdev;
-	struct clk *clk;
-	struct regulator *reg_vbus;
-};
-
-static const struct usbmisc_ops *usbmisc_ops;
-
-/* Common functions shared by usbmisc drivers */
-
-int usbmisc_set_ops(const struct usbmisc_ops *ops)
-{
-	if (usbmisc_ops)
-		return -EBUSY;
-
-	usbmisc_ops = ops;
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(usbmisc_set_ops);
-
-void usbmisc_unset_ops(const struct usbmisc_ops *ops)
-{
-	usbmisc_ops = NULL;
-}
-EXPORT_SYMBOL_GPL(usbmisc_unset_ops);
-
-int usbmisc_get_init_data(struct device *dev, struct usbmisc_usb_device *usbdev)
-{
-	struct device_node *np = dev->of_node;
-	struct of_phandle_args args;
-	int ret;
-
-	usbdev->dev = dev;
-
-	ret = of_parse_phandle_with_args(np, "fsl,usbmisc", "#index-cells",
-					0, &args);
-	if (ret) {
-		dev_err(dev, "Failed to parse property fsl,usbmisc, errno %d\n",
-			ret);
-		memset(usbdev, 0, sizeof(*usbdev));
-		return ret;
-	}
-	usbdev->index = args.args[0];
-	of_node_put(args.np);
-
-	if (of_find_property(np, "disable-over-current", NULL))
-		usbdev->disable_oc = 1;
-
-	if (of_find_property(np, "external-vbus-divider", NULL))
-		usbdev->evdo = 1;
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(usbmisc_get_init_data);
-
-/* End of common functions shared by usbmisc drivers*/
-
-static struct ci13xxx_platform_data ci13xxx_imx_platdata  = {
-	.name			= "ci13xxx_imx",
-	.flags			= CI13XXX_REQUIRE_TRANSCEIVER |
-				  CI13XXX_PULLUP_ON_VBUS |
-				  CI13XXX_DISABLE_STREAMING,
-	.capoffset		= DEF_CAPOFFSET,
-};
-
-static int ci13xxx_imx_probe(struct platform_device *pdev)
-{
-	struct ci13xxx_imx_data *data;
-	struct platform_device *plat_ci, *phy_pdev;
-	struct device_node *phy_np;
-	struct resource *res;
-	struct regulator *reg_vbus;
-	struct pinctrl *pinctrl;
-	int ret;
-
-	if (of_find_property(pdev->dev.of_node, "fsl,usbmisc", NULL)
-		&& !usbmisc_ops)
-		return -EPROBE_DEFER;
-
-	data = devm_kzalloc(&pdev->dev, sizeof(*data), GFP_KERNEL);
-	if (!data) {
-		dev_err(&pdev->dev, "Failed to allocate CI13xxx-IMX data!\n");
-		return -ENOMEM;
-	}
-
-	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	if (!res) {
-		dev_err(&pdev->dev, "Can't get device resources!\n");
-		return -ENOENT;
-	}
-
-	pinctrl = devm_pinctrl_get_select_default(&pdev->dev);
-	if (IS_ERR(pinctrl))
-		dev_warn(&pdev->dev, "pinctrl get/select failed, err=%ld\n",
-			PTR_ERR(pinctrl));
-
-	data->clk = devm_clk_get(&pdev->dev, NULL);
-	if (IS_ERR(data->clk)) {
-		dev_err(&pdev->dev,
-			"Failed to get clock, err=%ld\n", PTR_ERR(data->clk));
-		return PTR_ERR(data->clk);
-	}
-
-	ret = clk_prepare_enable(data->clk);
-	if (ret) {
-		dev_err(&pdev->dev,
-			"Failed to prepare or enable clock, err=%d\n", ret);
-		return ret;
-	}
-
-	phy_np = of_parse_phandle(pdev->dev.of_node, "fsl,usbphy", 0);
-	if (phy_np) {
-		data->phy_np = phy_np;
-		phy_pdev = of_find_device_by_node(phy_np);
-		if (phy_pdev) {
-			struct usb_phy *phy;
-			phy = pdev_to_phy(phy_pdev);
-			if (phy &&
-			    try_module_get(phy_pdev->dev.driver->owner)) {
-				usb_phy_init(phy);
-				data->phy = phy;
-			}
-		}
-	}
-
-	/* we only support host now, so enable vbus here */
-	reg_vbus = devm_regulator_get(&pdev->dev, "vbus");
-	if (!IS_ERR(reg_vbus)) {
-		ret = regulator_enable(reg_vbus);
-		if (ret) {
-			dev_err(&pdev->dev,
-				"Failed to enable vbus regulator, err=%d\n",
-				ret);
-			goto put_np;
-		}
-		data->reg_vbus = reg_vbus;
-	} else {
-		reg_vbus = NULL;
-	}
-
-	ci13xxx_imx_platdata.phy = data->phy;
-
-	if (!pdev->dev.dma_mask)
-		pdev->dev.dma_mask = &pdev->dev.coherent_dma_mask;
-	if (!pdev->dev.coherent_dma_mask)
-		pdev->dev.coherent_dma_mask = DMA_BIT_MASK(32);
-
-	if (usbmisc_ops && usbmisc_ops->init) {
-		ret = usbmisc_ops->init(&pdev->dev);
-		if (ret) {
-			dev_err(&pdev->dev,
-				"usbmisc init failed, ret=%d\n", ret);
-			goto err;
-		}
-	}
-
-	plat_ci = ci13xxx_add_device(&pdev->dev,
-				pdev->resource, pdev->num_resources,
-				&ci13xxx_imx_platdata);
-	if (IS_ERR(plat_ci)) {
-		ret = PTR_ERR(plat_ci);
-		dev_err(&pdev->dev,
-			"Can't register ci_hdrc platform device, err=%d\n",
-			ret);
-		goto err;
-	}
-
-	if (usbmisc_ops && usbmisc_ops->post) {
-		ret = usbmisc_ops->post(&pdev->dev);
-		if (ret) {
-			dev_err(&pdev->dev,
-				"usbmisc post failed, ret=%d\n", ret);
-			goto put_np;
-		}
-	}
-
-	data->ci_pdev = plat_ci;
-	platform_set_drvdata(pdev, data);
-
-	pm_runtime_no_callbacks(&pdev->dev);
-	pm_runtime_enable(&pdev->dev);
-
-	return 0;
-
-err:
-	if (reg_vbus)
-		regulator_disable(reg_vbus);
-put_np:
-	if (phy_np)
-		of_node_put(phy_np);
-	clk_disable_unprepare(data->clk);
-	return ret;
-}
-
-static int ci13xxx_imx_remove(struct platform_device *pdev)
-{
-	struct ci13xxx_imx_data *data = platform_get_drvdata(pdev);
-
-	pm_runtime_disable(&pdev->dev);
-	ci13xxx_remove_device(data->ci_pdev);
-
-	if (data->reg_vbus)
-		regulator_disable(data->reg_vbus);
-
-	if (data->phy) {
-		usb_phy_shutdown(data->phy);
-		module_put(data->phy->dev->driver->owner);
-	}
-
-	of_node_put(data->phy_np);
-
-	clk_disable_unprepare(data->clk);
-
-	platform_set_drvdata(pdev, NULL);
-
-	return 0;
-}
-
-static const struct of_device_id ci13xxx_imx_dt_ids[] = {
-	{ .compatible = "fsl,imx27-usb", },
-	{ /* sentinel */ }
-};
-MODULE_DEVICE_TABLE(of, ci13xxx_imx_dt_ids);
-
-static struct platform_driver ci13xxx_imx_driver = {
-	.probe = ci13xxx_imx_probe,
-	.remove = ci13xxx_imx_remove,
-	.driver = {
-		.name = "imx_usb",
-		.owner = THIS_MODULE,
-		.of_match_table = ci13xxx_imx_dt_ids,
-	 },
-};
-
-module_platform_driver(ci13xxx_imx_driver);
-
-MODULE_ALIAS("platform:imx-usb");
-MODULE_LICENSE("GPL v2");
-MODULE_DESCRIPTION("CI13xxx i.MX USB binding");
-MODULE_AUTHOR("Marek Vasut <marex@denx.de>");
-MODULE_AUTHOR("Richard Zhao <richard.zhao@freescale.com>");

diff --git a/drivers/usb/chipidea/ci13xxx_msm.c b/drivers/usb/chipidea/ci13xxx_msm.c
deleted file mode 100644
index 7d16681..0000000
--- a/drivers/usb/chipidea/ci13xxx_msm.c
+++ /dev/null

@@ -1,99 +0,0 @@
-/* Copyright (c) 2010, Code Aurora Forum. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 and
- * only version 2 as published by the Free Software Foundation.
- */
-
-#include <linux/module.h>
-#include <linux/platform_device.h>
-#include <linux/pm_runtime.h>
-#include <linux/usb/msm_hsusb_hw.h>
-#include <linux/usb/ulpi.h>
-#include <linux/usb/gadget.h>
-#include <linux/usb/chipidea.h>
-
-#include "ci.h"
-
-#define MSM_USB_BASE	(ci->hw_bank.abs)
-
-static void ci13xxx_msm_notify_event(struct ci13xxx *ci, unsigned event)
-{
-	struct device *dev = ci->gadget.dev.parent;
-	int val;
-
-	switch (event) {
-	case CI13XXX_CONTROLLER_RESET_EVENT:
-		dev_dbg(dev, "CI13XXX_CONTROLLER_RESET_EVENT received\n");
-		writel(0, USB_AHBBURST);
-		writel(0, USB_AHBMODE);
-		break;
-	case CI13XXX_CONTROLLER_STOPPED_EVENT:
-		dev_dbg(dev, "CI13XXX_CONTROLLER_STOPPED_EVENT received\n");
-		/*
-		 * Put the transceiver in non-driving mode. Otherwise host
-		 * may not detect soft-disconnection.
-		 */
-		val = usb_phy_io_read(ci->transceiver, ULPI_FUNC_CTRL);
-		val &= ~ULPI_FUNC_CTRL_OPMODE_MASK;
-		val |= ULPI_FUNC_CTRL_OPMODE_NONDRIVING;
-		usb_phy_io_write(ci->transceiver, val, ULPI_FUNC_CTRL);
-		break;
-	default:
-		dev_dbg(dev, "unknown ci13xxx event\n");
-		break;
-	}
-}
-
-static struct ci13xxx_platform_data ci13xxx_msm_platdata = {
-	.name			= "ci13xxx_msm",
-	.flags			= CI13XXX_REGS_SHARED |
-				  CI13XXX_REQUIRE_TRANSCEIVER |
-				  CI13XXX_PULLUP_ON_VBUS |
-				  CI13XXX_DISABLE_STREAMING,
-
-	.notify_event		= ci13xxx_msm_notify_event,
-};
-
-static int ci13xxx_msm_probe(struct platform_device *pdev)
-{
-	struct platform_device *plat_ci;
-
-	dev_dbg(&pdev->dev, "ci13xxx_msm_probe\n");
-
-	plat_ci = ci13xxx_add_device(&pdev->dev,
-				pdev->resource, pdev->num_resources,
-				&ci13xxx_msm_platdata);
-	if (IS_ERR(plat_ci)) {
-		dev_err(&pdev->dev, "ci13xxx_add_device failed!\n");
-		return PTR_ERR(plat_ci);
-	}
-
-	platform_set_drvdata(pdev, plat_ci);
-
-	pm_runtime_no_callbacks(&pdev->dev);
-	pm_runtime_enable(&pdev->dev);
-
-	return 0;
-}
-
-static int ci13xxx_msm_remove(struct platform_device *pdev)
-{
-	struct platform_device *plat_ci = platform_get_drvdata(pdev);
-
-	pm_runtime_disable(&pdev->dev);
-	ci13xxx_remove_device(plat_ci);
-
-	return 0;
-}
-
-static struct platform_driver ci13xxx_msm_driver = {
-	.probe = ci13xxx_msm_probe,
-	.remove = ci13xxx_msm_remove,
-	.driver = { .name = "msm_hsusb", },
-};
-
-module_platform_driver(ci13xxx_msm_driver);
-
-MODULE_ALIAS("platform:msm_hsusb");
-MODULE_LICENSE("GPL v2");

diff --git a/drivers/usb/chipidea/ci13xxx_pci.c b/drivers/usb/chipidea/ci13xxx_pci.c
deleted file mode 100644
index 4e1fc61..0000000
--- a/drivers/usb/chipidea/ci13xxx_pci.c
+++ /dev/null

@@ -1,158 +0,0 @@
-/*
- * ci13xxx_pci.c - MIPS USB IP core family device controller
- *
- * Copyright (C) 2008 Chipidea - MIPS Technologies, Inc. All rights reserved.
- *
- * Author: David Lopo
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/platform_device.h>
-#include <linux/module.h>
-#include <linux/pci.h>
-#include <linux/interrupt.h>
-#include <linux/usb/gadget.h>
-#include <linux/usb/chipidea.h>
-
-/* driver name */
-#define UDC_DRIVER_NAME   "ci13xxx_pci"
-
-/******************************************************************************
- * PCI block
- *****************************************************************************/
-static struct ci13xxx_platform_data pci_platdata = {
-	.name		= UDC_DRIVER_NAME,
-	.capoffset	= DEF_CAPOFFSET,
-};
-
-static struct ci13xxx_platform_data langwell_pci_platdata = {
-	.name		= UDC_DRIVER_NAME,
-	.capoffset	= 0,
-};
-
-static struct ci13xxx_platform_data penwell_pci_platdata = {
-	.name		= UDC_DRIVER_NAME,
-	.capoffset	= 0,
-	.power_budget	= 200,
-};
-
-/**
- * ci13xxx_pci_probe: PCI probe
- * @pdev: USB device controller being probed
- * @id:   PCI hotplug ID connecting controller to UDC framework
- *
- * This function returns an error code
- * Allocates basic PCI resources for this USB device controller, and then
- * invokes the udc_probe() method to start the UDC associated with it
- */
-static int ci13xxx_pci_probe(struct pci_dev *pdev,
-				       const struct pci_device_id *id)
-{
-	struct ci13xxx_platform_data *platdata = (void *)id->driver_data;
-	struct platform_device *plat_ci;
-	struct resource res[3];
-	int retval = 0, nres = 2;
-
-	if (!platdata) {
-		dev_err(&pdev->dev, "device doesn't provide driver data\n");
-		return -ENODEV;
-	}
-
-	retval = pci_enable_device(pdev);
-	if (retval)
-		goto done;
-
-	if (!pdev->irq) {
-		dev_err(&pdev->dev, "No IRQ, check BIOS/PCI setup!");
-		retval = -ENODEV;
-		goto disable_device;
-	}
-
-	pci_set_power_state(pdev, PCI_D0);
-	pci_set_master(pdev);
-	pci_try_set_mwi(pdev);
-
-	memset(res, 0, sizeof(res));
-	res[0].start	= pci_resource_start(pdev, 0);
-	res[0].end	= pci_resource_end(pdev, 0);
-	res[0].flags	= IORESOURCE_MEM;
-	res[1].start	= pdev->irq;
-	res[1].flags	= IORESOURCE_IRQ;
-
-	plat_ci = ci13xxx_add_device(&pdev->dev, res, nres, platdata);
-	if (IS_ERR(plat_ci)) {
-		dev_err(&pdev->dev, "ci13xxx_add_device failed!\n");
-		retval = PTR_ERR(plat_ci);
-		goto disable_device;
-	}
-
-	pci_set_drvdata(pdev, plat_ci);
-
-	return 0;
-
- disable_device:
-	pci_disable_device(pdev);
- done:
-	return retval;
-}
-
-/**
- * ci13xxx_pci_remove: PCI remove
- * @pdev: USB Device Controller being removed
- *
- * Reverses the effect of ci13xxx_pci_probe(),
- * first invoking the udc_remove() and then releases
- * all PCI resources allocated for this USB device controller
- */
-static void ci13xxx_pci_remove(struct pci_dev *pdev)
-{
-	struct platform_device *plat_ci = pci_get_drvdata(pdev);
-
-	ci13xxx_remove_device(plat_ci);
-	pci_set_drvdata(pdev, NULL);
-	pci_disable_device(pdev);
-}
-
-/**
- * PCI device table
- * PCI device structure
- *
- * Check "pci.h" for details
- */
-static DEFINE_PCI_DEVICE_TABLE(ci13xxx_pci_id_table) = {
-	{
-		PCI_DEVICE(0x153F, 0x1004),
-		.driver_data = (kernel_ulong_t)&pci_platdata,
-	},
-	{
-		PCI_DEVICE(0x153F, 0x1006),
-		.driver_data = (kernel_ulong_t)&pci_platdata,
-	},
-	{
-		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x0811),
-		.driver_data = (kernel_ulong_t)&langwell_pci_platdata,
-	},
-	{
-		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x0829),
-		.driver_data = (kernel_ulong_t)&penwell_pci_platdata,
-	},
-	{ 0, 0, 0, 0, 0, 0, 0 /* end: all zeroes */ }
-};
-MODULE_DEVICE_TABLE(pci, ci13xxx_pci_id_table);
-
-static struct pci_driver ci13xxx_pci_driver = {
-	.name         =	UDC_DRIVER_NAME,
-	.id_table     =	ci13xxx_pci_id_table,
-	.probe        =	ci13xxx_pci_probe,
-	.remove       =	ci13xxx_pci_remove,
-};
-
-module_pci_driver(ci13xxx_pci_driver);
-
-MODULE_AUTHOR("MIPS - David Lopo <dlopo@chipidea.mips.com>");
-MODULE_DESCRIPTION("MIPS CI13XXX USB Peripheral Controller");
-MODULE_LICENSE("GPL");
-MODULE_VERSION("June 2008");

diff --git a/drivers/usb/chipidea/ci_hdrc_imx.c b/drivers/usb/chipidea/ci_hdrc_imx.c
new file mode 100644
index 0000000..14362c0
--- /dev/null
+++ b/drivers/usb/chipidea/ci_hdrc_imx.c

@@ -0,0 +1,253 @@
+/*
+ * Copyright 2012 Freescale Semiconductor, Inc.
+ * Copyright (C) 2012 Marek Vasut <marex@denx.de>
+ * on behalf of DENX Software Engineering GmbH
+ *
+ * The code contained herein is licensed under the GNU General Public
+ * License. You may obtain a copy of the GNU General Public License
+ * Version 2 or later at the following locations:
+ *
+ * http://www.opensource.org/licenses/gpl-license.html
+ * http://www.gnu.org/copyleft/gpl.html
+ */
+
+#include <linux/module.h>
+#include <linux/of_platform.h>
+#include <linux/of_gpio.h>
+#include <linux/platform_device.h>
+#include <linux/pm_runtime.h>
+#include <linux/dma-mapping.h>
+#include <linux/usb/chipidea.h>
+#include <linux/clk.h>
+#include <linux/regulator/consumer.h>
+
+#include "ci.h"
+#include "ci_hdrc_imx.h"
+
+#define pdev_to_phy(pdev) \
+	((struct usb_phy *)platform_get_drvdata(pdev))
+
+/*
+ * Per-device state of the i.MX glue driver.  Allocated with devm_kzalloc()
+ * in ci_hdrc_imx_probe() and stored as the platform device's drvdata.
+ */
+struct ci_hdrc_imx_data {
+	struct usb_phy *phy;		/* PHY looked up through the "fsl,usbphy" phandle */
+	struct platform_device *ci_pdev;	/* child returned by ci_hdrc_add_device() */
+	struct clk *clk;		/* controller clock, enabled during probe */
+	struct regulator *reg_vbus;	/* "vbus" supply, NULL when unavailable */
+};
+
+static const struct usbmisc_ops *usbmisc_ops;
+
+/* Common functions shared by usbmisc drivers */
+
+/*
+ * Register the usbmisc callback table consulted by ci_hdrc_imx_probe().
+ *
+ * Only one table can be registered at a time: returns -EBUSY if one is
+ * already set, otherwise stores @ops and returns 0.
+ */
+int usbmisc_set_ops(const struct usbmisc_ops *ops)
+{
+	if (!usbmisc_ops) {
+		usbmisc_ops = ops;
+		return 0;
+	}
+
+	return -EBUSY;
+}
+EXPORT_SYMBOL_GPL(usbmisc_set_ops);
+
+/*
+ * Drop the registered usbmisc callback table.
+ *
+ * NOTE(review): @ops is not compared with the stored pointer; whatever
+ * table is currently registered is cleared.
+ */
+void usbmisc_unset_ops(const struct usbmisc_ops *ops)
+{
+	usbmisc_ops = NULL;
+}
+EXPORT_SYMBOL_GPL(usbmisc_unset_ops);
+
+/*
+ * Fill @usbdev from the device-tree node attached to @dev.
+ *
+ * The index is taken from the first argument cell of the "fsl,usbmisc"
+ * phandle; the "disable-over-current" and "external-vbus-divider"
+ * properties set the corresponding flags when they are present.
+ *
+ * Returns 0 on success.  When the phandle cannot be parsed, @usbdev is
+ * zeroed and the error from of_parse_phandle_with_args() is returned.
+ */
+int usbmisc_get_init_data(struct device *dev, struct usbmisc_usb_device *usbdev)
+{
+	struct device_node *node = dev->of_node;
+	struct of_phandle_args spec;
+	int err;
+
+	usbdev->dev = dev;
+
+	err = of_parse_phandle_with_args(node, "fsl,usbmisc", "#index-cells",
+					 0, &spec);
+	if (err != 0) {
+		dev_err(dev, "Failed to parse property fsl,usbmisc, errno %d\n",
+			err);
+		memset(usbdev, 0, sizeof(*usbdev));
+		return err;
+	}
+
+	/* Only the index cell is kept; the node itself is handed back. */
+	usbdev->index = spec.args[0];
+	of_node_put(spec.np);
+
+	if (of_find_property(node, "disable-over-current", NULL) != NULL)
+		usbdev->disable_oc = 1;
+
+	if (of_find_property(node, "external-vbus-divider", NULL) != NULL)
+		usbdev->evdo = 1;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(usbmisc_get_init_data);
+
+/* End of common functions shared by usbmisc drivers*/
+
+/*
+ * Bind the i.MX glue to a controller node.
+ *
+ * Enables the controller clock, looks up and initialises the optional PHY
+ * ("fsl,usbphy"), enables the optional "vbus" regulator, runs the usbmisc
+ * init/post hooks when a usbmisc table is registered, and registers the
+ * ci_hdrc child device.
+ *
+ * Returns 0 on success, -EPROBE_DEFER when the usbmisc ops or the PHY are
+ * not available yet, or a negative errno.  On failure everything enabled
+ * here is disabled again in reverse order.
+ */
+static int ci_hdrc_imx_probe(struct platform_device *pdev)
+{
+	struct ci_hdrc_imx_data *data;
+	struct ci_hdrc_platform_data pdata = {
+		.name		= "ci_hdrc_imx",
+		.capoffset	= DEF_CAPOFFSET,
+		.flags		= CI_HDRC_REQUIRE_TRANSCEIVER |
+				  CI_HDRC_PULLUP_ON_VBUS |
+				  CI_HDRC_DISABLE_STREAMING,
+	};
+	struct resource *res;
+	int ret;
+
+	/* A node that references usbmisc must wait until its ops are set. */
+	if (of_find_property(pdev->dev.of_node, "fsl,usbmisc", NULL)
+		&& !usbmisc_ops)
+		return -EPROBE_DEFER;
+
+	data = devm_kzalloc(&pdev->dev, sizeof(*data), GFP_KERNEL);
+	if (!data) {
+		dev_err(&pdev->dev, "Failed to allocate ci_hdrc-imx data!\n");
+		return -ENOMEM;
+	}
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (!res) {
+		dev_err(&pdev->dev, "Can't get device resources!\n");
+		return -ENOENT;
+	}
+
+	data->clk = devm_clk_get(&pdev->dev, NULL);
+	if (IS_ERR(data->clk)) {
+		dev_err(&pdev->dev,
+			"Failed to get clock, err=%ld\n", PTR_ERR(data->clk));
+		return PTR_ERR(data->clk);
+	}
+
+	ret = clk_prepare_enable(data->clk);
+	if (ret) {
+		dev_err(&pdev->dev,
+			"Failed to prepare or enable clock, err=%d\n", ret);
+		return ret;
+	}
+
+	data->phy = devm_usb_get_phy_by_phandle(&pdev->dev, "fsl,usbphy", 0);
+	if (!IS_ERR(data->phy)) {
+		ret = usb_phy_init(data->phy);
+		if (ret) {
+			dev_err(&pdev->dev, "unable to init phy: %d\n", ret);
+			goto err_clk;
+		}
+	} else if (PTR_ERR(data->phy) == -EPROBE_DEFER) {
+		ret = -EPROBE_DEFER;
+		goto err_clk;
+	} else {
+		/*
+		 * No usable PHY.  Store NULL rather than the ERR_PTR so that
+		 * pdata.phy and the NULL checks on data->phy never see an
+		 * error-encoded pointer.
+		 */
+		data->phy = NULL;
+	}
+
+	/* we only support host now, so enable vbus here */
+	data->reg_vbus = devm_regulator_get(&pdev->dev, "vbus");
+	if (!IS_ERR(data->reg_vbus)) {
+		ret = regulator_enable(data->reg_vbus);
+		if (ret) {
+			dev_err(&pdev->dev,
+				"Failed to enable vbus regulator, err=%d\n",
+				ret);
+			goto err_phy;
+		}
+	} else {
+		/* The regulator is optional; NULL means "nothing to disable". */
+		data->reg_vbus = NULL;
+	}
+
+	pdata.phy = data->phy;
+
+	if (!pdev->dev.dma_mask)
+		pdev->dev.dma_mask = &pdev->dev.coherent_dma_mask;
+	if (!pdev->dev.coherent_dma_mask)
+		pdev->dev.coherent_dma_mask = DMA_BIT_MASK(32);
+
+	if (usbmisc_ops && usbmisc_ops->init) {
+		ret = usbmisc_ops->init(&pdev->dev);
+		if (ret) {
+			dev_err(&pdev->dev,
+				"usbmisc init failed, ret=%d\n", ret);
+			goto err;
+		}
+	}
+
+	data->ci_pdev = ci_hdrc_add_device(&pdev->dev,
+				pdev->resource, pdev->num_resources,
+				&pdata);
+	if (IS_ERR(data->ci_pdev)) {
+		ret = PTR_ERR(data->ci_pdev);
+		dev_err(&pdev->dev,
+			"Can't register ci_hdrc platform device, err=%d\n",
+			ret);
+		goto err;
+	}
+
+	if (usbmisc_ops && usbmisc_ops->post) {
+		ret = usbmisc_ops->post(&pdev->dev);
+		if (ret) {
+			dev_err(&pdev->dev,
+				"usbmisc post failed, ret=%d\n", ret);
+			goto disable_device;
+		}
+	}
+
+	platform_set_drvdata(pdev, data);
+
+	pm_runtime_no_callbacks(&pdev->dev);
+	pm_runtime_enable(&pdev->dev);
+
+	return 0;
+
+disable_device:
+	ci_hdrc_remove_device(data->ci_pdev);
+err:
+	if (data->reg_vbus)
+		regulator_disable(data->reg_vbus);
+err_phy:
+	/* Balance the successful usb_phy_init() above. */
+	if (data->phy)
+		usb_phy_shutdown(data->phy);
+err_clk:
+	clk_disable_unprepare(data->clk);
+	return ret;
+}
+
+/*
+ * Unbind the i.MX glue: disable runtime PM, unregister the ci_hdrc child
+ * device, then disable the vbus regulator, shut down the PHY and disable
+ * the clock.  Always returns 0.
+ */
+static int ci_hdrc_imx_remove(struct platform_device *pdev)
+{
+	struct ci_hdrc_imx_data *data = platform_get_drvdata(pdev);
+
+	pm_runtime_disable(&pdev->dev);
+	ci_hdrc_remove_device(data->ci_pdev);
+
+	if (data->reg_vbus)
+		regulator_disable(data->reg_vbus);
+
+	/*
+	 * The PHY lookup in probe may fail without aborting the probe, so
+	 * guard against both NULL and an ERR_PTR here.  This file takes no
+	 * explicit module reference on the PHY driver (the PHY comes from
+	 * devm_usb_get_phy_by_phandle()), so there is none to drop.
+	 */
+	if (!IS_ERR_OR_NULL(data->phy))
+		usb_phy_shutdown(data->phy);
+
+	clk_disable_unprepare(data->clk);
+
+	return 0;
+}
+
+static const struct of_device_id ci_hdrc_imx_dt_ids[] = {
+	{ .compatible = "fsl,imx27-usb", },
+	{ /* sentinel */ }
+};
+MODULE_DEVICE_TABLE(of, ci_hdrc_imx_dt_ids);
+
+static struct platform_driver ci_hdrc_imx_driver = {
+	.probe = ci_hdrc_imx_probe,
+	.remove = ci_hdrc_imx_remove,
+	.driver = {
+		.name = "imx_usb",
+		.owner = THIS_MODULE,
+		.of_match_table = ci_hdrc_imx_dt_ids,
+	 },
+};
+
+module_platform_driver(ci_hdrc_imx_driver);
+
+MODULE_ALIAS("platform:imx-usb");
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("CI HDRC i.MX USB binding");
+MODULE_AUTHOR("Marek Vasut <marex@denx.de>");
+MODULE_AUTHOR("Richard Zhao <richard.zhao@freescale.com>");

diff --git a/drivers/usb/chipidea/ci13xxx_imx.h b/drivers/usb/chipidea/ci_hdrc_imx.h
similarity index 100%
rename from drivers/usb/chipidea/ci13xxx_imx.h
rename to drivers/usb/chipidea/ci_hdrc_imx.h


diff --git a/drivers/usb/chipidea/ci_hdrc_msm.c b/drivers/usb/chipidea/ci_hdrc_msm.c
new file mode 100644
index 0000000..fb657ef
--- /dev/null
+++ b/drivers/usb/chipidea/ci_hdrc_msm.c

@@ -0,0 +1,100 @@
+/* Copyright (c) 2010, Code Aurora Forum. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/pm_runtime.h>
+#include <linux/usb/msm_hsusb_hw.h>
+#include <linux/usb/ulpi.h>
+#include <linux/usb/gadget.h>
+#include <linux/usb/chipidea.h>
+
+#include "ci.h"
+
+#define MSM_USB_BASE	(ci->hw_bank.abs)
+
+/*
+ * Event hook installed as .notify_event in ci_hdrc_msm_platdata.
+ *
+ * On a controller reset event USB_AHBBURST and USB_AHBMODE are written
+ * with 0.  On a controller stopped event the transceiver's ULPI function
+ * control register is rewritten with the opmode field set to non-driving.
+ * Any other event is only logged at debug level.
+ */
+static void ci_hdrc_msm_notify_event(struct ci_hdrc *ci, unsigned event)
+{
+	struct device *dev = ci->gadget.dev.parent;
+	int func_ctrl;
+
+	if (event == CI_HDRC_CONTROLLER_RESET_EVENT) {
+		dev_dbg(dev, "CI_HDRC_CONTROLLER_RESET_EVENT received\n");
+		writel(0, USB_AHBBURST);
+		writel(0, USB_AHBMODE);
+		return;
+	}
+
+	if (event != CI_HDRC_CONTROLLER_STOPPED_EVENT) {
+		dev_dbg(dev, "unknown ci_hdrc event\n");
+		return;
+	}
+
+	dev_dbg(dev, "CI_HDRC_CONTROLLER_STOPPED_EVENT received\n");
+	/*
+	 * Put the transceiver in non-driving mode. Otherwise host
+	 * may not detect soft-disconnection.
+	 */
+	func_ctrl = usb_phy_io_read(ci->transceiver, ULPI_FUNC_CTRL);
+	func_ctrl &= ~ULPI_FUNC_CTRL_OPMODE_MASK;
+	func_ctrl |= ULPI_FUNC_CTRL_OPMODE_NONDRIVING;
+	usb_phy_io_write(ci->transceiver, func_ctrl, ULPI_FUNC_CTRL);
+}
+
+/*
+ * Platform data passed to ci_hdrc_add_device() by ci_hdrc_msm_probe().
+ * Routes controller events to ci_hdrc_msm_notify_event().
+ */
+static struct ci_hdrc_platform_data ci_hdrc_msm_platdata = {
+	.name			= "ci_hdrc_msm",
+	.flags			= CI_HDRC_REGS_SHARED |
+				  CI_HDRC_REQUIRE_TRANSCEIVER |
+				  CI_HDRC_PULLUP_ON_VBUS |
+				  CI_HDRC_DISABLE_STREAMING,
+
+	.notify_event		= ci_hdrc_msm_notify_event,
+};
+
+/*
+ * Register a ci_hdrc child device using this platform device's resources
+ * and ci_hdrc_msm_platdata, keep it as drvdata, and enable runtime PM
+ * without callbacks.
+ *
+ * Returns 0 on success or the error encoded by ci_hdrc_add_device().
+ */
+static int ci_hdrc_msm_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct platform_device *child;
+
+	dev_dbg(dev, "ci_hdrc_msm_probe\n");
+
+	child = ci_hdrc_add_device(dev, pdev->resource, pdev->num_resources,
+				   &ci_hdrc_msm_platdata);
+	if (IS_ERR(child)) {
+		dev_err(dev, "ci_hdrc_add_device failed!\n");
+		return PTR_ERR(child);
+	}
+
+	platform_set_drvdata(pdev, child);
+
+	pm_runtime_no_callbacks(dev);
+	pm_runtime_enable(dev);
+
+	return 0;
+}
+
+/*
+ * Disable runtime PM and unregister the ci_hdrc child device that probe
+ * stored as drvdata.  Always returns 0.
+ */
+static int ci_hdrc_msm_remove(struct platform_device *pdev)
+{
+	pm_runtime_disable(&pdev->dev);
+	ci_hdrc_remove_device(platform_get_drvdata(pdev));
+
+	return 0;
+}
+
+static struct platform_driver ci_hdrc_msm_driver = {
+	.probe = ci_hdrc_msm_probe,
+	.remove = ci_hdrc_msm_remove,
+	.driver = { .name = "msm_hsusb", },
+};
+
+module_platform_driver(ci_hdrc_msm_driver);
+
+MODULE_ALIAS("platform:msm_hsusb");
+MODULE_ALIAS("platform:ci13xxx_msm");
+MODULE_LICENSE("GPL v2");

diff --git a/drivers/usb/chipidea/ci_hdrc_pci.c b/drivers/usb/chipidea/ci_hdrc_pci.c
new file mode 100644
index 0000000..042320a
--- /dev/null
+++ b/drivers/usb/chipidea/ci_hdrc_pci.c

@@ -0,0 +1,149 @@
+/*
+ * ci_hdrc_pci.c - MIPS USB IP core family device controller
+ *
+ * Copyright (C) 2008 Chipidea - MIPS Technologies, Inc. All rights reserved.
+ *
+ * Author: David Lopo
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/platform_device.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/interrupt.h>
+#include <linux/usb/gadget.h>
+#include <linux/usb/chipidea.h>
+
+/* driver name */
+#define UDC_DRIVER_NAME   "ci_hdrc_pci"
+
+/******************************************************************************
+ * PCI block
+ *****************************************************************************/
+/*
+ * Default platform data: used by the two 0x153F-vendor entries of the PCI
+ * ID table. The capability registers are located at DEF_CAPOFFSET.
+ */
+static struct ci_hdrc_platform_data pci_platdata = {
+	.name		= UDC_DRIVER_NAME,
+	.capoffset	= DEF_CAPOFFSET,
+};
+
+/* Platform data for the Intel 0x0811 entry: capability offset is 0. */
+static struct ci_hdrc_platform_data langwell_pci_platdata = {
+	.name		= UDC_DRIVER_NAME,
+	.capoffset	= 0,
+};
+
+/*
+ * Platform data for the Intel 0x0829 entry: capability offset is 0 and a
+ * power budget of 200 is requested.
+ * NOTE(review): the unit of power_budget is not visible here (presumably
+ * mA) - confirm against the core driver.
+ */
+static struct ci_hdrc_platform_data penwell_pci_platdata = {
+	.name		= UDC_DRIVER_NAME,
+	.capoffset	= 0,
+	.power_budget	= 200,
+};
+
+/**
+ * ci_hdrc_pci_probe: PCI probe
+ * @pdev: USB device controller being probed
+ * @id:   PCI device table entry that matched; its driver_data carries the
+ *        ci_hdrc_platform_data to use for this controller
+ *
+ * Enables the PCI device (managed), turns on bus mastering, describes BAR 0
+ * and the device interrupt as platform resources and registers the chipidea
+ * core device through ci_hdrc_add_device(). The resulting platform device is
+ * stored as PCI driver data for ci_hdrc_pci_remove().
+ *
+ * This function returns an error code: -ENODEV when the table entry has no
+ * platform data or the device has no IRQ, the pcim_enable_device() error, or
+ * the error encoded by ci_hdrc_add_device(); 0 on success.
+ */
+static int ci_hdrc_pci_probe(struct pci_dev *pdev,
+				       const struct pci_device_id *id)
+{
+	struct ci_hdrc_platform_data *platdata = (void *)id->driver_data;
+	struct platform_device *plat_ci;
+	struct resource res[3];
+	int retval = 0, nres = 2;
+
+	if (!platdata) {
+		dev_err(&pdev->dev, "device doesn't provide driver data\n");
+		return -ENODEV;
+	}
+
+	retval = pcim_enable_device(pdev);
+	if (retval)
+		return retval;
+
+	if (!pdev->irq) {
+		/* terminate the message so it is not merged with the next log line */
+		dev_err(&pdev->dev, "No IRQ, check BIOS/PCI setup!\n");
+		return -ENODEV;
+	}
+
+	pci_set_master(pdev);
+	pci_try_set_mwi(pdev);
+
+	/* only the first nres entries (MMIO window and IRQ) are handed over */
+	memset(res, 0, sizeof(res));
+	res[0].start	= pci_resource_start(pdev, 0);
+	res[0].end	= pci_resource_end(pdev, 0);
+	res[0].flags	= IORESOURCE_MEM;
+	res[1].start	= pdev->irq;
+	res[1].flags	= IORESOURCE_IRQ;
+
+	plat_ci = ci_hdrc_add_device(&pdev->dev, res, nres, platdata);
+	if (IS_ERR(plat_ci)) {
+		dev_err(&pdev->dev, "ci_hdrc_add_device failed!\n");
+		return PTR_ERR(plat_ci);
+	}
+
+	pci_set_drvdata(pdev, plat_ci);
+
+	return 0;
+}
+
+/**
+ * ci_hdrc_pci_remove: PCI remove
+ * @pdev: USB Device Controller being removed
+ *
+ * Reverses the effect of ci_hdrc_pci_probe() by unregistering the chipidea
+ * core device that probe stored as PCI driver data.
+ */
+static void ci_hdrc_pci_remove(struct pci_dev *pdev)
+{
+	ci_hdrc_remove_device(pci_get_drvdata(pdev));
+}
+
+/**
+ * PCI device table
+ * PCI device structure
+ *
+ * Each entry pairs a vendor/device ID with the ci_hdrc_platform_data that
+ * ci_hdrc_pci_probe() reads back from id->driver_data. The all-zero entry
+ * terminates the table.
+ *
+ * Check "pci.h" for details
+ */
+static DEFINE_PCI_DEVICE_TABLE(ci_hdrc_pci_id_table) = {
+	{
+		PCI_DEVICE(0x153F, 0x1004),
+		.driver_data = (kernel_ulong_t)&pci_platdata,
+	},
+	{
+		PCI_DEVICE(0x153F, 0x1006),
+		.driver_data = (kernel_ulong_t)&pci_platdata,
+	},
+	{
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x0811),
+		.driver_data = (kernel_ulong_t)&langwell_pci_platdata,
+	},
+	{
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x0829),
+		.driver_data = (kernel_ulong_t)&penwell_pci_platdata,
+	},
+	{ 0, 0, 0, 0, 0, 0, 0 /* end: all zeroes */ }
+};
+MODULE_DEVICE_TABLE(pci, ci_hdrc_pci_id_table);
+
+/*
+ * PCI driver description: matches the devices in ci_hdrc_pci_id_table and
+ * is registered by module_pci_driver() below.
+ */
+static struct pci_driver ci_hdrc_pci_driver = {
+	.name         =	UDC_DRIVER_NAME,
+	.id_table     =	ci_hdrc_pci_id_table,
+	.probe        =	ci_hdrc_pci_probe,
+	.remove       =	ci_hdrc_pci_remove,
+};
+
+module_pci_driver(ci_hdrc_pci_driver);
+
+MODULE_AUTHOR("MIPS - David Lopo <dlopo@chipidea.mips.com>");
+MODULE_DESCRIPTION("MIPS CI13XXX USB Peripheral Controller");
+MODULE_LICENSE("GPL");
+MODULE_VERSION("June 2008");
+MODULE_ALIAS("platform:ci13xxx_pci");

diff --git a/drivers/usb/chipidea/core.c b/drivers/usb/chipidea/core.c
index 475c9c1..a5df24c 100644
--- a/drivers/usb/chipidea/core.c
+++ b/drivers/usb/chipidea/core.c

@@ -43,8 +43,7 @@
  *
  * TODO List
  * - OTG
- * - Isochronous & Interrupt Traffic
- * - Handle requests which spawns into several TDs
+ * - Interrupt Traffic
  * - GET_STATUS(device) - always reports 0
  * - Gadget API (majority of optional features)
  * - Suspend & Remote Wakeup
@@ -64,6 +63,8 @@
 #include <linux/usb/gadget.h>
 #include <linux/usb/otg.h>
 #include <linux/usb/chipidea.h>
+#include <linux/usb/of.h>
+#include <linux/phy.h>
 
 #include "ci.h"
 #include "udc.h"
@@ -116,7 +117,7 @@
 	[OP_ENDPTCTRL]		= 0x0ECUL,
 };
 
-static int hw_alloc_regmap(struct ci13xxx *ci, bool is_lpm)
+static int hw_alloc_regmap(struct ci_hdrc *ci, bool is_lpm)
 {
 	int i;
 
@@ -148,7 +149,7 @@
  *
  * This function returns an error code
  */
-int hw_port_test_set(struct ci13xxx *ci, u8 mode)
+int hw_port_test_set(struct ci_hdrc *ci, u8 mode)
 {
 	const u8 TEST_MODE_MAX = 7;
 
@@ -164,12 +165,12 @@
  *
  * This function returns port test mode value
  */
-u8 hw_port_test_get(struct ci13xxx *ci)
+u8 hw_port_test_get(struct ci_hdrc *ci)
 {
 	return hw_read(ci, OP_PORTSC, PORTSC_PTC) >> __ffs(PORTSC_PTC);
 }
 
-static int hw_device_init(struct ci13xxx *ci, void __iomem *base)
+static int hw_device_init(struct ci_hdrc *ci, void __iomem *base)
 {
 	u32 reg;
 
@@ -208,13 +209,52 @@
 	return 0;
 }
 
+/**
+ * hw_phymode_configure: programs the PHY interface type into the controller
+ * @ci: the controller
+ *
+ * Translates ci->platdata->phy_mode into the PTS/PTW field values and writes
+ * them to DEVLC (when the register bank is LPM) or to PORTSC (otherwise).
+ * For the serial interface the STS bit of the same register is set as well.
+ * Nothing is written when phy_mode is not one of the handled modes.
+ */
+static void hw_phymode_configure(struct ci_hdrc *ci)
+{
+	/* sts must start at 0: only the SERIAL case below assigns it */
+	u32 portsc, lpm, sts = 0;
+
+	switch (ci->platdata->phy_mode) {
+	case USBPHY_INTERFACE_MODE_UTMI:
+		portsc = PORTSC_PTS(PTS_UTMI);
+		lpm = DEVLC_PTS(PTS_UTMI);
+		break;
+	case USBPHY_INTERFACE_MODE_UTMIW:
+		portsc = PORTSC_PTS(PTS_UTMI) | PORTSC_PTW;
+		lpm = DEVLC_PTS(PTS_UTMI) | DEVLC_PTW;
+		break;
+	case USBPHY_INTERFACE_MODE_ULPI:
+		portsc = PORTSC_PTS(PTS_ULPI);
+		lpm = DEVLC_PTS(PTS_ULPI);
+		break;
+	case USBPHY_INTERFACE_MODE_SERIAL:
+		portsc = PORTSC_PTS(PTS_SERIAL);
+		lpm = DEVLC_PTS(PTS_SERIAL);
+		sts = 1;
+		break;
+	case USBPHY_INTERFACE_MODE_HSIC:
+		portsc = PORTSC_PTS(PTS_HSIC);
+		lpm = DEVLC_PTS(PTS_HSIC);
+		break;
+	default:
+		return;
+	}
+
+	/*
+	 * hw_write() takes its data already positioned inside the mask (see
+	 * the other callers in this file), so the STS bit has to be passed as
+	 * the mask value itself rather than as the flag value 1.
+	 */
+	if (ci->hw_bank.lpm) {
+		hw_write(ci, OP_DEVLC, DEVLC_PTS(7) | DEVLC_PTW, lpm);
+		if (sts)
+			hw_write(ci, OP_DEVLC, DEVLC_STS, DEVLC_STS);
+	} else {
+		hw_write(ci, OP_PORTSC, PORTSC_PTS(7) | PORTSC_PTW, portsc);
+		if (sts)
+			hw_write(ci, OP_PORTSC, PORTSC_STS, PORTSC_STS);
+	}
+}
+
 /**
  * hw_device_reset: resets chip (execute without interruption)
  * @ci: the controller
   *
  * This function returns an error code
  */
-int hw_device_reset(struct ci13xxx *ci, u32 mode)
+int hw_device_reset(struct ci_hdrc *ci, u32 mode)
 {
 	/* should flush & stop before reset */
 	hw_write(ci, OP_ENDPTFLUSH, ~0, ~0);
@@ -224,12 +264,13 @@
 	while (hw_read(ci, OP_USBCMD, USBCMD_RST))
 		udelay(10);		/* not RTOS friendly */
 
+	hw_phymode_configure(ci);
 
 	if (ci->platdata->notify_event)
 		ci->platdata->notify_event(ci,
-			CI13XXX_CONTROLLER_RESET_EVENT);
+			CI_HDRC_CONTROLLER_RESET_EVENT);
 
-	if (ci->platdata->flags & CI13XXX_DISABLE_STREAMING)
+	if (ci->platdata->flags & CI_HDRC_DISABLE_STREAMING)
 		hw_write(ci, OP_USBMODE, USBMODE_CI_SDIS, USBMODE_CI_SDIS);
 
 	/* USBMODE should be configured step by step */
@@ -251,7 +292,7 @@
  * ci_otg_role - pick role based on ID pin state
  * @ci: the controller
  */
-static enum ci_role ci_otg_role(struct ci13xxx *ci)
+static enum ci_role ci_otg_role(struct ci_hdrc *ci)
 {
 	u32 sts = hw_read(ci, OP_OTGSC, ~0);
 	enum ci_role role = sts & OTGSC_ID
@@ -267,7 +308,7 @@
  */
 static void ci_role_work(struct work_struct *work)
 {
-	struct ci13xxx *ci = container_of(work, struct ci13xxx, work);
+	struct ci_hdrc *ci = container_of(work, struct ci_hdrc, work);
 	enum ci_role role = ci_otg_role(ci);
 
 	if (role != ci->role) {
@@ -283,7 +324,7 @@
 
 static irqreturn_t ci_irq(int irq, void *data)
 {
-	struct ci13xxx *ci = data;
+	struct ci_hdrc *ci = data;
 	irqreturn_t ret = IRQ_NONE;
 	u32 otgsc = 0;
 
@@ -305,9 +346,9 @@
 
 static DEFINE_IDA(ci_ida);
 
-struct platform_device *ci13xxx_add_device(struct device *dev,
+struct platform_device *ci_hdrc_add_device(struct device *dev,
 			struct resource *res, int nres,
-			struct ci13xxx_platform_data *platdata)
+			struct ci_hdrc_platform_data *platdata)
 {
 	struct platform_device *pdev;
 	int id, ret;
@@ -347,29 +388,33 @@
 	ida_simple_remove(&ci_ida, id);
 	return ERR_PTR(ret);
 }
-EXPORT_SYMBOL_GPL(ci13xxx_add_device);
+EXPORT_SYMBOL_GPL(ci_hdrc_add_device);
 
-void ci13xxx_remove_device(struct platform_device *pdev)
+void ci_hdrc_remove_device(struct platform_device *pdev)
 {
 	int id = pdev->id;
 	platform_device_unregister(pdev);
 	ida_simple_remove(&ci_ida, id);
 }
-EXPORT_SYMBOL_GPL(ci13xxx_remove_device);
+EXPORT_SYMBOL_GPL(ci_hdrc_remove_device);
 
 static int ci_hdrc_probe(struct platform_device *pdev)
 {
 	struct device	*dev = &pdev->dev;
-	struct ci13xxx	*ci;
+	struct ci_hdrc	*ci;
 	struct resource	*res;
 	void __iomem	*base;
 	int		ret;
+	enum usb_dr_mode dr_mode;
 
 	if (!dev->platform_data) {
 		dev_err(dev, "platform data missing\n");
 		return -ENODEV;
 	}
 
+	if (!dev->of_node && dev->parent)
+		dev->of_node = dev->parent->of_node;
+
 	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
 	base = devm_ioremap_resource(dev, res);
 	if (IS_ERR(base))
@@ -409,14 +454,28 @@
 		return -ENODEV;
 	}
 
-	/* initialize role(s) before the interrupt is requested */
-	ret = ci_hdrc_host_init(ci);
-	if (ret)
-		dev_info(dev, "doesn't support host\n");
+	if (!ci->platdata->phy_mode)
+		ci->platdata->phy_mode = of_usb_get_phy_mode(dev->of_node);
 
-	ret = ci_hdrc_gadget_init(ci);
-	if (ret)
-		dev_info(dev, "doesn't support gadget\n");
+	if (!ci->platdata->dr_mode)
+		ci->platdata->dr_mode = of_usb_get_dr_mode(dev->of_node);
+
+	if (ci->platdata->dr_mode == USB_DR_MODE_UNKNOWN)
+		ci->platdata->dr_mode = USB_DR_MODE_OTG;
+
+	dr_mode = ci->platdata->dr_mode;
+	/* initialize role(s) before the interrupt is requested */
+	if (dr_mode == USB_DR_MODE_OTG || dr_mode == USB_DR_MODE_HOST) {
+		ret = ci_hdrc_host_init(ci);
+		if (ret)
+			dev_info(dev, "doesn't support host\n");
+	}
+
+	if (dr_mode == USB_DR_MODE_OTG || dr_mode == USB_DR_MODE_PERIPHERAL) {
+		ret = ci_hdrc_gadget_init(ci);
+		if (ret)
+			dev_info(dev, "doesn't support gadget\n");
+	}
 
 	if (!ci->roles[CI_ROLE_HOST] && !ci->roles[CI_ROLE_GADGET]) {
 		dev_err(dev, "no supported roles\n");
@@ -467,7 +526,7 @@
 
 static int ci_hdrc_remove(struct platform_device *pdev)
 {
-	struct ci13xxx *ci = platform_get_drvdata(pdev);
+	struct ci_hdrc *ci = platform_get_drvdata(pdev);
 
 	dbg_remove_files(ci);
 	flush_workqueue(ci->wq);

diff --git a/drivers/usb/chipidea/debug.c b/drivers/usb/chipidea/debug.c
index 36a7063..96d899a 100644
--- a/drivers/usb/chipidea/debug.c
+++ b/drivers/usb/chipidea/debug.c

@@ -18,7 +18,7 @@
  */
 static int ci_device_show(struct seq_file *s, void *data)
 {
-	struct ci13xxx *ci = s->private;
+	struct ci_hdrc *ci = s->private;
 	struct usb_gadget *gadget = &ci->gadget;
 
 	seq_printf(s, "speed             = %d\n", gadget->speed);
@@ -58,7 +58,7 @@
  */
 static int ci_port_test_show(struct seq_file *s, void *data)
 {
-	struct ci13xxx *ci = s->private;
+	struct ci_hdrc *ci = s->private;
 	unsigned long flags;
 	unsigned mode;
 
@@ -78,7 +78,7 @@
 				  size_t count, loff_t *ppos)
 {
 	struct seq_file *s = file->private_data;
-	struct ci13xxx *ci = s->private;
+	struct ci_hdrc *ci = s->private;
 	unsigned long flags;
 	unsigned mode;
 	char buf[32];
@@ -115,7 +115,7 @@
  */
 static int ci_qheads_show(struct seq_file *s, void *data)
 {
-	struct ci13xxx *ci = s->private;
+	struct ci_hdrc *ci = s->private;
 	unsigned long flags;
 	unsigned i, j;
 
@@ -126,15 +126,15 @@
 
 	spin_lock_irqsave(&ci->lock, flags);
 	for (i = 0; i < ci->hw_ep_max/2; i++) {
-		struct ci13xxx_ep *mEpRx = &ci->ci13xxx_ep[i];
-		struct ci13xxx_ep *mEpTx =
-			&ci->ci13xxx_ep[i + ci->hw_ep_max/2];
+		struct ci_hw_ep *hweprx = &ci->ci_hw_ep[i];
+		struct ci_hw_ep *hweptx =
+			&ci->ci_hw_ep[i + ci->hw_ep_max/2];
 		seq_printf(s, "EP=%02i: RX=%08X TX=%08X\n",
-			   i, (u32)mEpRx->qh.dma, (u32)mEpTx->qh.dma);
-		for (j = 0; j < (sizeof(struct ci13xxx_qh)/sizeof(u32)); j++)
+			   i, (u32)hweprx->qh.dma, (u32)hweptx->qh.dma);
+		for (j = 0; j < (sizeof(struct ci_hw_qh)/sizeof(u32)); j++)
 			seq_printf(s, " %04X:    %08X    %08X\n", j,
-				   *((u32 *)mEpRx->qh.ptr + j),
-				   *((u32 *)mEpTx->qh.ptr + j));
+				   *((u32 *)hweprx->qh.ptr + j),
+				   *((u32 *)hweptx->qh.ptr + j));
 	}
 	spin_unlock_irqrestore(&ci->lock, flags);
 
@@ -158,11 +158,12 @@
  */
 static int ci_requests_show(struct seq_file *s, void *data)
 {
-	struct ci13xxx *ci = s->private;
+	struct ci_hdrc *ci = s->private;
 	unsigned long flags;
 	struct list_head   *ptr = NULL;
-	struct ci13xxx_req *req = NULL;
-	unsigned i, j, qsize = sizeof(struct ci13xxx_td)/sizeof(u32);
+	struct ci_hw_req *req = NULL;
+	struct td_node *node, *tmpnode;
+	unsigned i, j, qsize = sizeof(struct ci_hw_td)/sizeof(u32);
 
 	if (ci->role != CI_ROLE_GADGET) {
 		seq_printf(s, "not in gadget mode\n");
@@ -171,16 +172,20 @@
 
 	spin_lock_irqsave(&ci->lock, flags);
 	for (i = 0; i < ci->hw_ep_max; i++)
-		list_for_each(ptr, &ci->ci13xxx_ep[i].qh.queue) {
-			req = list_entry(ptr, struct ci13xxx_req, queue);
+		list_for_each(ptr, &ci->ci_hw_ep[i].qh.queue) {
+			req = list_entry(ptr, struct ci_hw_req, queue);
 
-			seq_printf(s, "EP=%02i: TD=%08X %s\n",
-				   i % (ci->hw_ep_max / 2), (u32)req->dma,
-				   ((i < ci->hw_ep_max/2) ? "RX" : "TX"));
+			list_for_each_entry_safe(node, tmpnode, &req->tds, td) {
+				seq_printf(s, "EP=%02i: TD=%08X %s\n",
+					   i % (ci->hw_ep_max / 2),
+					   (u32)node->dma,
+					   ((i < ci->hw_ep_max/2) ?
+					   "RX" : "TX"));
 
-			for (j = 0; j < qsize; j++)
-				seq_printf(s, " %04X:    %08X\n", j,
-					   *((u32 *)req->ptr + j));
+				for (j = 0; j < qsize; j++)
+					seq_printf(s, " %04X:    %08X\n", j,
+						   *((u32 *)node->ptr + j));
+			}
 		}
 	spin_unlock_irqrestore(&ci->lock, flags);
 
@@ -201,7 +206,7 @@
 
 static int ci_role_show(struct seq_file *s, void *data)
 {
-	struct ci13xxx *ci = s->private;
+	struct ci_hdrc *ci = s->private;
 
 	seq_printf(s, "%s\n", ci_role(ci)->name);
 
@@ -212,7 +217,7 @@
 			     size_t count, loff_t *ppos)
 {
 	struct seq_file *s = file->private_data;
-	struct ci13xxx *ci = s->private;
+	struct ci_hdrc *ci = s->private;
 	enum ci_role role;
 	char buf[8];
 	int ret;
@@ -254,7 +259,7 @@
  *
  * This function returns an error code
  */
-int dbg_create_files(struct ci13xxx *ci)
+int dbg_create_files(struct ci_hdrc *ci)
 {
 	struct dentry *dent;
 
@@ -295,7 +300,7 @@
  * dbg_remove_files: destroys the attribute interface
  * @ci: device
  */
-void dbg_remove_files(struct ci13xxx *ci)
+void dbg_remove_files(struct ci_hdrc *ci)
 {
 	debugfs_remove_recursive(ci->debugfs);
 }

diff --git a/drivers/usb/chipidea/debug.h b/drivers/usb/chipidea/debug.h
index 7ca6ca0..e16478c 100644
--- a/drivers/usb/chipidea/debug.h
+++ b/drivers/usb/chipidea/debug.h

@@ -14,15 +14,15 @@
 #define __DRIVERS_USB_CHIPIDEA_DEBUG_H
 
 #ifdef CONFIG_USB_CHIPIDEA_DEBUG
-int dbg_create_files(struct ci13xxx *ci);
-void dbg_remove_files(struct ci13xxx *ci);
+int dbg_create_files(struct ci_hdrc *ci);
+void dbg_remove_files(struct ci_hdrc *ci);
 #else
-static inline int dbg_create_files(struct ci13xxx *ci)
+static inline int dbg_create_files(struct ci_hdrc *ci)
 {
 	return 0;
 }
 
-static inline void dbg_remove_files(struct ci13xxx *ci)
+static inline void dbg_remove_files(struct ci_hdrc *ci)
 {
 }
 #endif

diff --git a/drivers/usb/chipidea/host.c b/drivers/usb/chipidea/host.c
index 8e9d312..40d0fda 100644
--- a/drivers/usb/chipidea/host.c
+++ b/drivers/usb/chipidea/host.c

@@ -33,12 +33,12 @@
 
 static struct hc_driver __read_mostly ci_ehci_hc_driver;
 
-static irqreturn_t host_irq(struct ci13xxx *ci)
+static irqreturn_t host_irq(struct ci_hdrc *ci)
 {
 	return usb_hcd_irq(ci->irq, ci->hcd);
 }
 
-static int host_start(struct ci13xxx *ci)
+static int host_start(struct ci_hdrc *ci)
 {
 	struct usb_hcd *hcd;
 	struct ehci_hcd *ehci;
@@ -70,13 +70,13 @@
 	else
 		ci->hcd = hcd;
 
-	if (ci->platdata->flags & CI13XXX_DISABLE_STREAMING)
+	if (ci->platdata->flags & CI_HDRC_DISABLE_STREAMING)
 		hw_write(ci, OP_USBMODE, USBMODE_CI_SDIS, USBMODE_CI_SDIS);
 
 	return ret;
 }
 
-static void host_stop(struct ci13xxx *ci)
+static void host_stop(struct ci_hdrc *ci)
 {
 	struct usb_hcd *hcd = ci->hcd;
 
@@ -84,7 +84,7 @@
 	usb_put_hcd(hcd);
 }
 
-int ci_hdrc_host_init(struct ci13xxx *ci)
+int ci_hdrc_host_init(struct ci_hdrc *ci)
 {
 	struct ci_role_driver *rdrv;
 

diff --git a/drivers/usb/chipidea/host.h b/drivers/usb/chipidea/host.h
index 761fb1f..058875c 100644
--- a/drivers/usb/chipidea/host.h
+++ b/drivers/usb/chipidea/host.h

@@ -3,11 +3,11 @@
 
 #ifdef CONFIG_USB_CHIPIDEA_HOST
 
-int ci_hdrc_host_init(struct ci13xxx *ci);
+int ci_hdrc_host_init(struct ci_hdrc *ci);
 
 #else
 
-static inline int ci_hdrc_host_init(struct ci13xxx *ci)
+static inline int ci_hdrc_host_init(struct ci_hdrc *ci)
 {
 	return -ENXIO;
 }

diff --git a/drivers/usb/chipidea/udc.c b/drivers/usb/chipidea/udc.c
index b501346..e475fcd 100644
--- a/drivers/usb/chipidea/udc.c
+++ b/drivers/usb/chipidea/udc.c

@@ -61,7 +61,7 @@
 	return num + (dir ? 16 : 0);
 }
 
-static inline int ep_to_bit(struct ci13xxx *ci, int n)
+static inline int ep_to_bit(struct ci_hdrc *ci, int n)
 {
 	int fill = 16 - ci->hw_ep_max / 2;
 
@@ -77,7 +77,7 @@
  *
  * This function returns an error code
  */
-static int hw_device_state(struct ci13xxx *ci, u32 dma)
+static int hw_device_state(struct ci_hdrc *ci, u32 dma)
 {
 	if (dma) {
 		hw_write(ci, OP_ENDPTLISTADDR, ~0, dma);
@@ -97,7 +97,7 @@
  *
  * This function returns an error code
  */
-static int hw_ep_flush(struct ci13xxx *ci, int num, int dir)
+static int hw_ep_flush(struct ci_hdrc *ci, int num, int dir)
 {
 	int n = hw_ep_bit(num, dir);
 
@@ -118,7 +118,7 @@
  *
  * This function returns an error code
  */
-static int hw_ep_disable(struct ci13xxx *ci, int num, int dir)
+static int hw_ep_disable(struct ci_hdrc *ci, int num, int dir)
 {
 	hw_ep_flush(ci, num, dir);
 	hw_write(ci, OP_ENDPTCTRL + num,
@@ -134,7 +134,7 @@
  *
  * This function returns an error code
  */
-static int hw_ep_enable(struct ci13xxx *ci, int num, int dir, int type)
+static int hw_ep_enable(struct ci_hdrc *ci, int num, int dir, int type)
 {
 	u32 mask, data;
 
@@ -168,7 +168,7 @@
  *
  * This function returns 1 if endpoint halted
  */
-static int hw_ep_get_halt(struct ci13xxx *ci, int num, int dir)
+static int hw_ep_get_halt(struct ci_hdrc *ci, int num, int dir)
 {
 	u32 mask = dir ? ENDPTCTRL_TXS : ENDPTCTRL_RXS;
 
@@ -182,7 +182,7 @@
  *
  * This function returns setup status
  */
-static int hw_test_and_clear_setup_status(struct ci13xxx *ci, int n)
+static int hw_test_and_clear_setup_status(struct ci_hdrc *ci, int n)
 {
 	n = ep_to_bit(ci, n);
 	return hw_test_and_clear(ci, OP_ENDPTSETUPSTAT, BIT(n));
@@ -196,7 +196,7 @@
  *
  * This function returns an error code
  */
-static int hw_ep_prime(struct ci13xxx *ci, int num, int dir, int is_ctrl)
+static int hw_ep_prime(struct ci_hdrc *ci, int num, int dir, int is_ctrl)
 {
 	int n = hw_ep_bit(num, dir);
 
@@ -223,13 +223,13 @@
  *
  * This function returns an error code
  */
-static int hw_ep_set_halt(struct ci13xxx *ci, int num, int dir, int value)
+static int hw_ep_set_halt(struct ci_hdrc *ci, int num, int dir, int value)
 {
 	if (value != 0 && value != 1)
 		return -EINVAL;
 
 	do {
-		enum ci13xxx_regs reg = OP_ENDPTCTRL + num;
+		enum ci_hw_regs reg = OP_ENDPTCTRL + num;
 		u32 mask_xs = dir ? ENDPTCTRL_TXS : ENDPTCTRL_RXS;
 		u32 mask_xr = dir ? ENDPTCTRL_TXR : ENDPTCTRL_RXR;
 
@@ -246,7 +246,7 @@
  *
  * This function returns true if high speed port
  */
-static int hw_port_is_high_speed(struct ci13xxx *ci)
+static int hw_port_is_high_speed(struct ci_hdrc *ci)
 {
 	return ci->hw_bank.lpm ? hw_read(ci, OP_DEVLC, DEVLC_PSPD) :
 		hw_read(ci, OP_PORTSC, PORTSC_HSP);
@@ -257,7 +257,7 @@
  *
  * This function returns register data
  */
-static u32 hw_read_intr_enable(struct ci13xxx *ci)
+static u32 hw_read_intr_enable(struct ci_hdrc *ci)
 {
 	return hw_read(ci, OP_USBINTR, ~0);
 }
@@ -267,7 +267,7 @@
  *
  * This function returns register data
  */
-static u32 hw_read_intr_status(struct ci13xxx *ci)
+static u32 hw_read_intr_status(struct ci_hdrc *ci)
 {
 	return hw_read(ci, OP_USBSTS, ~0);
 }
@@ -279,7 +279,7 @@
  *
  * This function returns complete status
  */
-static int hw_test_and_clear_complete(struct ci13xxx *ci, int n)
+static int hw_test_and_clear_complete(struct ci_hdrc *ci, int n)
 {
 	n = ep_to_bit(ci, n);
 	return hw_test_and_clear(ci, OP_ENDPTCOMPLETE, BIT(n));
@@ -291,7 +291,7 @@
  *
  * This function returns active interrutps
  */
-static u32 hw_test_and_clear_intr_active(struct ci13xxx *ci)
+static u32 hw_test_and_clear_intr_active(struct ci_hdrc *ci)
 {
 	u32 reg = hw_read_intr_status(ci) & hw_read_intr_enable(ci);
 
@@ -305,7 +305,7 @@
  *
  * This function returns guard value
  */
-static int hw_test_and_clear_setup_guard(struct ci13xxx *ci)
+static int hw_test_and_clear_setup_guard(struct ci_hdrc *ci)
 {
 	return hw_test_and_write(ci, OP_USBCMD, USBCMD_SUTW, 0);
 }
@@ -316,7 +316,7 @@
  *
  * This function returns guard value
  */
-static int hw_test_and_set_setup_guard(struct ci13xxx *ci)
+static int hw_test_and_set_setup_guard(struct ci_hdrc *ci)
 {
 	return hw_test_and_write(ci, OP_USBCMD, USBCMD_SUTW, USBCMD_SUTW);
 }
@@ -328,7 +328,7 @@
  * This function explicitly sets the address, without the "USBADRA" (advance)
  * feature, which is not supported by older versions of the controller.
  */
-static void hw_usb_set_address(struct ci13xxx *ci, u8 value)
+static void hw_usb_set_address(struct ci_hdrc *ci, u8 value)
 {
 	hw_write(ci, OP_DEVICEADDR, DEVICEADDR_USBADR,
 		 value << __ffs(DEVICEADDR_USBADR));
@@ -340,7 +340,7 @@
  *
  * This function returns an error code
  */
-static int hw_usb_reset(struct ci13xxx *ci)
+static int hw_usb_reset(struct ci_hdrc *ci)
 {
 	hw_usb_set_address(ci, 0);
 
@@ -368,11 +368,60 @@
 /******************************************************************************
  * UTIL block
  *****************************************************************************/
+
+/*
+ * add_td_to_list: append one transfer descriptor to a request
+ * @hwep:   endpoint whose td_pool supplies the hardware descriptor
+ * @hwreq:  request being built; hwreq->req.actual is used as the running
+ *          byte offset into the request buffer
+ * @length: number of bytes this descriptor covers (0 yields a descriptor
+ *          with no buffer pointers)
+ *
+ * Allocates a td_node plus a hardware TD, marks the TD active for @length
+ * bytes starting at req.dma + req.actual, chains it behind the request's
+ * current last TD and adds it to the tail of hwreq->tds. The new TD's own
+ * next pointer is left zeroed by the memset; _hardware_enqueue() writes
+ * TD_TERMINATE into the last TD once the chain is complete.
+ *
+ * Returns 0 on success or -ENOMEM if either allocation fails.
+ */
+static int add_td_to_list(struct ci_hw_ep *hwep, struct ci_hw_req *hwreq,
+			  unsigned length)
+{
+	int i;
+	u32 temp;
+	struct td_node *lastnode, *node = kzalloc(sizeof(struct td_node),
+						  GFP_ATOMIC);
+
+	if (node == NULL)
+		return -ENOMEM;
+
+	node->ptr = dma_pool_alloc(hwep->td_pool, GFP_ATOMIC,
+				   &node->dma);
+	if (node->ptr == NULL) {
+		/* do not leak the tracking node when the pool is exhausted */
+		kfree(node);
+		return -ENOMEM;
+	}
+
+	/* token: byte count in the TD_TOTAL_BYTES field, plus the active bit */
+	memset(node->ptr, 0, sizeof(struct ci_hw_td));
+	node->ptr->token = cpu_to_le32(length << __ffs(TD_TOTAL_BYTES));
+	node->ptr->token &= cpu_to_le32(TD_TOTAL_BYTES);
+	node->ptr->token |= cpu_to_le32(TD_STATUS_ACTIVE);
+
+	/* bus address at which this descriptor's data starts */
+	temp = (u32) (hwreq->req.dma + hwreq->req.actual);
+	if (length) {
+		/* page[0] keeps the exact start address ... */
+		node->ptr->page[0] = cpu_to_le32(temp);
+		/* ... the following entries have the TD_RESERVED_MASK bits cleared */
+		for (i = 1; i < TD_PAGE_COUNT; i++) {
+			u32 page = temp + i * CI_HDRC_PAGE_SIZE;
+			page &= ~TD_RESERVED_MASK;
+			node->ptr->page[i] = cpu_to_le32(page);
+		}
+	}
+
+	/* advance the offset so the next descriptor continues after this one */
+	hwreq->req.actual += length;
+
+	if (!list_empty(&hwreq->tds)) {
+		/* get the last entry */
+		lastnode = list_entry(hwreq->tds.prev,
+				struct td_node, td);
+		/* hardware link: previous TD now points at the new one */
+		lastnode->ptr->next = cpu_to_le32(node->dma);
+	}
+
+	/* software link: keep the node on the request's TD list */
+	INIT_LIST_HEAD(&node->td);
+	list_add_tail(&node->td, &hwreq->tds);
+
+	return 0;
+}
+
 /**
  * _usb_addr: calculates endpoint address from direction & number
  * @ep:  endpoint
  */
-static inline u8 _usb_addr(struct ci13xxx_ep *ep)
+static inline u8 _usb_addr(struct ci_hw_ep *ep)
 {
 	return ((ep->dir == TX) ? USB_ENDPOINT_DIR_MASK : 0) | ep->num;
 }
@@ -380,75 +429,73 @@
 /**
  * _hardware_queue: configures a request at hardware level
  * @gadget: gadget
- * @mEp:    endpoint
+ * @hwep:   endpoint
  *
  * This function returns an error code
  */
-static int _hardware_enqueue(struct ci13xxx_ep *mEp, struct ci13xxx_req *mReq)
+static int _hardware_enqueue(struct ci_hw_ep *hwep, struct ci_hw_req *hwreq)
 {
-	struct ci13xxx *ci = mEp->ci;
-	unsigned i;
+	struct ci_hdrc *ci = hwep->ci;
 	int ret = 0;
-	unsigned length = mReq->req.length;
+	unsigned rest = hwreq->req.length;
+	int pages = TD_PAGE_COUNT;
+	struct td_node *firstnode, *lastnode;
 
 	/* don't queue twice */
-	if (mReq->req.status == -EALREADY)
+	if (hwreq->req.status == -EALREADY)
 		return -EALREADY;
 
-	mReq->req.status = -EALREADY;
+	hwreq->req.status = -EALREADY;
 
-	if (mReq->req.zero && length && (length % mEp->ep.maxpacket == 0)) {
-		mReq->zptr = dma_pool_alloc(mEp->td_pool, GFP_ATOMIC,
-					   &mReq->zdma);
-		if (mReq->zptr == NULL)
-			return -ENOMEM;
-
-		memset(mReq->zptr, 0, sizeof(*mReq->zptr));
-		mReq->zptr->next    = cpu_to_le32(TD_TERMINATE);
-		mReq->zptr->token   = cpu_to_le32(TD_STATUS_ACTIVE);
-		if (!mReq->req.no_interrupt)
-			mReq->zptr->token   |= cpu_to_le32(TD_IOC);
-	}
-	ret = usb_gadget_map_request(&ci->gadget, &mReq->req, mEp->dir);
+	ret = usb_gadget_map_request(&ci->gadget, &hwreq->req, hwep->dir);
 	if (ret)
 		return ret;
 
 	/*
-	 * TD configuration
-	 * TODO - handle requests which spawns into several TDs
+	 * The first buffer could be not page aligned.
+	 * In that case we have to span into one extra td.
 	 */
-	memset(mReq->ptr, 0, sizeof(*mReq->ptr));
-	mReq->ptr->token    = cpu_to_le32(length << __ffs(TD_TOTAL_BYTES));
-	mReq->ptr->token   &= cpu_to_le32(TD_TOTAL_BYTES);
-	mReq->ptr->token   |= cpu_to_le32(TD_STATUS_ACTIVE);
-	if (mReq->zptr) {
-		mReq->ptr->next    = cpu_to_le32(mReq->zdma);
-	} else {
-		mReq->ptr->next    = cpu_to_le32(TD_TERMINATE);
-		if (!mReq->req.no_interrupt)
-			mReq->ptr->token  |= cpu_to_le32(TD_IOC);
-	}
-	mReq->ptr->page[0]  = cpu_to_le32(mReq->req.dma);
-	for (i = 1; i < TD_PAGE_COUNT; i++) {
-		u32 page = mReq->req.dma + i * CI13XXX_PAGE_SIZE;
-		page &= ~TD_RESERVED_MASK;
-		mReq->ptr->page[i] = cpu_to_le32(page);
+	if (hwreq->req.dma % PAGE_SIZE)
+		pages--;
+
+	if (rest == 0)
+		add_td_to_list(hwep, hwreq, 0);
+
+	while (rest > 0) {
+		unsigned count = min(hwreq->req.length - hwreq->req.actual,
+					(unsigned)(pages * CI_HDRC_PAGE_SIZE));
+		add_td_to_list(hwep, hwreq, count);
+		rest -= count;
 	}
 
+	if (hwreq->req.zero && hwreq->req.length
+	    && (hwreq->req.length % hwep->ep.maxpacket == 0))
+		add_td_to_list(hwep, hwreq, 0);
+
+	firstnode = list_first_entry(&hwreq->tds, struct td_node, td);
+
+	lastnode = list_entry(hwreq->tds.prev,
+		struct td_node, td);
+
+	lastnode->ptr->next = cpu_to_le32(TD_TERMINATE);
+	if (!hwreq->req.no_interrupt)
+		lastnode->ptr->token |= cpu_to_le32(TD_IOC);
 	wmb();
 
-	if (!list_empty(&mEp->qh.queue)) {
-		struct ci13xxx_req *mReqPrev;
-		int n = hw_ep_bit(mEp->num, mEp->dir);
+	hwreq->req.actual = 0;
+	if (!list_empty(&hwep->qh.queue)) {
+		struct ci_hw_req *hwreqprev;
+		int n = hw_ep_bit(hwep->num, hwep->dir);
 		int tmp_stat;
-		u32 next = mReq->dma & TD_ADDR_MASK;
+		struct td_node *prevlastnode;
+		u32 next = firstnode->dma & TD_ADDR_MASK;
 
-		mReqPrev = list_entry(mEp->qh.queue.prev,
-				struct ci13xxx_req, queue);
-		if (mReqPrev->zptr)
-			mReqPrev->zptr->next = cpu_to_le32(next);
-		else
-			mReqPrev->ptr->next = cpu_to_le32(next);
+		hwreqprev = list_entry(hwep->qh.queue.prev,
+				struct ci_hw_req, queue);
+		prevlastnode = list_entry(hwreqprev->tds.prev,
+				struct td_node, td);
+
+		prevlastnode->ptr->next = cpu_to_le32(next);
 		wmb();
 		if (hw_read(ci, OP_ENDPTPRIME, BIT(n)))
 			goto done;
@@ -462,99 +509,152 @@
 	}
 
 	/*  QH configuration */
-	mEp->qh.ptr->td.next   = cpu_to_le32(mReq->dma);    /* TERMINATE = 0 */
-	mEp->qh.ptr->td.token &=
+	hwep->qh.ptr->td.next = cpu_to_le32(firstnode->dma);
+	hwep->qh.ptr->td.token &=
 		cpu_to_le32(~(TD_STATUS_HALTED|TD_STATUS_ACTIVE));
 
+	if (hwep->type == USB_ENDPOINT_XFER_ISOC) {
+		u32 mul = hwreq->req.length / hwep->ep.maxpacket;
+
+		if (hwreq->req.length % hwep->ep.maxpacket)
+			mul++;
+		hwep->qh.ptr->cap |= mul << __ffs(QH_MULT);
+	}
+
 	wmb();   /* synchronize before ep prime */
 
-	ret = hw_ep_prime(ci, mEp->num, mEp->dir,
-			   mEp->type == USB_ENDPOINT_XFER_CONTROL);
+	ret = hw_ep_prime(ci, hwep->num, hwep->dir,
+			   hwep->type == USB_ENDPOINT_XFER_CONTROL);
 done:
 	return ret;
 }
 
+/*
+ * free_pending_td: release the TD whose cleanup was deferred on the endpoint
+ * @hwep: endpoint
+ *
+ * Hands the hardware descriptor back to the endpoint's DMA pool, frees its
+ * tracking node and clears hwep->pending_td. Callers check that
+ * hwep->pending_td is set before calling.
+ */
+static void free_pending_td(struct ci_hw_ep *hwep)
+{
+	struct td_node *node = hwep->pending_td;
+
+	hwep->pending_td = NULL;
+	dma_pool_free(hwep->td_pool, node->ptr, node->dma);
+	kfree(node);
+}
+
 /**
  * _hardware_dequeue: handles a request at hardware level
  * @gadget: gadget
- * @mEp:    endpoint
+ * @hwep:   endpoint
  *
  * This function returns an error code
  */
-static int _hardware_dequeue(struct ci13xxx_ep *mEp, struct ci13xxx_req *mReq)
+static int _hardware_dequeue(struct ci_hw_ep *hwep, struct ci_hw_req *hwreq)
 {
-	u32 tmptoken = le32_to_cpu(mReq->ptr->token);
+	u32 tmptoken;
+	struct td_node *node, *tmpnode;
+	unsigned remaining_length;
+	unsigned actual = hwreq->req.length;
 
-	if (mReq->req.status != -EALREADY)
+	if (hwreq->req.status != -EALREADY)
 		return -EINVAL;
 
-	if ((TD_STATUS_ACTIVE & tmptoken) != 0)
-		return -EBUSY;
+	hwreq->req.status = 0;
 
-	if (mReq->zptr) {
-		if ((cpu_to_le32(TD_STATUS_ACTIVE) & mReq->zptr->token) != 0)
+	list_for_each_entry_safe(node, tmpnode, &hwreq->tds, td) {
+		tmptoken = le32_to_cpu(node->ptr->token);
+		if ((TD_STATUS_ACTIVE & tmptoken) != 0) {
+			hwreq->req.status = -EALREADY;
 			return -EBUSY;
-		dma_pool_free(mEp->td_pool, mReq->zptr, mReq->zdma);
-		mReq->zptr = NULL;
+		}
+
+		remaining_length = (tmptoken & TD_TOTAL_BYTES);
+		remaining_length >>= __ffs(TD_TOTAL_BYTES);
+		actual -= remaining_length;
+
+		hwreq->req.status = tmptoken & TD_STATUS;
+		if ((TD_STATUS_HALTED & hwreq->req.status)) {
+			hwreq->req.status = -EPIPE;
+			break;
+		} else if ((TD_STATUS_DT_ERR & hwreq->req.status)) {
+			hwreq->req.status = -EPROTO;
+			break;
+		} else if ((TD_STATUS_TR_ERR & hwreq->req.status)) {
+			hwreq->req.status = -EILSEQ;
+			break;
+		}
+
+		if (remaining_length) {
+			if (hwep->dir) {
+				hwreq->req.status = -EPROTO;
+				break;
+			}
+		}
+		/*
+		 * As the hardware could still address the freed td
+		 * which will run the udc unusable, the cleanup of the
+		 * td has to be delayed by one.
+		 */
+		if (hwep->pending_td)
+			free_pending_td(hwep);
+
+		hwep->pending_td = node;
+		list_del_init(&node->td);
 	}
 
-	mReq->req.status = 0;
+	usb_gadget_unmap_request(&hwep->ci->gadget, &hwreq->req, hwep->dir);
 
-	usb_gadget_unmap_request(&mEp->ci->gadget, &mReq->req, mEp->dir);
+	hwreq->req.actual += actual;
 
-	mReq->req.status = tmptoken & TD_STATUS;
-	if ((TD_STATUS_HALTED & mReq->req.status) != 0)
-		mReq->req.status = -1;
-	else if ((TD_STATUS_DT_ERR & mReq->req.status) != 0)
-		mReq->req.status = -1;
-	else if ((TD_STATUS_TR_ERR & mReq->req.status) != 0)
-		mReq->req.status = -1;
+	if (hwreq->req.status)
+		return hwreq->req.status;
 
-	mReq->req.actual   = tmptoken & TD_TOTAL_BYTES;
-	mReq->req.actual >>= __ffs(TD_TOTAL_BYTES);
-	mReq->req.actual   = mReq->req.length - mReq->req.actual;
-	mReq->req.actual   = mReq->req.status ? 0 : mReq->req.actual;
-
-	return mReq->req.actual;
+	return hwreq->req.actual;
 }
 
 /**
  * _ep_nuke: dequeues all endpoint requests
- * @mEp: endpoint
+ * @hwep: endpoint
  *
  * This function returns an error code
  * Caller must hold lock
  */
-static int _ep_nuke(struct ci13xxx_ep *mEp)
-__releases(mEp->lock)
-__acquires(mEp->lock)
+static int _ep_nuke(struct ci_hw_ep *hwep)
+__releases(hwep->lock)
+__acquires(hwep->lock)
 {
-	if (mEp == NULL)
+	struct td_node *node, *tmpnode;
+	if (hwep == NULL)
 		return -EINVAL;
 
-	hw_ep_flush(mEp->ci, mEp->num, mEp->dir);
+	hw_ep_flush(hwep->ci, hwep->num, hwep->dir);
 
-	while (!list_empty(&mEp->qh.queue)) {
+	while (!list_empty(&hwep->qh.queue)) {
 
 		/* pop oldest request */
-		struct ci13xxx_req *mReq = \
-			list_entry(mEp->qh.queue.next,
-				   struct ci13xxx_req, queue);
+		struct ci_hw_req *hwreq = list_entry(hwep->qh.queue.next,
+						     struct ci_hw_req, queue);
 
-		if (mReq->zptr) {
-			dma_pool_free(mEp->td_pool, mReq->zptr, mReq->zdma);
-			mReq->zptr = NULL;
+		list_for_each_entry_safe(node, tmpnode, &hwreq->tds, td) {
+			dma_pool_free(hwep->td_pool, node->ptr, node->dma);
+			list_del_init(&node->td);
+			node->ptr = NULL;
+			kfree(node);
 		}
 
-		list_del_init(&mReq->queue);
-		mReq->req.status = -ESHUTDOWN;
+		list_del_init(&hwreq->queue);
+		hwreq->req.status = -ESHUTDOWN;
 
-		if (mReq->req.complete != NULL) {
-			spin_unlock(mEp->lock);
-			mReq->req.complete(&mEp->ep, &mReq->req);
-			spin_lock(mEp->lock);
+		if (hwreq->req.complete != NULL) {
+			spin_unlock(hwep->lock);
+			hwreq->req.complete(&hwep->ep, &hwreq->req);
+			spin_lock(hwep->lock);
 		}
 	}
+
+	if (hwep->pending_td)
+		free_pending_td(hwep);
+
 	return 0;
 }
 
@@ -567,7 +667,7 @@
 static int _gadget_stop_activity(struct usb_gadget *gadget)
 {
 	struct usb_ep *ep;
-	struct ci13xxx    *ci = container_of(gadget, struct ci13xxx, gadget);
+	struct ci_hdrc    *ci = container_of(gadget, struct ci_hdrc, gadget);
 	unsigned long flags;
 
 	spin_lock_irqsave(&ci->lock, flags);
@@ -608,7 +708,7 @@
  *
  * This function resets USB engine after a bus reset occurred
  */
-static void isr_reset_handler(struct ci13xxx *ci)
+static void isr_reset_handler(struct ci_hdrc *ci)
 __releases(ci->lock)
 __acquires(ci->lock)
 {
@@ -658,47 +758,48 @@
 static int _ep_queue(struct usb_ep *ep, struct usb_request *req,
 		    gfp_t __maybe_unused gfp_flags)
 {
-	struct ci13xxx_ep  *mEp  = container_of(ep,  struct ci13xxx_ep, ep);
-	struct ci13xxx_req *mReq = container_of(req, struct ci13xxx_req, req);
-	struct ci13xxx *ci = mEp->ci;
+	struct ci_hw_ep  *hwep  = container_of(ep,  struct ci_hw_ep, ep);
+	struct ci_hw_req *hwreq = container_of(req, struct ci_hw_req, req);
+	struct ci_hdrc *ci = hwep->ci;
 	int retval = 0;
 
-	if (ep == NULL || req == NULL || mEp->ep.desc == NULL)
+	if (ep == NULL || req == NULL || hwep->ep.desc == NULL)
 		return -EINVAL;
 
-	if (mEp->type == USB_ENDPOINT_XFER_CONTROL) {
+	if (hwep->type == USB_ENDPOINT_XFER_CONTROL) {
 		if (req->length)
-			mEp = (ci->ep0_dir == RX) ?
+			hwep = (ci->ep0_dir == RX) ?
 			       ci->ep0out : ci->ep0in;
-		if (!list_empty(&mEp->qh.queue)) {
-			_ep_nuke(mEp);
+		if (!list_empty(&hwep->qh.queue)) {
+			_ep_nuke(hwep);
 			retval = -EOVERFLOW;
-			dev_warn(mEp->ci->dev, "endpoint ctrl %X nuked\n",
-				 _usb_addr(mEp));
+			dev_warn(hwep->ci->dev, "endpoint ctrl %X nuked\n",
+				 _usb_addr(hwep));
 		}
 	}
 
-	/* first nuke then test link, e.g. previous status has not sent */
-	if (!list_empty(&mReq->queue)) {
-		dev_err(mEp->ci->dev, "request already in queue\n");
-		return -EBUSY;
-	}
-
-	if (req->length > (TD_PAGE_COUNT - 1) * CI13XXX_PAGE_SIZE) {
-		dev_err(mEp->ci->dev, "request bigger than one td\n");
+	if (usb_endpoint_xfer_isoc(hwep->ep.desc) &&
+	    hwreq->req.length > (1 + hwep->ep.mult) * hwep->ep.maxpacket) {
+		dev_err(hwep->ci->dev, "request length too big for isochronous\n");
 		return -EMSGSIZE;
 	}
 
-	/* push request */
-	mReq->req.status = -EINPROGRESS;
-	mReq->req.actual = 0;
+	/* first nuke then test link, e.g. previous status has not sent */
+	if (!list_empty(&hwreq->queue)) {
+		dev_err(hwep->ci->dev, "request already in queue\n");
+		return -EBUSY;
+	}
 
-	retval = _hardware_enqueue(mEp, mReq);
+	/* push request */
+	hwreq->req.status = -EINPROGRESS;
+	hwreq->req.actual = 0;
+
+	retval = _hardware_enqueue(hwep, hwreq);
 
 	if (retval == -EALREADY)
 		retval = 0;
 	if (!retval)
-		list_add_tail(&mReq->queue, &mEp->qh.queue);
+		list_add_tail(&hwreq->queue, &hwep->qh.queue);
 
 	return retval;
 }
@@ -710,22 +811,22 @@
  *
  * This function returns an error code
  */
-static int isr_get_status_response(struct ci13xxx *ci,
+static int isr_get_status_response(struct ci_hdrc *ci,
 				   struct usb_ctrlrequest *setup)
-__releases(mEp->lock)
-__acquires(mEp->lock)
+__releases(hwep->lock)
+__acquires(hwep->lock)
 {
-	struct ci13xxx_ep *mEp = ci->ep0in;
+	struct ci_hw_ep *hwep = ci->ep0in;
 	struct usb_request *req = NULL;
 	gfp_t gfp_flags = GFP_ATOMIC;
 	int dir, num, retval;
 
-	if (mEp == NULL || setup == NULL)
+	if (hwep == NULL || setup == NULL)
 		return -EINVAL;
 
-	spin_unlock(mEp->lock);
-	req = usb_ep_alloc_request(&mEp->ep, gfp_flags);
-	spin_lock(mEp->lock);
+	spin_unlock(hwep->lock);
+	req = usb_ep_alloc_request(&hwep->ep, gfp_flags);
+	spin_lock(hwep->lock);
 	if (req == NULL)
 		return -ENOMEM;
 
@@ -750,7 +851,7 @@
 	}
 	/* else do nothing; reserved for future use */
 
-	retval = _ep_queue(&mEp->ep, req, gfp_flags);
+	retval = _ep_queue(&hwep->ep, req, gfp_flags);
 	if (retval)
 		goto err_free_buf;
 
@@ -759,9 +860,9 @@
  err_free_buf:
 	kfree(req->buf);
  err_free_req:
-	spin_unlock(mEp->lock);
-	usb_ep_free_request(&mEp->ep, req);
-	spin_lock(mEp->lock);
+	spin_unlock(hwep->lock);
+	usb_ep_free_request(&hwep->ep, req);
+	spin_lock(hwep->lock);
 	return retval;
 }
 
@@ -776,7 +877,7 @@
 static void
 isr_setup_status_complete(struct usb_ep *ep, struct usb_request *req)
 {
-	struct ci13xxx *ci = req->context;
+	struct ci_hdrc *ci = req->context;
 	unsigned long flags;
 
 	if (ci->setaddr) {
@@ -796,48 +897,48 @@
  *
  * This function returns an error code
  */
-static int isr_setup_status_phase(struct ci13xxx *ci)
+static int isr_setup_status_phase(struct ci_hdrc *ci)
 {
 	int retval;
-	struct ci13xxx_ep *mEp;
+	struct ci_hw_ep *hwep;
 
-	mEp = (ci->ep0_dir == TX) ? ci->ep0out : ci->ep0in;
+	hwep = (ci->ep0_dir == TX) ? ci->ep0out : ci->ep0in;
 	ci->status->context = ci;
 	ci->status->complete = isr_setup_status_complete;
 
-	retval = _ep_queue(&mEp->ep, ci->status, GFP_ATOMIC);
+	retval = _ep_queue(&hwep->ep, ci->status, GFP_ATOMIC);
 
 	return retval;
 }
 
 /**
  * isr_tr_complete_low: transaction complete low level handler
- * @mEp: endpoint
+ * @hwep: endpoint
  *
  * This function returns an error code
  * Caller must hold lock
  */
-static int isr_tr_complete_low(struct ci13xxx_ep *mEp)
-__releases(mEp->lock)
-__acquires(mEp->lock)
+static int isr_tr_complete_low(struct ci_hw_ep *hwep)
+__releases(hwep->lock)
+__acquires(hwep->lock)
 {
-	struct ci13xxx_req *mReq, *mReqTemp;
-	struct ci13xxx_ep *mEpTemp = mEp;
+	struct ci_hw_req *hwreq, *hwreqtemp;
+	struct ci_hw_ep *hweptemp = hwep;
 	int retval = 0;
 
-	list_for_each_entry_safe(mReq, mReqTemp, &mEp->qh.queue,
+	list_for_each_entry_safe(hwreq, hwreqtemp, &hwep->qh.queue,
 			queue) {
-		retval = _hardware_dequeue(mEp, mReq);
+		retval = _hardware_dequeue(hwep, hwreq);
 		if (retval < 0)
 			break;
-		list_del_init(&mReq->queue);
-		if (mReq->req.complete != NULL) {
-			spin_unlock(mEp->lock);
-			if ((mEp->type == USB_ENDPOINT_XFER_CONTROL) &&
-					mReq->req.length)
-				mEpTemp = mEp->ci->ep0in;
-			mReq->req.complete(&mEpTemp->ep, &mReq->req);
-			spin_lock(mEp->lock);
+		list_del_init(&hwreq->queue);
+		if (hwreq->req.complete != NULL) {
+			spin_unlock(hwep->lock);
+			if ((hwep->type == USB_ENDPOINT_XFER_CONTROL) &&
+					hwreq->req.length)
+				hweptemp = hwep->ci->ep0in;
+			hwreq->req.complete(&hweptemp->ep, &hwreq->req);
+			spin_lock(hwep->lock);
 		}
 	}
 
@@ -853,7 +954,7 @@
  *
  * This function handles traffic events
  */
-static void isr_tr_complete_handler(struct ci13xxx *ci)
+static void isr_tr_complete_handler(struct ci_hdrc *ci)
 __releases(ci->lock)
 __acquires(ci->lock)
 {
@@ -861,21 +962,21 @@
 	u8 tmode = 0;
 
 	for (i = 0; i < ci->hw_ep_max; i++) {
-		struct ci13xxx_ep *mEp  = &ci->ci13xxx_ep[i];
+		struct ci_hw_ep *hwep  = &ci->ci_hw_ep[i];
 		int type, num, dir, err = -EINVAL;
 		struct usb_ctrlrequest req;
 
-		if (mEp->ep.desc == NULL)
+		if (hwep->ep.desc == NULL)
 			continue;   /* not configured */
 
 		if (hw_test_and_clear_complete(ci, i)) {
-			err = isr_tr_complete_low(mEp);
-			if (mEp->type == USB_ENDPOINT_XFER_CONTROL) {
+			err = isr_tr_complete_low(hwep);
+			if (hwep->type == USB_ENDPOINT_XFER_CONTROL) {
 				if (err > 0)   /* needs status phase */
 					err = isr_setup_status_phase(ci);
 				if (err < 0) {
 					spin_unlock(&ci->lock);
-					if (usb_ep_set_halt(&mEp->ep))
+					if (usb_ep_set_halt(&hwep->ep))
 						dev_err(ci->dev,
 							"error: ep_set_halt\n");
 					spin_lock(&ci->lock);
@@ -883,7 +984,7 @@
 			}
 		}
 
-		if (mEp->type != USB_ENDPOINT_XFER_CONTROL ||
+		if (hwep->type != USB_ENDPOINT_XFER_CONTROL ||
 		    !hw_test_and_clear_setup_status(ci, i))
 			continue;
 
@@ -902,7 +1003,7 @@
 		/* read_setup_packet */
 		do {
 			hw_test_and_set_setup_guard(ci);
-			memcpy(&req, &mEp->qh.ptr->setup, sizeof(req));
+			memcpy(&req, &hwep->qh.ptr->setup, sizeof(req));
 		} while (!hw_test_and_clear_setup_guard(ci));
 
 		type = req.bRequestType;
@@ -921,10 +1022,10 @@
 				num &= USB_ENDPOINT_NUMBER_MASK;
 				if (dir) /* TX */
 					num += ci->hw_ep_max/2;
-				if (!ci->ci13xxx_ep[num].wedge) {
+				if (!ci->ci_hw_ep[num].wedge) {
 					spin_unlock(&ci->lock);
 					err = usb_ep_clear_halt(
-						&ci->ci13xxx_ep[num].ep);
+						&ci->ci_hw_ep[num].ep);
 					spin_lock(&ci->lock);
 					if (err)
 						break;
@@ -974,7 +1075,7 @@
 					num += ci->hw_ep_max/2;
 
 				spin_unlock(&ci->lock);
-				err = usb_ep_set_halt(&ci->ci13xxx_ep[num].ep);
+				err = usb_ep_set_halt(&ci->ci_hw_ep[num].ep);
 				spin_lock(&ci->lock);
 				if (!err)
 					isr_setup_status_phase(ci);
@@ -1021,7 +1122,7 @@
 
 		if (err < 0) {
 			spin_unlock(&ci->lock);
-			if (usb_ep_set_halt(&mEp->ep))
+			if (usb_ep_set_halt(&hwep->ep))
 				dev_err(ci->dev, "error: ep_set_halt\n");
 			spin_lock(&ci->lock);
 		}
@@ -1039,7 +1140,7 @@
 static int ep_enable(struct usb_ep *ep,
 		     const struct usb_endpoint_descriptor *desc)
 {
-	struct ci13xxx_ep *mEp = container_of(ep, struct ci13xxx_ep, ep);
+	struct ci_hw_ep *hwep = container_of(ep, struct ci_hw_ep, ep);
 	int retval = 0;
 	unsigned long flags;
 	u32 cap = 0;
@@ -1047,39 +1148,41 @@
 	if (ep == NULL || desc == NULL)
 		return -EINVAL;
 
-	spin_lock_irqsave(mEp->lock, flags);
+	spin_lock_irqsave(hwep->lock, flags);
 
 	/* only internal SW should enable ctrl endpts */
 
-	mEp->ep.desc = desc;
+	hwep->ep.desc = desc;
 
-	if (!list_empty(&mEp->qh.queue))
-		dev_warn(mEp->ci->dev, "enabling a non-empty endpoint!\n");
+	if (!list_empty(&hwep->qh.queue))
+		dev_warn(hwep->ci->dev, "enabling a non-empty endpoint!\n");
 
-	mEp->dir  = usb_endpoint_dir_in(desc) ? TX : RX;
-	mEp->num  = usb_endpoint_num(desc);
-	mEp->type = usb_endpoint_type(desc);
+	hwep->dir  = usb_endpoint_dir_in(desc) ? TX : RX;
+	hwep->num  = usb_endpoint_num(desc);
+	hwep->type = usb_endpoint_type(desc);
 
-	mEp->ep.maxpacket = usb_endpoint_maxp(desc);
+	hwep->ep.maxpacket = usb_endpoint_maxp(desc) & 0x07ff;
+	hwep->ep.mult = QH_ISO_MULT(usb_endpoint_maxp(desc));
 
-	if (mEp->type == USB_ENDPOINT_XFER_CONTROL)
+	if (hwep->type == USB_ENDPOINT_XFER_CONTROL)
 		cap |= QH_IOS;
-	if (mEp->num)
+	if (hwep->num)
 		cap |= QH_ZLT;
-	cap |= (mEp->ep.maxpacket << __ffs(QH_MAX_PKT)) & QH_MAX_PKT;
+	cap |= (hwep->ep.maxpacket << __ffs(QH_MAX_PKT)) & QH_MAX_PKT;
 
-	mEp->qh.ptr->cap = cpu_to_le32(cap);
+	hwep->qh.ptr->cap = cpu_to_le32(cap);
 
-	mEp->qh.ptr->td.next |= cpu_to_le32(TD_TERMINATE);   /* needed? */
+	hwep->qh.ptr->td.next |= cpu_to_le32(TD_TERMINATE);   /* needed? */
 
 	/*
 	 * Enable endpoints in the HW other than ep0 as ep0
 	 * is always enabled
 	 */
-	if (mEp->num)
-		retval |= hw_ep_enable(mEp->ci, mEp->num, mEp->dir, mEp->type);
+	if (hwep->num)
+		retval |= hw_ep_enable(hwep->ci, hwep->num, hwep->dir,
+				       hwep->type);
 
-	spin_unlock_irqrestore(mEp->lock, flags);
+	spin_unlock_irqrestore(hwep->lock, flags);
 	return retval;
 }
 
@@ -1090,32 +1193,32 @@
  */
 static int ep_disable(struct usb_ep *ep)
 {
-	struct ci13xxx_ep *mEp = container_of(ep, struct ci13xxx_ep, ep);
+	struct ci_hw_ep *hwep = container_of(ep, struct ci_hw_ep, ep);
 	int direction, retval = 0;
 	unsigned long flags;
 
 	if (ep == NULL)
 		return -EINVAL;
-	else if (mEp->ep.desc == NULL)
+	else if (hwep->ep.desc == NULL)
 		return -EBUSY;
 
-	spin_lock_irqsave(mEp->lock, flags);
+	spin_lock_irqsave(hwep->lock, flags);
 
 	/* only internal SW should disable ctrl endpts */
 
-	direction = mEp->dir;
+	direction = hwep->dir;
 	do {
-		retval |= _ep_nuke(mEp);
-		retval |= hw_ep_disable(mEp->ci, mEp->num, mEp->dir);
+		retval |= _ep_nuke(hwep);
+		retval |= hw_ep_disable(hwep->ci, hwep->num, hwep->dir);
 
-		if (mEp->type == USB_ENDPOINT_XFER_CONTROL)
-			mEp->dir = (mEp->dir == TX) ? RX : TX;
+		if (hwep->type == USB_ENDPOINT_XFER_CONTROL)
+			hwep->dir = (hwep->dir == TX) ? RX : TX;
 
-	} while (mEp->dir != direction);
+	} while (hwep->dir != direction);
 
-	mEp->ep.desc = NULL;
+	hwep->ep.desc = NULL;
 
-	spin_unlock_irqrestore(mEp->lock, flags);
+	spin_unlock_irqrestore(hwep->lock, flags);
 	return retval;
 }
 
@@ -1126,25 +1229,18 @@
  */
 static struct usb_request *ep_alloc_request(struct usb_ep *ep, gfp_t gfp_flags)
 {
-	struct ci13xxx_ep  *mEp  = container_of(ep, struct ci13xxx_ep, ep);
-	struct ci13xxx_req *mReq = NULL;
+	struct ci_hw_req *hwreq = NULL;
 
 	if (ep == NULL)
 		return NULL;
 
-	mReq = kzalloc(sizeof(struct ci13xxx_req), gfp_flags);
-	if (mReq != NULL) {
-		INIT_LIST_HEAD(&mReq->queue);
-
-		mReq->ptr = dma_pool_alloc(mEp->td_pool, gfp_flags,
-					   &mReq->dma);
-		if (mReq->ptr == NULL) {
-			kfree(mReq);
-			mReq = NULL;
-		}
+	hwreq = kzalloc(sizeof(struct ci_hw_req), gfp_flags);
+	if (hwreq != NULL) {
+		INIT_LIST_HEAD(&hwreq->queue);
+		INIT_LIST_HEAD(&hwreq->tds);
 	}
 
-	return (mReq == NULL) ? NULL : &mReq->req;
+	return (hwreq == NULL) ? NULL : &hwreq->req;
 }
 
 /**
@@ -1154,24 +1250,30 @@
  */
 static void ep_free_request(struct usb_ep *ep, struct usb_request *req)
 {
-	struct ci13xxx_ep  *mEp  = container_of(ep,  struct ci13xxx_ep, ep);
-	struct ci13xxx_req *mReq = container_of(req, struct ci13xxx_req, req);
+	struct ci_hw_ep  *hwep  = container_of(ep,  struct ci_hw_ep, ep);
+	struct ci_hw_req *hwreq = container_of(req, struct ci_hw_req, req);
+	struct td_node *node, *tmpnode;
 	unsigned long flags;
 
 	if (ep == NULL || req == NULL) {
 		return;
-	} else if (!list_empty(&mReq->queue)) {
-		dev_err(mEp->ci->dev, "freeing queued request\n");
+	} else if (!list_empty(&hwreq->queue)) {
+		dev_err(hwep->ci->dev, "freeing queued request\n");
 		return;
 	}
 
-	spin_lock_irqsave(mEp->lock, flags);
+	spin_lock_irqsave(hwep->lock, flags);
 
-	if (mReq->ptr)
-		dma_pool_free(mEp->td_pool, mReq->ptr, mReq->dma);
-	kfree(mReq);
+	list_for_each_entry_safe(node, tmpnode, &hwreq->tds, td) {
+		dma_pool_free(hwep->td_pool, node->ptr, node->dma);
+		list_del_init(&node->td);
+		node->ptr = NULL;
+		kfree(node);
+	}
 
-	spin_unlock_irqrestore(mEp->lock, flags);
+	kfree(hwreq);
+
+	spin_unlock_irqrestore(hwep->lock, flags);
 }
 
 /**
@@ -1182,16 +1284,16 @@
 static int ep_queue(struct usb_ep *ep, struct usb_request *req,
 		    gfp_t __maybe_unused gfp_flags)
 {
-	struct ci13xxx_ep  *mEp  = container_of(ep,  struct ci13xxx_ep, ep);
+	struct ci_hw_ep  *hwep  = container_of(ep,  struct ci_hw_ep, ep);
 	int retval = 0;
 	unsigned long flags;
 
-	if (ep == NULL || req == NULL || mEp->ep.desc == NULL)
+	if (ep == NULL || req == NULL || hwep->ep.desc == NULL)
 		return -EINVAL;
 
-	spin_lock_irqsave(mEp->lock, flags);
+	spin_lock_irqsave(hwep->lock, flags);
 	retval = _ep_queue(ep, req, gfp_flags);
-	spin_unlock_irqrestore(mEp->lock, flags);
+	spin_unlock_irqrestore(hwep->lock, flags);
 	return retval;
 }
 
@@ -1202,33 +1304,33 @@
  */
 static int ep_dequeue(struct usb_ep *ep, struct usb_request *req)
 {
-	struct ci13xxx_ep  *mEp  = container_of(ep,  struct ci13xxx_ep, ep);
-	struct ci13xxx_req *mReq = container_of(req, struct ci13xxx_req, req);
+	struct ci_hw_ep  *hwep  = container_of(ep,  struct ci_hw_ep, ep);
+	struct ci_hw_req *hwreq = container_of(req, struct ci_hw_req, req);
 	unsigned long flags;
 
-	if (ep == NULL || req == NULL || mReq->req.status != -EALREADY ||
-		mEp->ep.desc == NULL || list_empty(&mReq->queue) ||
-		list_empty(&mEp->qh.queue))
+	if (ep == NULL || req == NULL || hwreq->req.status != -EALREADY ||
+		hwep->ep.desc == NULL || list_empty(&hwreq->queue) ||
+		list_empty(&hwep->qh.queue))
 		return -EINVAL;
 
-	spin_lock_irqsave(mEp->lock, flags);
+	spin_lock_irqsave(hwep->lock, flags);
 
-	hw_ep_flush(mEp->ci, mEp->num, mEp->dir);
+	hw_ep_flush(hwep->ci, hwep->num, hwep->dir);
 
 	/* pop request */
-	list_del_init(&mReq->queue);
+	list_del_init(&hwreq->queue);
 
-	usb_gadget_unmap_request(&mEp->ci->gadget, req, mEp->dir);
+	usb_gadget_unmap_request(&hwep->ci->gadget, req, hwep->dir);
 
 	req->status = -ECONNRESET;
 
-	if (mReq->req.complete != NULL) {
-		spin_unlock(mEp->lock);
-		mReq->req.complete(&mEp->ep, &mReq->req);
-		spin_lock(mEp->lock);
+	if (hwreq->req.complete != NULL) {
+		spin_unlock(hwep->lock);
+		hwreq->req.complete(&hwep->ep, &hwreq->req);
+		spin_lock(hwep->lock);
 	}
 
-	spin_unlock_irqrestore(mEp->lock, flags);
+	spin_unlock_irqrestore(hwep->lock, flags);
 	return 0;
 }
 
@@ -1239,37 +1341,40 @@
  */
 static int ep_set_halt(struct usb_ep *ep, int value)
 {
-	struct ci13xxx_ep *mEp = container_of(ep, struct ci13xxx_ep, ep);
+	struct ci_hw_ep *hwep = container_of(ep, struct ci_hw_ep, ep);
 	int direction, retval = 0;
 	unsigned long flags;
 
-	if (ep == NULL || mEp->ep.desc == NULL)
+	if (ep == NULL || hwep->ep.desc == NULL)
 		return -EINVAL;
 
-	spin_lock_irqsave(mEp->lock, flags);
+	if (usb_endpoint_xfer_isoc(hwep->ep.desc))
+		return -EOPNOTSUPP;
+
+	spin_lock_irqsave(hwep->lock, flags);
 
 #ifndef STALL_IN
 	/* g_file_storage MS compliant but g_zero fails chapter 9 compliance */
-	if (value && mEp->type == USB_ENDPOINT_XFER_BULK && mEp->dir == TX &&
-	    !list_empty(&mEp->qh.queue)) {
-		spin_unlock_irqrestore(mEp->lock, flags);
+	if (value && hwep->type == USB_ENDPOINT_XFER_BULK && hwep->dir == TX &&
+	    !list_empty(&hwep->qh.queue)) {
+		spin_unlock_irqrestore(hwep->lock, flags);
 		return -EAGAIN;
 	}
 #endif
 
-	direction = mEp->dir;
+	direction = hwep->dir;
 	do {
-		retval |= hw_ep_set_halt(mEp->ci, mEp->num, mEp->dir, value);
+		retval |= hw_ep_set_halt(hwep->ci, hwep->num, hwep->dir, value);
 
 		if (!value)
-			mEp->wedge = 0;
+			hwep->wedge = 0;
 
-		if (mEp->type == USB_ENDPOINT_XFER_CONTROL)
-			mEp->dir = (mEp->dir == TX) ? RX : TX;
+		if (hwep->type == USB_ENDPOINT_XFER_CONTROL)
+			hwep->dir = (hwep->dir == TX) ? RX : TX;
 
-	} while (mEp->dir != direction);
+	} while (hwep->dir != direction);
 
-	spin_unlock_irqrestore(mEp->lock, flags);
+	spin_unlock_irqrestore(hwep->lock, flags);
 	return retval;
 }
 
@@ -1280,15 +1385,15 @@
  */
 static int ep_set_wedge(struct usb_ep *ep)
 {
-	struct ci13xxx_ep *mEp = container_of(ep, struct ci13xxx_ep, ep);
+	struct ci_hw_ep *hwep = container_of(ep, struct ci_hw_ep, ep);
 	unsigned long flags;
 
-	if (ep == NULL || mEp->ep.desc == NULL)
+	if (ep == NULL || hwep->ep.desc == NULL)
 		return -EINVAL;
 
-	spin_lock_irqsave(mEp->lock, flags);
-	mEp->wedge = 1;
-	spin_unlock_irqrestore(mEp->lock, flags);
+	spin_lock_irqsave(hwep->lock, flags);
+	hwep->wedge = 1;
+	spin_unlock_irqrestore(hwep->lock, flags);
 
 	return usb_ep_set_halt(ep);
 }
@@ -1300,19 +1405,19 @@
  */
 static void ep_fifo_flush(struct usb_ep *ep)
 {
-	struct ci13xxx_ep *mEp = container_of(ep, struct ci13xxx_ep, ep);
+	struct ci_hw_ep *hwep = container_of(ep, struct ci_hw_ep, ep);
 	unsigned long flags;
 
 	if (ep == NULL) {
-		dev_err(mEp->ci->dev, "%02X: -EINVAL\n", _usb_addr(mEp));
+		dev_err(hwep->ci->dev, "%02X: -EINVAL\n", _usb_addr(hwep));
 		return;
 	}
 
-	spin_lock_irqsave(mEp->lock, flags);
+	spin_lock_irqsave(hwep->lock, flags);
 
-	hw_ep_flush(mEp->ci, mEp->num, mEp->dir);
+	hw_ep_flush(hwep->ci, hwep->num, hwep->dir);
 
-	spin_unlock_irqrestore(mEp->lock, flags);
+	spin_unlock_irqrestore(hwep->lock, flags);
 }
 
 /**
@@ -1334,13 +1439,13 @@
 /******************************************************************************
  * GADGET block
  *****************************************************************************/
-static int ci13xxx_vbus_session(struct usb_gadget *_gadget, int is_active)
+static int ci_udc_vbus_session(struct usb_gadget *_gadget, int is_active)
 {
-	struct ci13xxx *ci = container_of(_gadget, struct ci13xxx, gadget);
+	struct ci_hdrc *ci = container_of(_gadget, struct ci_hdrc, gadget);
 	unsigned long flags;
 	int gadget_ready = 0;
 
-	if (!(ci->platdata->flags & CI13XXX_PULLUP_ON_VBUS))
+	if (!(ci->platdata->flags & CI_HDRC_PULLUP_ON_VBUS))
 		return -EOPNOTSUPP;
 
 	spin_lock_irqsave(&ci->lock, flags);
@@ -1358,7 +1463,7 @@
 			hw_device_state(ci, 0);
 			if (ci->platdata->notify_event)
 				ci->platdata->notify_event(ci,
-				CI13XXX_CONTROLLER_STOPPED_EVENT);
+				CI_HDRC_CONTROLLER_STOPPED_EVENT);
 			_gadget_stop_activity(&ci->gadget);
 			pm_runtime_put_sync(&_gadget->dev);
 		}
@@ -1367,9 +1472,9 @@
 	return 0;
 }
 
-static int ci13xxx_wakeup(struct usb_gadget *_gadget)
+static int ci_udc_wakeup(struct usb_gadget *_gadget)
 {
-	struct ci13xxx *ci = container_of(_gadget, struct ci13xxx, gadget);
+	struct ci_hdrc *ci = container_of(_gadget, struct ci_hdrc, gadget);
 	unsigned long flags;
 	int ret = 0;
 
@@ -1388,21 +1493,21 @@
 	return ret;
 }
 
-static int ci13xxx_vbus_draw(struct usb_gadget *_gadget, unsigned mA)
+static int ci_udc_vbus_draw(struct usb_gadget *_gadget, unsigned ma)
 {
-	struct ci13xxx *ci = container_of(_gadget, struct ci13xxx, gadget);
+	struct ci_hdrc *ci = container_of(_gadget, struct ci_hdrc, gadget);
 
 	if (ci->transceiver)
-		return usb_phy_set_power(ci->transceiver, mA);
+		return usb_phy_set_power(ci->transceiver, ma);
 	return -ENOTSUPP;
 }
 
 /* Change Data+ pullup status
  * this func is used by usb_gadget_connect/disconnet
  */
-static int ci13xxx_pullup(struct usb_gadget *_gadget, int is_on)
+static int ci_udc_pullup(struct usb_gadget *_gadget, int is_on)
 {
-	struct ci13xxx *ci = container_of(_gadget, struct ci13xxx, gadget);
+	struct ci_hdrc *ci = container_of(_gadget, struct ci_hdrc, gadget);
 
 	if (is_on)
 		hw_write(ci, OP_USBCMD, USBCMD_RS, USBCMD_RS);
@@ -1412,9 +1517,9 @@
 	return 0;
 }
 
-static int ci13xxx_start(struct usb_gadget *gadget,
+static int ci_udc_start(struct usb_gadget *gadget,
 			 struct usb_gadget_driver *driver);
-static int ci13xxx_stop(struct usb_gadget *gadget,
+static int ci_udc_stop(struct usb_gadget *gadget,
 			struct usb_gadget_driver *driver);
 /**
  * Device operations part of the API to the USB controller hardware,
@@ -1422,46 +1527,46 @@
  * Check  "usb_gadget.h" for details
  */
 static const struct usb_gadget_ops usb_gadget_ops = {
-	.vbus_session	= ci13xxx_vbus_session,
-	.wakeup		= ci13xxx_wakeup,
-	.pullup		= ci13xxx_pullup,
-	.vbus_draw	= ci13xxx_vbus_draw,
-	.udc_start	= ci13xxx_start,
-	.udc_stop	= ci13xxx_stop,
+	.vbus_session	= ci_udc_vbus_session,
+	.wakeup		= ci_udc_wakeup,
+	.pullup		= ci_udc_pullup,
+	.vbus_draw	= ci_udc_vbus_draw,
+	.udc_start	= ci_udc_start,
+	.udc_stop	= ci_udc_stop,
 };
 
-static int init_eps(struct ci13xxx *ci)
+static int init_eps(struct ci_hdrc *ci)
 {
 	int retval = 0, i, j;
 
 	for (i = 0; i < ci->hw_ep_max/2; i++)
 		for (j = RX; j <= TX; j++) {
 			int k = i + j * ci->hw_ep_max/2;
-			struct ci13xxx_ep *mEp = &ci->ci13xxx_ep[k];
+			struct ci_hw_ep *hwep = &ci->ci_hw_ep[k];
 
-			scnprintf(mEp->name, sizeof(mEp->name), "ep%i%s", i,
+			scnprintf(hwep->name, sizeof(hwep->name), "ep%i%s", i,
 					(j == TX)  ? "in" : "out");
 
-			mEp->ci          = ci;
-			mEp->lock         = &ci->lock;
-			mEp->td_pool      = ci->td_pool;
+			hwep->ci          = ci;
+			hwep->lock         = &ci->lock;
+			hwep->td_pool      = ci->td_pool;
 
-			mEp->ep.name      = mEp->name;
-			mEp->ep.ops       = &usb_ep_ops;
+			hwep->ep.name      = hwep->name;
+			hwep->ep.ops       = &usb_ep_ops;
 			/*
 			 * for ep0: maxP defined in desc, for other
 			 * eps, maxP is set by epautoconfig() called
 			 * by gadget layer
 			 */
-			mEp->ep.maxpacket = (unsigned short)~0;
+			hwep->ep.maxpacket = (unsigned short)~0;
 
-			INIT_LIST_HEAD(&mEp->qh.queue);
-			mEp->qh.ptr = dma_pool_alloc(ci->qh_pool, GFP_KERNEL,
-						     &mEp->qh.dma);
-			if (mEp->qh.ptr == NULL)
+			INIT_LIST_HEAD(&hwep->qh.queue);
+			hwep->qh.ptr = dma_pool_alloc(ci->qh_pool, GFP_KERNEL,
+						     &hwep->qh.dma);
+			if (hwep->qh.ptr == NULL)
 				retval = -ENOMEM;
 			else
-				memset(mEp->qh.ptr, 0, sizeof(*mEp->qh.ptr));
+				memset(hwep->qh.ptr, 0, sizeof(*hwep->qh.ptr));
 
 			/*
 			 * set up shorthands for ep0 out and in endpoints,
@@ -1469,42 +1574,42 @@
 			 */
 			if (i == 0) {
 				if (j == RX)
-					ci->ep0out = mEp;
+					ci->ep0out = hwep;
 				else
-					ci->ep0in = mEp;
+					ci->ep0in = hwep;
 
-				mEp->ep.maxpacket = CTRL_PAYLOAD_MAX;
+				hwep->ep.maxpacket = CTRL_PAYLOAD_MAX;
 				continue;
 			}
 
-			list_add_tail(&mEp->ep.ep_list, &ci->gadget.ep_list);
+			list_add_tail(&hwep->ep.ep_list, &ci->gadget.ep_list);
 		}
 
 	return retval;
 }
 
-static void destroy_eps(struct ci13xxx *ci)
+static void destroy_eps(struct ci_hdrc *ci)
 {
 	int i;
 
 	for (i = 0; i < ci->hw_ep_max; i++) {
-		struct ci13xxx_ep *mEp = &ci->ci13xxx_ep[i];
+		struct ci_hw_ep *hwep = &ci->ci_hw_ep[i];
 
-		dma_pool_free(ci->qh_pool, mEp->qh.ptr, mEp->qh.dma);
+		dma_pool_free(ci->qh_pool, hwep->qh.ptr, hwep->qh.dma);
 	}
 }
 
 /**
- * ci13xxx_start: register a gadget driver
+ * ci_udc_start: register a gadget driver
  * @gadget: our gadget
  * @driver: the driver being registered
  *
  * Interrupts are enabled here.
  */
-static int ci13xxx_start(struct usb_gadget *gadget,
+static int ci_udc_start(struct usb_gadget *gadget,
 			 struct usb_gadget_driver *driver)
 {
-	struct ci13xxx *ci = container_of(gadget, struct ci13xxx, gadget);
+	struct ci_hdrc *ci = container_of(gadget, struct ci_hdrc, gadget);
 	unsigned long flags;
 	int retval = -ENOMEM;
 
@@ -1525,9 +1630,9 @@
 
 	ci->driver = driver;
 	pm_runtime_get_sync(&ci->gadget.dev);
-	if (ci->platdata->flags & CI13XXX_PULLUP_ON_VBUS) {
+	if (ci->platdata->flags & CI_HDRC_PULLUP_ON_VBUS) {
 		if (ci->vbus_active) {
-			if (ci->platdata->flags & CI13XXX_REGS_SHARED)
+			if (ci->platdata->flags & CI_HDRC_REGS_SHARED)
 				hw_device_reset(ci, USBMODE_CM_DC);
 		} else {
 			pm_runtime_put_sync(&ci->gadget.dev);
@@ -1545,22 +1650,22 @@
 }
 
 /**
- * ci13xxx_stop: unregister a gadget driver
+ * ci_udc_stop: unregister a gadget driver
  */
-static int ci13xxx_stop(struct usb_gadget *gadget,
+static int ci_udc_stop(struct usb_gadget *gadget,
 			struct usb_gadget_driver *driver)
 {
-	struct ci13xxx *ci = container_of(gadget, struct ci13xxx, gadget);
+	struct ci_hdrc *ci = container_of(gadget, struct ci_hdrc, gadget);
 	unsigned long flags;
 
 	spin_lock_irqsave(&ci->lock, flags);
 
-	if (!(ci->platdata->flags & CI13XXX_PULLUP_ON_VBUS) ||
+	if (!(ci->platdata->flags & CI_HDRC_PULLUP_ON_VBUS) ||
 			ci->vbus_active) {
 		hw_device_state(ci, 0);
 		if (ci->platdata->notify_event)
 			ci->platdata->notify_event(ci,
-			CI13XXX_CONTROLLER_STOPPED_EVENT);
+			CI_HDRC_CONTROLLER_STOPPED_EVENT);
 		ci->driver = NULL;
 		spin_unlock_irqrestore(&ci->lock, flags);
 		_gadget_stop_activity(&ci->gadget);
@@ -1582,7 +1687,7 @@
  * This function returns IRQ_HANDLED if the IRQ has been handled
  * It locks access to registers
  */
-static irqreturn_t udc_irq(struct ci13xxx *ci)
+static irqreturn_t udc_irq(struct ci_hdrc *ci)
 {
 	irqreturn_t retval;
 	u32 intr;
@@ -1592,7 +1697,7 @@
 
 	spin_lock(&ci->lock);
 
-	if (ci->platdata->flags & CI13XXX_REGS_SHARED) {
+	if (ci->platdata->flags & CI_HDRC_REGS_SHARED) {
 		if (hw_read(ci, OP_USBMODE, USBMODE_CM) !=
 				USBMODE_CM_DC) {
 			spin_unlock(&ci->lock);
@@ -1642,7 +1747,7 @@
  * udc_start: initialize gadget role
  * @ci: chipidea controller
  */
-static int udc_start(struct ci13xxx *ci)
+static int udc_start(struct ci_hdrc *ci)
 {
 	struct device *dev = ci->dev;
 	int retval = 0;
@@ -1658,15 +1763,15 @@
 	INIT_LIST_HEAD(&ci->gadget.ep_list);
 
 	/* alloc resources */
-	ci->qh_pool = dma_pool_create("ci13xxx_qh", dev,
-				       sizeof(struct ci13xxx_qh),
-				       64, CI13XXX_PAGE_SIZE);
+	ci->qh_pool = dma_pool_create("ci_hw_qh", dev,
+				       sizeof(struct ci_hw_qh),
+				       64, CI_HDRC_PAGE_SIZE);
 	if (ci->qh_pool == NULL)
 		return -ENOMEM;
 
-	ci->td_pool = dma_pool_create("ci13xxx_td", dev,
-				       sizeof(struct ci13xxx_td),
-				       64, CI13XXX_PAGE_SIZE);
+	ci->td_pool = dma_pool_create("ci_hw_td", dev,
+				       sizeof(struct ci_hw_td),
+				       64, CI_HDRC_PAGE_SIZE);
 	if (ci->td_pool == NULL) {
 		retval = -ENOMEM;
 		goto free_qh_pool;
@@ -1684,14 +1789,14 @@
 			ci->transceiver = NULL;
 	}
 
-	if (ci->platdata->flags & CI13XXX_REQUIRE_TRANSCEIVER) {
+	if (ci->platdata->flags & CI_HDRC_REQUIRE_TRANSCEIVER) {
 		if (ci->transceiver == NULL) {
 			retval = -ENODEV;
 			goto destroy_eps;
 		}
 	}
 
-	if (!(ci->platdata->flags & CI13XXX_REGS_SHARED)) {
+	if (!(ci->platdata->flags & CI_HDRC_REGS_SHARED)) {
 		retval = hw_device_reset(ci, USBMODE_CM_DC);
 		if (retval)
 			goto put_transceiver;
@@ -1738,7 +1843,7 @@
  *
  * No interrupts active, the IRQ has been released
  */
-static void udc_stop(struct ci13xxx *ci)
+static void udc_stop(struct ci_hdrc *ci)
 {
 	if (ci == NULL)
 		return;
@@ -1765,7 +1870,7 @@
  *
  * This function enables the gadget role, if the device is "device capable".
  */
-int ci_hdrc_gadget_init(struct ci13xxx *ci)
+int ci_hdrc_gadget_init(struct ci_hdrc *ci)
 {
 	struct ci_role_driver *rdrv;
 

diff --git a/drivers/usb/chipidea/udc.h b/drivers/usb/chipidea/udc.h
index d12e8b5..455ac21 100644
--- a/drivers/usb/chipidea/udc.h
+++ b/drivers/usb/chipidea/udc.h

@@ -20,7 +20,7 @@
 #define TX        1  /* similar to USB_DIR_IN  but can be used as an index */
 
 /* DMA layout of transfer descriptors */
-struct ci13xxx_td {
+struct ci_hw_td {
 	/* 0 */
 	u32 next;
 #define TD_TERMINATE          BIT(0)
@@ -43,24 +43,31 @@
 } __attribute__ ((packed, aligned(4)));
 
 /* DMA layout of queue heads */
-struct ci13xxx_qh {
+struct ci_hw_qh {
 	/* 0 */
 	u32 cap;
 #define QH_IOS                BIT(15)
 #define QH_MAX_PKT            (0x07FFUL << 16)
 #define QH_ZLT                BIT(29)
 #define QH_MULT               (0x0003UL << 30)
+#define QH_ISO_MULT(x)		((x >> 11) & 0x03)
 	/* 1 */
 	u32 curr;
 	/* 2 - 8 */
-	struct ci13xxx_td        td;
+	struct ci_hw_td		td;
 	/* 9 */
 	u32 RESERVED;
 	struct usb_ctrlrequest   setup;
 } __attribute__ ((packed, aligned(4)));
 
+struct td_node {
+	struct list_head	td;
+	dma_addr_t		dma;
+	struct ci_hw_td		*ptr;
+};
+
 /**
- * struct ci13xxx_req - usb request representation
+ * struct ci_hw_req - usb request representation
  * @req: request structure for gadget drivers
  * @queue: link to QH list
  * @ptr: transfer descriptor for this request
@@ -68,22 +75,19 @@
  * @zptr: transfer descriptor for the zero packet
  * @zdma: dma address of the zero packet's transfer descriptor
  */
-struct ci13xxx_req {
+struct ci_hw_req {
 	struct usb_request	req;
 	struct list_head	queue;
-	struct ci13xxx_td	*ptr;
-	dma_addr_t		dma;
-	struct ci13xxx_td	*zptr;
-	dma_addr_t		zdma;
+	struct list_head	tds;
 };
 
 #ifdef CONFIG_USB_CHIPIDEA_UDC
 
-int ci_hdrc_gadget_init(struct ci13xxx *ci);
+int ci_hdrc_gadget_init(struct ci_hdrc *ci);
 
 #else
 
-static inline int ci_hdrc_gadget_init(struct ci13xxx *ci)
+static inline int ci_hdrc_gadget_init(struct ci_hdrc *ci)
 {
 	return -ENXIO;
 }

diff --git a/drivers/usb/chipidea/usbmisc_imx.c b/drivers/usb/chipidea/usbmisc_imx.c
index 714a6bd..ac5a461 100644
--- a/drivers/usb/chipidea/usbmisc_imx.c
+++ b/drivers/usb/chipidea/usbmisc_imx.c

@@ -16,7 +16,7 @@
 #include <linux/io.h>
 #include <linux/delay.h>
 
-#include "ci13xxx_imx.h"
+#include "ci_hdrc_imx.h"
 
 #define USB_DEV_MAX 4
 
@@ -175,6 +175,7 @@
 	},
 	{ /* sentinel */ }
 };
+MODULE_DEVICE_TABLE(of, usbmisc_imx_dt_ids);
 
 static int usbmisc_imx_probe(struct platform_device *pdev)
 {
@@ -243,17 +244,7 @@
 	 },
 };
 
-int usbmisc_imx_drv_init(void)
-{
-	return platform_driver_register(&usbmisc_imx_driver);
-}
-subsys_initcall(usbmisc_imx_drv_init);
-
-void usbmisc_imx_drv_exit(void)
-{
-	platform_driver_unregister(&usbmisc_imx_driver);
-}
-module_exit(usbmisc_imx_drv_exit);
+module_platform_driver(usbmisc_imx_driver);
 
 MODULE_ALIAS("platform:usbmisc-imx");
 MODULE_LICENSE("GPL v2");

diff --git a/drivers/usb/class/cdc-acm.c b/drivers/usb/class/cdc-acm.c
index 9b1cbcf..9f49bfe 100644
--- a/drivers/usb/class/cdc-acm.c
+++ b/drivers/usb/class/cdc-acm.c

@@ -216,38 +216,6 @@
 	return rc;
 }
 
-static int acm_write_start(struct acm *acm, int wbn)
-{
-	unsigned long flags;
-	struct acm_wb *wb = &acm->wb[wbn];
-	int rc;
-
-	spin_lock_irqsave(&acm->write_lock, flags);
-	if (!acm->dev) {
-		wb->use = 0;
-		spin_unlock_irqrestore(&acm->write_lock, flags);
-		return -ENODEV;
-	}
-
-	dev_vdbg(&acm->data->dev, "%s - susp_count %d\n", __func__,
-							acm->susp_count);
-	usb_autopm_get_interface_async(acm->control);
-	if (acm->susp_count) {
-		if (!acm->delayed_wb)
-			acm->delayed_wb = wb;
-		else
-			usb_autopm_put_interface_async(acm->control);
-		spin_unlock_irqrestore(&acm->write_lock, flags);
-		return 0;	/* A white lie */
-	}
-	usb_mark_last_busy(acm->dev);
-
-	rc = acm_start_wb(acm, wb);
-	spin_unlock_irqrestore(&acm->write_lock, flags);
-
-	return rc;
-
-}
 /*
  * attributes exported through sysfs
  */
@@ -653,13 +621,31 @@
 	}
 	wb = &acm->wb[wbn];
 
+	if (!acm->dev) {
+		wb->use = 0;
+		spin_unlock_irqrestore(&acm->write_lock, flags);
+		return -ENODEV;
+	}
+
 	count = (count > acm->writesize) ? acm->writesize : count;
 	dev_vdbg(&acm->data->dev, "%s - write %d\n", __func__, count);
 	memcpy(wb->buf, buf, count);
 	wb->len = count;
+
+	usb_autopm_get_interface_async(acm->control);
+	if (acm->susp_count) {
+		if (!acm->delayed_wb)
+			acm->delayed_wb = wb;
+		else
+			usb_autopm_put_interface_async(acm->control);
+		spin_unlock_irqrestore(&acm->write_lock, flags);
+		return count;	/* A white lie */
+	}
+	usb_mark_last_busy(acm->dev);
+
+	stat = acm_start_wb(acm, wb);
 	spin_unlock_irqrestore(&acm->write_lock, flags);
 
-	stat = acm_write_start(acm, wbn);
 	if (stat < 0)
 		return stat;
 	return count;

diff --git a/drivers/usb/class/usbtmc.c b/drivers/usb/class/usbtmc.c
index 4c5506a..609dbc2 100644
--- a/drivers/usb/class/usbtmc.c
+++ b/drivers/usb/class/usbtmc.c

@@ -31,6 +31,8 @@
 #include <linux/usb/tmc.h>
 
 
+#define RIGOL			1
+#define USBTMC_HEADER_SIZE	12
 #define USBTMC_MINOR_BASE	176
 
 /*
@@ -84,6 +86,8 @@
 	u8 bTag_last_write;	/* needed for abort */
 	u8 bTag_last_read;	/* needed for abort */
 
+	u8 rigol_quirk;
+
 	/* attributes from the USB TMC spec for this device */
 	u8 TermChar;
 	bool TermCharEnabled;
@@ -97,6 +101,16 @@
 };
 #define to_usbtmc_data(d) container_of(d, struct usbtmc_device_data, kref)
 
+struct usbtmc_ID_rigol_quirk {
+	__u16 idVendor;		/* matched against the device descriptor at probe */
+	__u16 idProduct;	/* matched against the device descriptor at probe */
+};
+
+static const struct usbtmc_ID_rigol_quirk usbtmc_id_quirk[] = {
+	{ 0x1ab1, 0x0588 },
+	{ 0, 0 }		/* terminator: the lookup stops at idVendor == 0 */
+};
+
 /* Forward declarations */
 static struct usb_driver usbtmc_driver;
 
@@ -361,6 +375,59 @@
 	return rv;
 }
 
+/*
+ * Sends a REQUEST_DEV_DEP_MSG_IN message on the Bulk-OUT endpoint.
+ * @transfer_size: number of bytes to request from the device.
+ *
+ * See the USBTMC specification, Table 4.
+ *
+ * Also updates bTag_last_write and bTag unless allocation fails (-ENOMEM).
+ */
+static int send_request_dev_dep_msg_in(struct usbtmc_device_data *data, size_t transfer_size)
+{
+	int retval;
+	u8 *buffer;
+	int actual;
+
+	/* usb_bulk_msg() needs a DMA-able buffer, which the stack is not */
+	buffer = kmalloc(USBTMC_HEADER_SIZE, GFP_KERNEL);
+	if (!buffer)
+		return -ENOMEM;
+
+	/* Setup REQUEST_DEV_DEP_MSG_IN header; refer to class specs for details */
+	buffer[0] = 2;
+	buffer[1] = data->bTag;
+	buffer[2] = ~(data->bTag);
+	buffer[3] = 0; /* Reserved */
+	buffer[4] = (transfer_size) & 255;
+	buffer[5] = ((transfer_size) >> 8) & 255;
+	buffer[6] = ((transfer_size) >> 16) & 255;
+	buffer[7] = ((transfer_size) >> 24) & 255;
+	buffer[8] = data->TermCharEnabled * 2; /* Use term character? */
+	buffer[9] = data->TermChar;
+	buffer[10] = 0; /* Reserved */
+	buffer[11] = 0; /* Reserved */
+
+	retval = usb_bulk_msg(data->usb_dev,
+			      usb_sndbulkpipe(data->usb_dev, data->bulk_out),
+			      buffer, USBTMC_HEADER_SIZE, &actual, USBTMC_TIMEOUT);
+	kfree(buffer);
+
+	/* Store bTag (in case we need to abort) */
+	data->bTag_last_write = data->bTag;
+
+	/* Increment bTag -- and increment again if zero */
+	data->bTag++;
+	if (!data->bTag)
+		(data->bTag)++;
+
+	if (retval < 0) {
+		dev_err(&data->intf->dev, "usb_bulk_msg in send_request_dev_dep_msg_in() returned %d\n", retval);
+		return retval;
+	}
+	return 0;
+}
+
 static ssize_t usbtmc_read(struct file *filp, char __user *buf,
 			   size_t count, loff_t *f_pos)
 {
@@ -388,52 +455,40 @@
 		goto exit;
 	}
 
-	remaining = count;
-	done = 0;
+	if (data->rigol_quirk) {
+		dev_dbg(dev, "usb_bulk_msg_in: count(%zu)\n", count);
 
-	while (remaining > 0) {
-		if (remaining > USBTMC_SIZE_IOBUFFER - 12 - 3)
-			this_part = USBTMC_SIZE_IOBUFFER - 12 - 3;
-		else
-			this_part = remaining;
-
-		/* Setup IO buffer for DEV_DEP_MSG_IN message
-		 * Refer to class specs for details
-		 */
-		buffer[0] = 2;
-		buffer[1] = data->bTag;
-		buffer[2] = ~(data->bTag);
-		buffer[3] = 0; /* Reserved */
-		buffer[4] = (this_part) & 255;
-		buffer[5] = ((this_part) >> 8) & 255;
-		buffer[6] = ((this_part) >> 16) & 255;
-		buffer[7] = ((this_part) >> 24) & 255;
-		buffer[8] = data->TermCharEnabled * 2;
-		/* Use term character? */
-		buffer[9] = data->TermChar;
-		buffer[10] = 0; /* Reserved */
-		buffer[11] = 0; /* Reserved */
-
-		/* Send bulk URB */
-		retval = usb_bulk_msg(data->usb_dev,
-				      usb_sndbulkpipe(data->usb_dev,
-						      data->bulk_out),
-				      buffer, 12, &actual, USBTMC_TIMEOUT);
-
-		/* Store bTag (in case we need to abort) */
-		data->bTag_last_write = data->bTag;
-
-		/* Increment bTag -- and increment again if zero */
-		data->bTag++;
-		if (!data->bTag)
-			(data->bTag)++;
+		retval = send_request_dev_dep_msg_in(data, count);
 
 		if (retval < 0) {
-			dev_err(dev, "usb_bulk_msg returned %d\n", retval);
 			if (data->auto_abort)
 				usbtmc_ioctl_abort_bulk_out(data);
 			goto exit;
 		}
+	}
+
+	/* Loop until we have fetched everything we requested */
+	remaining = count;
+	this_part = remaining;
+	done = 0;
+
+	while (remaining > 0) {
+		if (!(data->rigol_quirk)) {
+			dev_dbg(dev, "usb_bulk_msg_in: remaining(%zu), count(%zu)\n", remaining, count);
+
+			if (remaining > USBTMC_SIZE_IOBUFFER - USBTMC_HEADER_SIZE - 3)
+				this_part = USBTMC_SIZE_IOBUFFER - USBTMC_HEADER_SIZE - 3;
+			else
+				this_part = remaining;
+
+			retval = send_request_dev_dep_msg_in(data, this_part);
+			if (retval < 0) {
+			dev_err(dev, "usb_bulk_msg returned %d\n", retval);
+				if (data->auto_abort)
+					usbtmc_ioctl_abort_bulk_out(data);
+				goto exit;
+			}
+		}
 
 		/* Send bulk URB */
 		retval = usb_bulk_msg(data->usb_dev,
@@ -442,51 +497,109 @@
 				      buffer, USBTMC_SIZE_IOBUFFER, &actual,
 				      USBTMC_TIMEOUT);
 
+		dev_dbg(dev, "usb_bulk_msg: retval(%u), done(%zu), remaining(%zu), actual(%d)\n", retval, done, remaining, actual);
+
 		/* Store bTag (in case we need to abort) */
 		data->bTag_last_read = data->bTag;
 
 		if (retval < 0) {
-			dev_err(dev, "Unable to read data, error %d\n", retval);
+			dev_dbg(dev, "Unable to read data, error %d\n", retval);
 			if (data->auto_abort)
 				usbtmc_ioctl_abort_bulk_in(data);
 			goto exit;
 		}
 
-		/* How many characters did the instrument send? */
-		n_characters = buffer[4] +
-			       (buffer[5] << 8) +
-			       (buffer[6] << 16) +
-			       (buffer[7] << 24);
+		/* Parse header in first packet */
+		if ((done == 0) || (!(data->rigol_quirk))) {
+			/* Sanity checks for the header */
+			if (actual < USBTMC_HEADER_SIZE) {
+				dev_err(dev, "Device sent too small first packet: %u < %u\n", actual, USBTMC_HEADER_SIZE);
+				if (data->auto_abort)
+					usbtmc_ioctl_abort_bulk_in(data);
+				goto exit;
+			}
 
-		/* Ensure the instrument doesn't lie about it */
-		if(n_characters > actual - 12) {
-			dev_err(dev, "Device lies about message size: %u > %d\n", n_characters, actual - 12);
-			n_characters = actual - 12;
+			if (buffer[0] != 2) {
+				dev_err(dev, "Device sent reply with wrong MsgID: %u != 2\n", buffer[0]);
+				if (data->auto_abort)
+					usbtmc_ioctl_abort_bulk_in(data);
+				goto exit;
+			}
+
+			if (buffer[1] != data->bTag_last_write) {
+				dev_err(dev, "Device sent reply with wrong bTag: %u != %u\n", buffer[1], data->bTag_last_write);
+				if (data->auto_abort)
+					usbtmc_ioctl_abort_bulk_in(data);
+				goto exit;
+			}
+
+			/* How many characters did the instrument send? */
+			n_characters = buffer[4] +
+				       (buffer[5] << 8) +
+				       (buffer[6] << 16) +
+				       (buffer[7] << 24);
+
+			if (n_characters > this_part) {
+				dev_err(dev, "Device wants to return more data than requested: %u > %zu\n", n_characters, count);
+				if (data->auto_abort)
+					usbtmc_ioctl_abort_bulk_in(data);
+				goto exit;
+			}
+
+			/* Remove the USBTMC header */
+			actual -= USBTMC_HEADER_SIZE;
+
+			/* Check if the message is smaller than requested */
+			if (data->rigol_quirk) {
+				if (remaining > n_characters)
+					remaining = n_characters;
+				/* Remove padding if it exists */
+				if (actual > remaining) 
+					actual = remaining;
+			}
+			else {
+				if (this_part > n_characters)
+					this_part = n_characters;
+				/* Remove padding if it exists */
+				if (actual > this_part) 
+					actual = this_part;
+			}
+
+			dev_dbg(dev, "Bulk-IN header: N_characters(%u), bTransAttr(%u)\n", n_characters, buffer[8]);
+
+			remaining -= actual;
+
+			/* Terminate if end-of-message bit received from device */
+			if ((buffer[8] &  0x01) && (actual >= n_characters))
+				remaining = 0;
+
+			dev_dbg(dev, "Bulk-IN header: remaining(%zu), buf(%p), buffer(%p) done(%zu)\n", remaining,buf,buffer,done);
+
+
+			/* Copy buffer to user space */
+			if (copy_to_user(buf + done, &buffer[USBTMC_HEADER_SIZE], actual)) {
+				/* There must have been an addressing problem */
+				retval = -EFAULT;
+				goto exit;
+			}
+			done += actual;
 		}
+		else  {
+			if (actual > remaining) 
+				actual = remaining;
 
-		/* Ensure the instrument doesn't send more back than requested */
-		if(n_characters > this_part) {
-			dev_err(dev, "Device returns more than requested: %zu > %zu\n", done + n_characters, done + this_part);
-			n_characters = this_part;
+			remaining -= actual;
+
+			dev_dbg(dev, "Bulk-IN header cont: actual(%u), done(%zu), remaining(%zu), buf(%p), buffer(%p)\n", actual, done, remaining,buf,buffer);
+
+			/* Copy buffer to user space */
+			if (copy_to_user(buf + done, buffer, actual)) {
+				/* There must have been an addressing problem */
+				retval = -EFAULT;
+				goto exit;
+			}
+			done += actual;
 		}
-
-		/* Bound amount of data received by amount of data requested */
-		if (n_characters > this_part)
-			n_characters = this_part;
-
-		/* Copy buffer to user space */
-		if (copy_to_user(buf + done, &buffer[12], n_characters)) {
-			/* There must have been an addressing problem */
-			retval = -EFAULT;
-			goto exit;
-		}
-
-		done += n_characters;
-		/* Terminate if end-of-message bit received from device */
-		if ((buffer[8] &  0x01) && (actual >= n_characters + 12))
-			remaining = 0;
-		else
-			remaining -= n_characters;
 	}
 
 	/* Update file position value */
@@ -527,8 +640,8 @@
 	done = 0;
 
 	while (remaining > 0) {
-		if (remaining > USBTMC_SIZE_IOBUFFER - 12) {
-			this_part = USBTMC_SIZE_IOBUFFER - 12;
+		if (remaining > USBTMC_SIZE_IOBUFFER - USBTMC_HEADER_SIZE) {
+			this_part = USBTMC_SIZE_IOBUFFER - USBTMC_HEADER_SIZE;
 			buffer[8] = 0;
 		} else {
 			this_part = remaining;
@@ -549,13 +662,13 @@
 		buffer[10] = 0; /* Reserved */
 		buffer[11] = 0; /* Reserved */
 
-		if (copy_from_user(&buffer[12], buf + done, this_part)) {
+		if (copy_from_user(&buffer[USBTMC_HEADER_SIZE], buf + done, this_part)) {
 			retval = -EFAULT;
 			goto exit;
 		}
 
-		n_bytes = roundup(12 + this_part, 4);
-		memset(buffer + 12 + this_part, 0, n_bytes - (12 + this_part));
+		n_bytes = roundup(USBTMC_HEADER_SIZE + this_part, 4);
+		memset(buffer + USBTMC_HEADER_SIZE + this_part, 0, n_bytes - (USBTMC_HEADER_SIZE + this_part));
 
 		do {
 			retval = usb_bulk_msg(data->usb_dev,
@@ -1003,6 +1116,20 @@
 	mutex_init(&data->io_mutex);
 	data->zombie = 0;
 
+	/* Determine if it is a Rigol or not; descriptor IDs are __le16 values */
+	data->rigol_quirk = 0;
+	dev_dbg(&intf->dev, "Trying to find if device Vendor 0x%04X Product 0x%04X has the RIGOL quirk\n",
+		le16_to_cpu(data->usb_dev->descriptor.idVendor),
+		le16_to_cpu(data->usb_dev->descriptor.idProduct));
+	for (n = 0; usbtmc_id_quirk[n].idVendor > 0; n++) {
+		if ((usbtmc_id_quirk[n].idVendor == le16_to_cpu(data->usb_dev->descriptor.idVendor)) &&
+		    (usbtmc_id_quirk[n].idProduct == le16_to_cpu(data->usb_dev->descriptor.idProduct))) {
+			dev_dbg(&intf->dev, "Setting this device as having the RIGOL quirk\n");
+			data->rigol_quirk = 1;
+			break;
+		}
+	}
+
 	/* Initialize USBTMC bTag and other fields */
 	data->bTag	= 1;
 	data->TermCharEnabled = 0;

diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c
index c88c4fb..05986507 100644
--- a/drivers/usb/core/devio.c
+++ b/drivers/usb/core/devio.c

@@ -49,14 +49,14 @@
 #include <linux/security.h>
 #include <linux/user_namespace.h>
 #include <linux/scatterlist.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 #include <asm/byteorder.h>
 #include <linux/moduleparam.h>
 
 #include "usb.h"
 
 #define USB_MAXBUS			64
-#define USB_DEVICE_MAX			USB_MAXBUS * 128
+#define USB_DEVICE_MAX			(USB_MAXBUS * 128)
 #define USB_SG_SIZE			16384 /* split-size for large txs */
 
 /* Mutual exclusion for removal, open, and release */
@@ -1804,7 +1804,8 @@
 
 	/* alloc buffer */
 	if ((size = _IOC_SIZE(ctl->ioctl_code)) > 0) {
-		if ((buf = kmalloc(size, GFP_KERNEL)) == NULL)
+		buf = kmalloc(size, GFP_KERNEL);
+		if (buf == NULL)
 			return -ENOMEM;
 		if ((_IOC_DIR(ctl->ioctl_code) & _IOC_WRITE)) {
 			if (copy_from_user(buf, ctl->data, size)) {

diff --git a/drivers/usb/core/file.c b/drivers/usb/core/file.c
index e5387a4..6a4c407 100644
--- a/drivers/usb/core/file.c
+++ b/drivers/usb/core/file.c

@@ -94,7 +94,7 @@
 	kref_init(&usb_class->kref);
 	usb_class->class = class_create(THIS_MODULE, "usbmisc");
 	if (IS_ERR(usb_class->class)) {
-		result = IS_ERR(usb_class->class);
+		result = PTR_ERR(usb_class->class);
 		printk(KERN_ERR "class_create failed for usb devices\n");
 		kfree(usb_class);
 		usb_class = NULL;

diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c
index d53547d..014dc99 100644
--- a/drivers/usb/core/hcd.c
+++ b/drivers/usb/core/hcd.c

@@ -149,6 +149,27 @@
 	0x01        /*  __u8  bNumConfigurations; */
 };
 
+/* usb 2.5 (wireless USB 1.0) root hub device descriptor */
+static const u8 usb25_rh_dev_descriptor[18] = {
+	0x12,       /*  __u8  bLength; 18 bytes, the size of this array */
+	0x01,       /*  __u8  bDescriptorType; Device */
+	0x50, 0x02, /*  __le16 bcdUSB; v2.5 (0x0250, low byte first) */
+
+	0x09,	    /*  __u8  bDeviceClass; HUB_CLASSCODE */
+	0x00,	    /*  __u8  bDeviceSubClass; */
+	0x00,       /*  __u8  bDeviceProtocol; [ usb 2.0 no TT ] */
+	0xFF,       /*  __u8  bMaxPacketSize0; always 0xFF (WUSB Spec 7.4.1). */
+
+	0x6b, 0x1d, /*  __le16 idVendor; Linux Foundation 0x1d6b */
+	0x02, 0x00, /*  __le16 idProduct; device 0x0002 */
+	KERNEL_VER, KERNEL_REL, /*  __le16 bcdDevice */
+
+	0x03,       /*  __u8  iManufacturer; */
+	0x02,       /*  __u8  iProduct; */
+	0x01,       /*  __u8  iSerialNumber; */
+	0x01        /*  __u8  bNumConfigurations; */
+};
+
 /* usb 2.0 root hub device descriptor */
 static const u8 usb2_rh_dev_descriptor [18] = {
 	0x12,       /*  __u8  bLength; */
@@ -527,6 +548,9 @@
 			case HCD_USB3:
 				bufp = usb3_rh_dev_descriptor;
 				break;
+			case HCD_USB25:
+				bufp = usb25_rh_dev_descriptor;
+				break;
 			case HCD_USB2:
 				bufp = usb2_rh_dev_descriptor;
 				break;
@@ -546,6 +570,7 @@
 				bufp = ss_rh_config_descriptor;
 				len = sizeof ss_rh_config_descriptor;
 				break;
+			case HCD_USB25:
 			case HCD_USB2:
 				bufp = hs_rh_config_descriptor;
 				len = sizeof hs_rh_config_descriptor;
@@ -2511,6 +2536,9 @@
 	case HCD_USB2:
 		rhdev->speed = USB_SPEED_HIGH;
 		break;
+	case HCD_USB25:
+		rhdev->speed = USB_SPEED_WIRELESS;
+		break;
 	case HCD_USB3:
 		rhdev->speed = USB_SPEED_SUPER;
 		break;

diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c
index feef935..4191db3 100644
--- a/drivers/usb/core/hub.c
+++ b/drivers/usb/core/hub.c

@@ -718,18 +718,18 @@
 
 /**
  * usb_hub_set_port_power - control hub port's power state
- * @hdev: target hub
+ * @hdev: USB device belonging to the usb hub
+ * @hub: target hub
  * @port1: port index
  * @set: expected status
  *
  * call this function to control port's power via setting or
  * clearing the port's PORT_POWER feature.
  */
-int usb_hub_set_port_power(struct usb_device *hdev, int port1,
-		bool set)
+int usb_hub_set_port_power(struct usb_device *hdev, struct usb_hub *hub,
+			   int port1, bool set)
 {
 	int ret;
-	struct usb_hub *hub = usb_hub_to_struct_hub(hdev);
 	struct usb_port *port_dev = hub->ports[port1 - 1];
 
 	if (set)
@@ -1769,15 +1769,17 @@
 static int find_port_owner(struct usb_device *hdev, unsigned port1,
 		struct dev_state ***ppowner)
 {
+	struct usb_hub *hub = usb_hub_to_struct_hub(hdev);
+
 	if (hdev->state == USB_STATE_NOTATTACHED)
 		return -ENODEV;
 	if (port1 == 0 || port1 > hdev->maxchild)
 		return -EINVAL;
 
-	/* This assumes that devices not managed by the hub driver
+	/* Devices not managed by the hub driver
 	 * will always have maxchild equal to 0.
 	 */
-	*ppowner = &(usb_hub_to_struct_hub(hdev)->ports[port1 - 1]->port_owner);
+	*ppowner = &(hub->ports[port1 - 1]->port_owner);
 	return 0;
 }
 
@@ -5323,7 +5325,8 @@
 {
 	struct usb_hub *hub = usb_hub_to_struct_hub(hdev);
 
-	hub->ports[port1 - 1]->connect_type = type;
+	if (hub)
+		hub->ports[port1 - 1]->connect_type = type;
 }
 
 /**
@@ -5339,6 +5342,9 @@
 {
 	struct usb_hub *hub = usb_hub_to_struct_hub(hdev);
 
+	if (!hub)
+		return USB_PORT_CONNECT_TYPE_UNKNOWN;
+
 	return hub->ports[port1 - 1]->connect_type;
 }
 
@@ -5397,6 +5403,9 @@
 {
 	struct usb_hub *hub = usb_hub_to_struct_hub(hdev);
 
+	if (!hub)
+		return NULL;
+
 	return DEVICE_ACPI_HANDLE(&hub->ports[port1 - 1]->dev);
 }
 #endif

diff --git a/drivers/usb/core/hub.h b/drivers/usb/core/hub.h
index 80ab9ee..6508e02 100644
--- a/drivers/usb/core/hub.h
+++ b/drivers/usb/core/hub.h

@@ -100,7 +100,7 @@
 		int port1);
 extern void usb_hub_remove_port_device(struct usb_hub *hub,
 		int port1);
-extern int usb_hub_set_port_power(struct usb_device *hdev,
+extern int usb_hub_set_port_power(struct usb_device *hdev, struct usb_hub *hub,
 		int port1, bool set);
 extern struct usb_hub *usb_hub_to_struct_hub(struct usb_device *hdev);
 extern int hub_port_debounce(struct usb_hub *hub, int port1,

diff --git a/drivers/usb/core/message.c b/drivers/usb/core/message.c
index 444d30e..e7ee1e4 100644
--- a/drivers/usb/core/message.c
+++ b/drivers/usb/core/message.c

@@ -252,7 +252,7 @@
 {
 	if (io->urbs) {
 		while (io->entries--)
-			usb_free_urb(io->urbs [io->entries]);
+			usb_free_urb(io->urbs[io->entries]);
 		kfree(io->urbs);
 		io->urbs = NULL;
 	}
@@ -300,10 +300,10 @@
 		 */
 		spin_unlock(&io->lock);
 		for (i = 0, found = 0; i < io->entries; i++) {
-			if (!io->urbs [i] || !io->urbs [i]->dev)
+			if (!io->urbs[i] || !io->urbs[i]->dev)
 				continue;
 			if (found) {
-				retval = usb_unlink_urb(io->urbs [i]);
+				retval = usb_unlink_urb(io->urbs[i]);
 				if (retval != -EINPROGRESS &&
 				    retval != -ENODEV &&
 				    retval != -EBUSY &&
@@ -311,7 +311,7 @@
 					dev_err(&io->dev->dev,
 						"%s, unlink --> %d\n",
 						__func__, retval);
-			} else if (urb == io->urbs [i])
+			} else if (urb == io->urbs[i])
 				found = 1;
 		}
 		spin_lock(&io->lock);
@@ -379,7 +379,7 @@
 	}
 
 	/* initialize all the urbs we'll use */
-	io->urbs = kmalloc(io->entries * sizeof *io->urbs, mem_flags);
+	io->urbs = kmalloc(io->entries * sizeof(*io->urbs), mem_flags);
 	if (!io->urbs)
 		goto nomem;
 
@@ -511,7 +511,7 @@
 		int retval;
 
 		io->urbs[i]->dev = io->dev;
-		retval = usb_submit_urb(io->urbs [i], GFP_ATOMIC);
+		retval = usb_submit_urb(io->urbs[i], GFP_ATOMIC);
 
 		/* after we submit, let completions or cancelations fire;
 		 * we handshake using io->status.
@@ -586,9 +586,9 @@
 		for (i = 0; i < io->entries; i++) {
 			int retval;
 
-			if (!io->urbs [i]->dev)
+			if (!io->urbs[i]->dev)
 				continue;
-			retval = usb_unlink_urb(io->urbs [i]);
+			retval = usb_unlink_urb(io->urbs[i]);
 			if (retval != -EINPROGRESS
 					&& retval != -ENODEV
 					&& retval != -EBUSY

diff --git a/drivers/usb/core/port.c b/drivers/usb/core/port.c
index b8bad29..5fd3fee 100644
--- a/drivers/usb/core/port.c
+++ b/drivers/usb/core/port.c

@@ -86,7 +86,7 @@
 	usb_autopm_get_interface(intf);
 	set_bit(port1, hub->busy_bits);
 
-	retval = usb_hub_set_port_power(hdev, port1, true);
+	retval = usb_hub_set_port_power(hdev, hub, port1, true);
 	if (port_dev->child && !retval) {
 		/*
 		 * Wait for usb hub port to be reconnected in order to make
@@ -128,7 +128,7 @@
 
 	usb_autopm_get_interface(intf);
 	set_bit(port1, hub->busy_bits);
-	retval = usb_hub_set_port_power(hdev, port1, false);
+	retval = usb_hub_set_port_power(hdev, hub, port1, false);
 	usb_clear_port_feature(hdev, port1, USB_PORT_FEAT_C_CONNECTION);
 	usb_clear_port_feature(hdev, port1,	USB_PORT_FEAT_C_ENABLE);
 	clear_bit(port1, hub->busy_bits);

diff --git a/drivers/usb/core/sysfs.c b/drivers/usb/core/sysfs.c
index aa38db4..d9284b9 100644
--- a/drivers/usb/core/sysfs.c
+++ b/drivers/usb/core/sysfs.c

@@ -497,8 +497,62 @@
 static DEVICE_ATTR(usb2_hardware_lpm, S_IRUGO | S_IWUSR, show_usb2_hardware_lpm,
 			set_usb2_hardware_lpm);
 
+/* sysfs show: print the device's USB2 LPM L1 timeout as one decimal line. */
+static ssize_t show_usb2_lpm_l1_timeout(struct device *dev,
+					struct device_attribute *attr, char *buf)
+{
+	const struct usb_device *usbdev = to_usb_device(dev);
+	return sprintf(buf, "%d\n", usbdev->l1_params.timeout);
+}
+
+/* sysfs store: parse a u16 from @buf and save it as the LPM L1 timeout. */
+static ssize_t set_usb2_lpm_l1_timeout(struct device *dev,
+				       struct device_attribute *attr,
+				       const char *buf, size_t count)
+{
+	struct usb_device *usbdev = to_usb_device(dev);
+	u16 parsed;
+
+	if (kstrtou16(buf, 0, &parsed) != 0)
+		return -EINVAL;
+
+	usbdev->l1_params.timeout = parsed;
+	return count;
+}
+
+static DEVICE_ATTR(usb2_lpm_l1_timeout, S_IRUGO | S_IWUSR,
+		   show_usb2_lpm_l1_timeout, set_usb2_lpm_l1_timeout);
+
+/* sysfs show: print the device's USB2 LPM BESL value as one decimal line. */
+static ssize_t show_usb2_lpm_besl(struct device *dev,
+				  struct device_attribute *attr, char *buf)
+{
+	const struct usb_device *usbdev = to_usb_device(dev);
+	return sprintf(buf, "%d\n", usbdev->l1_params.besl);
+}
+
+/* sysfs store: accept a BESL value in the range 0..15, reject anything else. */
+static ssize_t set_usb2_lpm_besl(struct device *dev,
+				 struct device_attribute *attr,
+				 const char *buf, size_t count)
+{
+	struct usb_device *usbdev = to_usb_device(dev);
+	u8 parsed;
+
+	if (kstrtou8(buf, 0, &parsed) != 0 || parsed > 15)
+		return -EINVAL;
+
+	usbdev->l1_params.besl = parsed;
+	return count;
+}
+
+static DEVICE_ATTR(usb2_lpm_besl, S_IRUGO | S_IWUSR,
+		   show_usb2_lpm_besl, set_usb2_lpm_besl);
+
 static struct attribute *usb2_hardware_lpm_attr[] = {
 	&dev_attr_usb2_hardware_lpm.attr,
+	&dev_attr_usb2_lpm_l1_timeout.attr,
+	&dev_attr_usb2_lpm_besl.attr,
 	NULL,
 };
 static struct attribute_group usb2_hardware_lpm_attr_group = {

diff --git a/drivers/usb/core/usb.c b/drivers/usb/core/usb.c
index b10da72..7dad603 100644
--- a/drivers/usb/core/usb.c
+++ b/drivers/usb/core/usb.c

@@ -209,6 +209,39 @@
 }
 EXPORT_SYMBOL_GPL(usb_find_interface);
 
+struct each_dev_arg {
+	void *data;	/* caller's pointer, forwarded untouched to fn */
+	int (*fn)(struct usb_device *, void *);
+};
+
+static int __each_dev(struct device *dev, void *data)
+{
+	struct each_dev_arg *ctx = data;
+
+	/* The bus also carries struct usb_interface entries; skip those. */
+	if (is_usb_device(dev))
+		return ctx->fn(container_of(dev, struct usb_device, dev), ctx->data);
+
+	return 0;
+}
+
+/**
+ * usb_for_each_dev - walk every USB device registered on the USB bus
+ * @data: opaque pointer passed through unchanged to @fn
+ * @fn: callback invoked once per USB device
+ *
+ * Calls @fn(udev, @data) for each USB device; interfaces sharing the bus are
+ * skipped. A non-zero return from @fn stops the walk and becomes the return
+ * value of this function.
+ */
+int usb_for_each_dev(void *data, int (*fn)(struct usb_device *, void *))
+{
+	struct each_dev_arg arg = { .data = data, .fn = fn };
+
+	return bus_for_each_dev(&usb_bus_type, NULL, &arg, __each_dev);
+}
+EXPORT_SYMBOL_GPL(usb_for_each_dev);
+
 /**
  * usb_release_dev - free a usb device structure when all users of it are finished.
  * @dev: device that's been disconnected

diff --git a/drivers/usb/dwc3/dwc3-omap.c b/drivers/usb/dwc3/dwc3-omap.c
index 34638b9..077f110b 100644
--- a/drivers/usb/dwc3/dwc3-omap.c
+++ b/drivers/usb/dwc3/dwc3-omap.c

@@ -61,21 +61,46 @@
 #define USBOTGSS_REVISION			0x0000
 #define USBOTGSS_SYSCONFIG			0x0010
 #define USBOTGSS_IRQ_EOI			0x0020
+#define USBOTGSS_EOI_OFFSET			0x0008
 #define USBOTGSS_IRQSTATUS_RAW_0		0x0024
 #define USBOTGSS_IRQSTATUS_0			0x0028
 #define USBOTGSS_IRQENABLE_SET_0		0x002c
 #define USBOTGSS_IRQENABLE_CLR_0		0x0030
-#define USBOTGSS_IRQSTATUS_RAW_1		0x0034
-#define USBOTGSS_IRQSTATUS_1			0x0038
-#define USBOTGSS_IRQENABLE_SET_1		0x003c
-#define USBOTGSS_IRQENABLE_CLR_1		0x0040
+#define USBOTGSS_IRQ0_OFFSET			0x0004
+#define USBOTGSS_IRQSTATUS_RAW_1		0x0030
+#define USBOTGSS_IRQSTATUS_1			0x0034
+#define USBOTGSS_IRQENABLE_SET_1		0x0038
+#define USBOTGSS_IRQENABLE_CLR_1		0x003c
+#define USBOTGSS_IRQSTATUS_RAW_2		0x0040
+#define USBOTGSS_IRQSTATUS_2			0x0044
+#define USBOTGSS_IRQENABLE_SET_2		0x0048
+#define USBOTGSS_IRQENABLE_CLR_2		0x004c
+#define USBOTGSS_IRQSTATUS_RAW_3		0x0050
+#define USBOTGSS_IRQSTATUS_3			0x0054
+#define USBOTGSS_IRQENABLE_SET_3		0x0058
+#define USBOTGSS_IRQENABLE_CLR_3		0x005c
+#define USBOTGSS_IRQSTATUS_EOI_MISC		0x0030
+#define USBOTGSS_IRQSTATUS_RAW_MISC		0x0034
+#define USBOTGSS_IRQSTATUS_MISC			0x0038
+#define USBOTGSS_IRQENABLE_SET_MISC		0x003c
+#define USBOTGSS_IRQENABLE_CLR_MISC		0x0040
+#define USBOTGSS_IRQMISC_OFFSET			0x03fc
 #define USBOTGSS_UTMI_OTG_CTRL			0x0080
 #define USBOTGSS_UTMI_OTG_STATUS		0x0084
+#define USBOTGSS_UTMI_OTG_OFFSET		0x0480
+#define USBOTGSS_TXFIFO_DEPTH			0x0508
+#define USBOTGSS_RXFIFO_DEPTH			0x050c
 #define USBOTGSS_MMRAM_OFFSET			0x0100
 #define USBOTGSS_FLADJ				0x0104
 #define USBOTGSS_DEBUG_CFG			0x0108
 #define USBOTGSS_DEBUG_DATA			0x010c
+#define USBOTGSS_DEV_EBC_EN			0x0110
+#define USBOTGSS_DEBUG_OFFSET			0x0600
 
+/* REVISION REGISTER */
+#define USBOTGSS_REVISION_XMAJOR(reg)		((reg >> 8) & 0x7)
+#define USBOTGSS_REVISION_XMAJOR1		1
+#define USBOTGSS_REVISION_XMAJOR2		2
 /* SYSCONFIG REGISTER */
 #define USBOTGSS_SYSCONFIG_DMADISABLE		(1 << 16)
 
@@ -85,17 +110,17 @@
 /* IRQS0 BITS */
 #define USBOTGSS_IRQO_COREIRQ_ST		(1 << 0)
 
-/* IRQ1 BITS */
-#define USBOTGSS_IRQ1_DMADISABLECLR		(1 << 17)
-#define USBOTGSS_IRQ1_OEVT			(1 << 16)
-#define USBOTGSS_IRQ1_DRVVBUS_RISE		(1 << 13)
-#define USBOTGSS_IRQ1_CHRGVBUS_RISE		(1 << 12)
-#define USBOTGSS_IRQ1_DISCHRGVBUS_RISE		(1 << 11)
-#define USBOTGSS_IRQ1_IDPULLUP_RISE		(1 << 8)
-#define USBOTGSS_IRQ1_DRVVBUS_FALL		(1 << 5)
-#define USBOTGSS_IRQ1_CHRGVBUS_FALL		(1 << 4)
-#define USBOTGSS_IRQ1_DISCHRGVBUS_FALL		(1 << 3)
-#define USBOTGSS_IRQ1_IDPULLUP_FALL		(1 << 0)
+/* IRQMISC BITS */
+#define USBOTGSS_IRQMISC_DMADISABLECLR		(1 << 17)
+#define USBOTGSS_IRQMISC_OEVT			(1 << 16)
+#define USBOTGSS_IRQMISC_DRVVBUS_RISE		(1 << 13)
+#define USBOTGSS_IRQMISC_CHRGVBUS_RISE		(1 << 12)
+#define USBOTGSS_IRQMISC_DISCHRGVBUS_RISE	(1 << 11)
+#define USBOTGSS_IRQMISC_IDPULLUP_RISE		(1 << 8)
+#define USBOTGSS_IRQMISC_DRVVBUS_FALL		(1 << 5)
+#define USBOTGSS_IRQMISC_CHRGVBUS_FALL		(1 << 4)
+#define USBOTGSS_IRQMISC_DISCHRGVBUS_FALL		(1 << 3)
+#define USBOTGSS_IRQMISC_IDPULLUP_FALL		(1 << 0)
 
 /* UTMI_OTG_CTRL REGISTER */
 #define USBOTGSS_UTMI_OTG_CTRL_DRVVBUS		(1 << 5)
@@ -122,6 +147,12 @@
 	void __iomem		*base;
 
 	u32			utmi_otg_status;
+	u32			utmi_otg_offset;
+	u32			irqmisc_offset;
+	u32			irq_eoi_offset;
+	u32			debug_offset;
+	u32			irq0_offset;
+	u32			revision;
 
 	u32			dma_status:1;
 };
@@ -138,6 +169,58 @@
 	writel(value, base + offset);
 }
 
+/* UTMI_OTG_STATUS accessors; omap->utmi_otg_offset is added to the address. */
+static u32 dwc3_omap_read_utmi_status(struct dwc3_omap *omap)
+{
+	u32 reg = USBOTGSS_UTMI_OTG_STATUS + omap->utmi_otg_offset;
+	return dwc3_omap_readl(omap->base, reg);
+}
+
+static void dwc3_omap_write_utmi_status(struct dwc3_omap *omap, u32 value)
+{
+	u32 reg = USBOTGSS_UTMI_OTG_STATUS + omap->utmi_otg_offset;
+	dwc3_omap_writel(omap->base, reg, value);
+}
+
+/* IRQSTATUS_0 accessors; omap->irq0_offset is subtracted from the address. */
+static u32 dwc3_omap_read_irq0_status(struct dwc3_omap *omap)
+{
+	u32 reg = USBOTGSS_IRQSTATUS_0 - omap->irq0_offset;
+	return dwc3_omap_readl(omap->base, reg);
+}
+
+static void dwc3_omap_write_irq0_status(struct dwc3_omap *omap, u32 value)
+{
+	u32 reg = USBOTGSS_IRQSTATUS_0 - omap->irq0_offset;
+	dwc3_omap_writel(omap->base, reg, value);
+}
+
+/* IRQSTATUS_MISC accessors; omap->irqmisc_offset is added to the address. */
+static u32 dwc3_omap_read_irqmisc_status(struct dwc3_omap *omap)
+{
+	u32 reg = USBOTGSS_IRQSTATUS_MISC + omap->irqmisc_offset;
+	return dwc3_omap_readl(omap->base, reg);
+}
+
+static void dwc3_omap_write_irqmisc_status(struct dwc3_omap *omap, u32 value)
+{
+	u32 reg = USBOTGSS_IRQSTATUS_MISC + omap->irqmisc_offset;
+	dwc3_omap_writel(omap->base, reg, value);
+}
+
+/* IRQENABLE_SET writers for the MISC and IRQ0 banks, using the same offsets. */
+static void dwc3_omap_write_irqmisc_set(struct dwc3_omap *omap, u32 value)
+{
+	u32 reg = USBOTGSS_IRQENABLE_SET_MISC + omap->irqmisc_offset;
+	dwc3_omap_writel(omap->base, reg, value);
+}
+
+static void dwc3_omap_write_irq0_set(struct dwc3_omap *omap, u32 value)
+{
+	u32 reg = USBOTGSS_IRQENABLE_SET_0 - omap->irq0_offset;
+	dwc3_omap_writel(omap->base, reg, value);
+}
+
 int dwc3_omap_mailbox(enum omap_dwc3_vbus_id_status status)
 {
 	u32			val;
@@ -150,38 +233,38 @@
 	case OMAP_DWC3_ID_GROUND:
 		dev_dbg(omap->dev, "ID GND\n");
 
-		val = dwc3_omap_readl(omap->base, USBOTGSS_UTMI_OTG_STATUS);
+		val = dwc3_omap_read_utmi_status(omap);
 		val &= ~(USBOTGSS_UTMI_OTG_STATUS_IDDIG
 				| USBOTGSS_UTMI_OTG_STATUS_VBUSVALID
 				| USBOTGSS_UTMI_OTG_STATUS_SESSEND);
 		val |= USBOTGSS_UTMI_OTG_STATUS_SESSVALID
 				| USBOTGSS_UTMI_OTG_STATUS_POWERPRESENT;
-		dwc3_omap_writel(omap->base, USBOTGSS_UTMI_OTG_STATUS, val);
+		dwc3_omap_write_utmi_status(omap, val);
 		break;
 
 	case OMAP_DWC3_VBUS_VALID:
 		dev_dbg(omap->dev, "VBUS Connect\n");
 
-		val = dwc3_omap_readl(omap->base, USBOTGSS_UTMI_OTG_STATUS);
+		val = dwc3_omap_read_utmi_status(omap);
 		val &= ~USBOTGSS_UTMI_OTG_STATUS_SESSEND;
 		val |= USBOTGSS_UTMI_OTG_STATUS_IDDIG
 				| USBOTGSS_UTMI_OTG_STATUS_VBUSVALID
 				| USBOTGSS_UTMI_OTG_STATUS_SESSVALID
 				| USBOTGSS_UTMI_OTG_STATUS_POWERPRESENT;
-		dwc3_omap_writel(omap->base, USBOTGSS_UTMI_OTG_STATUS, val);
+		dwc3_omap_write_utmi_status(omap, val);
 		break;
 
 	case OMAP_DWC3_ID_FLOAT:
 	case OMAP_DWC3_VBUS_OFF:
 		dev_dbg(omap->dev, "VBUS Disconnect\n");
 
-		val = dwc3_omap_readl(omap->base, USBOTGSS_UTMI_OTG_STATUS);
+		val = dwc3_omap_read_utmi_status(omap);
 		val &= ~(USBOTGSS_UTMI_OTG_STATUS_SESSVALID
 				| USBOTGSS_UTMI_OTG_STATUS_VBUSVALID
 				| USBOTGSS_UTMI_OTG_STATUS_POWERPRESENT);
 		val |= USBOTGSS_UTMI_OTG_STATUS_SESSEND
 				| USBOTGSS_UTMI_OTG_STATUS_IDDIG;
-		dwc3_omap_writel(omap->base, USBOTGSS_UTMI_OTG_STATUS, val);
+		dwc3_omap_write_utmi_status(omap, val);
 		break;
 
 	default:
@@ -199,44 +282,45 @@
 
 	spin_lock(&omap->lock);
 
-	reg = dwc3_omap_readl(omap->base, USBOTGSS_IRQSTATUS_1);
+	reg = dwc3_omap_read_irqmisc_status(omap);
 
-	if (reg & USBOTGSS_IRQ1_DMADISABLECLR) {
+	if (reg & USBOTGSS_IRQMISC_DMADISABLECLR) {
 		dev_dbg(omap->dev, "DMA Disable was Cleared\n");
 		omap->dma_status = false;
 	}
 
-	if (reg & USBOTGSS_IRQ1_OEVT)
+	if (reg & USBOTGSS_IRQMISC_OEVT)
 		dev_dbg(omap->dev, "OTG Event\n");
 
-	if (reg & USBOTGSS_IRQ1_DRVVBUS_RISE)
+	if (reg & USBOTGSS_IRQMISC_DRVVBUS_RISE)
 		dev_dbg(omap->dev, "DRVVBUS Rise\n");
 
-	if (reg & USBOTGSS_IRQ1_CHRGVBUS_RISE)
+	if (reg & USBOTGSS_IRQMISC_CHRGVBUS_RISE)
 		dev_dbg(omap->dev, "CHRGVBUS Rise\n");
 
-	if (reg & USBOTGSS_IRQ1_DISCHRGVBUS_RISE)
+	if (reg & USBOTGSS_IRQMISC_DISCHRGVBUS_RISE)
 		dev_dbg(omap->dev, "DISCHRGVBUS Rise\n");
 
-	if (reg & USBOTGSS_IRQ1_IDPULLUP_RISE)
+	if (reg & USBOTGSS_IRQMISC_IDPULLUP_RISE)
 		dev_dbg(omap->dev, "IDPULLUP Rise\n");
 
-	if (reg & USBOTGSS_IRQ1_DRVVBUS_FALL)
+	if (reg & USBOTGSS_IRQMISC_DRVVBUS_FALL)
 		dev_dbg(omap->dev, "DRVVBUS Fall\n");
 
-	if (reg & USBOTGSS_IRQ1_CHRGVBUS_FALL)
+	if (reg & USBOTGSS_IRQMISC_CHRGVBUS_FALL)
 		dev_dbg(omap->dev, "CHRGVBUS Fall\n");
 
-	if (reg & USBOTGSS_IRQ1_DISCHRGVBUS_FALL)
+	if (reg & USBOTGSS_IRQMISC_DISCHRGVBUS_FALL)
 		dev_dbg(omap->dev, "DISCHRGVBUS Fall\n");
 
-	if (reg & USBOTGSS_IRQ1_IDPULLUP_FALL)
+	if (reg & USBOTGSS_IRQMISC_IDPULLUP_FALL)
 		dev_dbg(omap->dev, "IDPULLUP Fall\n");
 
-	dwc3_omap_writel(omap->base, USBOTGSS_IRQSTATUS_1, reg);
+	dwc3_omap_write_irqmisc_status(omap, reg);
 
-	reg = dwc3_omap_readl(omap->base, USBOTGSS_IRQSTATUS_0);
-	dwc3_omap_writel(omap->base, USBOTGSS_IRQSTATUS_0, reg);
+	reg = dwc3_omap_read_irq0_status(omap);
+
+	dwc3_omap_write_irq0_status(omap, reg);
 
 	spin_unlock(&omap->lock);
 
@@ -258,26 +342,26 @@
 
 	/* enable all IRQs */
 	reg = USBOTGSS_IRQO_COREIRQ_ST;
-	dwc3_omap_writel(omap->base, USBOTGSS_IRQENABLE_SET_0, reg);
+	dwc3_omap_write_irq0_set(omap, reg);
 
-	reg = (USBOTGSS_IRQ1_OEVT |
-			USBOTGSS_IRQ1_DRVVBUS_RISE |
-			USBOTGSS_IRQ1_CHRGVBUS_RISE |
-			USBOTGSS_IRQ1_DISCHRGVBUS_RISE |
-			USBOTGSS_IRQ1_IDPULLUP_RISE |
-			USBOTGSS_IRQ1_DRVVBUS_FALL |
-			USBOTGSS_IRQ1_CHRGVBUS_FALL |
-			USBOTGSS_IRQ1_DISCHRGVBUS_FALL |
-			USBOTGSS_IRQ1_IDPULLUP_FALL);
+	reg = (USBOTGSS_IRQMISC_OEVT |
+			USBOTGSS_IRQMISC_DRVVBUS_RISE |
+			USBOTGSS_IRQMISC_CHRGVBUS_RISE |
+			USBOTGSS_IRQMISC_DISCHRGVBUS_RISE |
+			USBOTGSS_IRQMISC_IDPULLUP_RISE |
+			USBOTGSS_IRQMISC_DRVVBUS_FALL |
+			USBOTGSS_IRQMISC_CHRGVBUS_FALL |
+			USBOTGSS_IRQMISC_DISCHRGVBUS_FALL |
+			USBOTGSS_IRQMISC_IDPULLUP_FALL);
 
-	dwc3_omap_writel(omap->base, USBOTGSS_IRQENABLE_SET_1, reg);
+	dwc3_omap_write_irqmisc_set(omap, reg);
 }
 
 static void dwc3_omap_disable_irqs(struct dwc3_omap *omap)
 {
 	/* disable all IRQs */
-	dwc3_omap_writel(omap->base, USBOTGSS_IRQENABLE_SET_1, 0x00);
-	dwc3_omap_writel(omap->base, USBOTGSS_IRQENABLE_SET_0, 0x00);
+	dwc3_omap_write_irqmisc_set(omap, 0x00);
+	dwc3_omap_write_irq0_set(omap, 0x00);
 }
 
 static u64 dwc3_omap_dma_mask = DMA_BIT_MASK(32);
@@ -294,6 +378,7 @@
 	int			irq;
 
 	int			utmi_mode = 0;
+	int			x_major;
 
 	u32			reg;
 
@@ -347,10 +432,46 @@
 	ret = pm_runtime_get_sync(dev);
 	if (ret < 0) {
 		dev_err(dev, "get_sync failed with err %d\n", ret);
-		return ret;
+		goto err0;
 	}
 
-	reg = dwc3_omap_readl(omap->base, USBOTGSS_UTMI_OTG_STATUS);
+	reg = dwc3_omap_readl(omap->base, USBOTGSS_REVISION);
+	omap->revision = reg;
+	x_major = USBOTGSS_REVISION_XMAJOR(reg);
+
+	/* Differentiate between OMAP5 and AM437x */
+	switch (x_major) {
+	case USBOTGSS_REVISION_XMAJOR1:
+	case USBOTGSS_REVISION_XMAJOR2:
+		omap->irq_eoi_offset = 0;
+		omap->irq0_offset = 0;
+		omap->irqmisc_offset = 0;
+		omap->utmi_otg_offset = 0;
+		omap->debug_offset = 0;
+		break;
+	default:
+		/* Default to the latest revision */
+		omap->irq_eoi_offset = USBOTGSS_EOI_OFFSET;
+		omap->irq0_offset = USBOTGSS_IRQ0_OFFSET;
+		omap->irqmisc_offset = USBOTGSS_IRQMISC_OFFSET;
+		omap->utmi_otg_offset = USBOTGSS_UTMI_OTG_OFFSET;
+		omap->debug_offset = USBOTGSS_DEBUG_OFFSET;
+		break;
+	}
+
+	/* For OMAP5(ES2.0) and AM437x, x_major is 2 even though the
+	 * wrapper registers differ, so use the DT compatible for aegis.
+	 */
+
+	if (of_device_is_compatible(node, "ti,am437x-dwc3")) {
+		omap->irq_eoi_offset = USBOTGSS_EOI_OFFSET;
+		omap->irq0_offset = USBOTGSS_IRQ0_OFFSET;
+		omap->irqmisc_offset = USBOTGSS_IRQMISC_OFFSET;
+		omap->utmi_otg_offset = USBOTGSS_UTMI_OTG_OFFSET;
+		omap->debug_offset = USBOTGSS_DEBUG_OFFSET;
+	}
+
+	reg = dwc3_omap_read_utmi_status(omap);
 
 	of_property_read_u32(node, "utmi-mode", &utmi_mode);
 
@@ -365,7 +486,7 @@
 		dev_dbg(dev, "UNKNOWN utmi mode %d\n", utmi_mode);
 	}
 
-	dwc3_omap_writel(omap->base, USBOTGSS_UTMI_OTG_STATUS, reg);
+	dwc3_omap_write_utmi_status(omap, reg);
 
 	/* check the DMA Status */
 	reg = dwc3_omap_readl(omap->base, USBOTGSS_SYSCONFIG);
@@ -376,7 +497,7 @@
 	if (ret) {
 		dev_err(dev, "failed to request IRQ #%d --> %d\n",
 				omap->irq, ret);
-		return ret;
+		goto err1;
 	}
 
 	dwc3_omap_enable_irqs(omap);
@@ -384,10 +505,21 @@
 	ret = of_platform_populate(node, NULL, NULL, dev);
 	if (ret) {
 		dev_err(&pdev->dev, "failed to create dwc3 core\n");
-		return ret;
+		goto err2;
 	}
 
 	return 0;
+
+err2:
+	dwc3_omap_disable_irqs(omap);
+
+err1:
+	pm_runtime_put_sync(dev);
+
+err0:
+	pm_runtime_disable(dev);
+
+	return ret;
 }
 
 static int dwc3_omap_remove(struct platform_device *pdev)
@@ -406,6 +538,9 @@
 	{
 		.compatible =	"ti,dwc3"
 	},
+	{
+		.compatible =	"ti,am437x-dwc3"
+	},
 	{ },
 };
 MODULE_DEVICE_TABLE(of, of_dwc3_match);
@@ -431,8 +566,7 @@
 {
 	struct dwc3_omap	*omap = dev_get_drvdata(dev);
 
-	omap->utmi_otg_status = dwc3_omap_readl(omap->base,
-			USBOTGSS_UTMI_OTG_STATUS);
+	omap->utmi_otg_status = dwc3_omap_read_utmi_status(omap);
 
 	return 0;
 }
@@ -441,8 +575,7 @@
 {
 	struct dwc3_omap	*omap = dev_get_drvdata(dev);
 
-	dwc3_omap_writel(omap->base, USBOTGSS_UTMI_OTG_STATUS,
-			omap->utmi_otg_status);
+	dwc3_omap_write_utmi_status(omap, omap->utmi_otg_status);
 
 	pm_runtime_disable(dev);
 	pm_runtime_set_active(dev);

diff --git a/drivers/usb/dwc3/dwc3-pci.c b/drivers/usb/dwc3/dwc3-pci.c
index eba9e2b..ed07ec0 100644
--- a/drivers/usb/dwc3/dwc3-pci.c
+++ b/drivers/usb/dwc3/dwc3-pci.c

@@ -133,7 +133,6 @@
 		return -ENODEV;
 	}
 
-	pci_set_power_state(pci, PCI_D0);
 	pci_set_master(pci);
 
 	ret = dwc3_pci_register_phys(glue);

diff --git a/drivers/usb/gadget/Kconfig b/drivers/usb/gadget/Kconfig
index f41aa0d..01b8229 100644
--- a/drivers/usb/gadget/Kconfig
+++ b/drivers/usb/gadget/Kconfig

@@ -192,6 +192,16 @@
 	help
 	   Faraday usb device controller FUSB300 driver
 
+config USB_FOTG210_UDC
+	tristate "Faraday FOTG210 USB Peripheral Controller"
+	help
+	   Faraday USB2.0 OTG controller which can be configured as
+	   high speed or full speed USB device. This driver supports
+	   Bulk Transfer so far.
+
+	   Say "y" to link the driver statically, or "m" to build a
+	   dynamically linked module called "fotg210_udc".
+
 config USB_OMAP
 	tristate "OMAP USB Device Controller"
 	depends on ARCH_OMAP1
@@ -334,14 +344,6 @@
 # Controllers available in both integrated and discrete versions
 #
 
-# musb builds in ../musb along with host support
-config USB_GADGET_MUSB_HDRC
-	tristate "Inventra HDRC USB Peripheral (TI, ADI, ...)"
-	depends on USB_MUSB_HDRC
-	help
-	  This OTG-capable silicon IP is used in dual designs including
-	  the TI DaVinci, OMAP 243x, OMAP 343x, TUSB 6010, and ADI Blackfin
-
 config USB_M66592
 	tristate "Renesas M66592 USB Peripheral Controller"
 	help
@@ -507,12 +509,36 @@
 config USB_U_SERIAL
 	tristate
 
+config USB_U_ETHER
+	tristate
+
+config USB_U_RNDIS
+	tristate
+
 config USB_F_SERIAL
 	tristate
 
 config USB_F_OBEX
 	tristate
 
+config USB_F_NCM
+	tristate
+
+config USB_F_ECM
+	tristate
+
+config USB_F_PHONET
+	tristate
+
+config USB_F_EEM
+	tristate
+
+config USB_F_SUBSET
+	tristate
+
+config USB_F_RNDIS
+	tristate
+
 choice
 	tristate "USB Gadget Drivers"
 	default USB_ETH
@@ -534,6 +560,121 @@
 
 # this first set of drivers all depend on bulk-capable hardware.
 
+config USB_CONFIGFS
+	tristate "USB functions configurable through configfs"
+	select USB_LIBCOMPOSITE
+	help
+	  A Linux USB "gadget" can be set up through configfs.
+	  If this is the case, the USB functions (which from the host's
+	  perspective are seen as interfaces) and configurations are
+	  specified simply by creating appropriate directories in configfs.
+	  Associating functions with configurations is done by creating
+	  appropriate symbolic links.
+	  For more information see Documentation/usb/gadget-configfs.txt.
+
+config USB_CONFIGFS_SERIAL
+	boolean "Generic serial bulk in/out"
+	depends on USB_CONFIGFS
+	depends on TTY
+	select USB_U_SERIAL
+	select USB_F_SERIAL
+	help
+	  The function talks to the Linux-USB generic serial driver.
+
+config USB_CONFIGFS_ACM
+	boolean "Abstract Control Model (CDC ACM)"
+	depends on USB_CONFIGFS
+	depends on TTY
+	select USB_U_SERIAL
+	select USB_F_ACM
+	help
+	  ACM serial link.  This function can be used to interoperate with
+	  MS-Windows hosts or with the Linux-USB "cdc-acm" driver.
+
+config USB_CONFIGFS_OBEX
+	boolean "Object Exchange Model (CDC OBEX)"
+	depends on USB_CONFIGFS
+	depends on TTY
+	select USB_U_SERIAL
+	select USB_F_OBEX
+	help
+	  You will need a user space OBEX server talking to /dev/ttyGS*,
+	  since the kernel itself doesn't implement the OBEX protocol.
+
+config USB_CONFIGFS_NCM
+	boolean "Network Control Model (CDC NCM)"
+	depends on USB_CONFIGFS
+	depends on NET
+	select USB_U_ETHER
+	select USB_F_NCM
+	help
+	  NCM is an advanced protocol for Ethernet encapsulation, allows
+	  grouping of several ethernet frames into one USB transfer and
+	  different alignment possibilities.
+
+config USB_CONFIGFS_ECM
+	boolean "Ethernet Control Model (CDC ECM)"
+	depends on USB_CONFIGFS
+	depends on NET
+	select USB_U_ETHER
+	select USB_F_ECM
+	help
+	  The "Communication Device Class" (CDC) Ethernet Control Model.
+	  That protocol is often avoided with pure Ethernet adapters, in
+	  favor of simpler vendor-specific hardware, but is widely
+	  supported by firmware for smart network devices.
+
+config USB_CONFIGFS_ECM_SUBSET
+	boolean "Ethernet Control Model (CDC ECM) subset"
+	depends on USB_CONFIGFS
+	depends on NET
+	select USB_U_ETHER
+	select USB_F_SUBSET
+	help
+	  On hardware that can't implement the full protocol,
+	  a simple CDC subset is used, placing fewer demands on USB.
+
+config USB_CONFIGFS_RNDIS
+	bool "RNDIS"
+	depends on USB_CONFIGFS
+	depends on NET
+	select USB_U_ETHER
+	select USB_F_RNDIS
+	help
+	   Microsoft Windows XP bundles the "Remote NDIS" (RNDIS) protocol,
+	   and Microsoft provides redistributable binary RNDIS drivers for
+	   older versions of Windows.
+
+	   To make MS-Windows work with this, use Documentation/usb/linux.inf
+	   as the "driver info file".  For versions of MS-Windows older than
+	   XP, you'll need to download drivers from Microsoft's website; a URL
+	   is given in comments found in that info file.
+
+config USB_CONFIGFS_EEM
+	bool "Ethernet Emulation Model (EEM)"
+	depends on USB_CONFIGFS
+	depends on NET
+	select USB_U_ETHER
+	select USB_F_EEM
+	help
+	  CDC EEM is a newer USB standard that is somewhat simpler than CDC ECM
+	  and therefore can be supported by more hardware.  Technically ECM and
+	  EEM are designed for different applications.  The ECM model extends
+	  the network interface to the target (e.g. a USB cable modem), and the
+	  EEM model is for mobile devices to communicate with hosts using
+	  ethernet over USB.  For Linux gadgets, however, the interface with
+	  the host is the same (a usbX device), so the differences are minimal.
+
+config USB_CONFIGFS_PHONET
+	boolean "Phonet protocol"
+	depends on USB_CONFIGFS
+	depends on NET
+	depends on PHONET
+	select USB_U_ETHER
+	select USB_F_PHONET
+	help
+	  The Phonet protocol implementation for USB device.
+
 config USB_ZERO
 	tristate "Gadget Zero (DEVELOPMENT)"
 	select USB_LIBCOMPOSITE
@@ -603,6 +744,10 @@
 	tristate "Ethernet Gadget (with CDC Ethernet support)"
 	depends on NET
 	select USB_LIBCOMPOSITE
+	select USB_U_ETHER
+	select USB_U_RNDIS
+	select USB_F_ECM
+	select USB_F_SUBSET
 	select CRC32
 	help
 	  This driver implements Ethernet style communication, in one of
@@ -639,6 +784,7 @@
 	bool "RNDIS support"
 	depends on USB_ETH
 	select USB_LIBCOMPOSITE
+	select USB_F_RNDIS
 	default y
 	help
 	   Microsoft Windows XP bundles the "Remote NDIS" (RNDIS) protocol,
@@ -658,6 +804,7 @@
        bool "Ethernet Emulation Model (EEM) support"
        depends on USB_ETH
 	select USB_LIBCOMPOSITE
+	select USB_F_EEM
        default n
        help
          CDC EEM is a newer USB standard that is somewhat simpler than CDC ECM
@@ -675,6 +822,8 @@
 	tristate "Network Control Model (NCM) support"
 	depends on NET
 	select USB_LIBCOMPOSITE
+	select USB_U_ETHER
+	select USB_F_NCM
 	select CRC32
 	help
 	  This driver implements USB CDC NCM subclass standard. NCM is
@@ -718,6 +867,7 @@
 config USB_FUNCTIONFS_ETH
 	bool "Include configuration with CDC ECM (Ethernet)"
 	depends on USB_FUNCTIONFS && NET
+	select USB_U_ETHER
 	help
 	  Include a configuration with CDC ECM function (Ethernet) and the
 	  Function Filesystem.
@@ -725,6 +875,8 @@
 config USB_FUNCTIONFS_RNDIS
 	bool "Include configuration with RNDIS (Ethernet)"
 	depends on USB_FUNCTIONFS && NET
+	select USB_U_ETHER
+	select USB_U_RNDIS
 	help
 	  Include a configuration with RNDIS function (Ethernet) and the Filesystem.
 
@@ -825,7 +977,9 @@
 	depends on NET
 	select USB_LIBCOMPOSITE
 	select USB_U_SERIAL
+	select USB_U_ETHER
 	select USB_F_ACM
+	select USB_F_ECM
 	help
 	  This driver provides two functions in one configuration:
 	  a CDC Ethernet (ECM) link, and a CDC ACM (serial port) link.
@@ -842,7 +996,11 @@
 	depends on PHONET
 	select USB_LIBCOMPOSITE
 	select USB_U_SERIAL
+	select USB_U_ETHER
 	select USB_F_ACM
+	select USB_F_OBEX
+	select USB_F_PHONET
+	select USB_F_ECM
 	help
 	  The Nokia composite gadget provides support for acm, obex
 	  and phonet in only one composite gadget driver.
@@ -869,6 +1027,8 @@
 	select USB_G_MULTI_CDC if !USB_G_MULTI_RNDIS
 	select USB_LIBCOMPOSITE
 	select USB_U_SERIAL
+	select USB_U_ETHER
+	select USB_U_RNDIS
 	select USB_F_ACM
 	help
 	  The Multifunction Composite Gadget provides Ethernet (RNDIS

diff --git a/drivers/usb/gadget/Makefile b/drivers/usb/gadget/Makefile
index 6afd166..bad08e6 100644
--- a/drivers/usb/gadget/Makefile
+++ b/drivers/usb/gadget/Makefile

@@ -33,6 +33,7 @@
 obj-$(CONFIG_USB_MV_UDC)	+= mv_udc.o
 mv_udc-y			:= mv_udc_core.o
 obj-$(CONFIG_USB_FUSB300)	+= fusb300_udc.o
+obj-$(CONFIG_USB_FOTG210_UDC)	+= fotg210-udc.o
 obj-$(CONFIG_USB_MV_U3D)	+= mv_u3d_core.o
 
 # USB Functions
@@ -45,6 +46,21 @@
 obj-$(CONFIG_USB_F_SERIAL)	+= usb_f_serial.o
 usb_f_obex-y			:= f_obex.o
 obj-$(CONFIG_USB_F_OBEX)	+= usb_f_obex.o
+obj-$(CONFIG_USB_U_ETHER)	+= u_ether.o
+u_rndis-y			:= rndis.o
+obj-$(CONFIG_USB_U_RNDIS)	+= u_rndis.o
+usb_f_ncm-y			:= f_ncm.o
+obj-$(CONFIG_USB_F_NCM)		+= usb_f_ncm.o
+usb_f_ecm-y			:= f_ecm.o
+obj-$(CONFIG_USB_F_ECM)		+= usb_f_ecm.o
+usb_f_phonet-y			:= f_phonet.o
+obj-$(CONFIG_USB_F_PHONET)	+= usb_f_phonet.o
+usb_f_eem-y			:= f_eem.o
+obj-$(CONFIG_USB_F_EEM)		+= usb_f_eem.o
+usb_f_ecm_subset-y		:= f_subset.o
+obj-$(CONFIG_USB_F_SUBSET)	+= usb_f_ecm_subset.o
+usb_f_rndis-y			:= f_rndis.o
+obj-$(CONFIG_USB_F_RNDIS)	+= usb_f_rndis.o
 
 #
 # USB gadget drivers

diff --git a/drivers/usb/gadget/cdc2.c b/drivers/usb/gadget/cdc2.c
index 2c52551..5a5acf2 100644
--- a/drivers/usb/gadget/cdc2.c
+++ b/drivers/usb/gadget/cdc2.c

@@ -15,6 +15,7 @@
 
 #include "u_ether.h"
 #include "u_serial.h"
+#include "u_ecm.h"
 
 
 #define DRIVER_DESC		"CDC Composite Gadget"
@@ -32,18 +33,9 @@
 #define CDC_VENDOR_NUM		0x0525	/* NetChip */
 #define CDC_PRODUCT_NUM		0xa4aa	/* CDC Composite: ECM + ACM */
 
-/*-------------------------------------------------------------------------*/
 USB_GADGET_COMPOSITE_OPTIONS();
 
-/*
- * Kbuild is not very cooperative with respect to linking separately
- * compiled library objects into one module.  So for now we won't use
- * separate compilation ... ensuring init/exit sections work to shrink
- * the runtime footprint, and giving us at least some parts of what
- * a "gcc --combine ... part1.c part2.c part3.c ... " build would.
- */
-#include "f_ecm.c"
-#include "u_ether.c"
+USB_ETHERNET_MODULE_PARAMETERS();
 
 /*-------------------------------------------------------------------------*/
 
@@ -102,12 +94,13 @@
 	NULL,
 };
 
-static u8 hostaddr[ETH_ALEN];
-static struct eth_dev *the_dev;
 /*-------------------------------------------------------------------------*/
 static struct usb_function *f_acm;
 static struct usb_function_instance *fi_serial;
 
+static struct usb_function *f_ecm;
+static struct usb_function_instance *fi_ecm;
+
 /*
  * We _always_ have both CDC ECM and CDC ACM functions.
  */
@@ -120,13 +113,27 @@
 		c->bmAttributes |= USB_CONFIG_ATT_WAKEUP;
 	}
 
-	status = ecm_bind_config(c, hostaddr, the_dev);
-	if (status < 0)
-		return status;
+	fi_ecm = usb_get_function_instance("ecm");
+	if (IS_ERR(fi_ecm)) {
+		status = PTR_ERR(fi_ecm);
+		goto err_func_ecm;
+	}
+
+	f_ecm = usb_get_function(fi_ecm);
+	if (IS_ERR(f_ecm)) {
+		status = PTR_ERR(f_ecm);
+		goto err_get_ecm;
+	}
+
+	status = usb_add_function(c, f_ecm);
+	if (status)
+		goto err_add_ecm;
 
 	fi_serial = usb_get_function_instance("acm");
-	if (IS_ERR(fi_serial))
-		return PTR_ERR(fi_serial);
+	if (IS_ERR(fi_serial)) {
+		status = PTR_ERR(fi_serial);
+		goto err_get_acm;
+	}
 
 	f_acm = usb_get_function(fi_serial);
 	if (IS_ERR(f_acm)) {
@@ -136,12 +143,21 @@
 
 	status = usb_add_function(c, f_acm);
 	if (status)
-		goto err_conf;
+		goto err_add_acm;
+
 	return 0;
-err_conf:
+
+err_add_acm:
 	usb_put_function(f_acm);
 err_func_acm:
 	usb_put_function_instance(fi_serial);
+err_get_acm:
+	usb_remove_function(c, f_ecm);
+err_add_ecm:
+	usb_put_function(f_ecm);
+err_get_ecm:
+	usb_put_function_instance(fi_ecm);
+err_func_ecm:
 	return status;
 }
 
@@ -157,6 +173,7 @@
 static int __init cdc_bind(struct usb_composite_dev *cdev)
 {
 	struct usb_gadget	*gadget = cdev->gadget;
+	struct f_ecm_opts	*ecm_opts;
 	int			status;
 
 	if (!can_support_ecm(cdev->gadget)) {
@@ -165,10 +182,23 @@
 		return -EINVAL;
 	}
 
-	/* set up network link layer */
-	the_dev = gether_setup(cdev->gadget, hostaddr);
-	if (IS_ERR(the_dev))
-		return PTR_ERR(the_dev);
+	fi_ecm = usb_get_function_instance("ecm");
+	if (IS_ERR(fi_ecm))
+		return PTR_ERR(fi_ecm);
+
+	ecm_opts = container_of(fi_ecm, struct f_ecm_opts, func_inst);
+
+	gether_set_qmult(ecm_opts->net, qmult);
+	if (!gether_set_host_addr(ecm_opts->net, host_addr))
+		pr_info("using host ethernet address: %s", host_addr);
+	if (!gether_set_dev_addr(ecm_opts->net, dev_addr))
+		pr_info("using self ethernet address: %s", dev_addr);
+
+	fi_serial = usb_get_function_instance("acm");
+	if (IS_ERR(fi_serial)) {
+		status = PTR_ERR(fi_serial);
+		goto fail;
+	}
 
 	/* Allocate string descriptor numbers ... note that string
 	 * contents can be overridden by the composite_dev glue.
@@ -192,7 +222,9 @@
 	return 0;
 
 fail1:
-	gether_cleanup(the_dev);
+	usb_put_function_instance(fi_serial);
+fail:
+	usb_put_function_instance(fi_ecm);
 	return status;
 }
 
@@ -200,7 +232,10 @@
 {
 	usb_put_function(f_acm);
 	usb_put_function_instance(fi_serial);
-	gether_cleanup(the_dev);
+	if (!IS_ERR_OR_NULL(f_ecm))
+		usb_put_function(f_ecm);
+	if (!IS_ERR_OR_NULL(fi_ecm))
+		usb_put_function_instance(fi_ecm);
 	return 0;
 }
 

diff --git a/drivers/usb/gadget/ether.c b/drivers/usb/gadget/ether.c
index 56c8eca..f48712f 100644
--- a/drivers/usb/gadget/ether.c
+++ b/drivers/usb/gadget/ether.c

@@ -14,6 +14,7 @@
 /* #define VERBOSE_DEBUG */
 
 #include <linux/kernel.h>
+#include <linux/netdevice.h>
 
 #if defined USB_ETH_RNDIS
 #  undef USB_ETH_RNDIS
@@ -91,27 +92,23 @@
 #endif
 }
 
-/*-------------------------------------------------------------------------*/
+#include <linux/module.h>
 
-/*
- * Kbuild is not very cooperative with respect to linking separately
- * compiled library objects into one module.  So for now we won't use
- * separate compilation ... ensuring init/exit sections work to shrink
- * the runtime footprint, and giving us at least some parts of what
- * a "gcc --combine ... part1.c part2.c part3.c ... " build would.
- */
-#include "f_ecm.c"
-#include "f_subset.c"
+#include "u_ecm.h"
+#include "u_gether.h"
 #ifdef	USB_ETH_RNDIS
-#include "f_rndis.c"
-#include "rndis.c"
+#include "u_rndis.h"
+#include "rndis.h"
+#else
+#define rndis_borrow_net(...) do {} while (0)
 #endif
-#include "f_eem.c"
-#include "u_ether.c"
+#include "u_eem.h"
 
 /*-------------------------------------------------------------------------*/
 USB_GADGET_COMPOSITE_OPTIONS();
 
+USB_ETHERNET_MODULE_PARAMETERS();
+
 /* DO NOT REUSE THESE IDs with a protocol-incompatible driver!!  Ever!!
  * Instead:  allocate your own, using normal USB-IF procedures.
  */
@@ -206,8 +203,18 @@
 	NULL,
 };
 
-static u8 hostaddr[ETH_ALEN];
-static struct eth_dev *the_dev;
+static struct usb_function_instance *fi_ecm;
+static struct usb_function *f_ecm;
+
+static struct usb_function_instance *fi_eem;
+static struct usb_function *f_eem;
+
+static struct usb_function_instance *fi_geth;
+static struct usb_function *f_geth;
+
+static struct usb_function_instance *fi_rndis;
+static struct usb_function *f_rndis;
+
 /*-------------------------------------------------------------------------*/
 
 /*
@@ -217,6 +224,8 @@
  */
 static int __init rndis_do_config(struct usb_configuration *c)
 {
+	int status;
+
 	/* FIXME alloc iConfiguration string, set it in c->strings */
 
 	if (gadget_is_otg(c->cdev->gadget)) {
@@ -224,7 +233,15 @@
 		c->bmAttributes |= USB_CONFIG_ATT_WAKEUP;
 	}
 
-	return rndis_bind_config(c, hostaddr, the_dev);
+	f_rndis = usb_get_function(fi_rndis);
+	if (IS_ERR(f_rndis))
+		return PTR_ERR(f_rndis);
+
+	status = usb_add_function(c, f_rndis);
+	if (status < 0)
+		usb_put_function(f_rndis);
+
+	return status;
 }
 
 static struct usb_configuration rndis_config_driver = {
@@ -249,6 +266,8 @@
  */
 static int __init eth_do_config(struct usb_configuration *c)
 {
+	int status = 0;
+
 	/* FIXME alloc iConfiguration string, set it in c->strings */
 
 	if (gadget_is_otg(c->cdev->gadget)) {
@@ -256,12 +275,38 @@
 		c->bmAttributes |= USB_CONFIG_ATT_WAKEUP;
 	}
 
-	if (use_eem)
-		return eem_bind_config(c, the_dev);
-	else if (can_support_ecm(c->cdev->gadget))
-		return ecm_bind_config(c, hostaddr, the_dev);
-	else
-		return geth_bind_config(c, hostaddr, the_dev);
+	if (use_eem) {
+		f_eem = usb_get_function(fi_eem);
+		if (IS_ERR(f_eem))
+			return PTR_ERR(f_eem);
+
+		status = usb_add_function(c, f_eem);
+		if (status < 0)
+			usb_put_function(f_eem);
+
+		return status;
+	} else if (can_support_ecm(c->cdev->gadget)) {
+		f_ecm = usb_get_function(fi_ecm);
+		if (IS_ERR(f_ecm))
+			return PTR_ERR(f_ecm);
+
+		status = usb_add_function(c, f_ecm);
+		if (status < 0)
+			usb_put_function(f_ecm);
+
+		return status;
+	} else {
+		f_geth = usb_get_function(fi_geth);
+		if (IS_ERR(f_geth))
+			return PTR_ERR(f_geth);
+
+		status = usb_add_function(c, f_geth);
+		if (status < 0)
+			usb_put_function(f_geth);
+
+		return status;
+	}
+
 }
 
 static struct usb_configuration eth_config_driver = {
@@ -276,24 +321,50 @@
 static int __init eth_bind(struct usb_composite_dev *cdev)
 {
 	struct usb_gadget	*gadget = cdev->gadget;
+	struct f_eem_opts	*eem_opts = NULL;
+	struct f_ecm_opts	*ecm_opts = NULL;
+	struct f_gether_opts	*geth_opts = NULL;
+	struct net_device	*net;
 	int			status;
 
-	/* set up network link layer */
-	the_dev = gether_setup(cdev->gadget, hostaddr);
-	if (IS_ERR(the_dev))
-		return PTR_ERR(the_dev);
-
 	/* set up main config label and device descriptor */
 	if (use_eem) {
 		/* EEM */
+		fi_eem = usb_get_function_instance("eem");
+		if (IS_ERR(fi_eem))
+			return PTR_ERR(fi_eem);
+
+		eem_opts = container_of(fi_eem, struct f_eem_opts, func_inst);
+
+		net = eem_opts->net;
+
 		eth_config_driver.label = "CDC Ethernet (EEM)";
 		device_desc.idVendor = cpu_to_le16(EEM_VENDOR_NUM);
 		device_desc.idProduct = cpu_to_le16(EEM_PRODUCT_NUM);
-	} else if (can_support_ecm(cdev->gadget)) {
+	} else if (can_support_ecm(gadget)) {
 		/* ECM */
+
+		fi_ecm = usb_get_function_instance("ecm");
+		if (IS_ERR(fi_ecm))
+			return PTR_ERR(fi_ecm);
+
+		ecm_opts = container_of(fi_ecm, struct f_ecm_opts, func_inst);
+
+		net = ecm_opts->net;
+
 		eth_config_driver.label = "CDC Ethernet (ECM)";
 	} else {
 		/* CDC Subset */
+
+		fi_geth = usb_get_function_instance("geth");
+		if (IS_ERR(fi_geth))
+			return PTR_ERR(fi_geth);
+
+		geth_opts = container_of(fi_geth, struct f_gether_opts,
+					 func_inst);
+
+		net = geth_opts->net;
+
 		eth_config_driver.label = "CDC Subset/SAFE";
 
 		device_desc.idVendor = cpu_to_le16(SIMPLE_VENDOR_NUM);
@@ -302,8 +373,34 @@
 			device_desc.bDeviceClass = USB_CLASS_VENDOR_SPEC;
 	}
 
+	gether_set_qmult(net, qmult);
+	if (!gether_set_host_addr(net, host_addr))
+		pr_info("using host ethernet address: %s", host_addr);
+	if (!gether_set_dev_addr(net, dev_addr))
+		pr_info("using self ethernet address: %s", dev_addr);
+
 	if (has_rndis()) {
 		/* RNDIS plus ECM-or-Subset */
+		gether_set_gadget(net, cdev->gadget);
+		status = gether_register_netdev(net);
+		if (status)
+			goto fail;
+
+		if (use_eem)
+			eem_opts->bound = true;
+		else if (can_support_ecm(gadget))
+			ecm_opts->bound = true;
+		else
+			geth_opts->bound = true;
+
+		fi_rndis = usb_get_function_instance("rndis");
+		if (IS_ERR(fi_rndis)) {
+			status = PTR_ERR(fi_rndis);
+			goto fail;
+		}
+
+		rndis_borrow_net(fi_rndis, net);
+
 		device_desc.idVendor = cpu_to_le16(RNDIS_VENDOR_NUM);
 		device_desc.idProduct = cpu_to_le16(RNDIS_PRODUCT_NUM);
 		device_desc.bNumConfigurations = 2;
@@ -315,7 +412,7 @@
 
 	status = usb_string_ids_tab(cdev, strings_dev);
 	if (status < 0)
-		goto fail;
+		goto fail1;
 	device_desc.iManufacturer = strings_dev[USB_GADGET_MANUFACTURER_IDX].id;
 	device_desc.iProduct = strings_dev[USB_GADGET_PRODUCT_IDX].id;
 
@@ -324,12 +421,12 @@
 		status = usb_add_config(cdev, &rndis_config_driver,
 				rndis_do_config);
 		if (status < 0)
-			goto fail;
+			goto fail1;
 	}
 
 	status = usb_add_config(cdev, &eth_config_driver, eth_do_config);
 	if (status < 0)
-		goto fail;
+		goto fail1;
 
 	usb_composite_overwrite_options(cdev, &coverwrite);
 	dev_info(&gadget->dev, "%s, version: " DRIVER_VERSION "\n",
@@ -337,14 +434,29 @@
 
 	return 0;
 
+fail1:
+	if (has_rndis())
+		usb_put_function_instance(fi_rndis);
 fail:
-	gether_cleanup(the_dev);
+	if (use_eem)
+		usb_put_function_instance(fi_eem);
+	else if (can_support_ecm(gadget))
+		usb_put_function_instance(fi_ecm);
+	else
+		usb_put_function_instance(fi_geth);
 	return status;
 }
 
 static int __exit eth_unbind(struct usb_composite_dev *cdev)
 {
-	gether_cleanup(the_dev);
+	if (has_rndis())
+		usb_put_function_instance(fi_rndis);
+	if (use_eem)
+		usb_put_function_instance(fi_eem);
+	else if (can_support_ecm(cdev->gadget))
+		usb_put_function_instance(fi_ecm);
+	else
+		usb_put_function_instance(fi_geth);
 	return 0;
 }
 

diff --git a/drivers/usb/gadget/f_ecm.c b/drivers/usb/gadget/f_ecm.c
index abf8a31..5d3561e 100644
--- a/drivers/usb/gadget/f_ecm.c
+++ b/drivers/usb/gadget/f_ecm.c

@@ -14,10 +14,13 @@
 
 #include <linux/slab.h>
 #include <linux/kernel.h>
+#include <linux/module.h>
 #include <linux/device.h>
 #include <linux/etherdevice.h>
 
 #include "u_ether.h"
+#include "u_ether_configfs.h"
+#include "u_ecm.h"
 
 
 /*
@@ -684,9 +687,44 @@
 {
 	struct usb_composite_dev *cdev = c->cdev;
 	struct f_ecm		*ecm = func_to_ecm(f);
+	struct usb_string	*us;
 	int			status;
 	struct usb_ep		*ep;
 
+#ifndef USBF_ECM_INCLUDED
+	struct f_ecm_opts	*ecm_opts;
+
+	if (!can_support_ecm(cdev->gadget))
+		return -EINVAL;
+
+	ecm_opts = container_of(f->fi, struct f_ecm_opts, func_inst);
+
+	/*
+	 * in drivers/usb/gadget/configfs.c:configfs_composite_bind()
+	 * configurations are bound in sequence with list_for_each_entry,
+	 * in each configuration its functions are bound in sequence
+	 * with list_for_each_entry, so we assume no race condition
+	 * with regard to ecm_opts->bound access
+	 */
+	if (!ecm_opts->bound) {
+		mutex_lock(&ecm_opts->lock);
+		gether_set_gadget(ecm_opts->net, cdev->gadget);
+		status = gether_register_netdev(ecm_opts->net);
+		mutex_unlock(&ecm_opts->lock);
+		if (status)
+			return status;
+		ecm_opts->bound = true;
+	}
+#endif
+	us = usb_gstrings_attach(cdev, ecm_strings,
+				 ARRAY_SIZE(ecm_string_defs));
+	if (IS_ERR(us))
+		return PTR_ERR(us);
+	ecm_control_intf.iInterface = us[0].id;
+	ecm_data_intf.iInterface = us[2].id;
+	ecm_desc.iMACAddress = us[1].id;
+	ecm_iad_descriptor.iFunction = us[3].id;
+
 	/* allocate instance-specific interface IDs */
 	status = usb_interface_id(c, f);
 	if (status < 0)
@@ -796,14 +834,15 @@
 	return status;
 }
 
+#ifdef USBF_ECM_INCLUDED
+
 static void
-ecm_unbind(struct usb_configuration *c, struct usb_function *f)
+ecm_old_unbind(struct usb_configuration *c, struct usb_function *f)
 {
 	struct f_ecm		*ecm = func_to_ecm(f);
 
 	DBG(c->cdev, "ecm unbind\n");
 
-	ecm_string_defs[0].id = 0;
 	usb_free_all_descriptors(f);
 
 	kfree(ecm->notify_req->buf);
@@ -834,17 +873,6 @@
 	if (!can_support_ecm(c->cdev->gadget) || !ethaddr)
 		return -EINVAL;
 
-	if (ecm_string_defs[0].id == 0) {
-		status = usb_string_ids_tab(c->cdev, ecm_string_defs);
-		if (status)
-			return status;
-
-		ecm_control_intf.iInterface = ecm_string_defs[0].id;
-		ecm_data_intf.iInterface = ecm_string_defs[2].id;
-		ecm_desc.iMACAddress = ecm_string_defs[1].id;
-		ecm_iad_descriptor.iFunction = ecm_string_defs[3].id;
-	}
-
 	/* allocate and initialize one new instance */
 	ecm = kzalloc(sizeof *ecm, GFP_KERNEL);
 	if (!ecm)
@@ -858,10 +886,9 @@
 	ecm->port.cdc_filter = DEFAULT_FILTER;
 
 	ecm->port.func.name = "cdc_ethernet";
-	ecm->port.func.strings = ecm_strings;
 	/* descriptors are per-instance copies */
 	ecm->port.func.bind = ecm_bind;
-	ecm->port.func.unbind = ecm_unbind;
+	ecm->port.func.unbind = ecm_old_unbind;
 	ecm->port.func.set_alt = ecm_set_alt;
 	ecm->port.func.get_alt = ecm_get_alt;
 	ecm->port.func.setup = ecm_setup;
@@ -872,3 +899,143 @@
 		kfree(ecm);
 	return status;
 }
+
+#else
+
+static inline struct f_ecm_opts *to_f_ecm_opts(struct config_item *item)
+{
+	return container_of(to_config_group(item), struct f_ecm_opts,
+			    func_inst.group);
+}
+
+/* f_ecm_item_ops */
+USB_ETHERNET_CONFIGFS_ITEM(ecm);
+
+/* f_ecm_opts_dev_addr */
+USB_ETHERNET_CONFIGFS_ITEM_ATTR_DEV_ADDR(ecm);
+
+/* f_ecm_opts_host_addr */
+USB_ETHERNET_CONFIGFS_ITEM_ATTR_HOST_ADDR(ecm);
+
+/* f_ecm_opts_qmult */
+USB_ETHERNET_CONFIGFS_ITEM_ATTR_QMULT(ecm);
+
+/* f_ecm_opts_ifname */
+USB_ETHERNET_CONFIGFS_ITEM_ATTR_IFNAME(ecm);
+
+static struct configfs_attribute *ecm_attrs[] = {
+	&f_ecm_opts_dev_addr.attr,
+	&f_ecm_opts_host_addr.attr,
+	&f_ecm_opts_qmult.attr,
+	&f_ecm_opts_ifname.attr,
+	NULL,
+};
+
+static struct config_item_type ecm_func_type = {
+	.ct_item_ops	= &ecm_item_ops,
+	.ct_attrs	= ecm_attrs,
+	.ct_owner	= THIS_MODULE,
+};
+
+static void ecm_free_inst(struct usb_function_instance *f)
+{
+	struct f_ecm_opts *opts;
+
+	opts = container_of(f, struct f_ecm_opts, func_inst);
+	if (opts->bound)
+		gether_cleanup(netdev_priv(opts->net));
+	else
+		free_netdev(opts->net);
+	kfree(opts);
+}
+
+static struct usb_function_instance *ecm_alloc_inst(void)
+{
+	struct f_ecm_opts *opts;
+
+	opts = kzalloc(sizeof(*opts), GFP_KERNEL);
+	if (!opts)
+		return ERR_PTR(-ENOMEM);
+	mutex_init(&opts->lock);
+	opts->func_inst.free_func_inst = ecm_free_inst;
+	opts->net = gether_setup_default();
+	if (IS_ERR(opts->net))
+		return ERR_PTR(PTR_ERR(opts->net));
+
+	config_group_init_type_name(&opts->func_inst.group, "", &ecm_func_type);
+
+	return &opts->func_inst;
+}
+
+static void ecm_free(struct usb_function *f)
+{
+	struct f_ecm *ecm;
+	struct f_ecm_opts *opts;
+
+	ecm = func_to_ecm(f);
+	opts = container_of(f->fi, struct f_ecm_opts, func_inst);
+	kfree(ecm);
+	mutex_lock(&opts->lock);
+	opts->refcnt--;
+	mutex_unlock(&opts->lock);
+}
+
+static void ecm_unbind(struct usb_configuration *c, struct usb_function *f)
+{
+	struct f_ecm		*ecm = func_to_ecm(f);
+
+	DBG(c->cdev, "ecm unbind\n");
+
+	usb_free_all_descriptors(f);
+
+	kfree(ecm->notify_req->buf);
+	usb_ep_free_request(ecm->notify, ecm->notify_req);
+}
+
+struct usb_function *ecm_alloc(struct usb_function_instance *fi)
+{
+	struct f_ecm	*ecm;
+	struct f_ecm_opts *opts;
+	int status;
+
+	/* allocate and initialize one new instance */
+	ecm = kzalloc(sizeof(*ecm), GFP_KERNEL);
+	if (!ecm)
+		return ERR_PTR(-ENOMEM);
+
+	opts = container_of(fi, struct f_ecm_opts, func_inst);
+	mutex_lock(&opts->lock);
+	opts->refcnt++;
+
+	/* export host's Ethernet address in CDC format */
+	status = gether_get_host_addr_cdc(opts->net, ecm->ethaddr,
+					  sizeof(ecm->ethaddr));
+	if (status < 12) {
+		kfree(ecm);
+		mutex_unlock(&opts->lock);
+		return ERR_PTR(-EINVAL);
+	}
+	ecm_string_defs[1].s = ecm->ethaddr;
+
+	ecm->port.ioport = netdev_priv(opts->net);
+	mutex_unlock(&opts->lock);
+	ecm->port.cdc_filter = DEFAULT_FILTER;
+
+	ecm->port.func.name = "cdc_ethernet";
+	/* descriptors are per-instance copies */
+	ecm->port.func.bind = ecm_bind;
+	ecm->port.func.unbind = ecm_unbind;
+	ecm->port.func.set_alt = ecm_set_alt;
+	ecm->port.func.get_alt = ecm_get_alt;
+	ecm->port.func.setup = ecm_setup;
+	ecm->port.func.disable = ecm_disable;
+	ecm->port.func.free_func = ecm_free;
+
+	return &ecm->port.func;
+}
+
+DECLARE_USB_FUNCTION_INIT(ecm, ecm_alloc_inst, ecm_alloc);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("David Brownell");
+
+#endif

diff --git a/drivers/usb/gadget/f_eem.c b/drivers/usb/gadget/f_eem.c
index f4e0bbe..90ee802 100644
--- a/drivers/usb/gadget/f_eem.c
+++ b/drivers/usb/gadget/f_eem.c

@@ -12,12 +12,15 @@
  */
 
 #include <linux/kernel.h>
+#include <linux/module.h>
 #include <linux/device.h>
 #include <linux/etherdevice.h>
 #include <linux/crc32.h>
 #include <linux/slab.h>
 
 #include "u_ether.h"
+#include "u_ether_configfs.h"
+#include "u_eem.h"
 
 #define EEM_HLEN 2
 
@@ -40,7 +43,7 @@
 
 /* interface descriptor: */
 
-static struct usb_interface_descriptor eem_intf __initdata = {
+static struct usb_interface_descriptor eem_intf = {
 	.bLength =		sizeof eem_intf,
 	.bDescriptorType =	USB_DT_INTERFACE,
 
@@ -54,7 +57,7 @@
 
 /* full speed support: */
 
-static struct usb_endpoint_descriptor eem_fs_in_desc __initdata = {
+static struct usb_endpoint_descriptor eem_fs_in_desc = {
 	.bLength =		USB_DT_ENDPOINT_SIZE,
 	.bDescriptorType =	USB_DT_ENDPOINT,
 
@@ -62,7 +65,7 @@
 	.bmAttributes =		USB_ENDPOINT_XFER_BULK,
 };
 
-static struct usb_endpoint_descriptor eem_fs_out_desc __initdata = {
+static struct usb_endpoint_descriptor eem_fs_out_desc = {
 	.bLength =		USB_DT_ENDPOINT_SIZE,
 	.bDescriptorType =	USB_DT_ENDPOINT,
 
@@ -70,7 +73,7 @@
 	.bmAttributes =		USB_ENDPOINT_XFER_BULK,
 };
 
-static struct usb_descriptor_header *eem_fs_function[] __initdata = {
+static struct usb_descriptor_header *eem_fs_function[] = {
 	/* CDC EEM control descriptors */
 	(struct usb_descriptor_header *) &eem_intf,
 	(struct usb_descriptor_header *) &eem_fs_in_desc,
@@ -80,7 +83,7 @@
 
 /* high speed support: */
 
-static struct usb_endpoint_descriptor eem_hs_in_desc __initdata = {
+static struct usb_endpoint_descriptor eem_hs_in_desc = {
 	.bLength =		USB_DT_ENDPOINT_SIZE,
 	.bDescriptorType =	USB_DT_ENDPOINT,
 
@@ -89,7 +92,7 @@
 	.wMaxPacketSize =	cpu_to_le16(512),
 };
 
-static struct usb_endpoint_descriptor eem_hs_out_desc __initdata = {
+static struct usb_endpoint_descriptor eem_hs_out_desc = {
 	.bLength =		USB_DT_ENDPOINT_SIZE,
 	.bDescriptorType =	USB_DT_ENDPOINT,
 
@@ -98,7 +101,7 @@
 	.wMaxPacketSize =	cpu_to_le16(512),
 };
 
-static struct usb_descriptor_header *eem_hs_function[] __initdata = {
+static struct usb_descriptor_header *eem_hs_function[] = {
 	/* CDC EEM control descriptors */
 	(struct usb_descriptor_header *) &eem_intf,
 	(struct usb_descriptor_header *) &eem_hs_in_desc,
@@ -108,7 +111,7 @@
 
 /* super speed support: */
 
-static struct usb_endpoint_descriptor eem_ss_in_desc __initdata = {
+static struct usb_endpoint_descriptor eem_ss_in_desc = {
 	.bLength =		USB_DT_ENDPOINT_SIZE,
 	.bDescriptorType =	USB_DT_ENDPOINT,
 
@@ -117,7 +120,7 @@
 	.wMaxPacketSize =	cpu_to_le16(1024),
 };
 
-static struct usb_endpoint_descriptor eem_ss_out_desc __initdata = {
+static struct usb_endpoint_descriptor eem_ss_out_desc = {
 	.bLength =		USB_DT_ENDPOINT_SIZE,
 	.bDescriptorType =	USB_DT_ENDPOINT,
 
@@ -126,7 +129,7 @@
 	.wMaxPacketSize =	cpu_to_le16(1024),
 };
 
-static struct usb_ss_ep_comp_descriptor eem_ss_bulk_comp_desc __initdata = {
+static struct usb_ss_ep_comp_descriptor eem_ss_bulk_comp_desc = {
 	.bLength =		sizeof eem_ss_bulk_comp_desc,
 	.bDescriptorType =	USB_DT_SS_ENDPOINT_COMP,
 
@@ -135,7 +138,7 @@
 	/* .bmAttributes =	0, */
 };
 
-static struct usb_descriptor_header *eem_ss_function[] __initdata = {
+static struct usb_descriptor_header *eem_ss_function[] = {
 	/* CDC EEM control descriptors */
 	(struct usb_descriptor_header *) &eem_intf,
 	(struct usb_descriptor_header *) &eem_ss_in_desc,
@@ -242,14 +245,40 @@
 
 /* EEM function driver setup/binding */
 
-static int __init
-eem_bind(struct usb_configuration *c, struct usb_function *f)
+static int eem_bind(struct usb_configuration *c, struct usb_function *f)
 {
 	struct usb_composite_dev *cdev = c->cdev;
 	struct f_eem		*eem = func_to_eem(f);
+	struct usb_string	*us;
 	int			status;
 	struct usb_ep		*ep;
 
+	struct f_eem_opts	*eem_opts;
+
+	eem_opts = container_of(f->fi, struct f_eem_opts, func_inst);
+	/*
+	 * in drivers/usb/gadget/configfs.c:configfs_composite_bind()
+	 * configurations are bound in sequence with list_for_each_entry,
+	 * in each configuration its functions are bound in sequence
+	 * with list_for_each_entry, so we assume no race condition
+	 * with regard to eem_opts->bound access
+	 */
+	if (!eem_opts->bound) {
+		mutex_lock(&eem_opts->lock);
+		gether_set_gadget(eem_opts->net, cdev->gadget);
+		status = gether_register_netdev(eem_opts->net);
+		mutex_unlock(&eem_opts->lock);
+		if (status)
+			return status;
+		eem_opts->bound = true;
+	}
+
+	us = usb_gstrings_attach(cdev, eem_strings,
+				 ARRAY_SIZE(eem_string_defs));
+	if (IS_ERR(us))
+		return PTR_ERR(us);
+	eem_intf.iInterface = us[0].id;
+
 	/* allocate instance-specific interface IDs */
 	status = usb_interface_id(c, f);
 	if (status < 0)
@@ -307,17 +336,6 @@
 	return status;
 }
 
-static void
-eem_unbind(struct usb_configuration *c, struct usb_function *f)
-{
-	struct f_eem	*eem = func_to_eem(f);
-
-	DBG(c->cdev, "eem unbind\n");
-
-	usb_free_all_descriptors(f);
-	kfree(eem);
-}
-
 static void eem_cmd_complete(struct usb_ep *ep, struct usb_request *req)
 {
 	struct sk_buff *skb = (struct sk_buff *)req->context;
@@ -518,55 +536,124 @@
 	return status;
 }
 
-/**
- * eem_bind_config - add CDC Ethernet (EEM) network link to a configuration
- * @c: the configuration to support the network link
- * Context: single threaded during gadget setup
- *
- * Returns zero on success, else negative errno.
- *
- * Caller must have called @gether_setup().  Caller is also responsible
- * for calling @gether_cleanup() before module unload.
- */
-int __init eem_bind_config(struct usb_configuration *c, struct eth_dev *dev)
+/* Map a configfs item back to the f_eem_opts whose func_inst.group owns it. */
+static inline struct f_eem_opts *to_f_eem_opts(struct config_item *item)
+{
+	struct config_group *grp = to_config_group(item);
+
+	return container_of(grp, struct f_eem_opts, func_inst.group);
+}
+
+/* f_eem_item_ops */
+USB_ETHERNET_CONFIGFS_ITEM(eem);
+
+/* f_eem_opts_dev_addr */
+USB_ETHERNET_CONFIGFS_ITEM_ATTR_DEV_ADDR(eem);
+
+/* f_eem_opts_host_addr */
+USB_ETHERNET_CONFIGFS_ITEM_ATTR_HOST_ADDR(eem);
+
+/* f_eem_opts_qmult */
+USB_ETHERNET_CONFIGFS_ITEM_ATTR_QMULT(eem);
+
+/* f_eem_opts_ifname */
+USB_ETHERNET_CONFIGFS_ITEM_ATTR_IFNAME(eem);
+
+static struct configfs_attribute *eem_attrs[] = {
+	&f_eem_opts_dev_addr.attr,
+	&f_eem_opts_host_addr.attr,
+	&f_eem_opts_qmult.attr,
+	&f_eem_opts_ifname.attr,
+	NULL,
+};
+
+static struct config_item_type eem_func_type = {
+	.ct_item_ops	= &eem_item_ops,
+	.ct_attrs	= eem_attrs,
+	.ct_owner	= THIS_MODULE,
+};
+
+/*
+ * Destroy an EEM function instance together with its net device.
+ * ->bound is set by eem_bind() once gether_register_netdev() has
+ * succeeded; in that case the device goes through gether_cleanup(),
+ * otherwise it is simply handed to free_netdev().
+ */
+static void eem_free_inst(struct usb_function_instance *f)
+{
+	struct f_eem_opts *eem_opts =
+		container_of(f, struct f_eem_opts, func_inst);
+
+	if (!eem_opts->bound)
+		free_netdev(eem_opts->net);
+	else
+		gether_cleanup(netdev_priv(eem_opts->net));
+
+	kfree(eem_opts);
+}
+
+/*
+ * eem_alloc_inst - allocate the options backing one EEM function instance
+ *
+ * Allocates a struct f_eem_opts, creates its net device through
+ * gether_setup_default() and initialises the configfs group.
+ *
+ * Returns the embedded usb_function_instance, ERR_PTR(-ENOMEM) when the
+ * options cannot be allocated, or the error pointer returned by
+ * gether_setup_default().  Nothing stays allocated on failure.
+ */
+static struct usb_function_instance *eem_alloc_inst(void)
+{
+	struct f_eem_opts *opts;
+
+	opts = kzalloc(sizeof(*opts), GFP_KERNEL);
+	if (!opts)
+		return ERR_PTR(-ENOMEM);
+	mutex_init(&opts->lock);
+	opts->func_inst.free_func_inst = eem_free_inst;
+	opts->net = gether_setup_default();
+	if (IS_ERR(opts->net)) {
+		/* keep the error code, then release opts instead of leaking it */
+		struct net_device *net = opts->net;
+
+		kfree(opts);
+		return ERR_CAST(net);
+	}
+
+	config_group_init_type_name(&opts->func_inst.group, "", &eem_func_type);
+
+	return &opts->func_inst;
+}
+
+/*
+ * Release one EEM function: free the f_eem that embeds @f and drop the
+ * count it added to opts->refcnt when it was allocated.
+ */
+static void eem_free(struct usb_function *f)
+{
+	/* look up the options first; @f is not touched after the kfree() */
+	struct f_eem_opts *eem_opts =
+		container_of(f->fi, struct f_eem_opts, func_inst);
+
+	kfree(func_to_eem(f));
+
+	mutex_lock(&eem_opts->lock);
+	eem_opts->refcnt--;
+	mutex_unlock(&eem_opts->lock);
+}
+
+/*
+ * Release the descriptors attached to @f.  The f_eem structure itself is
+ * not freed here; that happens in eem_free().
+ */
+static void eem_unbind(struct usb_configuration *c, struct usb_function *f)
+{
+	struct usb_composite_dev *cdev = c->cdev;
+
+	DBG(cdev, "eem unbind\n");
+
+	usb_free_all_descriptors(f);
+}
+
+struct usb_function *eem_alloc(struct usb_function_instance *fi)
 {
 	struct f_eem	*eem;
-	int		status;
-
-	/* maybe allocate device-global string IDs */
-	if (eem_string_defs[0].id == 0) {
-
-		/* control interface label */
-		status = usb_string_id(c->cdev);
-		if (status < 0)
-			return status;
-		eem_string_defs[0].id = status;
-		eem_intf.iInterface = status;
-	}
+	struct f_eem_opts *opts;
 
 	/* allocate and initialize one new instance */
-	eem = kzalloc(sizeof *eem, GFP_KERNEL);
+	eem = kzalloc(sizeof(*eem), GFP_KERNEL);
 	if (!eem)
-		return -ENOMEM;
+		return ERR_PTR(-ENOMEM);
 
-	eem->port.ioport = dev;
+	opts = container_of(fi, struct f_eem_opts, func_inst);
+	mutex_lock(&opts->lock);
+	opts->refcnt++;
+
+	eem->port.ioport = netdev_priv(opts->net);
+	mutex_unlock(&opts->lock);
 	eem->port.cdc_filter = DEFAULT_FILTER;
 
 	eem->port.func.name = "cdc_eem";
-	eem->port.func.strings = eem_strings;
 	/* descriptors are per-instance copies */
 	eem->port.func.bind = eem_bind;
 	eem->port.func.unbind = eem_unbind;
 	eem->port.func.set_alt = eem_set_alt;
 	eem->port.func.setup = eem_setup;
 	eem->port.func.disable = eem_disable;
+	eem->port.func.free_func = eem_free;
 	eem->port.wrap = eem_wrap;
 	eem->port.unwrap = eem_unwrap;
 	eem->port.header_len = EEM_HLEN;
 
-	status = usb_add_function(c, &eem->port.func);
-	if (status)
-		kfree(eem);
-	return status;
+	return &eem->port.func;
 }
 
+DECLARE_USB_FUNCTION_INIT(eem, eem_alloc_inst, eem_alloc);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("David Brownell");

diff --git a/drivers/usb/gadget/f_mass_storage.c b/drivers/usb/gadget/f_mass_storage.c
index 97666e8..56f1fd1 100644
--- a/drivers/usb/gadget/f_mass_storage.c
+++ b/drivers/usb/gadget/f_mass_storage.c

@@ -413,6 +413,7 @@
 /* Caller must hold fsg->lock */
 static void wakeup_thread(struct fsg_common *common)
 {
+	smp_wmb();	/* ensure the write of bh->state is complete */
 	/* Tell the main thread that something has happened */
 	common->thread_wakeup_needed = 1;
 	if (common->thread_task)
@@ -632,6 +633,7 @@
 	}
 	__set_current_state(TASK_RUNNING);
 	common->thread_wakeup_needed = 0;
+	smp_rmb();	/* ensure the latest bh->state is visible */
 	return rc;
 }
 
@@ -2745,8 +2747,8 @@
 		 "%-8s%-16s%04x", cfg->vendor_name ?: "Linux",
 		 /* Assume product name dependent on the first LUN */
 		 cfg->product_name ?: (common->luns->cdrom
-				     ? "File-Stor Gadget"
-				     : "File-CD Gadget"),
+				     ? "File-CD Gadget"
+				     : "File-Stor Gadget"),
 		 i);
 
 	/*

diff --git a/drivers/usb/gadget/f_ncm.c b/drivers/usb/gadget/f_ncm.c
index ee19bc8..952177f 100644
--- a/drivers/usb/gadget/f_ncm.c
+++ b/drivers/usb/gadget/f_ncm.c

@@ -16,6 +16,7 @@
  */
 
 #include <linux/kernel.h>
+#include <linux/module.h>
 #include <linux/device.h>
 #include <linux/etherdevice.h>
 #include <linux/crc32.h>
@@ -23,6 +24,8 @@
 #include <linux/usb/cdc.h>
 
 #include "u_ether.h"
+#include "u_ether_configfs.h"
+#include "u_ncm.h"
 
 /*
  * This function is a "CDC Network Control Model" (CDC NCM) Ethernet link.
@@ -125,7 +128,7 @@
 #define NCM_STATUS_INTERVAL_MS		32
 #define NCM_STATUS_BYTECOUNT		16	/* 8 byte header + data */
 
-static struct usb_interface_assoc_descriptor ncm_iad_desc __initdata = {
+static struct usb_interface_assoc_descriptor ncm_iad_desc = {
 	.bLength =		sizeof ncm_iad_desc,
 	.bDescriptorType =	USB_DT_INTERFACE_ASSOCIATION,
 
@@ -139,7 +142,7 @@
 
 /* interface descriptor: */
 
-static struct usb_interface_descriptor ncm_control_intf __initdata = {
+static struct usb_interface_descriptor ncm_control_intf = {
 	.bLength =		sizeof ncm_control_intf,
 	.bDescriptorType =	USB_DT_INTERFACE,
 
@@ -151,7 +154,7 @@
 	/* .iInterface = DYNAMIC */
 };
 
-static struct usb_cdc_header_desc ncm_header_desc __initdata = {
+static struct usb_cdc_header_desc ncm_header_desc = {
 	.bLength =		sizeof ncm_header_desc,
 	.bDescriptorType =	USB_DT_CS_INTERFACE,
 	.bDescriptorSubType =	USB_CDC_HEADER_TYPE,
@@ -159,7 +162,7 @@
 	.bcdCDC =		cpu_to_le16(0x0110),
 };
 
-static struct usb_cdc_union_desc ncm_union_desc __initdata = {
+static struct usb_cdc_union_desc ncm_union_desc = {
 	.bLength =		sizeof(ncm_union_desc),
 	.bDescriptorType =	USB_DT_CS_INTERFACE,
 	.bDescriptorSubType =	USB_CDC_UNION_TYPE,
@@ -167,7 +170,7 @@
 	/* .bSlaveInterface0 =	DYNAMIC */
 };
 
-static struct usb_cdc_ether_desc ecm_desc __initdata = {
+static struct usb_cdc_ether_desc ecm_desc = {
 	.bLength =		sizeof ecm_desc,
 	.bDescriptorType =	USB_DT_CS_INTERFACE,
 	.bDescriptorSubType =	USB_CDC_ETHERNET_TYPE,
@@ -182,7 +185,7 @@
 
 #define NCAPS	(USB_CDC_NCM_NCAP_ETH_FILTER | USB_CDC_NCM_NCAP_CRC_MODE)
 
-static struct usb_cdc_ncm_desc ncm_desc __initdata = {
+static struct usb_cdc_ncm_desc ncm_desc = {
 	.bLength =		sizeof ncm_desc,
 	.bDescriptorType =	USB_DT_CS_INTERFACE,
 	.bDescriptorSubType =	USB_CDC_NCM_TYPE,
@@ -194,7 +197,7 @@
 
 /* the default data interface has no endpoints ... */
 
-static struct usb_interface_descriptor ncm_data_nop_intf __initdata = {
+static struct usb_interface_descriptor ncm_data_nop_intf = {
 	.bLength =		sizeof ncm_data_nop_intf,
 	.bDescriptorType =	USB_DT_INTERFACE,
 
@@ -209,7 +212,7 @@
 
 /* ... but the "real" data interface has two bulk endpoints */
 
-static struct usb_interface_descriptor ncm_data_intf __initdata = {
+static struct usb_interface_descriptor ncm_data_intf = {
 	.bLength =		sizeof ncm_data_intf,
 	.bDescriptorType =	USB_DT_INTERFACE,
 
@@ -224,7 +227,7 @@
 
 /* full speed support: */
 
-static struct usb_endpoint_descriptor fs_ncm_notify_desc __initdata = {
+static struct usb_endpoint_descriptor fs_ncm_notify_desc = {
 	.bLength =		USB_DT_ENDPOINT_SIZE,
 	.bDescriptorType =	USB_DT_ENDPOINT,
 
@@ -234,7 +237,7 @@
 	.bInterval =		NCM_STATUS_INTERVAL_MS,
 };
 
-static struct usb_endpoint_descriptor fs_ncm_in_desc __initdata = {
+static struct usb_endpoint_descriptor fs_ncm_in_desc = {
 	.bLength =		USB_DT_ENDPOINT_SIZE,
 	.bDescriptorType =	USB_DT_ENDPOINT,
 
@@ -242,7 +245,7 @@
 	.bmAttributes =		USB_ENDPOINT_XFER_BULK,
 };
 
-static struct usb_endpoint_descriptor fs_ncm_out_desc __initdata = {
+static struct usb_endpoint_descriptor fs_ncm_out_desc = {
 	.bLength =		USB_DT_ENDPOINT_SIZE,
 	.bDescriptorType =	USB_DT_ENDPOINT,
 
@@ -250,7 +253,7 @@
 	.bmAttributes =		USB_ENDPOINT_XFER_BULK,
 };
 
-static struct usb_descriptor_header *ncm_fs_function[] __initdata = {
+static struct usb_descriptor_header *ncm_fs_function[] = {
 	(struct usb_descriptor_header *) &ncm_iad_desc,
 	/* CDC NCM control descriptors */
 	(struct usb_descriptor_header *) &ncm_control_intf,
@@ -269,7 +272,7 @@
 
 /* high speed support: */
 
-static struct usb_endpoint_descriptor hs_ncm_notify_desc __initdata = {
+static struct usb_endpoint_descriptor hs_ncm_notify_desc = {
 	.bLength =		USB_DT_ENDPOINT_SIZE,
 	.bDescriptorType =	USB_DT_ENDPOINT,
 
@@ -278,7 +281,7 @@
 	.wMaxPacketSize =	cpu_to_le16(NCM_STATUS_BYTECOUNT),
 	.bInterval =		USB_MS_TO_HS_INTERVAL(NCM_STATUS_INTERVAL_MS),
 };
-static struct usb_endpoint_descriptor hs_ncm_in_desc __initdata = {
+static struct usb_endpoint_descriptor hs_ncm_in_desc = {
 	.bLength =		USB_DT_ENDPOINT_SIZE,
 	.bDescriptorType =	USB_DT_ENDPOINT,
 
@@ -287,7 +290,7 @@
 	.wMaxPacketSize =	cpu_to_le16(512),
 };
 
-static struct usb_endpoint_descriptor hs_ncm_out_desc __initdata = {
+static struct usb_endpoint_descriptor hs_ncm_out_desc = {
 	.bLength =		USB_DT_ENDPOINT_SIZE,
 	.bDescriptorType =	USB_DT_ENDPOINT,
 
@@ -296,7 +299,7 @@
 	.wMaxPacketSize =	cpu_to_le16(512),
 };
 
-static struct usb_descriptor_header *ncm_hs_function[] __initdata = {
+static struct usb_descriptor_header *ncm_hs_function[] = {
 	(struct usb_descriptor_header *) &ncm_iad_desc,
 	/* CDC NCM control descriptors */
 	(struct usb_descriptor_header *) &ncm_control_intf,
@@ -1152,13 +1155,44 @@
 
 /* ethernet function driver setup/binding */
 
-static int __init
-ncm_bind(struct usb_configuration *c, struct usb_function *f)
+static int ncm_bind(struct usb_configuration *c, struct usb_function *f)
 {
 	struct usb_composite_dev *cdev = c->cdev;
 	struct f_ncm		*ncm = func_to_ncm(f);
+	struct usb_string	*us;
 	int			status;
 	struct usb_ep		*ep;
+	struct f_ncm_opts	*ncm_opts;
+
+	if (!can_support_ecm(cdev->gadget))
+		return -EINVAL;
+
+	ncm_opts = container_of(f->fi, struct f_ncm_opts, func_inst);
+	/*
+	 * in drivers/usb/gadget/configfs.c:configfs_composite_bind()
+	 * configurations are bound in sequence with list_for_each_entry,
+	 * in each configuration its functions are bound in sequence
+	 * with list_for_each_entry, so we assume no race condition
+	 * with regard to ncm_opts->bound access
+	 */
+	if (!ncm_opts->bound) {
+		mutex_lock(&ncm_opts->lock);
+		gether_set_gadget(ncm_opts->net, cdev->gadget);
+		status = gether_register_netdev(ncm_opts->net);
+		mutex_unlock(&ncm_opts->lock);
+		if (status)
+			return status;
+		ncm_opts->bound = true;
+	}
+	us = usb_gstrings_attach(cdev, ncm_strings,
+				 ARRAY_SIZE(ncm_string_defs));
+	if (IS_ERR(us))
+		return PTR_ERR(us);
+	ncm_control_intf.iInterface = us[STRING_CTRL_IDX].id;
+	ncm_data_nop_intf.iInterface = us[STRING_DATA_IDX].id;
+	ncm_data_intf.iInterface = us[STRING_DATA_IDX].id;
+	ecm_desc.iMACAddress = us[STRING_MAC_IDX].id;
+	ncm_iad_desc.iFunction = us[STRING_IAD_IDX].id;
 
 	/* allocate instance-specific interface IDs */
 	status = usb_interface_id(c, f);
@@ -1259,74 +1293,128 @@
 	return status;
 }
 
-static void
-ncm_unbind(struct usb_configuration *c, struct usb_function *f)
+static inline struct f_ncm_opts *to_f_ncm_opts(struct config_item *item)
 {
-	struct f_ncm		*ncm = func_to_ncm(f);
+	return container_of(to_config_group(item), struct f_ncm_opts,
+			    func_inst.group);
+}
+
+/* f_ncm_item_ops */
+USB_ETHERNET_CONFIGFS_ITEM(ncm);
+
+/* f_ncm_opts_dev_addr */
+USB_ETHERNET_CONFIGFS_ITEM_ATTR_DEV_ADDR(ncm);
+
+/* f_ncm_opts_host_addr */
+USB_ETHERNET_CONFIGFS_ITEM_ATTR_HOST_ADDR(ncm);
+
+/* f_ncm_opts_qmult */
+USB_ETHERNET_CONFIGFS_ITEM_ATTR_QMULT(ncm);
+
+/* f_ncm_opts_ifname */
+USB_ETHERNET_CONFIGFS_ITEM_ATTR_IFNAME(ncm);
+
+static struct configfs_attribute *ncm_attrs[] = {
+	&f_ncm_opts_dev_addr.attr,
+	&f_ncm_opts_host_addr.attr,
+	&f_ncm_opts_qmult.attr,
+	&f_ncm_opts_ifname.attr,
+	NULL,
+};
+
+static struct config_item_type ncm_func_type = {
+	.ct_item_ops	= &ncm_item_ops,
+	.ct_attrs	= ncm_attrs,
+	.ct_owner	= THIS_MODULE,
+};
+
+/*
+ * Destroy an NCM function instance together with its net device.
+ * ->bound is set by ncm_bind() once gether_register_netdev() has
+ * succeeded; in that case the device goes through gether_cleanup(),
+ * otherwise it is simply handed to free_netdev().
+ */
+static void ncm_free_inst(struct usb_function_instance *f)
+{
+	struct f_ncm_opts *ncm_opts =
+		container_of(f, struct f_ncm_opts, func_inst);
+
+	if (!ncm_opts->bound)
+		free_netdev(ncm_opts->net);
+	else
+		gether_cleanup(netdev_priv(ncm_opts->net));
+
+	kfree(ncm_opts);
+}
+
+/*
+ * ncm_alloc_inst - allocate the options backing one NCM function instance
+ *
+ * Allocates a struct f_ncm_opts, creates its net device through
+ * gether_setup_default() and initialises the configfs group.
+ *
+ * Returns the embedded usb_function_instance, ERR_PTR(-ENOMEM) when the
+ * options cannot be allocated, or the error pointer returned by
+ * gether_setup_default().  Nothing stays allocated on failure.
+ */
+static struct usb_function_instance *ncm_alloc_inst(void)
+{
+	struct f_ncm_opts *opts;
+
+	opts = kzalloc(sizeof(*opts), GFP_KERNEL);
+	if (!opts)
+		return ERR_PTR(-ENOMEM);
+	mutex_init(&opts->lock);
+	opts->func_inst.free_func_inst = ncm_free_inst;
+	opts->net = gether_setup_default();
+	if (IS_ERR(opts->net)) {
+		/* keep the error code, then release opts instead of leaking it */
+		struct net_device *net = opts->net;
+
+		kfree(opts);
+		return ERR_CAST(net);
+	}
+
+	config_group_init_type_name(&opts->func_inst.group, "", &ncm_func_type);
+
+	return &opts->func_inst;
+}
+
+/*
+ * Release one NCM function: free the f_ncm that embeds @f and drop the
+ * count it added to opts->refcnt when it was allocated.
+ */
+static void ncm_free(struct usb_function *f)
+{
+	/* look up the options first; @f is not touched after the kfree() */
+	struct f_ncm_opts *ncm_opts =
+		container_of(f->fi, struct f_ncm_opts, func_inst);
+
+	kfree(func_to_ncm(f));
+
+	mutex_lock(&ncm_opts->lock);
+	ncm_opts->refcnt--;
+	mutex_unlock(&ncm_opts->lock);
+}
+
+static void ncm_unbind(struct usb_configuration *c, struct usb_function *f)
+{
+	struct f_ncm *ncm = func_to_ncm(f);
 
 	DBG(c->cdev, "ncm unbind\n");
 
-	ncm_string_defs[0].id = 0;
 	usb_free_all_descriptors(f);
 
 	kfree(ncm->notify_req->buf);
 	usb_ep_free_request(ncm->notify, ncm->notify_req);
-
-	kfree(ncm);
 }
 
-/**
- * ncm_bind_config - add CDC Network link to a configuration
- * @c: the configuration to support the network link
- * @ethaddr: a buffer in which the ethernet address of the host side
- *	side of the link was recorded
- * Context: single threaded during gadget setup
- *
- * Returns zero on success, else negative errno.
- *
- * Caller must have called @gether_setup().  Caller is also responsible
- * for calling @gether_cleanup() before module unload.
- */
-int __init ncm_bind_config(struct usb_configuration *c, u8 ethaddr[ETH_ALEN],
-		struct eth_dev *dev)
+struct usb_function *ncm_alloc(struct usb_function_instance *fi)
 {
-	struct f_ncm	*ncm;
-	int		status;
-
-	if (!can_support_ecm(c->cdev->gadget) || !ethaddr)
-		return -EINVAL;
-
-	if (ncm_string_defs[0].id == 0) {
-		status = usb_string_ids_tab(c->cdev, ncm_string_defs);
-		if (status < 0)
-			return status;
-		ncm_control_intf.iInterface =
-			ncm_string_defs[STRING_CTRL_IDX].id;
-
-		status = ncm_string_defs[STRING_DATA_IDX].id;
-		ncm_data_nop_intf.iInterface = status;
-		ncm_data_intf.iInterface = status;
-
-		ecm_desc.iMACAddress = ncm_string_defs[STRING_MAC_IDX].id;
-		ncm_iad_desc.iFunction = ncm_string_defs[STRING_IAD_IDX].id;
-	}
+	struct f_ncm		*ncm;
+	struct f_ncm_opts	*opts;
+	int status;
 
 	/* allocate and initialize one new instance */
-	ncm = kzalloc(sizeof *ncm, GFP_KERNEL);
+	ncm = kzalloc(sizeof(*ncm), GFP_KERNEL);
 	if (!ncm)
-		return -ENOMEM;
+		return ERR_PTR(-ENOMEM);
+
+	opts = container_of(fi, struct f_ncm_opts, func_inst);
+	mutex_lock(&opts->lock);
+	opts->refcnt++;
 
 	/* export host's Ethernet address in CDC format */
-	snprintf(ncm->ethaddr, sizeof ncm->ethaddr, "%pm", ethaddr);
+	status = gether_get_host_addr_cdc(opts->net, ncm->ethaddr,
+				      sizeof(ncm->ethaddr));
+	if (status < 12) { /* strlen("01234567890a") */
+		kfree(ncm);
+		mutex_unlock(&opts->lock);
+		return ERR_PTR(-EINVAL);
+	}
 	ncm_string_defs[STRING_MAC_IDX].s = ncm->ethaddr;
 
 	spin_lock_init(&ncm->lock);
 	ncm_reset_values(ncm);
-	ncm->port.ioport = dev;
+	ncm->port.ioport = netdev_priv(opts->net);
+	mutex_unlock(&opts->lock);
 	ncm->port.is_fixed = true;
 
 	ncm->port.func.name = "cdc_network";
-	ncm->port.func.strings = ncm_strings;
 	/* descriptors are per-instance copies */
 	ncm->port.func.bind = ncm_bind;
 	ncm->port.func.unbind = ncm_unbind;
@@ -1334,12 +1422,14 @@
 	ncm->port.func.get_alt = ncm_get_alt;
 	ncm->port.func.setup = ncm_setup;
 	ncm->port.func.disable = ncm_disable;
+	ncm->port.func.free_func = ncm_free;
 
 	ncm->port.wrap = ncm_wrap_ntb;
 	ncm->port.unwrap = ncm_unwrap_ntb;
 
-	status = usb_add_function(c, &ncm->port.func);
-	if (status)
-		kfree(ncm);
-	return status;
+	return &ncm->port.func;
 }
+
+DECLARE_USB_FUNCTION_INIT(ncm, ncm_alloc_inst, ncm_alloc);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Yauheni Kaliuta");

diff --git a/drivers/usb/gadget/f_obex.c b/drivers/usb/gadget/f_obex.c
index 8aa2be5..ad39f1d 100644
--- a/drivers/usb/gadget/f_obex.c
+++ b/drivers/usb/gadget/f_obex.c

@@ -309,23 +309,20 @@
 {
 	struct usb_composite_dev *cdev = c->cdev;
 	struct f_obex		*obex = func_to_obex(f);
+	struct usb_string	*us;
 	int			status;
 	struct usb_ep		*ep;
 
 	if (!can_support_obex(c))
 		return -EINVAL;
 
-	if (obex_string_defs[OBEX_CTRL_IDX].id == 0) {
-		status = usb_string_ids_tab(c->cdev, obex_string_defs);
-		if (status < 0)
-			return status;
-		obex_control_intf.iInterface =
-			obex_string_defs[OBEX_CTRL_IDX].id;
-
-		status = obex_string_defs[OBEX_DATA_IDX].id;
-		obex_data_nop_intf.iInterface = status;
-		obex_data_intf.iInterface = status;
-	}
+	us = usb_gstrings_attach(cdev, obex_strings,
+				 ARRAY_SIZE(obex_string_defs));
+	if (IS_ERR(us))
+		return PTR_ERR(us);
+	obex_control_intf.iInterface = us[OBEX_CTRL_IDX].id;
+	obex_data_nop_intf.iInterface = us[OBEX_DATA_IDX].id;
+	obex_data_intf.iInterface = us[OBEX_DATA_IDX].id;
 
 	/* allocate instance-specific interface IDs, and patch descriptors */
 
@@ -406,57 +403,6 @@
 	return status;
 }
 
-#ifdef USBF_OBEX_INCLUDED
-
-static void
-obex_old_unbind(struct usb_configuration *c, struct usb_function *f)
-{
-	obex_string_defs[OBEX_CTRL_IDX].id = 0;
-	usb_free_all_descriptors(f);
-	kfree(func_to_obex(f));
-}
-
-/**
- * obex_bind_config - add a CDC OBEX function to a configuration
- * @c: the configuration to support the CDC OBEX instance
- * @port_num: /dev/ttyGS* port this interface will use
- * Context: single threaded during gadget setup
- *
- * Returns zero on success, else negative errno.
- */
-int __init obex_bind_config(struct usb_configuration *c, u8 port_num)
-{
-	struct f_obex	*obex;
-	int		status;
-
-	/* allocate and initialize one new instance */
-	obex = kzalloc(sizeof *obex, GFP_KERNEL);
-	if (!obex)
-		return -ENOMEM;
-
-	obex->port_num = port_num;
-
-	obex->port.connect = obex_connect;
-	obex->port.disconnect = obex_disconnect;
-
-	obex->port.func.name = "obex";
-	obex->port.func.strings = obex_strings;
-	/* descriptors are per-instance copies */
-	obex->port.func.bind = obex_bind;
-	obex->port.func.unbind = obex_old_unbind;
-	obex->port.func.set_alt = obex_set_alt;
-	obex->port.func.get_alt = obex_get_alt;
-	obex->port.func.disable = obex_disable;
-
-	status = usb_add_function(c, &obex->port.func);
-	if (status)
-		kfree(obex);
-
-	return status;
-}
-
-#else
-
 static inline struct f_serial_opts *to_f_serial_opts(struct config_item *item)
 {
 	return container_of(to_config_group(item), struct f_serial_opts,
@@ -550,7 +496,6 @@
 
 static void obex_unbind(struct usb_configuration *c, struct usb_function *f)
 {
-	obex_string_defs[OBEX_CTRL_IDX].id = 0;
 	usb_free_all_descriptors(f);
 }
 
@@ -572,7 +517,6 @@
 	obex->port.disconnect = obex_disconnect;
 
 	obex->port.func.name = "obex";
-	obex->port.func.strings = obex_strings;
 	/* descriptors are per-instance copies */
 	obex->port.func.bind = obex_bind;
 	obex->port.func.unbind = obex_unbind;
@@ -585,8 +529,5 @@
 }
 
 DECLARE_USB_FUNCTION_INIT(obex, obex_alloc_inst, obex_alloc);
-
-#endif
-
 MODULE_AUTHOR("Felipe Balbi");
 MODULE_LICENSE("GPL");

diff --git a/drivers/usb/gadget/f_phonet.c b/drivers/usb/gadget/f_phonet.c
index b21ab55..7944fb0 100644
--- a/drivers/usb/gadget/f_phonet.c
+++ b/drivers/usb/gadget/f_phonet.c

@@ -13,6 +13,7 @@
 #include <linux/mm.h>
 #include <linux/slab.h>
 #include <linux/kernel.h>
+#include <linux/module.h>
 #include <linux/device.h>
 
 #include <linux/netdevice.h>
@@ -25,6 +26,7 @@
 #include <linux/usb/composite.h>
 
 #include "u_phonet.h"
+#include "u_ether.h"
 
 #define PN_MEDIA_USB	0x1B
 #define MAXPACKET	512
@@ -478,8 +480,7 @@
 
 /*-------------------------------------------------------------------------*/
 
-static __init
-int pn_bind(struct usb_configuration *c, struct usb_function *f)
+static int pn_bind(struct usb_configuration *c, struct usb_function *f)
 {
 	struct usb_composite_dev *cdev = c->cdev;
 	struct usb_gadget *gadget = cdev->gadget;
@@ -487,6 +488,27 @@
 	struct usb_ep *ep;
 	int status, i;
 
+#ifndef USBF_PHONET_INCLUDED
+	struct f_phonet_opts *phonet_opts;
+
+	phonet_opts = container_of(f->fi, struct f_phonet_opts, func_inst);
+
+	/*
+	 * in drivers/usb/gadget/configfs.c:configfs_composite_bind()
+	 * configurations are bound in sequence with list_for_each_entry,
+	 * in each configuration its functions are bound in sequence
+	 * with list_for_each_entry, so we assume no race condition
+	 * with regard to phonet_opts->bound access
+	 */
+	if (!phonet_opts->bound) {
+		gphonet_set_gadget(phonet_opts->net, gadget);
+		status = gphonet_register_netdev(phonet_opts->net);
+		if (status)
+			return status;
+		phonet_opts->bound = true;
+	}
+#endif
+
 	/* Reserve interface IDs */
 	status = usb_interface_id(c, f);
 	if (status < 0)
@@ -560,8 +582,98 @@
 	return status;
 }
 
-static void
-pn_unbind(struct usb_configuration *c, struct usb_function *f)
+/* Map a configfs item back to the f_phonet_opts whose func_inst.group owns it. */
+static inline struct f_phonet_opts *to_f_phonet_opts(struct config_item *item)
+{
+	struct config_group *grp = to_config_group(item);
+
+	return container_of(grp, struct f_phonet_opts, func_inst.group);
+}
+
+CONFIGFS_ATTR_STRUCT(f_phonet_opts);
+/*
+ * configfs show_attribute hook: forward to the attribute's own ->show()
+ * callback, or report 0 bytes when the attribute has none.
+ */
+static ssize_t f_phonet_attr_show(struct config_item *item,
+				struct configfs_attribute *attr,
+				char *page)
+{
+	struct f_phonet_opts_attribute *opts_attr =
+		container_of(attr, struct f_phonet_opts_attribute, attr);
+
+	if (!opts_attr->show)
+		return 0;
+
+	return opts_attr->show(to_f_phonet_opts(item), page);
+}
+
+/* configfs release hook: put the function instance that owns @item. */
+static void phonet_attr_release(struct config_item *item)
+{
+	usb_put_function_instance(&to_f_phonet_opts(item)->func_inst);
+}
+
+static struct configfs_item_operations phonet_item_ops = {
+	.release		= phonet_attr_release,
+	.show_attribute		= f_phonet_attr_show,
+};
+
+/* "ifname" attribute: let gether_get_ifname() write into @page (PAGE_SIZE max). */
+static ssize_t f_phonet_ifname_show(struct f_phonet_opts *opts, char *page)
+{
+	struct net_device *net = opts->net;
+
+	return gether_get_ifname(net, page, PAGE_SIZE);
+}
+
+static struct f_phonet_opts_attribute f_phonet_ifname =
+	__CONFIGFS_ATTR_RO(ifname, f_phonet_ifname_show);
+
+static struct configfs_attribute *phonet_attrs[] = {
+	&f_phonet_ifname.attr,
+	NULL,
+};
+
+static struct config_item_type phonet_func_type = {
+	.ct_item_ops	= &phonet_item_ops,
+	.ct_attrs	= phonet_attrs,
+	.ct_owner	= THIS_MODULE,
+};
+
+/*
+ * Destroy a phonet function instance together with its net device.
+ * ->bound is set by pn_bind() once gphonet_register_netdev() has
+ * succeeded; in that case the device goes through gphonet_cleanup(),
+ * otherwise it is handed to free_netdev().
+ *
+ * NOTE(review): gphonet_register_netdev() already calls free_netdev()
+ * when registration fails and pn_bind() then leaves ->bound false, so
+ * the free_netdev() below may hit the same device a second time - confirm.
+ */
+static void phonet_free_inst(struct usb_function_instance *f)
+{
+	struct f_phonet_opts *phonet_opts =
+		container_of(f, struct f_phonet_opts, func_inst);
+
+	if (!phonet_opts->bound)
+		free_netdev(phonet_opts->net);
+	else
+		gphonet_cleanup(phonet_opts->net);
+
+	kfree(phonet_opts);
+}
+
+/*
+ * phonet_alloc_inst - allocate the options backing one phonet instance
+ *
+ * Allocates a struct f_phonet_opts, creates its net device through
+ * gphonet_setup_default() and initialises the configfs group.
+ *
+ * Returns the embedded usb_function_instance, ERR_PTR(-ENOMEM) when the
+ * options cannot be allocated, or the error pointer returned by
+ * gphonet_setup_default().  Nothing stays allocated on failure.
+ */
+static struct usb_function_instance *phonet_alloc_inst(void)
+{
+	struct f_phonet_opts *opts;
+
+	opts = kzalloc(sizeof(*opts), GFP_KERNEL);
+	if (!opts)
+		return ERR_PTR(-ENOMEM);
+
+	opts->func_inst.free_func_inst = phonet_free_inst;
+	opts->net = gphonet_setup_default();
+	if (IS_ERR(opts->net)) {
+		/* keep the error code, then release opts instead of leaking it */
+		struct net_device *net = opts->net;
+
+		kfree(opts);
+		return ERR_CAST(net);
+	}
+
+	config_group_init_type_name(&opts->func_inst.group, "",
+			&phonet_func_type);
+
+	return &opts->func_inst;
+}
+
+/* Free the f_phonet structure that embeds @f. */
+static void phonet_free(struct usb_function *f)
+{
+	kfree(func_to_pn(f));
+}
+
+static void pn_unbind(struct usb_configuration *c, struct usb_function *f)
 {
 	struct f_phonet *fp = func_to_pn(f);
 	int i;
@@ -574,61 +686,72 @@
 			usb_ep_free_request(fp->out_ep, fp->out_reqv[i]);
 
 	usb_free_all_descriptors(f);
-	kfree(fp);
 }
 
-/*-------------------------------------------------------------------------*/
-
-static struct net_device *dev;
-
-int __init phonet_bind_config(struct usb_configuration *c)
+struct usb_function *phonet_alloc(struct usb_function_instance *fi)
 {
 	struct f_phonet *fp;
-	int err, size;
+	struct f_phonet_opts *opts;
+	int size;
 
 	size = sizeof(*fp) + (phonet_rxq_size * sizeof(struct usb_request *));
 	fp = kzalloc(size, GFP_KERNEL);
 	if (!fp)
-		return -ENOMEM;
+		return ERR_PTR(-ENOMEM);
 
-	fp->dev = dev;
+	opts = container_of(fi, struct f_phonet_opts, func_inst);
+
+	fp->dev = opts->net;
 	fp->function.name = "phonet";
 	fp->function.bind = pn_bind;
 	fp->function.unbind = pn_unbind;
 	fp->function.set_alt = pn_set_alt;
 	fp->function.get_alt = pn_get_alt;
 	fp->function.disable = pn_disconnect;
+	fp->function.free_func = phonet_free;
 	spin_lock_init(&fp->rx.lock);
 
-	err = usb_add_function(c, &fp->function);
-	if (err)
-		kfree(fp);
-	return err;
+	return &fp->function;
 }
 
-int __init gphonet_setup(struct usb_gadget *gadget)
+struct net_device *gphonet_setup_default(void)
 {
+	struct net_device *dev;
 	struct phonet_port *port;
-	int err;
 
 	/* Create net device */
-	BUG_ON(dev);
 	dev = alloc_netdev(sizeof(*port), "upnlink%d", pn_net_setup);
 	if (!dev)
-		return -ENOMEM;
+		return ERR_PTR(-ENOMEM);
 
 	port = netdev_priv(dev);
 	spin_lock_init(&port->lock);
 	netif_carrier_off(dev);
-	SET_NETDEV_DEV(dev, &gadget->dev);
 
-	err = register_netdev(dev);
-	if (err)
-		free_netdev(dev);
-	return err;
+	return dev;
 }
 
-void gphonet_cleanup(void)
+/* Attach @net to the gadget's struct device via SET_NETDEV_DEV(). */
+void gphonet_set_gadget(struct net_device *net, struct usb_gadget *g)
+{
+	SET_NETDEV_DEV(net, &g->dev);
+}
+
+/*
+ * Register @net with the networking core.
+ *
+ * Returns register_netdev()'s result.  On failure @net has already been
+ * passed to free_netdev() and must not be used by the caller any more.
+ */
+int gphonet_register_netdev(struct net_device *net)
+{
+	int err = register_netdev(net);
+
+	if (!err)
+		return 0;
+
+	free_netdev(net);
+	return err;
+}
+
+void gphonet_cleanup(struct net_device *dev)
 {
 	unregister_netdev(dev);
 }
+
+DECLARE_USB_FUNCTION_INIT(phonet, phonet_alloc_inst, phonet_alloc);
+MODULE_AUTHOR("Rémi Denis-Courmont");
+MODULE_LICENSE("GPL");

diff --git a/drivers/usb/gadget/f_rndis.c b/drivers/usb/gadget/f_rndis.c
index 36e8c44..191df35 100644
--- a/drivers/usb/gadget/f_rndis.c
+++ b/drivers/usb/gadget/f_rndis.c

@@ -17,15 +17,17 @@
 
 #include <linux/slab.h>
 #include <linux/kernel.h>
+#include <linux/module.h>
 #include <linux/device.h>
 #include <linux/etherdevice.h>
 
 #include <linux/atomic.h>
 
 #include "u_ether.h"
+#include "u_ether_configfs.h"
+#include "u_rndis.h"
 #include "rndis.h"
 
-
 /*
  * This function is an RNDIS Ethernet port -- a Microsoft protocol that's
  * been promoted instead of the standard CDC Ethernet.  The published RNDIS
@@ -655,6 +657,13 @@
 
 /*-------------------------------------------------------------------------*/
 
+/*
+ * Some controllers can't support RNDIS ...
+ *
+ * Placeholder check: @c is currently unused and every configuration is
+ * accepted.
+ */
+static inline bool can_support_rndis(struct usb_configuration *c)
+{
+	/* everything else is *presumably* fine */
+	return true;
+}
+
 /* ethernet function driver setup/binding */
 
 static int
@@ -662,9 +671,41 @@
 {
 	struct usb_composite_dev *cdev = c->cdev;
 	struct f_rndis		*rndis = func_to_rndis(f);
+	struct usb_string	*us;
 	int			status;
 	struct usb_ep		*ep;
 
+#ifndef USB_FRNDIS_INCLUDED
+	struct f_rndis_opts *rndis_opts;
+
+	if (!can_support_rndis(c))
+		return -EINVAL;
+
+	rndis_opts = container_of(f->fi, struct f_rndis_opts, func_inst);
+
+	/*
+	 * in drivers/usb/gadget/configfs.c:configfs_composite_bind()
+	 * configurations are bound in sequence with list_for_each_entry,
+	 * in each configuration its functions are bound in sequence
+	 * with list_for_each_entry, so we assume no race condition
+	 * with regard to rndis_opts->bound access
+	 */
+	if (!rndis_opts->bound) {
+		gether_set_gadget(rndis_opts->net, cdev->gadget);
+		status = gether_register_netdev(rndis_opts->net);
+		if (status)
+			return status;
+		rndis_opts->bound = true;
+	}
+#endif
+	us = usb_gstrings_attach(cdev, rndis_strings,
+				 ARRAY_SIZE(rndis_string_defs));
+	if (IS_ERR(us))
+		return PTR_ERR(us);
+	rndis_control_intf.iInterface = us[0].id;
+	rndis_data_intf.iInterface = us[1].id;
+	rndis_iad_descriptor.iFunction = us[2].id;
+
 	/* allocate instance-specific interface IDs */
 	status = usb_interface_id(c, f);
 	if (status < 0)
@@ -741,10 +782,12 @@
 	rndis->port.open = rndis_open;
 	rndis->port.close = rndis_close;
 
+#ifdef USB_FRNDIS_INCLUDED
 	status = rndis_register(rndis_response_available, rndis);
 	if (status < 0)
 		goto fail;
 	rndis->config = status;
+#endif
 
 	rndis_set_param_medium(rndis->config, RNDIS_MEDIUM_802_3, 0);
 	rndis_set_host_mac(rndis->config, rndis->ethaddr);
@@ -787,15 +830,15 @@
 	return status;
 }
 
+#ifdef USB_FRNDIS_INCLUDED
+
 static void
-rndis_unbind(struct usb_configuration *c, struct usb_function *f)
+rndis_old_unbind(struct usb_configuration *c, struct usb_function *f)
 {
 	struct f_rndis		*rndis = func_to_rndis(f);
 
 	rndis_deregister(rndis->config);
-	rndis_exit();
 
-	rndis_string_defs[0].id = 0;
 	usb_free_all_descriptors(f);
 
 	kfree(rndis->notify_req->buf);
@@ -804,13 +847,6 @@
 	kfree(rndis);
 }
 
-/* Some controllers can't support RNDIS ... */
-static inline bool can_support_rndis(struct usb_configuration *c)
-{
-	/* everything else is *presumably* fine */
-	return true;
-}
-
 int
 rndis_bind_config_vendor(struct usb_configuration *c, u8 ethaddr[ETH_ALEN],
 		u32 vendorID, const char *manufacturer, struct eth_dev *dev)
@@ -818,24 +854,6 @@
 	struct f_rndis	*rndis;
 	int		status;
 
-	if (!can_support_rndis(c) || !ethaddr)
-		return -EINVAL;
-
-	if (rndis_string_defs[0].id == 0) {
-		/* ... and setup RNDIS itself */
-		status = rndis_init();
-		if (status < 0)
-			return status;
-
-		status = usb_string_ids_tab(c->cdev, rndis_string_defs);
-		if (status)
-			return status;
-
-		rndis_control_intf.iInterface = rndis_string_defs[0].id;
-		rndis_data_intf.iInterface = rndis_string_defs[1].id;
-		rndis_iad_descriptor.iFunction = rndis_string_defs[2].id;
-	}
-
 	/* allocate and initialize one new instance */
 	status = -ENOMEM;
 	rndis = kzalloc(sizeof *rndis, GFP_KERNEL);
@@ -856,19 +874,178 @@
 	rndis->port.unwrap = rndis_rm_hdr;
 
 	rndis->port.func.name = "rndis";
-	rndis->port.func.strings = rndis_strings;
+	/* descriptors are per-instance copies */
+	rndis->port.func.bind = rndis_bind;
+	rndis->port.func.unbind = rndis_old_unbind;
+	rndis->port.func.set_alt = rndis_set_alt;
+	rndis->port.func.setup = rndis_setup;
+	rndis->port.func.disable = rndis_disable;
+
+	status = usb_add_function(c, &rndis->port.func);
+	if (status)
+		kfree(rndis);
+fail:
+	return status;
+}
+
+#else
+
+/*
+ * Replace the net device owned by function instance @f with the
+ * caller-provided @net.
+ *
+ * The instance's previous device is torn down first (gether_cleanup() if
+ * it was already bound, free_netdev() otherwise). The instance is then
+ * flagged as bound and as using a borrowed device, so that
+ * rndis_free_inst() leaves @net alone.
+ */
+void rndis_borrow_net(struct usb_function_instance *f, struct net_device *net)
+{
+	struct f_rndis_opts *rndis_opts =
+		container_of(f, struct f_rndis_opts, func_inst);
+
+	if (!rndis_opts->bound)
+		free_netdev(rndis_opts->net);
+	else
+		gether_cleanup(netdev_priv(rndis_opts->net));
+
+	rndis_opts->bound = true;
+	rndis_opts->borrowed_net = true;
+	rndis_opts->net = net;
+}
+EXPORT_SYMBOL(rndis_borrow_net);
+
+/* Map a configfs item back to the f_rndis_opts that embeds its group. */
+static inline struct f_rndis_opts *to_f_rndis_opts(struct config_item *item)
+{
+	return container_of(to_config_group(item), struct f_rndis_opts,
+			    func_inst.group);
+}
+
+/*
+ * configfs plumbing for the rndis function instance. The
+ * USB_ETHERNET_CONFIGFS_* macros are presumably provided by
+ * u_ether_configfs.h (TODO confirm); the comment above each one names the
+ * object it is expected to define.
+ */
+
+/* f_rndis_item_ops */
+USB_ETHERNET_CONFIGFS_ITEM(rndis);
+
+/* f_rndis_opts_dev_addr */
+USB_ETHERNET_CONFIGFS_ITEM_ATTR_DEV_ADDR(rndis);
+
+/* f_rndis_opts_host_addr */
+USB_ETHERNET_CONFIGFS_ITEM_ATTR_HOST_ADDR(rndis);
+
+/* f_rndis_opts_qmult */
+USB_ETHERNET_CONFIGFS_ITEM_ATTR_QMULT(rndis);
+
+/* f_rndis_opts_ifname */
+USB_ETHERNET_CONFIGFS_ITEM_ATTR_IFNAME(rndis);
+
+/* NULL-terminated list of attributes exposed for each rndis instance */
+static struct configfs_attribute *rndis_attrs[] = {
+	&f_rndis_opts_dev_addr.attr,
+	&f_rndis_opts_host_addr.attr,
+	&f_rndis_opts_qmult.attr,
+	&f_rndis_opts_ifname.attr,
+	NULL,
+};
+
+/* configfs item type used for the instance group in rndis_alloc_inst() */
+static struct config_item_type rndis_func_type = {
+	.ct_item_ops	= &rndis_item_ops,
+	.ct_attrs	= rndis_attrs,
+	.ct_owner	= THIS_MODULE,
+};
+
+/*
+ * Release a function instance created by rndis_alloc_inst().
+ *
+ * A borrowed net device is left untouched. Otherwise the instance's own
+ * device is torn down with gether_cleanup() if it was bound, or freed
+ * with free_netdev() if it was not.
+ */
+static void rndis_free_inst(struct usb_function_instance *f)
+{
+	struct f_rndis_opts *rndis_opts =
+		container_of(f, struct f_rndis_opts, func_inst);
+
+	if (!rndis_opts->borrowed_net) {
+		if (!rndis_opts->bound)
+			free_netdev(rndis_opts->net);
+		else
+			gether_cleanup(netdev_priv(rndis_opts->net));
+	}
+
+	kfree(rndis_opts);
+}
+
+/*
+ * Allocate an rndis function instance together with its default net
+ * device and configfs group.
+ *
+ * Returns the embedded usb_function_instance, ERR_PTR(-ENOMEM) if the
+ * options cannot be allocated, or the error from gether_setup_default().
+ */
+static struct usb_function_instance *rndis_alloc_inst(void)
+{
+	struct f_rndis_opts *opts;
+	struct net_device *net;
+
+	opts = kzalloc(sizeof(*opts), GFP_KERNEL);
+	if (!opts)
+		return ERR_PTR(-ENOMEM);
+	mutex_init(&opts->lock);
+	opts->func_inst.free_func_inst = rndis_free_inst;
+
+	net = gether_setup_default();
+	if (IS_ERR(net)) {
+		/* nothing else references opts yet; don't leak it */
+		kfree(opts);
+		return ERR_CAST(net);
+	}
+	opts->net = net;
+
+	config_group_init_type_name(&opts->func_inst.group, "",
+				    &rndis_func_type);
+
+	return &opts->func_inst;
+}
+
+/*
+ * free_func callback: deregister the RNDIS configuration, free the
+ * function object and drop the reference taken on the instance options
+ * in rndis_alloc().
+ */
+static void rndis_free(struct usb_function *f)
+{
+	struct f_rndis_opts *rndis_opts =
+		container_of(f->fi, struct f_rndis_opts, func_inst);
+	struct f_rndis *self = func_to_rndis(f);
+
+	rndis_deregister(self->config);
+	kfree(self);
+
+	mutex_lock(&rndis_opts->lock);
+	rndis_opts->refcnt--;
+	mutex_unlock(&rndis_opts->lock);
+}
+
+/*
+ * unbind callback: free the function's descriptors, then the notify
+ * request and its buffer. The f_rndis object itself is freed later by
+ * rndis_free().
+ */
+static void rndis_unbind(struct usb_configuration *c, struct usb_function *f)
+{
+	struct f_rndis *self = func_to_rndis(f);
+
+	usb_free_all_descriptors(f);
+
+	/* buffer first, then the request that owns it */
+	kfree(self->notify_req->buf);
+	usb_ep_free_request(self->notify, self->notify_req);
+}
+
+/*
+ * Allocate one rndis function bound to function instance @fi.
+ *
+ * Takes a reference on the instance options (dropped again by
+ * rndis_free()), copies the host address, vendor ID and manufacturer out
+ * of them, and registers the function with the RNDIS core.
+ *
+ * Returns the embedded usb_function, ERR_PTR(-ENOMEM) on allocation
+ * failure, or the negative status from rndis_register().
+ */
+static struct usb_function *rndis_alloc(struct usb_function_instance *fi)
+{
+	struct f_rndis	*rndis;
+	struct f_rndis_opts *opts;
+	int status;
+
+	/* allocate and initialize one new instance */
+	rndis = kzalloc(sizeof(*rndis), GFP_KERNEL);
+	if (!rndis)
+		return ERR_PTR(-ENOMEM);
+
+	opts = container_of(fi, struct f_rndis_opts, func_inst);
+	mutex_lock(&opts->lock);
+	opts->refcnt++;
+
+	gether_get_host_addr_u8(opts->net, rndis->ethaddr);
+	rndis->vendorID = opts->vendor_id;
+	rndis->manufacturer = opts->manufacturer;
+
+	rndis->port.ioport = netdev_priv(opts->net);
+	mutex_unlock(&opts->lock);
+	/* RNDIS activates when the host changes this filter */
+	rndis->port.cdc_filter = 0;
+
+	/* RNDIS has special (and complex) framing */
+	rndis->port.header_len = sizeof(struct rndis_packet_msg_type);
+	rndis->port.wrap = rndis_add_header;
+	rndis->port.unwrap = rndis_rm_hdr;
+
+	rndis->port.func.name = "rndis";
 	/* descriptors are per-instance copies */
 	rndis->port.func.bind = rndis_bind;
 	rndis->port.func.unbind = rndis_unbind;
 	rndis->port.func.set_alt = rndis_set_alt;
 	rndis->port.func.setup = rndis_setup;
 	rndis->port.func.disable = rndis_disable;
+	rndis->port.func.free_func = rndis_free;
 
-	status = usb_add_function(c, &rndis->port.func);
-	if (status) {
+	status = rndis_register(rndis_response_available, rndis);
+	if (status < 0) {
 		kfree(rndis);
-fail:
-		rndis_exit();
+		/*
+		 * rndis_free() will never run for this function, so drop
+		 * the reference taken above here.
+		 */
+		mutex_lock(&opts->lock);
+		opts->refcnt--;
+		mutex_unlock(&opts->lock);
+		return ERR_PTR(status);
 	}
-	return status;
+	rndis->config = status;
+
+	return &rndis->port.func;
 }
+
+DECLARE_USB_FUNCTION_INIT(rndis, rndis_alloc_inst, rndis_alloc);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("David Brownell");
+
+#endif

diff --git a/drivers/usb/gadget/f_subset.c b/drivers/usb/gadget/f_subset.c
index 7be04b3..5601e1d 100644
--- a/drivers/usb/gadget/f_subset.c
+++ b/drivers/usb/gadget/f_subset.c

@@ -12,11 +12,13 @@
 
 #include <linux/slab.h>
 #include <linux/kernel.h>
+#include <linux/module.h>
 #include <linux/device.h>
 #include <linux/etherdevice.h>
 
 #include "u_ether.h"
-
+#include "u_ether_configfs.h"
+#include "u_gether.h"
 
 /*
  * This function packages a simple "CDC Subset" Ethernet port with no real
@@ -295,9 +297,40 @@
 {
 	struct usb_composite_dev *cdev = c->cdev;
 	struct f_gether		*geth = func_to_geth(f);
+	struct usb_string	*us;
 	int			status;
 	struct usb_ep		*ep;
 
+#ifndef USB_FSUBSET_INCLUDED
+	struct f_gether_opts	*gether_opts;
+
+	gether_opts = container_of(f->fi, struct f_gether_opts, func_inst);
+
+	/*
+	 * in drivers/usb/gadget/configfs.c:configfs_composite_bind()
+	 * configurations are bound in sequence with list_for_each_entry,
+	 * in each configuration its functions are bound in sequence
+	 * with list_for_each_entry, so we assume no race condition
+	 * with regard to gether_opts->bound access
+	 */
+	if (!gether_opts->bound) {
+		mutex_lock(&gether_opts->lock);
+		gether_set_gadget(gether_opts->net, cdev->gadget);
+		status = gether_register_netdev(gether_opts->net);
+		mutex_unlock(&gether_opts->lock);
+		if (status)
+			return status;
+		gether_opts->bound = true;
+	}
+#endif
+	us = usb_gstrings_attach(cdev, geth_strings,
+				 ARRAY_SIZE(geth_string_defs));
+	if (IS_ERR(us))
+		return PTR_ERR(us);
+
+	subset_data_intf.iInterface = us[0].id;
+	ether_desc.iMACAddress = us[1].id;
+
 	/* allocate instance-specific interface IDs */
 	status = usb_interface_id(c, f);
 	if (status < 0)
@@ -360,8 +393,10 @@
 	return status;
 }
 
+#ifdef USB_FSUBSET_INCLUDED
+
 static void
-geth_unbind(struct usb_configuration *c, struct usb_function *f)
+geth_old_unbind(struct usb_configuration *c, struct usb_function *f)
 {
 	geth_string_defs[0].id = 0;
 	usb_free_all_descriptors(f);
@@ -387,18 +422,6 @@
 	struct f_gether	*geth;
 	int		status;
 
-	if (!ethaddr)
-		return -EINVAL;
-
-	/* maybe allocate device-global string IDs */
-	if (geth_string_defs[0].id == 0) {
-		status = usb_string_ids_tab(c->cdev, geth_string_defs);
-		if (status < 0)
-			return status;
-		subset_data_intf.iInterface = geth_string_defs[0].id;
-		ether_desc.iMACAddress = geth_string_defs[1].id;
-	}
-
 	/* allocate and initialize one new instance */
 	geth = kzalloc(sizeof *geth, GFP_KERNEL);
 	if (!geth)
@@ -412,9 +435,8 @@
 	geth->port.cdc_filter = DEFAULT_FILTER;
 
 	geth->port.func.name = "cdc_subset";
-	geth->port.func.strings = geth_strings;
 	geth->port.func.bind = geth_bind;
-	geth->port.func.unbind = geth_unbind;
+	geth->port.func.unbind = geth_old_unbind;
 	geth->port.func.set_alt = geth_set_alt;
 	geth->port.func.disable = geth_disable;
 
@@ -423,3 +445,130 @@
 		kfree(geth);
 	return status;
 }
+
+#else
+
+/* Map a configfs item back to the f_gether_opts that embeds its group. */
+static inline struct f_gether_opts *to_f_gether_opts(struct config_item *item)
+{
+	return container_of(to_config_group(item), struct f_gether_opts,
+			    func_inst.group);
+}
+
+/*
+ * configfs plumbing for the CDC subset function instance. The
+ * USB_ETHERNET_CONFIGFS_* macros are presumably provided by
+ * u_ether_configfs.h (TODO confirm); the comment above each one names the
+ * object it is expected to define.
+ */
+
+/* f_gether_item_ops */
+USB_ETHERNET_CONFIGFS_ITEM(gether);
+
+/* f_gether_opts_dev_addr */
+USB_ETHERNET_CONFIGFS_ITEM_ATTR_DEV_ADDR(gether);
+
+/* f_gether_opts_host_addr */
+USB_ETHERNET_CONFIGFS_ITEM_ATTR_HOST_ADDR(gether);
+
+/* f_gether_opts_qmult */
+USB_ETHERNET_CONFIGFS_ITEM_ATTR_QMULT(gether);
+
+/* f_gether_opts_ifname */
+USB_ETHERNET_CONFIGFS_ITEM_ATTR_IFNAME(gether);
+
+/* NULL-terminated list of attributes exposed for each gether instance */
+static struct configfs_attribute *gether_attrs[] = {
+	&f_gether_opts_dev_addr.attr,
+	&f_gether_opts_host_addr.attr,
+	&f_gether_opts_qmult.attr,
+	&f_gether_opts_ifname.attr,
+	NULL,
+};
+
+/* configfs item type used for the instance group in geth_alloc_inst() */
+static struct config_item_type gether_func_type = {
+	.ct_item_ops	= &gether_item_ops,
+	.ct_attrs	= gether_attrs,
+	.ct_owner	= THIS_MODULE,
+};
+
+/*
+ * Release a function instance created by geth_alloc_inst(): tear down
+ * its net device (gether_cleanup() if it was bound, free_netdev()
+ * otherwise) and free the options.
+ */
+static void geth_free_inst(struct usb_function_instance *f)
+{
+	struct f_gether_opts *gether_opts =
+		container_of(f, struct f_gether_opts, func_inst);
+
+	if (!gether_opts->bound)
+		free_netdev(gether_opts->net);
+	else
+		gether_cleanup(netdev_priv(gether_opts->net));
+
+	kfree(gether_opts);
+}
+
+/*
+ * Allocate a CDC subset function instance together with its default net
+ * device and configfs group.
+ *
+ * Returns the embedded usb_function_instance, ERR_PTR(-ENOMEM) if the
+ * options cannot be allocated, or the error from gether_setup_default().
+ */
+static struct usb_function_instance *geth_alloc_inst(void)
+{
+	struct f_gether_opts *opts;
+	struct net_device *net;
+
+	opts = kzalloc(sizeof(*opts), GFP_KERNEL);
+	if (!opts)
+		return ERR_PTR(-ENOMEM);
+	mutex_init(&opts->lock);
+	opts->func_inst.free_func_inst = geth_free_inst;
+
+	net = gether_setup_default();
+	if (IS_ERR(net)) {
+		/* nothing else references opts yet; don't leak it */
+		kfree(opts);
+		return ERR_CAST(net);
+	}
+	opts->net = net;
+
+	config_group_init_type_name(&opts->func_inst.group, "",
+				    &gether_func_type);
+
+	return &opts->func_inst;
+}
+
+/*
+ * free_func callback: free the function object and drop the reference
+ * geth_alloc() took on the instance options (mirrors rndis_free()).
+ */
+static void geth_free(struct usb_function *f)
+{
+	struct f_gether *eth;
+	struct f_gether_opts *opts;
+
+	/* f is embedded in eth, so look up opts before freeing eth */
+	opts = container_of(f->fi, struct f_gether_opts, func_inst);
+	eth = func_to_geth(f);
+	kfree(eth);
+
+	mutex_lock(&opts->lock);
+	opts->refcnt--;
+	mutex_unlock(&opts->lock);
+}
+
+/*
+ * unbind callback: reset the first string ID to 0 and free the function's
+ * descriptors. The f_gether object itself is freed by geth_free().
+ */
+static void geth_unbind(struct usb_configuration *c, struct usb_function *f)
+{
+	geth_string_defs[0].id = 0;
+	usb_free_all_descriptors(f);
+}
+
+/*
+ * Allocate one CDC subset function bound to function instance @fi.
+ *
+ * Returns the embedded usb_function, ERR_PTR(-ENOMEM) on allocation
+ * failure, or ERR_PTR(-EINVAL) when the host address cannot be exported
+ * in CDC format. A reference on the instance options is taken only once
+ * allocation can no longer fail.
+ */
+static struct usb_function *geth_alloc(struct usb_function_instance *fi)
+{
+	struct f_gether	*geth;
+	struct f_gether_opts *opts;
+	int status;
+
+	/* allocate and initialize one new instance */
+	geth = kzalloc(sizeof(*geth), GFP_KERNEL);
+	if (!geth)
+		return ERR_PTR(-ENOMEM);
+
+	opts = container_of(fi, struct f_gether_opts, func_inst);
+
+	mutex_lock(&opts->lock);
+	/* export host's Ethernet address in CDC format */
+	status = gether_get_host_addr_cdc(opts->net, geth->ethaddr,
+					  sizeof(geth->ethaddr));
+	if (status < 12) {
+		/* no reference taken yet, so nothing to undo on opts */
+		mutex_unlock(&opts->lock);
+		kfree(geth);
+		return ERR_PTR(-EINVAL);
+	}
+	opts->refcnt++;
+	/* NOTE(review): shared string table now points into this instance */
+	geth_string_defs[1].s = geth->ethaddr;
+
+	geth->port.ioport = netdev_priv(opts->net);
+	mutex_unlock(&opts->lock);
+	geth->port.cdc_filter = DEFAULT_FILTER;
+
+	geth->port.func.name = "cdc_subset";
+	geth->port.func.bind = geth_bind;
+	geth->port.func.unbind = geth_unbind;
+	geth->port.func.set_alt = geth_set_alt;
+	geth->port.func.disable = geth_disable;
+	geth->port.func.free_func = geth_free;
+
+	return &geth->port.func;
+}
+
+DECLARE_USB_FUNCTION_INIT(geth, geth_alloc_inst, geth_alloc);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("David Brownell");
+
+#endif

diff --git a/drivers/usb/gadget/f_uac2.c b/drivers/usb/gadget/f_uac2.c
index 03c1fb6..2f23566 100644
--- a/drivers/usb/gadget/f_uac2.c
+++ b/drivers/usb/gadget/f_uac2.c

@@ -90,6 +90,7 @@
 };
 
 struct uac2_rtd_params {
+	struct snd_uac2_chip *uac2; /* parent chip */
 	bool ep_enabled; /* if the ep is enabled */
 	/* Size of the ring buffer */
 	size_t dma_bytes;
@@ -169,18 +170,6 @@
 }
 
 static inline
-struct snd_uac2_chip *prm_to_uac2(struct uac2_rtd_params *r)
-{
-	struct snd_uac2_chip *uac2 = container_of(r,
-					struct snd_uac2_chip, c_prm);
-
-	if (&uac2->c_prm != r)
-		uac2 = container_of(r, struct snd_uac2_chip, p_prm);
-
-	return uac2;
-}
-
-static inline
 uint num_channels(uint chanmask)
 {
 	uint num = 0;
@@ -204,7 +193,7 @@
 	struct uac2_req *ur = req->context;
 	struct snd_pcm_substream *substream;
 	struct uac2_rtd_params *prm = ur->pp;
-	struct snd_uac2_chip *uac2 = prm_to_uac2(prm);
+	struct snd_uac2_chip *uac2 = prm->uac2;
 
 	/* i/f shutting down */
 	if (!prm->ep_enabled)
@@ -894,7 +883,7 @@
 static inline void
 free_ep(struct uac2_rtd_params *prm, struct usb_ep *ep)
 {
-	struct snd_uac2_chip *uac2 = prm_to_uac2(prm);
+	struct snd_uac2_chip *uac2 = prm->uac2;
 	int i;
 
 	prm->ep_enabled = false;
@@ -970,6 +959,9 @@
 	}
 	agdev->in_ep->driver_data = agdev;
 
+	uac2->p_prm.uac2 = uac2;
+	uac2->c_prm.uac2 = uac2;
+
 	hs_epout_desc.bEndpointAddress = fs_epout_desc.bEndpointAddress;
 	hs_epout_desc.wMaxPacketSize = fs_epout_desc.wMaxPacketSize;
 	hs_epin_desc.bEndpointAddress = fs_epin_desc.bEndpointAddress;

diff --git a/drivers/usb/gadget/f_uvc.c b/drivers/usb/gadget/f_uvc.c
index 38dcedd..5f91c7a 100644
--- a/drivers/usb/gadget/f_uvc.c
+++ b/drivers/usb/gadget/f_uvc.c

@@ -156,8 +156,6 @@
 	/* The wMaxPacketSize and bInterval values will be initialized from
 	 * module parameters.
 	 */
-	.wMaxPacketSize		= 0,
-	.bInterval		= 0,
 };
 
 static struct usb_endpoint_descriptor uvc_hs_streaming_ep __initdata = {
@@ -169,8 +167,6 @@
 	/* The wMaxPacketSize and bInterval values will be initialized from
 	 * module parameters.
 	 */
-	.wMaxPacketSize		= 0,
-	.bInterval		= 0,
 };
 
 static struct usb_endpoint_descriptor uvc_ss_streaming_ep __initdata = {
@@ -183,17 +179,14 @@
 	/* The wMaxPacketSize and bInterval values will be initialized from
 	 * module parameters.
 	 */
-	.wMaxPacketSize		= 0,
-	.bInterval		= 0,
 };
 
 static struct usb_ss_ep_comp_descriptor uvc_ss_streaming_comp __initdata = {
 	.bLength		= sizeof(uvc_ss_streaming_comp),
 	.bDescriptorType	= USB_DT_SS_ENDPOINT_COMP,
-	/* The following 3 values can be tweaked if necessary. */
-	.bMaxBurst		= 0,
-	.bmAttributes		= 0,
-	.wBytesPerInterval	= cpu_to_le16(1024),
+	/* The bMaxBurst, bmAttributes and wBytesPerInterval values will be
+	 * initialized from module parameters.
+	 */
 };
 
 static const struct usb_descriptor_header * const uvc_fs_streaming[] = {

diff --git a/drivers/usb/gadget/fotg210-udc.c b/drivers/usb/gadget/fotg210-udc.c
new file mode 100644
index 0000000..cce5535
--- /dev/null
+++ b/drivers/usb/gadget/fotg210-udc.c

@@ -0,0 +1,1219 @@
+/*
+ * FOTG210 UDC Driver supports Bulk transfer so far
+ *
+ * Copyright (C) 2013 Faraday Technology Corporation
+ *
+ * Author : Yuan-Hsin Chen <yhchen@faraday-tech.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ */
+
+#include <linux/dma-mapping.h>
+#include <linux/err.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/usb/ch9.h>
+#include <linux/usb/gadget.h>
+
+#include "fotg210.h"
+
+#define	DRIVER_DESC	"FOTG210 USB Device Controller Driver"
+#define	DRIVER_VERSION	"30-April-2013"
+
+/* driver name string */
+static const char udc_name[] = "fotg210_udc";
+/* endpoint name strings: "ep0" followed by "ep1".."ep4" */
+static const char * const fotg210_ep_name[] = {
+	"ep0", "ep1", "ep2", "ep3", "ep4"};
+
+/*
+ * Disable the FIFO interrupt for @ep by setting its bit in DMISGR1.
+ * The IN or OUT/short-packet bit is chosen from ep->dir_in; the bit index
+ * is ep->epnum - 1.
+ */
+static void fotg210_disable_fifo_int(struct fotg210_ep *ep)
+{
+	void __iomem *dmisgr1 = ep->fotg210->reg + FOTG210_DMISGR1;
+	u32 mask = ioread32(dmisgr1);
+
+	mask |= ep->dir_in ? DMISGR1_MF_IN_INT(ep->epnum - 1) :
+			     DMISGR1_MF_OUTSPK_INT(ep->epnum - 1);
+	iowrite32(mask, dmisgr1);
+}
+
+/*
+ * Enable the FIFO interrupt for @ep by clearing its bit in DMISGR1.
+ * The IN or OUT/short-packet bit is chosen from ep->dir_in; the bit index
+ * is ep->epnum - 1.
+ */
+static void fotg210_enable_fifo_int(struct fotg210_ep *ep)
+{
+	void __iomem *dmisgr1 = ep->fotg210->reg + FOTG210_DMISGR1;
+	u32 mask = ioread32(dmisgr1);
+
+	mask &= ep->dir_in ? ~DMISGR1_MF_IN_INT(ep->epnum - 1) :
+			     ~DMISGR1_MF_OUTSPK_INT(ep->epnum - 1);
+	iowrite32(mask, dmisgr1);
+}
+
+/* Set DCFESR_CX_DONE in DCFESR (read-modify-write). */
+static void fotg210_set_cxdone(struct fotg210_udc *fotg210)
+{
+	void __iomem *dcfesr = fotg210->reg + FOTG210_DCFESR;
+	u32 status = ioread32(dcfesr);
+
+	status |= DCFESR_CX_DONE;
+	iowrite32(status, dcfesr);
+}
+
+/*
+ * Complete @req on @ep with @status.
+ *
+ * The request is unlinked from its queue and its completion callback is
+ * invoked. Must be called with ep->fotg210->lock held: the lock is
+ * dropped around the callback and re-taken afterwards. If the gadget
+ * speed is USB_SPEED_UNKNOWN the request is reported as -ESHUTDOWN
+ * instead of @status.
+ */
+static void fotg210_done(struct fotg210_ep *ep, struct fotg210_request *req,
+			int status)
+{
+	list_del_init(&req->queue);
+
+	/* don't modify queue heads during completion callback */
+	if (ep->fotg210->gadget.speed == USB_SPEED_UNKNOWN)
+		req->req.status = -ESHUTDOWN;
+	else
+		req->req.status = status;
+
+	/* the callback runs unlocked, so the queue may change under us */
+	spin_unlock(&ep->fotg210->lock);
+	req->req.complete(&ep->ep, &req->req);
+	spin_lock(&ep->fotg210->lock);
+
+	if (ep->epnum) {
+		/* nothing left to transfer: stop FIFO interrupts for this ep */
+		if (list_empty(&ep->queue))
+			fotg210_disable_fifo_int(ep);
+	} else {
+		/* ep0: flag the control transfer as done */
+		fotg210_set_cxdone(ep->fotg210);
+	}
+}
+
+/*
+ * Set up the two-way mapping between endpoint @epnum and its FIFO, then
+ * enable that FIFO (bit index epnum - 1 in FIFOCF).
+ */
+static void fotg210_fifo_ep_mapping(struct fotg210_ep *ep, u32 epnum,
+				u32 dir_in)
+{
+	struct fotg210_udc *fotg210 = ep->fotg210;
+	u32 val;
+
+	/* The hardware needs the mapping programmed in both directions:
+	 * first the ep is mapped to a fifo, then the fifo to the ep.
+	 */
+
+	/* map a fifo to an ep */
+	val = ioread32(fotg210->reg + FOTG210_EPMAP);
+	val &= ~EPMAP_FIFONOMSK(epnum, dir_in);
+	val |= EPMAP_FIFONO(epnum, dir_in);
+	iowrite32(val, fotg210->reg + FOTG210_EPMAP);
+
+	/* map the ep to the fifo */
+	val = ioread32(fotg210->reg + FOTG210_FIFOMAP);
+	val &= ~FIFOMAP_EPNOMSK(epnum);
+	val |= FIFOMAP_EPNO(epnum);
+	iowrite32(val, fotg210->reg + FOTG210_FIFOMAP);
+
+	/* enable fifo */
+	val = ioread32(fotg210->reg + FOTG210_FIFOCF);
+	val |= FIFOCF_FIFO_EN(epnum - 1);
+	iowrite32(val, fotg210->reg + FOTG210_FIFOCF);
+}
+
+/*
+ * Set the direction bit (IN or OUT, per @dir_in) of FIFO @epnum - 1 in
+ * FIFOMAP. Bits are only ever set here, never cleared.
+ */
+static void fotg210_set_fifo_dir(struct fotg210_ep *ep, u32 epnum, u32 dir_in)
+{
+	void __iomem *fifomap = ep->fotg210->reg + FOTG210_FIFOMAP;
+	u32 map = ioread32(fifomap);
+
+	if (dir_in)
+		map |= FIFOMAP_DIRIN(epnum - 1);
+	else
+		map |= FIFOMAP_DIROUT(epnum - 1);
+	iowrite32(map, fifomap);
+}
+
+/*
+ * OR transfer type @type for FIFO @epnum - 1 into FIFOCF.
+ * NOTE(review): any previously programmed type bits are not cleared
+ * first; confirm the field is zero when this is called.
+ */
+static void fotg210_set_tfrtype(struct fotg210_ep *ep, u32 epnum, u32 type)
+{
+	void __iomem *fifocf = ep->fotg210->reg + FOTG210_FIFOCF;
+	u32 cfg = ioread32(fifocf);
+
+	cfg |= FIFOCF_TYPE(type, epnum - 1);
+	iowrite32(cfg, fifocf);
+}
+
+/*
+ * OR max packet size @mps into the IN or OUT (per @dir_in) max-packet-size
+ * register of endpoint @epnum.
+ * NOTE(review): the previous size field is not cleared first; confirm it
+ * is zero when this is called.
+ */
+static void fotg210_set_mps(struct fotg210_ep *ep, u32 epnum, u32 mps,
+				u32 dir_in)
+{
+	u32 offset = dir_in ? FOTG210_INEPMPSR(epnum) :
+				FOTG210_OUTEPMPSR(epnum);
+	void __iomem *mpsr = ep->fotg210->reg + offset;
+	u32 cur = ioread32(mpsr);
+
+	cur |= INOUTEPMPSR_MPS(mps);
+	iowrite32(cur, mpsr);
+}
+
+/*
+ * Program the hardware for @ep from the fields already stored in it
+ * (epnum, dir_in, type, maxpacket) and record @ep in the controller's
+ * endpoint table. @desc is not used here. Always returns 0.
+ */
+static int fotg210_config_ep(struct fotg210_ep *ep,
+		     const struct usb_endpoint_descriptor *desc)
+{
+	struct fotg210_udc *udc = ep->fotg210;
+
+	/* direction, type and packet size first, FIFO mapping/enable last */
+	fotg210_set_fifo_dir(ep, ep->epnum, ep->dir_in);
+	fotg210_set_tfrtype(ep, ep->epnum, ep->type);
+	fotg210_set_mps(ep, ep->epnum, ep->ep.maxpacket, ep->dir_in);
+	fotg210_fifo_ep_mapping(ep, ep->epnum, ep->dir_in);
+
+	udc->ep[ep->epnum] = ep;
+
+	return 0;
+}
+
+/*
+ * usb_ep_ops.enable: cache the endpoint number, type, direction and max
+ * packet size from @desc in the driver endpoint, then configure the
+ * hardware. Returns the result of fotg210_config_ep().
+ */
+static int fotg210_ep_enable(struct usb_ep *_ep,
+			  const struct usb_endpoint_descriptor *desc)
+{
+	struct fotg210_ep *ep = container_of(_ep, struct fotg210_ep, ep);
+
+	ep->desc = desc;
+	ep->epnum = usb_endpoint_num(desc);
+	ep->type = usb_endpoint_type(desc);
+	ep->dir_in = usb_endpoint_dir_in(desc);
+	ep->ep.maxpacket = usb_endpoint_maxp(desc);
+
+	return fotg210_config_ep(ep, desc);
+}
+
+/*
+ * Pulse INOUTEPMPSR_RESET_TSEQ for endpoint @epnum.
+ *
+ * The direction (and therefore the register) is taken from
+ * fotg210->ep[epnum], so that table entry must be valid.
+ */
+static void fotg210_reset_tseq(struct fotg210_udc *fotg210, u8 epnum)
+{
+	struct fotg210_ep *ep = fotg210->ep[epnum];
+	u32 value;
+	void __iomem *reg;
+
+	reg = (ep->dir_in) ?
+		fotg210->reg + FOTG210_INEPMPSR(epnum) :
+		fotg210->reg + FOTG210_OUTEPMPSR(epnum);
+
+	/* Note: Driver needs to set and clear INOUTEPMPSR_RESET_TSEQ
+	 *	 bit. The controller does not clear this bit by itself.
+	 */
+
+	value = ioread32(reg);
+	value |= INOUTEPMPSR_RESET_TSEQ;
+	iowrite32(value, reg);
+
+	value = ioread32(reg);
+	value &= ~INOUTEPMPSR_RESET_TSEQ;
+	iowrite32(value, reg);
+}
+
+/*
+ * Reset the software state of a non-control endpoint and its hardware
+ * sequence state. Does nothing for endpoint 0. Always returns 0.
+ */
+static int fotg210_ep_release(struct fotg210_ep *ep)
+{
+	if (!ep->epnum)
+		return 0;
+
+	/*
+	 * Reset the sequence while epnum still names this endpoint;
+	 * clearing epnum first would make the reset hit endpoint 0.
+	 */
+	fotg210_reset_tseq(ep->fotg210, ep->epnum);
+
+	ep->epnum = 0;
+	ep->stall = 0;
+	ep->wedged = 0;
+
+	return 0;
+}
+
+/*
+ * usb_ep_ops.disable: complete every queued request with -ECONNRESET,
+ * then release the endpoint. Returns the result of fotg210_ep_release().
+ */
+static int fotg210_ep_disable(struct usb_ep *_ep)
+{
+	struct fotg210_ep *ep;
+	struct fotg210_request *req;
+	unsigned long flags;
+
+	BUG_ON(!_ep);
+
+	ep = container_of(_ep, struct fotg210_ep, ep);
+
+	/*
+	 * Inspect and pop the queue under the lock. fotg210_done() drops
+	 * and re-takes it around the completion callback, so the queue is
+	 * re-checked on every iteration.
+	 */
+	spin_lock_irqsave(&ep->fotg210->lock, flags);
+	while (!list_empty(&ep->queue)) {
+		req = list_entry(ep->queue.next,
+			struct fotg210_request, queue);
+		fotg210_done(ep, req, -ECONNRESET);
+	}
+	spin_unlock_irqrestore(&ep->fotg210->lock, flags);
+
+	return fotg210_ep_release(ep);
+}
+
+/*
+ * usb_ep_ops.alloc_request: allocate a zeroed driver request with an
+ * empty queue link. Returns the embedded usb_request, or NULL if the
+ * allocation fails. @_ep is not used.
+ */
+static struct usb_request *fotg210_ep_alloc_request(struct usb_ep *_ep,
+						gfp_t gfp_flags)
+{
+	struct fotg210_request *request;
+
+	request = kzalloc(sizeof(*request), gfp_flags);
+	if (request == NULL)
+		return NULL;
+
+	INIT_LIST_HEAD(&request->queue);
+
+	return &request->req;
+}
+
+/*
+ * usb_ep_ops.free_request: free the driver request that embeds @_req.
+ * @_ep is not used.
+ */
+static void fotg210_ep_free_request(struct usb_ep *_ep,
+					struct usb_request *_req)
+{
+	struct fotg210_request *request =
+		container_of(_req, struct fotg210_request, req);
+
+	kfree(request);
+}
+
+/*
+ * Program and start one DMA transfer of @len bytes at bus address @d
+ * for @ep. The direction comes from ep->dir_in; the target is FIFO
+ * epnum - 1, or the control FIFO for endpoint 0.
+ */
+static void fotg210_enable_dma(struct fotg210_ep *ep,
+			      dma_addr_t d, u32 len)
+{
+	u32 value;
+	struct fotg210_udc *fotg210 = ep->fotg210;
+
+	/* set transfer length and direction */
+	value = ioread32(fotg210->reg + FOTG210_DMACPSR1);
+	value &= ~(DMACPSR1_DMA_LEN(0xFFFF) | DMACPSR1_DMA_TYPE(1));
+	value |= DMACPSR1_DMA_LEN(len) | DMACPSR1_DMA_TYPE(ep->dir_in);
+	iowrite32(value, fotg210->reg + FOTG210_DMACPSR1);
+
+	/* set device DMA target FIFO number */
+	value = ioread32(fotg210->reg + FOTG210_DMATFNR);
+	if (ep->epnum)
+		value |= DMATFNR_ACC_FN(ep->epnum - 1);
+	else
+		value |= DMATFNR_ACC_CXF;
+	iowrite32(value, fotg210->reg + FOTG210_DMATFNR);
+
+	/* set DMA memory address */
+	iowrite32(d, fotg210->reg + FOTG210_DMACPSR2);
+
+	/* enable MDMA_ERROR and MDMA_CMPLT interrupt (clear their mask bits) */
+	value = ioread32(fotg210->reg + FOTG210_DMISGR2);
+	value &= ~(DMISGR2_MDMA_CMPLT | DMISGR2_MDMA_ERROR);
+	iowrite32(value, fotg210->reg + FOTG210_DMISGR2);
+
+	/* start DMA */
+	value = ioread32(fotg210->reg + FOTG210_DMACPSR1);
+	value |= DMACPSR1_DMA_START;
+	iowrite32(value, fotg210->reg + FOTG210_DMACPSR1);
+}
+
+/* Write DMATFNR_DISDMA to DMATFNR, replacing the whole register value. */
+static void fotg210_disable_dma(struct fotg210_ep *ep)
+{
+	void __iomem *dmatfnr = ep->fotg210->reg + FOTG210_DMATFNR;
+
+	iowrite32(DMATFNR_DISDMA, dmatfnr);
+}
+
+/*
+ * Busy-wait until the DMA started for @ep completes.
+ *
+ * Polls DISGR2 with no timeout. On completion the DMA_CMPLT bit is
+ * written back cleared. If a USB reset or DMA error is seen instead, the
+ * DMA is aborted and the endpoint's FIFO (or the control FIFO for
+ * endpoint 0) is reset.
+ */
+static void fotg210_wait_dma_done(struct fotg210_ep *ep)
+{
+	u32 value;
+
+	do {
+		value = ioread32(ep->fotg210->reg + FOTG210_DISGR2);
+		if ((value & DISGR2_USBRST_INT) ||
+		    (value & DISGR2_DMA_ERROR))
+			goto dma_reset;
+	} while (!(value & DISGR2_DMA_CMPLT));
+
+	/* normal completion: write the status back with DMA_CMPLT cleared */
+	value &= ~DISGR2_DMA_CMPLT;
+	iowrite32(value, ep->fotg210->reg + FOTG210_DISGR2);
+	return;
+
+dma_reset:
+	/* abort the transfer in flight */
+	value = ioread32(ep->fotg210->reg + FOTG210_DMACPSR1);
+	value |= DMACPSR1_DMA_ABORT;
+	iowrite32(value, ep->fotg210->reg + FOTG210_DMACPSR1);
+
+	/* reset fifo */
+	if (ep->epnum) {
+		value = ioread32(ep->fotg210->reg +
+				FOTG210_FIBCR(ep->epnum - 1));
+		value |= FIBCR_FFRST;
+		iowrite32(value, ep->fotg210->reg +
+				FOTG210_FIBCR(ep->epnum - 1));
+	} else {
+		value = ioread32(ep->fotg210->reg + FOTG210_DCFESR);
+		value |= DCFESR_CX_CLR;
+		iowrite32(value, ep->fotg210->reg + FOTG210_DCFESR);
+	}
+}
+
+/*
+ * Synchronously transfer the next chunk of @req on @ep by DMA.
+ *
+ * Chunk selection:
+ *  - IN endpoint (epnum != 0): the whole request buffer;
+ *  - OUT endpoint (epnum != 0): the byte count reported by the FIFO,
+ *    stored at the current offset of the request buffer;
+ *  - endpoint 0: the remaining bytes, capped at the max packet size.
+ *
+ * req->req.actual is advanced by the chunk length. If the buffer cannot
+ * be mapped, an error is logged and the request is left untouched.
+ */
+static void fotg210_start_dma(struct fotg210_ep *ep,
+			struct fotg210_request *req)
+{
+	enum dma_data_direction dir =
+		ep->dir_in ? DMA_TO_DEVICE : DMA_FROM_DEVICE;
+	dma_addr_t d;
+	u8 *buffer;
+	u32 length;
+
+	if (ep->epnum) {
+		if (ep->dir_in) {
+			buffer = req->req.buf;
+			length = req->req.length;
+		} else {
+			buffer = req->req.buf + req->req.actual;
+			length = ioread32(ep->fotg210->reg +
+					FOTG210_FIBCR(ep->epnum - 1));
+			length &= FIBCR_BCFX;
+		}
+	} else {
+		u32 remaining = req->req.length - req->req.actual;
+
+		buffer = req->req.buf + req->req.actual;
+		/* the final chunk is what is left, not the total length */
+		if (remaining > ep->ep.maxpacket)
+			length = ep->ep.maxpacket;
+		else
+			length = remaining;
+	}
+
+	/* NOTE(review): maps against a NULL device; confirm this is intended */
+	d = dma_map_single(NULL, buffer, length, dir);
+
+	if (dma_mapping_error(NULL, d)) {
+		pr_err("dma_mapping_error\n");
+		return;
+	}
+
+	dma_sync_single_for_device(NULL, d, length, dir);
+
+	fotg210_enable_dma(ep, d, length);
+
+	/* check if dma is done */
+	fotg210_wait_dma_done(ep);
+
+	fotg210_disable_dma(ep);
+
+	/* update actual transfer length */
+	req->req.actual += length;
+
+	/* unmap with the same direction the buffer was mapped with */
+	dma_unmap_single(NULL, d, length, dir);
+}
+
+/*
+ * Start processing @req on endpoint 0.
+ *
+ *  - zero-length request: completed immediately with status 0;
+ *  - IN: one chunk is sent by DMA, and the request is completed if it has
+ *    been fully sent or if less than one max-size packet has been sent;
+ *  - OUT: the control-OUT interrupt is unmasked in DMISGR0.
+ */
+static void fotg210_ep0_queue(struct fotg210_ep *ep,
+				struct fotg210_request *req)
+{
+	u32 value;
+
+	if (!req->req.length) {
+		fotg210_done(ep, req, 0);
+		return;
+	}
+
+	/* req->req.length is known to be non-zero from here on */
+	if (ep->dir_in) { /* if IN */
+		fotg210_start_dma(ep, req);
+		if ((req->req.length == req->req.actual) ||
+		    (req->req.actual < ep->ep.maxpacket))
+			fotg210_done(ep, req, 0);
+	} else { /* OUT */
+		value = ioread32(ep->fotg210->reg + FOTG210_DMISGR0);
+		value &= ~DMISGR0_MCX_OUT_INT;
+		iowrite32(value, ep->fotg210->reg + FOTG210_DMISGR0);
+	}
+}
+
+/*
+ * usb_ep_ops.queue: append @_req to the endpoint queue.
+ *
+ * Endpoint 0 requests are started right away. For other endpoints the
+ * FIFO interrupt is enabled if the queue was empty and the endpoint is
+ * not stalled. @gfp_flags is not used.
+ *
+ * Returns 0, or -ESHUTDOWN if the gadget speed is USB_SPEED_UNKNOWN.
+ */
+static int fotg210_ep_queue(struct usb_ep *_ep, struct usb_request *_req,
+				gfp_t gfp_flags)
+{
+	struct fotg210_ep *ep = container_of(_ep, struct fotg210_ep, ep);
+	struct fotg210_request *req =
+		container_of(_req, struct fotg210_request, req);
+	unsigned long flags;
+	int was_idle;
+
+	if (ep->fotg210->gadget.speed == USB_SPEED_UNKNOWN)
+		return -ESHUTDOWN;
+
+	spin_lock_irqsave(&ep->fotg210->lock, flags);
+
+	/* sample before linking the new request in */
+	was_idle = list_empty(&ep->queue);
+
+	list_add_tail(&req->queue, &ep->queue);
+
+	req->req.actual = 0;
+	req->req.status = -EINPROGRESS;
+
+	if (ep->epnum == 0) {
+		/* ep0 */
+		fotg210_ep0_queue(ep, req);
+	} else if (was_idle && !ep->stall) {
+		fotg210_enable_fifo_int(ep);
+	}
+
+	spin_unlock_irqrestore(&ep->fotg210->lock, flags);
+
+	return 0;
+}
+
+/*
+ * usb_ep_ops.dequeue: cancel @_req if it is queued on @_ep.
+ *
+ * The request is completed with -ECONNRESET only when it is actually
+ * found on this endpoint's queue, so a request that already completed or
+ * belongs to another endpoint is never completed a second time.
+ *
+ * Returns 0 on success, or -EINVAL if the request is not queued here.
+ */
+static int fotg210_ep_dequeue(struct usb_ep *_ep, struct usb_request *_req)
+{
+	struct fotg210_ep *ep;
+	struct fotg210_request *req;
+	struct fotg210_request *iter;
+	unsigned long flags;
+	int ret = -EINVAL;
+
+	ep = container_of(_ep, struct fotg210_ep, ep);
+	req = container_of(_req, struct fotg210_request, req);
+
+	spin_lock_irqsave(&ep->fotg210->lock, flags);
+	list_for_each_entry(iter, &ep->queue, queue) {
+		if (iter == req) {
+			ret = 0;
+			break;
+		}
+	}
+	if (!ret)
+		fotg210_done(ep, req, -ECONNRESET);
+	spin_unlock_irqrestore(&ep->fotg210->lock, flags);
+
+	return ret;
+}
+
+/*
+ * Set the stall bit of @ep in its IN or OUT max-packet-size register.
+ *
+ * For IN endpoints this first busy-waits, with no timeout, until the
+ * FIFO (index epnum - 1) is reported empty in DCFESR.
+ */
+static void fotg210_set_epnstall(struct fotg210_ep *ep)
+{
+	struct fotg210_udc *fotg210 = ep->fotg210;
+	u32 value;
+	void __iomem *reg;
+
+	/* check if IN FIFO is empty before stall */
+	if (ep->dir_in) {
+		do {
+			value = ioread32(fotg210->reg + FOTG210_DCFESR);
+		} while (!(value & DCFESR_FIFO_EMPTY(ep->epnum - 1)));
+	}
+
+	reg = (ep->dir_in) ?
+		fotg210->reg + FOTG210_INEPMPSR(ep->epnum) :
+		fotg210->reg + FOTG210_OUTEPMPSR(ep->epnum);
+	value = ioread32(reg);
+	value |= INOUTEPMPSR_STL_EP;
+	iowrite32(value, reg);
+}
+
+/* Clear the stall bit of @ep in its IN or OUT max-packet-size register. */
+static void fotg210_clear_epnstall(struct fotg210_ep *ep)
+{
+	struct fotg210_udc *udc = ep->fotg210;
+	void __iomem *mpsr;
+	u32 cur;
+
+	if (ep->dir_in)
+		mpsr = udc->reg + FOTG210_INEPMPSR(ep->epnum);
+	else
+		mpsr = udc->reg + FOTG210_OUTEPMPSR(ep->epnum);
+
+	cur = ioread32(mpsr);
+	cur &= ~INOUTEPMPSR_STL_EP;
+	iowrite32(cur, mpsr);
+}
+
+/*
+ * Halt (@value != 0) or un-halt (@value == 0) an endpoint under the
+ * controller lock.
+ *
+ * Halting stalls the endpoint and, when @wedge is set, also marks it
+ * wedged. Un-halting resets the sequence state, clears the stall and both
+ * flags, and re-enables the FIFO interrupt if requests are still queued.
+ * Always returns 0.
+ */
+static int fotg210_set_halt_and_wedge(struct usb_ep *_ep, int value, int wedge)
+{
+	struct fotg210_ep *ep = container_of(_ep, struct fotg210_ep, ep);
+	struct fotg210_udc *udc = ep->fotg210;
+	unsigned long flags;
+
+	spin_lock_irqsave(&udc->lock, flags);
+
+	if (!value) {
+		fotg210_reset_tseq(udc, ep->epnum);
+		fotg210_clear_epnstall(ep);
+		ep->stall = 0;
+		ep->wedged = 0;
+		/* restart servicing of anything queued while halted */
+		if (!list_empty(&ep->queue))
+			fotg210_enable_fifo_int(ep);
+	} else {
+		fotg210_set_epnstall(ep);
+		ep->stall = 1;
+		if (wedge)
+			ep->wedged = 1;
+	}
+
+	spin_unlock_irqrestore(&udc->lock, flags);
+	return 0;
+}
+
+/* usb_ep_ops.set_halt: halt or un-halt the endpoint without wedging it. */
+static int fotg210_ep_set_halt(struct usb_ep *_ep, int value)
+{
+	return fotg210_set_halt_and_wedge(_ep, value, 0);
+}
+
+/* usb_ep_ops.set_wedge: halt the endpoint and mark it wedged. */
+static int fotg210_ep_set_wedge(struct usb_ep *_ep)
+{
+	return fotg210_set_halt_and_wedge(_ep, 1, 1);
+}
+
+/* usb_ep_ops.fifo_flush: intentionally empty, nothing is flushed. */
+static void fotg210_ep_fifo_flush(struct usb_ep *_ep)
+{
+}
+
+/* Endpoint operations table wiring the usb_ep_ops hooks to this driver. */
+static struct usb_ep_ops fotg210_ep_ops = {
+	.enable		= fotg210_ep_enable,
+	.disable	= fotg210_ep_disable,
+
+	.alloc_request	= fotg210_ep_alloc_request,
+	.free_request	= fotg210_ep_free_request,
+
+	.queue		= fotg210_ep_queue,
+	.dequeue	= fotg210_ep_dequeue,
+
+	.set_halt	= fotg210_ep_set_halt,
+	.fifo_flush	= fotg210_ep_fifo_flush,
+	.set_wedge	= fotg210_ep_set_wedge,
+};
+
+/* Clear the EP1..EP4 bits in the TX0BYTE register (read-modify-write). */
+static void fotg210_clear_tx0byte(struct fotg210_udc *fotg210)
+{
+	void __iomem *tx0byte = fotg210->reg + FOTG210_TX0BYTE;
+	u32 bits = ioread32(tx0byte);
+
+	bits &= ~(TX0BYTE_EP1 | TX0BYTE_EP2 | TX0BYTE_EP3
+		   | TX0BYTE_EP4);
+	iowrite32(bits, tx0byte);
+}
+
+/* Clear the EP1..EP4 bits of the RX0BYTE register, keeping the other bits. */
+static void fotg210_clear_rx0byte(struct fotg210_udc *fotg210)
+{
+	void __iomem *rx0 = fotg210->reg + FOTG210_RX0BYTE;
+	const u32 ep_bits = RX0BYTE_EP1 | RX0BYTE_EP2 | RX0BYTE_EP3 |
+			    RX0BYTE_EP4;
+
+	iowrite32(ioread32(rx0) & ~ep_bits, rx0);
+}
+
+/*
+ * Read the 8-byte SETUP packet into @buffer.
+ *
+ * The CX FIFO is selected through DMATFNR, two 32-bit words are read from
+ * the CXPORT register, and the selection is dropped again.  Each word is
+ * unpacked least-significant byte first, so the byte order in @buffer does
+ * not depend on CPU endianness.
+ *
+ * @buffer must have room for 8 bytes.
+ */
+static void fotg210_rdsetupp(struct fotg210_udc *fotg210,
+		   u8 *buffer)
+{
+	/* a SETUP packet is always 8 bytes, i.e. exactly two port reads */
+	const unsigned int setup_words = 8 / sizeof(u32);
+	unsigned int i;
+
+	iowrite32(DMATFNR_ACC_CXF, fotg210->reg + FOTG210_DMATFNR);
+
+	for (i = 0; i < setup_words; i++) {
+		u32 data = ioread32(fotg210->reg + FOTG210_CXPORT);
+
+		buffer[0] = data & 0xFF;
+		buffer[1] = (data >> 8) & 0xFF;
+		buffer[2] = (data >> 16) & 0xFF;
+		buffer[3] = (data >> 24) & 0xFF;
+		buffer += 4;
+	}
+
+	iowrite32(DMATFNR_DISDMA, fotg210->reg + FOTG210_DMATFNR);
+}
+
+/* Set the DAR_AFT_CONF bit in the device address register. */
+static void fotg210_set_configuration(struct fotg210_udc *fotg210)
+{
+	void __iomem *dar = fotg210->reg + FOTG210_DAR;
+
+	iowrite32(ioread32(dar) | DAR_AFT_CONF, dar);
+}
+
+/*
+ * Program the 7-bit USB device address into the low bits of the DAR
+ * register, preserving the remaining bits (e.g. DAR_AFT_CONF).
+ *
+ * The address field is cleared before the new value is merged in.
+ * Otherwise a previously programmed address would be OR-ed together with
+ * the new one.
+ */
+static void fotg210_set_dev_addr(struct fotg210_udc *fotg210, u32 addr)
+{
+	u32 value = ioread32(fotg210->reg + FOTG210_DAR);
+
+	value &= ~0x7F;
+	value |= (addr & 0x7F);
+	iowrite32(value, fotg210->reg + FOTG210_DAR);
+}
+
+/* Set the DCFESR_CX_STL bit (control endpoint stall) in DCFESR. */
+static void fotg210_set_cxstall(struct fotg210_udc *fotg210)
+{
+	void __iomem *dcfesr = fotg210->reg + FOTG210_DCFESR;
+
+	iowrite32(ioread32(dcfesr) | DCFESR_CX_STL, dcfesr);
+}
+
+/*
+ * Reject the current control request: stall the control endpoint and log
+ * an error.
+ */
+static void fotg210_request_error(struct fotg210_udc *fotg210)
+{
+	fotg210_set_cxstall(fotg210);
+	pr_err("request error!!\n");
+}
+
+/*
+ * Handle a standard SET_ADDRESS request.
+ *
+ * wValue is little-endian on the wire, so it is converted to CPU byte
+ * order before it is range-checked and programmed.  Values of 0x0100 and
+ * above are rejected with a stall.  Otherwise the address is written and
+ * the control transfer is completed.
+ */
+static void fotg210_set_address(struct fotg210_udc *fotg210,
+				struct usb_ctrlrequest *ctrl)
+{
+	u16 addr = le16_to_cpu(ctrl->wValue);
+
+	if (addr >= 0x0100) {
+		fotg210_request_error(fotg210);
+	} else {
+		fotg210_set_dev_addr(fotg210, addr);
+		fotg210_set_cxdone(fotg210);
+	}
+}
+
+/*
+ * Handle a standard SET_FEATURE request.
+ *
+ * Device and interface recipients are simply acknowledged.  For an
+ * endpoint recipient the addressed endpoint is stalled (the control
+ * endpoint when the endpoint number is 0) and the request is acknowledged.
+ *
+ * The endpoint number comes from the host and can be 0..15, while only
+ * FOTG210_MAX_NUM_EP endpoints exist.  Out-of-range numbers are rejected
+ * with a stall instead of indexing past the end of fotg210->ep[].
+ */
+static void fotg210_set_feature(struct fotg210_udc *fotg210,
+				struct usb_ctrlrequest *ctrl)
+{
+	switch (ctrl->bRequestType & USB_RECIP_MASK) {
+	case USB_RECIP_DEVICE:
+	case USB_RECIP_INTERFACE:
+		fotg210_set_cxdone(fotg210);
+		break;
+	case USB_RECIP_ENDPOINT: {
+		u8 epnum;
+
+		epnum = le16_to_cpu(ctrl->wIndex) & USB_ENDPOINT_NUMBER_MASK;
+		if (epnum >= FOTG210_MAX_NUM_EP) {
+			fotg210_request_error(fotg210);
+			break;
+		}
+		if (epnum)
+			fotg210_set_epnstall(fotg210->ep[epnum]);
+		else
+			fotg210_set_cxstall(fotg210);
+		fotg210_set_cxdone(fotg210);
+		break;
+	}
+	default:
+		fotg210_request_error(fotg210);
+		break;
+	}
+}
+
+/*
+ * Handle a standard CLEAR_FEATURE request.
+ *
+ * Device and interface recipients are simply acknowledged.  For a non-zero
+ * endpoint number, a stalled endpoint is un-halted unless it is wedged; in
+ * every valid case the request is then acknowledged.
+ *
+ * Called from fotg210_irq() with fotg210->lock held.
+ * fotg210_set_halt_and_wedge() takes that lock itself, so it is released
+ * around the call (the same pattern fotg210_get_status() uses around
+ * fotg210_ep_queue()).
+ *
+ * wIndex is little-endian and host-controlled.  It is converted to CPU
+ * order and the endpoint number is bounds-checked before fotg210->ep[] is
+ * indexed.
+ */
+static void fotg210_clear_feature(struct fotg210_udc *fotg210,
+				struct usb_ctrlrequest *ctrl)
+{
+	switch (ctrl->bRequestType & USB_RECIP_MASK) {
+	case USB_RECIP_DEVICE:
+	case USB_RECIP_INTERFACE:
+		fotg210_set_cxdone(fotg210);
+		break;
+	case USB_RECIP_ENDPOINT: {
+		u8 epnum;
+
+		epnum = le16_to_cpu(ctrl->wIndex) & USB_ENDPOINT_NUMBER_MASK;
+		if (epnum >= FOTG210_MAX_NUM_EP) {
+			fotg210_request_error(fotg210);
+			break;
+		}
+		if (epnum) {
+			struct fotg210_ep *ep = fotg210->ep[epnum];
+
+			/* a wedged endpoint stays stalled; just acknowledge */
+			if (!ep->wedged && ep->stall) {
+				spin_unlock(&fotg210->lock);
+				fotg210_set_halt_and_wedge(&ep->ep, 0, 0);
+				spin_lock(&fotg210->lock);
+			}
+		}
+		fotg210_set_cxdone(fotg210);
+		break;
+	}
+	default:
+		fotg210_request_error(fotg210);
+		break;
+	}
+}
+
+/*
+ * Report whether a non-control endpoint is stalled.
+ * Returns 1 when INOUTEPMPSR_STL_EP is set in the endpoint's IN or OUT
+ * max-packet-size register (chosen from ep->dir_in), 0 otherwise.
+ */
+static int fotg210_is_epnstall(struct fotg210_ep *ep)
+{
+	void __iomem *mps_reg = ep->fotg210->reg;
+
+	if (ep->dir_in)
+		mps_reg += FOTG210_INEPMPSR(ep->epnum);
+	else
+		mps_reg += FOTG210_OUTEPMPSR(ep->epnum);
+
+	if (ioread32(mps_reg) & INOUTEPMPSR_STL_EP)
+		return 1;
+	return 0;
+}
+
+/*
+ * Handle a standard GET_STATUS request by queueing a 2-byte reply on ep0.
+ *
+ * Device: reports self-powered.  Interface: reports 0.  Endpoint (number
+ * 1..FOTG210_MAX_NUM_EP-1): reports the halt bit.  Endpoint number 0,
+ * out-of-range endpoint numbers and unknown recipients are rejected with a
+ * stall and no reply is queued.
+ *
+ * ep0_data is a __le16 sent as-is on the wire, so values are stored with
+ * cpu_to_le16(); wIndex is converted with le16_to_cpu() before use.
+ *
+ * Called from fotg210_irq() with fotg210->lock held; the lock is dropped
+ * around fotg210_ep_queue().  Because this runs in interrupt context the
+ * allocation flag passed down must be GFP_ATOMIC.
+ */
+static void fotg210_get_status(struct fotg210_udc *fotg210,
+				struct usb_ctrlrequest *ctrl)
+{
+	u8 epnum;
+
+	switch (ctrl->bRequestType & USB_RECIP_MASK) {
+	case USB_RECIP_DEVICE:
+		fotg210->ep0_data = cpu_to_le16(1 << USB_DEVICE_SELF_POWERED);
+		break;
+	case USB_RECIP_INTERFACE:
+		fotg210->ep0_data = cpu_to_le16(0);
+		break;
+	case USB_RECIP_ENDPOINT:
+		epnum = le16_to_cpu(ctrl->wIndex) & USB_ENDPOINT_NUMBER_MASK;
+		if (!epnum || epnum >= FOTG210_MAX_NUM_EP) {
+			fotg210_request_error(fotg210);
+			return;		/* stalled: do not queue a reply */
+		}
+		fotg210->ep0_data =
+			cpu_to_le16(fotg210_is_epnstall(fotg210->ep[epnum])
+				    << USB_ENDPOINT_HALT);
+		break;
+
+	default:
+		fotg210_request_error(fotg210);
+		return;		/* exit */
+	}
+
+	fotg210->ep0_req->buf = &fotg210->ep0_data;
+	fotg210->ep0_req->length = 2;
+
+	spin_unlock(&fotg210->lock);
+	fotg210_ep_queue(fotg210->gadget.ep0, fotg210->ep0_req, GFP_ATOMIC);
+	spin_lock(&fotg210->lock);
+}
+
+/*
+ * Read a SETUP packet into @ctrl and dispatch the standard requests this
+ * driver handles itself.
+ *
+ * Also records the transfer direction on ep0 and, the first time through,
+ * latches the negotiated speed from the DMCR register.
+ *
+ * Returns 1 when the request must be forwarded to the gadget driver's
+ * setup() callback (SET_CONFIGURATION, unhandled standard requests and all
+ * non-standard requests), 0 when it was fully handled here.
+ */
+static int fotg210_setup_packet(struct fotg210_udc *fotg210,
+				struct usb_ctrlrequest *ctrl)
+{
+	fotg210_rdsetupp(fotg210, (u8 *)ctrl);
+
+	fotg210->ep[0]->dir_in = ctrl->bRequestType & USB_DIR_IN;
+
+	if (fotg210->gadget.speed == USB_SPEED_UNKNOWN) {
+		u32 dmcr = ioread32(fotg210->reg + FOTG210_DMCR);
+
+		if (dmcr & DMCR_HS_EN)
+			fotg210->gadget.speed = USB_SPEED_HIGH;
+		else
+			fotg210->gadget.speed = USB_SPEED_FULL;
+	}
+
+	/* class and vendor requests always go to the gadget driver */
+	if ((ctrl->bRequestType & USB_TYPE_MASK) != USB_TYPE_STANDARD)
+		return 1;
+
+	switch (ctrl->bRequest) {
+	case USB_REQ_GET_STATUS:
+		fotg210_get_status(fotg210, ctrl);
+		return 0;
+	case USB_REQ_CLEAR_FEATURE:
+		fotg210_clear_feature(fotg210, ctrl);
+		return 0;
+	case USB_REQ_SET_FEATURE:
+		fotg210_set_feature(fotg210, ctrl);
+		return 0;
+	case USB_REQ_SET_ADDRESS:
+		fotg210_set_address(fotg210, ctrl);
+		return 0;
+	case USB_REQ_SET_CONFIGURATION:
+		/* update the hardware, then let the gadget driver see it too */
+		fotg210_set_configuration(fotg210);
+		return 1;
+	default:
+		return 1;
+	}
+}
+
+/*
+ * Service a control OUT interrupt: run the transfer for the request at the
+ * head of the ep0 queue and complete it once less than one max-size packet
+ * remains.  Logs an error if ep0 has nothing queued or is in the IN
+ * direction.
+ */
+static void fotg210_ep0out(struct fotg210_udc *fotg210)
+{
+	struct fotg210_ep *ep = fotg210->ep[0];
+	struct fotg210_request *head;
+
+	if (list_empty(&ep->queue) || ep->dir_in) {
+		pr_err("%s : empty queue\n", __func__);
+		return;
+	}
+
+	head = list_first_entry(&ep->queue, struct fotg210_request, queue);
+
+	if (head->req.length)
+		fotg210_start_dma(ep, head);
+
+	if ((head->req.length - head->req.actual) < ep->ep.maxpacket)
+		fotg210_done(ep, head, 0);
+}
+
+/*
+ * Service a control IN interrupt: run the transfer for the request at the
+ * head of the ep0 queue and complete it once less than one max-size packet
+ * remains.  When nothing is queued, or ep0 is in the OUT direction, the
+ * control transfer is marked done instead.
+ */
+static void fotg210_ep0in(struct fotg210_udc *fotg210)
+{
+	struct fotg210_ep *ep = fotg210->ep[0];
+	struct fotg210_request *head;
+
+	if (list_empty(&ep->queue) || !ep->dir_in) {
+		fotg210_set_cxdone(fotg210);
+		return;
+	}
+
+	head = list_first_entry(&ep->queue, struct fotg210_request, queue);
+
+	if (head->req.length)
+		fotg210_start_dma(ep, head);
+
+	if ((head->req.length - head->req.actual) < ep->ep.maxpacket)
+		fotg210_done(ep, head, 0);
+}
+
+/* Clear the CX command-abort bit in interrupt source group 0 (DISGR0). */
+static void fotg210_clear_comabt_int(struct fotg210_udc *fotg210)
+{
+	void __iomem *disgr0 = fotg210->reg + FOTG210_DISGR0;
+
+	iowrite32(ioread32(disgr0) & ~DISGR0_CX_COMABT_INT, disgr0);
+}
+
+/*
+ * Service an IN FIFO interrupt for @ep: start the transfer for the request
+ * at the head of the queue (skipped for zero-length requests) and complete
+ * that request with status 0.
+ */
+static void fotg210_in_fifo_handler(struct fotg210_ep *ep)
+{
+	struct fotg210_request *head;
+
+	head = list_first_entry(&ep->queue, struct fotg210_request, queue);
+
+	if (head->req.length != 0)
+		fotg210_start_dma(ep, head);
+
+	fotg210_done(ep, head, 0);
+}
+
+/*
+ * Service an OUT (or short-packet) FIFO interrupt for @ep: run the
+ * transfer for the request at the head of the queue, then complete it when
+ * the buffer is full or fewer bytes than one max-size packet have been
+ * received in total.
+ */
+static void fotg210_out_fifo_handler(struct fotg210_ep *ep)
+{
+	struct fotg210_request *head;
+	int finished;
+
+	head = list_first_entry(&ep->queue, struct fotg210_request, queue);
+
+	fotg210_start_dma(ep, head);
+
+	/* evaluated after the transfer so req.actual is up to date */
+	finished = head->req.length == head->req.actual ||
+		   head->req.actual < ep->ep.maxpacket;
+	if (finished)
+		fotg210_done(ep, head, 0);
+}
+
+/*
+ * Interrupt handler for the device controller.
+ *
+ * Reads the interrupt group register, drops the masked groups, and then
+ * services the three source groups in the order 2, 0, 1:
+ *   group 2 - bus/device events (reset, suspend, resume, iso errors,
+ *             zero-length packets, DMA error): acknowledged and logged
+ *   group 0 - control endpoint (CX) events, including SETUP packets
+ *   group 1 - per-FIFO IN/OUT/short-packet events for ep1..ep4
+ *
+ * fotg210->lock is held while the groups are processed and is released
+ * only around the gadget driver's setup() callback.
+ *
+ * NOTE(review): the handler is registered with IRQF_SHARED in
+ * fotg210_udc_probe() but returns IRQ_HANDLED unconditionally, even when
+ * no group bit was pending - confirm whether IRQ_NONE should be returned
+ * in that case.
+ */
+static irqreturn_t fotg210_irq(int irq, void *_fotg210)
+{
+	struct fotg210_udc *fotg210 = _fotg210;
+	u32 int_grp = ioread32(fotg210->reg + FOTG210_DIGR);
+	u32 int_msk = ioread32(fotg210->reg + FOTG210_DMIGR);
+
+	/* keep only the groups that are not masked */
+	int_grp &= ~int_msk;
+
+	spin_lock(&fotg210->lock);
+
+	if (int_grp & DIGR_INT_G2) {
+		void __iomem *reg = fotg210->reg + FOTG210_DISGR2;
+		u32 int_grp2 = ioread32(reg);
+		u32 int_msk2 = ioread32(fotg210->reg + FOTG210_DMISGR2);
+		u32 value;
+
+		int_grp2 &= ~int_msk2;
+
+		/*
+		 * Each source below is acknowledged by re-reading DISGR2,
+		 * clearing its bit and writing the value back.
+		 */
+		if (int_grp2 & DISGR2_USBRST_INT) {
+			value = ioread32(reg);
+			value &= ~DISGR2_USBRST_INT;
+			iowrite32(value, reg);
+			pr_info("fotg210 udc reset\n");
+		}
+		if (int_grp2 & DISGR2_SUSP_INT) {
+			value = ioread32(reg);
+			value &= ~DISGR2_SUSP_INT;
+			iowrite32(value, reg);
+			pr_info("fotg210 udc suspend\n");
+		}
+		if (int_grp2 & DISGR2_RESM_INT) {
+			value = ioread32(reg);
+			value &= ~DISGR2_RESM_INT;
+			iowrite32(value, reg);
+			pr_info("fotg210 udc resume\n");
+		}
+		if (int_grp2 & DISGR2_ISO_SEQ_ERR_INT) {
+			value = ioread32(reg);
+			value &= ~DISGR2_ISO_SEQ_ERR_INT;
+			iowrite32(value, reg);
+			pr_info("fotg210 iso sequence error\n");
+		}
+		if (int_grp2 & DISGR2_ISO_SEQ_ABORT_INT) {
+			value = ioread32(reg);
+			value &= ~DISGR2_ISO_SEQ_ABORT_INT;
+			iowrite32(value, reg);
+			pr_info("fotg210 iso sequence abort\n");
+		}
+		if (int_grp2 & DISGR2_TX0BYTE_INT) {
+			/* clear the per-endpoint flags before the summary bit */
+			fotg210_clear_tx0byte(fotg210);
+			value = ioread32(reg);
+			value &= ~DISGR2_TX0BYTE_INT;
+			iowrite32(value, reg);
+			pr_info("fotg210 transferred 0 byte\n");
+		}
+		if (int_grp2 & DISGR2_RX0BYTE_INT) {
+			/* clear the per-endpoint flags before the summary bit */
+			fotg210_clear_rx0byte(fotg210);
+			value = ioread32(reg);
+			value &= ~DISGR2_RX0BYTE_INT;
+			iowrite32(value, reg);
+			pr_info("fotg210 received 0 byte\n");
+		}
+		if (int_grp2 & DISGR2_DMA_ERROR) {
+			/* acknowledged silently; no recovery is attempted */
+			value = ioread32(reg);
+			value &= ~DISGR2_DMA_ERROR;
+			iowrite32(value, reg);
+		}
+	}
+
+	if (int_grp & DIGR_INT_G0) {
+		void __iomem *reg = fotg210->reg + FOTG210_DISGR0;
+		u32 int_grp0 = ioread32(reg);
+		u32 int_msk0 = ioread32(fotg210->reg + FOTG210_DMISGR0);
+		struct usb_ctrlrequest ctrl;
+
+		int_grp0 &= ~int_msk0;
+
+		/* the highest priority in this source register */
+		if (int_grp0 & DISGR0_CX_COMABT_INT) {
+			fotg210_clear_comabt_int(fotg210);
+			pr_info("fotg210 CX command abort\n");
+		}
+
+		if (int_grp0 & DISGR0_CX_SETUP_INT) {
+			/*
+			 * A non-zero return means the request is not fully
+			 * handled by this driver; hand it to the gadget
+			 * driver with the lock released, and stall ep0 if
+			 * the gadget driver rejects it.
+			 */
+			if (fotg210_setup_packet(fotg210, &ctrl)) {
+				spin_unlock(&fotg210->lock);
+				if (fotg210->driver->setup(&fotg210->gadget,
+							   &ctrl) < 0)
+					fotg210_set_cxstall(fotg210);
+				spin_lock(&fotg210->lock);
+			}
+		}
+		if (int_grp0 & DISGR0_CX_COMEND_INT)
+			pr_info("fotg210 cmd end\n");
+
+		if (int_grp0 & DISGR0_CX_IN_INT)
+			fotg210_ep0in(fotg210);
+
+		if (int_grp0 & DISGR0_CX_OUT_INT)
+			fotg210_ep0out(fotg210);
+
+		if (int_grp0 & DISGR0_CX_COMFAIL_INT) {
+			fotg210_set_cxstall(fotg210);
+			pr_info("fotg210 ep0 fail\n");
+		}
+	}
+
+	if (int_grp & DIGR_INT_G1) {
+		void __iomem *reg = fotg210->reg + FOTG210_DISGR1;
+		u32 int_grp1 = ioread32(reg);
+		u32 int_msk1 = ioread32(fotg210->reg + FOTG210_DMISGR1);
+		int fifo;
+
+		int_grp1 &= ~int_msk1;
+
+		/* FIFO n is serviced through endpoint n + 1 */
+		for (fifo = 0; fifo < FOTG210_MAX_FIFO_NUM; fifo++) {
+			if (int_grp1 & DISGR1_IN_INT(fifo))
+				fotg210_in_fifo_handler(fotg210->ep[fifo + 1]);
+
+			if ((int_grp1 & DISGR1_OUT_INT(fifo)) ||
+			    (int_grp1 & DISGR1_SPK_INT(fifo)))
+				fotg210_out_fifo_handler(fotg210->ep[fifo + 1]);
+		}
+	}
+
+	spin_unlock(&fotg210->lock);
+
+	return IRQ_HANDLED;
+}
+
+/* Clear the PHYTMSR_UNPLUG bit in the PHY test mode selector register. */
+static void fotg210_disable_unplug(struct fotg210_udc *fotg210)
+{
+	void __iomem *phytmsr = fotg210->reg + FOTG210_PHYTMSR;
+
+	iowrite32(ioread32(phytmsr) & ~PHYTMSR_UNPLUG, phytmsr);
+}
+
+/*
+ * usb_gadget_ops.udc_start: bind @driver to the controller and turn on the
+ * device global interrupt (DMCR_GLINT_EN).  Always returns 0.
+ */
+static int fotg210_udc_start(struct usb_gadget *g,
+		struct usb_gadget_driver *driver)
+{
+	struct fotg210_udc *udc = gadget_to_fotg210(g);
+	void __iomem *dmcr = udc->reg + FOTG210_DMCR;
+
+	/* hook up the driver */
+	driver->driver.bus = NULL;
+	udc->driver = driver;
+
+	/* enable device global interrupt */
+	iowrite32(ioread32(dmcr) | DMCR_GLINT_EN, dmcr);
+
+	return 0;
+}
+
+/*
+ * Put the controller into its quiescent interrupt configuration:
+ * write the HC and OTG mask bits plus the active-high polarity bit to GMIR,
+ * turn off the device global interrupt, mask every FIFO interrupt, and
+ * mask the CX command-end interrupt.
+ */
+static void fotg210_init(struct fotg210_udc *fotg210)
+{
+	void __iomem *base = fotg210->reg;
+
+	/* disable global interrupt and set int polarity to active high */
+	iowrite32(GMIR_MHC_INT | GMIR_MOTG_INT | GMIR_INT_POLARITY,
+		  base + FOTG210_GMIR);
+
+	/* disable device global interrupt */
+	iowrite32(ioread32(base + FOTG210_DMCR) & ~DMCR_GLINT_EN,
+		  base + FOTG210_DMCR);
+
+	/* disable all fifo interrupt */
+	iowrite32(~(u32)0, base + FOTG210_DMISGR1);
+
+	/* disable cmd end */
+	iowrite32(ioread32(base + FOTG210_DMISGR0) | DMISGR0_MCX_COMEND,
+		  base + FOTG210_DMISGR0);
+}
+
+/*
+ * usb_gadget_ops.udc_stop: re-run fotg210_init() to quiesce the controller
+ * and unbind the gadget driver, all under fotg210->lock with interrupts
+ * disabled.  Always returns 0.
+ */
+static int fotg210_udc_stop(struct usb_gadget *g,
+		struct usb_gadget_driver *driver)
+{
+	struct fotg210_udc *udc = gadget_to_fotg210(g);
+	unsigned long irqflags;
+
+	spin_lock_irqsave(&udc->lock, irqflags);
+	fotg210_init(udc);
+	udc->driver = NULL;
+	spin_unlock_irqrestore(&udc->lock, irqflags);
+
+	return 0;
+}
+
+/*
+ * Gadget operations installed in fotg210_udc_probe().  The table is never
+ * modified at run time, so it is const.
+ */
+static const struct usb_gadget_ops fotg210_gadget_ops = {
+	.udc_start		= fotg210_udc_start,
+	.udc_stop		= fotg210_udc_stop,
+};
+
+/*
+ * Platform driver remove callback: unregister the gadget and release every
+ * resource acquired by fotg210_udc_probe().
+ *
+ * The interrupt is freed before the registers are unmapped so the handler
+ * can never run against an unmapped register window.  The per-endpoint
+ * structures allocated in probe are freed as well.
+ *
+ * Not marked __exit: the function is referenced from fotg210_driver.remove
+ * and must stay available when the driver is built in.
+ */
+static int fotg210_udc_remove(struct platform_device *pdev)
+{
+	struct fotg210_udc *fotg210 = dev_get_drvdata(&pdev->dev);
+	int i;
+
+	usb_del_gadget_udc(&fotg210->gadget);
+	free_irq(platform_get_irq(pdev, 0), fotg210);
+	iounmap(fotg210->reg);
+
+	fotg210_ep_free_request(&fotg210->ep[0]->ep, fotg210->ep0_req);
+	for (i = 0; i < FOTG210_MAX_NUM_EP; i++)
+		kfree(fotg210->ep[i]);
+	kfree(fotg210);
+
+	return 0;
+}
+
+/*
+ * Platform driver probe callback.
+ *
+ * Allocates the controller and endpoint structures, maps the register
+ * window, initialises the gadget and its endpoints, allocates the internal
+ * ep0 request, quiesces the hardware, installs the interrupt handler and
+ * finally registers the gadget with the UDC core.
+ *
+ * Returns 0 on success, -ENODEV when a platform resource is missing,
+ * -ENOMEM on allocation or mapping failure, or the error returned by
+ * request_irq() / usb_add_gadget_udc().
+ *
+ * Error unwinding releases exactly what was acquired up to the point of
+ * failure, in reverse order.  free_irq() is only reached once
+ * request_irq() has succeeded, and the endpoint structures are freed on
+ * every error path.
+ *
+ * Not marked __init: the function is referenced from fotg210_driver.probe
+ * and may be called after init memory has been discarded.
+ */
+static int fotg210_udc_probe(struct platform_device *pdev)
+{
+	struct resource *res, *ires;
+	struct fotg210_udc *fotg210;
+	int ret;
+	int i;
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (!res) {
+		pr_err("platform_get_resource error.\n");
+		return -ENODEV;
+	}
+
+	ires = platform_get_resource(pdev, IORESOURCE_IRQ, 0);
+	if (!ires) {
+		pr_err("platform_get_resource IORESOURCE_IRQ error.\n");
+		return -ENODEV;
+	}
+
+	/* initialize udc */
+	fotg210 = kzalloc(sizeof(struct fotg210_udc), GFP_KERNEL);
+	if (fotg210 == NULL) {
+		pr_err("kzalloc error\n");
+		return -ENOMEM;
+	}
+
+	ret = -ENOMEM;
+
+	/*
+	 * fotg210 is zero-filled, so ep[] entries not reached yet are NULL
+	 * and the kfree() loop at err_alloc is safe after a partial failure.
+	 */
+	for (i = 0; i < FOTG210_MAX_NUM_EP; i++) {
+		fotg210->ep[i] = kzalloc(sizeof(struct fotg210_ep), GFP_KERNEL);
+		if (fotg210->ep[i] == NULL) {
+			pr_err("_ep kzalloc error\n");
+			goto err_alloc;
+		}
+	}
+
+	fotg210->reg = ioremap(res->start, resource_size(res));
+	if (fotg210->reg == NULL) {
+		pr_err("ioremap error.\n");
+		goto err_alloc;
+	}
+
+	spin_lock_init(&fotg210->lock);
+
+	dev_set_drvdata(&pdev->dev, fotg210);
+
+	fotg210->gadget.ops = &fotg210_gadget_ops;
+
+	fotg210->gadget.max_speed = USB_SPEED_HIGH;
+	fotg210->gadget.dev.parent = &pdev->dev;
+	fotg210->gadget.dev.dma_mask = pdev->dev.dma_mask;
+	fotg210->gadget.name = udc_name;
+
+	INIT_LIST_HEAD(&fotg210->gadget.ep_list);
+
+	for (i = 0; i < FOTG210_MAX_NUM_EP; i++) {
+		struct fotg210_ep *ep = fotg210->ep[i];
+
+		/* ep0 is exposed through gadget.ep0, not through ep_list */
+		if (i) {
+			INIT_LIST_HEAD(&fotg210->ep[i]->ep.ep_list);
+			list_add_tail(&fotg210->ep[i]->ep.ep_list,
+				      &fotg210->gadget.ep_list);
+		}
+		ep->fotg210 = fotg210;
+		INIT_LIST_HEAD(&ep->queue);
+		ep->ep.name = fotg210_ep_name[i];
+		ep->ep.ops = &fotg210_ep_ops;
+	}
+	fotg210->ep[0]->ep.maxpacket = 0x40;
+	fotg210->gadget.ep0 = &fotg210->ep[0]->ep;
+	INIT_LIST_HEAD(&fotg210->gadget.ep0->ep_list);
+
+	fotg210->ep0_req = fotg210_ep_alloc_request(&fotg210->ep[0]->ep,
+				GFP_KERNEL);
+	if (fotg210->ep0_req == NULL)
+		goto err_map;
+
+	fotg210_init(fotg210);
+
+	fotg210_disable_unplug(fotg210);
+
+	ret = request_irq(ires->start, fotg210_irq, IRQF_SHARED,
+			  udc_name, fotg210);
+	if (ret < 0) {
+		pr_err("request_irq error (%d)\n", ret);
+		goto err_req;
+	}
+
+	ret = usb_add_gadget_udc(&pdev->dev, &fotg210->gadget);
+	if (ret)
+		goto err_irq;
+
+	dev_info(&pdev->dev, "version %s\n", DRIVER_VERSION);
+
+	return 0;
+
+err_irq:
+	free_irq(ires->start, fotg210);
+
+err_req:
+	fotg210_ep_free_request(&fotg210->ep[0]->ep, fotg210->ep0_req);
+
+err_map:
+	iounmap(fotg210->reg);
+
+err_alloc:
+	for (i = 0; i < FOTG210_MAX_NUM_EP; i++)
+		kfree(fotg210->ep[i]);
+	kfree(fotg210);
+
+	return ret;
+}
+
+/*
+ * Platform driver descriptor; registered and unregistered through
+ * module_platform_driver() below.  Devices are matched by udc_name.
+ */
+static struct platform_driver fotg210_driver = {
+	.driver		= {
+		.name =	(char *)udc_name,
+		.owner	= THIS_MODULE,
+	},
+	.probe		= fotg210_udc_probe,
+	.remove		= fotg210_udc_remove,
+};
+
+module_platform_driver(fotg210_driver);
+
+MODULE_AUTHOR("Yuan-Hsin Chen <yhchen@faraday-tech.com>");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION(DRIVER_DESC);

diff --git a/drivers/usb/gadget/fotg210.h b/drivers/usb/gadget/fotg210.h
new file mode 100644
index 0000000..bbf991b
--- /dev/null
+++ b/drivers/usb/gadget/fotg210.h

@@ -0,0 +1,253 @@
+/*
+ * Faraday FOTG210 USB OTG controller
+ *
+ * Copyright (C) 2013 Faraday Technology Corporation
+ * Author: Yuan-Hsin Chen <yhchen@faraday-tech.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+
+#define FOTG210_MAX_NUM_EP	5 /* ep0...ep4 */
+#define FOTG210_MAX_FIFO_NUM	4 /* fifo0...fifo3 */
+
+/* Global Mask of HC/OTG/DEV interrupt Register(0xC4) */
+#define FOTG210_GMIR		0xC4
+#define GMIR_INT_POLARITY	0x8 /*Active High*/
+#define GMIR_MHC_INT		0x4
+#define GMIR_MOTG_INT		0x2
+#define GMIR_MDEV_INT		0x1
+
+/*  Device Main Control Register(0x100) */
+#define FOTG210_DMCR		0x100
+#define DMCR_HS_EN		(1 << 6)
+#define DMCR_CHIP_EN		(1 << 5)
+#define DMCR_SFRST		(1 << 4)
+#define DMCR_GOSUSP		(1 << 3)
+#define DMCR_GLINT_EN		(1 << 2)
+#define DMCR_HALF_SPEED		(1 << 1)
+#define DMCR_CAP_RMWAKUP	(1 << 0)
+
+/* Device Address Register(0x104) */
+#define FOTG210_DAR		0x104
+#define DAR_AFT_CONF		(1 << 7)
+
+/* Device Test Register(0x108) */
+#define FOTG210_DTR		0x108
+#define DTR_TST_CLRFF		(1 << 0)
+
+/* PHY Test Mode Selector register(0x114) */
+#define FOTG210_PHYTMSR		0x114
+#define PHYTMSR_TST_PKT		(1 << 4)
+#define PHYTMSR_TST_SE0NAK	(1 << 3)
+#define PHYTMSR_TST_KSTA	(1 << 2)
+#define PHYTMSR_TST_JSTA	(1 << 1)
+#define PHYTMSR_UNPLUG		(1 << 0)
+
+/* Cx configuration and FIFO Empty Status register(0x120) */
+#define FOTG210_DCFESR		0x120
+#define DCFESR_FIFO_EMPTY(fifo)	(1 << 8 << (fifo))
+#define DCFESR_CX_EMP		(1 << 5)
+#define DCFESR_CX_CLR		(1 << 3)
+#define DCFESR_CX_STL		(1 << 2)
+#define DCFESR_TST_PKDONE	(1 << 1)
+#define DCFESR_CX_DONE		(1 << 0)
+
+/* Device IDLE Counter Register(0x124) */
+#define FOTG210_DICR		0x124
+
+/* Device Mask of Interrupt Group Register (0x130) */
+#define FOTG210_DMIGR		0x130
+#define DMIGR_MINT_G0		(1 << 0)
+
+/* Device Mask of Interrupt Source Group 0(0x134) */
+#define FOTG210_DMISGR0		0x134
+#define DMISGR0_MCX_COMEND	(1 << 3)
+#define DMISGR0_MCX_OUT_INT	(1 << 2)
+#define DMISGR0_MCX_IN_INT	(1 << 1)
+#define DMISGR0_MCX_SETUP_INT	(1 << 0)
+
+/* Device Mask of Interrupt Source Group 1 Register(0x138)*/
+#define FOTG210_DMISGR1		0x138
+#define DMISGR1_MF3_IN_INT	(1 << 19)
+#define DMISGR1_MF2_IN_INT	(1 << 18)
+#define DMISGR1_MF1_IN_INT	(1 << 17)
+#define DMISGR1_MF0_IN_INT	(1 << 16)
+#define DMISGR1_MF_IN_INT(fifo)	(1 << (16 + (fifo)))
+#define DMISGR1_MF3_SPK_INT	(1 << 7)
+#define DMISGR1_MF3_OUT_INT	(1 << 6)
+#define DMISGR1_MF2_SPK_INT	(1 << 5)
+#define DMISGR1_MF2_OUT_INT	(1 << 4)
+#define DMISGR1_MF1_SPK_INT	(1 << 3)
+#define DMISGR1_MF1_OUT_INT	(1 << 2)
+#define DMISGR1_MF0_SPK_INT	(1 << 1)
+#define DMISGR1_MF0_OUT_INT	(1 << 0)
+#define DMISGR1_MF_OUTSPK_INT(fifo)	(0x3 << (fifo) * 2)
+
+/* Device Mask of Interrupt Source Group 2 Register (0x13C) */
+#define FOTG210_DMISGR2		0x13C
+#define DMISGR2_MDMA_ERROR	(1 << 8)
+#define DMISGR2_MDMA_CMPLT	(1 << 7)
+
+/* Device Interrupt group Register (0x140) */
+#define FOTG210_DIGR		0x140
+#define DIGR_INT_G2		(1 << 2)
+#define DIGR_INT_G1		(1 << 1)
+#define DIGR_INT_G0		(1 << 0)
+
+/* Device Interrupt Source Group 0 Register (0x144) */
+#define FOTG210_DISGR0		0x144
+#define DISGR0_CX_COMABT_INT	(1 << 5)
+#define DISGR0_CX_COMFAIL_INT	(1 << 4)
+#define DISGR0_CX_COMEND_INT	(1 << 3)
+#define DISGR0_CX_OUT_INT	(1 << 2)
+#define DISGR0_CX_IN_INT	(1 << 1)
+#define DISGR0_CX_SETUP_INT	(1 << 0)
+
+/* Device Interrupt Source Group 1 Register (0x148) */
+#define FOTG210_DISGR1		0x148
+#define DISGR1_OUT_INT(fifo)	(1 << ((fifo) * 2))
+#define DISGR1_SPK_INT(fifo)	(1 << 1 << ((fifo) * 2))
+#define DISGR1_IN_INT(fifo)	(1 << 16 << (fifo))
+
+/* Device Interrupt Source Group 2 Register (0x14C) */
+#define FOTG210_DISGR2		0x14C
+#define DISGR2_DMA_ERROR	(1 << 8)
+#define DISGR2_DMA_CMPLT	(1 << 7)
+#define DISGR2_RX0BYTE_INT	(1 << 6)
+#define DISGR2_TX0BYTE_INT	(1 << 5)
+#define DISGR2_ISO_SEQ_ABORT_INT	(1 << 4)
+#define DISGR2_ISO_SEQ_ERR_INT	(1 << 3)
+#define DISGR2_RESM_INT		(1 << 2)
+#define DISGR2_SUSP_INT		(1 << 1)
+#define DISGR2_USBRST_INT	(1 << 0)
+
+/* Device Receive Zero-Length Data Packet Register (0x150)*/
+#define FOTG210_RX0BYTE		0x150
+#define RX0BYTE_EP8		(1 << 7)
+#define RX0BYTE_EP7		(1 << 6)
+#define RX0BYTE_EP6		(1 << 5)
+#define RX0BYTE_EP5		(1 << 4)
+#define RX0BYTE_EP4		(1 << 3)
+#define RX0BYTE_EP3		(1 << 2)
+#define RX0BYTE_EP2		(1 << 1)
+#define RX0BYTE_EP1		(1 << 0)
+
+/* Device Transfer Zero-Length Data Packet Register (0x154)*/
+#define FOTG210_TX0BYTE		0x154
+#define TX0BYTE_EP8		(1 << 7)
+#define TX0BYTE_EP7		(1 << 6)
+#define TX0BYTE_EP6		(1 << 5)
+#define TX0BYTE_EP5		(1 << 4)
+#define TX0BYTE_EP4		(1 << 3)
+#define TX0BYTE_EP3		(1 << 2)
+#define TX0BYTE_EP2		(1 << 1)
+#define TX0BYTE_EP1		(1 << 0)
+
+/* Device IN Endpoint x MaxPacketSize Register(0x160+4*(x-1)) */
+#define FOTG210_INEPMPSR(ep)	(0x160 + 4 * ((ep) - 1))
+/*
+ * NOTE(review): 0x2FF is not a contiguous mask - it keeps bits 0-7 and
+ * bit 9 but drops bit 8, so a max packet size of 256 would become 0.
+ * Confirm the field width against the FOTG210 datasheet.
+ */
+#define INOUTEPMPSR_MPS(mps)	((mps) & 0x2FF)
+#define INOUTEPMPSR_STL_EP	(1 << 11)	/* endpoint stall */
+#define INOUTEPMPSR_RESET_TSEQ	(1 << 12)
+
+/* Device OUT Endpoint x MaxPacketSize Register(0x180+4*(x-1)) */
+#define FOTG210_OUTEPMPSR(ep)	(0x180 + 4 * ((ep) - 1))
+
+/* Device Endpoint 1~4 Map Register (0x1A0) */
+#define FOTG210_EPMAP		0x1A0
+#define EPMAP_FIFONO(ep, dir)		\
+	((((ep) - 1) << ((ep) - 1) * 8) << ((dir) ? 0 : 4))
+#define EPMAP_FIFONOMSK(ep, dir)	\
+	((3 << ((ep) - 1) * 8) << ((dir) ? 0 : 4))
+
+/* Device FIFO Map Register (0x1A8) */
+#define FOTG210_FIFOMAP		0x1A8
+#define FIFOMAP_DIROUT(fifo)	(0x0 << 4 << (fifo) * 8)
+#define FIFOMAP_DIRIN(fifo)	(0x1 << 4 << (fifo) * 8)
+#define FIFOMAP_BIDIR(fifo)	(0x2 << 4 << (fifo) * 8)
+#define FIFOMAP_NA(fifo)	(0x3 << 4 << (fifo) * 8)
+#define FIFOMAP_EPNO(ep)	((ep) << ((ep) - 1) * 8)
+#define FIFOMAP_EPNOMSK(ep)	(0xF << ((ep) - 1) * 8)
+
+/* Device FIFO Configuration Register (0x1AC); one byte-wide field per FIFO */
+#define FOTG210_FIFOCF		0x1AC
+#define FIFOCF_TYPE(type, fifo)	((type) << (fifo) * 8)
+#define FIFOCF_BLK_SIN(fifo)	(0x0 << (fifo) * 8 << 2)
+#define FIFOCF_BLK_DUB(fifo)	(0x1 << (fifo) * 8 << 2)
+#define FIFOCF_BLK_TRI(fifo)	(0x2 << (fifo) * 8 << 2)
+#define FIFOCF_BLKSZ_512(fifo)	(0x0 << (fifo) * 8 << 4)
+#define FIFOCF_BLKSZ_1024(fifo)	(0x1 << (fifo) * 8 << 4)
+#define FIFOCF_FIFO_EN(fifo)	(0x1 << (fifo) * 8 << 5)
+
+/* Device FIFO n Instruction and Byte Count Register (0x1B0+4*n) */
+#define FOTG210_FIBCR(fifo)	(0x1B0 + (fifo) * 4)
+#define FIBCR_BCFX		0x7FF
+#define FIBCR_FFRST		(1 << 12)
+
+/* Device DMA Target FIFO Number Register (0x1C0) */
+#define FOTG210_DMATFNR		0x1C0
+#define DMATFNR_ACC_CXF		(1 << 4)
+#define DMATFNR_ACC_F3		(1 << 3)
+#define DMATFNR_ACC_F2		(1 << 2)
+#define DMATFNR_ACC_F1		(1 << 1)
+#define DMATFNR_ACC_F0		(1 << 0)
+#define DMATFNR_ACC_FN(fifo)	(1 << (fifo))
+#define DMATFNR_DISDMA		0
+
+/* Device DMA Controller Parameter setting 1 Register (0x1C8) */
+#define FOTG210_DMACPSR1	0x1C8
+#define DMACPSR1_DMA_LEN(len)	(((len) & 0xFFFF) << 8)
+#define DMACPSR1_DMA_ABORT	(1 << 3)
+#define DMACPSR1_DMA_TYPE(dir_in)	(((dir_in) ? 1 : 0) << 1)
+#define DMACPSR1_DMA_START	(1 << 0)
+
+/* Device DMA Controller Parameter setting 2 Register (0x1CC) */
+#define FOTG210_DMACPSR2	0x1CC
+
+/* Device DMA Controller Parameter setting 3 Register (0x1D0) */
+#define FOTG210_CXPORT		0x1D0
+
+/* Driver-private wrapper around a gadget request. */
+struct fotg210_request {
+	struct usb_request	req;	/* request as seen by the gadget API */
+	struct list_head	queue;	/* link in the owning fotg210_ep.queue */
+};
+
+/* Driver-private state for one endpoint (ep0 or ep1..ep4). */
+struct fotg210_ep {
+	struct usb_ep		ep;		/* endpoint as seen by the gadget API */
+	struct fotg210_udc	*fotg210;	/* owning controller */
+
+	struct list_head	queue;		/* pending fotg210_request entries */
+	unsigned		stall:1;	/* set while the endpoint is halted */
+	unsigned		wedged:1;	/* halted via set_wedge */
+	unsigned		use_dma:1;
+
+	unsigned char		epnum;		/* endpoint number, used to index registers */
+	unsigned char		type;
+	unsigned char		dir_in;		/* non-zero for the IN direction */
+	unsigned int		maxp;
+	const struct usb_endpoint_descriptor	*desc;
+};
+
+/* Per-controller state of the FOTG210 device controller. */
+struct fotg210_udc {
+	spinlock_t		lock; /* protect the struct */
+	void __iomem		*reg;	/* mapped register window */
+
+	unsigned long		irq_trigger;
+
+	struct usb_gadget		gadget;
+	struct usb_gadget_driver	*driver;	/* bound gadget driver, NULL when stopped */
+
+	struct fotg210_ep	*ep[FOTG210_MAX_NUM_EP];	/* ep[0] is the control endpoint */
+
+	struct usb_request	*ep0_req;	/* for internal request */
+	__le16			ep0_data;	/* payload buffer for the internal GET_STATUS reply */
+	u8			ep0_dir;	/* 0/0x80  out/in */
+
+	u8			reenum;		/* if re-enumeration */
+};
+
+/* Map a struct usb_gadget pointer back to its enclosing struct fotg210_udc. */
+#define gadget_to_fotg210(g)	container_of((g), struct fotg210_udc, gadget)

diff --git a/drivers/usb/gadget/fsl_qe_udc.c b/drivers/usb/gadget/fsl_qe_udc.c
index 9a7ee33..f3bb363 100644
--- a/drivers/usb/gadget/fsl_qe_udc.c
+++ b/drivers/usb/gadget/fsl_qe_udc.c

@@ -2589,7 +2589,7 @@
 	if (ret)
 		goto err6;
 
-	dev_set_drvdata(&ofdev->dev, udc);
+	platform_set_drvdata(ofdev, udc);
 	dev_info(udc->dev,
 			"%s USB controller initialized as device\n",
 			(udc->soc_type == PORT_QE) ? "QE" : "CPM");
@@ -2640,7 +2640,7 @@
 
 static int qe_udc_remove(struct platform_device *ofdev)
 {
-	struct qe_udc *udc = dev_get_drvdata(&ofdev->dev);
+	struct qe_udc *udc = platform_get_drvdata(ofdev);
 	struct qe_ep *ep;
 	unsigned int size;
 	DECLARE_COMPLETION(done);

diff --git a/drivers/usb/gadget/fusb300_udc.c b/drivers/usb/gadget/fusb300_udc.c
index b8632d40..c83f3e1 100644
--- a/drivers/usb/gadget/fusb300_udc.c
+++ b/drivers/usb/gadget/fusb300_udc.c

@@ -1347,7 +1347,7 @@
 
 static int __exit fusb300_remove(struct platform_device *pdev)
 {
-	struct fusb300 *fusb300 = dev_get_drvdata(&pdev->dev);
+	struct fusb300 *fusb300 = platform_get_drvdata(pdev);
 
 	usb_del_gadget_udc(&fusb300->gadget);
 	iounmap(fusb300->reg);
@@ -1416,7 +1416,7 @@
 
 	spin_lock_init(&fusb300->lock);
 
-	dev_set_drvdata(&pdev->dev, fusb300);
+	platform_set_drvdata(pdev, fusb300);
 
 	fusb300->gadget.ops = &fusb300_gadget_ops;
 

diff --git a/drivers/usb/gadget/g_ffs.c b/drivers/usb/gadget/g_ffs.c
index 787a78e..5327c82 100644
--- a/drivers/usb/gadget/g_ffs.c
+++ b/drivers/usb/gadget/g_ffs.c

@@ -28,15 +28,18 @@
 #    define USB_ETH_RNDIS y
 #  endif
 
+#define USBF_ECM_INCLUDED
 #  include "f_ecm.c"
+#define USB_FSUBSET_INCLUDED
 #  include "f_subset.c"
 #  ifdef USB_ETH_RNDIS
+#    define USB_FRNDIS_INCLUDED
 #    include "f_rndis.c"
-#    include "rndis.c"
+#    include "rndis.h"
 #  endif
-#  include "u_ether.c"
+#  include "u_ether.h"
 
-static u8 gfs_hostaddr[ETH_ALEN];
+static u8 gfs_host_mac[ETH_ALEN];
 static struct eth_dev *the_dev;
 #  ifdef CONFIG_USB_FUNCTIONFS_ETH
 static int eth_bind_config(struct usb_configuration *c, u8 ethaddr[ETH_ALEN],
@@ -45,7 +48,7 @@
 #else
 #  define the_dev	NULL
 #  define gether_cleanup(dev) do { } while (0)
-#  define gfs_hostaddr NULL
+#  define gfs_host_mac NULL
 struct eth_dev;
 #endif
 
@@ -73,6 +76,8 @@
 
 USB_GADGET_COMPOSITE_OPTIONS();
 
+USB_ETHERNET_MODULE_PARAMETERS();
+
 static struct usb_device_descriptor gfs_dev_desc = {
 	.bLength		= sizeof gfs_dev_desc,
 	.bDescriptorType	= USB_DT_DEVICE,
@@ -350,7 +355,8 @@
 	if (missing_funcs)
 		return -ENODEV;
 #if defined CONFIG_USB_FUNCTIONFS_ETH || defined CONFIG_USB_FUNCTIONFS_RNDIS
-	the_dev = gether_setup(cdev->gadget, gfs_hostaddr);
+	the_dev = gether_setup(cdev->gadget, dev_addr, host_addr, gfs_host_mac,
+			       qmult);
 #endif
 	if (IS_ERR(the_dev)) {
 		ret = PTR_ERR(the_dev);
@@ -446,7 +452,7 @@
 	}
 
 	if (gc->eth) {
-		ret = gc->eth(c, gfs_hostaddr, the_dev);
+		ret = gc->eth(c, gfs_host_mac, the_dev);
 		if (unlikely(ret < 0))
 			return ret;
 	}

diff --git a/drivers/usb/gadget/m66592-udc.c b/drivers/usb/gadget/m66592-udc.c
index 51cfe72..46ba983 100644
--- a/drivers/usb/gadget/m66592-udc.c
+++ b/drivers/usb/gadget/m66592-udc.c

@@ -1533,7 +1533,7 @@
 
 static int __exit m66592_remove(struct platform_device *pdev)
 {
-	struct m66592		*m66592 = dev_get_drvdata(&pdev->dev);
+	struct m66592		*m66592 = platform_get_drvdata(pdev);
 
 	usb_del_gadget_udc(&m66592->gadget);
 
@@ -1602,7 +1602,7 @@
 	m66592->irq_trigger = ires->flags & IRQF_TRIGGER_MASK;
 
 	spin_lock_init(&m66592->lock);
-	dev_set_drvdata(&pdev->dev, m66592);
+	platform_set_drvdata(pdev, m66592);
 
 	m66592->gadget.ops = &m66592_gadget_ops;
 	m66592->gadget.max_speed = USB_SPEED_HIGH;

diff --git a/drivers/usb/gadget/multi.c b/drivers/usb/gadget/multi.c
index 4a45e80..032b96a 100644
--- a/drivers/usb/gadget/multi.c
+++ b/drivers/usb/gadget/multi.c

@@ -43,16 +43,19 @@
  */
 #include "f_mass_storage.c"
 
+#define USBF_ECM_INCLUDED
 #include "f_ecm.c"
-#include "f_subset.c"
 #ifdef USB_ETH_RNDIS
+#  define USB_FRNDIS_INCLUDED
 #  include "f_rndis.c"
-#  include "rndis.c"
+#  include "rndis.h"
 #endif
-#include "u_ether.c"
+#include "u_ether.h"
 
 USB_GADGET_COMPOSITE_OPTIONS();
 
+USB_ETHERNET_MODULE_PARAMETERS();
+
 /***************************** Device Descriptor ****************************/
 
 #define MULTI_VENDOR_NUM	0x1d6b	/* Linux Foundation */
@@ -133,7 +136,7 @@
 
 static struct fsg_common fsg_common;
 
-static u8 hostaddr[ETH_ALEN];
+static u8 host_mac[ETH_ALEN];
 
 static struct usb_function_instance *fi_acm;
 static struct eth_dev *the_dev;
@@ -152,7 +155,7 @@
 		c->bmAttributes |= USB_CONFIG_ATT_WAKEUP;
 	}
 
-	ret = rndis_bind_config(c, hostaddr, the_dev);
+	ret = rndis_bind_config(c, host_mac, the_dev);
 	if (ret < 0)
 		return ret;
 
@@ -216,7 +219,7 @@
 		c->bmAttributes |= USB_CONFIG_ATT_WAKEUP;
 	}
 
-	ret = ecm_bind_config(c, hostaddr, the_dev);
+	ret = ecm_bind_config(c, host_mac, the_dev);
 	if (ret < 0)
 		return ret;
 
@@ -280,7 +283,8 @@
 	}
 
 	/* set up network link layer */
-	the_dev = gether_setup(cdev->gadget, hostaddr);
+	the_dev = gether_setup(cdev->gadget, dev_addr, host_addr, host_mac,
+			       qmult);
 	if (IS_ERR(the_dev))
 		return PTR_ERR(the_dev);
 

diff --git a/drivers/usb/gadget/mv_u3d_core.c b/drivers/usb/gadget/mv_u3d_core.c
index 58288e9..07fdb3e 100644
--- a/drivers/usb/gadget/mv_u3d_core.c
+++ b/drivers/usb/gadget/mv_u3d_core.c

@@ -1786,8 +1786,6 @@
 
 	clk_put(u3d->clk);
 
-	platform_set_drvdata(dev, NULL);
-
 	kfree(u3d);
 
 	return 0;
@@ -1997,7 +1995,6 @@
 err_get_cap_regs:
 err_get_clk:
 	clk_put(u3d->clk);
-	platform_set_drvdata(dev, NULL);
 	kfree(u3d);
 err_alloc_private:
 err_pdata:
@@ -2053,7 +2050,7 @@
 
 static void mv_u3d_shutdown(struct platform_device *dev)
 {
-	struct mv_u3d *u3d = dev_get_drvdata(&dev->dev);
+	struct mv_u3d *u3d = platform_get_drvdata(dev);
 	u32 tmp;
 
 	tmp = ioread32(&u3d->op_regs->usbcmd);

diff --git a/drivers/usb/gadget/ncm.c b/drivers/usb/gadget/ncm.c
index 3b02fd4..81956fe 100644
--- a/drivers/usb/gadget/ncm.c
+++ b/drivers/usb/gadget/ncm.c

@@ -24,23 +24,12 @@
 #include <linux/usb/composite.h>
 
 #include "u_ether.h"
+#include "u_ncm.h"
 
 #define DRIVER_DESC		"NCM Gadget"
 
 /*-------------------------------------------------------------------------*/
 
-/*
- * Kbuild is not very cooperative with respect to linking separately
- * compiled library objects into one module.  So for now we won't use
- * separate compilation ... ensuring init/exit sections work to shrink
- * the runtime footprint, and giving us at least some parts of what
- * a "gcc --combine ... part1.c part2.c part3.c ... " build would.
- */
-#include "f_ncm.c"
-#include "u_ether.c"
-
-/*-------------------------------------------------------------------------*/
-
 /* DO NOT REUSE THESE IDs with a protocol-incompatible driver!!  Ever!!
  * Instead:  allocate your own, using normal USB-IF procedures.
  */
@@ -54,6 +43,8 @@
 /*-------------------------------------------------------------------------*/
 USB_GADGET_COMPOSITE_OPTIONS();
 
+USB_ETHERNET_MODULE_PARAMETERS();
+
 static struct usb_device_descriptor device_desc = {
 	.bLength =		sizeof device_desc,
 	.bDescriptorType =	USB_DT_DEVICE,
@@ -111,13 +102,15 @@
 	NULL,
 };
 
-struct eth_dev *the_dev;
-static u8 hostaddr[ETH_ALEN];
+static struct usb_function_instance *f_ncm_inst;
+static struct usb_function *f_ncm;
 
 /*-------------------------------------------------------------------------*/
 
 static int __init ncm_do_config(struct usb_configuration *c)
 {
+	int status;
+
 	/* FIXME alloc iConfiguration string, set it in c->strings */
 
 	if (gadget_is_otg(c->cdev->gadget)) {
@@ -125,7 +118,19 @@
 		c->bmAttributes |= USB_CONFIG_ATT_WAKEUP;
 	}
 
-	return ncm_bind_config(c, hostaddr, the_dev);
+	f_ncm = usb_get_function(f_ncm_inst);
+	if (IS_ERR(f_ncm)) {
+		status = PTR_ERR(f_ncm);
+		return status;
+	}
+
+	status = usb_add_function(c, f_ncm);
+	if (status < 0) {
+		usb_put_function(f_ncm);
+		return status;
+	}
+
+	return 0;
 }
 
 static struct usb_configuration ncm_config_driver = {
@@ -141,12 +146,20 @@
 static int __init gncm_bind(struct usb_composite_dev *cdev)
 {
 	struct usb_gadget	*gadget = cdev->gadget;
+	struct f_ncm_opts	*ncm_opts;
 	int			status;
 
-	/* set up network link layer */
-	the_dev = gether_setup(cdev->gadget, hostaddr);
-	if (IS_ERR(the_dev))
-		return PTR_ERR(the_dev);
+	f_ncm_inst = usb_get_function_instance("ncm");
+	if (IS_ERR(f_ncm_inst))
+		return PTR_ERR(f_ncm_inst);
+
+	ncm_opts = container_of(f_ncm_inst, struct f_ncm_opts, func_inst);
+
+	gether_set_qmult(ncm_opts->net, qmult);
+	if (!gether_set_host_addr(ncm_opts->net, host_addr))
+		pr_info("using host ethernet address: %s", host_addr);
+	if (!gether_set_dev_addr(ncm_opts->net, dev_addr))
+		pr_info("using self ethernet address: %s", dev_addr);
 
 	/* Allocate string descriptor numbers ... note that string
 	 * contents can be overridden by the composite_dev glue.
@@ -169,13 +182,16 @@
 	return 0;
 
 fail:
-	gether_cleanup(the_dev);
+	usb_put_function_instance(f_ncm_inst);
 	return status;
 }
 
 static int __exit gncm_unbind(struct usb_composite_dev *cdev)
 {
-	gether_cleanup(the_dev);
+	if (!IS_ERR_OR_NULL(f_ncm))
+		usb_put_function(f_ncm);
+	if (!IS_ERR_OR_NULL(f_ncm_inst))
+		usb_put_function_instance(f_ncm_inst);
 	return 0;
 }
 

diff --git a/drivers/usb/gadget/nokia.c b/drivers/usb/gadget/nokia.c
index 3b344b4..0a8099a 100644
--- a/drivers/usb/gadget/nokia.c
+++ b/drivers/usb/gadget/nokia.c

@@ -16,11 +16,13 @@
  */
 
 #include <linux/kernel.h>
+#include <linux/module.h>
 #include <linux/device.h>
 
 #include "u_serial.h"
 #include "u_ether.h"
 #include "u_phonet.h"
+#include "u_ecm.h"
 #include "gadget_chips.h"
 
 /* Defines */
@@ -28,24 +30,10 @@
 #define NOKIA_VERSION_NUM		0x0211
 #define NOKIA_LONG_NAME			"N900 (PC-Suite Mode)"
 
-/*-------------------------------------------------------------------------*/
-
-/*
- * Kbuild is not very cooperative with respect to linking separately
- * compiled library objects into one module.  So for now we won't use
- * separate compilation ... ensuring init/exit sections work to shrink
- * the runtime footprint, and giving us at least some parts of what
- * a "gcc --combine ... part1.c part2.c part3.c ... " build would.
- */
-#define USBF_OBEX_INCLUDED
-#include "f_ecm.c"
-#include "f_obex.c"
-#include "f_phonet.c"
-#include "u_ether.c"
-
-/*-------------------------------------------------------------------------*/
 USB_GADGET_COMPOSITE_OPTIONS();
 
+USB_ETHERNET_MODULE_PARAMETERS();
+
 #define NOKIA_VENDOR_ID			0x0421	/* Nokia */
 #define NOKIA_PRODUCT_ID		0x01c8	/* Nokia Gadget */
 
@@ -98,16 +86,15 @@
 /*-------------------------------------------------------------------------*/
 static struct usb_function *f_acm_cfg1;
 static struct usb_function *f_acm_cfg2;
-static u8 hostaddr[ETH_ALEN];
-static struct eth_dev *the_dev;
+static struct usb_function *f_ecm_cfg1;
+static struct usb_function *f_ecm_cfg2;
+static struct usb_function *f_obex1_cfg1;
+static struct usb_function *f_obex2_cfg1;
+static struct usb_function *f_obex1_cfg2;
+static struct usb_function *f_obex2_cfg2;
+static struct usb_function *f_phonet_cfg1;
+static struct usb_function *f_phonet_cfg2;
 
-enum {
-	TTY_PORT_OBEX0,
-	TTY_PORT_OBEX1,
-	TTY_PORTS_MAX,
-};
-
-static unsigned char tty_lines[TTY_PORTS_MAX];
 
 static struct usb_configuration nokia_config_500ma_driver = {
 	.label		= "Bus Powered",
@@ -126,47 +113,114 @@
 };
 
 static struct usb_function_instance *fi_acm;
+static struct usb_function_instance *fi_ecm;
+static struct usb_function_instance *fi_obex1;
+static struct usb_function_instance *fi_obex2;
+static struct usb_function_instance *fi_phonet;
 
 static int __init nokia_bind_config(struct usb_configuration *c)
 {
 	struct usb_function *f_acm;
+	struct usb_function *f_phonet = NULL;
+	struct usb_function *f_obex1 = NULL;
+	struct usb_function *f_ecm;
+	struct usb_function *f_obex2 = NULL;
 	int status = 0;
+	int obex1_stat = 0;
+	int obex2_stat = 0;
+	int phonet_stat = 0;
 
-	status = phonet_bind_config(c);
-	if (status)
-		printk(KERN_DEBUG "could not bind phonet config\n");
+	if (!IS_ERR(fi_phonet)) {
+		f_phonet = usb_get_function(fi_phonet);
+		if (IS_ERR(f_phonet))
+			pr_debug("could not get phonet function\n");
+	}
 
-	status = obex_bind_config(c, tty_lines[TTY_PORT_OBEX0]);
-	if (status)
-		printk(KERN_DEBUG "could not bind obex config %d\n", 0);
+	if (!IS_ERR(fi_obex1)) {
+		f_obex1 = usb_get_function(fi_obex1);
+		if (IS_ERR(f_obex1))
+			pr_debug("could not get obex function 0\n");
+	}
 
-	status = obex_bind_config(c, tty_lines[TTY_PORT_OBEX1]);
-	if (status)
-		printk(KERN_DEBUG "could not bind obex config %d\n", 0);
+	if (!IS_ERR(fi_obex2)) {
+		f_obex2 = usb_get_function(fi_obex2);
+		if (IS_ERR(f_obex2))
+			pr_debug("could not get obex function 1\n");
+	}
 
 	f_acm = usb_get_function(fi_acm);
-	if (IS_ERR(f_acm))
-		return PTR_ERR(f_acm);
+	if (IS_ERR(f_acm)) {
+		status = PTR_ERR(f_acm);
+		goto err_get_acm;
+	}
+
+	f_ecm = usb_get_function(fi_ecm);
+	if (IS_ERR(f_ecm)) {
+		status = PTR_ERR(f_ecm);
+		goto err_get_ecm;
+	}
+
+	if (!IS_ERR_OR_NULL(f_phonet)) {
+		phonet_stat = usb_add_function(c, f_phonet);
+		if (phonet_stat)
+			pr_debug("could not add phonet function\n");
+	}
+
+	if (!IS_ERR_OR_NULL(f_obex1)) {
+		obex1_stat = usb_add_function(c, f_obex1);
+		if (obex1_stat)
+			pr_debug("could not add obex function 0\n");
+	}
+
+	if (!IS_ERR_OR_NULL(f_obex2)) {
+		obex2_stat = usb_add_function(c, f_obex2);
+		if (obex2_stat)
+			pr_debug("could not add obex function 1\n");
+	}
 
 	status = usb_add_function(c, f_acm);
 	if (status)
 		goto err_conf;
 
-	status = ecm_bind_config(c, hostaddr, the_dev);
+	status = usb_add_function(c, f_ecm);
 	if (status) {
 		pr_debug("could not bind ecm config %d\n", status);
 		goto err_ecm;
 	}
-	if (c == &nokia_config_500ma_driver)
+	if (c == &nokia_config_500ma_driver) {
 		f_acm_cfg1 = f_acm;
-	else
+		f_ecm_cfg1 = f_ecm;
+		f_phonet_cfg1 = f_phonet;
+		f_obex1_cfg1 = f_obex1;
+		f_obex2_cfg1 = f_obex2;
+	} else {
 		f_acm_cfg2 = f_acm;
+		f_ecm_cfg2 = f_ecm;
+		f_phonet_cfg2 = f_phonet;
+		f_obex1_cfg2 = f_obex1;
+		f_obex2_cfg2 = f_obex2;
+	}
 
 	return status;
 err_ecm:
 	usb_remove_function(c, f_acm);
 err_conf:
+	if (!obex2_stat)
+		usb_remove_function(c, f_obex2);
+	if (!obex1_stat)
+		usb_remove_function(c, f_obex1);
+	if (!phonet_stat)
+		usb_remove_function(c, f_phonet);
+	usb_put_function(f_ecm);
+err_get_ecm:
 	usb_put_function(f_acm);
+err_get_acm:
+	if (!IS_ERR_OR_NULL(f_obex2))
+		usb_put_function(f_obex2);
+	if (!IS_ERR_OR_NULL(f_obex1))
+		usb_put_function(f_obex1);
+	if (!IS_ERR_OR_NULL(f_phonet))
+		usb_put_function(f_phonet);
 	return status;
 }
 
@@ -174,23 +228,6 @@
 {
 	struct usb_gadget	*gadget = cdev->gadget;
 	int			status;
-	int			cur_line;
-
-	status = gphonet_setup(cdev->gadget);
-	if (status < 0)
-		goto err_phonet;
-
-	for (cur_line = 0; cur_line < TTY_PORTS_MAX; cur_line++) {
-		status = gserial_alloc_line(&tty_lines[cur_line]);
-		if (status)
-			goto err_ether;
-	}
-
-	the_dev = gether_setup(cdev->gadget, hostaddr);
-	if (IS_ERR(the_dev)) {
-		status = PTR_ERR(the_dev);
-		goto err_ether;
-	}
 
 	status = usb_string_ids_tab(cdev, strings_dev);
 	if (status < 0)
@@ -201,18 +238,40 @@
 	nokia_config_500ma_driver.iConfiguration = status;
 	nokia_config_100ma_driver.iConfiguration = status;
 
-	if (!gadget_supports_altsettings(gadget))
+	if (!gadget_supports_altsettings(gadget)) {
+		status = -ENODEV;
 		goto err_usb;
+	}
+
+	fi_phonet = usb_get_function_instance("phonet");
+	if (IS_ERR(fi_phonet))
+		pr_debug("could not find phonet function\n");
+
+	fi_obex1 = usb_get_function_instance("obex");
+	if (IS_ERR(fi_obex1))
+		pr_debug("could not find obex function 1\n");
+
+	fi_obex2 = usb_get_function_instance("obex");
+	if (IS_ERR(fi_obex2))
+		pr_debug("could not find obex function 2\n");
 
 	fi_acm = usb_get_function_instance("acm");
-	if (IS_ERR(fi_acm))
-		goto err_usb;
+	if (IS_ERR(fi_acm)) {
+		status = PTR_ERR(fi_acm);
+		goto err_obex2_inst;
+	}
+
+	fi_ecm = usb_get_function_instance("ecm");
+	if (IS_ERR(fi_ecm)) {
+		status = PTR_ERR(fi_ecm);
+		goto err_acm_inst;
+	}
 
 	/* finally register the configuration */
 	status = usb_add_config(cdev, &nokia_config_500ma_driver,
 			nokia_bind_config);
 	if (status < 0)
-		goto err_acm_inst;
+		goto err_ecm_inst;
 
 	status = usb_add_config(cdev, &nokia_config_100ma_driver,
 			nokia_bind_config);
@@ -226,33 +285,55 @@
 
 err_put_cfg1:
 	usb_put_function(f_acm_cfg1);
+	if (!IS_ERR_OR_NULL(f_obex1_cfg1))
+		usb_put_function(f_obex1_cfg1);
+	if (!IS_ERR_OR_NULL(f_obex2_cfg1))
+		usb_put_function(f_obex2_cfg1);
+	if (!IS_ERR_OR_NULL(f_phonet_cfg1))
+		usb_put_function(f_phonet_cfg1);
+	usb_put_function(f_ecm_cfg1);
+err_ecm_inst:
+	usb_put_function_instance(fi_ecm);
 err_acm_inst:
 	usb_put_function_instance(fi_acm);
+err_obex2_inst:
+	if (!IS_ERR(fi_obex2))
+		usb_put_function_instance(fi_obex2);
+	if (!IS_ERR(fi_obex1))
+		usb_put_function_instance(fi_obex1);
+	if (!IS_ERR(fi_phonet))
+		usb_put_function_instance(fi_phonet);
 err_usb:
-	gether_cleanup(the_dev);
-err_ether:
-	cur_line--;
-	while (cur_line >= 0)
-		gserial_free_line(tty_lines[cur_line--]);
-
-	gphonet_cleanup();
-err_phonet:
 	return status;
 }
 
 static int __exit nokia_unbind(struct usb_composite_dev *cdev)
 {
-	int i;
-
+	if (!IS_ERR_OR_NULL(f_obex1_cfg2))
+		usb_put_function(f_obex1_cfg2);
+	if (!IS_ERR_OR_NULL(f_obex2_cfg2))
+		usb_put_function(f_obex2_cfg2);
+	if (!IS_ERR_OR_NULL(f_obex1_cfg1))
+		usb_put_function(f_obex1_cfg1);
+	if (!IS_ERR_OR_NULL(f_obex2_cfg1))
+		usb_put_function(f_obex2_cfg1);
+	if (!IS_ERR_OR_NULL(f_phonet_cfg1))
+		usb_put_function(f_phonet_cfg1);
+	if (!IS_ERR_OR_NULL(f_phonet_cfg2))
+		usb_put_function(f_phonet_cfg2);
 	usb_put_function(f_acm_cfg1);
 	usb_put_function(f_acm_cfg2);
+	usb_put_function(f_ecm_cfg1);
+	usb_put_function(f_ecm_cfg2);
+
+	usb_put_function_instance(fi_ecm);
+	if (!IS_ERR(fi_obex2))
+		usb_put_function_instance(fi_obex2);
+	if (!IS_ERR(fi_obex1))
+		usb_put_function_instance(fi_obex1);
+	if (!IS_ERR(fi_phonet))
+		usb_put_function_instance(fi_phonet);
 	usb_put_function_instance(fi_acm);
-	gphonet_cleanup();
-
-	for (i = 0; i < TTY_PORTS_MAX; i++)
-		gserial_free_line(tty_lines[i]);
-
-	gether_cleanup(the_dev);
 
 	return 0;
 }

diff --git a/drivers/usb/gadget/pxa27x_udc.c b/drivers/usb/gadget/pxa27x_udc.c
index 6b4c7d9..41cea95 100644
--- a/drivers/usb/gadget/pxa27x_udc.c
+++ b/drivers/usb/gadget/pxa27x_udc.c

@@ -2505,7 +2505,6 @@
 	usb_put_phy(udc->transceiver);
 
 	udc->transceiver = NULL;
-	platform_set_drvdata(_dev, NULL);
 	the_controller = NULL;
 	clk_put(udc->clk);
 	iounmap(udc->regs);

diff --git a/drivers/usb/gadget/r8a66597-udc.c b/drivers/usb/gadget/r8a66597-udc.c
index 7ff7d9c..c6af649 100644
--- a/drivers/usb/gadget/r8a66597-udc.c
+++ b/drivers/usb/gadget/r8a66597-udc.c

@@ -1469,11 +1469,11 @@
 	u16 savepipe;
 	u16 mask0;
 
+	spin_lock(&r8a66597->lock);
+
 	if (r8a66597_is_sudmac(r8a66597))
 		r8a66597_sudmac_irq(r8a66597);
 
-	spin_lock(&r8a66597->lock);
-
 	intsts0 = r8a66597_read(r8a66597, INTSTS0);
 	intenb0 = r8a66597_read(r8a66597, INTENB0);
 
@@ -1822,7 +1822,7 @@
 
 static int __exit r8a66597_remove(struct platform_device *pdev)
 {
-	struct r8a66597		*r8a66597 = dev_get_drvdata(&pdev->dev);
+	struct r8a66597		*r8a66597 = platform_get_drvdata(pdev);
 
 	usb_del_gadget_udc(&r8a66597->gadget);
 	del_timer_sync(&r8a66597->timer);
@@ -1909,7 +1909,7 @@
 	}
 
 	spin_lock_init(&r8a66597->lock);
-	dev_set_drvdata(&pdev->dev, r8a66597);
+	platform_set_drvdata(pdev, r8a66597);
 	r8a66597->pdata = pdev->dev.platform_data;
 	r8a66597->irq_sense_low = irq_trigger == IRQF_TRIGGER_LOW;
 

diff --git a/drivers/usb/gadget/rndis.c b/drivers/usb/gadget/rndis.c
index 1e4cfb0..3e3ea72 100644
--- a/drivers/usb/gadget/rndis.c
+++ b/drivers/usb/gadget/rndis.c

@@ -761,6 +761,7 @@
 	return rndis_indicate_status_msg(configNr,
 					  RNDIS_STATUS_MEDIA_CONNECT);
 }
+EXPORT_SYMBOL(rndis_signal_connect);
 
 int rndis_signal_disconnect(int configNr)
 {
@@ -769,6 +770,7 @@
 	return rndis_indicate_status_msg(configNr,
 					  RNDIS_STATUS_MEDIA_DISCONNECT);
 }
+EXPORT_SYMBOL(rndis_signal_disconnect);
 
 void rndis_uninit(int configNr)
 {
@@ -783,11 +785,13 @@
 	while ((buf = rndis_get_next_response(configNr, &length)))
 		rndis_free_response(configNr, buf);
 }
+EXPORT_SYMBOL(rndis_uninit);
 
 void rndis_set_host_mac(int configNr, const u8 *addr)
 {
 	rndis_per_dev_params[configNr].host_mac = addr;
 }
+EXPORT_SYMBOL(rndis_set_host_mac);
 
 /*
  * Message Parser
@@ -870,6 +874,7 @@
 
 	return -ENOTSUPP;
 }
+EXPORT_SYMBOL(rndis_msg_parser);
 
 int rndis_register(void (*resp_avail)(void *v), void *v)
 {
@@ -891,6 +896,7 @@
 
 	return -ENODEV;
 }
+EXPORT_SYMBOL(rndis_register);
 
 void rndis_deregister(int configNr)
 {
@@ -899,6 +905,7 @@
 	if (configNr >= RNDIS_MAX_CONFIGS) return;
 	rndis_per_dev_params[configNr].used = 0;
 }
+EXPORT_SYMBOL(rndis_deregister);
 
 int rndis_set_param_dev(u8 configNr, struct net_device *dev, u16 *cdc_filter)
 {
@@ -912,6 +919,7 @@
 
 	return 0;
 }
+EXPORT_SYMBOL(rndis_set_param_dev);
 
 int rndis_set_param_vendor(u8 configNr, u32 vendorID, const char *vendorDescr)
 {
@@ -924,6 +932,7 @@
 
 	return 0;
 }
+EXPORT_SYMBOL(rndis_set_param_vendor);
 
 int rndis_set_param_medium(u8 configNr, u32 medium, u32 speed)
 {
@@ -935,6 +944,7 @@
 
 	return 0;
 }
+EXPORT_SYMBOL(rndis_set_param_medium);
 
 void rndis_add_hdr(struct sk_buff *skb)
 {
@@ -949,6 +959,7 @@
 	header->DataOffset = cpu_to_le32(36);
 	header->DataLength = cpu_to_le32(skb->len - sizeof(*header));
 }
+EXPORT_SYMBOL(rndis_add_hdr);
 
 void rndis_free_response(int configNr, u8 *buf)
 {
@@ -965,6 +976,7 @@
 		}
 	}
 }
+EXPORT_SYMBOL(rndis_free_response);
 
 u8 *rndis_get_next_response(int configNr, u32 *length)
 {
@@ -986,6 +998,7 @@
 
 	return NULL;
 }
+EXPORT_SYMBOL(rndis_get_next_response);
 
 static rndis_resp_t *rndis_add_response(int configNr, u32 length)
 {
@@ -1029,6 +1042,7 @@
 	skb_queue_tail(list, skb);
 	return 0;
 }
+EXPORT_SYMBOL(rndis_rm_hdr);
 
 #ifdef CONFIG_USB_GADGET_DEBUG_FILES
 
@@ -1160,6 +1174,7 @@
 
 	return 0;
 }
+module_init(rndis_init);
 
 void rndis_exit(void)
 {
@@ -1173,3 +1188,6 @@
 	}
 #endif
 }
+module_exit(rndis_exit);
+
+MODULE_LICENSE("GPL");

diff --git a/drivers/usb/gadget/rndis.h b/drivers/usb/gadget/rndis.h
index 0647f2f..0f4abb4c 100644
--- a/drivers/usb/gadget/rndis.h
+++ b/drivers/usb/gadget/rndis.h

@@ -16,6 +16,7 @@
 #define _LINUX_RNDIS_H
 
 #include <linux/rndis.h>
+#include "u_ether.h"
 #include "ndis.h"
 
 #define RNDIS_MAXIMUM_FRAME_SIZE	1518
@@ -216,7 +217,4 @@
 int  rndis_state (int configNr);
 extern void rndis_set_host_mac (int configNr, const u8 *addr);
 
-int rndis_init(void);
-void rndis_exit (void);
-
 #endif  /* _LINUX_RNDIS_H */

diff --git a/drivers/usb/gadget/u_ecm.h b/drivers/usb/gadget/u_ecm.h
new file mode 100644
index 0000000..262cc03
--- /dev/null
+++ b/drivers/usb/gadget/u_ecm.h

@@ -0,0 +1,36 @@
+/*
+ * u_ecm.h
+ *
+ * Utility definitions for the ecm function
+ *
+ * Copyright (c) 2013 Samsung Electronics Co., Ltd.
+ *		http://www.samsung.com
+ *
+ * Author: Andrzej Pietrasiewicz <andrzej.p@samsung.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef U_ECM_H
+#define U_ECM_H
+
+#include <linux/usb/composite.h>
+
+struct f_ecm_opts {
+	struct usb_function_instance	func_inst;
+	struct net_device		*net;
+	bool				bound;
+
+	/*
+	 * Read/write access to configfs attributes is handled by configfs.
+	 *
+	 * This is to protect the data from concurrent access by read/write
+	 * and create symlink/remove symlink.
+	 */
+	struct mutex			lock;
+	int				refcnt;
+};
+
+#endif /* U_ECM_H */

diff --git a/drivers/usb/gadget/u_eem.h b/drivers/usb/gadget/u_eem.h
new file mode 100644
index 0000000..e3ae978
--- /dev/null
+++ b/drivers/usb/gadget/u_eem.h

@@ -0,0 +1,36 @@
+/*
+ * u_eem.h
+ *
+ * Utility definitions for the eem function
+ *
+ * Copyright (c) 2013 Samsung Electronics Co., Ltd.
+ *		http://www.samsung.com
+ *
+ * Author: Andrzej Pietrasiewicz <andrzej.p@samsung.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef U_EEM_H
+#define U_EEM_H
+
+#include <linux/usb/composite.h>
+
+struct f_eem_opts {
+	struct usb_function_instance	func_inst;
+	struct net_device		*net;
+	bool				bound;
+
+	/*
+	 * Read/write access to configfs attributes is handled by configfs.
+	 *
+	 * This is to protect the data from concurrent access by read/write
+	 * and create symlink/remove symlink.
+	 */
+	struct mutex			lock;
+	int				refcnt;
+};
+
+#endif /* U_EEM_H */

diff --git a/drivers/usb/gadget/u_ether.c b/drivers/usb/gadget/u_ether.c
index 4b76124..2aae0d6 100644
--- a/drivers/usb/gadget/u_ether.c
+++ b/drivers/usb/gadget/u_ether.c

@@ -63,6 +63,8 @@
 
 	struct sk_buff_head	rx_frames;
 
+	unsigned		qmult;
+
 	unsigned		header_len;
 	struct sk_buff		*(*wrap)(struct gether *, struct sk_buff *skb);
 	int			(*unwrap)(struct gether *,
@@ -76,6 +78,7 @@
 
 	bool			zlp;
 	u8			host_mac[ETH_ALEN];
+	u8			dev_mac[ETH_ALEN];
 };
 
 /*-------------------------------------------------------------------------*/
@@ -84,12 +87,8 @@
 
 #define DEFAULT_QLEN	2	/* double buffering by default */
 
-static unsigned qmult = 5;
-module_param(qmult, uint, S_IRUGO|S_IWUSR);
-MODULE_PARM_DESC(qmult, "queue length multiplier at high/super speed");
-
 /* for dual-speed hardware, use deeper queues at high/super speed */
-static inline int qlen(struct usb_gadget *gadget)
+static inline int qlen(struct usb_gadget *gadget, unsigned qmult)
 {
 	if (gadget_is_dualspeed(gadget) && (gadget->speed == USB_SPEED_HIGH ||
 					    gadget->speed == USB_SPEED_SUPER))
@@ -588,7 +587,7 @@
 	if (gadget_is_dualspeed(dev->gadget))
 		req->no_interrupt = (dev->gadget->speed == USB_SPEED_HIGH ||
 				     dev->gadget->speed == USB_SPEED_SUPER)
-			? ((atomic_read(&dev->tx_qlen) % qmult) != 0)
+			? ((atomic_read(&dev->tx_qlen) % dev->qmult) != 0)
 			: 0;
 
 	retval = usb_ep_queue(in, req, GFP_ATOMIC);
@@ -697,16 +696,6 @@
 
 /*-------------------------------------------------------------------------*/
 
-/* initial value, changed by "ifconfig usb0 hw ether xx:xx:xx:xx:xx:xx" */
-static char *dev_addr;
-module_param(dev_addr, charp, S_IRUGO);
-MODULE_PARM_DESC(dev_addr, "Device Ethernet Address");
-
-/* this address is invisible to ifconfig */
-static char *host_addr;
-module_param(host_addr, charp, S_IRUGO);
-MODULE_PARM_DESC(host_addr, "Host Ethernet Address");
-
 static int get_ether_addr(const char *str, u8 *dev_addr)
 {
 	if (str) {
@@ -728,6 +717,17 @@
 	return 1;
 }
 
+static int get_ether_addr_str(u8 dev_addr[ETH_ALEN], char *str, int len)
+{
+	if (len < 18)
+		return -EINVAL;
+
+	snprintf(str, len, "%02x:%02x:%02x:%02x:%02x:%02x",
+		 dev_addr[0], dev_addr[1], dev_addr[2],
+		 dev_addr[3], dev_addr[4], dev_addr[5]);
+	return 18;
+}
+
 static const struct net_device_ops eth_netdev_ops = {
 	.ndo_open		= eth_open,
 	.ndo_stop		= eth_stop,
@@ -755,8 +755,9 @@
  *
  * Returns negative errno, or zero on success
  */
-struct eth_dev *gether_setup_name(struct usb_gadget *g, u8 ethaddr[ETH_ALEN],
-		const char *netname)
+struct eth_dev *gether_setup_name(struct usb_gadget *g,
+		const char *dev_addr, const char *host_addr,
+		u8 ethaddr[ETH_ALEN], unsigned qmult, const char *netname)
 {
 	struct eth_dev		*dev;
 	struct net_device	*net;
@@ -777,6 +778,7 @@
 
 	/* network device setup */
 	dev->net = net;
+	dev->qmult = qmult;
 	snprintf(net->name, sizeof(net->name), "%s%%d", netname);
 
 	if (get_ether_addr(dev_addr, net->dev_addr))
@@ -806,7 +808,8 @@
 		INFO(dev, "MAC %pM\n", net->dev_addr);
 		INFO(dev, "HOST MAC %pM\n", dev->host_mac);
 
-		/* two kinds of host-initiated state changes:
+		/*
+		 * two kinds of host-initiated state changes:
 		 *  - iff DATA transfer is active, carrier is "on"
 		 *  - tx queueing enabled if open *and* carrier is "on"
 		 */
@@ -815,6 +818,186 @@
 
 	return dev;
 }
+EXPORT_SYMBOL(gether_setup_name);
+
+struct net_device *gether_setup_name_default(const char *netname)
+{
+	struct net_device	*net;
+	struct eth_dev		*dev;
+
+	net = alloc_etherdev(sizeof(*dev));
+	if (!net)
+		return ERR_PTR(-ENOMEM);
+
+	dev = netdev_priv(net);
+	spin_lock_init(&dev->lock);
+	spin_lock_init(&dev->req_lock);
+	INIT_WORK(&dev->work, eth_work);
+	INIT_LIST_HEAD(&dev->tx_reqs);
+	INIT_LIST_HEAD(&dev->rx_reqs);
+
+	skb_queue_head_init(&dev->rx_frames);
+
+	/* network device setup */
+	dev->net = net;
+	dev->qmult = QMULT_DEFAULT;
+	snprintf(net->name, sizeof(net->name), "%s%%d", netname);
+
+	eth_random_addr(dev->dev_mac);
+	pr_warn("using random %s ethernet address\n", "self");
+	eth_random_addr(dev->host_mac);
+	pr_warn("using random %s ethernet address\n", "host");
+
+	net->netdev_ops = &eth_netdev_ops;
+
+	SET_ETHTOOL_OPS(net, &ops);
+	SET_NETDEV_DEVTYPE(net, &gadget_type);
+
+	return net;
+}
+EXPORT_SYMBOL(gether_setup_name_default);
+
+int gether_register_netdev(struct net_device *net)
+{
+	struct eth_dev *dev;
+	struct usb_gadget *g;
+	struct sockaddr sa;
+	int status;
+
+	if (!net->dev.parent)
+		return -EINVAL;
+	dev = netdev_priv(net);
+	g = dev->gadget;
+	status = register_netdev(net);
+	if (status < 0) {
+		dev_dbg(&g->dev, "register_netdev failed, %d\n", status);
+		return status;
+	} else {
+		INFO(dev, "HOST MAC %pM\n", dev->host_mac);
+
+		/* two kinds of host-initiated state changes:
+		 *  - iff DATA transfer is active, carrier is "on"
+		 *  - tx queueing enabled if open *and* carrier is "on"
+		 */
+		netif_carrier_off(net);
+	}
+	sa.sa_family = net->type;
+	memcpy(sa.sa_data, dev->dev_mac, ETH_ALEN);
+	rtnl_lock();
+	status = dev_set_mac_address(net, &sa);
+	rtnl_unlock();
+	if (status)
+		pr_warn("cannot set self ethernet address: %d\n", status);
+	else
+		INFO(dev, "MAC %pM\n", dev->dev_mac);
+
+	return status;
+}
+EXPORT_SYMBOL(gether_register_netdev);
+
+void gether_set_gadget(struct net_device *net, struct usb_gadget *g)
+{
+	struct eth_dev *dev;
+
+	dev = netdev_priv(net);
+	dev->gadget = g;
+	SET_NETDEV_DEV(net, &g->dev);
+}
+EXPORT_SYMBOL(gether_set_gadget);
+
+int gether_set_dev_addr(struct net_device *net, const char *dev_addr)
+{
+	struct eth_dev *dev;
+	u8 new_addr[ETH_ALEN];
+
+	dev = netdev_priv(net);
+	if (get_ether_addr(dev_addr, new_addr))
+		return -EINVAL;
+	memcpy(dev->dev_mac, new_addr, ETH_ALEN);
+	return 0;
+}
+EXPORT_SYMBOL(gether_set_dev_addr);
+
+int gether_get_dev_addr(struct net_device *net, char *dev_addr, int len)
+{
+	struct eth_dev *dev;
+
+	dev = netdev_priv(net);
+	return get_ether_addr_str(dev->dev_mac, dev_addr, len);
+}
+EXPORT_SYMBOL(gether_get_dev_addr);
+
+int gether_set_host_addr(struct net_device *net, const char *host_addr)
+{
+	struct eth_dev *dev;
+	u8 new_addr[ETH_ALEN];
+
+	dev = netdev_priv(net);
+	if (get_ether_addr(host_addr, new_addr))
+		return -EINVAL;
+	memcpy(dev->host_mac, new_addr, ETH_ALEN);
+	return 0;
+}
+EXPORT_SYMBOL(gether_set_host_addr);
+
+int gether_get_host_addr(struct net_device *net, char *host_addr, int len)
+{
+	struct eth_dev *dev;
+
+	dev = netdev_priv(net);
+	return get_ether_addr_str(dev->host_mac, host_addr, len);
+}
+EXPORT_SYMBOL(gether_get_host_addr);
+
+int gether_get_host_addr_cdc(struct net_device *net, char *host_addr, int len)
+{
+	struct eth_dev *dev;
+
+	if (len < 13)
+		return -EINVAL;
+
+	dev = netdev_priv(net);
+	snprintf(host_addr, len, "%pm", dev->host_mac);
+
+	return strlen(host_addr);
+}
+EXPORT_SYMBOL(gether_get_host_addr_cdc);
+
+void gether_get_host_addr_u8(struct net_device *net, u8 host_mac[ETH_ALEN])
+{
+	struct eth_dev *dev;
+
+	dev = netdev_priv(net);
+	memcpy(host_mac, dev->host_mac, ETH_ALEN);
+}
+EXPORT_SYMBOL(gether_get_host_addr_u8);
+
+void gether_set_qmult(struct net_device *net, unsigned qmult)
+{
+	struct eth_dev *dev;
+
+	dev = netdev_priv(net);
+	dev->qmult = qmult;
+}
+EXPORT_SYMBOL(gether_set_qmult);
+
+unsigned gether_get_qmult(struct net_device *net)
+{
+	struct eth_dev *dev;
+
+	dev = netdev_priv(net);
+	return dev->qmult;
+}
+EXPORT_SYMBOL(gether_get_qmult);
+
+int gether_get_ifname(struct net_device *net, char *name, int len)
+{
+	rtnl_lock();
+	strlcpy(name, netdev_name(net), len);
+	rtnl_unlock();
+	return strlen(name);
+}
+EXPORT_SYMBOL(gether_get_ifname);
 
 /**
  * gether_cleanup - remove Ethernet-over-USB device
@@ -831,6 +1014,7 @@
 	flush_work(&dev->work);
 	free_netdev(dev->net);
 }
+EXPORT_SYMBOL(gether_cleanup);
 
 /**
  * gether_connect - notify network layer that USB link is active
@@ -873,11 +1057,12 @@
 	}
 
 	if (result == 0)
-		result = alloc_requests(dev, link, qlen(dev->gadget));
+		result = alloc_requests(dev, link, qlen(dev->gadget,
+					dev->qmult));
 
 	if (result == 0) {
 		dev->zlp = link->is_zlp_ok;
-		DBG(dev, "qlen %d\n", qlen(dev->gadget));
+		DBG(dev, "qlen %d\n", qlen(dev->gadget, dev->qmult));
 
 		dev->header_len = link->header_len;
 		dev->unwrap = link->unwrap;
@@ -910,6 +1095,7 @@
 		return ERR_PTR(result);
 	return dev->net;
 }
+EXPORT_SYMBOL(gether_connect);
 
 /**
  * gether_disconnect - notify network layer that USB link is inactive
@@ -980,3 +1166,7 @@
 	dev->port_usb = NULL;
 	spin_unlock(&dev->lock);
 }
+EXPORT_SYMBOL(gether_disconnect);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("David Brownell");

diff --git a/drivers/usb/gadget/u_ether.h b/drivers/usb/gadget/u_ether.h
index 0252233..fb23d1f 100644
--- a/drivers/usb/gadget/u_ether.h
+++ b/drivers/usb/gadget/u_ether.h

@@ -21,6 +21,26 @@
 
 #include "gadget_chips.h"
 
+#define QMULT_DEFAULT 5
+
+/*
+ * dev_addr: initial value
+ * changed by "ifconfig usb0 hw ether xx:xx:xx:xx:xx:xx"
+ * host_addr: this address is invisible to ifconfig
+ */
+#define USB_ETHERNET_MODULE_PARAMETERS() \
+	static unsigned qmult = QMULT_DEFAULT;				\
+	module_param(qmult, uint, S_IRUGO|S_IWUSR);			\
+	MODULE_PARM_DESC(qmult, "queue length multiplier at high/super speed");\
+									\
+	static char *dev_addr;						\
+	module_param(dev_addr, charp, S_IRUGO);				\
+	MODULE_PARM_DESC(dev_addr, "Device Ethernet Address");		\
+									\
+	static char *host_addr;						\
+	module_param(host_addr, charp, S_IRUGO);			\
+	MODULE_PARM_DESC(host_addr, "Host Ethernet Address")
+
 struct eth_dev;
 
 /*
@@ -71,8 +91,9 @@
 			|USB_CDC_PACKET_TYPE_DIRECTED)
 
 /* variant of gether_setup that allows customizing network device name */
-struct eth_dev *gether_setup_name(struct usb_gadget *g, u8 ethaddr[ETH_ALEN],
-		const char *netname);
+struct eth_dev *gether_setup_name(struct usb_gadget *g,
+		const char *dev_addr, const char *host_addr,
+		u8 ethaddr[ETH_ALEN], unsigned qmult, const char *netname);
 
 /* netdev setup/teardown as directed by the gadget driver */
 /* gether_setup - initialize one ethernet-over-usb link
@@ -88,11 +109,145 @@
  * Returns negative errno, or zero on success
  */
 static inline struct eth_dev *gether_setup(struct usb_gadget *g,
-		u8 ethaddr[ETH_ALEN])
+		const char *dev_addr, const char *host_addr,
+		u8 ethaddr[ETH_ALEN], unsigned qmult)
 {
-	return gether_setup_name(g, ethaddr, "usb");
+	return gether_setup_name(g, dev_addr, host_addr, ethaddr, qmult, "usb");
 }
 
+/*
+ * variant of gether_setup_default that allows customizing
+ * network device name
+ */
+struct net_device *gether_setup_name_default(const char *netname);
+
+/*
+ * gether_register_netdev - register the net device
+ * @net: net device to register
+ *
+ * Registers the net device associated with this ethernet-over-usb link
+ *
+ */
+int gether_register_netdev(struct net_device *net);
+
+/* gether_setup_default - initialize one ethernet-over-usb link
+ * Context: may sleep
+ *
+ * This sets up the single network link that may be exported by a
+ * gadget driver using this framework.  The link layer addresses
+ * are set to random values.
+ *
+ * Returns the new net_device, or an ERR_PTR()-encoded negative errno
+ */
+static inline struct net_device *gether_setup_default(void)
+{
+	return gether_setup_name_default("usb");
+}
+
+/**
+ * gether_set_gadget - initialize one ethernet-over-usb link with a gadget
+ * @net: device representing this link
+ * @g: the gadget to initialize with
+ *
+ * This associates one ethernet-over-usb link with a gadget.
+ */
+void gether_set_gadget(struct net_device *net, struct usb_gadget *g);
+
+/**
+ * gether_set_dev_addr - initialize an ethernet-over-usb link with eth address
+ * @net: device representing this link
+ * @dev_addr: eth address of this device
+ *
+ * This sets the device-side Ethernet address of this ethernet-over-usb link
+ * if dev_addr is correct.
+ * Returns negative errno if the new address is incorrect.
+ */
+int gether_set_dev_addr(struct net_device *net, const char *dev_addr);
+
+/**
+ * gether_get_dev_addr - get an ethernet-over-usb link eth address
+ * @net: device representing this link
+ * @dev_addr: place to store device's eth address
+ * @len: length of the @dev_addr buffer
+ *
+ * This gets the device-side Ethernet address of this ethernet-over-usb link.
+ * Returns 18 (MAC string size with NUL) on success, -EINVAL if @len < 18.
+ */
+int gether_get_dev_addr(struct net_device *net, char *dev_addr, int len);
+
+/**
+ * gether_set_host_addr - initialize an ethernet-over-usb link with host address
+ * @net: device representing this link
+ * @host_addr: eth address of the host
+ *
+ * This sets the host-side Ethernet address of this ethernet-over-usb link
+ * if host_addr is correct.
+ * Returns negative errno if the new address is incorrect.
+ */
+int gether_set_host_addr(struct net_device *net, const char *host_addr);
+
+/**
+ * gether_get_host_addr - get an ethernet-over-usb link host address
+ * @net: device representing this link
+ * @host_addr: place to store eth address of the host
+ * @len: length of the @host_addr buffer
+ *
+ * This gets the host-side Ethernet address of this ethernet-over-usb link.
+ * Returns 18 (MAC string size with NUL) on success, -EINVAL if @len < 18.
+ */
+int gether_get_host_addr(struct net_device *net, char *host_addr, int len);
+
+/**
+ * gether_get_host_addr_cdc - get an ethernet-over-usb link host address
+ * @net: device representing this link
+ * @host_addr: place to store eth address of the host
+ * @len: length of the @host_addr buffer
+ *
+ * This gets the CDC formatted host-side Ethernet address of this
+ * ethernet-over-usb link.
+ * Returns the formatted string length on success, -EINVAL if @len < 13.
+ */
+int gether_get_host_addr_cdc(struct net_device *net, char *host_addr, int len);
+
+/**
+ * gether_get_host_addr_u8 - get an ethernet-over-usb link host address
+ * @net: device representing this link
+ * @host_mac: place to store the eth address of the host
+ *
+ * This gets the binary formatted host-side Ethernet address of this
+ * ethernet-over-usb link.
+ */
+void gether_get_host_addr_u8(struct net_device *net, u8 host_mac[ETH_ALEN]);
+
+/**
+ * gether_set_qmult - initialize an ethernet-over-usb link with a multiplier
+ * @net: device representing this link
+ * @qmult: queue multiplier
+ *
+ * This sets the queue length multiplier of this ethernet-over-usb link.
+ * For higher speeds use longer queues.
+ */
+void gether_set_qmult(struct net_device *net, unsigned qmult);
+
+/**
+ * gether_get_qmult - get an ethernet-over-usb link multiplier
+ * @net: device representing this link
+ *
+ * This gets the queue length multiplier of this ethernet-over-usb link.
+ */
+unsigned gether_get_qmult(struct net_device *net);
+
+/**
+ * gether_get_ifname - get an ethernet-over-usb link interface name
+ * @net: device representing this link
+ * @name: place to store the interface name
+ * @len: length of the @name buffer
+ *
+ * This gets the interface name of this ethernet-over-usb link.
+ * Returns the length of the name stored in @name.
+ */
+int gether_get_ifname(struct net_device *net, char *name, int len);
+
 void gether_cleanup(struct eth_dev *dev);
 
 /* connect/disconnect is handled by individual functions */
@@ -117,9 +272,6 @@
 		struct eth_dev *dev);
 int ecm_bind_config(struct usb_configuration *c, u8 ethaddr[ETH_ALEN],
 		struct eth_dev *dev);
-int ncm_bind_config(struct usb_configuration *c, u8 ethaddr[ETH_ALEN],
-		struct eth_dev *dev);
-int eem_bind_config(struct usb_configuration *c, struct eth_dev *dev);
 
 #ifdef USB_ETH_RNDIS
 

diff --git a/drivers/usb/gadget/u_ether_configfs.h b/drivers/usb/gadget/u_ether_configfs.h
new file mode 100644
index 0000000..bcbd301
--- /dev/null
+++ b/drivers/usb/gadget/u_ether_configfs.h

@@ -0,0 +1,164 @@
+/*
+ * u_ether_configfs.h
+ *
+ * Utility definitions for configfs support in USB Ethernet functions
+ *
+ * Copyright (c) 2013 Samsung Electronics Co., Ltd.
+ *		http://www.samsung.com
+ *
+ * Author: Andrzej Pietrasiewicz <andrzej.p@samsung.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef __U_ETHER_CONFIGFS_H
+#define __U_ETHER_CONFIGFS_H
+
+#define USB_ETHERNET_CONFIGFS_ITEM(_f_)					\
+	CONFIGFS_ATTR_STRUCT(f_##_f_##_opts);				\
+	CONFIGFS_ATTR_OPS(f_##_f_##_opts);				\
+									\
+	static void _f_##_attr_release(struct config_item *item)	\
+	{								\
+		struct f_##_f_##_opts *opts = to_f_##_f_##_opts(item);	\
+									\
+		usb_put_function_instance(&opts->func_inst);		\
+	}								\
+									\
+	static struct configfs_item_operations _f_##_item_ops = {	\
+		.release	= _f_##_attr_release,			\
+		.show_attribute = f_##_f_##_opts_attr_show,		\
+		.store_attribute = f_##_f_##_opts_attr_store,		\
+	}
+
+#define USB_ETHERNET_CONFIGFS_ITEM_ATTR_DEV_ADDR(_f_)			\
+	static ssize_t _f_##_opts_dev_addr_show(struct f_##_f_##_opts *opts, \
+						char *page)		\
+	{								\
+		int result;						\
+									\
+		mutex_lock(&opts->lock);				\
+		result = gether_get_dev_addr(opts->net, page, PAGE_SIZE); \
+		mutex_unlock(&opts->lock);				\
+									\
+		return result;						\
+	}								\
+									\
+	static ssize_t _f_##_opts_dev_addr_store(struct f_##_f_##_opts *opts, \
+						 const char *page, size_t len)\
+	{								\
+		int ret;						\
+									\
+		mutex_lock(&opts->lock);				\
+		if (opts->refcnt) {					\
+			mutex_unlock(&opts->lock);			\
+			return -EBUSY;					\
+		}							\
+									\
+		ret = gether_set_dev_addr(opts->net, page);		\
+		mutex_unlock(&opts->lock);				\
+		if (!ret)						\
+			ret = len;					\
+		return ret;						\
+	}								\
+									\
+	static struct f_##_f_##_opts_attribute f_##_f_##_opts_dev_addr = \
+		__CONFIGFS_ATTR(dev_addr, S_IRUGO | S_IWUSR,		\
+				_f_##_opts_dev_addr_show,		\
+				_f_##_opts_dev_addr_store)
+
+#define USB_ETHERNET_CONFIGFS_ITEM_ATTR_HOST_ADDR(_f_)			\
+	static ssize_t _f_##_opts_host_addr_show(struct f_##_f_##_opts *opts, \
+						 char *page)		\
+	{								\
+		int result;						\
+									\
+		mutex_lock(&opts->lock);				\
+		result = gether_get_host_addr(opts->net, page, PAGE_SIZE); \
+		mutex_unlock(&opts->lock);				\
+									\
+		return result;						\
+	}								\
+									\
+	static ssize_t _f_##_opts_host_addr_store(struct f_##_f_##_opts *opts, \
+						  const char *page, size_t len)\
+	{								\
+		int ret;						\
+									\
+		mutex_lock(&opts->lock);				\
+		if (opts->refcnt) {					\
+			mutex_unlock(&opts->lock);			\
+			return -EBUSY;					\
+		}							\
+									\
+		ret = gether_set_host_addr(opts->net, page);		\
+		mutex_unlock(&opts->lock);				\
+		if (!ret)						\
+			ret = len;					\
+		return ret;						\
+	}								\
+									\
+	static struct f_##_f_##_opts_attribute f_##_f_##_opts_host_addr = \
+		__CONFIGFS_ATTR(host_addr, S_IRUGO | S_IWUSR,		\
+				_f_##_opts_host_addr_show,		\
+				_f_##_opts_host_addr_store)
+
+#define USB_ETHERNET_CONFIGFS_ITEM_ATTR_QMULT(_f_)			\
+	static ssize_t _f_##_opts_qmult_show(struct f_##_f_##_opts *opts, \
+					     char *page)		\
+	{								\
+		unsigned qmult;						\
+									\
+		mutex_lock(&opts->lock);				\
+		qmult = gether_get_qmult(opts->net);			\
+		mutex_unlock(&opts->lock);				\
+		return sprintf(page, "%u", qmult); /* unsigned */	\
+	}								\
+									\
+	static ssize_t _f_##_opts_qmult_store(struct f_##_f_##_opts *opts, \
+					      const char *page, size_t len)\
+	{								\
+		u8 val;							\
+		int ret;						\
+									\
+		mutex_lock(&opts->lock);				\
+		if (opts->refcnt) {					\
+			ret = -EBUSY;					\
+			goto out;					\
+		}							\
+									\
+		ret = kstrtou8(page, 0, &val);				\
+		if (ret)						\
+			goto out;					\
+									\
+		gether_set_qmult(opts->net, val);			\
+		ret = len;						\
+out:									\
+		mutex_unlock(&opts->lock);				\
+		return ret;						\
+	}								\
+									\
+	static struct f_##_f_##_opts_attribute f_##_f_##_opts_qmult =	\
+		__CONFIGFS_ATTR(qmult, S_IRUGO | S_IWUSR,		\
+				_f_##_opts_qmult_show,		\
+				_f_##_opts_qmult_store)
+
+#define USB_ETHERNET_CONFIGFS_ITEM_ATTR_IFNAME(_f_)			\
+	static ssize_t _f_##_opts_ifname_show(struct f_##_f_##_opts *opts, \
+					      char *page)		\
+	{								\
+		int ret;						\
+									\
+		mutex_lock(&opts->lock);				\
+		ret = gether_get_ifname(opts->net, page, PAGE_SIZE);	\
+		mutex_unlock(&opts->lock);				\
+									\
+		return ret;						\
+	}								\
+									\
+	static struct f_##_f_##_opts_attribute f_##_f_##_opts_ifname =	\
+		__CONFIGFS_ATTR_RO(ifname, _f_##_opts_ifname_show)
+
+#endif /* __U_ETHER_CONFIGFS_H */

diff --git a/drivers/usb/gadget/u_gether.h b/drivers/usb/gadget/u_gether.h
new file mode 100644
index 0000000..d407842
--- /dev/null
+++ b/drivers/usb/gadget/u_gether.h

@@ -0,0 +1,36 @@
+/*
+ * u_gether.h
+ *
+ * Utility definitions for the subset function
+ *
+ * Copyright (c) 2013 Samsung Electronics Co., Ltd.
+ *		http://www.samsung.com
+ *
+ * Author: Andrzej Pietrasiewicz <andrzej.p@samsung.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef U_GETHER_H
+#define U_GETHER_H
+
+#include <linux/usb/composite.h>
+
+struct f_gether_opts {
+	struct usb_function_instance	func_inst;
+	struct net_device		*net;
+	bool				bound;
+
+	/*
+	 * Read/write access to configfs attributes is handled by configfs.
+	 *
+	 * This is to protect the data from concurrent access by read/write
+	 * and create symlink/remove symlink.
+	 */
+	struct mutex			lock;
+	int				refcnt;
+};
+
+#endif /* U_GETHER_H */

diff --git a/drivers/usb/gadget/u_ncm.h b/drivers/usb/gadget/u_ncm.h
new file mode 100644
index 0000000..ce0f3a7
--- /dev/null
+++ b/drivers/usb/gadget/u_ncm.h

@@ -0,0 +1,36 @@
+/*
+ * u_ncm.h
+ *
+ * Utility definitions for the ncm function
+ *
+ * Copyright (c) 2013 Samsung Electronics Co., Ltd.
+ *		http://www.samsung.com
+ *
+ * Author: Andrzej Pietrasiewicz <andrzej.p@samsung.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef U_NCM_H
+#define U_NCM_H
+
+#include <linux/usb/composite.h>
+
+struct f_ncm_opts {
+	struct usb_function_instance	func_inst;
+	struct net_device		*net;
+	bool				bound;
+
+	/*
+	 * Read/write access to configfs attributes is handled by configfs.
+	 *
+	 * This is to protect the data from concurrent access by read/write
+	 * and create symlink/remove symlink.
+	 */
+	struct mutex			lock;
+	int				refcnt;
+};
+
+#endif /* U_NCM_H */

diff --git a/drivers/usb/gadget/u_phonet.h b/drivers/usb/gadget/u_phonet.h
index 09a7525..98ced18 100644
--- a/drivers/usb/gadget/u_phonet.h
+++ b/drivers/usb/gadget/u_phonet.h

@@ -14,8 +14,16 @@
 #include <linux/usb/composite.h>
 #include <linux/usb/cdc.h>
 
-int gphonet_setup(struct usb_gadget *gadget);
-int phonet_bind_config(struct usb_configuration *c);
-void gphonet_cleanup(void);
+struct f_phonet_opts {
+	struct usb_function_instance func_inst;
+	bool bound;
+	struct net_device *net;
+};
+
+struct net_device *gphonet_setup_default(void);
+void gphonet_set_gadget(struct net_device *net, struct usb_gadget *g);
+int gphonet_register_netdev(struct net_device *net);
+int phonet_bind_config(struct usb_configuration *c, struct net_device *dev);
+void gphonet_cleanup(struct net_device *dev);
 
 #endif /* __U_PHONET_H */

diff --git a/drivers/usb/gadget/u_rndis.h b/drivers/usb/gadget/u_rndis.h
new file mode 100644
index 0000000..c62ba82
--- /dev/null
+++ b/drivers/usb/gadget/u_rndis.h

@@ -0,0 +1,41 @@
+/*
+ * u_rndis.h
+ *
+ * Utility definitions for the rndis function
+ *
+ * Copyright (c) 2013 Samsung Electronics Co., Ltd.
+ *		http://www.samsung.com
+ *
+ * Author: Andrzej Pietrasiewicz <andrzej.p@samsung.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef U_RNDIS_H
+#define U_RNDIS_H
+
+#include <linux/usb/composite.h>
+
+struct f_rndis_opts {
+	struct usb_function_instance	func_inst;
+	u32				vendor_id;
+	const char			*manufacturer;
+	struct net_device		*net;
+	bool				bound;
+	bool				borrowed_net;
+
+	/*
+	 * Read/write access to configfs attributes is handled by configfs.
+	 *
+	 * This is to protect the data from concurrent access by read/write
+	 * and create symlink/remove symlink.
+	 */
+	struct mutex			lock;
+	int				refcnt;
+};
+
+void rndis_borrow_net(struct usb_function_instance *f, struct net_device *net);
+
+#endif /* U_RNDIS_H */

diff --git a/drivers/usb/gadget/uvc_queue.c b/drivers/usb/gadget/uvc_queue.c
index 7ce27e3..e617047 100644
--- a/drivers/usb/gadget/uvc_queue.c
+++ b/drivers/usb/gadget/uvc_queue.c

@@ -103,10 +103,26 @@
 	spin_unlock_irqrestore(&queue->irqlock, flags);
 }
 
+static void uvc_wait_prepare(struct vb2_queue *vq)
+{
+	struct uvc_video_queue *queue = vb2_get_drv_priv(vq);
+
+	mutex_unlock(&queue->mutex);
+}
+
+static void uvc_wait_finish(struct vb2_queue *vq)
+{
+	struct uvc_video_queue *queue = vb2_get_drv_priv(vq);
+
+	mutex_lock(&queue->mutex);
+}
+
 static struct vb2_ops uvc_queue_qops = {
 	.queue_setup = uvc_queue_setup,
 	.buf_prepare = uvc_buffer_prepare,
 	.buf_queue = uvc_buffer_queue,
+	.wait_prepare = uvc_wait_prepare,
+	.wait_finish = uvc_wait_finish,
 };
 
 static int uvc_queue_init(struct uvc_video_queue *queue,

diff --git a/drivers/usb/host/Kconfig b/drivers/usb/host/Kconfig
index 344d5e2..7d0aa5f 100644
--- a/drivers/usb/host/Kconfig
+++ b/drivers/usb/host/Kconfig

@@ -17,7 +17,6 @@
 
 config USB_XHCI_HCD
 	tristate "xHCI HCD (USB 3.0) support"
-	depends on USB_ARCH_HAS_XHCI
 	---help---
 	  The eXtensible Host Controller Interface (xHCI) is standard for USB 3.0
 	  "SuperSpeed" host controller hardware.
@@ -43,7 +42,6 @@
 
 config USB_EHCI_HCD
 	tristate "EHCI HCD (USB 2.0) support"
-	depends on USB_ARCH_HAS_EHCI
 	---help---
 	  The Enhanced Host Controller Interface (EHCI) is standard for USB 2.0
 	  "high speed" (480 Mbit/sec, 60 Mbyte/sec) host controller hardware.
@@ -200,7 +198,7 @@
 	  has an external PHY.
 
 config USB_EHCI_TEGRA
-       boolean "NVIDIA Tegra HCD support"
+       tristate "NVIDIA Tegra HCD support"
        depends on ARCH_TEGRA
        select USB_EHCI_ROOT_HUB_TT
        select USB_PHY
@@ -345,9 +343,19 @@
 	  To compile this driver as a module, choose M here: the
 	  module will be called isp1362-hcd.
 
+config USB_FUSBH200_HCD
+	tristate "FUSBH200 HCD support"
+	depends on USB
+	default n
+	---help---
+	  Faraday FUSBH200 is designed to meet USB2.0 EHCI specification
+	  with minor modification.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called fusbh200-hcd.
+
 config USB_OHCI_HCD
-	tristate "OHCI HCD support"
-	depends on USB_ARCH_HAS_OHCI
+	tristate "OHCI HCD (USB 1.1) support"
 	select ISP1301_OMAP if MACH_OMAP_H2 || MACH_OMAP_H3
 	depends on USB_ISP1301 || !ARCH_LPC32XX
 	---help---
@@ -415,8 +423,8 @@
 	default USB_OHCI_HCD_PPC_OF_BE || USB_OHCI_HCD_PPC_OF_LE
 
 config USB_OHCI_HCD_PCI
-	bool "OHCI support for PCI-bus USB controllers"
-	depends on PCI && (STB03xxx || PPC_MPC52xx || USB_OHCI_HCD_PPC_OF)
+	tristate "OHCI support for PCI-bus USB controllers"
+	depends on PCI
 	default y
 	select USB_OHCI_LITTLE_ENDIAN
 	---help---
@@ -470,7 +478,7 @@
 	  It is needed for low-speed USB 1.0 device support.
 
 config USB_OHCI_HCD_PLATFORM
-	bool "Generic OHCI driver for a platform device"
+	tristate "Generic OHCI driver for a platform device"
 	default n
 	---help---
 	  Adds an OHCI host driver for a generic platform device, which

diff --git a/drivers/usb/host/Makefile b/drivers/usb/host/Makefile
index 4fb73c1..bea7112 100644
--- a/drivers/usb/host/Makefile
+++ b/drivers/usb/host/Makefile

@@ -33,11 +33,16 @@
 obj-$(CONFIG_USB_EHCI_S5P)	+= ehci-s5p.o
 obj-$(CONFIG_USB_EHCI_HCD_AT91) += ehci-atmel.o
 obj-$(CONFIG_USB_EHCI_MSM)	+= ehci-msm.o
+obj-$(CONFIG_USB_EHCI_TEGRA)	+= ehci-tegra.o
 
 obj-$(CONFIG_USB_OXU210HP_HCD)	+= oxu210hp-hcd.o
 obj-$(CONFIG_USB_ISP116X_HCD)	+= isp116x-hcd.o
 obj-$(CONFIG_USB_ISP1362_HCD)	+= isp1362-hcd.o
+
 obj-$(CONFIG_USB_OHCI_HCD)	+= ohci-hcd.o
+obj-$(CONFIG_USB_OHCI_HCD_PCI)	+= ohci-pci.o
+obj-$(CONFIG_USB_OHCI_HCD_PLATFORM)	+= ohci-platform.o
+
 obj-$(CONFIG_USB_UHCI_HCD)	+= uhci-hcd.o
 obj-$(CONFIG_USB_FHCI_HCD)	+= fhci.o
 obj-$(CONFIG_USB_XHCI_HCD)	+= xhci-hcd.o
@@ -52,3 +57,4 @@
 obj-$(CONFIG_USB_OCTEON2_COMMON) += octeon2-common.o
 obj-$(CONFIG_USB_HCD_BCMA)	+= bcma-hcd.o
 obj-$(CONFIG_USB_HCD_SSB)	+= ssb-hcd.o
+obj-$(CONFIG_USB_FUSBH200_HCD)	+= fusbh200-hcd.o

diff --git a/drivers/usb/host/ehci-atmel.c b/drivers/usb/host/ehci-atmel.c
index 02f4611..3b645ff 100644
--- a/drivers/usb/host/ehci-atmel.c
+++ b/drivers/usb/host/ehci-atmel.c

@@ -37,15 +37,15 @@
 
 static void atmel_start_clock(void)
 {
-	clk_enable(iclk);
-	clk_enable(fclk);
+	clk_prepare_enable(iclk);
+	clk_prepare_enable(fclk);
 	clocked = 1;
 }
 
 static void atmel_stop_clock(void)
 {
-	clk_disable(fclk);
-	clk_disable(iclk);
+	clk_disable_unprepare(fclk);
+	clk_disable_unprepare(iclk);
 	clocked = 0;
 }
 

diff --git a/drivers/usb/host/ehci-fsl.c b/drivers/usb/host/ehci-fsl.c
index 3be3df2..bd831ec 100644
--- a/drivers/usb/host/ehci-fsl.c
+++ b/drivers/usb/host/ehci-fsl.c

@@ -732,6 +732,7 @@
 	.shutdown = usb_hcd_platform_shutdown,
 	.driver = {
 		.name = "fsl-ehci",
+		.owner	= THIS_MODULE,
 		.pm = EHCI_FSL_PM_OPS,
 	},
 };

diff --git a/drivers/usb/host/ehci-grlib.c b/drivers/usb/host/ehci-grlib.c
index 5d75de9..a77bd8d 100644
--- a/drivers/usb/host/ehci-grlib.c
+++ b/drivers/usb/host/ehci-grlib.c

@@ -153,9 +153,7 @@
 
 static int ehci_hcd_grlib_remove(struct platform_device *op)
 {
-	struct usb_hcd *hcd = dev_get_drvdata(&op->dev);
-
-	dev_set_drvdata(&op->dev, NULL);
+	struct usb_hcd *hcd = platform_get_drvdata(op);
 
 	dev_dbg(&op->dev, "stopping GRLIB GRUSBHC EHCI USB Controller\n");
 
@@ -171,7 +169,7 @@
 
 static void ehci_hcd_grlib_shutdown(struct platform_device *op)
 {
-	struct usb_hcd *hcd = dev_get_drvdata(&op->dev);
+	struct usb_hcd *hcd = platform_get_drvdata(op);
 
 	if (hcd->driver->shutdown)
 		hcd->driver->shutdown(hcd);

diff --git a/drivers/usb/host/ehci-hcd.c b/drivers/usb/host/ehci-hcd.c
index 246e124..7abf1ce 100644
--- a/drivers/usb/host/ehci-hcd.c
+++ b/drivers/usb/host/ehci-hcd.c

@@ -139,7 +139,7 @@
 /*-------------------------------------------------------------------------*/
 
 /*
- * handshake - spin reading hc until handshake completes or fails
+ * ehci_handshake - spin reading hc until handshake completes or fails
  * @ptr: address of hc register to be read
  * @mask: bits to look at in result of read
  * @done: value of those bits when handshake succeeds
@@ -155,8 +155,8 @@
  * before driver shutdown. But it also seems to be caused by bugs in cardbus
  * bridge shutdown:  shutting down the bridge before the devices using it.
  */
-static int handshake (struct ehci_hcd *ehci, void __iomem *ptr,
-		      u32 mask, u32 done, int usec)
+int ehci_handshake(struct ehci_hcd *ehci, void __iomem *ptr,
+		   u32 mask, u32 done, int usec)
 {
 	u32	result;
 
@@ -172,6 +172,7 @@
 	} while (usec > 0);
 	return -ETIMEDOUT;
 }
+EXPORT_SYMBOL_GPL(ehci_handshake);
 
 /* check TDI/ARC silicon is in host mode */
 static int tdi_in_host_mode (struct ehci_hcd *ehci)
@@ -212,7 +213,7 @@
 	spin_unlock_irq(&ehci->lock);
 	synchronize_irq(ehci_to_hcd(ehci)->irq);
 
-	return handshake(ehci, &ehci->regs->status,
+	return ehci_handshake(ehci, &ehci->regs->status,
 			  STS_HALT, STS_HALT, 16 * 125);
 }
 
@@ -251,7 +252,7 @@
 	ehci_writel(ehci, command, &ehci->regs->command);
 	ehci->rh_state = EHCI_RH_HALTED;
 	ehci->next_statechange = jiffies;
-	retval = handshake (ehci, &ehci->regs->command,
+	retval = ehci_handshake(ehci, &ehci->regs->command,
 			    CMD_RESET, 0, 250 * 1000);
 
 	if (ehci->has_hostpc) {
@@ -286,7 +287,8 @@
 
 	/* wait for any schedule enables/disables to take effect */
 	temp = (ehci->command << 10) & (STS_ASS | STS_PSS);
-	handshake(ehci, &ehci->regs->status, STS_ASS | STS_PSS, temp, 16 * 125);
+	ehci_handshake(ehci, &ehci->regs->status, STS_ASS | STS_PSS, temp,
+			16 * 125);
 
 	/* then disable anything that's still active */
 	spin_lock_irq(&ehci->lock);
@@ -295,7 +297,8 @@
 	spin_unlock_irq(&ehci->lock);
 
 	/* hardware can take 16 microframes to turn off ... */
-	handshake(ehci, &ehci->regs->status, STS_ASS | STS_PSS, 0, 16 * 125);
+	ehci_handshake(ehci, &ehci->regs->status, STS_ASS | STS_PSS, 0,
+			16 * 125);
 }
 
 /*-------------------------------------------------------------------------*/
@@ -1266,11 +1269,6 @@
 #define	PLATFORM_DRIVER		ehci_hcd_msp_driver
 #endif
 
-#ifdef CONFIG_USB_EHCI_TEGRA
-#include "ehci-tegra.c"
-#define PLATFORM_DRIVER		tegra_ehci_driver
-#endif
-
 #ifdef CONFIG_SPARC_LEON
 #include "ehci-grlib.c"
 #define PLATFORM_DRIVER		ehci_grlib_driver

diff --git a/drivers/usb/host/ehci-hub.c b/drivers/usb/host/ehci-hub.c
index 9ab4a4d..2b70277 100644
--- a/drivers/usb/host/ehci-hub.c
+++ b/drivers/usb/host/ehci-hub.c

@@ -42,6 +42,12 @@
 	u16		wLength
 );
 
+static int persist_enabled_on_companion(struct usb_device *udev, void *unused)
+{
+	return !udev->maxchild && udev->persist_enabled &&
+		udev->bus->root_hub->speed < USB_SPEED_HIGH;
+}
+
 /* After a power loss, ports that were owned by the companion must be
  * reset so that the companion can still own them.
  */
@@ -56,6 +62,16 @@
 	if (!ehci->owned_ports)
 		return;
 
+	/*
+	 * USB 1.1 devices are mostly HIDs, which don't need to persist across
+	 * suspends. If we ensure that none of our companion's devices have
+	 * persist_enabled (by looking through all USB 1.1 buses in the system),
+	 * we can skip this and avoid slowing resume down. Devices without
+	 * persist will just get reenumerated shortly after resume anyway.
+	 */
+	if (!usb_for_each_dev(NULL, persist_enabled_on_companion))
+		return;
+
 	/* Make sure the ports are powered */
 	port = HCS_N_PORTS(ehci->hcs_params);
 	while (port--) {
@@ -876,7 +892,7 @@
 						PORT_SUSPEND | PORT_RESUME);
 				ehci_writel(ehci, temp, status_reg);
 				clear_bit(wIndex, &ehci->resuming_ports);
-				retval = handshake(ehci, status_reg,
+				retval = ehci_handshake(ehci, status_reg,
 					   PORT_RESUME, 0, 2000 /* 2msec */);
 				if (retval != 0) {
 					ehci_err(ehci,
@@ -902,7 +918,7 @@
 			/* REVISIT:  some hardware needs 550+ usec to clear
 			 * this bit; seems too long to spin routinely...
 			 */
-			retval = handshake(ehci, status_reg,
+			retval = ehci_handshake(ehci, status_reg,
 					PORT_RESET, 0, 1000);
 			if (retval != 0) {
 				ehci_err (ehci, "port %d reset error %d\n",

diff --git a/drivers/usb/host/ehci-mv.c b/drivers/usb/host/ehci-mv.c
index 40206297..915c2db 100644
--- a/drivers/usb/host/ehci-mv.c
+++ b/drivers/usb/host/ehci-mv.c

@@ -166,14 +166,14 @@
 	if (IS_ERR(ehci_mv->clk)) {
 		dev_err(&pdev->dev, "error getting clock\n");
 		retval = PTR_ERR(ehci_mv->clk);
-		goto err_clear_drvdata;
+		goto err_put_hcd;
 	}
 
 	r = platform_get_resource_byname(pdev, IORESOURCE_MEM, "phyregs");
 	if (r == NULL) {
 		dev_err(&pdev->dev, "no phy I/O memory resource defined\n");
 		retval = -ENODEV;
-		goto err_clear_drvdata;
+		goto err_put_hcd;
 	}
 
 	ehci_mv->phy_regs = devm_ioremap(&pdev->dev, r->start,
@@ -181,14 +181,14 @@
 	if (ehci_mv->phy_regs == 0) {
 		dev_err(&pdev->dev, "failed to map phy I/O memory\n");
 		retval = -EFAULT;
-		goto err_clear_drvdata;
+		goto err_put_hcd;
 	}
 
 	r = platform_get_resource_byname(pdev, IORESOURCE_MEM, "capregs");
 	if (!r) {
 		dev_err(&pdev->dev, "no I/O memory resource defined\n");
 		retval = -ENODEV;
-		goto err_clear_drvdata;
+		goto err_put_hcd;
 	}
 
 	ehci_mv->cap_regs = devm_ioremap(&pdev->dev, r->start,
@@ -196,13 +196,13 @@
 	if (ehci_mv->cap_regs == NULL) {
 		dev_err(&pdev->dev, "failed to map I/O memory\n");
 		retval = -EFAULT;
-		goto err_clear_drvdata;
+		goto err_put_hcd;
 	}
 
 	retval = mv_ehci_enable(ehci_mv);
 	if (retval) {
 		dev_err(&pdev->dev, "init phy error %d\n", retval);
-		goto err_clear_drvdata;
+		goto err_put_hcd;
 	}
 
 	offset = readl(ehci_mv->cap_regs) & CAPLENGTH_MASK;
@@ -274,8 +274,6 @@
 		pdata->set_vbus(0);
 err_disable_clk:
 	mv_ehci_disable(ehci_mv);
-err_clear_drvdata:
-	platform_set_drvdata(pdev, NULL);
 err_put_hcd:
 	usb_put_hcd(hcd);
 
@@ -300,8 +298,6 @@
 		mv_ehci_disable(ehci_mv);
 	}
 
-	platform_set_drvdata(pdev, NULL);
-
 	usb_put_hcd(hcd);
 
 	return 0;

diff --git a/drivers/usb/host/ehci-mxc.c b/drivers/usb/host/ehci-mxc.c
index c369767..e4c34ac 100644
--- a/drivers/usb/host/ehci-mxc.c
+++ b/drivers/usb/host/ehci-mxc.c

@@ -194,7 +194,6 @@
 		clk_disable_unprepare(priv->phyclk);
 
 	usb_put_hcd(hcd);
-	platform_set_drvdata(pdev, NULL);
 	return 0;
 }
 

diff --git a/drivers/usb/host/ehci-octeon.c b/drivers/usb/host/ehci-octeon.c
index a89750f..45cc001 100644
--- a/drivers/usb/host/ehci-octeon.c
+++ b/drivers/usb/host/ehci-octeon.c

@@ -182,8 +182,6 @@
 	release_mem_region(hcd->rsrc_start, hcd->rsrc_len);
 	usb_put_hcd(hcd);
 
-	platform_set_drvdata(pdev, NULL);
-
 	return 0;
 }
 

diff --git a/drivers/usb/host/ehci-omap.c b/drivers/usb/host/ehci-omap.c
index 16d7150..9bd7dfe33 100644
--- a/drivers/usb/host/ehci-omap.c
+++ b/drivers/usb/host/ehci-omap.c

@@ -187,6 +187,12 @@
 		}
 
 		omap->phy[i] = phy;
+
+		if (pdata->port_mode[i] == OMAP_EHCI_PORT_MODE_PHY) {
+			usb_phy_init(omap->phy[i]);
+			/* bring PHY out of suspend */
+			usb_phy_set_suspend(omap->phy[i], 0);
+		}
 	}
 
 	pm_runtime_enable(dev);
@@ -211,13 +217,14 @@
 	}
 
 	/*
-	 * Bring PHYs out of reset.
+	 * Bring PHYs out of reset for non PHY modes.
 	 * Even though HSIC mode is a PHY-less mode, the reset
 	 * line exists between the chips and can be modelled
 	 * as a PHY device for reset control.
 	 */
 	for (i = 0; i < omap->nports; i++) {
-		if (!omap->phy[i])
+		if (!omap->phy[i] ||
+		     pdata->port_mode[i] == OMAP_EHCI_PORT_MODE_PHY)
 			continue;
 
 		usb_phy_init(omap->phy[i]);
@@ -294,7 +301,7 @@
 	/*.resume		= ehci_hcd_omap_resume, */
 	.driver = {
 		.name		= hcd_name,
-		.of_match_table = of_match_ptr(omap_ehci_dt_ids),
+		.of_match_table = omap_ehci_dt_ids,
 	}
 };
 

diff --git a/drivers/usb/host/ehci-orion.c b/drivers/usb/host/ehci-orion.c
index efbc588..1a450aa 100644
--- a/drivers/usb/host/ehci-orion.c
+++ b/drivers/usb/host/ehci-orion.c

@@ -303,7 +303,7 @@
 	.driver = {
 		.name	= "orion-ehci",
 		.owner  = THIS_MODULE,
-		.of_match_table = of_match_ptr(ehci_orion_dt_ids),
+		.of_match_table = ehci_orion_dt_ids,
 	},
 };
 

diff --git a/drivers/usb/host/ehci-platform.c b/drivers/usb/host/ehci-platform.c
index f47f259..5733f8e 100644
--- a/drivers/usb/host/ehci-platform.c
+++ b/drivers/usb/host/ehci-platform.c

@@ -146,7 +146,6 @@
 
 	usb_remove_hcd(hcd);
 	usb_put_hcd(hcd);
-	platform_set_drvdata(dev, NULL);
 
 	if (pdata->power_off)
 		pdata->power_off(dev);
@@ -224,7 +223,7 @@
 		.owner	= THIS_MODULE,
 		.name	= "ehci-platform",
 		.pm	= &ehci_platform_pm_ops,
-		.of_match_table = of_match_ptr(vt8500_ehci_ids),
+		.of_match_table = vt8500_ehci_ids,
 	}
 };
 

diff --git a/drivers/usb/host/ehci-pmcmsp.c b/drivers/usb/host/ehci-pmcmsp.c
index 363890e..601e208 100644
--- a/drivers/usb/host/ehci-pmcmsp.c
+++ b/drivers/usb/host/ehci-pmcmsp.c

@@ -291,8 +291,7 @@
 	/*
 	 * basic lifecycle operations
 	 */
-	.reset =		ehci_msp_setup,
-	.start =		ehci_run,
+	.reset			= ehci_msp_setup,
 	.shutdown		= ehci_shutdown,
 	.start			= ehci_run,
 	.stop			= ehci_stop,

diff --git a/drivers/usb/host/ehci-ppc-of.c b/drivers/usb/host/ehci-ppc-of.c
index 56dc732..86da09c 100644
--- a/drivers/usb/host/ehci-ppc-of.c
+++ b/drivers/usb/host/ehci-ppc-of.c

@@ -180,14 +180,12 @@
 
 static int ehci_hcd_ppc_of_remove(struct platform_device *op)
 {
-	struct usb_hcd *hcd = dev_get_drvdata(&op->dev);
+	struct usb_hcd *hcd = platform_get_drvdata(op);
 	struct ehci_hcd *ehci = hcd_to_ehci(hcd);
 
 	struct device_node *np;
 	struct resource res;
 
-	dev_set_drvdata(&op->dev, NULL);
-
 	dev_dbg(&op->dev, "stopping PPC-OF USB Controller\n");
 
 	usb_remove_hcd(hcd);
@@ -219,7 +217,7 @@
 
 static void ehci_hcd_ppc_of_shutdown(struct platform_device *op)
 {
-	struct usb_hcd *hcd = dev_get_drvdata(&op->dev);
+	struct usb_hcd *hcd = platform_get_drvdata(op);
 
 	if (hcd->driver->shutdown)
 		hcd->driver->shutdown(hcd);

diff --git a/drivers/usb/host/ehci-s5p.c b/drivers/usb/host/ehci-s5p.c
index 379037f..7cc26e6 100644
--- a/drivers/usb/host/ehci-s5p.c
+++ b/drivers/usb/host/ehci-s5p.c

@@ -50,6 +50,8 @@
 	struct s5p_ehci_platdata *pdata;
 };
 
+static struct s5p_ehci_platdata empty_platdata;
+
 #define to_s5p_ehci(hcd)      (struct s5p_ehci_hcd *)(hcd_to_ehci(hcd)->priv)
 
 static void s5p_setup_vbus_gpio(struct platform_device *pdev)
@@ -101,6 +103,13 @@
 		return -ENOMEM;
 	}
 	s5p_ehci = to_s5p_ehci(hcd);
+
+	if (of_device_is_compatible(pdev->dev.of_node,
+					"samsung,exynos5440-ehci")) {
+		s5p_ehci->pdata = &empty_platdata;
+		goto skip_phy;
+	}
+
 	phy = devm_usb_get_phy(&pdev->dev, USB_PHY_TYPE_USB2);
 	if (IS_ERR(phy)) {
 		/* Fallback to pdata */
@@ -116,6 +125,8 @@
 		s5p_ehci->otg = phy->otg;
 	}
 
+skip_phy:
+
 	s5p_ehci->clk = devm_clk_get(&pdev->dev, "usbhost");
 
 	if (IS_ERR(s5p_ehci->clk)) {
@@ -277,6 +288,7 @@
 #ifdef CONFIG_OF
 static const struct of_device_id exynos_ehci_match[] = {
 	{ .compatible = "samsung,exynos4210-ehci" },
+	{ .compatible = "samsung,exynos5440-ehci" },
 	{},
 };
 MODULE_DEVICE_TABLE(of, exynos_ehci_match);

diff --git a/drivers/usb/host/ehci-sead3.c b/drivers/usb/host/ehci-sead3.c
index f55477c..b2de52d 100644
--- a/drivers/usb/host/ehci-sead3.c
+++ b/drivers/usb/host/ehci-sead3.c

@@ -140,7 +140,6 @@
 
 	usb_remove_hcd(hcd);
 	usb_put_hcd(hcd);
-	platform_set_drvdata(pdev, NULL);
 
 	return 0;
 }

diff --git a/drivers/usb/host/ehci-sh.c b/drivers/usb/host/ehci-sh.c
index b44d716d..c4c0ee9 100644
--- a/drivers/usb/host/ehci-sh.c
+++ b/drivers/usb/host/ehci-sh.c

@@ -176,7 +176,6 @@
 
 	usb_remove_hcd(hcd);
 	usb_put_hcd(hcd);
-	platform_set_drvdata(pdev, NULL);
 
 	clk_disable(priv->fclk);
 	clk_disable(priv->iclk);

diff --git a/drivers/usb/host/ehci-spear.c b/drivers/usb/host/ehci-spear.c
index bd3e5cb..1cf0adb 100644
--- a/drivers/usb/host/ehci-spear.c
+++ b/drivers/usb/host/ehci-spear.c

@@ -148,10 +148,6 @@
 	struct usb_hcd *hcd = platform_get_drvdata(pdev);
 	struct spear_ehci *sehci = to_spear_ehci(hcd);
 
-	if (!hcd)
-		return 0;
-	if (in_interrupt())
-		BUG();
 	usb_remove_hcd(hcd);
 
 	if (sehci->clk)
@@ -174,7 +170,7 @@
 		.name = "spear-ehci",
 		.bus = &platform_bus_type,
 		.pm = &ehci_spear_pm_ops,
-		.of_match_table = of_match_ptr(spear_ehci_id_table),
+		.of_match_table = spear_ehci_id_table,
 	}
 };
 

diff --git a/drivers/usb/host/ehci-tegra.c b/drivers/usb/host/ehci-tegra.c
index 59d111b..6ee7ef7 100644
--- a/drivers/usb/host/ehci-tegra.c
+++ b/drivers/usb/host/ehci-tegra.c

@@ -17,59 +17,53 @@
  */
 
 #include <linux/clk.h>
+#include <linux/clk/tegra.h>
+#include <linux/dma-mapping.h>
 #include <linux/err.h>
-#include <linux/platform_device.h>
-#include <linux/platform_data/tegra_usb.h>
-#include <linux/irq.h>
-#include <linux/usb/otg.h>
 #include <linux/gpio.h>
+#include <linux/io.h>
+#include <linux/irq.h>
+#include <linux/module.h>
 #include <linux/of.h>
 #include <linux/of_gpio.h>
+#include <linux/platform_device.h>
+#include <linux/platform_data/tegra_usb.h>
 #include <linux/pm_runtime.h>
+#include <linux/slab.h>
 #include <linux/usb/ehci_def.h>
 #include <linux/usb/tegra_usb_phy.h>
-#include <linux/clk/tegra.h>
+#include <linux/usb.h>
+#include <linux/usb/hcd.h>
+#include <linux/usb/otg.h>
+
+#include "ehci.h"
 
 #define TEGRA_USB_BASE			0xC5000000
 #define TEGRA_USB2_BASE			0xC5004000
 #define TEGRA_USB3_BASE			0xC5008000
 
-/* PORTSC registers */
-#define TEGRA_USB_PORTSC1			0x184
-#define TEGRA_USB_PORTSC1_PTS(x)	(((x) & 0x3) << 30)
-#define TEGRA_USB_PORTSC1_PHCD	(1 << 23)
+#define PORT_WAKE_BITS (PORT_WKOC_E|PORT_WKDISC_E|PORT_WKCONN_E)
 
 #define TEGRA_USB_DMA_ALIGN 32
 
+#define DRIVER_DESC "Tegra EHCI driver"
+#define DRV_NAME "tegra-ehci"
+
+static struct hc_driver __read_mostly tegra_ehci_hc_driver;
+
+static int (*orig_hub_control)(struct usb_hcd *hcd,
+				u16 typeReq, u16 wValue, u16 wIndex,
+				char *buf, u16 wLength);
+
 struct tegra_ehci_hcd {
-	struct ehci_hcd *ehci;
 	struct tegra_usb_phy *phy;
 	struct clk *clk;
 	struct usb_phy *transceiver;
-	int host_resumed;
 	int port_resuming;
 	bool needs_double_reset;
 	enum tegra_usb_phy_port_speed port_speed;
 };
 
-static void tegra_ehci_power_up(struct usb_hcd *hcd)
-{
-	struct tegra_ehci_hcd *tegra = dev_get_drvdata(hcd->self.controller);
-
-	clk_prepare_enable(tegra->clk);
-	usb_phy_set_suspend(hcd->phy, 0);
-	tegra->host_resumed = 1;
-}
-
-static void tegra_ehci_power_down(struct usb_hcd *hcd)
-{
-	struct tegra_ehci_hcd *tegra = dev_get_drvdata(hcd->self.controller);
-
-	tegra->host_resumed = 0;
-	usb_phy_set_suspend(hcd->phy, 1);
-	clk_disable_unprepare(tegra->clk);
-}
-
 static int tegra_ehci_internal_port_reset(
 	struct ehci_hcd	*ehci,
 	u32 __iomem	*portsc_reg
@@ -144,8 +138,8 @@
 	u16		wLength
 )
 {
-	struct ehci_hcd	*ehci = hcd_to_ehci(hcd);
-	struct tegra_ehci_hcd *tegra = dev_get_drvdata(hcd->self.controller);
+	struct ehci_hcd *ehci = hcd_to_ehci(hcd);
+	struct tegra_ehci_hcd *tegra = (struct tegra_ehci_hcd *)ehci->priv;
 	u32 __iomem	*status_reg;
 	u32		temp;
 	unsigned long	flags;
@@ -179,7 +173,7 @@
 		 * If a transaction is in progress, there may be a delay in
 		 * suspending the port. Poll until the port is suspended.
 		 */
-		if (handshake(ehci, status_reg, PORT_SUSPEND,
+		if (ehci_handshake(ehci, status_reg, PORT_SUSPEND,
 						PORT_SUSPEND, 5000))
 			pr_err("%s: timeout waiting for SUSPEND\n", __func__);
 
@@ -227,9 +221,9 @@
 		spin_lock_irqsave(&ehci->lock, flags);
 
 		/* Poll until the controller clears RESUME and SUSPEND */
-		if (handshake(ehci, status_reg, PORT_RESUME, 0, 2000))
+		if (ehci_handshake(ehci, status_reg, PORT_RESUME, 0, 2000))
 			pr_err("%s: timeout waiting for RESUME\n", __func__);
-		if (handshake(ehci, status_reg, PORT_SUSPEND, 0, 2000))
+		if (ehci_handshake(ehci, status_reg, PORT_SUSPEND, 0, 2000))
 			pr_err("%s: timeout waiting for SUSPEND\n", __func__);
 
 		ehci->reset_done[wIndex-1] = 0;
@@ -242,58 +236,13 @@
 	spin_unlock_irqrestore(&ehci->lock, flags);
 
 	/* Handle the hub control events here */
-	return ehci_hub_control(hcd, typeReq, wValue, wIndex, buf, wLength);
+	return orig_hub_control(hcd, typeReq, wValue, wIndex, buf, wLength);
+
 done:
 	spin_unlock_irqrestore(&ehci->lock, flags);
 	return retval;
 }
 
-static void tegra_ehci_restart(struct usb_hcd *hcd)
-{
-	struct ehci_hcd *ehci = hcd_to_ehci(hcd);
-
-	ehci_reset(ehci);
-
-	/* setup the frame list and Async q heads */
-	ehci_writel(ehci, ehci->periodic_dma, &ehci->regs->frame_list);
-	ehci_writel(ehci, (u32)ehci->async->qh_dma, &ehci->regs->async_next);
-	/* setup the command register and set the controller in RUN mode */
-	ehci->command &= ~(CMD_LRESET|CMD_IAAD|CMD_PSE|CMD_ASE|CMD_RESET);
-	ehci->command |= CMD_RUN;
-	ehci_writel(ehci, ehci->command, &ehci->regs->command);
-
-	down_write(&ehci_cf_port_reset_rwsem);
-	ehci_writel(ehci, FLAG_CF, &ehci->regs->configured_flag);
-	/* flush posted writes */
-	ehci_readl(ehci, &ehci->regs->command);
-	up_write(&ehci_cf_port_reset_rwsem);
-}
-
-static void tegra_ehci_shutdown(struct usb_hcd *hcd)
-{
-	struct tegra_ehci_hcd *tegra = dev_get_drvdata(hcd->self.controller);
-
-	/* ehci_shutdown touches the USB controller registers, make sure
-	 * controller has clocks to it */
-	if (!tegra->host_resumed)
-		tegra_ehci_power_up(hcd);
-
-	ehci_shutdown(hcd);
-}
-
-static int tegra_ehci_setup(struct usb_hcd *hcd)
-{
-	struct ehci_hcd *ehci = hcd_to_ehci(hcd);
-
-	/* EHCI registers start at offset 0x100 */
-	ehci->caps = hcd->regs + 0x100;
-
-	/* switch to host mode */
-	hcd->has_tt = 1;
-
-	return ehci_setup(hcd);
-}
-
 struct dma_aligned_buffer {
 	void *kmalloc_ptr;
 	void *old_xfer_buffer;
@@ -373,38 +322,6 @@
 	free_dma_aligned_buffer(urb);
 }
 
-static const struct hc_driver tegra_ehci_hc_driver = {
-	.description		= hcd_name,
-	.product_desc		= "Tegra EHCI Host Controller",
-	.hcd_priv_size		= sizeof(struct ehci_hcd),
-	.flags			= HCD_USB2 | HCD_MEMORY,
-
-	/* standard ehci functions */
-	.irq			= ehci_irq,
-	.start			= ehci_run,
-	.stop			= ehci_stop,
-	.urb_enqueue		= ehci_urb_enqueue,
-	.urb_dequeue		= ehci_urb_dequeue,
-	.endpoint_disable	= ehci_endpoint_disable,
-	.endpoint_reset		= ehci_endpoint_reset,
-	.get_frame_number	= ehci_get_frame,
-	.hub_status_data	= ehci_hub_status_data,
-	.clear_tt_buffer_complete = ehci_clear_tt_buffer_complete,
-	.relinquish_port	= ehci_relinquish_port,
-	.port_handed_over	= ehci_port_handed_over,
-
-	/* modified ehci functions for tegra */
-	.reset			= tegra_ehci_setup,
-	.shutdown		= tegra_ehci_shutdown,
-	.map_urb_for_dma	= tegra_ehci_map_urb_for_dma,
-	.unmap_urb_for_dma	= tegra_ehci_unmap_urb_for_dma,
-	.hub_control		= tegra_ehci_hub_control,
-#ifdef CONFIG_PM
-	.bus_suspend		= ehci_bus_suspend,
-	.bus_resume		= ehci_bus_resume,
-#endif
-};
-
 static int setup_vbus_gpio(struct platform_device *pdev,
 			   struct tegra_ehci_platform_data *pdata)
 {
@@ -432,220 +349,16 @@
 	return err;
 }
 
-#ifdef CONFIG_PM
-
-static int controller_suspend(struct device *dev)
-{
-	struct tegra_ehci_hcd *tegra =
-			platform_get_drvdata(to_platform_device(dev));
-	struct ehci_hcd	*ehci = tegra->ehci;
-	struct usb_hcd *hcd = ehci_to_hcd(ehci);
-	struct ehci_regs __iomem *hw = ehci->regs;
-	unsigned long flags;
-
-	if (time_before(jiffies, ehci->next_statechange))
-		msleep(10);
-
-	ehci_halt(ehci);
-
-	spin_lock_irqsave(&ehci->lock, flags);
-	tegra->port_speed = (readl(&hw->port_status[0]) >> 26) & 0x3;
-	clear_bit(HCD_FLAG_HW_ACCESSIBLE, &hcd->flags);
-	spin_unlock_irqrestore(&ehci->lock, flags);
-
-	tegra_ehci_power_down(hcd);
-	return 0;
-}
-
-static int controller_resume(struct device *dev)
-{
-	struct tegra_ehci_hcd *tegra =
-			platform_get_drvdata(to_platform_device(dev));
-	struct ehci_hcd	*ehci = tegra->ehci;
-	struct usb_hcd *hcd = ehci_to_hcd(ehci);
-	struct ehci_regs __iomem *hw = ehci->regs;
-	unsigned long val;
-
-	set_bit(HCD_FLAG_HW_ACCESSIBLE, &hcd->flags);
-	tegra_ehci_power_up(hcd);
-
-	if (tegra->port_speed > TEGRA_USB_PHY_PORT_SPEED_HIGH) {
-		/* Wait for the phy to detect new devices
-		 * before we restart the controller */
-		msleep(10);
-		goto restart;
-	}
-
-	/* Force the phy to keep data lines in suspend state */
-	tegra_ehci_phy_restore_start(hcd->phy, tegra->port_speed);
-
-	/* Enable host mode */
-	tdi_reset(ehci);
-
-	/* Enable Port Power */
-	val = readl(&hw->port_status[0]);
-	val |= PORT_POWER;
-	writel(val, &hw->port_status[0]);
-	udelay(10);
-
-	/* Check if the phy resume from LP0. When the phy resume from LP0
-	 * USB register will be reset. */
-	if (!readl(&hw->async_next)) {
-		/* Program the field PTC based on the saved speed mode */
-		val = readl(&hw->port_status[0]);
-		val &= ~PORT_TEST(~0);
-		if (tegra->port_speed == TEGRA_USB_PHY_PORT_SPEED_HIGH)
-			val |= PORT_TEST_FORCE;
-		else if (tegra->port_speed == TEGRA_USB_PHY_PORT_SPEED_FULL)
-			val |= PORT_TEST(6);
-		else if (tegra->port_speed == TEGRA_USB_PHY_PORT_SPEED_LOW)
-			val |= PORT_TEST(7);
-		writel(val, &hw->port_status[0]);
-		udelay(10);
-
-		/* Disable test mode by setting PTC field to NORMAL_OP */
-		val = readl(&hw->port_status[0]);
-		val &= ~PORT_TEST(~0);
-		writel(val, &hw->port_status[0]);
-		udelay(10);
-	}
-
-	/* Poll until CCS is enabled */
-	if (handshake(ehci, &hw->port_status[0], PORT_CONNECT,
-						 PORT_CONNECT, 2000)) {
-		pr_err("%s: timeout waiting for PORT_CONNECT\n", __func__);
-		goto restart;
-	}
-
-	/* Poll until PE is enabled */
-	if (handshake(ehci, &hw->port_status[0], PORT_PE,
-						 PORT_PE, 2000)) {
-		pr_err("%s: timeout waiting for USB_PORTSC1_PE\n", __func__);
-		goto restart;
-	}
-
-	/* Clear the PCI status, to avoid an interrupt taken upon resume */
-	val = readl(&hw->status);
-	val |= STS_PCD;
-	writel(val, &hw->status);
-
-	/* Put controller in suspend mode by writing 1 to SUSP bit of PORTSC */
-	val = readl(&hw->port_status[0]);
-	if ((val & PORT_POWER) && (val & PORT_PE)) {
-		val |= PORT_SUSPEND;
-		writel(val, &hw->port_status[0]);
-
-		/* Wait until port suspend completes */
-		if (handshake(ehci, &hw->port_status[0], PORT_SUSPEND,
-							 PORT_SUSPEND, 1000)) {
-			pr_err("%s: timeout waiting for PORT_SUSPEND\n",
-								__func__);
-			goto restart;
-		}
-	}
-
-	tegra_ehci_phy_restore_end(hcd->phy);
-	goto done;
-
- restart:
-	if (tegra->port_speed <= TEGRA_USB_PHY_PORT_SPEED_HIGH)
-		tegra_ehci_phy_restore_end(hcd->phy);
-
-	tegra_ehci_restart(hcd);
-
- done:
-	tegra_usb_phy_preresume(hcd->phy);
-	tegra->port_resuming = 1;
-	return 0;
-}
-
-static int tegra_ehci_suspend(struct device *dev)
-{
-	struct tegra_ehci_hcd *tegra =
-			platform_get_drvdata(to_platform_device(dev));
-	struct usb_hcd *hcd = ehci_to_hcd(tegra->ehci);
-	int rc = 0;
-
-	/*
-	 * When system sleep is supported and USB controller wakeup is
-	 * implemented: If the controller is runtime-suspended and the
-	 * wakeup setting needs to be changed, call pm_runtime_resume().
-	 */
-	if (HCD_HW_ACCESSIBLE(hcd))
-		rc = controller_suspend(dev);
-	return rc;
-}
-
-static int tegra_ehci_resume(struct device *dev)
-{
-	int rc;
-
-	rc = controller_resume(dev);
-	if (rc == 0) {
-		pm_runtime_disable(dev);
-		pm_runtime_set_active(dev);
-		pm_runtime_enable(dev);
-	}
-	return rc;
-}
-
-static int tegra_ehci_runtime_suspend(struct device *dev)
-{
-	return controller_suspend(dev);
-}
-
-static int tegra_ehci_runtime_resume(struct device *dev)
-{
-	return controller_resume(dev);
-}
-
-static const struct dev_pm_ops tegra_ehci_pm_ops = {
-	.suspend	= tegra_ehci_suspend,
-	.resume		= tegra_ehci_resume,
-	.runtime_suspend = tegra_ehci_runtime_suspend,
-	.runtime_resume	= tegra_ehci_runtime_resume,
-};
-
-#endif
-
-/* Bits of PORTSC1, which will get cleared by writing 1 into them */
-#define TEGRA_PORTSC1_RWC_BITS (PORT_CSC | PORT_PEC | PORT_OCC)
-
-static void tegra_ehci_set_pts(struct usb_phy *x, u8 pts_val)
-{
-	unsigned long val;
-	struct usb_hcd *hcd = bus_to_hcd(x->otg->host);
-	void __iomem *base = hcd->regs;
-
-	val = readl(base + TEGRA_USB_PORTSC1) & ~TEGRA_PORTSC1_RWC_BITS;
-	val &= ~TEGRA_USB_PORTSC1_PTS(3);
-	val |= TEGRA_USB_PORTSC1_PTS(pts_val & 3);
-	writel(val, base + TEGRA_USB_PORTSC1);
-}
-
-static void tegra_ehci_set_phcd(struct usb_phy *x, bool enable)
-{
-	unsigned long val;
-	struct usb_hcd *hcd = bus_to_hcd(x->otg->host);
-	void __iomem *base = hcd->regs;
-
-	val = readl(base + TEGRA_USB_PORTSC1) & ~TEGRA_PORTSC1_RWC_BITS;
-	if (enable)
-		val |= TEGRA_USB_PORTSC1_PHCD;
-	else
-		val &= ~TEGRA_USB_PORTSC1_PHCD;
-	writel(val, base + TEGRA_USB_PORTSC1);
-}
-
 static int tegra_ehci_probe(struct platform_device *pdev)
 {
 	struct resource *res;
 	struct usb_hcd *hcd;
+	struct ehci_hcd *ehci;
 	struct tegra_ehci_hcd *tegra;
 	struct tegra_ehci_platform_data *pdata;
 	int err = 0;
 	int irq;
-	int instance = pdev->id;
+	struct device_node *np_phy;
 	struct usb_phy *u_phy;
 
 	pdata = pdev->dev.platform_data;
@@ -665,35 +378,47 @@
 
 	setup_vbus_gpio(pdev, pdata);
 
-	tegra = devm_kzalloc(&pdev->dev, sizeof(struct tegra_ehci_hcd),
-			     GFP_KERNEL);
-	if (!tegra)
-		return -ENOMEM;
-
 	hcd = usb_create_hcd(&tegra_ehci_hc_driver, &pdev->dev,
 					dev_name(&pdev->dev));
 	if (!hcd) {
 		dev_err(&pdev->dev, "Unable to create HCD\n");
-		return -ENOMEM;
+		err = -ENOMEM;
+		goto cleanup_vbus_gpio;
 	}
+	platform_set_drvdata(pdev, hcd);
+	ehci = hcd_to_ehci(hcd);
+	tegra = (struct tegra_ehci_hcd *)ehci->priv;
 
-	platform_set_drvdata(pdev, tegra);
+	hcd->has_tt = 1;
 
 	tegra->clk = devm_clk_get(&pdev->dev, NULL);
 	if (IS_ERR(tegra->clk)) {
 		dev_err(&pdev->dev, "Can't get ehci clock\n");
 		err = PTR_ERR(tegra->clk);
-		goto fail_clk;
+		goto cleanup_hcd_create;
 	}
 
 	err = clk_prepare_enable(tegra->clk);
 	if (err)
-		goto fail_clk;
+		goto cleanup_clk_get;
 
 	tegra_periph_reset_assert(tegra->clk);
 	udelay(1);
 	tegra_periph_reset_deassert(tegra->clk);
 
+	np_phy = of_parse_phandle(pdev->dev.of_node, "nvidia,phy", 0);
+	if (!np_phy) {
+		err = -ENODEV;
+		goto cleanup_clk_en;
+	}
+
+	u_phy = tegra_usb_get_phy(np_phy);
+	if (IS_ERR(u_phy)) {
+		err = PTR_ERR(u_phy);
+		goto cleanup_clk_en;
+	}
+	hcd->phy = u_phy;
+
 	tegra->needs_double_reset = of_property_read_bool(pdev->dev.of_node,
 		"nvidia,needs-double-reset");
 
@@ -701,7 +426,7 @@
 	if (!res) {
 		dev_err(&pdev->dev, "Failed to get I/O memory\n");
 		err = -ENXIO;
-		goto fail_io;
+		goto cleanup_clk_en;
 	}
 	hcd->rsrc_start = res->start;
 	hcd->rsrc_len = resource_size(res);
@@ -709,68 +434,36 @@
 	if (!hcd->regs) {
 		dev_err(&pdev->dev, "Failed to remap I/O memory\n");
 		err = -ENOMEM;
-		goto fail_io;
+		goto cleanup_clk_en;
 	}
+	ehci->caps = hcd->regs + 0x100;
 
-	/* This is pretty ugly and needs to be fixed when we do only
-	 * device-tree probing. Old code relies on the platform_device
-	 * numbering that we lack for device-tree-instantiated devices.
-	 */
-	if (instance < 0) {
-		switch (res->start) {
-		case TEGRA_USB_BASE:
-			instance = 0;
-			break;
-		case TEGRA_USB2_BASE:
-			instance = 1;
-			break;
-		case TEGRA_USB3_BASE:
-			instance = 2;
-			break;
-		default:
-			err = -ENODEV;
-			dev_err(&pdev->dev, "unknown usb instance\n");
-			goto fail_io;
-		}
+	err = usb_phy_init(hcd->phy);
+	if (err) {
+		dev_err(&pdev->dev, "Failed to initialize phy\n");
+		goto cleanup_clk_en;
 	}
 
-	tegra->phy = tegra_usb_phy_open(&pdev->dev, instance, hcd->regs,
-					pdata->phy_config,
-					TEGRA_USB_PHY_MODE_HOST,
-					tegra_ehci_set_pts,
-					tegra_ehci_set_phcd);
-	if (IS_ERR(tegra->phy)) {
-		dev_err(&pdev->dev, "Failed to open USB phy\n");
-		err = -ENXIO;
-		goto fail_io;
-	}
-
-	hcd->phy = u_phy = &tegra->phy->u_phy;
-	usb_phy_init(hcd->phy);
-
 	u_phy->otg = devm_kzalloc(&pdev->dev, sizeof(struct usb_otg),
 			     GFP_KERNEL);
 	if (!u_phy->otg) {
 		dev_err(&pdev->dev, "Failed to alloc memory for otg\n");
 		err = -ENOMEM;
-		goto fail_io;
+		goto cleanup_phy;
 	}
 	u_phy->otg->host = hcd_to_bus(hcd);
 
 	err = usb_phy_set_suspend(hcd->phy, 0);
 	if (err) {
 		dev_err(&pdev->dev, "Failed to power on the phy\n");
-		goto fail_phy;
+		goto cleanup_phy;
 	}
 
-	tegra->host_resumed = 1;
-	tegra->ehci = hcd_to_ehci(hcd);
-
 	irq = platform_get_irq(pdev, 0);
 	if (!irq) {
 		dev_err(&pdev->dev, "Failed to get IRQ\n");
 		err = -ENODEV;
-		goto fail_phy;
+		goto cleanup_phy;
 	}
 
 	if (pdata->operating_mode == TEGRA_USB_OTG) {
@@ -785,39 +478,32 @@
 	err = usb_add_hcd(hcd, irq, IRQF_SHARED);
 	if (err) {
 		dev_err(&pdev->dev, "Failed to add USB HCD\n");
-		goto fail;
+		goto cleanup_transceiver;
 	}
 
-	pm_runtime_set_active(&pdev->dev);
-	pm_runtime_get_noresume(&pdev->dev);
-
-	/* Don't skip the pm_runtime_forbid call if wakeup isn't working */
-	/* if (!pdata->power_down_on_bus_suspend) */
-		pm_runtime_forbid(&pdev->dev);
-	pm_runtime_enable(&pdev->dev);
-	pm_runtime_put_sync(&pdev->dev);
 	return err;
 
-fail:
+cleanup_transceiver:
 	if (!IS_ERR(tegra->transceiver))
 		otg_set_host(tegra->transceiver->otg, NULL);
-fail_phy:
+cleanup_phy:
 	usb_phy_shutdown(hcd->phy);
-fail_io:
+cleanup_clk_en:
 	clk_disable_unprepare(tegra->clk);
-fail_clk:
+cleanup_clk_get:
+	/* clk is devm-managed (devm_clk_get): no explicit clk_put() */
+cleanup_hcd_create:
 	usb_put_hcd(hcd);
+cleanup_vbus_gpio:
+	/* FIXME: Undo setup_vbus_gpio() here */
 	return err;
 }
 
 static int tegra_ehci_remove(struct platform_device *pdev)
 {
-	struct tegra_ehci_hcd *tegra = platform_get_drvdata(pdev);
-	struct usb_hcd *hcd = ehci_to_hcd(tegra->ehci);
-
-	pm_runtime_get_sync(&pdev->dev);
-	pm_runtime_disable(&pdev->dev);
-	pm_runtime_put_noidle(&pdev->dev);
+	struct usb_hcd *hcd = platform_get_drvdata(pdev);
+	struct tegra_ehci_hcd *tegra =
+		(struct tegra_ehci_hcd *)hcd_to_ehci(hcd)->priv;
 
 	if (!IS_ERR(tegra->transceiver))
 		otg_set_host(tegra->transceiver->otg, NULL);
@@ -833,8 +519,7 @@
 
 static void tegra_ehci_hcd_shutdown(struct platform_device *pdev)
 {
-	struct tegra_ehci_hcd *tegra = platform_get_drvdata(pdev);
-	struct usb_hcd *hcd = ehci_to_hcd(tegra->ehci);
+	struct usb_hcd *hcd = platform_get_drvdata(pdev);
 
 	if (hcd->driver->shutdown)
 		hcd->driver->shutdown(hcd);
@@ -850,10 +535,50 @@
 	.remove		= tegra_ehci_remove,
 	.shutdown	= tegra_ehci_hcd_shutdown,
 	.driver		= {
-		.name	= "tegra-ehci",
+		.name	= DRV_NAME,
 		.of_match_table = tegra_ehci_of_match,
-#ifdef CONFIG_PM
-		.pm	= &tegra_ehci_pm_ops,
-#endif
 	}
 };
+
+static const struct ehci_driver_overrides tegra_overrides __initconst = {
+	.extra_priv_size	= sizeof(struct tegra_ehci_hcd),
+};
+
+static int __init ehci_tegra_init(void)
+{
+	if (usb_disabled())
+		return -ENODEV;
+
+	pr_info(DRV_NAME ": " DRIVER_DESC "\n");
+
+	ehci_init_driver(&tegra_ehci_hc_driver, &tegra_overrides);
+
+	/*
+	 * The Tegra HW has some unusual quirks, which require Tegra-specific
+	 * workarounds. We override certain hc_driver functions here to
+	 * achieve that. We explicitly do not enhance ehci_driver_overrides to
+	 * allow this more easily, since this is an unusual case, and we don't
+	 * want to encourage others to override these functions by making it
+	 * too easy.
+	 */
+
+	orig_hub_control = tegra_ehci_hc_driver.hub_control;
+
+	tegra_ehci_hc_driver.map_urb_for_dma = tegra_ehci_map_urb_for_dma;
+	tegra_ehci_hc_driver.unmap_urb_for_dma = tegra_ehci_unmap_urb_for_dma;
+	tegra_ehci_hc_driver.hub_control = tegra_ehci_hub_control;
+
+	return platform_driver_register(&tegra_ehci_driver);
+}
+module_init(ehci_tegra_init);
+
+static void __exit ehci_tegra_cleanup(void)
+{
+	platform_driver_unregister(&tegra_ehci_driver);
+}
+module_exit(ehci_tegra_cleanup);
+
+MODULE_DESCRIPTION(DRIVER_DESC);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:" DRV_NAME);
+MODULE_DEVICE_TABLE(of, tegra_ehci_of_match);

diff --git a/drivers/usb/host/ehci-tilegx.c b/drivers/usb/host/ehci-tilegx.c
index b083a35..d72b292 100644
--- a/drivers/usb/host/ehci-tilegx.c
+++ b/drivers/usb/host/ehci-tilegx.c

@@ -193,7 +193,6 @@
 	tilegx_stop_ehc();
 	gxio_usb_host_destroy(&pdata->usb_ctx);
 	destroy_irq(pdata->irq);
-	platform_set_drvdata(pdev, NULL);
 
 	return 0;
 }

diff --git a/drivers/usb/host/ehci-xilinx-of.c b/drivers/usb/host/ehci-xilinx-of.c
index d845e3b..35c7f90 100644
--- a/drivers/usb/host/ehci-xilinx-of.c
+++ b/drivers/usb/host/ehci-xilinx-of.c

@@ -209,8 +209,7 @@
  */
 static int ehci_hcd_xilinx_of_remove(struct platform_device *op)
 {
-	struct usb_hcd *hcd = dev_get_drvdata(&op->dev);
-	dev_set_drvdata(&op->dev, NULL);
+	struct usb_hcd *hcd = platform_get_drvdata(op);
 
 	dev_dbg(&op->dev, "stopping XILINX-OF USB Controller\n");
 
@@ -229,7 +228,7 @@
  */
 static void ehci_hcd_xilinx_of_shutdown(struct platform_device *op)
 {
-	struct usb_hcd *hcd = dev_get_drvdata(&op->dev);
+	struct usb_hcd *hcd = platform_get_drvdata(op);
 
 	if (hcd->driver->shutdown)
 		hcd->driver->shutdown(hcd);

diff --git a/drivers/usb/host/ehci.h b/drivers/usb/host/ehci.h
index 7c978b2..64f9a08 100644
--- a/drivers/usb/host/ehci.h
+++ b/drivers/usb/host/ehci.h

@@ -800,6 +800,8 @@
 extern void	ehci_init_driver(struct hc_driver *drv,
 				const struct ehci_driver_overrides *over);
 extern int	ehci_setup(struct usb_hcd *hcd);
+extern int	ehci_handshake(struct ehci_hcd *ehci, void __iomem *ptr,
+				u32 mask, u32 done, int usec);
 
 #ifdef CONFIG_PM
 extern int	ehci_suspend(struct usb_hcd *hcd, bool do_wakeup);

diff --git a/drivers/usb/host/fhci-sched.c b/drivers/usb/host/fhci-sched.c
index 8f18538..95ca598 100644
--- a/drivers/usb/host/fhci-sched.c
+++ b/drivers/usb/host/fhci-sched.c

@@ -739,9 +739,13 @@
 	}
 
 	/* for ISO transfer calculate start frame index */
-	if (ed->mode == FHCI_TF_ISO && urb->transfer_flags & URB_ISO_ASAP)
-		urb->start_frame = ed->td_head ? ed->last_iso + 1 :
+	if (ed->mode == FHCI_TF_ISO) {
+		/* Ignore the possibility of underruns */
+		urb->start_frame = ed->td_head ? ed->next_iso :
 						 get_frame_num(fhci);
+		ed->next_iso = (urb->start_frame + urb->interval *
+				urb->number_of_packets) & 0x07ff;
+	}
 
 	/*
 	 * OHCI handles the DATA toggle itself,we just use the USB

diff --git a/drivers/usb/host/fhci.h b/drivers/usb/host/fhci.h
index 7cc1c32..154e6a0 100644
--- a/drivers/usb/host/fhci.h
+++ b/drivers/usb/host/fhci.h

@@ -338,7 +338,7 @@
 
 	/* read only parameters, should be cleared upon initialization */
 	u8 toggle_carry;	/* toggle carry from the last TD submitted */
-	u32 last_iso;		/* time stamp of last queued ISO transfer */
+	u16 next_iso;		/* time stamp of next queued ISO transfer */
 	struct td *td_head;	/* a pointer to the current TD handled */
 };
 

diff --git a/drivers/usb/host/fusbh200-hcd.c b/drivers/usb/host/fusbh200-hcd.c
new file mode 100644
index 0000000..299253c
--- /dev/null
+++ b/drivers/usb/host/fusbh200-hcd.c

@@ -0,0 +1,5972 @@
+/*
+ * Faraday FUSBH200 EHCI-like driver
+ *
+ * Copyright (c) 2013 Faraday Technology Corporation
+ *
+ * Author: Yuan-Hsin Chen <yhchen@faraday-tech.com>
+ * 	   Feng-Hsin Chiang <john453@faraday-tech.com>
+ * 	   Po-Yu Chuang <ratbert.chuang@gmail.com>
+ *
+ * Most of code borrowed from the Linux-3.7 EHCI driver
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/dmapool.h>
+#include <linux/kernel.h>
+#include <linux/delay.h>
+#include <linux/ioport.h>
+#include <linux/sched.h>
+#include <linux/vmalloc.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/hrtimer.h>
+#include <linux/list.h>
+#include <linux/interrupt.h>
+#include <linux/usb.h>
+#include <linux/usb/hcd.h>
+#include <linux/moduleparam.h>
+#include <linux/dma-mapping.h>
+#include <linux/debugfs.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/platform_device.h>
+
+#include <asm/byteorder.h>
+#include <asm/io.h>
+#include <asm/irq.h>
+#include <asm/unaligned.h>
+
+/*-------------------------------------------------------------------------*/
+#define DRIVER_AUTHOR "Yuan-Hsin Chen"
+#define DRIVER_DESC "FUSBH200 Host Controller (EHCI) Driver"
+
+static const char	hcd_name [] = "fusbh200_hcd";
+
+#undef VERBOSE_DEBUG
+#undef FUSBH200_URB_TRACE
+
+#ifdef DEBUG
+#define FUSBH200_STATS
+#endif
+
+/* magic numbers that can affect system performance */
+#define	FUSBH200_TUNE_CERR		3	/* 0-3 qtd retries; 0 == don't stop */
+#define	FUSBH200_TUNE_RL_HS		4	/* nak throttle; see 4.9 */
+#define	FUSBH200_TUNE_RL_TT		0
+#define	FUSBH200_TUNE_MULT_HS	1	/* 1-3 transactions/uframe; 4.10.3 */
+#define	FUSBH200_TUNE_MULT_TT	1
+/*
+ * Some drivers think it's safe to schedule isochronous transfers more than
+ * 256 ms into the future (partly as a result of an old bug in the scheduling
+ * code).  In an attempt to avoid trouble, we will use a minimum scheduling
+ * length of 512 frames instead of 256.
+ */
+#define	FUSBH200_TUNE_FLS		1	/* (medium) 512-frame schedule */
+
+/* Initial IRQ latency:  faster than hw default */
+static int log2_irq_thresh = 0;		// 0 to 6
+module_param (log2_irq_thresh, int, S_IRUGO);
+MODULE_PARM_DESC (log2_irq_thresh, "log2 IRQ latency, 1-64 microframes");
+
+/* initial park setting:  slower than hw default */
+static unsigned park = 0;
+module_param (park, uint, S_IRUGO);
+MODULE_PARM_DESC (park, "park setting; 1-3 back-to-back async packets");
+
+/* for link power management(LPM) feature; unsigned, so declared as uint */
+static unsigned int hird;
+module_param(hird, uint, S_IRUGO);
+MODULE_PARM_DESC(hird, "host initiated resume duration, +1 for each 75us");
+
+#define	INTR_MASK (STS_IAA | STS_FATAL | STS_PCD | STS_ERR | STS_INT)
+
+#include "fusbh200.h"
+
+/*-------------------------------------------------------------------------*/
+
+#define fusbh200_dbg(fusbh200, fmt, args...) \
+	dev_dbg (fusbh200_to_hcd(fusbh200)->self.controller , fmt , ## args )
+#define fusbh200_err(fusbh200, fmt, args...) \
+	dev_err (fusbh200_to_hcd(fusbh200)->self.controller , fmt , ## args )
+#define fusbh200_info(fusbh200, fmt, args...) \
+	dev_info (fusbh200_to_hcd(fusbh200)->self.controller , fmt , ## args )
+#define fusbh200_warn(fusbh200, fmt, args...) \
+	dev_warn (fusbh200_to_hcd(fusbh200)->self.controller , fmt , ## args )
+
+#ifdef VERBOSE_DEBUG
+#	define fusbh200_vdbg fusbh200_dbg
+#else
+	static inline void fusbh200_vdbg(struct fusbh200_hcd *fusbh200, ...) {}
+#endif
+
+#ifdef	DEBUG
+
+/* check the values in the HCSPARAMS register
+ * (host controller _Structural_ parameters)
+ * see EHCI spec, Table 2-4 for each value
+ */
+static void dbg_hcs_params (struct fusbh200_hcd *fusbh200, char *label)
+{
+	u32	params = fusbh200_readl(fusbh200, &fusbh200->caps->hcs_params);
+
+	fusbh200_dbg (fusbh200,
+		"%s hcs_params 0x%x ports=%d\n",
+		label, params,
+		HCS_N_PORTS (params)
+		);
+}
+#else
+
+static inline void dbg_hcs_params (struct fusbh200_hcd *fusbh200, char *label) {}
+
+#endif
+
+#ifdef	DEBUG
+
+/* check the values in the HCCPARAMS register
+ * (host controller _Capability_ parameters)
+ * see EHCI Spec, Table 2-5 for each value
+ * */
+static void dbg_hcc_params (struct fusbh200_hcd *fusbh200, char *label)
+{
+	u32	params = fusbh200_readl(fusbh200, &fusbh200->caps->hcc_params);
+
+	fusbh200_dbg (fusbh200,
+		"%s hcc_params %04x uframes %s%s\n",
+		label,
+		params,
+		HCC_PGM_FRAMELISTLEN(params) ? "256/512/1024" : "1024",
+		HCC_CANPARK(params) ? " park" : "");
+}
+#else
+
+static inline void dbg_hcc_params (struct fusbh200_hcd *fusbh200, char *label) {}
+
+#endif
+
+#ifdef	DEBUG
+
+static void __maybe_unused
+dbg_qtd (const char *label, struct fusbh200_hcd *fusbh200, struct fusbh200_qtd *qtd)
+{
+	fusbh200_dbg(fusbh200, "%s td %p n%08x %08x t%08x p0=%08x\n", label, qtd,
+		hc32_to_cpup(fusbh200, &qtd->hw_next),
+		hc32_to_cpup(fusbh200, &qtd->hw_alt_next),
+		hc32_to_cpup(fusbh200, &qtd->hw_token),
+		hc32_to_cpup(fusbh200, &qtd->hw_buf [0]));
+	if (qtd->hw_buf [1])
+		fusbh200_dbg(fusbh200, "  p1=%08x p2=%08x p3=%08x p4=%08x\n",
+			hc32_to_cpup(fusbh200, &qtd->hw_buf[1]),
+			hc32_to_cpup(fusbh200, &qtd->hw_buf[2]),
+			hc32_to_cpup(fusbh200, &qtd->hw_buf[3]),
+			hc32_to_cpup(fusbh200, &qtd->hw_buf[4]));
+}
+
+static void __maybe_unused
+dbg_qh (const char *label, struct fusbh200_hcd *fusbh200, struct fusbh200_qh *qh)
+{
+	struct fusbh200_qh_hw *hw = qh->hw;
+
+	fusbh200_dbg (fusbh200, "%s qh %p n%08x info %x %x qtd %x\n", label,
+		qh, hw->hw_next, hw->hw_info1, hw->hw_info2, hw->hw_current);
+	dbg_qtd("overlay", fusbh200, (struct fusbh200_qtd *) &hw->hw_qtd_next);
+}
+
+static void __maybe_unused
+dbg_itd (const char *label, struct fusbh200_hcd *fusbh200, struct fusbh200_itd *itd)
+{
+	fusbh200_dbg (fusbh200, "%s [%d] itd %p, next %08x, urb %p\n",
+		label, itd->frame, itd, hc32_to_cpu(fusbh200, itd->hw_next),
+		itd->urb);
+	fusbh200_dbg (fusbh200,
+		"  trans: %08x %08x %08x %08x %08x %08x %08x %08x\n",
+		hc32_to_cpu(fusbh200, itd->hw_transaction[0]),
+		hc32_to_cpu(fusbh200, itd->hw_transaction[1]),
+		hc32_to_cpu(fusbh200, itd->hw_transaction[2]),
+		hc32_to_cpu(fusbh200, itd->hw_transaction[3]),
+		hc32_to_cpu(fusbh200, itd->hw_transaction[4]),
+		hc32_to_cpu(fusbh200, itd->hw_transaction[5]),
+		hc32_to_cpu(fusbh200, itd->hw_transaction[6]),
+		hc32_to_cpu(fusbh200, itd->hw_transaction[7]));
+	fusbh200_dbg (fusbh200,
+		"  buf:   %08x %08x %08x %08x %08x %08x %08x\n",
+		hc32_to_cpu(fusbh200, itd->hw_bufp[0]),
+		hc32_to_cpu(fusbh200, itd->hw_bufp[1]),
+		hc32_to_cpu(fusbh200, itd->hw_bufp[2]),
+		hc32_to_cpu(fusbh200, itd->hw_bufp[3]),
+		hc32_to_cpu(fusbh200, itd->hw_bufp[4]),
+		hc32_to_cpu(fusbh200, itd->hw_bufp[5]),
+		hc32_to_cpu(fusbh200, itd->hw_bufp[6]));
+	fusbh200_dbg (fusbh200, "  index: %d %d %d %d %d %d %d %d\n",
+		itd->index[0], itd->index[1], itd->index[2],
+		itd->index[3], itd->index[4], itd->index[5],
+		itd->index[6], itd->index[7]);
+}
+
+static int __maybe_unused
+dbg_status_buf (char *buf, unsigned len, const char *label, u32 status)
+{
+	return scnprintf (buf, len,
+		"%s%sstatus %04x%s%s%s%s%s%s%s%s%s%s",
+		label, label [0] ? " " : "", status,
+		(status & STS_ASS) ? " Async" : "",
+		(status & STS_PSS) ? " Periodic" : "",
+		(status & STS_RECL) ? " Recl" : "",
+		(status & STS_HALT) ? " Halt" : "",
+		(status & STS_IAA) ? " IAA" : "",
+		(status & STS_FATAL) ? " FATAL" : "",
+		(status & STS_FLR) ? " FLR" : "",
+		(status & STS_PCD) ? " PCD" : "",
+		(status & STS_ERR) ? " ERR" : "",
+		(status & STS_INT) ? " INT" : ""
+		);
+}
+
+static int __maybe_unused
+dbg_intr_buf (char *buf, unsigned len, const char *label, u32 enable)
+{
+	return scnprintf (buf, len,
+		"%s%sintrenable %02x%s%s%s%s%s%s",
+		label, label [0] ? " " : "", enable,
+		(enable & STS_IAA) ? " IAA" : "",
+		(enable & STS_FATAL) ? " FATAL" : "",
+		(enable & STS_FLR) ? " FLR" : "",
+		(enable & STS_PCD) ? " PCD" : "",
+		(enable & STS_ERR) ? " ERR" : "",
+		(enable & STS_INT) ? " INT" : ""
+		);
+}
+
+static const char *const fls_strings [] =
+    { "1024", "512", "256", "??" };
+
+static int
+dbg_command_buf (char *buf, unsigned len, const char *label, u32 command)
+{
+	return scnprintf (buf, len,
+		"%s%scommand %07x %s=%d ithresh=%d%s%s%s "
+		"period=%s%s %s",
+		label, label [0] ? " " : "", command,
+		(command & CMD_PARK) ? " park" : "(park)",
+		CMD_PARK_CNT (command),
+		(command >> 16) & 0x3f,
+		(command & CMD_IAAD) ? " IAAD" : "",
+		(command & CMD_ASE) ? " Async" : "",
+		(command & CMD_PSE) ? " Periodic" : "",
+		fls_strings [(command >> 2) & 0x3],
+		(command & CMD_RESET) ? " Reset" : "",
+		(command & CMD_RUN) ? "RUN" : "HALT"
+		);
+}
+
+static int
+dbg_port_buf (char *buf, unsigned len, const char *label, int port, u32 status)
+{
+	const char	*sig;	/* only ever points at string literals */
+
+	/* signaling state: decoded from status bits 11:10 */
+	switch (status & (3 << 10)) {
+	case 0 << 10: sig = "se0"; break;
+	case 1 << 10: sig = "k"; break;		/* low speed */
+	case 2 << 10: sig = "j"; break;
+	default: sig = "?"; break;
+	}
+
+	return scnprintf (buf, len,
+		"%s%sport:%d status %06x %d "
+		"sig=%s%s%s%s%s%s%s%s",
+		label, label [0] ? " " : "", port, status,
+		status>>25,/*device address */
+		sig,
+		(status & PORT_RESET) ? " RESET" : "",
+		(status & PORT_SUSPEND) ? " SUSPEND" : "",
+		(status & PORT_RESUME) ? " RESUME" : "",
+		(status & PORT_PEC) ? " PEC" : "",
+		(status & PORT_PE) ? " PE" : "",
+		(status & PORT_CSC) ? " CSC" : "",
+		(status & PORT_CONNECT) ? " CONNECT" : "");
+}
+
+#else
+static inline void __maybe_unused
+dbg_qh (const char *label, struct fusbh200_hcd *fusbh200, struct fusbh200_qh *qh)
+{}
+
+static inline int __maybe_unused
+dbg_status_buf (char *buf, unsigned len, const char *label, u32 status)
+{ return 0; }
+
+static inline int __maybe_unused
+dbg_command_buf (char *buf, unsigned len, const char *label, u32 command)
+{ return 0; }
+
+static inline int __maybe_unused
+dbg_intr_buf (char *buf, unsigned len, const char *label, u32 enable)
+{ return 0; }
+
+static inline int __maybe_unused
+dbg_port_buf (char *buf, unsigned len, const char *label, int port, u32 status)
+{ return 0; }
+
+#endif	/* DEBUG */
+
+/* functions have the "wrong" filename when they're output... */
+#define dbg_status(fusbh200, label, status) do { \
+	char _buf [80] = ""; /* the !DEBUG *_buf stubs leave it untouched */ \
+	dbg_status_buf (_buf, sizeof _buf, label, status); \
+	fusbh200_dbg (fusbh200, "%s\n", _buf); \
+} while (0)
+
+#define dbg_cmd(fusbh200, label, command) do { \
+	char _buf [80] = ""; /* the !DEBUG *_buf stubs leave it untouched */ \
+	dbg_command_buf (_buf, sizeof _buf, label, command); \
+	fusbh200_dbg (fusbh200, "%s\n", _buf); \
+} while (0)
+
+#define dbg_port(fusbh200, label, port, status) do { \
+	char _buf [80] = ""; /* the !DEBUG *_buf stubs leave it untouched */ \
+	dbg_port_buf (_buf, sizeof _buf, label, port, status); \
+	fusbh200_dbg (fusbh200, "%s\n", _buf); \
+} while (0)
+
+/*-------------------------------------------------------------------------*/
+
+#ifdef STUB_DEBUG_FILES
+
+static inline void create_debug_files (struct fusbh200_hcd *bus) { }
+static inline void remove_debug_files (struct fusbh200_hcd *bus) { }
+
+#else
+
+/* troubleshooting help: expose state in debugfs */
+
+static int debug_async_open(struct inode *, struct file *);
+static int debug_periodic_open(struct inode *, struct file *);
+static int debug_registers_open(struct inode *, struct file *);
+static int debug_async_open(struct inode *, struct file *);
+
+static ssize_t debug_output(struct file*, char __user*, size_t, loff_t*);
+static int debug_close(struct inode *, struct file *);
+
+static const struct file_operations debug_async_fops = {
+	.owner		= THIS_MODULE,
+	.open		= debug_async_open,
+	.read		= debug_output,
+	.release	= debug_close,
+	.llseek		= default_llseek,
+};
+static const struct file_operations debug_periodic_fops = {
+	.owner		= THIS_MODULE,
+	.open		= debug_periodic_open,
+	.read		= debug_output,
+	.release	= debug_close,
+	.llseek		= default_llseek,
+};
+static const struct file_operations debug_registers_fops = {
+	.owner		= THIS_MODULE,
+	.open		= debug_registers_open,
+	.read		= debug_output,
+	.release	= debug_close,
+	.llseek		= default_llseek,
+};
+
+static struct dentry *fusbh200_debug_root;
+
+struct debug_buffer {
+	ssize_t (*fill_func)(struct debug_buffer *);	/* fill method */
+	struct usb_bus *bus;
+	struct mutex mutex;	/* protect filling of buffer */
+	size_t count;		/* number of characters filled into buffer */
+	char *output_buf;
+	size_t alloc_size;
+};
+
+#define speed_char(info1) ({ char tmp; /* speed bits 13:12 -> f/l/h/? */ \
+		switch ((info1) & (3 << 12)) { \
+		case QH_FULL_SPEED: tmp = 'f'; break; \
+		case QH_LOW_SPEED:  tmp = 'l'; break; \
+		case QH_HIGH_SPEED: tmp = 'h'; break; \
+		default: tmp = '?'; break; \
+		} tmp; })
+
+static inline char token_mark(struct fusbh200_hcd *fusbh200, __hc32 token)
+{
+	/* classify a qTD token value as a one-character marker */
+	const __u32 bits = hc32_to_cpu(fusbh200, token);
+
+	if (bits & QTD_STS_ACTIVE)
+		return '*';
+	else if (bits & QTD_STS_HALT)
+		return '-';
+
+	/* a short read tries to advance through hw_alt_next: mark it '/' */
+	return IS_SHORT_READ (bits) ? '/' : ' ';
+}
+
+/*
+ * qh_lines - format one QH and the qTDs queued on it
+ * @fusbh200: controller that owns the QH
+ * @qh: queue head to describe
+ * @nextp: in/out cursor into the output buffer
+ * @sizep: in/out number of bytes still available at *nextp
+ *
+ * Writes one "qh/..." summary line, then one indented line per qTD on
+ * qh->qtd_list, then a newline.  Output is silently truncated when the
+ * buffer fills up; both cursors are updated on return.
+ */
+static void qh_lines (
+	struct fusbh200_hcd *fusbh200,
+	struct fusbh200_qh *qh,
+	char **nextp,
+	unsigned *sizep
+)
+{
+	u32			scratch;
+	u32			hw_curr;
+	struct fusbh200_qtd		*td;
+	unsigned		temp;
+	unsigned		size = *sizep;
+	char			*next = *nextp;
+	char			mark;
+	__le32			list_end = FUSBH200_LIST_END(fusbh200);
+	struct fusbh200_qh_hw	*hw = qh->hw;
+
+	if (hw->hw_qtd_next == list_end)	/* NEC does this */
+		mark = '@';
+	else
+		mark = token_mark(fusbh200, hw->hw_token);
+	if (mark == '/') {	/* qh_alt_next controls qh advance? */
+		if ((hw->hw_alt_next & QTD_MASK(fusbh200))
+				== fusbh200->async->hw->hw_alt_next)
+			mark = '#';	/* blocked */
+		else if (hw->hw_alt_next == list_end)
+			mark = '.';	/* use hw_qtd_next */
+		/* else alt_next points to some other qtd */
+	}
+	scratch = hc32_to_cpup(fusbh200, &hw->hw_info1);
+	/* only an active QH has a meaningful current-qTD pointer to match */
+	hw_curr = (mark == '*') ? hc32_to_cpup(fusbh200, &hw->hw_current) : 0;
+	temp = scnprintf (next, size,
+			"qh/%p dev%d %cs ep%d %08x %08x (%08x%c %s nak%d)",
+			qh, scratch & 0x007f,
+			speed_char (scratch),
+			(scratch >> 8) & 0x000f,
+			scratch, hc32_to_cpup(fusbh200, &hw->hw_info2),
+			hc32_to_cpup(fusbh200, &hw->hw_token), mark,
+			(cpu_to_hc32(fusbh200, QTD_TOGGLE) & hw->hw_token)
+				? "data1" : "data0",
+			(hc32_to_cpup(fusbh200, &hw->hw_alt_next) >> 1) & 0x0f);
+	size -= temp;
+	next += temp;
+
+	/* hc may be modifying the list as we read it ... */
+	list_for_each_entry(td, &qh->qtd_list, qtd_list) {
+		scratch = hc32_to_cpup(fusbh200, &td->hw_token);
+		mark = ' ';
+		if (hw_curr == td->qtd_dma)
+			mark = '*';
+		else if (hw->hw_qtd_next == cpu_to_hc32(fusbh200, td->qtd_dma))
+			mark = '+';
+		else if (QTD_LENGTH (scratch)) {
+			if (td->hw_alt_next == fusbh200->async->hw->hw_alt_next)
+				mark = '#';
+			else if (td->hw_alt_next != list_end)
+				mark = '/';
+		}
+		temp = snprintf (next, size,
+				"\n\t%p%c%s len=%d %08x urb %p",
+				td, mark, ({ char *tmp;
+				 switch ((scratch>>8)&0x03) {
+				 case 0: tmp = "out"; break;
+				 case 1: tmp = "in"; break;
+				 case 2: tmp = "setup"; break;
+				 default: tmp = "?"; break;
+				 } tmp;}),
+				(scratch >> 16) & 0x7fff,
+				scratch,
+				td->urb);
+		/* snprintf reports the untruncated length; clamp to what fit */
+		if (size < temp)
+			temp = size;
+		size -= temp;
+		next += temp;
+		/*
+		 * Stop as soon as the buffer is exhausted.  The previous test
+		 * compared the bytes just written against the space left
+		 * *after* subtracting them, which only matched by coincidence.
+		 */
+		if (size == 0)
+			goto done;
+	}
+
+	temp = snprintf (next, size, "\n");
+	if (size < temp)
+		temp = size;
+	size -= temp;
+	next += temp;
+
+done:
+	*sizep = size;
+	*nextp = next;
+}
+
+/*
+ * Fill buf->output_buf with a snapshot of the async schedule and of the
+ * async unlink list, one QH (plus its known TDs) at a time, while holding
+ * the controller lock.  Returns the length of the resulting string.
+ */
+static ssize_t fill_async_buffer(struct debug_buffer *buf)
+{
+	struct usb_hcd		*hcd = bus_to_hcd(buf->bus);
+	struct fusbh200_hcd	*fusbh200 = hcd_to_fusbh200 (hcd);
+	char			*cursor = buf->output_buf;
+	unsigned		room = buf->alloc_size;
+	unsigned		written;
+	unsigned long		flags;
+	struct fusbh200_qh		*qh;
+
+	/* start from an empty string so strlen() below is always valid */
+	*cursor = 0;
+
+	spin_lock_irqsave (&fusbh200->lock, flags);
+
+	/* usually empty except for long-term bulk reads, or head */
+	qh = fusbh200->async->qh_next.qh;
+	while (room > 0 && qh) {
+		qh_lines (fusbh200, qh, &cursor, &room);
+		qh = qh->qh_next.qh;
+	}
+
+	if (fusbh200->async_unlink && room > 0) {
+		written = scnprintf(cursor, room, "\nunlink =\n");
+		room -= written;
+		cursor += written;
+
+		qh = fusbh200->async_unlink;
+		while (room > 0 && qh) {
+			qh_lines (fusbh200, qh, &cursor, &room);
+			qh = qh->unlink_next;
+		}
+	}
+
+	spin_unlock_irqrestore (&fusbh200->lock, flags);
+
+	return strlen(buf->output_buf);
+}
+
+#define DBG_SCHED_LIMIT 64
+/*
+ * fill_periodic_buffer - dump a snapshot of the periodic schedule
+ * @buf: debug buffer to fill (output_buf / alloc_size must be set up)
+ *
+ * For every frame slot with a shadow entry, walks the chained entries and
+ * prints a short tag for each.  A QH gets a detailed description the first
+ * time it is encountered; up to DBG_SCHED_LIMIT QHs are remembered for that
+ * purpose.  Runs with the controller lock held.
+ *
+ * Returns the number of bytes written, or 0 if the bookkeeping array could
+ * not be allocated.
+ */
+static ssize_t fill_periodic_buffer(struct debug_buffer *buf)
+{
+	struct usb_hcd		*hcd;
+	struct fusbh200_hcd		*fusbh200;
+	unsigned long		flags;
+	union fusbh200_shadow	p, *seen;
+	unsigned		temp, size, seen_count;
+	unsigned		j;
+	char			*next;
+	unsigned		i;
+	__hc32			tag;
+
+	if (!(seen = kmalloc (DBG_SCHED_LIMIT * sizeof *seen, GFP_ATOMIC)))
+		return 0;
+	seen_count = 0;
+
+	hcd = bus_to_hcd(buf->bus);
+	fusbh200 = hcd_to_fusbh200 (hcd);
+	next = buf->output_buf;
+	size = buf->alloc_size;
+
+	temp = scnprintf (next, size, "size = %d\n", fusbh200->periodic_size);
+	size -= temp;
+	next += temp;
+
+	/* dump a snapshot of the periodic schedule.
+	 * iso changes, interrupt usually doesn't.
+	 */
+	spin_lock_irqsave (&fusbh200->lock, flags);
+	for (i = 0; i < fusbh200->periodic_size; i++) {
+		p = fusbh200->pshadow [i];
+		if (likely (!p.ptr))
+			continue;
+		tag = Q_NEXT_TYPE(fusbh200, fusbh200->periodic [i]);
+
+		temp = scnprintf (next, size, "%4d: ", i);
+		size -= temp;
+		next += temp;
+
+		do {
+			struct fusbh200_qh_hw *hw;
+
+			switch (hc32_to_cpu(fusbh200, tag)) {
+			case Q_TYPE_QH:
+				hw = p.qh->hw;
+				temp = scnprintf (next, size, " qh%d-%04x/%p",
+						p.qh->period,
+						hc32_to_cpup(fusbh200,
+							&hw->hw_info2)
+							/* uframe masks */
+							& (QH_CMASK | QH_SMASK),
+						p.qh);
+				size -= temp;
+				next += temp;
+				/*
+				 * don't repeat what follows this qh.
+				 * A dedicated index is used for the search so
+				 * the scnprintf() result cannot clobber it and
+				 * confuse the "first time" test below.
+				 */
+				for (j = 0; j < seen_count; j++) {
+					if (seen [j].ptr != p.ptr)
+						continue;
+					if (p.qh->qh_next.ptr) {
+						temp = scnprintf (next, size,
+							" ...");
+						size -= temp;
+						next += temp;
+					}
+					break;
+				}
+				/* show more info the first time around */
+				if (j == seen_count) {
+					u32	scratch = hc32_to_cpup(fusbh200,
+							&hw->hw_info1);
+					struct fusbh200_qtd	*qtd;
+					char		*type = "";
+
+					/* count tds, get ep direction */
+					temp = 0;
+					list_for_each_entry (qtd,
+							&p.qh->qtd_list,
+							qtd_list) {
+						temp++;
+						switch (0x03 & (hc32_to_cpu(
+							fusbh200,
+							qtd->hw_token) >> 8)) {
+						case 0: type = "out"; continue;
+						case 1: type = "in"; continue;
+						}
+					}
+
+					temp = scnprintf (next, size,
+						" (%c%d ep%d%s "
+						"[%d/%d] q%d p%d)",
+						speed_char (scratch),
+						scratch & 0x007f,
+						(scratch >> 8) & 0x000f, type,
+						p.qh->usecs, p.qh->c_usecs,
+						temp,
+						0x7ff & (scratch >> 16));
+
+					if (seen_count < DBG_SCHED_LIMIT)
+						seen [seen_count++].qh = p.qh;
+				} else
+					temp = 0;
+				tag = Q_NEXT_TYPE(fusbh200, hw->hw_next);
+				p = p.qh->qh_next;
+				break;
+			case Q_TYPE_FSTN:
+				temp = scnprintf (next, size,
+					" fstn-%8x/%p", p.fstn->hw_prev,
+					p.fstn);
+				tag = Q_NEXT_TYPE(fusbh200, p.fstn->hw_next);
+				p = p.fstn->fstn_next;
+				break;
+			case Q_TYPE_ITD:
+				temp = scnprintf (next, size,
+					" itd/%p", p.itd);
+				tag = Q_NEXT_TYPE(fusbh200, p.itd->hw_next);
+				p = p.itd->itd_next;
+				break;
+			default:
+				/*
+				 * Link type not handled above: nothing was
+				 * formatted and p cannot be advanced, so end
+				 * this slot's walk rather than reuse a stale
+				 * "temp" and loop forever with the lock held.
+				 */
+				temp = 0;
+				p.ptr = NULL;
+				break;
+			}
+			size -= temp;
+			next += temp;
+		} while (p.ptr);
+
+		temp = scnprintf (next, size, "\n");
+		size -= temp;
+		next += temp;
+	}
+	spin_unlock_irqrestore (&fusbh200->lock, flags);
+	kfree (seen);
+
+	return buf->alloc_size - size;
+}
+#undef DBG_SCHED_LIMIT
+
+/* Return a printable name for the root-hub state, or "?" if unrecognized. */
+static const char *rh_state_string(struct fusbh200_hcd *fusbh200)
+{
+	const char *name = "?";
+
+	switch (fusbh200->rh_state) {
+	case FUSBH200_RH_HALTED:
+		name = "halted";
+		break;
+	case FUSBH200_RH_SUSPENDED:
+		name = "suspended";
+		break;
+	case FUSBH200_RH_RUNNING:
+		name = "running";
+		break;
+	case FUSBH200_RH_STOPPING:
+		name = "stopping";
+		break;
+	}
+
+	return name;
+}
+
+/*
+ * fill_registers_buffer - dump controller identification and registers
+ * @buf: debug buffer to fill (output_buf / alloc_size must be set up)
+ *
+ * With the controller lock held, prints the bus/device names, the
+ * capability registers, the decoded status/command/interrupt-enable
+ * registers, the frame index and any pending async unlink.  If the
+ * hardware is not accessible only a short "SUSPENDED" notice is written.
+ *
+ * Returns the number of bytes written to buf->output_buf.
+ */
+static ssize_t fill_registers_buffer(struct debug_buffer *buf)
+{
+	struct usb_hcd		*hcd;
+	struct fusbh200_hcd	*fusbh200;
+	unsigned long		flags;
+	unsigned		temp, size, i;
+	char			*next, scratch [80];
+	static char		fmt [] = "%*s\n";
+	static char		label [] = "";
+
+	hcd = bus_to_hcd(buf->bus);
+	fusbh200 = hcd_to_fusbh200 (hcd);
+	next = buf->output_buf;
+	size = buf->alloc_size;
+
+	spin_lock_irqsave (&fusbh200->lock, flags);
+
+	if (!HCD_HW_ACCESSIBLE(hcd)) {
+		/*
+		 * "size" must keep meaning "bytes remaining" because the
+		 * return value is computed as alloc_size - size.  Assigning
+		 * the scnprintf() result to it made the reported length
+		 * alloc_size minus the text length, exposing uninitialized
+		 * buffer contents to the reader.
+		 */
+		temp = scnprintf (next, size,
+			"bus %s, device %s\n"
+			"%s\n"
+			"SUSPENDED (no register access)\n",
+			hcd->self.controller->bus->name,
+			dev_name(hcd->self.controller),
+			hcd->product_desc);
+		size -= temp;
+		goto done;
+	}
+
+	/* Capability Registers */
+	i = HC_VERSION(fusbh200, fusbh200_readl(fusbh200, &fusbh200->caps->hc_capbase));
+	temp = scnprintf (next, size,
+		"bus %s, device %s\n"
+		"%s\n"
+		"EHCI %x.%02x, rh state %s\n",
+		hcd->self.controller->bus->name,
+		dev_name(hcd->self.controller),
+		hcd->product_desc,
+		i >> 8, i & 0x0ff, rh_state_string(fusbh200));
+	size -= temp;
+	next += temp;
+
+	// FIXME interpret both types of params
+	i = fusbh200_readl(fusbh200, &fusbh200->caps->hcs_params);
+	temp = scnprintf (next, size, "structural params 0x%08x\n", i);
+	size -= temp;
+	next += temp;
+
+	i = fusbh200_readl(fusbh200, &fusbh200->caps->hcc_params);
+	temp = scnprintf (next, size, "capability params 0x%08x\n", i);
+	size -= temp;
+	next += temp;
+
+	/*
+	 * Operational Registers: each dbg_*_buf() helper formats into
+	 * "scratch" and its return value is used as the field width.
+	 */
+	temp = dbg_status_buf (scratch, sizeof scratch, label,
+			fusbh200_readl(fusbh200, &fusbh200->regs->status));
+	temp = scnprintf (next, size, fmt, temp, scratch);
+	size -= temp;
+	next += temp;
+
+	temp = dbg_command_buf (scratch, sizeof scratch, label,
+			fusbh200_readl(fusbh200, &fusbh200->regs->command));
+	temp = scnprintf (next, size, fmt, temp, scratch);
+	size -= temp;
+	next += temp;
+
+	temp = dbg_intr_buf (scratch, sizeof scratch, label,
+			fusbh200_readl(fusbh200, &fusbh200->regs->intr_enable));
+	temp = scnprintf (next, size, fmt, temp, scratch);
+	size -= temp;
+	next += temp;
+
+	temp = scnprintf (next, size, "uframe %04x\n",
+			fusbh200_read_frame_index(fusbh200));
+	size -= temp;
+	next += temp;
+
+	if (fusbh200->async_unlink) {
+		temp = scnprintf(next, size, "async unlink qh %p\n",
+				fusbh200->async_unlink);
+		size -= temp;
+		next += temp;
+	}
+
+#ifdef FUSBH200_STATS
+	temp = scnprintf (next, size,
+		"irq normal %ld err %ld iaa %ld (lost %ld)\n",
+		fusbh200->stats.normal, fusbh200->stats.error, fusbh200->stats.iaa,
+		fusbh200->stats.lost_iaa);
+	size -= temp;
+	next += temp;
+
+	temp = scnprintf (next, size, "complete %ld unlink %ld\n",
+		fusbh200->stats.complete, fusbh200->stats.unlink);
+	size -= temp;
+	next += temp;
+#endif
+
+done:
+	spin_unlock_irqrestore (&fusbh200->lock, flags);
+
+	return buf->alloc_size - size;
+}
+
+/*
+ * Allocate and initialise the per-open debug state.  The text buffer itself
+ * is not allocated here; only its intended size (one page) is recorded.
+ * Returns NULL if the allocation fails.
+ */
+static struct debug_buffer *alloc_buffer(struct usb_bus *bus,
+				ssize_t (*fill_func)(struct debug_buffer *))
+{
+	struct debug_buffer *dbuf = kzalloc(sizeof(*dbuf), GFP_KERNEL);
+
+	if (!dbuf)
+		return NULL;
+
+	dbuf->bus = bus;
+	dbuf->fill_func = fill_func;
+	mutex_init(&dbuf->mutex);
+	dbuf->alloc_size = PAGE_SIZE;
+
+	return dbuf;
+}
+
+/*
+ * Make sure the text buffer exists, then run the fill method.
+ * On success records the byte count in buf->count and returns 0;
+ * returns -ENOMEM or the fill method's negative result otherwise.
+ */
+static int fill_buffer(struct debug_buffer *buf)
+{
+	int filled;
+
+	/* allocate lazily, on the first fill */
+	if (!buf->output_buf)
+		buf->output_buf = vmalloc(buf->alloc_size);
+	if (!buf->output_buf)
+		return -ENOMEM;
+
+	filled = buf->fill_func(buf);
+	if (filled < 0)
+		return filled;
+
+	buf->count = filled;
+	return 0;
+}
+
+/*
+ * read() handler shared by all debug files: fill the buffer under the mutex
+ * if it holds no data yet, then copy the requested window to user space.
+ */
+static ssize_t debug_output(struct file *file, char __user *user_buf,
+			    size_t len, loff_t *offset)
+{
+	struct debug_buffer *buf = file->private_data;
+	int ret = 0;
+
+	mutex_lock(&buf->mutex);
+	if (buf->count == 0)
+		ret = fill_buffer(buf);
+	mutex_unlock(&buf->mutex);
+
+	if (ret != 0)
+		return ret;
+
+	ret = simple_read_from_buffer(user_buf, len, offset,
+				      buf->output_buf, buf->count);
+	return ret;
+}
+
+/* release() handler: free the text buffer and the per-open state. */
+static int debug_close(struct inode *inode, struct file *file)
+{
+	struct debug_buffer *dbuf = file->private_data;
+
+	if (!dbuf)
+		return 0;
+
+	vfree(dbuf->output_buf);
+	kfree(dbuf);
+	return 0;
+}
+/* open() handler for the "async" file: attach an async-schedule dumper. */
+static int debug_async_open(struct inode *inode, struct file *file)
+{
+	struct debug_buffer *dbuf;
+
+	dbuf = alloc_buffer(inode->i_private, fill_async_buffer);
+	file->private_data = dbuf;
+	if (!dbuf)
+		return -ENOMEM;
+
+	return 0;
+}
+
+/*
+ * open() handler for the "periodic" file.  The periodic dump gets a larger
+ * buffer than the default: 6 pages with 4-byte pointers, 8 pages otherwise.
+ */
+static int debug_periodic_open(struct inode *inode, struct file *file)
+{
+	struct debug_buffer *dbuf;
+
+	dbuf = alloc_buffer(inode->i_private, fill_periodic_buffer);
+	if (dbuf) {
+		dbuf->alloc_size = (sizeof(void *) == 4 ? 6 : 8)*PAGE_SIZE;
+		file->private_data = dbuf;
+		return 0;
+	}
+
+	return -ENOMEM;
+}
+
+/* open() handler for the "registers" file: attach a register dumper. */
+static int debug_registers_open(struct inode *inode, struct file *file)
+{
+	struct debug_buffer *dbuf;
+
+	dbuf = alloc_buffer(inode->i_private, fill_registers_buffer);
+	file->private_data = dbuf;
+	if (!dbuf)
+		return -ENOMEM;
+
+	return 0;
+}
+
+/*
+ * Create the per-controller debugfs directory (named after the bus) with
+ * its "async", "periodic" and "registers" files.  Failures are not
+ * reported to the caller; if any file cannot be created the whole
+ * directory is removed again.
+ */
+static inline void create_debug_files (struct fusbh200_hcd *fusbh200)
+{
+	struct usb_bus *bus = &fusbh200_to_hcd(fusbh200)->self;
+
+	fusbh200->debug_dir = debugfs_create_dir(bus->bus_name, fusbh200_debug_root);
+	if (!fusbh200->debug_dir)
+		return;
+
+	if (!debugfs_create_file("async", S_IRUGO, fusbh200->debug_dir, bus,
+						&debug_async_fops))
+		goto file_error;
+
+	if (!debugfs_create_file("periodic", S_IRUGO, fusbh200->debug_dir, bus,
+						&debug_periodic_fops))
+		goto file_error;
+
+	if (!debugfs_create_file("registers", S_IRUGO, fusbh200->debug_dir, bus,
+						    &debug_registers_fops))
+		goto file_error;
+
+	return;
+
+file_error:
+	debugfs_remove_recursive(fusbh200->debug_dir);
+	/*
+	 * Forget the removed dentry so that remove_debug_files() does not
+	 * try to remove it a second time.
+	 */
+	fusbh200->debug_dir = NULL;
+}
+
+/*
+ * Remove the controller's debugfs directory and everything below it.
+ * The stored dentry is cleared afterwards so a repeated call cannot touch
+ * an already-removed entry.
+ */
+static inline void remove_debug_files (struct fusbh200_hcd *fusbh200)
+{
+	debugfs_remove_recursive(fusbh200->debug_dir);
+	fusbh200->debug_dir = NULL;
+}
+
+#endif /* STUB_DEBUG_FILES */
+/*-------------------------------------------------------------------------*/
+
+/*
+ * handshake - spin reading hc until handshake completes or fails
+ * @ptr: address of hc register to be read
+ * @mask: bits to look at in result of read
+ * @done: value of those bits when handshake succeeds
+ * @usec: timeout in microseconds
+ *
+ * Returns negative errno, or zero on success
+ *
+ * Success happens when the "mask" bits have the specified value (hardware
+ * handshake done).  There are two failure modes:  "usec" have passed (major
+ * hardware flakeout), or the register reads as all-ones (hardware removed).
+ *
+ * That last failure should only happen in cases like physical cardbus eject
+ * before driver shutdown. But it also seems to be caused by bugs in cardbus
+ * bridge shutdown:  shutting down the bridge before the devices using it.
+ */
+static int handshake (struct fusbh200_hcd *fusbh200, void __iomem *ptr,
+		      u32 mask, u32 done, int usec)
+{
+	for (;;) {
+		u32	value = fusbh200_readl(fusbh200, ptr);
+
+		/* all-ones read: card removed */
+		if (value == ~(u32)0)
+			return -ENODEV;
+
+		if ((value & mask) == done)
+			return 0;
+
+		/* the register is always polled at least once */
+		udelay (1);
+		if (--usec <= 0)
+			break;
+	}
+
+	return -ETIMEDOUT;
+}
+
+/*
+ * Force HC to halt state from unknown (EHCI spec section 2.3).
+ * Must be called with interrupts enabled and the lock not held.
+ *
+ * Returns the result of waiting for STS_HALT: 0 once the bit is set, or a
+ * negative errno from handshake() (timeout after 16 * 125 usec, or the
+ * register reading as all-ones).
+ */
+static int fusbh200_halt (struct fusbh200_hcd *fusbh200)
+{
+	u32	temp;
+
+	spin_lock_irq(&fusbh200->lock);
+
+	/* disable any irqs left enabled by previous code */
+	fusbh200_writel(fusbh200, 0, &fusbh200->regs->intr_enable);
+
+	/*
+	 * This routine gets called during probe before fusbh200->command
+	 * has been initialized, so we can't rely on its value.
+	 * The value written is therefore derived from the hardware register;
+	 * the cached copy only has CMD_RUN cleared.
+	 */
+	fusbh200->command &= ~CMD_RUN;
+	temp = fusbh200_readl(fusbh200, &fusbh200->regs->command);
+	temp &= ~(CMD_RUN | CMD_IAAD);
+	fusbh200_writel(fusbh200, temp, &fusbh200->regs->command);
+
+	spin_unlock_irq(&fusbh200->lock);
+	/* wait for any running interrupt handler before polling for halt */
+	synchronize_irq(fusbh200_to_hcd(fusbh200)->irq);
+
+	return handshake(fusbh200, &fusbh200->regs->status,
+			  STS_HALT, STS_HALT, 16 * 125);
+}
+
+/*
+ * Reset a non-running (STS_HALT == 1) controller.
+ * Must be called with interrupts enabled and the lock not held.
+ *
+ * Returns 0 on success or the negative errno from handshake() if CMD_RESET
+ * does not clear within 250 * 1000 usec.
+ */
+static int fusbh200_reset (struct fusbh200_hcd *fusbh200)
+{
+	int	retval;
+	u32	command = fusbh200_readl(fusbh200, &fusbh200->regs->command);
+
+	/* If the EHCI debug controller is active, special care must be
+	 * taken before and after a host controller reset */
+	if (fusbh200->debug && !dbgp_reset_prep(fusbh200_to_hcd(fusbh200)))
+		fusbh200->debug = NULL;
+
+	command |= CMD_RESET;
+	dbg_cmd (fusbh200, "reset", command);
+	fusbh200_writel(fusbh200, command, &fusbh200->regs->command);
+	/* the root hub is recorded as halted as soon as the reset is issued */
+	fusbh200->rh_state = FUSBH200_RH_HALTED;
+	fusbh200->next_statechange = jiffies;
+	/* wait for the CMD_RESET bit to read back as zero */
+	retval = handshake (fusbh200, &fusbh200->regs->command,
+			    CMD_RESET, 0, 250 * 1000);
+
+	if (retval)
+		return retval;
+
+	if (fusbh200->debug)
+		dbgp_external_startup(fusbh200_to_hcd(fusbh200));
+
+	/* forget all per-port suspend/resume bookkeeping */
+	fusbh200->port_c_suspend = fusbh200->suspended_ports =
+			fusbh200->resuming_ports = 0;
+	return retval;
+}
+
+/*
+ * Idle the controller (turn off the schedules).
+ * Must be called with interrupts enabled and the lock not held.
+ * Does nothing unless the root hub is in the running state.
+ */
+static void fusbh200_quiesce (struct fusbh200_hcd *fusbh200)
+{
+	u32	expected;
+
+	if (fusbh200->rh_state != FUSBH200_RH_RUNNING)
+		return;
+
+	/*
+	 * wait for any schedule enables/disables to take effect: the status
+	 * bits must match the cached command's enable bits shifted into place
+	 */
+	expected = (fusbh200->command << 10) & (STS_ASS | STS_PSS);
+	handshake(fusbh200, &fusbh200->regs->status, STS_ASS | STS_PSS,
+			expected, 16 * 125);
+
+	/* then disable anything that's still active */
+	spin_lock_irq(&fusbh200->lock);
+	fusbh200->command &= ~(CMD_ASE | CMD_PSE);
+	fusbh200_writel(fusbh200, fusbh200->command, &fusbh200->regs->command);
+	spin_unlock_irq(&fusbh200->lock);
+
+	/* hardware can take 16 microframes to turn off ... */
+	handshake(fusbh200, &fusbh200->regs->status, STS_ASS | STS_PSS,
+			0, 16 * 125);
+}
+
+/*-------------------------------------------------------------------------*/
+
+/*
+ * Forward declarations for routines that the timer code below calls or
+ * places in event_handlers[] before their definitions appear.
+ */
+static void end_unlink_async(struct fusbh200_hcd *fusbh200);
+static void unlink_empty_async(struct fusbh200_hcd *fusbh200);
+static void fusbh200_work(struct fusbh200_hcd *fusbh200);
+static void start_unlink_intr(struct fusbh200_hcd *fusbh200, struct fusbh200_qh *qh);
+static void end_unlink_intr(struct fusbh200_hcd *fusbh200, struct fusbh200_qh *qh);
+
+/*-------------------------------------------------------------------------*/
+
+/*
+ * Set bit(s) in the USBCMD register, keeping the cached copy in
+ * fusbh200->command in step with the hardware.
+ */
+static void fusbh200_set_command_bit(struct fusbh200_hcd *fusbh200, u32 bit)
+{
+	fusbh200->command = fusbh200->command | bit;
+	fusbh200_writel(fusbh200, fusbh200->command, &fusbh200->regs->command);
+
+	/* read back to unblock the posted write */
+	(void) fusbh200_readl(fusbh200, &fusbh200->regs->command);
+}
+
+/*
+ * Clear bit(s) in the USBCMD register, keeping the cached copy in
+ * fusbh200->command in step with the hardware.
+ */
+static void fusbh200_clear_command_bit(struct fusbh200_hcd *fusbh200, u32 bit)
+{
+	fusbh200->command = fusbh200->command & ~bit;
+	fusbh200_writel(fusbh200, fusbh200->command, &fusbh200->regs->command);
+
+	/* read back to unblock the posted write */
+	(void) fusbh200_readl(fusbh200, &fusbh200->regs->command);
+}
+
+/*-------------------------------------------------------------------------*/
+
+/*
+ * EHCI timer support...  Now using hrtimers.
+ *
+ * Lots of different events are triggered from fusbh200->hrtimer.  Whenever
+ * the timer routine runs, it checks each possible event; events that are
+ * currently enabled and whose expiration time has passed get handled.
+ * The set of enabled events is stored as a collection of bitflags in
+ * fusbh200->enabled_hrtimer_events, and they are numbered in order of
+ * increasing delay values (ranging between 1 ms and 100 ms).
+ *
+ * Rather than implementing a sorted list or tree of all pending events,
+ * we keep track only of the lowest-numbered pending event, in
+ * fusbh200->next_hrtimer_event.  Whenever fusbh200->hrtimer gets restarted, its
+ * expiration time is set to the timeout value for this event.
+ *
+ * As a result, events might not get handled right away; the actual delay
+ * could be anywhere up to twice the requested delay.  This doesn't
+ * matter, because none of the events are especially time-critical.  The
+ * ones that matter most all have a delay of 1 ms, so they will be
+ * handled after 2 ms at most, which is okay.  In addition to this, we
+ * allow for an expiration range of 1 ms.
+ */
+
+/*
+ * Delay lengths for the hrtimer event types.
+ * Keep this list sorted by delay length, in the same order as
+ * the event types indexed by enum fusbh200_hrtimer_event in fusbh200.h.
+ *
+ * The table is only ever read, so it is declared const.
+ */
+static const unsigned event_delays_ns[] = {
+	1 * NSEC_PER_MSEC,	/* FUSBH200_HRTIMER_POLL_ASS */
+	1 * NSEC_PER_MSEC,	/* FUSBH200_HRTIMER_POLL_PSS */
+	1 * NSEC_PER_MSEC,	/* FUSBH200_HRTIMER_POLL_DEAD */
+	1125 * NSEC_PER_USEC,	/* FUSBH200_HRTIMER_UNLINK_INTR */
+	2 * NSEC_PER_MSEC,	/* FUSBH200_HRTIMER_FREE_ITDS */
+	6 * NSEC_PER_MSEC,	/* FUSBH200_HRTIMER_ASYNC_UNLINKS */
+	10 * NSEC_PER_MSEC,	/* FUSBH200_HRTIMER_IAA_WATCHDOG */
+	10 * NSEC_PER_MSEC,	/* FUSBH200_HRTIMER_DISABLE_PERIODIC */
+	15 * NSEC_PER_MSEC,	/* FUSBH200_HRTIMER_DISABLE_ASYNC */
+	100 * NSEC_PER_MSEC,	/* FUSBH200_HRTIMER_IO_WATCHDOG */
+};
+
+/*
+ * Enable a pending hrtimer event.
+ * @event: index of the event to enable
+ * @resched: when true, compute a fresh expiry (now + the event's delay);
+ *	when false, keep the expiry already stored for this event
+ *
+ * The timer is only (re)started when @event is lower-numbered than the
+ * event currently being tracked.
+ */
+static void fusbh200_enable_event(struct fusbh200_hcd *fusbh200, unsigned event,
+		bool resched)
+{
+	if (resched) {
+		ktime_t	delay = ktime_set(0, event_delays_ns[event]);
+
+		fusbh200->hr_timeouts[event] = ktime_add(ktime_get(), delay);
+	}
+	fusbh200->enabled_hrtimer_events |= (1 << event);
+
+	/* Track only the lowest-numbered pending event */
+	if (event >= fusbh200->next_hrtimer_event)
+		return;
+
+	fusbh200->next_hrtimer_event = event;
+	hrtimer_start_range_ns(&fusbh200->hrtimer,
+			fusbh200->hr_timeouts[event],
+			NSEC_PER_MSEC, HRTIMER_MODE_ABS);
+}
+
+
+/*
+ * Poll the STS_ASS status bit until it agrees with CMD_ASE, then start or
+ * schedule the stop of the async schedule according to async_count.
+ */
+static void fusbh200_poll_ASS(struct fusbh200_hcd *fusbh200)
+{
+	unsigned	wanted, current_sts;
+
+	/* Don't enable anything if the controller isn't running (e.g., died) */
+	if (fusbh200->rh_state != FUSBH200_RH_RUNNING)
+		return;
+
+	wanted = (fusbh200->command & CMD_ASE) ? STS_ASS : 0;
+	current_sts = fusbh200_readl(fusbh200, &fusbh200->regs->status) & STS_ASS;
+
+	if (wanted != current_sts) {
+		/* Poll again later, but give up after about 20 ms */
+		if (fusbh200->ASS_poll_count++ < 20) {
+			fusbh200_enable_event(fusbh200, FUSBH200_HRTIMER_POLL_ASS, true);
+			return;
+		}
+		fusbh200_dbg(fusbh200, "Waited too long for the async schedule status (%x/%x), giving up\n",
+				wanted, current_sts);
+	}
+	fusbh200->ASS_poll_count = 0;
+
+	/* The status is up-to-date; restart or stop the schedule as needed */
+	if (wanted == 0 && fusbh200->async_count > 0) {
+		/* Stopped, but there is work queued: turn it back on */
+		fusbh200_set_command_bit(fusbh200, CMD_ASE);
+	} else if (wanted != 0 && fusbh200->async_count == 0) {
+		/* Running with nothing queued: turn it off after a while */
+		fusbh200_enable_event(fusbh200, FUSBH200_HRTIMER_DISABLE_ASYNC,
+				true);
+	}
+}
+
+/*
+ * Turn off the async schedule by clearing CMD_ASE.  This is the handler
+ * listed for FUSBH200_HRTIMER_DISABLE_ASYNC in event_handlers[], so the
+ * "brief delay" comes from that timer event, not from this function.
+ */
+static void fusbh200_disable_ASE(struct fusbh200_hcd *fusbh200)
+{
+	fusbh200_clear_command_bit(fusbh200, CMD_ASE);
+}
+
+
+/*
+ * Poll the STS_PSS status bit until it agrees with CMD_PSE, then start or
+ * schedule the stop of the periodic schedule according to periodic_count.
+ */
+static void fusbh200_poll_PSS(struct fusbh200_hcd *fusbh200)
+{
+	unsigned	wanted, current_sts;
+
+	/* Don't do anything if the controller isn't running (e.g., died) */
+	if (fusbh200->rh_state != FUSBH200_RH_RUNNING)
+		return;
+
+	wanted = (fusbh200->command & CMD_PSE) ? STS_PSS : 0;
+	current_sts = fusbh200_readl(fusbh200, &fusbh200->regs->status) & STS_PSS;
+
+	if (wanted != current_sts) {
+		/* Poll again later, but give up after about 20 ms */
+		if (fusbh200->PSS_poll_count++ < 20) {
+			fusbh200_enable_event(fusbh200, FUSBH200_HRTIMER_POLL_PSS, true);
+			return;
+		}
+		fusbh200_dbg(fusbh200, "Waited too long for the periodic schedule status (%x/%x), giving up\n",
+				wanted, current_sts);
+	}
+	fusbh200->PSS_poll_count = 0;
+
+	/* The status is up-to-date; restart or stop the schedule as needed */
+	if (wanted == 0 && fusbh200->periodic_count > 0) {
+		/* Stopped, but there is work queued: turn it back on */
+		fusbh200_set_command_bit(fusbh200, CMD_PSE);
+	} else if (wanted != 0 && fusbh200->periodic_count == 0) {
+		/* Running with nothing queued: turn it off after a while */
+		fusbh200_enable_event(fusbh200, FUSBH200_HRTIMER_DISABLE_PERIODIC,
+				true);
+	}
+}
+
+/*
+ * Turn off the periodic schedule by clearing CMD_PSE.  This is the handler
+ * listed for FUSBH200_HRTIMER_DISABLE_PERIODIC in event_handlers[], so the
+ * "brief delay" comes from that timer event, not from this function.
+ */
+static void fusbh200_disable_PSE(struct fusbh200_hcd *fusbh200)
+{
+	fusbh200_clear_command_bit(fusbh200, CMD_PSE);
+}
+
+
+/*
+ * Poll the STS_HALT status bit; see when a dead controller stops.
+ * Once it has halted (or we have given up waiting), mark the root hub
+ * halted, mask interrupts and finish outstanding work.
+ */
+static void fusbh200_handle_controller_death(struct fusbh200_hcd *fusbh200)
+{
+	u32	sts = fusbh200_readl(fusbh200, &fusbh200->regs->status);
+
+	if ((sts & STS_HALT) == 0) {
+		/* Still running: try again later, a few times at most */
+		if (fusbh200->died_poll_count++ < 5) {
+			fusbh200_enable_event(fusbh200, FUSBH200_HRTIMER_POLL_DEAD, true);
+			return;
+		}
+		fusbh200_warn(fusbh200, "Waited too long for the controller to stop, giving up\n");
+	}
+
+	/* Clean up the mess */
+	fusbh200->rh_state = FUSBH200_RH_HALTED;
+	fusbh200_writel(fusbh200, 0, &fusbh200->regs->intr_enable);
+	fusbh200_work(fusbh200);
+	end_unlink_async(fusbh200);
+
+	/* Not in process context, so don't try to reset the controller */
+}
+
+
+/* Handle unlinked interrupt QHs once they are gone from the hardware */
+static void fusbh200_handle_intr_unlinks(struct fusbh200_hcd *fusbh200)
+{
+	bool			rh_stopped;
+	struct fusbh200_qh	*head;
+
+	rh_stopped = (fusbh200->rh_state < FUSBH200_RH_RUNNING);
+
+	/*
+	 * The intr_unlink list is in temporal order.  Entries queued before
+	 * the current unlink cycle are finished now; the walk stops at the
+	 * first entry belonging to the current cycle.  When the root hub is
+	 * not running, every entry is finished regardless of its cycle.
+	 */
+	fusbh200->intr_unlinking = true;
+	while ((head = fusbh200->intr_unlink) != NULL) {
+		if (!rh_stopped &&
+				head->unlink_cycle == fusbh200->intr_unlink_cycle)
+			break;
+
+		/* detach the head entry before completing it */
+		fusbh200->intr_unlink = head->unlink_next;
+		head->unlink_next = NULL;
+		end_unlink_intr(fusbh200, head);
+	}
+
+	/* Handle remaining entries later */
+	if (fusbh200->intr_unlink) {
+		fusbh200_enable_event(fusbh200, FUSBH200_HRTIMER_UNLINK_INTR, true);
+		++fusbh200->intr_unlink_cycle;
+	}
+	fusbh200->intr_unlinking = false;
+}
+
+
+/*
+ * Start another free-iTDs cycle, unless one is already pending: remember
+ * the current tail of cached_itd_list and arm the FREE_ITDS timer event.
+ */
+static void start_free_itds(struct fusbh200_hcd *fusbh200)
+{
+	if (fusbh200->enabled_hrtimer_events & BIT(FUSBH200_HRTIMER_FREE_ITDS))
+		return;
+
+	fusbh200->last_itd_to_free = list_entry(fusbh200->cached_itd_list.prev,
+			struct fusbh200_itd, itd_list);
+	fusbh200_enable_event(fusbh200, FUSBH200_HRTIMER_FREE_ITDS, true);
+}
+
+/*
+ * Free cached iTDs up to and including the one recorded by
+ * start_free_itds().  If the root hub is not running no stop marker is
+ * used, so the whole list is freed.  Starts another cycle when entries
+ * remain afterwards.
+ */
+static void end_free_itds(struct fusbh200_hcd *fusbh200)
+{
+	struct fusbh200_itd		*cur, *tmp;
+
+	if (fusbh200->rh_state < FUSBH200_RH_RUNNING)
+		fusbh200->last_itd_to_free = NULL;
+
+	list_for_each_entry_safe(cur, tmp, &fusbh200->cached_itd_list, itd_list) {
+		/* decide before freeing whether this is the stop marker */
+		bool	is_last = (cur == fusbh200->last_itd_to_free);
+
+		list_del(&cur->itd_list);
+		dma_pool_free(fusbh200->itd_pool, cur, cur->itd_dma);
+		if (is_last)
+			break;
+	}
+
+	if (!list_empty(&fusbh200->cached_itd_list))
+		start_free_itds(fusbh200);
+}
+
+
+/* Handle lost (or very late) IAA interrupts */
+static void fusbh200_iaa_watchdog(struct fusbh200_hcd *fusbh200)
+{
+	u32	cmd, status;
+
+	/*
+	 * Lost IAA irqs wedge things badly; seen first with a vt8235.
+	 * So we need this watchdog, but must protect it against both
+	 * (a) SMP races against real IAA firing and retriggering, and
+	 * (b) clean HC shutdown, when IAA watchdog was pending.
+	 * Hence: nothing to do unless the controller is running and an
+	 * async unlink is actually waiting on IAA.
+	 */
+	if (fusbh200->rh_state != FUSBH200_RH_RUNNING || !fusbh200->async_iaa)
+		return;
+
+	/* If we get here, IAA is *REALLY* late.  It's barely
+	 * conceivable that the system is so busy that CMD_IAAD
+	 * is still legitimately set, so let's be sure it's
+	 * clear before we read STS_IAA.  (The HC should clear
+	 * CMD_IAAD when it sets STS_IAA.)
+	 */
+	cmd = fusbh200_readl(fusbh200, &fusbh200->regs->command);
+
+	/*
+	 * If IAA is set here it either legitimately triggered
+	 * after the watchdog timer expired (_way_ late, so we'll
+	 * still count it as lost) ... or a silicon erratum:
+	 * - VIA seems to set IAA without triggering the IRQ;
+	 * - IAAD potentially cleared without setting IAA.
+	 */
+	status = fusbh200_readl(fusbh200, &fusbh200->regs->status);
+	if ((status & STS_IAA) || !(cmd & CMD_IAAD)) {
+		COUNT(fusbh200->stats.lost_iaa);
+		fusbh200_writel(fusbh200, STS_IAA, &fusbh200->regs->status);
+	}
+
+	fusbh200_vdbg(fusbh200, "IAA watchdog: status %x cmd %x\n",
+			status, cmd);
+	end_unlink_async(fusbh200);
+}
+
+
+/* Enable the I/O watchdog, if appropriate */
+static void turn_on_io_watchdog(struct fusbh200_hcd *fusbh200)
+{
+	bool	wanted;
+
+	/* Not needed if the controller isn't running ... */
+	if (fusbh200->rh_state != FUSBH200_RH_RUNNING)
+		return;
+	/* ... or if the watchdog is already enabled */
+	if (fusbh200->enabled_hrtimer_events & BIT(FUSBH200_HRTIMER_IO_WATCHDOG))
+		return;
+
+	/*
+	 * Isochronous transfers always need the watchdog.
+	 * For other sorts we use it only if the flag is set.
+	 */
+	wanted = fusbh200->isoc_count > 0;
+	if (!wanted && fusbh200->need_io_watchdog)
+		wanted = fusbh200->async_count + fusbh200->intr_count > 0;
+
+	if (wanted)
+		fusbh200_enable_event(fusbh200, FUSBH200_HRTIMER_IO_WATCHDOG, true);
+}
+
+
+/*
+ * Handler functions for the hrtimer event types.
+ * Keep this array in the same order as the event types indexed by
+ * enum fusbh200_hrtimer_event in fusbh200.h.
+ *
+ * The table is never modified at run time, so the pointers are const.
+ */
+static void (* const event_handlers[])(struct fusbh200_hcd *) = {
+	fusbh200_poll_ASS,			/* FUSBH200_HRTIMER_POLL_ASS */
+	fusbh200_poll_PSS,			/* FUSBH200_HRTIMER_POLL_PSS */
+	fusbh200_handle_controller_death,	/* FUSBH200_HRTIMER_POLL_DEAD */
+	fusbh200_handle_intr_unlinks,	/* FUSBH200_HRTIMER_UNLINK_INTR */
+	end_free_itds,			/* FUSBH200_HRTIMER_FREE_ITDS */
+	unlink_empty_async,		/* FUSBH200_HRTIMER_ASYNC_UNLINKS */
+	fusbh200_iaa_watchdog,		/* FUSBH200_HRTIMER_IAA_WATCHDOG */
+	fusbh200_disable_PSE,		/* FUSBH200_HRTIMER_DISABLE_PERIODIC */
+	fusbh200_disable_ASE,		/* FUSBH200_HRTIMER_DISABLE_ASYNC */
+	fusbh200_work,			/* FUSBH200_HRTIMER_IO_WATCHDOG */
+};
+
+/*
+ * hrtimer callback.  Takes a snapshot of the enabled events, clears the
+ * enabled set, then for each event in the snapshot either runs its handler
+ * (expiry reached) or re-enables it with its stored expiry.  All of this
+ * happens under the controller lock; the timer itself is never restarted
+ * by returning a restart code.
+ */
+static enum hrtimer_restart fusbh200_hrtimer_func(struct hrtimer *t)
+{
+	struct fusbh200_hcd	*fusbh200 = container_of(t, struct fusbh200_hcd, hrtimer);
+	unsigned long	pending;
+	unsigned long	flags;
+	unsigned	ev;
+	ktime_t		now;
+
+	spin_lock_irqsave(&fusbh200->lock, flags);
+
+	pending = fusbh200->enabled_hrtimer_events;
+	fusbh200->enabled_hrtimer_events = 0;
+	fusbh200->next_hrtimer_event = FUSBH200_HRTIMER_NO_EVENT;
+
+	now = ktime_get();
+	for_each_set_bit(ev, &pending, FUSBH200_HRTIMER_NUM_EVENTS) {
+		if (fusbh200->hr_timeouts[ev].tv64 > now.tv64) {
+			/* not due yet: keep it pending, expiry unchanged */
+			fusbh200_enable_event(fusbh200, ev, false);
+		} else {
+			event_handlers[ev](fusbh200);
+		}
+	}
+
+	spin_unlock_irqrestore(&fusbh200->lock, flags);
+	return HRTIMER_NORESTART;
+}
+
+/*-------------------------------------------------------------------------*/
+
+/*
+ * No bus suspend/resume routines are implemented; the names expand to NULL.
+ * NOTE(review): presumably consumed by a driver operations table outside
+ * this part of the file - confirm.
+ */
+#define fusbh200_bus_suspend	NULL
+#define fusbh200_bus_resume	NULL
+
+/*-------------------------------------------------------------------------*/
+
+/*
+ * Log the outcome of a port reset.  @port_status is always returned
+ * unchanged; nothing is logged when the port shows no connection.
+ * @status_reg is not used.
+ */
+static int check_reset_complete (
+	struct fusbh200_hcd	*fusbh200,
+	int		index,
+	u32 __iomem	*status_reg,
+	int		port_status
+) {
+	if (!(port_status & PORT_CONNECT))
+		return port_status;
+
+	if (port_status & PORT_PE) {
+		fusbh200_dbg(fusbh200, "port %d reset complete, port enabled\n",
+			index + 1);
+	} else {
+		/*
+		 * reset finished and it's still not enabled; with an
+		 * integrated TT, there's nobody to hand the port to!
+		 */
+		fusbh200_dbg (fusbh200,
+			"Failed to enable port %d on root hub TT\n",
+			index+1);
+	}
+
+	return port_status;
+}
+
+/*-------------------------------------------------------------------------*/
+
+
+/*
+ * build "status change" packet from HC registers
+ *
+ * Writes buf[0]: bit 1 is set when the port reports a connect or enable
+ * change, has a pending suspend-change flag, or has a reset whose deadline
+ * has been reached.  Returns 1 in that case, and also when a resume is in
+ * progress (so the core keeps polling); returns 0 otherwise.
+ */
+
+static int
+fusbh200_hub_status_data (struct usb_hcd *hcd, char *buf)
+{
+	struct fusbh200_hcd	*fusbh200 = hcd_to_fusbh200 (hcd);
+	const u32	change_bits = PORT_CSC | PORT_PEC;
+	u32		portsc, report;
+	unsigned long	flags;
+	int		changed;
+
+	/* init status to no-changes */
+	buf [0] = 0;
+
+	/* resumes-in-progress alone are enough for a non-zero return */
+	report = fusbh200->resuming_ports;
+
+	/* no hub change reports (bit 0) for now (power, ...) */
+
+	spin_lock_irqsave (&fusbh200->lock, flags);
+
+	portsc = fusbh200_readl(fusbh200, &fusbh200->regs->port_status);
+
+	changed = (portsc & change_bits) != 0;
+	if (!changed)
+		changed = test_bit(0, &fusbh200->port_c_suspend);
+	if (!changed && fusbh200->reset_done[0])
+		changed = time_after_eq(jiffies, fusbh200->reset_done[0]);
+
+	if (changed) {
+		buf [0] |= 1 << 1;
+		report = STS_PCD;
+	}
+	/* FIXME autosuspend idle root hubs */
+	spin_unlock_irqrestore (&fusbh200->lock, flags);
+
+	return report ? 1 : 0;
+}
+
+/*-------------------------------------------------------------------------*/
+
+/*
+ * Fill in the root hub descriptor for this controller.
+ *
+ * The port count comes from HCS_N_PORTS(hcs_params).  Each of the two
+ * trailing bitmaps occupies 1 + ports/8 bytes, and the descriptor length
+ * is the 7 fixed bytes plus both bitmaps.
+ */
+static void
+fusbh200_hub_descriptor (
+	struct fusbh200_hcd		*fusbh200,
+	struct usb_hub_descriptor	*desc
+) {
+	int		ports = HCS_N_PORTS (fusbh200->hcs_params);
+	u16		bitmap_len = 1 + (ports / 8);
+	u16		characteristics;
+
+	desc->bDescriptorType = 0x29;
+	desc->bDescLength = 7 + 2 * bitmap_len;
+	desc->bNbrPorts = ports;
+	desc->bPwrOn2PwrGood = 10;	/* fusbh200 1.0, 2.3.9 says 20ms max */
+	desc->bHubContrCurrent = 0;
+
+	/* first bitmap: ports removable -- all bits clear */
+	memset(&desc->u.hs.DeviceRemovable[0], 0, bitmap_len);
+	/* second bitmap: usb 1.0 legacy PortPwrCtrlMask -- all bits set */
+	memset(&desc->u.hs.DeviceRemovable[bitmap_len], 0xff, bitmap_len);
+
+	characteristics = 0x0008;	/* per-port overcurrent reporting */
+	characteristics |= 0x0002;	/* no power switching */
+	desc->wHubCharacteristics = cpu_to_le16(characteristics);
+}
+
+/*-------------------------------------------------------------------------*/
+
+/*
+ * Handle a control request addressed to the root hub.
+ *
+ * @hcd:     host controller the root hub belongs to
+ * @typeReq: request selector; the switch below handles ClearHubFeature,
+ *           ClearPortFeature, GetHubDescriptor, GetHubStatus, GetPortStatus,
+ *           SetHubFeature and SetPortFeature
+ * @wValue:  feature selector for the Set/Clear requests
+ * @wIndex:  one-based port number; for SetPortFeature the high byte is
+ *           taken as the test-mode selector
+ * @buf:     output buffer for the Get* requests
+ * @wLength: not used by this function
+ *
+ * Returns 0 on success and -EPIPE ("stall") for unknown requests, port
+ * numbers outside 1..HCS_N_PORTS, unsupported feature selectors, requests
+ * rejected because of the current port state, or handshake timeouts.
+ *
+ * The whole switch runs under fusbh200->lock with interrupts disabled;
+ * the USB_PORT_FEAT_TEST path drops and retakes the lock around
+ * fusbh200_quiesce() and fusbh200_halt().
+ *
+ * Note that status_reg always points at the single port_status register:
+ * wIndex is range-checked and used to index the per-port software state
+ * (reset_done[], the *_ports bitmaps), not to select a register.
+ */
+static int fusbh200_hub_control (
+	struct usb_hcd	*hcd,
+	u16		typeReq,
+	u16		wValue,
+	u16		wIndex,
+	char		*buf,
+	u16		wLength
+) {
+	struct fusbh200_hcd	*fusbh200 = hcd_to_fusbh200 (hcd);
+	int		ports = HCS_N_PORTS (fusbh200->hcs_params);
+	u32 __iomem	*status_reg = &fusbh200->regs->port_status;
+	u32		temp, temp1, status;
+	unsigned long	flags;
+	int		retval = 0;
+	unsigned	selector;
+
+	/*
+	 * FIXME:  support SetPortFeatures USB_PORT_FEAT_INDICATOR.
+	 * HCS_INDICATOR may say we can change LEDs to off/amber/green.
+	 * (track current state ourselves) ... blink for diagnostics,
+	 * power, "this is the one", etc.  EHCI spec supports this.
+	 */
+
+	spin_lock_irqsave (&fusbh200->lock, flags);
+	switch (typeReq) {
+	case ClearHubFeature:
+		switch (wValue) {
+		case C_HUB_LOCAL_POWER:
+		case C_HUB_OVER_CURRENT:
+			/* no hub-wide feature/status flags */
+			break;
+		default:
+			goto error;
+		}
+		break;
+	case ClearPortFeature:
+		if (!wIndex || wIndex > ports)
+			goto error;
+		wIndex--;
+		temp = fusbh200_readl(fusbh200, status_reg);
+		/*
+		 * Mask the PORT_RWC_BITS out of the value we write back, so
+		 * only the bit explicitly OR-ed in below is written as 1.
+		 */
+		temp &= ~PORT_RWC_BITS;
+
+		/*
+		 * Even if OWNER is set, so the port is owned by the
+		 * companion controller, khubd needs to be able to clear
+		 * the port-change status bits (especially
+		 * USB_PORT_STAT_C_CONNECTION).
+		 */
+
+		switch (wValue) {
+		case USB_PORT_FEAT_ENABLE:
+			fusbh200_writel(fusbh200, temp & ~PORT_PE, status_reg);
+			break;
+		case USB_PORT_FEAT_C_ENABLE:
+			fusbh200_writel(fusbh200, temp | PORT_PEC, status_reg);
+			break;
+		case USB_PORT_FEAT_SUSPEND:
+			/* clearing SUSPEND == start resuming the port */
+			if (temp & PORT_RESET)
+				goto error;
+			/* not suspended: nothing to do, and not an error */
+			if (!(temp & PORT_SUSPEND))
+				break;
+			if ((temp & PORT_PE) == 0)
+				goto error;
+
+			/* resume signaling for 20 msec */
+			fusbh200_writel(fusbh200, temp | PORT_RESUME, status_reg);
+			/* GetPortStatus finishes the resume once this deadline passes */
+			fusbh200->reset_done[wIndex] = jiffies
+					+ msecs_to_jiffies(20);
+			break;
+		case USB_PORT_FEAT_C_SUSPEND:
+			/* software-tracked change bit; no register write */
+			clear_bit(wIndex, &fusbh200->port_c_suspend);
+			break;
+		case USB_PORT_FEAT_C_CONNECTION:
+			fusbh200_writel(fusbh200, temp | PORT_CSC, status_reg);
+			break;
+		case USB_PORT_FEAT_C_OVER_CURRENT:
+			/*
+			 * NOTE(review): temp holds the masked port_status
+			 * value, yet it is OR-ed with BMISR_OVC and written
+			 * to the bmisr register -- confirm that writing the
+			 * port_status bits into bmisr is intended.
+			 */
+			fusbh200_writel(fusbh200, temp | BMISR_OVC, &fusbh200->regs->bmisr);
+			break;
+		case USB_PORT_FEAT_C_RESET:
+			/* GetPortStatus clears reset */
+			break;
+		default:
+			goto error;
+		}
+		fusbh200_readl(fusbh200, &fusbh200->regs->command);	/* unblock posted write */
+		break;
+	case GetHubDescriptor:
+		fusbh200_hub_descriptor (fusbh200, (struct usb_hub_descriptor *)
+			buf);
+		break;
+	case GetHubStatus:
+		/* no hub-wide feature/status flags */
+		memset (buf, 0, 4);
+		//cpu_to_le32s ((u32 *) buf);
+		break;
+	case GetPortStatus:
+		if (!wIndex || wIndex > ports)
+			goto error;
+		wIndex--;
+		/* low 16 bits: wPortStatus; high 16 bits: wPortChange */
+		status = 0;
+		temp = fusbh200_readl(fusbh200, status_reg);
+
+		// wPortChange bits
+		if (temp & PORT_CSC)
+			status |= USB_PORT_STAT_C_CONNECTION << 16;
+		if (temp & PORT_PEC)
+			status |= USB_PORT_STAT_C_ENABLE << 16;
+
+		temp1 = fusbh200_readl(fusbh200, &fusbh200->regs->bmisr);
+		if (temp1 & BMISR_OVC)
+			status |= USB_PORT_STAT_C_OVERCURRENT << 16;
+
+		/* whoever resumes must GetPortStatus to complete it!! */
+		if (temp & PORT_RESUME) {
+
+			/* Remote Wakeup received? */
+			if (!fusbh200->reset_done[wIndex]) {
+				/* resume signaling for 20 msec */
+				fusbh200->reset_done[wIndex] = jiffies
+						+ msecs_to_jiffies(20);
+				/* check the port again */
+				mod_timer(&fusbh200_to_hcd(fusbh200)->rh_timer,
+						fusbh200->reset_done[wIndex]);
+			}
+
+			/* resume completed? */
+			else if (time_after_eq(jiffies,
+					fusbh200->reset_done[wIndex])) {
+				clear_bit(wIndex, &fusbh200->suspended_ports);
+				set_bit(wIndex, &fusbh200->port_c_suspend);
+				fusbh200->reset_done[wIndex] = 0;
+
+				/* stop resume signaling */
+				temp = fusbh200_readl(fusbh200, status_reg);
+				fusbh200_writel(fusbh200,
+					temp & ~(PORT_RWC_BITS | PORT_RESUME),
+					status_reg);
+				clear_bit(wIndex, &fusbh200->resuming_ports);
+				/* wait, with the lock held, for PORT_RESUME to read back as 0 */
+				retval = handshake(fusbh200, status_reg,
+					   PORT_RESUME, 0, 2000 /* 2msec */);
+				if (retval != 0) {
+					fusbh200_err(fusbh200,
+						"port %d resume error %d\n",
+						wIndex + 1, retval);
+					/* the error label replaces retval with -EPIPE */
+					goto error;
+				}
+				/* drop suspend/resume and bits 10-11 from the local copy */
+				temp &= ~(PORT_SUSPEND|PORT_RESUME|(3<<10));
+			}
+		}
+
+		/* whoever resets must GetPortStatus to complete it!! */
+		if ((temp & PORT_RESET)
+				&& time_after_eq(jiffies,
+					fusbh200->reset_done[wIndex])) {
+			status |= USB_PORT_STAT_C_RESET << 16;
+			fusbh200->reset_done [wIndex] = 0;
+			clear_bit(wIndex, &fusbh200->resuming_ports);
+
+			/* force reset to complete */
+			fusbh200_writel(fusbh200, temp & ~(PORT_RWC_BITS | PORT_RESET),
+					status_reg);
+			/* REVISIT:  some hardware needs 550+ usec to clear
+			 * this bit; seems too long to spin routinely...
+			 */
+			retval = handshake(fusbh200, status_reg,
+					PORT_RESET, 0, 1000);
+			if (retval != 0) {
+				fusbh200_err (fusbh200, "port %d reset error %d\n",
+					wIndex + 1, retval);
+				/* the error label replaces retval with -EPIPE */
+				goto error;
+			}
+
+			/* see what we found out */
+			temp = check_reset_complete (fusbh200, wIndex, status_reg,
+					fusbh200_readl(fusbh200, status_reg));
+		}
+
+		/* neither resuming nor resetting: drop any stale deadline */
+		if (!(temp & (PORT_RESUME|PORT_RESET))) {
+			fusbh200->reset_done[wIndex] = 0;
+			clear_bit(wIndex, &fusbh200->resuming_ports);
+		}
+
+		/* transfer dedicated ports to the companion hc */
+		if ((temp & PORT_CONNECT) &&
+				test_bit(wIndex, &fusbh200->companion_ports)) {
+			temp &= ~PORT_RWC_BITS;
+			fusbh200_writel(fusbh200, temp, status_reg);
+			fusbh200_dbg(fusbh200, "port %d --> companion\n", wIndex + 1);
+			temp = fusbh200_readl(fusbh200, status_reg);
+		}
+
+		/*
+		 * Even if OWNER is set, there's no harm letting khubd
+		 * see the wPortStatus values (they should all be 0 except
+		 * for PORT_POWER anyway).
+		 */
+
+		if (temp & PORT_CONNECT) {
+			status |= USB_PORT_STAT_CONNECTION;
+			status |= fusbh200_port_speed(fusbh200, temp);
+		}
+		if (temp & PORT_PE)
+			status |= USB_PORT_STAT_ENABLE;
+
+		/* maybe the port was unsuspended without our knowledge */
+		if (temp & (PORT_SUSPEND|PORT_RESUME)) {
+			status |= USB_PORT_STAT_SUSPEND;
+		} else if (test_bit(wIndex, &fusbh200->suspended_ports)) {
+			clear_bit(wIndex, &fusbh200->suspended_ports);
+			clear_bit(wIndex, &fusbh200->resuming_ports);
+			fusbh200->reset_done[wIndex] = 0;
+			if (temp & PORT_PE)
+				set_bit(wIndex, &fusbh200->port_c_suspend);
+		}
+
+		temp1 = fusbh200_readl(fusbh200, &fusbh200->regs->bmisr);
+		if (temp1 & BMISR_OVC)
+			status |= USB_PORT_STAT_OVERCURRENT;
+		if (temp & PORT_RESET)
+			status |= USB_PORT_STAT_RESET;
+		if (test_bit(wIndex, &fusbh200->port_c_suspend))
+			status |= USB_PORT_STAT_C_SUSPEND << 16;
+
+		/*
+		 * Without VERBOSE_DEBUG the "if" below guards only the
+		 * dbg_port() call; put_unaligned_le32() always runs.
+		 */
+#ifndef	VERBOSE_DEBUG
+	if (status & ~0xffff)	/* only if wPortChange is interesting */
+#endif
+		dbg_port (fusbh200, "GetStatus", wIndex + 1, temp);
+		put_unaligned_le32(status, buf);
+		break;
+	case SetHubFeature:
+		switch (wValue) {
+		case C_HUB_LOCAL_POWER:
+		case C_HUB_OVER_CURRENT:
+			/* no hub-wide feature/status flags */
+			break;
+		default:
+			goto error;
+		}
+		break;
+	case SetPortFeature:
+		/* high byte: test-mode selector; low byte: one-based port number */
+		selector = wIndex >> 8;
+		wIndex &= 0xff;
+
+		if (!wIndex || wIndex > ports)
+			goto error;
+		wIndex--;
+		temp = fusbh200_readl(fusbh200, status_reg);
+		temp &= ~PORT_RWC_BITS;
+		switch (wValue) {
+		case USB_PORT_FEAT_SUSPEND:
+			if ((temp & PORT_PE) == 0
+					|| (temp & PORT_RESET) != 0)
+				goto error;
+
+			/* After above check the port must be connected.
+			 * Set appropriate bit thus could put phy into low power
+			 * mode if we have hostpc feature
+			 */
+			fusbh200_writel(fusbh200, temp | PORT_SUSPEND, status_reg);
+			set_bit(wIndex, &fusbh200->suspended_ports);
+			break;
+		case USB_PORT_FEAT_RESET:
+			if (temp & PORT_RESUME)
+				goto error;
+			/* line status bits may report this as low speed,
+			 * which can be fine if this root hub has a
+			 * transaction translator built in.
+			 */
+			fusbh200_vdbg (fusbh200, "port %d reset\n", wIndex + 1);
+			temp |= PORT_RESET;
+			temp &= ~PORT_PE;
+
+			/*
+			 * caller must wait, then call GetPortStatus
+			 * usb 2.0 spec says 50 ms resets on root
+			 */
+			fusbh200->reset_done [wIndex] = jiffies
+					+ msecs_to_jiffies (50);
+			fusbh200_writel(fusbh200, temp, status_reg);
+			break;
+
+		/* For downstream facing ports (these):  one hub port is put
+		 * into test mode according to USB2 11.24.2.13, then the hub
+		 * must be reset (which for root hub now means rmmod+modprobe,
+		 * or else system reboot).  See EHCI 2.3.9 and 4.14 for info
+		 * about the EHCI-specific stuff.
+		 */
+		case USB_PORT_FEAT_TEST:
+			if (!selector || selector > 5)
+				goto error;
+			/* the lock is dropped around fusbh200_quiesce() */
+			spin_unlock_irqrestore(&fusbh200->lock, flags);
+			fusbh200_quiesce(fusbh200);
+			spin_lock_irqsave(&fusbh200->lock, flags);
+
+			/* Put all enabled ports into suspend */
+			temp = fusbh200_readl(fusbh200, status_reg) & ~PORT_RWC_BITS;
+			if (temp & PORT_PE)
+				fusbh200_writel(fusbh200, temp | PORT_SUSPEND,
+						status_reg);
+
+			/* likewise, the lock is dropped around fusbh200_halt() */
+			spin_unlock_irqrestore(&fusbh200->lock, flags);
+			fusbh200_halt(fusbh200);
+			spin_lock_irqsave(&fusbh200->lock, flags);
+
+			/* the test selector is written starting at bit 16 */
+			temp = fusbh200_readl(fusbh200, status_reg);
+			temp |= selector << 16;
+			fusbh200_writel(fusbh200, temp, status_reg);
+			break;
+
+		default:
+			goto error;
+		}
+		fusbh200_readl(fusbh200, &fusbh200->regs->command);	/* unblock posted writes */
+		break;
+
+	default:
+error:
+		/* "stall" on error */
+		retval = -EPIPE;
+	}
+	spin_unlock_irqrestore (&fusbh200->lock, flags);
+	return retval;
+}
+
+/* Port relinquish hook: deliberately does nothing. */
+static void __maybe_unused fusbh200_relinquish_port(struct usb_hcd *hcd,
+		int portnum)
+{
+}
+
+/* Port hand-over query: always answers 0, whatever the port number. */
+static int __maybe_unused
+fusbh200_port_handed_over(struct usb_hcd *hcd, int portnum)
+{
+	return 0;
+}
+/*-------------------------------------------------------------------------*/
+/*
+ * There are basically three types of memory:
+ *	- data used only by the HCD ... kmalloc is fine
+ *	- async and periodic schedules, shared by HC and HCD ... these
+ *	  need to use dma_pool or dma_alloc_coherent
+ *	- driver buffers, read/written by HC ... single shot DMA mapped
+ *
+ * There's also "register" data (e.g. PCI or SOC), which is memory mapped.
+ * No memory seen by this driver is pageable.
+ */
+
+/*-------------------------------------------------------------------------*/
+
+/* Allocate the key transfer structures from the previously allocated pool */
+
+/*
+ * Reset a qtd to its pristine state: zero it, record its DMA address,
+ * mark the token halted and terminate both hardware next pointers.
+ */
+static inline void fusbh200_qtd_init(struct fusbh200_hcd *fusbh200, struct fusbh200_qtd *qtd,
+				  dma_addr_t dma)
+{
+	memset(qtd, 0, sizeof(*qtd));
+
+	/* software-side bookkeeping */
+	INIT_LIST_HEAD(&qtd->qtd_list);
+	qtd->qtd_dma = dma;
+
+	/* hardware-visible fields */
+	qtd->hw_next = FUSBH200_LIST_END(fusbh200);
+	qtd->hw_alt_next = FUSBH200_LIST_END(fusbh200);
+	qtd->hw_token = cpu_to_hc32(fusbh200, QTD_STS_HALT);
+}
+
+/*
+ * Take one qtd from fusbh200->qtd_pool and initialise it.
+ * Returns NULL when the pool allocation fails.
+ */
+static struct fusbh200_qtd *fusbh200_qtd_alloc (struct fusbh200_hcd *fusbh200, gfp_t flags)
+{
+	dma_addr_t		qtd_dma;
+	struct fusbh200_qtd	*new_qtd;
+
+	new_qtd = dma_pool_alloc(fusbh200->qtd_pool, flags, &qtd_dma);
+	if (new_qtd == NULL)
+		return NULL;
+
+	fusbh200_qtd_init(fusbh200, new_qtd, qtd_dma);
+	return new_qtd;
+}
+
+/* Give a qtd back to fusbh200->qtd_pool, using the DMA address stored in it. */
+static inline void
+fusbh200_qtd_free (struct fusbh200_hcd *fusbh200, struct fusbh200_qtd *qtd)
+{
+	dma_pool_free(fusbh200->qtd_pool, qtd, qtd->qtd_dma);
+}
+
+
+/*
+ * Release a queue head: its dummy qtd (if any), the hardware part from
+ * fusbh200->qh_pool, and the software struct itself.
+ *
+ * The qh must have no queued qtds and must not be linked (qh_next.ptr
+ * clear); anything else is treated as a fatal bug.
+ */
+static void qh_destroy(struct fusbh200_hcd *fusbh200, struct fusbh200_qh *qh)
+{
+	/* the qtds must already be gone and the qh unlinked */
+	if (qh->qh_next.ptr || !list_empty(&qh->qtd_list)) {
+		fusbh200_dbg (fusbh200, "unused qh not empty!\n");
+		BUG ();
+	}
+
+	if (qh->dummy != NULL)
+		fusbh200_qtd_free(fusbh200, qh->dummy);
+
+	dma_pool_free(fusbh200->qh_pool, qh->hw, qh->qh_dma);
+	kfree(qh);
+}
+
+/*
+ * Allocate and initialise a queue head.
+ *
+ * Three pieces are allocated, all with the caller's @flags: the software
+ * bookkeeping struct, the hardware-visible part (from fusbh200->qh_pool,
+ * zeroed here) and a dummy qtd that enables safe urb queuing.
+ *
+ * The bookkeeping struct used to be allocated with a hard-coded
+ * GFP_ATOMIC even though both DMA-pool allocations below already honour
+ * @flags; it now follows @flags as well, so a caller that may sleep is
+ * not forced onto the atomic reserves.
+ *
+ * Returns the new qh, or NULL if any allocation fails (everything
+ * allocated so far is released first).
+ */
+static struct fusbh200_qh *fusbh200_qh_alloc (struct fusbh200_hcd *fusbh200, gfp_t flags)
+{
+	struct fusbh200_qh		*qh;
+	dma_addr_t		dma;
+
+	qh = kzalloc(sizeof *qh, flags);
+	if (!qh)
+		return NULL;
+
+	qh->hw = (struct fusbh200_qh_hw *)
+		dma_pool_alloc(fusbh200->qh_pool, flags, &dma);
+	if (!qh->hw)
+		goto fail;
+	memset(qh->hw, 0, sizeof *qh->hw);
+	qh->qh_dma = dma;
+	INIT_LIST_HEAD (&qh->qtd_list);
+
+	/* dummy td enables safe urb queuing */
+	qh->dummy = fusbh200_qtd_alloc (fusbh200, flags);
+	if (qh->dummy == NULL) {
+		fusbh200_dbg (fusbh200, "no dummy td\n");
+		goto fail1;
+	}
+
+	return qh;
+
+fail1:
+	dma_pool_free(fusbh200->qh_pool, qh->hw, qh->qh_dma);
+fail:
+	kfree(qh);
+	return NULL;
+}
+
+/*-------------------------------------------------------------------------*/
+
+/* The queue heads and transfer descriptors are managed from pools tied
+ * to each of the "per device" structures.
+ * This is the initialisation and cleanup code.
+ */
+
+/*
+ * Undo fusbh200_mem_init(): destroy the async and dummy queue heads, the
+ * three DMA pools, the hardware periodic table and its software shadow.
+ * Every pointer is reset to NULL, and members that are already NULL are
+ * skipped, so this is usable on a partially initialised controller.
+ */
+static void fusbh200_mem_cleanup (struct fusbh200_hcd *fusbh200)
+{
+	/* queue heads first: they return memory to the pools below */
+	if (fusbh200->async) {
+		qh_destroy(fusbh200, fusbh200->async);
+		fusbh200->async = NULL;
+	}
+
+	if (fusbh200->dummy) {
+		qh_destroy(fusbh200, fusbh200->dummy);
+		fusbh200->dummy = NULL;
+	}
+
+	/* DMA consistent memory and pools */
+	if (fusbh200->qtd_pool) {
+		dma_pool_destroy(fusbh200->qtd_pool);
+		fusbh200->qtd_pool = NULL;
+	}
+
+	if (fusbh200->qh_pool) {
+		dma_pool_destroy(fusbh200->qh_pool);
+		fusbh200->qh_pool = NULL;
+	}
+
+	if (fusbh200->itd_pool) {
+		dma_pool_destroy(fusbh200->itd_pool);
+		fusbh200->itd_pool = NULL;
+	}
+
+	/* hardware periodic table */
+	if (fusbh200->periodic) {
+		dma_free_coherent(fusbh200_to_hcd(fusbh200)->self.controller,
+			fusbh200->periodic_size * sizeof (u32),
+			fusbh200->periodic, fusbh200->periodic_dma);
+		fusbh200->periodic = NULL;
+	}
+
+	/* shadow periodic table */
+	kfree(fusbh200->pshadow);
+	fusbh200->pshadow = NULL;
+}
+
+/*
+ * Create the qtd, qh and itd DMA pools, the async queue head, the
+ * hardware periodic table (every entry set to the list-end marker) and
+ * its software shadow.
+ *
+ * Returns 0 on success.  On any failure everything allocated so far is
+ * released through fusbh200_mem_cleanup() and -ENOMEM is returned.
+ *
+ * Remember to add cleanup code (above) if you add anything here.
+ */
+static int fusbh200_mem_init (struct fusbh200_hcd *fusbh200, gfp_t flags)
+{
+	int slot;
+
+	/* QTDs for control/bulk/intr transfers */
+	fusbh200->qtd_pool = dma_pool_create ("fusbh200_qtd",
+			fusbh200_to_hcd(fusbh200)->self.controller,
+			sizeof (struct fusbh200_qtd),
+			32 /* byte alignment (for hw parts) */,
+			4096 /* can't cross 4K */);
+	if (fusbh200->qtd_pool == NULL)
+		goto fail;
+
+	/* QHs for control/bulk/intr transfers */
+	fusbh200->qh_pool = dma_pool_create ("fusbh200_qh",
+			fusbh200_to_hcd(fusbh200)->self.controller,
+			sizeof(struct fusbh200_qh_hw),
+			32 /* byte alignment (for hw parts) */,
+			4096 /* can't cross 4K */);
+	if (fusbh200->qh_pool == NULL)
+		goto fail;
+
+	fusbh200->async = fusbh200_qh_alloc (fusbh200, flags);
+	if (fusbh200->async == NULL)
+		goto fail;
+
+	/* ITD for high speed ISO transfers */
+	fusbh200->itd_pool = dma_pool_create ("fusbh200_itd",
+			fusbh200_to_hcd(fusbh200)->self.controller,
+			sizeof (struct fusbh200_itd),
+			64 /* byte alignment (for hw parts) */,
+			4096 /* can't cross 4K */);
+	if (fusbh200->itd_pool == NULL)
+		goto fail;
+
+	/* Hardware periodic table */
+	fusbh200->periodic = (__le32 *)
+		dma_alloc_coherent (fusbh200_to_hcd(fusbh200)->self.controller,
+			fusbh200->periodic_size * sizeof(__le32),
+			&fusbh200->periodic_dma, 0);
+	if (!fusbh200->periodic)
+		goto fail;
+
+	/* every frame slot starts out empty */
+	for (slot = 0; slot < fusbh200->periodic_size; slot++)
+		fusbh200->periodic[slot] = FUSBH200_LIST_END(fusbh200);
+
+	/* software shadow of hardware table */
+	fusbh200->pshadow = kcalloc(fusbh200->periodic_size, sizeof(void *), flags);
+	if (!fusbh200->pshadow)
+		goto fail;
+
+	return 0;
+
+fail:
+	fusbh200_dbg (fusbh200, "couldn't init memory\n");
+	fusbh200_mem_cleanup (fusbh200);
+	return -ENOMEM;
+}
+/*-------------------------------------------------------------------------*/
+/*
+ * EHCI hardware queue manipulation ... the core.  QH/QTD manipulation.
+ *
+ * Control, bulk, and interrupt traffic all use "qh" lists.  They list "qtd"
+ * entries describing USB transactions, max 16-20kB/entry (with 4kB-aligned
+ * buffers needed for the larger number).  We use one QH per endpoint, queue
+ * multiple urbs (all three types) per endpoint.  URBs may need several qtds.
+ *
+ * ISO traffic uses "ISO TD" (itd) records, and (along with
+ * interrupts) needs careful scheduling.  Performance improvements can be
+ * an ongoing challenge.  That's in "ehci-sched.c".
+ *
+ * USB 1.1 devices are handled (a) by "companion" OHCI or UHCI root hubs,
+ * or otherwise through transaction translators (TTs) in USB 2.0 hubs using
+ * (b) special fields in qh entries or (c) split iso entries.  TTs will
+ * buffer low/full speed data so the host collects it at high speed.
+ */
+
+/*-------------------------------------------------------------------------*/
+
+/*
+ * Fill a qtd's buffer pointers and token for (part of) a transfer.
+ *
+ * Up to five buffer entries are used: the first covers the rest of the
+ * page @buf starts in, each later one a full 4K page.  If the whole
+ * @len does not fit, the byte count is rounded down to a multiple of
+ * @maxpacket, since short packets may only terminate transfers.
+ *
+ * Returns the number of bytes queued in this qtd (also stored in
+ * qtd->length and, shifted left by 16, in the token).
+ */
+
+static int
+qtd_fill(struct fusbh200_hcd *fusbh200, struct fusbh200_qtd *qtd, dma_addr_t buf,
+		  size_t len, int token, int maxpacket)
+{
+	u64	addr = buf;
+	int	count;
+	int	page;
+
+	/* one buffer entry per 4K ... first might be short or unaligned */
+	qtd->hw_buf[0] = cpu_to_hc32(fusbh200, (u32)addr);
+	qtd->hw_buf_hi[0] = cpu_to_hc32(fusbh200, (u32)(addr >> 32));
+
+	/* bytes left in the first page */
+	count = 0x1000 - (buf & 0x0fff);
+
+	if (likely (len < count)) {
+		/* everything fits in the first page */
+		count = len;
+	} else {
+		/* continue at the start of the following page */
+		buf +=  0x1000;
+		buf &= ~0x0fff;
+
+		/* per-qtd limit: from 16K to 20K (best alignment) */
+		for (page = 1; count < len && page < 5; page++) {
+			addr = buf;
+			qtd->hw_buf[page] = cpu_to_hc32(fusbh200, (u32)addr);
+			qtd->hw_buf_hi[page] = cpu_to_hc32(fusbh200,
+					(u32)(addr >> 32));
+			buf += 0x1000;
+
+			if ((count + 0x1000) < len)
+				count += 0x1000;
+			else
+				count = len;
+		}
+
+		/* short packets may only terminate transfers */
+		if (count != len)
+			count -= (count % maxpacket);
+	}
+
+	qtd->hw_token = cpu_to_hc32(fusbh200, (count << 16) | token);
+	qtd->length = count;
+
+	return count;
+}
+
+/*-------------------------------------------------------------------------*/
+
+/*
+ * Point an idle qh's overlay at @qtd.
+ *
+ * Sets the overlay's next-qtd pointer to @qtd, terminates the alternate
+ * pointer, initialises the data toggle where the hardware maintains it,
+ * and finally clears every token bit except QTD_TOGGLE and QTD_STS_PING.
+ * The qh must be in QH_STATE_IDLE; anything else is a fatal bug.
+ */
+static inline void
+qh_update (struct fusbh200_hcd *fusbh200, struct fusbh200_qh *qh, struct fusbh200_qtd *qtd)
+{
+	struct fusbh200_qh_hw *overlay = qh->hw;
+
+	/* writes to an active overlay are unsafe */
+	BUG_ON(qh->qh_state != QH_STATE_IDLE);
+
+	overlay->hw_qtd_next = QTD_NEXT(fusbh200, qtd->qtd_dma);
+	overlay->hw_alt_next = FUSBH200_LIST_END(fusbh200);
+
+	/* Except for control endpoints, we make hardware maintain data
+	 * toggle (like OHCI) ... here (re)initialize the toggle in the QH,
+	 * and set the pseudo-toggle in udev. Only usb_clear_halt() will
+	 * ever clear it.
+	 */
+	if (!(overlay->hw_info1 & cpu_to_hc32(fusbh200, QH_TOGGLE_CTL))) {
+		unsigned	dir_out = qh->is_out;
+		unsigned	ep;
+
+		/* endpoint number lives in bits 8-11 of hw_info1 */
+		ep = (hc32_to_cpup(fusbh200, &overlay->hw_info1) >> 8) & 0x0f;
+		if (unlikely (!usb_gettoggle (qh->dev, ep, dir_out))) {
+			overlay->hw_token &= ~cpu_to_hc32(fusbh200, QTD_TOGGLE);
+			usb_settoggle (qh->dev, ep, dir_out, 1);
+		}
+	}
+
+	overlay->hw_token &= cpu_to_hc32(fusbh200, QTD_TOGGLE | QTD_STS_PING);
+}
+
+/*
+ * Re-point a qh's overlay at the first qtd it should execute: the dummy
+ * when the queue is empty, otherwise the first queued qtd.
+ *
+ * If it weren't for a common silicon quirk (writing the dummy into the qh
+ * overlay, so qh->hw_token wrongly becomes inactive/halted), only fault
+ * recovery (including urb dequeue) would need software changes to a QH...
+ */
+static void
+qh_refresh (struct fusbh200_hcd *fusbh200, struct fusbh200_qh *qh)
+{
+	struct fusbh200_qtd *first;
+
+	if (list_empty (&qh->qtd_list)) {
+		first = qh->dummy;
+	} else {
+		first = list_entry (qh->qtd_list.next,
+				struct fusbh200_qtd, qtd_list);
+		/*
+		 * The first qtd may already be partially processed.
+		 * If we come here during unlink, the QH overlay region
+		 * might still reference the just-unlinked qtd, which is
+		 * updated in qh_completions().  In that case only the
+		 * overlay's next pointer is patched; the rest of the
+		 * overlay is left alone.
+		 */
+		if (cpu_to_hc32(fusbh200, first->qtd_dma) == qh->hw->hw_current) {
+			qh->hw->hw_qtd_next = first->hw_next;
+			return;
+		}
+	}
+
+	if (first != NULL)
+		qh_update (fusbh200, qh, first);
+}
+
+/*-------------------------------------------------------------------------*/
+
+static void qh_link_async(struct fusbh200_hcd *fusbh200, struct fusbh200_qh *qh);
+
+/*
+ * Called when a clear-TT-buffer request for @ep has finished.
+ *
+ * Clears the qh's clearing_tt flag and, if the qh is idle with qtds
+ * still queued while the root hub is running, links it back onto the
+ * async schedule.  Runs under fusbh200->lock.
+ */
+static void fusbh200_clear_tt_buffer_complete(struct usb_hcd *hcd,
+		struct usb_host_endpoint *ep)
+{
+	struct fusbh200_hcd		*fusbh200 = hcd_to_fusbh200(hcd);
+	struct fusbh200_qh		*qh = ep->hcpriv;
+	unsigned long		irqflags;
+
+	spin_lock_irqsave(&fusbh200->lock, irqflags);
+
+	qh->clearing_tt = 0;
+	if (qh->qh_state == QH_STATE_IDLE) {
+		if (!list_empty(&qh->qtd_list)
+				&& fusbh200->rh_state == FUSBH200_RH_RUNNING)
+			qh_link_async(fusbh200, qh);
+	}
+
+	spin_unlock_irqrestore(&fusbh200->lock, irqflags);
+}
+
+/*
+ * Ask the hub to clear a transaction translator buffer for @urb.
+ *
+ * If an async split transaction gets an error or is unlinked, the TT
+ * buffer may be left in an indeterminate state and has to be cleared.
+ * Nothing is done when the device has no TT, the pipe is an interrupt
+ * pipe, a clear is already pending for this qh, or the TT belongs to
+ * the root hub.  qh->clearing_tt is set once the request was accepted.
+ *
+ * Note: this routine is never called for Isochronous transfers.
+ */
+static void fusbh200_clear_tt_buffer(struct fusbh200_hcd *fusbh200, struct fusbh200_qh *qh,
+		struct urb *urb, u32 token)
+{
+	if (!urb->dev->tt || usb_pipeint(urb->pipe) || qh->clearing_tt)
+		return;
+
+#ifdef DEBUG
+	{
+		struct usb_device *tt = urb->dev->tt->hub;
+
+		dev_dbg(&tt->dev,
+			"clear tt buffer port %d, a%d ep%d t%08x\n",
+			urb->dev->ttport, urb->dev->devnum,
+			usb_pipeendpoint(urb->pipe), token);
+	}
+#endif /* DEBUG */
+
+	/* the root hub's own TT is never sent a clear request */
+	if (urb->dev->tt->hub == fusbh200_to_hcd(fusbh200)->self.root_hub)
+		return;
+
+	if (usb_hub_clear_tt_buffer(urb) == 0)
+		qh->clearing_tt = 1;
+}
+
+/*
+ * Translate a completed qtd's token into an urb status and account the
+ * bytes it transferred.
+ *
+ * @length: number of bytes the qtd was set up for
+ * @token:  the qtd's token as read back (CPU byte order)
+ *
+ * Returns -EINPROGRESS when the qtd finished without a fault (or the urb
+ * is already unlinked), -EREMOTEIO for a short read, or an error code
+ * derived from the halt reason:
+ *   babble -> -EOVERFLOW, CERR nonzero (stall) -> -EPIPE,
+ *   data buffer error -> -ENOSR (IN) / -ECOMM (OUT),
+ *   everything else -> -EPROTO.
+ */
+static int qtd_copy_status (
+	struct fusbh200_hcd *fusbh200,
+	struct urb *urb,
+	size_t length,
+	u32 token
+)
+{
+	int	status = -EINPROGRESS;
+
+	/* count IN/OUT bytes, not SETUP (even short packets) */
+	if (likely (QTD_PID (token) != 2))
+		urb->actual_length += length - QTD_LENGTH (token);
+
+	/* don't modify error codes */
+	if (unlikely(urb->unlinked))
+		return status;
+
+	/* force cleanup after short read; not always an error */
+	if (unlikely (IS_SHORT_READ (token)))
+		status = -EREMOTEIO;
+
+	/* without HALT the hardware reported no serious fault */
+	if (!(token & QTD_STS_HALT))
+		return status;
+
+	/* serious "can't proceed" faults reported by the hardware.
+	 * In theory, more than one of the sticky error bits can be set
+	 * since the transaction is retried; which to test first is
+	 * rather arbitrary.
+	 */
+	if (token & QTD_STS_BABBLE) {
+		/* FIXME "must" disable babbling device's port too */
+		status = -EOVERFLOW;
+	} else if (QTD_CERR(token)) {
+		/* CERR nonzero + halt --> stall */
+		status = -EPIPE;
+	} else if (token & QTD_STS_MMF) {
+		/* fs/ls interrupt xfer missed the complete-split */
+		status = -EPROTO;
+	} else if (token & QTD_STS_DBE) {
+		if (QTD_PID (token) == 1)	/* IN ? */
+			status = -ENOSR;	/* hc couldn't read data */
+		else
+			status = -ECOMM;	/* hc couldn't write data */
+	} else {
+		if (token & QTD_STS_XACT) {
+			/* timeout, bad CRC, wrong PID, etc */
+			fusbh200_dbg(fusbh200, "devpath %s ep%d%s 3strikes\n",
+				urb->dev->devpath,
+				usb_pipeendpoint(urb->pipe),
+				usb_pipein(urb->pipe) ? "in" : "out");
+		}
+		/* transaction error or unknown cause */
+		status = -EPROTO;
+	}
+
+	fusbh200_vdbg (fusbh200,
+		"dev%d ep%d%s qtd token %08x --> status %d\n",
+		usb_pipedevice (urb->pipe),
+		usb_pipeendpoint (urb->pipe),
+		usb_pipein (urb->pipe) ? "in" : "out",
+		token, status);
+
+	return status;
+}
+
+/*
+ * Hand a finished urb back to the USB core.
+ *
+ * Updates the interrupt bandwidth count and the unlink/complete
+ * statistics, maps -EINPROGRESS and -EREMOTEIO to 0 for urbs that were
+ * not unlinked, then unlinks the urb from its endpoint and gives it
+ * back.  fusbh200->lock is dropped around the giveback because the
+ * completion handler can reenter this HCD.
+ */
+static void
+fusbh200_urb_done(struct fusbh200_hcd *fusbh200, struct urb *urb, int status)
+__releases(fusbh200->lock)
+__acquires(fusbh200->lock)
+{
+	struct fusbh200_qh	*qh = (struct fusbh200_qh *) urb->hcpriv;
+
+	/* S-mask in a QH means it's an interrupt urb:
+	 * update hc-wide periodic stats (for usbfs)
+	 */
+	if (likely (qh != NULL)
+			&& (qh->hw->hw_info2 & cpu_to_hc32(fusbh200, QH_SMASK)) != 0)
+		fusbh200_to_hcd(fusbh200)->self.bandwidth_int_reqs--;
+
+	if (likely(!urb->unlinked)) {
+		/* report non-error and short read status as zero */
+		if (status == -EINPROGRESS || status == -EREMOTEIO)
+			status = 0;
+		COUNT(fusbh200->stats.complete);
+	} else {
+		COUNT(fusbh200->stats.unlink);
+	}
+
+#ifdef FUSBH200_URB_TRACE
+	fusbh200_dbg (fusbh200,
+		"%s %s urb %p ep%d%s status %d len %d/%d\n",
+		__func__, urb->dev->devpath, urb,
+		usb_pipeendpoint (urb->pipe),
+		usb_pipein (urb->pipe) ? "in" : "out",
+		status,
+		urb->actual_length, urb->transfer_buffer_length);
+#endif
+
+	/* complete() can reenter this HCD */
+	usb_hcd_unlink_urb_from_ep(fusbh200_to_hcd(fusbh200), urb);
+	spin_unlock (&fusbh200->lock);
+	usb_hcd_giveback_urb(fusbh200_to_hcd(fusbh200), urb, status);
+	spin_lock (&fusbh200->lock);
+}
+
+static int qh_schedule (struct fusbh200_hcd *fusbh200, struct fusbh200_qh *qh);
+
+/*
+ * Process and free completed qtds for a qh, returning URBs to drivers.
+ * Chases up to qh->hw_current.  Returns number of completions called,
+ * indicating how much "real" work we did.
+ */
+static unsigned
+qh_completions (struct fusbh200_hcd *fusbh200, struct fusbh200_qh *qh)
+{
+	struct fusbh200_qtd		*last, *end = qh->dummy;
+	struct list_head	*entry, *tmp;
+	int			last_status;
+	int			stopped;
+	unsigned		count = 0;
+	u8			state;
+	struct fusbh200_qh_hw	*hw = qh->hw;
+
+	if (unlikely (list_empty (&qh->qtd_list)))
+		return count;
+
+	/* completions (or tasks on other cpus) must never clobber HALT
+	 * till we've gone through and cleaned everything up, even when
+	 * they add urbs to this qh's queue or mark them for unlinking.
+	 *
+	 * NOTE:  unlinking expects to be done in queue order.
+	 *
+	 * It's a bug for qh->qh_state to be anything other than
+	 * QH_STATE_IDLE, unless our caller is scan_async() or
+	 * scan_intr().
+	 */
+	state = qh->qh_state;
+	qh->qh_state = QH_STATE_COMPLETING;
+	stopped = (state == QH_STATE_IDLE);
+
+ rescan:
+	last = NULL;
+	last_status = -EINPROGRESS;
+	qh->needs_rescan = 0;
+
+	/* remove de-activated QTDs from front of queue.
+	 * after faults (including short reads), cleanup this urb
+	 * then let the queue advance.
+	 * if queue is stopped, handles unlinks.
+	 */
+	list_for_each_safe (entry, tmp, &qh->qtd_list) {
+		struct fusbh200_qtd	*qtd;
+		struct urb	*urb;
+		u32		token = 0;
+
+		qtd = list_entry (entry, struct fusbh200_qtd, qtd_list);
+		urb = qtd->urb;
+
+		/* clean up any state from previous QTD ...*/
+		if (last) {
+			if (likely (last->urb != urb)) {
+				fusbh200_urb_done(fusbh200, last->urb, last_status);
+				count++;
+				last_status = -EINPROGRESS;
+			}
+			fusbh200_qtd_free (fusbh200, last);
+			last = NULL;
+		}
+
+		/* ignore urbs submitted during completions we reported */
+		if (qtd == end)
+			break;
+
+		/* hardware copies qtd out of qh overlay */
+		rmb ();
+		token = hc32_to_cpu(fusbh200, qtd->hw_token);
+
+		/* always clean up qtds the hc de-activated */
+ retry_xacterr:
+		if ((token & QTD_STS_ACTIVE) == 0) {
+
+			/* Report Data Buffer Error: non-fatal but useful */
+			if (token & QTD_STS_DBE)
+				fusbh200_dbg(fusbh200,
+					"detected DataBufferErr for urb %p ep%d%s len %d, qtd %p [qh %p]\n",
+					urb,
+					usb_endpoint_num(&urb->ep->desc),
+					usb_endpoint_dir_in(&urb->ep->desc) ? "in" : "out",
+					urb->transfer_buffer_length,
+					qtd,
+					qh);
+
+			/* on STALL, error, and short reads this urb must
+			 * complete and all its qtds must be recycled.
+			 */
+			if ((token & QTD_STS_HALT) != 0) {
+
+				/* retry transaction errors until we
+				 * reach the software xacterr limit
+				 */
+				if ((token & QTD_STS_XACT) &&
+						QTD_CERR(token) == 0 &&
+						++qh->xacterrs < QH_XACTERR_MAX &&
+						!urb->unlinked) {
+					fusbh200_dbg(fusbh200,
+	"detected XactErr len %zu/%zu retry %d\n",
+	qtd->length - QTD_LENGTH(token), qtd->length, qh->xacterrs);
+
+					/* reset the token in the qtd and the
+					 * qh overlay (which still contains
+					 * the qtd) so that we pick up from
+					 * where we left off
+					 */
+					token &= ~QTD_STS_HALT;
+					token |= QTD_STS_ACTIVE |
+							(FUSBH200_TUNE_CERR << 10);
+					qtd->hw_token = cpu_to_hc32(fusbh200,
+							token);
+					wmb();
+					hw->hw_token = cpu_to_hc32(fusbh200,
+							token);
+					goto retry_xacterr;
+				}
+				stopped = 1;
+
+			/* magic dummy for some short reads; qh won't advance.
+			 * that silicon quirk can kick in with this dummy too.
+			 *
+			 * other short reads won't stop the queue, including
+			 * control transfers (status stage handles that) or
+			 * most other single-qtd reads ... the queue stops if
+			 * URB_SHORT_NOT_OK was set so the driver submitting
+			 * the urbs could clean it up.
+			 */
+			} else if (IS_SHORT_READ (token)
+					&& !(qtd->hw_alt_next
+						& FUSBH200_LIST_END(fusbh200))) {
+				stopped = 1;
+			}
+
+		/* stop scanning when we reach qtds the hc is using */
+		} else if (likely (!stopped
+				&& fusbh200->rh_state >= FUSBH200_RH_RUNNING)) {
+			break;
+
+		/* scan the whole queue for unlinks whenever it stops */
+		} else {
+			stopped = 1;
+
+			/* cancel everything if we halt, suspend, etc */
+			if (fusbh200->rh_state < FUSBH200_RH_RUNNING)
+				last_status = -ESHUTDOWN;
+
+			/* this qtd is active; skip it unless a previous qtd
+			 * for its urb faulted, or its urb was canceled.
+			 */
+			else if (last_status == -EINPROGRESS && !urb->unlinked)
+				continue;
+
+			/* qh unlinked; token in overlay may be most current */
+			if (state == QH_STATE_IDLE
+					&& cpu_to_hc32(fusbh200, qtd->qtd_dma)
+						== hw->hw_current) {
+				token = hc32_to_cpu(fusbh200, hw->hw_token);
+
+				/* An unlink may leave an incomplete
+				 * async transaction in the TT buffer.
+				 * We have to clear it.
+				 */
+				fusbh200_clear_tt_buffer(fusbh200, qh, urb, token);
+			}
+		}
+
+		/* unless we already know the urb's status, collect qtd status
+		 * and update count of bytes transferred.  in common short read
+		 * cases with only one data qtd (including control transfers),
+		 * queue processing won't halt.  but with two or more qtds (for
+		 * example, with a 32 KB transfer), when the first qtd gets a
+		 * short read the second must be removed by hand.
+		 */
+		if (last_status == -EINPROGRESS) {
+			last_status = qtd_copy_status(fusbh200, urb,
+					qtd->length, token);
+			if (last_status == -EREMOTEIO
+					&& (qtd->hw_alt_next
+						& FUSBH200_LIST_END(fusbh200)))
+				last_status = -EINPROGRESS;
+
+			/* As part of low/full-speed endpoint-halt processing
+			 * we must clear the TT buffer (11.17.5).
+			 */
+			if (unlikely(last_status != -EINPROGRESS &&
+					last_status != -EREMOTEIO)) {
+				/* The TT's in some hubs malfunction when they
+				 * receive this request following a STALL (they
+				 * stop sending isochronous packets).  Since a
+				 * STALL can't leave the TT buffer in a busy
+				 * state (if you believe Figures 11-48 - 11-51
+				 * in the USB 2.0 spec), we won't clear the TT
+				 * buffer in this case.  Strictly speaking this
+				 * is a violation of the spec.
+				 */
+				if (last_status != -EPIPE)
+					fusbh200_clear_tt_buffer(fusbh200, qh, urb,
+							token);
+			}
+		}
+
+		/* if we're removing something not at the queue head,
+		 * patch the hardware queue pointer.
+		 */
+		if (stopped && qtd->qtd_list.prev != &qh->qtd_list) {
+			last = list_entry (qtd->qtd_list.prev,
+					struct fusbh200_qtd, qtd_list);
+			last->hw_next = qtd->hw_next;
+		}
+
+		/* remove qtd; it's recycled after possible urb completion */
+		list_del (&qtd->qtd_list);
+		last = qtd;
+
+		/* reinit the xacterr counter for the next qtd */
+		qh->xacterrs = 0;
+	}
+
+	/* last urb's completion might still need calling */
+	if (likely (last != NULL)) {
+		fusbh200_urb_done(fusbh200, last->urb, last_status);
+		count++;
+		fusbh200_qtd_free (fusbh200, last);
+	}
+
+	/* Do we need to rescan for URBs dequeued during a giveback? */
+	if (unlikely(qh->needs_rescan)) {
+		/* If the QH is already unlinked, do the rescan now. */
+		if (state == QH_STATE_IDLE)
+			goto rescan;
+
+		/* Otherwise we have to wait until the QH is fully unlinked.
+		 * Our caller will start an unlink if qh->needs_rescan is
+		 * set.  But if an unlink has already started, nothing needs
+		 * to be done.
+		 */
+		if (state != QH_STATE_LINKED)
+			qh->needs_rescan = 0;
+	}
+
+	/* restore original state; caller must unlink or relink */
+	qh->qh_state = state;
+
+	/* be sure the hardware's done with the qh before refreshing
+	 * it after fault cleanup, or recovering from silicon wrongly
+	 * overlaying the dummy qtd (which reduces DMA chatter).
+	 */
+	if (stopped != 0 || hw->hw_qtd_next == FUSBH200_LIST_END(fusbh200)) {
+		switch (state) {
+		case QH_STATE_IDLE:
+			qh_refresh(fusbh200, qh);
+			break;
+		case QH_STATE_LINKED:
+			/* We won't refresh a QH that's linked (after the HC
+			 * stopped the queue).  That avoids a race:
+			 *  - HC reads first part of QH;
+			 *  - CPU updates that first part and the token;
+			 *  - HC reads rest of that QH, including token
+			 * Result:  HC gets an inconsistent image, and then
+			 * DMAs to/from the wrong memory (corrupting it).
+			 *
+			 * That should be rare for interrupt transfers,
+			 * except maybe high bandwidth ...
+			 */
+
+			/* Tell the caller to start an unlink */
+			qh->needs_rescan = 1;
+			break;
+		/* otherwise, unlink already started */
+		}
+	}
+
+	return count;
+}
+
+/*-------------------------------------------------------------------------*/
+
+/* high bandwidth multiplier (1 + bits 12:11), as encoded in highspeed endpoint descriptors */
+#define hb_mult(wMaxPacketSize) (1 + (((wMaxPacketSize) >> 11) & 0x03))
+/* ... and packet size (low 11 bits), for any kind of endpoint descriptor */
+#define max_packet(wMaxPacketSize) ((wMaxPacketSize) & 0x07ff)
+
+/*
+ * reverse of qh_urb_transaction:  free a list of TDs.
+ * used for cleanup after errors, before HC sees an URB's TDs.
+ *
+ * Each qtd found on @qtd_list is removed from the list and released
+ * with fusbh200_qtd_free().  @urb is accepted but is not referenced
+ * in the body.
+ */
+static void qtd_list_free (
+	struct fusbh200_hcd		*fusbh200,
+	struct urb		*urb,
+	struct list_head	*qtd_list
+) {
+	struct list_head	*entry, *temp;
+
+	list_for_each_safe (entry, temp, qtd_list) {	/* entries are deleted while walking */
+		struct fusbh200_qtd	*qtd;
+
+		qtd = list_entry (entry, struct fusbh200_qtd, qtd_list);
+		list_del (&qtd->qtd_list);
+		fusbh200_qtd_free (fusbh200, qtd);
+	}
+}
+
+/*
+ * create a list of filled qtds for this URB; won't link into qh.
+ *
+ * Qtds are appended to @head in order:  a SETUP qtd when the pipe is a
+ * control pipe, then one or more qtds covering the transfer buffer (the
+ * last may be zero length), then possibly one extra zero-length qtd
+ * (control status stage, or an OUT zero-length packet when
+ * URB_ZERO_PACKET is set and the length is a multiple of maxpacket).
+ *
+ * Returns @head on success.  Returns NULL when a qtd allocation fails;
+ * if that happens after the first qtd was queued, everything on @head
+ * is released with qtd_list_free().
+ */
+static struct list_head *
+qh_urb_transaction (
+	struct fusbh200_hcd		*fusbh200,
+	struct urb		*urb,
+	struct list_head	*head,
+	gfp_t			flags
+) {
+	struct fusbh200_qtd		*qtd, *qtd_prev;
+	dma_addr_t		buf;
+	int			len, this_sg_len, maxpacket;
+	int			is_input;
+	u32			token;
+	int			i;
+	struct scatterlist	*sg;
+
+	/*
+	 * URBs map to sequences of QTDs:  one logical transaction
+	 */
+	qtd = fusbh200_qtd_alloc (fusbh200, flags);
+	if (unlikely (!qtd))
+		return NULL;	/* nothing was added to head yet, so no cleanup */
+	list_add_tail (&qtd->qtd_list, head);
+	qtd->urb = urb;
+
+	token = QTD_STS_ACTIVE;
+	token |= (FUSBH200_TUNE_CERR << 10);
+	/* for split transactions, SplitXState initialized to zero */
+
+	len = urb->transfer_buffer_length;
+	is_input = usb_pipein (urb->pipe);
+	if (usb_pipecontrol (urb->pipe)) {
+		/* SETUP pid */
+		qtd_fill(fusbh200, qtd, urb->setup_dma,
+				sizeof (struct usb_ctrlrequest),
+				token | (2 /* "setup" */ << 8), 8);
+
+		/* ... and always at least one more pid */
+		token ^= QTD_TOGGLE;
+		qtd_prev = qtd;
+		qtd = fusbh200_qtd_alloc (fusbh200, flags);
+		if (unlikely (!qtd))
+			goto cleanup;
+		qtd->urb = urb;
+		qtd_prev->hw_next = QTD_NEXT(fusbh200, qtd->qtd_dma);
+		list_add_tail (&qtd->qtd_list, head);
+
+		/* for zero length DATA stages, STATUS is always IN */
+		if (len == 0)
+			token |= (1 /* "in" */ << 8);
+	}
+
+	/*
+	 * data transfer stage:  buffer setup
+	 * (i counts the mapped scatterlist entries still to be consumed)
+	 */
+	i = urb->num_mapped_sgs;
+	if (len > 0 && i > 0) {
+		sg = urb->sg;
+		buf = sg_dma_address(sg);
+
+		/* urb->transfer_buffer_length may be smaller than the
+		 * size of the scatterlist (or vice versa)
+		 */
+		this_sg_len = min_t(int, sg_dma_len(sg), len);
+	} else {
+		sg = NULL;
+		buf = urb->transfer_dma;
+		this_sg_len = len;
+	}
+
+	if (is_input)
+		token |= (1 /* "in" */ << 8);
+	/* else it's already initted to "out" pid (0 << 8) */
+
+	maxpacket = max_packet(usb_maxpacket(urb->dev, urb->pipe, !is_input));
+
+	/*
+	 * buffer gets wrapped in one or more qtds;
+	 * last one may be "short" (including zero len)
+	 * and may serve as a control status ack
+	 */
+	for (;;) {
+		int this_qtd_len;
+
+		this_qtd_len = qtd_fill(fusbh200, qtd, buf, this_sg_len, token,
+				maxpacket);
+		this_sg_len -= this_qtd_len;
+		len -= this_qtd_len;
+		buf += this_qtd_len;
+
+		/*
+		 * short reads advance to a "magic" dummy instead of the next
+		 * qtd ... that forces the queue to stop, for manual cleanup.
+		 * (this will usually be overridden later.)
+		 */
+		if (is_input)
+			qtd->hw_alt_next = fusbh200->async->hw->hw_alt_next;
+
+		/* qh makes control packets use qtd toggle; maybe switch it */
+		if ((maxpacket & (this_qtd_len + (maxpacket - 1))) == 0)
+			token ^= QTD_TOGGLE;
+
+		if (likely(this_sg_len <= 0)) {
+			if (--i <= 0 || len <= 0)
+				break;	/* no sg entries or no bytes left */
+			sg = sg_next(sg);
+			buf = sg_dma_address(sg);
+			this_sg_len = min_t(int, sg_dma_len(sg), len);
+		}
+
+		qtd_prev = qtd;
+		qtd = fusbh200_qtd_alloc (fusbh200, flags);
+		if (unlikely (!qtd))
+			goto cleanup;
+		qtd->urb = urb;
+		qtd_prev->hw_next = QTD_NEXT(fusbh200, qtd->qtd_dma);
+		list_add_tail (&qtd->qtd_list, head);
+	}
+
+	/*
+	 * unless the caller requires manual cleanup after short reads,
+	 * have the alt_next mechanism keep the queue running after the
+	 * last data qtd (the only one, for control and most other cases).
+	 */
+	if (likely ((urb->transfer_flags & URB_SHORT_NOT_OK) == 0
+				|| usb_pipecontrol (urb->pipe)))
+		qtd->hw_alt_next = FUSBH200_LIST_END(fusbh200);
+
+	/*
+	 * control requests may need a terminating data "status" ack;
+	 * other OUT ones may need a terminating short packet
+	 * (zero length).
+	 */
+	if (likely (urb->transfer_buffer_length != 0)) {
+		int	one_more = 0;
+
+		if (usb_pipecontrol (urb->pipe)) {
+			one_more = 1;
+			token ^= 0x0100;	/* "in" <--> "out"  */
+			token |= QTD_TOGGLE;	/* force DATA1 */
+		} else if (usb_pipeout(urb->pipe)
+				&& (urb->transfer_flags & URB_ZERO_PACKET)
+				&& !(urb->transfer_buffer_length % maxpacket)) {
+			one_more = 1;
+		}
+		if (one_more) {
+			qtd_prev = qtd;
+			qtd = fusbh200_qtd_alloc (fusbh200, flags);
+			if (unlikely (!qtd))
+				goto cleanup;
+			qtd->urb = urb;
+			qtd_prev->hw_next = QTD_NEXT(fusbh200, qtd->qtd_dma);
+			list_add_tail (&qtd->qtd_list, head);
+
+			/* never any data in such packets */
+			qtd_fill(fusbh200, qtd, 0, 0, token, 0);
+		}
+	}
+
+	/* by default, enable interrupt on urb completion (set on the last qtd) */
+	if (likely (!(urb->transfer_flags & URB_NO_INTERRUPT)))
+		qtd->hw_token |= cpu_to_hc32(fusbh200, QTD_IOC);
+	return head;
+
+cleanup:
+	qtd_list_free (fusbh200, urb, head);
+	return NULL;
+}
+
+/*-------------------------------------------------------------------------*/
+
+// Would be best to create all qh's from config descriptors,
+// when each interface/altsetting is established.  Unlink
+// any previous qh and cancel its urbs first; endpoints are
+// implicitly reset then (data toggle too).
+// That'd mean updating how usbcore talks to HCDs. (2.7?)
+
+
+/*
+ * Each QH holds a qtd list; a QH is used for everything except iso.
+ *
+ * For interrupt urbs, the scheduler must set the microframe scheduling
+ * mask(s) each time the QH gets scheduled.  For highspeed, that's
+ * just one microframe in the s-mask.  For split interrupt transactions
+ * there are additional complications: c-mask, maybe FSTNs.
+ *
+ * Returns the new QH, left in QH_STATE_IDLE with hw_info1/hw_info2
+ * built from the urb's pipe and the device speed.  Returns NULL if the
+ * allocation fails, if max_packet(maxp) exceeds 1024, or if the device
+ * speed is not low, full or high; in the last two cases the QH is
+ * released with qh_destroy() before returning.
+ */
+static struct fusbh200_qh *
+qh_make (
+	struct fusbh200_hcd		*fusbh200,
+	struct urb		*urb,
+	gfp_t			flags
+) {
+	struct fusbh200_qh		*qh = fusbh200_qh_alloc (fusbh200, flags);
+	u32			info1 = 0, info2 = 0;
+	int			is_input, type;
+	int			maxp = 0;
+	struct usb_tt		*tt = urb->dev->tt;
+	struct fusbh200_qh_hw	*hw;
+
+	if (!qh)
+		return qh;	/* allocation failed: qh is NULL here */
+
+	/*
+	 * init endpoint/device data for this QH
+	 */
+	info1 |= usb_pipeendpoint (urb->pipe) << 8;
+	info1 |= usb_pipedevice (urb->pipe) << 0;
+
+	is_input = usb_pipein (urb->pipe);
+	type = usb_pipetype (urb->pipe);
+	maxp = usb_maxpacket (urb->dev, urb->pipe, !is_input);
+
+	/* 1024 byte maxpacket is a hardware ceiling.  High bandwidth
+	 * acts like up to 3KB, but is built from smaller packets.
+	 */
+	if (max_packet(maxp) > 1024) {
+		fusbh200_dbg(fusbh200, "bogus qh maxpacket %d\n", max_packet(maxp));
+		goto done;
+	}
+
+	/* Compute interrupt scheduling parameters just once, and save.
+	 * - allowing for high bandwidth, how many nsec/uframe are used?
+	 * - split transactions need a second CSPLIT uframe; same question
+	 * - splits also need a schedule gap (for full/low speed I/O)
+	 * - qh has a polling interval
+	 *
+	 * For control/bulk requests, the HC or TT handles these.
+	 */
+	if (type == PIPE_INTERRUPT) {
+		qh->usecs = NS_TO_US(usb_calc_bus_time(USB_SPEED_HIGH,
+				is_input, 0,
+				hb_mult(maxp) * max_packet(maxp)));
+		qh->start = NO_FRAME;
+
+		if (urb->dev->speed == USB_SPEED_HIGH) {
+			qh->c_usecs = 0;
+			qh->gap_uf = 0;
+
+			qh->period = urb->interval >> 3;	/* interval is divided by 8 here */
+			if (qh->period == 0 && urb->interval != 1) {
+				/* NOTE interval 2 or 4 uframes could work.
+				 * But interval 1 scheduling is simpler, and
+				 * includes high bandwidth.
+				 */
+				urb->interval = 1;
+			} else if (qh->period > fusbh200->periodic_size) {
+				qh->period = fusbh200->periodic_size;
+				urb->interval = qh->period << 3;
+			}
+		} else {
+			int		think_time;
+
+			/* gap is f(FS/LS transfer times) */
+			qh->gap_uf = 1 + usb_calc_bus_time (urb->dev->speed,
+					is_input, 0, maxp) / (125 * 1000);
+
+			/* FIXME this just approximates SPLIT/CSPLIT times */
+			if (is_input) {		// SPLIT, gap, CSPLIT+DATA
+				qh->c_usecs = qh->usecs + HS_USECS (0);
+				qh->usecs = HS_USECS (1);
+			} else {		// SPLIT+DATA, gap, CSPLIT
+				qh->usecs += HS_USECS (1);
+				qh->c_usecs = HS_USECS (0);
+			}
+
+			think_time = tt ? tt->think_time : 0;
+			qh->tt_usecs = NS_TO_US (think_time +
+					usb_calc_bus_time (urb->dev->speed,
+					is_input, 0, max_packet (maxp)));
+			qh->period = urb->interval;
+			if (qh->period > fusbh200->periodic_size) {
+				qh->period = fusbh200->periodic_size;
+				urb->interval = qh->period;
+			}
+		}
+	}
+
+	/* support for tt scheduling, and access to toggles */
+	qh->dev = urb->dev;
+
+	/* using TT? */
+	switch (urb->dev->speed) {
+	case USB_SPEED_LOW:
+		info1 |= QH_LOW_SPEED;
+		/* FALL THROUGH */
+
+	case USB_SPEED_FULL:
+		/* EPS 0 means "full" */
+		if (type != PIPE_INTERRUPT)
+			info1 |= (FUSBH200_TUNE_RL_TT << 28);
+		if (type == PIPE_CONTROL) {
+			info1 |= QH_CONTROL_EP;		/* for TT */
+			info1 |= QH_TOGGLE_CTL;		/* toggle from qtd */
+		}
+		info1 |= maxp << 16;
+
+		info2 |= (FUSBH200_TUNE_MULT_TT << 30);
+
+		/* Some Freescale processors have an erratum in which the
+		 * port number in the queue head was 0..N-1 instead of 1..N.
+		 */
+		if (fusbh200_has_fsl_portno_bug(fusbh200))
+			info2 |= (urb->dev->ttport-1) << 23;
+		else
+			info2 |= urb->dev->ttport << 23;
+
+		/* set the address of the TT; for TDI's integrated
+		 * root hub tt, leave it zeroed.
+		 */
+		if (tt && tt->hub != fusbh200_to_hcd(fusbh200)->self.root_hub)
+			info2 |= tt->hub->devnum << 16;
+
+		/* NOTE:  if (PIPE_INTERRUPT) { scheduler sets c-mask } */
+
+		break;
+
+	case USB_SPEED_HIGH:		/* no TT involved */
+		info1 |= QH_HIGH_SPEED;
+		if (type == PIPE_CONTROL) {
+			info1 |= (FUSBH200_TUNE_RL_HS << 28);
+			info1 |= 64 << 16;	/* usb2 fixed maxpacket */
+			info1 |= QH_TOGGLE_CTL;	/* toggle from qtd */
+			info2 |= (FUSBH200_TUNE_MULT_HS << 30);
+		} else if (type == PIPE_BULK) {
+			info1 |= (FUSBH200_TUNE_RL_HS << 28);
+			/* The USB spec says that high speed bulk endpoints
+			 * always use 512 byte maxpacket.  But some device
+			 * vendors decided to ignore that, and MSFT is happy
+			 * to help them do so.  So now people expect to use
+			 * such nonconformant devices with Linux too; sigh.
+			 */
+			info1 |= max_packet(maxp) << 16;
+			info2 |= (FUSBH200_TUNE_MULT_HS << 30);
+		} else {		/* PIPE_INTERRUPT */
+			info1 |= max_packet (maxp) << 16;
+			info2 |= hb_mult (maxp) << 30;
+		}
+		break;
+	default:
+		fusbh200_dbg(fusbh200, "bogus dev %p speed %d\n", urb->dev,
+			urb->dev->speed);
+done:	/* also reached by goto from the maxpacket check above */
+		qh_destroy(fusbh200, qh);
+		return NULL;
+	}
+
+	/* NOTE:  if (PIPE_INTERRUPT) { scheduler sets s-mask } */
+
+	/* init as live, toggle clear, advance to dummy */
+	qh->qh_state = QH_STATE_IDLE;
+	hw = qh->hw;
+	hw->hw_info1 = cpu_to_hc32(fusbh200, info1);
+	hw->hw_info2 = cpu_to_hc32(fusbh200, info2);
+	qh->is_out = !is_input;
+	usb_settoggle (urb->dev, usb_pipeendpoint (urb->pipe), !is_input, 1);
+	qh_refresh (fusbh200, qh);
+	return qh;
+}
+
+/*-------------------------------------------------------------------------*/
+/*
+ * Count one more user of the async schedule (async_count).  Only the
+ * call that finds the count at zero does any work:  it clears the
+ * FUSBH200_HRTIMER_DISABLE_ASYNC bit in enabled_hrtimer_events, calls
+ * fusbh200_poll_ASS() and turns on the I/O watchdog.
+ */
+static void enable_async(struct fusbh200_hcd *fusbh200)
+{
+	if (fusbh200->async_count++)
+		return;	/* count was already non-zero */
+
+	/* Stop waiting to turn off the async schedule */
+	fusbh200->enabled_hrtimer_events &= ~BIT(FUSBH200_HRTIMER_DISABLE_ASYNC);
+
+	/* Don't start the schedule until ASS is 0 */
+	fusbh200_poll_ASS(fusbh200);
+	turn_on_io_watchdog(fusbh200);
+}
+/*
+ * Drop one user of the async schedule (async_count).  Only the call
+ * that brings the count to zero does any work:  it warns if the async
+ * list or the async_unlink list is non-empty, then calls
+ * fusbh200_poll_ASS().
+ */
+static void disable_async(struct fusbh200_hcd *fusbh200)
+{
+	if (--fusbh200->async_count)
+		return;	/* other users remain */
+
+	/* The async schedule and async_unlink list are supposed to be empty */
+	WARN_ON(fusbh200->async->qh_next.qh || fusbh200->async_unlink);
+
+	/* Don't turn off the schedule until ASS is 1 */
+	fusbh200_poll_ASS(fusbh200);
+}
+
+/*
+ * move qh (and its qtds) onto async queue; maybe enable queue.
+ *
+ * Does nothing when qh->clearing_tt is set.  Otherwise the qh is
+ * refreshed, spliced in directly after the async list head (both the
+ * software qh_next and the hardware hw_next links), marked
+ * QH_STATE_LINKED, and enable_async() is called.  Warns if the qh was
+ * not in QH_STATE_IDLE on entry.
+ */
+
+static void qh_link_async (struct fusbh200_hcd *fusbh200, struct fusbh200_qh *qh)
+{
+	__hc32		dma = QH_NEXT(fusbh200, qh->qh_dma);
+	struct fusbh200_qh	*head;
+
+	/* Don't link a QH if there's a Clear-TT-Buffer pending */
+	if (unlikely(qh->clearing_tt))
+		return;
+
+	WARN_ON(qh->qh_state != QH_STATE_IDLE);
+
+	/* clear halt and/or toggle; and maybe recover from silicon quirk */
+	qh_refresh(fusbh200, qh);
+
+	/* splice right after start */
+	head = fusbh200->async;
+	qh->qh_next = head->qh_next;
+	qh->hw->hw_next = head->hw->hw_next;
+	wmb ();	/* qh's own links are written before head is pointed at it */
+
+	head->qh_next.qh = qh;
+	head->hw->hw_next = dma;
+
+	qh->xacterrs = 0;
+	qh->qh_state = QH_STATE_LINKED;
+	/* qtd completions reported later by interrupt */
+
+	enable_async(fusbh200);
+}
+
+/*-------------------------------------------------------------------------*/
+
+/*
+ * For control/bulk/interrupt, return QH with these TDs appended.
+ * Allocates and initializes the QH if necessary.
+ * Returns null if it can't allocate a QH it needs to.
+ * If the QH has TDs (urbs) already, that's great.
+ *
+ * @ptr points at the slot holding the endpoint's QH pointer; when the
+ * slot is NULL a QH is created with qh_make(..., GFP_ATOMIC) and stored
+ * back through @ptr (NULL is stored if that fails).
+ * @epnum equal to 0 selects the control-QH address patching below.
+ * When @qtd_list is non-empty its qtds are moved onto the QH and
+ * urb->hcpriv is set to the QH.
+ */
+static struct fusbh200_qh *qh_append_tds (
+	struct fusbh200_hcd		*fusbh200,
+	struct urb		*urb,
+	struct list_head	*qtd_list,
+	int			epnum,
+	void			**ptr
+)
+{
+	struct fusbh200_qh		*qh = NULL;
+	__hc32			qh_addr_mask = cpu_to_hc32(fusbh200, 0x7f);
+
+	qh = (struct fusbh200_qh *) *ptr;
+	if (unlikely (qh == NULL)) {
+		/* can't sleep here, we have fusbh200->lock... */
+		qh = qh_make (fusbh200, urb, GFP_ATOMIC);
+		*ptr = qh;
+	}
+	if (likely (qh != NULL)) {
+		struct fusbh200_qtd	*qtd;
+
+		if (unlikely (list_empty (qtd_list)))
+			qtd = NULL;
+		else
+			qtd = list_entry (qtd_list->next, struct fusbh200_qtd,
+					qtd_list);
+
+		/* control qh may need patching ... */
+		if (unlikely (epnum == 0)) {
+
+                        /* usb_reset_device() briefly reverts to address 0 */
+                        if (usb_pipedevice (urb->pipe) == 0)
+				qh->hw->hw_info1 &= ~qh_addr_mask;
+		}
+
+		/* just one way to queue requests: swap with the dummy qtd.
+		 * only hc or qh_refresh() ever modify the overlay.
+		 */
+		if (likely (qtd != NULL)) {
+			struct fusbh200_qtd		*dummy;
+			dma_addr_t		dma;
+			__hc32			token;
+
+			/* to avoid racing the HC, use the dummy td instead of
+			 * the first td of our list (becomes new dummy).  both
+			 * tds stay deactivated until we're done, when the
+			 * HC is allowed to fetch the old dummy (4.10.2).
+			 */
+			token = qtd->hw_token;
+			qtd->hw_token = HALT_BIT(fusbh200);
+
+			dummy = qh->dummy;
+
+			dma = dummy->qtd_dma;	/* preserved across the struct copy */
+			*dummy = *qtd;
+			dummy->qtd_dma = dma;
+
+			list_del (&qtd->qtd_list);
+			list_add (&dummy->qtd_list, qtd_list);
+			list_splice_tail(qtd_list, &qh->qtd_list);
+
+			fusbh200_qtd_init(fusbh200, qtd, qtd->qtd_dma);
+			qh->dummy = qtd;
+
+			/* hc must see the new dummy at list end */
+			dma = qtd->qtd_dma;
+			qtd = list_entry (qh->qtd_list.prev,
+					struct fusbh200_qtd, qtd_list);
+			qtd->hw_next = QTD_NEXT(fusbh200, dma);
+
+			/* let the hc process these next qtds */
+			wmb ();
+			dummy->hw_token = token;	/* saved token of the first td is restored last */
+
+			urb->hcpriv = qh;
+		}
+	}
+	return qh;
+}
+
+/*-------------------------------------------------------------------------*/
+/*
+ * Queue the qtds on @qtd_list for @urb on the endpoint's QH, under
+ * fusbh200->lock, and link the QH into the async schedule when it is
+ * in QH_STATE_IDLE.
+ *
+ * Returns -ESHUTDOWN when the hardware is not accessible, the error
+ * from usb_hcd_link_urb_to_ep() when that fails, -ENOMEM when
+ * qh_append_tds() returns NULL, and otherwise the (zero) result of
+ * usb_hcd_link_urb_to_ep().  On every path that leaves qh NULL the
+ * qtds on @qtd_list are freed after the lock is dropped.
+ * @mem_flags is not referenced in the body.
+ */
+static int
+submit_async (
+	struct fusbh200_hcd		*fusbh200,
+	struct urb		*urb,
+	struct list_head	*qtd_list,
+	gfp_t			mem_flags
+) {
+	int			epnum;
+	unsigned long		flags;
+	struct fusbh200_qh		*qh = NULL;
+	int			rc;
+
+	epnum = urb->ep->desc.bEndpointAddress;
+
+#ifdef FUSBH200_URB_TRACE
+	{
+		struct fusbh200_qtd *qtd;
+		qtd = list_entry(qtd_list->next, struct fusbh200_qtd, qtd_list);
+		fusbh200_dbg(fusbh200,
+			 "%s %s urb %p ep%d%s len %d, qtd %p [qh %p]\n",
+			 __func__, urb->dev->devpath, urb,
+			 epnum & 0x0f, (epnum & USB_DIR_IN) ? "in" : "out",
+			 urb->transfer_buffer_length,
+			 qtd, urb->ep->hcpriv);
+	}
+#endif
+
+	spin_lock_irqsave (&fusbh200->lock, flags);
+	if (unlikely(!HCD_HW_ACCESSIBLE(fusbh200_to_hcd(fusbh200)))) {
+		rc = -ESHUTDOWN;
+		goto done;
+	}
+	rc = usb_hcd_link_urb_to_ep(fusbh200_to_hcd(fusbh200), urb);
+	if (unlikely(rc))
+		goto done;
+
+	qh = qh_append_tds(fusbh200, urb, qtd_list, epnum, &urb->ep->hcpriv);
+	if (unlikely(qh == NULL)) {
+		usb_hcd_unlink_urb_from_ep(fusbh200_to_hcd(fusbh200), urb);
+		rc = -ENOMEM;
+		goto done;
+	}
+
+	/* Control/bulk operations through TTs don't need scheduling,
+	 * the HC and TT handle it when the TT has a buffer ready.
+	 */
+	if (likely (qh->qh_state == QH_STATE_IDLE))
+		qh_link_async(fusbh200, qh);
+ done:
+	spin_unlock_irqrestore (&fusbh200->lock, flags);
+	if (unlikely (qh == NULL))	/* qtds were never handed to a qh */
+		qtd_list_free (fusbh200, urb, qtd_list);
+	return rc;
+}
+
+/*-------------------------------------------------------------------------*/
+/*
+ * Mark @qh QH_STATE_UNLINK, append it to the async_unlink list, and
+ * take it out of the async schedule by copying its hardware and
+ * software "next" links into its predecessor.  qh_scan_next is
+ * advanced when it pointed at @qh.
+ *
+ * The predecessor search has no end-of-list check, so it relies on
+ * @qh actually being on the async list.
+ */
+static void single_unlink_async(struct fusbh200_hcd *fusbh200, struct fusbh200_qh *qh)
+{
+	struct fusbh200_qh		*prev;
+
+	/* Add to the end of the list of QHs waiting for the next IAAD */
+	qh->qh_state = QH_STATE_UNLINK;
+	if (fusbh200->async_unlink)
+		fusbh200->async_unlink_last->unlink_next = qh;
+	else
+		fusbh200->async_unlink = qh;
+	fusbh200->async_unlink_last = qh;
+
+	/* Unlink it from the schedule */
+	prev = fusbh200->async;
+	while (prev->qh_next.qh != qh)
+		prev = prev->qh_next.qh;
+
+	prev->hw->hw_next = qh->hw->hw_next;
+	prev->qh_next = qh->qh_next;
+	if (fusbh200->qh_scan_next == qh)
+		fusbh200->qh_scan_next = qh->qh_next.qh;
+}
+/*
+ * Move every QH waiting on async_unlink to async_iaa and either
+ * finish the unlink immediately (controller below RH_RUNNING) or set
+ * CMD_IAAD in the command register and arm the IAA watchdog event
+ * (controller in RH_RUNNING).
+ *
+ * @nested: when true, end_unlink_async() is not called from here.
+ */
+static void start_iaa_cycle(struct fusbh200_hcd *fusbh200, bool nested)
+{
+	/*
+	 * Do nothing if an IAA cycle is already running or
+	 * if one will be started shortly.
+	 */
+	if (fusbh200->async_iaa || fusbh200->async_unlinking)
+		return;
+
+	/* Do all the waiting QHs at once */
+	fusbh200->async_iaa = fusbh200->async_unlink;
+	fusbh200->async_unlink = NULL;
+
+	/* If the controller isn't running, we don't have to wait for it */
+	if (unlikely(fusbh200->rh_state < FUSBH200_RH_RUNNING)) {
+		if (!nested)		/* Avoid recursion */
+			end_unlink_async(fusbh200);
+
+	/* Otherwise start a new IAA cycle */
+	} else if (likely(fusbh200->rh_state == FUSBH200_RH_RUNNING)) {
+		/* Make sure the unlinks are all visible to the hardware */
+		wmb();
+
+		fusbh200_writel(fusbh200, fusbh200->command | CMD_IAAD,
+				&fusbh200->regs->command);
+		fusbh200_readl(fusbh200, &fusbh200->regs->command);	/* result is discarded */
+		fusbh200_enable_event(fusbh200, FUSBH200_HRTIMER_IAA_WATCHDOG, true);
+	}
+}
+
+/*
+ * the async qh for the qtds being unlinked are now gone from the HC
+ *
+ * Each QH on async_iaa is set to QH_STATE_IDLE and passed to
+ * qh_completions(); it is relinked if it still has qtds and the
+ * controller is in RH_RUNNING, and disable_async() is called once per
+ * QH.  async_unlinking is true while this loop runs.  Afterwards, if
+ * more QHs are waiting on async_unlink, another IAA cycle is started;
+ * when the controller is below RH_RUNNING the loop is simply repeated.
+ */
+
+static void end_unlink_async(struct fusbh200_hcd *fusbh200)
+{
+	struct fusbh200_qh		*qh;
+
+	/* Process the idle QHs */
+ restart:
+	fusbh200->async_unlinking = true;
+	while (fusbh200->async_iaa) {
+		qh = fusbh200->async_iaa;
+		fusbh200->async_iaa = qh->unlink_next;
+		qh->unlink_next = NULL;
+
+		qh->qh_state = QH_STATE_IDLE;
+		qh->qh_next.qh = NULL;
+
+		qh_completions(fusbh200, qh);
+		if (!list_empty(&qh->qtd_list) &&
+				fusbh200->rh_state == FUSBH200_RH_RUNNING)
+			qh_link_async(fusbh200, qh);
+		disable_async(fusbh200);
+	}
+	fusbh200->async_unlinking = false;
+
+	/* Start a new IAA cycle if any QHs are waiting for it */
+	if (fusbh200->async_unlink) {
+		start_iaa_cycle(fusbh200, true);	/* nested: no recursive call back here */
+		if (unlikely(fusbh200->rh_state < FUSBH200_RH_RUNNING))
+			goto restart;
+	}
+}
+/*
+ * Walk the async schedule and unlink QHs that are linked and have an
+ * empty qtd list.  While the controller is at or above RH_RUNNING, a
+ * QH whose unlink_cycle equals the current async_unlink_cycle is left
+ * alone for now; in that case the ASYNC_UNLINKS hrtimer event is
+ * enabled and async_unlink_cycle is incremented so it is looked at
+ * again later.
+ */
+static void unlink_empty_async(struct fusbh200_hcd *fusbh200)
+{
+	struct fusbh200_qh		*qh, *next;
+	bool			stopped = (fusbh200->rh_state < FUSBH200_RH_RUNNING);
+	bool			check_unlinks_later = false;
+
+	/* Unlink all the async QHs that have been empty for a timer cycle */
+	next = fusbh200->async->qh_next.qh;
+	while (next) {
+		qh = next;
+		next = qh->qh_next.qh;	/* fetched before qh may be unlinked */
+
+		if (list_empty(&qh->qtd_list) &&
+				qh->qh_state == QH_STATE_LINKED) {
+			if (!stopped && qh->unlink_cycle ==
+					fusbh200->async_unlink_cycle)
+				check_unlinks_later = true;
+			else
+				single_unlink_async(fusbh200, qh);
+		}
+	}
+
+	/* Start a new IAA cycle if any QHs are waiting for it */
+	if (fusbh200->async_unlink)
+		start_iaa_cycle(fusbh200, false);
+
+	/* QHs that haven't been empty for long enough will be handled later */
+	if (check_unlinks_later) {
+		fusbh200_enable_event(fusbh200, FUSBH200_HRTIMER_ASYNC_UNLINKS, true);
+		++fusbh200->async_unlink_cycle;
+	}
+}
+
+/* makes sure the async qh will become idle */
+/* caller must own fusbh200->lock */
+/* a qh that is not QH_STATE_LINKED is left alone, except that
+ * needs_rescan is set when it is in QH_STATE_COMPLETING
+ */
+
+static void start_unlink_async(struct fusbh200_hcd *fusbh200, struct fusbh200_qh *qh)
+{
+	/*
+	 * If the QH isn't linked then there's nothing we can do
+	 * unless we were called during a giveback, in which case
+	 * qh_completions() has to deal with it.
+	 */
+	if (qh->qh_state != QH_STATE_LINKED) {
+		if (qh->qh_state == QH_STATE_COMPLETING)
+			qh->needs_rescan = 1;
+		return;
+	}
+
+	single_unlink_async(fusbh200, qh);
+	start_iaa_cycle(fusbh200, false);
+}
+
+/*-------------------------------------------------------------------------*/
+/*
+ * Walk the async schedule and call qh_completions() on every QH that
+ * has qtds queued.  After each call:  a QH with needs_rescan set is
+ * handed to start_unlink_async(); a QH that became empty while still
+ * linked records the current async_unlink_cycle for a later unlink
+ * check; otherwise the same QH is scanned again when qh_completions()
+ * returned non-zero.
+ */
+static void scan_async (struct fusbh200_hcd *fusbh200)
+{
+	struct fusbh200_qh		*qh;
+	bool			check_unlinks_later = false;
+
+	fusbh200->qh_scan_next = fusbh200->async->qh_next.qh;
+	while (fusbh200->qh_scan_next) {
+		qh = fusbh200->qh_scan_next;
+		fusbh200->qh_scan_next = qh->qh_next.qh;
+ rescan:
+		/* clean any finished work for this qh */
+		if (!list_empty(&qh->qtd_list)) {
+			int temp;
+
+			/*
+			 * Unlinks could happen here; completion reporting
+			 * drops the lock.  That's why fusbh200->qh_scan_next
+			 * always holds the next qh to scan; if the next qh
+			 * gets unlinked then fusbh200->qh_scan_next is adjusted
+			 * in single_unlink_async().
+			 */
+			temp = qh_completions(fusbh200, qh);
+			if (qh->needs_rescan) {
+				start_unlink_async(fusbh200, qh);
+			} else if (list_empty(&qh->qtd_list)
+					&& qh->qh_state == QH_STATE_LINKED) {
+				qh->unlink_cycle = fusbh200->async_unlink_cycle;
+				check_unlinks_later = true;
+			} else if (temp != 0)
+				goto rescan;
+		}
+	}
+
+	/*
+	 * Unlink empty entries, reducing DMA usage as well
+	 * as HCD schedule-scanning costs.  Delay for any qh
+	 * we just scanned, there's a not-unusual case that it
+	 * doesn't stay idle for long.
+	 * (Skipped when the ASYNC_UNLINKS event is already enabled.)
+	 */
+	if (check_unlinks_later && fusbh200->rh_state == FUSBH200_RH_RUNNING &&
+			!(fusbh200->enabled_hrtimer_events &
+				BIT(FUSBH200_HRTIMER_ASYNC_UNLINKS))) {
+		fusbh200_enable_event(fusbh200, FUSBH200_HRTIMER_ASYNC_UNLINKS, true);
+		++fusbh200->async_unlink_cycle;
+	}
+}
+/*-------------------------------------------------------------------------*/
+/*
+ * EHCI scheduled transaction support:  interrupt, iso, split iso
+ * These are called "periodic" transactions in the EHCI spec.
+ *
+ * Note that for interrupt transfers, the QH/QTD manipulation is shared
+ * with the "asynchronous" transaction support (control/bulk transfers).
+ * The only real difference is in how interrupt transfers are scheduled.
+ *
+ * For ISO, we make an "iso_stream" head to serve the same role as a QH.
+ * It keeps track of every ITD (or SITD) that's linked, and holds enough
+ * pre-calculated schedule data to make appending to the queue be quick.
+ */
+
+static int fusbh200_get_frame (struct usb_hcd *hcd);
+
+/*-------------------------------------------------------------------------*/
+
+/*
+ * periodic_next_shadow - return "next" pointer on shadow list
+ * @periodic: host pointer to qh/itd
+ * @tag: hardware tag for type of this record
+ *
+ * Returns the address of the entry's own shadow link:  qh_next for
+ * Q_TYPE_QH, fstn_next for Q_TYPE_FSTN, and itd_next for any other tag.
+ */
+static union fusbh200_shadow *
+periodic_next_shadow(struct fusbh200_hcd *fusbh200, union fusbh200_shadow *periodic,
+		__hc32 tag)
+{
+	switch (hc32_to_cpu(fusbh200, tag)) {
+	case Q_TYPE_QH:
+		return &periodic->qh->qh_next;
+	case Q_TYPE_FSTN:
+		return &periodic->fstn->fstn_next;
+	default:
+		return &periodic->itd->itd_next;
+	}
+}
+/*
+ * shadow_next_periodic - return pointer to an entry's hardware "next" link
+ * @periodic: host pointer to qh/itd
+ * @tag: hardware tag for type of this record
+ *
+ * For Q_TYPE_QH the link lives in the separate hardware part
+ * (qh->hw->hw_next); for every other tag the union's hw_next member
+ * is returned as-is.
+ */
+static __hc32 *
+shadow_next_periodic(struct fusbh200_hcd *fusbh200, union fusbh200_shadow *periodic,
+		__hc32 tag)
+{
+	switch (hc32_to_cpu(fusbh200, tag)) {
+	/* our fusbh200_shadow.qh is actually software part */
+	case Q_TYPE_QH:
+		return &periodic->qh->hw->hw_next;
+	/* others are hw parts */
+	default:
+		return periodic->hw_next;
+	}
+}
+
+/*
+ * caller must hold fusbh200->lock
+ *
+ * Remove the entry @ptr from the periodic list of slot @frame, in both
+ * the shadow (pshadow) and hardware (periodic) lists.  Returns without
+ * changes when @ptr is not found on that slot's list.
+ */
+static void periodic_unlink (struct fusbh200_hcd *fusbh200, unsigned frame, void *ptr)
+{
+	union fusbh200_shadow	*prev_p = &fusbh200->pshadow[frame];
+	__hc32			*hw_p = &fusbh200->periodic[frame];
+	union fusbh200_shadow	here = *prev_p;
+
+	/* find predecessor of "ptr"; hw and shadow lists are in sync */
+	while (here.ptr && here.ptr != ptr) {
+		prev_p = periodic_next_shadow(fusbh200, prev_p,
+				Q_NEXT_TYPE(fusbh200, *hw_p));
+		hw_p = shadow_next_periodic(fusbh200, &here,
+				Q_NEXT_TYPE(fusbh200, *hw_p));
+		here = *prev_p;
+	}
+	/* an interrupt entry (at list end) could have been shared */
+	if (!here.ptr)
+		return;
+
+	/* update shadow and hardware lists ... the old "next" pointers
+	 * from ptr may still be in use, the caller updates them.
+	 */
+	*prev_p = *periodic_next_shadow(fusbh200, &here,
+			Q_NEXT_TYPE(fusbh200, *hw_p));
+
+	*hw_p = *shadow_next_periodic(fusbh200, &here,
+				Q_NEXT_TYPE(fusbh200, *hw_p));
+}
+
+/*
+ * how many of the uframe's 125 usecs are allocated?
+ *
+ * Walks the list for slot @frame and sums:  qh->usecs for QHs with
+ * @uframe set in the S-mask, qh->c_usecs for QHs with @uframe set in
+ * the C-mask, and stream->usecs for ITDs whose hw_transaction[@uframe]
+ * is non-zero.  Entries of any other type are handled as FSTNs and add
+ * nothing.  The sum is accumulated as unsigned and returned as
+ * unsigned short.
+ */
+static unsigned short
+periodic_usecs (struct fusbh200_hcd *fusbh200, unsigned frame, unsigned uframe)
+{
+	__hc32			*hw_p = &fusbh200->periodic [frame];
+	union fusbh200_shadow	*q = &fusbh200->pshadow [frame];
+	unsigned		usecs = 0;
+	struct fusbh200_qh_hw	*hw;
+
+	while (q->ptr) {
+		switch (hc32_to_cpu(fusbh200, Q_NEXT_TYPE(fusbh200, *hw_p))) {
+		case Q_TYPE_QH:
+			hw = q->qh->hw;
+			/* is it in the S-mask? */
+			if (hw->hw_info2 & cpu_to_hc32(fusbh200, 1 << uframe))
+				usecs += q->qh->usecs;
+			/* ... or C-mask? */
+			if (hw->hw_info2 & cpu_to_hc32(fusbh200,
+					1 << (8 + uframe)))
+				usecs += q->qh->c_usecs;
+			hw_p = &hw->hw_next;
+			q = &q->qh->qh_next;
+			break;
+		// case Q_TYPE_FSTN:
+		default:
+			/* for "save place" FSTNs, count the relevant INTR
+			 * bandwidth from the previous frame
+			 * (not implemented here: only a debug message is logged)
+			 */
+			if (q->fstn->hw_prev != FUSBH200_LIST_END(fusbh200)) {
+				fusbh200_dbg (fusbh200, "ignoring FSTN cost ...\n");
+			}
+			hw_p = &q->fstn->hw_next;
+			q = &q->fstn->fstn_next;
+			break;
+		case Q_TYPE_ITD:
+			if (q->itd->hw_transaction[uframe])
+				usecs += q->itd->stream->usecs;
+			hw_p = &q->itd->hw_next;
+			q = &q->itd->itd_next;
+			break;
+		}
+	}
+#ifdef	DEBUG
+	if (usecs > fusbh200->uframe_periodic_max)
+		fusbh200_err (fusbh200, "uframe %d sched overrun: %d usecs\n",
+			frame * 8 + uframe, usecs);
+#endif
+	return usecs;
+}
+
+/*-------------------------------------------------------------------------*/
+/*
+ * Return 1 when @dev1 and @dev2 sit behind the same transaction
+ * translator, else 0.  Both must have a tt and it must be the same
+ * one; when that tt has ->multi set, the ttport values must match too.
+ */
+static int same_tt (struct usb_device *dev1, struct usb_device *dev2)
+{
+	if (!dev1->tt || !dev2->tt)
+		return 0;
+	if (dev1->tt != dev2->tt)
+		return 0;
+	if (dev1->tt->multi)
+		return dev1->ttport == dev2->ttport;
+	else
+		return 1;
+}
+
+/* return true iff the device's transaction translator is available
+ * for a periodic transfer starting at the specified frame, using
+ * all the uframes in the mask.
+ *
+ * Slots @frame, @frame + @period, ... below periodic_size are checked.
+ * Returns 0 when @period is 0, when a QH on the same TT (see same_tt)
+ * has S-mask or C-mask bits overlapping @uf_mask, or when an entry that
+ * is neither an ITD nor a QH is found; returns 1 otherwise.
+ */
+static int tt_no_collision (
+	struct fusbh200_hcd		*fusbh200,
+	unsigned		period,
+	struct usb_device	*dev,
+	unsigned		frame,
+	u32			uf_mask
+)
+{
+	if (period == 0)	/* error */
+		return 0;
+
+	/* note bandwidth wastage:  split never follows csplit
+	 * (different dev or endpoint) until the next uframe.
+	 * calling convention doesn't make that distinction.
+	 */
+	for (; frame < fusbh200->periodic_size; frame += period) {
+		union fusbh200_shadow	here;
+		__hc32			type;
+		struct fusbh200_qh_hw	*hw;
+
+		here = fusbh200->pshadow [frame];
+		type = Q_NEXT_TYPE(fusbh200, fusbh200->periodic [frame]);
+		while (here.ptr) {
+			switch (hc32_to_cpu(fusbh200, type)) {
+			case Q_TYPE_ITD:
+				type = Q_NEXT_TYPE(fusbh200, here.itd->hw_next);
+				here = here.itd->itd_next;
+				continue;	/* ITDs are skipped without any check */
+			case Q_TYPE_QH:
+				hw = here.qh->hw;
+				if (same_tt (dev, here.qh->dev)) {
+					u32		mask;
+
+					mask = hc32_to_cpu(fusbh200,
+							hw->hw_info2);
+					/* "knows" no gap is needed */
+					mask |= mask >> 8;
+					if (mask & uf_mask)
+						break;	/* leaves the switch; falls to return 0 */
+				}
+				type = Q_NEXT_TYPE(fusbh200, hw->hw_next);
+				here = here.qh->qh_next;
+				continue;
+			// case Q_TYPE_FSTN:
+			default:
+				fusbh200_dbg (fusbh200,
+					"periodic frame %d bogus type %d\n",
+					frame, type);
+			}
+
+			/* collision or error */
+			return 0;
+		}
+	}
+
+	/* no collision */
+	return 1;
+}
+
+/*-------------------------------------------------------------------------*/
+/*
+ * Count one more user of the periodic schedule (periodic_count).  Only
+ * the call that finds the count at zero does any work:  it clears the
+ * FUSBH200_HRTIMER_DISABLE_PERIODIC bit in enabled_hrtimer_events,
+ * calls fusbh200_poll_PSS() and turns on the I/O watchdog.
+ */
+static void enable_periodic(struct fusbh200_hcd *fusbh200)
+{
+	if (fusbh200->periodic_count++)
+		return;	/* count was already non-zero */
+
+	/* Stop waiting to turn off the periodic schedule */
+	fusbh200->enabled_hrtimer_events &= ~BIT(FUSBH200_HRTIMER_DISABLE_PERIODIC);
+
+	/* Don't start the schedule until PSS is 0 */
+	fusbh200_poll_PSS(fusbh200);
+	turn_on_io_watchdog(fusbh200);
+}
+/*
+ * Drop one user of the periodic schedule (periodic_count); the call
+ * that brings the count to zero calls fusbh200_poll_PSS().
+ */
+static void disable_periodic(struct fusbh200_hcd *fusbh200)
+{
+	if (--fusbh200->periodic_count)
+		return;	/* other users remain */
+
+	/* Don't turn off the schedule until PSS is 1 */
+	fusbh200_poll_PSS(fusbh200);
+}
+
+/*-------------------------------------------------------------------------*/
+
+/* periodic schedule slots have iso tds (normal or split) first, then a
+ * sparse tree for active interrupt transfers.
+ *
+ * this just links in a qh; caller guarantees uframe masks are set right.
+ * no FSTN support (yet; fusbh200 0.96+)
+ *
+ * The qh is inserted into every slot qh->start, qh->start + period, ...
+ * below periodic_size (a period of 0 is treated as 1), then marked
+ * QH_STATE_LINKED, accounted in bandwidth_allocated, added to
+ * intr_qh_list, and enable_periodic() is called.
+ */
+static void qh_link_periodic(struct fusbh200_hcd *fusbh200, struct fusbh200_qh *qh)
+{
+	unsigned	i;
+	unsigned	period = qh->period;
+
+	dev_dbg (&qh->dev->dev,
+		"link qh%d-%04x/%p start %d [%d/%d us]\n",
+		period, hc32_to_cpup(fusbh200, &qh->hw->hw_info2)
+			& (QH_CMASK | QH_SMASK),
+		qh, qh->start, qh->usecs, qh->c_usecs);
+
+	/* high bandwidth, or otherwise every microframe */
+	if (period == 0)
+		period = 1;
+
+	for (i = qh->start; i < fusbh200->periodic_size; i += period) {
+		union fusbh200_shadow	*prev = &fusbh200->pshadow[i];
+		__hc32			*hw_p = &fusbh200->periodic[i];
+		union fusbh200_shadow	here = *prev;
+		__hc32			type = 0;
+
+		/* skip the iso nodes at list head */
+		while (here.ptr) {
+			type = Q_NEXT_TYPE(fusbh200, *hw_p);
+			if (type == cpu_to_hc32(fusbh200, Q_TYPE_QH))
+				break;
+			prev = periodic_next_shadow(fusbh200, prev, type);
+			hw_p = shadow_next_periodic(fusbh200, &here, type);
+			here = *prev;
+		}
+
+		/* sorting each branch by period (slow-->fast)
+		 * enables sharing interior tree nodes
+		 */
+		while (here.ptr && qh != here.qh) {
+			if (qh->period > here.qh->period)
+				break;
+			prev = &here.qh->qh_next;
+			hw_p = &here.qh->hw->hw_next;
+			here = *prev;
+		}
+		/* link in this qh, unless some earlier pass did that */
+		if (qh != here.qh) {
+			qh->qh_next = here;
+			if (here.qh)
+				qh->hw->hw_next = *hw_p;
+			wmb ();	/* qh's links are written before it becomes reachable */
+			prev->qh = qh;
+			*hw_p = QH_NEXT (fusbh200, qh->qh_dma);
+		}
+	}
+	qh->qh_state = QH_STATE_LINKED;
+	qh->xacterrs = 0;
+
+	/* update per-qh bandwidth for usbfs */
+	fusbh200_to_hcd(fusbh200)->self.bandwidth_allocated += qh->period
+		? ((qh->usecs + qh->c_usecs) / qh->period)
+		: (qh->usecs * 8);
+
+	list_add(&qh->intr_node, &fusbh200->intr_qh_list);
+
+	/* maybe enable periodic schedule processing */
+	++fusbh200->intr_count;
+	enable_periodic(fusbh200);
+}
+/*
+ * Remove qh from every periodic frame slot it may occupy (qh->start,
+ * qh->start + period, ... below periodic_size; a period of 0 counts as 1),
+ * subtract its usbfs bandwidth figure, mark it QH_STATE_UNLINK, clear its
+ * shadow next pointer and take it off the intr_node list.
+ */
+static void qh_unlink_periodic(struct fusbh200_hcd *fusbh200, struct fusbh200_qh *qh)
+{
+	unsigned	i;
+	unsigned	period;
+
+	/*
+	 * If qh is for a low/full-speed device, simply unlinking it
+	 * could interfere with an ongoing split transaction.  To unlink
+	 * it safely would require setting the QH_INACTIVATE bit and
+	 * waiting at least one frame, as described in EHCI 4.12.2.5.
+	 *
+	 * We won't bother with any of this.  Instead, we assume that the
+	 * only reason for unlinking an interrupt QH while the current URB
+	 * is still active is to dequeue all the URBs (flush the whole
+	 * endpoint queue).
+	 *
+	 * If rebalancing the periodic schedule is ever implemented, this
+	 * approach will no longer be valid.
+	 */
+
+	/* high bandwidth, or otherwise part of every microframe */
+	if ((period = qh->period) == 0)
+		period = 1;
+
+	for (i = qh->start; i < fusbh200->periodic_size; i += period)
+		periodic_unlink (fusbh200, i, qh);
+
+	/* update per-qh bandwidth for usbfs (mirrors the amount added in
+	 * qh_link_periodic())
+	 */
+	fusbh200_to_hcd(fusbh200)->self.bandwidth_allocated -= qh->period
+		? ((qh->usecs + qh->c_usecs) / qh->period)
+		: (qh->usecs * 8);
+
+	dev_dbg (&qh->dev->dev,
+		"unlink qh%d-%04x/%p start %d [%d/%d us]\n",
+		qh->period,
+		hc32_to_cpup(fusbh200, &qh->hw->hw_info2) & (QH_CMASK | QH_SMASK),
+		qh, qh->start, qh->usecs, qh->c_usecs);
+
+	/* qh->qh_next still "live" to HC */
+	qh->qh_state = QH_STATE_UNLINK;
+	qh->qh_next.ptr = NULL;
+
+	if (fusbh200->qh_scan_next == qh)	/* scan_intr()'s cursor must
+						 * not be left on this qh */
+		fusbh200->qh_scan_next = list_entry(qh->intr_node.next,
+				struct fusbh200_qh, intr_node);
+	list_del(&qh->intr_node);
+}
+/*
+ * Begin unlinking an interrupt qh: remove it from the periodic schedule,
+ * stamp it with the current intr_unlink_cycle and append it to the
+ * intr_unlink list.  The list is then either processed right away (root
+ * hub state below FUSBH200_RH_RUNNING) or, when qh became the head of the
+ * list, the FUSBH200_HRTIMER_UNLINK_INTR event is enabled.
+ * Nothing is unlinked if the qh is not in QH_STATE_LINKED.
+ */
+static void start_unlink_intr(struct fusbh200_hcd *fusbh200, struct fusbh200_qh *qh)
+{
+	/* If the QH isn't linked then there's nothing we can do
+	 * unless we were called during a giveback, in which case
+	 * qh_completions() has to deal with it.
+	 */
+	if (qh->qh_state != QH_STATE_LINKED) {
+		if (qh->qh_state == QH_STATE_COMPLETING)
+			qh->needs_rescan = 1;
+		return;
+	}
+
+	qh_unlink_periodic (fusbh200, qh);
+
+	/* Make sure the unlinks are visible before starting the timer */
+	wmb();
+
+	/*
+	 * The EHCI spec doesn't say how long it takes the controller to
+	 * stop accessing an unlinked interrupt QH.  The timer delay is
+	 * 9 uframes; presumably that will be long enough.
+	 */
+	qh->unlink_cycle = fusbh200->intr_unlink_cycle;
+
+	/* New entries go at the end of the intr_unlink list */
+	if (fusbh200->intr_unlink)
+		fusbh200->intr_unlink_last->unlink_next = qh;
+	else
+		fusbh200->intr_unlink = qh;
+	fusbh200->intr_unlink_last = qh;
+
+	if (fusbh200->intr_unlinking)
+		;	/* Avoid recursive calls */
+	else if (fusbh200->rh_state < FUSBH200_RH_RUNNING)
+		fusbh200_handle_intr_unlinks(fusbh200);	/* no timer: handle now */
+	else if (fusbh200->intr_unlink == qh) {	/* qh is the list head */
+		fusbh200_enable_event(fusbh200, FUSBH200_HRTIMER_UNLINK_INTR, true);
+		++fusbh200->intr_unlink_cycle;
+	}
+}
+/*
+ * Finish unlinking an interrupt qh: mark it QH_STATE_IDLE, terminate its
+ * hw_next, run qh_completions(), and put it back into the schedule via
+ * qh_schedule() if qtds are still queued and the root hub is in
+ * FUSBH200_RH_RUNNING.  Finally drops intr_count and calls
+ * disable_periodic().
+ */
+static void end_unlink_intr(struct fusbh200_hcd *fusbh200, struct fusbh200_qh *qh)
+{
+	struct fusbh200_qh_hw	*hw = qh->hw;
+	int			rc;
+
+	qh->qh_state = QH_STATE_IDLE;
+	hw->hw_next = FUSBH200_LIST_END(fusbh200);
+
+	qh_completions(fusbh200, qh);
+
+	/* reschedule QH iff another request is queued */
+	if (!list_empty(&qh->qtd_list) && fusbh200->rh_state == FUSBH200_RH_RUNNING) {
+		rc = qh_schedule(fusbh200, qh);
+
+		/* An error here likely indicates handshake failure
+		 * or no space left in the schedule.  Neither fault
+		 * should happen often ...
+		 *
+		 * FIXME kill the now-dysfunctional queued urbs
+		 *
+		 * (for now the failure is only logged)
+		 */
+		if (rc != 0)
+			fusbh200_err(fusbh200, "can't reschedule qh %p, err %d\n",
+					qh, rc);
+	}
+
+	/* maybe turn off periodic schedule */
+	--fusbh200->intr_count;
+	disable_periodic(fusbh200);
+}
+
+/*-------------------------------------------------------------------------*/
+
+/*
+ * Check whether "usecs" more microseconds of periodic bandwidth fit into
+ * the schedule.
+ *
+ * @fusbh200: controller whose periodic schedule is inspected
+ * @frame: first frame to check
+ * @uframe: microframe within each frame to check (ignored for period 0)
+ * @period: spacing in frames between checked slots; 0 means the transfer
+ *	runs in every microframe of every frame
+ * @usecs: additional time wanted in each checked microframe
+ *
+ * Returns 1 if every checked microframe can take the extra load without
+ * exceeding uframe_periodic_max, 0 otherwise (including uframe >= 8).
+ */
+static int check_period (
+	struct fusbh200_hcd *fusbh200,
+	unsigned	frame,
+	unsigned	uframe,
+	unsigned	period,
+	unsigned	usecs
+) {
+	int		claimed;
+
+	/* complete split running into next frame?
+	 * given FSTN support, we could sometimes check...
+	 */
+	if (uframe >= 8)
+		return 0;
+
+	/* convert "usecs we need" to "max already claimed" */
+	usecs = fusbh200->uframe_periodic_max - usecs;
+
+	/* we "know" 2 and 4 uframe intervals were rejected; so
+	 * for period 0, check _every_ microframe in the schedule.
+	 */
+	if (unlikely (period == 0)) {
+		do {
+			/* a frame holds 8 microframes (0..7); the bound used
+			 * to be 7, which left microframe 7 unchecked
+			 */
+			for (uframe = 0; uframe < 8; uframe++) {
+				claimed = periodic_usecs (fusbh200, frame, uframe);
+				if (claimed > usecs)
+					return 0;
+			}
+		} while ((frame += 1) < fusbh200->periodic_size);
+
+	/* just check the specified uframe, at that period */
+	} else {
+		do {
+			claimed = periodic_usecs (fusbh200, frame, uframe);
+			if (claimed > usecs)
+				return 0;
+		} while ((frame += period) < fusbh200->periodic_size);
+	}
+
+	// success!
+	return 1;
+}
+
+/*
+ * Decide whether qh can be scheduled starting at (frame, uframe).
+ *
+ * Returns 0 when the slot works and stores the complete-split mask to use
+ * through c_maskp (0 when the qh has no c_usecs); returns -ENOSPC when
+ * the slot is unusable.  Note that *c_maskp may already have been written
+ * when a later check fails.
+ */
+static int check_intr_schedule (
+	struct fusbh200_hcd		*fusbh200,
+	unsigned		frame,
+	unsigned		uframe,
+	const struct fusbh200_qh	*qh,
+	__hc32			*c_maskp
+)
+{
+	u8		uf_mask;
+
+	/* FSTN territory? */
+	if (qh->c_usecs && uframe >= 6)
+		return -ENOSPC;
+
+	/* room for the start-split / plain transaction? */
+	if (!check_period (fusbh200, frame, uframe, qh->period, qh->usecs))
+		return -ENOSPC;
+
+	/* nothing more to verify when there is no complete-split part */
+	if (!qh->c_usecs) {
+		*c_maskp = 0;
+		return 0;
+	}
+
+	/* Make sure this tt's buffer is also available for CSPLITs.
+	 * We pessimize a bit; probably the typical full speed case
+	 * doesn't need the second CSPLIT.
+	 *
+	 * NOTE:  both SPLIT and CSPLIT could be checked in just
+	 * one smart pass...
+	 */
+	uf_mask = 0x03 << (uframe + qh->gap_uf);
+	*c_maskp = cpu_to_hc32(fusbh200, uf_mask << 8);
+	uf_mask |= 1 << uframe;
+
+	if (!tt_no_collision (fusbh200, qh->period, qh->dev, frame, uf_mask))
+		return -ENOSPC;
+
+	if (!check_period (fusbh200, frame, uframe + qh->gap_uf + 1,
+				qh->period, qh->c_usecs))
+		return -ENOSPC;
+
+	if (!check_period (fusbh200, frame, uframe + qh->gap_uf,
+				qh->period, qh->c_usecs))
+		return -ENOSPC;
+
+	return 0;
+}
+
+/* "first fit" scheduling policy used the first time through,
+ * or when the previous schedule slot can't be re-used.
+ *
+ * Returns 0 after linking qh into the periodic schedule with
+ * qh_link_periodic().  When no usable slot is found the nonzero status
+ * (-ENOSPC) is returned and the qh is left unlinked.
+ */
+static int qh_schedule(struct fusbh200_hcd *fusbh200, struct fusbh200_qh *qh)
+{
+	int		status;
+	unsigned	uframe;
+	__hc32		c_mask;
+	unsigned	frame;		/* 0..(qh->period - 1), or NO_FRAME */
+	struct fusbh200_qh_hw	*hw = qh->hw;
+
+	qh_refresh(fusbh200, qh);
+	hw->hw_next = FUSBH200_LIST_END(fusbh200);
+	frame = qh->start;
+
+	/* reuse the previous schedule slots, if we can.
+	 * ffs() numbers bits from 1, hence the --uframe below.
+	 */
+	if (frame < qh->period) {
+		uframe = ffs(hc32_to_cpup(fusbh200, &hw->hw_info2) & QH_SMASK);
+		status = check_intr_schedule (fusbh200, frame, --uframe,
+				qh, &c_mask);
+	} else {
+		uframe = 0;
+		c_mask = 0;
+		status = -ENOSPC;
+	}
+
+	/* else scan the schedule to find a group of slots such that all
+	 * uframes have enough periodic bandwidth available.
+	 */
+	if (status) {
+		/* "normal" case, uframing flexible except with splits.
+		 * Up to qh->period candidate frames are tried, chosen by
+		 * advancing random_frame; each is probed at uframes 0..7.
+		 */
+		if (qh->period) {
+			int		i;
+
+			for (i = qh->period; status && i > 0; --i) {
+				frame = ++fusbh200->random_frame % qh->period;
+				for (uframe = 0; uframe < 8; uframe++) {
+					status = check_intr_schedule (fusbh200,
+							frame, uframe, qh,
+							&c_mask);
+					if (status == 0)
+						break;
+				}
+			}
+
+		/* qh->period == 0 means every uframe */
+		} else {
+			frame = 0;
+			status = check_intr_schedule (fusbh200, 0, 0, qh, &c_mask);
+		}
+		if (status)
+			goto done;
+		qh->start = frame;
+
+		/* reset S-frame and (maybe) C-frame masks */
+		hw->hw_info2 &= cpu_to_hc32(fusbh200, ~(QH_CMASK | QH_SMASK));
+		hw->hw_info2 |= qh->period
+			? cpu_to_hc32(fusbh200, 1 << uframe)
+			: cpu_to_hc32(fusbh200, QH_SMASK);
+		hw->hw_info2 |= c_mask;
+	} else
+		fusbh200_dbg (fusbh200, "reused qh %p schedule\n", qh);
+
+	/* stuff into the periodic schedule */
+	qh_link_periodic(fusbh200, qh);
+done:
+	return status;
+}
+/*
+ * Queue an interrupt urb: link it to its endpoint, get (and if idle,
+ * schedule) the endpoint's qh, then append the urb's qtds to that qh.
+ *
+ * Returns 0 on success.  On failure the urb is unlinked from the endpoint
+ * again (if it had been linked), qtd_list is freed and the error is
+ * returned: -ESHUTDOWN, -ENOMEM, or whatever usb_hcd_link_urb_to_ep() or
+ * qh_schedule() reported.
+ */
+static int intr_submit (
+	struct fusbh200_hcd		*fusbh200,
+	struct urb		*urb,
+	struct list_head	*qtd_list,
+	gfp_t			mem_flags
+) {
+	unsigned		epnum;
+	unsigned long		flags;
+	struct fusbh200_qh		*qh;
+	int			status;
+	struct list_head	empty;
+
+	/* get endpoint and transfer/schedule data */
+	epnum = urb->ep->desc.bEndpointAddress;
+
+	spin_lock_irqsave (&fusbh200->lock, flags);
+
+	if (unlikely(!HCD_HW_ACCESSIBLE(fusbh200_to_hcd(fusbh200)))) {
+		status = -ESHUTDOWN;
+		goto done_not_linked;
+	}
+	status = usb_hcd_link_urb_to_ep(fusbh200_to_hcd(fusbh200), urb);
+	if (unlikely(status))
+		goto done_not_linked;
+
+	/* get qh and force any scheduling errors; the first call passes an
+	 * empty list so no qtds are attached before scheduling succeeded
+	 */
+	INIT_LIST_HEAD (&empty);
+	qh = qh_append_tds(fusbh200, urb, &empty, epnum, &urb->ep->hcpriv);
+	if (qh == NULL) {
+		status = -ENOMEM;
+		goto done;
+	}
+	if (qh->qh_state == QH_STATE_IDLE) {
+		if ((status = qh_schedule (fusbh200, qh)) != 0)
+			goto done;
+	}
+
+	/* then queue the urb's tds to the qh */
+	qh = qh_append_tds(fusbh200, urb, qtd_list, epnum, &urb->ep->hcpriv);
+	BUG_ON (qh == NULL);
+
+	/* ... update usbfs periodic stats */
+	fusbh200_to_hcd(fusbh200)->self.bandwidth_int_reqs++;
+
+done:
+	if (unlikely(status))
+		usb_hcd_unlink_urb_from_ep(fusbh200_to_hcd(fusbh200), urb);
+done_not_linked:
+	spin_unlock_irqrestore (&fusbh200->lock, flags);
+	if (status)
+		qtd_list_free (fusbh200, urb, qtd_list);
+
+	return status;
+}
+/*
+ * Walk intr_qh_list and run qh_completions() on every qh that has qtds
+ * queued.  A qh that asks for a rescan, or that ended up with an empty
+ * qtd_list while still in QH_STATE_LINKED, gets unlinked; otherwise a
+ * nonzero qh_completions() result makes the same qh be scanned again.
+ */
+static void scan_intr(struct fusbh200_hcd *fusbh200)
+{
+	struct fusbh200_qh		*qh;
+
+	list_for_each_entry_safe(qh, fusbh200->qh_scan_next, &fusbh200->intr_qh_list,
+			intr_node) {
+ rescan:
+		/* clean any finished work for this qh */
+		if (!list_empty(&qh->qtd_list)) {
+			int temp;
+
+			/*
+			 * Unlinks could happen here; completion reporting
+			 * drops the lock.  That's why fusbh200->qh_scan_next
+			 * always holds the next qh to scan; if the next qh
+			 * gets unlinked then fusbh200->qh_scan_next is adjusted
+			 * in qh_unlink_periodic().
+			 */
+			temp = qh_completions(fusbh200, qh);
+			if (unlikely(qh->needs_rescan ||
+					(list_empty(&qh->qtd_list) &&
+						qh->qh_state == QH_STATE_LINKED)))
+				start_unlink_intr(fusbh200, qh);
+			else if (temp != 0)
+				goto rescan;
+		}
+	}
+}
+
+/*-------------------------------------------------------------------------*/
+
+/* fusbh200_iso_stream ops work with both ITD and SITD */
+
+/*
+ * Allocate a zeroed iso stream with empty td_list/free_list and
+ * next_uframe preset to -1.  Returns NULL if the allocation fails.
+ */
+static struct fusbh200_iso_stream *
+iso_stream_alloc (gfp_t mem_flags)
+{
+	struct fusbh200_iso_stream *new_stream;
+
+	new_stream = kzalloc(sizeof *new_stream, mem_flags);
+	if (unlikely (new_stream == NULL))
+		return NULL;
+
+	INIT_LIST_HEAD(&new_stream->td_list);
+	INIT_LIST_HEAD(&new_stream->free_list);
+	new_stream->next_uframe = -1;
+
+	return new_stream;
+}
+
+/*
+ * Fill in a freshly allocated iso stream from the endpoint's pipe and
+ * the urb's interval.
+ *
+ * @fusbh200: controller, used for the cpu_to_hc32() conversions
+ * @stream: stream to initialise
+ * @dev: device owning the endpoint
+ * @pipe: urb pipe; supplies endpoint number and direction
+ * @interval: urb interval; multiplied by 8 for full-speed devices
+ *
+ * Sets up the three cached iTD buffer-pointer words (buf0..buf2), the
+ * per-transaction time estimate (usecs), the bandwidth figure, and the
+ * udev / bEndpointAddress / interval / maxp bookkeeping fields.
+ */
+static void
+iso_stream_init (
+	struct fusbh200_hcd		*fusbh200,
+	struct fusbh200_iso_stream	*stream,
+	struct usb_device	*dev,
+	int			pipe,
+	unsigned		interval
+)
+{
+	u32			buf1;
+	unsigned		epnum, maxp;
+	int			is_input;
+	long			bandwidth;
+	unsigned		multi;
+
+	epnum = usb_pipeendpoint (pipe);
+	is_input = usb_pipein (pipe) ? USB_DIR_IN : 0;
+	maxp = usb_maxpacket(dev, pipe, !is_input);
+
+	/* bit 11 of the second buffer word is set for IN endpoints */
+	buf1 = is_input ? (1 << 11) : 0;
+
+	/*
+	 * this might be a "high bandwidth" highspeed endpoint,
+	 * as encoded in the ep descriptor's wMaxPacket field.
+	 *
+	 * The multiplier must be taken from the raw value: once
+	 * max_packet() has reduced maxp to the bare packet size the
+	 * multiplier information is gone and hb_mult() could only ever
+	 * report a single transaction per microframe.
+	 * NOTE(review): assumes hb_mult()/max_packet() decode
+	 * wMaxPacketSize the same way as their ehci namesakes -- confirm
+	 * against their definitions earlier in this file.
+	 */
+	multi = hb_mult(maxp);
+	maxp = max_packet(maxp);
+	buf1 |= maxp;
+	maxp *= multi;
+
+	stream->buf0 = cpu_to_hc32(fusbh200, (epnum << 8) | dev->devnum);
+	stream->buf1 = cpu_to_hc32(fusbh200, buf1);
+	stream->buf2 = cpu_to_hc32(fusbh200, multi);
+
+	/* usbfs wants to report the average usecs per frame tied up
+	 * when transfers on this endpoint are scheduled ...
+	 */
+	if (dev->speed == USB_SPEED_FULL) {
+		interval <<= 3;
+		stream->usecs = NS_TO_US(usb_calc_bus_time(dev->speed,
+				is_input, 1, maxp));
+		stream->usecs /= 8;
+	} else {
+		stream->highspeed = 1;
+		stream->usecs = HS_USECS_ISO (maxp);
+	}
+	bandwidth = stream->usecs * 8;
+	bandwidth /= interval;
+
+	stream->bandwidth = bandwidth;
+	stream->udev = dev;
+	stream->bEndpointAddress = is_input | epnum;
+	stream->interval = interval;
+	stream->maxp = maxp;
+}
+
+/*
+ * Look up the iso stream hanging off the urb's endpoint, creating and
+ * initialising one if the endpoint has none yet.  Returns NULL when the
+ * allocation fails or when the endpoint's hcpriv has "hw" set, i.e. it
+ * is not an iso stream.
+ */
+static struct fusbh200_iso_stream *
+iso_stream_find (struct fusbh200_hcd *fusbh200, struct urb *urb)
+{
+	unsigned		epnum = usb_pipeendpoint (urb->pipe);
+	struct usb_host_endpoint *ep;
+	struct fusbh200_iso_stream	*stream;
+	unsigned long		flags;
+
+	ep = usb_pipein(urb->pipe) ? urb->dev->ep_in[epnum]
+				   : urb->dev->ep_out[epnum];
+
+	spin_lock_irqsave (&fusbh200->lock, flags);
+	stream = ep->hcpriv;
+
+	if (likely (stream != NULL)) {
+		/* if dev->ep [epnum] is a QH, hw is set */
+		if (unlikely (stream->hw != NULL)) {
+			fusbh200_dbg (fusbh200, "dev %s ep%d%s, not iso??\n",
+				urb->dev->devpath, epnum,
+				usb_pipein(urb->pipe) ? "in" : "out");
+			stream = NULL;
+		}
+	} else {
+		/* first iso urb on this endpoint: set up its stream */
+		stream = iso_stream_alloc(GFP_ATOMIC);
+		if (likely (stream != NULL)) {
+			ep->hcpriv = stream;
+			stream->ep = ep;
+			iso_stream_init(fusbh200, stream, urb->dev, urb->pipe,
+					urb->interval);
+		}
+	}
+
+	spin_unlock_irqrestore (&fusbh200->lock, flags);
+	return stream;
+}
+
+/*-------------------------------------------------------------------------*/
+
+/* fusbh200_iso_sched ops can be ITD-only or SITD-only */
+
+/*
+ * Allocate a zeroed iso schedule with room for "packets" trailing
+ * per-packet descriptors; its td_list starts out empty.
+ * Returns NULL if the allocation fails.
+ */
+static struct fusbh200_iso_sched *
+iso_sched_alloc (unsigned packets, gfp_t mem_flags)
+{
+	struct fusbh200_iso_sched	*new_sched;
+	int			size;
+
+	size = sizeof *new_sched
+		+ packets * sizeof (struct fusbh200_iso_packet);
+	new_sched = kzalloc(size, mem_flags);
+	if (unlikely (new_sched == NULL))
+		return NULL;
+
+	INIT_LIST_HEAD (&new_sched->td_list);
+	return new_sched;
+}
+/*
+ * Precompute, for each packet of the urb, the iTD transaction word, the
+ * buffer page address and the page-crossing flag that itd_patch() later
+ * copies into an iTD, and record how many uframes the urb spans.
+ */
+static inline void
+itd_sched_init(
+	struct fusbh200_hcd		*fusbh200,
+	struct fusbh200_iso_sched	*iso_sched,
+	struct fusbh200_iso_stream	*stream,
+	struct urb		*urb
+)
+{
+	unsigned	i;
+	dma_addr_t	dma = urb->transfer_dma;
+
+	/* how many uframes are needed for these transfers */
+	iso_sched->span = urb->number_of_packets * stream->interval;
+
+	/* figure out per-uframe itd fields that we'll need later
+	 * when we fit new itds into the schedule.
+	 */
+	for (i = 0; i < urb->number_of_packets; i++) {
+		struct fusbh200_iso_packet	*uframe = &iso_sched->packet [i];
+		unsigned		length;
+		dma_addr_t		buf;
+		u32			trans;
+
+		length = urb->iso_frame_desc [i].length;
+		buf = dma + urb->iso_frame_desc [i].offset;
+
+		trans = FUSBH200_ISOC_ACTIVE;
+		trans |= buf & 0x0fff;		/* offset within the page */
+		if (unlikely (((i + 1) == urb->number_of_packets))
+				&& !(urb->transfer_flags & URB_NO_INTERRUPT))
+			trans |= FUSBH200_ITD_IOC;	/* last packet only */
+		trans |= length << 16;
+		uframe->transaction = cpu_to_hc32(fusbh200, trans);
+
+		/* might need to cross a buffer page within a uframe:
+		 * "cross" is set when the end of the packet lies in a
+		 * different 4 KiB page than its start
+		 */
+		uframe->bufp = (buf & ~(u64)0x0fff);
+		buf += length;
+		if (unlikely ((uframe->bufp != (buf & ~(u64)0x0fff))))
+			uframe->cross = 1;
+	}
+}
+
+/*
+ * Release an iso schedule: any iTDs still queued on it are handed back
+ * to the stream's free list first.  A NULL iso_sched is ignored.
+ */
+static void
+iso_sched_free (
+	struct fusbh200_iso_stream	*stream,
+	struct fusbh200_iso_sched	*iso_sched
+)
+{
+	if (iso_sched) {
+		// caller must hold fusbh200->lock!
+		list_splice (&iso_sched->td_list, &stream->free_list);
+		kfree (iso_sched);
+	}
+}
+/*
+ * Build the urb's iso schedule and collect the iTDs it needs, reusing
+ * entries from the stream's free list where possible and allocating the
+ * rest from itd_pool (with fusbh200->lock dropped around the allocation).
+ * On success the schedule is parked in urb->hcpriv, urb->error_count is
+ * cleared and 0 is returned; otherwise -ENOMEM.
+ */
+static int
+itd_urb_transaction (
+	struct fusbh200_iso_stream	*stream,
+	struct fusbh200_hcd		*fusbh200,
+	struct urb		*urb,
+	gfp_t			mem_flags
+)
+{
+	struct fusbh200_itd		*itd;
+	dma_addr_t		itd_dma;
+	int			i;
+	unsigned		num_itds;
+	struct fusbh200_iso_sched	*sched;
+	unsigned long		flags;
+
+	sched = iso_sched_alloc (urb->number_of_packets, mem_flags);
+	if (unlikely (sched == NULL))
+		return -ENOMEM;
+
+	itd_sched_init(fusbh200, sched, stream, urb);
+
+	if (urb->interval < 8)
+		num_itds = 1 + (sched->span + 7) / 8;
+	else
+		num_itds = urb->number_of_packets;
+
+	/* allocate/init ITDs */
+	spin_lock_irqsave (&fusbh200->lock, flags);
+	for (i = 0; i < num_itds; i++) {
+
+		/*
+		 * Use iTDs from the free list, but not iTDs that may
+		 * still be in use by the hardware.
+		 */
+		if (likely(!list_empty(&stream->free_list))) {
+			itd = list_first_entry(&stream->free_list,
+					struct fusbh200_itd, itd_list);
+			if (itd->frame == fusbh200->now_frame)	/* frame recorded
+								 * by scan_isoc() */
+				goto alloc_itd;
+			list_del (&itd->itd_list);
+			itd_dma = itd->itd_dma;
+		} else {
+ alloc_itd:
+			spin_unlock_irqrestore (&fusbh200->lock, flags);
+			itd = dma_pool_alloc (fusbh200->itd_pool, mem_flags,
+					&itd_dma);
+			spin_lock_irqsave (&fusbh200->lock, flags);
+			if (!itd) {
+				iso_sched_free(stream, sched);
+				spin_unlock_irqrestore(&fusbh200->lock, flags);
+				return -ENOMEM;
+			}
+		}
+
+		memset (itd, 0, sizeof *itd);
+		itd->itd_dma = itd_dma;	/* restore: memset wiped it */
+		list_add (&itd->itd_list, &sched->td_list);
+	}
+	spin_unlock_irqrestore (&fusbh200->lock, flags);
+
+	/* temporarily store schedule info in hcpriv */
+	urb->hcpriv = sched;
+	urb->error_count = 0;
+	return 0;
+}
+
+/*-------------------------------------------------------------------------*/
+
+/*
+ * Check that "usecs" more microseconds fit into every uframe slot the
+ * stream would use: uframe % period, then every "period" uframes while
+ * the slot number stays below "mod".  Returns 1 if all slots fit, else 0.
+ */
+static inline int
+itd_slot_ok (
+	struct fusbh200_hcd		*fusbh200,
+	u32			mod,
+	u32			uframe,
+	u8			usecs,
+	u32			period
+)
+{
+	uframe %= period;
+	for (;;) {
+		/* can't commit more than uframe_periodic_max usec */
+		if (periodic_usecs (fusbh200, uframe >> 3, uframe & 0x7)
+				> (fusbh200->uframe_periodic_max - usecs))
+			return 0;
+
+		/* we know urb->interval is 2^N uframes */
+		uframe += period;
+		if (uframe >= mod)
+			return 1;
+	}
+}
+
+/*
+ * This scheduler plans almost as far into the future as it has actual
+ * periodic schedule slots.  (Affected by TUNE_FLS, which defaults to
+ * "as small as possible" to be cache-friendlier.)  That limits the size
+ * transfers you can stream reliably; avoid more than 64 msec per urb.
+ * Also avoid queue depths of less than fusbh200's worst irq latency (affected
+ * by the per-urb URB_NO_INTERRUPT hint, the log2_irq_thresh module parameter,
+ * and other factors); or more than about 230 msec total (for portability,
+ * given FUSBH200_TUNE_FLS and the slop).  Or, write a smarter scheduler!
+ */
+
+#define SCHEDULE_SLOP	80	/* microframes */
+/*
+ * Pick the uframe at which the urb's first packet will go out and store
+ * it in stream->next_uframe and (in uframes for high speed, in frames
+ * otherwise) urb->start_frame.  Returns 0 on success.  On failure
+ * (-EFBIG or -ENOSPC) the urb's iso schedule is freed and urb->hcpriv is
+ * cleared.
+ */
+static int
+iso_stream_schedule (
+	struct fusbh200_hcd		*fusbh200,
+	struct urb		*urb,
+	struct fusbh200_iso_stream	*stream
+)
+{
+	u32			now, next, start, period, span;
+	int			status;
+	unsigned		mod = fusbh200->periodic_size << 3;
+	struct fusbh200_iso_sched	*sched = urb->hcpriv;
+
+	period = urb->interval;
+	span = sched->span;
+
+	if (span > mod - SCHEDULE_SLOP) {
+		fusbh200_dbg (fusbh200, "iso request %p too long\n", urb);
+		status = -EFBIG;
+		goto fail;
+	}
+
+	now = fusbh200_read_frame_index(fusbh200) & (mod - 1);
+
+	/* Typical case: reuse current schedule, stream is still active.
+	 * Hopefully there are no gaps from the host falling behind
+	 * (irq delays etc), but if there are we'll take the next
+	 * slot in the schedule, implicitly assuming URB_ISO_ASAP.
+	 */
+	if (likely (!list_empty (&stream->td_list))) {
+		u32	excess;
+
+		/* For high speed devices, allow scheduling within the
+		 * isochronous scheduling threshold.  For full speed devices
+		 * and Intel PCI-based controllers, don't (work around for
+		 * Intel ICH9 bug).
+		 */
+		if (!stream->highspeed && fusbh200->fs_i_thresh)
+			next = now + fusbh200->i_thresh;
+		else
+			next = now;
+
+		/* Fell behind (by up to twice the slop amount)?
+		 * We decide based on the time of the last currently-scheduled
+		 * slot, not the time of the next available slot.
+		 */
+		excess = (stream->next_uframe - period - next) & (mod - 1);
+		if (excess >= mod - 2 * SCHEDULE_SLOP)
+			start = next + excess - mod + period *
+					DIV_ROUND_UP(mod - excess, period);
+		else
+			start = next + excess + period;
+		if (start - now >= mod) {
+			fusbh200_dbg(fusbh200, "request %p would overflow (%d+%d >= %d)\n",
+					urb, start - now - period, period,
+					mod);
+			status = -EFBIG;
+			goto fail;
+		}
+	}
+
+	/* need to schedule; when's the next (u)frame we could start?
+	 * this is bigger than fusbh200->i_thresh allows; scheduling itself
+	 * isn't free, the slop should handle reasonably slow cpus.  it
+	 * can also help high bandwidth if the dma and irq loads don't
+	 * jump until after the queue is primed.
+	 */
+	else {
+		int done = 0;
+		start = SCHEDULE_SLOP + (now & ~0x07);
+
+		/* NOTE:  assumes URB_ISO_ASAP, to limit complexity/bugs */
+
+		/* find a uframe slot with enough bandwidth.
+		 * Early uframes are more precious because full-speed
+		 * iso IN transfers can't use late uframes,
+		 * and therefore they should be allocated last.
+		 *
+		 * The candidates next + period - 1 down to next are
+		 * therefore tried in descending order.
+		 */
+		next = start;
+		start += period;
+		do {
+			start--;
+			/* check schedule: enough space? */
+			if (itd_slot_ok(fusbh200, mod, start,
+					stream->usecs, period))
+				done = 1;
+		} while (start > next && !done);
+
+		/* no room in the schedule */
+		if (!done) {
+			fusbh200_dbg(fusbh200, "iso resched full %p (now %d max %d)\n",
+				urb, now, now + mod);
+			status = -ENOSPC;
+			goto fail;
+		}
+	}
+
+	/* Tried to schedule too far into the future? */
+	if (unlikely(start - now + span - period
+				>= mod - 2 * SCHEDULE_SLOP)) {
+		fusbh200_dbg(fusbh200, "request %p would overflow (%d+%d >= %d)\n",
+				urb, start - now, span - period,
+				mod - 2 * SCHEDULE_SLOP);
+		status = -EFBIG;
+		goto fail;
+	}
+
+	stream->next_uframe = start & (mod - 1);
+
+	/* report high speed start in uframes; full speed, in frames */
+	urb->start_frame = stream->next_uframe;
+	if (!stream->highspeed)
+		urb->start_frame >>= 3;
+
+	/* Make sure scan_isoc() sees these */
+	if (fusbh200->isoc_count == 0)
+		fusbh200->next_frame = now >> 3;
+	return 0;
+
+ fail:
+	iso_sched_free(stream, sched);
+	urb->hcpriv = NULL;
+	return status;
+}
+
+/*-------------------------------------------------------------------------*/
+
+/*
+ * Prepare an iTD for use by "stream": mark all eight uframe slots as
+ * carrying no packet (index -1), terminate its hardware link and copy in
+ * the stream's three cached buffer-pointer words.  The iTD is expected
+ * to have been zeroed shortly before; all remaining fields are filled in
+ * when the iTD is scheduled.
+ */
+static inline void
+itd_init(struct fusbh200_hcd *fusbh200, struct fusbh200_iso_stream *stream,
+		struct fusbh200_itd *itd)
+{
+	int slot = 8;
+
+	while (slot-- > 0)
+		itd->index[slot] = -1;
+
+	itd->hw_next = FUSBH200_LIST_END(fusbh200);
+
+	itd->hw_bufp [0] = stream->buf0;
+	itd->hw_bufp [1] = stream->buf1;
+	itd->hw_bufp [2] = stream->buf2;
+}
+/*
+ * Copy packet "index" of the iso schedule into microframe slot "uframe"
+ * (only the low three bits are used) of the iTD: the transaction word,
+ * tagged with the iTD's current buffer page number, and the buffer page
+ * address.  When the packet is flagged as crossing a page, itd->pg is
+ * advanced and the following 4 KiB page is programmed as well.
+ */
+static inline void
+itd_patch(
+	struct fusbh200_hcd		*fusbh200,
+	struct fusbh200_itd		*itd,
+	struct fusbh200_iso_sched	*iso_sched,
+	unsigned		index,
+	u16			uframe
+)
+{
+	struct fusbh200_iso_packet	*uf = &iso_sched->packet [index];
+	unsigned		pg = itd->pg;
+
+	// BUG_ON (pg == 6 && uf->cross);
+
+	uframe &= 0x07;
+	itd->index [uframe] = index;	/* remembered for itd_complete() */
+
+	itd->hw_transaction[uframe] = uf->transaction;
+	itd->hw_transaction[uframe] |= cpu_to_hc32(fusbh200, pg << 12);
+	itd->hw_bufp[pg] |= cpu_to_hc32(fusbh200, uf->bufp & ~(u32)0);
+	itd->hw_bufp_hi[pg] |= cpu_to_hc32(fusbh200, (u32)(uf->bufp >> 32));
+
+	/* iso_frame_desc[].offset must be strictly increasing */
+	if (unlikely (uf->cross)) {
+		u64	bufp = uf->bufp + 4096;
+
+		itd->pg = ++pg;
+		itd->hw_bufp[pg] |= cpu_to_hc32(fusbh200, bufp & ~(u32)0);
+		itd->hw_bufp_hi[pg] |= cpu_to_hc32(fusbh200, (u32)(bufp >> 32));
+	}
+}
+/*
+ * Insert itd into the periodic list of "frame": after any entries at the
+ * head that are not QHs, i.e. in front of the first QH or at the end of
+ * the list.  Also records the frame number in itd->frame.
+ */
+static inline void
+itd_link (struct fusbh200_hcd *fusbh200, unsigned frame, struct fusbh200_itd *itd)
+{
+	union fusbh200_shadow	*prev = &fusbh200->pshadow[frame];
+	__hc32			*hw_p = &fusbh200->periodic[frame];
+	union fusbh200_shadow	here = *prev;
+	__hc32			type = 0;
+
+	/* skip any iso nodes which might belong to previous microframes */
+	while (here.ptr) {
+		type = Q_NEXT_TYPE(fusbh200, *hw_p);
+		if (type == cpu_to_hc32(fusbh200, Q_TYPE_QH))
+			break;
+		prev = periodic_next_shadow(fusbh200, prev, type);
+		hw_p = shadow_next_periodic(fusbh200, &here, type);
+		here = *prev;
+	}
+
+	itd->itd_next = here;
+	itd->hw_next = *hw_p;
+	prev->itd = itd;
+	itd->frame = frame;
+	wmb ();		/* itd's own links are written before the
+			 * hardware pointer to it is stored below */
+	*hw_p = cpu_to_hc32(fusbh200, itd->itd_dma | Q_TYPE_ITD);
+}
+
+/* fit urb's itds into the selected schedule slot; activate as needed
+ *
+ * Packets are placed stream->interval uframes apart, starting at
+ * stream->next_uframe (taken modulo "mod").  An iTD is taken from the
+ * urb's iso schedule whenever a new frame is entered and is linked into
+ * the periodic schedule once the next packet falls into another frame or
+ * the urb has no more packets.  Afterwards the iso schedule is freed,
+ * urb->hcpriv is cleared, isoc_count is bumped and enable_periodic() is
+ * called.  If the stream had no iTDs queued, its bandwidth is added to
+ * the usbfs figure first.
+ */
+static void itd_link_urb(
+	struct fusbh200_hcd		*fusbh200,
+	struct urb		*urb,
+	unsigned		mod,
+	struct fusbh200_iso_stream	*stream
+)
+{
+	int			packet;
+	unsigned		next_uframe, uframe, frame;
+	struct fusbh200_iso_sched	*iso_sched = urb->hcpriv;
+	struct fusbh200_itd		*itd;
+
+	next_uframe = stream->next_uframe & (mod - 1);
+
+	if (unlikely (list_empty(&stream->td_list))) {
+		fusbh200_to_hcd(fusbh200)->self.bandwidth_allocated
+				+= stream->bandwidth;
+		fusbh200_vdbg (fusbh200,
+			"schedule devp %s ep%d%s-iso period %d start %d.%d\n",
+			urb->dev->devpath, stream->bEndpointAddress & 0x0f,
+			(stream->bEndpointAddress & USB_DIR_IN) ? "in" : "out",
+			urb->interval,
+			next_uframe >> 3, next_uframe & 0x7);
+	}
+
+	/* fill iTDs uframe by uframe */
+	for (packet = 0, itd = NULL; packet < urb->number_of_packets; ) {
+		if (itd == NULL) {
+			/* ASSERT:  we have all necessary itds */
+			// BUG_ON (list_empty (&iso_sched->td_list));
+
+			/* ASSERT:  no itds for this endpoint in this uframe */
+
+			itd = list_entry (iso_sched->td_list.next,
+					struct fusbh200_itd, itd_list);
+			list_move_tail (&itd->itd_list, &stream->td_list);
+			itd->stream = stream;
+			itd->urb = urb;
+			itd_init (fusbh200, stream, itd);
+		}
+
+		uframe = next_uframe & 0x07;
+		frame = next_uframe >> 3;
+
+		itd_patch(fusbh200, itd, iso_sched, packet, uframe);
+
+		next_uframe += stream->interval;
+		next_uframe &= mod - 1;
+		packet++;
+
+		/* link completed itds into the schedule */
+		if (((next_uframe >> 3) != frame)
+				|| packet == urb->number_of_packets) {
+			itd_link(fusbh200, frame & (fusbh200->periodic_size - 1), itd);
+			itd = NULL;
+		}
+	}
+	stream->next_uframe = next_uframe;
+
+	/* don't need that schedule data any more */
+	iso_sched_free (stream, iso_sched);
+	urb->hcpriv = NULL;
+
+	++fusbh200->isoc_count;
+	enable_periodic(fusbh200);
+}
+
+#define	ISO_ERRS (FUSBH200_ISOC_BUF_ERR | FUSBH200_ISOC_BABBLE | FUSBH200_ISOC_XACTERR)
+
+/* Process and recycle a completed ITD.  Return true iff its urb completed,
+ * and hence its completion callback probably added things to the hardware
+ * schedule.
+ *
+ * Note that we carefully avoid recycling this descriptor until after any
+ * completion callback runs, so that it won't be reused quickly.  That is,
+ * assuming (a) no more than two urbs per frame on this endpoint, and also
+ * (b) only this endpoint's completions submit URBs.  It seems some silicon
+ * corrupts things if you reuse completed descriptors very quickly...
+ *
+ * Each uframe slot that carries a packet has its status and actual length
+ * copied into the urb's iso_frame_desc[] entry.  The urb is given back
+ * once the iTD holding its last packet has been processed.  The iTD always
+ * ends up at the tail of the stream's free list; when the stream's td_list
+ * is then empty, the whole free list is moved to cached_itd_list and
+ * start_free_itds() is called.
+ */
+static bool itd_complete(struct fusbh200_hcd *fusbh200, struct fusbh200_itd *itd)
+{
+	struct urb				*urb = itd->urb;
+	struct usb_iso_packet_descriptor	*desc;
+	u32					t;
+	unsigned				uframe;
+	int					urb_index = -1;
+	struct fusbh200_iso_stream			*stream = itd->stream;
+	struct usb_device			*dev;
+	bool					retval = false;
+
+	/* for each uframe with a packet */
+	for (uframe = 0; uframe < 8; uframe++) {
+		if (likely (itd->index[uframe] == -1))
+			continue;
+		urb_index = itd->index[uframe];
+		desc = &urb->iso_frame_desc [urb_index];
+
+		t = hc32_to_cpup(fusbh200, &itd->hw_transaction [uframe]);
+		itd->hw_transaction [uframe] = 0;
+
+		/* report transfer status */
+		if (unlikely (t & ISO_ERRS)) {
+			urb->error_count++;
+			if (t & FUSBH200_ISOC_BUF_ERR)
+				desc->status = usb_pipein (urb->pipe)
+					? -ENOSR  /* hc couldn't read */
+					: -ECOMM; /* hc couldn't write */
+			else if (t & FUSBH200_ISOC_BABBLE)
+				desc->status = -EOVERFLOW;
+			else /* (t & FUSBH200_ISOC_XACTERR) */
+				desc->status = -EPROTO;
+
+			/* HC need not update length with this error */
+			if (!(t & FUSBH200_ISOC_BABBLE)) {
+				desc->actual_length = fusbh200_itdlen(urb, desc, t);
+				urb->actual_length += desc->actual_length;
+			}
+		} else if (likely ((t & FUSBH200_ISOC_ACTIVE) == 0)) {
+			desc->status = 0;
+			desc->actual_length = fusbh200_itdlen(urb, desc, t);
+			urb->actual_length += desc->actual_length;
+		} else {
+			/* URB was too late */
+			desc->status = -EXDEV;
+		}
+	}
+
+	/* handle completion now?  only if the last packet index seen in
+	 * this iTD is the urb's final packet
+	 */
+	if (likely ((urb_index + 1) != urb->number_of_packets))
+		goto done;
+
+	/* ASSERT: it's really the last itd for this urb
+	list_for_each_entry (itd, &stream->td_list, itd_list)
+		BUG_ON (itd->urb == urb);
+	 */
+
+	/* give urb back to the driver; completion often (re)submits */
+	dev = urb->dev;
+	fusbh200_urb_done(fusbh200, urb, 0);
+	retval = true;
+	urb = NULL;
+
+	--fusbh200->isoc_count;
+	disable_periodic(fusbh200);
+
+	if (unlikely(list_is_singular(&stream->td_list))) {
+		fusbh200_to_hcd(fusbh200)->self.bandwidth_allocated
+				-= stream->bandwidth;
+		fusbh200_vdbg (fusbh200,
+			"deschedule devp %s ep%d%s-iso\n",
+			dev->devpath, stream->bEndpointAddress & 0x0f,
+			(stream->bEndpointAddress & USB_DIR_IN) ? "in" : "out");
+	}
+
+done:
+	itd->urb = NULL;
+
+	/* Add to the end of the free list for later reuse */
+	list_move_tail(&itd->itd_list, &stream->free_list);
+
+	/* Recycle the iTDs when the pipeline is empty (ep no longer in use) */
+	if (list_empty(&stream->td_list)) {
+		list_splice_tail_init(&stream->free_list,
+				&fusbh200->cached_itd_list);
+		start_free_itds(fusbh200);
+	}
+
+	return retval;
+}
+
+/*-------------------------------------------------------------------------*/
+
+/*
+ * Submit an isochronous urb.
+ *
+ * @fusbh200: controller the urb is queued on
+ * @urb: the iso urb
+ * @mem_flags: allocation flags for the schedule and any new iTDs
+ *
+ * Finds (or creates) the endpoint's iso stream, builds the urb's iso
+ * schedule and iTDs, then - under fusbh200->lock - links the urb to its
+ * endpoint, picks a start slot and links the iTDs into the periodic
+ * schedule.
+ *
+ * Returns 0 on success, or a negative errno: -ENOMEM (no stream, or
+ * iTD/schedule allocation failed), -EINVAL (interval differs from the
+ * stream's while the port runs at high speed), -ESHUTDOWN (hardware not
+ * accessible), or the error from usb_hcd_link_urb_to_ep() /
+ * iso_stream_schedule().
+ */
+static int itd_submit (struct fusbh200_hcd *fusbh200, struct urb *urb,
+	gfp_t mem_flags)
+{
+	int			status = -EINVAL;
+	unsigned long		flags;
+	struct fusbh200_iso_stream	*stream;
+
+	/* Get iso_stream head */
+	stream = iso_stream_find (fusbh200, urb);
+	if (unlikely (stream == NULL)) {
+		fusbh200_dbg (fusbh200, "can't get iso stream\n");
+		return -ENOMEM;
+	}
+	if (unlikely (urb->interval != stream->interval &&
+		      fusbh200_port_speed(fusbh200, 0) == USB_PORT_STAT_HIGH_SPEED)) {
+			fusbh200_dbg (fusbh200, "can't change iso interval %d --> %d\n",
+				stream->interval, urb->interval);
+			goto done;
+	}
+
+#ifdef FUSBH200_URB_TRACE
+	fusbh200_dbg (fusbh200,
+		"%s %s urb %p ep%d%s len %d, %d pkts %d uframes [%p]\n",
+		__func__, urb->dev->devpath, urb,
+		usb_pipeendpoint (urb->pipe),
+		usb_pipein (urb->pipe) ? "in" : "out",
+		urb->transfer_buffer_length,
+		urb->number_of_packets, urb->interval,
+		stream);
+#endif
+
+	/* allocate ITDs w/o locking anything */
+	status = itd_urb_transaction (stream, fusbh200, urb, mem_flags);
+	if (unlikely (status < 0)) {
+		fusbh200_dbg (fusbh200, "can't init itds\n");
+		goto done;
+	}
+
+	/* schedule ... need to lock */
+	spin_lock_irqsave (&fusbh200->lock, flags);
+	if (unlikely(!HCD_HW_ACCESSIBLE(fusbh200_to_hcd(fusbh200)))) {
+		status = -ESHUTDOWN;
+		goto done_not_linked;
+	}
+	status = usb_hcd_link_urb_to_ep(fusbh200_to_hcd(fusbh200), urb);
+	if (unlikely(status))
+		goto done_not_linked;
+	status = iso_stream_schedule(fusbh200, urb, stream);
+	if (likely (status == 0))
+		itd_link_urb (fusbh200, urb, fusbh200->periodic_size << 3, stream);
+	else
+		usb_hcd_unlink_urb_from_ep(fusbh200_to_hcd(fusbh200), urb);
+ done_not_linked:
+	/*
+	 * When we bail out before iso_stream_schedule() ran, the schedule
+	 * built by itd_urb_transaction() is still parked in urb->hcpriv;
+	 * release it (handing its iTDs back to the stream's free list) so
+	 * it isn't leaked.  iso_stream_schedule() and itd_link_urb() clear
+	 * urb->hcpriv themselves, so this is a no-op on those paths.
+	 * The lock is still held here, as iso_sched_free() requires.
+	 */
+	if (unlikely(status) && urb->hcpriv) {
+		iso_sched_free(stream, urb->hcpriv);
+		urb->hcpriv = NULL;
+	}
+	spin_unlock_irqrestore (&fusbh200->lock, flags);
+ done:
+	return status;
+}
+
+/*-------------------------------------------------------------------------*/
+
+/*
+ * Scan the periodic schedule for finished iTDs.
+ *
+ * Walks the shadow schedule (fusbh200->pshadow[]) frame by frame, starting
+ * at fusbh200->next_frame and stopping once the "now" frame has been
+ * handled.  Each iTD that is no longer active is unlinked from both the
+ * shadow list and the hardware list and handed to itd_complete().  On
+ * return, next_frame is set to the frame at which the scan stopped.
+ */
+static void scan_isoc(struct fusbh200_hcd *fusbh200)
+{
+	unsigned	uf, now_frame, frame;
+	unsigned	fmask = fusbh200->periodic_size - 1;
+	bool		modified, live;
+
+	/*
+	 * When running, scan from last scan point up to "now"
+	 * else clean up by scanning everything that's left.
+	 * Touches as few pages as possible:  cache-friendly.
+	 */
+	if (fusbh200->rh_state >= FUSBH200_RH_RUNNING) {
+		/* frame index register counts microframes; >> 3 yields frames */
+		uf = fusbh200_read_frame_index(fusbh200);
+		now_frame = (uf >> 3) & fmask;
+		live = true;
+	} else  {
+		/* not running: stop one frame short of a full wrap-around */
+		now_frame = (fusbh200->next_frame - 1) & fmask;
+		live = false;
+	}
+	fusbh200->now_frame = now_frame;
+
+	frame = fusbh200->next_frame;
+	for (;;) {
+		union fusbh200_shadow	q, *q_p;
+		__hc32			type, *hw_p;
+
+restart:
+		/* scan each element in frame's queue for completions */
+		q_p = &fusbh200->pshadow [frame];
+		hw_p = &fusbh200->periodic [frame];
+		q.ptr = q_p->ptr;
+		type = Q_NEXT_TYPE(fusbh200, *hw_p);
+		modified = false;
+
+		while (q.ptr != NULL) {
+			switch (hc32_to_cpu(fusbh200, type)) {
+			case Q_TYPE_ITD:
+				/* If this ITD is still active, leave it for
+				 * later processing ... check the next entry.
+				 * No need to check for activity unless the
+				 * frame is current.
+				 */
+				if (frame == now_frame && live) {
+					rmb();
+					/* look for any of the 8 transactions still active */
+					for (uf = 0; uf < 8; uf++) {
+						if (q.itd->hw_transaction[uf] &
+							    ITD_ACTIVE(fusbh200))
+							break;
+					}
+					if (uf < 8) {
+						/* still active: step past it, leave it linked */
+						q_p = &q.itd->itd_next;
+						hw_p = &q.itd->hw_next;
+						type = Q_NEXT_TYPE(fusbh200,
+							q.itd->hw_next);
+						q = *q_p;
+						break;
+					}
+				}
+
+				/* Take finished ITDs out of the schedule
+				 * and process them:  recycle, maybe report
+				 * URB completion.  HC won't cache the
+				 * pointer for much longer, if at all.
+				 */
+				*q_p = q.itd->itd_next;
+				*hw_p = q.itd->hw_next;
+				type = Q_NEXT_TYPE(fusbh200, q.itd->hw_next);
+				wmb();
+				/* the return value is used as the "queue changed" flag below */
+				modified = itd_complete (fusbh200, q.itd);
+				q = *q_p;
+				break;
+			default:
+				fusbh200_dbg(fusbh200, "corrupt type %d frame %d shadow %p\n",
+					type, frame, q.ptr);
+				// BUG ();
+				/* FALL THROUGH */
+			case Q_TYPE_QH:
+			case Q_TYPE_FSTN:
+				/* End of the iTDs and siTDs */
+				q.ptr = NULL;
+				break;
+			}
+
+			/* assume completion callbacks modify the queue */
+			if (unlikely(modified && fusbh200->isoc_count > 0))
+				goto restart;
+		}
+
+		/* Stop when we have reached the current frame */
+		if (frame == now_frame)
+			break;
+		frame = (frame + 1) & fmask;
+	}
+	fusbh200->next_frame = now_frame;
+}
+/*-------------------------------------------------------------------------*/
+/*
+ * Display / Set uframe_periodic_max
+ */
+/* sysfs read handler: prints uframe_periodic_max as one decimal line */
+static ssize_t show_uframe_periodic_max(struct device *dev,
+					struct device_attribute *attr,
+					char *buf)
+{
+	struct usb_hcd		*hcd = bus_to_hcd(dev_get_drvdata(dev));
+
+	return scnprintf(buf, PAGE_SIZE, "%d\n",
+			hcd_to_fusbh200(hcd)->uframe_periodic_max);
+}
+
+
+/*
+ * sysfs write handler for uframe_periodic_max.
+ *
+ * Parses an unsigned value from @buf and accepts it only when it lies in
+ * [100, 125).  A decrease is additionally refused when any microframe of
+ * the periodic schedule already has more time allocated than the requested
+ * limit.  Returns @count on success and -EINVAL otherwise.
+ */
+static ssize_t store_uframe_periodic_max(struct device *dev,
+					struct device_attribute *attr,
+					const char *buf, size_t count)
+{
+	struct fusbh200_hcd		*fusbh200;
+	unsigned		uframe_periodic_max;
+	unsigned		frame, uframe;
+	unsigned short		allocated_max;
+	unsigned long		flags;
+	ssize_t			ret;
+
+	fusbh200 = hcd_to_fusbh200(bus_to_hcd(dev_get_drvdata(dev)));
+	if (kstrtouint(buf, 0, &uframe_periodic_max) < 0)
+		return -EINVAL;
+
+	if (uframe_periodic_max < 100 || uframe_periodic_max >= 125) {
+		fusbh200_info(fusbh200, "rejecting invalid request for "
+				"uframe_periodic_max=%u\n", uframe_periodic_max);
+		return -EINVAL;
+	}
+
+	ret = -EINVAL;
+
+	/*
+	 * lock, so that our checking does not race with possible periodic
+	 * bandwidth allocation through submitting new urbs.
+	 */
+	spin_lock_irqsave (&fusbh200->lock, flags);
+
+	/*
+	 * for request to decrease max periodic bandwidth, we have to check
+	 * every microframe in the schedule to see whether the decrease is
+	 * possible.
+	 */
+	if (uframe_periodic_max < fusbh200->uframe_periodic_max) {
+		allocated_max = 0;
+
+		/* a frame has 8 microframes (0..7); all of them must be checked */
+		for (frame = 0; frame < fusbh200->periodic_size; ++frame)
+			for (uframe = 0; uframe < 8; ++uframe)
+				allocated_max = max(allocated_max,
+						    periodic_usecs (fusbh200, frame, uframe));
+
+		if (allocated_max > uframe_periodic_max) {
+			fusbh200_info(fusbh200,
+				"cannot decrease uframe_periodic_max because "
+				"periodic bandwidth is already allocated "
+				"(%u > %u)\n",
+				allocated_max, uframe_periodic_max);
+			goto out_unlock;
+		}
+	}
+
+	/* increasing is always ok */
+
+	fusbh200_info(fusbh200, "setting max periodic bandwidth to %u%% "
+			"(== %u usec/uframe)\n",
+			100*uframe_periodic_max/125, uframe_periodic_max);
+
+	if (uframe_periodic_max != 100)
+		fusbh200_warn(fusbh200, "max periodic bandwidth set is non-standard\n");
+
+	fusbh200->uframe_periodic_max = uframe_periodic_max;
+	ret = count;
+
+out_unlock:
+	spin_unlock_irqrestore (&fusbh200->lock, flags);
+	return ret;
+}
+static DEVICE_ATTR(uframe_periodic_max, 0644, show_uframe_periodic_max, store_uframe_periodic_max);
+
+
+/*
+ * Create the uframe_periodic_max attribute on the controller device.
+ * Returns the result of device_create_file().
+ */
+static inline int create_sysfs_files(struct fusbh200_hcd *fusbh200)
+{
+	struct device	*controller = fusbh200_to_hcd(fusbh200)->self.controller;
+
+	return device_create_file(controller, &dev_attr_uframe_periodic_max);
+}
+
+/* Remove the uframe_periodic_max attribute from the controller device. */
+static inline void remove_sysfs_files(struct fusbh200_hcd *fusbh200)
+{
+	device_remove_file(fusbh200_to_hcd(fusbh200)->self.controller,
+			&dev_attr_uframe_periodic_max);
+}
+/*-------------------------------------------------------------------------*/
+
+/* On some systems, leaving remote wakeup enabled prevents system shutdown
+ * (the firmware seems to treat powering off as a wakeup event).
+ * Turn everything off by writing PORT_RWC_BITS to the port status register.
+ */
+static void fusbh200_turn_off_all_ports(struct fusbh200_hcd *fusbh200)
+{
+	fusbh200_writel(fusbh200, PORT_RWC_BITS, &fusbh200->regs->port_status);
+}
+
+/*
+ * Halt HC, turn off all ports, and let the BIOS use the companion controllers.
+ * Must be called with interrupts enabled and the lock not held.
+ *
+ * The halt is issued before the lock is taken; the root-hub state change
+ * to FUSBH200_RH_HALTED and the port write happen under fusbh200->lock.
+ */
+static void fusbh200_silence_controller(struct fusbh200_hcd *fusbh200)
+{
+	fusbh200_halt(fusbh200);
+
+	/* spin_lock_irq(): this is why interrupts must be enabled on entry */
+	spin_lock_irq(&fusbh200->lock);
+	fusbh200->rh_state = FUSBH200_RH_HALTED;
+	fusbh200_turn_off_all_ports(fusbh200);
+	spin_unlock_irq(&fusbh200->lock);
+}
+
+/* fusbh200_shutdown kicks in for silicon on any bus (not just pci, etc).
+ * This forcibly disables dma and IRQs, helping kexec and other cases
+ * where the next system software may expect clean state.
+ *
+ * Sequence: mark the controller as shutting down under the lock, silence
+ * it (halt + ports off), then cancel the driver's hrtimer.
+ */
+static void fusbh200_shutdown(struct usb_hcd *hcd)
+{
+	struct fusbh200_hcd	*fusbh200 = hcd_to_fusbh200(hcd);
+
+	spin_lock_irq(&fusbh200->lock);
+	fusbh200->shutdown = true;
+	fusbh200->rh_state = FUSBH200_RH_STOPPING;
+	/* clear the mask of enabled hrtimer events before cancelling the timer */
+	fusbh200->enabled_hrtimer_events = 0;
+	spin_unlock_irq(&fusbh200->lock);
+
+	/* takes the lock itself, so it must be called with the lock dropped */
+	fusbh200_silence_controller(fusbh200);
+
+	hrtimer_cancel(&fusbh200->hrtimer);
+}
+
+/*-------------------------------------------------------------------------*/
+
+/*
+ * fusbh200_work runs the schedule scans; it is invoked from interrupt
+ * handling, timers, and so on.  Driver completion functions are called
+ * from the scans after fusbh200->lock has been dropped.
+ */
+static void fusbh200_work (struct fusbh200_hcd *fusbh200)
+{
+	/* while completions are being reported another CPU may get in here
+	 * because fusbh200->lock was dropped.  rather than scanning
+	 * re-entrantly, ask the scan already in progress to go round again.
+	 */
+	if (fusbh200->scanning) {
+		fusbh200->need_rescan = true;
+		return;
+	}
+
+	fusbh200->scanning = true;
+	do {
+		fusbh200->need_rescan = false;
+		if (fusbh200->async_count)
+			scan_async(fusbh200);
+		if (fusbh200->intr_count > 0)
+			scan_intr(fusbh200);
+		if (fusbh200->isoc_count > 0)
+			scan_isoc(fusbh200);
+	} while (fusbh200->need_rescan);
+	fusbh200->scanning = false;
+
+	/* the IO watchdog guards against hardware or driver bugs that
+	 * misplace IRQs, and should let us run completely without IRQs.
+	 * such lossage has been observed on both VT6202 and VT8235.
+	 */
+	turn_on_io_watchdog(fusbh200);
+}
+
+/*
+ * Called when the fusbh200_hcd module is removed.
+ *
+ * Installed as the hc_driver ->stop() method.  Quiesces, silences and
+ * resets the controller, cancels the hrtimer, removes the sysfs and debug
+ * files, and finally releases the iTD cache and the driver's memory.
+ */
+static void fusbh200_stop (struct usb_hcd *hcd)
+{
+	struct fusbh200_hcd		*fusbh200 = hcd_to_fusbh200 (hcd);
+
+	fusbh200_dbg (fusbh200, "stop\n");
+
+	/* no more interrupts ... */
+
+	/* clear the mask of enabled hrtimer events */
+	spin_lock_irq(&fusbh200->lock);
+	fusbh200->enabled_hrtimer_events = 0;
+	spin_unlock_irq(&fusbh200->lock);
+
+	fusbh200_quiesce(fusbh200);
+	fusbh200_silence_controller(fusbh200);
+	fusbh200_reset (fusbh200);
+
+	hrtimer_cancel(&fusbh200->hrtimer);
+	remove_sysfs_files(fusbh200);
+	remove_debug_files (fusbh200);
+
+	/* root hub is shut down separately (first, when possible) */
+	spin_lock_irq (&fusbh200->lock);
+	end_free_itds(fusbh200);
+	spin_unlock_irq (&fusbh200->lock);
+	fusbh200_mem_cleanup (fusbh200);
+
+#ifdef	FUSBH200_STATS
+	/* dump the counters kept in struct fusbh200_stats */
+	fusbh200_dbg(fusbh200, "irq normal %ld err %ld iaa %ld (lost %ld)\n",
+		fusbh200->stats.normal, fusbh200->stats.error, fusbh200->stats.iaa,
+		fusbh200->stats.lost_iaa);
+	fusbh200_dbg (fusbh200, "complete %ld unlink %ld\n",
+		fusbh200->stats.complete, fusbh200->stats.unlink);
+#endif
+
+	dbg_status (fusbh200, "fusbh200_stop completed",
+		    fusbh200_readl(fusbh200, &fusbh200->regs->status));
+}
+
+/* one-time init, only for memory state
+ *
+ * Sets up the lock, the hrtimer, the periodic schedule size and the
+ * driver's memory (fusbh200_mem_init), prepares the dedicated async list
+ * head QH, and pre-computes the command register value that is cached in
+ * fusbh200->command.  Nothing is written to the controller here.
+ * Returns 0, or the negative value returned by fusbh200_mem_init().
+ */
+static int hcd_fusbh200_init(struct usb_hcd *hcd)
+{
+	struct fusbh200_hcd		*fusbh200 = hcd_to_fusbh200(hcd);
+	u32			temp;
+	int			retval;
+	u32			hcc_params;
+	struct fusbh200_qh_hw	*hw;
+
+	spin_lock_init(&fusbh200->lock);
+
+	/*
+	 * keep io watchdog by default, those good HCDs could turn off it later
+	 */
+	fusbh200->need_io_watchdog = 1;
+
+	hrtimer_init(&fusbh200->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+	fusbh200->hrtimer.function = fusbh200_hrtimer_func;
+	fusbh200->next_hrtimer_event = FUSBH200_HRTIMER_NO_EVENT;
+
+	hcc_params = fusbh200_readl(fusbh200, &fusbh200->caps->hcc_params);
+
+	/*
+	 * by default set standard 80% (== 100 usec/uframe) max periodic
+	 * bandwidth as required by USB 2.0
+	 */
+	fusbh200->uframe_periodic_max = 100;
+
+	/*
+	 * hw default: 1K periodic list heads, one per frame.
+	 * periodic_size can shrink by USBCMD update if hcc_params allows.
+	 */
+	fusbh200->periodic_size = DEFAULT_I_TDPS;
+	INIT_LIST_HEAD(&fusbh200->intr_qh_list);
+	INIT_LIST_HEAD(&fusbh200->cached_itd_list);
+
+	if (HCC_PGM_FRAMELISTLEN(hcc_params)) {
+		/* periodic schedule size can be smaller than default */
+		switch (FUSBH200_TUNE_FLS) {
+		case 0: fusbh200->periodic_size = 1024; break;
+		case 1: fusbh200->periodic_size = 512; break;
+		case 2: fusbh200->periodic_size = 256; break;
+		default:	BUG();
+		}
+	}
+	/* allocations need the final periodic_size, so this comes after it */
+	if ((retval = fusbh200_mem_init(fusbh200, GFP_KERNEL)) < 0)
+		return retval;
+
+	/* controllers may cache some of the periodic schedule ... */
+	fusbh200->i_thresh = 2;
+
+	/*
+	 * dedicate a qh for the async ring head, since we couldn't unlink
+	 * a 'real' qh without stopping the async schedule [4.8].  use it
+	 * as the 'reclamation list head' too.
+	 * its dummy is used in hw_alt_next of many tds, to prevent the qh
+	 * from automatically advancing to the next td after short reads.
+	 */
+	fusbh200->async->qh_next.qh = NULL;
+	hw = fusbh200->async->hw;
+	/* the head QH's hardware next pointer refers back to itself */
+	hw->hw_next = QH_NEXT(fusbh200, fusbh200->async->qh_dma);
+	hw->hw_info1 = cpu_to_hc32(fusbh200, QH_HEAD);
+	hw->hw_token = cpu_to_hc32(fusbh200, QTD_STS_HALT);
+	hw->hw_qtd_next = FUSBH200_LIST_END(fusbh200);
+	fusbh200->async->qh_state = QH_STATE_LINKED;
+	hw->hw_alt_next = QTD_NEXT(fusbh200, fusbh200->async->dummy->qtd_dma);
+
+	/* clear interrupt enables, set irq latency */
+	/* an out-of-range log2_irq_thresh is replaced by 0 */
+	if (log2_irq_thresh < 0 || log2_irq_thresh > 6)
+		log2_irq_thresh = 0;
+	temp = 1 << (16 + log2_irq_thresh);
+	if (HCC_CANPARK(hcc_params)) {
+		/* HW default park == 3, on hardware that supports it (like
+		 * NVidia and ALI silicon), maximizes throughput on the async
+		 * schedule by avoiding QH fetches between transfers.
+		 *
+		 * With fast usb storage devices and NForce2, "park" seems to
+		 * make problems:  throughput reduction (!), data errors...
+		 */
+		if (park) {
+			/* park is clamped to at most 3 before being shifted in */
+			park = min(park, (unsigned) 3);
+			temp |= CMD_PARK;
+			temp |= park << 8;
+		}
+		fusbh200_dbg(fusbh200, "park %d\n", park);
+	}
+	if (HCC_PGM_FRAMELISTLEN(hcc_params)) {
+		/* periodic schedule size can be smaller than default */
+		temp &= ~(3 << 2);
+		temp |= (FUSBH200_TUNE_FLS << 2);
+	}
+	/* cached only; fusbh200_run() writes it to the command register */
+	fusbh200->command = temp;
+
+	/* Accept arbitrarily long scatter-gather lists */
+	if (!(hcd->driver->flags & HCD_LOCAL_MEM))
+		hcd->self.sg_tablesize = ~0;
+	return 0;
+}
+
+/* start HC running; it's halted, hcd_fusbh200_init() has been run (once)
+ *
+ * Installed as the hc_driver ->start() method.  Programs the schedule base
+ * registers, sets CMD_RUN, marks the root hub running, enables interrupts
+ * and creates the debug/sysfs files.  Always returns 0.
+ */
+static int fusbh200_run (struct usb_hcd *hcd)
+{
+	struct fusbh200_hcd		*fusbh200 = hcd_to_fusbh200 (hcd);
+	u32			temp;
+	u32			hcc_params;
+
+	hcd->uses_new_polling = 1;
+
+	/* EHCI spec section 4.1 */
+
+	fusbh200_writel(fusbh200, fusbh200->periodic_dma, &fusbh200->regs->frame_list);
+	fusbh200_writel(fusbh200, (u32)fusbh200->async->qh_dma, &fusbh200->regs->async_next);
+
+	/*
+	 * hcc_params controls whether fusbh200->regs->segment must (!!!)
+	 * be used; it constrains QH/ITD/SITD and QTD locations.
+	 * pci_pool consistent memory always uses segment zero.
+	 * streaming mappings for I/O buffers, like pci_map_single(),
+	 * can return segments above 4GB, if the device allows.
+	 *
+	 * NOTE:  the dma mask is visible through dma_supported(), so
+	 * drivers can pass this info along ... like NETIF_F_HIGHDMA,
+	 * Scsi_Host.highmem_io, and so forth.  It's readonly to all
+	 * host side drivers though.
+	 */
+	/* NOTE(review): the value read here is not used later in this function */
+	hcc_params = fusbh200_readl(fusbh200, &fusbh200->caps->hcc_params);
+
+	// Philips, Intel, and maybe others need CMD_RUN before the
+	// root hub will detect new devices (why?); NEC doesn't
+	fusbh200->command &= ~(CMD_IAAD|CMD_PSE|CMD_ASE|CMD_RESET);
+	fusbh200->command |= CMD_RUN;
+	fusbh200_writel(fusbh200, fusbh200->command, &fusbh200->regs->command);
+	dbg_cmd (fusbh200, "init", fusbh200->command);
+
+	/*
+	 * Start, enabling full USB 2.0 functionality ... usb 1.1 devices
+	 * are explicitly handed to companion controller(s), so no TT is
+	 * involved with the root hub.  (Except where one is integrated,
+	 * and there's no companion controller unless maybe for USB OTG.)
+	 *
+	 * Turning on the CF flag will transfer ownership of all ports
+	 * from the companions to the EHCI controller.  If any of the
+	 * companions are in the middle of a port reset at the time, it
+	 * could cause trouble.  Write-locking ehci_cf_port_reset_rwsem
+	 * guarantees that no resets are in progress.  After we set CF,
+	 * a short delay lets the hardware catch up; new resets shouldn't
+	 * be started before the port switching actions could complete.
+	 */
+	/* NOTE(review): no CF-flag write is visible in this function; the
+	 * comment above may be inherited text -- confirm.
+	 */
+	down_write(&ehci_cf_port_reset_rwsem);
+	fusbh200->rh_state = FUSBH200_RH_RUNNING;
+	fusbh200_readl(fusbh200, &fusbh200->regs->command);	/* unblock posted writes */
+	msleep(5);
+	up_write(&ehci_cf_port_reset_rwsem);
+	fusbh200->last_periodic_enable = ktime_get_real();
+
+	temp = HC_VERSION(fusbh200, fusbh200_readl(fusbh200, &fusbh200->caps->hc_capbase));
+	fusbh200_info (fusbh200,
+		"USB %x.%x started, EHCI %x.%02x\n",
+		((fusbh200->sbrn & 0xf0)>>4), (fusbh200->sbrn & 0x0f),
+		temp >> 8, temp & 0xff);
+
+	fusbh200_writel(fusbh200, INTR_MASK,
+		    &fusbh200->regs->intr_enable); /* Turn On Interrupts */
+
+	/* GRR this is run-once init(), being done every time the HC starts.
+	 * So long as they're part of class devices, we can't do it init()
+	 * since the class device isn't created that early.
+	 */
+	create_debug_files(fusbh200);
+	/* the int result of create_sysfs_files() is not checked here */
+	create_sysfs_files(fusbh200);
+
+	return 0;
+}
+
+/*
+ * Locate the operational registers, cache read-only capability data,
+ * initialise the driver's memory state, then halt and reset the
+ * controller.  Returns 0 or the first non-zero result from
+ * hcd_fusbh200_init() / fusbh200_halt().
+ */
+static int fusbh200_setup(struct usb_hcd *hcd)
+{
+	struct fusbh200_hcd *fusbh200 = hcd_to_fusbh200(hcd);
+	int rc;
+
+	/* operational registers follow the capability registers */
+	fusbh200->regs = (void __iomem *)fusbh200->caps +
+	    HC_LENGTH(fusbh200, fusbh200_readl(fusbh200, &fusbh200->caps->hc_capbase));
+	dbg_hcs_params(fusbh200, "reset");
+	dbg_hcc_params(fusbh200, "reset");
+
+	/* read-only data is cached to keep chip reads to a minimum */
+	fusbh200->hcs_params = fusbh200_readl(fusbh200, &fusbh200->caps->hcs_params);
+
+	fusbh200->sbrn = HCD_USB2;
+
+	/* memory/data-structure state first, then the hardware */
+	rc = hcd_fusbh200_init(hcd);
+	if (!rc)
+		rc = fusbh200_halt(fusbh200);
+	if (!rc)
+		fusbh200_reset(fusbh200);
+
+	return rc;
+}
+
+/*-------------------------------------------------------------------------*/
+
+/*
+ * Interrupt handler, installed as the hc_driver ->irq() method.
+ *
+ * Reads and acknowledges the status register under fusbh200->lock, then
+ * handles, in order: transfer completion/error (deferred to
+ * fusbh200_work()), async-advance (IAA), port change (PCD) and fatal
+ * errors.  Returns IRQ_NONE when no enabled status bit is set or the root
+ * hub is halted, IRQ_HANDLED otherwise.
+ */
+static irqreturn_t fusbh200_irq (struct usb_hcd *hcd)
+{
+	struct fusbh200_hcd		*fusbh200 = hcd_to_fusbh200 (hcd);
+	u32			status, masked_status, pcd_status = 0, cmd;
+	int			bh;
+
+	spin_lock (&fusbh200->lock);
+
+	status = fusbh200_readl(fusbh200, &fusbh200->regs->status);
+
+	/* e.g. cardbus physical eject */
+	if (status == ~(u32) 0) {
+		fusbh200_dbg (fusbh200, "device removed\n");
+		/* jumps into the fatal-error block below; bh and cmd are not
+		 * read on that path (bh is reset to 0 there)
+		 */
+		goto dead;
+	}
+
+	/*
+	 * We don't use STS_FLR, but some controllers don't like it to
+	 * remain on, so mask it out along with the other status bits.
+	 */
+	masked_status = status & (INTR_MASK | STS_FLR);
+
+	/* Shared IRQ? */
+	if (!masked_status || unlikely(fusbh200->rh_state == FUSBH200_RH_HALTED)) {
+		spin_unlock(&fusbh200->lock);
+		return IRQ_NONE;
+	}
+
+	/* clear (just) interrupts */
+	fusbh200_writel(fusbh200, masked_status, &fusbh200->regs->status);
+	cmd = fusbh200_readl(fusbh200, &fusbh200->regs->command);
+	bh = 0;
+
+#ifdef	VERBOSE_DEBUG
+	/* unrequested/ignored: Frame List Rollover */
+	dbg_status (fusbh200, "irq", status);
+#endif
+
+	/* INT, ERR, and IAA interrupt rates can be throttled */
+
+	/* normal [4.15.1.2] or error [4.15.1.1] completion */
+	if (likely ((status & (STS_INT|STS_ERR)) != 0)) {
+		if (likely ((status & STS_ERR) == 0))
+			COUNT (fusbh200->stats.normal);
+		else
+			COUNT (fusbh200->stats.error);
+		/* schedule scanning happens at the end, via fusbh200_work() */
+		bh = 1;
+	}
+
+	/* complete the unlinking of some qh [4.15.2.3] */
+	if (status & STS_IAA) {
+
+		/* Turn off the IAA watchdog */
+		fusbh200->enabled_hrtimer_events &= ~BIT(FUSBH200_HRTIMER_IAA_WATCHDOG);
+
+		/*
+		 * Mild optimization: Allow another IAAD to reset the
+		 * hrtimer, if one occurs before the next expiration.
+		 * In theory we could always cancel the hrtimer, but
+		 * tests show that about half the time it will be reset
+		 * for some other event anyway.
+		 */
+		if (fusbh200->next_hrtimer_event == FUSBH200_HRTIMER_IAA_WATCHDOG)
+			++fusbh200->next_hrtimer_event;
+
+		/* guard against (alleged) silicon errata */
+		if (cmd & CMD_IAAD)
+			fusbh200_dbg(fusbh200, "IAA with IAAD still set?\n");
+		if (fusbh200->async_iaa) {
+			COUNT(fusbh200->stats.iaa);
+			end_unlink_async(fusbh200);
+		} else
+			fusbh200_dbg(fusbh200, "IAA with nothing unlinked?\n");
+	}
+
+	/* remote wakeup [4.3.1] */
+	if (status & STS_PCD) {
+		int pstatus;
+		u32 __iomem *status_reg = &fusbh200->regs->port_status;
+
+		/* kick root hub later */
+		pcd_status = status;
+
+		/* resume root hub? */
+		if (fusbh200->rh_state == FUSBH200_RH_SUSPENDED)
+			usb_hcd_resume_root_hub(hcd);
+
+		pstatus = fusbh200_readl(fusbh200, status_reg);
+
+		/* only for a suspended, enabled port that is resuming (or no
+		 * longer suspended) and has no resume timeout pending yet
+		 */
+		if (test_bit(0, &fusbh200->suspended_ports) &&
+				((pstatus & PORT_RESUME) ||
+					!(pstatus & PORT_SUSPEND)) &&
+				(pstatus & PORT_PE) &&
+				fusbh200->reset_done[0] == 0) {
+
+			/* start 20 msec resume signaling from this port,
+			 * and make khubd collect PORT_STAT_C_SUSPEND to
+			 * stop that signaling.  Use 5 ms extra for safety,
+			 * like usb_port_resume() does.
+			 */
+			fusbh200->reset_done[0] = jiffies + msecs_to_jiffies(25);
+			set_bit(0, &fusbh200->resuming_ports);
+			fusbh200_dbg (fusbh200, "port 1 remote wakeup\n");
+			mod_timer(&hcd->rh_timer, fusbh200->reset_done[0]);
+		}
+	}
+
+	/* PCI errors [4.15.2.4] */
+	if (unlikely ((status & STS_FATAL) != 0)) {
+		fusbh200_err(fusbh200, "fatal error\n");
+		dbg_cmd(fusbh200, "fatal", cmd);
+		dbg_status(fusbh200, "fatal", status);
+dead:
+		usb_hc_died(hcd);
+
+		/* Don't let the controller do anything more */
+		fusbh200->shutdown = true;
+		fusbh200->rh_state = FUSBH200_RH_STOPPING;
+		fusbh200->command &= ~(CMD_RUN | CMD_ASE | CMD_PSE);
+		fusbh200_writel(fusbh200, fusbh200->command, &fusbh200->regs->command);
+		fusbh200_writel(fusbh200, 0, &fusbh200->regs->intr_enable);
+		fusbh200_handle_controller_death(fusbh200);
+
+		/* Handle completions when the controller stops */
+		bh = 0;
+	}
+
+	if (bh)
+		fusbh200_work (fusbh200);
+	spin_unlock (&fusbh200->lock);
+	/* root hub polling is done after the lock has been released */
+	if (pcd_status)
+		usb_hcd_poll_rh_status(hcd);
+	return IRQ_HANDLED;
+}
+
+/*-------------------------------------------------------------------------*/
+
+/*
+ * non-error returns are a promise to giveback() the urb later
+ * we drop ownership so next owner (or urb unlink) can get it
+ *
+ * urb + dev is in hcd.self.controller.urb_list
+ * we're queueing TDs onto software and hardware lists
+ *
+ * hcd-specific init for hcpriv hasn't been done yet
+ *
+ * NOTE:  control, bulk, and interrupt share the same code to append TDs
+ * to a (possibly active) QH, and the same QH scanning code.
+ */
+static int fusbh200_urb_enqueue (
+	struct usb_hcd	*hcd,
+	struct urb	*urb,
+	gfp_t		mem_flags
+) {
+	struct fusbh200_hcd		*fusbh200 = hcd_to_fusbh200 (hcd);
+	struct list_head	tds;
+	const int		pipetype = usb_pipetype (urb->pipe);
+
+	/* isochronous transfers use iTDs and never build a qtd list */
+	if (pipetype == PIPE_ISOCHRONOUS)
+		return itd_submit (fusbh200, urb, mem_flags);
+
+	/* qh_completions() code doesn't handle all the fault cases
+	 * in multi-TD control transfers.  Even 1KB is rare anyway.
+	 */
+	if (pipetype == PIPE_CONTROL &&
+			urb->transfer_buffer_length > (16 * 1024))
+		return -EMSGSIZE;
+
+	/* control, bulk and interrupt all build their TDs the same way */
+	INIT_LIST_HEAD (&tds);
+	if (!qh_urb_transaction (fusbh200, urb, &tds, mem_flags))
+		return -ENOMEM;
+
+	if (pipetype == PIPE_INTERRUPT)
+		return intr_submit(fusbh200, urb, &tds, mem_flags);
+
+	/* control and bulk go onto the async schedule */
+	return submit_async(fusbh200, urb, &tds, mem_flags);
+}
+
+/* remove from hardware lists
+ * completions normally happen asynchronously
+ */
+
+/*
+ * Start unlinking an urb.  Runs under fusbh200->lock and returns the
+ * result of usb_hcd_check_unlink_urb().
+ */
+static int fusbh200_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status)
+{
+	struct fusbh200_hcd		*fusbh200 = hcd_to_fusbh200 (hcd);
+	const int		pipetype = usb_pipetype (urb->pipe);
+	struct fusbh200_qh		*qh;
+	unsigned long		irqflags;
+	int			rc;
+
+	spin_lock_irqsave (&fusbh200->lock, irqflags);
+	rc = usb_hcd_check_unlink_urb(hcd, urb, status);
+	if (rc)
+		goto unlock;
+
+	/* iso: nothing is done here, the itds are left for their
+	 * next completion (completion irqs can wait up to 1024 msec)
+	 */
+	if (pipetype == PIPE_ISOCHRONOUS)
+		goto unlock;
+
+	/* control, bulk and interrupt urbs hang off a qh */
+	qh = (struct fusbh200_qh *) urb->hcpriv;
+	if (!qh)
+		goto unlock;
+
+	switch (qh->qh_state) {
+	case QH_STATE_LINKED:
+	case QH_STATE_COMPLETING:
+		if (pipetype == PIPE_INTERRUPT)
+			start_unlink_intr(fusbh200, qh);
+		else
+			start_unlink_async(fusbh200, qh);
+		break;
+	case QH_STATE_IDLE:
+		/* QH might be waiting for a Clear-TT-Buffer */
+		qh_completions(fusbh200, qh);
+		break;
+	default:
+		/* async: an unlink has already been started, nothing to do.
+		 * interrupt: no other state is expected, so report it.
+		 */
+		if (pipetype == PIPE_INTERRUPT)
+			fusbh200_dbg (fusbh200, "bogus qh %p state %d\n",
+					qh, qh->qh_state);
+		break;
+	}
+unlock:
+	spin_unlock_irqrestore (&fusbh200->lock, irqflags);
+	return rc;
+}
+
+/*-------------------------------------------------------------------------*/
+
+// bulk qh holds the data toggle
+
+/*
+ * Release the per-endpoint state kept in ep->hcpriv.
+ *
+ * Loops, sleeping one tick at a time with the lock dropped, until the
+ * endpoint's iso stream has no queued TDs or its QH is idle; then frees
+ * the stream / destroys the QH and clears ep->hcpriv.  A QH that is idle
+ * but still has TDs queued is reported and intentionally leaked.
+ */
+static void
+fusbh200_endpoint_disable (struct usb_hcd *hcd, struct usb_host_endpoint *ep)
+{
+	struct fusbh200_hcd		*fusbh200 = hcd_to_fusbh200 (hcd);
+	unsigned long		flags;
+	struct fusbh200_qh		*qh, *tmp;
+
+	/* ASSERT:  any requests/urbs are being unlinked */
+	/* ASSERT:  nobody can be submitting urbs for this any more */
+
+rescan:
+	spin_lock_irqsave (&fusbh200->lock, flags);
+	qh = ep->hcpriv;
+	if (!qh)
+		goto done;
+
+	/* endpoints can be iso streams.  for now, we don't
+	 * accelerate iso completions ... so spin a while.
+	 */
+	/* NOTE(review): hcpriv is read as a qh first; a NULL ->hw is taken to
+	 * mean it is really an iso stream -- presumably the two structures
+	 * are laid out to make this test valid, confirm in fusbh200.h
+	 */
+	if (qh->hw == NULL) {
+		struct fusbh200_iso_stream	*stream = ep->hcpriv;
+
+		if (!list_empty(&stream->td_list))
+			goto idle_timeout;
+
+		/* BUG_ON(!list_empty(&stream->free_list)); */
+		kfree(stream);
+		goto done;
+	}
+
+	/* below RUNNING the QH is simply treated as idle */
+	if (fusbh200->rh_state < FUSBH200_RH_RUNNING)
+		qh->qh_state = QH_STATE_IDLE;
+	switch (qh->qh_state) {
+	case QH_STATE_LINKED:
+	case QH_STATE_COMPLETING:
+		/* search the async list for this qh */
+		for (tmp = fusbh200->async->qh_next.qh;
+				tmp && tmp != qh;
+				tmp = tmp->qh_next.qh)
+			continue;
+		/* periodic qh self-unlinks on empty, and a COMPLETING qh
+		 * may already be unlinked.
+		 */
+		if (tmp)
+			start_unlink_async(fusbh200, qh);
+		/* FALL THROUGH */
+	case QH_STATE_UNLINK:		/* wait for hw to finish? */
+	case QH_STATE_UNLINK_WAIT:
+idle_timeout:
+		/* drop the lock, sleep one tick, then re-examine from the top */
+		spin_unlock_irqrestore (&fusbh200->lock, flags);
+		schedule_timeout_uninterruptible(1);
+		goto rescan;
+	case QH_STATE_IDLE:		/* fully unlinked */
+		if (qh->clearing_tt)
+			goto idle_timeout;
+		if (list_empty (&qh->qtd_list)) {
+			qh_destroy(fusbh200, qh);
+			break;
+		}
+		/* else FALL THROUGH */
+	default:
+		/* caller was supposed to have unlinked any requests;
+		 * that's not our job.  just leak this memory.
+		 */
+		fusbh200_err (fusbh200, "qh %p (#%02x) state %d%s\n",
+			qh, ep->desc.bEndpointAddress, qh->qh_state,
+			list_empty (&qh->qtd_list) ? "" : "(has tds)");
+		break;
+	}
+ done:
+	ep->hcpriv = NULL;
+	spin_unlock_irqrestore (&fusbh200->lock, flags);
+}
+
+/*
+ * Reset the data toggle of a bulk or interrupt endpoint.  Other endpoint
+ * types are ignored.
+ */
+static void
+fusbh200_endpoint_reset(struct usb_hcd *hcd, struct usb_host_endpoint *ep)
+{
+	struct fusbh200_hcd		*fusbh200 = hcd_to_fusbh200(hcd);
+	struct fusbh200_qh		*qh;
+	int			xfer = usb_endpoint_type(&ep->desc);
+	int			num = usb_endpoint_num(&ep->desc);
+	int			out = usb_endpoint_dir_out(&ep->desc);
+	unsigned long		irqflags;
+
+	if (xfer != USB_ENDPOINT_XFER_BULK && xfer != USB_ENDPOINT_XFER_INT)
+		return;
+
+	spin_lock_irqsave(&fusbh200->lock, irqflags);
+
+	/* For Bulk and Interrupt endpoints we maintain the toggle state
+	 * in the hardware; the toggle bits in udev aren't used at all.
+	 * When an endpoint is reset by usb_clear_halt() we must reset
+	 * the toggle bit in the QH.
+	 */
+	qh = ep->hcpriv;
+	if (!qh)
+		goto unlock;
+
+	usb_settoggle(qh->dev, num, out, 0);
+	if (!list_empty(&qh->qtd_list)) {
+		WARN_ONCE(1, "clear_halt for a busy endpoint\n");
+		goto unlock;
+	}
+
+	switch (qh->qh_state) {
+	case QH_STATE_LINKED:
+	case QH_STATE_COMPLETING:
+		/* The toggle value in the QH can't be updated
+		 * while the QH is active.  Unlink it now;
+		 * re-linking will call qh_refresh().
+		 */
+		if (xfer == USB_ENDPOINT_XFER_BULK)
+			start_unlink_async(fusbh200, qh);
+		else
+			start_unlink_intr(fusbh200, qh);
+		break;
+	default:
+		break;
+	}
+unlock:
+	spin_unlock_irqrestore(&fusbh200->lock, irqflags);
+}
+
+/* Current frame number, reduced modulo the periodic schedule size. */
+static int fusbh200_get_frame (struct usb_hcd *hcd)
+{
+	struct fusbh200_hcd		*fusbh200 = hcd_to_fusbh200 (hcd);
+	/* the frame index counts microframes, eight per frame */
+	unsigned		frame = fusbh200_read_frame_index(fusbh200) >> 3;
+
+	return frame % fusbh200->periodic_size;
+}
+
+/*-------------------------------------------------------------------------*/
+
+/*
+ * The EHCI in ChipIdea HDRC cannot be a separate module or device,
+ * because its registers (and irq) are shared between host/gadget/otg
+ * functions  and in order to facilitate role switching we cannot
+ * give the fusbh200 driver exclusive access to those.
+ */
+/* module metadata */
+MODULE_DESCRIPTION(DRIVER_DESC);
+MODULE_AUTHOR (DRIVER_AUTHOR);
+MODULE_LICENSE ("GPL");
+
+/*
+ * Method table handed to usb_create_hcd() by fusbh200_hcd_probe().
+ * It wires the handlers defined in this file into the USB core.
+ */
+static const struct hc_driver fusbh200_fusbh200_hc_driver = {
+	.description 		= hcd_name,
+	.product_desc 		= "Faraday USB2.0 Host Controller",
+	.hcd_priv_size 		= sizeof(struct fusbh200_hcd),
+
+	/*
+	 * generic hardware linkage
+	 */
+	.irq 			= fusbh200_irq,
+	.flags 			= HCD_MEMORY | HCD_USB2,
+
+	/*
+	 * basic lifecycle operations
+	 */
+	/* NOTE(review): fusbh200_hcd_probe() also reaches hcd_fusbh200_init()
+	 * through fusbh200_setup(); confirm it is not run twice per device.
+	 */
+	.reset 			= hcd_fusbh200_init,
+	.start 			= fusbh200_run,
+	.stop 			= fusbh200_stop,
+	.shutdown 		= fusbh200_shutdown,
+
+	/*
+	 * managing i/o requests and associated device resources
+	 */
+	.urb_enqueue 		= fusbh200_urb_enqueue,
+	.urb_dequeue 		= fusbh200_urb_dequeue,
+	.endpoint_disable 	= fusbh200_endpoint_disable,
+	.endpoint_reset 	= fusbh200_endpoint_reset,
+
+	/*
+	 * scheduling support
+	 */
+	.get_frame_number 	= fusbh200_get_frame,
+
+	/*
+	 * root hub support
+	 */
+	.hub_status_data 	= fusbh200_hub_status_data,
+	.hub_control 		= fusbh200_hub_control,
+	.bus_suspend 		= fusbh200_bus_suspend,
+	.bus_resume 		= fusbh200_bus_resume,
+
+	.relinquish_port 	= fusbh200_relinquish_port,
+	.port_handed_over 	= fusbh200_port_handed_over,
+
+	.clear_tt_buffer_complete = fusbh200_clear_tt_buffer_complete,
+};
+
+/*
+ * Program the BMCSR and BMIER registers: set BMCSR_INT_POLARITY, clear
+ * BMCSR_VBUS_OFF, and set the BMIER_OVC_EN and BMIER_VBUS_ERR_EN bits.
+ */
+static void fusbh200_init(struct fusbh200_hcd *fusbh200)
+{
+	u32 val;
+
+	val = fusbh200_readl(fusbh200, &fusbh200->regs->bmcsr);
+	val = (val | BMCSR_INT_POLARITY) & ~BMCSR_VBUS_OFF;
+	fusbh200_writel(fusbh200, val, &fusbh200->regs->bmcsr);
+
+	val = fusbh200_readl(fusbh200, &fusbh200->regs->bmier);
+	val |= BMIER_OVC_EN | BMIER_VBUS_ERR_EN;
+	fusbh200_writel(fusbh200, val, &fusbh200->regs->bmier);
+}
+
+/**
+ * fusbh200_hcd_probe - initialize faraday FUSBH200 HCDs
+ * @pdev: platform device supplying the IRQ and register resources
+ *
+ * Allocates basic resources for this USB host controller, and
+ * then invokes the start() method for the HCD associated with it
+ * through the hotplug entry's driver_data.
+ *
+ * Returns 0 on success or a negative errno; on failure every resource
+ * acquired so far is released again.
+ */
+static int fusbh200_hcd_probe(struct platform_device *pdev)
+{
+	struct device			*dev = &pdev->dev;
+	struct usb_hcd 			*hcd;
+	struct resource			*res;
+	int 				irq;
+	int 				retval = -ENODEV;
+	struct fusbh200_hcd 		*fusbh200;
+
+	if (usb_disabled())
+		return -ENODEV;
+
+	pdev->dev.power.power_state = PMSG_ON;
+
+	res = platform_get_resource(pdev, IORESOURCE_IRQ, 0);
+	if (!res) {
+		dev_err(dev,
+			"Found HC with no IRQ. Check %s setup!\n",
+			dev_name(dev));
+		return -ENODEV;
+	}
+
+	irq = res->start;
+
+	hcd = usb_create_hcd(&fusbh200_fusbh200_hc_driver, dev,
+			dev_name(dev));
+	if (!hcd) {
+		/* set the code first so the message reports the real error */
+		retval = -ENOMEM;
+		dev_err(dev, "failed to create hcd with err %d\n", retval);
+		goto fail_create_hcd;
+	}
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (!res) {
+		dev_err(dev,
+			"Found HC with no register addr. Check %s setup!\n",
+			dev_name(dev));
+		retval = -ENODEV;
+		goto fail_request_resource;
+	}
+
+	hcd->rsrc_start = res->start;
+	hcd->rsrc_len = resource_size(res);
+	hcd->has_tt = 1;
+
+	if (!request_mem_region(hcd->rsrc_start, hcd->rsrc_len,
+				fusbh200_fusbh200_hc_driver.description)) {
+		dev_dbg(dev, "controller already in use\n");
+		retval = -EBUSY;
+		goto fail_request_resource;
+	}
+
+	res = platform_get_resource(pdev, IORESOURCE_IO, 0);
+	if (!res) {
+		dev_err(dev,
+			"Found HC with no register addr. Check %s setup!\n",
+			dev_name(dev));
+		retval = -ENODEV;
+		/* the memory region is already claimed: release it on exit */
+		goto fail_ioremap;
+	}
+
+	hcd->regs = ioremap_nocache(res->start, resource_size(res));
+	if (hcd->regs == NULL) {
+		dev_dbg(dev, "error mapping memory\n");
+		retval = -EFAULT;
+		goto fail_ioremap;
+	}
+
+	fusbh200 = hcd_to_fusbh200(hcd);
+
+	fusbh200->caps = hcd->regs;
+
+	retval = fusbh200_setup(hcd);
+	if (retval)
+		goto fail_add_hcd;
+
+	fusbh200_init(fusbh200);
+
+	retval = usb_add_hcd(hcd, irq, IRQF_SHARED);
+	if (retval) {
+		dev_err(dev, "failed to add hcd with err %d\n", retval);
+		goto fail_add_hcd;
+	}
+
+	return retval;
+
+	/* unwind in reverse order of acquisition */
+fail_add_hcd:
+	iounmap(hcd->regs);
+fail_ioremap:
+	release_mem_region(hcd->rsrc_start, hcd->rsrc_len);
+fail_request_resource:
+	usb_put_hcd(hcd);
+fail_create_hcd:
+	dev_err(dev, "init %s fail, %d\n", dev_name(dev), retval);
+	return retval;
+}
+
+/**
+ * fusbh200_hcd_remove - shutdown processing for FUSBH200 HCDs
+ * @pdev: platform device whose USB Host Controller is being removed
+ *
+ * Reverses the effect of fusbh200_hcd_probe(): removes the HCD, unmaps
+ * its registers, releases the memory region and drops the HCD reference.
+ * It is always called from a thread context, normally "rmmod", "apmd",
+ * or something similar.  Always returns 0.
+ */
+static int fusbh200_hcd_remove(struct platform_device *pdev)
+{
+	struct usb_hcd *hcd = dev_get_drvdata(&pdev->dev);
+
+	/* nothing to undo when no HCD is attached to the device */
+	if (hcd) {
+		usb_remove_hcd(hcd);
+		iounmap(hcd->regs);
+		release_mem_region(hcd->rsrc_start, hcd->rsrc_len);
+		usb_put_hcd(hcd);
+	}
+
+	return 0;
+}
+
+/* Platform driver named "fusbh200"; registered in fusbh200_hcd_init() */
+static struct platform_driver fusbh200_hcd_fusbh200_driver = {
+	.driver = {
+		.name   = "fusbh200",
+	},
+	.probe  = fusbh200_hcd_probe,
+	.remove = fusbh200_hcd_remove,
+};
+
+/*
+ * Module entry point: marks the EHCI-class HCD as loaded, creates the
+ * debugfs directory (DEBUG builds only) and registers the platform driver.
+ * Returns the result of platform_driver_register(), -ENODEV when USB is
+ * disabled, or -ENOENT when the debugfs directory cannot be created.
+ */
+static int __init fusbh200_hcd_init(void)
+{
+	int retval = 0;
+
+	if (usb_disabled())
+		return -ENODEV;
+
+	printk(KERN_INFO "%s: " DRIVER_DESC "\n", hcd_name);
+	set_bit(USB_EHCI_LOADED, &usb_hcds_loaded);
+	if (test_bit(USB_UHCI_LOADED, &usb_hcds_loaded) ||
+			test_bit(USB_OHCI_LOADED, &usb_hcds_loaded))
+		printk(KERN_WARNING "Warning! fusbh200_hcd should always be loaded"
+				" before uhci_hcd and ohci_hcd, not after\n");
+
+	pr_debug("%s: block sizes: qh %Zd qtd %Zd itd %Zd\n",
+		 hcd_name,
+		 sizeof(struct fusbh200_qh), sizeof(struct fusbh200_qtd),
+		 sizeof(struct fusbh200_itd));
+
+#ifdef DEBUG
+	fusbh200_debug_root = debugfs_create_dir("fusbh200", usb_debug_root);
+	if (!fusbh200_debug_root) {
+		retval = -ENOENT;
+		goto err_debug;
+	}
+#endif
+
+	retval = platform_driver_register(&fusbh200_hcd_fusbh200_driver);
+	if (retval >= 0)
+		return retval;
+
+	/* registration failed: undo what was set up above */
+#ifdef DEBUG
+	debugfs_remove(fusbh200_debug_root);
+	fusbh200_debug_root = NULL;
+err_debug:
+#endif
+	clear_bit(USB_EHCI_LOADED, &usb_hcds_loaded);
+	return retval;
+}
+module_init(fusbh200_hcd_init);
+
+/*
+ * Module exit: undoes fusbh200_hcd_init() in reverse order -- unregister
+ * the platform driver, remove the debugfs directory (DEBUG builds only)
+ * and clear the USB_EHCI_LOADED bit.
+ */
+static void __exit fusbh200_hcd_cleanup(void)
+{
+	platform_driver_unregister(&fusbh200_hcd_fusbh200_driver);
+#ifdef DEBUG
+	debugfs_remove(fusbh200_debug_root);
+#endif
+	clear_bit(USB_EHCI_LOADED, &usb_hcds_loaded);
+}
+module_exit(fusbh200_hcd_cleanup);

diff --git a/drivers/usb/host/fusbh200.h b/drivers/usb/host/fusbh200.h
new file mode 100644
index 0000000..797c9e8
--- /dev/null
+++ b/drivers/usb/host/fusbh200.h

@@ -0,0 +1,743 @@
+#ifndef __LINUX_FUSBH200_H
+#define __LINUX_FUSBH200_H
+
+/* definitions used for the EHCI driver */
+
+/*
+ * __hc32 and __hc16 are "Host Controller" types, they may be equivalent to
+ * __leXX (normally) or __beXX (given FUSBH200_BIG_ENDIAN_DESC), depending on
+ * the host controller implementation.
+ *
+ * To facilitate the strongest possible byte-order checking from "sparse"
+ * and so on, we use __leXX unless that's not practical.
+ */
+#define __hc32	__le32
+#define __hc16	__le16
+
+/* counters for tuning/monitoring; the per-controller instance exists only when FUSBH200_STATS is defined */
+struct fusbh200_stats {
+	/* irq usage */
+	unsigned long		normal;
+	unsigned long		error;
+	unsigned long		iaa;
+	unsigned long		lost_iaa;
+
+	/* termination of urbs from core */
+	unsigned long		complete;
+	unsigned long		unlink;
+};
+
+/* fusbh200_hcd->lock guards shared data against other CPUs:
+ *   fusbh200_hcd:	async, unlink, periodic (and shadow), ...
+ *   usb_host_endpoint: hcpriv
+ *   fusbh200_qh:	qh_next, qtd_list
+ *   fusbh200_qtd:	qtd_list
+ *
+ * Also, hold this lock when talking to HC registers or
+ * when updating hw_* fields in shared qh/qtd/... structures.
+ */
+
+#define	FUSBH200_MAX_ROOT_PORTS	1		/* see HCS_N_PORTS */
+
+/*
+ * fusbh200_rh_state values of FUSBH200_RH_RUNNING or above mean that the
+ * controller may be doing DMA.  Lower values mean there's no DMA.
+ */
+enum fusbh200_rh_state {
+	FUSBH200_RH_HALTED,		/* below RUNNING: no DMA */
+	FUSBH200_RH_SUSPENDED,		/* below RUNNING: no DMA */
+	FUSBH200_RH_RUNNING,		/* DMA may be in progress */
+	FUSBH200_RH_STOPPING		/* above RUNNING: DMA may still be in progress */
+};
+
+/*
+ * Timer events, ordered by increasing delay length.
+ * Always update event_delays_ns[] and event_handlers[] in parallel with this
+ * list.  NOTE(review): text said "defined in ehci-timer.c" - confirm for this driver.
+ */
+enum fusbh200_hrtimer_event {
+	FUSBH200_HRTIMER_POLL_ASS,		/* Poll for async schedule off */
+	FUSBH200_HRTIMER_POLL_PSS,		/* Poll for periodic schedule off */
+	FUSBH200_HRTIMER_POLL_DEAD,		/* Wait for dead controller to stop */
+	FUSBH200_HRTIMER_UNLINK_INTR,	/* Wait for interrupt QH unlink */
+	FUSBH200_HRTIMER_FREE_ITDS,		/* Wait for unused iTDs and siTDs */
+	FUSBH200_HRTIMER_ASYNC_UNLINKS,	/* Unlink empty async QHs */
+	FUSBH200_HRTIMER_IAA_WATCHDOG,	/* Handle lost IAA interrupts */
+	FUSBH200_HRTIMER_DISABLE_PERIODIC,	/* Wait to disable periodic sched */
+	FUSBH200_HRTIMER_DISABLE_ASYNC,	/* Wait to disable async sched */
+	FUSBH200_HRTIMER_IO_WATCHDOG,	/* Check for missing IRQs */
+	FUSBH200_HRTIMER_NUM_EVENTS		/* Must come last; also sizes hr_timeouts[] */
+};
+#define FUSBH200_HRTIMER_NO_EVENT	99
+
+struct fusbh200_hcd {			/* one per controller */
+	/* timing support */
+	enum fusbh200_hrtimer_event	next_hrtimer_event;
+	unsigned		enabled_hrtimer_events;
+	ktime_t			hr_timeouts[FUSBH200_HRTIMER_NUM_EVENTS];
+	struct hrtimer		hrtimer;
+
+	int			PSS_poll_count;
+	int			ASS_poll_count;
+	int			died_poll_count;
+
+	/* memory-mapped register blocks (glue to bus and HCD framework) */
+	struct fusbh200_caps __iomem *caps;
+	struct fusbh200_regs __iomem *regs;
+	struct fusbh200_dbg_port __iomem *debug;
+
+	__u32			hcs_params;	/* cached register copy */
+	spinlock_t		lock;
+	enum fusbh200_rh_state	rh_state;
+
+	/* general schedule support */
+	bool			scanning:1;
+	bool			need_rescan:1;
+	bool			intr_unlinking:1;
+	bool			async_unlinking:1;
+	bool			shutdown:1;
+	struct fusbh200_qh		*qh_scan_next;
+
+	/* async schedule support */
+	struct fusbh200_qh		*async;
+	struct fusbh200_qh		*dummy;		/* For AMD quirk use */
+	struct fusbh200_qh		*async_unlink;
+	struct fusbh200_qh		*async_unlink_last;
+	struct fusbh200_qh		*async_iaa;
+	unsigned		async_unlink_cycle;
+	unsigned		async_count;	/* async activity count */
+
+	/* periodic schedule support */
+#define	DEFAULT_I_TDPS		1024		/* some HCs can do less */
+	unsigned		periodic_size;
+	__hc32			*periodic;	/* hw periodic table */
+	dma_addr_t		periodic_dma;
+	struct list_head	intr_qh_list;
+	unsigned		i_thresh;	/* uframes HC might cache */
+
+	union fusbh200_shadow	*pshadow;	/* mirror hw periodic table */
+	struct fusbh200_qh		*intr_unlink;
+	struct fusbh200_qh		*intr_unlink_last;
+	unsigned		intr_unlink_cycle;
+	unsigned		now_frame;	/* frame from HC hardware */
+	unsigned		next_frame;	/* scan periodic, start here */
+	unsigned		intr_count;	/* intr activity count */
+	unsigned		isoc_count;	/* isoc activity count */
+	unsigned		periodic_count;	/* periodic activity count */
+	unsigned		uframe_periodic_max; /* max periodic time per uframe */
+
+
+	/* list of itds completed while now_frame was still active */
+	struct list_head	cached_itd_list;
+	struct fusbh200_itd	*last_itd_to_free;
+
+	/* per root hub port */
+	unsigned long		reset_done [FUSBH200_MAX_ROOT_PORTS];
+
+	/* bit vectors (one bit per port) */
+	unsigned long		bus_suspended;		/* which ports were
+			already suspended at the start of a bus suspend */
+	unsigned long		companion_ports;	/* which ports are
+			dedicated to the companion controller */
+	unsigned long		owned_ports;		/* which ports are
+			owned by the companion during a bus suspend */
+	unsigned long		port_c_suspend;		/* which ports have
+			the change-suspend feature turned on */
+	unsigned long		suspended_ports;	/* which ports are
+			suspended */
+	unsigned long		resuming_ports;		/* which ports have
+			started to resume */
+
+	/* per-HC memory pools (could be per-bus, but ...) */
+	struct dma_pool		*qh_pool;	/* qh per active urb */
+	struct dma_pool		*qtd_pool;	/* one or more per qh */
+	struct dma_pool		*itd_pool;	/* itd per iso urb */
+
+	unsigned		random_frame;
+	unsigned long		next_statechange;
+	ktime_t			last_periodic_enable;
+	u32			command;
+
+	/* SILICON QUIRKS */
+	unsigned		need_io_watchdog:1;
+	unsigned		fs_i_thresh:1;	/* Intel iso scheduling */
+
+	u8			sbrn;		/* packed release number */
+
+	/* irq statistics; COUNT() expands to a no-op unless FUSBH200_STATS is defined */
+#ifdef FUSBH200_STATS
+	struct fusbh200_stats	stats;
+#	define COUNT(x) do { (x)++; } while (0)
+#else
+#	define COUNT(x) do {} while (0)
+#endif
+
+	/* debug files (member present only in DEBUG builds) */
+#ifdef DEBUG
+	struct dentry		*debug_dir;
+#endif
+};
+
+/* convert an HCD pointer to the FUSBH200_HCD stored in its hcd_priv area */
+static inline struct fusbh200_hcd *hcd_to_fusbh200 (struct usb_hcd *hcd)
+{
+	return (struct fusbh200_hcd *) (hcd->hcd_priv);
+}
+static inline struct usb_hcd *fusbh200_to_hcd (struct fusbh200_hcd *fusbh200)	/* inverse mapping */
+{
+	return container_of ((void *) fusbh200, struct usb_hcd, hcd_priv);	/* step back from hcd_priv to the enclosing usb_hcd */
+}
+
+/*-------------------------------------------------------------------------*/
+
+/* EHCI register interface, corresponds to EHCI Revision 0.95 specification */
+
+/* Section 2.2 Host Controller Capability Registers */
+struct fusbh200_caps {
+	/* These fields are specified as 8 and 16 bit registers,
+	 * but some hosts can't perform 8 or 16 bit PCI accesses.
+	 * Some hosts treat caplength and hciversion as parts of a 32-bit
+	 * register, others treat them as two separate registers; this
+	 * affects the memory map for big endian controllers.
+	 */
+	u32		hc_capbase;
+#define HC_LENGTH(fusbh200, p)	(0x00ff&((p) >> /* bits 7:0 / offset 00h */ \
+				(fusbh200_big_endian_capbase(fusbh200) ? 24 : 0)))
+#define HC_VERSION(fusbh200, p)	(0xffff&((p) >> /* bits 31:16 / offset 02h */ \
+				(fusbh200_big_endian_capbase(fusbh200) ? 0 : 16)))
+	u32		hcs_params;     /* HCSPARAMS - offset 0x4 */
+#define HCS_N_PORTS(p)		(((p)>>0)&0xf)	/* bits 3:0, ports on HC */
+
+	u32		hcc_params;      /* HCCPARAMS - offset 0x8 */
+#define HCC_CANPARK(p)		((p)&(1 << 2))  /* true: can park on async qh */
+#define HCC_PGM_FRAMELISTLEN(p) ((p)&(1 << 1))  /* true: periodic_size changes */
+	u8		portroute[8];	 /* nibbles for routing - offset 0xC */
+};
+
+
+/* Section 2.3 Host Controller Operational Registers */
+struct fusbh200_regs {
+
+	/* USBCMD: offset 0x00 */
+	u32		command;
+
+/* EHCI 1.1 addendum */
+/* 23:16 is r/w intr rate, in microframes; default "8" == 1/msec */
+#define CMD_PARK	(1<<11)		/* enable "park" on async qh */
+#define CMD_PARK_CNT(c)	(((c)>>8)&3)	/* how many transfers to park for */
+#define CMD_IAAD	(1<<6)		/* "doorbell" interrupt async advance */
+#define CMD_ASE		(1<<5)		/* async schedule enable */
+#define CMD_PSE		(1<<4)		/* periodic schedule enable */
+/* 3:2 is periodic frame list size */
+#define CMD_RESET	(1<<1)		/* reset HC not bus */
+#define CMD_RUN		(1<<0)		/* start/stop HC */
+
+	/* USBSTS: offset 0x04 */
+	u32		status;
+#define STS_ASS		(1<<15)		/* Async Schedule Status */
+#define STS_PSS		(1<<14)		/* Periodic Schedule Status */
+#define STS_RECL	(1<<13)		/* Reclamation */
+#define STS_HALT	(1<<12)		/* Not running (any reason) */
+/* some bits reserved */
+	/* these STS_* flags are also intr_enable bits (USBINTR) */
+#define STS_IAA		(1<<5)		/* Interrupted on async advance */
+#define STS_FATAL	(1<<4)		/* such as some PCI access errors */
+#define STS_FLR		(1<<3)		/* frame list rolled over */
+#define STS_PCD		(1<<2)		/* port change detect */
+#define STS_ERR		(1<<1)		/* "error" completion (overflow, ...) */
+#define STS_INT		(1<<0)		/* "normal" completion (short, ...) */
+
+	/* USBINTR: offset 0x08 */
+	u32		intr_enable;
+
+	/* FRINDEX: offset 0x0C */
+	u32		frame_index;	/* current microframe number */
+	/* CTRLDSSEGMENT: offset 0x10 */
+	u32		segment;	/* address bits 63:32 if needed */
+	/* PERIODICLISTBASE: offset 0x14 */
+	u32		frame_list;	/* points to periodic list */
+	/* ASYNCLISTADDR: offset 0x18 */
+	u32		async_next;	/* address of next async queue head */
+
+	u32	reserved1;
+	/* PORTSC: offset 0x20 */
+	u32	port_status;
+/* 31:23 reserved */
+#define PORT_USB11(x) (((x)&(3<<10)) == (1<<10))	/* USB 1.1 device */
+#define PORT_RESET	(1<<8)		/* reset port */
+#define PORT_SUSPEND	(1<<7)		/* suspend port */
+#define PORT_RESUME	(1<<6)		/* resume it */
+#define PORT_PEC	(1<<3)		/* port enable change */
+#define PORT_PE		(1<<2)		/* port enable */
+#define PORT_CSC	(1<<1)		/* connect status change */
+#define PORT_CONNECT	(1<<0)		/* device connected */
+#define PORT_RWC_BITS   (PORT_CSC | PORT_PEC)
+
+	u32	reserved2[3];
+
+	/* BMCSR: offset 0x30 */
+	u32	bmcsr; /* Bus Monitor Control/Status Register */
+#define BMCSR_HOST_SPD_TYP	(3<<9)
+#define BMCSR_VBUS_OFF		(1<<4)
+#define BMCSR_INT_POLARITY	(1<<3)
+
+	/* BMISR: offset 0x34 */
+	u32	bmisr; /* Bus Monitor Interrupt Status Register */
+#define BMISR_OVC		(1<<1)
+
+	/* BMIER: offset 0x38 */
+	u32	bmier; /* Bus Monitor Interrupt Enable Register */
+#define BMIER_OVC_EN		(1<<1)
+#define BMIER_VBUS_ERR_EN	(1<<0)
+};
+
+/* Appendix C, Debug port ... intended for use with special "debug devices"
+ * that can help if there's no serial console.  (Nonstandard enumeration.)
+ */
+struct fusbh200_dbg_port {
+	u32	control;
+#define DBGP_OWNER	(1<<30)
+#define DBGP_ENABLED	(1<<28)
+#define DBGP_DONE	(1<<16)
+#define DBGP_INUSE	(1<<10)
+#define DBGP_ERRCODE(x)	(((x)>>7)&0x07)
+#	define DBGP_ERR_BAD	1
+#	define DBGP_ERR_SIGNAL	2
+#define DBGP_ERROR	(1<<6)
+#define DBGP_GO		(1<<5)
+#define DBGP_OUT	(1<<4)
+#define DBGP_LEN(x)	(((x)>>0)&0x0f)
+	u32	pids;
+#define DBGP_PID_GET(x)		(((x)>>16)&0xff)
+#define DBGP_PID_SET(data, tok)	(((data)<<8)|(tok))
+	u32	data03;		/* NOTE(review): presumably data bytes 0-3 - confirm */
+	u32	data47;		/* NOTE(review): presumably data bytes 4-7 - confirm */
+	u32	address;
+#define DBGP_EPADDR(dev, ep)	(((dev)<<8)|(ep))
+};
+
+#ifdef CONFIG_EARLY_PRINTK_DBGP
+#include <linux/init.h>
+extern int __init early_dbgp_init(char *s);
+extern struct console early_dbgp_console;
+#endif /* CONFIG_EARLY_PRINTK_DBGP */
+
+struct usb_hcd;
+
+static inline int xen_dbgp_reset_prep(struct usb_hcd *hcd)	/* stub: ignores hcd */
+{
+	return 1; /* Shouldn't this be 0? */
+}
+
+static inline int xen_dbgp_external_startup(struct usb_hcd *hcd)	/* stub: ignores hcd */
+{
+	return -1;	/* unconditional */
+}
+
+#ifdef CONFIG_EARLY_PRINTK_DBGP
+/* Callbacks from the fusbh200 host driver into the fusbh200 debug driver */
+extern int dbgp_external_startup(struct usb_hcd *);
+extern int dbgp_reset_prep(struct usb_hcd *hcd);
+#else	/* no early debug port: forward to the xen_dbgp_*() inlines */
+static inline int dbgp_reset_prep(struct usb_hcd *hcd)
+{
+	return xen_dbgp_reset_prep(hcd);
+}
+static inline int dbgp_external_startup(struct usb_hcd *hcd)
+{
+	return xen_dbgp_external_startup(hcd);
+}
+#endif	/* CONFIG_EARLY_PRINTK_DBGP */
+
+/*-------------------------------------------------------------------------*/
+
+#define	QTD_NEXT(fusbh200, dma)	cpu_to_hc32(fusbh200, (u32)dma)
+
+/*
+ * EHCI Specification 0.95 Section 3.5
+ * QTD: describes data transfer components (buffer, direction, ...)
+ * See Fig 3-6 "Queue Element Transfer Descriptor Block Diagram".
+ *
+ * These are associated only with "QH" (Queue Head) structures,
+ * used with control, bulk, and interrupt transfers.
+ */
+struct fusbh200_qtd {
+	/* first part defined by EHCI spec */
+	__hc32			hw_next;	/* see EHCI 3.5.1 */
+	__hc32			hw_alt_next;    /* see EHCI 3.5.2 */
+	__hc32			hw_token;       /* see EHCI 3.5.3 */
+#define	QTD_TOGGLE	(1 << 31)	/* data toggle */
+#define	QTD_LENGTH(tok)	(((tok)>>16) & 0x7fff)
+#define	QTD_IOC		(1 << 15)	/* interrupt on complete */
+#define	QTD_CERR(tok)	(((tok)>>10) & 0x3)
+#define	QTD_PID(tok)	(((tok)>>8) & 0x3)
+#define	QTD_STS_ACTIVE	(1 << 7)	/* HC may execute this */
+#define	QTD_STS_HALT	(1 << 6)	/* halted on error */
+#define	QTD_STS_DBE	(1 << 5)	/* data buffer error (in HC) */
+#define	QTD_STS_BABBLE	(1 << 4)	/* device was babbling (qtd halted) */
+#define	QTD_STS_XACT	(1 << 3)	/* device gave illegal response */
+#define	QTD_STS_MMF	(1 << 2)	/* incomplete split transaction */
+#define	QTD_STS_STS	(1 << 1)	/* split transaction state */
+#define	QTD_STS_PING	(1 << 0)	/* issue PING? */
+
+#define ACTIVE_BIT(fusbh200)	cpu_to_hc32(fusbh200, QTD_STS_ACTIVE)
+#define HALT_BIT(fusbh200)		cpu_to_hc32(fusbh200, QTD_STS_HALT)
+#define STATUS_BIT(fusbh200)	cpu_to_hc32(fusbh200, QTD_STS_STS)
+
+	__hc32			hw_buf [5];        /* see EHCI 3.5.4 */
+	__hc32			hw_buf_hi [5];        /* Appendix B */
+
+	/* the rest is HCD-private */
+	dma_addr_t		qtd_dma;		/* qtd address */
+	struct list_head	qtd_list;		/* sw qtd list */
+	struct urb		*urb;			/* qtd's urb */
+	size_t			length;			/* length of buffer */
+} __attribute__ ((aligned (32)));
+
+/* mask NakCnt+T in qh->hw_alt_next */
+#define QTD_MASK(fusbh200)	cpu_to_hc32 (fusbh200, ~0x1f)
+
+#define IS_SHORT_READ(token) (QTD_LENGTH (token) != 0 && QTD_PID (token) == 1)
+
+/*-------------------------------------------------------------------------*/
+
+/* type tag from {qh,itd,fstn}->hw_next */
+#define Q_NEXT_TYPE(fusbh200,dma)	((dma) & cpu_to_hc32(fusbh200, 3 << 1))
+
+/*
+ * Now the following defines are not converted using the
+ * cpu_to_le32() macro anymore, since we have to support
+ * "dynamic" switching between be and le support, so that the driver
+ * can be used on one system with SoC EHCI controller using big-endian
+ * descriptors as well as a normal little-endian PCI EHCI controller.
+ */
+/* values for that type tag */
+#define Q_TYPE_ITD	(0 << 1)
+#define Q_TYPE_QH	(1 << 1)
+#define Q_TYPE_SITD	(2 << 1)
+#define Q_TYPE_FSTN	(3 << 1)
+
+/* next async queue entry, or pointer to interrupt/periodic QH */
+#define QH_NEXT(fusbh200,dma)	(cpu_to_hc32(fusbh200, (((u32)dma)&~0x01f)|Q_TYPE_QH))
+
+/* for periodic/async schedules and qtd lists, mark end of list */
+#define FUSBH200_LIST_END(fusbh200)	cpu_to_hc32(fusbh200, 1) /* "null pointer" to hw */
+
+/*
+ * Entries in periodic shadow table are pointers to one of four kinds
+ * of data structure (only qh, itd and fstn have members in this union).
+ * That's dictated by the hardware; a type tag is encoded in the low bits
+ * of the hardware's periodic schedule.  Use Q_NEXT_TYPE to get the tag.
+ *
+ * For entries in the async schedule, the type tag always says "qh".
+ */
+union fusbh200_shadow {
+	struct fusbh200_qh	*qh;		/* Q_TYPE_QH */
+	struct fusbh200_itd	*itd;		/* Q_TYPE_ITD */
+	struct fusbh200_fstn	*fstn;		/* Q_TYPE_FSTN */
+	__hc32			*hw_next;	/* (all types) */
+	void			*ptr;
+};
+
+/*-------------------------------------------------------------------------*/
+
+/*
+ * EHCI Specification 0.95 Section 3.6
+ * QH: describes control/bulk/interrupt endpoints
+ * See Fig 3-7 "Queue Head Structure Layout".
+ *
+ * These appear in both the async and (for interrupt) periodic schedules.
+ */
+
+/* first part of a queue head; layout defined by EHCI spec */
+struct fusbh200_qh_hw {
+	__hc32			hw_next;	/* see EHCI 3.6.1 */
+	__hc32			hw_info1;       /* see EHCI 3.6.2 */
+#define	QH_CONTROL_EP	(1 << 27)	/* FS/LS control endpoint */
+#define	QH_HEAD		(1 << 15)	/* Head of async reclamation list */
+#define	QH_TOGGLE_CTL	(1 << 14)	/* Data toggle control */
+#define	QH_HIGH_SPEED	(2 << 12)	/* Endpoint speed */
+#define	QH_LOW_SPEED	(1 << 12)
+#define	QH_FULL_SPEED	(0 << 12)
+#define	QH_INACTIVATE	(1 << 7)	/* Inactivate on next transaction */
+	__hc32			hw_info2;        /* see EHCI 3.6.2 */
+#define	QH_SMASK	0x000000ff
+#define	QH_CMASK	0x0000ff00
+#define	QH_HUBADDR	0x007f0000
+#define	QH_HUBPORT	0x3f800000
+#define	QH_MULT		0xc0000000
+	__hc32			hw_current;	/* qtd list - see EHCI 3.6.4 */
+
+	/* qtd overlay (hardware parts of a struct fusbh200_qtd) */
+	__hc32			hw_qtd_next;
+	__hc32			hw_alt_next;
+	__hc32			hw_token;
+	__hc32			hw_buf [5];
+	__hc32			hw_buf_hi [5];
+} __attribute__ ((aligned(32)));
+
+struct fusbh200_qh {
+	struct fusbh200_qh_hw	*hw;		/* Must come first */
+	/* the rest is HCD-private */
+	dma_addr_t		qh_dma;		/* address of qh */
+	union fusbh200_shadow	qh_next;	/* ptr to qh; or periodic */
+	struct list_head	qtd_list;	/* sw qtd list */
+	struct list_head	intr_node;	/* list of intr QHs */
+	struct fusbh200_qtd		*dummy;
+	struct fusbh200_qh		*unlink_next;	/* next on unlink list */
+
+	unsigned		unlink_cycle;
+
+	u8			needs_rescan;	/* Dequeue during giveback */
+	u8			qh_state;	/* one of the QH_STATE_* values below */
+#define	QH_STATE_LINKED		1		/* HC sees this */
+#define	QH_STATE_UNLINK		2		/* HC may still see this */
+#define	QH_STATE_IDLE		3		/* HC doesn't see this */
+#define	QH_STATE_UNLINK_WAIT	4		/* LINKED and on unlink queue */
+#define	QH_STATE_COMPLETING	5		/* don't touch token.HALT */
+
+	u8			xacterrs;	/* XactErr retry counter */
+#define	QH_XACTERR_MAX		32		/* XactErr retry limit */
+
+	/* periodic schedule info */
+	u8			usecs;		/* intr bandwidth */
+	u8			gap_uf;		/* uframes split/csplit gap */
+	u8			c_usecs;	/* ... split completion bw */
+	u16			tt_usecs;	/* tt downstream bandwidth */
+	unsigned short		period;		/* polling interval */
+	unsigned short		start;		/* where polling starts */
+#define NO_FRAME ((unsigned short)~0)			/* pick new start */
+
+	struct usb_device	*dev;		/* access to TT */
+	unsigned		is_out:1;	/* bulk or intr OUT */
+	unsigned		clearing_tt:1;	/* Clear-TT-Buf in progress */
+};
+
+/*-------------------------------------------------------------------------*/
+
+/* description of one iso transaction (up to 3 KB data if highspeed) */
+struct fusbh200_iso_packet {
+	/* These will be copied to iTD when scheduling */
+	u64			bufp;		/* itd->hw_bufp{,_hi}[pg] |= */
+	__hc32			transaction;	/* itd->hw_transaction[i] |= */
+	u8			cross;		/* buf crosses pages */
+	/* for full speed OUT splits */
+	u32			buf1;		/* NOTE(review): no split descriptors appear in this header - confirm use */
+};
+
+/* temporary schedule data for packets from iso urbs (both speeds);
+ * each packet is one logical usb transaction to the device (not TT),
+ * beginning at stream->next_uframe
+ */
+struct fusbh200_iso_sched {
+	struct list_head	td_list;
+	unsigned		span;
+	struct fusbh200_iso_packet	packet [0];	/* zero-length trailing array; must stay last */
+};
+
+/*
+ * fusbh200_iso_stream - groups all (s)itds for this endpoint.
+ * acts like a qh would, if EHCI had them for ISO.
+ */
+struct fusbh200_iso_stream {
+	/* first field matches fusbh200_qh, but is NULL */
+	struct fusbh200_qh_hw	*hw;
+
+	u8			bEndpointAddress;
+	u8			highspeed;
+	struct list_head	td_list;	/* queued itds */
+	struct list_head	free_list;	/* list of unused itds */
+	struct usb_device	*udev;
+	struct usb_host_endpoint *ep;
+
+	/* output of (re)scheduling */
+	int			next_uframe;
+	__hc32			splits;
+
+	/* the rest is derived from the endpoint descriptor,
+	 * trusting urb->interval == f(epdesc->bInterval) and
+	 * including the extra info for hw_bufp[0..2]
+	 */
+	u8			usecs, c_usecs;
+	u16			interval;
+	u16			tt_usecs;
+	u16			maxp;
+	u16			raw_mask;
+	unsigned		bandwidth;
+
+	/* These are used to initialize iTD's hw_bufp fields */
+	__hc32			buf0;
+	__hc32			buf1;
+	__hc32			buf2;
+
+	/* this is used to initialize sITD's tt info */
+	__hc32			address;
+};
+
+/*-------------------------------------------------------------------------*/
+
+/*
+ * EHCI Specification 0.95 Section 3.3
+ * Fig 3-4 "Isochronous Transaction Descriptor (iTD)"
+ *
+ * Schedule records for high speed iso xfers
+ */
+struct fusbh200_itd {
+	/* first part defined by EHCI spec */
+	__hc32			hw_next;           /* see EHCI 3.3.1 */
+	__hc32			hw_transaction [8]; /* see EHCI 3.3.2 */
+#define FUSBH200_ISOC_ACTIVE        (1<<31)        /* activate transfer this slot */
+#define FUSBH200_ISOC_BUF_ERR       (1<<30)        /* Data buffer error */
+#define FUSBH200_ISOC_BABBLE        (1<<29)        /* babble detected */
+#define FUSBH200_ISOC_XACTERR       (1<<28)        /* XactErr - transaction error */
+#define	FUSBH200_ITD_LENGTH(tok)	(((tok)>>16) & 0x0fff)
+#define	FUSBH200_ITD_IOC		(1 << 15)	/* interrupt on complete */
+
+#define ITD_ACTIVE(fusbh200)	cpu_to_hc32(fusbh200, FUSBH200_ISOC_ACTIVE)
+
+	__hc32			hw_bufp [7];	/* see EHCI 3.3.3 */
+	__hc32			hw_bufp_hi [7];	/* Appendix B */
+
+	/* the rest is HCD-private */
+	dma_addr_t		itd_dma;	/* DMA address of this itd */
+	union fusbh200_shadow	itd_next;	/* ptr to periodic q entry */
+
+	struct urb		*urb;
+	struct fusbh200_iso_stream	*stream;	/* endpoint's queue */
+	struct list_head	itd_list;	/* list of stream's itds */
+
+	/* any/all hw_transactions here may be used by that urb */
+	unsigned		frame;		/* where scheduled */
+	unsigned		pg;
+	unsigned		index[8];	/* in urb->iso_frame_desc */
+} __attribute__ ((aligned (32)));
+
+/*-------------------------------------------------------------------------*/
+
+/*
+ * EHCI Specification 0.96 Section 3.7
+ * Periodic Frame Span Traversal Node (FSTN)
+ *
+ * Manages split interrupt transactions (using TT) that span frame boundaries
+ * into uframes 0/1; see 4.12.2.2.  In those uframes, a "save place" FSTN
+ * makes the HC jump (back) to a QH to scan for fs/ls QH completions until
+ * it hits a "restore" FSTN; then it returns to finish other uframe 0/1 work.
+ */
+struct fusbh200_fstn {
+	__hc32			hw_next;	/* any periodic q entry */
+	__hc32			hw_prev;	/* qh or FUSBH200_LIST_END */
+
+	/* the rest is HCD-private */
+	dma_addr_t		fstn_dma;	/* DMA address of this fstn */
+	union fusbh200_shadow	fstn_next;	/* ptr to periodic q entry */
+} __attribute__ ((aligned (32)));
+
+/*-------------------------------------------------------------------------*/
+
+/* Prepare the PORTSC wakeup flags during controller suspend/resume; each macro expands to one call expression, the caller supplies the ';' */
+
+#define fusbh200_prepare_ports_for_controller_suspend(fusbh200, do_wakeup)	\
+		fusbh200_adjust_port_wakeup_flags(fusbh200, true, do_wakeup)
+
+#define fusbh200_prepare_ports_for_controller_resume(fusbh200)			\
+		fusbh200_adjust_port_wakeup_flags(fusbh200, false, false)
+
+/*-------------------------------------------------------------------------*/
+
+/*
+ * Some EHCI controllers have a Transaction Translator built into the
+ * root hub. This is a non-standard feature.  Each controller will need
+ * to add code to the following inline functions, and call them as
+ * needed (mostly in root hub code).
+ */
+
+static inline unsigned int
+fusbh200_get_speed(struct fusbh200_hcd *fusbh200, unsigned int portsc)
+{
+	return (readl(&fusbh200->regs->bmcsr)
+		& BMCSR_HOST_SPD_TYP) >> 9;	/* two-bit field at bits 10:9 of BMCSR; portsc is not used */
+}
+
+/* Map the speed code from fusbh200_get_speed() onto USB_PORT_STAT_* bits. */
+static inline unsigned int
+fusbh200_port_speed(struct fusbh200_hcd *fusbh200, unsigned int portsc)
+{
+	const unsigned int speed_code = fusbh200_get_speed(fusbh200, portsc);
+
+	if (speed_code == 0)
+		return 0;
+	if (speed_code == 1)
+		return USB_PORT_STAT_LOW_SPEED;
+
+	/* code 2, and any other value, reports high speed */
+	return USB_PORT_STAT_HIGH_SPEED;
+}
+
+/*-------------------------------------------------------------------------*/
+
+#define	fusbh200_has_fsl_portno_bug(e)		(0)
+
+/*
+ * While most USB host controllers implement their registers in
+ * little-endian format, a minority (celleb companion chip) implement
+ * them in big endian format.
+ *
+ * This attempts to support either format at compile time without a
+ * runtime penalty, or both formats with the additional overhead
+ * of checking a flag bit.
+ *
+ */
+
+#define fusbh200_big_endian_mmio(e)	0
+#define fusbh200_big_endian_capbase(e)	0
+
+static inline unsigned int fusbh200_readl(const struct fusbh200_hcd *fusbh200,
+		__u32 __iomem * regs)
+{
+	return readl(regs);	/* plain readl(); the fusbh200 argument is not used */
+}
+
+static inline void fusbh200_writel(const struct fusbh200_hcd *fusbh200,
+		const unsigned int val, __u32 __iomem *regs)
+{
+	writel(val, regs);	/* plain writel(); the fusbh200 argument is not used */
+}
+
+/* cpu to fusbh200: always little-endian here; the fusbh200 argument is not used */
+static inline __hc32 cpu_to_hc32 (const struct fusbh200_hcd *fusbh200, const u32 x)
+{
+	return cpu_to_le32(x);
+}
+
+/* fusbh200 to cpu: always from little-endian here; the fusbh200 argument is not used */
+static inline u32 hc32_to_cpu (const struct fusbh200_hcd *fusbh200, const __hc32 x)
+{
+	return le32_to_cpu(x);
+}
+
+static inline u32 hc32_to_cpup (const struct fusbh200_hcd *fusbh200, const __hc32 *x)
+{
+	return le32_to_cpup(x);	/* pointer variant of hc32_to_cpu(); fusbh200 is not used */
+}
+
+/*-------------------------------------------------------------------------*/
+
+static inline unsigned fusbh200_read_frame_index(struct fusbh200_hcd *fusbh200)
+{
+	return fusbh200_readl(fusbh200, &fusbh200->regs->frame_index);	/* raw FRINDEX register value */
+}
+
+#define fusbh200_itdlen(urb, desc, t) ({	/* IN pipe: desc length minus the ITD length field; otherwise the field itself */	\
+	usb_pipein((urb)->pipe) ?				\
+	(desc)->length - FUSBH200_ITD_LENGTH(t) :			\
+	FUSBH200_ITD_LENGTH(t);					\
+})
+/*-------------------------------------------------------------------------*/
+
+#ifndef DEBUG
+#define STUB_DEBUG_FILES
+#endif	/* DEBUG */
+
+/*-------------------------------------------------------------------------*/
+
+#endif /* __LINUX_FUSBH200_H */

diff --git a/drivers/usb/host/hwa-hc.c b/drivers/usb/host/hwa-hc.c
index 104730d..483990c 100644
--- a/drivers/usb/host/hwa-hc.c
+++ b/drivers/usb/host/hwa-hc.c

@@ -577,7 +577,7 @@
 	.product_desc = "Wireless USB HWA host controller",
 	.hcd_priv_size = sizeof(struct hwahc) - sizeof(struct usb_hcd),
 	.irq = NULL,			/* FIXME */
-	.flags = HCD_USB2,		/* FIXME */
+	.flags = HCD_USB25,
 	.reset = hwahc_op_reset,
 	.start = hwahc_op_start,
 	.stop = hwahc_op_stop,
@@ -588,8 +588,6 @@
 
 	.hub_status_data = wusbhc_rh_status_data,
 	.hub_control = wusbhc_rh_control,
-	.bus_suspend = wusbhc_rh_suspend,
-	.bus_resume = wusbhc_rh_resume,
 	.start_port_reset = wusbhc_rh_start_port_reset,
 };
 
@@ -685,12 +683,9 @@
 	wa->usb_dev = usb_get_dev(usb_dev);	/* bind the USB device */
 	wa->usb_iface = usb_get_intf(iface);
 	wusbhc->dev = dev;
-	wusbhc->uwb_rc = uwb_rc_get_by_grandpa(iface->dev.parent);
-	if (wusbhc->uwb_rc == NULL) {
-		result = -ENODEV;
-		dev_err(dev, "Cannot get associated UWB Host Controller\n");
-		goto error_rc_get;
-	}
+	/* defer getting the uwb_rc handle until it is needed since it
+	 * may not have been registered by the hwa_rc driver yet. */
+	wusbhc->uwb_rc = NULL;
 	result = wa_fill_descr(wa);	/* Get the device descriptor */
 	if (result < 0)
 		goto error_fill_descriptor;
@@ -733,8 +728,6 @@
 	/* WA Descr fill allocs no resources */
 error_security_create:
 error_fill_descriptor:
-	uwb_rc_put(wusbhc->uwb_rc);
-error_rc_get:
 	usb_put_intf(iface);
 	usb_put_dev(usb_dev);
 	return result;
@@ -776,6 +769,7 @@
 		goto error_alloc;
 	}
 	usb_hcd->wireless = 1;
+	usb_hcd->self.sg_tablesize = ~0;
 	wusbhc = usb_hcd_to_wusbhc(usb_hcd);
 	hwahc = container_of(wusbhc, struct hwahc, wusbhc);
 	hwahc_init(hwahc);

diff --git a/drivers/usb/host/imx21-hcd.c b/drivers/usb/host/imx21-hcd.c
index f0ebe8e..03dc4d9 100644
--- a/drivers/usb/host/imx21-hcd.c
+++ b/drivers/usb/host/imx21-hcd.c

@@ -809,26 +809,36 @@
 
 	/* calculate frame */
 	cur_frame = imx21_hc_get_frame(hcd);
-	if (urb->transfer_flags & URB_ISO_ASAP) {
-		if (list_empty(&ep_priv->td_list))
-			urb->start_frame = cur_frame + 5;
-		else
-			urb->start_frame = list_entry(
-				ep_priv->td_list.prev,
-				struct td, list)->frame + urb->interval;
-	}
-	urb->start_frame = wrap_frame(urb->start_frame);
-	if (frame_after(cur_frame, urb->start_frame)) {
-		dev_dbg(imx21->dev,
-			"enqueue: adjusting iso start %d (cur=%d) asap=%d\n",
-			urb->start_frame, cur_frame,
-			(urb->transfer_flags & URB_ISO_ASAP) != 0);
-		urb->start_frame = wrap_frame(cur_frame + 1);
+	i = 0;
+	if (list_empty(&ep_priv->td_list)) {
+		urb->start_frame = wrap_frame(cur_frame + 5);
+	} else {
+		urb->start_frame = wrap_frame(list_entry(ep_priv->td_list.prev,
+				struct td, list)->frame + urb->interval);
+
+		if (frame_after(cur_frame, urb->start_frame)) {
+			dev_dbg(imx21->dev,
+				"enqueue: adjusting iso start %d (cur=%d) asap=%d\n",
+				urb->start_frame, cur_frame,
+				(urb->transfer_flags & URB_ISO_ASAP) != 0);
+			i = DIV_ROUND_UP(wrap_frame(
+					cur_frame - urb->start_frame),
+					urb->interval);
+			if (urb->transfer_flags & URB_ISO_ASAP) {
+				urb->start_frame = wrap_frame(urb->start_frame
+						+ i * urb->interval);
+				i = 0;
+			} else if (i >= urb->number_of_packets) {
+				ret = -EXDEV;
+				goto alloc_dmem_failed;
+			}
+		}
 	}
 
 	/* set up transfers */
+	urb_priv->isoc_remaining = urb->number_of_packets - i;
 	td = urb_priv->isoc_td;
-	for (i = 0; i < urb->number_of_packets; i++, td++) {
+	for (; i < urb->number_of_packets; i++, td++) {
 		unsigned int offset = urb->iso_frame_desc[i].offset;
 		td->ep = ep;
 		td->urb = urb;
@@ -840,7 +850,6 @@
 		list_add_tail(&td->list, &ep_priv->td_list);
 	}
 
-	urb_priv->isoc_remaining = urb->number_of_packets;
 	dev_vdbg(imx21->dev, "setup %d packets for iso frame %d->%d\n",
 		urb->number_of_packets, urb->start_frame, td->frame);
 

diff --git a/drivers/usb/host/isp1760-if.c b/drivers/usb/host/isp1760-if.c
index a13709e..3df49b1 100644
--- a/drivers/usb/host/isp1760-if.c
+++ b/drivers/usb/host/isp1760-if.c

@@ -118,7 +118,7 @@
 		goto free_gpio;
 	}
 
-	dev_set_drvdata(&dev->dev, drvdata);
+	platform_set_drvdata(dev, drvdata);
 	return ret;
 
 free_gpio:
@@ -133,9 +133,7 @@
 
 static int of_isp1760_remove(struct platform_device *dev)
 {
-	struct isp1760 *drvdata = dev_get_drvdata(&dev->dev);
-
-	dev_set_drvdata(&dev->dev, NULL);
+	struct isp1760 *drvdata = platform_get_drvdata(dev);
 
 	usb_remove_hcd(drvdata->hcd);
 	iounmap(drvdata->hcd->regs);
@@ -398,7 +396,7 @@
 			       irqflags, -ENOENT,
 			       &pdev->dev, dev_name(&pdev->dev), devflags);
 
-	dev_set_drvdata(&pdev->dev, hcd);
+	platform_set_drvdata(pdev, hcd);
 
 	if (IS_ERR(hcd)) {
 		pr_warning("isp1760: Failed to register the HCD device\n");
@@ -419,7 +417,7 @@
 {
 	struct resource *mem_res;
 	resource_size_t mem_size;
-	struct usb_hcd *hcd = dev_get_drvdata(&pdev->dev);
+	struct usb_hcd *hcd = platform_get_drvdata(pdev);
 
 	usb_remove_hcd(hcd);
 

diff --git a/drivers/usb/host/ohci-at91.c b/drivers/usb/host/ohci-at91.c
index 2ee1496..9677f68 100644
--- a/drivers/usb/host/ohci-at91.c
+++ b/drivers/usb/host/ohci-at91.c

@@ -41,17 +41,17 @@
 
 static void at91_start_clock(void)
 {
-	clk_enable(hclk);
-	clk_enable(iclk);
-	clk_enable(fclk);
+	clk_prepare_enable(hclk);
+	clk_prepare_enable(iclk);
+	clk_prepare_enable(fclk);
 	clocked = 1;
 }
 
 static void at91_stop_clock(void)
 {
-	clk_disable(fclk);
-	clk_disable(iclk);
-	clk_disable(hclk);
+	clk_disable_unprepare(fclk);
+	clk_disable_unprepare(iclk);
+	clk_disable_unprepare(hclk);
 	clocked = 0;
 }
 

diff --git a/drivers/usb/host/ohci-da8xx.c b/drivers/usb/host/ohci-da8xx.c
index 0b815a8..6aaa9c9 100644
--- a/drivers/usb/host/ohci-da8xx.c
+++ b/drivers/usb/host/ohci-da8xx.c

@@ -401,7 +401,6 @@
 	struct usb_hcd	*hcd = platform_get_drvdata(dev);
 
 	usb_hcd_da8xx_remove(hcd, dev);
-	platform_set_drvdata(dev, NULL);
 
 	return 0;
 }

diff --git a/drivers/usb/host/ohci-hcd.c b/drivers/usb/host/ohci-hcd.c
index fc627fd..a9d3437 100644
--- a/drivers/usb/host/ohci-hcd.c
+++ b/drivers/usb/host/ohci-hcd.c

@@ -79,23 +79,8 @@
 #include "pci-quirks.h"
 
 static void ohci_dump (struct ohci_hcd *ohci, int verbose);
-static int ohci_init (struct ohci_hcd *ohci);
 static void ohci_stop (struct usb_hcd *hcd);
 
-#if defined(CONFIG_PM) || defined(CONFIG_PCI)
-static int ohci_restart (struct ohci_hcd *ohci);
-#endif
-
-#ifdef CONFIG_PCI
-static void sb800_prefetch(struct ohci_hcd *ohci, int on);
-#else
-static inline void sb800_prefetch(struct ohci_hcd *ohci, int on)
-{
-	return;
-}
-#endif
-
-
 #include "ohci-hub.c"
 #include "ohci-dbg.c"
 #include "ohci-mem.c"
@@ -772,6 +757,32 @@
 	return 0;
 }
 
+/* ohci_setup routine for generic controller initialization */
+
+int ohci_setup(struct usb_hcd *hcd)
+{
+	struct ohci_hcd		*ohci = hcd_to_ohci(hcd);
+
+	ohci_hcd_init(ohci);
+	
+	return ohci_init(ohci);
+}
+EXPORT_SYMBOL_GPL(ohci_setup);
+
+/* ohci_start routine for generic controller start of all OHCI bus glue */
+static int ohci_start(struct usb_hcd *hcd)
+{
+	struct ohci_hcd		*ohci = hcd_to_ohci(hcd);
+	int	ret;
+
+	ret = ohci_run(ohci);
+	if (ret < 0) {
+		ohci_err(ohci, "can't start\n");
+		ohci_stop(hcd);
+	}
+	return ret;
+}
+
 /*-------------------------------------------------------------------------*/
 
 /* an interrupt happens */
@@ -953,12 +964,13 @@
 #if defined(CONFIG_PM) || defined(CONFIG_PCI)
 
 /* must not be called from interrupt context */
-static int ohci_restart (struct ohci_hcd *ohci)
+int ohci_restart(struct ohci_hcd *ohci)
 {
 	int temp;
 	int i;
 	struct urb_priv *priv;
 
+	ohci_init(ohci);
 	spin_lock_irq(&ohci->lock);
 	ohci->rh_state = OHCI_RH_HALTED;
 
@@ -1012,12 +1024,13 @@
 	ohci_dbg(ohci, "restart complete\n");
 	return 0;
 }
+EXPORT_SYMBOL_GPL(ohci_restart);
 
 #endif
 
 #ifdef CONFIG_PM
 
-static int __maybe_unused ohci_suspend(struct usb_hcd *hcd, bool do_wakeup)
+int ohci_suspend(struct usb_hcd *hcd, bool do_wakeup)
 {
 	struct ohci_hcd	*ohci = hcd_to_ohci (hcd);
 	unsigned long	flags;
@@ -1035,9 +1048,10 @@
 
 	return 0;
 }
+EXPORT_SYMBOL_GPL(ohci_suspend);
 
 
-static int __maybe_unused ohci_resume(struct usb_hcd *hcd, bool hibernated)
+int ohci_resume(struct usb_hcd *hcd, bool hibernated)
 {
 	struct ohci_hcd		*ohci = hcd_to_ohci(hcd);
 	int			port;
@@ -1085,20 +1099,79 @@
 
 	return 0;
 }
+EXPORT_SYMBOL_GPL(ohci_resume);
 
 #endif
 
 /*-------------------------------------------------------------------------*/
 
+/*
+ * Generic structure: This gets copied for platform drivers so that
+ * individual entries can be overridden as needed.
+ */
+
+static const struct hc_driver ohci_hc_driver = {
+	.description =          hcd_name,
+	.product_desc =         "OHCI Host Controller",
+	.hcd_priv_size =        sizeof(struct ohci_hcd),
+
+	/*
+	 * generic hardware linkage
+	*/
+	.irq =                  ohci_irq,
+	.flags =                HCD_MEMORY | HCD_USB11,
+
+	/*
+	* basic lifecycle operations
+	*/
+	.reset =                ohci_setup,
+	.start =                ohci_start,
+	.stop =                 ohci_stop,
+	.shutdown =             ohci_shutdown,
+
+	/*
+	 * managing i/o requests and associated device resources
+	*/
+	.urb_enqueue =          ohci_urb_enqueue,
+	.urb_dequeue =          ohci_urb_dequeue,
+	.endpoint_disable =     ohci_endpoint_disable,
+
+	/*
+	* scheduling support
+	*/
+	.get_frame_number =     ohci_get_frame,
+
+	/*
+	* root hub support
+	*/
+	.hub_status_data =      ohci_hub_status_data,
+	.hub_control =          ohci_hub_control,
+#ifdef CONFIG_PM
+	.bus_suspend =          ohci_bus_suspend,
+	.bus_resume =           ohci_bus_resume,
+#endif
+	.start_port_reset =	ohci_start_port_reset,
+};
+
+void ohci_init_driver(struct hc_driver *drv,
+		const struct ohci_driver_overrides *over)
+{
+	/* Copy the generic table to drv and then apply the overrides */
+	*drv = ohci_hc_driver;
+
+	drv->product_desc = over->product_desc;
+	drv->hcd_priv_size += over->extra_priv_size;
+	if (over->reset)
+		drv->reset = over->reset;
+}
+EXPORT_SYMBOL_GPL(ohci_init_driver);
+
+/*-------------------------------------------------------------------------*/
+
 MODULE_AUTHOR (DRIVER_AUTHOR);
 MODULE_DESCRIPTION(DRIVER_DESC);
 MODULE_LICENSE ("GPL");
 
-#ifdef CONFIG_PCI
-#include "ohci-pci.c"
-#define PCI_DRIVER		ohci_pci_driver
-#endif
-
 #if defined(CONFIG_ARCH_SA1100) && defined(CONFIG_SA1111)
 #include "ohci-sa1111.c"
 #define SA1111_DRIVER		ohci_hcd_sa1111_driver
@@ -1189,30 +1262,6 @@
 #define PLATFORM_DRIVER		ohci_hcd_tilegx_driver
 #endif
 
-#ifdef CONFIG_USB_OHCI_HCD_PLATFORM
-#include "ohci-platform.c"
-#define PLATFORM_DRIVER		ohci_platform_driver
-#endif
-
-#if	!defined(PCI_DRIVER) &&		\
-	!defined(PLATFORM_DRIVER) &&	\
-	!defined(OMAP1_PLATFORM_DRIVER) &&	\
-	!defined(OMAP3_PLATFORM_DRIVER) &&	\
-	!defined(OF_PLATFORM_DRIVER) &&	\
-	!defined(SA1111_DRIVER) &&	\
-	!defined(PS3_SYSTEM_BUS_DRIVER) && \
-	!defined(SM501_OHCI_DRIVER) && \
-	!defined(TMIO_OHCI_DRIVER) && \
-	!defined(S3C2410_PLATFORM_DRIVER) && \
-	!defined(EXYNOS_PLATFORM_DRIVER) && \
-	!defined(EP93XX_PLATFORM_DRIVER) && \
-	!defined(AT91_PLATFORM_DRIVER) && \
-	!defined(NXP_PLATFORM_DRIVER) && \
-	!defined(DAVINCI_PLATFORM_DRIVER) && \
-	!defined(SPEAR_PLATFORM_DRIVER)
-#error "missing bus glue for ohci-hcd"
-#endif
-
 static int __init ohci_hcd_mod_init(void)
 {
 	int retval = 0;
@@ -1269,12 +1318,6 @@
 		goto error_sa1111;
 #endif
 
-#ifdef PCI_DRIVER
-	retval = pci_register_driver(&PCI_DRIVER);
-	if (retval < 0)
-		goto error_pci;
-#endif
-
 #ifdef SM501_OHCI_DRIVER
 	retval = platform_driver_register(&SM501_OHCI_DRIVER);
 	if (retval < 0)
@@ -1368,10 +1411,6 @@
 	platform_driver_unregister(&SM501_OHCI_DRIVER);
  error_sm501:
 #endif
-#ifdef PCI_DRIVER
-	pci_unregister_driver(&PCI_DRIVER);
- error_pci:
-#endif
 #ifdef SA1111_DRIVER
 	sa1111_driver_unregister(&SA1111_DRIVER);
  error_sa1111:
@@ -1436,9 +1475,6 @@
 #ifdef SM501_OHCI_DRIVER
 	platform_driver_unregister(&SM501_OHCI_DRIVER);
 #endif
-#ifdef PCI_DRIVER
-	pci_unregister_driver(&PCI_DRIVER);
-#endif
 #ifdef SA1111_DRIVER
 	sa1111_driver_unregister(&SA1111_DRIVER);
 #endif

diff --git a/drivers/usb/host/ohci-hub.c b/drivers/usb/host/ohci-hub.c
index 60ff422..2347ab8 100644
--- a/drivers/usb/host/ohci-hub.c
+++ b/drivers/usb/host/ohci-hub.c

@@ -176,7 +176,6 @@
 	if (status == -EBUSY) {
 		if (!autostopped) {
 			spin_unlock_irq (&ohci->lock);
-			(void) ohci_init (ohci);
 			status = ohci_restart (ohci);
 
 			usb_root_hub_lost_power(hcd->self.root_hub);

diff --git a/drivers/usb/host/ohci-jz4740.c b/drivers/usb/host/ohci-jz4740.c
index 8062bb9..d4ef539 100644
--- a/drivers/usb/host/ohci-jz4740.c
+++ b/drivers/usb/host/ohci-jz4740.c

@@ -221,7 +221,6 @@
 	return 0;
 
 err_disable:
-	platform_set_drvdata(pdev, NULL);
 	if (jz4740_ohci->vbus) {
 		regulator_disable(jz4740_ohci->vbus);
 		regulator_put(jz4740_ohci->vbus);
@@ -246,8 +245,6 @@
 
 	usb_remove_hcd(hcd);
 
-	platform_set_drvdata(pdev, NULL);
-
 	if (jz4740_ohci->vbus) {
 		regulator_disable(jz4740_ohci->vbus);
 		regulator_put(jz4740_ohci->vbus);

diff --git a/drivers/usb/host/ohci-nxp.c b/drivers/usb/host/ohci-nxp.c
index 5d7eb72..7d7d507 100644
--- a/drivers/usb/host/ohci-nxp.c
+++ b/drivers/usb/host/ohci-nxp.c

@@ -351,7 +351,6 @@
 
 	usb_remove_hcd(hcd);
 	nxp_stop_hc();
-	release_mem_region(hcd->rsrc_start, hcd->rsrc_len);
 	usb_put_hcd(hcd);
 	clk_disable(usb_pll_clk);
 	clk_put(usb_pll_clk);
@@ -360,8 +359,6 @@
 	i2c_unregister_device(isp1301_i2c_client);
 	isp1301_i2c_client = NULL;
 
-	platform_set_drvdata(pdev, NULL);
-
 	return 0;
 }
 

diff --git a/drivers/usb/host/ohci-octeon.c b/drivers/usb/host/ohci-octeon.c
index d44430d..342dc7e 100644
--- a/drivers/usb/host/ohci-octeon.c
+++ b/drivers/usb/host/ohci-octeon.c

@@ -196,8 +196,6 @@
 	release_mem_region(hcd->rsrc_start, hcd->rsrc_len);
 	usb_put_hcd(hcd);
 
-	platform_set_drvdata(pdev, NULL);
-
 	return 0;
 }
 

diff --git a/drivers/usb/host/ohci-omap.c b/drivers/usb/host/ohci-omap.c
index b1d32fb..8747fa6 100644
--- a/drivers/usb/host/ohci-omap.c
+++ b/drivers/usb/host/ohci-omap.c

@@ -498,7 +498,6 @@
 	struct usb_hcd		*hcd = platform_get_drvdata(dev);
 
 	usb_hcd_omap_remove(hcd, dev);
-	platform_set_drvdata(dev, NULL);
 
 	return 0;
 }

diff --git a/drivers/usb/host/ohci-omap3.c b/drivers/usb/host/ohci-omap3.c
index 8663851..8f71357 100644
--- a/drivers/usb/host/ohci-omap3.c
+++ b/drivers/usb/host/ohci-omap3.c

@@ -252,7 +252,7 @@
 	.shutdown	= ohci_hcd_omap3_shutdown,
 	.driver		= {
 		.name	= "ohci-omap3",
-		.of_match_table = of_match_ptr(omap_ohci_dt_ids),
+		.of_match_table = omap_ohci_dt_ids,
 	},
 };
 

diff --git a/drivers/usb/host/ohci-pci.c b/drivers/usb/host/ohci-pci.c
index 951514e..08613e2 100644
--- a/drivers/usb/host/ohci-pci.c
+++ b/drivers/usb/host/ohci-pci.c

@@ -14,12 +14,19 @@
  * This file is licenced under the GPL.
  */
 
-#ifndef CONFIG_PCI
-#error "This file is PCI bus glue.  CONFIG_PCI must be defined."
-#endif
-
-#include <linux/pci.h>
 #include <linux/io.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/usb.h>
+#include <linux/usb/hcd.h>
+
+#include "ohci.h"
+#include "pci-quirks.h"
+
+#define DRIVER_DESC "OHCI PCI platform driver"
+
+static const char hcd_name[] = "ohci-pci";
 
 
 /*-------------------------------------------------------------------------*/
@@ -123,13 +130,6 @@
 	struct ohci_hcd *ohci = container_of(work, struct ohci_hcd, nec_work);
 	int status;
 
-	status = ohci_init(ohci);
-	if (status != 0) {
-		ohci_err(ohci, "Restarting NEC controller failed in %s, %d\n",
-			 "ohci_init", status);
-		return;
-	}
-
 	status = ohci_restart(ohci);
 	if (status != 0)
 		ohci_err(ohci, "Restarting NEC controller failed in %s, %d\n",
@@ -175,19 +175,6 @@
 	return 0;
 }
 
-static void sb800_prefetch(struct ohci_hcd *ohci, int on)
-{
-	struct pci_dev *pdev;
-	u16 misc;
-
-	pdev = to_pci_dev(ohci_to_hcd(ohci)->self.controller);
-	pci_read_config_word(pdev, 0x50, &misc);
-	if (on == 0)
-		pci_write_config_word(pdev, 0x50, misc & 0xfcff);
-	else
-		pci_write_config_word(pdev, 0x50, misc | 0x0300);
-}
-
 /* List of quirks for OHCI */
 static const struct pci_device_id ohci_pci_quirks[] = {
 	{
@@ -249,10 +236,10 @@
 static int ohci_pci_reset (struct usb_hcd *hcd)
 {
 	struct ohci_hcd	*ohci = hcd_to_ohci (hcd);
+	struct pci_dev *pdev = to_pci_dev(hcd->self.controller);
 	int ret = 0;
 
 	if (hcd->self.controller) {
-		struct pci_dev *pdev = to_pci_dev(hcd->self.controller);
 		const struct pci_device_id *quirk_id;
 
 		quirk_id = pci_match_id(ohci_pci_quirks, pdev);
@@ -262,94 +249,25 @@
 			ret = quirk(hcd);
 		}
 	}
-	if (ret == 0) {
-		ohci_hcd_init (ohci);
-		return ohci_init (ohci);
-	}
+
+	if (ret == 0)
+		ret = ohci_setup(hcd);
+	/*
+	* After ohci setup RWC may not be set for add-in PCI cards.
+	* This transfers PCI PM wakeup capabilities.
+	*/
+	if (device_can_wakeup(&pdev->dev))
+		ohci->hc_control |= OHCI_CTRL_RWC;
 	return ret;
 }
 
+static struct hc_driver __read_mostly ohci_pci_hc_driver;
 
-static int ohci_pci_start (struct usb_hcd *hcd)
-{
-	struct ohci_hcd	*ohci = hcd_to_ohci (hcd);
-	int		ret;
-
-#ifdef CONFIG_PM /* avoid warnings about unused pdev */
-	if (hcd->self.controller) {
-		struct pci_dev *pdev = to_pci_dev(hcd->self.controller);
-
-		/* RWC may not be set for add-in PCI cards, since boot
-		 * firmware probably ignored them.  This transfers PCI
-		 * PM wakeup capabilities.
-		 */
-		if (device_can_wakeup(&pdev->dev))
-			ohci->hc_control |= OHCI_CTRL_RWC;
-	}
-#endif /* CONFIG_PM */
-
-	ret = ohci_run (ohci);
-	if (ret < 0) {
-		ohci_err (ohci, "can't start\n");
-		ohci_stop (hcd);
-	}
-	return ret;
-}
-
-
-/*-------------------------------------------------------------------------*/
-
-static const struct hc_driver ohci_pci_hc_driver = {
-	.description =		hcd_name,
-	.product_desc =		"OHCI Host Controller",
-	.hcd_priv_size =	sizeof(struct ohci_hcd),
-
-	/*
-	 * generic hardware linkage
-	 */
-	.irq =			ohci_irq,
-	.flags =		HCD_MEMORY | HCD_USB11,
-
-	/*
-	 * basic lifecycle operations
-	 */
+static const struct ohci_driver_overrides pci_overrides __initconst = {
+	.product_desc =		"OHCI PCI host controller",
 	.reset =		ohci_pci_reset,
-	.start =		ohci_pci_start,
-	.stop =			ohci_stop,
-	.shutdown =		ohci_shutdown,
-
-#ifdef	CONFIG_PM
-	.pci_suspend =		ohci_suspend,
-	.pci_resume =		ohci_resume,
-#endif
-
-	/*
-	 * managing i/o requests and associated device resources
-	 */
-	.urb_enqueue =		ohci_urb_enqueue,
-	.urb_dequeue =		ohci_urb_dequeue,
-	.endpoint_disable =	ohci_endpoint_disable,
-
-	/*
-	 * scheduling support
-	 */
-	.get_frame_number =	ohci_get_frame,
-
-	/*
-	 * root hub support
-	 */
-	.hub_status_data =	ohci_hub_status_data,
-	.hub_control =		ohci_hub_control,
-#ifdef	CONFIG_PM
-	.bus_suspend =		ohci_bus_suspend,
-	.bus_resume =		ohci_bus_resume,
-#endif
-	.start_port_reset =	ohci_start_port_reset,
 };
 
-/*-------------------------------------------------------------------------*/
-
-
 static const struct pci_device_id pci_ids [] = { {
 	/* handle any USB OHCI controller */
 	PCI_DEVICE_CLASS(PCI_CLASS_SERIAL_USB_OHCI, ~0),
@@ -377,3 +295,24 @@
 	},
 #endif
 };
+
+static int __init ohci_pci_init(void)
+{
+	if (usb_disabled())
+		return -ENODEV;
+
+	pr_info("%s: " DRIVER_DESC "\n", hcd_name);
+
+	ohci_init_driver(&ohci_pci_hc_driver, &pci_overrides);
+	return pci_register_driver(&ohci_pci_driver);
+}
+module_init(ohci_pci_init);
+
+static void __exit ohci_pci_cleanup(void)
+{
+	pci_unregister_driver(&ohci_pci_driver);
+}
+module_exit(ohci_pci_cleanup);
+
+MODULE_DESCRIPTION(DRIVER_DESC);
+MODULE_LICENSE("GPL");

diff --git a/drivers/usb/host/ohci-platform.c b/drivers/usb/host/ohci-platform.c
index c3e7287..bc30475 100644
--- a/drivers/usb/host/ohci-platform.c
+++ b/drivers/usb/host/ohci-platform.c

@@ -13,16 +13,28 @@
  *
  * Licensed under the GNU/GPL. See COPYING for details.
  */
+
+#include <linux/hrtimer.h>
+#include <linux/io.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
 #include <linux/err.h>
 #include <linux/platform_device.h>
 #include <linux/usb/ohci_pdriver.h>
+#include <linux/usb.h>
+#include <linux/usb/hcd.h>
+
+#include "ohci.h"
+
+#define DRIVER_DESC "OHCI generic platform driver"
+
+static const char hcd_name[] = "ohci-platform";
 
 static int ohci_platform_reset(struct usb_hcd *hcd)
 {
 	struct platform_device *pdev = to_platform_device(hcd->self.controller);
 	struct usb_ohci_pdata *pdata = pdev->dev.platform_data;
 	struct ohci_hcd *ohci = hcd_to_ohci(hcd);
-	int err;
 
 	if (pdata->big_endian_desc)
 		ohci->flags |= OHCI_QUIRK_BE_DESC;
@@ -30,58 +42,17 @@
 		ohci->flags |= OHCI_QUIRK_BE_MMIO;
 	if (pdata->no_big_frame_no)
 		ohci->flags |= OHCI_QUIRK_FRAME_NO;
-
-	ohci_hcd_init(ohci);
-
 	if (pdata->num_ports)
 		ohci->num_ports = pdata->num_ports;
 
-	err = ohci_init(ohci);
-
-	return err;
+	return ohci_setup(hcd);
 }
 
-static int ohci_platform_start(struct usb_hcd *hcd)
-{
-	struct ohci_hcd *ohci = hcd_to_ohci(hcd);
-	int err;
+static struct hc_driver __read_mostly ohci_platform_hc_driver;
 
-	err = ohci_run(ohci);
-	if (err < 0) {
-		ohci_err(ohci, "can't start\n");
-		ohci_stop(hcd);
-	}
-
-	return err;
-}
-
-static const struct hc_driver ohci_platform_hc_driver = {
-	.description		= hcd_name,
-	.product_desc		= "Generic Platform OHCI Controller",
-	.hcd_priv_size		= sizeof(struct ohci_hcd),
-
-	.irq			= ohci_irq,
-	.flags			= HCD_MEMORY | HCD_USB11,
-
-	.reset			= ohci_platform_reset,
-	.start			= ohci_platform_start,
-	.stop			= ohci_stop,
-	.shutdown		= ohci_shutdown,
-
-	.urb_enqueue		= ohci_urb_enqueue,
-	.urb_dequeue		= ohci_urb_dequeue,
-	.endpoint_disable	= ohci_endpoint_disable,
-
-	.get_frame_number	= ohci_get_frame,
-
-	.hub_status_data	= ohci_hub_status_data,
-	.hub_control		= ohci_hub_control,
-#ifdef	CONFIG_PM
-	.bus_suspend		= ohci_bus_suspend,
-	.bus_resume		= ohci_bus_resume,
-#endif
-
-	.start_port_reset	= ohci_start_port_reset,
+static const struct ohci_driver_overrides platform_overrides __initconst = {
+	.product_desc =	"Generic Platform OHCI controller",
+	.reset =	ohci_platform_reset,
 };
 
 static int ohci_platform_probe(struct platform_device *dev)
@@ -157,7 +128,6 @@
 
 	usb_remove_hcd(hcd);
 	usb_put_hcd(hcd);
-	platform_set_drvdata(dev, NULL);
 
 	if (pdata->power_off)
 		pdata->power_off(dev);
@@ -223,3 +193,26 @@
 		.pm	= &ohci_platform_pm_ops,
 	}
 };
+
+static int __init ohci_platform_init(void)
+{
+	if (usb_disabled())
+		return -ENODEV;
+
+	pr_info("%s: " DRIVER_DESC "\n", hcd_name);
+
+	ohci_init_driver(&ohci_platform_hc_driver, &platform_overrides);
+	return platform_driver_register(&ohci_platform_driver);
+}
+module_init(ohci_platform_init);
+
+static void __exit ohci_platform_cleanup(void)
+{
+	platform_driver_unregister(&ohci_platform_driver);
+}
+module_exit(ohci_platform_cleanup);
+
+MODULE_DESCRIPTION(DRIVER_DESC);
+MODULE_AUTHOR("Hauke Mehrtens");
+MODULE_AUTHOR("Alan Stern");
+MODULE_LICENSE("GPL");

diff --git a/drivers/usb/host/ohci-ppc-of.c b/drivers/usb/host/ohci-ppc-of.c
index 64c2ed9f..8294e2f 100644
--- a/drivers/usb/host/ohci-ppc-of.c
+++ b/drivers/usb/host/ohci-ppc-of.c

@@ -185,8 +185,7 @@
 
 static int ohci_hcd_ppc_of_remove(struct platform_device *op)
 {
-	struct usb_hcd *hcd = dev_get_drvdata(&op->dev);
-	dev_set_drvdata(&op->dev, NULL);
+	struct usb_hcd *hcd = platform_get_drvdata(op);
 
 	dev_dbg(&op->dev, "stopping PPC-OF USB Controller\n");
 
@@ -203,7 +202,7 @@
 
 static void ohci_hcd_ppc_of_shutdown(struct platform_device *op)
 {
-	struct usb_hcd *hcd = dev_get_drvdata(&op->dev);
+	struct usb_hcd *hcd = platform_get_drvdata(op);
 
         if (hcd->driver->shutdown)
                 hcd->driver->shutdown(hcd);

diff --git a/drivers/usb/host/ohci-pxa27x.c b/drivers/usb/host/ohci-pxa27x.c
index 279b2ef..3a9c01d 100644
--- a/drivers/usb/host/ohci-pxa27x.c
+++ b/drivers/usb/host/ohci-pxa27x.c

@@ -556,7 +556,6 @@
 	struct usb_hcd *hcd = platform_get_drvdata(pdev);
 
 	usb_hcd_pxa27x_remove(hcd, pdev);
-	platform_set_drvdata(pdev, NULL);
 	return 0;
 }
 

diff --git a/drivers/usb/host/ohci-q.c b/drivers/usb/host/ohci-q.c
index 88731b7c..df4a670 100644
--- a/drivers/usb/host/ohci-q.c
+++ b/drivers/usb/host/ohci-q.c

@@ -41,6 +41,7 @@
 __releases(ohci->lock)
 __acquires(ohci->lock)
 {
+	 struct device *dev = ohci_to_hcd(ohci)->self.controller;
 	// ASSERT (urb->hcpriv != 0);
 
 	urb_free_priv (ohci, urb->hcpriv);
@@ -55,7 +56,7 @@
 			if (quirk_amdiso(ohci))
 				usb_amd_quirk_pll_enable();
 			if (quirk_amdprefetch(ohci))
-				sb800_prefetch(ohci, 0);
+				sb800_prefetch(dev, 0);
 		}
 		break;
 	case PIPE_INTERRUPT:
@@ -580,6 +581,7 @@
 	struct urb	*urb
 ) {
 	struct urb_priv	*urb_priv = urb->hcpriv;
+	struct device *dev = ohci_to_hcd(ohci)->self.controller;
 	dma_addr_t	data;
 	int		data_len = urb->transfer_buffer_length;
 	int		cnt = 0;
@@ -689,7 +691,7 @@
 			if (quirk_amdiso(ohci))
 				usb_amd_quirk_pll_disable();
 			if (quirk_amdprefetch(ohci))
-				sb800_prefetch(ohci, 1);
+				sb800_prefetch(dev, 1);
 		}
 		periodic = ohci_to_hcd(ohci)->self.bandwidth_isoc_reqs++ == 0
 			&& ohci_to_hcd(ohci)->self.bandwidth_int_reqs == 0;

diff --git a/drivers/usb/host/ohci-sm501.c b/drivers/usb/host/ohci-sm501.c
index 3b5b908..d479d5d 100644
--- a/drivers/usb/host/ohci-sm501.c
+++ b/drivers/usb/host/ohci-sm501.c

@@ -207,7 +207,6 @@
 	sm501_modify_reg(pdev->dev.parent, SM501_IRQ_MASK, 0, 1 << 6);
 	sm501_unit_power(pdev->dev.parent, SM501_GATE_USB_HOST, 0);
 
-	platform_set_drvdata(pdev, NULL);
 	return 0;
 }
 

diff --git a/drivers/usb/host/ohci-spear.c b/drivers/usb/host/ohci-spear.c
index 3e19e01..cc9dd9e 100644
--- a/drivers/usb/host/ohci-spear.c
+++ b/drivers/usb/host/ohci-spear.c

@@ -179,8 +179,6 @@
 		spear_stop_ohci(ohci_p);
 
 	usb_put_hcd(hcd);
-
-	platform_set_drvdata(pdev, NULL);
 	return 0;
 }
 
@@ -232,7 +230,7 @@
 	.driver = {
 		.owner = THIS_MODULE,
 		.name = "spear-ohci",
-		.of_match_table = of_match_ptr(spear_ohci_id_table),
+		.of_match_table = spear_ohci_id_table,
 	},
 };
 

diff --git a/drivers/usb/host/ohci-tilegx.c b/drivers/usb/host/ohci-tilegx.c
index ea73009..197d514 100644
--- a/drivers/usb/host/ohci-tilegx.c
+++ b/drivers/usb/host/ohci-tilegx.c

@@ -182,7 +182,6 @@
 	tilegx_stop_ohc();
 	gxio_usb_host_destroy(&pdata->usb_ctx);
 	destroy_irq(pdata->irq);
-	platform_set_drvdata(pdev, NULL);
 
 	return 0;
 }

diff --git a/drivers/usb/host/ohci-tmio.c b/drivers/usb/host/ohci-tmio.c
index 5e3a6de..ecb09a5 100644
--- a/drivers/usb/host/ohci-tmio.c
+++ b/drivers/usb/host/ohci-tmio.c

@@ -287,8 +287,6 @@
 	iounmap(tmio->ccr);
 	usb_put_hcd(hcd);
 
-	platform_set_drvdata(dev, NULL);
-
 	return 0;
 }
 

diff --git a/drivers/usb/host/ohci.h b/drivers/usb/host/ohci.h
index d329914..e2e5faa 100644
--- a/drivers/usb/host/ohci.h
+++ b/drivers/usb/host/ohci.h

@@ -421,6 +421,9 @@
 	struct dentry		*debug_periodic;
 	struct dentry		*debug_registers;
 #endif
+	/* platform-specific data -- must come last */
+	unsigned long           priv[0] __aligned(sizeof(s64));
+
 };
 
 #ifdef CONFIG_PCI
@@ -718,3 +721,20 @@
 	{ return ohci_readl (hc, &hc->regs->roothub.status); }
 static inline u32 roothub_portstatus (struct ohci_hcd *hc, int i)
 	{ return read_roothub (hc, portstatus [i], 0xffe0fce0); }
+
+/* Declarations of things exported for use by ohci platform drivers */
+
+struct ohci_driver_overrides {
+	const char	*product_desc;
+	size_t		extra_priv_size;
+	int		(*reset)(struct usb_hcd *hcd);
+};
+
+extern void	ohci_init_driver(struct hc_driver *drv,
+				const struct ohci_driver_overrides *over);
+extern int	ohci_restart(struct ohci_hcd *ohci);
+extern int	ohci_setup(struct usb_hcd *hcd);
+#ifdef CONFIG_PM
+extern int	ohci_suspend(struct usb_hcd *hcd, bool do_wakeup);
+extern int	ohci_resume(struct usb_hcd *hcd, bool hibernated);
+#endif

diff --git a/drivers/usb/host/oxu210hp-hcd.c b/drivers/usb/host/oxu210hp-hcd.c
index 0f401db..4a6df2d 100644
--- a/drivers/usb/host/oxu210hp-hcd.c
+++ b/drivers/usb/host/oxu210hp-hcd.c

@@ -3874,7 +3874,6 @@
 
 error_init:
 	kfree(info);
-	platform_set_drvdata(pdev, NULL);
 
 error_alloc:
 	iounmap(base);
@@ -3907,7 +3906,6 @@
 	release_mem_region(memstart, memlen);
 
 	kfree(info);
-	platform_set_drvdata(pdev, NULL);
 
 	return 0;
 }

diff --git a/drivers/usb/host/pci-quirks.c b/drivers/usb/host/pci-quirks.c
index 4c338ec..b9848e4 100644
--- a/drivers/usb/host/pci-quirks.c
+++ b/drivers/usb/host/pci-quirks.c

@@ -91,6 +91,19 @@
 
 static DEFINE_SPINLOCK(amd_lock);
 
+void sb800_prefetch(struct device *dev, int on)
+{
+	u16 misc;
+	struct pci_dev *pdev = to_pci_dev(dev);
+
+	pci_read_config_word(pdev, 0x50, &misc);
+	if (on == 0)
+		pci_write_config_word(pdev, 0x50, misc & 0xfcff);
+	else
+		pci_write_config_word(pdev, 0x50, misc | 0x0300);
+}
+EXPORT_SYMBOL_GPL(sb800_prefetch);
+
 int usb_amd_find_chipset_info(void)
 {
 	u8 rev = 0;

diff --git a/drivers/usb/host/pci-quirks.h b/drivers/usb/host/pci-quirks.h
index 7f69a39..4b8a209 100644
--- a/drivers/usb/host/pci-quirks.h
+++ b/drivers/usb/host/pci-quirks.h

@@ -11,11 +11,13 @@
 bool usb_is_intel_switchable_xhci(struct pci_dev *pdev);
 void usb_enable_xhci_ports(struct pci_dev *xhci_pdev);
 void usb_disable_xhci_ports(struct pci_dev *xhci_pdev);
+void sb800_prefetch(struct device *dev, int on);
 #else
 static inline void usb_amd_quirk_pll_disable(void) {}
 static inline void usb_amd_quirk_pll_enable(void) {}
 static inline void usb_amd_dev_put(void) {}
 static inline void usb_disable_xhci_ports(struct pci_dev *xhci_pdev) {}
+static inline void sb800_prefetch(struct device *dev, int on) {}
 #endif  /* CONFIG_PCI */
 
 #endif  /*  __LINUX_USB_PCI_QUIRKS_H  */

diff --git a/drivers/usb/host/uhci-grlib.c b/drivers/usb/host/uhci-grlib.c
index 511bfc4..53c23ff 100644
--- a/drivers/usb/host/uhci-grlib.c
+++ b/drivers/usb/host/uhci-grlib.c

@@ -157,9 +157,7 @@
 
 static int uhci_hcd_grlib_remove(struct platform_device *op)
 {
-	struct usb_hcd *hcd = dev_get_drvdata(&op->dev);
-
-	dev_set_drvdata(&op->dev, NULL);
+	struct usb_hcd *hcd = platform_get_drvdata(op);
 
 	dev_dbg(&op->dev, "stopping GRLIB GRUSBHC UHCI USB Controller\n");
 
@@ -183,7 +181,7 @@
  */
 static void uhci_hcd_grlib_shutdown(struct platform_device *op)
 {
-	struct usb_hcd *hcd = dev_get_drvdata(&op->dev);
+	struct usb_hcd *hcd = platform_get_drvdata(op);
 
 	uhci_hc_died(hcd_to_uhci(hcd));
 }

diff --git a/drivers/usb/host/uhci-platform.c b/drivers/usb/host/uhci-platform.c
index f1db61a..d033a0ec 100644
--- a/drivers/usb/host/uhci-platform.c
+++ b/drivers/usb/host/uhci-platform.c

@@ -130,7 +130,6 @@
 	iounmap(hcd->regs);
 	release_mem_region(hcd->rsrc_start, hcd->rsrc_len);
 	usb_put_hcd(hcd);
-	platform_set_drvdata(pdev, NULL);
 
 	return 0;
 }
@@ -144,7 +143,7 @@
  */
 static void uhci_hcd_platform_shutdown(struct platform_device *op)
 {
-	struct usb_hcd *hcd = dev_get_drvdata(&op->dev);
+	struct usb_hcd *hcd = platform_get_drvdata(op);
 
 	uhci_hc_died(hcd_to_uhci(hcd));
 }
@@ -161,6 +160,6 @@
 	.driver = {
 		.name = "platform-uhci",
 		.owner = THIS_MODULE,
-		.of_match_table = of_match_ptr(platform_uhci_ids),
+		.of_match_table = platform_uhci_ids,
 	},
 };

diff --git a/drivers/usb/host/whci/hcd.c b/drivers/usb/host/whci/hcd.c
index c3a6478..ecc88db 100644
--- a/drivers/usb/host/whci/hcd.c
+++ b/drivers/usb/host/whci/hcd.c

@@ -231,8 +231,6 @@
 
 	.hub_status_data = wusbhc_rh_status_data,
 	.hub_control = wusbhc_rh_control,
-	.bus_suspend = wusbhc_rh_suspend,
-	.bus_resume = wusbhc_rh_resume,
 	.start_port_reset = wusbhc_rh_start_port_reset,
 };
 

diff --git a/drivers/usb/host/xhci-dbg.c b/drivers/usb/host/xhci-dbg.c
index 5f3a7c7..5d5e58f 100644
--- a/drivers/usb/host/xhci-dbg.c
+++ b/drivers/usb/host/xhci-dbg.c

@@ -503,11 +503,14 @@
 	if (last_ep < 31)
 		last_ep_ctx = last_ep + 1;
 	for (i = 0; i < last_ep_ctx; ++i) {
+		unsigned int epaddr = xhci_get_endpoint_address(i);
 		struct xhci_ep_ctx *ep_ctx = xhci_get_ep_ctx(xhci, ctx, i);
 		dma_addr_t dma = ctx->dma +
 			((unsigned long)ep_ctx - (unsigned long)ctx->bytes);
 
-		xhci_dbg(xhci, "Endpoint %02d Context:\n", i);
+		xhci_dbg(xhci, "%s Endpoint %02d Context (ep_index %02d):\n",
+				usb_endpoint_out(epaddr) ? "OUT" : "IN",
+				epaddr & USB_ENDPOINT_NUMBER_MASK, i);
 		xhci_dbg(xhci, "@%p (virt) @%08llx (dma) %#08x - ep_info\n",
 				&ep_ctx->ep_info,
 				(unsigned long long)dma, ep_ctx->ep_info);
@@ -550,6 +553,11 @@
 	if (ctx->type == XHCI_CTX_TYPE_INPUT) {
 		struct xhci_input_control_ctx *ctrl_ctx =
 			xhci_get_input_control_ctx(xhci, ctx);
+		if (!ctrl_ctx) {
+			xhci_warn(xhci, "Could not get input context, bad type.\n");
+			return;
+		}
+
 		xhci_dbg(xhci, "@%p (virt) @%08llx (dma) %#08x - drop flags\n",
 			 &ctrl_ctx->drop_flags, (unsigned long long)dma,
 			 ctrl_ctx->drop_flags);

diff --git a/drivers/usb/host/xhci-ext-caps.h b/drivers/usb/host/xhci-ext-caps.h
index 377f424..8d7a132 100644
--- a/drivers/usb/host/xhci-ext-caps.h
+++ b/drivers/usb/host/xhci-ext-caps.h

@@ -71,6 +71,7 @@
 
 /* USB 2.0 xHCI 1.0 hardware LMP capability - section 7.2.2.1.3.2 */
 #define XHCI_HLC               (1 << 19)
+#define XHCI_BLC               (1 << 20) /* BESL LPM Capability */
 
 /* command register values to disable interrupts and halt the HC */
 /* start/stop HC execution - do not write unless HC is halted*/

diff --git a/drivers/usb/host/xhci-hub.c b/drivers/usb/host/xhci-hub.c
index 187a3ec..1d35459 100644
--- a/drivers/usb/host/xhci-hub.c
+++ b/drivers/usb/host/xhci-hub.c

@@ -867,18 +867,18 @@
 		case USB_PORT_FEAT_U1_TIMEOUT:
 			if (hcd->speed != HCD_USB3)
 				goto error;
-			temp = xhci_readl(xhci, port_array[wIndex] + 1);
+			temp = xhci_readl(xhci, port_array[wIndex] + PORTPMSC);
 			temp &= ~PORT_U1_TIMEOUT_MASK;
 			temp |= PORT_U1_TIMEOUT(timeout);
-			xhci_writel(xhci, temp, port_array[wIndex] + 1);
+			xhci_writel(xhci, temp, port_array[wIndex] + PORTPMSC);
 			break;
 		case USB_PORT_FEAT_U2_TIMEOUT:
 			if (hcd->speed != HCD_USB3)
 				goto error;
-			temp = xhci_readl(xhci, port_array[wIndex] + 1);
+			temp = xhci_readl(xhci, port_array[wIndex] + PORTPMSC);
 			temp &= ~PORT_U2_TIMEOUT_MASK;
 			temp |= PORT_U2_TIMEOUT(timeout);
-			xhci_writel(xhci, temp, port_array[wIndex] + 1);
+			xhci_writel(xhci, temp, port_array[wIndex] + PORTPMSC);
 			break;
 		default:
 			goto error;
@@ -1098,10 +1098,8 @@
 			__le32 __iomem *addr;
 			u32 tmp;
 
-			/* Add one to the port status register address to get
-			 * the port power control register address.
-			 */
-			addr = port_array[port_index] + 1;
+			/* Get the port power control register address. */
+			addr = port_array[port_index] + PORTPMSC;
 			tmp = xhci_readl(xhci, addr);
 			tmp |= PORT_RWE;
 			xhci_writel(xhci, tmp, addr);
@@ -1193,7 +1191,7 @@
 			/* Add one to the port status register address to get
 			 * the port power control register address.
 			 */
-			addr = port_array[port_index] + 1;
+			addr = port_array[port_index] + PORTPMSC;
 			tmp = xhci_readl(xhci, addr);
 			tmp &= ~PORT_RWE;
 			xhci_writel(xhci, tmp, addr);

diff --git a/drivers/usb/host/xhci-mem.c b/drivers/usb/host/xhci-mem.c
index fbf75e5..df6978a 100644
--- a/drivers/usb/host/xhci-mem.c
+++ b/drivers/usb/host/xhci-mem.c

@@ -358,17 +358,25 @@
 static struct xhci_container_ctx *xhci_alloc_container_ctx(struct xhci_hcd *xhci,
 						    int type, gfp_t flags)
 {
-	struct xhci_container_ctx *ctx = kzalloc(sizeof(*ctx), flags);
+	struct xhci_container_ctx *ctx;
+
+	if ((type != XHCI_CTX_TYPE_DEVICE) && (type != XHCI_CTX_TYPE_INPUT))
+		return NULL;
+
+	ctx = kzalloc(sizeof(*ctx), flags);
 	if (!ctx)
 		return NULL;
 
-	BUG_ON((type != XHCI_CTX_TYPE_DEVICE) && (type != XHCI_CTX_TYPE_INPUT));
 	ctx->type = type;
 	ctx->size = HCC_64BYTE_CONTEXT(xhci->hcc_params) ? 2048 : 1024;
 	if (type == XHCI_CTX_TYPE_INPUT)
 		ctx->size += CTX_SIZE(xhci->hcc_params);
 
 	ctx->bytes = dma_pool_alloc(xhci->device_pool, flags, &ctx->dma);
+	if (!ctx->bytes) {
+		kfree(ctx);
+		return NULL;
+	}
 	memset(ctx->bytes, 0, ctx->size);
 	return ctx;
 }
@@ -385,7 +393,9 @@
 struct xhci_input_control_ctx *xhci_get_input_control_ctx(struct xhci_hcd *xhci,
 					      struct xhci_container_ctx *ctx)
 {
-	BUG_ON(ctx->type != XHCI_CTX_TYPE_INPUT);
+	if (ctx->type != XHCI_CTX_TYPE_INPUT)
+		return NULL;
+
 	return (struct xhci_input_control_ctx *)ctx->bytes;
 }
 
@@ -1049,6 +1059,7 @@
 	struct xhci_ep_ctx	*ep0_ctx;
 	struct xhci_slot_ctx    *slot_ctx;
 	u32			port_num;
+	u32			max_packets;
 	struct usb_device *top_dev;
 
 	dev = xhci->devs[udev->slot_id];
@@ -1066,15 +1077,20 @@
 	switch (udev->speed) {
 	case USB_SPEED_SUPER:
 		slot_ctx->dev_info |= cpu_to_le32(SLOT_SPEED_SS);
+		max_packets = MAX_PACKET(512);
 		break;
 	case USB_SPEED_HIGH:
 		slot_ctx->dev_info |= cpu_to_le32(SLOT_SPEED_HS);
+		max_packets = MAX_PACKET(64);
 		break;
+	/* USB core guesses at a 64-byte max packet first for FS devices */
 	case USB_SPEED_FULL:
 		slot_ctx->dev_info |= cpu_to_le32(SLOT_SPEED_FS);
+		max_packets = MAX_PACKET(64);
 		break;
 	case USB_SPEED_LOW:
 		slot_ctx->dev_info |= cpu_to_le32(SLOT_SPEED_LS);
+		max_packets = MAX_PACKET(8);
 		break;
 	case USB_SPEED_WIRELESS:
 		xhci_dbg(xhci, "FIXME xHCI doesn't support wireless speeds\n");
@@ -1082,7 +1098,7 @@
 		break;
 	default:
 		/* Speed was set earlier, this shouldn't happen. */
-		BUG();
+		return -EINVAL;
 	}
 	/* Find the root hub port this device is under */
 	port_num = xhci_find_real_port_number(xhci, udev);
@@ -1141,31 +1157,10 @@
 	/* Step 4 - ring already allocated */
 	/* Step 5 */
 	ep0_ctx->ep_info2 = cpu_to_le32(EP_TYPE(CTRL_EP));
-	/*
-	 * XXX: Not sure about wireless USB devices.
-	 */
-	switch (udev->speed) {
-	case USB_SPEED_SUPER:
-		ep0_ctx->ep_info2 |= cpu_to_le32(MAX_PACKET(512));
-		break;
-	case USB_SPEED_HIGH:
-	/* USB core guesses at a 64-byte max packet first for FS devices */
-	case USB_SPEED_FULL:
-		ep0_ctx->ep_info2 |= cpu_to_le32(MAX_PACKET(64));
-		break;
-	case USB_SPEED_LOW:
-		ep0_ctx->ep_info2 |= cpu_to_le32(MAX_PACKET(8));
-		break;
-	case USB_SPEED_WIRELESS:
-		xhci_dbg(xhci, "FIXME xHCI doesn't support wireless speeds\n");
-		return -EINVAL;
-		break;
-	default:
-		/* New speed? */
-		BUG();
-	}
+
 	/* EP 0 can handle "burst" sizes of 1, so Max Burst Size field is 0 */
-	ep0_ctx->ep_info2 |= cpu_to_le32(MAX_BURST(0) | ERROR_COUNT(3));
+	ep0_ctx->ep_info2 |= cpu_to_le32(MAX_BURST(0) | ERROR_COUNT(3) |
+					 max_packets);
 
 	ep0_ctx->deq = cpu_to_le64(dev->eps[0].ring->first_seg->dma |
 				   dev->eps[0].ring->cycle_state);
@@ -1338,7 +1333,7 @@
 		else
 			type = EP_TYPE(INT_OUT_EP);
 	} else {
-		BUG();
+		type = 0;
 	}
 	return type;
 }
@@ -1384,10 +1379,16 @@
 	unsigned int max_burst;
 	enum xhci_ring_type type;
 	u32 max_esit_payload;
+	u32 endpoint_type;
 
 	ep_index = xhci_get_endpoint_index(&ep->desc);
 	ep_ctx = xhci_get_ep_ctx(xhci, virt_dev->in_ctx, ep_index);
 
+	endpoint_type = xhci_get_endpoint_type(udev, ep);
+	if (!endpoint_type)
+		return -EINVAL;
+	ep_ctx->ep_info2 = cpu_to_le32(endpoint_type);
+
 	type = usb_endpoint_type(&ep->desc);
 	/* Set up the endpoint ring */
 	virt_dev->eps[ep_index].new_ring =
@@ -1416,11 +1417,9 @@
 	 * CErr shall be set to 0 for Isoch endpoints.
 	 */
 	if (!usb_endpoint_xfer_isoc(&ep->desc))
-		ep_ctx->ep_info2 = cpu_to_le32(ERROR_COUNT(3));
+		ep_ctx->ep_info2 |= cpu_to_le32(ERROR_COUNT(3));
 	else
-		ep_ctx->ep_info2 = cpu_to_le32(ERROR_COUNT(0));
-
-	ep_ctx->ep_info2 |= cpu_to_le32(xhci_get_endpoint_type(udev, ep));
+		ep_ctx->ep_info2 |= cpu_to_le32(ERROR_COUNT(0));
 
 	/* Set the max packet size and max burst */
 	max_packet = GET_MAX_PACKET(usb_endpoint_maxp(&ep->desc));
@@ -1856,6 +1855,7 @@
 	kfree(xhci->usb3_ports);
 	kfree(xhci->port_array);
 	kfree(xhci->rh_bw);
+	kfree(xhci->ext_caps);
 
 	xhci->page_size = 0;
 	xhci->page_shift = 0;
@@ -2043,7 +2043,7 @@
 }
 
 static void xhci_add_in_port(struct xhci_hcd *xhci, unsigned int num_ports,
-		__le32 __iomem *addr, u8 major_revision)
+		__le32 __iomem *addr, u8 major_revision, int max_caps)
 {
 	u32 temp, port_offset, port_count;
 	int i;
@@ -2068,6 +2068,10 @@
 		/* WTF? "Valid values are ‘1’ to MaxPorts" */
 		return;
 
+	/* cache usb2 port capabilities */
+	if (major_revision < 0x03 && xhci->num_ext_caps < max_caps)
+		xhci->ext_caps[xhci->num_ext_caps++] = temp;
+
 	/* Check the host's USB2 LPM capability */
 	if ((xhci->hci_version == 0x96) && (major_revision != 0x03) &&
 			(temp & XHCI_L1C)) {
@@ -2125,10 +2129,11 @@
  */
 static int xhci_setup_port_arrays(struct xhci_hcd *xhci, gfp_t flags)
 {
-	__le32 __iomem *addr;
-	u32 offset;
+	__le32 __iomem *addr, *tmp_addr;
+	u32 offset, tmp_offset;
 	unsigned int num_ports;
 	int i, j, port_index;
+	int cap_count = 0;
 
 	addr = &xhci->cap_regs->hcc_params;
 	offset = XHCI_HCC_EXT_CAPS(xhci_readl(xhci, addr));
@@ -2161,13 +2166,32 @@
 	 * See section 5.3.6 for offset calculation.
 	 */
 	addr = &xhci->cap_regs->hc_capbase + offset;
+
+	tmp_addr = addr;
+	tmp_offset = offset;
+
+	/* count extended protocol capability entries for later caching */
+	do {
+		u32 cap_id;
+		cap_id = xhci_readl(xhci, tmp_addr);
+		if (XHCI_EXT_CAPS_ID(cap_id) == XHCI_EXT_CAPS_PROTOCOL)
+			cap_count++;
+		tmp_offset = XHCI_EXT_CAPS_NEXT(cap_id);
+		tmp_addr += tmp_offset;
+	} while (tmp_offset);
+
+	xhci->ext_caps = kzalloc(sizeof(*xhci->ext_caps) * cap_count, flags);
+	if (!xhci->ext_caps)
+		return -ENOMEM;
+
 	while (1) {
 		u32 cap_id;
 
 		cap_id = xhci_readl(xhci, addr);
 		if (XHCI_EXT_CAPS_ID(cap_id) == XHCI_EXT_CAPS_PROTOCOL)
 			xhci_add_in_port(xhci, num_ports, addr,
-					(u8) XHCI_EXT_PORT_MAJOR(cap_id));
+					(u8) XHCI_EXT_PORT_MAJOR(cap_id),
+					cap_count);
 		offset = XHCI_EXT_CAPS_NEXT(cap_id);
 		if (!offset || (xhci->num_usb2_ports + xhci->num_usb3_ports)
 				== num_ports)

diff --git a/drivers/usb/host/xhci-plat.c b/drivers/usb/host/xhci-plat.c
index df90fe5..51e22bf 100644
--- a/drivers/usb/host/xhci-plat.c
+++ b/drivers/usb/host/xhci-plat.c

@@ -130,7 +130,7 @@
 		goto unmap_registers;
 
 	/* USB 2.0 roothub is stored in the platform_device now. */
-	hcd = dev_get_drvdata(&pdev->dev);
+	hcd = platform_get_drvdata(pdev);
 	xhci = hcd_to_xhci(hcd);
 	xhci->shared_hcd = usb_create_shared_hcd(driver, &pdev->dev,
 			dev_name(&pdev->dev), hcd);
@@ -179,6 +179,7 @@
 
 	usb_remove_hcd(hcd);
 	iounmap(hcd->regs);
+	release_mem_region(hcd->rsrc_start, hcd->rsrc_len);
 	usb_put_hcd(hcd);
 	kfree(xhci);
 

diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c
index 1969c00..1e57eaf 100644
--- a/drivers/usb/host/xhci-ring.c
+++ b/drivers/usb/host/xhci-ring.c

@@ -1424,6 +1424,10 @@
 		 */
 		ctrl_ctx = xhci_get_input_control_ctx(xhci,
 				virt_dev->in_ctx);
+		if (!ctrl_ctx) {
+			xhci_warn(xhci, "Could not get input context, bad type.\n");
+			break;
+		}
 		/* Input ctx add_flags are the endpoint index plus one */
 		ep_index = xhci_last_valid_endpoint(le32_to_cpu(ctrl_ctx->add_flags)) - 1;
 		/* A usb_set_interface() call directly after clearing a halted
@@ -2799,7 +2803,7 @@
 	return IRQ_HANDLED;
 }
 
-irqreturn_t xhci_msi_irq(int irq, struct usb_hcd *hcd)
+irqreturn_t xhci_msi_irq(int irq, void *hcd)
 {
 	return xhci_irq(hcd);
 }

diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c
index d8f640b..2c49f00 100644
--- a/drivers/usb/host/xhci.c
+++ b/drivers/usb/host/xhci.c

@@ -218,7 +218,7 @@
 		return ret;
 	}
 
-	ret = request_irq(pdev->irq, (irq_handler_t)xhci_msi_irq,
+	ret = request_irq(pdev->irq, xhci_msi_irq,
 				0, "xhci_hcd", xhci_to_hcd(xhci));
 	if (ret) {
 		xhci_dbg(xhci, "disable MSI interrupt\n");
@@ -290,7 +290,7 @@
 
 	for (i = 0; i < xhci->msix_count; i++) {
 		ret = request_irq(xhci->msix_entries[i].vector,
-				(irq_handler_t)xhci_msi_irq,
+				xhci_msi_irq,
 				0, "xhci_hcd", xhci_to_hcd(xhci));
 		if (ret)
 			goto disable_msix;
@@ -1121,6 +1121,16 @@
 	return index;
 }
 
+/* The reverse operation to xhci_get_endpoint_index. Calculate the USB endpoint
+ * address from the XHCI endpoint index.
+ */
+unsigned int xhci_get_endpoint_address(unsigned int ep_index)
+{
+	unsigned int number = DIV_ROUND_UP(ep_index, 2);
+	unsigned int direction = ep_index % 2 ? USB_DIR_OUT : USB_DIR_IN;
+	return direction | number;
+}
+
 /* Find the flag for this endpoint (for use in the control context).  Use the
  * endpoint index to create a bitmask.  The slot context is bit 0, endpoint 0 is
  * bit 1, etc.
@@ -1225,19 +1235,25 @@
 				hw_max_packet_size);
 		xhci_dbg(xhci, "Issuing evaluate context command.\n");
 
-		/* Set up the modified control endpoint 0 */
-		xhci_endpoint_copy(xhci, xhci->devs[slot_id]->in_ctx,
-				xhci->devs[slot_id]->out_ctx, ep_index);
-		in_ctx = xhci->devs[slot_id]->in_ctx;
-		ep_ctx = xhci_get_ep_ctx(xhci, in_ctx, ep_index);
-		ep_ctx->ep_info2 &= cpu_to_le32(~MAX_PACKET_MASK);
-		ep_ctx->ep_info2 |= cpu_to_le32(MAX_PACKET(max_packet_size));
-
 		/* Set up the input context flags for the command */
 		/* FIXME: This won't work if a non-default control endpoint
 		 * changes max packet sizes.
 		 */
+		in_ctx = xhci->devs[slot_id]->in_ctx;
 		ctrl_ctx = xhci_get_input_control_ctx(xhci, in_ctx);
+		if (!ctrl_ctx) {
+			xhci_warn(xhci, "%s: Could not get input context, bad type.\n",
+					__func__);
+			return -ENOMEM;
+		}
+		/* Set up the modified control endpoint 0 */
+		xhci_endpoint_copy(xhci, xhci->devs[slot_id]->in_ctx,
+				xhci->devs[slot_id]->out_ctx, ep_index);
+
+		ep_ctx = xhci_get_ep_ctx(xhci, in_ctx, ep_index);
+		ep_ctx->ep_info2 &= cpu_to_le32(~MAX_PACKET_MASK);
+		ep_ctx->ep_info2 |= cpu_to_le32(MAX_PACKET(max_packet_size));
+
 		ctrl_ctx->add_flags = cpu_to_le32(EP0_FLAG);
 		ctrl_ctx->drop_flags = 0;
 
@@ -1597,6 +1613,12 @@
 	in_ctx = xhci->devs[udev->slot_id]->in_ctx;
 	out_ctx = xhci->devs[udev->slot_id]->out_ctx;
 	ctrl_ctx = xhci_get_input_control_ctx(xhci, in_ctx);
+	if (!ctrl_ctx) {
+		xhci_warn(xhci, "%s: Could not get input context, bad type.\n",
+				__func__);
+		return 0;
+	}
+
 	ep_index = xhci_get_endpoint_index(&ep->desc);
 	ep_ctx = xhci_get_ep_ctx(xhci, out_ctx, ep_index);
 	/* If the HC already knows the endpoint is disabled,
@@ -1691,8 +1713,13 @@
 	in_ctx = virt_dev->in_ctx;
 	out_ctx = virt_dev->out_ctx;
 	ctrl_ctx = xhci_get_input_control_ctx(xhci, in_ctx);
-	ep_index = xhci_get_endpoint_index(&ep->desc);
+	if (!ctrl_ctx) {
+		xhci_warn(xhci, "%s: Could not get input context, bad type.\n",
+				__func__);
+		return 0;
+	}
 
+	ep_index = xhci_get_endpoint_index(&ep->desc);
 	/* If this endpoint is already in use, and the upper layers are trying
 	 * to add it again without dropping it, reject the addition.
 	 */
@@ -1765,12 +1792,18 @@
 	struct xhci_slot_ctx *slot_ctx;
 	int i;
 
+	ctrl_ctx = xhci_get_input_control_ctx(xhci, virt_dev->in_ctx);
+	if (!ctrl_ctx) {
+		xhci_warn(xhci, "%s: Could not get input context, bad type.\n",
+				__func__);
+		return;
+	}
+
 	/* When a device's add flag and drop flag are zero, any subsequent
 	 * configure endpoint command will leave that endpoint's state
 	 * untouched.  Make sure we don't leave any old state in the input
 	 * endpoint contexts.
 	 */
-	ctrl_ctx = xhci_get_input_control_ctx(xhci, virt_dev->in_ctx);
 	ctrl_ctx->drop_flags = 0;
 	ctrl_ctx->add_flags = 0;
 	slot_ctx = xhci_get_slot_ctx(xhci, virt_dev->in_ctx);
@@ -1877,13 +1910,11 @@
 }
 
 static u32 xhci_count_num_new_endpoints(struct xhci_hcd *xhci,
-		struct xhci_container_ctx *in_ctx)
+		struct xhci_input_control_ctx *ctrl_ctx)
 {
-	struct xhci_input_control_ctx *ctrl_ctx;
 	u32 valid_add_flags;
 	u32 valid_drop_flags;
 
-	ctrl_ctx = xhci_get_input_control_ctx(xhci, in_ctx);
 	/* Ignore the slot flag (bit 0), and the default control endpoint flag
 	 * (bit 1).  The default control endpoint is added during the Address
 	 * Device command and is never removed until the slot is disabled.
@@ -1900,13 +1931,11 @@
 }
 
 static unsigned int xhci_count_num_dropped_endpoints(struct xhci_hcd *xhci,
-		struct xhci_container_ctx *in_ctx)
+		struct xhci_input_control_ctx *ctrl_ctx)
 {
-	struct xhci_input_control_ctx *ctrl_ctx;
 	u32 valid_add_flags;
 	u32 valid_drop_flags;
 
-	ctrl_ctx = xhci_get_input_control_ctx(xhci, in_ctx);
 	valid_add_flags = ctrl_ctx->add_flags >> 2;
 	valid_drop_flags = ctrl_ctx->drop_flags >> 2;
 
@@ -1928,11 +1957,11 @@
  * Must be called with xhci->lock held.
  */
 static int xhci_reserve_host_resources(struct xhci_hcd *xhci,
-		struct xhci_container_ctx *in_ctx)
+		struct xhci_input_control_ctx *ctrl_ctx)
 {
 	u32 added_eps;
 
-	added_eps = xhci_count_num_new_endpoints(xhci, in_ctx);
+	added_eps = xhci_count_num_new_endpoints(xhci, ctrl_ctx);
 	if (xhci->num_active_eps + added_eps > xhci->limit_active_eps) {
 		xhci_dbg(xhci, "Not enough ep ctxs: "
 				"%u active, need to add %u, limit is %u.\n",
@@ -1953,11 +1982,11 @@
  * Must be called with xhci->lock held.
  */
 static void xhci_free_host_resources(struct xhci_hcd *xhci,
-		struct xhci_container_ctx *in_ctx)
+		struct xhci_input_control_ctx *ctrl_ctx)
 {
 	u32 num_failed_eps;
 
-	num_failed_eps = xhci_count_num_new_endpoints(xhci, in_ctx);
+	num_failed_eps = xhci_count_num_new_endpoints(xhci, ctrl_ctx);
 	xhci->num_active_eps -= num_failed_eps;
 	xhci_dbg(xhci, "Removing %u failed ep ctxs, %u now active.\n",
 			num_failed_eps,
@@ -1971,11 +2000,11 @@
  * Must be called with xhci->lock held.
  */
 static void xhci_finish_resource_reservation(struct xhci_hcd *xhci,
-		struct xhci_container_ctx *in_ctx)
+		struct xhci_input_control_ctx *ctrl_ctx)
 {
 	u32 num_dropped_eps;
 
-	num_dropped_eps = xhci_count_num_dropped_endpoints(xhci, in_ctx);
+	num_dropped_eps = xhci_count_num_dropped_endpoints(xhci, ctrl_ctx);
 	xhci->num_active_eps -= num_dropped_eps;
 	if (num_dropped_eps)
 		xhci_dbg(xhci, "Removing %u dropped ep ctxs, %u now active.\n",
@@ -2470,6 +2499,11 @@
 		old_active_eps = virt_dev->tt_info->active_eps;
 
 	ctrl_ctx = xhci_get_input_control_ctx(xhci, in_ctx);
+	if (!ctrl_ctx) {
+		xhci_warn(xhci, "%s: Could not get input context, bad type.\n",
+				__func__);
+		return -ENOMEM;
+	}
 
 	for (i = 0; i < 31; i++) {
 		if (!EP_IS_ADDED(ctrl_ctx, i) && !EP_IS_DROPPED(ctrl_ctx, i))
@@ -2554,6 +2588,7 @@
 	int timeleft;
 	unsigned long flags;
 	struct xhci_container_ctx *in_ctx;
+	struct xhci_input_control_ctx *ctrl_ctx;
 	struct completion *cmd_completion;
 	u32 *cmd_status;
 	struct xhci_virt_device *virt_dev;
@@ -2566,9 +2601,16 @@
 		in_ctx = command->in_ctx;
 	else
 		in_ctx = virt_dev->in_ctx;
+	ctrl_ctx = xhci_get_input_control_ctx(xhci, in_ctx);
+	if (!ctrl_ctx) {
+		spin_unlock_irqrestore(&xhci->lock, flags);
+		xhci_warn(xhci, "%s: Could not get input context, bad type.\n",
+				__func__);
+		return -ENOMEM;
+	}
 
 	if ((xhci->quirks & XHCI_EP_LIMIT_QUIRK) &&
-			xhci_reserve_host_resources(xhci, in_ctx)) {
+			xhci_reserve_host_resources(xhci, ctrl_ctx)) {
 		spin_unlock_irqrestore(&xhci->lock, flags);
 		xhci_warn(xhci, "Not enough host resources, "
 				"active endpoint contexts = %u\n",
@@ -2578,7 +2620,7 @@
 	if ((xhci->quirks & XHCI_SW_BW_CHECKING) &&
 			xhci_reserve_bandwidth(xhci, virt_dev, in_ctx)) {
 		if ((xhci->quirks & XHCI_EP_LIMIT_QUIRK))
-			xhci_free_host_resources(xhci, in_ctx);
+			xhci_free_host_resources(xhci, ctrl_ctx);
 		spin_unlock_irqrestore(&xhci->lock, flags);
 		xhci_warn(xhci, "Not enough bandwidth\n");
 		return -ENOMEM;
@@ -2614,7 +2656,7 @@
 		if (command)
 			list_del(&command->cmd_list);
 		if ((xhci->quirks & XHCI_EP_LIMIT_QUIRK))
-			xhci_free_host_resources(xhci, in_ctx);
+			xhci_free_host_resources(xhci, ctrl_ctx);
 		spin_unlock_irqrestore(&xhci->lock, flags);
 		xhci_dbg(xhci, "FIXME allocate a new ring segment\n");
 		return -ENOMEM;
@@ -2650,9 +2692,9 @@
 		 * Otherwise, clean up the estimate to include dropped eps.
 		 */
 		if (ret)
-			xhci_free_host_resources(xhci, in_ctx);
+			xhci_free_host_resources(xhci, ctrl_ctx);
 		else
-			xhci_finish_resource_reservation(xhci, in_ctx);
+			xhci_finish_resource_reservation(xhci, ctrl_ctx);
 		spin_unlock_irqrestore(&xhci->lock, flags);
 	}
 	return ret;
@@ -2689,6 +2731,11 @@
 
 	/* See section 4.6.6 - A0 = 1; A1 = D0 = D1 = 0 */
 	ctrl_ctx = xhci_get_input_control_ctx(xhci, virt_dev->in_ctx);
+	if (!ctrl_ctx) {
+		xhci_warn(xhci, "%s: Could not get input context, bad type.\n",
+				__func__);
+		return -ENOMEM;
+	}
 	ctrl_ctx->add_flags |= cpu_to_le32(SLOT_FLAG);
 	ctrl_ctx->add_flags &= cpu_to_le32(~EP0_FLAG);
 	ctrl_ctx->drop_flags &= cpu_to_le32(~(SLOT_FLAG | EP0_FLAG));
@@ -2767,10 +2814,9 @@
 static void xhci_setup_input_ctx_for_config_ep(struct xhci_hcd *xhci,
 		struct xhci_container_ctx *in_ctx,
 		struct xhci_container_ctx *out_ctx,
+		struct xhci_input_control_ctx *ctrl_ctx,
 		u32 add_flags, u32 drop_flags)
 {
-	struct xhci_input_control_ctx *ctrl_ctx;
-	ctrl_ctx = xhci_get_input_control_ctx(xhci, in_ctx);
 	ctrl_ctx->add_flags = cpu_to_le32(add_flags);
 	ctrl_ctx->drop_flags = cpu_to_le32(drop_flags);
 	xhci_slot_copy(xhci, in_ctx, out_ctx);
@@ -2784,14 +2830,22 @@
 		unsigned int slot_id, unsigned int ep_index,
 		struct xhci_dequeue_state *deq_state)
 {
+	struct xhci_input_control_ctx *ctrl_ctx;
 	struct xhci_container_ctx *in_ctx;
 	struct xhci_ep_ctx *ep_ctx;
 	u32 added_ctxs;
 	dma_addr_t addr;
 
+	in_ctx = xhci->devs[slot_id]->in_ctx;
+	ctrl_ctx = xhci_get_input_control_ctx(xhci, in_ctx);
+	if (!ctrl_ctx) {
+		xhci_warn(xhci, "%s: Could not get input context, bad type.\n",
+				__func__);
+		return;
+	}
+
 	xhci_endpoint_copy(xhci, xhci->devs[slot_id]->in_ctx,
 			xhci->devs[slot_id]->out_ctx, ep_index);
-	in_ctx = xhci->devs[slot_id]->in_ctx;
 	ep_ctx = xhci_get_ep_ctx(xhci, in_ctx, ep_index);
 	addr = xhci_trb_virt_to_dma(deq_state->new_deq_seg,
 			deq_state->new_deq_ptr);
@@ -2807,7 +2861,8 @@
 
 	added_ctxs = xhci_get_endpoint_flag_from_index(ep_index);
 	xhci_setup_input_ctx_for_config_ep(xhci, xhci->devs[slot_id]->in_ctx,
-			xhci->devs[slot_id]->out_ctx, added_ctxs, added_ctxs);
+			xhci->devs[slot_id]->out_ctx, ctrl_ctx,
+			added_ctxs, added_ctxs);
 }
 
 void xhci_cleanup_stalled_ring(struct xhci_hcd *xhci,
@@ -3065,6 +3120,7 @@
 	struct xhci_hcd *xhci;
 	struct xhci_virt_device *vdev;
 	struct xhci_command *config_cmd;
+	struct xhci_input_control_ctx *ctrl_ctx;
 	unsigned int ep_index;
 	unsigned int num_stream_ctxs;
 	unsigned long flags;
@@ -3086,6 +3142,13 @@
 		xhci_dbg(xhci, "Could not allocate xHCI command structure.\n");
 		return -ENOMEM;
 	}
+	ctrl_ctx = xhci_get_input_control_ctx(xhci, config_cmd->in_ctx);
+	if (!ctrl_ctx) {
+		xhci_warn(xhci, "%s: Could not get input context, bad type.\n",
+				__func__);
+		xhci_free_command(xhci, config_cmd);
+		return -ENOMEM;
+	}
 
 	/* Check to make sure all endpoints are not already configured for
 	 * streams.  While we're at it, find the maximum number of streams that
@@ -3152,7 +3215,8 @@
 	 * and add the updated copy from the input context.
 	 */
 	xhci_setup_input_ctx_for_config_ep(xhci, config_cmd->in_ctx,
-			vdev->out_ctx, changed_ep_bitmask, changed_ep_bitmask);
+			vdev->out_ctx, ctrl_ctx,
+			changed_ep_bitmask, changed_ep_bitmask);
 
 	/* Issue and wait for the configure endpoint command */
 	ret = xhci_configure_endpoint(xhci, udev, config_cmd,
@@ -3210,6 +3274,7 @@
 	struct xhci_hcd *xhci;
 	struct xhci_virt_device *vdev;
 	struct xhci_command *command;
+	struct xhci_input_control_ctx *ctrl_ctx;
 	unsigned int ep_index;
 	unsigned long flags;
 	u32 changed_ep_bitmask;
@@ -3232,6 +3297,14 @@
 	 */
 	ep_index = xhci_get_endpoint_index(&eps[0]->desc);
 	command = vdev->eps[ep_index].stream_info->free_streams_command;
+	ctrl_ctx = xhci_get_input_control_ctx(xhci, command->in_ctx);
+	if (!ctrl_ctx) {
+		spin_unlock_irqrestore(&xhci->lock, flags);
+		xhci_warn(xhci, "%s: Could not get input context, bad type.\n",
+				__func__);
+		return -EINVAL;
+	}
+
 	for (i = 0; i < num_eps; i++) {
 		struct xhci_ep_ctx *ep_ctx;
 
@@ -3246,7 +3319,8 @@
 				&vdev->eps[ep_index]);
 	}
 	xhci_setup_input_ctx_for_config_ep(xhci, command->in_ctx,
-			vdev->out_ctx, changed_ep_bitmask, changed_ep_bitmask);
+			vdev->out_ctx, ctrl_ctx,
+			changed_ep_bitmask, changed_ep_bitmask);
 	spin_unlock_irqrestore(&xhci->lock, flags);
 
 	/* Issue and wait for the configure endpoint command,
@@ -3686,6 +3760,12 @@
 	}
 
 	slot_ctx = xhci_get_slot_ctx(xhci, virt_dev->in_ctx);
+	ctrl_ctx = xhci_get_input_control_ctx(xhci, virt_dev->in_ctx);
+	if (!ctrl_ctx) {
+		xhci_warn(xhci, "%s: Could not get input context, bad type.\n",
+				__func__);
+		return -EINVAL;
+	}
 	/*
 	 * If this is the first Set Address since device plug-in or
 	 * virt_device realloaction after a resume with an xHCI power loss,
@@ -3696,7 +3776,6 @@
 	/* Otherwise, update the control endpoint ring enqueue pointer. */
 	else
 		xhci_copy_ep0_dequeue_into_input_ctx(xhci, udev);
-	ctrl_ctx = xhci_get_input_control_ctx(xhci, virt_dev->in_ctx);
 	ctrl_ctx->add_flags = cpu_to_le32(SLOT_FLAG | EP0_FLAG);
 	ctrl_ctx->drop_flags = 0;
 
@@ -3815,6 +3894,63 @@
 	return raw_port;
 }
 
+/*
+ * Issue an Evaluate Context command to change the Maximum Exit Latency in the
+ * slot context.  If that succeeds, store the new MEL in the xhci_virt_device.
+ */
+static int xhci_change_max_exit_latency(struct xhci_hcd *xhci,
+			struct usb_device *udev, u16 max_exit_latency)
+{
+	struct xhci_virt_device *virt_dev;
+	struct xhci_command *command;
+	struct xhci_input_control_ctx *ctrl_ctx;
+	struct xhci_slot_ctx *slot_ctx;
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&xhci->lock, flags);
+	if (max_exit_latency == xhci->devs[udev->slot_id]->current_mel) {
+		spin_unlock_irqrestore(&xhci->lock, flags);
+		return 0;
+	}
+
+	/* Attempt to issue an Evaluate Context command to change the MEL. */
+	virt_dev = xhci->devs[udev->slot_id];
+	command = xhci->lpm_command;
+	ctrl_ctx = xhci_get_input_control_ctx(xhci, command->in_ctx);
+	if (!ctrl_ctx) {
+		spin_unlock_irqrestore(&xhci->lock, flags);
+		xhci_warn(xhci, "%s: Could not get input context, bad type.\n",
+				__func__);
+		return -ENOMEM;
+	}
+
+	xhci_slot_copy(xhci, command->in_ctx, virt_dev->out_ctx);
+	spin_unlock_irqrestore(&xhci->lock, flags);
+
+	ctrl_ctx->add_flags |= cpu_to_le32(SLOT_FLAG);
+	slot_ctx = xhci_get_slot_ctx(xhci, command->in_ctx);
+	slot_ctx->dev_info2 &= cpu_to_le32(~((u32) MAX_EXIT));
+	slot_ctx->dev_info2 |= cpu_to_le32(max_exit_latency);
+
+	xhci_dbg(xhci, "Set up evaluate context for LPM MEL change.\n");
+	xhci_dbg(xhci, "Slot %u Input Context:\n", udev->slot_id);
+	xhci_dbg_ctx(xhci, command->in_ctx, 0);
+
+	/* Issue and wait for the evaluate context command. */
+	ret = xhci_configure_endpoint(xhci, udev, command,
+			true, true);
+	xhci_dbg(xhci, "Slot %u Output Context:\n", udev->slot_id);
+	xhci_dbg_ctx(xhci, virt_dev->out_ctx, 0);
+
+	if (!ret) {
+		spin_lock_irqsave(&xhci->lock, flags);
+		virt_dev->current_mel = max_exit_latency;
+		spin_unlock_irqrestore(&xhci->lock, flags);
+	}
+	return ret;
+}
+
 #ifdef CONFIG_PM_RUNTIME
 
 /* BESL to HIRD Encoding array for USB2 LPM */
@@ -3856,6 +3992,28 @@
 	return besl;
 }
 
+/* Calculate BESLD, L1 timeout and HIRDM for USB2 PORTHLPMC */
+static int xhci_calculate_usb2_hw_lpm_params(struct usb_device *udev)
+{
+	u32 field;
+	int l1;
+	int besld = 0;
+	int hirdm = 0;
+
+	field = le32_to_cpu(udev->bos->ext_cap->bmAttributes);
+
+	/* xHCI l1 is set in steps of 256us, xHCI 1.0 section 5.4.11.2 */
+	l1 = udev->l1_params.timeout / 256;
+
+	/* device has preferred BESLD */
+	if (field & USB_BESL_DEEP_VALID) {
+		besld = USB_GET_BESL_DEEP(field);
+		hirdm = 1;
+	}
+
+	return PORT_BESLD(besld) | PORT_L1_TIMEOUT(l1) | PORT_HIRDM(hirdm);
+}
+
 static int xhci_usb2_software_lpm_test(struct usb_hcd *hcd,
 					struct usb_device *udev)
 {
@@ -3911,7 +4069,7 @@
 	 * Check device's USB 2.0 extension descriptor to determine whether
 	 * HIRD or BESL shoule be used. See USB2.0 LPM errata.
 	 */
-	pm_addr = port_array[port_num] + 1;
+	pm_addr = port_array[port_num] + PORTPMSC;
 	hird = xhci_calculate_hird_besl(xhci, udev);
 	temp = PORT_L1DS(udev->slot_id) | PORT_HIRD(hird);
 	xhci_writel(xhci, temp, pm_addr);
@@ -3988,11 +4146,12 @@
 {
 	struct xhci_hcd	*xhci = hcd_to_xhci(hcd);
 	__le32 __iomem	**port_array;
-	__le32 __iomem	*pm_addr;
-	u32		temp;
+	__le32 __iomem	*pm_addr, *hlpm_addr;
+	u32		pm_val, hlpm_val, field;
 	unsigned int	port_num;
 	unsigned long	flags;
-	int		hird;
+	int		hird, exit_latency;
+	int		ret;
 
 	if (hcd->speed == HCD_USB3 || !xhci->hw_lpm_support ||
 			!udev->lpm_capable)
@@ -4009,40 +4168,120 @@
 
 	port_array = xhci->usb2_ports;
 	port_num = udev->portnum - 1;
-	pm_addr = port_array[port_num] + 1;
-	temp = xhci_readl(xhci, pm_addr);
+	pm_addr = port_array[port_num] + PORTPMSC;
+	pm_val = xhci_readl(xhci, pm_addr);
+	hlpm_addr = port_array[port_num] + PORTHLPMC;
+	field = le32_to_cpu(udev->bos->ext_cap->bmAttributes);
 
 	xhci_dbg(xhci, "%s port %d USB2 hardware LPM\n",
 			enable ? "enable" : "disable", port_num);
 
-	hird = xhci_calculate_hird_besl(xhci, udev);
-
 	if (enable) {
-		temp &= ~PORT_HIRD_MASK;
-		temp |= PORT_HIRD(hird) | PORT_RWE;
-		xhci_writel(xhci, temp, pm_addr);
-		temp = xhci_readl(xhci, pm_addr);
-		temp |= PORT_HLE;
-		xhci_writel(xhci, temp, pm_addr);
+		/* Host supports BESL timeout instead of HIRD */
+		if (udev->usb2_hw_lpm_besl_capable) {
+			/* if device doesn't have a preferred BESL value use a
+			 * default one which works with mixed HIRD and BESL
+			 * systems. See XHCI_DEFAULT_BESL definition in xhci.h
+			 */
+			if ((field & USB_BESL_SUPPORT) &&
+			    (field & USB_BESL_BASELINE_VALID))
+				hird = USB_GET_BESL_BASELINE(field);
+			else
+				hird = udev->l1_params.besl;
+
+			exit_latency = xhci_besl_encoding[hird];
+			spin_unlock_irqrestore(&xhci->lock, flags);
+
+			/* USB 3.0 code dedicates one xhci->lpm_command->in_ctx
+			 * input context for link power management evaluate
+			 * context commands. It is protected by hcd->bandwidth
+			 * mutex and is shared by all devices. We need to set
+			 * the max exit latency in USB 2 BESL LPM as well, so
+			 * use the same mutex and xhci_change_max_exit_latency()
+			 */
+			mutex_lock(hcd->bandwidth_mutex);
+			ret = xhci_change_max_exit_latency(xhci, udev,
+							   exit_latency);
+			mutex_unlock(hcd->bandwidth_mutex);
+
+			if (ret < 0)
+				return ret;
+			spin_lock_irqsave(&xhci->lock, flags);
+
+			hlpm_val = xhci_calculate_usb2_hw_lpm_params(udev);
+			xhci_writel(xhci, hlpm_val, hlpm_addr);
+			/* flush write */
+			xhci_readl(xhci, hlpm_addr);
+		} else {
+			hird = xhci_calculate_hird_besl(xhci, udev);
+		}
+
+		pm_val &= ~PORT_HIRD_MASK;
+		pm_val |= PORT_HIRD(hird) | PORT_RWE;
+		xhci_writel(xhci, pm_val, pm_addr);
+		pm_val = xhci_readl(xhci, pm_addr);
+		pm_val |= PORT_HLE;
+		xhci_writel(xhci, pm_val, pm_addr);
+		/* flush write */
+		xhci_readl(xhci, pm_addr);
 	} else {
-		temp &= ~(PORT_HLE | PORT_RWE | PORT_HIRD_MASK);
-		xhci_writel(xhci, temp, pm_addr);
+		pm_val &= ~(PORT_HLE | PORT_RWE | PORT_HIRD_MASK);
+		xhci_writel(xhci, pm_val, pm_addr);
+		/* flush write */
+		xhci_readl(xhci, pm_addr);
+		if (udev->usb2_hw_lpm_besl_capable) {
+			spin_unlock_irqrestore(&xhci->lock, flags);
+			mutex_lock(hcd->bandwidth_mutex);
+			xhci_change_max_exit_latency(xhci, udev, 0);
+			mutex_unlock(hcd->bandwidth_mutex);
+			return 0;
+		}
 	}
 
 	spin_unlock_irqrestore(&xhci->lock, flags);
 	return 0;
 }
 
+/* check if a usb2 port supports a given extended capability protocol
+ * only USB2 ports extended protocol capability values are cached.
+ * Return 1 if capability is supported
+ */
+static int xhci_check_usb2_port_capability(struct xhci_hcd *xhci, int port,
+					   unsigned capability)
+{
+	u32 port_offset, port_count;
+	int i;
+
+	for (i = 0; i < xhci->num_ext_caps; i++) {
+		if (xhci->ext_caps[i] & capability) {
+			/* port offsets starts at 1 */
+			port_offset = XHCI_EXT_PORT_OFF(xhci->ext_caps[i]) - 1;
+			port_count = XHCI_EXT_PORT_COUNT(xhci->ext_caps[i]);
+			if (port >= port_offset &&
+			    port < port_offset + port_count)
+				return 1;
+		}
+	}
+	return 0;
+}
+
 int xhci_update_device(struct usb_hcd *hcd, struct usb_device *udev)
 {
 	struct xhci_hcd	*xhci = hcd_to_xhci(hcd);
 	int		ret;
+	int		portnum = udev->portnum - 1;
 
 	ret = xhci_usb2_software_lpm_test(hcd, udev);
 	if (!ret) {
 		xhci_dbg(xhci, "software LPM test succeed\n");
-		if (xhci->hw_lpm_support == 1) {
+		if (xhci->hw_lpm_support == 1 &&
+		    xhci_check_usb2_port_capability(xhci, portnum, XHCI_HLC)) {
 			udev->usb2_hw_lpm_capable = 1;
+			udev->l1_params.timeout = XHCI_L1_TIMEOUT;
+			udev->l1_params.besl = XHCI_DEFAULT_BESL;
+			if (xhci_check_usb2_port_capability(xhci, portnum,
+							    XHCI_BLC))
+				udev->usb2_hw_lpm_besl_capable = 1;
 			ret = xhci_set_usb2_hardware_lpm(hcd, udev, 1);
 			if (!ret)
 				udev->usb2_hw_lpm_enabled = 1;
@@ -4373,56 +4612,6 @@
 	return timeout;
 }
 
-/*
- * Issue an Evaluate Context command to change the Maximum Exit Latency in the
- * slot context.  If that succeeds, store the new MEL in the xhci_virt_device.
- */
-static int xhci_change_max_exit_latency(struct xhci_hcd *xhci,
-			struct usb_device *udev, u16 max_exit_latency)
-{
-	struct xhci_virt_device *virt_dev;
-	struct xhci_command *command;
-	struct xhci_input_control_ctx *ctrl_ctx;
-	struct xhci_slot_ctx *slot_ctx;
-	unsigned long flags;
-	int ret;
-
-	spin_lock_irqsave(&xhci->lock, flags);
-	if (max_exit_latency == xhci->devs[udev->slot_id]->current_mel) {
-		spin_unlock_irqrestore(&xhci->lock, flags);
-		return 0;
-	}
-
-	/* Attempt to issue an Evaluate Context command to change the MEL. */
-	virt_dev = xhci->devs[udev->slot_id];
-	command = xhci->lpm_command;
-	xhci_slot_copy(xhci, command->in_ctx, virt_dev->out_ctx);
-	spin_unlock_irqrestore(&xhci->lock, flags);
-
-	ctrl_ctx = xhci_get_input_control_ctx(xhci, command->in_ctx);
-	ctrl_ctx->add_flags |= cpu_to_le32(SLOT_FLAG);
-	slot_ctx = xhci_get_slot_ctx(xhci, command->in_ctx);
-	slot_ctx->dev_info2 &= cpu_to_le32(~((u32) MAX_EXIT));
-	slot_ctx->dev_info2 |= cpu_to_le32(max_exit_latency);
-
-	xhci_dbg(xhci, "Set up evaluate context for LPM MEL change.\n");
-	xhci_dbg(xhci, "Slot %u Input Context:\n", udev->slot_id);
-	xhci_dbg_ctx(xhci, command->in_ctx, 0);
-
-	/* Issue and wait for the evaluate context command. */
-	ret = xhci_configure_endpoint(xhci, udev, command,
-			true, true);
-	xhci_dbg(xhci, "Slot %u Output Context:\n", udev->slot_id);
-	xhci_dbg_ctx(xhci, virt_dev->out_ctx, 0);
-
-	if (!ret) {
-		spin_lock_irqsave(&xhci->lock, flags);
-		virt_dev->current_mel = max_exit_latency;
-		spin_unlock_irqrestore(&xhci->lock, flags);
-	}
-	return ret;
-}
-
 static int calculate_max_exit_latency(struct usb_device *udev,
 		enum usb3_link_state state_changed,
 		u16 hub_encoded_timeout)
@@ -4564,6 +4753,13 @@
 		xhci_dbg(xhci, "Could not allocate xHCI command structure.\n");
 		return -ENOMEM;
 	}
+	ctrl_ctx = xhci_get_input_control_ctx(xhci, config_cmd->in_ctx);
+	if (!ctrl_ctx) {
+		xhci_warn(xhci, "%s: Could not get input context, bad type.\n",
+				__func__);
+		xhci_free_command(xhci, config_cmd);
+		return -ENOMEM;
+	}
 
 	spin_lock_irqsave(&xhci->lock, flags);
 	if (hdev->speed == USB_SPEED_HIGH &&
@@ -4575,7 +4771,6 @@
 	}
 
 	xhci_slot_copy(xhci, config_cmd->in_ctx, vdev->out_ctx);
-	ctrl_ctx = xhci_get_input_control_ctx(xhci, config_cmd->in_ctx);
 	ctrl_ctx->add_flags |= cpu_to_le32(SLOT_FLAG);
 	slot_ctx = xhci_get_slot_ctx(xhci, config_cmd->in_ctx);
 	slot_ctx->dev_info |= cpu_to_le32(DEV_HUB);

diff --git a/drivers/usb/host/xhci.h b/drivers/usb/host/xhci.h
index 77600ce..c338741 100644
--- a/drivers/usb/host/xhci.h
+++ b/drivers/usb/host/xhci.h

@@ -132,6 +132,11 @@
 /* Number of registers per port */
 #define	NUM_PORT_REGS	4
 
+#define PORTSC		0
+#define PORTPMSC	1
+#define PORTLI		2
+#define PORTHLPMC	3
+
 /**
  * struct xhci_op_regs - xHCI Host Controller Operational Registers.
  * @command:		USBCMD - xHC command register
@@ -381,6 +386,27 @@
 #define	PORT_L1DS(p)		(((p) & 0xff) << 8)
 #define	PORT_HLE		(1 << 16)
 
+
+/* USB2 Protocol PORTHLPMC */
+#define PORT_HIRDM(p)((p) & 3)
+#define PORT_L1_TIMEOUT(p)(((p) & 0xff) << 2)
+#define PORT_BESLD(p)(((p) & 0xf) << 10)
+
+/* use 512 microseconds as USB2 LPM L1 default timeout. */
+#define XHCI_L1_TIMEOUT		512
+
+/* Set default HIRD/BESL value to 4 (350/400us) for USB2 L1 LPM resume latency.
+ * Safe to use with mixed HIRD and BESL systems (host and device) and is used
+ * by other operating systems.
+ *
+ * XHCI 1.0 errata 8/14/12 Table 13 notes:
+ * "Software should choose xHC BESL/BESLD field values that do not violate a
+ * device's resume latency requirements,
+ * e.g. not program values > '4' if BLC = '1' and a HIRD device is attached,
+ * or not program values < '4' if BLC = '0' and a BESL device is attached."
+ */
+#define XHCI_DEFAULT_BESL	4
+
 /**
  * struct xhci_intr_reg - Interrupt Register Set
  * @irq_pending:	IMAN - Interrupt Management Register.  Used to enable
@@ -1532,6 +1558,9 @@
 	unsigned		sw_lpm_support:1;
 	/* support xHCI 1.0 spec USB2 hardware LPM */
 	unsigned		hw_lpm_support:1;
+	/* cached usb2 extended protocol capabilities */
+	u32                     *ext_caps;
+	unsigned int            num_ext_caps;
 	/* Compliance Mode Recovery Data */
 	struct timer_list	comp_mode_recovery_timer;
 	u32			port_status_u0;
@@ -1641,6 +1670,7 @@
 void xhci_copy_ep0_dequeue_into_input_ctx(struct xhci_hcd *xhci,
 		struct usb_device *udev);
 unsigned int xhci_get_endpoint_index(struct usb_endpoint_descriptor *desc);
+unsigned int xhci_get_endpoint_address(unsigned int ep_index);
 unsigned int xhci_get_endpoint_flag(struct usb_endpoint_descriptor *desc);
 unsigned int xhci_get_endpoint_flag_from_index(unsigned int ep_index);
 unsigned int xhci_last_valid_endpoint(u32 added_ctxs);
@@ -1745,7 +1775,7 @@
 
 int xhci_get_frame(struct usb_hcd *hcd);
 irqreturn_t xhci_irq(struct usb_hcd *hcd);
-irqreturn_t xhci_msi_irq(int irq, struct usb_hcd *hcd);
+irqreturn_t xhci_msi_irq(int irq, void *hcd);
 int xhci_alloc_dev(struct usb_hcd *hcd, struct usb_device *udev);
 void xhci_free_dev(struct usb_hcd *hcd, struct usb_device *udev);
 int xhci_alloc_tt_info(struct xhci_hcd *xhci,

diff --git a/drivers/usb/misc/adutux.c b/drivers/usb/misc/adutux.c
index 284b854..eb3c8c1 100644
--- a/drivers/usb/misc/adutux.c
+++ b/drivers/usb/misc/adutux.c

@@ -25,7 +25,7 @@
 #include <linux/module.h>
 #include <linux/usb.h>
 #include <linux/mutex.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 
 #ifdef CONFIG_USB_DEBUG
 static int debug = 5;
@@ -35,8 +35,8 @@
 
 /* Use our own dbg macro */
 #undef dbg
-#define dbg(lvl, format, arg...) 					\
-do { 									\
+#define dbg(lvl, format, arg...)	\
+do {								\
 	if (debug >= lvl)						\
 		printk(KERN_DEBUG "%s: " format "\n", __FILE__, ##arg);	\
 } while (0)
@@ -58,12 +58,12 @@
 /* table of devices that work with this driver */
 static const struct usb_device_id device_table[] = {
 	{ USB_DEVICE(ADU_VENDOR_ID, ADU_PRODUCT_ID) },		/* ADU100 */
-	{ USB_DEVICE(ADU_VENDOR_ID, ADU_PRODUCT_ID+20) }, 	/* ADU120 */
-	{ USB_DEVICE(ADU_VENDOR_ID, ADU_PRODUCT_ID+30) }, 	/* ADU130 */
+	{ USB_DEVICE(ADU_VENDOR_ID, ADU_PRODUCT_ID+20) },	/* ADU120 */
+	{ USB_DEVICE(ADU_VENDOR_ID, ADU_PRODUCT_ID+30) },	/* ADU130 */
 	{ USB_DEVICE(ADU_VENDOR_ID, ADU_PRODUCT_ID+100) },	/* ADU200 */
 	{ USB_DEVICE(ADU_VENDOR_ID, ADU_PRODUCT_ID+108) },	/* ADU208 */
 	{ USB_DEVICE(ADU_VENDOR_ID, ADU_PRODUCT_ID+118) },	/* ADU218 */
-	{ }/* Terminating entry */
+	{ } /* Terminating entry */
 };
 
 MODULE_DEVICE_TABLE(usb, device_table);
@@ -92,16 +92,16 @@
 /* Structure to hold all of our device specific stuff */
 struct adu_device {
 	struct mutex		mtx;
-	struct usb_device*	udev; /* save off the usb device pointer */
-	struct usb_interface*	interface;
+	struct usb_device *udev; /* save off the usb device pointer */
+	struct usb_interface *interface;
 	unsigned int		minor; /* the starting minor number for this device */
 	char			serial_number[8];
 
 	int			open_count; /* number of times this port has been opened */
 
-	char*			read_buffer_primary;
+	char		*read_buffer_primary;
 	int			read_buffer_length;
-	char*			read_buffer_secondary;
+	char		*read_buffer_secondary;
 	int			secondary_head;
 	int			secondary_tail;
 	spinlock_t		buflock;
@@ -109,14 +109,14 @@
 	wait_queue_head_t	read_wait;
 	wait_queue_head_t	write_wait;
 
-	char*			interrupt_in_buffer;
-	struct usb_endpoint_descriptor* interrupt_in_endpoint;
-	struct urb*		interrupt_in_urb;
+	char		*interrupt_in_buffer;
+	struct usb_endpoint_descriptor *interrupt_in_endpoint;
+	struct urb	*interrupt_in_urb;
 	int			read_urb_finished;
 
-	char*			interrupt_out_buffer;
-	struct usb_endpoint_descriptor* interrupt_out_endpoint;
-	struct urb*		interrupt_out_urb;
+	char		*interrupt_out_buffer;
+	struct usb_endpoint_descriptor *interrupt_out_endpoint;
+	struct urb	*interrupt_out_urb;
 	int			out_urb_finished;
 };
 
@@ -147,10 +147,10 @@
 {
 	unsigned long flags;
 
-	dbg(2," %s : enter", __func__);
+	dbg(2, " %s : enter", __func__);
 
 	if (dev->udev == NULL) {
-		dbg(1," %s : udev is null", __func__);
+		dbg(1, " %s : udev is null", __func__);
 		goto exit;
 	}
 
@@ -172,7 +172,7 @@
 		spin_unlock_irqrestore(&dev->buflock, flags);
 
 exit:
-	dbg(2," %s : leave", __func__);
+	dbg(2, " %s : leave", __func__);
 }
 
 static void adu_delete(struct adu_device *dev)
@@ -196,7 +196,7 @@
 	struct adu_device *dev = urb->context;
 	int status = urb->status;
 
-	dbg(4," %s : enter, status %d", __func__, status);
+	dbg(4, " %s : enter, status %d", __func__, status);
 	adu_debug_data(5, __func__, urb->actual_length,
 		       urb->transfer_buffer);
 
@@ -205,7 +205,7 @@
 	if (status != 0) {
 		if ((status != -ENOENT) && (status != -ECONNRESET) &&
 			(status != -ESHUTDOWN)) {
-			dbg(1," %s : nonzero status received: %d",
+			dbg(1, " %s : nonzero status received: %d",
 			    __func__, status);
 		}
 		goto exit;
@@ -220,10 +220,10 @@
 				dev->interrupt_in_buffer, urb->actual_length);
 
 			dev->read_buffer_length += urb->actual_length;
-			dbg(2," %s reading  %d ", __func__,
+			dbg(2, " %s reading  %d ", __func__,
 			    urb->actual_length);
 		} else {
-			dbg(1," %s : read_buffer overflow", __func__);
+			dbg(1, " %s : read_buffer overflow", __func__);
 		}
 	}
 
@@ -234,7 +234,7 @@
 	wake_up_interruptible(&dev->read_wait);
 	adu_debug_data(5, __func__, urb->actual_length,
 		       urb->transfer_buffer);
-	dbg(4," %s : leave, status %d", __func__, status);
+	dbg(4, " %s : leave, status %d", __func__, status);
 }
 
 static void adu_interrupt_out_callback(struct urb *urb)
@@ -242,8 +242,8 @@
 	struct adu_device *dev = urb->context;
 	int status = urb->status;
 
-	dbg(4," %s : enter, status %d", __func__, status);
-	adu_debug_data(5,__func__, urb->actual_length, urb->transfer_buffer);
+	dbg(4, " %s : enter, status %d", __func__, status);
+	adu_debug_data(5, __func__, urb->actual_length, urb->transfer_buffer);
 
 	if (status != 0) {
 		if ((status != -ENOENT) &&
@@ -262,7 +262,7 @@
 
 	adu_debug_data(5, __func__, urb->actual_length,
 		       urb->transfer_buffer);
-	dbg(4," %s : leave, status %d", __func__, status);
+	dbg(4, " %s : leave, status %d", __func__, status);
 }
 
 static int adu_open(struct inode *inode, struct file *file)
@@ -272,11 +272,12 @@
 	int subminor;
 	int retval;
 
-	dbg(2,"%s : enter", __func__);
+	dbg(2, "%s : enter", __func__);
 
 	subminor = iminor(inode);
 
-	if ((retval = mutex_lock_interruptible(&adutux_mutex))) {
+	retval = mutex_lock_interruptible(&adutux_mutex);
+	if (retval) {
 		dbg(2, "%s : mutex lock failed", __func__);
 		goto exit_no_lock;
 	}
@@ -302,7 +303,7 @@
 	}
 
 	++dev->open_count;
-	dbg(2,"%s : open count %d", __func__, dev->open_count);
+	dbg(2, "%s : open count %d", __func__, dev->open_count);
 
 	/* save device in the file's private structure */
 	file->private_data = dev;
@@ -311,7 +312,7 @@
 	dev->read_buffer_length = 0;
 
 	/* fixup first read by having urb waiting for it */
-	usb_fill_int_urb(dev->interrupt_in_urb,dev->udev,
+	usb_fill_int_urb(dev->interrupt_in_urb, dev->udev,
 			 usb_rcvintpipe(dev->udev,
 					dev->interrupt_in_endpoint->bEndpointAddress),
 			 dev->interrupt_in_buffer,
@@ -332,23 +333,23 @@
 exit_no_device:
 	mutex_unlock(&adutux_mutex);
 exit_no_lock:
-	dbg(2,"%s : leave, return value %d ", __func__, retval);
+	dbg(2, "%s : leave, return value %d ", __func__, retval);
 	return retval;
 }
 
 static void adu_release_internal(struct adu_device *dev)
 {
-	dbg(2," %s : enter", __func__);
+	dbg(2, " %s : enter", __func__);
 
 	/* decrement our usage count for the device */
 	--dev->open_count;
-	dbg(2," %s : open count %d", __func__, dev->open_count);
+	dbg(2, " %s : open count %d", __func__, dev->open_count);
 	if (dev->open_count <= 0) {
 		adu_abort_transfers(dev);
 		dev->open_count = 0;
 	}
 
-	dbg(2," %s : leave", __func__);
+	dbg(2, " %s : leave", __func__);
 }
 
 static int adu_release(struct inode *inode, struct file *file)
@@ -356,17 +357,17 @@
 	struct adu_device *dev;
 	int retval = 0;
 
-	dbg(2," %s : enter", __func__);
+	dbg(2, " %s : enter", __func__);
 
 	if (file == NULL) {
- 		dbg(1," %s : file is NULL", __func__);
+		dbg(1, " %s : file is NULL", __func__);
 		retval = -ENODEV;
 		goto exit;
 	}
 
 	dev = file->private_data;
 	if (dev == NULL) {
- 		dbg(1," %s : object is NULL", __func__);
+		dbg(1, " %s : object is NULL", __func__);
 		retval = -ENODEV;
 		goto exit;
 	}
@@ -374,7 +375,7 @@
 	mutex_lock(&adutux_mutex); /* not interruptible */
 
 	if (dev->open_count <= 0) {
-		dbg(1," %s : device not opened", __func__);
+		dbg(1, " %s : device not opened", __func__);
 		retval = -ENODEV;
 		goto unlock;
 	}
@@ -388,7 +389,7 @@
 unlock:
 	mutex_unlock(&adutux_mutex);
 exit:
-	dbg(2," %s : leave, return value %d", __func__, retval);
+	dbg(2, " %s : leave, return value %d", __func__, retval);
 	return retval;
 }
 
@@ -405,10 +406,10 @@
 	unsigned long flags;
 	DECLARE_WAITQUEUE(wait, current);
 
-	dbg(2," %s : enter, count = %Zd, file=%p", __func__, count, file);
+	dbg(2, " %s : enter, count = %Zd, file=%p", __func__, count, file);
 
 	dev = file->private_data;
-	dbg(2," %s : dev=%p", __func__, dev);
+	dbg(2, " %s : dev=%p", __func__, dev);
 
 	if (mutex_lock_interruptible(&dev->mtx))
 		return -ERESTARTSYS;
@@ -423,15 +424,15 @@
 
 	/* verify that some data was requested */
 	if (count == 0) {
-		dbg(1," %s : read request of 0 bytes", __func__);
+		dbg(1, " %s : read request of 0 bytes", __func__);
 		goto exit;
 	}
 
 	timeout = COMMAND_TIMEOUT;
-	dbg(2," %s : about to start looping", __func__);
+	dbg(2, " %s : about to start looping", __func__);
 	while (bytes_to_read) {
 		int data_in_secondary = dev->secondary_tail - dev->secondary_head;
-		dbg(2," %s : while, data_in_secondary=%d, status=%d",
+		dbg(2, " %s : while, data_in_secondary=%d, status=%d",
 		    __func__, data_in_secondary,
 		    dev->interrupt_in_urb->status);
 
@@ -456,7 +457,7 @@
 			if (dev->read_buffer_length) {
 				/* we secure access to the primary */
 				char *tmp;
-				dbg(2," %s : swap, read_buffer_length = %d",
+				dbg(2, " %s : swap, read_buffer_length = %d",
 				    __func__, dev->read_buffer_length);
 				tmp = dev->read_buffer_secondary;
 				dev->read_buffer_secondary = dev->read_buffer_primary;
@@ -472,16 +473,16 @@
 				if (!dev->read_urb_finished) {
 					/* somebody is doing IO */
 					spin_unlock_irqrestore(&dev->buflock, flags);
-					dbg(2," %s : submitted already", __func__);
+					dbg(2, " %s : submitted already", __func__);
 				} else {
 					/* we must initiate input */
-					dbg(2," %s : initiate input", __func__);
+					dbg(2, " %s : initiate input", __func__);
 					dev->read_urb_finished = 0;
 					spin_unlock_irqrestore(&dev->buflock, flags);
 
-					usb_fill_int_urb(dev->interrupt_in_urb,dev->udev,
-							 usb_rcvintpipe(dev->udev,
-							 		dev->interrupt_in_endpoint->bEndpointAddress),
+					usb_fill_int_urb(dev->interrupt_in_urb, dev->udev,
+							usb_rcvintpipe(dev->udev,
+								dev->interrupt_in_endpoint->bEndpointAddress),
 							 dev->interrupt_in_buffer,
 							 usb_endpoint_maxp(dev->interrupt_in_endpoint),
 							 adu_interrupt_in_callback,
@@ -493,7 +494,7 @@
 						if (retval == -ENOMEM) {
 							retval = bytes_read ? bytes_read : -ENOMEM;
 						}
-						dbg(2," %s : submit failed", __func__);
+						dbg(2, " %s : submit failed", __func__);
 						goto exit;
 					}
 				}
@@ -512,13 +513,13 @@
 				remove_wait_queue(&dev->read_wait, &wait);
 
 				if (timeout <= 0) {
-					dbg(2," %s : timeout", __func__);
+					dbg(2, " %s : timeout", __func__);
 					retval = bytes_read ? bytes_read : -ETIMEDOUT;
 					goto exit;
 				}
 
 				if (signal_pending(current)) {
-					dbg(2," %s : signal pending", __func__);
+					dbg(2, " %s : signal pending", __func__);
 					retval = bytes_read ? bytes_read : -EINTR;
 					goto exit;
 				}
@@ -532,9 +533,9 @@
 	if (should_submit && dev->read_urb_finished) {
 		dev->read_urb_finished = 0;
 		spin_unlock_irqrestore(&dev->buflock, flags);
-		usb_fill_int_urb(dev->interrupt_in_urb,dev->udev,
+		usb_fill_int_urb(dev->interrupt_in_urb, dev->udev,
 				 usb_rcvintpipe(dev->udev,
-				 		dev->interrupt_in_endpoint->bEndpointAddress),
+					dev->interrupt_in_endpoint->bEndpointAddress),
 				dev->interrupt_in_buffer,
 				usb_endpoint_maxp(dev->interrupt_in_endpoint),
 				adu_interrupt_in_callback,
@@ -551,7 +552,7 @@
 	/* unlock the device */
 	mutex_unlock(&dev->mtx);
 
-	dbg(2," %s : leave, return value %d", __func__, retval);
+	dbg(2, " %s : leave, return value %d", __func__, retval);
 	return retval;
 }
 
@@ -566,7 +567,7 @@
 	unsigned long flags;
 	int retval;
 
-	dbg(2," %s : enter, count = %Zd", __func__, count);
+	dbg(2, " %s : enter, count = %Zd", __func__, count);
 
 	dev = file->private_data;
 
@@ -584,7 +585,7 @@
 
 	/* verify that we actually have some data to write */
 	if (count == 0) {
-		dbg(1," %s : write request of 0 bytes", __func__);
+		dbg(1, " %s : write request of 0 bytes", __func__);
 		goto exit;
 	}
 
@@ -597,7 +598,7 @@
 
 			mutex_unlock(&dev->mtx);
 			if (signal_pending(current)) {
-				dbg(1," %s : interrupted", __func__);
+				dbg(1, " %s : interrupted", __func__);
 				set_current_state(TASK_RUNNING);
 				retval = -EINTR;
 				goto exit_onqueue;
@@ -614,17 +615,17 @@
 				goto exit_nolock;
 			}
 
-			dbg(4," %s : in progress, count = %Zd", __func__, count);
+			dbg(4, " %s : in progress, count = %Zd", __func__, count);
 		} else {
 			spin_unlock_irqrestore(&dev->buflock, flags);
 			set_current_state(TASK_RUNNING);
 			remove_wait_queue(&dev->write_wait, &waita);
-			dbg(4," %s : sending, count = %Zd", __func__, count);
+			dbg(4, " %s : sending, count = %Zd", __func__, count);
 
 			/* write the data into interrupt_out_buffer from userspace */
 			buffer_size = usb_endpoint_maxp(dev->interrupt_out_endpoint);
 			bytes_to_write = count > buffer_size ? buffer_size : count;
-			dbg(4," %s : buffer_size = %Zd, count = %Zd, bytes_to_write = %Zd",
+			dbg(4, " %s : buffer_size = %Zd, count = %Zd, bytes_to_write = %Zd",
 			    __func__, buffer_size, count, bytes_to_write);
 
 			if (copy_from_user(dev->interrupt_out_buffer, buffer, bytes_to_write) != 0) {
@@ -664,7 +665,7 @@
 exit:
 	mutex_unlock(&dev->mtx);
 exit_nolock:
-	dbg(2," %s : leave, return value %d", __func__, retval);
+	dbg(2, " %s : leave, return value %d", __func__, retval);
 	return retval;
 
 exit_onqueue:
@@ -710,7 +711,7 @@
 	int out_end_size;
 	int i;
 
-	dbg(2," %s : enter", __func__);
+	dbg(2, " %s : enter", __func__);
 
 	if (udev == NULL) {
 		dev_err(&interface->dev, "udev is NULL.\n");
@@ -811,7 +812,7 @@
 		dev_err(&interface->dev, "Could not retrieve serial number\n");
 		goto error;
 	}
-	dbg(2," %s : serial_number=%s", __func__, dev->serial_number);
+	dbg(2, " %s : serial_number=%s", __func__, dev->serial_number);
 
 	/* we can register the device now, as it is ready */
 	usb_set_intfdata(interface, dev);
@@ -832,7 +833,7 @@
 		 udev->descriptor.idProduct, dev->serial_number,
 		 (dev->minor - ADU_MINOR_BASE));
 exit:
-	dbg(2," %s : leave, return value %p (dev)", __func__, dev);
+	dbg(2, " %s : leave, return value %p (dev)", __func__, dev);
 
 	return retval;
 
@@ -851,7 +852,7 @@
 	struct adu_device *dev;
 	int minor;
 
-	dbg(2," %s : enter", __func__);
+	dbg(2, " %s : enter", __func__);
 
 	dev = usb_get_intfdata(interface);
 
@@ -865,7 +866,7 @@
 	usb_set_intfdata(interface, NULL);
 
 	/* if the device is not opened, then we clean up right now */
-	dbg(2," %s : open count %d", __func__, dev->open_count);
+	dbg(2, " %s : open count %d", __func__, dev->open_count);
 	if (!dev->open_count)
 		adu_delete(dev);
 
@@ -874,7 +875,7 @@
 	dev_info(&interface->dev, "ADU device adutux%d now disconnected\n",
 		 (minor - ADU_MINOR_BASE));
 
-	dbg(2," %s : leave", __func__);
+	dbg(2, " %s : leave", __func__);
 }
 
 /* usb specific object needed to register this driver with the usb subsystem */

diff --git a/drivers/usb/misc/sisusbvga/sisusb_con.c b/drivers/usb/misc/sisusbvga/sisusb_con.c
index 411e605..a638c4e 100644
--- a/drivers/usb/misc/sisusbvga/sisusb_con.c
+++ b/drivers/usb/misc/sisusbvga/sisusb_con.c

@@ -208,7 +208,7 @@
 	struct sisusb_usb_data *sisusb;
 	int cols, rows;
 
-	/* This is called by take_over_console(),
+	/* This is called by do_take_over_console(),
 	 * ie by us/under our control. It is
 	 * only called after text mode and fonts
 	 * are set up/restored.
@@ -273,7 +273,7 @@
 	struct sisusb_usb_data *sisusb;
 	int i;
 
-	/* This is called by take_over_console()
+	/* This is called by do_take_over_console()
 	 * and others, ie not under our control.
 	 */
 
@@ -1490,8 +1490,9 @@
 	mutex_unlock(&sisusb->lock);
 
 	/* Now grab the desired console(s) */
-	ret = take_over_console(&sisusb_con, first - 1, last - 1, 0);
-
+	console_lock();
+	ret = do_take_over_console(&sisusb_con, first - 1, last - 1, 0);
+	console_unlock();
 	if (!ret)
 		sisusb->haveconsole = 1;
 	else {
@@ -1535,11 +1536,14 @@
 
 	if (sisusb->haveconsole) {
 		for (i = 0; i < MAX_NR_CONSOLES; i++)
-			if (sisusb->havethisconsole[i])
-				take_over_console(&sisusb_dummy_con, i, i, 0);
+			if (sisusb->havethisconsole[i]) {
+				console_lock();
+				do_take_over_console(&sisusb_dummy_con, i, i, 0);
+				console_unlock();
 				/* At this point, con_deinit for all our
-				 * consoles is executed by take_over_console().
+				 * consoles is executed by do_take_over_console().
 				 */
+			}
 		sisusb->haveconsole = 0;
 	}
 

diff --git a/drivers/usb/misc/usb3503.c b/drivers/usb/misc/usb3503.c
index d3a1cce..c357839 100644
--- a/drivers/usb/misc/usb3503.c
+++ b/drivers/usb/misc/usb3503.c

@@ -42,9 +42,6 @@
 #define USB3503_NRD		0x09
 
 #define USB3503_PDS		0x0a
-#define USB3503_PORT1		(1 << 1)
-#define USB3503_PORT2		(1 << 2)
-#define USB3503_PORT3		(1 << 3)
 
 #define USB3503_SP_ILOCK	0xe7
 #define USB3503_SPILOCK_CONNECT	(1 << 1)
@@ -56,6 +53,7 @@
 struct usb3503 {
 	enum usb3503_mode	mode;
 	struct i2c_client	*client;
+	u8	port_off_mask;
 	int	gpio_intn;
 	int	gpio_reset;
 	int	gpio_connect;
@@ -107,11 +105,9 @@
 	if (gpio_is_valid(gpio_reset))
 		gpio_set_value(gpio_reset, state);
 
-	/* Wait RefClk when RESET_N is released, otherwise Hub will
-	 * not transition to Hub Communication Stage.
-	 */
+	/* Wait T_HUBINIT == 4ms for hub logic to stabilize */
 	if (state)
-		msleep(100);
+		usleep_range(4000, 10000);
 
 	return 0;
 }
@@ -134,12 +130,14 @@
 			goto err_hubmode;
 		}
 
-		/* PDS : Port2,3 Disable For Self Powered Operation */
-		err = usb3503_set_bits(i2c, USB3503_PDS,
-				(USB3503_PORT2 | USB3503_PORT3));
-		if (err < 0) {
-			dev_err(&i2c->dev, "PDS failed (%d)\n", err);
-			goto err_hubmode;
+		/* PDS : Disable For Self Powered Operation */
+		if (hub->port_off_mask) {
+			err = usb3503_set_bits(i2c, USB3503_PDS,
+					hub->port_off_mask);
+			if (err < 0) {
+				dev_err(&i2c->dev, "PDS failed (%d)\n", err);
+				goto err_hubmode;
+			}
 		}
 
 		/* CFG1 : SELF_BUS_PWR -> Self-Powerd operation */
@@ -186,6 +184,8 @@
 	struct usb3503 *hub;
 	int err = -ENOMEM;
 	u32 mode = USB3503_MODE_UNKNOWN;
+	const u32 *property;
+	int len;
 
 	hub = kzalloc(sizeof(struct usb3503), GFP_KERNEL);
 	if (!hub) {
@@ -197,18 +197,31 @@
 	hub->client = i2c;
 
 	if (pdata) {
+		hub->port_off_mask	= pdata->port_off_mask;
 		hub->gpio_intn		= pdata->gpio_intn;
 		hub->gpio_connect	= pdata->gpio_connect;
 		hub->gpio_reset		= pdata->gpio_reset;
 		hub->mode		= pdata->initial_mode;
 	} else if (np) {
+		hub->port_off_mask = 0;
+
+		property = of_get_property(np, "disabled-ports", &len);
+		if (property && (len / sizeof(u32)) > 0) {
+			int i;
+			for (i = 0; i < len / sizeof(u32); i++) {
+				u32 port = be32_to_cpu(property[i]);
+				if ((1 <= port) && (port <= 3))
+					hub->port_off_mask |= (1 << port);
+			}
+		}
+
 		hub->gpio_intn	= of_get_named_gpio(np, "connect-gpios", 0);
 		if (hub->gpio_intn == -EPROBE_DEFER)
 			return -EPROBE_DEFER;
 		hub->gpio_connect = of_get_named_gpio(np, "intn-gpios", 0);
 		if (hub->gpio_connect == -EPROBE_DEFER)
 			return -EPROBE_DEFER;
-		hub->gpio_reset	= of_get_named_gpio(np, "reset-gpios", 0);
+		hub->gpio_reset = of_get_named_gpio(np, "reset-gpios", 0);
 		if (hub->gpio_reset == -EPROBE_DEFER)
 			return -EPROBE_DEFER;
 		of_property_read_u32(np, "initial-mode", &mode);

diff --git a/drivers/usb/musb/Kconfig b/drivers/usb/musb/Kconfig
index 06f8d29..797e3fd 100644
--- a/drivers/usb/musb/Kconfig
+++ b/drivers/usb/musb/Kconfig

@@ -28,6 +28,35 @@
 if USB_MUSB_HDRC
 
 choice
+	bool "MUSB Mode Selection"
+	default USB_MUSB_DUAL_ROLE if (USB && USB_GADGET)
+	default USB_MUSB_HOST if (USB && !USB_GADGET)
+	default USB_MUSB_GADGET if (!USB && USB_GADGET)
+
+config USB_MUSB_HOST
+	bool "Host only mode"
+	depends on USB
+	help
+	  Select this when you want to use MUSB in host mode only;
+	  the gadget feature will then be disabled.
+
+config USB_MUSB_GADGET
+	bool "Gadget only mode"
+	depends on USB_GADGET
+	help
+	  Select this when you want to use MUSB in gadget mode only;
+	  the host feature will then be disabled.
+
+config USB_MUSB_DUAL_ROLE
+	bool "Dual Role mode"
+	depends on (USB && USB_GADGET)
+	help
+	  This is the default operating mode of the MUSB controller, in
+	  which both host and gadget features are enabled.
+
+endchoice
+
+choice
 	prompt "Platform Glue Layer"
 
 config USB_MUSB_DAVINCI

diff --git a/drivers/usb/musb/Makefile b/drivers/usb/musb/Makefile
index 3b85871..2b82ed7 100644
--- a/drivers/usb/musb/Makefile
+++ b/drivers/usb/musb/Makefile

@@ -6,8 +6,8 @@
 
 musb_hdrc-y := musb_core.o
 
-musb_hdrc-y					+= musb_gadget_ep0.o musb_gadget.o
-musb_hdrc-y					+= musb_virthub.o musb_host.o
+musb_hdrc-$(CONFIG_USB_MUSB_HOST)$(CONFIG_USB_MUSB_DUAL_ROLE) += musb_virthub.o musb_host.o
+musb_hdrc-$(CONFIG_USB_MUSB_GADGET)$(CONFIG_USB_MUSB_DUAL_ROLE) += musb_gadget_ep0.o musb_gadget.o
 musb_hdrc-$(CONFIG_DEBUG_FS)			+= musb_debugfs.o
 
 # Hardware Glue Layer

diff --git a/drivers/usb/musb/blackfin.c b/drivers/usb/musb/blackfin.c
index 5e63b16..6ba8439 100644
--- a/drivers/usb/musb/blackfin.c
+++ b/drivers/usb/musb/blackfin.c

@@ -450,6 +450,7 @@
 
 static int bfin_probe(struct platform_device *pdev)
 {
+	struct resource musb_resources[2];
 	struct musb_hdrc_platform_data	*pdata = pdev->dev.platform_data;
 	struct platform_device		*musb;
 	struct bfin_glue		*glue;
@@ -479,8 +480,21 @@
 
 	platform_set_drvdata(pdev, glue);
 
-	ret = platform_device_add_resources(musb, pdev->resource,
-			pdev->num_resources);
+	memset(musb_resources, 0x00, sizeof(*musb_resources) *
+			ARRAY_SIZE(musb_resources));
+
+	musb_resources[0].name = pdev->resource[0].name;
+	musb_resources[0].start = pdev->resource[0].start;
+	musb_resources[0].end = pdev->resource[0].end;
+	musb_resources[0].flags = pdev->resource[0].flags;
+
+	musb_resources[1].name = pdev->resource[1].name;
+	musb_resources[1].start = pdev->resource[1].start;
+	musb_resources[1].end = pdev->resource[1].end;
+	musb_resources[1].flags = pdev->resource[1].flags;
+
+	ret = platform_device_add_resources(musb, musb_resources,
+			ARRAY_SIZE(musb_resources));
 	if (ret) {
 		dev_err(&pdev->dev, "failed to add resources\n");
 		goto err3;

diff --git a/drivers/usb/musb/da8xx.c b/drivers/usb/musb/da8xx.c
index b903b744..0da6f64 100644
--- a/drivers/usb/musb/da8xx.c
+++ b/drivers/usb/musb/da8xx.c

@@ -476,6 +476,7 @@
 
 static int da8xx_probe(struct platform_device *pdev)
 {
+	struct resource musb_resources[2];
 	struct musb_hdrc_platform_data	*pdata = pdev->dev.platform_data;
 	struct platform_device		*musb;
 	struct da8xx_glue		*glue;
@@ -521,8 +522,21 @@
 
 	platform_set_drvdata(pdev, glue);
 
-	ret = platform_device_add_resources(musb, pdev->resource,
-			pdev->num_resources);
+	memset(musb_resources, 0x00, sizeof(*musb_resources) *
+			ARRAY_SIZE(musb_resources));
+
+	musb_resources[0].name = pdev->resource[0].name;
+	musb_resources[0].start = pdev->resource[0].start;
+	musb_resources[0].end = pdev->resource[0].end;
+	musb_resources[0].flags = pdev->resource[0].flags;
+
+	musb_resources[1].name = pdev->resource[1].name;
+	musb_resources[1].start = pdev->resource[1].start;
+	musb_resources[1].end = pdev->resource[1].end;
+	musb_resources[1].flags = pdev->resource[1].flags;
+
+	ret = platform_device_add_resources(musb, musb_resources,
+			ARRAY_SIZE(musb_resources));
 	if (ret) {
 		dev_err(&pdev->dev, "failed to add resources\n");
 		goto err5;

diff --git a/drivers/usb/musb/davinci.c b/drivers/usb/musb/davinci.c
index bea6cc3..f8aeaf2 100644
--- a/drivers/usb/musb/davinci.c
+++ b/drivers/usb/musb/davinci.c

@@ -509,6 +509,7 @@
 
 static int davinci_probe(struct platform_device *pdev)
 {
+	struct resource musb_resources[2];
 	struct musb_hdrc_platform_data	*pdata = pdev->dev.platform_data;
 	struct platform_device		*musb;
 	struct davinci_glue		*glue;
@@ -553,8 +554,21 @@
 
 	platform_set_drvdata(pdev, glue);
 
-	ret = platform_device_add_resources(musb, pdev->resource,
-			pdev->num_resources);
+	memset(musb_resources, 0x00, sizeof(*musb_resources) *
+			ARRAY_SIZE(musb_resources));
+
+	musb_resources[0].name = pdev->resource[0].name;
+	musb_resources[0].start = pdev->resource[0].start;
+	musb_resources[0].end = pdev->resource[0].end;
+	musb_resources[0].flags = pdev->resource[0].flags;
+
+	musb_resources[1].name = pdev->resource[1].name;
+	musb_resources[1].start = pdev->resource[1].start;
+	musb_resources[1].end = pdev->resource[1].end;
+	musb_resources[1].flags = pdev->resource[1].flags;
+
+	ret = platform_device_add_resources(musb, musb_resources,
+			ARRAY_SIZE(musb_resources));
 	if (ret) {
 		dev_err(&pdev->dev, "failed to add resources\n");
 		goto err5;

diff --git a/drivers/usb/musb/musb_core.c b/drivers/usb/musb/musb_core.c
index 37a261a..29a24ce 100644
--- a/drivers/usb/musb/musb_core.c
+++ b/drivers/usb/musb/musb_core.c

@@ -380,7 +380,6 @@
 		dev_dbg(musb->controller, "HNP: Unhandled mode %s\n",
 			usb_otg_state_string(musb->xceiv->state));
 	}
-	musb->ignore_disconnect = 0;
 	spin_unlock_irqrestore(&musb->lock, flags);
 }
 
@@ -389,7 +388,7 @@
  */
 void musb_hnp_stop(struct musb *musb)
 {
-	struct usb_hcd	*hcd = musb_to_hcd(musb);
+	struct usb_hcd	*hcd = musb->hcd;
 	void __iomem	*mbase = musb->mregs;
 	u8	reg;
 
@@ -404,7 +403,8 @@
 		break;
 	case OTG_STATE_B_HOST:
 		dev_dbg(musb->controller, "HNP: Disabling HR\n");
-		hcd->self.is_b_host = 0;
+		if (hcd)
+			hcd->self.is_b_host = 0;
 		musb->xceiv->state = OTG_STATE_B_PERIPHERAL;
 		MUSB_DEV_MODE(musb);
 		reg = musb_readb(mbase, MUSB_POWER);
@@ -484,7 +484,7 @@
 
 				musb->xceiv->state = OTG_STATE_A_HOST;
 				musb->is_active = 1;
-				usb_hcd_resume_root_hub(musb_to_hcd(musb));
+				musb_host_resume_root_hub(musb);
 				break;
 			case OTG_STATE_B_WAIT_ACON:
 				musb->xceiv->state = OTG_STATE_B_PERIPHERAL;
@@ -501,7 +501,7 @@
 			case OTG_STATE_A_SUSPEND:
 				/* possibly DISCONNECT is upcoming */
 				musb->xceiv->state = OTG_STATE_A_HOST;
-				usb_hcd_resume_root_hub(musb_to_hcd(musb));
+				musb_host_resume_root_hub(musb);
 				break;
 			case OTG_STATE_B_WAIT_ACON:
 			case OTG_STATE_B_PERIPHERAL:
@@ -643,7 +643,7 @@
 			 * undesired detour through A_WAIT_BCON.
 			 */
 			musb_hnp_stop(musb);
-			usb_hcd_resume_root_hub(musb_to_hcd(musb));
+			musb_host_resume_root_hub(musb);
 			musb_root_disconnect(musb);
 			musb_platform_try_idle(musb, jiffies
 					+ msecs_to_jiffies(musb->a_wait_bcon
@@ -685,7 +685,7 @@
 	}
 
 	if (int_usb & MUSB_INTR_CONNECT) {
-		struct usb_hcd *hcd = musb_to_hcd(musb);
+		struct usb_hcd *hcd = musb->hcd;
 
 		handled = IRQ_HANDLED;
 		musb->is_active = 1;
@@ -726,31 +726,27 @@
 			dev_dbg(musb->controller, "HNP: CONNECT, now b_host\n");
 b_host:
 			musb->xceiv->state = OTG_STATE_B_HOST;
-			hcd->self.is_b_host = 1;
-			musb->ignore_disconnect = 0;
+			if (musb->hcd)
+				musb->hcd->self.is_b_host = 1;
 			del_timer(&musb->otg_timer);
 			break;
 		default:
 			if ((devctl & MUSB_DEVCTL_VBUS)
 					== (3 << MUSB_DEVCTL_VBUS_SHIFT)) {
 				musb->xceiv->state = OTG_STATE_A_HOST;
-				hcd->self.is_b_host = 0;
+				if (hcd)
+					hcd->self.is_b_host = 0;
 			}
 			break;
 		}
 
-		/* poke the root hub */
-		MUSB_HST_MODE(musb);
-		if (hcd->status_urb)
-			usb_hcd_poll_rh_status(hcd);
-		else
-			usb_hcd_resume_root_hub(hcd);
+		musb_host_poke_root_hub(musb);
 
 		dev_dbg(musb->controller, "CONNECT (%s) devctl %02x\n",
 				usb_otg_state_string(musb->xceiv->state), devctl);
 	}
 
-	if ((int_usb & MUSB_INTR_DISCONNECT) && !musb->ignore_disconnect) {
+	if (int_usb & MUSB_INTR_DISCONNECT) {
 		dev_dbg(musb->controller, "DISCONNECT (%s) as %s, devctl %02x\n",
 				usb_otg_state_string(musb->xceiv->state),
 				MUSB_MODE(musb), devctl);
@@ -759,7 +755,7 @@
 		switch (musb->xceiv->state) {
 		case OTG_STATE_A_HOST:
 		case OTG_STATE_A_SUSPEND:
-			usb_hcd_resume_root_hub(musb_to_hcd(musb));
+			musb_host_resume_root_hub(musb);
 			musb_root_disconnect(musb);
 			if (musb->a_wait_bcon != 0)
 				musb_platform_try_idle(musb, jiffies
@@ -772,7 +768,8 @@
 			 * in hnp_stop() is currently not used...
 			 */
 			musb_root_disconnect(musb);
-			musb_to_hcd(musb)->self.is_b_host = 0;
+			if (musb->hcd)
+				musb->hcd->self.is_b_host = 0;
 			musb->xceiv->state = OTG_STATE_B_PERIPHERAL;
 			MUSB_DEV_MODE(musb);
 			musb_g_disconnect(musb);
@@ -818,11 +815,6 @@
 				usb_otg_state_string(musb->xceiv->state));
 			switch (musb->xceiv->state) {
 			case OTG_STATE_A_SUSPEND:
-				/* We need to ignore disconnect on suspend
-				 * otherwise tusb 2.0 won't reconnect after a
-				 * power cycle, which breaks otg compliance.
-				 */
-				musb->ignore_disconnect = 1;
 				musb_g_reset(musb);
 				/* FALLTHROUGH */
 			case OTG_STATE_A_WAIT_BCON:	/* OPT TD.4.7-900ms */
@@ -834,7 +826,6 @@
 					+ msecs_to_jiffies(TA_WAIT_BCON(musb)));
 				break;
 			case OTG_STATE_A_PERIPHERAL:
-				musb->ignore_disconnect = 0;
 				del_timer(&musb->otg_timer);
 				musb_g_reset(musb);
 				break;
@@ -909,51 +900,6 @@
 
 /*-------------------------------------------------------------------------*/
 
-/*
-* Program the HDRC to start (enable interrupts, dma, etc.).
-*/
-void musb_start(struct musb *musb)
-{
-	void __iomem	*regs = musb->mregs;
-	u8		devctl = musb_readb(regs, MUSB_DEVCTL);
-
-	dev_dbg(musb->controller, "<== devctl %02x\n", devctl);
-
-	/*  Set INT enable registers, enable interrupts */
-	musb->intrtxe = musb->epmask;
-	musb_writew(regs, MUSB_INTRTXE, musb->intrtxe);
-	musb->intrrxe = musb->epmask & 0xfffe;
-	musb_writew(regs, MUSB_INTRRXE, musb->intrrxe);
-	musb_writeb(regs, MUSB_INTRUSBE, 0xf7);
-
-	musb_writeb(regs, MUSB_TESTMODE, 0);
-
-	/* put into basic highspeed mode and start session */
-	musb_writeb(regs, MUSB_POWER, MUSB_POWER_ISOUPDATE
-						| MUSB_POWER_HSENAB
-						/* ENSUSPEND wedges tusb */
-						/* | MUSB_POWER_ENSUSPEND */
-						);
-
-	musb->is_active = 0;
-	devctl = musb_readb(regs, MUSB_DEVCTL);
-	devctl &= ~MUSB_DEVCTL_SESSION;
-
-	/* session started after:
-	 * (a) ID-grounded irq, host mode;
-	 * (b) vbus present/connect IRQ, peripheral mode;
-	 * (c) peripheral initiates, using SRP
-	 */
-	if ((devctl & MUSB_DEVCTL_VBUS) == MUSB_DEVCTL_VBUS)
-		musb->is_active = 1;
-	else
-		devctl |= MUSB_DEVCTL_SESSION;
-
-	musb_platform_enable(musb);
-	musb_writeb(regs, MUSB_DEVCTL, devctl);
-}
-
-
 static void musb_generic_disable(struct musb *musb)
 {
 	void __iomem	*mbase = musb->mregs;
@@ -1007,6 +953,7 @@
 
 	pm_runtime_get_sync(musb->controller);
 
+	musb_host_cleanup(musb);
 	musb_gadget_cleanup(musb);
 
 	spin_lock_irqsave(&musb->lock, flags);
@@ -1763,24 +1710,18 @@
 	struct musb		*musb;
 	struct musb_hw_ep	*ep;
 	int			epnum;
-	struct usb_hcd	*hcd;
+	int			ret;
 
-	hcd = usb_create_hcd(&musb_hc_driver, dev, dev_name(dev));
-	if (!hcd)
+	musb = devm_kzalloc(dev, sizeof(*musb), GFP_KERNEL);
+	if (!musb)
 		return NULL;
-	/* usbcore sets dev->driver_data to hcd, and sometimes uses that... */
 
-	musb = hcd_to_musb(hcd);
 	INIT_LIST_HEAD(&musb->control);
 	INIT_LIST_HEAD(&musb->in_bulk);
 	INIT_LIST_HEAD(&musb->out_bulk);
 
-	hcd->uses_new_polling = 1;
-	hcd->has_tt = 1;
-
 	musb->vbuserr_retry = VBUSERR_RETRY_COUNT;
 	musb->a_wait_bcon = OTG_TIME_A_WAIT_BCON;
-	dev_set_drvdata(dev, musb);
 	musb->mregs = mbase;
 	musb->ctrl_base = mbase;
 	musb->nIrq = -ENODEV;
@@ -1795,7 +1736,16 @@
 
 	musb->controller = dev;
 
+	ret = musb_host_alloc(musb);
+	if (ret < 0)
+		goto err_free;
+
+	dev_set_drvdata(dev, musb);
+
 	return musb;
+
+err_free:
+	return NULL;
 }
 
 static void musb_free(struct musb *musb)
@@ -1821,7 +1771,7 @@
 		dma_controller_destroy(c);
 	}
 
-	usb_put_hcd(musb_to_hcd(musb));
+	musb_host_free(musb);
 }
 
 /*
@@ -1838,7 +1788,6 @@
 	int			status;
 	struct musb		*musb;
 	struct musb_hdrc_platform_data *plat = dev->platform_data;
-	struct usb_hcd		*hcd;
 
 	/* The driver might handle more features than the board; OK.
 	 * Fail when the board needs a feature that's not enabled.
@@ -1864,6 +1813,7 @@
 	musb->board_set_power = plat->set_power;
 	musb->min_power = plat->min_power;
 	musb->ops = plat->platform_ops;
+	musb->port_mode = plat->mode;
 
 	/* The musb_platform_init() call:
 	 *   - adjusts musb->mregs
@@ -1939,13 +1889,6 @@
 		musb->irq_wake = 0;
 	}
 
-	/* host side needs more setup */
-	hcd = musb_to_hcd(musb);
-	otg_set_host(musb->xceiv->otg, &hcd->self);
-	hcd->self.otg_port = 1;
-	musb->xceiv->otg->host = &hcd->self;
-	hcd->power_budget = 2 * (plat->power ? : 250);
-
 	/* program PHY to use external vBus if required */
 	if (plat->extvbus) {
 		u8 busctl = musb_read_ulpi_buscontrol(musb->mregs);
@@ -1961,7 +1904,23 @@
 		musb->xceiv->state = OTG_STATE_B_IDLE;
 	}
 
-	status = musb_gadget_setup(musb);
+	/*
+	 * Register the role(s) selected by musb->port_mode.  Every arm must
+	 * leave a meaningful value in status for the check that follows.
+	 */
+	switch (musb->port_mode) {
+	case MUSB_PORT_MODE_HOST:
+		status = musb_host_setup(musb, plat->power);
+		break;
+	case MUSB_PORT_MODE_GADGET:
+		status = musb_gadget_setup(musb);
+		break;
+	case MUSB_PORT_MODE_DUAL_ROLE:
+		status = musb_host_setup(musb, plat->power);
+		if (status < 0)
+			goto fail3;
+		status = musb_gadget_setup(musb);
+		break;
+	default:
+		dev_err(dev, "unsupported port mode %d\n", musb->port_mode);
+		/* fail the init instead of reusing whatever status held before */
+		status = -EINVAL;
+		break;
+	}
 
 	if (status < 0)
 		goto fail3;

diff --git a/drivers/usb/musb/musb_core.h b/drivers/usb/musb/musb_core.h
index 7fb4819..7d341c3 100644
--- a/drivers/usb/musb/musb_core.h
+++ b/drivers/usb/musb/musb_core.h

@@ -77,28 +77,17 @@
 #define is_peripheral_active(m)		(!(m)->is_host)
 #define is_host_active(m)		((m)->is_host)
 
+enum {	/* values for musb->port_mode; numbering starts at 1, so 0 names no mode */
+	MUSB_PORT_MODE_HOST	= 1,	/* controller init calls musb_host_setup() only */
+	MUSB_PORT_MODE_GADGET,		/* controller init calls musb_gadget_setup() only */
+	MUSB_PORT_MODE_DUAL_ROLE,	/* controller init calls host setup, then gadget setup */
+};
+
 #ifdef CONFIG_PROC_FS
 #include <linux/fs.h>
 #define MUSB_CONFIG_PROC_FS
 #endif
 
-/****************************** PERIPHERAL ROLE *****************************/
-
-extern irqreturn_t musb_g_ep0_irq(struct musb *);
-extern void musb_g_tx(struct musb *, u8);
-extern void musb_g_rx(struct musb *, u8);
-extern void musb_g_reset(struct musb *);
-extern void musb_g_suspend(struct musb *);
-extern void musb_g_resume(struct musb *);
-extern void musb_g_wakeup(struct musb *);
-extern void musb_g_disconnect(struct musb *);
-
-/****************************** HOST ROLE ***********************************/
-
-extern irqreturn_t musb_h_ep0_irq(struct musb *);
-extern void musb_host_tx(struct musb *, u8);
-extern void musb_host_rx(struct musb *, u8);
-
 /****************************** CONSTANTS ********************************/
 
 #ifndef MUSB_C_NUM_EPS
@@ -373,6 +362,7 @@
 
 	u8			min_power;	/* vbus for periph, in mA/2 */
 
+	int			port_mode;	/* MUSB_PORT_MODE_* */
 	bool			is_host;
 
 	int			a_wait_bcon;	/* VBUS timeout in msecs */
@@ -382,7 +372,6 @@
 	unsigned		is_active:1;
 
 	unsigned is_multipoint:1;
-	unsigned ignore_disconnect:1;	/* during bus resets */
 
 	unsigned		hb_iso_rx:1;	/* high bandwidth iso rx? */
 	unsigned		hb_iso_tx:1;	/* high bandwidth iso tx? */
@@ -419,6 +408,7 @@
 	enum musb_g_ep0_state	ep0_state;
 	struct usb_gadget	g;			/* the gadget */
 	struct usb_gadget_driver *gadget_driver;	/* its driver */
+	struct usb_hcd		*hcd;			/* the usb hcd */
 
 	/*
 	 * FIXME: Remove this flag.
@@ -520,7 +510,6 @@
 
 extern const char musb_driver_name[];
 
-extern void musb_start(struct musb *musb);
 extern void musb_stop(struct musb *musb);
 
 extern void musb_write_fifo(struct musb_hw_ep *ep, u16 len, const u8 *src);

diff --git a/drivers/usb/musb/musb_gadget.c b/drivers/usb/musb/musb_gadget.c
index ba70923..0414bc1 100644
--- a/drivers/usb/musb/musb_gadget.c
+++ b/drivers/usb/musb/musb_gadget.c

@@ -1820,7 +1820,6 @@
 {
 	struct musb		*musb = gadget_to_musb(g);
 	struct usb_otg		*otg = musb->xceiv->otg;
-	struct usb_hcd		*hcd = musb_to_hcd(musb);
 	unsigned long		flags;
 	int			retval = 0;
 
@@ -1847,17 +1846,9 @@
 	 * handles power budgeting ... this way also
 	 * ensures HdrcStart is indirectly called.
 	 */
-	retval = usb_add_hcd(hcd, 0, 0);
-	if (retval < 0) {
-		dev_dbg(musb->controller, "add_hcd failed, %d\n", retval);
-		goto err;
-	}
-
 	if (musb->xceiv->last_event == USB_EVENT_ID)
 		musb_platform_set_vbus(musb, 1);
 
-	hcd->self.uses_pio_for_control = 1;
-
 	if (musb->xceiv->last_event == USB_EVENT_NONE)
 		pm_runtime_put(musb->controller);
 
@@ -1942,7 +1933,6 @@
 	musb_platform_try_idle(musb, 0);
 	spin_unlock_irqrestore(&musb->lock, flags);
 
-	usb_remove_hcd(musb_to_hcd(musb));
 	/*
 	 * FIXME we need to be able to register another
 	 * gadget driver here and have everything work;

diff --git a/drivers/usb/musb/musb_gadget.h b/drivers/usb/musb/musb_gadget.h
index 66b7c5e..0314dfc 100644
--- a/drivers/usb/musb/musb_gadget.h
+++ b/drivers/usb/musb/musb_gadget.h

@@ -37,6 +37,38 @@
 
 #include <linux/list.h>
 
+#if IS_ENABLED(CONFIG_USB_MUSB_GADGET) || IS_ENABLED(CONFIG_USB_MUSB_DUAL_ROLE)
+extern irqreturn_t musb_g_ep0_irq(struct musb *);	/* peripheral-role entry points, defined outside this header */
+extern void musb_g_tx(struct musb *, u8);
+extern void musb_g_rx(struct musb *, u8);
+extern void musb_g_reset(struct musb *);
+extern void musb_g_suspend(struct musb *);
+extern void musb_g_resume(struct musb *);
+extern void musb_g_wakeup(struct musb *);
+extern void musb_g_disconnect(struct musb *);
+extern void musb_gadget_cleanup(struct musb *);
+extern int musb_gadget_setup(struct musb *);
+
+#else	/* neither gadget nor dual-role configured: empty stubs with the same signatures */
+static inline irqreturn_t musb_g_ep0_irq(struct musb *musb)
+{
+	return 0;	/* NOTE(review): presumably equals IRQ_NONE - confirm */
+}
+
+static inline void musb_g_tx(struct musb *musb, u8 epnum)	{}
+static inline void musb_g_rx(struct musb *musb, u8 epnum)	{}
+static inline void musb_g_reset(struct musb *musb)		{}
+static inline void musb_g_suspend(struct musb *musb)		{}
+static inline void musb_g_resume(struct musb *musb)		{}
+static inline void musb_g_wakeup(struct musb *musb)		{}
+static inline void musb_g_disconnect(struct musb *musb)		{}
+static inline void musb_gadget_cleanup(struct musb *musb)	{}
+static inline int musb_gadget_setup(struct musb *musb)
+{
+	return 0;	/* reports success although nothing was set up */
+}
+#endif /* CONFIG_USB_MUSB_GADGET || CONFIG_USB_MUSB_DUAL_ROLE */
+
 enum buffer_map_state {
 	UN_MAPPED = 0,
 	PRE_MAPPED,
@@ -106,14 +138,8 @@
 	return container_of(queue->next, struct musb_request, list);
 }
 
-extern void musb_g_tx(struct musb *musb, u8 epnum);
-extern void musb_g_rx(struct musb *musb, u8 epnum);
-
 extern const struct usb_ep_ops musb_g_ep0_ops;
 
-extern int musb_gadget_setup(struct musb *);
-extern void musb_gadget_cleanup(struct musb *);
-
 extern void musb_g_giveback(struct musb_ep *, struct usb_request *, int);
 
 extern void musb_ep_restart(struct musb *, struct musb_request *);

diff --git a/drivers/usb/musb/musb_host.c b/drivers/usb/musb/musb_host.c
index 9d3044b..a9695f5 100644
--- a/drivers/usb/musb/musb_host.c
+++ b/drivers/usb/musb/musb_host.c

@@ -46,7 +46,6 @@
 #include "musb_core.h"
 #include "musb_host.h"
 
-
 /* MUSB HOST status 22-mar-2006
  *
  * - There's still lots of partial code duplication for fault paths, so
@@ -96,6 +95,11 @@
  * of transfers between endpoints, or anything clever.
  */
 
+struct musb *hcd_to_musb(struct usb_hcd *hcd)
+{
+	return *(struct musb **) hcd->hcd_priv;	/* hcd_priv holds the musb pointer stored by musb_host_alloc() */
+}
+
 
 static void musb_ep_program(struct musb *musb, u8 epnum,
 			struct urb *urb, int is_out,
@@ -269,8 +273,7 @@
 		/* FIXME this doesn't implement that scheduling policy ...
 		 * or handle framecounter wrapping
 		 */
-		if ((urb->transfer_flags & URB_ISO_ASAP)
-				|| (frame >= urb->start_frame)) {
+		if (1) {	/* Always assume URB_ISO_ASAP */
 			/* REVISIT the SOF irq handler shouldn't duplicate
 			 * this code; and we don't init urb->start_frame...
 			 */
@@ -311,9 +314,9 @@
 			urb->actual_length, urb->transfer_buffer_length
 			);
 
-	usb_hcd_unlink_urb_from_ep(musb_to_hcd(musb), urb);
+	usb_hcd_unlink_urb_from_ep(musb->hcd, urb);
 	spin_unlock(&musb->lock);
-	usb_hcd_giveback_urb(musb_to_hcd(musb), urb, status);
+	usb_hcd_giveback_urb(musb->hcd, urb, status);
 	spin_lock(&musb->lock);
 }
 
@@ -625,7 +628,7 @@
 	u16			csr;
 	u8			mode;
 
-#ifdef	CONFIG_USB_INVENTRA_DMA
+#if defined(CONFIG_USB_INVENTRA_DMA) || defined(CONFIG_USB_UX500_DMA)
 	if (length > channel->max_len)
 		length = channel->max_len;
 
@@ -1455,7 +1458,7 @@
 	if (length > qh->maxpacket)
 		length = qh->maxpacket;
 	/* Unmap the buffer so that CPU can use it */
-	usb_hcd_unmap_urb_for_dma(musb_to_hcd(musb), urb);
+	usb_hcd_unmap_urb_for_dma(musb->hcd, urb);
 
 	/*
 	 * We need to map sg if the transfer_buffer is
@@ -1657,7 +1660,7 @@
 
 	/* FIXME this is _way_ too much in-line logic for Mentor DMA */
 
-#ifndef CONFIG_USB_INVENTRA_DMA
+#if !defined(CONFIG_USB_INVENTRA_DMA) && !defined(CONFIG_USB_UX500_DMA)
 	if (rx_csr & MUSB_RXCSR_H_REQPKT)  {
 		/* REVISIT this happened for a while on some short reads...
 		 * the cleanup still needs investigation... looks bad...
@@ -1689,7 +1692,7 @@
 			| MUSB_RXCSR_RXPKTRDY);
 		musb_writew(hw_ep->regs, MUSB_RXCSR, val);
 
-#ifdef CONFIG_USB_INVENTRA_DMA
+#if defined(CONFIG_USB_INVENTRA_DMA) || defined(CONFIG_USB_UX500_DMA)
 		if (usb_pipeisoc(pipe)) {
 			struct usb_iso_packet_descriptor *d;
 
@@ -1745,7 +1748,7 @@
 		}
 
 		/* we are expecting IN packets */
-#ifdef CONFIG_USB_INVENTRA_DMA
+#if defined(CONFIG_USB_INVENTRA_DMA) || defined(CONFIG_USB_UX500_DMA)
 		if (dma) {
 			struct dma_controller	*c;
 			u16			rx_count;
@@ -1754,10 +1757,10 @@
 
 			rx_count = musb_readw(epio, MUSB_RXCOUNT);
 
-			dev_dbg(musb->controller, "RX%d count %d, buffer 0x%x len %d/%d\n",
+			dev_dbg(musb->controller, "RX%d count %d, buffer 0x%llx len %d/%d\n",
 					epnum, rx_count,
-					urb->transfer_dma
-						+ urb->actual_length,
+					(unsigned long long) urb->transfer_dma
+					+ urb->actual_length,
 					qh->offset,
 					urb->transfer_buffer_length);
 
@@ -1869,7 +1872,7 @@
 			unsigned int received_len;
 
 			/* Unmap the buffer so that CPU can use it */
-			usb_hcd_unmap_urb_for_dma(musb_to_hcd(musb), urb);
+			usb_hcd_unmap_urb_for_dma(musb->hcd, urb);
 
 			/*
 			 * We need to map sg if the transfer_buffer is
@@ -2463,7 +2466,6 @@
 	return 0;
 }
 
-
 #ifndef CONFIG_MUSB_PIO_ONLY
 
 #define MUSB_USB_DMA_ALIGN 4
@@ -2575,10 +2577,10 @@
 }
 #endif /* !CONFIG_MUSB_PIO_ONLY */
 
-const struct hc_driver musb_hc_driver = {
+static const struct hc_driver musb_hc_driver = {
 	.description		= "musb-hcd",
 	.product_desc		= "MUSB HDRC host driver",
-	.hcd_priv_size		= sizeof(struct musb),
+	.hcd_priv_size		= sizeof(struct musb *),
 	.flags			= HCD_USB2 | HCD_MEMORY,
 
 	/* not using irq handler or reset hooks from usbcore, since
@@ -2606,3 +2608,66 @@
 	/* .start_port_reset	= NULL, */
 	/* .hub_irq_enable	= NULL, */
 };
+
+/*
+ * Create the usb_hcd for this controller and link it back to @musb.
+ *
+ * The hcd private area (hcd_priv_size is sizeof(struct musb *)) holds only a
+ * pointer to @musb; hcd_to_musb() reads that pointer back.
+ *
+ * Returns 0 on success, or -ENOMEM when usb_create_hcd() returns NULL.
+ */
+int musb_host_alloc(struct musb *musb)
+{
+	struct device	*dev = musb->controller;
+
+	/* usbcore sets dev->driver_data to hcd, and sometimes uses that... */
+	musb->hcd = usb_create_hcd(&musb_hc_driver, dev, dev_name(dev));
+	if (!musb->hcd)
+		return -ENOMEM;
+
+	/* store through the same pointer type that hcd_to_musb() dereferences */
+	*(struct musb **) musb->hcd->hcd_priv = musb;
+	musb->hcd->self.uses_pio_for_control = 1;
+	musb->hcd->uses_new_polling = 1;
+	musb->hcd->has_tt = 1;
+
+	return 0;
+}
+
+void musb_host_cleanup(struct musb *musb)
+{
+	usb_remove_hcd(musb->hcd);	/* undoes the usb_add_hcd() done in musb_host_setup() */
+	musb->hcd = NULL;	/* NOTE(review): if musb_host_free() runs after this,
+				 * usb_put_hcd() receives NULL - confirm the hcd
+				 * reference is still dropped somewhere */
+}
+
+void musb_host_free(struct musb *musb)
+{
+	usb_put_hcd(musb->hcd);	/* hands the hcd created in musb_host_alloc() back to usbcore */
+}
+
+int musb_host_setup(struct musb *musb, int power_budget)
+{	/* applies A-device defaults, attaches the hcd to the OTG data, then registers it; returns 0 or the negative usb_add_hcd() result */
+	int ret;
+	struct usb_hcd *hcd = musb->hcd;	/* created earlier by musb_host_alloc() */
+
+	MUSB_HST_MODE(musb);
+	musb->xceiv->otg->default_a = 1;
+	musb->xceiv->state = OTG_STATE_A_IDLE;
+
+	otg_set_host(musb->xceiv->otg, &hcd->self);	/* return value is not checked */
+	hcd->self.otg_port = 1;
+	musb->xceiv->otg->host = &hcd->self;	/* set directly as well, whatever otg_set_host() did */
+	hcd->power_budget = 2 * (power_budget ? : 250);	/* 250 stands in when power_budget is 0 */
+
+	ret = usb_add_hcd(hcd, 0, 0);	/* second and third arguments both passed as 0 */
+	if (ret < 0)
+		return ret;
+
+	return 0;
+}
+
+void musb_host_resume_root_hub(struct musb *musb)
+{
+	usb_hcd_resume_root_hub(musb->hcd);	/* thin wrapper: forwards the musb-owned hcd to usbcore */
+}
+
+void musb_host_poke_root_hub(struct musb *musb)
+{
+	MUSB_HST_MODE(musb);
+	if (musb->hcd->status_urb)	/* status_urb is set: poll root-hub status right away */
+		usb_hcd_poll_rh_status(musb->hcd);
+	else				/* no status_urb: ask usbcore to resume the root hub instead */
+		usb_hcd_resume_root_hub(musb->hcd);
+}

diff --git a/drivers/usb/musb/musb_host.h b/drivers/usb/musb/musb_host.h
index 738f7eb..960d735 100644
--- a/drivers/usb/musb/musb_host.h
+++ b/drivers/usb/musb/musb_host.h

@@ -37,16 +37,6 @@
 
 #include <linux/scatterlist.h>
 
-static inline struct usb_hcd *musb_to_hcd(struct musb *musb)
-{
-	return container_of((void *) musb, struct usb_hcd, hcd_priv);
-}
-
-static inline struct musb *hcd_to_musb(struct usb_hcd *hcd)
-{
-	return (struct musb *) (hcd->hcd_priv);
-}
-
 /* stored in "usb_host_endpoint.hcpriv" for scheduled endpoints */
 struct musb_qh {
 	struct usb_host_endpoint *hep;		/* usbcore info */
@@ -86,7 +76,52 @@
 }
 
 
+#if IS_ENABLED(CONFIG_USB_MUSB_HOST) || IS_ENABLED(CONFIG_USB_MUSB_DUAL_ROLE)
+/* host-role entry points; each one is declared exactly once */
+extern struct musb *hcd_to_musb(struct usb_hcd *);
+extern irqreturn_t musb_h_ep0_irq(struct musb *);
+extern int musb_host_alloc(struct musb *);
+extern int musb_host_setup(struct musb *, int);
+extern void musb_host_cleanup(struct musb *);
+extern void musb_host_tx(struct musb *, u8);
+extern void musb_host_rx(struct musb *, u8);
 extern void musb_root_disconnect(struct musb *musb);
+extern void musb_host_free(struct musb *);
+extern void musb_host_resume_root_hub(struct musb *musb);
+extern void musb_host_poke_root_hub(struct musb *musb);
+#else	/* neither host nor dual-role configured: empty stubs with the same signatures */
+static inline struct musb *hcd_to_musb(struct usb_hcd *hcd)
+{
+	return NULL;
+}
+
+static inline irqreturn_t musb_h_ep0_irq(struct musb *musb)
+{
+	return 0;
+}
+
+static inline int musb_host_alloc(struct musb *musb)
+{
+	return 0;	/* reports success so callers continue without an hcd */
+}
+
+static inline int musb_host_setup(struct musb *musb, int power_budget)
+{
+	return 0;
+}
+
+static inline void musb_host_cleanup(struct musb *musb)		{}
+static inline void musb_host_free(struct musb *musb)		{}
+static inline void musb_host_tx(struct musb *musb, u8 epnum)	{}
+static inline void musb_host_rx(struct musb *musb, u8 epnum)	{}
+static inline void musb_root_disconnect(struct musb *musb)	{}
+static inline void musb_host_resume_root_hub(struct musb *musb)	{}
+/* NOTE(review): no extern musb_host_poll_rh_status() exists in the branch above - confirm this stub is still needed */
+static inline void musb_host_poll_rh_status(struct musb *musb)	{}
+static inline void musb_host_poke_root_hub(struct musb *musb)	{}
+#endif /* CONFIG_USB_MUSB_HOST || CONFIG_USB_MUSB_DUAL_ROLE */
 
 struct usb_hcd;
 
@@ -95,8 +130,6 @@
 			u16 typeReq, u16 wValue, u16 wIndex,
 			char *buf, u16 wLength);
 
-extern const struct hc_driver musb_hc_driver;
-
 static inline struct urb *next_urb(struct musb_qh *qh)
 {
 	struct list_head	*queue;

diff --git a/drivers/usb/musb/musb_virthub.c b/drivers/usb/musb/musb_virthub.c
index ef7d110..a523950 100644
--- a/drivers/usb/musb/musb_virthub.c
+++ b/drivers/usb/musb/musb_virthub.c

@@ -44,6 +44,51 @@
 
 #include "musb_core.h"
 
+/*
+ * Program the HDRC to start (enable interrupts, dma, etc.).
+ *
+ * Enables the endpoint and USB interrupts, clears test mode, selects
+ * high-speed operation, then decides from port_mode and the DEVCTL VBUS
+ * field whether to set the SESSION bit before writing DEVCTL back.
+ */
+static void musb_start(struct musb *musb)
+{
+	void __iomem	*regs = musb->mregs;
+	u8		devctl = musb_readb(regs, MUSB_DEVCTL);
+
+	dev_dbg(musb->controller, "<== devctl %02x\n", devctl);
+
+	/*  Set INT enable registers, enable interrupts */
+	musb->intrtxe = musb->epmask;
+	musb_writew(regs, MUSB_INTRTXE, musb->intrtxe);
+	musb->intrrxe = musb->epmask & 0xfffe;	/* bit 0 is masked out of the RX enable set */
+	musb_writew(regs, MUSB_INTRRXE, musb->intrrxe);
+	musb_writeb(regs, MUSB_INTRUSBE, 0xf7);	/* every bit except bit 3 */
+
+	musb_writeb(regs, MUSB_TESTMODE, 0);
+
+	/* put into basic highspeed mode and start session */
+	musb_writeb(regs, MUSB_POWER, MUSB_POWER_ISOUPDATE
+						| MUSB_POWER_HSENAB
+						/* ENSUSPEND wedges tusb */
+						/* | MUSB_POWER_ENSUSPEND */
+						);
+
+	musb->is_active = 0;
+	devctl = musb_readb(regs, MUSB_DEVCTL);	/* re-read: the value above was only used for the debug print */
+	devctl &= ~MUSB_DEVCTL_SESSION;	/* start from "no session"; the branch below may set it again */
+
+	/* session started after:
+	 * (a) ID-grounded irq, host mode;
+	 * (b) vbus present/connect IRQ, peripheral mode;
+	 * (c) peripheral initiates, using SRP
+	 */
+	if (musb->port_mode != MUSB_PORT_MODE_HOST &&
+	    (devctl & MUSB_DEVCTL_VBUS) == MUSB_DEVCTL_VBUS) {	/* non-host port with every VBUS bit set */
+		musb->is_active = 1;	/* SESSION stays cleared in this branch */
+	} else {
+		devctl |= MUSB_DEVCTL_SESSION;	/* host-only port, or VBUS field not fully set */
+	}
+
+	musb_platform_enable(musb);
+	musb_writeb(regs, MUSB_DEVCTL, devctl);	/* DEVCTL is written last, after musb_platform_enable() */
+}
 
 static void musb_port_suspend(struct musb *musb, bool do_suspend)
 {
@@ -145,7 +190,6 @@
 			msleep(1);
 		}
 
-		musb->ignore_disconnect = true;
 		power &= 0xf0;
 		musb_writeb(mbase, MUSB_POWER,
 				power | MUSB_POWER_RESET);
@@ -158,8 +202,6 @@
 		musb_writeb(mbase, MUSB_POWER,
 				power & ~MUSB_POWER_RESET);
 
-		musb->ignore_disconnect = false;
-
 		power = musb_readb(mbase, MUSB_POWER);
 		if (power & MUSB_POWER_HSMODE) {
 			dev_dbg(musb->controller, "high-speed device connected\n");
@@ -170,7 +212,7 @@
 		musb->port1_status |= USB_PORT_STAT_ENABLE
 					| (USB_PORT_STAT_C_RESET << 16)
 					| (USB_PORT_STAT_C_ENABLE << 16);
-		usb_hcd_poll_rh_status(musb_to_hcd(musb));
+		usb_hcd_poll_rh_status(musb->hcd);
 
 		musb->vbuserr_retry = VBUSERR_RETRY_COUNT;
 	}
@@ -183,7 +225,7 @@
 	musb->port1_status = USB_PORT_STAT_POWER
 			| (USB_PORT_STAT_C_CONNECTION << 16);
 
-	usb_hcd_poll_rh_status(musb_to_hcd(musb));
+	usb_hcd_poll_rh_status(musb->hcd);
 	musb->is_active = 0;
 
 	switch (musb->xceiv->state) {
@@ -337,7 +379,7 @@
 			musb->port1_status &= ~(USB_PORT_STAT_SUSPEND
 					| MUSB_PORT_STAT_RESUME);
 			musb->port1_status |= USB_PORT_STAT_C_SUSPEND << 16;
-			usb_hcd_poll_rh_status(musb_to_hcd(musb));
+			usb_hcd_poll_rh_status(musb->hcd);
 			/* NOTE: it might really be A_WAIT_BCON ... */
 			musb->xceiv->state = OTG_STATE_A_HOST;
 		}

diff --git a/drivers/usb/musb/omap2430.c b/drivers/usb/musb/omap2430.c
index 628b93f..4315d35 100644
--- a/drivers/usb/musb/omap2430.c
+++ b/drivers/usb/musb/omap2430.c

@@ -87,7 +87,7 @@
 			musb->port1_status &= ~(USB_PORT_STAT_SUSPEND
 						| MUSB_PORT_STAT_RESUME);
 			musb->port1_status |= USB_PORT_STAT_C_SUSPEND << 16;
-			usb_hcd_poll_rh_status(musb_to_hcd(musb));
+			usb_hcd_poll_rh_status(musb->hcd);
 			/* NOTE: it might really be A_WAIT_BCON ... */
 			musb->xceiv->state = OTG_STATE_A_HOST;
 		}
@@ -481,6 +481,7 @@
 
 static int omap2430_probe(struct platform_device *pdev)
 {
+	struct resource			musb_resources[2];
 	struct musb_hdrc_platform_data	*pdata = pdev->dev.platform_data;
 	struct omap_musb_board_data	*data;
 	struct platform_device		*musb;
@@ -567,8 +568,21 @@
 
 	INIT_WORK(&glue->omap_musb_mailbox_work, omap_musb_mailbox_work);
 
-	ret = platform_device_add_resources(musb, pdev->resource,
-			pdev->num_resources);
+	memset(musb_resources, 0x00, sizeof(*musb_resources) *
+			ARRAY_SIZE(musb_resources));
+
+	musb_resources[0].name = pdev->resource[0].name;
+	musb_resources[0].start = pdev->resource[0].start;
+	musb_resources[0].end = pdev->resource[0].end;
+	musb_resources[0].flags = pdev->resource[0].flags;
+
+	musb_resources[1].name = pdev->resource[1].name;
+	musb_resources[1].start = pdev->resource[1].start;
+	musb_resources[1].end = pdev->resource[1].end;
+	musb_resources[1].flags = pdev->resource[1].flags;
+
+	ret = platform_device_add_resources(musb, musb_resources,
+			ARRAY_SIZE(musb_resources));
 	if (ret) {
 		dev_err(&pdev->dev, "failed to add resources\n");
 		goto err2;

diff --git a/drivers/usb/musb/tusb6010.c b/drivers/usb/musb/tusb6010.c
index 7369ba3..2c06a89 100644
--- a/drivers/usb/musb/tusb6010.c
+++ b/drivers/usb/musb/tusb6010.c

@@ -1156,6 +1156,7 @@
 
 static int tusb_probe(struct platform_device *pdev)
 {
+	struct resource musb_resources[2];
 	struct musb_hdrc_platform_data	*pdata = pdev->dev.platform_data;
 	struct platform_device		*musb;
 	struct tusb6010_glue		*glue;
@@ -1185,8 +1186,21 @@
 
 	platform_set_drvdata(pdev, glue);
 
-	ret = platform_device_add_resources(musb, pdev->resource,
-			pdev->num_resources);
+	memset(musb_resources, 0x00, sizeof(*musb_resources) *
+			ARRAY_SIZE(musb_resources));
+
+	musb_resources[0].name = pdev->resource[0].name;
+	musb_resources[0].start = pdev->resource[0].start;
+	musb_resources[0].end = pdev->resource[0].end;
+	musb_resources[0].flags = pdev->resource[0].flags;
+
+	musb_resources[1].name = pdev->resource[1].name;
+	musb_resources[1].start = pdev->resource[1].start;
+	musb_resources[1].end = pdev->resource[1].end;
+	musb_resources[1].flags = pdev->resource[1].flags;
+
+	ret = platform_device_add_resources(musb, musb_resources,
+			ARRAY_SIZE(musb_resources));
 	if (ret) {
 		dev_err(&pdev->dev, "failed to add resources\n");
 		goto err3;

diff --git a/drivers/usb/musb/ux500.c b/drivers/usb/musb/ux500.c
index 2c80004..028ff4d 100644
--- a/drivers/usb/musb/ux500.c
+++ b/drivers/usb/musb/ux500.c

@@ -189,6 +189,7 @@
 
 static int ux500_probe(struct platform_device *pdev)
 {
+	struct resource musb_resources[2];
 	struct musb_hdrc_platform_data	*pdata = pdev->dev.platform_data;
 	struct platform_device		*musb;
 	struct ux500_glue		*glue;
@@ -232,8 +233,21 @@
 
 	platform_set_drvdata(pdev, glue);
 
-	ret = platform_device_add_resources(musb, pdev->resource,
-			pdev->num_resources);
+	memset(musb_resources, 0x00, sizeof(*musb_resources) *
+			ARRAY_SIZE(musb_resources));
+
+	musb_resources[0].name = pdev->resource[0].name;
+	musb_resources[0].start = pdev->resource[0].start;
+	musb_resources[0].end = pdev->resource[0].end;
+	musb_resources[0].flags = pdev->resource[0].flags;
+
+	musb_resources[1].name = pdev->resource[1].name;
+	musb_resources[1].start = pdev->resource[1].start;
+	musb_resources[1].end = pdev->resource[1].end;
+	musb_resources[1].flags = pdev->resource[1].flags;
+
+	ret = platform_device_add_resources(musb, musb_resources,
+			ARRAY_SIZE(musb_resources));
 	if (ret) {
 		dev_err(&pdev->dev, "failed to add resources\n");
 		goto err5;

diff --git a/drivers/usb/musb/ux500_dma.c b/drivers/usb/musb/ux500_dma.c
index 3381206..63e7c8a 100644
--- a/drivers/usb/musb/ux500_dma.c
+++ b/drivers/usb/musb/ux500_dma.c

@@ -71,8 +71,7 @@
 	spin_lock_irqsave(&musb->lock, flags);
 	ux500_channel->channel.actual_len = ux500_channel->cur_len;
 	ux500_channel->channel.status = MUSB_DMA_STATUS_FREE;
-	musb_dma_completion(musb, hw_ep->epnum,
-		ux500_channel->is_tx);
+	musb_dma_completion(musb, hw_ep->epnum, ux500_channel->is_tx);
 	spin_unlock_irqrestore(&musb->lock, flags);
 
 }
@@ -366,7 +365,8 @@
 	kfree(controller);
 }
 
-struct dma_controller *dma_controller_create(struct musb *musb, void __iomem *base)
+struct dma_controller *dma_controller_create(struct musb *musb,
+					void __iomem *base)
 {
 	struct ux500_dma_controller *controller;
 	struct platform_device *pdev = to_platform_device(musb->controller);

diff --git a/drivers/usb/phy/Kconfig b/drivers/usb/phy/Kconfig
index 2311b1e..a5a9552 100644
--- a/drivers/usb/phy/Kconfig
+++ b/drivers/usb/phy/Kconfig

@@ -92,7 +92,7 @@
 	  on/off the PHY.
 
 config SAMSUNG_USBPHY
-	tristate "Samsung USB PHY Driver"
+	tristate
 	help
 	  Enable this to support Samsung USB phy helper driver for Samsung SoCs.
 	  This driver provides common interface to interact, for Samsung USB 2.0 PHY

diff --git a/drivers/usb/phy/Makefile b/drivers/usb/phy/Makefile
index a9169cb..070eca3 100644
--- a/drivers/usb/phy/Makefile
+++ b/drivers/usb/phy/Makefile

@@ -5,6 +5,7 @@
 ccflags-$(CONFIG_USB_DEBUG) := -DDEBUG
 
 obj-$(CONFIG_USB_PHY)			+= phy.o
+obj-$(CONFIG_OF)			+= of.o
 
 # transceiver drivers, keep the list sorted
 

diff --git a/drivers/usb/phy/of.c b/drivers/usb/phy/of.c
new file mode 100644
index 0000000..7ea0154
--- /dev/null
+++ b/drivers/usb/phy/of.c

@@ -0,0 +1,47 @@
+/*
+ * USB OF (device tree) helper code
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/usb/of.h>
+#include <linux/usb/otg.h>
+
+static const char *const usbphy_modes[] = {	/* indexed by enum usb_phy_interface; strings are matched against 'phy_type' */
+	[USBPHY_INTERFACE_MODE_UNKNOWN]	= "",
+	[USBPHY_INTERFACE_MODE_UTMI]	= "utmi",
+	[USBPHY_INTERFACE_MODE_UTMIW]	= "utmi_wide",
+	[USBPHY_INTERFACE_MODE_ULPI]	= "ulpi",
+	[USBPHY_INTERFACE_MODE_SERIAL]	= "serial",
+	[USBPHY_INTERFACE_MODE_HSIC]	= "hsic",
+};
+
+/**
+ * of_usb_get_phy_mode - Get phy mode for given device_node
+ * @np:	Pointer to the given device_node
+ *
+ * The function gets the phy interface string from property 'phy_type'
+ * and returns the corresponding enum usb_phy_interface.
+ */
+enum usb_phy_interface of_usb_get_phy_mode(struct device_node *np)
+{
+	const char *phy_type;
+	int err, i;
+
+	err = of_property_read_string(np, "phy_type", &phy_type);
+	if (err < 0)
+		return USBPHY_INTERFACE_MODE_UNKNOWN;	/* property could not be read */
+
+	for (i = 0; i < ARRAY_SIZE(usbphy_modes); i++)
+		if (!strcmp(phy_type, usbphy_modes[i]))
+			return i;	/* the table index is returned as the enum value */
+
+	return USBPHY_INTERFACE_MODE_UNKNOWN;	/* string matched no table entry */
+}
+EXPORT_SYMBOL_GPL(of_usb_get_phy_mode);

diff --git a/drivers/usb/phy/phy-ab8500-usb.c b/drivers/usb/phy/phy-ab8500-usb.c
index e5eb1b5..0874023 100644
--- a/drivers/usb/phy/phy-ab8500-usb.c
+++ b/drivers/usb/phy/phy-ab8500-usb.c

@@ -1,10 +1,12 @@
 /*
  * drivers/usb/otg/ab8500_usb.c
  *
- * USB transceiver driver for AB8500 chip
+ * USB transceiver driver for AB8500 family chips
  *
- * Copyright (C) 2010 ST-Ericsson AB
+ * Copyright (C) 2010-2013 ST-Ericsson AB
  * Mian Yousaf Kaukab <mian.yousaf.kaukab@stericsson.com>
+ * Avinash Kumar <avinash.kumar@stericsson.com>
+ * Thirupathi Chippakurthy <thirupathi.chippakurthy@stericsson.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -29,6 +31,8 @@
 #include <linux/notifier.h>
 #include <linux/interrupt.h>
 #include <linux/delay.h>
+#include <linux/clk.h>
+#include <linux/err.h>
 #include <linux/mfd/abx500.h>
 #include <linux/mfd/abx500/ab8500.h>
 #include <linux/usb/musb-ux500.h>
@@ -41,21 +45,34 @@
 /* Bank AB8500_USB */
 #define AB8500_USB_LINE_STAT_REG 0x80
 #define AB8505_USB_LINE_STAT_REG 0x94
+#define AB8540_USB_LINK_STAT_REG 0x94
+#define AB9540_USB_LINK_STAT_REG 0x94
+#define AB8540_USB_OTG_CTL_REG 0x87
 #define AB8500_USB_PHY_CTRL_REG 0x8A
+#define AB8540_VBUS_CTRL_REG 0x82
 
 /* Bank AB8500_DEVELOPMENT */
 #define AB8500_BANK12_ACCESS 0x00
 
 /* Bank AB8500_DEBUG */
+#define AB8540_DEBUG 0x32
 #define AB8500_USB_PHY_TUNE1 0x05
 #define AB8500_USB_PHY_TUNE2 0x06
 #define AB8500_USB_PHY_TUNE3 0x07
 
+/* Bank AB8500_INTERRUPT */
+#define AB8500_IT_SOURCE2_REG 0x01
+
 #define AB8500_BIT_OTG_STAT_ID (1 << 0)
 #define AB8500_BIT_PHY_CTRL_HOST_EN (1 << 0)
 #define AB8500_BIT_PHY_CTRL_DEVICE_EN (1 << 1)
 #define AB8500_BIT_WD_CTRL_ENABLE (1 << 0)
 #define AB8500_BIT_WD_CTRL_KICK (1 << 1)
+#define AB8500_BIT_SOURCE2_VBUSDET (1 << 7)
+#define AB8540_BIT_OTG_CTL_VBUS_VALID_ENA (1 << 0)
+#define AB8540_BIT_OTG_CTL_ID_HOST_ENA (1 << 1)
+#define AB8540_BIT_OTG_CTL_ID_DEV_ENA (1 << 5)
+#define AB8540_BIT_VBUS_CTRL_CHARG_DET_ENA (1 << 0)
 
 #define AB8500_WD_KICK_DELAY_US 100 /* usec */
 #define AB8500_WD_V11_DISABLE_DELAY_US 100 /* usec */
@@ -112,6 +129,68 @@
 	USB_LINK_MOTOROLA_FACTORY_CBL_PHY_EN_8505,
 };
 
+enum ab8540_usb_link_status {	/* entries mirror enum ab9540_usb_link_status one-for-one, with an _8540 suffix */
+	USB_LINK_NOT_CONFIGURED_8540 = 0,
+	USB_LINK_STD_HOST_NC_8540,
+	USB_LINK_STD_HOST_C_NS_8540,
+	USB_LINK_STD_HOST_C_S_8540,
+	USB_LINK_CDP_8540,
+	USB_LINK_RESERVED0_8540,
+	USB_LINK_RESERVED1_8540,
+	USB_LINK_DEDICATED_CHG_8540,
+	USB_LINK_ACA_RID_A_8540,
+	USB_LINK_ACA_RID_B_8540,
+	USB_LINK_ACA_RID_C_NM_8540,
+	USB_LINK_RESERVED2_8540,
+	USB_LINK_RESERVED3_8540,
+	USB_LINK_HM_IDGND_8540,
+	USB_LINK_CHARGERPORT_NOT_OK_8540,
+	USB_LINK_CHARGER_DM_HIGH_8540,
+	USB_LINK_PHYEN_NO_VBUS_NO_IDGND_8540,
+	USB_LINK_STD_UPSTREAM_NO_IDGNG_VBUS_8540,	/* NOTE(review): "IDGNG" looks like a typo for IDGND; identifier left unchanged */
+	USB_LINK_STD_UPSTREAM_8540,
+	USB_LINK_CHARGER_SE1_8540,
+	USB_LINK_CARKIT_CHGR_1_8540,
+	USB_LINK_CARKIT_CHGR_2_8540,
+	USB_LINK_ACA_DOCK_CHGR_8540,
+	USB_LINK_SAMSUNG_BOOT_CBL_PHY_EN_8540,
+	USB_LINK_SAMSUNG_BOOT_CBL_PHY_DISB_8540,
+	USB_LINK_SAMSUNG_UART_CBL_PHY_EN_8540,
+	USB_LINK_SAMSUNG_UART_CBL_PHY_DISB_8540,
+	USB_LINK_MOTOROLA_FACTORY_CBL_PHY_EN_8540
+};
+
+enum ab9540_usb_link_status {	/* link states switched on by ab9540_usb_link_status_update() */
+	USB_LINK_NOT_CONFIGURED_9540 = 0,
+	USB_LINK_STD_HOST_NC_9540,
+	USB_LINK_STD_HOST_C_NS_9540,
+	USB_LINK_STD_HOST_C_S_9540,
+	USB_LINK_CDP_9540,
+	USB_LINK_RESERVED0_9540,
+	USB_LINK_RESERVED1_9540,
+	USB_LINK_DEDICATED_CHG_9540,
+	USB_LINK_ACA_RID_A_9540,
+	USB_LINK_ACA_RID_B_9540,
+	USB_LINK_ACA_RID_C_NM_9540,
+	USB_LINK_RESERVED2_9540,
+	USB_LINK_RESERVED3_9540,
+	USB_LINK_HM_IDGND_9540,
+	USB_LINK_CHARGERPORT_NOT_OK_9540,
+	USB_LINK_CHARGER_DM_HIGH_9540,
+	USB_LINK_PHYEN_NO_VBUS_NO_IDGND_9540,
+	USB_LINK_STD_UPSTREAM_NO_IDGNG_VBUS_9540,	/* NOTE(review): "IDGNG" looks like a typo for IDGND; identifier left unchanged */
+	USB_LINK_STD_UPSTREAM_9540,
+	USB_LINK_CHARGER_SE1_9540,
+	USB_LINK_CARKIT_CHGR_1_9540,
+	USB_LINK_CARKIT_CHGR_2_9540,
+	USB_LINK_ACA_DOCK_CHGR_9540,
+	USB_LINK_SAMSUNG_BOOT_CBL_PHY_EN_9540,
+	USB_LINK_SAMSUNG_BOOT_CBL_PHY_DISB_9540,
+	USB_LINK_SAMSUNG_UART_CBL_PHY_EN_9540,
+	USB_LINK_SAMSUNG_UART_CBL_PHY_DISB_9540,
+	USB_LINK_MOTOROLA_FACTORY_CBL_PHY_EN_9540
+};
+
 enum ab8500_usb_mode {
 	USB_IDLE = 0,
 	USB_PERIPHERAL,
@@ -119,13 +198,30 @@
 	USB_DEDICATED_CHG
 };
 
+/* Register USB_LINK_STATUS interrupt */
+#define AB8500_USB_FLAG_USE_LINK_STATUS_IRQ	(1 << 0)
+/* Register ID_WAKEUP_F interrupt */
+#define AB8500_USB_FLAG_USE_ID_WAKEUP_IRQ	(1 << 1)
+/* Register VBUS_DET_F interrupt */
+#define AB8500_USB_FLAG_USE_VBUS_DET_IRQ	(1 << 2)
+/* Driver is using the ab-iddet driver */
+#define AB8500_USB_FLAG_USE_AB_IDDET		(1 << 3)
+/* Enable setting regulators voltage */
+#define AB8500_USB_FLAG_REGULATOR_SET_VOLTAGE	(1 << 4)
+/* Enable the check_vbus_status workaround */
+#define AB8500_USB_FLAG_USE_CHECK_VBUS_STATUS	(1 << 5)
+/* Enable the vbus host workaround */
+#define AB8500_USB_FLAG_USE_VBUS_HOST_QUIRK	(1 << 6)
+
 struct ab8500_usb {
 	struct usb_phy phy;
 	struct device *dev;
 	struct ab8500 *ab8500;
 	unsigned vbus_draw;
 	struct work_struct phy_dis_work;
+	struct work_struct vbus_event_work;
 	enum ab8500_usb_mode mode;
+	struct clk *sysclk;
 	struct regulator *v_ape;
 	struct regulator *v_musb;
 	struct regulator *v_ulpi;
@@ -133,6 +229,8 @@
 	int previous_link_status_state;
 	struct pinctrl *pinctrl;
 	struct pinctrl_state *pins_sleep;
+	bool enabled_charging_detection;
+	unsigned int flags;
 };
 
 static inline struct ab8500_usb *phy_to_ab(struct usb_phy *x)
@@ -171,7 +269,7 @@
 	if (ret)
 		dev_err(ab->dev, "Failed to enable v-ape\n");
 
-	if (!is_ab8500_2p0_or_earlier(ab->ab8500)) {
+	if (ab->flags & AB8500_USB_FLAG_REGULATOR_SET_VOLTAGE) {
 		ab->saved_v_ulpi = regulator_get_voltage(ab->v_ulpi);
 		if (ab->saved_v_ulpi < 0)
 			dev_err(ab->dev, "Failed to get v_ulpi voltage\n");
@@ -191,7 +289,7 @@
 	if (ret)
 		dev_err(ab->dev, "Failed to enable vddulpivio18\n");
 
-	if (!is_ab8500_2p0_or_earlier(ab->ab8500)) {
+	if (ab->flags & AB8500_USB_FLAG_REGULATOR_SET_VOLTAGE) {
 		volt = regulator_get_voltage(ab->v_ulpi);
 		if ((volt != 1300000) && (volt != 1350000))
 			dev_err(ab->dev, "Vintcore is not set to 1.3V volt=%d\n",
@@ -212,7 +310,7 @@
 	regulator_disable(ab->v_ulpi);
 
 	/* USB is not the only consumer of Vintcore, restore old settings */
-	if (!is_ab8500_2p0_or_earlier(ab->ab8500)) {
+	if (ab->flags & AB8500_USB_FLAG_REGULATOR_SET_VOLTAGE) {
 		if (ab->saved_v_ulpi > 0) {
 			ret = regulator_set_voltage(ab->v_ulpi,
 					ab->saved_v_ulpi, ab->saved_v_ulpi);
@@ -252,11 +350,23 @@
 	if (IS_ERR(ab->pinctrl))
 		dev_err(ab->dev, "could not get/set default pinstate\n");
 
+	if (clk_prepare_enable(ab->sysclk))
+		dev_err(ab->dev, "can't prepare/enable clock\n");
+
 	ab8500_usb_regulator_enable(ab);
 
 	abx500_mask_and_set_register_interruptible(ab->dev,
 			AB8500_USB, AB8500_USB_PHY_CTRL_REG,
 			bit, bit);
+
+	if (ab->flags & AB8500_USB_FLAG_USE_VBUS_HOST_QUIRK) {
+		if (sel_host)
+			abx500_set_register_interruptible(ab->dev,
+					AB8500_USB, AB8540_USB_OTG_CTL_REG,
+					AB8540_BIT_OTG_CTL_VBUS_VALID_ENA |
+					AB8540_BIT_OTG_CTL_ID_HOST_ENA |
+					AB8540_BIT_OTG_CTL_ID_DEV_ENA);
+	}
 }
 
 static void ab8500_usb_phy_disable(struct ab8500_usb *ab, bool sel_host)
@@ -274,6 +384,8 @@
 	/* Needed to disable the phy.*/
 	ab8500_usb_wd_workaround(ab);
 
+	clk_disable_unprepare(ab->sysclk);
+
 	ab8500_usb_regulator_disable(ab);
 
 	if (!IS_ERR(ab->pinctrl)) {
@@ -286,7 +398,8 @@
 		else if (pinctrl_select_state(ab->pinctrl, ab->pins_sleep))
 			dev_err(ab->dev, "could not set pins to sleep state\n");
 
-		/* as USB pins are shared with idddet, release them to allow
+		/*
+		 * as USB pins are shared with iddet, release them to allow
 		 * iddet to request them
 		 */
 		pinctrl_put(ab->pinctrl);
@@ -298,6 +411,254 @@
 #define ab8500_usb_peri_phy_en(ab)	ab8500_usb_phy_enable(ab, false)
 #define ab8500_usb_peri_phy_dis(ab)	ab8500_usb_phy_disable(ab, false)
 
+static int ab9540_usb_link_status_update(struct ab8500_usb *ab,
+		enum ab9540_usb_link_status lsts)
+{
+	enum ux500_musb_vbus_id_status event = 0;
+
+	dev_dbg(ab->dev, "ab9540_usb_link_status_update %d\n", lsts);
+
+	if (ab->previous_link_status_state == USB_LINK_HM_IDGND_9540 &&
+			(lsts == USB_LINK_STD_HOST_C_NS_9540 ||
+			 lsts == USB_LINK_STD_HOST_NC_9540))
+		return 0;
+
+	if (ab->previous_link_status_state == USB_LINK_ACA_RID_A_9540 &&
+			(lsts == USB_LINK_STD_HOST_NC_9540))
+		return 0;
+
+	ab->previous_link_status_state = lsts;
+
+	switch (lsts) {
+	case USB_LINK_ACA_RID_B_9540:
+		event = UX500_MUSB_RIDB;
+	case USB_LINK_NOT_CONFIGURED_9540:
+	case USB_LINK_RESERVED0_9540:
+	case USB_LINK_RESERVED1_9540:
+	case USB_LINK_RESERVED2_9540:
+	case USB_LINK_RESERVED3_9540:
+		if (ab->mode == USB_PERIPHERAL)
+			atomic_notifier_call_chain(&ab->phy.notifier,
+					UX500_MUSB_CLEAN, &ab->vbus_draw);
+		ab->mode = USB_IDLE;
+		ab->phy.otg->default_a = false;
+		ab->vbus_draw = 0;
+		if (event != UX500_MUSB_RIDB)
+			event = UX500_MUSB_NONE;
+		/* Fallback to default B_IDLE as nothing is connected. */
+		ab->phy.state = OTG_STATE_B_IDLE;
+		break;
+
+	case USB_LINK_ACA_RID_C_NM_9540:
+		event = UX500_MUSB_RIDC;
+	case USB_LINK_STD_HOST_NC_9540:
+	case USB_LINK_STD_HOST_C_NS_9540:
+	case USB_LINK_STD_HOST_C_S_9540:
+	case USB_LINK_CDP_9540:
+		if (ab->mode == USB_HOST) {
+			ab->mode = USB_PERIPHERAL;
+			ab8500_usb_host_phy_dis(ab);
+			ab8500_usb_peri_phy_en(ab);
+			atomic_notifier_call_chain(&ab->phy.notifier,
+					UX500_MUSB_PREPARE, &ab->vbus_draw);
+		}
+		if (ab->mode == USB_IDLE) {
+			ab->mode = USB_PERIPHERAL;
+			ab8500_usb_peri_phy_en(ab);
+			atomic_notifier_call_chain(&ab->phy.notifier,
+					UX500_MUSB_PREPARE, &ab->vbus_draw);
+		}
+		if (event != UX500_MUSB_RIDC)
+			event = UX500_MUSB_VBUS;
+		break;
+
+	case USB_LINK_ACA_RID_A_9540:
+		event = UX500_MUSB_RIDA;
+	case USB_LINK_HM_IDGND_9540:
+	case USB_LINK_STD_UPSTREAM_9540:
+		if (ab->mode == USB_PERIPHERAL) {
+			ab->mode = USB_HOST;
+			ab8500_usb_peri_phy_dis(ab);
+			ab8500_usb_host_phy_en(ab);
+			atomic_notifier_call_chain(&ab->phy.notifier,
+					UX500_MUSB_PREPARE, &ab->vbus_draw);
+		}
+		if (ab->mode == USB_IDLE) {
+			ab->mode = USB_HOST;
+			ab8500_usb_host_phy_en(ab);
+			atomic_notifier_call_chain(&ab->phy.notifier,
+					UX500_MUSB_PREPARE, &ab->vbus_draw);
+		}
+		ab->phy.otg->default_a = true;
+		if (event != UX500_MUSB_RIDA)
+			event = UX500_MUSB_ID;
+
+		atomic_notifier_call_chain(&ab->phy.notifier,
+				event, &ab->vbus_draw);
+		break;
+
+	case USB_LINK_DEDICATED_CHG_9540:
+		ab->mode = USB_DEDICATED_CHG;
+		event = UX500_MUSB_CHARGER;
+		atomic_notifier_call_chain(&ab->phy.notifier,
+				event, &ab->vbus_draw);
+		break;
+
+	case USB_LINK_PHYEN_NO_VBUS_NO_IDGND_9540:
+	case USB_LINK_STD_UPSTREAM_NO_IDGNG_VBUS_9540:
+		if (!(is_ab9540_2p0_or_earlier(ab->ab8500))) {
+			event = UX500_MUSB_NONE;
+			if (ab->mode == USB_HOST) {
+				ab->phy.otg->default_a = false;
+				ab->vbus_draw = 0;
+				atomic_notifier_call_chain(&ab->phy.notifier,
+						event, &ab->vbus_draw);
+				ab8500_usb_host_phy_dis(ab);
+				ab->mode = USB_IDLE;
+			}
+			if (ab->mode == USB_PERIPHERAL) {
+				atomic_notifier_call_chain(&ab->phy.notifier,
+						event, &ab->vbus_draw);
+				ab8500_usb_peri_phy_dis(ab);
+				atomic_notifier_call_chain(&ab->phy.notifier,
+						UX500_MUSB_CLEAN,
+						&ab->vbus_draw);
+				ab->mode = USB_IDLE;
+				ab->phy.otg->default_a = false;
+				ab->vbus_draw = 0;
+			}
+		}
+		break;
+
+	default:
+		break;
+	}
+
+	return 0;
+}
+
+static int ab8540_usb_link_status_update(struct ab8500_usb *ab,
+		enum ab8540_usb_link_status lsts)
+{
+	enum ux500_musb_vbus_id_status event = 0;
+
+	dev_dbg(ab->dev, "ab8540_usb_link_status_update %d\n", lsts);
+
+	if (ab->enabled_charging_detection) {
+		/* Disable USB Charger detection */
+		abx500_mask_and_set_register_interruptible(ab->dev,
+				AB8500_USB, AB8540_VBUS_CTRL_REG,
+				AB8540_BIT_VBUS_CTRL_CHARG_DET_ENA, 0x00);
+		ab->enabled_charging_detection = false;
+	}
+
+	/*
+	 * Spurious link_status interrupts are seen in case of a
+	 * disconnection of a device in IDGND and RIDA stage
+	 */
+	if (ab->previous_link_status_state == USB_LINK_HM_IDGND_8540 &&
+			(lsts == USB_LINK_STD_HOST_C_NS_8540 ||
+			 lsts == USB_LINK_STD_HOST_NC_8540))
+		return 0;
+
+	if (ab->previous_link_status_state == USB_LINK_ACA_RID_A_8540 &&
+			(lsts == USB_LINK_STD_HOST_NC_8540))
+		return 0;
+
+	ab->previous_link_status_state = lsts;
+
+	switch (lsts) {
+	case USB_LINK_ACA_RID_B_8540:
+		event = UX500_MUSB_RIDB;
+	case USB_LINK_NOT_CONFIGURED_8540:
+	case USB_LINK_RESERVED0_8540:
+	case USB_LINK_RESERVED1_8540:
+	case USB_LINK_RESERVED2_8540:
+	case USB_LINK_RESERVED3_8540:
+		ab->mode = USB_IDLE;
+		ab->phy.otg->default_a = false;
+		ab->vbus_draw = 0;
+		if (event != UX500_MUSB_RIDB)
+			event = UX500_MUSB_NONE;
+		/*
+		 * Fallback to default B_IDLE as nothing
+		 * is connected
+		 */
+		ab->phy.state = OTG_STATE_B_IDLE;
+		break;
+
+	case USB_LINK_ACA_RID_C_NM_8540:
+		event = UX500_MUSB_RIDC;
+	case USB_LINK_STD_HOST_NC_8540:
+	case USB_LINK_STD_HOST_C_NS_8540:
+	case USB_LINK_STD_HOST_C_S_8540:
+	case USB_LINK_CDP_8540:
+		if (ab->mode == USB_IDLE) {
+			ab->mode = USB_PERIPHERAL;
+			ab8500_usb_peri_phy_en(ab);
+			atomic_notifier_call_chain(&ab->phy.notifier,
+					UX500_MUSB_PREPARE, &ab->vbus_draw);
+		}
+		if (event != UX500_MUSB_RIDC)
+			event = UX500_MUSB_VBUS;
+		break;
+
+	case USB_LINK_ACA_RID_A_8540:
+	case USB_LINK_ACA_DOCK_CHGR_8540:
+		event = UX500_MUSB_RIDA;
+	case USB_LINK_HM_IDGND_8540:
+	case USB_LINK_STD_UPSTREAM_8540:
+		if (ab->mode == USB_IDLE) {
+			ab->mode = USB_HOST;
+			ab8500_usb_host_phy_en(ab);
+			atomic_notifier_call_chain(&ab->phy.notifier,
+					UX500_MUSB_PREPARE, &ab->vbus_draw);
+		}
+		ab->phy.otg->default_a = true;
+		if (event != UX500_MUSB_RIDA)
+			event = UX500_MUSB_ID;
+		atomic_notifier_call_chain(&ab->phy.notifier,
+				event, &ab->vbus_draw);
+		break;
+
+	case USB_LINK_DEDICATED_CHG_8540:
+		ab->mode = USB_DEDICATED_CHG;
+		event = UX500_MUSB_CHARGER;
+		atomic_notifier_call_chain(&ab->phy.notifier,
+				event, &ab->vbus_draw);
+		break;
+
+	case USB_LINK_PHYEN_NO_VBUS_NO_IDGND_8540:
+	case USB_LINK_STD_UPSTREAM_NO_IDGNG_VBUS_8540:
+		event = UX500_MUSB_NONE;
+		if (ab->mode == USB_HOST) {
+			ab->phy.otg->default_a = false;
+			ab->vbus_draw = 0;
+			atomic_notifier_call_chain(&ab->phy.notifier,
+					event, &ab->vbus_draw);
+			ab8500_usb_host_phy_dis(ab);
+			ab->mode = USB_IDLE;
+		}
+		if (ab->mode == USB_PERIPHERAL) {
+			atomic_notifier_call_chain(&ab->phy.notifier,
+					event, &ab->vbus_draw);
+			ab8500_usb_peri_phy_dis(ab);
+			atomic_notifier_call_chain(&ab->phy.notifier,
+					UX500_MUSB_CLEAN, &ab->vbus_draw);
+			ab->mode = USB_IDLE;
+			ab->phy.otg->default_a = false;
+			ab->vbus_draw = 0;
+		}
+		break;
+
+	default:
+		event = UX500_MUSB_NONE;
+		break;
+	}
+
+	return 0;
+}
+
 static int ab8505_usb_link_status_update(struct ab8500_usb *ab,
 		enum ab8505_usb_link_status lsts)
 {
@@ -498,6 +859,20 @@
 				AB8500_USB, AB8505_USB_LINE_STAT_REG, &reg);
 		lsts = (reg >> 3) & 0x1F;
 		ret = ab8505_usb_link_status_update(ab, lsts);
+	} else if (is_ab8540(ab->ab8500)) {
+		enum ab8540_usb_link_status lsts;
+
+		abx500_get_register_interruptible(ab->dev,
+				AB8500_USB, AB8540_USB_LINK_STAT_REG, &reg);
+		lsts = (reg >> 3) & 0xFF;
+		ret = ab8540_usb_link_status_update(ab, lsts);
+	} else if (is_ab9540(ab->ab8500)) {
+		enum ab9540_usb_link_status lsts;
+
+		abx500_get_register_interruptible(ab->dev,
+				AB8500_USB, AB9540_USB_LINK_STAT_REG, &reg);
+		lsts = (reg >> 3) & 0xFF;
+		ret = ab9540_usb_link_status_update(ab, lsts);
 	}
 
 	return ret;
@@ -553,7 +928,7 @@
 
 static irqreturn_t ab8500_usb_link_status_irq(int irq, void *data)
 {
-	struct ab8500_usb *ab = (struct ab8500_usb *) data;
+	struct ab8500_usb *ab = (struct ab8500_usb *)data;
 
 	abx500_usb_link_status_update(ab);
 
@@ -572,6 +947,69 @@
 		ab8500_usb_peri_phy_dis(ab);
 }
 
+/* Check if VBUS is set and linkstatus has not detected a cable. */
+static bool ab8500_usb_check_vbus_status(struct ab8500_usb *ab)
+{
+	u8 isource2;
+	u8 reg;
+	enum ab8540_usb_link_status lsts;
+
+	abx500_get_register_interruptible(ab->dev,
+			AB8500_INTERRUPT, AB8500_IT_SOURCE2_REG,
+			&isource2);
+
+	/* If Vbus is below 3.6V abort */
+	if (!(isource2 & AB8500_BIT_SOURCE2_VBUSDET))
+		return false;
+
+	abx500_get_register_interruptible(ab->dev,
+			AB8500_USB, AB8540_USB_LINK_STAT_REG,
+			&reg);
+
+	lsts = (reg >> 3) & 0xFF;
+
+	/* Check if linkstatus has detected a cable */
+	if (lsts)
+		return false;
+
+	return true;
+}
+
+/* re-trigger charger detection again with watchdog re-kick. */
+static void ab8500_usb_vbus_turn_on_event_work(struct work_struct *work)
+{
+	struct ab8500_usb *ab = container_of(work, struct ab8500_usb,
+			vbus_event_work);
+
+	if (ab->mode != USB_IDLE)
+		return;
+
+	abx500_set_register_interruptible(ab->dev,
+			AB8500_SYS_CTRL2_BLOCK, AB8500_MAIN_WD_CTRL_REG,
+			AB8500_BIT_WD_CTRL_ENABLE);
+
+	udelay(100);
+
+	abx500_set_register_interruptible(ab->dev,
+			AB8500_SYS_CTRL2_BLOCK, AB8500_MAIN_WD_CTRL_REG,
+			AB8500_BIT_WD_CTRL_ENABLE | AB8500_BIT_WD_CTRL_KICK);
+
+	udelay(100);
+
+	/* Disable Main watchdog */
+	abx500_set_register_interruptible(ab->dev,
+			AB8500_SYS_CTRL2_BLOCK, AB8500_MAIN_WD_CTRL_REG,
+			0x0);
+
+	/* Enable USB Charger detection */
+	abx500_mask_and_set_register_interruptible(ab->dev,
+			AB8500_USB, AB8540_VBUS_CTRL_REG,
+			AB8540_BIT_VBUS_CTRL_CHARG_DET_ENA,
+			AB8540_BIT_VBUS_CTRL_CHARG_DET_ENA);
+
+	ab->enabled_charging_detection = true;
+}
+
 static unsigned ab8500_eyediagram_workaroud(struct ab8500_usb *ab, unsigned mA)
 {
 	/*
@@ -627,7 +1065,7 @@
 	 * is fixed.
 	 */
 
-	if ((ab->mode != USB_IDLE) && (!gadget)) {
+	if ((ab->mode != USB_IDLE) && !gadget) {
 		ab->mode = USB_IDLE;
 		schedule_work(&ab->phy_dis_work);
 	}
@@ -651,7 +1089,7 @@
 	 * is fixed.
 	 */
 
-	if ((ab->mode != USB_IDLE) && (!host)) {
+	if ((ab->mode != USB_IDLE) && !host) {
 		ab->mode = USB_IDLE;
 		schedule_work(&ab->phy_dis_work);
 	}
@@ -659,6 +1097,33 @@
 	return 0;
 }
 
+static void ab8500_usb_restart_phy(struct ab8500_usb *ab)
+{
+	abx500_mask_and_set_register_interruptible(ab->dev,
+			AB8500_USB, AB8500_USB_PHY_CTRL_REG,
+			AB8500_BIT_PHY_CTRL_DEVICE_EN,
+			AB8500_BIT_PHY_CTRL_DEVICE_EN);
+
+	udelay(100);
+
+	abx500_mask_and_set_register_interruptible(ab->dev,
+			AB8500_USB, AB8500_USB_PHY_CTRL_REG,
+			AB8500_BIT_PHY_CTRL_DEVICE_EN,
+			0);
+
+	abx500_mask_and_set_register_interruptible(ab->dev,
+			AB8500_USB, AB8500_USB_PHY_CTRL_REG,
+			AB8500_BIT_PHY_CTRL_HOST_EN,
+			AB8500_BIT_PHY_CTRL_HOST_EN);
+
+	udelay(100);
+
+	abx500_mask_and_set_register_interruptible(ab->dev,
+			AB8500_USB, AB8500_USB_PHY_CTRL_REG,
+			AB8500_BIT_PHY_CTRL_HOST_EN,
+			0);
+}
+
 static int ab8500_usb_regulator_get(struct ab8500_usb *ab)
 {
 	int err;
@@ -693,48 +1158,197 @@
 	int err;
 	int irq;
 
-	irq = platform_get_irq_byname(pdev, "USB_LINK_STATUS");
-	if (irq < 0) {
-		dev_err(&pdev->dev, "Link status irq not found\n");
-		return irq;
-	}
-	err = devm_request_threaded_irq(&pdev->dev, irq, NULL,
-			ab8500_usb_link_status_irq,
-			IRQF_NO_SUSPEND | IRQF_SHARED, "usb-link-status", ab);
-	if (err < 0) {
-		dev_err(ab->dev, "request_irq failed for link status irq\n");
-		return err;
+	if (ab->flags & AB8500_USB_FLAG_USE_LINK_STATUS_IRQ) {
+		irq = platform_get_irq_byname(pdev, "USB_LINK_STATUS");
+		if (irq < 0) {
+			dev_err(&pdev->dev, "Link status irq not found\n");
+			return irq;
+		}
+		err = devm_request_threaded_irq(&pdev->dev, irq, NULL,
+				ab8500_usb_link_status_irq,
+				IRQF_NO_SUSPEND | IRQF_SHARED,
+				"usb-link-status", ab);
+		if (err < 0) {
+			dev_err(ab->dev, "request_irq failed for link status irq\n");
+			return err;
+		}
 	}
 
-	irq = platform_get_irq_byname(pdev, "ID_WAKEUP_F");
-	if (irq < 0) {
-		dev_err(&pdev->dev, "ID fall irq not found\n");
-		return irq;
-	}
-	err = devm_request_threaded_irq(&pdev->dev, irq, NULL,
-			ab8500_usb_disconnect_irq,
-			IRQF_NO_SUSPEND | IRQF_SHARED, "usb-id-fall", ab);
-	if (err < 0) {
-		dev_err(ab->dev, "request_irq failed for ID fall irq\n");
-		return err;
+	if (ab->flags & AB8500_USB_FLAG_USE_ID_WAKEUP_IRQ) {
+		irq = platform_get_irq_byname(pdev, "ID_WAKEUP_F");
+		if (irq < 0) {
+			dev_err(&pdev->dev, "ID fall irq not found\n");
+			return irq;
+		}
+		err = devm_request_threaded_irq(&pdev->dev, irq, NULL,
+				ab8500_usb_disconnect_irq,
+				IRQF_NO_SUSPEND | IRQF_SHARED,
+				"usb-id-fall", ab);
+		if (err < 0) {
+			dev_err(ab->dev, "request_irq failed for ID fall irq\n");
+			return err;
+		}
 	}
 
-	irq = platform_get_irq_byname(pdev, "VBUS_DET_F");
-	if (irq < 0) {
-		dev_err(&pdev->dev, "VBUS fall irq not found\n");
-		return irq;
-	}
-	err = devm_request_threaded_irq(&pdev->dev, irq, NULL,
-			ab8500_usb_disconnect_irq,
-			IRQF_NO_SUSPEND | IRQF_SHARED, "usb-vbus-fall", ab);
-	if (err < 0) {
-		dev_err(ab->dev, "request_irq failed for Vbus fall irq\n");
-		return err;
+	if (ab->flags & AB8500_USB_FLAG_USE_VBUS_DET_IRQ) {
+		irq = platform_get_irq_byname(pdev, "VBUS_DET_F");
+		if (irq < 0) {
+			dev_err(&pdev->dev, "VBUS fall irq not found\n");
+			return irq;
+		}
+		err = devm_request_threaded_irq(&pdev->dev, irq, NULL,
+				ab8500_usb_disconnect_irq,
+				IRQF_NO_SUSPEND | IRQF_SHARED,
+				"usb-vbus-fall", ab);
+		if (err < 0) {
+			dev_err(ab->dev, "request_irq failed for Vbus fall irq\n");
+			return err;
+		}
 	}
 
 	return 0;
 }
 
+static void ab8500_usb_set_ab8500_tuning_values(struct ab8500_usb *ab)
+{
+	int err;
+
+	/* Enable the PBT/Bank 0x12 access */
+	err = abx500_set_register_interruptible(ab->dev,
+			AB8500_DEVELOPMENT, AB8500_BANK12_ACCESS, 0x01);
+	if (err < 0)
+		dev_err(ab->dev, "Failed to enable bank12 access err=%d\n",
+				err);
+
+	err = abx500_set_register_interruptible(ab->dev,
+			AB8500_DEBUG, AB8500_USB_PHY_TUNE1, 0xC8);
+	if (err < 0)
+		dev_err(ab->dev, "Failed to set PHY_TUNE1 register err=%d\n",
+				err);
+
+	err = abx500_set_register_interruptible(ab->dev,
+			AB8500_DEBUG, AB8500_USB_PHY_TUNE2, 0x00);
+	if (err < 0)
+		dev_err(ab->dev, "Failed to set PHY_TUNE2 register err=%d\n",
+				err);
+
+	err = abx500_set_register_interruptible(ab->dev,
+			AB8500_DEBUG, AB8500_USB_PHY_TUNE3, 0x78);
+	if (err < 0)
+		dev_err(ab->dev, "Failed to set PHY_TUNE3 regester err=%d\n",
+				err);
+
+	/* Switch to normal mode/disable Bank 0x12 access */
+	err = abx500_set_register_interruptible(ab->dev,
+			AB8500_DEVELOPMENT, AB8500_BANK12_ACCESS, 0x00);
+	if (err < 0)
+		dev_err(ab->dev, "Failed to switch bank12 access err=%d\n",
+				err);
+}
+
+static void ab8500_usb_set_ab8505_tuning_values(struct ab8500_usb *ab)
+{
+	int err;
+
+	/* Enable the PBT/Bank 0x12 access (set bit 0 of BANK12_ACCESS) */
+	err = abx500_mask_and_set_register_interruptible(ab->dev,
+			AB8500_DEVELOPMENT, AB8500_BANK12_ACCESS,
+			0x01, 0x01);
+	if (err < 0)
+		dev_err(ab->dev, "Failed to enable bank12 access err=%d\n",
+				err);
+
+	err = abx500_mask_and_set_register_interruptible(ab->dev,
+			AB8500_DEBUG, AB8500_USB_PHY_TUNE1,
+			0xC8, 0xC8);
+	if (err < 0)
+		dev_err(ab->dev, "Failed to set PHY_TUNE1 register err=%d\n",
+				err);
+
+	err = abx500_mask_and_set_register_interruptible(ab->dev,
+			AB8500_DEBUG, AB8500_USB_PHY_TUNE2,
+			0x60, 0x60);
+	if (err < 0)
+		dev_err(ab->dev, "Failed to set PHY_TUNE2 register err=%d\n",
+				err);
+
+	err = abx500_mask_and_set_register_interruptible(ab->dev,
+			AB8500_DEBUG, AB8500_USB_PHY_TUNE3,
+			0xFC, 0x80);
+
+	if (err < 0)
+		dev_err(ab->dev, "Failed to set PHY_TUNE3 regester err=%d\n",
+				err);
+
+	/* Disable Bank 0x12 access: mask bit 0 so it is actually cleared */
+	err = abx500_mask_and_set_register_interruptible(ab->dev,
+			AB8500_DEVELOPMENT, AB8500_BANK12_ACCESS,
+			0x01, 0x00);
+	if (err < 0)
+		dev_err(ab->dev, "Failed to switch bank12 access err=%d\n",
+				err);
+}
+
+static void ab8500_usb_set_ab8540_tuning_values(struct ab8500_usb *ab)
+{
+	int err;
+
+	err = abx500_set_register_interruptible(ab->dev,
+			AB8540_DEBUG, AB8500_USB_PHY_TUNE1, 0xCC);
+	if (err < 0)
+		dev_err(ab->dev, "Failed to set PHY_TUNE1 register ret=%d\n",
+				err);
+
+	err = abx500_set_register_interruptible(ab->dev,
+			AB8540_DEBUG, AB8500_USB_PHY_TUNE2, 0x60);
+	if (err < 0)
+		dev_err(ab->dev, "Failed to set PHY_TUNE2 register ret=%d\n",
+				err);
+
+	err = abx500_set_register_interruptible(ab->dev,
+			AB8540_DEBUG, AB8500_USB_PHY_TUNE3, 0x90);
+	if (err < 0)
+		dev_err(ab->dev, "Failed to set PHY_TUNE3 regester ret=%d\n",
+				err);
+}
+
+static void ab8500_usb_set_ab9540_tuning_values(struct ab8500_usb *ab)
+{
+	int err;
+
+	/* Enable the PBT/Bank 0x12 access */
+	err = abx500_set_register_interruptible(ab->dev,
+			AB8500_DEVELOPMENT, AB8500_BANK12_ACCESS, 0x01);
+	if (err < 0)
+		dev_err(ab->dev, "Failed to enable bank12 access err=%d\n",
+				err);
+
+	err = abx500_set_register_interruptible(ab->dev,
+			AB8500_DEBUG, AB8500_USB_PHY_TUNE1, 0xC8);
+	if (err < 0)
+		dev_err(ab->dev, "Failed to set PHY_TUNE1 register err=%d\n",
+				err);
+
+	err = abx500_set_register_interruptible(ab->dev,
+			AB8500_DEBUG, AB8500_USB_PHY_TUNE2, 0x60);
+	if (err < 0)
+		dev_err(ab->dev, "Failed to set PHY_TUNE2 register err=%d\n",
+				err);
+
+	err = abx500_set_register_interruptible(ab->dev,
+			AB8500_DEBUG, AB8500_USB_PHY_TUNE3, 0x80);
+	if (err < 0)
+		dev_err(ab->dev, "Failed to set PHY_TUNE3 regester err=%d\n",
+				err);
+
+	/* Switch to normal mode/disable Bank 0x12 access */
+	err = abx500_set_register_interruptible(ab->dev,
+			AB8500_DEVELOPMENT, AB8500_BANK12_ACCESS, 0x00);
+	if (err < 0)
+		dev_err(ab->dev, "Failed to switch bank12 access err=%d\n",
+				err);
+}
+
 static int ab8500_usb_probe(struct platform_device *pdev)
 {
 	struct ab8500_usb	*ab;
@@ -772,6 +1386,33 @@
 	otg->set_host		= ab8500_usb_set_host;
 	otg->set_peripheral	= ab8500_usb_set_peripheral;
 
+	if (is_ab8500(ab->ab8500)) {
+		ab->flags |= AB8500_USB_FLAG_USE_LINK_STATUS_IRQ |
+			AB8500_USB_FLAG_USE_ID_WAKEUP_IRQ |
+			AB8500_USB_FLAG_USE_VBUS_DET_IRQ |
+			AB8500_USB_FLAG_REGULATOR_SET_VOLTAGE;
+	} else if (is_ab8505(ab->ab8500)) {
+		ab->flags |= AB8500_USB_FLAG_USE_LINK_STATUS_IRQ |
+			AB8500_USB_FLAG_USE_ID_WAKEUP_IRQ |
+			AB8500_USB_FLAG_USE_VBUS_DET_IRQ |
+			AB8500_USB_FLAG_REGULATOR_SET_VOLTAGE;
+	} else if (is_ab8540(ab->ab8500)) {
+		ab->flags |= AB8500_USB_FLAG_USE_LINK_STATUS_IRQ |
+			AB8500_USB_FLAG_USE_CHECK_VBUS_STATUS |
+			AB8500_USB_FLAG_USE_VBUS_HOST_QUIRK |
+			AB8500_USB_FLAG_REGULATOR_SET_VOLTAGE;
+	} else if (is_ab9540(ab->ab8500)) {
+		ab->flags |= AB8500_USB_FLAG_USE_LINK_STATUS_IRQ |
+			AB8500_USB_FLAG_REGULATOR_SET_VOLTAGE;
+		if (is_ab9540_2p0_or_earlier(ab->ab8500))
+			ab->flags |= AB8500_USB_FLAG_USE_ID_WAKEUP_IRQ |
+				AB8500_USB_FLAG_USE_VBUS_DET_IRQ;
+	}
+
+	/* Disable regulator voltage setting for AB8500 <= v2.0 */
+	if (is_ab8500_2p0_or_earlier(ab->ab8500))
+		ab->flags &= ~AB8500_USB_FLAG_REGULATOR_SET_VOLTAGE;
+
 	platform_set_drvdata(pdev, ab);
 
 	ATOMIC_INIT_NOTIFIER_HEAD(&ab->phy.notifier);
@@ -779,10 +1420,18 @@
 	/* all: Disable phy when called from set_host and set_peripheral */
 	INIT_WORK(&ab->phy_dis_work, ab8500_usb_phy_disable_work);
 
+	INIT_WORK(&ab->vbus_event_work, ab8500_usb_vbus_turn_on_event_work);
+
 	err = ab8500_usb_regulator_get(ab);
 	if (err)
 		return err;
 
+	ab->sysclk = devm_clk_get(ab->dev, "sysclk");
+	if (IS_ERR(ab->sysclk)) {
+		dev_err(ab->dev, "Could not get sysclk.\n");
+		return PTR_ERR(ab->sysclk);
+	}
+
 	err = ab8500_usb_irq_setup(pdev, ab);
 	if (err < 0)
 		return err;
@@ -793,85 +1442,33 @@
 		return err;
 	}
 
-	/* Phy tuning values for AB8500 */
-	if (!is_ab8500_2p0_or_earlier(ab->ab8500)) {
-		/* Enable the PBT/Bank 0x12 access */
-		err = abx500_set_register_interruptible(ab->dev,
-				AB8500_DEVELOPMENT, AB8500_BANK12_ACCESS, 0x01);
-		if (err < 0)
-			dev_err(ab->dev, "Failed to enable bank12 access err=%d\n",
-					err);
-
-		err = abx500_set_register_interruptible(ab->dev,
-				AB8500_DEBUG, AB8500_USB_PHY_TUNE1, 0xC8);
-		if (err < 0)
-			dev_err(ab->dev, "Failed to set PHY_TUNE1 register err=%d\n",
-					err);
-
-		err = abx500_set_register_interruptible(ab->dev,
-				AB8500_DEBUG, AB8500_USB_PHY_TUNE2, 0x00);
-		if (err < 0)
-			dev_err(ab->dev, "Failed to set PHY_TUNE2 register err=%d\n",
-					err);
-
-		err = abx500_set_register_interruptible(ab->dev,
-				AB8500_DEBUG, AB8500_USB_PHY_TUNE3, 0x78);
-		if (err < 0)
-			dev_err(ab->dev, "Failed to set PHY_TUNE3 regester err=%d\n",
-					err);
-
-		/* Switch to normal mode/disable Bank 0x12 access */
-		err = abx500_set_register_interruptible(ab->dev,
-				AB8500_DEVELOPMENT, AB8500_BANK12_ACCESS, 0x00);
-		if (err < 0)
-			dev_err(ab->dev, "Failed to switch bank12 access err=%d\n",
-					err);
-	}
-
-	/* Phy tuning values for AB8505 */
-	if (is_ab8505(ab->ab8500)) {
-		/* Enable the PBT/Bank 0x12 access */
-		err = abx500_mask_and_set_register_interruptible(ab->dev,
-				AB8500_DEVELOPMENT, AB8500_BANK12_ACCESS,
-				0x01, 0x01);
-		if (err < 0)
-			dev_err(ab->dev, "Failed to enable bank12 access err=%d\n",
-					err);
-
-		err = abx500_mask_and_set_register_interruptible(ab->dev,
-				AB8500_DEBUG, AB8500_USB_PHY_TUNE1,
-				0xC8, 0xC8);
-		if (err < 0)
-			dev_err(ab->dev, "Failed to set PHY_TUNE1 register err=%d\n",
-					err);
-
-		err = abx500_mask_and_set_register_interruptible(ab->dev,
-				AB8500_DEBUG, AB8500_USB_PHY_TUNE2,
-				0x60, 0x60);
-		if (err < 0)
-			dev_err(ab->dev, "Failed to set PHY_TUNE2 register err=%d\n",
-					err);
-
-		err = abx500_mask_and_set_register_interruptible(ab->dev,
-				AB8500_DEBUG, AB8500_USB_PHY_TUNE3,
-				0xFC, 0x80);
-
-		if (err < 0)
-			dev_err(ab->dev, "Failed to set PHY_TUNE3 regester err=%d\n",
-					err);
-
-		/* Switch to normal mode/disable Bank 0x12 access */
-		err = abx500_mask_and_set_register_interruptible(ab->dev,
-				AB8500_DEVELOPMENT, AB8500_BANK12_ACCESS,
-				0x00, 0x00);
-		if (err < 0)
-			dev_err(ab->dev, "Failed to switch bank12 access err=%d\n",
-					err);
-	}
+	if (is_ab8500(ab->ab8500) && !is_ab8500_2p0_or_earlier(ab->ab8500))
+		/* Phy tuning values for AB8500 > v2.0 */
+		ab8500_usb_set_ab8500_tuning_values(ab);
+	else if (is_ab8505(ab->ab8500))
+		/* Phy tuning values for AB8505 */
+		ab8500_usb_set_ab8505_tuning_values(ab);
+	else if (is_ab8540(ab->ab8500))
+		/* Phy tuning values for AB8540 */
+		ab8500_usb_set_ab8540_tuning_values(ab);
+	else if (is_ab9540(ab->ab8500))
+		/* Phy tuning values for AB9540 */
+		ab8500_usb_set_ab9540_tuning_values(ab);
 
 	/* Needed to enable ID detection. */
 	ab8500_usb_wd_workaround(ab);
 
+	/*
+	 * This is required for usb-link-status to work properly when a
+	 * cable is connected at boot time.
+	 */
+	ab8500_usb_restart_phy(ab);
+
+	if (ab->flags & AB8500_USB_FLAG_USE_CHECK_VBUS_STATUS) {
+		if (ab8500_usb_check_vbus_status(ab))
+			schedule_work(&ab->vbus_event_work);
+	}
+
 	abx500_usb_link_status_update(ab);
 
 	dev_info(&pdev->dev, "revision 0x%2x driver initialized\n", rev);
@@ -884,6 +1481,7 @@
 	struct ab8500_usb *ab = platform_get_drvdata(pdev);
 
 	cancel_work_sync(&ab->phy_dis_work);
+	cancel_work_sync(&ab->vbus_event_work);
 
 	usb_remove_phy(&ab->phy);
 
@@ -895,11 +1493,20 @@
 	return 0;
 }
 
+static struct platform_device_id ab8500_usb_devtype[] = {
+	{ .name = "ab8500-usb", },
+	{ .name = "ab8540-usb", },
+	{ .name = "ab9540-usb", },
+	{ /* sentinel */ }
+};
+MODULE_DEVICE_TABLE(platform, ab8500_usb_devtype);
+
 static struct platform_driver ab8500_usb_driver = {
 	.probe		= ab8500_usb_probe,
 	.remove		= ab8500_usb_remove,
+	.id_table	= ab8500_usb_devtype,
 	.driver		= {
-		.name	= "ab8500-usb",
+		.name	= "abx5x0-usb",
 		.owner	= THIS_MODULE,
 	},
 };
@@ -916,7 +1523,6 @@
 }
 module_exit(ab8500_usb_exit);
 
-MODULE_ALIAS("platform:ab8500_usb");
 MODULE_AUTHOR("ST-Ericsson AB");
-MODULE_DESCRIPTION("AB8500 usb transceiver driver");
+MODULE_DESCRIPTION("AB8500 family usb transceiver driver");
 MODULE_LICENSE("GPL");

diff --git a/drivers/usb/phy/phy-nop.c b/drivers/usb/phy/phy-nop.c
index 638cc5d..55445e5d 100644
--- a/drivers/usb/phy/phy-nop.c
+++ b/drivers/usb/phy/phy-nop.c

@@ -270,7 +270,7 @@
 	.driver		= {
 		.name	= "nop_usb_xceiv",
 		.owner	= THIS_MODULE,
-		.of_match_table = of_match_ptr(nop_xceiv_dt_ids),
+		.of_match_table = nop_xceiv_dt_ids,
 	},
 };
 

diff --git a/drivers/usb/phy/phy-omap-usb3.c b/drivers/usb/phy/phy-omap-usb3.c
index a6e60b1..efe6e14 100644
--- a/drivers/usb/phy/phy-omap-usb3.c
+++ b/drivers/usb/phy/phy-omap-usb3.c

@@ -27,7 +27,7 @@
 #include <linux/delay.h>
 #include <linux/usb/omap_control_usb.h>
 
-#define	NUM_SYS_CLKS		5
+#define	NUM_SYS_CLKS		6
 #define	PLL_STATUS		0x00000004
 #define	PLL_GO			0x00000008
 #define	PLL_CONFIGURATION1	0x0000000C
@@ -62,6 +62,7 @@
 	CLK_RATE_12MHZ,
 	CLK_RATE_16MHZ,
 	CLK_RATE_19MHZ,
+	CLK_RATE_20MHZ,
 	CLK_RATE_26MHZ,
 	CLK_RATE_38MHZ
 };
@@ -72,6 +73,8 @@
 	{1172, 8, 4, 20, 65537},	/* 19.2 MHz */
+	{1000, 7, 4, 10, 0},		/* 20 MHz */
 	{1250, 12, 4, 20, 0},		/* 26 MHz */
 	{3125, 47, 4, 20, 92843},	/* 38.4 MHz */
+
 };
 
 static int omap_usb3_suspend(struct usb_phy *x, int suspend)
@@ -122,6 +125,8 @@
 		return CLK_RATE_16MHZ;
 	case 19200000:
 		return CLK_RATE_19MHZ;
+	case 20000000:
+		return CLK_RATE_20MHZ;
 	case 26000000:
 		return CLK_RATE_26MHZ;
 	case 38400000:

diff --git a/drivers/usb/phy/phy-rcar-usb.c b/drivers/usb/phy/phy-rcar-usb.c
index a35681b..23c3dd3 100644
--- a/drivers/usb/phy/phy-rcar-usb.c
+++ b/drivers/usb/phy/phy-rcar-usb.c

@@ -161,7 +161,7 @@
 	 * CAUTION
 	 *
 	 * Because this phy address is also mapped under OHCI/EHCI address area,
-	 * this driver can't use devm_request_and_ioremap(dev, res) here
+	 * this driver can't use devm_ioremap_resource(dev, res) here
 	 */
 	reg0 = devm_ioremap_nocache(dev, res0->start, resource_size(res0));
 	reg1 = devm_ioremap_nocache(dev, res1->start, resource_size(res1));

diff --git a/drivers/usb/phy/phy-samsung-usb.c b/drivers/usb/phy/phy-samsung-usb.c
index 7b118ee5..ac025ca 100644
--- a/drivers/usb/phy/phy-samsung-usb.c
+++ b/drivers/usb/phy/phy-samsung-usb.c

@@ -73,7 +73,7 @@
  * Here 'on = true' would mean USB PHY block is isolated, hence
  * de-activated and vice-versa.
  */
-void samsung_usbphy_set_isolation(struct samsung_usbphy *sphy, bool on)
+void samsung_usbphy_set_isolation_4210(struct samsung_usbphy *sphy, bool on)
 {
 	void __iomem *reg = NULL;
 	u32 reg_val;
@@ -84,32 +84,12 @@
 		return;
 	}
 
-	switch (sphy->drv_data->cpu_type) {
-	case TYPE_S3C64XX:
-		/*
-		 * Do nothing: We will add here once S3C64xx goes for DT support
-		 */
-		break;
-	case TYPE_EXYNOS4210:
-		/*
-		 * Fall through since exynos4210 and exynos5250 have similar
-		 * register architecture: two separate registers for host and
-		 * device phy control with enable bit at position 0.
-		 */
-	case TYPE_EXYNOS5250:
-		if (sphy->phy_type == USB_PHY_TYPE_DEVICE) {
-			reg = sphy->pmuregs +
-				sphy->drv_data->devphy_reg_offset;
-			en_mask = sphy->drv_data->devphy_en_mask;
-		} else if (sphy->phy_type == USB_PHY_TYPE_HOST) {
-			reg = sphy->pmuregs +
-				sphy->drv_data->hostphy_reg_offset;
-			en_mask = sphy->drv_data->hostphy_en_mask;
-		}
-		break;
-	default:
-		dev_err(sphy->dev, "Invalid SoC type\n");
-		return;
+	if (sphy->phy_type == USB_PHY_TYPE_DEVICE) {
+		reg = sphy->pmuregs + sphy->drv_data->devphy_reg_offset;
+		en_mask = sphy->drv_data->devphy_en_mask;
+	} else if (sphy->phy_type == USB_PHY_TYPE_HOST) {
+		reg = sphy->pmuregs + sphy->drv_data->hostphy_reg_offset;
+		en_mask = sphy->drv_data->hostphy_en_mask;
 	}
 
 	reg_val = readl(reg);
@@ -120,8 +100,13 @@
 		reg_val |= en_mask;
 
 	writel(reg_val, reg);
+
+	if (sphy->drv_data->cpu_type == TYPE_EXYNOS4X12) {
+		writel(reg_val, sphy->pmuregs + EXYNOS4X12_PHY_HSIC_CTRL0);
+		writel(reg_val, sphy->pmuregs + EXYNOS4X12_PHY_HSIC_CTRL1);
+	}
 }
-EXPORT_SYMBOL_GPL(samsung_usbphy_set_isolation);
+EXPORT_SYMBOL_GPL(samsung_usbphy_set_isolation_4210);
 
 /*
  * Configure the mode of working of usb-phy here: HOST/DEVICE.
@@ -162,73 +147,93 @@
 }
 EXPORT_SYMBOL_GPL(samsung_usbphy_set_type);
 
+int samsung_usbphy_rate_to_clksel_64xx(struct samsung_usbphy *sphy,
+							unsigned long rate)
+{
+	unsigned int clksel;
+
+	switch (rate) {
+	case 12 * MHZ:
+		clksel = PHYCLK_CLKSEL_12M;
+		break;
+	case 24 * MHZ:
+		clksel = PHYCLK_CLKSEL_24M;
+		break;
+	case 48 * MHZ:
+		clksel = PHYCLK_CLKSEL_48M;
+		break;
+	default:
+		dev_err(sphy->dev,
+			"Invalid reference clock frequency: %lu\n", rate);
+		return -EINVAL;
+	}
+
+	return clksel;
+}
+EXPORT_SYMBOL_GPL(samsung_usbphy_rate_to_clksel_64xx);
+
+int samsung_usbphy_rate_to_clksel_4x12(struct samsung_usbphy *sphy,
+							unsigned long rate)
+{
+	unsigned int clksel;
+
+	switch (rate) {
+	case 9600 * KHZ:
+		clksel = FSEL_CLKSEL_9600K;
+		break;
+	case 10 * MHZ:
+		clksel = FSEL_CLKSEL_10M;
+		break;
+	case 12 * MHZ:
+		clksel = FSEL_CLKSEL_12M;
+		break;
+	case 19200 * KHZ:
+		clksel = FSEL_CLKSEL_19200K;
+		break;
+	case 20 * MHZ:
+		clksel = FSEL_CLKSEL_20M;
+		break;
+	case 24 * MHZ:
+		clksel = FSEL_CLKSEL_24M;
+		break;
+	case 50 * MHZ:
+		clksel = FSEL_CLKSEL_50M;
+		break;
+	default:
+		dev_err(sphy->dev,
+			"Invalid reference clock frequency: %lu\n", rate);
+		return -EINVAL;
+	}
+
+	return clksel;
+}
+EXPORT_SYMBOL_GPL(samsung_usbphy_rate_to_clksel_4x12);
+
 /*
  * Returns reference clock frequency selection value
  */
 int samsung_usbphy_get_refclk_freq(struct samsung_usbphy *sphy)
 {
 	struct clk *ref_clk;
-	int refclk_freq = 0;
+	unsigned long rate;
+	int refclk_freq;
 
 	/*
 	 * In exynos5250 USB host and device PHY use
 	 * external crystal clock XXTI
 	 */
 	if (sphy->drv_data->cpu_type == TYPE_EXYNOS5250)
-		ref_clk = devm_clk_get(sphy->dev, "ext_xtal");
+		ref_clk = clk_get(sphy->dev, "ext_xtal");
 	else
-		ref_clk = devm_clk_get(sphy->dev, "xusbxti");
+		ref_clk = clk_get(sphy->dev, "xusbxti");
 	if (IS_ERR(ref_clk)) {
 		dev_err(sphy->dev, "Failed to get reference clock\n");
 		return PTR_ERR(ref_clk);
 	}
 
-	if (sphy->drv_data->cpu_type == TYPE_EXYNOS5250) {
-		/* set clock frequency for PLL */
-		switch (clk_get_rate(ref_clk)) {
-		case 9600 * KHZ:
-			refclk_freq = FSEL_CLKSEL_9600K;
-			break;
-		case 10 * MHZ:
-			refclk_freq = FSEL_CLKSEL_10M;
-			break;
-		case 12 * MHZ:
-			refclk_freq = FSEL_CLKSEL_12M;
-			break;
-		case 19200 * KHZ:
-			refclk_freq = FSEL_CLKSEL_19200K;
-			break;
-		case 20 * MHZ:
-			refclk_freq = FSEL_CLKSEL_20M;
-			break;
-		case 50 * MHZ:
-			refclk_freq = FSEL_CLKSEL_50M;
-			break;
-		case 24 * MHZ:
-		default:
-			/* default reference clock */
-			refclk_freq = FSEL_CLKSEL_24M;
-			break;
-		}
-	} else {
-		switch (clk_get_rate(ref_clk)) {
-		case 12 * MHZ:
-			refclk_freq = PHYCLK_CLKSEL_12M;
-			break;
-		case 24 * MHZ:
-			refclk_freq = PHYCLK_CLKSEL_24M;
-			break;
-		case 48 * MHZ:
-			refclk_freq = PHYCLK_CLKSEL_48M;
-			break;
-		default:
-			if (sphy->drv_data->cpu_type == TYPE_S3C64XX)
-				refclk_freq = PHYCLK_CLKSEL_48M;
-			else
-				refclk_freq = PHYCLK_CLKSEL_24M;
-			break;
-		}
-	}
+	rate = clk_get_rate(ref_clk);
+	refclk_freq = sphy->drv_data->rate_to_clksel(sphy, rate);
+
 	clk_put(ref_clk);
 
 	return refclk_freq;

diff --git a/drivers/usb/phy/phy-samsung-usb.h b/drivers/usb/phy/phy-samsung-usb.h
index 70a9cae..68771bf 100644
--- a/drivers/usb/phy/phy-samsung-usb.h
+++ b/drivers/usb/phy/phy-samsung-usb.h

@@ -47,6 +47,16 @@
 #define RSTCON_HLINK_SWRST			(0x1 << 1)
 #define RSTCON_SWRST				(0x1 << 0)
 
+/* EXYNOS4X12 */
+#define EXYNOS4X12_PHY_HSIC_CTRL0		(0x04)
+#define EXYNOS4X12_PHY_HSIC_CTRL1		(0x08)
+
+#define PHYPWR_NORMAL_MASK_HSIC1		(0x7 << 12)
+#define PHYPWR_NORMAL_MASK_HSIC0		(0x7 << 9)
+#define PHYPWR_NORMAL_MASK_PHY1			(0x7 << 6)
+
+#define RSTCON_HOSTPHY_SWRST			(0xf << 3)
+
 /* EXYNOS5 */
 #define EXYNOS5_PHY_HOST_CTRL0			(0x00)
 
@@ -241,9 +251,12 @@
 enum samsung_cpu_type {
 	TYPE_S3C64XX,
 	TYPE_EXYNOS4210,
+	TYPE_EXYNOS4X12,
 	TYPE_EXYNOS5250,
 };
 
+struct samsung_usbphy;
+
 /*
  * struct samsung_usbphy_drvdata - driver data for various SoC variants
  * @cpu_type: machine identifier
@@ -268,6 +281,10 @@
 	int hostphy_en_mask;
 	u32 devphy_reg_offset;
 	u32 hostphy_reg_offset;
+	int (*rate_to_clksel)(struct samsung_usbphy *, unsigned long);
+	void (*set_isolation)(struct samsung_usbphy *, bool);
+	void (*phy_enable)(struct samsung_usbphy *);
+	void (*phy_disable)(struct samsung_usbphy *);
 };
 
 /*
@@ -320,8 +337,13 @@
 }
 
 extern int samsung_usbphy_parse_dt(struct samsung_usbphy *sphy);
-extern void samsung_usbphy_set_isolation(struct samsung_usbphy *sphy, bool on);
+extern void samsung_usbphy_set_isolation_4210(struct samsung_usbphy *sphy,
+								bool on);
 extern void samsung_usbphy_cfg_sel(struct samsung_usbphy *sphy);
 extern int samsung_usbphy_set_type(struct usb_phy *phy,
 					enum samsung_usb_phy_type phy_type);
 extern int samsung_usbphy_get_refclk_freq(struct samsung_usbphy *sphy);
+extern int samsung_usbphy_rate_to_clksel_64xx(struct samsung_usbphy *sphy,
+							unsigned long rate);
+extern int samsung_usbphy_rate_to_clksel_4x12(struct samsung_usbphy *sphy,
+							unsigned long rate);

diff --git a/drivers/usb/phy/phy-samsung-usb2.c b/drivers/usb/phy/phy-samsung-usb2.c
index 9d5e273..1011c16 100644
--- a/drivers/usb/phy/phy-samsung-usb2.c
+++ b/drivers/usb/phy/phy-samsung-usb2.c

@@ -176,6 +176,11 @@
 		phypwr &= ~PHYPWR_NORMAL_MASK;
 		rstcon |= RSTCON_SWRST;
 		break;
+	case TYPE_EXYNOS4X12:
+		phypwr &= ~(PHYPWR_NORMAL_MASK_HSIC0 |
+				PHYPWR_NORMAL_MASK_HSIC1 |
+				PHYPWR_NORMAL_MASK_PHY1);
+		rstcon |= RSTCON_HOSTPHY_SWRST;
 	case TYPE_EXYNOS4210:
 		phypwr &= ~PHYPWR_NORMAL_MASK_PHY0;
 		rstcon |= RSTCON_SWRST;
@@ -189,6 +194,8 @@
 	/* reset all ports of PHY and Link */
 	writel(rstcon, regs + SAMSUNG_RSTCON);
 	udelay(10);
+	if (sphy->drv_data->cpu_type == TYPE_EXYNOS4X12)
+		rstcon &= ~RSTCON_HOSTPHY_SWRST;
 	rstcon &= ~RSTCON_SWRST;
 	writel(rstcon, regs + SAMSUNG_RSTCON);
 }
@@ -239,6 +246,10 @@
 	case TYPE_S3C64XX:
 		phypwr |= PHYPWR_NORMAL_MASK;
 		break;
+	case TYPE_EXYNOS4X12:
+		phypwr |= (PHYPWR_NORMAL_MASK_HSIC0 |
+				PHYPWR_NORMAL_MASK_HSIC1 |
+				PHYPWR_NORMAL_MASK_PHY1);
 	case TYPE_EXYNOS4210:
 		phypwr |= PHYPWR_NORMAL_MASK_PHY0;
 	default:
@@ -284,17 +295,14 @@
 	/* Disable phy isolation */
 	if (sphy->plat && sphy->plat->pmu_isolation)
 		sphy->plat->pmu_isolation(false);
-	else
-		samsung_usbphy_set_isolation(sphy, false);
+	else if (sphy->drv_data->set_isolation)
+		sphy->drv_data->set_isolation(sphy, false);
 
 	/* Selecting Host/OTG mode; After reset USB2.0PHY_CFG: HOST */
 	samsung_usbphy_cfg_sel(sphy);
 
 	/* Initialize usb phy registers */
-	if (sphy->drv_data->cpu_type == TYPE_EXYNOS5250)
-		samsung_exynos5_usb2phy_enable(sphy);
-	else
-		samsung_usb2phy_enable(sphy);
+	sphy->drv_data->phy_enable(sphy);
 
 	spin_unlock_irqrestore(&sphy->lock, flags);
 
@@ -334,16 +342,13 @@
 	}
 
 	/* De-initialize usb phy registers */
-	if (sphy->drv_data->cpu_type == TYPE_EXYNOS5250)
-		samsung_exynos5_usb2phy_disable(sphy);
-	else
-		samsung_usb2phy_disable(sphy);
+	sphy->drv_data->phy_disable(sphy);
 
 	/* Enable phy isolation */
 	if (sphy->plat && sphy->plat->pmu_isolation)
 		sphy->plat->pmu_isolation(true);
-	else
-		samsung_usbphy_set_isolation(sphy, true);
+	else if (sphy->drv_data->set_isolation)
+		sphy->drv_data->set_isolation(sphy, true);
 
 	spin_unlock_irqrestore(&sphy->lock, flags);
 
@@ -408,7 +413,10 @@
 	sphy->phy.label		= "samsung-usb2phy";
 	sphy->phy.init		= samsung_usb2phy_init;
 	sphy->phy.shutdown	= samsung_usb2phy_shutdown;
-	sphy->ref_clk_freq	= samsung_usbphy_get_refclk_freq(sphy);
+
+	sphy->ref_clk_freq = samsung_usbphy_get_refclk_freq(sphy);
+	if (sphy->ref_clk_freq < 0)
+		return -EINVAL;
 
 	sphy->phy.otg		= otg;
 	sphy->phy.otg->phy	= &sphy->phy;
@@ -438,18 +446,40 @@
 static const struct samsung_usbphy_drvdata usb2phy_s3c64xx = {
 	.cpu_type		= TYPE_S3C64XX,
 	.devphy_en_mask		= S3C64XX_USBPHY_ENABLE,
+	.rate_to_clksel		= samsung_usbphy_rate_to_clksel_64xx,
+	.set_isolation		= NULL, /* TODO */
+	.phy_enable		= samsung_usb2phy_enable,
+	.phy_disable		= samsung_usb2phy_disable,
 };
 
 static const struct samsung_usbphy_drvdata usb2phy_exynos4 = {
 	.cpu_type		= TYPE_EXYNOS4210,
 	.devphy_en_mask		= EXYNOS_USBPHY_ENABLE,
 	.hostphy_en_mask	= EXYNOS_USBPHY_ENABLE,
+	.rate_to_clksel		= samsung_usbphy_rate_to_clksel_64xx,
+	.set_isolation		= samsung_usbphy_set_isolation_4210,
+	.phy_enable		= samsung_usb2phy_enable,
+	.phy_disable		= samsung_usb2phy_disable,
+};
+
+static const struct samsung_usbphy_drvdata usb2phy_exynos4x12 = {
+	.cpu_type		= TYPE_EXYNOS4X12,
+	.devphy_en_mask		= EXYNOS_USBPHY_ENABLE,
+	.hostphy_en_mask	= EXYNOS_USBPHY_ENABLE,
+	.rate_to_clksel		= samsung_usbphy_rate_to_clksel_4x12,
+	.set_isolation		= samsung_usbphy_set_isolation_4210,
+	.phy_enable		= samsung_usb2phy_enable,
+	.phy_disable		= samsung_usb2phy_disable,
 };
 
 static struct samsung_usbphy_drvdata usb2phy_exynos5 = {
 	.cpu_type		= TYPE_EXYNOS5250,
 	.hostphy_en_mask	= EXYNOS_USBPHY_ENABLE,
 	.hostphy_reg_offset	= EXYNOS_USBHOST_PHY_CTRL_OFFSET,
+	.rate_to_clksel		= samsung_usbphy_rate_to_clksel_4x12,
+	.set_isolation		= samsung_usbphy_set_isolation_4210,
+	.phy_enable		= samsung_exynos5_usb2phy_enable,
+	.phy_disable		= samsung_exynos5_usb2phy_disable,
 };
 
 #ifdef CONFIG_OF
@@ -461,6 +491,9 @@
 		.compatible = "samsung,exynos4210-usb2phy",
 		.data = &usb2phy_exynos4,
 	}, {
+		.compatible = "samsung,exynos4x12-usb2phy",
+		.data = &usb2phy_exynos4x12,
+	}, {
 		.compatible = "samsung,exynos5250-usb2phy",
 		.data = &usb2phy_exynos5
 	},
@@ -477,6 +510,9 @@
 		.name		= "exynos4210-usb2phy",
 		.driver_data	= (unsigned long)&usb2phy_exynos4,
 	}, {
+		.name		= "exynos4x12-usb2phy",
+		.driver_data	= (unsigned long)&usb2phy_exynos4x12,
+	}, {
 		.name		= "exynos5250-usb2phy",
 		.driver_data	= (unsigned long)&usb2phy_exynos5,
 	},

diff --git a/drivers/usb/phy/phy-samsung-usb3.c b/drivers/usb/phy/phy-samsung-usb3.c
index 5a9efcb..300e0cf 100644
--- a/drivers/usb/phy/phy-samsung-usb3.c
+++ b/drivers/usb/phy/phy-samsung-usb3.c

@@ -65,7 +65,7 @@
 	return reg;
 }
 
-static int samsung_exynos5_usb3phy_enable(struct samsung_usbphy *sphy)
+static void samsung_exynos5_usb3phy_enable(struct samsung_usbphy *sphy)
 {
 	void __iomem *regs = sphy->regs;
 	u32 phyparam0;
@@ -133,8 +133,6 @@
 
 	phyclkrst &= ~(PHYCLKRST_PORTRESET);
 	writel(phyclkrst, regs + EXYNOS5_DRD_PHYCLKRST);
-
-	return 0;
 }
 
 static void samsung_exynos5_usb3phy_disable(struct samsung_usbphy *sphy)
@@ -184,10 +182,11 @@
 	samsung_usbphy_set_type(&sphy->phy, USB_PHY_TYPE_DEVICE);
 
 	/* Disable phy isolation */
-	samsung_usbphy_set_isolation(sphy, false);
+	if (sphy->drv_data->set_isolation)
+		sphy->drv_data->set_isolation(sphy, false);
 
 	/* Initialize usb phy registers */
-	samsung_exynos5_usb3phy_enable(sphy);
+	sphy->drv_data->phy_enable(sphy);
 
 	spin_unlock_irqrestore(&sphy->lock, flags);
 
@@ -218,10 +217,11 @@
 	samsung_usbphy_set_type(&sphy->phy, USB_PHY_TYPE_DEVICE);
 
 	/* De-initialize usb phy registers */
-	samsung_exynos5_usb3phy_disable(sphy);
+	sphy->drv_data->phy_disable(sphy);
 
 	/* Enable phy isolation */
-	samsung_usbphy_set_isolation(sphy, true);
+	if (sphy->drv_data->set_isolation)
+		sphy->drv_data->set_isolation(sphy, true);
 
 	spin_unlock_irqrestore(&sphy->lock, flags);
 
@@ -274,7 +274,10 @@
 	sphy->phy.init		= samsung_usb3phy_init;
 	sphy->phy.shutdown	= samsung_usb3phy_shutdown;
 	sphy->drv_data		= samsung_usbphy_get_driver_data(pdev);
-	sphy->ref_clk_freq	= samsung_usbphy_get_refclk_freq(sphy);
+
+	sphy->ref_clk_freq = samsung_usbphy_get_refclk_freq(sphy);
+	if (sphy->ref_clk_freq < 0)
+		return -EINVAL;
 
 	spin_lock_init(&sphy->lock);
 
@@ -300,6 +303,10 @@
 static struct samsung_usbphy_drvdata usb3phy_exynos5 = {
 	.cpu_type		= TYPE_EXYNOS5250,
 	.devphy_en_mask		= EXYNOS_USBPHY_ENABLE,
+	.rate_to_clksel		= samsung_usbphy_rate_to_clksel_4x12,
+	.set_isolation		= samsung_usbphy_set_isolation_4210,
+	.phy_enable		= samsung_exynos5_usb3phy_enable,
+	.phy_disable		= samsung_exynos5_usb3phy_disable,
 };
 
 #ifdef CONFIG_OF

diff --git a/drivers/usb/phy/phy-tegra-usb.c b/drivers/usb/phy/phy-tegra-usb.c
index 17d8112..cec0855 100644
--- a/drivers/usb/phy/phy-tegra-usb.c
+++ b/drivers/usb/phy/phy-tegra-usb.c

@@ -1,9 +1,11 @@
 /*
  * Copyright (C) 2010 Google, Inc.
+ * Copyright (C) 2013 NVIDIA Corporation
  *
  * Author:
  *	Erik Gilling <konkers@google.com>
  *	Benoit Goby <benoit@android.com>
+ *	Venu Byravarasu <vbyravarasu@nvidia.com>
  *
  * This software is licensed under the terms of the GNU General Public
  * License version 2, as published by the Free Software Foundation, and
@@ -21,6 +23,7 @@
 #include <linux/slab.h>
 #include <linux/err.h>
 #include <linux/export.h>
+#include <linux/module.h>
 #include <linux/platform_device.h>
 #include <linux/io.h>
 #include <linux/gpio.h>
@@ -29,13 +32,19 @@
 #include <linux/usb/otg.h>
 #include <linux/usb/ulpi.h>
 #include <asm/mach-types.h>
+#include <linux/usb/ehci_def.h>
 #include <linux/usb/tegra_usb_phy.h>
 
-#define TEGRA_USB_BASE		0xC5000000
-#define TEGRA_USB_SIZE		SZ_16K
-
 #define ULPI_VIEWPORT		0x170
 
+/* PORTSC registers */
+#define TEGRA_USB_PORTSC1				0x184
+#define TEGRA_USB_PORTSC1_PTS(x)			(((x) & 0x3) << 30)
+#define TEGRA_USB_PORTSC1_PHCD				(1 << 23)
+
+/* Bits of PORTSC1, which will get cleared by writing 1 into them */
+#define TEGRA_PORTSC1_RWC_BITS	(PORT_CSC | PORT_PEC | PORT_OCC)
+
 #define USB_SUSP_CTRL		0x400
 #define   USB_WAKE_ON_CNNT_EN_DEV	(1 << 3)
 #define   USB_WAKE_ON_DISCON_EN_DEV	(1 << 4)
@@ -196,34 +205,41 @@
 	},
 };
 
+static void set_pts(struct tegra_usb_phy *phy, u8 pts_val)
+{
+	void __iomem *base = phy->regs;
+	unsigned long val;
+
+	val = readl(base + TEGRA_USB_PORTSC1) & ~TEGRA_PORTSC1_RWC_BITS;
+	val &= ~TEGRA_USB_PORTSC1_PTS(3);
+	val |= TEGRA_USB_PORTSC1_PTS(pts_val & 3);
+	writel(val, base + TEGRA_USB_PORTSC1);
+}
+
+static void set_phcd(struct tegra_usb_phy *phy, bool enable)
+{
+	void __iomem *base = phy->regs;
+	unsigned long val;
+
+	val = readl(base + TEGRA_USB_PORTSC1) & ~TEGRA_PORTSC1_RWC_BITS;
+	if (enable)
+		val |= TEGRA_USB_PORTSC1_PHCD;
+	else
+		val &= ~TEGRA_USB_PORTSC1_PHCD;
+	writel(val, base + TEGRA_USB_PORTSC1);
+}
+
 static int utmip_pad_open(struct tegra_usb_phy *phy)
 {
-	phy->pad_clk = clk_get_sys("utmip-pad", NULL);
+	phy->pad_clk = devm_clk_get(phy->dev, "utmi-pads");
 	if (IS_ERR(phy->pad_clk)) {
 		pr_err("%s: can't get utmip pad clock\n", __func__);
 		return PTR_ERR(phy->pad_clk);
 	}
 
-	if (phy->is_legacy_phy) {
-		phy->pad_regs = phy->regs;
-	} else {
-		phy->pad_regs = ioremap(TEGRA_USB_BASE, TEGRA_USB_SIZE);
-		if (!phy->pad_regs) {
-			pr_err("%s: can't remap usb registers\n", __func__);
-			clk_put(phy->pad_clk);
-			return -ENOMEM;
-		}
-	}
 	return 0;
 }
 
-static void utmip_pad_close(struct tegra_usb_phy *phy)
-{
-	if (!phy->is_legacy_phy)
-		iounmap(phy->pad_regs);
-	clk_put(phy->pad_clk);
-}
-
 static void utmip_pad_power_on(struct tegra_usb_phy *phy)
 {
 	unsigned long val, flags;
@@ -299,7 +315,7 @@
 		val &= ~USB_SUSP_SET;
 		writel(val, base + USB_SUSP_CTRL);
 	} else
-		phy->set_phcd(&phy->u_phy, true);
+		set_phcd(phy, true);
 
 	if (utmi_wait_register(base + USB_SUSP_CTRL, USB_PHY_CLK_VALID, 0) < 0)
 		pr_err("%s: timeout waiting for phy to stabilize\n", __func__);
@@ -321,7 +337,7 @@
 		val &= ~USB_SUSP_CLR;
 		writel(val, base + USB_SUSP_CTRL);
 	} else
-		phy->set_phcd(&phy->u_phy, false);
+		set_phcd(phy, false);
 
 	if (utmi_wait_register(base + USB_SUSP_CTRL, USB_PHY_CLK_VALID,
 						     USB_PHY_CLK_VALID))
@@ -444,7 +460,7 @@
 	utmi_phy_clk_enable(phy);
 
 	if (!phy->is_legacy_phy)
-		phy->set_pts(&phy->u_phy, 0);
+		set_pts(phy, 0);
 
 	return 0;
 }
@@ -541,11 +557,18 @@
 	int ret;
 	unsigned long val;
 	void __iomem *base = phy->regs;
-	struct tegra_ulpi_config *config = phy->config;
 
-	gpio_direction_output(config->reset_gpio, 0);
+	ret = gpio_direction_output(phy->reset_gpio, 0);
+	if (ret < 0) {
+		dev_err(phy->dev, "gpio %d not set to 0\n", phy->reset_gpio);
+		return ret;
+	}
 	msleep(5);
-	gpio_direction_output(config->reset_gpio, 1);
+	ret = gpio_direction_output(phy->reset_gpio, 1);
+	if (ret < 0) {
+		dev_err(phy->dev, "gpio %d not set to 1\n", phy->reset_gpio);
+		return ret;
+	}
 
 	clk_prepare_enable(phy->clk);
 	msleep(1);
@@ -603,63 +626,15 @@
 
 static int ulpi_phy_power_off(struct tegra_usb_phy *phy)
 {
-	struct tegra_ulpi_config *config = phy->config;
-
 	clk_disable(phy->clk);
-	return gpio_direction_output(config->reset_gpio, 0);
-}
-
-static int	tegra_phy_init(struct usb_phy *x)
-{
-	struct tegra_usb_phy *phy = container_of(x, struct tegra_usb_phy, u_phy);
-	struct tegra_ulpi_config *ulpi_config;
-	int err;
-
-	if (phy->is_ulpi_phy) {
-		ulpi_config = phy->config;
-		phy->clk = clk_get_sys(NULL, ulpi_config->clk);
-		if (IS_ERR(phy->clk)) {
-			pr_err("%s: can't get ulpi clock\n", __func__);
-			err = -ENXIO;
-			goto err1;
-		}
-		if (!gpio_is_valid(ulpi_config->reset_gpio))
-			ulpi_config->reset_gpio =
-				of_get_named_gpio(phy->dev->of_node,
-						  "nvidia,phy-reset-gpio", 0);
-		if (!gpio_is_valid(ulpi_config->reset_gpio)) {
-			pr_err("%s: invalid reset gpio: %d\n", __func__,
-			       ulpi_config->reset_gpio);
-			err = -EINVAL;
-			goto err1;
-		}
-		gpio_request(ulpi_config->reset_gpio, "ulpi_phy_reset_b");
-		gpio_direction_output(ulpi_config->reset_gpio, 0);
-		phy->ulpi = otg_ulpi_create(&ulpi_viewport_access_ops, 0);
-		phy->ulpi->io_priv = phy->regs + ULPI_VIEWPORT;
-	} else {
-		err = utmip_pad_open(phy);
-		if (err < 0)
-			goto err1;
-	}
-	return 0;
-err1:
-	clk_disable_unprepare(phy->pll_u);
-	clk_put(phy->pll_u);
-	return err;
+	return gpio_direction_output(phy->reset_gpio, 0);
 }
 
 static void tegra_usb_phy_close(struct usb_phy *x)
 {
 	struct tegra_usb_phy *phy = container_of(x, struct tegra_usb_phy, u_phy);
 
-	if (phy->is_ulpi_phy)
-		clk_put(phy->clk);
-	else
-		utmip_pad_close(phy);
 	clk_disable_unprepare(phy->pll_u);
-	clk_put(phy->pll_u);
-	kfree(phy);
 }
 
 static int tegra_usb_phy_power_on(struct tegra_usb_phy *phy)
@@ -687,54 +662,63 @@
 		return tegra_usb_phy_power_on(phy);
 }
 
-struct tegra_usb_phy *tegra_usb_phy_open(struct device *dev, int instance,
-	void __iomem *regs, void *config, enum tegra_usb_phy_mode phy_mode,
-	void (*set_pts)(struct usb_phy *x, u8 pts_val),
-	void (*set_phcd)(struct usb_phy *x, bool enable))
-
+static int ulpi_open(struct tegra_usb_phy *phy)
 {
-	struct tegra_usb_phy *phy;
+	int err;
+
+	phy->clk = devm_clk_get(phy->dev, "ulpi-link");
+	if (IS_ERR(phy->clk)) {
+		pr_err("%s: can't get ulpi clock\n", __func__);
+		return PTR_ERR(phy->clk);
+	}
+
+	err = devm_gpio_request(phy->dev, phy->reset_gpio, "ulpi_phy_reset_b");
+	if (err < 0) {
+		dev_err(phy->dev, "request failed for gpio: %d\n",
+		       phy->reset_gpio);
+		return err;
+	}
+
+	err = gpio_direction_output(phy->reset_gpio, 0);
+	if (err < 0) {
+		dev_err(phy->dev, "gpio %d direction not set to output\n",
+		       phy->reset_gpio);
+		return err;
+	}
+
+	phy->ulpi = otg_ulpi_create(&ulpi_viewport_access_ops, 0);
+	if (!phy->ulpi) {
+		dev_err(phy->dev, "otg_ulpi_create returned NULL\n");
+		err = -ENOMEM;
+		return err;
+	}
+
+	phy->ulpi->io_priv = phy->regs + ULPI_VIEWPORT;
+	return 0;
+}
+
+static int tegra_usb_phy_init(struct tegra_usb_phy *phy)
+{
 	unsigned long parent_rate;
 	int i;
 	int err;
-	struct device_node *np = dev->of_node;
 
-	phy = kzalloc(sizeof(struct tegra_usb_phy), GFP_KERNEL);
-	if (!phy)
-		return ERR_PTR(-ENOMEM);
-
-	phy->instance = instance;
-	phy->regs = regs;
-	phy->config = config;
-	phy->mode = phy_mode;
-	phy->dev = dev;
-	phy->is_legacy_phy =
-		of_property_read_bool(np, "nvidia,has-legacy-mode");
-	phy->set_pts = set_pts;
-	phy->set_phcd = set_phcd;
-	err = of_property_match_string(np, "phy_type", "ulpi");
-	if (err < 0)
-		phy->is_ulpi_phy = false;
-	else
-		phy->is_ulpi_phy = true;
-
-	if (!phy->config) {
-		if (phy->is_ulpi_phy) {
-			pr_err("%s: ulpi phy configuration missing", __func__);
-			err = -EINVAL;
-			goto err0;
-		} else {
-			phy->config = &utmip_default[instance];
-		}
+	if (!phy->is_ulpi_phy) {
+		if (phy->is_legacy_phy)
+			phy->config = &utmip_default[0];
+		else
+			phy->config = &utmip_default[2];
 	}
 
-	phy->pll_u = clk_get_sys(NULL, "pll_u");
+	phy->pll_u = devm_clk_get(phy->dev, "pll_u");
 	if (IS_ERR(phy->pll_u)) {
 		pr_err("Can't get pll_u clock\n");
-		err = PTR_ERR(phy->pll_u);
-		goto err0;
+		return PTR_ERR(phy->pll_u);
 	}
-	clk_prepare_enable(phy->pll_u);
+
+	err = clk_prepare_enable(phy->pll_u);
+	if (err)
+		return err;
 
 	parent_rate = clk_get_rate(clk_get_parent(phy->pll_u));
 	for (i = 0; i < ARRAY_SIZE(tegra_freq_table); i++) {
@@ -746,23 +730,22 @@
 	if (!phy->freq) {
 		pr_err("invalid pll_u parent rate %ld\n", parent_rate);
 		err = -EINVAL;
-		goto err1;
+		goto fail;
 	}
 
-	phy->u_phy.init = tegra_phy_init;
-	phy->u_phy.shutdown = tegra_usb_phy_close;
-	phy->u_phy.set_suspend = tegra_usb_phy_suspend;
+	if (phy->is_ulpi_phy)
+		err = ulpi_open(phy);
+	else
+		err = utmip_pad_open(phy);
+	if (err < 0)
+		goto fail;
 
-	return phy;
+	return 0;
 
-err1:
+fail:
 	clk_disable_unprepare(phy->pll_u);
-	clk_put(phy->pll_u);
-err0:
-	kfree(phy);
-	return ERR_PTR(err);
+	return err;
 }
-EXPORT_SYMBOL_GPL(tegra_usb_phy_open);
 
 void tegra_usb_phy_preresume(struct usb_phy *x)
 {
@@ -801,3 +784,131 @@
 }
 EXPORT_SYMBOL_GPL(tegra_ehci_phy_restore_end);
 
+/*
+ * Probe a Tegra USB PHY described in the device tree: map the PHY
+ * registers (plus the UTMI pad registers for non-ULPI PHYs), read the
+ * PHY type, ULPI reset GPIO and dr_mode properties, then hand over to
+ * tegra_usb_phy_init().  Returns 0 on success, an error code otherwise.
+ */
+static int tegra_usb_phy_probe(struct platform_device *pdev)
+{
+	struct resource *res;
+	struct tegra_usb_phy *tegra_phy = NULL;
+	struct device_node *np = pdev->dev.of_node;
+	int err;
+
+	tegra_phy = devm_kzalloc(&pdev->dev, sizeof(*tegra_phy), GFP_KERNEL);
+	if (!tegra_phy) {
+		dev_err(&pdev->dev, "unable to allocate memory for USB2 PHY\n");
+		return -ENOMEM;
+	}
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (!res) {
+		dev_err(&pdev->dev, "Failed to get I/O memory\n");
+		return -ENXIO;
+	}
+
+	tegra_phy->regs = devm_ioremap(&pdev->dev, res->start,
+		resource_size(res));
+	if (!tegra_phy->regs) {
+		dev_err(&pdev->dev, "Failed to remap I/O memory\n");
+		return -ENOMEM;
+	}
+
+	tegra_phy->is_legacy_phy =
+		of_property_read_bool(np, "nvidia,has-legacy-mode");
+
+	err = of_property_match_string(np, "phy_type", "ulpi");
+	if (err < 0) {
+		tegra_phy->is_ulpi_phy = false;
+
+		res = platform_get_resource(pdev, IORESOURCE_MEM, 1);
+		if (!res) {
+			dev_err(&pdev->dev, "Failed to get UTMI Pad regs\n");
+			return -ENXIO;
+		}
+
+		tegra_phy->pad_regs = devm_ioremap(&pdev->dev, res->start,
+			resource_size(res));
+		/* Validate the pad mapping just created, not the PHY one. */
+		if (!tegra_phy->pad_regs) {
+			dev_err(&pdev->dev, "Failed to remap UTMI Pad regs\n");
+			return -ENOMEM;
+		}
+	} else {
+		tegra_phy->is_ulpi_phy = true;
+
+		tegra_phy->reset_gpio =
+			of_get_named_gpio(np, "nvidia,phy-reset-gpio", 0);
+		if (!gpio_is_valid(tegra_phy->reset_gpio)) {
+			dev_err(&pdev->dev, "invalid gpio: %d\n",
+				tegra_phy->reset_gpio);
+			return tegra_phy->reset_gpio;
+		}
+	}
+
+	err = of_property_match_string(np, "dr_mode", "otg");
+	if (err < 0) {
+		err = of_property_match_string(np, "dr_mode", "peripheral");
+		if (err < 0)
+			tegra_phy->mode = TEGRA_USB_PHY_MODE_HOST;
+		else
+			tegra_phy->mode = TEGRA_USB_PHY_MODE_DEVICE;
+	} else
+		tegra_phy->mode = TEGRA_USB_PHY_MODE_OTG;
+
+	tegra_phy->dev = &pdev->dev;
+	err = tegra_usb_phy_init(tegra_phy);
+	if (err < 0)
+		return err;
+
+	tegra_phy->u_phy.shutdown = tegra_usb_phy_close;
+	tegra_phy->u_phy.set_suspend = tegra_usb_phy_suspend;
+
+	dev_set_drvdata(&pdev->dev, tegra_phy);
+	return 0;
+}
+
+static const struct of_device_id tegra_usb_phy_id_table[] = {
+	{ .compatible = "nvidia,tegra20-usb-phy", },
+	{ /* sentinel */ },
+};
+MODULE_DEVICE_TABLE(of, tegra_usb_phy_id_table);
+
+static struct platform_driver tegra_usb_phy_driver = {
+	.probe		= tegra_usb_phy_probe,
+	.driver		= {
+		.name	= "tegra-phy",
+		.owner	= THIS_MODULE,
+		.of_match_table = of_match_ptr(tegra_usb_phy_id_table),
+	},
+};
+module_platform_driver(tegra_usb_phy_driver);
+
+static int tegra_usb_phy_match(struct device *dev, void *data)
+{
+	struct tegra_usb_phy *tegra_phy = dev_get_drvdata(dev);
+	struct device_node *dn = data;
+
+	return (tegra_phy->dev->of_node == dn) ? 1 : 0;
+}
+
+struct usb_phy *tegra_usb_get_phy(struct device_node *dn)
+{
+	struct device *dev;
+	struct tegra_usb_phy *tegra_phy;
+
+	dev = driver_find_device(&tegra_usb_phy_driver.driver, NULL, dn,
+				 tegra_usb_phy_match);
+	if (!dev)
+		return ERR_PTR(-EPROBE_DEFER);
+
+	tegra_phy = dev_get_drvdata(dev);
+
+	return &tegra_phy->u_phy;
+}
+EXPORT_SYMBOL_GPL(tegra_usb_get_phy);
+
+MODULE_DESCRIPTION("Tegra USB PHY driver");
+MODULE_LICENSE("GPL v2");

diff --git a/drivers/usb/phy/phy-ulpi-viewport.c b/drivers/usb/phy/phy-ulpi-viewport.c
index c5ba7e5..7c22a539 100644
--- a/drivers/usb/phy/phy-ulpi-viewport.c
+++ b/drivers/usb/phy/phy-ulpi-viewport.c

@@ -12,6 +12,7 @@
  *
  */
 
+#include <linux/export.h>
 #include <linux/kernel.h>
 #include <linux/usb.h>
 #include <linux/io.h>
@@ -78,3 +79,4 @@
 	.read	= ulpi_viewport_read,
 	.write	= ulpi_viewport_write,
 };
+EXPORT_SYMBOL_GPL(ulpi_viewport_access_ops);

diff --git a/drivers/usb/serial/Kconfig b/drivers/usb/serial/Kconfig
index 1d55762..8c3a42ea 100644
--- a/drivers/usb/serial/Kconfig
+++ b/drivers/usb/serial/Kconfig

@@ -710,6 +710,16 @@
 	  To compile this driver as a module, choose M here: the
 	  module will be called quatech-serial.
 
+config USB_SERIAL_FLASHLOADER
+	tristate "Infineon Modem Flashloader USB interface driver"
+	help
+	  Say Y here if you want to download firmware to an Infineon
+	  modem via the USB Flashloader serial driver.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called flashloader.
+
+
 config USB_SERIAL_DEBUG
 	tristate "USB Debugging Device"
 	help

diff --git a/drivers/usb/serial/Makefile b/drivers/usb/serial/Makefile
index cec63fa..f7130114 100644
--- a/drivers/usb/serial/Makefile
+++ b/drivers/usb/serial/Makefile

@@ -65,3 +65,4 @@
 obj-$(CONFIG_USB_SERIAL_XSENS_MT)		+= xsens_mt.o
 obj-$(CONFIG_USB_SERIAL_ZIO)			+= zio.o
 obj-$(CONFIG_USB_SERIAL_ZTE)			+= zte_ev.o
+obj-$(CONFIG_USB_SERIAL_FLASHLOADER)		+= flashloader.o

diff --git a/drivers/usb/serial/ark3116.c b/drivers/usb/serial/ark3116.c
index 40e7fd94..bc77e95 100644
--- a/drivers/usb/serial/ark3116.c
+++ b/drivers/usb/serial/ark3116.c

@@ -413,8 +413,8 @@
 		/* XXX: Some of these values are probably wrong. */
 		memset(&serstruct, 0, sizeof(serstruct));
 		serstruct.type = PORT_16654;
-		serstruct.line = port->serial->minor;
-		serstruct.port = port->number;
+		serstruct.line = port->minor;
+		serstruct.port = port->port_number;
 		serstruct.custom_divisor = 0;
 		serstruct.baud_base = 460800;
 

diff --git a/drivers/usb/serial/bus.c b/drivers/usb/serial/bus.c
index 3c4db6d..f053b30 100644
--- a/drivers/usb/serial/bus.c
+++ b/drivers/usb/serial/bus.c

@@ -43,7 +43,7 @@
 {
 	struct usb_serial_port *port = to_usb_serial_port(dev);
 
-	return sprintf(buf, "%d\n", port->number - port->serial->minor);
+	return sprintf(buf, "%d\n", port->port_number);
 }
 
 static DEVICE_ATTR(port_number, S_IRUGO, show_port_number, NULL);
@@ -80,7 +80,7 @@
 		goto exit_with_autopm;
 	}
 
-	minor = port->number;
+	minor = port->minor;
 	tty_register_device(usb_serial_tty_driver, minor, dev);
 	dev_info(&port->serial->dev->dev,
 		 "%s converter now attached to ttyUSB%d\n",
@@ -106,7 +106,7 @@
 	/* make sure suspend/resume doesn't race against port_remove */
 	usb_autopm_get_interface(port->serial->interface);
 
-	minor = port->number;
+	minor = port->minor;
 	tty_unregister_device(usb_serial_tty_driver, minor);
 
 	device_remove_file(&port->dev, &dev_attr_port_number);

diff --git a/drivers/usb/serial/console.c b/drivers/usb/serial/console.c
index 5f3bcd3..afb50ea 100644
--- a/drivers/usb/serial/console.c
+++ b/drivers/usb/serial/console.c

@@ -108,18 +108,18 @@
 	 * no need to check the index here: if the index is wrong, console
 	 * code won't call us
 	 */
-	serial = usb_serial_get_by_index(co->index);
-	if (serial == NULL) {
+	port = usb_serial_port_get_by_minor(co->index);
+	if (port == NULL) {
 		/* no device is connected yet, sorry :( */
 		pr_err("No USB device connected to ttyUSB%i\n", co->index);
 		return -ENODEV;
 	}
+	serial = port->serial;
 
 	retval = usb_autopm_get_interface(serial->interface);
 	if (retval)
 		goto error_get_interface;
 
-	port = serial->port[co->index - serial->minor];
 	tty_port_tty_set(&port->port, NULL);
 
 	info->port = port;
@@ -210,7 +210,7 @@
 	if (count == 0)
 		return;
 
-	pr_debug("%s - port %d, %d byte(s)\n", __func__, port->number, count);
+	pr_debug("%s - minor %d, %d byte(s)\n", __func__, port->minor, count);
 
 	if (!port->port.console) {
 		pr_debug("%s - port not opened\n", __func__);

diff --git a/drivers/usb/serial/cp210x.c b/drivers/usb/serial/cp210x.c
index 2c65955..d6ef2f8 100644
--- a/drivers/usb/serial/cp210x.c
+++ b/drivers/usb/serial/cp210x.c

@@ -666,8 +666,6 @@
 	unsigned int bits;
 	unsigned int modem_ctl[4];
 
-	dev_dbg(dev, "%s - port %d\n", __func__, port->number);
-
 	if (!tty)
 		return;
 

diff --git a/drivers/usb/serial/cypress_m8.c b/drivers/usb/serial/cypress_m8.c
index 0821201..e948dc0 100644
--- a/drivers/usb/serial/cypress_m8.c
+++ b/drivers/usb/serial/cypress_m8.c

@@ -435,7 +435,7 @@
 	spin_unlock_irqrestore(&priv->lock, flags);
 
 	dev_err(&port->dev, "cypress_m8 suspending failing port %d - "
-		"interval might be too short\n", port->number);
+		"interval might be too short\n", port->port_number);
 }
 
 
@@ -667,7 +667,7 @@
 {
 	struct cypress_private *priv = usb_get_serial_port_data(port);
 
-	dev_dbg(&port->dev, "%s - port %d, %d bytes\n", __func__, port->number, count);
+	dev_dbg(&port->dev, "%s - %d bytes\n", __func__, count);
 
 	/* line control commands, which need to be executed immediately,
 	   are not put into the buffer for obvious reasons.

diff --git a/drivers/usb/serial/digi_acceleport.c b/drivers/usb/serial/digi_acceleport.c
index 7b807d3..19b467f 100644
--- a/drivers/usb/serial/digi_acceleport.c
+++ b/drivers/usb/serial/digi_acceleport.c

@@ -1304,11 +1304,7 @@
 
 static int digi_port_probe(struct usb_serial_port *port)
 {
-	unsigned port_num;
-
-	port_num = port->number - port->serial->minor;
-
-	return digi_port_init(port, port_num);
+	return digi_port_init(port, port->port_number);
 }
 
 static int digi_port_remove(struct usb_serial_port *port)

diff --git a/drivers/usb/serial/f81232.c b/drivers/usb/serial/f81232.c
index 7d8dd5a..75e85cb 100644
--- a/drivers/usb/serial/f81232.c
+++ b/drivers/usb/serial/f81232.c

@@ -288,15 +288,14 @@
 	struct serial_struct ser;
 	struct usb_serial_port *port = tty->driver_data;
 
-	dev_dbg(&port->dev, "%s (%d) cmd = 0x%04x\n", __func__,
-		port->number, cmd);
+	dev_dbg(&port->dev, "%s cmd = 0x%04x\n", __func__, cmd);
 
 	switch (cmd) {
 	case TIOCGSERIAL:
 		memset(&ser, 0, sizeof ser);
 		ser.type = PORT_16654;
-		ser.line = port->serial->minor;
-		ser.port = port->number;
+		ser.line = port->minor;
+		ser.port = port->port_number;
 		ser.baud_base = 460800;
 
 		if (copy_to_user((void __user *)arg, &ser, sizeof ser))

diff --git a/drivers/usb/serial/flashloader.c b/drivers/usb/serial/flashloader.c
new file mode 100644
index 0000000..e6f5c10
--- /dev/null
+++ b/drivers/usb/serial/flashloader.c

@@ -0,0 +1,39 @@
+/*
+ * Infineon Flashloader driver
+ *
+ * Copyright (C) 2013 Wei Shuai <cpuwolf@gmail.com>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License version
+ *	2 as published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/tty.h>
+#include <linux/module.h>
+#include <linux/usb.h>
+#include <linux/usb/serial.h>
+#include <linux/uaccess.h>
+
+static const struct usb_device_id id_table[] = {
+	{ USB_DEVICE(0x8087, 0x0716) },
+	{ },
+};
+MODULE_DEVICE_TABLE(usb, id_table);
+
+static struct usb_serial_driver flashloader_device = {
+	.driver = {
+		.owner =	THIS_MODULE,
+		.name =		"flashloader",
+	},
+	.id_table =		id_table,
+	.num_ports =		1,
+};
+
+static struct usb_serial_driver * const serial_drivers[] = {
+	&flashloader_device, NULL
+};
+
+module_usb_serial_driver(serial_drivers, id_table);
+MODULE_LICENSE("GPL");

diff --git a/drivers/usb/serial/garmin_gps.c b/drivers/usb/serial/garmin_gps.c
index b110c57..04b5ed9 100644
--- a/drivers/usb/serial/garmin_gps.c
+++ b/drivers/usb/serial/garmin_gps.c

@@ -948,9 +948,9 @@
 {
 	struct garmin_data *garmin_data_p = usb_get_serial_port_data(port);
 
-	dev_dbg(&port->dev, "%s - port %d - mode=%d state=%d flags=0x%X\n",
-		__func__, port->number, garmin_data_p->mode,
-		garmin_data_p->state, garmin_data_p->flags);
+	dev_dbg(&port->dev, "%s - mode=%d state=%d flags=0x%X\n",
+		__func__, garmin_data_p->mode, garmin_data_p->state,
+		garmin_data_p->flags);
 
 	garmin_clear(garmin_data_p);
 

diff --git a/drivers/usb/serial/io_edgeport.c b/drivers/usb/serial/io_edgeport.c
index 1477e85..dc2803b 100644
--- a/drivers/usb/serial/io_edgeport.c
+++ b/drivers/usb/serial/io_edgeport.c

@@ -915,8 +915,8 @@
 		return -ENOMEM;
 	}
 
-	dev_dbg(dev, "%s(%d) - Initialize TX fifo to %d bytes\n",
-		__func__, port->number, edge_port->maxTxCredits);
+	dev_dbg(dev, "%s - Initialize TX fifo to %d bytes\n",
+		__func__, edge_port->maxTxCredits);
 
 	return 0;
 }
@@ -1122,9 +1122,8 @@
 	copySize = min((unsigned int)count,
 				(edge_port->txCredits - fifo->count));
 
-	dev_dbg(&port->dev, "%s(%d) of %d byte(s) Fifo room  %d -- will copy %d bytes\n",
-		__func__, port->number, count,
-			edge_port->txCredits - fifo->count, copySize);
+	dev_dbg(&port->dev, "%s of %d byte(s) Fifo room  %d -- will copy %d bytes\n",
+		__func__, count, edge_port->txCredits - fifo->count, copySize);
 
 	/* catch writes of 0 bytes which the tty driver likes to give us,
 	   and when txCredits is empty */
@@ -1216,9 +1215,8 @@
 	if (edge_port->write_in_progress ||
 	    !edge_port->open             ||
 	    (fifo->count == 0)) {
-		dev_dbg(dev, "%s(%d) EXIT - fifo %d, PendingWrite = %d\n",
-			__func__, edge_port->port->number,
-			fifo->count, edge_port->write_in_progress);
+		dev_dbg(dev, "%s EXIT - fifo %d, PendingWrite = %d\n",
+			__func__, fifo->count, edge_port->write_in_progress);
 		goto exit_send;
 	}
 
@@ -1230,9 +1228,8 @@
 	 * it's better to wait for more credits so we can do a larger write.
 	 */
 	if (edge_port->txCredits < EDGE_FW_GET_TX_CREDITS_SEND_THRESHOLD(edge_port->maxTxCredits, EDGE_FW_BULK_MAX_PACKET_SIZE)) {
-		dev_dbg(dev, "%s(%d) Not enough credit - fifo %d TxCredit %d\n",
-			__func__, edge_port->port->number, fifo->count,
-			edge_port->txCredits);
+		dev_dbg(dev, "%s Not enough credit - fifo %d TxCredit %d\n",
+			__func__, fifo->count, edge_port->txCredits);
 		goto exit_send;
 	}
 
@@ -1256,10 +1253,8 @@
 		edge_port->write_in_progress = false;
 		goto exit_send;
 	}
-	buffer[0] = IOSP_BUILD_DATA_HDR1(edge_port->port->number
-				- edge_port->port->serial->minor, count);
-	buffer[1] = IOSP_BUILD_DATA_HDR2(edge_port->port->number
-				- edge_port->port->serial->minor, count);
+	buffer[0] = IOSP_BUILD_DATA_HDR1(edge_port->port->port_number, count);
+	buffer[1] = IOSP_BUILD_DATA_HDR2(edge_port->port->port_number, count);
 
 	/* now copy our data */
 	bytesleft =  fifo->size - fifo->tail;
@@ -1377,8 +1372,7 @@
 						edge_port->txfifo.count;
 	spin_unlock_irqrestore(&edge_port->ep_lock, flags);
 	if (num_chars) {
-		dev_dbg(&port->dev, "%s(port %d) - returns %d\n", __func__,
-			port->number, num_chars);
+		dev_dbg(&port->dev, "%s - returns %d\n", __func__, num_chars);
 	}
 
 	return num_chars;
@@ -1575,8 +1569,8 @@
 	memset(&tmp, 0, sizeof(tmp));
 
 	tmp.type		= PORT_16550A;
-	tmp.line		= edge_port->port->serial->minor;
-	tmp.port		= edge_port->port->number;
+	tmp.line		= edge_port->port->minor;
+	tmp.port		= edge_port->port->port_number;
 	tmp.irq			= 0;
 	tmp.flags		= ASYNC_SKIP_TEST | ASYNC_AUTO_IRQ;
 	tmp.xmit_fifo_size	= edge_port->maxTxCredits;
@@ -1601,15 +1595,15 @@
 	DEFINE_WAIT(wait);
 	struct edgeport_port *edge_port = usb_get_serial_port_data(port);
 
-	dev_dbg(&port->dev, "%s - port %d, cmd = 0x%x\n", __func__, port->number, cmd);
+	dev_dbg(&port->dev, "%s - cmd = 0x%x\n", __func__, cmd);
 
 	switch (cmd) {
 	case TIOCSERGETLSR:
-		dev_dbg(&port->dev, "%s (%d) TIOCSERGETLSR\n", __func__,  port->number);
+		dev_dbg(&port->dev, "%s TIOCSERGETLSR\n", __func__);
 		return get_lsr_info(edge_port, (unsigned int __user *) arg);
 
 	case TIOCGSERIAL:
-		dev_dbg(&port->dev, "%s (%d) TIOCGSERIAL\n", __func__,  port->number);
+		dev_dbg(&port->dev, "%s TIOCGSERIAL\n", __func__);
 		return get_serial_info(edge_port, (struct serial_struct __user *) arg);
 	}
 	return -ENOIOCTLCMD;
@@ -2181,9 +2175,8 @@
 
 	currentCommand = buffer;
 
-	MAKE_CMD_EXT_CMD(&currentCommand, &length,
-		edge_port->port->number - edge_port->port->serial->minor,
-		command, param);
+	MAKE_CMD_EXT_CMD(&currentCommand, &length, edge_port->port->port_number,
+			 command, param);
 
 	status = write_cmd_usb(edge_port, buffer, length);
 	if (status) {
@@ -2266,18 +2259,16 @@
 	int cmdLen = 0;
 	int divisor;
 	int status;
-	unsigned char number =
-		edge_port->port->number - edge_port->port->serial->minor;
+	u32 number = edge_port->port->port_number;
 
 	if (edge_serial->is_epic &&
 	    !edge_serial->epic_descriptor.Supports.IOSPSetBaudRate) {
-		dev_dbg(dev, "SendCmdWriteBaudRate - NOT Setting baud rate for port = %d, baud = %d\n",
-			edge_port->port->number, baudRate);
+		dev_dbg(dev, "SendCmdWriteBaudRate - NOT Setting baud rate for port, baud = %d\n",
+			baudRate);
 		return 0;
 	}
 
-	dev_dbg(dev, "%s - port = %d, baud = %d\n", __func__,
-		edge_port->port->number, baudRate);
+	dev_dbg(dev, "%s - baud = %d\n", __func__, baudRate);
 
 	status = calc_baud_rate_divisor(dev, baudRate, &divisor);
 	if (status) {
@@ -2388,9 +2379,8 @@
 	currCmd = cmdBuffer;
 
 	/* Build a cmd in the buffer to write the given register */
-	MAKE_CMD_WRITE_REG(&currCmd, &cmdLen,
-		edge_port->port->number - edge_port->port->serial->minor,
-		regNum, regValue);
+	MAKE_CMD_WRITE_REG(&currCmd, &cmdLen, edge_port->port->port_number,
+			   regNum, regValue);
 
 	status = write_cmd_usb(edge_port, cmdBuffer, cmdLen);
 	if (status) {
@@ -2424,8 +2414,6 @@
 	__u8 txFlow;
 	int status;
 
-	dev_dbg(dev, "%s - port %d\n", __func__, edge_port->port->number);
-
 	if (!edge_port->open &&
 	    !edge_port->openPending) {
 		dev_dbg(dev, "%s - port not opened\n", __func__);

diff --git a/drivers/usb/serial/io_ti.c b/drivers/usb/serial/io_ti.c
index 1be6ba7..60054e7 100644
--- a/drivers/usb/serial/io_ti.c
+++ b/drivers/usb/serial/io_ti.c

@@ -259,7 +259,7 @@
 /* clear tx/rx buffers and fifo in TI UMP */
 static int purge_port(struct usb_serial_port *port, __u16 mask)
 {
-	int port_number = port->number - port->serial->minor;
+	int port_number = port->port_number;
 
 	dev_dbg(&port->dev, "%s - port %d, mask %x\n", __func__, port_number, mask);
 
@@ -1392,7 +1392,8 @@
 
 static int ti_do_config(struct edgeport_port *port, int feature, int on)
 {
-	int port_number = port->port->number - port->port->serial->minor;
+	int port_number = port->port->port_number;
+
 	on = !!on;	/* 1 or 0 not bitmask */
 	return send_cmd(port->port->serial->dev,
 			feature, (__u8)(UMPM_UART1_PORT + port_number),
@@ -1637,7 +1638,7 @@
 		return;
 	}
 
-	port_number = edge_port->port->number - edge_port->port->serial->minor;
+	port_number = edge_port->port->port_number;
 
 	if (edge_port->lsr_event) {
 		edge_port->lsr_event = 0;
@@ -1730,7 +1731,7 @@
 	if (edge_port == NULL)
 		return -ENODEV;
 
-	port_number = port->number - port->serial->minor;
+	port_number = port->port_number;
 	switch (port_number) {
 	case 0:
 		edge_port->uart_base = UMPMEM_BASE_UART1;
@@ -1908,7 +1909,7 @@
 	spin_unlock_irqrestore(&edge_port->ep_lock, flags);
 
 	dev_dbg(&port->dev, "%s - send umpc_close_port\n", __func__);
-	port_number = port->number - port->serial->minor;
+	port_number = port->port_number;
 	send_cmd(serial->dev, UMPC_CLOSE_PORT,
 		     (__u8)(UMPM_UART1_PORT + port_number), 0, NULL, 0);
 
@@ -2137,10 +2138,7 @@
 	int baud;
 	unsigned cflag;
 	int status;
-	int port_number = edge_port->port->number -
-					edge_port->port->serial->minor;
-
-	dev_dbg(dev, "%s - port %d\n", __func__, edge_port->port->number);
+	int port_number = edge_port->port->port_number;
 
 	config = kmalloc (sizeof (*config), GFP_KERNEL);
 	if (!config) {
@@ -2284,7 +2282,6 @@
 		tty->termios.c_cflag, tty->termios.c_iflag);
 	dev_dbg(&port->dev, "%s - old clfag %08x old iflag %08x\n", __func__,
 		old_termios->c_cflag, old_termios->c_iflag);
-	dev_dbg(&port->dev, "%s - port %d\n", __func__, port->number);
 
 	if (edge_port == NULL)
 		return;
@@ -2366,8 +2363,8 @@
 	memset(&tmp, 0, sizeof(tmp));
 
 	tmp.type		= PORT_16550A;
-	tmp.line		= edge_port->port->serial->minor;
-	tmp.port		= edge_port->port->number;
+	tmp.line		= edge_port->port->minor;
+	tmp.port		= edge_port->port->port_number;
 	tmp.irq			= 0;
 	tmp.flags		= ASYNC_SKIP_TEST | ASYNC_AUTO_IRQ;
 	tmp.xmit_fifo_size	= edge_port->port->bulk_out_size;
@@ -2386,7 +2383,7 @@
 	struct usb_serial_port *port = tty->driver_data;
 	struct edgeport_port *edge_port = usb_get_serial_port_data(port);
 
-	dev_dbg(&port->dev, "%s - port %d, cmd = 0x%x\n", __func__, port->number, cmd);
+	dev_dbg(&port->dev, "%s - cmd = 0x%x\n", __func__, cmd);
 
 	switch (cmd) {
 	case TIOCGSERIAL:

diff --git a/drivers/usb/serial/keyspan.c b/drivers/usb/serial/keyspan.c
index 3549d07..5a97972 100644
--- a/drivers/usb/serial/keyspan.c
+++ b/drivers/usb/serial/keyspan.c

@@ -152,7 +152,7 @@
 	p_priv = usb_get_serial_port_data(port);
 	d_details = p_priv->device_details;
 	cflag = tty->termios.c_cflag;
-	device_port = port->number - port->serial->minor;
+	device_port = port->port_number;
 
 	/* Baud rate calculation takes baud rate as an integer
 	   so other rates can be generated if desired. */
@@ -234,8 +234,8 @@
 		dataOffset = 1;
 	}
 
-	dev_dbg(&port->dev, "%s - for port %d (%d chars), flip=%d\n",
-		__func__, port->number, count, p_priv->out_flip);
+	dev_dbg(&port->dev, "%s - %d chars, flip=%d\n", __func__, count,
+		p_priv->out_flip);
 
 	for (left = count; left > 0; left -= todo) {
 		todo = left;
@@ -520,12 +520,7 @@
 		goto exit;
 	}
 
-	/*
-	dev_dbg(&urb->dev->dev,
-		"%s %x %x %x %x %x %x %x %x %x %x %x %x", __func__,
-		data[0], data[1], data[2], data[3], data[4], data[5],
-		data[6], data[7], data[8], data[9], data[10], data[11]);
-	*/
+	/*dev_dbg(&urb->dev->dev, "%s %12ph", __func__, data);*/
 
 	/* Now do something useful with the data */
 	msg = (struct keyspan_usa28_portStatusMessage *)data;
@@ -607,11 +602,7 @@
 		goto exit;
 	}
 
-	/*
-	dev_dbg(&urb->dev->dev, "%s: %x %x %x %x %x %x %x %x %x %x %x",
-		__func__, data[0], data[1], data[2], data[3], data[4],
-		data[5], data[6], data[7], data[8], data[9], data[10]);
-	*/
+	/*dev_dbg(&urb->dev->dev, "%s: %11ph", __func__, data);*/
 
 	/* Now do something useful with the data */
 	msg = (struct keyspan_usa49_portStatusMessage *)data;
@@ -1050,7 +1041,7 @@
 	/* get the terminal config for the setup message now so we don't
 	 * need to send 2 of them */
 
-	device_port = port->number - port->serial->minor;
+	device_port = port->port_number;
 	if (tty) {
 		cflag = tty->termios.c_cflag;
 		/* Baud rate calculation takes baud rate as an integer
@@ -1556,7 +1547,7 @@
 	s_priv = usb_get_serial_data(serial);
 	p_priv = usb_get_serial_port_data(port);
 	d_details = s_priv->device_details;
-	device_port = port->number - port->serial->minor;
+	device_port = port->port_number;
 
 	this_urb = p_priv->outcont_urb;
 
@@ -1700,7 +1691,7 @@
 	s_priv = usb_get_serial_data(serial);
 	p_priv = usb_get_serial_port_data(port);
 	d_details = s_priv->device_details;
-	device_port = port->number - port->serial->minor;
+	device_port = port->port_number;
 
 	/* only do something if we have a bulk out endpoint */
 	this_urb = p_priv->outcont_urb;
@@ -1830,17 +1821,16 @@
 	this_urb = s_priv->glocont_urb;
 
 	/* Work out which port within the device is being setup */
-	device_port = port->number - port->serial->minor;
+	device_port = port->port_number;
 
 	/* Make sure we have an urb then send the message */
 	if (this_urb == NULL) {
-		dev_dbg(&port->dev, "%s - oops no urb for port %d.\n", __func__, port->number);
+		dev_dbg(&port->dev, "%s - oops no urb for port.\n", __func__);
 		return -1;
 	}
 
-	dev_dbg(&port->dev, "%s - endpoint %d port %d (%d)\n",
-		__func__, usb_pipeendpoint(this_urb->pipe),
-		port->number, device_port);
+	dev_dbg(&port->dev, "%s - endpoint %d (%d)\n",
+		__func__, usb_pipeendpoint(this_urb->pipe), device_port);
 
 	/* Save reset port val for resend.
 	   Don't overwrite resend for open/close condition. */
@@ -1855,7 +1845,6 @@
 
 	memset(&msg, 0, sizeof(struct keyspan_usa49_portControlMessage));
 
-	/*msg.portNumber = port->number;*/
 	msg.portNumber = device_port;
 
 	/* Only set baud rate if it's changed */
@@ -2145,12 +2134,11 @@
 	this_urb = s_priv->glocont_urb;
 
 	/* Work out which port within the device is being setup */
-	device_port = port->number - port->serial->minor;
+	device_port = port->port_number;
 
 	/* Make sure we have an urb then send the message */
 	if (this_urb == NULL) {
-		dev_dbg(&port->dev, "%s - oops no urb for port %d.\n", __func__,
-			port->number);
+		dev_dbg(&port->dev, "%s - oops no urb for port.\n", __func__);
 		return -1;
 	}
 
@@ -2391,7 +2379,7 @@
 	/* Setup values for the various callback routines */
 	cback = &keyspan_callbacks[d_details->msg_format];
 
-	port_num = port->number - port->serial->minor;
+	port_num = port->port_number;
 
 	/* Do indat endpoints first, once for each flip */
 	endp = d_details->indat_endpoints[port_num];

diff --git a/drivers/usb/serial/metro-usb.c b/drivers/usb/serial/metro-usb.c
index 47e2477..40ccf6e 100644
--- a/drivers/usb/serial/metro-usb.c
+++ b/drivers/usb/serial/metro-usb.c

@@ -224,8 +224,8 @@
 	result = metrousb_send_unidirectional_cmd(UNI_CMD_OPEN, port);
 	if (result) {
 		dev_err(&port->dev,
-			"%s - failed to configure device for port number=%d, error code=%d\n",
-			__func__, port->number, result);
+			"%s - failed to configure device, error code=%d\n",
+			__func__, result);
 		goto exit;
 	}
 

diff --git a/drivers/usb/serial/mos7720.c b/drivers/usb/serial/mos7720.c
index f27c621..51da424 100644
--- a/drivers/usb/serial/mos7720.c
+++ b/drivers/usb/serial/mos7720.c

@@ -1047,7 +1047,7 @@
 	  *
 	  * 0x08 : SP1/2 Control Reg
 	  */
-	port_number = port->number - port->serial->minor;
+	port_number = port->port_number;
 	read_mos_reg(serial, port_number, LSR, &data);
 
 	dev_dbg(&port->dev, "SS::%p LSR:%x\n", mos7720_port, data);
@@ -1066,7 +1066,7 @@
 
 	write_mos_reg(serial, port_number, SP_CONTROL_REG, 0x00);
 	read_mos_reg(serial, dummy, SP_CONTROL_REG, &data);
-	data = data | (port->number - port->serial->minor + 1);
+	data = data | (port->port_number + 1);
 	write_mos_reg(serial, dummy, SP_CONTROL_REG, data);
 	mos7720_port->shadowLCR = 0x83;
 	write_mos_reg(serial, port_number, LCR, mos7720_port->shadowLCR);
@@ -1147,8 +1147,8 @@
 	usb_kill_urb(port->write_urb);
 	usb_kill_urb(port->read_urb);
 
-	write_mos_reg(serial, port->number - port->serial->minor, MCR, 0x00);
-	write_mos_reg(serial, port->number - port->serial->minor, IER, 0x00);
+	write_mos_reg(serial, port->port_number, MCR, 0x00);
+	write_mos_reg(serial, port->port_number, IER, 0x00);
 
 	mos7720_port->open = 0;
 }
@@ -1172,8 +1172,7 @@
 		data = mos7720_port->shadowLCR & ~UART_LCR_SBC;
 
 	mos7720_port->shadowLCR  = data;
-	write_mos_reg(serial, port->number - port->serial->minor,
-		      LCR, mos7720_port->shadowLCR);
+	write_mos_reg(serial, port->port_number, LCR, mos7720_port->shadowLCR);
 }
 
 /*
@@ -1304,8 +1303,8 @@
 	/* if we are implementing RTS/CTS, toggle that line */
 	if (tty->termios.c_cflag & CRTSCTS) {
 		mos7720_port->shadowMCR &= ~UART_MCR_RTS;
-		write_mos_reg(port->serial, port->number - port->serial->minor,
-			      MCR, mos7720_port->shadowMCR);
+		write_mos_reg(port->serial, port->port_number, MCR,
+			      mos7720_port->shadowMCR);
 		if (status != 0)
 			return;
 	}
@@ -1336,8 +1335,8 @@
 	/* if we are implementing RTS/CTS, toggle that line */
 	if (tty->termios.c_cflag & CRTSCTS) {
 		mos7720_port->shadowMCR |= UART_MCR_RTS;
-		write_mos_reg(port->serial, port->number - port->serial->minor,
-			      MCR, mos7720_port->shadowMCR);
+		write_mos_reg(port->serial, port->port_number, MCR,
+			      mos7720_port->shadowMCR);
 		if (status != 0)
 			return;
 	}
@@ -1361,7 +1360,7 @@
 	 *      Init Sequence for higher rates
 	 ***********************************************/
 	dev_dbg(&port->dev, "Sending Setting Commands ..........\n");
-	port_number = port->number - port->serial->minor;
+	port_number = port->port_number;
 
 	write_mos_reg(serial, port_number, IER, 0x00);
 	write_mos_reg(serial, port_number, FCR, 0x00);
@@ -1487,7 +1486,7 @@
 	port = mos7720_port->port;
 	serial = port->serial;
 
-	number = port->number - port->serial->minor;
+	number = port->port_number;
 	dev_dbg(&port->dev, "%s - baud = %d\n", __func__, baudrate);
 
 	/* Calculate the Divisor */
@@ -1538,7 +1537,7 @@
 
 	port = mos7720_port->port;
 	serial = port->serial;
-	port_number = port->number - port->serial->minor;
+	port_number = port->port_number;
 
 	if (!mos7720_port->open) {
 		dev_dbg(&port->dev, "%s - port not opened\n", __func__);
@@ -1731,7 +1730,7 @@
 	struct usb_serial_port *port = tty->driver_data;
 	unsigned int result = 0;
 	unsigned char data = 0;
-	int port_number = port->number - port->serial->minor;
+	int port_number = port->port_number;
 	int count;
 
 	count = mos7720_chars_in_buffer(tty);
@@ -1793,8 +1792,8 @@
 		mcr &= ~UART_MCR_LOOP;
 
 	mos7720_port->shadowMCR = mcr;
-	write_mos_reg(port->serial, port->number - port->serial->minor,
-		      MCR, mos7720_port->shadowMCR);
+	write_mos_reg(port->serial, port->port_number, MCR,
+		      mos7720_port->shadowMCR);
 
 	return 0;
 }
@@ -1838,8 +1837,8 @@
 	}
 
 	mos7720_port->shadowMCR = mcr;
-	write_mos_reg(port->serial, port->number - port->serial->minor,
-		      MCR, mos7720_port->shadowMCR);
+	write_mos_reg(port->serial, port->port_number, MCR,
+		      mos7720_port->shadowMCR);
 
 	return 0;
 }
@@ -1855,8 +1854,8 @@
 	memset(&tmp, 0, sizeof(tmp));
 
 	tmp.type		= PORT_16550A;
-	tmp.line		= mos7720_port->port->serial->minor;
-	tmp.port		= mos7720_port->port->number;
+	tmp.line		= mos7720_port->port->minor;
+	tmp.port		= mos7720_port->port->port_number;
 	tmp.irq			= 0;
 	tmp.flags		= ASYNC_SKIP_TEST | ASYNC_AUTO_IRQ;
 	tmp.xmit_fifo_size	= NUM_URBS * URB_TRANSFER_BUFFER_SIZE;

diff --git a/drivers/usb/serial/mos7840.c b/drivers/usb/serial/mos7840.c
index 7e99808..0a818b2 100644
--- a/drivers/usb/serial/mos7840.c
+++ b/drivers/usb/serial/mos7840.c

@@ -303,15 +303,12 @@
 	/* For the UART control registers, the application number need
 	   to be Or'ed */
 	if (port->serial->num_ports == 4) {
-		val |= (((__u16) port->number -
-				(__u16) (port->serial->minor)) + 1) << 8;
+		val |= ((__u16)port->port_number + 1) << 8;
 	} else {
-		if (((__u16) port->number - (__u16) (port->serial->minor)) == 0) {
-			val |= (((__u16) port->number -
-			      (__u16) (port->serial->minor)) + 1) << 8;
+		if (port->port_number == 0) {
+			val |= ((__u16)port->port_number + 1) << 8;
 		} else {
-			val |= (((__u16) port->number -
-			      (__u16) (port->serial->minor)) + 2) << 8;
+			val |= ((__u16)port->port_number + 2) << 8;
 		}
 	}
 	dev_dbg(&port->dev, "%s application number is %x\n", __func__, val);
@@ -340,16 +337,12 @@
 
 	/* Wval  is same as application number */
 	if (port->serial->num_ports == 4) {
-		Wval =
-		    (((__u16) port->number - (__u16) (port->serial->minor)) +
-		     1) << 8;
+		Wval = ((__u16)port->port_number + 1) << 8;
 	} else {
-		if (((__u16) port->number - (__u16) (port->serial->minor)) == 0) {
-			Wval = (((__u16) port->number -
-			      (__u16) (port->serial->minor)) + 1) << 8;
+		if (port->port_number == 0) {
+			Wval = ((__u16)port->port_number + 1) << 8;
 		} else {
-			Wval = (((__u16) port->number -
-			      (__u16) (port->serial->minor)) + 2) << 8;
+			Wval = ((__u16)port->port_number + 2) << 8;
 		}
 	}
 	dev_dbg(&port->dev, "%s application number is %x\n", __func__, Wval);
@@ -631,9 +624,7 @@
 
 	for (i = 0; i < serial->num_ports; i++) {
 		mos7840_port = mos7840_get_port_private(serial->port[i]);
-		wval =
-		    (((__u16) serial->port[i]->number -
-		      (__u16) (serial->minor)) + 1) << 8;
+		wval = ((__u16)serial->port[i]->port_number + 1) << 8;
 		if (mos7840_port->open) {
 			if (sp[i] & 0x01) {
 				dev_dbg(&urb->dev->dev, "SP%d No Interrupt !!!\n", i);
@@ -1065,8 +1056,8 @@
 	 * (can't set it up in mos7840_startup as the  *
 	 * structures were not set up at that time.)   */
 
-	dev_dbg(&port->dev, "port number is %d\n", port->number);
-	dev_dbg(&port->dev, "serial number is %d\n", port->serial->minor);
+	dev_dbg(&port->dev, "port number is %d\n", port->port_number);
+	dev_dbg(&port->dev, "minor number is %d\n", port->minor);
 	dev_dbg(&port->dev, "Bulkin endpoint is %d\n", port->bulk_in_endpointAddress);
 	dev_dbg(&port->dev, "BulkOut endpoint is %d\n", port->bulk_out_endpointAddress);
 	dev_dbg(&port->dev, "Interrupt endpoint is %d\n", port->interrupt_in_endpointAddress);
@@ -1074,9 +1065,7 @@
 	mos7840_port->read_urb = port->read_urb;
 
 	/* set up our bulk in urb */
-	if ((serial->num_ports == 2)
-		&& ((((__u16)port->number -
-			(__u16)(port->serial->minor)) % 2) != 0)) {
+	if ((serial->num_ports == 2) && (((__u16)port->port_number % 2) != 0)) {
 		usb_fill_bulk_urb(mos7840_port->read_urb,
 			serial->dev,
 			usb_rcvbulkpipe(serial->dev,
@@ -1199,7 +1188,7 @@
 	mos7840_port->read_urb_busy = false;
 
 	port0->open_ports--;
-	dev_dbg(&port->dev, "%s in close%d:in port%d\n", __func__, port0->open_ports, port->number);
+	dev_dbg(&port->dev, "%s in close%d\n", __func__, port0->open_ports);
 	if (port0->open_ports == 0) {
 		if (serial->port[0]->interrupt_in_urb) {
 			dev_dbg(&port->dev, "Shutdown interrupt_in_urb\n");
@@ -1435,9 +1424,7 @@
 	memcpy(urb->transfer_buffer, current_position, transfer_size);
 
 	/* fill urb with data and submit  */
-	if ((serial->num_ports == 2)
-		&& ((((__u16)port->number -
-			(__u16)(port->serial->minor)) % 2) != 0)) {
+	if ((serial->num_ports == 2) && (((__u16)port->port_number % 2) != 0)) {
 		usb_fill_bulk_urb(urb,
 			serial->dev,
 			usb_sndbulkpipe(serial->dev,
@@ -1732,10 +1719,9 @@
 	if (mos7840_serial_paranoia_check(port->serial, __func__))
 		return -1;
 
-	number = mos7840_port->port->number - mos7840_port->port->serial->minor;
+	number = mos7840_port->port->port_number;
 
-	dev_dbg(&port->dev, "%s - port = %d, baud = %d\n", __func__,
-		mos7840_port->port->number, baudRate);
+	dev_dbg(&port->dev, "%s - baud = %d\n", __func__, baudRate);
 	/* reset clk_uart_sel in spregOffset */
 	if (baudRate > 115200) {
 #ifdef HW_flow_control
@@ -2016,7 +2002,6 @@
 		tty->termios.c_cflag, RELEVANT_IFLAG(tty->termios.c_iflag));
 	dev_dbg(&port->dev, "%s - old clfag %08x old iflag %08x\n", __func__,
 		old_termios->c_cflag, RELEVANT_IFLAG(old_termios->c_iflag));
-	dev_dbg(&port->dev, "%s - port %d\n", __func__, port->number);
 
 	/* change the port settings to the new ones specified */
 
@@ -2083,8 +2068,8 @@
 	memset(&tmp, 0, sizeof(tmp));
 
 	tmp.type = PORT_16550A;
-	tmp.line = mos7840_port->port->serial->minor;
-	tmp.port = mos7840_port->port->number;
+	tmp.line = mos7840_port->port->minor;
+	tmp.port = mos7840_port->port->port_number;
 	tmp.irq = 0;
 	tmp.flags = ASYNC_SKIP_TEST | ASYNC_AUTO_IRQ;
 	tmp.xmit_fifo_size = NUM_URBS * URB_TRANSFER_BUFFER_SIZE;
@@ -2240,7 +2225,7 @@
 	/* we set up the pointers to the endpoints in the mos7840_open *
 	 * function, as the structures aren't created yet.             */
 
-	pnum = port->number - serial->minor;
+	pnum = port->port_number;
 
 	dev_dbg(&port->dev, "mos7840_startup: configuring port %d\n", pnum);
 	mos7840_port = kzalloc(sizeof(struct moschip_port), GFP_KERNEL);
@@ -2261,10 +2246,8 @@
 	 * usb-serial.c:get_free_serial() and cannot therefore be used
 	 * to index device instances */
 	mos7840_port->port_num = pnum + 1;
-	dev_dbg(&port->dev, "port->number = %d\n", port->number);
-	dev_dbg(&port->dev, "port->serial->minor = %d\n", port->serial->minor);
+	dev_dbg(&port->dev, "port->minor = %d\n", port->minor);
 	dev_dbg(&port->dev, "mos7840_port->port_num = %d\n", mos7840_port->port_num);
-	dev_dbg(&port->dev, "serial->minor = %d\n", serial->minor);
 
 	if (mos7840_port->port_num == 1) {
 		mos7840_port->SpRegOffset = 0x0;

diff --git a/drivers/usb/serial/opticon.c b/drivers/usb/serial/opticon.c
index 5f4b0cd..cbe779f 100644
--- a/drivers/usb/serial/opticon.c
+++ b/drivers/usb/serial/opticon.c

@@ -348,7 +348,7 @@
 
 	/* fake emulate a 16550 uart to make userspace code happy */
 	tmp.type		= PORT_16550A;
-	tmp.line		= port->serial->minor;
+	tmp.line		= port->minor;
 	tmp.port		= 0;
 	tmp.irq			= 0;
 	tmp.flags		= ASYNC_SKIP_TEST | ASYNC_AUTO_IRQ;
@@ -367,7 +367,7 @@
 {
 	struct usb_serial_port *port = tty->driver_data;
 
-	dev_dbg(&port->dev, "%s - port %d, cmd = 0x%x\n", __func__, port->number, cmd);
+	dev_dbg(&port->dev, "%s - cmd = 0x%x\n", __func__, cmd);
 
 	switch (cmd) {
 	case TIOCGSERIAL:

diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c
index bd4323d..5dd857d 100644
--- a/drivers/usb/serial/option.c
+++ b/drivers/usb/serial/option.c

@@ -159,8 +159,6 @@
 #define NOVATELWIRELESS_PRODUCT_HSPA_EMBEDDED_FULLSPEED	0x9000
 #define NOVATELWIRELESS_PRODUCT_HSPA_EMBEDDED_HIGHSPEED	0x9001
 #define NOVATELWIRELESS_PRODUCT_E362		0x9010
-#define NOVATELWIRELESS_PRODUCT_G1		0xA001
-#define NOVATELWIRELESS_PRODUCT_G1_M		0xA002
 #define NOVATELWIRELESS_PRODUCT_G2		0xA010
 #define NOVATELWIRELESS_PRODUCT_MC551		0xB001
 
@@ -730,8 +728,6 @@
 	{ USB_DEVICE(NOVATELWIRELESS_VENDOR_ID, NOVATELWIRELESS_PRODUCT_MC547) },
 	{ USB_DEVICE(NOVATELWIRELESS_VENDOR_ID, NOVATELWIRELESS_PRODUCT_EVDO_EMBEDDED_HIGHSPEED) },
 	{ USB_DEVICE(NOVATELWIRELESS_VENDOR_ID, NOVATELWIRELESS_PRODUCT_HSPA_EMBEDDED_HIGHSPEED) },
-	{ USB_DEVICE(NOVATELWIRELESS_VENDOR_ID, NOVATELWIRELESS_PRODUCT_G1) },
-	{ USB_DEVICE(NOVATELWIRELESS_VENDOR_ID, NOVATELWIRELESS_PRODUCT_G1_M) },
 	{ USB_DEVICE(NOVATELWIRELESS_VENDOR_ID, NOVATELWIRELESS_PRODUCT_G2) },
 	/* Novatel Ovation MC551 a.k.a. Verizon USB551L */
 	{ USB_DEVICE_AND_INTERFACE_INFO(NOVATELWIRELESS_VENDOR_ID, NOVATELWIRELESS_PRODUCT_MC551, 0xff, 0xff, 0xff) },

diff --git a/drivers/usb/serial/pl2303.c b/drivers/usb/serial/pl2303.c
index 048cd44..cb6bbed 100644
--- a/drivers/usb/serial/pl2303.c
+++ b/drivers/usb/serial/pl2303.c

@@ -275,7 +275,7 @@
 	u8 control;
 	const int baud_sup[] = { 75, 150, 300, 600, 1200, 1800, 2400, 3600,
 	                         4800, 7200, 9600, 14400, 19200, 28800, 38400,
-	                         57600, 115200, 230400, 460800, 614400,
+	                         57600, 115200, 230400, 460800, 500000, 614400,
 	                         921600, 1228800, 2457600, 3000000, 6000000 };
 	int baud_floor, baud_ceil;
 	int k;
@@ -301,8 +301,7 @@
 	i = usb_control_msg(serial->dev, usb_rcvctrlpipe(serial->dev, 0),
 			    GET_LINE_REQUEST, GET_LINE_REQUEST_TYPE,
 			    0, 0, buf, 7, 100);
-	dev_dbg(&port->dev, "0xa1:0x21:0:0  %d - %x %x %x %x %x %x %x\n", i,
-	    buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6]);
+	dev_dbg(&port->dev, "0xa1:0x21:0:0  %d - %7ph\n", i, buf);
 
 	if (cflag & CSIZE) {
 		switch (cflag & CSIZE) {
@@ -449,8 +448,7 @@
 	i = usb_control_msg(serial->dev, usb_rcvctrlpipe(serial->dev, 0),
 			    GET_LINE_REQUEST, GET_LINE_REQUEST_TYPE,
 			    0, 0, buf, 7, 100);
-	dev_dbg(&port->dev, "0xa1:0x21:0:0  %d - %x %x %x %x %x %x %x\n", i,
-	     buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6]);
+	dev_dbg(&port->dev, "0xa1:0x21:0:0  %d - %7ph\n", i, buf);
 
 	if (cflag & CRTSCTS) {
 		if (spriv->type == HX)
@@ -641,8 +639,8 @@
 	case TIOCGSERIAL:
 		memset(&ser, 0, sizeof ser);
 		ser.type = PORT_16654;
-		ser.line = port->serial->minor;
-		ser.port = port->number;
+		ser.line = port->minor;
+		ser.port = port->port_number;
 		ser.baud_base = 460800;
 
 		if (copy_to_user((void __user *)arg, &ser, sizeof ser))

diff --git a/drivers/usb/serial/qcserial.c b/drivers/usb/serial/qcserial.c
index bd794b4..c65437c 100644
--- a/drivers/usb/serial/qcserial.c
+++ b/drivers/usb/serial/qcserial.c

@@ -35,7 +35,13 @@
 	{DEVICE_G1K(0x04da, 0x250c)},	/* Panasonic Gobi QDL device */
 	{DEVICE_G1K(0x413c, 0x8172)},	/* Dell Gobi Modem device */
 	{DEVICE_G1K(0x413c, 0x8171)},	/* Dell Gobi QDL device */
-	{DEVICE_G1K(0x1410, 0xa001)},	/* Novatel Gobi Modem device */
+	{DEVICE_G1K(0x1410, 0xa001)},	/* Novatel/Verizon USB-1000 */
+	{DEVICE_G1K(0x1410, 0xa002)},	/* Novatel Gobi Modem device */
+	{DEVICE_G1K(0x1410, 0xa003)},	/* Novatel Gobi Modem device */
+	{DEVICE_G1K(0x1410, 0xa004)},	/* Novatel Gobi Modem device */
+	{DEVICE_G1K(0x1410, 0xa005)},	/* Novatel Gobi Modem device */
+	{DEVICE_G1K(0x1410, 0xa006)},	/* Novatel Gobi Modem device */
+	{DEVICE_G1K(0x1410, 0xa007)},	/* Novatel Gobi Modem device */
 	{DEVICE_G1K(0x1410, 0xa008)},	/* Novatel Gobi QDL device */
 	{DEVICE_G1K(0x0b05, 0x1776)},	/* Asus Gobi Modem device */
 	{DEVICE_G1K(0x0b05, 0x1774)},	/* Asus Gobi QDL device */

diff --git a/drivers/usb/serial/quatech2.c b/drivers/usb/serial/quatech2.c
index 02b0803..d997432 100644
--- a/drivers/usb/serial/quatech2.c
+++ b/drivers/usb/serial/quatech2.c

@@ -343,7 +343,7 @@
 	int status;
 	unsigned long flags;
 
-	device_port = (u16) (port->number - port->serial->minor);
+	device_port = port->port_number;
 
 	serial = port->serial;
 
@@ -388,9 +388,8 @@
 	status = qt2_set_port_config(serial->dev, device_port,
 				     DEFAULT_BAUD_RATE, UART_LCR_WLEN8);
 	if (status < 0) {
-		dev_err(&port->dev,
-			"%s - initial setup failed for port %i (%i)\n",
-			__func__, port->number, device_port);
+		dev_err(&port->dev, "%s - initial setup failed (%i)\n",
+			__func__, device_port);
 		return status;
 	}
 
@@ -466,7 +465,7 @@
 		return -EFAULT;
 
 	memset(&tmp, 0, sizeof(tmp));
-	tmp.line		= port->serial->minor;
+	tmp.line		= port->minor;
 	tmp.port		= 0;
 	tmp.irq			= 0;
 	tmp.flags		= ASYNC_SKIP_TEST | ASYNC_AUTO_IRQ;
@@ -523,7 +522,7 @@
 	return;
 }
 
-void qt2_process_read_urb(struct urb *urb)
+static void qt2_process_read_urb(struct urb *urb)
 {
 	struct usb_serial *serial;
 	struct qt2_serial_private *serial_priv;

diff --git a/drivers/usb/serial/sierra.c b/drivers/usb/serial/sierra.c
index 8894665..de958c5 100644
--- a/drivers/usb/serial/sierra.c
+++ b/drivers/usb/serial/sierra.c

@@ -914,7 +914,7 @@
 		/* This is really the usb-serial port number of the interface
 		 * rather than the interface number.
 		 */
-		ifnum = port->number - serial->minor;
+		ifnum = port->port_number;
 		himemoryp = &typeA_interface_list;
 	}
 

diff --git a/drivers/usb/serial/ssu100.c b/drivers/usb/serial/ssu100.c
index 5b62dbb..e5750be 100644
--- a/drivers/usb/serial/ssu100.c
+++ b/drivers/usb/serial/ssu100.c

@@ -323,7 +323,7 @@
 		return -EFAULT;
 
 	memset(&tmp, 0, sizeof(tmp));
-	tmp.line		= port->serial->minor;
+	tmp.line		= port->minor;
 	tmp.port		= 0;
 	tmp.irq			= 0;
 	tmp.flags		= ASYNC_SKIP_TEST | ASYNC_AUTO_IRQ;

diff --git a/drivers/usb/serial/ti_usb_3410_5052.c b/drivers/usb/serial/ti_usb_3410_5052.c
index e581c25..7182bb7 100644
--- a/drivers/usb/serial/ti_usb_3410_5052.c
+++ b/drivers/usb/serial/ti_usb_3410_5052.c

@@ -477,7 +477,7 @@
 	if (mutex_lock_interruptible(&tdev->td_open_close_lock))
 		return -ERESTARTSYS;
 
-	port_number = port->number - port->serial->minor;
+	port_number = port->port_number;
 
 	tport->tp_msr = 0;
 	tport->tp_shadow_mcr |= (TI_MCR_RTS | TI_MCR_DTR);
@@ -619,7 +619,7 @@
 	kfifo_reset_out(&tport->write_fifo);
 	spin_unlock_irqrestore(&tport->tp_lock, flags);
 
-	port_number = port->number - port->serial->minor;
+	port_number = port->port_number;
 
 	dev_dbg(&port->dev, "%s - sending TI_CLOSE_PORT\n", __func__);
 	status = ti_command_out_sync(tdev, TI_CLOSE_PORT,
@@ -777,7 +777,7 @@
 	tcflag_t cflag, iflag;
 	int baud;
 	int status;
-	int port_number = port->number - port->serial->minor;
+	int port_number = port->port_number;
 	unsigned int mcr;
 
 	cflag = tty->termios.c_cflag;
@@ -1263,7 +1263,7 @@
 	int size, status;
 	struct ti_device *tdev = tport->tp_tdev;
 	struct usb_serial_port *port = tport->tp_port;
-	int port_number = port->number - port->serial->minor;
+	int port_number = port->port_number;
 	struct ti_port_status *data;
 
 	size = sizeof(struct ti_port_status);
@@ -1309,8 +1309,8 @@
 	memset(&ret_serial, 0, sizeof(ret_serial));
 
 	ret_serial.type = PORT_16550A;
-	ret_serial.line = port->serial->minor;
-	ret_serial.port = port->number - port->serial->minor;
+	ret_serial.line = port->minor;
+	ret_serial.port = port->port_number;
 	ret_serial.flags = tport->tp_flags;
 	ret_serial.xmit_fifo_size = TI_WRITE_BUF_SIZE;
 	ret_serial.baud_base = tport->tp_tdev->td_is_3410 ? 921600 : 460800;

diff --git a/drivers/usb/serial/usb-serial.c b/drivers/usb/serial/usb-serial.c
index 5f6b1ff..cb27fcb 100644
--- a/drivers/usb/serial/usb-serial.c
+++ b/drivers/usb/serial/usb-serial.c

@@ -37,11 +37,15 @@
 #include <linux/usb.h>
 #include <linux/usb/serial.h>
 #include <linux/kfifo.h>
+#include <linux/idr.h>
 #include "pl2303.h"
 
 #define DRIVER_AUTHOR "Greg Kroah-Hartman <gregkh@linuxfoundation.org>"
 #define DRIVER_DESC "USB Serial Driver core"
 
+#define USB_SERIAL_TTY_MAJOR	188
+#define USB_SERIAL_TTY_MINORS	512	/* should be enough for a while */
+
 /* There is no MODULE_DEVICE_TABLE for usbserial.c.  Instead
    the MODULE_DEVICE_TABLE declarations in each serial driver
    cause the "hotplug" program to pull in whatever module is necessary
@@ -49,81 +53,75 @@
    drivers depend on it.
 */
 
-static struct usb_serial *serial_table[SERIAL_TTY_MINORS];
+static DEFINE_IDR(serial_minors);
 static DEFINE_MUTEX(table_lock);
 static LIST_HEAD(usb_serial_driver_list);
 
 /*
- * Look up the serial structure.  If it is found and it hasn't been
- * disconnected, return with its disc_mutex held and its refcount
- * incremented.  Otherwise return NULL.
+ * Look up the serial port structure.  If it is found and it hasn't been
+ * disconnected, return with the parent usb_serial structure's disc_mutex held
+ * and its refcount incremented.  Otherwise return NULL.
  */
-struct usb_serial *usb_serial_get_by_index(unsigned index)
+struct usb_serial_port *usb_serial_port_get_by_minor(unsigned minor)
 {
 	struct usb_serial *serial;
+	struct usb_serial_port *port;
 
 	mutex_lock(&table_lock);
-	serial = serial_table[index];
+	port = idr_find(&serial_minors, minor);
+	if (!port)
+		goto exit;
 
-	if (serial) {
-		mutex_lock(&serial->disc_mutex);
-		if (serial->disconnected) {
-			mutex_unlock(&serial->disc_mutex);
-			serial = NULL;
-		} else {
-			kref_get(&serial->kref);
-		}
+	serial = port->serial;
+	mutex_lock(&serial->disc_mutex);
+	if (serial->disconnected) {
+		mutex_unlock(&serial->disc_mutex);
+		port = NULL;
+	} else {
+		kref_get(&serial->kref);
 	}
+exit:
 	mutex_unlock(&table_lock);
-	return serial;
+	return port;
 }
 
-static struct usb_serial *get_free_serial(struct usb_serial *serial,
-					int num_ports, unsigned int *minor)
+static int allocate_minors(struct usb_serial *serial, int num_ports)
 {
+	struct usb_serial_port *port;
 	unsigned int i, j;
-	int good_spot;
+	int minor;
 
 	dev_dbg(&serial->interface->dev, "%s %d\n", __func__, num_ports);
 
-	*minor = 0;
 	mutex_lock(&table_lock);
-	for (i = 0; i < SERIAL_TTY_MINORS; ++i) {
-		if (serial_table[i])
-			continue;
-
-		good_spot = 1;
-		for (j = 1; j <= num_ports-1; ++j)
-			if ((i+j >= SERIAL_TTY_MINORS) || (serial_table[i+j])) {
-				good_spot = 0;
-				i += j;
-				break;
-			}
-		if (good_spot == 0)
-			continue;
-
-		*minor = i;
-		j = 0;
-		dev_dbg(&serial->interface->dev, "%s - minor base = %d\n", __func__, *minor);
-		for (i = *minor; (i < (*minor + num_ports)) && (i < SERIAL_TTY_MINORS); ++i) {
-			serial_table[i] = serial;
-			serial->port[j++]->number = i;
-		}
-		mutex_unlock(&table_lock);
-		return serial;
+	for (i = 0; i < num_ports; ++i) {
+		port = serial->port[i];
+		minor = idr_alloc(&serial_minors, port, 0, 0, GFP_KERNEL);
+		if (minor < 0)
+			goto error;
+		port->minor = minor;
+		port->port_number = i;
 	}
+	serial->minors_reserved = 1;
 	mutex_unlock(&table_lock);
-	return NULL;
+	return 0;
+error:
+	/* unwind the already allocated minors */
+	for (j = 0; j < i; ++j)
+		idr_remove(&serial_minors, serial->port[j]->minor);
+	mutex_unlock(&table_lock);
+	return minor;
 }
 
-static void return_serial(struct usb_serial *serial)
+static void release_minors(struct usb_serial *serial)
 {
 	int i;
 
 	mutex_lock(&table_lock);
 	for (i = 0; i < serial->num_ports; ++i)
-		serial_table[serial->minor + i] = NULL;
+		idr_remove(&serial_minors, serial->port[i]->minor);
 	mutex_unlock(&table_lock);
+	serial->minors_reserved = 0;
 }
 
 static void destroy_serial(struct kref *kref)
@@ -135,8 +133,8 @@
 	serial = to_usb_serial(kref);
 
 	/* return the minor range that this device had */
-	if (serial->minor != SERIAL_TTY_NO_MINOR)
-		return_serial(serial);
+	if (serial->minors_reserved)
+		release_minors(serial);
 
 	if (serial->attached && serial->type->release)
 		serial->type->release(serial);
@@ -185,13 +183,11 @@
 	struct usb_serial_port *port;
 	int retval = -ENODEV;
 
-	serial = usb_serial_get_by_index(idx);
-	if (!serial)
+	port = usb_serial_port_get_by_minor(idx);
+	if (!port)
 		return retval;
 
-	port = serial->port[idx - serial->minor];
-	if (!port)
-		goto error_no_port;
+	serial = port->serial;
 	if (!try_module_get(serial->type->driver.owner))
 		goto error_module_get;
 
@@ -218,7 +214,6 @@
  error_get_interface:
 	module_put(serial->type->driver.owner);
  error_module_get:
- error_no_port:
 	usb_serial_put(serial);
 	mutex_unlock(&serial->disc_mutex);
 	return retval;
@@ -452,14 +447,16 @@
 static int serial_proc_show(struct seq_file *m, void *v)
 {
 	struct usb_serial *serial;
+	struct usb_serial_port *port;
 	int i;
 	char tmp[40];
 
 	seq_puts(m, "usbserinfo:1.0 driver:2.0\n");
-	for (i = 0; i < SERIAL_TTY_MINORS; ++i) {
-		serial = usb_serial_get_by_index(i);
-		if (serial == NULL)
+	for (i = 0; i < USB_SERIAL_TTY_MINORS; ++i) {
+		port = usb_serial_port_get_by_minor(i);
+		if (port == NULL)
 			continue;
+		serial = port->serial;
 
 		seq_printf(m, "%d:", i);
 		if (serial->type->driver.owner)
@@ -471,7 +468,7 @@
 			le16_to_cpu(serial->dev->descriptor.idVendor),
 			le16_to_cpu(serial->dev->descriptor.idProduct));
 		seq_printf(m, " num_ports:%d", serial->num_ports);
-		seq_printf(m, " port:%d", i - serial->minor + 1);
+		seq_printf(m, " port:%d", port->port_number);
 		usb_make_path(serial->dev, tmp, sizeof(tmp));
 		seq_printf(m, " path:%s", tmp);
 
@@ -613,7 +610,7 @@
 	serial->interface = usb_get_intf(interface);
 	kref_init(&serial->kref);
 	mutex_init(&serial->disc_mutex);
-	serial->minor = SERIAL_TTY_NO_MINOR;
+	serial->minors_reserved = 0;
 
 	return serial;
 }
@@ -722,7 +719,6 @@
 	struct usb_endpoint_descriptor *bulk_out_endpoint[MAX_NUM_PORTS];
 	struct usb_serial_driver *type = NULL;
 	int retval;
-	unsigned int minor;
 	int buffer_size;
 	int i;
 	int j;
@@ -1039,16 +1035,15 @@
 	 */
 	serial->disconnected = 1;
 
-	if (get_free_serial(serial, num_ports, &minor) == NULL) {
-		dev_err(ddev, "No more free serial devices\n");
+	if (allocate_minors(serial, num_ports)) {
+		dev_err(ddev, "No more free serial minor numbers\n");
 		goto probe_error;
 	}
-	serial->minor = minor;
 
 	/* register all of the individual ports with the driver core */
 	for (i = 0; i < num_ports; ++i) {
 		port = serial->port[i];
-		dev_set_name(&port->dev, "ttyUSB%d", port->number);
+		dev_set_name(&port->dev, "ttyUSB%d", port->minor);
 		dev_dbg(ddev, "registering %s", dev_name(&port->dev));
 		device_enable_async_suspend(&port->dev);
 
@@ -1059,7 +1054,7 @@
 
 	serial->disconnected = 0;
 
-	usb_serial_console_init(minor);
+	usb_serial_console_init(serial->port[0]->minor);
 exit:
 	module_put(type->driver.owner);
 	return 0;
@@ -1223,17 +1218,13 @@
 
 static int __init usb_serial_init(void)
 {
-	int i;
 	int result;
 
-	usb_serial_tty_driver = alloc_tty_driver(SERIAL_TTY_MINORS);
+	usb_serial_tty_driver = alloc_tty_driver(USB_SERIAL_TTY_MINORS);
 	if (!usb_serial_tty_driver)
 		return -ENOMEM;
 
 	/* Initialize our global data */
-	for (i = 0; i < SERIAL_TTY_MINORS; ++i)
-		serial_table[i] = NULL;
-
 	result = bus_register(&usb_serial_bus_type);
 	if (result) {
 		pr_err("%s - registering bus driver failed\n", __func__);
@@ -1242,7 +1233,7 @@
 
 	usb_serial_tty_driver->driver_name = "usbserial";
 	usb_serial_tty_driver->name = "ttyUSB";
-	usb_serial_tty_driver->major = SERIAL_TTY_MAJOR;
+	usb_serial_tty_driver->major = USB_SERIAL_TTY_MAJOR;
 	usb_serial_tty_driver->minor_start = 0;
 	usb_serial_tty_driver->type = TTY_DRIVER_TYPE_SERIAL;
 	usb_serial_tty_driver->subtype = SERIAL_TYPE_NORMAL;

diff --git a/drivers/usb/serial/usb_wwan.c b/drivers/usb/serial/usb_wwan.c
index ece326e..8257d30 100644
--- a/drivers/usb/serial/usb_wwan.c
+++ b/drivers/usb/serial/usb_wwan.c

@@ -124,8 +124,8 @@
 		return -EFAULT;
 
 	memset(&tmp, 0, sizeof(tmp));
-	tmp.line            = port->serial->minor;
-	tmp.port            = port->number;
+	tmp.line            = port->minor;
+	tmp.port            = port->port_number;
 	tmp.baud_base       = tty_get_baud_rate(port->port.tty);
 	tmp.close_delay	    = port->port.close_delay / 10;
 	tmp.closing_wait    = port->port.closing_wait == ASYNC_CLOSING_WAIT_NONE ?

diff --git a/drivers/usb/serial/whiteheat.c b/drivers/usb/serial/whiteheat.c
index 347caad..36a7740 100644
--- a/drivers/usb/serial/whiteheat.c
+++ b/drivers/usb/serial/whiteheat.c

@@ -461,8 +461,8 @@
 	case TIOCGSERIAL:
 		memset(&serstruct, 0, sizeof(serstruct));
 		serstruct.type = PORT_16654;
-		serstruct.line = port->serial->minor;
-		serstruct.port = port->number;
+		serstruct.line = port->minor;
+		serstruct.port = port->port_number;
 		serstruct.flags = ASYNC_SKIP_TEST | ASYNC_AUTO_IRQ;
 		serstruct.xmit_fifo_size = kfifo_size(&port->write_fifo);
 		serstruct.custom_divisor = 0;
@@ -626,7 +626,7 @@
 {
 	struct whiteheat_simple open_command;
 
-	open_command.port = port->number - port->serial->minor + 1;
+	open_command.port = port->port_number + 1;
 	return firm_send_command(port, WHITEHEAT_OPEN,
 		(__u8 *)&open_command, sizeof(open_command));
 }
@@ -636,7 +636,7 @@
 {
 	struct whiteheat_simple close_command;
 
-	close_command.port = port->number - port->serial->minor + 1;
+	close_command.port = port->port_number + 1;
 	return firm_send_command(port, WHITEHEAT_CLOSE,
 			(__u8 *)&close_command, sizeof(close_command));
 }
@@ -649,7 +649,7 @@
 	struct whiteheat_port_settings port_settings;
 	unsigned int cflag = tty->termios.c_cflag;
 
-	port_settings.port = port->number - port->serial->minor + 1;
+	port_settings.port = port->port_number + 1;
 
 	/* get the byte size */
 	switch (cflag & CSIZE) {
@@ -726,7 +726,7 @@
 {
 	struct whiteheat_set_rdb rts_command;
 
-	rts_command.port = port->number - port->serial->minor + 1;
+	rts_command.port = port->port_number + 1;
 	rts_command.state = onoff;
 	return firm_send_command(port, WHITEHEAT_SET_RTS,
 			(__u8 *)&rts_command, sizeof(rts_command));
@@ -737,7 +737,7 @@
 {
 	struct whiteheat_set_rdb dtr_command;
 
-	dtr_command.port = port->number - port->serial->minor + 1;
+	dtr_command.port = port->port_number + 1;
 	dtr_command.state = onoff;
 	return firm_send_command(port, WHITEHEAT_SET_DTR,
 			(__u8 *)&dtr_command, sizeof(dtr_command));
@@ -748,7 +748,7 @@
 {
 	struct whiteheat_set_rdb break_command;
 
-	break_command.port = port->number - port->serial->minor + 1;
+	break_command.port = port->port_number + 1;
 	break_command.state = onoff;
 	return firm_send_command(port, WHITEHEAT_SET_BREAK,
 			(__u8 *)&break_command, sizeof(break_command));
@@ -759,7 +759,7 @@
 {
 	struct whiteheat_purge purge_command;
 
-	purge_command.port = port->number - port->serial->minor + 1;
+	purge_command.port = port->port_number + 1;
 	purge_command.what = rxtx;
 	return firm_send_command(port, WHITEHEAT_PURGE,
 			(__u8 *)&purge_command, sizeof(purge_command));
@@ -770,7 +770,7 @@
 {
 	struct whiteheat_simple get_dr_command;
 
-	get_dr_command.port = port->number - port->serial->minor + 1;
+	get_dr_command.port = port->port_number + 1;
 	return firm_send_command(port, WHITEHEAT_GET_DTR_RTS,
 			(__u8 *)&get_dr_command, sizeof(get_dr_command));
 }
@@ -780,7 +780,7 @@
 {
 	struct whiteheat_simple close_command;
 
-	close_command.port = port->number - port->serial->minor + 1;
+	close_command.port = port->port_number + 1;
 	return firm_send_command(port, WHITEHEAT_REPORT_TX_DONE,
 			(__u8 *)&close_command, sizeof(close_command));
 }

diff --git a/drivers/usb/storage/alauda.c b/drivers/usb/storage/alauda.c
index 77a2ddf..6636a58 100644
--- a/drivers/usb/storage/alauda.c
+++ b/drivers/usb/storage/alauda.c

@@ -249,11 +249,7 @@
 /* compute 3-byte ecc on 256 bytes */
 static void nand_compute_ecc(unsigned char *data, unsigned char *ecc) {
 	int i, j, a;
-	unsigned char par, bit, bits[8];
-
-	par = 0;
-	for (j = 0; j < 8; j++)
-		bits[j] = 0;
+	unsigned char par = 0, bit, bits[8] = {0};
 
 	/* collect 16 checksum bits */
 	for (i = 0; i < 256; i++) {

diff --git a/drivers/usb/storage/sddr09.c b/drivers/usb/storage/sddr09.c
index 732027f..073a2c3 100644
--- a/drivers/usb/storage/sddr09.c
+++ b/drivers/usb/storage/sddr09.c

@@ -219,11 +219,7 @@
 /* compute 3-byte ecc on 256 bytes */
 static void nand_compute_ecc(unsigned char *data, unsigned char *ecc) {
 	int i, j, a;
-	unsigned char par, bit, bits[8];
-
-	par = 0;
-	for (j = 0; j < 8; j++)
-		bits[j] = 0;
+	unsigned char par = 0, bit, bits[8] = {0};
 
 	/* collect 16 checksum bits */
 	for (i = 0; i < 256; i++) {

diff --git a/drivers/usb/usb-common.c b/drivers/usb/usb-common.c
index 0db0a91..675384d 100644
--- a/drivers/usb/usb-common.c
+++ b/drivers/usb/usb-common.c

@@ -13,7 +13,9 @@
 
 #include <linux/kernel.h>
 #include <linux/module.h>
+#include <linux/of.h>
 #include <linux/usb/ch9.h>
+#include <linux/usb/of.h>
 #include <linux/usb/otg.h>
 
 const char *usb_otg_state_string(enum usb_otg_state state)
@@ -79,4 +81,37 @@
 }
 EXPORT_SYMBOL_GPL(usb_state_string);
 
+#ifdef CONFIG_OF
+static const char *const usb_dr_modes[] = {
+	[USB_DR_MODE_UNKNOWN]		= "",
+	[USB_DR_MODE_HOST]		= "host",
+	[USB_DR_MODE_PERIPHERAL]	= "peripheral",
+	[USB_DR_MODE_OTG]		= "otg",
+};
+
+/**
+ * of_usb_get_dr_mode - Get dual role mode for given device_node
+ * @np:	Pointer to the given device_node
+ *
+ * The function reads the string property 'dr_mode' of @np and returns the
+ * corresponding enum usb_dr_mode, i.e. the index of the matching entry in
+ * usb_dr_modes[].  USB_DR_MODE_UNKNOWN is returned when the property cannot
+ * be read or when its value matches none of the known mode strings.
+ */
+enum usb_dr_mode of_usb_get_dr_mode(struct device_node *np)
+{
+	const char *dr_mode;
+	int err, i;
+
+	err = of_property_read_string(np, "dr_mode", &dr_mode);
+	if (err < 0)
+		return USB_DR_MODE_UNKNOWN;
+
+	/* the table is indexed by enum value, so the index is the result */
+	for (i = 0; i < ARRAY_SIZE(usb_dr_modes); i++)
+		if (!strcmp(dr_mode, usb_dr_modes[i]))
+			return i;
+
+	return USB_DR_MODE_UNKNOWN;
+}
+EXPORT_SYMBOL_GPL(of_usb_get_dr_mode);
+#endif
+
 MODULE_LICENSE("GPL");

diff --git a/drivers/usb/wusbcore/devconnect.c b/drivers/usb/wusbcore/devconnect.c
index 1d36531..33a1278 100644
--- a/drivers/usb/wusbcore/devconnect.c
+++ b/drivers/usb/wusbcore/devconnect.c

@@ -455,8 +455,8 @@
 			dev_err(dev, "KEEPALIVE: device %u timed out\n",
 				wusb_dev->addr);
 			__wusbhc_dev_disconnect(wusbhc, wusb_port);
-		} else if (time_after(jiffies, wusb_dev->entry_ts + tt/2)) {
-			/* Approaching timeout cut out, need to refresh */
+		} else if (time_after(jiffies, wusb_dev->entry_ts + tt/3)) {
+			/* Approaching timeout cut off, need to refresh */
 			ie->bDeviceAddress[keep_alives++] = wusb_dev->addr;
 		}
 	}
@@ -1062,7 +1062,7 @@
 	wusbhc->wuie_host_info = hi;
 
 	queue_delayed_work(wusbd, &wusbhc->keep_alive_timer,
-			   (wusbhc->trust_timeout*CONFIG_HZ)/1000/2);
+			   msecs_to_jiffies(wusbhc->trust_timeout / 2));
 
 	return 0;
 

diff --git a/drivers/usb/wusbcore/mmc.c b/drivers/usb/wusbcore/mmc.c
index b8c7258..b71760c 100644
--- a/drivers/usb/wusbcore/mmc.c
+++ b/drivers/usb/wusbcore/mmc.c

@@ -195,6 +195,7 @@
 	struct device *dev = wusbhc->dev;
 
 	WARN_ON(wusbhc->wuie_host_info != NULL);
+	BUG_ON(wusbhc->uwb_rc == NULL);
 
 	result = wusbhc_rsv_establish(wusbhc);
 	if (result < 0) {
@@ -214,9 +215,9 @@
 		dev_err(dev, "error starting security in the HC: %d\n", result);
 		goto error_sec_start;
 	}
-	/* FIXME: the choice of the DNTS parameters is somewhat
-	 * arbitrary */
-	result = wusbhc->set_num_dnts(wusbhc, 0, 15);
+
+	result = wusbhc->set_num_dnts(wusbhc, wusbhc->dnts_interval,
+		wusbhc->dnts_num_slots);
 	if (result < 0) {
 		dev_err(dev, "Cannot set DNTS parameters: %d\n", result);
 		goto error_set_num_dnts;
@@ -276,12 +277,38 @@
 		}
 		wusbhc->chid = *chid;
 	}
+
+	/* register with UWB if we haven't already since we are about to start
+	    the radio. */
+	if ((chid) && (wusbhc->uwb_rc == NULL)) {
+		wusbhc->uwb_rc = uwb_rc_get_by_grandpa(wusbhc->dev->parent);
+		if (wusbhc->uwb_rc == NULL) {
+			result = -ENODEV;
+			dev_err(wusbhc->dev, "Cannot get associated UWB Host Controller\n");
+			goto error_rc_get;
+		}
+
+		result = wusbhc_pal_register(wusbhc);
+		if (result < 0) {
+			dev_err(wusbhc->dev, "Cannot register as a UWB PAL\n");
+			goto error_pal_register;
+		}
+	}
 	mutex_unlock(&wusbhc->mutex);
 
 	if (chid)
 		result = uwb_radio_start(&wusbhc->pal);
 	else
 		uwb_radio_stop(&wusbhc->pal);
+
+	return result;
+
+error_pal_register:
+	uwb_rc_put(wusbhc->uwb_rc);
+	wusbhc->uwb_rc = NULL;
+error_rc_get:
+	mutex_unlock(&wusbhc->mutex);
+
 	return result;
 }
 EXPORT_SYMBOL_GPL(wusbhc_chid_set);

diff --git a/drivers/usb/wusbcore/pal.c b/drivers/usb/wusbcore/pal.c
index d0b172c..59e100c 100644
--- a/drivers/usb/wusbcore/pal.c
+++ b/drivers/usb/wusbcore/pal.c

@@ -45,10 +45,11 @@
 }
 
 /**
- * wusbhc_pal_register - unregister the WUSB HC as a UWB PAL
+ * wusbhc_pal_unregister - unregister the WUSB HC as a UWB PAL
  * @wusbhc: the WUSB HC
+ *
+ * Nothing is unregistered while wusbhc->uwb_rc is NULL.
  */
 void wusbhc_pal_unregister(struct wusbhc *wusbhc)
 {
-	uwb_pal_unregister(&wusbhc->pal);
+	if (wusbhc->uwb_rc)
+		uwb_pal_unregister(&wusbhc->pal);
 }

diff --git a/drivers/usb/wusbcore/reservation.c b/drivers/usb/wusbcore/reservation.c
index 6f4fafd..ead79f7 100644
--- a/drivers/usb/wusbcore/reservation.c
+++ b/drivers/usb/wusbcore/reservation.c

@@ -80,6 +80,9 @@
 	struct uwb_dev_addr bcid;
 	int ret;
 
+	if (rc == NULL)
+		return -ENODEV;
+
 	rsv = uwb_rsv_create(rc, wusbhc_rsv_complete_cb, wusbhc);
 	if (rsv == NULL)
 		return -ENOMEM;

diff --git a/drivers/usb/wusbcore/rh.c b/drivers/usb/wusbcore/rh.c
index 59ff254..bdb0cc3 100644
--- a/drivers/usb/wusbcore/rh.c
+++ b/drivers/usb/wusbcore/rh.c

@@ -393,26 +393,6 @@
 }
 EXPORT_SYMBOL_GPL(wusbhc_rh_control);
 
-int wusbhc_rh_suspend(struct usb_hcd *usb_hcd)
-{
-	struct wusbhc *wusbhc = usb_hcd_to_wusbhc(usb_hcd);
-	dev_err(wusbhc->dev, "%s (%p [%p]) UNIMPLEMENTED\n", __func__,
-		usb_hcd, wusbhc);
-	/* dump_stack(); */
-	return -ENOSYS;
-}
-EXPORT_SYMBOL_GPL(wusbhc_rh_suspend);
-
-int wusbhc_rh_resume(struct usb_hcd *usb_hcd)
-{
-	struct wusbhc *wusbhc = usb_hcd_to_wusbhc(usb_hcd);
-	dev_err(wusbhc->dev, "%s (%p [%p]) UNIMPLEMENTED\n", __func__,
-		usb_hcd, wusbhc);
-	/* dump_stack(); */
-	return -ENOSYS;
-}
-EXPORT_SYMBOL_GPL(wusbhc_rh_resume);
-
 int wusbhc_rh_start_port_reset(struct usb_hcd *usb_hcd, unsigned port_idx)
 {
 	struct wusbhc *wusbhc = usb_hcd_to_wusbhc(usb_hcd);

diff --git a/drivers/usb/wusbcore/wa-nep.c b/drivers/usb/wusbcore/wa-nep.c
index f67f7f1..ada4e08 100644
--- a/drivers/usb/wusbcore/wa-nep.c
+++ b/drivers/usb/wusbcore/wa-nep.c

@@ -134,9 +134,10 @@
 		case WA_NOTIF_TRANSFER:
 			wa_handle_notif_xfer(wa, notif_hdr);
 			break;
+		case HWA_NOTIF_BPST_ADJ:
+			break; /* no action needed for BPST ADJ. */
 		case DWA_NOTIF_RWAKE:
 		case DWA_NOTIF_PORTSTATUS:
-		case HWA_NOTIF_BPST_ADJ:
 			/* FIXME: unimplemented WA NOTIFs */
 			/* fallthru */
 		default:

diff --git a/drivers/usb/wusbcore/wa-rpipe.c b/drivers/usb/wusbcore/wa-rpipe.c
index f0d546c..9a595c1 100644
--- a/drivers/usb/wusbcore/wa-rpipe.c
+++ b/drivers/usb/wusbcore/wa-rpipe.c

@@ -251,8 +251,8 @@
 static struct usb_wireless_ep_comp_descriptor epc0 = {
 	.bLength = sizeof(epc0),
 	.bDescriptorType = USB_DT_WIRELESS_ENDPOINT_COMP,
-/*	.bMaxBurst = 1, */
-	.bMaxSequence = 31,
+	.bMaxBurst = 1,
+	.bMaxSequence = 2,
 };
 
 /*
@@ -317,6 +317,7 @@
 	struct device *dev = &wa->usb_iface->dev;
 	struct usb_device *usb_dev = urb->dev;
 	struct usb_wireless_ep_comp_descriptor *epcd;
+	u32 ack_window, epcd_max_sequence;
 	u8 unauth;
 
 	epcd = rpipe_epc_find(dev, ep);
@@ -333,8 +334,11 @@
 	rpipe->descr.wBlocks = cpu_to_le16(16);		/* given */
 	/* ep0 maxpktsize is 0x200 (WUSB1.0[4.8.1]) */
 	rpipe->descr.wMaxPacketSize = cpu_to_le16(ep->desc.wMaxPacketSize);
-	rpipe->descr.bHSHubAddress = 0;			/* reserved: zero */
-	rpipe->descr.bHSHubPort = wusb_port_no_to_idx(urb->dev->portnum);
+
+	rpipe->descr.hwa_bMaxBurst = max(min_t(unsigned int,
+				epcd->bMaxBurst, 16U), 1U);
+	rpipe->descr.hwa_bDeviceInfoIndex =
+			wusb_port_no_to_idx(urb->dev->portnum);
 	/* FIXME: use maximum speed as supported or recommended by device */
 	rpipe->descr.bSpeed = usb_pipeendpoint(urb->pipe) == 0 ?
 		UWB_PHY_RATE_53 : UWB_PHY_RATE_200;
@@ -344,26 +348,26 @@
 		le16_to_cpu(rpipe->descr.wRPipeIndex),
 		usb_pipeendpoint(urb->pipe), rpipe->descr.bSpeed);
 
-	/* see security.c:wusb_update_address() */
-	if (unlikely(urb->dev->devnum == 0x80))
-		rpipe->descr.bDeviceAddress = 0;
-	else
-		rpipe->descr.bDeviceAddress = urb->dev->devnum | unauth;
+	rpipe->descr.hwa_reserved = 0;
+
 	rpipe->descr.bEndpointAddress = ep->desc.bEndpointAddress;
 	/* FIXME: bDataSequence */
 	rpipe->descr.bDataSequence = 0;
-	/* FIXME: dwCurrentWindow */
-	rpipe->descr.dwCurrentWindow = cpu_to_le32(1);
-	/* FIXME: bMaxDataSequence */
-	rpipe->descr.bMaxDataSequence = epcd->bMaxSequence - 1;
+
+	/* start with base window of hwa_bMaxBurst bits starting at 0. */
+	ack_window = 0xFFFFFFFF >> (32 - rpipe->descr.hwa_bMaxBurst);
+	rpipe->descr.dwCurrentWindow = cpu_to_le32(ack_window);
+	epcd_max_sequence = max(min_t(unsigned int,
+			epcd->bMaxSequence, 32U), 2U);
+	rpipe->descr.bMaxDataSequence = epcd_max_sequence - 1;
 	rpipe->descr.bInterval = ep->desc.bInterval;
 	/* FIXME: bOverTheAirInterval */
 	rpipe->descr.bOverTheAirInterval = 0;	/* 0 if not isoc */
 	/* FIXME: xmit power & preamble blah blah */
-	rpipe->descr.bmAttribute = ep->desc.bmAttributes & 0x03;
+	rpipe->descr.bmAttribute = (ep->desc.bmAttributes &
+					USB_ENDPOINT_XFERTYPE_MASK);
 	/* rpipe->descr.bmCharacteristics RO */
-	/* FIXME: bmRetryOptions */
-	rpipe->descr.bmRetryOptions = 15;
+	rpipe->descr.bmRetryOptions = (wa->wusb->retry_count & 0xF);
 	/* FIXME: use for assessing link quality? */
 	rpipe->descr.wNumTransactionErrors = 0;
 	result = __rpipe_set_descr(wa, &rpipe->descr,
@@ -387,10 +391,8 @@
 			   const struct usb_host_endpoint *ep,
 			   const struct urb *urb, gfp_t gfp)
 {
-	int result = 0;		/* better code for lack of companion? */
+	int result = 0;
 	struct device *dev = &wa->usb_iface->dev;
-	struct usb_device *usb_dev = urb->dev;
-	u8 unauth = (usb_dev->wusb && !usb_dev->authenticated) ? 0x80 : 0;
 	u8 portnum = wusb_port_no_to_idx(urb->dev->portnum);
 
 #define AIM_CHECK(rdf, val, text)					\
@@ -403,13 +405,10 @@
 			WARN_ON(1);					\
 		}							\
 	} while (0)
-	AIM_CHECK(wMaxPacketSize, cpu_to_le16(ep->desc.wMaxPacketSize),
-		  "(%u vs %u)");
-	AIM_CHECK(bHSHubPort, portnum, "(%u vs %u)");
+	AIM_CHECK(hwa_bDeviceInfoIndex, portnum, "(%u vs %u)");
 	AIM_CHECK(bSpeed, usb_pipeendpoint(urb->pipe) == 0 ?
 			UWB_PHY_RATE_53 : UWB_PHY_RATE_200,
 		  "(%u vs %u)");
-	AIM_CHECK(bDeviceAddress, urb->dev->devnum | unauth, "(%u vs %u)");
 	AIM_CHECK(bEndpointAddress, ep->desc.bEndpointAddress, "(%u vs %u)");
 	AIM_CHECK(bInterval, ep->desc.bInterval, "(%u vs %u)");
 	AIM_CHECK(bmAttribute, ep->desc.bmAttributes & 0x03, "(%u vs %u)");

diff --git a/drivers/usb/wusbcore/wa-xfer.c b/drivers/usb/wusbcore/wa-xfer.c
index 6ef94bc..16968c8 100644
--- a/drivers/usb/wusbcore/wa-xfer.c
+++ b/drivers/usb/wusbcore/wa-xfer.c

@@ -85,6 +85,7 @@
 #include <linux/hash.h>
 #include <linux/ratelimit.h>
 #include <linux/export.h>
+#include <linux/scatterlist.h>
 
 #include "wa-hc.h"
 #include "wusbhc.h"
@@ -442,8 +443,7 @@
 		goto error;
 	}
 	xfer->seg_size = (xfer->seg_size / maxpktsize) * maxpktsize;
-	xfer->segs = (urb->transfer_buffer_length + xfer->seg_size - 1)
-		/ xfer->seg_size;
+	xfer->segs = DIV_ROUND_UP(urb->transfer_buffer_length, xfer->seg_size);
 	if (xfer->segs >= WA_SEGS_MAX) {
 		dev_err(dev, "BUG? ops, number of segments %d bigger than %d\n",
 			(int)(urb->transfer_buffer_length / xfer->seg_size),
@@ -627,6 +627,86 @@
 	}
 }
 
+/*
+ * Allocate an SG list describing the bytes_to_transfer bytes of in_sg that
+ * begin bytes_transferred bytes into it.  The new entries refer to the same
+ * pages as in_sg; no data is copied.
+ *
+ * @in_sg:             scatterlist to take the subset from
+ * @bytes_transferred: number of bytes at the start of in_sg to skip
+ * @bytes_to_transfer: number of bytes the new list should cover
+ * @out_num_sgs:       receives the number of entries filled in; written
+ *                     only when a list is returned
+ *
+ * Returns the new list, allocated with kmalloc(GFP_ATOMIC), or NULL when
+ * the allocation fails or in_sg ends at or before bytes_transferred.
+ */
+static struct scatterlist *wa_xfer_create_subset_sg(struct scatterlist *in_sg,
+	const unsigned int bytes_transferred,
+	const unsigned int bytes_to_transfer, unsigned int *out_num_sgs)
+{
+	struct scatterlist *out_sg;
+	unsigned int bytes_processed = 0, offset_into_current_page_data = 0,
+		nents;
+	struct scatterlist *current_xfer_sg = in_sg;
+	struct scatterlist *current_seg_sg, *last_seg_sg;
+
+	/* skip previously transferred pages. */
+	while ((current_xfer_sg) &&
+			(bytes_processed < bytes_transferred)) {
+		bytes_processed += current_xfer_sg->length;
+
+		/* advance the sg if current segment starts on or past the
+			next page. */
+		if (bytes_processed <= bytes_transferred)
+			current_xfer_sg = sg_next(current_xfer_sg);
+	}
+
+	/* in_sg is empty or ran out before the requested start offset, so
+		there is no entry left to take data (or ->offset) from. */
+	if (!current_xfer_sg)
+		return NULL;
+
+	/* the data for the current segment starts in current_xfer_sg.
+		calculate the offset. */
+	if (bytes_processed > bytes_transferred) {
+		offset_into_current_page_data = current_xfer_sg->length -
+			(bytes_processed - bytes_transferred);
+	}
+
+	/* calculate the number of pages needed by this segment. */
+	nents = DIV_ROUND_UP((bytes_to_transfer +
+		offset_into_current_page_data +
+		current_xfer_sg->offset),
+		PAGE_SIZE);
+
+	/* sg_init_table() and sg_mark_end() below touch at least one entry,
+		so never allocate an empty table (zero-length request that
+		starts at offset 0 of a page). */
+	if (nents == 0)
+		nents = 1;
+
+	out_sg = kmalloc((sizeof(struct scatterlist) * nents), GFP_ATOMIC);
+	if (out_sg) {
+		sg_init_table(out_sg, nents);
+
+		/* copy the portion of the incoming SG that correlates to the
+		 * data to be transferred by this segment to the segment SG. */
+		last_seg_sg = current_seg_sg = out_sg;
+		bytes_processed = 0;
+
+		/* reset nents and calculate the actual number of sg entries
+			needed. */
+		nents = 0;
+		while ((bytes_processed < bytes_to_transfer) &&
+				current_seg_sg && current_xfer_sg) {
+			unsigned int page_len = min((current_xfer_sg->length -
+				offset_into_current_page_data),
+				(bytes_to_transfer - bytes_processed));
+
+			sg_set_page(current_seg_sg, sg_page(current_xfer_sg),
+				page_len,
+				current_xfer_sg->offset +
+				offset_into_current_page_data);
+
+			bytes_processed += page_len;
+
+			last_seg_sg = current_seg_sg;
+			current_seg_sg = sg_next(current_seg_sg);
+			current_xfer_sg = sg_next(current_xfer_sg);
+
+			/* only the first page may require additional offset. */
+			offset_into_current_page_data = 0;
+			nents++;
+		}
+
+		/* update num_sgs and terminate the list since we may have
+		 *  concatenated pages. */
+		sg_mark_end(last_seg_sg);
+		*out_num_sgs = nents;
+	}
+
+	return out_sg;
+}
+
 /*
  * Allocate the segs array and initialize each of them
  *
@@ -663,9 +743,9 @@
 						  dto_epd->bEndpointAddress),
 				  &seg->xfer_hdr, xfer_hdr_size,
 				  wa_seg_cb, seg);
-		buf_itr_size = buf_size > xfer->seg_size ?
-			xfer->seg_size : buf_size;
+		buf_itr_size = min(buf_size, xfer->seg_size);
 		if (xfer->is_inbound == 0 && buf_size > 0) {
+			/* outbound data. */
 			seg->dto_urb = usb_alloc_urb(0, GFP_ATOMIC);
 			if (seg->dto_urb == NULL)
 				goto error_dto_alloc;
@@ -679,9 +759,42 @@
 					xfer->urb->transfer_dma + buf_itr;
 				seg->dto_urb->transfer_flags |=
 					URB_NO_TRANSFER_DMA_MAP;
-			} else
-				seg->dto_urb->transfer_buffer =
-					xfer->urb->transfer_buffer + buf_itr;
+				seg->dto_urb->transfer_buffer = NULL;
+				seg->dto_urb->sg = NULL;
+				seg->dto_urb->num_sgs = 0;
+			} else {
+				/* do buffer or SG processing. */
+				seg->dto_urb->transfer_flags &=
+					~URB_NO_TRANSFER_DMA_MAP;
+				/* this should always be 0 before a resubmit. */
+				seg->dto_urb->num_mapped_sgs = 0;
+
+				if (xfer->urb->transfer_buffer) {
+					seg->dto_urb->transfer_buffer =
+						xfer->urb->transfer_buffer +
+						buf_itr;
+					seg->dto_urb->sg = NULL;
+					seg->dto_urb->num_sgs = 0;
+				} else {
+					/* allocate an SG list to store seg_size
+					    bytes and copy the subset of the
+					    xfer->urb->sg that matches the
+					    buffer subset we are about to read.
+					*/
+					seg->dto_urb->sg =
+						wa_xfer_create_subset_sg(
+						xfer->urb->sg,
+						buf_itr, buf_itr_size,
+						&(seg->dto_urb->num_sgs));
+
+					if (!(seg->dto_urb->sg)) {
+						seg->dto_urb->num_sgs	= 0;
+						goto error_sg_alloc;
+					}
+
+					seg->dto_urb->transfer_buffer = NULL;
+				}
+			}
 			seg->dto_urb->transfer_buffer_length = buf_itr_size;
 		}
 		seg->status = WA_SEG_READY;
@@ -690,6 +803,8 @@
 	}
 	return 0;
 
+error_sg_alloc:
+	kfree(seg->dto_urb);
 error_dto_alloc:
 	kfree(xfer->seg[cnt]);
 	cnt--;
@@ -1026,7 +1141,8 @@
 	unsigned long my_flags;
 	unsigned cant_sleep = irqs_disabled() | in_atomic();
 
-	if (urb->transfer_buffer == NULL
+	if ((urb->transfer_buffer == NULL)
+	    && (urb->sg == NULL)
 	    && !(urb->transfer_flags & URB_NO_TRANSFER_DMA_MAP)
 	    && urb->transfer_buffer_length != 0) {
 		dev_err(dev, "BUG? urb %p: NULL xfer buffer & NODMA\n", urb);
@@ -1261,7 +1377,7 @@
 	seg = xfer->seg[seg_idx];
 	rpipe = xfer->ep->hcpriv;
 	usb_status = xfer_result->bTransferStatus;
-	dev_dbg(dev, "xfer %p#%u: bTransferStatus 0x%02x (seg %u)\n",
+	dev_dbg(dev, "xfer %p#%u: bTransferStatus 0x%02x (seg status %u)\n",
 		xfer, seg_idx, usb_status, seg->status);
 	if (seg->status == WA_SEG_ABORTED
 	    || seg->status == WA_SEG_ERROR)	/* already handled */
@@ -1276,8 +1392,8 @@
 	}
 	if (usb_status & 0x80) {
 		seg->result = wa_xfer_status_to_errno(usb_status);
-		dev_err(dev, "DTI: xfer %p#%u failed (0x%02x)\n",
-			xfer, seg->index, usb_status);
+		dev_err(dev, "DTI: xfer %p#:%08X:%u failed (0x%02x)\n",
+			xfer, xfer->id, seg->index, usb_status);
 		goto error_complete;
 	}
 	/* FIXME: we ignore warnings, tally them for stats */
@@ -1286,18 +1402,47 @@
 	if (xfer->is_inbound) {	/* IN data phase: read to buffer */
 		seg->status = WA_SEG_DTI_PENDING;
 		BUG_ON(wa->buf_in_urb->status == -EINPROGRESS);
+		/* this should always be 0 before a resubmit. */
+		wa->buf_in_urb->num_mapped_sgs	= 0;
+
 		if (xfer->is_dma) {
 			wa->buf_in_urb->transfer_dma =
 				xfer->urb->transfer_dma
-				+ seg_idx * xfer->seg_size;
+				+ (seg_idx * xfer->seg_size);
 			wa->buf_in_urb->transfer_flags
 				|= URB_NO_TRANSFER_DMA_MAP;
+			wa->buf_in_urb->transfer_buffer = NULL;
+			wa->buf_in_urb->sg = NULL;
+			wa->buf_in_urb->num_sgs = 0;
 		} else {
-			wa->buf_in_urb->transfer_buffer =
-				xfer->urb->transfer_buffer
-				+ seg_idx * xfer->seg_size;
+			/* do buffer or SG processing. */
 			wa->buf_in_urb->transfer_flags
 				&= ~URB_NO_TRANSFER_DMA_MAP;
+
+			if (xfer->urb->transfer_buffer) {
+				wa->buf_in_urb->transfer_buffer =
+					xfer->urb->transfer_buffer
+					+ (seg_idx * xfer->seg_size);
+				wa->buf_in_urb->sg = NULL;
+				wa->buf_in_urb->num_sgs = 0;
+			} else {
+				/* allocate an SG list to store seg_size bytes
+					and copy the subset of the xfer->urb->sg
+					that matches the buffer subset we are
+					about to read. */
+				wa->buf_in_urb->sg = wa_xfer_create_subset_sg(
+					xfer->urb->sg,
+					seg_idx * xfer->seg_size,
+					le32_to_cpu(
+						xfer_result->dwTransferLength),
+					&(wa->buf_in_urb->num_sgs));
+
+				if (!(wa->buf_in_urb->sg)) {
+					wa->buf_in_urb->num_sgs	= 0;
+					goto error_sg_alloc;
+				}
+				wa->buf_in_urb->transfer_buffer = NULL;
+			}
 		}
 		wa->buf_in_urb->transfer_buffer_length =
 			le32_to_cpu(xfer_result->dwTransferLength);
@@ -1330,6 +1475,8 @@
 		dev_err(dev, "xfer %p#%u: can't submit DTI data phase: %d\n",
 			xfer, seg_idx, result);
 	seg->result = result;
+	kfree(wa->buf_in_urb->sg);
+error_sg_alloc:
 error_complete:
 	seg->status = WA_SEG_ERROR;
 	xfer->segs_done++;
@@ -1381,6 +1528,10 @@
 	unsigned long flags;
 	u8 done = 0;
 
+	/* free the sg if it was used. */
+	kfree(urb->sg);
+	urb->sg = NULL;
+
 	switch (urb->status) {
 	case 0:
 		spin_lock_irqsave(&xfer->lock, flags);

diff --git a/drivers/usb/wusbcore/wusbhc.c b/drivers/usb/wusbcore/wusbhc.c
index 0faca16..742c607 100644
--- a/drivers/usb/wusbcore/wusbhc.c
+++ b/drivers/usb/wusbcore/wusbhc.c

@@ -75,12 +75,11 @@
 		result = -EINVAL;
 		goto out;
 	}
-	/* FIXME: maybe we should check for range validity? */
-	wusbhc->trust_timeout = trust_timeout;
+	wusbhc->trust_timeout = min_t(unsigned, trust_timeout, 500);
 	cancel_delayed_work(&wusbhc->keep_alive_timer);
 	flush_workqueue(wusbd);
 	queue_delayed_work(wusbd, &wusbhc->keep_alive_timer,
-			   (trust_timeout * CONFIG_HZ)/1000/2);
+			   msecs_to_jiffies(wusbhc->trust_timeout / 2));
 out:
 	return result < 0 ? result : size;
 }
@@ -176,11 +175,72 @@
 }
 static DEVICE_ATTR(wusb_phy_rate, 0644, wusb_phy_rate_show, wusb_phy_rate_store);
 
+/*
+ * sysfs show for wusb_dnts: print the host controller's DNTS slot count and
+ * DNTS interval, one per line, into @buf and return the number of
+ * characters written.
+ */
+static ssize_t wusb_dnts_show(struct device *dev,
+				  struct device_attribute *attr,
+				  char *buf)
+{
+	struct wusbhc *hc = usbhc_dev_to_wusbhc(dev);
+	int slots = hc->dnts_num_slots;
+	int interval = hc->dnts_interval;
+
+	return sprintf(buf, "num slots: %d\ninterval: %dms\n",
+			slots, interval);
+}
+
+/*
+ * sysfs store for wusb_dnts: expects two unsigned 8-bit values in @buf,
+ * the number of slots followed by the interval.  Both are saved in the
+ * host controller and @size is returned; -EINVAL is returned unless
+ * exactly two values were parsed.
+ */
+static ssize_t wusb_dnts_store(struct device *dev,
+				   struct device_attribute *attr,
+				   const char *buf, size_t size)
+{
+	struct wusbhc *hc = usbhc_dev_to_wusbhc(dev);
+	uint8_t slots, interval;
+
+	if (sscanf(buf, "%hhu %hhu", &slots, &interval) != 2)
+		return -EINVAL;
+
+	hc->dnts_num_slots = slots;
+	hc->dnts_interval = interval;
+
+	return size;
+}
+static DEVICE_ATTR(wusb_dnts, 0644, wusb_dnts_show, wusb_dnts_store);
+
+/*
+ * sysfs show for wusb_retry_count: print the host controller's retry count
+ * followed by a newline into @buf and return the number of characters
+ * written.
+ */
+static ssize_t wusb_retry_count_show(struct device *dev,
+				  struct device_attribute *attr,
+				  char *buf)
+{
+	struct wusbhc *hc = usbhc_dev_to_wusbhc(dev);
+	int retries = hc->retry_count;
+
+	return sprintf(buf, "%d\n", retries);
+}
+
+/*
+ * sysfs store for wusb_retry_count: parse one unsigned 8-bit value from
+ * @buf and save it as the host controller's retry count, capped at
+ * WUSB_RETRY_COUNT_MAX.  Returns @size, or -EINVAL when no value could be
+ * parsed.
+ */
+static ssize_t wusb_retry_count_store(struct device *dev,
+				   struct device_attribute *attr,
+				   const char *buf, size_t size)
+{
+	struct wusbhc *wusbhc = usbhc_dev_to_wusbhc(dev);
+	uint8_t retry_count;
+	ssize_t result;
+
+	result = sscanf(buf, "%hhu", &retry_count);
+
+	if (result != 1)
+		return -EINVAL;
+
+	/*
+	 * WUSB_RETRY_COUNT_MAX is an upper bound, so clamp with min_t():
+	 * smaller values, including WUSB_RETRY_COUNT_INFINITE, must remain
+	 * storable.
+	 */
+	wusbhc->retry_count = min_t(uint8_t, retry_count, WUSB_RETRY_COUNT_MAX);
+
+	return size;
+}
+static DEVICE_ATTR(wusb_retry_count, 0644, wusb_retry_count_show,
+	wusb_retry_count_store);
+
 /* Group all the WUSBHC attributes */
 static struct attribute *wusbhc_attrs[] = {
 		&dev_attr_wusb_trust_timeout.attr,
 		&dev_attr_wusb_chid.attr,
 		&dev_attr_wusb_phy_rate.attr,
+		&dev_attr_wusb_dnts.attr,
+		&dev_attr_wusb_retry_count.attr,
 		NULL,
 };
 
@@ -206,8 +266,12 @@
 {
 	int result = 0;
 
+	/* set defaults.  These can be overwritten using sysfs attributes. */
 	wusbhc->trust_timeout = WUSB_TRUST_TIMEOUT_MS;
 	wusbhc->phy_rate = UWB_PHY_RATE_INVALID - 1;
+	wusbhc->dnts_num_slots = 4;
+	wusbhc->dnts_interval = 2;
+	wusbhc->retry_count = WUSB_RETRY_COUNT_INFINITE;
 
 	mutex_init(&wusbhc->mutex);
 	result = wusbhc_mmcie_create(wusbhc);
@@ -261,13 +325,7 @@
 		goto error_create_attr_group;
 	}
 
-	result = wusbhc_pal_register(wusbhc);
-	if (result < 0)
-		goto error_pal_register;
 	return 0;
-
-error_pal_register:
-	sysfs_remove_group(wusbhc_kobj(wusbhc), &wusbhc_attr_group);
 error_create_attr_group:
 	return result;
 }
@@ -393,7 +451,8 @@
  */
 void wusbhc_reset_all(struct wusbhc *wusbhc)
 {
-	uwb_rc_reset_all(wusbhc->uwb_rc);
+	if (wusbhc->uwb_rc)
+		uwb_rc_reset_all(wusbhc->uwb_rc);
 }
 EXPORT_SYMBOL_GPL(wusbhc_reset_all);
 

diff --git a/drivers/usb/wusbcore/wusbhc.h b/drivers/usb/wusbcore/wusbhc.h
index 3a2d091..711b195 100644
--- a/drivers/usb/wusbcore/wusbhc.h
+++ b/drivers/usb/wusbcore/wusbhc.h

@@ -69,6 +69,8 @@
  * zone 0.
  */
 #define WUSB_CHANNEL_STOP_DELAY_MS 8
+#define WUSB_RETRY_COUNT_MAX 15
+#define WUSB_RETRY_COUNT_INFINITE 0
 
 /**
  * Wireless USB device
@@ -252,6 +254,9 @@
 	unsigned trust_timeout;			/* in jiffies */
 	struct wusb_ckhdid chid;
 	uint8_t phy_rate;
+	uint8_t dnts_num_slots;
+	uint8_t dnts_interval;
+	uint8_t retry_count;
 	struct wuie_host_info *wuie_host_info;
 
 	struct mutex mutex;			/* locks everything else */
@@ -399,8 +404,6 @@
 
 extern int wusbhc_rh_status_data(struct usb_hcd *, char *);
 extern int wusbhc_rh_control(struct usb_hcd *, u16, u16, u16, char *, u16);
-extern int wusbhc_rh_suspend(struct usb_hcd *);
-extern int wusbhc_rh_resume(struct usb_hcd *);
 extern int wusbhc_rh_start_port_reset(struct usb_hcd *, unsigned);
 
 /* MMC handling */

diff --git a/drivers/uwb/drp.c b/drivers/uwb/drp.c
index 3fbcf78..16ada834 100644
--- a/drivers/uwb/drp.c
+++ b/drivers/uwb/drp.c

@@ -67,14 +67,14 @@
 	} else
 		dev_err(&rc->uwb_dev.dev, "SET-DRP-IE: timeout\n");
 
-	spin_lock_bh(&rc->rsvs_lock);
+	spin_lock_irq(&rc->rsvs_lock);
 	if (rc->set_drp_ie_pending > 1) {
 		rc->set_drp_ie_pending = 0;
 		uwb_rsv_queue_update(rc);	
 	} else {
 		rc->set_drp_ie_pending = 0;	
 	}
-	spin_unlock_bh(&rc->rsvs_lock);
+	spin_unlock_irq(&rc->rsvs_lock);
 }
 
 /**

diff --git a/drivers/uwb/est.c b/drivers/uwb/est.c
index 86ed7e6..457f31d 100644
--- a/drivers/uwb/est.c
+++ b/drivers/uwb/est.c

@@ -436,7 +436,6 @@
 	unsigned long flags;
 	unsigned itr;
 	u16 type_event_high, event;
-	u8 *ptr = (u8 *) rceb;
 
 	read_lock_irqsave(&uwb_est_lock, flags);
 	size = -ENOSPC;
@@ -453,12 +452,12 @@
 		if (size != -ENOENT)
 			goto out;
 	}
-	dev_dbg(dev, "event 0x%02x/%04x/%02x: no handlers available; "
-		"RCEB %02x %02x %02x %02x\n",
+	dev_dbg(dev,
+		"event 0x%02x/%04x/%02x: no handlers available; RCEB %4ph\n",
 		(unsigned) rceb->bEventType,
 		(unsigned) le16_to_cpu(rceb->wEvent),
 		(unsigned) rceb->bEventContext,
-		ptr[0], ptr[1], ptr[2], ptr[3]);
+		rceb);
 	size = -ENOENT;
 out:
 	read_unlock_irqrestore(&uwb_est_lock, flags);

diff --git a/drivers/uwb/hwa-rc.c b/drivers/uwb/hwa-rc.c
index 810c90a..0621abe 100644
--- a/drivers/uwb/hwa-rc.c
+++ b/drivers/uwb/hwa-rc.c

@@ -900,6 +900,12 @@
 	/* Intel i1480 (using firmware 1.3PA2-20070828) */
 	{ USB_DEVICE_AND_INTERFACE_INFO(0x8086, 0x0c3b, 0xe0, 0x01, 0x02),
 	  .driver_info = WUSB_QUIRK_WHCI_CMD_EVT },
+	/* Alereon 5310 */
+	{ USB_DEVICE_AND_INTERFACE_INFO(0x13dc, 0x5310, 0xe0, 0x01, 0x02),
+	  .driver_info = WUSB_QUIRK_WHCI_CMD_EVT },
+	/* Alereon 5611 */
+	{ USB_DEVICE_AND_INTERFACE_INFO(0x13dc, 0x5611, 0xe0, 0x01, 0x02),
+	  .driver_info = WUSB_QUIRK_WHCI_CMD_EVT },
 	/* Generic match for the Radio Control interface */
 	{ USB_INTERFACE_INFO(0xe0, 0x01, 0x02), },
 	{ },

diff --git a/drivers/uwb/pal.c b/drivers/uwb/pal.c
index 8ee7d90..690577d 100644
--- a/drivers/uwb/pal.c
+++ b/drivers/uwb/pal.c

@@ -44,10 +44,12 @@
 	int ret;
 
 	if (pal->device) {
+		/* create a link to the uwb_rc in the PAL device's directory. */
 		ret = sysfs_create_link(&pal->device->kobj,
 					&rc->uwb_dev.dev.kobj, "uwb_rc");
 		if (ret < 0)
 			return ret;
+		/* create a link to the PAL in the UWB device's directory. */
 		ret = sysfs_create_link(&rc->uwb_dev.dev.kobj,
 					&pal->device->kobj, pal->name);
 		if (ret < 0) {

diff --git a/drivers/uwb/rsv.c b/drivers/uwb/rsv.c
index f4ae05f..738e8a8 100644
--- a/drivers/uwb/rsv.c
+++ b/drivers/uwb/rsv.c

@@ -872,7 +872,7 @@
  */
 void uwb_rsv_sched_update(struct uwb_rc *rc)
 {
-	spin_lock_bh(&rc->rsvs_lock);
+	spin_lock_irq(&rc->rsvs_lock);
 	if (!delayed_work_pending(&rc->rsv_update_work)) {
 		if (rc->set_drp_ie_pending > 0) {
 			rc->set_drp_ie_pending++;
@@ -881,7 +881,7 @@
 		uwb_rsv_queue_update(rc);
 	}
 unlock:
-	spin_unlock_bh(&rc->rsvs_lock);
+	spin_unlock_irq(&rc->rsvs_lock);
 }
 
 /*

diff --git a/drivers/uwb/uwb-internal.h b/drivers/uwb/uwb-internal.h
index a7494bf1..9a103b1 100644
--- a/drivers/uwb/uwb-internal.h
+++ b/drivers/uwb/uwb-internal.h

@@ -55,7 +55,8 @@
 
 static inline void __uwb_rc_put(struct uwb_rc *rc)
 {
-	uwb_dev_put(&rc->uwb_dev);
+	if (rc)
+		uwb_dev_put(&rc->uwb_dev);
 }
 
 extern int uwb_rc_reset(struct uwb_rc *rc);

diff --git a/drivers/uwb/whci.c b/drivers/uwb/whci.c
index f48093e..c9df8ba 100644
--- a/drivers/uwb/whci.c
+++ b/drivers/uwb/whci.c

@@ -253,19 +253,7 @@
 	.remove   = whci_remove,
 };
 
-static int __init whci_init(void)
-{
-	return pci_register_driver(&whci_driver);
-}
-
-static void __exit whci_exit(void)
-{
-	pci_unregister_driver(&whci_driver);
-}
-
-module_init(whci_init);
-module_exit(whci_exit);
-
+module_pci_driver(whci_driver);
 MODULE_DESCRIPTION("WHCI UWB Multi-interface Controller enumerator");
 MODULE_AUTHOR("Cambridge Silicon Radio Ltd.");
 MODULE_LICENSE("GPL");

diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index ac37254..c5179e2 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c

@@ -499,7 +499,6 @@
 	}
 
 	vma->vm_private_data = vdev;
-	vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP;
 	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
 	vma->vm_pgoff = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff;
 

diff --git a/drivers/video/au1100fb.c b/drivers/video/au1100fb.c
index 700cac0..ebeb971 100644
--- a/drivers/video/au1100fb.c
+++ b/drivers/video/au1100fb.c

@@ -385,8 +385,6 @@
 	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
 	pgprot_val(vma->vm_page_prot) |= (6 << 9); //CCA=6
 
-	vma->vm_flags |= VM_IO;
-
 	if (io_remap_pfn_range(vma, vma->vm_start, off >> PAGE_SHIFT,
 				vma->vm_end - vma->vm_start,
 				vma->vm_page_prot)) {

diff --git a/drivers/video/au1200fb.c b/drivers/video/au1200fb.c
index 1b59054..301224e 100644
--- a/drivers/video/au1200fb.c
+++ b/drivers/video/au1200fb.c

@@ -1258,13 +1258,9 @@
 	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
 	pgprot_val(vma->vm_page_prot) |= _CACHE_MASK; /* CCA=7 */
 
-	vma->vm_flags |= VM_IO;
-
 	return io_remap_pfn_range(vma, vma->vm_start, off >> PAGE_SHIFT,
 				  vma->vm_end - vma->vm_start,
 				  vma->vm_page_prot);
-
-	return 0;
 }
 
 static void set_global(u_int cmd, struct au1200_lcd_global_regs_t *pdata)

diff --git a/drivers/video/console/fbcon.c b/drivers/video/console/fbcon.c
index a92783e..d55b337 100644
--- a/drivers/video/console/fbcon.c
+++ b/drivers/video/console/fbcon.c

@@ -556,34 +556,6 @@
 	return err;
 }
 
-static int fbcon_takeover(int show_logo)
-{
-	int err, i;
-
-	if (!num_registered_fb)
-		return -ENODEV;
-
-	if (!show_logo)
-		logo_shown = FBCON_LOGO_DONTSHOW;
-
-	for (i = first_fb_vc; i <= last_fb_vc; i++)
-		con2fb_map[i] = info_idx;
-
-	err = take_over_console(&fb_con, first_fb_vc, last_fb_vc,
-				fbcon_is_default);
-
-	if (err) {
-		for (i = first_fb_vc; i <= last_fb_vc; i++) {
-			con2fb_map[i] = -1;
-		}
-		info_idx = -1;
-	} else {
-		fbcon_has_console_bind = 1;
-	}
-
-	return err;
-}
-
 #ifdef MODULE
 static void fbcon_prepare_logo(struct vc_data *vc, struct fb_info *info,
 			       int cols, int rows, int new_cols, int new_rows)
@@ -901,7 +873,7 @@
 /*
  *  Low Level Operations
  */
-/* NOTE: fbcon cannot be __init: it may be called from take_over_console later */
+/* NOTE: fbcon cannot be __init: it may be called from do_take_over_console later */
 static int var_to_display(struct display *disp,
 			  struct fb_var_screeninfo *var,
 			  struct fb_info *info)
@@ -3543,8 +3515,9 @@
 			}
 		}
 
+		do_fbcon_takeover(0);
 		console_unlock();
-		fbcon_takeover(0);
+
 	}
 }
 
@@ -3648,8 +3621,8 @@
 	fbcon_deinit_device();
 	device_destroy(fb_class, MKDEV(0, 0));
 	fbcon_exit();
+	do_unregister_con_driver(&fb_con);
 	console_unlock();
-	unregister_con_driver(&fb_con);
 }	
 
 module_exit(fb_console_exit);

diff --git a/drivers/video/console/mdacon.c b/drivers/video/console/mdacon.c
index 0b67866..296e945 100644
--- a/drivers/video/console/mdacon.c
+++ b/drivers/video/console/mdacon.c

@@ -585,10 +585,14 @@
 
 int __init mda_console_init(void)
 {
+	int err;
+
 	if (mda_first_vc > mda_last_vc)
 		return 1;
-
-	return take_over_console(&mda_con, mda_first_vc-1, mda_last_vc-1, 0);
+	console_lock();
+	err = do_take_over_console(&mda_con, mda_first_vc-1, mda_last_vc-1, 0);
+	console_unlock();
+	return err;
 }
 
 static void __exit mda_console_exit(void)

diff --git a/drivers/video/console/newport_con.c b/drivers/video/console/newport_con.c
index b05afd0..a6ab929 100644
--- a/drivers/video/console/newport_con.c
+++ b/drivers/video/console/newport_con.c

@@ -297,7 +297,7 @@
 		newport_set_def_font(i, NULL);
 }
 
-/* Can't be __init, take_over_console may call it later */
+/* Can't be __init, do_take_over_console may call it later */
 static const char *newport_startup(void)
 {
 	int i;
@@ -746,6 +746,7 @@
 			 const struct gio_device_id *id)
 {
 	unsigned long newport_addr;
+	int err;
 
 	if (!dev->resource.start)
 		return -EINVAL;
@@ -759,8 +760,10 @@
 
 	npregs = (struct newport_regs *)/* ioremap cannot fail */
 		ioremap(newport_addr, sizeof(struct newport_regs));
-
-	return take_over_console(&newport_con, 0, MAX_NR_CONSOLES - 1, 1);
+	console_lock();
+	err = do_take_over_console(&newport_con, 0, MAX_NR_CONSOLES - 1, 1);
+	console_unlock();
+	return err;
 }
 
 static void newport_remove(struct gio_device *dev)

diff --git a/drivers/video/console/sticon.c b/drivers/video/console/sticon.c
index 491c1c1..5f65ca3 100644
--- a/drivers/video/console/sticon.c
+++ b/drivers/video/console/sticon.c

@@ -372,6 +372,7 @@
 
 static int __init sticonsole_init(void)
 {
+    int err;
     /* already initialized ? */
     if (sticon_sti)
 	 return 0;
@@ -382,7 +383,10 @@
 
     if (conswitchp == &dummy_con) {
 	printk(KERN_INFO "sticon: Initializing STI text console.\n");
-	return take_over_console(&sti_con, 0, MAX_NR_CONSOLES - 1, 1);
+	console_lock();
+	err = do_take_over_console(&sti_con, 0, MAX_NR_CONSOLES - 1, 1);
+	console_unlock();
+	return err;
     }
     return 0;
 }

diff --git a/drivers/video/pxa3xx-gcu.c b/drivers/video/pxa3xx-gcu.c
index 97563c5..7cf0b13 100644
--- a/drivers/video/pxa3xx-gcu.c
+++ b/drivers/video/pxa3xx-gcu.c

@@ -494,7 +494,6 @@
 		if (size != resource_size(priv->resource_mem))
 			return -EINVAL;
 
-		vma->vm_flags |= VM_IO;
 		vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
 
 		return io_remap_pfn_range(vma, vma->vm_start,

diff --git a/drivers/vme/bridges/vme_tsi148.c b/drivers/vme/bridges/vme_tsi148.c
index 9c1aa4d..94c892f 100644
--- a/drivers/vme/bridges/vme_tsi148.c
+++ b/drivers/vme/bridges/vme_tsi148.c

@@ -169,7 +169,7 @@
 	unsigned int error_addr_high, error_addr_low;
 	unsigned long long error_addr;
 	u32 error_attrib;
-	struct vme_bus_error *error;
+	struct vme_bus_error *error = NULL;
 	struct tsi148_driver *bridge;
 
 	bridge = tsi148_bridge->driver_priv;
@@ -186,16 +186,22 @@
 			"Occurred\n");
 	}
 
-	error = kmalloc(sizeof(struct vme_bus_error), GFP_ATOMIC);
-	if (error) {
-		error->address = error_addr;
-		error->attributes = error_attrib;
-		list_add_tail(&error->list, &tsi148_bridge->vme_errors);
-	} else {
-		dev_err(tsi148_bridge->parent, "Unable to alloc memory for "
-			"VMEbus Error reporting\n");
-		dev_err(tsi148_bridge->parent, "VME Bus Error at address: "
-			"0x%llx, attributes: %08x\n", error_addr, error_attrib);
+	if (err_chk) {
+		error = kmalloc(sizeof(struct vme_bus_error), GFP_ATOMIC);
+		if (error) {
+			error->address = error_addr;
+			error->attributes = error_attrib;
+			list_add_tail(&error->list, &tsi148_bridge->vme_errors);
+		} else {
+			dev_err(tsi148_bridge->parent,
+				"Unable to alloc memory for VMEbus Error reporting\n");
+		}
+	}
+
+	if (!error) {
+		dev_err(tsi148_bridge->parent,
+			"VME Bus Error at address: 0x%llx, attributes: %08x\n",
+			error_addr, error_attrib);
 	}
 
 	/* Clear Status */
@@ -2294,12 +2300,13 @@
 	dev_info(tsi148_bridge->parent, "CR/CSR Offset: %d\n", cbar);
 
 	crat = ioread32be(bridge->base + TSI148_LCSR_CRAT);
-	if (crat & TSI148_LCSR_CRAT_EN) {
+	if (crat & TSI148_LCSR_CRAT_EN)
+		dev_info(tsi148_bridge->parent, "CR/CSR already enabled\n");
+	else {
 		dev_info(tsi148_bridge->parent, "Enabling CR/CSR space\n");
 		iowrite32be(crat | TSI148_LCSR_CRAT_EN,
 			bridge->base + TSI148_LCSR_CRAT);
-	} else
-		dev_info(tsi148_bridge->parent, "CR/CSR already enabled\n");
+	}
 
 	/* If we want flushed, error-checked writes, set up a window
 	 * over the CR/CSR registers. We read from here to safely flush
@@ -2441,13 +2448,6 @@
 		spin_lock_init(&tsi148_device->flush_image->lock);
 		tsi148_device->flush_image->locked = 1;
 		tsi148_device->flush_image->number = master_num;
-		tsi148_device->flush_image->address_attr = VME_A16 | VME_A24 |
-			VME_A32 | VME_A64;
-		tsi148_device->flush_image->cycle_attr = VME_SCT | VME_BLT |
-			VME_MBLT | VME_2eVME | VME_2eSST | VME_2eSSTB |
-			VME_2eSST160 | VME_2eSST267 | VME_2eSST320 | VME_SUPER |
-			VME_USER | VME_PROG | VME_DATA;
-		tsi148_device->flush_image->width_attr = VME_D16 | VME_D32;
 		memset(&tsi148_device->flush_image->bus_resource, 0,
 			sizeof(struct resource));
 		tsi148_device->flush_image->kern_base  = NULL;
@@ -2582,7 +2582,8 @@
 	dev_info(&pdev->dev, "VME Write and flush and error check is %s\n",
 		err_chk ? "enabled" : "disabled");
 
-	if (tsi148_crcsr_init(tsi148_bridge, pdev)) {
+	retval = tsi148_crcsr_init(tsi148_bridge, pdev);
+	if (retval) {
 		dev_err(&pdev->dev, "CR/CSR configuration failed.\n");
 		goto err_crcsr;
 	}

diff --git a/drivers/vme/vme.c b/drivers/vme/vme.c
index 5e6c7d7..f6856b42 100644
--- a/drivers/vme/vme.c
+++ b/drivers/vme/vme.c

@@ -959,6 +959,8 @@
 
 	mutex_unlock(&ctrlr->mtx);
 
+	kfree(resource);
+
 	return 0;
 }
 EXPORT_SYMBOL(vme_dma_free);

diff --git a/drivers/w1/masters/w1-gpio.c b/drivers/w1/masters/w1-gpio.c
index 46d9701..f54ece2 100644
--- a/drivers/w1/masters/w1-gpio.c
+++ b/drivers/w1/masters/w1-gpio.c

@@ -16,7 +16,6 @@
 #include <linux/gpio.h>
 #include <linux/of_platform.h>
 #include <linux/of_gpio.h>
-#include <linux/pinctrl/consumer.h>
 #include <linux/err.h>
 #include <linux/of.h>
 
@@ -78,13 +77,8 @@
 {
 	struct w1_bus_master *master;
 	struct w1_gpio_platform_data *pdata;
-	struct pinctrl *pinctrl;
 	int err;
 
-	pinctrl = devm_pinctrl_get_select_default(&pdev->dev);
-	if (IS_ERR(pinctrl))
-		dev_warn(&pdev->dev, "unable to select pin group\n");
-
 	if (of_have_populated_dt()) {
 		err = w1_gpio_probe_dt(pdev);
 		if (err < 0) {

diff --git a/drivers/w1/slaves/w1_ds2408.c b/drivers/w1/slaves/w1_ds2408.c
index e45eca1..91cc2cd 100644
--- a/drivers/w1/slaves/w1_ds2408.c
+++ b/drivers/w1/slaves/w1_ds2408.c

@@ -22,6 +22,7 @@
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Jean-Francois Dagenais <dagenaisj@sonatest.com>");
 MODULE_DESCRIPTION("w1 family 29 driver for DS2408 8 Pin IO");
+MODULE_ALIAS("w1-family-" __stringify(W1_FAMILY_DS2408));
 
 
 #define W1_F29_RETRIES		3

diff --git a/drivers/w1/slaves/w1_ds2413.c b/drivers/w1/slaves/w1_ds2413.c
index 8297862..8593777 100644
--- a/drivers/w1/slaves/w1_ds2413.c
+++ b/drivers/w1/slaves/w1_ds2413.c

@@ -23,6 +23,7 @@
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Mariusz Bialonczyk <manio@skyboo.net>");
 MODULE_DESCRIPTION("w1 family 3a driver for DS2413 2 Pin IO");
+MODULE_ALIAS("w1-family-" __stringify(W1_FAMILY_DS2413));
 
 #define W1_F3A_RETRIES                     3
 #define W1_F3A_FUNC_PIO_ACCESS_READ        0xF5

diff --git a/drivers/w1/slaves/w1_ds2423.c b/drivers/w1/slaves/w1_ds2423.c
index 40a10b5..7f86aec 100644
--- a/drivers/w1/slaves/w1_ds2423.c
+++ b/drivers/w1/slaves/w1_ds2423.c

@@ -164,3 +164,4 @@
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Mika Laitio <lamikr@pilppa.org>");
 MODULE_DESCRIPTION("w1 family 1d driver for DS2423, 4 counters and 4kb ram");
+MODULE_ALIAS("w1-family-" __stringify(W1_COUNTER_DS2423));

diff --git a/drivers/w1/slaves/w1_ds2431.c b/drivers/w1/slaves/w1_ds2431.c
index 984b303..cef8605 100644
--- a/drivers/w1/slaves/w1_ds2431.c
+++ b/drivers/w1/slaves/w1_ds2431.c

@@ -310,3 +310,4 @@
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Bernhard Weirich <bernhard.weirich@riedel.net>");
 MODULE_DESCRIPTION("w1 family 2d driver for DS2431, 1kb EEPROM");
+MODULE_ALIAS("w1-family-" __stringify(W1_EEPROM_DS2431));

diff --git a/drivers/w1/slaves/w1_ds2433.c b/drivers/w1/slaves/w1_ds2433.c
index 85f2cdb..10cc1b6 100644
--- a/drivers/w1/slaves/w1_ds2433.c
+++ b/drivers/w1/slaves/w1_ds2433.c

@@ -29,6 +29,7 @@
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Ben Gardner <bgardner@wabtec.com>");
 MODULE_DESCRIPTION("w1 family 23 driver for DS2433, 4kb EEPROM");
+MODULE_ALIAS("w1-family-" __stringify(W1_EEPROM_DS2433));
 
 #define W1_EEPROM_SIZE		512
 #define W1_PAGE_COUNT		16

diff --git a/drivers/w1/slaves/w1_ds2760.c b/drivers/w1/slaves/w1_ds2760.c
index e86a69d..93719d25 100644
--- a/drivers/w1/slaves/w1_ds2760.c
+++ b/drivers/w1/slaves/w1_ds2760.c

@@ -203,3 +203,4 @@
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Szabolcs Gyurko <szabolcs.gyurko@tlt.hu>");
 MODULE_DESCRIPTION("1-wire Driver Dallas 2760 battery monitor chip");
+MODULE_ALIAS("w1-family-" __stringify(W1_FAMILY_DS2760));

diff --git a/drivers/w1/slaves/w1_ds2780.c b/drivers/w1/slaves/w1_ds2780.c
index 98ed9c4..0cd7a27 100644
--- a/drivers/w1/slaves/w1_ds2780.c
+++ b/drivers/w1/slaves/w1_ds2780.c

@@ -188,3 +188,4 @@
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Clifton Barnes <cabarnes@indesign-llc.com>");
 MODULE_DESCRIPTION("1-wire Driver for Maxim/Dallas DS2780 Stand-Alone Fuel Gauge IC");
+MODULE_ALIAS("w1-family-" __stringify(W1_FAMILY_DS2780));

diff --git a/drivers/w1/slaves/w1_ds2781.c b/drivers/w1/slaves/w1_ds2781.c
index 5140d7b..1aba8e4 100644
--- a/drivers/w1/slaves/w1_ds2781.c
+++ b/drivers/w1/slaves/w1_ds2781.c

@@ -186,3 +186,4 @@
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Renata Sayakhova <renata@oktetlabs.ru>");
 MODULE_DESCRIPTION("1-wire Driver for Maxim/Dallas DS2781 Stand-Alone Fuel Gauge IC");
+MODULE_ALIAS("w1-family-" __stringify(W1_FAMILY_DS2781));

diff --git a/drivers/w1/slaves/w1_ds28e04.c b/drivers/w1/slaves/w1_ds28e04.c
index 98117db..cd30a6d 100644
--- a/drivers/w1/slaves/w1_ds28e04.c
+++ b/drivers/w1/slaves/w1_ds28e04.c

@@ -27,6 +27,7 @@
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Markus Franke <franke.m@sebakmt.com>, <franm@hrz.tu-chemnitz.de>");
 MODULE_DESCRIPTION("w1 family 1C driver for DS28E04, 4kb EEPROM and PIO");
+MODULE_ALIAS("w1-family-" __stringify(W1_FAMILY_DS28E04));
 
 /* Allow the strong pullup to be disabled, but default to enabled.
  * If it was disabled a parasite powered device might not get the required

diff --git a/drivers/w1/slaves/w1_smem.c b/drivers/w1/slaves/w1_smem.c
index 8465562..ed4c875 100644
--- a/drivers/w1/slaves/w1_smem.c
+++ b/drivers/w1/slaves/w1_smem.c

@@ -34,6 +34,8 @@
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Evgeniy Polyakov <zbr@ioremap.net>");
 MODULE_DESCRIPTION("Driver for 1-wire Dallas network protocol, 64bit memory family.");
+MODULE_ALIAS("w1-family-" __stringify(W1_FAMILY_SMEM_01));
+MODULE_ALIAS("w1-family-" __stringify(W1_FAMILY_SMEM_81));
 
 static struct w1_family w1_smem_family_01 = {
 	.fid = W1_FAMILY_SMEM_01,

diff --git a/drivers/w1/slaves/w1_therm.c b/drivers/w1/slaves/w1_therm.c
index c1a702f8..8978360 100644
--- a/drivers/w1/slaves/w1_therm.c
+++ b/drivers/w1/slaves/w1_therm.c

@@ -36,6 +36,11 @@
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Evgeniy Polyakov <zbr@ioremap.net>");
 MODULE_DESCRIPTION("Driver for 1-wire Dallas network protocol, temperature family.");
+MODULE_ALIAS("w1-family-" __stringify(W1_THERM_DS18S20));
+MODULE_ALIAS("w1-family-" __stringify(W1_THERM_DS1822));
+MODULE_ALIAS("w1-family-" __stringify(W1_THERM_DS18B20));
+MODULE_ALIAS("w1-family-" __stringify(W1_THERM_DS1825));
+MODULE_ALIAS("w1-family-" __stringify(W1_THERM_DS28EA00));
 
 /* Allow the strong pullup to be disabled, but default to enabled.
  * If it was disabled a parasite powered device might not get the require

diff --git a/drivers/w1/w1.c b/drivers/w1/w1.c
index 7ce277d2b..0459df8 100644
--- a/drivers/w1/w1.c
+++ b/drivers/w1/w1.c

@@ -680,6 +680,8 @@
 	atomic_set(&sl->refcnt, 0);
 	init_completion(&sl->released);
 
+	request_module("w1-family-0x%0x", rn->family);
+
 	spin_lock(&w1_flock);
 	f = w1_family_registered(rn->family);
 	if (!f) {

diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 055562c..9ff073f 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c

@@ -148,13 +148,14 @@
  * @offset: offset in the page
  */
 
-static void v9fs_invalidate_page(struct page *page, unsigned long offset)
+static void v9fs_invalidate_page(struct page *page, unsigned int offset,
+				 unsigned int length)
 {
 	/*
 	 * If called with zero offset, we should release
 	 * the private state assocated with the page
 	 */
-	if (offset == 0)
+	if (offset == 0 && length == PAGE_CACHE_SIZE)
 		v9fs_fscache_invalidate_page(page);
 }
 

diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index be1e34a..4d0c2e0 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c

@@ -101,16 +101,15 @@
 }
 
 /**
- * v9fs_dir_readdir - read a directory
- * @filp: opened file structure
- * @dirent: directory structure ???
- * @filldir: function to populate directory structure ???
+ * v9fs_dir_readdir - iterate through a directory
+ * @file: opened file structure
+ * @ctx: actor we feed the entries to
  *
  */
 
-static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int v9fs_dir_readdir(struct file *file, struct dir_context *ctx)
 {
-	int over;
+	bool over;
 	struct p9_wstat st;
 	int err = 0;
 	struct p9_fid *fid;
@@ -118,19 +117,19 @@
 	int reclen = 0;
 	struct p9_rdir *rdir;
 
-	p9_debug(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name);
-	fid = filp->private_data;
+	p9_debug(P9_DEBUG_VFS, "name %s\n", file->f_path.dentry->d_name.name);
+	fid = file->private_data;
 
 	buflen = fid->clnt->msize - P9_IOHDRSZ;
 
-	rdir = v9fs_alloc_rdir_buf(filp, buflen);
+	rdir = v9fs_alloc_rdir_buf(file, buflen);
 	if (!rdir)
 		return -ENOMEM;
 
 	while (1) {
 		if (rdir->tail == rdir->head) {
-			err = v9fs_file_readn(filp, rdir->buf, NULL,
-							buflen, filp->f_pos);
+			err = v9fs_file_readn(file, rdir->buf, NULL,
+							buflen, ctx->pos);
 			if (err <= 0)
 				return err;
 
@@ -148,51 +147,45 @@
 			}
 			reclen = st.size+2;
 
-			over = filldir(dirent, st.name, strlen(st.name),
-			    filp->f_pos, v9fs_qid2ino(&st.qid), dt_type(&st));
-
+			over = !dir_emit(ctx, st.name, strlen(st.name),
+					 v9fs_qid2ino(&st.qid), dt_type(&st));
 			p9stat_free(&st);
-
 			if (over)
 				return 0;
 
 			rdir->head += reclen;
-			filp->f_pos += reclen;
+			ctx->pos += reclen;
 		}
 	}
 }
 
 /**
- * v9fs_dir_readdir_dotl - read a directory
- * @filp: opened file structure
- * @dirent: buffer to fill dirent structures
- * @filldir: function to populate dirent structures
+ * v9fs_dir_readdir_dotl - iterate through a directory
+ * @file: opened file structure
+ * @ctx: actor we feed the entries to
  *
  */
-static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent,
-						filldir_t filldir)
+static int v9fs_dir_readdir_dotl(struct file *file, struct dir_context *ctx)
 {
-	int over;
 	int err = 0;
 	struct p9_fid *fid;
 	int buflen;
 	struct p9_rdir *rdir;
 	struct p9_dirent curdirent;
-	u64 oldoffset = 0;
 
-	p9_debug(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name);
-	fid = filp->private_data;
+	p9_debug(P9_DEBUG_VFS, "name %s\n", file->f_path.dentry->d_name.name);
+	fid = file->private_data;
 
 	buflen = fid->clnt->msize - P9_READDIRHDRSZ;
 
-	rdir = v9fs_alloc_rdir_buf(filp, buflen);
+	rdir = v9fs_alloc_rdir_buf(file, buflen);
 	if (!rdir)
 		return -ENOMEM;
 
 	while (1) {
 		if (rdir->tail == rdir->head) {
 			err = p9_client_readdir(fid, rdir->buf, buflen,
-						filp->f_pos);
+						ctx->pos);
 			if (err <= 0)
 				return err;
 
@@ -210,22 +203,13 @@
 				return -EIO;
 			}
 
-			/* d_off in dirent structure tracks the offset into
-			 * the next dirent in the dir. However, filldir()
-			 * expects offset into the current dirent. Hence
-			 * while calling filldir send the offset from the
-			 * previous dirent structure.
-			 */
-			over = filldir(dirent, curdirent.d_name,
-					strlen(curdirent.d_name),
-					oldoffset, v9fs_qid2ino(&curdirent.qid),
-					curdirent.d_type);
-			oldoffset = curdirent.d_off;
-
-			if (over)
+			if (!dir_emit(ctx, curdirent.d_name,
+				      strlen(curdirent.d_name),
+				      v9fs_qid2ino(&curdirent.qid),
+				      curdirent.d_type))
 				return 0;
 
-			filp->f_pos = curdirent.d_off;
+			ctx->pos = curdirent.d_off;
 			rdir->head += err;
 		}
 	}
@@ -254,7 +238,7 @@
 const struct file_operations v9fs_dir_operations = {
 	.read = generic_read_dir,
 	.llseek = generic_file_llseek,
-	.readdir = v9fs_dir_readdir,
+	.iterate = v9fs_dir_readdir,
 	.open = v9fs_file_open,
 	.release = v9fs_dir_release,
 };
@@ -262,7 +246,7 @@
 const struct file_operations v9fs_dir_operations_dotl = {
 	.read = generic_read_dir,
 	.llseek = generic_file_llseek,
-	.readdir = v9fs_dir_readdir_dotl,
+	.iterate = v9fs_dir_readdir_dotl,
 	.open = v9fs_file_open,
 	.release = v9fs_dir_release,
         .fsync = v9fs_file_fsync_dotl,

diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index 9cf874c..ade28bb 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c

@@ -17,47 +17,43 @@
 static DEFINE_RWLOCK(adfs_dir_lock);
 
 static int
-adfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+adfs_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct inode *inode = file_inode(filp);
+	struct inode *inode = file_inode(file);
 	struct super_block *sb = inode->i_sb;
 	struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir;
 	struct object_info obj;
 	struct adfs_dir dir;
 	int ret = 0;
 
-	if (filp->f_pos >> 32)
-		goto out;
+	if (ctx->pos >> 32)
+		return 0;
 
 	ret = ops->read(sb, inode->i_ino, inode->i_size, &dir);
 	if (ret)
-		goto out;
+		return ret;
 
-	switch ((unsigned long)filp->f_pos) {
-	case 0:
-		if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0)
+	if (ctx->pos == 0) {
+		if (!dir_emit_dot(file, ctx))
 			goto free_out;
-		filp->f_pos += 1;
-
-	case 1:
-		if (filldir(dirent, "..", 2, 1, dir.parent_id, DT_DIR) < 0)
+		ctx->pos = 1;
+	}
+	if (ctx->pos == 1) {
+		if (!dir_emit(ctx, "..", 2, dir.parent_id, DT_DIR))
 			goto free_out;
-		filp->f_pos += 1;
-
-	default:
-		break;
+		ctx->pos = 2;
 	}
 
 	read_lock(&adfs_dir_lock);
 
-	ret = ops->setpos(&dir, filp->f_pos - 2);
+	ret = ops->setpos(&dir, ctx->pos - 2);
 	if (ret)
 		goto unlock_out;
 	while (ops->getnext(&dir, &obj) == 0) {
-		if (filldir(dirent, obj.name, obj.name_len,
-			    filp->f_pos, obj.file_id, DT_UNKNOWN) < 0)
-			goto unlock_out;
-		filp->f_pos += 1;
+		if (!dir_emit(ctx, obj.name, obj.name_len,
+			    obj.file_id, DT_UNKNOWN))
+			break;
+		ctx->pos++;
 	}
 
 unlock_out:
@@ -65,8 +61,6 @@
 
 free_out:
 	ops->free(&dir);
-
-out:
 	return ret;
 }
 
@@ -192,7 +186,7 @@
 const struct file_operations adfs_dir_operations = {
 	.read		= generic_read_dir,
 	.llseek		= generic_file_llseek,
-	.readdir	= adfs_readdir,
+	.iterate	= adfs_readdir,
 	.fsync		= generic_file_fsync,
 };
 

diff --git a/fs/affs/dir.c b/fs/affs/dir.c
index fd11a6d..f1eba8c 100644
--- a/fs/affs/dir.c
+++ b/fs/affs/dir.c

@@ -15,12 +15,12 @@
 
 #include "affs.h"
 
-static int affs_readdir(struct file *, void *, filldir_t);
+static int affs_readdir(struct file *, struct dir_context *);
 
 const struct file_operations affs_dir_operations = {
 	.read		= generic_read_dir,
 	.llseek		= generic_file_llseek,
-	.readdir	= affs_readdir,
+	.iterate	= affs_readdir,
 	.fsync		= affs_file_fsync,
 };
 
@@ -40,52 +40,35 @@
 };
 
 static int
-affs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+affs_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct inode		*inode = file_inode(filp);
+	struct inode		*inode = file_inode(file);
 	struct super_block	*sb = inode->i_sb;
-	struct buffer_head	*dir_bh;
-	struct buffer_head	*fh_bh;
+	struct buffer_head	*dir_bh = NULL;
+	struct buffer_head	*fh_bh = NULL;
 	unsigned char		*name;
 	int			 namelen;
 	u32			 i;
 	int			 hash_pos;
 	int			 chain_pos;
-	u32			 f_pos;
 	u32			 ino;
-	int			 stored;
-	int			 res;
 
-	pr_debug("AFFS: readdir(ino=%lu,f_pos=%lx)\n",inode->i_ino,(unsigned long)filp->f_pos);
+	pr_debug("AFFS: readdir(ino=%lu,f_pos=%lx)\n",inode->i_ino,(unsigned long)ctx->pos);
 
-	stored = 0;
-	res    = -EIO;
-	dir_bh = NULL;
-	fh_bh  = NULL;
-	f_pos  = filp->f_pos;
-
-	if (f_pos == 0) {
-		filp->private_data = (void *)0;
-		if (filldir(dirent, ".", 1, f_pos, inode->i_ino, DT_DIR) < 0)
+	if (ctx->pos < 2) {
+		file->private_data = (void *)0;
+		if (!dir_emit_dots(file, ctx))
 			return 0;
-		filp->f_pos = f_pos = 1;
-		stored++;
-	}
-	if (f_pos == 1) {
-		if (filldir(dirent, "..", 2, f_pos, parent_ino(filp->f_path.dentry), DT_DIR) < 0)
-			return stored;
-		filp->f_pos = f_pos = 2;
-		stored++;
 	}
 
 	affs_lock_dir(inode);
-	chain_pos = (f_pos - 2) & 0xffff;
-	hash_pos  = (f_pos - 2) >> 16;
+	chain_pos = (ctx->pos - 2) & 0xffff;
+	hash_pos  = (ctx->pos - 2) >> 16;
 	if (chain_pos == 0xffff) {
 		affs_warning(sb, "readdir", "More than 65535 entries in chain");
 		chain_pos = 0;
 		hash_pos++;
-		filp->f_pos = ((hash_pos << 16) | chain_pos) + 2;
+		ctx->pos = ((hash_pos << 16) | chain_pos) + 2;
 	}
 	dir_bh = affs_bread(sb, inode->i_ino);
 	if (!dir_bh)
@@ -94,8 +77,8 @@
 	/* If the directory hasn't changed since the last call to readdir(),
 	 * we can jump directly to where we left off.
 	 */
-	ino = (u32)(long)filp->private_data;
-	if (ino && filp->f_version == inode->i_version) {
+	ino = (u32)(long)file->private_data;
+	if (ino && file->f_version == inode->i_version) {
 		pr_debug("AFFS: readdir() left off=%d\n", ino);
 		goto inside;
 	}
@@ -105,7 +88,7 @@
 		fh_bh = affs_bread(sb, ino);
 		if (!fh_bh) {
 			affs_error(sb, "readdir","Cannot read block %d", i);
-			goto readdir_out;
+			return -EIO;
 		}
 		ino = be32_to_cpu(AFFS_TAIL(sb, fh_bh)->hash_chain);
 		affs_brelse(fh_bh);
@@ -119,38 +102,34 @@
 		ino = be32_to_cpu(AFFS_HEAD(dir_bh)->table[hash_pos]);
 		if (!ino)
 			continue;
-		f_pos = (hash_pos << 16) + 2;
+		ctx->pos = (hash_pos << 16) + 2;
 inside:
 		do {
 			fh_bh = affs_bread(sb, ino);
 			if (!fh_bh) {
 				affs_error(sb, "readdir","Cannot read block %d", ino);
-				goto readdir_done;
+				break;
 			}
 
 			namelen = min(AFFS_TAIL(sb, fh_bh)->name[0], (u8)30);
 			name = AFFS_TAIL(sb, fh_bh)->name + 1;
 			pr_debug("AFFS: readdir(): filldir(\"%.*s\", ino=%u), hash=%d, f_pos=%x\n",
-				 namelen, name, ino, hash_pos, f_pos);
-			if (filldir(dirent, name, namelen, f_pos, ino, DT_UNKNOWN) < 0)
+				 namelen, name, ino, hash_pos, (u32)ctx->pos);
+			if (!dir_emit(ctx, name, namelen, ino, DT_UNKNOWN))
 				goto readdir_done;
-			stored++;
-			f_pos++;
+			ctx->pos++;
 			ino = be32_to_cpu(AFFS_TAIL(sb, fh_bh)->hash_chain);
 			affs_brelse(fh_bh);
 			fh_bh = NULL;
 		} while (ino);
 	}
 readdir_done:
-	filp->f_pos = f_pos;
-	filp->f_version = inode->i_version;
-	filp->private_data = (void *)(long)ino;
-	res = stored;
+	file->f_version = inode->i_version;
+	file->private_data = (void *)(long)ino;
 
 readdir_out:
 	affs_brelse(dir_bh);
 	affs_brelse(fh_bh);
 	affs_unlock_dir(inode);
-	pr_debug("AFFS: readdir()=%d\n", stored);
-	return res;
+	return 0;
 }

diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 7a465ed..34494fb 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c

@@ -22,7 +22,7 @@
 static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
 				 unsigned int flags);
 static int afs_dir_open(struct inode *inode, struct file *file);
-static int afs_readdir(struct file *file, void *dirent, filldir_t filldir);
+static int afs_readdir(struct file *file, struct dir_context *ctx);
 static int afs_d_revalidate(struct dentry *dentry, unsigned int flags);
 static int afs_d_delete(const struct dentry *dentry);
 static void afs_d_release(struct dentry *dentry);
@@ -43,7 +43,7 @@
 const struct file_operations afs_dir_file_operations = {
 	.open		= afs_dir_open,
 	.release	= afs_release,
-	.readdir	= afs_readdir,
+	.iterate	= afs_readdir,
 	.lock		= afs_lock,
 	.llseek		= generic_file_llseek,
 };
@@ -119,9 +119,9 @@
 };
 
 struct afs_lookup_cookie {
+	struct dir_context ctx;
 	struct afs_fid	fid;
-	const char	*name;
-	size_t		nlen;
+	struct qstr name;
 	int		found;
 };
 
@@ -228,20 +228,18 @@
 /*
  * deal with one block in an AFS directory
  */
-static int afs_dir_iterate_block(unsigned *fpos,
+static int afs_dir_iterate_block(struct dir_context *ctx,
 				 union afs_dir_block *block,
-				 unsigned blkoff,
-				 void *cookie,
-				 filldir_t filldir)
+				 unsigned blkoff)
 {
 	union afs_dirent *dire;
 	unsigned offset, next, curr;
 	size_t nlen;
-	int tmp, ret;
+	int tmp;
 
-	_enter("%u,%x,%p,,",*fpos,blkoff,block);
+	_enter("%u,%x,%p,,",(unsigned)ctx->pos,blkoff,block);
 
-	curr = (*fpos - blkoff) / sizeof(union afs_dirent);
+	curr = (ctx->pos - blkoff) / sizeof(union afs_dirent);
 
 	/* walk through the block, an entry at a time */
 	for (offset = AFS_DIRENT_PER_BLOCK - block->pagehdr.nentries;
@@ -256,7 +254,7 @@
 			_debug("ENT[%Zu.%u]: unused",
 			       blkoff / sizeof(union afs_dir_block), offset);
 			if (offset >= curr)
-				*fpos = blkoff +
+				ctx->pos = blkoff +
 					next * sizeof(union afs_dirent);
 			continue;
 		}
@@ -302,19 +300,15 @@
 			continue;
 
 		/* found the next entry */
-		ret = filldir(cookie,
-			      dire->u.name,
-			      nlen,
-			      blkoff + offset * sizeof(union afs_dirent),
+		if (!dir_emit(ctx, dire->u.name, nlen,
 			      ntohl(dire->u.vnode),
-			      filldir == afs_lookup_filldir ?
-			      ntohl(dire->u.unique) : DT_UNKNOWN);
-		if (ret < 0) {
+			      ctx->actor == afs_lookup_filldir ?
+			      ntohl(dire->u.unique) : DT_UNKNOWN)) {
 			_leave(" = 0 [full]");
 			return 0;
 		}
 
-		*fpos = blkoff + next * sizeof(union afs_dirent);
+		ctx->pos = blkoff + next * sizeof(union afs_dirent);
 	}
 
 	_leave(" = 1 [more]");
@@ -324,8 +318,8 @@
 /*
  * iterate through the data blob that lists the contents of an AFS directory
  */
-static int afs_dir_iterate(struct inode *dir, unsigned *fpos, void *cookie,
-			   filldir_t filldir, struct key *key)
+static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx,
+			   struct key *key)
 {
 	union afs_dir_block *dblock;
 	struct afs_dir_page *dbuf;
@@ -333,7 +327,7 @@
 	unsigned blkoff, limit;
 	int ret;
 
-	_enter("{%lu},%u,,", dir->i_ino, *fpos);
+	_enter("{%lu},%u,,", dir->i_ino, (unsigned)ctx->pos);
 
 	if (test_bit(AFS_VNODE_DELETED, &AFS_FS_I(dir)->flags)) {
 		_leave(" = -ESTALE");
@@ -341,13 +335,13 @@
 	}
 
 	/* round the file position up to the next entry boundary */
-	*fpos += sizeof(union afs_dirent) - 1;
-	*fpos &= ~(sizeof(union afs_dirent) - 1);
+	ctx->pos += sizeof(union afs_dirent) - 1;
+	ctx->pos &= ~(sizeof(union afs_dirent) - 1);
 
 	/* walk through the blocks in sequence */
 	ret = 0;
-	while (*fpos < dir->i_size) {
-		blkoff = *fpos & ~(sizeof(union afs_dir_block) - 1);
+	while (ctx->pos < dir->i_size) {
+		blkoff = ctx->pos & ~(sizeof(union afs_dir_block) - 1);
 
 		/* fetch the appropriate page from the directory */
 		page = afs_dir_get_page(dir, blkoff / PAGE_SIZE, key);
@@ -364,8 +358,7 @@
 		do {
 			dblock = &dbuf->blocks[(blkoff % PAGE_SIZE) /
 					       sizeof(union afs_dir_block)];
-			ret = afs_dir_iterate_block(fpos, dblock, blkoff,
-						    cookie, filldir);
+			ret = afs_dir_iterate_block(ctx, dblock, blkoff);
 			if (ret != 1) {
 				afs_dir_put_page(page);
 				goto out;
@@ -373,7 +366,7 @@
 
 			blkoff += sizeof(union afs_dir_block);
 
-		} while (*fpos < dir->i_size && blkoff < limit);
+		} while (ctx->pos < dir->i_size && blkoff < limit);
 
 		afs_dir_put_page(page);
 		ret = 0;
@@ -387,23 +380,10 @@
 /*
  * read an AFS directory
  */
-static int afs_readdir(struct file *file, void *cookie, filldir_t filldir)
+static int afs_readdir(struct file *file, struct dir_context *ctx)
 {
-	unsigned fpos;
-	int ret;
-
-	_enter("{%Ld,{%lu}}",
-	       file->f_pos, file_inode(file)->i_ino);
-
-	ASSERT(file->private_data != NULL);
-
-	fpos = file->f_pos;
-	ret = afs_dir_iterate(file_inode(file), &fpos,
-			      cookie, filldir, file->private_data);
-	file->f_pos = fpos;
-
-	_leave(" = %d", ret);
-	return ret;
+	return afs_dir_iterate(file_inode(file), 
+			      ctx, file->private_data);
 }
 
 /*
@@ -416,15 +396,16 @@
 {
 	struct afs_lookup_cookie *cookie = _cookie;
 
-	_enter("{%s,%Zu},%s,%u,,%llu,%u",
-	       cookie->name, cookie->nlen, name, nlen,
+	_enter("{%s,%u},%s,%u,,%llu,%u",
+	       cookie->name.name, cookie->name.len, name, nlen,
 	       (unsigned long long) ino, dtype);
 
 	/* insanity checks first */
 	BUILD_BUG_ON(sizeof(union afs_dir_block) != 2048);
 	BUILD_BUG_ON(sizeof(union afs_dirent) != 32);
 
-	if (cookie->nlen != nlen || memcmp(cookie->name, name, nlen) != 0) {
+	if (cookie->name.len != nlen ||
+	    memcmp(cookie->name.name, name, nlen) != 0) {
 		_leave(" = 0 [no]");
 		return 0;
 	}
@@ -444,24 +425,18 @@
 static int afs_do_lookup(struct inode *dir, struct dentry *dentry,
 			 struct afs_fid *fid, struct key *key)
 {
-	struct afs_lookup_cookie cookie;
-	struct afs_super_info *as;
-	unsigned fpos;
+	struct afs_super_info *as = dir->i_sb->s_fs_info;
+	struct afs_lookup_cookie cookie = {
+		.ctx.actor = afs_lookup_filldir,
+		.name = dentry->d_name,
+		.fid.vid = as->volume->vid
+	};
 	int ret;
 
 	_enter("{%lu},%p{%s},", dir->i_ino, dentry, dentry->d_name.name);
 
-	as = dir->i_sb->s_fs_info;
-
 	/* search the directory */
-	cookie.name	= dentry->d_name.name;
-	cookie.nlen	= dentry->d_name.len;
-	cookie.fid.vid	= as->volume->vid;
-	cookie.found	= 0;
-
-	fpos = 0;
-	ret = afs_dir_iterate(dir, &fpos, &cookie, afs_lookup_filldir,
-			      key);
+	ret = afs_dir_iterate(dir, &cookie.ctx, key);
 	if (ret < 0) {
 		_leave(" = %d [iter]", ret);
 		return ret;

diff --git a/fs/afs/file.c b/fs/afs/file.c
index 8f6e923..66d50fe 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c

@@ -19,7 +19,8 @@
 #include "internal.h"
 
 static int afs_readpage(struct file *file, struct page *page);
-static void afs_invalidatepage(struct page *page, unsigned long offset);
+static void afs_invalidatepage(struct page *page, unsigned int offset,
+			       unsigned int length);
 static int afs_releasepage(struct page *page, gfp_t gfp_flags);
 static int afs_launder_page(struct page *page);
 
@@ -310,16 +311,17 @@
  * - release a page and clean up its private data if offset is 0 (indicating
  *   the entire page)
  */
-static void afs_invalidatepage(struct page *page, unsigned long offset)
+static void afs_invalidatepage(struct page *page, unsigned int offset,
+			       unsigned int length)
 {
 	struct afs_writeback *wb = (struct afs_writeback *) page_private(page);
 
-	_enter("{%lu},%lu", page->index, offset);
+	_enter("{%lu},%u,%u", page->index, offset, length);
 
 	BUG_ON(!PageLocked(page));
 
 	/* we clean up only if the entire page is being invalidated */
-	if (offset == 0) {
+	if (offset == 0 && length == PAGE_CACHE_SIZE) {
 #ifdef CONFIG_AFS_FSCACHE
 		if (PageFsCache(page)) {
 			struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);

diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 085da86..ca8e555 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c

@@ -41,7 +41,7 @@
 	.open		= dcache_dir_open,
 	.release	= dcache_dir_close,
 	.read		= generic_read_dir,
-	.readdir	= dcache_readdir,
+	.iterate	= dcache_readdir,
 	.llseek		= dcache_dir_lseek,
 	.unlocked_ioctl	= autofs4_root_ioctl,
 #ifdef CONFIG_COMPAT
@@ -53,7 +53,7 @@
 	.open		= autofs4_dir_open,
 	.release	= dcache_dir_close,
 	.read		= generic_read_dir,
-	.readdir	= dcache_readdir,
+	.iterate	= dcache_readdir,
 	.llseek		= dcache_dir_lseek,
 };
 

diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index 922ad46..7c93953 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c

@@ -45,7 +45,7 @@
 	return -EIO;
 }
 
-static int bad_file_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int bad_file_readdir(struct file *file, struct dir_context *ctx)
 {
 	return -EIO;
 }
@@ -152,7 +152,7 @@
 	.write		= bad_file_write,
 	.aio_read	= bad_file_aio_read,
 	.aio_write	= bad_file_aio_write,
-	.readdir	= bad_file_readdir,
+	.iterate	= bad_file_readdir,
 	.poll		= bad_file_poll,
 	.unlocked_ioctl	= bad_file_unlocked_ioctl,
 	.compat_ioctl	= bad_file_compat_ioctl,

diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index f95dddce..e9c75e2 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c

@@ -31,7 +31,7 @@
 /* The units the vfs expects inode->i_blocks to be in */
 #define VFS_BLOCK_SIZE 512
 
-static int befs_readdir(struct file *, void *, filldir_t);
+static int befs_readdir(struct file *, struct dir_context *);
 static int befs_get_block(struct inode *, sector_t, struct buffer_head *, int);
 static int befs_readpage(struct file *file, struct page *page);
 static sector_t befs_bmap(struct address_space *mapping, sector_t block);
@@ -66,7 +66,7 @@
 
 static const struct file_operations befs_dir_operations = {
 	.read		= generic_read_dir,
-	.readdir	= befs_readdir,
+	.iterate	= befs_readdir,
 	.llseek		= generic_file_llseek,
 };
 
@@ -211,9 +211,9 @@
 }
 
 static int
-befs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+befs_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct inode *inode = file_inode(filp);
+	struct inode *inode = file_inode(file);
 	struct super_block *sb = inode->i_sb;
 	befs_data_stream *ds = &BEFS_I(inode)->i_data.ds;
 	befs_off_t value;
@@ -221,15 +221,14 @@
 	size_t keysize;
 	unsigned char d_type;
 	char keybuf[BEFS_NAME_LEN + 1];
-	char *nlsname;
-	int nlsnamelen;
-	const char *dirname = filp->f_path.dentry->d_name.name;
+	const char *dirname = file->f_path.dentry->d_name.name;
 
 	befs_debug(sb, "---> befs_readdir() "
-		   "name %s, inode %ld, filp->f_pos %Ld",
-		   dirname, inode->i_ino, filp->f_pos);
+		   "name %s, inode %ld, ctx->pos %Ld",
+		   dirname, inode->i_ino, ctx->pos);
 
-	result = befs_btree_read(sb, ds, filp->f_pos, BEFS_NAME_LEN + 1,
+more:
+	result = befs_btree_read(sb, ds, ctx->pos, BEFS_NAME_LEN + 1,
 				 keybuf, &keysize, &value);
 
 	if (result == BEFS_ERR) {
@@ -251,24 +250,29 @@
 
 	/* Convert to NLS */
 	if (BEFS_SB(sb)->nls) {
+		char *nlsname;
+		int nlsnamelen;
 		result =
 		    befs_utf2nls(sb, keybuf, keysize, &nlsname, &nlsnamelen);
 		if (result < 0) {
 			befs_debug(sb, "<--- befs_readdir() ERROR");
 			return result;
 		}
-		result = filldir(dirent, nlsname, nlsnamelen, filp->f_pos,
-				 (ino_t) value, d_type);
+		if (!dir_emit(ctx, nlsname, nlsnamelen,
+				 (ino_t) value, d_type)) {
+			kfree(nlsname);
+			return 0;
+		}
 		kfree(nlsname);
-
 	} else {
-		result = filldir(dirent, keybuf, keysize, filp->f_pos,
-				 (ino_t) value, d_type);
+		if (!dir_emit(ctx, keybuf, keysize,
+				 (ino_t) value, d_type))
+			return 0;
 	}
-	if (!result)
-		filp->f_pos++;
+	ctx->pos++;
+	goto more;
 
-	befs_debug(sb, "<--- befs_readdir() filp->f_pos %Ld", filp->f_pos);
+	befs_debug(sb, "<--- befs_readdir() pos %Ld", ctx->pos);
 
 	return 0;
 }

diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index 3f422f6..a399e6d 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c

@@ -26,58 +26,51 @@
 				const unsigned char *name, int namelen,
 				struct bfs_dirent **res_dir);
 
-static int bfs_readdir(struct file *f, void *dirent, filldir_t filldir)
+static int bfs_readdir(struct file *f, struct dir_context *ctx)
 {
 	struct inode *dir = file_inode(f);
 	struct buffer_head *bh;
 	struct bfs_dirent *de;
-	struct bfs_sb_info *info = BFS_SB(dir->i_sb);
 	unsigned int offset;
 	int block;
 
-	mutex_lock(&info->bfs_lock);
-
-	if (f->f_pos & (BFS_DIRENT_SIZE - 1)) {
+	if (ctx->pos & (BFS_DIRENT_SIZE - 1)) {
 		printf("Bad f_pos=%08lx for %s:%08lx\n",
-					(unsigned long)f->f_pos,
+					(unsigned long)ctx->pos,
 					dir->i_sb->s_id, dir->i_ino);
-		mutex_unlock(&info->bfs_lock);
-		return -EBADF;
+		return -EINVAL;
 	}
 
-	while (f->f_pos < dir->i_size) {
-		offset = f->f_pos & (BFS_BSIZE - 1);
-		block = BFS_I(dir)->i_sblock + (f->f_pos >> BFS_BSIZE_BITS);
+	while (ctx->pos < dir->i_size) {
+		offset = ctx->pos & (BFS_BSIZE - 1);
+		block = BFS_I(dir)->i_sblock + (ctx->pos >> BFS_BSIZE_BITS);
 		bh = sb_bread(dir->i_sb, block);
 		if (!bh) {
-			f->f_pos += BFS_BSIZE - offset;
+			ctx->pos += BFS_BSIZE - offset;
 			continue;
 		}
 		do {
 			de = (struct bfs_dirent *)(bh->b_data + offset);
 			if (de->ino) {
 				int size = strnlen(de->name, BFS_NAMELEN);
-				if (filldir(dirent, de->name, size, f->f_pos,
+				if (!dir_emit(ctx, de->name, size,
 						le16_to_cpu(de->ino),
-						DT_UNKNOWN) < 0) {
+						DT_UNKNOWN)) {
 					brelse(bh);
-					mutex_unlock(&info->bfs_lock);
 					return 0;
 				}
 			}
 			offset += BFS_DIRENT_SIZE;
-			f->f_pos += BFS_DIRENT_SIZE;
-		} while ((offset < BFS_BSIZE) && (f->f_pos < dir->i_size));
+			ctx->pos += BFS_DIRENT_SIZE;
+		} while ((offset < BFS_BSIZE) && (ctx->pos < dir->i_size));
 		brelse(bh);
 	}
-
-	mutex_unlock(&info->bfs_lock);
-	return 0;	
+	return 0;
 }
 
 const struct file_operations bfs_dir_operations = {
 	.read		= generic_read_dir,
-	.readdir	= bfs_readdir,
+	.iterate	= bfs_readdir,
 	.fsync		= generic_file_fsync,
 	.llseek		= generic_file_llseek,
 };

diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index f26f38c..eb34438 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c

@@ -1681,8 +1681,7 @@
  * btrfs_readdir_delayed_dir_index - read dir info stored in the delayed tree
  *
  */
-int btrfs_readdir_delayed_dir_index(struct file *filp, void *dirent,
-				    filldir_t filldir,
+int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
 				    struct list_head *ins_list)
 {
 	struct btrfs_dir_item *di;
@@ -1704,13 +1703,13 @@
 	list_for_each_entry_safe(curr, next, ins_list, readdir_list) {
 		list_del(&curr->readdir_list);
 
-		if (curr->key.offset < filp->f_pos) {
+		if (curr->key.offset < ctx->pos) {
 			if (atomic_dec_and_test(&curr->refs))
 				kfree(curr);
 			continue;
 		}
 
-		filp->f_pos = curr->key.offset;
+		ctx->pos = curr->key.offset;
 
 		di = (struct btrfs_dir_item *)curr->data;
 		name = (char *)(di + 1);
@@ -1719,7 +1718,7 @@
 		d_type = btrfs_filetype_table[di->type];
 		btrfs_disk_key_to_cpu(&location, &di->location);
 
-		over = filldir(dirent, name, name_len, curr->key.offset,
+		over = !dir_emit(ctx, name, name_len,
 			       location.objectid, d_type);
 
 		if (atomic_dec_and_test(&curr->refs))

diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index 1d5c5f7..a4b38f9 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h

@@ -139,8 +139,7 @@
 			     struct list_head *del_list);
 int btrfs_should_delete_dir_index(struct list_head *del_list,
 				  u64 index);
-int btrfs_readdir_delayed_dir_index(struct file *filp, void *dirent,
-				    filldir_t filldir,
+int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
 				    struct list_head *ins_list);
 
 /* for init */

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b8b60b6..b0292b3 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c

@@ -1013,7 +1013,8 @@
 	return try_release_extent_buffer(page);
 }
 
-static void btree_invalidatepage(struct page *page, unsigned long offset)
+static void btree_invalidatepage(struct page *page, unsigned int offset,
+				 unsigned int length)
 {
 	struct extent_io_tree *tree;
 	tree = &BTRFS_I(page->mapping->host)->io_tree;

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index e7e7afb..6bca947 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c

@@ -2957,7 +2957,7 @@
 	pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
 	if (page->index > end_index ||
 	   (page->index == end_index && !pg_offset)) {
-		page->mapping->a_ops->invalidatepage(page, 0);
+		page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE);
 		unlock_page(page);
 		return 0;
 	}

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 17f3064..4f9d16b 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c

@@ -5137,10 +5137,9 @@
 	DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
 };
 
-static int btrfs_real_readdir(struct file *filp, void *dirent,
-			      filldir_t filldir)
+static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct inode *inode = file_inode(filp);
+	struct inode *inode = file_inode(file);
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_item *item;
 	struct btrfs_dir_item *di;
@@ -5161,29 +5160,15 @@
 	char tmp_name[32];
 	char *name_ptr;
 	int name_len;
-	int is_curr = 0;	/* filp->f_pos points to the current index? */
+	int is_curr = 0;	/* ctx->pos points to the current index? */
 
 	/* FIXME, use a real flag for deciding about the key type */
 	if (root->fs_info->tree_root == root)
 		key_type = BTRFS_DIR_ITEM_KEY;
 
-	/* special case for "." */
-	if (filp->f_pos == 0) {
-		over = filldir(dirent, ".", 1,
-			       filp->f_pos, btrfs_ino(inode), DT_DIR);
-		if (over)
-			return 0;
-		filp->f_pos = 1;
-	}
-	/* special case for .., just use the back ref */
-	if (filp->f_pos == 1) {
-		u64 pino = parent_ino(filp->f_path.dentry);
-		over = filldir(dirent, "..", 2,
-			       filp->f_pos, pino, DT_DIR);
-		if (over)
-			return 0;
-		filp->f_pos = 2;
-	}
+	if (!dir_emit_dots(file, ctx))
+		return 0;
+
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
@@ -5197,7 +5182,7 @@
 	}
 
 	btrfs_set_key_type(&key, key_type);
-	key.offset = filp->f_pos;
+	key.offset = ctx->pos;
 	key.objectid = btrfs_ino(inode);
 
 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
@@ -5223,14 +5208,14 @@
 			break;
 		if (btrfs_key_type(&found_key) != key_type)
 			break;
-		if (found_key.offset < filp->f_pos)
+		if (found_key.offset < ctx->pos)
 			goto next;
 		if (key_type == BTRFS_DIR_INDEX_KEY &&
 		    btrfs_should_delete_dir_index(&del_list,
 						  found_key.offset))
 			goto next;
 
-		filp->f_pos = found_key.offset;
+		ctx->pos = found_key.offset;
 		is_curr = 1;
 
 		di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
@@ -5274,9 +5259,8 @@
 				over = 0;
 				goto skip;
 			}
-			over = filldir(dirent, name_ptr, name_len,
-				       found_key.offset, location.objectid,
-				       d_type);
+			over = !dir_emit(ctx, name_ptr, name_len,
+				       location.objectid, d_type);
 
 skip:
 			if (name_ptr != tmp_name)
@@ -5295,9 +5279,8 @@
 
 	if (key_type == BTRFS_DIR_INDEX_KEY) {
 		if (is_curr)
-			filp->f_pos++;
-		ret = btrfs_readdir_delayed_dir_index(filp, dirent, filldir,
-						      &ins_list);
+			ctx->pos++;
+		ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list);
 		if (ret)
 			goto nopos;
 	}
@@ -5308,9 +5291,9 @@
 		 * 32-bit glibc will use getdents64, but then strtol -
 		 * so the last number we can serve is this.
 		 */
-		filp->f_pos = 0x7fffffff;
+		ctx->pos = 0x7fffffff;
 	else
-		filp->f_pos++;
+		ctx->pos++;
 nopos:
 	ret = 0;
 err:
@@ -7510,7 +7493,8 @@
 	return __btrfs_releasepage(page, gfp_flags & GFP_NOFS);
 }
 
-static void btrfs_invalidatepage(struct page *page, unsigned long offset)
+static void btrfs_invalidatepage(struct page *page, unsigned int offset,
+				 unsigned int length)
 {
 	struct inode *inode = page->mapping->host;
 	struct extent_io_tree *tree;
@@ -8731,7 +8715,7 @@
 static const struct file_operations btrfs_dir_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
-	.readdir	= btrfs_real_readdir,
+	.iterate	= btrfs_real_readdir,
 	.unlocked_ioctl	= btrfs_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= btrfs_ioctl,

diff --git a/fs/buffer.c b/fs/buffer.c
index d2a4d1b..f93392e 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c

@@ -1454,7 +1454,8 @@
  * block_invalidatepage - invalidate part or all of a buffer-backed page
  *
  * @page: the page which is affected
- * @offset: the index of the truncation point
+ * @offset: start of the range to invalidate
+ * @length: length of the range to invalidate
  *
  * block_invalidatepage() is called when all or part of the page has become
  * invalidated by a truncate operation.
@@ -1465,15 +1466,22 @@
  * point.  Because the caller is about to free (and possibly reuse) those
  * blocks on-disk.
  */
-void block_invalidatepage(struct page *page, unsigned long offset)
+void block_invalidatepage(struct page *page, unsigned int offset,
+			  unsigned int length)
 {
 	struct buffer_head *head, *bh, *next;
 	unsigned int curr_off = 0;
+	unsigned int stop = length + offset;
 
 	BUG_ON(!PageLocked(page));
 	if (!page_has_buffers(page))
 		goto out;
 
+	/*
+	 * Check for overflow
+	 */
+	BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
+
 	head = page_buffers(page);
 	bh = head;
 	do {
@@ -1481,6 +1489,12 @@
 		next = bh->b_this_page;
 
 		/*
+		 * Are we still fully in range ?
+		 */
+		if (next_off > stop)
+			goto out;
+
+		/*
 		 * is this block fully invalidated?
 		 */
 		if (offset <= curr_off)
@@ -1501,6 +1515,7 @@
 }
 EXPORT_SYMBOL(block_invalidatepage);
 
+
 /*
  * We attach and possibly dirty the buffers atomically wrt
  * __set_page_dirty_buffers() via private_lock.  try_to_free_buffers
@@ -2841,7 +2856,7 @@
 		 * they may have been added in ext3_writepage().  Make them
 		 * freeable here, so the page does not leak.
 		 */
-		do_invalidatepage(page, 0);
+		do_invalidatepage(page, 0, PAGE_CACHE_SIZE);
 		unlock_page(page);
 		return 0; /* don't care */
 	}

diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 746ce53..d4c1206 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c

@@ -13,8 +13,6 @@
 #include <linux/mount.h>
 #include "internal.h"
 
-#define list_to_page(head) (list_entry((head)->prev, struct page, lru))
-
 struct cachefiles_lookup_data {
 	struct cachefiles_xattr	*auxdata;	/* auxiliary data */
 	char			*key;		/* key path */
@@ -212,20 +210,29 @@
 	object = container_of(_object, struct cachefiles_object, fscache);
 	cache = container_of(object->fscache.cache, struct cachefiles_cache,
 			     cache);
+
+	if (!fscache_use_cookie(_object)) {
+		_leave(" [relinq]");
+		return;
+	}
+
 	cookie = object->fscache.cookie;
 
 	if (!cookie->def->get_aux) {
+		fscache_unuse_cookie(_object);
 		_leave(" [no aux]");
 		return;
 	}
 
 	auxdata = kmalloc(2 + 512 + 3, cachefiles_gfp);
 	if (!auxdata) {
+		fscache_unuse_cookie(_object);
 		_leave(" [nomem]");
 		return;
 	}
 
 	auxlen = cookie->def->get_aux(cookie->netfs_data, auxdata->data, 511);
+	fscache_unuse_cookie(_object);
 	ASSERTCMP(auxlen, <, 511);
 
 	auxdata->len = auxlen + 1;
@@ -263,7 +270,7 @@
 #endif
 
 	/* delete retired objects */
-	if (object->fscache.state == FSCACHE_OBJECT_RECYCLING &&
+	if (test_bit(FSCACHE_COOKIE_RETIRED, &object->fscache.cookie->flags) &&
 	    _object != cache->cache.fsdef
 	    ) {
 		_debug("- retire object OBJ%x", object->fscache.debug_id);

diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 8c01c5fc..25badd1 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c

@@ -38,7 +38,7 @@
 	printk(KERN_ERR "%sobject: OBJ%x\n",
 	       prefix, object->fscache.debug_id);
 	printk(KERN_ERR "%sobjstate=%s fl=%lx wbusy=%x ev=%lx[%lx]\n",
-	       prefix, fscache_object_states[object->fscache.state],
+	       prefix, object->fscache.state->name,
 	       object->fscache.flags, work_busy(&object->fscache.work),
 	       object->fscache.events, object->fscache.event_mask);
 	printk(KERN_ERR "%sops=%u inp=%u exc=%u\n",
@@ -127,10 +127,10 @@
 found_dentry:
 	kdebug("preemptive burial: OBJ%x [%s] %p",
 	       object->fscache.debug_id,
-	       fscache_object_states[object->fscache.state],
+	       object->fscache.state->name,
 	       dentry);
 
-	if (object->fscache.state < FSCACHE_OBJECT_DYING) {
+	if (fscache_object_is_live(&object->fscache)) {
 		printk(KERN_ERR "\n");
 		printk(KERN_ERR "CacheFiles: Error:"
 		       " Can't preemptively bury live object\n");
@@ -192,7 +192,7 @@
 	/* an old object from a previous incarnation is hogging the slot - we
 	 * need to wait for it to be destroyed */
 wait_for_old_object:
-	if (xobject->fscache.state < FSCACHE_OBJECT_DYING) {
+	if (fscache_object_is_live(&object->fscache)) {
 		printk(KERN_ERR "\n");
 		printk(KERN_ERR "CacheFiles: Error:"
 		       " Unexpected object collision\n");
@@ -836,7 +836,7 @@
 	//       dir->d_name.len, dir->d_name.len, dir->d_name.name, filename);
 
 	/* look up the victim */
-	mutex_lock_nested(&dir->d_inode->i_mutex, 1);
+	mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
 
 	start = jiffies;
 	victim = lookup_one_len(filename, dir, strlen(filename));

diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c
index 73b4628..2476e51 100644
--- a/fs/cachefiles/xattr.c
+++ b/fs/cachefiles/xattr.c

@@ -109,13 +109,12 @@
 	struct dentry *dentry = object->dentry;
 	int ret;
 
-	ASSERT(object->fscache.cookie);
 	ASSERT(dentry);
 
 	_enter("%p,#%d", object, auxdata->len);
 
 	/* attempt to install the cache metadata directly */
-	_debug("SET %s #%u", object->fscache.cookie->def->name, auxdata->len);
+	_debug("SET #%u", auxdata->len);
 
 	ret = vfs_setxattr(dentry, cachefiles_xattr_cache,
 			   &auxdata->type, auxdata->len,
@@ -138,13 +137,12 @@
 	struct dentry *dentry = object->dentry;
 	int ret;
 
-	ASSERT(object->fscache.cookie);
 	ASSERT(dentry);
 
 	_enter("%p,#%d", object, auxdata->len);
 
 	/* attempt to install the cache metadata directly */
-	_debug("SET %s #%u", object->fscache.cookie->def->name, auxdata->len);
+	_debug("SET #%u", auxdata->len);
 
 	ret = vfs_setxattr(dentry, cachefiles_xattr_cache,
 			   &auxdata->type, auxdata->len,

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 3e68ac1..38b5c1b 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c

@@ -143,7 +143,8 @@
  * dirty page counters appropriately.  Only called if there is private
  * data on the page.
  */
-static void ceph_invalidatepage(struct page *page, unsigned long offset)
+static void ceph_invalidatepage(struct page *page, unsigned int offset,
+				unsigned int length)
 {
 	struct inode *inode;
 	struct ceph_inode_info *ci;
@@ -163,20 +164,20 @@
 	if (!PageDirty(page))
 		pr_err("%p invalidatepage %p page not dirty\n", inode, page);
 
-	if (offset == 0)
+	if (offset == 0 && length == PAGE_CACHE_SIZE)
 		ClearPageChecked(page);
 
 	ci = ceph_inode(inode);
-	if (offset == 0) {
-		dout("%p invalidatepage %p idx %lu full dirty page %lu\n",
-		     inode, page, page->index, offset);
+	if (offset == 0 && length == PAGE_CACHE_SIZE) {
+		dout("%p invalidatepage %p idx %lu full dirty page\n",
+		     inode, page, page->index);
 		ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
 		ceph_put_snap_context(snapc);
 		page->private = 0;
 		ClearPagePrivate(page);
 	} else {
-		dout("%p invalidatepage %p idx %lu partial dirty page\n",
-		     inode, page, page->index);
+		dout("%p invalidatepage %p idx %lu partial dirty page %u(%u)\n",
+		     inode, page, page->index, offset, length);
 	}
 }
 

diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index f02d82b..a40ceda 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c

@@ -111,11 +111,10 @@
  * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by
  * the MDS if/when the directory is modified).
  */
-static int __dcache_readdir(struct file *filp,
-			    void *dirent, filldir_t filldir)
+static int __dcache_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct ceph_file_info *fi = filp->private_data;
-	struct dentry *parent = filp->f_dentry;
+	struct ceph_file_info *fi = file->private_data;
+	struct dentry *parent = file->f_dentry;
 	struct inode *dir = parent->d_inode;
 	struct list_head *p;
 	struct dentry *dentry, *last;
@@ -126,14 +125,14 @@
 	last = fi->dentry;
 	fi->dentry = NULL;
 
-	dout("__dcache_readdir %p at %llu (last %p)\n", dir, filp->f_pos,
+	dout("__dcache_readdir %p at %llu (last %p)\n", dir, ctx->pos,
 	     last);
 
 	spin_lock(&parent->d_lock);
 
 	/* start at beginning? */
-	if (filp->f_pos == 2 || last == NULL ||
-	    filp->f_pos < ceph_dentry(last)->offset) {
+	if (ctx->pos == 2 || last == NULL ||
+	    ctx->pos < ceph_dentry(last)->offset) {
 		if (list_empty(&parent->d_subdirs))
 			goto out_unlock;
 		p = parent->d_subdirs.prev;
@@ -157,11 +156,11 @@
 		if (!d_unhashed(dentry) && dentry->d_inode &&
 		    ceph_snap(dentry->d_inode) != CEPH_SNAPDIR &&
 		    ceph_ino(dentry->d_inode) != CEPH_INO_CEPH &&
-		    filp->f_pos <= di->offset)
+		    ctx->pos <= di->offset)
 			break;
 		dout(" skipping %p %.*s at %llu (%llu)%s%s\n", dentry,
 		     dentry->d_name.len, dentry->d_name.name, di->offset,
-		     filp->f_pos, d_unhashed(dentry) ? " unhashed" : "",
+		     ctx->pos, d_unhashed(dentry) ? " unhashed" : "",
 		     !dentry->d_inode ? " null" : "");
 		spin_unlock(&dentry->d_lock);
 		p = p->prev;
@@ -173,29 +172,27 @@
 	spin_unlock(&dentry->d_lock);
 	spin_unlock(&parent->d_lock);
 
-	dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos,
+	dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, ctx->pos,
 	     dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
-	filp->f_pos = di->offset;
-	err = filldir(dirent, dentry->d_name.name,
-		      dentry->d_name.len, di->offset,
+	ctx->pos = di->offset;
+	if (!dir_emit(ctx, dentry->d_name.name,
+		      dentry->d_name.len,
 		      ceph_translate_ino(dentry->d_sb, dentry->d_inode->i_ino),
-		      dentry->d_inode->i_mode >> 12);
-
-	if (last) {
-		if (err < 0) {
+		      dentry->d_inode->i_mode >> 12)) {
+		if (last) {
 			/* remember our position */
 			fi->dentry = last;
 			fi->next_offset = di->offset;
-		} else {
-			dput(last);
 		}
+		dput(dentry);
+		return 0;
 	}
+
+	if (last)
+		dput(last);
 	last = dentry;
 
-	if (err < 0)
-		goto out;
-
-	filp->f_pos++;
+	ctx->pos++;
 
 	/* make sure a dentry wasn't dropped while we didn't have parent lock */
 	if (!ceph_dir_is_complete(dir)) {
@@ -235,59 +232,59 @@
 	return 0;
 }
 
-static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int ceph_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct ceph_file_info *fi = filp->private_data;
-	struct inode *inode = file_inode(filp);
+	struct ceph_file_info *fi = file->private_data;
+	struct inode *inode = file_inode(file);
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
 	struct ceph_mds_client *mdsc = fsc->mdsc;
-	unsigned frag = fpos_frag(filp->f_pos);
-	int off = fpos_off(filp->f_pos);
+	unsigned frag = fpos_frag(ctx->pos);
+	int off = fpos_off(ctx->pos);
 	int err;
 	u32 ftype;
 	struct ceph_mds_reply_info_parsed *rinfo;
 	const int max_entries = fsc->mount_options->max_readdir;
 	const int max_bytes = fsc->mount_options->max_readdir_bytes;
 
-	dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off);
+	dout("readdir %p file %p frag %u off %u\n", inode, file, frag, off);
 	if (fi->flags & CEPH_F_ATEND)
 		return 0;
 
 	/* always start with . and .. */
-	if (filp->f_pos == 0) {
+	if (ctx->pos == 0) {
 		/* note dir version at start of readdir so we can tell
 		 * if any dentries get dropped */
 		fi->dir_release_count = atomic_read(&ci->i_release_count);
 
 		dout("readdir off 0 -> '.'\n");
-		if (filldir(dirent, ".", 1, ceph_make_fpos(0, 0),
+		if (!dir_emit(ctx, ".", 1, 
 			    ceph_translate_ino(inode->i_sb, inode->i_ino),
-			    inode->i_mode >> 12) < 0)
+			    inode->i_mode >> 12))
 			return 0;
-		filp->f_pos = 1;
+		ctx->pos = 1;
 		off = 1;
 	}
-	if (filp->f_pos == 1) {
-		ino_t ino = parent_ino(filp->f_dentry);
+	if (ctx->pos == 1) {
+		ino_t ino = parent_ino(file->f_dentry);
 		dout("readdir off 1 -> '..'\n");
-		if (filldir(dirent, "..", 2, ceph_make_fpos(0, 1),
+		if (!dir_emit(ctx, "..", 2,
 			    ceph_translate_ino(inode->i_sb, ino),
-			    inode->i_mode >> 12) < 0)
+			    inode->i_mode >> 12))
 			return 0;
-		filp->f_pos = 2;
+		ctx->pos = 2;
 		off = 2;
 	}
 
 	/* can we use the dcache? */
 	spin_lock(&ci->i_ceph_lock);
-	if ((filp->f_pos == 2 || fi->dentry) &&
+	if ((ctx->pos == 2 || fi->dentry) &&
 	    !ceph_test_mount_opt(fsc, NOASYNCREADDIR) &&
 	    ceph_snap(inode) != CEPH_SNAPDIR &&
 	    __ceph_dir_is_complete(ci) &&
 	    __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
 		spin_unlock(&ci->i_ceph_lock);
-		err = __dcache_readdir(filp, dirent, filldir);
+		err = __dcache_readdir(file, ctx);
 		if (err != -EAGAIN)
 			return err;
 	} else {
@@ -327,7 +324,7 @@
 			return PTR_ERR(req);
 		req->r_inode = inode;
 		ihold(inode);
-		req->r_dentry = dget(filp->f_dentry);
+		req->r_dentry = dget(file->f_dentry);
 		/* hints to request -> mds selection code */
 		req->r_direct_mode = USE_AUTH_MDS;
 		req->r_direct_hash = ceph_frag_value(frag);
@@ -379,15 +376,16 @@
 	rinfo = &fi->last_readdir->r_reply_info;
 	dout("readdir frag %x num %d off %d chunkoff %d\n", frag,
 	     rinfo->dir_nr, off, fi->offset);
+
+	ctx->pos = ceph_make_fpos(frag, off);
 	while (off >= fi->offset && off - fi->offset < rinfo->dir_nr) {
-		u64 pos = ceph_make_fpos(frag, off);
 		struct ceph_mds_reply_inode *in =
 			rinfo->dir_in[off - fi->offset].in;
 		struct ceph_vino vino;
 		ino_t ino;
 
 		dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n",
-		     off, off - fi->offset, rinfo->dir_nr, pos,
+		     off, off - fi->offset, rinfo->dir_nr, ctx->pos,
 		     rinfo->dir_dname_len[off - fi->offset],
 		     rinfo->dir_dname[off - fi->offset], in);
 		BUG_ON(!in);
@@ -395,16 +393,15 @@
 		vino.ino = le64_to_cpu(in->ino);
 		vino.snap = le64_to_cpu(in->snapid);
 		ino = ceph_vino_to_ino(vino);
-		if (filldir(dirent,
+		if (!dir_emit(ctx,
 			    rinfo->dir_dname[off - fi->offset],
 			    rinfo->dir_dname_len[off - fi->offset],
-			    pos,
-			    ceph_translate_ino(inode->i_sb, ino), ftype) < 0) {
+			    ceph_translate_ino(inode->i_sb, ino), ftype)) {
 			dout("filldir stopping us...\n");
 			return 0;
 		}
 		off++;
-		filp->f_pos = pos + 1;
+		ctx->pos++;
 	}
 
 	if (fi->last_name) {
@@ -417,7 +414,7 @@
 	if (!ceph_frag_is_rightmost(frag)) {
 		frag = ceph_frag_next(frag);
 		off = 0;
-		filp->f_pos = ceph_make_fpos(frag, off);
+		ctx->pos = ceph_make_fpos(frag, off);
 		dout("readdir next frag is %x\n", frag);
 		goto more;
 	}
@@ -432,11 +429,11 @@
 	if (atomic_read(&ci->i_release_count) == fi->dir_release_count) {
 		dout(" marking %p complete\n", inode);
 		__ceph_dir_set_complete(ci, fi->dir_release_count);
-		ci->i_max_offset = filp->f_pos;
+		ci->i_max_offset = ctx->pos;
 	}
 	spin_unlock(&ci->i_ceph_lock);
 
-	dout("readdir %p filp %p done.\n", inode, filp);
+	dout("readdir %p file %p done.\n", inode, file);
 	return 0;
 }
 
@@ -1268,7 +1265,7 @@
 
 const struct file_operations ceph_dir_fops = {
 	.read = ceph_read_dir,
-	.readdir = ceph_readdir,
+	.iterate = ceph_readdir,
 	.llseek = ceph_dir_llseek,
 	.open = ceph_open,
 	.release = ceph_release,

diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 3752b9f..540c1cc 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c

@@ -968,7 +968,7 @@
 };
 
 const struct file_operations cifs_dir_ops = {
-	.readdir = cifs_readdir,
+	.iterate = cifs_readdir,
 	.release = cifs_closedir,
 	.read    = generic_read_dir,
 	.unlocked_ioctl  = cifs_ioctl,

diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 0e32c34..d05b302 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h

@@ -101,7 +101,7 @@
 extern int cifs_file_strict_mmap(struct file * , struct vm_area_struct *);
 extern const struct file_operations cifs_dir_ops;
 extern int cifs_dir_open(struct inode *inode, struct file *file);
-extern int cifs_readdir(struct file *file, void *direntry, filldir_t filldir);
+extern int cifs_readdir(struct file *file, struct dir_context *ctx);
 
 /* Functions related to dir entries */
 extern const struct dentry_operations cifs_dentry_ops;

diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 48b29d2..4d8ba8d 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c

@@ -3546,11 +3546,12 @@
 	return cifs_fscache_release_page(page, gfp);
 }
 
-static void cifs_invalidate_page(struct page *page, unsigned long offset)
+static void cifs_invalidate_page(struct page *page, unsigned int offset,
+				 unsigned int length)
 {
 	struct cifsInodeInfo *cifsi = CIFS_I(page->mapping->host);
 
-	if (offset == 0)
+	if (offset == 0 && length == PAGE_CACHE_SIZE)
 		cifs_fscache_invalidate_page(page, &cifsi->vfs_inode);
 }
 

diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 770d5a9..f121379 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c

@@ -537,14 +537,14 @@
  * every entry (do not increment for . or .. entry).
  */
 static int
-find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon,
+find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos,
 		struct file *file, char **current_entry, int *num_to_ret)
 {
 	__u16 search_flags;
 	int rc = 0;
 	int pos_in_buf = 0;
 	loff_t first_entry_in_buffer;
-	loff_t index_to_find = file->f_pos;
+	loff_t index_to_find = pos;
 	struct cifsFileInfo *cfile = file->private_data;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
 	struct TCP_Server_Info *server = tcon->ses->server;
@@ -659,8 +659,9 @@
 	return rc;
 }
 
-static int cifs_filldir(char *find_entry, struct file *file, filldir_t filldir,
-		void *dirent, char *scratch_buf, unsigned int max_len)
+static int cifs_filldir(char *find_entry, struct file *file,
+		struct dir_context *ctx,
+		char *scratch_buf, unsigned int max_len)
 {
 	struct cifsFileInfo *file_info = file->private_data;
 	struct super_block *sb = file->f_path.dentry->d_sb;
@@ -740,13 +741,11 @@
 	cifs_prime_dcache(file->f_dentry, &name, &fattr);
 
 	ino = cifs_uniqueid_to_ino_t(fattr.cf_uniqueid);
-	rc = filldir(dirent, name.name, name.len, file->f_pos, ino,
-		     fattr.cf_dtype);
-	return rc;
+	return !dir_emit(ctx, name.name, name.len, ino, fattr.cf_dtype);
 }
 
 
-int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
+int cifs_readdir(struct file *file, struct dir_context *ctx)
 {
 	int rc = 0;
 	unsigned int xid;
@@ -772,103 +771,86 @@
 			goto rddir2_exit;
 	}
 
-	switch ((int) file->f_pos) {
-	case 0:
-		if (filldir(direntry, ".", 1, file->f_pos,
-		     file_inode(file)->i_ino, DT_DIR) < 0) {
-			cifs_dbg(VFS, "Filldir for current dir failed\n");
-			rc = -ENOMEM;
+	if (!dir_emit_dots(file, ctx))
+		goto rddir2_exit;
+
+	/* 1) If search is active,
+		is in current search buffer?
+		if it before then restart search
+		if after then keep searching till find it */
+
+	if (file->private_data == NULL) {
+		rc = -EINVAL;
+		goto rddir2_exit;
+	}
+	cifsFile = file->private_data;
+	if (cifsFile->srch_inf.endOfSearch) {
+		if (cifsFile->srch_inf.emptyDir) {
+			cifs_dbg(FYI, "End of search, empty dir\n");
+			rc = 0;
+			goto rddir2_exit;
+		}
+	} /* else {
+		cifsFile->invalidHandle = true;
+		tcon->ses->server->close(xid, tcon, &cifsFile->fid);
+	} */
+
+	tcon = tlink_tcon(cifsFile->tlink);
+	rc = find_cifs_entry(xid, tcon, ctx->pos, file, &current_entry,
+			     &num_to_fill);
+	if (rc) {
+		cifs_dbg(FYI, "fce error %d\n", rc);
+		goto rddir2_exit;
+	} else if (current_entry != NULL) {
+		cifs_dbg(FYI, "entry %lld found\n", ctx->pos);
+	} else {
+		cifs_dbg(FYI, "could not find entry\n");
+		goto rddir2_exit;
+	}
+	cifs_dbg(FYI, "loop through %d times filling dir for net buf %p\n",
+		 num_to_fill, cifsFile->srch_inf.ntwrk_buf_start);
+	max_len = tcon->ses->server->ops->calc_smb_size(
+			cifsFile->srch_inf.ntwrk_buf_start);
+	end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + max_len;
+
+	tmp_buf = kmalloc(UNICODE_NAME_MAX, GFP_KERNEL);
+	if (tmp_buf == NULL) {
+		rc = -ENOMEM;
+		goto rddir2_exit;
+	}
+
+	for (i = 0; i < num_to_fill; i++) {
+		if (current_entry == NULL) {
+			/* evaluate whether this case is an error */
+			cifs_dbg(VFS, "past SMB end,  num to fill %d i %d\n",
+				 num_to_fill, i);
 			break;
 		}
-		file->f_pos++;
-	case 1:
-		if (filldir(direntry, "..", 2, file->f_pos,
-		     parent_ino(file->f_path.dentry), DT_DIR) < 0) {
-			cifs_dbg(VFS, "Filldir for parent dir failed\n");
-			rc = -ENOMEM;
-			break;
-		}
-		file->f_pos++;
-	default:
-		/* 1) If search is active,
-			is in current search buffer?
-			if it before then restart search
-			if after then keep searching till find it */
-
-		if (file->private_data == NULL) {
-			rc = -EINVAL;
-			free_xid(xid);
-			return rc;
-		}
-		cifsFile = file->private_data;
-		if (cifsFile->srch_inf.endOfSearch) {
-			if (cifsFile->srch_inf.emptyDir) {
-				cifs_dbg(FYI, "End of search, empty dir\n");
-				rc = 0;
-				break;
-			}
-		} /* else {
-			cifsFile->invalidHandle = true;
-			tcon->ses->server->close(xid, tcon, &cifsFile->fid);
-		} */
-
-		tcon = tlink_tcon(cifsFile->tlink);
-		rc = find_cifs_entry(xid, tcon, file, &current_entry,
-				     &num_to_fill);
+		/*
+		 * if buggy server returns . and .. late do we want to
+		 * check for that here?
+		 */
+		rc = cifs_filldir(current_entry, file, ctx,
+				  tmp_buf, max_len);
 		if (rc) {
-			cifs_dbg(FYI, "fce error %d\n", rc);
-			goto rddir2_exit;
-		} else if (current_entry != NULL) {
-			cifs_dbg(FYI, "entry %lld found\n", file->f_pos);
-		} else {
-			cifs_dbg(FYI, "could not find entry\n");
-			goto rddir2_exit;
-		}
-		cifs_dbg(FYI, "loop through %d times filling dir for net buf %p\n",
-			 num_to_fill, cifsFile->srch_inf.ntwrk_buf_start);
-		max_len = tcon->ses->server->ops->calc_smb_size(
-				cifsFile->srch_inf.ntwrk_buf_start);
-		end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + max_len;
-
-		tmp_buf = kmalloc(UNICODE_NAME_MAX, GFP_KERNEL);
-		if (tmp_buf == NULL) {
-			rc = -ENOMEM;
+			if (rc > 0)
+				rc = 0;
 			break;
 		}
 
-		for (i = 0; (i < num_to_fill) && (rc == 0); i++) {
-			if (current_entry == NULL) {
-				/* evaluate whether this case is an error */
-				cifs_dbg(VFS, "past SMB end,  num to fill %d i %d\n",
-					 num_to_fill, i);
-				break;
-			}
-			/*
-			 * if buggy server returns . and .. late do we want to
-			 * check for that here?
-			 */
-			rc = cifs_filldir(current_entry, file, filldir,
-					  direntry, tmp_buf, max_len);
-			if (rc == -EOVERFLOW) {
-				rc = 0;
-				break;
-			}
-
-			file->f_pos++;
-			if (file->f_pos ==
-				cifsFile->srch_inf.index_of_last_entry) {
-				cifs_dbg(FYI, "last entry in buf at pos %lld %s\n",
-					 file->f_pos, tmp_buf);
-				cifs_save_resume_key(current_entry, cifsFile);
-				break;
-			} else
-				current_entry =
-					nxt_dir_entry(current_entry, end_of_smb,
-						cifsFile->srch_inf.info_level);
-		}
-		kfree(tmp_buf);
-		break;
-	} /* end switch */
+		ctx->pos++;
+		if (ctx->pos ==
+			cifsFile->srch_inf.index_of_last_entry) {
+			cifs_dbg(FYI, "last entry in buf at pos %lld %s\n",
+				 ctx->pos, tmp_buf);
+			cifs_save_resume_key(current_entry, cifsFile);
+			break;
+		} else
+			current_entry =
+				nxt_dir_entry(current_entry, end_of_smb,
+					cifsFile->srch_inf.info_level);
+	}
+	kfree(tmp_buf);
 
 rddir2_exit:
 	free_xid(xid);

diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index b7d3a05..87e0ee9 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c

@@ -43,15 +43,14 @@
                        struct inode *new_inode, struct dentry *new_dentry);
 
 /* dir file-ops */
-static int coda_readdir(struct file *file, void *buf, filldir_t filldir);
+static int coda_readdir(struct file *file, struct dir_context *ctx);
 
 /* dentry ops */
 static int coda_dentry_revalidate(struct dentry *de, unsigned int flags);
 static int coda_dentry_delete(const struct dentry *);
 
 /* support routines */
-static int coda_venus_readdir(struct file *coda_file, void *buf,
-			      filldir_t filldir);
+static int coda_venus_readdir(struct file *, struct dir_context *);
 
 /* same as fs/bad_inode.c */
 static int coda_return_EIO(void)
@@ -85,7 +84,7 @@
 const struct file_operations coda_dir_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
-	.readdir	= coda_readdir,
+	.iterate	= coda_readdir,
 	.open		= coda_open,
 	.release	= coda_release,
 	.fsync		= coda_fsync,
@@ -378,7 +377,7 @@
 
 
 /* file operations for directories */
-static int coda_readdir(struct file *coda_file, void *buf, filldir_t filldir)
+static int coda_readdir(struct file *coda_file, struct dir_context *ctx)
 {
 	struct coda_file_info *cfi;
 	struct file *host_file;
@@ -391,30 +390,19 @@
 	if (!host_file->f_op)
 		return -ENOTDIR;
 
-	if (host_file->f_op->readdir)
-	{
-		/* potemkin case: we were handed a directory inode.
-		 * We can't use vfs_readdir because we have to keep the file
-		 * position in sync between the coda_file and the host_file.
-		 * and as such we need grab the inode mutex. */
+	if (host_file->f_op->iterate) {
 		struct inode *host_inode = file_inode(host_file);
-
 		mutex_lock(&host_inode->i_mutex);
-		host_file->f_pos = coda_file->f_pos;
-
 		ret = -ENOENT;
 		if (!IS_DEADDIR(host_inode)) {
-			ret = host_file->f_op->readdir(host_file, buf, filldir);
+			ret = host_file->f_op->iterate(host_file, ctx);
 			file_accessed(host_file);
 		}
-
-		coda_file->f_pos = host_file->f_pos;
 		mutex_unlock(&host_inode->i_mutex);
+		return ret;
 	}
-	else /* Venus: we must read Venus dirents from a file */
-		ret = coda_venus_readdir(coda_file, buf, filldir);
-
-	return ret;
+	/* Venus: we must read Venus dirents from a file */
+	return coda_venus_readdir(coda_file, ctx);
 }
 
 static inline unsigned int CDT2DT(unsigned char cdt)
@@ -437,10 +425,8 @@
 }
 
 /* support routines */
-static int coda_venus_readdir(struct file *coda_file, void *buf,
-			      filldir_t filldir)
+static int coda_venus_readdir(struct file *coda_file, struct dir_context *ctx)
 {
-	int result = 0; /* # of entries returned */
 	struct coda_file_info *cfi;
 	struct coda_inode_info *cii;
 	struct file *host_file;
@@ -462,23 +448,12 @@
 	vdir = kmalloc(sizeof(*vdir), GFP_KERNEL);
 	if (!vdir) return -ENOMEM;
 
-	if (coda_file->f_pos == 0) {
-		ret = filldir(buf, ".", 1, 0, de->d_inode->i_ino, DT_DIR);
-		if (ret < 0)
-			goto out;
-		result++;
-		coda_file->f_pos++;
-	}
-	if (coda_file->f_pos == 1) {
-		ret = filldir(buf, "..", 2, 1, parent_ino(de), DT_DIR);
-		if (ret < 0)
-			goto out;
-		result++;
-		coda_file->f_pos++;
-	}
+	if (!dir_emit_dots(coda_file, ctx))
+		goto out;
+
 	while (1) {
 		/* read entries from the directory file */
-		ret = kernel_read(host_file, coda_file->f_pos - 2, (char *)vdir,
+		ret = kernel_read(host_file, ctx->pos - 2, (char *)vdir,
 				  sizeof(*vdir));
 		if (ret < 0) {
 			printk(KERN_ERR "coda readdir: read dir %s failed %d\n",
@@ -507,7 +482,7 @@
 
 		/* Make sure we skip '.' and '..', we already got those */
 		if (name.name[0] == '.' && (name.len == 1 ||
-		    (vdir->d_name[1] == '.' && name.len == 2)))
+		    (name.name[1] == '.' && name.len == 2)))
 			vdir->d_fileno = name.len = 0;
 
 		/* skip null entries */
@@ -520,19 +495,16 @@
 			if (!ino) ino = vdir->d_fileno;
 
 			type = CDT2DT(vdir->d_type);
-			ret = filldir(buf, name.name, name.len,
-				      coda_file->f_pos, ino, type);
-			/* failure means no space for filling in this round */
-			if (ret < 0) break;
-			result++;
+			if (!dir_emit(ctx, name.name, name.len, ino, type))
+				break;
 		}
 		/* we'll always have progress because d_reclen is unsigned and
 		 * we've already established it is non-zero. */
-		coda_file->f_pos += vdir->d_reclen;
+		ctx->pos += vdir->d_reclen;
 	}
 out:
 	kfree(vdir);
-	return result ? result : ret;
+	return 0;
 }
 
 /* called when a cache lookup succeeds */

diff --git a/fs/compat.c b/fs/compat.c
index fc3b55d..6af20de 100644
--- a/fs/compat.c
+++ b/fs/compat.c

@@ -832,6 +832,7 @@
 };
 
 struct compat_readdir_callback {
+	struct dir_context ctx;
 	struct compat_old_linux_dirent __user *dirent;
 	int result;
 };
@@ -873,15 +874,15 @@
 {
 	int error;
 	struct fd f = fdget(fd);
-	struct compat_readdir_callback buf;
+	struct compat_readdir_callback buf = {
+		.ctx.actor = compat_fillonedir,
+		.dirent = dirent
+	};
 
 	if (!f.file)
 		return -EBADF;
 
-	buf.result = 0;
-	buf.dirent = dirent;
-
-	error = vfs_readdir(f.file, compat_fillonedir, &buf);
+	error = iterate_dir(f.file, &buf.ctx);
 	if (buf.result)
 		error = buf.result;
 
@@ -897,6 +898,7 @@
 };
 
 struct compat_getdents_callback {
+	struct dir_context ctx;
 	struct compat_linux_dirent __user *current_dir;
 	struct compat_linux_dirent __user *previous;
 	int count;
@@ -951,7 +953,11 @@
 {
 	struct fd f;
 	struct compat_linux_dirent __user * lastdirent;
-	struct compat_getdents_callback buf;
+	struct compat_getdents_callback buf = {
+		.ctx.actor = compat_filldir,
+		.current_dir = dirent,
+		.count = count
+	};
 	int error;
 
 	if (!access_ok(VERIFY_WRITE, dirent, count))
@@ -961,17 +967,12 @@
 	if (!f.file)
 		return -EBADF;
 
-	buf.current_dir = dirent;
-	buf.previous = NULL;
-	buf.count = count;
-	buf.error = 0;
-
-	error = vfs_readdir(f.file, compat_filldir, &buf);
+	error = iterate_dir(f.file, &buf.ctx);
 	if (error >= 0)
 		error = buf.error;
 	lastdirent = buf.previous;
 	if (lastdirent) {
-		if (put_user(f.file->f_pos, &lastdirent->d_off))
+		if (put_user(buf.ctx.pos, &lastdirent->d_off))
 			error = -EFAULT;
 		else
 			error = count - buf.count;
@@ -983,6 +984,7 @@
 #ifndef __ARCH_OMIT_COMPAT_SYS_GETDENTS64
 
 struct compat_getdents_callback64 {
+	struct dir_context ctx;
 	struct linux_dirent64 __user *current_dir;
 	struct linux_dirent64 __user *previous;
 	int count;
@@ -1036,7 +1038,11 @@
 {
 	struct fd f;
 	struct linux_dirent64 __user * lastdirent;
-	struct compat_getdents_callback64 buf;
+	struct compat_getdents_callback64 buf = {
+		.ctx.actor = compat_filldir64,
+		.current_dir = dirent,
+		.count = count
+	};
 	int error;
 
 	if (!access_ok(VERIFY_WRITE, dirent, count))
@@ -1046,17 +1052,12 @@
 	if (!f.file)
 		return -EBADF;
 
-	buf.current_dir = dirent;
-	buf.previous = NULL;
-	buf.count = count;
-	buf.error = 0;
-
-	error = vfs_readdir(f.file, compat_filldir64, &buf);
+	error = iterate_dir(f.file, &buf.ctx);
 	if (error >= 0)
 		error = buf.error;
 	lastdirent = buf.previous;
 	if (lastdirent) {
-		typeof(lastdirent->d_off) d_off = f.file->f_pos;
+		typeof(lastdirent->d_off) d_off = buf.ctx.pos;
 		if (__put_user_unaligned(d_off, &lastdirent->d_off))
 			error = -EFAULT;
 		else

diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 996cdc5..5d19acf 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c

@@ -66,7 +66,6 @@
 #include <linux/gigaset_dev.h>
 
 #ifdef CONFIG_BLOCK
-#include <linux/loop.h>
 #include <linux/cdrom.h>
 #include <linux/fd.h>
 #include <scsi/scsi.h>
@@ -954,8 +953,6 @@
 /* Socket level stuff */
 COMPATIBLE_IOCTL(FIOQSIZE)
 #ifdef CONFIG_BLOCK
-/* loop */
-IGNORE_IOCTL(LOOP_CLR_FD)
 /* md calls this on random blockdevs */
 IGNORE_IOCTL(RAID_VERSION)
 /* qemu/qemu-img might call these two on plain files for probing */

diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 7aabc6a..64e5323 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c

@@ -1532,84 +1532,66 @@
 	return (sd->s_mode >> 12) & 15;
 }
 
-static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
+static int configfs_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct dentry *dentry = filp->f_path.dentry;
+	struct dentry *dentry = file->f_path.dentry;
 	struct super_block *sb = dentry->d_sb;
 	struct configfs_dirent * parent_sd = dentry->d_fsdata;
-	struct configfs_dirent *cursor = filp->private_data;
+	struct configfs_dirent *cursor = file->private_data;
 	struct list_head *p, *q = &cursor->s_sibling;
 	ino_t ino = 0;
-	int i = filp->f_pos;
 
-	switch (i) {
-		case 0:
-			ino = dentry->d_inode->i_ino;
-			if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
-				break;
-			filp->f_pos++;
-			i++;
-			/* fallthrough */
-		case 1:
-			ino = parent_ino(dentry);
-			if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0)
-				break;
-			filp->f_pos++;
-			i++;
-			/* fallthrough */
-		default:
-			if (filp->f_pos == 2) {
-				spin_lock(&configfs_dirent_lock);
-				list_move(q, &parent_sd->s_children);
-				spin_unlock(&configfs_dirent_lock);
-			}
-			for (p=q->next; p!= &parent_sd->s_children; p=p->next) {
-				struct configfs_dirent *next;
-				const char * name;
-				int len;
-				struct inode *inode = NULL;
+	if (!dir_emit_dots(file, ctx))
+		return 0;
+	if (ctx->pos == 2) {
+		spin_lock(&configfs_dirent_lock);
+		list_move(q, &parent_sd->s_children);
+		spin_unlock(&configfs_dirent_lock);
+	}
+	for (p = q->next; p != &parent_sd->s_children; p = p->next) {
+		struct configfs_dirent *next;
+		const char *name;
+		int len;
+		struct inode *inode = NULL;
 
-				next = list_entry(p, struct configfs_dirent,
-						   s_sibling);
-				if (!next->s_element)
-					continue;
+		next = list_entry(p, struct configfs_dirent, s_sibling);
+		if (!next->s_element)
+			continue;
 
-				name = configfs_get_name(next);
-				len = strlen(name);
+		name = configfs_get_name(next);
+		len = strlen(name);
 
-				/*
-				 * We'll have a dentry and an inode for
-				 * PINNED items and for open attribute
-				 * files.  We lock here to prevent a race
-				 * with configfs_d_iput() clearing
-				 * s_dentry before calling iput().
-				 *
-				 * Why do we go to the trouble?  If
-				 * someone has an attribute file open,
-				 * the inode number should match until
-				 * they close it.  Beyond that, we don't
-				 * care.
-				 */
-				spin_lock(&configfs_dirent_lock);
-				dentry = next->s_dentry;
-				if (dentry)
-					inode = dentry->d_inode;
-				if (inode)
-					ino = inode->i_ino;
-				spin_unlock(&configfs_dirent_lock);
-				if (!inode)
-					ino = iunique(sb, 2);
+		/*
+		 * We'll have a dentry and an inode for
+		 * PINNED items and for open attribute
+		 * files.  We lock here to prevent a race
+		 * with configfs_d_iput() clearing
+		 * s_dentry before calling iput().
+		 *
+		 * Why do we go to the trouble?  If
+		 * someone has an attribute file open,
+		 * the inode number should match until
+		 * they close it.  Beyond that, we don't
+		 * care.
+		 */
+		spin_lock(&configfs_dirent_lock);
+		dentry = next->s_dentry;
+		if (dentry)
+			inode = dentry->d_inode;
+		if (inode)
+			ino = inode->i_ino;
+		spin_unlock(&configfs_dirent_lock);
+		if (!inode)
+			ino = iunique(sb, 2);
 
-				if (filldir(dirent, name, len, filp->f_pos, ino,
-						 dt_type(next)) < 0)
-					return 0;
+		if (!dir_emit(ctx, name, len, ino, dt_type(next)))
+			return 0;
 
-				spin_lock(&configfs_dirent_lock);
-				list_move(q, p);
-				spin_unlock(&configfs_dirent_lock);
-				p = q;
-				filp->f_pos++;
-			}
+		spin_lock(&configfs_dirent_lock);
+		list_move(q, p);
+		spin_unlock(&configfs_dirent_lock);
+		p = q;
+		ctx->pos++;
 	}
 	return 0;
 }
@@ -1661,7 +1643,7 @@
 	.release	= configfs_dir_close,
 	.llseek		= configfs_dir_lseek,
 	.read		= generic_read_dir,
-	.readdir	= configfs_readdir,
+	.iterate	= configfs_readdir,
 };
 
 int configfs_register_subsystem(struct configfs_subsystem *subsys)

diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 35b1c7b..e501ac3 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c

@@ -349,18 +349,17 @@
 /*
  * Read a cramfs directory entry.
  */
-static int cramfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int cramfs_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct inode *inode = file_inode(filp);
+	struct inode *inode = file_inode(file);
 	struct super_block *sb = inode->i_sb;
 	char *buf;
 	unsigned int offset;
-	int copied;
 
 	/* Offset within the thing. */
-	offset = filp->f_pos;
-	if (offset >= inode->i_size)
+	if (ctx->pos >= inode->i_size)
 		return 0;
+	offset = ctx->pos;
 	/* Directory entries are always 4-byte aligned */
 	if (offset & 3)
 		return -EINVAL;
@@ -369,14 +368,13 @@
 	if (!buf)
 		return -ENOMEM;
 
-	copied = 0;
 	while (offset < inode->i_size) {
 		struct cramfs_inode *de;
 		unsigned long nextoffset;
 		char *name;
 		ino_t ino;
 		umode_t mode;
-		int namelen, error;
+		int namelen;
 
 		mutex_lock(&read_mutex);
 		de = cramfs_read(sb, OFFSET(inode) + offset, sizeof(*de)+CRAMFS_MAXPATHLEN);
@@ -402,13 +400,10 @@
 				break;
 			namelen--;
 		}
-		error = filldir(dirent, buf, namelen, offset, ino, mode >> 12);
-		if (error)
+		if (!dir_emit(ctx, buf, namelen, ino, mode >> 12))
 			break;
 
-		offset = nextoffset;
-		filp->f_pos = offset;
-		copied++;
+		ctx->pos = offset = nextoffset;
 	}
 	kfree(buf);
 	return 0;
@@ -547,7 +542,7 @@
 static const struct file_operations cramfs_directory_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
-	.readdir	= cramfs_readdir,
+	.iterate	= cramfs_readdir,
 };
 
 static const struct inode_operations cramfs_dir_inode_operations = {

diff --git a/fs/dcache.c b/fs/dcache.c
index f09b908..5a23073 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c

@@ -1612,6 +1612,10 @@
  * If a dentry was found and moved, then it is returned.  Otherwise NULL
  * is returned.  This matches the expected return value of ->lookup.
  *
+ * Cluster filesystems may call this function with a negative, hashed dentry.
+ * In that case, we know that the inode will be a regular file, and also this
+ * will only occur during atomic_open. So we need to check for the dentry
+ * being already hashed only in the final case.
  */
 struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
 {
@@ -1636,8 +1640,11 @@
 			security_d_instantiate(dentry, inode);
 			d_rehash(dentry);
 		}
-	} else
-		d_add(dentry, inode);
+	} else {
+		d_instantiate(dentry, inode);
+		if (d_unhashed(dentry))
+			d_rehash(dentry);
+	}
 	return new;
 }
 EXPORT_SYMBOL(d_splice_alias);

diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index 7d58d5b..76feb4b 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c

@@ -138,8 +138,9 @@
 static ssize_t cluster_cluster_name_write(struct dlm_cluster *cl,
 					  const char *buf, size_t len)
 {
-	strncpy(dlm_config.ci_cluster_name, buf, DLM_LOCKSPACE_LEN);
-	strncpy(cl->cl_cluster_name, buf, DLM_LOCKSPACE_LEN);
+	strlcpy(dlm_config.ci_cluster_name, buf,
+				sizeof(dlm_config.ci_cluster_name));
+	strlcpy(cl->cl_cluster_name, buf, sizeof(cl->cl_cluster_name));
 	return len;
 }
 

diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 1b11466..e223a91 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c

@@ -2038,8 +2038,8 @@
 	b = dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1];
 	if (b == 1) {
 		int len = receive_extralen(ms);
-		if (len > DLM_RESNAME_MAXLEN)
-			len = DLM_RESNAME_MAXLEN;
+		if (len > r->res_ls->ls_lvblen)
+			len = r->res_ls->ls_lvblen;
 		memcpy(lkb->lkb_lvbptr, ms->m_extra, len);
 		lkb->lkb_lvbseq = ms->m_lvbseq;
 	}
@@ -3893,8 +3893,8 @@
 		if (!lkb->lkb_lvbptr)
 			return -ENOMEM;
 		len = receive_extralen(ms);
-		if (len > DLM_RESNAME_MAXLEN)
-			len = DLM_RESNAME_MAXLEN;
+		if (len > ls->ls_lvblen)
+			len = ls->ls_lvblen;
 		memcpy(lkb->lkb_lvbptr, ms->m_extra, len);
 	}
 	return 0;

diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 3ca79d3..88556dc 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c

@@ -883,17 +883,24 @@
 void dlm_stop_lockspaces(void)
 {
 	struct dlm_ls *ls;
+	int count;
 
  restart:
+	count = 0;
 	spin_lock(&lslist_lock);
 	list_for_each_entry(ls, &lslist, ls_list) {
-		if (!test_bit(LSFL_RUNNING, &ls->ls_flags))
+		if (!test_bit(LSFL_RUNNING, &ls->ls_flags)) {
+			count++;
 			continue;
+		}
 		spin_unlock(&lslist_lock);
 		log_error(ls, "no userland control daemon, stopping lockspace");
 		dlm_ls_stop(ls);
 		goto restart;
 	}
 	spin_unlock(&lslist_lock);
+
+	if (count)
+		log_print("dlm user daemon left %d lockspaces", count);
 }
 

diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index d0ccd2f..d90909e 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c

@@ -52,7 +52,6 @@
 #include <linux/mutex.h>
 #include <linux/sctp.h>
 #include <linux/slab.h>
-#include <linux/sctp.h>
 #include <net/sctp/sctp.h>
 #include <net/ipv6.h>
 
@@ -126,6 +125,7 @@
 	struct connection *othercon;
 	struct work_struct rwork; /* Receive workqueue */
 	struct work_struct swork; /* Send workqueue */
+	bool try_new_addr;
 };
 #define sock2con(x) ((struct connection *)(x)->sk_user_data)
 
@@ -144,6 +144,7 @@
 	struct list_head list;
 	int nodeid;
 	int addr_count;
+	int curr_addr_index;
 	struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT];
 };
 
@@ -310,7 +311,7 @@
 }
 
 static int nodeid_to_addr(int nodeid, struct sockaddr_storage *sas_out,
-			  struct sockaddr *sa_out)
+			  struct sockaddr *sa_out, bool try_new_addr)
 {
 	struct sockaddr_storage sas;
 	struct dlm_node_addr *na;
@@ -320,8 +321,16 @@
 
 	spin_lock(&dlm_node_addrs_spin);
 	na = find_node_addr(nodeid);
-	if (na && na->addr_count)
-		memcpy(&sas, na->addr[0], sizeof(struct sockaddr_storage));
+	if (na && na->addr_count) {
+		if (try_new_addr) {
+			na->curr_addr_index++;
+			if (na->curr_addr_index == na->addr_count)
+				na->curr_addr_index = 0;
+		}
+
+		memcpy(&sas, na->addr[na->curr_addr_index ],
+			sizeof(struct sockaddr_storage));
+	}
 	spin_unlock(&dlm_node_addrs_spin);
 
 	if (!na)
@@ -353,19 +362,22 @@
 {
 	struct dlm_node_addr *na;
 	int rv = -EEXIST;
+	int addr_i;
 
 	spin_lock(&dlm_node_addrs_spin);
 	list_for_each_entry(na, &dlm_node_addrs, list) {
 		if (!na->addr_count)
 			continue;
 
-		if (!addr_compare(na->addr[0], addr))
-			continue;
-
-		*nodeid = na->nodeid;
-		rv = 0;
-		break;
+		for (addr_i = 0; addr_i < na->addr_count; addr_i++) {
+			if (addr_compare(na->addr[addr_i], addr)) {
+				*nodeid = na->nodeid;
+				rv = 0;
+				goto unlock;
+			}
+		}
 	}
+unlock:
 	spin_unlock(&dlm_node_addrs_spin);
 	return rv;
 }
@@ -561,8 +573,23 @@
 
 static void sctp_init_failed_foreach(struct connection *con)
 {
+
+	/*
+	 * Don't try to recover base con and handle race where the
+	 * other node's assoc init creates a assoc and we get that
+	 * notification, then we get a notification that our attempt
+	 * failed due. This happens when we are still trying the primary
+	 * address, but the other node has already tried secondary addrs
+	 * and found one that worked.
+	 */
+	if (!con->nodeid || con->sctp_assoc)
+		return;
+
+	log_print("Retrying SCTP association init for node %d\n", con->nodeid);
+
+	con->try_new_addr = true;
 	con->sctp_assoc = 0;
-	if (test_and_clear_bit(CF_CONNECT_PENDING, &con->flags)) {
+	if (test_and_clear_bit(CF_INIT_PENDING, &con->flags)) {
 		if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags))
 			queue_work(send_workqueue, &con->swork);
 	}
@@ -579,15 +606,56 @@
 	mutex_unlock(&connections_lock);
 }
 
+static void retry_failed_sctp_send(struct connection *recv_con,
+				   struct sctp_send_failed *sn_send_failed,
+				   char *buf)
+{
+	int len = sn_send_failed->ssf_length - sizeof(struct sctp_send_failed);
+	struct dlm_mhandle *mh;
+	struct connection *con;
+	char *retry_buf;
+	int nodeid = sn_send_failed->ssf_info.sinfo_ppid;
+
+	log_print("Retry sending %d bytes to node id %d", len, nodeid);
+
+	con = nodeid2con(nodeid, 0);
+	if (!con) {
+		log_print("Could not look up con for nodeid %d\n",
+			  nodeid);
+		return;
+	}
+
+	mh = dlm_lowcomms_get_buffer(nodeid, len, GFP_NOFS, &retry_buf);
+	if (!mh) {
+		log_print("Could not allocate buf for retry.");
+		return;
+	}
+	memcpy(retry_buf, buf + sizeof(struct sctp_send_failed), len);
+	dlm_lowcomms_commit_buffer(mh);
+
+	/*
+	 * If we got a assoc changed event before the send failed event then
+	 * we only need to retry the send.
+	 */
+	if (con->sctp_assoc) {
+		if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags))
+			queue_work(send_workqueue, &con->swork);
+	} else
+		sctp_init_failed_foreach(con);
+}
+
 /* Something happened to an association */
 static void process_sctp_notification(struct connection *con,
 				      struct msghdr *msg, char *buf)
 {
 	union sctp_notification *sn = (union sctp_notification *)buf;
 
-	if (sn->sn_header.sn_type == SCTP_ASSOC_CHANGE) {
+	switch (sn->sn_header.sn_type) {
+	case SCTP_SEND_FAILED:
+		retry_failed_sctp_send(con, &sn->sn_send_failed, buf);
+		break;
+	case SCTP_ASSOC_CHANGE:
 		switch (sn->sn_assoc_change.sac_state) {
-
 		case SCTP_COMM_UP:
 		case SCTP_RESTART:
 		{
@@ -662,9 +730,11 @@
 			log_print("connecting to %d sctp association %d",
 				 nodeid, (int)sn->sn_assoc_change.sac_assoc_id);
 
+			new_con->sctp_assoc = sn->sn_assoc_change.sac_assoc_id;
+			new_con->try_new_addr = false;
 			/* Send any pending writes */
 			clear_bit(CF_CONNECT_PENDING, &new_con->flags);
-			clear_bit(CF_INIT_PENDING, &con->flags);
+			clear_bit(CF_INIT_PENDING, &new_con->flags);
 			if (!test_and_set_bit(CF_WRITE_PENDING, &new_con->flags)) {
 				queue_work(send_workqueue, &new_con->swork);
 			}
@@ -683,14 +753,10 @@
 		}
 		break;
 
-		/* We don't know which INIT failed, so clear the PENDING flags
-		 * on them all.  if assoc_id is zero then it will then try
-		 * again */
-
 		case SCTP_CANT_STR_ASSOC:
 		{
+			/* Will retry init when we get the send failed notification */
 			log_print("Can't start SCTP association - retrying");
-			sctp_init_failed();
 		}
 		break;
 
@@ -699,6 +765,8 @@
 				  (int)sn->sn_assoc_change.sac_assoc_id,
 				  sn->sn_assoc_change.sac_state);
 		}
+	default:
+		; /* fall through */
 	}
 }
 
@@ -958,6 +1026,24 @@
 	kfree(e);
 }
 
+/*
+ * writequeue_entry_complete - try to delete and free write queue entry
+ * @e: write queue entry to try to delete
+ * @completed: bytes completed
+ *
+ * writequeue_lock must be held.
+ */
+static void writequeue_entry_complete(struct writequeue_entry *e, int completed)
+{
+	e->offset += completed;
+	e->len -= completed;
+
+	if (e->len == 0 && e->users == 0) {
+		list_del(&e->list);
+		free_entry(e);
+	}
+}
+
 /* Initiate an SCTP association.
    This is a special case of send_to_sock() in that we don't yet have a
    peeled-off socket for this association, so we use the listening socket
@@ -977,15 +1063,14 @@
 	int addrlen;
 	struct kvec iov[1];
 
+	mutex_lock(&con->sock_mutex);
 	if (test_and_set_bit(CF_INIT_PENDING, &con->flags))
-		return;
+		goto unlock;
 
-	if (con->retries++ > MAX_CONNECT_RETRIES)
-		return;
-
-	if (nodeid_to_addr(con->nodeid, NULL, (struct sockaddr *)&rem_addr)) {
+	if (nodeid_to_addr(con->nodeid, NULL, (struct sockaddr *)&rem_addr,
+			   con->try_new_addr)) {
 		log_print("no address for nodeid %d", con->nodeid);
-		return;
+		goto unlock;
 	}
 	base_con = nodeid2con(0, 0);
 	BUG_ON(base_con == NULL);
@@ -1003,17 +1088,25 @@
 	if (list_empty(&con->writequeue)) {
 		spin_unlock(&con->writequeue_lock);
 		log_print("writequeue empty for nodeid %d", con->nodeid);
-		return;
+		goto unlock;
 	}
 
 	e = list_first_entry(&con->writequeue, struct writequeue_entry, list);
 	len = e->len;
 	offset = e->offset;
-	spin_unlock(&con->writequeue_lock);
 
 	/* Send the first block off the write queue */
 	iov[0].iov_base = page_address(e->page)+offset;
 	iov[0].iov_len = len;
+	spin_unlock(&con->writequeue_lock);
+
+	if (rem_addr.ss_family == AF_INET) {
+		struct sockaddr_in *sin = (struct sockaddr_in *)&rem_addr;
+		log_print("Trying to connect to %pI4", &sin->sin_addr.s_addr);
+	} else {
+		struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&rem_addr;
+		log_print("Trying to connect to %pI6", &sin6->sin6_addr);
+	}
 
 	cmsg = CMSG_FIRSTHDR(&outmessage);
 	cmsg->cmsg_level = IPPROTO_SCTP;
@@ -1021,8 +1114,9 @@
 	cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
 	sinfo = CMSG_DATA(cmsg);
 	memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
-	sinfo->sinfo_ppid = cpu_to_le32(dlm_our_nodeid());
+	sinfo->sinfo_ppid = cpu_to_le32(con->nodeid);
 	outmessage.msg_controllen = cmsg->cmsg_len;
+	sinfo->sinfo_flags |= SCTP_ADDR_OVER;
 
 	ret = kernel_sendmsg(base_con->sock, &outmessage, iov, 1, len);
 	if (ret < 0) {
@@ -1035,15 +1129,12 @@
 	}
 	else {
 		spin_lock(&con->writequeue_lock);
-		e->offset += ret;
-		e->len -= ret;
-
-		if (e->len == 0 && e->users == 0) {
-			list_del(&e->list);
-			free_entry(e);
-		}
+		writequeue_entry_complete(e, ret);
 		spin_unlock(&con->writequeue_lock);
 	}
+
+unlock:
+	mutex_unlock(&con->sock_mutex);
 }
 
 /* Connect a new socket to its peer */
@@ -1075,7 +1166,7 @@
 		goto out_err;
 
 	memset(&saddr, 0, sizeof(saddr));
-	result = nodeid_to_addr(con->nodeid, &saddr, NULL);
+	result = nodeid_to_addr(con->nodeid, &saddr, NULL, false);
 	if (result < 0) {
 		log_print("no address for nodeid %d", con->nodeid);
 		goto out_err;
@@ -1254,6 +1345,7 @@
 	int result = -EINVAL, num = 1, i, addr_len;
 	struct connection *con = nodeid2con(0, GFP_NOFS);
 	int bufsize = NEEDED_RMEM;
+	int one = 1;
 
 	if (!con)
 		return -ENOMEM;
@@ -1288,6 +1380,11 @@
 		goto create_delsock;
 	}
 
+	result = kernel_setsockopt(sock, SOL_SCTP, SCTP_NODELAY, (char *)&one,
+				   sizeof(one));
+	if (result < 0)
+		log_print("Could not set SCTP NODELAY error %d\n", result);
+
 	/* Init con struct */
 	sock->sk->sk_user_data = con;
 	con->sock = sock;
@@ -1493,13 +1590,7 @@
 		}
 
 		spin_lock(&con->writequeue_lock);
-		e->offset += ret;
-		e->len -= ret;
-
-		if (e->len == 0 && e->users == 0) {
-			list_del(&e->list);
-			free_entry(e);
-		}
+		writequeue_entry_complete(e, ret);
 	}
 	spin_unlock(&con->writequeue_lock);
 out:

diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index a7abbea..9aa05e0 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c

@@ -68,9 +68,9 @@
 }
 
 struct ecryptfs_getdents_callback {
-	void *dirent;
+	struct dir_context ctx;
+	struct dir_context *caller;
 	struct dentry *dentry;
-	filldir_t filldir;
 	int filldir_called;
 	int entries_written;
 };
@@ -96,9 +96,10 @@
 		       rc);
 		goto out;
 	}
-	rc = buf->filldir(buf->dirent, name, name_size, offset, ino, d_type);
+	buf->caller->pos = buf->ctx.pos;
+	rc = !dir_emit(buf->caller, name, name_size, ino, d_type);
 	kfree(name);
-	if (rc >= 0)
+	if (!rc)
 		buf->entries_written++;
 out:
 	return rc;
@@ -107,27 +108,23 @@
 /**
  * ecryptfs_readdir
  * @file: The eCryptfs directory file
- * @dirent: Directory entry handle
- * @filldir: The filldir callback function
+ * @ctx: The actor to feed the entries to
  */
-static int ecryptfs_readdir(struct file *file, void *dirent, filldir_t filldir)
+static int ecryptfs_readdir(struct file *file, struct dir_context *ctx)
 {
 	int rc;
 	struct file *lower_file;
 	struct inode *inode;
-	struct ecryptfs_getdents_callback buf;
-
+	struct ecryptfs_getdents_callback buf = {
+		.ctx.actor = ecryptfs_filldir,
+		.caller = ctx,
+		.dentry = file->f_path.dentry
+	};
 	lower_file = ecryptfs_file_to_lower(file);
-	lower_file->f_pos = file->f_pos;
+	lower_file->f_pos = ctx->pos;
 	inode = file_inode(file);
-	memset(&buf, 0, sizeof(buf));
-	buf.dirent = dirent;
-	buf.dentry = file->f_path.dentry;
-	buf.filldir = filldir;
-	buf.filldir_called = 0;
-	buf.entries_written = 0;
-	rc = vfs_readdir(lower_file, ecryptfs_filldir, (void *)&buf);
-	file->f_pos = lower_file->f_pos;
+	rc = iterate_dir(lower_file, &buf.ctx);
+	ctx->pos = buf.ctx.pos;
 	if (rc < 0)
 		goto out;
 	if (buf.filldir_called && !buf.entries_written)
@@ -344,7 +341,7 @@
 #endif
 
 const struct file_operations ecryptfs_dir_fops = {
-	.readdir = ecryptfs_readdir,
+	.iterate = ecryptfs_readdir,
 	.read = generic_read_dir,
 	.unlocked_ioctl = ecryptfs_unlocked_ioctl,
 #ifdef CONFIG_COMPAT
@@ -365,7 +362,7 @@
 	.aio_read = ecryptfs_read_update_atime,
 	.write = do_sync_write,
 	.aio_write = generic_file_aio_write,
-	.readdir = ecryptfs_readdir,
+	.iterate = ecryptfs_readdir,
 	.unlocked_ioctl = ecryptfs_unlocked_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl = ecryptfs_compat_ioctl,

diff --git a/fs/efs/dir.c b/fs/efs/dir.c
index 055a9e9..b72307c 100644
--- a/fs/efs/dir.c
+++ b/fs/efs/dir.c

@@ -7,40 +7,38 @@
 #include <linux/buffer_head.h>
 #include "efs.h"
 
-static int efs_readdir(struct file *, void *, filldir_t);
+static int efs_readdir(struct file *, struct dir_context *);
 
 const struct file_operations efs_dir_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
-	.readdir	= efs_readdir,
+	.iterate	= efs_readdir,
 };
 
 const struct inode_operations efs_dir_inode_operations = {
 	.lookup		= efs_lookup,
 };
 
-static int efs_readdir(struct file *filp, void *dirent, filldir_t filldir) {
-	struct inode *inode = file_inode(filp);
-	struct buffer_head *bh;
-
-	struct efs_dir		*dirblock;
-	struct efs_dentry	*dirslot;
-	efs_ino_t		inodenum;
+static int efs_readdir(struct file *file, struct dir_context *ctx)
+{
+	struct inode *inode = file_inode(file);
 	efs_block_t		block;
-	int			slot, namelen;
-	char			*nameptr;
+	int			slot;
 
 	if (inode->i_size & (EFS_DIRBSIZE-1))
 		printk(KERN_WARNING "EFS: WARNING: readdir(): directory size not a multiple of EFS_DIRBSIZE\n");
 
 	/* work out where this entry can be found */
-	block = filp->f_pos >> EFS_DIRBSIZE_BITS;
+	block = ctx->pos >> EFS_DIRBSIZE_BITS;
 
 	/* each block contains at most 256 slots */
-	slot  = filp->f_pos & 0xff;
+	slot  = ctx->pos & 0xff;
 
 	/* look at all blocks */
 	while (block < inode->i_blocks) {
+		struct efs_dir		*dirblock;
+		struct buffer_head *bh;
+
 		/* read the dir block */
 		bh = sb_bread(inode->i_sb, efs_bmap(inode, block));
 
@@ -57,11 +55,14 @@
 			break;
 		}
 
-		while (slot < dirblock->slots) {
-			if (dirblock->space[slot] == 0) {
-				slot++;
+		for (; slot < dirblock->slots; slot++) {
+			struct efs_dentry *dirslot;
+			efs_ino_t inodenum;
+			const char *nameptr;
+			int namelen;
+
+			if (dirblock->space[slot] == 0)
 				continue;
-			}
 
 			dirslot  = (struct efs_dentry *) (((char *) bh->b_data) + EFS_SLOTAT(dirblock, slot));
 
@@ -72,39 +73,29 @@
 #ifdef DEBUG
 			printk(KERN_DEBUG "EFS: readdir(): block %d slot %d/%d: inode %u, name \"%s\", namelen %u\n", block, slot, dirblock->slots-1, inodenum, nameptr, namelen);
 #endif
-			if (namelen > 0) {
-				/* found the next entry */
-				filp->f_pos = (block << EFS_DIRBSIZE_BITS) | slot;
+			if (!namelen)
+				continue;
+			/* found the next entry */
+			ctx->pos = (block << EFS_DIRBSIZE_BITS) | slot;
 
-				/* copy filename and data in dirslot */
-				filldir(dirent, nameptr, namelen, filp->f_pos, inodenum, DT_UNKNOWN);
-
-				/* sanity check */
-				if (nameptr - (char *) dirblock + namelen > EFS_DIRBSIZE) {
-					printk(KERN_WARNING "EFS: directory entry %d exceeds directory block\n", slot);
-					slot++;
-					continue;
-				}
-
-				/* store position of next slot */
-				if (++slot == dirblock->slots) {
-					slot = 0;
-					block++;
-				}
-				brelse(bh);
-				filp->f_pos = (block << EFS_DIRBSIZE_BITS) | slot;
-				goto out;
+			/* sanity check */
+			if (nameptr - (char *) dirblock + namelen > EFS_DIRBSIZE) {
+				printk(KERN_WARNING "EFS: directory entry %d exceeds directory block\n", slot);
+				continue;
 			}
-			slot++;
+
+			/* copy filename and data in dirslot */
+			if (!dir_emit(ctx, nameptr, namelen, inodenum, DT_UNKNOWN)) {
+				brelse(bh);
+				return 0;
+			}
 		}
 		brelse(bh);
 
 		slot = 0;
 		block++;
 	}
-
-	filp->f_pos = (block << EFS_DIRBSIZE_BITS) | slot;
-out:
+	ctx->pos = (block << EFS_DIRBSIZE_BITS) | slot;
 	return 0;
 }
 

diff --git a/fs/exec.c b/fs/exec.c
index 6430195..ffd7a81 100644
--- a/fs/exec.c
+++ b/fs/exec.c

@@ -1135,13 +1135,6 @@
 			set_dumpable(current->mm, suid_dumpable);
 	}
 
-	/*
-	 * Flush performance counters when crossing a
-	 * security domain:
-	 */
-	if (!get_dumpable(current->mm))
-		perf_event_exit_task(current);
-
 	/* An exec changes our domain. We are no longer part of the thread
 	   group */
 
@@ -1205,6 +1198,15 @@
 
 	commit_creds(bprm->cred);
 	bprm->cred = NULL;
+
+	/*
+	 * Disable monitoring for regular users
+	 * when executing setuid binaries. Must
+	 * wait until new credentials are committed
+	 * by commit_creds() above
+	 */
+	if (get_dumpable(current->mm) != SUID_DUMP_USER)
+		perf_event_exit_task(current);
 	/*
 	 * cred_guard_mutex must be held at least to this point to prevent
 	 * ptrace_attach() from altering our determination of the task's

diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
index 4637589..49f51ab 100644
--- a/fs/exofs/dir.c
+++ b/fs/exofs/dir.c

@@ -239,22 +239,19 @@
 }
 
 static int
-exofs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+exofs_readdir(struct file *file, struct dir_context *ctx)
 {
-	loff_t pos = filp->f_pos;
-	struct inode *inode = file_inode(filp);
+	loff_t pos = ctx->pos;
+	struct inode *inode = file_inode(file);
 	unsigned int offset = pos & ~PAGE_CACHE_MASK;
 	unsigned long n = pos >> PAGE_CACHE_SHIFT;
 	unsigned long npages = dir_pages(inode);
 	unsigned chunk_mask = ~(exofs_chunk_size(inode)-1);
-	unsigned char *types = NULL;
-	int need_revalidate = (filp->f_version != inode->i_version);
+	int need_revalidate = (file->f_version != inode->i_version);
 
 	if (pos > inode->i_size - EXOFS_DIR_REC_LEN(1))
 		return 0;
 
-	types = exofs_filetype_table;
-
 	for ( ; n < npages; n++, offset = 0) {
 		char *kaddr, *limit;
 		struct exofs_dir_entry *de;
@@ -263,7 +260,7 @@
 		if (IS_ERR(page)) {
 			EXOFS_ERR("ERROR: bad page in directory(0x%lx)\n",
 				  inode->i_ino);
-			filp->f_pos += PAGE_CACHE_SIZE - offset;
+			ctx->pos += PAGE_CACHE_SIZE - offset;
 			return PTR_ERR(page);
 		}
 		kaddr = page_address(page);
@@ -271,9 +268,9 @@
 			if (offset) {
 				offset = exofs_validate_entry(kaddr, offset,
 								chunk_mask);
-				filp->f_pos = (n<<PAGE_CACHE_SHIFT) + offset;
+				ctx->pos = (n<<PAGE_CACHE_SHIFT) + offset;
 			}
-			filp->f_version = inode->i_version;
+			file->f_version = inode->i_version;
 			need_revalidate = 0;
 		}
 		de = (struct exofs_dir_entry *)(kaddr + offset);
@@ -288,27 +285,24 @@
 				return -EIO;
 			}
 			if (de->inode_no) {
-				int over;
-				unsigned char d_type = DT_UNKNOWN;
+				unsigned char t;
 
-				if (types && de->file_type < EXOFS_FT_MAX)
-					d_type = types[de->file_type];
+				if (de->file_type < EXOFS_FT_MAX)
+					t = exofs_filetype_table[de->file_type];
+				else
+					t = DT_UNKNOWN;
 
-				offset = (char *)de - kaddr;
-				over = filldir(dirent, de->name, de->name_len,
-						(n<<PAGE_CACHE_SHIFT) | offset,
+				if (!dir_emit(ctx, de->name, de->name_len,
 						le64_to_cpu(de->inode_no),
-						d_type);
-				if (over) {
+						t)) {
 					exofs_put_page(page);
 					return 0;
 				}
 			}
-			filp->f_pos += le16_to_cpu(de->rec_len);
+			ctx->pos += le16_to_cpu(de->rec_len);
 		}
 		exofs_put_page(page);
 	}
-
 	return 0;
 }
 
@@ -669,5 +663,5 @@
 const struct file_operations exofs_dir_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
-	.readdir	= exofs_readdir,
+	.iterate	= exofs_readdir,
 };

diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index d1f80ab..2ec8eb1 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c

@@ -953,9 +953,11 @@
 	return 0;
 }
 
-static void exofs_invalidatepage(struct page *page, unsigned long offset)
+static void exofs_invalidatepage(struct page *page, unsigned int offset,
+				 unsigned int length)
 {
-	EXOFS_DBGMSG("page 0x%lx offset 0x%lx\n", page->index, offset);
+	EXOFS_DBGMSG("page 0x%lx offset 0x%x length 0x%x\n",
+		     page->index, offset, length);
 	WARN_ON(1);
 }
 

diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 262fc99..293bc2e 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c

@@ -212,6 +212,7 @@
 }
 
 struct getdents_callback {
+	struct dir_context ctx;
 	char *name;		/* name that was found. It already points to a
 				   buffer NAME_MAX+1 is size */
 	unsigned long ino;	/* the inum we are looking for */
@@ -254,7 +255,11 @@
 	struct inode *dir = path->dentry->d_inode;
 	int error;
 	struct file *file;
-	struct getdents_callback buffer;
+	struct getdents_callback buffer = {
+		.ctx.actor = filldir_one,
+		.name = name,
+		.ino = child->d_inode->i_ino
+	};
 
 	error = -ENOTDIR;
 	if (!dir || !S_ISDIR(dir->i_mode))
@@ -271,17 +276,14 @@
 		goto out;
 
 	error = -EINVAL;
-	if (!file->f_op->readdir)
+	if (!file->f_op->iterate)
 		goto out_close;
 
-	buffer.name = name;
-	buffer.ino = child->d_inode->i_ino;
-	buffer.found = 0;
 	buffer.sequence = 0;
 	while (1) {
 		int old_seq = buffer.sequence;
 
-		error = vfs_readdir(file, filldir_one, &buffer);
+		error = iterate_dir(file, &buffer.ctx);
 		if (buffer.found) {
 			error = 0;
 			break;

diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 4237722bf..6e1d4ab 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c

@@ -287,17 +287,17 @@
 }
 
 static int
-ext2_readdir (struct file * filp, void * dirent, filldir_t filldir)
+ext2_readdir(struct file *file, struct dir_context *ctx)
 {
-	loff_t pos = filp->f_pos;
-	struct inode *inode = file_inode(filp);
+	loff_t pos = ctx->pos;
+	struct inode *inode = file_inode(file);
 	struct super_block *sb = inode->i_sb;
 	unsigned int offset = pos & ~PAGE_CACHE_MASK;
 	unsigned long n = pos >> PAGE_CACHE_SHIFT;
 	unsigned long npages = dir_pages(inode);
 	unsigned chunk_mask = ~(ext2_chunk_size(inode)-1);
 	unsigned char *types = NULL;
-	int need_revalidate = filp->f_version != inode->i_version;
+	int need_revalidate = file->f_version != inode->i_version;
 
 	if (pos > inode->i_size - EXT2_DIR_REC_LEN(1))
 		return 0;
@@ -314,16 +314,16 @@
 			ext2_error(sb, __func__,
 				   "bad page in #%lu",
 				   inode->i_ino);
-			filp->f_pos += PAGE_CACHE_SIZE - offset;
+			ctx->pos += PAGE_CACHE_SIZE - offset;
 			return PTR_ERR(page);
 		}
 		kaddr = page_address(page);
 		if (unlikely(need_revalidate)) {
 			if (offset) {
 				offset = ext2_validate_entry(kaddr, offset, chunk_mask);
-				filp->f_pos = (n<<PAGE_CACHE_SHIFT) + offset;
+				ctx->pos = (n<<PAGE_CACHE_SHIFT) + offset;
 			}
-			filp->f_version = inode->i_version;
+			file->f_version = inode->i_version;
 			need_revalidate = 0;
 		}
 		de = (ext2_dirent *)(kaddr+offset);
@@ -336,22 +336,19 @@
 				return -EIO;
 			}
 			if (de->inode) {
-				int over;
 				unsigned char d_type = DT_UNKNOWN;
 
 				if (types && de->file_type < EXT2_FT_MAX)
 					d_type = types[de->file_type];
 
-				offset = (char *)de - kaddr;
-				over = filldir(dirent, de->name, de->name_len,
-						(n<<PAGE_CACHE_SHIFT) | offset,
-						le32_to_cpu(de->inode), d_type);
-				if (over) {
+				if (!dir_emit(ctx, de->name, de->name_len,
+						le32_to_cpu(de->inode),
+						d_type)) {
 					ext2_put_page(page);
 					return 0;
 				}
 			}
-			filp->f_pos += ext2_rec_len_from_disk(de->rec_len);
+			ctx->pos += ext2_rec_len_from_disk(de->rec_len);
 		}
 		ext2_put_page(page);
 	}
@@ -724,7 +721,7 @@
 const struct file_operations ext2_dir_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
-	.readdir	= ext2_readdir,
+	.iterate	= ext2_readdir,
 	.unlocked_ioctl = ext2_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= ext2_compat_ioctl,

diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index 87eccbb..f522425 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c

@@ -28,8 +28,7 @@
 	DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
 };
 
-static int ext3_dx_readdir(struct file * filp,
-			   void * dirent, filldir_t filldir);
+static int ext3_dx_readdir(struct file *, struct dir_context *);
 
 static unsigned char get_dtype(struct super_block *sb, int filetype)
 {
@@ -91,36 +90,30 @@
 	return error_msg == NULL ? 1 : 0;
 }
 
-static int ext3_readdir(struct file * filp,
-			 void * dirent, filldir_t filldir)
+static int ext3_readdir(struct file *file, struct dir_context *ctx)
 {
-	int error = 0;
 	unsigned long offset;
-	int i, stored;
+	int i;
 	struct ext3_dir_entry_2 *de;
 	int err;
-	struct inode *inode = file_inode(filp);
+	struct inode *inode = file_inode(file);
 	struct super_block *sb = inode->i_sb;
-	int ret = 0;
 	int dir_has_error = 0;
 
 	if (is_dx_dir(inode)) {
-		err = ext3_dx_readdir(filp, dirent, filldir);
-		if (err != ERR_BAD_DX_DIR) {
-			ret = err;
-			goto out;
-		}
+		err = ext3_dx_readdir(file, ctx);
+		if (err != ERR_BAD_DX_DIR)
+			return err;
 		/*
 		 * We don't set the inode dirty flag since it's not
 		 * critical that it get flushed back to the disk.
 		 */
-		EXT3_I(file_inode(filp))->i_flags &= ~EXT3_INDEX_FL;
+		EXT3_I(inode)->i_flags &= ~EXT3_INDEX_FL;
 	}
-	stored = 0;
-	offset = filp->f_pos & (sb->s_blocksize - 1);
+	offset = ctx->pos & (sb->s_blocksize - 1);
 
-	while (!error && !stored && filp->f_pos < inode->i_size) {
-		unsigned long blk = filp->f_pos >> EXT3_BLOCK_SIZE_BITS(sb);
+	while (ctx->pos < inode->i_size) {
+		unsigned long blk = ctx->pos >> EXT3_BLOCK_SIZE_BITS(sb);
 		struct buffer_head map_bh;
 		struct buffer_head *bh = NULL;
 
@@ -129,12 +122,12 @@
 		if (err > 0) {
 			pgoff_t index = map_bh.b_blocknr >>
 					(PAGE_CACHE_SHIFT - inode->i_blkbits);
-			if (!ra_has_index(&filp->f_ra, index))
+			if (!ra_has_index(&file->f_ra, index))
 				page_cache_sync_readahead(
 					sb->s_bdev->bd_inode->i_mapping,
-					&filp->f_ra, filp,
+					&file->f_ra, file,
 					index, 1);
-			filp->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
+			file->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
 			bh = ext3_bread(NULL, inode, blk, 0, &err);
 		}
 
@@ -146,22 +139,21 @@
 			if (!dir_has_error) {
 				ext3_error(sb, __func__, "directory #%lu "
 					"contains a hole at offset %lld",
-					inode->i_ino, filp->f_pos);
+					inode->i_ino, ctx->pos);
 				dir_has_error = 1;
 			}
 			/* corrupt size?  Maybe no more blocks to read */
-			if (filp->f_pos > inode->i_blocks << 9)
+			if (ctx->pos > inode->i_blocks << 9)
 				break;
-			filp->f_pos += sb->s_blocksize - offset;
+			ctx->pos += sb->s_blocksize - offset;
 			continue;
 		}
 
-revalidate:
 		/* If the dir block has changed since the last call to
 		 * readdir(2), then we might be pointing to an invalid
 		 * dirent right now.  Scan from the start of the block
 		 * to make sure. */
-		if (filp->f_version != inode->i_version) {
+		if (offset && file->f_version != inode->i_version) {
 			for (i = 0; i < sb->s_blocksize && i < offset; ) {
 				de = (struct ext3_dir_entry_2 *)
 					(bh->b_data + i);
@@ -177,53 +169,40 @@
 				i += ext3_rec_len_from_disk(de->rec_len);
 			}
 			offset = i;
-			filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
+			ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1))
 				| offset;
-			filp->f_version = inode->i_version;
+			file->f_version = inode->i_version;
 		}
 
-		while (!error && filp->f_pos < inode->i_size
+		while (ctx->pos < inode->i_size
 		       && offset < sb->s_blocksize) {
 			de = (struct ext3_dir_entry_2 *) (bh->b_data + offset);
 			if (!ext3_check_dir_entry ("ext3_readdir", inode, de,
 						   bh, offset)) {
-				/* On error, skip the f_pos to the
+				/* On error, skip ctx->pos to the
                                    next block. */
-				filp->f_pos = (filp->f_pos |
+				ctx->pos = (ctx->pos |
 						(sb->s_blocksize - 1)) + 1;
-				brelse (bh);
-				ret = stored;
-				goto out;
+				break;
 			}
 			offset += ext3_rec_len_from_disk(de->rec_len);
 			if (le32_to_cpu(de->inode)) {
-				/* We might block in the next section
-				 * if the data destination is
-				 * currently swapped out.  So, use a
-				 * version stamp to detect whether or
-				 * not the directory has been modified
-				 * during the copy operation.
-				 */
-				u64 version = filp->f_version;
-
-				error = filldir(dirent, de->name,
-						de->name_len,
-						filp->f_pos,
-						le32_to_cpu(de->inode),
-						get_dtype(sb, de->file_type));
-				if (error)
-					break;
-				if (version != filp->f_version)
-					goto revalidate;
-				stored ++;
+				if (!dir_emit(ctx, de->name, de->name_len,
+					      le32_to_cpu(de->inode),
+					      get_dtype(sb, de->file_type))) {
+					brelse(bh);
+					return 0;
+				}
 			}
-			filp->f_pos += ext3_rec_len_from_disk(de->rec_len);
+			ctx->pos += ext3_rec_len_from_disk(de->rec_len);
 		}
 		offset = 0;
 		brelse (bh);
+		if (ctx->pos < inode->i_size)
+			if (!dir_relax(inode))
+				return 0;
 	}
-out:
-	return ret;
+	return 0;
 }
 
 static inline int is_32bit_api(void)
@@ -452,62 +431,54 @@
  * for all entres on the fname linked list.  (Normally there is only
  * one entry on the linked list, unless there are 62 bit hash collisions.)
  */
-static int call_filldir(struct file * filp, void * dirent,
-			filldir_t filldir, struct fname *fname)
+static bool call_filldir(struct file *file, struct dir_context *ctx,
+			struct fname *fname)
 {
-	struct dir_private_info *info = filp->private_data;
-	loff_t	curr_pos;
-	struct inode *inode = file_inode(filp);
-	struct super_block * sb;
-	int error;
-
-	sb = inode->i_sb;
+	struct dir_private_info *info = file->private_data;
+	struct inode *inode = file_inode(file);
+	struct super_block *sb = inode->i_sb;
 
 	if (!fname) {
 		printk("call_filldir: called with null fname?!?\n");
-		return 0;
+		return true;
 	}
-	curr_pos = hash2pos(filp, fname->hash, fname->minor_hash);
+	ctx->pos = hash2pos(file, fname->hash, fname->minor_hash);
 	while (fname) {
-		error = filldir(dirent, fname->name,
-				fname->name_len, curr_pos,
+		if (!dir_emit(ctx, fname->name, fname->name_len,
 				fname->inode,
-				get_dtype(sb, fname->file_type));
-		if (error) {
-			filp->f_pos = curr_pos;
+				get_dtype(sb, fname->file_type))) {
 			info->extra_fname = fname;
-			return error;
+			return false;
 		}
 		fname = fname->next;
 	}
-	return 0;
+	return true;
 }
 
-static int ext3_dx_readdir(struct file * filp,
-			 void * dirent, filldir_t filldir)
+static int ext3_dx_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct dir_private_info *info = filp->private_data;
-	struct inode *inode = file_inode(filp);
+	struct dir_private_info *info = file->private_data;
+	struct inode *inode = file_inode(file);
 	struct fname *fname;
 	int	ret;
 
 	if (!info) {
-		info = ext3_htree_create_dir_info(filp, filp->f_pos);
+		info = ext3_htree_create_dir_info(file, ctx->pos);
 		if (!info)
 			return -ENOMEM;
-		filp->private_data = info;
+		file->private_data = info;
 	}
 
-	if (filp->f_pos == ext3_get_htree_eof(filp))
+	if (ctx->pos == ext3_get_htree_eof(file))
 		return 0;	/* EOF */
 
 	/* Some one has messed with f_pos; reset the world */
-	if (info->last_pos != filp->f_pos) {
+	if (info->last_pos != ctx->pos) {
 		free_rb_tree_fname(&info->root);
 		info->curr_node = NULL;
 		info->extra_fname = NULL;
-		info->curr_hash = pos2maj_hash(filp, filp->f_pos);
-		info->curr_minor_hash = pos2min_hash(filp, filp->f_pos);
+		info->curr_hash = pos2maj_hash(file, ctx->pos);
+		info->curr_minor_hash = pos2min_hash(file, ctx->pos);
 	}
 
 	/*
@@ -515,7 +486,7 @@
 	 * chain, return them first.
 	 */
 	if (info->extra_fname) {
-		if (call_filldir(filp, dirent, filldir, info->extra_fname))
+		if (!call_filldir(file, ctx, info->extra_fname))
 			goto finished;
 		info->extra_fname = NULL;
 		goto next_node;
@@ -529,17 +500,17 @@
 		 * cached entries.
 		 */
 		if ((!info->curr_node) ||
-		    (filp->f_version != inode->i_version)) {
+		    (file->f_version != inode->i_version)) {
 			info->curr_node = NULL;
 			free_rb_tree_fname(&info->root);
-			filp->f_version = inode->i_version;
-			ret = ext3_htree_fill_tree(filp, info->curr_hash,
+			file->f_version = inode->i_version;
+			ret = ext3_htree_fill_tree(file, info->curr_hash,
 						   info->curr_minor_hash,
 						   &info->next_hash);
 			if (ret < 0)
 				return ret;
 			if (ret == 0) {
-				filp->f_pos = ext3_get_htree_eof(filp);
+				ctx->pos = ext3_get_htree_eof(file);
 				break;
 			}
 			info->curr_node = rb_first(&info->root);
@@ -548,7 +519,7 @@
 		fname = rb_entry(info->curr_node, struct fname, rb_hash);
 		info->curr_hash = fname->hash;
 		info->curr_minor_hash = fname->minor_hash;
-		if (call_filldir(filp, dirent, filldir, fname))
+		if (!call_filldir(file, ctx, fname))
 			break;
 	next_node:
 		info->curr_node = rb_next(info->curr_node);
@@ -559,7 +530,7 @@
 			info->curr_minor_hash = fname->minor_hash;
 		} else {
 			if (info->next_hash == ~0) {
-				filp->f_pos = ext3_get_htree_eof(filp);
+				ctx->pos = ext3_get_htree_eof(file);
 				break;
 			}
 			info->curr_hash = info->next_hash;
@@ -567,7 +538,7 @@
 		}
 	}
 finished:
-	info->last_pos = filp->f_pos;
+	info->last_pos = ctx->pos;
 	return 0;
 }
 
@@ -582,7 +553,7 @@
 const struct file_operations ext3_dir_operations = {
 	.llseek		= ext3_dir_llseek,
 	.read		= generic_read_dir,
-	.readdir	= ext3_readdir,
+	.iterate	= ext3_readdir,
 	.unlocked_ioctl = ext3_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= ext3_compat_ioctl,

diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 23c7128..f67668f 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c

@@ -1825,19 +1825,20 @@
 	return mpage_readpages(mapping, pages, nr_pages, ext3_get_block);
 }
 
-static void ext3_invalidatepage(struct page *page, unsigned long offset)
+static void ext3_invalidatepage(struct page *page, unsigned int offset,
+				unsigned int length)
 {
 	journal_t *journal = EXT3_JOURNAL(page->mapping->host);
 
-	trace_ext3_invalidatepage(page, offset);
+	trace_ext3_invalidatepage(page, offset, length);
 
 	/*
 	 * If it's a full truncate we just forget about the pending dirtying
 	 */
-	if (offset == 0)
+	if (offset == 0 && length == PAGE_CACHE_SIZE)
 		ClearPageChecked(page);
 
-	journal_invalidatepage(journal, page, offset);
+	journal_invalidatepage(journal, page, offset, length);
 }
 
 static int ext3_releasepage(struct page *page, gfp_t wait)

diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 692de13..cea8ecf 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c

@@ -576,11 +576,8 @@
 		if (!ext3_check_dir_entry("htree_dirblock_to_tree", dir, de, bh,
 					(block<<EXT3_BLOCK_SIZE_BITS(dir->i_sb))
 						+((char *)de - bh->b_data))) {
-			/* On error, skip the f_pos to the next block. */
-			dir_file->f_pos = (dir_file->f_pos |
-					(dir->i_sb->s_blocksize - 1)) + 1;
-			brelse (bh);
-			return count;
+			/* silently ignore the rest of the block */
+			break;
 		}
 		ext3fs_dirhash(de->name, de->name_len, hinfo);
 		if ((hinfo->hash < start_hash) ||

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index d0f13ea..5833939 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c

@@ -682,11 +682,15 @@
 
 static inline int test_root(ext4_group_t a, int b)
 {
-	int num = b;
-
-	while (a > num)
-		num *= b;
-	return num == a;
+	while (1) {
+		if (a < b)
+			return 0;
+		if (a == b)
+			return 1;
+		if ((a % b) != 0)
+			return 0;
+		a = a / b;
+	}
 }
 
 static int ext4_group_sparse(ext4_group_t group)

diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index f8d56e4..3c7d288 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c

@@ -29,8 +29,7 @@
 #include "ext4.h"
 #include "xattr.h"
 
-static int ext4_dx_readdir(struct file *filp,
-			   void *dirent, filldir_t filldir);
+static int ext4_dx_readdir(struct file *, struct dir_context *);
 
 /**
  * Check if the given dir-inode refers to an htree-indexed directory
@@ -103,60 +102,56 @@
 	return 1;
 }
 
-static int ext4_readdir(struct file *filp,
-			 void *dirent, filldir_t filldir)
+static int ext4_readdir(struct file *file, struct dir_context *ctx)
 {
-	int error = 0;
 	unsigned int offset;
 	int i, stored;
 	struct ext4_dir_entry_2 *de;
 	int err;
-	struct inode *inode = file_inode(filp);
+	struct inode *inode = file_inode(file);
 	struct super_block *sb = inode->i_sb;
-	int ret = 0;
 	int dir_has_error = 0;
 
 	if (is_dx_dir(inode)) {
-		err = ext4_dx_readdir(filp, dirent, filldir);
+		err = ext4_dx_readdir(file, ctx);
 		if (err != ERR_BAD_DX_DIR) {
-			ret = err;
-			goto out;
+			return err;
 		}
 		/*
 		 * We don't set the inode dirty flag since it's not
 		 * critical that it get flushed back to the disk.
 		 */
-		ext4_clear_inode_flag(file_inode(filp),
+		ext4_clear_inode_flag(file_inode(file),
 				      EXT4_INODE_INDEX);
 	}
 
 	if (ext4_has_inline_data(inode)) {
 		int has_inline_data = 1;
-		ret = ext4_read_inline_dir(filp, dirent, filldir,
+		int ret = ext4_read_inline_dir(file, ctx,
 					   &has_inline_data);
 		if (has_inline_data)
 			return ret;
 	}
 
 	stored = 0;
-	offset = filp->f_pos & (sb->s_blocksize - 1);
+	offset = ctx->pos & (sb->s_blocksize - 1);
 
-	while (!error && !stored && filp->f_pos < inode->i_size) {
+	while (ctx->pos < inode->i_size) {
 		struct ext4_map_blocks map;
 		struct buffer_head *bh = NULL;
 
-		map.m_lblk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb);
+		map.m_lblk = ctx->pos >> EXT4_BLOCK_SIZE_BITS(sb);
 		map.m_len = 1;
 		err = ext4_map_blocks(NULL, inode, &map, 0);
 		if (err > 0) {
 			pgoff_t index = map.m_pblk >>
 					(PAGE_CACHE_SHIFT - inode->i_blkbits);
-			if (!ra_has_index(&filp->f_ra, index))
+			if (!ra_has_index(&file->f_ra, index))
 				page_cache_sync_readahead(
 					sb->s_bdev->bd_inode->i_mapping,
-					&filp->f_ra, filp,
+					&file->f_ra, file,
 					index, 1);
-			filp->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
+			file->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
 			bh = ext4_bread(NULL, inode, map.m_lblk, 0, &err);
 		}
 
@@ -166,16 +161,16 @@
 		 */
 		if (!bh) {
 			if (!dir_has_error) {
-				EXT4_ERROR_FILE(filp, 0,
+				EXT4_ERROR_FILE(file, 0,
 						"directory contains a "
 						"hole at offset %llu",
-					   (unsigned long long) filp->f_pos);
+					   (unsigned long long) ctx->pos);
 				dir_has_error = 1;
 			}
 			/* corrupt size?  Maybe no more blocks to read */
-			if (filp->f_pos > inode->i_blocks << 9)
+			if (ctx->pos > inode->i_blocks << 9)
 				break;
-			filp->f_pos += sb->s_blocksize - offset;
+			ctx->pos += sb->s_blocksize - offset;
 			continue;
 		}
 
@@ -183,21 +178,20 @@
 		if (!buffer_verified(bh) &&
 		    !ext4_dirent_csum_verify(inode,
 				(struct ext4_dir_entry *)bh->b_data)) {
-			EXT4_ERROR_FILE(filp, 0, "directory fails checksum "
+			EXT4_ERROR_FILE(file, 0, "directory fails checksum "
 					"at offset %llu",
-					(unsigned long long)filp->f_pos);
-			filp->f_pos += sb->s_blocksize - offset;
+					(unsigned long long)ctx->pos);
+			ctx->pos += sb->s_blocksize - offset;
 			brelse(bh);
 			continue;
 		}
 		set_buffer_verified(bh);
 
-revalidate:
 		/* If the dir block has changed since the last call to
 		 * readdir(2), then we might be pointing to an invalid
 		 * dirent right now.  Scan from the start of the block
 		 * to make sure. */
-		if (filp->f_version != inode->i_version) {
+		if (file->f_version != inode->i_version) {
 			for (i = 0; i < sb->s_blocksize && i < offset; ) {
 				de = (struct ext4_dir_entry_2 *)
 					(bh->b_data + i);
@@ -214,57 +208,46 @@
 							    sb->s_blocksize);
 			}
 			offset = i;
-			filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
+			ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1))
 				| offset;
-			filp->f_version = inode->i_version;
+			file->f_version = inode->i_version;
 		}
 
-		while (!error && filp->f_pos < inode->i_size
+		while (ctx->pos < inode->i_size
 		       && offset < sb->s_blocksize) {
 			de = (struct ext4_dir_entry_2 *) (bh->b_data + offset);
-			if (ext4_check_dir_entry(inode, filp, de, bh,
+			if (ext4_check_dir_entry(inode, file, de, bh,
 						 bh->b_data, bh->b_size,
 						 offset)) {
 				/*
-				 * On error, skip the f_pos to the next block
+				 * On error, skip to the next block
 				 */
-				filp->f_pos = (filp->f_pos |
+				ctx->pos = (ctx->pos |
 						(sb->s_blocksize - 1)) + 1;
-				brelse(bh);
-				ret = stored;
-				goto out;
+				break;
 			}
 			offset += ext4_rec_len_from_disk(de->rec_len,
 					sb->s_blocksize);
 			if (le32_to_cpu(de->inode)) {
-				/* We might block in the next section
-				 * if the data destination is
-				 * currently swapped out.  So, use a
-				 * version stamp to detect whether or
-				 * not the directory has been modified
-				 * during the copy operation.
-				 */
-				u64 version = filp->f_version;
-
-				error = filldir(dirent, de->name,
+				if (!dir_emit(ctx, de->name,
 						de->name_len,
-						filp->f_pos,
 						le32_to_cpu(de->inode),
-						get_dtype(sb, de->file_type));
-				if (error)
-					break;
-				if (version != filp->f_version)
-					goto revalidate;
-				stored++;
+						get_dtype(sb, de->file_type))) {
+					brelse(bh);
+					return 0;
+				}
 			}
-			filp->f_pos += ext4_rec_len_from_disk(de->rec_len,
+			ctx->pos += ext4_rec_len_from_disk(de->rec_len,
 						sb->s_blocksize);
 		}
 		offset = 0;
 		brelse(bh);
+		if (ctx->pos < inode->i_size) {
+			if (!dir_relax(inode))
+				return 0;
+		}
 	}
-out:
-	return ret;
+	return 0;
 }
 
 static inline int is_32bit_api(void)
@@ -492,16 +475,12 @@
  * for all entres on the fname linked list.  (Normally there is only
  * one entry on the linked list, unless there are 62 bit hash collisions.)
  */
-static int call_filldir(struct file *filp, void *dirent,
-			filldir_t filldir, struct fname *fname)
+static int call_filldir(struct file *file, struct dir_context *ctx,
+			struct fname *fname)
 {
-	struct dir_private_info *info = filp->private_data;
-	loff_t	curr_pos;
-	struct inode *inode = file_inode(filp);
-	struct super_block *sb;
-	int error;
-
-	sb = inode->i_sb;
+	struct dir_private_info *info = file->private_data;
+	struct inode *inode = file_inode(file);
+	struct super_block *sb = inode->i_sb;
 
 	if (!fname) {
 		ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: comm %s: "
@@ -509,47 +488,44 @@
 			 inode->i_ino, current->comm);
 		return 0;
 	}
-	curr_pos = hash2pos(filp, fname->hash, fname->minor_hash);
+	ctx->pos = hash2pos(file, fname->hash, fname->minor_hash);
 	while (fname) {
-		error = filldir(dirent, fname->name,
-				fname->name_len, curr_pos,
+		if (!dir_emit(ctx, fname->name,
+				fname->name_len,
 				fname->inode,
-				get_dtype(sb, fname->file_type));
-		if (error) {
-			filp->f_pos = curr_pos;
+				get_dtype(sb, fname->file_type))) {
 			info->extra_fname = fname;
-			return error;
+			return 1;
 		}
 		fname = fname->next;
 	}
 	return 0;
 }
 
-static int ext4_dx_readdir(struct file *filp,
-			 void *dirent, filldir_t filldir)
+static int ext4_dx_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct dir_private_info *info = filp->private_data;
-	struct inode *inode = file_inode(filp);
+	struct dir_private_info *info = file->private_data;
+	struct inode *inode = file_inode(file);
 	struct fname *fname;
 	int	ret;
 
 	if (!info) {
-		info = ext4_htree_create_dir_info(filp, filp->f_pos);
+		info = ext4_htree_create_dir_info(file, ctx->pos);
 		if (!info)
 			return -ENOMEM;
-		filp->private_data = info;
+		file->private_data = info;
 	}
 
-	if (filp->f_pos == ext4_get_htree_eof(filp))
+	if (ctx->pos == ext4_get_htree_eof(file))
 		return 0;	/* EOF */
 
 	/* Some one has messed with f_pos; reset the world */
-	if (info->last_pos != filp->f_pos) {
+	if (info->last_pos != ctx->pos) {
 		free_rb_tree_fname(&info->root);
 		info->curr_node = NULL;
 		info->extra_fname = NULL;
-		info->curr_hash = pos2maj_hash(filp, filp->f_pos);
-		info->curr_minor_hash = pos2min_hash(filp, filp->f_pos);
+		info->curr_hash = pos2maj_hash(file, ctx->pos);
+		info->curr_minor_hash = pos2min_hash(file, ctx->pos);
 	}
 
 	/*
@@ -557,7 +533,7 @@
 	 * chain, return them first.
 	 */
 	if (info->extra_fname) {
-		if (call_filldir(filp, dirent, filldir, info->extra_fname))
+		if (call_filldir(file, ctx, info->extra_fname))
 			goto finished;
 		info->extra_fname = NULL;
 		goto next_node;
@@ -571,17 +547,17 @@
 		 * cached entries.
 		 */
 		if ((!info->curr_node) ||
-		    (filp->f_version != inode->i_version)) {
+		    (file->f_version != inode->i_version)) {
 			info->curr_node = NULL;
 			free_rb_tree_fname(&info->root);
-			filp->f_version = inode->i_version;
-			ret = ext4_htree_fill_tree(filp, info->curr_hash,
+			file->f_version = inode->i_version;
+			ret = ext4_htree_fill_tree(file, info->curr_hash,
 						   info->curr_minor_hash,
 						   &info->next_hash);
 			if (ret < 0)
 				return ret;
 			if (ret == 0) {
-				filp->f_pos = ext4_get_htree_eof(filp);
+				ctx->pos = ext4_get_htree_eof(file);
 				break;
 			}
 			info->curr_node = rb_first(&info->root);
@@ -590,7 +566,7 @@
 		fname = rb_entry(info->curr_node, struct fname, rb_hash);
 		info->curr_hash = fname->hash;
 		info->curr_minor_hash = fname->minor_hash;
-		if (call_filldir(filp, dirent, filldir, fname))
+		if (call_filldir(file, ctx, fname))
 			break;
 	next_node:
 		info->curr_node = rb_next(info->curr_node);
@@ -601,7 +577,7 @@
 			info->curr_minor_hash = fname->minor_hash;
 		} else {
 			if (info->next_hash == ~0) {
-				filp->f_pos = ext4_get_htree_eof(filp);
+				ctx->pos = ext4_get_htree_eof(file);
 				break;
 			}
 			info->curr_hash = info->next_hash;
@@ -609,7 +585,7 @@
 		}
 	}
 finished:
-	info->last_pos = filp->f_pos;
+	info->last_pos = ctx->pos;
 	return 0;
 }
 
@@ -624,7 +600,7 @@
 const struct file_operations ext4_dir_operations = {
 	.llseek		= ext4_dir_llseek,
 	.read		= generic_read_dir,
-	.readdir	= ext4_readdir,
+	.iterate	= ext4_readdir,
 	.unlocked_ioctl = ext4_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= ext4_compat_ioctl,

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 5aae3d1..b577e45 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h

@@ -177,38 +177,28 @@
 };
 
 /*
- * For delayed allocation tracking
- */
-struct mpage_da_data {
-	struct inode *inode;
-	sector_t b_blocknr;		/* start block number of extent */
-	size_t b_size;			/* size of extent */
-	unsigned long b_state;		/* state of the extent */
-	unsigned long first_page, next_page;	/* extent of pages */
-	struct writeback_control *wbc;
-	int io_done;
-	int pages_written;
-	int retval;
-};
-
-/*
  * Flags for ext4_io_end->flags
  */
 #define	EXT4_IO_END_UNWRITTEN	0x0001
-#define EXT4_IO_END_ERROR	0x0002
-#define EXT4_IO_END_DIRECT	0x0004
+#define EXT4_IO_END_DIRECT	0x0002
 
 /*
- * For converting uninitialized extents on a work queue.
+ * For converting uninitialized extents on a work queue. 'handle' is used for
+ * buffered writeback.
  */
 typedef struct ext4_io_end {
 	struct list_head	list;		/* per-file finished IO list */
+	handle_t		*handle;	/* handle reserved for extent
+						 * conversion */
 	struct inode		*inode;		/* file being written to */
+	struct bio		*bio;		/* Linked list of completed
+						 * bios covering the extent */
 	unsigned int		flag;		/* unwritten or not */
 	loff_t			offset;		/* offset in the file */
 	ssize_t			size;		/* size of the extent */
 	struct kiocb		*iocb;		/* iocb struct for AIO */
 	int			result;		/* error value for AIO */
+	atomic_t		count;		/* reference counter */
 } ext4_io_end_t;
 
 struct ext4_io_submit {
@@ -581,11 +571,6 @@
 #define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER	0x0020
 
 /*
- * Flags used by ext4_discard_partial_page_buffers
- */
-#define EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED	0x0001
-
-/*
  * ioctl commands
  */
 #define	EXT4_IOC_GETFLAGS		FS_IOC_GETFLAGS
@@ -879,6 +864,7 @@
 	rwlock_t i_es_lock;
 	struct list_head i_es_lru;
 	unsigned int i_es_lru_nr;	/* protected by i_es_lock */
+	unsigned long i_touch_when;	/* jiffies of last accessing */
 
 	/* ialloc */
 	ext4_group_t	i_last_alloc_group;
@@ -903,12 +889,22 @@
 	qsize_t i_reserved_quota;
 #endif
 
-	/* completed IOs that might need unwritten extents handling */
-	struct list_head i_completed_io_list;
+	/* Lock protecting lists below */
 	spinlock_t i_completed_io_lock;
+	/*
+	 * Completed IOs that need unwritten extents handling and have
+	 * transaction reserved
+	 */
+	struct list_head i_rsv_conversion_list;
+	/*
+	 * Completed IOs that need unwritten extents handling and don't have
+	 * transaction reserved
+	 */
+	struct list_head i_unrsv_conversion_list;
 	atomic_t i_ioend_count;	/* Number of outstanding io_end structs */
 	atomic_t i_unwritten; /* Nr. of inflight conversions pending */
-	struct work_struct i_unwritten_work;	/* deferred extent conversion */
+	struct work_struct i_rsv_conversion_work;
+	struct work_struct i_unrsv_conversion_work;
 
 	spinlock_t i_block_reservation_lock;
 
@@ -1245,7 +1241,6 @@
 	unsigned int s_mb_stats;
 	unsigned int s_mb_order2_reqs;
 	unsigned int s_mb_group_prealloc;
-	unsigned int s_max_writeback_mb_bump;
 	unsigned int s_max_dir_size_kb;
 	/* where last allocation was done - for stream allocation */
 	unsigned long s_mb_last_group;
@@ -1281,8 +1276,10 @@
 	struct flex_groups *s_flex_groups;
 	ext4_group_t s_flex_groups_allocated;
 
-	/* workqueue for dio unwritten */
-	struct workqueue_struct *dio_unwritten_wq;
+	/* workqueue for unreserved extent conversions (dio) */
+	struct workqueue_struct *unrsv_conversion_wq;
+	/* workqueue for reserved extent conversions (buffered io) */
+	struct workqueue_struct *rsv_conversion_wq;
 
 	/* timer for periodic error stats printing */
 	struct timer_list s_err_report;
@@ -1307,6 +1304,7 @@
 	/* Reclaim extents from extent status tree */
 	struct shrinker s_es_shrinker;
 	struct list_head s_es_lru;
+	unsigned long s_es_last_sorted;
 	struct percpu_counter s_extent_cache_cnt;
 	spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp;
 };
@@ -1342,6 +1340,9 @@
 					      struct ext4_io_end *io_end)
 {
 	if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
+		/* Writeback has to have conversion transaction reserved */
+		WARN_ON(EXT4_SB(inode->i_sb)->s_journal && !io_end->handle &&
+			!(io_end->flag & EXT4_IO_END_DIRECT));
 		io_end->flag |= EXT4_IO_END_UNWRITTEN;
 		atomic_inc(&EXT4_I(inode)->i_unwritten);
 	}
@@ -1999,7 +2000,6 @@
 
 /* fsync.c */
 extern int ext4_sync_file(struct file *, loff_t, loff_t, int);
-extern int ext4_flush_unwritten_io(struct inode *);
 
 /* hash.c */
 extern int ext4fs_dirhash(const char *name, int len, struct
@@ -2088,7 +2088,7 @@
 extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
 extern int ext4_can_truncate(struct inode *inode);
 extern void ext4_truncate(struct inode *);
-extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length);
+extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length);
 extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
 extern void ext4_set_inode_flags(struct inode *);
 extern void ext4_get_inode_flags(struct ext4_inode_info *);
@@ -2096,9 +2096,12 @@
 extern void ext4_set_aops(struct inode *inode);
 extern int ext4_writepage_trans_blocks(struct inode *);
 extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
-extern int ext4_discard_partial_page_buffers(handle_t *handle,
-		struct address_space *mapping, loff_t from,
-		loff_t length, int flags);
+extern int ext4_block_truncate_page(handle_t *handle,
+		struct address_space *mapping, loff_t from);
+extern int ext4_block_zero_page_range(handle_t *handle,
+		struct address_space *mapping, loff_t from, loff_t length);
+extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
+			     loff_t lstart, loff_t lend);
 extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 extern qsize_t *ext4_get_reserved_space(struct inode *inode);
 extern void ext4_da_update_reserve_space(struct inode *inode,
@@ -2111,7 +2114,7 @@
 				const struct iovec *iov, loff_t offset,
 				unsigned long nr_segs);
 extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock);
-extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk);
+extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks);
 extern void ext4_ind_truncate(handle_t *, struct inode *inode);
 extern int ext4_free_hole_blocks(handle_t *handle, struct inode *inode,
 				 ext4_lblk_t first, ext4_lblk_t stop);
@@ -2166,42 +2169,96 @@
 				    ext4_group_t ngroup);
 extern const char *ext4_decode_error(struct super_block *sb, int errno,
 				     char nbuf[16]);
+
 extern __printf(4, 5)
 void __ext4_error(struct super_block *, const char *, unsigned int,
 		  const char *, ...);
-#define ext4_error(sb, message...)	__ext4_error(sb, __func__,	\
-						     __LINE__, ## message)
 extern __printf(5, 6)
-void ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t,
+void __ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t,
 		      const char *, ...);
 extern __printf(5, 6)
-void ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t,
+void __ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t,
 		     const char *, ...);
 extern void __ext4_std_error(struct super_block *, const char *,
 			     unsigned int, int);
 extern __printf(4, 5)
 void __ext4_abort(struct super_block *, const char *, unsigned int,
 		  const char *, ...);
-#define ext4_abort(sb, message...)	__ext4_abort(sb, __func__, \
-						       __LINE__, ## message)
 extern __printf(4, 5)
 void __ext4_warning(struct super_block *, const char *, unsigned int,
 		    const char *, ...);
-#define ext4_warning(sb, message...)	__ext4_warning(sb, __func__, \
-						       __LINE__, ## message)
 extern __printf(3, 4)
-void ext4_msg(struct super_block *, const char *, const char *, ...);
+void __ext4_msg(struct super_block *, const char *, const char *, ...);
 extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp,
 			   const char *, unsigned int, const char *);
-#define dump_mmp_msg(sb, mmp, msg)	__dump_mmp_msg(sb, mmp, __func__, \
-						       __LINE__, msg)
 extern __printf(7, 8)
 void __ext4_grp_locked_error(const char *, unsigned int,
 			     struct super_block *, ext4_group_t,
 			     unsigned long, ext4_fsblk_t,
 			     const char *, ...);
-#define ext4_grp_locked_error(sb, grp, message...) \
-	__ext4_grp_locked_error(__func__, __LINE__, (sb), (grp), ## message)
+
+#ifdef CONFIG_PRINTK
+
+#define ext4_error_inode(inode, func, line, block, fmt, ...)		\
+	__ext4_error_inode(inode, func, line, block, fmt, ##__VA_ARGS__)
+#define ext4_error_file(file, func, line, block, fmt, ...)		\
+	__ext4_error_file(file, func, line, block, fmt, ##__VA_ARGS__)
+#define ext4_error(sb, fmt, ...)					\
+	__ext4_error(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
+#define ext4_abort(sb, fmt, ...)					\
+	__ext4_abort(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
+#define ext4_warning(sb, fmt, ...)					\
+	__ext4_warning(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
+#define ext4_msg(sb, level, fmt, ...)				\
+	__ext4_msg(sb, level, fmt, ##__VA_ARGS__)
+#define dump_mmp_msg(sb, mmp, msg)					\
+	__dump_mmp_msg(sb, mmp, __func__, __LINE__, msg)
+#define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...)		\
+	__ext4_grp_locked_error(__func__, __LINE__, sb, grp, ino, block, \
+				fmt, ##__VA_ARGS__)
+
+#else
+
+#define ext4_error_inode(inode, func, line, block, fmt, ...)		\
+do {									\
+	no_printk(fmt, ##__VA_ARGS__);					\
+	__ext4_error_inode(inode, "", 0, block, " ");			\
+} while (0)
+#define ext4_error_file(file, func, line, block, fmt, ...)		\
+do {									\
+	no_printk(fmt, ##__VA_ARGS__);					\
+	__ext4_error_file(file, "", 0, block, " ");			\
+} while (0)
+#define ext4_error(sb, fmt, ...)					\
+do {									\
+	no_printk(fmt, ##__VA_ARGS__);					\
+	__ext4_error(sb, "", 0, " ");					\
+} while (0)
+#define ext4_abort(sb, fmt, ...)					\
+do {									\
+	no_printk(fmt, ##__VA_ARGS__);					\
+	__ext4_abort(sb, "", 0, " ");					\
+} while (0)
+#define ext4_warning(sb, fmt, ...)					\
+do {									\
+	no_printk(fmt, ##__VA_ARGS__);					\
+	__ext4_warning(sb, "", 0, " ");					\
+} while (0)
+#define ext4_msg(sb, level, fmt, ...)					\
+do {									\
+	no_printk(fmt, ##__VA_ARGS__);					\
+	__ext4_msg(sb, "", " ");					\
+} while (0)
+#define dump_mmp_msg(sb, mmp, msg)					\
+	__dump_mmp_msg(sb, mmp, "", 0, "")
+#define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...)		\
+do {									\
+	no_printk(fmt, ##__VA_ARGS__);				\
+	__ext4_grp_locked_error("", 0, sb, grp, ino, block, " ");	\
+} while (0)
+
+#endif
+
 extern void ext4_update_dynamic_rev(struct super_block *sb);
 extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb,
 					__u32 compat);
@@ -2312,6 +2369,7 @@
 {
 	 struct ext4_group_info ***grp_info;
 	 long indexv, indexh;
+	 BUG_ON(group >= EXT4_SB(sb)->s_groups_count);
 	 grp_info = EXT4_SB(sb)->s_group_info;
 	 indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb));
 	 indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1);
@@ -2515,7 +2573,7 @@
 				      struct inode *parent,
 				      struct inode *inode);
 extern int ext4_read_inline_dir(struct file *filp,
-				void *dirent, filldir_t filldir,
+				struct dir_context *ctx,
 				int *has_inline_data);
 extern int htree_inlinedir_to_tree(struct file *dir_file,
 				   struct inode *dir, ext4_lblk_t block,
@@ -2598,8 +2656,7 @@
 
 extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
 extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
-extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
-				       int chunk);
+extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents);
 extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 			       struct ext4_map_blocks *map, int flags);
 extern void ext4_ext_truncate(handle_t *, struct inode *);
@@ -2609,8 +2666,8 @@
 extern void ext4_ext_release(struct super_block *);
 extern long ext4_fallocate(struct file *file, int mode, loff_t offset,
 			  loff_t len);
-extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
-			  ssize_t len);
+extern int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
+					  loff_t offset, ssize_t len);
 extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
 			   struct ext4_map_blocks *map, int flags);
 extern int ext4_ext_calc_metadata_amount(struct inode *inode,
@@ -2650,12 +2707,15 @@
 
 /* page-io.c */
 extern int __init ext4_init_pageio(void);
-extern void ext4_add_complete_io(ext4_io_end_t *io_end);
 extern void ext4_exit_pageio(void);
-extern void ext4_ioend_shutdown(struct inode *);
-extern void ext4_free_io_end(ext4_io_end_t *io);
 extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
-extern void ext4_end_io_work(struct work_struct *work);
+extern ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end);
+extern int ext4_put_io_end(ext4_io_end_t *io_end);
+extern void ext4_put_io_end_defer(ext4_io_end_t *io_end);
+extern void ext4_io_submit_init(struct ext4_io_submit *io,
+				struct writeback_control *wbc);
+extern void ext4_end_io_rsv_work(struct work_struct *work);
+extern void ext4_end_io_unrsv_work(struct work_struct *work);
 extern void ext4_io_submit(struct ext4_io_submit *io);
 extern int ext4_bio_write_page(struct ext4_io_submit *io,
 			       struct page *page,
@@ -2668,20 +2728,17 @@
 extern int ext4_mmp_csum_verify(struct super_block *sb,
 				struct mmp_struct *mmp);
 
-/* BH_Uninit flag: blocks are allocated but uninitialized on disk */
+/*
+ * Note that these flags will never ever appear in a buffer_head's state flag.
+ * See EXT4_MAP_... to see where this is used.
+ */
 enum ext4_state_bits {
 	BH_Uninit	/* blocks are allocated but uninitialized on disk */
-	  = BH_JBDPrivateStart,
+	 = BH_JBDPrivateStart,
 	BH_AllocFromCluster,	/* allocated blocks were part of already
-				 * allocated cluster. Note that this flag will
-				 * never, ever appear in a buffer_head's state
-				 * flag. See EXT4_MAP_FROM_CLUSTER to see where
-				 * this is used. */
+				 * allocated cluster. */
 };
 
-BUFFER_FNS(Uninit, uninit)
-TAS_BUFFER_FNS(Uninit, uninit)
-
 /*
  * Add new method to test whether block and inode bitmaps are properly
  * initialized. With uninit_bg reading the block from disk is not enough

diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 451eb40..72a3600 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c

@@ -38,31 +38,43 @@
 /*
  * Wrappers for jbd2_journal_start/end.
  */
-handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
-				  int type, int nblocks)
+static int ext4_journal_check_start(struct super_block *sb)
 {
 	journal_t *journal;
 
 	might_sleep();
-
-	trace_ext4_journal_start(sb, nblocks, _RET_IP_);
 	if (sb->s_flags & MS_RDONLY)
-		return ERR_PTR(-EROFS);
-
+		return -EROFS;
 	WARN_ON(sb->s_writers.frozen == SB_FREEZE_COMPLETE);
 	journal = EXT4_SB(sb)->s_journal;
-	if (!journal)
-		return ext4_get_nojournal();
 	/*
 	 * Special case here: if the journal has aborted behind our
 	 * backs (eg. EIO in the commit thread), then we still need to
 	 * take the FS itself readonly cleanly.
 	 */
-	if (is_journal_aborted(journal)) {
+	if (journal && is_journal_aborted(journal)) {
 		ext4_abort(sb, "Detected aborted journal");
-		return ERR_PTR(-EROFS);
+		return -EROFS;
 	}
-	return jbd2__journal_start(journal, nblocks, GFP_NOFS, type, line);
+	return 0;
+}
+
+handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
+				  int type, int blocks, int rsv_blocks)
+{
+	journal_t *journal;
+	int err;
+
+	trace_ext4_journal_start(sb, blocks, rsv_blocks, _RET_IP_);
+	err = ext4_journal_check_start(sb);
+	if (err < 0)
+		return ERR_PTR(err);
+
+	journal = EXT4_SB(sb)->s_journal;
+	if (!journal)
+		return ext4_get_nojournal();
+	return jbd2__journal_start(journal, blocks, rsv_blocks, GFP_NOFS,
+				   type, line);
 }
 
 int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle)
@@ -86,6 +98,30 @@
 	return err;
 }
 
+handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line,
+					int type)
+{
+	struct super_block *sb;
+	int err;
+
+	if (!ext4_handle_valid(handle))
+		return ext4_get_nojournal();
+
+	sb = handle->h_journal->j_private;
+	trace_ext4_journal_start_reserved(sb, handle->h_buffer_credits,
+					  _RET_IP_);
+	err = ext4_journal_check_start(sb);
+	if (err < 0) {
+		jbd2_journal_free_reserved(handle);
+		return ERR_PTR(err);
+	}
+
+	err = jbd2_journal_start_reserved(handle, type, line);
+	if (err < 0)
+		return ERR_PTR(err);
+	return handle;
+}
+
 void ext4_journal_abort_handle(const char *caller, unsigned int line,
 			       const char *err_fn, struct buffer_head *bh,
 			       handle_t *handle, int err)

diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index c8c6885..2877258 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h

@@ -134,7 +134,8 @@
 #define EXT4_HT_MIGRATE          8
 #define EXT4_HT_MOVE_EXTENTS     9
 #define EXT4_HT_XATTR           10
-#define EXT4_HT_MAX             11
+#define EXT4_HT_EXT_CONVERT     11
+#define EXT4_HT_MAX             12
 
 /**
  *   struct ext4_journal_cb_entry - Base structure for callback information.
@@ -265,7 +266,7 @@
 	__ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb))
 
 handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
-				  int type, int nblocks);
+				  int type, int blocks, int rsv_blocks);
 int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle);
 
 #define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096)
@@ -300,21 +301,37 @@
 }
 
 #define ext4_journal_start_sb(sb, type, nblocks)			\
-	__ext4_journal_start_sb((sb), __LINE__, (type), (nblocks))
+	__ext4_journal_start_sb((sb), __LINE__, (type), (nblocks), 0)
 
 #define ext4_journal_start(inode, type, nblocks)			\
-	__ext4_journal_start((inode), __LINE__, (type), (nblocks))
+	__ext4_journal_start((inode), __LINE__, (type), (nblocks), 0)
+
+#define ext4_journal_start_with_reserve(inode, type, blocks, rsv_blocks) \
+	__ext4_journal_start((inode), __LINE__, (type), (blocks), (rsv_blocks))
 
 static inline handle_t *__ext4_journal_start(struct inode *inode,
 					     unsigned int line, int type,
-					     int nblocks)
+					     int blocks, int rsv_blocks)
 {
-	return __ext4_journal_start_sb(inode->i_sb, line, type, nblocks);
+	return __ext4_journal_start_sb(inode->i_sb, line, type, blocks,
+				       rsv_blocks);
 }
 
 #define ext4_journal_stop(handle) \
 	__ext4_journal_stop(__func__, __LINE__, (handle))
 
+#define ext4_journal_start_reserved(handle, type) \
+	__ext4_journal_start_reserved((handle), __LINE__, (type))
+
+handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line,
+					int type);
+
+static inline void ext4_journal_free_reserved(handle_t *handle)
+{
+	if (ext4_handle_valid(handle))
+		jbd2_journal_free_reserved(handle);
+}
+
 static inline handle_t *ext4_journal_current_handle(void)
 {
 	return journal_current_handle();

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index bc0f1910..7097b0f 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c

@@ -2125,7 +2125,8 @@
 		next_del = ext4_find_delayed_extent(inode, &es);
 		if (!exists && next_del) {
 			exists = 1;
-			flags |= FIEMAP_EXTENT_DELALLOC;
+			flags |= (FIEMAP_EXTENT_DELALLOC |
+				  FIEMAP_EXTENT_UNKNOWN);
 		}
 		up_read(&EXT4_I(inode)->i_data_sem);
 
@@ -2328,17 +2329,15 @@
 }
 
 /*
- * How many index/leaf blocks need to change/allocate to modify nrblocks?
+ * How many index/leaf blocks need to change/allocate to add @extents extents?
  *
- * if nrblocks are fit in a single extent (chunk flag is 1), then
- * in the worse case, each tree level index/leaf need to be changed
- * if the tree split due to insert a new extent, then the old tree
- * index/leaf need to be updated too
+ * If we add a single extent, then in the worst case, each tree level
+ * index/leaf needs to be changed in case of a tree split.
  *
- * If the nrblocks are discontiguous, they could cause
- * the whole tree split more than once, but this is really rare.
+ * If more extents are inserted, they could cause the whole tree to split
+ * more than once, but this is really rare.
  */
-int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
+int ext4_ext_index_trans_blocks(struct inode *inode, int extents)
 {
 	int index;
 	int depth;
@@ -2349,7 +2348,7 @@
 
 	depth = ext_depth(inode);
 
-	if (chunk)
+	if (extents <= 1)
 		index = depth * 2;
 	else
 		index = depth * 3;
@@ -2357,20 +2356,24 @@
 	return index;
 }
 
+static inline int get_default_free_blocks_flags(struct inode *inode)
+{
+	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
+		return EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET;
+	else if (ext4_should_journal_data(inode))
+		return EXT4_FREE_BLOCKS_FORGET;
+	return 0;
+}
+
 static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
 			      struct ext4_extent *ex,
-			      ext4_fsblk_t *partial_cluster,
+			      long long *partial_cluster,
 			      ext4_lblk_t from, ext4_lblk_t to)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 	unsigned short ee_len =  ext4_ext_get_actual_len(ex);
 	ext4_fsblk_t pblk;
-	int flags = 0;
-
-	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
-		flags |= EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET;
-	else if (ext4_should_journal_data(inode))
-		flags |= EXT4_FREE_BLOCKS_FORGET;
+	int flags = get_default_free_blocks_flags(inode);
 
 	/*
 	 * For bigalloc file systems, we never free a partial cluster
@@ -2388,7 +2391,8 @@
 	 * partial cluster here.
 	 */
 	pblk = ext4_ext_pblock(ex) + ee_len - 1;
-	if (*partial_cluster && (EXT4_B2C(sbi, pblk) != *partial_cluster)) {
+	if ((*partial_cluster > 0) &&
+	    (EXT4_B2C(sbi, pblk) != *partial_cluster)) {
 		ext4_free_blocks(handle, inode, NULL,
 				 EXT4_C2B(sbi, *partial_cluster),
 				 sbi->s_cluster_ratio, flags);
@@ -2414,41 +2418,46 @@
 	    && to == le32_to_cpu(ex->ee_block) + ee_len - 1) {
 		/* tail removal */
 		ext4_lblk_t num;
+		unsigned int unaligned;
 
 		num = le32_to_cpu(ex->ee_block) + ee_len - from;
 		pblk = ext4_ext_pblock(ex) + ee_len - num;
-		ext_debug("free last %u blocks starting %llu\n", num, pblk);
+		/*
+		 * Usually we want to free partial cluster at the end of the
+		 * extent, except for the situation when the cluster is still
+		 * used by any other extent (partial_cluster is negative).
+		 */
+		if (*partial_cluster < 0 &&
+		    -(*partial_cluster) == EXT4_B2C(sbi, pblk + num - 1))
+			flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER;
+
+		ext_debug("free last %u blocks starting %llu partial %lld\n",
+			  num, pblk, *partial_cluster);
 		ext4_free_blocks(handle, inode, NULL, pblk, num, flags);
 		/*
 		 * If the block range to be freed didn't start at the
 		 * beginning of a cluster, and we removed the entire
-		 * extent, save the partial cluster here, since we
-		 * might need to delete if we determine that the
-		 * truncate operation has removed all of the blocks in
-		 * the cluster.
+		 * extent and the cluster is not used by any other extent,
+		 * save the partial cluster here, since we might need to
+		 * delete if we determine that the truncate operation has
+		 * removed all of the blocks in the cluster.
+		 *
+		 * On the other hand, if we did not manage to free the whole
+		 * extent, we have to mark the cluster as used (store negative
+		 * cluster number in partial_cluster).
 		 */
-		if (pblk & (sbi->s_cluster_ratio - 1) &&
-		    (ee_len == num))
+		unaligned = pblk & (sbi->s_cluster_ratio - 1);
+		if (unaligned && (ee_len == num) &&
+		    (*partial_cluster != -((long long)EXT4_B2C(sbi, pblk))))
 			*partial_cluster = EXT4_B2C(sbi, pblk);
-		else
+		else if (unaligned)
+			*partial_cluster = -((long long)EXT4_B2C(sbi, pblk));
+		else if (*partial_cluster > 0)
 			*partial_cluster = 0;
-	} else if (from == le32_to_cpu(ex->ee_block)
-		   && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
-		/* head removal */
-		ext4_lblk_t num;
-		ext4_fsblk_t start;
-
-		num = to - from;
-		start = ext4_ext_pblock(ex);
-
-		ext_debug("free first %u blocks starting %llu\n", num, start);
-		ext4_free_blocks(handle, inode, NULL, start, num, flags);
-
-	} else {
-		printk(KERN_INFO "strange request: removal(2) "
-				"%u-%u from %u:%u\n",
-				from, to, le32_to_cpu(ex->ee_block), ee_len);
-	}
+	} else
+		ext4_error(sbi->s_sb, "strange request: removal(2) "
+			   "%u-%u from %u:%u\n",
+			   from, to, le32_to_cpu(ex->ee_block), ee_len);
 	return 0;
 }
 
@@ -2461,12 +2470,16 @@
  * @handle: The journal handle
  * @inode:  The files inode
  * @path:   The path to the leaf
+ * @partial_cluster: The cluster which we'll have to free if all extents
+ *                   have been released from it. It gets negative in case
+ *                   that the cluster is still used.
  * @start:  The first block to remove
  * @end:   The last block to remove
  */
 static int
 ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
-		 struct ext4_ext_path *path, ext4_fsblk_t *partial_cluster,
+		 struct ext4_ext_path *path,
+		 long long *partial_cluster,
 		 ext4_lblk_t start, ext4_lblk_t end)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -2479,6 +2492,7 @@
 	unsigned short ex_ee_len;
 	unsigned uninitialized = 0;
 	struct ext4_extent *ex;
+	ext4_fsblk_t pblk;
 
 	/* the header must be checked already in ext4_ext_remove_space() */
 	ext_debug("truncate since %u in leaf to %u\n", start, end);
@@ -2490,7 +2504,9 @@
 		return -EIO;
 	}
 	/* find where to start removing */
-	ex = EXT_LAST_EXTENT(eh);
+	ex = path[depth].p_ext;
+	if (!ex)
+		ex = EXT_LAST_EXTENT(eh);
 
 	ex_ee_block = le32_to_cpu(ex->ee_block);
 	ex_ee_len = ext4_ext_get_actual_len(ex);
@@ -2517,6 +2533,16 @@
 
 		/* If this extent is beyond the end of the hole, skip it */
 		if (end < ex_ee_block) {
+			/*
+			 * We're going to skip this extent and move to another,
+			 * so if this extent is not cluster aligned we have
+			 * to mark the current cluster as used to avoid
+			 * accidentally freeing it later on
+			 */
+			pblk = ext4_ext_pblock(ex);
+			if (pblk & (sbi->s_cluster_ratio - 1))
+				*partial_cluster =
+					-((long long)EXT4_B2C(sbi, pblk));
 			ex--;
 			ex_ee_block = le32_to_cpu(ex->ee_block);
 			ex_ee_len = ext4_ext_get_actual_len(ex);
@@ -2592,7 +2618,7 @@
 					sizeof(struct ext4_extent));
 			}
 			le16_add_cpu(&eh->eh_entries, -1);
-		} else
+		} else if (*partial_cluster > 0)
 			*partial_cluster = 0;
 
 		err = ext4_ext_dirty(handle, inode, path + depth);
@@ -2610,17 +2636,13 @@
 		err = ext4_ext_correct_indexes(handle, inode, path);
 
 	/*
-	 * If there is still a entry in the leaf node, check to see if
-	 * it references the partial cluster.  This is the only place
-	 * where it could; if it doesn't, we can free the cluster.
+	 * Free the partial cluster only if the current extent does not
+	 * reference it. Otherwise we might free a used cluster.
 	 */
-	if (*partial_cluster && ex >= EXT_FIRST_EXTENT(eh) &&
+	if (*partial_cluster > 0 &&
 	    (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) !=
 	     *partial_cluster)) {
-		int flags = EXT4_FREE_BLOCKS_FORGET;
-
-		if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
-			flags |= EXT4_FREE_BLOCKS_METADATA;
+		int flags = get_default_free_blocks_flags(inode);
 
 		ext4_free_blocks(handle, inode, NULL,
 				 EXT4_C2B(sbi, *partial_cluster),
@@ -2664,7 +2686,7 @@
 	struct super_block *sb = inode->i_sb;
 	int depth = ext_depth(inode);
 	struct ext4_ext_path *path = NULL;
-	ext4_fsblk_t partial_cluster = 0;
+	long long partial_cluster = 0;
 	handle_t *handle;
 	int i = 0, err = 0;
 
@@ -2676,7 +2698,7 @@
 		return PTR_ERR(handle);
 
 again:
-	trace_ext4_ext_remove_space(inode, start, depth);
+	trace_ext4_ext_remove_space(inode, start, end, depth);
 
 	/*
 	 * Check if we are removing extents inside the extent tree. If that
@@ -2844,17 +2866,14 @@
 		}
 	}
 
-	trace_ext4_ext_remove_space_done(inode, start, depth, partial_cluster,
-			path->p_hdr->eh_entries);
+	trace_ext4_ext_remove_space_done(inode, start, end, depth,
+			partial_cluster, path->p_hdr->eh_entries);
 
 	/* If we still have something in the partial cluster and we have removed
 	 * even the first extent, then we should free the blocks in the partial
 	 * cluster as well. */
-	if (partial_cluster && path->p_hdr->eh_entries == 0) {
-		int flags = EXT4_FREE_BLOCKS_FORGET;
-
-		if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
-			flags |= EXT4_FREE_BLOCKS_METADATA;
+	if (partial_cluster > 0 && path->p_hdr->eh_entries == 0) {
+		int flags = get_default_free_blocks_flags(inode);
 
 		ext4_free_blocks(handle, inode, NULL,
 				 EXT4_C2B(EXT4_SB(sb), partial_cluster),
@@ -4363,7 +4382,7 @@
 	}
 
 out3:
-	trace_ext4_ext_map_blocks_exit(inode, map, err ? err : allocated);
+	trace_ext4_ext_map_blocks_exit(inode, flags, map, err ? err : allocated);
 
 	return err ? err : allocated;
 }
@@ -4446,7 +4465,7 @@
 		return -EOPNOTSUPP;
 
 	if (mode & FALLOC_FL_PUNCH_HOLE)
-		return ext4_punch_hole(file, offset, len);
+		return ext4_punch_hole(inode, offset, len);
 
 	ret = ext4_convert_inline_data(inode);
 	if (ret)
@@ -4548,10 +4567,9 @@
  * function, to convert the fallocated extents after IO is completed.
  * Returns 0 on success.
  */
-int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
-				    ssize_t len)
+int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
+				   loff_t offset, ssize_t len)
 {
-	handle_t *handle;
 	unsigned int max_blocks;
 	int ret = 0;
 	int ret2 = 0;
@@ -4566,16 +4584,32 @@
 	max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) -
 		      map.m_lblk);
 	/*
-	 * credits to insert 1 extent into extent tree
+	 * This is somewhat ugly but the idea is clear: When transaction is
+	 * reserved, everything goes into it. Otherwise we rather start several
+	 * smaller transactions for conversion of each extent separately.
 	 */
-	credits = ext4_chunk_trans_blocks(inode, max_blocks);
+	if (handle) {
+		handle = ext4_journal_start_reserved(handle,
+						     EXT4_HT_EXT_CONVERT);
+		if (IS_ERR(handle))
+			return PTR_ERR(handle);
+		credits = 0;
+	} else {
+		/*
+		 * credits to insert 1 extent into extent tree
+		 */
+		credits = ext4_chunk_trans_blocks(inode, max_blocks);
+	}
 	while (ret >= 0 && ret < max_blocks) {
 		map.m_lblk += ret;
 		map.m_len = (max_blocks -= ret);
-		handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits);
-		if (IS_ERR(handle)) {
-			ret = PTR_ERR(handle);
-			break;
+		if (credits) {
+			handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
+						    credits);
+			if (IS_ERR(handle)) {
+				ret = PTR_ERR(handle);
+				break;
+			}
 		}
 		ret = ext4_map_blocks(handle, inode, &map,
 				      EXT4_GET_BLOCKS_IO_CONVERT_EXT);
@@ -4586,10 +4620,13 @@
 				     inode->i_ino, map.m_lblk,
 				     map.m_len, ret);
 		ext4_mark_inode_dirty(handle, inode);
-		ret2 = ext4_journal_stop(handle);
-		if (ret <= 0 || ret2 )
+		if (credits)
+			ret2 = ext4_journal_stop(handle);
+		if (ret <= 0 || ret2)
 			break;
 	}
+	if (!credits)
+		ret2 = ext4_journal_stop(handle);
 	return ret > 0 ? ret2 : ret;
 }
 
@@ -4659,7 +4696,7 @@
 		error = ext4_get_inode_loc(inode, &iloc);
 		if (error)
 			return error;
-		physical = iloc.bh->b_blocknr << blockbits;
+		physical = (__u64)iloc.bh->b_blocknr << blockbits;
 		offset = EXT4_GOOD_OLD_INODE_SIZE +
 				EXT4_I(inode)->i_extra_isize;
 		physical += offset;
@@ -4667,7 +4704,7 @@
 		flags |= FIEMAP_EXTENT_DATA_INLINE;
 		brelse(iloc.bh);
 	} else { /* external block */
-		physical = EXT4_I(inode)->i_file_acl << blockbits;
+		physical = (__u64)EXT4_I(inode)->i_file_acl << blockbits;
 		length = inode->i_sb->s_blocksize;
 	}
 

diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index e6941e6..ee018d5 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c

@@ -10,6 +10,7 @@
  * Ext4 extents status tree core functions.
  */
 #include <linux/rbtree.h>
+#include <linux/list_sort.h>
 #include "ext4.h"
 #include "extents_status.h"
 #include "ext4_extents.h"
@@ -291,7 +292,6 @@
 
 	read_unlock(&EXT4_I(inode)->i_es_lock);
 
-	ext4_es_lru_add(inode);
 	trace_ext4_es_find_delayed_extent_range_exit(inode, es);
 }
 
@@ -672,7 +672,6 @@
 error:
 	write_unlock(&EXT4_I(inode)->i_es_lock);
 
-	ext4_es_lru_add(inode);
 	ext4_es_print_tree(inode);
 
 	return err;
@@ -734,7 +733,6 @@
 
 	read_unlock(&EXT4_I(inode)->i_es_lock);
 
-	ext4_es_lru_add(inode);
 	trace_ext4_es_lookup_extent_exit(inode, es, found);
 	return found;
 }
@@ -878,12 +876,28 @@
 				     EXTENT_STATUS_WRITTEN);
 }
 
+static int ext4_inode_touch_time_cmp(void *priv, struct list_head *a,
+				     struct list_head *b)
+{
+	struct ext4_inode_info *eia, *eib;
+	eia = list_entry(a, struct ext4_inode_info, i_es_lru);
+	eib = list_entry(b, struct ext4_inode_info, i_es_lru);
+
+	if (eia->i_touch_when == eib->i_touch_when)
+		return 0;
+	if (time_after(eia->i_touch_when, eib->i_touch_when))
+		return 1;
+	else
+		return -1;
+}
+
 static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
 {
 	struct ext4_sb_info *sbi = container_of(shrink,
 					struct ext4_sb_info, s_es_shrinker);
 	struct ext4_inode_info *ei;
-	struct list_head *cur, *tmp, scanned;
+	struct list_head *cur, *tmp;
+	LIST_HEAD(skiped);
 	int nr_to_scan = sc->nr_to_scan;
 	int ret, nr_shrunk = 0;
 
@@ -893,23 +907,41 @@
 	if (!nr_to_scan)
 		return ret;
 
-	INIT_LIST_HEAD(&scanned);
-
 	spin_lock(&sbi->s_es_lru_lock);
+
+	/*
+	 * If the inode that is at the head of LRU list is newer than
+	 * last_sorted time, that means that we need to sort this list.
+	 */
+	ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info, i_es_lru);
+	if (sbi->s_es_last_sorted < ei->i_touch_when) {
+		list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp);
+		sbi->s_es_last_sorted = jiffies;
+	}
+
 	list_for_each_safe(cur, tmp, &sbi->s_es_lru) {
-		list_move_tail(cur, &scanned);
+		/*
+		 * If we have already reclaimed all extents from extent
+		 * status tree, just stop the loop immediately.
+		 */
+		if (percpu_counter_read_positive(&sbi->s_extent_cache_cnt) == 0)
+			break;
 
 		ei = list_entry(cur, struct ext4_inode_info, i_es_lru);
 
-		read_lock(&ei->i_es_lock);
-		if (ei->i_es_lru_nr == 0) {
-			read_unlock(&ei->i_es_lock);
+		/* Skip the inode that is newer than the last_sorted time */
+		if (sbi->s_es_last_sorted < ei->i_touch_when) {
+			list_move_tail(cur, &skiped);
 			continue;
 		}
-		read_unlock(&ei->i_es_lock);
+
+		if (ei->i_es_lru_nr == 0)
+			continue;
 
 		write_lock(&ei->i_es_lock);
 		ret = __es_try_to_reclaim_extents(ei, nr_to_scan);
+		if (ei->i_es_lru_nr == 0)
+			list_del_init(&ei->i_es_lru);
 		write_unlock(&ei->i_es_lock);
 
 		nr_shrunk += ret;
@@ -917,7 +949,9 @@
 		if (nr_to_scan == 0)
 			break;
 	}
-	list_splice_tail(&scanned, &sbi->s_es_lru);
+
+	/* Move the newer inodes into the tail of the LRU list. */
+	list_splice_tail(&skiped, &sbi->s_es_lru);
 	spin_unlock(&sbi->s_es_lru_lock);
 
 	ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
@@ -925,21 +959,19 @@
 	return ret;
 }
 
-void ext4_es_register_shrinker(struct super_block *sb)
+void ext4_es_register_shrinker(struct ext4_sb_info *sbi)
 {
-	struct ext4_sb_info *sbi;
-
-	sbi = EXT4_SB(sb);
 	INIT_LIST_HEAD(&sbi->s_es_lru);
 	spin_lock_init(&sbi->s_es_lru_lock);
+	sbi->s_es_last_sorted = 0;
 	sbi->s_es_shrinker.shrink = ext4_es_shrink;
 	sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
 	register_shrinker(&sbi->s_es_shrinker);
 }
 
-void ext4_es_unregister_shrinker(struct super_block *sb)
+void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
 {
-	unregister_shrinker(&EXT4_SB(sb)->s_es_shrinker);
+	unregister_shrinker(&sbi->s_es_shrinker);
 }
 
 void ext4_es_lru_add(struct inode *inode)
@@ -947,11 +979,14 @@
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 
+	ei->i_touch_when = jiffies;
+
+	if (!list_empty(&ei->i_es_lru))
+		return;
+
 	spin_lock(&sbi->s_es_lru_lock);
 	if (list_empty(&ei->i_es_lru))
 		list_add_tail(&ei->i_es_lru, &sbi->s_es_lru);
-	else
-		list_move_tail(&ei->i_es_lru, &sbi->s_es_lru);
 	spin_unlock(&sbi->s_es_lru_lock);
 }
 

diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index f740eb03..e936730 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h

@@ -39,6 +39,7 @@
 				 EXTENT_STATUS_DELAYED | \
 				 EXTENT_STATUS_HOLE)
 
+struct ext4_sb_info;
 struct ext4_extent;
 
 struct extent_status {
@@ -119,8 +120,8 @@
 	es->es_pblk = block;
 }
 
-extern void ext4_es_register_shrinker(struct super_block *sb);
-extern void ext4_es_unregister_shrinker(struct super_block *sb);
+extern void ext4_es_register_shrinker(struct ext4_sb_info *sbi);
+extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi);
 extern void ext4_es_lru_add(struct inode *inode);
 extern void ext4_es_lru_del(struct inode *inode);
 

diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index b1b4d51..b19f0a4 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c

@@ -312,7 +312,7 @@
 	blkbits = inode->i_sb->s_blocksize_bits;
 	startoff = *offset;
 	lastoff = startoff;
-	endoff = (map->m_lblk + map->m_len) << blkbits;
+	endoff = (loff_t)(map->m_lblk + map->m_len) << blkbits;
 
 	index = startoff >> PAGE_CACHE_SHIFT;
 	end = endoff >> PAGE_CACHE_SHIFT;
@@ -457,7 +457,7 @@
 		ret = ext4_map_blocks(NULL, inode, &map, 0);
 		if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
 			if (last != start)
-				dataoff = last << blkbits;
+				dataoff = (loff_t)last << blkbits;
 			break;
 		}
 
@@ -468,7 +468,7 @@
 		ext4_es_find_delayed_extent_range(inode, last, last, &es);
 		if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
 			if (last != start)
-				dataoff = last << blkbits;
+				dataoff = (loff_t)last << blkbits;
 			break;
 		}
 
@@ -486,7 +486,7 @@
 		}
 
 		last++;
-		dataoff = last << blkbits;
+		dataoff = (loff_t)last << blkbits;
 	} while (last <= end);
 
 	mutex_unlock(&inode->i_mutex);
@@ -540,7 +540,7 @@
 		ret = ext4_map_blocks(NULL, inode, &map, 0);
 		if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
 			last += ret;
-			holeoff = last << blkbits;
+			holeoff = (loff_t)last << blkbits;
 			continue;
 		}
 
@@ -551,7 +551,7 @@
 		ext4_es_find_delayed_extent_range(inode, last, last, &es);
 		if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
 			last = es.es_lblk + es.es_len;
-			holeoff = last << blkbits;
+			holeoff = (loff_t)last << blkbits;
 			continue;
 		}
 
@@ -566,7 +566,7 @@
 							      &map, &holeoff);
 			if (!unwritten) {
 				last += ret;
-				holeoff = last << blkbits;
+				holeoff = (loff_t)last << blkbits;
 				continue;
 			}
 		}

diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index e0ba8a4..a8bc47f 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c

@@ -73,32 +73,6 @@
 	return ret;
 }
 
-/**
- * __sync_file - generic_file_fsync without the locking and filemap_write
- * @inode:	inode to sync
- * @datasync:	only sync essential metadata if true
- *
- * This is just generic_file_fsync without the locking.  This is needed for
- * nojournal mode to make sure this inodes data/metadata makes it to disk
- * properly.  The i_mutex should be held already.
- */
-static int __sync_inode(struct inode *inode, int datasync)
-{
-	int err;
-	int ret;
-
-	ret = sync_mapping_buffers(inode->i_mapping);
-	if (!(inode->i_state & I_DIRTY))
-		return ret;
-	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
-		return ret;
-
-	err = sync_inode_metadata(inode, 1);
-	if (ret == 0)
-		ret = err;
-	return ret;
-}
-
 /*
  * akpm: A new design for ext4_sync_file().
  *
@@ -116,7 +90,7 @@
 	struct inode *inode = file->f_mapping->host;
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
-	int ret, err;
+	int ret = 0, err;
 	tid_t commit_tid;
 	bool needs_barrier = false;
 
@@ -124,25 +98,24 @@
 
 	trace_ext4_sync_file_enter(file, datasync);
 
-	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
-	if (ret)
-		return ret;
-	mutex_lock(&inode->i_mutex);
-
-	if (inode->i_sb->s_flags & MS_RDONLY)
+	if (inode->i_sb->s_flags & MS_RDONLY) {
+		/* Make sure that we read updated s_mount_flags value */
+		smp_rmb();
+		if (EXT4_SB(inode->i_sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
+			ret = -EROFS;
 		goto out;
-
-	ret = ext4_flush_unwritten_io(inode);
-	if (ret < 0)
-		goto out;
+	}
 
 	if (!journal) {
-		ret = __sync_inode(inode, datasync);
+		ret = generic_file_fsync(file, start, end, datasync);
 		if (!ret && !hlist_empty(&inode->i_dentry))
 			ret = ext4_sync_parent(inode);
 		goto out;
 	}
 
+	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	if (ret)
+		return ret;
 	/*
 	 * data=writeback,ordered:
 	 *  The caller's filemap_fdatawrite()/wait will sync the data.
@@ -172,8 +145,7 @@
 		if (!ret)
 			ret = err;
 	}
- out:
-	mutex_unlock(&inode->i_mutex);
+out:
 	trace_ext4_sync_file_exit(inode, ret);
 	return ret;
 }

diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 00a818d..f03598c 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c

@@ -747,7 +747,8 @@
 		if (!handle) {
 			BUG_ON(nblocks <= 0);
 			handle = __ext4_journal_start_sb(dir->i_sb, line_no,
-							 handle_type, nblocks);
+							 handle_type, nblocks,
+							 0);
 			if (IS_ERR(handle)) {
 				err = PTR_ERR(handle);
 				ext4_std_error(sb, err);

diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index b8d5d35..87b30cd 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c

@@ -624,7 +624,7 @@
 		partial--;
 	}
 out:
-	trace_ext4_ind_map_blocks_exit(inode, map, err);
+	trace_ext4_ind_map_blocks_exit(inode, flags, map, err);
 	return err;
 }
 
@@ -675,11 +675,6 @@
 
 retry:
 	if (rw == READ && ext4_should_dioread_nolock(inode)) {
-		if (unlikely(atomic_read(&EXT4_I(inode)->i_unwritten))) {
-			mutex_lock(&inode->i_mutex);
-			ext4_flush_unwritten_io(inode);
-			mutex_unlock(&inode->i_mutex);
-		}
 		/*
 		 * Nolock dioread optimization may be dynamically disabled
 		 * via ext4_inode_block_unlocked_dio(). Check inode's state
@@ -779,27 +774,18 @@
 	return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1;
 }
 
-int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk)
+/*
+ * Calculate number of indirect blocks touched by mapping @nrblocks logically
+ * contiguous blocks
+ */
+int ext4_ind_trans_blocks(struct inode *inode, int nrblocks)
 {
-	int indirects;
-
-	/* if nrblocks are contiguous */
-	if (chunk) {
-		/*
-		 * With N contiguous data blocks, we need at most
-		 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks,
-		 * 2 dindirect blocks, and 1 tindirect block
-		 */
-		return DIV_ROUND_UP(nrblocks,
-				    EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4;
-	}
 	/*
-	 * if nrblocks are not contiguous, worse case, each block touch
-	 * a indirect block, and each indirect block touch a double indirect
-	 * block, plus a triple indirect block
+	 * With N contiguous data blocks, we need at most
+	 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks,
+	 * 2 dindirect blocks, and 1 tindirect block
 	 */
-	indirects = nrblocks * 2 + 1;
-	return indirects;
+	return DIV_ROUND_UP(nrblocks, EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4;
 }
 
 /*
@@ -940,11 +926,13 @@
 			     __le32 *last)
 {
 	__le32 *p;
-	int	flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED;
+	int	flags = EXT4_FREE_BLOCKS_VALIDATED;
 	int	err;
 
 	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
-		flags |= EXT4_FREE_BLOCKS_METADATA;
+		flags |= EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_METADATA;
+	else if (ext4_should_journal_data(inode))
+		flags |= EXT4_FREE_BLOCKS_FORGET;
 
 	if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free,
 				   count)) {

diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 3e2bf87..d9ecbf1 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c

@@ -72,7 +72,7 @@
 		entry = (struct ext4_xattr_entry *)
 			((void *)raw_inode + EXT4_I(inode)->i_inline_off);
 
-		free += le32_to_cpu(entry->e_value_size);
+		free += EXT4_XATTR_SIZE(le32_to_cpu(entry->e_value_size));
 		goto out;
 	}
 
@@ -1404,16 +1404,15 @@
  * offset as if '.' and '..' really take place.
  *
  */
-int ext4_read_inline_dir(struct file *filp,
-			 void *dirent, filldir_t filldir,
+int ext4_read_inline_dir(struct file *file,
+			 struct dir_context *ctx,
 			 int *has_inline_data)
 {
-	int error = 0;
 	unsigned int offset, parent_ino;
-	int i, stored;
+	int i;
 	struct ext4_dir_entry_2 *de;
 	struct super_block *sb;
-	struct inode *inode = file_inode(filp);
+	struct inode *inode = file_inode(file);
 	int ret, inline_size = 0;
 	struct ext4_iloc iloc;
 	void *dir_buf = NULL;
@@ -1444,9 +1443,8 @@
 		goto out;
 
 	sb = inode->i_sb;
-	stored = 0;
 	parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode);
-	offset = filp->f_pos;
+	offset = ctx->pos;
 
 	/*
 	 * dotdot_offset and dotdot_size is the real offset and
@@ -1460,104 +1458,74 @@
 	extra_offset = dotdot_size - EXT4_INLINE_DOTDOT_SIZE;
 	extra_size = extra_offset + inline_size;
 
-	while (!error && !stored && filp->f_pos < extra_size) {
-revalidate:
-		/*
-		 * If the version has changed since the last call to
-		 * readdir(2), then we might be pointing to an invalid
-		 * dirent right now.  Scan from the start of the inline
-		 * dir to make sure.
-		 */
-		if (filp->f_version != inode->i_version) {
-			for (i = 0; i < extra_size && i < offset;) {
-				/*
-				 * "." is with offset 0 and
-				 * ".." is dotdot_offset.
-				 */
-				if (!i) {
-					i = dotdot_offset;
-					continue;
-				} else if (i == dotdot_offset) {
-					i = dotdot_size;
-					continue;
-				}
-				/* for other entry, the real offset in
-				 * the buf has to be tuned accordingly.
-				 */
-				de = (struct ext4_dir_entry_2 *)
-					(dir_buf + i - extra_offset);
-				/* It's too expensive to do a full
-				 * dirent test each time round this
-				 * loop, but we do have to test at
-				 * least that it is non-zero.  A
-				 * failure will be detected in the
-				 * dirent test below. */
-				if (ext4_rec_len_from_disk(de->rec_len,
-					extra_size) < EXT4_DIR_REC_LEN(1))
-					break;
-				i += ext4_rec_len_from_disk(de->rec_len,
-							    extra_size);
-			}
-			offset = i;
-			filp->f_pos = offset;
-			filp->f_version = inode->i_version;
-		}
-
-		while (!error && filp->f_pos < extra_size) {
-			if (filp->f_pos == 0) {
-				error = filldir(dirent, ".", 1, 0, inode->i_ino,
-						DT_DIR);
-				if (error)
-					break;
-				stored++;
-				filp->f_pos = dotdot_offset;
+	/*
+	 * If the version has changed since the last call to
+	 * readdir(2), then we might be pointing to an invalid
+	 * dirent right now.  Scan from the start of the inline
+	 * dir to make sure.
+	 */
+	if (file->f_version != inode->i_version) {
+		for (i = 0; i < extra_size && i < offset;) {
+			/*
+			 * "." is with offset 0 and
+			 * ".." is dotdot_offset.
+			 */
+			if (!i) {
+				i = dotdot_offset;
+				continue;
+			} else if (i == dotdot_offset) {
+				i = dotdot_size;
 				continue;
 			}
-
-			if (filp->f_pos == dotdot_offset) {
-				error = filldir(dirent, "..", 2,
-						dotdot_offset,
-						parent_ino, DT_DIR);
-				if (error)
-					break;
-				stored++;
-
-				filp->f_pos = dotdot_size;
-				continue;
-			}
-
+			/* for other entry, the real offset in
+			 * the buf has to be tuned accordingly.
+			 */
 			de = (struct ext4_dir_entry_2 *)
-				(dir_buf + filp->f_pos - extra_offset);
-			if (ext4_check_dir_entry(inode, filp, de,
-						 iloc.bh, dir_buf,
-						 extra_size, filp->f_pos)) {
-				ret = stored;
-				goto out;
-			}
-			if (le32_to_cpu(de->inode)) {
-				/* We might block in the next section
-				 * if the data destination is
-				 * currently swapped out.  So, use a
-				 * version stamp to detect whether or
-				 * not the directory has been modified
-				 * during the copy operation.
-				 */
-				u64 version = filp->f_version;
-
-				error = filldir(dirent, de->name,
-						de->name_len,
-						filp->f_pos,
-						le32_to_cpu(de->inode),
-						get_dtype(sb, de->file_type));
-				if (error)
-					break;
-				if (version != filp->f_version)
-					goto revalidate;
-				stored++;
-			}
-			filp->f_pos += ext4_rec_len_from_disk(de->rec_len,
-							      extra_size);
+				(dir_buf + i - extra_offset);
+			/* It's too expensive to do a full
+			 * dirent test each time round this
+			 * loop, but we do have to test at
+			 * least that it is non-zero.  A
+			 * failure will be detected in the
+			 * dirent test below. */
+			if (ext4_rec_len_from_disk(de->rec_len, extra_size)
+				< EXT4_DIR_REC_LEN(1))
+				break;
+			i += ext4_rec_len_from_disk(de->rec_len,
+						    extra_size);
 		}
+		offset = i;
+		ctx->pos = offset;
+		file->f_version = inode->i_version;
+	}
+
+	while (ctx->pos < extra_size) {
+		if (ctx->pos == 0) {
+			if (!dir_emit(ctx, ".", 1, inode->i_ino, DT_DIR))
+				goto out;
+			ctx->pos = dotdot_offset;
+			continue;
+		}
+
+		if (ctx->pos == dotdot_offset) {
+			if (!dir_emit(ctx, "..", 2, parent_ino, DT_DIR))
+				goto out;
+			ctx->pos = dotdot_size;
+			continue;
+		}
+
+		de = (struct ext4_dir_entry_2 *)
+			(dir_buf + ctx->pos - extra_offset);
+		if (ext4_check_dir_entry(inode, file, de, iloc.bh, dir_buf,
+					 extra_size, ctx->pos))
+			goto out;
+		if (le32_to_cpu(de->inode)) {
+			if (!dir_emit(ctx, de->name, de->name_len,
+				      le32_to_cpu(de->inode),
+				      get_dtype(sb, de->file_type)))
+				goto out;
+		}
+		ctx->pos += ext4_rec_len_from_disk(de->rec_len, extra_size);
 	}
 out:
 	kfree(dir_buf);
@@ -1842,7 +1810,7 @@
 	if (error)
 		goto out;
 
-	physical = iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits;
+	physical = (__u64)iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits;
 	physical += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data;
 	physical += offsetof(struct ext4_inode, i_block);
 	length = i_size_read(inode);

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index d6382b8..0188e65 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c

@@ -132,12 +132,12 @@
 						   new_size);
 }
 
-static void ext4_invalidatepage(struct page *page, unsigned long offset);
+static void ext4_invalidatepage(struct page *page, unsigned int offset,
+				unsigned int length);
 static int __ext4_journalled_writepage(struct page *page, unsigned int len);
 static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
-static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
-		struct inode *inode, struct page *page, loff_t from,
-		loff_t length, int flags);
+static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
+				  int pextents);
 
 /*
  * Test whether an inode is a fast symlink.
@@ -215,7 +215,8 @@
 			filemap_write_and_wait(&inode->i_data);
 		}
 		truncate_inode_pages(&inode->i_data, 0);
-		ext4_ioend_shutdown(inode);
+
+		WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));
 		goto no_delete;
 	}
 
@@ -225,8 +226,8 @@
 	if (ext4_should_order_data(inode))
 		ext4_begin_ordered_truncate(inode, 0);
 	truncate_inode_pages(&inode->i_data, 0);
-	ext4_ioend_shutdown(inode);
 
+	WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));
 	if (is_bad_inode(inode))
 		goto no_delete;
 
@@ -423,66 +424,6 @@
 #define check_block_validity(inode, map)	\
 	__check_block_validity((inode), __func__, __LINE__, (map))
 
-/*
- * Return the number of contiguous dirty pages in a given inode
- * starting at page frame idx.
- */
-static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
-				    unsigned int max_pages)
-{
-	struct address_space *mapping = inode->i_mapping;
-	pgoff_t	index;
-	struct pagevec pvec;
-	pgoff_t num = 0;
-	int i, nr_pages, done = 0;
-
-	if (max_pages == 0)
-		return 0;
-	pagevec_init(&pvec, 0);
-	while (!done) {
-		index = idx;
-		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
-					      PAGECACHE_TAG_DIRTY,
-					      (pgoff_t)PAGEVEC_SIZE);
-		if (nr_pages == 0)
-			break;
-		for (i = 0; i < nr_pages; i++) {
-			struct page *page = pvec.pages[i];
-			struct buffer_head *bh, *head;
-
-			lock_page(page);
-			if (unlikely(page->mapping != mapping) ||
-			    !PageDirty(page) ||
-			    PageWriteback(page) ||
-			    page->index != idx) {
-				done = 1;
-				unlock_page(page);
-				break;
-			}
-			if (page_has_buffers(page)) {
-				bh = head = page_buffers(page);
-				do {
-					if (!buffer_delay(bh) &&
-					    !buffer_unwritten(bh))
-						done = 1;
-					bh = bh->b_this_page;
-				} while (!done && (bh != head));
-			}
-			unlock_page(page);
-			if (done)
-				break;
-			idx++;
-			num++;
-			if (num >= max_pages) {
-				done = 1;
-				break;
-			}
-		}
-		pagevec_release(&pvec);
-	}
-	return num;
-}
-
 #ifdef ES_AGGRESSIVE_TEST
 static void ext4_map_blocks_es_recheck(handle_t *handle,
 				       struct inode *inode,
@@ -573,6 +514,8 @@
 		  "logical block %lu\n", inode->i_ino, flags, map->m_len,
 		  (unsigned long) map->m_lblk);
 
+	ext4_es_lru_add(inode);
+
 	/* Lookup extent status tree firstly */
 	if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
 		if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
@@ -1118,10 +1061,13 @@
 		}
 	}
 
-	if (ext4_has_inline_data(inode))
-		copied = ext4_write_inline_data_end(inode, pos, len,
-						    copied, page);
-	else
+	if (ext4_has_inline_data(inode)) {
+		ret = ext4_write_inline_data_end(inode, pos, len,
+						 copied, page);
+		if (ret < 0)
+			goto errout;
+		copied = ret;
+	} else
 		copied = block_write_end(file, mapping, pos,
 					 len, copied, page, fsdata);
 
@@ -1157,8 +1103,6 @@
 	if (i_size_changed)
 		ext4_mark_inode_dirty(handle, inode);
 
-	if (copied < 0)
-		ret = copied;
 	if (pos + len > inode->i_size && ext4_can_truncate(inode))
 		/* if we have allocated more blocks and copied
 		 * less. We will have blocks allocated outside
@@ -1415,21 +1359,28 @@
 }
 
 static void ext4_da_page_release_reservation(struct page *page,
-					     unsigned long offset)
+					     unsigned int offset,
+					     unsigned int length)
 {
 	int to_release = 0;
 	struct buffer_head *head, *bh;
 	unsigned int curr_off = 0;
 	struct inode *inode = page->mapping->host;
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	unsigned int stop = offset + length;
 	int num_clusters;
 	ext4_fsblk_t lblk;
 
+	BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
+
 	head = page_buffers(page);
 	bh = head;
 	do {
 		unsigned int next_off = curr_off + bh->b_size;
 
+		if (next_off > stop)
+			break;
+
 		if ((offset <= curr_off) && (buffer_delay(bh))) {
 			to_release++;
 			clear_buffer_delay(bh);
@@ -1460,140 +1411,43 @@
  * Delayed allocation stuff
  */
 
-/*
- * mpage_da_submit_io - walks through extent of pages and try to write
- * them with writepage() call back
- *
- * @mpd->inode: inode
- * @mpd->first_page: first page of the extent
- * @mpd->next_page: page after the last page of the extent
- *
- * By the time mpage_da_submit_io() is called we expect all blocks
- * to be allocated. this may be wrong if allocation failed.
- *
- * As pages are already locked by write_cache_pages(), we can't use it
- */
-static int mpage_da_submit_io(struct mpage_da_data *mpd,
-			      struct ext4_map_blocks *map)
-{
-	struct pagevec pvec;
-	unsigned long index, end;
-	int ret = 0, err, nr_pages, i;
-	struct inode *inode = mpd->inode;
-	struct address_space *mapping = inode->i_mapping;
-	loff_t size = i_size_read(inode);
-	unsigned int len, block_start;
-	struct buffer_head *bh, *page_bufs = NULL;
-	sector_t pblock = 0, cur_logical = 0;
-	struct ext4_io_submit io_submit;
+struct mpage_da_data {
+	struct inode *inode;
+	struct writeback_control *wbc;
 
-	BUG_ON(mpd->next_page <= mpd->first_page);
-	memset(&io_submit, 0, sizeof(io_submit));
+	pgoff_t first_page;	/* The first page to write */
+	pgoff_t next_page;	/* Current page to examine */
+	pgoff_t last_page;	/* Last page to examine */
 	/*
-	 * We need to start from the first_page to the next_page - 1
-	 * to make sure we also write the mapped dirty buffer_heads.
-	 * If we look at mpd->b_blocknr we would only be looking
-	 * at the currently mapped buffer_heads.
+	 * Extent to map - this can be after first_page because that can be
+	 * fully mapped. We somewhat abuse m_flags to store whether the extent
+	 * is delalloc or unwritten.
 	 */
-	index = mpd->first_page;
-	end = mpd->next_page - 1;
+	struct ext4_map_blocks map;
+	struct ext4_io_submit io_submit;	/* IO submission data */
+};
 
-	pagevec_init(&pvec, 0);
-	while (index <= end) {
-		nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
-		if (nr_pages == 0)
-			break;
-		for (i = 0; i < nr_pages; i++) {
-			int skip_page = 0;
-			struct page *page = pvec.pages[i];
-
-			index = page->index;
-			if (index > end)
-				break;
-
-			if (index == size >> PAGE_CACHE_SHIFT)
-				len = size & ~PAGE_CACHE_MASK;
-			else
-				len = PAGE_CACHE_SIZE;
-			if (map) {
-				cur_logical = index << (PAGE_CACHE_SHIFT -
-							inode->i_blkbits);
-				pblock = map->m_pblk + (cur_logical -
-							map->m_lblk);
-			}
-			index++;
-
-			BUG_ON(!PageLocked(page));
-			BUG_ON(PageWriteback(page));
-
-			bh = page_bufs = page_buffers(page);
-			block_start = 0;
-			do {
-				if (map && (cur_logical >= map->m_lblk) &&
-				    (cur_logical <= (map->m_lblk +
-						     (map->m_len - 1)))) {
-					if (buffer_delay(bh)) {
-						clear_buffer_delay(bh);
-						bh->b_blocknr = pblock;
-					}
-					if (buffer_unwritten(bh) ||
-					    buffer_mapped(bh))
-						BUG_ON(bh->b_blocknr != pblock);
-					if (map->m_flags & EXT4_MAP_UNINIT)
-						set_buffer_uninit(bh);
-					clear_buffer_unwritten(bh);
-				}
-
-				/*
-				 * skip page if block allocation undone and
-				 * block is dirty
-				 */
-				if (ext4_bh_delay_or_unwritten(NULL, bh))
-					skip_page = 1;
-				bh = bh->b_this_page;
-				block_start += bh->b_size;
-				cur_logical++;
-				pblock++;
-			} while (bh != page_bufs);
-
-			if (skip_page) {
-				unlock_page(page);
-				continue;
-			}
-
-			clear_page_dirty_for_io(page);
-			err = ext4_bio_write_page(&io_submit, page, len,
-						  mpd->wbc);
-			if (!err)
-				mpd->pages_written++;
-			/*
-			 * In error case, we have to continue because
-			 * remaining pages are still locked
-			 */
-			if (ret == 0)
-				ret = err;
-		}
-		pagevec_release(&pvec);
-	}
-	ext4_io_submit(&io_submit);
-	return ret;
-}
-
-static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd)
+static void mpage_release_unused_pages(struct mpage_da_data *mpd,
+				       bool invalidate)
 {
 	int nr_pages, i;
 	pgoff_t index, end;
 	struct pagevec pvec;
 	struct inode *inode = mpd->inode;
 	struct address_space *mapping = inode->i_mapping;
-	ext4_lblk_t start, last;
+
+	/* This is necessary when next_page == 0. */
+	if (mpd->first_page >= mpd->next_page)
+		return;
 
 	index = mpd->first_page;
 	end   = mpd->next_page - 1;
-
-	start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
-	last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits);
-	ext4_es_remove_extent(inode, start, last - start + 1);
+	if (invalidate) {
+		ext4_lblk_t start, last;
+		start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+		last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+		ext4_es_remove_extent(inode, start, last - start + 1);
+	}
 
 	pagevec_init(&pvec, 0);
 	while (index <= end) {
@@ -1606,14 +1460,15 @@
 				break;
 			BUG_ON(!PageLocked(page));
 			BUG_ON(PageWriteback(page));
-			block_invalidatepage(page, 0);
-			ClearPageUptodate(page);
+			if (invalidate) {
+				block_invalidatepage(page, 0, PAGE_CACHE_SIZE);
+				ClearPageUptodate(page);
+			}
 			unlock_page(page);
 		}
 		index = pvec.pages[nr_pages - 1]->index + 1;
 		pagevec_release(&pvec);
 	}
-	return;
 }
 
 static void ext4_print_free_blocks(struct inode *inode)
@@ -1642,215 +1497,6 @@
 	return;
 }
 
-/*
- * mpage_da_map_and_submit - go through given space, map them
- *       if necessary, and then submit them for I/O
- *
- * @mpd - bh describing space
- *
- * The function skips space we know is already mapped to disk blocks.
- *
- */
-static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
-{
-	int err, blks, get_blocks_flags;
-	struct ext4_map_blocks map, *mapp = NULL;
-	sector_t next = mpd->b_blocknr;
-	unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
-	loff_t disksize = EXT4_I(mpd->inode)->i_disksize;
-	handle_t *handle = NULL;
-
-	/*
-	 * If the blocks are mapped already, or we couldn't accumulate
-	 * any blocks, then proceed immediately to the submission stage.
-	 */
-	if ((mpd->b_size == 0) ||
-	    ((mpd->b_state  & (1 << BH_Mapped)) &&
-	     !(mpd->b_state & (1 << BH_Delay)) &&
-	     !(mpd->b_state & (1 << BH_Unwritten))))
-		goto submit_io;
-
-	handle = ext4_journal_current_handle();
-	BUG_ON(!handle);
-
-	/*
-	 * Call ext4_map_blocks() to allocate any delayed allocation
-	 * blocks, or to convert an uninitialized extent to be
-	 * initialized (in the case where we have written into
-	 * one or more preallocated blocks).
-	 *
-	 * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE to
-	 * indicate that we are on the delayed allocation path.  This
-	 * affects functions in many different parts of the allocation
-	 * call path.  This flag exists primarily because we don't
-	 * want to change *many* call functions, so ext4_map_blocks()
-	 * will set the EXT4_STATE_DELALLOC_RESERVED flag once the
-	 * inode's allocation semaphore is taken.
-	 *
-	 * If the blocks in questions were delalloc blocks, set
-	 * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting
-	 * variables are updated after the blocks have been allocated.
-	 */
-	map.m_lblk = next;
-	map.m_len = max_blocks;
-	/*
-	 * We're in delalloc path and it is possible that we're going to
-	 * need more metadata blocks than previously reserved. However
-	 * we must not fail because we're in writeback and there is
-	 * nothing we can do about it so it might result in data loss.
-	 * So use reserved blocks to allocate metadata if possible.
-	 */
-	get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
-			   EXT4_GET_BLOCKS_METADATA_NOFAIL;
-	if (ext4_should_dioread_nolock(mpd->inode))
-		get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
-	if (mpd->b_state & (1 << BH_Delay))
-		get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
-
-
-	blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags);
-	if (blks < 0) {
-		struct super_block *sb = mpd->inode->i_sb;
-
-		err = blks;
-		/*
-		 * If get block returns EAGAIN or ENOSPC and there
-		 * appears to be free blocks we will just let
-		 * mpage_da_submit_io() unlock all of the pages.
-		 */
-		if (err == -EAGAIN)
-			goto submit_io;
-
-		if (err == -ENOSPC && ext4_count_free_clusters(sb)) {
-			mpd->retval = err;
-			goto submit_io;
-		}
-
-		/*
-		 * get block failure will cause us to loop in
-		 * writepages, because a_ops->writepage won't be able
-		 * to make progress. The page will be redirtied by
-		 * writepage and writepages will again try to write
-		 * the same.
-		 */
-		if (!(EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) {
-			ext4_msg(sb, KERN_CRIT,
-				 "delayed block allocation failed for inode %lu "
-				 "at logical offset %llu with max blocks %zd "
-				 "with error %d", mpd->inode->i_ino,
-				 (unsigned long long) next,
-				 mpd->b_size >> mpd->inode->i_blkbits, err);
-			ext4_msg(sb, KERN_CRIT,
-				"This should not happen!! Data will be lost");
-			if (err == -ENOSPC)
-				ext4_print_free_blocks(mpd->inode);
-		}
-		/* invalidate all the pages */
-		ext4_da_block_invalidatepages(mpd);
-
-		/* Mark this page range as having been completed */
-		mpd->io_done = 1;
-		return;
-	}
-	BUG_ON(blks == 0);
-
-	mapp = &map;
-	if (map.m_flags & EXT4_MAP_NEW) {
-		struct block_device *bdev = mpd->inode->i_sb->s_bdev;
-		int i;
-
-		for (i = 0; i < map.m_len; i++)
-			unmap_underlying_metadata(bdev, map.m_pblk + i);
-	}
-
-	/*
-	 * Update on-disk size along with block allocation.
-	 */
-	disksize = ((loff_t) next + blks) << mpd->inode->i_blkbits;
-	if (disksize > i_size_read(mpd->inode))
-		disksize = i_size_read(mpd->inode);
-	if (disksize > EXT4_I(mpd->inode)->i_disksize) {
-		ext4_update_i_disksize(mpd->inode, disksize);
-		err = ext4_mark_inode_dirty(handle, mpd->inode);
-		if (err)
-			ext4_error(mpd->inode->i_sb,
-				   "Failed to mark inode %lu dirty",
-				   mpd->inode->i_ino);
-	}
-
-submit_io:
-	mpage_da_submit_io(mpd, mapp);
-	mpd->io_done = 1;
-}
-
-#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
-		(1 << BH_Delay) | (1 << BH_Unwritten))
-
-/*
- * mpage_add_bh_to_extent - try to add one more block to extent of blocks
- *
- * @mpd->lbh - extent of blocks
- * @logical - logical number of the block in the file
- * @b_state - b_state of the buffer head added
- *
- * the function is used to collect contig. blocks in same state
- */
-static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, sector_t logical,
-				   unsigned long b_state)
-{
-	sector_t next;
-	int blkbits = mpd->inode->i_blkbits;
-	int nrblocks = mpd->b_size >> blkbits;
-
-	/*
-	 * XXX Don't go larger than mballoc is willing to allocate
-	 * This is a stopgap solution.  We eventually need to fold
-	 * mpage_da_submit_io() into this function and then call
-	 * ext4_map_blocks() multiple times in a loop
-	 */
-	if (nrblocks >= (8*1024*1024 >> blkbits))
-		goto flush_it;
-
-	/* check if the reserved journal credits might overflow */
-	if (!ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS)) {
-		if (nrblocks >= EXT4_MAX_TRANS_DATA) {
-			/*
-			 * With non-extent format we are limited by the journal
-			 * credit available.  Total credit needed to insert
-			 * nrblocks contiguous blocks is dependent on the
-			 * nrblocks.  So limit nrblocks.
-			 */
-			goto flush_it;
-		}
-	}
-	/*
-	 * First block in the extent
-	 */
-	if (mpd->b_size == 0) {
-		mpd->b_blocknr = logical;
-		mpd->b_size = 1 << blkbits;
-		mpd->b_state = b_state & BH_FLAGS;
-		return;
-	}
-
-	next = mpd->b_blocknr + nrblocks;
-	/*
-	 * Can we merge the block to our big extent?
-	 */
-	if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) {
-		mpd->b_size += 1 << blkbits;
-		return;
-	}
-
-flush_it:
-	/*
-	 * We couldn't merge the block to our extent, so we
-	 * need to flush current  extent and start new one
-	 */
-	mpage_da_map_and_submit(mpd);
-	return;
-}
-
 static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
 {
 	return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh);
@@ -1883,6 +1529,8 @@
 		  "logical block %lu\n", inode->i_ino, map->m_len,
 		  (unsigned long) map->m_lblk);
 
+	ext4_es_lru_add(inode);
+
 	/* Lookup extent status tree firstly */
 	if (ext4_es_lookup_extent(inode, iblock, &es)) {
 
@@ -2156,7 +1804,7 @@
  * lock so we have to do some magic.
  *
  * This function can get called via...
- *   - ext4_da_writepages after taking page lock (have journal handle)
+ *   - ext4_writepages after taking page lock (have journal handle)
  *   - journal_submit_inode_data_buffers (no journal handle)
  *   - shrink_page_list via the kswapd/direct reclaim (no journal handle)
  *   - grab_page_cache when doing write_begin (have journal handle)
@@ -2234,76 +1882,405 @@
 		 */
 		return __ext4_journalled_writepage(page, len);
 
-	memset(&io_submit, 0, sizeof(io_submit));
+	ext4_io_submit_init(&io_submit, wbc);
+	io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS);
+	if (!io_submit.io_end) {
+		redirty_page_for_writepage(wbc, page);
+		unlock_page(page);
+		return -ENOMEM;
+	}
 	ret = ext4_bio_write_page(&io_submit, page, len, wbc);
 	ext4_io_submit(&io_submit);
+	/* Drop io_end reference we got from init */
+	ext4_put_io_end_defer(io_submit.io_end);
 	return ret;
 }
 
+#define BH_FLAGS ((1 << BH_Unwritten) | (1 << BH_Delay))
+
 /*
- * This is called via ext4_da_writepages() to
- * calculate the total number of credits to reserve to fit
- * a single extent allocation into a single transaction,
- * ext4_da_writpeages() will loop calling this before
- * the block allocation.
+ * mballoc gives us at most this number of blocks...
+ * XXX: That seems to be only a limitation of ext4_mb_normalize_request().
+ * The rest of mballoc seems to handle chunks up to full group size.
  */
+#define MAX_WRITEPAGES_EXTENT_LEN 2048
 
-static int ext4_da_writepages_trans_blocks(struct inode *inode)
+/*
+ * mpage_add_bh_to_extent - try to add bh to extent of blocks to map
+ *
+ * @mpd - extent of blocks
+ * @lblk - logical number of the block in the file
+ * @b_state - b_state of the buffer head added
+ *
+ * the function is used to collect contig. blocks in same state
+ */
+static int mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk,
+				  unsigned long b_state)
 {
-	int max_blocks = EXT4_I(inode)->i_reserved_data_blocks;
+	struct ext4_map_blocks *map = &mpd->map;
 
-	/*
-	 * With non-extent format the journal credit needed to
-	 * insert nrblocks contiguous block is dependent on
-	 * number of contiguous block. So we will limit
-	 * number of contiguous block to a sane value
-	 */
-	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) &&
-	    (max_blocks > EXT4_MAX_TRANS_DATA))
-		max_blocks = EXT4_MAX_TRANS_DATA;
+	/* Don't go larger than mballoc is willing to allocate */
+	if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN)
+		return 0;
 
-	return ext4_chunk_trans_blocks(inode, max_blocks);
+	/* First block in the extent? */
+	if (map->m_len == 0) {
+		map->m_lblk = lblk;
+		map->m_len = 1;
+		map->m_flags = b_state & BH_FLAGS;
+		return 1;
+	}
+
+	/* Can we merge the block to our big extent? */
+	if (lblk == map->m_lblk + map->m_len &&
+	    (b_state & BH_FLAGS) == map->m_flags) {
+		map->m_len++;
+		return 1;
+	}
+	return 0;
+}
+
+static bool add_page_bufs_to_extent(struct mpage_da_data *mpd,
+				    struct buffer_head *head,
+				    struct buffer_head *bh,
+				    ext4_lblk_t lblk)
+{
+	struct inode *inode = mpd->inode;
+	ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1)
+							>> inode->i_blkbits;
+
+	do {
+		BUG_ON(buffer_locked(bh));
+
+		if (!buffer_dirty(bh) || !buffer_mapped(bh) ||
+		    (!buffer_delay(bh) && !buffer_unwritten(bh)) ||
+		    lblk >= blocks) {
+			/* Found extent to map? */
+			if (mpd->map.m_len)
+				return false;
+			if (lblk >= blocks)
+				return true;
+			continue;
+		}
+		if (!mpage_add_bh_to_extent(mpd, lblk, bh->b_state))
+			return false;
+	} while (lblk++, (bh = bh->b_this_page) != head);
+	return true;
+}
+
+static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page)
+{
+	int len;
+	loff_t size = i_size_read(mpd->inode);
+	int err;
+
+	BUG_ON(page->index != mpd->first_page);
+	if (page->index == size >> PAGE_CACHE_SHIFT)
+		len = size & ~PAGE_CACHE_MASK;
+	else
+		len = PAGE_CACHE_SIZE;
+	clear_page_dirty_for_io(page);
+	err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc);
+	if (!err)
+		mpd->wbc->nr_to_write--;
+	mpd->first_page++;
+
+	return err;
 }
 
 /*
- * write_cache_pages_da - walk the list of dirty pages of the given
- * address space and accumulate pages that need writing, and call
- * mpage_da_map_and_submit to map a single contiguous memory region
- * and then write them.
+ * mpage_map_and_submit_buffers - update buffers corresponding to changed
+ *				  extent and submit fully mapped pages for IO
+ *
+ * @mpd - description of extent to map, on return next extent to map
+ *
+ * Scan buffers corresponding to changed extent (we expect corresponding pages
+ * to be already locked) and update buffer state according to new extent state.
+ * We map delalloc buffers to their physical location, clear unwritten bits,
+ * and mark buffers as uninit when we perform writes to uninitialized extents
+ * and do extent conversion after IO is finished. If the last page is not fully
+ * mapped, we update mpd->map to the next extent in the last page that needs
+ * mapping. Otherwise we submit the page for IO.
  */
-static int write_cache_pages_da(handle_t *handle,
-				struct address_space *mapping,
-				struct writeback_control *wbc,
-				struct mpage_da_data *mpd,
-				pgoff_t *done_index)
+static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
 {
-	struct buffer_head	*bh, *head;
-	struct inode		*inode = mapping->host;
-	struct pagevec		pvec;
-	unsigned int		nr_pages;
-	sector_t		logical;
-	pgoff_t			index, end;
-	long			nr_to_write = wbc->nr_to_write;
-	int			i, tag, ret = 0;
+	struct pagevec pvec;
+	int nr_pages, i;
+	struct inode *inode = mpd->inode;
+	struct buffer_head *head, *bh;
+	int bpp_bits = PAGE_CACHE_SHIFT - inode->i_blkbits;
+	ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1)
+							>> inode->i_blkbits;
+	pgoff_t start, end;
+	ext4_lblk_t lblk;
+	sector_t pblock;
+	int err;
 
-	memset(mpd, 0, sizeof(struct mpage_da_data));
-	mpd->wbc = wbc;
-	mpd->inode = inode;
+	start = mpd->map.m_lblk >> bpp_bits;
+	end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits;
+	lblk = start << bpp_bits;
+	pblock = mpd->map.m_pblk;
+
 	pagevec_init(&pvec, 0);
-	index = wbc->range_start >> PAGE_CACHE_SHIFT;
-	end = wbc->range_end >> PAGE_CACHE_SHIFT;
+	while (start <= end) {
+		nr_pages = pagevec_lookup(&pvec, inode->i_mapping, start,
+					  PAGEVEC_SIZE);
+		if (nr_pages == 0)
+			break;
+		for (i = 0; i < nr_pages; i++) {
+			struct page *page = pvec.pages[i];
 
-	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
+			if (page->index > end)
+				break;
+			/* Up to 'end' pages must be contiguous */
+			BUG_ON(page->index != start);
+			bh = head = page_buffers(page);
+			do {
+				if (lblk < mpd->map.m_lblk)
+					continue;
+				if (lblk >= mpd->map.m_lblk + mpd->map.m_len) {
+					/*
+					 * Buffer after end of mapped extent.
+					 * Find next buffer in the page to map.
+					 */
+					mpd->map.m_len = 0;
+					mpd->map.m_flags = 0;
+					add_page_bufs_to_extent(mpd, head, bh,
+								lblk);
+					pagevec_release(&pvec);
+					return 0;
+				}
+				if (buffer_delay(bh)) {
+					clear_buffer_delay(bh);
+					bh->b_blocknr = pblock++;
+				}
+				clear_buffer_unwritten(bh);
+			} while (++lblk < blocks &&
+				 (bh = bh->b_this_page) != head);
+
+			/*
+			 * FIXME: This is going to break if dioread_nolock
+			 * supports blocksize < pagesize as we will try to
+			 * convert potentially unmapped parts of inode.
+			 */
+			mpd->io_submit.io_end->size += PAGE_CACHE_SIZE;
+			/* Page fully mapped - let IO run! */
+			err = mpage_submit_page(mpd, page);
+			if (err < 0) {
+				pagevec_release(&pvec);
+				return err;
+			}
+			start++;
+		}
+		pagevec_release(&pvec);
+	}
+	/* Extent fully mapped and matches with page boundary. We are done. */
+	mpd->map.m_len = 0;
+	mpd->map.m_flags = 0;
+	return 0;
+}
+
+static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
+{
+	struct inode *inode = mpd->inode;
+	struct ext4_map_blocks *map = &mpd->map;
+	int get_blocks_flags;
+	int err;
+
+	trace_ext4_da_write_pages_extent(inode, map);
+	/*
+	 * Call ext4_map_blocks() to allocate any delayed allocation blocks, or
+	 * to convert an uninitialized extent to be initialized (in the case
+	 * where we have written into one or more preallocated blocks).  It is
+	 * possible that we're going to need more metadata blocks than
+	 * previously reserved. However we must not fail because we're in
+	 * writeback and there is nothing we can do about it so it might result
+	 * in data loss.  So use reserved blocks to allocate metadata if
+	 * possible.
+	 *
+	 * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE if the blocks
+	 * in question are delalloc blocks.  This affects functions in many
+	 * different parts of the allocation call path.  This flag exists
+	 * primarily because we don't want to change *many* call functions, so
+	 * ext4_map_blocks() will set the EXT4_STATE_DELALLOC_RESERVED flag
+	 * once the inode's allocation semaphore is taken.
+	 */
+	get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
+			   EXT4_GET_BLOCKS_METADATA_NOFAIL;
+	if (ext4_should_dioread_nolock(inode))
+		get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
+	if (map->m_flags & (1 << BH_Delay))
+		get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
+
+	err = ext4_map_blocks(handle, inode, map, get_blocks_flags);
+	if (err < 0)
+		return err;
+	if (map->m_flags & EXT4_MAP_UNINIT) {
+		if (!mpd->io_submit.io_end->handle &&
+		    ext4_handle_valid(handle)) {
+			mpd->io_submit.io_end->handle = handle->h_rsv_handle;
+			handle->h_rsv_handle = NULL;
+		}
+		ext4_set_io_unwritten_flag(inode, mpd->io_submit.io_end);
+	}
+
+	BUG_ON(map->m_len == 0);
+	if (map->m_flags & EXT4_MAP_NEW) {
+		struct block_device *bdev = inode->i_sb->s_bdev;
+		int i;
+
+		for (i = 0; i < map->m_len; i++)
+			unmap_underlying_metadata(bdev, map->m_pblk + i);
+	}
+	return 0;
+}
+
+/*
+ * mpage_map_and_submit_extent - map extent described by mpd->map and submit
+ *				 pages underlying it for IO
+ *
+ * @handle - handle for journal operations
+ * @mpd - extent to map
+ *
+ * The function maps the extent described by mpd->map (m_lblk, m_len). If it is
+ * delayed, blocks are allocated, if it is unwritten, we may need to convert
+ * them to initialized or split the described range from larger unwritten
+ * extent. Note that we need not map all the described range since allocation
+ * can return less blocks or the range is covered by more unwritten extents. We
+ * cannot map more because we are limited by reserved transaction credits. On
+ * the other hand we always make sure that the last touched page is fully
+ * mapped so that it can be written out (and thus forward progress is
+ * guaranteed). After mapping we submit all mapped pages for IO.
+ */
+static int mpage_map_and_submit_extent(handle_t *handle,
+				       struct mpage_da_data *mpd,
+				       bool *give_up_on_write)
+{
+	struct inode *inode = mpd->inode;
+	struct ext4_map_blocks *map = &mpd->map;
+	int err;
+	loff_t disksize;
+
+	mpd->io_submit.io_end->offset =
+				((loff_t)map->m_lblk) << inode->i_blkbits;
+	while (map->m_len) {
+		err = mpage_map_one_extent(handle, mpd);
+		if (err < 0) {
+			struct super_block *sb = inode->i_sb;
+
+			if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
+				goto invalidate_dirty_pages;
+			/*
+			 * Let the upper layers retry transient errors.
+			 * In the case of ENOSPC, if ext4_count_free_clusters()
+			 * is non-zero, a commit should free up blocks.
+			 */
+			if ((err == -ENOMEM) ||
+			    (err == -ENOSPC && ext4_count_free_clusters(sb)))
+				return err;
+			ext4_msg(sb, KERN_CRIT,
+				 "Delayed block allocation failed for "
+				 "inode %lu at logical offset %llu with"
+				 " max blocks %u with error %d",
+				 inode->i_ino,
+				 (unsigned long long)map->m_lblk,
+				 (unsigned)map->m_len, -err);
+			ext4_msg(sb, KERN_CRIT,
+				 "This should not happen!! Data will "
+				 "be lost\n");
+			if (err == -ENOSPC)
+				ext4_print_free_blocks(inode);
+		invalidate_dirty_pages:
+			*give_up_on_write = true;
+			return err;
+		}
+		/*
+		 * Update buffer state, submit mapped pages, and get us new
+		 * extent to map
+		 */
+		err = mpage_map_and_submit_buffers(mpd);
+		if (err < 0)
+			return err;
+	}
+
+	/* Update on-disk size after IO is submitted */
+	disksize = ((loff_t)mpd->first_page) << PAGE_CACHE_SHIFT;
+	if (disksize > i_size_read(inode))
+		disksize = i_size_read(inode);
+	if (disksize > EXT4_I(inode)->i_disksize) {
+		int err2;
+
+		ext4_update_i_disksize(inode, disksize);
+		err2 = ext4_mark_inode_dirty(handle, inode);
+		if (err2)
+			ext4_error(inode->i_sb,
+				   "Failed to mark inode %lu dirty",
+				   inode->i_ino);
+		if (!err)
+			err = err2;
+	}
+	return err;
+}
+
+/*
+ * Calculate the total number of credits to reserve for one writepages
+ * iteration. This is called from ext4_writepages(). We map an extent of
+ * up to MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping
+ * the last partial page. So in total we can map MAX_WRITEPAGES_EXTENT_LEN +
+ * bpp - 1 blocks in bpp different extents.
+ */
+static int ext4_da_writepages_trans_blocks(struct inode *inode)
+{
+	int bpp = ext4_journal_blocks_per_page(inode);
+
+	return ext4_meta_trans_blocks(inode,
+				MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp);
+}
+
+/*
+ * mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages
+ * 				 and underlying extent to map
+ *
+ * @mpd - where to look for pages
+ *
+ * Walk dirty pages in the mapping. If they are fully mapped, submit them for
+ * IO immediately. When we find a page which isn't mapped we start accumulating
+ * extent of buffers underlying these pages that needs mapping (formed by
+ * either delayed or unwritten buffers). We also lock the pages containing
+ * these buffers. The extent found is returned in @mpd structure (starting at
+ * mpd->map.m_lblk with length mpd->map.m_len blocks).
+ *
+ * Note that this function can attach bios to one io_end structure which are
+ * neither logically nor physically contiguous. Although it may seem as an
+ * unnecessary complication, it is actually inevitable in blocksize < pagesize
+ * case as we need to track IO to all buffers underlying a page in one io_end.
+ */
+static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
+{
+	struct address_space *mapping = mpd->inode->i_mapping;
+	struct pagevec pvec;
+	unsigned int nr_pages;
+	pgoff_t index = mpd->first_page;
+	pgoff_t end = mpd->last_page;
+	int tag;
+	int i, err = 0;
+	int blkbits = mpd->inode->i_blkbits;
+	ext4_lblk_t lblk;
+	struct buffer_head *head;
+
+	if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages)
 		tag = PAGECACHE_TAG_TOWRITE;
 	else
 		tag = PAGECACHE_TAG_DIRTY;
 
-	*done_index = index;
+	pagevec_init(&pvec, 0);
+	mpd->map.m_len = 0;
+	mpd->next_page = index;
 	while (index <= end) {
 		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
 			      min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
 		if (nr_pages == 0)
-			return 0;
+			goto out;
 
 		for (i = 0; i < nr_pages; i++) {
 			struct page *page = pvec.pages[i];
@@ -2318,31 +2295,21 @@
 			if (page->index > end)
 				goto out;
 
-			*done_index = page->index + 1;
-
-			/*
-			 * If we can't merge this page, and we have
-			 * accumulated an contiguous region, write it
-			 */
-			if ((mpd->next_page != page->index) &&
-			    (mpd->next_page != mpd->first_page)) {
-				mpage_da_map_and_submit(mpd);
-				goto ret_extent_tail;
-			}
+			/* If we can't merge this page, we are done. */
+			if (mpd->map.m_len > 0 && mpd->next_page != page->index)
+				goto out;
 
 			lock_page(page);
-
 			/*
-			 * If the page is no longer dirty, or its
-			 * mapping no longer corresponds to inode we
-			 * are writing (which means it has been
-			 * truncated or invalidated), or the page is
-			 * already under writeback and we are not
-			 * doing a data integrity writeback, skip the page
+			 * If the page is no longer dirty, or its mapping no
+			 * longer corresponds to inode we are writing (which
+			 * means it has been truncated or invalidated), or the
+			 * page is already under writeback and we are not doing
+			 * a data integrity writeback, skip the page
 			 */
 			if (!PageDirty(page) ||
 			    (PageWriteback(page) &&
-			     (wbc->sync_mode == WB_SYNC_NONE)) ||
+			     (mpd->wbc->sync_mode == WB_SYNC_NONE)) ||
 			    unlikely(page->mapping != mapping)) {
 				unlock_page(page);
 				continue;
@@ -2351,106 +2318,70 @@
 			wait_on_page_writeback(page);
 			BUG_ON(PageWriteback(page));
 
-			/*
-			 * If we have inline data and arrive here, it means that
-			 * we will soon create the block for the 1st page, so
-			 * we'd better clear the inline data here.
-			 */
-			if (ext4_has_inline_data(inode)) {
-				BUG_ON(ext4_test_inode_state(inode,
-						EXT4_STATE_MAY_INLINE_DATA));
-				ext4_destroy_inline_data(handle, inode);
-			}
-
-			if (mpd->next_page != page->index)
+			if (mpd->map.m_len == 0)
 				mpd->first_page = page->index;
 			mpd->next_page = page->index + 1;
-			logical = (sector_t) page->index <<
-				(PAGE_CACHE_SHIFT - inode->i_blkbits);
-
 			/* Add all dirty buffers to mpd */
+			lblk = ((ext4_lblk_t)page->index) <<
+				(PAGE_CACHE_SHIFT - blkbits);
 			head = page_buffers(page);
-			bh = head;
-			do {
-				BUG_ON(buffer_locked(bh));
-				/*
-				 * We need to try to allocate unmapped blocks
-				 * in the same page.  Otherwise we won't make
-				 * progress with the page in ext4_writepage
-				 */
-				if (ext4_bh_delay_or_unwritten(NULL, bh)) {
-					mpage_add_bh_to_extent(mpd, logical,
-							       bh->b_state);
-					if (mpd->io_done)
-						goto ret_extent_tail;
-				} else if (buffer_dirty(bh) &&
-					   buffer_mapped(bh)) {
-					/*
-					 * mapped dirty buffer. We need to
-					 * update the b_state because we look
-					 * at b_state in mpage_da_map_blocks.
-					 * We don't update b_size because if we
-					 * find an unmapped buffer_head later
-					 * we need to use the b_state flag of
-					 * that buffer_head.
-					 */
-					if (mpd->b_size == 0)
-						mpd->b_state =
-							bh->b_state & BH_FLAGS;
-				}
-				logical++;
-			} while ((bh = bh->b_this_page) != head);
-
-			if (nr_to_write > 0) {
-				nr_to_write--;
-				if (nr_to_write == 0 &&
-				    wbc->sync_mode == WB_SYNC_NONE)
-					/*
-					 * We stop writing back only if we are
-					 * not doing integrity sync. In case of
-					 * integrity sync we have to keep going
-					 * because someone may be concurrently
-					 * dirtying pages, and we might have
-					 * synced a lot of newly appeared dirty
-					 * pages, but have not synced all of the
-					 * old dirty pages.
-					 */
+			if (!add_page_bufs_to_extent(mpd, head, head, lblk))
+				goto out;
+			/* So far everything mapped? Submit the page for IO. */
+			if (mpd->map.m_len == 0) {
+				err = mpage_submit_page(mpd, page);
+				if (err < 0)
 					goto out;
 			}
+
+			/*
+			 * Accumulated enough dirty pages? This doesn't apply
+			 * to WB_SYNC_ALL mode. For integrity sync we have to
+			 * keep going because someone may be concurrently
+			 * dirtying pages, and we might have synced a lot of
+			 * newly appeared dirty pages, but have not synced all
+			 * of the old dirty pages.
+			 */
+			if (mpd->wbc->sync_mode == WB_SYNC_NONE &&
+			    mpd->next_page - mpd->first_page >=
+							mpd->wbc->nr_to_write)
+				goto out;
 		}
 		pagevec_release(&pvec);
 		cond_resched();
 	}
 	return 0;
-ret_extent_tail:
-	ret = MPAGE_DA_EXTENT_TAIL;
 out:
 	pagevec_release(&pvec);
-	cond_resched();
+	return err;
+}
+
+static int __writepage(struct page *page, struct writeback_control *wbc,
+		       void *data)
+{
+	struct address_space *mapping = data;
+	int ret = ext4_writepage(page, wbc);
+	mapping_set_error(mapping, ret);
 	return ret;
 }
 
-
-static int ext4_da_writepages(struct address_space *mapping,
-			      struct writeback_control *wbc)
+static int ext4_writepages(struct address_space *mapping,
+			   struct writeback_control *wbc)
 {
-	pgoff_t	index;
+	pgoff_t	writeback_index = 0;
+	long nr_to_write = wbc->nr_to_write;
 	int range_whole = 0;
+	int cycled = 1;
 	handle_t *handle = NULL;
 	struct mpage_da_data mpd;
 	struct inode *inode = mapping->host;
-	int pages_written = 0;
-	unsigned int max_pages;
-	int range_cyclic, cycled = 1, io_done = 0;
-	int needed_blocks, ret = 0;
-	long desired_nr_to_write, nr_to_writebump = 0;
-	loff_t range_start = wbc->range_start;
+	int needed_blocks, rsv_blocks = 0, ret = 0;
 	struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
-	pgoff_t done_index = 0;
-	pgoff_t end;
+	bool done;
 	struct blk_plug plug;
+	bool give_up_on_write = false;
 
-	trace_ext4_da_writepages(inode, wbc);
+	trace_ext4_writepages(inode, wbc);
 
 	/*
 	 * No pages to write? This is mainly a kludge to avoid starting
@@ -2460,164 +2391,165 @@
 	if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
 		return 0;
 
+	if (ext4_should_journal_data(inode)) {
+		struct blk_plug plug;
+		int ret;
+
+		blk_start_plug(&plug);
+		ret = write_cache_pages(mapping, wbc, __writepage, mapping);
+		blk_finish_plug(&plug);
+		return ret;
+	}
+
 	/*
 	 * If the filesystem has aborted, it is read-only, so return
 	 * right away instead of dumping stack traces later on that
 	 * will obscure the real source of the problem.  We test
 	 * EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because
 	 * the latter could be true if the filesystem is mounted
-	 * read-only, and in that case, ext4_da_writepages should
+	 * read-only, and in that case, ext4_writepages should
 	 * *never* be called, so if that ever happens, we would want
 	 * the stack trace.
 	 */
 	if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED))
 		return -EROFS;
 
-	if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
-		range_whole = 1;
-
-	range_cyclic = wbc->range_cyclic;
-	if (wbc->range_cyclic) {
-		index = mapping->writeback_index;
-		if (index)
-			cycled = 0;
-		wbc->range_start = index << PAGE_CACHE_SHIFT;
-		wbc->range_end  = LLONG_MAX;
-		wbc->range_cyclic = 0;
-		end = -1;
-	} else {
-		index = wbc->range_start >> PAGE_CACHE_SHIFT;
-		end = wbc->range_end >> PAGE_CACHE_SHIFT;
+	if (ext4_should_dioread_nolock(inode)) {
+		/*
+		 * We may need to convert upto one extent per block in
+		 * the page and we may dirty the inode.
+		 */
+		rsv_blocks = 1 + (PAGE_CACHE_SIZE >> inode->i_blkbits);
 	}
 
 	/*
-	 * This works around two forms of stupidity.  The first is in
-	 * the writeback code, which caps the maximum number of pages
-	 * written to be 1024 pages.  This is wrong on multiple
-	 * levels; different architectues have a different page size,
-	 * which changes the maximum amount of data which gets
-	 * written.  Secondly, 4 megabytes is way too small.  XFS
-	 * forces this value to be 16 megabytes by multiplying
-	 * nr_to_write parameter by four, and then relies on its
-	 * allocator to allocate larger extents to make them
-	 * contiguous.  Unfortunately this brings us to the second
-	 * stupidity, which is that ext4's mballoc code only allocates
-	 * at most 2048 blocks.  So we force contiguous writes up to
-	 * the number of dirty blocks in the inode, or
-	 * sbi->max_writeback_mb_bump whichever is smaller.
+	 * If we have inline data and arrive here, it means that
+	 * we will soon create the block for the 1st page, so
+	 * we'd better clear the inline data here.
 	 */
-	max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT);
-	if (!range_cyclic && range_whole) {
-		if (wbc->nr_to_write == LONG_MAX)
-			desired_nr_to_write = wbc->nr_to_write;
-		else
-			desired_nr_to_write = wbc->nr_to_write * 8;
-	} else
-		desired_nr_to_write = ext4_num_dirty_pages(inode, index,
-							   max_pages);
-	if (desired_nr_to_write > max_pages)
-		desired_nr_to_write = max_pages;
-
-	if (wbc->nr_to_write < desired_nr_to_write) {
-		nr_to_writebump = desired_nr_to_write - wbc->nr_to_write;
-		wbc->nr_to_write = desired_nr_to_write;
+	if (ext4_has_inline_data(inode)) {
+		/* Just inode will be modified... */
+		handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
+		if (IS_ERR(handle)) {
+			ret = PTR_ERR(handle);
+			goto out_writepages;
+		}
+		BUG_ON(ext4_test_inode_state(inode,
+				EXT4_STATE_MAY_INLINE_DATA));
+		ext4_destroy_inline_data(handle, inode);
+		ext4_journal_stop(handle);
 	}
 
+	if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+		range_whole = 1;
+
+	if (wbc->range_cyclic) {
+		writeback_index = mapping->writeback_index;
+		if (writeback_index)
+			cycled = 0;
+		mpd.first_page = writeback_index;
+		mpd.last_page = -1;
+	} else {
+		mpd.first_page = wbc->range_start >> PAGE_CACHE_SHIFT;
+		mpd.last_page = wbc->range_end >> PAGE_CACHE_SHIFT;
+	}
+
+	mpd.inode = inode;
+	mpd.wbc = wbc;
+	ext4_io_submit_init(&mpd.io_submit, wbc);
 retry:
 	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
-		tag_pages_for_writeback(mapping, index, end);
-
+		tag_pages_for_writeback(mapping, mpd.first_page, mpd.last_page);
+	done = false;
 	blk_start_plug(&plug);
-	while (!ret && wbc->nr_to_write > 0) {
+	while (!done && mpd.first_page <= mpd.last_page) {
+		/* For each extent of pages we use new io_end */
+		mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
+		if (!mpd.io_submit.io_end) {
+			ret = -ENOMEM;
+			break;
+		}
 
 		/*
-		 * we  insert one extent at a time. So we need
-		 * credit needed for single extent allocation.
-		 * journalled mode is currently not supported
-		 * by delalloc
+		 * We have two constraints: We find one extent to map and we
+		 * must always write out whole page (makes a difference when
+		 * blocksize < pagesize) so that we don't block on IO when we
+		 * try to write out the rest of the page. Journalled mode is
+		 * not supported by delalloc.
 		 */
 		BUG_ON(ext4_should_journal_data(inode));
 		needed_blocks = ext4_da_writepages_trans_blocks(inode);
 
-		/* start a new transaction*/
-		handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
-					    needed_blocks);
+		/* start a new transaction */
+		handle = ext4_journal_start_with_reserve(inode,
+				EXT4_HT_WRITE_PAGE, needed_blocks, rsv_blocks);
 		if (IS_ERR(handle)) {
 			ret = PTR_ERR(handle);
 			ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
 			       "%ld pages, ino %lu; err %d", __func__,
 				wbc->nr_to_write, inode->i_ino, ret);
-			blk_finish_plug(&plug);
-			goto out_writepages;
+			/* Release allocated io_end */
+			ext4_put_io_end(mpd.io_submit.io_end);
+			break;
 		}
 
-		/*
-		 * Now call write_cache_pages_da() to find the next
-		 * contiguous region of logical blocks that need
-		 * blocks to be allocated by ext4 and submit them.
-		 */
-		ret = write_cache_pages_da(handle, mapping,
-					   wbc, &mpd, &done_index);
-		/*
-		 * If we have a contiguous extent of pages and we
-		 * haven't done the I/O yet, map the blocks and submit
-		 * them for I/O.
-		 */
-		if (!mpd.io_done && mpd.next_page != mpd.first_page) {
-			mpage_da_map_and_submit(&mpd);
-			ret = MPAGE_DA_EXTENT_TAIL;
+		trace_ext4_da_write_pages(inode, mpd.first_page, mpd.wbc);
+		ret = mpage_prepare_extent_to_map(&mpd);
+		if (!ret) {
+			if (mpd.map.m_len)
+				ret = mpage_map_and_submit_extent(handle, &mpd,
+					&give_up_on_write);
+			else {
+				/*
+				 * We scanned the whole range (or exhausted
+				 * nr_to_write), submitted what was mapped and
+				 * didn't find anything needing mapping. We are
+				 * done.
+				 */
+				done = true;
+			}
 		}
-		trace_ext4_da_write_pages(inode, &mpd);
-		wbc->nr_to_write -= mpd.pages_written;
-
 		ext4_journal_stop(handle);
+		/* Submit prepared bio */
+		ext4_io_submit(&mpd.io_submit);
+		/* Unlock pages we didn't use */
+		mpage_release_unused_pages(&mpd, give_up_on_write);
+		/* Drop our io_end reference we got from init */
+		ext4_put_io_end(mpd.io_submit.io_end);
 
-		if ((mpd.retval == -ENOSPC) && sbi->s_journal) {
-			/* commit the transaction which would
+		if (ret == -ENOSPC && sbi->s_journal) {
+			/*
+			 * Commit the transaction which would
 			 * free blocks released in the transaction
 			 * and try again
 			 */
 			jbd2_journal_force_commit_nested(sbi->s_journal);
 			ret = 0;
-		} else if (ret == MPAGE_DA_EXTENT_TAIL) {
-			/*
-			 * Got one extent now try with rest of the pages.
-			 * If mpd.retval is set -EIO, journal is aborted.
-			 * So we don't need to write any more.
-			 */
-			pages_written += mpd.pages_written;
-			ret = mpd.retval;
-			io_done = 1;
-		} else if (wbc->nr_to_write)
-			/*
-			 * There is no more writeout needed
-			 * or we requested for a noblocking writeout
-			 * and we found the device congested
-			 */
+			continue;
+		}
+		/* Fatal error - ENOMEM, EIO... */
+		if (ret)
 			break;
 	}
 	blk_finish_plug(&plug);
-	if (!io_done && !cycled) {
+	if (!ret && !cycled) {
 		cycled = 1;
-		index = 0;
-		wbc->range_start = index << PAGE_CACHE_SHIFT;
-		wbc->range_end  = mapping->writeback_index - 1;
+		mpd.last_page = writeback_index - 1;
+		mpd.first_page = 0;
 		goto retry;
 	}
 
 	/* Update index */
-	wbc->range_cyclic = range_cyclic;
 	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
 		/*
-		 * set the writeback_index so that range_cyclic
+		 * Set the writeback_index so that range_cyclic
 		 * mode will write it back later
 		 */
-		mapping->writeback_index = done_index;
+		mapping->writeback_index = mpd.first_page;
 
 out_writepages:
-	wbc->nr_to_write -= nr_to_writebump;
-	wbc->range_start = range_start;
-	trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
+	trace_ext4_writepages_result(inode, wbc, ret,
+				     nr_to_write - wbc->nr_to_write);
 	return ret;
 }
 
@@ -2829,7 +2761,8 @@
 	return ret ? ret : copied;
 }
 
-static void ext4_da_invalidatepage(struct page *page, unsigned long offset)
+static void ext4_da_invalidatepage(struct page *page, unsigned int offset,
+				   unsigned int length)
 {
 	/*
 	 * Drop reserved blocks
@@ -2838,10 +2771,10 @@
 	if (!page_has_buffers(page))
 		goto out;
 
-	ext4_da_page_release_reservation(page, offset);
+	ext4_da_page_release_reservation(page, offset, length);
 
 out:
-	ext4_invalidatepage(page, offset);
+	ext4_invalidatepage(page, offset, length);
 
 	return;
 }
@@ -2864,7 +2797,7 @@
 	 * laptop_mode, not even desirable).  However, to do otherwise
 	 * would require replicating code paths in:
 	 *
-	 * ext4_da_writepages() ->
+	 * ext4_writepages() ->
 	 *    write_cache_pages() ---> (via passed in callback function)
 	 *        __mpage_da_writepage() -->
 	 *           mpage_add_bh_to_extent()
@@ -2989,37 +2922,40 @@
 	return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
 }
 
-static void ext4_invalidatepage(struct page *page, unsigned long offset)
+static void ext4_invalidatepage(struct page *page, unsigned int offset,
+				unsigned int length)
 {
-	trace_ext4_invalidatepage(page, offset);
+	trace_ext4_invalidatepage(page, offset, length);
 
 	/* No journalling happens on data buffers when this function is used */
 	WARN_ON(page_has_buffers(page) && buffer_jbd(page_buffers(page)));
 
-	block_invalidatepage(page, offset);
+	block_invalidatepage(page, offset, length);
 }
 
 static int __ext4_journalled_invalidatepage(struct page *page,
-					    unsigned long offset)
+					    unsigned int offset,
+					    unsigned int length)
 {
 	journal_t *journal = EXT4_JOURNAL(page->mapping->host);
 
-	trace_ext4_journalled_invalidatepage(page, offset);
+	trace_ext4_journalled_invalidatepage(page, offset, length);
 
 	/*
 	 * If it's a full truncate we just forget about the pending dirtying
 	 */
-	if (offset == 0)
+	if (offset == 0 && length == PAGE_CACHE_SIZE)
 		ClearPageChecked(page);
 
-	return jbd2_journal_invalidatepage(journal, page, offset);
+	return jbd2_journal_invalidatepage(journal, page, offset, length);
 }
 
 /* Wrapper for aops... */
 static void ext4_journalled_invalidatepage(struct page *page,
-					   unsigned long offset)
+					   unsigned int offset,
+					   unsigned int length)
 {
-	WARN_ON(__ext4_journalled_invalidatepage(page, offset) < 0);
+	WARN_ON(__ext4_journalled_invalidatepage(page, offset, length) < 0);
 }
 
 static int ext4_releasepage(struct page *page, gfp_t wait)
@@ -3067,9 +3003,13 @@
 	struct inode *inode = file_inode(iocb->ki_filp);
         ext4_io_end_t *io_end = iocb->private;
 
-	/* if not async direct IO or dio with 0 bytes write, just return */
-	if (!io_end || !size)
-		goto out;
+	/* if not async direct IO just return */
+	if (!io_end) {
+		inode_dio_done(inode);
+		if (is_async)
+			aio_complete(iocb, ret, 0);
+		return;
+	}
 
 	ext_debug("ext4_end_io_dio(): io_end 0x%p "
 		  "for inode %lu, iocb 0x%p, offset %llu, size %zd\n",
@@ -3077,25 +3017,13 @@
 		  size);
 
 	iocb->private = NULL;
-
-	/* if not aio dio with unwritten extents, just free io and return */
-	if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
-		ext4_free_io_end(io_end);
-out:
-		inode_dio_done(inode);
-		if (is_async)
-			aio_complete(iocb, ret, 0);
-		return;
-	}
-
 	io_end->offset = offset;
 	io_end->size = size;
 	if (is_async) {
 		io_end->iocb = iocb;
 		io_end->result = ret;
 	}
-
-	ext4_add_complete_io(io_end);
+	ext4_put_io_end_defer(io_end);
 }
 
 /*
@@ -3129,6 +3057,7 @@
 	get_block_t *get_block_func = NULL;
 	int dio_flags = 0;
 	loff_t final_size = offset + count;
+	ext4_io_end_t *io_end = NULL;
 
 	/* Use the old path for reads and writes beyond i_size. */
 	if (rw != WRITE || final_size > inode->i_size)
@@ -3136,11 +3065,18 @@
 
 	BUG_ON(iocb->private == NULL);
 
+	/*
+	 * Make all waiters for direct IO properly wait also for extent
+	 * conversion. This also disallows race between truncate() and
+	 * overwrite DIO as i_dio_count needs to be incremented under i_mutex.
+	 */
+	if (rw == WRITE)
+		atomic_inc(&inode->i_dio_count);
+
 	/* If we do a overwrite dio, i_mutex locking can be released */
 	overwrite = *((int *)iocb->private);
 
 	if (overwrite) {
-		atomic_inc(&inode->i_dio_count);
 		down_read(&EXT4_I(inode)->i_data_sem);
 		mutex_unlock(&inode->i_mutex);
 	}
@@ -3167,13 +3103,16 @@
 	iocb->private = NULL;
 	ext4_inode_aio_set(inode, NULL);
 	if (!is_sync_kiocb(iocb)) {
-		ext4_io_end_t *io_end = ext4_init_io_end(inode, GFP_NOFS);
+		io_end = ext4_init_io_end(inode, GFP_NOFS);
 		if (!io_end) {
 			ret = -ENOMEM;
 			goto retake_lock;
 		}
 		io_end->flag |= EXT4_IO_END_DIRECT;
-		iocb->private = io_end;
+		/*
+		 * Grab reference for DIO. Will be dropped in ext4_end_io_dio()
+		 */
+		iocb->private = ext4_get_io_end(io_end);
 		/*
 		 * we save the io structure for current async direct
 		 * IO, so that later ext4_map_blocks() could flag the
@@ -3197,33 +3136,42 @@
 				   NULL,
 				   dio_flags);
 
-	if (iocb->private)
-		ext4_inode_aio_set(inode, NULL);
 	/*
-	 * The io_end structure takes a reference to the inode, that
-	 * structure needs to be destroyed and the reference to the
-	 * inode need to be dropped, when IO is complete, even with 0
-	 * byte write, or failed.
-	 *
-	 * In the successful AIO DIO case, the io_end structure will
-	 * be destroyed and the reference to the inode will be dropped
-	 * after the end_io call back function is called.
-	 *
-	 * In the case there is 0 byte write, or error case, since VFS
-	 * direct IO won't invoke the end_io call back function, we
-	 * need to free the end_io structure here.
+	 * Put our reference to io_end. This can free the io_end structure e.g.
+	 * in sync IO case or in case of error. It can even perform extent
+	 * conversion if all bios we submitted finished before we got here.
+	 * Note that in that case iocb->private can be already set to NULL
+	 * here.
 	 */
-	if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
-		ext4_free_io_end(iocb->private);
-		iocb->private = NULL;
-	} else if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
+	if (io_end) {
+		ext4_inode_aio_set(inode, NULL);
+		ext4_put_io_end(io_end);
+		/*
+		 * When no IO was submitted ext4_end_io_dio() was not
+		 * called so we have to put iocb's reference.
+		 */
+		if (ret <= 0 && ret != -EIOCBQUEUED && iocb->private) {
+			WARN_ON(iocb->private != io_end);
+			WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
+			WARN_ON(io_end->iocb);
+			/*
+			 * Generic code already did inode_dio_done() so we
+			 * have to clear EXT4_IO_END_DIRECT to not do it for
+			 * the second time.
+			 */
+			io_end->flag = 0;
+			ext4_put_io_end(io_end);
+			iocb->private = NULL;
+		}
+	}
+	if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
 						EXT4_STATE_DIO_UNWRITTEN)) {
 		int err;
 		/*
 		 * for non AIO case, since the IO is already
 		 * completed, we could do the conversion right here
 		 */
-		err = ext4_convert_unwritten_extents(inode,
+		err = ext4_convert_unwritten_extents(NULL, inode,
 						     offset, ret);
 		if (err < 0)
 			ret = err;
@@ -3231,9 +3179,10 @@
 	}
 
 retake_lock:
+	if (rw == WRITE)
+		inode_dio_done(inode);
 	/* take i_mutex locking again if we do a ovewrite dio */
 	if (overwrite) {
-		inode_dio_done(inode);
 		up_read(&EXT4_I(inode)->i_data_sem);
 		mutex_lock(&inode->i_mutex);
 	}
@@ -3292,6 +3241,7 @@
 	.readpage		= ext4_readpage,
 	.readpages		= ext4_readpages,
 	.writepage		= ext4_writepage,
+	.writepages		= ext4_writepages,
 	.write_begin		= ext4_write_begin,
 	.write_end		= ext4_write_end,
 	.bmap			= ext4_bmap,
@@ -3307,6 +3257,7 @@
 	.readpage		= ext4_readpage,
 	.readpages		= ext4_readpages,
 	.writepage		= ext4_writepage,
+	.writepages		= ext4_writepages,
 	.write_begin		= ext4_write_begin,
 	.write_end		= ext4_journalled_write_end,
 	.set_page_dirty		= ext4_journalled_set_page_dirty,
@@ -3322,7 +3273,7 @@
 	.readpage		= ext4_readpage,
 	.readpages		= ext4_readpages,
 	.writepage		= ext4_writepage,
-	.writepages		= ext4_da_writepages,
+	.writepages		= ext4_writepages,
 	.write_begin		= ext4_da_write_begin,
 	.write_end		= ext4_da_write_end,
 	.bmap			= ext4_bmap,
@@ -3355,20 +3306,42 @@
 		inode->i_mapping->a_ops = &ext4_aops;
 }
 
+/*
+ * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
+ * up to the end of the block which corresponds to `from'.
+ * This is required during truncate. We need to physically zero the tail end
+ * of that block so it doesn't yield old data if the file is later grown.
+ */
+int ext4_block_truncate_page(handle_t *handle,
+		struct address_space *mapping, loff_t from)
+{
+	unsigned offset = from & (PAGE_CACHE_SIZE-1);
+	unsigned length;
+	unsigned blocksize;
+	struct inode *inode = mapping->host;
+
+	blocksize = inode->i_sb->s_blocksize;
+	length = blocksize - (offset & (blocksize - 1));
+
+	return ext4_block_zero_page_range(handle, mapping, from, length);
+}
 
 /*
- * ext4_discard_partial_page_buffers()
- * Wrapper function for ext4_discard_partial_page_buffers_no_lock.
- * This function finds and locks the page containing the offset
- * "from" and passes it to ext4_discard_partial_page_buffers_no_lock.
- * Calling functions that already have the page locked should call
- * ext4_discard_partial_page_buffers_no_lock directly.
+ * ext4_block_zero_page_range() zeros out a mapping of length 'length'
+ * starting from file offset 'from'.  The range to be zero'd must
+ * be contained within one block.  If the specified range exceeds
+ * the end of the block it will be shortened to the end of the block
+ * that corresponds to 'from'.
  */
-int ext4_discard_partial_page_buffers(handle_t *handle,
-		struct address_space *mapping, loff_t from,
-		loff_t length, int flags)
+int ext4_block_zero_page_range(handle_t *handle,
+		struct address_space *mapping, loff_t from, loff_t length)
 {
+	ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
+	unsigned offset = from & (PAGE_CACHE_SIZE-1);
+	unsigned blocksize, max, pos;
+	ext4_lblk_t iblock;
 	struct inode *inode = mapping->host;
+	struct buffer_head *bh;
 	struct page *page;
 	int err = 0;
 
@@ -3377,67 +3350,12 @@
 	if (!page)
 		return -ENOMEM;
 
-	err = ext4_discard_partial_page_buffers_no_lock(handle, inode, page,
-		from, length, flags);
-
-	unlock_page(page);
-	page_cache_release(page);
-	return err;
-}
-
-/*
- * ext4_discard_partial_page_buffers_no_lock()
- * Zeros a page range of length 'length' starting from offset 'from'.
- * Buffer heads that correspond to the block aligned regions of the
- * zeroed range will be unmapped.  Unblock aligned regions
- * will have the corresponding buffer head mapped if needed so that
- * that region of the page can be updated with the partial zero out.
- *
- * This function assumes that the page has already been  locked.  The
- * The range to be discarded must be contained with in the given page.
- * If the specified range exceeds the end of the page it will be shortened
- * to the end of the page that corresponds to 'from'.  This function is
- * appropriate for updating a page and it buffer heads to be unmapped and
- * zeroed for blocks that have been either released, or are going to be
- * released.
- *
- * handle: The journal handle
- * inode:  The files inode
- * page:   A locked page that contains the offset "from"
- * from:   The starting byte offset (from the beginning of the file)
- *         to begin discarding
- * len:    The length of bytes to discard
- * flags:  Optional flags that may be used:
- *
- *         EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED
- *         Only zero the regions of the page whose buffer heads
- *         have already been unmapped.  This flag is appropriate
- *         for updating the contents of a page whose blocks may
- *         have already been released, and we only want to zero
- *         out the regions that correspond to those released blocks.
- *
- * Returns zero on success or negative on failure.
- */
-static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
-		struct inode *inode, struct page *page, loff_t from,
-		loff_t length, int flags)
-{
-	ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
-	unsigned int offset = from & (PAGE_CACHE_SIZE-1);
-	unsigned int blocksize, max, pos;
-	ext4_lblk_t iblock;
-	struct buffer_head *bh;
-	int err = 0;
-
 	blocksize = inode->i_sb->s_blocksize;
-	max = PAGE_CACHE_SIZE - offset;
-
-	if (index != page->index)
-		return -EINVAL;
+	max = blocksize - (offset & (blocksize - 1));
 
 	/*
 	 * correct length if it does not fall between
-	 * 'from' and the end of the page
+	 * 'from' and the end of the block
 	 */
 	if (length > max || length < 0)
 		length = max;
@@ -3455,106 +3373,91 @@
 		iblock++;
 		pos += blocksize;
 	}
-
-	pos = offset;
-	while (pos < offset + length) {
-		unsigned int end_of_block, range_to_discard;
-
-		err = 0;
-
-		/* The length of space left to zero and unmap */
-		range_to_discard = offset + length - pos;
-
-		/* The length of space until the end of the block */
-		end_of_block = blocksize - (pos & (blocksize-1));
-
-		/*
-		 * Do not unmap or zero past end of block
-		 * for this buffer head
-		 */
-		if (range_to_discard > end_of_block)
-			range_to_discard = end_of_block;
-
-
-		/*
-		 * Skip this buffer head if we are only zeroing unampped
-		 * regions of the page
-		 */
-		if (flags & EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED &&
-			buffer_mapped(bh))
-				goto next;
-
-		/* If the range is block aligned, unmap */
-		if (range_to_discard == blocksize) {
-			clear_buffer_dirty(bh);
-			bh->b_bdev = NULL;
-			clear_buffer_mapped(bh);
-			clear_buffer_req(bh);
-			clear_buffer_new(bh);
-			clear_buffer_delay(bh);
-			clear_buffer_unwritten(bh);
-			clear_buffer_uptodate(bh);
-			zero_user(page, pos, range_to_discard);
-			BUFFER_TRACE(bh, "Buffer discarded");
-			goto next;
-		}
-
-		/*
-		 * If this block is not completely contained in the range
-		 * to be discarded, then it is not going to be released. Because
-		 * we need to keep this block, we need to make sure this part
-		 * of the page is uptodate before we modify it by writeing
-		 * partial zeros on it.
-		 */
+	if (buffer_freed(bh)) {
+		BUFFER_TRACE(bh, "freed: skip");
+		goto unlock;
+	}
+	if (!buffer_mapped(bh)) {
+		BUFFER_TRACE(bh, "unmapped");
+		ext4_get_block(inode, iblock, bh, 0);
+		/* unmapped? It's a hole - nothing to do */
 		if (!buffer_mapped(bh)) {
-			/*
-			 * Buffer head must be mapped before we can read
-			 * from the block
-			 */
-			BUFFER_TRACE(bh, "unmapped");
-			ext4_get_block(inode, iblock, bh, 0);
-			/* unmapped? It's a hole - nothing to do */
-			if (!buffer_mapped(bh)) {
-				BUFFER_TRACE(bh, "still unmapped");
-				goto next;
-			}
+			BUFFER_TRACE(bh, "still unmapped");
+			goto unlock;
 		}
-
-		/* Ok, it's mapped. Make sure it's up-to-date */
-		if (PageUptodate(page))
-			set_buffer_uptodate(bh);
-
-		if (!buffer_uptodate(bh)) {
-			err = -EIO;
-			ll_rw_block(READ, 1, &bh);
-			wait_on_buffer(bh);
-			/* Uhhuh. Read error. Complain and punt.*/
-			if (!buffer_uptodate(bh))
-				goto next;
-		}
-
-		if (ext4_should_journal_data(inode)) {
-			BUFFER_TRACE(bh, "get write access");
-			err = ext4_journal_get_write_access(handle, bh);
-			if (err)
-				goto next;
-		}
-
-		zero_user(page, pos, range_to_discard);
-
-		err = 0;
-		if (ext4_should_journal_data(inode)) {
-			err = ext4_handle_dirty_metadata(handle, inode, bh);
-		} else
-			mark_buffer_dirty(bh);
-
-		BUFFER_TRACE(bh, "Partial buffer zeroed");
-next:
-		bh = bh->b_this_page;
-		iblock++;
-		pos += range_to_discard;
 	}
 
+	/* Ok, it's mapped. Make sure it's up-to-date */
+	if (PageUptodate(page))
+		set_buffer_uptodate(bh);
+
+	if (!buffer_uptodate(bh)) {
+		err = -EIO;
+		ll_rw_block(READ, 1, &bh);
+		wait_on_buffer(bh);
+		/* Uhhuh. Read error. Complain and punt. */
+		if (!buffer_uptodate(bh))
+			goto unlock;
+	}
+	if (ext4_should_journal_data(inode)) {
+		BUFFER_TRACE(bh, "get write access");
+		err = ext4_journal_get_write_access(handle, bh);
+		if (err)
+			goto unlock;
+	}
+	zero_user(page, offset, length);
+	BUFFER_TRACE(bh, "zeroed end of block");
+
+	if (ext4_should_journal_data(inode)) {
+		err = ext4_handle_dirty_metadata(handle, inode, bh);
+	} else {
+		err = 0;
+		mark_buffer_dirty(bh);
+		if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE))
+			err = ext4_jbd2_file_inode(handle, inode);
+	}
+
+unlock:
+	unlock_page(page);
+	page_cache_release(page);
+	return err;
+}
+
+int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
+			     loff_t lstart, loff_t length)
+{
+	struct super_block *sb = inode->i_sb;
+	struct address_space *mapping = inode->i_mapping;
+	unsigned partial_start, partial_end;
+	ext4_fsblk_t start, end;
+	loff_t byte_end = (lstart + length - 1);
+	int err = 0;
+
+	partial_start = lstart & (sb->s_blocksize - 1);
+	partial_end = byte_end & (sb->s_blocksize - 1);
+
+	start = lstart >> sb->s_blocksize_bits;
+	end = byte_end >> sb->s_blocksize_bits;
+
+	/* Handle partial zero within the single block */
+	if (start == end &&
+	    (partial_start || (partial_end != sb->s_blocksize - 1))) {
+		err = ext4_block_zero_page_range(handle, mapping,
+						 lstart, length);
+		return err;
+	}
+	/* Handle partial zero out on the start of the range */
+	if (partial_start) {
+		err = ext4_block_zero_page_range(handle, mapping,
+						 lstart, sb->s_blocksize);
+		if (err)
+			return err;
+	}
+	/* Handle partial zero out on the end of the range */
+	if (partial_end != sb->s_blocksize - 1)
+		err = ext4_block_zero_page_range(handle, mapping,
+						 byte_end - partial_end,
+						 partial_end + 1);
 	return err;
 }
 
@@ -3580,14 +3483,12 @@
  * Returns: 0 on success or negative on failure
  */
 
-int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
+int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
 {
-	struct inode *inode = file_inode(file);
 	struct super_block *sb = inode->i_sb;
 	ext4_lblk_t first_block, stop_block;
 	struct address_space *mapping = inode->i_mapping;
-	loff_t first_page, last_page, page_len;
-	loff_t first_page_offset, last_page_offset;
+	loff_t first_block_offset, last_block_offset;
 	handle_t *handle;
 	unsigned int credits;
 	int ret = 0;
@@ -3638,23 +3539,16 @@
 		   offset;
 	}
 
-	first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-	last_page = (offset + length) >> PAGE_CACHE_SHIFT;
+	first_block_offset = round_up(offset, sb->s_blocksize);
+	last_block_offset = round_down((offset + length), sb->s_blocksize) - 1;
 
-	first_page_offset = first_page << PAGE_CACHE_SHIFT;
-	last_page_offset = last_page << PAGE_CACHE_SHIFT;
-
-	/* Now release the pages */
-	if (last_page_offset > first_page_offset) {
-		truncate_pagecache_range(inode, first_page_offset,
-					 last_page_offset - 1);
-	}
+	/* Now release the pages and zero block aligned part of pages*/
+	if (last_block_offset > first_block_offset)
+		truncate_pagecache_range(inode, first_block_offset,
+					 last_block_offset);
 
 	/* Wait all existing dio workers, newcomers will block on i_mutex */
 	ext4_inode_block_unlocked_dio(inode);
-	ret = ext4_flush_unwritten_io(inode);
-	if (ret)
-		goto out_dio;
 	inode_dio_wait(inode);
 
 	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
@@ -3668,66 +3562,10 @@
 		goto out_dio;
 	}
 
-	/*
-	 * Now we need to zero out the non-page-aligned data in the
-	 * pages at the start and tail of the hole, and unmap the
-	 * buffer heads for the block aligned regions of the page that
-	 * were completely zeroed.
-	 */
-	if (first_page > last_page) {
-		/*
-		 * If the file space being truncated is contained
-		 * within a page just zero out and unmap the middle of
-		 * that page
-		 */
-		ret = ext4_discard_partial_page_buffers(handle,
-			mapping, offset, length, 0);
-
-		if (ret)
-			goto out_stop;
-	} else {
-		/*
-		 * zero out and unmap the partial page that contains
-		 * the start of the hole
-		 */
-		page_len = first_page_offset - offset;
-		if (page_len > 0) {
-			ret = ext4_discard_partial_page_buffers(handle, mapping,
-						offset, page_len, 0);
-			if (ret)
-				goto out_stop;
-		}
-
-		/*
-		 * zero out and unmap the partial page that contains
-		 * the end of the hole
-		 */
-		page_len = offset + length - last_page_offset;
-		if (page_len > 0) {
-			ret = ext4_discard_partial_page_buffers(handle, mapping,
-					last_page_offset, page_len, 0);
-			if (ret)
-				goto out_stop;
-		}
-	}
-
-	/*
-	 * If i_size is contained in the last page, we need to
-	 * unmap and zero the partial page after i_size
-	 */
-	if (inode->i_size >> PAGE_CACHE_SHIFT == last_page &&
-	   inode->i_size % PAGE_CACHE_SIZE != 0) {
-		page_len = PAGE_CACHE_SIZE -
-			(inode->i_size & (PAGE_CACHE_SIZE - 1));
-
-		if (page_len > 0) {
-			ret = ext4_discard_partial_page_buffers(handle,
-					mapping, inode->i_size, page_len, 0);
-
-			if (ret)
-				goto out_stop;
-		}
-	}
+	ret = ext4_zero_partial_blocks(handle, inode, offset,
+				       length);
+	if (ret)
+		goto out_stop;
 
 	first_block = (offset + sb->s_blocksize - 1) >>
 		EXT4_BLOCK_SIZE_BITS(sb);
@@ -3803,7 +3641,6 @@
 	unsigned int credits;
 	handle_t *handle;
 	struct address_space *mapping = inode->i_mapping;
-	loff_t page_len;
 
 	/*
 	 * There is a possibility that we're either freeing the inode
@@ -3830,12 +3667,6 @@
 			return;
 	}
 
-	/*
-	 * finish any pending end_io work so we won't run the risk of
-	 * converting any truncated blocks to initialized later
-	 */
-	ext4_flush_unwritten_io(inode);
-
 	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
 		credits = ext4_writepage_trans_blocks(inode);
 	else
@@ -3847,14 +3678,8 @@
 		return;
 	}
 
-	if (inode->i_size % PAGE_CACHE_SIZE != 0) {
-		page_len = PAGE_CACHE_SIZE -
-			(inode->i_size & (PAGE_CACHE_SIZE - 1));
-
-		if (ext4_discard_partial_page_buffers(handle,
-				mapping, inode->i_size, page_len, 0))
-			goto out_stop;
-	}
+	if (inode->i_size & (inode->i_sb->s_blocksize - 1))
+		ext4_block_truncate_page(handle, mapping, inode->i_size);
 
 	/*
 	 * We add the inode to the orphan list, so that if this
@@ -4623,7 +4448,8 @@
 				      inode->i_size >> PAGE_CACHE_SHIFT);
 		if (!page)
 			return;
-		ret = __ext4_journalled_invalidatepage(page, offset);
+		ret = __ext4_journalled_invalidatepage(page, offset,
+						PAGE_CACHE_SIZE - offset);
 		unlock_page(page);
 		page_cache_release(page);
 		if (ret != -EBUSY)
@@ -4805,7 +4631,7 @@
 		 struct kstat *stat)
 {
 	struct inode *inode;
-	unsigned long delalloc_blocks;
+	unsigned long long delalloc_blocks;
 
 	inode = dentry->d_inode;
 	generic_fillattr(inode, stat);
@@ -4823,15 +4649,16 @@
 	delalloc_blocks = EXT4_C2B(EXT4_SB(inode->i_sb),
 				EXT4_I(inode)->i_reserved_data_blocks);
 
-	stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9;
+	stat->blocks += delalloc_blocks << (inode->i_sb->s_blocksize_bits-9);
 	return 0;
 }
 
-static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
+static int ext4_index_trans_blocks(struct inode *inode, int lblocks,
+				   int pextents)
 {
 	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
-		return ext4_ind_trans_blocks(inode, nrblocks, chunk);
-	return ext4_ext_index_trans_blocks(inode, nrblocks, chunk);
+		return ext4_ind_trans_blocks(inode, lblocks);
+	return ext4_ext_index_trans_blocks(inode, pextents);
 }
 
 /*
@@ -4845,7 +4672,8 @@
  *
  * Also account for superblock, inode, quota and xattr blocks
  */
-static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
+static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
+				  int pextents)
 {
 	ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
 	int gdpblocks;
@@ -4853,14 +4681,10 @@
 	int ret = 0;
 
 	/*
-	 * How many index blocks need to touch to modify nrblocks?
-	 * The "Chunk" flag indicating whether the nrblocks is
-	 * physically contiguous on disk
-	 *
-	 * For Direct IO and fallocate, they calls get_block to allocate
-	 * one single extent at a time, so they could set the "Chunk" flag
+	 * How many index blocks do we need to touch to map @lblocks logical
+	 * blocks to @pextents physical extents?
 	 */
-	idxblocks = ext4_index_trans_blocks(inode, nrblocks, chunk);
+	idxblocks = ext4_index_trans_blocks(inode, lblocks, pextents);
 
 	ret = idxblocks;
 
@@ -4868,12 +4692,7 @@
 	 * Now let's see how many group bitmaps and group descriptors need
 	 * to account
 	 */
-	groups = idxblocks;
-	if (chunk)
-		groups += 1;
-	else
-		groups += nrblocks;
-
+	groups = idxblocks + pextents;
 	gdpblocks = groups;
 	if (groups > ngroups)
 		groups = ngroups;
@@ -4904,7 +4723,7 @@
 	int bpp = ext4_journal_blocks_per_page(inode);
 	int ret;
 
-	ret = ext4_meta_trans_blocks(inode, bpp, 0);
+	ret = ext4_meta_trans_blocks(inode, bpp, bpp);
 
 	/* Account for data blocks for journalled mode */
 	if (ext4_should_journal_data(inode))

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index def8408..a9ff5e5 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c

@@ -2105,6 +2105,7 @@
 		group = ac->ac_g_ex.fe_group;
 
 		for (i = 0; i < ngroups; group++, i++) {
+			cond_resched();
 			/*
 			 * Artificially restricted ngroups for non-extent
 			 * files makes group > ngroups possible on first loop.
@@ -4405,17 +4406,20 @@
 repeat:
 		/* allocate space in core */
 		*errp = ext4_mb_regular_allocator(ac);
+		if (*errp)
+			goto discard_and_exit;
+
+		/* as we've just preallocated more space than
+		 * user requested originally, we store allocated
+		 * space in a special descriptor */
+		if (ac->ac_status == AC_STATUS_FOUND &&
+		    ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
+			*errp = ext4_mb_new_preallocation(ac);
 		if (*errp) {
+		discard_and_exit:
 			ext4_discard_allocated_blocks(ac);
 			goto errout;
 		}
-
-		/* as we've just preallocated more space than
-		 * user requested orinally, we store allocated
-		 * space in a special descriptor */
-		if (ac->ac_status == AC_STATUS_FOUND &&
-				ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
-			ext4_mb_new_preallocation(ac);
 	}
 	if (likely(ac->ac_status == AC_STATUS_FOUND)) {
 		*errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs);
@@ -4612,10 +4616,11 @@
 		BUG_ON(bh && (count > 1));
 
 		for (i = 0; i < count; i++) {
+			cond_resched();
 			if (!bh)
 				tbh = sb_find_get_block(inode->i_sb,
 							block + i);
-			if (unlikely(!tbh))
+			if (!tbh)
 				continue;
 			ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
 				    inode, tbh, block + i);

diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 3dcbf36..e86dddbd 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c

@@ -912,7 +912,6 @@
 	struct page *pagep[2] = {NULL, NULL};
 	handle_t *handle;
 	ext4_lblk_t orig_blk_offset;
-	long long offs = orig_page_offset << PAGE_CACHE_SHIFT;
 	unsigned long blocksize = orig_inode->i_sb->s_blocksize;
 	unsigned int w_flags = 0;
 	unsigned int tmp_data_size, data_size, replaced_size;
@@ -940,8 +939,6 @@
 	orig_blk_offset = orig_page_offset * blocks_per_page +
 		data_offset_in_page;
 
-	offs = (long long)orig_blk_offset << orig_inode->i_blkbits;
-
 	/* Calculate data_size */
 	if ((orig_blk_offset + block_len_in_page - 1) ==
 	    ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) {

diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 6653fc3..ab2f6dc 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c

@@ -918,11 +918,8 @@
 				bh->b_data, bh->b_size,
 				(block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb))
 					 + ((char *)de - bh->b_data))) {
-			/* On error, skip the f_pos to the next block. */
-			dir_file->f_pos = (dir_file->f_pos |
-					(dir->i_sb->s_blocksize - 1)) + 1;
-			brelse(bh);
-			return count;
+			/* silently ignore the rest of the block */
+			break;
 		}
 		ext4fs_dirhash(de->name, de->name_len, hinfo);
 		if ((hinfo->hash < start_hash) ||

diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 4acf1f7..48786cd 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c

@@ -46,165 +46,6 @@
 }
 
 /*
- * This function is called by ext4_evict_inode() to make sure there is
- * no more pending I/O completion work left to do.
- */
-void ext4_ioend_shutdown(struct inode *inode)
-{
-	wait_queue_head_t *wq = ext4_ioend_wq(inode);
-
-	wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0));
-	/*
-	 * We need to make sure the work structure is finished being
-	 * used before we let the inode get destroyed.
-	 */
-	if (work_pending(&EXT4_I(inode)->i_unwritten_work))
-		cancel_work_sync(&EXT4_I(inode)->i_unwritten_work);
-}
-
-void ext4_free_io_end(ext4_io_end_t *io)
-{
-	BUG_ON(!io);
-	BUG_ON(!list_empty(&io->list));
-	BUG_ON(io->flag & EXT4_IO_END_UNWRITTEN);
-
-	if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count))
-		wake_up_all(ext4_ioend_wq(io->inode));
-	kmem_cache_free(io_end_cachep, io);
-}
-
-/* check a range of space and convert unwritten extents to written. */
-static int ext4_end_io(ext4_io_end_t *io)
-{
-	struct inode *inode = io->inode;
-	loff_t offset = io->offset;
-	ssize_t size = io->size;
-	int ret = 0;
-
-	ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
-		   "list->prev 0x%p\n",
-		   io, inode->i_ino, io->list.next, io->list.prev);
-
-	ret = ext4_convert_unwritten_extents(inode, offset, size);
-	if (ret < 0) {
-		ext4_msg(inode->i_sb, KERN_EMERG,
-			 "failed to convert unwritten extents to written "
-			 "extents -- potential data loss!  "
-			 "(inode %lu, offset %llu, size %zd, error %d)",
-			 inode->i_ino, offset, size, ret);
-	}
-	/* Wake up anyone waiting on unwritten extent conversion */
-	if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
-		wake_up_all(ext4_ioend_wq(inode));
-	if (io->flag & EXT4_IO_END_DIRECT)
-		inode_dio_done(inode);
-	if (io->iocb)
-		aio_complete(io->iocb, io->result, 0);
-	return ret;
-}
-
-static void dump_completed_IO(struct inode *inode)
-{
-#ifdef	EXT4FS_DEBUG
-	struct list_head *cur, *before, *after;
-	ext4_io_end_t *io, *io0, *io1;
-
-	if (list_empty(&EXT4_I(inode)->i_completed_io_list)) {
-		ext4_debug("inode %lu completed_io list is empty\n",
-			   inode->i_ino);
-		return;
-	}
-
-	ext4_debug("Dump inode %lu completed_io list\n", inode->i_ino);
-	list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list) {
-		cur = &io->list;
-		before = cur->prev;
-		io0 = container_of(before, ext4_io_end_t, list);
-		after = cur->next;
-		io1 = container_of(after, ext4_io_end_t, list);
-
-		ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
-			    io, inode->i_ino, io0, io1);
-	}
-#endif
-}
-
-/* Add the io_end to per-inode completed end_io list. */
-void ext4_add_complete_io(ext4_io_end_t *io_end)
-{
-	struct ext4_inode_info *ei = EXT4_I(io_end->inode);
-	struct workqueue_struct *wq;
-	unsigned long flags;
-
-	BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
-	wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
-
-	spin_lock_irqsave(&ei->i_completed_io_lock, flags);
-	if (list_empty(&ei->i_completed_io_list))
-		queue_work(wq, &ei->i_unwritten_work);
-	list_add_tail(&io_end->list, &ei->i_completed_io_list);
-	spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
-}
-
-static int ext4_do_flush_completed_IO(struct inode *inode)
-{
-	ext4_io_end_t *io;
-	struct list_head unwritten;
-	unsigned long flags;
-	struct ext4_inode_info *ei = EXT4_I(inode);
-	int err, ret = 0;
-
-	spin_lock_irqsave(&ei->i_completed_io_lock, flags);
-	dump_completed_IO(inode);
-	list_replace_init(&ei->i_completed_io_list, &unwritten);
-	spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
-
-	while (!list_empty(&unwritten)) {
-		io = list_entry(unwritten.next, ext4_io_end_t, list);
-		BUG_ON(!(io->flag & EXT4_IO_END_UNWRITTEN));
-		list_del_init(&io->list);
-
-		err = ext4_end_io(io);
-		if (unlikely(!ret && err))
-			ret = err;
-		io->flag &= ~EXT4_IO_END_UNWRITTEN;
-		ext4_free_io_end(io);
-	}
-	return ret;
-}
-
-/*
- * work on completed aio dio IO, to convert unwritten extents to extents
- */
-void ext4_end_io_work(struct work_struct *work)
-{
-	struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info,
-						  i_unwritten_work);
-	ext4_do_flush_completed_IO(&ei->vfs_inode);
-}
-
-int ext4_flush_unwritten_io(struct inode *inode)
-{
-	int ret;
-	WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex) &&
-		     !(inode->i_state & I_FREEING));
-	ret = ext4_do_flush_completed_IO(inode);
-	ext4_unwritten_wait(inode);
-	return ret;
-}
-
-ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
-{
-	ext4_io_end_t *io = kmem_cache_zalloc(io_end_cachep, flags);
-	if (io) {
-		atomic_inc(&EXT4_I(inode)->i_ioend_count);
-		io->inode = inode;
-		INIT_LIST_HEAD(&io->list);
-	}
-	return io;
-}
-
-/*
  * Print an buffer I/O error compatible with the fs/buffer.c.  This
  * provides compatibility with dmesg scrapers that look for a specific
  * buffer I/O error message.  We really need a unified error reporting
@@ -219,21 +60,11 @@
 			(unsigned long long)bh->b_blocknr);
 }
 
-static void ext4_end_bio(struct bio *bio, int error)
+static void ext4_finish_bio(struct bio *bio)
 {
-	ext4_io_end_t *io_end = bio->bi_private;
-	struct inode *inode;
 	int i;
-	int blocksize;
-	sector_t bi_sector = bio->bi_sector;
+	int error = !test_bit(BIO_UPTODATE, &bio->bi_flags);
 
-	BUG_ON(!io_end);
-	inode = io_end->inode;
-	blocksize = 1 << inode->i_blkbits;
-	bio->bi_private = NULL;
-	bio->bi_end_io = NULL;
-	if (test_bit(BIO_UPTODATE, &bio->bi_flags))
-		error = 0;
 	for (i = 0; i < bio->bi_vcnt; i++) {
 		struct bio_vec *bvec = &bio->bi_io_vec[i];
 		struct page *page = bvec->bv_page;
@@ -259,7 +90,7 @@
 		bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
 		do {
 			if (bh_offset(bh) < bio_start ||
-			    bh_offset(bh) + blocksize > bio_end) {
+			    bh_offset(bh) + bh->b_size > bio_end) {
 				if (buffer_async_write(bh))
 					under_io++;
 				continue;
@@ -273,10 +104,235 @@
 		if (!under_io)
 			end_page_writeback(page);
 	}
-	bio_put(bio);
+}
+
+static void ext4_release_io_end(ext4_io_end_t *io_end)
+{
+	struct bio *bio, *next_bio;
+
+	BUG_ON(!list_empty(&io_end->list));
+	BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
+	WARN_ON(io_end->handle);
+
+	if (atomic_dec_and_test(&EXT4_I(io_end->inode)->i_ioend_count))
+		wake_up_all(ext4_ioend_wq(io_end->inode));
+
+	for (bio = io_end->bio; bio; bio = next_bio) {
+		next_bio = bio->bi_private;
+		ext4_finish_bio(bio);
+		bio_put(bio);
+	}
+	if (io_end->flag & EXT4_IO_END_DIRECT)
+		inode_dio_done(io_end->inode);
+	if (io_end->iocb)
+		aio_complete(io_end->iocb, io_end->result, 0);
+	kmem_cache_free(io_end_cachep, io_end);
+}
+
+static void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
+{
+	struct inode *inode = io_end->inode;
+
+	io_end->flag &= ~EXT4_IO_END_UNWRITTEN;
+	/* Wake up anyone waiting on unwritten extent conversion */
+	if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
+		wake_up_all(ext4_ioend_wq(inode));
+}
+
+/*
+ * Check a range of space and convert unwritten extents to written. Note that
+ * we are protected from truncate touching same part of extent tree by the
+ * fact that truncate code waits for all DIO to finish (thus exclusion from
+ * direct IO is achieved) and also waits for PageWriteback bits. Thus we
+ * cannot get to ext4_ext_truncate() before all IOs overlapping that range are
+ * completed (happens from ext4_release_io_end()).
+ */
+static int ext4_end_io(ext4_io_end_t *io)
+{
+	struct inode *inode = io->inode;
+	loff_t offset = io->offset;
+	ssize_t size = io->size;
+	handle_t *handle = io->handle;
+	int ret = 0;
+
+	ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
+		   "list->prev 0x%p\n",
+		   io, inode->i_ino, io->list.next, io->list.prev);
+
+	io->handle = NULL;	/* Following call will use up the handle */
+	ret = ext4_convert_unwritten_extents(handle, inode, offset, size);
+	if (ret < 0) {
+		ext4_msg(inode->i_sb, KERN_EMERG,
+			 "failed to convert unwritten extents to written "
+			 "extents -- potential data loss!  "
+			 "(inode %lu, offset %llu, size %zd, error %d)",
+			 inode->i_ino, offset, size, ret);
+	}
+	ext4_clear_io_unwritten_flag(io);
+	ext4_release_io_end(io);
+	return ret;
+}
+
+static void dump_completed_IO(struct inode *inode, struct list_head *head)
+{
+#ifdef	EXT4FS_DEBUG
+	struct list_head *cur, *before, *after;
+	ext4_io_end_t *io, *io0, *io1;
+
+	if (list_empty(head))
+		return;
+
+	ext4_debug("Dump inode %lu completed io list\n", inode->i_ino);
+	list_for_each_entry(io, head, list) {
+		cur = &io->list;
+		before = cur->prev;
+		io0 = container_of(before, ext4_io_end_t, list);
+		after = cur->next;
+		io1 = container_of(after, ext4_io_end_t, list);
+
+		ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
+			    io, inode->i_ino, io0, io1);
+	}
+#endif
+}
+
+/* Add the io_end to per-inode completed end_io list. */
+static void ext4_add_complete_io(ext4_io_end_t *io_end)
+{
+	struct ext4_inode_info *ei = EXT4_I(io_end->inode);
+	struct workqueue_struct *wq;
+	unsigned long flags;
+
+	BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
+	spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+	if (io_end->handle) {
+		wq = EXT4_SB(io_end->inode->i_sb)->rsv_conversion_wq;
+		if (list_empty(&ei->i_rsv_conversion_list))
+			queue_work(wq, &ei->i_rsv_conversion_work);
+		list_add_tail(&io_end->list, &ei->i_rsv_conversion_list);
+	} else {
+		wq = EXT4_SB(io_end->inode->i_sb)->unrsv_conversion_wq;
+		if (list_empty(&ei->i_unrsv_conversion_list))
+			queue_work(wq, &ei->i_unrsv_conversion_work);
+		list_add_tail(&io_end->list, &ei->i_unrsv_conversion_list);
+	}
+	spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+}
+
+static int ext4_do_flush_completed_IO(struct inode *inode,
+				      struct list_head *head)
+{
+	ext4_io_end_t *io;
+	struct list_head unwritten;
+	unsigned long flags;
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	int err, ret = 0;
+
+	spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+	dump_completed_IO(inode, head);
+	list_replace_init(head, &unwritten);
+	spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+
+	while (!list_empty(&unwritten)) {
+		io = list_entry(unwritten.next, ext4_io_end_t, list);
+		BUG_ON(!(io->flag & EXT4_IO_END_UNWRITTEN));
+		list_del_init(&io->list);
+
+		err = ext4_end_io(io);
+		if (unlikely(!ret && err))
+			ret = err;
+	}
+	return ret;
+}
+
+/*
+ * work on completed IO, to convert unwritten extents to written extents
+ */
+void ext4_end_io_rsv_work(struct work_struct *work)
+{
+	struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info,
+						  i_rsv_conversion_work);
+	ext4_do_flush_completed_IO(&ei->vfs_inode, &ei->i_rsv_conversion_list);
+}
+
+void ext4_end_io_unrsv_work(struct work_struct *work)
+{
+	struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info,
+						  i_unrsv_conversion_work);
+	ext4_do_flush_completed_IO(&ei->vfs_inode, &ei->i_unrsv_conversion_list);
+}
+
+ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
+{
+	ext4_io_end_t *io = kmem_cache_zalloc(io_end_cachep, flags);
+	if (io) {
+		atomic_inc(&EXT4_I(inode)->i_ioend_count);
+		io->inode = inode;
+		INIT_LIST_HEAD(&io->list);
+		atomic_set(&io->count, 1);
+	}
+	return io;
+}
+
+void ext4_put_io_end_defer(ext4_io_end_t *io_end)
+{
+	if (atomic_dec_and_test(&io_end->count)) {
+		if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) || !io_end->size) {
+			ext4_release_io_end(io_end);
+			return;
+		}
+		ext4_add_complete_io(io_end);
+	}
+}
+
+int ext4_put_io_end(ext4_io_end_t *io_end)
+{
+	int err = 0;
+
+	if (atomic_dec_and_test(&io_end->count)) {
+		if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
+			err = ext4_convert_unwritten_extents(io_end->handle,
+						io_end->inode, io_end->offset,
+						io_end->size);
+			io_end->handle = NULL;
+			ext4_clear_io_unwritten_flag(io_end);
+		}
+		ext4_release_io_end(io_end);
+	}
+	return err;
+}
+
+ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end)
+{
+	atomic_inc(&io_end->count);
+	return io_end;
+}
+
+static void ext4_end_bio(struct bio *bio, int error)
+{
+	ext4_io_end_t *io_end = bio->bi_private;
+	sector_t bi_sector = bio->bi_sector;
+
+	BUG_ON(!io_end);
+	bio->bi_end_io = NULL;
+	if (test_bit(BIO_UPTODATE, &bio->bi_flags))
+		error = 0;
+
+	if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
+		/*
+		 * Link bio into list hanging from io_end. We have to do it
+		 * atomically as bio completions can be racing against each
+		 * other.
+		 */
+		bio->bi_private = xchg(&io_end->bio, bio);
+	} else {
+		ext4_finish_bio(bio);
+		bio_put(bio);
+	}
 
 	if (error) {
-		io_end->flag |= EXT4_IO_END_ERROR;
+		struct inode *inode = io_end->inode;
+
 		ext4_warning(inode->i_sb, "I/O error writing to inode %lu "
 			     "(offset %llu size %ld starting block %llu)",
 			     inode->i_ino,
@@ -285,13 +341,7 @@
 			     (unsigned long long)
 			     bi_sector >> (inode->i_blkbits - 9));
 	}
-
-	if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
-		ext4_free_io_end(io_end);
-		return;
-	}
-
-	ext4_add_complete_io(io_end);
+	ext4_put_io_end_defer(io_end);
 }
 
 void ext4_io_submit(struct ext4_io_submit *io)
@@ -305,43 +355,38 @@
 		bio_put(io->io_bio);
 	}
 	io->io_bio = NULL;
-	io->io_op = 0;
+}
+
+void ext4_io_submit_init(struct ext4_io_submit *io,
+			 struct writeback_control *wbc)
+{
+	io->io_op = (wbc->sync_mode == WB_SYNC_ALL ?  WRITE_SYNC : WRITE);
+	io->io_bio = NULL;
 	io->io_end = NULL;
 }
 
-static int io_submit_init(struct ext4_io_submit *io,
-			  struct inode *inode,
-			  struct writeback_control *wbc,
-			  struct buffer_head *bh)
+static int io_submit_init_bio(struct ext4_io_submit *io,
+			      struct buffer_head *bh)
 {
-	ext4_io_end_t *io_end;
-	struct page *page = bh->b_page;
 	int nvecs = bio_get_nr_vecs(bh->b_bdev);
 	struct bio *bio;
 
-	io_end = ext4_init_io_end(inode, GFP_NOFS);
-	if (!io_end)
-		return -ENOMEM;
 	bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES));
+	if (!bio)
+		return -ENOMEM;
 	bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
 	bio->bi_bdev = bh->b_bdev;
-	bio->bi_private = io->io_end = io_end;
 	bio->bi_end_io = ext4_end_bio;
-
-	io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh);
-
+	bio->bi_private = ext4_get_io_end(io->io_end);
 	io->io_bio = bio;
-	io->io_op = (wbc->sync_mode == WB_SYNC_ALL ?  WRITE_SYNC : WRITE);
 	io->io_next_block = bh->b_blocknr;
 	return 0;
 }
 
 static int io_submit_add_bh(struct ext4_io_submit *io,
 			    struct inode *inode,
-			    struct writeback_control *wbc,
 			    struct buffer_head *bh)
 {
-	ext4_io_end_t *io_end;
 	int ret;
 
 	if (io->io_bio && bh->b_blocknr != io->io_next_block) {
@@ -349,18 +394,14 @@
 		ext4_io_submit(io);
 	}
 	if (io->io_bio == NULL) {
-		ret = io_submit_init(io, inode, wbc, bh);
+		ret = io_submit_init_bio(io, bh);
 		if (ret)
 			return ret;
 	}
-	io_end = io->io_end;
-	if (test_clear_buffer_uninit(bh))
-		ext4_set_io_unwritten_flag(inode, io_end);
-	io->io_end->size += bh->b_size;
-	io->io_next_block++;
 	ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh));
 	if (ret != bh->b_size)
 		goto submit_and_retry;
+	io->io_next_block++;
 	return 0;
 }
 
@@ -432,7 +473,7 @@
 	do {
 		if (!buffer_async_write(bh))
 			continue;
-		ret = io_submit_add_bh(io, inode, wbc, bh);
+		ret = io_submit_add_bh(io, inode, bh);
 		if (ret) {
 			/*
 			 * We only get here on ENOMEM.  Not much else

diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index b27c96d..c5adbb31 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c

@@ -79,12 +79,20 @@
 	ext4_fsblk_t end = start + input->blocks_count;
 	ext4_group_t group = input->group;
 	ext4_fsblk_t itend = input->inode_table + sbi->s_itb_per_group;
-	unsigned overhead = ext4_group_overhead_blocks(sb, group);
-	ext4_fsblk_t metaend = start + overhead;
+	unsigned overhead;
+	ext4_fsblk_t metaend;
 	struct buffer_head *bh = NULL;
 	ext4_grpblk_t free_blocks_count, offset;
 	int err = -EINVAL;
 
+	if (group != sbi->s_groups_count) {
+		ext4_warning(sb, "Cannot add at group %u (only %u groups)",
+			     input->group, sbi->s_groups_count);
+		return -EINVAL;
+	}
+
+	overhead = ext4_group_overhead_blocks(sb, group);
+	metaend = start + overhead;
 	input->free_blocks_count = free_blocks_count =
 		input->blocks_count - 2 - overhead - sbi->s_itb_per_group;
 
@@ -96,10 +104,7 @@
 		       free_blocks_count, input->reserved_blocks);
 
 	ext4_get_group_no_and_offset(sb, start, NULL, &offset);
-	if (group != sbi->s_groups_count)
-		ext4_warning(sb, "Cannot add at group %u (only %u groups)",
-			     input->group, sbi->s_groups_count);
-	else if (offset != 0)
+	if (offset != 0)
 			ext4_warning(sb, "Last group not full");
 	else if (input->reserved_blocks > input->blocks_count / 5)
 		ext4_warning(sb, "Reserved blocks too high (%u)",
@@ -1551,11 +1556,10 @@
 	int reserved_gdb = ext4_bg_has_super(sb, input->group) ?
 		le16_to_cpu(es->s_reserved_gdt_blocks) : 0;
 	struct inode *inode = NULL;
-	int gdb_off, gdb_num;
+	int gdb_off;
 	int err;
 	__u16 bg_flags = 0;
 
-	gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb);
 	gdb_off = input->group % EXT4_DESC_PER_BLOCK(sb);
 
 	if (gdb_off == 0 && !EXT4_HAS_RO_COMPAT_FEATURE(sb,
@@ -1656,12 +1660,10 @@
 		err = err2;
 
 	if (!err) {
-		ext4_fsblk_t first_block;
-		first_block = ext4_group_first_block_no(sb, 0);
 		if (test_opt(sb, DEBUG))
 			printk(KERN_DEBUG "EXT4-fs: extended group to %llu "
 			       "blocks\n", ext4_blocks_count(es));
-		update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr - first_block,
+		update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr,
 			       (char *)es, sizeof(struct ext4_super_block), 0);
 	}
 	return err;

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 94cc84d..85b3dd6 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c

@@ -69,6 +69,7 @@
 static void ext4_clear_journal_err(struct super_block *sb,
 				   struct ext4_super_block *es);
 static int ext4_sync_fs(struct super_block *sb, int wait);
+static int ext4_sync_fs_nojournal(struct super_block *sb, int wait);
 static int ext4_remount(struct super_block *sb, int *flags, char *data);
 static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
 static int ext4_unfreeze(struct super_block *sb);
@@ -398,6 +399,11 @@
 	}
 	if (test_opt(sb, ERRORS_RO)) {
 		ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
+		/*
+		 * Make sure updated value of ->s_mount_flags will be visible
+		 * before ->s_flags update
+		 */
+		smp_wmb();
 		sb->s_flags |= MS_RDONLY;
 	}
 	if (test_opt(sb, ERRORS_PANIC))
@@ -422,9 +428,9 @@
 	ext4_handle_error(sb);
 }
 
-void ext4_error_inode(struct inode *inode, const char *function,
-		      unsigned int line, ext4_fsblk_t block,
-		      const char *fmt, ...)
+void __ext4_error_inode(struct inode *inode, const char *function,
+			unsigned int line, ext4_fsblk_t block,
+			const char *fmt, ...)
 {
 	va_list args;
 	struct va_format vaf;
@@ -451,9 +457,9 @@
 	ext4_handle_error(inode->i_sb);
 }
 
-void ext4_error_file(struct file *file, const char *function,
-		     unsigned int line, ext4_fsblk_t block,
-		     const char *fmt, ...)
+void __ext4_error_file(struct file *file, const char *function,
+		       unsigned int line, ext4_fsblk_t block,
+		       const char *fmt, ...)
 {
 	va_list args;
 	struct va_format vaf;
@@ -570,8 +576,13 @@
 
 	if ((sb->s_flags & MS_RDONLY) == 0) {
 		ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
-		sb->s_flags |= MS_RDONLY;
 		EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED;
+		/*
+		 * Make sure updated value of ->s_mount_flags will be visible
+		 * before ->s_flags update
+		 */
+		smp_wmb();
+		sb->s_flags |= MS_RDONLY;
 		if (EXT4_SB(sb)->s_journal)
 			jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
 		save_error_info(sb, function, line);
@@ -580,7 +591,8 @@
 		panic("EXT4-fs panic from previous error\n");
 }
 
-void ext4_msg(struct super_block *sb, const char *prefix, const char *fmt, ...)
+void __ext4_msg(struct super_block *sb,
+		const char *prefix, const char *fmt, ...)
 {
 	struct va_format vaf;
 	va_list args;
@@ -750,8 +762,10 @@
 	ext4_unregister_li_request(sb);
 	dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
 
-	flush_workqueue(sbi->dio_unwritten_wq);
-	destroy_workqueue(sbi->dio_unwritten_wq);
+	flush_workqueue(sbi->unrsv_conversion_wq);
+	flush_workqueue(sbi->rsv_conversion_wq);
+	destroy_workqueue(sbi->unrsv_conversion_wq);
+	destroy_workqueue(sbi->rsv_conversion_wq);
 
 	if (sbi->s_journal) {
 		err = jbd2_journal_destroy(sbi->s_journal);
@@ -760,7 +774,7 @@
 			ext4_abort(sb, "Couldn't clean up the journal");
 	}
 
-	ext4_es_unregister_shrinker(sb);
+	ext4_es_unregister_shrinker(sbi);
 	del_timer(&sbi->s_err_report);
 	ext4_release_system_zone(sb);
 	ext4_mb_release(sb);
@@ -849,6 +863,7 @@
 	rwlock_init(&ei->i_es_lock);
 	INIT_LIST_HEAD(&ei->i_es_lru);
 	ei->i_es_lru_nr = 0;
+	ei->i_touch_when = 0;
 	ei->i_reserved_data_blocks = 0;
 	ei->i_reserved_meta_blocks = 0;
 	ei->i_allocated_meta_blocks = 0;
@@ -859,13 +874,15 @@
 	ei->i_reserved_quota = 0;
 #endif
 	ei->jinode = NULL;
-	INIT_LIST_HEAD(&ei->i_completed_io_list);
+	INIT_LIST_HEAD(&ei->i_rsv_conversion_list);
+	INIT_LIST_HEAD(&ei->i_unrsv_conversion_list);
 	spin_lock_init(&ei->i_completed_io_lock);
 	ei->i_sync_tid = 0;
 	ei->i_datasync_tid = 0;
 	atomic_set(&ei->i_ioend_count, 0);
 	atomic_set(&ei->i_unwritten, 0);
-	INIT_WORK(&ei->i_unwritten_work, ext4_end_io_work);
+	INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
+	INIT_WORK(&ei->i_unrsv_conversion_work, ext4_end_io_unrsv_work);
 
 	return &ei->vfs_inode;
 }
@@ -1093,6 +1110,7 @@
 	.dirty_inode	= ext4_dirty_inode,
 	.drop_inode	= ext4_drop_inode,
 	.evict_inode	= ext4_evict_inode,
+	.sync_fs	= ext4_sync_fs_nojournal,
 	.put_super	= ext4_put_super,
 	.statfs		= ext4_statfs,
 	.remount_fs	= ext4_remount,
@@ -1908,7 +1926,6 @@
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct ext4_group_desc *gdp = NULL;
 	ext4_group_t flex_group;
-	unsigned int groups_per_flex = 0;
 	int i, err;
 
 	sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
@@ -1916,7 +1933,6 @@
 		sbi->s_log_groups_per_flex = 0;
 		return 1;
 	}
-	groups_per_flex = 1U << sbi->s_log_groups_per_flex;
 
 	err = ext4_alloc_flex_bg_array(sb, sbi->s_groups_count);
 	if (err)
@@ -2164,19 +2180,22 @@
 		list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
 		dquot_initialize(inode);
 		if (inode->i_nlink) {
-			ext4_msg(sb, KERN_DEBUG,
-				"%s: truncating inode %lu to %lld bytes",
-				__func__, inode->i_ino, inode->i_size);
+			if (test_opt(sb, DEBUG))
+				ext4_msg(sb, KERN_DEBUG,
+					"%s: truncating inode %lu to %lld bytes",
+					__func__, inode->i_ino, inode->i_size);
 			jbd_debug(2, "truncating inode %lu to %lld bytes\n",
 				  inode->i_ino, inode->i_size);
 			mutex_lock(&inode->i_mutex);
+			truncate_inode_pages(inode->i_mapping, inode->i_size);
 			ext4_truncate(inode);
 			mutex_unlock(&inode->i_mutex);
 			nr_truncates++;
 		} else {
-			ext4_msg(sb, KERN_DEBUG,
-				"%s: deleting unreferenced inode %lu",
-				__func__, inode->i_ino);
+			if (test_opt(sb, DEBUG))
+				ext4_msg(sb, KERN_DEBUG,
+					"%s: deleting unreferenced inode %lu",
+					__func__, inode->i_ino);
 			jbd_debug(2, "deleting unreferenced inode %lu\n",
 				  inode->i_ino);
 			nr_orphans++;
@@ -2377,7 +2396,10 @@
 	ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *);
 	ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *,
 			 const char *, size_t);
-	int offset;
+	union {
+		int offset;
+		int deprecated_val;
+	} u;
 };
 
 static int parse_strtoull(const char *buf,
@@ -2446,7 +2468,7 @@
 static ssize_t sbi_ui_show(struct ext4_attr *a,
 			   struct ext4_sb_info *sbi, char *buf)
 {
-	unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset);
+	unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset);
 
 	return snprintf(buf, PAGE_SIZE, "%u\n", *ui);
 }
@@ -2455,7 +2477,7 @@
 			    struct ext4_sb_info *sbi,
 			    const char *buf, size_t count)
 {
-	unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset);
+	unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset);
 	unsigned long t;
 	int ret;
 
@@ -2504,12 +2526,20 @@
 	return count;
 }
 
+static ssize_t sbi_deprecated_show(struct ext4_attr *a,
+				   struct ext4_sb_info *sbi, char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%d\n", a->u.deprecated_val);
+}
+
 #define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \
 static struct ext4_attr ext4_attr_##_name = {			\
 	.attr = {.name = __stringify(_name), .mode = _mode },	\
 	.show	= _show,					\
 	.store	= _store,					\
-	.offset = offsetof(struct ext4_sb_info, _elname),	\
+	.u = {							\
+		.offset = offsetof(struct ext4_sb_info, _elname),\
+	},							\
 }
 #define EXT4_ATTR(name, mode, show, store) \
 static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
@@ -2520,6 +2550,14 @@
 #define EXT4_RW_ATTR_SBI_UI(name, elname)	\
 	EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname)
 #define ATTR_LIST(name) &ext4_attr_##name.attr
+#define EXT4_DEPRECATED_ATTR(_name, _val)	\
+static struct ext4_attr ext4_attr_##_name = {			\
+	.attr = {.name = __stringify(_name), .mode = 0444 },	\
+	.show	= sbi_deprecated_show,				\
+	.u = {							\
+		.deprecated_val = _val,				\
+	},							\
+}
 
 EXT4_RO_ATTR(delayed_allocation_blocks);
 EXT4_RO_ATTR(session_write_kbytes);
@@ -2534,7 +2572,7 @@
 EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
 EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
 EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
-EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump);
+EXT4_DEPRECATED_ATTR(max_writeback_mb_bump, 128);
 EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
 EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error);
 
@@ -3763,7 +3801,7 @@
 	sbi->s_err_report.data = (unsigned long) sb;
 
 	/* Register extent status tree shrinker */
-	ext4_es_register_shrinker(sb);
+	ext4_es_register_shrinker(sbi);
 
 	err = percpu_counter_init(&sbi->s_freeclusters_counter,
 			ext4_count_free_clusters(sb));
@@ -3787,7 +3825,6 @@
 	}
 
 	sbi->s_stripe = ext4_get_stripe_size(sbi);
-	sbi->s_max_writeback_mb_bump = 128;
 	sbi->s_extent_max_zeroout_kb = 32;
 
 	/*
@@ -3915,12 +3952,20 @@
 	 * The maximum number of concurrent works can be high and
 	 * concurrency isn't really necessary.  Limit it to 1.
 	 */
-	EXT4_SB(sb)->dio_unwritten_wq =
-		alloc_workqueue("ext4-dio-unwritten", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
-	if (!EXT4_SB(sb)->dio_unwritten_wq) {
-		printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
+	EXT4_SB(sb)->rsv_conversion_wq =
+		alloc_workqueue("ext4-rsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
+	if (!EXT4_SB(sb)->rsv_conversion_wq) {
+		printk(KERN_ERR "EXT4-fs: failed to create workqueue\n");
 		ret = -ENOMEM;
-		goto failed_mount_wq;
+		goto failed_mount4;
+	}
+
+	EXT4_SB(sb)->unrsv_conversion_wq =
+		alloc_workqueue("ext4-unrsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
+	if (!EXT4_SB(sb)->unrsv_conversion_wq) {
+		printk(KERN_ERR "EXT4-fs: failed to create workqueue\n");
+		ret = -ENOMEM;
+		goto failed_mount4;
 	}
 
 	/*
@@ -4074,14 +4119,17 @@
 	sb->s_root = NULL;
 failed_mount4:
 	ext4_msg(sb, KERN_ERR, "mount failed");
-	destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq);
+	if (EXT4_SB(sb)->rsv_conversion_wq)
+		destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
+	if (EXT4_SB(sb)->unrsv_conversion_wq)
+		destroy_workqueue(EXT4_SB(sb)->unrsv_conversion_wq);
 failed_mount_wq:
 	if (sbi->s_journal) {
 		jbd2_journal_destroy(sbi->s_journal);
 		sbi->s_journal = NULL;
 	}
 failed_mount3:
-	ext4_es_unregister_shrinker(sb);
+	ext4_es_unregister_shrinker(sbi);
 	del_timer(&sbi->s_err_report);
 	if (sbi->s_flex_groups)
 		ext4_kvfree(sbi->s_flex_groups);
@@ -4517,19 +4565,52 @@
 {
 	int ret = 0;
 	tid_t target;
+	bool needs_barrier = false;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 
 	trace_ext4_sync_fs(sb, wait);
-	flush_workqueue(sbi->dio_unwritten_wq);
+	flush_workqueue(sbi->rsv_conversion_wq);
+	flush_workqueue(sbi->unrsv_conversion_wq);
 	/*
 	 * Writeback quota in non-journalled quota case - journalled quota has
 	 * no dirty dquots
 	 */
 	dquot_writeback_dquots(sb, -1);
+	/*
+	 * Data writeback is possible w/o journal transaction, so barrier must
+	 * be sent at the end of the function. But we can skip it if
+	 * transaction_commit will do it for us.
+	 */
+	target = jbd2_get_latest_transaction(sbi->s_journal);
+	if (wait && sbi->s_journal->j_flags & JBD2_BARRIER &&
+	    !jbd2_trans_will_send_data_barrier(sbi->s_journal, target))
+		needs_barrier = true;
+
 	if (jbd2_journal_start_commit(sbi->s_journal, &target)) {
 		if (wait)
-			jbd2_log_wait_commit(sbi->s_journal, target);
+			ret = jbd2_log_wait_commit(sbi->s_journal, target);
 	}
+	if (needs_barrier) {
+		int err;
+		err = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL);
+		if (!ret)
+			ret = err;
+	}
+
+	return ret;
+}
+
+static int ext4_sync_fs_nojournal(struct super_block *sb, int wait)
+{
+	int ret = 0;
+
+	trace_ext4_sync_fs(sb, wait);
+	flush_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
+	flush_workqueue(EXT4_SB(sb)->unrsv_conversion_wq);
+	dquot_writeback_dquots(sb, -1);
+	if (wait && test_opt(sb, BARRIER))
+		ret = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL);
+
 	return ret;
 }
 

diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig
index fd27e7e..e06e099 100644
--- a/fs/f2fs/Kconfig
+++ b/fs/f2fs/Kconfig

@@ -51,3 +51,15 @@
 	  Linux website <http://acl.bestbits.at/>.
 
 	  If you don't know what Access Control Lists are, say N
+
+config F2FS_FS_SECURITY
+	bool "F2FS Security Labels"
+	depends on F2FS_FS_XATTR
+	help
+	  Security labels provide an access control facility to support Linux
+	  Security Modules (LSMs) such as AppArmor, SELinux, Smack and TOMOYO
+	  Linux. This option enables an extended attribute handler for file
+	  security labels in the f2fs filesystem, so it requires extended
+	  attribute support to be enabled first.
+
+	  If you are not using a security module, say N.

diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
index 44abc2f..b7826ec 100644
--- a/fs/f2fs/acl.c
+++ b/fs/f2fs/acl.c

@@ -250,7 +250,7 @@
 		}
 	}
 
-	error = f2fs_setxattr(inode, name_index, "", value, size);
+	error = f2fs_setxattr(inode, name_index, "", value, size, NULL);
 
 	kfree(value);
 	if (!error)

diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index b1de01d..66a6b85 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c

@@ -357,8 +357,8 @@
 	unsigned long blk_size = sbi->blocksize;
 	struct f2fs_checkpoint *cp_block;
 	unsigned long long cur_version = 0, pre_version = 0;
-	unsigned int crc = 0;
 	size_t crc_offset;
+	__u32 crc = 0;
 
 	/* Read the 1st cp block in this CP pack */
 	cp_page_1 = get_meta_page(sbi, cp_addr);
@@ -369,7 +369,7 @@
 	if (crc_offset >= blk_size)
 		goto invalid_cp1;
 
-	crc = *(unsigned int *)((unsigned char *)cp_block + crc_offset);
+	crc = le32_to_cpu(*((__u32 *)((unsigned char *)cp_block + crc_offset)));
 	if (!f2fs_crc_valid(crc, cp_block, crc_offset))
 		goto invalid_cp1;
 
@@ -384,7 +384,7 @@
 	if (crc_offset >= blk_size)
 		goto invalid_cp2;
 
-	crc = *(unsigned int *)((unsigned char *)cp_block + crc_offset);
+	crc = le32_to_cpu(*((__u32 *)((unsigned char *)cp_block + crc_offset)));
 	if (!f2fs_crc_valid(crc, cp_block, crc_offset))
 		goto invalid_cp2;
 
@@ -450,13 +450,30 @@
 	return -EINVAL;
 }
 
-void set_dirty_dir_page(struct inode *inode, struct page *page)
+static int __add_dirty_inode(struct inode *inode, struct dir_inode_entry *new)
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
 	struct list_head *head = &sbi->dir_inode_list;
-	struct dir_inode_entry *new;
 	struct list_head *this;
 
+	list_for_each(this, head) {
+		struct dir_inode_entry *entry;
+		entry = list_entry(this, struct dir_inode_entry, list);
+		if (entry->inode == inode)
+			return -EEXIST;
+	}
+	list_add_tail(&new->list, head);
+#ifdef CONFIG_F2FS_STAT_FS
+	sbi->n_dirty_dirs++;
+#endif
+	return 0;
+}
+
+void set_dirty_dir_page(struct inode *inode, struct page *page)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+	struct dir_inode_entry *new;
+
 	if (!S_ISDIR(inode->i_mode))
 		return;
 retry:
@@ -469,23 +486,31 @@
 	INIT_LIST_HEAD(&new->list);
 
 	spin_lock(&sbi->dir_inode_lock);
-	list_for_each(this, head) {
-		struct dir_inode_entry *entry;
-		entry = list_entry(this, struct dir_inode_entry, list);
-		if (entry->inode == inode) {
-			kmem_cache_free(inode_entry_slab, new);
-			goto out;
-		}
-	}
-	list_add_tail(&new->list, head);
-	sbi->n_dirty_dirs++;
+	if (__add_dirty_inode(inode, new))
+		kmem_cache_free(inode_entry_slab, new);
 
-	BUG_ON(!S_ISDIR(inode->i_mode));
-out:
 	inc_page_count(sbi, F2FS_DIRTY_DENTS);
 	inode_inc_dirty_dents(inode);
 	SetPagePrivate(page);
+	spin_unlock(&sbi->dir_inode_lock);
+}
 
+void add_dirty_dir_inode(struct inode *inode)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+	struct dir_inode_entry *new;
+retry:
+	new = kmem_cache_alloc(inode_entry_slab, GFP_NOFS);
+	if (!new) {
+		cond_resched();
+		goto retry;
+	}
+	new->inode = inode;
+	INIT_LIST_HEAD(&new->list);
+
+	spin_lock(&sbi->dir_inode_lock);
+	if (__add_dirty_inode(inode, new))
+		kmem_cache_free(inode_entry_slab, new);
 	spin_unlock(&sbi->dir_inode_lock);
 }
 
@@ -499,8 +524,10 @@
 		return;
 
 	spin_lock(&sbi->dir_inode_lock);
-	if (atomic_read(&F2FS_I(inode)->dirty_dents))
-		goto out;
+	if (atomic_read(&F2FS_I(inode)->dirty_dents)) {
+		spin_unlock(&sbi->dir_inode_lock);
+		return;
+	}
 
 	list_for_each(this, head) {
 		struct dir_inode_entry *entry;
@@ -508,12 +535,38 @@
 		if (entry->inode == inode) {
 			list_del(&entry->list);
 			kmem_cache_free(inode_entry_slab, entry);
+#ifdef CONFIG_F2FS_STAT_FS
 			sbi->n_dirty_dirs--;
+#endif
 			break;
 		}
 	}
-out:
 	spin_unlock(&sbi->dir_inode_lock);
+
+	/* Only from the recovery routine */
+	if (is_inode_flag_set(F2FS_I(inode), FI_DELAY_IPUT)) {
+		clear_inode_flag(F2FS_I(inode), FI_DELAY_IPUT);
+		iput(inode);
+	}
+}
+
+struct inode *check_dirty_dir_inode(struct f2fs_sb_info *sbi, nid_t ino)
+{
+	struct list_head *head = &sbi->dir_inode_list;
+	struct list_head *this;
+	struct inode *inode = NULL;
+
+	spin_lock(&sbi->dir_inode_lock);
+	list_for_each(this, head) {
+		struct dir_inode_entry *entry;
+		entry = list_entry(this, struct dir_inode_entry, list);
+		if (entry->inode->i_ino == ino) {
+			inode = entry->inode;
+			break;
+		}
+	}
+	spin_unlock(&sbi->dir_inode_lock);
+	return inode;
 }
 
 void sync_dirty_dir_inodes(struct f2fs_sb_info *sbi)
@@ -595,7 +648,7 @@
 	block_t start_blk;
 	struct page *cp_page;
 	unsigned int data_sum_blocks, orphan_blocks;
-	unsigned int crc32 = 0;
+	__u32 crc32 = 0;
 	void *kaddr;
 	int i;
 
@@ -664,8 +717,8 @@
 	get_nat_bitmap(sbi, __bitmap_ptr(sbi, NAT_BITMAP));
 
 	crc32 = f2fs_crc32(ckpt, le32_to_cpu(ckpt->checksum_offset));
-	*(__le32 *)((unsigned char *)ckpt +
-				le32_to_cpu(ckpt->checksum_offset))
+	*((__le32 *)((unsigned char *)ckpt +
+				le32_to_cpu(ckpt->checksum_offset)))
 				= cpu_to_le32(crc32);
 
 	start_blk = __start_cp_addr(sbi);

diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 91ff93b..035f9a3 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c

@@ -68,7 +68,9 @@
 					struct buffer_head *bh_result)
 {
 	struct f2fs_inode_info *fi = F2FS_I(inode);
+#ifdef CONFIG_F2FS_STAT_FS
 	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+#endif
 	pgoff_t start_fofs, end_fofs;
 	block_t start_blkaddr;
 
@@ -78,7 +80,9 @@
 		return 0;
 	}
 
+#ifdef CONFIG_F2FS_STAT_FS
 	sbi->total_hit_ext++;
+#endif
 	start_fofs = fi->ext.fofs;
 	end_fofs = fi->ext.fofs + fi->ext.len - 1;
 	start_blkaddr = fi->ext.blk_addr;
@@ -96,7 +100,9 @@
 		else
 			bh_result->b_size = UINT_MAX;
 
+#ifdef CONFIG_F2FS_STAT_FS
 		sbi->read_hit_ext++;
+#endif
 		read_unlock(&fi->ext.ext_lock);
 		return 1;
 	}
@@ -199,7 +205,7 @@
 	if (dn.data_blkaddr == NEW_ADDR)
 		return ERR_PTR(-EINVAL);
 
-	page = grab_cache_page(mapping, index);
+	page = grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS);
 	if (!page)
 		return ERR_PTR(-ENOMEM);
 
@@ -233,19 +239,24 @@
 	struct page *page;
 	int err;
 
-	set_new_dnode(&dn, inode, NULL, NULL, 0);
-	err = get_dnode_of_data(&dn, index, LOOKUP_NODE);
-	if (err)
-		return ERR_PTR(err);
-	f2fs_put_dnode(&dn);
-
-	if (dn.data_blkaddr == NULL_ADDR)
-		return ERR_PTR(-ENOENT);
 repeat:
-	page = grab_cache_page(mapping, index);
+	page = grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS);
 	if (!page)
 		return ERR_PTR(-ENOMEM);
 
+	set_new_dnode(&dn, inode, NULL, NULL, 0);
+	err = get_dnode_of_data(&dn, index, LOOKUP_NODE);
+	if (err) {
+		f2fs_put_page(page, 1);
+		return ERR_PTR(err);
+	}
+	f2fs_put_dnode(&dn);
+
+	if (dn.data_blkaddr == NULL_ADDR) {
+		f2fs_put_page(page, 1);
+		return ERR_PTR(-ENOENT);
+	}
+
 	if (PageUptodate(page))
 		return page;
 
@@ -274,9 +285,10 @@
  *
  * Also, caller should grab and release a mutex by calling mutex_lock_op() and
  * mutex_unlock_op().
+ * Note that, npage is set only by make_empty_dir.
  */
-struct page *get_new_data_page(struct inode *inode, pgoff_t index,
-						bool new_i_size)
+struct page *get_new_data_page(struct inode *inode,
+		struct page *npage, pgoff_t index, bool new_i_size)
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
 	struct address_space *mapping = inode->i_mapping;
@@ -284,18 +296,20 @@
 	struct dnode_of_data dn;
 	int err;
 
-	set_new_dnode(&dn, inode, NULL, NULL, 0);
+	set_new_dnode(&dn, inode, npage, npage, 0);
 	err = get_dnode_of_data(&dn, index, ALLOC_NODE);
 	if (err)
 		return ERR_PTR(err);
 
 	if (dn.data_blkaddr == NULL_ADDR) {
 		if (reserve_new_block(&dn)) {
-			f2fs_put_dnode(&dn);
+			if (!npage)
+				f2fs_put_dnode(&dn);
 			return ERR_PTR(-ENOSPC);
 		}
 	}
-	f2fs_put_dnode(&dn);
+	if (!npage)
+		f2fs_put_dnode(&dn);
 repeat:
 	page = grab_cache_page(mapping, index);
 	if (!page)
@@ -325,6 +339,8 @@
 	if (new_i_size &&
 		i_size_read(inode) < ((index + 1) << PAGE_CACHE_SHIFT)) {
 		i_size_write(inode, ((index + 1) << PAGE_CACHE_SHIFT));
+		/* Only the directory inode sets new_i_size */
+		set_inode_flag(F2FS_I(inode), FI_UPDATE_DIR);
 		mark_inode_dirty_sync(inode);
 	}
 	return page;
@@ -481,8 +497,9 @@
 	 * If current allocation needs SSR,
 	 * it had better in-place writes for updated data.
 	 */
-	if (old_blk_addr != NEW_ADDR && !is_cold_data(page) &&
-				need_inplace_update(inode)) {
+	if (unlikely(old_blk_addr != NEW_ADDR &&
+			!is_cold_data(page) &&
+			need_inplace_update(inode))) {
 		rewrite_data_page(F2FS_SB(inode->i_sb), page,
 						old_blk_addr);
 	} else {
@@ -684,6 +701,27 @@
 	return err;
 }
 
+static int f2fs_write_end(struct file *file,
+			struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned copied,
+			struct page *page, void *fsdata)
+{
+	struct inode *inode = page->mapping->host;
+
+	SetPageUptodate(page);
+	set_page_dirty(page);
+
+	if (pos + copied > i_size_read(inode)) {
+		i_size_write(inode, pos + copied);
+		mark_inode_dirty(inode);
+		update_inode_page(inode);
+	}
+
+	unlock_page(page);
+	page_cache_release(page);
+	return copied;
+}
+
 static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb,
 		const struct iovec *iov, loff_t offset, unsigned long nr_segs)
 {
@@ -698,7 +736,8 @@
 						  get_data_block_ro);
 }
 
-static void f2fs_invalidate_data_page(struct page *page, unsigned long offset)
+static void f2fs_invalidate_data_page(struct page *page, unsigned int offset,
+				      unsigned int length)
 {
 	struct inode *inode = page->mapping->host;
 	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
@@ -740,7 +779,7 @@
 	.writepage	= f2fs_write_data_page,
 	.writepages	= f2fs_write_data_pages,
 	.write_begin	= f2fs_write_begin,
-	.write_end	= nobh_write_end,
+	.write_end	= f2fs_write_end,
 	.set_page_dirty	= f2fs_set_data_page_dirty,
 	.invalidatepage	= f2fs_invalidate_data_page,
 	.releasepage	= f2fs_release_data_page,

diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index 8d99437..0d6c6aa 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c

@@ -175,12 +175,12 @@
 
 static int stat_show(struct seq_file *s, void *v)
 {
-	struct f2fs_stat_info *si, *next;
+	struct f2fs_stat_info *si;
 	int i = 0;
 	int j;
 
 	mutex_lock(&f2fs_stat_mutex);
-	list_for_each_entry_safe(si, next, &f2fs_stat_list, stat_list) {
+	list_for_each_entry(si, &f2fs_stat_list, stat_list) {
 		char devname[BDEVNAME_SIZE];
 
 		update_general_status(si->sbi);

diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 1ac6b93..9d1cd42 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c

@@ -13,6 +13,7 @@
 #include "f2fs.h"
 #include "node.h"
 #include "acl.h"
+#include "xattr.h"
 
 static unsigned long dir_blocks(struct inode *inode)
 {
@@ -215,9 +216,9 @@
 
 struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p)
 {
-	struct page *page = NULL;
-	struct f2fs_dir_entry *de = NULL;
-	struct f2fs_dentry_block *dentry_blk = NULL;
+	struct page *page;
+	struct f2fs_dir_entry *de;
+	struct f2fs_dentry_block *dentry_blk;
 
 	page = get_lock_data_page(dir, 0);
 	if (IS_ERR(page))
@@ -264,15 +265,10 @@
 	f2fs_put_page(page, 1);
 }
 
-void init_dent_inode(const struct qstr *name, struct page *ipage)
+static void init_dent_inode(const struct qstr *name, struct page *ipage)
 {
 	struct f2fs_node *rn;
 
-	if (IS_ERR(ipage))
-		return;
-
-	wait_on_page_writeback(ipage);
-
 	/* copy name info. to this inode page */
 	rn = (struct f2fs_node *)page_address(ipage);
 	rn->i.i_namelen = cpu_to_le32(name->len);
@@ -280,14 +276,15 @@
 	set_page_dirty(ipage);
 }
 
-static int make_empty_dir(struct inode *inode, struct inode *parent)
+static int make_empty_dir(struct inode *inode,
+		struct inode *parent, struct page *page)
 {
 	struct page *dentry_page;
 	struct f2fs_dentry_block *dentry_blk;
 	struct f2fs_dir_entry *de;
 	void *kaddr;
 
-	dentry_page = get_new_data_page(inode, 0, true);
+	dentry_page = get_new_data_page(inode, page, 0, true);
 	if (IS_ERR(dentry_page))
 		return PTR_ERR(dentry_page);
 
@@ -317,63 +314,76 @@
 	return 0;
 }
 
-static int init_inode_metadata(struct inode *inode,
+static struct page *init_inode_metadata(struct inode *inode,
 		struct inode *dir, const struct qstr *name)
 {
+	struct page *page;
+	int err;
+
 	if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) {
-		int err;
-		err = new_inode_page(inode, name);
-		if (err)
-			return err;
+		page = new_inode_page(inode, name);
+		if (IS_ERR(page))
+			return page;
 
 		if (S_ISDIR(inode->i_mode)) {
-			err = make_empty_dir(inode, dir);
-			if (err) {
-				remove_inode_page(inode);
-				return err;
-			}
+			err = make_empty_dir(inode, dir, page);
+			if (err)
+				goto error;
 		}
 
 		err = f2fs_init_acl(inode, dir);
-		if (err) {
-			remove_inode_page(inode);
-			return err;
-		}
+		if (err)
+			goto error;
+
+		err = f2fs_init_security(inode, dir, name, page);
+		if (err)
+			goto error;
+
+		wait_on_page_writeback(page);
 	} else {
-		struct page *ipage;
-		ipage = get_node_page(F2FS_SB(dir->i_sb), inode->i_ino);
-		if (IS_ERR(ipage))
-			return PTR_ERR(ipage);
-		set_cold_node(inode, ipage);
-		init_dent_inode(name, ipage);
-		f2fs_put_page(ipage, 1);
+		page = get_node_page(F2FS_SB(dir->i_sb), inode->i_ino);
+		if (IS_ERR(page))
+			return page;
+
+		wait_on_page_writeback(page);
+		set_cold_node(inode, page);
 	}
+
+	init_dent_inode(name, page);
+
+	/*
+	 * This file should be checkpointed during fsync.
+	 * i_pino is lost from now on.
+	 */
 	if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) {
+		file_lost_pino(inode);
 		inc_nlink(inode);
-		update_inode_page(inode);
 	}
-	return 0;
+	return page;
+
+error:
+	f2fs_put_page(page, 1);
+	remove_inode_page(inode);
+	return ERR_PTR(err);
 }
 
 static void update_parent_metadata(struct inode *dir, struct inode *inode,
 						unsigned int current_depth)
 {
-	bool need_dir_update = false;
-
 	if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) {
 		if (S_ISDIR(inode->i_mode)) {
 			inc_nlink(dir);
-			need_dir_update = true;
+			set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
 		}
 		clear_inode_flag(F2FS_I(inode), FI_NEW_INODE);
 	}
 	dir->i_mtime = dir->i_ctime = CURRENT_TIME;
 	if (F2FS_I(dir)->i_current_depth != current_depth) {
 		F2FS_I(dir)->i_current_depth = current_depth;
-		need_dir_update = true;
+		set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
 	}
 
-	if (need_dir_update)
+	if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR))
 		update_inode_page(dir);
 	else
 		mark_inode_dirty(dir);
@@ -423,6 +433,7 @@
 	struct page *dentry_page = NULL;
 	struct f2fs_dentry_block *dentry_blk = NULL;
 	int slots = GET_DENTRY_SLOTS(namelen);
+	struct page *page;
 	int err = 0;
 	int i;
 
@@ -448,7 +459,7 @@
 	bidx = dir_block_index(level, (le32_to_cpu(dentry_hash) % nbucket));
 
 	for (block = bidx; block <= (bidx + nblock - 1); block++) {
-		dentry_page = get_new_data_page(dir, block, true);
+		dentry_page = get_new_data_page(dir, NULL, block, true);
 		if (IS_ERR(dentry_page))
 			return PTR_ERR(dentry_page);
 
@@ -465,12 +476,13 @@
 	++level;
 	goto start;
 add_dentry:
-	err = init_inode_metadata(inode, dir, name);
-	if (err)
-		goto fail;
-
 	wait_on_page_writeback(dentry_page);
 
+	page = init_inode_metadata(inode, dir, name);
+	if (IS_ERR(page)) {
+		err = PTR_ERR(page);
+		goto fail;
+	}
 	de = &dentry_blk->dentry[bit_pos];
 	de->hash_code = dentry_hash;
 	de->name_len = cpu_to_le16(namelen);
@@ -481,11 +493,14 @@
 		test_and_set_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap);
 	set_page_dirty(dentry_page);
 
-	update_parent_metadata(dir, inode, current_depth);
-
-	/* update parent inode number before releasing dentry page */
+	/* we don't need to mark_inode_dirty now */
 	F2FS_I(inode)->i_pino = dir->i_ino;
+	update_inode(inode, page);
+	f2fs_put_page(page, 1);
+
+	update_parent_metadata(dir, inode, current_depth);
 fail:
+	clear_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
 	kunmap(dentry_page);
 	f2fs_put_page(dentry_page, 1);
 	return err;
@@ -591,24 +606,19 @@
 	return true;
 }
 
-static int f2fs_readdir(struct file *file, void *dirent, filldir_t filldir)
+static int f2fs_readdir(struct file *file, struct dir_context *ctx)
 {
-	unsigned long pos = file->f_pos;
 	struct inode *inode = file_inode(file);
 	unsigned long npages = dir_blocks(inode);
-	unsigned char *types = NULL;
 	unsigned int bit_pos = 0, start_bit_pos = 0;
-	int over = 0;
 	struct f2fs_dentry_block *dentry_blk = NULL;
 	struct f2fs_dir_entry *de = NULL;
 	struct page *dentry_page = NULL;
-	unsigned int n = 0;
+	unsigned int n = ((unsigned long)ctx->pos / NR_DENTRY_IN_BLOCK);
 	unsigned char d_type = DT_UNKNOWN;
 	int slots;
 
-	types = f2fs_filetype_table;
-	bit_pos = (pos % NR_DENTRY_IN_BLOCK);
-	n = (pos / NR_DENTRY_IN_BLOCK);
+	bit_pos = ((unsigned long)ctx->pos % NR_DENTRY_IN_BLOCK);
 
 	for ( ; n < npages; n++) {
 		dentry_page = get_lock_data_page(inode, n);
@@ -618,31 +628,28 @@
 		start_bit_pos = bit_pos;
 		dentry_blk = kmap(dentry_page);
 		while (bit_pos < NR_DENTRY_IN_BLOCK) {
-			d_type = DT_UNKNOWN;
 			bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap,
 							NR_DENTRY_IN_BLOCK,
 							bit_pos);
 			if (bit_pos >= NR_DENTRY_IN_BLOCK)
 				break;
 
+			ctx->pos += bit_pos - start_bit_pos;
 			de = &dentry_blk->dentry[bit_pos];
-			if (types && de->file_type < F2FS_FT_MAX)
-				d_type = types[de->file_type];
-
-			over = filldir(dirent,
-					dentry_blk->filename[bit_pos],
-					le16_to_cpu(de->name_len),
-					(n * NR_DENTRY_IN_BLOCK) + bit_pos,
-					le32_to_cpu(de->ino), d_type);
-			if (over) {
-				file->f_pos += bit_pos - start_bit_pos;
+			if (de->file_type < F2FS_FT_MAX)
+				d_type = f2fs_filetype_table[de->file_type];
+			else
+				d_type = DT_UNKNOWN;
+			if (!dir_emit(ctx,
+				      dentry_blk->filename[bit_pos],
+				      le16_to_cpu(de->name_len),
+				      le32_to_cpu(de->ino), d_type))
 				goto success;
-			}
 			slots = GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
 			bit_pos += slots;
 		}
 		bit_pos = 0;
-		file->f_pos = (n + 1) * NR_DENTRY_IN_BLOCK;
+		ctx->pos = (n + 1) * NR_DENTRY_IN_BLOCK;
 		kunmap(dentry_page);
 		f2fs_put_page(dentry_page, 1);
 		dentry_page = NULL;
@@ -659,7 +666,7 @@
 const struct file_operations f2fs_dir_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
-	.readdir	= f2fs_readdir,
+	.iterate	= f2fs_readdir,
 	.fsync		= f2fs_sync_file,
 	.unlocked_ioctl	= f2fs_ioctl,
 };

diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 20aab02..467d42d 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h

@@ -37,21 +37,35 @@
 		typecheck(unsigned long long, b) &&			\
 		((long long)((a) - (b)) > 0))
 
-typedef u64 block_t;
+typedef u32 block_t;	/*
+			 * should not change u32, since it is the on-disk block
+			 * address format, __le32.
+			 */
 typedef u32 nid_t;
 
 struct f2fs_mount_info {
 	unsigned int	opt;
 };
 
-static inline __u32 f2fs_crc32(void *buff, size_t len)
+#define CRCPOLY_LE 0xedb88320
+
+static inline __u32 f2fs_crc32(void *buf, size_t len)
 {
-	return crc32_le(F2FS_SUPER_MAGIC, buff, len);
+	unsigned char *p = (unsigned char *)buf;
+	__u32 crc = F2FS_SUPER_MAGIC;
+	int i;
+
+	while (len--) {
+		crc ^= *p++;
+		for (i = 0; i < 8; i++)
+			crc = (crc >> 1) ^ ((crc & 1) ? CRCPOLY_LE : 0);
+	}
+	return crc;
 }
 
-static inline bool f2fs_crc_valid(__u32 blk_crc, void *buff, size_t buff_size)
+static inline bool f2fs_crc_valid(__u32 blk_crc, void *buf, size_t buf_size)
 {
-	return f2fs_crc32(buff, buff_size) == blk_crc;
+	return f2fs_crc32(buf, buf_size) == blk_crc;
 }
 
 /*
@@ -148,7 +162,7 @@
  * i_advise uses FADVISE_XXX_BIT. We can add additional hints later.
  */
 #define FADVISE_COLD_BIT	0x01
-#define FADVISE_CP_BIT		0x02
+#define FADVISE_LOST_PINO_BIT	0x02
 
 struct f2fs_inode_info {
 	struct inode vfs_inode;		/* serve a vfs inode */
@@ -369,7 +383,6 @@
 	/* for directory inode management */
 	struct list_head dir_inode_list;	/* dir inode list */
 	spinlock_t dir_inode_lock;		/* for dir inode list lock */
-	unsigned int n_dirty_dirs;		/* # of dir inodes */
 
 	/* basic file system units */
 	unsigned int log_sectors_per_block;	/* log2 sectors per block */
@@ -406,12 +419,15 @@
 	 * for stat information.
 	 * one is for the LFS mode, and the other is for the SSR mode.
 	 */
+#ifdef CONFIG_F2FS_STAT_FS
 	struct f2fs_stat_info *stat_info;	/* FS status information */
 	unsigned int segment_count[2];		/* # of allocated segments */
 	unsigned int block_count[2];		/* # of allocated blocks */
-	unsigned int last_victim[2];		/* last victim segment # */
 	int total_hit_ext, read_hit_ext;	/* extent cache hit ratio */
 	int bg_gc;				/* background gc calls */
+	unsigned int n_dirty_dirs;		/* # of dir inodes */
+#endif
+	unsigned int last_victim[2];		/* last victim segment # */
 	spinlock_t stat_lock;			/* lock for stat operations */
 };
 
@@ -495,9 +511,17 @@
 
 static inline void mutex_lock_all(struct f2fs_sb_info *sbi)
 {
-	int i = 0;
-	for (; i < NR_GLOBAL_LOCKS; i++)
-		mutex_lock(&sbi->fs_lock[i]);
+	int i;
+
+	for (i = 0; i < NR_GLOBAL_LOCKS; i++) {
+		/*
+		 * This is the only time we take multiple fs_lock[]
+		 * instances; the order is immaterial since we
+		 * always hold cp_mutex, which serializes multiple
+		 * such operations.
+		 */
+		mutex_lock_nest_lock(&sbi->fs_lock[i], &sbi->cp_mutex);
+	}
 }
 
 static inline void mutex_unlock_all(struct f2fs_sb_info *sbi)
@@ -843,9 +867,12 @@
 /* used for f2fs_inode_info->flags */
 enum {
 	FI_NEW_INODE,		/* indicate newly allocated inode */
+	FI_DIRTY_INODE,		/* indicate inode is dirty or not */
 	FI_INC_LINK,		/* need to increment i_nlink */
 	FI_ACL_MODE,		/* indicate acl mode */
 	FI_NO_ALLOC,		/* should not allocate any blocks */
+	FI_UPDATE_DIR,		/* should update inode block for consistency */
+	FI_DELAY_IPUT,		/* used for the recovery */
 };
 
 static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag)
@@ -878,14 +905,21 @@
 	return 0;
 }
 
+static inline int f2fs_readonly(struct super_block *sb)
+{
+	return sb->s_flags & MS_RDONLY;
+}
+
 /*
  * file.c
  */
 int f2fs_sync_file(struct file *, loff_t, loff_t, int);
 void truncate_data_blocks(struct dnode_of_data *);
 void f2fs_truncate(struct inode *);
+int f2fs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
 int f2fs_setattr(struct dentry *, struct iattr *);
 int truncate_hole(struct inode *, pgoff_t, pgoff_t);
+int truncate_data_blocks_range(struct dnode_of_data *, int);
 long f2fs_ioctl(struct file *, unsigned int, unsigned long);
 long f2fs_compat_ioctl(struct file *, unsigned int, unsigned long);
 
@@ -913,7 +947,6 @@
 ino_t f2fs_inode_by_name(struct inode *, struct qstr *);
 void f2fs_set_link(struct inode *, struct f2fs_dir_entry *,
 				struct page *, struct inode *);
-void init_dent_inode(const struct qstr *, struct page *);
 int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *);
 void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *);
 int f2fs_make_empty(struct inode *, struct inode *);
@@ -948,8 +981,8 @@
 int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int);
 int truncate_inode_blocks(struct inode *, pgoff_t);
 int remove_inode_page(struct inode *);
-int new_inode_page(struct inode *, const struct qstr *);
-struct page *new_node_page(struct dnode_of_data *, unsigned int);
+struct page *new_inode_page(struct inode *, const struct qstr *);
+struct page *new_node_page(struct dnode_of_data *, unsigned int, struct page *);
 void ra_node_page(struct f2fs_sb_info *, nid_t);
 struct page *get_node_page(struct f2fs_sb_info *, pgoff_t);
 struct page *get_node_page_ra(struct page *, int);
@@ -974,7 +1007,6 @@
  */
 void f2fs_balance_fs(struct f2fs_sb_info *);
 void invalidate_blocks(struct f2fs_sb_info *, block_t);
-void locate_dirty_segment(struct f2fs_sb_info *, unsigned int);
 void clear_prefree_segments(struct f2fs_sb_info *);
 int npages_for_summary_flush(struct f2fs_sb_info *);
 void allocate_new_segments(struct f2fs_sb_info *);
@@ -1011,7 +1043,9 @@
 int recover_orphan_inodes(struct f2fs_sb_info *);
 int get_valid_checkpoint(struct f2fs_sb_info *);
 void set_dirty_dir_page(struct inode *, struct page *);
+void add_dirty_dir_inode(struct inode *);
 void remove_dirty_dir_inode(struct inode *);
+struct inode *check_dirty_dir_inode(struct f2fs_sb_info *, nid_t);
 void sync_dirty_dir_inodes(struct f2fs_sb_info *);
 void write_checkpoint(struct f2fs_sb_info *, bool);
 void init_orphan_info(struct f2fs_sb_info *);
@@ -1025,7 +1059,7 @@
 void update_extent_cache(block_t, struct dnode_of_data *);
 struct page *find_data_page(struct inode *, pgoff_t, bool);
 struct page *get_lock_data_page(struct inode *, pgoff_t);
-struct page *get_new_data_page(struct inode *, pgoff_t, bool);
+struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool);
 int f2fs_readpage(struct f2fs_sb_info *, struct page *, block_t, int);
 int do_write_data_page(struct page *);
 

diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 1cae864..d2d2b7db 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c

@@ -63,9 +63,10 @@
 	f2fs_put_dnode(&dn);
 	mutex_unlock_op(sbi, ilock);
 
+	file_update_time(vma->vm_file);
 	lock_page(page);
 	if (page->mapping != inode->i_mapping ||
-			page_offset(page) >= i_size_read(inode) ||
+			page_offset(page) > i_size_read(inode) ||
 			!PageUptodate(page)) {
 		unlock_page(page);
 		err = -EFAULT;
@@ -76,10 +77,7 @@
 	 * check to see if the page is mapped already (no holes)
 	 */
 	if (PageMappedToDisk(page))
-		goto out;
-
-	/* fill the page */
-	wait_on_page_writeback(page);
+		goto mapped;
 
 	/* page is wholly or partially inside EOF */
 	if (((page->index + 1) << PAGE_CACHE_SHIFT) > i_size_read(inode)) {
@@ -90,7 +88,9 @@
 	set_page_dirty(page);
 	SetPageUptodate(page);
 
-	file_update_time(vma->vm_file);
+mapped:
+	/* fill the page */
+	wait_on_page_writeback(page);
 out:
 	sb_end_pagefault(inode->i_sb);
 	return block_page_mkwrite_return(err);
@@ -102,6 +102,24 @@
 	.remap_pages	= generic_file_remap_pages,
 };
 
+static int get_parent_ino(struct inode *inode, nid_t *pino)
+{
+	struct dentry *dentry;
+
+	inode = igrab(inode);
+	dentry = d_find_any_alias(inode);
+	iput(inode);
+	if (!dentry)
+		return 0;
+
+	inode = igrab(dentry->d_parent->d_inode);
+	dput(dentry);
+
+	*pino = inode->i_ino;
+	iput(inode);
+	return 1;
+}
+
 int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 {
 	struct inode *inode = file->f_mapping->host;
@@ -114,7 +132,7 @@
 		.for_reclaim = 0,
 	};
 
-	if (inode->i_sb->s_flags & MS_RDONLY)
+	if (f2fs_readonly(inode->i_sb))
 		return 0;
 
 	trace_f2fs_sync_file_enter(inode);
@@ -134,7 +152,7 @@
 
 	if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1)
 		need_cp = true;
-	else if (is_cp_file(inode))
+	else if (file_wrong_pino(inode))
 		need_cp = true;
 	else if (!space_for_roll_forward(sbi))
 		need_cp = true;
@@ -142,11 +160,23 @@
 		need_cp = true;
 
 	if (need_cp) {
+		nid_t pino;
+
 		/* all the dirty node pages should be flushed for POR */
 		ret = f2fs_sync_fs(inode->i_sb, 1);
+		if (file_wrong_pino(inode) && inode->i_nlink == 1 &&
+					get_parent_ino(inode, &pino)) {
+			F2FS_I(inode)->i_pino = pino;
+			file_got_pino(inode);
+			mark_inode_dirty_sync(inode);
+			ret = f2fs_write_inode(inode, NULL);
+			if (ret)
+				goto out;
+		}
 	} else {
 		/* if there is no written node page, write its inode page */
 		while (!sync_node_pages(sbi, inode->i_ino, &wbc)) {
+			mark_inode_dirty_sync(inode);
 			ret = f2fs_write_inode(inode, NULL);
 			if (ret)
 				goto out;
@@ -168,7 +198,7 @@
 	return 0;
 }
 
-static int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
+int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
 {
 	int nr_free = 0, ofs = dn->ofs_in_node;
 	struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
@@ -185,10 +215,10 @@
 
 		update_extent_cache(NULL_ADDR, dn);
 		invalidate_blocks(sbi, blkaddr);
-		dec_valid_block_count(sbi, dn->inode, 1);
 		nr_free++;
 	}
 	if (nr_free) {
+		dec_valid_block_count(sbi, dn->inode, nr_free);
 		set_page_dirty(dn->node_page);
 		sync_inode_page(dn);
 	}
@@ -291,7 +321,7 @@
 	}
 }
 
-static int f2fs_getattr(struct vfsmount *mnt,
+int f2fs_getattr(struct vfsmount *mnt,
 			 struct dentry *dentry, struct kstat *stat)
 {
 	struct inode *inode = dentry->d_inode;
@@ -387,7 +417,7 @@
 	f2fs_balance_fs(sbi);
 
 	ilock = mutex_lock_op(sbi);
-	page = get_new_data_page(inode, index, false);
+	page = get_new_data_page(inode, NULL, index, false);
 	mutex_unlock_op(sbi, ilock);
 
 	if (!IS_ERR(page)) {
@@ -575,10 +605,10 @@
 	int ret;
 
 	switch (cmd) {
-	case FS_IOC_GETFLAGS:
+	case F2FS_IOC_GETFLAGS:
 		flags = fi->i_flags & FS_FL_USER_VISIBLE;
 		return put_user(flags, (int __user *) arg);
-	case FS_IOC_SETFLAGS:
+	case F2FS_IOC_SETFLAGS:
 	{
 		unsigned int oldflags;
 

diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 1496159..35f9b1a 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c

@@ -76,7 +76,9 @@
 		else
 			wait_ms = increase_sleep_time(wait_ms);
 
+#ifdef CONFIG_F2FS_STAT_FS
 		sbi->bg_gc++;
+#endif
 
 		/* if return value is not zero, no victim was selected */
 		if (f2fs_gc(sbi))
@@ -89,23 +91,28 @@
 {
 	struct f2fs_gc_kthread *gc_th;
 	dev_t dev = sbi->sb->s_bdev->bd_dev;
+	int err = 0;
 
 	if (!test_opt(sbi, BG_GC))
-		return 0;
+		goto out;
 	gc_th = kmalloc(sizeof(struct f2fs_gc_kthread), GFP_KERNEL);
-	if (!gc_th)
-		return -ENOMEM;
+	if (!gc_th) {
+		err = -ENOMEM;
+		goto out;
+	}
 
 	sbi->gc_thread = gc_th;
 	init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head);
 	sbi->gc_thread->f2fs_gc_task = kthread_run(gc_thread_func, sbi,
 			"f2fs_gc-%u:%u", MAJOR(dev), MINOR(dev));
 	if (IS_ERR(gc_th->f2fs_gc_task)) {
+		err = PTR_ERR(gc_th->f2fs_gc_task);
 		kfree(gc_th);
 		sbi->gc_thread = NULL;
-		return -ENOMEM;
 	}
-	return 0;
+
+out:
+	return err;
 }
 
 void stop_gc_thread(struct f2fs_sb_info *sbi)
@@ -234,14 +241,14 @@
 {
 	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
 	struct victim_sel_policy p;
-	unsigned int secno;
+	unsigned int secno, max_cost;
 	int nsearched = 0;
 
 	p.alloc_mode = alloc_mode;
 	select_policy(sbi, gc_type, type, &p);
 
 	p.min_segno = NULL_SEGNO;
-	p.min_cost = get_max_cost(sbi, &p);
+	p.min_cost = max_cost = get_max_cost(sbi, &p);
 
 	mutex_lock(&dirty_i->seglist_lock);
 
@@ -280,7 +287,7 @@
 			p.min_cost = cost;
 		}
 
-		if (cost == get_max_cost(sbi, &p))
+		if (cost == max_cost)
 			continue;
 
 		if (nsearched++ >= MAX_VICTIM_SEARCH) {
@@ -288,8 +295,8 @@
 			break;
 		}
 	}
-got_it:
 	if (p.min_segno != NULL_SEGNO) {
+got_it:
 		if (p.alloc_mode == LFS) {
 			secno = GET_SECNO(sbi, p.min_segno);
 			if (gc_type == FG_GC)
@@ -314,28 +321,21 @@
 
 static struct inode *find_gc_inode(nid_t ino, struct list_head *ilist)
 {
-	struct list_head *this;
 	struct inode_entry *ie;
 
-	list_for_each(this, ilist) {
-		ie = list_entry(this, struct inode_entry, list);
+	list_for_each_entry(ie, ilist, list)
 		if (ie->inode->i_ino == ino)
 			return ie->inode;
-	}
 	return NULL;
 }
 
 static void add_gc_inode(struct inode *inode, struct list_head *ilist)
 {
-	struct list_head *this;
-	struct inode_entry *new_ie, *ie;
+	struct inode_entry *new_ie;
 
-	list_for_each(this, ilist) {
-		ie = list_entry(this, struct inode_entry, list);
-		if (ie->inode == inode) {
-			iput(inode);
-			return;
-		}
+	if (inode == find_gc_inode(inode->i_ino, ilist)) {
+		iput(inode);
+		return;
 	}
 repeat:
 	new_ie = kmem_cache_alloc(winode_slab, GFP_NOFS);

diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 91ac7f9..2b2d45d1 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c

@@ -109,12 +109,6 @@
 	ret = do_read_inode(inode);
 	if (ret)
 		goto bad_inode;
-
-	if (!sbi->por_doing && inode->i_nlink == 0) {
-		ret = -ENOENT;
-		goto bad_inode;
-	}
-
 make_now:
 	if (ino == F2FS_NODE_INO(sbi)) {
 		inode->i_mapping->a_ops = &f2fs_node_aops;
@@ -130,8 +124,7 @@
 		inode->i_op = &f2fs_dir_inode_operations;
 		inode->i_fop = &f2fs_dir_operations;
 		inode->i_mapping->a_ops = &f2fs_dblock_aops;
-		mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER_MOVABLE |
-				__GFP_ZERO);
+		mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO);
 	} else if (S_ISLNK(inode->i_mode)) {
 		inode->i_op = &f2fs_symlink_inode_operations;
 		inode->i_mapping->a_ops = &f2fs_dblock_aops;
@@ -199,6 +192,7 @@
 
 	set_cold_node(inode, node_page);
 	set_page_dirty(node_page);
+	clear_inode_flag(F2FS_I(inode), FI_DIRTY_INODE);
 }
 
 int update_inode_page(struct inode *inode)
@@ -224,6 +218,9 @@
 			inode->i_ino == F2FS_META_INO(sbi))
 		return 0;
 
+	if (!is_inode_flag_set(F2FS_I(inode), FI_DIRTY_INODE))
+		return 0;
+
 	if (wbc)
 		f2fs_balance_fs(sbi);
 

diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 47abc97..64c0716 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c

@@ -112,7 +112,7 @@
 	int count = le32_to_cpu(sbi->raw_super->extension_count);
 	for (i = 0; i < count; i++) {
 		if (is_multimedia_file(name, extlist[i])) {
-			set_cold_file(inode);
+			file_set_cold(inode);
 			break;
 		}
 	}
@@ -149,8 +149,7 @@
 
 	alloc_nid_done(sbi, ino);
 
-	if (!sbi->por_doing)
-		d_instantiate(dentry, inode);
+	d_instantiate(dentry, inode);
 	unlock_new_inode(inode);
 	return 0;
 out:
@@ -173,7 +172,7 @@
 	f2fs_balance_fs(sbi);
 
 	inode->i_ctime = CURRENT_TIME;
-	atomic_inc(&inode->i_count);
+	ihold(inode);
 
 	set_inode_flag(F2FS_I(inode), FI_INC_LINK);
 	ilock = mutex_lock_op(sbi);
@@ -182,17 +181,10 @@
 	if (err)
 		goto out;
 
-	/*
-	 * This file should be checkpointed during fsync.
-	 * We lost i_pino from now on.
-	 */
-	set_cp_file(inode);
-
 	d_instantiate(dentry, inode);
 	return 0;
 out:
 	clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
-	make_bad_inode(inode);
 	iput(inode);
 	return err;
 }
@@ -498,6 +490,7 @@
 	.rmdir		= f2fs_rmdir,
 	.mknod		= f2fs_mknod,
 	.rename		= f2fs_rename,
+	.getattr	= f2fs_getattr,
 	.setattr	= f2fs_setattr,
 	.get_acl	= f2fs_get_acl,
 #ifdef CONFIG_F2FS_FS_XATTR
@@ -512,6 +505,7 @@
 	.readlink       = generic_readlink,
 	.follow_link    = page_follow_link_light,
 	.put_link       = page_put_link,
+	.getattr	= f2fs_getattr,
 	.setattr	= f2fs_setattr,
 #ifdef CONFIG_F2FS_FS_XATTR
 	.setxattr	= generic_setxattr,
@@ -522,6 +516,7 @@
 };
 
 const struct inode_operations f2fs_special_inode_operations = {
+	.getattr	= f2fs_getattr,
 	.setattr        = f2fs_setattr,
 	.get_acl	= f2fs_get_acl,
 #ifdef CONFIG_F2FS_FS_XATTR

diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 3df43b4..b418aee 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c

@@ -408,10 +408,13 @@
 	level = get_node_path(index, offset, noffset);
 
 	nids[0] = dn->inode->i_ino;
-	npage[0] = get_node_page(sbi, nids[0]);
-	if (IS_ERR(npage[0]))
-		return PTR_ERR(npage[0]);
+	npage[0] = dn->inode_page;
 
+	if (!npage[0]) {
+		npage[0] = get_node_page(sbi, nids[0]);
+		if (IS_ERR(npage[0]))
+			return PTR_ERR(npage[0]);
+	}
 	parent = npage[0];
 	if (level != 0)
 		nids[1] = get_nid(parent, offset[0], true);
@@ -430,7 +433,7 @@
 			}
 
 			dn->nid = nids[i];
-			npage[i] = new_node_page(dn, noffset[i]);
+			npage[i] = new_node_page(dn, noffset[i], NULL);
 			if (IS_ERR(npage[i])) {
 				alloc_nid_failed(sbi, nids[i]);
 				err = PTR_ERR(npage[i]);
@@ -803,22 +806,19 @@
 	return 0;
 }
 
-int new_inode_page(struct inode *inode, const struct qstr *name)
+struct page *new_inode_page(struct inode *inode, const struct qstr *name)
 {
-	struct page *page;
 	struct dnode_of_data dn;
 
 	/* allocate inode page for new inode */
 	set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino);
-	page = new_node_page(&dn, 0);
-	init_dent_inode(name, page);
-	if (IS_ERR(page))
-		return PTR_ERR(page);
-	f2fs_put_page(page, 1);
-	return 0;
+
+	/* caller should f2fs_put_page(page, 1); */
+	return new_node_page(&dn, 0, NULL);
 }
 
-struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs)
+struct page *new_node_page(struct dnode_of_data *dn,
+				unsigned int ofs, struct page *ipage)
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
 	struct address_space *mapping = sbi->node_inode->i_mapping;
@@ -851,7 +851,10 @@
 	set_cold_node(dn->inode, page);
 
 	dn->node_page = page;
-	sync_inode_page(dn);
+	if (ipage)
+		update_inode(dn->inode, ipage);
+	else
+		sync_inode_page(dn);
 	set_page_dirty(page);
 	if (ofs == 0)
 		inc_valid_inode_count(sbi);
@@ -1205,7 +1208,8 @@
 	return 0;
 }
 
-static void f2fs_invalidate_node_page(struct page *page, unsigned long offset)
+static void f2fs_invalidate_node_page(struct page *page, unsigned int offset,
+				      unsigned int length)
 {
 	struct inode *inode = page->mapping->host;
 	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
@@ -1492,9 +1496,10 @@
 	new_ni = old_ni;
 	new_ni.ino = ino;
 
+	if (!inc_valid_node_count(sbi, NULL, 1))
+		WARN_ON(1);
 	set_node_addr(sbi, &new_ni, NEW_ADDR);
 	inc_valid_inode_count(sbi);
-
 	f2fs_put_page(ipage, 1);
 	return 0;
 }

diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index 0a2d72f..c65fb4f 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h

@@ -275,25 +275,27 @@
  *  - Mark cold node blocks in their node footer
  *  - Mark cold data pages in page cache
  */
-static inline int is_cold_file(struct inode *inode)
+static inline int is_file(struct inode *inode, int type)
 {
-	return F2FS_I(inode)->i_advise & FADVISE_COLD_BIT;
+	return F2FS_I(inode)->i_advise & type;
 }
 
-static inline void set_cold_file(struct inode *inode)
+static inline void set_file(struct inode *inode, int type)
 {
-	F2FS_I(inode)->i_advise |= FADVISE_COLD_BIT;
+	F2FS_I(inode)->i_advise |= type;
 }
 
-static inline int is_cp_file(struct inode *inode)
+static inline void clear_file(struct inode *inode, int type)
 {
-	return F2FS_I(inode)->i_advise & FADVISE_CP_BIT;
+	F2FS_I(inode)->i_advise &= ~type;
 }
 
-static inline void set_cp_file(struct inode *inode)
-{
-	F2FS_I(inode)->i_advise |= FADVISE_CP_BIT;
-}
+#define file_is_cold(inode)	is_file(inode, FADVISE_COLD_BIT)
+#define file_wrong_pino(inode)	is_file(inode, FADVISE_LOST_PINO_BIT)
+#define file_set_cold(inode)	set_file(inode, FADVISE_COLD_BIT)
+#define file_lost_pino(inode)	set_file(inode, FADVISE_LOST_PINO_BIT)
+#define file_clear_cold(inode)	clear_file(inode, FADVISE_COLD_BIT)
+#define file_got_pino(inode)	clear_file(inode, FADVISE_LOST_PINO_BIT)
 
 static inline int is_cold_data(struct page *page)
 {
@@ -310,29 +312,16 @@
 	ClearPageChecked(page);
 }
 
-static inline int is_cold_node(struct page *page)
+static inline int is_node(struct page *page, int type)
 {
 	void *kaddr = page_address(page);
 	struct f2fs_node *rn = (struct f2fs_node *)kaddr;
-	unsigned int flag = le32_to_cpu(rn->footer.flag);
-	return flag & (0x1 << COLD_BIT_SHIFT);
+	return le32_to_cpu(rn->footer.flag) & (1 << type);
 }
 
-static inline unsigned char is_fsync_dnode(struct page *page)
-{
-	void *kaddr = page_address(page);
-	struct f2fs_node *rn = (struct f2fs_node *)kaddr;
-	unsigned int flag = le32_to_cpu(rn->footer.flag);
-	return flag & (0x1 << FSYNC_BIT_SHIFT);
-}
-
-static inline unsigned char is_dent_dnode(struct page *page)
-{
-	void *kaddr = page_address(page);
-	struct f2fs_node *rn = (struct f2fs_node *)kaddr;
-	unsigned int flag = le32_to_cpu(rn->footer.flag);
-	return flag & (0x1 << DENT_BIT_SHIFT);
-}
+#define is_cold_node(page)	is_node(page, COLD_BIT_SHIFT)
+#define is_fsync_dnode(page)	is_node(page, FSYNC_BIT_SHIFT)
+#define is_dent_dnode(page)	is_node(page, DENT_BIT_SHIFT)
 
 static inline void set_cold_node(struct inode *inode, struct page *page)
 {
@@ -346,26 +335,15 @@
 	rn->footer.flag = cpu_to_le32(flag);
 }
 
-static inline void set_fsync_mark(struct page *page, int mark)
+static inline void set_mark(struct page *page, int mark, int type)
 {
-	void *kaddr = page_address(page);
-	struct f2fs_node *rn = (struct f2fs_node *)kaddr;
+	struct f2fs_node *rn = (struct f2fs_node *)page_address(page);
 	unsigned int flag = le32_to_cpu(rn->footer.flag);
 	if (mark)
-		flag |= (0x1 << FSYNC_BIT_SHIFT);
+		flag |= (0x1 << type);
 	else
-		flag &= ~(0x1 << FSYNC_BIT_SHIFT);
+		flag &= ~(0x1 << type);
 	rn->footer.flag = cpu_to_le32(flag);
 }
-
-static inline void set_dentry_mark(struct page *page, int mark)
-{
-	void *kaddr = page_address(page);
-	struct f2fs_node *rn = (struct f2fs_node *)kaddr;
-	unsigned int flag = le32_to_cpu(rn->footer.flag);
-	if (mark)
-		flag |= (0x1 << DENT_BIT_SHIFT);
-	else
-		flag &= ~(0x1 << DENT_BIT_SHIFT);
-	rn->footer.flag = cpu_to_le32(flag);
-}
+#define set_dentry_mark(page, mark)	set_mark(page, mark, DENT_BIT_SHIFT)
+#define set_fsync_mark(page, mark)	set_mark(page, mark, FSYNC_BIT_SHIFT)

diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 60c8a50..d56d951 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c

@@ -40,36 +40,54 @@
 
 static int recover_dentry(struct page *ipage, struct inode *inode)
 {
-	struct f2fs_node *raw_node = (struct f2fs_node *)kmap(ipage);
+	void *kaddr = page_address(ipage);
+	struct f2fs_node *raw_node = (struct f2fs_node *)kaddr;
 	struct f2fs_inode *raw_inode = &(raw_node->i);
-	struct qstr name;
+	nid_t pino = le32_to_cpu(raw_inode->i_pino);
 	struct f2fs_dir_entry *de;
+	struct qstr name;
 	struct page *page;
-	struct inode *dir;
+	struct inode *dir, *einode;
 	int err = 0;
 
-	if (!is_dent_dnode(ipage))
-		goto out;
-
-	dir = f2fs_iget(inode->i_sb, le32_to_cpu(raw_inode->i_pino));
-	if (IS_ERR(dir)) {
-		err = PTR_ERR(dir);
-		goto out;
+	dir = check_dirty_dir_inode(F2FS_SB(inode->i_sb), pino);
+	if (!dir) {
+		dir = f2fs_iget(inode->i_sb, pino);
+		if (IS_ERR(dir)) {
+			err = PTR_ERR(dir);
+			goto out;
+		}
+		set_inode_flag(F2FS_I(dir), FI_DELAY_IPUT);
+		add_dirty_dir_inode(dir);
 	}
 
 	name.len = le32_to_cpu(raw_inode->i_namelen);
 	name.name = raw_inode->i_name;
-
+retry:
 	de = f2fs_find_entry(dir, &name, &page);
-	if (de) {
+	if (de && inode->i_ino == le32_to_cpu(de->ino)) {
 		kunmap(page);
 		f2fs_put_page(page, 0);
-	} else {
-		err = __f2fs_add_link(dir, &name, inode);
+		goto out;
 	}
-	iput(dir);
+	if (de) {
+		einode = f2fs_iget(inode->i_sb, le32_to_cpu(de->ino));
+		if (IS_ERR(einode)) {
+			WARN_ON(1);
+			if (PTR_ERR(einode) == -ENOENT)
+				err = -EEXIST;
+			goto out;
+		}
+		f2fs_delete_entry(de, page, einode);
+		iput(einode);
+		goto retry;
+	}
+	err = __f2fs_add_link(dir, &name, inode);
 out:
-	kunmap(ipage);
+	f2fs_msg(inode->i_sb, KERN_NOTICE, "recover_inode and its dentry: "
+			"ino = %x, name = %s, dir = %lx, err = %d",
+			ino_of_node(ipage), raw_inode->i_name,
+			IS_ERR(dir) ? 0 : dir->i_ino, err);
 	return err;
 }
 
@@ -79,6 +97,9 @@
 	struct f2fs_node *raw_node = (struct f2fs_node *)kaddr;
 	struct f2fs_inode *raw_inode = &(raw_node->i);
 
+	if (!IS_INODE(node_page))
+		return 0;
+
 	inode->i_mode = le16_to_cpu(raw_inode->i_mode);
 	i_size_write(inode, le64_to_cpu(raw_inode->i_size));
 	inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime);
@@ -88,7 +109,12 @@
 	inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec);
 	inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
 
-	return recover_dentry(node_page, inode);
+	if (is_dent_dnode(node_page))
+		return recover_dentry(node_page, inode);
+
+	f2fs_msg(inode->i_sb, KERN_NOTICE, "recover_inode: ino = %x, name = %s",
+			ino_of_node(node_page), raw_inode->i_name);
+	return 0;
 }
 
 static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
@@ -119,14 +145,13 @@
 		lock_page(page);
 
 		if (cp_ver != cpver_of_node(page))
-			goto unlock_out;
+			break;
 
 		if (!is_fsync_dnode(page))
 			goto next;
 
 		entry = get_fsync_inode(head, ino_of_node(page));
 		if (entry) {
-			entry->blkaddr = blkaddr;
 			if (IS_INODE(page) && is_dent_dnode(page))
 				set_inode_flag(F2FS_I(entry->inode),
 							FI_INC_LINK);
@@ -134,48 +159,40 @@
 			if (IS_INODE(page) && is_dent_dnode(page)) {
 				err = recover_inode_page(sbi, page);
 				if (err)
-					goto unlock_out;
+					break;
 			}
 
 			/* add this fsync inode to the list */
 			entry = kmem_cache_alloc(fsync_entry_slab, GFP_NOFS);
 			if (!entry) {
 				err = -ENOMEM;
-				goto unlock_out;
+				break;
 			}
 
 			entry->inode = f2fs_iget(sbi->sb, ino_of_node(page));
 			if (IS_ERR(entry->inode)) {
 				err = PTR_ERR(entry->inode);
 				kmem_cache_free(fsync_entry_slab, entry);
-				goto unlock_out;
+				break;
 			}
-
 			list_add_tail(&entry->list, head);
-			entry->blkaddr = blkaddr;
 		}
-		if (IS_INODE(page)) {
-			err = recover_inode(entry->inode, page);
-			if (err == -ENOENT) {
-				goto next;
-			} else if (err) {
-				err = -EINVAL;
-				goto unlock_out;
-			}
-		}
+		entry->blkaddr = blkaddr;
+
+		err = recover_inode(entry->inode, page);
+		if (err && err != -ENOENT)
+			break;
 next:
 		/* check next segment */
 		blkaddr = next_blkaddr_of_node(page);
 	}
-unlock_out:
 	unlock_page(page);
 out:
 	__free_pages(page, 0);
 	return err;
 }
 
-static void destroy_fsync_dnodes(struct f2fs_sb_info *sbi,
-					struct list_head *head)
+static void destroy_fsync_dnodes(struct list_head *head)
 {
 	struct fsync_inode_entry *entry, *tmp;
 
@@ -186,15 +203,15 @@
 	}
 }
 
-static void check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
-						block_t blkaddr)
+static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
+			block_t blkaddr, struct dnode_of_data *dn)
 {
 	struct seg_entry *sentry;
 	unsigned int segno = GET_SEGNO(sbi, blkaddr);
 	unsigned short blkoff = GET_SEGOFF_FROM_SEG0(sbi, blkaddr) &
 					(sbi->blocks_per_seg - 1);
 	struct f2fs_summary sum;
-	nid_t ino;
+	nid_t ino, nid;
 	void *kaddr;
 	struct inode *inode;
 	struct page *node_page;
@@ -203,7 +220,7 @@
 
 	sentry = get_seg_entry(sbi, segno);
 	if (!f2fs_test_bit(blkoff, sentry->cur_valid_map))
-		return;
+		return 0;
 
 	/* Get the previous summary */
 	for (i = CURSEG_WARM_DATA; i <= CURSEG_COLD_DATA; i++) {
@@ -222,20 +239,39 @@
 		f2fs_put_page(sum_page, 1);
 	}
 
+	/* Use the locked dnode page and inode */
+	nid = le32_to_cpu(sum.nid);
+	if (dn->inode->i_ino == nid) {
+		struct dnode_of_data tdn = *dn;
+		tdn.nid = nid;
+		tdn.node_page = dn->inode_page;
+		tdn.ofs_in_node = le16_to_cpu(sum.ofs_in_node);
+		truncate_data_blocks_range(&tdn, 1);
+		return 0;
+	} else if (dn->nid == nid) {
+		struct dnode_of_data tdn = *dn;
+		tdn.ofs_in_node = le16_to_cpu(sum.ofs_in_node);
+		truncate_data_blocks_range(&tdn, 1);
+		return 0;
+	}
+
 	/* Get the node page */
-	node_page = get_node_page(sbi, le32_to_cpu(sum.nid));
+	node_page = get_node_page(sbi, nid);
+	if (IS_ERR(node_page))
+		return PTR_ERR(node_page);
 	bidx = start_bidx_of_node(ofs_of_node(node_page)) +
-				le16_to_cpu(sum.ofs_in_node);
+					le16_to_cpu(sum.ofs_in_node);
 	ino = ino_of_node(node_page);
 	f2fs_put_page(node_page, 1);
 
 	/* Deallocate previous index in the node page */
 	inode = f2fs_iget(sbi->sb, ino);
 	if (IS_ERR(inode))
-		return;
+		return PTR_ERR(inode);
 
 	truncate_hole(inode, bidx, bidx + 1);
 	iput(inode);
+	return 0;
 }
 
 static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
@@ -245,7 +281,7 @@
 	struct dnode_of_data dn;
 	struct f2fs_summary sum;
 	struct node_info ni;
-	int err = 0;
+	int err = 0, recovered = 0;
 	int ilock;
 
 	start = start_bidx_of_node(ofs_of_node(page));
@@ -283,13 +319,16 @@
 			}
 
 			/* Check the previous node page having this index */
-			check_index_in_prev_nodes(sbi, dest);
+			err = check_index_in_prev_nodes(sbi, dest, &dn);
+			if (err)
+				goto err;
 
 			set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version);
 
 			/* write dummy data page */
 			recover_data_page(sbi, NULL, &sum, src, dest);
 			update_extent_cache(dest, &dn);
+			recovered++;
 		}
 		dn.ofs_in_node++;
 	}
@@ -305,9 +344,14 @@
 	set_page_dirty(dn.node_page);
 
 	recover_node_page(sbi, dn.node_page, &sum, &ni, blkaddr);
+err:
 	f2fs_put_dnode(&dn);
 	mutex_unlock_op(sbi, ilock);
-	return 0;
+
+	f2fs_msg(sbi->sb, KERN_NOTICE, "recover_data: ino = %lx, "
+			"recovered_data = %d blocks, err = %d",
+			inode->i_ino, recovered, err);
+	return err;
 }
 
 static int recover_data(struct f2fs_sb_info *sbi,
@@ -340,7 +384,7 @@
 		lock_page(page);
 
 		if (cp_ver != cpver_of_node(page))
-			goto unlock_out;
+			break;
 
 		entry = get_fsync_inode(head, ino_of_node(page));
 		if (!entry)
@@ -348,7 +392,7 @@
 
 		err = do_recover_data(sbi, entry->inode, page, blkaddr);
 		if (err)
-			goto out;
+			break;
 
 		if (entry->blkaddr == blkaddr) {
 			iput(entry->inode);
@@ -359,7 +403,6 @@
 		/* check next segment */
 		blkaddr = next_blkaddr_of_node(page);
 	}
-unlock_out:
 	unlock_page(page);
 out:
 	__free_pages(page, 0);
@@ -382,6 +425,7 @@
 	INIT_LIST_HEAD(&inode_list);
 
 	/* step #1: find fsynced inode numbers */
+	sbi->por_doing = 1;
 	err = find_fsync_dnodes(sbi, &inode_list);
 	if (err)
 		goto out;
@@ -390,13 +434,13 @@
 		goto out;
 
 	/* step #2: recover data */
-	sbi->por_doing = 1;
 	err = recover_data(sbi, &inode_list, CURSEG_WARM_NODE);
-	sbi->por_doing = 0;
 	BUG_ON(!list_empty(&inode_list));
 out:
-	destroy_fsync_dnodes(sbi, &inode_list);
+	destroy_fsync_dnodes(&inode_list);
 	kmem_cache_destroy(fsync_entry_slab);
-	write_checkpoint(sbi, false);
+	sbi->por_doing = 0;
+	if (!err)
+		write_checkpoint(sbi, false);
 	return err;
 }

diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index d8e84e4..a86d125 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c

@@ -94,7 +94,7 @@
  * Adding dirty entry into seglist is not critical operation.
  * If a given segment is one of current working segments, it won't be added.
  */
-void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno)
+static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno)
 {
 	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
 	unsigned short valid_blocks;
@@ -126,17 +126,16 @@
 static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi)
 {
 	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
-	unsigned int segno, offset = 0;
+	unsigned int segno = -1;
 	unsigned int total_segs = TOTAL_SEGS(sbi);
 
 	mutex_lock(&dirty_i->seglist_lock);
 	while (1) {
 		segno = find_next_bit(dirty_i->dirty_segmap[PRE], total_segs,
-				offset);
+				segno + 1);
 		if (segno >= total_segs)
 			break;
 		__set_test_and_free(sbi, segno);
-		offset = segno + 1;
 	}
 	mutex_unlock(&dirty_i->seglist_lock);
 }
@@ -144,17 +143,16 @@
 void clear_prefree_segments(struct f2fs_sb_info *sbi)
 {
 	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
-	unsigned int segno, offset = 0;
+	unsigned int segno = -1;
 	unsigned int total_segs = TOTAL_SEGS(sbi);
 
 	mutex_lock(&dirty_i->seglist_lock);
 	while (1) {
 		segno = find_next_bit(dirty_i->dirty_segmap[PRE], total_segs,
-				offset);
+				segno + 1);
 		if (segno >= total_segs)
 			break;
 
-		offset = segno + 1;
 		if (test_and_clear_bit(segno, dirty_i->dirty_segmap[PRE]))
 			dirty_i->nr_dirty[PRE]--;
 
@@ -257,11 +255,11 @@
  * This function should be resided under the curseg_mutex lock
  */
 static void __add_sum_entry(struct f2fs_sb_info *sbi, int type,
-		struct f2fs_summary *sum, unsigned short offset)
+					struct f2fs_summary *sum)
 {
 	struct curseg_info *curseg = CURSEG_I(sbi, type);
 	void *addr = curseg->sum_blk;
-	addr += offset * sizeof(struct f2fs_summary);
+	addr += curseg->next_blkoff * sizeof(struct f2fs_summary);
 	memcpy(addr, sum, sizeof(struct f2fs_summary));
 	return;
 }
@@ -311,64 +309,14 @@
 	f2fs_put_page(page, 1);
 }
 
-static unsigned int check_prefree_segments(struct f2fs_sb_info *sbi, int type)
-{
-	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
-	unsigned long *prefree_segmap = dirty_i->dirty_segmap[PRE];
-	unsigned int segno;
-	unsigned int ofs = 0;
-
-	/*
-	 * If there is not enough reserved sections,
-	 * we should not reuse prefree segments.
-	 */
-	if (has_not_enough_free_secs(sbi, 0))
-		return NULL_SEGNO;
-
-	/*
-	 * NODE page should not reuse prefree segment,
-	 * since those information is used for SPOR.
-	 */
-	if (IS_NODESEG(type))
-		return NULL_SEGNO;
-next:
-	segno = find_next_bit(prefree_segmap, TOTAL_SEGS(sbi), ofs);
-	ofs += sbi->segs_per_sec;
-
-	if (segno < TOTAL_SEGS(sbi)) {
-		int i;
-
-		/* skip intermediate segments in a section */
-		if (segno % sbi->segs_per_sec)
-			goto next;
-
-		/* skip if the section is currently used */
-		if (sec_usage_check(sbi, GET_SECNO(sbi, segno)))
-			goto next;
-
-		/* skip if whole section is not prefree */
-		for (i = 1; i < sbi->segs_per_sec; i++)
-			if (!test_bit(segno + i, prefree_segmap))
-				goto next;
-
-		/* skip if whole section was not free at the last checkpoint */
-		for (i = 0; i < sbi->segs_per_sec; i++)
-			if (get_seg_entry(sbi, segno + i)->ckpt_valid_blocks)
-				goto next;
-
-		return segno;
-	}
-	return NULL_SEGNO;
-}
-
 static int is_next_segment_free(struct f2fs_sb_info *sbi, int type)
 {
 	struct curseg_info *curseg = CURSEG_I(sbi, type);
-	unsigned int segno = curseg->segno;
+	unsigned int segno = curseg->segno + 1;
 	struct free_segmap_info *free_i = FREE_I(sbi);
 
-	if (segno + 1 < TOTAL_SEGS(sbi) && (segno + 1) % sbi->segs_per_sec)
-		return !test_bit(segno + 1, free_i->free_segmap);
+	if (segno < TOTAL_SEGS(sbi) && segno % sbi->segs_per_sec)
+		return !test_bit(segno, free_i->free_segmap);
 	return 0;
 }
 
@@ -495,7 +443,7 @@
 	int dir = ALLOC_LEFT;
 
 	write_sum_page(sbi, curseg->sum_blk,
-				GET_SUM_BLOCK(sbi, curseg->segno));
+				GET_SUM_BLOCK(sbi, segno));
 	if (type == CURSEG_WARM_DATA || type == CURSEG_COLD_DATA)
 		dir = ALLOC_RIGHT;
 
@@ -599,11 +547,7 @@
 		goto out;
 	}
 
-	curseg->next_segno = check_prefree_segments(sbi, type);
-
-	if (curseg->next_segno != NULL_SEGNO)
-		change_curseg(sbi, type, false);
-	else if (type == CURSEG_WARM_NODE)
+	if (type == CURSEG_WARM_NODE)
 		new_curseg(sbi, type, false);
 	else if (curseg->alloc_type == LFS && is_next_segment_free(sbi, type))
 		new_curseg(sbi, type, false);
@@ -612,7 +556,10 @@
 	else
 		new_curseg(sbi, type, false);
 out:
+#ifdef CONFIG_F2FS_STAT_FS
 	sbi->segment_count[curseg->alloc_type]++;
+#endif
+	return;
 }
 
 void allocate_new_segments(struct f2fs_sb_info *sbi)
@@ -795,7 +742,7 @@
 
 		if (S_ISDIR(inode->i_mode))
 			return CURSEG_HOT_DATA;
-		else if (is_cold_data(page) || is_cold_file(inode))
+		else if (is_cold_data(page) || file_is_cold(inode))
 			return CURSEG_COLD_DATA;
 		else
 			return CURSEG_WARM_DATA;
@@ -844,11 +791,13 @@
 	 * because, this function updates a summary entry in the
 	 * current summary block.
 	 */
-	__add_sum_entry(sbi, type, sum, curseg->next_blkoff);
+	__add_sum_entry(sbi, type, sum);
 
 	mutex_lock(&sit_i->sentry_lock);
 	__refresh_next_blkoff(sbi, curseg);
+#ifdef CONFIG_F2FS_STAT_FS
 	sbi->block_count[curseg->alloc_type]++;
+#endif
 
 	/*
 	 * SIT information should be updated before segment allocation,
@@ -943,7 +892,7 @@
 
 	curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, new_blkaddr) &
 					(sbi->blocks_per_seg - 1);
-	__add_sum_entry(sbi, type, sum, curseg->next_blkoff);
+	__add_sum_entry(sbi, type, sum);
 
 	refresh_sit_entry(sbi, old_blkaddr, new_blkaddr);
 
@@ -980,7 +929,7 @@
 	}
 	curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, new_blkaddr) &
 					(sbi->blocks_per_seg - 1);
-	__add_sum_entry(sbi, type, sum, curseg->next_blkoff);
+	__add_sum_entry(sbi, type, sum);
 
 	/* change the current log to the next block addr in advance */
 	if (next_segno != segno) {
@@ -1579,13 +1528,13 @@
 {
 	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
 	struct free_segmap_info *free_i = FREE_I(sbi);
-	unsigned int segno = 0, offset = 0;
+	unsigned int segno = 0, offset = 0, total_segs = TOTAL_SEGS(sbi);
 	unsigned short valid_blocks;
 
-	while (segno < TOTAL_SEGS(sbi)) {
+	while (1) {
 		/* find dirty segment based on free segmap */
-		segno = find_next_inuse(free_i, TOTAL_SEGS(sbi), offset);
-		if (segno >= TOTAL_SEGS(sbi))
+		segno = find_next_inuse(free_i, total_segs, offset);
+		if (segno >= total_segs)
 			break;
 		offset = segno + 1;
 		valid_blocks = get_valid_blocks(sbi, segno, 0);

diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 8555f7d..75c7dc3 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c

@@ -34,7 +34,7 @@
 static struct kmem_cache *f2fs_inode_cachep;
 
 enum {
-	Opt_gc_background_off,
+	Opt_gc_background,
 	Opt_disable_roll_forward,
 	Opt_discard,
 	Opt_noheap,
@@ -46,7 +46,7 @@
 };
 
 static match_table_t f2fs_tokens = {
-	{Opt_gc_background_off, "background_gc_off"},
+	{Opt_gc_background, "background_gc=%s"},
 	{Opt_disable_roll_forward, "disable_roll_forward"},
 	{Opt_discard, "discard"},
 	{Opt_noheap, "no_heap"},
@@ -76,6 +76,91 @@
 	inode_init_once(&fi->vfs_inode);
 }
 
+static int parse_options(struct super_block *sb, char *options)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(sb);
+	substring_t args[MAX_OPT_ARGS];
+	char *p, *name;
+	int arg = 0;
+
+	if (!options)
+		return 0;
+
+	while ((p = strsep(&options, ",")) != NULL) {
+		int token;
+		if (!*p)
+			continue;
+		/*
+		 * Initialize args struct so we know whether arg was
+		 * found; some options take optional arguments.
+		 */
+		args[0].to = args[0].from = NULL;
+		token = match_token(p, f2fs_tokens, args);
+
+		switch (token) {
+		case Opt_gc_background:
+			name = match_strdup(&args[0]);
+
+			if (!name)
+				return -ENOMEM;
+			if (!strncmp(name, "on", 2))
+				set_opt(sbi, BG_GC);
+			else if (!strncmp(name, "off", 3))
+				clear_opt(sbi, BG_GC);
+			else {
+				kfree(name);
+				return -EINVAL;
+			}
+			kfree(name);
+			break;
+		case Opt_disable_roll_forward:
+			set_opt(sbi, DISABLE_ROLL_FORWARD);
+			break;
+		case Opt_discard:
+			set_opt(sbi, DISCARD);
+			break;
+		case Opt_noheap:
+			set_opt(sbi, NOHEAP);
+			break;
+#ifdef CONFIG_F2FS_FS_XATTR
+		case Opt_nouser_xattr:
+			clear_opt(sbi, XATTR_USER);
+			break;
+#else
+		case Opt_nouser_xattr:
+			f2fs_msg(sb, KERN_INFO,
+				"nouser_xattr options not supported");
+			break;
+#endif
+#ifdef CONFIG_F2FS_FS_POSIX_ACL
+		case Opt_noacl:
+			clear_opt(sbi, POSIX_ACL);
+			break;
+#else
+		case Opt_noacl:
+			f2fs_msg(sb, KERN_INFO, "noacl options not supported");
+			break;
+#endif
+		case Opt_active_logs:
+			if (args->from && match_int(args, &arg))
+				return -EINVAL;
+			if (arg != 2 && arg != 4 && arg != NR_CURSEG_TYPE)
+				return -EINVAL;
+			sbi->active_logs = arg;
+			break;
+		case Opt_disable_ext_identify:
+			set_opt(sbi, DISABLE_EXT_IDENTIFY);
+			break;
+		default:
+			f2fs_msg(sb, KERN_ERR,
+				"Unrecognized mount option \"%s\" or missing value",
+				p);
+			return -EINVAL;
+		}
+	}
+	return 0;
+}
+
 static struct inode *f2fs_alloc_inode(struct super_block *sb)
 {
 	struct f2fs_inode_info *fi;
@@ -112,6 +197,17 @@
 	return generic_drop_inode(inode);
 }
 
+/*
+ * f2fs_dirty_inode() is called from __mark_inode_dirty()
+ *
+ * We should call set_dirty_inode to write the dirty inode through write_inode.
+ */
+static void f2fs_dirty_inode(struct inode *inode, int flags)
+{
+	set_inode_flag(F2FS_I(inode), FI_DIRTY_INODE);
+	return;
+}
+
 static void f2fs_i_callback(struct rcu_head *head)
 {
 	struct inode *inode = container_of(head, struct inode, i_rcu);
@@ -170,7 +266,7 @@
 {
 	int err;
 
-	if (sb->s_flags & MS_RDONLY)
+	if (f2fs_readonly(sb))
 		return 0;
 
 	err = f2fs_sync_fs(sb, 1);
@@ -214,10 +310,10 @@
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(root->d_sb);
 
-	if (test_opt(sbi, BG_GC))
-		seq_puts(seq, ",background_gc_on");
+	if (!(root->d_sb->s_flags & MS_RDONLY) && test_opt(sbi, BG_GC))
+		seq_printf(seq, ",background_gc=%s", "on");
 	else
-		seq_puts(seq, ",background_gc_off");
+		seq_printf(seq, ",background_gc=%s", "off");
 	if (test_opt(sbi, DISABLE_ROLL_FORWARD))
 		seq_puts(seq, ",disable_roll_forward");
 	if (test_opt(sbi, DISCARD))
@@ -244,11 +340,64 @@
 	return 0;
 }
 
+static int f2fs_remount(struct super_block *sb, int *flags, char *data)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(sb);
+	struct f2fs_mount_info org_mount_opt;
+	int err, active_logs;
+
+	/*
+	 * Save the old mount options in case we
+	 * need to restore them.
+	 */
+	org_mount_opt = sbi->mount_opt;
+	active_logs = sbi->active_logs;
+
+	/* parse mount options */
+	err = parse_options(sb, data);
+	if (err)
+		goto restore_opts;
+
+	/*
+	 * Previous and new state of filesystem is RO,
+	 * so no point in checking GC conditions.
+	 */
+	if ((sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY))
+		goto skip;
+
+	/*
+	 * We stop the GC thread if FS is mounted as RO
+	 * or if background_gc = off is passed in mount
+	 * option. Also sync the filesystem.
+	 */
+	if ((*flags & MS_RDONLY) || !test_opt(sbi, BG_GC)) {
+		if (sbi->gc_thread) {
+			stop_gc_thread(sbi);
+			f2fs_sync_fs(sb, 1);
+		}
+	} else if (test_opt(sbi, BG_GC) && !sbi->gc_thread) {
+		err = start_gc_thread(sbi);
+		if (err)
+			goto restore_opts;
+	}
+skip:
+	/* Update the POSIXACL Flag */
+	 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
+		(test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0);
+	return 0;
+
+restore_opts:
+	sbi->mount_opt = org_mount_opt;
+	sbi->active_logs = active_logs;
+	return err;
+}
+
 static struct super_operations f2fs_sops = {
 	.alloc_inode	= f2fs_alloc_inode,
 	.drop_inode	= f2fs_drop_inode,
 	.destroy_inode	= f2fs_destroy_inode,
 	.write_inode	= f2fs_write_inode,
+	.dirty_inode	= f2fs_dirty_inode,
 	.show_options	= f2fs_show_options,
 	.evict_inode	= f2fs_evict_inode,
 	.put_super	= f2fs_put_super,
@@ -256,6 +405,7 @@
 	.freeze_fs	= f2fs_freeze,
 	.unfreeze_fs	= f2fs_unfreeze,
 	.statfs		= f2fs_statfs,
+	.remount_fs	= f2fs_remount,
 };
 
 static struct inode *f2fs_nfs_get_inode(struct super_block *sb,
@@ -303,79 +453,6 @@
 	.get_parent = f2fs_get_parent,
 };
 
-static int parse_options(struct super_block *sb, struct f2fs_sb_info *sbi,
-				char *options)
-{
-	substring_t args[MAX_OPT_ARGS];
-	char *p;
-	int arg = 0;
-
-	if (!options)
-		return 0;
-
-	while ((p = strsep(&options, ",")) != NULL) {
-		int token;
-		if (!*p)
-			continue;
-		/*
-		 * Initialize args struct so we know whether arg was
-		 * found; some options take optional arguments.
-		 */
-		args[0].to = args[0].from = NULL;
-		token = match_token(p, f2fs_tokens, args);
-
-		switch (token) {
-		case Opt_gc_background_off:
-			clear_opt(sbi, BG_GC);
-			break;
-		case Opt_disable_roll_forward:
-			set_opt(sbi, DISABLE_ROLL_FORWARD);
-			break;
-		case Opt_discard:
-			set_opt(sbi, DISCARD);
-			break;
-		case Opt_noheap:
-			set_opt(sbi, NOHEAP);
-			break;
-#ifdef CONFIG_F2FS_FS_XATTR
-		case Opt_nouser_xattr:
-			clear_opt(sbi, XATTR_USER);
-			break;
-#else
-		case Opt_nouser_xattr:
-			f2fs_msg(sb, KERN_INFO,
-				"nouser_xattr options not supported");
-			break;
-#endif
-#ifdef CONFIG_F2FS_FS_POSIX_ACL
-		case Opt_noacl:
-			clear_opt(sbi, POSIX_ACL);
-			break;
-#else
-		case Opt_noacl:
-			f2fs_msg(sb, KERN_INFO, "noacl options not supported");
-			break;
-#endif
-		case Opt_active_logs:
-			if (args->from && match_int(args, &arg))
-				return -EINVAL;
-			if (arg != 2 && arg != 4 && arg != NR_CURSEG_TYPE)
-				return -EINVAL;
-			sbi->active_logs = arg;
-			break;
-		case Opt_disable_ext_identify:
-			set_opt(sbi, DISABLE_EXT_IDENTIFY);
-			break;
-		default:
-			f2fs_msg(sb, KERN_ERR,
-				"Unrecognized mount option \"%s\" or missing value",
-				p);
-			return -EINVAL;
-		}
-	}
-	return 0;
-}
-
 static loff_t max_file_size(unsigned bits)
 {
 	loff_t result = ADDRS_PER_INODE;
@@ -541,6 +618,7 @@
 		if (err)
 			goto free_sb_buf;
 	}
+	sb->s_fs_info = sbi;
 	/* init some FS parameters */
 	sbi->active_logs = NR_CURSEG_TYPE;
 
@@ -553,7 +631,7 @@
 	set_opt(sbi, POSIX_ACL);
 #endif
 	/* parse mount options */
-	err = parse_options(sb, sbi, (char *)data);
+	err = parse_options(sb, (char *)data);
 	if (err)
 		goto free_sb_buf;
 
@@ -565,7 +643,6 @@
 	sb->s_xattr = f2fs_xattr_handlers;
 	sb->s_export_op = &f2fs_export_ops;
 	sb->s_magic = F2FS_SUPER_MAGIC;
-	sb->s_fs_info = sbi;
 	sb->s_time_gran = 1;
 	sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
 		(test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0);
@@ -674,10 +751,16 @@
 				"Cannot recover all fsync data errno=%ld", err);
 	}
 
-	/* After POR, we can run background GC thread */
-	err = start_gc_thread(sbi);
-	if (err)
-		goto fail;
+	/*
+	 * If filesystem is not mounted as read-only then
+	 * do start the gc_thread.
+	 */
+	if (!(sb->s_flags & MS_RDONLY)) {
+		/* After POR, we can run background GC thread.*/
+		err = start_gc_thread(sbi);
+		if (err)
+			goto fail;
+	}
 
 	err = f2fs_build_stats(sbi);
 	if (err)

diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 0b02dce..3ab07ec 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c

@@ -20,6 +20,7 @@
  */
 #include <linux/rwsem.h>
 #include <linux/f2fs_fs.h>
+#include <linux/security.h>
 #include "f2fs.h"
 #include "xattr.h"
 
@@ -43,6 +44,10 @@
 		prefix = XATTR_TRUSTED_PREFIX;
 		prefix_len = XATTR_TRUSTED_PREFIX_LEN;
 		break;
+	case F2FS_XATTR_INDEX_SECURITY:
+		prefix = XATTR_SECURITY_PREFIX;
+		prefix_len = XATTR_SECURITY_PREFIX_LEN;
+		break;
 	default:
 		return -EINVAL;
 	}
@@ -50,7 +55,7 @@
 	total_len = prefix_len + name_len + 1;
 	if (list && total_len <= list_size) {
 		memcpy(list, prefix, prefix_len);
-		memcpy(list+prefix_len, name, name_len);
+		memcpy(list + prefix_len, name, name_len);
 		list[prefix_len + name_len] = '\0';
 	}
 	return total_len;
@@ -70,13 +75,14 @@
 		if (!capable(CAP_SYS_ADMIN))
 			return -EPERM;
 		break;
+	case F2FS_XATTR_INDEX_SECURITY:
+		break;
 	default:
 		return -EINVAL;
 	}
 	if (strcmp(name, "") == 0)
 		return -EINVAL;
-	return f2fs_getxattr(dentry->d_inode, type, name,
-			buffer, size);
+	return f2fs_getxattr(dentry->d_inode, type, name, buffer, size);
 }
 
 static int f2fs_xattr_generic_set(struct dentry *dentry, const char *name,
@@ -93,13 +99,15 @@
 		if (!capable(CAP_SYS_ADMIN))
 			return -EPERM;
 		break;
+	case F2FS_XATTR_INDEX_SECURITY:
+		break;
 	default:
 		return -EINVAL;
 	}
 	if (strcmp(name, "") == 0)
 		return -EINVAL;
 
-	return f2fs_setxattr(dentry->d_inode, type, name, value, size);
+	return f2fs_setxattr(dentry->d_inode, type, name, value, size, NULL);
 }
 
 static size_t f2fs_xattr_advise_list(struct dentry *dentry, char *list,
@@ -145,6 +153,31 @@
 	return 0;
 }
 
+#ifdef CONFIG_F2FS_FS_SECURITY
+static int f2fs_initxattrs(struct inode *inode, const struct xattr *xattr_array,
+		void *page)
+{
+	const struct xattr *xattr;
+	int err = 0;
+
+	for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+		err = f2fs_setxattr(inode, F2FS_XATTR_INDEX_SECURITY,
+				xattr->name, xattr->value,
+				xattr->value_len, (struct page *)page);
+		if (err < 0)
+			break;
+	}
+	return err;
+}
+
+int f2fs_init_security(struct inode *inode, struct inode *dir,
+				const struct qstr *qstr, struct page *ipage)
+{
+	return security_inode_init_security(inode, dir, qstr,
+				&f2fs_initxattrs, ipage);
+}
+#endif
+
 const struct xattr_handler f2fs_xattr_user_handler = {
 	.prefix	= XATTR_USER_PREFIX,
 	.flags	= F2FS_XATTR_INDEX_USER,
@@ -169,6 +202,14 @@
 	.set    = f2fs_xattr_advise_set,
 };
 
+const struct xattr_handler f2fs_xattr_security_handler = {
+	.prefix	= XATTR_SECURITY_PREFIX,
+	.flags	= F2FS_XATTR_INDEX_SECURITY,
+	.list	= f2fs_xattr_generic_list,
+	.get	= f2fs_xattr_generic_get,
+	.set	= f2fs_xattr_generic_set,
+};
+
 static const struct xattr_handler *f2fs_xattr_handler_map[] = {
 	[F2FS_XATTR_INDEX_USER] = &f2fs_xattr_user_handler,
 #ifdef CONFIG_F2FS_FS_POSIX_ACL
@@ -176,6 +217,9 @@
 	[F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT] = &f2fs_xattr_acl_default_handler,
 #endif
 	[F2FS_XATTR_INDEX_TRUSTED] = &f2fs_xattr_trusted_handler,
+#ifdef CONFIG_F2FS_FS_SECURITY
+	[F2FS_XATTR_INDEX_SECURITY] = &f2fs_xattr_security_handler,
+#endif
 	[F2FS_XATTR_INDEX_ADVISE] = &f2fs_xattr_advise_handler,
 };
 
@@ -186,6 +230,9 @@
 	&f2fs_xattr_acl_default_handler,
 #endif
 	&f2fs_xattr_trusted_handler,
+#ifdef CONFIG_F2FS_FS_SECURITY
+	&f2fs_xattr_security_handler,
+#endif
 	&f2fs_xattr_advise_handler,
 	NULL,
 };
@@ -218,6 +265,8 @@
 		return -ENODATA;
 
 	page = get_node_page(sbi, fi->i_xattr_nid);
+	if (IS_ERR(page))
+		return PTR_ERR(page);
 	base_addr = page_address(page);
 
 	list_for_each_xattr(entry, base_addr) {
@@ -268,6 +317,8 @@
 		return 0;
 
 	page = get_node_page(sbi, fi->i_xattr_nid);
+	if (IS_ERR(page))
+		return PTR_ERR(page);
 	base_addr = page_address(page);
 
 	list_for_each_xattr(entry, base_addr) {
@@ -296,7 +347,7 @@
 }
 
 int f2fs_setxattr(struct inode *inode, int name_index, const char *name,
-					const void *value, size_t value_len)
+			const void *value, size_t value_len, struct page *ipage)
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
 	struct f2fs_inode_info *fi = F2FS_I(inode);
@@ -335,7 +386,7 @@
 		set_new_dnode(&dn, inode, NULL, NULL, fi->i_xattr_nid);
 		mark_inode_dirty(inode);
 
-		page = new_node_page(&dn, XATTR_NODE_OFFSET);
+		page = new_node_page(&dn, XATTR_NODE_OFFSET, ipage);
 		if (IS_ERR(page)) {
 			alloc_nid_failed(sbi, fi->i_xattr_nid);
 			fi->i_xattr_nid = 0;
@@ -435,7 +486,10 @@
 		inode->i_ctime = CURRENT_TIME;
 		clear_inode_flag(fi, FI_ACL_MODE);
 	}
-	update_inode_page(inode);
+	if (ipage)
+		update_inode(inode, ipage);
+	else
+		update_inode_page(inode);
 	mutex_unlock_op(sbi, ilock);
 
 	return 0;

diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h
index 49c9558..3c0817b 100644
--- a/fs/f2fs/xattr.h
+++ b/fs/f2fs/xattr.h

@@ -112,21 +112,19 @@
 extern const struct xattr_handler f2fs_xattr_acl_access_handler;
 extern const struct xattr_handler f2fs_xattr_acl_default_handler;
 extern const struct xattr_handler f2fs_xattr_advise_handler;
+extern const struct xattr_handler f2fs_xattr_security_handler;
 
 extern const struct xattr_handler *f2fs_xattr_handlers[];
 
-extern int f2fs_setxattr(struct inode *inode, int name_index, const char *name,
-		const void *value, size_t value_len);
-extern int f2fs_getxattr(struct inode *inode, int name_index, const char *name,
-		void *buffer, size_t buffer_size);
-extern ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer,
-		size_t buffer_size);
-
+extern int f2fs_setxattr(struct inode *, int, const char *,
+				const void *, size_t, struct page *);
+extern int f2fs_getxattr(struct inode *, int, const char *, void *, size_t);
+extern ssize_t f2fs_listxattr(struct dentry *, char *, size_t);
 #else
 
 #define f2fs_xattr_handlers	NULL
 static inline int f2fs_setxattr(struct inode *inode, int name_index,
-	const char *name, const void *value, size_t value_len)
+		const char *name, const void *value, size_t value_len)
 {
 	return -EOPNOTSUPP;
 }
@@ -142,4 +140,14 @@
 }
 #endif
 
+#ifdef CONFIG_F2FS_FS_SECURITY
+extern int f2fs_init_security(struct inode *, struct inode *,
+				const struct qstr *, struct page *);
+#else
+static inline int f2fs_init_security(struct inode *inode, struct inode *dir,
+				const struct qstr *qstr, struct page *ipage)
+{
+	return 0;
+}
+#endif
 #endif /* __F2FS_XATTR_H__ */

diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 7a6f02c..3963ede 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c

@@ -543,6 +543,7 @@
 EXPORT_SYMBOL_GPL(fat_search_long);
 
 struct fat_ioctl_filldir_callback {
+	struct dir_context ctx;
 	void __user *dirent;
 	int result;
 	/* for dir ioctl */
@@ -552,8 +553,9 @@
 	int short_len;
 };
 
-static int __fat_readdir(struct inode *inode, struct file *filp, void *dirent,
-			 filldir_t filldir, int short_only, int both)
+static int __fat_readdir(struct inode *inode, struct file *file,
+			 struct dir_context *ctx, int short_only,
+			 struct fat_ioctl_filldir_callback *both)
 {
 	struct super_block *sb = inode->i_sb;
 	struct msdos_sb_info *sbi = MSDOS_SB(sb);
@@ -564,27 +566,20 @@
 	unsigned char bufname[FAT_MAX_SHORT_SIZE];
 	int isvfat = sbi->options.isvfat;
 	const char *fill_name = NULL;
-	unsigned long inum;
-	unsigned long lpos, dummy, *furrfu = &lpos;
+	int fake_offset = 0;
 	loff_t cpos;
 	int short_len = 0, fill_len = 0;
 	int ret = 0;
 
 	mutex_lock(&sbi->s_lock);
 
-	cpos = filp->f_pos;
+	cpos = ctx->pos;
 	/* Fake . and .. for the root directory. */
 	if (inode->i_ino == MSDOS_ROOT_INO) {
-		while (cpos < 2) {
-			if (filldir(dirent, "..", cpos+1, cpos,
-				    MSDOS_ROOT_INO, DT_DIR) < 0)
-				goto out;
-			cpos++;
-			filp->f_pos++;
-		}
-		if (cpos == 2) {
-			dummy = 2;
-			furrfu = &dummy;
+		if (!dir_emit_dots(file, ctx))
+			goto out;
+		if (ctx->pos == 2) {
+			fake_offset = 1;
 			cpos = 0;
 		}
 	}
@@ -619,7 +614,7 @@
 		int status = fat_parse_long(inode, &cpos, &bh, &de,
 					    &unicode, &nr_slots);
 		if (status < 0) {
-			filp->f_pos = cpos;
+			ctx->pos = cpos;
 			ret = status;
 			goto out;
 		} else if (status == PARSE_INVALID)
@@ -639,6 +634,19 @@
 			/* !both && !short_only, so we don't need shortname. */
 			if (!both)
 				goto start_filldir;
+
+			short_len = fat_parse_short(sb, de, bufname,
+						    sbi->options.dotsOK);
+			if (short_len == 0)
+				goto record_end;
+			/* hack for fat_ioctl_filldir() */
+			both->longname = fill_name;
+			both->long_len = fill_len;
+			both->shortname = bufname;
+			both->short_len = short_len;
+			fill_name = NULL;
+			fill_len = 0;
+			goto start_filldir;
 		}
 	}
 
@@ -646,28 +654,21 @@
 	if (short_len == 0)
 		goto record_end;
 
-	if (nr_slots) {
-		/* hack for fat_ioctl_filldir() */
-		struct fat_ioctl_filldir_callback *p = dirent;
-
-		p->longname = fill_name;
-		p->long_len = fill_len;
-		p->shortname = bufname;
-		p->short_len = short_len;
-		fill_name = NULL;
-		fill_len = 0;
-	} else {
-		fill_name = bufname;
-		fill_len = short_len;
-	}
+	fill_name = bufname;
+	fill_len = short_len;
 
 start_filldir:
-	lpos = cpos - (nr_slots + 1) * sizeof(struct msdos_dir_entry);
-	if (!memcmp(de->name, MSDOS_DOT, MSDOS_NAME))
-		inum = inode->i_ino;
-	else if (!memcmp(de->name, MSDOS_DOTDOT, MSDOS_NAME)) {
-		inum = parent_ino(filp->f_path.dentry);
+	if (!fake_offset)
+		ctx->pos = cpos - (nr_slots + 1) * sizeof(struct msdos_dir_entry);
+
+	if (!memcmp(de->name, MSDOS_DOT, MSDOS_NAME)) {
+		if (!dir_emit_dot(file, ctx))
+			goto fill_failed;
+	} else if (!memcmp(de->name, MSDOS_DOTDOT, MSDOS_NAME)) {
+		if (!dir_emit_dotdot(file, ctx))
+			goto fill_failed;
 	} else {
+		unsigned long inum;
 		loff_t i_pos = fat_make_i_pos(sb, bh, de);
 		struct inode *tmp = fat_iget(sb, i_pos);
 		if (tmp) {
@@ -675,18 +676,17 @@
 			iput(tmp);
 		} else
 			inum = iunique(sb, MSDOS_ROOT_INO);
+		if (!dir_emit(ctx, fill_name, fill_len, inum,
+			    (de->attr & ATTR_DIR) ? DT_DIR : DT_REG))
+			goto fill_failed;
 	}
 
-	if (filldir(dirent, fill_name, fill_len, *furrfu, inum,
-		    (de->attr & ATTR_DIR) ? DT_DIR : DT_REG) < 0)
-		goto fill_failed;
-
 record_end:
-	furrfu = &lpos;
-	filp->f_pos = cpos;
+	fake_offset = 0;
+	ctx->pos = cpos;
 	goto get_new;
 end_of_dir:
-	filp->f_pos = cpos;
+	ctx->pos = cpos;
 fill_failed:
 	brelse(bh);
 	if (unicode)
@@ -696,10 +696,9 @@
 	return ret;
 }
 
-static int fat_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int fat_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct inode *inode = file_inode(filp);
-	return __fat_readdir(inode, filp, dirent, filldir, 0, 0);
+	return __fat_readdir(file_inode(file), file, ctx, 0, NULL);
 }
 
 #define FAT_IOCTL_FILLDIR_FUNC(func, dirent_type)			   \
@@ -755,20 +754,25 @@
 
 FAT_IOCTL_FILLDIR_FUNC(fat_ioctl_filldir, __fat_dirent)
 
-static int fat_ioctl_readdir(struct inode *inode, struct file *filp,
+static int fat_ioctl_readdir(struct inode *inode, struct file *file,
 			     void __user *dirent, filldir_t filldir,
 			     int short_only, int both)
 {
-	struct fat_ioctl_filldir_callback buf;
+	struct fat_ioctl_filldir_callback buf = {
+		.ctx.actor = filldir,
+		.dirent = dirent
+	};
 	int ret;
 
 	buf.dirent = dirent;
 	buf.result = 0;
 	mutex_lock(&inode->i_mutex);
+	buf.ctx.pos = file->f_pos;
 	ret = -ENOENT;
 	if (!IS_DEADDIR(inode)) {
-		ret = __fat_readdir(inode, filp, &buf, filldir,
-				    short_only, both);
+		ret = __fat_readdir(inode, file, &buf.ctx,
+				    short_only, both ? &buf : NULL);
+		file->f_pos = buf.ctx.pos;
 	}
 	mutex_unlock(&inode->i_mutex);
 	if (ret >= 0)
@@ -854,7 +858,7 @@
 const struct file_operations fat_dir_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
-	.readdir	= fat_readdir,
+	.iterate	= fat_readdir,
 	.unlocked_ioctl	= fat_dir_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= fat_compat_dir_ioctl,

diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c
index 664b07a..25d4099 100644
--- a/fs/freevxfs/vxfs_lookup.c
+++ b/fs/freevxfs/vxfs_lookup.c

@@ -49,7 +49,7 @@
 
 
 static struct dentry *	vxfs_lookup(struct inode *, struct dentry *, unsigned int);
-static int		vxfs_readdir(struct file *, void *, filldir_t);
+static int		vxfs_readdir(struct file *, struct dir_context *);
 
 const struct inode_operations vxfs_dir_inode_ops = {
 	.lookup =		vxfs_lookup,
@@ -58,7 +58,7 @@
 const struct file_operations vxfs_dir_operations = {
 	.llseek =		generic_file_llseek,
 	.read =			generic_read_dir,
-	.readdir =		vxfs_readdir,
+	.iterate =		vxfs_readdir,
 };
 
  
@@ -235,7 +235,7 @@
  *   Zero.
  */
 static int
-vxfs_readdir(struct file *fp, void *retp, filldir_t filler)
+vxfs_readdir(struct file *fp, struct dir_context *ctx)
 {
 	struct inode		*ip = file_inode(fp);
 	struct super_block	*sbp = ip->i_sb;
@@ -243,20 +243,17 @@
 	u_long			page, npages, block, pblocks, nblocks, offset;
 	loff_t			pos;
 
-	switch ((long)fp->f_pos) {
-	case 0:
-		if (filler(retp, ".", 1, fp->f_pos, ip->i_ino, DT_DIR) < 0)
-			goto out;
-		fp->f_pos++;
-		/* fallthrough */
-	case 1:
-		if (filler(retp, "..", 2, fp->f_pos, VXFS_INO(ip)->vii_dotdot, DT_DIR) < 0)
-			goto out;
-		fp->f_pos++;
-		/* fallthrough */
+	if (ctx->pos == 0) {
+		if (!dir_emit_dot(fp, ctx))
+			return 0;
+		ctx->pos = 1;
 	}
-
-	pos = fp->f_pos - 2;
+	if (ctx->pos == 1) {
+		if (!dir_emit(ctx, "..", 2, VXFS_INO(ip)->vii_dotdot, DT_DIR))
+			return 0;
+		ctx->pos = 2;
+	}
+	pos = ctx->pos - 2;
 	
 	if (pos > VXFS_DIRROUND(ip->i_size))
 		return 0;
@@ -270,16 +267,16 @@
 	block = (u_long)(pos >> sbp->s_blocksize_bits) % pblocks;
 
 	for (; page < npages; page++, block = 0) {
-		caddr_t			kaddr;
+		char			*kaddr;
 		struct page		*pp;
 
 		pp = vxfs_get_page(ip->i_mapping, page);
 		if (IS_ERR(pp))
 			continue;
-		kaddr = (caddr_t)page_address(pp);
+		kaddr = (char *)page_address(pp);
 
 		for (; block <= nblocks && block <= pblocks; block++) {
-			caddr_t			baddr, limit;
+			char			*baddr, *limit;
 			struct vxfs_dirblk	*dbp;
 			struct vxfs_direct	*de;
 
@@ -292,21 +289,18 @@
 				 (kaddr + offset) :
 				 (baddr + VXFS_DIRBLKOV(dbp)));
 
-			for (; (caddr_t)de <= limit; de = vxfs_next_entry(de)) {
-				int	over;
-
+			for (; (char *)de <= limit; de = vxfs_next_entry(de)) {
 				if (!de->d_reclen)
 					break;
 				if (!de->d_ino)
 					continue;
 
-				offset = (caddr_t)de - kaddr;
-				over = filler(retp, de->d_name, de->d_namelen,
-					((page << PAGE_CACHE_SHIFT) | offset) + 2,
-					de->d_ino, DT_UNKNOWN);
-				if (over) {
+				offset = (char *)de - kaddr;
+				ctx->pos = ((page << PAGE_CACHE_SHIFT) | offset) + 2;
+				if (!dir_emit(ctx, de->d_name, de->d_namelen,
+					de->d_ino, DT_UNKNOWN)) {
 					vxfs_put_page(pp);
-					goto done;
+					return 0;
 				}
 			}
 			offset = 0;
@@ -314,9 +308,6 @@
 		vxfs_put_page(pp);
 		offset = 0;
 	}
-
-done:
-	fp->f_pos = ((page << PAGE_CACHE_SHIFT) | offset) + 2;
-out:
+	ctx->pos = ((page << PAGE_CACHE_SHIFT) | offset) + 2;
 	return 0;
 }

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 3be5718..a85ac4e 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c

@@ -45,6 +45,7 @@
 	unsigned int for_kupdate:1;
 	unsigned int range_cyclic:1;
 	unsigned int for_background:1;
+	unsigned int for_sync:1;	/* sync(2) WB_SYNC_ALL writeback */
 	enum wb_reason reason;		/* why was writeback initiated? */
 
 	struct list_head list;		/* pending work list */
@@ -443,9 +444,11 @@
 	/*
 	 * Make sure to wait on the data before writing out the metadata.
 	 * This is important for filesystems that modify metadata on data
-	 * I/O completion.
+	 * I/O completion. We don't do it for sync(2) writeback because it has a
+	 * separate, external IO completion path and ->sync_fs for guaranteeing
+	 * inode metadata is written back correctly.
 	 */
-	if (wbc->sync_mode == WB_SYNC_ALL) {
+	if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) {
 		int err = filemap_fdatawait(mapping);
 		if (ret == 0)
 			ret = err;
@@ -578,6 +581,7 @@
 		.tagged_writepages	= work->tagged_writepages,
 		.for_kupdate		= work->for_kupdate,
 		.for_background		= work->for_background,
+		.for_sync		= work->for_sync,
 		.range_cyclic		= work->range_cyclic,
 		.range_start		= 0,
 		.range_end		= LLONG_MAX,
@@ -1362,6 +1366,7 @@
 		.range_cyclic	= 0,
 		.done		= &done,
 		.reason		= WB_REASON_SYNC,
+		.for_sync	= 1,
 	};
 
 	/* Nothing to do? */

diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c
index b52aed1..f7cff36 100644
--- a/fs/fscache/cache.c
+++ b/fs/fscache/cache.c

@@ -115,7 +115,7 @@
 				     struct fscache_object, cookie_link);
 
 		cache = object->cache;
-		if (object->state >= FSCACHE_OBJECT_DYING ||
+		if (fscache_object_is_dying(object) ||
 		    test_bit(FSCACHE_IOERROR, &cache->flags))
 			cache = NULL;
 
@@ -224,8 +224,10 @@
 	BUG_ON(!ifsdef);
 
 	cache->flags = 0;
-	ifsdef->event_mask = ULONG_MAX & ~(1 << FSCACHE_OBJECT_EV_CLEARED);
-	ifsdef->state = FSCACHE_OBJECT_ACTIVE;
+	ifsdef->event_mask =
+		((1 << NR_FSCACHE_OBJECT_EVENTS) - 1) &
+		~(1 << FSCACHE_OBJECT_EV_CLEARED);
+	__set_bit(FSCACHE_OBJECT_IS_AVAILABLE, &ifsdef->flags);
 
 	if (!tagname)
 		tagname = cache->identifier;
@@ -330,25 +332,25 @@
 {
 	struct fscache_object *object;
 
-	spin_lock(&cache->object_list_lock);
-
 	while (!list_empty(&cache->object_list)) {
-		object = list_entry(cache->object_list.next,
-				    struct fscache_object, cache_link);
-		list_move_tail(&object->cache_link, dying_objects);
-
-		_debug("withdraw %p", object->cookie);
-
-		spin_lock(&object->lock);
-		spin_unlock(&cache->object_list_lock);
-		fscache_raise_event(object, FSCACHE_OBJECT_EV_WITHDRAW);
-		spin_unlock(&object->lock);
-
-		cond_resched();
 		spin_lock(&cache->object_list_lock);
-	}
 
-	spin_unlock(&cache->object_list_lock);
+		if (!list_empty(&cache->object_list)) {
+			object = list_entry(cache->object_list.next,
+					    struct fscache_object, cache_link);
+			list_move_tail(&object->cache_link, dying_objects);
+
+			_debug("withdraw %p", object->cookie);
+
+			/* This must be done under object_list_lock to prevent
+			 * a race with fscache_drop_object().
+			 */
+			fscache_raise_event(object, FSCACHE_OBJECT_EV_KILL);
+		}
+
+		spin_unlock(&cache->object_list_lock);
+		cond_resched();
+	}
 }
 
 /**

diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
index e2cba1f..0e91a3c 100644
--- a/fs/fscache/cookie.c
+++ b/fs/fscache/cookie.c

@@ -95,6 +95,11 @@
 	atomic_set(&cookie->usage, 1);
 	atomic_set(&cookie->n_children, 0);
 
+	/* We keep the active count elevated until relinquishment to prevent an
+	 * attempt to wake up every time the object operations queue quiesces.
+	 */
+	atomic_set(&cookie->n_active, 1);
+
 	atomic_inc(&parent->usage);
 	atomic_inc(&parent->n_children);
 
@@ -177,7 +182,6 @@
 
 	cookie->flags =
 		(1 << FSCACHE_COOKIE_LOOKING_UP) |
-		(1 << FSCACHE_COOKIE_CREATING) |
 		(1 << FSCACHE_COOKIE_NO_DATA_YET);
 
 	/* ask the cache to allocate objects for this cookie and its parent
@@ -205,7 +209,7 @@
 
 	/* initiate the process of looking up all the objects in the chain
 	 * (done by fscache_initialise_object()) */
-	fscache_enqueue_object(object);
+	fscache_raise_event(object, FSCACHE_OBJECT_EV_NEW_CHILD);
 
 	spin_unlock(&cookie->lock);
 
@@ -285,7 +289,7 @@
 
 object_already_extant:
 	ret = -ENOBUFS;
-	if (object->state >= FSCACHE_OBJECT_DYING) {
+	if (fscache_object_is_dead(object)) {
 		spin_unlock(&cookie->lock);
 		goto error;
 	}
@@ -321,7 +325,7 @@
 	ret = -EEXIST;
 	hlist_for_each_entry(p, &cookie->backing_objects, cookie_link) {
 		if (p->cache == object->cache) {
-			if (p->state >= FSCACHE_OBJECT_DYING)
+			if (fscache_object_is_dying(p))
 				ret = -ENOBUFS;
 			goto cant_attach_object;
 		}
@@ -332,7 +336,7 @@
 	hlist_for_each_entry(p, &cookie->parent->backing_objects,
 			     cookie_link) {
 		if (p->cache == object->cache) {
-			if (p->state >= FSCACHE_OBJECT_DYING) {
+			if (fscache_object_is_dying(p)) {
 				ret = -ENOBUFS;
 				spin_unlock(&cookie->parent->lock);
 				goto cant_attach_object;
@@ -400,7 +404,7 @@
 			object = hlist_entry(cookie->backing_objects.first,
 					     struct fscache_object,
 					     cookie_link);
-			if (object->state < FSCACHE_OBJECT_DYING)
+			if (fscache_object_is_live(object))
 				fscache_raise_event(
 					object, FSCACHE_OBJECT_EV_INVALIDATE);
 		}
@@ -467,9 +471,7 @@
  */
 void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire)
 {
-	struct fscache_cache *cache;
 	struct fscache_object *object;
-	unsigned long event;
 
 	fscache_stat(&fscache_n_relinquishes);
 	if (retire)
@@ -481,8 +483,11 @@
 		return;
 	}
 
-	_enter("%p{%s,%p},%d",
-	       cookie, cookie->def->name, cookie->netfs_data, retire);
+	_enter("%p{%s,%p,%d},%d",
+	       cookie, cookie->def->name, cookie->netfs_data,
+	       atomic_read(&cookie->n_active), retire);
+
+	ASSERTCMP(atomic_read(&cookie->n_active), >, 0);
 
 	if (atomic_read(&cookie->n_children) != 0) {
 		printk(KERN_ERR "FS-Cache: Cookie '%s' still has children\n",
@@ -490,62 +495,28 @@
 		BUG();
 	}
 
-	/* wait for the cookie to finish being instantiated (or to fail) */
-	if (test_bit(FSCACHE_COOKIE_CREATING, &cookie->flags)) {
-		fscache_stat(&fscache_n_relinquishes_waitcrt);
-		wait_on_bit(&cookie->flags, FSCACHE_COOKIE_CREATING,
-			    fscache_wait_bit, TASK_UNINTERRUPTIBLE);
-	}
+	/* No further netfs-accessing operations on this cookie permitted */
+	set_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags);
+	if (retire)
+		set_bit(FSCACHE_COOKIE_RETIRED, &cookie->flags);
 
-	event = retire ? FSCACHE_OBJECT_EV_RETIRE : FSCACHE_OBJECT_EV_RELEASE;
-
-try_again:
 	spin_lock(&cookie->lock);
-
-	/* break links with all the active objects */
-	while (!hlist_empty(&cookie->backing_objects)) {
-		int n_reads;
-		object = hlist_entry(cookie->backing_objects.first,
-				     struct fscache_object,
-				     cookie_link);
-
-		_debug("RELEASE OBJ%x", object->debug_id);
-
-		set_bit(FSCACHE_COOKIE_WAITING_ON_READS, &cookie->flags);
-		n_reads = atomic_read(&object->n_reads);
-		if (n_reads) {
-			int n_ops = object->n_ops;
-			int n_in_progress = object->n_in_progress;
-			spin_unlock(&cookie->lock);
-			printk(KERN_ERR "FS-Cache:"
-			       " Cookie '%s' still has %d outstanding reads (%d,%d)\n",
-			       cookie->def->name,
-			       n_reads, n_ops, n_in_progress);
-			wait_on_bit(&cookie->flags, FSCACHE_COOKIE_WAITING_ON_READS,
-				    fscache_wait_bit, TASK_UNINTERRUPTIBLE);
-			printk("Wait finished\n");
-			goto try_again;
-		}
-
-		/* detach each cache object from the object cookie */
-		spin_lock(&object->lock);
-		hlist_del_init(&object->cookie_link);
-
-		cache = object->cache;
-		object->cookie = NULL;
-		fscache_raise_event(object, event);
-		spin_unlock(&object->lock);
-
-		if (atomic_dec_and_test(&cookie->usage))
-			/* the cookie refcount shouldn't be reduced to 0 yet */
-			BUG();
+	hlist_for_each_entry(object, &cookie->backing_objects, cookie_link) {
+		fscache_raise_event(object, FSCACHE_OBJECT_EV_KILL);
 	}
+	spin_unlock(&cookie->lock);
 
-	/* detach pointers back to the netfs */
+	/* Wait for cessation of activity requiring access to the netfs (when
+	 * n_active reaches 0).
+	 */
+	if (!atomic_dec_and_test(&cookie->n_active))
+		wait_on_atomic_t(&cookie->n_active, fscache_wait_atomic_t,
+				 TASK_UNINTERRUPTIBLE);
+
+	/* Clear pointers back to the netfs */
 	cookie->netfs_data	= NULL;
 	cookie->def		= NULL;
-
-	spin_unlock(&cookie->lock);
+	BUG_ON(cookie->stores.rnode);
 
 	if (cookie->parent) {
 		ASSERTCMP(atomic_read(&cookie->parent->usage), >, 0);
@@ -553,7 +524,7 @@
 		atomic_dec(&cookie->parent->n_children);
 	}
 
-	/* finally dispose of the cookie */
+	/* Dispose of the netfs's link to the cookie */
 	ASSERTCMP(atomic_read(&cookie->usage), >, 0);
 	fscache_cookie_put(cookie);
 

diff --git a/fs/fscache/fsdef.c b/fs/fscache/fsdef.c
index f5b4bae..10a2ade 100644
--- a/fs/fscache/fsdef.c
+++ b/fs/fscache/fsdef.c

@@ -55,6 +55,7 @@
 
 struct fscache_cookie fscache_fsdef_index = {
 	.usage		= ATOMIC_INIT(1),
+	.n_active	= ATOMIC_INIT(1),
 	.lock		= __SPIN_LOCK_UNLOCKED(fscache_fsdef_index.lock),
 	.backing_objects = HLIST_HEAD_INIT,
 	.def		= &fscache_fsdef_index_def,

diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
index ee38fef..12d505b 100644
--- a/fs/fscache/internal.h
+++ b/fs/fscache/internal.h

@@ -93,14 +93,11 @@
 
 extern int fscache_wait_bit(void *);
 extern int fscache_wait_bit_interruptible(void *);
+extern int fscache_wait_atomic_t(atomic_t *);
 
 /*
  * object.c
  */
-extern const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5];
-
-extern void fscache_withdrawing_object(struct fscache_cache *,
-				       struct fscache_object *);
 extern void fscache_enqueue_object(struct fscache_object *);
 
 /*
@@ -110,8 +107,10 @@
 extern const struct file_operations fscache_objlist_fops;
 
 extern void fscache_objlist_add(struct fscache_object *);
+extern void fscache_objlist_remove(struct fscache_object *);
 #else
 #define fscache_objlist_add(object) do {} while(0)
+#define fscache_objlist_remove(object) do {} while(0)
 #endif
 
 /*
@@ -291,6 +290,10 @@
 				       unsigned event)
 {
 	BUG_ON(event >= NR_FSCACHE_OBJECT_EVENTS);
+#if 0
+	printk("*** fscache_raise_event(OBJ%d{%lx},%x)\n",
+	       object->debug_id, object->event_mask, (1 << event));
+#endif
 	if (!test_and_set_bit(event, &object->events) &&
 	    test_bit(event, &object->event_mask))
 		fscache_enqueue_object(object);

diff --git a/fs/fscache/main.c b/fs/fscache/main.c
index f9d8567..7c27907 100644
--- a/fs/fscache/main.c
+++ b/fs/fscache/main.c

@@ -205,7 +205,6 @@
 	schedule();
 	return 0;
 }
-EXPORT_SYMBOL(fscache_wait_bit);
 
 /*
  * wait_on_bit() sleep function for interruptible waiting
@@ -215,4 +214,12 @@
 	schedule();
 	return signal_pending(current);
 }
-EXPORT_SYMBOL(fscache_wait_bit_interruptible);
+
+/*
+ * wait_on_atomic_t() sleep function for uninterruptible waiting
+ */
+int fscache_wait_atomic_t(atomic_t *p)
+{
+	schedule();
+	return 0;
+}

diff --git a/fs/fscache/netfs.c b/fs/fscache/netfs.c
index e028b8e..b1bb611 100644
--- a/fs/fscache/netfs.c
+++ b/fs/fscache/netfs.c

@@ -40,6 +40,7 @@
 	/* initialise the primary index cookie */
 	atomic_set(&netfs->primary_index->usage, 1);
 	atomic_set(&netfs->primary_index->n_children, 0);
+	atomic_set(&netfs->primary_index->n_active, 1);
 
 	netfs->primary_index->def		= &fscache_fsdef_netfs_def;
 	netfs->primary_index->parent		= &fscache_fsdef_index;

diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c
index f27c89d..e1959ef 100644
--- a/fs/fscache/object-list.c
+++ b/fs/fscache/object-list.c

@@ -70,13 +70,10 @@
 	write_unlock(&fscache_object_list_lock);
 }
 
-/**
- * fscache_object_destroy - Note that a cache object is about to be destroyed
- * @object: The object to be destroyed
- *
- * Note the imminent destruction and deallocation of a cache object record.
+/*
+ * Remove an object from the object list.
  */
-void fscache_object_destroy(struct fscache_object *obj)
+void fscache_objlist_remove(struct fscache_object *obj)
 {
 	write_lock(&fscache_object_list_lock);
 
@@ -85,7 +82,6 @@
 
 	write_unlock(&fscache_object_list_lock);
 }
-EXPORT_SYMBOL(fscache_object_destroy);
 
 /*
  * find the object in the tree on or after the specified index
@@ -166,15 +162,14 @@
 {
 	struct fscache_objlist_data *data = m->private;
 	struct fscache_object *obj = v;
+	struct fscache_cookie *cookie;
 	unsigned long config = data->config;
-	uint16_t keylen, auxlen;
 	char _type[3], *type;
-	bool no_cookie;
 	u8 *buf = data->buf, *p;
 
 	if ((unsigned long) v == 1) {
 		seq_puts(m, "OBJECT   PARENT   STAT CHLDN OPS OOP IPR EX READS"
-			 " EM EV F S"
+			 " EM EV FL S"
 			 " | NETFS_COOKIE_DEF TY FL NETFS_DATA");
 		if (config & (FSCACHE_OBJLIST_CONFIG_KEY |
 			      FSCACHE_OBJLIST_CONFIG_AUX))
@@ -193,7 +188,7 @@
 
 	if ((unsigned long) v == 2) {
 		seq_puts(m, "======== ======== ==== ===== === === === == ====="
-			 " == == = ="
+			 " == == == ="
 			 " | ================ == == ================");
 		if (config & (FSCACHE_OBJLIST_CONFIG_KEY |
 			      FSCACHE_OBJLIST_CONFIG_AUX))
@@ -216,10 +211,11 @@
 		}							\
 	} while(0)
 
+	cookie = obj->cookie;
 	if (~config) {
-		FILTER(obj->cookie,
+		FILTER(cookie->def,
 		       COOKIE, NOCOOKIE);
-		FILTER(obj->state != FSCACHE_OBJECT_ACTIVE ||
+		FILTER(fscache_object_is_active(obj) ||
 		       obj->n_ops != 0 ||
 		       obj->n_obj_ops != 0 ||
 		       obj->flags ||
@@ -235,10 +231,10 @@
 	}
 
 	seq_printf(m,
-		   "%8x %8x %s %5u %3u %3u %3u %2u %5u %2lx %2lx %1lx %1x | ",
+		   "%8x %8x %s %5u %3u %3u %3u %2u %5u %2lx %2lx %2lx %1x | ",
 		   obj->debug_id,
 		   obj->parent ? obj->parent->debug_id : -1,
-		   fscache_object_states_short[obj->state],
+		   obj->state->short_name,
 		   obj->n_children,
 		   obj->n_ops,
 		   obj->n_obj_ops,
@@ -250,48 +246,40 @@
 		   obj->flags,
 		   work_busy(&obj->work));
 
-	no_cookie = true;
-	keylen = auxlen = 0;
-	if (obj->cookie) {
-		spin_lock(&obj->lock);
-		if (obj->cookie) {
-			switch (obj->cookie->def->type) {
-			case 0:
-				type = "IX";
-				break;
-			case 1:
-				type = "DT";
-				break;
-			default:
-				sprintf(_type, "%02u",
-					obj->cookie->def->type);
-				type = _type;
-				break;
-			}
+	if (fscache_use_cookie(obj)) {
+		uint16_t keylen = 0, auxlen = 0;
 
-			seq_printf(m, "%-16s %s %2lx %16p",
-				   obj->cookie->def->name,
-				   type,
-				   obj->cookie->flags,
-				   obj->cookie->netfs_data);
-
-			if (obj->cookie->def->get_key &&
-			    config & FSCACHE_OBJLIST_CONFIG_KEY)
-				keylen = obj->cookie->def->get_key(
-					obj->cookie->netfs_data,
-					buf, 400);
-
-			if (obj->cookie->def->get_aux &&
-			    config & FSCACHE_OBJLIST_CONFIG_AUX)
-				auxlen = obj->cookie->def->get_aux(
-					obj->cookie->netfs_data,
-					buf + keylen, 512 - keylen);
-
-			no_cookie = false;
+		switch (cookie->def->type) {
+		case 0:
+			type = "IX";
+			break;
+		case 1:
+			type = "DT";
+			break;
+		default:
+			sprintf(_type, "%02u", cookie->def->type);
+			type = _type;
+			break;
 		}
-		spin_unlock(&obj->lock);
 
-		if (!no_cookie && (keylen > 0 || auxlen > 0)) {
+		seq_printf(m, "%-16s %s %2lx %16p",
+			   cookie->def->name,
+			   type,
+			   cookie->flags,
+			   cookie->netfs_data);
+
+		if (cookie->def->get_key &&
+		    config & FSCACHE_OBJLIST_CONFIG_KEY)
+			keylen = cookie->def->get_key(cookie->netfs_data,
+						      buf, 400);
+
+		if (cookie->def->get_aux &&
+		    config & FSCACHE_OBJLIST_CONFIG_AUX)
+			auxlen = cookie->def->get_aux(cookie->netfs_data,
+						      buf + keylen, 512 - keylen);
+		fscache_unuse_cookie(obj);
+
+		if (keylen > 0 || auxlen > 0) {
 			seq_printf(m, " ");
 			for (p = buf; keylen > 0; keylen--)
 				seq_printf(m, "%02x", *p++);
@@ -302,12 +290,11 @@
 					seq_printf(m, "%02x", *p++);
 			}
 		}
-	}
 
-	if (no_cookie)
-		seq_printf(m, "<no_cookie>\n");
-	else
 		seq_printf(m, "\n");
+	} else {
+		seq_printf(m, "<no_netfs>\n");
+	}
 	return 0;
 }
 

diff --git a/fs/fscache/object.c b/fs/fscache/object.c
index 50d41c1..86d75a6 100644
--- a/fs/fscache/object.c
+++ b/fs/fscache/object.c

@@ -15,52 +15,131 @@
 #define FSCACHE_DEBUG_LEVEL COOKIE
 #include <linux/module.h>
 #include <linux/slab.h>
+#include <linux/prefetch.h>
 #include "internal.h"
 
-const char *fscache_object_states[FSCACHE_OBJECT__NSTATES] = {
-	[FSCACHE_OBJECT_INIT]		= "OBJECT_INIT",
-	[FSCACHE_OBJECT_LOOKING_UP]	= "OBJECT_LOOKING_UP",
-	[FSCACHE_OBJECT_CREATING]	= "OBJECT_CREATING",
-	[FSCACHE_OBJECT_AVAILABLE]	= "OBJECT_AVAILABLE",
-	[FSCACHE_OBJECT_ACTIVE]		= "OBJECT_ACTIVE",
-	[FSCACHE_OBJECT_INVALIDATING]	= "OBJECT_INVALIDATING",
-	[FSCACHE_OBJECT_UPDATING]	= "OBJECT_UPDATING",
-	[FSCACHE_OBJECT_DYING]		= "OBJECT_DYING",
-	[FSCACHE_OBJECT_LC_DYING]	= "OBJECT_LC_DYING",
-	[FSCACHE_OBJECT_ABORT_INIT]	= "OBJECT_ABORT_INIT",
-	[FSCACHE_OBJECT_RELEASING]	= "OBJECT_RELEASING",
-	[FSCACHE_OBJECT_RECYCLING]	= "OBJECT_RECYCLING",
-	[FSCACHE_OBJECT_WITHDRAWING]	= "OBJECT_WITHDRAWING",
-	[FSCACHE_OBJECT_DEAD]		= "OBJECT_DEAD",
-};
-EXPORT_SYMBOL(fscache_object_states);
+static const struct fscache_state *fscache_abort_initialisation(struct fscache_object *, int);
+static const struct fscache_state *fscache_kill_dependents(struct fscache_object *, int);
+static const struct fscache_state *fscache_drop_object(struct fscache_object *, int);
+static const struct fscache_state *fscache_initialise_object(struct fscache_object *, int);
+static const struct fscache_state *fscache_invalidate_object(struct fscache_object *, int);
+static const struct fscache_state *fscache_jumpstart_dependents(struct fscache_object *, int);
+static const struct fscache_state *fscache_kill_object(struct fscache_object *, int);
+static const struct fscache_state *fscache_lookup_failure(struct fscache_object *, int);
+static const struct fscache_state *fscache_look_up_object(struct fscache_object *, int);
+static const struct fscache_state *fscache_object_available(struct fscache_object *, int);
+static const struct fscache_state *fscache_parent_ready(struct fscache_object *, int);
+static const struct fscache_state *fscache_update_object(struct fscache_object *, int);
 
-const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5] = {
-	[FSCACHE_OBJECT_INIT]		= "INIT",
-	[FSCACHE_OBJECT_LOOKING_UP]	= "LOOK",
-	[FSCACHE_OBJECT_CREATING]	= "CRTN",
-	[FSCACHE_OBJECT_AVAILABLE]	= "AVBL",
-	[FSCACHE_OBJECT_ACTIVE]		= "ACTV",
-	[FSCACHE_OBJECT_INVALIDATING]	= "INVL",
-	[FSCACHE_OBJECT_UPDATING]	= "UPDT",
-	[FSCACHE_OBJECT_DYING]		= "DYNG",
-	[FSCACHE_OBJECT_LC_DYING]	= "LCDY",
-	[FSCACHE_OBJECT_ABORT_INIT]	= "ABTI",
-	[FSCACHE_OBJECT_RELEASING]	= "RELS",
-	[FSCACHE_OBJECT_RECYCLING]	= "RCYC",
-	[FSCACHE_OBJECT_WITHDRAWING]	= "WTHD",
-	[FSCACHE_OBJECT_DEAD]		= "DEAD",
+#define __STATE_NAME(n) fscache_osm_##n
+#define STATE(n) (&__STATE_NAME(n))
+
+/*
+ * Define a work state.  Work states are execution states.  No event processing
+ * is performed by them.  The function attached to a work state returns a
+ * pointer indicating the next state to which the state machine should
+ * transition.  Returning NO_TRANSIT repeats the current state, but goes back
+ * to the scheduler first.
+ */
+#define WORK_STATE(n, sn, f) \
+	const struct fscache_state __STATE_NAME(n) = {			\
+		.name = #n,						\
+		.short_name = sn,					\
+		.work = f						\
+	}
+
+/*
+ * Returns from work states.
+ */
+#define transit_to(state) ({ prefetch(&STATE(state)->work); STATE(state); })
+
+#define NO_TRANSIT ((struct fscache_state *)NULL)
+
+/*
+ * Define a wait state.  Wait states are event processing states.  No execution
+ * is performed by them.  Wait states are just tables of "if event X occurs,
+ * clear it and transition to state Y".  The dispatcher returns to the
+ * scheduler if none of the events in which the wait state has an interest are
+ * currently pending.
+ */
+#define WAIT_STATE(n, sn, ...) \
+	const struct fscache_state __STATE_NAME(n) = {			\
+		.name = #n,						\
+		.short_name = sn,					\
+		.work = NULL,						\
+		.transitions = { __VA_ARGS__, { 0, NULL } }		\
+	}
+
+#define TRANSIT_TO(state, emask) \
+	{ .events = (emask), .transit_to = STATE(state) }
+
+/*
+ * The object state machine.
+ */
+static WORK_STATE(INIT_OBJECT,		"INIT", fscache_initialise_object);
+static WORK_STATE(PARENT_READY,		"PRDY", fscache_parent_ready);
+static WORK_STATE(ABORT_INIT,		"ABRT", fscache_abort_initialisation);
+static WORK_STATE(LOOK_UP_OBJECT,	"LOOK", fscache_look_up_object);
+static WORK_STATE(CREATE_OBJECT,	"CRTO", fscache_look_up_object);
+static WORK_STATE(OBJECT_AVAILABLE,	"AVBL", fscache_object_available);
+static WORK_STATE(JUMPSTART_DEPS,	"JUMP", fscache_jumpstart_dependents);
+
+static WORK_STATE(INVALIDATE_OBJECT,	"INVL", fscache_invalidate_object);
+static WORK_STATE(UPDATE_OBJECT,	"UPDT", fscache_update_object);
+
+static WORK_STATE(LOOKUP_FAILURE,	"LCFL", fscache_lookup_failure);
+static WORK_STATE(KILL_OBJECT,		"KILL", fscache_kill_object);
+static WORK_STATE(KILL_DEPENDENTS,	"KDEP", fscache_kill_dependents);
+static WORK_STATE(DROP_OBJECT,		"DROP", fscache_drop_object);
+static WORK_STATE(OBJECT_DEAD,		"DEAD", (void*)2UL);
+
+static WAIT_STATE(WAIT_FOR_INIT,	"?INI",
+		  TRANSIT_TO(INIT_OBJECT,	1 << FSCACHE_OBJECT_EV_NEW_CHILD));
+
+static WAIT_STATE(WAIT_FOR_PARENT,	"?PRN",
+		  TRANSIT_TO(PARENT_READY,	1 << FSCACHE_OBJECT_EV_PARENT_READY));
+
+static WAIT_STATE(WAIT_FOR_CMD,		"?CMD",
+		  TRANSIT_TO(INVALIDATE_OBJECT,	1 << FSCACHE_OBJECT_EV_INVALIDATE),
+		  TRANSIT_TO(UPDATE_OBJECT,	1 << FSCACHE_OBJECT_EV_UPDATE),
+		  TRANSIT_TO(JUMPSTART_DEPS,	1 << FSCACHE_OBJECT_EV_NEW_CHILD));
+
+static WAIT_STATE(WAIT_FOR_CLEARANCE,	"?CLR",
+		  TRANSIT_TO(KILL_OBJECT,	1 << FSCACHE_OBJECT_EV_CLEARED));
+
+/*
+ * Out-of-band event transition tables.  These are for handling unexpected
+ * events, such as an I/O error.  If an OOB event occurs, the state machine
+ * clears and disables the event and forces a transition to the nominated work
+ * state (currently executing work states will complete first).
+ *
+ * In such a situation, object->state remembers the state the machine should
+ * have been in/gone to and returning NO_TRANSIT returns to that.
+ */
+static const struct fscache_transition fscache_osm_init_oob[] = {
+	   TRANSIT_TO(ABORT_INIT,
+		      (1 << FSCACHE_OBJECT_EV_ERROR) |
+		      (1 << FSCACHE_OBJECT_EV_KILL)),
+	   { 0, NULL }
+};
+
+static const struct fscache_transition fscache_osm_lookup_oob[] = {
+	   TRANSIT_TO(LOOKUP_FAILURE,
+		      (1 << FSCACHE_OBJECT_EV_ERROR) |
+		      (1 << FSCACHE_OBJECT_EV_KILL)),
+	   { 0, NULL }
+};
+
+static const struct fscache_transition fscache_osm_run_oob[] = {
+	   TRANSIT_TO(KILL_OBJECT,
+		      (1 << FSCACHE_OBJECT_EV_ERROR) |
+		      (1 << FSCACHE_OBJECT_EV_KILL)),
+	   { 0, NULL }
 };
 
 static int  fscache_get_object(struct fscache_object *);
 static void fscache_put_object(struct fscache_object *);
-static void fscache_initialise_object(struct fscache_object *);
-static void fscache_lookup_object(struct fscache_object *);
-static void fscache_object_available(struct fscache_object *);
-static void fscache_invalidate_object(struct fscache_object *);
-static void fscache_release_object(struct fscache_object *);
-static void fscache_withdraw_object(struct fscache_object *);
-static void fscache_enqueue_dependents(struct fscache_object *);
+static bool fscache_enqueue_dependents(struct fscache_object *, int);
 static void fscache_dequeue_object(struct fscache_object *);
 
 /*
@@ -75,295 +154,116 @@
 	       object->debug_id, parent->debug_id, parent->n_ops);
 
 	spin_lock_nested(&parent->lock, 1);
-	parent->n_ops--;
 	parent->n_obj_ops--;
+	parent->n_ops--;
 	if (parent->n_ops == 0)
 		fscache_raise_event(parent, FSCACHE_OBJECT_EV_CLEARED);
 	spin_unlock(&parent->lock);
 }
 
 /*
- * Notify netfs of invalidation completion.
+ * Object state machine dispatcher.
  */
-static inline void fscache_invalidation_complete(struct fscache_cookie *cookie)
+static void fscache_object_sm_dispatcher(struct fscache_object *object)
 {
-	if (test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags))
-		wake_up_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING);
-}
-
-/*
- * process events that have been sent to an object's state machine
- * - initiates parent lookup
- * - does object lookup
- * - does object creation
- * - does object recycling and retirement
- * - does object withdrawal
- */
-static void fscache_object_state_machine(struct fscache_object *object)
-{
-	enum fscache_object_state new_state;
-	struct fscache_cookie *cookie;
-	int event;
+	const struct fscache_transition *t;
+	const struct fscache_state *state, *new_state;
+	unsigned long events, event_mask;
+	int event = -1;
 
 	ASSERT(object != NULL);
 
 	_enter("{OBJ%x,%s,%lx}",
-	       object->debug_id, fscache_object_states[object->state],
-	       object->events);
+	       object->debug_id, object->state->name, object->events);
 
-	switch (object->state) {
-		/* wait for the parent object to become ready */
-	case FSCACHE_OBJECT_INIT:
-		object->event_mask =
-			FSCACHE_OBJECT_EVENTS_MASK &
-			~(1 << FSCACHE_OBJECT_EV_CLEARED);
-		fscache_initialise_object(object);
-		goto done;
+	event_mask = object->event_mask;
+restart:
+	object->event_mask = 0; /* Mask normal event handling */
+	state = object->state;
+restart_masked:
+	events = object->events;
 
-		/* look up the object metadata on disk */
-	case FSCACHE_OBJECT_LOOKING_UP:
-		fscache_lookup_object(object);
-		goto lookup_transit;
-
-		/* create the object metadata on disk */
-	case FSCACHE_OBJECT_CREATING:
-		fscache_lookup_object(object);
-		goto lookup_transit;
-
-		/* handle an object becoming available; start pending
-		 * operations and queue dependent operations for processing */
-	case FSCACHE_OBJECT_AVAILABLE:
-		fscache_object_available(object);
-		goto active_transit;
-
-		/* normal running state */
-	case FSCACHE_OBJECT_ACTIVE:
-		goto active_transit;
-
-		/* Invalidate an object on disk */
-	case FSCACHE_OBJECT_INVALIDATING:
-		clear_bit(FSCACHE_OBJECT_EV_INVALIDATE, &object->events);
-		fscache_stat(&fscache_n_invalidates_run);
-		fscache_stat(&fscache_n_cop_invalidate_object);
-		fscache_invalidate_object(object);
-		fscache_stat_d(&fscache_n_cop_invalidate_object);
-		fscache_raise_event(object, FSCACHE_OBJECT_EV_UPDATE);
-		goto active_transit;
-
-		/* update the object metadata on disk */
-	case FSCACHE_OBJECT_UPDATING:
-		clear_bit(FSCACHE_OBJECT_EV_UPDATE, &object->events);
-		fscache_stat(&fscache_n_updates_run);
-		fscache_stat(&fscache_n_cop_update_object);
-		object->cache->ops->update_object(object);
-		fscache_stat_d(&fscache_n_cop_update_object);
-		goto active_transit;
-
-		/* handle an object dying during lookup or creation */
-	case FSCACHE_OBJECT_LC_DYING:
-		object->event_mask &= ~(1 << FSCACHE_OBJECT_EV_UPDATE);
-		fscache_stat(&fscache_n_cop_lookup_complete);
-		object->cache->ops->lookup_complete(object);
-		fscache_stat_d(&fscache_n_cop_lookup_complete);
-
-		spin_lock(&object->lock);
-		object->state = FSCACHE_OBJECT_DYING;
-		cookie = object->cookie;
-		if (cookie) {
-			if (test_and_clear_bit(FSCACHE_COOKIE_LOOKING_UP,
-					       &cookie->flags))
-				wake_up_bit(&cookie->flags,
-					    FSCACHE_COOKIE_LOOKING_UP);
-			if (test_and_clear_bit(FSCACHE_COOKIE_CREATING,
-					       &cookie->flags))
-				wake_up_bit(&cookie->flags,
-					    FSCACHE_COOKIE_CREATING);
+	/* Handle any out-of-band events (typically an error) */
+	if (events & object->oob_event_mask) {
+		_debug("{OBJ%x} oob %lx",
+		       object->debug_id, events & object->oob_event_mask);
+		for (t = object->oob_table; t->events; t++) {
+			if (events & t->events) {
+				state = t->transit_to;
+				ASSERT(state->work != NULL);
+				event = fls(events & t->events) - 1;
+				__clear_bit(event, &object->oob_event_mask);
+				clear_bit(event, &object->events);
+				goto execute_work_state;
+			}
 		}
-		spin_unlock(&object->lock);
+	}
 
-		fscache_done_parent_op(object);
+	/* Wait states are just transition tables */
+	if (!state->work) {
+		if (events & event_mask) {
+			for (t = state->transitions; t->events; t++) {
+				if (events & t->events) {
+					new_state = t->transit_to;
+					event = fls(events & t->events) - 1;
+					clear_bit(event, &object->events);
+					_debug("{OBJ%x} ev %d: %s -> %s",
+					       object->debug_id, event,
+					       state->name, new_state->name);
+					object->state = state = new_state;
+					goto execute_work_state;
+				}
+			}
 
-		/* wait for completion of all active operations on this object
-		 * and the death of all child objects of this object */
-	case FSCACHE_OBJECT_DYING:
-	dying:
-		clear_bit(FSCACHE_OBJECT_EV_CLEARED, &object->events);
-		spin_lock(&object->lock);
-		_debug("dying OBJ%x {%d,%d}",
-		       object->debug_id, object->n_ops, object->n_children);
-		if (object->n_ops == 0 && object->n_children == 0) {
-			object->event_mask &=
-				~(1 << FSCACHE_OBJECT_EV_CLEARED);
-			object->event_mask |=
-				(1 << FSCACHE_OBJECT_EV_WITHDRAW) |
-				(1 << FSCACHE_OBJECT_EV_RETIRE) |
-				(1 << FSCACHE_OBJECT_EV_RELEASE) |
-				(1 << FSCACHE_OBJECT_EV_ERROR);
-		} else {
-			object->event_mask &=
-				~((1 << FSCACHE_OBJECT_EV_WITHDRAW) |
-				  (1 << FSCACHE_OBJECT_EV_RETIRE) |
-				  (1 << FSCACHE_OBJECT_EV_RELEASE) |
-				  (1 << FSCACHE_OBJECT_EV_ERROR));
-			object->event_mask |=
-				1 << FSCACHE_OBJECT_EV_CLEARED;
+			/* The event mask didn't include all the tabled bits */
+			BUG();
 		}
-		spin_unlock(&object->lock);
-		fscache_enqueue_dependents(object);
-		fscache_start_operations(object);
-		goto terminal_transit;
-
-		/* handle an abort during initialisation */
-	case FSCACHE_OBJECT_ABORT_INIT:
-		_debug("handle abort init %lx", object->events);
-		object->event_mask &= ~(1 << FSCACHE_OBJECT_EV_UPDATE);
-
-		spin_lock(&object->lock);
-		fscache_dequeue_object(object);
-
-		object->state = FSCACHE_OBJECT_DYING;
-		if (test_and_clear_bit(FSCACHE_COOKIE_CREATING,
-				       &object->cookie->flags))
-			wake_up_bit(&object->cookie->flags,
-				    FSCACHE_COOKIE_CREATING);
-		spin_unlock(&object->lock);
-		goto dying;
-
-		/* handle the netfs releasing an object and possibly marking it
-		 * obsolete too */
-	case FSCACHE_OBJECT_RELEASING:
-	case FSCACHE_OBJECT_RECYCLING:
-		object->event_mask &=
-			~((1 << FSCACHE_OBJECT_EV_WITHDRAW) |
-			  (1 << FSCACHE_OBJECT_EV_RETIRE) |
-			  (1 << FSCACHE_OBJECT_EV_RELEASE) |
-			  (1 << FSCACHE_OBJECT_EV_ERROR));
-		fscache_release_object(object);
-		spin_lock(&object->lock);
-		object->state = FSCACHE_OBJECT_DEAD;
-		spin_unlock(&object->lock);
-		fscache_stat(&fscache_n_object_dead);
-		goto terminal_transit;
-
-		/* handle the parent cache of this object being withdrawn from
-		 * active service */
-	case FSCACHE_OBJECT_WITHDRAWING:
-		object->event_mask &=
-			~((1 << FSCACHE_OBJECT_EV_WITHDRAW) |
-			  (1 << FSCACHE_OBJECT_EV_RETIRE) |
-			  (1 << FSCACHE_OBJECT_EV_RELEASE) |
-			  (1 << FSCACHE_OBJECT_EV_ERROR));
-		fscache_withdraw_object(object);
-		spin_lock(&object->lock);
-		object->state = FSCACHE_OBJECT_DEAD;
-		spin_unlock(&object->lock);
-		fscache_stat(&fscache_n_object_dead);
-		goto terminal_transit;
-
-		/* complain about the object being woken up once it is
-		 * deceased */
-	case FSCACHE_OBJECT_DEAD:
-		printk(KERN_ERR "FS-Cache:"
-		       " Unexpected event in dead state %lx\n",
-		       object->events & object->event_mask);
-		BUG();
-
-	default:
-		printk(KERN_ERR "FS-Cache: Unknown object state %u\n",
-		       object->state);
-		BUG();
+		/* Randomly woke up */
+		goto unmask_events;
 	}
 
-	/* determine the transition from a lookup state */
-lookup_transit:
-	event = fls(object->events & object->event_mask) - 1;
-	switch (event) {
-	case FSCACHE_OBJECT_EV_WITHDRAW:
-	case FSCACHE_OBJECT_EV_RETIRE:
-	case FSCACHE_OBJECT_EV_RELEASE:
-	case FSCACHE_OBJECT_EV_ERROR:
-		new_state = FSCACHE_OBJECT_LC_DYING;
-		goto change_state;
-	case FSCACHE_OBJECT_EV_INVALIDATE:
-		new_state = FSCACHE_OBJECT_INVALIDATING;
-		goto change_state;
-	case FSCACHE_OBJECT_EV_REQUEUE:
-		goto done;
-	case -1:
-		goto done; /* sleep until event */
-	default:
-		goto unsupported_event;
+execute_work_state:
+	_debug("{OBJ%x} exec %s", object->debug_id, state->name);
+
+	new_state = state->work(object, event);
+	event = -1;
+	if (new_state == NO_TRANSIT) {
+		_debug("{OBJ%x} %s notrans", object->debug_id, state->name);
+		fscache_enqueue_object(object);
+		event_mask = object->oob_event_mask;
+		goto unmask_events;
 	}
 
-	/* determine the transition from an active state */
-active_transit:
-	event = fls(object->events & object->event_mask) - 1;
-	switch (event) {
-	case FSCACHE_OBJECT_EV_WITHDRAW:
-	case FSCACHE_OBJECT_EV_RETIRE:
-	case FSCACHE_OBJECT_EV_RELEASE:
-	case FSCACHE_OBJECT_EV_ERROR:
-		new_state = FSCACHE_OBJECT_DYING;
-		goto change_state;
-	case FSCACHE_OBJECT_EV_INVALIDATE:
-		new_state = FSCACHE_OBJECT_INVALIDATING;
-		goto change_state;
-	case FSCACHE_OBJECT_EV_UPDATE:
-		new_state = FSCACHE_OBJECT_UPDATING;
-		goto change_state;
-	case -1:
-		new_state = FSCACHE_OBJECT_ACTIVE;
-		goto change_state; /* sleep until event */
-	default:
-		goto unsupported_event;
+	_debug("{OBJ%x} %s -> %s",
+	       object->debug_id, state->name, new_state->name);
+	object->state = state = new_state;
+
+	if (state->work) {
+		if (unlikely(state->work == ((void *)2UL))) {
+			_leave(" [dead]");
+			return;
+		}
+		goto restart_masked;
 	}
 
-	/* determine the transition from a terminal state */
-terminal_transit:
-	event = fls(object->events & object->event_mask) - 1;
-	switch (event) {
-	case FSCACHE_OBJECT_EV_WITHDRAW:
-		new_state = FSCACHE_OBJECT_WITHDRAWING;
-		goto change_state;
-	case FSCACHE_OBJECT_EV_RETIRE:
-		new_state = FSCACHE_OBJECT_RECYCLING;
-		goto change_state;
-	case FSCACHE_OBJECT_EV_RELEASE:
-		new_state = FSCACHE_OBJECT_RELEASING;
-		goto change_state;
-	case FSCACHE_OBJECT_EV_ERROR:
-		new_state = FSCACHE_OBJECT_WITHDRAWING;
-		goto change_state;
-	case FSCACHE_OBJECT_EV_CLEARED:
-		new_state = FSCACHE_OBJECT_DYING;
-		goto change_state;
-	case -1:
-		goto done; /* sleep until event */
-	default:
-		goto unsupported_event;
-	}
+	/* Transited to wait state */
+	event_mask = object->oob_event_mask;
+	for (t = state->transitions; t->events; t++)
+		event_mask |= t->events;
 
-change_state:
-	spin_lock(&object->lock);
-	object->state = new_state;
-	spin_unlock(&object->lock);
-
-done:
-	_leave(" [->%s]", fscache_object_states[object->state]);
-	return;
-
-unsupported_event:
-	printk(KERN_ERR "FS-Cache:"
-	       " Unsupported event %d [%lx/%lx] in state %s\n",
-	       event, object->events, object->event_mask,
-	       fscache_object_states[object->state]);
-	BUG();
+unmask_events:
+	object->event_mask = event_mask;
+	smp_mb();
+	events = object->events;
+	if (events & event_mask)
+		goto restart;
+	_leave(" [msk %lx]", event_mask);
 }
 
 /*
  * execute an object
  */
-void fscache_object_work_func(struct work_struct *work)
+static void fscache_object_work_func(struct work_struct *work)
 {
 	struct fscache_object *object =
 		container_of(work, struct fscache_object, work);
@@ -372,14 +272,70 @@
 	_enter("{OBJ%x}", object->debug_id);
 
 	start = jiffies;
-	fscache_object_state_machine(object);
+	fscache_object_sm_dispatcher(object);
 	fscache_hist(fscache_objs_histogram, start);
-	if (object->events & object->event_mask)
-		fscache_enqueue_object(object);
-	clear_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
 	fscache_put_object(object);
 }
-EXPORT_SYMBOL(fscache_object_work_func);
+
+/**
+ * fscache_object_init - Initialise a cache object description
+ * @object: Object description
+ * @cookie: Cookie object will be attached to
+ * @cache: Cache in which backing object will be found
+ *
+ * Initialise a cache object description to its basic values.
+ *
+ * See Documentation/filesystems/caching/backend-api.txt for a complete
+ * description.
+ */
+void fscache_object_init(struct fscache_object *object,
+			 struct fscache_cookie *cookie,
+			 struct fscache_cache *cache)
+{
+	const struct fscache_transition *t;
+
+	atomic_inc(&cache->object_count);
+
+	object->state = STATE(WAIT_FOR_INIT);
+	object->oob_table = fscache_osm_init_oob;
+	object->flags = 1 << FSCACHE_OBJECT_IS_LIVE;
+	spin_lock_init(&object->lock);
+	INIT_LIST_HEAD(&object->cache_link);
+	INIT_HLIST_NODE(&object->cookie_link);
+	INIT_WORK(&object->work, fscache_object_work_func);
+	INIT_LIST_HEAD(&object->dependents);
+	INIT_LIST_HEAD(&object->dep_link);
+	INIT_LIST_HEAD(&object->pending_ops);
+	object->n_children = 0;
+	object->n_ops = object->n_in_progress = object->n_exclusive = 0;
+	object->events = 0;
+	object->store_limit = 0;
+	object->store_limit_l = 0;
+	object->cache = cache;
+	object->cookie = cookie;
+	object->parent = NULL;
+
+	object->oob_event_mask = 0;
+	for (t = object->oob_table; t->events; t++)
+		object->oob_event_mask |= t->events;
+	object->event_mask = object->oob_event_mask;
+	for (t = object->state->transitions; t->events; t++)
+		object->event_mask |= t->events;
+}
+EXPORT_SYMBOL(fscache_object_init);
+
+/*
+ * Abort object initialisation before we start it.
+ */
+static const struct fscache_state *fscache_abort_initialisation(struct fscache_object *object,
+								int event)
+{
+	_enter("{OBJ%x},%d", object->debug_id, event);
+
+	object->oob_event_mask = 0;
+	fscache_dequeue_object(object);
+	return transit_to(KILL_OBJECT);
+}
 
 /*
  * initialise an object
@@ -387,130 +343,136 @@
  *   immediately to do a creation
  * - we may need to start the process of creating a parent and we need to wait
  *   for the parent's lookup and creation to complete if it's not there yet
- * - an object's cookie is pinned until we clear FSCACHE_COOKIE_CREATING on the
- *   leaf-most cookies of the object and all its children
  */
-static void fscache_initialise_object(struct fscache_object *object)
+static const struct fscache_state *fscache_initialise_object(struct fscache_object *object,
+							     int event)
 {
 	struct fscache_object *parent;
+	bool success;
 
-	_enter("");
-	ASSERT(object->cookie != NULL);
-	ASSERT(object->cookie->parent != NULL);
+	_enter("{OBJ%x},%d", object->debug_id, event);
 
-	if (object->events & ((1 << FSCACHE_OBJECT_EV_ERROR) |
-			      (1 << FSCACHE_OBJECT_EV_RELEASE) |
-			      (1 << FSCACHE_OBJECT_EV_RETIRE) |
-			      (1 << FSCACHE_OBJECT_EV_WITHDRAW))) {
-		_debug("abort init %lx", object->events);
-		spin_lock(&object->lock);
-		object->state = FSCACHE_OBJECT_ABORT_INIT;
-		spin_unlock(&object->lock);
-		return;
-	}
-
-	spin_lock(&object->cookie->lock);
-	spin_lock_nested(&object->cookie->parent->lock, 1);
+	ASSERT(list_empty(&object->dep_link));
 
 	parent = object->parent;
 	if (!parent) {
-		_debug("no parent");
-		set_bit(FSCACHE_OBJECT_EV_WITHDRAW, &object->events);
-	} else {
-		spin_lock(&object->lock);
-		spin_lock_nested(&parent->lock, 1);
-		_debug("parent %s", fscache_object_states[parent->state]);
-
-		if (parent->state >= FSCACHE_OBJECT_DYING) {
-			_debug("bad parent");
-			set_bit(FSCACHE_OBJECT_EV_WITHDRAW, &object->events);
-		} else if (parent->state < FSCACHE_OBJECT_AVAILABLE) {
-			_debug("wait");
-
-			/* we may get woken up in this state by child objects
-			 * binding on to us, so we need to make sure we don't
-			 * add ourself to the list multiple times */
-			if (list_empty(&object->dep_link)) {
-				fscache_stat(&fscache_n_cop_grab_object);
-				object->cache->ops->grab_object(object);
-				fscache_stat_d(&fscache_n_cop_grab_object);
-				list_add(&object->dep_link,
-					 &parent->dependents);
-
-				/* fscache_acquire_non_index_cookie() uses this
-				 * to wake the chain up */
-				if (parent->state == FSCACHE_OBJECT_INIT)
-					fscache_enqueue_object(parent);
-			}
-		} else {
-			_debug("go");
-			parent->n_ops++;
-			parent->n_obj_ops++;
-			object->lookup_jif = jiffies;
-			object->state = FSCACHE_OBJECT_LOOKING_UP;
-			set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
-		}
-
-		spin_unlock(&parent->lock);
-		spin_unlock(&object->lock);
+		_leave(" [no parent]");
+		return transit_to(DROP_OBJECT);
 	}
 
-	spin_unlock(&object->cookie->parent->lock);
-	spin_unlock(&object->cookie->lock);
+	_debug("parent: %s of:%lx", parent->state->name, parent->flags);
+
+	if (fscache_object_is_dying(parent)) {
+		_leave(" [bad parent]");
+		return transit_to(DROP_OBJECT);
+	}
+
+	if (fscache_object_is_available(parent)) {
+		_leave(" [ready]");
+		return transit_to(PARENT_READY);
+	}
+
+	_debug("wait");
+
+	spin_lock(&parent->lock);
+	fscache_stat(&fscache_n_cop_grab_object);
+	success = false;
+	if (fscache_object_is_live(parent) &&
+	    object->cache->ops->grab_object(object)) {
+		list_add(&object->dep_link, &parent->dependents);
+		success = true;
+	}
+	fscache_stat_d(&fscache_n_cop_grab_object);
+	spin_unlock(&parent->lock);
+	if (!success) {
+		_leave(" [grab failed]");
+		return transit_to(DROP_OBJECT);
+	}
+
+	/* fscache_acquire_non_index_cookie() uses this
+	 * to wake the chain up */
+	fscache_raise_event(parent, FSCACHE_OBJECT_EV_NEW_CHILD);
+	_leave(" [wait]");
+	return transit_to(WAIT_FOR_PARENT);
+}
+
+/*
+ * Once the parent object is ready, we should kick off our lookup op.
+ */
+static const struct fscache_state *fscache_parent_ready(struct fscache_object *object,
+							int event)
+{
+	struct fscache_object *parent = object->parent;
+
+	_enter("{OBJ%x},%d", object->debug_id, event);
+
+	ASSERT(parent != NULL);
+
+	spin_lock(&parent->lock);
+	parent->n_ops++;
+	parent->n_obj_ops++;
+	object->lookup_jif = jiffies;
+	spin_unlock(&parent->lock);
+
 	_leave("");
+	return transit_to(LOOK_UP_OBJECT);
 }
 
 /*
  * look an object up in the cache from which it was allocated
  * - we hold an "access lock" on the parent object, so the parent object cannot
  *   be withdrawn by either party till we've finished
- * - an object's cookie is pinned until we clear FSCACHE_COOKIE_CREATING on the
- *   leaf-most cookies of the object and all its children
  */
-static void fscache_lookup_object(struct fscache_object *object)
+static const struct fscache_state *fscache_look_up_object(struct fscache_object *object,
+							  int event)
 {
 	struct fscache_cookie *cookie = object->cookie;
-	struct fscache_object *parent;
+	struct fscache_object *parent = object->parent;
 	int ret;
 
-	_enter("");
+	_enter("{OBJ%x},%d", object->debug_id, event);
 
-	parent = object->parent;
+	object->oob_table = fscache_osm_lookup_oob;
+
 	ASSERT(parent != NULL);
 	ASSERTCMP(parent->n_ops, >, 0);
 	ASSERTCMP(parent->n_obj_ops, >, 0);
 
 	/* make sure the parent is still available */
-	ASSERTCMP(parent->state, >=, FSCACHE_OBJECT_AVAILABLE);
+	ASSERT(fscache_object_is_available(parent));
 
-	if (parent->state >= FSCACHE_OBJECT_DYING ||
-	    test_bit(FSCACHE_IOERROR, &object->cache->flags)) {
-		_debug("unavailable");
-		set_bit(FSCACHE_OBJECT_EV_WITHDRAW, &object->events);
-		_leave("");
-		return;
+	if (fscache_object_is_dying(parent) ||
+	    test_bit(FSCACHE_IOERROR, &object->cache->flags) ||
+	    !fscache_use_cookie(object)) {
+		_leave(" [unavailable]");
+		return transit_to(LOOKUP_FAILURE);
 	}
 
-	_debug("LOOKUP \"%s/%s\" in \"%s\"",
-	       parent->cookie->def->name, cookie->def->name,
-	       object->cache->tag->name);
+	_debug("LOOKUP \"%s\" in \"%s\"",
+	       cookie->def->name, object->cache->tag->name);
 
 	fscache_stat(&fscache_n_object_lookups);
 	fscache_stat(&fscache_n_cop_lookup_object);
 	ret = object->cache->ops->lookup_object(object);
 	fscache_stat_d(&fscache_n_cop_lookup_object);
 
-	if (test_bit(FSCACHE_OBJECT_EV_ERROR, &object->events))
-		set_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags);
+	fscache_unuse_cookie(object);
 
 	if (ret == -ETIMEDOUT) {
 		/* probably stuck behind another object, so move this one to
 		 * the back of the queue */
 		fscache_stat(&fscache_n_object_lookups_timed_out);
-		set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
+		_leave(" [timeout]");
+		return NO_TRANSIT;
 	}
 
-	_leave("");
+	if (ret < 0) {
+		_leave(" [error]");
+		return transit_to(LOOKUP_FAILURE);
+	}
+
+	_leave(" [ok]");
+	return transit_to(OBJECT_AVAILABLE);
 }
 
 /**
@@ -524,32 +486,20 @@
 {
 	struct fscache_cookie *cookie = object->cookie;
 
-	_enter("{OBJ%x,%s}",
-	       object->debug_id, fscache_object_states[object->state]);
+	_enter("{OBJ%x,%s}", object->debug_id, object->state->name);
 
-	spin_lock(&object->lock);
-	if (object->state == FSCACHE_OBJECT_LOOKING_UP) {
+	if (!test_and_set_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) {
 		fscache_stat(&fscache_n_object_lookups_negative);
 
-		/* transit here to allow write requests to begin stacking up
-		 * and read requests to begin returning ENODATA */
-		object->state = FSCACHE_OBJECT_CREATING;
-		spin_unlock(&object->lock);
-
-		set_bit(FSCACHE_COOKIE_PENDING_FILL, &cookie->flags);
+		/* Allow write requests to begin stacking up and read requests to begin
+		 * returning ENODATA.
+		 */
 		set_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
 
 		_debug("wake up lookup %p", &cookie->flags);
-		smp_mb__before_clear_bit();
-		clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags);
-		smp_mb__after_clear_bit();
+		clear_bit_unlock(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags);
 		wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP);
-		set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
-	} else {
-		ASSERTCMP(object->state, ==, FSCACHE_OBJECT_CREATING);
-		spin_unlock(&object->lock);
 	}
-
 	_leave("");
 }
 EXPORT_SYMBOL(fscache_object_lookup_negative);
@@ -568,38 +518,26 @@
 {
 	struct fscache_cookie *cookie = object->cookie;
 
-	_enter("{OBJ%x,%s}",
-	       object->debug_id, fscache_object_states[object->state]);
+	_enter("{OBJ%x,%s}", object->debug_id, object->state->name);
 
 	/* if we were still looking up, then we must have a positive lookup
 	 * result, in which case there may be data available */
-	spin_lock(&object->lock);
-	if (object->state == FSCACHE_OBJECT_LOOKING_UP) {
+	if (!test_and_set_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) {
 		fscache_stat(&fscache_n_object_lookups_positive);
 
-		clear_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
+		/* We do (presumably) have data */
+		clear_bit_unlock(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
 
-		object->state = FSCACHE_OBJECT_AVAILABLE;
-		spin_unlock(&object->lock);
-
-		smp_mb__before_clear_bit();
-		clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags);
-		smp_mb__after_clear_bit();
+		/* Allow write requests to begin stacking up and read requests
+		 * to begin shovelling data.
+		 */
+		clear_bit_unlock(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags);
 		wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP);
-		set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
 	} else {
-		ASSERTCMP(object->state, ==, FSCACHE_OBJECT_CREATING);
 		fscache_stat(&fscache_n_object_created);
-
-		object->state = FSCACHE_OBJECT_AVAILABLE;
-		spin_unlock(&object->lock);
-		set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
-		smp_wmb();
 	}
 
-	if (test_and_clear_bit(FSCACHE_COOKIE_CREATING, &cookie->flags))
-		wake_up_bit(&cookie->flags, FSCACHE_COOKIE_CREATING);
-
+	set_bit(FSCACHE_OBJECT_IS_AVAILABLE, &object->flags);
 	_leave("");
 }
 EXPORT_SYMBOL(fscache_obtained_object);
@@ -607,16 +545,15 @@
 /*
  * handle an object that has just become available
  */
-static void fscache_object_available(struct fscache_object *object)
+static const struct fscache_state *fscache_object_available(struct fscache_object *object,
+							    int event)
 {
-	_enter("{OBJ%x}", object->debug_id);
+	_enter("{OBJ%x},%d", object->debug_id, event);
+
+	object->oob_table = fscache_osm_run_oob;
 
 	spin_lock(&object->lock);
 
-	if (object->cookie &&
-	    test_and_clear_bit(FSCACHE_COOKIE_CREATING, &object->cookie->flags))
-		wake_up_bit(&object->cookie->flags, FSCACHE_COOKIE_CREATING);
-
 	fscache_done_parent_op(object);
 	if (object->n_in_progress == 0) {
 		if (object->n_ops > 0) {
@@ -631,27 +568,131 @@
 	fscache_stat(&fscache_n_cop_lookup_complete);
 	object->cache->ops->lookup_complete(object);
 	fscache_stat_d(&fscache_n_cop_lookup_complete);
-	fscache_enqueue_dependents(object);
 
 	fscache_hist(fscache_obj_instantiate_histogram, object->lookup_jif);
 	fscache_stat(&fscache_n_object_avail);
 
 	_leave("");
+	return transit_to(JUMPSTART_DEPS);
 }
 
 /*
- * drop an object's attachments
+ * Wake up this object's dependent objects now that we've become available.
  */
-static void fscache_drop_object(struct fscache_object *object)
+static const struct fscache_state *fscache_jumpstart_dependents(struct fscache_object *object,
+								int event)
+{
+	_enter("{OBJ%x},%d", object->debug_id, event);
+
+	if (!fscache_enqueue_dependents(object, FSCACHE_OBJECT_EV_PARENT_READY))
+		return NO_TRANSIT; /* Not finished; requeue */
+	return transit_to(WAIT_FOR_CMD);
+}
+
+/*
+ * Handle lookup or creation failure.
+ */
+static const struct fscache_state *fscache_lookup_failure(struct fscache_object *object,
+							  int event)
+{
+	struct fscache_cookie *cookie;
+
+	_enter("{OBJ%x},%d", object->debug_id, event);
+
+	object->oob_event_mask = 0;
+
+	fscache_stat(&fscache_n_cop_lookup_complete);
+	object->cache->ops->lookup_complete(object);
+	fscache_stat_d(&fscache_n_cop_lookup_complete);
+
+	cookie = object->cookie;
+	set_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags);
+	if (test_and_clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags))
+		wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP);
+
+	fscache_done_parent_op(object);
+	return transit_to(KILL_OBJECT);
+}
+
+/*
+ * Wait for completion of all active operations on this object and the death of
+ * all child objects of this object.
+ */
+static const struct fscache_state *fscache_kill_object(struct fscache_object *object,
+						       int event)
+{
+	_enter("{OBJ%x,%d,%d},%d",
+	       object->debug_id, object->n_ops, object->n_children, event);
+
+	clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags);
+	object->oob_event_mask = 0;
+
+	if (list_empty(&object->dependents) &&
+	    object->n_ops == 0 &&
+	    object->n_children == 0)
+		return transit_to(DROP_OBJECT);
+
+	if (object->n_in_progress == 0) {
+		spin_lock(&object->lock);
+		if (object->n_ops > 0 && object->n_in_progress == 0)
+			fscache_start_operations(object);
+		spin_unlock(&object->lock);
+	}
+
+	if (!list_empty(&object->dependents))
+		return transit_to(KILL_DEPENDENTS);
+
+	return transit_to(WAIT_FOR_CLEARANCE);
+}
+
+/*
+ * Kill dependent objects.
+ */
+static const struct fscache_state *fscache_kill_dependents(struct fscache_object *object,
+							   int event)
+{
+	_enter("{OBJ%x},%d", object->debug_id, event);
+
+	if (!fscache_enqueue_dependents(object, FSCACHE_OBJECT_EV_KILL))
+		return NO_TRANSIT; /* Not finished */
+	return transit_to(WAIT_FOR_CLEARANCE);
+}
+
+/*
+ * Drop an object's attachments
+ */
+static const struct fscache_state *fscache_drop_object(struct fscache_object *object,
+						       int event)
 {
 	struct fscache_object *parent = object->parent;
+	struct fscache_cookie *cookie = object->cookie;
 	struct fscache_cache *cache = object->cache;
+	bool awaken = false;
 
-	_enter("{OBJ%x,%d}", object->debug_id, object->n_children);
+	_enter("{OBJ%x,%d},%d", object->debug_id, object->n_children, event);
 
-	ASSERTCMP(object->cookie, ==, NULL);
-	ASSERT(hlist_unhashed(&object->cookie_link));
+	ASSERT(cookie != NULL);
+	ASSERT(!hlist_unhashed(&object->cookie_link));
 
+	/* Make sure the cookie no longer points here and that the netfs isn't
+	 * waiting for us.
+	 */
+	spin_lock(&cookie->lock);
+	hlist_del_init(&object->cookie_link);
+	if (test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags))
+		awaken = true;
+	spin_unlock(&cookie->lock);
+
+	if (awaken)
+		wake_up_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING);
+
+	/* Prevent a race with our last child, which has to signal EV_CLEARED
+	 * before dropping our spinlock.
+	 */
+	spin_lock(&object->lock);
+	spin_unlock(&object->lock);
+
+	/* Discard from the cache's collection of objects */
 	spin_lock(&cache->object_list_lock);
 	list_del_init(&object->cache_link);
 	spin_unlock(&cache->object_list_lock);
@@ -660,6 +701,7 @@
 	cache->ops->drop_object(object);
 	fscache_stat_d(&fscache_n_cop_drop_object);
 
+	/* The parent object wants to know when all its dependents have gone */
 	if (parent) {
 		_debug("release parent OBJ%x {%d}",
 		       parent->debug_id, parent->n_children);
@@ -674,87 +716,10 @@
 
 	/* this just shifts the object release to the work processor */
 	fscache_put_object(object);
+	fscache_stat(&fscache_n_object_dead);
 
 	_leave("");
-}
-
-/*
- * release or recycle an object that the netfs has discarded
- */
-static void fscache_release_object(struct fscache_object *object)
-{
-	_enter("");
-
-	fscache_drop_object(object);
-}
-
-/*
- * withdraw an object from active service
- */
-static void fscache_withdraw_object(struct fscache_object *object)
-{
-	struct fscache_cookie *cookie;
-	bool detached;
-
-	_enter("");
-
-	spin_lock(&object->lock);
-	cookie = object->cookie;
-	if (cookie) {
-		/* need to get the cookie lock before the object lock, starting
-		 * from the object pointer */
-		atomic_inc(&cookie->usage);
-		spin_unlock(&object->lock);
-
-		detached = false;
-		spin_lock(&cookie->lock);
-		spin_lock(&object->lock);
-
-		if (object->cookie == cookie) {
-			hlist_del_init(&object->cookie_link);
-			object->cookie = NULL;
-			fscache_invalidation_complete(cookie);
-			detached = true;
-		}
-		spin_unlock(&cookie->lock);
-		fscache_cookie_put(cookie);
-		if (detached)
-			fscache_cookie_put(cookie);
-	}
-
-	spin_unlock(&object->lock);
-
-	fscache_drop_object(object);
-}
-
-/*
- * withdraw an object from active service at the behest of the cache
- * - need break the links to a cached object cookie
- * - called under two situations:
- *   (1) recycler decides to reclaim an in-use object
- *   (2) a cache is unmounted
- * - have to take care as the cookie can be being relinquished by the netfs
- *   simultaneously
- * - the object is pinned by the caller holding a refcount on it
- */
-void fscache_withdrawing_object(struct fscache_cache *cache,
-				struct fscache_object *object)
-{
-	bool enqueue = false;
-
-	_enter(",OBJ%x", object->debug_id);
-
-	spin_lock(&object->lock);
-	if (object->state < FSCACHE_OBJECT_WITHDRAWING) {
-		object->state = FSCACHE_OBJECT_WITHDRAWING;
-		enqueue = true;
-	}
-	spin_unlock(&object->lock);
-
-	if (enqueue)
-		fscache_enqueue_object(object);
-
-	_leave("");
+	return transit_to(OBJECT_DEAD);
 }
 
 /*
@@ -771,7 +736,7 @@
 }
 
 /*
- * discard a ref on a work item
+ * Discard a ref on an object
  */
 static void fscache_put_object(struct fscache_object *object)
 {
@@ -780,6 +745,22 @@
 	fscache_stat_d(&fscache_n_cop_put_object);
 }
 
+/**
+ * fscache_object_destroy - Note that a cache object is about to be destroyed
+ * @object: The object to be destroyed
+ *
+ * Note the imminent destruction and deallocation of a cache object record.
+ */
+void fscache_object_destroy(struct fscache_object *object)
+{
+	fscache_objlist_remove(object);
+
+	/* We can get rid of the cookie now */
+	fscache_cookie_put(object->cookie);
+	object->cookie = NULL;
+}
+EXPORT_SYMBOL(fscache_object_destroy);
+
 /*
  * enqueue an object for metadata-type processing
  */
@@ -803,7 +784,7 @@
 
 /**
  * fscache_object_sleep_till_congested - Sleep until object wq is congested
- * @timoutp: Scheduler sleep timeout
+ * @timeoutp: Scheduler sleep timeout
  *
  * Allow an object handler to sleep until the object workqueue is congested.
  *
@@ -831,18 +812,21 @@
 EXPORT_SYMBOL_GPL(fscache_object_sleep_till_congested);
 
 /*
- * enqueue the dependents of an object for metadata-type processing
- * - the caller must hold the object's lock
- * - this may cause an already locked object to wind up being processed again
+ * Enqueue the dependents of an object for metadata-type processing.
+ *
+ * If we don't manage to finish the list before the scheduler wants to run
+ * again then return false immediately.  We return true if the list was
+ * cleared.
  */
-static void fscache_enqueue_dependents(struct fscache_object *object)
+static bool fscache_enqueue_dependents(struct fscache_object *object, int event)
 {
 	struct fscache_object *dep;
+	bool ret = true;
 
 	_enter("{OBJ%x}", object->debug_id);
 
 	if (list_empty(&object->dependents))
-		return;
+		return true;
 
 	spin_lock(&object->lock);
 
@@ -851,23 +835,23 @@
 				 struct fscache_object, dep_link);
 		list_del_init(&dep->dep_link);
 
-
-		/* sort onto appropriate lists */
-		fscache_enqueue_object(dep);
+		fscache_raise_event(dep, event);
 		fscache_put_object(dep);
 
-		if (!list_empty(&object->dependents))
-			cond_resched_lock(&object->lock);
+		if (!list_empty(&object->dependents) && need_resched()) {
+			ret = false;
+			break;
+		}
 	}
 
 	spin_unlock(&object->lock);
+	return ret;
 }
 
 /*
  * remove an object from whatever queue it's waiting on
- * - the caller must hold object->lock
  */
-void fscache_dequeue_object(struct fscache_object *object)
+static void fscache_dequeue_object(struct fscache_object *object)
 {
 	_enter("{OBJ%x}", object->debug_id);
 
@@ -886,7 +870,10 @@
  * @data: The auxiliary data for the object
  * @datalen: The size of the auxiliary data
  *
- * This function consults the netfs about the coherency state of an object
+ * This function consults the netfs about the coherency state of an object.
+ * The caller must be holding a ref on cookie->n_active (held by
+ * fscache_look_up_object() on behalf of the cache backend during object lookup
+ * and creation).
  */
 enum fscache_checkaux fscache_check_aux(struct fscache_object *object,
 					const void *data, uint16_t datalen)
@@ -927,12 +914,23 @@
 /*
  * Asynchronously invalidate an object.
  */
-static void fscache_invalidate_object(struct fscache_object *object)
+static const struct fscache_state *_fscache_invalidate_object(struct fscache_object *object,
+							      int event)
 {
 	struct fscache_operation *op;
 	struct fscache_cookie *cookie = object->cookie;
 
-	_enter("{OBJ%x}", object->debug_id);
+	_enter("{OBJ%x},%d", object->debug_id, event);
+
+	/* We're going to need the cookie.  If the cookie is not available then
+	 * retire the object instead.
+	 */
+	if (!fscache_use_cookie(object)) {
+		ASSERT(object->cookie->stores.rnode == NULL);
+		set_bit(FSCACHE_COOKIE_RETIRED, &cookie->flags);
+		_leave(" [no cookie]");
+		return transit_to(KILL_OBJECT);
+	}
 
 	/* Reject any new read/write ops and abort any that are pending. */
 	fscache_invalidate_writes(cookie);
@@ -941,14 +939,13 @@
 
 	/* Now we have to wait for in-progress reads and writes */
 	op = kzalloc(sizeof(*op), GFP_KERNEL);
-	if (!op) {
-		fscache_raise_event(object, FSCACHE_OBJECT_EV_ERROR);
-		_leave(" [ENOMEM]");
-		return;
-	}
+	if (!op)
+		goto nomem;
 
 	fscache_operation_init(op, object->cache->ops->invalidate_object, NULL);
-	op->flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_EXCLUSIVE);
+	op->flags = FSCACHE_OP_ASYNC |
+		(1 << FSCACHE_OP_EXCLUSIVE) |
+		(1 << FSCACHE_OP_UNUSE_COOKIE);
 
 	spin_lock(&cookie->lock);
 	if (fscache_submit_exclusive_op(object, op) < 0)
@@ -965,13 +962,50 @@
 	/* We can allow read and write requests to come in once again.  They'll
 	 * queue up behind our exclusive invalidation operation.
 	 */
-	fscache_invalidation_complete(cookie);
-	_leave("");
-	return;
+	if (test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags))
+		wake_up_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING);
+	_leave(" [ok]");
+	return transit_to(UPDATE_OBJECT);
+
+nomem:
+	clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags);
+	fscache_unuse_cookie(object);
+	_leave(" [ENOMEM]");
+	return transit_to(KILL_OBJECT);
 
 submit_op_failed:
+	clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags);
 	spin_unlock(&cookie->lock);
 	kfree(op);
-	fscache_raise_event(object, FSCACHE_OBJECT_EV_ERROR);
 	_leave(" [EIO]");
+	return transit_to(KILL_OBJECT);
+}
+
+static const struct fscache_state *fscache_invalidate_object(struct fscache_object *object,
+							     int event)
+{
+	const struct fscache_state *s;
+
+	fscache_stat(&fscache_n_invalidates_run);
+	fscache_stat(&fscache_n_cop_invalidate_object);
+	s = _fscache_invalidate_object(object, event);
+	fscache_stat_d(&fscache_n_cop_invalidate_object);
+	return s;
+}
+
+/*
+ * Asynchronously update an object.
+ */
+static const struct fscache_state *fscache_update_object(struct fscache_object *object,
+							 int event)
+{
+	_enter("{OBJ%x},%d", object->debug_id, event);
+
+	fscache_stat(&fscache_n_updates_run);
+	fscache_stat(&fscache_n_cop_update_object);
+	object->cache->ops->update_object(object);
+	fscache_stat_d(&fscache_n_cop_update_object);
+
+	_leave("");
+	return transit_to(WAIT_FOR_CMD);
 }

diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index 762a9ec..318071a 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c

@@ -35,7 +35,7 @@
 
 	ASSERT(list_empty(&op->pend_link));
 	ASSERT(op->processor != NULL);
-	ASSERTCMP(op->object->state, >=, FSCACHE_OBJECT_AVAILABLE);
+	ASSERT(fscache_object_is_available(op->object));
 	ASSERTCMP(atomic_read(&op->usage), >, 0);
 	ASSERTCMP(op->state, ==, FSCACHE_OP_ST_IN_PROGRESS);
 
@@ -119,7 +119,7 @@
 		/* need to issue a new write op after this */
 		clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags);
 		ret = 0;
-	} else if (object->state == FSCACHE_OBJECT_CREATING) {
+	} else if (test_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) {
 		op->object = object;
 		object->n_ops++;
 		object->n_exclusive++;	/* reads and writes must wait */
@@ -144,7 +144,7 @@
  */
 static void fscache_report_unexpected_submission(struct fscache_object *object,
 						 struct fscache_operation *op,
-						 unsigned long ostate)
+						 const struct fscache_state *ostate)
 {
 	static bool once_only;
 	struct fscache_operation *p;
@@ -155,11 +155,8 @@
 	once_only = true;
 
 	kdebug("unexpected submission OP%x [OBJ%x %s]",
-	       op->debug_id, object->debug_id,
-	       fscache_object_states[object->state]);
-	kdebug("objstate=%s [%s]",
-	       fscache_object_states[object->state],
-	       fscache_object_states[ostate]);
+	       op->debug_id, object->debug_id, object->state->name);
+	kdebug("objstate=%s [%s]", object->state->name, ostate->name);
 	kdebug("objflags=%lx", object->flags);
 	kdebug("objevent=%lx [%lx]", object->events, object->event_mask);
 	kdebug("ops=%u inp=%u exc=%u",
@@ -190,7 +187,7 @@
 int fscache_submit_op(struct fscache_object *object,
 		      struct fscache_operation *op)
 {
-	unsigned long ostate;
+	const struct fscache_state *ostate;
 	int ret;
 
 	_enter("{OBJ%x OP%x},{%u}",
@@ -226,16 +223,14 @@
 			fscache_run_op(object, op);
 		}
 		ret = 0;
-	} else if (object->state == FSCACHE_OBJECT_CREATING) {
+	} else if (test_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) {
 		op->object = object;
 		object->n_ops++;
 		atomic_inc(&op->usage);
 		list_add_tail(&op->pend_link, &object->pending_ops);
 		fscache_stat(&fscache_n_op_pend);
 		ret = 0;
-	} else if (object->state == FSCACHE_OBJECT_DYING ||
-		   object->state == FSCACHE_OBJECT_LC_DYING ||
-		   object->state == FSCACHE_OBJECT_WITHDRAWING) {
+	} else if (fscache_object_is_dying(object)) {
 		fscache_stat(&fscache_n_op_rejected);
 		op->state = FSCACHE_OP_ST_CANCELLED;
 		ret = -ENOBUFS;
@@ -265,8 +260,8 @@
 }
 
 /*
- * jump start the operation processing on an object
- * - caller must hold object->lock
+ * Jump start the operation processing on an object.  The caller must hold
+ * object->lock.
  */
 void fscache_start_operations(struct fscache_object *object)
 {
@@ -428,14 +423,10 @@
 
 	object = op->object;
 
-	if (test_bit(FSCACHE_OP_DEC_READ_CNT, &op->flags)) {
-		if (atomic_dec_and_test(&object->n_reads)) {
-			clear_bit(FSCACHE_COOKIE_WAITING_ON_READS,
-				  &object->cookie->flags);
-			wake_up_bit(&object->cookie->flags,
-				    FSCACHE_COOKIE_WAITING_ON_READS);
-		}
-	}
+	if (test_bit(FSCACHE_OP_DEC_READ_CNT, &op->flags))
+		atomic_dec(&object->n_reads);
+	if (test_bit(FSCACHE_OP_UNUSE_COOKIE, &op->flags))
+		fscache_unuse_cookie(object);
 
 	/* now... we may get called with the object spinlock held, so we
 	 * complete the cleanup here only if we can immediately acquire the

diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index ff000e5..d479ab3 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c

@@ -109,7 +109,7 @@
 	 * allocator as the work threads writing to the cache may all end up
 	 * sleeping on memory allocation, so we may need to impose a timeout
 	 * too. */
-	if (!(gfp & __GFP_WAIT)) {
+	if (!(gfp & __GFP_WAIT) || !(gfp & __GFP_FS)) {
 		fscache_stat(&fscache_n_store_vmscan_busy);
 		return false;
 	}
@@ -163,10 +163,12 @@
 
 	fscache_stat(&fscache_n_attr_changed_calls);
 
-	if (fscache_object_is_active(object)) {
+	if (fscache_object_is_active(object) &&
+	    fscache_use_cookie(object)) {
 		fscache_stat(&fscache_n_cop_attr_changed);
 		ret = object->cache->ops->attr_changed(object);
 		fscache_stat_d(&fscache_n_cop_attr_changed);
+		fscache_unuse_cookie(object);
 		if (ret < 0)
 			fscache_abort_object(object);
 	}
@@ -233,7 +235,7 @@
 
 	_enter("{OP%x}", op->op.debug_id);
 
-	ASSERTCMP(op->n_pages, ==, 0);
+	ASSERTCMP(atomic_read(&op->n_pages), ==, 0);
 
 	fscache_hist(fscache_retrieval_histogram, op->start_time);
 	if (op->context)
@@ -246,6 +248,7 @@
  * allocate a retrieval op
  */
 static struct fscache_retrieval *fscache_alloc_retrieval(
+	struct fscache_cookie *cookie,
 	struct address_space *mapping,
 	fscache_rw_complete_t end_io_func,
 	void *context)
@@ -260,7 +263,10 @@
 	}
 
 	fscache_operation_init(&op->op, NULL, fscache_release_retrieval_op);
-	op->op.flags	= FSCACHE_OP_MYTHREAD | (1 << FSCACHE_OP_WAITING);
+	atomic_inc(&cookie->n_active);
+	op->op.flags	= FSCACHE_OP_MYTHREAD |
+		(1UL << FSCACHE_OP_WAITING) |
+		(1UL << FSCACHE_OP_UNUSE_COOKIE);
 	op->mapping	= mapping;
 	op->end_io_func	= end_io_func;
 	op->context	= context;
@@ -310,7 +316,7 @@
 	struct fscache_retrieval *op =
 		container_of(_op, struct fscache_retrieval, op);
 
-	op->n_pages = 0;
+	atomic_set(&op->n_pages, 0);
 }
 
 /*
@@ -394,12 +400,13 @@
 	if (fscache_wait_for_deferred_lookup(cookie) < 0)
 		return -ERESTARTSYS;
 
-	op = fscache_alloc_retrieval(page->mapping, end_io_func, context);
+	op = fscache_alloc_retrieval(cookie, page->mapping,
+				     end_io_func,context);
 	if (!op) {
 		_leave(" = -ENOMEM");
 		return -ENOMEM;
 	}
-	op->n_pages = 1;
+	atomic_set(&op->n_pages, 1);
 
 	spin_lock(&cookie->lock);
 
@@ -408,7 +415,7 @@
 	object = hlist_entry(cookie->backing_objects.first,
 			     struct fscache_object, cookie_link);
 
-	ASSERTCMP(object->state, >, FSCACHE_OBJECT_LOOKING_UP);
+	ASSERT(test_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags));
 
 	atomic_inc(&object->n_reads);
 	__set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags);
@@ -465,6 +472,7 @@
 	atomic_dec(&object->n_reads);
 nobufs_unlock:
 	spin_unlock(&cookie->lock);
+	atomic_dec(&cookie->n_active);
 	kfree(op);
 nobufs:
 	fscache_stat(&fscache_n_retrievals_nobufs);
@@ -522,10 +530,10 @@
 	if (fscache_wait_for_deferred_lookup(cookie) < 0)
 		return -ERESTARTSYS;
 
-	op = fscache_alloc_retrieval(mapping, end_io_func, context);
+	op = fscache_alloc_retrieval(cookie, mapping, end_io_func, context);
 	if (!op)
 		return -ENOMEM;
-	op->n_pages = *nr_pages;
+	atomic_set(&op->n_pages, *nr_pages);
 
 	spin_lock(&cookie->lock);
 
@@ -589,6 +597,7 @@
 	atomic_dec(&object->n_reads);
 nobufs_unlock:
 	spin_unlock(&cookie->lock);
+	atomic_dec(&cookie->n_active);
 	kfree(op);
 nobufs:
 	fscache_stat(&fscache_n_retrievals_nobufs);
@@ -631,10 +640,10 @@
 	if (fscache_wait_for_deferred_lookup(cookie) < 0)
 		return -ERESTARTSYS;
 
-	op = fscache_alloc_retrieval(page->mapping, NULL, NULL);
+	op = fscache_alloc_retrieval(cookie, page->mapping, NULL, NULL);
 	if (!op)
 		return -ENOMEM;
-	op->n_pages = 1;
+	atomic_set(&op->n_pages, 1);
 
 	spin_lock(&cookie->lock);
 
@@ -675,6 +684,7 @@
 
 nobufs_unlock:
 	spin_unlock(&cookie->lock);
+	atomic_dec(&cookie->n_active);
 	kfree(op);
 nobufs:
 	fscache_stat(&fscache_n_allocs_nobufs);
@@ -729,8 +739,9 @@
 		 */
 		spin_unlock(&object->lock);
 		fscache_op_complete(&op->op, false);
-		_leave(" [cancel] op{f=%lx s=%u} obj{s=%u f=%lx}",
-		       _op->flags, _op->state, object->state, object->flags);
+		_leave(" [cancel] op{f=%lx s=%u} obj{s=%s f=%lx}",
+		       _op->flags, _op->state, object->state->short_name,
+		       object->flags);
 		return;
 	}
 
@@ -796,11 +807,16 @@
 
 	_enter("");
 
-	while (spin_lock(&cookie->stores_lock),
-	       n = radix_tree_gang_lookup_tag(&cookie->stores, results, 0,
-					      ARRAY_SIZE(results),
-					      FSCACHE_COOKIE_PENDING_TAG),
-	       n > 0) {
+	for (;;) {
+		spin_lock(&cookie->stores_lock);
+		n = radix_tree_gang_lookup_tag(&cookie->stores, results, 0,
+					       ARRAY_SIZE(results),
+					       FSCACHE_COOKIE_PENDING_TAG);
+		if (n == 0) {
+			spin_unlock(&cookie->stores_lock);
+			break;
+		}
+
 		for (i = n - 1; i >= 0; i--) {
 			page = results[i];
 			radix_tree_delete(&cookie->stores, page->index);
@@ -812,7 +828,6 @@
 			page_cache_release(results[i]);
 	}
 
-	spin_unlock(&cookie->stores_lock);
 	_leave("");
 }
 
@@ -829,14 +844,12 @@
  *  (1) negative lookup, object not yet created (FSCACHE_COOKIE_CREATING is
  *      set)
  *
- *	(a) no writes yet (set FSCACHE_COOKIE_PENDING_FILL and queue deferred
- *	    fill op)
+ *	(a) no writes yet
  *
  *	(b) writes deferred till post-creation (mark page for writing and
  *	    return immediately)
  *
  *  (2) negative lookup, object created, initial fill being made from netfs
- *      (FSCACHE_COOKIE_INITIAL_FILL is set)
  *
  *	(a) fill point not yet reached this page (mark page for writing and
  *          return)
@@ -873,7 +886,9 @@
 
 	fscache_operation_init(&op->op, fscache_write_op,
 			       fscache_release_write_op);
-	op->op.flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_WAITING);
+	op->op.flags = FSCACHE_OP_ASYNC |
+		(1 << FSCACHE_OP_WAITING) |
+		(1 << FSCACHE_OP_UNUSE_COOKIE);
 
 	ret = radix_tree_preload(gfp & ~__GFP_HIGHMEM);
 	if (ret < 0)
@@ -919,6 +934,7 @@
 	op->op.debug_id	= atomic_inc_return(&fscache_op_debug_id);
 	op->store_limit = object->store_limit;
 
+	atomic_inc(&cookie->n_active);
 	if (fscache_submit_op(object, &op->op) < 0)
 		goto submit_failed;
 
@@ -945,6 +961,7 @@
 	return 0;
 
 submit_failed:
+	atomic_dec(&cookie->n_active);
 	spin_lock(&cookie->stores_lock);
 	radix_tree_delete(&cookie->stores, page->index);
 	spin_unlock(&cookie->stores_lock);

diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index f3f783d..0eda527 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c

@@ -14,7 +14,7 @@
 #include <linux/namei.h>
 #include <linux/slab.h>
 
-static bool fuse_use_readdirplus(struct inode *dir, struct file *filp)
+static bool fuse_use_readdirplus(struct inode *dir, struct dir_context *ctx)
 {
 	struct fuse_conn *fc = get_fuse_conn(dir);
 	struct fuse_inode *fi = get_fuse_inode(dir);
@@ -25,7 +25,7 @@
 		return true;
 	if (test_and_clear_bit(FUSE_I_ADVISE_RDPLUS, &fi->state))
 		return true;
-	if (filp->f_pos == 0)
+	if (ctx->pos == 0)
 		return true;
 	return false;
 }
@@ -1165,25 +1165,23 @@
 }
 
 static int parse_dirfile(char *buf, size_t nbytes, struct file *file,
-			 void *dstbuf, filldir_t filldir)
+			 struct dir_context *ctx)
 {
 	while (nbytes >= FUSE_NAME_OFFSET) {
 		struct fuse_dirent *dirent = (struct fuse_dirent *) buf;
 		size_t reclen = FUSE_DIRENT_SIZE(dirent);
-		int over;
 		if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX)
 			return -EIO;
 		if (reclen > nbytes)
 			break;
 
-		over = filldir(dstbuf, dirent->name, dirent->namelen,
-			       file->f_pos, dirent->ino, dirent->type);
-		if (over)
+		if (!dir_emit(ctx, dirent->name, dirent->namelen,
+			       dirent->ino, dirent->type))
 			break;
 
 		buf += reclen;
 		nbytes -= reclen;
-		file->f_pos = dirent->off;
+		ctx->pos = dirent->off;
 	}
 
 	return 0;
@@ -1284,7 +1282,7 @@
 }
 
 static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file,
-			     void *dstbuf, filldir_t filldir, u64 attr_version)
+			     struct dir_context *ctx, u64 attr_version)
 {
 	struct fuse_direntplus *direntplus;
 	struct fuse_dirent *dirent;
@@ -1309,10 +1307,9 @@
 			   we need to send a FORGET for each of those
 			   which we did not link.
 			*/
-			over = filldir(dstbuf, dirent->name, dirent->namelen,
-				       file->f_pos, dirent->ino,
-				       dirent->type);
-			file->f_pos = dirent->off;
+			over = !dir_emit(ctx, dirent->name, dirent->namelen,
+				       dirent->ino, dirent->type);
+			ctx->pos = dirent->off;
 		}
 
 		buf += reclen;
@@ -1326,7 +1323,7 @@
 	return 0;
 }
 
-static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir)
+static int fuse_readdir(struct file *file, struct dir_context *ctx)
 {
 	int plus, err;
 	size_t nbytes;
@@ -1349,17 +1346,17 @@
 		return -ENOMEM;
 	}
 
-	plus = fuse_use_readdirplus(inode, file);
+	plus = fuse_use_readdirplus(inode, ctx);
 	req->out.argpages = 1;
 	req->num_pages = 1;
 	req->pages[0] = page;
 	req->page_descs[0].length = PAGE_SIZE;
 	if (plus) {
 		attr_version = fuse_get_attr_version(fc);
-		fuse_read_fill(req, file, file->f_pos, PAGE_SIZE,
+		fuse_read_fill(req, file, ctx->pos, PAGE_SIZE,
 			       FUSE_READDIRPLUS);
 	} else {
-		fuse_read_fill(req, file, file->f_pos, PAGE_SIZE,
+		fuse_read_fill(req, file, ctx->pos, PAGE_SIZE,
 			       FUSE_READDIR);
 	}
 	fuse_request_send(fc, req);
@@ -1369,11 +1366,11 @@
 	if (!err) {
 		if (plus) {
 			err = parse_dirplusfile(page_address(page), nbytes,
-						file, dstbuf, filldir,
+						file, ctx,
 						attr_version);
 		} else {
 			err = parse_dirfile(page_address(page), nbytes, file,
-					    dstbuf, filldir);
+					    ctx);
 		}
 	}
 
@@ -1886,7 +1883,7 @@
 static const struct file_operations fuse_dir_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
-	.readdir	= fuse_readdir,
+	.iterate	= fuse_readdir,
 	.open		= fuse_dir_open,
 	.release	= fuse_dir_release,
 	.fsync		= fuse_dir_fsync,

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index e570081..35f2810 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c

@@ -2470,13 +2470,16 @@
 		.mode = mode
 	};
 	int err;
+	bool lock_inode = !(mode & FALLOC_FL_KEEP_SIZE) ||
+			   (mode & FALLOC_FL_PUNCH_HOLE);
 
 	if (fc->no_fallocate)
 		return -EOPNOTSUPP;
 
-	if (mode & FALLOC_FL_PUNCH_HOLE) {
+	if (lock_inode) {
 		mutex_lock(&inode->i_mutex);
-		fuse_set_nowrite(inode);
+		if (mode & FALLOC_FL_PUNCH_HOLE)
+			fuse_set_nowrite(inode);
 	}
 
 	req = fuse_get_req_nopages(fc);
@@ -2511,8 +2514,9 @@
 	fuse_invalidate_attr(inode);
 
 out:
-	if (mode & FALLOC_FL_PUNCH_HOLE) {
-		fuse_release_nowrite(inode);
+	if (lock_inode) {
+		if (mode & FALLOC_FL_PUNCH_HOLE)
+			fuse_release_nowrite(inode);
 		mutex_unlock(&inode->i_mutex);
 	}
 

diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 0bad69e..ee48ad3 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c

@@ -110,7 +110,7 @@
 	/* Is the page fully outside i_size? (truncate in progress) */
 	offset = i_size & (PAGE_CACHE_SIZE-1);
 	if (page->index > end_index || (page->index == end_index && !offset)) {
-		page->mapping->a_ops->invalidatepage(page, 0);
+		page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE);
 		goto out;
 	}
 	return 1;
@@ -299,7 +299,8 @@
 
 		/* Is the page fully outside i_size? (truncate in progress) */
 		if (page->index > end_index || (page->index == end_index && !offset)) {
-			page->mapping->a_ops->invalidatepage(page, 0);
+			page->mapping->a_ops->invalidatepage(page, 0,
+							     PAGE_CACHE_SIZE);
 			unlock_page(page);
 			continue;
 		}
@@ -943,27 +944,33 @@
 	unlock_buffer(bh);
 }
 
-static void gfs2_invalidatepage(struct page *page, unsigned long offset)
+static void gfs2_invalidatepage(struct page *page, unsigned int offset,
+				unsigned int length)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host);
+	unsigned int stop = offset + length;
+	int partial_page = (offset || length < PAGE_CACHE_SIZE);
 	struct buffer_head *bh, *head;
 	unsigned long pos = 0;
 
 	BUG_ON(!PageLocked(page));
-	if (offset == 0)
+	if (!partial_page)
 		ClearPageChecked(page);
 	if (!page_has_buffers(page))
 		goto out;
 
 	bh = head = page_buffers(page);
 	do {
+		if (pos + bh->b_size > stop)
+			return;
+
 		if (offset <= pos)
 			gfs2_discard(sdp, bh);
 		pos += bh->b_size;
 		bh = bh->b_this_page;
 	} while (bh != head);
 out:
-	if (offset == 0)
+	if (!partial_page)
 		try_to_release_page(page, 0);
 }
 

diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 93b5809..5e2f56f 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c

@@ -1232,7 +1232,9 @@
 		unstuff = 1;
 	}
 
-	error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT, 0);
+	error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT +
+				 (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF ?
+				  0 : RES_QUOTA), 0);
 	if (error)
 		goto do_grow_release;
 

diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index b631c90..0cb4c155 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c

@@ -1125,13 +1125,14 @@
 	if (IS_ERR(hc))
 		return PTR_ERR(hc);
 
-	h = hc2 = kmalloc(hsize_bytes * 2, GFP_NOFS | __GFP_NOWARN);
+	hc2 = kmalloc(hsize_bytes * 2, GFP_NOFS | __GFP_NOWARN);
 	if (hc2 == NULL)
 		hc2 = __vmalloc(hsize_bytes * 2, GFP_NOFS, PAGE_KERNEL);
 
 	if (!hc2)
 		return -ENOMEM;
 
+	h = hc2;
 	error = gfs2_meta_inode_buffer(dip, &dibh);
 	if (error)
 		goto out_kfree;
@@ -1212,9 +1213,7 @@
 /**
  * do_filldir_main - read out directory entries
  * @dip: The GFS2 inode
- * @offset: The offset in the file to read from
- * @opaque: opaque data to pass to filldir
- * @filldir: The function to pass entries to
+ * @ctx: what to feed the entries to
  * @darr: an array of struct gfs2_dirent pointers to read
  * @entries: the number of entries in darr
  * @copied: pointer to int that's non-zero if an entry has been copied out
@@ -1224,11 +1223,10 @@
  * the possibility that they will fall into different readdir buffers or
  * that someone will want to seek to that location.
  *
- * Returns: errno, >0 on exception from filldir
+ * Returns: errno, >0 if the actor tells you to stop
  */
 
-static int do_filldir_main(struct gfs2_inode *dip, u64 *offset,
-			   void *opaque, filldir_t filldir,
+static int do_filldir_main(struct gfs2_inode *dip, struct dir_context *ctx,
 			   const struct gfs2_dirent **darr, u32 entries,
 			   int *copied)
 {
@@ -1236,7 +1234,6 @@
 	u64 off, off_next;
 	unsigned int x, y;
 	int run = 0;
-	int error = 0;
 
 	sort(darr, entries, sizeof(struct gfs2_dirent *), compare_dents, NULL);
 
@@ -1253,9 +1250,9 @@
 			off_next = be32_to_cpu(dent_next->de_hash);
 			off_next = gfs2_disk_hash2offset(off_next);
 
-			if (off < *offset)
+			if (off < ctx->pos)
 				continue;
-			*offset = off;
+			ctx->pos = off;
 
 			if (off_next == off) {
 				if (*copied && !run)
@@ -1264,26 +1261,25 @@
 			} else
 				run = 0;
 		} else {
-			if (off < *offset)
+			if (off < ctx->pos)
 				continue;
-			*offset = off;
+			ctx->pos = off;
 		}
 
-		error = filldir(opaque, (const char *)(dent + 1),
+		if (!dir_emit(ctx, (const char *)(dent + 1),
 				be16_to_cpu(dent->de_name_len),
-				off, be64_to_cpu(dent->de_inum.no_addr),
-				be16_to_cpu(dent->de_type));
-		if (error)
+				be64_to_cpu(dent->de_inum.no_addr),
+				be16_to_cpu(dent->de_type)))
 			return 1;
 
 		*copied = 1;
 	}
 
-	/* Increment the *offset by one, so the next time we come into the
+	/* Increment the ctx->pos by one, so the next time we come into the
 	   do_filldir fxn, we get the next entry instead of the last one in the
 	   current leaf */
 
-	(*offset)++;
+	ctx->pos++;
 
 	return 0;
 }
@@ -1307,8 +1303,8 @@
 		kfree(ptr);
 }
 
-static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
-			      filldir_t filldir, int *copied, unsigned *depth,
+static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx,
+			      int *copied, unsigned *depth,
 			      u64 leaf_no)
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
@@ -1386,8 +1382,7 @@
 	} while(lfn);
 
 	BUG_ON(entries2 != entries);
-	error = do_filldir_main(ip, offset, opaque, filldir, darr,
-				entries, copied);
+	error = do_filldir_main(ip, ctx, darr, entries, copied);
 out_free:
 	for(i = 0; i < leaf; i++)
 		brelse(larr[i]);
@@ -1446,15 +1441,13 @@
 /**
  * dir_e_read - Reads the entries from a directory into a filldir buffer
  * @dip: dinode pointer
- * @offset: the hash of the last entry read shifted to the right once
- * @opaque: buffer for the filldir function to fill
- * @filldir: points to the filldir function to use
+ * @ctx: actor to feed the entries to
  *
  * Returns: errno
  */
 
-static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
-		      filldir_t filldir, struct file_ra_state *f_ra)
+static int dir_e_read(struct inode *inode, struct dir_context *ctx,
+		      struct file_ra_state *f_ra)
 {
 	struct gfs2_inode *dip = GFS2_I(inode);
 	u32 hsize, len = 0;
@@ -1465,7 +1458,7 @@
 	unsigned depth = 0;
 
 	hsize = 1 << dip->i_depth;
-	hash = gfs2_dir_offset2hash(*offset);
+	hash = gfs2_dir_offset2hash(ctx->pos);
 	index = hash >> (32 - dip->i_depth);
 
 	if (dip->i_hash_cache == NULL)
@@ -1477,7 +1470,7 @@
 	gfs2_dir_readahead(inode, hsize, index, f_ra);
 
 	while (index < hsize) {
-		error = gfs2_dir_read_leaf(inode, offset, opaque, filldir,
+		error = gfs2_dir_read_leaf(inode, ctx,
 					   &copied, &depth,
 					   be64_to_cpu(lp[index]));
 		if (error)
@@ -1492,8 +1485,8 @@
 	return error;
 }
 
-int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
-		  filldir_t filldir, struct file_ra_state *f_ra)
+int gfs2_dir_read(struct inode *inode, struct dir_context *ctx,
+		  struct file_ra_state *f_ra)
 {
 	struct gfs2_inode *dip = GFS2_I(inode);
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
@@ -1507,7 +1500,7 @@
 		return 0;
 
 	if (dip->i_diskflags & GFS2_DIF_EXHASH)
-		return dir_e_read(inode, offset, opaque, filldir, f_ra);
+		return dir_e_read(inode, ctx, f_ra);
 
 	if (!gfs2_is_stuffed(dip)) {
 		gfs2_consist_inode(dip);
@@ -1539,7 +1532,7 @@
 			error = -EIO;
 			goto out;
 		}
-		error = do_filldir_main(dip, offset, opaque, filldir, darr,
+		error = do_filldir_main(dip, ctx, darr,
 					dip->i_entries, &copied);
 out:
 		kfree(darr);
@@ -1555,9 +1548,9 @@
 
 /**
  * gfs2_dir_search - Search a directory
- * @dip: The GFS2 inode
- * @filename:
- * @inode:
+ * @dip: The GFS2 dir inode
+ * @name: The name we are looking up
+ * @fail_on_exist: Fail if the name exists rather than looking it up
  *
  * This routine searches a directory for a file or another directory.
  * Assumes a glock is held on dip.
@@ -1565,22 +1558,25 @@
  * Returns: The inode on success, or an ERR_PTR() encoded errno on failure
  */
 
-struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name)
+struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name,
+			      bool fail_on_exist)
 {
 	struct buffer_head *bh;
 	struct gfs2_dirent *dent;
-	struct inode *inode;
+	u64 addr, formal_ino;
+	u16 dtype;
 
 	dent = gfs2_dirent_search(dir, name, gfs2_dirent_find, &bh);
 	if (dent) {
 		if (IS_ERR(dent))
 			return ERR_CAST(dent);
-		inode = gfs2_inode_lookup(dir->i_sb, 
-				be16_to_cpu(dent->de_type),
-				be64_to_cpu(dent->de_inum.no_addr),
-				be64_to_cpu(dent->de_inum.no_formal_ino), 0);
+		dtype = be16_to_cpu(dent->de_type);
+		addr = be64_to_cpu(dent->de_inum.no_addr);
+		formal_ino = be64_to_cpu(dent->de_inum.no_formal_ino);
 		brelse(bh);
-		return inode;
+		if (fail_on_exist)
+			return ERR_PTR(-EEXIST);
+		return gfs2_inode_lookup(dir->i_sb, dtype, addr, formal_ino, 0);
 	}
 	return ERR_PTR(-ENOENT);
 }

diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h
index 98c960b..4f03bbd 100644
--- a/fs/gfs2/dir.h
+++ b/fs/gfs2/dir.h

@@ -18,14 +18,15 @@
 struct gfs2_inum;
 
 extern struct inode *gfs2_dir_search(struct inode *dir,
-				     const struct qstr *filename);
+				     const struct qstr *filename,
+				     bool fail_on_exist);
 extern int gfs2_dir_check(struct inode *dir, const struct qstr *filename,
 			  const struct gfs2_inode *ip);
 extern int gfs2_dir_add(struct inode *inode, const struct qstr *filename,
 			const struct gfs2_inode *ip);
 extern int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry);
-extern int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
-			 filldir_t filldir, struct file_ra_state *f_ra);
+extern int gfs2_dir_read(struct inode *inode, struct dir_context *ctx,
+			 struct file_ra_state *f_ra);
 extern int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
 			  const struct gfs2_inode *nip, unsigned int new_type);
 

diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index 9973df4..8b9b377 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c

@@ -64,6 +64,7 @@
 }
 
 struct get_name_filldir {
+	struct dir_context ctx;
 	struct gfs2_inum_host inum;
 	char *name;
 };
@@ -88,9 +89,11 @@
 	struct inode *dir = parent->d_inode;
 	struct inode *inode = child->d_inode;
 	struct gfs2_inode *dip, *ip;
-	struct get_name_filldir gnfd;
+	struct get_name_filldir gnfd = {
+		.ctx.actor = get_name_filldir,
+		.name = name
+	};
 	struct gfs2_holder gh;
-	u64 offset = 0;
 	int error;
 	struct file_ra_state f_ra = { .start = 0 };
 
@@ -106,13 +109,12 @@
 	*name = 0;
 	gnfd.inum.no_addr = ip->i_no_addr;
 	gnfd.inum.no_formal_ino = ip->i_no_formal_ino;
-	gnfd.name = name;
 
 	error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &gh);
 	if (error)
 		return error;
 
-	error = gfs2_dir_read(dir, &offset, &gnfd, get_name_filldir, &f_ra);
+	error = gfs2_dir_read(dir, &gnfd.ctx, &f_ra);
 
 	gfs2_glock_dq_uninit(&gh);
 

diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index ad0dc38..f99f9e8 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c

@@ -82,35 +82,28 @@
 }
 
 /**
- * gfs2_readdir - Read directory entries from a directory
+ * gfs2_readdir - Iterator for a directory
  * @file: The directory to read from
- * @dirent: Buffer for dirents
- * @filldir: Function used to do the copying
+ * @ctx: What to feed directory entries to
  *
  * Returns: errno
  */
 
-static int gfs2_readdir(struct file *file, void *dirent, filldir_t filldir)
+static int gfs2_readdir(struct file *file, struct dir_context *ctx)
 {
 	struct inode *dir = file->f_mapping->host;
 	struct gfs2_inode *dip = GFS2_I(dir);
 	struct gfs2_holder d_gh;
-	u64 offset = file->f_pos;
 	int error;
 
-	gfs2_holder_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
-	error = gfs2_glock_nq(&d_gh);
-	if (error) {
-		gfs2_holder_uninit(&d_gh);
+	error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
+	if (error)
 		return error;
-	}
 
-	error = gfs2_dir_read(dir, &offset, dirent, filldir, &file->f_ra);
+	error = gfs2_dir_read(dir, ctx, &file->f_ra);
 
 	gfs2_glock_dq_uninit(&d_gh);
 
-	file->f_pos = offset;
-
 	return error;
 }
 
@@ -538,10 +531,51 @@
 }
 
 /**
+ * gfs2_open_common - This is common to open and atomic_open
+ * @inode: The inode being opened
+ * @file: The file being opened
+ *
+ * This may be called under a glock or not depending upon how it has
+ * been called. We must always be called under a glock for regular
+ * files, however. For other file types, it does not matter whether
+ * we hold the glock or not.
+ *
+ * Returns: Error code or 0 for success
+ */
+
+int gfs2_open_common(struct inode *inode, struct file *file)
+{
+	struct gfs2_file *fp;
+	int ret;
+
+	if (S_ISREG(inode->i_mode)) {
+		ret = generic_file_open(inode, file);
+		if (ret)
+			return ret;
+	}
+
+	fp = kzalloc(sizeof(struct gfs2_file), GFP_NOFS);
+	if (!fp)
+		return -ENOMEM;
+
+	mutex_init(&fp->f_fl_mutex);
+
+	gfs2_assert_warn(GFS2_SB(inode), !file->private_data);
+	file->private_data = fp;
+	return 0;
+}
+
+/**
  * gfs2_open - open a file
  * @inode: the inode to open
  * @file: the struct file for this opening
  *
+ * After atomic_open, this function is only used for opening files
+ * which are already cached. We must still get the glock for regular
+ * files to ensure that we have the file size uptodate for the large
+ * file check which is in the common code. That is only an issue for
+ * regular files though.
+ *
  * Returns: errno
  */
 
@@ -549,40 +583,22 @@
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_holder i_gh;
-	struct gfs2_file *fp;
 	int error;
-
-	fp = kzalloc(sizeof(struct gfs2_file), GFP_KERNEL);
-	if (!fp)
-		return -ENOMEM;
-
-	mutex_init(&fp->f_fl_mutex);
-
-	gfs2_assert_warn(GFS2_SB(inode), !file->private_data);
-	file->private_data = fp;
+	bool need_unlock = false;
 
 	if (S_ISREG(ip->i_inode.i_mode)) {
 		error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY,
 					   &i_gh);
 		if (error)
-			goto fail;
-
-		if (!(file->f_flags & O_LARGEFILE) &&
-		    i_size_read(inode) > MAX_NON_LFS) {
-			error = -EOVERFLOW;
-			goto fail_gunlock;
-		}
-
-		gfs2_glock_dq_uninit(&i_gh);
+			return error;
+		need_unlock = true;
 	}
 
-	return 0;
+	error = gfs2_open_common(inode, file);
 
-fail_gunlock:
-	gfs2_glock_dq_uninit(&i_gh);
-fail:
-	file->private_data = NULL;
-	kfree(fp);
+	if (need_unlock)
+		gfs2_glock_dq_uninit(&i_gh);
+
 	return error;
 }
 
@@ -1048,7 +1064,7 @@
 };
 
 const struct file_operations gfs2_dir_fops = {
-	.readdir	= gfs2_readdir,
+	.iterate	= gfs2_readdir,
 	.unlocked_ioctl	= gfs2_ioctl,
 	.open		= gfs2_open,
 	.release	= gfs2_release,
@@ -1078,7 +1094,7 @@
 };
 
 const struct file_operations gfs2_dir_fops_nolock = {
-	.readdir	= gfs2_readdir,
+	.iterate	= gfs2_readdir,
 	.unlocked_ioctl	= gfs2_ioctl,
 	.open		= gfs2_open,
 	.release	= gfs2_release,

diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index c66e99c..5f2e522 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c

@@ -54,7 +54,6 @@
 	struct gfs2_bufdata *bd, *tmp;
 	struct buffer_head *bh;
 	const unsigned long b_state = (1UL << BH_Dirty)|(1UL << BH_Pinned)|(1UL << BH_Lock);
-	sector_t blocknr;
 
 	gfs2_log_lock(sdp);
 	spin_lock(&sdp->sd_ail_lock);
@@ -65,13 +64,6 @@
 				continue;
 			gfs2_ail_error(gl, bh);
 		}
-		blocknr = bh->b_blocknr;
-		bh->b_private = NULL;
-		gfs2_remove_from_ail(bd); /* drops ref on bh */
-
-		bd->bd_bh = NULL;
-		bd->bd_blkno = blocknr;
-
 		gfs2_trans_add_revoke(sdp, bd);
 	}
 	GLOCK_BUG_ON(gl, !fsync && atomic_read(&gl->gl_ail_count));

diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 62b484e..bbb2715 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c

@@ -313,7 +313,7 @@
 			goto out;
 	}
 
-	inode = gfs2_dir_search(dir, name);
+	inode = gfs2_dir_search(dir, name, false);
 	if (IS_ERR(inode))
 		error = PTR_ERR(inode);
 out:
@@ -346,17 +346,6 @@
 	if (!dip->i_inode.i_nlink)
 		return -ENOENT;
 
-	error = gfs2_dir_check(&dip->i_inode, name, NULL);
-	switch (error) {
-	case -ENOENT:
-		error = 0;
-		break;
-	case 0:
-		return -EEXIST;
-	default:
-		return error;
-	}
-
 	if (dip->i_entries == (u32)-1)
 		return -EFBIG;
 	if (S_ISDIR(mode) && dip->i_inode.i_nlink == (u32)-1)
@@ -546,6 +535,7 @@
  * gfs2_create_inode - Create a new inode
  * @dir: The parent directory
  * @dentry: The new dentry
+ * @file: If non-NULL, the file which is being opened
  * @mode: The permissions on the new inode
  * @dev: For device nodes, this is the device number
  * @symname: For symlinks, this is the link destination
@@ -555,8 +545,9 @@
  */
 
 static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
+			     struct file *file,
 			     umode_t mode, dev_t dev, const char *symname,
-			     unsigned int size, int excl)
+			     unsigned int size, int excl, int *opened)
 {
 	const struct qstr *name = &dentry->d_name;
 	struct gfs2_holder ghs[2];
@@ -564,6 +555,7 @@
 	struct gfs2_inode *dip = GFS2_I(dir), *ip;
 	struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
 	struct gfs2_glock *io_gl;
+	struct dentry *d;
 	int error;
 	u32 aflags = 0;
 	int arq;
@@ -584,15 +576,30 @@
 		goto fail;
 
 	error = create_ok(dip, name, mode);
-	if ((error == -EEXIST) && S_ISREG(mode) && !excl) {
-		inode = gfs2_lookupi(dir, &dentry->d_name, 0);
-		gfs2_glock_dq_uninit(ghs);
-		d_instantiate(dentry, inode);
-		return IS_ERR(inode) ? PTR_ERR(inode) : 0;
-	}
 	if (error)
 		goto fail_gunlock;
 
+	inode = gfs2_dir_search(dir, &dentry->d_name, !S_ISREG(mode) || excl);
+	error = PTR_ERR(inode);
+	if (!IS_ERR(inode)) {
+		d = d_splice_alias(inode, dentry);
+		error = 0;
+		if (file && !IS_ERR(d)) {
+			if (d == NULL)
+				d = dentry;
+			if (S_ISREG(inode->i_mode))
+				error = finish_open(file, d, gfs2_open_common, opened);
+			else
+				error = finish_no_open(file, d);
+		}
+		gfs2_glock_dq_uninit(ghs);
+		if (IS_ERR(d))
+			return PTR_RET(d);
+		return error;
+	} else if (error != -ENOENT) {
+		goto fail_gunlock;
+	}
+
 	arq = error = gfs2_diradd_alloc_required(dir, name);
 	if (error < 0)
 		goto fail_gunlock;
@@ -686,10 +693,12 @@
 		goto fail_gunlock3;
 
 	mark_inode_dirty(inode);
+	d_instantiate(dentry, inode);
+	if (file)
+		error = finish_open(file, dentry, gfs2_open_common, opened);
 	gfs2_glock_dq_uninit(ghs);
 	gfs2_glock_dq_uninit(ghs + 1);
-	d_instantiate(dentry, inode);
-	return 0;
+	return error;
 
 fail_gunlock3:
 	gfs2_glock_dq_uninit(ghs + 1);
@@ -729,36 +738,56 @@
 static int gfs2_create(struct inode *dir, struct dentry *dentry,
 		       umode_t mode, bool excl)
 {
-	return gfs2_create_inode(dir, dentry, S_IFREG | mode, 0, NULL, 0, excl);
+	return gfs2_create_inode(dir, dentry, NULL, S_IFREG | mode, 0, NULL, 0, excl, NULL);
 }
 
 /**
- * gfs2_lookup - Look up a filename in a directory and return its inode
+ * __gfs2_lookup - Look up a filename in a directory and return its inode
  * @dir: The directory inode
  * @dentry: The dentry of the new inode
- * @nd: passed from Linux VFS, ignored by us
+ * @file: File to be opened
+ * @opened: atomic_open flags
  *
- * Called by the VFS layer. Lock dir and call gfs2_lookupi()
  *
  * Returns: NULL, the d_splice_alias() dentry, or an ERR_PTR() on error
  */
 
-static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry,
-				  unsigned int flags)
+static struct dentry *__gfs2_lookup(struct inode *dir, struct dentry *dentry,
+				    struct file *file, int *opened)
 {
-	struct inode *inode = gfs2_lookupi(dir, &dentry->d_name, 0);
-	if (inode && !IS_ERR(inode)) {
-		struct gfs2_glock *gl = GFS2_I(inode)->i_gl;
-		struct gfs2_holder gh;
-		int error;
-		error = gfs2_glock_nq_init(gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
-		if (error) {
-			iput(inode);
-			return ERR_PTR(error);
-		}
-		gfs2_glock_dq_uninit(&gh);
+	struct inode *inode;
+	struct dentry *d;
+	struct gfs2_holder gh;
+	struct gfs2_glock *gl;
+	int error;
+
+	inode = gfs2_lookupi(dir, &dentry->d_name, 0);
+	if (!inode)
+		return NULL;
+	if (IS_ERR(inode))
+		return ERR_CAST(inode);
+
+	gl = GFS2_I(inode)->i_gl;
+	error = gfs2_glock_nq_init(gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
+	if (error) {
+		iput(inode);
+		return ERR_PTR(error);
 	}
-	return d_splice_alias(inode, dentry);
+
+	d = d_splice_alias(inode, dentry);
+	if (file && S_ISREG(inode->i_mode))
+		error = finish_open(file, dentry, gfs2_open_common, opened);
+
+	gfs2_glock_dq_uninit(&gh);
+	if (error)
+		return ERR_PTR(error);
+	return d;
+}
+
+static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry,
+				  unsigned flags)
+{
+	return __gfs2_lookup(dir, dentry, NULL, NULL);
 }
 
 /**
@@ -1076,7 +1105,7 @@
 	if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode) - 1)
 		return -ENAMETOOLONG;
 
-	return gfs2_create_inode(dir, dentry, S_IFLNK | S_IRWXUGO, 0, symname, size, 0);
+	return gfs2_create_inode(dir, dentry, NULL, S_IFLNK | S_IRWXUGO, 0, symname, size, 0, NULL);
 }
 
 /**
@@ -1092,7 +1121,7 @@
 {
 	struct gfs2_sbd *sdp = GFS2_SB(dir);
 	unsigned dsize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode);
-	return gfs2_create_inode(dir, dentry, S_IFDIR | mode, 0, NULL, dsize, 0);
+	return gfs2_create_inode(dir, dentry, NULL, S_IFDIR | mode, 0, NULL, dsize, 0, NULL);
 }
 
 /**
@@ -1107,7 +1136,43 @@
 static int gfs2_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
 		      dev_t dev)
 {
-	return gfs2_create_inode(dir, dentry, mode, dev, NULL, 0, 0);
+	return gfs2_create_inode(dir, dentry, NULL, mode, dev, NULL, 0, 0, NULL);
+}
+
+/**
+ * gfs2_atomic_open - Atomically open a file
+ * @dir: The directory
+ * @dentry: The proposed new entry
+ * @file: The proposed new struct file
+ * @flags: open flags
+ * @mode: File mode
+ * @opened: Flag to say whether the file has been opened or not
+ *
+ * Returns: error code or 0 for success
+ */
+
+static int gfs2_atomic_open(struct inode *dir, struct dentry *dentry,
+                            struct file *file, unsigned flags,
+                            umode_t mode, int *opened)
+{
+	struct dentry *d;
+	bool excl = !!(flags & O_EXCL);
+
+	d = __gfs2_lookup(dir, dentry, file, opened);
+	if (IS_ERR(d))
+		return PTR_ERR(d);
+	if (d == NULL)
+		d = dentry;
+	if (d->d_inode) {
+		if (!(*opened & FILE_OPENED))
+			return finish_no_open(file, d);
+		return 0;
+	}
+
+	if (!(flags & O_CREAT))
+		return -ENOENT;
+
+	return gfs2_create_inode(dir, dentry, file, S_IFREG | mode, 0, NULL, 0, excl, opened);
 }
 
 /*
@@ -1787,6 +1852,7 @@
 	.removexattr = gfs2_removexattr,
 	.fiemap = gfs2_fiemap,
 	.get_acl = gfs2_get_acl,
+	.atomic_open = gfs2_atomic_open,
 };
 
 const struct inode_operations gfs2_symlink_iops = {

diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index c53c747..ba4d949 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h

@@ -109,6 +109,7 @@
 extern int gfs2_setattr_simple(struct inode *inode, struct iattr *attr);
 extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
 extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf);
+extern int gfs2_open_common(struct inode *inode, struct file *file);
 
 extern const struct inode_operations gfs2_file_iops;
 extern const struct inode_operations gfs2_dir_iops;

diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index b404f48..610613f 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c

@@ -211,15 +211,16 @@
 static int gfs2_ail1_empty(struct gfs2_sbd *sdp)
 {
 	struct gfs2_trans *tr, *s;
+	int oldest_tr = 1;
 	int ret;
 
 	spin_lock(&sdp->sd_ail_lock);
 	list_for_each_entry_safe_reverse(tr, s, &sdp->sd_ail1_list, tr_list) {
 		gfs2_ail1_empty_one(sdp, tr);
-		if (list_empty(&tr->tr_ail1_list))
+		if (list_empty(&tr->tr_ail1_list) && oldest_tr)
 			list_move(&tr->tr_list, &sdp->sd_ail2_list);
 		else
-			break;
+			oldest_tr = 0;
 	}
 	ret = list_empty(&sdp->sd_ail1_list);
 	spin_unlock(&sdp->sd_ail_lock);
@@ -317,7 +318,7 @@
 
 int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks)
 {
-	unsigned reserved_blks = 6 * (4096 / sdp->sd_vfs->s_blocksize);
+	unsigned reserved_blks = 7 * (4096 / sdp->sd_vfs->s_blocksize);
 	unsigned wanted = blks + reserved_blks;
 	DEFINE_WAIT(wait);
 	int did_wait = 0;
@@ -545,6 +546,76 @@
 	spin_unlock(&sdp->sd_ordered_lock);
 }
 
+void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
+{
+	struct buffer_head *bh = bd->bd_bh;
+	struct gfs2_glock *gl = bd->bd_gl;
+
+	gfs2_remove_from_ail(bd);
+	bd->bd_bh = NULL;
+	bh->b_private = NULL;
+	bd->bd_blkno = bh->b_blocknr;
+	bd->bd_ops = &gfs2_revoke_lops;
+	sdp->sd_log_num_revoke++;
+	atomic_inc(&gl->gl_revokes);
+	set_bit(GLF_LFLUSH, &gl->gl_flags);
+	list_add(&bd->bd_list, &sdp->sd_log_le_revoke);
+}
+
+void gfs2_write_revokes(struct gfs2_sbd *sdp)
+{
+	struct gfs2_trans *tr;
+	struct gfs2_bufdata *bd, *tmp;
+	int have_revokes = 0;
+	int max_revokes = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_log_descriptor)) / sizeof(u64);
+
+	gfs2_ail1_empty(sdp);
+	spin_lock(&sdp->sd_ail_lock);
+	list_for_each_entry(tr, &sdp->sd_ail1_list, tr_list) {
+		list_for_each_entry(bd, &tr->tr_ail2_list, bd_ail_st_list) {
+			if (list_empty(&bd->bd_list)) {
+				have_revokes = 1;
+				goto done;
+			}
+		}
+	}
+done:
+	spin_unlock(&sdp->sd_ail_lock);
+	if (have_revokes == 0)
+		return;
+	while (sdp->sd_log_num_revoke > max_revokes)
+		max_revokes += (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header)) / sizeof(u64);
+	max_revokes -= sdp->sd_log_num_revoke;
+	if (!sdp->sd_log_num_revoke) {
+		atomic_dec(&sdp->sd_log_blks_free);
+		/* If no blocks have been reserved, we need to also
+		 * reserve a block for the header */
+		if (!sdp->sd_log_blks_reserved)
+			atomic_dec(&sdp->sd_log_blks_free);
+	}
+	gfs2_log_lock(sdp);
+	spin_lock(&sdp->sd_ail_lock);
+	list_for_each_entry(tr, &sdp->sd_ail1_list, tr_list) {
+		list_for_each_entry_safe(bd, tmp, &tr->tr_ail2_list, bd_ail_st_list) {
+			if (max_revokes == 0)
+				goto out_of_blocks;
+			if (!list_empty(&bd->bd_list))
+				continue;
+			gfs2_add_revoke(sdp, bd);
+			max_revokes--;
+		}
+	}
+out_of_blocks:
+	spin_unlock(&sdp->sd_ail_lock);
+	gfs2_log_unlock(sdp);
+
+	if (!sdp->sd_log_num_revoke) {
+		atomic_inc(&sdp->sd_log_blks_free);
+		if (!sdp->sd_log_blks_reserved)
+			atomic_inc(&sdp->sd_log_blks_free);
+	}
+}
+
 /**
  * log_write_header - Get and initialize a journal header buffer
  * @sdp: The GFS2 superblock
@@ -562,7 +633,6 @@
 	lh = page_address(page);
 	clear_page(lh);
 
-	gfs2_ail1_empty(sdp);
 	tail = current_tail(sdp);
 
 	lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC);

diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index 3566f35..3721663 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h

@@ -72,5 +72,7 @@
 extern void gfs2_log_shutdown(struct gfs2_sbd *sdp);
 extern void gfs2_meta_syncfs(struct gfs2_sbd *sdp);
 extern int gfs2_logd(void *data);
+extern void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd);
+extern void gfs2_write_revokes(struct gfs2_sbd *sdp);
 
 #endif /* __LOG_DOT_H__ */

diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 6c33d7b..17c5b5d 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c

@@ -16,6 +16,7 @@
 #include <linux/gfs2_ondisk.h>
 #include <linux/bio.h>
 #include <linux/fs.h>
+#include <linux/list_sort.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -401,6 +402,20 @@
 	kunmap_atomic(kaddr);
 }
 
+static int blocknr_cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+	struct gfs2_bufdata *bda, *bdb;
+
+	bda = list_entry(a, struct gfs2_bufdata, bd_list);
+	bdb = list_entry(b, struct gfs2_bufdata, bd_list);
+
+	if (bda->bd_bh->b_blocknr < bdb->bd_bh->b_blocknr)
+		return -1;
+	if (bda->bd_bh->b_blocknr > bdb->bd_bh->b_blocknr)
+		return 1;
+	return 0;
+}
+
 static void gfs2_before_commit(struct gfs2_sbd *sdp, unsigned int limit,
 				unsigned int total, struct list_head *blist,
 				bool is_databuf)
@@ -413,6 +428,7 @@
 	__be64 *ptr;
 
 	gfs2_log_lock(sdp);
+	list_sort(NULL, blist, blocknr_cmp);
 	bd1 = bd2 = list_prepare_entry(bd1, blist, bd_list);
 	while(total) {
 		num = total;
@@ -590,6 +606,7 @@
 	struct page *page;
 	unsigned int length;
 
+	gfs2_write_revokes(sdp);
 	if (!sdp->sd_log_num_revoke)
 		return;
 
@@ -836,10 +853,6 @@
 	.lo_name = "revoke",
 };
 
-const struct gfs2_log_operations gfs2_rg_lops = {
-	.lo_name = "rg",
-};
-
 const struct gfs2_log_operations gfs2_databuf_lops = {
 	.lo_before_commit = databuf_lo_before_commit,
 	.lo_after_commit = databuf_lo_after_commit,
@@ -851,7 +864,6 @@
 const struct gfs2_log_operations *gfs2_log_ops[] = {
 	&gfs2_databuf_lops,
 	&gfs2_buf_lops,
-	&gfs2_rg_lops,
 	&gfs2_revoke_lops,
 	NULL,
 };

diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h
index 87e062e..9ca2e64 100644
--- a/fs/gfs2/lops.h
+++ b/fs/gfs2/lops.h

@@ -23,7 +23,6 @@
 extern const struct gfs2_log_operations gfs2_glock_lops;
 extern const struct gfs2_log_operations gfs2_buf_lops;
 extern const struct gfs2_log_operations gfs2_revoke_lops;
-extern const struct gfs2_log_operations gfs2_rg_lops;
 extern const struct gfs2_log_operations gfs2_databuf_lops;
 
 extern const struct gfs2_log_operations *gfs2_log_ops[];

diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 1a89afb..0da3906 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c

@@ -296,10 +296,6 @@
 	if (bd) {
 		spin_lock(&sdp->sd_ail_lock);
 		if (bd->bd_tr) {
-			gfs2_remove_from_ail(bd);
-			bh->b_private = NULL;
-			bd->bd_bh = NULL;
-			bd->bd_blkno = bh->b_blocknr;
 			gfs2_trans_add_revoke(sdp, bd);
 		}
 		spin_unlock(&sdp->sd_ail_lock);

diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 60ede2a..0262c19 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c

@@ -916,16 +916,16 @@
 		goto fail_quotad;
 
 	p = kthread_run(gfs2_logd, sdp, "gfs2_logd");
-	error = IS_ERR(p);
-	if (error) {
+	if (IS_ERR(p)) {
+		error = PTR_ERR(p);
 		fs_err(sdp, "can't start logd thread: %d\n", error);
 		return error;
 	}
 	sdp->sd_logd_process = p;
 
 	p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad");
-	error = IS_ERR(p);
-	if (error) {
+	if (IS_ERR(p)) {
+		error = PTR_ERR(p);
 		fs_err(sdp, "can't start quotad thread: %d\n", error);
 		goto fail;
 	}

diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index c253b13..3768c2f 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c

@@ -1154,11 +1154,6 @@
 	return error;
 }
 
-static int gfs2_quota_sync_timeo(struct super_block *sb, int type)
-{
-	return gfs2_quota_sync(sb, type);
-}
-
 int gfs2_quota_refresh(struct gfs2_sbd *sdp, struct kqid qid)
 {
 	struct gfs2_quota_data *qd;
@@ -1414,7 +1409,7 @@
 					   &tune->gt_statfs_quantum);
 
 		/* Update quota file */
-		quotad_check_timeo(sdp, "sync", gfs2_quota_sync_timeo, t,
+		quotad_check_timeo(sdp, "sync", gfs2_quota_sync, t,
 				   &quotad_timeo, &tune->gt_quota_quantum);
 
 		/* Check for & recover partially truncated inodes */

diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 9809156..6931743 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c

@@ -1288,14 +1288,16 @@
 	minlen = max_t(u64, r.minlen,
 		       q->limits.discard_granularity) >> bs_shift;
 
-	rgd = gfs2_blk2rgrpd(sdp, start, 0);
-	rgd_end = gfs2_blk2rgrpd(sdp, end - 1, 0);
-
-	if (end <= start ||
-	    minlen > sdp->sd_max_rg_data ||
-	    start > rgd_end->rd_data0 + rgd_end->rd_data)
+	if (end <= start || minlen > sdp->sd_max_rg_data)
 		return -EINVAL;
 
+	rgd = gfs2_blk2rgrpd(sdp, start, 0);
+	rgd_end = gfs2_blk2rgrpd(sdp, end, 0);
+
+	if ((gfs2_rgrpd_get_first(sdp) == gfs2_rgrpd_get_next(rgd_end))
+	    && (start > rgd_end->rd_data0 + rgd_end->rd_data))
+		return -EINVAL; /* start is beyond the end of the fs */
+
 	while (1) {
 
 		ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &gh);
@@ -1336,7 +1338,7 @@
 	}
 
 out:
-	r.len = trimmed << 9;
+	r.len = trimmed << bs_shift;
 	if (copy_to_user(argp, &r, sizeof(r)))
 		return -EFAULT;
 

diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index 73749077..2b20d70 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c

@@ -270,19 +270,12 @@
 
 void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
 {
-	struct gfs2_glock *gl = bd->bd_gl;
 	struct gfs2_trans *tr = current->journal_info;
 
 	BUG_ON(!list_empty(&bd->bd_list));
-	BUG_ON(!list_empty(&bd->bd_ail_st_list));
-	BUG_ON(!list_empty(&bd->bd_ail_gl_list));
-	bd->bd_ops = &gfs2_revoke_lops;
+	gfs2_add_revoke(sdp, bd);
 	tr->tr_touched = 1;
 	tr->tr_num_revoke++;
-	sdp->sd_log_num_revoke++;
-	atomic_inc(&gl->gl_revokes);
-	set_bit(GLF_LFLUSH, &gl->gl_flags);
-	list_add(&bd->bd_list, &sdp->sd_log_le_revoke);
 }
 
 void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len)

diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c
index e0101b6..1455668 100644
--- a/fs/hfs/dir.c
+++ b/fs/hfs/dir.c

@@ -51,9 +51,9 @@
 /*
  * hfs_readdir
  */
-static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int hfs_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct inode *inode = file_inode(filp);
+	struct inode *inode = file_inode(file);
 	struct super_block *sb = inode->i_sb;
 	int len, err;
 	char strbuf[HFS_MAX_NAMELEN];
@@ -62,7 +62,7 @@
 	struct hfs_readdir_data *rd;
 	u16 type;
 
-	if (filp->f_pos >= inode->i_size)
+	if (ctx->pos >= inode->i_size)
 		return 0;
 
 	err = hfs_find_init(HFS_SB(sb)->cat_tree, &fd);
@@ -73,14 +73,13 @@
 	if (err)
 		goto out;
 
-	switch ((u32)filp->f_pos) {
-	case 0:
+	if (ctx->pos == 0) {
 		/* This is completely artificial... */
-		if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR))
+		if (!dir_emit_dot(file, ctx))
 			goto out;
-		filp->f_pos++;
-		/* fall through */
-	case 1:
+		ctx->pos = 1;
+	}
+	if (ctx->pos == 1) {
 		if (fd.entrylength > sizeof(entry) || fd.entrylength < 0) {
 			err = -EIO;
 			goto out;
@@ -97,18 +96,16 @@
 		//	err = -EIO;
 		//	goto out;
 		//}
-		if (filldir(dirent, "..", 2, 1,
+		if (!dir_emit(ctx, "..", 2,
 			    be32_to_cpu(entry.thread.ParID), DT_DIR))
 			goto out;
-		filp->f_pos++;
-		/* fall through */
-	default:
-		if (filp->f_pos >= inode->i_size)
-			goto out;
-		err = hfs_brec_goto(&fd, filp->f_pos - 1);
-		if (err)
-			goto out;
+		ctx->pos = 2;
 	}
+	if (ctx->pos >= inode->i_size)
+		goto out;
+	err = hfs_brec_goto(&fd, ctx->pos - 1);
+	if (err)
+		goto out;
 
 	for (;;) {
 		if (be32_to_cpu(fd.key->cat.ParID) != inode->i_ino) {
@@ -131,7 +128,7 @@
 				err = -EIO;
 				goto out;
 			}
-			if (filldir(dirent, strbuf, len, filp->f_pos,
+			if (!dir_emit(ctx, strbuf, len,
 				    be32_to_cpu(entry.dir.DirID), DT_DIR))
 				break;
 		} else if (type == HFS_CDR_FIL) {
@@ -140,7 +137,7 @@
 				err = -EIO;
 				goto out;
 			}
-			if (filldir(dirent, strbuf, len, filp->f_pos,
+			if (!dir_emit(ctx, strbuf, len,
 				    be32_to_cpu(entry.file.FlNum), DT_REG))
 				break;
 		} else {
@@ -148,22 +145,22 @@
 			err = -EIO;
 			goto out;
 		}
-		filp->f_pos++;
-		if (filp->f_pos >= inode->i_size)
+		ctx->pos++;
+		if (ctx->pos >= inode->i_size)
 			goto out;
 		err = hfs_brec_goto(&fd, 1);
 		if (err)
 			goto out;
 	}
-	rd = filp->private_data;
+	rd = file->private_data;
 	if (!rd) {
 		rd = kmalloc(sizeof(struct hfs_readdir_data), GFP_KERNEL);
 		if (!rd) {
 			err = -ENOMEM;
 			goto out;
 		}
-		filp->private_data = rd;
-		rd->file = filp;
+		file->private_data = rd;
+		rd->file = file;
 		list_add(&rd->list, &HFS_I(inode)->open_dir_list);
 	}
 	memcpy(&rd->key, &fd.key, sizeof(struct hfs_cat_key));
@@ -306,7 +303,7 @@
 
 const struct file_operations hfs_dir_operations = {
 	.read		= generic_read_dir,
-	.readdir	= hfs_readdir,
+	.iterate	= hfs_readdir,	/* hfs_readdir() now takes a struct dir_context */
 	.llseek		= generic_file_llseek,
 	.release	= hfs_dir_release,
 };

diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index a37ac93..d8ce4bd1 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c

@@ -121,9 +121,9 @@
 	return ERR_PTR(err);
 }
 
-static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int hfsplus_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct inode *inode = file_inode(filp);
+	struct inode *inode = file_inode(file);
 	struct super_block *sb = inode->i_sb;
 	int len, err;
 	char strbuf[HFSPLUS_MAX_STRLEN + 1];
@@ -132,7 +132,7 @@
 	struct hfsplus_readdir_data *rd;
 	u16 type;
 
-	if (filp->f_pos >= inode->i_size)
+	if (file->f_pos >= inode->i_size)
 		return 0;
 
 	err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
@@ -143,14 +143,13 @@
 	if (err)
 		goto out;
 
-	switch ((u32)filp->f_pos) {
-	case 0:
+	if (ctx->pos == 0) {
 		/* This is completely artificial... */
-		if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR))
+		if (!dir_emit_dot(file, ctx))
 			goto out;
-		filp->f_pos++;
-		/* fall through */
-	case 1:
+		ctx->pos = 1;
+	}
+	if (ctx->pos == 1) {
 		if (fd.entrylength > sizeof(entry) || fd.entrylength < 0) {
 			err = -EIO;
 			goto out;
@@ -168,19 +167,16 @@
 			err = -EIO;
 			goto out;
 		}
-		if (filldir(dirent, "..", 2, 1,
+		if (!dir_emit(ctx, "..", 2,
 			    be32_to_cpu(entry.thread.parentID), DT_DIR))
 			goto out;
-		filp->f_pos++;
-		/* fall through */
-	default:
-		if (filp->f_pos >= inode->i_size)
-			goto out;
-		err = hfs_brec_goto(&fd, filp->f_pos - 1);
-		if (err)
-			goto out;
+		ctx->pos = 2;
 	}
-
+	if (ctx->pos >= inode->i_size)
+		goto out;
+	err = hfs_brec_goto(&fd, ctx->pos - 1);
+	if (err)
+		goto out;
 	for (;;) {
 		if (be32_to_cpu(fd.key->cat.parent) != inode->i_ino) {
 			pr_err("walked past end of dir\n");
@@ -211,7 +207,7 @@
 			    HFSPLUS_SB(sb)->hidden_dir->i_ino ==
 					be32_to_cpu(entry.folder.id))
 				goto next;
-			if (filldir(dirent, strbuf, len, filp->f_pos,
+			if (!dir_emit(ctx, strbuf, len,
 				    be32_to_cpu(entry.folder.id), DT_DIR))
 				break;
 		} else if (type == HFSPLUS_FILE) {
@@ -220,7 +216,7 @@
 				err = -EIO;
 				goto out;
 			}
-			if (filldir(dirent, strbuf, len, filp->f_pos,
+			if (!dir_emit(ctx, strbuf, len,
 				    be32_to_cpu(entry.file.id), DT_REG))
 				break;
 		} else {
@@ -229,22 +225,22 @@
 			goto out;
 		}
 next:
-		filp->f_pos++;
-		if (filp->f_pos >= inode->i_size)
+		ctx->pos++;
+		if (ctx->pos >= inode->i_size)
 			goto out;
 		err = hfs_brec_goto(&fd, 1);
 		if (err)
 			goto out;
 	}
-	rd = filp->private_data;
+	rd = file->private_data;
 	if (!rd) {
 		rd = kmalloc(sizeof(struct hfsplus_readdir_data), GFP_KERNEL);
 		if (!rd) {
 			err = -ENOMEM;
 			goto out;
 		}
-		filp->private_data = rd;
-		rd->file = filp;
+		file->private_data = rd;
+		rd->file = file;
 		list_add(&rd->list, &HFSPLUS_I(inode)->open_dir_list);
 	}
 	memcpy(&rd->key, fd.key, sizeof(struct hfsplus_cat_key));
@@ -538,7 +534,7 @@
 const struct file_operations hfsplus_dir_operations = {
 	.fsync		= hfsplus_file_fsync,
 	.read		= generic_read_dir,
-	.readdir	= hfsplus_readdir,
+	.iterate	= hfsplus_readdir,
 	.unlocked_ioctl = hfsplus_ioctl,
 	.llseek		= generic_file_llseek,
 	.release	= hfsplus_dir_release,

diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 32f35f1..cddb052 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c

@@ -277,7 +277,7 @@
 	.show_options	= hostfs_show_options,
 };
 
-int hostfs_readdir(struct file *file, void *ent, filldir_t filldir)
+int hostfs_readdir(struct file *file, struct dir_context *ctx)
 {
 	void *dir;
 	char *name;
@@ -292,12 +292,11 @@
 	__putname(name);
 	if (dir == NULL)
 		return -error;
-	next = file->f_pos;
+	next = ctx->pos;
 	while ((name = read_dir(dir, &next, &ino, &len, &type)) != NULL) {
-		error = (*filldir)(ent, name, len, file->f_pos,
-				   ino, type);
-		if (error) break;
-		file->f_pos = next;
+		if (!dir_emit(ctx, name, len, ino, type))
+			break;
+		ctx->pos = next;
 	}
 	close_dir(dir);
 	return 0;
@@ -393,7 +392,7 @@
 
 static const struct file_operations hostfs_dir_fops = {
 	.llseek		= generic_file_llseek,
-	.readdir	= hostfs_readdir,
+	.iterate	= hostfs_readdir,	/* hostfs_readdir() now takes a struct dir_context */
 	.read		= generic_read_dir,
 };
 

diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c
index 834ac13..292b1ac 100644
--- a/fs/hpfs/dir.c
+++ b/fs/hpfs/dir.c

@@ -57,14 +57,14 @@
 	return -ESPIPE;
 }
 
-static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int hpfs_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct inode *inode = file_inode(filp);
+	struct inode *inode = file_inode(file);
 	struct hpfs_inode_info *hpfs_inode = hpfs_i(inode);
 	struct quad_buffer_head qbh;
 	struct hpfs_dirent *de;
 	int lc;
-	long old_pos;
+	loff_t next_pos;
 	unsigned char *tempname;
 	int c1, c2 = 0;
 	int ret = 0;
@@ -105,11 +105,11 @@
 		}
 	}
 	lc = hpfs_sb(inode->i_sb)->sb_lowercase;
-	if (filp->f_pos == 12) { /* diff -r requires this (note, that diff -r */
-		filp->f_pos = 13; /* also fails on msdos filesystem in 2.0) */
+	if (ctx->pos == 12) { /* diff -r requires this (note, that diff -r */
+		ctx->pos = 13; /* also fails on msdos filesystem in 2.0) */
 		goto out;
 	}
-	if (filp->f_pos == 13) {
+	if (ctx->pos == 13) {
 		ret = -ENOENT;
 		goto out;
 	}
@@ -120,33 +120,34 @@
 		   accepted by filldir, but what can I do?
 		   maybe killall -9 ls helps */
 		if (hpfs_sb(inode->i_sb)->sb_chk)
-			if (hpfs_stop_cycles(inode->i_sb, filp->f_pos, &c1, &c2, "hpfs_readdir")) {
+			if (hpfs_stop_cycles(inode->i_sb, ctx->pos, &c1, &c2, "hpfs_readdir")) {
 				ret = -EFSERROR;
 				goto out;
 			}
-		if (filp->f_pos == 12)
+		if (ctx->pos == 12)
 			goto out;
-		if (filp->f_pos == 3 || filp->f_pos == 4 || filp->f_pos == 5) {
-			printk("HPFS: warning: pos==%d\n",(int)filp->f_pos);
+		if (ctx->pos == 3 || ctx->pos == 4 || ctx->pos == 5) {
+			printk("HPFS: warning: pos==%d\n",(int)ctx->pos);
 			goto out;
 		}
-		if (filp->f_pos == 0) {
-			if (filldir(dirent, ".", 1, filp->f_pos, inode->i_ino, DT_DIR) < 0)
+		if (ctx->pos == 0) {
+			if (!dir_emit_dot(file, ctx))
 				goto out;
-			filp->f_pos = 11;
+			ctx->pos = 11;
 		}
-		if (filp->f_pos == 11) {
-			if (filldir(dirent, "..", 2, filp->f_pos, hpfs_inode->i_parent_dir, DT_DIR) < 0)
+		if (ctx->pos == 11) {
+			if (!dir_emit(ctx, "..", 2, hpfs_inode->i_parent_dir, DT_DIR))
 				goto out;
-			filp->f_pos = 1;
+			ctx->pos = 1;
 		}
-		if (filp->f_pos == 1) {
-			filp->f_pos = ((loff_t) hpfs_de_as_down_as_possible(inode->i_sb, hpfs_inode->i_dno) << 4) + 1;
-			hpfs_add_pos(inode, &filp->f_pos);
-			filp->f_version = inode->i_version;
+		if (ctx->pos == 1) {
+			ctx->pos = ((loff_t) hpfs_de_as_down_as_possible(inode->i_sb, hpfs_inode->i_dno) << 4) + 1;
+			hpfs_add_pos(inode, &file->f_pos);
+			file->f_version = inode->i_version;
 		}
-		old_pos = filp->f_pos;
-		if (!(de = map_pos_dirent(inode, &filp->f_pos, &qbh))) {
+		next_pos = ctx->pos;
+		if (!(de = map_pos_dirent(inode, &next_pos, &qbh))) {
+			ctx->pos = next_pos;
 			ret = -EIOERROR;
 			goto out;
 		}
@@ -154,20 +155,21 @@
 			if (hpfs_sb(inode->i_sb)->sb_chk) {
 				if (de->first && !de->last && (de->namelen != 2
 				    || de ->name[0] != 1 || de->name[1] != 1))
-					hpfs_error(inode->i_sb, "hpfs_readdir: bad ^A^A entry; pos = %08lx", old_pos);
+					hpfs_error(inode->i_sb, "hpfs_readdir: bad ^A^A entry; pos = %08lx", (unsigned long)ctx->pos);
 				if (de->last && (de->namelen != 1 || de ->name[0] != 255))
-					hpfs_error(inode->i_sb, "hpfs_readdir: bad \\377 entry; pos = %08lx", old_pos);
+					hpfs_error(inode->i_sb, "hpfs_readdir: bad \\377 entry; pos = %08lx", (unsigned long)ctx->pos);
 			}
 			hpfs_brelse4(&qbh);
+			ctx->pos = next_pos;
 			goto again;
 		}
 		tempname = hpfs_translate_name(inode->i_sb, de->name, de->namelen, lc, de->not_8x3);
-		if (filldir(dirent, tempname, de->namelen, old_pos, le32_to_cpu(de->fnode), DT_UNKNOWN) < 0) {
-			filp->f_pos = old_pos;
+		if (!dir_emit(ctx, tempname, de->namelen, le32_to_cpu(de->fnode), DT_UNKNOWN)) {
 			if (tempname != de->name) kfree(tempname);
 			hpfs_brelse4(&qbh);
 			goto out;
 		}
+		ctx->pos = next_pos;
 		if (tempname != de->name) kfree(tempname);
 		hpfs_brelse4(&qbh);
 	}
@@ -322,7 +324,7 @@
 {
 	.llseek		= hpfs_dir_lseek,
 	.read		= generic_read_dir,
-	.readdir	= hpfs_readdir,
+	.iterate	= hpfs_readdir,
 	.release	= hpfs_dir_release,
 	.fsync		= hpfs_file_fsync,
 };

diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index cd3e389..fc90ab1 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c

@@ -542,8 +542,8 @@
 };
 
 struct hppfs_dirent {
-	void *vfs_dirent;
-	filldir_t filldir;
+	struct dir_context ctx;	/* context hppfs_readdir() passes to iterate_dir() on the proc file */
+	struct dir_context *caller;	/* context hppfs_readdir() itself was called with */
 	struct dentry *dentry;
 };
 
@@ -555,34 +555,29 @@
 	if (file_removed(dirent->dentry, name))
 		return 0;
 
-	return (*dirent->filldir)(dirent->vfs_dirent, name, size, offset,
-				  inode, type);
+	dirent->caller->pos = dirent->ctx.pos;
+	return !dir_emit(dirent->caller, name, size, inode, type);
 }
 
-static int hppfs_readdir(struct file *file, void *ent, filldir_t filldir)
+static int hppfs_readdir(struct file *file, struct dir_context *ctx)
 {
 	struct hppfs_private *data = file->private_data;
 	struct file *proc_file = data->proc_file;
-	int (*readdir)(struct file *, void *, filldir_t);
-	struct hppfs_dirent dirent = ((struct hppfs_dirent)
-		                      { .vfs_dirent  	= ent,
-					.filldir 	= filldir,
-					.dentry  	= file->f_path.dentry
-				      });
+	struct hppfs_dirent d = {	/* wrapper context used to iterate the underlying proc file */
+		.ctx.actor	= hppfs_filldir,	/* callback installed on the inner context */
+		.caller		= ctx,	/* the context this function was called with */
+		.dentry  	= file->f_path.dentry
+	};
 	int err;
-
-	readdir = file_inode(proc_file)->i_fop->readdir;
-
-	proc_file->f_pos = file->f_pos;
-	err = (*readdir)(proc_file, &dirent, hppfs_filldir);
-	file->f_pos = proc_file->f_pos;
-
+	proc_file->f_pos = ctx->pos;	/* start the proc file at the caller's position */
+	err = iterate_dir(proc_file, &d.ctx);
+	ctx->pos = d.ctx.pos;	/* copy the resulting position back to the caller */
 	return err;
 }
 
 static const struct file_operations hppfs_dir_fops = {
 	.owner		= NULL,
-	.readdir	= hppfs_readdir,
+	.iterate	= hppfs_readdir,
 	.open		= hppfs_dir_open,
 	.llseek		= default_llseek,
 	.release	= hppfs_release,

diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c
index a7d5c3c..b943cbd 100644
--- a/fs/isofs/dir.c
+++ b/fs/isofs/dir.c

@@ -78,8 +78,8 @@
 /*
  * This should _really_ be cleaned up some day..
  */
-static int do_isofs_readdir(struct inode *inode, struct file *filp,
-		void *dirent, filldir_t filldir,
+static int do_isofs_readdir(struct inode *inode, struct file *file,
+		struct dir_context *ctx,
 		char *tmpname, struct iso_directory_record *tmpde)
 {
 	unsigned long bufsize = ISOFS_BUFFER_SIZE(inode);
@@ -94,10 +94,10 @@
 	struct iso_directory_record *de;
 	struct isofs_sb_info *sbi = ISOFS_SB(inode->i_sb);
 
-	offset = filp->f_pos & (bufsize - 1);
-	block = filp->f_pos >> bufbits;
+	offset = ctx->pos & (bufsize - 1);
+	block = ctx->pos >> bufbits;
 
-	while (filp->f_pos < inode->i_size) {
+	while (ctx->pos < inode->i_size) {
 		int de_len;
 
 		if (!bh) {
@@ -108,7 +108,7 @@
 
 		de = (struct iso_directory_record *) (bh->b_data + offset);
 
-		de_len = *(unsigned char *) de;
+		de_len = *(unsigned char *)de;
 
 		/*
 		 * If the length byte is zero, we should move on to the next
@@ -119,8 +119,8 @@
 		if (de_len == 0) {
 			brelse(bh);
 			bh = NULL;
-			filp->f_pos = (filp->f_pos + ISOFS_BLOCK_SIZE) & ~(ISOFS_BLOCK_SIZE - 1);
-			block = filp->f_pos >> bufbits;
+			ctx->pos = (ctx->pos + ISOFS_BLOCK_SIZE) & ~(ISOFS_BLOCK_SIZE - 1);
+			block = ctx->pos >> bufbits;
 			offset = 0;
 			continue;
 		}
@@ -164,16 +164,16 @@
 
 		if (de->flags[-sbi->s_high_sierra] & 0x80) {
 			first_de = 0;
-			filp->f_pos += de_len;
+			ctx->pos += de_len;
 			continue;
 		}
 		first_de = 1;
 
 		/* Handle the case of the '.' directory */
 		if (de->name_len[0] == 1 && de->name[0] == 0) {
-			if (filldir(dirent, ".", 1, filp->f_pos, inode->i_ino, DT_DIR) < 0)
+			if (!dir_emit_dot(file, ctx))
 				break;
-			filp->f_pos += de_len;
+			ctx->pos += de_len;
 			continue;
 		}
 
@@ -181,10 +181,9 @@
 
 		/* Handle the case of the '..' directory */
 		if (de->name_len[0] == 1 && de->name[0] == 1) {
-			inode_number = parent_ino(filp->f_path.dentry);
-			if (filldir(dirent, "..", 2, filp->f_pos, inode_number, DT_DIR) < 0)
+			if (!dir_emit_dotdot(file, ctx))
 				break;
-			filp->f_pos += de_len;
+			ctx->pos += de_len;
 			continue;
 		}
 
@@ -198,7 +197,7 @@
 		if ((sbi->s_hide && (de->flags[-sbi->s_high_sierra] & 1)) ||
 		    (!sbi->s_showassoc &&
 				(de->flags[-sbi->s_high_sierra] & 4))) {
-			filp->f_pos += de_len;
+			ctx->pos += de_len;
 			continue;
 		}
 
@@ -230,10 +229,10 @@
 			}
 		}
 		if (len > 0) {
-			if (filldir(dirent, p, len, filp->f_pos, inode_number, DT_UNKNOWN) < 0)
+			if (!dir_emit(ctx, p, len, inode_number, DT_UNKNOWN))
 				break;
 		}
-		filp->f_pos += de_len;
+		ctx->pos += de_len;
 
 		continue;
 	}
@@ -247,13 +246,12 @@
  * handling split directory entries.. The real work is done by
  * "do_isofs_readdir()".
  */
-static int isofs_readdir(struct file *filp,
-		void *dirent, filldir_t filldir)
+static int isofs_readdir(struct file *file, struct dir_context *ctx)
 {
 	int result;
 	char *tmpname;
 	struct iso_directory_record *tmpde;
-	struct inode *inode = file_inode(filp);
+	struct inode *inode = file_inode(file);
 
 	tmpname = (char *)__get_free_page(GFP_KERNEL);
 	if (tmpname == NULL)
@@ -261,7 +259,7 @@
 
 	tmpde = (struct iso_directory_record *) (tmpname+1024);
 
-	result = do_isofs_readdir(inode, filp, dirent, filldir, tmpname, tmpde);
+	result = do_isofs_readdir(inode, file, ctx, tmpname, tmpde);
 
 	free_page((unsigned long) tmpname);
 	return result;
@@ -271,7 +269,7 @@
 {
 	.llseek = generic_file_llseek,
 	.read = generic_read_dir,
-	.readdir = isofs_readdir,
+	.iterate = isofs_readdir,
 };
 
 /*

diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index e3e255c..be0c39b 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c

@@ -2019,16 +2019,20 @@
  * void journal_invalidatepage() - invalidate a journal page
  * @journal: journal to use for flush
  * @page:    page to flush
- * @offset:  length of page to invalidate.
+ * @offset:  offset of the range to invalidate
+ * @length:  length of the range to invalidate
  *
- * Reap page buffers containing data after offset in page.
+ * Reap page buffers containing data in specified range in page.
  */
 void journal_invalidatepage(journal_t *journal,
 		      struct page *page,
-		      unsigned long offset)
+		      unsigned int offset,
+		      unsigned int length)
 {
 	struct buffer_head *head, *bh, *next;
+	unsigned int stop = offset + length;
 	unsigned int curr_off = 0;
+	int partial_page = (offset || length < PAGE_CACHE_SIZE);
 	int may_free = 1;
 
 	if (!PageLocked(page))
@@ -2036,6 +2040,8 @@
 	if (!page_has_buffers(page))
 		return;
 
+	BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
+
 	/* We will potentially be playing with lists other than just the
 	 * data lists (especially for journaled data mode), so be
 	 * cautious in our locking. */
@@ -2045,11 +2051,14 @@
 		unsigned int next_off = curr_off + bh->b_size;
 		next = bh->b_this_page;
 
+		if (next_off > stop)
+			return;
+
 		if (offset <= curr_off) {
 			/* This block is wholly outside the truncation point */
 			lock_buffer(bh);
 			may_free &= journal_unmap_buffer(journal, bh,
-							 offset > 0);
+							 partial_page);
 			unlock_buffer(bh);
 		}
 		curr_off = next_off;
@@ -2057,7 +2066,7 @@
 
 	} while (bh != head);
 
-	if (!offset) {
+	if (!partial_page) {
 		if (may_free && try_to_free_buffers(page))
 			J_ASSERT(!page_has_buffers(page));
 	}

diff --git a/fs/jbd2/Kconfig b/fs/jbd2/Kconfig
index 69a48c2..5a9f553 100644
--- a/fs/jbd2/Kconfig
+++ b/fs/jbd2/Kconfig

@@ -20,7 +20,7 @@
 
 config JBD2_DEBUG
 	bool "JBD2 (ext4) debugging support"
-	depends on JBD2 && DEBUG_FS
+	depends on JBD2
 	help
 	  If you are using the ext4 journaled file system (or
 	  potentially any other filesystem/device using JBD2), this option
@@ -29,7 +29,7 @@
 	  By default, the debugging output will be turned off.
 
 	  If you select Y here, then you will be able to turn on debugging
-	  with "echo N > /sys/kernel/debug/jbd2/jbd2-debug", where N is a
+	  with "echo N > /sys/module/jbd2/parameters/jbd2_debug", where N is a
 	  number between 1 and 5. The higher the number, the more debugging
 	  output is generated.  To turn debugging off again, do
-	  "echo 0 > /sys/kernel/debug/jbd2/jbd2-debug".
+	  "echo 0 > /sys/module/jbd2/parameters/jbd2_debug".

diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index c78841e..7f34f47 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c

@@ -120,8 +120,8 @@
 	int nblocks, space_left;
 	/* assert_spin_locked(&journal->j_state_lock); */
 
-	nblocks = jbd_space_needed(journal);
-	while (__jbd2_log_space_left(journal) < nblocks) {
+	nblocks = jbd2_space_needed(journal);
+	while (jbd2_log_space_left(journal) < nblocks) {
 		if (journal->j_flags & JBD2_ABORT)
 			return;
 		write_unlock(&journal->j_state_lock);
@@ -140,8 +140,8 @@
 		 */
 		write_lock(&journal->j_state_lock);
 		spin_lock(&journal->j_list_lock);
-		nblocks = jbd_space_needed(journal);
-		space_left = __jbd2_log_space_left(journal);
+		nblocks = jbd2_space_needed(journal);
+		space_left = jbd2_log_space_left(journal);
 		if (space_left < nblocks) {
 			int chkpt = journal->j_checkpoint_transactions != NULL;
 			tid_t tid = 0;
@@ -156,7 +156,15 @@
 				/* We were able to recover space; yay! */
 				;
 			} else if (tid) {
+				/*
+				 * jbd2_journal_commit_transaction() may want
+				 * to take the checkpoint_mutex if JBD2_FLUSHED
+				 * is set.  So we need to temporarily drop it.
+				 */
+				mutex_unlock(&journal->j_checkpoint_mutex);
 				jbd2_log_wait_commit(journal, tid);
+				write_lock(&journal->j_state_lock);
+				continue;
 			} else {
 				printk(KERN_ERR "%s: needed %d blocks and "
 				       "only had %d space available\n",
@@ -625,10 +633,6 @@
 
 	__jbd2_journal_drop_transaction(journal, transaction);
 	jbd2_journal_free_transaction(transaction);
-
-	/* Just in case anybody was waiting for more transactions to be
-           checkpointed... */
-	wake_up(&journal->j_wait_logspace);
 	ret = 1;
 out:
 	return ret;
@@ -690,9 +694,7 @@
 	J_ASSERT(transaction->t_state == T_FINISHED);
 	J_ASSERT(transaction->t_buffers == NULL);
 	J_ASSERT(transaction->t_forget == NULL);
-	J_ASSERT(transaction->t_iobuf_list == NULL);
 	J_ASSERT(transaction->t_shadow_list == NULL);
-	J_ASSERT(transaction->t_log_list == NULL);
 	J_ASSERT(transaction->t_checkpoint_list == NULL);
 	J_ASSERT(transaction->t_checkpoint_io_list == NULL);
 	J_ASSERT(atomic_read(&transaction->t_updates) == 0);

diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 0f53946..559bec1 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c

@@ -30,15 +30,22 @@
 #include <trace/events/jbd2.h>
 
 /*
- * Default IO end handler for temporary BJ_IO buffer_heads.
+ * IO end handler for temporary buffer_heads handling writes to the journal.
  */
 static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
 {
+	struct buffer_head *orig_bh = bh->b_private;	/* may be NULL; checked before use below */
+
 	BUFFER_TRACE(bh, "");
 	if (uptodate)
 		set_buffer_uptodate(bh);
 	else
 		clear_buffer_uptodate(bh);
+	if (orig_bh) {	/* clear BH_Shadow on the buffer found in b_private, then wake on that bit */
+		clear_bit_unlock(BH_Shadow, &orig_bh->b_state);
+		smp_mb__after_clear_bit();	/* barrier between clearing the bit and the wake-up call */
+		wake_up_bit(&orig_bh->b_state, BH_Shadow);
+	}
 	unlock_buffer(bh);
 }
 
@@ -85,8 +92,7 @@
 	__brelse(bh);
 }
 
-static void jbd2_commit_block_csum_set(journal_t *j,
-				       struct journal_head *descriptor)
+static void jbd2_commit_block_csum_set(journal_t *j, struct buffer_head *bh)
 {
 	struct commit_header *h;
 	__u32 csum;
@@ -94,12 +100,11 @@
 	if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
 		return;
 
-	h = (struct commit_header *)(jh2bh(descriptor)->b_data);
+	h = (struct commit_header *)(bh->b_data);
 	h->h_chksum_type = 0;
 	h->h_chksum_size = 0;
 	h->h_chksum[0] = 0;
-	csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data,
-			   j->j_blocksize);
+	csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
 	h->h_chksum[0] = cpu_to_be32(csum);
 }
 
@@ -116,7 +121,6 @@
 					struct buffer_head **cbh,
 					__u32 crc32_sum)
 {
-	struct journal_head *descriptor;
 	struct commit_header *tmp;
 	struct buffer_head *bh;
 	int ret;
@@ -127,12 +131,10 @@
 	if (is_journal_aborted(journal))
 		return 0;
 
-	descriptor = jbd2_journal_get_descriptor_buffer(journal);
-	if (!descriptor)
+	bh = jbd2_journal_get_descriptor_buffer(journal);
+	if (!bh)
 		return 1;
 
-	bh = jh2bh(descriptor);
-
 	tmp = (struct commit_header *)bh->b_data;
 	tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
 	tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
@@ -146,9 +148,9 @@
 		tmp->h_chksum_size 	= JBD2_CRC32_CHKSUM_SIZE;
 		tmp->h_chksum[0] 	= cpu_to_be32(crc32_sum);
 	}
-	jbd2_commit_block_csum_set(journal, descriptor);
+	jbd2_commit_block_csum_set(journal, bh);
 
-	JBUFFER_TRACE(descriptor, "submit commit block");
+	BUFFER_TRACE(bh, "submit commit block");
 	lock_buffer(bh);
 	clear_buffer_dirty(bh);
 	set_buffer_uptodate(bh);
@@ -180,7 +182,6 @@
 	if (unlikely(!buffer_uptodate(bh)))
 		ret = -EIO;
 	put_bh(bh);            /* One for getblk() */
-	jbd2_journal_put_journal_head(bh2jh(bh));
 
 	return ret;
 }
@@ -321,7 +322,7 @@
 }
 
 static void jbd2_descr_block_csum_set(journal_t *j,
-				      struct journal_head *descriptor)
+				      struct buffer_head *bh)
 {
 	struct jbd2_journal_block_tail *tail;
 	__u32 csum;
@@ -329,12 +330,10 @@
 	if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
 		return;
 
-	tail = (struct jbd2_journal_block_tail *)
-			(jh2bh(descriptor)->b_data + j->j_blocksize -
+	tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize -
 			sizeof(struct jbd2_journal_block_tail));
 	tail->t_checksum = 0;
-	csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data,
-			   j->j_blocksize);
+	csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
 	tail->t_checksum = cpu_to_be32(csum);
 }
 
@@ -343,20 +342,21 @@
 {
 	struct page *page = bh->b_page;
 	__u8 *addr;
-	__u32 csum;
+	__u32 csum32;
 
 	if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
 		return;
 
 	sequence = cpu_to_be32(sequence);
 	addr = kmap_atomic(page);
-	csum = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence,
-			  sizeof(sequence));
-	csum = jbd2_chksum(j, csum, addr + offset_in_page(bh->b_data),
-			  bh->b_size);
+	csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence,
+			     sizeof(sequence));
+	csum32 = jbd2_chksum(j, csum32, addr + offset_in_page(bh->b_data),
+			     bh->b_size);
 	kunmap_atomic(addr);
 
-	tag->t_checksum = cpu_to_be32(csum);
+	/* We only have space to store the lower 16 bits of the crc32c. */
+	tag->t_checksum = cpu_to_be16(csum32);
 }
 /*
  * jbd2_journal_commit_transaction
@@ -368,7 +368,8 @@
 {
 	struct transaction_stats_s stats;
 	transaction_t *commit_transaction;
-	struct journal_head *jh, *new_jh, *descriptor;
+	struct journal_head *jh;
+	struct buffer_head *descriptor;
 	struct buffer_head **wbuf = journal->j_wbuf;
 	int bufs;
 	int flags;
@@ -392,6 +393,8 @@
 	tid_t first_tid;
 	int update_tail;
 	int csum_size = 0;
+	LIST_HEAD(io_bufs);
+	LIST_HEAD(log_bufs);
 
 	if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2))
 		csum_size = sizeof(struct jbd2_journal_block_tail);
@@ -424,13 +427,13 @@
 	J_ASSERT(journal->j_committing_transaction == NULL);
 
 	commit_transaction = journal->j_running_transaction;
-	J_ASSERT(commit_transaction->t_state == T_RUNNING);
 
 	trace_jbd2_start_commit(journal, commit_transaction);
 	jbd_debug(1, "JBD2: starting commit of transaction %d\n",
 			commit_transaction->t_tid);
 
 	write_lock(&journal->j_state_lock);
+	J_ASSERT(commit_transaction->t_state == T_RUNNING);
 	commit_transaction->t_state = T_LOCKED;
 
 	trace_jbd2_commit_locking(journal, commit_transaction);
@@ -520,6 +523,12 @@
 	 */
 	jbd2_journal_switch_revoke_table(journal);
 
+	/*
+	 * Reserved credits cannot be claimed anymore, free them
+	 */
+	atomic_sub(atomic_read(&journal->j_reserved_credits),
+		   &commit_transaction->t_outstanding_credits);
+
 	trace_jbd2_commit_flushing(journal, commit_transaction);
 	stats.run.rs_flushing = jiffies;
 	stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
@@ -533,7 +542,7 @@
 	wake_up(&journal->j_wait_transaction_locked);
 	write_unlock(&journal->j_state_lock);
 
-	jbd_debug(3, "JBD2: commit phase 2\n");
+	jbd_debug(3, "JBD2: commit phase 2a\n");
 
 	/*
 	 * Now start flushing things to disk, in the order they appear
@@ -545,10 +554,10 @@
 
 	blk_start_plug(&plug);
 	jbd2_journal_write_revoke_records(journal, commit_transaction,
-					  WRITE_SYNC);
+					  &log_bufs, WRITE_SYNC);
 	blk_finish_plug(&plug);
 
-	jbd_debug(3, "JBD2: commit phase 2\n");
+	jbd_debug(3, "JBD2: commit phase 2b\n");
 
 	/*
 	 * Way to go: we have now written out all of the data for a
@@ -571,8 +580,8 @@
 		 atomic_read(&commit_transaction->t_outstanding_credits));
 
 	err = 0;
-	descriptor = NULL;
 	bufs = 0;
+	descriptor = NULL;
 	blk_start_plug(&plug);
 	while (commit_transaction->t_buffers) {
 
@@ -604,8 +613,6 @@
 		   record the metadata buffer. */
 
 		if (!descriptor) {
-			struct buffer_head *bh;
-
 			J_ASSERT (bufs == 0);
 
 			jbd_debug(4, "JBD2: get descriptor\n");
@@ -616,26 +623,26 @@
 				continue;
 			}
 
-			bh = jh2bh(descriptor);
 			jbd_debug(4, "JBD2: got buffer %llu (%p)\n",
-				(unsigned long long)bh->b_blocknr, bh->b_data);
-			header = (journal_header_t *)&bh->b_data[0];
+				(unsigned long long)descriptor->b_blocknr,
+				descriptor->b_data);
+			header = (journal_header_t *)descriptor->b_data;
 			header->h_magic     = cpu_to_be32(JBD2_MAGIC_NUMBER);
 			header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
 			header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);
 
-			tagp = &bh->b_data[sizeof(journal_header_t)];
-			space_left = bh->b_size - sizeof(journal_header_t);
+			tagp = &descriptor->b_data[sizeof(journal_header_t)];
+			space_left = descriptor->b_size -
+						sizeof(journal_header_t);
 			first_tag = 1;
-			set_buffer_jwrite(bh);
-			set_buffer_dirty(bh);
-			wbuf[bufs++] = bh;
+			set_buffer_jwrite(descriptor);
+			set_buffer_dirty(descriptor);
+			wbuf[bufs++] = descriptor;
 
 			/* Record it so that we can wait for IO
                            completion later */
-			BUFFER_TRACE(bh, "ph3: file as descriptor");
-			jbd2_journal_file_buffer(descriptor, commit_transaction,
-					BJ_LogCtl);
+			BUFFER_TRACE(descriptor, "ph3: file as descriptor");
+			jbd2_file_log_bh(&log_bufs, descriptor);
 		}
 
 		/* Where is the buffer to be written? */
@@ -658,29 +665,22 @@
 
 		/* Bump b_count to prevent truncate from stumbling over
                    the shadowed buffer!  @@@ This can go if we ever get
-                   rid of the BJ_IO/BJ_Shadow pairing of buffers. */
+                   rid of the shadow pairing of buffers. */
 		atomic_inc(&jh2bh(jh)->b_count);
 
-		/* Make a temporary IO buffer with which to write it out
-                   (this will requeue both the metadata buffer and the
-                   temporary IO buffer). new_bh goes on BJ_IO*/
-
-		set_bit(BH_JWrite, &jh2bh(jh)->b_state);
 		/*
-		 * akpm: jbd2_journal_write_metadata_buffer() sets
-		 * new_bh->b_transaction to commit_transaction.
-		 * We need to clean this up before we release new_bh
-		 * (which is of type BJ_IO)
+		 * Make a temporary IO buffer with which to write it out
+		 * (this will requeue the metadata buffer to BJ_Shadow).
 		 */
+		set_bit(BH_JWrite, &jh2bh(jh)->b_state);
 		JBUFFER_TRACE(jh, "ph3: write metadata");
 		flags = jbd2_journal_write_metadata_buffer(commit_transaction,
-						      jh, &new_jh, blocknr);
+						jh, &wbuf[bufs], blocknr);
 		if (flags < 0) {
 			jbd2_journal_abort(journal, flags);
 			continue;
 		}
-		set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
-		wbuf[bufs++] = jh2bh(new_jh);
+		jbd2_file_log_bh(&io_bufs, wbuf[bufs]);
 
 		/* Record the new block's tag in the current descriptor
                    buffer */
@@ -694,10 +694,11 @@
 		tag = (journal_block_tag_t *) tagp;
 		write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
 		tag->t_flags = cpu_to_be16(tag_flag);
-		jbd2_block_tag_csum_set(journal, tag, jh2bh(new_jh),
+		jbd2_block_tag_csum_set(journal, tag, wbuf[bufs],
 					commit_transaction->t_tid);
 		tagp += tag_bytes;
 		space_left -= tag_bytes;
+		bufs++;
 
 		if (first_tag) {
 			memcpy (tagp, journal->j_uuid, 16);
@@ -809,7 +810,7 @@
            the log.  Before we can commit it, wait for the IO so far to
            complete.  Control buffers being written are on the
            transaction's t_log_list queue, and metadata buffers are on
-           the t_iobuf_list queue.
+           the io_bufs list.
 
 	   Wait for the buffers in reverse order.  That way we are
 	   less likely to be woken up until all IOs have completed, and
@@ -818,47 +819,33 @@
 
 	jbd_debug(3, "JBD2: commit phase 3\n");
 
-	/*
-	 * akpm: these are BJ_IO, and j_list_lock is not needed.
-	 * See __journal_try_to_free_buffer.
-	 */
-wait_for_iobuf:
-	while (commit_transaction->t_iobuf_list != NULL) {
-		struct buffer_head *bh;
+	while (!list_empty(&io_bufs)) {
+		struct buffer_head *bh = list_entry(io_bufs.prev,
+						    struct buffer_head,
+						    b_assoc_buffers);
 
-		jh = commit_transaction->t_iobuf_list->b_tprev;
-		bh = jh2bh(jh);
-		if (buffer_locked(bh)) {
-			wait_on_buffer(bh);
-			goto wait_for_iobuf;
-		}
-		if (cond_resched())
-			goto wait_for_iobuf;
+		wait_on_buffer(bh);
+		cond_resched();
 
 		if (unlikely(!buffer_uptodate(bh)))
 			err = -EIO;
-
-		clear_buffer_jwrite(bh);
-
-		JBUFFER_TRACE(jh, "ph4: unfile after journal write");
-		jbd2_journal_unfile_buffer(journal, jh);
+		jbd2_unfile_log_bh(bh);
 
 		/*
-		 * ->t_iobuf_list should contain only dummy buffer_heads
-		 * which were created by jbd2_journal_write_metadata_buffer().
+		 * The list contains temporary buffer heads created by
+		 * jbd2_journal_write_metadata_buffer().
 		 */
 		BUFFER_TRACE(bh, "dumping temporary bh");
-		jbd2_journal_put_journal_head(jh);
 		__brelse(bh);
 		J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
 		free_buffer_head(bh);
 
-		/* We also have to unlock and free the corresponding
-                   shadowed buffer */
+		/* We also have to refile the corresponding shadowed buffer */
 		jh = commit_transaction->t_shadow_list->b_tprev;
 		bh = jh2bh(jh);
-		clear_bit(BH_JWrite, &bh->b_state);
+		clear_buffer_jwrite(bh);
 		J_ASSERT_BH(bh, buffer_jbddirty(bh));
+		J_ASSERT_BH(bh, !buffer_shadow(bh));
 
 		/* The metadata is now released for reuse, but we need
                    to remember it against this transaction so that when
@@ -866,14 +853,6 @@
                    required. */
 		JBUFFER_TRACE(jh, "file as BJ_Forget");
 		jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
-		/*
-		 * Wake up any transactions which were waiting for this IO to
-		 * complete. The barrier must be here so that changes by
-		 * jbd2_journal_file_buffer() take effect before wake_up_bit()
-		 * does the waitqueue check.
-		 */
-		smp_mb();
-		wake_up_bit(&bh->b_state, BH_Unshadow);
 		JBUFFER_TRACE(jh, "brelse shadowed buffer");
 		__brelse(bh);
 	}
@@ -883,26 +862,19 @@
 	jbd_debug(3, "JBD2: commit phase 4\n");
 
 	/* Here we wait for the revoke record and descriptor record buffers */
- wait_for_ctlbuf:
-	while (commit_transaction->t_log_list != NULL) {
+	while (!list_empty(&log_bufs)) {
 		struct buffer_head *bh;
 
-		jh = commit_transaction->t_log_list->b_tprev;
-		bh = jh2bh(jh);
-		if (buffer_locked(bh)) {
-			wait_on_buffer(bh);
-			goto wait_for_ctlbuf;
-		}
-		if (cond_resched())
-			goto wait_for_ctlbuf;
+		bh = list_entry(log_bufs.prev, struct buffer_head, b_assoc_buffers);
+		wait_on_buffer(bh);
+		cond_resched();
 
 		if (unlikely(!buffer_uptodate(bh)))
 			err = -EIO;
 
 		BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
 		clear_buffer_jwrite(bh);
-		jbd2_journal_unfile_buffer(journal, jh);
-		jbd2_journal_put_journal_head(jh);
+		jbd2_unfile_log_bh(bh);
 		__brelse(bh);		/* One for getblk */
 		/* AKPM: bforget here */
 	}
@@ -952,9 +924,7 @@
 	J_ASSERT(list_empty(&commit_transaction->t_inode_list));
 	J_ASSERT(commit_transaction->t_buffers == NULL);
 	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
-	J_ASSERT(commit_transaction->t_iobuf_list == NULL);
 	J_ASSERT(commit_transaction->t_shadow_list == NULL);
-	J_ASSERT(commit_transaction->t_log_list == NULL);
 
 restart_loop:
 	/*

diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 9545757..02c7ad9 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c

@@ -103,6 +103,24 @@
 static void __journal_abort_soft (journal_t *journal, int errno);
 static int jbd2_journal_create_slab(size_t slab_size);
 
+#ifdef CONFIG_JBD2_DEBUG
+/*
+ * Emit one JBD2 debug message at KERN_DEBUG, prefixed with the file, function
+ * and line it came from. Messages whose @level is above
+ * jbd2_journal_enable_debug are dropped without being formatted.
+ */
+void __jbd2_debug(int level, const char *file, const char *func,
+		  unsigned int line, const char *fmt, ...)
+{
+	va_list ap;
+	struct va_format msg;
+
+	if (jbd2_journal_enable_debug < level)
+		return;
+
+	va_start(ap, fmt);
+	msg.fmt = fmt;
+	msg.va = &ap;
+	printk(KERN_DEBUG "%s: (%s, %u): %pV\n", file, func, line, &msg);
+	va_end(ap);
+}
+EXPORT_SYMBOL(__jbd2_debug);
+#endif
+
 /* Checksumming functions */
 int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb)
 {
@@ -310,14 +328,12 @@
  *
  * If the source buffer has already been modified by a new transaction
  * since we took the last commit snapshot, we use the frozen copy of
- * that data for IO.  If we end up using the existing buffer_head's data
- * for the write, then we *have* to lock the buffer to prevent anyone
- * else from using and possibly modifying it while the IO is in
- * progress.
+ * that data for IO. If we end up using the existing buffer_head's data
+ * for the write, then we have to make sure nobody modifies it while the
+ * IO is in progress. do_get_write_access() handles this.
  *
- * The function returns a pointer to the buffer_heads to be used for IO.
- *
- * We assume that the journal has already been locked in this function.
+ * The function returns a pointer to the buffer_head to be used for IO.
+ * 
  *
  * Return value:
  *  <0: Error
@@ -330,15 +346,14 @@
 
 int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
 				  struct journal_head  *jh_in,
-				  struct journal_head **jh_out,
-				  unsigned long long blocknr)
+				  struct buffer_head **bh_out,
+				  sector_t blocknr)
 {
 	int need_copy_out = 0;
 	int done_copy_out = 0;
 	int do_escape = 0;
 	char *mapped_data;
 	struct buffer_head *new_bh;
-	struct journal_head *new_jh;
 	struct page *new_page;
 	unsigned int new_offset;
 	struct buffer_head *bh_in = jh2bh(jh_in);
@@ -368,14 +383,13 @@
 
 	/* keep subsequent assertions sane */
 	atomic_set(&new_bh->b_count, 1);
-	new_jh = jbd2_journal_add_journal_head(new_bh);	/* This sleeps */
 
+	jbd_lock_bh_state(bh_in);
+repeat:
 	/*
 	 * If a new transaction has already done a buffer copy-out, then
 	 * we use that version of the data for the commit.
 	 */
-	jbd_lock_bh_state(bh_in);
-repeat:
 	if (jh_in->b_frozen_data) {
 		done_copy_out = 1;
 		new_page = virt_to_page(jh_in->b_frozen_data);
@@ -415,7 +429,7 @@
 		jbd_unlock_bh_state(bh_in);
 		tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS);
 		if (!tmp) {
-			jbd2_journal_put_journal_head(new_jh);
+			brelse(new_bh);
 			return -ENOMEM;
 		}
 		jbd_lock_bh_state(bh_in);
@@ -426,7 +440,7 @@
 
 		jh_in->b_frozen_data = tmp;
 		mapped_data = kmap_atomic(new_page);
-		memcpy(tmp, mapped_data + new_offset, jh2bh(jh_in)->b_size);
+		memcpy(tmp, mapped_data + new_offset, bh_in->b_size);
 		kunmap_atomic(mapped_data);
 
 		new_page = virt_to_page(tmp);
@@ -452,14 +466,14 @@
 	}
 
 	set_bh_page(new_bh, new_page, new_offset);
-	new_jh->b_transaction = NULL;
-	new_bh->b_size = jh2bh(jh_in)->b_size;
-	new_bh->b_bdev = transaction->t_journal->j_dev;
+	new_bh->b_size = bh_in->b_size;
+	new_bh->b_bdev = journal->j_dev;
 	new_bh->b_blocknr = blocknr;
+	new_bh->b_private = bh_in;
 	set_buffer_mapped(new_bh);
 	set_buffer_dirty(new_bh);
 
-	*jh_out = new_jh;
+	*bh_out = new_bh;
 
 	/*
 	 * The to-be-written buffer needs to get moved to the io queue,
@@ -470,11 +484,9 @@
 	spin_lock(&journal->j_list_lock);
 	__jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow);
 	spin_unlock(&journal->j_list_lock);
+	set_buffer_shadow(bh_in);
 	jbd_unlock_bh_state(bh_in);
 
-	JBUFFER_TRACE(new_jh, "file as BJ_IO");
-	jbd2_journal_file_buffer(new_jh, transaction, BJ_IO);
-
 	return do_escape | (done_copy_out << 1);
 }
 
@@ -484,35 +496,6 @@
  */
 
 /*
- * __jbd2_log_space_left: Return the number of free blocks left in the journal.
- *
- * Called with the journal already locked.
- *
- * Called under j_state_lock
- */
-
-int __jbd2_log_space_left(journal_t *journal)
-{
-	int left = journal->j_free;
-
-	/* assert_spin_locked(&journal->j_state_lock); */
-
-	/*
-	 * Be pessimistic here about the number of those free blocks which
-	 * might be required for log descriptor control blocks.
-	 */
-
-#define MIN_LOG_RESERVED_BLOCKS 32 /* Allow for rounding errors */
-
-	left -= MIN_LOG_RESERVED_BLOCKS;
-
-	if (left <= 0)
-		return 0;
-	left -= (left >> 3);
-	return left;
-}
-
-/*
  * Called with j_state_lock locked for writing.
  * Returns true if a transaction commit was started.
  */
@@ -564,20 +547,17 @@
 }
 
 /*
- * Force and wait upon a commit if the calling process is not within
- * transaction.  This is used for forcing out undo-protected data which contains
- * bitmaps, when the fs is running out of space.
- *
- * We can only force the running transaction if we don't have an active handle;
- * otherwise, we will deadlock.
- *
- * Returns true if a transaction was started.
+ * Force and wait for any uncommitted transactions.  We can only force the
+ * running transaction if we don't have an active handle; otherwise we will
+ * deadlock.
+ * Returns: <0 in case of error,
+ *           0 if nothing to commit,
+ *           1 if transaction was successfully committed.
  */
-int jbd2_journal_force_commit_nested(journal_t *journal)
+static int __jbd2_journal_force_commit(journal_t *journal)
 {
 	transaction_t *transaction = NULL;
 	tid_t tid;
-	int need_to_start = 0;
+	int need_to_start = 0, ret = 0;
 
 	read_lock(&journal->j_state_lock);
 	if (journal->j_running_transaction && !current->journal_info) {
@@ -588,16 +568,53 @@
 		transaction = journal->j_committing_transaction;
 
 	if (!transaction) {
+		/* Nothing to commit */
 		read_unlock(&journal->j_state_lock);
-		return 0;	/* Nothing to retry */
+		return 0;
 	}
-
 	tid = transaction->t_tid;
 	read_unlock(&journal->j_state_lock);
 	if (need_to_start)
 		jbd2_log_start_commit(journal, tid);
-	jbd2_log_wait_commit(journal, tid);
-	return 1;
+	ret = jbd2_log_wait_commit(journal, tid);
+	if (!ret)
+		ret = 1;
+
+	return ret;
+}
+
+/**
+ * jbd2_journal_force_commit_nested() - force a commit unless inside a handle
+ * @journal: journal to force
+ *
+ * Force and wait upon a commit if the calling process is not within
+ * transaction.  This is used for forcing out undo-protected data which contains
+ * bitmaps, when the fs is running out of space.
+ *
+ * Returns true if progress was made, i.e. only when the underlying
+ * __jbd2_journal_force_commit() reported a positive result; "nothing to
+ * commit" and errors both yield false.
+ */
+int jbd2_journal_force_commit_nested(journal_t *journal)
+{
+	return __jbd2_journal_force_commit(journal) > 0;
+}
+
+/**
+ * jbd2_journal_force_commit() - force any uncommitted transactions
+ * @journal: journal to force
+ *
+ * The caller wants an unconditional commit. We can only force the running
+ * transaction if we don't have an active handle, otherwise we would deadlock;
+ * this is asserted on entry.
+ *
+ * Returns 0 on success (including when there was nothing to commit) or a
+ * negative value on error.
+ */
+int jbd2_journal_force_commit(journal_t *journal)
+{
+	int err;
+
+	J_ASSERT(!current->journal_info);
+	err = __jbd2_journal_force_commit(journal);
+
+	/* A positive result only says that something was committed. */
+	return err > 0 ? 0 : err;
 }
 
 /*
@@ -798,7 +815,7 @@
  * But we don't bother doing that, so there will be coherency problems with
  * mmaps of blockdevs which hold live JBD-controlled filesystems.
  */
-struct journal_head *jbd2_journal_get_descriptor_buffer(journal_t *journal)
+struct buffer_head *jbd2_journal_get_descriptor_buffer(journal_t *journal)
 {
 	struct buffer_head *bh;
 	unsigned long long blocknr;
@@ -817,7 +834,7 @@
 	set_buffer_uptodate(bh);
 	unlock_buffer(bh);
 	BUFFER_TRACE(bh, "return this buffer");
-	return jbd2_journal_add_journal_head(bh);
+	return bh;
 }
 
 /*
@@ -1062,11 +1079,10 @@
 		return NULL;
 
 	init_waitqueue_head(&journal->j_wait_transaction_locked);
-	init_waitqueue_head(&journal->j_wait_logspace);
 	init_waitqueue_head(&journal->j_wait_done_commit);
-	init_waitqueue_head(&journal->j_wait_checkpoint);
 	init_waitqueue_head(&journal->j_wait_commit);
 	init_waitqueue_head(&journal->j_wait_updates);
+	init_waitqueue_head(&journal->j_wait_reserved);
 	mutex_init(&journal->j_barrier);
 	mutex_init(&journal->j_checkpoint_mutex);
 	spin_lock_init(&journal->j_revoke_lock);
@@ -1076,6 +1092,7 @@
 	journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE);
 	journal->j_min_batch_time = 0;
 	journal->j_max_batch_time = 15000; /* 15ms */
+	atomic_set(&journal->j_reserved_credits, 0);
 
 	/* The journal is marked for error until we succeed with recovery! */
 	journal->j_flags = JBD2_ABORT;
@@ -1318,6 +1335,7 @@
 static void jbd2_write_superblock(journal_t *journal, int write_op)
 {
 	struct buffer_head *bh = journal->j_sb_buffer;
+	journal_superblock_t *sb = journal->j_superblock;
 	int ret;
 
 	trace_jbd2_write_superblock(journal, write_op);
@@ -1339,6 +1357,7 @@
 		clear_buffer_write_io_error(bh);
 		set_buffer_uptodate(bh);
 	}
+	jbd2_superblock_csum_set(journal, sb);
 	get_bh(bh);
 	bh->b_end_io = end_buffer_write_sync;
 	ret = submit_bh(write_op, bh);
@@ -1435,7 +1454,6 @@
 	jbd_debug(1, "JBD2: updating superblock error (errno %d)\n",
 		  journal->j_errno);
 	sb->s_errno    = cpu_to_be32(journal->j_errno);
-	jbd2_superblock_csum_set(journal, sb);
 	read_unlock(&journal->j_state_lock);
 
 	jbd2_write_superblock(journal, WRITE_SYNC);
@@ -2325,13 +2343,13 @@
 #ifdef CONFIG_JBD2_DEBUG
 	atomic_inc(&nr_journal_heads);
 #endif
-	ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS);
+	ret = kmem_cache_zalloc(jbd2_journal_head_cache, GFP_NOFS);
 	if (!ret) {
 		jbd_debug(1, "out of memory for journal_head\n");
 		pr_notice_ratelimited("ENOMEM in %s, retrying.\n", __func__);
 		while (!ret) {
 			yield();
-			ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS);
+			ret = kmem_cache_zalloc(jbd2_journal_head_cache, GFP_NOFS);
 		}
 	}
 	return ret;
@@ -2393,10 +2411,8 @@
 	struct journal_head *new_jh = NULL;
 
 repeat:
-	if (!buffer_jbd(bh)) {
+	if (!buffer_jbd(bh))
 		new_jh = journal_alloc_journal_head();
-		memset(new_jh, 0, sizeof(*new_jh));
-	}
 
 	jbd_lock_bh_journal_head(bh);
 	if (buffer_jbd(bh)) {

diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 626846b..d4851464 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c

@@ -399,18 +399,17 @@
 static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag,
 				      void *buf, __u32 sequence)
 {
-	__u32 provided, calculated;
+	__u32 csum32;
 
 	if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
 		return 1;
 
 	sequence = cpu_to_be32(sequence);
-	calculated = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence,
-				 sizeof(sequence));
-	calculated = jbd2_chksum(j, calculated, buf, j->j_blocksize);
-	provided = be32_to_cpu(tag->t_checksum);
+	csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence,
+			     sizeof(sequence));
+	csum32 = jbd2_chksum(j, csum32, buf, j->j_blocksize);
 
-	return provided == cpu_to_be32(calculated);
+	return tag->t_checksum == cpu_to_be16(csum32);
 }
 
 static int do_one_pass(journal_t *journal,

diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index f30b80b..198c9c1 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c

@@ -122,9 +122,10 @@
 
 #ifdef __KERNEL__
 static void write_one_revoke_record(journal_t *, transaction_t *,
-				    struct journal_head **, int *,
+				    struct list_head *,
+				    struct buffer_head **, int *,
 				    struct jbd2_revoke_record_s *, int);
-static void flush_descriptor(journal_t *, struct journal_head *, int, int);
+static void flush_descriptor(journal_t *, struct buffer_head *, int, int);
 #endif
 
 /* Utility functions to maintain the revoke table */
@@ -531,9 +532,10 @@
  */
 void jbd2_journal_write_revoke_records(journal_t *journal,
 				       transaction_t *transaction,
+				       struct list_head *log_bufs,
 				       int write_op)
 {
-	struct journal_head *descriptor;
+	struct buffer_head *descriptor;
 	struct jbd2_revoke_record_s *record;
 	struct jbd2_revoke_table_s *revoke;
 	struct list_head *hash_list;
@@ -553,7 +555,7 @@
 		while (!list_empty(hash_list)) {
 			record = (struct jbd2_revoke_record_s *)
 				hash_list->next;
-			write_one_revoke_record(journal, transaction,
+			write_one_revoke_record(journal, transaction, log_bufs,
 						&descriptor, &offset,
 						record, write_op);
 			count++;
@@ -573,13 +575,14 @@
 
 static void write_one_revoke_record(journal_t *journal,
 				    transaction_t *transaction,
-				    struct journal_head **descriptorp,
+				    struct list_head *log_bufs,
+				    struct buffer_head **descriptorp,
 				    int *offsetp,
 				    struct jbd2_revoke_record_s *record,
 				    int write_op)
 {
 	int csum_size = 0;
-	struct journal_head *descriptor;
+	struct buffer_head *descriptor;
 	int offset;
 	journal_header_t *header;
 
@@ -609,26 +612,26 @@
 		descriptor = jbd2_journal_get_descriptor_buffer(journal);
 		if (!descriptor)
 			return;
-		header = (journal_header_t *) &jh2bh(descriptor)->b_data[0];
+		header = (journal_header_t *)descriptor->b_data;
 		header->h_magic     = cpu_to_be32(JBD2_MAGIC_NUMBER);
 		header->h_blocktype = cpu_to_be32(JBD2_REVOKE_BLOCK);
 		header->h_sequence  = cpu_to_be32(transaction->t_tid);
 
 		/* Record it so that we can wait for IO completion later */
-		JBUFFER_TRACE(descriptor, "file as BJ_LogCtl");
-		jbd2_journal_file_buffer(descriptor, transaction, BJ_LogCtl);
+		BUFFER_TRACE(descriptor, "file in log_bufs");
+		jbd2_file_log_bh(log_bufs, descriptor);
 
 		offset = sizeof(jbd2_journal_revoke_header_t);
 		*descriptorp = descriptor;
 	}
 
 	if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) {
-		* ((__be64 *)(&jh2bh(descriptor)->b_data[offset])) =
+		* ((__be64 *)(&descriptor->b_data[offset])) =
 			cpu_to_be64(record->blocknr);
 		offset += 8;
 
 	} else {
-		* ((__be32 *)(&jh2bh(descriptor)->b_data[offset])) =
+		* ((__be32 *)(&descriptor->b_data[offset])) =
 			cpu_to_be32(record->blocknr);
 		offset += 4;
 	}
@@ -636,8 +639,7 @@
 	*offsetp = offset;
 }
 
-static void jbd2_revoke_csum_set(journal_t *j,
-				 struct journal_head *descriptor)
+static void jbd2_revoke_csum_set(journal_t *j, struct buffer_head *bh)
 {
 	struct jbd2_journal_revoke_tail *tail;
 	__u32 csum;
@@ -645,12 +647,10 @@
 	if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
 		return;
 
-	tail = (struct jbd2_journal_revoke_tail *)
-			(jh2bh(descriptor)->b_data + j->j_blocksize -
+	tail = (struct jbd2_journal_revoke_tail *)(bh->b_data + j->j_blocksize -
 			sizeof(struct jbd2_journal_revoke_tail));
 	tail->r_checksum = 0;
-	csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data,
-			   j->j_blocksize);
+	csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
 	tail->r_checksum = cpu_to_be32(csum);
 }
 
@@ -662,25 +662,24 @@
  */
 
 static void flush_descriptor(journal_t *journal,
-			     struct journal_head *descriptor,
+			     struct buffer_head *descriptor,
 			     int offset, int write_op)
 {
 	jbd2_journal_revoke_header_t *header;
-	struct buffer_head *bh = jh2bh(descriptor);
 
 	if (is_journal_aborted(journal)) {
-		put_bh(bh);
+		put_bh(descriptor);
 		return;
 	}
 
-	header = (jbd2_journal_revoke_header_t *) jh2bh(descriptor)->b_data;
+	header = (jbd2_journal_revoke_header_t *)descriptor->b_data;
 	header->r_count = cpu_to_be32(offset);
 	jbd2_revoke_csum_set(journal, descriptor);
 
-	set_buffer_jwrite(bh);
-	BUFFER_TRACE(bh, "write");
-	set_buffer_dirty(bh);
-	write_dirty_buffer(bh, write_op);
+	set_buffer_jwrite(descriptor);
+	BUFFER_TRACE(descriptor, "write");
+	set_buffer_dirty(descriptor);
+	write_dirty_buffer(descriptor, write_op);
 }
 #endif
 

diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 10f524c..7aa9a32 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c

@@ -89,7 +89,8 @@
 	transaction->t_expires = jiffies + journal->j_commit_interval;
 	spin_lock_init(&transaction->t_handle_lock);
 	atomic_set(&transaction->t_updates, 0);
-	atomic_set(&transaction->t_outstanding_credits, 0);
+	atomic_set(&transaction->t_outstanding_credits,
+		   atomic_read(&journal->j_reserved_credits));
 	atomic_set(&transaction->t_handle_count, 0);
 	INIT_LIST_HEAD(&transaction->t_inode_list);
 	INIT_LIST_HEAD(&transaction->t_private_list);
@@ -141,6 +142,112 @@
 }
 
 /*
+ * Wait until running transaction passes T_LOCKED state. Also starts the commit
+ * if needed. The function expects running transaction to exist and releases
+ * j_state_lock.
+ *
+ * The lock is released with read_unlock(), so the caller must hold
+ * j_state_lock for reading on entry.
+ */
+static void wait_transaction_locked(journal_t *journal)
+	__releases(journal->j_state_lock)
+{
+	DEFINE_WAIT(wait);
+	int need_to_start;
+	tid_t tid = journal->j_running_transaction->t_tid;
+
+	/*
+	 * Queue ourselves on j_wait_transaction_locked and sample
+	 * j_commit_request while j_state_lock is still held; only then is
+	 * the lock dropped.
+	 */
+	prepare_to_wait(&journal->j_wait_transaction_locked, &wait,
+			TASK_UNINTERRUPTIBLE);
+	need_to_start = !tid_geq(journal->j_commit_request, tid);
+	read_unlock(&journal->j_state_lock);
+	/* No commit of this tid has been requested yet - ask for one. */
+	if (need_to_start)
+		jbd2_log_start_commit(journal, tid);
+	schedule();
+	finish_wait(&journal->j_wait_transaction_locked, &wait);
+}
+
+/*
+ * Give @blocks credits back to the journal-wide reservation counter and wake
+ * up anybody sleeping on j_wait_reserved so that they re-check the counter.
+ */
+static void sub_reserved_credits(journal_t *journal, int blocks)
+{
+	atomic_sub(blocks, &journal->j_reserved_credits);
+	wake_up(&journal->j_wait_reserved);
+}
+
+/*
+ * Wait until we can add credits for handle to the running transaction.  Called
+ * with j_state_lock held for reading. Returns 0 if handle joined the running
+ * transaction. Returns 1 if we had to wait, j_state_lock is dropped, and
+ * caller must retry.
+ *
+ * @blocks is charged to the running transaction only; @rsv_blocks is charged
+ * both to the running transaction and to journal->j_reserved_credits. Every
+ * path that returns 1 first takes back whatever it had already charged.
+ */
+static int add_transaction_credits(journal_t *journal, int blocks,
+				   int rsv_blocks)
+{
+	transaction_t *t = journal->j_running_transaction;
+	int needed;
+	int total = blocks + rsv_blocks;
+
+	/*
+	 * If the current transaction is locked down for commit, wait
+	 * for the lock to be released.
+	 */
+	if (t->t_state == T_LOCKED) {
+		wait_transaction_locked(journal);
+		return 1;
+	}
+
+	/*
+	 * Charge the credits to the transaction up front and check whether
+	 * that pushes it over the maximum transaction size. The charge is
+	 * undone below on every path that has to wait.
+	 */
+	needed = atomic_add_return(total, &t->t_outstanding_credits);
+	if (needed > journal->j_max_transaction_buffers) {
+		/*
+		 * If the current transaction is already too large,
+		 * then start to commit it: we can then go back and
+		 * attach this handle to a new transaction.
+		 */
+		atomic_sub(total, &t->t_outstanding_credits);
+		wait_transaction_locked(journal);
+		return 1;
+	}
+
+	/*
+	 * The commit code assumes that it can get enough log space
+	 * without forcing a checkpoint.  This is *critical* for
+	 * correctness: a checkpoint of a buffer which is also
+	 * associated with a committing transaction creates a deadlock,
+	 * so commit simply cannot force through checkpoints.
+	 *
+	 * We must therefore ensure the necessary space in the journal
+	 * *before* starting to dirty potentially checkpointed buffers
+	 * in the new transaction.
+	 */
+	if (jbd2_log_space_left(journal) < jbd2_space_needed(journal)) {
+		atomic_sub(total, &t->t_outstanding_credits);
+		/*
+		 * Trade the read lock for the write lock and re-check the
+		 * condition before waiting for log space.
+		 */
+		read_unlock(&journal->j_state_lock);
+		write_lock(&journal->j_state_lock);
+		if (jbd2_log_space_left(journal) < jbd2_space_needed(journal))
+			__jbd2_log_wait_for_space(journal);
+		write_unlock(&journal->j_state_lock);
+		return 1;
+	}
+
+	/* No reservation? We are done... */
+	if (!rsv_blocks)
+		return 0;
+
+	needed = atomic_add_return(rsv_blocks, &journal->j_reserved_credits);
+	/* We allow at most half of a transaction to be reserved */
+	if (needed > journal->j_max_transaction_buffers / 2) {
+		/* Undo both charges, then sleep until our reservation fits. */
+		sub_reserved_credits(journal, rsv_blocks);
+		atomic_sub(total, &t->t_outstanding_credits);
+		read_unlock(&journal->j_state_lock);
+		wait_event(journal->j_wait_reserved,
+			 atomic_read(&journal->j_reserved_credits) + rsv_blocks
+			 <= journal->j_max_transaction_buffers / 2);
+		return 1;
+	}
+	return 0;
+}
+
+/*
  * start_this_handle: Given a handle, deal with any locking or stalling
  * needed to make sure that there is enough journal space for the handle
  * to begin.  Attach the handle to a transaction and set up the
@@ -151,18 +258,24 @@
 			     gfp_t gfp_mask)
 {
 	transaction_t	*transaction, *new_transaction = NULL;
-	tid_t		tid;
-	int		needed, need_to_start;
-	int		nblocks = handle->h_buffer_credits;
+	int		blocks = handle->h_buffer_credits;
+	int		rsv_blocks = 0;
 	unsigned long ts = jiffies;
 
-	if (nblocks > journal->j_max_transaction_buffers) {
+	/*
+	 * 1/2 of transaction can be reserved so we can practically handle
+	 * only 1/2 of maximum transaction size per operation
+	 */
+	if (WARN_ON(blocks > journal->j_max_transaction_buffers / 2)) {
 		printk(KERN_ERR "JBD2: %s wants too many credits (%d > %d)\n",
-		       current->comm, nblocks,
-		       journal->j_max_transaction_buffers);
+		       current->comm, blocks,
+		       journal->j_max_transaction_buffers / 2);
 		return -ENOSPC;
 	}
 
+	if (handle->h_rsv_handle)
+		rsv_blocks = handle->h_rsv_handle->h_buffer_credits;
+
 alloc_transaction:
 	if (!journal->j_running_transaction) {
 		new_transaction = kmem_cache_zalloc(transaction_cache,
@@ -199,8 +312,12 @@
 		return -EROFS;
 	}
 
-	/* Wait on the journal's transaction barrier if necessary */
-	if (journal->j_barrier_count) {
+	/*
+	 * Wait on the journal's transaction barrier if necessary. Specifically
+	 * we allow reserved handles to proceed because otherwise commit could
+	 * deadlock on page writeback not being able to complete.
+	 */
+	if (!handle->h_reserved && journal->j_barrier_count) {
 		read_unlock(&journal->j_state_lock);
 		wait_event(journal->j_wait_transaction_locked,
 				journal->j_barrier_count == 0);
@@ -213,7 +330,7 @@
 			goto alloc_transaction;
 		write_lock(&journal->j_state_lock);
 		if (!journal->j_running_transaction &&
-		    !journal->j_barrier_count) {
+		    (handle->h_reserved || !journal->j_barrier_count)) {
 			jbd2_get_transaction(journal, new_transaction);
 			new_transaction = NULL;
 		}
@@ -223,85 +340,18 @@
 
 	transaction = journal->j_running_transaction;
 
-	/*
-	 * If the current transaction is locked down for commit, wait for the
-	 * lock to be released.
-	 */
-	if (transaction->t_state == T_LOCKED) {
-		DEFINE_WAIT(wait);
-
-		prepare_to_wait(&journal->j_wait_transaction_locked,
-					&wait, TASK_UNINTERRUPTIBLE);
-		read_unlock(&journal->j_state_lock);
-		schedule();
-		finish_wait(&journal->j_wait_transaction_locked, &wait);
-		goto repeat;
-	}
-
-	/*
-	 * If there is not enough space left in the log to write all potential
-	 * buffers requested by this operation, we need to stall pending a log
-	 * checkpoint to free some more log space.
-	 */
-	needed = atomic_add_return(nblocks,
-				   &transaction->t_outstanding_credits);
-
-	if (needed > journal->j_max_transaction_buffers) {
+	if (!handle->h_reserved) {
+		/* We may have dropped j_state_lock - restart in that case */
+		if (add_transaction_credits(journal, blocks, rsv_blocks))
+			goto repeat;
+	} else {
 		/*
-		 * If the current transaction is already too large, then start
-		 * to commit it: we can then go back and attach this handle to
-		 * a new transaction.
+		 * We have handle reserved so we are allowed to join T_LOCKED
+		 * transaction and we don't have to check for transaction size
+		 * and journal space.
 		 */
-		DEFINE_WAIT(wait);
-
-		jbd_debug(2, "Handle %p starting new commit...\n", handle);
-		atomic_sub(nblocks, &transaction->t_outstanding_credits);
-		prepare_to_wait(&journal->j_wait_transaction_locked, &wait,
-				TASK_UNINTERRUPTIBLE);
-		tid = transaction->t_tid;
-		need_to_start = !tid_geq(journal->j_commit_request, tid);
-		read_unlock(&journal->j_state_lock);
-		if (need_to_start)
-			jbd2_log_start_commit(journal, tid);
-		schedule();
-		finish_wait(&journal->j_wait_transaction_locked, &wait);
-		goto repeat;
-	}
-
-	/*
-	 * The commit code assumes that it can get enough log space
-	 * without forcing a checkpoint.  This is *critical* for
-	 * correctness: a checkpoint of a buffer which is also
-	 * associated with a committing transaction creates a deadlock,
-	 * so commit simply cannot force through checkpoints.
-	 *
-	 * We must therefore ensure the necessary space in the journal
-	 * *before* starting to dirty potentially checkpointed buffers
-	 * in the new transaction.
-	 *
-	 * The worst part is, any transaction currently committing can
-	 * reduce the free space arbitrarily.  Be careful to account for
-	 * those buffers when checkpointing.
-	 */
-
-	/*
-	 * @@@ AKPM: This seems rather over-defensive.  We're giving commit
-	 * a _lot_ of headroom: 1/4 of the journal plus the size of
-	 * the committing transaction.  Really, we only need to give it
-	 * committing_transaction->t_outstanding_credits plus "enough" for
-	 * the log control blocks.
-	 * Also, this test is inconsistent with the matching one in
-	 * jbd2_journal_extend().
-	 */
-	if (__jbd2_log_space_left(journal) < jbd_space_needed(journal)) {
-		jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle);
-		atomic_sub(nblocks, &transaction->t_outstanding_credits);
-		read_unlock(&journal->j_state_lock);
-		write_lock(&journal->j_state_lock);
-		if (__jbd2_log_space_left(journal) < jbd_space_needed(journal))
-			__jbd2_log_wait_for_space(journal);
-		write_unlock(&journal->j_state_lock);
-		goto repeat;
+		sub_reserved_credits(journal, blocks);
+		handle->h_reserved = 0;
 	}
 
 	/* OK, account for the buffers that this operation expects to
@@ -309,15 +359,16 @@
 	 */
 	update_t_max_wait(transaction, ts);
 	handle->h_transaction = transaction;
-	handle->h_requested_credits = nblocks;
+	handle->h_requested_credits = blocks;
 	handle->h_start_jiffies = jiffies;
 	atomic_inc(&transaction->t_updates);
 	atomic_inc(&transaction->t_handle_count);
-	jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n",
-		  handle, nblocks,
+	jbd_debug(4, "Handle %p given %d credits (total %d, free %lu)\n",
+		  handle, blocks,
 		  atomic_read(&transaction->t_outstanding_credits),
-		  __jbd2_log_space_left(journal));
+		  jbd2_log_space_left(journal));
 	read_unlock(&journal->j_state_lock);
+	current->journal_info = handle;
 
 	lock_map_acquire(&handle->h_lockdep_map);
 	jbd2_journal_free_transaction(new_transaction);
@@ -348,16 +399,21 @@
  *
  * We make sure that the transaction can guarantee at least nblocks of
  * modified buffers in the log.  We block until the log can guarantee
- * that much space.
- *
- * This function is visible to journal users (like ext3fs), so is not
- * called with the journal already locked.
+ * that much space. Additionally, if rsv_blocks > 0, we also create another
+ * handle with rsv_blocks reserved blocks in the journal. This handle is
+ * stored in h_rsv_handle. It is not attached to any particular transaction
+ * and thus doesn't block transaction commit. If the caller uses this reserved
+ * handle, it has to set h_rsv_handle to NULL as otherwise jbd2_journal_stop()
+ * on the parent handle will dispose the reserved one. Reserved handle has to
+ * be converted to a normal handle using jbd2_journal_start_reserved() before
+ * it can be used.
  *
  * Return a pointer to a newly allocated handle, or an ERR_PTR() value
  * on failure.
  */
-handle_t *jbd2__journal_start(journal_t *journal, int nblocks, gfp_t gfp_mask,
-			      unsigned int type, unsigned int line_no)
+handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks,
+			      gfp_t gfp_mask, unsigned int type,
+			      unsigned int line_no)
 {
 	handle_t *handle = journal_current_handle();
 	int err;
@@ -374,13 +430,24 @@
 	handle = new_handle(nblocks);
 	if (!handle)
 		return ERR_PTR(-ENOMEM);
+	if (rsv_blocks) {
+		handle_t *rsv_handle;
 
-	current->journal_info = handle;
+		rsv_handle = new_handle(rsv_blocks);
+		if (!rsv_handle) {
+			jbd2_free_handle(handle);
+			return ERR_PTR(-ENOMEM);
+		}
+		rsv_handle->h_reserved = 1;
+		rsv_handle->h_journal = journal;
+		handle->h_rsv_handle = rsv_handle;
+	}
 
 	err = start_this_handle(journal, handle, gfp_mask);
 	if (err < 0) {
+		if (handle->h_rsv_handle)
+			jbd2_free_handle(handle->h_rsv_handle);
 		jbd2_free_handle(handle);
-		current->journal_info = NULL;
 		return ERR_PTR(err);
 	}
 	handle->h_type = type;
@@ -395,10 +462,65 @@
 
 handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
 {
-	return jbd2__journal_start(journal, nblocks, GFP_NOFS, 0, 0);
+	return jbd2__journal_start(journal, nblocks, 0, GFP_NOFS, 0, 0);
 }
 EXPORT_SYMBOL(jbd2_journal_start);
 
+/*
+ * Dispose of a reserved handle: its buffer credits are handed back to the
+ * reservation counter of the journal recorded in the handle, and the handle
+ * itself is freed. Warns if @handle is not marked as reserved.
+ */
+void jbd2_journal_free_reserved(handle_t *handle)
+{
+	WARN_ON(!handle->h_reserved);
+	sub_reserved_credits(handle->h_journal, handle->h_buffer_credits);
+	jbd2_free_handle(handle);
+}
+EXPORT_SYMBOL(jbd2_journal_free_reserved);
+
+/**
+ * jbd2_journal_start_reserved() - start a reserved handle
+ * @handle: reserved handle to start
+ * @type: handle type, recorded in the handle's h_type
+ * @line_no: caller's line number, recorded in the handle's h_line_no
+ *
+ * Start a handle that has been previously reserved by passing rsv_blocks > 0
+ * to jbd2__journal_start(). This attaches @handle to the running transaction
+ * (or creates one if there's no transaction running). Unlike
+ * jbd2_journal_start() this function cannot block on journal commit,
+ * checkpointing, or similar stuff. It can block on memory allocation or
+ * frozen journal though.
+ *
+ * Return 0 on success, non-zero on error - handle is freed in that case and
+ * must not be used by the caller any more.
+ */
+int jbd2_journal_start_reserved(handle_t *handle, unsigned int type,
+				unsigned int line_no)
+{
+	journal_t *journal = handle->h_journal;
+	int ret = -EIO;
+
+	if (WARN_ON(!handle->h_reserved)) {
+		/* Someone passed in normal handle? Just stop it. */
+		jbd2_journal_stop(handle);
+		return ret;
+	}
+	/*
+	 * Usefulness of mixing of reserved and unreserved handles is
+	 * questionable. So far nobody seems to need it so just error out.
+	 */
+	if (WARN_ON(current->journal_info)) {
+		jbd2_journal_free_reserved(handle);
+		return ret;
+	}
+
+	handle->h_journal = NULL;
+	/*
+	 * GFP_NOFS is here because callers are likely from writeback or
+	 * similarly constrained call sites
+	 */
+	ret = start_this_handle(journal, handle, GFP_NOFS);
+	if (ret < 0) {
+		/*
+		 * jbd2_journal_free_reserved() reads handle->h_journal to
+		 * give the reserved credits back, so restore the pointer we
+		 * cleared above. The handle is freed by that call; return
+		 * right away instead of writing h_type/h_line_no into it.
+		 */
+		handle->h_journal = journal;
+		jbd2_journal_free_reserved(handle);
+		return ret;
+	}
+	handle->h_type = type;
+	handle->h_line_no = line_no;
+	return ret;
+}
+EXPORT_SYMBOL(jbd2_journal_start_reserved);
 
 /**
  * int jbd2_journal_extend() - extend buffer credits.
@@ -423,49 +545,53 @@
 int jbd2_journal_extend(handle_t *handle, int nblocks)
 {
 	transaction_t *transaction = handle->h_transaction;
-	journal_t *journal = transaction->t_journal;
+	journal_t *journal;
 	int result;
 	int wanted;
 
-	result = -EIO;
+	WARN_ON(!transaction);
 	if (is_handle_aborted(handle))
-		goto out;
+		return -EROFS;
+	journal = transaction->t_journal;
 
 	result = 1;
 
 	read_lock(&journal->j_state_lock);
 
 	/* Don't extend a locked-down transaction! */
-	if (handle->h_transaction->t_state != T_RUNNING) {
+	if (transaction->t_state != T_RUNNING) {
 		jbd_debug(3, "denied handle %p %d blocks: "
 			  "transaction not running\n", handle, nblocks);
 		goto error_out;
 	}
 
 	spin_lock(&transaction->t_handle_lock);
-	wanted = atomic_read(&transaction->t_outstanding_credits) + nblocks;
+	wanted = atomic_add_return(nblocks,
+				   &transaction->t_outstanding_credits);
 
 	if (wanted > journal->j_max_transaction_buffers) {
 		jbd_debug(3, "denied handle %p %d blocks: "
 			  "transaction too large\n", handle, nblocks);
+		atomic_sub(nblocks, &transaction->t_outstanding_credits);
 		goto unlock;
 	}
 
-	if (wanted > __jbd2_log_space_left(journal)) {
+	if (wanted + (wanted >> JBD2_CONTROL_BLOCKS_SHIFT) >
+	    jbd2_log_space_left(journal)) {
 		jbd_debug(3, "denied handle %p %d blocks: "
 			  "insufficient log space\n", handle, nblocks);
+		atomic_sub(nblocks, &transaction->t_outstanding_credits);
 		goto unlock;
 	}
 
 	trace_jbd2_handle_extend(journal->j_fs_dev->bd_dev,
-				 handle->h_transaction->t_tid,
+				 transaction->t_tid,
 				 handle->h_type, handle->h_line_no,
 				 handle->h_buffer_credits,
 				 nblocks);
 
 	handle->h_buffer_credits += nblocks;
 	handle->h_requested_credits += nblocks;
-	atomic_add(nblocks, &transaction->t_outstanding_credits);
 	result = 0;
 
 	jbd_debug(3, "extended handle %p by %d\n", handle, nblocks);
@@ -473,7 +599,6 @@
 	spin_unlock(&transaction->t_handle_lock);
 error_out:
 	read_unlock(&journal->j_state_lock);
-out:
 	return result;
 }
 
@@ -490,19 +615,22 @@
  * to a running handle, a call to jbd2_journal_restart will commit the
  * handle's transaction so far and reattach the handle to a new
  * transaction capabable of guaranteeing the requested number of
- * credits.
+ * credits. We preserve reserved handle if there's any attached to the
+ * passed in handle.
  */
 int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask)
 {
 	transaction_t *transaction = handle->h_transaction;
-	journal_t *journal = transaction->t_journal;
+	journal_t *journal;
 	tid_t		tid;
 	int		need_to_start, ret;
 
+	WARN_ON(!transaction);
 	/* If we've had an abort of any type, don't even think about
 	 * actually doing the restart! */
 	if (is_handle_aborted(handle))
 		return 0;
+	journal = transaction->t_journal;
 
 	/*
 	 * First unlink the handle from its current transaction, and start the
@@ -515,12 +643,18 @@
 	spin_lock(&transaction->t_handle_lock);
 	atomic_sub(handle->h_buffer_credits,
 		   &transaction->t_outstanding_credits);
+	if (handle->h_rsv_handle) {
+		sub_reserved_credits(journal,
+				     handle->h_rsv_handle->h_buffer_credits);
+	}
 	if (atomic_dec_and_test(&transaction->t_updates))
 		wake_up(&journal->j_wait_updates);
+	tid = transaction->t_tid;
 	spin_unlock(&transaction->t_handle_lock);
+	handle->h_transaction = NULL;
+	current->journal_info = NULL;
 
 	jbd_debug(2, "restarting handle %p\n", handle);
-	tid = transaction->t_tid;
 	need_to_start = !tid_geq(journal->j_commit_request, tid);
 	read_unlock(&journal->j_state_lock);
 	if (need_to_start)
@@ -557,6 +691,14 @@
 	write_lock(&journal->j_state_lock);
 	++journal->j_barrier_count;
 
+	/* Wait until there are no reserved handles */
+	if (atomic_read(&journal->j_reserved_credits)) {
+		write_unlock(&journal->j_state_lock);
+		wait_event(journal->j_wait_reserved,
+			   atomic_read(&journal->j_reserved_credits) == 0);
+		write_lock(&journal->j_state_lock);
+	}
+
 	/* Wait until there are no running updates */
 	while (1) {
 		transaction_t *transaction = journal->j_running_transaction;
@@ -619,6 +761,12 @@
 	       bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr);
 }
 
+/*
+ * Action callback passed to wait_on_bit() while waiting for BH_Shadow to
+ * clear in a buffer's b_state: sleeps via io_schedule() and always
+ * returns 0.  The @word argument is not used.
+ */
+static int sleep_on_shadow_bh(void *word)
+{
+	io_schedule();
+	return 0;
+}
+
 /*
  * If the buffer is already part of the current transaction, then there
  * is nothing we need to do.  If it is already part of a prior
@@ -634,17 +782,16 @@
 			int force_copy)
 {
 	struct buffer_head *bh;
-	transaction_t *transaction;
+	transaction_t *transaction = handle->h_transaction;
 	journal_t *journal;
 	int error;
 	char *frozen_buffer = NULL;
 	int need_copy = 0;
 	unsigned long start_lock, time_lock;
 
+	WARN_ON(!transaction);
 	if (is_handle_aborted(handle))
 		return -EROFS;
-
-	transaction = handle->h_transaction;
 	journal = transaction->t_journal;
 
 	jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy);
@@ -754,41 +901,29 @@
 		 * journaled.  If the primary copy is already going to
 		 * disk then we cannot do copy-out here. */
 
-		if (jh->b_jlist == BJ_Shadow) {
-			DEFINE_WAIT_BIT(wait, &bh->b_state, BH_Unshadow);
-			wait_queue_head_t *wqh;
-
-			wqh = bit_waitqueue(&bh->b_state, BH_Unshadow);
-
+		if (buffer_shadow(bh)) {
 			JBUFFER_TRACE(jh, "on shadow: sleep");
 			jbd_unlock_bh_state(bh);
-			/* commit wakes up all shadow buffers after IO */
-			for ( ; ; ) {
-				prepare_to_wait(wqh, &wait.wait,
-						TASK_UNINTERRUPTIBLE);
-				if (jh->b_jlist != BJ_Shadow)
-					break;
-				schedule();
-			}
-			finish_wait(wqh, &wait.wait);
+			wait_on_bit(&bh->b_state, BH_Shadow,
+				    sleep_on_shadow_bh, TASK_UNINTERRUPTIBLE);
 			goto repeat;
 		}
 
-		/* Only do the copy if the currently-owning transaction
-		 * still needs it.  If it is on the Forget list, the
-		 * committing transaction is past that stage.  The
-		 * buffer had better remain locked during the kmalloc,
-		 * but that should be true --- we hold the journal lock
-		 * still and the buffer is already on the BUF_JOURNAL
-		 * list so won't be flushed.
+		/*
+		 * Only do the copy if the currently-owning transaction still
+		 * needs it. If buffer isn't on BJ_Metadata list, the
+		 * committing transaction is past that stage (here we use the
+		 * fact that BH_Shadow is set under bh_state lock together with
+		 * refiling to BJ_Shadow list and at this point we know the
+		 * buffer doesn't have BH_Shadow set).
 		 *
 		 * Subtle point, though: if this is a get_undo_access,
 		 * then we will be relying on the frozen_data to contain
 		 * the new value of the committed_data record after the
 		 * transaction, so we HAVE to force the frozen_data copy
-		 * in that case. */
-
-		if (jh->b_jlist != BJ_Forget || force_copy) {
+		 * in that case.
+		 */
+		if (jh->b_jlist == BJ_Metadata || force_copy) {
 			JBUFFER_TRACE(jh, "generate frozen data");
 			if (!frozen_buffer) {
 				JBUFFER_TRACE(jh, "allocate memory for buffer");
@@ -915,14 +1050,16 @@
 int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
 {
 	transaction_t *transaction = handle->h_transaction;
-	journal_t *journal = transaction->t_journal;
+	journal_t *journal;
 	struct journal_head *jh = jbd2_journal_add_journal_head(bh);
 	int err;
 
 	jbd_debug(5, "journal_head %p\n", jh);
+	WARN_ON(!transaction);
 	err = -EROFS;
 	if (is_handle_aborted(handle))
 		goto out;
+	journal = transaction->t_journal;
 	err = 0;
 
 	JBUFFER_TRACE(jh, "entry");
@@ -1128,12 +1265,14 @@
 int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
 {
 	transaction_t *transaction = handle->h_transaction;
-	journal_t *journal = transaction->t_journal;
+	journal_t *journal;
 	struct journal_head *jh;
 	int ret = 0;
 
+	WARN_ON(!transaction);
 	if (is_handle_aborted(handle))
-		goto out;
+		return -EROFS;
+	journal = transaction->t_journal;
 	jh = jbd2_journal_grab_journal_head(bh);
 	if (!jh) {
 		ret = -EUCLEAN;
@@ -1227,7 +1366,7 @@
 
 	JBUFFER_TRACE(jh, "file as BJ_Metadata");
 	spin_lock(&journal->j_list_lock);
-	__jbd2_journal_file_buffer(jh, handle->h_transaction, BJ_Metadata);
+	__jbd2_journal_file_buffer(jh, transaction, BJ_Metadata);
 	spin_unlock(&journal->j_list_lock);
 out_unlock_bh:
 	jbd_unlock_bh_state(bh);
@@ -1258,12 +1397,17 @@
 int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
 {
 	transaction_t *transaction = handle->h_transaction;
-	journal_t *journal = transaction->t_journal;
+	journal_t *journal;
 	struct journal_head *jh;
 	int drop_reserve = 0;
 	int err = 0;
 	int was_modified = 0;
 
+	WARN_ON(!transaction);
+	if (is_handle_aborted(handle))
+		return -EROFS;
+	journal = transaction->t_journal;
+
 	BUFFER_TRACE(bh, "entry");
 
 	jbd_lock_bh_state(bh);
@@ -1290,7 +1434,7 @@
 	 */
 	jh->b_modified = 0;
 
-	if (jh->b_transaction == handle->h_transaction) {
+	if (jh->b_transaction == transaction) {
 		J_ASSERT_JH(jh, !jh->b_frozen_data);
 
 		/* If we are forgetting a buffer which is already part
@@ -1385,19 +1529,21 @@
 int jbd2_journal_stop(handle_t *handle)
 {
 	transaction_t *transaction = handle->h_transaction;
-	journal_t *journal = transaction->t_journal;
-	int err, wait_for_commit = 0;
+	journal_t *journal;
+	int err = 0, wait_for_commit = 0;
 	tid_t tid;
 	pid_t pid;
 
+	if (!transaction)
+		goto free_and_exit;
+	journal = transaction->t_journal;
+
 	J_ASSERT(journal_current_handle() == handle);
 
 	if (is_handle_aborted(handle))
 		err = -EIO;
-	else {
+	else
 		J_ASSERT(atomic_read(&transaction->t_updates) > 0);
-		err = 0;
-	}
 
 	if (--handle->h_ref > 0) {
 		jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
@@ -1407,7 +1553,7 @@
 
 	jbd_debug(4, "Handle %p going down\n", handle);
 	trace_jbd2_handle_stats(journal->j_fs_dev->bd_dev,
-				handle->h_transaction->t_tid,
+				transaction->t_tid,
 				handle->h_type, handle->h_line_no,
 				jiffies - handle->h_start_jiffies,
 				handle->h_sync, handle->h_requested_credits,
@@ -1518,33 +1664,13 @@
 
 	lock_map_release(&handle->h_lockdep_map);
 
+	if (handle->h_rsv_handle)
+		jbd2_journal_free_reserved(handle->h_rsv_handle);
+free_and_exit:
 	jbd2_free_handle(handle);
 	return err;
 }
 
-/**
- * int jbd2_journal_force_commit() - force any uncommitted transactions
- * @journal: journal to force
- *
- * For synchronous operations: force any uncommitted transactions
- * to disk.  May seem kludgy, but it reuses all the handle batching
- * code in a very simple manner.
- */
-int jbd2_journal_force_commit(journal_t *journal)
-{
-	handle_t *handle;
-	int ret;
-
-	handle = jbd2_journal_start(journal, 1);
-	if (IS_ERR(handle)) {
-		ret = PTR_ERR(handle);
-	} else {
-		handle->h_sync = 1;
-		ret = jbd2_journal_stop(handle);
-	}
-	return ret;
-}
-
 /*
  *
  * List management code snippets: various functions for manipulating the
@@ -1601,10 +1727,10 @@
  * Remove a buffer from the appropriate transaction list.
  *
  * Note that this function can *change* the value of
- * bh->b_transaction->t_buffers, t_forget, t_iobuf_list, t_shadow_list,
- * t_log_list or t_reserved_list.  If the caller is holding onto a copy of one
- * of these pointers, it could go bad.  Generally the caller needs to re-read
- * the pointer from the transaction_t.
+ * bh->b_transaction->t_buffers, t_forget, t_shadow_list, t_log_list or
+ * t_reserved_list.  If the caller is holding onto a copy of one of these
+ * pointers, it could go bad.  Generally the caller needs to re-read the
+ * pointer from the transaction_t.
  *
  * Called under j_list_lock.
  */
@@ -1634,15 +1760,9 @@
 	case BJ_Forget:
 		list = &transaction->t_forget;
 		break;
-	case BJ_IO:
-		list = &transaction->t_iobuf_list;
-		break;
 	case BJ_Shadow:
 		list = &transaction->t_shadow_list;
 		break;
-	case BJ_LogCtl:
-		list = &transaction->t_log_list;
-		break;
 	case BJ_Reserved:
 		list = &transaction->t_reserved_list;
 		break;
@@ -2034,18 +2154,23 @@
  * void jbd2_journal_invalidatepage()
  * @journal: journal to use for flush...
  * @page:    page to flush
- * @offset:  length of page to invalidate.
+ * @offset:  start of the range to invalidate
+ * @length:  length of the range to invalidate
  *
- * Reap page buffers containing data after offset in page. Can return -EBUSY
- * if buffers are part of the committing transaction and the page is straddling
- * i_size. Caller then has to wait for current commit and try again.
+ * Reap page buffers containing data in the specified range in page.
+ * Can return -EBUSY if buffers are part of the committing transaction and
+ * the page is straddling i_size. Caller then has to wait for current commit
+ * and try again.
  */
 int jbd2_journal_invalidatepage(journal_t *journal,
 				struct page *page,
-				unsigned long offset)
+				unsigned int offset,
+				unsigned int length)
 {
 	struct buffer_head *head, *bh, *next;
+	unsigned int stop = offset + length;
 	unsigned int curr_off = 0;
+	int partial_page = (offset || length < PAGE_CACHE_SIZE);
 	int may_free = 1;
 	int ret = 0;
 
@@ -2054,6 +2179,8 @@
 	if (!page_has_buffers(page))
 		return 0;
 
+	BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
+
 	/* We will potentially be playing with lists other than just the
 	 * data lists (especially for journaled data mode), so be
 	 * cautious in our locking. */
@@ -2063,10 +2190,13 @@
 		unsigned int next_off = curr_off + bh->b_size;
 		next = bh->b_this_page;
 
+		if (next_off > stop)
+			return 0;
+
 		if (offset <= curr_off) {
 			/* This block is wholly outside the truncation point */
 			lock_buffer(bh);
-			ret = journal_unmap_buffer(journal, bh, offset > 0);
+			ret = journal_unmap_buffer(journal, bh, partial_page);
 			unlock_buffer(bh);
 			if (ret < 0)
 				return ret;
@@ -2077,7 +2207,7 @@
 
 	} while (bh != head);
 
-	if (!offset) {
+	if (!partial_page) {
 		if (may_free && try_to_free_buffers(page))
 			J_ASSERT(!page_has_buffers(page));
 	}
@@ -2138,15 +2268,9 @@
 	case BJ_Forget:
 		list = &transaction->t_forget;
 		break;
-	case BJ_IO:
-		list = &transaction->t_iobuf_list;
-		break;
 	case BJ_Shadow:
 		list = &transaction->t_shadow_list;
 		break;
-	case BJ_LogCtl:
-		list = &transaction->t_log_list;
-		break;
 	case BJ_Reserved:
 		list = &transaction->t_reserved_list;
 		break;
@@ -2248,10 +2372,12 @@
 int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode)
 {
 	transaction_t *transaction = handle->h_transaction;
-	journal_t *journal = transaction->t_journal;
+	journal_t *journal;
 
+	WARN_ON(!transaction);
 	if (is_handle_aborted(handle))
-		return -EIO;
+		return -EROFS;
+	journal = transaction->t_journal;
 
 	jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino,
 			transaction->t_tid);

diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index acd46a4..e3aac22 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c

@@ -22,7 +22,7 @@
 #include <linux/time.h>
 #include "nodelist.h"
 
-static int jffs2_readdir (struct file *, void *, filldir_t);
+static int jffs2_readdir (struct file *, struct dir_context *);
 
 static int jffs2_create (struct inode *,struct dentry *,umode_t,
 			 bool);
@@ -40,7 +40,7 @@
 const struct file_operations jffs2_dir_operations =
 {
 	.read =		generic_read_dir,
-	.readdir =	jffs2_readdir,
+	.iterate =	jffs2_readdir,
 	.unlocked_ioctl=jffs2_ioctl,
 	.fsync =	jffs2_fsync,
 	.llseek =	generic_file_llseek,
@@ -114,60 +114,40 @@
 /***********************************************************************/
 
 
-static int jffs2_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int jffs2_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct jffs2_inode_info *f;
-	struct inode *inode = file_inode(filp);
+	struct inode *inode = file_inode(file);
+	struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
 	struct jffs2_full_dirent *fd;
-	unsigned long offset, curofs;
+	unsigned long curofs = 1;
 
-	jffs2_dbg(1, "jffs2_readdir() for dir_i #%lu\n",
-		  file_inode(filp)->i_ino);
+	jffs2_dbg(1, "jffs2_readdir() for dir_i #%lu\n", inode->i_ino);
 
-	f = JFFS2_INODE_INFO(inode);
+	if (!dir_emit_dots(file, ctx))
+		return 0;
 
-	offset = filp->f_pos;
-
-	if (offset == 0) {
-		jffs2_dbg(1, "Dirent 0: \".\", ino #%lu\n", inode->i_ino);
-		if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0)
-			goto out;
-		offset++;
-	}
-	if (offset == 1) {
-		unsigned long pino = parent_ino(filp->f_path.dentry);
-		jffs2_dbg(1, "Dirent 1: \"..\", ino #%lu\n", pino);
-		if (filldir(dirent, "..", 2, 1, pino, DT_DIR) < 0)
-			goto out;
-		offset++;
-	}
-
-	curofs=1;
 	mutex_lock(&f->sem);
 	for (fd = f->dents; fd; fd = fd->next) {
-
 		curofs++;
-		/* First loop: curofs = 2; offset = 2 */
-		if (curofs < offset) {
+		/* First loop: curofs = 2; pos = 2 */
+		if (curofs < ctx->pos) {
 			jffs2_dbg(2, "Skipping dirent: \"%s\", ino #%u, type %d, because curofs %ld < offset %ld\n",
-				  fd->name, fd->ino, fd->type, curofs, offset);
+				  fd->name, fd->ino, fd->type, curofs, (unsigned long)ctx->pos);
 			continue;
 		}
 		if (!fd->ino) {
 			jffs2_dbg(2, "Skipping deletion dirent \"%s\"\n",
 				  fd->name);
-			offset++;
+			ctx->pos++;
 			continue;
 		}
 		jffs2_dbg(2, "Dirent %ld: \"%s\", ino #%u, type %d\n",
-			  offset, fd->name, fd->ino, fd->type);
-		if (filldir(dirent, fd->name, strlen(fd->name), offset, fd->ino, fd->type) < 0)
+			  (unsigned long)ctx->pos, fd->name, fd->ino, fd->type);
+		if (!dir_emit(ctx, fd->name, strlen(fd->name), fd->ino, fd->type))
 			break;
-		offset++;
+		ctx->pos++;
 	}
 	mutex_unlock(&f->sem);
- out:
-	filp->f_pos = offset;
 	return 0;
 }
 

diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c
index 0ddbece..9f4ed13 100644
--- a/fs/jfs/jfs_dtree.c
+++ b/fs/jfs/jfs_dtree.c

@@ -3002,9 +3002,9 @@
  * return: offset = (pn, index) of start entry
  *	of next jfs_readdir()/dtRead()
  */
-int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+int jfs_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct inode *ip = file_inode(filp);
+	struct inode *ip = file_inode(file);
 	struct nls_table *codepage = JFS_SBI(ip->i_sb)->nls_tab;
 	int rc = 0;
 	loff_t dtpos;	/* legacy OS/2 style position */
@@ -3033,7 +3033,7 @@
 	int overflow, fix_page, page_fixed = 0;
 	static int unique_pos = 2;	/* If we can't fix broken index */
 
-	if (filp->f_pos == DIREND)
+	if (ctx->pos == DIREND)
 		return 0;
 
 	if (DO_INDEX(ip)) {
@@ -3045,7 +3045,7 @@
 		 */
 		do_index = 1;
 
-		dir_index = (u32) filp->f_pos;
+		dir_index = (u32) ctx->pos;
 
 		if (dir_index > 1) {
 			struct dir_table_slot dirtab_slot;
@@ -3053,25 +3053,25 @@
 			if (dtEmpty(ip) ||
 			    (dir_index >= JFS_IP(ip)->next_index)) {
 				/* Stale position.  Directory has shrunk */
-				filp->f_pos = DIREND;
+				ctx->pos = DIREND;
 				return 0;
 			}
 		      repeat:
 			rc = read_index(ip, dir_index, &dirtab_slot);
 			if (rc) {
-				filp->f_pos = DIREND;
+				ctx->pos = DIREND;
 				return rc;
 			}
 			if (dirtab_slot.flag == DIR_INDEX_FREE) {
 				if (loop_count++ > JFS_IP(ip)->next_index) {
 					jfs_err("jfs_readdir detected "
 						   "infinite loop!");
-					filp->f_pos = DIREND;
+					ctx->pos = DIREND;
 					return 0;
 				}
 				dir_index = le32_to_cpu(dirtab_slot.addr2);
 				if (dir_index == -1) {
-					filp->f_pos = DIREND;
+					ctx->pos = DIREND;
 					return 0;
 				}
 				goto repeat;
@@ -3080,13 +3080,13 @@
 			index = dirtab_slot.slot;
 			DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
 			if (rc) {
-				filp->f_pos = DIREND;
+				ctx->pos = DIREND;
 				return 0;
 			}
 			if (p->header.flag & BT_INTERNAL) {
 				jfs_err("jfs_readdir: bad index table");
 				DT_PUTPAGE(mp);
-				filp->f_pos = -1;
+				ctx->pos = -1;
 				return 0;
 			}
 		} else {
@@ -3094,23 +3094,22 @@
 				/*
 				 * self "."
 				 */
-				filp->f_pos = 0;
-				if (filldir(dirent, ".", 1, 0, ip->i_ino,
-					    DT_DIR))
+				ctx->pos = 0;
+				if (!dir_emit(ctx, ".", 1, ip->i_ino, DT_DIR))
 					return 0;
 			}
 			/*
 			 * parent ".."
 			 */
-			filp->f_pos = 1;
-			if (filldir(dirent, "..", 2, 1, PARENT(ip), DT_DIR))
+			ctx->pos = 1;
+			if (!dir_emit(ctx, "..", 2, PARENT(ip), DT_DIR))
 				return 0;
 
 			/*
 			 * Find first entry of left-most leaf
 			 */
 			if (dtEmpty(ip)) {
-				filp->f_pos = DIREND;
+				ctx->pos = DIREND;
 				return 0;
 			}
 
@@ -3128,23 +3127,19 @@
 		 * pn > 0:		Real entries, pn=1 -> leftmost page
 		 * pn = index = -1:	No more entries
 		 */
-		dtpos = filp->f_pos;
+		dtpos = ctx->pos;
 		if (dtpos == 0) {
 			/* build "." entry */
-
-			if (filldir(dirent, ".", 1, filp->f_pos, ip->i_ino,
-				    DT_DIR))
+			if (!dir_emit(ctx, ".", 1, ip->i_ino, DT_DIR))
 				return 0;
 			dtoffset->index = 1;
-			filp->f_pos = dtpos;
+			ctx->pos = dtpos;
 		}
 
 		if (dtoffset->pn == 0) {
 			if (dtoffset->index == 1) {
 				/* build ".." entry */
-
-				if (filldir(dirent, "..", 2, filp->f_pos,
-					    PARENT(ip), DT_DIR))
+				if (!dir_emit(ctx, "..", 2, PARENT(ip), DT_DIR))
 					return 0;
 			} else {
 				jfs_err("jfs_readdir called with "
@@ -3152,18 +3147,18 @@
 			}
 			dtoffset->pn = 1;
 			dtoffset->index = 0;
-			filp->f_pos = dtpos;
+			ctx->pos = dtpos;
 		}
 
 		if (dtEmpty(ip)) {
-			filp->f_pos = DIREND;
+			ctx->pos = DIREND;
 			return 0;
 		}
 
-		if ((rc = dtReadNext(ip, &filp->f_pos, &btstack))) {
+		if ((rc = dtReadNext(ip, &ctx->pos, &btstack))) {
 			jfs_err("jfs_readdir: unexpected rc = %d "
 				"from dtReadNext", rc);
-			filp->f_pos = DIREND;
+			ctx->pos = DIREND;
 			return 0;
 		}
 		/* get start leaf page and index */
@@ -3171,7 +3166,7 @@
 
 		/* offset beyond directory eof ? */
 		if (bn < 0) {
-			filp->f_pos = DIREND;
+			ctx->pos = DIREND;
 			return 0;
 		}
 	}
@@ -3180,7 +3175,7 @@
 	if (dirent_buf == 0) {
 		DT_PUTPAGE(mp);
 		jfs_warn("jfs_readdir: __get_free_page failed!");
-		filp->f_pos = DIREND;
+		ctx->pos = DIREND;
 		return -ENOMEM;
 	}
 
@@ -3295,9 +3290,9 @@
 
 		jfs_dirent = (struct jfs_dirent *) dirent_buf;
 		while (jfs_dirents--) {
-			filp->f_pos = jfs_dirent->position;
-			if (filldir(dirent, jfs_dirent->name,
-				    jfs_dirent->name_len, filp->f_pos,
+			ctx->pos = jfs_dirent->position;
+			if (!dir_emit(ctx, jfs_dirent->name,
+				    jfs_dirent->name_len,
 				    jfs_dirent->ino, DT_UNKNOWN))
 				goto out;
 			jfs_dirent = next_jfs_dirent(jfs_dirent);
@@ -3309,7 +3304,7 @@
 		}
 
 		if (!overflow && (bn == 0)) {
-			filp->f_pos = DIREND;
+			ctx->pos = DIREND;
 			break;
 		}
 

diff --git a/fs/jfs/jfs_dtree.h b/fs/jfs/jfs_dtree.h
index 2545bb3..fd4169e 100644
--- a/fs/jfs/jfs_dtree.h
+++ b/fs/jfs/jfs_dtree.h

@@ -265,5 +265,5 @@
 extern int dtModify(tid_t tid, struct inode *ip, struct component_name * key,
 		    ino_t * orig_ino, ino_t new_ino, int flag);
 
-extern int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir);
+extern int jfs_readdir(struct file *file, struct dir_context *ctx);
 #endif				/* !_H_JFS_DTREE */

diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 6740d34..9e3aaff 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c

@@ -571,9 +571,10 @@
 	return ret;
 }
 
-static void metapage_invalidatepage(struct page *page, unsigned long offset)
+static void metapage_invalidatepage(struct page *page, unsigned int offset,
+				    unsigned int length)
 {
-	BUG_ON(offset);
+	BUG_ON(offset || length < PAGE_CACHE_SIZE);
 
 	BUG_ON(PageWriteback(page));
 

diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 3b91a7a..89186b7 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c

@@ -1529,7 +1529,7 @@
 
 const struct file_operations jfs_dir_operations = {
 	.read		= generic_read_dir,
-	.readdir	= jfs_readdir,
+	.iterate	= jfs_readdir,
 	.fsync		= jfs_fsync,
 	.unlocked_ioctl = jfs_ioctl,
 #ifdef CONFIG_COMPAT

diff --git a/fs/libfs.c b/fs/libfs.c
index 916da8c..c3a0837 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c

@@ -135,60 +135,40 @@
  * both impossible due to the lock on directory.
  */
 
-int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir)
+int dcache_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct dentry *dentry = filp->f_path.dentry;
-	struct dentry *cursor = filp->private_data;
+	struct dentry *dentry = file->f_path.dentry;
+	struct dentry *cursor = file->private_data;
 	struct list_head *p, *q = &cursor->d_u.d_child;
-	ino_t ino;
-	int i = filp->f_pos;
 
-	switch (i) {
-		case 0:
-			ino = dentry->d_inode->i_ino;
-			if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
-				break;
-			filp->f_pos++;
-			i++;
-			/* fallthrough */
-		case 1:
-			ino = parent_ino(dentry);
-			if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0)
-				break;
-			filp->f_pos++;
-			i++;
-			/* fallthrough */
-		default:
-			spin_lock(&dentry->d_lock);
-			if (filp->f_pos == 2)
-				list_move(q, &dentry->d_subdirs);
+	if (!dir_emit_dots(file, ctx))
+		return 0;
+	spin_lock(&dentry->d_lock);
+	if (ctx->pos == 2)
+		list_move(q, &dentry->d_subdirs);
 
-			for (p=q->next; p != &dentry->d_subdirs; p=p->next) {
-				struct dentry *next;
-				next = list_entry(p, struct dentry, d_u.d_child);
-				spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED);
-				if (!simple_positive(next)) {
-					spin_unlock(&next->d_lock);
-					continue;
-				}
+	for (p = q->next; p != &dentry->d_subdirs; p = p->next) {
+		struct dentry *next = list_entry(p, struct dentry, d_u.d_child);
+		spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED);
+		if (!simple_positive(next)) {
+			spin_unlock(&next->d_lock);
+			continue;
+		}
 
-				spin_unlock(&next->d_lock);
-				spin_unlock(&dentry->d_lock);
-				if (filldir(dirent, next->d_name.name, 
-					    next->d_name.len, filp->f_pos, 
-					    next->d_inode->i_ino, 
-					    dt_type(next->d_inode)) < 0)
-					return 0;
-				spin_lock(&dentry->d_lock);
-				spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED);
-				/* next is still alive */
-				list_move(q, p);
-				spin_unlock(&next->d_lock);
-				p = q;
-				filp->f_pos++;
-			}
-			spin_unlock(&dentry->d_lock);
+		spin_unlock(&next->d_lock);
+		spin_unlock(&dentry->d_lock);
+		if (!dir_emit(ctx, next->d_name.name, next->d_name.len,
+			      next->d_inode->i_ino, dt_type(next->d_inode)))
+			return 0;
+		spin_lock(&dentry->d_lock);
+		spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED);
+		/* next is still alive */
+		list_move(q, p);
+		spin_unlock(&next->d_lock);
+		p = q;
+		ctx->pos++;
 	}
+	spin_unlock(&dentry->d_lock);
 	return 0;
 }
 
@@ -202,7 +182,7 @@
 	.release	= dcache_dir_close,
 	.llseek		= dcache_dir_lseek,
 	.read		= generic_read_dir,
-	.readdir	= dcache_readdir,
+	.iterate	= dcache_readdir,
 	.fsync		= noop_fsync,
 };
 

diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index b827510..6bdc347 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c

@@ -281,17 +281,23 @@
 
 /* FIXME: readdir currently has it's own dir_walk code.  I don't see a good
  * way to combine the two copies */
-#define IMPLICIT_NODES 2
-static int __logfs_readdir(struct file *file, void *buf, filldir_t filldir)
+static int logfs_readdir(struct file *file, struct dir_context *ctx)
 {
 	struct inode *dir = file_inode(file);
-	loff_t pos = file->f_pos - IMPLICIT_NODES;
+	loff_t pos;
 	struct page *page;
 	struct logfs_disk_dentry *dd;
-	int full;
 
+	if (ctx->pos < 0)
+		return -EINVAL;
+
+	if (!dir_emit_dots(file, ctx))
+		return 0;
+
+	pos = ctx->pos - 2;
 	BUG_ON(pos < 0);
-	for (;; pos++) {
+	for (;; pos++, ctx->pos++) {
+		bool full;
 		if (beyond_eof(dir, pos))
 			break;
 		if (!logfs_exist_block(dir, pos)) {
@@ -306,42 +312,17 @@
 		dd = kmap(page);
 		BUG_ON(dd->namelen == 0);
 
-		full = filldir(buf, (char *)dd->name, be16_to_cpu(dd->namelen),
-				pos, be64_to_cpu(dd->ino), dd->type);
+		full = !dir_emit(ctx, (char *)dd->name,
+				be16_to_cpu(dd->namelen),
+				be64_to_cpu(dd->ino), dd->type);
 		kunmap(page);
 		page_cache_release(page);
 		if (full)
 			break;
 	}
-
-	file->f_pos = pos + IMPLICIT_NODES;
 	return 0;
 }
 
-static int logfs_readdir(struct file *file, void *buf, filldir_t filldir)
-{
-	struct inode *inode = file_inode(file);
-	ino_t pino = parent_ino(file->f_dentry);
-	int err;
-
-	if (file->f_pos < 0)
-		return -EINVAL;
-
-	if (file->f_pos == 0) {
-		if (filldir(buf, ".", 1, 1, inode->i_ino, DT_DIR) < 0)
-			return 0;
-		file->f_pos++;
-	}
-	if (file->f_pos == 1) {
-		if (filldir(buf, "..", 2, 2, pino, DT_DIR) < 0)
-			return 0;
-		file->f_pos++;
-	}
-
-	err = __logfs_readdir(file, buf, filldir);
-	return err;
-}
-
 static void logfs_set_name(struct logfs_disk_dentry *dd, struct qstr *name)
 {
 	dd->namelen = cpu_to_be16(name->len);
@@ -814,7 +795,7 @@
 const struct file_operations logfs_dir_fops = {
 	.fsync		= logfs_fsync,
 	.unlocked_ioctl	= logfs_ioctl,
-	.readdir	= logfs_readdir,
+	.iterate	= logfs_readdir,
 	.read		= generic_read_dir,
 	.llseek		= default_llseek,
 };

diff --git a/fs/logfs/file.c b/fs/logfs/file.c
index c2219a6..57914fc 100644
--- a/fs/logfs/file.c
+++ b/fs/logfs/file.c

@@ -159,7 +159,8 @@
 	return __logfs_writepage(page);
 }
 
-static void logfs_invalidatepage(struct page *page, unsigned long offset)
+static void logfs_invalidatepage(struct page *page, unsigned int offset,
+				 unsigned int length)
 {
 	struct logfs_block *block = logfs_block(page);
 

diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c
index 038da09..d448a77 100644
--- a/fs/logfs/segment.c
+++ b/fs/logfs/segment.c

@@ -884,7 +884,8 @@
 	return area;
 }
 
-static void map_invalidatepage(struct page *page, unsigned long l)
+static void map_invalidatepage(struct page *page, unsigned int o,
+			       unsigned int l)
 {
 	return;
 }

diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index a9ed6f3..08c4429 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c

@@ -16,12 +16,12 @@
 typedef struct minix_dir_entry minix_dirent;
 typedef struct minix3_dir_entry minix3_dirent;
 
-static int minix_readdir(struct file *, void *, filldir_t);
+static int minix_readdir(struct file *, struct dir_context *);
 
 const struct file_operations minix_dir_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
-	.readdir	= minix_readdir,
+	.iterate	= minix_readdir,
 	.fsync		= generic_file_fsync,
 };
 
@@ -82,22 +82,23 @@
 	return (void*)((char*)de + sbi->s_dirsize);
 }
 
-static int minix_readdir(struct file * filp, void * dirent, filldir_t filldir)
+static int minix_readdir(struct file *file, struct dir_context *ctx)
 {
-	unsigned long pos = filp->f_pos;
-	struct inode *inode = file_inode(filp);
+	struct inode *inode = file_inode(file);
 	struct super_block *sb = inode->i_sb;
-	unsigned offset = pos & ~PAGE_CACHE_MASK;
-	unsigned long n = pos >> PAGE_CACHE_SHIFT;
-	unsigned long npages = dir_pages(inode);
 	struct minix_sb_info *sbi = minix_sb(sb);
 	unsigned chunk_size = sbi->s_dirsize;
-	char *name;
-	__u32 inumber;
+	unsigned long npages = dir_pages(inode);
+	unsigned long pos = ctx->pos;
+	unsigned offset;
+	unsigned long n;
 
-	pos = (pos + chunk_size-1) & ~(chunk_size-1);
+	ctx->pos = pos = (pos + chunk_size-1) & ~(chunk_size-1);
 	if (pos >= inode->i_size)
-		goto done;
+		return 0;
+
+	offset = pos & ~PAGE_CACHE_MASK;
+	n = pos >> PAGE_CACHE_SHIFT;
 
 	for ( ; n < npages; n++, offset = 0) {
 		char *p, *kaddr, *limit;
@@ -109,6 +110,8 @@
 		p = kaddr+offset;
 		limit = kaddr + minix_last_byte(inode, n) - chunk_size;
 		for ( ; p <= limit; p = minix_next_entry(p, sbi)) {
+			const char *name;
+			__u32 inumber;
 			if (sbi->s_version == MINIX_V3) {
 				minix3_dirent *de3 = (minix3_dirent *)p;
 				name = de3->name;
@@ -119,24 +122,17 @@
 				inumber = de->inode;
 			}
 			if (inumber) {
-				int over;
-
 				unsigned l = strnlen(name, sbi->s_namelen);
-				offset = p - kaddr;
-				over = filldir(dirent, name, l,
-					(n << PAGE_CACHE_SHIFT) | offset,
-					inumber, DT_UNKNOWN);
-				if (over) {
+				if (!dir_emit(ctx, name, l,
+					      inumber, DT_UNKNOWN)) {
 					dir_put_page(page);
-					goto done;
+					return 0;
 				}
 			}
+			ctx->pos += chunk_size;
 		}
 		dir_put_page(page);
 	}
-
-done:
-	filp->f_pos = (n << PAGE_CACHE_SHIFT) | offset;
 	return 0;
 }
 

diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 6792ce1..0e7f002 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c

@@ -23,12 +23,12 @@
 
 #include "ncp_fs.h"
 
-static void ncp_read_volume_list(struct file *, void *, filldir_t,
+static void ncp_read_volume_list(struct file *, struct dir_context *,
 				struct ncp_cache_control *);
-static void ncp_do_readdir(struct file *, void *, filldir_t,
+static void ncp_do_readdir(struct file *, struct dir_context *,
 				struct ncp_cache_control *);
 
-static int ncp_readdir(struct file *, void *, filldir_t);
+static int ncp_readdir(struct file *, struct dir_context *);
 
 static int ncp_create(struct inode *, struct dentry *, umode_t, bool);
 static struct dentry *ncp_lookup(struct inode *, struct dentry *, unsigned int);
@@ -49,7 +49,7 @@
 {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
-	.readdir	= ncp_readdir,
+	.iterate	= ncp_readdir,
 	.unlocked_ioctl	= ncp_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= ncp_compat_ioctl,
@@ -424,9 +424,9 @@
 	return ncp_date_dos2unix(i.modifyTime, i.modifyDate);
 }
 
-static int ncp_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int ncp_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct dentry *dentry = filp->f_path.dentry;
+	struct dentry *dentry = file->f_path.dentry;
 	struct inode *inode = dentry->d_inode;
 	struct page *page = NULL;
 	struct ncp_server *server = NCP_SERVER(inode);
@@ -440,7 +440,7 @@
 
 	DDPRINTK("ncp_readdir: reading %s/%s, pos=%d\n",
 		dentry->d_parent->d_name.name, dentry->d_name.name,
-		(int) filp->f_pos);
+		(int) ctx->pos);
 
 	result = -EIO;
 	/* Do not generate '.' and '..' when server is dead. */
@@ -448,16 +448,8 @@
 		goto out;
 
 	result = 0;
-	if (filp->f_pos == 0) {
-		if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR))
-			goto out;
-		filp->f_pos = 1;
-	}
-	if (filp->f_pos == 1) {
-		if (filldir(dirent, "..", 2, 1, parent_ino(dentry), DT_DIR))
-			goto out;
-		filp->f_pos = 2;
-	}
+	if (!dir_emit_dots(file, ctx))
+		goto out;
 
 	page = grab_cache_page(&inode->i_data, 0);
 	if (!page)
@@ -469,7 +461,7 @@
 	if (!PageUptodate(page) || !ctl.head.eof)
 		goto init_cache;
 
-	if (filp->f_pos == 2) {
+	if (ctx->pos == 2) {
 		if (jiffies - ctl.head.time >= NCP_MAX_AGE(server))
 			goto init_cache;
 
@@ -479,10 +471,10 @@
 			goto init_cache;
 	}
 
-	if (filp->f_pos > ctl.head.end)
+	if (ctx->pos > ctl.head.end)
 		goto finished;
 
-	ctl.fpos = filp->f_pos + (NCP_DIRCACHE_START - 2);
+	ctl.fpos = ctx->pos + (NCP_DIRCACHE_START - 2);
 	ctl.ofs  = ctl.fpos / NCP_DIRCACHE_SIZE;
 	ctl.idx  = ctl.fpos % NCP_DIRCACHE_SIZE;
 
@@ -497,21 +489,21 @@
 		}
 		while (ctl.idx < NCP_DIRCACHE_SIZE) {
 			struct dentry *dent;
-			int res;
+			bool over;
 
 			dent = ncp_dget_fpos(ctl.cache->dentry[ctl.idx],
-						dentry, filp->f_pos);
+						dentry, ctx->pos);
 			if (!dent)
 				goto invalid_cache;
-			res = filldir(dirent, dent->d_name.name,
-					dent->d_name.len, filp->f_pos,
+			over = !dir_emit(ctx, dent->d_name.name,
+					dent->d_name.len,
 					dent->d_inode->i_ino, DT_UNKNOWN);
 			dput(dent);
-			if (res)
+			if (over)
 				goto finished;
-			filp->f_pos += 1;
+			ctx->pos += 1;
 			ctl.idx += 1;
-			if (filp->f_pos > ctl.head.end)
+			if (ctx->pos > ctl.head.end)
 				goto finished;
 		}
 		if (ctl.page) {
@@ -548,9 +540,9 @@
 	ctl.valid  = 1;
 read_really:
 	if (ncp_is_server_root(inode)) {
-		ncp_read_volume_list(filp, dirent, filldir, &ctl);
+		ncp_read_volume_list(file, ctx, &ctl);
 	} else {
-		ncp_do_readdir(filp, dirent, filldir, &ctl);
+		ncp_do_readdir(file, ctx, &ctl);
 	}
 	ctl.head.end = ctl.fpos - 1;
 	ctl.head.eof = ctl.valid;
@@ -573,11 +565,11 @@
 }
 
 static int
-ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
+ncp_fill_cache(struct file *file, struct dir_context *ctx,
 		struct ncp_cache_control *ctrl, struct ncp_entry_info *entry,
 		int inval_childs)
 {
-	struct dentry *newdent, *dentry = filp->f_path.dentry;
+	struct dentry *newdent, *dentry = file->f_path.dentry;
 	struct inode *dir = dentry->d_inode;
 	struct ncp_cache_control ctl = *ctrl;
 	struct qstr qname;
@@ -666,15 +658,15 @@
 end_advance:
 	if (!valid)
 		ctl.valid = 0;
-	if (!ctl.filled && (ctl.fpos == filp->f_pos)) {
+	if (!ctl.filled && (ctl.fpos == ctx->pos)) {
 		if (!ino)
 			ino = find_inode_number(dentry, &qname);
 		if (!ino)
 			ino = iunique(dir->i_sb, 2);
-		ctl.filled = filldir(dirent, qname.name, qname.len,
-				     filp->f_pos, ino, DT_UNKNOWN);
+		ctl.filled = !dir_emit(ctx, qname.name, qname.len,
+				     ino, DT_UNKNOWN);
 		if (!ctl.filled)
-			filp->f_pos += 1;
+			ctx->pos += 1;
 	}
 	ctl.fpos += 1;
 	ctl.idx  += 1;
@@ -683,10 +675,10 @@
 }
 
 static void
-ncp_read_volume_list(struct file *filp, void *dirent, filldir_t filldir,
+ncp_read_volume_list(struct file *file, struct dir_context *ctx,
 			struct ncp_cache_control *ctl)
 {
-	struct dentry *dentry = filp->f_path.dentry;
+	struct dentry *dentry = file->f_path.dentry;
 	struct inode *inode = dentry->d_inode;
 	struct ncp_server *server = NCP_SERVER(inode);
 	struct ncp_volume_info info;
@@ -694,7 +686,7 @@
 	int i;
 
 	DPRINTK("ncp_read_volume_list: pos=%ld\n",
-			(unsigned long) filp->f_pos);
+			(unsigned long) ctx->pos);
 
 	for (i = 0; i < NCP_NUMBER_OF_VOLUMES; i++) {
 		int inval_dentry;
@@ -715,16 +707,16 @@
 		}
 		inval_dentry = ncp_update_known_namespace(server, entry.i.volNumber, NULL);
 		entry.volume = entry.i.volNumber;
-		if (!ncp_fill_cache(filp, dirent, filldir, ctl, &entry, inval_dentry))
+		if (!ncp_fill_cache(file, ctx, ctl, &entry, inval_dentry))
 			return;
 	}
 }
 
 static void
-ncp_do_readdir(struct file *filp, void *dirent, filldir_t filldir,
+ncp_do_readdir(struct file *file, struct dir_context *ctx,
 						struct ncp_cache_control *ctl)
 {
-	struct dentry *dentry = filp->f_path.dentry;
+	struct dentry *dentry = file->f_path.dentry;
 	struct inode *dir = dentry->d_inode;
 	struct ncp_server *server = NCP_SERVER(dir);
 	struct nw_search_sequence seq;
@@ -736,7 +728,7 @@
 
 	DPRINTK("ncp_do_readdir: %s/%s, fpos=%ld\n",
 		dentry->d_parent->d_name.name, dentry->d_name.name,
-		(unsigned long) filp->f_pos);
+		(unsigned long) ctx->pos);
 	PPRINTK("ncp_do_readdir: init %s, volnum=%d, dirent=%u\n",
 		dentry->d_name.name, NCP_FINFO(dir)->volNumber,
 		NCP_FINFO(dir)->dirEntNum);
@@ -778,7 +770,7 @@
 			rpl += onerpl;
 			rpls -= onerpl;
 			entry.volume = entry.i.volNumber;
-			if (!ncp_fill_cache(filp, dirent, filldir, ctl, &entry, 0))
+			if (!ncp_fill_cache(file, ctx, ctl, &entry, 0))
 				break;
 		}
 	} while (more);

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index e093e73..5d05141 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c

@@ -46,7 +46,7 @@
 
 static int nfs_opendir(struct inode *, struct file *);
 static int nfs_closedir(struct inode *, struct file *);
-static int nfs_readdir(struct file *, void *, filldir_t);
+static int nfs_readdir(struct file *, struct dir_context *);
 static int nfs_fsync_dir(struct file *, loff_t, loff_t, int);
 static loff_t nfs_llseek_dir(struct file *, loff_t, int);
 static void nfs_readdir_clear_array(struct page*);
@@ -54,7 +54,7 @@
 const struct file_operations nfs_dir_operations = {
 	.llseek		= nfs_llseek_dir,
 	.read		= generic_read_dir,
-	.readdir	= nfs_readdir,
+	.iterate	= nfs_readdir,
 	.open		= nfs_opendir,
 	.release	= nfs_closedir,
 	.fsync		= nfs_fsync_dir,
@@ -147,6 +147,7 @@
 typedef struct {
 	struct file	*file;
 	struct page	*page;
+	struct dir_context *ctx;
 	unsigned long	page_index;
 	u64		*dir_cookie;
 	u64		last_cookie;
@@ -252,7 +253,7 @@
 static
 int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc)
 {
-	loff_t diff = desc->file->f_pos - desc->current_index;
+	loff_t diff = desc->ctx->pos - desc->current_index;
 	unsigned int index;
 
 	if (diff < 0)
@@ -289,7 +290,7 @@
 			    || (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA))) {
 				ctx->duped = 0;
 				ctx->attr_gencount = nfsi->attr_gencount;
-			} else if (new_pos < desc->file->f_pos) {
+			} else if (new_pos < desc->ctx->pos) {
 				if (ctx->duped > 0
 				    && ctx->dup_cookie == *desc->dir_cookie) {
 					if (printk_ratelimit()) {
@@ -307,7 +308,7 @@
 				ctx->dup_cookie = *desc->dir_cookie;
 				ctx->duped = -1;
 			}
-			desc->file->f_pos = new_pos;
+			desc->ctx->pos = new_pos;
 			desc->cache_entry_index = i;
 			return 0;
 		}
@@ -405,13 +406,13 @@
 }
 
 static
-bool nfs_use_readdirplus(struct inode *dir, struct file *filp)
+bool nfs_use_readdirplus(struct inode *dir, struct dir_context *ctx)
 {
 	if (!nfs_server_capable(dir, NFS_CAP_READDIRPLUS))
 		return false;
 	if (test_and_clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(dir)->flags))
 		return true;
-	if (filp->f_pos == 0)
+	if (ctx->pos == 0)
 		return true;
 	return false;
 }
@@ -702,8 +703,7 @@
  * Once we've found the start of the dirent within a page: fill 'er up...
  */
 static 
-int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent,
-		   filldir_t filldir)
+int nfs_do_filldir(nfs_readdir_descriptor_t *desc)
 {
 	struct file	*file = desc->file;
 	int i = 0;
@@ -721,13 +721,12 @@
 		struct nfs_cache_array_entry *ent;
 
 		ent = &array->array[i];
-		if (filldir(dirent, ent->string.name, ent->string.len,
-		    file->f_pos, nfs_compat_user_ino64(ent->ino),
-		    ent->d_type) < 0) {
+		if (!dir_emit(desc->ctx, ent->string.name, ent->string.len,
+		    nfs_compat_user_ino64(ent->ino), ent->d_type)) {
 			desc->eof = 1;
 			break;
 		}
-		file->f_pos++;
+		desc->ctx->pos++;
 		if (i < (array->size-1))
 			*desc->dir_cookie = array->array[i+1].cookie;
 		else
@@ -759,8 +758,7 @@
  *	 directory in the page cache by the time we get here.
  */
 static inline
-int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
-		     filldir_t filldir)
+int uncached_readdir(nfs_readdir_descriptor_t *desc)
 {
 	struct page	*page = NULL;
 	int		status;
@@ -785,7 +783,7 @@
 	if (status < 0)
 		goto out_release;
 
-	status = nfs_do_filldir(desc, dirent, filldir);
+	status = nfs_do_filldir(desc);
 
  out:
 	dfprintk(DIRCACHE, "NFS: %s: returns %d\n",
@@ -800,35 +798,36 @@
    last cookie cache takes care of the common case of reading the
    whole directory.
  */
-static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int nfs_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct dentry	*dentry = filp->f_path.dentry;
+	struct dentry	*dentry = file->f_path.dentry;
 	struct inode	*inode = dentry->d_inode;
 	nfs_readdir_descriptor_t my_desc,
 			*desc = &my_desc;
-	struct nfs_open_dir_context *dir_ctx = filp->private_data;
+	struct nfs_open_dir_context *dir_ctx = file->private_data;
 	int res;
 
 	dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n",
 			dentry->d_parent->d_name.name, dentry->d_name.name,
-			(long long)filp->f_pos);
+			(long long)ctx->pos);
 	nfs_inc_stats(inode, NFSIOS_VFSGETDENTS);
 
 	/*
-	 * filp->f_pos points to the dirent entry number.
+	 * ctx->pos points to the dirent entry number.
 	 * *desc->dir_cookie has the cookie for the next entry. We have
 	 * to either find the entry with the appropriate number or
 	 * revalidate the cookie.
 	 */
 	memset(desc, 0, sizeof(*desc));
 
-	desc->file = filp;
+	desc->file = file;
+	desc->ctx = ctx;
 	desc->dir_cookie = &dir_ctx->dir_cookie;
 	desc->decode = NFS_PROTO(inode)->decode_dirent;
-	desc->plus = nfs_use_readdirplus(inode, filp) ? 1 : 0;
+	desc->plus = nfs_use_readdirplus(inode, ctx) ? 1 : 0;
 
 	nfs_block_sillyrename(dentry);
-	res = nfs_revalidate_mapping(inode, filp->f_mapping);
+	res = nfs_revalidate_mapping(inode, file->f_mapping);
 	if (res < 0)
 		goto out;
 
@@ -840,7 +839,7 @@
 			/* This means either end of directory */
 			if (*desc->dir_cookie && desc->eof == 0) {
 				/* Or that the server has 'lost' a cookie */
-				res = uncached_readdir(desc, dirent, filldir);
+				res = uncached_readdir(desc);
 				if (res == 0)
 					continue;
 			}
@@ -857,7 +856,7 @@
 		if (res < 0)
 			break;
 
-		res = nfs_do_filldir(desc, dirent, filldir);
+		res = nfs_do_filldir(desc);
 		if (res < 0)
 			break;
 	} while (!desc->eof);

diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index a87a44f..6b4a79f 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c

@@ -451,11 +451,13 @@
  * - Called if either PG_private or PG_fscache is set on the page
  * - Caller holds page lock
  */
-static void nfs_invalidate_page(struct page *page, unsigned long offset)
+static void nfs_invalidate_page(struct page *page, unsigned int offset,
+				unsigned int length)
 {
-	dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %lu)\n", page, offset);
+	dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %u, %u)\n",
+		 page, offset, length);
 
-	if (offset != 0)
+	if (offset != 0 || length < PAGE_CACHE_SIZE)
 		return;
 	/* Cancel any unstarted writes on this page */
 	nfs_wb_page_cancel(page_file_mapping(page)->host, page);

diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 4e9a21d..105a3b0 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c

@@ -240,11 +240,16 @@
 	struct list_head list;
 };
 
+struct nfs4_dir_ctx {
+	struct dir_context ctx;
+	struct list_head names;
+};
+
 static int
 nfsd4_build_namelist(void *arg, const char *name, int namlen,
 		loff_t offset, u64 ino, unsigned int d_type)
 {
-	struct list_head *names = arg;
+	struct nfs4_dir_ctx *ctx = arg;
 	struct name_list *entry;
 
 	if (namlen != HEXDIR_LEN - 1)
@@ -254,7 +259,7 @@
 		return -ENOMEM;
 	memcpy(entry->name, name, HEXDIR_LEN - 1);
 	entry->name[HEXDIR_LEN - 1] = '\0';
-	list_add(&entry->list, names);
+	list_add(&entry->list, &ctx->names);
 	return 0;
 }
 
@@ -263,7 +268,10 @@
 {
 	const struct cred *original_cred;
 	struct dentry *dir = nn->rec_file->f_path.dentry;
-	LIST_HEAD(names);
+	struct nfs4_dir_ctx ctx = {
+		.ctx.actor = nfsd4_build_namelist,
+		.names = LIST_HEAD_INIT(ctx.names)
+	};
 	int status;
 
 	status = nfs4_save_creds(&original_cred);
@@ -276,11 +284,11 @@
 		return status;
 	}
 
-	status = vfs_readdir(nn->rec_file, nfsd4_build_namelist, &names);
+	status = iterate_dir(nn->rec_file, &ctx.ctx);
 	mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
-	while (!list_empty(&names)) {
+	while (!list_empty(&ctx.names)) {
 		struct name_list *entry;
-		entry = list_entry(names.next, struct name_list, list);
+		entry = list_entry(ctx.names.next, struct name_list, list);
 		if (!status) {
 			struct dentry *dentry;
 			dentry = lookup_one_len(entry->name, dir, HEXDIR_LEN-1);

diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 84ce601..a6bc8a7 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c

@@ -1912,6 +1912,7 @@
 };
 
 struct readdir_data {
+	struct dir_context ctx;
 	char		*dirent;
 	size_t		used;
 	int		full;
@@ -1943,13 +1944,15 @@
 static __be32 nfsd_buffered_readdir(struct file *file, filldir_t func,
 				    struct readdir_cd *cdp, loff_t *offsetp)
 {
-	struct readdir_data buf;
 	struct buffered_dirent *de;
 	int host_err;
 	int size;
 	loff_t offset;
+	struct readdir_data buf = {
+		.ctx.actor = nfsd_buffered_filldir,
+		.dirent = (void *)__get_free_page(GFP_KERNEL)
+	};
 
-	buf.dirent = (void *)__get_free_page(GFP_KERNEL);
 	if (!buf.dirent)
 		return nfserrno(-ENOMEM);
 
@@ -1963,7 +1966,7 @@
 		buf.used = 0;
 		buf.full = 0;
 
-		host_err = vfs_readdir(file, nfsd_buffered_filldir, &buf);
+		host_err = iterate_dir(file, &buf.ctx);
 		if (buf.full)
 			host_err = 0;
 

diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index f30b017..197a63e 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c

@@ -256,22 +256,18 @@
 	de->file_type = nilfs_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
 }
 
-static int nilfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int nilfs_readdir(struct file *file, struct dir_context *ctx)
 {
-	loff_t pos = filp->f_pos;
-	struct inode *inode = file_inode(filp);
+	loff_t pos = ctx->pos;
+	struct inode *inode = file_inode(file);
 	struct super_block *sb = inode->i_sb;
 	unsigned int offset = pos & ~PAGE_CACHE_MASK;
 	unsigned long n = pos >> PAGE_CACHE_SHIFT;
 	unsigned long npages = dir_pages(inode);
 /*	unsigned chunk_mask = ~(nilfs_chunk_size(inode)-1); */
-	unsigned char *types = NULL;
-	int ret;
 
 	if (pos > inode->i_size - NILFS_DIR_REC_LEN(1))
-		goto success;
-
-	types = nilfs_filetype_table;
+		return 0;
 
 	for ( ; n < npages; n++, offset = 0) {
 		char *kaddr, *limit;
@@ -281,9 +277,8 @@
 		if (IS_ERR(page)) {
 			nilfs_error(sb, __func__, "bad page in #%lu",
 				    inode->i_ino);
-			filp->f_pos += PAGE_CACHE_SIZE - offset;
-			ret = -EIO;
-			goto done;
+			ctx->pos += PAGE_CACHE_SIZE - offset;
+			return -EIO;
 		}
 		kaddr = page_address(page);
 		de = (struct nilfs_dir_entry *)(kaddr + offset);
@@ -293,35 +288,28 @@
 			if (de->rec_len == 0) {
 				nilfs_error(sb, __func__,
 					    "zero-length directory entry");
-				ret = -EIO;
 				nilfs_put_page(page);
-				goto done;
+				return -EIO;
 			}
 			if (de->inode) {
-				int over;
-				unsigned char d_type = DT_UNKNOWN;
+				unsigned char t;
 
-				if (types && de->file_type < NILFS_FT_MAX)
-					d_type = types[de->file_type];
+				if (de->file_type < NILFS_FT_MAX)
+					t = nilfs_filetype_table[de->file_type];
+				else
+					t = DT_UNKNOWN;
 
-				offset = (char *)de - kaddr;
-				over = filldir(dirent, de->name, de->name_len,
-						(n<<PAGE_CACHE_SHIFT) | offset,
-						le64_to_cpu(de->inode), d_type);
-				if (over) {
+				if (!dir_emit(ctx, de->name, de->name_len,
+						le64_to_cpu(de->inode), t)) {
 					nilfs_put_page(page);
-					goto success;
+					return 0;
 				}
 			}
-			filp->f_pos += nilfs_rec_len_from_disk(de->rec_len);
+			ctx->pos += nilfs_rec_len_from_disk(de->rec_len);
 		}
 		nilfs_put_page(page);
 	}
-
-success:
-	ret = 0;
-done:
-	return ret;
+	return 0;
 }
 
 /*
@@ -678,7 +666,7 @@
 const struct file_operations nilfs_dir_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
-	.readdir	= nilfs_readdir,
+	.iterate	= nilfs_readdir,
 	.unlocked_ioctl	= nilfs_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= nilfs_compat_ioctl,

diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index fa9c05f..d267ea6 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c

@@ -1372,7 +1372,7 @@
 		 * The page may have dirty, unmapped buffers.  Make them
 		 * freeable here, so the page does not leak.
 		 */
-		block_invalidatepage(page, 0);
+		block_invalidatepage(page, 0, PAGE_CACHE_SIZE);
 		unlock_page(page);
 		ntfs_debug("Write outside i_size - truncated?");
 		return 0;

diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c
index aa411c3..9e38daf 100644
--- a/fs/ntfs/dir.c
+++ b/fs/ntfs/dir.c

@@ -1004,13 +1004,11 @@
 /**
  * ntfs_filldir - ntfs specific filldir method
  * @vol:	current ntfs volume
- * @fpos:	position in the directory
  * @ndir:	ntfs inode of current directory
  * @ia_page:	page in which the index allocation buffer @ie is in resides
  * @ie:		current index entry
  * @name:	buffer to use for the converted name
- * @dirent:	vfs filldir callback context
- * @filldir:	vfs filldir callback
+ * @actor:	what to feed the entries to
  *
  * Convert the Unicode @name to the loaded NLS and pass it to the @filldir
  * callback.
@@ -1024,12 +1022,12 @@
  * retake the lock if we are returning a non-zero value as ntfs_readdir()
  * would need to drop the lock immediately anyway.
  */
-static inline int ntfs_filldir(ntfs_volume *vol, loff_t fpos,
+static inline int ntfs_filldir(ntfs_volume *vol,
 		ntfs_inode *ndir, struct page *ia_page, INDEX_ENTRY *ie,
-		u8 *name, void *dirent, filldir_t filldir)
+		u8 *name, struct dir_context *actor)
 {
 	unsigned long mref;
-	int name_len, rc;
+	int name_len;
 	unsigned dt_type;
 	FILE_NAME_TYPE_FLAGS name_type;
 
@@ -1068,13 +1066,14 @@
 	if (ia_page)
 		unlock_page(ia_page);
 	ntfs_debug("Calling filldir for %s with len %i, fpos 0x%llx, inode "
-			"0x%lx, DT_%s.", name, name_len, fpos, mref,
+			"0x%lx, DT_%s.", name, name_len, actor->pos, mref,
 			dt_type == DT_DIR ? "DIR" : "REG");
-	rc = filldir(dirent, name, name_len, fpos, mref, dt_type);
+	if (!dir_emit(actor, name, name_len, mref, dt_type))
+		return 1;
 	/* Relock the page but not if we are aborting ->readdir. */
-	if (!rc && ia_page)
+	if (ia_page)
 		lock_page(ia_page);
-	return rc;
+	return 0;
 }
 
 /*
@@ -1097,11 +1096,11 @@
  *	       removes them again after the write is complete after which it 
  *	       unlocks the page.
  */
-static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int ntfs_readdir(struct file *file, struct dir_context *actor)
 {
 	s64 ia_pos, ia_start, prev_ia_pos, bmp_pos;
-	loff_t fpos, i_size;
-	struct inode *bmp_vi, *vdir = file_inode(filp);
+	loff_t i_size;
+	struct inode *bmp_vi, *vdir = file_inode(file);
 	struct super_block *sb = vdir->i_sb;
 	ntfs_inode *ndir = NTFS_I(vdir);
 	ntfs_volume *vol = NTFS_SB(sb);
@@ -1116,33 +1115,16 @@
 	u8 *kaddr, *bmp, *index_end;
 	ntfs_attr_search_ctx *ctx;
 
-	fpos = filp->f_pos;
 	ntfs_debug("Entering for inode 0x%lx, fpos 0x%llx.",
-			vdir->i_ino, fpos);
+			vdir->i_ino, actor->pos);
 	rc = err = 0;
 	/* Are we at end of dir yet? */
 	i_size = i_size_read(vdir);
-	if (fpos >= i_size + vol->mft_record_size)
-		goto done;
+	if (actor->pos >= i_size + vol->mft_record_size)
+		return 0;
 	/* Emulate . and .. for all directories. */
-	if (!fpos) {
-		ntfs_debug("Calling filldir for . with len 1, fpos 0x0, "
-				"inode 0x%lx, DT_DIR.", vdir->i_ino);
-		rc = filldir(dirent, ".", 1, fpos, vdir->i_ino, DT_DIR);
-		if (rc)
-			goto done;
-		fpos++;
-	}
-	if (fpos == 1) {
-		ntfs_debug("Calling filldir for .. with len 2, fpos 0x1, "
-				"inode 0x%lx, DT_DIR.",
-				(unsigned long)parent_ino(filp->f_path.dentry));
-		rc = filldir(dirent, "..", 2, fpos,
-				parent_ino(filp->f_path.dentry), DT_DIR);
-		if (rc)
-			goto done;
-		fpos++;
-	}
+	if (!dir_emit_dots(file, actor))
+		return 0;
 	m = NULL;
 	ctx = NULL;
 	/*
@@ -1155,7 +1137,7 @@
 		goto err_out;
 	}
 	/* Are we jumping straight into the index allocation attribute? */
-	if (fpos >= vol->mft_record_size)
+	if (actor->pos >= vol->mft_record_size)
 		goto skip_index_root;
 	/* Get hold of the mft record for the directory. */
 	m = map_mft_record(ndir);
@@ -1170,7 +1152,7 @@
 		goto err_out;
 	}
 	/* Get the offset into the index root attribute. */
-	ir_pos = (s64)fpos;
+	ir_pos = (s64)actor->pos;
 	/* Find the index root attribute in the mft record. */
 	err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0, NULL,
 			0, ctx);
@@ -1226,10 +1208,9 @@
 		if (ir_pos > (u8*)ie - (u8*)ir)
 			continue;
 		/* Advance the position even if going to skip the entry. */
-		fpos = (u8*)ie - (u8*)ir;
+		actor->pos = (u8*)ie - (u8*)ir;
 		/* Submit the name to the filldir callback. */
-		rc = ntfs_filldir(vol, fpos, ndir, NULL, ie, name, dirent,
-				filldir);
+		rc = ntfs_filldir(vol, ndir, NULL, ie, name, actor);
 		if (rc) {
 			kfree(ir);
 			goto abort;
@@ -1242,12 +1223,12 @@
 	if (!NInoIndexAllocPresent(ndir))
 		goto EOD;
 	/* Advance fpos to the beginning of the index allocation. */
-	fpos = vol->mft_record_size;
+	actor->pos = vol->mft_record_size;
 skip_index_root:
 	kaddr = NULL;
 	prev_ia_pos = -1LL;
 	/* Get the offset into the index allocation attribute. */
-	ia_pos = (s64)fpos - vol->mft_record_size;
+	ia_pos = (s64)actor->pos - vol->mft_record_size;
 	ia_mapping = vdir->i_mapping;
 	ntfs_debug("Inode 0x%lx, getting index bitmap.", vdir->i_ino);
 	bmp_vi = ntfs_attr_iget(vdir, AT_BITMAP, I30, 4);
@@ -1409,7 +1390,7 @@
 		if (ia_pos - ia_start > (u8*)ie - (u8*)ia)
 			continue;
 		/* Advance the position even if going to skip the entry. */
-		fpos = (u8*)ie - (u8*)ia +
+		actor->pos = (u8*)ie - (u8*)ia +
 				(sle64_to_cpu(ia->index_block_vcn) <<
 				ndir->itype.index.vcn_size_bits) +
 				vol->mft_record_size;
@@ -1419,8 +1400,7 @@
 		 * before returning, unless a non-zero value is returned in
 		 * which case the page is left unlocked.
 		 */
-		rc = ntfs_filldir(vol, fpos, ndir, ia_page, ie, name, dirent,
-				filldir);
+		rc = ntfs_filldir(vol, ndir, ia_page, ie, name, actor);
 		if (rc) {
 			/* @ia_page is already unlocked in this case. */
 			ntfs_unmap_page(ia_page);
@@ -1439,18 +1419,9 @@
 	iput(bmp_vi);
 EOD:
 	/* We are finished, set fpos to EOD. */
-	fpos = i_size + vol->mft_record_size;
+	actor->pos = i_size + vol->mft_record_size;
 abort:
 	kfree(name);
-done:
-#ifdef DEBUG
-	if (!rc)
-		ntfs_debug("EOD, fpos 0x%llx, returning 0.", fpos);
-	else
-		ntfs_debug("filldir returned %i, fpos 0x%llx, returning 0.",
-				rc, fpos);
-#endif
-	filp->f_pos = fpos;
 	return 0;
 err_out:
 	if (bmp_page) {
@@ -1471,7 +1442,6 @@
 	if (!err)
 		err = -EIO;
 	ntfs_debug("Failed. Returning error code %i.", -err);
-	filp->f_pos = fpos;
 	return err;
 }
 
@@ -1571,7 +1541,7 @@
 const struct file_operations ntfs_dir_ops = {
 	.llseek		= generic_file_llseek,	/* Seek inside directory. */
 	.read		= generic_read_dir,	/* Return -EISDIR. */
-	.readdir	= ntfs_readdir,		/* Read directory contents. */
+	.iterate	= ntfs_readdir,		/* Read directory contents. */
 #ifdef NTFS_RW
 	.fsync		= ntfs_dir_fsync,	/* Sync a directory to disk. */
 	/*.aio_fsync	= ,*/			/* Sync all outstanding async

diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 20dfec7..79736a2 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c

@@ -603,11 +603,12 @@
  * from ext3.  PageChecked() bits have been removed as OCFS2 does not
  * do journalled data.
  */
-static void ocfs2_invalidatepage(struct page *page, unsigned long offset)
+static void ocfs2_invalidatepage(struct page *page, unsigned int offset,
+				 unsigned int length)
 {
 	journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal;
 
-	jbd2_journal_invalidatepage(journal, page, offset);
+	jbd2_journal_invalidatepage(journal, page, offset, length);
 }
 
 static int ocfs2_releasepage(struct page *page, gfp_t wait)

diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index f1e1aed8..eb760d8 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c

@@ -1761,11 +1761,10 @@
 
 static int ocfs2_dir_foreach_blk_id(struct inode *inode,
 				    u64 *f_version,
-				    loff_t *f_pos, void *priv,
-				    filldir_t filldir, int *filldir_err)
+				    struct dir_context *ctx)
 {
-	int ret, i, filldir_ret;
-	unsigned long offset = *f_pos;
+	int ret, i;
+	unsigned long offset = ctx->pos;
 	struct buffer_head *di_bh = NULL;
 	struct ocfs2_dinode *di;
 	struct ocfs2_inline_data *data;
@@ -1781,8 +1780,7 @@
 	di = (struct ocfs2_dinode *)di_bh->b_data;
 	data = &di->id2.i_data;
 
-	while (*f_pos < i_size_read(inode)) {
-revalidate:
+	while (ctx->pos < i_size_read(inode)) {
 		/* If the dir block has changed since the last call to
 		 * readdir(2), then we might be pointing to an invalid
 		 * dirent right now.  Scan from the start of the block
@@ -1802,50 +1800,31 @@
 					break;
 				i += le16_to_cpu(de->rec_len);
 			}
-			*f_pos = offset = i;
+			ctx->pos = offset = i;
 			*f_version = inode->i_version;
 		}
 
-		de = (struct ocfs2_dir_entry *) (data->id_data + *f_pos);
-		if (!ocfs2_check_dir_entry(inode, de, di_bh, *f_pos)) {
+		de = (struct ocfs2_dir_entry *) (data->id_data + ctx->pos);
+		if (!ocfs2_check_dir_entry(inode, de, di_bh, ctx->pos)) {
 			/* On error, skip the f_pos to the end. */
-			*f_pos = i_size_read(inode);
-			goto out;
+			ctx->pos = i_size_read(inode);
+			break;
 		}
 		offset += le16_to_cpu(de->rec_len);
 		if (le64_to_cpu(de->inode)) {
-			/* We might block in the next section
-			 * if the data destination is
-			 * currently swapped out.  So, use a
-			 * version stamp to detect whether or
-			 * not the directory has been modified
-			 * during the copy operation.
-			 */
-			u64 version = *f_version;
 			unsigned char d_type = DT_UNKNOWN;
 
 			if (de->file_type < OCFS2_FT_MAX)
 				d_type = ocfs2_filetype_table[de->file_type];
 
-			filldir_ret = filldir(priv, de->name,
-					      de->name_len,
-					      *f_pos,
-					      le64_to_cpu(de->inode),
-					      d_type);
-			if (filldir_ret) {
-				if (filldir_err)
-					*filldir_err = filldir_ret;
-				break;
-			}
-			if (version != *f_version)
-				goto revalidate;
+			if (!dir_emit(ctx, de->name, de->name_len,
+				      le64_to_cpu(de->inode), d_type))
+				goto out;
 		}
-		*f_pos += le16_to_cpu(de->rec_len);
+		ctx->pos += le16_to_cpu(de->rec_len);
 	}
-
 out:
 	brelse(di_bh);
-
 	return 0;
 }
 
@@ -1855,27 +1834,26 @@
  */
 static int ocfs2_dir_foreach_blk_el(struct inode *inode,
 				    u64 *f_version,
-				    loff_t *f_pos, void *priv,
-				    filldir_t filldir, int *filldir_err)
+				    struct dir_context *ctx,
+				    bool persist)
 {
-	int error = 0;
 	unsigned long offset, blk, last_ra_blk = 0;
-	int i, stored;
+	int i;
 	struct buffer_head * bh, * tmp;
 	struct ocfs2_dir_entry * de;
 	struct super_block * sb = inode->i_sb;
 	unsigned int ra_sectors = 16;
+	int stored = 0;
 
-	stored = 0;
 	bh = NULL;
 
-	offset = (*f_pos) & (sb->s_blocksize - 1);
+	offset = ctx->pos & (sb->s_blocksize - 1);
 
-	while (!error && !stored && *f_pos < i_size_read(inode)) {
-		blk = (*f_pos) >> sb->s_blocksize_bits;
+	while (ctx->pos < i_size_read(inode)) {
+		blk = ctx->pos >> sb->s_blocksize_bits;
 		if (ocfs2_read_dir_block(inode, blk, &bh, 0)) {
 			/* Skip the corrupt dirblock and keep trying */
-			*f_pos += sb->s_blocksize - offset;
+			ctx->pos += sb->s_blocksize - offset;
 			continue;
 		}
 
@@ -1897,7 +1875,6 @@
 			ra_sectors = 8;
 		}
 
-revalidate:
 		/* If the dir block has changed since the last call to
 		 * readdir(2), then we might be pointing to an invalid
 		 * dirent right now.  Scan from the start of the block
@@ -1917,93 +1894,64 @@
 				i += le16_to_cpu(de->rec_len);
 			}
 			offset = i;
-			*f_pos = ((*f_pos) & ~(sb->s_blocksize - 1))
+			ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1))
 				| offset;
 			*f_version = inode->i_version;
 		}
 
-		while (!error && *f_pos < i_size_read(inode)
+		while (ctx->pos < i_size_read(inode)
 		       && offset < sb->s_blocksize) {
 			de = (struct ocfs2_dir_entry *) (bh->b_data + offset);
 			if (!ocfs2_check_dir_entry(inode, de, bh, offset)) {
 				/* On error, skip the f_pos to the
 				   next block. */
-				*f_pos = ((*f_pos) | (sb->s_blocksize - 1)) + 1;
+				ctx->pos = (ctx->pos | (sb->s_blocksize - 1)) + 1;
 				brelse(bh);
-				goto out;
+				continue;
 			}
-			offset += le16_to_cpu(de->rec_len);
 			if (le64_to_cpu(de->inode)) {
-				/* We might block in the next section
-				 * if the data destination is
-				 * currently swapped out.  So, use a
-				 * version stamp to detect whether or
-				 * not the directory has been modified
-				 * during the copy operation.
-				 */
-				unsigned long version = *f_version;
 				unsigned char d_type = DT_UNKNOWN;
 
 				if (de->file_type < OCFS2_FT_MAX)
 					d_type = ocfs2_filetype_table[de->file_type];
-				error = filldir(priv, de->name,
+				if (!dir_emit(ctx, de->name,
 						de->name_len,
-						*f_pos,
 						le64_to_cpu(de->inode),
-						d_type);
-				if (error) {
-					if (filldir_err)
-						*filldir_err = error;
-					break;
+						d_type)) {
+					brelse(bh);
+					return 0;
 				}
-				if (version != *f_version)
-					goto revalidate;
-				stored ++;
+				stored++;
 			}
-			*f_pos += le16_to_cpu(de->rec_len);
+			offset += le16_to_cpu(de->rec_len);
+			ctx->pos += le16_to_cpu(de->rec_len);
 		}
 		offset = 0;
 		brelse(bh);
 		bh = NULL;
+		if (!persist && stored)
+			break;
 	}
-
-	stored = 0;
-out:
-	return stored;
+	return 0;
 }
 
 static int ocfs2_dir_foreach_blk(struct inode *inode, u64 *f_version,
-				 loff_t *f_pos, void *priv, filldir_t filldir,
-				 int *filldir_err)
+				 struct dir_context *ctx,
+				 bool persist)
 {
 	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
-		return ocfs2_dir_foreach_blk_id(inode, f_version, f_pos, priv,
-						filldir, filldir_err);
-
-	return ocfs2_dir_foreach_blk_el(inode, f_version, f_pos, priv, filldir,
-					filldir_err);
+		return ocfs2_dir_foreach_blk_id(inode, f_version, ctx);
+	return ocfs2_dir_foreach_blk_el(inode, f_version, ctx, persist);
 }
 
 /*
  * This is intended to be called from inside other kernel functions,
  * so we fake some arguments.
  */
-int ocfs2_dir_foreach(struct inode *inode, loff_t *f_pos, void *priv,
-		      filldir_t filldir)
+int ocfs2_dir_foreach(struct inode *inode, struct dir_context *ctx)
 {
-	int ret = 0, filldir_err = 0;
 	u64 version = inode->i_version;
-
-	while (*f_pos < i_size_read(inode)) {
-		ret = ocfs2_dir_foreach_blk(inode, &version, f_pos, priv,
-					    filldir, &filldir_err);
-		if (ret || filldir_err)
-			break;
-	}
-
-	if (ret > 0)
-		ret = -EIO;
-
+	ocfs2_dir_foreach_blk(inode, &version, ctx, true);
 	return 0;
 }
 
@@ -2011,15 +1959,15 @@
  * ocfs2_readdir()
  *
  */
-int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)
+int ocfs2_readdir(struct file *file, struct dir_context *ctx)
 {
 	int error = 0;
-	struct inode *inode = file_inode(filp);
+	struct inode *inode = file_inode(file);
 	int lock_level = 0;
 
 	trace_ocfs2_readdir((unsigned long long)OCFS2_I(inode)->ip_blkno);
 
-	error = ocfs2_inode_lock_atime(inode, filp->f_path.mnt, &lock_level);
+	error = ocfs2_inode_lock_atime(inode, file->f_path.mnt, &lock_level);
 	if (lock_level && error >= 0) {
 		/* We release EX lock which used to update atime
 		 * and get PR lock again to reduce contention
@@ -2035,8 +1983,7 @@
 		goto bail_nolock;
 	}
 
-	error = ocfs2_dir_foreach_blk(inode, &filp->f_version, &filp->f_pos,
-				      dirent, filldir, NULL);
+	error = ocfs2_dir_foreach_blk(inode, &file->f_version, ctx, false);
 
 	ocfs2_inode_unlock(inode, lock_level);
 	if (error)
@@ -2120,6 +2067,7 @@
 }
 
 struct ocfs2_empty_dir_priv {
+	struct dir_context ctx;
 	unsigned seen_dot;
 	unsigned seen_dot_dot;
 	unsigned seen_other;
@@ -2204,8 +2152,8 @@
 int ocfs2_empty_dir(struct inode *inode)
 {
 	int ret;
-	loff_t start = 0;
-	struct ocfs2_empty_dir_priv priv;
+	/* Members not named here are zero-initialized; no memset() needed. */
+	struct ocfs2_empty_dir_priv priv = {
+		.ctx.actor = ocfs2_empty_dir_filldir
+	};
 
-	memset(&priv, 0, sizeof(priv));
-
@@ -2219,7 +2167,7 @@
 		 */
 	}
 
-	ret = ocfs2_dir_foreach(inode, &start, &priv, ocfs2_empty_dir_filldir);
+	ret = ocfs2_dir_foreach(inode, &priv.ctx);
 	if (ret)
 		mlog_errno(ret);
 

diff --git a/fs/ocfs2/dir.h b/fs/ocfs2/dir.h
index e683f3d..f0344b7 100644
--- a/fs/ocfs2/dir.h
+++ b/fs/ocfs2/dir.h

@@ -92,9 +92,8 @@
 			     struct ocfs2_dir_lookup_result *res);
 int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name,
 			       int namelen, u64 *blkno);
-int ocfs2_readdir(struct file *filp, void *dirent, filldir_t filldir);
-int ocfs2_dir_foreach(struct inode *inode, loff_t *f_pos, void *priv,
-		      filldir_t filldir);
+int ocfs2_readdir(struct file *file, struct dir_context *ctx);
+int ocfs2_dir_foreach(struct inode *inode, struct dir_context *ctx);
 int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
 				 struct inode *dir,
 				 struct buffer_head *parent_fe_bh,

diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index ff54014..8a38714 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c

@@ -2712,7 +2712,7 @@
 const struct file_operations ocfs2_dops = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
-	.readdir	= ocfs2_readdir,
+	.iterate	= ocfs2_readdir,
 	.fsync		= ocfs2_sync_file,
 	.release	= ocfs2_dir_release,
 	.open		= ocfs2_dir_open,
@@ -2759,7 +2759,7 @@
 const struct file_operations ocfs2_dops_no_plocks = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
-	.readdir	= ocfs2_readdir,
+	.iterate	= ocfs2_readdir,
 	.fsync		= ocfs2_sync_file,
 	.release	= ocfs2_dir_release,
 	.open		= ocfs2_dir_open,

diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 8eccfab..242170d 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c

@@ -1941,6 +1941,7 @@
 }
 
 struct ocfs2_orphan_filldir_priv {
+	struct dir_context	ctx;
 	struct inode		*head;
 	struct ocfs2_super	*osb;
 };
@@ -1977,11 +1978,11 @@
 {
 	int status;
 	struct inode *orphan_dir_inode = NULL;
-	struct ocfs2_orphan_filldir_priv priv;
-	loff_t pos = 0;
-
-	priv.osb = osb;
-	priv.head = *head;
+	struct ocfs2_orphan_filldir_priv priv = {
+		.ctx.actor = ocfs2_orphan_filldir,
+		.osb = osb,
+		.head = *head
+	};
 
 	orphan_dir_inode = ocfs2_get_system_file_inode(osb,
 						       ORPHAN_DIR_SYSTEM_INODE,
@@ -1999,8 +2000,7 @@
 		goto out;
 	}
 
-	status = ocfs2_dir_foreach(orphan_dir_inode, &pos, &priv,
-				   ocfs2_orphan_filldir);
+	status = ocfs2_dir_foreach(orphan_dir_inode, &priv.ctx);
 	if (status) {
 		mlog_errno(status);
 		goto out_cluster;

diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c
index acbaebc..1b8e9e8 100644
--- a/fs/omfs/dir.c
+++ b/fs/omfs/dir.c

@@ -327,26 +327,23 @@
 	return is_bad;
 }
 
-static int omfs_fill_chain(struct file *filp, void *dirent, filldir_t filldir,
+static bool omfs_fill_chain(struct inode *dir, struct dir_context *ctx,
 		u64 fsblock, int hindex)
 {
-	struct inode *dir = file_inode(filp);
-	struct buffer_head *bh;
-	struct omfs_inode *oi;
-	u64 self;
-	int res = 0;
-	unsigned char d_type;
-
 	/* follow chain in this bucket */
 	while (fsblock != ~0) {
-		bh = omfs_bread(dir->i_sb, fsblock);
+		struct buffer_head *bh = omfs_bread(dir->i_sb, fsblock);
+		struct omfs_inode *oi;
+		u64 self;
+		unsigned char d_type;
+
 		if (!bh)
-			goto out;
+			return true;
 
 		oi = (struct omfs_inode *) bh->b_data;
 		if (omfs_is_bad(OMFS_SB(dir->i_sb), &oi->i_head, fsblock)) {
 			brelse(bh);
-			goto out;
+			return true;
 		}
 
 		self = fsblock;
@@ -361,15 +358,16 @@
 
 		d_type = (oi->i_type == OMFS_DIR) ? DT_DIR : DT_REG;
 
-		res = filldir(dirent, oi->i_name, strnlen(oi->i_name,
-			OMFS_NAMELEN), filp->f_pos, self, d_type);
+		if (!dir_emit(ctx, oi->i_name,
+			      strnlen(oi->i_name, OMFS_NAMELEN),
+			      self, d_type)) {
+			brelse(bh);
+			return false;
+		}
 		brelse(bh);
-		if (res < 0)
-			break;
-		filp->f_pos++;
+		ctx->pos++;
 	}
-out:
-	return res;
+	return true;
 }
 
 static int omfs_rename(struct inode *old_dir, struct dentry *old_dentry,
@@ -403,60 +401,44 @@
 	return err;
 }
 
-static int omfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int omfs_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct inode *dir = file_inode(filp);
+	struct inode *dir = file_inode(file);
 	struct buffer_head *bh;
-	loff_t offset, res;
+	__be64 *p;
 	unsigned int hchain, hindex;
 	int nbuckets;
-	u64 fsblock;
-	int ret = -EINVAL;
 
-	if (filp->f_pos >> 32)
-		goto success;
+	if (ctx->pos >> 32)
+		return -EINVAL;
 
-	switch ((unsigned long) filp->f_pos) {
-	case 0:
-		if (filldir(dirent, ".", 1, 0, dir->i_ino, DT_DIR) < 0)
-			goto success;
-		filp->f_pos++;
-		/* fall through */
-	case 1:
-		if (filldir(dirent, "..", 2, 1,
-		    parent_ino(filp->f_dentry), DT_DIR) < 0)
-			goto success;
-		filp->f_pos = 1 << 20;
-		/* fall through */
+	if (ctx->pos < 1 << 20) {
+		if (!dir_emit_dots(file, ctx))
+			return 0;
+		ctx->pos = 1 << 20;
 	}
 
 	nbuckets = (dir->i_size - OMFS_DIR_START) / 8;
 
 	/* high 12 bits store bucket + 1 and low 20 bits store hash index */
-	hchain = (filp->f_pos >> 20) - 1;
-	hindex = filp->f_pos & 0xfffff;
+	hchain = (ctx->pos >> 20) - 1;
+	hindex = ctx->pos & 0xfffff;
 
 	bh = omfs_bread(dir->i_sb, dir->i_ino);
 	if (!bh)
-		goto out;
+		return -EINVAL;
 
-	offset = OMFS_DIR_START + hchain * 8;
+	p = (__be64 *)(bh->b_data + OMFS_DIR_START) + hchain;
 
-	for (; hchain < nbuckets; hchain++, offset += 8) {
-		fsblock = be64_to_cpu(*((__be64 *) &bh->b_data[offset]));
-
-		res = omfs_fill_chain(filp, dirent, filldir, fsblock, hindex);
-		hindex = 0;
-		if (res < 0)
+	for (; hchain < nbuckets; hchain++) {
+		__u64 fsblock = be64_to_cpu(*p++);
+		if (!omfs_fill_chain(dir, ctx, fsblock, hindex))
 			break;
-
-		filp->f_pos = (hchain+2) << 20;
+		hindex = 0;
+		ctx->pos = (hchain+2) << 20;
 	}
 	brelse(bh);
-success:
-	ret = 0;
-out:
-	return ret;
+	return 0;
 }
 
 const struct inode_operations omfs_dir_inops = {
@@ -470,6 +452,6 @@
 
 const struct file_operations omfs_dir_operations = {
 	.read = generic_read_dir,
-	.readdir = omfs_readdir,
+	.iterate = omfs_readdir,
 	.llseek = generic_file_llseek,
 };

diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index 75885ff..8c0ceb8 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c

@@ -162,11 +162,11 @@
 	.release	= seq_release,
 };
 
-static int openpromfs_readdir(struct file *, void *, filldir_t);
+static int openpromfs_readdir(struct file *, struct dir_context *);
 
 static const struct file_operations openprom_operations = {
 	.read		= generic_read_dir,
-	.readdir	= openpromfs_readdir,
+	.iterate	= openpromfs_readdir,
 	.llseek		= generic_file_llseek,
 };
 
@@ -260,71 +260,64 @@
 	return NULL;
 }
 
-static int openpromfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
+static int openpromfs_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct inode *inode = file_inode(filp);
+	struct inode *inode = file_inode(file);
 	struct op_inode_info *oi = OP_I(inode);
 	struct device_node *dp = oi->u.node;
 	struct device_node *child;
 	struct property *prop;
-	unsigned int ino;
 	int i;
 
 	mutex_lock(&op_mutex);
 	
-	ino = inode->i_ino;
-	i = filp->f_pos;
-	switch (i) {
-	case 0:
-		if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
+	if (ctx->pos == 0) {
+		if (!dir_emit(ctx, ".", 1, inode->i_ino, DT_DIR))
 			goto out;
-		i++;
-		filp->f_pos++;
-		/* fall thru */
-	case 1:
-		if (filldir(dirent, "..", 2, i,
+		ctx->pos = 1;
+	}
+	if (ctx->pos == 1) {
+		if (!dir_emit(ctx, "..", 2,
 			    (dp->parent == NULL ?
 			     OPENPROM_ROOT_INO :
-			     dp->parent->unique_id), DT_DIR) < 0) 
+			     dp->parent->unique_id), DT_DIR))
 			goto out;
-		i++;
-		filp->f_pos++;
-		/* fall thru */
-	default:
-		i -= 2;
-
-		/* First, the children nodes as directories.  */
-		child = dp->child;
-		while (i && child) {
-			child = child->sibling;
-			i--;
-		}
-		while (child) {
-			if (filldir(dirent,
-				    child->path_component_name,
-				    strlen(child->path_component_name),
-				    filp->f_pos, child->unique_id, DT_DIR) < 0)
-				goto out;
-
-			filp->f_pos++;
-			child = child->sibling;
-		}
-
-		/* Next, the properties as files.  */
-		prop = dp->properties;
-		while (i && prop) {
-			prop = prop->next;
-			i--;
-		}
-		while (prop) {
-			if (filldir(dirent, prop->name, strlen(prop->name),
-				    filp->f_pos, prop->unique_id, DT_REG) < 0)
-				goto out;
-
-			filp->f_pos++;
-			prop = prop->next;
-		}
+		ctx->pos = 2;
 	}
+	i = ctx->pos - 2;
+
+	/* First, the children nodes as directories.  */
+	child = dp->child;
+	while (i && child) {
+		child = child->sibling;
+		i--;
+	}
+	while (child) {
+		if (!dir_emit(ctx,
+			    child->path_component_name,
+			    strlen(child->path_component_name),
+			    child->unique_id, DT_DIR))
+			goto out;
+
+		ctx->pos++;
+		child = child->sibling;
+	}
+
+	/* Next, the properties as files.  */
+	prop = dp->properties;
+	while (i && prop) {
+		prop = prop->next;
+		i--;
+	}
+	while (prop) {
+		if (!dir_emit(ctx, prop->name, strlen(prop->name),
+			    prop->unique_id, DT_REG))
+			goto out;
+
+		ctx->pos++;
+		prop = prop->next;
+	}
+
 out:
 	mutex_unlock(&op_mutex);
 	return 0;

diff --git a/fs/proc/base.c b/fs/proc/base.c
index c3834da..0016350 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c

@@ -1681,11 +1681,11 @@
  * reported by readdir in sync with the inode numbers reported
  * by stat.
  */
-int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
+bool proc_fill_cache(struct file *file, struct dir_context *ctx,
 	const char *name, int len,
 	instantiate_t instantiate, struct task_struct *task, const void *ptr)
 {
-	struct dentry *child, *dir = filp->f_path.dentry;
+	struct dentry *child, *dir = file->f_path.dentry;
 	struct inode *inode;
 	struct qstr qname;
 	ino_t ino = 0;
@@ -1720,7 +1720,7 @@
 		ino = find_inode_number(dir, &qname);
 	if (!ino)
 		ino = 1;
-	return filldir(dirent, name, len, filp->f_pos, ino, type);
+	return dir_emit(ctx, name, len, ino, type);
 }
 
 #ifdef CONFIG_CHECKPOINT_RESTORE
@@ -1931,14 +1931,15 @@
 };
 
 static int
-proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir)
+proc_map_files_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct dentry *dentry = filp->f_path.dentry;
-	struct inode *inode = dentry->d_inode;
 	struct vm_area_struct *vma;
 	struct task_struct *task;
 	struct mm_struct *mm;
-	ino_t ino;
+	unsigned long nr_files, pos, i;
+	struct flex_array *fa = NULL;
+	struct map_files_info info;
+	struct map_files_info *p;
 	int ret;
 
 	ret = -EPERM;
@@ -1946,7 +1947,7 @@
 		goto out;
 
 	ret = -ENOENT;
-	task = get_proc_task(inode);
+	task = get_proc_task(file_inode(file));
 	if (!task)
 		goto out;
 
@@ -1955,91 +1956,73 @@
 		goto out_put_task;
 
 	ret = 0;
-	switch (filp->f_pos) {
-	case 0:
-		ino = inode->i_ino;
-		if (filldir(dirent, ".", 1, 0, ino, DT_DIR) < 0)
-			goto out_put_task;
-		filp->f_pos++;
-	case 1:
-		ino = parent_ino(dentry);
-		if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
-			goto out_put_task;
-		filp->f_pos++;
-	default:
-	{
-		unsigned long nr_files, pos, i;
-		struct flex_array *fa = NULL;
-		struct map_files_info info;
-		struct map_files_info *p;
+	if (!dir_emit_dots(file, ctx))
+		goto out_put_task;
 
-		mm = get_task_mm(task);
-		if (!mm)
-			goto out_put_task;
-		down_read(&mm->mmap_sem);
+	mm = get_task_mm(task);
+	if (!mm)
+		goto out_put_task;
+	down_read(&mm->mmap_sem);
 
-		nr_files = 0;
+	nr_files = 0;
 
-		/*
-		 * We need two passes here:
-		 *
-		 *  1) Collect vmas of mapped files with mmap_sem taken
-		 *  2) Release mmap_sem and instantiate entries
-		 *
-		 * otherwise we get lockdep complained, since filldir()
-		 * routine might require mmap_sem taken in might_fault().
-		 */
+	/*
+	 * We need two passes here:
+	 *
+	 *  1) Collect vmas of mapped files with mmap_sem taken
+	 *  2) Release mmap_sem and instantiate entries
+	 *
+	 * otherwise lockdep complains, since the filldir() routine
+	 * might need to take mmap_sem in might_fault().
+	 */
 
-		for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) {
-			if (vma->vm_file && ++pos > filp->f_pos)
-				nr_files++;
-		}
-
-		if (nr_files) {
-			fa = flex_array_alloc(sizeof(info), nr_files,
-						GFP_KERNEL);
-			if (!fa || flex_array_prealloc(fa, 0, nr_files,
-							GFP_KERNEL)) {
-				ret = -ENOMEM;
-				if (fa)
-					flex_array_free(fa);
-				up_read(&mm->mmap_sem);
-				mmput(mm);
-				goto out_put_task;
-			}
-			for (i = 0, vma = mm->mmap, pos = 2; vma;
-					vma = vma->vm_next) {
-				if (!vma->vm_file)
-					continue;
-				if (++pos <= filp->f_pos)
-					continue;
-
-				info.mode = vma->vm_file->f_mode;
-				info.len = snprintf(info.name,
-						sizeof(info.name), "%lx-%lx",
-						vma->vm_start, vma->vm_end);
-				if (flex_array_put(fa, i++, &info, GFP_KERNEL))
-					BUG();
-			}
-		}
-		up_read(&mm->mmap_sem);
-
-		for (i = 0; i < nr_files; i++) {
-			p = flex_array_get(fa, i);
-			ret = proc_fill_cache(filp, dirent, filldir,
-					      p->name, p->len,
-					      proc_map_files_instantiate,
-					      task,
-					      (void *)(unsigned long)p->mode);
-			if (ret)
-				break;
-			filp->f_pos++;
-		}
-		if (fa)
-			flex_array_free(fa);
-		mmput(mm);
+	for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) {
+		if (vma->vm_file && ++pos > ctx->pos)
+			nr_files++;
 	}
+
+	if (nr_files) {
+		fa = flex_array_alloc(sizeof(info), nr_files,
+					GFP_KERNEL);
+		if (!fa || flex_array_prealloc(fa, 0, nr_files,
+						GFP_KERNEL)) {
+			ret = -ENOMEM;
+			if (fa)
+				flex_array_free(fa);
+			up_read(&mm->mmap_sem);
+			mmput(mm);
+			goto out_put_task;
+		}
+		for (i = 0, vma = mm->mmap, pos = 2; vma;
+				vma = vma->vm_next) {
+			if (!vma->vm_file)
+				continue;
+			if (++pos <= ctx->pos)
+				continue;
+
+			info.mode = vma->vm_file->f_mode;
+			info.len = snprintf(info.name,
+					sizeof(info.name), "%lx-%lx",
+					vma->vm_start, vma->vm_end);
+			if (flex_array_put(fa, i++, &info, GFP_KERNEL))
+				BUG();
+		}
 	}
+	up_read(&mm->mmap_sem);
+
+	for (i = 0; i < nr_files; i++) {
+		p = flex_array_get(fa, i);
+		if (!proc_fill_cache(file, ctx,
+				      p->name, p->len,
+				      proc_map_files_instantiate,
+				      task,
+				      (void *)(unsigned long)p->mode))
+			break;
+		ctx->pos++;
+	}
+	if (fa)
+		flex_array_free(fa);
+	mmput(mm);
 
 out_put_task:
 	put_task_struct(task);
@@ -2049,7 +2032,7 @@
 
 static const struct file_operations proc_map_files_operations = {
 	.read		= generic_read_dir,
-	.readdir	= proc_map_files_readdir,
+	.iterate	= proc_map_files_readdir,
 	.llseek		= default_llseek,
 };
 
@@ -2217,67 +2200,30 @@
 	return error;
 }
 
-static int proc_pident_fill_cache(struct file *filp, void *dirent,
-	filldir_t filldir, struct task_struct *task, const struct pid_entry *p)
-{
-	return proc_fill_cache(filp, dirent, filldir, p->name, p->len,
-				proc_pident_instantiate, task, p);
-}
-
-static int proc_pident_readdir(struct file *filp,
-		void *dirent, filldir_t filldir,
+static int proc_pident_readdir(struct file *file, struct dir_context *ctx,
 		const struct pid_entry *ents, unsigned int nents)
 {
-	int i;
-	struct dentry *dentry = filp->f_path.dentry;
-	struct inode *inode = dentry->d_inode;
-	struct task_struct *task = get_proc_task(inode);
-	const struct pid_entry *p, *last;
-	ino_t ino;
-	int ret;
+	struct task_struct *task = get_proc_task(file_inode(file));
+	const struct pid_entry *p;
 
-	ret = -ENOENT;
 	if (!task)
-		goto out_no_task;
+		return -ENOENT;
 
-	ret = 0;
-	i = filp->f_pos;
-	switch (i) {
-	case 0:
-		ino = inode->i_ino;
-		if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
-			goto out;
-		i++;
-		filp->f_pos++;
-		/* fall through */
-	case 1:
-		ino = parent_ino(dentry);
-		if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0)
-			goto out;
-		i++;
-		filp->f_pos++;
-		/* fall through */
-	default:
-		i -= 2;
-		if (i >= nents) {
-			ret = 1;
-			goto out;
-		}
-		p = ents + i;
-		last = &ents[nents - 1];
-		while (p <= last) {
-			if (proc_pident_fill_cache(filp, dirent, filldir, task, p) < 0)
-				goto out;
-			filp->f_pos++;
-			p++;
-		}
+	if (!dir_emit_dots(file, ctx))
+		goto out;
+
+	if (ctx->pos >= nents + 2)
+		goto out;
+
+	for (p = ents + (ctx->pos - 2); p <= ents + nents - 1; p++) {
+		if (!proc_fill_cache(file, ctx, p->name, p->len,
+				proc_pident_instantiate, task, p))
+			break;
+		ctx->pos++;
 	}
-
-	ret = 1;
 out:
 	put_task_struct(task);
-out_no_task:
-	return ret;
+	return 0;
 }
 
 #ifdef CONFIG_SECURITY
@@ -2362,16 +2308,15 @@
 	REG("sockcreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations),
 };
 
-static int proc_attr_dir_readdir(struct file * filp,
-			     void * dirent, filldir_t filldir)
+static int proc_attr_dir_readdir(struct file *file, struct dir_context *ctx)
 {
-	return proc_pident_readdir(filp,dirent,filldir,
-				   attr_dir_stuff,ARRAY_SIZE(attr_dir_stuff));
+	return proc_pident_readdir(file, ctx, 
+				   attr_dir_stuff, ARRAY_SIZE(attr_dir_stuff));
 }
 
 static const struct file_operations proc_attr_dir_operations = {
 	.read		= generic_read_dir,
-	.readdir	= proc_attr_dir_readdir,
+	.iterate	= proc_attr_dir_readdir,
 	.llseek		= default_llseek,
 };
 
@@ -2725,16 +2670,15 @@
 #endif
 };
 
-static int proc_tgid_base_readdir(struct file * filp,
-			     void * dirent, filldir_t filldir)
+static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx)
 {
-	return proc_pident_readdir(filp,dirent,filldir,
-				   tgid_base_stuff,ARRAY_SIZE(tgid_base_stuff));
+	return proc_pident_readdir(file, ctx,
+				   tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff));
 }
 
 static const struct file_operations proc_tgid_base_operations = {
 	.read		= generic_read_dir,
-	.readdir	= proc_tgid_base_readdir,
+	.iterate	= proc_tgid_base_readdir,
 	.llseek		= default_llseek,
 };
 
@@ -2936,58 +2880,42 @@
 
 #define TGID_OFFSET (FIRST_PROCESS_ENTRY + 1)
 
-static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
-	struct tgid_iter iter)
-{
-	char name[PROC_NUMBUF];
-	int len = snprintf(name, sizeof(name), "%d", iter.tgid);
-	return proc_fill_cache(filp, dirent, filldir, name, len,
-				proc_pid_instantiate, iter.task, NULL);
-}
-
-static int fake_filldir(void *buf, const char *name, int namelen,
-			loff_t offset, u64 ino, unsigned d_type)
-{
-	return 0;
-}
-
 /* for the /proc/ directory itself, after non-process stuff has been done */
-int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
+int proc_pid_readdir(struct file *file, struct dir_context *ctx)
 {
 	struct tgid_iter iter;
 	struct pid_namespace *ns;
-	filldir_t __filldir;
-	loff_t pos = filp->f_pos;
+	loff_t pos = ctx->pos;
 
 	if (pos >= PID_MAX_LIMIT + TGID_OFFSET)
-		goto out;
+		return 0;
 
 	if (pos == TGID_OFFSET - 1) {
-		if (proc_fill_cache(filp, dirent, filldir, "self", 4,
-					NULL, NULL, NULL) < 0)
-			goto out;
+		if (!proc_fill_cache(file, ctx, "self", 4, NULL, NULL, NULL))
+			return 0;
 		iter.tgid = 0;
 	} else {
 		iter.tgid = pos - TGID_OFFSET;
 	}
 	iter.task = NULL;
-	ns = filp->f_dentry->d_sb->s_fs_info;
+	ns = file->f_dentry->d_sb->s_fs_info;
 	for (iter = next_tgid(ns, iter);
 	     iter.task;
 	     iter.tgid += 1, iter = next_tgid(ns, iter)) {
-		if (has_pid_permissions(ns, iter.task, 2))
-			__filldir = filldir;
-		else
-			__filldir = fake_filldir;
+		char name[PROC_NUMBUF];
+		int len;
+		if (!has_pid_permissions(ns, iter.task, 2))
+			continue;
 
-		filp->f_pos = iter.tgid + TGID_OFFSET;
-		if (proc_pid_fill_cache(filp, dirent, __filldir, iter) < 0) {
+		len = snprintf(name, sizeof(name), "%d", iter.tgid);
+		ctx->pos = iter.tgid + TGID_OFFSET;
+		if (!proc_fill_cache(file, ctx, name, len,
+				     proc_pid_instantiate, iter.task, NULL)) {
 			put_task_struct(iter.task);
-			goto out;
+			return 0;
 		}
 	}
-	filp->f_pos = PID_MAX_LIMIT + TGID_OFFSET;
-out:
+	ctx->pos = PID_MAX_LIMIT + TGID_OFFSET;
 	return 0;
 }
 
@@ -3075,11 +3003,10 @@
 #endif
 };
 
-static int proc_tid_base_readdir(struct file * filp,
-			     void * dirent, filldir_t filldir)
+static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx)
 {
-	return proc_pident_readdir(filp,dirent,filldir,
-				   tid_base_stuff,ARRAY_SIZE(tid_base_stuff));
+	return proc_pident_readdir(file, ctx,
+				   tid_base_stuff, ARRAY_SIZE(tid_base_stuff));
 }
 
 static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
@@ -3090,7 +3017,7 @@
 
 static const struct file_operations proc_tid_base_operations = {
 	.read		= generic_read_dir,
-	.readdir	= proc_tid_base_readdir,
+	.iterate	= proc_tid_base_readdir,
 	.llseek		= default_llseek,
 };
 
@@ -3231,30 +3158,16 @@
 	return pos;
 }
 
-static int proc_task_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
-	struct task_struct *task, int tid)
-{
-	char name[PROC_NUMBUF];
-	int len = snprintf(name, sizeof(name), "%d", tid);
-	return proc_fill_cache(filp, dirent, filldir, name, len,
-				proc_task_instantiate, task, NULL);
-}
-
 /* for the /proc/TGID/task/ directories */
-static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldir)
+static int proc_task_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct dentry *dentry = filp->f_path.dentry;
-	struct inode *inode = dentry->d_inode;
 	struct task_struct *leader = NULL;
-	struct task_struct *task;
-	int retval = -ENOENT;
-	ino_t ino;
-	int tid;
+	struct task_struct *task = get_proc_task(file_inode(file));
 	struct pid_namespace *ns;
+	int tid;
 
-	task = get_proc_task(inode);
 	if (!task)
-		goto out_no_task;
+		return -ENOENT;
 	rcu_read_lock();
 	if (pid_alive(task)) {
 		leader = task->group_leader;
@@ -3263,46 +3176,36 @@
 	rcu_read_unlock();
 	put_task_struct(task);
 	if (!leader)
-		goto out_no_task;
-	retval = 0;
+		return -ENOENT;
 
-	switch ((unsigned long)filp->f_pos) {
-	case 0:
-		ino = inode->i_ino;
-		if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) < 0)
-			goto out;
-		filp->f_pos++;
-		/* fall through */
-	case 1:
-		ino = parent_ino(dentry);
-		if (filldir(dirent, "..", 2, filp->f_pos, ino, DT_DIR) < 0)
-			goto out;
-		filp->f_pos++;
-		/* fall through */
-	}
+	if (!dir_emit_dots(file, ctx))
+		goto out;
 
 	/* f_version caches the tgid value that the last readdir call couldn't
 	 * return. lseek aka telldir automagically resets f_version to 0.
 	 */
-	ns = filp->f_dentry->d_sb->s_fs_info;
-	tid = (int)filp->f_version;
-	filp->f_version = 0;
-	for (task = first_tid(leader, tid, filp->f_pos - 2, ns);
+	ns = file->f_dentry->d_sb->s_fs_info;
+	tid = (int)file->f_version;
+	file->f_version = 0;
+	for (task = first_tid(leader, tid, ctx->pos - 2, ns);
 	     task;
-	     task = next_tid(task), filp->f_pos++) {
+	     task = next_tid(task), ctx->pos++) {
+		char name[PROC_NUMBUF];
+		int len;
 		tid = task_pid_nr_ns(task, ns);
-		if (proc_task_fill_cache(filp, dirent, filldir, task, tid) < 0) {
+		len = snprintf(name, sizeof(name), "%d", tid);
+		if (!proc_fill_cache(file, ctx, name, len,
+				proc_task_instantiate, task, NULL)) {
 			/* returning this tgid failed, save it as the first
 			 * pid for the next readir call */
-			filp->f_version = (u64)tid;
+			file->f_version = (u64)tid;
 			put_task_struct(task);
 			break;
 		}
 	}
 out:
 	put_task_struct(leader);
-out_no_task:
-	return retval;
+	return 0;
 }
 
 static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
@@ -3328,6 +3231,6 @@
 
 static const struct file_operations proc_task_operations = {
 	.read		= generic_read_dir,
-	.readdir	= proc_task_readdir,
+	.iterate	= proc_task_readdir,
 	.llseek		= default_llseek,
 };

diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index d7a4a28..1441f14 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c

@@ -219,74 +219,61 @@
 	return result;
 }
 
-static int proc_readfd_common(struct file * filp, void * dirent,
-			      filldir_t filldir, instantiate_t instantiate)
+/*
+ * Emit "." and "..", then one numeric entry for every fd that
+ * fcheck_files() finds in the task's table, starting at fd ctx->pos - 2.
+ * Returns -ENOENT when get_proc_task() finds no task, 0 otherwise.
+ */
+static int proc_readfd_common(struct file *file, struct dir_context *ctx,
+			      instantiate_t instantiate)
 {
-	struct dentry *dentry = filp->f_path.dentry;
-	struct inode *inode = dentry->d_inode;
-	struct task_struct *p = get_proc_task(inode);
+	struct task_struct *p = get_proc_task(file_inode(file));
 	struct files_struct *files;
-	unsigned int fd, ino;
-	int retval;
+	unsigned int fd;
 
-	retval = -ENOENT;
 	if (!p)
-		goto out_no_task;
-	retval = 0;
+		return -ENOENT;
 
-	fd = filp->f_pos;
-	switch (fd) {
-		case 0:
-			if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0)
-				goto out;
-			filp->f_pos++;
-		case 1:
-			ino = parent_ino(dentry);
-			if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
-				goto out;
-			filp->f_pos++;
-		default:
-			files = get_files_struct(p);
-			if (!files)
-				goto out;
-			rcu_read_lock();
-			for (fd = filp->f_pos - 2;
-			     fd < files_fdtable(files)->max_fds;
-			     fd++, filp->f_pos++) {
-				char name[PROC_NUMBUF];
-				int len;
-				int rv;
+	if (!dir_emit_dots(file, ctx))
+		goto out;
+	files = get_files_struct(p);
+	if (!files)
+		goto out;
+
-				if (!fcheck_files(files, fd))
-					continue;
-				rcu_read_unlock();
+	rcu_read_lock();
+	for (fd = ctx->pos - 2;
+	     fd < files_fdtable(files)->max_fds;
+	     fd++, ctx->pos++) {
+		char name[PROC_NUMBUF];
+		int len;
 
-				len = snprintf(name, sizeof(name), "%d", fd);
-				rv = proc_fill_cache(filp, dirent, filldir,
-						     name, len, instantiate, p,
-						     (void *)(unsigned long)fd);
-				if (rv < 0)
-					goto out_fd_loop;
-				rcu_read_lock();
-			}
-			rcu_read_unlock();
-out_fd_loop:
-			put_files_struct(files);
+		if (!fcheck_files(files, fd))
+			continue;
+		rcu_read_unlock();
+
+		len = snprintf(name, sizeof(name), "%d", fd);
+		if (!proc_fill_cache(file, ctx,
+				     name, len, instantiate, p,
+				     (void *)(unsigned long)fd))
+			goto out_fd_loop;
+		rcu_read_lock();
 	}
+	rcu_read_unlock();
+out_fd_loop:
+	put_files_struct(files);
 out:
 	put_task_struct(p);
-out_no_task:
-	return retval;
+	return 0;
 }
 
-static int proc_readfd(struct file *filp, void *dirent, filldir_t filldir)
+static int proc_readfd(struct file *file, struct dir_context *ctx)
 {
-	return proc_readfd_common(filp, dirent, filldir, proc_fd_instantiate);
+	return proc_readfd_common(file, ctx, proc_fd_instantiate);
 }
 
 const struct file_operations proc_fd_operations = {
 	.read		= generic_read_dir,
-	.readdir	= proc_readfd,
+	.iterate	= proc_readfd,
 	.llseek		= default_llseek,
 };
 
@@ -351,9 +338,9 @@
 	return proc_lookupfd_common(dir, dentry, proc_fdinfo_instantiate);
 }
 
-static int proc_readfdinfo(struct file *filp, void *dirent, filldir_t filldir)
+static int proc_readfdinfo(struct file *file, struct dir_context *ctx)
 {
-	return proc_readfd_common(filp, dirent, filldir,
+	return proc_readfd_common(file, ctx,
 				  proc_fdinfo_instantiate);
 }
 
@@ -364,6 +351,6 @@
 
 const struct file_operations proc_fdinfo_operations = {
 	.read		= generic_read_dir,
-	.readdir	= proc_readfdinfo,
+	.iterate	= proc_readfdinfo,
 	.llseek		= default_llseek,
 };

diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index a2596af..94441a4 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c

@@ -233,76 +233,52 @@
  * value of the readdir() call, as long as it's non-negative
  * for success..
  */
-int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent,
-		filldir_t filldir)
+int proc_readdir_de(struct proc_dir_entry *de, struct file *file,
+		    struct dir_context *ctx)
 {
-	unsigned int ino;
 	int i;
-	struct inode *inode = file_inode(filp);
-	int ret = 0;
 
-	ino = inode->i_ino;
-	i = filp->f_pos;
-	switch (i) {
-		case 0:
-			if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
-				goto out;
-			i++;
-			filp->f_pos++;
-			/* fall through */
-		case 1:
-			if (filldir(dirent, "..", 2, i,
-				    parent_ino(filp->f_path.dentry),
-				    DT_DIR) < 0)
-				goto out;
-			i++;
-			filp->f_pos++;
-			/* fall through */
-		default:
-			spin_lock(&proc_subdir_lock);
-			de = de->subdir;
-			i -= 2;
-			for (;;) {
-				if (!de) {
-					ret = 1;
-					spin_unlock(&proc_subdir_lock);
-					goto out;
-				}
-				if (!i)
-					break;
-				de = de->next;
-				i--;
-			}
+	if (!dir_emit_dots(file, ctx))
+		return 0;
 
-			do {
-				struct proc_dir_entry *next;
-
-				/* filldir passes info to user space */
-				pde_get(de);
-				spin_unlock(&proc_subdir_lock);
-				if (filldir(dirent, de->name, de->namelen, filp->f_pos,
-					    de->low_ino, de->mode >> 12) < 0) {
-					pde_put(de);
-					goto out;
-				}
-				spin_lock(&proc_subdir_lock);
-				filp->f_pos++;
-				next = de->next;
-				pde_put(de);
-				de = next;
-			} while (de);
+	spin_lock(&proc_subdir_lock);
+	de = de->subdir;
+	i = ctx->pos - 2;
+	for (;;) {
+		if (!de) {
 			spin_unlock(&proc_subdir_lock);
+			return 0;
+		}
+		if (!i)
+			break;
+		de = de->next;
+		i--;
 	}
-	ret = 1;
-out:
-	return ret;	
+
+	do {
+		struct proc_dir_entry *next;
+		pde_get(de);
+		spin_unlock(&proc_subdir_lock);
+		if (!dir_emit(ctx, de->name, de->namelen,
+			    de->low_ino, de->mode >> 12)) {
+			pde_put(de);
+			return 0;
+		}
+		spin_lock(&proc_subdir_lock);
+		ctx->pos++;
+		next = de->next;
+		pde_put(de);
+		de = next;
+	} while (de);
+	spin_unlock(&proc_subdir_lock);
+	return 0;
 }
 
-int proc_readdir(struct file *filp, void *dirent, filldir_t filldir)
+int proc_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct inode *inode = file_inode(filp);
+	struct inode *inode = file_inode(file);
 
-	return proc_readdir_de(PDE(inode), filp, dirent, filldir);
+	return proc_readdir_de(PDE(inode), file, ctx);
 }
 
 /*
@@ -313,7 +289,7 @@
 static const struct file_operations proc_dir_operations = {
 	.llseek			= generic_file_llseek,
 	.read			= generic_read_dir,
-	.readdir		= proc_readdir,
+	.iterate		= proc_readdir,
 };
 
 /*

diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index d600fb0..4eae2e1 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h

@@ -165,14 +165,14 @@
 extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *);
 extern int pid_revalidate(struct dentry *, unsigned int);
 extern int pid_delete_dentry(const struct dentry *);
-extern int proc_pid_readdir(struct file *, void *, filldir_t);
+extern int proc_pid_readdir(struct file *, struct dir_context *);
 extern struct dentry *proc_pid_lookup(struct inode *, struct dentry *, unsigned int);
 extern loff_t mem_lseek(struct file *, loff_t, int);
 
 /* Lookups */
 typedef struct dentry *instantiate_t(struct inode *, struct dentry *,
 				     struct task_struct *, const void *);
-extern int proc_fill_cache(struct file *, void *, filldir_t, const char *, int,
+extern bool proc_fill_cache(struct file *, struct dir_context *, const char *, int,
 			   instantiate_t, struct task_struct *, const void *);
 
 /*
@@ -183,8 +183,8 @@
 extern struct dentry *proc_lookup(struct inode *, struct dentry *, unsigned int);
 extern struct dentry *proc_lookup_de(struct proc_dir_entry *, struct inode *,
 				     struct dentry *);
-extern int proc_readdir(struct file *, void *, filldir_t);
-extern int proc_readdir_de(struct proc_dir_entry *, struct file *, void *, filldir_t);
+extern int proc_readdir(struct file *, struct dir_context *);
+extern int proc_readdir_de(struct proc_dir_entry *, struct file *, struct dir_context *);
 
 static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde)
 {

diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index 54bdc67..f6abbbb 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c

@@ -213,74 +213,36 @@
 	return error;
 }
 
-static int proc_ns_fill_cache(struct file *filp, void *dirent,
-	filldir_t filldir, struct task_struct *task,
-	const struct proc_ns_operations *ops)
+static int proc_ns_dir_readdir(struct file *file, struct dir_context *ctx)
 {
-	return proc_fill_cache(filp, dirent, filldir,
-				ops->name, strlen(ops->name),
-				proc_ns_instantiate, task, ops);
-}
-
-static int proc_ns_dir_readdir(struct file *filp, void *dirent,
-				filldir_t filldir)
-{
-	int i;
-	struct dentry *dentry = filp->f_path.dentry;
-	struct inode *inode = dentry->d_inode;
-	struct task_struct *task = get_proc_task(inode);
+	struct task_struct *task = get_proc_task(file_inode(file));
 	const struct proc_ns_operations **entry, **last;
-	ino_t ino;
-	int ret;
 
-	ret = -ENOENT;
 	if (!task)
-		goto out_no_task;
+		return -ENOENT;
 
-	ret = 0;
-	i = filp->f_pos;
-	switch (i) {
-	case 0:
-		ino = inode->i_ino;
-		if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
-			goto out;
-		i++;
-		filp->f_pos++;
-		/* fall through */
-	case 1:
-		ino = parent_ino(dentry);
-		if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0)
-			goto out;
-		i++;
-		filp->f_pos++;
-		/* fall through */
-	default:
-		i -= 2;
-		if (i >= ARRAY_SIZE(ns_entries)) {
-			ret = 1;
-			goto out;
-		}
-		entry = ns_entries + i;
-		last = &ns_entries[ARRAY_SIZE(ns_entries) - 1];
-		while (entry <= last) {
-			if (proc_ns_fill_cache(filp, dirent, filldir,
-						task, *entry) < 0)
-				goto out;
-			filp->f_pos++;
-			entry++;
-		}
+	if (!dir_emit_dots(file, ctx))
+		goto out;
+	if (ctx->pos >= 2 + ARRAY_SIZE(ns_entries))
+		goto out;
+	entry = ns_entries + (ctx->pos - 2);
+	last = &ns_entries[ARRAY_SIZE(ns_entries) - 1];
+	while (entry <= last) {
+		const struct proc_ns_operations *ops = *entry;
+		if (!proc_fill_cache(file, ctx, ops->name, strlen(ops->name),
+				     proc_ns_instantiate, task, ops))
+			break;
+		ctx->pos++;
+		entry++;
 	}
-
-	ret = 1;
 out:
 	put_task_struct(task);
-out_no_task:
-	return ret;
+	return 0;
 }
 
 const struct file_operations proc_ns_dir_operations = {
 	.read		= generic_read_dir,
-	.readdir	= proc_ns_dir_readdir,
+	.iterate	= proc_ns_dir_readdir,
 };
 
 static struct dentry *proc_ns_dir_lookup(struct inode *dir,

diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 986e832..4677bb7 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c

@@ -160,16 +160,15 @@
 	.getattr	= proc_tgid_net_getattr,
 };
 
-static int proc_tgid_net_readdir(struct file *filp, void *dirent,
-		filldir_t filldir)
+static int proc_tgid_net_readdir(struct file *file, struct dir_context *ctx)
 {
 	int ret;
 	struct net *net;
 
 	ret = -EINVAL;
-	net = get_proc_task_net(file_inode(filp));
+	net = get_proc_task_net(file_inode(file));
 	if (net != NULL) {
-		ret = proc_readdir_de(net->proc_net, filp, dirent, filldir);
+		ret = proc_readdir_de(net->proc_net, file, ctx);
 		put_net(net);
 	}
 	return ret;
@@ -178,7 +177,7 @@
 const struct file_operations proc_net_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
-	.readdir	= proc_tgid_net_readdir,
+	.iterate	= proc_tgid_net_readdir,
 };
 
 static __net_init int proc_net_ns_init(struct net *net)

diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index ac05f33..f3a570e 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c

@@ -573,12 +573,12 @@
 	return ret;
 }
 
-static int proc_sys_fill_cache(struct file *filp, void *dirent,
-				filldir_t filldir,
+static bool proc_sys_fill_cache(struct file *file,
+				struct dir_context *ctx,
 				struct ctl_table_header *head,
 				struct ctl_table *table)
 {
-	struct dentry *child, *dir = filp->f_path.dentry;
+	struct dentry *child, *dir = file->f_path.dentry;
 	struct inode *inode;
 	struct qstr qname;
 	ino_t ino = 0;
@@ -595,38 +595,38 @@
 			inode = proc_sys_make_inode(dir->d_sb, head, table);
 			if (!inode) {
 				dput(child);
-				return -ENOMEM;
+				return false;
 			} else {
 				d_set_d_op(child, &proc_sys_dentry_operations);
 				d_add(child, inode);
 			}
 		} else {
-			return -ENOMEM;
+			return false;
 		}
 	}
 	inode = child->d_inode;
 	ino  = inode->i_ino;
 	type = inode->i_mode >> 12;
 	dput(child);
-	return !!filldir(dirent, qname.name, qname.len, filp->f_pos, ino, type);
+	return dir_emit(ctx, qname.name, qname.len, ino, type);
 }
 
-static int proc_sys_link_fill_cache(struct file *filp, void *dirent,
-				    filldir_t filldir,
+static bool proc_sys_link_fill_cache(struct file *file,
+				    struct dir_context *ctx,
 				    struct ctl_table_header *head,
 				    struct ctl_table *table)
 {
-	int err, ret = 0;
+	bool ret = true;
 	head = sysctl_head_grab(head);
 
 	if (S_ISLNK(table->mode)) {
 		/* It is not an error if we can not follow the link ignore it */
-		err = sysctl_follow_link(&head, &table, current->nsproxy);
+		int err = sysctl_follow_link(&head, &table, current->nsproxy);
 		if (err)
 			goto out;
 	}
 
-	ret = proc_sys_fill_cache(filp, dirent, filldir, head, table);
+	ret = proc_sys_fill_cache(file, ctx, head, table);
 out:
 	sysctl_head_finish(head);
 	return ret;
@@ -634,67 +634,50 @@
 
 static int scan(struct ctl_table_header *head, ctl_table *table,
 		unsigned long *pos, struct file *file,
-		void *dirent, filldir_t filldir)
+		struct dir_context *ctx)
 {
-	int res;
+	bool res;
 
-	if ((*pos)++ < file->f_pos)
-		return 0;
+	if ((*pos)++ < ctx->pos)
+		return true;
 
 	if (unlikely(S_ISLNK(table->mode)))
-		res = proc_sys_link_fill_cache(file, dirent, filldir, head, table);
+		res = proc_sys_link_fill_cache(file, ctx, head, table);
 	else
-		res = proc_sys_fill_cache(file, dirent, filldir, head, table);
+		res = proc_sys_fill_cache(file, ctx, head, table);
 
-	if (res == 0)
-		file->f_pos = *pos;
+	if (res)
+		ctx->pos = *pos;
 
 	return res;
 }
 
-static int proc_sys_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int proc_sys_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct dentry *dentry = filp->f_path.dentry;
-	struct inode *inode = dentry->d_inode;
-	struct ctl_table_header *head = grab_header(inode);
+	struct ctl_table_header *head = grab_header(file_inode(file));
 	struct ctl_table_header *h = NULL;
 	struct ctl_table *entry;
 	struct ctl_dir *ctl_dir;
 	unsigned long pos;
-	int ret = -EINVAL;
 
 	if (IS_ERR(head))
 		return PTR_ERR(head);
 
 	ctl_dir = container_of(head, struct ctl_dir, header);
 
-	ret = 0;
-	/* Avoid a switch here: arm builds fail with missing __cmpdi2 */
-	if (filp->f_pos == 0) {
-		if (filldir(dirent, ".", 1, filp->f_pos,
-				inode->i_ino, DT_DIR) < 0)
-			goto out;
-		filp->f_pos++;
-	}
-	if (filp->f_pos == 1) {
-		if (filldir(dirent, "..", 2, filp->f_pos,
-				parent_ino(dentry), DT_DIR) < 0)
-			goto out;
-		filp->f_pos++;
-	}
+	if (!dir_emit_dots(file, ctx))
+		goto out;	/* must still drop the grab_header() reference */
 	pos = 2;
 
 	for (first_entry(ctl_dir, &h, &entry); h; next_entry(&h, &entry)) {
-		ret = scan(h, entry, &pos, filp, dirent, filldir);
-		if (ret) {
+		if (!scan(h, entry, &pos, file, ctx)) {
 			sysctl_head_finish(h);
 			break;
 		}
 	}
-	ret = 1;
 out:
 	sysctl_head_finish(head);
-	return ret;
+	return 0;
 }
 
 static int proc_sys_permission(struct inode *inode, int mask)
@@ -769,7 +752,7 @@
 
 static const struct file_operations proc_sys_dir_file_operations = {
 	.read		= generic_read_dir,
-	.readdir	= proc_sys_readdir,
+	.iterate	= proc_sys_readdir,
 	.llseek		= generic_file_llseek,
 };
 

diff --git a/fs/proc/root.c b/fs/proc/root.c
index 41a6ea9..229e366 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c

@@ -202,21 +202,14 @@
 	return proc_pid_lookup(dir, dentry, flags);
 }
 
-static int proc_root_readdir(struct file * filp,
-	void * dirent, filldir_t filldir)
+static int proc_root_readdir(struct file *file, struct dir_context *ctx)
 {
-	unsigned int nr = filp->f_pos;
-	int ret;
-
-	if (nr < FIRST_PROCESS_ENTRY) {
-		int error = proc_readdir(filp, dirent, filldir);
-		if (error <= 0)
-			return error;
-		filp->f_pos = FIRST_PROCESS_ENTRY;
+	if (ctx->pos < FIRST_PROCESS_ENTRY) {
+		proc_readdir(file, ctx);
+		ctx->pos = FIRST_PROCESS_ENTRY;
 	}
 
-	ret = proc_pid_readdir(filp, dirent, filldir);
-	return ret;
+	return proc_pid_readdir(file, ctx);
 }
 
 /*
@@ -226,7 +219,7 @@
  */
 static const struct file_operations proc_root_operations = {
 	.read		 = generic_read_dir,
-	.readdir	 = proc_root_readdir,
+	.iterate	 = proc_root_readdir,
 	.llseek		= default_llseek,
 };
 

diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c
index 28ce014..b218f96 100644
--- a/fs/qnx4/dir.c
+++ b/fs/qnx4/dir.c

@@ -14,9 +14,9 @@
 #include <linux/buffer_head.h>
 #include "qnx4.h"
 
-static int qnx4_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int qnx4_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct inode *inode = file_inode(filp);
+	struct inode *inode = file_inode(file);
 	unsigned int offset;
 	struct buffer_head *bh;
 	struct qnx4_inode_entry *de;
@@ -26,48 +26,44 @@
 	int size;
 
 	QNX4DEBUG((KERN_INFO "qnx4_readdir:i_size = %ld\n", (long) inode->i_size));
-	QNX4DEBUG((KERN_INFO "filp->f_pos         = %ld\n", (long) filp->f_pos));
+	QNX4DEBUG((KERN_INFO "pos                 = %ld\n", (long) ctx->pos));
 
-	while (filp->f_pos < inode->i_size) {
-		blknum = qnx4_block_map( inode, filp->f_pos >> QNX4_BLOCK_SIZE_BITS );
+	while (ctx->pos < inode->i_size) {
+		blknum = qnx4_block_map(inode, ctx->pos >> QNX4_BLOCK_SIZE_BITS);
 		bh = sb_bread(inode->i_sb, blknum);
-		if(bh==NULL) {
+		if (bh == NULL) {
 			printk(KERN_ERR "qnx4_readdir: bread failed (%ld)\n", blknum);
-			break;
+			return 0;
 		}
-		ix = (int)(filp->f_pos >> QNX4_DIR_ENTRY_SIZE_BITS) % QNX4_INODES_PER_BLOCK;
-		while (ix < QNX4_INODES_PER_BLOCK) {
+		ix = (ctx->pos >> QNX4_DIR_ENTRY_SIZE_BITS) % QNX4_INODES_PER_BLOCK;
+		for (; ix < QNX4_INODES_PER_BLOCK; ix++, ctx->pos += QNX4_DIR_ENTRY_SIZE) {
 			offset = ix * QNX4_DIR_ENTRY_SIZE;
 			de = (struct qnx4_inode_entry *) (bh->b_data + offset);
-			size = strlen(de->di_fname);
-			if (size) {
-				if ( !( de->di_status & QNX4_FILE_LINK ) && size > QNX4_SHORT_NAME_MAX )
-					size = QNX4_SHORT_NAME_MAX;
-				else if ( size > QNX4_NAME_MAX )
-					size = QNX4_NAME_MAX;
-
-				if ( ( de->di_status & (QNX4_FILE_USED|QNX4_FILE_LINK) ) != 0 ) {
-					QNX4DEBUG((KERN_INFO "qnx4_readdir:%.*s\n", size, de->di_fname));
-					if ( ( de->di_status & QNX4_FILE_LINK ) == 0 )
-						ino = blknum * QNX4_INODES_PER_BLOCK + ix - 1;
-					else {
-						le  = (struct qnx4_link_info*)de;
-						ino = ( le32_to_cpu(le->dl_inode_blk) - 1 ) *
-							QNX4_INODES_PER_BLOCK +
-							le->dl_inode_ndx;
-					}
-					if (filldir(dirent, de->di_fname, size, filp->f_pos, ino, DT_UNKNOWN) < 0) {
-						brelse(bh);
-						goto out;
-					}
-				}
+			if (!de->di_fname[0])
+				continue;
+			if (!(de->di_status & (QNX4_FILE_USED|QNX4_FILE_LINK)))
+				continue;
+			if (!(de->di_status & QNX4_FILE_LINK))
+				size = QNX4_SHORT_NAME_MAX;
+			else
+				size = QNX4_NAME_MAX;
+			size = strnlen(de->di_fname, size);
+			QNX4DEBUG((KERN_INFO "qnx4_readdir:%.*s\n", size, de->di_fname));
+			if (!(de->di_status & QNX4_FILE_LINK))
+				ino = blknum * QNX4_INODES_PER_BLOCK + ix - 1;
+			else {
+				le  = (struct qnx4_link_info*)de;
+				ino = ( le32_to_cpu(le->dl_inode_blk) - 1 ) *
+					QNX4_INODES_PER_BLOCK +
+					le->dl_inode_ndx;
 			}
-			ix++;
-			filp->f_pos += QNX4_DIR_ENTRY_SIZE;
+			if (!dir_emit(ctx, de->di_fname, size, ino, DT_UNKNOWN)) {
+				brelse(bh);
+				return 0;
+			}
 		}
 		brelse(bh);
 	}
-out:
 	return 0;
 }
 
@@ -75,7 +71,7 @@
 {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
-	.readdir	= qnx4_readdir,
+	.iterate	= qnx4_readdir,
 	.fsync		= generic_file_fsync,
 };
 

diff --git a/fs/qnx6/dir.c b/fs/qnx6/dir.c
index afa6be6..15b7d92e 100644
--- a/fs/qnx6/dir.c
+++ b/fs/qnx6/dir.c

@@ -65,8 +65,8 @@
 
 static int qnx6_dir_longfilename(struct inode *inode,
 			struct qnx6_long_dir_entry *de,
-			void *dirent, loff_t pos,
-			unsigned de_inode, filldir_t filldir)
+			struct dir_context *ctx,
+			unsigned de_inode)
 {
 	struct qnx6_long_filename *lf;
 	struct super_block *s = inode->i_sb;
@@ -104,8 +104,7 @@
 
 	QNX6DEBUG((KERN_INFO "qnx6_readdir:%.*s inode:%u\n",
 					lf_size, lf->lf_fname, de_inode));
-	if (filldir(dirent, lf->lf_fname, lf_size, pos, de_inode,
-			DT_UNKNOWN) < 0) {
+	if (!dir_emit(ctx, lf->lf_fname, lf_size, de_inode, DT_UNKNOWN)) {
 		qnx6_put_page(page);
 		return 0;
 	}
@@ -115,18 +114,19 @@
 	return 1;
 }
 
-static int qnx6_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int qnx6_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct inode *inode = file_inode(filp);
+	struct inode *inode = file_inode(file);
 	struct super_block *s = inode->i_sb;
 	struct qnx6_sb_info *sbi = QNX6_SB(s);
-	loff_t pos = filp->f_pos & ~(QNX6_DIR_ENTRY_SIZE - 1);
+	loff_t pos = ctx->pos & ~(QNX6_DIR_ENTRY_SIZE - 1);
 	unsigned long npages = dir_pages(inode);
 	unsigned long n = pos >> PAGE_CACHE_SHIFT;
 	unsigned start = (pos & ~PAGE_CACHE_MASK) / QNX6_DIR_ENTRY_SIZE;
 	bool done = false;
 
-	if (filp->f_pos >= inode->i_size)
+	ctx->pos = pos;
+	if (ctx->pos >= inode->i_size)
 		return 0;
 
 	for ( ; !done && n < npages; n++, start = 0) {
@@ -137,11 +137,11 @@
 
 		if (IS_ERR(page)) {
 			printk(KERN_ERR "qnx6_readdir: read failed\n");
-			filp->f_pos = (n + 1) << PAGE_CACHE_SHIFT;
+			ctx->pos = (n + 1) << PAGE_CACHE_SHIFT;
 			return PTR_ERR(page);
 		}
 		de = ((struct qnx6_dir_entry *)page_address(page)) + start;
-		for (; i < limit; i++, de++, pos += QNX6_DIR_ENTRY_SIZE) {
+		for (; i < limit; i++, de++, ctx->pos += QNX6_DIR_ENTRY_SIZE) {
 			int size = de->de_size;
 			u32 no_inode = fs32_to_cpu(sbi, de->de_inode);
 
@@ -154,8 +154,7 @@
 				   structure / block */
 				if (!qnx6_dir_longfilename(inode,
 					(struct qnx6_long_dir_entry *)de,
-					dirent, pos, no_inode,
-					filldir)) {
+					ctx, no_inode)) {
 					done = true;
 					break;
 				}
@@ -163,9 +162,8 @@
 				QNX6DEBUG((KERN_INFO "qnx6_readdir:%.*s"
 				   " inode:%u\n", size, de->de_fname,
 							no_inode));
-				if (filldir(dirent, de->de_fname, size,
-				      pos, no_inode, DT_UNKNOWN)
-					< 0) {
+				if (!dir_emit(ctx, de->de_fname, size,
+				      no_inode, DT_UNKNOWN)) {
 					done = true;
 					break;
 				}
@@ -173,7 +171,6 @@
 		}
 		qnx6_put_page(page);
 	}
-	filp->f_pos = pos;
 	return 0;
 }
 
@@ -282,7 +279,7 @@
 const struct file_operations qnx6_dir_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
-	.readdir	= qnx6_readdir,
+	.iterate	= qnx6_readdir,
 	.fsync		= generic_file_fsync,
 };
 

diff --git a/fs/readdir.c b/fs/readdir.c
index fee38e0..93d71e5 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c

@@ -20,11 +20,11 @@
 
 #include <asm/uaccess.h>
 
-int vfs_readdir(struct file *file, filldir_t filler, void *buf)
+int iterate_dir(struct file *file, struct dir_context *ctx)
 {
 	struct inode *inode = file_inode(file);
 	int res = -ENOTDIR;
-	if (!file->f_op || !file->f_op->readdir)
+	if (!file->f_op || !file->f_op->iterate)
 		goto out;
 
 	res = security_file_permission(file, MAY_READ);
@@ -37,15 +37,16 @@
 
 	res = -ENOENT;
 	if (!IS_DEADDIR(inode)) {
-		res = file->f_op->readdir(file, buf, filler);
+		ctx->pos = file->f_pos;
+		res = file->f_op->iterate(file, ctx);
+		file->f_pos = ctx->pos;
 		file_accessed(file);
 	}
 	mutex_unlock(&inode->i_mutex);
 out:
 	return res;
 }
-
-EXPORT_SYMBOL(vfs_readdir);
+EXPORT_SYMBOL(iterate_dir);
 
 /*
  * Traditional linux readdir() handling..
@@ -66,6 +67,7 @@
 };
 
 struct readdir_callback {
+	struct dir_context ctx;
 	struct old_linux_dirent __user * dirent;
 	int result;
 };
@@ -73,7 +75,7 @@
 static int fillonedir(void * __buf, const char * name, int namlen, loff_t offset,
 		      u64 ino, unsigned int d_type)
 {
-	struct readdir_callback * buf = (struct readdir_callback *) __buf;
+	struct readdir_callback *buf = (struct readdir_callback *) __buf;
 	struct old_linux_dirent __user * dirent;
 	unsigned long d_ino;
 
@@ -107,15 +109,15 @@
 {
 	int error;
 	struct fd f = fdget(fd);
-	struct readdir_callback buf;
+	struct readdir_callback buf = {
+		.ctx.actor = fillonedir,
+		.dirent = dirent
+	};
 
 	if (!f.file)
 		return -EBADF;
 
-	buf.result = 0;
-	buf.dirent = dirent;
-
-	error = vfs_readdir(f.file, fillonedir, &buf);
+	error = iterate_dir(f.file, &buf.ctx);
 	if (buf.result)
 		error = buf.result;
 
@@ -137,6 +139,7 @@
 };
 
 struct getdents_callback {
+	struct dir_context ctx;
 	struct linux_dirent __user * current_dir;
 	struct linux_dirent __user * previous;
 	int count;
@@ -191,7 +194,11 @@
 {
 	struct fd f;
 	struct linux_dirent __user * lastdirent;
-	struct getdents_callback buf;
+	struct getdents_callback buf = {
+		.ctx.actor = filldir,
+		.count = count,
+		.current_dir = dirent
+	};
 	int error;
 
 	if (!access_ok(VERIFY_WRITE, dirent, count))
@@ -201,17 +208,12 @@
 	if (!f.file)
 		return -EBADF;
 
-	buf.current_dir = dirent;
-	buf.previous = NULL;
-	buf.count = count;
-	buf.error = 0;
-
-	error = vfs_readdir(f.file, filldir, &buf);
+	error = iterate_dir(f.file, &buf.ctx);
 	if (error >= 0)
 		error = buf.error;
 	lastdirent = buf.previous;
 	if (lastdirent) {
-		if (put_user(f.file->f_pos, &lastdirent->d_off))
+		if (put_user(buf.ctx.pos, &lastdirent->d_off))
 			error = -EFAULT;
 		else
 			error = count - buf.count;
@@ -221,6 +223,7 @@
 }
 
 struct getdents_callback64 {
+	struct dir_context ctx;
 	struct linux_dirent64 __user * current_dir;
 	struct linux_dirent64 __user * previous;
 	int count;
@@ -271,7 +274,11 @@
 {
 	struct fd f;
 	struct linux_dirent64 __user * lastdirent;
-	struct getdents_callback64 buf;
+	struct getdents_callback64 buf = {
+		.ctx.actor = filldir64,
+		.count = count,
+		.current_dir = dirent
+	};
 	int error;
 
 	if (!access_ok(VERIFY_WRITE, dirent, count))
@@ -281,17 +288,12 @@
 	if (!f.file)
 		return -EBADF;
 
-	buf.current_dir = dirent;
-	buf.previous = NULL;
-	buf.count = count;
-	buf.error = 0;
-
-	error = vfs_readdir(f.file, filldir64, &buf);
+	error = iterate_dir(f.file, &buf.ctx);
 	if (error >= 0)
 		error = buf.error;
 	lastdirent = buf.previous;
 	if (lastdirent) {
-		typeof(lastdirent->d_off) d_off = f.file->f_pos;
+		typeof(lastdirent->d_off) d_off = buf.ctx.pos;
 		if (__put_user(d_off, &lastdirent->d_off))
 			error = -EFAULT;
 		else

diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index 6c2d136..03e4ca5 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c

@@ -13,14 +13,14 @@
 
 extern const struct reiserfs_key MIN_KEY;
 
-static int reiserfs_readdir(struct file *, void *, filldir_t);
+static int reiserfs_readdir(struct file *, struct dir_context *);
 static int reiserfs_dir_fsync(struct file *filp, loff_t start, loff_t end,
 			      int datasync);
 
 const struct file_operations reiserfs_dir_operations = {
 	.llseek = generic_file_llseek,
 	.read = generic_read_dir,
-	.readdir = reiserfs_readdir,
+	.iterate = reiserfs_readdir,
 	.fsync = reiserfs_dir_fsync,
 	.unlocked_ioctl = reiserfs_ioctl,
 #ifdef CONFIG_COMPAT
@@ -50,18 +50,15 @@
 
 #define store_ih(where,what) copy_item_head (where, what)
 
-static inline bool is_privroot_deh(struct dentry *dir,
-				   struct reiserfs_de_head *deh)
+static inline bool is_privroot_deh(struct inode *dir, struct reiserfs_de_head *deh)
 {
-	struct dentry *privroot = REISERFS_SB(dir->d_sb)->priv_root;
-	return (dir == dir->d_parent && privroot->d_inode &&
+	struct dentry *privroot = REISERFS_SB(dir->i_sb)->priv_root;
+	return (privroot->d_inode &&
 	        deh->deh_objectid == INODE_PKEY(privroot->d_inode)->k_objectid);
 }
 
-int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent,
-			   filldir_t filldir, loff_t *pos)
+int reiserfs_readdir_inode(struct inode *inode, struct dir_context *ctx)
 {
-	struct inode *inode = dentry->d_inode;
 	struct cpu_key pos_key;	/* key of current position in the directory (key of directory entry) */
 	INITIALIZE_PATH(path_to_entry);
 	struct buffer_head *bh;
@@ -81,7 +78,7 @@
 
 	/* form key for search the next directory entry using f_pos field of
 	   file structure */
-	make_cpu_key(&pos_key, inode, *pos ?: DOT_OFFSET, TYPE_DIRENTRY, 3);
+	make_cpu_key(&pos_key, inode, ctx->pos ?: DOT_OFFSET, TYPE_DIRENTRY, 3);
 	next_pos = cpu_key_k_offset(&pos_key);
 
 	path_to_entry.reada = PATH_READA;
@@ -126,7 +123,6 @@
 			     entry_num++, deh++) {
 				int d_reclen;
 				char *d_name;
-				off_t d_off;
 				ino_t d_ino;
 
 				if (!de_visible(deh))
@@ -155,11 +151,10 @@
 				}
 
 				/* Ignore the .reiserfs_priv entry */
-				if (is_privroot_deh(dentry, deh))
+				if (is_privroot_deh(inode, deh))
 					continue;
 
-				d_off = deh_offset(deh);
-				*pos = d_off;
+				ctx->pos = deh_offset(deh);
 				d_ino = deh_objectid(deh);
 				if (d_reclen <= 32) {
 					local_buf = small_buf;
@@ -187,9 +182,9 @@
 				 * the write lock here for other waiters
 				 */
 				reiserfs_write_unlock(inode->i_sb);
-				if (filldir
-				    (dirent, local_buf, d_reclen, d_off, d_ino,
-				     DT_UNKNOWN) < 0) {
+				if (!dir_emit
+				    (ctx, local_buf, d_reclen, d_ino,
+				     DT_UNKNOWN)) {
 					reiserfs_write_lock(inode->i_sb);
 					if (local_buf != small_buf) {
 						kfree(local_buf);
@@ -237,7 +232,7 @@
 	}			/* while */
 
 end:
-	*pos = next_pos;
+	ctx->pos = next_pos;
 	pathrelse(&path_to_entry);
 	reiserfs_check_path(&path_to_entry);
 out:
@@ -245,10 +240,9 @@
 	return ret;
 }
 
-static int reiserfs_readdir(struct file *file, void *dirent, filldir_t filldir)
+static int reiserfs_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct dentry *dentry = file->f_path.dentry;
-	return reiserfs_readdir_dentry(dentry, dirent, filldir, &file->f_pos);
+	return reiserfs_readdir_inode(file_inode(file), ctx);
 }
 
 /* compose directory item containing "." and ".." entries (entries are

diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index f844533..0048cc1 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c

@@ -2975,16 +2975,19 @@
 }
 
 /* clm -- taken from fs/buffer.c:block_invalidate_page */
-static void reiserfs_invalidatepage(struct page *page, unsigned long offset)
+static void reiserfs_invalidatepage(struct page *page, unsigned int offset,
+				    unsigned int length)
 {
 	struct buffer_head *head, *bh, *next;
 	struct inode *inode = page->mapping->host;
 	unsigned int curr_off = 0;
+	unsigned int stop = offset + length;
+	int partial_page = (offset || length < PAGE_CACHE_SIZE);
 	int ret = 1;
 
 	BUG_ON(!PageLocked(page));
 
-	if (offset == 0)
+	if (!partial_page)
 		ClearPageChecked(page);
 
 	if (!page_has_buffers(page))
@@ -2996,6 +2999,9 @@
 		unsigned int next_off = curr_off + bh->b_size;
 		next = bh->b_this_page;
 
+		if (next_off > stop)
+			goto out;
+
 		/*
 		 * is this block fully invalidated?
 		 */
@@ -3014,7 +3020,7 @@
 	 * The get_block cached value has been unconditionally invalidated,
 	 * so real IO is not possible anymore.
 	 */
-	if (!offset && ret) {
+	if (!partial_page && ret) {
 		ret = try_to_release_page(page, 0);
 		/* maybe should BUG_ON(!ret); - neilb */
 	}

diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h
index 157e474..3df5ce6 100644
--- a/fs/reiserfs/reiserfs.h
+++ b/fs/reiserfs/reiserfs.h

@@ -2709,7 +2709,7 @@
 extern const struct inode_operations reiserfs_symlink_inode_operations;
 extern const struct inode_operations reiserfs_special_inode_operations;
 extern const struct file_operations reiserfs_dir_operations;
-int reiserfs_readdir_dentry(struct dentry *, void *, filldir_t, loff_t *);
+int reiserfs_readdir_inode(struct inode *, struct dir_context *);
 
 /* tail_conversion.c */
 int direct2indirect(struct reiserfs_transaction_handle *, struct inode *,

diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 821bcf7..c69cdd7 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c

@@ -171,6 +171,7 @@
  * modifying extended attributes. This includes operations such as permissions
  * or ownership changes, object deletions, etc. */
 struct reiserfs_dentry_buf {
+	struct dir_context ctx;
 	struct dentry *xadir;
 	int count;
 	struct dentry *dentries[8];
@@ -223,9 +224,8 @@
 {
 	struct dentry *dir;
 	int i, err = 0;
-	loff_t pos = 0;
 	struct reiserfs_dentry_buf buf = {
-		.count = 0,
+		.ctx.actor = fill_with_dentries,
 	};
 
 	/* Skip out, an xattr has no xattrs associated with it */
@@ -249,29 +249,27 @@
 	reiserfs_write_lock(inode->i_sb);
 
 	buf.xadir = dir;
-	err = reiserfs_readdir_dentry(dir, &buf, fill_with_dentries, &pos);
-	while ((err == 0 || err == -ENOSPC) && buf.count) {
-		err = 0;
-
-		for (i = 0; i < buf.count && buf.dentries[i]; i++) {
-			int lerr = 0;
+	while (1) {
+		err = reiserfs_readdir_inode(dir->d_inode, &buf.ctx);
+		if (err)
+			break;
+		if (!buf.count)
+			break;
+		for (i = 0; !err && i < buf.count && buf.dentries[i]; i++) {
 			struct dentry *dentry = buf.dentries[i];
 
-			if (err == 0 && !S_ISDIR(dentry->d_inode->i_mode))
-				lerr = action(dentry, data);
+			if (!S_ISDIR(dentry->d_inode->i_mode))
+				err = action(dentry, data);
 
 			dput(dentry);
 			buf.dentries[i] = NULL;
-			err = lerr ?: err;
 		}
+		if (err)
+			break;
 		buf.count = 0;
-		if (!err)
-			err = reiserfs_readdir_dentry(dir, &buf,
-						      fill_with_dentries, &pos);
 	}
 	mutex_unlock(&dir->d_inode->i_mutex);
 
-	/* Clean up after a failed readdir */
 	cleanup_dentry_buf(&buf);
 
 	if (!err) {
@@ -800,6 +798,7 @@
 }
 
 struct listxattr_buf {
+	struct dir_context ctx;
 	size_t size;
 	size_t pos;
 	char *buf;
@@ -845,8 +844,8 @@
 {
 	struct dentry *dir;
 	int err = 0;
-	loff_t pos = 0;
 	struct listxattr_buf buf = {
+		.ctx.actor = listxattr_filler,
 		.dentry = dentry,
 		.buf = buffer,
 		.size = buffer ? size : 0,
@@ -868,7 +867,7 @@
 	}
 
 	mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_XATTR);
-	err = reiserfs_readdir_dentry(dir, &buf, listxattr_filler, &pos);
+	err = reiserfs_readdir_inode(dir->d_inode, &buf.ctx);
 	mutex_unlock(&dir->d_inode->i_mutex);
 
 	if (!err)

diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index 15cbc41e..ff1d3d42 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c

@@ -145,19 +145,18 @@
 /*
  * read the entries from a directory
  */
-static int romfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int romfs_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct inode *i = file_inode(filp);
+	struct inode *i = file_inode(file);
 	struct romfs_inode ri;
 	unsigned long offset, maxoff;
 	int j, ino, nextfh;
-	int stored = 0;
 	char fsname[ROMFS_MAXFN];	/* XXX dynamic? */
 	int ret;
 
 	maxoff = romfs_maxsize(i->i_sb);
 
-	offset = filp->f_pos;
+	offset = ctx->pos;
 	if (!offset) {
 		offset = i->i_ino & ROMFH_MASK;
 		ret = romfs_dev_read(i->i_sb, offset, &ri, ROMFH_SIZE);
@@ -170,10 +169,10 @@
 	for (;;) {
 		if (!offset || offset >= maxoff) {
 			offset = maxoff;
-			filp->f_pos = offset;
+			ctx->pos = offset;
 			goto out;
 		}
-		filp->f_pos = offset;
+		ctx->pos = offset;
 
 		/* Fetch inode info */
 		ret = romfs_dev_read(i->i_sb, offset, &ri, ROMFH_SIZE);
@@ -194,16 +193,14 @@
 		nextfh = be32_to_cpu(ri.next);
 		if ((nextfh & ROMFH_TYPE) == ROMFH_HRD)
 			ino = be32_to_cpu(ri.spec);
-		if (filldir(dirent, fsname, j, offset, ino,
-			    romfs_dtype_table[nextfh & ROMFH_TYPE]) < 0)
+		if (!dir_emit(ctx, fsname, j, ino,
+			    romfs_dtype_table[nextfh & ROMFH_TYPE]))
 			goto out;
 
-		stored++;
 		offset = nextfh & ROMFH_MASK;
 	}
-
 out:
-	return stored;
+	return 0;
 }
 
 /*
@@ -281,7 +278,7 @@
 
 static const struct file_operations romfs_dir_operations = {
 	.read		= generic_read_dir,
-	.readdir	= romfs_readdir,
+	.iterate	= romfs_readdir,
 	.llseek		= default_llseek,
 };
 

diff --git a/fs/splice.c b/fs/splice.c
index 9eca476..d37431d 100644
--- a/fs/splice.c
+++ b/fs/splice.c

@@ -1283,6 +1283,7 @@
  * @in:		file to splice from
  * @ppos:	input file offset
  * @out:	file to splice to
+ * @opos:	output file offset
  * @len:	number of bytes to splice
  * @flags:	splice modifier flags
  *

diff --git a/fs/squashfs/dir.c b/fs/squashfs/dir.c
index 57dc70e..f7f527b 100644
--- a/fs/squashfs/dir.c
+++ b/fs/squashfs/dir.c

@@ -100,7 +100,7 @@
 }
 
 
-static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir)
+static int squashfs_readdir(struct file *file, struct dir_context *ctx)
 {
 	struct inode *inode = file_inode(file);
 	struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
@@ -127,11 +127,11 @@
 	 * It also means that the external f_pos is offset by 3 from the
 	 * on-disk directory f_pos.
 	 */
-	while (file->f_pos < 3) {
+	while (ctx->pos < 3) {
 		char *name;
 		int i_ino;
 
-		if (file->f_pos == 0) {
+		if (ctx->pos == 0) {
 			name = ".";
 			size = 1;
 			i_ino = inode->i_ino;
@@ -141,24 +141,18 @@
 			i_ino = squashfs_i(inode)->parent;
 		}
 
-		TRACE("Calling filldir(%p, %s, %d, %lld, %d, %d)\n",
-				dirent, name, size, file->f_pos, i_ino,
-				squashfs_filetype_table[1]);
-
-		if (filldir(dirent, name, size, file->f_pos, i_ino,
-				squashfs_filetype_table[1]) < 0) {
-				TRACE("Filldir returned less than 0\n");
+		if (!dir_emit(ctx, name, size, i_ino,
+				squashfs_filetype_table[1]))
 			goto finish;
-		}
 
-		file->f_pos += size;
+		ctx->pos += size;
 	}
 
 	length = get_dir_index_using_offset(inode->i_sb, &block, &offset,
 				squashfs_i(inode)->dir_idx_start,
 				squashfs_i(inode)->dir_idx_offset,
 				squashfs_i(inode)->dir_idx_cnt,
-				file->f_pos);
+				ctx->pos);
 
 	while (length < i_size_read(inode)) {
 		/*
@@ -198,7 +192,7 @@
 
 			length += sizeof(*dire) + size;
 
-			if (file->f_pos >= length)
+			if (ctx->pos >= length)
 				continue;
 
 			dire->name[size] = '\0';
@@ -206,22 +200,12 @@
 				((short) le16_to_cpu(dire->inode_number));
 			type = le16_to_cpu(dire->type);
 
-			TRACE("Calling filldir(%p, %s, %d, %lld, %x:%x, %d, %d)"
-					"\n", dirent, dire->name, size,
-					file->f_pos,
-					le32_to_cpu(dirh.start_block),
-					le16_to_cpu(dire->offset),
+			if (!dir_emit(ctx, dire->name, size,
 					inode_number,
-					squashfs_filetype_table[type]);
-
-			if (filldir(dirent, dire->name, size, file->f_pos,
-					inode_number,
-					squashfs_filetype_table[type]) < 0) {
-				TRACE("Filldir returned less than 0\n");
+					squashfs_filetype_table[type]))
 				goto finish;
-			}
 
-			file->f_pos = length;
+			ctx->pos = length;
 		}
 	}
 
@@ -238,6 +222,6 @@
 
 const struct file_operations squashfs_dir_ops = {
 	.read = generic_read_dir,
-	.readdir = squashfs_readdir,
+	.iterate = squashfs_readdir,
 	.llseek = default_llseek,
 };

diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 17c9a70..e068e74 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c

@@ -998,68 +998,38 @@
 	return pos;
 }
 
-static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
+static int sysfs_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct dentry *dentry = filp->f_path.dentry;
+	struct dentry *dentry = file->f_path.dentry;
 	struct sysfs_dirent * parent_sd = dentry->d_fsdata;
-	struct sysfs_dirent *pos = filp->private_data;
+	struct sysfs_dirent *pos = file->private_data;
 	enum kobj_ns_type type;
 	const void *ns;
-	ino_t ino;
-	loff_t off;
 
 	type = sysfs_ns_type(parent_sd);
 	ns = sysfs_info(dentry->d_sb)->ns[type];
 
-	if (filp->f_pos == 0) {
-		ino = parent_sd->s_ino;
-		if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) == 0)
-			filp->f_pos++;
-		else
-			return 0;
-	}
-	if (filp->f_pos == 1) {
-		if (parent_sd->s_parent)
-			ino = parent_sd->s_parent->s_ino;
-		else
-			ino = parent_sd->s_ino;
-		if (filldir(dirent, "..", 2, filp->f_pos, ino, DT_DIR) == 0)
-			filp->f_pos++;
-		else
-			return 0;
-	}
+	if (!dir_emit_dots(file, ctx))
+		return 0;
 	mutex_lock(&sysfs_mutex);
-	off = filp->f_pos;
-	for (pos = sysfs_dir_pos(ns, parent_sd, filp->f_pos, pos);
+	for (pos = sysfs_dir_pos(ns, parent_sd, ctx->pos, pos);
 	     pos;
-	     pos = sysfs_dir_next_pos(ns, parent_sd, filp->f_pos, pos)) {
-		const char * name;
-		unsigned int type;
-		int len, ret;
-
-		name = pos->s_name;
-		len = strlen(name);
-		ino = pos->s_ino;
-		type = dt_type(pos);
-		off = filp->f_pos = pos->s_hash;
-		filp->private_data = sysfs_get(pos);
+	     pos = sysfs_dir_next_pos(ns, parent_sd, ctx->pos, pos)) {
+		const char *name = pos->s_name;
+		unsigned int type = dt_type(pos);
+		int len = strlen(name);
+		ino_t ino = pos->s_ino;
+		ctx->pos = pos->s_hash;
+		file->private_data = sysfs_get(pos);
 
 		mutex_unlock(&sysfs_mutex);
-		ret = filldir(dirent, name, len, off, ino, type);
+		if (!dir_emit(ctx, name, len, ino, type))
+			return 0;
 		mutex_lock(&sysfs_mutex);
-		if (ret < 0)
-			break;
 	}
 	mutex_unlock(&sysfs_mutex);
-
-	/* don't reference last entry if its refcount is dropped */
-	if (!pos) {
-		filp->private_data = NULL;
-
-		/* EOF and not changed as 0 or 1 in read/write path */
-		if (off == filp->f_pos && off > 1)
-			filp->f_pos = INT_MAX;
-	}
+	file->private_data = NULL;
+	ctx->pos = INT_MAX;
 	return 0;
 }
 
@@ -1077,7 +1047,7 @@
 
 const struct file_operations sysfs_dir_operations = {
 	.read		= generic_read_dir,
-	.readdir	= sysfs_readdir,
+	.iterate	= sysfs_readdir,
 	.release	= sysfs_dir_release,
 	.llseek		= sysfs_dir_llseek,
 };

diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c
index 3799e8d..d42291d 100644
--- a/fs/sysv/dir.c
+++ b/fs/sysv/dir.c

@@ -18,12 +18,12 @@
 #include <linux/swap.h>
 #include "sysv.h"
 
-static int sysv_readdir(struct file *, void *, filldir_t);
+static int sysv_readdir(struct file *, struct dir_context *);
 
 const struct file_operations sysv_dir_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
-	.readdir	= sysv_readdir,
+	.iterate	= sysv_readdir,
 	.fsync		= generic_file_fsync,
 };
 
@@ -65,18 +65,21 @@
 	return page;
 }
 
-static int sysv_readdir(struct file * filp, void * dirent, filldir_t filldir)
+static int sysv_readdir(struct file *file, struct dir_context *ctx)
 {
-	unsigned long pos = filp->f_pos;
-	struct inode *inode = file_inode(filp);
+	unsigned long pos = ctx->pos;
+	struct inode *inode = file_inode(file);
 	struct super_block *sb = inode->i_sb;
-	unsigned offset = pos & ~PAGE_CACHE_MASK;
-	unsigned long n = pos >> PAGE_CACHE_SHIFT;
 	unsigned long npages = dir_pages(inode);
+	unsigned offset;
+	unsigned long n;
 
-	pos = (pos + SYSV_DIRSIZE-1) & ~(SYSV_DIRSIZE-1);
+	ctx->pos = pos = (pos + SYSV_DIRSIZE-1) & ~(SYSV_DIRSIZE-1);
 	if (pos >= inode->i_size)
-		goto done;
+		return 0;
+
+	offset = pos & ~PAGE_CACHE_MASK;
+	n = pos >> PAGE_CACHE_SHIFT;
 
 	for ( ; n < npages; n++, offset = 0) {
 		char *kaddr, *limit;
@@ -88,29 +91,21 @@
 		kaddr = (char *)page_address(page);
 		de = (struct sysv_dir_entry *)(kaddr+offset);
 		limit = kaddr + PAGE_CACHE_SIZE - SYSV_DIRSIZE;
-		for ( ;(char*)de <= limit; de++) {
+		for ( ;(char*)de <= limit; de++, ctx->pos += sizeof(*de)) {
 			char *name = de->name;
-			int over;
 
 			if (!de->inode)
 				continue;
 
-			offset = (char *)de - kaddr;
-
-			over = filldir(dirent, name, strnlen(name,SYSV_NAMELEN),
-					((loff_t)n<<PAGE_CACHE_SHIFT) | offset,
+			if (!dir_emit(ctx, name, strnlen(name,SYSV_NAMELEN),
 					fs16_to_cpu(SYSV_SB(sb), de->inode),
-					DT_UNKNOWN);
-			if (over) {
+					DT_UNKNOWN)) {
 				dir_put_page(page);
-				goto done;
+				return 0;
 			}
 		}
 		dir_put_page(page);
 	}
-
-done:
-	filp->f_pos = ((loff_t)n << PAGE_CACHE_SHIFT) | offset;
 	return 0;
 }
 

diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index de08c92f..6b4947f 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c

@@ -346,38 +346,46 @@
  * This means that UBIFS cannot support NFS which requires full
  * 'seekdir()'/'telldir()' support.
  */
-static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir)
+static int ubifs_readdir(struct file *file, struct dir_context *ctx)
 {
-	int err, over = 0;
+	int err;
 	struct qstr nm;
 	union ubifs_key key;
 	struct ubifs_dent_node *dent;
 	struct inode *dir = file_inode(file);
 	struct ubifs_info *c = dir->i_sb->s_fs_info;
 
-	dbg_gen("dir ino %lu, f_pos %#llx", dir->i_ino, file->f_pos);
+	dbg_gen("dir ino %lu, f_pos %#llx", dir->i_ino, ctx->pos);
 
-	if (file->f_pos > UBIFS_S_KEY_HASH_MASK || file->f_pos == 2)
+	if (ctx->pos > UBIFS_S_KEY_HASH_MASK || ctx->pos == 2)
 		/*
 		 * The directory was seek'ed to a senseless position or there
 		 * are no more entries.
 		 */
 		return 0;
 
-	/* File positions 0 and 1 correspond to "." and ".." */
-	if (file->f_pos == 0) {
-		ubifs_assert(!file->private_data);
-		over = filldir(dirent, ".", 1, 0, dir->i_ino, DT_DIR);
-		if (over)
-			return 0;
-		file->f_pos = 1;
+	if (file->f_version == 0) {
+		/*
+		 * The file was seek'ed, which means that @file->private_data
+		 * is now invalid. This may also be just the first
+		 * 'ubifs_readdir()' invocation, in which case
+		 * @file->private_data is NULL, and the below code is
+		 * basically a no-op.
+		 */
+		kfree(file->private_data);
+		file->private_data = NULL;
 	}
 
-	if (file->f_pos == 1) {
+	/*
+	 * 'generic_file_llseek()' unconditionally sets @file->f_version to
+	 * zero, and we use this for detecting whether the file was seek'ed.
+	 */
+	file->f_version = 1;
+
+	/* File positions 0 and 1 correspond to "." and ".." */
+	if (ctx->pos < 2) {
 		ubifs_assert(!file->private_data);
-		over = filldir(dirent, "..", 2, 1,
-			       parent_ino(file->f_path.dentry), DT_DIR);
-		if (over)
+		if (!dir_emit_dots(file, ctx))
 			return 0;
 
 		/* Find the first entry in TNC and save it */
@@ -389,7 +397,7 @@
 			goto out;
 		}
 
-		file->f_pos = key_hash_flash(c, &dent->key);
+		ctx->pos = key_hash_flash(c, &dent->key);
 		file->private_data = dent;
 	}
 
@@ -397,17 +405,16 @@
 	if (!dent) {
 		/*
 		 * The directory was seek'ed to and is now readdir'ed.
-		 * Find the entry corresponding to @file->f_pos or the
-		 * closest one.
+		 * Find the entry corresponding to @ctx->pos or the closest one.
 		 */
-		dent_key_init_hash(c, &key, dir->i_ino, file->f_pos);
+		dent_key_init_hash(c, &key, dir->i_ino, ctx->pos);
 		nm.name = NULL;
 		dent = ubifs_tnc_next_ent(c, &key, &nm);
 		if (IS_ERR(dent)) {
 			err = PTR_ERR(dent);
 			goto out;
 		}
-		file->f_pos = key_hash_flash(c, &dent->key);
+		ctx->pos = key_hash_flash(c, &dent->key);
 		file->private_data = dent;
 	}
 
@@ -419,10 +426,9 @@
 			     ubifs_inode(dir)->creat_sqnum);
 
 		nm.len = le16_to_cpu(dent->nlen);
-		over = filldir(dirent, dent->name, nm.len, file->f_pos,
+		if (!dir_emit(ctx, dent->name, nm.len,
 			       le64_to_cpu(dent->inum),
-			       vfs_dent_type(dent->type));
-		if (over)
+			       vfs_dent_type(dent->type)))
 			return 0;
 
 		/* Switch to the next entry */
@@ -435,7 +441,7 @@
 		}
 
 		kfree(file->private_data);
-		file->f_pos = key_hash_flash(c, &dent->key);
+		ctx->pos = key_hash_flash(c, &dent->key);
 		file->private_data = dent;
 		cond_resched();
 	}
@@ -448,18 +454,11 @@
 
 	kfree(file->private_data);
 	file->private_data = NULL;
-	file->f_pos = 2;
+	/* 2 is a special value indicating that there are no more direntries */
+	ctx->pos = 2;
 	return 0;
 }
 
-/* If a directory is seeked, we have to free saved readdir() state */
-static loff_t ubifs_dir_llseek(struct file *file, loff_t offset, int whence)
-{
-	kfree(file->private_data);
-	file->private_data = NULL;
-	return generic_file_llseek(file, offset, whence);
-}
-
 /* Free saved readdir() state when the directory is closed */
 static int ubifs_dir_release(struct inode *dir, struct file *file)
 {
@@ -1177,10 +1176,10 @@
 };
 
 const struct file_operations ubifs_dir_operations = {
-	.llseek         = ubifs_dir_llseek,
+	.llseek         = generic_file_llseek,
 	.release        = ubifs_dir_release,
 	.read           = generic_read_dir,
-	.readdir        = ubifs_readdir,
+	.iterate        = ubifs_readdir,
 	.fsync          = ubifs_fsync,
 	.unlocked_ioctl = ubifs_ioctl,
 #ifdef CONFIG_COMPAT

diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 1437453..123c79b 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c

@@ -1277,13 +1277,14 @@
 	return err;
 }
 
-static void ubifs_invalidatepage(struct page *page, unsigned long offset)
+static void ubifs_invalidatepage(struct page *page, unsigned int offset,
+				 unsigned int length)
 {
 	struct inode *inode = page->mapping->host;
 	struct ubifs_info *c = inode->i_sb->s_fs_info;
 
 	ubifs_assert(PagePrivate(page));
-	if (offset)
+	if (offset || length < PAGE_CACHE_SIZE)
 		/* Partial page remains dirty */
 		return;
 

diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index b3e93f5..a012c51 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c

@@ -35,14 +35,16 @@
 #include "udf_i.h"
 #include "udf_sb.h"
 
-static int do_udf_readdir(struct inode *dir, struct file *filp,
-			  filldir_t filldir, void *dirent)
+
+static int udf_readdir(struct file *file, struct dir_context *ctx)
 {
+	struct inode *dir = file_inode(file);
+	struct udf_inode_info *iinfo = UDF_I(dir);
 	struct udf_fileident_bh fibh = { .sbh = NULL, .ebh = NULL};
 	struct fileIdentDesc *fi = NULL;
 	struct fileIdentDesc cfi;
 	int block, iblock;
-	loff_t nf_pos = (filp->f_pos - 1) << 2;
+	loff_t nf_pos;
 	int flen;
 	unsigned char *fname = NULL;
 	unsigned char *nameptr;
@@ -54,10 +56,14 @@
 	uint32_t elen;
 	sector_t offset;
 	int i, num, ret = 0;
-	unsigned int dt_type;
 	struct extent_position epos = { NULL, 0, {0, 0} };
-	struct udf_inode_info *iinfo;
 
+	if (ctx->pos == 0) {
+		if (!dir_emit_dot(file, ctx))
+			return 0;
+		ctx->pos = 1;
+	}
+	nf_pos = (ctx->pos - 1) << 2;
 	if (nf_pos >= size)
 		goto out;
 
@@ -71,7 +77,6 @@
 		nf_pos = udf_ext0_offset(dir);
 
 	fibh.soffset = fibh.eoffset = nf_pos & (dir->i_sb->s_blocksize - 1);
-	iinfo = UDF_I(dir);
 	if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
 		if (inode_bmap(dir, nf_pos >> dir->i_sb->s_blocksize_bits,
 		    &epos, &eloc, &elen, &offset)
@@ -116,7 +121,9 @@
 	}
 
 	while (nf_pos < size) {
-		filp->f_pos = (nf_pos >> 2) + 1;
+		struct kernel_lb_addr tloc;
+
+		ctx->pos = (nf_pos >> 2) + 1;
 
 		fi = udf_fileident_read(dir, &nf_pos, &fibh, &cfi, &epos, &eloc,
 					&elen, &offset);
@@ -155,24 +162,22 @@
 		}
 
 		if (cfi.fileCharacteristics & FID_FILE_CHAR_PARENT) {
-			iblock = parent_ino(filp->f_path.dentry);
-			flen = 2;
-			memcpy(fname, "..", flen);
-			dt_type = DT_DIR;
-		} else {
-			struct kernel_lb_addr tloc = lelb_to_cpu(cfi.icb.extLocation);
-
-			iblock = udf_get_lb_pblock(dir->i_sb, &tloc, 0);
-			flen = udf_get_filename(dir->i_sb, nameptr, fname, lfi);
-			dt_type = DT_UNKNOWN;
+			if (!dir_emit_dotdot(file, ctx))
+				goto out;
+			continue;
 		}
 
-		if (flen && filldir(dirent, fname, flen, filp->f_pos,
-				    iblock, dt_type) < 0)
+		flen = udf_get_filename(dir->i_sb, nameptr, fname, lfi);
+		if (!flen)
+			continue;
+
+		tloc = lelb_to_cpu(cfi.icb.extLocation);
+		iblock = udf_get_lb_pblock(dir->i_sb, &tloc, 0);
+		if (!dir_emit(ctx, fname, flen, iblock, DT_UNKNOWN))
 			goto out;
 	} /* end while */
 
-	filp->f_pos = (nf_pos >> 2) + 1;
+	ctx->pos = (nf_pos >> 2) + 1;
 
 out:
 	if (fibh.sbh != fibh.ebh)
@@ -184,27 +189,11 @@
 	return ret;
 }
 
-static int udf_readdir(struct file *filp, void *dirent, filldir_t filldir)
-{
-	struct inode *dir = file_inode(filp);
-	int result;
-
-	if (filp->f_pos == 0) {
-		if (filldir(dirent, ".", 1, filp->f_pos, dir->i_ino, DT_DIR) < 0) {
-			return 0;
-		}
-		filp->f_pos++;
-	}
-
-	result = do_udf_readdir(dir, filp, filldir, dirent);
- 	return result;
-}
-
 /* readdir and lookup functions */
 const struct file_operations udf_dir_operations = {
 	.llseek			= generic_file_llseek,
 	.read			= generic_read_dir,
-	.readdir		= udf_readdir,
+	.iterate		= udf_readdir,
 	.unlocked_ioctl		= udf_ioctl,
 	.fsync			= generic_file_fsync,
 };

diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index 3a75ca0..0ecc2ce 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c

@@ -430,16 +430,16 @@
  * This is blatantly stolen from ext2fs
  */
 static int
-ufs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+ufs_readdir(struct file *file, struct dir_context *ctx)
 {
-	loff_t pos = filp->f_pos;
-	struct inode *inode = file_inode(filp);
+	loff_t pos = ctx->pos;
+	struct inode *inode = file_inode(file);
 	struct super_block *sb = inode->i_sb;
 	unsigned int offset = pos & ~PAGE_CACHE_MASK;
 	unsigned long n = pos >> PAGE_CACHE_SHIFT;
 	unsigned long npages = ufs_dir_pages(inode);
 	unsigned chunk_mask = ~(UFS_SB(sb)->s_uspi->s_dirblksize - 1);
-	int need_revalidate = filp->f_version != inode->i_version;
+	int need_revalidate = file->f_version != inode->i_version;
 	unsigned flags = UFS_SB(sb)->s_flags;
 
 	UFSD("BEGIN\n");
@@ -457,16 +457,16 @@
 			ufs_error(sb, __func__,
 				  "bad page in #%lu",
 				  inode->i_ino);
-			filp->f_pos += PAGE_CACHE_SIZE - offset;
+			ctx->pos += PAGE_CACHE_SIZE - offset;
 			return -EIO;
 		}
 		kaddr = page_address(page);
 		if (unlikely(need_revalidate)) {
 			if (offset) {
 				offset = ufs_validate_entry(sb, kaddr, offset, chunk_mask);
-				filp->f_pos = (n<<PAGE_CACHE_SHIFT) + offset;
+				ctx->pos = (n<<PAGE_CACHE_SHIFT) + offset;
 			}
-			filp->f_version = inode->i_version;
+			file->f_version = inode->i_version;
 			need_revalidate = 0;
 		}
 		de = (struct ufs_dir_entry *)(kaddr+offset);
@@ -479,11 +479,8 @@
 				return -EIO;
 			}
 			if (de->d_ino) {
-				int over;
 				unsigned char d_type = DT_UNKNOWN;
 
-				offset = (char *)de - kaddr;
-
 				UFSD("filldir(%s,%u)\n", de->d_name,
 				      fs32_to_cpu(sb, de->d_ino));
 				UFSD("namlen %u\n", ufs_get_de_namlen(sb, de));
@@ -491,16 +488,15 @@
 				if ((flags & UFS_DE_MASK) == UFS_DE_44BSD)
 					d_type = de->d_u.d_44.d_type;
 
-				over = filldir(dirent, de->d_name,
+				if (!dir_emit(ctx, de->d_name,
 					       ufs_get_de_namlen(sb, de),
-						(n<<PAGE_CACHE_SHIFT) | offset,
-					       fs32_to_cpu(sb, de->d_ino), d_type);
-				if (over) {
+					       fs32_to_cpu(sb, de->d_ino),
+					       d_type)) {
 					ufs_put_page(page);
 					return 0;
 				}
 			}
-			filp->f_pos += fs16_to_cpu(sb, de->d_reclen);
+			ctx->pos += fs16_to_cpu(sb, de->d_reclen);
 		}
 		ufs_put_page(page);
 	}
@@ -660,7 +656,7 @@
 
 const struct file_operations ufs_dir_operations = {
 	.read		= generic_read_dir,
-	.readdir	= ufs_readdir,
+	.iterate	= ufs_readdir,
 	.fsync		= generic_file_fsync,
 	.llseek		= generic_file_llseek,
 };

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 41a6950..596ec71 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c

@@ -843,10 +843,12 @@
 STATIC void
 xfs_vm_invalidatepage(
 	struct page		*page,
-	unsigned long		offset)
+	unsigned int		offset,
+	unsigned int		length)
 {
-	trace_xfs_invalidatepage(page->mapping->host, page, offset);
-	block_invalidatepage(page, offset);
+	trace_xfs_invalidatepage(page->mapping->host, page, offset,
+				 length);
+	block_invalidatepage(page, offset, length);
 }
 
 /*
@@ -910,7 +912,7 @@
 
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 out_invalidate:
-	xfs_vm_invalidatepage(page, 0);
+	xfs_vm_invalidatepage(page, 0, PAGE_CACHE_SIZE);
 	return;
 }
 
@@ -940,7 +942,7 @@
 	int			count = 0;
 	int			nonblocking = 0;
 
-	trace_xfs_writepage(inode, page, 0);
+	trace_xfs_writepage(inode, page, 0, 0);
 
 	ASSERT(page_has_buffers(page));
 
@@ -1171,7 +1173,7 @@
 {
 	int			delalloc, unwritten;
 
-	trace_xfs_releasepage(page->mapping->host, page, 0);
+	trace_xfs_releasepage(page->mapping->host, page, 0, 0);
 
 	xfs_count_page_state(page, &delalloc, &unwritten);
 

diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index b26a50f..8f023de 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c

@@ -368,10 +368,8 @@
 int
 xfs_readdir(
 	xfs_inode_t	*dp,
-	void		*dirent,
-	size_t		bufsize,
-	xfs_off_t	*offset,
-	filldir_t	filldir)
+	struct dir_context *ctx,
+	size_t		bufsize)
 {
 	int		rval;		/* return value */
 	int		v;		/* type-checking value */
@@ -385,14 +383,13 @@
 	XFS_STATS_INC(xs_dir_getdents);
 
 	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
-		rval = xfs_dir2_sf_getdents(dp, dirent, offset, filldir);
+		rval = xfs_dir2_sf_getdents(dp, ctx);
 	else if ((rval = xfs_dir2_isblock(NULL, dp, &v)))
 		;
 	else if (v)
-		rval = xfs_dir2_block_getdents(dp, dirent, offset, filldir);
+		rval = xfs_dir2_block_getdents(dp, ctx);
 	else
-		rval = xfs_dir2_leaf_getdents(dp, dirent, bufsize, offset,
-					      filldir);
+		rval = xfs_dir2_leaf_getdents(dp, ctx, bufsize);
 	return rval;
 }
 

diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index e59f5fc..09aea02 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c

@@ -569,9 +569,7 @@
 int						/* error */
 xfs_dir2_block_getdents(
 	xfs_inode_t		*dp,		/* incore inode */
-	void			*dirent,
-	xfs_off_t		*offset,
-	filldir_t		filldir)
+	struct dir_context	*ctx)
 {
 	xfs_dir2_data_hdr_t	*hdr;		/* block header */
 	struct xfs_buf		*bp;		/* buffer for block */
@@ -589,7 +587,7 @@
 	/*
 	 * If the block number in the offset is out of range, we're done.
 	 */
-	if (xfs_dir2_dataptr_to_db(mp, *offset) > mp->m_dirdatablk)
+	if (xfs_dir2_dataptr_to_db(mp, ctx->pos) > mp->m_dirdatablk)
 		return 0;
 
 	error = xfs_dir3_block_read(NULL, dp, &bp);
@@ -600,7 +598,7 @@
 	 * Extract the byte offset we start at from the seek pointer.
 	 * We'll skip entries before this.
 	 */
-	wantoff = xfs_dir2_dataptr_to_off(mp, *offset);
+	wantoff = xfs_dir2_dataptr_to_off(mp, ctx->pos);
 	hdr = bp->b_addr;
 	xfs_dir3_data_check(dp, bp);
 	/*
@@ -639,13 +637,12 @@
 		cook = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
 					    (char *)dep - (char *)hdr);
 
+		ctx->pos = cook & 0x7fffffff;
 		/*
 		 * If it didn't fit, set the final offset to here & return.
 		 */
-		if (filldir(dirent, (char *)dep->name, dep->namelen,
-			    cook & 0x7fffffff, be64_to_cpu(dep->inumber),
-			    DT_UNKNOWN)) {
-			*offset = cook & 0x7fffffff;
+		if (!dir_emit(ctx, (char *)dep->name, dep->namelen,
+			    be64_to_cpu(dep->inumber), DT_UNKNOWN)) {
 			xfs_trans_brelse(NULL, bp);
 			return 0;
 		}
@@ -655,7 +652,7 @@
 	 * Reached the end of the block.
 	 * Set the offset to a non-existent block 1 and return.
 	 */
-	*offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) &
+	ctx->pos = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) &
 			0x7fffffff;
 	xfs_trans_brelse(NULL, bp);
 	return 0;

diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index da71a18..e0cc124 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c

@@ -1300,10 +1300,8 @@
 int						/* error */
 xfs_dir2_leaf_getdents(
 	xfs_inode_t		*dp,		/* incore directory inode */
-	void			*dirent,
-	size_t			bufsize,
-	xfs_off_t		*offset,
-	filldir_t		filldir)
+	struct dir_context	*ctx,
+	size_t			bufsize)
 {
 	struct xfs_buf		*bp = NULL;	/* data block buffer */
 	xfs_dir2_data_hdr_t	*hdr;		/* data block header */
@@ -1322,7 +1320,7 @@
 	 * If the offset is at or past the largest allowed value,
 	 * give up right away.
 	 */
-	if (*offset >= XFS_DIR2_MAX_DATAPTR)
+	if (ctx->pos >= XFS_DIR2_MAX_DATAPTR)
 		return 0;
 
 	mp = dp->i_mount;
@@ -1343,7 +1341,7 @@
 	 * Inside the loop we keep the main offset value as a byte offset
 	 * in the directory file.
 	 */
-	curoff = xfs_dir2_dataptr_to_byte(mp, *offset);
+	curoff = xfs_dir2_dataptr_to_byte(mp, ctx->pos);
 
 	/*
 	 * Force this conversion through db so we truncate the offset
@@ -1444,8 +1442,8 @@
 		dep = (xfs_dir2_data_entry_t *)ptr;
 		length = xfs_dir2_data_entsize(dep->namelen);
 
-		if (filldir(dirent, (char *)dep->name, dep->namelen,
-			    xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff,
+		ctx->pos = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff;
+		if (!dir_emit(ctx, (char *)dep->name, dep->namelen,
 			    be64_to_cpu(dep->inumber), DT_UNKNOWN))
 			break;
 
@@ -1462,9 +1460,9 @@
 	 * All done.  Set output offset value to current offset.
 	 */
 	if (curoff > xfs_dir2_dataptr_to_byte(mp, XFS_DIR2_MAX_DATAPTR))
-		*offset = XFS_DIR2_MAX_DATAPTR & 0x7fffffff;
+		ctx->pos = XFS_DIR2_MAX_DATAPTR & 0x7fffffff;
 	else
-		*offset = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff;
+		ctx->pos = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff;
 	kmem_free(map_info);
 	if (bp)
 		xfs_trans_brelse(NULL, bp);

diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h
index 7cf573c..0511cda 100644
--- a/fs/xfs/xfs_dir2_priv.h
+++ b/fs/xfs/xfs_dir2_priv.h

@@ -33,8 +33,8 @@
 extern const struct xfs_buf_ops xfs_dir3_block_buf_ops;
 
 extern int xfs_dir2_block_addname(struct xfs_da_args *args);
-extern int xfs_dir2_block_getdents(struct xfs_inode *dp, void *dirent,
-		xfs_off_t *offset, filldir_t filldir);
+extern int xfs_dir2_block_getdents(struct xfs_inode *dp,
+		struct dir_context *ctx);
 extern int xfs_dir2_block_lookup(struct xfs_da_args *args);
 extern int xfs_dir2_block_removename(struct xfs_da_args *args);
 extern int xfs_dir2_block_replace(struct xfs_da_args *args);
@@ -91,8 +91,8 @@
 extern void xfs_dir3_leaf_compact_x1(struct xfs_dir3_icleaf_hdr *leafhdr,
 		struct xfs_dir2_leaf_entry *ents, int *indexp,
 		int *lowstalep, int *highstalep, int *lowlogp, int *highlogp);
-extern int xfs_dir2_leaf_getdents(struct xfs_inode *dp, void *dirent,
-		size_t bufsize, xfs_off_t *offset, filldir_t filldir);
+extern int xfs_dir2_leaf_getdents(struct xfs_inode *dp, struct dir_context *ctx,
+		size_t bufsize);
 extern int xfs_dir3_leaf_get_buf(struct xfs_da_args *args, xfs_dir2_db_t bno,
 		struct xfs_buf **bpp, __uint16_t magic);
 extern void xfs_dir3_leaf_log_ents(struct xfs_trans *tp, struct xfs_buf *bp,
@@ -153,8 +153,7 @@
 		int size, xfs_dir2_sf_hdr_t *sfhp);
 extern int xfs_dir2_sf_addname(struct xfs_da_args *args);
 extern int xfs_dir2_sf_create(struct xfs_da_args *args, xfs_ino_t pino);
-extern int xfs_dir2_sf_getdents(struct xfs_inode *dp, void *dirent,
-		xfs_off_t *offset, filldir_t filldir);
+extern int xfs_dir2_sf_getdents(struct xfs_inode *dp, struct dir_context *ctx);
 extern int xfs_dir2_sf_lookup(struct xfs_da_args *args);
 extern int xfs_dir2_sf_removename(struct xfs_da_args *args);
 extern int xfs_dir2_sf_replace(struct xfs_da_args *args);

diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c
index 6157424..97676a3 100644
--- a/fs/xfs/xfs_dir2_sf.c
+++ b/fs/xfs/xfs_dir2_sf.c

@@ -768,9 +768,7 @@
 int						/* error */
 xfs_dir2_sf_getdents(
 	xfs_inode_t		*dp,		/* incore directory inode */
-	void			*dirent,
-	xfs_off_t		*offset,
-	filldir_t		filldir)
+	struct dir_context	*ctx)
 {
 	int			i;		/* shortform entry number */
 	xfs_mount_t		*mp;		/* filesystem mount point */
@@ -802,7 +800,7 @@
 	/*
 	 * If the block number in the offset is out of range, we're done.
 	 */
-	if (xfs_dir2_dataptr_to_db(mp, *offset) > mp->m_dirdatablk)
+	if (xfs_dir2_dataptr_to_db(mp, ctx->pos) > mp->m_dirdatablk)
 		return 0;
 
 	/*
@@ -819,22 +817,20 @@
 	/*
 	 * Put . entry unless we're starting past it.
 	 */
-	if (*offset <= dot_offset) {
-		if (filldir(dirent, ".", 1, dot_offset & 0x7fffffff, dp->i_ino, DT_DIR)) {
-			*offset = dot_offset & 0x7fffffff;
+	if (ctx->pos <= dot_offset) {
+		ctx->pos = dot_offset & 0x7fffffff;
+		if (!dir_emit(ctx, ".", 1, dp->i_ino, DT_DIR))
 			return 0;
-		}
 	}
 
 	/*
 	 * Put .. entry unless we're starting past it.
 	 */
-	if (*offset <= dotdot_offset) {
+	if (ctx->pos <= dotdot_offset) {
 		ino = xfs_dir2_sf_get_parent_ino(sfp);
-		if (filldir(dirent, "..", 2, dotdot_offset & 0x7fffffff, ino, DT_DIR)) {
-			*offset = dotdot_offset & 0x7fffffff;
+		ctx->pos = dotdot_offset & 0x7fffffff;
+		if (!dir_emit(ctx, "..", 2, ino, DT_DIR))
 			return 0;
-		}
 	}
 
 	/*
@@ -845,21 +841,20 @@
 		off = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
 				xfs_dir2_sf_get_offset(sfep));
 
-		if (*offset > off) {
+		if (ctx->pos > off) {
 			sfep = xfs_dir2_sf_nextentry(sfp, sfep);
 			continue;
 		}
 
 		ino = xfs_dir2_sfe_get_ino(sfp, sfep);
-		if (filldir(dirent, (char *)sfep->name, sfep->namelen,
-			    off & 0x7fffffff, ino, DT_UNKNOWN)) {
-			*offset = off & 0x7fffffff;
+		ctx->pos = off & 0x7fffffff;
+		if (!dir_emit(ctx, (char *)sfep->name, sfep->namelen,
+			    ino, DT_UNKNOWN))
 			return 0;
-		}
 		sfep = xfs_dir2_sf_nextentry(sfp, sfep);
 	}
 
-	*offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) &
+	ctx->pos = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) &
 			0x7fffffff;
 	return 0;
 }

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index a5f2042..0ad2b95 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c

@@ -906,11 +906,10 @@
 
 STATIC int
 xfs_file_readdir(
-	struct file	*filp,
-	void		*dirent,
-	filldir_t	filldir)
+	struct file	*file,
+	struct dir_context *ctx)
 {
-	struct inode	*inode = file_inode(filp);
+	struct inode	*inode = file_inode(file);
 	xfs_inode_t	*ip = XFS_I(inode);
 	int		error;
 	size_t		bufsize;
@@ -929,8 +928,7 @@
 	 */
 	bufsize = (size_t)min_t(loff_t, 32768, ip->i_d.di_size);
 
-	error = xfs_readdir(ip, dirent, bufsize,
-				(xfs_off_t *)&filp->f_pos, filldir);
+	error = xfs_readdir(ip, ctx, bufsize);
 	if (error)
 		return -error;
 	return 0;
@@ -1432,7 +1430,7 @@
 const struct file_operations xfs_dir_file_operations = {
 	.open		= xfs_dir_open,
 	.read		= generic_read_dir,
-	.readdir	= xfs_file_readdir,
+	.iterate	= xfs_file_readdir,
 	.llseek		= generic_file_llseek,
 	.unlocked_ioctl	= xfs_file_ioctl,
 #ifdef CONFIG_COMPAT

diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index aa4db33..a04701d 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h

@@ -974,14 +974,16 @@
 DEFINE_RW_EVENT(xfs_file_splice_write);
 
 DECLARE_EVENT_CLASS(xfs_page_class,
-	TP_PROTO(struct inode *inode, struct page *page, unsigned long off),
-	TP_ARGS(inode, page, off),
+	TP_PROTO(struct inode *inode, struct page *page, unsigned long off,
+		 unsigned int len),
+	TP_ARGS(inode, page, off, len),
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
 		__field(xfs_ino_t, ino)
 		__field(pgoff_t, pgoff)
 		__field(loff_t, size)
 		__field(unsigned long, offset)
+		__field(unsigned int, length)
 		__field(int, delalloc)
 		__field(int, unwritten)
 	),
@@ -995,24 +997,27 @@
 		__entry->pgoff = page_offset(page);
 		__entry->size = i_size_read(inode);
 		__entry->offset = off;
+		__entry->length = len;
 		__entry->delalloc = delalloc;
 		__entry->unwritten = unwritten;
 	),
 	TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx "
-		  "delalloc %d unwritten %d",
+		  "length %x delalloc %d unwritten %d",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->ino,
 		  __entry->pgoff,
 		  __entry->size,
 		  __entry->offset,
+		  __entry->length,
 		  __entry->delalloc,
 		  __entry->unwritten)
 )
 
 #define DEFINE_PAGE_EVENT(name)		\
 DEFINE_EVENT(xfs_page_class, name,	\
-	TP_PROTO(struct inode *inode, struct page *page, unsigned long off),	\
-	TP_ARGS(inode, page, off))
+	TP_PROTO(struct inode *inode, struct page *page, unsigned long off, \
+		 unsigned int len),	\
+	TP_ARGS(inode, page, off, len))
 DEFINE_PAGE_EVENT(xfs_writepage);
 DEFINE_PAGE_EVENT(xfs_releasepage);
 DEFINE_PAGE_EVENT(xfs_invalidatepage);

diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index 5163022d..38c67c3 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h

@@ -31,8 +31,7 @@
 		struct xfs_inode *ip);
 int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
 		struct xfs_name *target_name);
-int xfs_readdir(struct xfs_inode	*dp, void *dirent, size_t bufsize,
-		       xfs_off_t *offset, filldir_t filldir);
+int xfs_readdir(struct xfs_inode *dp, struct dir_context *ctx, size_t bufsize);
 int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name,
 		const char *target_path, umode_t mode, struct xfs_inode **ipp);
 int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state);

diff --git a/include/acpi/acpi_drivers.h b/include/acpi/acpi_drivers.h
index e6168a2..b420939 100644
--- a/include/acpi/acpi_drivers.h
+++ b/include/acpi/acpi_drivers.h

@@ -123,7 +123,9 @@
 extern void unregister_dock_notifier(struct notifier_block *nb);
 extern int register_hotplug_dock_device(acpi_handle handle,
 					const struct acpi_dock_ops *ops,
-					void *context);
+					void *context,
+					void (*init)(void *),
+					void (*release)(void *));
 extern void unregister_hotplug_dock_device(acpi_handle handle);
 #else
 static inline int is_dock_device(acpi_handle handle)
@@ -139,7 +141,9 @@
 }
 static inline int register_hotplug_dock_device(acpi_handle handle,
 					       const struct acpi_dock_ops *ops,
-					       void *context)
+					       void *context,
+					       void (*init)(void *),
+					       void (*release)(void *))
 {
 	return -ENODEV;
 }

diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index a59ff51..b183698 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h

@@ -692,4 +692,8 @@
 
 #endif /* !__ASSEMBLY__ */
 
+#ifndef io_remap_pfn_range
+#define io_remap_pfn_range remap_pfn_range
+#endif
+
 #endif /* _ASM_GENERIC_PGTABLE_H */

diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 9e52b0626..f5a3b83 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h

@@ -198,7 +198,8 @@
  * Generic address_space_operations implementations for buffer_head-backed
  * address_spaces.
  */
-void block_invalidatepage(struct page *page, unsigned long offset);
+void block_invalidatepage(struct page *page, unsigned int offset,
+			  unsigned int length);
 int block_write_full_page(struct page *page, get_block_t *get_block,
 				struct writeback_control *wbc);
 int block_write_full_page_endio(struct page *page, get_block_t *get_block,

diff --git a/include/linux/console.h b/include/linux/console.h
index 73bab0f..7571a16 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h

@@ -75,10 +75,7 @@
 extern const struct consw prom_con;	/* SPARC PROM console */
 
 int con_is_bound(const struct consw *csw);
-int register_con_driver(const struct consw *csw, int first, int last);
-int unregister_con_driver(const struct consw *csw);
 int do_unregister_con_driver(const struct consw *csw);
-int take_over_console(const struct consw *sw, int first, int last, int deflt);
 int do_take_over_console(const struct consw *sw, int first, int last, int deflt);
 void give_up_console(const struct consw *sw);
 #ifdef CONFIG_HW_CONSOLE

diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h
index df6fab8..383d5e3 100644
--- a/include/linux/f2fs_fs.h
+++ b/include/linux/f2fs_fs.h

@@ -20,8 +20,8 @@
 #define F2FS_BLKSIZE			4096	/* support only 4KB block */
 #define F2FS_MAX_EXTENSION		64	/* # of extension entries */
 
-#define NULL_ADDR		0x0U
-#define NEW_ADDR		-1U
+#define NULL_ADDR		((block_t)0)	/* used as block_t addresses */
+#define NEW_ADDR		((block_t)-1)	/* used as block_t addresses */
 
 #define F2FS_ROOT_INO(sbi)	(sbi->root_ino_num)
 #define F2FS_NODE_INO(sbi)	(sbi->node_ino_num)

diff --git a/include/linux/fmc-sdb.h b/include/linux/fmc-sdb.h
new file mode 100644
index 0000000..1974317
--- /dev/null
+++ b/include/linux/fmc-sdb.h

@@ -0,0 +1,36 @@
+/*
+ * This file is separate from sdb.h, because I want that one to remain
+ * unchanged (as far as possible) from the official sdb distribution
+ *
+ * This file and associated functionality are a playground for me to
+ * understand stuff which will later be implemented in more generic places.
+ */
+#include <linux/sdb.h>
+
+/* This is the union of all currently defined types */
+union sdb_record {
+	struct sdb_interconnect ic;
+	struct sdb_device dev;
+	struct sdb_bridge bridge;
+	struct sdb_integration integr;
+	struct sdb_empty empty;
+};
+
+struct fmc_device;
+
+/* Every sdb table is turned into this structure */
+struct sdb_array {
+	int len;
+	int level;
+	unsigned long baseaddr;
+	struct fmc_device *fmc;		/* the device that hosts it */
+	struct sdb_array *parent;	/* NULL at root */
+	union sdb_record *record;	/* copies of the struct */
+	struct sdb_array **subtree;	/* only valid for bridge items */
+};
+
+extern int fmc_scan_sdb_tree(struct fmc_device *fmc, unsigned long address);
+extern void fmc_show_sdb_tree(const struct fmc_device *fmc);
+extern signed long fmc_find_sdb_device(struct sdb_array *tree, uint64_t vendor,
+				       uint32_t device, unsigned long *sz);
+extern int fmc_free_sdb_tree(struct fmc_device *fmc);

diff --git a/include/linux/fmc.h b/include/linux/fmc.h
new file mode 100644
index 0000000..a5f0aa5
--- /dev/null
+++ b/include/linux/fmc.h

@@ -0,0 +1,237 @@
+/*
+ * Copyright (C) 2012 CERN (www.cern.ch)
+ * Author: Alessandro Rubini <rubini@gnudd.com>
+ *
+ * Released according to the GNU GPL, version 2 or any later version.
+ *
+ * This work is part of the White Rabbit project, a research effort led
+ * by CERN, the European Institute for Nuclear Research.
+ */
+#ifndef __LINUX_FMC_H__
+#define __LINUX_FMC_H__
+#include <linux/types.h>
+#include <linux/moduleparam.h>
+#include <linux/device.h>
+#include <linux/list.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+
+struct fmc_device;
+struct fmc_driver;
+
+/*
+ * This bus abstraction is developed separately from drivers, so we need
+ * to check the version of the data structures we receive.
+ */
+
+#define FMC_MAJOR	3
+#define FMC_MINOR	0
+#define FMC_VERSION	((FMC_MAJOR << 16) | FMC_MINOR)
+#define __FMC_MAJOR(x)	((x) >> 16)
+#define __FMC_MINOR(x)	((x) & 0xffff)
+
+/*
+ * The device identification, as defined by the IPMI FRU (Field Replaceable
+ * Unit) includes four different strings to describe the device. Here we
+ * only match the "Board Manufacturer" and the "Board Product Name",
+ * ignoring the "Board Serial Number" and "Board Part Number". All 4 are
+ * expected to be strings, so they are treated as zero-terminated C strings.
+ * Unspecified string (NULL) means "any", so if both are unspecified this
+ * is a catch-all driver. So null entries are allowed and we use array
+ * and length. This is unlike pci and usb that use null-terminated arrays
+ */
+struct fmc_fru_id {
+	char *manufacturer;
+	char *product_name;
+};
+
+/*
+ * If the FPGA is already programmed (think Etherbone or the second
+ * SVEC slot), we can match on SDB devices in the memory image. This
+ * match uses an array of devices that must all be present, and the
+ * match is based on vendor and device only. Further checks are expected
+ * to happen in the probe function. Zero means "any" and catch-all is allowed.
+ */
+struct fmc_sdb_one_id {
+	uint64_t vendor;
+	uint32_t device;
+};
+struct fmc_sdb_id {
+	struct fmc_sdb_one_id *cores;
+	int cores_nr;
+};
+
+struct fmc_device_id {
+	struct fmc_fru_id *fru_id;
+	int fru_id_nr;
+	struct fmc_sdb_id *sdb_id;
+	int sdb_id_nr;
+};
+
+/* This sizes the module_param_array used by generic module parameters */
+#define FMC_MAX_CARDS 32
+
+/* The driver is a pretty simple thing */
+struct fmc_driver {
+	unsigned long version;
+	struct device_driver driver;
+	int (*probe)(struct fmc_device *);
+	int (*remove)(struct fmc_device *);
+	const struct fmc_device_id id_table;
+	/* What follows is for generic module parameters */
+	int busid_n;
+	int busid_val[FMC_MAX_CARDS];
+	int gw_n;
+	char *gw_val[FMC_MAX_CARDS];
+};
+#define to_fmc_driver(x) container_of((x), struct fmc_driver, driver)
+
+/* These are the generic parameters, that drivers may instantiate */
+#define FMC_PARAM_BUSID(_d) \
+    module_param_array_named(busid, _d.busid_val, int, &_d.busid_n, 0444)
+#define FMC_PARAM_GATEWARE(_d) \
+    module_param_array_named(gateware, _d.gw_val, charp, &_d.gw_n, 0444)
+
+/*
+ * Drivers may need to configure gpio pins in the carrier. To read input
+ * (a very uncommon operation, and definitely not in the hot paths), just
+ * configure one gpio only and get 0 or 1 as retval of the config method
+ */
+struct fmc_gpio {
+	char *carrier_name; /* name or NULL for virtual pins */
+	int gpio;
+	int _gpio;	/* internal use by the carrier */
+	int mode;	/* GPIOF_DIR_OUT etc, from <linux/gpio.h> */
+	int irqmode;	/* IRQF_TRIGGER_LOW and so on */
+};
+
+/* The numbering of gpio pins allows access to raw pins or virtual roles */
+#define FMC_GPIO_RAW(x)		(x)		/* 4096 of them */
+#define __FMC_GPIO_IS_RAW(x)	((x) < 0x1000)
+#define FMC_GPIO_IRQ(x)		((x) + 0x1000)	/*  256 of them */
+#define FMC_GPIO_LED(x)		((x) + 0x1100)	/*  256 of them */
+#define FMC_GPIO_KEY(x)		((x) + 0x1200)	/*  256 of them */
+#define FMC_GPIO_TP(x)		((x) + 0x1300)	/*  256 of them */
+#define FMC_GPIO_USER(x)	((x) + 0x1400)	/*  256 of them */
+/* We may add SCL and SDA, or other roles if the need arises */
+
+/* GPIOF_DIR_IN etc are missing before 3.0. copy from <linux/gpio.h> */
+#ifndef GPIOF_DIR_IN
+#  define GPIOF_DIR_OUT   (0 << 0)
+#  define GPIOF_DIR_IN    (1 << 0)
+#  define GPIOF_INIT_LOW  (0 << 1)
+#  define GPIOF_INIT_HIGH (1 << 1)
+#endif
+
+/*
+ * The operations are offered by each carrier and should make driver
+ * design completely independent of the carrier. Named GPIO pins may be
+ * the exception.
+ */
+struct fmc_operations {
+	uint32_t (*read32)(struct fmc_device *fmc, int offset);
+	void (*write32)(struct fmc_device *fmc, uint32_t value, int offset);
+	int (*validate)(struct fmc_device *fmc, struct fmc_driver *drv);
+	int (*reprogram)(struct fmc_device *f, struct fmc_driver *d, char *gw);
+	int (*irq_request)(struct fmc_device *fmc, irq_handler_t h,
+			   char *name, int flags);
+	void (*irq_ack)(struct fmc_device *fmc);
+	int (*irq_free)(struct fmc_device *fmc);
+	int (*gpio_config)(struct fmc_device *fmc, struct fmc_gpio *gpio,
+			   int ngpio);
+	int (*read_ee)(struct fmc_device *fmc, int pos, void *d, int l);
+	int (*write_ee)(struct fmc_device *fmc, int pos, const void *d, int l);
+};
+
+/* Prefer this helper rather than calling fmc->op->reprogram directly */
+extern int fmc_reprogram(struct fmc_device *f, struct fmc_driver *d, char *gw,
+		     int sdb_entry);
+
+/*
+ * The device reports all information needed to access hw.
+ *
+ * If we have eeprom_len and not contents, the core reads it.
+ * Then, parsing of identifiers is done by the core, which fills fmc_fru_id.
+ * Similarly a device that must be matched based on SDB cores must
+ * fill the entry point and the core will scan the bus (FIXME: sdb match)
+ */
+struct fmc_device {
+	unsigned long version;
+	unsigned long flags;
+	struct module *owner;		/* char device must pin it */
+	struct fmc_fru_id id;		/* for EEPROM-based match */
+	struct fmc_operations *op;	/* carrier-provided */
+	int irq;			/* according to host bus. 0 == none */
+	int eeprom_len;			/* Usually 8kB, may be less */
+	int eeprom_addr;		/* 0x50, 0x52 etc */
+	uint8_t *eeprom;		/* Full contents or leading part */
+	char *carrier_name;		/* "SPEC" or similar, for special use */
+	void *carrier_data;		/* "struct spec *" or equivalent */
+	__iomem void *fpga_base;	/* May be NULL (Etherbone) */
+	__iomem void *slot_base;	/* Set by the driver */
+	struct fmc_device **devarray;	/* Allocated by the bus */
+	int slot_id;			/* Index in the slot array */
+	int nr_slots;			/* Number of slots in this carrier */
+	unsigned long memlen;		/* Used for the char device */
+	struct device dev;		/* For Linux use */
+	struct device *hwdev;		/* The underlying hardware device */
+	unsigned long sdbfs_entry;
+	struct sdb_array *sdb;
+	uint32_t device_id;		/* Filled by the device */
+	char *mezzanine_name;		/* Defaults to ``fmc'' */
+	void *mezzanine_data;
+};
+#define to_fmc_device(x) container_of((x), struct fmc_device, dev)
+
+#define FMC_DEVICE_HAS_GOLDEN		1
+#define FMC_DEVICE_HAS_CUSTOM		2
+#define FMC_DEVICE_NO_MEZZANINE		4
+#define FMC_DEVICE_MATCH_SDB		8 /* fmc-core must scan sdb in fpga */
+
+/*
+ * If the carrier provides no read32/write32 methods, these helpers expand to
+ * a single, fast readl()/writel() on fpga_base plus the given offset.
+ */
+static inline uint32_t fmc_readl(struct fmc_device *fmc, int offset)
+{
+	if (unlikely(fmc->op->read32))
+		return fmc->op->read32(fmc, offset);
+	return readl(fmc->fpga_base + offset);
+}
+static inline void fmc_writel(struct fmc_device *fmc, uint32_t val, int off)
+{
+	if (unlikely(fmc->op->write32))
+		fmc->op->write32(fmc, val, off);
+	else
+		writel(val, fmc->fpga_base + off);
+}
+
+/* pci-like naming */
+static inline void *fmc_get_drvdata(const struct fmc_device *fmc)
+{
+	return dev_get_drvdata(&fmc->dev);
+}
+
+static inline void fmc_set_drvdata(struct fmc_device *fmc, void *data)
+{
+	dev_set_drvdata(&fmc->dev, data);
+}
+
+/* The 4 access points */
+extern int fmc_driver_register(struct fmc_driver *drv);
+extern void fmc_driver_unregister(struct fmc_driver *drv);
+extern int fmc_device_register(struct fmc_device *tdev);
+extern void fmc_device_unregister(struct fmc_device *tdev);
+
+/* Two more for device sets, all driven by the same FPGA */
+extern int fmc_device_register_n(struct fmc_device **devs, int n);
+extern void fmc_device_unregister_n(struct fmc_device **devs, int n);
+
+/* Internal cross-calls between files; not exported to other modules */
+extern int fmc_match(struct device *dev, struct device_driver *drv);
+extern int fmc_fill_id_info(struct fmc_device *fmc);
+extern void fmc_free_id_info(struct fmc_device *fmc);
+extern void fmc_dump_eeprom(const struct fmc_device *fmc);
+extern void fmc_dump_sdb(const struct fmc_device *fmc);
+
+#endif /* __LINUX_FMC_H__ */

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 65c2be2..f8a52405 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h

@@ -364,7 +364,7 @@
 
 	/* Unfortunately this kludge is needed for FIBMAP. Don't use it */
 	sector_t (*bmap)(struct address_space *, sector_t);
-	void (*invalidatepage) (struct page *, unsigned long);
+	void (*invalidatepage) (struct page *, unsigned int, unsigned int);
 	int (*releasepage) (struct page *, gfp_t);
 	void (*freepage)(struct page *);
 	ssize_t (*direct_IO)(int, struct kiocb *, const struct iovec *iov,
@@ -1506,6 +1506,11 @@
  * to have different dirent layouts depending on the binary type.
  */
 typedef int (*filldir_t)(void *, const char *, int, loff_t, u64, unsigned);
+struct dir_context {
+	const filldir_t actor;
+	loff_t pos;
+};
+
 struct block_device_operations;
 
 /* These macros are for out of kernel modules to test that
@@ -1521,7 +1526,7 @@
 	ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
 	ssize_t (*aio_read) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
 	ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
-	int (*readdir) (struct file *, void *, filldir_t);
+	int (*iterate) (struct file *, struct dir_context *);
 	unsigned int (*poll) (struct file *, struct poll_table_struct *);
 	long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
 	long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
@@ -2494,6 +2499,7 @@
 void inode_set_bytes(struct inode *inode, loff_t bytes);
 
 extern int vfs_readdir(struct file *, filldir_t, void *);
+extern int iterate_dir(struct file *, struct dir_context *);
 
 extern int vfs_stat(const char __user *, struct kstat *);
 extern int vfs_lstat(const char __user *, struct kstat *);
@@ -2524,7 +2530,7 @@
 extern int dcache_dir_open(struct inode *, struct file *);
 extern int dcache_dir_close(struct inode *, struct file *);
 extern loff_t dcache_dir_lseek(struct file *, loff_t, int);
-extern int dcache_readdir(struct file *, void *, filldir_t);
+extern int dcache_readdir(struct file *, struct dir_context *);
 extern int simple_setattr(struct dentry *, struct iattr *);
 extern int simple_getattr(struct vfsmount *, struct dentry *, struct kstat *);
 extern int simple_statfs(struct dentry *, struct kstatfs *);
@@ -2688,4 +2694,41 @@
 		inode->i_flags |= S_NOSEC;
 }
 
+static inline bool dir_emit(struct dir_context *ctx,
+			    const char *name, int namelen,
+			    u64 ino, unsigned type)
+{
+	return ctx->actor(ctx, name, namelen, ctx->pos, ino, type) == 0;
+}
+static inline bool dir_emit_dot(struct file *file, struct dir_context *ctx)
+{
+	return ctx->actor(ctx, ".", 1, ctx->pos,
+			  file->f_path.dentry->d_inode->i_ino, DT_DIR) == 0;
+}
+static inline bool dir_emit_dotdot(struct file *file, struct dir_context *ctx)
+{
+	return ctx->actor(ctx, "..", 2, ctx->pos,
+			  parent_ino(file->f_path.dentry), DT_DIR) == 0;
+}
+static inline bool dir_emit_dots(struct file *file, struct dir_context *ctx)
+{
+	if (ctx->pos == 0) {
+		if (!dir_emit_dot(file, ctx))
+			return false;
+		ctx->pos = 1;
+	}
+	if (ctx->pos == 1) {
+		if (!dir_emit_dotdot(file, ctx))
+			return false;
+		ctx->pos = 2;
+	}
+	return true;
+}
+static inline bool dir_relax(struct inode *inode)
+{
+	mutex_unlock(&inode->i_mutex);
+	mutex_lock(&inode->i_mutex);
+	return !IS_DEADDIR(inode);
+}
+
 #endif /* _LINUX_FS_H */

diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h
index 5dfa0aa..a9ff9a3 100644
--- a/include/linux/fscache-cache.h
+++ b/include/linux/fscache-cache.h

@@ -97,7 +97,8 @@
 #define FSCACHE_OP_WAITING	4	/* cleared when op is woken */
 #define FSCACHE_OP_EXCLUSIVE	5	/* exclusive op, other ops must wait */
 #define FSCACHE_OP_DEC_READ_CNT	6	/* decrement object->n_reads on destruction */
-#define FSCACHE_OP_KEEP_FLAGS	0x0070	/* flags to keep when repurposing an op */
+#define FSCACHE_OP_UNUSE_COOKIE	7	/* call fscache_unuse_cookie() on completion */
+#define FSCACHE_OP_KEEP_FLAGS	0x00f0	/* flags to keep when repurposing an op */
 
 	enum fscache_operation_state state;
 	atomic_t		usage;
@@ -150,7 +151,7 @@
 	void			*context;	/* netfs read context (pinned) */
 	struct list_head	to_do;		/* list of things to be done by the backend */
 	unsigned long		start_time;	/* time at which retrieval started */
-	unsigned		n_pages;	/* number of pages to be retrieved */
+	atomic_t		n_pages;	/* number of pages to be retrieved */
 };
 
 typedef int (*fscache_page_retrieval_func_t)(struct fscache_retrieval *op,
@@ -194,15 +195,14 @@
 static inline void fscache_retrieval_complete(struct fscache_retrieval *op,
 					      int n_pages)
 {
-	op->n_pages -= n_pages;
-	if (op->n_pages <= 0)
+	/* Test the value returned by the subtraction itself; a separate read races. */
+	if (atomic_sub_return(n_pages, &op->n_pages) <= 0)
 		fscache_op_complete(&op->op, true);
 }
 
 /**
  * fscache_put_retrieval - Drop a reference to a retrieval operation
  * @op: The retrieval operation affected
- * @n_pages: The number of pages to account for
  *
  * Drop a reference to a retrieval operation.
  */
@@ -314,6 +314,7 @@
 struct fscache_cookie {
 	atomic_t			usage;		/* number of users of this cookie */
 	atomic_t			n_children;	/* number of children of this cookie */
+	atomic_t			n_active;	/* number of active users of netfs ptrs */
 	spinlock_t			lock;
 	spinlock_t			stores_lock;	/* lock on page store tree */
 	struct hlist_head		backing_objects; /* object(s) backing this file/index */
@@ -326,13 +327,11 @@
 
 	unsigned long			flags;
 #define FSCACHE_COOKIE_LOOKING_UP	0	/* T if non-index cookie being looked up still */
-#define FSCACHE_COOKIE_CREATING		1	/* T if non-index object being created still */
-#define FSCACHE_COOKIE_NO_DATA_YET	2	/* T if new object with no cached data yet */
-#define FSCACHE_COOKIE_PENDING_FILL	3	/* T if pending initial fill on object */
-#define FSCACHE_COOKIE_FILLING		4	/* T if filling object incrementally */
-#define FSCACHE_COOKIE_UNAVAILABLE	5	/* T if cookie is unavailable (error, etc) */
-#define FSCACHE_COOKIE_WAITING_ON_READS	6	/* T if cookie is waiting on reads */
-#define FSCACHE_COOKIE_INVALIDATING	7	/* T if cookie is being invalidated */
+#define FSCACHE_COOKIE_NO_DATA_YET	1	/* T if new object with no cached data yet */
+#define FSCACHE_COOKIE_UNAVAILABLE	2	/* T if cookie is unavailable (error, etc) */
+#define FSCACHE_COOKIE_INVALIDATING	3	/* T if cookie is being invalidated */
+#define FSCACHE_COOKIE_RELINQUISHED	4	/* T if cookie has been relinquished */
+#define FSCACHE_COOKIE_RETIRED		5	/* T if cookie was retired */
 };
 
 extern struct fscache_cookie fscache_fsdef_index;
@@ -341,45 +340,40 @@
  * Event list for fscache_object::{event_mask,events}
  */
 enum {
-	FSCACHE_OBJECT_EV_REQUEUE,	/* T if object should be requeued */
+	FSCACHE_OBJECT_EV_NEW_CHILD,	/* T if object has a new child */
+	FSCACHE_OBJECT_EV_PARENT_READY,	/* T if object's parent is ready */
 	FSCACHE_OBJECT_EV_UPDATE,	/* T if object should be updated */
 	FSCACHE_OBJECT_EV_INVALIDATE,	/* T if cache requested object invalidation */
 	FSCACHE_OBJECT_EV_CLEARED,	/* T if accessors all gone */
 	FSCACHE_OBJECT_EV_ERROR,	/* T if fatal error occurred during processing */
-	FSCACHE_OBJECT_EV_RELEASE,	/* T if netfs requested object release */
-	FSCACHE_OBJECT_EV_RETIRE,	/* T if netfs requested object retirement */
-	FSCACHE_OBJECT_EV_WITHDRAW,	/* T if cache requested object withdrawal */
+	FSCACHE_OBJECT_EV_KILL,		/* T if netfs relinquished or cache withdrew object */
 	NR_FSCACHE_OBJECT_EVENTS
 };
 
 #define FSCACHE_OBJECT_EVENTS_MASK ((1UL << NR_FSCACHE_OBJECT_EVENTS) - 1)
 
 /*
+ * States for object state machine.
+ */
+struct fscache_transition {
+	unsigned long events;
+	const struct fscache_state *transit_to;
+};
+
+struct fscache_state {
+	char name[24];
+	char short_name[8];
+	const struct fscache_state *(*work)(struct fscache_object *object,
+					    int event);
+	const struct fscache_transition transitions[];
+};
+
+/*
  * on-disk cache file or index handle
  */
 struct fscache_object {
-	enum fscache_object_state {
-		FSCACHE_OBJECT_INIT,		/* object in initial unbound state */
-		FSCACHE_OBJECT_LOOKING_UP,	/* looking up object */
-		FSCACHE_OBJECT_CREATING,	/* creating object */
-
-		/* active states */
-		FSCACHE_OBJECT_AVAILABLE,	/* cleaning up object after creation */
-		FSCACHE_OBJECT_ACTIVE,		/* object is usable */
-		FSCACHE_OBJECT_INVALIDATING,	/* object is invalidating */
-		FSCACHE_OBJECT_UPDATING,	/* object is updating */
-
-		/* terminal states */
-		FSCACHE_OBJECT_DYING,		/* object waiting for accessors to finish */
-		FSCACHE_OBJECT_LC_DYING,	/* object cleaning up after lookup/create */
-		FSCACHE_OBJECT_ABORT_INIT,	/* abort the init state */
-		FSCACHE_OBJECT_RELEASING,	/* releasing object */
-		FSCACHE_OBJECT_RECYCLING,	/* retiring object */
-		FSCACHE_OBJECT_WITHDRAWING,	/* withdrawing object */
-		FSCACHE_OBJECT_DEAD,		/* object is now dead */
-		FSCACHE_OBJECT__NSTATES
-	} state;
-
+	const struct fscache_state *state;	/* Object state machine state */
+	const struct fscache_transition *oob_table; /* OOB state transition table */
 	int			debug_id;	/* debugging ID */
 	int			n_children;	/* number of child objects */
 	int			n_ops;		/* number of extant ops on object */
@@ -390,6 +384,7 @@
 	spinlock_t		lock;		/* state and operations lock */
 
 	unsigned long		lookup_jif;	/* time at which lookup started */
+	unsigned long		oob_event_mask;	/* OOB events this object is interested in */
 	unsigned long		event_mask;	/* events this object is interested in */
 	unsigned long		events;		/* events to be processed by this object
 						 * (order is important - using fls) */
@@ -398,6 +393,9 @@
 #define FSCACHE_OBJECT_LOCK		0	/* T if object is busy being processed */
 #define FSCACHE_OBJECT_PENDING_WRITE	1	/* T if object has pending write */
 #define FSCACHE_OBJECT_WAITING		2	/* T if object is waiting on its parent */
+#define FSCACHE_OBJECT_IS_LIVE		3	/* T if object is not withdrawn or relinquished */
+#define FSCACHE_OBJECT_IS_LOOKED_UP	4	/* T if object has been looked up */
+#define FSCACHE_OBJECT_IS_AVAILABLE	5	/* T if object has become active */
 
 	struct list_head	cache_link;	/* link in cache->object_list */
 	struct hlist_node	cookie_link;	/* link in cookie->backing_objects */
@@ -415,62 +413,40 @@
 	loff_t			store_limit_l;	/* current storage limit */
 };
 
-extern const char *fscache_object_states[];
-
-#define fscache_object_is_active(obj)			      \
-	(!test_bit(FSCACHE_IOERROR, &(obj)->cache->flags) &&  \
-	 (obj)->state >= FSCACHE_OBJECT_AVAILABLE &&	      \
-	 (obj)->state < FSCACHE_OBJECT_DYING)
-
-#define fscache_object_is_dead(obj)				\
-	(test_bit(FSCACHE_IOERROR, &(obj)->cache->flags) &&	\
-	 (obj)->state >= FSCACHE_OBJECT_DYING)
-
-extern void fscache_object_work_func(struct work_struct *work);
-
-/**
- * fscache_object_init - Initialise a cache object description
- * @object: Object description
- *
- * Initialise a cache object description to its basic values.
- *
- * See Documentation/filesystems/caching/backend-api.txt for a complete
- * description.
- */
-static inline
-void fscache_object_init(struct fscache_object *object,
-			 struct fscache_cookie *cookie,
-			 struct fscache_cache *cache)
-{
-	atomic_inc(&cache->object_count);
-
-	object->state = FSCACHE_OBJECT_INIT;
-	spin_lock_init(&object->lock);
-	INIT_LIST_HEAD(&object->cache_link);
-	INIT_HLIST_NODE(&object->cookie_link);
-	INIT_WORK(&object->work, fscache_object_work_func);
-	INIT_LIST_HEAD(&object->dependents);
-	INIT_LIST_HEAD(&object->dep_link);
-	INIT_LIST_HEAD(&object->pending_ops);
-	object->n_children = 0;
-	object->n_ops = object->n_in_progress = object->n_exclusive = 0;
-	object->events = object->event_mask = 0;
-	object->flags = 0;
-	object->store_limit = 0;
-	object->store_limit_l = 0;
-	object->cache = cache;
-	object->cookie = cookie;
-	object->parent = NULL;
-}
+extern void fscache_object_init(struct fscache_object *, struct fscache_cookie *,
+				struct fscache_cache *);
+extern void fscache_object_destroy(struct fscache_object *);
 
 extern void fscache_object_lookup_negative(struct fscache_object *object);
 extern void fscache_obtained_object(struct fscache_object *object);
 
-#ifdef CONFIG_FSCACHE_OBJECT_LIST
-extern void fscache_object_destroy(struct fscache_object *object);
-#else
-#define fscache_object_destroy(object) do {} while(0)
-#endif
+static inline bool fscache_object_is_live(struct fscache_object *object)
+{
+	return test_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags);
+}
+
+static inline bool fscache_object_is_dying(struct fscache_object *object)
+{
+	return !fscache_object_is_live(object);
+}
+
+static inline bool fscache_object_is_available(struct fscache_object *object)
+{
+	return test_bit(FSCACHE_OBJECT_IS_AVAILABLE, &object->flags);
+}
+
+static inline bool fscache_object_is_active(struct fscache_object *object)
+{
+	return fscache_object_is_available(object) &&
+		fscache_object_is_live(object) &&
+		!test_bit(FSCACHE_IOERROR, &object->cache->flags);
+}
+
+static inline bool fscache_object_is_dead(struct fscache_object *object)
+{
+	return fscache_object_is_dying(object) &&
+		test_bit(FSCACHE_IOERROR, &object->cache->flags);
+}
 
 /**
  * fscache_object_destroyed - Note destruction of an object in a cache
@@ -531,6 +507,33 @@
 	op->end_io_func(page, op->context, error);
 }
 
+/**
+ * fscache_use_cookie - Request usage of cookie attached to an object
+ * @object: Object description
+ *
+ * Request usage of the cookie attached to an object.  false is returned if
+ * relinquishment had already reduced the cookie's n_active count to 0.
+ */
+static inline bool fscache_use_cookie(struct fscache_object *object)
+{
+	struct fscache_cookie *cookie = object->cookie;
+	return atomic_inc_not_zero(&cookie->n_active) != 0;
+}
+
+/**
+ * fscache_unuse_cookie - Cease usage of cookie attached to an object
+ * @object: Object description
+ *
+ * Drop an active-use count (n_active) on the object's cookie.  When it hits
+ * zero, waiters on n_active are woken so that relinquishment can proceed.
+ */
+static inline void fscache_unuse_cookie(struct fscache_object *object)
+{
+	struct fscache_cookie *cookie = object->cookie;
+	if (atomic_dec_and_test(&cookie->n_active))
+		wake_up_atomic_t(&cookie->n_active);
+}
+
 /*
  * out-of-line cache backend functions
  */

diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index c255984..fae8bac 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h

@@ -909,6 +909,7 @@
 	CHANNEL_OFFER_STATE,
 	CHANNEL_OPENING_STATE,
 	CHANNEL_OPEN_STATE,
+	CHANNEL_OPENED_STATE,
 };
 
 struct vmbus_channel_debug_info {
@@ -1046,6 +1047,38 @@
 	 * preserve the earlier behavior.
 	 */
 	u32 target_vp;
+	/*
+	 * Support for sub-channels. For high performance devices,
+	 * it will be useful to have multiple sub-channels to support
+	 * a scalable communication infrastructure with the host.
+	 * The support for sub-channels is implemented as an extension
+	 * to the current infrastructure.
+	 * The initial offer is considered the primary channel and this
+	 * offer message will indicate if the host supports sub-channels.
+	 * The guest is free to ask for sub-channels to be offered and can
+	 * open these sub-channels as a normal "primary" channel. However,
+	 * all sub-channels will have the same type and instance guids as the
+	 * primary channel. Requests sent on a given channel will result in a
+	 * response on the same channel.
+	 */
+
+	/*
+	 * Sub-channel creation callback. This callback will be called in
+	 * process context when a sub-channel offer is received from the host.
+	 * The guest can open the sub-channel in the context of this callback.
+	 */
+	void (*sc_creation_callback)(struct vmbus_channel *new_sc);
+
+	spinlock_t sc_lock;
+	/*
+	 * All Sub-channels of a primary channel are linked here.
+	 */
+	struct list_head sc_list;
+	/*
+	 * The primary channel this sub-channel belongs to.
+	 * This will be NULL for the primary channel.
+	 */
+	struct vmbus_channel *primary_channel;
 };
 
 static inline void set_channel_read_state(struct vmbus_channel *c, bool state)
@@ -1057,6 +1090,34 @@
 
 int vmbus_request_offers(void);
 
+/*
+ * APIs for managing sub-channels.
+ */
+
+void vmbus_set_sc_create_callback(struct vmbus_channel *primary_channel,
+			void (*sc_cr_cb)(struct vmbus_channel *new_sc));
+
+/*
+ * Retrieve the (sub) channel on which to send an outgoing request.
+ * When a primary channel has multiple sub-channels, we choose a
+ * channel whose VCPU binding is closest to the VCPU on which
+ * this call is being made.
+ */
+struct vmbus_channel *vmbus_get_outgoing_channel(struct vmbus_channel *primary);
+
+/*
+ * Check if sub-channels have already been offered. This API will be useful
+ * when the driver is unloaded after establishing sub-channels. In this case,
+ * when the driver is re-loaded, the driver would have to check if the
+ * subchannels have already been established before attempting to request
+ * the creation of sub-channels.
+ * This function returns TRUE to indicate that subchannels have already been
+ * created.
+ * This function should be invoked after setting the callback function for
+ * sub-channel creation.
+ */
+bool vmbus_are_subchannels_present(struct vmbus_channel *primary);
+
 /* The format must be the same as struct vmdata_gpa_direct */
 struct vmbus_channel_packet_page_buffer {
 	u16 type;
@@ -1327,6 +1388,15 @@
 			0x8e, 0x77, 0x05, 0x58, 0xeb, 0x10, 0x73, 0xf8 \
 		}
 
+/*
+ * Synthetic FC GUID
+ * {2f9bcc4a-0069-4af3-b76b-6fd0be528cda}
+ */
+#define HV_SYNTHFC_GUID \
+	.guid = { \
+			0x4A, 0xCC, 0x9B, 0x2F, 0x69, 0x00, 0xF3, 0x4A, \
+			0xB7, 0x6B, 0x6F, 0xD0, 0xBE, 0x52, 0x8C, 0xDA \
+		}
 
 /*
  * Common header for Hyper-V ICs

diff --git a/include/linux/if_ether.h b/include/linux/if_ether.h
index 12b4d55..d556973 100644
--- a/include/linux/if_ether.h
+++ b/include/linux/if_ether.h

@@ -30,7 +30,6 @@
 
 int eth_header_parse(const struct sk_buff *skb, unsigned char *haddr);
 
-int mac_pton(const char *s, u8 *mac);
 extern ssize_t sysfs_format_mac(char *buf, const unsigned char *addr, int len);
 
 #endif	/* _LINUX_IF_ETHER_H */

diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
index 52bd03b..637fa71d 100644
--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h

@@ -44,7 +44,7 @@
  *	struct vlan_ethhdr - vlan ethernet header (ethhdr + vlan_hdr)
  *	@h_dest: destination ethernet address
  *	@h_source: source ethernet address
- *	@h_vlan_proto: ethernet protocol (always 0x8100)
+ *	@h_vlan_proto: ethernet protocol
  *	@h_vlan_TCI: priority and VLAN ID
  *	@h_vlan_encapsulated_proto: packet type ID or len
  */

diff --git a/include/linux/iio/common/st_sensors.h b/include/linux/iio/common/st_sensors.h
index 172c5b2..72b2694 100644
--- a/include/linux/iio/common/st_sensors.h
+++ b/include/linux/iio/common/st_sensors.h

@@ -24,14 +24,10 @@
 #define ST_SENSORS_FULLSCALE_AVL_MAX		10
 
 #define ST_SENSORS_NUMBER_ALL_CHANNELS		4
-#define ST_SENSORS_NUMBER_DATA_CHANNELS		3
 #define ST_SENSORS_ENABLE_ALL_AXIS		0x07
-#define ST_SENSORS_BYTE_FOR_CHANNEL		2
 #define ST_SENSORS_SCAN_X			0
 #define ST_SENSORS_SCAN_Y			1
 #define ST_SENSORS_SCAN_Z			2
-#define ST_SENSORS_DEFAULT_12_REALBITS		12
-#define ST_SENSORS_DEFAULT_16_REALBITS		16
 #define ST_SENSORS_DEFAULT_POWER_ON_VALUE	0x01
 #define ST_SENSORS_DEFAULT_POWER_OFF_VALUE	0x00
 #define ST_SENSORS_DEFAULT_WAI_ADDRESS		0x0f
@@ -42,20 +38,20 @@
 #define ST_SENSORS_MAX_NAME			17
 #define ST_SENSORS_MAX_4WAI			7
 
-#define ST_SENSORS_LSM_CHANNELS(device_type, index, mod, endian, bits, addr) \
+#define ST_SENSORS_LSM_CHANNELS(device_type, mask, index, mod, \
+					ch2, s, endian, rbits, sbits, addr) \
 { \
 	.type = device_type, \
-	.modified = 1, \
-	.info_mask_separate = BIT(IIO_CHAN_INFO_RAW) | \
-			BIT(IIO_CHAN_INFO_SCALE), \
+	.modified = mod, \
+	.info_mask_separate = mask, \
 	.scan_index = index, \
-	.channel2 = mod, \
+	.channel2 = ch2, \
 	.address = addr, \
 	.scan_type = { \
-		.sign = 's', \
-		.realbits = bits, \
-		.shift = 16 - bits, \
-		.storagebits = 16, \
+		.sign = s, \
+		.realbits = rbits, \
+		.shift = sbits - rbits, \
+		.storagebits = sbits, \
 		.endianness = endian, \
 	}, \
 }
@@ -204,6 +200,7 @@
  * @multiread_bit: Use or not particular bit for [I2C/SPI] multiread.
  * @buffer_data: Data used by buffer part.
  * @odr: Output data rate of the sensor [Hz].
+ * @num_data_channels: Number of data channels used in buffer.
  * @get_irq_data_ready: Function to get the IRQ used for data ready signal.
  * @tf: Transfer function structure used by I/O operations.
  * @tb: Transfer buffers and mutex used by I/O operations.
@@ -220,6 +217,7 @@
 	char *buffer_data;
 
 	unsigned int odr;
+	unsigned int num_data_channels;
 
 	unsigned int (*get_irq_data_ready) (struct iio_dev *indio_dev);
 

diff --git a/include/linux/iio/frequency/adf4350.h b/include/linux/iio/frequency/adf4350.h
index be91f34..ffd8c8f 100644
--- a/include/linux/iio/frequency/adf4350.h
+++ b/include/linux/iio/frequency/adf4350.h

@@ -1,7 +1,7 @@
 /*
  * ADF4350/ADF4351 SPI PLL driver
  *
- * Copyright 2012 Analog Devices Inc.
+ * Copyright 2012-2013 Analog Devices Inc.
  *
  * Licensed under the GPL-2.
  */
@@ -41,7 +41,7 @@
 #define ADF4350_REG2_RDIV2_EN			(1 << 24)
 #define ADF4350_REG2_RMULT2_EN			(1 << 25)
 #define ADF4350_REG2_MUXOUT(x)			((x) << 26)
-#define ADF4350_REG2_NOISE_MODE(x)		((x) << 29)
+#define ADF4350_REG2_NOISE_MODE(x)		(((unsigned)(x)) << 29)
 #define ADF4350_MUXOUT_THREESTATE		0
 #define ADF4350_MUXOUT_DVDD			1
 #define ADF4350_MUXOUT_GND			2

diff --git a/include/linux/ipmi-fru.h b/include/linux/ipmi-fru.h
new file mode 100644
index 0000000..4d3a763
--- /dev/null
+++ b/include/linux/ipmi-fru.h

@@ -0,0 +1,135 @@
+/*
+ * Copyright (C) 2012 CERN (www.cern.ch)
+ * Author: Alessandro Rubini <rubini@gnudd.com>
+ *
+ * Released according to the GNU GPL, version 2 or any later version.
+ *
+ * This work is part of the White Rabbit project, a research effort led
+ * by CERN, the European Institute for Nuclear Research.
+ */
+#ifndef __LINUX_IPMI_FRU_H__
+#define __LINUX_IPMI_FRU_H__
+#ifdef __KERNEL__
+#  include <linux/types.h>
+#  include <linux/string.h>
+#else
+#  include <stdint.h>
+#  include <string.h>
+#endif
+
+/*
+ * These structures match the unaligned crap we have in FRU1011.pdf
+ * (http://download.intel.com/design/servers/ipmi/FRU1011.pdf)
+ */
+
+/* chapter 8, page 5 */
+struct fru_common_header {
+	uint8_t format;			/* 0x01 */
+	uint8_t internal_use_off;	/* multiple of 8 bytes */
+	uint8_t chassis_info_off;	/* multiple of 8 bytes */
+	uint8_t board_area_off;		/* multiple of 8 bytes */
+	uint8_t product_area_off;	/* multiple of 8 bytes */
+	uint8_t multirecord_off;	/* multiple of 8 bytes */
+	uint8_t pad;			/* must be 0 */
+	uint8_t checksum;		/* sum modulo 256 must be 0 */
+};
+
+/* chapter 9, page 5 -- internal_use: not used by us */
+
+/* chapter 10, page 6 -- chassis info: not used by us */
+
+/* chapter 13, page 9 -- used by board_info_area below */
+struct fru_type_length {
+	uint8_t type_length;
+	uint8_t data[0];
+};
+
+/* chapter 11, page 7 */
+struct fru_board_info_area {
+	uint8_t format;			/* 0x01 */
+	uint8_t area_len;		/* multiple of 8 bytes */
+	uint8_t language;		/* I hope it's 0 */
+	uint8_t mfg_date[3];		/* LSB, minutes since 1996-01-01 */
+	struct fru_type_length tl[0];	/* type-length stuff follows */
+
+	/*
+	 * the TL there are in order:
+	 * Board Manufacturer
+	 * Board Product Name
+	 * Board Serial Number
+	 * Board Part Number
+	 * FRU File ID (may be null)
+	 * more manufacturer-specific stuff
+	 * 0xc1 as a terminator
+	 * 0x00 pad to a multiple of 8 bytes - 1
+	 * checksum (sum of all stuff modulo 256 must be zero)
+	 */
+};
+
+enum fru_type {
+	FRU_TYPE_BINARY		= 0x00,
+	FRU_TYPE_BCDPLUS	= 0x40,
+	FRU_TYPE_ASCII6		= 0x80,
+	FRU_TYPE_ASCII		= 0xc0, /* not ascii: depends on language */
+};
+
+/*
+ * some helpers
+ */
+static inline struct fru_board_info_area *fru_get_board_area(
+	const struct fru_common_header *header)
+{
+	/* we know for sure that the header is 8 bytes in size */
+	return (struct fru_board_info_area *)(header + header->board_area_off);
+}
+
+static inline int fru_type(struct fru_type_length *tl)
+{
+	return tl->type_length & 0xc0;
+}
+
+static inline int fru_length(struct fru_type_length *tl)
+{
+	return (tl->type_length & 0x3f) + 1; /* len of whole record */
+}
+
+/* assume ascii-latin1 encoding */
+static inline int fru_strlen(struct fru_type_length *tl)
+{
+	return fru_length(tl) - 1;
+}
+
+static inline char *fru_strcpy(char *dest, struct fru_type_length *tl)
+{
+	int len = fru_strlen(tl);
+	memcpy(dest, tl->data, len);
+	dest[len] = '\0';
+	return dest;
+}
+
+static inline struct fru_type_length *fru_next_tl(struct fru_type_length *tl)
+{
+	return tl + fru_length(tl);
+}
+
+static inline int fru_is_eof(struct fru_type_length *tl)
+{
+	return tl->type_length == 0xc1;
+}
+
+/*
+ * External functions defined in fru-parse.c.
+ */
+extern int fru_header_cksum_ok(struct fru_common_header *header);
+extern int fru_bia_cksum_ok(struct fru_board_info_area *bia);
+
+/* All these 4 return allocated strings by calling fru_alloc() */
+extern char *fru_get_board_manufacturer(struct fru_common_header *header);
+extern char *fru_get_product_name(struct fru_common_header *header);
+extern char *fru_get_serial_number(struct fru_common_header *header);
+extern char *fru_get_part_number(struct fru_common_header *header);
+
+/* This must be defined by the caller of the above functions */
+extern void *fru_alloc(size_t size);
+
+#endif /* __LINUX_IPMI_FRU_H__ */

diff --git a/include/linux/jbd.h b/include/linux/jbd.h
index 7e0b622..8685d1b 100644
--- a/include/linux/jbd.h
+++ b/include/linux/jbd.h

@@ -27,7 +27,6 @@
 #include <linux/buffer_head.h>
 #include <linux/journal-head.h>
 #include <linux/stddef.h>
-#include <linux/bit_spinlock.h>
 #include <linux/mutex.h>
 #include <linux/timer.h>
 #include <linux/lockdep.h>
@@ -244,6 +243,31 @@
 
 #include <linux/fs.h>
 #include <linux/sched.h>
+
+enum jbd_state_bits {
+	BH_JBD			/* Has an attached ext3 journal_head */
+	  = BH_PrivateStart,
+	BH_JWrite,		/* Being written to log (@@@ DEBUGGING) */
+	BH_Freed,		/* Has been freed (truncated) */
+	BH_Revoked,		/* Has been revoked from the log */
+	BH_RevokeValid,		/* Revoked flag is valid */
+	BH_JBDDirty,		/* Is dirty but journaled */
+	BH_State,		/* Pins most journal_head state */
+	BH_JournalHead,		/* Pins bh->b_private and jh->b_bh */
+	BH_Unshadow,		/* Dummy bit, for BJ_Shadow wakeup filtering */
+	BH_JBDPrivateStart,	/* First bit available for private use by FS */
+};
+
+BUFFER_FNS(JBD, jbd)
+BUFFER_FNS(JWrite, jwrite)
+BUFFER_FNS(JBDDirty, jbddirty)
+TAS_BUFFER_FNS(JBDDirty, jbddirty)
+BUFFER_FNS(Revoked, revoked)
+TAS_BUFFER_FNS(Revoked, revoked)
+BUFFER_FNS(RevokeValid, revokevalid)
+TAS_BUFFER_FNS(RevokeValid, revokevalid)
+BUFFER_FNS(Freed, freed)
+
 #include <linux/jbd_common.h>
 
 #define J_ASSERT(assert)	BUG_ON(!(assert))
@@ -840,7 +864,7 @@
 extern int	 journal_forget (handle_t *, struct buffer_head *);
 extern void	 journal_sync_buffer (struct buffer_head *);
 extern void	 journal_invalidatepage(journal_t *,
-				struct page *, unsigned long);
+				struct page *, unsigned int, unsigned int);
 extern int	 journal_try_to_free_buffers(journal_t *, struct page *, gfp_t);
 extern int	 journal_stop(handle_t *);
 extern int	 journal_flush (journal_t *);

diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 6e051f4..d5b50a1 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h

@@ -26,7 +26,6 @@
 #include <linux/buffer_head.h>
 #include <linux/journal-head.h>
 #include <linux/stddef.h>
-#include <linux/bit_spinlock.h>
 #include <linux/mutex.h>
 #include <linux/timer.h>
 #include <linux/slab.h>
@@ -57,17 +56,13 @@
  */
 #define JBD2_EXPENSIVE_CHECKING
 extern ushort jbd2_journal_enable_debug;
+void __jbd2_debug(int level, const char *file, const char *func,
+		  unsigned int line, const char *fmt, ...);
 
-#define jbd_debug(n, f, a...)						\
-	do {								\
-		if ((n) <= jbd2_journal_enable_debug) {			\
-			printk (KERN_DEBUG "(%s, %d): %s: ",		\
-				__FILE__, __LINE__, __func__);	\
-			printk (f, ## a);				\
-		}							\
-	} while (0)
+#define jbd_debug(n, fmt, a...) \
+	__jbd2_debug((n), __FILE__, __func__, __LINE__, (fmt), ##a)
 #else
-#define jbd_debug(f, a...)	/**/
+#define jbd_debug(n, fmt, a...)    /**/
 #endif
 
 extern void *jbd2_alloc(size_t size, gfp_t flags);
@@ -302,6 +297,34 @@
 
 #include <linux/fs.h>
 #include <linux/sched.h>
+
+enum jbd_state_bits {
+	BH_JBD			/* Has an attached ext3 journal_head */
+	  = BH_PrivateStart,
+	BH_JWrite,		/* Being written to log (@@@ DEBUGGING) */
+	BH_Freed,		/* Has been freed (truncated) */
+	BH_Revoked,		/* Has been revoked from the log */
+	BH_RevokeValid,		/* Revoked flag is valid */
+	BH_JBDDirty,		/* Is dirty but journaled */
+	BH_State,		/* Pins most journal_head state */
+	BH_JournalHead,		/* Pins bh->b_private and jh->b_bh */
+	BH_Shadow,		/* IO on shadow buffer is running */
+	BH_Verified,		/* Metadata block has been verified ok */
+	BH_JBDPrivateStart,	/* First bit available for private use by FS */
+};
+
+BUFFER_FNS(JBD, jbd)
+BUFFER_FNS(JWrite, jwrite)
+BUFFER_FNS(JBDDirty, jbddirty)
+TAS_BUFFER_FNS(JBDDirty, jbddirty)
+BUFFER_FNS(Revoked, revoked)
+TAS_BUFFER_FNS(Revoked, revoked)
+BUFFER_FNS(RevokeValid, revokevalid)
+TAS_BUFFER_FNS(RevokeValid, revokevalid)
+BUFFER_FNS(Freed, freed)
+BUFFER_FNS(Shadow, shadow)
+BUFFER_FNS(Verified, verified)
+
 #include <linux/jbd_common.h>
 
 #define J_ASSERT(assert)	BUG_ON(!(assert))
@@ -382,8 +405,15 @@
 
 struct jbd2_journal_handle
 {
-	/* Which compound transaction is this update a part of? */
-	transaction_t		*h_transaction;
+	union {
+		/* Which compound transaction is this update a part of? */
+		transaction_t	*h_transaction;
+		/* Which journal handle belongs to - used iff h_reserved set */
+		journal_t	*h_journal;
+	};
+
+	/* Handle reserved for finishing the logical operation */
+	handle_t		*h_rsv_handle;
 
 	/* Number of remaining buffers we are allowed to dirty: */
 	int			h_buffer_credits;
@@ -398,6 +428,7 @@
 	/* Flags [no locking] */
 	unsigned int	h_sync:		1;	/* sync-on-close */
 	unsigned int	h_jdata:	1;	/* force data journaling */
+	unsigned int	h_reserved:	1;	/* handle with reserved credits */
 	unsigned int	h_aborted:	1;	/* fatal error on handle */
 	unsigned int	h_type:		8;	/* for handle statistics */
 	unsigned int	h_line_no:	16;	/* for handle statistics */
@@ -524,12 +555,6 @@
 	struct journal_head	*t_checkpoint_io_list;
 
 	/*
-	 * Doubly-linked circular list of temporary buffers currently undergoing
-	 * IO in the log [j_list_lock]
-	 */
-	struct journal_head	*t_iobuf_list;
-
-	/*
 	 * Doubly-linked circular list of metadata buffers being shadowed by log
 	 * IO.  The IO buffers on the iobuf list and the shadow buffers on this
 	 * list match each other one for one at all times. [j_list_lock]
@@ -537,12 +562,6 @@
 	struct journal_head	*t_shadow_list;
 
 	/*
-	 * Doubly-linked circular list of control buffers being written to the
-	 * log. [j_list_lock]
-	 */
-	struct journal_head	*t_log_list;
-
-	/*
 	 * List of inodes whose data we've modified in data=ordered mode.
 	 * [j_list_lock]
 	 */
@@ -671,11 +690,10 @@
  *  waiting for checkpointing
  * @j_wait_transaction_locked: Wait queue for waiting for a locked transaction
  *  to start committing, or for a barrier lock to be released
- * @j_wait_logspace: Wait queue for waiting for checkpointing to complete
  * @j_wait_done_commit: Wait queue for waiting for commit to complete
- * @j_wait_checkpoint:  Wait queue to trigger checkpointing
  * @j_wait_commit: Wait queue to trigger commit
  * @j_wait_updates: Wait queue to wait for updates to complete
+ * @j_wait_reserved: Wait queue to wait for reserved buffer credits to drop
  * @j_checkpoint_mutex: Mutex for locking against concurrent checkpoints
  * @j_head: Journal head - identifies the first unused block in the journal
  * @j_tail: Journal tail - identifies the oldest still-used block in the
@@ -689,6 +707,7 @@
  *     journal
  * @j_fs_dev: Device which holds the client fs.  For internal journal this will
  *     be equal to j_dev
+ * @j_reserved_credits: Number of buffers reserved from the running transaction
  * @j_maxlen: Total maximum capacity of the journal region on disk.
  * @j_list_lock: Protects the buffer lists and internal buffer state.
  * @j_inode: Optional inode where we store the journal.  If present, all journal
@@ -778,21 +797,18 @@
 	 */
 	wait_queue_head_t	j_wait_transaction_locked;
 
-	/* Wait queue for waiting for checkpointing to complete */
-	wait_queue_head_t	j_wait_logspace;
-
 	/* Wait queue for waiting for commit to complete */
 	wait_queue_head_t	j_wait_done_commit;
 
-	/* Wait queue to trigger checkpointing */
-	wait_queue_head_t	j_wait_checkpoint;
-
 	/* Wait queue to trigger commit */
 	wait_queue_head_t	j_wait_commit;
 
 	/* Wait queue to wait for updates to complete */
 	wait_queue_head_t	j_wait_updates;
 
+	/* Wait queue to wait for reserved buffer credits to drop */
+	wait_queue_head_t	j_wait_reserved;
+
 	/* Semaphore for locking against concurrent checkpoints */
 	struct mutex		j_checkpoint_mutex;
 
@@ -847,6 +863,9 @@
 	/* Total maximum capacity of the journal region on disk. */
 	unsigned int		j_maxlen;
 
+	/* Number of buffers reserved from the running transaction */
+	atomic_t		j_reserved_credits;
+
 	/*
 	 * Protects the buffer lists and internal buffer state.
 	 */
@@ -991,9 +1010,17 @@
 extern void __journal_free_buffer(struct journal_head *bh);
 extern void jbd2_journal_file_buffer(struct journal_head *, transaction_t *, int);
 extern void __journal_clean_data_list(transaction_t *transaction);
+static inline void jbd2_file_log_bh(struct list_head *head, struct buffer_head *bh)
+{
+	list_add_tail(&bh->b_assoc_buffers, head);
+}
+static inline void jbd2_unfile_log_bh(struct buffer_head *bh)
+{
+	list_del_init(&bh->b_assoc_buffers);
+}
 
 /* Log buffer allocation */
-extern struct journal_head * jbd2_journal_get_descriptor_buffer(journal_t *);
+struct buffer_head *jbd2_journal_get_descriptor_buffer(journal_t *journal);
 int jbd2_journal_next_log_block(journal_t *, unsigned long long *);
 int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid,
 			      unsigned long *block);
@@ -1039,11 +1066,10 @@
 				      struct jbd2_buffer_trigger_type *triggers);
 
 /* Buffer IO */
-extern int
-jbd2_journal_write_metadata_buffer(transaction_t	  *transaction,
-			      struct journal_head  *jh_in,
-			      struct journal_head **jh_out,
-			      unsigned long long   blocknr);
+extern int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
+					      struct journal_head *jh_in,
+					      struct buffer_head **bh_out,
+					      sector_t blocknr);
 
 /* Transaction locking */
 extern void		__wait_on_journal (journal_t *);
@@ -1076,10 +1102,14 @@
  */
 
 extern handle_t *jbd2_journal_start(journal_t *, int nblocks);
-extern handle_t *jbd2__journal_start(journal_t *, int nblocks, gfp_t gfp_mask,
-				     unsigned int type, unsigned int line_no);
+extern handle_t *jbd2__journal_start(journal_t *, int blocks, int rsv_blocks,
+				     gfp_t gfp_mask, unsigned int type,
+				     unsigned int line_no);
 extern int	 jbd2_journal_restart(handle_t *, int nblocks);
 extern int	 jbd2__journal_restart(handle_t *, int nblocks, gfp_t gfp_mask);
+extern int	 jbd2_journal_start_reserved(handle_t *handle,
+				unsigned int type, unsigned int line_no);
+extern void	 jbd2_journal_free_reserved(handle_t *handle);
 extern int	 jbd2_journal_extend (handle_t *, int nblocks);
 extern int	 jbd2_journal_get_write_access(handle_t *, struct buffer_head *);
 extern int	 jbd2_journal_get_create_access (handle_t *, struct buffer_head *);
@@ -1090,7 +1120,7 @@
 extern int	 jbd2_journal_forget (handle_t *, struct buffer_head *);
 extern void	 journal_sync_buffer (struct buffer_head *);
 extern int	 jbd2_journal_invalidatepage(journal_t *,
-				struct page *, unsigned long);
+				struct page *, unsigned int, unsigned int);
 extern int	 jbd2_journal_try_to_free_buffers(journal_t *, struct page *, gfp_t);
 extern int	 jbd2_journal_stop(handle_t *);
 extern int	 jbd2_journal_flush (journal_t *);
@@ -1125,6 +1155,7 @@
 extern int	   jbd2_journal_clear_err  (journal_t *);
 extern int	   jbd2_journal_bmap(journal_t *, unsigned long, unsigned long long *);
 extern int	   jbd2_journal_force_commit(journal_t *);
+extern int	   jbd2_journal_force_commit_nested(journal_t *);
 extern int	   jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *inode);
 extern int	   jbd2_journal_begin_ordered_truncate(journal_t *journal,
 				struct jbd2_inode *inode, loff_t new_size);
@@ -1178,8 +1209,10 @@
 extern void	   jbd2_journal_destroy_revoke(journal_t *);
 extern int	   jbd2_journal_revoke (handle_t *, unsigned long long, struct buffer_head *);
 extern int	   jbd2_journal_cancel_revoke(handle_t *, struct journal_head *);
-extern void	   jbd2_journal_write_revoke_records(journal_t *,
-						     transaction_t *, int);
+extern void	   jbd2_journal_write_revoke_records(journal_t *journal,
+						     transaction_t *transaction,
+						     struct list_head *log_bufs,
+						     int write_op);
 
 /* Recovery revoke support */
 extern int	jbd2_journal_set_revoke(journal_t *, unsigned long long, tid_t);
@@ -1195,11 +1228,9 @@
  * transitions on demand.
  */
 
-int __jbd2_log_space_left(journal_t *); /* Called with journal locked */
 int jbd2_log_start_commit(journal_t *journal, tid_t tid);
 int __jbd2_log_start_commit(journal_t *journal, tid_t tid);
 int jbd2_journal_start_commit(journal_t *journal, tid_t *tid);
-int jbd2_journal_force_commit_nested(journal_t *journal);
 int jbd2_log_wait_commit(journal_t *journal, tid_t tid);
 int jbd2_complete_transaction(journal_t *journal, tid_t tid);
 int jbd2_log_do_checkpoint(journal_t *journal);
@@ -1235,7 +1266,7 @@
 
 static inline int is_handle_aborted(handle_t *handle)
 {
-	if (handle->h_aborted)
+	if (handle->h_aborted || !handle->h_transaction)
 		return 1;
 	return is_journal_aborted(handle->h_transaction->t_journal);
 }
@@ -1266,16 +1297,37 @@
 extern size_t journal_tag_bytes(journal_t *journal);
 
 /*
+ * We reserve t_outstanding_credits >> JBD2_CONTROL_BLOCKS_SHIFT for
+ * transaction control blocks.
+ */
+#define JBD2_CONTROL_BLOCKS_SHIFT 5
+
+/*
  * Return the minimum number of blocks which must be free in the journal
  * before a new transaction may be started.  Must be called under j_state_lock.
  */
-static inline int jbd_space_needed(journal_t *journal)
+static inline int jbd2_space_needed(journal_t *journal)
 {
 	int nblocks = journal->j_max_transaction_buffers;
-	if (journal->j_committing_transaction)
-		nblocks += atomic_read(&journal->j_committing_transaction->
-				       t_outstanding_credits);
-	return nblocks;
+	return nblocks + (nblocks >> JBD2_CONTROL_BLOCKS_SHIFT);
+}
+
+/*
+ * Return number of free blocks in the log. Must be called under j_state_lock.
+ */
+static inline unsigned long jbd2_log_space_left(journal_t *journal)
+{
+	/* Allow for rounding errors */
+	unsigned long free = journal->j_free - 32;
+
+	if (journal->j_committing_transaction) {
+		unsigned long committing = atomic_read(&journal->
+			j_committing_transaction->t_outstanding_credits);
+
+		/* Transaction + control blocks */
+		free -= committing + (committing >> JBD2_CONTROL_BLOCKS_SHIFT);
+	}
+	return free;
 }
 
 /*
@@ -1286,11 +1338,9 @@
 #define BJ_None		0	/* Not journaled */
 #define BJ_Metadata	1	/* Normal journaled metadata */
 #define BJ_Forget	2	/* Buffer superseded by this transaction */
-#define BJ_IO		3	/* Buffer is for temporary IO use */
-#define BJ_Shadow	4	/* Buffer contents being shadowed to the log */
-#define BJ_LogCtl	5	/* Buffer contains log descriptors */
-#define BJ_Reserved	6	/* Buffer is reserved for access by journal */
-#define BJ_Types	7
+#define BJ_Shadow	3	/* Buffer contents being shadowed to the log */
+#define BJ_Reserved	4	/* Buffer is reserved for access by journal */
+#define BJ_Types	5
 
 extern int jbd_blocks_per_page(struct inode *inode);
 
@@ -1319,6 +1369,19 @@
 	return *(u32 *)desc.ctx;
 }
 
+/* Return most recent uncommitted transaction */
+static inline tid_t  jbd2_get_latest_transaction(journal_t *journal)
+{
+	tid_t tid;
+
+	read_lock(&journal->j_state_lock);
+	tid = journal->j_commit_request;
+	if (journal->j_running_transaction)
+		tid = journal->j_running_transaction->t_tid;
+	read_unlock(&journal->j_state_lock);
+	return tid;
+}
+
 #ifdef __KERNEL__
 
 #define buffer_trace_init(bh)	do {} while (0)

diff --git a/include/linux/jbd_common.h b/include/linux/jbd_common.h
index 6133679..3dc5343 100644
--- a/include/linux/jbd_common.h
+++ b/include/linux/jbd_common.h

@@ -1,31 +1,7 @@
 #ifndef _LINUX_JBD_STATE_H
 #define _LINUX_JBD_STATE_H
 
-enum jbd_state_bits {
-	BH_JBD			/* Has an attached ext3 journal_head */
-	  = BH_PrivateStart,
-	BH_JWrite,		/* Being written to log (@@@ DEBUGGING) */
-	BH_Freed,		/* Has been freed (truncated) */
-	BH_Revoked,		/* Has been revoked from the log */
-	BH_RevokeValid,		/* Revoked flag is valid */
-	BH_JBDDirty,		/* Is dirty but journaled */
-	BH_State,		/* Pins most journal_head state */
-	BH_JournalHead,		/* Pins bh->b_private and jh->b_bh */
-	BH_Unshadow,		/* Dummy bit, for BJ_Shadow wakeup filtering */
-	BH_Verified,		/* Metadata block has been verified ok */
-	BH_JBDPrivateStart,	/* First bit available for private use by FS */
-};
-
-BUFFER_FNS(JBD, jbd)
-BUFFER_FNS(JWrite, jwrite)
-BUFFER_FNS(JBDDirty, jbddirty)
-TAS_BUFFER_FNS(JBDDirty, jbddirty)
-BUFFER_FNS(Revoked, revoked)
-TAS_BUFFER_FNS(Revoked, revoked)
-BUFFER_FNS(RevokeValid, revokevalid)
-TAS_BUFFER_FNS(RevokeValid, revokevalid)
-BUFFER_FNS(Freed, freed)
-BUFFER_FNS(Verified, verified)
+#include <linux/bit_spinlock.h>
 
 static inline struct buffer_head *jh2bh(struct journal_head *jh)
 {

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index e9ef6d6..3afb969 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h

@@ -450,6 +450,8 @@
 extern int hex_to_bin(char ch);
 extern int __must_check hex2bin(u8 *dst, const char *src, size_t count);
 
+int mac_pton(const char *s, u8 *mac);
+
 /*
  * General tracing related utility functions - trace_printk(),
  * tracing_on/tracing_off and tracing_start()/tracing_stop

diff --git a/include/linux/loop.h b/include/linux/loop.h
deleted file mode 100644
index 460b60f..0000000
--- a/include/linux/loop.h
+++ /dev/null

@@ -1,85 +0,0 @@
-/*
- * include/linux/loop.h
- *
- * Written by Theodore Ts'o, 3/29/93.
- *
- * Copyright 1993 by Theodore Ts'o.  Redistribution of this file is
- * permitted under the GNU General Public License.
- */
-#ifndef _LINUX_LOOP_H
-#define _LINUX_LOOP_H
-
-#include <linux/bio.h>
-#include <linux/blkdev.h>
-#include <linux/spinlock.h>
-#include <linux/mutex.h>
-#include <uapi/linux/loop.h>
-
-/* Possible states of device */
-enum {
-	Lo_unbound,
-	Lo_bound,
-	Lo_rundown,
-};
-
-struct loop_func_table;
-
-struct loop_device {
-	int		lo_number;
-	int		lo_refcnt;
-	loff_t		lo_offset;
-	loff_t		lo_sizelimit;
-	int		lo_flags;
-	int		(*transfer)(struct loop_device *, int cmd,
-				    struct page *raw_page, unsigned raw_off,
-				    struct page *loop_page, unsigned loop_off,
-				    int size, sector_t real_block);
-	char		lo_file_name[LO_NAME_SIZE];
-	char		lo_crypt_name[LO_NAME_SIZE];
-	char		lo_encrypt_key[LO_KEY_SIZE];
-	int		lo_encrypt_key_size;
-	struct loop_func_table *lo_encryption;
-	__u32           lo_init[2];
-	kuid_t		lo_key_owner;	/* Who set the key */
-	int		(*ioctl)(struct loop_device *, int cmd, 
-				 unsigned long arg); 
-
-	struct file *	lo_backing_file;
-	struct block_device *lo_device;
-	unsigned	lo_blocksize;
-	void		*key_data; 
-
-	gfp_t		old_gfp_mask;
-
-	spinlock_t		lo_lock;
-	struct bio_list		lo_bio_list;
-	unsigned int		lo_bio_count;
-	int			lo_state;
-	struct mutex		lo_ctl_mutex;
-	struct task_struct	*lo_thread;
-	wait_queue_head_t	lo_event;
-	/* wait queue for incoming requests */
-	wait_queue_head_t	lo_req_wait;
-
-	struct request_queue	*lo_queue;
-	struct gendisk		*lo_disk;
-};
-
-/* Support for loadable transfer modules */
-struct loop_func_table {
-	int number;	/* filter type */ 
-	int (*transfer)(struct loop_device *lo, int cmd,
-			struct page *raw_page, unsigned raw_off,
-			struct page *loop_page, unsigned loop_off,
-			int size, sector_t real_block);
-	int (*init)(struct loop_device *, const struct loop_info64 *); 
-	/* release is called from loop_unregister_transfer or clr_fd */
-	int (*release)(struct loop_device *); 
-	int (*ioctl)(struct loop_device *, int cmd, unsigned long arg);
-	struct module *owner;
-}; 
-
-int loop_register_transfer(struct loop_func_table *funcs);
-int loop_unregister_transfer(int number); 
-
-#endif

diff --git a/include/linux/mfd/palmas.h b/include/linux/mfd/palmas.h
index 8f21daf..9b81b2b 100644
--- a/include/linux/mfd/palmas.h
+++ b/include/linux/mfd/palmas.h

@@ -20,6 +20,8 @@
 #include <linux/leds.h>
 #include <linux/regmap.h>
 #include <linux/regulator/driver.h>
+#include <linux/extcon.h>
+#include <linux/usb/phy_companion.h>
 
 #define PALMAS_NUM_CLIENTS		3
 
@@ -37,6 +39,12 @@
 struct palmas_resource;
 struct palmas_usb;
 
+enum palmas_usb_state {
+	PALMAS_USB_STATE_DISCONNECT,
+	PALMAS_USB_STATE_VBUS,
+	PALMAS_USB_STATE_ID,
+};
+
 struct palmas {
 	struct device *dev;
 
@@ -180,9 +188,6 @@
 };
 
 struct palmas_usb_platform_data {
-	/* Set this if platform wishes its own vbus control */
-	int no_control_vbus;
-
 	/* Do we enable the wakeup comparator on probe */
 	int wakeup;
 };
@@ -350,22 +355,19 @@
 	struct palmas *palmas;
 	struct device *dev;
 
-	/* for vbus reporting with irqs disabled */
-	spinlock_t lock;
-
-	struct regulator *vbus_reg;
+	struct extcon_dev edev;
 
 	/* used to set vbus, in atomic path */
 	struct work_struct set_vbus_work;
 
-	int irq1;
-	int irq2;
-	int irq3;
-	int irq4;
+	int id_otg_irq;
+	int id_irq;
+	int vbus_otg_irq;
+	int vbus_irq;
 
 	int vbus_enable;
 
-	u8 linkstat;
+	enum palmas_usb_state linkstat;
 };
 
 #define comparator_to_palmas(x) container_of((x), struct palmas_usb, comparator)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index e0c8528..66d881f 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h

@@ -1041,7 +1041,8 @@
 struct page *get_dump_page(unsigned long addr);
 
 extern int try_to_release_page(struct page * page, gfp_t gfp_mask);
-extern void do_invalidatepage(struct page *page, unsigned long offset);
+extern void do_invalidatepage(struct page *page, unsigned int offset,
+			      unsigned int length);
 
 int __set_page_dirty_nobuffers(struct page *page);
 int __set_page_dirty_no_writeback(struct page *page);

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 60584b1..96e4c21 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h

@@ -1695,6 +1695,7 @@
 extern struct net_device	*dev_get_by_index(struct net *net, int ifindex);
 extern struct net_device	*__dev_get_by_index(struct net *net, int ifindex);
 extern struct net_device	*dev_get_by_index_rcu(struct net *net, int ifindex);
+extern int		netdev_get_name(struct net *net, char *name, int ifindex);
 extern int		dev_restart(struct net_device *dev);
 #ifdef CONFIG_NETPOLL_TRAP
 extern int		netpoll_trap(void);

diff --git a/include/linux/platform_data/ad7303.h b/include/linux/platform_data/ad7303.h
new file mode 100644
index 0000000..de6a7a6
--- /dev/null
+++ b/include/linux/platform_data/ad7303.h

@@ -0,0 +1,21 @@
+/*
+ * Analog Devices AD7303 DAC driver
+ *
+ * Copyright 2013 Analog Devices Inc.
+ *
+ * Licensed under the GPL-2.
+ */
+
+#ifndef __IIO_ADC_AD7303_H__
+#define __IIO_ADC_AD7303_H__
+
+/**
+ * struct ad7303_platform_data - AD7303 platform data
+ * @use_external_ref: If set to true use an external voltage reference connected
+ * to the REF pin, otherwise use the internal reference derived from Vdd.
+ */
+struct ad7303_platform_data {
+	bool use_external_ref;
+};
+
+#endif

diff --git a/include/linux/platform_data/usb3503.h b/include/linux/platform_data/usb3503.h
index 85dcc70..1d1b6ef 100644
--- a/include/linux/platform_data/usb3503.h
+++ b/include/linux/platform_data/usb3503.h

@@ -3,6 +3,10 @@
 
 #define USB3503_I2C_NAME	"usb3503"
 
+#define USB3503_OFF_PORT1	(1 << 1)
+#define USB3503_OFF_PORT2	(1 << 2)
+#define USB3503_OFF_PORT3	(1 << 3)
+
 enum usb3503_mode {
 	USB3503_MODE_UNKNOWN,
 	USB3503_MODE_HUB,
@@ -11,6 +15,7 @@
 
 struct usb3503_platform_data {
 	enum usb3503_mode	initial_mode;
+	u8	port_off_mask;
 	int	gpio_intn;
 	int	gpio_connect;
 	int	gpio_reset;

diff --git a/include/linux/sdb.h b/include/linux/sdb.h
new file mode 100644
index 0000000..fbb76a4
--- /dev/null
+++ b/include/linux/sdb.h

@@ -0,0 +1,159 @@
+/*
+ * This is the official version 1.1 of sdb.h
+ */
+#ifndef __SDB_H__
+#define __SDB_H__
+#ifdef __KERNEL__
+#include <linux/types.h>
+#else
+#include <stdint.h>
+#endif
+
+/*
+ * All structures are 64 bytes long and are expected
+ * to live in an array, one for each interconnect.
+ * Most fields of the structures are shared among the
+ * various types, and most-specific fields are at the
+ * beginning (for alignment reasons, and to keep the
+ * magic number at the head of the interconnect record)
+ */
+
+/* Product, 40 bytes at offset 24, 8-byte aligned
+ *
+ * device_id is vendor-assigned; version is device-specific,
+ * date is hex (e.g 0x20120501), name is UTF-8, blank-filled
+ * and not terminated with a 0 byte.
+ */
+struct sdb_product {
+	uint64_t		vendor_id;	/* 0x18..0x1f */
+	uint32_t		device_id;	/* 0x20..0x23 */
+	uint32_t		version;	/* 0x24..0x27 */
+	uint32_t		date;		/* 0x28..0x2b */
+	uint8_t			name[19];	/* 0x2c..0x3e */
+	uint8_t			record_type;	/* 0x3f */
+};
+
+/*
+ * Component, 56 bytes at offset 8, 8-byte aligned
+ *
+ * The address range is first to last, inclusive
+ * (for example 0x100000 - 0x10ffff)
+ */
+struct sdb_component {
+	uint64_t		addr_first;	/* 0x08..0x0f */
+	uint64_t		addr_last;	/* 0x10..0x17 */
+	struct sdb_product	product;	/* 0x18..0x3f */
+};
+
+/* Type of the SDB record */
+enum sdb_record_type {
+	sdb_type_interconnect	= 0x00,
+	sdb_type_device		= 0x01,
+	sdb_type_bridge		= 0x02,
+	sdb_type_integration	= 0x80,
+	sdb_type_repo_url	= 0x81,
+	sdb_type_synthesis	= 0x82,
+	sdb_type_empty		= 0xFF,
+};
+
+/* Type 0: interconnect (first of the array)
+ *
+ * sdb_records is the length of the table including this first
+ * record, version is 1. The bus type is enumerated later.
+ */
+#define				SDB_MAGIC	0x5344422d /* "SDB-" */
+struct sdb_interconnect {
+	uint32_t		sdb_magic;	/* 0x00-0x03 */
+	uint16_t		sdb_records;	/* 0x04-0x05 */
+	uint8_t			sdb_version;	/* 0x06 */
+	uint8_t			sdb_bus_type;	/* 0x07 */
+	struct sdb_component	sdb_component;	/* 0x08-0x3f */
+};
+
+/* Type 1: device
+ *
+ * class is 0 for "custom device", other values are
+ * to be standardized; ABI version is for the driver,
+ * bus-specific bits are defined by each bus (see below)
+ */
+struct sdb_device {
+	uint16_t		abi_class;	/* 0x00-0x01 */
+	uint8_t			abi_ver_major;	/* 0x02 */
+	uint8_t			abi_ver_minor;	/* 0x03 */
+	uint32_t		bus_specific;	/* 0x04-0x07 */
+	struct sdb_component	sdb_component;	/* 0x08-0x3f */
+};
+
+/* Type 2: bridge
+ *
+ * child is the address of the nested SDB table
+ */
+struct sdb_bridge {
+	uint64_t		sdb_child;	/* 0x00-0x07 */
+	struct sdb_component	sdb_component;	/* 0x08-0x3f */
+};
+
+/* Type 0x80: integration
+ *
+ * all types with bit 7 set are meta-information, so
+ * software can ignore the types it doesn't know. Here we
+ * just provide product information for an aggregate device
+ */
+struct sdb_integration {
+	uint8_t			reserved[24];	/* 0x00-0x17 */
+	struct sdb_product	product;	/* 0x18-0x3f */
+};
+
+/* Type 0x81: Top module repository url
+ *
+ * again, an informative field that software can ignore
+ */
+struct sdb_repo_url {
+	uint8_t			repo_url[63];	/* 0x00-0x3e */
+	uint8_t			record_type;	/* 0x3f */
+};
+
+/* Type 0x82: Synthesis tool information
+ *
+ * this is an informative record as well
+ */
+struct sdb_synthesis {
+	uint8_t			syn_name[16];	/* 0x00-0x0f */
+	uint8_t			commit_id[16];	/* 0x10-0x1f */
+	uint8_t			tool_name[8];	/* 0x20-0x27 */
+	uint32_t		tool_version;	/* 0x28-0x2b */
+	uint32_t		date;		/* 0x2c-0x2f */
+	uint8_t			user_name[15];	/* 0x30-0x3e */
+	uint8_t			record_type;	/* 0x3f */
+};
+
+/* Type 0xff: empty
+ *
+ * this allows keeping empty slots during development,
+ * so they can be filled later with minimal efforts and
+ * no misleading description is ever shipped -- hopefully.
+ * It can also be used to pad a table to a desired length.
+ */
+struct sdb_empty {
+	uint8_t			reserved[63];	/* 0x00-0x3e */
+	uint8_t			record_type;	/* 0x3f */
+};
+
+/* The type of bus, for bus-specific flags */
+enum sdb_bus_type {
+	sdb_wishbone = 0x00,
+	sdb_data     = 0x01,
+};
+
+#define SDB_WB_WIDTH_MASK	0x0f
+#define SDB_WB_ACCESS8			0x01
+#define SDB_WB_ACCESS16			0x02
+#define SDB_WB_ACCESS32			0x04
+#define SDB_WB_ACCESS64			0x08
+#define SDB_WB_LITTLE_ENDIAN	0x80
+
+#define SDB_DATA_READ		0x04
+#define SDB_DATA_WRITE		0x02
+#define SDB_DATA_EXEC		0x01
+
+#endif /* __SDB_H__ */

diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index 87d4bbc..b98291a 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h

@@ -31,6 +31,13 @@
 #include <linux/sysrq.h>
 #include <uapi/linux/serial_core.h>
 
+#ifdef CONFIG_SERIAL_CORE_CONSOLE
+#define uart_console(port) \
+	((port)->cons && (port)->cons->index == (port)->line)
+#else
+#define uart_console(port)      (0)
+#endif
+
 struct uart_port;
 struct serial_struct;
 struct device;

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 9c676eae..dec1748 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h

@@ -627,6 +627,7 @@
 }
 
 extern void kfree_skb(struct sk_buff *skb);
+extern void kfree_skb_list(struct sk_buff *segs);
 extern void skb_tx_error(struct sk_buff *skb);
 extern void consume_skb(struct sk_buff *skb);
 extern void	       __kfree_skb(struct sk_buff *skb);

diff --git a/include/linux/tty.h b/include/linux/tty.h
index 8780bd2..01ac30e 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h

@@ -272,7 +272,6 @@
 #define N_TTY_BUF_SIZE 4096
 
 	unsigned char closing:1;
-	unsigned short minimum_to_wake;
 	unsigned char *write_buf;
 	int write_cnt;
 	/* If the tty has a pending do_SAK, queue it here - akpm */
@@ -309,8 +308,6 @@
 #define TTY_LDISC 		9	/* Line discipline attached */
 #define TTY_LDISC_CHANGING 	10	/* Line discipline changing */
 #define TTY_LDISC_OPEN	 	11	/* Line discipline is open */
-#define TTY_HW_COOK_OUT 	14	/* Hardware can do output cooking */
-#define TTY_HW_COOK_IN 		15	/* Hardware can do input cooking */
 #define TTY_PTY_LOCK 		16	/* pty private */
 #define TTY_NO_WRITE_SPLIT 	17	/* Preserve write boundaries to driver */
 #define TTY_HUPPED 		18	/* Post driver->hangup() */

diff --git a/include/linux/tty_ldisc.h b/include/linux/tty_ldisc.h
index 58390c7..a1b0489 100644
--- a/include/linux/tty_ldisc.h
+++ b/include/linux/tty_ldisc.h

@@ -100,6 +100,11 @@
  *	seek to perform this action quickly but should wait until
  *	any pending driver I/O is completed.
  *
+ * void (*fasync)(struct tty_struct *, int on)
+ *
+ *	Notify line discipline when signal-driven I/O is enabled or
+ *	disabled.
+ *
  * void (*dcd_change)(struct tty_struct *tty, unsigned int status)
  *
  *	Tells the discipline that the DCD pin has changed its status.
@@ -110,6 +115,52 @@
 #include <linux/wait.h>
 #include <linux/wait.h>
 
+
+/*
+ * the semaphore definition
+ */
+struct ld_semaphore {
+	long			count;
+	raw_spinlock_t		wait_lock;
+	unsigned int		wait_readers;
+	struct list_head	read_wait;
+	struct list_head	write_wait;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	struct lockdep_map	dep_map;
+#endif
+};
+
+extern void __init_ldsem(struct ld_semaphore *sem, const char *name,
+			 struct lock_class_key *key);
+
+#define init_ldsem(sem)						\
+do {								\
+	static struct lock_class_key __key;			\
+								\
+	__init_ldsem((sem), #sem, &__key);			\
+} while (0)
+
+
+extern int ldsem_down_read(struct ld_semaphore *sem, long timeout);
+extern int ldsem_down_read_trylock(struct ld_semaphore *sem);
+extern int ldsem_down_write(struct ld_semaphore *sem, long timeout);
+extern int ldsem_down_write_trylock(struct ld_semaphore *sem);
+extern void ldsem_up_read(struct ld_semaphore *sem);
+extern void ldsem_up_write(struct ld_semaphore *sem);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+extern int ldsem_down_read_nested(struct ld_semaphore *sem, int subclass,
+				  long timeout);
+extern int ldsem_down_write_nested(struct ld_semaphore *sem, int subclass,
+				   long timeout);
+#else
+# define ldsem_down_read_nested(sem, subclass, timeout)		\
+		ldsem_down_read(sem, timeout)
+# define ldsem_down_write_nested(sem, subclass, timeout)	\
+		ldsem_down_write(sem, timeout)
+#endif
+
+
 struct tty_ldisc_ops {
 	int	magic;
 	char	*name;
@@ -143,6 +194,7 @@
 			       char *fp, int count);
 	void	(*write_wakeup)(struct tty_struct *);
 	void	(*dcd_change)(struct tty_struct *, unsigned int);
+	void	(*fasync)(struct tty_struct *tty, int on);
 
 	struct  module *owner;
 

diff --git a/include/linux/usb.h b/include/linux/usb.h
index a0bee5a..a232b7e 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h

@@ -394,6 +394,22 @@
 };
 
 /*
+ * USB 2.0 Link Power Management (LPM) parameters.
+ */
+struct usb2_lpm_parameters {
+	/* Best effort service latency indicates how long the host will drive
+	 * resume on an exit from L1.
+	 */
+	unsigned int besl;
+
+	/* Timeout value in microseconds for the L1 inactivity (LPM) timer.
+	 * When the timer counts to zero, the parent hub will initiate an LPM
+	 * transition to L1.
+	 */
+	int timeout;
+};
+
+/*
  * USB 3.0 Link Power Management (LPM) parameters.
  *
  * PEL and SEL are USB 3.0 Link PM latencies for device-initiated LPM exit.
@@ -468,6 +484,7 @@
  * @wusb: device is Wireless USB
  * @lpm_capable: device supports LPM
  * @usb2_hw_lpm_capable: device can perform USB2 hardware LPM
+ * @usb2_hw_lpm_besl_capable: device can perform USB2 hardware BESL LPM
  * @usb2_hw_lpm_enabled: USB2 hardware LPM enabled
  * @usb3_lpm_enabled: USB3 hardware LPM enabled
  * @string_langid: language ID for strings
@@ -487,6 +504,7 @@
  *	specific data for the device.
  * @slot_id: Slot ID assigned by xHCI
  * @removable: Device can be physically removed from this port
+ * @l1_params: best effort service latency for USB2 L1 LPM state, and L1 timeout.
  * @u1_params: exit latencies for USB3 U1 LPM state, and hub-initiated timeout.
  * @u2_params: exit latencies for USB3 U2 LPM state, and hub-initiated timeout.
  * @lpm_disable_count: Ref count used by usb_disable_lpm() and usb_enable_lpm()
@@ -538,6 +556,7 @@
 	unsigned wusb:1;
 	unsigned lpm_capable:1;
 	unsigned usb2_hw_lpm_capable:1;
+	unsigned usb2_hw_lpm_besl_capable:1;
 	unsigned usb2_hw_lpm_enabled:1;
 	unsigned usb3_lpm_enabled:1;
 	int string_langid;
@@ -566,6 +585,7 @@
 	struct wusb_dev *wusb_dev;
 	int slot_id;
 	enum usb_device_removable removable;
+	struct usb2_lpm_parameters l1_params;
 	struct usb3_lpm_parameters u1_params;
 	struct usb3_lpm_parameters u2_params;
 	unsigned lpm_disable_count;
@@ -717,6 +737,7 @@
 extern int usb_match_one_id(struct usb_interface *interface,
 			    const struct usb_device_id *id);
 
+extern int usb_for_each_dev(void *data, int (*fn)(struct usb_device *, void *));
 extern struct usb_interface *usb_find_interface(struct usb_driver *drv,
 		int minor);
 extern struct usb_interface *usb_ifnum_to_if(const struct usb_device *dev,

diff --git a/include/linux/usb/chipidea.h b/include/linux/usb/chipidea.h
index 544825d..2562994 100644
--- a/include/linux/usb/chipidea.h
+++ b/include/linux/usb/chipidea.h

@@ -7,32 +7,33 @@
 
 #include <linux/usb/otg.h>
 
-struct ci13xxx;
-struct ci13xxx_platform_data {
+struct ci_hdrc;
+struct ci_hdrc_platform_data {
 	const char	*name;
 	/* offset of the capability registers */
 	uintptr_t	 capoffset;
 	unsigned	 power_budget;
 	struct usb_phy	*phy;
+	enum usb_phy_interface phy_mode;
 	unsigned long	 flags;
-#define CI13XXX_REGS_SHARED		BIT(0)
-#define CI13XXX_REQUIRE_TRANSCEIVER	BIT(1)
-#define CI13XXX_PULLUP_ON_VBUS		BIT(2)
-#define CI13XXX_DISABLE_STREAMING	BIT(3)
-
-#define CI13XXX_CONTROLLER_RESET_EVENT		0
-#define CI13XXX_CONTROLLER_STOPPED_EVENT	1
-	void	(*notify_event) (struct ci13xxx *ci, unsigned event);
+#define CI_HDRC_REGS_SHARED		BIT(0)
+#define CI_HDRC_REQUIRE_TRANSCEIVER	BIT(1)
+#define CI_HDRC_PULLUP_ON_VBUS		BIT(2)
+#define CI_HDRC_DISABLE_STREAMING	BIT(3)
+	enum usb_dr_mode	dr_mode;
+#define CI_HDRC_CONTROLLER_RESET_EVENT		0
+#define CI_HDRC_CONTROLLER_STOPPED_EVENT	1
+	void	(*notify_event) (struct ci_hdrc *ci, unsigned event);
 };
 
 /* Default offset of capability registers */
 #define DEF_CAPOFFSET		0x100
 
-/* Add ci13xxx device */
-struct platform_device *ci13xxx_add_device(struct device *dev,
+/* Add ci hdrc device */
+struct platform_device *ci_hdrc_add_device(struct device *dev,
 			struct resource *res, int nres,
-			struct ci13xxx_platform_data *platdata);
-/* Remove ci13xxx device */
-void ci13xxx_remove_device(struct platform_device *pdev);
+			struct ci_hdrc_platform_data *platdata);
+/* Remove ci hdrc device */
+void ci_hdrc_remove_device(struct platform_device *pdev);
 
 #endif

diff --git a/include/linux/usb/hcd.h b/include/linux/usb/hcd.h
index f5f5c7d..1e88377 100644
--- a/include/linux/usb/hcd.h
+++ b/include/linux/usb/hcd.h

@@ -218,6 +218,7 @@
 #define	HCD_SHARED	0x0004		/* Two (or more) usb_hcds share HW */
 #define	HCD_USB11	0x0010		/* USB 1.1 */
 #define	HCD_USB2	0x0020		/* USB 2.0 */
+#define	HCD_USB25	0x0030		/* Wireless USB 1.0 (USB 2.5)*/
 #define	HCD_USB3	0x0040		/* USB 3.0 */
 #define	HCD_MASK	0x0070
 

diff --git a/include/linux/usb/of.h b/include/linux/usb/of.h
new file mode 100644
index 0000000..a0ef405
--- /dev/null
+++ b/include/linux/usb/of.h

@@ -0,0 +1,32 @@
+/*
+ * OF helpers for usb devices.
+ *
+ * This file is released under the GPLv2
+ */
+
+#ifndef __LINUX_USB_OF_H
+#define __LINUX_USB_OF_H
+
+#include <linux/usb/otg.h>
+#include <linux/usb/phy.h>
+
+#if IS_ENABLED(CONFIG_OF)
+enum usb_dr_mode of_usb_get_dr_mode(struct device_node *np);
+#else
+static inline enum usb_dr_mode of_usb_get_dr_mode(struct device_node *np)
+{
+	return USB_DR_MODE_UNKNOWN;
+}
+#endif
+
+#if IS_ENABLED(CONFIG_OF) && IS_ENABLED(CONFIG_USB_PHY)
+enum usb_phy_interface of_usb_get_phy_mode(struct device_node *np);
+#else
+static inline enum usb_phy_interface of_usb_get_phy_mode(struct device_node *np)
+{
+	return USBPHY_INTERFACE_MODE_UNKNOWN;
+}
+
+#endif
+
+#endif /* __LINUX_USB_OF_H */

diff --git a/include/linux/usb/otg.h b/include/linux/usb/otg.h
index 291e01b..154332b 100644
--- a/include/linux/usb/otg.h
+++ b/include/linux/usb/otg.h

@@ -92,4 +92,11 @@
 /* for OTG controller drivers (and maybe other stuff) */
 extern int usb_bus_start_enum(struct usb_bus *bus, unsigned port_num);
 
+enum usb_dr_mode {
+	USB_DR_MODE_UNKNOWN,
+	USB_DR_MODE_HOST,
+	USB_DR_MODE_PERIPHERAL,
+	USB_DR_MODE_OTG,
+};
+
 #endif /* __LINUX_USB_OTG_H */

diff --git a/include/linux/usb/phy.h b/include/linux/usb/phy.h
index 6b5978f..4403680 100644
--- a/include/linux/usb/phy.h
+++ b/include/linux/usb/phy.h

@@ -12,6 +12,15 @@
 #include <linux/notifier.h>
 #include <linux/usb.h>
 
+enum usb_phy_interface {
+	USBPHY_INTERFACE_MODE_UNKNOWN,
+	USBPHY_INTERFACE_MODE_UTMI,
+	USBPHY_INTERFACE_MODE_UTMIW,
+	USBPHY_INTERFACE_MODE_ULPI,
+	USBPHY_INTERFACE_MODE_SERIAL,
+	USBPHY_INTERFACE_MODE_HSIC,
+};
+
 enum usb_phy_events {
 	USB_EVENT_NONE,         /* no events or cable disconnected */
 	USB_EVENT_VBUS,         /* vbus valid event */

diff --git a/include/linux/usb/serial.h b/include/linux/usb/serial.h
index 302ddf5..d528b80 100644
--- a/include/linux/usb/serial.h
+++ b/include/linux/usb/serial.h

@@ -19,10 +19,6 @@
 #include <linux/sysrq.h>
 #include <linux/kfifo.h>
 
-#define SERIAL_TTY_MAJOR	188	/* Nice legal number now */
-#define SERIAL_TTY_MINORS	254	/* loads of devices :) */
-#define SERIAL_TTY_NO_MINOR	255	/* No minor was assigned */
-
 /* The maximum number of ports one device can grab at once */
 #define MAX_NUM_PORTS		8
 
@@ -37,7 +33,8 @@
  * @serial: pointer back to the struct usb_serial owner of this port.
  * @port: pointer to the corresponding tty_port for this port.
  * @lock: spinlock to grab when updating portions of this structure.
- * @number: the number of the port (the minor number).
+ * @minor: the minor number of the port
+ * @port_number: the struct usb_serial port number of this port (starts at 0)
  * @interrupt_in_buffer: pointer to the interrupt in buffer for this port.
  * @interrupt_in_urb: pointer to the interrupt in struct urb for this port.
  * @interrupt_in_endpointAddress: endpoint address for the interrupt in pipe
@@ -80,7 +77,8 @@
 	struct usb_serial	*serial;
 	struct tty_port		port;
 	spinlock_t		lock;
-	unsigned char		number;
+	u32			minor;
+	u8			port_number;
 
 	unsigned char		*interrupt_in_buffer;
 	struct urb		*interrupt_in_urb;
@@ -140,7 +138,6 @@
  * @dev: pointer to the struct usb_device for this device
  * @type: pointer to the struct usb_serial_driver for this device
  * @interface: pointer to the struct usb_interface for this device
- * @minor: the starting minor number for this device
  * @num_ports: the number of ports this device has
  * @num_interrupt_in: number of interrupt in endpoints we have
  * @num_interrupt_out: number of interrupt out endpoints we have
@@ -159,7 +156,7 @@
 	unsigned char			disconnected:1;
 	unsigned char			suspending:1;
 	unsigned char			attached:1;
-	unsigned char			minor;
+	unsigned char			minors_reserved:1;
 	unsigned char			num_ports;
 	unsigned char			num_port_pointers;
 	char				num_interrupt_in;
@@ -319,7 +316,7 @@
 #endif
 
 /* Functions needed by other parts of the usbserial core */
-extern struct usb_serial *usb_serial_get_by_index(unsigned int minor);
+extern struct usb_serial_port *usb_serial_port_get_by_minor(unsigned int minor);
 extern void usb_serial_put(struct usb_serial *serial);
 extern int usb_serial_generic_open(struct tty_struct *tty,
 	struct usb_serial_port *port);

diff --git a/include/linux/usb/tegra_usb_phy.h b/include/linux/usb/tegra_usb_phy.h
index 1b7519a..d2ca919 100644
--- a/include/linux/usb/tegra_usb_phy.h
+++ b/include/linux/usb/tegra_usb_phy.h

@@ -42,6 +42,7 @@
 enum tegra_usb_phy_mode {
 	TEGRA_USB_PHY_MODE_DEVICE,
 	TEGRA_USB_PHY_MODE_HOST,
+	TEGRA_USB_PHY_MODE_OTG,
 };
 
 struct tegra_xtal_freq;
@@ -61,14 +62,10 @@
 	struct device *dev;
 	bool is_legacy_phy;
 	bool is_ulpi_phy;
-	void (*set_pts)(struct usb_phy *x, u8 pts_val);
-	void (*set_phcd)(struct usb_phy *x, bool enable);
+	int reset_gpio;
 };
 
-struct tegra_usb_phy *tegra_usb_phy_open(struct device *dev, int instance,
-	void __iomem *regs, void *config, enum tegra_usb_phy_mode phy_mode,
-	void (*set_pts)(struct usb_phy *x, u8 pts_val),
-	void (*set_phcd)(struct usb_phy *x, bool enable));
+struct usb_phy *tegra_usb_get_phy(struct device_node *dn);
 
 void tegra_usb_phy_preresume(struct usb_phy *phy);
 

diff --git a/include/linux/usb/wusb-wa.h b/include/linux/usb/wusb-wa.h
index f9dec37..6be985b 100644
--- a/include/linux/usb/wusb-wa.h
+++ b/include/linux/usb/wusb-wa.h

@@ -92,11 +92,20 @@
 	__le16  wRPipeIndex;
 	__le16	wRequests;
 	__le16	wBlocks;		/* rw if 0 */
-	__le16	wMaxPacketSize;		/* rw? */
-	u8	bHSHubAddress;		/* reserved: 0 */
-	u8	bHSHubPort;		/* ??? FIXME ??? */
+	__le16	wMaxPacketSize;		/* rw */
+	union {
+		u8	dwa_bHSHubAddress;		/* rw: DWA. */
+		u8	hwa_bMaxBurst;			/* rw: HWA. */
+	};
+	union {
+		u8	dwa_bHSHubPort;		/*  rw: DWA. */
+		u8	hwa_bDeviceInfoIndex;	/*  rw: HWA. */
+	};
 	u8	bSpeed;			/* rw: xfer rate 'enum uwb_phy_rate' */
-	u8	bDeviceAddress;		/* rw: Target device address */
+	union {
+		u8 dwa_bDeviceAddress;	/* rw: DWA Target device address. */
+		u8 hwa_reserved;		/* rw: HWA. */
+	};
 	u8	bEndpointAddress;	/* rw: Target EP address */
 	u8	bDataSequence;		/* ro: Current Data sequence */
 	__le32	dwCurrentWindow;	/* ro */

diff --git a/include/linux/vt_kern.h b/include/linux/vt_kern.h
index 0d33fca..8d76342 100644
--- a/include/linux/vt_kern.h
+++ b/include/linux/vt_kern.h

@@ -133,8 +133,6 @@
 void reset_vc(struct vc_data *vc);
 extern int do_unbind_con_driver(const struct consw *csw, int first, int last,
 			     int deflt);
-extern int unbind_con_driver(const struct consw *csw, int first, int last,
-			     int deflt);
 int vty_init(const struct file_operations *console_fops);
 
 static inline bool vt_force_oops_output(struct vc_data *vc)

diff --git a/include/linux/wait.h b/include/linux/wait.h
index 1133695..f487a47 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h

@@ -23,6 +23,7 @@
 struct wait_bit_key {
 	void *flags;
 	int bit_nr;
+#define WAIT_ATOMIC_T_BIT_NR -1
 };
 
 struct wait_bit_queue {
@@ -60,6 +61,9 @@
 #define __WAIT_BIT_KEY_INITIALIZER(word, bit)				\
 	{ .flags = word, .bit_nr = bit, }
 
+#define __WAIT_ATOMIC_T_KEY_INITIALIZER(p)				\
+	{ .flags = p, .bit_nr = WAIT_ATOMIC_T_BIT_NR, }
+
 extern void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *);
 
 #define init_waitqueue_head(q)				\
@@ -146,8 +150,10 @@
 int __wait_on_bit(wait_queue_head_t *, struct wait_bit_queue *, int (*)(void *), unsigned);
 int __wait_on_bit_lock(wait_queue_head_t *, struct wait_bit_queue *, int (*)(void *), unsigned);
 void wake_up_bit(void *, int);
+void wake_up_atomic_t(atomic_t *);
 int out_of_line_wait_on_bit(void *, int, int (*)(void *), unsigned);
 int out_of_line_wait_on_bit_lock(void *, int, int (*)(void *), unsigned);
+int out_of_line_wait_on_atomic_t(atomic_t *, int (*)(atomic_t *), unsigned);
 wait_queue_head_t *bit_waitqueue(void *, int);
 
 #define wake_up(x)			__wake_up(x, TASK_NORMAL, 1, NULL)
@@ -902,5 +908,23 @@
 		return 0;
 	return out_of_line_wait_on_bit_lock(word, bit, action, mode);
 }
+
+/**
+ * wait_on_atomic_t - Wait for an atomic_t to become 0
+ * @val: The atomic value being waited on, a kernel virtual address
+ * @action: the function used to sleep, which may take special actions
+ * @mode: the task state to sleep in
+ *
+ * Wait for an atomic_t to become 0.  We abuse the bit-wait waitqueue table for
+ * the purpose of getting a waitqueue, but we set the key to a bit number
+ * outside of the target 'word'.
+ */
+static inline
+int wait_on_atomic_t(atomic_t *val, int (*action)(atomic_t *), unsigned mode)
+{
+	if (atomic_read(val) == 0)
+		return 0;
+	return out_of_line_wait_on_atomic_t(val, action, mode);
+}
 	
 #endif

diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 579a500..abfe117 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h

@@ -78,6 +78,7 @@
 	unsigned tagged_writepages:1;	/* tag-and-write to avoid livelock */
 	unsigned for_reclaim:1;		/* Invoked from the page allocator */
 	unsigned range_cyclic:1;	/* range_start is cyclic */
+	unsigned for_sync:1;		/* sync(2) WB_SYNC_ALL writeback */
 };
 
 /*

diff --git a/include/trace/events/ext3.h b/include/trace/events/ext3.h
index 15d11a3..6797b9d 100644
--- a/include/trace/events/ext3.h
+++ b/include/trace/events/ext3.h

@@ -290,13 +290,14 @@
 );
 
 TRACE_EVENT(ext3_invalidatepage,
-	TP_PROTO(struct page *page, unsigned long offset),
+	TP_PROTO(struct page *page, unsigned int offset, unsigned int length),
 
-	TP_ARGS(page, offset),
+	TP_ARGS(page, offset, length),
 
 	TP_STRUCT__entry(
 		__field(	pgoff_t, index			)
-		__field(	unsigned long, offset		)
+		__field(	unsigned int, offset		)
+		__field(	unsigned int, length		)
 		__field(	ino_t,	ino			)
 		__field(	dev_t,	dev			)
 
@@ -305,14 +306,15 @@
 	TP_fast_assign(
 		__entry->index	= page->index;
 		__entry->offset	= offset;
+		__entry->length	= length;
 		__entry->ino	= page->mapping->host->i_ino;
 		__entry->dev	= page->mapping->host->i_sb->s_dev;
 	),
 
-	TP_printk("dev %d,%d ino %lu page_index %lu offset %lu",
+	TP_printk("dev %d,%d ino %lu page_index %lu offset %u length %u",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  (unsigned long) __entry->ino,
-		  __entry->index, __entry->offset)
+		  __entry->index, __entry->offset, __entry->length)
 );
 
 TRACE_EVENT(ext3_discard_blocks,

diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index 8ee15b9..2068db2 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h

@@ -19,6 +19,57 @@
 
 #define EXT4_I(inode) (container_of(inode, struct ext4_inode_info, vfs_inode))
 
+#define show_mballoc_flags(flags) __print_flags(flags, "|",	\
+	{ EXT4_MB_HINT_MERGE,		"HINT_MERGE" },		\
+	{ EXT4_MB_HINT_RESERVED,	"HINT_RESV" },		\
+	{ EXT4_MB_HINT_METADATA,	"HINT_MDATA" },		\
+	{ EXT4_MB_HINT_FIRST,		"HINT_FIRST" },		\
+	{ EXT4_MB_HINT_BEST,		"HINT_BEST" },		\
+	{ EXT4_MB_HINT_DATA,		"HINT_DATA" },		\
+	{ EXT4_MB_HINT_NOPREALLOC,	"HINT_NOPREALLOC" },	\
+	{ EXT4_MB_HINT_GROUP_ALLOC,	"HINT_GRP_ALLOC" },	\
+	{ EXT4_MB_HINT_GOAL_ONLY,	"HINT_GOAL_ONLY" },	\
+	{ EXT4_MB_HINT_TRY_GOAL,	"HINT_TRY_GOAL" },	\
+	{ EXT4_MB_DELALLOC_RESERVED,	"DELALLOC_RESV" },	\
+	{ EXT4_MB_STREAM_ALLOC,		"STREAM_ALLOC" },	\
+	{ EXT4_MB_USE_ROOT_BLOCKS,	"USE_ROOT_BLKS" },	\
+	{ EXT4_MB_USE_RESERVED,		"USE_RESV" })
+
+#define show_map_flags(flags) __print_flags(flags, "|",			\
+	{ EXT4_GET_BLOCKS_CREATE,		"CREATE" },		\
+	{ EXT4_GET_BLOCKS_UNINIT_EXT,		"UNINIT" },		\
+	{ EXT4_GET_BLOCKS_DELALLOC_RESERVE,	"DELALLOC" },		\
+	{ EXT4_GET_BLOCKS_PRE_IO,		"PRE_IO" },		\
+	{ EXT4_GET_BLOCKS_CONVERT,		"CONVERT" },		\
+	{ EXT4_GET_BLOCKS_METADATA_NOFAIL,	"METADATA_NOFAIL" },	\
+	{ EXT4_GET_BLOCKS_NO_NORMALIZE,		"NO_NORMALIZE" },	\
+	{ EXT4_GET_BLOCKS_KEEP_SIZE,		"KEEP_SIZE" },		\
+	{ EXT4_GET_BLOCKS_NO_LOCK,		"NO_LOCK" },		\
+	{ EXT4_GET_BLOCKS_NO_PUT_HOLE,		"NO_PUT_HOLE" })
+
+#define show_mflags(flags) __print_flags(flags, "",	\
+	{ EXT4_MAP_NEW,		"N" },			\
+	{ EXT4_MAP_MAPPED,	"M" },			\
+	{ EXT4_MAP_UNWRITTEN,	"U" },			\
+	{ EXT4_MAP_BOUNDARY,	"B" },			\
+	{ EXT4_MAP_UNINIT,	"u" },			\
+	{ EXT4_MAP_FROM_CLUSTER, "C" })
+
+#define show_free_flags(flags) __print_flags(flags, "|",	\
+	{ EXT4_FREE_BLOCKS_METADATA,		"METADATA" },	\
+	{ EXT4_FREE_BLOCKS_FORGET,		"FORGET" },	\
+	{ EXT4_FREE_BLOCKS_VALIDATED,		"VALIDATED" },	\
+	{ EXT4_FREE_BLOCKS_NO_QUOT_UPDATE,	"NO_QUOTA" },	\
+	{ EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER,"1ST_CLUSTER" },\
+	{ EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER,	"LAST_CLUSTER" })
+
+#define show_extent_status(status) __print_flags(status, "",	\
+	{ (1 << 3),	"W" }, 					\
+	{ (1 << 2),	"U" },					\
+	{ (1 << 1),	"D" },					\
+	{ (1 << 0),	"H" })
+
+
 TRACE_EVENT(ext4_free_inode,
 	TP_PROTO(struct inode *inode),
 
@@ -281,7 +332,7 @@
 	TP_ARGS(inode, pos, len, copied)
 );
 
-TRACE_EVENT(ext4_da_writepages,
+TRACE_EVENT(ext4_writepages,
 	TP_PROTO(struct inode *inode, struct writeback_control *wbc),
 
 	TP_ARGS(inode, wbc),
@@ -324,46 +375,62 @@
 );
 
 TRACE_EVENT(ext4_da_write_pages,
-	TP_PROTO(struct inode *inode, struct mpage_da_data *mpd),
+	TP_PROTO(struct inode *inode, pgoff_t first_page,
+		 struct writeback_control *wbc),
 
-	TP_ARGS(inode, mpd),
+	TP_ARGS(inode, first_page, wbc),
 
 	TP_STRUCT__entry(
 		__field(	dev_t,	dev			)
 		__field(	ino_t,	ino			)
-		__field(	__u64,	b_blocknr		)
-		__field(	__u32,	b_size			)
-		__field(	__u32,	b_state			)
-		__field(	unsigned long,	first_page	)
-		__field(	int,	io_done			)
-		__field(	int,	pages_written		)
-		__field(	int,	sync_mode		)
+		__field(      pgoff_t,	first_page		)
+		__field(	 long,	nr_to_write		)
+		__field(	  int,	sync_mode		)
 	),
 
 	TP_fast_assign(
 		__entry->dev		= inode->i_sb->s_dev;
 		__entry->ino		= inode->i_ino;
-		__entry->b_blocknr	= mpd->b_blocknr;
-		__entry->b_size		= mpd->b_size;
-		__entry->b_state	= mpd->b_state;
-		__entry->first_page	= mpd->first_page;
-		__entry->io_done	= mpd->io_done;
-		__entry->pages_written	= mpd->pages_written;
-		__entry->sync_mode	= mpd->wbc->sync_mode;
+		__entry->first_page	= first_page;
+		__entry->nr_to_write	= wbc->nr_to_write;
+		__entry->sync_mode	= wbc->sync_mode;
 	),
 
-	TP_printk("dev %d,%d ino %lu b_blocknr %llu b_size %u b_state 0x%04x "
-		  "first_page %lu io_done %d pages_written %d sync_mode %d",
+	TP_printk("dev %d,%d ino %lu first_page %lu nr_to_write %ld "
+		  "sync_mode %d",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  (unsigned long) __entry->ino,
-		  __entry->b_blocknr, __entry->b_size,
-		  __entry->b_state, __entry->first_page,
-		  __entry->io_done, __entry->pages_written,
-		  __entry->sync_mode
-                  )
+		  (unsigned long) __entry->ino, __entry->first_page,
+		  __entry->nr_to_write, __entry->sync_mode)
 );
 
-TRACE_EVENT(ext4_da_writepages_result,
+TRACE_EVENT(ext4_da_write_pages_extent,
+	TP_PROTO(struct inode *inode, struct ext4_map_blocks *map),
+
+	TP_ARGS(inode, map),
+
+	TP_STRUCT__entry(
+		__field(	dev_t,	dev			)
+		__field(	ino_t,	ino			)
+		__field(	__u64,	lblk			)
+		__field(	__u32,	len			)
+		__field(	__u32,	flags			)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= inode->i_sb->s_dev;
+		__entry->ino		= inode->i_ino;
+		__entry->lblk		= map->m_lblk;
+		__entry->len		= map->m_len;
+		__entry->flags		= map->m_flags;
+	),
+
+	TP_printk("dev %d,%d ino %lu lblk %llu len %u flags %s",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  (unsigned long) __entry->ino, __entry->lblk, __entry->len,
+		  show_mflags(__entry->flags))
+);
+
+TRACE_EVENT(ext4_writepages_result,
 	TP_PROTO(struct inode *inode, struct writeback_control *wbc,
 			int ret, int pages_written),
 
@@ -444,16 +511,16 @@
 );
 
 DECLARE_EVENT_CLASS(ext4_invalidatepage_op,
-	TP_PROTO(struct page *page, unsigned long offset),
+	TP_PROTO(struct page *page, unsigned int offset, unsigned int length),
 
-	TP_ARGS(page, offset),
+	TP_ARGS(page, offset, length),
 
 	TP_STRUCT__entry(
 		__field(	dev_t,	dev			)
 		__field(	ino_t,	ino			)
 		__field(	pgoff_t, index			)
-		__field(	unsigned long, offset		)
-
+		__field(	unsigned int, offset		)
+		__field(	unsigned int, length		)
 	),
 
 	TP_fast_assign(
@@ -461,24 +528,26 @@
 		__entry->ino	= page->mapping->host->i_ino;
 		__entry->index	= page->index;
 		__entry->offset	= offset;
+		__entry->length	= length;
 	),
 
-	TP_printk("dev %d,%d ino %lu page_index %lu offset %lu",
+	TP_printk("dev %d,%d ino %lu page_index %lu offset %u length %u",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  (unsigned long) __entry->ino,
-		  (unsigned long) __entry->index, __entry->offset)
+		  (unsigned long) __entry->index,
+		  __entry->offset, __entry->length)
 );
 
 DEFINE_EVENT(ext4_invalidatepage_op, ext4_invalidatepage,
-	TP_PROTO(struct page *page, unsigned long offset),
+	TP_PROTO(struct page *page, unsigned int offset, unsigned int length),
 
-	TP_ARGS(page, offset)
+	TP_ARGS(page, offset, length)
 );
 
 DEFINE_EVENT(ext4_invalidatepage_op, ext4_journalled_invalidatepage,
-	TP_PROTO(struct page *page, unsigned long offset),
+	TP_PROTO(struct page *page, unsigned int offset, unsigned int length),
 
-	TP_ARGS(page, offset)
+	TP_ARGS(page, offset, length)
 );
 
 TRACE_EVENT(ext4_discard_blocks,
@@ -673,10 +742,10 @@
 		__entry->flags	= ar->flags;
 	),
 
-	TP_printk("dev %d,%d ino %lu flags %u len %u lblk %u goal %llu "
+	TP_printk("dev %d,%d ino %lu flags %s len %u lblk %u goal %llu "
 		  "lleft %u lright %u pleft %llu pright %llu ",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  (unsigned long) __entry->ino, __entry->flags,
+		  (unsigned long) __entry->ino, show_mballoc_flags(__entry->flags),
 		  __entry->len, __entry->logical, __entry->goal,
 		  __entry->lleft, __entry->lright, __entry->pleft,
 		  __entry->pright)
@@ -715,10 +784,10 @@
 		__entry->flags	= ar->flags;
 	),
 
-	TP_printk("dev %d,%d ino %lu flags %u len %u block %llu lblk %u "
+	TP_printk("dev %d,%d ino %lu flags %s len %u block %llu lblk %u "
 		  "goal %llu lleft %u lright %u pleft %llu pright %llu",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  (unsigned long) __entry->ino, __entry->flags,
+		  (unsigned long) __entry->ino, show_mballoc_flags(__entry->flags),
 		  __entry->len, __entry->block, __entry->logical,
 		  __entry->goal,  __entry->lleft, __entry->lright,
 		  __entry->pleft, __entry->pright)
@@ -748,11 +817,11 @@
 		__entry->mode		= inode->i_mode;
 	),
 
-	TP_printk("dev %d,%d ino %lu mode 0%o block %llu count %lu flags %d",
+	TP_printk("dev %d,%d ino %lu mode 0%o block %llu count %lu flags %s",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  (unsigned long) __entry->ino,
 		  __entry->mode, __entry->block, __entry->count,
-		  __entry->flags)
+		  show_free_flags(__entry->flags))
 );
 
 TRACE_EVENT(ext4_sync_file_enter,
@@ -903,7 +972,7 @@
 	),
 
 	TP_printk("dev %d,%d inode %lu orig %u/%d/%u@%u goal %u/%d/%u@%u "
-		  "result %u/%d/%u@%u blks %u grps %u cr %u flags 0x%04x "
+		  "result %u/%d/%u@%u blks %u grps %u cr %u flags %s "
 		  "tail %u broken %u",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  (unsigned long) __entry->ino,
@@ -914,7 +983,7 @@
 		  __entry->result_group, __entry->result_start,
 		  __entry->result_len, __entry->result_logical,
 		  __entry->found, __entry->groups, __entry->cr,
-		  __entry->flags, __entry->tail,
+		  show_mballoc_flags(__entry->flags), __entry->tail,
 		  __entry->buddy ? 1 << __entry->buddy : 0)
 );
 
@@ -1528,10 +1597,10 @@
 		__entry->flags	= flags;
 	),
 
-	TP_printk("dev %d,%d ino %lu lblk %u len %u flags %u",
+	TP_printk("dev %d,%d ino %lu lblk %u len %u flags %s",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  (unsigned long) __entry->ino,
-		  __entry->lblk, __entry->len, __entry->flags)
+		  __entry->lblk, __entry->len, show_map_flags(__entry->flags))
 );
 
 DEFINE_EVENT(ext4__map_blocks_enter, ext4_ext_map_blocks_enter,
@@ -1549,47 +1618,53 @@
 );
 
 DECLARE_EVENT_CLASS(ext4__map_blocks_exit,
-	TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, int ret),
+	TP_PROTO(struct inode *inode, unsigned flags, struct ext4_map_blocks *map,
+		 int ret),
 
-	TP_ARGS(inode, map, ret),
+	TP_ARGS(inode, flags, map, ret),
 
 	TP_STRUCT__entry(
 		__field(	dev_t,		dev		)
 		__field(	ino_t,		ino		)
+		__field(	unsigned int,	flags		)
 		__field(	ext4_fsblk_t,	pblk		)
 		__field(	ext4_lblk_t,	lblk		)
 		__field(	unsigned int,	len		)
-		__field(	unsigned int,	flags		)
+		__field(	unsigned int,	mflags		)
 		__field(	int,		ret		)
 	),
 
 	TP_fast_assign(
 		__entry->dev    = inode->i_sb->s_dev;
 		__entry->ino    = inode->i_ino;
+		__entry->flags	= flags;
 		__entry->pblk	= map->m_pblk;
 		__entry->lblk	= map->m_lblk;
 		__entry->len	= map->m_len;
-		__entry->flags	= map->m_flags;
+		__entry->mflags	= map->m_flags;
 		__entry->ret	= ret;
 	),
 
-	TP_printk("dev %d,%d ino %lu lblk %u pblk %llu len %u flags %x ret %d",
+	TP_printk("dev %d,%d ino %lu flags %s lblk %u pblk %llu len %u "
+		  "mflags %s ret %d",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  (unsigned long) __entry->ino,
-		  __entry->lblk, __entry->pblk,
-		  __entry->len, __entry->flags, __entry->ret)
+		  show_map_flags(__entry->flags), __entry->lblk, __entry->pblk,
+		  __entry->len, show_mflags(__entry->mflags), __entry->ret)
 );
 
 DEFINE_EVENT(ext4__map_blocks_exit, ext4_ext_map_blocks_exit,
-	TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, int ret),
+	TP_PROTO(struct inode *inode, unsigned flags,
+		 struct ext4_map_blocks *map, int ret),
 
-	TP_ARGS(inode, map, ret)
+	TP_ARGS(inode, flags, map, ret)
 );
 
 DEFINE_EVENT(ext4__map_blocks_exit, ext4_ind_map_blocks_exit,
-	TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, int ret),
+	TP_PROTO(struct inode *inode, unsigned flags,
+		 struct ext4_map_blocks *map, int ret),
 
-	TP_ARGS(inode, map, ret)
+	TP_ARGS(inode, flags, map, ret)
 );
 
 TRACE_EVENT(ext4_ext_load_extent,
@@ -1638,25 +1713,50 @@
 );
 
 TRACE_EVENT(ext4_journal_start,
-	TP_PROTO(struct super_block *sb, int nblocks, unsigned long IP),
+	TP_PROTO(struct super_block *sb, int blocks, int rsv_blocks,
+		 unsigned long IP),
 
-	TP_ARGS(sb, nblocks, IP),
+	TP_ARGS(sb, blocks, rsv_blocks, IP),
 
 	TP_STRUCT__entry(
 		__field(	dev_t,	dev			)
 		__field(unsigned long,	ip			)
-		__field(	int,	nblocks			)
+		__field(	  int,	blocks			)
+		__field(	  int,	rsv_blocks		)
 	),
 
 	TP_fast_assign(
-		__entry->dev	 = sb->s_dev;
-		__entry->ip	 = IP;
-		__entry->nblocks = nblocks;
+		__entry->dev		 = sb->s_dev;
+		__entry->ip		 = IP;
+		__entry->blocks		 = blocks;
+		__entry->rsv_blocks	 = rsv_blocks;
 	),
 
-	TP_printk("dev %d,%d nblocks %d caller %pF",
+	TP_printk("dev %d,%d blocks, %d rsv_blocks, %d caller %pF",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  __entry->nblocks, (void *)__entry->ip)
+		  __entry->blocks, __entry->rsv_blocks, (void *)__entry->ip)
+);
+
+TRACE_EVENT(ext4_journal_start_reserved,
+	TP_PROTO(struct super_block *sb, int blocks, unsigned long IP),
+
+	TP_ARGS(sb, blocks, IP),
+
+	TP_STRUCT__entry(
+		__field(	dev_t,	dev			)
+		__field(unsigned long,	ip			)
+		__field(	  int,	blocks			)
+	),
+
+	TP_fast_assign(
+		__entry->dev		 = sb->s_dev;
+		__entry->ip		 = IP;
+		__entry->blocks		 = blocks;
+	),
+
+	TP_printk("dev %d,%d blocks, %d caller %pF",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->blocks, (void *)__entry->ip)
 );
 
 DECLARE_EVENT_CLASS(ext4__trim,
@@ -1736,12 +1836,12 @@
 		__entry->newblk		= newblock;
 	),
 
-	TP_printk("dev %d,%d ino %lu m_lblk %u m_pblk %llu m_len %u flags %x "
+	TP_printk("dev %d,%d ino %lu m_lblk %u m_pblk %llu m_len %u flags %s "
 		  "allocated %d newblock %llu",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  (unsigned long) __entry->ino,
 		  (unsigned) __entry->lblk, (unsigned long long) __entry->pblk,
-		  __entry->len, __entry->flags,
+		  __entry->len, show_map_flags(__entry->flags),
 		  (unsigned int) __entry->allocated,
 		  (unsigned long long) __entry->newblk)
 );
@@ -1769,10 +1869,10 @@
 		__entry->ret	= ret;
 	),
 
-	TP_printk("dev %d,%d m_lblk %u m_pblk %llu m_len %u m_flags %u ret %d",
+	TP_printk("dev %d,%d m_lblk %u m_pblk %llu m_len %u m_flags %s ret %d",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->lblk, (unsigned long long) __entry->pblk,
-		  __entry->len, __entry->flags, __entry->ret)
+		  __entry->len, show_mflags(__entry->flags), __entry->ret)
 );
 
 TRACE_EVENT(ext4_ext_put_in_cache,
@@ -1926,7 +2026,7 @@
 TRACE_EVENT(ext4_remove_blocks,
 	    TP_PROTO(struct inode *inode, struct ext4_extent *ex,
 		ext4_lblk_t from, ext4_fsblk_t to,
-		ext4_fsblk_t partial_cluster),
+		long long partial_cluster),
 
 	TP_ARGS(inode, ex, from, to, partial_cluster),
 
@@ -1935,7 +2035,7 @@
 		__field(	ino_t,		ino	)
 		__field(	ext4_lblk_t,	from	)
 		__field(	ext4_lblk_t,	to	)
-		__field(	ext4_fsblk_t,	partial	)
+		__field(	long long,	partial	)
 		__field(	ext4_fsblk_t,	ee_pblk	)
 		__field(	ext4_lblk_t,	ee_lblk	)
 		__field(	unsigned short,	ee_len	)
@@ -1953,7 +2053,7 @@
 	),
 
 	TP_printk("dev %d,%d ino %lu extent [%u(%llu), %u]"
-		  "from %u to %u partial_cluster %u",
+		  "from %u to %u partial_cluster %lld",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  (unsigned long) __entry->ino,
 		  (unsigned) __entry->ee_lblk,
@@ -1961,19 +2061,20 @@
 		  (unsigned short) __entry->ee_len,
 		  (unsigned) __entry->from,
 		  (unsigned) __entry->to,
-		  (unsigned) __entry->partial)
+		  (long long) __entry->partial)
 );
 
 TRACE_EVENT(ext4_ext_rm_leaf,
 	TP_PROTO(struct inode *inode, ext4_lblk_t start,
-		 struct ext4_extent *ex, ext4_fsblk_t partial_cluster),
+		 struct ext4_extent *ex,
+		 long long partial_cluster),
 
 	TP_ARGS(inode, start, ex, partial_cluster),
 
 	TP_STRUCT__entry(
 		__field(	dev_t,		dev	)
 		__field(	ino_t,		ino	)
-		__field(	ext4_fsblk_t,	partial	)
+		__field(	long long,	partial	)
 		__field(	ext4_lblk_t,	start	)
 		__field(	ext4_lblk_t,	ee_lblk	)
 		__field(	ext4_fsblk_t,	ee_pblk	)
@@ -1991,14 +2092,14 @@
 	),
 
 	TP_printk("dev %d,%d ino %lu start_lblk %u last_extent [%u(%llu), %u]"
-		  "partial_cluster %u",
+		  "partial_cluster %lld",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  (unsigned long) __entry->ino,
 		  (unsigned) __entry->start,
 		  (unsigned) __entry->ee_lblk,
 		  (unsigned long long) __entry->ee_pblk,
 		  (unsigned short) __entry->ee_len,
-		  (unsigned) __entry->partial)
+		  (long long) __entry->partial)
 );
 
 TRACE_EVENT(ext4_ext_rm_idx,
@@ -2025,14 +2126,16 @@
 );
 
 TRACE_EVENT(ext4_ext_remove_space,
-	TP_PROTO(struct inode *inode, ext4_lblk_t start, int depth),
+	TP_PROTO(struct inode *inode, ext4_lblk_t start,
+		 ext4_lblk_t end, int depth),
 
-	TP_ARGS(inode, start, depth),
+	TP_ARGS(inode, start, end, depth),
 
 	TP_STRUCT__entry(
 		__field(	dev_t,		dev	)
 		__field(	ino_t,		ino	)
 		__field(	ext4_lblk_t,	start	)
+		__field(	ext4_lblk_t,	end	)
 		__field(	int,		depth	)
 	),
 
@@ -2040,28 +2143,31 @@
 		__entry->dev	= inode->i_sb->s_dev;
 		__entry->ino	= inode->i_ino;
 		__entry->start	= start;
+		__entry->end	= end;
 		__entry->depth	= depth;
 	),
 
-	TP_printk("dev %d,%d ino %lu since %u depth %d",
+	TP_printk("dev %d,%d ino %lu since %u end %u depth %d",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  (unsigned long) __entry->ino,
 		  (unsigned) __entry->start,
+		  (unsigned) __entry->end,
 		  __entry->depth)
 );
 
 TRACE_EVENT(ext4_ext_remove_space_done,
-	TP_PROTO(struct inode *inode, ext4_lblk_t start, int depth,
-		ext4_lblk_t partial, __le16 eh_entries),
+	TP_PROTO(struct inode *inode, ext4_lblk_t start, ext4_lblk_t end,
+		 int depth, long long partial, __le16 eh_entries),
 
-	TP_ARGS(inode, start, depth, partial, eh_entries),
+	TP_ARGS(inode, start, end, depth, partial, eh_entries),
 
 	TP_STRUCT__entry(
 		__field(	dev_t,		dev		)
 		__field(	ino_t,		ino		)
 		__field(	ext4_lblk_t,	start		)
+		__field(	ext4_lblk_t,	end		)
 		__field(	int,		depth		)
-		__field(	ext4_lblk_t,	partial		)
+		__field(	long long,	partial		)
 		__field(	unsigned short,	eh_entries	)
 	),
 
@@ -2069,18 +2175,20 @@
 		__entry->dev		= inode->i_sb->s_dev;
 		__entry->ino		= inode->i_ino;
 		__entry->start		= start;
+		__entry->end		= end;
 		__entry->depth		= depth;
 		__entry->partial	= partial;
 		__entry->eh_entries	= le16_to_cpu(eh_entries);
 	),
 
-	TP_printk("dev %d,%d ino %lu since %u depth %d partial %u "
+	TP_printk("dev %d,%d ino %lu since %u end %u depth %d partial %lld "
 		  "remaining_entries %u",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  (unsigned long) __entry->ino,
 		  (unsigned) __entry->start,
+		  (unsigned) __entry->end,
 		  __entry->depth,
-		  (unsigned) __entry->partial,
+		  (long long) __entry->partial,
 		  (unsigned short) __entry->eh_entries)
 );
 
@@ -2095,7 +2203,7 @@
 		__field(	ext4_lblk_t,	lblk		)
 		__field(	ext4_lblk_t,	len		)
 		__field(	ext4_fsblk_t,	pblk		)
-		__field(	unsigned long long, status	)
+		__field(	char, status	)
 	),
 
 	TP_fast_assign(
@@ -2104,14 +2212,14 @@
 		__entry->lblk	= es->es_lblk;
 		__entry->len	= es->es_len;
 		__entry->pblk	= ext4_es_pblock(es);
-		__entry->status	= ext4_es_status(es);
+		__entry->status	= ext4_es_status(es) >> 60;
 	),
 
-	TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %llx",
+	TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  (unsigned long) __entry->ino,
 		  __entry->lblk, __entry->len,
-		  __entry->pblk, __entry->status)
+		  __entry->pblk, show_extent_status(__entry->status))
 );
 
 TRACE_EVENT(ext4_es_remove_extent,
@@ -2172,7 +2280,7 @@
 		__field(	ext4_lblk_t,	lblk		)
 		__field(	ext4_lblk_t,	len		)
 		__field(	ext4_fsblk_t,	pblk		)
-		__field(	unsigned long long, status	)
+		__field(	char, status	)
 	),
 
 	TP_fast_assign(
@@ -2181,14 +2289,14 @@
 		__entry->lblk	= es->es_lblk;
 		__entry->len	= es->es_len;
 		__entry->pblk	= ext4_es_pblock(es);
-		__entry->status	= ext4_es_status(es);
+		__entry->status	= ext4_es_status(es) >> 60;
 	),
 
-	TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %llx",
+	TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  (unsigned long) __entry->ino,
 		  __entry->lblk, __entry->len,
-		  __entry->pblk, __entry->status)
+		  __entry->pblk, show_extent_status(__entry->status))
 );
 
 TRACE_EVENT(ext4_es_lookup_extent_enter,
@@ -2225,7 +2333,7 @@
 		__field(	ext4_lblk_t,	lblk		)
 		__field(	ext4_lblk_t,	len		)
 		__field(	ext4_fsblk_t,	pblk		)
-		__field(	unsigned long long,	status	)
+		__field(	char,		status		)
 		__field(	int,		found		)
 	),
 
@@ -2235,16 +2343,16 @@
 		__entry->lblk	= es->es_lblk;
 		__entry->len	= es->es_len;
 		__entry->pblk	= ext4_es_pblock(es);
-		__entry->status	= ext4_es_status(es);
+		__entry->status	= ext4_es_status(es) >> 60;
 		__entry->found	= found;
 	),
 
-	TP_printk("dev %d,%d ino %lu found %d [%u/%u) %llu %llx",
+	TP_printk("dev %d,%d ino %lu found %d [%u/%u) %llu %s",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  (unsigned long) __entry->ino, __entry->found,
 		  __entry->lblk, __entry->len,
 		  __entry->found ? __entry->pblk : 0,
-		  __entry->found ? __entry->status : 0)
+		  show_extent_status(__entry->found ? __entry->status : 0))
 );
 
 TRACE_EVENT(ext4_es_shrink_enter,

diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild
index ab5d499..bdc6e87f 100644
--- a/include/uapi/linux/Kbuild
+++ b/include/uapi/linux/Kbuild

@@ -261,6 +261,7 @@
 header-y += net_tstamp.h
 header-y += netconf.h
 header-y += netdevice.h
+header-y += netlink_diag.h
 header-y += netfilter.h
 header-y += netfilter_arp.h
 header-y += netfilter_bridge.h

diff --git a/include/uapi/linux/serial_core.h b/include/uapi/linux/serial_core.h
index 74c2bf7..c8eaeb5 100644
--- a/include/uapi/linux/serial_core.h
+++ b/include/uapi/linux/serial_core.h

@@ -226,4 +226,7 @@
 /* Rocketport EXPRESS/INFINITY */
 #define PORT_RP2	102
 
+/* Freescale lpuart */
+#define PORT_LPUART	103
+
 #endif /* _UAPILINUX_SERIAL_CORE_H */

diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index a64f8ae..20185ea 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c

@@ -120,7 +120,7 @@
 	list_for_each_entry(iter, &bp_task_head, hw.bp_list) {
 		if (iter->hw.bp_target == tsk &&
 		    find_slot_idx(iter) == type &&
-		    cpu == iter->cpu)
+		    (iter->cpu < 0 || cpu == iter->cpu))
 			count += hw_breakpoint_weight(iter);
 	}
 
@@ -149,7 +149,7 @@
 		return;
 	}
 
-	for_each_online_cpu(cpu) {
+	for_each_possible_cpu(cpu) {
 		unsigned int nr;
 
 		nr = per_cpu(nr_cpu_bp_pinned[type], cpu);
@@ -235,7 +235,7 @@
 	if (cpu >= 0) {
 		toggle_bp_task_slot(bp, cpu, enable, type, weight);
 	} else {
-		for_each_online_cpu(cpu)
+		for_each_possible_cpu(cpu)
 			toggle_bp_task_slot(bp, cpu, enable, type, weight);
 	}
 

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index aed981a..335a7ae 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c

@@ -665,20 +665,22 @@
 		if (unlikely(is_compat_task())) {
 			compat_siginfo_t __user *uinfo = compat_ptr(data);
 
-			ret = copy_siginfo_to_user32(uinfo, &info);
-			ret |= __put_user(info.si_code, &uinfo->si_code);
+			if (copy_siginfo_to_user32(uinfo, &info) ||
+			    __put_user(info.si_code, &uinfo->si_code)) {
+				ret = -EFAULT;
+				break;
+			}
+
 		} else
 #endif
 		{
 			siginfo_t __user *uinfo = (siginfo_t __user *) data;
 
-			ret = copy_siginfo_to_user(uinfo, &info);
-			ret |= __put_user(info.si_code, &uinfo->si_code);
-		}
-
-		if (ret) {
-			ret = -EFAULT;
-			break;
+			if (copy_siginfo_to_user(uinfo, &info) ||
+			    __put_user(info.si_code, &uinfo->si_code)) {
+				ret = -EFAULT;
+				break;
+			}
 		}
 
 		data += sizeof(siginfo_t);

diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index b4c2455..20d6fba 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c

@@ -599,8 +599,6 @@
 	} else {
 		if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) {
 			clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
-			if (dev->next_event.tv64 == KTIME_MAX)
-				goto out;
 			/*
 			 * The cpu which was handling the broadcast
 			 * timer marked this cpu in the broadcast
@@ -615,6 +613,11 @@
 				goto out;
 
 			/*
+			 * Bail out if there is no next event.
+			 */
+			if (dev->next_event.tv64 == KTIME_MAX)
+				goto out;
+			/*
 			 * If the pending bit is not set, then we are
 			 * either the CPU handling the broadcast
 			 * interrupt or we got woken by something else.

diff --git a/kernel/wait.c b/kernel/wait.c
index 6698e0c..ce0daa3 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c

@@ -287,3 +287,91 @@
 	return &zone->wait_table[hash_long(val, zone->wait_table_bits)];
 }
 EXPORT_SYMBOL(bit_waitqueue);
+
+/*
+ * Manipulate the atomic_t address to produce a better bit waitqueue table hash
+ * index (we're keying off bit -1, but that would produce a horrible hash
+ * value).
+ */
+static inline wait_queue_head_t *atomic_t_waitqueue(atomic_t *p)
+{
+	if (BITS_PER_LONG == 64) {
+		unsigned long q = (unsigned long)p;
+		return bit_waitqueue((void *)(q & ~1), q & 1);
+	}
+	return bit_waitqueue(p, 0);
+}
+
+static int wake_atomic_t_function(wait_queue_t *wait, unsigned mode, int sync,
+				  void *arg)
+{
+	struct wait_bit_key *key = arg;
+	struct wait_bit_queue *wait_bit
+		= container_of(wait, struct wait_bit_queue, wait);
+	atomic_t *val = key->flags;
+
+	if (wait_bit->key.flags != key->flags ||
+	    wait_bit->key.bit_nr != key->bit_nr ||
+	    atomic_read(val) != 0)
+		return 0;
+	return autoremove_wake_function(wait, mode, sync, key);
+}
+
+/*
+ * Wait for the atomic_t keyed in @q to hit zero; @action, called only while
+ * it is nonzero, does the waiting.  To allow interruptible and nonblocking
+ * waits, a nonzero return from @action halts waiting and is returned.
+ */
+static __sched
+int __wait_on_atomic_t(wait_queue_head_t *wq, struct wait_bit_queue *q,
+		       int (*action)(atomic_t *), unsigned mode)
+{
+	atomic_t *val;
+	int ret = 0;
+
+	do {
+		prepare_to_wait(wq, &q->wait, mode);
+		val = q->key.flags;
+		if (atomic_read(val) != 0)
+			ret = (*action)(val);
+	} while (!ret && atomic_read(val) != 0);
+	finish_wait(wq, &q->wait);
+	return ret;
+}
+
+#define DEFINE_WAIT_ATOMIC_T(name, p)					\
+	struct wait_bit_queue name = {					\
+		.key = __WAIT_ATOMIC_T_KEY_INITIALIZER(p),		\
+		.wait	= {						\
+			.private	= current,			\
+			.func		= wake_atomic_t_function,	\
+			.task_list	=				\
+				LIST_HEAD_INIT((name).wait.task_list),	\
+		},							\
+	}
+
+__sched int out_of_line_wait_on_atomic_t(atomic_t *p, int (*action)(atomic_t *),
+					 unsigned mode)
+{
+	wait_queue_head_t *wq = atomic_t_waitqueue(p);
+	DEFINE_WAIT_ATOMIC_T(wait, p);
+
+	return __wait_on_atomic_t(wq, &wait, action, mode);
+}
+EXPORT_SYMBOL(out_of_line_wait_on_atomic_t);
+
+/**
+ * wake_up_atomic_t - Wake up a waiter on an atomic_t
+ * @p: The atomic_t being waited on (the counter that waiters want to see
+ *     reach zero), a kernel virtual address
+ *
+ * Wake up anyone waiting for the atomic_t to go to zero.
+ *
+ * Abuse the bit-waker function and its waitqueue hash table set (the atomic_t
+ * check is done by the waiter's wake function, not by the waker itself).
+ */
+void wake_up_atomic_t(atomic_t *p)
+{
+	__wake_up_bit(atomic_t_waitqueue(p), p, WAIT_ATOMIC_T_BIT_NR);
+}
+EXPORT_SYMBOL(wake_up_atomic_t);

diff --git a/lib/Kconfig b/lib/Kconfig
index fe01d41..d246a3b 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig

@@ -22,6 +22,9 @@
 config GENERIC_STRNLEN_USER
 	bool
 
+config GENERIC_NET_UTILS
+	bool
+
 config GENERIC_FIND_FIRST_BIT
 	bool
 

diff --git a/lib/Makefile b/lib/Makefile
index c55a037..22f0f4e 100644
--- a/lib/Makefile
+++ b/lib/Makefile

@@ -137,6 +137,8 @@
 obj-$(CONFIG_GENERIC_STRNCPY_FROM_USER) += strncpy_from_user.o
 obj-$(CONFIG_GENERIC_STRNLEN_USER) += strnlen_user.o
 
+obj-$(CONFIG_GENERIC_NET_UTILS) += net_utils.o
+
 obj-$(CONFIG_STMP_DEVICE) += stmp_device.o
 
 libfdt_files = fdt.o fdt_ro.o fdt_wip.o fdt_rw.o fdt_sw.o fdt_strerror.o

diff --git a/lib/net_utils.c b/lib/net_utils.c
new file mode 100644
index 0000000..2e3c52c
--- /dev/null
+++ b/lib/net_utils.c

@@ -0,0 +1,26 @@
+#include <linux/string.h>
+#include <linux/if_ether.h>
+#include <linux/ctype.h>
+#include <linux/kernel.h>
+
+int mac_pton(const char *s, u8 *mac)
+{
+	int i;
+
+	/* XX:XX:XX:XX:XX:XX */
+	if (strlen(s) < 3 * ETH_ALEN - 1)
+		return 0;
+
+	/* Don't dirty result unless string is valid MAC. */
+	for (i = 0; i < ETH_ALEN; i++) {
+		if (!isxdigit(s[i * 3]) || !isxdigit(s[i * 3 + 1]))
+			return 0;
+		if (i != ETH_ALEN - 1 && s[i * 3 + 2] != ':')
+			return 0;
+	}
+	for (i = 0; i < ETH_ALEN; i++) {
+		mac[i] = (hex_to_bin(s[i * 3]) << 4) | hex_to_bin(s[i * 3 + 1]);
+	}
+	return 1;
+}
+EXPORT_SYMBOL(mac_pton);

diff --git a/mm/readahead.c b/mm/readahead.c
index daed28d..829a77c 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c

@@ -48,7 +48,7 @@
 		if (!trylock_page(page))
 			BUG();
 		page->mapping = mapping;
-		do_invalidatepage(page, 0);
+		do_invalidatepage(page, 0, PAGE_CACHE_SIZE);
 		page->mapping = NULL;
 		unlock_page(page);
 	}

diff --git a/mm/truncate.c b/mm/truncate.c
index c75b736..e2e8a8a 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c

@@ -26,7 +26,8 @@
 /**
  * do_invalidatepage - invalidate part or all of a page
  * @page: the page which is affected
- * @offset: the index of the truncation point
+ * @offset: start of the range to invalidate
+ * @length: length of the range to invalidate
  *
  * do_invalidatepage() is called when all or part of the page has become
  * invalidated by a truncate operation.
@@ -37,24 +38,18 @@
  * point.  Because the caller is about to free (and possibly reuse) those
  * blocks on-disk.
  */
-void do_invalidatepage(struct page *page, unsigned long offset)
+void do_invalidatepage(struct page *page, unsigned int offset,
+		       unsigned int length)
 {
-	void (*invalidatepage)(struct page *, unsigned long);
+	void (*invalidatepage)(struct page *, unsigned int, unsigned int);
+
 	invalidatepage = page->mapping->a_ops->invalidatepage;
 #ifdef CONFIG_BLOCK
 	if (!invalidatepage)
 		invalidatepage = block_invalidatepage;
 #endif
 	if (invalidatepage)
-		(*invalidatepage)(page, offset);
-}
-
-static inline void truncate_partial_page(struct page *page, unsigned partial)
-{
-	zero_user_segment(page, partial, PAGE_CACHE_SIZE);
-	cleancache_invalidate_page(page->mapping, page);
-	if (page_has_private(page))
-		do_invalidatepage(page, partial);
+		(*invalidatepage)(page, offset, length);
 }
 
 /*
@@ -103,7 +98,7 @@
 		return -EIO;
 
 	if (page_has_private(page))
-		do_invalidatepage(page, 0);
+		do_invalidatepage(page, 0, PAGE_CACHE_SIZE);
 
 	cancel_dirty_page(page, PAGE_CACHE_SIZE);
 
@@ -185,11 +180,11 @@
  * truncate_inode_pages_range - truncate range of pages specified by start & end byte offsets
  * @mapping: mapping to truncate
  * @lstart: offset from which to truncate
- * @lend: offset to which to truncate
+ * @lend: offset to which to truncate (inclusive)
  *
  * Truncate the page cache, removing the pages that are between
- * specified offsets (and zeroing out partial page
- * (if lstart is not page aligned)).
+ * specified offsets (and zeroing out partial pages
+ * if lstart or lend + 1 is not page aligned).
  *
  * Truncate takes two passes - the first pass is nonblocking.  It will not
  * block on page locks and it will not block on writeback.  The second pass
@@ -200,35 +195,58 @@
  * We pass down the cache-hot hint to the page freeing code.  Even if the
  * mapping is large, it is probably the case that the final pages are the most
  * recently touched, and freeing happens in ascending file offset order.
+ *
+ * Note that since ->invalidatepage() accepts a range to invalidate,
+ * truncate_inode_pages_range is able to properly handle cases where
+ * lend + 1 is not page aligned.
  */
 void truncate_inode_pages_range(struct address_space *mapping,
 				loff_t lstart, loff_t lend)
 {
-	const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
-	const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
-	struct pagevec pvec;
-	pgoff_t index;
-	pgoff_t end;
-	int i;
+	pgoff_t		start;		/* inclusive */
+	pgoff_t		end;		/* exclusive */
+	unsigned int	partial_start;	/* inclusive */
+	unsigned int	partial_end;	/* exclusive */
+	struct pagevec	pvec;
+	pgoff_t		index;
+	int		i;
 
 	cleancache_invalidate_inode(mapping);
 	if (mapping->nrpages == 0)
 		return;
 
-	BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1));
-	end = (lend >> PAGE_CACHE_SHIFT);
+	/* Offsets within partial pages */
+	partial_start = lstart & (PAGE_CACHE_SIZE - 1);
+	partial_end = (lend + 1) & (PAGE_CACHE_SIZE - 1);
+
+	/*
+	 * 'start' and 'end' always cover the range of pages to be fully
+	 * truncated. Partial pages are covered with 'partial_start' at the
+	 * start of the range and 'partial_end' at the end of the range.
+	 * Note that 'end' is exclusive while 'lend' is inclusive.
+	 */
+	start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	if (lend == -1)
+		/*
+		 * lend == -1 indicates end-of-file so we have to set 'end'
+		 * to the highest possible pgoff_t and since the type is
+		 * unsigned we're using -1.
+		 */
+		end = -1;
+	else
+		end = (lend + 1) >> PAGE_CACHE_SHIFT;
 
 	pagevec_init(&pvec, 0);
 	index = start;
-	while (index <= end && pagevec_lookup(&pvec, mapping, index,
-			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
+	while (index < end && pagevec_lookup(&pvec, mapping, index,
+			min(end - index, (pgoff_t)PAGEVEC_SIZE))) {
 		mem_cgroup_uncharge_start();
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
 
 			/* We rely upon deletion not changing page->index */
 			index = page->index;
-			if (index > end)
+			if (index >= end)
 				break;
 
 			if (!trylock_page(page))
@@ -247,27 +265,56 @@
 		index++;
 	}
 
-	if (partial) {
+	if (partial_start) {
 		struct page *page = find_lock_page(mapping, start - 1);
 		if (page) {
+			unsigned int top = PAGE_CACHE_SIZE;
+			if (start > end) {
+				/* Truncation within a single page */
+				top = partial_end;
+				partial_end = 0;
+			}
 			wait_on_page_writeback(page);
-			truncate_partial_page(page, partial);
+			zero_user_segment(page, partial_start, top);
+			cleancache_invalidate_page(mapping, page);
+			if (page_has_private(page))
+				do_invalidatepage(page, partial_start,
+						  top - partial_start);
 			unlock_page(page);
 			page_cache_release(page);
 		}
 	}
+	if (partial_end) {
+		struct page *page = find_lock_page(mapping, end);
+		if (page) {
+			wait_on_page_writeback(page);
+			zero_user_segment(page, 0, partial_end);
+			cleancache_invalidate_page(mapping, page);
+			if (page_has_private(page))
+				do_invalidatepage(page, 0,
+						  partial_end);
+			unlock_page(page);
+			page_cache_release(page);
+		}
+	}
+	/*
+	 * If the truncation happened within a single page no pages
+	 * will be released, just zeroed, so we can bail out now.
+	 */
+	if (start >= end)
+		return;
 
 	index = start;
 	for ( ; ; ) {
 		cond_resched();
 		if (!pagevec_lookup(&pvec, mapping, index,
-			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
+			min(end - index, (pgoff_t)PAGEVEC_SIZE))) {
 			if (index == start)
 				break;
 			index = start;
 			continue;
 		}
-		if (index == start && pvec.pages[0]->index > end) {
+		if (index == start && pvec.pages[0]->index >= end) {
 			pagevec_release(&pvec);
 			break;
 		}
@@ -277,7 +324,7 @@
 
 			/* We rely upon deletion not changing page->index */
 			index = page->index;
-			if (index > end)
+			if (index >= end)
 				break;
 
 			lock_page(page);
@@ -598,10 +645,8 @@
 	 * This rounding is currently just for example: unmap_mapping_range
 	 * expands its hole outwards, whereas we want it to contract the hole
 	 * inwards.  However, existing callers of truncate_pagecache_range are
-	 * doing their own page rounding first; and truncate_inode_pages_range
-	 * currently BUGs if lend is not pagealigned-1 (it handles partial
-	 * page at start of hole, but not partial page at end of hole).  Note
-	 * unmap_mapping_range allows holelen 0 for all, and we allow lend -1.
+	 * doing their own page rounding first.  Note that unmap_mapping_range
+	 * allows holelen 0 for all, and we allow lend -1 for end of file.
 	 */
 
 	/*

diff --git a/net/Kconfig b/net/Kconfig
index 2ddc904..6dfe1c6 100644
--- a/net/Kconfig
+++ b/net/Kconfig

@@ -5,6 +5,7 @@
 menuconfig NET
 	bool "Networking support"
 	select NLATTR
+	select GENERIC_NET_UTILS
 	---help---
 	  Unless you really know what you are doing, you should say Y here.
 	  The reason is that some programs need kernel networking support even

diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index d817c93..ace5e55 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c

@@ -341,7 +341,6 @@
 
 static void bredr_setup(struct hci_request *req)
 {
-	struct hci_cp_delete_stored_link_key cp;
 	__le16 param;
 	__u8 flt_type;
 
@@ -365,10 +364,6 @@
 	param = __constant_cpu_to_le16(0x7d00);
 	hci_req_add(req, HCI_OP_WRITE_CA_TIMEOUT, 2, &param);
 
-	bacpy(&cp.bdaddr, BDADDR_ANY);
-	cp.delete_all = 0x01;
-	hci_req_add(req, HCI_OP_DELETE_STORED_LINK_KEY, sizeof(cp), &cp);
-
 	/* Read page scan parameters */
 	if (req->hdev->hci_ver > BLUETOOTH_VER_1_1) {
 		hci_req_add(req, HCI_OP_READ_PAGE_SCAN_ACTIVITY, 0, NULL);
@@ -602,6 +597,16 @@
 	struct hci_dev *hdev = req->hdev;
 	u8 p;
 
+	/* Only send HCI_Delete_Stored_Link_Key if it is supported */
+	if (hdev->commands[6] & 0x80) {
+		struct hci_cp_delete_stored_link_key cp;
+
+		bacpy(&cp.bdaddr, BDADDR_ANY);
+		cp.delete_all = 0x01;
+		hci_req_add(req, HCI_OP_DELETE_STORED_LINK_KEY,
+			    sizeof(cp), &cp);
+	}
+
 	if (hdev->commands[5] & 0x10)
 		hci_setup_link_policy(req);
 

diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c
index 24bee07..68843a2 100644
--- a/net/bluetooth/l2cap_core.c
+++ b/net/bluetooth/l2cap_core.c

@@ -2852,6 +2852,9 @@
 	BT_DBG("conn %p, code 0x%2.2x, ident 0x%2.2x, len %u",
 	       conn, code, ident, dlen);
 
+	if (conn->mtu < L2CAP_HDR_SIZE + L2CAP_CMD_HDR_SIZE)
+		return NULL;
+
 	len = L2CAP_HDR_SIZE + L2CAP_CMD_HDR_SIZE + dlen;
 	count = min_t(unsigned int, conn->mtu, len);
 
@@ -4330,7 +4333,7 @@
 	struct l2cap_info_rsp *rsp = (struct l2cap_info_rsp *) data;
 	u16 type, result;
 
-	if (cmd_len != sizeof(*rsp))
+	if (cmd_len < sizeof(*rsp))
 		return -EPROTO;
 
 	type   = __le16_to_cpu(rsp->type);

diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 81f2389..d6448e3 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c

@@ -465,8 +465,9 @@
 	skb_set_transport_header(skb, skb->len);
 	mldq = (struct mld_msg *) icmp6_hdr(skb);
 
-	interval = ipv6_addr_any(group) ? br->multicast_last_member_interval :
-					  br->multicast_query_response_interval;
+	interval = ipv6_addr_any(group) ?
+			br->multicast_query_response_interval :
+			br->multicast_last_member_interval;
 
 	mldq->mld_type = ICMPV6_MGM_QUERY;
 	mldq->mld_code = 0;

diff --git a/net/core/dev.c b/net/core/dev.c
index fc1e289..faebb39 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c

@@ -792,6 +792,40 @@
 EXPORT_SYMBOL(dev_get_by_index);
 
 /**
+ *	netdev_get_name - get a netdevice name, knowing its ifindex.
+ *	@net: network namespace
+ *	@name: a pointer to the buffer where the name will be stored.
+ *	@ifindex: the ifindex of the interface to get the name from.
+ *
+ *	The use of raw_seqcount_begin() and cond_resched() before
+ *	retrying is required as we want to give the writers a chance
+ *	to complete when CONFIG_PREEMPT is not set.
+ */
+int netdev_get_name(struct net *net, char *name, int ifindex)
+{
+	struct net_device *dev;
+	unsigned int seq;
+
+retry:
+	seq = raw_seqcount_begin(&devnet_rename_seq);
+	rcu_read_lock();
+	dev = dev_get_by_index_rcu(net, ifindex);
+	if (!dev) {
+		rcu_read_unlock();
+		return -ENODEV;
+	}
+
+	strcpy(name, dev->name);
+	rcu_read_unlock();
+	if (read_seqcount_retry(&devnet_rename_seq, seq)) {
+		cond_resched();
+		goto retry;
+	}
+
+	return 0;
+}
+
+/**
  *	dev_getbyhwaddr_rcu - find a device by its hardware address
  *	@net: the applicable net namespace
  *	@type: media type of device

diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index 6cc0481..5b7d0e1 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c

@@ -19,9 +19,8 @@
 
 static int dev_ifname(struct net *net, struct ifreq __user *arg)
 {
-	struct net_device *dev;
 	struct ifreq ifr;
-	unsigned seq;
+	int error;
 
 	/*
 	 *	Fetch the caller's info block.
@@ -30,19 +29,9 @@
 	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
 		return -EFAULT;
 
-retry:
-	seq = read_seqcount_begin(&devnet_rename_seq);
-	rcu_read_lock();
-	dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
-	if (!dev) {
-		rcu_read_unlock();
-		return -ENODEV;
-	}
-
-	strcpy(ifr.ifr_name, dev->name);
-	rcu_read_unlock();
-	if (read_seqcount_retry(&devnet_rename_seq, seq))
-		goto retry;
+	error = netdev_get_name(net, ifr.ifr_name, ifr.ifr_ifindex);
+	if (error)
+		return error;
 
 	if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
 		return -EFAULT;

diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 22efdaa..ce91766e 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c

@@ -60,10 +60,10 @@
 	[NETIF_F_IPV6_CSUM_BIT] =        "tx-checksum-ipv6",
 	[NETIF_F_HIGHDMA_BIT] =          "highdma",
 	[NETIF_F_FRAGLIST_BIT] =         "tx-scatter-gather-fraglist",
-	[NETIF_F_HW_VLAN_CTAG_TX_BIT] =  "tx-vlan-ctag-hw-insert",
+	[NETIF_F_HW_VLAN_CTAG_TX_BIT] =  "tx-vlan-hw-insert",
 
-	[NETIF_F_HW_VLAN_CTAG_RX_BIT] =  "rx-vlan-ctag-hw-parse",
-	[NETIF_F_HW_VLAN_CTAG_FILTER_BIT] = "rx-vlan-ctag-filter",
+	[NETIF_F_HW_VLAN_CTAG_RX_BIT] =  "rx-vlan-hw-parse",
+	[NETIF_F_HW_VLAN_CTAG_FILTER_BIT] = "rx-vlan-filter",
 	[NETIF_F_HW_VLAN_STAG_TX_BIT] =  "tx-vlan-stag-hw-insert",
 	[NETIF_F_HW_VLAN_STAG_RX_BIT] =  "rx-vlan-stag-hw-parse",
 	[NETIF_F_HW_VLAN_STAG_FILTER_BIT] = "rx-vlan-stag-filter",

diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index cec074b..35a9f080 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c

@@ -12,6 +12,7 @@
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
 #include <linux/moduleparam.h>
+#include <linux/kernel.h>
 #include <linux/netdevice.h>
 #include <linux/etherdevice.h>
 #include <linux/string.h>

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index cfd777b..1c1738c 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c

@@ -483,15 +483,8 @@
 
 static void skb_drop_list(struct sk_buff **listp)
 {
-	struct sk_buff *list = *listp;
-
+	kfree_skb_list(*listp);
 	*listp = NULL;
-
-	do {
-		struct sk_buff *this = list;
-		list = list->next;
-		kfree_skb(this);
-	} while (list);
 }
 
 static inline void skb_drop_fraglist(struct sk_buff *skb)
@@ -651,6 +644,17 @@
 }
 EXPORT_SYMBOL(kfree_skb);
 
+void kfree_skb_list(struct sk_buff *segs)
+{
+	while (segs) {
+		struct sk_buff *next = segs->next;
+
+		kfree_skb(segs);
+		segs = next;
+	}
+}
+EXPORT_SYMBOL(kfree_skb_list);
+
 /**
  *	skb_tx_error - report an sk_buff xmit error
  *	@skb: buffer that triggered an error

diff --git a/net/core/sock.c b/net/core/sock.c
index 88868a9..d6d024c 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c

@@ -571,9 +571,7 @@
 	int ret = -ENOPROTOOPT;
 #ifdef CONFIG_NETDEVICES
 	struct net *net = sock_net(sk);
-	struct net_device *dev;
 	char devname[IFNAMSIZ];
-	unsigned seq;
 
 	if (sk->sk_bound_dev_if == 0) {
 		len = 0;
@@ -584,20 +582,9 @@
 	if (len < IFNAMSIZ)
 		goto out;
 
-retry:
-	seq = read_seqcount_begin(&devnet_rename_seq);
-	rcu_read_lock();
-	dev = dev_get_by_index_rcu(net, sk->sk_bound_dev_if);
-	ret = -ENODEV;
-	if (!dev) {
-		rcu_read_unlock();
+	ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
+	if (ret)
 		goto out;
-	}
-
-	strcpy(devname, dev->name);
-	rcu_read_unlock();
-	if (read_seqcount_retry(&devnet_rename_seq, seq))
-		goto retry;
 
 	len = strlen(devname) + 1;
 

diff --git a/net/core/utils.c b/net/core/utils.c
index 3c7f5b5..aa88e23 100644
--- a/net/core/utils.c
+++ b/net/core/utils.c

@@ -338,25 +338,3 @@
 				  csum_unfold(*sum)));
 }
 EXPORT_SYMBOL(inet_proto_csum_replace16);
-
-int mac_pton(const char *s, u8 *mac)
-{
-	int i;
-
-	/* XX:XX:XX:XX:XX:XX */
-	if (strlen(s) < 3 * ETH_ALEN - 1)
-		return 0;
-
-	/* Don't dirty result unless string is valid MAC. */
-	for (i = 0; i < ETH_ALEN; i++) {
-		if (!isxdigit(s[i * 3]) || !isxdigit(s[i * 3 + 1]))
-			return 0;
-		if (i != ETH_ALEN - 1 && s[i * 3 + 2] != ':')
-			return 0;
-	}
-	for (i = 0; i < ETH_ALEN; i++) {
-		mac[i] = (hex_to_bin(s[i * 3]) << 4) | hex_to_bin(s[i * 3 + 1]);
-	}
-	return 1;
-}
-EXPORT_SYMBOL(mac_pton);

diff --git a/net/ipv4/gre.c b/net/ipv4/gre.c
index b2e805a..7856d16 100644
--- a/net/ipv4/gre.c
+++ b/net/ipv4/gre.c

@@ -178,7 +178,7 @@
 
 				err = __skb_linearize(skb);
 				if (err) {
-					kfree_skb(segs);
+					kfree_skb_list(segs);
 					segs = ERR_PTR(err);
 					goto out;
 				}

diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
index ff4b781..32b0e97 100644
--- a/net/ipv4/netfilter/ipt_ULOG.c
+++ b/net/ipv4/netfilter/ipt_ULOG.c

@@ -125,15 +125,16 @@
 /* timer function to flush queue in flushtimeout time */
 static void ulog_timer(unsigned long data)
 {
+	unsigned int groupnum = *((unsigned int *)data);
 	struct ulog_net *ulog = container_of((void *)data,
 					     struct ulog_net,
-					     nlgroup[*(unsigned int *)data]);
+					     nlgroup[groupnum]);
 	pr_debug("timer function called, calling ulog_send\n");
 
 	/* lock to protect against somebody modifying our structure
 	 * from ipt_ulog_target at the same time */
 	spin_lock_bh(&ulog->lock);
-	ulog_send(ulog, data);
+	ulog_send(ulog, groupnum);
 	spin_unlock_bh(&ulog->lock);
 }
 
@@ -407,8 +408,11 @@
 
 	spin_lock_init(&ulog->lock);
 	/* initialize ulog_buffers */
-	for (i = 0; i < ULOG_MAXNLGROUPS; i++)
-		setup_timer(&ulog->ulog_buffers[i].timer, ulog_timer, i);
+	for (i = 0; i < ULOG_MAXNLGROUPS; i++) {
+		ulog->nlgroup[i] = i;
+		setup_timer(&ulog->ulog_buffers[i].timer, ulog_timer,
+			    (unsigned long)&ulog->nlgroup[i]);
+	}
 
 	ulog->nflognl = netlink_kernel_create(net, NETLINK_NFLOG, &cfg);
 	if (!ulog->nflognl)

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 7196523..7999fc5 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c

@@ -1003,7 +1003,7 @@
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct tcp_md5sig_info *md5sig;
 
-	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&addr, AF_INET);
+	key = tcp_md5_do_lookup(sk, addr, family);
 	if (key) {
 		/* Pre-existing entry - just update that one. */
 		memcpy(key->key, newkey, newkeylen);
@@ -1048,7 +1048,7 @@
 	struct tcp_md5sig_key *key;
 	struct tcp_md5sig_info *md5sig;
 
-	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&addr, AF_INET);
+	key = tcp_md5_do_lookup(sk, addr, family);
 	if (!key)
 		return -ENOENT;
 	hlist_del_rcu(&key->node);

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 1bbf744..4ab4c38 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c

@@ -2655,6 +2655,9 @@
 			if (sp_ifa->flags & (IFA_F_DADFAILED | IFA_F_TENTATIVE))
 				continue;
 
+			if (sp_ifa->rt)
+				continue;
+
 			sp_rt = addrconf_dst_alloc(idev, &sp_ifa->addr, 0);
 
 			/* Failure cases are ignored */
@@ -4303,6 +4306,7 @@
 	struct inet6_ifaddr *ifp;
 	struct net_device *dev = idev->dev;
 	bool update_rs = false;
+	struct in6_addr ll_addr;
 
 	if (token == NULL)
 		return -EINVAL;
@@ -4322,11 +4326,9 @@
 
 	write_unlock_bh(&idev->lock);
 
-	if (!idev->dead && (idev->if_flags & IF_READY)) {
-		struct in6_addr ll_addr;
-
-		ipv6_get_lladdr(dev, &ll_addr, IFA_F_TENTATIVE |
-				IFA_F_OPTIMISTIC);
+	if (!idev->dead && (idev->if_flags & IF_READY) &&
+	    !ipv6_get_lladdr(dev, &ll_addr, IFA_F_TENTATIVE |
+			     IFA_F_OPTIMISTIC)) {
 
 		/* If we're not ready, then normal ifup will take care
 		 * of this. Otherwise, we need to request our rs here.

diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index dae1949..d5d20cd 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c

@@ -381,9 +381,8 @@
 	 *	cannot be fragmented, because there is no warranty
 	 *	that different fragments will go along one path. --ANK
 	 */
-	if (opt->ra) {
-		u8 *ptr = skb_network_header(skb) + opt->ra;
-		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
+	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
+		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
 			return 0;
 	}
 
@@ -822,11 +821,17 @@
 					  const struct flowi6 *fl6)
 {
 	struct ipv6_pinfo *np = inet6_sk(sk);
-	struct rt6_info *rt = (struct rt6_info *)dst;
+	struct rt6_info *rt;
 
 	if (!dst)
 		goto out;
 
+	if (dst->ops->family != AF_INET6) {
+		dst_release(dst);
+		return NULL;
+	}
+
+	rt = (struct rt6_info *)dst;
 	/* Yes, checking route validity in not connected
 	 * case is not very simple. Take into account,
 	 * that we do not support routing by source, TOS,

diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 2712ab2..ca4ffcc 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c

@@ -1493,7 +1493,7 @@
 	 */
 
 	if (ha)
-		ndisc_fill_addr_option(skb, ND_OPT_TARGET_LL_ADDR, ha);
+		ndisc_fill_addr_option(buff, ND_OPT_TARGET_LL_ADDR, ha);
 
 	/*
 	 *	build redirect option and copy skb over to the new packet.

diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
index 97bcf2b..c9b6a6e 100644
--- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c

@@ -204,7 +204,7 @@
 		if (ct != NULL && !nf_ct_is_untracked(ct)) {
 			help = nfct_help(ct);
 			if ((help && help->helper) || !nf_ct_is_confirmed(ct)) {
-				nf_conntrack_get_reasm(skb);
+				nf_conntrack_get_reasm(reasm);
 				NF_HOOK_THRESH(NFPROTO_IPV6, hooknum, reasm,
 					       (struct net_device *)in,
 					       (struct net_device *)out,

diff --git a/net/key/af_key.c b/net/key/af_key.c
index c5fbd75..9da8620 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c

@@ -1710,6 +1710,7 @@
 	hdr->sadb_msg_version = PF_KEY_V2;
 	hdr->sadb_msg_errno = (uint8_t) 0;
 	hdr->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t));
+	hdr->sadb_msg_reserved = 0;
 
 	pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ALL, NULL, c->net);
 
@@ -2699,6 +2700,7 @@
 	hdr->sadb_msg_errno = (uint8_t) 0;
 	hdr->sadb_msg_satype = SADB_SATYPE_UNSPEC;
 	hdr->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t));
+	hdr->sadb_msg_reserved = 0;
 	pfkey_broadcast(skb_out, GFP_ATOMIC, BROADCAST_ALL, NULL, c->net);
 	return 0;
 

diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 1a89c80..4fdb306e 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c

@@ -1057,6 +1057,12 @@
 	clear_bit(SDATA_STATE_OFFCHANNEL_BEACON_STOPPED, &sdata->state);
 	ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BEACON_ENABLED);
 
+	if (sdata->wdev.cac_started) {
+		cancel_delayed_work_sync(&sdata->dfs_cac_timer_work);
+		cfg80211_cac_event(sdata->dev, NL80211_RADAR_CAC_ABORTED,
+				   GFP_KERNEL);
+	}
+
 	drv_stop_ap(sdata->local, sdata);
 
 	/* free all potentially still buffered bcast frames */

diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 44be28c..9ca8e32 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h

@@ -1497,10 +1497,11 @@
 	ieee80211_tx_skb_tid(sdata, skb, 7);
 }
 
-u32 ieee802_11_parse_elems_crc(u8 *start, size_t len, bool action,
+u32 ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action,
 			       struct ieee802_11_elems *elems,
 			       u64 filter, u32 crc);
-static inline void ieee802_11_parse_elems(u8 *start, size_t len, bool action,
+static inline void ieee802_11_parse_elems(const u8 *start, size_t len,
+					  bool action,
 					  struct ieee802_11_elems *elems)
 {
 	ieee802_11_parse_elems_crc(start, len, action, elems, 0, 0);

diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index a8c2130..741448b 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c

@@ -2522,8 +2522,11 @@
 	u16 capab_info, aid;
 	struct ieee802_11_elems elems;
 	struct ieee80211_bss_conf *bss_conf = &sdata->vif.bss_conf;
+	const struct cfg80211_bss_ies *bss_ies = NULL;
+	struct ieee80211_mgd_assoc_data *assoc_data = ifmgd->assoc_data;
 	u32 changed = 0;
 	int err;
+	bool ret;
 
 	/* AssocResp and ReassocResp have identical structure */
 
@@ -2555,21 +2558,86 @@
 	ifmgd->aid = aid;
 
 	/*
+	 * Some APs are erroneously not including some information in their
+	 * (re)association response frames. Try to recover by using the data
+	 * from the beacon or probe response. This seems to afflict mobile
+	 * 2G/3G/4G wifi routers, reported models include the "Onda PN51T",
+	 * "Vodafone PocketWiFi 2", "ZTE MF60" and a similar T-Mobile device.
+	 */
+	if ((assoc_data->wmm && !elems.wmm_param) ||
+	    (!(ifmgd->flags & IEEE80211_STA_DISABLE_HT) &&
+	     (!elems.ht_cap_elem || !elems.ht_operation)) ||
+	    (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT) &&
+	     (!elems.vht_cap_elem || !elems.vht_operation))) {
+		const struct cfg80211_bss_ies *ies;
+		struct ieee802_11_elems bss_elems;
+
+		rcu_read_lock();
+		ies = rcu_dereference(cbss->ies);
+		if (ies)
+			bss_ies = kmemdup(ies, sizeof(*ies) + ies->len,
+					  GFP_ATOMIC);
+		rcu_read_unlock();
+		if (!bss_ies)
+			return false;
+
+		ieee802_11_parse_elems(bss_ies->data, bss_ies->len,
+				       false, &bss_elems);
+		if (assoc_data->wmm &&
+		    !elems.wmm_param && bss_elems.wmm_param) {
+			elems.wmm_param = bss_elems.wmm_param;
+			sdata_info(sdata,
+				   "AP bug: WMM param missing from AssocResp\n");
+		}
+
+		/*
+		 * Also check if we requested HT/VHT, otherwise the AP doesn't
+		 * have to include the IEs in the (re)association response.
+		 */
+		if (!elems.ht_cap_elem && bss_elems.ht_cap_elem &&
+		    !(ifmgd->flags & IEEE80211_STA_DISABLE_HT)) {
+			elems.ht_cap_elem = bss_elems.ht_cap_elem;
+			sdata_info(sdata,
+				   "AP bug: HT capability missing from AssocResp\n");
+		}
+		if (!elems.ht_operation && bss_elems.ht_operation &&
+		    !(ifmgd->flags & IEEE80211_STA_DISABLE_HT)) {
+			elems.ht_operation = bss_elems.ht_operation;
+			sdata_info(sdata,
+				   "AP bug: HT operation missing from AssocResp\n");
+		}
+		if (!elems.vht_cap_elem && bss_elems.vht_cap_elem &&
+		    !(ifmgd->flags & IEEE80211_STA_DISABLE_VHT)) {
+			elems.vht_cap_elem = bss_elems.vht_cap_elem;
+			sdata_info(sdata,
+				   "AP bug: VHT capa missing from AssocResp\n");
+		}
+		if (!elems.vht_operation && bss_elems.vht_operation &&
+		    !(ifmgd->flags & IEEE80211_STA_DISABLE_VHT)) {
+			elems.vht_operation = bss_elems.vht_operation;
+			sdata_info(sdata,
+				   "AP bug: VHT operation missing from AssocResp\n");
+		}
+	}
+
+	/*
 	 * We previously checked these in the beacon/probe response, so
 	 * they should be present here. This is just a safety net.
 	 */
 	if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HT) &&
 	    (!elems.wmm_param || !elems.ht_cap_elem || !elems.ht_operation)) {
 		sdata_info(sdata,
-			   "HT AP is missing WMM params or HT capability/operation in AssocResp\n");
-		return false;
+			   "HT AP is missing WMM params or HT capability/operation\n");
+		ret = false;
+		goto out;
 	}
 
 	if (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT) &&
 	    (!elems.vht_cap_elem || !elems.vht_operation)) {
 		sdata_info(sdata,
-			   "VHT AP is missing VHT capability/operation in AssocResp\n");
-		return false;
+			   "VHT AP is missing VHT capability/operation\n");
+		ret = false;
+		goto out;
 	}
 
 	mutex_lock(&sdata->local->sta_mtx);
@@ -2580,7 +2648,8 @@
 	sta = sta_info_get(sdata, cbss->bssid);
 	if (WARN_ON(!sta)) {
 		mutex_unlock(&sdata->local->sta_mtx);
-		return false;
+		ret = false;
+		goto out;
 	}
 
 	sband = local->hw.wiphy->bands[ieee80211_get_sdata_band(sdata)];
@@ -2633,7 +2702,8 @@
 			   sta->sta.addr);
 		WARN_ON(__sta_info_destroy(sta));
 		mutex_unlock(&sdata->local->sta_mtx);
-		return false;
+		ret = false;
+		goto out;
 	}
 
 	mutex_unlock(&sdata->local->sta_mtx);
@@ -2673,7 +2743,10 @@
 	ieee80211_sta_rx_notify(sdata, (struct ieee80211_hdr *)mgmt);
 	ieee80211_sta_reset_beacon_monitor(sdata);
 
-	return true;
+	ret = true;
+ out:
+	kfree(bss_ies);
+	return ret;
 }
 
 static enum rx_mgmt_action __must_check

diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c
index d3f414f..a02bef3 100644
--- a/net/mac80211/rate.c
+++ b/net/mac80211/rate.c

@@ -615,7 +615,7 @@
 		if (rates[i].idx < 0)
 			break;
 
-		rate_idx_match_mask(&rates[i], sband, mask, chan_width,
+		rate_idx_match_mask(&rates[i], sband, chan_width, mask,
 				    mcs_mask);
 	}
 }

diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 27e0715..72e6292 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c

@@ -661,12 +661,12 @@
 }
 EXPORT_SYMBOL(ieee80211_queue_delayed_work);
 
-u32 ieee802_11_parse_elems_crc(u8 *start, size_t len, bool action,
+u32 ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action,
 			       struct ieee802_11_elems *elems,
 			       u64 filter, u32 crc)
 {
 	size_t left = len;
-	u8 *pos = start;
+	const u8 *pos = start;
 	bool calc_crc = filter != 0;
 	DECLARE_BITMAP(seen_elems, 256);
 	const u8 *ie;

diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 05565d2..23b8eb5 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c

@@ -1442,7 +1442,8 @@
 
 	/* do the statistics and put it back */
 	ip_vs_in_stats(cp, skb);
-	if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol)
+	if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol ||
+	    IPPROTO_SCTP == cih->protocol)
 		offset += 2 * sizeof(__u16);
 	verdict = ip_vs_icmp_xmit(skb, cp, pp, offset, hooknum, &ciph);
 

diff --git a/net/netfilter/nf_conntrack_labels.c b/net/netfilter/nf_conntrack_labels.c
index 8fe2e99..355d2ef 100644
--- a/net/netfilter/nf_conntrack_labels.c
+++ b/net/netfilter/nf_conntrack_labels.c

@@ -45,7 +45,7 @@
 	if (test_bit(bit, labels->bits))
 		return 0;
 
-	if (test_and_set_bit(bit, labels->bits))
+	if (!test_and_set_bit(bit, labels->bits))
 		nf_conntrack_event_cache(IPCT_LABEL, ct);
 
 	return 0;

diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 6d0f8a1..ecf065f 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c

@@ -1825,6 +1825,7 @@
 			nf_conntrack_eventmask_report((1 << IPCT_REPLY) |
 						      (1 << IPCT_ASSURED) |
 						      (1 << IPCT_HELPER) |
+						      (1 << IPCT_LABEL) |
 						      (1 << IPCT_PROTOINFO) |
 						      (1 << IPCT_NATSEQADJ) |
 						      (1 << IPCT_MARK),

diff --git a/net/netfilter/nf_nat_sip.c b/net/netfilter/nf_nat_sip.c
index 96ccdf7..dac11f7 100644
--- a/net/netfilter/nf_nat_sip.c
+++ b/net/netfilter/nf_nat_sip.c

@@ -230,9 +230,10 @@
 					&ct->tuplehash[!dir].tuple.src.u3,
 					false);
 			if (!mangle_packet(skb, protoff, dataoff, dptr, datalen,
-					   poff, plen, buffer, buflen))
+					   poff, plen, buffer, buflen)) {
 				nf_ct_helper_log(skb, ct, "cannot mangle received");
 				return NF_DROP;
+			}
 		}
 
 		/* The rport= parameter (RFC 3581) contains the port number

diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c
index afaebc7..7011c71 100644
--- a/net/netfilter/xt_TCPMSS.c
+++ b/net/netfilter/xt_TCPMSS.c

@@ -45,17 +45,22 @@
 
 static int
 tcpmss_mangle_packet(struct sk_buff *skb,
-		     const struct xt_tcpmss_info *info,
+		     const struct xt_action_param *par,
 		     unsigned int in_mtu,
 		     unsigned int tcphoff,
 		     unsigned int minlen)
 {
+	const struct xt_tcpmss_info *info = par->targinfo;
 	struct tcphdr *tcph;
 	unsigned int tcplen, i;
 	__be16 oldval;
 	u16 newmss;
 	u8 *opt;
 
+	/* This is a fragment, no TCP header is available */
+	if (par->fragoff != 0)
+		return XT_CONTINUE;
+
 	if (!skb_make_writable(skb, skb->len))
 		return -1;
 
@@ -125,11 +130,17 @@
 
 	skb_put(skb, TCPOLEN_MSS);
 
-	/* RFC 879 states that the default MSS is 536 without specific
-	 * knowledge that the destination host is prepared to accept larger.
-	 * Since no MSS was provided, we MUST NOT set a value > 536.
+	/*
+	 * IPv4: RFC 1122 states "If an MSS option is not received at
+	 * connection setup, TCP MUST assume a default send MSS of 536".
+	 * IPv6: RFC 2460 states IPv6 has a minimum MTU of 1280 and a minimum
+	 * length IPv6 header of 60, ergo the default MSS value is 1220
+	 * Since no MSS was provided, we must use the default values
 	 */
-	newmss = min(newmss, (u16)536);
+	if (par->family == NFPROTO_IPV4)
+		newmss = min(newmss, (u16)536);
+	else
+		newmss = min(newmss, (u16)1220);
 
 	opt = (u_int8_t *)tcph + sizeof(struct tcphdr);
 	memmove(opt + TCPOLEN_MSS, opt, tcplen - sizeof(struct tcphdr));
@@ -188,7 +199,7 @@
 	__be16 newlen;
 	int ret;
 
-	ret = tcpmss_mangle_packet(skb, par->targinfo,
+	ret = tcpmss_mangle_packet(skb, par,
 				   tcpmss_reverse_mtu(skb, PF_INET),
 				   iph->ihl * 4,
 				   sizeof(*iph) + sizeof(struct tcphdr));
@@ -217,7 +228,7 @@
 	tcphoff = ipv6_skip_exthdr(skb, sizeof(*ipv6h), &nexthdr, &frag_off);
 	if (tcphoff < 0)
 		return NF_DROP;
-	ret = tcpmss_mangle_packet(skb, par->targinfo,
+	ret = tcpmss_mangle_packet(skb, par,
 				   tcpmss_reverse_mtu(skb, PF_INET6),
 				   tcphoff,
 				   sizeof(*ipv6h) + sizeof(struct tcphdr));

diff --git a/net/netfilter/xt_TCPOPTSTRIP.c b/net/netfilter/xt_TCPOPTSTRIP.c
index 1eb1a44..b68fa19 100644
--- a/net/netfilter/xt_TCPOPTSTRIP.c
+++ b/net/netfilter/xt_TCPOPTSTRIP.c

@@ -48,11 +48,13 @@
 		return NF_DROP;
 
 	len = skb->len - tcphoff;
-	if (len < (int)sizeof(struct tcphdr) ||
-	    tcp_hdr(skb)->doff * 4 > len)
+	if (len < (int)sizeof(struct tcphdr))
 		return NF_DROP;
 
 	tcph = (struct tcphdr *)(skb_network_header(skb) + tcphoff);
+	if (tcph->doff * 4 > len)
+		return NF_DROP;
+
 	opt  = (u_int8_t *)tcph;
 
 	/*

diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index d5aed3b..b14b7e3 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c

@@ -1564,12 +1564,17 @@
 	struct cfg80211_registered_device *dev;
 	s64 filter_wiphy = -1;
 	bool split = false;
-	struct nlattr **tb = nl80211_fam.attrbuf;
+	struct nlattr **tb;
 	int res;
 
+	/* will be zeroed in nlmsg_parse() */
+	tb = kmalloc(sizeof(*tb) * (NL80211_ATTR_MAX + 1), GFP_KERNEL);
+	if (!tb)
+		return -ENOMEM;
+
 	mutex_lock(&cfg80211_mutex);
 	res = nlmsg_parse(cb->nlh, GENL_HDRLEN + nl80211_fam.hdrsize,
-			  tb, nl80211_fam.maxattr, nl80211_policy);
+			  tb, NL80211_ATTR_MAX, nl80211_policy);
 	if (res == 0) {
 		split = tb[NL80211_ATTR_SPLIT_WIPHY_DUMP];
 		if (tb[NL80211_ATTR_WIPHY])
@@ -1583,6 +1588,7 @@
 			netdev = dev_get_by_index(sock_net(skb->sk), ifidx);
 			if (!netdev) {
 				mutex_unlock(&cfg80211_mutex);
+				kfree(tb);
 				return -ENODEV;
 			}
 			if (netdev->ieee80211_ptr) {
@@ -1593,6 +1599,7 @@
 			dev_put(netdev);
 		}
 	}
+	kfree(tb);
 
 	list_for_each_entry(dev, &cfg80211_rdev_list, list) {
 		if (!net_eq(wiphy_net(&dev->wiphy), sock_net(skb->sk)))

diff --git a/sound/soc/samsung/idma.c b/sound/soc/samsung/idma.c
index 6e5fed3..ce1e1e1 100644
--- a/sound/soc/samsung/idma.c
+++ b/sound/soc/samsung/idma.c

@@ -257,7 +257,6 @@
 
 	/* From snd_pcm_lib_mmap_iomem */
 	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
-	vma->vm_flags |= VM_IO;
 	size = vma->vm_end - vma->vm_start;
 	offset = vma->vm_pgoff << PAGE_SHIFT;
 	ret = io_remap_pfn_range(vma, vma->vm_start,

diff --git a/tools/hv/hv_kvp_daemon.c b/tools/hv/hv_kvp_daemon.c
index 5a1f648..ca9fa4d 100644
--- a/tools/hv/hv_kvp_daemon.c
+++ b/tools/hv/hv_kvp_daemon.c

@@ -127,7 +127,8 @@
 	fl.l_pid = getpid();
 
 	if (fcntl(kvp_file_info[pool].fd, F_SETLKW, &fl) == -1) {
-		syslog(LOG_ERR, "Failed to acquire the lock pool: %d", pool);
+		syslog(LOG_ERR, "Failed to acquire the lock pool: %d; error: %d %s", pool,
+				errno, strerror(errno));
 		exit(EXIT_FAILURE);
 	}
 }
@@ -138,8 +139,8 @@
 	fl.l_pid = getpid();
 
 	if (fcntl(kvp_file_info[pool].fd, F_SETLK, &fl) == -1) {
-		perror("fcntl");
-		syslog(LOG_ERR, "Failed to release the lock pool: %d", pool);
+		syslog(LOG_ERR, "Failed to release the lock pool: %d; error: %d %s", pool,
+				errno, strerror(errno));
 		exit(EXIT_FAILURE);
 	}
 }
@@ -157,8 +158,9 @@
 
 	filep = fopen(kvp_file_info[pool].fname, "we");
 	if (!filep) {
+		syslog(LOG_ERR, "Failed to open file, pool: %d; error: %d %s", pool,
+				errno, strerror(errno));
 		kvp_release_lock(pool);
-		syslog(LOG_ERR, "Failed to open file, pool: %d", pool);
 		exit(EXIT_FAILURE);
 	}
 
@@ -188,8 +190,9 @@
 
 	filep = fopen(kvp_file_info[pool].fname, "re");
 	if (!filep) {
+		syslog(LOG_ERR, "Failed to open file, pool: %d; error: %d %s", pool,
+				errno, strerror(errno));
 		kvp_release_lock(pool);
-		syslog(LOG_ERR, "Failed to open file, pool: %d", pool);
 		exit(EXIT_FAILURE);
 	}
 	for (;;) {
@@ -240,7 +243,8 @@
 
 	if (access(KVP_CONFIG_LOC, F_OK)) {
 		if (mkdir(KVP_CONFIG_LOC, 0755 /* rwxr-xr-x */)) {
-			syslog(LOG_ERR, " Failed to create %s", KVP_CONFIG_LOC);
+			syslog(LOG_ERR, "Failed to create '%s'; error: %d %s", KVP_CONFIG_LOC,
+					errno, strerror(errno));
 			exit(EXIT_FAILURE);
 		}
 	}
@@ -257,12 +261,15 @@
 
 
 		filep = fopen(fname, "re");
-		if (!filep)
+		if (!filep) {
+			close(fd);
 			return 1;
+		}
 
 		record = malloc(alloc_unit * num_blocks);
 		if (record == NULL) {
 			fclose(filep);
+			close(fd);
 			return 1;
 		}
 		for (;;) {
@@ -286,6 +293,7 @@
 						num_blocks);
 				if (record == NULL) {
 					fclose(filep);
+					close(fd);
 					return 1;
 				}
 				continue;
@@ -765,7 +773,9 @@
 			break;
 
 		x = strchr(p, '\n');
-		*x = '\0';
+		if (x)
+			*x = '\0';
+
 		strcat(config_buf, p);
 		strcat(config_buf, ";");
 	}
@@ -1274,7 +1284,8 @@
 	file = fopen(if_file, "w");
 
 	if (file == NULL) {
-		syslog(LOG_ERR, "Failed to open config file");
+		syslog(LOG_ERR, "Failed to open config file; error: %d %s",
+				errno, strerror(errno));
 		return HV_E_FAIL;
 	}
 
@@ -1441,7 +1452,8 @@
 
 	fd = socket(AF_NETLINK, SOCK_DGRAM, NETLINK_CONNECTOR);
 	if (fd < 0) {
-		syslog(LOG_ERR, "netlink socket creation failed; error:%d", fd);
+		syslog(LOG_ERR, "netlink socket creation failed; error: %d %s", errno,
+				strerror(errno));
 		exit(EXIT_FAILURE);
 	}
 	addr.nl_family = AF_NETLINK;
@@ -1452,12 +1464,18 @@
 
 	error = bind(fd, (struct sockaddr *)&addr, sizeof(addr));
 	if (error < 0) {
-		syslog(LOG_ERR, "bind failed; error:%d", error);
+		syslog(LOG_ERR, "bind failed; error: %d %s", errno, strerror(errno));
 		close(fd);
 		exit(EXIT_FAILURE);
 	}
 	nl_group = CN_KVP_IDX;
-	setsockopt(fd, SOL_NETLINK, NETLINK_ADD_MEMBERSHIP, &nl_group, sizeof(nl_group));
+
+	if (setsockopt(fd, SOL_NETLINK, NETLINK_ADD_MEMBERSHIP, &nl_group, sizeof(nl_group)) < 0) {
+		syslog(LOG_ERR, "setsockopt failed; error: %d %s", errno, strerror(errno));
+		close(fd);
+		exit(EXIT_FAILURE);
+	}
+
 	/*
 	 * Register ourselves with the kernel.
 	 */
@@ -1472,7 +1490,7 @@
 
 	len = netlink_send(fd, message);
 	if (len < 0) {
-		syslog(LOG_ERR, "netlink_send failed; error:%d", len);
+		syslog(LOG_ERR, "netlink_send failed; error: %d %s", errno, strerror(errno));
 		close(fd);
 		exit(EXIT_FAILURE);
 	}
@@ -1484,7 +1502,16 @@
 		socklen_t addr_l = sizeof(addr);
 		pfd.events = POLLIN;
 		pfd.revents = 0;
-		poll(&pfd, 1, -1);
+
+		if (poll(&pfd, 1, -1) < 0) {
+			syslog(LOG_ERR, "poll failed; error: %d %s", errno, strerror(errno));
+			if (errno == EINVAL) {
+				close(fd);
+				exit(EXIT_FAILURE);
+			}
+			else
+				continue;
+		}
 
 		len = recvfrom(fd, kvp_recv_buffer, sizeof(kvp_recv_buffer), 0,
 				addr_p, &addr_l);
@@ -1695,7 +1722,8 @@
 
 		len = netlink_send(fd, incoming_cn_msg);
 		if (len < 0) {
-			syslog(LOG_ERR, "net_link send failed; error:%d", len);
+			syslog(LOG_ERR, "net_link send failed; error: %d %s", errno,
+					strerror(errno));
 			exit(EXIT_FAILURE);
 		}
 	}
commit	fc76a258d41eea7953bb763397c3d1e589d3bb98	[log] [tgz]
author	Linus Torvalds <torvalds@linux-foundation.org>	Tue Jul 02 11:44:19 2013 -0700
committer	Linus Torvalds <torvalds@linux-foundation.org>	Tue Jul 02 11:44:19 2013 -0700
tree	ef8c4b828f3011adf9fc9ef976b9d07c35628e81
parent	fe3c22bd5cadd8e36977b218b27fbea821381ec8 [diff]
parent	bfd63cd24df69120585c22e09fda78723772ee2a [diff]